diff --git a/.cmake-format.py b/.cmake-format.py
new file mode 100644
index 0000000000000..62f5651fb1c43
--- /dev/null
+++ b/.cmake-format.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -----------------------------
+# Options affecting formatting.
+# -----------------------------
+with section("format"):
+
+    # How wide to allow formatted cmake files
+    line_width = 80
+
+# ------------------------------------------------
+# Options affecting comment reflow and formatting.
+# ------------------------------------------------
+with section("markup"):
+    # enable comment markup parsing and reflow
+    enable_markup = False
+
+    # If comment markup is enabled, don't reflow the first comment block in each
+    # listfile. Use this to preserve formatting of your copyright/license
+    # statements.
+    first_comment_is_literal = True
+
+# ----------------------------------
+# Options affecting listfile parsing
+# ----------------------------------
+with section("parse"):
+    # Additional FLAGS and KWARGS for custom commands.
+    # Each entry maps a project-defined command name to its keyword arguments;
+    # the '*' nargs spec means the keyword takes zero or more values.
+    additional_commands = {
+        "cc_library": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "nv_library": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "hip_library": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "xpu_library": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "go_library": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "copy": {
+            "kwargs": {
+                "SRCS": '*',
+                "DSTS": '*',
+            }
+        },
+        "cc_test": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "nv_test": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "hip_test": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "xpu_test": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "go_test": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        },
+        "py_test": {
+            "kwargs": {
+                "SRCS": '*',
+                "DEPS": '*',
+            }
+        }
+    }
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 39d9ae5e0dcd7..4b588cbeb91dc 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,14 +1,19 @@
 repos:
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
-    sha: v1.0.1
+    rev: v1.1.14
     hooks:
     -   id: remove-crlf
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
--   repo: https://github.com/PaddlePaddle/mirrors-yapf.git
-    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
+-   repo: https://github.com/google/yapf
+    rev: v0.32.0
     hooks:
     -   id: yapf
         files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
+        exclude: |
+            (?x)^(
+                python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py|
+                python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
+            )$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.1.0
     hooks:
@@ -28,6 +33,10 @@ repos:
         entry: bash ./tools/codestyle/clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$
+        exclude: |
+            (?x)^(
+                paddle/fluid/distributed/ps/thirdparty/round_robin.h
+            )$
 -   repo: local
     hooks:
     -   id: cpplint-cpp-source
@@ -55,3 +64,13 @@ repos:
             (?x)^(
                 paddle/utils/.*
             )$
+-   repo: https://github.com/cheshirekow/cmake-format-precommit
+    rev: v0.6.13
+    hooks:
+    -   id: cmake-format
+        # exclude paddle/fluid/operators/CMakeLists.txt, see the comment
+        # https://github.com/PaddlePaddle/Paddle/pull/43057#pullrequestreview-993471860
+        exclude: |
+            (?x)^(
+                paddle/fluid/operators/CMakeLists.txt
+            )$
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3ed08d56e6d6..70eb5f11ea168 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -255,6 +255,7 @@ option(WITH_POCKETFFT    "Compile with pocketfft support"      ON)
 option(WITH_RECORD_BUILDTIME    "Compile PaddlePaddle with record all targets build time"       OFF)
 option(WITH_CUSTOM_DEVICE "Compile with custom device support"    OFF)
 option(WITH_ARM_BRPC "Supprot Brpc in Arm"    OFF)
+option(WITH_FLPS     "FL PS mode"    OFF)
 
 if(WITH_RECORD_BUILDTIME)
     set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}")
diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake
index 318f9f5fd3b5a..bb76469c750b8 100644
--- a/cmake/FindGperftools.cmake
+++ b/cmake/FindGperftools.cmake
@@ -17,47 +17,46 @@
 #  GPERFTOOLS_LIBRARIES          The Gperftools libraries (tcmalloc & profiler)
 #  GPERFTOOLS_INCLUDE_DIR        The location of Gperftools headers
 
-find_library(GPERFTOOLS_TCMALLOC
+find_library(
+  GPERFTOOLS_TCMALLOC
   NAMES tcmalloc
   HINTS ${Gperftools_ROOT_DIR}/lib)
-  
-find_library(GPERFTOOLS_PROFILER
+
+find_library(
+  GPERFTOOLS_PROFILER
   NAMES profiler
   HINTS ${Gperftools_ROOT_DIR}/lib)
 
-find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
+find_library(
+  GPERFTOOLS_TCMALLOC_AND_PROFILER
   NAMES tcmalloc_and_profiler
   HINTS ${Gperftools_ROOT_DIR}/lib)
 
-find_path(GPERFTOOLS_INCLUDE_DIR
+find_path(
+  GPERFTOOLS_INCLUDE_DIR
   NAMES gperftools/heap-profiler.h
   HINTS ${Gperftools_ROOT_DIR}/include)
 
 set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})
 
 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(
-  Gperftools
-  DEFAULT_MSG
-  GPERFTOOLS_LIBRARIES
-  GPERFTOOLS_INCLUDE_DIR)
+find_package_handle_standard_args(Gperftools DEFAULT_MSG GPERFTOOLS_LIBRARIES
+                                  GPERFTOOLS_INCLUDE_DIR)
 
 mark_as_advanced(
-  Gperftools_ROOT_DIR
-  GPERFTOOLS_TCMALLOC
-  GPERFTOOLS_PROFILER
-  GPERFTOOLS_TCMALLOC_AND_PROFILER
-  GPERFTOOLS_LIBRARIES
-  GPERFTOOLS_INCLUDE_DIR)
+  Gperftools_ROOT_DIR GPERFTOOLS_TCMALLOC GPERFTOOLS_PROFILER
+  GPERFTOOLS_TCMALLOC_AND_PROFILER GPERFTOOLS_LIBRARIES GPERFTOOLS_INCLUDE_DIR)
 
 # create IMPORTED targets
-if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc)
+if(Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc)
   add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
-  set_target_properties(gperftools::tcmalloc PROPERTIES
-    IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC}
-    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  set_target_properties(
+    gperftools::tcmalloc
+    PROPERTIES IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC}
+               INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
   add_library(gperftools::profiler UNKNOWN IMPORTED)
-  set_target_properties(gperftools::profiler PROPERTIES
-    IMPORTED_LOCATION ${GPERFTOOLS_PROFILER}
-    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  set_target_properties(
+    gperftools::profiler
+    PROPERTIES IMPORTED_LOCATION ${GPERFTOOLS_PROFILER}
+               INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
 endif()
diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake
index 8cdd642ac0131..fc7cdb8c1923c 100644
--- a/cmake/FindNumPy.cmake
+++ b/cmake/FindNumPy.cmake
@@ -14,13 +14,14 @@ if(NOT PYTHON_EXECUTABLE)
   endif()
 endif()
 
-if (PYTHON_EXECUTABLE)
+if(PYTHON_EXECUTABLE)
   # write a python script that finds the numpy path
   file(WRITE ${PROJECT_BINARY_DIR}/FindNumpyPath.py
-      "try: import numpy; print(numpy.get_include())\nexcept:pass\n")
+       "try: import numpy; print(numpy.get_include())\nexcept:pass\n")
 
   # execute the find script
-  exec_program("${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR}
+  exec_program(
+    "${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR}
     ARGS "FindNumpyPath.py"
     OUTPUT_VARIABLE NUMPY_PATH)
 elseif(_numpy_out)
@@ -28,10 +29,12 @@ elseif(_numpy_out)
 endif(PYTHON_EXECUTABLE)
 
 find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h
-  HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}")
+          HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}")
 
 if(PYTHON_NUMPY_INCLUDE_DIR)
-  set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found")
+  set(PYTHON_NUMPY_FOUND
+      1
+      CACHE INTERNAL "Python numpy found")
 endif(PYTHON_NUMPY_INCLUDE_DIR)
 
 include(FindPackageHandleStandardArgs)
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 92a526a2b58a7..304246da4aea6 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -31,9 +31,9 @@ endif()
 
 ## Find MKLML First.
 if(WITH_MKLML)
-  include(external/mklml)       # download, install mklml package
+  include(external/mklml) # download, install mklml package
   set(CBLAS_PROVIDER MKLML)
-  set(CBLAS_INC_DIR  ${MKLML_INC_DIR})
+  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
   set(CBLAS_LIBRARIES ${MKLML_LIB})
 
   add_definitions(-DPADDLE_WITH_MKLML)
@@ -43,40 +43,48 @@ if(WITH_MKLML)
   target_link_libraries(cblas dynload_mklml)
 
   message(STATUS "Found cblas and lapack in MKLML "
-    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+                 "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
 endif()
 
 ## Then find openblas.
 if(NOT DEFINED CBLAS_PROVIDER)
-  set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
+  set(OPENBLAS_ROOT
+      $ENV{OPENBLAS_ROOT}
+      CACHE PATH "Folder contains Openblas")
   set(OPENBLAS_INCLUDE_SEARCH_PATHS
-          ${OPENBLAS_ROOT}/include
-          /usr/include
-          /usr/include/lapacke
-          /usr/include/openblas
-          /usr/local/opt/openblas/include)
+      ${OPENBLAS_ROOT}/include /usr/include /usr/include/lapacke
+      /usr/include/openblas /usr/local/opt/openblas/include)
   set(OPENBLAS_LIB_SEARCH_PATHS
-          ${OPENBLAS_ROOT}/lib
-          /usr/lib
-          /usr/lib/blas/openblas
-          /usr/lib/openblas
-          /usr/local/opt/openblas/lib)
-
-  find_path(OPENBLAS_INC_DIR NAMES cblas.h
-    PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
-  find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
+      ${OPENBLAS_ROOT}/lib /usr/lib /usr/lib/blas/openblas /usr/lib/openblas
+      /usr/local/opt/openblas/lib)
+
+  find_path(
+    OPENBLAS_INC_DIR
+    NAMES cblas.h
+    PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}
+    NO_DEFAULT_PATH)
+  find_path(
+    OPENBLAS_LAPACKE_INC_DIR
+    NAMES lapacke.h
     PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-  find_path(OPENBLAS_CONFIG_INC_DIR NAMES openblas_config.h
+  find_path(
+    OPENBLAS_CONFIG_INC_DIR
+    NAMES openblas_config.h
     PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-  find_library(OPENBLAS_LIB NAMES openblas
+  find_library(
+    OPENBLAS_LIB
+    NAMES openblas
     PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
 
-  if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_CONFIG_INC_DIR AND OPENBLAS_LIB)
+  if(OPENBLAS_LAPACKE_INC_DIR
+     AND OPENBLAS_INC_DIR
+     AND OPENBLAS_CONFIG_INC_DIR
+     AND OPENBLAS_LIB)
     file(READ "${OPENBLAS_CONFIG_INC_DIR}/openblas_config.h" config_file)
     string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file})
     string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp})
-    
-    if (${ver} VERSION_GREATER_EQUAL "0.3.5")
+
+    if(${ver} VERSION_GREATER_EQUAL "0.3.5")
       set(CBLAS_PROVIDER OPENBLAS)
       set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
       set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
@@ -84,52 +92,61 @@ if(NOT DEFINED CBLAS_PROVIDER)
       add_definitions(-DPADDLE_USE_OPENBLAS)
       add_definitions(-DLAPACK_FOUND)
 
-      message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-      message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
+      message(
+        STATUS
+          "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})"
+      )
+      message(
+        STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})"
+      )
     endif()
   endif()
 endif()
 
 ## Then find the reference-cblas if WITH_SYSTEM_BLAS.  www.netlib.org/blas/
 if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS)
-  set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
-    "Folder contains reference-cblas")
-  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-    ${REFERENCE_CBLAS_ROOT}/include
-    /usr/include
-    /usr/include/cblas
-  )
+  set(REFERENCE_CBLAS_ROOT
+      $ENV{REFERENCE_CBLAS_ROOT}
+      CACHE PATH "Folder contains reference-cblas")
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include
+                                           /usr/include /usr/include/cblas)
   set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-    ${REFERENCE_CBLAS_ROOT}/lib
-    /usr/lib
-    /usr/lib/blas/reference/
-    /usr/lib/reference/
-  )
-
-  find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
-        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-  find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
-        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-  find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS
-        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
+      ${REFERENCE_CBLAS_ROOT}/lib /usr/lib /usr/lib/blas/reference/
+      /usr/lib/reference/)
+
+  find_path(
+    REFERENCE_CBLAS_INCLUDE_DIR
+    NAMES cblas.h
+    PATHS ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
+  find_library(
+    REFERENCE_CBLAS_LIBRARY
+    NAMES cblas
+    PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
+  find_library(
+    REFERENCE_BLAS_LIBRARY
+    NAMES blas
+    PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
 
   if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
     set(CBLAS_PROVIDER REFERENCE_CBLAS)
     set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
     set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
     add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
-    message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+    message(
+      STATUS
+        "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})"
+    )
   endif()
 endif()
 
 ## Then build openblas by external_project
 if(NOT DEFINED CBLAS_PROVIDER)
-  include(external/openblas)          # download, build, install openblas
+  include(external/openblas) # download, build, install openblas
   set(CBLAS_PROVIDER EXTERN_OPENBLAS)
   add_dependencies(cblas extern_openblas)
   add_definitions(-DPADDLE_USE_OPENBLAS)
   message(STATUS "Build OpenBLAS by External Project "
-    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+                 "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
 endif()
 
 # FIXME(gangliao): generate cblas target to track all high performance
@@ -137,7 +154,8 @@ endif()
 
 include_directories(${CBLAS_INC_DIR})
 if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS)
-  target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY})
+  target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES}
+                        ${REFERENCE_BLAS_LIBRARY})
 elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML)
   target_link_libraries(cblas ${CBLAS_LIBRARIES})
 endif()
diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake
index 5520720f7a6c7..85bc0e987a6b6 100644
--- a/cmake/ccache.cmake
+++ b/cmake/ccache.cmake
@@ -1,29 +1,34 @@
 # Use ccache if found ccache program
 
 if(NOT WIN32)
-    find_program(CCACHE_PATH ccache)
-    if(CCACHE_PATH)
-        execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output)
-        execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory)
-        string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output})
-        message(STATUS "ccache is founded, use ccache to speed up compile on Unix.")
-        # show statistics summary of ccache
-        message("ccache version\t\t\t    " ${ccache_version} "\n" ${cache_directory})
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
-    endif(CCACHE_PATH)
+  find_program(CCACHE_PATH ccache)
+  if(CCACHE_PATH)
+    execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output)
+    execute_process(COMMAND ccache -s cache directory
+                    OUTPUT_VARIABLE cache_directory)
+    string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output})
+    message(STATUS "ccache is founded, use ccache to speed up compile on Unix.")
+    # show statistics summary of ccache
+    message("ccache version\t\t\t    " ${ccache_version} "\n"
+            ${cache_directory})
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
+  endif(CCACHE_PATH)
 elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-    # (Note:zhouwei25) Only Ninja Generator can support sccache now
-    find_program(SCCACHE_PATH sccache)
+  # (Note:zhouwei25) Only Ninja Generator can support sccache now
+  find_program(SCCACHE_PATH sccache)
 
-    if(SCCACHE_PATH)
-        execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version)
-        message(STATUS "sccache is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.")
+  if(SCCACHE_PATH)
+    execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version)
+    message(
+      STATUS
+        "sccache is founded, use [${SCCACHE_PATH}] to speed up compile on Windows."
+    )
 
-        set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
-        set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
-        # (Note:zhouwei25) sccache for cuda compiler has bug so that it can't be hit
-        # refer to https://github.com/mozilla/sccache/issues/1017, so we fix it
-        set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH})
-    endif(SCCACHE_PATH)
+    set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
+    set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
+    # (Note:zhouwei25) sccache for cuda compiler has bug so that it can't be hit
+    # refer to https://github.com/mozilla/sccache/issues/1017, so we fix it
+    set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH})
+  endif(SCCACHE_PATH)
 endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 5608b6f6f348b..91464b84ef029 100755
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -13,188 +13,195 @@
 # limitations under the License.
 
 if(NOT WITH_PYTHON)
-    add_definitions(-DPADDLE_NO_PYTHON)
+  add_definitions(-DPADDLE_NO_PYTHON)
 endif(NOT WITH_PYTHON)
 
 if(WITH_TESTING)
-    add_definitions(-DPADDLE_WITH_TESTING)
+  add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
 
 if(WITH_INFERENCE_API_TEST)
-    add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
+  add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
 endif(WITH_INFERENCE_API_TEST)
 
 if(NOT WITH_PROFILER)
-    add_definitions(-DPADDLE_DISABLE_PROFILER)
+  add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
 
 if(WITH_AVX AND AVX_FOUND)
-    set(SIMD_FLAG ${AVX_FLAG})
-    add_definitions(-DPADDLE_WITH_AVX)
+  set(SIMD_FLAG ${AVX_FLAG})
+  add_definitions(-DPADDLE_WITH_AVX)
 elseif(SSE3_FOUND AND NOT WIN32)
-    set(SIMD_FLAG ${SSE3_FLAG})
+  set(SIMD_FLAG ${SSE3_FLAG})
 endif()
 
-if (SSE3_FOUND)
-    # TODO: Runtime detection should be used here.
-    add_definitions(-DPADDLE_WITH_SSE3)
+if(SSE3_FOUND)
+  # TODO: Runtime detection should be used here.
+  add_definitions(-DPADDLE_WITH_SSE3)
 endif()
 
 if(WIN32)
   # windows header option for all targets.
   add_definitions(-D_XKEYCHECK_H)
-  # Use symbols instead of absolute path, reduce the cmake link command length. 
-  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
-  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
-  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-  SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
-  SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
+  # Use symbols instead of absolute path, reduce the cmake link command length.
+  set(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+  set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+  set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+  set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+  set(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+  set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+  set(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
+  set(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
 
   add_definitions(-DPADDLE_DLL_INFERENCE)
   # set definition for the dll export
-  if (NOT MSVC)
-    message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
+  if(NOT MSVC)
+    message(
+      FATAL_ERROR
+        "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA."
+    )
   endif(NOT MSVC)
 endif(WIN32)
 
 if(WITH_MUSL)
-    add_definitions(-DPADDLE_WITH_MUSL)
+  add_definitions(-DPADDLE_WITH_MUSL)
 
-    message(STATUS, "Set compile option WITH_MKL=OFF when WITH_MUSL=ON")
-    SET(WITH_MKL OFF)
+  message(STATUS "Set compile option WITH_MKL=OFF when WITH_MUSL=ON")
+  set(WITH_MKL OFF)
 
-    message(STATUS, "Set compile option WITH_GPU=OFF when WITH_MUSL=ON")
-    SET(WITH_GPU OFF)
+  message(STATUS "Set compile option WITH_GPU=OFF when WITH_MUSL=ON")
+  set(WITH_GPU OFF)
 endif()
 
 if(WITH_PSLIB)
-    add_definitions(-DPADDLE_WITH_PSLIB)
+  add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
 
 if(WITH_ARM_BRPC)
-    add_definitions(-DPADDLE_WITH_ARM_BRPC)
+  add_definitions(-DPADDLE_WITH_ARM_BRPC)
+endif()
+
+if(WITH_FLPS)
+  add_definitions(-DPADDLE_WITH_FLPS)
 endif()
 
 if(WITH_GLOO)
-    add_definitions(-DPADDLE_WITH_GLOO)
+  add_definitions(-DPADDLE_WITH_GLOO)
 endif()
 
 if(WITH_BOX_PS)
-    add_definitions(-DPADDLE_WITH_BOX_PS)
+  add_definitions(-DPADDLE_WITH_BOX_PS)
 endif()
 
 if(WITH_ASCEND)
-    add_definitions(-DPADDLE_WITH_ASCEND)
+  add_definitions(-DPADDLE_WITH_ASCEND)
 endif()
 
 if(WITH_ASCEND_CL)
-    add_definitions(-DPADDLE_WITH_ASCEND_CL)
+  add_definitions(-DPADDLE_WITH_ASCEND_CL)
 endif()
 
 if(WITH_ASCEND_INT64)
-    add_definitions(-DPADDLE_WITH_ASCEND_INT64)
+  add_definitions(-DPADDLE_WITH_ASCEND_INT64)
 endif()
 
 if(WITH_XPU)
-    message(STATUS "Compile with XPU!")
-    add_definitions(-DPADDLE_WITH_XPU)
+  message(STATUS "Compile with XPU!")
+  add_definitions(-DPADDLE_WITH_XPU)
 endif()
 
 if(WITH_XPU_KP)
-    message(STATUS "Compile with XPU_KP!")
-    add_definitions(-DPADDLE_WITH_XPU_KP)
+  message(STATUS "Compile with XPU_KP!")
+  add_definitions(-DPADDLE_WITH_XPU_KP)
 endif()
 
 if(WITH_IPU)
-    message(STATUS "Compile with IPU!")
-    add_definitions(-DPADDLE_WITH_IPU)
+  message(STATUS "Compile with IPU!")
+  add_definitions(-DPADDLE_WITH_IPU)
 endif()
 
 if(WITH_MLU)
-    message(STATUS "Compile with MLU!")
-    add_definitions(-DPADDLE_WITH_MLU)
+  message(STATUS "Compile with MLU!")
+  add_definitions(-DPADDLE_WITH_MLU)
 endif()
 
 if(WITH_GPU)
-    add_definitions(-DPADDLE_WITH_CUDA)
-    add_definitions(-DEIGEN_USE_GPU)
+  add_definitions(-DPADDLE_WITH_CUDA)
+  add_definitions(-DEIGEN_USE_GPU)
 
-    FIND_PACKAGE(CUDA REQUIRED)
+  find_package(CUDA REQUIRED)
 
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1)
-        message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile")
-    endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1)
+    message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile")
+  endif()
 
-    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle needs cudnn to compile")
-    endif()
+  if(NOT CUDNN_FOUND)
+    message(FATAL_ERROR "Paddle needs cudnn to compile")
+  endif()
 
-    if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile")
-    endif()
+  if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+    message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile")
+  endif()
+
+  if(CUPTI_FOUND)
+    include_directories(${CUPTI_INCLUDE_DIR})
+    add_definitions(-DPADDLE_WITH_CUPTI)
+  else()
+    message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
+  endif()
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=\"${SIMD_FLAG}\"")
 
-    if(CUPTI_FOUND)
-        include_directories(${CUPTI_INCLUDE_DIR})
-        add_definitions(-DPADDLE_WITH_CUPTI)
+  # Include cuda and cudnn
+  include_directories(${CUDNN_INCLUDE_DIR})
+  include_directories(${CUDA_TOOLKIT_INCLUDE})
+
+  if(TENSORRT_FOUND)
+    if(WIN32)
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 9)
+        message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
+      endif()
     else()
-        message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
-    endif()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=\"${SIMD_FLAG}\"")
-
-    # Include cuda and cudnn
-    include_directories(${CUDNN_INCLUDE_DIR})
-    include_directories(${CUDA_TOOLKIT_INCLUDE})
-
-    if(TENSORRT_FOUND)
-        if(WIN32)
-            if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 9)
-                message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
-            endif()
-        else()
-            if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 8)
-                message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
-            endif()
-            if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-                message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
-            endif()
-            if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
-                message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
-            endif()
-        endif()
-        include_directories(${TENSORRT_INCLUDE_DIR})
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 8)
+        message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+      endif()
+      if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+        message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+      endif()
+      if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+        message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+      endif()
     endif()
+    include_directories(${TENSORRT_INCLUDE_DIR})
+  endif()
 elseif(WITH_ROCM)
-    add_definitions(-DPADDLE_WITH_HIP)
-    add_definitions(-DEIGEN_USE_GPU)
-    add_definitions(-DEIGEN_USE_HIP)
+  add_definitions(-DPADDLE_WITH_HIP)
+  add_definitions(-DEIGEN_USE_GPU)
+  add_definitions(-DEIGEN_USE_HIP)
 
-    if(NOT MIOPEN_FOUND)
-        message(FATAL_ERROR "Paddle needs MIOpen to compile")
-    endif()
+  if(NOT MIOPEN_FOUND)
+    message(FATAL_ERROR "Paddle needs MIOpen to compile")
+  endif()
 
-    if(${MIOPEN_VERSION} VERSION_LESS 2090)
-        message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile")
-    endif()
+  if(${MIOPEN_VERSION} VERSION_LESS 2090)
+    message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile")
+  endif()
 else()
-    add_definitions(-DHPPL_STUB_FUNC)
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+  add_definitions(-DHPPL_STUB_FUNC)
+  list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 endif()
 
-if (WITH_MKLML AND MKLML_IOMP_LIB)
-    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-    if(WIN32)
-        # openmp not support well for now on windows
-        set(OPENMP_FLAGS "")
-    else(WIN32)
-        set(OPENMP_FLAGS "-fopenmp")
-    endif(WIN32)
-    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+if(WITH_MKLML AND MKLML_IOMP_LIB)
+  message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+  if(WIN32)
+    # openmp not support well for now on windows
+    set(OPENMP_FLAGS "")
+  else(WIN32)
+    set(OPENMP_FLAGS "-fopenmp")
+  endif(WIN32)
+  set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+  set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
 endif()
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
@@ -205,25 +212,25 @@ if(WITH_DISTRIBUTE)
 endif()
 
 if(WITH_PSCORE)
-    add_definitions(-DPADDLE_WITH_PSCORE)
+  add_definitions(-DPADDLE_WITH_PSCORE)
 endif()
 
 if(WITH_HETERPS)
-    add_definitions(-DPADDLE_WITH_HETERPS)
+  add_definitions(-DPADDLE_WITH_HETERPS)
 endif()
 
 if(WITH_BRPC_RDMA)
-    add_definitions(-DPADDLE_WITH_BRPC_RDMA)
+  add_definitions(-DPADDLE_WITH_BRPC_RDMA)
 endif(WITH_BRPC_RDMA)
 
 if(ON_INFER)
-    add_definitions(-DPADDLE_ON_INFERENCE)
+  add_definitions(-DPADDLE_ON_INFERENCE)
 endif(ON_INFER)
 
 if(WITH_CRYPTO)
-    add_definitions(-DPADDLE_WITH_CRYPTO)
+  add_definitions(-DPADDLE_WITH_CRYPTO)
 endif(WITH_CRYPTO)
 
 if(WITH_CUSTOM_DEVICE AND NOT WIN32)
-    add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
+  add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE)
 endif()
diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
index 598754bc9efaa..02c1a136280f7 100644
--- a/cmake/coveralls.cmake
+++ b/cmake/coveralls.cmake
@@ -5,107 +5,106 @@
 # Param _COVERALLS_UPLOAD       Upload the result to coveralls.
 # Param _CMAKE_SCRIPT_PATH      CMake script path.
 function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
-    # clean previous gcov data.
-    file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
+  # clean previous gcov data.
+  file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
 
-    # find curl for upload JSON soon.
-    if (_COVERALLS_UPLOAD)
-        find_program(CURL_EXECUTABLE curl)
-        if (NOT CURL_EXECUTABLE)
-            message(FATAL_ERROR "Coveralls: curl not found!")
-        endif()
+  # Find curl; it is needed later to upload the JSON.
+  if(_COVERALLS_UPLOAD)
+    find_program(CURL_EXECUTABLE curl)
+    if(NOT CURL_EXECUTABLE)
+      message(FATAL_ERROR "Coveralls: curl not found!")
     endif()
+  endif()
 
-    # When passing a CMake list to an external process, the list
-    # will be converted from the format "1;2;3" to "1 2 3".
-    set(COVERAGE_SRCS "")
-    foreach (SINGLE_SRC ${_COVERAGE_SRCS})
-        set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
-    endforeach()
+  # When passing a CMake list to an external process, the list
+  # will be converted from the format "1;2;3" to "1 2 3".
+  set(COVERAGE_SRCS "")
+  foreach(SINGLE_SRC ${_COVERAGE_SRCS})
+    set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
+  endforeach()
 
-    # query number of logical cores
-    cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
-    # coveralls json file.
-    set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
-    add_custom_target(coveralls_generate
-        # Run regress tests.
-        COMMAND ${CMAKE_CTEST_COMMAND}
-                -j ${core_size}
-                --output-on-failure
-        # Generate Gcov and translate it into coveralls JSON.
-        COMMAND ${CMAKE_COMMAND}
-                -DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-                -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-                -DCOV_PATH="${PROJECT_BINARY_DIR}"
-                -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-                -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
-        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-        COMMENT "Coveralls: generating coveralls output..."
-    )
+  # query number of logical cores
+  cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
+  # coveralls json file.
+  set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
+  add_custom_target(
+    coveralls_generate
+    # Run regression tests.
+    COMMAND ${CMAKE_CTEST_COMMAND} -j ${core_size} --output-on-failure
+    # Generate Gcov and translate it into coveralls JSON.
+    COMMAND
+      ${CMAKE_COMMAND} -DCOVERAGE_SRCS="${COVERAGE_SRCS}"
+      -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
+      -DCOV_PATH="${PROJECT_BINARY_DIR}" -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
+      -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
+    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+    COMMENT "Coveralls: generating coveralls output...")
 
-    if (_COVERALLS_UPLOAD)
-        message("COVERALLS UPLOAD: ON")
-        # Upload the JSON to coveralls.
-        add_custom_target(coveralls_upload
-            COMMAND ${CURL_EXECUTABLE}
-                    -S -F json_file=@${COVERALLS_FILE}
-                    https://coveralls.io/api/v1/jobs
-            DEPENDS coveralls_generate
-            WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-            COMMENT "Coveralls: uploading coveralls output...")
+  if(_COVERALLS_UPLOAD)
+    message("COVERALLS UPLOAD: ON")
+    # Upload the JSON to coveralls.
+    add_custom_target(
+      coveralls_upload
+      COMMAND ${CURL_EXECUTABLE} -S -F json_file=@${COVERALLS_FILE}
+              https://coveralls.io/api/v1/jobs
+      DEPENDS coveralls_generate
+      WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+      COMMENT "Coveralls: uploading coveralls output...")
 
-        add_custom_target(coveralls DEPENDS coveralls_upload)
-    else()
-        message("COVERALLS UPLOAD: OFF")
-        add_custom_target(coveralls DEPENDS coveralls_generate)
-    endif()
+    add_custom_target(coveralls DEPENDS coveralls_upload)
+  else()
+    message("COVERALLS UPLOAD: OFF")
+    add_custom_target(coveralls DEPENDS coveralls_generate)
+  endif()
 endfunction()
 
 if(WITH_COVERAGE)
-    if (WITH_INCREMENTAL_COVERAGE)
-        # if *.h changed, generate coverage report totaly.
-        # if pybind.cc changed, generate coverage report totaly.
-        # Because if pybind.cc add '-g -O0 -fprofile-arcs -ftest-coverage' only, some testcase will fail.
-        if ( (NOT ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "")) OR ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc") )
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-        endif()
-    else()
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+  if(WITH_INCREMENTAL_COVERAGE)
+    # If any *.h file changed, generate the coverage report for everything.
+    # If pybind.cc changed, generate the coverage report for everything,
+    # because adding '-g -O0 -fprofile-arcs -ftest-coverage' to pybind.cc alone makes some test cases fail.
+    if((NOT ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL ""))
+       OR ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc"))
+      set(CMAKE_CXX_FLAGS
+          "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+      set(CMAKE_C_FLAGS
+          "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
     endif()
-    set(EXCLUDE_DIRS
-        "demo/"
-        "build/"
-        "tests/"
-        ".test_env/"
-    )
+  else()
+    set(CMAKE_CXX_FLAGS
+        "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+  endif()
+  set(EXCLUDE_DIRS "demo/" "build/" "tests/" ".test_env/")
 
-    if(WITH_GPU)
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" ".c" "*.cu")
-    else()
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
-    endif()
+  # Collect C/C++ (and, for GPU builds, CUDA) sources relative to the project
+  # root. Each pattern needs "*": a bare ".c" matches only a file named ".c".
+  if(WITH_GPU)
+    file(GLOB_RECURSE PADDLE_SOURCES
+         RELATIVE "${PROJECT_SOURCE_DIR}"
+         "*.cpp" "*.cc" "*.c" "*.cu")
+  else()
+    file(GLOB_RECURSE PADDLE_SOURCES
+         RELATIVE "${PROJECT_SOURCE_DIR}"
+         "*.cpp" "*.cc" "*.c")
+  endif()
 
-    # exclude trivial files in PADDLE_SOURCES
-    foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
-        foreach(TMP_PATH ${PADDLE_SOURCES})
-            string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
-            if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
-                list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
-            endif()
-        endforeach(TMP_PATH)
-    endforeach()
+  # exclude trivial files in PADDLE_SOURCES
+  foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
+    foreach(TMP_PATH ${PADDLE_SOURCES})
+      string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
+      if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
+        list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
+      endif()
+    endforeach(TMP_PATH)
+  endforeach()
 
-    # convert to absolute path
-    set(PADDLE_SRCS "")
-    foreach(PADDLE_SRC ${PADDLE_SOURCES})
-        set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
-    endforeach()
+  # convert to absolute path
+  set(PADDLE_SRCS "")
+  foreach(PADDLE_SRC ${PADDLE_SOURCES})
+    set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
+  endforeach()
 
-    code_coverage(
-        "${PADDLE_SRCS}"
-        ${COVERALLS_UPLOAD}
-        "${PROJECT_SOURCE_DIR}/cmake"
-    )
+  code_coverage("${PADDLE_SRCS}" ${COVERALLS_UPLOAD}
+                "${PROJECT_SOURCE_DIR}/cmake")
 endif()
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
index 4d813a0726dc0..6c1186f69f14d 100644
--- a/cmake/coverallsGcovJsons.cmake
+++ b/cmake/coverallsGcovJsons.cmake
@@ -32,7 +32,7 @@
 # https://coveralls.io/docs/api
 #
 
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+cmake_minimum_required(VERSION 2.8)
 
 # Since it's not possible to pass a CMake list properly in the
 # "1;2;3" format to an external process, we have replaced the
@@ -41,44 +41,42 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
 
 find_program(GCOV_EXECUTABLE gcov)
-if (NOT GCOV_EXECUTABLE)
-	message(FATAL_ERROR "gcov not found! Aborting...")
+if(NOT GCOV_EXECUTABLE)
+  message(FATAL_ERROR "gcov not found! Aborting...")
 endif()
 
 find_package(Git)
 
 # TODO: Add these git things to the coveralls json.
-if (GIT_FOUND)
-	# Branch.
-	execute_process(
-		COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
-		WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-		OUTPUT_VARIABLE GIT_BRANCH
-		OUTPUT_STRIP_TRAILING_WHITESPACE
-	)
-
-	macro (git_log_format FORMAT_CHARS VAR_NAME)
-		execute_process(
-			COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
-			WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-			OUTPUT_VARIABLE ${VAR_NAME}
-			OUTPUT_STRIP_TRAILING_WHITESPACE
-		)
-	endmacro()
-
-	git_log_format(an GIT_AUTHOR_EMAIL)
-	git_log_format(ae GIT_AUTHOR_EMAIL)
-	git_log_format(cn GIT_COMMITTER_NAME)
-	git_log_format(ce GIT_COMMITTER_EMAIL)
-	git_log_format(B GIT_COMMIT_MESSAGE)
-
-	message("Git exe: ${GIT_EXECUTABLE}")
-	message("Git branch: ${GIT_BRANCH}")
-	message("Git author: ${GIT_AUTHOR_NAME}")
-	message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
-	message("Git commiter name: ${GIT_COMMITTER_NAME}")
-	message("Git commiter e-mail: ${GIT_COMMITTER_EMAIL}")
-	message("Git commit message: ${GIT_COMMIT_MESSAGE}")
+if(GIT_FOUND)
+  # Branch.
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    OUTPUT_VARIABLE GIT_BRANCH
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  # Store "git log -1 --pretty=format:%<FORMAT_CHARS>" output in VAR_NAME.
+  macro(git_log_format FORMAT_CHARS VAR_NAME)
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
+      WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+      OUTPUT_VARIABLE ${VAR_NAME} OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endmacro()
+
+  git_log_format(an GIT_AUTHOR_NAME) # %an = author name (was stored as e-mail)
+  git_log_format(ae GIT_AUTHOR_EMAIL) # %ae = author e-mail
+  git_log_format(cn GIT_COMMITTER_NAME) # %cn = committer name
+  git_log_format(ce GIT_COMMITTER_EMAIL) # %ce = committer e-mail
+  git_log_format(B GIT_COMMIT_MESSAGE) # %B = raw commit message body
+
+  message("Git exe: ${GIT_EXECUTABLE}")
+  message("Git branch: ${GIT_BRANCH}")
+  message("Git author: ${GIT_AUTHOR_NAME}")
+  message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
+  message("Git commiter name: ${GIT_COMMITTER_NAME}")
+  message("Git commiter e-mail: ${GIT_COMMITTER_EMAIL}")
+  message("Git commit message: ${GIT_COMMIT_MESSAGE}")
 
 endif()
 
@@ -95,15 +93,15 @@ endif()
 #
 macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
 
-	# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov 
-	# -> 
-	# #path#to#project#root#subdir#the_file.c.gcov   
-	get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
+  # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+  # ->
+  # #path#to#project#root#subdir#the_file.c.gcov
+  get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
 
-	# #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
-	string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
-	string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
-	set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
+  # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
+  string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
+  string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
+  set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
 endmacro()
 
 ##############################################################################
@@ -117,26 +115,24 @@ message("===============================")
 # (The directories the .gcda files and .o files are found in)
 # and run gcov on those.
 foreach(GCDA ${GCDA_FILES})
-	get_filename_component(GCDA_DIR ${GCDA} PATH)
-
-	#
-	# The -p below refers to "Preserve path components",
-	# This means that the generated gcov filename of a source file will
-	# keep the original files entire filepath, but / is replaced with #.
-	# Example:
-	#
-	# /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda
-	# ------------------------------------------------------------------------------
-	# File '/path/to/project/root/subdir/the_file.c'
-	# Lines executed:68.34% of 199
-	# /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov'
-	#
-	# If -p is not specified then the file is named only "the_file.c.gcov"
-	#
-	execute_process(
-		COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null
-		WORKING_DIRECTORY ${GCDA_DIR}
-	)
+  get_filename_component(GCDA_DIR ${GCDA} PATH)
+
+  #
+  # The -p below refers to "Preserve path components",
+  # This means that the generated gcov filename of a source file will
+  # keep the original files entire filepath, but / is replaced with #.
+  # Example:
+  #
+  # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda
+  # ------------------------------------------------------------------------------
+  # File '/path/to/project/root/subdir/the_file.c'
+  # Lines executed:68.34% of 199
+  # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov'
+  #
+  # If -p is not specified then the file is named only "the_file.c.gcov"
+  # No shell is involved here, so use OUTPUT_QUIET rather than ">/dev/null".
+  execute_process(COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
+                  WORKING_DIRECTORY ${GCDA_DIR} OUTPUT_QUIET)
 endforeach()
 
 # TODO: Make these be absolute path
@@ -164,9 +160,9 @@ file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov")
 # ALL_GCOV_FILES =
 #				/path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
 #				/path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov
-# 
+#
 # Result should be:
-# GCOV_FILES = 
+# GCOV_FILES =
 #				/path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
 #
 set(GCOV_FILES "")
@@ -176,29 +172,29 @@ message("===============================")
 
 set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS})
 
-foreach (GCOV_FILE ${ALL_GCOV_FILES})
-
-	#
-	# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov 
-	# -> 
-	# /path/to/project/root/subdir/the_file.c 
-	get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
-
-	# Is this in the list of source files?
-	# TODO: We want to match against relative path filenames from the source file root...
-	list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND)
-
-	if (NOT WAS_FOUND EQUAL -1)
-		message("YES: ${GCOV_FILE}")
-		list(APPEND GCOV_FILES ${GCOV_FILE})
-
-		# We remove it from the list, so we don't bother searching for it again.
-		# Also files left in COVERAGE_SRCS_REMAINING after this loop ends should
-		# have coverage data generated from them (no lines are covered).
-		list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH})
-	else()
-		message("NO:  ${GCOV_FILE}")
-	endif()
+foreach(GCOV_FILE ${ALL_GCOV_FILES})
+
+  #
+  # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+  # ->
+  # /path/to/project/root/subdir/the_file.c
+  get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
+
+  # Is this in the list of source files?
+  # TODO: We want to match against relative path filenames from the source file root...
+  list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND)
+
+  if(NOT WAS_FOUND EQUAL -1)
+    message("YES: ${GCOV_FILE}")
+    list(APPEND GCOV_FILES ${GCOV_FILE})
+
+    # We remove it from the list, so we don't bother searching for it again.
+    # Also files left in COVERAGE_SRCS_REMAINING after this loop ends should
+    # have coverage data generated from them (no lines are covered).
+    list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH})
+  else()
+    message("NO:  ${GCOV_FILE}")
+  endif()
 endforeach()
 
 # TODO: Enable setting these
@@ -206,20 +202,18 @@ set(JSON_SERVICE_NAME "travis-ci")
 set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID})
 
 set(JSON_TEMPLATE
-"{
+    "{
   \"service_name\": \"\@JSON_SERVICE_NAME\@\",
   \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\",
   \"source_files\": \@JSON_GCOV_FILES\@
-}"
-)
+}")
 
 set(SRC_FILE_TEMPLATE
-"{
+    "{
       \"name\": \"\@GCOV_SRC_REL_PATH\@\",
       \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\",
       \"coverage\": \@GCOV_FILE_COVERAGE\@
-  }"
-)
+  }")
 
 message("\nGenerate JSON for files:")
 message("=========================")
@@ -227,163 +221,163 @@ message("=========================")
 set(JSON_GCOV_FILES "[")
 
 # Read the GCOV files line by line and get the coverage data.
-foreach (GCOV_FILE ${GCOV_FILES})
-
-	get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
-	file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}")
-
-	# The new coveralls API doesn't need the entire source (Yay!)
-	# However, still keeping that part for now. Will cleanup in the future.
-	file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5)
-	message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")
-
-	# Loads the gcov file as a list of lines.
-	# (We first open the file and replace all occurrences of [] with _
-	#  because CMake will fail to parse a line containing unmatched brackets...
-	#  also the \ to escaped \n in macros screws up things.)
-	# https://public.kitware.com/Bug/view.php?id=15369
-	file(READ ${GCOV_FILE} GCOV_CONTENTS)
-	string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
-	string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
-	string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
-	file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}")
-
-	file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES)
-	list(LENGTH GCOV_LINES LINE_COUNT)
-
-	# Instead of trying to parse the source from the
-	# gcov file, simply read the file contents from the source file.
-	# (Parsing it from the gcov is hard because C-code uses ; in many places
-	#  which also happens to be the same as the CMake list delimeter).
-	file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE)
-
-	string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	# According to http://json.org/ these should be escaped as well.
-	# Don't know how to do that in CMake however...
-	#string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	#string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-	#string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
-
-	# We want a json array of coverage data as a single string
-	# start building them from the contents of the .gcov
-	set(GCOV_FILE_COVERAGE "[")
-
-	set(GCOV_LINE_COUNT 1) # Line number for the .gcov.
-	set(DO_SKIP 0)
-	foreach (GCOV_LINE ${GCOV_LINES})
-		#message("${GCOV_LINE}")
-		# Example of what we're parsing:
-		# Hitcount  |Line | Source
-		# "        8:   26:        if (!allowed || (strlen(allowed) == 0))"
-		string(REGEX REPLACE 
-			"^([^:]*):([^:]*):(.*)$" 
-			"\\1;\\2;\\3"
-			RES
-			"${GCOV_LINE}")
-
-		# Check if we should exclude lines using the Lcov syntax.
-		string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}")
-		string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}")
-		string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}")
-
-		set(RESET_SKIP 0)
-		if (LINE_SKIP AND NOT DO_SKIP)
-			set(DO_SKIP 1)
-			set(RESET_SKIP 1)
-		endif()
-
-		if (START_SKIP)
-			set(DO_SKIP 1)
-			message("${GCOV_LINE_COUNT}: Start skip")
-		endif()
-
-		if (END_SKIP)
-			set(DO_SKIP 0)
-		endif()
-
-		list(LENGTH RES RES_COUNT)
-
-		if (RES_COUNT GREATER 2)
-			list(GET RES 0 HITCOUNT)
-			list(GET RES 1 LINE)
-			list(GET RES 2 SOURCE)
-
-			string(STRIP ${HITCOUNT} HITCOUNT)
-			string(STRIP ${LINE} LINE)
-
-			# Lines with 0 line numbers are metadata and can be ignored.
-			if (NOT ${LINE} EQUAL 0)
-				
-				if (DO_SKIP)
-					set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
-				else()
-					# Translate the hitcount into valid JSON values.
-					if (${HITCOUNT} STREQUAL "#####")
-						set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
-					elseif (${HITCOUNT} STREQUAL "-")
-						set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
-					else()
-						set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ")
-					endif()
-				endif()
-			endif()
-		else()
-			message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}")
-		endif()
-
-		if (RESET_SKIP)
-			set(DO_SKIP 0)
-		endif()
-		math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1")
-	endforeach()
-
-	message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!")
-
-	# Advanced way of removing the trailing comma in the JSON array.
-	# "[1, 2, 3, " -> "[1, 2, 3"
-	string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
-
-	# Append the trailing ] to complete the JSON array.
-	set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
-
-	# Generate the final JSON for this file.
-	message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...")
-	string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
-
-	set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
+foreach(GCOV_FILE ${GCOV_FILES})
+
+  get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
+  file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}")
+
+  # The new coveralls API doesn't need the entire source (Yay!)
+  # However, still keeping that part for now. Will cleanup in the future.
+  file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5)
+  message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")
+
+  # Loads the gcov file as a list of lines.
+  # (We first open the file and replace all occurrences of [] with _
+  #  because CMake will fail to parse a line containing unmatched brackets...
+  #  also the \ to escaped \n in macros screws up things.)
+  # https://public.kitware.com/Bug/view.php?id=15369
+  file(READ ${GCOV_FILE} GCOV_CONTENTS)
+  string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
+  string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
+  string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
+  file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}")
+
+  file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES)
+  list(LENGTH GCOV_LINES LINE_COUNT)
+
+  # Instead of trying to parse the source from the
+  # gcov file, simply read the file contents from the source file.
+  # (Parsing it from the gcov is hard because C-code uses ; in many places
+  #  which also happens to be the same as the CMake list delimiter).
+  file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE)
+
+  string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+  string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+  string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+  string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+  string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+  # According to http://json.org/ these should be escaped as well.
+  # Don't know how to do that in CMake however...
+  #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+  #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+  #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+
+  # We want a json array of coverage data as a single string
+  # start building them from the contents of the .gcov
+  set(GCOV_FILE_COVERAGE "[")
+
+  set(GCOV_LINE_COUNT 1) # Line number for the .gcov.
+  set(DO_SKIP 0)
+  foreach(GCOV_LINE ${GCOV_LINES})
+    #message("${GCOV_LINE}")
+    # Example of what we're parsing:
+    # Hitcount  |Line | Source
+    # "        8:   26:        if (!allowed || (strlen(allowed) == 0))"
+    string(REGEX REPLACE "^([^:]*):([^:]*):(.*)$" "\\1;\\2;\\3" RES
+                         "${GCOV_LINE}")
+
+    # Check if we should exclude lines using the Lcov syntax.
+    string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}")
+    string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}")
+    string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}")
+
+    set(RESET_SKIP 0)
+    if(LINE_SKIP AND NOT DO_SKIP)
+      set(DO_SKIP 1)
+      set(RESET_SKIP 1)
+    endif()
+
+    if(START_SKIP)
+      set(DO_SKIP 1)
+      message("${GCOV_LINE_COUNT}: Start skip")
+    endif()
+
+    if(END_SKIP)
+      set(DO_SKIP 0)
+    endif()
+
+    list(LENGTH RES RES_COUNT)
+
+    if(RES_COUNT GREATER 2)
+      list(GET RES 0 HITCOUNT)
+      list(GET RES 1 LINE)
+      list(GET RES 2 SOURCE)
+
+      string(STRIP ${HITCOUNT} HITCOUNT)
+      string(STRIP ${LINE} LINE)
+
+      # Lines with 0 line numbers are metadata and can be ignored.
+      if(NOT ${LINE} EQUAL 0)
+
+        if(DO_SKIP)
+          set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
+        else()
+          # Translate the hitcount into valid JSON values.
+          if(${HITCOUNT} STREQUAL "#####")
+            set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
+          elseif(${HITCOUNT} STREQUAL "-")
+            set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
+          else()
+            set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ")
+          endif()
+        endif()
+      endif()
+    else()
+      message(
+        WARNING
+          "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}"
+      )
+    endif()
+
+    if(RESET_SKIP)
+      set(DO_SKIP 0)
+    endif()
+    math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1")
+  endforeach()
+
+  message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!")
+
+  # Advanced way of removing the trailing comma in the JSON array.
+  # "[1, 2, 3, " -> "[1, 2, 3"
+  string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
+
+  # Append the trailing ] to complete the JSON array.
+  set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
+
+  # Generate the final JSON for this file.
+  message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...")
+  string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
+
+  set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
 endforeach()
 
 # Loop through all files we couldn't find any coverage for
 # as well, and generate JSON for those as well with 0% coverage.
 foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
 
-	# Loads the source file as a list of lines.
-	file(STRINGS ${NOT_COVERED_SRC} SRC_LINES)
+  # Loads the source file as a list of lines.
+  file(STRINGS ${NOT_COVERED_SRC} SRC_LINES)
 
-	set(GCOV_FILE_COVERAGE "[")
-	set(GCOV_FILE_SOURCE "")
+  set(GCOV_FILE_COVERAGE "[")
+  set(GCOV_FILE_SOURCE "")
 
-	foreach (SOURCE ${SRC_LINES})
-		set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
+  foreach(SOURCE ${SRC_LINES})
+    set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
 
-		string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}")
-		string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}")
-		string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}")
-		string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}")
-		set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n")
-	endforeach()
+    string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}")
+    string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}")
+    string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}")
+    string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}")
+    set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n")
+  endforeach()
 
-	# Remove trailing comma, and complete JSON array with ]
-	string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
-	set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
+  # Remove trailing comma, and complete JSON array with ]
+  string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
+  set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
 
-	# Generate the final JSON for this file.
-	string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
-	set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
+  # Generate the final JSON for this file.
+  string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
+  set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
 endforeach()
 
 # Get rid of trailing comma.
@@ -395,7 +389,9 @@ message("Generate final JSON...")
 string(CONFIGURE ${JSON_TEMPLATE} JSON)
 
 file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}")
-message("###########################################################################")
-message("Generated coveralls JSON containing coverage data:") 
+message(
+  "###########################################################################")
+message("Generated coveralls JSON containing coverage data:")
 message("${COVERALLS_OUTPUT_FILE}")
-message("###########################################################################")
+message(
+  "###########################################################################")
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 4894d615c2a35..aa958786cb8f4 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -1,5 +1,5 @@
 if(NOT WITH_GPU)
-    return()
+  return()
 endif()
 
 if(WITH_NV_JETSON)
@@ -38,7 +38,9 @@ function(detect_installed_gpus out_variable)
   if(NOT CUDA_gpu_detect_output)
     set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
 
-    file(WRITE ${cufile} ""
+    file(
+      WRITE ${cufile}
+      ""
       "#include \"stdio.h\"\n"
       "#include \"cuda.h\"\n"
       "#include \"cuda_runtime.h\"\n"
@@ -54,55 +56,86 @@ function(detect_installed_gpus out_variable)
       "  return 0;\n"
       "}\n")
 
-    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
-                    "--run" "${cufile}"
-                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
-                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
-                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
+      WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+      RESULT_VARIABLE nvcc_res
+      OUTPUT_VARIABLE nvcc_out
+      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
 
     if(nvcc_res EQUAL 0)
       # only keep the last line of nvcc_out
-      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
-      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      string(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      string(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
       list(GET nvcc_out -1 nvcc_out)
       string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
-      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+      set(CUDA_gpu_detect_output
+          ${nvcc_out}
+          CACHE INTERNAL
+                "Returned GPU architetures from detect_installed_gpus tool"
+                FORCE)
     endif()
   endif()
 
   if(NOT CUDA_gpu_detect_output)
-    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
-    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+    message(
+      STATUS
+        "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable}
+        ${paddle_known_gpu_archs}
+        PARENT_SCOPE)
   else()
-    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+    set(${out_variable}
+        ${CUDA_gpu_detect_output}
+        PARENT_SCOPE)
   endif()
 endfunction()
 
-
 ########################################################################
 # Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
 # Usage:
 #   select_nvcc_arch_flags(out_variable)
 function(select_nvcc_arch_flags out_variable)
   # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual")
+  set(archs_names
+      "Kepler"
+      "Maxwell"
+      "Pascal"
+      "Volta"
+      "Turing"
+      "Ampere"
+      "All"
+      "Manual")
   set(archs_name_default "Auto")
   list(APPEND archs_names "Auto")
 
   # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
-  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
-  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  set(CUDA_ARCH_NAME
+      ${archs_name_default}
+      CACHE STRING "Select target NVIDIA GPU achitecture.")
+  set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names})
   mark_as_advanced(CUDA_ARCH_NAME)
 
   # verify CUDA_ARCH_NAME value
   if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
     string(REPLACE ";" ", " archs_names "${archs_names}")
-    message(FATAL_ERROR "Only ${archs_names} architectures names are supported.")
+    message(
+      FATAL_ERROR "Only ${archs_names} architectures names are supported.")
   endif()
 
   if(${CUDA_ARCH_NAME} STREQUAL "Manual")
-    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
-    set(CUDA_ARCH_PTX ""                        CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    set(CUDA_ARCH_BIN
+        ${paddle_known_gpu_archs}
+        CACHE
+          STRING
+          "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported"
+    )
+    set(CUDA_ARCH_PTX
+        ""
+        CACHE
+          STRING
+          "Specify 'virtual' PTX architectures to build PTX intermediate code for"
+    )
     mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
   else()
     unset(CUDA_ARCH_BIN CACHE)
@@ -112,19 +145,19 @@ function(select_nvcc_arch_flags out_variable)
   if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
     set(cuda_arch_bin "30 35")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
-    if (WITH_NV_JETSON)
+    if(WITH_NV_JETSON)
       set(cuda_arch_bin "53")
     else()
       set(cuda_arch_bin "50")
     endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    if (WITH_NV_JETSON)
+    if(WITH_NV_JETSON)
       set(cuda_arch_bin "62")
     else()
       set(cuda_arch_bin "60 61")
     endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
-    if (WITH_NV_JETSON)
+    if(WITH_NV_JETSON)
       set(cuda_arch_bin "72")
     else()
       set(cuda_arch_bin "70")
@@ -132,35 +165,37 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
+    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
       set(cuda_arch_bin "80")
-    elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
+    elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
       set(cuda_arch_bin "80 86")
     endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
-    message(STATUS "WARNING: This is just a warning for publishing release.
+    message(
+      STATUS
+        "WARNING: This is just a warning for publishing release.
       You are building GPU version without supporting different architectures.
       So the wheel package may fail on other GPU architectures.
       You can add -DCUDA_ARCH_NAME=All in cmake command
       to get a full wheel package to resolve this warning.
       While, this version will still work on local GPU architecture.")
     detect_installed_gpus(cuda_arch_bin)
-  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
+  else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
     set(cuda_arch_bin ${CUDA_ARCH_BIN})
   endif()
 
   if(NEW_RELEASE_JIT)
-      set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
-      set(cuda_arch_bin "")
+    set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
+    set(cuda_arch_bin "")
   endif()
 
   # remove dots and convert to lists
   string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
   string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
   string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
-  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+  string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
 
   list(REMOVE_DUPLICATES cuda_arch_bin)
   list(REMOVE_DUPLICATES cuda_arch_ptx)
@@ -172,7 +207,8 @@ function(select_nvcc_arch_flags out_variable)
   foreach(arch ${cuda_arch_bin})
     if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
       # User explicitly specified PTX for the concrete BIN
-      string(APPEND nvcc_flags " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}")
+      string(APPEND nvcc_flags
+             " -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}")
       string(APPEND nvcc_archs_readable " sm_${CMAKE_MATCH_1}")
     else()
       # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
@@ -183,34 +219,39 @@ function(select_nvcc_arch_flags out_variable)
 
   # Tell NVCC to add PTX intermediate code for the specified architectures
   foreach(arch ${cuda_arch_ptx})
-    string(APPEND nvcc_flags " -gencode arch=compute_${arch},code=compute_${arch}")
+    string(APPEND nvcc_flags
+           " -gencode arch=compute_${arch},code=compute_${arch}")
     string(APPEND nvcc_archs_readable " compute_${arch}")
   endforeach()
 
   string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
-  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
-  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+  set(${out_variable}
+      ${nvcc_flags}
+      PARENT_SCOPE)
+  set(${out_variable}_readable
+      ${nvcc_archs_readable}
+      PARENT_SCOPE)
 endfunction()
 
 message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION})
-if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
+if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
   set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()
 
-if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
   add_definitions("-DTRT_PLUGIN_FP16_AVALIABLE")
 endif()
 
@@ -231,7 +272,7 @@ set(CMAKE_CUDA_STANDARD 14)
 
 # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w
 # So replace /W[1-4] with /W0
-if (WIN32)
+if(WIN32)
   string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
 endif(WIN32)
 # in cuda9, suppress cuda warning on eigen
@@ -242,15 +283,16 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
 
 if(WIN32)
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"")
+  set(CMAKE_CUDA_FLAGS
+      "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj")
   if(MSVC_STATIC_CRT)
     foreach(flag_var
-        CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE
-        CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO)
-        if(${flag_var} MATCHES "-MD")
-            string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}")
-        endif()
+            CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE
+            CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "-MD")
+        string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}")
+      endif()
     endforeach(flag_var)
   endif()
 endif()
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index c82847100abef..2e5131d217a50 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -1,107 +1,113 @@
 if(NOT WITH_GPU)
-    return()
+  return()
 endif()
 
 if(WIN32)
-    set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+  set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
 else(WIN32)
-    set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
+  set(CUDNN_ROOT
+      "/usr"
+      CACHE PATH "CUDNN ROOT")
 endif(WIN32)
 
-find_path(CUDNN_INCLUDE_DIR cudnn.h
-    PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
-    $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
-    NO_DEFAULT_PATH
-)
+find_path(
+  CUDNN_INCLUDE_DIR cudnn.h
+  PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT}
+        $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
+  NO_DEFAULT_PATH)
 
 get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
 
 set(TARGET_ARCH "x86_64")
 if(NOT ${CMAKE_SYSTEM_PROCESSOR})
-    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 endif()
 
-list(APPEND CUDNN_CHECK_LIBRARY_DIRS
-    ${CUDNN_ROOT}
-    ${CUDNN_ROOT}/lib64
-    ${CUDNN_ROOT}/lib
-    ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
-    ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
-    $ENV{CUDNN_ROOT}
-    $ENV{CUDNN_ROOT}/lib64
-    $ENV{CUDNN_ROOT}/lib
-    /usr/lib
-	${CUDA_TOOLKIT_ROOT_DIR}
-	${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
-	)
+list(
+  APPEND
+  CUDNN_CHECK_LIBRARY_DIRS
+  ${CUDNN_ROOT}
+  ${CUDNN_ROOT}/lib64
+  ${CUDNN_ROOT}/lib
+  ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+  ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
+  $ENV{CUDNN_ROOT}
+  $ENV{CUDNN_ROOT}/lib64
+  $ENV{CUDNN_ROOT}/lib
+  /usr/lib
+  ${CUDA_TOOLKIT_ROOT_DIR}
+  ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
 set(CUDNN_LIB_NAME "")
 
-if (LINUX)
-    set(CUDNN_LIB_NAME "libcudnn.so")
+if(LINUX)
+  set(CUDNN_LIB_NAME "libcudnn.so")
 endif(LINUX)
 
 if(WIN32)
-    # only support cudnn7
-    set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
+  # Only cuDNN 7 is supported on Windows.
+  set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
 endif(WIN32)
 
 if(APPLE)
-    set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
+  set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
 endif(APPLE)
 
-find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
-    PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
-          NO_DEFAULT_PATH
-    DOC "Path to cuDNN library.")
-
+find_library(
+  CUDNN_LIBRARY
+  NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
+  PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
+  NO_DEFAULT_PATH
+  DOC "Path to cuDNN library.")
 
 if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY)
-    set(CUDNN_FOUND ON)
+  set(CUDNN_FOUND ON)
 else()
-    set(CUDNN_FOUND OFF)
+  set(CUDNN_FOUND OFF)
 endif()
 
-macro(find_cudnn_version cudnn_header_file) 
-    file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS)
-    get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
-
-    string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
-        CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1"
-        CUDNN_VERSION "${CUDNN_VERSION}")
-
-    if("${CUDNN_VERSION}" STREQUAL "2000")
-        message(STATUS "Current cuDNN version is v2. ")
+macro(find_cudnn_version cudnn_header_file)
+  file(READ ${cudnn_header_file} CUDNN_VERSION_FILE_CONTENTS)
+  get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
+
+  string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" CUDNN_VERSION
+               "${CUDNN_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1" CUDNN_VERSION
+                       "${CUDNN_VERSION}")
+
+  if("${CUDNN_VERSION}" STREQUAL "2000")
+    message(STATUS "Current cuDNN version is v2. ")
+  else()
+    string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION
+                 "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1"
+                         CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}")
+    string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION
+                 "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1"
+                         CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}")
+    string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)"
+                 CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1"
+                         CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}")
+
+    if(NOT CUDNN_MAJOR_VERSION)
+      set(CUDNN_VERSION "???")
     else()
-        string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION
-            "${CUDNN_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1"
-            CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}")
-        string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION
-            "${CUDNN_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1"
-            CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}")
-        string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)"
-            CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1"
-            CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}")
-
-        if(NOT CUDNN_MAJOR_VERSION)
-            set(CUDNN_VERSION "???")
-        else()
-            add_definitions("-DCUDNN_MAJOR_VERSION=\"${CUDNN_MAJOR_VERSION}\"")
-            math(EXPR CUDNN_VERSION
-                "${CUDNN_MAJOR_VERSION} * 1000 +
+      add_definitions("-DCUDNN_MAJOR_VERSION=\"${CUDNN_MAJOR_VERSION}\"")
+      math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 +
                  ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
-            message(STATUS "Current cuDNN header is ${cudnn_header_file} "
-              "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ")
-        endif()
+      message(
+        STATUS
+          "Current cuDNN header is ${cudnn_header_file} "
+          "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. "
+      )
     endif()
+  endif()
 endmacro()
 
 if(CUDNN_FOUND)
-  find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h) 
-  if (NOT CUDNN_MAJOR_VERSION) 
-    find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h) 
+  find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn.h)
+  if(NOT CUDNN_MAJOR_VERSION)
+    find_cudnn_version(${CUDNN_INCLUDE_DIR}/cudnn_version.h)
   endif()
 endif()
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
index 2d7b1917b6873..6bf0141c208c7 100644
--- a/cmake/cupti.cmake
+++ b/cmake/cupti.cmake
@@ -1,44 +1,51 @@
 if(NOT WITH_GPU)
-    return()
+  return()
 endif()
 
-
-set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
-find_path(CUPTI_INCLUDE_DIR cupti.h
-        PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
-        $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
+set(CUPTI_ROOT
+    "/usr"
+    CACHE PATH "CUPTI ROOT")
+find_path(
+  CUPTI_INCLUDE_DIR cupti.h
+  PATHS ${CUPTI_ROOT}
+        ${CUPTI_ROOT}/include
+        $ENV{CUPTI_ROOT}
+        $ENV{CUPTI_ROOT}/include
         ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
         ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include
         ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include
-        NO_DEFAULT_PATH
-        )
+  NO_DEFAULT_PATH)
 
 get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
 
 set(TARGET_ARCH "x86_64")
 if(NOT ${CMAKE_SYSTEM_PROCESSOR})
-    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 endif()
 
-list(APPEND CUPTI_CHECK_LIBRARY_DIRS
-        ${CUPTI_ROOT}
-        ${CUPTI_ROOT}/lib64
-        ${CUPTI_ROOT}/lib
-        ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
-        $ENV{CUPTI_ROOT}
-        $ENV{CUPTI_ROOT}/lib64
-        $ENV{CUPTI_ROOT}/lib
-        /usr/lib
-        ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib64
-        ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
-find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
-       PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
-       NO_DEFAULT_PATH
-       DOC "Path to cuPTI library.")
+list(
+  APPEND
+  CUPTI_CHECK_LIBRARY_DIRS
+  ${CUPTI_ROOT}
+  ${CUPTI_ROOT}/lib64
+  ${CUPTI_ROOT}/lib
+  ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+  $ENV{CUPTI_ROOT}
+  $ENV{CUPTI_ROOT}/lib64
+  $ENV{CUPTI_ROOT}/lib
+  /usr/lib
+  ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib64
+  ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
+find_library(
+  CUPTI_LIBRARY
+  NAMES libcupti.so libcupti.dylib # libcupti_static.a
+  PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
+  NO_DEFAULT_PATH
+  DOC "Path to cuPTI library.")
 
 get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
 if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
-    set(CUPTI_FOUND ON)
+  set(CUPTI_FOUND ON)
 else()
-    set(CUPTI_FOUND OFF)
+  set(CUPTI_FOUND OFF)
 endif()
diff --git a/cmake/experimental.cmake b/cmake/experimental.cmake
index 55e7fe263f9dc..0e4b197645673 100644
--- a/cmake/experimental.cmake
+++ b/cmake/experimental.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake
index ef6a51b594b9e..0f0793a8ee32b 100644
--- a/cmake/experiments/cuda_module_loading_lazy.cmake
+++ b/cmake/experiments/cuda_module_loading_lazy.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,25 +16,35 @@
 # cuda moduel lazy loading is supported by CUDA 11.6+
 # this experiment option makes Paddle supports lazy loading before CUDA 11.6.
 
-option(EXP_CUDA_MODULE_LOADING_LAZY  "enable lazy cuda module loading" OFF)
-if (${EXP_CUDA_MODULE_LOADING_LAZY})
-  if (NOT ${ON_INFER} OR NOT ${LINUX})
-    message("EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms")
+option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF)
+if(${EXP_CUDA_MODULE_LOADING_LAZY})
+  if(NOT ${ON_INFER} OR NOT ${LINUX})
+    message(
+      "EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms"
+    )
     return()
-  endif ()
-  if (NOT ${CUDA_FOUND})
+  endif()
+  if(NOT ${CUDA_FOUND})
     message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA")
     return()
-  endif ()
-  if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6")
+  endif()
+  if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6")
     message("cuda 11.6+ already support lazy module loading")
     return()
-  endif ()
+  endif()
 
-  message("for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a")
-  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE)
+  message(
+    "for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a"
+  )
+  set(CUDA_USE_STATIC_CUDA_RUNTIME
+      OFF
+      CACHE BOOL "" FORCE)
   set(CMAKE_CUDA_FLAGS "--cudart shared")
   enable_language(CUDA)
-  set(CUDA_NVCC_EXECUTABLE "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE)
-  set(CMAKE_CUDA_COMPILER "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE)
+  set(CUDA_NVCC_EXECUTABLE
+      "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy"
+      CACHE FILEPATH "" FORCE)
+  set(CMAKE_CUDA_COMPILER
+      "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy"
+      CACHE FILEPATH "" FORCE)
 endif()
diff --git a/cmake/external/arm_brpc.cmake b/cmake/external/arm_brpc.cmake
index 83935ae0c6346..660261d3ffcce 100755
--- a/cmake/external/arm_brpc.cmake
+++ b/cmake/external/arm_brpc.cmake
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
 #find_package(OpenSSL REQUIRED)
 
@@ -25,52 +25,56 @@ INCLUDE(ExternalProject)
 #ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
 #SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})
 
-IF((NOT DEFINED ARM_BRPC_NAME) OR (NOT DEFINED ARM_BRPC_URL))
-  SET(ARM_BRPC_VER "1.1.0" CACHE STRING "" FORCE)
-  SET(ARM_BRPC_NAME "arm_brpc" CACHE STRING "" FORCE)
-ENDIF()
+if((NOT DEFINED ARM_BRPC_NAME) OR (NOT DEFINED ARM_BRPC_URL))
+  set(ARM_BRPC_VER
+      "1.1.0"
+      CACHE STRING "" FORCE)
+  set(ARM_BRPC_NAME
+      "arm_brpc"
+      CACHE STRING "" FORCE)
+endif()
 
-MESSAGE(STATUS "ARM_BRPC_NAME: ${ARM_BRPC_NAME}, ARM_BRPC_URL: ${ARM_BRPC_URL}")
-SET(ARM_BRPC_PREFIX_DIR    "${THIRD_PARTY_PATH}/arm_brpc")
-SET(ARM_BRPC_PROJECT       "extern_arm_brpc")
-SET(ARM_BRPC_DOWNLOAD_DIR  "${ARM_BRPC_PREFIX_DIR}/src/${ARM_BRPC_PROJECT}")
-SET(ARM_BRPC_DST_DIR       "output")
-SET(ARM_BRPC_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(ARM_BRPC_INSTALL_DIR   ${ARM_BRPC_INSTALL_ROOT}/arm_brpc/output)
-SET(ARM_BRPC_ROOT          ${ARM_BRPC_INSTALL_DIR})
-SET(ARM_BRPC_INC_DIR       ${ARM_BRPC_ROOT}/include)
-SET(ARM_BRPC_LIB_DIR       ${ARM_BRPC_ROOT}/lib)
-SET(ARM_BRPC_LIB           ${ARM_BRPC_LIB_DIR}/libbrpc.a)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ARM_BRPC_ROOT}/lib")
+message(STATUS "ARM_BRPC_NAME: ${ARM_BRPC_NAME}, ARM_BRPC_URL: ${ARM_BRPC_URL}")
+set(ARM_BRPC_PREFIX_DIR "${THIRD_PARTY_PATH}/arm_brpc")
+set(ARM_BRPC_PROJECT "extern_arm_brpc")
+set(ARM_BRPC_DOWNLOAD_DIR "${ARM_BRPC_PREFIX_DIR}/src/${ARM_BRPC_PROJECT}")
+set(ARM_BRPC_DST_DIR "output")
+set(ARM_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
+set(ARM_BRPC_INSTALL_DIR ${ARM_BRPC_INSTALL_ROOT}/arm_brpc/output)
+set(ARM_BRPC_ROOT ${ARM_BRPC_INSTALL_DIR})
+set(ARM_BRPC_INC_DIR ${ARM_BRPC_ROOT}/include)
+set(ARM_BRPC_LIB_DIR ${ARM_BRPC_ROOT}/lib)
+set(ARM_BRPC_LIB ${ARM_BRPC_LIB_DIR}/libbrpc.a)
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ARM_BRPC_ROOT}/lib")
 
-INCLUDE_DIRECTORIES(${ARM_BRPC_INSTALL_ROOT}/${ARM_BRPC_NAME}/output/include)
+include_directories(${ARM_BRPC_INSTALL_ROOT}/${ARM_BRPC_NAME}/output/include)
 
-FILE(WRITE ${ARM_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(ARM_BRPC)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
+file(
+  WRITE ${ARM_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(ARM_BRPC)\n" "cmake_minimum_required(VERSION 3.0)\n"
   "install(DIRECTORY ${ARM_BRPC_DST_DIR} ${ARM_BRPC_DST_DIR} \n"
   "        DESTINATION ${ARM_BRPC_NAME})\n")
-  
-SET(ARM_BRPC_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/output.tar.gz" CACHE STRING "" FORCE)
+
+set(ARM_BRPC_URL
+    "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/output.tar.gz"
+    CACHE STRING "" FORCE)
 ExternalProject_Add(
-    ${ARM_BRPC_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${ARM_BRPC_PREFIX_DIR}
-    DOWNLOAD_DIR          ${ARM_BRPC_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      rm -rf output.tar.gz
-                          && wget --no-check-certificate ${ARM_BRPC_URL}
-                          && tar zxvf output.tar.gz
-    #DOWNLOAD_COMMAND      cp /home/wangbin44/Paddle/build/output.tar.gz . 
-    #                      && tar zxvf output.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${ARM_BRPC_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${ARM_BRPC_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS      ${ARM_BRPC_LIB}
-)
+  ${ARM_BRPC_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${ARM_BRPC_PREFIX_DIR}
+  DOWNLOAD_DIR ${ARM_BRPC_DOWNLOAD_DIR}
+  DOWNLOAD_COMMAND rm -rf output.tar.gz && wget --no-check-certificate
+                   ${ARM_BRPC_URL} && tar zxvf output.tar.gz
+  #DOWNLOAD_COMMAND      cp /home/wangbin44/Paddle/build/output.tar.gz .
+  #                      && tar zxvf output.tar.gz
+  DOWNLOAD_NO_PROGRESS 1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ARM_BRPC_INSTALL_ROOT}
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ARM_BRPC_INSTALL_ROOT}
+                   -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${ARM_BRPC_LIB})
 
-ADD_LIBRARY(arm_brpc STATIC IMPORTED GLOBAL)  # 直接导入已经生成的库
-SET_PROPERTY(TARGET arm_brpc PROPERTY IMPORTED_LOCATION ${ARM_BRPC_LIB})
-ADD_DEPENDENCIES(arm_brpc ${ARM_BRPC_PROJECT})
+add_library(arm_brpc STATIC IMPORTED GLOBAL) # Directly import the prebuilt library
+set_property(TARGET arm_brpc PROPERTY IMPORTED_LOCATION ${ARM_BRPC_LIB})
+add_dependencies(arm_brpc ${ARM_BRPC_PROJECT})
diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index d02f47142e775..3dbe7e6e8aa90 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -12,21 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 #NOTE: Logic is from
 # https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
 if(DEFINED ENV{ASCEND_CUSTOM_PATH})
-    set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
+  set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
 else()
-    set(ASCEND_DIR /usr/local/Ascend)
+  set(ASCEND_DIR /usr/local/Ascend)
 endif()
 
-if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
+if(EXISTS
+   ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
   # It means CANN 20.2 +
   add_definitions(-DPADDLE_WITH_ASCEND_STRING)
 endif()
 
-
 if(WITH_ASCEND OR WITH_ASCEND_CL)
   set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
   set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
@@ -36,28 +35,32 @@ if(WITH_ASCEND OR WITH_ASCEND_CL)
   set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
   set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
 
-  set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
+  set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR}
+                             ${ASCEND_ATC_DIR})
   set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
   set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
-  set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ATLAS_RUNTIME_INC_DIR
+      ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
   set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
   set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
-  set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
+  set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR}
+                            ${ATLAS_ATC_DIR})
 
   set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
   set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
   set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
-  INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
-
+  include_directories(${ATLAS_RUNTIME_INC_DIR})
 
-  ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
+  add_library(ascend_ge SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION
+                                         ${atlas_ge_runner_lib})
 
-  ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
+  add_library(ascend_graph SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION
+                                            ${atlas_graph_lib})
 
-  ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
+  add_library(atlas_acl SHARED IMPORTED GLOBAL)
+  set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
 
   add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
 endif()
@@ -73,52 +76,60 @@ if(WITH_ASCEND_CL)
 
   message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
   message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
-  INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
-  INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
+  include_directories(${FWKACLLIB_INC_DIR})
+  include_directories(${ACLLIB_INC_DIR})
 
-  ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
+  add_library(ascendcl SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
 
-  ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
+  add_library(ascend_hccl SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
 
-  ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
+  add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
+  set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION
+                                               ${acl_op_compiler_lib})
   add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
 endif()
 
-if (WITH_ASCEND_CL)
-macro(find_ascend_toolkit_version ascend_toolkit_version_info) 
+if(WITH_ASCEND_CL)
+  macro(find_ascend_toolkit_version ascend_toolkit_version_info)
     file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
-    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
-    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
-    string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
-    STRING(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION)
+    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)"
+                 ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
+    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1"
+                         ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
+    string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION
+                         ${ASCEND_TOOLKIT_VERSION})
+    string(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION)
     add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
     if(NOT ASCEND_TOOLKIT_VERSION)
-        set(ASCEND_TOOLKIT_VERSION "???")
+      set(ASCEND_TOOLKIT_VERSION "???")
     else()
-        message(STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
+      message(
+        STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
     endif()
-endmacro()
+  endmacro()
 
-macro(find_ascend_driver_version ascend_driver_version_info) 
+  macro(find_ascend_driver_version ascend_driver_version_info)
     file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS)
-    string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION_CONTENTS}")
-    string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
+    string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION
+                 "${ASCEND_DRIVER_VERSION_CONTENTS}")
+    string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1"
+                         ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
     if(NOT ASCEND_DRIVER_VERSION)
-        set(ASCEND_DRIVER_VERSION "???")
+      set(ASCEND_DRIVER_VERSION "???")
     else()
-        message(STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
+      message(
+        STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
     endif()
-endmacro()
+  endmacro()
 
-if (WITH_ARM)
-  set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
-else()
-  set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
-endif()
+  if(WITH_ARM)
+    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
+  else()
+    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
+  endif()
 
-find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
-find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
+  find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
+  find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
 endif()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index e47b608341bee..810796831e23e 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -14,7 +14,7 @@
 
 include(ExternalProject)
 
-set(BOOST_PROJECT       "extern_boost")
+set(BOOST_PROJECT "extern_boost")
 # To release PaddlePaddle as a pip package, we have to follow the
 # manylinux1 standard, which features as old Linux kernels and
 # compilers as possible and recommends CentOS 5. Indeed, the earliest
@@ -22,36 +22,41 @@ set(BOOST_PROJECT       "extern_boost")
 # version of boost, say, 1.66.0, doesn't build on CentOS 6.  We
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
-set(BOOST_VER   "1.41.0")
+set(BOOST_VER "1.41.0")
 # boost_1_41_0_2021_10.tar.gz is almost the same with boost_1_41_0.tar.gz,
 # except in visualc.hpp i comment a warning of "unknown compiler version",
 # so if you need to change boost, you may need to block the warning similarly.
-set(BOOST_TAR   "boost_1_41_0_2021_10" CACHE STRING "" FORCE)
-set(BOOST_URL   "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+set(BOOST_TAR
+    "boost_1_41_0_2021_10"
+    CACHE STRING "" FORCE)
+set(BOOST_URL
+    "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz"
+    CACHE STRING "" FORCE)
 
-MESSAGE(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}")
+message(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}")
 
 set(BOOST_PREFIX_DIR ${THIRD_PARTY_PATH}/boost)
-set(BOOST_INCLUDE_DIR "${THIRD_PARTY_PATH}/boost/src/extern_boost" CACHE PATH "boost include directory." FORCE)
+set(BOOST_INCLUDE_DIR
+    "${THIRD_PARTY_PATH}/boost/src/extern_boost"
+    CACHE PATH "boost include directory." FORCE)
 set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 include_directories(${BOOST_INCLUDE_DIR})
 
 if(WIN32 AND MSVC_VERSION GREATER_EQUAL 1600)
-    add_definitions(-DBOOST_HAS_STATIC_ASSERT)
+  add_definitions(-DBOOST_HAS_STATIC_ASSERT)
 endif()
 
 ExternalProject_Add(
-    ${BOOST_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                   ${BOOST_URL}
-    URL_MD5               51be7cc203628dc0848e97eee32d79e3
-    PREFIX                ${BOOST_PREFIX_DIR}
-    DOWNLOAD_NO_PROGRESS  1
-    CONFIGURE_COMMAND     ""
-    BUILD_COMMAND         ""
-    INSTALL_COMMAND       ""
-    UPDATE_COMMAND        ""
-    )
+  ${BOOST_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  URL ${BOOST_URL}
+  URL_MD5 51be7cc203628dc0848e97eee32d79e3
+  PREFIX ${BOOST_PREFIX_DIR}
+  DOWNLOAD_NO_PROGRESS 1
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  UPDATE_COMMAND "")
 
 add_library(boost INTERFACE)
 
diff --git a/cmake/external/box_ps.cmake b/cmake/external/box_ps.cmake
index 85e1f94fd2c67..2bb1fe0a0d1b0 100644
--- a/cmake/external/box_ps.cmake
+++ b/cmake/external/box_ps.cmake
@@ -12,48 +12,53 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(BOX_PS_PROJECT       "extern_box_ps")
-IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
-  SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
-  SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
-SET(BOX_PS_SOURCE_DIR    "${THIRD_PARTY_PATH}/box_ps")
-SET(BOX_PS_DOWNLOAD_DIR  "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
-SET(BOX_PS_DST_DIR       "box_ps")
-SET(BOX_PS_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(BOX_PS_INSTALL_DIR   ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
-SET(BOX_PS_ROOT          ${BOX_PS_INSTALL_DIR})
-SET(BOX_PS_INC_DIR       ${BOX_PS_ROOT}/include)
-SET(BOX_PS_LIB_DIR       ${BOX_PS_ROOT}/lib)
-SET(BOX_PS_LIB           ${BOX_PS_LIB_DIR}/libbox_ps.so)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
+set(BOX_PS_PROJECT "extern_box_ps")
+if((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
+  message(STATUS "use pre defined download url")
+  set(BOX_PS_VER
+      "0.1.1"
+      CACHE STRING "" FORCE)
+  set(BOX_PS_NAME
+      "box_ps"
+      CACHE STRING "" FORCE)
+  set(BOX_PS_URL
+      "http://box-ps.gz.bcebos.com/box_ps.tar.gz"
+      CACHE STRING "" FORCE)
+endif()
+message(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
+set(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps")
+set(BOX_PS_DOWNLOAD_DIR "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
+set(BOX_PS_DST_DIR "box_ps")
+set(BOX_PS_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
+set(BOX_PS_INSTALL_DIR ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
+set(BOX_PS_ROOT ${BOX_PS_INSTALL_DIR})
+set(BOX_PS_INC_DIR ${BOX_PS_ROOT}/include)
+set(BOX_PS_LIB_DIR ${BOX_PS_ROOT}/lib)
+set(BOX_PS_LIB ${BOX_PS_LIB_DIR}/libbox_ps.so)
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
 
-INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR})
-FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(BOX_PS)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
+include_directories(${BOX_PS_INC_DIR})
+file(
+  WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(BOX_PS)\n" "cmake_minimum_required(VERSION 3.0)\n"
   "install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n"
   "        DESTINATION ${BOX_PS_DST_DIR})\n")
 ExternalProject_Add(
-    ${BOX_PS_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${BOX_PS_SOURCE_DIR}
-    DOWNLOAD_DIR          ${BOX_PS_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz
-                          && tar zxvf ${BOX_PS_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS      ${BOX_PS_LIB}
-)
-ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
-ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT})
+  ${BOX_PS_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${BOX_PS_SOURCE_DIR}
+  DOWNLOAD_DIR ${BOX_PS_DOWNLOAD_DIR}
+  DOWNLOAD_COMMAND wget --no-check-certificate ${BOX_PS_URL} -c -q -O
+                   ${BOX_PS_NAME}.tar.gz && tar zxvf ${BOX_PS_NAME}.tar.gz
+  DOWNLOAD_NO_PROGRESS 1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
+                   -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${BOX_PS_LIB})
+add_library(box_ps SHARED IMPORTED GLOBAL)
+set_property(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
+add_dependencies(box_ps ${BOX_PS_PROJECT})
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index c891708751aa8..4434e3fbed180 100755
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -12,66 +12,80 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
 find_package(OpenSSL REQUIRED)
 
 message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
 message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY})
 
-ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY})
+add_library(ssl SHARED IMPORTED GLOBAL)
+set_property(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY})
 
-ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})
+add_library(crypto SHARED IMPORTED GLOBAL)
+set_property(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})
 
-SET(BRPC_PREFIX_DIR  ${THIRD_PARTY_PATH}/brpc)
-SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
-SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
-SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE)
+set(BRPC_PREFIX_DIR ${THIRD_PARTY_PATH}/brpc)
+set(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
+set(BRPC_INCLUDE_DIR
+    "${BRPC_INSTALL_DIR}/include"
+    CACHE PATH "brpc include directory." FORCE)
+set(BRPC_LIBRARIES
+    "${BRPC_INSTALL_DIR}/lib/libbrpc.a"
+    CACHE FILEPATH "brpc library." FORCE)
 
-INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
+include_directories(${BRPC_INCLUDE_DIR})
 
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
+set(prefix_path
+    "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog"
+)
 
 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
-        extern_brpc
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        # TODO(gongwb): change to de newst repo when they changed
-        GIT_REPOSITORY  "https://github.com/wangjiawei04/brpc"
-        #GIT_REPOSITORY  "https://github.com/ziyoujiyi/brpc" # ssl error in the previous repo（can be mannual fixed）
-        GIT_TAG         "e203afb794caf027da0f1e0776443e7d20c0c28e"
-        PREFIX          ${BRPC_PREFIX_DIR}
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DCMAKE_PREFIX_PATH=${prefix_path}
-                        -DWITH_GLOG=ON
-                        -DIOBUF_WITH_HUGE_BLOCK=ON
-                        -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-        LIST_SEPARATOR  |
-        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
-                         -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
-                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_BYPRODUCTS ${BRPC_LIBRARIES}
-)
+  extern_brpc
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  # TODO(gongwb): change to the newest repo when they change it
+  GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
+  #GIT_REPOSITORY  "https://github.com/ziyoujiyi/brpc" # ssl error in the previous repo (can be fixed manually)
+  GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e"
+  PREFIX ${BRPC_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+             -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+             -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
+             -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             -DCMAKE_PREFIX_PATH=${prefix_path}
+             -DWITH_GLOG=ON
+             -DIOBUF_WITH_HUGE_BLOCK=ON
+             -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
+             ${EXTERNAL_OPTIONAL_ARGS}
+  LIST_SEPARATOR |
+  CMAKE_CACHE_ARGS
+    -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
+    -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${BRPC_LIBRARIES})
 
 # ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
-ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog snappy)
-ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
-ADD_DEPENDENCIES(brpc extern_brpc)
+add_dependencies(
+  extern_brpc
+  protobuf
+  ssl
+  crypto
+  leveldb
+  gflags
+  glog
+  snappy)
+add_library(brpc STATIC IMPORTED GLOBAL)
+set_property(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
+add_dependencies(brpc extern_brpc)
 
 add_definitions(-DBRPC_WITH_GLOG)
 
-LIST(APPEND external_project_dependencies brpc)
+list(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake
index 2ec9a3faa07b7..5dd84657c8605 100644
--- a/cmake/external/cinn.cmake
+++ b/cmake/external/cinn.cmake
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if (NOT WITH_CINN)
+if(NOT WITH_CINN)
   return()
 endif()
 
@@ -27,36 +27,33 @@ add_definitions(-w)
 include(ExternalProject)
 set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN)
 set(CINN_GIT_TAG release/v0.2)
-set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION}
-                       -DWITH_CUDA=${WITH_GPU}
-                       -DWITH_CUDNN=${WITH_GPU}
-                       -DWITH_MKL_CBLAS=${WITH_MKL}
-                       -DWITH_MKLDNN=${WITH_MKL}
-                       -DPUBLISH_LIBS=ON
-                       -DWITH_TESTING=ON
-)
+set(CINN_OPTIONAL_ARGS
+    -DPY_VERSION=${PY_VERSION}
+    -DWITH_CUDA=${WITH_GPU}
+    -DWITH_CUDNN=${WITH_GPU}
+    -DWITH_MKL_CBLAS=${WITH_MKL}
+    -DWITH_MKLDNN=${WITH_MKL}
+    -DPUBLISH_LIBS=ON
+    -DWITH_TESTING=ON)
 set(CINN_BUILD_COMMAND $(MAKE) cinnapi -j)
 ExternalProject_Add(
   external_cinn
   ${EXTERNAL_PROJECT_LOG_ARGS}
-  GIT_REPOSITORY   "${GIT_URL}/PaddlePaddle/CINN.git"
-  GIT_TAG          ${CINN_GIT_TAG}
-  PREFIX           ${CINN_PREFIX_DIR}
-  BUILD_COMMAND    ${CINN_BUILD_COMMAND}
-  INSTALL_COMMAND  ""
-  CMAKE_ARGS       ${CINN_OPTIONAL_ARGS})
+  GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git"
+  GIT_TAG ${CINN_GIT_TAG}
+  PREFIX ${CINN_PREFIX_DIR}
+  BUILD_COMMAND ${CINN_BUILD_COMMAND}
+  INSTALL_COMMAND ""
+  CMAKE_ARGS ${CINN_OPTIONAL_ARGS})
 
-
-
-ExternalProject_Get_property(external_cinn BINARY_DIR)
-ExternalProject_Get_property(external_cinn SOURCE_DIR)
+ExternalProject_Get_Property(external_cinn BINARY_DIR)
+ExternalProject_Get_Property(external_cinn SOURCE_DIR)
 set(CINN_BINARY_DIR ${BINARY_DIR})
 set(CINN_SOURCE_DIR ${SOURCE_DIR})
 
 message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}")
 message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}")
 
-
 ######################################
 # Add CINN's dependencies header files
 ######################################
@@ -82,6 +79,7 @@ set(CINN_LIB_LOCATION "${CINN_BINARY_DIR}/dist/cinn/lib")
 set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include")
 
 add_library(cinn SHARED IMPORTED GLOBAL)
-set_target_properties(cinn PROPERTIES IMPORTED_LOCATION "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}")
+set_target_properties(cinn PROPERTIES IMPORTED_LOCATION
+                                      "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}")
 include_directories(${CINN_INCLUDE_DIR})
 add_dependencies(cinn external_cinn)
diff --git a/cmake/external/concurrentqueue.cmake b/cmake/external/concurrentqueue.cmake
index 9e4331ae6fdea..0ff3612efed4b 100644
--- a/cmake/external/concurrentqueue.cmake
+++ b/cmake/external/concurrentqueue.cmake
@@ -16,27 +16,32 @@ include(ExternalProject)
 
 set(CONCURRENTQUEUE_PROJECT "extern_concurrentqueue")
 set(CONCURRENTQUEUE_VER "v1.0.3")
-SET(CONCURRENTQUEUE_URL_MD5 118e5bb661b567634647312991e10222)
-set(CONCURRENTQUEUE_PREFIX_URL "https://github.com/cameron314/concurrentqueue/archive/refs/tags")
-set(CONCURRENTQUEUE_URL "${CONCURRENTQUEUE_PREFIX_URL}/${CONCURRENTQUEUE_VER}.tar.gz")
+set(CONCURRENTQUEUE_URL_MD5 118e5bb661b567634647312991e10222)
+set(CONCURRENTQUEUE_PREFIX_URL
+    "https://github.com/cameron314/concurrentqueue/archive/refs/tags")
+set(CONCURRENTQUEUE_URL
+    "${CONCURRENTQUEUE_PREFIX_URL}/${CONCURRENTQUEUE_VER}.tar.gz")
 
-MESSAGE(STATUS "CONCURRENTQUEUE_VERSION: ${CONCURRENTQUEUE_VER}, CONCURRENTQUEUE_URL: ${CONCURRENTQUEUE_URL}")
+message(
+  STATUS
+    "CONCURRENTQUEUE_VERSION: ${CONCURRENTQUEUE_VER}, CONCURRENTQUEUE_URL: ${CONCURRENTQUEUE_URL}"
+)
 
 set(CONCURRENTQUEUE_PREFIX_DIR ${THIRD_PARTY_PATH}/concurrentqueue)
 set(CONCURRENTQUEUE_SOURCE_DIR ${THIRD_PARTY_PATH}/concurrentqueue/src/)
-set(CONCURRENTQUEUE_INCLUDE_DIR "${CONCURRENTQUEUE_SOURCE_DIR}/extern_concurrentqueue")
+set(CONCURRENTQUEUE_INCLUDE_DIR
+    "${CONCURRENTQUEUE_SOURCE_DIR}/extern_concurrentqueue")
 
 ExternalProject_Add(
-    ${CONCURRENTQUEUE_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                   ${CONCURRENTQUEUE_URL}
-    URL_MD5               ${CONCURRENTQUEUE_URL_MD5}
-    PREFIX                ${CONCURRENTQUEUE_PREFIX_DIR}
-    DOWNLOAD_NO_PROGRESS  1
-    CONFIGURE_COMMAND     ""
-    BUILD_COMMAND         ""
-    INSTALL_COMMAND       ""
-    UPDATE_COMMAND        ""
-    )
+  ${CONCURRENTQUEUE_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  URL ${CONCURRENTQUEUE_URL}
+  URL_MD5 ${CONCURRENTQUEUE_URL_MD5}
+  PREFIX ${CONCURRENTQUEUE_PREFIX_DIR}
+  DOWNLOAD_NO_PROGRESS 1
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  UPDATE_COMMAND "")
 
 include_directories(${CONCURRENTQUEUE_INCLUDE_DIR})
diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake
index 27a013c1763a7..ff4d3b5c9ea9e 100644
--- a/cmake/external/cryptopp.cmake
+++ b/cmake/external/cryptopp.cmake
@@ -12,68 +12,77 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(CRYPTOPP_PREFIX_DIR  ${THIRD_PARTY_PATH}/cryptopp)
-SET(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp)
-SET(CRYPTOPP_INCLUDE_DIR "${CRYPTOPP_INSTALL_DIR}/include" CACHE PATH "cryptopp include directory." FORCE)
-SET(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git)
-SET(CRYPTOPP_TAG        CRYPTOPP_8_2_0)
+set(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp)
+set(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp)
+set(CRYPTOPP_INCLUDE_DIR
+    "${CRYPTOPP_INSTALL_DIR}/include"
+    CACHE PATH "cryptopp include directory." FORCE)
+set(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git)
+set(CRYPTOPP_TAG CRYPTOPP_8_2_0)
 
-IF(WIN32)
-  SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
+if(WIN32)
+  set(CRYPTOPP_LIBRARIES
+      "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib"
+      CACHE FILEPATH "cryptopp library." FORCE)
   # There is a compilation parameter "/FI\"winapifamily.h\"" or "/FIwinapifamily.h" can't be used correctly
   # with Ninja on Windows. The only difference between the patch file and original
   # file is that the compilation parameters are changed to '/nologo'. This
   # patch command can be removed when upgrading to a higher version.
   if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-    set(CRYPTOPP_PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "<SOURCE_DIR>/")
+    set(CRYPTOPP_PATCH_COMMAND
+        ${CMAKE_COMMAND} -E copy_if_different
+        "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "<SOURCE_DIR>/")
   endif()
-ELSE(WIN32)
-  SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
-ENDIF(WIN32)
+else(WIN32)
+  set(CRYPTOPP_LIBRARIES
+      "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a"
+      CACHE FILEPATH "cryptopp library." FORCE)
+endif(WIN32)
 
-IF(APPLE AND WITH_ARM)
-  SET(CMAKE_CXX_FLAGS "-DCRYPTOPP_ARM_CRC32_AVAILABLE=0")
-ENDIF()
+if(APPLE AND WITH_ARM)
+  set(CMAKE_CXX_FLAGS "-DCRYPTOPP_ARM_CRC32_AVAILABLE=0")
+endif()
 
-set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
-                        -DBUILD_SHARED=ON
-                        -DBUILD_STATIC=ON
-                        -DBUILD_TESTING=OFF
-                        -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
-                        -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-)
+set(CRYPTOPP_CMAKE_ARGS
+    ${COMMON_CMAKE_ARGS}
+    -DBUILD_SHARED=ON
+    -DBUILD_STATIC=ON
+    -DBUILD_TESTING=OFF
+    -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
+    -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
+    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})
 
-INCLUDE_DIRECTORIES(${CRYPTOPP_INCLUDE_DIR})
+include_directories(${CRYPTOPP_INCLUDE_DIR})
 
 ExternalProject_Add(
-    extern_cryptopp
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY  ${CRYPTOPP_REPOSITORY}
-    GIT_TAG         ${CRYPTOPP_TAG}
-    PREFIX          ${CRYPTOPP_PREFIX_DIR}
-    UPDATE_COMMAND  ""
-    PATCH_COMMAND
-    COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
-    COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"
-    COMMAND cd "<SOURCE_DIR>/cmake" && git checkout tags/${CRYPTOPP_TAG} -b ${CRYPTOPP_TAG}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "<SOURCE_DIR>/cmake/" "<SOURCE_DIR>/"
-    COMMAND ${CRYPTOPP_PATCH_COMMAND}
-    INSTALL_DIR     ${CRYPTOPP_INSTALL_DIR}
-    CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CRYPTOPP_INSTALL_DIR}
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS ${CRYPTOPP_LIBRARIES}
-)
+  extern_cryptopp
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${CRYPTOPP_REPOSITORY}
+  GIT_TAG ${CRYPTOPP_TAG}
+  PREFIX ${CRYPTOPP_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  PATCH_COMMAND
+  COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
+  COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"
+  COMMAND cd "<SOURCE_DIR>/cmake" && git checkout tags/${CRYPTOPP_TAG} -b
+          ${CRYPTOPP_TAG}
+  COMMAND ${CMAKE_COMMAND} -E copy_directory "<SOURCE_DIR>/cmake/"
+          "<SOURCE_DIR>/"
+  COMMAND ${CRYPTOPP_PATCH_COMMAND}
+  INSTALL_DIR ${CRYPTOPP_INSTALL_DIR}
+  CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_INSTALL_PREFIX:PATH=${CRYPTOPP_INSTALL_DIR}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${CRYPTOPP_LIBRARIES})
 
-ADD_LIBRARY(cryptopp STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET cryptopp PROPERTY IMPORTED_LOCATION ${CRYPTOPP_LIBRARIES})
-ADD_DEPENDENCIES(cryptopp extern_cryptopp)
+add_library(cryptopp STATIC IMPORTED GLOBAL)
+set_property(TARGET cryptopp PROPERTY IMPORTED_LOCATION ${CRYPTOPP_LIBRARIES})
+add_dependencies(cryptopp extern_cryptopp)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index f263086e8bef8..04fad252dac88 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -14,32 +14,32 @@
 
 include(ExternalProject)
 
-# Note(zhouwei): extern_cub  has code __FILE_, If the path of extern_cub is changed, 
-# it will effect about 30+ cu files sccache hit and slow compile speed  on windows. 
+# Note(zhouwei): extern_cub has code using __FILE__. If the path of extern_cub is changed,
+# it will affect the sccache hits of about 30+ cu files and slow compile speed on Windows.
 # Therefore, a fixed CUB_PATH will be input to increase the sccache hit rate.
-set(CUB_PATH        "${THIRD_PARTY_PATH}/cub" CACHE STRING "A path setting for external_cub path.")
-set(CUB_PREFIX_DIR  ${CUB_PATH})
+set(CUB_PATH
+    "${THIRD_PARTY_PATH}/cub"
+    CACHE STRING "A path setting for external_cub path.")
+set(CUB_PREFIX_DIR ${CUB_PATH})
 
-set(CUB_REPOSITORY  ${GIT_URL}/NVlabs/cub.git)
-set(CUB_TAG         1.8.0)
+set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git)
+set(CUB_TAG 1.8.0)
 
-SET(CUB_INCLUDE_DIR  ${CUB_PREFIX_DIR}/src/extern_cub)
+set(CUB_INCLUDE_DIR ${CUB_PREFIX_DIR}/src/extern_cub)
 message("CUB_INCLUDE_DIR is ${CUB_INCLUDE_DIR}")
 include_directories(${CUB_INCLUDE_DIR})
 
 ExternalProject_Add(
   extern_cub
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  ${SHALLOW_CLONE}
-  GIT_REPOSITORY  ${CUB_REPOSITORY}
-  GIT_TAG         ${CUB_TAG}
-  PREFIX          ${CUB_PREFIX_DIR}
-  UPDATE_COMMAND    ""
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${CUB_REPOSITORY}
+  GIT_TAG ${CUB_TAG}
+  PREFIX ${CUB_PREFIX_DIR}
+  UPDATE_COMMAND ""
   CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
 
 add_library(cub INTERFACE)
 
diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake
index 711d6c5b10aac..9c22ee89d48ea 100644
--- a/cmake/external/dgc.cmake
+++ b/cmake/external/dgc.cmake
@@ -12,32 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(DGC_PREFIX_DIR  "${THIRD_PARTY_PATH}/dgc")
-SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
-SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
-SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
-SET(DGC_LIBRARIES   "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
-SET(DGC_URL         "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
-INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
+set(DGC_PREFIX_DIR "${THIRD_PARTY_PATH}/dgc")
+set(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
+set(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
+set(DGC_INCLUDE_DIR
+    "${DGC_INSTALL_DIR}/include"
+    CACHE PATH "dgc include directory." FORCE)
+set(DGC_LIBRARIES
+    "${DGC_INSTALL_DIR}/lib/libdgc.a"
+    CACHE FILEPATH "dgc library." FORCE)
+set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
+include_directories(${DGC_INCLUDE_DIR})
 
 ExternalProject_Add(
-    extern_dgc
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL             ${DGC_URL}
-    URL_MD5         "94e6fa1bc97169d0e1aad44570fe3251"
-    PREFIX          "${DGC_PREFIX_DIR}"
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND make -j $(nproc)
-    INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/  ${DGC_INCLUDE_DIR}/dgc
-        && cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES}
-        && cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
-    BUILD_IN_SOURCE 1
-    BUILD_BYPRODUCTS ${DGC_LIBRARIES}
-)
-
-ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
-ADD_DEPENDENCIES(dgc extern_dgc)
+  extern_dgc
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  URL ${DGC_URL}
+  URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
+  PREFIX "${DGC_PREFIX_DIR}"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND make -j $(nproc)
+  INSTALL_COMMAND
+    mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc && cp
+    ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES} && cp
+    ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
+  BUILD_IN_SOURCE 1
+  BUILD_BYPRODUCTS ${DGC_LIBRARIES})
 
+add_library(dgc STATIC IMPORTED GLOBAL)
+set_property(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
+add_dependencies(dgc extern_dgc)
diff --git a/cmake/external/dirent.cmake b/cmake/external/dirent.cmake
index 59caa43741595..51d8eaac29e7d 100644
--- a/cmake/external/dirent.cmake
+++ b/cmake/external/dirent.cmake
@@ -15,30 +15,28 @@
 # Note(chenxin33): dirent.h is only exist in Linux, so get it from github when build in windows.
 # use dirent tag v1.23.2 on 09/05//2018 https://github.com/tronkko/dirent.git
 
-INCLUDE (ExternalProject)
+include(ExternalProject)
 
-SET(DIRENT_PREFIX_DIR       ${THIRD_PARTY_PATH}/dirent)
-SET(DIRENT_INCLUDE_DIR      ${THIRD_PARTY_PATH}/dirent/src/extern_dirent/include)
+set(DIRENT_PREFIX_DIR ${THIRD_PARTY_PATH}/dirent)
+set(DIRENT_INCLUDE_DIR ${THIRD_PARTY_PATH}/dirent/src/extern_dirent/include)
 
 include_directories(${DIRENT_INCLUDE_DIR})
 
-set(DIRENT_REPOSITORY  ${GIT_URL}/tronkko/dirent)
-set(DIRENT_TAG         1.23.2)
+set(DIRENT_REPOSITORY ${GIT_URL}/tronkko/dirent)
+set(DIRENT_TAG 1.23.2)
 
 ExternalProject_Add(
   extern_dirent
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  ${SHALLOW_CLONE}
-  GIT_REPOSITORY    ${DIRENT_REPOSITORY}
-  GIT_TAG           ${DIRENT_TAG}
-  PREFIX            ${DIRENT_PREFIX_DIR}
-  UPDATE_COMMAND    ""
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${DIRENT_REPOSITORY}
+  GIT_TAG ${DIRENT_TAG}
+  PREFIX ${DIRENT_PREFIX_DIR}
+  UPDATE_COMMAND ""
   CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
 
 add_library(dirent INTERFACE)
 
-add_dependencies(dirent extern_dirent)
\ No newline at end of file
+add_dependencies(dirent extern_dirent)
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
index 1aeea752e6678..727202a434683 100644
--- a/cmake/external/dlpack.cmake
+++ b/cmake/external/dlpack.cmake
@@ -17,24 +17,22 @@ include(ExternalProject)
 set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack)
 
 set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git)
-set(DLPACK_TAG        v0.4)
+set(DLPACK_TAG v0.4)
 
-set(DLPACK_INCLUDE_DIR  ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack/include)
+set(DLPACK_INCLUDE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack/include)
 include_directories(${DLPACK_INCLUDE_DIR})
 
 ExternalProject_Add(
   extern_dlpack
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  ${SHALLOW_CLONE}
-  GIT_REPOSITORY    ${DLPACK_REPOSITORY}
-  GIT_TAG           ${DLPACK_TAG}
-  PREFIX            ${DLPACK_PREFIX_DIR}
-  UPDATE_COMMAND    ""
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${DLPACK_REPOSITORY}
+  GIT_TAG ${DLPACK_TAG}
+  PREFIX ${DLPACK_PREFIX_DIR}
+  UPDATE_COMMAND ""
   CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
 
 add_library(dlpack INTERFACE)
 
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index f8bac96b68fa5..443b7aa7d56b7 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -18,39 +18,43 @@ include(ExternalProject)
 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
 set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
 set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
-set(EIGEN_TAG        f612df273689a19d25b45ca4f8269463207c4fee)
+set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee)
 
 if(WIN32)
-    add_definitions(-DEIGEN_STRONG_INLINE=inline)
+  add_definitions(-DEIGEN_STRONG_INLINE=inline)
 elseif(LINUX)
-    if(WITH_ROCM)
-        # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC
-        # which will cause compiler error of using __host__ funciont in __host__ __device__
-        file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src)
-        file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst)
-        file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1)
-        file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1)
-        set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1})
-    endif()
+  if(WITH_ROCM)
+    # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC
+    # which will cause compiler error of using __host__ function in __host__ __device__
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src)
+    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h
+         native_dst)
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h
+         native_src1)
+    file(
+      TO_NATIVE_PATH
+      ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+      native_dst1)
+    set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1}
+                            ${native_dst1})
+  endif()
 endif()
 
 set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
-INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
+include_directories(${EIGEN_INCLUDE_DIR})
 
 ExternalProject_Add(
-    extern_eigen3
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY    ${EIGEN_REPOSITORY}
-    GIT_TAG           ${EIGEN_TAG}
-    PREFIX            ${EIGEN_PREFIX_DIR}
-    UPDATE_COMMAND    ""
-    PATCH_COMMAND     ${EIGEN_PATCH_COMMAND}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
+  extern_eigen3
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${EIGEN_REPOSITORY}
+  GIT_TAG ${EIGEN_TAG}
+  PREFIX ${EIGEN_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
 
 add_library(eigen3 INTERFACE)
 
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 056ff32c8c0d9..783e1c0d442f7 100755
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -12,90 +12,94 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(GFLAGS_PREFIX_DIR  ${THIRD_PARTY_PATH}/gflags)
-SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
-SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
+set(GFLAGS_PREFIX_DIR ${THIRD_PARTY_PATH}/gflags)
+set(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
+set(GFLAGS_INCLUDE_DIR
+    "${GFLAGS_INSTALL_DIR}/include"
+    CACHE PATH "gflags include directory." FORCE)
 set(GFLAGS_REPOSITORY ${GIT_URL}/gflags/gflags.git)
 set(GFLAGS_TAG "v2.2.2")
-IF(WIN32)
-  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
-ELSE(WIN32)
-  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+if(WIN32)
+  set(GFLAGS_LIBRARIES
+      "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib"
+      CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+else(WIN32)
+  set(GFLAGS_LIBRARIES
+      "${GFLAGS_INSTALL_DIR}/lib/libgflags.a"
+      CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
   set(BUILD_COMMAND $(MAKE) --silent)
   set(INSTALL_COMMAND $(MAKE) install)
-ENDIF(WIN32)
+endif(WIN32)
 
-INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
+include_directories(${GFLAGS_INCLUDE_DIR})
 
 if(WITH_ARM_BRPC)
-    SET(ARM_GFLAGS_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_gflags.tar.gz" CACHE STRING "" FORCE)
-    set(GFLAGS_SOURCE_DIR ${THIRD_PARTY_PATH}/gflags/src/extern_gflags)
-    FILE(WRITE ${GFLAGS_SOURCE_DIR}/CMakeLists.txt
-    "PROJECT(ARM_GFLAGS)\n"
-    "cmake_minimum_required(VERSION 3.0)\n"
+  set(ARM_GFLAGS_URL
+      "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_gflags.tar.gz"
+      CACHE STRING "" FORCE)
+  set(GFLAGS_SOURCE_DIR ${THIRD_PARTY_PATH}/gflags/src/extern_gflags)
+  file(
+    WRITE ${GFLAGS_SOURCE_DIR}/CMakeLists.txt
+    "PROJECT(ARM_GFLAGS)\n" "cmake_minimum_required(VERSION 3.0)\n"
     "install(DIRECTORY arm_gflags/bin  arm_gflags/include arm_gflags/lib \n"
     "        DESTINATION . USE_SOURCE_PERMISSIONS)\n")
-    ExternalProject_Add(
-        extern_gflags
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        PREFIX          ${GFLAGS_PREFIX_DIR}
-        DOWNLOAD_DIR          ${GFLAGS_SOURCE_DIR}
-        DOWNLOAD_COMMAND    rm -rf arm_gflags.tar.gz && 
-                            wget --no-check-certificate ${ARM_GFLAGS_URL}
-                            && tar zxvf arm_gflags.tar.gz
-        #DOWNLOAD_COMMAND    cp /home/wangbin44/Paddle/build/arm_gflags.tar.gz .
-        #                    && tar zxvf arm_gflags.tar.gz
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}
-    )
+  ExternalProject_Add(
+    extern_gflags
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    PREFIX ${GFLAGS_PREFIX_DIR}
+    DOWNLOAD_DIR ${GFLAGS_SOURCE_DIR}
+    DOWNLOAD_COMMAND rm -rf arm_gflags.tar.gz && wget --no-check-certificate
+                     ${ARM_GFLAGS_URL} && tar zxvf arm_gflags.tar.gz
+    #DOWNLOAD_COMMAND    cp /home/wangbin44/Paddle/build/arm_gflags.tar.gz .
+    #                    && tar zxvf arm_gflags.tar.gz
+    UPDATE_COMMAND ""
+    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
+               -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES})
 else()
-    ExternalProject_Add(
-        extern_gflags
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        GIT_REPOSITORY  ${GFLAGS_REPOSITORY}
-        GIT_TAG         ${GFLAGS_TAG}
-        PREFIX          ${GFLAGS_PREFIX_DIR}
-        UPDATE_COMMAND  ""
-        BUILD_COMMAND   ${BUILD_COMMAND}
-        INSTALL_COMMAND ${INSTALL_COMMAND}
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                        -DBUILD_STATIC_LIBS=ON
-                        -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DBUILD_TESTING=OFF
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-                        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES}
-    )
+  ExternalProject_Add(
+    extern_gflags
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${GFLAGS_REPOSITORY}
+    GIT_TAG ${GFLAGS_TAG}
+    PREFIX ${GFLAGS_PREFIX_DIR}
+    UPDATE_COMMAND ""
+    BUILD_COMMAND ${BUILD_COMMAND}
+    INSTALL_COMMAND ${INSTALL_COMMAND}
+    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+               -DBUILD_STATIC_LIBS=ON
+               -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+               -DBUILD_TESTING=OFF
+               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+               ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES})
 endif()
 
-ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
-ADD_DEPENDENCIES(gflags extern_gflags)
+add_library(gflags STATIC IMPORTED GLOBAL)
+set_property(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
+add_dependencies(gflags extern_gflags)
 
 # On Windows (including MinGW), the Shlwapi library is used by gflags if available.
-if (WIN32)
+if(WIN32)
   include(CheckIncludeFileCXX)
   check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
-  if (HAVE_SHLWAPI)
+  if(HAVE_SHLWAPI)
     set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
   endif(HAVE_SHLWAPI)
-endif (WIN32)
+endif(WIN32)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index b2f3afdabf415..a9942a6bca67b 100755
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -12,86 +12,90 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(GLOG_PREFIX_DIR  ${THIRD_PARTY_PATH}/glog)
-SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
-SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
-SET(GLOG_REPOSITORY ${GIT_URL}/google/glog.git)
-SET(GLOG_TAG        v0.4.0)
+set(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog)
+set(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
+set(GLOG_INCLUDE_DIR
+    "${GLOG_INSTALL_DIR}/include"
+    CACHE PATH "glog include directory." FORCE)
+set(GLOG_REPOSITORY ${GIT_URL}/google/glog.git)
+set(GLOG_TAG v0.4.0)
 
-IF(WIN32)
-  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE)
-  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
+if(WIN32)
+  set(GLOG_LIBRARIES
+      "${GLOG_INSTALL_DIR}/lib/glog.lib"
+      CACHE FILEPATH "glog library." FORCE)
+  set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-ELSE(WIN32)
-  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
-  SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-ENDIF(WIN32)
+else(WIN32)
+  set(GLOG_LIBRARIES
+      "${GLOG_INSTALL_DIR}/lib/libglog.a"
+      CACHE FILEPATH "glog library." FORCE)
+  set(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif(WIN32)
 
-INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
+include_directories(${GLOG_INCLUDE_DIR})
 
 if(WITH_ARM_BRPC)
-    SET(ARM_GLOG_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_glog.tar.gz" CACHE STRING "" FORCE)
-    set(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog)
-    FILE(WRITE ${GLOG_SOURCE_DIR}/CMakeLists.txt
-    "PROJECT(ARM_GLOGS)\n"
-    "cmake_minimum_required(VERSION 3.0)\n"
+  set(ARM_GLOG_URL
+      "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_glog.tar.gz"
+      CACHE STRING "" FORCE)
+  set(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog)
+  file(
+    WRITE ${GLOG_SOURCE_DIR}/CMakeLists.txt
+    "PROJECT(ARM_GLOGS)\n" "cmake_minimum_required(VERSION 3.0)\n"
     "install(DIRECTORY arm_glog/include arm_glog/lib \n"
     "        DESTINATION . USE_SOURCE_PERMISSIONS)\n")
-    ExternalProject_Add(
-        extern_glog
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        DEPENDS         gflags
-        PREFIX          ${GLOG_PREFIX_DIR}
-        DOWNLOAD_DIR          ${GLOG_SOURCE_DIR}
-        DOWNLOAD_COMMAND    rm -rf arm_glog.tar.gz &&
-                            wget --no-check-certificate ${ARM_GLOG_URL}
-                            && tar zxvf arm_glog.tar.gz
-        #DOWNLOAD_COMMAND    cp /home/wangbin44/Paddle/build/arm_glog.tar.gz .
-        #                    && tar zxvf arm_glog.tar.gz
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_BYPRODUCTS ${GLOG_LIBRARIES}
-    )
+  ExternalProject_Add(
+    extern_glog
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    DEPENDS gflags
+    PREFIX ${GLOG_PREFIX_DIR}
+    DOWNLOAD_DIR ${GLOG_SOURCE_DIR}
+    DOWNLOAD_COMMAND rm -rf arm_glog.tar.gz && wget --no-check-certificate
+                     ${ARM_GLOG_URL} && tar zxvf arm_glog.tar.gz
+    #DOWNLOAD_COMMAND    cp /home/wangbin44/Paddle/build/arm_glog.tar.gz .
+    #                    && tar zxvf arm_glog.tar.gz
+    UPDATE_COMMAND ""
+    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+               -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    BUILD_BYPRODUCTS ${GLOG_LIBRARIES})
 else()
-    ExternalProject_Add(
-        extern_glog
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        GIT_REPOSITORY  ${GLOG_REPOSITORY}
-        GIT_TAG         ${GLOG_TAG}
-        DEPENDS         gflags
-        PREFIX          ${GLOG_PREFIX_DIR}
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                        -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DWITH_GFLAGS=OFF
-                        -DBUILD_TESTING=OFF
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-                        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_BYPRODUCTS ${GLOG_LIBRARIES}
-    )
+  ExternalProject_Add(
+    extern_glog
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${GLOG_REPOSITORY}
+    GIT_TAG ${GLOG_TAG}
+    DEPENDS gflags
+    PREFIX ${GLOG_PREFIX_DIR}
+    UPDATE_COMMAND ""
+    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+               -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
+               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+               -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+               -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
+               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+               -DWITH_GFLAGS=OFF
+               -DBUILD_TESTING=OFF
+               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+               ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+      -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    BUILD_BYPRODUCTS ${GLOG_LIBRARIES})
 endif()
 
-ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
-ADD_DEPENDENCIES(glog extern_glog gflags)
-LINK_LIBRARIES(glog)
+add_library(glog STATIC IMPORTED GLOBAL)
+set_property(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
+add_dependencies(glog extern_glog gflags)
+link_libraries(glog)
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index 778d7c2a0ae29..cd7b254892ed1 100644
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -12,58 +12,65 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(GLOO_PROJECT       "extern_gloo")
-SET(GLOO_PREFIX_DIR    ${THIRD_PARTY_PATH}/gloo)
-SET(GLOO_SOURCE_DIR    ${THIRD_PARTY_PATH}/gloo/src/extern_gloo)
-SET(GLOO_INSTALL_DIR   ${THIRD_PARTY_PATH}/install/gloo)
-SET(GLOO_INCLUDE_DIR   "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE)
-SET(GLOO_LIBRARY_DIR   "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE)
+set(GLOO_PROJECT "extern_gloo")
+set(GLOO_PREFIX_DIR ${THIRD_PARTY_PATH}/gloo)
+set(GLOO_SOURCE_DIR ${THIRD_PARTY_PATH}/gloo/src/extern_gloo)
+set(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo)
+set(GLOO_INCLUDE_DIR
+    "${GLOO_INSTALL_DIR}/include"
+    CACHE PATH "gloo include directory." FORCE)
+set(GLOO_LIBRARY_DIR
+    "${GLOO_INSTALL_DIR}/lib"
+    CACHE PATH "gloo library directory." FORCE)
 # As we add extra features for gloo, we use the non-official repo
-SET(GLOO_REPOSITORY    ${GIT_URL}/sandyhouse/gloo.git)
-SET(GLOO_TAG           v0.0.2)
-SET(GLOO_LIBRARIES     "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE)
+set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
+set(GLOO_TAG v0.0.2)
+set(GLOO_LIBRARIES
+    "${GLOO_INSTALL_DIR}/lib/libgloo.a"
+    CACHE FILEPATH "gloo library." FORCE)
 
-INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR})
+include_directories(${GLOO_INCLUDE_DIR})
 
 if(WITH_ASCEND OR WITH_ASCEND_CL)
   ExternalProject_Add(
-      ${GLOO_PROJECT}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      ${SHALLOW_CLONE}
-      GIT_REPOSITORY        ${GLOO_REPOSITORY}
-      GIT_TAG               ${GLOO_TAG}
-      PREFIX                "${GLOO_PREFIX_DIR}"
-      UPDATE_COMMAND        ""
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         mkdir -p ${GLOO_SOURCE_DIR}/build
-          && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make
-          && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-      INSTALL_COMMAND      ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-      COMMAND              ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
-      BUILD_BYPRODUCTS     ${GLOO_LIBRARIES}
-  )
+    ${GLOO_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${GLOO_REPOSITORY}
+    GIT_TAG ${GLOO_TAG}
+    PREFIX "${GLOO_PREFIX_DIR}"
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND
+      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
+      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make && mkdir -p
+      ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
+                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
+            "${GLOO_INCLUDE_DIR}/gloo"
+    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
 else()
   ExternalProject_Add(
-      ${GLOO_PROJECT}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      ${SHALLOW_CLONE}
-      GIT_REPOSITORY        ${GLOO_REPOSITORY}
-      GIT_TAG               ${GLOO_TAG}
-      PREFIX                "${GLOO_PREFIX_DIR}"
-      UPDATE_COMMAND        ""
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         mkdir -p ${GLOO_SOURCE_DIR}/build
-          && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make
-          && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-      INSTALL_COMMAND      ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-      COMMAND              ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
-      BUILD_BYPRODUCTS     ${GLOO_LIBRARIES}
-  )
+    ${GLOO_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${GLOO_REPOSITORY}
+    GIT_TAG ${GLOO_TAG}
+    PREFIX "${GLOO_PREFIX_DIR}"
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND
+      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
+      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make && mkdir -p
+      ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
+                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
+            "${GLOO_INCLUDE_DIR}/gloo"
+    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
 endif()
 
-
-ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
-ADD_DEPENDENCIES(gloo ${GLOO_PROJECT})
+add_library(gloo STATIC IMPORTED GLOBAL)
+set_property(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
+add_dependencies(gloo ${GLOO_PROJECT})
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 3c740af6e0b3f..00527ceecdc1f 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -14,79 +14,85 @@
 
 #FIXME:(gongwb) Move brpc's gtest dependency.
 
-IF(WITH_TESTING)
-    ENABLE_TESTING()
-ENDIF()
+if(WITH_TESTING)
+  enable_testing()
+endif()
 
-INCLUDE(GNUInstallDirs)
-INCLUDE(ExternalProject)
+include(GNUInstallDirs)
+include(ExternalProject)
 
-SET(GTEST_PREFIX_DIR    ${THIRD_PARTY_PATH}/gtest)
-SET(GTEST_INSTALL_DIR   ${THIRD_PARTY_PATH}/install/gtest)
-SET(GTEST_INCLUDE_DIR   "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
-set(GTEST_REPOSITORY    ${GIT_URL}/google/googletest.git)
-set(GTEST_TAG           release-1.8.1)
+set(GTEST_PREFIX_DIR ${THIRD_PARTY_PATH}/gtest)
+set(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
+set(GTEST_INCLUDE_DIR
+    "${GTEST_INSTALL_DIR}/include"
+    CACHE PATH "gtest include directory." FORCE)
+set(GTEST_REPOSITORY ${GIT_URL}/google/googletest.git)
+set(GTEST_TAG release-1.8.1)
 
-INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
+include_directories(${GTEST_INCLUDE_DIR})
 
-IF(WIN32)
-    set(GTEST_LIBRARIES
-        "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
-    set(GTEST_MAIN_LIBRARIES
-        "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
-    string(REPLACE "/w " "" GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    string(REPLACE "/w " "" GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-    string(REPLACE "/W0 " "" GTEST_CMAKE_C_FLAGS "${GTEST_CMAKE_C_FLAGS}")
-    string(REPLACE "/W0 " "" GTEST_CMAKE_CXX_FLAGS "${GTEST_CMAKE_CXX_FLAGS}")
-ELSE(WIN32)
-    set(GTEST_LIBRARIES
-        "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
-    set(GTEST_MAIN_LIBRARIES
-        "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
-    set(GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    set(GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-ENDIF(WIN32)
+if(WIN32)
+  set(GTEST_LIBRARIES
+      "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib"
+      CACHE FILEPATH "gtest libraries." FORCE)
+  set(GTEST_MAIN_LIBRARIES
+      "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib"
+      CACHE FILEPATH "gtest main libraries." FORCE)
+  string(REPLACE "/w " "" GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+  string(REPLACE "/w " "" GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "/W0 " "" GTEST_CMAKE_C_FLAGS "${GTEST_CMAKE_C_FLAGS}")
+  string(REPLACE "/W0 " "" GTEST_CMAKE_CXX_FLAGS "${GTEST_CMAKE_CXX_FLAGS}")
+else(WIN32)
+  set(GTEST_LIBRARIES
+      "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a"
+      CACHE FILEPATH "gtest libraries." FORCE)
+  set(GTEST_MAIN_LIBRARIES
+      "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a"
+      CACHE FILEPATH "gtest main libraries." FORCE)
+  set(GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+  set(GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endif(WIN32)
 
-IF(WITH_MKLML)
-    # wait for mklml downloading completed
-    SET(GTEST_DEPENDS   ${MKLML_PROJECT})
-ENDIF()
+if(WITH_MKLML)
+  # wait for the mklml download to complete
+  set(GTEST_DEPENDS ${MKLML_PROJECT})
+endif()
 
 ExternalProject_Add(
-    extern_gtest
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY  ${GTEST_REPOSITORY}
-    GIT_TAG         ${GTEST_TAG}
-    DEPENDS         ${GTEST_DEPENDS}
-    PREFIX          ${GTEST_PREFIX_DIR}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}
-                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS}
-                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DBUILD_GMOCK=ON
-                    -Dgtest_disable_pthreads=ON
-                    -Dgtest_force_shared_crt=ON
-                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS ${GTEST_LIBRARIES}
-    BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES}
-)
+  extern_gtest
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${GTEST_REPOSITORY}
+  GIT_TAG ${GTEST_TAG}
+  DEPENDS ${GTEST_DEPENDS}
+  PREFIX ${GTEST_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+             -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DBUILD_GMOCK=ON
+             -Dgtest_disable_pthreads=ON
+             -Dgtest_force_shared_crt=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             ${EXTERNAL_OPTIONAL_ARGS}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${GTEST_LIBRARIES}
+  BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES})
 
-ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
-ADD_DEPENDENCIES(gtest extern_gtest)
+add_library(gtest STATIC IMPORTED GLOBAL)
+set_property(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
+add_dependencies(gtest extern_gtest)
 
-ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
-ADD_DEPENDENCIES(gtest_main extern_gtest)
+add_library(gtest_main STATIC IMPORTED GLOBAL)
+set_property(TARGET gtest_main PROPERTY IMPORTED_LOCATION
+                                        ${GTEST_MAIN_LIBRARIES})
+add_dependencies(gtest_main extern_gtest)
diff --git a/cmake/external/lapack.cmake b/cmake/external/lapack.cmake
index 4cca61681c66c..43305223fe280 100644
--- a/cmake/external/lapack.cmake
+++ b/cmake/external/lapack.cmake
@@ -12,56 +12,68 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE (ExternalProject)
+include(ExternalProject)
 
-SET(LAPACK_PREFIX_DIR       ${THIRD_PARTY_PATH}/lapack)
-SET(LAPACK_SOURCE_DIR       ${THIRD_PARTY_PATH}/lapack/src/extern_lapack)
-SET(LAPACK_INSTALL_DIR      ${THIRD_PARTY_PATH}/install/lapack)
-SET(LAPACK_LIB_DIR          ${LAPACK_INSTALL_DIR}/lib)
+set(LAPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/lapack)
+set(LAPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/lapack/src/extern_lapack)
+set(LAPACK_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lapack)
+set(LAPACK_LIB_DIR ${LAPACK_INSTALL_DIR}/lib)
 
 # Note(zhouwei): lapack need fortan compiler which many machines don't have, so use precompiled library.
 # use lapack tag v3.10.0 on 06/28/2021 https://github.com/Reference-LAPACK/lapack
 if(LINUX)
-    SET(LAPACK_VER  "lapack_lnx_v3.10.0.20210628" CACHE STRING "" FORCE)
-    SET(LAPACK_URL  "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" CACHE STRING "" FORCE)
-    SET(LAPACK_URL_MD5          71f8cc8237a8571692f3e07f9a4f25f6)
-    SET(GNU_RT_LIB_1            "${LAPACK_LIB_DIR}/libquadmath.so.0")
-    SET(GFORTRAN_LIB            "${LAPACK_LIB_DIR}/libgfortran.so.3")
-    SET(BLAS_LIB                "${LAPACK_LIB_DIR}/libblas.so.3")
-    SET(LAPACK_LIB              "${LAPACK_LIB_DIR}/liblapack.so.3")
+  set(LAPACK_VER
+      "lapack_lnx_v3.10.0.20210628"
+      CACHE STRING "" FORCE)
+  set(LAPACK_URL
+      "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz"
+      CACHE STRING "" FORCE)
+  set(LAPACK_URL_MD5 71f8cc8237a8571692f3e07f9a4f25f6)
+  set(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.so.0")
+  set(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.so.3")
+  set(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.so.3")
+  set(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.so.3")
 elseif(WIN32)
-    # Refer to [lapack-for-windows] http://icl.cs.utk.edu/lapack-for-windows/lapack/#lapacke
-    SET(LAPACK_VER  "lapack_win_v3.10.0.20210628" CACHE STRING "" FORCE)
-    SET(LAPACK_URL  "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.zip" CACHE STRING "" FORCE)
-    SET(LAPACK_URL_MD5          590d080392dcd5abbd5dca767a50b63a)
-    SET(GNU_RT_LIB_1            "${LAPACK_LIB_DIR}/libquadmath-0.dll")
-    SET(GNU_RT_LIB_2            "${LAPACK_LIB_DIR}/libgcc_s_seh-1.dll")
-    SET(GFORTRAN_LIB            "${LAPACK_LIB_DIR}/libgfortran-3.dll")
-    SET(BLAS_LIB                "${LAPACK_LIB_DIR}/libblas.dll")
-    SET(LAPACK_LIB              "${LAPACK_LIB_DIR}/liblapack.dll")
+  # Refer to [lapack-for-windows] http://icl.cs.utk.edu/lapack-for-windows/lapack/#lapacke
+  set(LAPACK_VER
+      "lapack_win_v3.10.0.20210628"
+      CACHE STRING "" FORCE)
+  set(LAPACK_URL
+      "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.zip"
+      CACHE STRING "" FORCE)
+  set(LAPACK_URL_MD5 590d080392dcd5abbd5dca767a50b63a)
+  set(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath-0.dll")
+  set(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s_seh-1.dll")
+  set(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran-3.dll")
+  set(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.dll")
+  set(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.dll")
 else()
-    SET(LAPACK_VER  "lapack_mac_v3.10.0.20210628" CACHE STRING "" FORCE)
-    SET(LAPACK_URL  "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz" CACHE STRING "" FORCE)
-    SET(LAPACK_URL_MD5          427aecf8dee8523de3566ca8e47944d7)
-    SET(GNU_RT_LIB_1            "${LAPACK_LIB_DIR}/libquadmath.0.dylib")
-    SET(GNU_RT_LIB_2            "${LAPACK_LIB_DIR}/libgcc_s.1.dylib")
-    SET(GFORTRAN_LIB            "${LAPACK_LIB_DIR}/libgfortran.5.dylib")
-    SET(BLAS_LIB                "${LAPACK_LIB_DIR}/libblas.3.dylib")
-    SET(LAPACK_LIB              "${LAPACK_LIB_DIR}/liblapack.3.dylib")
+  set(LAPACK_VER
+      "lapack_mac_v3.10.0.20210628"
+      CACHE STRING "" FORCE)
+  set(LAPACK_URL
+      "https://paddlepaddledeps.bj.bcebos.com/${LAPACK_VER}.tar.gz"
+      CACHE STRING "" FORCE)
+  set(LAPACK_URL_MD5 427aecf8dee8523de3566ca8e47944d7)
+  set(GNU_RT_LIB_1 "${LAPACK_LIB_DIR}/libquadmath.0.dylib")
+  set(GNU_RT_LIB_2 "${LAPACK_LIB_DIR}/libgcc_s.1.dylib")
+  set(GFORTRAN_LIB "${LAPACK_LIB_DIR}/libgfortran.5.dylib")
+  set(BLAS_LIB "${LAPACK_LIB_DIR}/libblas.3.dylib")
+  set(LAPACK_LIB "${LAPACK_LIB_DIR}/liblapack.3.dylib")
 endif()
 
 ExternalProject_Add(
-    extern_lapack
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                     ${LAPACK_URL}
-    URL_MD5                 ${LAPACK_URL_MD5}
-    PREFIX                  ${LAPACK_PREFIX_DIR}
-    DOWNLOAD_NO_PROGRESS    1
-    PATCH_COMMAND           ""
-    UPDATE_COMMAND          ""
-    CONFIGURE_COMMAND       ""
-    BUILD_COMMAND           ""
-    INSTALL_COMMAND         ${CMAKE_COMMAND} -E copy_directory ${LAPACK_SOURCE_DIR} ${LAPACK_LIB_DIR}
-    BUILD_BYPRODUCTS        ${BLAS_LIB}
-    BUILD_BYPRODUCTS        ${LAPACK_LIB}
-)
+  extern_lapack
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  URL ${LAPACK_URL}
+  URL_MD5 ${LAPACK_URL_MD5}
+  PREFIX ${LAPACK_PREFIX_DIR}
+  DOWNLOAD_NO_PROGRESS 1
+  PATCH_COMMAND ""
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${LAPACK_SOURCE_DIR}
+                  ${LAPACK_LIB_DIR}
+  BUILD_BYPRODUCTS ${BLAS_LIB}
+  BUILD_BYPRODUCTS ${LAPACK_LIB})
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
index 65a21a87dbde2..b1f2345794e15 100644
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -12,35 +12,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(LEVELDB_PREFIX_DIR ${THIRD_PARTY_PATH}/leveldb)
-SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
-SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE)
-SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE)
-INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
+set(LEVELDB_PREFIX_DIR ${THIRD_PARTY_PATH}/leveldb)
+set(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
+set(LEVELDB_INCLUDE_DIR
+    "${LEVELDB_INSTALL_DIR}/include"
+    CACHE PATH "leveldb include directory." FORCE)
+set(LEVELDB_LIBRARIES
+    "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a"
+    CACHE FILEPATH "leveldb library." FORCE)
+include_directories(${LEVELDB_INCLUDE_DIR})
 
 ExternalProject_Add(
-        extern_leveldb
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX ${LEVELDB_PREFIX_DIR}
-        GIT_REPOSITORY "https://github.com/google/leveldb"
-        GIT_TAG v1.18
-        UPDATE_COMMAND ""
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
-        INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
-        && cp ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
-        && cp -r ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/
-        BUILD_IN_SOURCE 1
-        BUILD_BYPRODUCTS ${LEVELDB_LIBRARIES}
-)
+  extern_leveldb
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${LEVELDB_PREFIX_DIR}
+  GIT_REPOSITORY "https://github.com/google/leveldb"
+  GIT_TAG v1.18
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
+  INSTALL_COMMAND
+    mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ && cp
+    ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
+    && cp -r ${LEVELDB_PREFIX_DIR}/src/extern_leveldb/include
+    ${LEVELDB_INSTALL_DIR}/
+  BUILD_IN_SOURCE 1
+  BUILD_BYPRODUCTS ${LEVELDB_LIBRARIES})
 
-ADD_DEPENDENCIES(extern_leveldb snappy)
+add_dependencies(extern_leveldb snappy)
 
-ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
-ADD_DEPENDENCIES(leveldb extern_leveldb)
-
-LIST(APPEND external_project_dependencies leveldb)
+add_library(leveldb STATIC IMPORTED GLOBAL)
+set_property(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
+add_dependencies(leveldb extern_leveldb)
 
+list(APPEND external_project_dependencies leveldb)
diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
index a166e43c7b95e..28bf083f7791e 100644
--- a/cmake/external/libmct.cmake
+++ b/cmake/external/libmct.cmake
@@ -12,48 +12,54 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(LIBMCT_PROJECT       "extern_libmct")
-IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
-  SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
-  SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct/libmct.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
-SET(LIBMCT_PREFIX_DIR    "${THIRD_PARTY_PATH}/libmct")
-SET(LIBMCT_DOWNLOAD_DIR  "${LIBMCT_PREFIX_DIR}/src/${LIBMCT_PROJECT}")
-SET(LIBMCT_DST_DIR       "libmct")
-SET(LIBMCT_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(LIBMCT_INSTALL_DIR   ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR})
-SET(LIBMCT_ROOT          ${LIBMCT_INSTALL_DIR})
-SET(LIBMCT_INC_DIR       ${LIBMCT_ROOT}/include)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib")
+set(LIBMCT_PROJECT "extern_libmct")
+if((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
+  message(STATUS "use pre defined download url")
+  set(LIBMCT_VER
+      "0.1.0"
+      CACHE STRING "" FORCE)
+  set(LIBMCT_NAME
+      "libmct"
+      CACHE STRING "" FORCE)
+  set(LIBMCT_URL
+      "https://pslib.bj.bcebos.com/libmct/libmct.tar.gz"
+      CACHE STRING "" FORCE)
+endif()
+message(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
+set(LIBMCT_PREFIX_DIR "${THIRD_PARTY_PATH}/libmct")
+set(LIBMCT_DOWNLOAD_DIR "${LIBMCT_PREFIX_DIR}/src/${LIBMCT_PROJECT}")
+set(LIBMCT_DST_DIR "libmct")
+set(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
+set(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR})
+set(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR})
+set(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include)
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib")
 
-INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR})
+include_directories(${LIBMCT_INC_DIR})
 
-FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(LIBMCT)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
+file(
+  WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt
+  "cmake_minimum_required(VERSION 3.0)\n" "PROJECT(LIBMCT)\n"
   "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n"
   "        DESTINATION ${LIBMCT_DST_DIR})\n")
 
 ExternalProject_Add(
-    ${LIBMCT_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${LIBMCT_PREFIX_DIR}
-    DOWNLOAD_DIR          ${LIBMCT_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz
-                          && tar --no-same-owner -zxvf ${LIBMCT_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-)
+  ${LIBMCT_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${LIBMCT_PREFIX_DIR}
+  DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR}
+  DOWNLOAD_COMMAND
+    wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz &&
+    tar --no-same-owner -zxvf ${LIBMCT_NAME}.tar.gz
+  DOWNLOAD_NO_PROGRESS 1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT}
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT}
+                   -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE})
 
 add_library(libmct INTERFACE)
 
-ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
+add_dependencies(libmct ${LIBMCT_PROJECT})
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
index da7cb696ef8c7..1efb95cc0cfa9 100644
--- a/cmake/external/libxsmm.cmake
+++ b/cmake/external/libxsmm.cmake
@@ -12,34 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE (ExternalProject)
+include(ExternalProject)
 
-SET(LIBXSMM_PREFIX_DIR ${THIRD_PARTY_PATH}/libxsmm)
-SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
-SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
-SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
-SET(LIBXSMM_LIB        "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
-SET(LIBXSMMNOBLAS_LIB  "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
+set(LIBXSMM_PREFIX_DIR ${THIRD_PARTY_PATH}/libxsmm)
+set(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
+set(LIBXSMM_INCLUDE_DIR
+    "${LIBXSMM_INSTALL_DIR}/include"
+    CACHE PATH "LIBXSMM include directory." FORCE)
+set(LIBXSMM_LIBRARY_DIR
+    "${LIBXSMM_INSTALL_DIR}/lib"
+    CACHE PATH "LIBXSMM library directory." FORCE)
+set(LIBXSMM_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
+set(LIBXSMMNOBLAS_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
 
 ExternalProject_Add(
-    extern_libxsmm
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY  "${GIT_URL}/hfp/libxsmm.git"
-    GIT_TAG         "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
-    PREFIX          ${LIBXSMM_PREFIX_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_IN_SOURCE 1
-    BUILD_COMMAND   $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
-    INSTALL_COMMAND ""
-    BUILD_BYPRODUCTS ${LIBXSMM_LIB}
-    BUILD_BYPRODUCTS ${LIBXSMMNOBLAS_LIB}
-)
-ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIB}")
-SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMMNOBLAS_LIB}")
+  extern_libxsmm
+  ${SHALLOW_CLONE}
+  GIT_REPOSITORY "${GIT_URL}/hfp/libxsmm.git"
+  GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
+  PREFIX ${LIBXSMM_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_IN_SOURCE 1
+  BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc
+                WARP=0 install
+  INSTALL_COMMAND ""
+  BUILD_BYPRODUCTS ${LIBXSMM_LIB}
+  BUILD_BYPRODUCTS ${LIBXSMMNOBLAS_LIB})
+add_library(libxsmm STATIC IMPORTED GLOBAL)
+set_property(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIB}")
+set_property(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMMNOBLAS_LIB}")
 
-MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
+message(STATUS "Libxsmm library: ${LIBXSMM_LIB}")
 include_directories(${LIBXSMM_INCLUDE_DIR})
-ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
-ADD_DEPENDENCIES(libxsmm extern_libxsmm)
+add_definitions(-DPADDLE_WITH_LIBXSMM)
+add_dependencies(libxsmm extern_libxsmm)
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 0031757467f37..b994f407604b4 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -18,32 +18,34 @@ if(NOT LINUX)
   return()
 endif()
 
-if (LITE_WITH_XPU)
+if(LITE_WITH_XPU)
   add_definitions(-DLITE_SUBGRAPH_WITH_XPU)
-  IF(WITH_AARCH64)
-    SET(XPU_SDK_ENV "kylin_aarch64")
-  ELSEIF(WITH_SUNWAY)
-    SET(XPU_SDK_ENV "deepin_sw6_64")
-  ELSEIF(WITH_BDCENTOS)
-    SET(XPU_SDK_ENV "bdcentos_x86_64")
-  ELSEIF(WITH_UBUNTU)
-    SET(XPU_SDK_ENV "ubuntu_x86_64")
-  ELSEIF(WITH_CENTOS)
-    SET(XPU_SDK_ENV "centos7_x86_64")
-  ELSE ()
-    SET(XPU_SDK_ENV "ubuntu_x86_64")
-  ENDIF()
+  if(WITH_AARCH64)
+    set(XPU_SDK_ENV "kylin_aarch64")
+  elseif(WITH_SUNWAY)
+    set(XPU_SDK_ENV "deepin_sw6_64")
+  elseif(WITH_BDCENTOS)
+    set(XPU_SDK_ENV "bdcentos_x86_64")
+  elseif(WITH_UBUNTU)
+    set(XPU_SDK_ENV "ubuntu_x86_64")
+  elseif(WITH_CENTOS)
+    set(XPU_SDK_ENV "centos7_x86_64")
+  else()
+    set(XPU_SDK_ENV "ubuntu_x86_64")
+  endif()
 endif()
 
-if (LITE_WITH_NNADAPTER)
-  add_definitions(-DLITE_SUBGRAPH_WITH_NNADAPTER) 
-  if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU)
+if(LITE_WITH_NNADAPTER)
+  add_definitions(-DLITE_SUBGRAPH_WITH_NNADAPTER)
+  if(NNADAPTER_WITH_HUAWEI_ASCEND_NPU)
     add_definitions(-DLITE_SUBGRAPH_WITH_NPU)
-    set(NPU_SDK_ROOT "/usr/local/Ascend/ascend-toolkit/latest" CACHE STRING "default NPU SDK ROOT")
+    set(NPU_SDK_ROOT
+        "/usr/local/Ascend/ascend-toolkit/latest"
+        CACHE STRING "default NPU SDK ROOT")
   endif()
 endif()
 
-if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
+if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   include(ExternalProject)
   set(LITE_PROJECT extern_lite)
   set(LITE_PREFIX_DIR ${THIRD_PARTY_PATH}/lite)
@@ -61,109 +63,118 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   if(WITH_ARM)
     set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
     message(WARNING "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
-    set(LITE_OPTIONAL_ARGS -DWITH_MKL=OFF
-                           -DLITE_WITH_CUDA=OFF
-                           -DWITH_MKLDNN=OFF
-                           -DLITE_WITH_X86=OFF
-                           -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON
-                           -DLITE_WITH_PROFILE=OFF
-                           -DARM_TARGET_OS=armlinux
-                           -DWITH_LITE=ON
-                           -DWITH_PYTHON=OFF
-                           -DWITH_TESTING=OFF
-                           -DLITE_BUILD_EXTRA=ON
-                           -DLITE_WITH_XPU=${LITE_WITH_XPU}
-                           -DXPU_SDK_URL=${XPU_BASE_URL}
-                           -DXPU_SDK_ENV=${XPU_SDK_ENV}
-                           -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER}
-                           -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU}
-                           -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT}
-                           -DLITE_WITH_CODE_META_INFO=OFF
-                           -DLITE_WITH_ARM=ON)
+    set(LITE_OPTIONAL_ARGS
+        -DWITH_MKL=OFF
+        -DLITE_WITH_CUDA=OFF
+        -DWITH_MKLDNN=OFF
+        -DLITE_WITH_X86=OFF
+        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON
+        -DLITE_WITH_PROFILE=OFF
+        -DARM_TARGET_OS=armlinux
+        -DWITH_LITE=ON
+        -DWITH_PYTHON=OFF
+        -DWITH_TESTING=OFF
+        -DLITE_BUILD_EXTRA=ON
+        -DLITE_WITH_XPU=${LITE_WITH_XPU}
+        -DXPU_SDK_URL=${XPU_BASE_URL}
+        -DXPU_SDK_ENV=${XPU_SDK_ENV}
+        -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER}
+        -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU}
+        -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT}
+        -DLITE_WITH_CODE_META_INFO=OFF
+        -DLITE_WITH_ARM=ON)
     ExternalProject_Add(
       ${LITE_PROJECT}
       ${EXTERNAL_PROJECT_LOG_ARGS}
-      GIT_REPOSITORY      "${GIT_URL}/PaddlePaddle/Paddle-Lite.git"
-      GIT_TAG             ${LITE_GIT_TAG}
-      PREFIX              ${LITE_PREFIX_DIR}
-      PATCH_COMMAND       mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc && sed -i "/aarch64-linux-gnu-gcc/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake && sed -i "/aarch64-linux-gnu-g++/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake
-      UPDATE_COMMAND      ""
-      BUILD_COMMAND       ${LITE_BUILD_COMMAND}
-      INSTALL_COMMAND     ""
-      CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                          -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
-                          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                          -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                          -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                          ${EXTERNAL_OPTIONAL_ARGS}
-                          ${LITE_OPTIONAL_ARGS}
-    )
+      GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git"
+      GIT_TAG ${LITE_GIT_TAG}
+      PREFIX ${LITE_PREFIX_DIR}
+      PATCH_COMMAND
+        mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch
+        ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc
+        && sed -i "/aarch64-linux-gnu-gcc/d"
+        ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake && sed -i
+        "/aarch64-linux-gnu-g++/d"
+        ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake
+      UPDATE_COMMAND ""
+      BUILD_COMMAND ${LITE_BUILD_COMMAND}
+      INSTALL_COMMAND ""
+      CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                 -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                 -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
+                 -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                 -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                 -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                 -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                 -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                 -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                 -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                 ${EXTERNAL_OPTIONAL_ARGS}
+                 ${LITE_OPTIONAL_ARGS})
   else()
     set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
-    set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON
-                           -DLITE_WITH_CUDA=${WITH_GPU}
-                           -DWITH_MKLDNN=OFF
-                           -DLITE_WITH_X86=ON
-                           -DLITE_WITH_PROFILE=OFF
-                           -DWITH_LITE=OFF
-                           -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF
-                           -DWITH_PYTHON=OFF
-                           -DWITH_TESTING=OFF
-                           -DLITE_BUILD_EXTRA=ON
-                           -DCUDNN_ROOT=${CUDNN_ROOT}
-                           -DLITE_WITH_STATIC_CUDA=OFF
-                           -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
-                           -DLITE_WITH_XPU=${LITE_WITH_XPU}
-                           -DXPU_SDK_URL=${XPU_BASE_URL}
-                           -DXPU_SDK_ENV=${XPU_SDK_ENV}
-                           -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER}
-                           -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU}
-                           -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT}
-                           -DLITE_WITH_CODE_META_INFO=OFF
-                           -DLITE_WITH_ARM=OFF)
+    set(LITE_OPTIONAL_ARGS
+        -DWITH_MKL=ON
+        -DLITE_WITH_CUDA=${WITH_GPU}
+        -DWITH_MKLDNN=OFF
+        -DLITE_WITH_X86=ON
+        -DLITE_WITH_PROFILE=OFF
+        -DWITH_LITE=OFF
+        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF
+        -DWITH_PYTHON=OFF
+        -DWITH_TESTING=OFF
+        -DLITE_BUILD_EXTRA=ON
+        -DCUDNN_ROOT=${CUDNN_ROOT}
+        -DLITE_WITH_STATIC_CUDA=OFF
+        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
+        -DLITE_WITH_XPU=${LITE_WITH_XPU}
+        -DXPU_SDK_URL=${XPU_BASE_URL}
+        -DXPU_SDK_ENV=${XPU_SDK_ENV}
+        -DLITE_WITH_NNADAPTER=${LITE_WITH_NNADAPTER}
+        -DNNADAPTER_WITH_HUAWEI_ASCEND_NPU=${NNADAPTER_WITH_HUAWEI_ASCEND_NPU}
+        -DNNADAPTER_HUAWEI_ASCEND_NPU_SDK_ROOT=${NPU_SDK_ROOT}
+        -DLITE_WITH_CODE_META_INFO=OFF
+        -DLITE_WITH_ARM=OFF)
 
     ExternalProject_Add(
-        ${LITE_PROJECT}
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY      "${GIT_URL}/PaddlePaddle/Paddle-Lite.git"
-        GIT_TAG             ${LITE_GIT_TAG}
-        PREFIX              ${LITE_PREFIX_DIR}
-        UPDATE_COMMAND      ""
-        PATCH_COMMAND       sed -i "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" ${LITE_PREFIX_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py
-        BUILD_COMMAND       ${LITE_BUILD_COMMAND}
-        INSTALL_COMMAND     ""
-        CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                            -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
-                            -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                            -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                            -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                            -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                            ${EXTERNAL_OPTIONAL_ARGS}
-                            ${LITE_OPTIONAL_ARGS}
-    )
+      ${LITE_PROJECT}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git"
+      GIT_TAG ${LITE_GIT_TAG}
+      PREFIX ${LITE_PREFIX_DIR}
+      UPDATE_COMMAND ""
+      PATCH_COMMAND
+        sed -i
+        "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?"
+        ${LITE_PREFIX_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py
+      BUILD_COMMAND ${LITE_BUILD_COMMAND}
+      INSTALL_COMMAND ""
+      CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                 -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                 -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
+                 -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                 -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                 -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                 -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                 -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                 -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                 -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                 ${EXTERNAL_OPTIONAL_ARGS}
+                 ${LITE_OPTIONAL_ARGS})
   endif()
-  ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR)
-  ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR)
+  ExternalProject_Get_Property(${LITE_PROJECT} BINARY_DIR)
+  ExternalProject_Get_Property(${LITE_PROJECT} SOURCE_DIR)
   set(LITE_BINARY_DIR ${BINARY_DIR})
   set(LITE_SOURCE_DIR ${SOURCE_DIR})
 
 endif()
 
-if (WITH_ARM)
+if(WITH_ARM)
   if(LITE_WITH_XPU)
     set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8.xpu)
   elseif(LITE_WITH_NNADAPTER)
     message("Enable LITE_WITH_NNADAPTER")
-    if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU)
+    if(NNADAPTER_WITH_HUAWEI_ASCEND_NPU)
       set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8.nnadapter)
     endif()
   else()
@@ -184,22 +195,32 @@ endif()
 
 function(external_lite_libs alias path)
   add_library(${alias} SHARED IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION
-               ${path})
-  if (LITE_PROJECT)
+  set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path})
+  if(LITE_PROJECT)
     add_dependencies(${alias} ${LITE_PROJECT})
   endif()
 endfunction()
 
-external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so)
-set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(
+  lite_full_static
+  ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so
+)
+set(LITE_SHARED_LIB
+    ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so
+)
 
-if (LITE_WITH_NNADAPTER)
-  set(LITE_NNADAPTER_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so)
-  if (NNADAPTER_WITH_HUAWEI_ASCEND_NPU)
-    external_lite_libs(lite_nnadapter ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so)
+if(LITE_WITH_NNADAPTER)
+  set(LITE_NNADAPTER_LIB
+      ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so)
+  if(NNADAPTER_WITH_HUAWEI_ASCEND_NPU)
+    external_lite_libs(
+      lite_nnadapter
+      ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libnnadapter.so
+      ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so)
     set(LITE_DEPS lite_full_static lite_nnadapter)
-    set(LITE_NNADAPTER_NPU_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so)
+    set(LITE_NNADAPTER_NPU_LIB
+        ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libhuawei_ascend_npu.so
+    )
   endif()
 else()
   set(LITE_DEPS lite_full_static)
diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake
index 5c48afa2806aa..8b33a73e24c8d 100644
--- a/cmake/external/llvm.cmake
+++ b/cmake/external/llvm.cmake
@@ -1,31 +1,33 @@
 include(FetchContent)
 
-set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz)
+set(LLVM_DOWNLOAD_URL
+    https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz
+)
 set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e)
 
 set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm)
 set(FETCHCONTENT_QUIET OFF)
-FetchContent_Declare(external_llvm
+FetchContent_Declare(
+  external_llvm
   URL ${LLVM_DOWNLOAD_URL}
   URL_MD5 ${LLVM_MD5}
-  PREFIX ${THIRD_PARTY_PATH}/llvm
-  SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm
-)
-if (NOT LLVM_PATH)
+  PREFIX ${THIRD_PARTY_PATH}/llvm SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm)
+if(NOT LLVM_PATH)
   FetchContent_GetProperties(external_llvm)
-  if (NOT external_llvm_POPULATED)
+  if(NOT external_llvm_POPULATED)
     FetchContent_Populate(external_llvm)
   endif()
   set(LLVM_PATH ${THIRD_PARTY_PATH}/install/llvm)
   set(LLVM_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm)
   set(MLIR_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir)
-else ()
+else()
   set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm)
   set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir)
 endif()
 
-if (${CMAKE_CXX_COMPILER} STREQUAL "clang++")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
+if(${CMAKE_CXX_COMPILER} STREQUAL "clang++")
+  set(CMAKE_EXE_LINKER_FLAGS
+      "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
 endif()
 
 message(STATUS "set LLVM_DIR: ${LLVM_DIR}")
@@ -66,8 +68,17 @@ cmake ../llvm  -G "Unix Makefiles" \
 
 add_definitions(${LLVM_DEFINITIONS})
 
-llvm_map_components_to_libnames(llvm_libs Support Core irreader
-        X86 executionengine orcjit mcjit all codegen)
+llvm_map_components_to_libnames(
+  llvm_libs
+  Support
+  Core
+  irreader
+  X86
+  executionengine
+  orcjit
+  mcjit
+  all
+  codegen)
 
 message(STATUS "LLVM libs: ${llvm_libs}")
 
@@ -75,23 +86,24 @@ get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS)
 message(STATUS "MLIR libs: ${mlir_libs}")
 add_definitions(${LLVM_DEFINITIONS})
 
-
 # The minimum needed libraries for MLIR IR parse and transform.
 set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib)
 
-
 # tb_base is the name of a xxx.td file (without the .td suffix)
 function(mlir_tablegen_on td_base)
   set(options)
   set(oneValueArgs DIALECT)
-  cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
   set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
   mlir_tablegen(${td_base}.hpp.inc -gen-op-decls)
   mlir_tablegen(${td_base}.cpp.inc -gen-op-defs)
-  if (mlir_tablegen_on_DIALECT)
-    mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT})
-    mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT})
+  if(mlir_tablegen_on_DIALECT)
+    mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls
+                  -dialect=${mlir_tablegen_on_DIALECT})
+    mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs
+                  -dialect=${mlir_tablegen_on_DIALECT})
   endif()
   add_public_tablegen_target(${td_base}_IncGen)
   add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
@@ -99,7 +111,9 @@ endfunction()
 
 function(mlir_add_rewriter td_base)
   set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
-  set(LLVM_TARGET_DEPENDS  ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td)
+  set(LLVM_TARGET_DEPENDS
+      ${LLVM_TARGET_DEPENDS}
+      ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td)
   mlir_tablegen(${td_base}.cpp.inc -gen-rewriters)
   add_public_tablegen_target(MLIR${td_base}IncGen)
   add_dependencies(mlir-headers MLIR${td_base}IncGen)
@@ -108,7 +122,11 @@ endfunction()
 # Execute the mlir script with infrt-exec program.
 # @name: name of the test
 # @script: path to the mlir script file
-function (infrt_exec_check name script)
-  add_test(NAME ${name}
-    COMMAND sh -c "${CMAKE_BINARY_DIR}/paddle/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck  ${CMAKE_CURRENT_SOURCE_DIR}/${script}")
+function(infrt_exec_check name script)
+  add_test(
+    NAME ${name}
+    COMMAND
+      sh -c
+      "${CMAKE_BINARY_DIR}/paddle/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck  ${CMAKE_CURRENT_SOURCE_DIR}/${script}"
+  )
 endfunction()
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 8f955008fa079..dfa20dd631fc6 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -12,108 +12,131 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
-
-SET(MKLDNN_PROJECT        "extern_mkldnn")
-SET(MKLDNN_PREFIX_DIR     ${THIRD_PARTY_PATH}/mkldnn)
-SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
-SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-SET(MKLDNN_REPOSITORY     ${GIT_URL}/oneapi-src/oneDNN.git)
-SET(MKLDNN_TAG            9b186765dded79066e0cd9c17eb70b680b76fb8e)
+include(ExternalProject)
 
+set(MKLDNN_PROJECT "extern_mkldnn")
+set(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
+set(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
+set(MKLDNN_INC_DIR
+    "${MKLDNN_INSTALL_DIR}/include"
+    CACHE PATH "mkldnn include directory." FORCE)
+set(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git)
+set(MKLDNN_TAG 9b186765dded79066e0cd9c17eb70b680b76fb8e)
 
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
-INCLUDE(GNUInstallDirs)
-SET(LIBDIR "lib")
+include(GNUInstallDirs)
+set(LIBDIR "lib")
 if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
-  SET(LIBDIR "lib64")
+  set(LIBDIR "lib64")
 endif()
 
-MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/${LIBDIR} to runtime path")
-SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
-
-INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
+message(STATUS "Set ${MKLDNN_INSTALL_DIR}/${LIBDIR} to runtime path")
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}"
+                        "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
 
+include_directories(${MKLDNN_INC_DIR}
+)# For MKLDNN code to include internal headers.
 
-IF(NOT WIN32)
-    SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-    SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
-    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
-    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
-    SET(MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
-    SET(MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
-    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so" CACHE FILEPATH "mkldnn library." FORCE)
-ELSE()
-    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
-    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS}")
-    string(REPLACE "/O2 " "" MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
-    string(REPLACE "/O2 " "" MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
-    SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
-ENDIF(NOT WIN32)
+if(NOT WIN32)
+  set(MKLDNN_FLAG
+      "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds"
+  )
+  set(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+  set(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+  set(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+  set(MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+  set(MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
+  set(MKLDNN_LIB
+      "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so"
+      CACHE FILEPATH "mkldnn library." FORCE)
+else()
+  set(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
+  set(MKLDNN_CFLAG "${CMAKE_C_FLAGS}")
+  string(REPLACE "/O2 " "" MKLDNN_CFLAG_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
+  string(REPLACE "/O2 " "" MKLDNN_CXXFLAG_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+  set(MKLDNN_LIB
+      "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib"
+      CACHE FILEPATH "mkldnn library." FORCE)
+endif(NOT WIN32)
 
 ExternalProject_Add(
-    ${MKLDNN_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY      ${MKLDNN_REPOSITORY}
-    GIT_TAG             ${MKLDNN_TAG}
-    DEPENDS             ${MKLDNN_DEPENDS}
-    PREFIX              ${MKLDNN_PREFIX_DIR}
-    UPDATE_COMMAND      ""
-    #BUILD_ALWAYS        1
-    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${MKLDNN_CXXFLAG_RELEASE}
-                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
-                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS_RELEASE=${MKLDNN_CFLAG_RELEASE}
-                        -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-                        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF
-    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-)
+  ${MKLDNN_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${MKLDNN_REPOSITORY}
+  GIT_TAG ${MKLDNN_TAG}
+  DEPENDS ${MKLDNN_DEPENDS}
+  PREFIX ${MKLDNN_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  #BUILD_ALWAYS        1
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
+             -DCMAKE_CXX_FLAGS_RELEASE=${MKLDNN_CXXFLAG_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+             -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${MKLDNN_CFLAG_RELEASE}
+             -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
+             -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DDNNL_BUILD_TESTS=OFF
+             -DDNNL_BUILD_EXAMPLES=OFF
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR})
 
-MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+message(STATUS "MKLDNN library: ${MKLDNN_LIB}")
 add_definitions(-DPADDLE_WITH_MKLDNN)
 # copy the real so.0 lib to install dir
 # it can be directly contained in wheel or capi
 if(WIN32)
-    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
+  set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
 
-    file(TO_NATIVE_PATH ${MKLDNN_INSTALL_DIR} NATIVE_MKLDNN_INSTALL_DIR)
-    file(TO_NATIVE_PATH ${MKLDNN_SHARED_LIB} NATIVE_MKLDNN_SHARED_LIB)
+  file(TO_NATIVE_PATH ${MKLDNN_INSTALL_DIR} NATIVE_MKLDNN_INSTALL_DIR)
+  file(TO_NATIVE_PATH ${MKLDNN_SHARED_LIB} NATIVE_MKLDNN_SHARED_LIB)
 
-    ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_LIB}
-        COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll ${NATIVE_MKLDNN_SHARED_LIB} /Y)
-        COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt
-        COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def
-        COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def
-        COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on
-        COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_LIB} /machine:x64
-        COMMENT "Generate mkldnn.lib manually--->"
-        DEPENDS ${MKLDNN_PROJECT}
-        VERBATIM)
-    ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB})
+  add_custom_command(
+    OUTPUT ${MKLDNN_LIB}
+    COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll
+             ${NATIVE_MKLDNN_SHARED_LIB} /Y)
+    COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll >
+            ${MKLDNN_INSTALL_DIR}/bin/exports.txt
+    COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def
+    COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def
+    COMMAND
+      echo off && (for
+                   /f
+                   "skip=19 tokens=4"
+                   %A
+                   in
+                   (${MKLDNN_INSTALL_DIR}/bin/exports.txt)
+                   do
+                   echo
+                   %A
+                   >>
+                   ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on
+    COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_LIB}
+            /machine:x64
+    COMMENT "Generate mkldnn.lib manually--->"
+    DEPENDS ${MKLDNN_PROJECT}
+    VERBATIM)
+  add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB})
 else(WIN32)
-    SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
-    SET(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1)
-    SET(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2)
-    ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB_2}
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1}
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2}
-        DEPENDS ${MKLDNN_PROJECT})
-    ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2})
+  set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+  set(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1)
+  set(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2)
+  add_custom_command(
+    OUTPUT ${MKLDNN_SHARED_LIB_2}
+    COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+    COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1}
+    COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2}
+    DEPENDS ${MKLDNN_PROJECT})
+  add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2})
 endif(WIN32)
 
 # generate a static dummy target to track mkldnn dependencies
 # for cc_library(xxx SRCS xxx.c DEPS mkldnn)
 generate_dummy_static_lib(LIB_NAME "mkldnn" GENERATOR "mkldnn.cmake")
 
-TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB})
-ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT} mkldnn_cmd)
+target_link_libraries(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB})
+add_dependencies(mkldnn ${MKLDNN_PROJECT} mkldnn_cmd)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index a2fd2fe03c162..90d61f47a52e8 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -12,59 +12,68 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
-SET(MKLML_INSTALL_DIR   ${THIRD_PARTY_PATH}/install/mklml)
-SET(MKLML_INC_DIR       ${MKLML_INSTALL_DIR}/include)
-SET(MKLML_LIB_DIR       ${MKLML_INSTALL_DIR}/lib)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_LIB_DIR}")
+include(ExternalProject)
+set(MKLML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mklml)
+set(MKLML_INC_DIR ${MKLML_INSTALL_DIR}/include)
+set(MKLML_LIB_DIR ${MKLML_INSTALL_DIR}/lib)
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_LIB_DIR}")
 
-IF(WIN32)
-    SET(MKLML_VER "mklml_win_2019.0.5.20190502" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
-    SET(MKLML_URL_MD5             ff8c5237570f03eea37377ccfc95a08a)
-    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
-    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
-    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
-    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.dll)
-ELSE()
-    #TODO(intel-huying):
-    #  Now enable csrmm function in mklml library temporarily, it will be updated as offical version later.
-    SET(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-    SET(MKLML_URL_MD5             bc6a7faea6a2a9ad31752386f3ae87da)
-    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
-    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
-    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
-    SET(MKLML_SHARED_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.so)
-ENDIF()
+if(WIN32)
+  set(MKLML_VER
+      "mklml_win_2019.0.5.20190502"
+      CACHE STRING "" FORCE)
+  set(MKLML_URL
+      "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip"
+      CACHE STRING "" FORCE)
+  set(MKLML_URL_MD5 ff8c5237570f03eea37377ccfc95a08a)
+  set(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
+  set(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
+  set(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
+  set(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
+else()
+  #TODO(intel-huying):
+  #  Now enable csrmm function in mklml library temporarily, it will be updated as official version later.
+  set(MKLML_VER
+      "csrmm_mklml_lnx_2019.0.5"
+      CACHE STRING "" FORCE)
+  set(MKLML_URL
+      "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz"
+      CACHE STRING "" FORCE)
+  set(MKLML_URL_MD5 bc6a7faea6a2a9ad31752386f3ae87da)
+  set(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
+  set(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
+  set(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
+  set(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
+endif()
 
-SET(MKLML_PROJECT           "extern_mklml")
-MESSAGE(STATUS      "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-SET(MKLML_PREFIX_DIR        ${THIRD_PARTY_PATH}/mklml)
-SET(MKLML_SOURCE_DIR        ${THIRD_PARTY_PATH}/mklml/src/extern_mklml)
+set(MKLML_PROJECT "extern_mklml")
+message(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
+set(MKLML_PREFIX_DIR ${THIRD_PARTY_PATH}/mklml)
+set(MKLML_SOURCE_DIR ${THIRD_PARTY_PATH}/mklml/src/extern_mklml)
 
-# Ninja Generator can not establish the correct dependency relationship between the imported library with target, 
+# Ninja Generator cannot establish the correct dependency relationship between the imported library and the target,
 # the product file in the ExternalProject need to be specified manually, please refer to
 # https://stackoverflow.com/questions/54866067/cmake-and-ninja-missing-and-no-known-rule-to-make-it
 # It is the same to all other ExternalProject.
 ExternalProject_Add(
-    ${MKLML_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                   ${MKLML_URL}
-    URL_MD5               ${MKLML_URL_MD5}
-    PREFIX                ${MKLML_PREFIX_DIR}
-    DOWNLOAD_NO_PROGRESS  1
-    CONFIGURE_COMMAND     ""
-    BUILD_COMMAND         ""
-    UPDATE_COMMAND        ""
-    INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/include ${MKLML_INC_DIR} &&
-                          ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/lib ${MKLML_LIB_DIR}
-    BUILD_BYPRODUCTS      ${MKLML_LIB}
-    BUILD_BYPRODUCTS      ${MKLML_IOMP_LIB}
-)
+  ${MKLML_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  URL ${MKLML_URL}
+  URL_MD5 ${MKLML_URL_MD5}
+  PREFIX ${MKLML_PREFIX_DIR}
+  DOWNLOAD_NO_PROGRESS 1
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  UPDATE_COMMAND ""
+  INSTALL_COMMAND
+    ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/include
+    ${MKLML_INC_DIR} && ${CMAKE_COMMAND} -E copy_directory
+    ${MKLML_SOURCE_DIR}/lib ${MKLML_LIB_DIR}
+  BUILD_BYPRODUCTS ${MKLML_LIB}
+  BUILD_BYPRODUCTS ${MKLML_IOMP_LIB})
 
-INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+include_directories(${MKLML_INC_DIR})
 
-ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
-ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
+add_library(mklml SHARED IMPORTED GLOBAL)
+set_property(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
+add_dependencies(mklml ${MKLML_PROJECT})
diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake
index 2162f87812d13..9ace4caafd12a 100644
--- a/cmake/external/onnxruntime.cmake
+++ b/cmake/external/onnxruntime.cmake
@@ -12,83 +12,114 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if (NOT WITH_ONNXRUNTIME)
+if(NOT WITH_ONNXRUNTIME)
   return()
-endif ()
+endif()
 
-if (WITH_ARM)
+if(WITH_ARM)
   message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu")
   return()
-endif ()
+endif()
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
 add_definitions(-DPADDLE_WITH_ONNXRUNTIME)
 
-SET(ONNXRUNTIME_PROJECT        "extern_onnxruntime")
-SET(ONNXRUNTIME_PREFIX_DIR     ${THIRD_PARTY_PATH}/onnxruntime)
-SET(ONNXRUNTIME_SOURCE_DIR     ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT})
-SET(ONNXRUNTIME_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/onnxruntime)
-SET(ONNXRUNTIME_INC_DIR        "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE)
-SET(ONNXRUNTIME_LIB_DIR        "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE)
-SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}")
-
+set(ONNXRUNTIME_PROJECT "extern_onnxruntime")
+set(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime)
+set(ONNXRUNTIME_SOURCE_DIR
+    ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT})
+set(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime)
+set(ONNXRUNTIME_INC_DIR
+    "${ONNXRUNTIME_INSTALL_DIR}/include"
+    CACHE PATH "onnxruntime include directory." FORCE)
+set(ONNXRUNTIME_LIB_DIR
+    "${ONNXRUNTIME_INSTALL_DIR}/lib"
+    CACHE PATH "onnxruntime lib directory." FORCE)
+set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}")
 
-if (WIN32)
-  SET(ONNXRUNTIME_URL             "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip")
-elseif (APPLE)
-  SET(ONNXRUNTIME_URL           "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz")
-else ()
-  SET(ONNXRUNTIME_URL             "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz")
+if(WIN32)
+  set(ONNXRUNTIME_URL
+      "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip"
+  )
+elseif(APPLE)
+  set(ONNXRUNTIME_URL
+      "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz"
+  )
+else()
+  set(ONNXRUNTIME_URL
+      "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz"
+  )
 endif()
 
+include_directories(${ONNXRUNTIME_INC_DIR}
+)# For ONNXRUNTIME code to include internal headers.
+if(WIN32)
+  set(ONNXRUNTIME_SOURCE_LIB
+      "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll"
+      CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  set(ONNXRUNTIME_SHARED_LIB
+      "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll"
+      CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+  set(ONNXRUNTIME_LIB
+      "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib"
+      CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+elseif(APPLE)
+  set(ONNXRUNTIME_SOURCE_LIB
+      "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib"
+      CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  set(ONNXRUNTIME_LIB
+      "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib"
+      CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  set(ONNXRUNTIME_SHARED_LIB
+      ${ONNXRUNTIME_LIB}
+      CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+else()
+  set(ONNXRUNTIME_SOURCE_LIB
+      "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0"
+      CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  set(ONNXRUNTIME_LIB
+      "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0"
+      CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  set(ONNXRUNTIME_SHARED_LIB
+      ${ONNXRUNTIME_LIB}
+      CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+endif()
 
-INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers.
-if (WIN32)
-  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
-  SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
-  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
-elseif (APPLE)
-  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
-  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
-  SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
-else ()
-  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
-  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
-  SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
-endif ()
-
-if (WIN32)
+if(WIN32)
   ExternalProject_Add(
-      ${ONNXRUNTIME_PROJECT}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      URL                 ${ONNXRUNTIME_URL}
-      PREFIX              ${ONNXRUNTIME_PREFIX_DIR}
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} &&
-                            ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} &&
-                            ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
-      BUILD_BYPRODUCTS      ${ONNXRUNTIME_LIB}
-  )
-else ()
+    ${ONNXRUNTIME_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    URL ${ONNXRUNTIME_URL}
+    PREFIX ${ONNXRUNTIME_PREFIX_DIR}
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND
+      ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB}
+      ${ONNXRUNTIME_SHARED_LIB} && ${CMAKE_COMMAND} -E copy
+      ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} &&
+      ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include
+      ${ONNXRUNTIME_INC_DIR}
+    BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB})
+else()
   ExternalProject_Add(
     ${ONNXRUNTIME_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                 ${ONNXRUNTIME_URL}
-    PREFIX              ${ONNXRUNTIME_PREFIX_DIR}
-    DOWNLOAD_NO_PROGRESS  1
-    CONFIGURE_COMMAND     ""
-    BUILD_COMMAND         ""
-    UPDATE_COMMAND        ""
-    INSTALL_COMMAND       ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} &&
-                          ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
-    BUILD_BYPRODUCTS      ${ONNXRUNTIME_LIB}
-  )
+    URL ${ONNXRUNTIME_URL}
+    PREFIX ${ONNXRUNTIME_PREFIX_DIR}
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND
+      ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} &&
+      ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include
+      ${ONNXRUNTIME_INC_DIR}
+    BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB})
 endif()
 
-ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB})
-ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT})
+add_library(onnxruntime STATIC IMPORTED GLOBAL)
+set_property(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB})
+add_dependencies(onnxruntime ${ONNXRUNTIME_PROJECT})
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index b099831738599..1cccfb86f4208 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -12,80 +12,84 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(CBLAS_PREFIX_DIR  ${THIRD_PARTY_PATH}/openblas)
-SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-SET(CBLAS_REPOSITORY  ${GIT_URL}/xianyi/OpenBLAS.git)
-SET(CBLAS_TAG         v0.3.7)
+set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
+set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
+set(CBLAS_REPOSITORY ${GIT_URL}/xianyi/OpenBLAS.git)
+set(CBLAS_TAG v0.3.7)
 if(APPLE AND WITH_ARM)
-  SET(CBLAS_TAG         v0.3.13)
+  set(CBLAS_TAG v0.3.13)
 endif()
 
 if(WITH_MIPS)
-  SET(CBLAS_TAG         v0.3.13)
+  set(CBLAS_TAG v0.3.13)
 endif()
 
-IF(NOT WIN32)
-    SET(CBLAS_LIBRARIES
-        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
-        CACHE FILEPATH "openblas library." FORCE)
-    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+if(NOT WIN32)
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include"
+      CACHE PATH "openblas include directory." FORCE)
+  set(OPENBLAS_CC
+      "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
 
-    IF(APPLE)
-        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
-    ENDIF()
-    SET(OPTIONAL_ARGS "")
-    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
-        SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
-    ENDIF()
+  if(APPLE)
+    set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+  endif()
+  set(OPTIONAL_ARGS "")
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+    set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+  endif()
 
-    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-    ExternalProject_Add(
-        extern_openblas
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        GIT_REPOSITORY      ${CBLAS_REPOSITORY}
-        GIT_TAG             ${CBLAS_TAG}
-        PREFIX              ${CBLAS_PREFIX_DIR}
-        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
-        BUILD_IN_SOURCE     1
-        BUILD_COMMAND       make -j$(nproc) ${COMMON_ARGS} ${OPTIONAL_ARGS}
-        INSTALL_COMMAND     make install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR> 
-        UPDATE_COMMAND      ""
-        CONFIGURE_COMMAND   ""
-        BUILD_BYPRODUCTS    ${CBLAS_LIBRARIES}
-    )
-ELSE(NOT WIN32)
-    SET(CBLAS_LIBRARIES
-        "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
-        CACHE FILEPATH "openblas library." FORCE)
-    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include/openblas" CACHE PATH "openblas include directory." FORCE)
-    ExternalProject_Add(
-        extern_openblas
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY      ${CBLAS_REPOSITORY}
-        GIT_TAG             ${CBLAS_TAG}
-        PREFIX              ${CBLAS_PREFIX_DIR}
-        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
-        BUILD_IN_SOURCE     0
-        UPDATE_COMMAND      ""
-        CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                            -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                            -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                            -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
-                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                            -DBUILD_SHARED_LIBS=ON
-                            -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
-                            ${EXTERNAL_OPTIONAL_ARGS}
-        CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
-                            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        # ninja need to know where openblas.lib comes from
-        BUILD_BYPRODUCTS    ${CBLAS_LIBRARIES}
-        )
-    SET(OPENBLAS_SHARED_LIB  ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
-ENDIF(NOT WIN32)
+  set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
+  ExternalProject_Add(
+    extern_openblas
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${CBLAS_REPOSITORY}
+    GIT_TAG ${CBLAS_TAG}
+    PREFIX ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND make -j$(nproc) ${COMMON_ARGS} ${OPTIONAL_ARGS}
+    INSTALL_COMMAND make install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+else(NOT WIN32)
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include/openblas"
+      CACHE PATH "openblas include directory." FORCE)
+  ExternalProject_Add(
+    extern_openblas
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY ${CBLAS_REPOSITORY}
+    GIT_TAG ${CBLAS_TAG}
+    PREFIX ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 0
+    UPDATE_COMMAND ""
+    CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+               -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+               -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
+               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+               -DBUILD_SHARED_LIBS=ON
+               -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
+               ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    # ninja need to know where openblas.lib comes from
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+  set(OPENBLAS_SHARED_LIB
+      ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
+endif(NOT WIN32)
diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake
index 2fc22578cae9d..8252b2a73e943 100644
--- a/cmake/external/paddle2onnx.cmake
+++ b/cmake/external/paddle2onnx.cmake
@@ -16,84 +16,91 @@ if(NOT WITH_ONNXRUNTIME)
   return()
 endif()
 
-if (WITH_ARM)
+if(WITH_ARM)
   message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu")
   return()
-endif ()
+endif()
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(PADDLE2ONNX_PROJECT        "extern_paddle2onnx")
-SET(PADDLE2ONNX_PREFIX_DIR     ${THIRD_PARTY_PATH}/paddle2onnx)
-SET(PADDLE2ONNX_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/paddle2onnx)
-SET(PADDLE2ONNX_INC_DIR        "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE)
-SET(PADDLE2ONNX_REPOSITORY     ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git)
-SET(PADDLE2ONNX_TAG            cpp)
-SET(LIBDIR "lib")
-SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}")
+set(PADDLE2ONNX_PROJECT "extern_paddle2onnx")
+set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx)
+set(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx)
+set(PADDLE2ONNX_INC_DIR
+    "${PADDLE2ONNX_INSTALL_DIR}/include"
+    CACHE PATH "paddle2onnx include directory." FORCE)
+set(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git)
+set(PADDLE2ONNX_TAG cpp)
+set(LIBDIR "lib")
+set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}"
+                      "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}")
 
-INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers.
+# For PADDLE2ONNX code to include internal headers.
+include_directories(${PADDLE2ONNX_INC_DIR})
 if(WIN32)
-    SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE)
-    SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE)
+  set(PADDLE2ONNX_LIB
+      "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib"
+      CACHE FILEPATH "paddle2onnx static library." FORCE)
+  set(PADDLE2ONNX_SHARED_LIB
+      "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll"
+      CACHE FILEPATH "paddle2onnx shared library." FORCE)
 elseif(APPLE)
-    SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
+  set(PADDLE2ONNX_LIB
+      "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib"
+      CACHE FILEPATH "PADDLE2ONNX library." FORCE)
 else()
-    SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
+  set(PADDLE2ONNX_LIB
+      "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so"
+      CACHE FILEPATH "PADDLE2ONNX library." FORCE)
 endif(WIN32)
 
-
 # The protoc path is required to compile onnx.
 string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE})
 list(POP_BACK PROTOC_BIN_PATH)
 list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH)
 
-
 set(PADDLE2ONNX_OPTIONAL_ARGS
-      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-      -DCMAKE_CXX_STANDARD=14
-      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-      -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-      -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-      -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
-      -DWITH_STATIC=OFF
-      -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
-      -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
-      -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}
-      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-      -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-      ${EXTERNAL_OPTIONAL_ARGS}
-)
-
-if (WITH_PYTHON)
-  set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
-    -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE}
-    -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR}
-    -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}
-  )
-endif ()
+    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    -DCMAKE_CXX_STANDARD=14
+    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+    -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
+    -DWITH_STATIC=OFF
+    -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
+    -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
+    -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+    ${EXTERNAL_OPTIONAL_ARGS})
 
+if(WITH_PYTHON)
+  set(PADDLE2ONNX_OPTIONAL_ARGS
+      ${PADDLE2ONNX_OPTIONAL_ARGS}
+      -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE}
+      -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR}
+      -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY})
+endif()
 
 ExternalProject_Add(
-    ${PADDLE2ONNX_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY      ${PADDLE2ONNX_REPOSITORY}
-    GIT_TAG             ${PADDLE2ONNX_TAG}
-    DEPENDS             protobuf
-    PREFIX              ${PADDLE2ONNX_PREFIX_DIR}
-    UPDATE_COMMAND      ""
-    CMAKE_ARGS       ${PADDLE2ONNX_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR}
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS    ${PADDLE2ONNX_LIB}
-)
+  ${PADDLE2ONNX_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY}
+  GIT_TAG ${PADDLE2ONNX_TAG}
+  DEPENDS protobuf
+  PREFIX ${PADDLE2ONNX_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB})
 
-ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB})
-ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT})
+add_library(paddle2onnx STATIC IMPORTED GLOBAL)
+set_property(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB})
+add_dependencies(paddle2onnx ${PADDLE2ONNX_PROJECT})
diff --git a/cmake/external/pocketfft.cmake b/cmake/external/pocketfft.cmake
index 7323f67d115e1..2d809bbcf03ec 100644
--- a/cmake/external/pocketfft.cmake
+++ b/cmake/external/pocketfft.cmake
@@ -14,30 +14,29 @@
 
 include(ExternalProject)
 
+set(POCKETFFT_PATH
+    "${THIRD_PARTY_PATH}/pocketfft"
+    CACHE STRING "A path setting for external_pocketfft path.")
+set(POCKETFFT_PREFIX_DIR ${POCKETFFT_PATH})
 
-set(POCKETFFT_PATH           "${THIRD_PARTY_PATH}/pocketfft" CACHE STRING "A path setting for external_pocketfft path.")
-set(POCKETFFT_PREFIX_DIR     ${POCKETFFT_PATH})
+set(POCKETFFT_REPOSITORY https://gitlab.mpcdf.mpg.de/mtr/pocketfft.git)
+set(POCKETFFT_TAG release_for_eigen)
 
-set(POCKETFFT_REPOSITORY  https://gitlab.mpcdf.mpg.de/mtr/pocketfft.git)
-set(POCKETFFT_TAG         release_for_eigen)
-
-SET(POCKETFFT_INCLUDE_DIR  ${POCKETFFT_PREFIX_DIR}/src)
+set(POCKETFFT_INCLUDE_DIR ${POCKETFFT_PREFIX_DIR}/src)
 message("POCKETFFT_INCLUDE_DIR is ${POCKETFFT_INCLUDE_DIR}")
 include_directories(${POCKETFFT_INCLUDE_DIR})
 
 ExternalProject_Add(
   extern_pocketfft
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  ${SHALLOW_CLONE}
-  GIT_REPOSITORY  ${POCKETFFT_REPOSITORY}
-  GIT_TAG         ${POCKETFFT_TAG}
-  PREFIX          ${POCKETFFT_PREFIX_DIR}
-  UPDATE_COMMAND    ""
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${POCKETFFT_REPOSITORY}
+  GIT_TAG ${POCKETFFT_TAG}
+  PREFIX ${POCKETFFT_PREFIX_DIR}
+  UPDATE_COMMAND ""
   CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
 
 add_library(pocketfft INTERFACE)
 
diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake
index 8b2de14e96620..7589059e7b3e7 100644
--- a/cmake/external/poplar.cmake
+++ b/cmake/external/poplar.cmake
@@ -14,7 +14,12 @@
 
 macro(find_popart_version popart_version_file)
   file(READ ${popart_version_file} popart_version_file_content)
-  string(REGEX MATCH "(POPART_VERSION_STRING)[ \t\r\n](\")([0-9]+\.[0-9]+\.[0-9]+)(\\+)([A-Za-z0-9_]*)(\")" POPART_VERSION ${popart_version_file_content})
+  string(
+    REGEX
+      MATCH
+      "(POPART_VERSION_STRING)[ \t\r\n](\")([0-9]+\.[0-9]+\.[0-9]+)(\\+)([A-Za-z0-9_]*)(\")"
+      POPART_VERSION
+      ${popart_version_file_content})
   string(REPLACE "POPART_VERSION_STRING" "" POPART_VERSION "${POPART_VERSION}")
   string(REPLACE "\"" "" POPART_VERSION "${POPART_VERSION}")
   string(REPLACE " " "" POPART_VERSION "${POPART_VERSION}")
@@ -28,7 +33,11 @@ endmacro()
 if(WITH_IPU)
   set(POPLAR_DIR CACHE PATH "Path to a Poplar install")
   set(POPART_DIR CACHE PATH "Path to a Popart install")
-  set(POPLAR_SDK_DIR CACHE PATH "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)")
+  set(POPLAR_SDK_DIR
+      CACHE
+        PATH
+        "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)"
+  )
 
   # support setting SDK both from environment variable or command line arguments
 
@@ -36,10 +45,15 @@ if(WITH_IPU)
     set(POPLAR_SDK_DIR $ENV{POPLAR_SDK_DIR})
   endif()
   if(EXISTS ${POPLAR_SDK_DIR})
-    execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "popart*"
-      OUTPUT_VARIABLE POPART_DIR OUTPUT_STRIP_TRAILING_WHITESPACE)
-    execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "poplar-*" -o -name "poplar"
-      OUTPUT_VARIABLE POPLAR_DIR OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "popart*"
+      OUTPUT_VARIABLE POPART_DIR
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "poplar-*" -o
+              -name "poplar"
+      OUTPUT_VARIABLE POPLAR_DIR
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
   endif()
   if(DEFINED ENV{POPLAR_DIR})
     set(POPLAR_DIR $ENV{POPLAR_DIR})
@@ -51,7 +65,10 @@ if(WITH_IPU)
   if(EXISTS ${POPLAR_DIR})
     message("POPLAR_DIR is ${POPLAR_DIR}")
     if(NOT IS_DIRECTORY "${POPLAR_DIR}")
-      message(FATAL_ERROR "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${POPLAR_SDK_DIR}'")
+      message(
+        FATAL_ERROR
+          "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${POPLAR_SDK_DIR}'"
+      )
     endif()
     list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR})
     set(ENABLE_POPLAR_CMD "source ${POPLAR_DIR}/enable.sh")
@@ -60,12 +77,16 @@ if(WITH_IPU)
     link_directories("${POPLAR_DIR}/lib")
   endif()
   if(NOT poplar_FOUND)
-      message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install")
+    message(
+      FATAL_ERROR
+        "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install"
+    )
   endif()
   if(EXISTS ${POPART_DIR})
     message("POPART_DIR is ${POPART_DIR}")
     if(NOT IS_DIRECTORY "${POPART_DIR}")
-      message(FATAL_ERROR "Couldn't find a \"popart*\" folder in '${POPLAR_SDK_DIR}'")
+      message(
+        FATAL_ERROR "Couldn't find a \"popart*\" folder in '${POPLAR_SDK_DIR}'")
     endif()
     list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR})
     set(ENABLE_POPART_CMD "source ${POPART_DIR}/enable.sh")
@@ -74,7 +95,10 @@ if(WITH_IPU)
     link_directories("${POPART_DIR}/lib")
   endif()
   if(NOT popart_FOUND)
-    message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build")
+    message(
+      FATAL_ERROR
+        "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build"
+    )
   endif()
 
   find_popart_version("${POPART_DIR}/include/popart/version.hpp")
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 3a59ea6bc92a2..1368081b58fda 100755
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -12,304 +12,346 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
-IF(NOT WIN32)
-    FIND_PACKAGE(Protobuf QUIET)
-ENDIF(NOT WIN32)
+if(NOT WIN32)
+  find_package(Protobuf QUIET)
+endif(NOT WIN32)
 
-UNSET_VAR(PROTOBUF_INCLUDE_DIR)
-UNSET_VAR(PROTOBUF_FOUND)
-UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
-UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
-UNSET_VAR(PROTOBUF_LITE_LIBRARY)
-UNSET_VAR(PROTOBUF_LIBRARY)
-UNSET_VAR(PROTOBUF_INCLUDE_DIR)
-UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
+unset_var(PROTOBUF_INCLUDE_DIR)
+unset_var(PROTOBUF_FOUND)
+unset_var(PROTOBUF_PROTOC_EXECUTABLE)
+unset_var(PROTOBUF_PROTOC_LIBRARY)
+unset_var(PROTOBUF_LITE_LIBRARY)
+unset_var(PROTOBUF_LIBRARY)
+unset_var(PROTOBUF_INCLUDE_DIR)
+unset_var(Protobuf_PROTOC_EXECUTABLE)
 function(protobuf_generate_python SRCS)
-    # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
-    if(NOT ARGN)
-        message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
-        return()
-    endif()
-
-    if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
-        # Create an include path for each file specified
-        foreach(FIL ${ARGN})
-            get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-            get_filename_component(ABS_PATH ${ABS_FIL} PATH)
-            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-            if(${_contains_already} EQUAL -1)
-                list(APPEND _protobuf_include_path -I ${ABS_PATH})
-            endif()
-        endforeach()
-    else()
-        set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
-    endif()
-    if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
-        set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
-    endif()
+  # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+  if(NOT ARGN)
+    message(
+      SEND_ERROR
+        "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+    return()
+  endif()
 
-    if(DEFINED Protobuf_IMPORT_DIRS)
-        foreach(DIR ${Protobuf_IMPORT_DIRS})
-            get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
-            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-            if(${_contains_already} EQUAL -1)
-                list(APPEND _protobuf_include_path -I ${ABS_PATH})
-            endif()
-        endforeach()
-    endif()
-
-    set(${SRCS})
+  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+    # Create an include path for each file specified
     foreach(FIL ${ARGN})
-        get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-        get_filename_component(FIL_WE ${FIL} NAME_WE)
-        if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
-            get_filename_component(FIL_DIR ${FIL} DIRECTORY)
-            if(FIL_DIR)
-                set(FIL_WE "${FIL_DIR}/${FIL_WE}")
-            endif()
-        endif()
-        list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
-        add_custom_command(
-                OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
-                COMMAND  ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
-                DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
-                COMMENT "Running Python protocol buffer compiler on ${FIL}"
-                VERBATIM )
+      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+      get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+      if(${_contains_already} EQUAL -1)
+        list(APPEND _protobuf_include_path -I ${ABS_PATH})
+      endif()
+    endforeach()
+  else()
+    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+  endif()
+  if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+    set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+  endif()
+
+  if(DEFINED Protobuf_IMPORT_DIRS)
+    foreach(DIR ${Protobuf_IMPORT_DIRS})
+      get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+      if(${_contains_already} EQUAL -1)
+        list(APPEND _protobuf_include_path -I ${ABS_PATH})
+      endif()
     endforeach()
+  endif()
 
-    set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+  set(${SRCS})
+  foreach(FIL ${ARGN})
+    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+      get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+      if(FIL_DIR)
+        set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+      endif()
+    endif()
+    list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out
+              ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+      DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
+      COMMENT "Running Python protocol buffer compiler on ${FIL}"
+      VERBATIM)
+  endforeach()
+
+  set(${SRCS}
+      ${${SRCS}}
+      PARENT_SCOPE)
 endfunction()
 
 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.
 macro(PROMPT_PROTOBUF_LIB)
-    SET(protobuf_DEPS ${ARGN})
+  set(protobuf_DEPS ${ARGN})
 
-    MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
-    MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
-    MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
-    MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
-    MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
-    INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+  message(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+  message(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
+  message(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
+  message(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
+  message(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
+  include_directories(${PROTOBUF_INCLUDE_DIR})
 
-    # Assuming that all the protobuf libraries are of the same type.
-    IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
-        SET(protobuf_LIBTYPE STATIC)
-    ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
-        SET(protobuf_LIBTYPE SHARED)
-    ELSE()
-        MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
-    ENDIF()
+  # Assuming that all the protobuf libraries are of the same type.
+  if(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+    set(protobuf_LIBTYPE STATIC)
+  elseif(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
+    set(protobuf_LIBTYPE SHARED)
+  else()
+    message(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
+  endif()
 
-    ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
+  add_library(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+  set_property(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
 
-    ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
+  add_library(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+  set_property(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION
+                                             ${PROTOBUF_LITE_LIBRARY})
 
-    ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
+  add_library(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+  set_property(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_LIBRARY})
 
-    ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
-    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
-    # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
-    # make `protobuf_generate_cpp` happy.
-    SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
+  add_executable(protoc IMPORTED GLOBAL)
+  set_property(TARGET protoc PROPERTY IMPORTED_LOCATION
+                                      ${PROTOBUF_PROTOC_EXECUTABLE})
+  # FindProtobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`; set it as well
+  # to make `protobuf_generate_cpp` happy.
+  set(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
 
-    FOREACH(dep ${protobuf_DEPS})
-        ADD_DEPENDENCIES(protobuf ${dep})
-        ADD_DEPENDENCIES(protobuf_lite ${dep})
-        ADD_DEPENDENCIES(libprotoc ${dep})
-        ADD_DEPENDENCIES(protoc ${dep})
-    ENDFOREACH()
+  foreach(dep ${protobuf_DEPS})
+    add_dependencies(protobuf ${dep})
+    add_dependencies(protobuf_lite ${dep})
+    add_dependencies(libprotoc ${dep})
+    add_dependencies(protoc ${dep})
+  endforeach()
 
-    RETURN()
+  return()
 endmacro()
 macro(SET_PROTOBUF_VERSION)
-    EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
-    STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
+  execute_process(
+    COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
+    OUTPUT_VARIABLE PROTOBUF_VERSION
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  string(REGEX MATCH "[0-9]+\\.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
 endmacro()
 
-set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
-IF (WIN32)
-    SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
-ENDIF(WIN32)
+set(PROTOBUF_ROOT
+    ""
+    CACHE PATH "Folder contains protobuf")
+if(WIN32)
+  set(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
+endif(WIN32)
 
-if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
-    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
-        SET(PROTOBUF_FOUND true)
-        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
-        SET_PROTOBUF_VERSION()
-        PROMPT_PROTOBUF_LIB()
-    endif()
+if(NOT "${PROTOBUF_ROOT}" STREQUAL "")
+  find_path(
+    PROTOBUF_INCLUDE_DIR google/protobuf/message.h
+    PATHS ${PROTOBUF_ROOT}/include
+    NO_DEFAULT_PATH)
+  find_library(
+    PROTOBUF_LIBRARY protobuf libprotobuf.lib
+    PATHS ${PROTOBUF_ROOT}/lib
+    NO_DEFAULT_PATH)
+  find_library(
+    PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib
+    PATHS ${PROTOBUF_ROOT}/lib
+    NO_DEFAULT_PATH)
+  find_library(
+    PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib
+    PATHS ${PROTOBUF_ROOT}/lib
+    NO_DEFAULT_PATH)
+  find_program(
+    PROTOBUF_PROTOC_EXECUTABLE protoc
+    PATHS ${PROTOBUF_ROOT}/bin
+    NO_DEFAULT_PATH)
+  if(PROTOBUF_INCLUDE_DIR
+     AND PROTOBUF_LIBRARY
+     AND PROTOBUF_LITE_LIBRARY
+     AND PROTOBUF_PROTOC_LIBRARY
+     AND PROTOBUF_PROTOC_EXECUTABLE)
+    set(PROTOBUF_FOUND true)
+    message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
+    set_protobuf_version()
+    prompt_protobuf_lib()
+  endif()
 endif()
 
-FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-    STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
-    SET(PROTOBUF_PREFIX_DIR  ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
-    SET(PROTOBUF_SOURCE_DIR  ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}/src/${TARGET_NAME})
-    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
-
-    SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-    SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-    SET(${TARGET_NAME}_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
-         PARENT_SCOPE)
-    SET(${TARGET_NAME}_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
-         PARENT_SCOPE)
-    SET(${TARGET_NAME}_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
-         PARENT_SCOPE)
-    SET(${TARGET_NAME}_PROTOC_EXECUTABLE
-        "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
-         PARENT_SCOPE)
+function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
+  string(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
+  set(PROTOBUF_PREFIX_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
+  set(PROTOBUF_SOURCE_DIR
+      ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}/src/${TARGET_NAME})
+  set(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
 
-    SET(OPTIONAL_CACHE_ARGS "")
-    SET(OPTIONAL_ARGS "")
-    IF(BUILD_FOR_HOST)
-        SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF")
-    ELSE()
-        SET(OPTIONAL_ARGS
-            "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
-            "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
-            "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
-            "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
-            "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
-            "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-            "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
-            "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
-            "-Dprotobuf_WITH_ZLIB=ON"
-            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
-            ${EXTERNAL_OPTIONAL_ARGS})
-        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
-    ENDIF()
-    IF(WIN32)
-        SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} 
-            "-DCMAKE_GENERATOR=${CMAKE_GENERATOR}"
-            "-DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM}"
-            "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
-    ENDIF()
+  set(${TARGET_NAME}_INCLUDE_DIR
+      "${PROTOBUF_INSTALL_DIR}/include"
+      PARENT_SCOPE)
+  set(PROTOBUF_INCLUDE_DIR
+      "${PROTOBUF_INSTALL_DIR}/include"
+      PARENT_SCOPE)
+  set(${TARGET_NAME}_LITE_LIBRARY
+      "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      PARENT_SCOPE)
+  set(${TARGET_NAME}_LIBRARY
+      "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      PARENT_SCOPE)
+  set(${TARGET_NAME}_PROTOC_LIBRARY
+      "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      PARENT_SCOPE)
+  set(${TARGET_NAME}_PROTOC_EXECUTABLE
+      "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
+      PARENT_SCOPE)
 
+  set(OPTIONAL_CACHE_ARGS "")
+  set(OPTIONAL_ARGS "")
+  if(BUILD_FOR_HOST)
+    set(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF")
+  else()
+    set(OPTIONAL_ARGS
+        "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+        "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
+        "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
+        "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
+        "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
+        "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
+        "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
+        "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
+        "-Dprotobuf_WITH_ZLIB=ON"
+        "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
+        ${EXTERNAL_OPTIONAL_ARGS})
+    set(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
+  endif()
+  if(WIN32)
+    set(OPTIONAL_ARGS
+        ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR=${CMAKE_GENERATOR}"
+        "-DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM}"
+        "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
+  endif()
 
-    if(WITH_ONNXRUNTIME)
-        SET(PROTOBUF_REPOSITORY  ${GIT_URL}/protocolbuffers/protobuf.git)
-        SET(PROTOBUF_TAG         v3.18.0)
-    elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
-        SET(PROTOBUF_REPOSITORY  https://gitee.com/tianjianhe/protobuf.git)
-        SET(PROTOBUF_TAG         v3.8.0)
-    elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-        SET(PROTOBUF_REPOSITORY  https://gitee.com/tianjianhe/protobuf.git)
-        SET(PROTOBUF_TAG         v3.8.0)
-    elseif(WITH_IPU)
-        SET(PROTOBUF_REPOSITORY  ${GIT_URL}/protocolbuffers/protobuf.git)
-        SET(PROTOBUF_TAG         d750fbf648256c7c631f51ffdbf67d7c18b0114e)
-    elseif(WIN32)
-        SET(PROTOBUF_REPOSITORY  ${GIT_URL}/protocolbuffers/protobuf.git)
-        # Change the tag to support building with vs2019
-        SET(PROTOBUF_TAG         01a05a53f40ca2ac5f0af10c6cc0810bee39b792)
-    else()
-        SET(PROTOBUF_REPOSITORY  ${GIT_URL}/protocolbuffers/protobuf.git)
-        SET(PROTOBUF_TAG         9f75c5aa851cd877fb0d93ccc31b8567a6706546)
-    endif()
-    if(WITH_ARM_BRPC)
-        SET(ARM_PROTOBUF_URL "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_protobuf.tar.gz" CACHE STRING "" FORCE)
-        FILE(WRITE ${PROTOBUF_SOURCE_DIR}/CMakeLists.txt
-        "PROJECT(ARM_PROTOBUF)\n"
-        "cmake_minimum_required(VERSION 3.0)\n"
-        "install(DIRECTORY arm_protobuf/bin  arm_protobuf/include arm_protobuf/lib \n"
-	"        DESTINATION . USE_SOURCE_PERMISSIONS)\n")
-        ExternalProject_Add(
-            ${TARGET_NAME}
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            ${SHALLOW_CLONE}
-            PREFIX          ${PROTOBUF_PREFIX_DIR}
-            DOWNLOAD_DIR          ${PROTOBUF_SOURCE_DIR}
-            DOWNLOAD_COMMAND     rm -rf arm_protobuf.tar.gz
-                                 && wget --no-check-certificate ${ARM_PROTOBUF_URL}
-                                 && tar zxvf arm_protobuf.tar.gz
-            #DOWNLOAD_COMMAND    cp /home/wangbin44/Paddle/build/arm_protobuf.tar.gz .
-            #                    && tar zxvf arm_protobuf.tar.gz
-            UPDATE_COMMAND  ""
-            CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-                                -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-            CMAKE_CACHE_ARGS
-                                -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-                                -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}
-        )
-    else()
-        ExternalProject_Add(
-            ${TARGET_NAME}
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            ${SHALLOW_CLONE}
-            GIT_REPOSITORY  ${PROTOBUF_REPOSITORY}
-            GIT_TAG         ${PROTOBUF_TAG}
-            PREFIX          ${PROTOBUF_PREFIX_DIR}
-            UPDATE_COMMAND  ""
-            DEPENDS         zlib
-            CONFIGURE_COMMAND
-                            ${CMAKE_COMMAND} ${PROTOBUF_SOURCE_DIR}/cmake
-                            ${OPTIONAL_ARGS}
-                            -Dprotobuf_BUILD_TESTS=OFF
-                            -DCMAKE_SKIP_RPATH=ON
-                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-                            -DCMAKE_INSTALL_LIBDIR=lib
-                            -DBUILD_SHARED_LIBS=OFF
-            CMAKE_CACHE_ARGS
-                            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-                            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-                            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-                            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                            ${OPTIONAL_CACHE_ARGS}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}
-            BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}
-        )
-    endif()
-ENDFUNCTION()
+  if(WITH_ONNXRUNTIME)
+    set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
+    set(PROTOBUF_TAG v3.18.0)
+  elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
+    set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
+    set(PROTOBUF_TAG v3.8.0)
+  elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
+    set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
+    set(PROTOBUF_TAG v3.8.0)
+  elseif(WITH_IPU)
+    set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
+    set(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e)
+  elseif(WIN32)
+    set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
+    # Change the tag to support building with vs2019
+    set(PROTOBUF_TAG 01a05a53f40ca2ac5f0af10c6cc0810bee39b792)
+  else()
+    set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
+    set(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
+  endif()
+  if(WITH_ARM_BRPC)
+    set(ARM_PROTOBUF_URL
+        "https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/arm_protobuf.tar.gz"
+        CACHE STRING "" FORCE)
+    file(
+      WRITE ${PROTOBUF_SOURCE_DIR}/CMakeLists.txt
+      "PROJECT(ARM_PROTOBUF)\n"
+      "cmake_minimum_required(VERSION 3.0)\n"
+      "install(DIRECTORY arm_protobuf/bin  arm_protobuf/include arm_protobuf/lib \n"
+      "        DESTINATION . USE_SOURCE_PERMISSIONS)\n")
+    ExternalProject_Add(
+      ${TARGET_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+      PREFIX ${PROTOBUF_PREFIX_DIR}
+      DOWNLOAD_DIR ${PROTOBUF_SOURCE_DIR}
+      DOWNLOAD_COMMAND rm -rf arm_protobuf.tar.gz && wget --no-check-certificate
+                       ${ARM_PROTOBUF_URL} && tar zxvf arm_protobuf.tar.gz
+      #DOWNLOAD_COMMAND    cp /home/wangbin44/Paddle/build/arm_protobuf.tar.gz .
+      #                    && tar zxvf arm_protobuf.tar.gz
+      UPDATE_COMMAND ""
+      CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
+                 -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+      CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
+                       -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX})
+  else()
+    ExternalProject_Add(
+      ${TARGET_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+      GIT_REPOSITORY ${PROTOBUF_REPOSITORY}
+      GIT_TAG ${PROTOBUF_TAG}
+      PREFIX ${PROTOBUF_PREFIX_DIR}
+      UPDATE_COMMAND ""
+      DEPENDS zlib
+      CONFIGURE_COMMAND
+        ${CMAKE_COMMAND} ${PROTOBUF_SOURCE_DIR}/cmake ${OPTIONAL_ARGS}
+        -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_SKIP_RPATH=ON
+        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+        -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+        -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=OFF
+      CMAKE_CACHE_ARGS
+        -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
+        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+        ${OPTIONAL_CACHE_ARGS}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}
+      BUILD_BYPRODUCTS
+        ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX})
+  endif()
+endfunction()
 
 if(WITH_ONNXRUNTIME)
-    SET(PROTOBUF_VERSION 3.18.0)
+  set(PROTOBUF_VERSION 3.18.0)
 elseif(WITH_ASCEND OR WITH_ASCEND_CL)
-    SET(PROTOBUF_VERSION 3.8.0)
+  set(PROTOBUF_VERSION 3.8.0)
 elseif(WITH_IPU)
-    SET(PROTOBUF_VERSION 3.6.1)
+  set(PROTOBUF_VERSION 3.6.1)
 elseif(WITH_ARM_BRPC)
-    SET(PROTOBUF_VERSION 3.7.1-baidu-ee-common)
+  set(PROTOBUF_VERSION 3.7.1-baidu-ee-common)
 else()
-    SET(PROTOBUF_VERSION 3.1.0)
+  set(PROTOBUF_VERSION 3.1.0)
 endif()
 
-IF(NOT PROTOBUF_FOUND)
-    build_protobuf(extern_protobuf FALSE)
+if(NOT PROTOBUF_FOUND)
+  build_protobuf(extern_protobuf FALSE)
 
-    SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR}
-        CACHE PATH "protobuf include directory." FORCE)
-    SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY}
-        CACHE FILEPATH "protobuf lite library." FORCE)
-    SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY}
-        CACHE FILEPATH "protobuf library." FORCE)
-    SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
-        CACHE FILEPATH "protoc library." FORCE)
+  set(PROTOBUF_INCLUDE_DIR
+      ${extern_protobuf_INCLUDE_DIR}
+      CACHE PATH "protobuf include directory." FORCE)
+  set(PROTOBUF_LITE_LIBRARY
+      ${extern_protobuf_LITE_LIBRARY}
+      CACHE FILEPATH "protobuf lite library." FORCE)
+  set(PROTOBUF_LIBRARY
+      ${extern_protobuf_LIBRARY}
+      CACHE FILEPATH "protobuf library." FORCE)
+  set(PROTOBUF_PROTOC_LIBRARY
+      ${extern_protobuf_PROTOC_LIBRARY}
+      CACHE FILEPATH "protoc library." FORCE)
 
-    SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
-        CACHE FILEPATH "protobuf executable." FORCE)
-    # `EXTERN_PROTOBUF_DEPEND` used in cmake function `proto_library` to ensure
-    # `protoc.exe` existed before calling it.
-    set(EXTERN_PROTOBUF_DEPEND extern_protobuf)
-    PROMPT_PROTOBUF_LIB(extern_protobuf)
-ENDIF(NOT PROTOBUF_FOUND)
+  set(PROTOBUF_PROTOC_EXECUTABLE
+      ${extern_protobuf_PROTOC_EXECUTABLE}
+      CACHE FILEPATH "protobuf executable." FORCE)
+  # `EXTERN_PROTOBUF_DEPEND` is used by the cmake function `proto_library` to
+  # ensure `protoc.exe` exists before it is called.
+  set(EXTERN_PROTOBUF_DEPEND extern_protobuf)
+  prompt_protobuf_lib(extern_protobuf)
+endif(NOT PROTOBUF_FOUND)
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
index 47a83d905e84f..1b1298d6c6c59 100644
--- a/cmake/external/pslib.cmake
+++ b/cmake/external/pslib.cmake
@@ -12,53 +12,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(PSLIB_PROJECT       "extern_pslib")
-IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE)
-  SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE)
-  SET(PSLIB_URL "https://pslib.bj.bcebos.com/pslib.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
-SET(PSLIB_PREFIX_DIR    "${THIRD_PARTY_PATH}/pslib")
-SET(PSLIB_DOWNLOAD_DIR  "${PSLIB_PREFIX_DIR}/src/${PSLIB_PROJECT}")
-SET(PSLIB_DST_DIR       "pslib")
-SET(PSLIB_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(PSLIB_INSTALL_DIR   ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR})
-SET(PSLIB_ROOT          ${PSLIB_INSTALL_DIR})
-SET(PSLIB_INC_DIR       ${PSLIB_ROOT}/include)
-SET(PSLIB_LIB_DIR       ${PSLIB_ROOT}/lib)
-SET(PSLIB_LIB           ${PSLIB_LIB_DIR}/libps.so)
-SET(PSLIB_VERSION_PY    ${PSLIB_DOWNLOAD_DIR}/pslib/version.py)
-SET(PSLIB_IOMP_LIB      ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib")
+set(PSLIB_PROJECT "extern_pslib")
+if((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
+  message(STATUS "use pre defined download url")
+  set(PSLIB_VER
+      "0.1.1"
+      CACHE STRING "" FORCE)
+  set(PSLIB_NAME
+      "pslib"
+      CACHE STRING "" FORCE)
+  set(PSLIB_URL
+      "https://pslib.bj.bcebos.com/pslib.tar.gz"
+      CACHE STRING "" FORCE)
+endif()
+message(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
+set(PSLIB_PREFIX_DIR "${THIRD_PARTY_PATH}/pslib")
+set(PSLIB_DOWNLOAD_DIR "${PSLIB_PREFIX_DIR}/src/${PSLIB_PROJECT}")
+set(PSLIB_DST_DIR "pslib")
+set(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
+set(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR})
+set(PSLIB_ROOT ${PSLIB_INSTALL_DIR})
+set(PSLIB_INC_DIR ${PSLIB_ROOT}/include)
+set(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib)
+set(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so)
+set(PSLIB_VERSION_PY ${PSLIB_DOWNLOAD_DIR}/pslib/version.py)
+set(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) # TODO: what is this?
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib")
 
-INCLUDE_DIRECTORIES(${PSLIB_INC_DIR})
+include_directories(${PSLIB_INC_DIR})
 
-FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(PSLIB)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
+file(
+  WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(PSLIB)\n" "cmake_minimum_required(VERSION 3.0)\n"
   "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n"
   "        DESTINATION ${PSLIB_DST_DIR})\n")
 
 ExternalProject_Add(
-    ${PSLIB_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${PSLIB_PREFIX_DIR}
-    DOWNLOAD_DIR          ${PSLIB_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz
-                          && tar zxvf ${PSLIB_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS      ${PSLIB_LIB}
-)
+  ${PSLIB_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${PSLIB_PREFIX_DIR}
+  DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR}
+  DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O
+                   ${PSLIB_NAME}.tar.gz && tar zxvf ${PSLIB_NAME}.tar.gz
+  DOWNLOAD_NO_PROGRESS 1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
+                   -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${PSLIB_LIB})
 
-ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
-ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
+add_library(pslib SHARED IMPORTED GLOBAL)
+set_property(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
+add_dependencies(pslib ${PSLIB_PROJECT})
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
index 27e2788aa21fe..eef91052a400e 100644
--- a/cmake/external/pslib_brpc.cmake
+++ b/cmake/external/pslib_brpc.cmake
@@ -12,52 +12,61 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(PSLIB_BRPC_PROJECT       "extern_pslib_brpc")
-IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE)
-  SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE)
-  SET(PSLIB_BRPC_URL "https://pslib.bj.bcebos.com/pslib_brpc.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}")
-SET(PSLIB_BRPC_PREFIX_DIR    "${THIRD_PARTY_PATH}/pslib_brpc")
-SET(PSLIB_BRPC_DOWNLOAD_DIR  "${PSLIB_BRPC_PREFIX_DIR}/src/${PSLIB_BRPC_PROJECT}")
-SET(PSLIB_BRPC_DST_DIR       "pslib_brpc")
-SET(PSLIB_BRPC_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(PSLIB_BRPC_INSTALL_DIR   ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR})
-SET(PSLIB_BRPC_ROOT          ${PSLIB_BRPC_INSTALL_DIR})
-SET(PSLIB_BRPC_INC_DIR       ${PSLIB_BRPC_ROOT}/include)
-SET(PSLIB_BRPC_LIB_DIR       ${PSLIB_BRPC_ROOT}/lib)
-SET(PSLIB_BRPC_LIB           ${PSLIB_BRPC_LIB_DIR}/libbrpc.a)
-SET(PSLIB_BRPC_IOMP_LIB      ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib")
+set(PSLIB_BRPC_PROJECT "extern_pslib_brpc")
+if((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL))
+  message(STATUS "use pre defined download url")
+  set(PSLIB_BRPC_VER
+      "0.1.0"
+      CACHE STRING "" FORCE)
+  set(PSLIB_BRPC_NAME
+      "pslib_brpc"
+      CACHE STRING "" FORCE)
+  set(PSLIB_BRPC_URL
+      "https://pslib.bj.bcebos.com/pslib_brpc.tar.gz"
+      CACHE STRING "" FORCE)
+endif()
+message(
+  STATUS
+    "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}")
+set(PSLIB_BRPC_PREFIX_DIR "${THIRD_PARTY_PATH}/pslib_brpc")
+set(PSLIB_BRPC_DOWNLOAD_DIR
+    "${PSLIB_BRPC_PREFIX_DIR}/src/${PSLIB_BRPC_PROJECT}")
+set(PSLIB_BRPC_DST_DIR "pslib_brpc")
+set(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
+set(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR})
+set(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR})
+set(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include)
+set(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib)
+set(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a)
+set(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) # TODO: what is this?
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib")
 
-INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR})
+include_directories(${PSLIB_BRPC_INC_DIR})
 
-FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(PSLIB_BRPC)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
+file(
+  WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(PSLIB_BRPC)\n" "cmake_minimum_required(VERSION 3.0)\n"
   "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n"
   "        DESTINATION ${PSLIB_BRPC_DST_DIR})\n")
 
 ExternalProject_Add(
-    ${PSLIB_BRPC_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${PSLIB_BRPC_PREFIX_DIR}
-    DOWNLOAD_DIR          ${PSLIB_BRPC_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz
-                          && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT}
-                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS      ${PSLIB_BRPC_LIB}
-)
+  ${PSLIB_BRPC_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${PSLIB_BRPC_PREFIX_DIR}
+  DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR}
+  DOWNLOAD_COMMAND
+    wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O
+    ${PSLIB_BRPC_NAME}.tar.gz && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz
+  DOWNLOAD_NO_PROGRESS 1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT}
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT}
+                   -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${PSLIB_BRPC_LIB})
 
-ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
-ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
+add_library(pslib_brpc SHARED IMPORTED GLOBAL)
+set_property(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
+add_dependencies(pslib_brpc ${PSLIB_BRPC_PROJECT})
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
index f87e73081ffb7..e236767cec156 100644
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -14,31 +14,29 @@
 
 include(ExternalProject)
 
-set(PYBIND_PREFIX_DIR     ${THIRD_PARTY_PATH}/pybind)
-SET(PYBIND_REPOSITORY     ${GIT_URL}/pybind/pybind11.git)
-SET(PYBIND_TAG            v2.4.3)
+set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind)
+set(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git)
+set(PYBIND_TAG v2.4.3)
 
 set(PYBIND_INCLUDE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind/include)
 include_directories(${PYBIND_INCLUDE_DIR})
 
 ExternalProject_Add(
-        extern_pybind
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        GIT_REPOSITORY    ${PYBIND_REPOSITORY}
-        GIT_TAG           ${PYBIND_TAG}
-        PREFIX            ${PYBIND_PREFIX_DIR}
-        # If we explicitly leave the `UPDATE_COMMAND` of the ExternalProject_Add
-        # function in CMakeLists blank, it will cause another parameter GIT_TAG
-        # to be modified without triggering incremental compilation, and the
-        # third-party library version changes cannot be incorporated.
-        # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html
-        UPDATE_COMMAND    ""
-        CONFIGURE_COMMAND ""
-        BUILD_COMMAND     ""
-        INSTALL_COMMAND   ""
-        TEST_COMMAND      ""
-)
+  extern_pybind
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${PYBIND_REPOSITORY}
+  GIT_TAG ${PYBIND_TAG}
+  PREFIX ${PYBIND_PREFIX_DIR}
+  # If we explicitly leave the `UPDATE_COMMAND` of the ExternalProject_Add
+  # function in CMakeLists blank, a later change to another parameter, GIT_TAG,
+  # will not trigger an incremental rebuild, so the third-party library
+  # version change is not picked up.
+  # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
 
 add_library(pybind INTERFACE)
 
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index ab3776084136e..bc58c9d7b6c35 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -12,68 +12,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(python_module)
+include(python_module)
 
-FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED)
-FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED)
+find_package(PythonInterp ${PY_VERSION} REQUIRED)
+find_package(PythonLibs ${PY_VERSION} REQUIRED)
 
 if(WIN32)
-    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-"from distutils import sysconfig as s;import sys;import struct;
+  execute_process(
+    COMMAND
+      "${PYTHON_EXECUTABLE}" "-c"
+      "from distutils import sysconfig as s;import sys;import struct;
 print(sys.prefix);
 print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
 "
-            RESULT_VARIABLE _PYTHON_SUCCESS
-            OUTPUT_VARIABLE _PYTHON_VALUES
-            ERROR_VARIABLE _PYTHON_ERROR_VALUE)
+    RESULT_VARIABLE _PYTHON_SUCCESS
+    OUTPUT_VARIABLE _PYTHON_VALUES
+    ERROR_VARIABLE _PYTHON_ERROR_VALUE)
 
-    if(NOT _PYTHON_SUCCESS EQUAL 0)
-        set(PYTHONLIBS_FOUND FALSE)
-        return()
-    endif()
+  if(NOT _PYTHON_SUCCESS EQUAL 0)
+    set(PYTHONLIBS_FOUND FALSE)
+    return()
+  endif()
 
-    # Convert the process output into a list
-    string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
-    string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
-    list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
-    list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
+  # Convert the process output into a list
+  string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
+  string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
+  list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
+  list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
 
-    # Make sure all directory separators are '/'
-    string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
+  # Make sure all directory separators are '/'
+  string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
 
-    set(PYTHON_LIBRARY
-            "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
+  set(PYTHON_LIBRARY "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
 
-    # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
-    # original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
-    if(NOT EXISTS "${PYTHON_LIBRARY}")
-        get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
-        set(PYTHON_LIBRARY
-                "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
-    endif()
+  # When run in a venv, PYTHON_PREFIX points to it, but the libraries remain in the
+  # original Python installation. They may be found relative to PYTHON_INCLUDE_DIR.
+  if(NOT EXISTS "${PYTHON_LIBRARY}")
+    get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
+    set(PYTHON_LIBRARY
+        "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
+  endif()
 
-    # raise an error if the python libs are still not found.
-    if(NOT EXISTS "${PYTHON_LIBRARY}")
-        message(FATAL_ERROR "Python libraries not found")
-    endif()
-    SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
+  # raise an error if the python libs are still not found.
+  if(NOT EXISTS "${PYTHON_LIBRARY}")
+    message(FATAL_ERROR "Python libraries not found")
+  endif()
+  set(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
 endif(WIN32)
 
 # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
-ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
+add_library(python SHARED IMPORTED GLOBAL)
+set_property(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 
-SET(py_env "")
-IF(PYTHONINTERP_FOUND)
-    find_python_module(pip REQUIRED)
-    find_python_module(numpy REQUIRED)
-    find_python_module(wheel REQUIRED)
-    find_python_module(google.protobuf REQUIRED)
-    FIND_PACKAGE(NumPy REQUIRED)
-    IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
-        MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
+set(py_env "")
+if(PYTHONINTERP_FOUND)
+  find_python_module(pip REQUIRED)
+  find_python_module(numpy REQUIRED)
+  find_python_module(wheel REQUIRED)
+  find_python_module(google.protobuf REQUIRED)
+  find_package(NumPy REQUIRED)
+  if(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION}
+                                       VERSION_LESS "3.0.0")
+    message(
+      FATAL_ERROR
+        "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
         "please use pip to upgrade protobuf. pip install -U protobuf")
-    ENDIF()
-ENDIF(PYTHONINTERP_FOUND)
-INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
+  endif()
+endif(PYTHONINTERP_FOUND)
+include_directories(${PYTHON_INCLUDE_DIR})
+include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake
index befbc8138fc50..2e90f50e3cdf2 100644
--- a/cmake/external/rocksdb.cmake
+++ b/cmake/external/rocksdb.cmake
@@ -12,40 +12,44 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb)
-SET(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb)
-SET(ROCKSDB_INCLUDE_DIR "${ROCKSDB_INSTALL_DIR}/include" CACHE PATH "rocksdb include directory." FORCE)
-SET(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." FORCE)
-SET(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
-INCLUDE_DIRECTORIES(${ROCKSDB_INCLUDE_DIR})
+set(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb)
+set(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb)
+set(ROCKSDB_INCLUDE_DIR
+    "${ROCKSDB_INSTALL_DIR}/include"
+    CACHE PATH "rocksdb include directory." FORCE)
+set(ROCKSDB_LIBRARIES
+    "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a"
+    CACHE FILEPATH "rocksdb library." FORCE)
+set(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+include_directories(${ROCKSDB_INCLUDE_DIR})
 
 ExternalProject_Add(
-    extern_rocksdb
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX ${ROCKSDB_PREFIX_DIR}
-    GIT_REPOSITORY "https://github.com/facebook/rocksdb"
-    GIT_TAG v6.10.1
-    UPDATE_COMMAND ""
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DWITH_BZ2=OFF
-               -DWITH_GFLAGS=OFF
-               -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS}
-               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-#    BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a
-    INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ 
-        && cp ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES}
-        && cp -r ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/include ${ROCKSDB_INSTALL_DIR}/
-    BUILD_IN_SOURCE 1
-)
+  extern_rocksdb
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${ROCKSDB_PREFIX_DIR}
+  GIT_REPOSITORY "https://github.com/facebook/rocksdb"
+  GIT_TAG v6.10.1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DWITH_BZ2=OFF
+             -DWITH_GFLAGS=OFF
+             -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS}
+             -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+  #    BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a
+  INSTALL_COMMAND
+    mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ && cp
+    ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES}
+    && cp -r ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/include
+    ${ROCKSDB_INSTALL_DIR}/
+  BUILD_IN_SOURCE 1)
 
-ADD_DEPENDENCIES(extern_rocksdb snappy)
+add_dependencies(extern_rocksdb snappy)
 
-ADD_LIBRARY(rocksdb STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES})
-ADD_DEPENDENCIES(rocksdb extern_rocksdb)
-
-LIST(APPEND external_project_dependencies rocksdb)
+add_library(rocksdb STATIC IMPORTED GLOBAL)
+set_property(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES})
+add_dependencies(rocksdb extern_rocksdb)
 
+list(APPEND external_project_dependencies rocksdb)
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index 42320df13972a..dfb7192a71e66 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -12,58 +12,61 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-include (ExternalProject)
+include(ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
 
 set(SNAPPY_PREFIX_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_INCLUDE_DIR
+    "${SNAPPY_INSTALL_DIR}/include"
+    CACHE PATH "snappy include directory." FORCE)
 
 if(WIN32)
-    SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
-    IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
-        add_custom_command(TARGET extern_snappy POST_BUILD
-                COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib
-                )
-    ENDIF()
-    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+  set(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+  if(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
+    add_custom_command(
+      TARGET extern_snappy
+      POST_BUILD
+      COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib
+              ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib)
+  endif()
+  set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
 else()
-    SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+  set(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 endif()
 
 ExternalProject_Add(
-        extern_snappy
-        GIT_REPOSITORY "https://github.com/google/snappy"
-        GIT_TAG "1.1.7"
-        PREFIX          ${SNAPPY_PREFIX_DIR}
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                        -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DBUILD_TESTING=OFF
-                        -DSNAPPY_BUILD_TESTS:BOOL=OFF
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
-                         -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_BYPRODUCTS ${SNAPPY_LIBRARIES}
-)
+  extern_snappy
+  GIT_REPOSITORY "https://github.com/google/snappy"
+  GIT_TAG "1.1.7"
+  PREFIX ${SNAPPY_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+             -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+             -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DBUILD_TESTING=OFF
+             -DSNAPPY_BUILD_TESTS:BOOL=OFF
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             ${EXTERNAL_OPTIONAL_ARGS}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
+    -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${SNAPPY_LIBRARIES})
 
 add_library(snappy STATIC IMPORTED GLOBAL)
 set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
 
 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)
-
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
index c4d978115bfb2..1047465095f42 100644
--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -12,32 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
+set(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
 if(WITH_ASCEND OR WITH_ASCEND_CL)
-    SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
+  set(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
 else()
-    SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
+  set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
 endif()
-SET(THREADPOOL_TAG        9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
+set(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
 
-SET(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
-INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
+set(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
+include_directories(${THREADPOOL_INCLUDE_DIR})
 
 ExternalProject_Add(
-    extern_threadpool
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY  ${THREADPOOL_REPOSITORY}
-    GIT_TAG         ${THREADPOOL_TAG}
-    PREFIX          ${THREADPOOL_PREFIX_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
+  extern_threadpool
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${THREADPOOL_REPOSITORY}
+  GIT_TAG ${THREADPOOL_TAG}
+  PREFIX ${THREADPOOL_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
 
 add_library(simple_threadpool INTERFACE)
 
diff --git a/cmake/external/utf8proc.cmake b/cmake/external/utf8proc.cmake
index a5de5c15c3b51..13107c03cf171 100644
--- a/cmake/external/utf8proc.cmake
+++ b/cmake/external/utf8proc.cmake
@@ -12,40 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(UTF8PROC_PREFIX_DIR    ${THIRD_PARTY_PATH}/utf8proc)
-SET(UTF8PROC_INSTALL_DIR   ${THIRD_PARTY_PATH}/install/utf8proc)
+set(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc)
+set(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc)
 # As we add extra features for utf8proc, we use the non-official repo
-SET(UTF8PROC_REPOSITORY    ${GIT_URL}/JuliaStrings/utf8proc.git)
-SET(UTF8PROC_TAG           v2.6.1)
+set(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git)
+set(UTF8PROC_TAG v2.6.1)
 
-IF(WIN32)
-  SET(UTF8PROC_LIBRARIES     "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib")
+if(WIN32)
+  set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib")
   add_definitions(-DUTF8PROC_STATIC)
-ELSE(WIN32)
-  SET(UTF8PROC_LIBRARIES     "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a")
-ENDIF(WIN32)
+else(WIN32)
+  set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a")
+endif(WIN32)
 
-INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include)
+include_directories(${UTF8PROC_INSTALL_DIR}/include)
 
 ExternalProject_Add(
   extern_utf8proc
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  ${SHALLOW_CLONE}
-  GIT_REPOSITORY        ${UTF8PROC_REPOSITORY}
-  GIT_TAG               ${UTF8PROC_TAG}
-  PREFIX                ${UTF8PROC_PREFIX_DIR}
-  UPDATE_COMMAND        ""
-  CMAKE_ARGS            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DBUILD_SHARED=ON
-                        -DBUILD_STATIC=ON
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR}
-                        -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-  BUILD_BYPRODUCTS     ${UTF8PROC_LIBRARIES}
-)
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${UTF8PROC_REPOSITORY}
+  GIT_TAG ${UTF8PROC_TAG}
+  PREFIX ${UTF8PROC_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+             -DBUILD_SHARED=ON
+             -DBUILD_STATIC=ON
+             -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+             -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR}
+             -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES})
 
-ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES})
-ADD_DEPENDENCIES(utf8proc extern_utf8proc)
+add_library(utf8proc STATIC IMPORTED GLOBAL)
+set_property(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES})
+add_dependencies(utf8proc extern_utf8proc)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index f0d16fc7978e8..d38636c9c23a8 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,130 +12,139 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-IF(WITH_ROCM)
-    add_definitions(-DWARPCTC_WITH_HIP)
-ENDIF()
+if(WITH_ROCM)
+  add_definitions(-DWARPCTC_WITH_HIP)
+endif()
 
-SET(WARPCTC_PREFIX_DIR  ${THIRD_PARTY_PATH}/warpctc)
-SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-# in case of low internet speed  
+set(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
+set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
+# in case of low internet speed
 #set(WARPCTC_REPOSITORY  https://gitee.com/tianjianhe/warp-ctc.git)
-set(WARPCTC_REPOSITORY  ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG         37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
+set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
 
-SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
+set(WARPCTC_INCLUDE_DIR
+    "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
 # Used in unit test test_WarpCTCLayer
-SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
+set(WARPCTC_LIB_DIR
+    "${WARPCTC_INSTALL_DIR}/lib"
     CACHE PATH "Warp-ctc Library Directory" FORCE)
 
-IF(WIN32)
-    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-            CACHE FILEPATH "Warp-ctc Library" FORCE)
+if(WIN32)
+  set(WARPCTC_LIBRARIES
+      "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "Warp-ctc Library" FORCE)
 else(WIN32)
-    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-            CACHE FILEPATH "Warp-ctc Library" FORCE)
-ENDIF(WIN32)
+  set(WARPCTC_LIBRARIES
+      "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "Warp-ctc Library" FORCE)
+endif(WIN32)
 
-IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
-    SET(USE_OMP OFF)
-ELSE()
-    SET(USE_OMP ON)
-ENDIF()
+if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang"
+   OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"
+   OR WIN32)
+  set(USE_OMP OFF)
+else()
+  set(USE_OMP ON)
+endif()
 
 if(WITH_ASCEND OR WITH_ASCEND_CL)
-    ExternalProject_Add(
-        extern_warpctc
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        GIT_REPOSITORY  ${WARPCTC_REPOSITORY}
-        GIT_TAG         ${WARPCTC_TAG}
-        PREFIX          ${WARPCTC_PREFIX_DIR}
-        #UPDATE_COMMAND  ""
-        PATCH_COMMAND   ""
-        BUILD_ALWAYS    1
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                        -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-                        -DWITH_GPU=${WITH_GPU}
-                        -DWITH_ROCM=${WITH_ROCM}
-                        -DWITH_OMP=${USE_OMP}
-                        -DWITH_TORCH=OFF
-                        -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-                        -DBUILD_SHARED=ON
-                        -DBUILD_TESTS=OFF
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-        CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-        BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}
-    )
+  ExternalProject_Add(
+    extern_warpctc
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
+    GIT_TAG ${WARPCTC_TAG}
+    PREFIX ${WARPCTC_PREFIX_DIR}
+    #UPDATE_COMMAND  ""
+    PATCH_COMMAND ""
+    BUILD_ALWAYS 1
+    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+               -DWITH_GPU=${WITH_GPU}
+               -DWITH_ROCM=${WITH_ROCM}
+               -DWITH_OMP=${USE_OMP}
+               -DWITH_TORCH=OFF
+               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+               -DBUILD_SHARED=ON
+               -DBUILD_TESTS=OFF
+               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+               ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
+    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
 else()
-    if(WIN32)
-        set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
-        set(WARPCTC_C_FLAGS_DEBUG $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-        set(WARPCTC_C_FLAGS_RELEASE $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-        set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
-        set(WARPCTC_CXX_FLAGS_RELEASE $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-        set(WARPCTC_CXX_FLAGS_DEBUG $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-    else()
-        set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
-        set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-        set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
-        set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-        set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
-        set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
-    endif()
-    ExternalProject_Add(
-        extern_warpctc
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        ${SHALLOW_CLONE}
-        GIT_REPOSITORY  ${WARPCTC_REPOSITORY}
-        GIT_TAG         ${WARPCTC_TAG}
-        PREFIX          ${WARPCTC_PREFIX_DIR}
-        UPDATE_COMMAND  ""
-        PATCH_COMMAND   ""
-        #BUILD_ALWAYS    1
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
-                        -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
-                        -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
-                        -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
-                        -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
-                        -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-                        -DWITH_GPU=${WITH_GPU}
-                        -DWITH_ROCM=${WITH_ROCM}
-                        -DWITH_OMP=${USE_OMP}
-                        -DWITH_TORCH=OFF
-                        -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-                        -DBUILD_SHARED=ON
-                        -DBUILD_TESTS=OFF
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-        CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-        BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}
-    )
+  if(WIN32)
+    set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
+    set(WARPCTC_C_FLAGS_DEBUG
+        $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+    set(WARPCTC_C_FLAGS_RELEASE
+        $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+    set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
+    set(WARPCTC_CXX_FLAGS_RELEASE
+        $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+    set(WARPCTC_CXX_FLAGS_DEBUG
+        $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+  else()
+    set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
+    set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+    set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+    set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+    set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+  endif()
+  ExternalProject_Add(
+    extern_warpctc
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
+    GIT_TAG ${WARPCTC_TAG}
+    PREFIX ${WARPCTC_PREFIX_DIR}
+    UPDATE_COMMAND ""
+    PATCH_COMMAND ""
+    #BUILD_ALWAYS    1
+    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+               -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
+               -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
+               -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
+               -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
+               -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
+               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+               -DWITH_GPU=${WITH_GPU}
+               -DWITH_ROCM=${WITH_ROCM}
+               -DWITH_OMP=${USE_OMP}
+               -DWITH_TORCH=OFF
+               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+               -DBUILD_SHARED=ON
+               -DBUILD_TESTS=OFF
+               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+               ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
+    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
 endif()
 
-MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
+message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+include_directories(${WARPCTC_INCLUDE_DIR}
+)# For warpctc code to include its headers.
 
-ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
-ADD_DEPENDENCIES(warpctc extern_warpctc)
+add_library(warpctc SHARED IMPORTED GLOBAL)
+set_property(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
+add_dependencies(warpctc extern_warpctc)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
index 6ad15b3730d1d..589056458c1f0 100644
--- a/cmake/external/xbyak.cmake
+++ b/cmake/external/xbyak.cmake
@@ -14,12 +14,12 @@
 
 include(ExternalProject)
 
-set(XBYAK_PROJECT       extern_xbyak)
-set(XBYAK_PREFIX_DIR    ${THIRD_PARTY_PATH}/xbyak)
-set(XBYAK_INSTALL_ROOT  ${THIRD_PARTY_PATH}/install/xbyak)
-set(XBYAK_INC_DIR       ${XBYAK_INSTALL_ROOT}/include)
-set(XBYAK_REPOSITORY    ${GIT_URL}/herumi/xbyak.git)
-set(XBYAK_TAG           v5.81) # Dec 19, 2019
+set(XBYAK_PROJECT extern_xbyak)
+set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
+set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
+set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include)
+set(XBYAK_REPOSITORY ${GIT_URL}/herumi/xbyak.git)
+set(XBYAK_TAG v5.81) # Dec 19, 2019
 
 include_directories(${XBYAK_INC_DIR})
 include_directories(${XBYAK_INC_DIR}/xbyak)
@@ -31,19 +31,17 @@ add_definitions(-DXBYAK64)
 add_definitions(-DXBYAK_NO_OP_NAMES)
 
 ExternalProject_Add(
-    ${XBYAK_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY      ${XBYAK_REPOSITORY}
-    GIT_TAG             ${XBYAK_TAG}
-    DEPENDS             ""
-    PREFIX              ${XBYAK_PREFIX_DIR}
-    UPDATE_COMMAND      ""
-    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-)
+  ${XBYAK_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${XBYAK_REPOSITORY}
+  GIT_TAG ${XBYAK_TAG}
+  DEPENDS ""
+  PREFIX ${XBYAK_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
+                   -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE})
 
 add_library(xbyak INTERFACE)
 
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 43d5002fe3819..af27500398f57 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -1,127 +1,151 @@
-if (NOT WITH_XPU)
-    return()
+if(NOT WITH_XPU)
+  return()
 endif()
 
-INCLUDE(ExternalProject)
-SET(XPU_PROJECT                 "extern_xpu")
-SET(XPU_API_LIB_NAME            "libxpuapi.so")
-SET(XPU_RT_LIB_NAME             "libxpurt.so")
+include(ExternalProject)
+set(XPU_PROJECT "extern_xpu")
+set(XPU_API_LIB_NAME "libxpuapi.so")
+set(XPU_RT_LIB_NAME "libxpurt.so")
 
 if(NOT DEFINED XPU_BASE_URL)
-  SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220520")
+  set(XPU_BASE_URL_WITHOUT_DATE
+      "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220601")
 else()
-  SET(XPU_BASE_URL "${XPU_BASE_URL}")
+  set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
 
 # ubuntu and centos: use output by XDNN API team
 if(NOT DEFINED XPU_XDNN_BASE_URL)
-  SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220520")
+  set(XPU_XDNN_BASE_URL_WITHOUT_DATE
+      "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220601")
 else()
-  SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()
 
-IF(WITH_AARCH64)
-  SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
-  SET(XPU_XDNN_DIR_NAME "XDNN-kylin_aarch64")
-  SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64")
-  SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-ELSEIF(WITH_SUNWAY)
-  SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64")
-  SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64")
-  SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64")
-  SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-ELSEIF(WITH_BDCENTOS)
-  SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
-  SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64")
-  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+if(WITH_AARCH64)
+  set(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
+  set(XPU_XDNN_DIR_NAME "XDNN-kylin_aarch64")
+  set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64")
+  set(XPU_XDNN_URL
+      "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
+      CACHE STRING "" FORCE)
+elseif(WITH_SUNWAY)
+  set(XPU_XRE_DIR_NAME "xre-deepin_sw6_64")
+  set(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64")
+  set(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64")
+  set(XPU_XDNN_URL
+      "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
+      CACHE STRING "" FORCE)
+elseif(WITH_BDCENTOS)
+  set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
+  set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64")
+  set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
   # ubuntu and centos: use output by XDNN API team
-  SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-ELSEIF(WITH_UBUNTU)
-  SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
-  SET(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64")
-  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+  set(XPU_XDNN_URL
+      "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
+      CACHE STRING "" FORCE)
+elseif(WITH_UBUNTU)
+  set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
+  set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64")
+  set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
   # ubuntu and centos: use output by XDNN API team
-  SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-ELSEIF(WITH_CENTOS)
-  SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
-  SET(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64")
-  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+  set(XPU_XDNN_URL
+      "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
+      CACHE STRING "" FORCE)
+elseif(WITH_CENTOS)
+  set(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
+  set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64")
+  set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
   # ubuntu and centos: use output by XDNN API team
-  SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-ELSE()
-  SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
-  SET(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64")
-  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+  set(XPU_XDNN_URL
+      "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
+      CACHE STRING "" FORCE)
+else()
+  set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
+  set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64")
+  set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
   # default: use output by XDNN API team
-  SET(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-
-SET(XPU_XRE_URL  "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE)
-
-SET(SNAPPY_PREFIX_DIR           "${THIRD_PARTY_PATH}/xpu")
-SET(XPU_DOWNLOAD_DIR            "${SNAPPY_PREFIX_DIR}/src/${XPU_PROJECT}")
-SET(XPU_INSTALL_DIR             "${THIRD_PARTY_PATH}/install/xpu")
-SET(XPU_INC_DIR                 "${THIRD_PARTY_PATH}/install/xpu/include")
-SET(XPU_LIB_DIR                 "${THIRD_PARTY_PATH}/install/xpu/lib")
-
-SET(XPU_API_LIB                 "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
-SET(XPU_RT_LIB                  "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
-
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
+  set(XPU_XDNN_URL
+      "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
+      CACHE STRING "" FORCE)
+endif()
 
-FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(XPU)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
+set(XPU_XRE_URL
+    "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz"
+    CACHE STRING "" FORCE)
+set(XPU_XCCL_URL
+    "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz"
+    CACHE STRING "" FORCE)
+set(XPU_PACK_DEPENCE_URL
+    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh"
+    CACHE STRING "" FORCE)
+
+set(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu")
+set(XPU_DOWNLOAD_DIR "${SNAPPY_PREFIX_DIR}/src/${XPU_PROJECT}")
+set(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
+set(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
+set(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
+
+set(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
+set(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
+
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
+
+file(
+  WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(XPU)\n" "cmake_minimum_required(VERSION 3.0)\n"
   "install(DIRECTORY xpu/include xpu/lib \n"
   "        DESTINATION ${XPU_INSTALL_DIR})\n")
 
 ExternalProject_Add(
-    ${XPU_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${SNAPPY_PREFIX_DIR}
-    DOWNLOAD_DIR          ${XPU_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget ${XPU_PACK_DEPENCE_URL}
-                          && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME}
-
-    DOWNLOAD_NO_PROGRESS  1
-    UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
-    BUILD_BYPRODUCTS      ${XPU_API_LIB}
-    BUILD_BYPRODUCTS      ${XPU_RT_LIB}
-)
-
-INCLUDE_DIRECTORIES(${XPU_INC_DIR})
-ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
+  ${XPU_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${SNAPPY_PREFIX_DIR}
+  DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
+  DOWNLOAD_COMMAND
+    wget ${XPU_PACK_DEPENCE_URL} && bash pack_paddle_depence.sh ${XPU_XRE_URL}
+    ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL}
+    ${XPU_XCCL_DIR_NAME}
+  DOWNLOAD_NO_PROGRESS 1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
+  BUILD_BYPRODUCTS ${XPU_API_LIB}
+  BUILD_BYPRODUCTS ${XPU_RT_LIB})
+
+include_directories(${XPU_INC_DIR})
+add_library(shared_xpuapi SHARED IMPORTED GLOBAL)
 set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
 
 # generate a static dummy target to track xpulib dependencies
 # for cc_library(xxx SRCS xxx.c DEPS xpulib)
 generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
 
-TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
+target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
 
-IF(WITH_XPU_BKCL)
-  MESSAGE(STATUS "Compile with XPU BKCL!")
-  ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL)
+if(WITH_XPU_BKCL)
+  message(STATUS "Compile with XPU BKCL!")
+  add_definitions(-DPADDLE_WITH_XPU_BKCL)
 
-  SET(XPU_BKCL_LIB_NAME         "libbkcl.so")
-  SET(XPU_BKCL_LIB              "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}")
-  SET(XPU_BKCL_INC_DIR          "${THIRD_PARTY_PATH}/install/xpu/include")
-  INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR})
-  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB})
-ELSE(WITH_XPU_BKCL)
-  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
-ENDIF(WITH_XPU_BKCL)
+  set(XPU_BKCL_LIB_NAME "libbkcl.so")
+  set(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}")
+  set(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
+  include_directories(${XPU_BKCL_INC_DIR})
+  target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB})
+else(WITH_XPU_BKCL)
+  target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
+endif(WITH_XPU_BKCL)
 
-ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
+add_dependencies(xpulib ${XPU_PROJECT})
 
 # Ensure that xpu/api.h can be included without dependency errors.
-file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
-add_library(xpu_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
+file(
+  GENERATE
+  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc
+  CONTENT "")
+add_library(xpu_headers_dummy STATIC
+            ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc)
 add_dependencies(xpu_headers_dummy extern_xpu)
 link_libraries(xpu_headers_dummy)
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
index fe17806e36274..6e685bbde402e 100644
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@@ -12,24 +12,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
 set(XXHASH_PREFIX_DIR ${THIRD_PARTY_PATH}/xxhash)
 set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash/src/extern_xxhash)
 set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
 set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
-set(XXHASH_REPOSITORY  ${GIT_URL}/Cyan4973/xxHash.git)
-set(XXHASH_TAG         v0.6.5)
+set(XXHASH_REPOSITORY ${GIT_URL}/Cyan4973/xxHash.git)
+set(XXHASH_TAG v0.6.5)
 
-INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
+include_directories(${XXHASH_INCLUDE_DIR})
 
-IF(APPLE)
-  SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/Makefile && make lib)
-ELSEIF(UNIX)
-  SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/Makefile && make lib)
-ENDIF()
+if(APPLE)
+  set(BUILD_CMD
+      sed
+      -i
+      \"\"
+      "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g"
+      ${XXHASH_SOURCE_DIR}/Makefile
+      &&
+      make
+      lib)
+elseif(UNIX)
+  set(BUILD_CMD
+      sed
+      -i
+      "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g"
+      ${XXHASH_SOURCE_DIR}/Makefile
+      &&
+      make
+      lib)
+endif()
 
-if (WIN32)
+if(WIN32)
   set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
   set(XXHASH_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4710 /wd4711")
   set(XXHASH_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4710 /wd4711")
@@ -37,53 +52,47 @@ else()
   set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
   set(XXHASH_CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
   set(XXHASH_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-endif ()
+endif()
 
 if(WIN32)
   ExternalProject_Add(
-      extern_xxhash
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      ${SHALLOW_CLONE}
-      GIT_REPOSITORY   ${XXHASH_REPOSITORY}
-      GIT_TAG          ${XXHASH_TAG}
-      PREFIX           ${XXHASH_PREFIX_DIR}
-      UPDATE_COMMAND   ""
-      PATCH_COMMAND    ""
-      CONFIGURE_COMMAND
-                      ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/cmake_unofficial
-                      -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR}
-                      -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-                      -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                      -DBUILD_XXHSUM=OFF
-                      -DCMAKE_GENERATOR=${CMAKE_GENERATOR}
-                      -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM}
-                      -DBUILD_SHARED_LIBS=OFF
-                      -DCMAKE_CXX_FLAGS=${XXHASH_CMAKE_CXX_FLAGS}
-                      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-                      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-                      -DCMAKE_C_FLAGS=${XXHASH_CMAKE_C_FLAGS}
-                      -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-                      -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                      ${OPTIONAL_CACHE_ARGS}
-      TEST_COMMAND      ""
-      BUILD_BYPRODUCTS ${XXHASH_LIBRARIES}
-  )
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+    GIT_REPOSITORY ${XXHASH_REPOSITORY}
+    GIT_TAG ${XXHASH_TAG}
+    PREFIX ${XXHASH_PREFIX_DIR}
+    UPDATE_COMMAND ""
+    PATCH_COMMAND ""
+    CONFIGURE_COMMAND
+      ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/cmake_unofficial
+      -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR}
+      -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
+      -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DBUILD_XXHSUM=OFF
+      -DCMAKE_GENERATOR=${CMAKE_GENERATOR}
+      -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM}
+      -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS=${XXHASH_CMAKE_CXX_FLAGS}
+      -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+      -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+      -DCMAKE_C_FLAGS=${XXHASH_CMAKE_C_FLAGS}
+      -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+      -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} ${OPTIONAL_CACHE_ARGS}
+    TEST_COMMAND ""
+    BUILD_BYPRODUCTS ${XXHASH_LIBRARIES})
 else()
   ExternalProject_Add(
-      extern_xxhash
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      GIT_REPOSITORY   ${XXHASH_REPOSITORY}
-      GIT_TAG          ${XXHASH_TAG}
-      PREFIX           ${XXHASH_PREFIX_DIR}
-      UPDATE_COMMAND    ""
-      CONFIGURE_COMMAND ""
-      BUILD_IN_SOURCE   1
-      BUILD_COMMAND     ${BUILD_CMD}
-      INSTALL_COMMAND   make PREFIX=${XXHASH_INSTALL_DIR} install
-      TEST_COMMAND      ""
-      BUILD_BYPRODUCTS  ${XXHASH_LIBRARIES}
-  )
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY ${XXHASH_REPOSITORY}
+    GIT_TAG ${XXHASH_TAG}
+    PREFIX ${XXHASH_PREFIX_DIR}
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND ${BUILD_CMD}
+    INSTALL_COMMAND make PREFIX=${XXHASH_INSTALL_DIR} install
+    TEST_COMMAND ""
+    BUILD_BYPRODUCTS ${XXHASH_LIBRARIES})
 endif()
 
 add_library(xxhash STATIC IMPORTED GLOBAL)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 679e2064699e1..2cef053e32547 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -12,48 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+include(ExternalProject)
 
-SET(ZLIB_PREFIX_DIR ${THIRD_PARTY_PATH}/zlib)
-SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
-SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
-SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
+set(ZLIB_PREFIX_DIR ${THIRD_PARTY_PATH}/zlib)
+set(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
+set(ZLIB_ROOT
+    ${ZLIB_INSTALL_DIR}
+    CACHE FILEPATH "zlib root directory." FORCE)
+set(ZLIB_INCLUDE_DIR
+    "${ZLIB_INSTALL_DIR}/include"
+    CACHE PATH "zlib include directory." FORCE)
 set(ZLIB_REPOSITORY ${GIT_URL}/madler/zlib.git)
-set(ZLIB_TAG        v1.2.8)
+set(ZLIB_TAG v1.2.8)
 
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
-INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
+# For zlib code to include its own headers.
+include_directories(${ZLIB_INCLUDE_DIR})
+# For Paddle code to include zlib.h.
+include_directories(${THIRD_PARTY_PATH}/install)
 
-IF(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
-ELSE(WIN32)
-  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
-ENDIF(WIN32)
+if(WIN32)
+  set(ZLIB_LIBRARIES
+      "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib"
+      CACHE FILEPATH "zlib library." FORCE)
+else(WIN32)
+  set(ZLIB_LIBRARIES
+      "${ZLIB_INSTALL_DIR}/lib/libz.a"
+      CACHE FILEPATH "zlib library." FORCE)
+endif(WIN32)
 
 ExternalProject_Add(
-    extern_zlib
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    ${SHALLOW_CLONE}
-    GIT_REPOSITORY  ${ZLIB_REPOSITORY}
-    GIT_TAG         ${ZLIB_TAG}
-    PREFIX          ${ZLIB_PREFIX_DIR}
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                    -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
-                    -DBUILD_SHARED_LIBS=OFF
-                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DCMAKE_MACOSX_RPATH=ON
-                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
-                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_BYPRODUCTS ${ZLIB_LIBRARIES}
-)
+  extern_zlib
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${ZLIB_REPOSITORY}
+  GIT_TAG ${ZLIB_TAG}
+  PREFIX ${ZLIB_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+             -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+             -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+             -DBUILD_SHARED_LIBS=OFF
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DCMAKE_MACOSX_RPATH=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             ${EXTERNAL_OPTIONAL_ARGS}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${ZLIB_LIBRARIES})
 
-ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
-ADD_DEPENDENCIES(zlib extern_zlib)
+add_library(zlib STATIC IMPORTED GLOBAL)
+set_property(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+add_dependencies(zlib extern_zlib)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0dbd3bc328314..e3c5545df8b27 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -5,33 +5,39 @@ include(CheckCXXSymbolExists)
 include(CheckTypeSize)
 
 function(CheckCompilerCXX14Flag)
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
-            message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
-        elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
-            message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2")
-        endif()
-    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
-        # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
-        # https://gist.github.com/yamaya/2924292
-        if(APPLE)  # cmake < 3.0 compiler id "Clang" on Mac OS X
-            if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1)
-                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
-            endif()
-        else()
-            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
-                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
-            endif()
-        endif()
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
+      message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
+    elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
+      message(
+        WARNING
+          "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2"
+      )
     endif()
+  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID
+                                                        STREQUAL "Clang")
+    # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
+    # Apple Clang is a different compiler than upstream Clang and has different version numbers.
+    # https://gist.github.com/yamaya/2924292
+    if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X
+      if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1)
+        message(
+          FATAL_ERROR
+            "Unsupported AppleClang version. AppleClang >= 5.1 required.")
+      endif()
+    else()
+      if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
+        message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
+      endif()
+    endif()
+  endif()
 endfunction()
 
-CheckCompilerCXX14Flag()
+checkcompilercxx14flag()
 if(NOT WIN32)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
 else()
-    set(CMAKE_CXX_STANDARD 14)
+  set(CMAKE_CXX_STANDARD 14)
 endif()
 
 # safe_set_flag
@@ -42,56 +48,58 @@ endif()
 # flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc
 # rest arguments: not used.
 function(safe_set_flag is_c src_list flag_name)
-    string(REPLACE "-" "_" safe_name ${flag_name})
-    string(REPLACE "=" "_" safe_name ${safe_name})
+  string(REPLACE "-" "_" safe_name ${flag_name})
+  string(REPLACE "=" "_" safe_name ${safe_name})
 
-    if(${flag_name} MATCHES "fsanitize")
-        set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
-        set(CMAKE_REQUIRED_FLAGS ${flag_name})
-    endif()
+  if(${flag_name} MATCHES "fsanitize")
+    set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
+    set(CMAKE_REQUIRED_FLAGS ${flag_name})
+  endif()
 
-    if(is_c)
-        CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-        set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-    else()
-        CHECK_CXX_COMPILER_FLAG(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-        set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-    endif()
-    if(${safe_name})
-        set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
-    endif()
+  if(is_c)
+    check_c_compiler_flag(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
+    set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
+  else()
+    check_cxx_compiler_flag(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name})
+    set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name})
+  endif()
+  if(${safe_name})
+    set(${src_list}
+        "${${src_list}} ${flag_name}"
+        PARENT_SCOPE)
+  endif()
 
-    if(${flag_name} MATCHES "fsanitize")
-        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-    endif()
+  if(${flag_name} MATCHES "fsanitize")
+    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
+  endif()
 endfunction()
 
 # helper macro to set cflag
 macro(safe_set_cflag src_list flag_name)
-    safe_set_flag(ON ${src_list} ${flag_name})
+  safe_set_flag(ON ${src_list} ${flag_name})
 endmacro()
 
 # helper macro to set cxxflag
 macro(safe_set_cxxflag src_list flag_name)
-    safe_set_flag(OFF ${src_list} ${flag_name})
+  safe_set_flag(OFF ${src_list} ${flag_name})
 endmacro()
 
 # helper macro to set nvcc flag
 macro(safe_set_nvflag flag_name)
-    string(REPLACE "-" "_" safe_name ${flag_name})
-    string(REPLACE "=" "_" safe_name ${safe_name})
-    CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-    set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-    if(${safe_name})
-        set(SAFE_GPU_COMMON_FLAGS "${SAFE_GPU_COMMON_FLAGS} -Xcompiler=\"${flag_name}\"")
-    endif()
+  string(REPLACE "-" "_" safe_name ${flag_name})
+  string(REPLACE "=" "_" safe_name ${safe_name})
+  check_c_compiler_flag(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
+  set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
+  if(${safe_name})
+    set(SAFE_GPU_COMMON_FLAGS
+        "${SAFE_GPU_COMMON_FLAGS} -Xcompiler=\"${flag_name}\"")
+  endif()
 endmacro()
 
-
-CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
+check_cxx_symbol_exists(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
   set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
-  CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE)
+  check_cxx_symbol_exists(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE)
   if(UINT64_MAX_EXISTS_HERE)
     set(CMAKE_REQUIRED_DEFINITIONS)
     add_definitions(-D__STDC_LIMIT_MACROS)
@@ -100,152 +108,151 @@ if(NOT UINT64_MAX_EXISTS)
   endif()
 endif()
 
-SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
-CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
-CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
+set(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
+check_type_size(pthread_spinlock_t SPINLOCK_FOUND)
+check_type_size(pthread_barrier_t BARRIER_FOUND)
 if(SPINLOCK_FOUND)
   add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
 endif(SPINLOCK_FOUND)
 if(BARRIER_FOUND)
   add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
 endif(BARRIER_FOUND)
-SET(CMAKE_EXTRA_INCLUDE_FILES "")
+set(CMAKE_EXTRA_INCLUDE_FILES "")
 
 # Only one sanitizer is allowed in compile time
 string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type)
 if(sanitizer_type STREQUAL "address")
-    set(fsanitize "-fsanitize=address")
+  set(fsanitize "-fsanitize=address")
 elseif(sanitizer_type STREQUAL "leak")
-    set(fsanitize "-fsanitize=leak")
+  set(fsanitize "-fsanitize=leak")
 elseif(sanitizer_type STREQUAL "memory")
-    set(fsanitize "-fsanitize=memory")
+  set(fsanitize "-fsanitize=memory")
 elseif(sanitizer_type STREQUAL "thread")
-    set(fsanitize "-fsanitize=thread")
+  set(fsanitize "-fsanitize=thread")
 elseif(sanitizer_type STREQUAL "undefined")
-    set(fsanitize "-fsanitize=undefined")
+  set(fsanitize "-fsanitize=undefined")
 endif()
 
 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
 
 # https://github.com/PaddlePaddle/Paddle/issues/12773
-if (NOT WIN32)
-set(COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Werror
-    -Wall
-    -Wextra
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=literal-suffix
-    -Wno-error=unused-local-typedefs
-    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
-    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
-    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
-    ${fsanitize}
-)
-
-if(WITH_IPU)
-    set(COMMON_FLAGS ${COMMON_FLAGS} 
-        -Wno-sign-compare # Warnings in Popart
-        -Wno-non-virtual-dtor # Warnings in Popart
+if(NOT WIN32)
+  set(COMMON_FLAGS
+      -fPIC
+      -fno-omit-frame-pointer
+      -Werror
+      -Wall
+      -Wextra
+      -Wnon-virtual-dtor
+      -Wdelete-non-virtual-dtor
+      -Wno-unused-parameter
+      -Wno-unused-function
+      -Wno-error=literal-suffix
+      -Wno-error=unused-local-typedefs
+      -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
+      -Wno-error=terminate # Warning in PADDLE_ENFORCE
+      -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
+      -Wimplicit-fallthrough=0 # Warning in tinyformat.h
+      ${fsanitize})
+
+  if(WITH_IPU)
+    set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare # Warnings in Popart
+                     -Wno-non-virtual-dtor # Warnings in Popart
     )
-endif()
+  endif()
 
-if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
+  if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
     set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
-endif()
+  endif()
 
-if(NOT APPLE)
+  if(NOT APPLE)
     if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
-        set(COMMON_FLAGS
-                ${COMMON_FLAGS}
-                -Wno-format-truncation # Warning in boost gcc 8.2
-                -Wno-error=parentheses # Warning in boost gcc 8.2
-                -Wno-error=catch-value # Warning in boost gcc 8.2
-                -Wno-error=nonnull-compare # Warning in boost gcc 8.2
-                -Wno-error=address # Warning in boost gcc 8.2
-                -Wno-ignored-qualifiers # Warning in boost gcc 8.2
-                -Wno-ignored-attributes # Warning in Eigen gcc 8.3
-                -Wno-parentheses # Warning in Eigen gcc 8.3
-                )
+      set(COMMON_FLAGS
+          ${COMMON_FLAGS}
+          -Wno-format-truncation # Warning in boost gcc 8.2
+          -Wno-error=parentheses # Warning in boost gcc 8.2
+          -Wno-error=catch-value # Warning in boost gcc 8.2
+          -Wno-error=nonnull-compare # Warning in boost gcc 8.2
+          -Wno-error=address # Warning in boost gcc 8.2
+          -Wno-ignored-qualifiers # Warning in boost gcc 8.2
+          -Wno-ignored-attributes # Warning in Eigen gcc 8.3
+          -Wno-parentheses # Warning in Eigen gcc 8.3
+      )
     endif()
-endif(NOT APPLE)
-
-set(GPU_COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=literal-suffix
-    -Wno-error=unused-local-typedefs
-    -Wno-error=unused-function  # Warnings in Numpy Header.
-    -Wno-error=array-bounds # Warnings in Eigen::array
-)
-if (NOT WITH_NV_JETSON AND NOT WITH_ARM AND NOT WITH_SW AND NOT WITH_MIPS)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
-endif()
+  endif(NOT APPLE)
+
+  set(GPU_COMMON_FLAGS
+      -fPIC
+      -fno-omit-frame-pointer
+      -Wnon-virtual-dtor
+      -Wdelete-non-virtual-dtor
+      -Wno-unused-parameter
+      -Wno-unused-function
+      -Wno-error=literal-suffix
+      -Wno-error=unused-local-typedefs
+      -Wno-error=unused-function # Warnings in Numpy Header.
+      -Wno-error=array-bounds # Warnings in Eigen::array
+  )
+  if(NOT WITH_NV_JETSON
+     AND NOT WITH_ARM
+     AND NOT WITH_SW
+     AND NOT WITH_MIPS)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
+  endif()
 endif(NOT WIN32)
 
-if (APPLE)
-    if(WITH_ARM)
-      set (CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build architectures for OSX" FORCE)
-    else(WITH_ARM)
-     set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
-    endif(WITH_ARM)
-    # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
-    set (COMMON_FLAGS -Wno-deprecated-register)
+if(APPLE)
+  if(WITH_ARM)
+    set(CMAKE_OSX_ARCHITECTURES
+        "arm64"
+        CACHE STRING "Build architectures for OSX" FORCE)
+  else(WITH_ARM)
+    set(CMAKE_OSX_ARCHITECTURES
+        "x86_64"
+        CACHE STRING "Build architectures for OSX" FORCE)
+  endif(WITH_ARM)
+  # On Mac OS X the register storage class specifier is deprecated and causes a warning-as-error on clang 10.0
+  set(COMMON_FLAGS -Wno-deprecated-register)
 endif(APPLE)
 
 if(WITH_HETERPS AND WITH_PSLIB)
-    set(COMMON_FLAGS
-        -D_GLIBCXX_USE_CXX11_ABI=0
-        ${COMMON_FLAGS})
+  set(COMMON_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0 ${COMMON_FLAGS})
 
-    set(GPU_COMMON_FLAGS
-        -D_GLIBCXX_USE_CXX11_ABI=0
-        ${GPU_COMMON_FLAGS})
+  set(GPU_COMMON_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0 ${GPU_COMMON_FLAGS})
 endif()
 
 if(LINUX)
-    set(GPU_COMMON_FLAGS
-        -Wall
-        -Wextra
-        -Werror
-        ${GPU_COMMON_FLAGS})
+  set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS})
 endif(LINUX)
 
 foreach(flag ${COMMON_FLAGS})
-    safe_set_cflag(CMAKE_C_FLAGS ${flag})
-    safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
+  safe_set_cflag(CMAKE_C_FLAGS ${flag})
+  safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
 endforeach()
 
 set(SAFE_GPU_COMMON_FLAGS "")
 foreach(flag ${GPU_COMMON_FLAGS})
-    safe_set_nvflag(${flag})
+  safe_set_nvflag(${flag})
 endforeach()
 
 if(WITH_GPU)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
 endif()
 
 if(WITH_ROCM)
-    set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
+  set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
 endif()
 
- # Disable -Werror, otherwise the compile will fail for rocblas_gemm_ex
+# Disable -Werror, otherwise the compile will fail for rocblas_gemm_ex
 if(WITH_ROCM)
-    string (REPLACE "-Werror" "-Wno-error" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    string (REPLACE "-Werror" "-Wno-error" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+  string(REPLACE "-Werror" "-Wno-error" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  string(REPLACE "-Werror" "-Wno-error" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
 endif()
 
 if(WITH_PSCORE OR WITH_PSLIB)
-    string (REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-    string (REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+  string(REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_CXX_FLAGS
+                 ${CMAKE_CXX_FLAGS})
+  string(REPLACE "-Wnon-virtual-dtor" "-Wno-non-virtual-dtor" CMAKE_C_FLAGS
+                 ${CMAKE_C_FLAGS})
 endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 35170b5198dc3..a6a7ab983b9f6 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -13,7 +13,6 @@
 # limitations under the License.
 #
 
-
 # generic.cmake defines CMakes functions that look like Bazel's
 # building rules (https://bazel.build/).
 #
@@ -96,9 +95,11 @@ if(NOT APPLE AND NOT WIN32)
   find_package(Threads REQUIRED)
   link_libraries(${CMAKE_THREAD_LIBS_INIT})
   if(WITH_PSLIB OR WITH_DISTRIBUTE)
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl")
+    set(CMAKE_CXX_LINK_EXECUTABLE
+        "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl")
   else()
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
+    set(CMAKE_CXX_LINK_EXECUTABLE
+        "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
   endif()
 endif()
 
@@ -107,7 +108,8 @@ set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # for building inference libs
 function(find_fluid_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path
+                       ${__target_path})
   string(FIND "${__target_path}" "fluid" pos)
   if(pos GREATER 1)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -121,7 +123,8 @@ set_property(GLOBAL PROPERTY PHI_MODULES "")
 # for building inference libs
 function(find_phi_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path
+                       ${__target_path})
   string(FIND "${__target_path}" "phi" pos)
   if(pos GREATER 1)
     get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
@@ -131,7 +134,7 @@ function(find_phi_modules TARGET_NAME)
 endfunction(find_phi_modules)
 
 function(common_link TARGET_NAME)
-  if (WITH_PROFILER)
+  if(WITH_PROFILER)
     target_link_libraries(${TARGET_NAME} gperftools::profiler)
   endif()
 endfunction()
@@ -141,7 +144,8 @@ endfunction()
 set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
 function(find_fluid_thirdparties TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path
+                       ${__target_path})
   string(FIND "${__target_path}" "third_party" pos)
   if(pos GREATER 1)
     get_property(fluid_ GLOBAL PROPERTY FLUID_THIRD_PARTY)
@@ -162,13 +166,15 @@ function(create_static_lib TARGET_NAME)
     foreach(lib ${libs})
       list(APPEND dummy_list ${lib})
       list(LENGTH dummy_list listlen)
-      if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len}))
+      if((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL
+                                                 ${libs_len}))
         merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list})
         set(dummy_list)
-        list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index})
-        MATH(EXPR dummy_index "${dummy_index}+1")
+        list(APPEND ${TARGET_NAME}_dummy_list
+             ${TARGET_NAME}_dummy_${dummy_index})
+        math(EXPR dummy_index "${dummy_index}+1")
       endif()
-      MATH(EXPR dummy_offset "${dummy_offset}+1")
+      math(EXPR dummy_offset "${dummy_offset}+1")
     endforeach()
     merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list})
   else()
@@ -180,7 +186,8 @@ function(create_dummy_static_lib TARGET_NAME)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs LIBS DEPS LIMIT)
-  cmake_parse_arguments(merge "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(merge "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
   list(REMOVE_DUPLICATES merge_LIBS)
   set(index 1)
@@ -191,17 +198,18 @@ function(create_dummy_static_lib TARGET_NAME)
   foreach(lib ${merge_LIBS})
     list(APPEND merge_list ${lib})
     list(LENGTH merge_list listlen)
-    if ((${listlen} GREATER ${limit}) OR (${offset} EQUAL ${libs_len}))
-      message("Merge and generate static library: ${TARGET_NAME}_static_${index}")
+    if((${listlen} GREATER ${limit}) OR (${offset} EQUAL ${libs_len}))
+      message(
+        "Merge and generate static library: ${TARGET_NAME}_static_${index}")
       merge_static_libs(${TARGET_NAME}_static_${index} ${merge_list})
       if(merge_DEPS)
         target_link_libraries(${TARGET_NAME}_static_${index} ${merge_DEPS})
       endif()
       set(merge_list)
       list(APPEND ${TARGET_NAME}_list ${TARGET_NAME}_static_${index})
-      MATH(EXPR index "${index}+1")
+      math(EXPR index "${index}+1")
     endif()
-    MATH(EXPR offset "${offset}+1")
+    math(EXPR offset "${offset}+1")
   endforeach()
   cc_library(${TARGET_NAME} DEPS ${${TARGET_NAME}_list})
 endfunction()
@@ -226,12 +234,14 @@ function(merge_static_libs TARGET_NAME)
   # Make the generated dummy source file depended on all static input
   # libs. If input lib changes,the source file is touched
   # which causes the desired effect (relink).
-  add_custom_command(OUTPUT ${target_SRCS}
+  add_custom_command(
+    OUTPUT ${target_SRCS}
     COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
     DEPENDS ${libs})
-  
-    # Generate dummy staic lib
-  generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:merge_static_libs")
+
+  # Generate dummy static lib
+  generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS}
+                            GENERATOR "generic.cmake:merge_static_libs")
   target_link_libraries(${TARGET_NAME} ${libs_deps})
 
   # OSX: use 'libtool' to merge archives
@@ -240,29 +250,41 @@ function(merge_static_libs TARGET_NAME)
       # Get the file names of the libraries to be merged
       set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
     endforeach()
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    add_custom_command(
+      TARGET ${TARGET_NAME}
+      POST_BUILD
       COMMENT "Merge and generate static lib: lib${TARGET_NAME}.a"
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
-      )
+      COMMAND /usr/bin/libtool -static -o
+              "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
   endif()
 
   # LINUX: use "ar" to extract objects and re-add to a common lib
   if(LINUX)
-    set(mri_file ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.mri CACHE INTERNAL "phi_static.mri file")
-    get_property(ABS_MERGE_LIB_PATH TARGET ${TARGET_NAME} PROPERTY LOCATION)
+    set(mri_file
+        ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.mri
+        CACHE INTERNAL "phi_static.mri file")
+    get_property(
+      ABS_MERGE_LIB_PATH
+      TARGET ${TARGET_NAME}
+      PROPERTY LOCATION)
     file(WRITE ${mri_file} "create ${ABS_MERGE_LIB_PATH}\n")
 
     foreach(lib ${libs})
-      get_property(ABS_LIB_PATH TARGET ${lib} PROPERTY LOCATION)
+      get_property(
+        ABS_LIB_PATH
+        TARGET ${lib}
+        PROPERTY LOCATION)
       file(APPEND ${mri_file} "addlib ${ABS_LIB_PATH}\n")
     endforeach()
     file(APPEND ${mri_file} "save\nend\n")
 
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMENT "Merge and generate static lib: lib${TARGET_NAME}.a"
-        COMMAND ${CMAKE_AR} -M < ${mri_file}
-        COMMAND ${CMAKE_RANLIB} "$<TARGET_FILE:${TARGET_NAME}>")
+    add_custom_command(
+      TARGET ${TARGET_NAME}
+      POST_BUILD
+      COMMENT "Merge and generate static lib: lib${TARGET_NAME}.a"
+      COMMAND ${CMAKE_AR} -M < ${mri_file}
+      COMMAND ${CMAKE_RANLIB} "$<TARGET_FILE:${TARGET_NAME}>")
   endif()
 
   # Windows do not support gcc/nvcc combined compiling. Use msvc 'lib.exe' to merge libs.
@@ -271,60 +293,70 @@ function(merge_static_libs TARGET_NAME)
       set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
     endforeach()
-    # msvc compiler will put libarary in directory of "/Release/xxxlib" by default
+    # msvc compiler will put library in directory of "/Release/xxxlib" by default
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    add_custom_command(
+      TARGET ${TARGET_NAME}
+      POST_BUILD
       COMMENT "Merge and generate static lib: lib${TARGET_NAME}.lib"
       COMMAND cmake -E make_directory $<TARGET_FILE_DIR:${TARGET_NAME}>
-      COMMAND lib /OUT:$<TARGET_FILE:${TARGET_NAME}> ${libfiles}
-      )
+      COMMAND lib /OUT:$<TARGET_FILE:${TARGET_NAME}> ${libfiles})
   endif()
 endfunction()
 
 function(check_coverage_opt TARGET_NAME SRCS)
   if(WITH_COVERAGE AND WITH_INCREMENTAL_COVERAGE)
     # if pybind.cc add '-g -O0 -fprofile-arcs -ftest-coverage' only, some testcase will fail.
-    if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "" AND (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc")))
-      if (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL ""))
+    if("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL ""
+       AND (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" MATCHES "pybind.cc")))
+      if(NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL ""))
         string(REPLACE "," ";" CC_FILE_LIST $ENV{PADDLE_GIT_DIFF_CC_FILE})
         set(use_coverage_opt FALSE)
-        FOREACH(cc_file ${CC_FILE_LIST})
+        foreach(cc_file ${CC_FILE_LIST})
           if("${SRCS};" MATCHES "${cc_file}")
             set(use_coverage_opt TRUE)
             break()
           endif()
-        ENDFOREACH(cc_file)
+        endforeach(cc_file)
 
-        if (use_coverage_opt)
+        if(use_coverage_opt)
           message(STATUS "cc changed, add coverage opt for ${TARGET_NAME}")
-          target_compile_options(${TARGET_NAME} PRIVATE -g -O0 -fprofile-arcs -ftest-coverage)
+          target_compile_options(${TARGET_NAME} PRIVATE -g -O0 -fprofile-arcs
+                                                        -ftest-coverage)
           target_link_libraries(${TARGET_NAME} -fprofile-arcs)
-          get_target_property(WH_TARGET_COMPILE_OPTIONS ${TARGET_NAME} COMPILE_OPTIONS)
-          message(STATUS "property for ${TARGET_NAME} is ${WH_TARGET_COMPILE_OPTIONS}")
+          get_target_property(WH_TARGET_COMPILE_OPTIONS ${TARGET_NAME}
+                              COMPILE_OPTIONS)
+          message(
+            STATUS "property for ${TARGET_NAME} is ${WH_TARGET_COMPILE_OPTIONS}"
+          )
         endif()
       endif()
     endif()
   endif()
 endfunction(check_coverage_opt)
 
-
 function(cc_library TARGET_NAME)
   set(options STATIC static SHARED shared INTERFACE interface)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
   if(WIN32)
-      # add libxxx.lib prefix in windows
-      set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
+    # add libxxx.lib prefix in windows
+    set(${TARGET_NAME}_LIB_NAME
+        "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}"
+        CACHE STRING "output library name for target ${TARGET_NAME}")
   endif(WIN32)
   if(cc_library_SRCS)
-      if(cc_library_SHARED OR cc_library_shared) # build *.so
-        add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
-      elseif(cc_library_INTERFACE OR cc_library_interface)
-        generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:cc_library")
-      else()
-        add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
-        find_fluid_modules(${TARGET_NAME})
-        find_phi_modules(${TARGET_NAME})
-      endif()
+    if(cc_library_SHARED OR cc_library_shared) # build *.so
+      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
+    elseif(cc_library_INTERFACE OR cc_library_interface)
+      generate_dummy_static_lib(
+        LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR
+        "generic.cmake:cc_library")
+    else()
+      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+      find_fluid_modules(${TARGET_NAME})
+      find_phi_modules(${TARGET_NAME})
+    endif()
     if(cc_library_DEPS)
       # Don't need link libwarpctc.so
       if("${cc_library_DEPS};" MATCHES "warpctc;")
@@ -341,7 +373,8 @@ function(cc_library TARGET_NAME)
         if(WIN32)
           target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
         else(WIN32)
-          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+          target_link_libraries(${TARGET_NAME}
+                                "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
         endif(WIN32)
       endif()
       # remove link to python, see notes at:
@@ -373,21 +406,26 @@ function(cc_library TARGET_NAME)
     if(cc_library_DEPS)
       list(REMOVE_DUPLICATES cc_library_DEPS)
 
-      generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:cc_library")
+      generate_dummy_static_lib(
+        LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR
+        "generic.cmake:cc_library")
 
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     else()
-      message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).")
+      message(
+        FATAL_ERROR
+          "Please specify source files or libraries in cc_library(${TARGET_NAME} ...)."
+      )
     endif()
   endif(cc_library_SRCS)
 endfunction(cc_library)
 
-
 function(cc_binary TARGET_NAME)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
   add_executable(${TARGET_NAME} ${cc_binary_SRCS})
   if(cc_binary_DEPS)
     target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
@@ -408,7 +446,8 @@ function(cc_test_build TARGET_NAME)
   if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
     if(WIN32)
       if("${cc_test_DEPS};" MATCHES "python;")
@@ -417,8 +456,25 @@ function(cc_test_build TARGET_NAME)
       endif()
     endif(WIN32)
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    target_link_libraries(
+      ${TARGET_NAME}
+      ${cc_test_DEPS}
+      ${os_dependency_modules}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog)
+    add_dependencies(
+      ${TARGET_NAME}
+      ${cc_test_DEPS}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog)
     common_link(${TARGET_NAME})
     if(WITH_ROCM)
       target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
@@ -431,74 +487,80 @@ function(cc_test_run TARGET_NAME)
   if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     set(oneValueArgs "")
     set(multiValueArgs COMMAND ARGS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    add_test(NAME ${TARGET_NAME}
-	    COMMAND ${cc_test_COMMAND} ${cc_test_ARGS}
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND ${cc_test_COMMAND} ${cc_test_ARGS}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cpu_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cudnn_deterministic=true)
     # No unit test should exceed 2 minutes.
-    if (WIN32)
-        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
+    if(WIN32)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     endif()
-    if (APPLE)
-        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20)
+    if(APPLE)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20)
     endif()
   elseif(WITH_TESTING AND NOT TEST ${TARGET_NAME})
-    add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.)
+    add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip
+                                         ${TARGET_NAME}.)
   endif()
 endfunction()
 
 function(cc_test TARGET_NAME)
-    # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
-    # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
+  # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
+  # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
   # other than *.py are modified.
   if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_test_build(${TARGET_NAME}
-	    SRCS ${cc_test_SRCS}
-	    DEPS ${cc_test_DEPS})
+    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS})
     # we dont test hcom op, because it need complex configuration
     # with more than one machine
-    if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test"         OR
-            "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test"     OR
-            "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test"     OR
-            "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test"     OR
-            "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test"         OR
-            "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test"             OR
-            "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test"        OR
-            "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
-      cc_test_run(${TARGET_NAME}
-        COMMAND ${TARGET_NAME}
-        ARGS ${cc_test_ARGS})
+    if(NOT
+       ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test"
+        OR "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test"
+        OR "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test"
+        OR "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test"
+        OR "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test"
+        OR "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test"
+        OR "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test"
+        OR "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
+      cc_test_run(${TARGET_NAME} COMMAND ${TARGET_NAME} ARGS ${cc_test_ARGS})
     endif()
   elseif(WITH_TESTING AND NOT TEST ${TARGET_NAME})
-    add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.)
+    add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip
+                                         ${TARGET_NAME}.)
   endif()
 endfunction(cc_test)
 
 function(nv_library TARGET_NAME)
-  if (WITH_GPU)
+  if(WITH_GPU)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     if(nv_library_SRCS)
       # Attention:
       # 1. cuda_add_library is deprecated after cmake v3.10, use add_library for CUDA please.
       # 2. cuda_add_library does not support ccache.
       # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html
-      if (nv_library_SHARED OR nv_library_shared) # build *.so
+      if(nv_library_SHARED OR nv_library_shared) # build *.so
         add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
       else()
         add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
         find_phi_modules(${TARGET_NAME})
       endif()
-      if (nv_library_DEPS)
+      if(nv_library_DEPS)
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
         target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
       endif()
@@ -506,13 +568,16 @@ function(nv_library TARGET_NAME)
       foreach(source_file ${nv_library_SRCS})
         string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
         if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS
+               ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
     else(nv_library_SRCS)
-      if (nv_library_DEPS)
+      if(nv_library_DEPS)
         list(REMOVE_DUPLICATES nv_library_DEPS)
-        generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:nv_library")
+        generate_dummy_static_lib(
+          LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR
+          "generic.cmake:nv_library")
 
         target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -520,76 +585,112 @@ function(nv_library TARGET_NAME)
         message(FATAL "Please specify source file or library in nv_library.")
       endif()
     endif(nv_library_SRCS)
-    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
-      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    if((CUDA_VERSION GREATER 9.2)
+       AND (CUDA_VERSION LESS 11.0)
+       AND (MSVC_VERSION LESS 1910))
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS
+                                                      ${WIN_PROPS})
     endif()
   endif()
 endfunction(nv_library)
 
 function(nv_binary TARGET_NAME)
-  if (WITH_GPU)
+  if(WITH_GPU)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${nv_binary_SRCS})
     if(nv_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
       common_link(${TARGET_NAME})
     endif()
-    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
-      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    if((CUDA_VERSION GREATER 9.2)
+       AND (CUDA_VERSION LESS 11.0)
+       AND (MSVC_VERSION LESS 1910))
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS
+                                                      ${WIN_PROPS})
     endif()
   endif()
 endfunction(nv_binary)
 
 function(nv_test TARGET_NAME)
-    # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
-    # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
+  # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
+  # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
   # other than *.py are modified.
-  if (WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+  if(WITH_GPU
+     AND WITH_TESTING
+     AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     # Attention:
-    # 1. cuda_add_executable is deprecated after cmake v3.10, use cuda_add_executable for CUDA please.
+    # 1. cuda_add_executable is deprecated after cmake v3.10, use add_executable for CUDA please.
     # 2. cuda_add_executable does not support ccache.
     # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html
     add_executable(${TARGET_NAME} ${nv_test_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    target_link_libraries(
+      ${TARGET_NAME}
+      ${nv_test_DEPS}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog
+      ${os_dependency_modules})
+    add_dependencies(
+      ${TARGET_NAME}
+      ${nv_test_DEPS}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
-      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cpu_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cudnn_deterministic=true)
+    if((CUDA_VERSION GREATER 9.2)
+       AND (CUDA_VERSION LESS 11.0)
+       AND (MSVC_VERSION LESS 1910))
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS
+                                                      ${WIN_PROPS})
     endif()
   endif()
 endfunction(nv_test)
 
 function(hip_library TARGET_NAME)
-  if (WITH_ROCM)
+  if(WITH_ROCM)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     if(hip_library_SRCS)
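-      # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found
+      # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is required if no .cu files found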
-      if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels"))
-       set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+      if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators"
+              OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels"))
+        set_source_files_properties(${hip_library_SRCS}
+                                    PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
       endif()
-      if (hip_library_SHARED OR hip_library_shared) # build *.so
+      if(hip_library_SHARED OR hip_library_shared) # build *.so
         hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS})
       else()
         hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
         find_phi_modules(${TARGET_NAME})
       endif()
-      if (hip_library_DEPS)
+      if(hip_library_DEPS)
         add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
         target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
       endif()
@@ -597,13 +698,16 @@ function(hip_library TARGET_NAME)
       foreach(source_file ${hip_library_SRCS})
         string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
         if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND hip_library_HEADERS
+               ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
     else(hip_library_SRCS)
-      if (hip_library_DEPS)
+      if(hip_library_DEPS)
         list(REMOVE_DUPLICATES hip_library_DEPS)
-        generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:hip_library")
+        generate_dummy_static_lib(
+          LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR
+          "generic.cmake:hip_library")
 
         target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
         add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
@@ -615,11 +719,12 @@ function(hip_library TARGET_NAME)
 endfunction(hip_library)
 
 function(hip_binary TARGET_NAME)
-  if (WITH_ROCM)
+  if(WITH_ROCM)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
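-    # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is requried for .cc files
+    # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is required for .cc files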
     hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
     if(hip_binary_DEPS)
@@ -634,42 +739,73 @@ function(hip_test TARGET_NAME)
   # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
   # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
   # other than *.py are modified.
-  if (WITH_ROCM AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+  if(WITH_ROCM
+     AND WITH_TESTING
+     AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
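-    # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is requried for .cc files
+    # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is required for .cc files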
     hip_add_executable(${TARGET_NAME} ${hip_test_SRCS})
     # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE
     target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt)
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
-    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    target_link_libraries(
+      ${TARGET_NAME}
+      ${hip_test_DEPS}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog
+      ${os_dependency_modules})
+    add_dependencies(
+      ${TARGET_NAME}
+      ${hip_test_DEPS}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH")
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cpu_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cudnn_deterministic=true)
+    set_property(
+      TEST ${TARGET_NAME}
+      PROPERTY
+        ENVIRONMENT
+        "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH"
+    )
   endif()
 endfunction(hip_test)
 
 function(xpu_library TARGET_NAME)
-  if (WITH_XPU_KP)
+  if(WITH_XPU_KP)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
 
     if(xpu_library_SRCS)
-      if (xpu_library_SHARED OR xpu_library_shared) # build *.so
-        message(FATAL_ERROR "XPU kernel currently does not support dynamic links")
+      if(xpu_library_SHARED OR xpu_library_shared) # build *.so
+        message(
+          FATAL_ERROR "XPU kernel currently does not support dynamic links")
       else()
-        xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
+        xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS
+                        ${xpu_library_DEPS})
         find_fluid_modules(${TARGET_NAME})
         find_phi_modules(${TARGET_NAME})
       endif()
-      if (xpu_library_DEPS)
+      if(xpu_library_DEPS)
         add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
         target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
       endif()
@@ -677,13 +813,16 @@ function(xpu_library TARGET_NAME)
       foreach(source_file ${xpu_library_SRCS})
         string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
         if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND xpu_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND xpu_library_HEADERS
+               ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
     else(xpu_library_SRCS)
-      if (xpu_library_DEPS)
+      if(xpu_library_DEPS)
         list(REMOVE_DUPLICATES xpu_library_DEPS)
-        generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:xpu_library")
+        generate_dummy_static_lib(
+          LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR
+          "generic.cmake:xpu_library")
         target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
         add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
       else()
@@ -694,11 +833,12 @@ function(xpu_library TARGET_NAME)
 endfunction(xpu_library)
 
 function(xpu_binary TARGET_NAME)
-  if (WITH_XPU_KP)
+  if(WITH_XPU_KP)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${xpu_binary_SRCS})
     if(xpu_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${xpu_binary_DEPS})
@@ -712,21 +852,44 @@ function(xpu_test TARGET_NAME)
   # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
   # and execution of test in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
   # other than *.py are modified.
-  if (WITH_XPU_KP AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+  if(WITH_XPU_KP
+     AND WITH_TESTING
+     AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${xpu_test_SRCS})
     # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE
     target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt)
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
-    add_dependencies(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    target_link_libraries(
+      ${TARGET_NAME}
+      ${xpu_test_DEPS}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog
+      ${os_dependency_modules})
+    add_dependencies(
+      ${TARGET_NAME}
+      ${xpu_test_DEPS}
+      paddle_gtest_main
+      lod_tensor
+      memory
+      gtest
+      gflags
+      glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cpu_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
+                                              FLAGS_cudnn_deterministic=true)
   endif()
 endfunction(xpu_test)
 
@@ -734,34 +897,36 @@ function(go_library TARGET_NAME)
   set(options STATIC static SHARED shared)
   set(oneValueArgs "")
   set(multiValueArgs DEPS)
-  cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(go_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-  if (go_library_SHARED OR go_library_shared)
+  if(go_library_SHARED OR go_library_shared)
     set(BUILD_MODE "-buildmode=c-shared")
-    set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
+    set(${TARGET_NAME}_LIB_NAME
+        "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}"
+        CACHE STRING "output library name for target ${TARGET_NAME}")
   else()
     set(BUILD_MODE "-buildmode=c-archive")
-    set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
+    set(${TARGET_NAME}_LIB_NAME
+        "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}"
+        CACHE STRING "output library name for target ${TARGET_NAME}")
   endif()
 
   set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
 
   # This custom command will always run since it depends on a not
   # existing file.
-  add_custom_command(
-    OUTPUT dummy_rebulid_${TARGET_NAME}
-    COMMAND cmake -E touch ${dummyfile}
-    )
+  add_custom_command(OUTPUT dummy_rebulid_${TARGET_NAME} COMMAND cmake -E touch
+                                                                 ${dummyfile})
   # Create a custom target that depends on the custom command output
   # file, so the custom command can be referenced as a dependency by
   # `add_dependencies`.
-  add_custom_target(rebuild_${TARGET_NAME}
-    DEPENDS dummy_rebulid_${TARGET_NAME}
-    )
+  add_custom_target(rebuild_${TARGET_NAME} DEPENDS dummy_rebulid_${TARGET_NAME})
 
   # Add dummy code to support `make target_name` under Terminal Command
-  file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
-  if (go_library_SHARED OR go_library_shared)
+  file(WRITE ${dummyfile}
+       "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
+  if(go_library_SHARED OR go_library_shared)
     add_library(${TARGET_NAME} SHARED ${dummyfile})
   else()
     add_library(${TARGET_NAME} STATIC ${dummyfile})
@@ -777,17 +942,26 @@ function(go_library TARGET_NAME)
   # rebuild will always happen.
   add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME})
 
-  set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}")
+  set(${TARGET_NAME}_LIB_PATH
+      "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}"
+      CACHE STRING "output library path for target ${TARGET_NAME}")
 
-  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
-  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  file(
+    GLOB GO_SOURCE
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "*.go")
+  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR
+                 ${CMAKE_CURRENT_SOURCE_DIR})
 
-  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+  add_custom_command(
+    TARGET ${TARGET_NAME}
+    POST_BUILD
     COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
     # Golang build source code
-    COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${${TARGET_NAME}_LIB_PATH}"
-    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
+    COMMAND
+      GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o
+      "${${TARGET_NAME}_LIB_PATH}"
+      "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
     # must run under GOPATH
     WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
   add_dependencies(${TARGET_NAME} go_vendor)
@@ -797,15 +971,21 @@ function(go_binary TARGET_NAME)
   set(options OPTIONAL)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR
+                 ${CMAKE_CURRENT_SOURCE_DIR})
 
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
+  add_custom_command(
+    OUTPUT ${TARGET_NAME}_timestamp
+    COMMAND
+      env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o
+      "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
+      "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
     WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
-  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
+  add_custom_target(
+    ${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp
+                               ${go_binary_DEPS})
   install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
 
   check_coverage_opt(${TARGET_NAME} ${go_binary_SRCS})
@@ -816,15 +996,21 @@ function(go_test TARGET_NAME)
   set(options OPTIONAL)
   set(oneValueArgs "")
   set(multiValueArgs DEPS)
-  cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  cmake_parse_arguments(go_test "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR
+                 ${CMAKE_CURRENT_SOURCE_DIR})
   add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
-  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
-    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
+  add_custom_command(
+    TARGET ${TARGET_NAME}
+    POST_BUILD
+    COMMAND
+      env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race -c -o
+      "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
+      ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
     WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
-  add_test(NAME ${TARGET_NAME}
+  add_test(
+    NAME ${TARGET_NAME}
     COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endfunction(go_test)
@@ -835,7 +1021,9 @@ endfunction(go_test)
 
 function(paddle_protobuf_generate_cpp SRCS HDRS)
   if(NOT ARGN)
-    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
+    message(
+      SEND_ERROR
+        "Error: paddle_protobuf_generate_cpp() called without any proto files")
     return()
   endif()
 
@@ -852,40 +1040,45 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
     list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
 
     add_custom_command(
-      OUTPUT "${_protobuf_protoc_src}"
-             "${_protobuf_protoc_hdr}"
-
+      OUTPUT "${_protobuf_protoc_src}" "${_protobuf_protoc_hdr}"
       COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-      -I${CMAKE_CURRENT_SOURCE_DIR}
-      --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_CURRENT_SOURCE_DIR}
+              --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
       # Set `EXTERN_PROTOBUF_DEPEND` only if need to compile `protoc.exe`.
       DEPENDS ${ABS_FIL} ${EXTERN_PROTOBUF_DEPEND}
       COMMENT "Running C++ protocol buffer compiler on ${FIL}"
-      VERBATIM )
+      VERBATIM)
   endforeach()
 
   set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
-  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
-  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
+  set(${SRCS}
+      ${${SRCS}}
+      PARENT_SCOPE)
+  set(${HDRS}
+      ${${HDRS}}
+      PARENT_SCOPE)
 endfunction()
 
-
 function(proto_library TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
   set(proto_srcs)
   set(proto_hdrs)
   paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
-  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+  cc_library(
+    ${TARGET_NAME}
+    SRCS ${proto_srcs}
+    DEPS ${proto_library_DEPS} protobuf)
   add_dependencies(extern_xxhash ${TARGET_NAME})
 endfunction()
 
 function(py_proto_compile TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS)
-  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
   set(py_srcs)
   protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs} protobuf)
@@ -896,29 +1089,37 @@ function(py_test TARGET_NAME)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS ENVS)
-    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    if(WITH_COVERAGE AND NOT (WITH_INCREMENTAL_COVERAGE AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
-      add_test(NAME ${TARGET_NAME}
-              COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-              FLAGS_cpu_deterministic=true
-              PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
-              COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-              ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS}
-              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+
+    if(WITH_COVERAGE AND NOT (WITH_INCREMENTAL_COVERAGE
+                              AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
+      add_test(
+        NAME ${TARGET_NAME}
+        COMMAND
+          ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true
+          FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true
+          PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+          ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS}
+          ${py_test_ARGS}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     else()
-      add_test(NAME ${TARGET_NAME}
-               COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-               FLAGS_cpu_deterministic=true ${py_test_ENVS}
-               ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-               WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+      add_test(
+        NAME ${TARGET_NAME}
+        COMMAND
+          ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true
+          FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true
+          ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS}
+          ${py_test_ARGS}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()
 
-    if (WIN32)
-        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
+    if(WIN32)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     endif()
-    if (APPLE)
-        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20)
+    if(APPLE)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20)
     endif()
 
   endif()
@@ -936,7 +1137,8 @@ function(grpc_library TARGET_NAME)
   set(oneValueArgs PROTO)
   set(multiValueArgs SRCS DEPS)
   set(options "")
-  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
   message(STATUS "generating grpc ${grpc_library_PROTO}")
 
@@ -953,36 +1155,43 @@ function(grpc_library TARGET_NAME)
   cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
 
   add_custom_command(
-          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
-          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
-          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
-          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-          ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
-          "${ABS_PROTO}"
-          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+    OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+    COMMAND
+      ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --grpc_out
+      "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+      --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+    COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --cpp_out
+            "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" "${ABS_PROTO}"
+    DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
 
   # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
   # as compiler warnings instead of error. Should try remove the warnings also.
   set_source_files_properties(
     ${grpc_grpc_srcs}
     PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+      COMPILE_FLAGS
+      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
+  )
   cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
 
   set_source_files_properties(
     ${grpc_library_SRCS}
     PROPERTIES
-    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+      COMPILE_FLAGS
+      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
+  )
+  cc_library(
+    "${TARGET_NAME}"
+    SRCS "${grpc_library_SRCS}"
+    DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
 endfunction()
 
-
 function(brpc_library TARGET_NAME)
   set(oneValueArgs PROTO)
   set(multiValueArgs SRCS DEPS)
   set(options "")
-  cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
   message(STATUS "generating brpc ${brpc_library_PROTO}")
 
@@ -992,7 +1201,10 @@ function(brpc_library TARGET_NAME)
 
   paddle_protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
   cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
-  cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
+  cc_library(
+    "${TARGET_NAME}"
+    SRCS "${brpc_library_SRCS}"
+    DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
 endfunction()
 
 # copy_if_different from src_file to dst_file At the beginning of the build.
@@ -1000,11 +1212,11 @@ function(copy_if_different src_file dst_file)
   get_filename_component(FILE_NAME ${dst_file} NAME_WE)
 
   # this is a dummy target for custom command, should always be run firstly to update ${dst_file}
-  add_custom_target(copy_${FILE_NAME}_command ALL
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_file} ${dst_file}
-      COMMENT "copy_if_different ${dst_file}"
-      VERBATIM
-  )
+  add_custom_target(
+    copy_${FILE_NAME}_command ALL
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_file} ${dst_file}
+    COMMENT "copy_if_different ${dst_file}"
+    VERBATIM)
 
   add_dependencies(extern_glog copy_${FILE_NAME}_command)
 endfunction()
@@ -1019,7 +1231,8 @@ function(generate_dummy_static_lib)
   set(options "")
   set(oneValueArgs LIB_NAME FILE_PATH GENERATOR CONTENT)
   set(multiValueArgs "")
-  cmake_parse_arguments(dummy "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(dummy "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
   if(NOT dummy_LIB_NAME)
     message(FATAL_ERROR "You must provide a static lib name.")
   endif()
@@ -1033,45 +1246,55 @@ function(generate_dummy_static_lib)
     set(dummy_CONTENT "${dummy_LIB_NAME}_dummy.c for lib ${dummy_LIB_NAME}")
   endif()
 
-  configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH} @ONLY)
+  configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH}
+                 @ONLY)
   add_library(${dummy_LIB_NAME} STATIC ${dummy_FILE_PATH})
 endfunction()
 
 function(math_library TARGET)
-    # math_library is a function to create math library.
-    # The interface is the same as cc_library.
-    # But it handle split GPU/CPU code and link some common library.
-    set(cc_srcs)
-    set(cu_srcs)
-    set(hip_srcs)
-    set(math_common_deps device_context framework_proto enforce)
-    if (WITH_GPU)
-        if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)	
-            list(APPEND math_common_deps cub)
-	else()
-            list(APPEND math_common_deps)
-	endif()
+  # math_library is a function to create math library.
+  # The interface is the same as cc_library.
+  # But it handles split GPU/CPU code and links some common libraries.
+  set(cc_srcs)
+  set(cu_srcs)
+  set(hip_srcs)
+  set(math_common_deps device_context framework_proto enforce)
+  if(WITH_GPU)
+    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      list(APPEND math_common_deps cub)
+    else()
+      list(APPEND math_common_deps)
     endif()
-    set(multiValueArgs DEPS)
-    cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
+  endif()
+  set(multiValueArgs DEPS)
+  cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-        list(APPEND cc_srcs ${TARGET}.cc)
-    endif()
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
-        list(APPEND cu_srcs ${TARGET}.cu)
-    endif()
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
-        list(APPEND cu_srcs ${TARGET}.cu.cc)
-    endif()
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+    list(APPEND cc_srcs ${TARGET}.cc)
+  endif()
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+    list(APPEND cu_srcs ${TARGET}.cu)
+  endif()
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+    list(APPEND cu_srcs ${TARGET}.cu.cc)
+  endif()
 
-    list(LENGTH cc_srcs cc_srcs_len)
-    if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-    elseif (WITH_ROCM)
-        hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-    elseif(${cc_srcs_len} GREATER 0)
-        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-    endif()
+  list(LENGTH cc_srcs cc_srcs_len)
+  if(WITH_GPU)
+    nv_library(
+      ${TARGET}
+      SRCS ${cc_srcs} ${cu_srcs}
+      DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif(WITH_ROCM)
+    hip_library(
+      ${TARGET}
+      SRCS ${cc_srcs} ${cu_srcs}
+      DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif(${cc_srcs_len} GREATER 0)
+    cc_library(
+      ${TARGET}
+      SRCS ${cc_srcs}
+      DEPS ${math_library_DEPS} ${math_common_deps})
+  endif()
 endfunction()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 14cb9e6f6be5a..3514882c944de 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -1,15 +1,27 @@
 if(NOT WITH_ROCM)
-    return()
+  return()
 endif()
 
 if(NOT DEFINED ENV{ROCM_PATH})
-    set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
-    set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
-    set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed")
+  set(ROCM_PATH
+      "/opt/rocm"
+      CACHE PATH "Path to which ROCm has been installed")
+  set(HIP_PATH
+      ${ROCM_PATH}/hip
+      CACHE PATH "Path to which HIP has been installed")
+  set(HIP_CLANG_PATH
+      ${ROCM_PATH}/llvm/bin
+      CACHE PATH "Path to which clang has been installed")
 else()
-    set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
-    set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed")
-    set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed")
+  set(ROCM_PATH
+      $ENV{ROCM_PATH}
+      CACHE PATH "Path to which ROCm has been installed")
+  set(HIP_PATH
+      ${ROCM_PATH}/hip
+      CACHE PATH "Path to which HIP has been installed")
+  set(HIP_CLANG_PATH
+      ${ROCM_PATH}/llvm/bin
+      CACHE PATH "Path to which clang has been installed")
 endif()
 set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
 
@@ -18,30 +30,39 @@ include_directories(${ROCM_PATH}/include)
 message(STATUS "HIP version: ${HIP_VERSION}")
 message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")
 
-macro(find_hip_version hip_header_file) 
-    file(READ ${hip_header_file} HIP_VERSION_FILE_CONTENTS)
+macro(find_hip_version hip_header_file)
+  file(READ ${hip_header_file} HIP_VERSION_FILE_CONTENTS)
 
-    string(REGEX MATCH "define HIP_VERSION_MAJOR +([0-9]+)" HIP_MAJOR_VERSION
-        "${HIP_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define HIP_VERSION_MAJOR +([0-9]+)" "\\1"
-        HIP_MAJOR_VERSION "${HIP_MAJOR_VERSION}")
-    string(REGEX MATCH "define HIP_VERSION_MINOR +([0-9]+)" HIP_MINOR_VERSION
-        "${HIP_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define HIP_VERSION_MINOR +([0-9]+)" "\\1"
-        HIP_MINOR_VERSION "${HIP_MINOR_VERSION}")
-    string(REGEX MATCH "define HIP_VERSION_PATCH +([0-9]+)" HIP_PATCH_VERSION
-        "${HIP_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define HIP_VERSION_PATCH +([0-9]+)" "\\1"
-        HIP_PATCH_VERSION "${HIP_PATCH_VERSION}")
+  string(REGEX MATCH "define HIP_VERSION_MAJOR +([0-9]+)" HIP_MAJOR_VERSION
+               "${HIP_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define HIP_VERSION_MAJOR +([0-9]+)" "\\1"
+                       HIP_MAJOR_VERSION "${HIP_MAJOR_VERSION}")
+  string(REGEX MATCH "define HIP_VERSION_MINOR +([0-9]+)" HIP_MINOR_VERSION
+               "${HIP_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define HIP_VERSION_MINOR +([0-9]+)" "\\1"
+                       HIP_MINOR_VERSION "${HIP_MINOR_VERSION}")
+  string(REGEX MATCH "define HIP_VERSION_PATCH +([0-9]+)" HIP_PATCH_VERSION
+               "${HIP_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define HIP_VERSION_PATCH +([0-9]+)" "\\1"
+                       HIP_PATCH_VERSION "${HIP_PATCH_VERSION}")
 
-    if(NOT HIP_MAJOR_VERSION)
-        set(HIP_VERSION "???")
-        message(WARNING "Cannot find HIP version in ${HIP_PATH}/include/hip/hip_version.h")
-    else()
-        math(EXPR HIP_VERSION "${HIP_MAJOR_VERSION} * 10000000 + ${HIP_MINOR_VERSION} * 100000   + ${HIP_PATCH_VERSION}")
-        message(STATUS "Current HIP header is ${HIP_PATH}/include/hip/hip_version.h "
-          "Current HIP version is v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}.${HIP_PATCH_VERSION}. ")
-    endif()
+  if(NOT HIP_MAJOR_VERSION)
+    set(HIP_VERSION "???")
+    message(
+      WARNING "Cannot find HIP version in ${HIP_PATH}/include/hip/hip_version.h"
+    )
+  else()
+    math(
+      EXPR
+      HIP_VERSION
+      "${HIP_MAJOR_VERSION} * 10000000 + ${HIP_MINOR_VERSION} * 100000   + ${HIP_PATCH_VERSION}"
+    )
+    message(
+      STATUS
+        "Current HIP header is ${HIP_PATH}/include/hip/hip_version.h "
+        "Current HIP version is v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}.${HIP_PATCH_VERSION}. "
+    )
+  endif()
 endmacro()
 find_hip_version(${HIP_PATH}/include/hip/hip_version.h)
 
@@ -66,7 +87,8 @@ find_package_and_include(rocfft)
 # set CXX flags for HIP
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
+set(CMAKE_CXX_FLAGS
+    "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
 set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP)
 
 # define HIP_CXX_FLAGS
@@ -103,7 +125,6 @@ list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
 list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
 list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
 
-
 if(HIP_COMPILER STREQUAL clang)
   set(hip_library_name amdhip64)
 else()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index a52047e16167d..bf69ddc8fb49a 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -13,290 +13,366 @@
 # limitations under the License.
 
 # make package for paddle fluid shared and static library
-set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
-  "A path setting paddle shared and static libraries")
+set(PADDLE_INSTALL_DIR
+    "${CMAKE_BINARY_DIR}/paddle_install_dir"
+    CACHE STRING "A path setting paddle shared and static libraries")
+
+set(PADDLE_INFERENCE_INSTALL_DIR
+    "${CMAKE_BINARY_DIR}/paddle_inference_install_dir"
+    CACHE STRING "A path setting paddle inference shared and static libraries")
 
-set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
-  "A path setting paddle inference shared and static libraries")
-  
 # At present, the size of static lib in Windows is very large,
 # so we need to crop the library size.
 if(WIN32)
-    #todo: remove the option 
-    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic."   OFF)
-    if(NOT PYTHON_EXECUTABLE)
-        FIND_PACKAGE(PythonInterp REQUIRED)
-    endif()
+  # TODO: remove the WITH_STATIC_LIB option below
+  option(WITH_STATIC_LIB
+         "Compile demo with static/shared library, default use dynamic." OFF)
+  if(NOT PYTHON_EXECUTABLE)
+    find_package(PythonInterp REQUIRED)
+  endif()
 endif()
 
 set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake)
 function(copy TARGET)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DSTS)
-    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
-    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
-    if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
-        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
-    endif ()
-    math(EXPR len "${copy_lib_SRCS_len} - 1")
-    foreach (index RANGE ${len})
-        list(GET copy_lib_SRCS ${index} src)
-        list(GET copy_lib_DSTS ${index} dst)
-        if (WIN32)   #windows
-            file(TO_NATIVE_PATH ${src} native_src)
-            file(TO_NATIVE_PATH ${dst} native_dst)
-            add_custom_command(TARGET ${TARGET} POST_BUILD
-                    COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst})
-        else (WIN32) #not windows
-            add_custom_command(TARGET ${TARGET} POST_BUILD
-                    COMMAND mkdir -p "${dst}"
-                    COMMAND cp -r "${src}" "${dst}"
-                    COMMENT "copying ${src} -> ${dst}")
-        endif (WIN32) # not windows
-    endforeach ()
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DSTS)
+  cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+  list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+  if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+    message(
+      FATAL_ERROR
+        "${TARGET} source numbers are not equal to destination numbers")
+  endif()
+  math(EXPR len "${copy_lib_SRCS_len} - 1")
+  foreach(index RANGE ${len})
+    list(GET copy_lib_SRCS ${index} src)
+    list(GET copy_lib_DSTS ${index} dst)
+    if(WIN32) #windows
+      file(TO_NATIVE_PATH ${src} native_src)
+      file(TO_NATIVE_PATH ${dst} native_dst)
+      add_custom_command(
+        TARGET ${TARGET}
+        POST_BUILD
+        COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py
+                ${native_src} ${native_dst})
+    else(WIN32) #not windows
+      add_custom_command(
+        TARGET ${TARGET}
+        POST_BUILD
+        COMMAND mkdir -p "${dst}"
+        COMMAND cp -r "${src}" "${dst}"
+        COMMENT "copying ${src} -> ${dst}")
+    endif(WIN32) # not windows
+  endforeach()
 endfunction()
 
-function(copy_part_of_thrid_party TARGET DST) 
-    if(${CBLAS_PROVIDER} STREQUAL MKLML)
-        set(dst_dir "${DST}/third_party/install/mklml")
-        if(WIN32)
-            copy(${TARGET}
-                    SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB}
-                    ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR}
-                    DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib
-                    ${dst_dir}/lib ${dst_dir})
-        else()
-            copy(${TARGET}
-                    SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
-                    DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
-            if(WITH_STRIP)
-                    add_custom_command(TARGET ${TARGET} POST_BUILD
-                            COMMAND strip -s ${dst_dir}/lib/libiomp5.so
-                            COMMAND strip -s ${dst_dir}/lib/libmklml_intel.so
-                            COMMENT "striping libiomp5.so\nstriping libmklml_intel.so")
-            endif()
-        endif()
-    elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
-        set(dst_dir "${DST}/third_party/install/openblas")
-	if(WIN32)
-            copy(${TARGET}
-                    SRCS ${CBLAS_INSTALL_DIR}/lib ${OPENBLAS_SHARED_LIB} ${CBLAS_INSTALL_DIR}/include
-                    DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir})
-	else()
-            copy(${TARGET}
-                    SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
-                    DSTS ${dst_dir} ${dst_dir})
-	endif()
-    endif()
-
-    if(WITH_MKLDNN)
-        set(dst_dir "${DST}/third_party/install/mkldnn")
-        if(WIN32)
-            copy(${TARGET}
-                    SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}  ${MKLDNN_LIB}
-                    DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib)
-        else()
-            copy(${TARGET}
-                    SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
-                    DSTS ${dst_dir} ${dst_dir}/lib)
-            if(WITH_STRIP)
-                    add_custom_command(TARGET ${TARGET} POST_BUILD
-                            COMMAND strip -s ${dst_dir}/lib/libmkldnn.so.0
-                            COMMENT "striping libmkldnn.so.0")
-            endif()
-            add_custom_command(TARGET ${TARGET} POST_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0 ${dst_dir}/lib/libdnnl.so.1
-                    COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0 ${dst_dir}/lib/libdnnl.so.2
-                    COMMENT "Make a symbol link of libmkldnn.so.0")
-        endif()
+function(copy_part_of_thrid_party TARGET DST)
+  if(${CBLAS_PROVIDER} STREQUAL MKLML)
+    set(dst_dir "${DST}/third_party/install/mklml")
+    if(WIN32)
+      copy(
+        ${TARGET}
+        SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB}
+             ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR}
+        DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib
+             ${dst_dir})
+    else()
+      copy(
+        ${TARGET}
+        SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
+        DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
+      if(WITH_STRIP)
+        add_custom_command(
+          TARGET ${TARGET}
+          POST_BUILD
+          COMMAND strip -s ${dst_dir}/lib/libiomp5.so
+          COMMAND strip -s ${dst_dir}/lib/libmklml_intel.so
+          COMMENT "striping libiomp5.so\nstriping libmklml_intel.so")
+      endif()
     endif()
-
-    if (WITH_ONNXRUNTIME)
-        set(dst_dir "${DST}/third_party/install/onnxruntime")
-        copy(${TARGET}
-                SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR}
-                DSTS ${dst_dir} ${dst_dir})
-
-        set(dst_dir "${DST}/third_party/install/paddle2onnx")
-        if(WIN32)
-            copy(${TARGET}
-                SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB}
-                DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib)
-        else()
-            copy(${TARGET}
-                SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB}
-                DSTS ${dst_dir}/include ${dst_dir}/lib)
-        endif()
+  elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
+    set(dst_dir "${DST}/third_party/install/openblas")
+    if(WIN32)
+      copy(
+        ${TARGET}
+        SRCS ${CBLAS_INSTALL_DIR}/lib ${OPENBLAS_SHARED_LIB}
+             ${CBLAS_INSTALL_DIR}/include
+        DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir})
+    else()
+      copy(
+        ${TARGET}
+        SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+        DSTS ${dst_dir} ${dst_dir})
     endif()
-
-    set(dst_dir "${DST}/third_party/install/gflags")
-    copy(${TARGET}
-            SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib)
-
-    set(dst_dir "${DST}/third_party/install/glog")
-    copy(${TARGET}
-            SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib)
-
-    set(dst_dir "${DST}/third_party/install/utf8proc")
-    copy(${TARGET}
-            SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib)
-
-    if (WITH_CRYPTO)
-        set(dst_dir "${DST}/third_party/install/cryptopp")
-        copy(${TARGET}
-            SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib)
+  endif()
+
+  if(WITH_MKLDNN)
+    set(dst_dir "${DST}/third_party/install/mkldnn")
+    if(WIN32)
+      copy(
+        ${TARGET}
+        SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
+        DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib)
+    else()
+      copy(
+        ${TARGET}
+        SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
+        DSTS ${dst_dir} ${dst_dir}/lib)
+      if(WITH_STRIP)
+        add_custom_command(
+          TARGET ${TARGET}
+          POST_BUILD
+          COMMAND strip -s ${dst_dir}/lib/libmkldnn.so.0
+          COMMENT "striping libmkldnn.so.0")
+      endif()
+      add_custom_command(
+        TARGET ${TARGET}
+        POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0
+                ${dst_dir}/lib/libdnnl.so.1
+        COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0
+                ${dst_dir}/lib/libdnnl.so.2
+        COMMENT "Make a symbol link of libmkldnn.so.0")
     endif()
-
-    set(dst_dir "${DST}/third_party/install/xxhash")
-    copy(${TARGET}
-        SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)    
-
-    if (NOT PROTOBUF_FOUND OR WIN32)
-        set(dst_dir "${DST}/third_party/install/protobuf")
-        copy(${TARGET}
-                SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
-                DSTS ${dst_dir} ${dst_dir}/lib)
-    endif ()
-
-    if (LITE_BINARY_DIR)
-        set(dst_dir "${DST}/third_party/install/lite")
-        copy(${TARGET}
-                SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/*
-                DSTS ${dst_dir})
+  endif()
+
+  if(WITH_ONNXRUNTIME)
+    set(dst_dir "${DST}/third_party/install/onnxruntime")
+    copy(
+      ${TARGET}
+      SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR}
+      DSTS ${dst_dir} ${dst_dir})
+
+    set(dst_dir "${DST}/third_party/install/paddle2onnx")
+    if(WIN32)
+      copy(
+        ${TARGET}
+        SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB}
+             ${PADDLE2ONNX_LIB}
+        DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib)
+    else()
+      copy(
+        ${TARGET}
+        SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB}
+        DSTS ${dst_dir}/include ${dst_dir}/lib)
     endif()
+  endif()
+
+  set(dst_dir "${DST}/third_party/install/gflags")
+  copy(
+    ${TARGET}
+    SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${DST}/third_party/install/glog")
+  copy(
+    ${TARGET}
+    SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${DST}/third_party/install/utf8proc")
+  copy(
+    ${TARGET}
+    SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  if(WITH_CRYPTO)
+    set(dst_dir "${DST}/third_party/install/cryptopp")
+    copy(
+      ${TARGET}
+      SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
+      DSTS ${dst_dir} ${dst_dir}/lib)
+  endif()
+
+  set(dst_dir "${DST}/third_party/install/xxhash")
+  copy(
+    ${TARGET}
+    SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  if(NOT PROTOBUF_FOUND OR WIN32)
+    set(dst_dir "${DST}/third_party/install/protobuf")
+    copy(
+      ${TARGET}
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
+      DSTS ${dst_dir} ${dst_dir}/lib)
+  endif()
+
+  if(LITE_BINARY_DIR)
+    set(dst_dir "${DST}/third_party/install/lite")
+    copy(
+      ${TARGET}
+      SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/*
+      DSTS ${dst_dir})
+  endif()
 endfunction()
 
 # inference library for only inference
-set(inference_lib_deps third_party paddle_inference paddle_inference_c paddle_inference_shared paddle_inference_c_shared)
+set(inference_lib_deps third_party paddle_inference paddle_inference_c
+                       paddle_inference_shared paddle_inference_c_shared)
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
 
-
 set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/threadpool")
-copy(inference_lib_dist
-        SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
-        DSTS ${dst_dir})
+copy(
+  inference_lib_dist
+  SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
+  DSTS ${dst_dir})
 
 # GPU must copy externalErrorMsg.pb
-IF(WITH_GPU)
-    set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data")
-    copy(inference_lib_dist
-            SRCS ${externalError_INCLUDE_DIR}
-            DSTS ${dst_dir})
-ENDIF()
-
-IF(WITH_XPU)
-    set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/xpu")
-    copy(inference_lib_dist
-        SRCS ${XPU_INC_DIR} ${XPU_LIB_DIR}
-        DSTS ${dst_dir} ${dst_dir})
-ENDIF()
+if(WITH_GPU)
+  set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data")
+  copy(
+    inference_lib_dist
+    SRCS ${externalError_INCLUDE_DIR}
+    DSTS ${dst_dir})
+endif()
+
+if(WITH_XPU)
+  set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/xpu")
+  copy(
+    inference_lib_dist
+    SRCS ${XPU_INC_DIR} ${XPU_LIB_DIR}
+    DSTS ${dst_dir} ${dst_dir})
+endif()
 
 # CMakeCache Info
-copy(inference_lib_dist
-        SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-        DSTS ${PADDLE_INFERENCE_INSTALL_DIR})
+copy(
+  inference_lib_dist
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR})
 
 copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
 
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 
 if(WIN32)
-    if(WITH_STATIC_LIB)
-        set(paddle_inference_lib $<TARGET_FILE_DIR:paddle_inference>/libpaddle_inference.lib
-                             $<TARGET_FILE_DIR:paddle_inference>/paddle_inference.*)
-    else()
-        set(paddle_inference_lib $<TARGET_FILE_DIR:paddle_inference_shared>/paddle_inference.dll
-                             $<TARGET_FILE_DIR:paddle_inference_shared>/paddle_inference.lib)
-    endif()
-    copy(inference_lib_dist
-            SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
-            DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
-            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+  if(WITH_STATIC_LIB)
+    set(paddle_inference_lib
+        $<TARGET_FILE_DIR:paddle_inference>/libpaddle_inference.lib
+        $<TARGET_FILE_DIR:paddle_inference>/paddle_inference.*)
+  else()
+    set(paddle_inference_lib
+        $<TARGET_FILE_DIR:paddle_inference_shared>/paddle_inference.dll
+        $<TARGET_FILE_DIR:paddle_inference_shared>/paddle_inference.lib)
+  endif()
+  copy(
+    inference_lib_dist
+    SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
+    DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include
+         ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
+         ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 else(WIN32)
-    set(paddle_inference_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_inference.*)
-    copy(inference_lib_dist
-                SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
-                DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+  set(paddle_inference_lib
+      ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_inference.*)
+  copy(
+    inference_lib_dist
+    SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib}
+    DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include
+         ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 endif(WIN32)
 
-copy(inference_lib_dist
-        SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
+copy(
+  inference_lib_dist
+  SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
 include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 
 # copy api headers for phi & custom op
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/*.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/phi/api/all.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/phi/common/*.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/phi/core/macros.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/phi/core/visit_type.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/flat_hash_map.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
-copy(inference_lib_dist
-        SRCS  ${PADDLE_SOURCE_DIR}/paddle/extension.h
-        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/*.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include/
+)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/all.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/common/*.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/macros.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/visit_type.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flat_hash_map.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 
 # the header file of phi is copied to the experimental directory,
 # the include path of phi needs to be changed to adapt to inference api path
-add_custom_command(TARGET inference_lib_dist POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake"
-        COMMENT "Change phi header include path to adapt to inference api path")
+add_custom_command(
+  TARGET inference_lib_dist
+  POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake"
+  COMMENT "Change phi header include path to adapt to inference api path")
 
 # CAPI inference library for only inference
-set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
-"A path setting CAPI paddle inference shared")
+set(PADDLE_INFERENCE_C_INSTALL_DIR
+    "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir"
+    CACHE STRING "A path setting CAPI paddle inference shared")
 copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
 
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 if(WIN32)
-  set(paddle_inference_c_lib $<TARGET_FILE_DIR:paddle_inference_c>/paddle_inference_c.*)
+  set(paddle_inference_c_lib
+      $<TARGET_FILE_DIR:paddle_inference_c>/paddle_inference_c.*)
 else(WIN32)
-  set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*)
+  set(paddle_inference_c_lib
+      ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*
+  )
 endif(WIN32)
 
-copy(inference_lib_dist
-      SRCS  ${src_dir}/inference/capi_exp/pd_*.h  ${paddle_inference_c_lib}
-      DSTS  ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)
+copy(
+  inference_lib_dist
+  SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
+  DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include
+       ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)
 
 if(WITH_STRIP AND NOT WIN32)
-        add_custom_command(TARGET inference_lib_dist POST_BUILD
-                COMMAND strip -s ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib/libpaddle_inference_c.so
-                COMMAND strip -s ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib/libpaddle_inference.so
-                COMMENT "striping libpaddle_inference_c.so\nstriping libpaddle_inference.so")
+  add_custom_command(
+    TARGET inference_lib_dist
+    POST_BUILD
+    COMMAND
+      strip -s
+      ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib/libpaddle_inference_c.so
+    COMMAND strip -s
+            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib/libpaddle_inference.so
+    COMMENT "striping libpaddle_inference_c.so\nstriping libpaddle_inference.so"
+  )
 endif()
 
 # fluid library for both train and inference
@@ -306,36 +382,55 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
 set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
 set(module "inference")
 if(WIN32)
-        copy(fluid_lib_dist
-                SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_inference_lib}
-                DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
-                )
-        else()
-        copy(fluid_lib_dist
-                SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_inference_lib}
-                DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} 
-                )
+  copy(
+    fluid_lib_dist
+    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h
+         ${paddle_inference_lib}
+    DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+         ${dst_dir}/${module})
+else()
+  copy(
+    fluid_lib_dist
+    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h
+         ${paddle_inference_lib}
+    DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module})
 endif()
 
 set(module "framework")
 set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
 add_dependencies(fluid_lib_dist ${framework_lib_deps})
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
-        ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet)
+copy(
+  fluid_lib_dist
+  SRCS ${src_dir}/${module}/*.h
+       ${src_dir}/${module}/details/*.h
+       ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h
+       ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
+       ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h
+       ${src_dir}/${module}/ir/memory_optimize_pass/*.h
+       ${src_dir}/${module}/ir/*.h
+       ${src_dir}/${module}/fleet/*.h
+  DSTS ${dst_dir}/${module}
+       ${dst_dir}/${module}/details
+       ${dst_dir}/${module}
+       ${dst_dir}/${module}
+       ${dst_dir}/${module}
+       ${dst_dir}/${module}/ir/memory_optimize_pass
+       ${dst_dir}/${module}/ir
+       ${dst_dir}/${module}/fleet)
 
 set(module "operators")
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/reader/blocking_queue.h
-        DSTS ${dst_dir}/${module}/reader/
-        )
+copy(
+  fluid_lib_dist
+  SRCS ${src_dir}/${module}/reader/blocking_queue.h
+  DSTS ${dst_dir}/${module}/reader/)
 
 set(module "memory")
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
-        )
+copy(
+  fluid_lib_dist
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+       ${src_dir}/${module}/allocation/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+       ${dst_dir}/${module}/allocation)
 
 set(module "platform")
 set(platform_lib_deps profiler_proto errors)
@@ -344,99 +439,113 @@ if(WITH_GPU)
 endif(WITH_GPU)
 
 add_dependencies(fluid_lib_dist ${platform_lib_deps})
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
-        )
+copy(
+  fluid_lib_dist
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h
+       ${src_dir}/${module}/details/*.h
+       ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload
+       ${dst_dir}/${module}/details ${dst_dir}/${module})
 
 set(module "string")
-copy(fluid_lib_dist
-        SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h 
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
-        )
+copy(
+  fluid_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h
+       ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat)
 
 set(module "imperative")
-copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/jit/*.h 
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/jit
-        )
+copy(
+  fluid_lib_dist
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/jit/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/jit)
 
 set(module "pybind")
-copy(fluid_lib_dist
-        SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
-        DSTS ${dst_dir}/${module}
-        )
+copy(
+  fluid_lib_dist
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
+  DSTS ${dst_dir}/${module})
 
 set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/eigen3")
-copy(inference_lib_dist
-        SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
-        DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
+copy(
+  inference_lib_dist
+  SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src
+       ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+  DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
 
 set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost")
-copy(inference_lib_dist
-        SRCS ${BOOST_INCLUDE_DIR}/boost
-        DSTS ${dst_dir})
+copy(
+  inference_lib_dist
+  SRCS ${BOOST_INCLUDE_DIR}/boost
+  DSTS ${dst_dir})
 
 set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack")
-copy(inference_lib_dist
-        SRCS ${DLPACK_INCLUDE_DIR}/dlpack
-        DSTS ${dst_dir})
+copy(
+  inference_lib_dist
+  SRCS ${DLPACK_INCLUDE_DIR}/dlpack
+  DSTS ${dst_dir})
 
 set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/install/zlib")
-copy(inference_lib_dist
-        SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
-        DSTS ${dst_dir} ${dst_dir}/lib)
-
+copy(
+  inference_lib_dist
+  SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib)
 
 # CMakeCache Info
-copy(fluid_lib_dist
-        SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-        DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR}
-        )
+copy(
+  fluid_lib_dist
+  SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party
+       ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+  DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR})
 
 # paddle fluid version
 function(version version_file)
-    execute_process(
-            COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-            OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-    file(WRITE ${version_file}
-            "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-            "WITH_MKL: ${WITH_MKL}\n"
-            "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-            "WITH_GPU: ${WITH_GPU}\n"
-            "WITH_ROCM: ${WITH_ROCM}\n"
-            "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
-            "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
-            "WITH_IPU: ${WITH_IPU}\n")
-    if(WITH_GPU)
-        file(APPEND ${version_file}
-                "CUDA version: ${CUDA_VERSION}\n"
-                "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n")
-    endif()
-    if(WITH_ROCM)
-        file(APPEND ${version_file}
-                "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
-                "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
-    endif()
-    if(WITH_ASCEND_CL)
-        file(APPEND ${version_file}
-                "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
-                "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
-    endif()
-    if(WITH_IPU)
-        file(APPEND ${version_file}
-                "PopART version: ${POPART_VERSION}\n")
-    endif()
-    file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
-    if(TENSORRT_FOUND)
-        file(APPEND ${version_file}
-                "WITH_TENSORRT: ${TENSORRT_FOUND}\n" "TensorRT version: v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n")
-    endif()
-    if(WITH_LITE)
-        file(APPEND ${version_file} "WITH_LITE: ${WITH_LITE}\n" "LITE_GIT_TAG: ${LITE_GIT_TAG}\n")
-    endif()
-    
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+  file(
+    WRITE ${version_file}
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+    "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
+    "WITH_GPU: ${WITH_GPU}\n"
+    "WITH_ROCM: ${WITH_ROCM}\n"
+    "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
+    "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
+    "WITH_IPU: ${WITH_IPU}\n")
+  if(WITH_GPU)
+    file(APPEND ${version_file}
+         "CUDA version: ${CUDA_VERSION}\n"
+         "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n")
+  endif()
+  if(WITH_ROCM)
+    file(APPEND ${version_file}
+         "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
+         "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
+  endif()
+  if(WITH_ASCEND_CL)
+    file(APPEND ${version_file}
+         "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
+         "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
+  endif()
+  if(WITH_IPU)
+    file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n")
+  endif()
+  file(APPEND ${version_file}
+       "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
+  if(TENSORRT_FOUND)
+    file(
+      APPEND ${version_file}
+      "WITH_TENSORRT: ${TENSORRT_FOUND}\n"
+      "TensorRT version: v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n"
+    )
+  endif()
+  if(WITH_LITE)
+    file(APPEND ${version_file} "WITH_LITE: ${WITH_LITE}\n"
+                                "LITE_GIT_TAG: ${LITE_GIT_TAG}\n")
+  endif()
+
 endfunction()
 version(${PADDLE_INSTALL_DIR}/version.txt)
 version(${PADDLE_INFERENCE_INSTALL_DIR}/version.txt)
diff --git a/cmake/infrt_lib.cmake b/cmake/infrt_lib.cmake
index 5b27c9d8400cc..21dcd0ef36d16 100644
--- a/cmake/infrt_lib.cmake
+++ b/cmake/infrt_lib.cmake
@@ -12,65 +12,74 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set(INFRT_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir" CACHE STRING
-  "A path setting paddle infrt shared and static libraries")
-  
+set(INFRT_INSTALL_DIR
+    "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir"
+    CACHE STRING "A path setting paddle infrt shared and static libraries")
+
 function(copy TARGET)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DSTS)
-    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DSTS)
+  cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
-    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
-    if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
-        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
-    endif ()
-    math(EXPR len "${copy_lib_SRCS_len} - 1")
-    foreach (index RANGE ${len})
-        list(GET copy_lib_SRCS ${index} src)
-        list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} POST_BUILD
-                COMMAND mkdir -p "${dst}"
-                COMMAND cp -r "${src}" "${dst}"
-                COMMENT "copying ${src} -> ${dst}")
-    endforeach ()
+  list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+  list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+  if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+    message(
+      FATAL_ERROR
+        "${TARGET} source numbers are not equal to destination numbers")
+  endif()
+  math(EXPR len "${copy_lib_SRCS_len} - 1")
+  foreach(index RANGE ${len})
+    list(GET copy_lib_SRCS ${index} src)
+    list(GET copy_lib_DSTS ${index} dst)
+    add_custom_command(
+      TARGET ${TARGET}
+      POST_BUILD
+      COMMAND mkdir -p "${dst}"
+      COMMAND cp -r "${src}" "${dst}"
+      COMMENT "copying ${src} -> ${dst}")
+  endforeach()
 endfunction()
 
-function(copy_part_of_thrid_party TARGET DST) 
-    set(dst_dir "${DST}/third_party/install/glog")
-    copy(${TARGET}
-            SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib)
+function(copy_part_of_thrid_party TARGET DST)
+  set(dst_dir "${DST}/third_party/install/glog")
+  copy(
+    ${TARGET}
+    SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
 endfunction()
 
 # inference library for only inference
 set(infrt_lib_deps third_party infrt infrt_static)
 add_custom_target(infrt_lib_dist DEPENDS ${infrt_lib_deps})
 
-
 # CMakeCache Info
-copy(infrt_lib_dist
-        SRCS ${CMAKE_BINARY_DIR}/CMakeCache.txt
-        DSTS ${INFRT_INSTALL_DIR})
+copy(
+  infrt_lib_dist
+  SRCS ${CMAKE_BINARY_DIR}/CMakeCache.txt
+  DSTS ${INFRT_INSTALL_DIR})
 
 set(infrt_lib ${INFRT_BINARY_DIR}/libinfrt.*)
-copy(infrt_lib_dist
-    SRCS  ${INFRT_SOURCE_DIR}/api/infrt_api.h ${infrt_lib}
-    DSTS  ${INFRT_INSTALL_DIR}/infrt/include ${INFRT_INSTALL_DIR}/infrt/lib)
-
+copy(
+  infrt_lib_dist
+  SRCS ${INFRT_SOURCE_DIR}/api/infrt_api.h ${infrt_lib}
+  DSTS ${INFRT_INSTALL_DIR}/infrt/include ${INFRT_INSTALL_DIR}/infrt/lib)
 
-copy(infrt_lib_dist
-        SRCS  ${INFRT_BINARY_DIR}/paddle/framework.pb.h
-        DSTS  ${INFRT_INSTALL_DIR}/infrt/include/internal)
+copy(
+  infrt_lib_dist
+  SRCS ${INFRT_BINARY_DIR}/paddle/framework.pb.h
+  DSTS ${INFRT_INSTALL_DIR}/infrt/include/internal)
 
 # paddle fluid version
 function(version version_file)
-    execute_process(
-            COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-            OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-    file(WRITE ${version_file}  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n")
-    file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+  file(WRITE ${version_file} "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n")
+  file(APPEND ${version_file}
+       "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
 endfunction()
 version(${INFRT_INSTALL_DIR}/version.txt)
diff --git a/cmake/init.cmake b/cmake/init.cmake
index 0ebcdc8ceeebc..86c43cb233bfc 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -8,43 +8,44 @@
 # MINSIZEREL: default: "-O2 -g -DNDEBUG"
 
 if(NOT WIN32)
-    set(CMAKE_C_FLAGS_DEBUG "-g")
-    set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
-    set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-    set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+  set(CMAKE_C_FLAGS_DEBUG "-g")
+  set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG")
+  set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+  set(CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG")
 
-    set(CMAKE_CXX_FLAGS_DEBUG "-g")
-    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-    set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
-    
-    if(WITH_GPU)
-        set(CMAKE_CUDA_FLAGS_DEBUG "-g")
-        set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
-        set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
-        set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
-    endif()
+  set(CMAKE_CXX_FLAGS_DEBUG "-g")
+  set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+  set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
+
+  if(WITH_GPU)
+    set(CMAKE_CUDA_FLAGS_DEBUG "-g")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
+  endif()
 else()
-    set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1")
-    set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG")
-    set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG")
-    set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG")
+  set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1")
+  set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG")
+  set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG")
+  set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG")
 
-    set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1")
-    set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG")
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG")
-    set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG")
+  set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1")
+  set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG")
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG")
+  set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG")
 
-    if(WITH_GPU)
-        set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"")
-        set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG")
-        set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG")
-        set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG")
-    endif()
+  if(WITH_GPU)
+    set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO
+        "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG")
+    set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG")
+  endif()
 
-    # It can specify CUDA compile flag manualy,
-    # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous
-    # because CUDA will update by nvidia, then error will occur.
-    # Now, it's only used in VS2015 + CUDA:[10.0, 10.2]
-    set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
+  # The CUDA compile flags can be specified manually here;
+  # this is used to remove /Zi to reduce GPU static library size. But it's dangerous
+  # because CUDA is updated by NVIDIA, and errors may then occur.
+  # Currently, it's only used in VS2015 + CUDA:[10.0, 10.2]
+  set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
 endif()
diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake
index 493c37955f725..392ff0401eaef 100644
--- a/cmake/miopen.cmake
+++ b/cmake/miopen.cmake
@@ -1,65 +1,77 @@
 if(NOT WITH_ROCM)
-    return()
+  return()
 endif()
 
 # Now we don't support ROCm on windows
 if(WIN32)
-    return()
+  return()
 endif()
 
-set(MIOPEN_ROOT ${ROCM_PATH}/miopen CACHE PATH "MIOPEN ROOT")
+set(MIOPEN_ROOT
+    ${ROCM_PATH}/miopen
+    CACHE PATH "MIOPEN ROOT")
 
-find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h"
-    PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include
-          $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include $ENV{MIOPEN_ROOT}/local/include
-          NO_DEFAULT_PATH
-)
+find_path(
+  MIOPEN_INCLUDE_DIR "miopen/miopen.h"
+  PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include
+        $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include
+        $ENV{MIOPEN_ROOT}/local/include
+  NO_DEFAULT_PATH)
 
-find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so"
-    PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} 
-          $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 
-          NO_DEFAULT_PATH 
-    DOC "Path to MIOpen library.")
+find_library(
+  MIOPEN_LIBRARY
+  NAMES "libMIOpen.so"
+  PATHS ${MIOPEN_ROOT}
+        ${MIOPEN_ROOT}/lib
+        ${MIOPEN_ROOT}/lib64
+        ${__libpath_hist}
+        $ENV{MIOPEN_ROOT}
+        $ENV{MIOPEN_ROOT}/lib
+        $ENV{MIOPEN_ROOT}/lib64
+  NO_DEFAULT_PATH
+  DOC "Path to MIOpen library.")
 
 if(MIOPEN_INCLUDE_DIR AND MIOPEN_LIBRARY)
-    set(MIOPEN_FOUND ON)
+  set(MIOPEN_FOUND ON)
 else()
-    set(MIOPEN_FOUND OFF)
+  set(MIOPEN_FOUND OFF)
 endif()
 
-macro(find_miopen_version miopen_header_file) 
-    file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS)
-    get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY)
+macro(find_miopen_version miopen_header_file)
+  file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS)
+  get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY)
 
-    string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" MIOPEN_MAJOR_VERSION
-        "${MIOPEN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1"
-        MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}")
-    string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" MIOPEN_MINOR_VERSION
-        "${MIOPEN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1"
-        MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}")
-    string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" MIOPEN_PATCH_VERSION
-        "${MIOPEN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1"
-        MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}")
-    string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" MIOPEN_TWEAK_VERSION
-        "${MIOPEN_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1"
-        MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}")
+  string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)"
+               MIOPEN_MAJOR_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1"
+                       MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}")
+  string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)"
+               MIOPEN_MINOR_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1"
+                       MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}")
+  string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)"
+               MIOPEN_PATCH_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1"
+                       MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}")
+  string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)"
+               MIOPEN_TWEAK_VERSION "${MIOPEN_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1"
+                       MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}")
 
-    if(NOT MIOPEN_MAJOR_VERSION)
-        set(MIOPEN_VERSION "???")
-    else()
-        add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"")
-        math(EXPR MIOPEN_VERSION
-            "${MIOPEN_MAJOR_VERSION} * 1000 +
+  if(NOT MIOPEN_MAJOR_VERSION)
+    set(MIOPEN_VERSION "???")
+  else()
+    add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"")
+    math(EXPR MIOPEN_VERSION "${MIOPEN_MAJOR_VERSION} * 1000 +
              ${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}")
-        message(STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h "
-          "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ")
-    endif()
+    message(
+      STATUS
+        "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h "
+        "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. "
+    )
+  endif()
 endmacro()
 
 if(MIOPEN_FOUND)
-  find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h) 
+  find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h)
 endif()
diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake
index 9124fec0b856a..8ce3cd91ac82a 100644
--- a/cmake/nccl.cmake
+++ b/cmake/nccl.cmake
@@ -1,55 +1,59 @@
 if(NOT WITH_GPU)
-    return()
+  return()
 endif()
 
 # Now we don't support NCCL on windows
 if(WIN32)
-    return()
+  return()
 endif()
 
 if(WITH_NCCL)
-    set(NCCL_ROOT "/usr" CACHE PATH "NCCL ROOT")
-    find_path(NCCL_INCLUDE_DIR nccl.h
-        PATHS ${NCCL_ROOT} ${NCCL_ROOT}/include ${NCCL_ROOT}/local/include
-        $ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include $ENV{NCCL_ROOT}/local/include
-        NO_DEFAULT_PATH
-    )
+  set(NCCL_ROOT
+      "/usr"
+      CACHE PATH "NCCL ROOT")
+  find_path(
+    NCCL_INCLUDE_DIR nccl.h
+    PATHS ${NCCL_ROOT} ${NCCL_ROOT}/include ${NCCL_ROOT}/local/include
+          $ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include $ENV{NCCL_ROOT}/local/include
+    NO_DEFAULT_PATH)
 
-    file(READ ${NCCL_INCLUDE_DIR}/nccl.h NCCL_VERSION_FILE_CONTENTS)
+  file(READ ${NCCL_INCLUDE_DIR}/nccl.h NCCL_VERSION_FILE_CONTENTS)
 
-    string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)"
-        NCCL_VERSION "${NCCL_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1"
-        NCCL_VERSION "${NCCL_VERSION}")
+  string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" NCCL_VERSION
+               "${NCCL_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" NCCL_VERSION
+                       "${NCCL_VERSION}")
 
-    if("${NCCL_VERSION}" GREATER "2000")
-        message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. "
-            "Current NCCL version is v${NCCL_VERSION}. ")
-    else()
-        # in old version nccl, it may not define NCCL_VERSION_CODE
-        string(REGEX MATCH "define NCCL_MAJOR +([0-9]+)" NCCL_MAJOR_VERSION
-            "${NCCL_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define NCCL_MAJOR +([0-9]+)" "\\1"
-            NCCL_MAJOR_VERSION "${NCCL_MAJOR_VERSION}")
-        string(REGEX MATCH "define NCCL_MINOR +([0-9]+)" NCCL_MINOR_VERSION
-            "${NCCL_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define NCCL_MINOR +([0-9]+)" "\\1"
-            NCCL_MINOR_VERSION "${NCCL_MINOR_VERSION}")
-        string(REGEX MATCH "define NCCL_PATCH +([0-9]+)"
-            NCCL_PATCH_VERSION "${NCCL_VERSION_FILE_CONTENTS}")
-        string(REGEX REPLACE "define NCCL_PATCH +([0-9]+)" "\\1"
-            NCCL_PATCH_VERSION "${NCCL_PATCH_VERSION}")
+  if("${NCCL_VERSION}" GREATER "2000")
+    message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. "
+                   "Current NCCL version is v${NCCL_VERSION}. ")
+  else()
+    # older NCCL releases may not define NCCL_VERSION_CODE in nccl.h
+    string(REGEX MATCH "define NCCL_MAJOR +([0-9]+)" NCCL_MAJOR_VERSION
+                 "${NCCL_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NCCL_MAJOR +([0-9]+)" "\\1" NCCL_MAJOR_VERSION
+                         "${NCCL_MAJOR_VERSION}")
+    string(REGEX MATCH "define NCCL_MINOR +([0-9]+)" NCCL_MINOR_VERSION
+                 "${NCCL_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NCCL_MINOR +([0-9]+)" "\\1" NCCL_MINOR_VERSION
+                         "${NCCL_MINOR_VERSION}")
+    string(REGEX MATCH "define NCCL_PATCH +([0-9]+)" NCCL_PATCH_VERSION
+                 "${NCCL_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NCCL_PATCH +([0-9]+)" "\\1" NCCL_PATCH_VERSION
+                         "${NCCL_PATCH_VERSION}")
 
-        if(NOT NCCL_MAJOR_VERSION)
-            set(NCCL_VERSION "0")
-        else()
-            math(EXPR NCCL_VERSION
-                "${NCCL_MAJOR_VERSION} * 1000 +
+    if(NOT NCCL_MAJOR_VERSION)
+      set(NCCL_VERSION "0")
+    else()
+      math(EXPR NCCL_VERSION "${NCCL_MAJOR_VERSION} * 1000 +
                  ${NCCL_MINOR_VERSION} * 100 + ${NCCL_PATCH_VERSION}")
-        endif()
-        add_definitions("-DNCCL_VERSION_CODE=$NCCL_VERSION")
-
-        message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. "
-            "Current NCCL version is v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} ")
     endif()
+    add_definitions("-DNCCL_VERSION_CODE=${NCCL_VERSION}") # needs ${...}: CMake does not expand a bare $NAME
+
+    message(
+      STATUS
+        "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. "
+        "Current NCCL version is v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} "
+    )
+  endif()
 endif()
diff --git a/cmake/neuware.cmake b/cmake/neuware.cmake
index a371a0032d991..16dbf16899b5d 100644
--- a/cmake/neuware.cmake
+++ b/cmake/neuware.cmake
@@ -1,18 +1,18 @@
 if(NOT WITH_MLU)
-    return()
+  return()
 endif()
 
 if(NOT ENV{NEUWARE_HOME})
-    set(NEUWARE_HOME "/usr/local/neuware")
+  set(NEUWARE_HOME "/usr/local/neuware")
 else()
-    set(NEUWARE_HOME $ENV{NEUWARE_HOME})
+  set(NEUWARE_HOME $ENV{NEUWARE_HOME})
 endif()
 message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME})
 
 set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include)
 set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
 
-INCLUDE_DIRECTORIES(${NEUWARE_INCLUDE_DIR})
+include_directories(${NEUWARE_INCLUDE_DIR})
 
 set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
 set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
@@ -23,10 +23,10 @@ generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
 set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB} ${CNPAPI_LIB})
 
 if(WITH_CNCL)
-      MESSAGE(STATUS "Compile with CNCL!")
-      ADD_DEFINITIONS(-DPADDLE_WITH_CNCL)
-      set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
-      list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
+  message(STATUS "Compile with CNCL!")
+  add_definitions(-DPADDLE_WITH_CNCL)
+  set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
+  list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
 endif()
 
-TARGET_LINK_LIBRARIES(neuware_lib ${NEUWARE_LIB_DEPS})
+target_link_libraries(neuware_lib ${NEUWARE_LIB_DEPS})
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 51e4bd3ac41c9..4e0cc1027eff0 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -3,538 +3,611 @@ include(unity_build)
 set(PART_CUDA_KERNEL_FILES)
 
 function(find_register FILENAME PATTERN OUTPUT)
-# find the op_name of REGISTER_OPERATOR(op_name, ...), REGISTER_OP_CPU_KERNEL(op_name, ...) , etc.
-# set op_name to OUTPUT
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs "")
-    file(READ ${FILENAME} CONTENT)
-    # message ("number of arguments sent to function: ${ARGC}")
-    # message ("all function arguments:               ${ARGV}")
-    # message("PATTERN ${PATTERN}")
-    string(REGEX MATCH "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*," register "${CONTENT}")
-    if (NOT register STREQUAL "")
-        string(REPLACE "${PATTERN}(" "" register "${register}")
-        string(REPLACE "," "" register "${register}")
-        # [ \t\r\n]+ is used for blank characters.
-        # Here we use '+' instead of '*' since it is a REPLACE operation.
-        string(REGEX REPLACE "[ \t\r\n]+" "" register "${register}")
-    endif()
-    
-    set(${OUTPUT} ${register} PARENT_SCOPE)
+  # find the op_name of REGISTER_OPERATOR(op_name, ...), REGISTER_OP_CPU_KERNEL(op_name, ...) , etc.
+  # set op_name to OUTPUT
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs "")
+  file(READ ${FILENAME} CONTENT)
+  # message ("number of arguments sent to function: ${ARGC}")
+  # message ("all function arguments:               ${ARGV}")
+  # message("PATTERN ${PATTERN}")
+  string(REGEX MATCH "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*," register "${CONTENT}")
+  if(NOT register STREQUAL "")
+    string(REPLACE "${PATTERN}(" "" register "${register}")
+    string(REPLACE "," "" register "${register}")
+    # [ \t\r\n]+ is used for blank characters.
+    # Here we use '+' instead of '*' since it is a REPLACE operation.
+    string(REGEX REPLACE "[ \t\r\n]+" "" register "${register}")
+  endif()
+
+  set(${OUTPUT}
+      ${register}
+      PARENT_SCOPE)
 endfunction()
 
 function(op_library TARGET)
-    # op_library is a function to create op library. The interface is same as
-    # cc_library. But it handle split GPU/CPU code and link some common library
-    # for ops.
-    set(cc_srcs)
-    set(cu_srcs)
-    set(hip_srcs)
-    set(cu_cc_srcs)
-    set(hip_cc_srcs)
-    set(xpu_cc_srcs)
-    set(xpu_kp_cc_srcs)
-    set(npu_cc_srcs)
-    set(mlu_cc_srcs)
-    set(cudnn_cu_cc_srcs)
-    set(miopen_cu_cc_srcs)
-    set(cudnn_cu_srcs)
-    set(miopen_cu_srcs)
-    set(CUDNN_FILE)
-    set(MIOPEN_FILE)
-    set(mkldnn_cc_srcs)
-    set(MKLDNN_FILE)
-    set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
-    if (WITH_ASCEND_CL)
-      set(op_common_deps ${op_common_deps} npu_op_runner)
-    endif()
-    if (WITH_MLU)
-      set(op_common_deps ${op_common_deps} mlu_baseop)
-    endif()
+  # op_library is a function to create an op library. The interface is the same
+  # as cc_library, but it handles splitting GPU/CPU code and links some common
+  # libraries for ops.
+  set(cc_srcs)
+  set(cu_srcs)
+  set(hip_srcs)
+  set(cu_cc_srcs)
+  set(hip_cc_srcs)
+  set(xpu_cc_srcs)
+  set(xpu_kp_cc_srcs)
+  set(npu_cc_srcs)
+  set(mlu_cc_srcs)
+  set(cudnn_cu_cc_srcs)
+  set(miopen_cu_cc_srcs)
+  set(cudnn_cu_srcs)
+  set(miopen_cu_srcs)
+  set(CUDNN_FILE)
+  set(MIOPEN_FILE)
+  set(mkldnn_cc_srcs)
+  set(MKLDNN_FILE)
+  set(op_common_deps operator op_registry math_function layer
+                     common_infer_shape_functions)
+  if(WITH_ASCEND_CL)
+    set(op_common_deps ${op_common_deps} npu_op_runner)
+  endif()
+  if(WITH_MLU)
+    set(op_common_deps ${op_common_deps} mlu_baseop)
+  endif()
 
-    # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
-    set(options UNITY)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    set(pybind_flag 0)
-    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
+  # Option `UNITY` is used to specify that operator `TARGET` will be compiled with Unity Build.
+  set(options UNITY)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  set(pybind_flag 0)
+  cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    list(LENGTH op_library_SRCS op_library_SRCS_len)
-    if (${op_library_SRCS_len} EQUAL 0)
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-            list(APPEND cc_srcs ${TARGET}.cc)
-        endif()
-        if(WITH_GPU)
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
-                list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
-                list(APPEND cu_srcs ${TARGET}.cu)
-            endif()
-            # rename in KP: .kps -> .cu
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
-                file(COPY ${TARGET}.kps DESTINATION  ${CMAKE_CURRENT_BINARY_DIR})
-                file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
-                list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
-            endif()
-            if (WITH_NV_JETSON)
-                list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu")
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
-                set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
-                        ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
-                list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
-            endif()
-            string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
-                list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu)
-                list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
-            endif()
-        endif()
-        if(WITH_ROCM)
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
-                list(APPEND hip_cc_srcs ${TARGET}.cu.cc)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
-                list(APPEND hip_srcs ${TARGET}.cu)
-            endif()
-            # rename in KP: .kps -> .cu
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
-                file(COPY ${TARGET}.kps DESTINATION  ${CMAKE_CURRENT_BINARY_DIR})
-                file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
-                list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
-                set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
-                        ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
-                list(APPEND hip_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
-            endif()
-            string(REPLACE "_op" "_cudnn_op" MIOPEN_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu.cc)
-                list(APPEND miopen_cu_cc_srcs ${MIOPEN_FILE}.cu.cc)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu)
-                list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu)
-            endif()
-        endif()
-        if(WITH_MKLDNN)
-            string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
-                list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
-            endif()
-        endif()
-        if(WITH_XPU)
-            string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
-                list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
-            endif()
-        endif()
-        if(WITH_XPU_KP)
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
-                list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
-                list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
-            endif()
-        endif()
-        if(WITH_ASCEND_CL)
-            string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
-                list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
-            endif()
-        endif()
-        if(WITH_MLU)
-            string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
-                list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
-            endif()
-        endif()
-    else()
-        foreach(src ${op_library_SRCS})
-            if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
-                list(APPEND miopen_cu_srcs ${src})
-            elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu$")
-                list(APPEND hip_srcs ${src})
-            elseif(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu.cc$")
-                list(APPEND miopen_cu_cc_srcs ${src})
-            elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$")
-                list(APPEND hip_cc_srcs ${src})
-            elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$")
-                list(APPEND cudnn_cu_srcs ${src})
-            elseif (WITH_GPU AND ${src} MATCHES ".*\\.cu$")
-                list(APPEND cu_srcs ${src})
-            elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu.cc$")
-                list(APPEND cudnn_cu_cc_srcs ${src})
-            elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$")
-                list(APPEND cu_cc_srcs ${src})
-            elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
-                list(APPEND mkldnn_cc_srcs ${src})
-            elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
-                list(APPEND xpu_cc_srcs ${src})
-            elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
-                list(APPEND xpu_kp_cc_srcs ${src})
-            elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
-                list(APPEND xpu_kp_cc_srcs ${src})
-            elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
-                list(APPEND npu_cc_srcs ${src})
-            elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
-                list(APPEND mlu_cc_srcs ${src})
-            elseif(${src} MATCHES ".*\\.cc$")
-                list(APPEND cc_srcs ${src})
-            else()
-                message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu or .xpu")
-            endif()
-        endforeach()
+  list(LENGTH op_library_SRCS op_library_SRCS_len)
+  if(${op_library_SRCS_len} EQUAL 0)
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+      list(APPEND cc_srcs ${TARGET}.cc)
+    endif()
+    if(WITH_GPU)
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+        list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+        list(APPEND cu_srcs ${TARGET}.cu)
+      endif()
+      # rename in KP: .kps -> .cu
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
+        file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+        file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps
+             ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
+        list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
+      endif()
+      if(WITH_NV_JETSON)
+        list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu")
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+        set(PART_CUDA_KERNEL_FILES
+            ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
+            ${PART_CUDA_KERNEL_FILES}
+            PARENT_SCOPE)
+        list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+      endif()
+      string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
+        list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu)
+        list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu)
+      endif()
+    endif()
+    if(WITH_ROCM)
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+        list(APPEND hip_cc_srcs ${TARGET}.cu.cc)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+        list(APPEND hip_srcs ${TARGET}.cu)
+      endif()
+      # rename in KP: .kps -> .cu
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
+        file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+        file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps
+             ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
+        list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+        set(PART_CUDA_KERNEL_FILES
+            ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
+            ${PART_CUDA_KERNEL_FILES}
+            PARENT_SCOPE)
+        list(APPEND hip_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+      endif()
+      string(REPLACE "_op" "_cudnn_op" MIOPEN_FILE "${TARGET}")
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu.cc)
+        list(APPEND miopen_cu_cc_srcs ${MIOPEN_FILE}.cu.cc)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.cu)
+        list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu)
+      endif()
+    endif()
+    if(WITH_MKLDNN)
+      string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
+        list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
+      endif()
+    endif()
+    if(WITH_XPU)
+      string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
+        list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
+      endif()
+    endif()
+    if(WITH_XPU_KP)
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
+        list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
+        list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
+      endif()
+    endif()
+    if(WITH_ASCEND_CL)
+      string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
+        list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
+      endif()
     endif()
-    
-    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
-    list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len)
-    list(LENGTH cc_srcs cc_srcs_len)
-    if (${cc_srcs_len} EQUAL 0)
-        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
+    if(WITH_MLU)
+      string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
+        list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
+      endif()
     endif()
-    if (WIN32)
+  else()
+    foreach(src ${op_library_SRCS})
+      if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
+        list(APPEND miopen_cu_srcs ${src})
+      elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu$")
+        list(APPEND hip_srcs ${src})
+      elseif(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu.cc$")
+        list(APPEND miopen_cu_cc_srcs ${src})
+      elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$")
+        list(APPEND hip_cc_srcs ${src})
+      elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$")
+        list(APPEND cudnn_cu_srcs ${src})
+      elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu$")
+        list(APPEND cu_srcs ${src})
+      elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu.cc$")
+        list(APPEND cudnn_cu_cc_srcs ${src})
+      elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$")
+        list(APPEND cu_cc_srcs ${src})
+      elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
+        list(APPEND mkldnn_cc_srcs ${src})
+      elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
+        list(APPEND xpu_cc_srcs ${src})
+      elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
+        list(APPEND xpu_kp_cc_srcs ${src})
+      elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
+        list(APPEND xpu_kp_cc_srcs ${src})
+      elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
+        list(APPEND npu_cc_srcs ${src})
+      elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
+        list(APPEND mlu_cc_srcs ${src})
+      elseif(${src} MATCHES ".*\\.cc$")
+        list(APPEND cc_srcs ${src})
+      else()
+        message(
+          FATAL_ERROR
+            "${TARGET} Source file ${src} should only be .cc or .cu or .xpu")
+      endif()
+    endforeach()
+  endif()
+
+  list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
+  list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len)
+  list(LENGTH cc_srcs cc_srcs_len)
+  if(${cc_srcs_len} EQUAL 0)
+    message(
+      FATAL_ERROR
+        "The op library ${TARGET} should contains at least one .cc file")
+  endif()
+  if(WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
     foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
-        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
-          return()
-        endif()
+      if("${TARGET}" STREQUAL "${windows_unsupport_op}")
+        return()
+      endif()
     endforeach()
-    endif(WIN32)
+  endif(WIN32)
+
+  # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
+  if(WITH_UNITY_BUILD AND op_library_UNITY)
+    # Generate the unity target name by the directory where source files located.
+    string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET
+                   ${CMAKE_CURRENT_SOURCE_DIR})
+    string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET})
+    set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity")
+    if(NOT ${UNITY_TARGET} IN_LIST OP_LIBRARY)
+      set(OP_LIBRARY
+          ${UNITY_TARGET} ${OP_LIBRARY}
+          CACHE INTERNAL "op libs")
+    endif()
+  else()
+    set(OP_LIBRARY
+        ${TARGET} ${OP_LIBRARY}
+        CACHE INTERNAL "op libs")
+  endif()
 
+  list(LENGTH op_library_DEPS op_library_DEPS_len)
+  if(${op_library_DEPS_len} GREATER 0)
+    set(DEPS_OPS
+        ${TARGET} ${DEPS_OPS}
+        PARENT_SCOPE)
+  endif()
+  if(WITH_GPU)
     # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
     if(WITH_UNITY_BUILD AND op_library_UNITY)
-        # Generate the unity target name by the directory where source files located.
-        string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR})
-        string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET})
-        set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity")
-        if(NOT ${UNITY_TARGET} IN_LIST OP_LIBRARY)
-            set(OP_LIBRARY ${UNITY_TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
-        endif()
+      # Combine the cc and cu source files.
+      compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs}
+                                   ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs})
+      compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs}
+                                   ${cu_srcs})
+      if(TARGET ${UNITY_TARGET})
+        # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
+        target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}
+                                               ${unity_target_cu_sources})
+      else()
+        # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files.
+        nv_library(
+          ${UNITY_TARGET}
+          SRCS ${unity_target_cc_sources} ${unity_target_cu_sources}
+          DEPS ${op_library_DEPS} ${op_common_deps})
+      endif()
+      # Add alias library to handle dependencies.
+      add_library(${TARGET} ALIAS ${UNITY_TARGET})
     else()
-        set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
+      nv_library(
+        ${TARGET}
+        SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs}
+             ${mkldnn_cc_srcs} ${cu_srcs}
+        DEPS ${op_library_DEPS} ${op_common_deps})
     endif()
-
-    list(LENGTH op_library_DEPS op_library_DEPS_len)
-    if (${op_library_DEPS_len} GREATER 0)
-        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
+  elseif(WITH_ROCM)
+    list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
+    list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
+    list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
+    list(REMOVE_ITEM hip_srcs "cholesky_solve_op.cu")
+    list(REMOVE_ITEM hip_srcs "lu_op.cu")
+    list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
+    list(REMOVE_ITEM hip_srcs "svd_op.cu")
+    list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")
+    list(REMOVE_ITEM hip_srcs "qr_op.cu")
+    list(REMOVE_ITEM hip_srcs "eigh_op.cu")
+    list(REMOVE_ITEM hip_srcs "lstsq_op.cu")
+    list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
+    list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
+    hip_library(
+      ${TARGET}
+      SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs}
+           ${mkldnn_cc_srcs} ${hip_srcs}
+      DEPS ${op_library_DEPS} ${op_common_deps})
+  elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
+    xpu_library(
+      ${TARGET}
+      SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
+      DEPS ${op_library_DEPS} ${op_common_deps})
+  else()
+    # deal with CANN version control while registering NPU operators before build
+    if(WITH_ASCEND_CL)
+      if(CANN_VERSION LESS 504000)
+        list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc")
+        list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc")
+      endif()
     endif()
-    if (WITH_GPU)
-        # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
-        if(WITH_UNITY_BUILD AND op_library_UNITY)
-            # Combine the cc and cu source files.
-            compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs})
-            compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs} ${cu_srcs})
-            if(TARGET ${UNITY_TARGET})
-                # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
-                target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources} ${unity_target_cu_sources})
-            else()
-                # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files.
-                nv_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} ${unity_target_cu_sources} DEPS ${op_library_DEPS} ${op_common_deps})
-            endif()
-            # Add alias library to handle dependencies.
-            add_library(${TARGET} ALIAS ${UNITY_TARGET})
-        else()
-            nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
-                ${op_common_deps})
-        endif()
-    elseif (WITH_ROCM)
-        list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
-        list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
-        list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
-        list(REMOVE_ITEM hip_srcs "cholesky_solve_op.cu")
-        list(REMOVE_ITEM hip_srcs "lu_op.cu")
-        list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
-        list(REMOVE_ITEM hip_srcs "svd_op.cu")
-        list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")
-        list(REMOVE_ITEM hip_srcs "qr_op.cu")
-        list(REMOVE_ITEM hip_srcs "eigh_op.cu")
-        list(REMOVE_ITEM hip_srcs "lstsq_op.cu")
-        list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
-        list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
-        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
-                ${op_common_deps})
-    elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
-        xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
+    # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
+    if(WITH_UNITY_BUILD AND op_library_UNITY)
+      # Combine the cc source files.
+      compose_unity_target_sources(
+        ${UNITY_TARGET}
+        cc
+        ${cc_srcs}
+        ${mkldnn_cc_srcs}
+        ${xpu_cc_srcs}
+        ${npu_cc_srcs}
+        ${mlu_cc_srcs})
+      if(TARGET ${UNITY_TARGET})
+        # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
+        target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
+      else()
+        # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files.
+        cc_library(
+          ${UNITY_TARGET}
+          SRCS ${unity_target_cc_sources}
+          DEPS ${op_library_DEPS} ${op_common_deps})
+      endif()
+      # Add alias library to handle dependencies.
+      add_library(${TARGET} ALIAS ${UNITY_TARGET})
     else()
-        # deal with CANN version control while registering NPU operators before build
-        if (WITH_ASCEND_CL)
-            if (CANN_VERSION LESS 504000)
-                list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc")
-                list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc")
-            endif()
-        endif()
-        # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
-        if(WITH_UNITY_BUILD AND op_library_UNITY)
-            # Combine the cc source files.
-            compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} ${mlu_cc_srcs})
-            if(TARGET ${UNITY_TARGET})
-                # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
-                target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
-            else()
-                # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files.
-                cc_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} DEPS ${op_library_DEPS} ${op_common_deps})
-            endif()
-            # Add alias library to handle dependencies.
-            add_library(${TARGET} ALIAS ${UNITY_TARGET})
-        else()
-            cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} ${mlu_cc_srcs} DEPS ${op_library_DEPS}
-                ${op_common_deps})
-        endif()
+      cc_library(
+        ${TARGET}
+        SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}
+             ${mlu_cc_srcs}
+        DEPS ${op_library_DEPS} ${op_common_deps})
     endif()
+  endif()
 
-    list(LENGTH cu_srcs cu_srcs_len)
-    list(LENGTH hip_srcs hip_srcs_len)
-    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-    list(LENGTH hip_cc_srcs hip_cc_srcs_len)
-    list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
-    list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
-    list(LENGTH npu_cc_srcs npu_cc_srcs_len)
-    list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
+  list(LENGTH cu_srcs cu_srcs_len)
+  list(LENGTH hip_srcs hip_srcs_len)
+  list(LENGTH cu_cc_srcs cu_cc_srcs_len)
+  list(LENGTH hip_cc_srcs hip_cc_srcs_len)
+  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+  list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
+  list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
+  list(LENGTH npu_cc_srcs npu_cc_srcs_len)
+  list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
 
-    # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op"
-    "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
+  # Define operators that don't need pybind here.
+  foreach(
+    manual_pybind_op
+    "compare_all_op"
+    "compare_op"
+    "logical_op"
+    "bitwise_op"
+    "nccl_op"
+    "tensor_array_read_write_op"
+    "tensorrt_engine_op"
+    "conv_fusion_op")
 
-        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
+    if("${TARGET}" STREQUAL "${manual_pybind_op}")
+      set(pybind_flag 1)
+    endif()
+  endforeach()
 
-    # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
-    # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
-    # And for detail pybind information, please see generated paddle/pybind/pybind.h.
-    set(ORIGINAL_TARGET ${TARGET})
-    string(REGEX REPLACE "_op" "" TARGET "${TARGET}")
+  # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
+  # Note that it's enough to add just one operator to pybind in a *_op.cc file.
+  # And for detail pybind information, please see generated paddle/pybind/pybind.h.
+  set(ORIGINAL_TARGET ${TARGET})
+  string(REGEX REPLACE "_op" "" TARGET "${TARGET}")
 
-    foreach(cc_src ${cc_srcs})
-        # pybind USE_OP_ITSELF
-        set(op_name "")
-        find_register(${cc_src} "REGISTER_OPERATOR" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-            # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
-            set(TARGET ${op_name})  
-            set(pybind_flag 1)
-        endif()
-        
-        set(op_name "")
-        find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-            # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
-            set(TARGET ${op_name})  
-            set(pybind_flag 1)
-        endif()        
+  foreach(cc_src ${cc_srcs})
+    # pybind USE_OP_ITSELF
+    set(op_name "")
+    find_register(${cc_src} "REGISTER_OPERATOR" op_name)
+    if(NOT ${op_name} EQUAL "")
+      file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
+      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+      set(TARGET ${op_name})
+      set(pybind_flag 1)
+    endif()
 
-        # pybind USE_OP_DEVICE_KERNEL for CPU
-        set(op_name "")
-        find_register(${cc_src} "REGISTER_OP_CPU_KERNEL" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CPU);\n")
-            # why change TARGET here?
-            # when building padle with on_infer, the REGISTER_OPERATOR(*_grad) will be removed before compiling (see details in remove_grad_op_and_kernel.py)
-            # in elementwise_op.cc, it will find REGISTER_OPERATOR(grad_add) and set TARGET to grad_add
-            # and, in the following "mkldnn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h
-            # however, grad_add has no mkldnn kernel. 
-            set(TARGET ${op_name})  
-            set(pybind_flag 1)
-        endif()
-    endforeach()
+    set(op_name "")
+    find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name)
+    if(NOT ${op_name} EQUAL "")
+      file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
+      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+      set(TARGET ${op_name})
+      set(pybind_flag 1)
+    endif()
 
-    # pybind USE_OP_DEVICE_KERNEL for CUDA
-    list (APPEND cu_srcs ${cu_cc_srcs})
-    # message("cu_srcs ${cu_srcs}")
-    foreach(cu_src ${cu_srcs})
-        set(op_name "")
-        find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
+    # pybind USE_OP_DEVICE_KERNEL for CPU
+    set(op_name "")
+    find_register(${cc_src} "REGISTER_OP_CPU_KERNEL" op_name)
+    if(NOT ${op_name} EQUAL "")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CPU);\n")
+      # why change TARGET here?
+      # when building paddle with on_infer, the REGISTER_OPERATOR(*_grad) will be removed before compiling (see details in remove_grad_op_and_kernel.py)
+      # in elementwise_op.cc, it will find REGISTER_OPERATOR(grad_add) and set TARGET to grad_add
+      # and, in the following "mkldnn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h
+      # however, grad_add has no mkldnn kernel.
+      set(TARGET ${op_name})
+      set(pybind_flag 1)
+    endif()
+  endforeach()
 
-    # pybind USE_OP_DEVICE_KERNEL for ROCm
-    list (APPEND hip_srcs ${hip_cc_srcs})
-    # message("hip_srcs ${hip_srcs}")
-    foreach(hip_src ${hip_srcs})
-        set(op_name "")
-        find_register(${hip_src} "REGISTER_OP_CUDA_KERNEL" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
+  # pybind USE_OP_DEVICE_KERNEL for CUDA
+  list(APPEND cu_srcs ${cu_cc_srcs})
+  # message("cu_srcs ${cu_srcs}")
+  foreach(cu_src ${cu_srcs})
+    set(op_name "")
+    find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
+    if(NOT ${op_name} EQUAL "")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
+      set(pybind_flag 1)
+    endif()
+  endforeach()
 
-    # pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN
-    list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs}) 
-    list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs}) 
-    list(APPEND cudnn_cu_srcs ${miopen_cu_srcs})   
-    list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) 
-    #message("cudnn_cu_srcs ${cudnn_cu_srcs}")
-    if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL "activation_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
-    else()
-    foreach(cudnn_src ${cudnn_cu_srcs})
-        set(op_name "")
-        find_register(${cudnn_src} "REGISTER_OP_KERNEL" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDNN);\n")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
+  # pybind USE_OP_DEVICE_KERNEL for ROCm
+  list(APPEND hip_srcs ${hip_cc_srcs})
+  # message("hip_srcs ${hip_srcs}")
+  foreach(hip_src ${hip_srcs})
+    set(op_name "")
+    find_register(${hip_src} "REGISTER_OP_CUDA_KERNEL" op_name)
+    if(NOT ${op_name} EQUAL "")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
+      set(pybind_flag 1)
     endif()
+  endforeach()
 
+  # pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN
+  list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs})
+  list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs})
+  list(APPEND cudnn_cu_srcs ${miopen_cu_srcs})
+  list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len)
+  #message("cudnn_cu_srcs ${cudnn_cu_srcs}")
+  if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL
+                                        "activation_op")
+    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
+  else()
+    foreach(cudnn_src ${cudnn_cu_srcs})
+      set(op_name "")
+      find_register(${cudnn_src} "REGISTER_OP_KERNEL" op_name)
+      if(NOT ${op_name} EQUAL "")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDNN);\n")
+        set(pybind_flag 1)
+      endif()
+    endforeach()
+  endif()
 
-    if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+  if(WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
     if(${ORIGINAL_TARGET} STREQUAL "activation_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, XPU);\n")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, XPU);\n")
     else()
-        foreach(xpu_src ${xpu_cc_srcs})
+      foreach(xpu_src ${xpu_cc_srcs})
         set(op_name "")
         find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL" op_name)
         if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n")
-            set(pybind_flag 1)
+          file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n")
+          set(pybind_flag 1)
         else()
-            find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL_FUNCTOR" op_name)
-            if(NOT ${op_name} EQUAL "")
-                file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n")
-                set(pybind_flag 1)
-            endif()
-        endif()
-        endforeach()
-    endif()
-    endif()
-
-    # pybind USE_OP_DEVICE_KERNEL for XPU KP
-    if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
-        foreach(xpu_kp_src ${xpu_kp_cc_srcs})
-        set(op_name "")
-        find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n")
-            message(STATUS "Building KP Target: ${op_name}")
+          find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL_FUNCTOR" op_name)
+          if(NOT ${op_name} EQUAL "")
+            file(APPEND ${pybind_file}
+                 "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n")
             set(pybind_flag 1)
+          endif()
         endif()
-        endforeach()
+      endforeach()
     endif()
+  endif()
 
-    # pybind USE_OP_DEVICE_KERNEL for NPU
-    if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
-        foreach(npu_src ${npu_cc_srcs})
-        set(op_name "")
-        find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
-            set(pybind_flag 1)
-        endif()
-        endforeach()
-    endif()
+  # pybind USE_OP_DEVICE_KERNEL for XPU KP
+  if(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
+    foreach(xpu_kp_src ${xpu_kp_cc_srcs})
+      set(op_name "")
+      find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name)
+      if(NOT ${op_name} EQUAL "")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n")
+        message(STATUS "Building KP Target: ${op_name}")
+        set(pybind_flag 1)
+      endif()
+    endforeach()
+  endif()
 
-    # pybind USE_OP_DEVICE_KERNEL for MLU
-    if (WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
-        foreach(mlu_src ${mlu_cc_srcs})
-        set(op_name "")
-        find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
-        if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
-            set(pybind_flag 1)
-        endif()
-        endforeach()
-    endif()
+  # pybind USE_OP_DEVICE_KERNEL for NPU
+  if(WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
+    foreach(npu_src ${npu_cc_srcs})
+      set(op_name "")
+      find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
+      if(NOT ${op_name} EQUAL "")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
+        set(pybind_flag 1)
+      endif()
+    endforeach()
+  endif()
 
-    # pybind USE_OP_DEVICE_KERNEL for MKLDNN
-    if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
-      # Append first implemented MKLDNN activation operator
-      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
-      elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
-      elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n")
-      elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n")
-      else()
-        foreach(mkldnn_src ${mkldnn_cc_srcs})
+  # pybind USE_OP_DEVICE_KERNEL for MLU
+  if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
+    foreach(mlu_src ${mlu_cc_srcs})
+      set(op_name "")
+      find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
+      if(NOT ${op_name} EQUAL "")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
+        set(pybind_flag 1)
+      endif()
+    endforeach()
+  endif()
+
+  # pybind USE_OP_DEVICE_KERNEL for MKLDNN
+  if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+    # Append first implemented MKLDNN activation operator
+    if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
+      file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+    elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
+    elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n")
+    elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n")
+      file(APPEND ${pybind_file}
+           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n")
+    else()
+      foreach(mkldnn_src ${mkldnn_cc_srcs})
         set(op_name "")
         find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name)
         if(NOT ${op_name} EQUAL "")
-            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
-            set(pybind_flag 1)
+          file(APPEND ${pybind_file}
+               "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
+          set(pybind_flag 1)
         endif()
-        endforeach()        
-      endif()
+      endforeach()
     endif()
+  endif()
 
-    # pybind USE_NO_KERNEL_OP
-    # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
-    string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
-    string(REPLACE "_op" "" TARGET "${TARGET}")
-    if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
-        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
-        set(pybind_flag 1)
-    endif()
+  # pybind USE_NO_KERNEL_OP
+  # HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
+  string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
+  string(REPLACE "_op" "" TARGET "${TARGET}")
+  if(${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
+    file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+    set(pybind_flag 1)
+  endif()
 
-    # pybind USE_OP
-    if (${pybind_flag} EQUAL 0)
-      # NOTE(*): activation use macro to regist the kernels, set use_op manually.
-      if(${TARGET} STREQUAL "activation")
-        file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n")
-      elseif(${TARGET} STREQUAL "fake_dequantize")
-        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
-      elseif(${TARGET} STREQUAL "fake_quantize")
-        file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
-      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
-          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
-      else()
-        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
-      endif()
+  # pybind USE_OP
+  if(${pybind_flag} EQUAL 0)
+    # NOTE(*): activation uses a macro to register the kernels, so set use_op manually.
+    if(${TARGET} STREQUAL "activation")
+      file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n")
+    elseif(${TARGET} STREQUAL "fake_dequantize")
+      file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+    elseif(${TARGET} STREQUAL "fake_quantize")
+      file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
+    elseif(${TARGET} STREQUAL "tensorrt_engine_op")
+      message(
+        STATUS
+          "Pybind skips [tensorrt_engine_op], for this OP is only used in inference"
+      )
+    else()
+      file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
     endif()
+  endif()
 endfunction()
 
 function(register_operators)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs EXCLUDES DEPS)
-    cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-    file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
-    string(REPLACE "_mkldnn" "" OPS "${OPS}")
-    string(REPLACE "_xpu" "" OPS "${OPS}")
-    string(REPLACE "_npu" "" OPS "${OPS}")
-    string(REPLACE "_mlu" "" OPS "${OPS}")
-    string(REPLACE ".cc" "" OPS "${OPS}")
-    list(REMOVE_DUPLICATES OPS)
-    list(LENGTH register_operators_DEPS register_operators_DEPS_len)
+  set(options "") # register_operators(): calls op_library(<name> UNITY ...) for each *_op.cc found here; no flag options
+  set(oneValueArgs "") # no single-value keywords
+  set(multiValueArgs EXCLUDES DEPS) # parsed into register_operators_EXCLUDES / register_operators_DEPS
+  cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  file(
+    GLOB OPS
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "*_op.cc") # files matching *_op.cc, as paths relative to the current source dir
+  string(REPLACE "_mkldnn" "" OPS "${OPS}") # remove every "_mkldnn" substring from the list
+  string(REPLACE "_xpu" "" OPS "${OPS}") # likewise "_xpu"
+  string(REPLACE "_npu" "" OPS "${OPS}") # likewise "_npu"
+  string(REPLACE "_mlu" "" OPS "${OPS}") # likewise "_mlu"
+  string(REPLACE ".cc" "" OPS "${OPS}") # remove the ".cc" extension, leaving bare names
+  list(REMOVE_DUPLICATES OPS) # the replacements above can map several files to the same name
+  list(LENGTH register_operators_DEPS register_operators_DEPS_len)
 
-    foreach(src ${OPS})
-        list(FIND register_operators_EXCLUDES ${src} _index)
-        if (${_index} EQUAL -1)
-            if (${register_operators_DEPS_len} GREATER 0)
-                op_library(${src} UNITY DEPS ${register_operators_DEPS})
-            else()
-                op_library(${src} UNITY)
-            endif()
-        endif()
-    endforeach()
+  foreach(src ${OPS})
+    list(FIND register_operators_EXCLUDES ${src} _index)
+    if(${_index} EQUAL -1) # list(FIND) yields -1 when ${src} is not in EXCLUDES
+      if(${register_operators_DEPS_len} GREATER 0)
+        op_library(${src} UNITY DEPS ${register_operators_DEPS})
+      else()
+        op_library(${src} UNITY) # no DEPS were given, so the DEPS keyword is omitted
+      endif()
+    endif()
+  endforeach()
 
-    # Complete the processing of `UNITY_TARGET`.
-    if(WITH_UNITY_BUILD)
-        finish_unity_target(cc)
-        if(WITH_GPU)
-            finish_unity_target(cu)
-        endif()
+  # Complete the processing of `UNITY_TARGET`.
+  if(WITH_UNITY_BUILD)
+    finish_unity_target(cc) # helper defined outside this function
+    if(WITH_GPU)
+      finish_unity_target(cu) # the cu pass runs only when WITH_GPU is set
     endif()
+  endif()
 endfunction()
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
index f147ef3a586ed..4555d892f11ce 100644
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -13,366 +13,485 @@
 # limitations under the License.
 
 function(generate_unify_header DIR_NAME)
-    set(options "")
-    set(oneValueArgs HEADER_NAME SKIP_SUFFIX)
-    set(multiValueArgs "")
-    cmake_parse_arguments(generate_unify_header "${options}" "${oneValueArgs}"
-        "${multiValueArgs}" ${ARGN})
+  set(options "")
+  set(oneValueArgs HEADER_NAME SKIP_SUFFIX)
+  set(multiValueArgs "")
+  cmake_parse_arguments(generate_unify_header "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    # get header name and suffix
-    set(header_name "${DIR_NAME}")
-    list(LENGTH generate_unify_header_HEADER_NAME generate_unify_header_HEADER_NAME_len)
-    if(${generate_unify_header_HEADER_NAME_len} GREATER 0)
-        set(header_name "${generate_unify_header_HEADER_NAME}")
-    endif()
-    set(skip_suffix "")
-    list(LENGTH generate_unify_header_SKIP_SUFFIX generate_unify_header_SKIP_SUFFIX_len)
-    if(${generate_unify_header_SKIP_SUFFIX_len} GREATER 0)
-        set(skip_suffix "${generate_unify_header_SKIP_SUFFIX}")
-    endif()
+  # get header name and suffix
+  set(header_name "${DIR_NAME}")
+  list(LENGTH generate_unify_header_HEADER_NAME
+       generate_unify_header_HEADER_NAME_len)
+  if(${generate_unify_header_HEADER_NAME_len} GREATER 0)
+    set(header_name "${generate_unify_header_HEADER_NAME}")
+  endif()
+  set(skip_suffix "")
+  list(LENGTH generate_unify_header_SKIP_SUFFIX
+       generate_unify_header_SKIP_SUFFIX_len)
+  if(${generate_unify_header_SKIP_SUFFIX_len} GREATER 0)
+    set(skip_suffix "${generate_unify_header_SKIP_SUFFIX}")
+  endif()
 
-    # generate target header file
-    set(header_file ${CMAKE_CURRENT_SOURCE_DIR}/include/${header_name}.h)
-    file(WRITE ${header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n")
+  # generate target header file
+  set(header_file ${CMAKE_CURRENT_SOURCE_DIR}/include/${header_name}.h)
+  file(
+    WRITE ${header_file}
+    "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n"
+  )
 
-    # get all top-level headers and write into header file
-    file(GLOB HEADERS "${CMAKE_CURRENT_SOURCE_DIR}\/${DIR_NAME}\/*.h")
-    foreach(header ${HEADERS})
-        if("${skip_suffix}" STREQUAL "")
-            string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}")
-            file(APPEND ${header_file} "#include \"${header}\"\n")
-        else()
-            string(FIND "${header}" "${skip_suffix}.h" skip_suffix_found)
-            if(${skip_suffix_found} EQUAL -1)
-                string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}")
-                file(APPEND ${header_file} "#include \"${header}\"\n")
-            endif()
-        endif()
-    endforeach()
-    # append header into extension.h
-    string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}")
-    file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n")
+  # get all top-level headers and write into header file
+  file(GLOB HEADERS "${CMAKE_CURRENT_SOURCE_DIR}\/${DIR_NAME}\/*.h")
+  foreach(header ${HEADERS})
+    if("${skip_suffix}" STREQUAL "")
+      string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}")
+      file(APPEND ${header_file} "#include \"${header}\"\n")
+    else()
+      string(FIND "${header}" "${skip_suffix}.h" skip_suffix_found)
+      if(${skip_suffix_found} EQUAL -1)
+        string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header "${header}")
+        file(APPEND ${header_file} "#include \"${header}\"\n")
+      endif()
+    endif()
+  endforeach()
+  # append header into extension.h
+  string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}")
+  file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n")
 endfunction()
 
 # call kernel_declare need to make sure whether the target of input exists
 function(kernel_declare TARGET_LIST)
-    foreach(kernel_path ${TARGET_LIST})
-        file(READ ${kernel_path} kernel_impl)
-        string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*" first_registry "${kernel_impl}")
-        if (NOT first_registry STREQUAL "")
-            # some gpu kernel only can run on cuda, not support rocm, so we add this branch
-            if (WITH_ROCM)
-                string(FIND "${first_registry}" "cuda_only" pos)
-                if(pos GREATER 1)
-                    continue()
-                endif()
-            endif()
-            # parse the first kernel name
-            string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
-            string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}")
-            string(REPLACE "," "" kernel_name "${kernel_name}")
-            string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
-            string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}")
-            # append kernel declare into declarations.h
-            # TODO(chenweihang): default declare ALL_LAYOUT for each kernel
-            if (${kernel_path} MATCHES "./cpu\/")
-                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
-            elseif (${kernel_path} MATCHES "./gpu\/")
-                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
-            elseif (${kernel_path} MATCHES "./xpu\/")
-                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
-            elseif (${kernel_path} MATCHES "./gpudnn\/")
-                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
-            elseif (${kernel_path} MATCHES "./kps\/")
-                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
-            else ()
-                # deal with device independent kernel, now we use CPU temporaary
-                file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
-            endif()
+  foreach(kernel_path ${TARGET_LIST})
+    file(READ ${kernel_path} kernel_impl)
+    string(
+      REGEX
+        MATCH
+        "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*"
+        first_registry
+        "${kernel_impl}")
+    if(NOT first_registry STREQUAL "")
+      # some gpu kernels can only run on cuda and do not support rocm, so we add this branch
+      if(WITH_ROCM)
+        string(FIND "${first_registry}" "cuda_only" pos)
+        if(pos GREATER 1)
+          continue()
         endif()
-    endforeach()
+      endif()
+      # parse the first kernel name
+      string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
+      string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name
+                     "${kernel_name}")
+      string(REPLACE "," "" kernel_name "${kernel_name}")
+      string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
+      string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}")
+      # append kernel declare into declarations.h
+      # TODO(chenweihang): default declare ALL_LAYOUT for each kernel
+      if(${kernel_path} MATCHES "./cpu\/")
+        file(APPEND ${kernel_declare_file}
+             "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
+      elseif(${kernel_path} MATCHES "./gpu\/")
+        file(APPEND ${kernel_declare_file}
+             "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
+      elseif(${kernel_path} MATCHES "./xpu\/")
+        file(APPEND ${kernel_declare_file}
+             "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
+      elseif(${kernel_path} MATCHES "./gpudnn\/")
+        file(APPEND ${kernel_declare_file}
+             "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
+      elseif(${kernel_path} MATCHES "./kps\/")
+        file(APPEND ${kernel_declare_file}
+             "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
+      else()
+        # deal with device independent kernel, now we use CPU temporarily
+        file(APPEND ${kernel_declare_file}
+             "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
+      endif()
+    endif()
+  endforeach()
 endfunction()
 
 function(kernel_library TARGET)
-    set(common_srcs)
-    set(cpu_srcs)
-    set(gpu_srcs)
-    set(xpu_srcs)
-    set(gpudnn_srcs)
-    set(kps_srcs)
-    # parse and save the deps kerenl targets
-    set(all_srcs)
-    set(kernel_deps)
+  set(common_srcs)
+  set(cpu_srcs)
+  set(gpu_srcs)
+  set(xpu_srcs)
+  set(gpudnn_srcs)
+  set(kps_srcs)
+  # parse and save the deps kernel targets
+  set(all_srcs)
+  set(kernel_deps)
 
-    set(oneValueArgs SUB_DIR)
-    set(multiValueArgs SRCS DEPS)
-    set(target_build_flag 1)
+  set(oneValueArgs SUB_DIR)
+  set(multiValueArgs SRCS DEPS)
+  set(target_build_flag 1)
 
-    cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
-        "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    # used for cc_library selected_rows dir target
-    set(target_suffix "")
-    if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows")
-        set(target_suffix "_sr")
+  # used for cc_library selected_rows dir target
+  set(target_suffix "")
+  if("${kernel_library_SUB_DIR}" STREQUAL "selected_rows")
+    set(target_suffix "_sr")
+  endif()
+  if("${kernel_library_SUB_DIR}" STREQUAL "sparse")
+    set(target_suffix "_sp")
+  endif()
+
+  list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
+  # one kernel only matches one impl file in each backend
+  if(${kernel_library_SRCS_len} EQUAL 0)
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+      list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
     endif()
-    if ("${kernel_library_SUB_DIR}" STREQUAL "sparse")
-        set(target_suffix "_sp")
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+      list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
     endif()
-
-    list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
-    # one kernel only match one impl file in each backend
-    if (${kernel_library_SRCS_len} EQUAL 0)
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-            list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-        endif()
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
-            list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
-        endif()
-        if (WITH_GPU OR WITH_ROCM)
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
-                list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
-                list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
-                list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
-            endif()
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
-                list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
-            endif()
-        endif()
-        if (WITH_XPU)
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
-                list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
-            endif()
-        endif()
-        if (WITH_XPU_KP)
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
-                # Change XPU2 file suffix
-                # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu
-                file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps)
-                file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
-                list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
-            endif()
-        endif()
-    else()
-        # TODO(chenweihang): impl compile by source later
+    if(WITH_GPU OR WITH_ROCM)
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
+        list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
+        list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+        list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+      endif()
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
+        list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
+      endif()
     endif()
-
-    list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h)
-    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
-        list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
+    if(WITH_XPU)
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
+        list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
+      endif()
     endif()
-    list(APPEND all_srcs ${common_srcs})
-    list(APPEND all_srcs ${cpu_srcs})
-    list(APPEND all_srcs ${gpu_srcs})
-    list(APPEND all_srcs ${xpu_srcs})
-    list(APPEND all_srcs ${gpudnn_srcs})
-    list(APPEND all_srcs ${kps_srcs})
+    if(WITH_XPU_KP)
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+        # Change XPU2 file suffix
+        # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu
+        file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu
+             DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps)
+        file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu
+             ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+        list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+      endif()
+    endif()
+  else()
+    # TODO(chenweihang): impl compile by source later
+  endif()
 
-    set(all_include_kernels)
-    set(all_kernel_name)
+  list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h)
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
+    list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
+  endif()
+  list(APPEND all_srcs ${common_srcs})
+  list(APPEND all_srcs ${cpu_srcs})
+  list(APPEND all_srcs ${gpu_srcs})
+  list(APPEND all_srcs ${xpu_srcs})
+  list(APPEND all_srcs ${gpudnn_srcs})
+  list(APPEND all_srcs ${kps_srcs})
 
-    foreach(src ${all_srcs})
-        file(READ ${src} target_content)
-        # "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel)
-        string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
-        list(APPEND all_include_kernels ${include_kernels})
+  set(all_include_kernels)
+  set(all_kernel_name)
 
-        # "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx"
-        if (NOT "${kernel_library_SUB_DIR}" STREQUAL "")
-            string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
-            list(APPEND all_include_kernels ${include_kernels})
-        endif()
+  foreach(src ${all_srcs})
+    file(READ ${src} target_content)
+    # "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel)
+    string(REGEX MATCHALL
+                 "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\""
+                 include_kernels ${target_content})
+    list(APPEND all_include_kernels ${include_kernels})
 
-        foreach(include_kernel ${all_include_kernels})
-            if ("${kernel_library_SUB_DIR}" STREQUAL "")
-                string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
-                string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
-                list(APPEND all_kernel_name ${kernel_name})
-            else()
-                # NOTE(dev): we should firstly match kernel_library_SUB_DIR.
-                if (${include_kernel} MATCHES "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/")
-                    string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
-                    # for selected_rows directory, add ${target_suffix}.
-                    string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name ${kernel_name})
-                    list(APPEND all_kernel_name ${kernel_name})
-                else()
-                    string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
-                    string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
-                    list(APPEND all_kernel_name ${kernel_name})
-                endif()
-            endif()
-            list(APPEND kernel_deps ${all_kernel_name})
-        endforeach()
+    # "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx"
+    if(NOT "${kernel_library_SUB_DIR}" STREQUAL "")
+      string(
+        REGEX
+          MATCHALL
+          "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\""
+          include_kernels
+          ${target_content})
+      list(APPEND all_include_kernels ${include_kernels})
+    endif()
+
+    foreach(include_kernel ${all_include_kernels})
+      if("${kernel_library_SUB_DIR}" STREQUAL "")
+        string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name
+                             ${include_kernel})
+        string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+        list(APPEND all_kernel_name ${kernel_name})
+      else()
+        # NOTE(dev): we should match kernel_library_SUB_DIR first.
+        if(${include_kernel} MATCHES
+           "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/")
+          string(
+            REGEX
+            REPLACE
+              "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" ""
+              kernel_name ${include_kernel})
+          # for selected_rows directory, add ${target_suffix}.
+          string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name
+                               ${kernel_name})
+          list(APPEND all_kernel_name ${kernel_name})
+        else()
+          string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" ""
+                               kernel_name ${include_kernel})
+          string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
+          list(APPEND all_kernel_name ${kernel_name})
+        endif()
+      endif()
+      list(APPEND kernel_deps ${all_kernel_name})
     endforeach()
-    list(REMOVE_DUPLICATES kernel_deps)
-    list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix})
+  endforeach()
+  list(REMOVE_DUPLICATES kernel_deps)
+  list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix})
 
-    list(LENGTH common_srcs common_srcs_len)
-    list(LENGTH cpu_srcs cpu_srcs_len)
-    list(LENGTH gpu_srcs gpu_srcs_len)
-    list(LENGTH xpu_srcs xpu_srcs_len)
-    list(LENGTH gpudnn_srcs gpudnn_srcs_len)
-    list(LENGTH kps_srcs kps_srcs_len)
+  list(LENGTH common_srcs common_srcs_len)
+  list(LENGTH cpu_srcs cpu_srcs_len)
+  list(LENGTH gpu_srcs gpu_srcs_len)
+  list(LENGTH xpu_srcs xpu_srcs_len)
+  list(LENGTH gpudnn_srcs gpudnn_srcs_len)
+  list(LENGTH kps_srcs kps_srcs_len)
 
-    # kernel source file level
-    # level 1: base device kernel (if any device or dnn kernel exists, the cpu_kernel must be exists!!!)
-    # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
-    # = dnn srcs: gpudnn_srcs
-    # level 2: device-independent kernel
-    # - common_srcs
+  # kernel source file level
+  # level 1: base device kernel (if any device or dnn kernel exists, the cpu_kernel must exist!!!)
+  # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
+  # = dnn srcs: gpudnn_srcs
+  # level 2: device-independent kernel
+  # - common_srcs
 
-    set(partial_build_flag 0)
-    set(base_build_flag 0)
-    if (${common_srcs_len} GREATER 0)
-        set(partial_build_flag 1)
-    endif()
-    if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
-        set(base_build_flag 1)
-    endif()
+  set(partial_build_flag 0)
+  set(base_build_flag 0)
+  if(${common_srcs_len} GREATER 0)
+    set(partial_build_flag 1)
+  endif()
+  if(${cpu_srcs_len} GREATER 0
+     OR ${gpu_srcs_len} GREATER 0
+     OR ${xpu_srcs_len} GREATER 0
+     OR ${kps_srcs_len} GREATER 0)
+    set(base_build_flag 1)
+  endif()
 
-    # gpudnn or mkldnn needs to be compiled separately
-    set(dnn_kernels)
-    if (${gpudnn_srcs_len} GREATER 0)
-        if (WITH_GPU)
-            nv_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        elseif (WITH_ROCM)
-            hip_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        endif()
-        list(APPEND dnn_kernels ${TARGET}_gpudnn${target_suffix})
+  # gpudnn or mkldnn needs to be compiled separately
+  set(dnn_kernels)
+  if(${gpudnn_srcs_len} GREATER 0)
+    if(WITH_GPU)
+      nv_library(
+        ${TARGET}_gpudnn${target_suffix}
+        SRCS ${gpudnn_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+    elseif(WITH_ROCM)
+      hip_library(
+        ${TARGET}_gpudnn${target_suffix}
+        SRCS ${gpudnn_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
     endif()
-    list(LENGTH dnn_kernels dnn_kernels_len)
+    list(APPEND dnn_kernels ${TARGET}_gpudnn${target_suffix})
+  endif()
+  list(LENGTH dnn_kernels dnn_kernels_len)
 
-    if (${partial_build_flag} EQUAL 0 AND ${base_build_flag} EQUAL 1)
-        if (WITH_GPU)
-            if (${dnn_kernels_len} GREATER 0)
-                nv_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                nv_library(${TARGET}${target_suffix} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels})
-            else()
-                nv_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
-        elseif (WITH_ROCM)
-            if (${dnn_kernels_len} GREATER 0)
-                hip_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                hip_library(${TARGET}${target_suffix} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels})
-            else()
-                hip_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
-        elseif (WITH_XPU_KP)
-            xpu_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        else()
-            cc_library(${TARGET}${target_suffix} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        endif()
-    elseif (${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 1)
-        if (WITH_GPU)
-            nv_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            nv_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels})
-        elseif (WITH_ROCM)
-            hip_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            hip_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix} ${dnn_kernels})
-        elseif (WITH_XPU_KP)
-            xpu_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            xpu_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix})
-        else()
-            cc_library(${TARGET}_base${target_suffix} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            cc_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${TARGET}_base${target_suffix})
-        endif()
-    elseif (${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 0)
-        if (WITH_GPU)
-            nv_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        elseif (WITH_ROCM)
-            hip_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        elseif (WITH_XPU_KP)
-            xpu_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        else()
-            cc_library(${TARGET}${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-        endif()
+  if(${partial_build_flag} EQUAL 0 AND ${base_build_flag} EQUAL 1)
+    if(WITH_GPU)
+      if(${dnn_kernels_len} GREATER 0)
+        nv_library(
+          ${TARGET}_base${target_suffix}
+          SRCS ${cpu_srcs} ${gpu_srcs}
+          DEPS ${kernel_library_DEPS} ${kernel_deps})
+        nv_library(${TARGET}${target_suffix} DEPS ${TARGET}_base${target_suffix}
+                                                  ${dnn_kernels})
+      else()
+        nv_library(
+          ${TARGET}${target_suffix}
+          SRCS ${cpu_srcs} ${gpu_srcs}
+          DEPS ${kernel_library_DEPS} ${kernel_deps})
+      endif()
+    elseif(WITH_ROCM)
+      if(${dnn_kernels_len} GREATER 0)
+        hip_library(
+          ${TARGET}_base${target_suffix}
+          SRCS ${cpu_srcs} ${gpu_srcs}
+          DEPS ${kernel_library_DEPS} ${kernel_deps})
+        hip_library(${TARGET}${target_suffix}
+                    DEPS ${TARGET}_base${target_suffix} ${dnn_kernels})
+      else()
+        hip_library(
+          ${TARGET}${target_suffix}
+          SRCS ${cpu_srcs} ${gpu_srcs}
+          DEPS ${kernel_library_DEPS} ${kernel_deps})
+      endif()
+    elseif(WITH_XPU_KP)
+      xpu_library(
+        ${TARGET}${target_suffix}
+        SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+    else()
+      cc_library(
+        ${TARGET}${target_suffix}
+        SRCS ${cpu_srcs} ${xpu_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+    endif()
+  elseif(${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 1)
+    if(WITH_GPU)
+      nv_library(
+        ${TARGET}_base${target_suffix}
+        SRCS ${cpu_srcs} ${gpu_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+      nv_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${TARGET}_base${target_suffix} ${dnn_kernels})
+    elseif(WITH_ROCM)
+      hip_library(
+        ${TARGET}_base${target_suffix}
+        SRCS ${cpu_srcs} ${gpu_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+      hip_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${TARGET}_base${target_suffix} ${dnn_kernels})
+    elseif(WITH_XPU_KP)
+      xpu_library(
+        ${TARGET}_base${target_suffix}
+        SRCS ${cpu_srcs} ${kps_srcs} ${xpu_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+      xpu_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${TARGET}_base${target_suffix})
+    else()
+      cc_library(
+        ${TARGET}_base${target_suffix}
+        SRCS ${cpu_srcs} ${xpu_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+      cc_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${TARGET}_base${target_suffix})
+    endif()
+  elseif(${partial_build_flag} EQUAL 1 AND ${base_build_flag} EQUAL 0)
+    if(WITH_GPU)
+      nv_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+    elseif(WITH_ROCM)
+      hip_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
+    elseif(WITH_XPU_KP)
+      xpu_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
     else()
-        set(target_build_flag 0)
+      cc_library(
+        ${TARGET}${target_suffix}
+        SRCS ${common_srcs}
+        DEPS ${kernel_library_DEPS} ${kernel_deps})
     endif()
+  else()
+    set(target_build_flag 0)
+  endif()
 
-    if (${target_build_flag} EQUAL 1)
-        if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
-            ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
-            ${gpudnn_srcs_len} GREATER 0)
-            # append target into PHI_KERNELS property
-            get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
-            set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix})
-            set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels})
-        endif()
+  if(${target_build_flag} EQUAL 1)
+    if(${common_srcs_len} GREATER 0
+       OR ${cpu_srcs_len} GREATER 0
+       OR ${gpu_srcs_len} GREATER 0
+       OR ${xpu_srcs_len} GREATER 0
+       OR ${kps_srcs_len} GREATER 0
+       OR ${gpudnn_srcs_len} GREATER 0)
+      # append target into PHI_KERNELS property
+      get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
+      set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix})
+      set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels})
+    endif()
 
-        # parse kernel name and auto generate kernel declaration
-        # here, we don't need to check WITH_XXX, because if not WITH_XXX, the
-        # xxx_srcs_len will be equal to 0
-        if (${common_srcs_len} GREATER 0)
-            kernel_declare(${common_srcs})
-        endif()
-        if (${cpu_srcs_len} GREATER 0)
-            kernel_declare(${cpu_srcs})
-        endif()
-        if (${gpu_srcs_len} GREATER 0)
-            kernel_declare(${gpu_srcs})
-        endif()
-        if (${xpu_srcs_len} GREATER 0)
-            kernel_declare(${xpu_srcs})
-        endif()
-        if (${gpudnn_srcs_len} GREATER 0)
-            kernel_declare(${gpudnn_srcs})
-        endif()
-        if (${kps_srcs_len} GREATER 0)
-            kernel_declare(${kps_srcs})
-        endif()
+    # parse kernel name and auto generate kernel declaration
+    # here, we don't need to check WITH_XXX, because if not WITH_XXX, the
+    # xxx_srcs_len will be equal to 0
+    if(${common_srcs_len} GREATER 0)
+      kernel_declare(${common_srcs})
+    endif()
+    if(${cpu_srcs_len} GREATER 0)
+      kernel_declare(${cpu_srcs})
+    endif()
+    if(${gpu_srcs_len} GREATER 0)
+      kernel_declare(${gpu_srcs})
+    endif()
+    if(${xpu_srcs_len} GREATER 0)
+      kernel_declare(${xpu_srcs})
     endif()
+    if(${gpudnn_srcs_len} GREATER 0)
+      kernel_declare(${gpudnn_srcs})
+    endif()
+    if(${kps_srcs_len} GREATER 0)
+      kernel_declare(${kps_srcs})
+    endif()
+  endif()
 endfunction()
 
 function(register_kernels)
-    set(options "")
-    set(oneValueArgs SUB_DIR)
-    set(multiValueArgs EXCLUDES DEPS)
-    cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}"
-        "${multiValueArgs}" ${ARGN})
+  set(options "")
+  set(oneValueArgs SUB_DIR)
+  set(multiValueArgs EXCLUDES DEPS)
+  cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    file(GLOB KERNELS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_kernel.h")
-    string(REPLACE ".h" "" KERNELS "${KERNELS}")
-    list(LENGTH register_kernels_DEPS register_kernels_DEPS_len)
+  file(
+    GLOB KERNELS
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "*_kernel.h")
+  string(REPLACE ".h" "" KERNELS "${KERNELS}")
+  list(LENGTH register_kernels_DEPS register_kernels_DEPS_len)
 
-    foreach(target ${KERNELS})
-        list(FIND register_kernels_EXCLUDES ${target} _index)
-        if (${_index} EQUAL -1)
-            if (${register_kernels_DEPS_len} GREATER 0)
-                kernel_library(${target} DEPS ${register_kernels_DEPS} SUB_DIR ${register_kernels_SUB_DIR})
-            else()
-                kernel_library(${target} SUB_DIR ${register_kernels_SUB_DIR})
-            endif()
-        endif()
-    endforeach()
+  foreach(target ${KERNELS})
+    list(FIND register_kernels_EXCLUDES ${target} _index)
+    if(${_index} EQUAL -1)
+      if(${register_kernels_DEPS_len} GREATER 0)
+        kernel_library(${target} DEPS ${register_kernels_DEPS} SUB_DIR
+                       ${register_kernels_SUB_DIR})
+      else()
+        kernel_library(${target} SUB_DIR ${register_kernels_SUB_DIR})
+      endif()
+    endif()
+  endforeach()
 endfunction()
 
 function(append_op_util_declare TARGET)
-    file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content)
-    string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
-    string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}")
-    string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}")
-    string(APPEND util_declare ");\n")
-    file(APPEND ${op_utils_header} "${util_declare}")
+  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content)
+  string(
+    REGEX
+      MATCH
+      "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*"
+      util_registrar
+      "${target_content}")
+  string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN"
+                 util_declare "${util_registrar}")
+  string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME"
+                 util_declare "${util_declare}")
+  string(APPEND util_declare ");\n")
+  file(APPEND ${op_utils_header} "${util_declare}")
 endfunction()
 
 function(register_op_utils TARGET_NAME)
-    set(utils_srcs)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs EXCLUDES DEPS)
-    cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}"
-        "${multiValueArgs}" ${ARGN})
+  set(utils_srcs)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs EXCLUDES DEPS)
+  cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    file(GLOB SIGNATURES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_sig.cc")
-    foreach(target ${SIGNATURES})
-        append_op_util_declare(${target})
-        list(APPEND utils_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${target})
-    endforeach()
+  file(
+    GLOB SIGNATURES
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "*_sig.cc")
+  foreach(target ${SIGNATURES})
+    append_op_util_declare(${target})
+    list(APPEND utils_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${target})
+  endforeach()
 
-    cc_library(${TARGET_NAME} SRCS ${utils_srcs} DEPS ${register_op_utils_DEPS})
+  cc_library(
+    ${TARGET_NAME}
+    SRCS ${utils_srcs}
+    DEPS ${register_op_utils_DEPS})
 endfunction()
diff --git a/cmake/phi_header.cmake b/cmake/phi_header.cmake
index b23b4086b18f2..fa5b6724ce89a 100644
--- a/cmake/phi_header.cmake
+++ b/cmake/phi_header.cmake
@@ -12,32 +12,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir")
+set(PADDLE_INFERENCE_INSTALL_DIR
+    "${CMAKE_BINARY_DIR}/paddle_inference_install_dir")
 
+# Rewrite "paddle/phi/" and "paddle/utils/" inside the .h files globbed for
+# TARGET_PATH to the "paddle/include/experimental/..." prefixes. Nothing is
+# done unless "experimental" occurs in TARGET_PATH at index 2 or later.
 function(phi_header_path_compat TARGET_PATH)
-message(STATUS "phi header path compat processing: ${TARGET_PATH}")
-string(FIND ${TARGET_PATH} "experimental" pos)
-if (pos GREATER 1)
+  message(STATUS "phi header path compat processing: ${TARGET_PATH}")
+  string(FIND "${TARGET_PATH}" "experimental" pos)
+  if(pos GREATER 1)
     file(GLOB HEADERS "${TARGET_PATH}/*" "*.h")
     foreach(header ${HEADERS})
-        if (${header} MATCHES ".*.h$")
-            file(READ ${header} HEADER_CONTENT)
-            string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" HEADER_CONTENT "${HEADER_CONTENT}")
-            string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" HEADER_CONTENT "${HEADER_CONTENT}")
-            file(WRITE ${header} "${HEADER_CONTENT}")
-            message(STATUS "phi header path compat processing complete: ${header}")
-        endif()
+      # Escape the dot so that only names really ending in ".h" are rewritten.
+      if(${header} MATCHES "\\.h$")
+        file(READ ${header} HEADER_CONTENT)
+        string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/"
+                       HEADER_CONTENT "${HEADER_CONTENT}")
+        string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/"
+                       HEADER_CONTENT "${HEADER_CONTENT}")
+        file(WRITE ${header} "${HEADER_CONTENT}")
+        message(STATUS "phi header path compat processing complete: ${header}")
+      endif()
     endforeach()
-endif()
+  endif()
 endfunction()
 
-phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental)
-phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api)
-phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext)
-phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include)
-phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common)
-phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core)
+phi_header_path_compat(
+  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental)
+phi_header_path_compat(
+  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api)
+phi_header_path_compat(
+  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext)
+phi_header_path_compat(
+  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include)
+phi_header_path_compat(
+  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common)
+phi_header_path_compat(
+  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core)
 
 # In order to be compatible with the original behavior, the header file name needs to be changed
-file(RENAME ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h
-            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/ext_all.h)
+file(RENAME
+     ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h
+     ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/ext_all.h)
diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake
index 1412b7f7f2060..9367435b61b55 100644
--- a/cmake/python_module.cmake
+++ b/cmake/python_module.cmake
@@ -2,42 +2,53 @@
 # Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html
 # To use do: find_python_module(PyQt4 REQUIRED)
 function(find_python_module module)
-    string(TOUPPER ${module} module_upper)
-    if(NOT PY_${module_upper})
-        if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
-            set(${module}_FIND_REQUIRED TRUE)
-        else()
-            set(${module}_FIND_REQUIRED FALSE)
-        endif()
-        # A module's location is usually a directory, but for binary modules
-        # it's a .so file.
-        execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-            "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))"
-            RESULT_VARIABLE _${module}_status
-            OUTPUT_VARIABLE _${module}_location
-            ERROR_QUIET
-            OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if(NOT _${module}_status)
-            set(PY_${module_upper} ${_${module}_location} CACHE STRING
-                "Location of Python module ${module}")
-        endif(NOT _${module}_status)
-    endif(NOT PY_${module_upper})
-    find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper})
-    if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
-        message(FATAL_ERROR "python module ${module} is not found")
+  string(TOUPPER ${module} module_upper)
+  # Skip the lookup when PY_<MODULE> already holds a true value.
+  if(NOT PY_${module_upper})
+    if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
+      set(${module}_FIND_REQUIRED TRUE)
+    else()
+      set(${module}_FIND_REQUIRED FALSE)
     endif()
-
-    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
-        "import sys, ${module}; sys.stdout.write(${module}.__version__)"
-        OUTPUT_VARIABLE _${module}_version
-        RESULT_VARIABLE _${module}_status
-        ERROR_QUIET
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # A module's location is usually a directory, but for binary modules
+    # it's a .so file.
+    execute_process(
+      COMMAND
+        "${PYTHON_EXECUTABLE}" "-c"
+        "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))"
+      RESULT_VARIABLE _${module}_status
+      OUTPUT_VARIABLE _${module}_location
+      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # The status is 0 (false) when the Python command exited successfully.
     if(NOT _${module}_status)
-        set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
-            "Version of Python module ${module}")
+      set(PY_${module_upper}
+          ${_${module}_location}
+          CACHE STRING "Location of Python module ${module}")
     endif(NOT _${module}_status)
+  endif(NOT PY_${module_upper})
+  find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper})
+  if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
+    message(FATAL_ERROR "python module ${module} is not found")
+  endif()
+
+  # Ask the interpreter for the module's __version__ attribute.
+  execute_process(
+    COMMAND "${PYTHON_EXECUTABLE}" "-c"
+            "import sys, ${module}; sys.stdout.write(${module}.__version__)"
+    OUTPUT_VARIABLE _${module}_version
+    RESULT_VARIABLE _${module}_status
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(NOT _${module}_status)
+    set(PY_${module_upper}_VERSION
+        ${_${module}_version}
+        CACHE STRING "Version of Python module ${module}")
+  endif(NOT _${module}_status)
 
-    set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
-    set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
+  # Export the found flag and the version to the caller's scope.
+  set(PY_${module_upper}_FOUND
+      ${PY_${module_upper}_FOUND}
+      PARENT_SCOPE)
+  set(PY_${module_upper}_VERSION
+      ${PY_${module_upper}_VERSION}
+      PARENT_SCOPE)
 endfunction(find_python_module)
diff --git a/cmake/rccl.cmake b/cmake/rccl.cmake
index f3a472ac930de..1f78c74f40e64 100644
--- a/cmake/rccl.cmake
+++ b/cmake/rccl.cmake
@@ -1,28 +1,33 @@
 if(NOT WITH_ROCM)
-    return()
+  return()
 endif()
 
 # Now we don't support RCCL on windows
 if(WIN32)
-    return()
+  return()
 endif()
 
 if(WITH_RCCL)
-    set(RCCL_ROOT ${ROCM_PATH}/rccl CACHE PATH "RCCL ROOT")
-    find_path(RCCL_INCLUDE_DIR rccl.h
-        PATHS ${RCCL_ROOT} ${RCCL_ROOT}/include ${RCCL_ROOT}/local/include
-        $ENV{RCCL_ROOT} $ENV{RCCL_ROOT}/include $ENV{RCCL_ROOT}/local/include
-        NO_DEFAULT_PATH
-    )
+  set(RCCL_ROOT
+      ${ROCM_PATH}/rccl
+      CACHE PATH "RCCL ROOT")
+  # Look for rccl.h only under RCCL_ROOT (CMake variable or environment).
+  find_path(
+    RCCL_INCLUDE_DIR rccl.h
+    PATHS ${RCCL_ROOT} ${RCCL_ROOT}/include ${RCCL_ROOT}/local/include
+          $ENV{RCCL_ROOT} $ENV{RCCL_ROOT}/include $ENV{RCCL_ROOT}/local/include
+    NO_DEFAULT_PATH)
 
-    file(READ ${RCCL_INCLUDE_DIR}/rccl.h RCCL_VERSION_FILE_CONTENTS)
+  # Load the header text; the version code is parsed out of it below.
+  file(READ ${RCCL_INCLUDE_DIR}/rccl.h RCCL_VERSION_FILE_CONTENTS)
 
-    string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)"
-        RCCL_VERSION "${RCCL_VERSION_FILE_CONTENTS}")
-    string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1"
-        RCCL_VERSION "${RCCL_VERSION}")
+  # First match the whole "define NCCL_VERSION_CODE <n>", then keep only <n>.
+  string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" RCCL_VERSION
+               "${RCCL_VERSION_FILE_CONTENTS}")
+  string(REGEX REPLACE "define NCCL_VERSION_CODE +([0-9]+)" "\\1" RCCL_VERSION
+                       "${RCCL_VERSION}")
 
-    # 2604 for ROCM3.5 and 2708 for ROCM 3.9
-    message(STATUS "Current RCCL header is ${RCCL_INCLUDE_DIR}/rccl.h. "
-            "Current RCCL version is v${RCCL_VERSION}. ")
+  # The version code is 2604 for ROCm 3.5 and 2708 for ROCm 3.9
+  message(STATUS "Current RCCL header is ${RCCL_INCLUDE_DIR}/rccl.h. "
+                 "Current RCCL version is v${RCCL_VERSION}. ")
 endif()
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 566dc75fda019..ff8b9d6f9a9b4 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -4,49 +4,66 @@
 include(CheckCXXSourceRuns)
 include(CheckCXXSourceCompiles)
 
-if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    set(MMX_FLAG "-mmmx")
-    set(SSE2_FLAG "-msse2")
-    set(SSE3_FLAG "-msse3")
-    set(AVX_FLAG "-mavx")
-    set(AVX2_FLAG "-mavx2")
-    set(AVX512F_FLAG "-mavx512f")
+# Pick the per-instruction-set compiler flags for the active compiler.
+if(CMAKE_COMPILER_IS_GNUCC
+   OR CMAKE_COMPILER_IS_GNUCXX
+   OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(MMX_FLAG "-mmmx")
+  set(SSE2_FLAG "-msse2")
+  set(SSE3_FLAG "-msse3")
+  set(AVX_FLAG "-mavx")
+  set(AVX2_FLAG "-mavx2")
+  set(AVX512F_FLAG "-mavx512f")
 elseif(MSVC)
-    set(MMX_FLAG "/arch:MMX")
-    set(SSE2_FLAG "/arch:SSE2")
-    set(SSE3_FLAG "/arch:SSE3")
-    SET(AVX_FLAG "/arch:AVX")
-    SET(AVX2_FLAG "/arch:AVX2")
+  set(MMX_FLAG "/arch:MMX")
+  set(SSE2_FLAG "/arch:SSE2")
+  set(SSE3_FLAG "/arch:SSE3")
+  set(AVX_FLAG "/arch:AVX")
+  set(AVX2_FLAG "/arch:AVX2")
+  # NOTE(review): AVX512F_FLAG is not set in this MSVC branch, so the AVX512F
+  # check below gets no extra flag unless it is defined elsewhere - confirm.
 endif()
 
+# Remember the current CMAKE_REQUIRED_FLAGS; it is restored after the checks.
 set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
 
 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
-set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
+set(MMX_FOUND_EXITCODE
+    1
+    CACHE STRING "Result from TRY_RUN" FORCE)
+check_cxx_source_runs(
+  "
 #include <mmintrin.h>
 int main()
 {
     _mm_setzero_si64();
     return 0;
-}" MMX_FOUND)
+}"
+  MMX_FOUND)
 
 # Check SSE2
 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
-set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
+set(SSE2_FOUND_EXITCODE
+    1
+    CACHE STRING "Result from TRY_RUN" FORCE)
+check_cxx_source_runs(
+  "
 #include <emmintrin.h>
 int main()
 {
     _mm_setzero_si128();
     return 0;
-}" SSE2_FOUND)
+}"
+  SSE2_FOUND)
 
 # Check SSE3
 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
-set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
+set(SSE3_FOUND_EXITCODE
+    1
+    CACHE STRING "Result from TRY_RUN" FORCE)
+check_cxx_source_runs(
+  "
 #include <pmmintrin.h>
 int main()
 {
@@ -55,12 +72,16 @@ int main()
     __m128d result = _mm_addsub_pd(a, b);
     result = _mm_movedup_pd(result);
     return 0;
-}" SSE3_FOUND)
+}"
+  SSE3_FOUND)
 
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
+set(AVX_FOUND_EXITCODE
+    1
+    CACHE STRING "Result from TRY_RUN" FORCE)
+check_cxx_source_runs(
+  "
 #include <immintrin.h>
 int main()
 {
@@ -68,24 +89,32 @@ int main()
     __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
     __m256 result = _mm256_add_ps (a, b);
     return 0;
-}" AVX_FOUND)
+}"
+  AVX_FOUND)
 
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
+set(AVX2_FOUND_EXITCODE
+    1
+    CACHE STRING "Result from TRY_RUN" FORCE)
+check_cxx_source_runs(
+  "
 #include <immintrin.h>
 int main()
 {
     __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
     __m256i result = _mm256_abs_epi32 (a);
     return 0;
-}" AVX2_FOUND)
+}"
+  AVX2_FOUND)
 
 # Check AVX512F
 set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
+set(AVX512F_FOUND_EXITCODE
+    1
+    CACHE STRING "Result from TRY_RUN" FORCE)
+check_cxx_source_runs(
+  "
 #include <immintrin.h>
 int main()
 {
@@ -93,7 +122,10 @@ int main()
                                   13, -5, 6, -7, 9, 2, -6, 3);
     __m512i result = _mm512_abs_epi32 (a);
     return 0;
-}" AVX512F_FOUND)
+}"
+  AVX512F_FOUND)
 
+# Put back the CMAKE_REQUIRED_FLAGS value saved before the checks.
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND
+                 AVX512F_FOUND)
diff --git a/cmake/system.cmake b/cmake/system.cmake
index c740136b93d52..0562077eae187 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,66 +25,86 @@ if(UNIX AND NOT APPLE)
   set(LINUX TRUE)
 endif(UNIX AND NOT APPLE)
 
-IF(WIN32)
-    SET(HOST_SYSTEM "win32")
-ELSE(WIN32)
-    IF(APPLE)
-        SET(HOST_SYSTEM "macosx")
-        EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
-        STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
-        IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET})
-            # Set cache variable - end user may change this during ccmake or cmake-gui configure.
-            SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
-                "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
-        ENDIF()
-        set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
-    ELSE(APPLE)
+# Work out HOST_SYSTEM (and HOST_SYSTEM_VERSION where it can be determined).
+if(WIN32)
+  set(HOST_SYSTEM "win32")
+else(WIN32)
+  if(APPLE)
+    set(HOST_SYSTEM "macosx")
+    # execute_process() replaces the deprecated exec_program() command.
+    execute_process(
+      COMMAND sw_vers -productVersion
+      OUTPUT_VARIABLE HOST_SYSTEM_VERSION
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
+    # Test the environment variable's value; "DEFINED $ENV{...}" would instead
+    # test a CMake variable named after that value.
+    if("$ENV{MACOSX_DEPLOYMENT_TARGET}" STREQUAL "")
+      # Set cache variable - end user may change this during ccmake or cmake-gui configure.
+      set(CMAKE_OSX_DEPLOYMENT_TARGET
+          ${MACOS_VERSION}
+          CACHE
+            STRING
+            "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value."
+      )
+    endif()
+    set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+  else(APPLE)
 
-        IF(EXISTS "/etc/issue")
-            FILE(READ "/etc/issue" LINUX_ISSUE)
-            IF(LINUX_ISSUE MATCHES "CentOS")
-                SET(HOST_SYSTEM "centos")
-            ELSEIF(LINUX_ISSUE MATCHES "Debian")
-                SET(HOST_SYSTEM "debian")
-            ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
-                SET(HOST_SYSTEM "ubuntu")
-            ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
-                SET(HOST_SYSTEM "redhat")
-            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
-                SET(HOST_SYSTEM "fedora")
-            ENDIF()
+    if(EXISTS "/etc/issue")
+      file(READ "/etc/issue" LINUX_ISSUE)
+      if(LINUX_ISSUE MATCHES "CentOS")
+        set(HOST_SYSTEM "centos")
+      elseif(LINUX_ISSUE MATCHES "Debian")
+        set(HOST_SYSTEM "debian")
+      elseif(LINUX_ISSUE MATCHES "Ubuntu")
+        set(HOST_SYSTEM "ubuntu")
+      elseif(LINUX_ISSUE MATCHES "Red Hat")
+        set(HOST_SYSTEM "redhat")
+      elseif(LINUX_ISSUE MATCHES "Fedora")
+        set(HOST_SYSTEM "fedora")
+      endif()
 
-            STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}")
-        ENDIF(EXISTS "/etc/issue")
+      string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION
+                   "${LINUX_ISSUE}")
+    endif(EXISTS "/etc/issue")
 
-        IF(EXISTS "/etc/redhat-release")
-            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
-            IF(LINUX_ISSUE MATCHES "CentOS")
-                SET(HOST_SYSTEM "centos")
-            ENDIF()
-        ENDIF(EXISTS "/etc/redhat-release")
+    if(EXISTS "/etc/redhat-release")
+      file(READ "/etc/redhat-release" LINUX_ISSUE)
+      if(LINUX_ISSUE MATCHES "CentOS")
+        set(HOST_SYSTEM "centos")
+      endif()
+    endif(EXISTS "/etc/redhat-release")
 
-        IF(NOT HOST_SYSTEM)
-            SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
-        ENDIF()
+    if(NOT HOST_SYSTEM)
+      set(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
+    endif()
 
-    ENDIF(APPLE)
-ENDIF(WIN32)
+  endif(APPLE)
+endif(WIN32)
 
 # query number of logical cores
-CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
+cmake_host_system_information(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
 
-MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
+mark_as_advanced(HOST_SYSTEM CPU_CORES)
 
-MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
-MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
+message(
+  STATUS
+    "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
+message(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
 
 # external dependencies log output
-SET(EXTERNAL_PROJECT_LOG_ARGS
-    LOG_DOWNLOAD    0     # Wrap download in script to log output
-    LOG_UPDATE      1     # Wrap update in script to log output
-    LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       0     # Wrap build in script to log output
-    LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     0     # Wrap install in script to log output
+set(EXTERNAL_PROJECT_LOG_ARGS
+    LOG_DOWNLOAD
+    0 # Wrap download in script to log output
+    LOG_UPDATE
+    1 # Wrap update in script to log output
+    LOG_CONFIGURE
+    1 # Wrap configure in script to log output
+    LOG_BUILD
+    0 # Wrap build in script to log output
+    LOG_TEST
+    1 # Wrap test in script to log output
+    LOG_INSTALL
+    0 # Wrap install in script to log output
 )
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index e4b22befff850..5651ceb76e538 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -1,87 +1,109 @@
 if(NOT WITH_GPU OR NOT WITH_TENSORRT)
-    return()
+  return()
 endif()
 
 if(WIN32)
-    string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
-    set(TR_INFER_LIB nvinfer.lib)
-    set(TR_INFER_RT nvinfer.dll)
-    set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
+  string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
+  set(TR_INFER_LIB nvinfer.lib)
+  set(TR_INFER_RT nvinfer.dll)
+  set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
 else()
-    set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
-    set(TR_INFER_LIB libnvinfer.a)
-    set(TR_INFER_RT libnvinfer.so)
-    set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
+  set(TENSORRT_ROOT
+      "/usr"
+      CACHE PATH "TENSORRT ROOT")
+  set(TR_INFER_LIB libnvinfer.a)
+  set(TR_INFER_RT libnvinfer.so)
+  set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
 endif()
 
-find_path(TENSORRT_INCLUDE_DIR NvInfer.h
-    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
-    ${TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE}
-    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
-    $ENV{TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE}
-    NO_DEFAULT_PATH
-)
+find_path(
+  TENSORRT_INCLUDE_DIR NvInfer.h
+  PATHS ${TENSORRT_ROOT}
+        ${TENSORRT_ROOT}/include
+        ${TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE}
+        $ENV{TENSORRT_ROOT}
+        $ENV{TENSORRT_ROOT}/include
+        $ENV{TENSORRT_ROOT}/include/${CMAKE_LIBRARY_ARCHITECTURE}
+  NO_DEFAULT_PATH)
 
-find_path(TENSORRT_LIBRARY_DIR NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
-    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
-    ${TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE}
-    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
-    $ENV{TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE}
-    NO_DEFAULT_PATH
-    DOC "Path to TensorRT library."
-)
+find_path(
+  TENSORRT_LIBRARY_DIR
+  NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
+  PATHS ${TENSORRT_ROOT}
+        ${TENSORRT_ROOT}/lib
+        ${TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE}
+        $ENV{TENSORRT_ROOT}
+        $ENV{TENSORRT_ROOT}/lib
+        $ENV{TENSORRT_ROOT}/lib/${CMAKE_LIBRARY_ARCHITECTURE}
+  NO_DEFAULT_PATH
+  DOC "Path to TensorRT library.")
 
-find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
-    PATHS ${TENSORRT_LIBRARY_DIR}
-    NO_DEFAULT_PATH
-    DOC "Path to TensorRT library.")
+find_library(
+  TENSORRT_LIBRARY
+  NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
+  PATHS ${TENSORRT_LIBRARY_DIR}
+  NO_DEFAULT_PATH
+  DOC "Path to TensorRT library.")
 
 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
-    set(TENSORRT_FOUND ON)
+  set(TENSORRT_FOUND ON)
 else()
-    set(TENSORRT_FOUND OFF)
-    message(WARNING "TensorRT is disabled. You are compiling PaddlePaddle with option -DWITH_TENSORRT=ON, but TensorRT is not found, please configure path to TensorRT with option -DTENSORRT_ROOT or install it.")
+  set(TENSORRT_FOUND OFF)
+  message(
+    WARNING
+      "TensorRT is disabled. You are compiling PaddlePaddle with option -DWITH_TENSORRT=ON, but TensorRT is not found, please configure path to TensorRT with option -DTENSORRT_ROOT or install it."
+  )
 endif()
 
 if(TENSORRT_FOUND)
-    file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
-    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-    string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-    string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-    string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
+  # Pull the NV_TENSORRT_{MAJOR,MINOR,PATCH,BUILD} defines out of NvInfer.h.
+  file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+  string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)"
+               TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)"
+               TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)"
+               TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)"
+               TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
 
-    if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
-        file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS)
-        string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-        string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-        string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-        string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION
-        "${TENSORRT_VERSION_FILE_CONTENTS}")
-    endif()
+  # No major-version define matched in NvInfer.h; retry with NvInferVersion.h.
+  if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
+    file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h
+         TENSORRT_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)"
+                 TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)"
+                 TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)"
+                 TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)"
+                 TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+  endif()
 
-    if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
-        message(SEND_ERROR "Failed to detect TensorRT version.")
-    endif()
+  if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
+    message(SEND_ERROR "Failed to detect TensorRT version.")
+  endif()
 
-    string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
-        TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
-    string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1"
-        TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}")
-    string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1"
-        TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}")
-    string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1"
-        TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}")
+  # Reduce each matched "define NV_TENSORRT_* <n>" string to just <n>.
+  string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+                       TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+  string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1"
+                       TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}")
+  string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1"
+                       TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}")
+  string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1"
+                       TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}")
 
-    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
-        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} ")
-    include_directories(${TENSORRT_INCLUDE_DIR})
-    link_directories(${TENSORRT_LIBRARY})
-    add_definitions(-DPADDLE_WITH_TENSORRT)
+  message(
+    STATUS
+      "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+      "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} "
+  )
+  include_directories(${TENSORRT_INCLUDE_DIR})
+  # NOTE(review): TENSORRT_LIBRARY comes from find_library (a library path),
+  # while link_directories() takes directories - confirm whether
+  # TENSORRT_LIBRARY_DIR was intended here.
+  link_directories(${TENSORRT_LIBRARY})
+  add_definitions(-DPADDLE_WITH_TENSORRT)
 endif()
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index eb6fa4ee13c81..2004241ab1a76 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -15,10 +15,14 @@
 include(ExternalProject)
 # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)
 
-set(THIRD_PARTY_PATH  "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
-    "A path setting third party libraries download & build directories.")
-set(THIRD_PARTY_CACHE_PATH     "${CMAKE_SOURCE_DIR}"    CACHE STRING
-    "A path cache third party source code to avoid repeated download.")
+set(THIRD_PARTY_PATH
+    "${CMAKE_BINARY_DIR}/third_party"
+    CACHE STRING
+          "A path setting third party libraries download & build directories.")
+set(THIRD_PARTY_CACHE_PATH
+    "${CMAKE_SOURCE_DIR}"
+    CACHE STRING
+          "A path cache third party source code to avoid repeated download.")
 
 set(THIRD_PARTY_BUILD_TYPE Release)
 set(third_party_deps)
@@ -39,389 +43,457 @@ set(third_party_deps)
 #            TAG        ${TARGET_TAG}
 #            DIR        ${TARGET_SOURCE_DIR})
 
-FUNCTION(cache_third_party TARGET)
-    SET(options "")
-    SET(oneValueArgs URL REPOSITORY TAG DIR)
-    SET(multiValueArgs "")
-    cmake_parse_arguments(cache_third_party "${optionps}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    STRING(REPLACE "extern_" "" TARGET_NAME ${TARGET})
-    STRING(REGEX REPLACE "[0-9]+" "" TARGET_NAME ${TARGET_NAME})
-    STRING(TOUPPER ${TARGET_NAME} TARGET_NAME)
-    IF(cache_third_party_REPOSITORY)
-        SET(${TARGET_NAME}_DOWNLOAD_CMD
-                GIT_REPOSITORY  ${cache_third_party_REPOSITORY})
-        IF(cache_third_party_TAG)
-            LIST(APPEND   ${TARGET_NAME}_DOWNLOAD_CMD
-                    GIT_TAG     ${cache_third_party_TAG})
-        ENDIF()
-    ELSEIF(cache_third_party_URL)
-        SET(${TARGET_NAME}_DOWNLOAD_CMD
-                URL             ${cache_third_party_URL})
-    ELSE()
-        MESSAGE(FATAL_ERROR    "Download link (Git repo or URL) must be specified for cache!")
-    ENDIF()
-    IF(WITH_TP_CACHE)
-        IF(NOT cache_third_party_DIR)
-            MESSAGE(FATAL_ERROR   "Please input the ${TARGET_NAME}_SOURCE_DIR for overwriting when -DWITH_TP_CACHE=ON")
-        ENDIF()
-        # Generate and verify cache dir for third_party source code
-        SET(cache_third_party_REPOSITORY ${cache_third_party_REPOSITORY} ${cache_third_party_URL})
-        IF(cache_third_party_REPOSITORY AND cache_third_party_TAG)
-            STRING(MD5 HASH_REPO ${cache_third_party_REPOSITORY})
-            STRING(MD5 HASH_GIT ${cache_third_party_TAG})
-            STRING(SUBSTRING ${HASH_REPO} 0 8 HASH_REPO)
-            STRING(SUBSTRING ${HASH_GIT} 0 8 HASH_GIT)
-            STRING(CONCAT HASH ${HASH_REPO} ${HASH_GIT})
-            # overwrite the original SOURCE_DIR when cache directory
-            SET(${cache_third_party_DIR} ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH})
-        ELSEIF(cache_third_party_REPOSITORY)
-            STRING(MD5 HASH_REPO ${cache_third_party_REPOSITORY})
-            STRING(SUBSTRING ${HASH_REPO} 0 16 HASH)
-            # overwrite the original SOURCE_DIR when cache directory
-            SET(${cache_third_party_DIR} ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH})
-        ENDIF()
-
-        IF(EXISTS ${${cache_third_party_DIR}})
-            # judge whether the cache dir is empty
-            FILE(GLOB files ${${cache_third_party_DIR}}/*)
-            LIST(LENGTH files files_len)
-            IF(files_len GREATER 0)
-                list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD DOWNLOAD_COMMAND "")
-            ENDIF()
-        ENDIF()
-        SET(${cache_third_party_DIR} ${${cache_third_party_DIR}} PARENT_SCOPE)
-    ENDIF()
-
-    # Pass ${TARGET_NAME}_DOWNLOAD_CMD to parent scope, the double quotation marks can't be removed
-    SET(${TARGET_NAME}_DOWNLOAD_CMD "${${TARGET_NAME}_DOWNLOAD_CMD}" PARENT_SCOPE)
-ENDFUNCTION()
-
-MACRO(UNSET_VAR VAR_NAME)
-    UNSET(${VAR_NAME} CACHE)
-    UNSET(${VAR_NAME})
-ENDMACRO()
+function(cache_third_party TARGET) # sets <NAME>_DOWNLOAD_CMD in caller scope
+  set(options "")
+  set(oneValueArgs URL REPOSITORY TAG DIR)
+  set(multiValueArgs "")
+  cmake_parse_arguments(cache_third_party "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  # Variable prefix derived from TARGET, e.g. extern_eigen3 -> EIGEN.
+  string(REPLACE "extern_" "" TARGET_NAME ${TARGET})
+  string(REGEX REPLACE "[0-9]+" "" TARGET_NAME ${TARGET_NAME})
+  string(TOUPPER ${TARGET_NAME} TARGET_NAME)
+  if(cache_third_party_REPOSITORY)
+    set(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY
+                                    ${cache_third_party_REPOSITORY})
+    if(cache_third_party_TAG)
+      list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG})
+    endif()
+  elseif(cache_third_party_URL)
+    set(${TARGET_NAME}_DOWNLOAD_CMD URL ${cache_third_party_URL})
+  else()
+    message(
+      FATAL_ERROR "Download link (Git repo or URL) must be specified for cache!"
+    )
+  endif()
+  if(WITH_TP_CACHE)
+    if(NOT cache_third_party_DIR)
+      message(
+        FATAL_ERROR
+          "Please input the ${TARGET_NAME}_SOURCE_DIR for overwriting when -DWITH_TP_CACHE=ON"
+      )
+    endif()
+    # Generate and verify cache dir for third_party source code
+    set(cache_third_party_REPOSITORY ${cache_third_party_REPOSITORY}
+                                     ${cache_third_party_URL})
+    if(cache_third_party_REPOSITORY AND cache_third_party_TAG)
+      string(MD5 HASH_REPO ${cache_third_party_REPOSITORY})
+      string(MD5 HASH_GIT ${cache_third_party_TAG})
+      string(SUBSTRING ${HASH_REPO} 0 8 HASH_REPO)
+      string(SUBSTRING ${HASH_GIT} 0 8 HASH_GIT)
+      string(CONCAT HASH ${HASH_REPO} ${HASH_GIT})
+      # Redirect the caller's source-dir variable (named by DIR) to the cache.
+      set(${cache_third_party_DIR}
+          ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH})
+    elseif(cache_third_party_REPOSITORY)
+      string(MD5 HASH_REPO ${cache_third_party_REPOSITORY})
+      string(SUBSTRING ${HASH_REPO} 0 16 HASH)
+      # Redirect the caller's source-dir variable (named by DIR) to the cache.
+      set(${cache_third_party_DIR}
+          ${THIRD_PARTY_CACHE_PATH}/third_party/${TARGET}_${HASH})
+    endif()
+
+    if(EXISTS ${${cache_third_party_DIR}})
+      # A non-empty cache dir already holds files: disable the download step.
+      file(GLOB files ${${cache_third_party_DIR}}/*)
+      list(LENGTH files files_len)
+      if(files_len GREATER 0)
+        list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD DOWNLOAD_COMMAND "")
+      endif()
+    endif()
+    set(${cache_third_party_DIR}
+        ${${cache_third_party_DIR}}
+        PARENT_SCOPE)
+  endif()
+
+  # Export the command list; the quotes keep the empty DOWNLOAD_COMMAND value.
+  set(${TARGET_NAME}_DOWNLOAD_CMD
+      "${${TARGET_NAME}_DOWNLOAD_CMD}"
+      PARENT_SCOPE)
+endfunction()
+
+macro(UNSET_VAR VAR_NAME)
+  unset(${VAR_NAME} CACHE) # remove the cache entry named by VAR_NAME
+  unset(${VAR_NAME}) # remove the normal variable of the same name
+endmacro()
 
 # Funciton to Download the dependencies during compilation
 # This function has 2 parameters, URL / DIRNAME:
 # 1. URL:           The download url of 3rd dependencies
 # 2. NAME:          The name of file, that determin the dirname
 #
-FUNCTION(file_download_and_uncompress URL NAME)
+function(file_download_and_uncompress URL NAME)
   set(options "")
   set(oneValueArgs MD5)
   set(multiValueArgs "")
-  cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}")
-  SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE)
+  cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}"
+                        ${ARGN})
+  message(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}")
+  set(${NAME}_INCLUDE_DIR
+      ${THIRD_PARTY_PATH}/${NAME}/data
+      PARENT_SCOPE)
   ExternalProject_Add(
-      download_${NAME}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX                ${THIRD_PARTY_PATH}/${NAME}
-      URL                   ${URL}
-      URL_MD5               ${URL_MD5}
-      TIMEOUT               120
-      DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/
-      SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ""
-    )
-  set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE)
-ENDFUNCTION()
-
+    download_${NAME}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${THIRD_PARTY_PATH}/${NAME}
+    URL ${URL}
+    URL_MD5 ${URL_MD5}
+    TIMEOUT 120
+    DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
+    SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND "")
+  set(third_party_deps
+      ${third_party_deps} download_${NAME}
+      PARENT_SCOPE)
+endfunction()
 
 # Correction of flags on different Platform(WIN/MAC) and Print Warning Message
-if (APPLE)
-    if(WITH_MKL)
-        MESSAGE(WARNING
-            "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.")
-        set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE)
-    endif()
+if(APPLE)
+  if(WITH_MKL)
+    message(
+      WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF."
+    )
+    set(WITH_MKL
+        OFF
+        CACHE STRING "Disable MKL for building on mac" FORCE)
+  endif()
 endif()
 
 if(WIN32 OR APPLE)
-    MESSAGE(STATUS "Disable XBYAK in Windows and MacOS")
-    SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
-
-    if(WITH_LIBXSMM)
-        MESSAGE(WARNING
-            "Windows, Mac are not supported with libxsmm in Paddle yet."
-            "Force WITH_LIBXSMM=OFF")
-        SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE)
-    endif()
-
-    if(WITH_BOX_PS)
-        MESSAGE(WARNING
-            "Windows or Mac is not supported with BOX_PS in Paddle yet."
-            "Force WITH_BOX_PS=OFF")
-        SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
-    endif()
-
-    if(WITH_PSLIB)
-        MESSAGE(WARNING
-            "Windows or Mac is not supported with PSLIB in Paddle yet."
-            "Force WITH_PSLIB=OFF")
-        SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE)
-    endif()
-
-    if(WITH_ARM_BRPC)
-        MESSAGE(WARNING
-            "Windows or Mac is not supported with ARM_BRPC in Paddle yet."
-            "Force WITH_ARM_BRPC=OFF")
-        SET(WITH_ARM_BRPC OFF CACHE STRING "Disable ARM_BRPC package in Windows and MacOS" FORCE)
-    endif()
-
-    if(WITH_LIBMCT)
-        MESSAGE(WARNING
-            "Windows or Mac is not supported with LIBMCT in Paddle yet."
-            "Force WITH_LIBMCT=OFF")
-        SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE)
-    endif()
-
-    if(WITH_PSLIB_BRPC)
-        MESSAGE(WARNING
-            "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet."
-            "Force WITH_PSLIB_BRPC=OFF")
-        SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE)
-    endif()
+  message(STATUS "Disable XBYAK in Windows and MacOS")
+  set(WITH_XBYAK
+      OFF
+      CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
+
+  if(WITH_LIBXSMM)
+    message(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet."
+                    "Force WITH_LIBXSMM=OFF")
+    set(WITH_LIBXSMM
+        OFF
+        CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE)
+  endif()
+
+  if(WITH_BOX_PS)
+    message(WARNING "Windows or Mac is not supported with BOX_PS in Paddle yet."
+                    "Force WITH_BOX_PS=OFF")
+    set(WITH_BOX_PS
+        OFF
+        CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
+  endif()
+
+  if(WITH_PSLIB)
+    message(WARNING "Windows or Mac is not supported with PSLIB in Paddle yet."
+                    "Force WITH_PSLIB=OFF")
+    set(WITH_PSLIB
+        OFF
+        CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE)
+  endif()
+
+  if(WITH_ARM_BRPC)
+    message(
+      WARNING "Windows or Mac is not supported with ARM_BRPC in Paddle yet."
+              "Force WITH_ARM_BRPC=OFF")
+    set(WITH_ARM_BRPC
+        OFF
+        CACHE STRING "Disable ARM_BRPC package in Windows and MacOS" FORCE)
+  endif()
+
+  if(WITH_LIBMCT)
+    message(WARNING "Windows or Mac is not supported with LIBMCT in Paddle yet."
+                    "Force WITH_LIBMCT=OFF")
+    set(WITH_LIBMCT
+        OFF
+        CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE)
+  endif()
+
+  if(WITH_PSLIB_BRPC)
+    message(
+      WARNING "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet."
+              "Force WITH_PSLIB_BRPC=OFF")
+    set(WITH_PSLIB_BRPC
+        OFF
+        CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE)
+  endif()
 endif()
 
 set(WITH_MKLML ${WITH_MKL})
 if(NOT DEFINED WITH_MKLDNN)
-    if(WITH_MKL AND AVX2_FOUND)
-        set(WITH_MKLDNN ON)
-    else()
-        message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
-        set(WITH_MKLDNN OFF)
-    endif()
+  if(WITH_MKL AND AVX2_FOUND)
+    set(WITH_MKLDNN ON)
+  else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)
+  endif()
 endif()
 
-if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER)
-    set(WITH_DGC OFF)
+if(WIN32
+   OR APPLE
+   OR NOT WITH_GPU
+   OR ON_INFER)
+  set(WITH_DGC OFF)
 endif()
 
 if(${CMAKE_VERSION} VERSION_GREATER "3.5.2")
-    set(SHALLOW_CLONE "GIT_SHALLOW TRUE") # adds --depth=1 arg to git clone of External_Projects
+  set(SHALLOW_CLONE "GIT_SHALLOW TRUE"
+  )# adds --depth=1 arg to git clone of External_Projects
 endif()
 
 ########################### include third_party according to flags ###############################
-include(external/zlib)      # download, build, install zlib
-include(external/gflags)    # download, build, install gflags
-include(external/glog)      # download, build, install glog
-include(external/boost)     # download boost
-include(external/eigen)     # download eigen3
-include(external/threadpool)# download threadpool
-include(external/dlpack)    # download dlpack
-include(external/xxhash)    # download, build, install xxhash
-include(external/warpctc)   # download, build, install warpctc
-include(external/utf8proc)   # download, build, install utf8proc
-
-list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
-list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc)
-include(external/lapack)    # download, build, install lapack
-
-list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
-list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_lapack)
-
-include(cblas)              	# find first, then download, build, install openblas
+include(external/zlib) # download, build, install zlib
+include(external/gflags) # download, build, install gflags
+include(external/glog) # download, build, install glog
+include(external/boost) # download boost
+include(external/eigen) # download eigen3
+include(external/threadpool) # download threadpool
+include(external/dlpack) # download dlpack
+include(external/xxhash) # download, build, install xxhash
+include(external/warpctc) # download, build, install warpctc
+include(external/utf8proc) # download, build, install utf8proc
+
+list(
+  APPEND
+  third_party_deps
+  extern_eigen3
+  extern_gflags
+  extern_glog
+  extern_boost
+  extern_xxhash)
+list(
+  APPEND
+  third_party_deps
+  extern_zlib
+  extern_dlpack
+  extern_warpctc
+  extern_threadpool
+  extern_utf8proc)
+include(external/lapack) # download, build, install lapack
+
+list(
+  APPEND
+  third_party_deps
+  extern_eigen3
+  extern_gflags
+  extern_glog
+  extern_boost
+  extern_xxhash)
+list(
+  APPEND
+  third_party_deps
+  extern_zlib
+  extern_dlpack
+  extern_warpctc
+  extern_threadpool
+  extern_lapack)
+
+include(cblas) # find first, then download, build, install openblas
 
 message(STATUS "CBLAS_PROVIDER: ${CBLAS_PROVIDER}")
 if(${CBLAS_PROVIDER} STREQUAL MKLML)
-    list(APPEND third_party_deps extern_mklml)
+  list(APPEND third_party_deps extern_mklml)
 elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
-    list(APPEND third_party_deps extern_openblas)
+  list(APPEND third_party_deps extern_openblas)
 endif()
 
-
 if(WITH_MKLDNN)
-    include(external/mkldnn)    # download, build, install mkldnn
-    list(APPEND third_party_deps extern_mkldnn)
+  include(external/mkldnn) # download, build, install mkldnn
+  list(APPEND third_party_deps extern_mkldnn)
 endif()
 
-include(external/protobuf)  	# find first, then download, build, install protobuf
+include(external/protobuf) # find first, then download, build, install protobuf
 if(TARGET extern_protobuf)
-    list(APPEND third_party_deps extern_protobuf)
+  list(APPEND third_party_deps extern_protobuf)
 endif()
 
 if(WITH_PYTHON)
-    include(external/python)    # find python and python_module
-    include(external/pybind11)  # download pybind11
-    list(APPEND third_party_deps extern_pybind)
+  include(external/python) # find python and python_module
+  include(external/pybind11) # download pybind11
+  list(APPEND third_party_deps extern_pybind)
 endif()
 
-IF(WITH_TESTING OR WITH_DISTRIBUTE)
-    include(external/gtest)     # download, build, install gtest
-    list(APPEND third_party_deps extern_gtest)
-ENDIF()
+if(WITH_TESTING OR WITH_DISTRIBUTE)
+  include(external/gtest) # download, build, install gtest
+  list(APPEND third_party_deps extern_gtest)
+endif()
 
 if(WITH_ONNXRUNTIME)
-    include(external/onnxruntime)            # download, build, install onnxruntime、paddle2onnx
-    include(external/paddle2onnx)          
-    list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx)
+  include(external/onnxruntime
+  )# download, build, install onnxruntime and paddle2onnx
+  include(external/paddle2onnx)
+  list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx)
 endif()
 
 if(WITH_GPU)
-    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-        include(external/cub)       # download cub
-        list(APPEND third_party_deps extern_cub)
-    endif()
-    set(URL  "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE)
-    file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa)   # download file externalErrorMsg.tar.gz
-    if(WITH_TESTING)
-        # copy externalErrorMsg.pb, just for unittest can get error message correctly.
-        set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
-        if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
-            set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
-        else()
-            set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
-        endif()
-        set(DST_DIR2 ${CMAKE_BINARY_DIR}/python/paddle/include/third_party/externalError/data)
-        add_custom_command(TARGET download_externalError POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR1}
-            COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2}
-            COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0)
+    include(external/cub) # download cub (only for CUDA toolkits older than 11.0)
+    list(APPEND third_party_deps extern_cub)
+  endif()
+  set(URL
+      "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz"
+      CACHE STRING "" FORCE)
+  file_download_and_uncompress(
+    ${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa
+  )# download file externalErrorMsg.tar.gz
+  if(WITH_TESTING)
+    # Copy externalErrorMsg.pb so that unit tests can get error messages correctly.
+    set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
+    if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
+      set(DST_DIR1
+          ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
+    else()
+      set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
     endif()
+    set(DST_DIR2
+        ${CMAKE_BINARY_DIR}/python/paddle/include/third_party/externalError/data
+    )
+    add_custom_command(
+      TARGET download_externalError
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR1}
+      COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2}
+      COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR1} and ${DST_DIR2}")
+  endif()
 endif(WITH_GPU)
 
 if(WITH_XPU)
-    include(external/xpu)          # download, build, install xpu
-    list(APPEND third_party_deps extern_xpu)
+  include(external/xpu) # download, build, install xpu
+  list(APPEND third_party_deps extern_xpu)
 endif(WITH_XPU)
 
 if(WITH_MLU)
-    include(external/concurrentqueue) # download, build, install concurrentqueue
-    list(APPEND third_party_deps extern_concurrentqueue)
+  include(external/concurrentqueue) # download, build, install concurrentqueue
+  list(APPEND third_party_deps extern_concurrentqueue)
 endif(WITH_MLU)
 
 if(WITH_PSLIB)
-    include(external/pslib)          # download, build, install pslib
-    list(APPEND third_party_deps extern_pslib)
-    if(WITH_LIBMCT)
-        include(external/libmct)     # download, build, install libmct
-        list(APPEND third_party_deps extern_libxsmm)
-    endif()
-    if(WITH_PSLIB_BRPC)
-        include(external/pslib_brpc) # download, build, install pslib_brpc
-        list(APPEND third_party_deps extern_pslib_brpc)
-    else()    
-        include(external/snappy)
-        list(APPEND third_party_deps extern_snappy)
-
-        include(external/leveldb)
-        list(APPEND third_party_deps extern_leveldb)
-        if(NOT WITH_HETERPS)
-            include(external/brpc)
-            list(APPEND third_party_deps extern_brpc)
-        endif()
+  include(external/pslib) # download, build, install pslib
+  list(APPEND third_party_deps extern_pslib)
+  if(WITH_LIBMCT)
+    include(external/libmct) # download, build, install libmct
+    list(APPEND third_party_deps extern_libmct) # same pairing as WITH_PSCORE
+  endif()
+  if(WITH_PSLIB_BRPC)
+    include(external/pslib_brpc) # download, build, install pslib_brpc
+    list(APPEND third_party_deps extern_pslib_brpc)
+  else()
+    include(external/snappy)
+    list(APPEND third_party_deps extern_snappy)
+
+    include(external/leveldb)
+    list(APPEND third_party_deps extern_leveldb)
+    if(NOT WITH_HETERPS)
+      include(external/brpc)
+      list(APPEND third_party_deps extern_brpc)
     endif()
+  endif()
 endif(WITH_PSLIB)
 
 if(NOT WIN32 AND NOT APPLE)
-    include(external/gloo)
-    list(APPEND third_party_deps extern_gloo)
+  include(external/gloo)
+  list(APPEND third_party_deps extern_gloo)
 endif()
 
 if(WITH_BOX_PS)
-    include(external/box_ps)
-    list(APPEND third_party_deps extern_box_ps)
+  include(external/box_ps)
+  list(APPEND third_party_deps extern_box_ps)
 endif(WITH_BOX_PS)
 
 if(WITH_ASCEND OR WITH_ASCEND_CL)
-    include(external/ascend)
-    if(WITH_ASCEND OR WITH_ASCEND_CL)
-        list(APPEND third_party_deps extern_ascend)
-    endif()
-    if(WITH_ASCEND_CL)
-        list(APPEND third_party_deps extern_ascend_cl)
-    endif()
-endif ()
+  include(external/ascend)
+  if(WITH_ASCEND OR WITH_ASCEND_CL)
+    list(APPEND third_party_deps extern_ascend)
+  endif()
+  if(WITH_ASCEND_CL)
+    list(APPEND third_party_deps extern_ascend_cl)
+  endif()
+endif()
 
-if (WITH_PSCORE)
-    include(external/snappy)
-    list(APPEND third_party_deps extern_snappy)
+if(WITH_PSCORE)
+  include(external/snappy)
+  list(APPEND third_party_deps extern_snappy)
 
-    include(external/leveldb)
-    list(APPEND third_party_deps extern_leveldb)
-    
-    if (WITH_ARM_BRPC)
-        include(external/arm_brpc)
-        list(APPEND third_party_deps extern_arm_brpc)
-    else()
-        include(external/brpc)
-        list(APPEND third_party_deps extern_brpc)
-    endif()
+  include(external/leveldb)
+  list(APPEND third_party_deps extern_leveldb)
+
+  if(WITH_ARM_BRPC)
+    include(external/arm_brpc)
+    list(APPEND third_party_deps extern_arm_brpc)
+  else()
+    include(external/brpc)
+    list(APPEND third_party_deps extern_brpc)
+  endif()
 
-    include(external/libmct)     # download, build, install libmct
-    list(APPEND third_party_deps extern_libmct)
+  include(external/libmct) # download, build, install libmct
+  list(APPEND third_party_deps extern_libmct)
 
-    include(external/rocksdb)     # download, build, install rocksdb
-    list(APPEND third_party_deps extern_rocksdb)
+  include(external/rocksdb) # download, build, install rocksdb
+  list(APPEND third_party_deps extern_rocksdb)
 endif()
 
 if(WITH_XBYAK)
-    include(external/xbyak)         # download, build, install xbyak
-    list(APPEND third_party_deps extern_xbyak)
+  include(external/xbyak) # download, build, install xbyak
+  list(APPEND third_party_deps extern_xbyak)
 endif()
 
 if(WITH_LIBXSMM)
-    include(external/libxsmm)       # download, build, install libxsmm
-    list(APPEND third_party_deps extern_libxsmm)
+  include(external/libxsmm) # download, build, install libxsmm
+  list(APPEND third_party_deps extern_libxsmm)
 endif()
 
 if(WITH_DGC)
-    message(STATUS "add dgc lib.")
-    include(external/dgc)           # download, build, install dgc
-    add_definitions(-DPADDLE_WITH_DGC)
-    list(APPEND third_party_deps extern_dgc)
+  message(STATUS "add dgc lib.")
+  include(external/dgc) # download, build, install dgc
+  add_definitions(-DPADDLE_WITH_DGC)
+  list(APPEND third_party_deps extern_dgc)
 endif()
 
-if (WITH_LITE)
-    message(STATUS "Compile Paddle with Lite Engine.")
-    include(external/lite)
-endif (WITH_LITE)
-
-if (WITH_CINN)
-    message(STATUS "Compile Paddle with CINN.")
-    include(external/cinn)
-    add_definitions(-DPADDLE_WITH_CINN)
-    if (WITH_GPU)
-        add_definitions(-DCINN_WITH_CUDA)
-        add_definitions(-DCINN_WITH_CUDNN)
-    endif (WITH_GPU)
-    if (WITH_MKL)
-        add_definitions(-DCINN_WITH_MKL_CBLAS)
-        add_definitions(-DCINN_WITH_MKLDNN)
-    endif (WITH_MKL)
-endif (WITH_CINN)
-
-if (WITH_CRYPTO)
-    include(external/cryptopp)   # download, build, install cryptopp
-    list(APPEND third_party_deps extern_cryptopp)
-    add_definitions(-DPADDLE_WITH_CRYPTO)
-endif (WITH_CRYPTO)
-
-if (WITH_POCKETFFT)
-    include(external/pocketfft)
-    list(APPEND third_party_deps extern_pocketfft)
-    add_definitions(-DPADDLE_WITH_POCKETFFT)
-endif (WITH_POCKETFFT)
-
-if (WIN32)
-    include(external/dirent)
-    list(APPEND third_party_deps extern_dirent)
-endif (WIN32)
-
-if (WITH_INFRT)
-    include(external/llvm)
-    list(APPEND third_party_deps ${llvm_libs})
+if(WITH_LITE)
+  message(STATUS "Compile Paddle with Lite Engine.")
+  include(external/lite)
+endif(WITH_LITE)
+
+if(WITH_CINN)
+  message(STATUS "Compile Paddle with CINN.")
+  include(external/cinn)
+  add_definitions(-DPADDLE_WITH_CINN)
+  if(WITH_GPU)
+    add_definitions(-DCINN_WITH_CUDA)
+    add_definitions(-DCINN_WITH_CUDNN)
+  endif(WITH_GPU)
+  if(WITH_MKL)
+    add_definitions(-DCINN_WITH_MKL_CBLAS)
+    add_definitions(-DCINN_WITH_MKLDNN)
+  endif(WITH_MKL)
+endif(WITH_CINN)
+
+if(WITH_CRYPTO)
+  include(external/cryptopp) # download, build, install cryptopp
+  list(APPEND third_party_deps extern_cryptopp)
+  add_definitions(-DPADDLE_WITH_CRYPTO)
+endif(WITH_CRYPTO)
+
+if(WITH_POCKETFFT)
+  include(external/pocketfft)
+  list(APPEND third_party_deps extern_pocketfft)
+  add_definitions(-DPADDLE_WITH_POCKETFFT)
+endif(WITH_POCKETFFT)
+
+if(WIN32)
+  include(external/dirent)
+  list(APPEND third_party_deps extern_dirent)
+endif(WIN32)
+
+if(WITH_INFRT)
+  include(external/llvm)
+  list(APPEND third_party_deps ${llvm_libs})
 endif()
 
-if (WITH_IPU)
-    include(external/poplar)
-    list(APPEND third_party_deps extern_poplar)
+if(WITH_IPU)
+  include(external/poplar)
+  list(APPEND third_party_deps extern_poplar)
 endif()
 
 add_custom_target(third_party ALL DEPENDS ${third_party_deps})
diff --git a/cmake/thrust.cmake b/cmake/thrust.cmake
index ff415b1e3c4bf..73c2c29847a34 100644
--- a/cmake/thrust.cmake
+++ b/cmake/thrust.cmake
@@ -1,6 +1,8 @@
 function(add_thrust_patches_if_necessary)
   set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu)
-  file(WRITE ${thrust_detect_file} ""
+  file(
+    WRITE ${thrust_detect_file}
+    ""
     "#include \"thrust/version.h\"\n"
     "#include \"thrust/shuffle.h\"\n"
     "#include \"stdio.h\"\n"
@@ -10,10 +12,11 @@ function(add_thrust_patches_if_necessary)
     "  return 0;\n"
     "}\n")
 
-  execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
-                  "--run" "${thrust_detect_file}"
-                  WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
-                  RESULT_VARIABLE nvcc_res ERROR_QUIET)
+  execute_process(
+    COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${thrust_detect_file}"
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+    RESULT_VARIABLE nvcc_res
+    ERROR_QUIET)
   if(NOT nvcc_res EQUAL 0)
     set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust")
     message(STATUS "Add thrust patches: ${thrust_patches}")
diff --git a/cmake/unity_build.cmake b/cmake/unity_build.cmake
index b7e5564b3a618..e18b2ef1ee686 100644
--- a/cmake/unity_build.cmake
+++ b/cmake/unity_build.cmake
@@ -1,12 +1,14 @@
 # Add the following code before all include to avoid compilation failure.
-set(UNITY_CC_BEFORE_CODE [[
+set(UNITY_CC_BEFORE_CODE
+    [[
 #ifndef NOMINMAX
 #define NOMINMAX
 #endif
 #ifndef _USE_MATH_DEFINES
 #define _USE_MATH_DEFINES
 #endif]])
-set(UNITY_CU_BEFORE_CODE [[
+set(UNITY_CU_BEFORE_CODE
+    [[
 #ifndef __CUDACC_VER_MAJOR__
 #define __CUDACC_VER_MAJOR__ CUDA_COMPILER_MAJOR_VERSION
 #endif
@@ -14,15 +16,13 @@ set(UNITY_CU_BEFORE_CODE [[
 #define __CUDACC_VER_MINOR__ CUDA_COMPILER_MINOR_VERSION
 #endif]])
 if(WITH_GPU)
-    string(REPLACE "." ";" CUDA_COMPILER_VERSION ${CMAKE_CUDA_COMPILER_VERSION})
-    list(GET CUDA_COMPILER_VERSION 0 CUDA_COMPILER_MAJOR_VERSION)
-    list(GET CUDA_COMPILER_VERSION 1 CUDA_COMPILER_MINOR_VERSION)
-    string(REPLACE
-        "CUDA_COMPILER_MAJOR_VERSION" ${CUDA_COMPILER_MAJOR_VERSION}
-        UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE})
-    string(REPLACE
-        "CUDA_COMPILER_MINOR_VERSION" ${CUDA_COMPILER_MINOR_VERSION}
-        UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE})
+  string(REPLACE "." ";" CUDA_COMPILER_VERSION ${CMAKE_CUDA_COMPILER_VERSION})
+  list(GET CUDA_COMPILER_VERSION 0 CUDA_COMPILER_MAJOR_VERSION)
+  list(GET CUDA_COMPILER_VERSION 1 CUDA_COMPILER_MINOR_VERSION)
+  string(REPLACE "CUDA_COMPILER_MAJOR_VERSION" ${CUDA_COMPILER_MAJOR_VERSION}
+                 UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE})
+  string(REPLACE "CUDA_COMPILER_MINOR_VERSION" ${CUDA_COMPILER_MINOR_VERSION}
+                 UNITY_CU_BEFORE_CODE ${UNITY_CU_BEFORE_CODE})
 endif()
 
 # Group a list of source files that can be included together.
@@ -30,37 +30,43 @@ endif()
 # do not have to exist.
 # Here you need to specify the source type which belongs to cc or cu.
 function(register_unity_group TYPE)
-    # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR.
-    string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR})
-    string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET})
-    set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity")
+  # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR.
+  string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET
+                 ${CMAKE_CURRENT_SOURCE_DIR})
+  string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET})
+  set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity")
 
-    # Variable unity_group_index is used to record the number of UNITY_TARGET groups.
-    get_property(unity_group_index GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index)
-    if("${unity_group_index}" STREQUAL "")
-        set(unity_group_index 0)
-    endif()
+  # Variable unity_group_index is used to record the number of UNITY_TARGET groups.
+  get_property(unity_group_index GLOBAL
+               PROPERTY ${UNITY_TARGET}_${TYPE}_group_index)
+  if("${unity_group_index}" STREQUAL "")
+    set(unity_group_index 0)
+  endif()
 
-    # Variable unity_group_sources is used to record the sources of one group.
-    set(unity_group_sources ${UNITY_TARGET}_${TYPE}_group_${unity_group_index}_sources)
-    set_property(GLOBAL PROPERTY ${unity_group_sources} "")
-    foreach(src ${ARGN})
-        # UB use absolute path of source.
-        if(NOT IS_ABSOLUTE ${src})
-            set(src ${CMAKE_CURRENT_SOURCE_DIR}/${src})
-        endif()
-        set_property(GLOBAL APPEND PROPERTY ${unity_group_sources} ${src})
-    endforeach()
-
-    # If unity_file does not exists, nv_library or cc_library will use
-    # dummy_file. Touch unity_file to avoid to use dummy file.
-    set(unity_file ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE})
-    if(NOT EXISTS ${unity_file})
-        file(TOUCH ${unity_file})
+  # Variable unity_group_sources is used to record the sources of one group.
+  set(unity_group_sources
+      ${UNITY_TARGET}_${TYPE}_group_${unity_group_index}_sources)
+  set_property(GLOBAL PROPERTY ${unity_group_sources} "")
+  foreach(src ${ARGN})
+    # Unity build (UB) uses the absolute path of each source.
+    if(NOT IS_ABSOLUTE ${src})
+      set(src ${CMAKE_CURRENT_SOURCE_DIR}/${src})
     endif()
+    set_property(GLOBAL APPEND PROPERTY ${unity_group_sources} ${src})
+  endforeach()
+
+  # If unity_file does not exist, nv_library or cc_library will use
+  # dummy_file. Touch unity_file to avoid using the dummy file.
+  set(unity_file
+      ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE}
+  )
+  if(NOT EXISTS ${unity_file})
+    file(TOUCH ${unity_file})
+  endif()
 
-    math(EXPR unity_group_index "${unity_group_index} + 1")
-    set_property(GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index ${unity_group_index})
+  math(EXPR unity_group_index "${unity_group_index} + 1")
+  set_property(GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index
+                               ${unity_group_index})
 endfunction(register_unity_group)
 
 # Combine the original source files used by `TARGET`, then use
@@ -72,81 +78,105 @@ endfunction(register_unity_group)
 # directory on Windows.
 # Here you need to specify the source type which belongs to cc or cu.
 function(compose_unity_target_sources TARGET TYPE)
-    # Variable unity_target_sources represents the source file used in TARGET
-    set(unity_target_sources "")
-    get_property(unity_group_index_max GLOBAL PROPERTY ${TARGET}_${TYPE}_group_index)
-    foreach(src ${ARGN})
-        set(unity_file "")
-        # Note(zhouwei25): UB use the path releative to CMAKE_SOURCE_DIR.
-        # If use absolute path, sccache/ccache hit rate will be reduced.
-        if(IS_ABSOLUTE ${src})
-            set(src_absolute_path ${src})
-            file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src})
-        else()
-            set(src_absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${src})
-            file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src_absolute_path})
-        endif()
-        # If `unity_group_index_max` is empty, there is no combination
-        # relationship.
-        # TODO(Avin0323): Whether use target property `UNITY_BUILD` of CMAKE to
-        # combine source files.
-        if(NOT "${unity_group_index_max}" STREQUAL "")
-            # Search in each registed group.
-            foreach(unity_group_index RANGE ${unity_group_index_max})
-                if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max})
-                    break()
-                endif()
-                get_property(unity_group_sources GLOBAL PROPERTY ${TARGET}_${TYPE}_group_${unity_group_index}_sources)
-                if(${src_absolute_path} IN_LIST unity_group_sources)
-                    set(unity_file ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_${unity_group_index}_${TYPE}.${TYPE})
-                    set(unity_file_sources ${TARGET}_${TYPE}_file_${unity_group_index}_sources)
-                    get_property(set_unity_file_sources GLOBAL PROPERTY ${unity_file_sources} SET)
-                    if(NOT ${set_unity_file_sources})
-                        # Add macro before include source files.
-                        set_property(GLOBAL PROPERTY ${unity_file_sources} "// Generate by Unity Build")
-                        set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} ${UNITY_CC_BEFORE_CODE})
-                        if(WITH_GPU AND "${TYPE}" STREQUAL "cu")
-                            set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} ${UNITY_CU_BEFORE_CODE})
-                        endif()
-                    endif()
-                    set_property(GLOBAL APPEND PROPERTY ${unity_file_sources} "#include \"${src_relative_path}\"")
-                    set(unity_target_sources ${unity_target_sources} ${unity_file})
-                    break()
-                endif()
-            endforeach()
+  # Variable unity_target_sources represents the source file used in TARGET
+  set(unity_target_sources "")
+  get_property(unity_group_index_max GLOBAL
+               PROPERTY ${TARGET}_${TYPE}_group_index)
+  foreach(src ${ARGN})
+    set(unity_file "")
+    # Note(zhouwei25): UB uses the path relative to CMAKE_SOURCE_DIR.
+    # If an absolute path is used, sccache/ccache hit rate will be reduced.
+    if(IS_ABSOLUTE ${src})
+      set(src_absolute_path ${src})
+      file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR} ${src})
+    else()
+      set(src_absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${src})
+      file(RELATIVE_PATH src_relative_path ${CMAKE_SOURCE_DIR}
+           ${src_absolute_path})
+    endif()
+    # If `unity_group_index_max` is empty, there is no combination
+    # relationship.
+    # TODO(Avin0323): Whether use target property `UNITY_BUILD` of CMAKE to
+    # combine source files.
+    if(NOT "${unity_group_index_max}" STREQUAL "")
+      # Search in each registered group.
+      foreach(unity_group_index RANGE ${unity_group_index_max})
+        if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max})
+          break()
         endif()
-        # Use original source file.
-        if("${unity_file}" STREQUAL "")
-            set(unity_target_sources ${unity_target_sources} ${src})
+        get_property(
+          unity_group_sources GLOBAL
+          PROPERTY ${TARGET}_${TYPE}_group_${unity_group_index}_sources)
+        if(${src_absolute_path} IN_LIST unity_group_sources)
+          set(unity_file
+              ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_${unity_group_index}_${TYPE}.${TYPE}
+          )
+          set(unity_file_sources
+              ${TARGET}_${TYPE}_file_${unity_group_index}_sources)
+          get_property(
+            set_unity_file_sources GLOBAL
+            PROPERTY ${unity_file_sources}
+            SET)
+          if(NOT ${set_unity_file_sources})
+            # Add macros before including the source files.
+            set_property(GLOBAL PROPERTY ${unity_file_sources}
+                                         "// Generate by Unity Build")
+            set_property(GLOBAL APPEND PROPERTY ${unity_file_sources}
+                                                ${UNITY_CC_BEFORE_CODE})
+            if(WITH_GPU AND "${TYPE}" STREQUAL "cu")
+              set_property(GLOBAL APPEND PROPERTY ${unity_file_sources}
+                                                  ${UNITY_CU_BEFORE_CODE})
+            endif()
+          endif()
+          set_property(
+            GLOBAL APPEND PROPERTY ${unity_file_sources}
+                                   "#include \"${src_relative_path}\"")
+          set(unity_target_sources ${unity_target_sources} ${unity_file})
+          break()
         endif()
-    endforeach()
+      endforeach()
+    endif()
+    # Use original source file.
+    if("${unity_file}" STREQUAL "")
+      set(unity_target_sources ${unity_target_sources} ${src})
+    endif()
+  endforeach()
 
-    set(unity_target_${TYPE}_sources ${unity_target_sources} PARENT_SCOPE)
+  set(unity_target_${TYPE}_sources
+      ${unity_target_sources}
+      PARENT_SCOPE)
 endfunction(compose_unity_target_sources)
 
 # Write the unity files used by `UNITY_TARGET`.
 # Write dependent on whether the contents of the unity file have changed, which
 # protects incremental compilation speed.
 function(finish_unity_target TYPE)
-    # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR.
-    string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR})
-    string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET})
-    set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity")
+  # Get UNITY_TARGET from CMAKE_CURRENT_SOURCE_DIR.
+  string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET
+                 ${CMAKE_CURRENT_SOURCE_DIR})
+  string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET})
+  set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity")
 
-    get_property(unity_group_index_max GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index)
-    if(NOT "${unity_group_index_max}" STREQUAL "")
-        foreach(unity_group_index RANGE ${unity_group_index_max})
-            if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max})
-                break()
-            endif()
-            get_property(unity_file_sources GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_file_${unity_group_index}_sources)
-            set(unity_file_read_content "")
-            string(JOIN "\n" unity_file_write_content ${unity_file_sources})
-            set(unity_file ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE})
-            file(READ ${unity_file} unity_file_read_content)
-            if(NOT "${unity_file_read_content}" STREQUAL "${unity_file_write_content}")
-                file(WRITE ${unity_file} ${unity_file_write_content})
-            endif()
-        endforeach()
-    endif()
+  get_property(unity_group_index_max GLOBAL
+               PROPERTY ${UNITY_TARGET}_${TYPE}_group_index)
+  if(NOT "${unity_group_index_max}" STREQUAL "")
+    foreach(unity_group_index RANGE ${unity_group_index_max})
+      if(${unity_group_index} GREATER_EQUAL ${unity_group_index_max})
+        break()
+      endif()
+      get_property(
+        unity_file_sources GLOBAL
+        PROPERTY ${UNITY_TARGET}_${TYPE}_file_${unity_group_index}_sources)
+      set(unity_file_read_content "")
+      string(JOIN "\n" unity_file_write_content ${unity_file_sources})
+      set(unity_file
+          ${CMAKE_CURRENT_BINARY_DIR}/${UNITY_TARGET}_${unity_group_index}_${TYPE}.${TYPE}
+      )
+      file(READ ${unity_file} unity_file_read_content)
+      if(NOT "${unity_file_read_content}" STREQUAL
+         "${unity_file_write_content}")
+        file(WRITE ${unity_file} ${unity_file_write_content})
+      endif()
+    endforeach()
+  endif()
 endfunction(finish_unity_target)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 02667dbce69ed..8e52831ebe972 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -6,50 +6,47 @@
 # First Argument: target name want to be linked with libraries
 # Rest Arguments: libraries which link together.
 function(target_circle_link_libraries TARGET_NAME)
-    if(APPLE)
-        set(LIBS)
-        set(inArchive OFF)
-        set(libsInArgn)
+  if(APPLE)
+    set(LIBS)
+    set(inArchive OFF)
+    set(libsInArgn)
 
-        foreach(arg ${ARGN})
-            if(${arg} STREQUAL "ARCHIVE_START")
-                set(inArchive ON)
-            elseif(${arg} STREQUAL "ARCHIVE_END")
-                set(inArchive OFF)
-            else()
-                if(inArchive)
-                    list(APPEND LIBS "-Wl,-force_load")
-                endif()
-                list(APPEND LIBS ${arg})
-                list(APPEND libsInArgn ${arg})
-            endif()
-        endforeach()
-        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-            if(NOT IOS_ENABLE_BITCODE)
-                list(APPEND LIBS "-undefined dynamic_lookup")
-            endif()
+    foreach(arg ${ARGN})
+      if(${arg} STREQUAL "ARCHIVE_START")
+        set(inArchive ON)
+      elseif(${arg} STREQUAL "ARCHIVE_END")
+        set(inArchive OFF)
+      else()
+        if(inArchive)
+          list(APPEND LIBS "-Wl,-force_load")
         endif()
-        list(REVERSE libsInArgn)
-        target_link_libraries(${TARGET_NAME}
-            ${LIBS}
-            ${libsInArgn})
+        list(APPEND LIBS ${arg})
+        list(APPEND libsInArgn ${arg})
+      endif()
+    endforeach()
+    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}"
+                                                      STREQUAL "AppleClang")
+      if(NOT IOS_ENABLE_BITCODE)
+        list(APPEND LIBS "-undefined dynamic_lookup")
+      endif()
+    endif()
+    list(REVERSE libsInArgn)
+    target_link_libraries(${TARGET_NAME} ${LIBS} ${libsInArgn})
 
-    else()  # LINUX
-        set(LIBS)
+  else() # LINUX
+    set(LIBS)
 
-        foreach(arg ${ARGN})
-            if(${arg} STREQUAL "ARCHIVE_START")
-                list(APPEND LIBS "-Wl,--whole-archive")
-            elseif(${arg} STREQUAL "ARCHIVE_END")
-                list(APPEND LIBS "-Wl,--no-whole-archive")
-            else()
-                list(APPEND LIBS ${arg})
-            endif()
-        endforeach()
+    foreach(arg ${ARGN})
+      if(${arg} STREQUAL "ARCHIVE_START")
+        list(APPEND LIBS "-Wl,--whole-archive")
+      elseif(${arg} STREQUAL "ARCHIVE_END")
+        list(APPEND LIBS "-Wl,--no-whole-archive")
+      else()
+        list(APPEND LIBS ${arg})
+      endif()
+    endforeach()
 
-        target_link_libraries(${TARGET_NAME}
-                "-Wl,--start-group"
-                ${LIBS}
-                "-Wl,--end-group")
-    endif()
+    target_link_libraries(${TARGET_NAME} "-Wl,--start-group" ${LIBS}
+                          "-Wl,--end-group")
+  endif()
 endfunction()
diff --git a/cmake/version.cmake b/cmake/version.cmake
index 57ca750df6cb9..83bd3f1b1bc4a 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -3,7 +3,7 @@ set(PADDLE_VERSION $ENV{PADDLE_VERSION})
 set(tmp_version "HEAD")
 set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
 set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
-while ("${PADDLE_VERSION}" STREQUAL "")
+while("${PADDLE_VERSION}" STREQUAL "")
   # Check current branch name
   execute_process(
     COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
@@ -11,23 +11,24 @@ while ("${PADDLE_VERSION}" STREQUAL "")
     OUTPUT_VARIABLE GIT_BRANCH_NAME
     RESULT_VARIABLE GIT_BRANCH_RESULT
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (NOT ${GIT_BRANCH_RESULT})
+  if(NOT ${GIT_BRANCH_RESULT})
     execute_process(
-      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
+      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always
+              ${tmp_version}
       WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
       OUTPUT_VARIABLE GIT_TAG_NAME
       RESULT_VARIABLE GIT_RESULT
       ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if (NOT ${GIT_RESULT})
+    if(NOT ${GIT_RESULT})
       # Check if current branch is release branch
-      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
+      if(${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
         # Check the tag is a correct version
-        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
+        if(${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
           # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest
           set(PADDLE_VERSION "0.0.0")
-        elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+        elseif(${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
           string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
-        else()  # otherwise, get the previous git tag name.
+        else() # otherwise, get the previous git tag name.
           set(tmp_version "${GIT_TAG_NAME}~1")
         endif()
       else()
@@ -37,9 +38,9 @@ while ("${PADDLE_VERSION}" STREQUAL "")
           OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
           RESULT_VARIABLE GIT_EXACT_TAG_RESULT
           ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (NOT ${GIT_EXACT_TAG_NAME})
+        if(NOT ${GIT_EXACT_TAG_NAME})
           # Check if current branch is tag branch
-          if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+          if(${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
             string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
           else()
             set(PADDLE_VERSION "0.0.0")
diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index adf3d74c26220..6692f24dd6ae9 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -13,11 +13,11 @@
 # limitations under the License.
 
 if(NOT WITH_XPU_KP)
-    return()
+  return()
 endif()
 
-set(LINK_FLAGS    "-Wl,--allow-multiple-definition")
-set(CMAKE_EXE_LINKER_FLAGS    "${LINK_FLAGS}")
+set(LINK_FLAGS "-Wl,--allow-multiple-definition")
+set(CMAKE_EXE_LINKER_FLAGS "${LINK_FLAGS}")
 set(CMAKE_SHARED_LINKER_FLAGS "${LINK_FLAGS}")
 
 if(NOT XPU_TOOLCHAIN)
@@ -31,7 +31,7 @@ message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
 set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++)
 message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
 
-# The host sysroot of XPU compiler is gcc-8.2 
+# The host sysroot of XPU compiler is gcc-8.2
 if(NOT HOST_SYSROOT)
   set(HOST_SYSROOT /opt/compiler/gcc-8.2)
 endif()
@@ -45,19 +45,19 @@ if(NOT API_ARCH)
 endif()
 
 if(API_ARCH MATCHES "x86_64")
-if(EXISTS ${HOST_SYSROOT}/bin/g++)
-  set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
-  set(HOST_AR ${HOST_SYSROOT}/bin/ar)
-else()
-  set(HOST_CXX /usr/bin/g++)
-  set(HOST_AR /usr/bin/ar)
-endif()
+  if(EXISTS ${HOST_SYSROOT}/bin/g++)
+    set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
+    set(HOST_AR ${HOST_SYSROOT}/bin/ar)
+  else()
+    set(HOST_CXX /usr/bin/g++)
+    set(HOST_AR /usr/bin/ar)
+  endif()
 else()
   set(HOST_CXX ${CMAKE_CXX_COMPILER})
   set(HOST_AR ${CMAKE_AR})
 endif()
 
-set(TOOLCHAIN_ARGS )
+set(TOOLCHAIN_ARGS)
 
 if(OPT_LEVEL)
   set(OPT_LEVEL ${OPT_LEVEL})
@@ -74,8 +74,16 @@ message(STATUS "Build with HOST_AR=" ${HOST_AR})
 macro(compile_kernel COMPILE_ARGS)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS)
-  cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(multiValueArgs
+      KERNEL
+      DIRPATH
+      XNAME
+      DEVICE
+      HOST
+      XPU
+      DEPENDS)
+  cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
   set(kernel_path ${xpu_add_library_DIRPATH})
   set(kernel_name ${xpu_add_library_XNAME})
   set(device_o_extra_flags ${xpu_add_library_DEVICE})
@@ -84,16 +92,12 @@ macro(compile_kernel COMPILE_ARGS)
   set(cc_depends ${xpu_add_library_DEPENDS})
 
   set(kernel_target ${kernel_name}_kernel)
-  add_custom_target(${kernel_target}
-    WORKING_DIRECTORY
-      ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS
-      kernel_build/${kernel_name}.host.o
-      kernel_build/${kernel_name}.bin.o
-    COMMENT
-      ${kernel_target}
-    VERBATIM
-    )
+  add_custom_target(
+    ${kernel_target}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.bin.o
+    COMMENT ${kernel_target}
+    VERBATIM)
 
   if(cc_depends)
     add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS})
@@ -106,24 +110,56 @@ macro(compile_kernel COMPILE_ARGS)
 
   set(XTDK_DIR ${XPU_TOOLCHAIN})
   set(CXX_DIR ${HOST_SYSROOT})
-  set(XPU_CXX_FLAGS  -fforce-enable-int128 -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer  -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function  -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes  -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )
+  set(XPU_CXX_FLAGS
+      -fforce-enable-int128
+      -Wno-error=pessimizing-move
+      -Wno-error=constant-conversion
+      -Wno-error=c++11-narrowing
+      -Wno-error=shift-count-overflow
+      -Wno-error=unused-local-typedef
+      -Wno-error=deprecated-declarations
+      -Wno-deprecated-declarations
+      -std=c++14
+      -m64
+      -fPIC
+      -fno-omit-frame-pointer
+      -Wall
+      -Wno-inconsistent-missing-override
+      -Wextra
+      -Wnon-virtual-dtor
+      -Wdelete-non-virtual-dtor
+      -Wno-unused-parameter
+      -Wno-unused-function
+      -Wno-error=unused-local-typedefs
+      -Wno-error=ignored-attributes
+      -Wno-error=int-in-bool-context
+      -Wno-error=parentheses
+      -Wno-error=address
+      -Wno-ignored-qualifiers
+      -Wno-ignored-attributes
+      -Wno-parentheses
+      -DNDEBUG)
 
   #include path
-  get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
+  get_property(
+    dirs
+    DIRECTORY ${CMAKE_SOURCE_DIR}
+    PROPERTY INCLUDE_DIRECTORIES)
   set(XPU_CXX_INCLUDES "")
   foreach(dir IN LISTS dirs)
     list(APPEND XPU_CXX_INCLUDES "-I${dir}")
   endforeach()
-  string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}" )
+  string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}")
   separate_arguments(XPU_CXX_INCLUDES UNIX_COMMAND "${XPU_CXX_INCLUDES}")
 
   #related flags
-  get_directory_property( DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS )
+  get_directory_property(DirDefs DIRECTORY ${CMAKE_SOURCE_DIR}
+                                           COMPILE_DEFINITIONS)
   set(XPU_CXX_DEFINES "")
   foreach(def IN LISTS DirDefs)
     list(APPEND XPU_CXX_DEFINES "-D${def}")
   endforeach()
-  string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" )
+  string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}")
   separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}")
 
   set(ABI_VERSION "")
@@ -133,121 +169,119 @@ macro(compile_kernel COMPILE_ARGS)
     set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=1")
   endif()
   add_custom_command(
-    OUTPUT
-      kernel_build/${kernel_name}.bin.o
-    COMMAND
-      ${CMAKE_COMMAND} -E make_directory kernel_build
-    COMMAND
-	  ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
+    OUTPUT kernel_build/${kernel_name}.bin.o
+    COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps
+            kernel_build/${kernel_name}.xpu
     COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS}  ${XPU_CXX_INCLUDES} 
-       -I.  -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu
-        --xpu-device-only -c -v 
-    COMMAND
-      ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec  kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
-    WORKING_DIRECTORY
-      ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS
-      ${xpu_add_library_DEPENDS}
-    COMMENT
-      kernel_build/${kernel_name}.bin.o
-    VERBATIM
-    )
-    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
+      ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL}
+      -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS}
+      ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.bin.o.sec
+      kernel_build/${kernel_name}.xpu --xpu-device-only -c -v
+    COMMAND ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec
+            kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${xpu_add_library_DEPENDS}
+    COMMENT kernel_build/${kernel_name}.bin.o
+    VERBATIM)
+  list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
 
   add_custom_command(
-    OUTPUT
-      kernel_build/${kernel_name}.host.o
-    COMMAND
-      ${CMAKE_COMMAND} -E make_directory kernel_build
-    COMMAND
-	  ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
+    OUTPUT kernel_build/${kernel_name}.host.o
+    COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build
+    COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps
+            kernel_build/${kernel_name}.xpu
     COMMAND
-    ${XPU_CLANG} --sysroot=${CXX_DIR}  -std=c++11 ${ABI_VERSION} ${OPT_LEVEL} -fno-builtin -mcpu=xpu2  -fPIC ${XPU_CXX_DEFINES}  ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} 
-        -I.  -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu
-        --xpu-host-only -c -v 
-    WORKING_DIRECTORY
-      ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS
-      ${xpu_add_library_DEPENDS}
-    COMMENT
-      kernel_build/${kernel_name}.host.o
-    VERBATIM
-    )
-    list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
+      ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL}
+      -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS}
+      ${XPU_CXX_INCLUDES} -I. -o kernel_build/${kernel_name}.host.o
+      kernel_build/${kernel_name}.xpu --xpu-host-only -c -v
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${xpu_add_library_DEPENDS}
+    COMMENT kernel_build/${kernel_name}.host.o
+    VERBATIM)
+  list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
 endmacro()
 
 ###############################################################################
 # XPU_ADD_LIBRARY
 ###############################################################################
 macro(xpu_add_library TARGET_NAME)
-    # Separate the sources from the options
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs STATIC DEPENDS)
-    cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(xpu_srcs ${xpu_add_library_STATIC})
-    set(xpu_target ${TARGET_NAME})
-    set(cc_srcs_depends ${xpu_add_library_DEPENDS})
-    
-    file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
-    list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
-
-    set(XPU1_DEVICE_O_EXTRA_FLAGS " ")
-    set(XPU1_HOST_O_EXTRA_FLAGS " ")
-
-    # Distinguish .xpu file from other files
-    foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
-      get_filename_component(language_type_name ${cur_xpu_src} EXT)
-      if(${language_type_name} STREQUAL ".kps")
-        list(APPEND xpu_kernel_lists ${cur_xpu_src})
-      else()
-        list(APPEND cc_kernel_lists ${cur_xpu_src})
-      endif()
-    endforeach()
+  # Separate the sources from the options
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs STATIC DEPENDS)
+  cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  set(xpu_srcs ${xpu_add_library_STATIC})
+  set(xpu_target ${TARGET_NAME})
+  set(cc_srcs_depends ${xpu_add_library_DEPENDS})
 
-    # Ensure that there is only one xpu kernel
-    list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
-    list(LENGTH cc_srcs_depends cc_srcs_depends_num)
-
-    if(${xpu_kernel_lists_num})
-        foreach(xpu_kernel IN LISTS xpu_kernel_lists)
-            get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
-            get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
-            set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
-            set(kernel_name ${kernel_name})
-            compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
-        endforeach()
-
-        add_custom_target(${xpu_target}_src ALL
-            WORKING_DIRECTORY
-                ${CMAKE_CURRENT_BINARY_DIR}
-            DEPENDS
-                ${xpu_kernel_depends}
-                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            COMMENT
-                ${xpu_target}_src
-            VERBATIM
-            )
-
-        add_custom_command(
-            OUTPUT
-            ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            COMMAND
-                ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
-            WORKING_DIRECTORY
-                ${CMAKE_CURRENT_BINARY_DIR}
-            DEPENDS
-                ${xpu_kernel_depends}
-            COMMENT
-                ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
-            VERBATIM
-            ) 
-        
-        add_library(${xpu_target} STATIC ${cc_kernel_lists})
-        add_dependencies(${xpu_target} ${xpu_target}_src)
-        target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
+  file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
+  list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
+
+  set(XPU1_DEVICE_O_EXTRA_FLAGS " ")
+  set(XPU1_HOST_O_EXTRA_FLAGS " ")
+
+  # Distinguish .kps kernel files from other files
+  foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
+    get_filename_component(language_type_name ${cur_xpu_src} EXT)
+    if(${language_type_name} STREQUAL ".kps")
+      list(APPEND xpu_kernel_lists ${cur_xpu_src})
     else()
-        add_library(${xpu_target} STATIC ${cc_kernel_lists})
+      list(APPEND cc_kernel_lists ${cur_xpu_src})
     endif()
+  endforeach()
+
+  # Count the .kps kernels and the DEPENDS entries collected above
+  list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
+  list(LENGTH cc_srcs_depends cc_srcs_depends_num)
+
+  if(${xpu_kernel_lists_num})
+    foreach(xpu_kernel IN LISTS xpu_kernel_lists)
+      get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
+      get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
+      set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
+      set(kernel_name ${kernel_name})
+      compile_kernel(
+        KERNEL
+        ${xpu_kernel}
+        DIRPATH
+        ${kernel_dir}
+        XNAME
+        ${kernel_name}
+        DEVICE
+        ${XPU1_DEVICE_O_EXTRA_FLAGS}
+        HOST
+        ${XPU1_HOST_O_EXTRA_FLAGS}
+        XPU
+        "xpu2"
+        DEPENDS
+        ${cc_srcs_depends})
+    endforeach()
+
+    add_custom_target(
+      ${xpu_target}_src ALL
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      DEPENDS ${xpu_kernel_depends}
+              ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+      COMMENT ${xpu_target}_src
+      VERBATIM)
+
+    add_custom_command(
+      OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+      COMMAND ${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+              ${xpu_kernel_depends}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      DEPENDS ${xpu_kernel_depends}
+      COMMENT ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
+      VERBATIM)
+
+    add_library(${xpu_target} STATIC ${cc_kernel_lists})
+    add_dependencies(${xpu_target} ${xpu_target}_src)
+    target_link_libraries(${TARGET_NAME}
+                          ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
+  else()
+    add_library(${xpu_target} STATIC ${cc_kernel_lists})
+  endif()
 endmacro()
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 9d801c9e224a9..07041455df4fd 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,7 +1,9 @@
 add_subdirectory(utils)
 add_subdirectory(scripts)
 add_subdirectory(testing)
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
+set(PYTHON_TESTS_DIR
+    ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests
+    CACHE INTERNAL "python tests directory")
 add_subdirectory(phi)
 add_subdirectory(infrt)
 add_subdirectory(fluid)
diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt
index a92932b4d3247..304a764f5b87c 100755
--- a/paddle/fluid/distributed/CMakeLists.txt
+++ b/paddle/fluid/distributed/CMakeLists.txt
@@ -2,35 +2,49 @@ add_subdirectory(collective)
 add_subdirectory(store)
 if(WITH_PYTHON)
   py_proto_compile(ps_py_proto SRCS the_one_ps.proto)
-  add_custom_target(ps_py_proto_init ALL  
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto)
+  add_custom_target(
+    ps_py_proto_init ALL
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+            ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto)
   add_dependencies(ps_py_proto ps_py_proto_init)
-  if (NOT WIN32)
-    add_custom_command(TARGET ps_py_proto POST_BUILD
-      COMMAND mv the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/)
+  if(NOT WIN32)
+    add_custom_command(
+      TARGET ps_py_proto
+      POST_BUILD
+      COMMAND mv the_one_ps_pb2.py
+              ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/)
   else(NOT WIN32)
-    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
-    add_custom_command(TARGET ps_py_proto POST_BUILD
+    string(
+      REPLACE "/" "\\" fleet_proto_dstpath
+              "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
+    add_custom_command(
+      TARGET ps_py_proto
+      POST_BUILD
       COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath}
-      COMMENT "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}.")
+      COMMENT
+        "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}."
+    )
   endif(NOT WIN32)
 endif()
 
 if(NOT WITH_PSCORE)
-    add_subdirectory(fleet_executor)
-    return()
+  add_subdirectory(fleet_executor)
+  return()
 endif()
 
 proto_library(ps_framework_proto SRCS the_one_ps.proto)
-add_custom_command(TARGET ps_framework_proto POST_BUILD
-    COMMAND mv the_one_ps.pb.h ps.pb.h
-    COMMAND mv the_one_ps.pb.cc ps.pb.cc)
+add_custom_command(
+  TARGET ps_framework_proto
+  POST_BUILD
+  COMMAND mv the_one_ps.pb.h ps.pb.h
+  COMMAND mv the_one_ps.pb.cc ps.pb.cc)
 
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result")
+set(DISTRIBUTE_COMPILE_FLAGS
+    "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result"
+)
 
-if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-    set(DISTRIBUTE_COMPILE_FLAGS
-            "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
 endif()
 
 add_subdirectory(common)
diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index f6b1bd47c1e46..0cfc82709637f 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,20 +1,65 @@
-cc_library(processgroup SRCS ProcessGroup.cc DEPS phi_api eager_api)
-cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi_api string_helper)
+cc_library(
+  processgroup
+  SRCS ProcessGroup.cc
+  DEPS phi_api eager_api)
+cc_library(
+  eager_reducer
+  SRCS reducer.cc
+  DEPS eager_api processgroup phi_api string_helper)
 
-if (WITH_DISTRIBUTE)
-  cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi_api eager_api gloo_wrapper)
+if(WITH_DISTRIBUTE)
+  cc_library(
+    processgroup_gloo
+    SRCS ProcessGroupGloo.cc
+    DEPS phi_api eager_api gloo_wrapper)
 endif()
 
 if(WITH_NCCL)
-  cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi_api eager_api)
-  if (WITH_DISTRIBUTE AND WITH_PSCORE)
-    cc_library(processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi_api eager_api)
+  cc_library(
+    processgroup_nccl
+    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
+    DEPS place
+         cuda_stream
+         enforce
+         collective_helper
+         device_context
+         phi_api
+         eager_api)
+  if(WITH_DISTRIBUTE AND WITH_PSCORE)
+    cc_library(
+      processgroup_heter
+      SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc
+      DEPS place
+           cuda_stream
+           enforce
+           collective_helper
+           device_context
+           phi_api
+           eager_api)
   endif()
 endif()
 
 if(WITH_ASCEND_CL)
-  cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi_api eager_api)
-  if (WITH_DISTRIBUTE AND WITH_PSCORE)
-    cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi_api eager_api)
+  cc_library(
+    processgroup_hccl
+    SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc
+    DEPS place
+         npu_stream
+         enforce
+         collective_helper
+         device_context
+         phi_api
+         eager_api)
+  if(WITH_DISTRIBUTE AND WITH_PSCORE)
+    cc_library(
+      processgroup_heter
+      SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc
+      DEPS place
+           npu_stream
+           enforce
+           collective_helper
+           device_context
+           phi_api
+           eager_api)
   endif()
 endif()
diff --git a/paddle/fluid/distributed/collective/HCCLTools.cc b/paddle/fluid/distributed/collective/HCCLTools.cc
index 526a683e057c0..676a71cb30d95 100644
--- a/paddle/fluid/distributed/collective/HCCLTools.cc
+++ b/paddle/fluid/distributed/collective/HCCLTools.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/collective/HCCLTools.h"
+
 #include "paddle/fluid/distributed/collective/Types.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h
index a1dcf7cd9b626..4955e24eadbfb 100644
--- a/paddle/fluid/distributed/collective/HCCLTools.h
+++ b/paddle/fluid/distributed/collective/HCCLTools.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <error.h>
+
 #include <string>
 
 #include "boost/variant.hpp"
diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc
index 7e842ebf92166..2cecaf0734df6 100644
--- a/paddle/fluid/distributed/collective/NCCLTools.cc
+++ b/paddle/fluid/distributed/collective/NCCLTools.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/collective/NCCLTools.h"
+
 #include "paddle/fluid/distributed/collective/Types.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h
index 0454518b1836c..f38ce8faa7ffb 100644
--- a/paddle/fluid/distributed/collective/NCCLTools.h
+++ b/paddle/fluid/distributed/collective/NCCLTools.h
@@ -16,9 +16,11 @@
 
 #include <cuda_runtime.h>
 #include <error.h>
+
 #include <string>
 
 #include "boost/variant.hpp"
+#include "paddle/fluid/distributed/collective/Types.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
@@ -26,8 +28,6 @@
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
 
-#include "paddle/fluid/distributed/collective/Types.h"
-
 namespace paddle {
 namespace distributed {
 
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index 52e09792d5d80..7ed6b188fd217 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -21,7 +21,6 @@
 
 #include "paddle/fluid/distributed/collective/Types.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
-
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index 824341c3cd97d..1a390e38755fd 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -27,6 +27,7 @@
 #include <gloo/broadcast.h>
 #include <gloo/reduce.h>
 #include <gloo/scatter.h>
+
 #include "paddle/fluid/distributed/collective/Common.h"
 #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
@@ -485,8 +486,9 @@ std::shared_ptr<::gloo::transport::Device>
 ProcessGroupGloo::createDefaultDevice() {
   std::array<char, HOST_NAME_MAX> hostname{};
   auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
-  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
-                                "Get hostname error for createDefaultDevice."));
+  PADDLE_ENFORCE_EQ(
+      ret, 0,
+      platform::errors::Fatal("Get hostname error for createDefaultDevice."));
   ::addrinfo* result;
   result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
   ::addrinfo* cur;
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
index 9ed6c2198df4c..50249b03967a9 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+
 #include "paddle/fluid/distributed/collective/Common.h"
 #include "paddle/fluid/distributed/collective/HCCLTools.h"
 #include "paddle/fluid/memory/malloc.h"
@@ -216,15 +217,16 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
     std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
     std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
     const AllreduceOptions& opts) {
-  return Collective(in_tensors, out_tensors,
-                    [&](phi::DenseTensor& input, phi::DenseTensor& output,
-                        HcclComm comm, const aclrtStream& stream) {
-                      return platform::dynload::HcclAllReduce(
-                          input.data(), output.data(), input.numel(),
-                          platform::ToHCCLDataType(input.dtype()),
-                          ToHCCLRedType(opts.reduce_op), comm, stream);
-                    },
-                    CommType::ALLREDUCE);
+  return Collective(
+      in_tensors, out_tensors,
+      [&](phi::DenseTensor& input, phi::DenseTensor& output, HcclComm comm,
+          const aclrtStream& stream) {
+        return platform::dynload::HcclAllReduce(
+            input.data(), output.data(), input.numel(),
+            platform::ToHCCLDataType(input.dtype()),
+            ToHCCLRedType(opts.reduce_op), comm, stream);
+      },
+      CommType::ALLREDUCE);
 }
 
 std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
index 2f0ff6b9565ea..a32984798febd 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
@@ -21,12 +21,11 @@
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/distributed/collective/HCCLTools.h"
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/device/npu/npu_stream.h"
 #include "paddle/fluid/platform/device_context.h"
-
-#include "paddle/fluid/distributed/collective/HCCLTools.h"
-#include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
index 0911a4a3e3e18..0b388a6a848a9 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"
+
 #include <chrono>
+
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/api/include/api.h"
@@ -129,8 +131,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
             gid_, {dense_cpu_tensor.name()}, send_size, dense_cpu_tensor.data(),
             dense_cpu_tensor.numel() *
                 framework::DataTypeSize(dense_cpu_tensor.dtype()));
-        PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                      "Send to the switch module error."));
+        PADDLE_ENFORCE_EQ(ret, 0,
+                          platform::errors::PreconditionNotMet(
+                              "Send to the switch module error."));
         phi::DenseTensor cpu_tensor2;
         cpu_tensor2.AllocateFrom(
             std::make_unique<paddle::experimental::DefaultAllocator>(
@@ -140,8 +143,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::AllReduce(
         ret = client_->Recv(
             gid_, {dense_cpu_tensor.name()}, cpu_tensor2.data(),
             cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype()));
-        PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                      "Recv from the switch module error."));
+        PADDLE_ENFORCE_EQ(ret, 0,
+                          platform::errors::PreconditionNotMet(
+                              "Recv from the switch module error."));
 
         switch (dense_cpu_tensor.dtype()) {
           case DataType::FLOAT32:
@@ -226,8 +230,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
               dense_cpu_tensor.data(),
               dense_cpu_tensor.numel() *
                   framework::DataTypeSize(dense_cpu_tensor.dtype()));
-          PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                        "Send to the switch module error."));
+          PADDLE_ENFORCE_EQ(ret, 0,
+                            platform::errors::PreconditionNotMet(
+                                "Send to the switch module error."));
         } else {
           int ret = client_->Recv(
               gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(),
@@ -286,8 +291,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
   VLOG(2) << "tensor_name:" << tensor_name;
   int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(),
                           tensor_size);
-  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                "Send to the switch module error."));
+  PADDLE_ENFORCE_EQ(
+      ret, 0,
+      platform::errors::PreconditionNotMet("Send to the switch module error."));
   return CreateTask(rank_, CommType::SEND, in_tensors);
 }
 
@@ -319,8 +325,9 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
   int ret = client_->Recv(
       gid_, {tensor_name}, cpu_tensor.data(),
       cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
-  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                "receive to the switch module error."));
+  PADDLE_ENFORCE_EQ(ret, 0,
+                    platform::errors::PreconditionNotMet(
+                        "receive to the switch module error."));
   auto end = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double> diff = end - start;
   double goodput = cpu_tensor.numel() *
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index f1b66864b2930..dc67205c78f56 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+
 #include "paddle/fluid/distributed/collective/Common.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
@@ -320,15 +321,16 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
   PADDLE_ENFORCE_EQ(
       CheckTensorsInCudaPlace(in_tensors), true,
       platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
-  return Collective(in_tensors, out_tensors,
-                    [&](const phi::DenseTensor& input, phi::DenseTensor& output,
-                        ncclComm_t comm, const gpuStream_t& stream) {
-                      return platform::dynload::ncclAllReduce(
-                          input.data(), output.data(), input.numel(),
-                          platform::ToNCCLDataType(input.type()),
-                          ToNCCLRedType(opts.reduce_op), comm, stream);
-                    },
-                    CommType::ALLREDUCE);
+  return Collective(
+      in_tensors, out_tensors,
+      [&](const phi::DenseTensor& input, phi::DenseTensor& output,
+          ncclComm_t comm, const gpuStream_t& stream) {
+        return platform::dynload::ncclAllReduce(
+            input.data(), output.data(), input.numel(),
+            platform::ToNCCLDataType(input.type()),
+            ToNCCLRedType(opts.reduce_op), comm, stream);
+      },
+      CommType::ALLREDUCE);
 }
 
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
@@ -338,17 +340,17 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
       CheckTensorsInCudaPlace(in_tensors), true,
       platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
 
-  return Collective(in_tensors, out_tensors,
-                    [&](phi::DenseTensor& input, phi::DenseTensor& output,
-                        ncclComm_t comm, const gpuStream_t& stream) {
-                      const auto root = opts.source_rank * in_tensors.size() +
-                                        opts.source_root;
-                      return platform::dynload::ncclBroadcast(
-                          input.data(), output.data(), input.numel(),
-                          platform::ToNCCLDataType(input.type()), root, comm,
-                          stream);
-                    },
-                    CommType::BROADCAST);
+  return Collective(
+      in_tensors, out_tensors,
+      [&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm,
+          const gpuStream_t& stream) {
+        const auto root =
+            opts.source_rank * in_tensors.size() + opts.source_root;
+        return platform::dynload::ncclBroadcast(
+            input.data(), output.data(), input.numel(),
+            platform::ToNCCLDataType(input.type()), root, comm, stream);
+      },
+      CommType::BROADCAST);
 }
 
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
@@ -400,15 +402,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
     std::vector<phi::DenseTensor>& tensors, int dst_rank) {
   CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
 
-  auto task = PointToPoint(tensors,
-                           [&](phi::DenseTensor& input, ncclComm_t comm,
-                               const gpuStream_t& stream, int dst_rank) {
-                             return platform::dynload::ncclSend(
-                                 input.data(), input.numel(),
-                                 platform::ToNCCLDataType(input.dtype()),
-                                 dst_rank, comm, stream);
-                           },
-                           dst_rank, CommType::SEND);
+  auto task = PointToPoint(
+      tensors,
+      [&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream,
+          int dst_rank) {
+        return platform::dynload::ncclSend(
+            input.data(), input.numel(),
+            platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
+      },
+      dst_rank, CommType::SEND);
   return task;
 }
 
@@ -416,15 +418,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
     std::vector<phi::DenseTensor>& tensors, int src_rank) {
   CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
 
-  auto task = PointToPoint(tensors,
-                           [&](phi::DenseTensor& output, ncclComm_t comm,
-                               const gpuStream_t& stream, int src_rank) {
-                             return platform::dynload::ncclRecv(
-                                 output.data(), output.numel(),
-                                 platform::ToNCCLDataType(output.dtype()),
-                                 src_rank, comm, stream);
-                           },
-                           src_rank, CommType::RECV);
+  auto task = PointToPoint(
+      tensors,
+      [&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream,
+          int src_rank) {
+        return platform::dynload::ncclRecv(
+            output.data(), output.numel(),
+            platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
+      },
+      src_rank, CommType::RECV);
   return task;
 }
 
@@ -440,15 +442,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
   std::vector<phi::DenseTensor> shared_tensors;
   shared_tensors.push_back(shared_input);
 
-  auto task = PointToPoint(shared_tensors,
-                           [&](phi::DenseTensor& input, ncclComm_t comm,
-                               const gpuStream_t& stream, int dst_rank) {
-                             return platform::dynload::ncclSend(
-                                 input.data(), input.numel(),
-                                 platform::ToNCCLDataType(input.dtype()),
-                                 dst_rank, comm, stream);
-                           },
-                           dst_rank, CommType::SEND);
+  auto task = PointToPoint(
+      shared_tensors,
+      [&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream,
+          int dst_rank) {
+        return platform::dynload::ncclSend(
+            input.data(), input.numel(),
+            platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream);
+      },
+      dst_rank, CommType::SEND);
   return task;
 }
 
@@ -463,15 +465,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
   std::vector<phi::DenseTensor> shared_tensors;
   shared_tensors.push_back(shared_input);
 
-  auto task = PointToPoint(shared_tensors,
-                           [&](phi::DenseTensor& output, ncclComm_t comm,
-                               const gpuStream_t& stream, int src_rank) {
-                             return platform::dynload::ncclRecv(
-                                 output.data(), output.numel(),
-                                 platform::ToNCCLDataType(output.dtype()),
-                                 src_rank, comm, stream);
-                           },
-                           src_rank, CommType::RECV);
+  auto task = PointToPoint(
+      shared_tensors,
+      [&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream,
+          int src_rank) {
+        return platform::dynload::ncclRecv(
+            output.data(), output.numel(),
+            platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream);
+      },
+      src_rank, CommType::RECV);
   return task;
 }
 
@@ -484,15 +486,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
   PADDLE_ENFORCE_EQ(
       CheckTensorsInCudaPlace(out_tensors), true,
       platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
-  return Collective(in_tensors, out_tensors,
-                    [&](const phi::DenseTensor& input, phi::DenseTensor& output,
-                        ncclComm_t comm, const gpuStream_t& stream) {
-                      return platform::dynload::ncclAllGather(
-                          input.data(), output.data(), input.numel(),
-                          platform::ToNCCLDataType(input.dtype()), comm,
-                          stream);
-                    },
-                    CommType::ALLGATHER);
+  return Collective(
+      in_tensors, out_tensors,
+      [&](const phi::DenseTensor& input, phi::DenseTensor& output,
+          ncclComm_t comm, const gpuStream_t& stream) {
+        return platform::dynload::ncclAllGather(
+            input.data(), output.data(), input.numel(),
+            platform::ToNCCLDataType(input.dtype()), comm, stream);
+      },
+      CommType::ALLGATHER);
 }
 
 void* GetPointerByOffset(void* raw_pointer, size_t offset,
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 82ced6e135ac9..2325e645b4c46 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -22,10 +22,9 @@
 #include <vector>
 
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_context.h"
-
-#include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index 96009ce722905..9c04b95a732e8 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -403,8 +403,9 @@ void EagerReducer::InitializeDenseGroups(
                           "Tensor %s is not initialized.", tensor_name));
     const auto size = tensor.numel();
     PADDLE_ENFORCE_GT(
-        size, 0, platform::errors::PreconditionNotMet(
-                     "The number of tensor %s's elements is 0.", tensor_name));
+        size, 0,
+        platform::errors::PreconditionNotMet(
+            "The number of tensor %s's elements is 0.", tensor_name));
     all_length += size;
 
     p_group->length_.push_back(size);
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
index 424bae0e5acd1..0527ceb9b5121 100644
--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -16,6 +16,7 @@
 
 #include <map>
 #include <vector>
+
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/utils/hook_utils.h"
diff --git a/paddle/fluid/distributed/common/CMakeLists.txt b/paddle/fluid/distributed/common/CMakeLists.txt
index eab6165ca689e..05f6a1d1ccec4 100644
--- a/paddle/fluid/distributed/common/CMakeLists.txt
+++ b/paddle/fluid/distributed/common/CMakeLists.txt
@@ -1,4 +1,6 @@
-
-cc_library(afs_wrapper SRCS afs_warpper.cc DEPS fs ps_framework_proto)
+cc_library(
+  afs_wrapper
+  SRCS afs_warpper.cc
+  DEPS fs ps_framework_proto)
 
 #set_property(GLOBAL PROPERTY COMMON_DEPS afs_warpper)
diff --git a/paddle/fluid/distributed/common/afs_warpper.cc b/paddle/fluid/distributed/common/afs_warpper.cc
index d539ec6080469..3a37c6be7c2af 100644
--- a/paddle/fluid/distributed/common/afs_warpper.cc
+++ b/paddle/fluid/distributed/common/afs_warpper.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/common/afs_warpper.h"
+
 #include "paddle/fluid/framework/io/fs.h"
 
 namespace paddle {
@@ -27,9 +28,10 @@ int AfsClient::initialize(const FsClientParameter& fs_client_param) {
 int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri,
                           const std::string& user, const std::string& passwd,
                           int buffer_size_param) {
-  return initialize(hadoop_bin, uri, paddle::string::format_string(
-                                         "%s,%s", user.c_str(), passwd.c_str()),
-                    buffer_size_param);
+  return initialize(
+      hadoop_bin, uri,
+      paddle::string::format_string("%s,%s", user.c_str(), passwd.c_str()),
+      buffer_size_param);
 }
 int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri,
                           const std::string& ugi, int buffer_size_param) {
diff --git a/paddle/fluid/distributed/common/afs_warpper.h b/paddle/fluid/distributed/common/afs_warpper.h
index d10668046c0a7..cef3e5ae35c28 100644
--- a/paddle/fluid/distributed/common/afs_warpper.h
+++ b/paddle/fluid/distributed/common/afs_warpper.h
@@ -19,6 +19,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/distributed/common/cost_timer.h b/paddle/fluid/distributed/common/cost_timer.h
index 5073dc9cf5084..1651121ee0cd9 100644
--- a/paddle/fluid/distributed/common/cost_timer.h
+++ b/paddle/fluid/distributed/common/cost_timer.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <memory>
 #include <unordered_map>
+
 #include "butil/time.h"
 #include "bvar/latency_recorder.h"
 #include "glog/logging.h"
diff --git a/paddle/fluid/distributed/common/local_random.h b/paddle/fluid/distributed/common/local_random.h
index 96b8d2d21a560..5a9a3b595d023 100644
--- a/paddle/fluid/distributed/common/local_random.h
+++ b/paddle/fluid/distributed/common/local_random.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <assert.h>
 #include <time.h>
+
 #include <atomic>
 #include <random>
 
diff --git a/paddle/fluid/distributed/common/registerer.h b/paddle/fluid/distributed/common/registerer.h
index 630be930c14d9..f4938c0f93f8c 100644
--- a/paddle/fluid/distributed/common/registerer.h
+++ b/paddle/fluid/distributed/common/registerer.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <glog/logging.h>
+
 #include <iostream>
 #include <map>
 #include <string>
diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
index a36e8e648b193..3cafb0bdb5f92 100755
--- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -7,34 +7,81 @@ proto_library(interceptor_message_proto SRCS interceptor_message.proto)
 if(WITH_ARM_BRPC)
   set(BRPC_DEPS arm_brpc snappy gflags glog)
 elseif(WITH_DISTRIBUTE AND WITH_PSCORE)
-  set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog)
+  set(BRPC_DEPS
+      brpc
+      ssl
+      crypto
+      protobuf
+      zlib
+      leveldb
+      snappy
+      gflags
+      glog)
 else()
   set(BRPC_DEPS "")
 endif()
 
-cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog)
+cc_library(
+  task_loop_thread_pool
+  SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc
+  DEPS enforce glog)
 
-cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc dist_model.cc interceptor.cc
-        compute_interceptor.cc amplifier_interceptor.cc source_interceptor.cc sink_interceptor.cc message_service.cc message_bus.cc dist_model_tensor_wrapper.cc
-        DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper
-        op_registry executor_gc_helper gflags glog ${BRPC_DEPS})
+cc_library(
+  fleet_executor
+  SRCS fleet_executor.cc
+       carrier.cc
+       task_node.cc
+       runtime_graph.cc
+       dist_model.cc
+       interceptor.cc
+       compute_interceptor.cc
+       amplifier_interceptor.cc
+       source_interceptor.cc
+       sink_interceptor.cc
+       message_service.cc
+       message_bus.cc
+       dist_model_tensor_wrapper.cc
+  DEPS proto_desc
+       fleet_executor_desc_proto
+       interceptor_message_proto
+       task_loop_thread_pool
+       collective_helper
+       op_registry
+       executor_gc_helper
+       gflags
+       glog
+       ${BRPC_DEPS})
 
 if(WITH_DISTRIBUTE)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  set(DISTRIBUTE_COMPILE_FLAGS
+      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
+  )
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
   endif()
-  set_source_files_properties(interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(sink_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS
+                                        ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    sink_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS
+                                                    ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
   add_subdirectory(test)
 endif()
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc
index 53bae87c0020e..754a3f5d2b22f 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.cc
+++ b/paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
+
 #include <algorithm>
 
-#include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
 #include "paddle/fluid/distributed/fleet_executor/message_bus.h"
@@ -148,8 +149,9 @@ void Carrier::WakeUp() {
 }
 
 void Carrier::Start() {
-  PADDLE_ENFORCE_EQ(is_init_, true, platform::errors::PreconditionNotMet(
-                                        "Using carrier before initialized."));
+  PADDLE_ENFORCE_EQ(is_init_, true,
+                    platform::errors::PreconditionNotMet(
+                        "Using carrier before initialized."));
   for (int64_t id : source_interceptor_ids_) {
     VLOG(3) << "Carrier Start is sending start to source interceptor " << id
             << ".";
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h
index d35a3260915e2..2846af97716da 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.h
+++ b/paddle/fluid/distributed/fleet_executor/carrier.h
@@ -35,7 +35,7 @@ namespace paddle {
 namespace framework {
 class Scope;
 class ProgramDesc;
-}
+}  // namespace framework
 
 namespace distributed {
 
diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
index fb907e3b5c29f..4ba11fa7e327d 100644
--- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
-#include "paddle/fluid/distributed/fleet_executor/carrier.h"
 
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc
index d8f937e218be4..8fe73d774946c 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.cc
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -12,10 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/distributed/fleet_executor/dist_model.h"
+
 #include <glog/logging.h>
+
 #include <chrono>  // NOLINT
 
-#include "paddle/fluid/distributed/fleet_executor/dist_model.h"
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/block_desc.h"
@@ -294,8 +296,9 @@ bool DistModel::PrepareProgram() {
 
 bool DistModel::LoadProgram() {
   VLOG(3) << "Loading program from " << config_.model_dir;
-  PADDLE_ENFORCE_NE(config_.model_dir, "", platform::errors::InvalidArgument(
-                                               "Model dir must be provided."));
+  PADDLE_ENFORCE_NE(
+      config_.model_dir, "",
+      platform::errors::InvalidArgument("Model dir must be provided."));
   std::string model_path = config_.model_dir + ".pdmodel";
   framework::proto::ProgramDesc program_proto;
   std::string pb_content;
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h
index d0203c131357c..f5c1d47afb1a3 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.h
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.h
@@ -31,7 +31,7 @@ namespace framework {
 class ProgramDesc;
 class Scope;
 class BlockDesc;
-}
+}  // namespace framework
 
 namespace distributed {
 
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc
index b440d39c73a70..b7f590e7a8c81 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc
+++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h
index dc8b2596803e0..459e609762d84 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h
+++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/macros.h"
 
diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc
index e946d78550ff1..c4d7f3c7a6958 100644
--- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc
+++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc
@@ -11,9 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
+
 #include <algorithm>
 
-#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/message_bus.h"
 #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h
index ccdb3dcc45948..176e5dab0da17 100644
--- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h
+++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 class ProgramDesc;
 class Scope;
-}
+}  // namespace framework
 
 namespace distributed {
 class RuntimeGraph;
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc
index 710ebda41244e..2ff2bc04ff853 100644
--- a/paddle/fluid/distributed/fleet_executor/interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
+
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/task_loop.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h
index 86ca7be7f44db..00fe2154d28fa 100644
--- a/paddle/fluid/distributed/fleet_executor/interceptor.h
+++ b/paddle/fluid/distributed/fleet_executor/interceptor.h
@@ -33,7 +33,7 @@ namespace paddle {
 namespace framework {
 class Scope;
 class GarbageCollector;
-}
+}  // namespace framework
 namespace distributed {
 
 class TaskNode;
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc
index 80a6b4667aa1a..76762af9e7e7a 100644
--- a/paddle/fluid/distributed/fleet_executor/message_bus.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
+
 #include <chrono>
 #include <memory>
 #include <set>
@@ -19,7 +21,6 @@
 
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
-#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 
 namespace paddle {
@@ -28,8 +29,9 @@ namespace distributed {
 void MessageBus::Init(
     int64_t rank, const std::unordered_map<int64_t, std::string>& rank_to_addr,
     const std::string& addr) {
-  PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists(
-                                         "MessageBus is already init."));
+  PADDLE_ENFORCE_EQ(
+      is_init_, false,
+      platform::errors::AlreadyExists("MessageBus is already init."));
   rank_ = rank;
   is_init_ = true;
   rank_to_addr_ = rank_to_addr;
diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc
index 1c66d83ea34d7..9d42b0d73dbb4 100644
--- a/paddle/fluid/distributed/fleet_executor/message_service.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_service.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #include "paddle/fluid/distributed/fleet_executor/message_service.h"
+
 #include "brpc/server.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/message_bus.h"
diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc
index 614b4c37e8254..a5f90062dcfd9 100644
--- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc
+++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
+
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.h b/paddle/fluid/distributed/fleet_executor/runtime_graph.h
index 1ca9f0174ed07..a59a43cc200a5 100644
--- a/paddle/fluid/distributed/fleet_executor/runtime_graph.h
+++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/platform/macros.h"
diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc
index 77fbb23a6c71b..9d9e6c0356548 100644
--- a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/sink_interceptor.h"
+
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/fleet_executor/source_interceptor.cc b/paddle/fluid/distributed/fleet_executor/source_interceptor.cc
index 78b2bed66dd99..6b2fd5565ea13 100644
--- a/paddle/fluid/distributed/fleet_executor/source_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/source_interceptor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/source_interceptor.h"
+
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc
index bb313ad37890d..90765dbdd2d09 100644
--- a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc
+++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc
@@ -31,8 +31,9 @@ TaskLoopThread::~TaskLoopThread() {
 }
 
 TaskLoop* TaskLoopThread::StartLoop() {
-  PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet(
-                                       "thread is already running."));
+  PADDLE_ENFORCE_EQ(
+      start_, false,
+      platform::errors::PreconditionNotMet("thread is already running."));
   start_ = true;
   thread_ = std::thread([this]() { Loop(); });
 
diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc
index ed34bbb87fc6b..e962a29b4a150 100644
--- a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc
+++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc
@@ -30,8 +30,9 @@ TaskLoopThreadPool::TaskLoopThreadPool(int thread_num)
 TaskLoopThreadPool::~TaskLoopThreadPool() = default;
 
 void TaskLoopThreadPool::Start() {
-  PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet(
-                                       "thread pool is already start."));
+  PADDLE_ENFORCE_EQ(
+      start_, false,
+      platform::errors::PreconditionNotMet("thread pool is already start."));
   PADDLE_ENFORCE_GT(
       thread_num_, 0,
       platform::errors::InvalidArgument(
@@ -45,10 +46,12 @@ void TaskLoopThreadPool::Start() {
 }
 
 TaskLoop* TaskLoopThreadPool::GetLoop(int tid) {
-  PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet(
-                                      "thread pool must start first."));
-  PADDLE_ENFORCE_GE(tid, 0, platform::errors::OutOfRange(
-                                "tid must >= 0, but now is %d", tid));
+  PADDLE_ENFORCE_EQ(
+      start_, true,
+      platform::errors::PreconditionNotMet("thread pool must start first."));
+  PADDLE_ENFORCE_GE(
+      tid, 0,
+      platform::errors::OutOfRange("tid must >= 0, but now is %d", tid));
   PADDLE_ENFORCE_LT(tid, thread_num_,
                     platform::errors::OutOfRange(
                         "tid must < thread_num, but now tid=%d thread_num=%d",
@@ -57,8 +60,9 @@ TaskLoop* TaskLoopThreadPool::GetLoop(int tid) {
 }
 
 std::vector<TaskLoop*> TaskLoopThreadPool::GetAllLoops() {
-  PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet(
-                                      "thread pool must start first."));
+  PADDLE_ENFORCE_EQ(
+      start_, true,
+      platform::errors::PreconditionNotMet("thread pool must start first."));
   return loops_;
 }
 
diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc
index 232317333ea11..00ae30d281ee8 100644
--- a/paddle/fluid/distributed/fleet_executor/task_node.cc
+++ b/paddle/fluid/distributed/fleet_executor/task_node.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -153,15 +154,17 @@ void TaskNode::SetRunAtOffset(int64_t value) {
 
 void TaskNode::SetReplyUpPerSteps(int64_t value) {
   PADDLE_ENFORCE_GE(
-      value, 1, platform::errors::InvalidArgument(
-                    "reply_up_per_steps must >= 1, but received %ld", value));
+      value, 1,
+      platform::errors::InvalidArgument(
+          "reply_up_per_steps must >= 1, but received %ld", value));
   reply_up_per_steps_ = value;
 }
 
 void TaskNode::SetSendDownPerSteps(int64_t value) {
   PADDLE_ENFORCE_GE(
-      value, 1, platform::errors::InvalidArgument(
-                    "send_down_per_steps must >= 1, but received %ld", value));
+      value, 1,
+      platform::errors::InvalidArgument(
+          "send_down_per_steps must >= 1, but received %ld", value));
   send_down_per_steps_ = value;
 }
 
diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h
index 7dd4b5454567e..16e686a4401b8 100644
--- a/paddle/fluid/distributed/fleet_executor/task_node.h
+++ b/paddle/fluid/distributed/fleet_executor/task_node.h
@@ -26,7 +26,7 @@ namespace paddle {
 namespace framework {
 class OperatorBase;
 class OpDesc;
-}
+}  // namespace framework
 namespace distributed {
 
 class TaskNode final {
diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt
index e0db8a261b585..0cd39b3aad6e6 100644
--- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt
@@ -1,25 +1,72 @@
-set_source_files_properties(interceptor_ping_pong_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(interceptor_ping_pong_test SRCS interceptor_ping_pong_test.cc DEPS fleet_executor ${BRPC_DEPS})
+set_source_files_properties(
+  interceptor_ping_pong_test.cc PROPERTIES COMPILE_FLAGS
+                                           ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  interceptor_ping_pong_test
+  SRCS interceptor_ping_pong_test.cc
+  DEPS fleet_executor ${BRPC_DEPS})
 
-set_source_files_properties(compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS})
+set_source_files_properties(
+  compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS
+                                         ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  compute_interceptor_test
+  SRCS compute_interceptor_test.cc
+  DEPS fleet_executor ${BRPC_DEPS})
 
-set_source_files_properties(source_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(source_interceptor_test SRCS source_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS})
+set_source_files_properties(
+  source_interceptor_test.cc PROPERTIES COMPILE_FLAGS
+                                        ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  source_interceptor_test
+  SRCS source_interceptor_test.cc
+  DEPS fleet_executor ${BRPC_DEPS})
 
-set_source_files_properties(sink_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(sink_interceptor_test SRCS sink_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS})
+set_source_files_properties(
+  sink_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  sink_interceptor_test
+  SRCS sink_interceptor_test.cc
+  DEPS fleet_executor ${BRPC_DEPS})
 
-set_source_files_properties(interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(interceptor_pipeline_short_path_test SRCS interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS})
+set_source_files_properties(
+  interceptor_pipeline_short_path_test.cc
+  PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  interceptor_pipeline_short_path_test
+  SRCS interceptor_pipeline_short_path_test.cc
+  DEPS fleet_executor ${BRPC_DEPS})
 
-set_source_files_properties(interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(interceptor_pipeline_long_path_test SRCS interceptor_pipeline_long_path_test.cc DEPS fleet_executor ${BRPC_DEPS})
+set_source_files_properties(
+  interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS
+                                                    ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  interceptor_pipeline_long_path_test
+  SRCS interceptor_pipeline_long_path_test.cc
+  DEPS fleet_executor ${BRPC_DEPS})
 
-set_source_files_properties(compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(compute_interceptor_run_op_test SRCS compute_interceptor_run_op_test.cc DEPS fleet_executor ${BRPC_DEPS} op_registry fill_constant_op elementwise_add_op scope device_context)
+set_source_files_properties(
+  compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS
+                                                ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  compute_interceptor_run_op_test
+  SRCS compute_interceptor_run_op_test.cc
+  DEPS fleet_executor
+       ${BRPC_DEPS}
+       op_registry
+       fill_constant_op
+       elementwise_add_op
+       scope
+       device_context)
 
-if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
-set_source_files_properties(interceptor_ping_pong_with_brpc_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(interceptor_ping_pong_with_brpc_test SRCS interceptor_ping_pong_with_brpc_test.cc DEPS fleet_executor ${BRPC_DEPS})
+if(WITH_DISTRIBUTE
+   AND WITH_PSCORE
+   AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+  set_source_files_properties(
+    interceptor_ping_pong_with_brpc_test.cc
+    PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  cc_test(
+    interceptor_ping_pong_with_brpc_test
+    SRCS interceptor_ping_pong_with_brpc_test.cc
+    DEPS fleet_executor ${BRPC_DEPS})
 endif()
diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
index 35857fc86b5e0..bd81d3644f4d8 100644
--- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
index 954b52693f46c..4992a8b34c9da 100644
--- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
index 19c1d0a0d7a6a..54adf06fb67dd 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
index 78cff2606f6b8..3828c4478cbe6 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include <sys/socket.h>
 #include <time.h>
+
 #include <iostream>
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc
index e909744a4b5d6..a78cd6955f246 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc
index 0e57596bacbe6..53755bf1a40eb 100644
--- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
index 8ff908f90ec85..879d7e9b02941 100644
--- a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
@@ -16,7 +16,6 @@
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc
index e9c0437c829d4..21a1b4accc9f1 100644
--- a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc
@@ -16,7 +16,6 @@
 #include <unordered_map>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt
index 98bc0a0ad4a26..524245be5f2ad 100644
--- a/paddle/fluid/distributed/index_dataset/CMakeLists.txt
+++ b/paddle/fluid/distributed/index_dataset/CMakeLists.txt
@@ -1,9 +1,18 @@
 proto_library(index_dataset_proto SRCS index_dataset.proto)
-cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs)
+cc_library(
+  index_wrapper
+  SRCS index_wrapper.cc
+  DEPS index_dataset_proto fs)
 if(WITH_MKLDNN)
-  cc_library(index_sampler SRCS index_sampler.cc DEPS xxhash index_wrapper eigen3 mkldnn)
+  cc_library(
+    index_sampler
+    SRCS index_sampler.cc
+    DEPS xxhash index_wrapper eigen3 mkldnn)
 else()
-  cc_library(index_sampler SRCS index_sampler.cc DEPS xxhash index_wrapper eigen3)
+  cc_library(
+    index_sampler
+    SRCS index_sampler.cc
+    DEPS xxhash index_wrapper eigen3)
 endif()
 if(WITH_PYTHON)
   py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto)
diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc
index 306d11d333dae..b82193220515a 100644
--- a/paddle/fluid/distributed/index_dataset/index_sampler.cc
+++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/index_dataset/index_sampler.h"
+
 #include "paddle/fluid/framework/data_feed.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h
index 02806b814c200..a82348c9ec586 100644
--- a/paddle/fluid/distributed/index_dataset/index_sampler.h
+++ b/paddle/fluid/distributed/index_dataset/index_sampler.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
 #include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc
index 27aa890f7600f..61941ef513334 100644
--- a/paddle/fluid/distributed/index_dataset/index_wrapper.cc
+++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc
@@ -9,15 +9,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
+
 #include <memory>
 #include <string>
 #include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/framework/io/fs.h"
 
-#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
+#include "paddle/fluid/framework/io/fs.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h
index 8fb8faf6c84a2..1c652e60bbbc3 100644
--- a/paddle/fluid/distributed/index_dataset/index_wrapper.h
+++ b/paddle/fluid/distributed/index_dataset/index_wrapper.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -90,10 +91,11 @@ class IndexWrapper {
     }
     TreePtr tree = std::make_shared<TreeIndex>();
     int ret = tree->Load(tree_path);
-    PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument(
-                                  "Load tree[%s] from path[%s] failed. Please "
-                                  "check whether the file exists.",
-                                  name, tree_path));
+    PADDLE_ENFORCE_EQ(ret, 0,
+                      paddle::platform::errors::InvalidArgument(
+                          "Load tree[%s] from path[%s] failed. Please "
+                          "check whether the file exists.",
+                          name, tree_path));
     tree_map.insert(std::pair<std::string, TreePtr>{name, tree});
   }
 
diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt
index e7519ef4998b1..ad49b651e2e71 100755
--- a/paddle/fluid/distributed/ps/service/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt
@@ -1,57 +1,136 @@
 set(BRPC_SRCS ps_client.cc server.cc)
 set_source_files_properties(${BRPC_SRCS})
 
-
 if(WITH_HETERPS)
 
-    set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context rocksdb)
+  set(BRPC_DEPS
+      brpc
+      ssl
+      crypto
+      protobuf
+      gflags
+      glog
+      zlib
+      leveldb
+      snappy
+      gflags
+      glog
+      device_context
+      rocksdb)
 
 else()
 
-    set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context)
+  set(BRPC_DEPS
+      brpc
+      ssl
+      crypto
+      protobuf
+      gflags
+      glog
+      zlib
+      leveldb
+      snappy
+      gflags
+      glog
+      device_context)
 
 endif()
 
-brpc_library(sendrecv_rpc SRCS
-        ${BRPC_SRCS}
-        PROTO sendrecv.proto
-        DEPS ${BRPC_DEPS} )
+brpc_library(
+  sendrecv_rpc
+  SRCS
+  ${BRPC_SRCS}
+  PROTO
+  sendrecv.proto
+  DEPS
+  ${BRPC_DEPS})
 
 #set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper)
 
 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
 
-set_source_files_properties(communicator/communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(ps_service/service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-
-set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-
-set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS})
-
-cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS})
-cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc
-ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS})
-
-cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS})
-cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS})
-
-cc_library(communicator SRCS communicator/communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS})
-cc_library(ps_service SRCS ps_service/service.cc DEPS communicator client server boost ${RPC_DEPS})
-
-cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
-cc_library(heter_server SRCS heter_server.cc DEPS heter_client brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
-
-set_source_files_properties(ps_service/graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_library(graph_py_service SRCS ps_service/graph_py_service.cc DEPS ps_service)
+set_source_files_properties(
+  communicator/communicator.cc PROPERTIES COMPILE_FLAGS
+                                          ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ps_service/service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
+set_source_files_properties(
+  brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
+set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS
+                                                 ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS
+                                                    ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS
+                                                 ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(
+  brpc_utils
+  SRCS brpc_utils.cc
+  DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS})
+
+cc_library(
+  downpour_server
+  SRCS graph_brpc_server.cc brpc_ps_server.cc
+  DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS})
+cc_library(
+  downpour_client
+  SRCS graph_brpc_client.cc brpc_ps_client.cc ps_local_client.cc
+  DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS})
+
+cc_library(
+  client
+  SRCS ps_client.cc
+  DEPS downpour_client boost ${RPC_DEPS})
+cc_library(
+  server
+  SRCS server.cc
+  DEPS downpour_server boost ${RPC_DEPS})
+
+cc_library(
+  communicator
+  SRCS communicator/communicator.cc
+  DEPS scope
+       client
+       boost
+       table
+       math_function
+       selected_rows_functor
+       ${RPC_DEPS})
+cc_library(
+  ps_service
+  SRCS ps_service/service.cc
+  DEPS communicator client server boost ${RPC_DEPS})
+
+cc_library(
+  heter_client
+  SRCS heter_client.cc
+  DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
+cc_library(
+  heter_server
+  SRCS heter_server.cc
+  DEPS heter_client brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
+
+set_source_files_properties(
+  ps_service/graph_py_service.cc PROPERTIES COMPILE_FLAGS
+                                            ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(
+  graph_py_service
+  SRCS ps_service/graph_py_service.cc
+  DEPS ps_service)
 
 #add_subdirectory(communicator)
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
old mode 100755
new mode 100644
index 0959b651bb558..89466076b23d0
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+
 #include <memory>
 #include <sstream>
 #include <string>
 
-#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/framework/archive.h"
 
 static const int max_port = 65535;
@@ -245,8 +246,9 @@ int32_t BrpcPsClient::Initialize() {
 
 int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) {
   if (_cntls[request_idx]->Failed()) {
-    LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, "
-                                                  "err:"
+    LOG(ERROR) << "resquest cmd_id:" << cmd_id
+               << " failed, "
+                  "err:"
                << _cntls[request_idx]->ErrorText();
     return -1;
   }
@@ -263,8 +265,9 @@ int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) {
 int DownpourBrpcClosure::check_save_response(size_t request_idx, int cmd_id) {
   int32_t feasign_size = 0;
   if (_cntls[request_idx]->Failed()) {
-    LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, "
-                                                  "err:"
+    LOG(ERROR) << "resquest cmd_id:" << cmd_id
+               << " failed, "
+                  "err:"
                << _cntls[request_idx]->ErrorText();
     return -1;
   }
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h
index e2c16d496c42c..17b6bbe22cefe 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <ThreadPool.h>
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
index 8167c37b59987..d859acbb42e44 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
+
 #include <thread>  // NOLINT
+
 #include "butil/object_pool.h"
 #include "paddle/fluid/distributed/common/cost_timer.h"
 #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h"
diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.h b/paddle/fluid/distributed/ps/service/brpc_utils.h
index e68e15058f7b0..d4332744cebca 100644
--- a/paddle/fluid/distributed/ps/service/brpc_utils.h
+++ b/paddle/fluid/distributed/ps/service/brpc_utils.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <netdb.h>
+
 #include <iostream>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt
index 3610729d74d93..612358c71a6fb 100644
--- a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt
@@ -1,8 +1,15 @@
-
-
 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
 
-set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-
-
-cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS})
+set_source_files_properties(
+  communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
+cc_library(
+  communicator
+  SRCS communicator.cc
+  DEPS scope
+       client
+       boost
+       table
+       math_function
+       selected_rows_functor
+       ${RPC_DEPS})
diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
index c4b833f294e17..c50f1d909cd95 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
+
 #include <google/protobuf/text_format.h>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h
index 75676c392435c..5f2a0cbb90976 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.h
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <ThreadPool.h>
 #include <stdint.h>
+
 #include <atomic>
 #include <deque>
 #include <map>
@@ -30,6 +31,7 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
+#include "paddle/fluid/distributed/ps/service/ps_client.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
@@ -42,8 +44,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#include "paddle/fluid/distributed/ps/service/ps_client.h"
-
 namespace paddle {
 namespace distributed {
 class PSClient;
@@ -157,8 +157,9 @@ template <typename T>
 inline void MergeVars(const std::string &var_name,
                       const std::vector<std::shared_ptr<Variable>> &vars,
                       Scope *scope, bool merge_add = true) {
-  PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
-                                            "vector vars are empty."));
+  PADDLE_ENFORCE_NE(
+      vars.empty(), true,
+      platform::errors::InvalidArgument("vector vars are empty."));
   auto cpu_place = platform::CPUPlace();
   auto &var0 = vars[0];
   auto *out_var = scope->Var(var_name);
diff --git a/paddle/fluid/distributed/ps/service/env.h b/paddle/fluid/distributed/ps/service/env.h
index 162ee6f098422..0fddb17da7c41 100644
--- a/paddle/fluid/distributed/ps/service/env.h
+++ b/paddle/fluid/distributed/ps/service/env.h
@@ -18,11 +18,13 @@
 #include <glog/logging.h>
 #include <netinet/in.h>
 #include <stdio.h>
+
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "gflags/gflags.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
index c1df490669dbe..ff9680044dd6b 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h"
+
 #include <algorithm>
 #include <memory>
 #include <sstream>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "Eigen/Dense"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/distributed/ps/table/table.h"
@@ -149,7 +151,7 @@ std::future<int32_t> GraphBrpcClient::get_node_feat(
 std::future<int32_t> GraphBrpcClient::clear_nodes(uint32_t table_id,
                                                   int type_id, int idx_) {
   DownpourBrpcClosure *closure = new DownpourBrpcClosure(
-      server_size, [&, server_size = this->server_size ](void *done) {
+      server_size, [&, server_size = this->server_size](void *done) {
         int ret = 0;
         auto *closure = (DownpourBrpcClosure *)done;
         size_t fail_num = 0;
@@ -665,5 +667,5 @@ int32_t GraphBrpcClient::Initialize() {
   local_channel = NULL;
   return 0;
 }
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h
index 51f14bc57cde0..c038c840df97f 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h
@@ -15,11 +15,12 @@
 #pragma once
 
 #include <ThreadPool.h>
+
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
-#include <utility>
 #include "ThreadPool.h"
 #include "brpc/channel.h"
 #include "brpc/controller.h"
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
index 8ff12265269b2..5ce26b4525041 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
@@ -13,13 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h"
-#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 
 #include <thread>  // NOLINT
 #include <utility>
+
 #include "butil/endpoint.h"
 #include "iomanip"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/framework/archive.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h
index caf728701b289..726876bef1621 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h
@@ -14,12 +14,12 @@
 
 #pragma once
 
+#include <memory>
+#include <vector>
+
 #include "brpc/channel.h"
 #include "brpc/controller.h"
 #include "brpc/server.h"
-
-#include <memory>
-#include <vector>
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/distributed/ps/service/server.h"
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc
index fd0962caaaead..44c03ca1757e5 100755
--- a/paddle/fluid/distributed/ps/service/heter_client.cc
+++ b/paddle/fluid/distributed/ps/service/heter_client.cc
@@ -139,8 +139,9 @@ void HeterClient::SendAndRecvAsync(
       message_name, send_var_name_val, recv_var_name_val, *p_ctx, p_scope,
       &request, &request_io_buffer);
 
-  int micro_id = GetMicroId(ctx, p_scope);
+  int micro_id = GetMicroId(ctx, p_scope);  // global
   auto minibatch_id = micro_id / 10;
+  VLOG(4) << "micro_id: " << micro_id;
   // select channel according to micro id
   if (mode == "forward") {
     int num = minibatch_id % xpu_channels_.size();
diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h
old mode 100644
new mode 100755
index efaa48470a8bd..7683b8a16793e
--- a/paddle/fluid/distributed/ps/service/heter_client.h
+++ b/paddle/fluid/distributed/ps/service/heter_client.h
@@ -155,13 +155,13 @@ class HeterClient {
 
   // HeterClient singleton
   static std::shared_ptr<HeterClient> GetInstance(
-      const std::vector<std::string>& endpoint,
-      const std::vector<std::string>& previous_endpoint,
+      const std::vector<std::string>& endpoints,
+      const std::vector<std::string>& previous_endpoints,
       const int& trainer_id) {
     if (NULL == s_instance_) {
       s_instance_.reset(new HeterClient());
-      s_instance_->SetXpuList(endpoint);
-      s_instance_->SetPreviousXpuList(previous_endpoint);
+      s_instance_->SetXpuList(endpoints);
+      s_instance_->SetPreviousXpuList(previous_endpoints);
       s_instance_->SetTrainerID(trainer_id);
       s_instance_->CreateClient2XpuConnection();
     }
diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc
index fd38a030ff366..4440647ac94c4 100755
--- a/paddle/fluid/distributed/ps/service/heter_server.cc
+++ b/paddle/fluid/distributed/ps/service/heter_server.cc
@@ -94,7 +94,6 @@ void HeterServer::StartHeterInterService(bool neeed_encrypt) {
     VLOG(4) << "switch inter server server start success! listen on "
             << endpoint_inter_;
   }
-
   {
     std::lock_guard<std::mutex> lock(this->mutex_ready_);
     stoped_ = false;
@@ -115,9 +114,6 @@ void HeterServer::SetFanin(const int& fan_in) { service_.SetFanin(fan_in); }
 void HeterServer::WaitServerReady() {
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  while (!this->ready_) {
-    sleep(1);
-  }
 }
 
 int SendAndRecvVariableHandler::SaveInSwitchWithShard(
diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h
index ddcf36bf68d7b..97028066e6641 100755
--- a/paddle/fluid/distributed/ps/service/heter_server.h
+++ b/paddle/fluid/distributed/ps/service/heter_server.h
@@ -90,8 +90,10 @@ class ServiceHandlerBase {
 
 using SharedMiniScope =
     std::shared_ptr<std::unordered_map<int, ::paddle::framework::Scope*>>;
+
 using SharedMicroScope = std::shared_ptr<std::unordered_map<
     int, std::shared_ptr<std::vector<::paddle::framework::Scope*>>>>;
+
 using SharedTaskQueue = std::shared_ptr<
     std::unordered_map<int, std::shared_ptr<::paddle::framework::BlockingQueue<
                                 std::pair<std::string, int>>>>>;
@@ -226,6 +228,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
     auto* tensor = var->GetMutable<framework::LoDTensor>();
     auto data = reinterpret_cast<const float*>(tensor->data());
     auto micro_id = static_cast<int>(data[0]);
+    VLOG(4) << "micro_id in heter server: " << micro_id;
     int minibatch_index = micro_id / 10;
     int microbatch_index = micro_id % 10;
 
@@ -261,6 +264,9 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
     distributed::DeserializeFromMultiVarMsgAndIOBuf(
         *request, &request_io_buffer, *dev_ctx_, micro_scope);
     // blocking queue handles multi thread
+    VLOG(4) << "Handle in HeterServer: " << message_name << ", "
+            << microbatch_index;
+    VLOG(4) << "task_queue_ size: " << task_queue_->size();
     (*task_queue_)[minibatch_index]->Push(
         std::make_pair(message_name, microbatch_index));
 
@@ -274,6 +280,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
     distributed::SerializeToMultiVarMsgAndIOBuf(
         message_name, response_var_names, empty_var_names, *dev_ctx_,
         &local_scope, response, &response_io_buffer);
+    VLOG(4) << "Handle over";
     return 0;
   }
 
@@ -612,11 +619,9 @@ class HeterServer {
 
   // HeterWrapper singleton
   static std::shared_ptr<HeterServer> GetInstance() {
+    std::unique_lock<std::mutex> lock(mtx_);
     if (s_instance_ == nullptr) {
-      std::unique_lock<std::mutex> lock(mtx_);
-      if (NULL == s_instance_) {
-        s_instance_.reset(new HeterServer());
-      }
+      s_instance_.reset(new HeterServer());
     }
     return s_instance_;
   }
diff --git a/paddle/fluid/distributed/ps/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc
index f7df99ec13cdf..a0216f2a7953a 100644
--- a/paddle/fluid/distributed/ps/service/ps_client.cc
+++ b/paddle/fluid/distributed/ps/service/ps_client.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/service/ps_client.h"
+
 #include "glog/logging.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h"
diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h
index 926bb7e7c9fd3..adf096c8469c5 100644
--- a/paddle/fluid/distributed/ps/service/ps_client.h
+++ b/paddle/fluid/distributed/ps/service/ps_client.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/distributed/common/cost_timer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/service/env.h"
diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc
index bc024ed3175bc..b6407ccebe52b 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_client.cc
+++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/service/ps_local_client.h"
+
 #include "paddle/fluid/distributed/ps/table/table.h"
 
 //#define pslib_debug_dense_compress
@@ -316,5 +317,5 @@ ::std::future<int32_t> PsLocalClient::PushSparse(size_t table_id,
   table_ptr->Push(table_context);
   return done();
 }
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h
index 439ecf79f2f80..89c2f7446ac3b 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_client.h
+++ b/paddle/fluid/distributed/ps/service/ps_local_client.h
@@ -223,5 +223,5 @@ class PsLocalClient : public PSClient {
   float _mse = 0;
   uint16_t _push_times = 0;
 };
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h
index c09f8585b659d..2075e9dd2be28 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_server.h
+++ b/paddle/fluid/distributed/ps/service/ps_local_server.h
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/service/server.h"
 
 namespace paddle {
@@ -37,5 +38,5 @@ class PsLocalServer : public PSServer {
  private:
   virtual int32_t Initialize() { return 0; }
 };
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
index ced51b8cbe383..255c0d3d655aa 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h"
+
 #include <thread>  // NOLINT
+
 #include "butil/endpoint.h"
 #include "iomanip"
 #include "paddle/fluid/distributed/ps/table/table.h"
@@ -501,5 +503,5 @@ void GraphPyClient::StopServer() {
   if (status.get() == 0) stoped_ = true;
 }
 void GraphPyClient::FinalizeWorker() { this->worker_ptr->FinalizeWorker(); }
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
index 55beb9b3932a6..7dd0340125693 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <unistd.h>
+
 #include <condition_variable>  // NOLINT
 #include <fstream>
 #include <iomanip>
@@ -23,21 +24,20 @@
 #include <thread>  // NOLINT
 #include <unordered_map>
 #include <vector>
-#include "google/protobuf/text_format.h"
 
+#include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/variable.h"
-
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/service/env.h"
 #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h"
 #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h"
 #include "paddle/fluid/distributed/ps/service/ps_service/service.h"
 #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -198,5 +198,5 @@ class GraphPyClient : public GraphPyService {
   std::thread* client_thread;
   bool stoped_ = false;
 };
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.cc b/paddle/fluid/distributed/ps/service/ps_service/service.cc
index 9c3a06c2212e6..9eb5d49a4051c 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/service.cc
+++ b/paddle/fluid/distributed/ps/service/ps_service/service.cc
@@ -17,7 +17,9 @@
 #include <fcntl.h>
 #include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/text_format.h>
+
 #include <iostream>
+
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h
index c044e82884604..55bbbc06d878a 100644
--- a/paddle/fluid/distributed/ps/service/server.h
+++ b/paddle/fluid/distributed/ps/service/server.h
@@ -20,6 +20,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "butil/endpoint.h"
 #include "google/protobuf/service.h"
 #include "paddle/fluid/distributed/common/registerer.h"
diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt
index b8eff940a0dca..fdda59420f03c 100644
--- a/paddle/fluid/distributed/ps/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt
@@ -1,49 +1,125 @@
 set_property(GLOBAL PROPERTY TABLE_DEPS string_helper)
 set(graphDir graph)
 get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS)
-set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS
+                                       ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc)
-set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge)
-set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler)
-set_source_files_properties(memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS
+                                                   ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(
+  WeightedSampler
+  SRCS ${graphDir}/graph_weighted_sampler.cc
+  DEPS graph_edge)
+set_source_files_properties(
+  ${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS
+                                       ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(
+  graph_node
+  SRCS ${graphDir}/graph_node.cc
+  DEPS WeightedSampler)
+set_source_files_properties(
+  memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
 
 set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/")
-include_directories(${PADDLE_LIB_THIRD_PARTY_PATH}libmct/src/extern_libmct/libmct/include)
+include_directories(
+  ${PADDLE_LIB_THIRD_PARTY_PATH}libmct/src/extern_libmct/libmct/include)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
 
 set(TABLE_SRC memory_dense_table.cc barrier_table.cc common_graph_table.cc)
 #set(EXTERN_DEP rocksdb)
 
-cc_library(common_table SRCS ${TABLE_SRC} DEPS ${TABLE_DEPS}
-${RPC_DEPS} graph_edge graph_node device_context string_helper
-simple_threadpool xxhash generator)
+cc_library(
+  common_table
+  SRCS ${TABLE_SRC}
+  DEPS ${TABLE_DEPS}
+       ${RPC_DEPS}
+       graph_edge
+       graph_node
+       device_context
+       string_helper
+       simple_threadpool
+       xxhash
+       generator)
 
-set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
-cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context)
-cc_library(tensor_table SRCS DEPS eigen3 ps_framework_proto executor scope device_context tensor ${TABLE_DEPS})
-set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(
+  tensor_accessor
+  SRCS tensor_accessor.cc
+  DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context)
+cc_library(
+  tensor_table
+  SRCS
+  DEPS eigen3
+       ps_framework_proto
+       executor
+       scope
+       device_context
+       tensor
+       ${TABLE_DEPS})
+set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS
+                                                ${DISTRIBUTE_COMPILE_FLAGS})
 
-set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(ctr_dymf_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ctr_dymf_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(
+  memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS
+                                        ${DISTRIBUTE_COMPILE_FLAGS})
 
-cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
-cc_library(ctr_accessor SRCS ctr_accessor.cc ctr_double_accessor.cc sparse_accessor.cc ctr_dymf_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
-cc_library(sparse_table SRCS memory_sparse_table.cc ssd_sparse_table.cc memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table rocksdb)
+cc_library(
+  sparse_sgd_rule
+  SRCS sparse_sgd_rule.cc
+  DEPS ${TABLE_DEPS} ps_framework_proto)
+cc_library(
+  ctr_accessor
+  SRCS ctr_accessor.cc ctr_double_accessor.cc sparse_accessor.cc
+       ctr_dymf_accessor.cc
+  DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
+cc_library(
+  sparse_table
+  SRCS memory_sparse_table.cc ssd_sparse_table.cc memory_sparse_geo_table.cc
+  DEPS ps_framework_proto
+       ${TABLE_DEPS}
+       fs
+       afs_wrapper
+       ctr_accessor
+       common_table
+       rocksdb)
 
-cc_library(table SRCS table.cc DEPS sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
+cc_library(
+  table
+  SRCS table.cc
+  DEPS sparse_table
+       common_table
+       tensor_accessor
+       tensor_table
+       ps_framework_proto
+       string_helper
+       device_context
+       gflags
+       glog
+       boost)
 
 target_link_libraries(table -fopenmp)
diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h
index 7713c2bda295f..4db8ad0a55a5e 100644
--- a/paddle/fluid/distributed/ps/table/accessor.h
+++ b/paddle/fluid/distributed/ps/table/accessor.h
@@ -15,8 +15,10 @@
 #pragma once
 #include <stdint.h>
 #include <stdio.h>
+
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/distributed/common/afs_warpper.h"
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc
index 43dee275a3dc6..55a9c794e8ead 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.cc
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc
@@ -13,11 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
+
 #include <time.h>
+
 #include <algorithm>
 #include <chrono>
 #include <set>
 #include <sstream>
+
 #include "paddle/fluid/distributed/common/utils.h"
 #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
 #include "paddle/fluid/framework/generator.h"
@@ -212,7 +215,6 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx,
   for (size_t i = 0; i < bags.size(); i++) {
     if (bags[i].size() > 0) {
       tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int {
-
         char ch[sizeof(int) * 2 + sizeof(int64_t)];
         memset(ch, 0, sizeof(int));
         memcpy(ch + sizeof(int), &idx, sizeof(int));
@@ -353,7 +355,6 @@ void GraphTable::export_partition_files(int idx, std::string file_path) {
   for (int i = 0; i < part_len; i++) {
     tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
         [&, i, idx, this]() -> int {
-
           std::string output_path =
               file_path + "partition_" + std::to_string(i);
 
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h
index 25bec5276e729..6dd24df921dc1 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.h
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.h
@@ -17,6 +17,7 @@
 #include <ThreadPool.h>
 #include <assert.h>
 #include <pthread.h>
+
 #include <algorithm>
 #include <cassert>
 #include <cstdio>
@@ -36,6 +37,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/table/accessor.h"
 #include "paddle/fluid/distributed/ps/table/common_table.h"
 #include "paddle/fluid/distributed/ps/table/graph/class_macro.h"
@@ -670,4 +672,4 @@ struct hash<paddle::distributed::SampleKey> {
     return s.idx ^ s.node_key ^ s.sample_size;
   }
 };
-}
+}  // namespace std
diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h
index f69d9ccbf1453..280573f71947e 100644
--- a/paddle/fluid/distributed/ps/table/common_table.h
+++ b/paddle/fluid/distributed/ps/table/common_table.h
@@ -19,9 +19,8 @@
 #include <mutex>               // NOLINT
 #include <set>
 
-#include "paddle/fluid/distributed/ps/table/table.h"
-
 #include "paddle/fluid/distributed/common/utils.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc
index ef7311824faa6..254bbb96cad62 100644
--- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
+
 #include <gflags/gflags.h>
+
 #include "glog/logging.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h
index 327c4cea760eb..96ec5b8398d13 100644
--- a/paddle/fluid/distributed/ps/table/ctr_accessor.h
+++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h
@@ -15,7 +15,9 @@
 #pragma once
 #include <stdint.h>
 #include <stdio.h>
+
 #include <vector>
+
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc
index 4b84b7e8c36c3..2bde5271a0c43 100644
--- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h"
+
 #include <gflags/gflags.h>
+
 #include "glog/logging.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h
index 5b781b2621c5b..3134b46960409 100644
--- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h
+++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h
@@ -15,7 +15,9 @@
 #pragma once
 #include <stdint.h>
 #include <stdio.h>
+
 #include <vector>
+
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc
index 68f28640fc69e..6fb6675edde8d 100644
--- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
+
 #include <gflags/gflags.h>
+
 #include "glog/logging.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
index 6a9f5d28f5e59..c4bcd2bb3c98a 100644
--- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
+++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
@@ -15,7 +15,9 @@
 #pragma once
 #include <stdint.h>
 #include <stdio.h>
+
 #include <vector>
+
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
diff --git a/paddle/fluid/distributed/ps/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h
index aea757e8d5959..5e7c1cd438de8 100644
--- a/paddle/fluid/distributed/ps/table/depends/dense.h
+++ b/paddle/fluid/distributed/ps/table/depends/dense.h
@@ -15,13 +15,14 @@
 #pragma once
 
 #include <math.h>  // for sqrt in CPU and CUDA
+
 #include <functional>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/distributed/common/utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/table/depends/feature_value.h b/paddle/fluid/distributed/ps/table/depends/feature_value.h
index 36dc34808bd27..e6ab278787d47 100644
--- a/paddle/fluid/distributed/ps/table/depends/feature_value.h
+++ b/paddle/fluid/distributed/ps/table/depends/feature_value.h
@@ -14,10 +14,10 @@
 
 #pragma once
 
+#include <mct/hash-map.hpp>
 #include <vector>
-#include "gflags/gflags.h"
 
-#include <mct/hash-map.hpp>
+#include "gflags/gflags.h"
 #include "paddle/fluid/distributed/common/chunk_allocator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h
index adab0ee344bca..99530f72b1f74 100644
--- a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h
+++ b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <ThreadPool.h>
+
 #include <future>  // NOLINT
 #include <memory>
 #include <unordered_set>
diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h
index f46e659a88bab..7c707feacecc5 100644
--- a/paddle/fluid/distributed/ps/table/depends/initializers.h
+++ b/paddle/fluid/distributed/ps/table/depends/initializers.h
@@ -20,10 +20,9 @@
 #include <string>
 #include <utility>
 #include <vector>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/generator.h"
-
 #include "paddle/fluid/operators/truncated_gaussian_random_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
index 223c8fafd26ab..4ae3aa7459a17 100644
--- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
+++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
@@ -20,6 +20,7 @@
 #include <rocksdb/slice.h>
 #include <rocksdb/table.h>
 #include <rocksdb/write_batch.h>
+
 #include <iostream>
 #include <string>
 
@@ -153,5 +154,5 @@ class RocksDBHandler {
   std::vector<rocksdb::ColumnFamilyHandle*> _handles;
   rocksdb::DB* _db;
 };
-}  // distributed
-}  // paddle
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
index 004a536e8e56c..f2f346232d326 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
+++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/graph/graph_edge.h"
+
 #include <cstring>
 namespace paddle {
 namespace distributed {
@@ -25,5 +26,5 @@ void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
   id_arr.push_back(id);
   weight_arr.push_back(weight);
 }
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h
index 5fc785fe25682..6b929af679e50 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h
+++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h
@@ -43,5 +43,5 @@ class WeightedGraphEdgeBlob : public GraphEdgeBlob {
  protected:
   std::vector<float> weight_arr;
 };
-}
-}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc
index 366e607261f0c..d966bd6965364 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_node.cc
+++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
+
 #include <cstring>
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h
index c6c594036d4fc..13fdcf4c64e62 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_node.h
+++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h
@@ -18,6 +18,7 @@
 #include <memory>
 #include <sstream>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h"
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc
index 8186acec1be3d..4f5c86db3142b 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc
+++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h"
+
 #include <iostream>
 #include <memory>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/generator.h"
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h
index c10617022decb..cf83d27d7a2fd 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h
+++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h
@@ -18,6 +18,7 @@
 #include <random>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/table/graph/graph_edge.h"
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/ps/table/memory_dense_table.h b/paddle/fluid/distributed/ps/table/memory_dense_table.h
index 73653fbc2eb57..87a3f8661ae93 100644
--- a/paddle/fluid/distributed/ps/table/memory_dense_table.h
+++ b/paddle/fluid/distributed/ps/table/memory_dense_table.h
@@ -17,7 +17,9 @@
 #include <ThreadPool.h>
 #include <assert.h>
 #include <pthread.h>
+
 #include <string>
+
 #include "Eigen/Dense"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
 #include "paddle/fluid/distributed/ps/table/common_table.h"
diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h
index 60ba5d9602e44..bce9c774f1203 100644
--- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h
@@ -17,6 +17,7 @@
 #include <assert.h>
 // #include <pthread.h>
 #include <stdint.h>
+
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc
index ee6a801fa9183..464f788b454e8 100644
--- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc
@@ -12,15 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
+
 #include <omp.h>
-#include <sstream>
 
-#include "paddle/fluid/distributed/common/cost_timer.h"
-#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
-#include "paddle/fluid/framework/io/fs.h"
+#include <sstream>
 
 #include "boost/lexical_cast.hpp"
 #include "glog/logging.h"
+#include "paddle/fluid/distributed/common/cost_timer.h"
+#include "paddle/fluid/framework/io/fs.h"
 #include "paddle/fluid/platform/enforce.h"
 
 DEFINE_bool(pserver_print_missed_key_num_every_push, false,
@@ -272,9 +273,8 @@ int32_t MemorySparseTable::Save(const std::string& dirname,
         if (_value_accesor->Save(it.value().data(), save_param)) {
           std::string format_value = _value_accesor->ParseToString(
               it.value().data(), it.value().size());
-          if (0 !=
-              write_channel->write_line(paddle::string::format_string(
-                  "%lu %s", it.key(), format_value.c_str()))) {
+          if (0 != write_channel->write_line(paddle::string::format_string(
+                       "%lu %s", it.key(), format_value.c_str()))) {
             ++retry_num;
             is_write_failed = true;
             LOG(ERROR)
diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
index 6516c75a5d696..7b7a47ff998b1 100644
--- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
@@ -17,12 +17,14 @@
 #include <ThreadPool.h>
 #include <assert.h>
 #include <pthread.h>
+
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "Eigen/Dense"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
 #include "paddle/fluid/distributed/ps/table/common_table.h"
diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc
index bc537880f1c21..772ff5d1fc5cc 100644
--- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
+
 #include <gflags/gflags.h>
+
 #include "glog/logging.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.h b/paddle/fluid/distributed/ps/table/sparse_accessor.h
index 875904847b2ea..5e76365901c27 100644
--- a/paddle/fluid/distributed/ps/table/sparse_accessor.h
+++ b/paddle/fluid/distributed/ps/table/sparse_accessor.h
@@ -15,7 +15,9 @@
 #pragma once
 #include <stdint.h>
 #include <stdio.h>
+
 #include <vector>
+
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
index 8471b93612828..a9a4c9beae22c 100644
--- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
+
 #include <gflags/gflags.h>
+
 #include "glog/logging.h"
 
 DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient");
diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h
index 55a37b5941921..0f7766e20a326 100644
--- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h
+++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h
@@ -14,8 +14,10 @@
 
 #pragma once
 #include <math.h>
+
 #include <thread>
 #include <vector>
+
 #include "glog/logging.h"                                  // for CHECK
 #include "paddle/fluid/distributed/common/local_random.h"  // for local_uniform_real_distribution
 #include "paddle/fluid/distributed/common/registerer.h"
diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
index b1359d1323d89..7e1128baa0cd6 100644
--- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h"
+
 #include "paddle/fluid/distributed/common/cost_timer.h"
 #include "paddle/fluid/distributed/common/local_random.h"
 #include "paddle/fluid/distributed/common/topk_calculator.h"
@@ -362,9 +363,8 @@ int32_t SSDSparseTable::Save(const std::string& path,
         if (_value_accesor->Save(it.value().data(), save_param)) {
           std::string format_value = _value_accesor->ParseToString(
               it.value().data(), it.value().size());
-          if (0 !=
-              write_channel->write_line(paddle::string::format_string(
-                  "%lu %s", it.key(), format_value.c_str()))) {
+          if (0 != write_channel->write_line(paddle::string::format_string(
+                       "%lu %s", it.key(), format_value.c_str()))) {
             ++retry_num;
             is_write_failed = true;
             LOG(ERROR) << "SSDSparseTable save failed, retry it! path:"
@@ -597,9 +597,8 @@ int32_t SSDSparseTable::SaveCache(
   while (shuffled_channel->Read(data)) {
     for (auto& t : data) {
       ++feasign_size;
-      if (0 !=
-          write_channel->write_line(paddle::string::format_string(
-              "%lu %s", t.first, t.second.c_str()))) {
+      if (0 != write_channel->write_line(paddle::string::format_string(
+                   "%lu %s", t.first, t.second.c_str()))) {
         LOG(ERROR) << "Cache Table save failed, "
                       "path:"
                    << channel_config.path << ", retry it!";
diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc
index ef2eb3a746f66..cfa286f1c3f7f 100644
--- a/paddle/fluid/distributed/ps/table/table.cc
+++ b/paddle/fluid/distributed/ps/table/table.cc
@@ -16,13 +16,11 @@
 
 #include "glog/logging.h"
 #include "paddle/fluid/distributed/common/registerer.h"
-
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
-#include "paddle/fluid/distributed/ps/table/memory_dense_table.h"
-
 #include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
 #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h"
 #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
+#include "paddle/fluid/distributed/ps/table/memory_dense_table.h"
 #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
 #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
 #include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h
index 48fda782d489f..0c56b48a246d2 100644
--- a/paddle/fluid/distributed/ps/table/table.h
+++ b/paddle/fluid/distributed/ps/table/table.h
@@ -15,11 +15,13 @@
 #pragma once
 
 #include <assert.h>
+
 #include <atomic>
 #include <future>  // NOLINT
 #include <memory>
 #include <string>
 #include <utility>
+
 #include "paddle/fluid/distributed/common/afs_warpper.h"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
 #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h"
diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
index 5d1f69b7463da..880583f36842d 100644
--- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
+
 #include "Eigen/Dense"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h
index fad31d5df7f47..a5225127534a0 100644
--- a/paddle/fluid/distributed/ps/table/tensor_accessor.h
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
 
diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt
index 6279b6aa95412..8b5457ef9eea5 100644
--- a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt
@@ -1,9 +1,18 @@
-
 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
 
-set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_library(fleet
-        SRCS fleet.cc
-        DEPS framework_proto ps_framework_proto ps_service variable_helper scope op_registry fs shell ${RPC_DEPS})
+set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS
+                                                ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(
+  fleet
+  SRCS fleet.cc
+  DEPS framework_proto
+       ps_framework_proto
+       ps_service
+       variable_helper
+       scope
+       op_registry
+       fs
+       shell
+       ${RPC_DEPS})
 
 target_link_libraries(fleet z)
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc
index 955ba75e672d1..b9754d7b9debb 100644
--- a/paddle/fluid/distributed/ps/wrapper/fleet.cc
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
+
 #include <google/protobuf/text_format.h>
 
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
 #include "paddle/fluid/distributed/ps/table/table.h"
-#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h
index ce109b63cce9c..f88c478724b8b 100644
--- a/paddle/fluid/distributed/ps/wrapper/fleet.h
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.h
@@ -49,8 +49,8 @@ class PSCore;
 
 using framework::LoDTensor;
 using framework::Scope;
-using phi::SelectedRows;
 using framework::Variable;
+using phi::SelectedRows;
 
 using RpcCtxMap = std::unordered_map<std::string, CommContext>;
 
diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
old mode 100755
new mode 100644
index ca02ad31195ef..0156c0b42db05
--- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
+++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
@@ -49,8 +49,8 @@ class PSCore;
 
 using framework::LoDTensor;
 using framework::Scope;
-using phi::SelectedRows;
 using framework::Variable;
+using phi::SelectedRows;
 
 using RpcCtxMap = std::unordered_map<std::string, CommContext>;
 
diff --git a/paddle/fluid/distributed/store/CMakeLists.txt b/paddle/fluid/distributed/store/CMakeLists.txt
index 1fde447d97dd9..cfab4aad5f795 100644
--- a/paddle/fluid/distributed/store/CMakeLists.txt
+++ b/paddle/fluid/distributed/store/CMakeLists.txt
@@ -1 +1,4 @@
-cc_library(tcp_store SRCS tcp_store.cc tcp_utils.cc DEPS enforce glog)
+cc_library(
+  tcp_store
+  SRCS tcp_store.cc tcp_utils.cc
+  DEPS enforce glog)
diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc
index ec6f0e26a08fa..a46b4b32c9f18 100644
--- a/paddle/fluid/distributed/store/tcp_store.cc
+++ b/paddle/fluid/distributed/store/tcp_store.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/distributed/store/tcp_store.h"
+
 #include <chrono>
 #include <iostream>
 #include <thread>
 
-#include "paddle/fluid/distributed/store/tcp_store.h"
 #include "paddle/fluid/distributed/store/tcp_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/flags.h"
diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc
index a28cba288333d..466cd11fa5d3d 100644
--- a/paddle/fluid/distributed/store/tcp_utils.cc
+++ b/paddle/fluid/distributed/store/tcp_utils.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/store/tcp_utils.h"
+
 #include <cerrno>
 #include <cstring>
 #include <thread>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -51,12 +53,13 @@ ::addrinfo* get_addr_info(const std::string host, const std::string port,
   int n;
   n = ::getaddrinfo(node, port_cstr, &hints, &res);
   const char* gai_err = ::gai_strerror(n);
-  const char* proto =
-      (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : "");
-  PADDLE_ENFORCE_EQ(
-      n, 0, platform::errors::InvalidArgument(
-                "%s network %s:%s cannot be obtained. Details: %s.", proto,
-                host, port, gai_err));
+  const char* proto = (family == AF_INET    ? "IPv4"
+                       : family == AF_INET6 ? "IPv6"
+                                            : "");
+  PADDLE_ENFORCE_EQ(n, 0,
+                    platform::errors::InvalidArgument(
+                        "%s network %s:%s cannot be obtained. Details: %s.",
+                        proto, host, port, gai_err));
 
   return res;
 }
@@ -79,10 +82,11 @@ SocketType tcp_connect(const std::string host, const std::string port,
   do {
     for (::addrinfo* cur = res; cur != nullptr; cur = cur->ai_next) {
       sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
-      PADDLE_ENFORCE_GT(sockfd, 0, platform::errors::InvalidArgument(
-                                       "Create socket to connect %s:%s failed. "
-                                       "Details: %s. ",
-                                       host, port, socket_error().message()));
+      PADDLE_ENFORCE_GT(sockfd, 0,
+                        platform::errors::InvalidArgument(
+                            "Create socket to connect %s:%s failed. "
+                            "Details: %s. ",
+                            host, port, socket_error().message()));
 
       if (::connect(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) {
         retry = false;
diff --git a/paddle/fluid/distributed/store/tcp_utils.h b/paddle/fluid/distributed/store/tcp_utils.h
index 60cb3de124da3..ec9f610a18c17 100644
--- a/paddle/fluid/distributed/store/tcp_utils.h
+++ b/paddle/fluid/distributed/store/tcp_utils.h
@@ -29,6 +29,7 @@
 #include <chrono>
 #include <iostream>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 
 // Utility functions for TCP socket.
@@ -73,9 +74,10 @@ void send_bytes(SocketType socket, const T* buffer, size_t len) {
 
   while (to_send > 0) {
     auto byte_sent = ::send(socket, ptr, to_send, 0);
-    PADDLE_ENFORCE_GT(byte_sent, 0, platform::errors::InvalidArgument(
-                                        "TCP send error. Details: %s.",
-                                        socket_error().message()));
+    PADDLE_ENFORCE_GT(
+        byte_sent, 0,
+        platform::errors::InvalidArgument("TCP send error. Details: %s.",
+                                          socket_error().message()));
     to_send -= byte_sent;
     ptr += byte_sent;
   }
@@ -91,9 +93,10 @@ void receive_bytes(SocketType socket, T* buffer, size_t len) {
 
   while (to_recv > 0) {
     auto byte_received = ::recv(socket, ptr, to_recv, 0);
-    PADDLE_ENFORCE_GT(byte_received, 0, platform::errors::InvalidArgument(
-                                            "TCP receive error. Details: %s.",
-                                            socket_error().message()));
+    PADDLE_ENFORCE_GT(
+        byte_received, 0,
+        platform::errors::InvalidArgument("TCP receive error. Details: %s.",
+                                          socket_error().message()));
 
     to_recv -= byte_received;
     ptr += byte_received;
diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt
index 9f339d7ee2c08..9b7a304b0a92a 100644
--- a/paddle/fluid/distributed/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/test/CMakeLists.txt
@@ -1,46 +1,144 @@
-set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor
-ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS})
+set_source_files_properties(
+  table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  table_test
+  SRCS table_test.cc
+  DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}
+       ${RPC_DEPS})
 
-set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table
-tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS})
+set_source_files_properties(
+  dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  dense_table_test
+  SRCS dense_table_test.cc
+  DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}
+       ${RPC_DEPS})
 
-set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
+set_source_files_properties(
+  barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  barrier_table_test
+  SRCS barrier_table_test.cc
+  DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
 
-set_source_files_properties(brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(brpc_service_dense_sgd_test SRCS brpc_service_dense_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
+set_source_files_properties(
+  brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS
+                                            ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  brpc_service_dense_sgd_test
+  SRCS brpc_service_dense_sgd_test.cc
+  DEPS scope
+       server
+       client
+       communicator
+       ps_service
+       boost
+       table
+       ps_framework_proto
+       ${COMMON_DEPS})
 
-set_source_files_properties(brpc_service_sparse_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
+set_source_files_properties(
+  brpc_service_sparse_sgd_test.cc PROPERTIES COMPILE_FLAGS
+                                             ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  brpc_service_sparse_sgd_test
+  SRCS brpc_service_sparse_sgd_test.cc
+  DEPS scope
+       server
+       client
+       communicator
+       ps_service
+       boost
+       table
+       ps_framework_proto
+       ${COMMON_DEPS})
 
-set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS})
+set_source_files_properties(
+  brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  brpc_utils_test
+  SRCS brpc_utils_test.cc
+  DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS})
 
-set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
+set_source_files_properties(
+  graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  graph_node_test
+  SRCS graph_node_test.cc
+  DEPS graph_py_service
+       scope
+       server
+       client
+       communicator
+       ps_service
+       boost
+       table
+       ps_framework_proto
+       ${COMMON_DEPS})
 
-set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
+set_source_files_properties(
+  graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  graph_node_split_test
+  SRCS graph_node_split_test.cc
+  DEPS graph_py_service
+       scope
+       server
+       client
+       communicator
+       ps_service
+       boost
+       table
+       ps_framework_proto
+       ${COMMON_DEPS})
 
-set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS  table ps_framework_proto ${COMMON_DEPS})
+set_source_files_properties(
+  graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS
+                                        ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  graph_table_sample_test
+  SRCS graph_table_sample_test.cc
+  DEPS table ps_framework_proto ${COMMON_DEPS})
 
-set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table)
+set_source_files_properties(
+  feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  feature_value_test
+  SRCS feature_value_test.cc
+  DEPS ${COMMON_DEPS} boost table)
 
-set_source_files_properties(sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} boost table)
+set_source_files_properties(
+  sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  sparse_sgd_rule_test
+  SRCS sparse_sgd_rule_test.cc
+  DEPS ${COMMON_DEPS} boost table)
 
-set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table)
-set_source_files_properties(ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(ctr_dymf_accessor_test SRCS ctr_dymf_accessor_test.cc DEPS ${COMMON_DEPS} boost table)
+set_source_files_properties(
+  ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  ctr_accessor_test
+  SRCS ctr_accessor_test.cc
+  DEPS ${COMMON_DEPS} boost table)
+set_source_files_properties(
+  ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS
+                                       ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  ctr_dymf_accessor_test
+  SRCS ctr_dymf_accessor_test.cc
+  DEPS ${COMMON_DEPS} boost table)
 
+set_source_files_properties(
+  memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS
+                                         ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  memory_sparse_table_test
+  SRCS memory_sparse_table_test.cc
+  DEPS ${COMMON_DEPS} boost table)
 
-set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table)
-
-set_source_files_properties(memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(memory_sparse_geo_table_test SRCS memory_geo_table_test.cc DEPS ${COMMON_DEPS} boost table)
+set_source_files_properties(
+  memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  memory_sparse_geo_table_test
+  SRCS memory_geo_table_test.cc
+  DEPS ${COMMON_DEPS} boost table)
diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc b/paddle/fluid/distributed/test/barrier_table_test.cc
index c4c5b22992804..f540939c6fd8f 100644
--- a/paddle/fluid/distributed/test/barrier_table_test.cc
+++ b/paddle/fluid/distributed/test/barrier_table_test.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <ThreadPool.h>
+
 #include <unordered_map>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/table/common_table.h"
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index f9d57be95affe..c1467dae9a7e2 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <unistd.h>
+
 #include <string>
 #include <thread>  // NOLINT
 
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index 29195d9985728..bade56f239f65 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <unistd.h>
+
 #include <string>
 #include <thread>  // NOLINT
 
diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc
index 16ff9bd75840b..33367bf16b72e 100644
--- a/paddle/fluid/distributed/test/brpc_utils_test.cc
+++ b/paddle/fluid/distributed/test/brpc_utils_test.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/distributed/ps/service/brpc_utils.h"
+
 #include <string>
 
 #include "gtest/gtest.h"
-
-#include "paddle/fluid/distributed/ps/service/brpc_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc
index 27b6ddf722b70..51254391a4283 100644
--- a/paddle/fluid/distributed/test/ctr_accessor_test.cc
+++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
+
 #include <cmath>
 #include <iostream>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
diff --git a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc
index f6e773a414c7f..fbf179dbeeef0 100644
--- a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc
+++ b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
+
 #include <cmath>
 #include <iostream>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
diff --git a/paddle/fluid/distributed/test/dense_table_test.cc b/paddle/fluid/distributed/test/dense_table_test.cc
index 9529c776c120e..185d9d3aed1d4 100644
--- a/paddle/fluid/distributed/test/dense_table_test.cc
+++ b/paddle/fluid/distributed/test/dense_table_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <ThreadPool.h>
+
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/table/memory_dense_table.h"
diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc
index 32e3944d35a1c..6e848c3e2f4e4 100644
--- a/paddle/fluid/distributed/test/feature_value_test.cc
+++ b/paddle/fluid/distributed/test/feature_value_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
+
 #include <vector>
+
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc
index 395d7c1eace82..fa9b89d75c83c 100644
--- a/paddle/fluid/distributed/test/graph_node_split_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_split_test.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <unistd.h>
+
 #include <condition_variable>  // NOLINT
 #include <fstream>
 #include <iomanip>
@@ -17,8 +18,8 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <unordered_set>
 #include <vector>
-#include "google/protobuf/text_format.h"
 
+#include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc
index 3b43c2779ee4e..9cb244a9ec430 100644
--- a/paddle/fluid/distributed/test/graph_node_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_test.cc
@@ -9,7 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
+
 #include <unistd.h>
+
 #include <condition_variable>  // NOLINT
 #include <fstream>
 #include <iomanip>
@@ -17,8 +20,8 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <unordered_set>
 #include <vector>
-#include "google/protobuf/text_format.h"
 
+#include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
@@ -30,7 +33,6 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h"
 #include "paddle/fluid/distributed/ps/service/ps_service/service.h"
 #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
-#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc
index d7f6f2f34d77a..a3463162d276c 100644
--- a/paddle/fluid/distributed/test/graph_table_sample_test.cc
+++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <unistd.h>
+
+#include <chrono>
 #include <condition_variable>  // NOLINT
 #include <fstream>
 #include <iomanip>
@@ -20,9 +22,8 @@
 #include <thread>  // NOLINT
 #include <unordered_set>
 #include <vector>
-#include "google/protobuf/text_format.h"
 
-#include <chrono>
+#include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
diff --git a/paddle/fluid/distributed/test/memory_geo_table_test.cc b/paddle/fluid/distributed/test/memory_geo_table_test.cc
index ca3b51fade177..507211e69fa0f 100644
--- a/paddle/fluid/distributed/test/memory_geo_table_test.cc
+++ b/paddle/fluid/distributed/test/memory_geo_table_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <ThreadPool.h>
-
 #include <unistd.h>
+
 #include <string>
 #include <thread>  // NOLINT
 
diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc
index 68bc50373ffad..1689b7716bbc4 100644
--- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc
+++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc
@@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <ThreadPool.h>
+#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
 
+#include <ThreadPool.h>
 #include <unistd.h>
+
 #include <string>
 #include <thread>  // NOLINT
 
 #include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
-#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
 #include "paddle/fluid/distributed/ps/table/table.h"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc
index 1a4e16b926619..3a9a8d0b39ccd 100644
--- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc
+++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
+
 #include <cmath>
 #include <iostream>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 
diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc
index 4f73519ef5e69..56809abad0c7c 100644
--- a/paddle/fluid/distributed/test/table_test.cc
+++ b/paddle/fluid/distributed/test/table_test.cc
@@ -30,4 +30,4 @@ TEST(Table, Initialize) {
   ASSERT_EQ(ret, -1);
 }
 }  // namespace distributed
-}  // // namespace paddle
+}  // namespace paddle
diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt
index 11c98e5da9dde..73d8539329a75 100644
--- a/paddle/fluid/eager/CMakeLists.txt
+++ b/paddle/fluid/eager/CMakeLists.txt
@@ -1,29 +1,82 @@
-set(eager_deps phi_api phi_dygraph_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta eager_nan_inf_utils grad_node_info grad_tensor_holder accumulation_node custom_operator_node)
+set(eager_deps
+    phi_api
+    phi_dygraph_api
+    hook_utils
+    tensor_utils
+    utils
+    global_utils
+    backward
+    phi_tensor
+    tracer
+    layer
+    autograd_meta
+    eager_nan_inf_utils
+    grad_node_info
+    grad_tensor_holder
+    accumulation_node
+    custom_operator_node)
 
-set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
-set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node)
+set(fluid_deps
+    tracer
+    layer
+    proto_desc
+    operator
+    op_registry
+    variable_helper
+    memcpy)
+set(generated_deps final_dygraph_function final_dygraph_node dygraph_function
+                   dygraph_node)
 
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    message("Performing Eager Dygraph Auto Code Generation")
-    add_subdirectory(auto_code_generator)
+  message("Performing Eager Dygraph Auto Code Generation")
+  add_subdirectory(auto_code_generator)
 endif()
 
 add_subdirectory(api)
 add_subdirectory(accumulation)
 add_subdirectory(custom_operator)
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    add_subdirectory(pylayer)
-    cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
-    add_dependencies(grad_tensor_holder eager_final_state_codegen)
-    cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune)
+  add_subdirectory(pylayer)
+  cc_library(
+    grad_tensor_holder
+    SRCS grad_tensor_holder.cc
+    DEPS grad_node_info gradient_accumulator)
+  add_dependencies(grad_tensor_holder eager_final_state_codegen)
+  cc_library(
+    backward
+    SRCS backward.cc
+    DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune)
 endif()
 
-cc_library(eager_nan_inf_utils SRCS nan_inf_utils.cc DEPS phi_tensor nan_inf_utils enforce)
-cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor)
+cc_library(
+  eager_nan_inf_utils
+  SRCS nan_inf_utils.cc
+  DEPS phi_tensor nan_inf_utils enforce)
+cc_library(
+  grad_node_info
+  SRCS grad_node_info.cc
+  DEPS phi_api phi_tensor)
 
-cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor)
-cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
+cc_library(
+  autograd_meta
+  SRCS autograd_meta.cc
+  DEPS phi_api phi_tensor)
+cc_library(
+  utils
+  SRCS utils.cc
+  DEPS phi_api
+       phi_tensor
+       global_utils
+       layer
+       proto_desc
+       operator
+       op_registry
+       variable_helper
+       memcpy
+       scale_op
+       autograd_meta
+       hook_utils)
 
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    add_subdirectory(tests)
+  add_subdirectory(tests)
 endif()
diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt
index 0531aa5aab373..297e853947dfb 100644
--- a/paddle/fluid/eager/accumulation/CMakeLists.txt
+++ b/paddle/fluid/eager/accumulation/CMakeLists.txt
@@ -1 +1,4 @@
-cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator phi_api grad_node_info)
+cc_library(
+  accumulation_node
+  SRCS accumulation_node.cc
+  DEPS gradient_accumulator phi_api grad_node_info)
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc
index 544e7c8fe85d6..09db68399f332 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.cc
+++ b/paddle/fluid/eager/accumulation/accumulation_node.cc
@@ -13,17 +13,15 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
+
+#include "glog/logging.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
-
-#include "paddle/phi/api/all.h"
-#include "paddle/phi/core/dense_tensor.h"
-
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
-
-#include "glog/logging.h"
+#include "paddle/phi/api/all.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 namespace egr {
 
@@ -72,8 +70,7 @@ paddle::small_vector<std::vector<paddle::experimental::Tensor>,
 GradNodeAccumulation::operator()(
     paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                          kSlotSmallVectorSize>& grads,  // NOLINT
-    bool create_graph,
-    bool is_new_grad) {
+    bool create_graph, bool is_new_grad) {
   VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
   PADDLE_ENFORCE(grads.size() == 1,
                  paddle::platform::errors::Fatal(
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h
index 6374534578cb8..7694e290bab95 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.h
+++ b/paddle/fluid/eager/accumulation/accumulation_node.h
@@ -41,8 +41,7 @@ class GradNodeAccumulation : public GradNodeBase {
                                kSlotSmallVectorSize>
   operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                                   kSlotSmallVectorSize>& grads,  // NOLINT
-             bool create_graph = false,
-             bool is_new_grad = false) override;
+             bool create_graph = false, bool is_new_grad = false) override;
 
   void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
 
diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h
index 2145f4a11965c..2834f7d5dc0b9 100644
--- a/paddle/fluid/eager/amp_utils.h
+++ b/paddle/fluid/eager/amp_utils.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/imperative/amp_auto_cast.h"
 
diff --git a/paddle/fluid/eager/api/CMakeLists.txt b/paddle/fluid/eager/api/CMakeLists.txt
index 4c241fd5b721c..4525a58a44d48 100644
--- a/paddle/fluid/eager/api/CMakeLists.txt
+++ b/paddle/fluid/eager/api/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_subdirectory(utils)
 add_subdirectory(generated)
 
-cc_library(eager_api SRCS all.cc DEPS tensor_utils hook_utils global_utils eager_scale)
+cc_library(
+  eager_api
+  SRCS all.cc
+  DEPS tensor_utils hook_utils global_utils eager_scale)
diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt
index 4f634c6884b45..3f6bb90d69baa 100644
--- a/paddle/fluid/eager/api/generated/CMakeLists.txt
+++ b/paddle/fluid/eager/api/generated/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_subdirectory(eager_generated)
 
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    add_subdirectory(fluid_generated)
+  add_subdirectory(fluid_generated)
 endif()
diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
index 81ff07b8963f9..f704d2a49184b 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt
@@ -1,6 +1,12 @@
-cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info)
+cc_library(
+  scale_node
+  SRCS scale_node.cc
+  DEPS global_utils phi phi_api grad_node_info)
 
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
-cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps})
-add_dependencies(final_dygraph_node eager_final_state_codegen)
+  cc_library(
+    final_dygraph_node
+    SRCS nodes.cc
+    DEPS ${eager_deps})
+  add_dependencies(final_dygraph_node eager_final_state_codegen)
 endif()
diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc
index 38f67cb5bdf2a..5adceb7e79af1 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc
@@ -13,16 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
+
+#include "glog/logging.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/eager/eager_tensor.h"
-
-#include "paddle/phi/kernels/scale_kernel.h"
-
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
-
-#include "glog/logging.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 
 namespace egr {
 
@@ -147,8 +145,7 @@ paddle::small_vector<std::vector<paddle::experimental::Tensor>,
 GradNodeScale::operator()(
     paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                          kSlotSmallVectorSize>& grads,  // NOLINT
-    bool create_graph,
-    bool is_new_grad) {
+    bool create_graph, bool is_new_grad) {
   // 1. Check Output Size
   VLOG(6) << "grad size is: " << grads.size();
   PADDLE_ENFORCE(
diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
index 04ff510944dd2..45872c97002aa 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
@@ -42,8 +42,7 @@ class GradNodeScale : public GradNodeBase {
                                kSlotSmallVectorSize>
   operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                                   kSlotSmallVectorSize>& grads,  // NOLINT
-             bool create_graph = false,
-             bool is_new_grad = false) override;
+             bool create_graph = false, bool is_new_grad = false) override;
 
   void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
 
diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
index c70bb80c35c78..8d6df647999bd 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt
@@ -1,6 +1,12 @@
-cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node)
+cc_library(
+  eager_scale
+  SRCS scale.cc
+  DEPS phi_api phi autograd_meta scale_node)
 
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
-cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps})
-add_dependencies(final_dygraph_function eager_final_state_codegen)
+  cc_library(
+    final_dygraph_function
+    SRCS dygraph_functions.cc
+    DEPS ${eager_deps})
+  add_dependencies(final_dygraph_function eager_final_state_codegen)
 endif()
diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
index 7a374d567d5d0..836216d64b009 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
+++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
@@ -23,11 +23,11 @@
  * **/
 
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h"
+
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/utils.h"
-
 #include "paddle/phi/api/all.h"
 
 namespace egr {
diff --git a/paddle/fluid/eager/api/utils/CMakeLists.txt b/paddle/fluid/eager/api/utils/CMakeLists.txt
index a2a380ebad6c5..1fd4905605ea8 100644
--- a/paddle/fluid/eager/api/utils/CMakeLists.txt
+++ b/paddle/fluid/eager/api/utils/CMakeLists.txt
@@ -1,3 +1,12 @@
-cc_library(tensor_utils SRCS tensor_utils.cc DEPS phi_api autograd_meta grad_node_info accumulation_node)
-cc_library(hook_utils SRCS hook_utils.cc DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node)
-cc_library(global_utils SRCS global_utils.cc DEPS place tracer)
+cc_library(
+  tensor_utils
+  SRCS tensor_utils.cc
+  DEPS phi_api autograd_meta grad_node_info accumulation_node)
+cc_library(
+  hook_utils
+  SRCS hook_utils.cc
+  DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node)
+cc_library(
+  global_utils
+  SRCS global_utils.cc
+  DEPS place tracer)
diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h
index 3c18efea20349..6a6a443f69333 100644
--- a/paddle/fluid/eager/api/utils/global_utils.h
+++ b/paddle/fluid/eager/api/utils/global_utils.h
@@ -17,6 +17,7 @@
 
 #include <atomic>
 #include <memory>
+
 #include "paddle/fluid/eager/type_defs.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/phi/api/ext/op_meta_info.h"
@@ -73,8 +74,9 @@ class Controller {
     return op_meta_info_map_;
   }
 
-  void MergeOpMetaInfoMap(const std::unordered_map<
-                          std::string, std::vector<paddle::OpMetaInfo>>& map) {
+  void MergeOpMetaInfoMap(
+      const std::unordered_map<std::string, std::vector<paddle::OpMetaInfo>>&
+          map) {
     op_meta_info_map_.insert(map.begin(), map.end());
   }
 
diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc
index 8ee646b718c2f..6493135141f6a 100644
--- a/paddle/fluid/eager/api/utils/hook_utils.cc
+++ b/paddle/fluid/eager/api/utils/hook_utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/api/utils/hook_utils.h"
+
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc
index 81ea92d1c3c48..84a9eb6dea6bb 100644
--- a/paddle/fluid/eager/api/utils/tensor_utils.cc
+++ b/paddle/fluid/eager/api/utils/tensor_utils.cc
@@ -13,17 +13,16 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
+
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/utils.h"
-
-#include "paddle/phi/api/all.h"
-
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/phi/api/all.h"
 
 namespace egr {
 namespace egr_utils_api {
diff --git a/paddle/fluid/eager/api/utils/tensor_utils.h b/paddle/fluid/eager/api/utils/tensor_utils.h
index ac6de72dbff39..158aa5c8d7dd0 100644
--- a/paddle/fluid/eager/api/utils/tensor_utils.h
+++ b/paddle/fluid/eager/api/utils/tensor_utils.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include "paddle/fluid/eager/eager_tensor.h"
-#include "paddle/phi/api/all.h"
+#include "paddle/phi/api/include/tensor.h"
 
 namespace egr {
 namespace egr_utils_api {
diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
index d673c64d9da3c..8c067074d6efd 100644
--- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt
@@ -1,99 +1,161 @@
 add_subdirectory(final_state_generator)
 
-set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
+set(EAGER_GENERETOR_DEPS
+    ${GLOB_OP_LIB}
+    ${GLOB_OPERATOR_DEPS}
+    pybind
+    proto_desc
+    executor
+    layer
+    tracer
+    engine
+    imperative_profiler
+    imperative_flag)
 
 add_executable(eager_generator eager_generator.cc)
 target_link_libraries(eager_generator ${EAGER_GENERETOR_DEPS})
 
-get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
 target_link_libraries(eager_generator ${os_dependency_modules})
 
 if(WITH_ROCM)
-    target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
+  target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB})
 endif()
 
 # Prepare file structure
-message("Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated")
-execute_process(
-    COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generate_file_structures.py" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/"
+message(
+  "Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated"
 )
+execute_process(
+  COMMAND
+    "${PYTHON_EXECUTABLE}"
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generate_file_structures.py"
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/")
 
-set(tmp_dygraph_forward_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h")
-set(tmp_dygraph_forward_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.tmp.cc")
-set(tmp_dygraph_node_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h")
-set(tmp_dygraph_node_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.cc")
-set(dygraph_forward_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h")
-set(dygraph_forward_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.cc")
-set(dygraph_node_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h")
-set(dygraph_node_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.cc")
+set(tmp_dygraph_forward_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h"
+)
+set(tmp_dygraph_forward_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.tmp.cc"
+)
+set(tmp_dygraph_node_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h"
+)
+set(tmp_dygraph_node_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.cc"
+)
+set(dygraph_forward_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
+)
+set(dygraph_forward_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.cc"
+)
+set(dygraph_node_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h"
+)
+set(dygraph_node_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.cc"
+)
 
 if(WIN32)
-    set(EAGER_CODEGEN_DEPS eager_generator)
-    if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
-      set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}")
-    else()
-      set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
-    endif()
-    
-    if(${CBLAS_PROVIDER} STREQUAL MKLML)
-      message("Copied libiomp5md.dll for Eager AutoCodeGen")
-      ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/libiomp5md.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${eager_generator_path}
-        DEPENDS mklml)
-      list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll)
-    else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
-      message("Copied openblas.dll for Eager AutoCodeGen")
-      ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/openblas.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${eager_generator_path}
-        DEPENDS extern_openblas)
-      list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll)
-    endif()
+  set(EAGER_CODEGEN_DEPS eager_generator)
+  if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
+    set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}")
+  else()
+    set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
+  endif()
+
+  if(${CBLAS_PROVIDER} STREQUAL MKLML)
+    message("Copied libiomp5md.dll for Eager AutoCodeGen")
+    add_custom_command(
+      OUTPUT ${eager_generator_path}/libiomp5md.dll
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB}
+              ${eager_generator_path}
+      DEPENDS mklml)
+    list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll)
+  else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
+    message("Copied openblas.dll for Eager AutoCodeGen")
+    add_custom_command(
+      OUTPUT ${eager_generator_path}/openblas.dll
+      COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB}
+              ${eager_generator_path}
+      DEPENDS extern_openblas)
+    list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll)
+  endif()
 
-    if(WITH_MKLDNN)
-      message("Copied mkldnn.dll for Eager AutoCodeGen")
-      ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/mkldnn.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path}
-        DEPENDS mkldnn)
-        list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll)
-    endif()
+  if(WITH_MKLDNN)
+    message("Copied mkldnn.dll for Eager AutoCodeGen")
+    add_custom_command(
+      OUTPUT ${eager_generator_path}/mkldnn.dll
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB}
+              ${eager_generator_path}
+      DEPENDS mkldnn)
+    list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll)
+  endif()
 
-    if(WITH_ONNXRUNTIME)
-      message("Copied onnxruntime for Eager AutoCodeGen")
-      ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/onnxruntime.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} ${eager_generator_path}
-        DEPENDS onnxruntime)
-        list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/onnxruntime.dll)
-      ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/paddle2onnx.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} ${eager_generator_path}
-        DEPENDS paddle2onnx)
-        list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/paddle2onnx.dll)
-    endif()
+  if(WITH_ONNXRUNTIME)
+    message("Copied onnxruntime for Eager AutoCodeGen")
+    add_custom_command(
+      OUTPUT ${eager_generator_path}/onnxruntime.dll
+      COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB}
+              ${eager_generator_path}
+      DEPENDS onnxruntime)
+    list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/onnxruntime.dll)
+    add_custom_command(
+      OUTPUT ${eager_generator_path}/paddle2onnx.dll
+      COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB}
+              ${eager_generator_path}
+      DEPENDS paddle2onnx)
+    list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/paddle2onnx.dll)
+  endif()
 
-    add_custom_target(eager_codegen
-      COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated"
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} ${dygraph_forward_h_path}
-      COMMENT "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}"
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} ${dygraph_forward_cc_path}
-      COMMENT "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}"
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} ${dygraph_node_h_path}
-      COMMENT "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}"
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} ${dygraph_node_cc_path}
-      COMMENT "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}"
-      DEPENDS ${EAGER_CODEGEN_DEPS}
-      VERBATIM)
+  add_custom_target(
+    eager_codegen
+    COMMAND
+      "${eager_generator_path}/eager_generator.exe"
+      "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path}
+            ${dygraph_forward_h_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path}
+            ${dygraph_forward_cc_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path}
+            ${dygraph_node_h_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path}
+            ${dygraph_node_cc_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}"
+    DEPENDS ${EAGER_CODEGEN_DEPS}
+    VERBATIM)
 else()
-    add_custom_target(eager_codegen
-          COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind"
-                "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" 
-                "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated"
-          COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} ${dygraph_forward_h_path}
-          COMMENT "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}"
-          COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} ${dygraph_forward_cc_path}
-          COMMENT "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}"
-          COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} ${dygraph_node_h_path}
-          COMMENT "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}"
-          COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} ${dygraph_node_cc_path}
-          COMMENT "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}"
-          DEPENDS eager_generator
-          VERBATIM)
+  add_custom_target(
+    eager_codegen
+    COMMAND
+      ${CMAKE_COMMAND} -E env
+      "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind"
+      "${CMAKE_CURRENT_BINARY_DIR}/eager_generator"
+      "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path}
+            ${dygraph_forward_h_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path}
+            ${dygraph_forward_cc_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path}
+            ${dygraph_node_h_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path}
+            ${dygraph_node_cc_path}
+    COMMENT
+      "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}"
+    DEPENDS eager_generator
+    VERBATIM)
 endif()
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
index 50dab6ce840a5..06668fa736570 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
@@ -1,39 +1,72 @@
-set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml")
-set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml")
-set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
-set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h")
-set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc")
-set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.h")
-set(forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.cc")
-set(forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h")
-set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.cc")
-set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h")
+set(api_yaml_path
+    "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml"
+)
+set(backward_yaml_path
+    "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml"
+)
+set(tmp_forwards_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc"
+)
+set(tmp_forwards_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h"
+)
+set(tmp_nodes_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc"
+)
+set(tmp_nodes_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.h"
+)
+set(forwards_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.cc"
+)
+set(forwards_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
+)
+set(nodes_cc_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.cc"
+)
+set(nodes_h_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
+)
 # StringTensor only needs forward api
-set(fwd_api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml")
+set(fwd_api_yaml_path
+    "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml")
 
 message("Final State Eager CodeGen")
-add_custom_target(eager_final_state_codegen
-    COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" 
-            "--api_yaml_path=${api_yaml_path}"
-            "--backward_yaml_path=${backward_yaml_path}"
-            "--forwards_cc_path=${tmp_forwards_cc_path}"
-            "--forwards_h_path=${tmp_forwards_h_path}"
-            "--nodes_cc_path=${tmp_nodes_cc_path}"
-            "--nodes_h_path=${tmp_nodes_h_path}"
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_cc_path} ${forwards_cc_path}
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_h_path} ${forwards_h_path}
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_cc_path} ${nodes_cc_path}
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_h_path} ${nodes_h_path}
-    VERBATIM
-)
+add_custom_target(
+  eager_final_state_codegen
+  COMMAND
+    "${PYTHON_EXECUTABLE}"
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py"
+    "--api_yaml_path=${api_yaml_path}"
+    "--backward_yaml_path=${backward_yaml_path}"
+    "--forwards_cc_path=${tmp_forwards_cc_path}"
+    "--forwards_h_path=${tmp_forwards_h_path}"
+    "--nodes_cc_path=${tmp_nodes_cc_path}" "--nodes_h_path=${tmp_nodes_h_path}"
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_cc_path}
+          ${forwards_cc_path}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_h_path}
+          ${forwards_h_path}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_cc_path}
+          ${nodes_cc_path}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_nodes_h_path}
+          ${nodes_h_path}
+  VERBATIM)
 
-set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h")
-set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h")
-
-add_custom_target(eager_final_state_python_c_codegen
-    COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" 
-            "--api_yaml_path=${api_yaml_path},${fwd_api_yaml_path}"
-            "--output_path=${tmp_python_c_output_path}"
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_python_c_output_path} ${python_c_output_path}
-    VERBATIM
+set(tmp_python_c_output_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h"
+)
+set(python_c_output_path
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h"
 )
+
+add_custom_target(
+  eager_final_state_python_c_codegen
+  COMMAND
+    "${PYTHON_EXECUTABLE}"
+    "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py"
+    "--api_yaml_path=${api_yaml_path},${fwd_api_yaml_path}"
+    "--output_path=${tmp_python_c_output_path}"
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_python_c_output_path}
+          ${python_c_output_path}
+  VERBATIM)
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
index 57681be58ae47..87b2ff986dc92 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
@@ -31,7 +31,8 @@
     "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad",
     "square_double_grad", "celu_double_grad", "pad_double_grad",
     "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad",
-    "conv3d_double_grad", "depthwise_conv2d_grad_grad"
+    "instance_norm_double_grad", "conv3d_double_grad",
+    "depthwise_conv2d_grad_grad"
 ])
 
 # For API dispatch used at python-level
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index d8b909c3bacc1..d23d71b07626d 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -1404,7 +1404,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str,
   const auto& out_metas = OutputMeta();
   paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs});
   for (int i = 0; i < {slot_num_bwd_outputs}; ++i) {{
-    returns[i].resize(out_metas[i].size());
+    out_metas[i].size() == 0 ? returns[i].resize(1) : returns[i].resize(out_metas[i].size());
   }}
 """
 
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index 63b899f6d6b62..36cfb4db1137a 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -13,27 +13,28 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/backward.h"
-#include <queue>
 
+#include <deque>
+
+#include "glog/logging.h"
+#include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/grad_tensor_holder.h"
 #include "paddle/fluid/eager/utils.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/platform/profiler/event_tracing.h"
-
-#include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
 
 namespace egr {
 
 /*
-* GeneralGrad is Helpper class to implement custom grad operation between
-* outputs and inputs.
-*
-* **/
+ * GeneralGrad is a helper class that implements custom grad operations
+ * between outputs and inputs.
+ *
+ * **/
 class GeneralGrad {
  public:
   static GeneralGrad& Instance() { return *general_grad_; }
@@ -100,19 +101,19 @@ class GeneralGrad {
     // make sure the path from root to target_node is ok
     std::unordered_set<GradNodeBase*> startup_ops;
     VLOG(6) << "Running in UpdateGraphInfo";
-    std::queue<GradNodeBase*> queue;
+    std::deque<GradNodeBase*> queue;
     for (auto& target_nodes_inputmeta_pair :
          input_target_nodes_inputmeta_map_) {
-      queue.emplace(target_nodes_inputmeta_pair.first);
+      queue.push_back(target_nodes_inputmeta_pair.first);
     }
 
     while (!queue.empty()) {
       auto* target_node = queue.front();
-      queue.pop();
+      queue.pop_front();
       if (!(depending_nodes_)[target_node].empty()) {
         auto precedding_nodes = (depending_nodes_)[target_node];
         for (auto pre_nodes : precedding_nodes) {
-          queue.emplace(pre_nodes);
+          queue.push_back(pre_nodes);
           if (potential_stop_nodes_.find(pre_nodes) !=
               potential_stop_nodes_.end()) {
             potential_stop_nodes_.erase(pre_nodes);
@@ -144,20 +145,20 @@ class GeneralGrad {
 
   // Get Graph Info Betweent input target GradNode and outputs，
   // record depending_nodes_、potential_stop_nodes_、potential_startup_nodes_
-  void GetGraphInfoBetweenTargets(const std::queue<GradNodeBase*>& init_queue) {
+  void GetGraphInfoBetweenTargets(const std::deque<GradNodeBase*>& init_queue) {
     VLOG(6) << "Runing In GetGraphInfoBetweenTargets";
 
     // Calculate in_degree for each node
     std::unordered_map<GradNodeBase*, int> node_in_degree_map;
 
     // Copy nodes
-    std::queue<GradNodeBase*> queue = init_queue;
+    std::deque<GradNodeBase*> queue = init_queue;
     std::unordered_set<GradNodeBase*> visited;
 
     // Visit each node exactly once in any order
     while (!queue.empty()) {
       GradNodeBase* node = queue.front();
-      queue.pop();
+      queue.pop_front();
 
       if (visited.count(node)) {
         continue;
@@ -198,7 +199,7 @@ class GeneralGrad {
 
           // Record depending relationship
           (depending_nodes_)[next_node].emplace(node);
-          queue.push(next_node);
+          queue.push_back(next_node);
         }
       }
     }
@@ -207,10 +208,10 @@ class GeneralGrad {
     UpdateGraphInfo();
   }
 
-  void ModifyReadyQueue(std::queue<GradNodeBase*>* queue) {
-    std::queue<GradNodeBase*> tmp_queue;
+  void ModifyReadyQueue(std::deque<GradNodeBase*>* queue) {
+    std::deque<GradNodeBase*> tmp_queue;
     for (auto nodes : potential_startup_nodes_) {
-      tmp_queue.emplace(nodes);
+      tmp_queue.push_back(nodes);
     }
     tmp_queue.swap(*queue);
   }
@@ -297,7 +298,7 @@ class GeneralGrad {
   void PreparedForGeneralGrad(
       const std::vector<paddle::experimental::Tensor>& inputs,
       const std::vector<paddle::experimental::Tensor>& no_grad_vars,
-      std::queue<GradNodeBase*>* queue,
+      std::deque<GradNodeBase*>* queue,
       const std::unordered_map<GradNodeBase*,
                                std::unique_ptr<GradTensorHolder>>&
           node_input_buffers_dict) {
@@ -366,14 +367,14 @@ class GeneralGrad {
   }
 
   void ReconstructBackwardGraph(
-      const std::queue<GradNodeBase*>& orig_init_queue) {
-    std::queue<GradNodeBase*> queue = orig_init_queue;
+      const std::deque<GradNodeBase*>& orig_init_queue) {
+    std::deque<GradNodeBase*> queue = orig_init_queue;
     std::unordered_set<GradNodeBase*> visited;
 
     // BFS and recursively copy the grad nodes
     while (!queue.empty()) {
       GradNodeBase* orig_node = queue.front();
-      queue.pop();
+      queue.pop_front();
       if (visited.count(orig_node)) {
         continue;
       }
@@ -417,7 +418,7 @@ class GeneralGrad {
           copied_edge.SetGradNode(copied_next_node);
 
           // Update BFS queue
-          queue.push(orig_next_node.get());
+          queue.push_back(orig_next_node.get());
         }
       }
     }
@@ -449,20 +450,20 @@ class GeneralGrad {
 };
 
 std::unordered_map<GradNodeBase*, int> getInDegreeMap(
-    const std::queue<GradNodeBase*>& init_queue) {
+    const std::deque<GradNodeBase*>& init_queue) {
   // Calculate in_degree for each node
   // We can completely remove this pass, if in_degree were set during forward
   // pass
   std::unordered_map<GradNodeBase*, int> node_in_degree_map;
 
   // Copy nodes
-  std::queue<GradNodeBase*> queue = init_queue;
+  std::deque<GradNodeBase*> queue = init_queue;
   std::unordered_set<GradNodeBase*> visited;
 
   // Visit each node exactly once in any order
   while (!queue.empty()) {
     GradNodeBase* node = queue.front();
-    queue.pop();
+    queue.pop_front();
 
     if (visited.count(node)) {
       continue;
@@ -490,7 +491,7 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
         if (!node_in_degree_map.count(next_node))
           node_in_degree_map[next_node] = 0;
         node_in_degree_map[next_node]++;
-        queue.push(next_node);
+        queue.push_back(next_node);
       }
     }
   }
@@ -548,8 +549,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
   /* --- Initialization --- */
   // 1. Init queue with starting nodes
   // 2. Prepare initial input buffers
-  std::queue<GradNodeBase*> queue;
-  std::queue<GradNodeBase*> orig_queue;
+  std::deque<GradNodeBase*> queue;
+  std::deque<GradNodeBase*> orig_queue;
   std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
       node_input_buffers_dict;
   for (size_t i = 0; i < tensors.size(); i++) {
@@ -582,7 +583,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
     GradNodeBase* grad_node = shared_grad_node.get();
     if (is_general_grad) {
       // Save orig grad node
-      orig_queue.push(grad_node);
+      orig_queue.push_back(grad_node);
 
       // Replace grad_node with copied grad_node
       grad_node = GeneralGrad::Instance().CopyGradNode(shared_grad_node);
@@ -625,7 +626,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
     }
 
     // Prepare queue, potential startup_nodes
-    queue.push(grad_node);
+    queue.push_back(grad_node);
   }
 
   if (is_general_grad) {
@@ -663,10 +664,10 @@ std::vector<paddle::experimental::Tensor> RunBackward(
         paddle::platform::TracerEventType::Operator, 1);
 
     if (queue.size() > 1 && node_in_degree_map[node] != 0) {
-      queue.pop();
+      queue.pop_front();
       continue;
     }
-    queue.pop();
+    queue.pop_front();
 
     // Run node: This is where Hook happens
     auto node_input_buffer_iter = node_input_buffers_dict.find(node);
@@ -798,11 +799,19 @@ std::vector<paddle::experimental::Tensor> RunBackward(
           bool is_potential_stop_node =
               GeneralGrad::Instance().GetPotentialStopNodes()->count(next_node);
           if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) {
-            queue.emplace(std::move(next_node));
+            if (dynamic_cast<egr::GradNodeAccumulation*>(next_node)) {
+              queue.push_front(std::move(next_node));
+            } else {
+              queue.push_back(std::move(next_node));
+            }
           }
         } else {
           if (node_in_degree_map[next_node] == 0) {
-            queue.emplace(std::move(next_node));
+            if (dynamic_cast<egr::GradNodeAccumulation*>(next_node)) {
+              queue.push_front(std::move(next_node));
+            } else {
+              queue.push_back(std::move(next_node));
+            }
           }
         }
       }
diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt
index ccc9a03a55660..424194557dd84 100644
--- a/paddle/fluid/eager/custom_operator/CMakeLists.txt
+++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt
@@ -1 +1,4 @@
-cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info)
+cc_library(
+  custom_operator_node
+  SRCS custom_operator_node.cc
+  DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info)
diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
index abdd8cadeed4c..3efcf3b21a4e3 100644
--- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/custom_operator/custom_operator_node.h"
+
 #include "paddle/fluid/framework/custom_operator.h"
 #include "paddle/fluid/framework/op_meta_info_helper.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc
index af387bb3238d1..71ccb072ce917 100644
--- a/paddle/fluid/eager/grad_node_info.cc
+++ b/paddle/fluid/eager/grad_node_info.cc
@@ -13,27 +13,24 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/grad_node_info.h"
+
+#include "glog/logging.h"
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/utils.h"
-
-#include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/sparse_coo_tensor.h"
-
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/var_type.h"
-
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
-
-#include "glog/logging.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
 
 /**
  * Implementation of GradNodeBase, Edge and GradTensorHolder.
-**/
+ **/
 namespace egr {
 
 static void CheckTensor(const paddle::experimental::Tensor& pre,
diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h
index 747e98b846616..9070ac9e5b652 100644
--- a/paddle/fluid/eager/grad_node_info.h
+++ b/paddle/fluid/eager/grad_node_info.h
@@ -179,14 +179,13 @@ class GradNodeBase {
                                kSlotSmallVectorSize>
   operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                                   kSlotSmallVectorSize>& grads,  // NOLINT
-             bool create_graph = false,
-             bool is_new_grad = false) = 0;
+             bool create_graph = false, bool is_new_grad = false) = 0;
 
   virtual void ClearTensorWrappers() = 0;
 
   /**
-       * Self-Copy interface designed for use in DoubleGrad
-       * **/
+   * Self-Copy interface designed for use in DoubleGrad
+   * **/
   virtual std::shared_ptr<GradNodeBase> Copy() const = 0;
 
   // adj_edges were moved inside OutputMeta(), so no available direct access
@@ -230,8 +229,8 @@ class GradNodeBase {
                                std::shared_ptr<egr::TensorHook>&& hook);
 
   /**
-  * Remove GradientHook
-  * **/
+   * Remove GradientHook
+   * **/
   bool RemoveGradientHook(const int64_t& hook_id) {
     auto remove_cnt = gradient_hooks_.erase(hook_id);
     if (remove_cnt == 0) {
@@ -252,8 +251,8 @@ class GradNodeBase {
                                  kSlotSmallVectorSize>& tensors);
 
   /**
-    * Handle Complex - Real Type Promotion
-    * **/
+   * Handle Complex - Real Type Promotion
+   * **/
   void HandleComplexGradToRealGrad(
       paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                            kSlotSmallVectorSize>* out_grads);
@@ -262,8 +261,8 @@ class GradNodeBase {
   virtual std::string name() { return "GradNodeBase"; }
 
   /**
-       * The following interfaces are designed for no_need_buffer
-       * **/
+   * The following interfaces are designed for no_need_buffer
+   * **/
   bool IsTensorWrappersCleared() { return is_tensor_wrappers_cleared_; }
 
   void SetIsTensorWrappersCleared(bool is_tensor_wrappers_cleared) {
diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc
index 64fb8b53b473c..6abf759cdba7a 100644
--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/grad_tensor_holder.h"
-#include "paddle/fluid/imperative/gradient_accumulator.h"
 
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/imperative/gradient_accumulator.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace egr {
diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h
index 097150cf5ed59..a98b3d9f8e4df 100644
--- a/paddle/fluid/eager/hooks.h
+++ b/paddle/fluid/eager/hooks.h
@@ -18,6 +18,7 @@
 #include <memory>
 #include <utility>
 #include <vector>
+
 #include "paddle/phi/api/include/tensor.h"
 namespace egr {
 
diff --git a/paddle/fluid/eager/pylayer/CMakeLists.txt b/paddle/fluid/eager/pylayer/CMakeLists.txt
index 59030342eccad..4b0ad071117bc 100644
--- a/paddle/fluid/eager/pylayer/CMakeLists.txt
+++ b/paddle/fluid/eager/pylayer/CMakeLists.txt
@@ -1 +1,4 @@
-cc_library(py_layer_node SRCS py_layer_node.cc DEPS pybind phi_api grad_node_info)
+cc_library(
+  py_layer_node
+  SRCS py_layer_node.cc
+  DEPS pybind phi_api grad_node_info)
diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc
index a00b292fe0915..ec17a324b1ec9 100644
--- a/paddle/fluid/eager/pylayer/py_layer_node.cc
+++ b/paddle/fluid/eager/pylayer/py_layer_node.cc
@@ -13,18 +13,16 @@
 // limitations under the License.
 
 #include "paddle/fluid/eager/pylayer/py_layer_node.h"
-#include "paddle/fluid/eager/eager_tensor.h"
-
-#include "paddle/phi/api/all.h"
-#include "paddle/phi/core/dense_tensor.h"
 
+#include "glog/logging.h"
+#include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/eager_utils.h"
-
-#include "glog/logging.h"
+#include "paddle/phi/api/all.h"
+#include "paddle/phi/core/dense_tensor.h"
 #pragma GCC diagnostic ignored "-Wattributes"
 #include "pybind11/pytypes.h"
 
@@ -34,8 +32,7 @@ paddle::small_vector<std::vector<paddle::experimental::Tensor>,
 GradNodePyLayer::operator()(
     paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                          kSlotSmallVectorSize>& grads,  // NOLINT
-    bool create_graph,
-    bool is_new_grad) {
+    bool create_graph, bool is_new_grad) {
   VLOG(3) << "Running Eager Backward Node: " << name();
 
   paddle::small_vector<std::vector<paddle::experimental::Tensor>,
diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h
index c1a8c6e626b4f..998480bbfebfa 100644
--- a/paddle/fluid/eager/pylayer/py_layer_node.h
+++ b/paddle/fluid/eager/pylayer/py_layer_node.h
@@ -38,8 +38,7 @@ class GradNodePyLayer : public GradNodeBase {
                                kSlotSmallVectorSize>
   operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                                   kSlotSmallVectorSize>& grads,  // NOLINT
-             bool create_graph = false,
-             bool is_new_grad = false) override;
+             bool create_graph = false, bool is_new_grad = false) override;
 
   void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
 
diff --git a/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt b/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt
index 76c59561fc0bb..90159e9b8c32e 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt
+++ b/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt
@@ -1,9 +1,27 @@
-cc_test(test_egr_ds_eager_tensor SRCS eager_tensor_test.cc DEPS ${eager_deps})
-cc_test(test_egr_ds_auotgrad_meta SRCS autograd_meta_test.cc DEPS ${eager_deps})
-cc_test(test_egr_ds_grad_node_info SRCS grad_node_info_test.cc DEPS ${eager_deps})
-cc_test(test_egr_ds_accumulation_node SRCS accumulation_node_test.cc DEPS ${eager_deps})
-cc_test(test_egr_ds_tensor_wrapper SRCS tensor_wrapper_test.cc DEPS ${eager_deps})
+cc_test(
+  test_egr_ds_eager_tensor
+  SRCS eager_tensor_test.cc
+  DEPS ${eager_deps})
+cc_test(
+  test_egr_ds_auotgrad_meta
+  SRCS autograd_meta_test.cc
+  DEPS ${eager_deps})
+cc_test(
+  test_egr_ds_grad_node_info
+  SRCS grad_node_info_test.cc
+  DEPS ${eager_deps})
+cc_test(
+  test_egr_ds_accumulation_node
+  SRCS accumulation_node_test.cc
+  DEPS ${eager_deps})
+cc_test(
+  test_egr_ds_tensor_wrapper
+  SRCS tensor_wrapper_test.cc
+  DEPS ${eager_deps})
 
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    cc_test(test_egr_ds_grad_tensor_holder SRCS grad_tensor_holder_test.cc DEPS ${eager_deps} ${generated_deps})
+  cc_test(
+    test_egr_ds_grad_tensor_holder
+    SRCS grad_tensor_holder_test.cc
+    DEPS ${eager_deps} ${generated_deps})
 endif()
diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
index c159084d683e8..c53ffe823abba 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/accumulation/accumulation_node.h"
+
 #include <sstream>
 
 #include "gtest/gtest.h"
-
-#include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/utils/hook_utils.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
diff --git a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc
index 48b4b9c57487a..f7415dd1f713d 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/autograd_meta.h"
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
-#include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h"
diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
index edbb441f27a08..a82965303af14 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/eager_tensor.h"
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
-#include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/imperative/var_helper.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/layout.h"
@@ -35,7 +35,7 @@ class AutogradMetaTest : public AbstractAutogradMeta {
   explicit AutogradMetaTest(int val) : val_(val) {}
   int val_ = 0;
 };
-}
+}  // namespace eager_test
 TEST(Tensor, Constructor) {
   paddle::experimental::Tensor et1 = paddle::experimental::Tensor();
   paddle::experimental::Tensor et2 = paddle::experimental::Tensor("et2");
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
index 6687b6621ad54..63a4a72b631d6 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/grad_node_info.h"
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
-#include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
@@ -85,8 +85,8 @@ void TestGradNodeBase(bool is_remove_gradient_hook) {
   CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1));
 
   VLOG(6) << "Test Gradient Hook";
-  auto gradient_hook = [](
-      const paddle::experimental::Tensor& et) -> paddle::experimental::Tensor {
+  auto gradient_hook = [](const paddle::experimental::Tensor& et)
+      -> paddle::experimental::Tensor {
     paddle::experimental::Tensor res;
     phi::DenseTensorMeta meta =
         phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1}));
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
index a00e629d1029a..eb9bd6007bf8a 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
@@ -14,7 +14,6 @@
 #pragma once
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
@@ -35,8 +34,7 @@ class GradTestNode : public egr::GradNodeBase {
                        egr::kSlotSmallVectorSize>
   operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                                   egr::kSlotSmallVectorSize>& grads,  // NOLINT
-             bool create_graph = false,
-             bool is_new_grad = false) override {
+             bool create_graph = false, bool is_new_grad = false) override {
     val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
                ->data<float>()[0];
     phi::DenseTensorMeta meta =
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
index 0fe349294b438..17f593e24905d 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
@@ -12,17 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/grad_tensor_holder.h"
+
 #include <sstream>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
-#include "paddle/fluid/eager/grad_tensor_holder.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/core/selected_rows.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/selected_rows.h"
 
 PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
index 28c3472f90d03..8813f364840e0 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/tensor_wrapper.h"
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
-#include "paddle/fluid/eager/tensor_wrapper.h"
 #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h"
 #include "paddle/fluid/eager/utils.h"
 
diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt
index 516789cbb8cf7..7b6dfae729f38 100644
--- a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt
+++ b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt
@@ -1,7 +1,29 @@
-cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op dygraph_function)
+cc_library(
+  performance_benchmark_utils
+  SRCS benchmark_utils.cc
+  DEPS ${eager_deps}
+       ${fluid_deps}
+       ${generated_deps}
+       eager_scale
+       scale_node
+       scale_op
+       matmul_v2_op
+       dygraph_function)
 
-cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
-cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
+cc_test(
+  test_egr_performance_benchmark_eager_cpu
+  SRCS benchmark_eager_cpu.cc
+  DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
+cc_test(
+  test_egr_performance_benchmark_fluid_cpu
+  SRCS benchmark_fluid_cpu.cc
+  DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
 
-cc_test(test_egr_performance_benchmark_eager_cuda SRCS benchmark_eager_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
-cc_test(test_egr_performance_benchmark_fluid_cuda SRCS benchmark_fluid_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
+cc_test(
+  test_egr_performance_benchmark_eager_cuda
+  SRCS benchmark_eager_cuda.cc
+  DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
+cc_test(
+  test_egr_performance_benchmark_fluid_cuda
+  SRCS benchmark_fluid_cuda.cc
+  DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
index 056c7102f663b..3b0e6a3fdb6e1 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
@@ -15,19 +15,17 @@
 // Eager Dygraph
 
 #include <paddle/fluid/framework/op_registry.h>
+
 #include <chrono>
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/platform/flags.h"
-
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
-
-#include "paddle/fluid/imperative/tracer.h"
-
 #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/platform/flags.h"
 
 #ifdef WITH_GPERFTOOLS
 #include "gperftools/profiler.h"
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
index 287d6e770dea2..5dd5cde548fc0 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -14,19 +14,17 @@
 
 // Eager Dygraph
 #include <paddle/fluid/framework/op_registry.h>
+
 #include <chrono>
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/platform/flags.h"
-
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
-
-#include "paddle/fluid/imperative/tracer.h"
-
 #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/platform/flags.h"
 
 #ifdef WITH_GPERFTOOLS
 #include "gperftools/profiler.h"
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
index b4b47a85f6666..bf1d955b9000f 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
@@ -23,7 +23,6 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
 #include "paddle/fluid/imperative/basic_engine.h"
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
index d9afd7cc96523..0cd33a72e1a9a 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
@@ -23,7 +23,6 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
 #include "paddle/fluid/imperative/basic_engine.h"
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
index 86bf13707ed40..5b37e973f1dc6 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <math.h>
+
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/phi/api/all.h"
diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
index 719ef6673c07d..2f57489999ff8 100644
--- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
+++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
@@ -1,14 +1,47 @@
-cc_test(test_egr_task_tensor_utils SRCS tensor_utils_test.cc DEPS ${eager_deps})
-cc_test(test_egr_task_eager_utils SRCS eager_utils_test.cc DEPS ${eager_deps})
-cc_test(test_egr_task_forward_autograd SRCS forward_autograd_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
-cc_test(test_egr_task_nan_inf_utils SRCS nan_inf_utils_test.cc DEPS eager_nan_inf_utils)
+cc_test(
+  test_egr_task_tensor_utils
+  SRCS tensor_utils_test.cc
+  DEPS ${eager_deps})
+cc_test(
+  test_egr_task_eager_utils
+  SRCS eager_utils_test.cc
+  DEPS ${eager_deps})
+cc_test(
+  test_egr_task_forward_autograd
+  SRCS forward_autograd_test.cc
+  DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
+cc_test(
+  test_egr_task_nan_inf_utils
+  SRCS nan_inf_utils_test.cc
+  DEPS eager_nan_inf_utils)
 
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
-    cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
-    cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
-    cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
-    cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
-    cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node)
-    cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps})
+  cc_test(
+    test_egr_task_hook
+    SRCS hook_test.cc
+    DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
+  cc_test(
+    test_egr_task_backward
+    SRCS backward_test.cc
+    DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
+  cc_test(
+    test_egr_task_grad
+    SRCS grad_test.cc
+    DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
+  cc_test(
+    test_egr_task_fwd_bwd_joint
+    SRCS fwd_bwd_joint_test.cc
+    DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
+  cc_test(
+    test_egr_task_cross_batch
+    SRCS cross_batch_accumulation_test.cc
+    DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node)
+  cc_test(
+    test_egr_task_hook_intermidiate
+    SRCS hook_test_intermidiate.cc
+    DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node)
+  cc_test(
+    test_egr_task_autocodegen
+    SRCS generated_test.cc
+    DEPS ${eager_deps} ${fluid_deps} ${generated_deps})
 endif()
diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc
index 7552ad83fa20f..c6d4514fa8e33 100644
--- a/paddle/fluid/eager/tests/task_tests/backward_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc
@@ -12,25 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/backward.h"
+
 #include <sstream>
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
+#include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
-#include "paddle/fluid/eager/backward.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
-
-#include "paddle/fluid/eager/api/all.h"
-
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/tensor_meta.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
index 4337c0d092ca0..847c082a30173 100644
--- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
@@ -16,22 +16,17 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
+#include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
 #include "paddle/fluid/eager/grad_node_info.h"
-
-#include "paddle/fluid/eager/api/all.h"
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/tensor_meta.h"
-
 #include "paddle/fluid/eager/tests/test_utils.h"
-
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 
diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
index 551262d259e08..e4ca8dd164b8f 100644
--- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
@@ -15,14 +15,12 @@
 #include <sstream>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
 #include "paddle/fluid/eager/utils.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc
index 4cb316380aade..ebf396bebfab0 100644
--- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc
@@ -16,18 +16,15 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
-
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/tensor_meta.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 
diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
index 1f8fdb7de0c17..a4da315f44a7a 100644
--- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
@@ -16,21 +16,17 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
 #include "paddle/fluid/eager/grad_node_info.h"
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/tensor_meta.h"
-
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
-
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc
index 3c237b76e64b0..b53cdf55d4306 100644
--- a/paddle/fluid/eager/tests/task_tests/generated_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc
@@ -17,17 +17,14 @@
 #include <chrono>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/api/all.h"
+#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
-#include "paddle/fluid/eager/utils.h"
-
 #include "paddle/fluid/eager/tests/test_utils.h"
+#include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/imperative/tracer.h"
-
-#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc
index 72a94b40ed753..8d6c4d7843fb2 100644
--- a/paddle/fluid/eager/tests/task_tests/grad_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc
@@ -16,17 +16,14 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
+#include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
 #include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
-
-#include "paddle/fluid/eager/api/all.h"
-
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc
index d7b887b28bde8..badbe87159785 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc
@@ -16,22 +16,17 @@
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
+#include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/backward.h"
 #include "paddle/fluid/eager/grad_node_info.h"
-
-#include "paddle/fluid/eager/api/all.h"
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/tensor_meta.h"
-
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
-
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
index c4d4ff9110682..dbe2c13894566 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
@@ -15,16 +15,14 @@
 #include <sstream>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/eager/api/all.h"
+#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 #include "paddle/fluid/eager/backward.h"
 #include "paddle/fluid/eager/grad_node_info.h"
+#include "paddle/fluid/eager/hooks.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/phi/core/dense_tensor.h"
-
-#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
-#include "paddle/fluid/eager/hooks.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc
index be0563fbeedb4..73d213f71148f 100644
--- a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/nan_inf_utils.h"
+
 #include <iostream>
 #include <limits>
 #include <tuple>
 
 #include "gtest/gtest.h"
-
-#include "paddle/fluid/eager/nan_inf_utils.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/api/include/api.h"
diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc
index 24e5da060111f..aeddeb6fae7f2 100644
--- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc
@@ -12,17 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/eager/api/utils/tensor_utils.h"
+
 #include <sstream>
 
 #include "gtest/gtest.h"
-
-#include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/grad_tensor_holder.h"
 #include "paddle/fluid/eager/tests/test_utils.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/test_utils.h b/paddle/fluid/eager/tests/test_utils.h
index 47bfe9a7cabd5..cb1e531d82d63 100644
--- a/paddle/fluid/eager/tests/test_utils.h
+++ b/paddle/fluid/eager/tests/test_utils.h
@@ -18,14 +18,12 @@
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/utils.h"
-
-#include "paddle/phi/api/all.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/tensor_meta.h"
-
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/phi/api/all.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/tensor_meta.h"
 
 namespace eager_test {
 
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
index 5a730e4dbf164..3254b3bf89262 100644
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -17,7 +17,6 @@
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/tensor_wrapper.h"
-
 #include "paddle/fluid/operators/run_program_op.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -273,7 +272,7 @@ inline void RunProgramGradAPI(
     const paddle::framework::AttributeMap &attrs,
     std::vector<paddle::experimental::Tensor *> &x_grad,      // NOLINT
     std::vector<paddle::experimental::Tensor *> &params_grad  // NOLINT
-    ) {
+) {
   // if all output vars are set to stop_gradient, grad op no need to executed
   if (x_grad.empty() && params_grad.empty()) return;
 
@@ -368,8 +367,7 @@ class GradNodeRunProgram : public egr::GradNodeBase {
                                egr::kSlotSmallVectorSize>
   operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                                   egr::kSlotSmallVectorSize> &grads,  // NOLINT
-             bool create_graph,
-             bool is_new_grad) override {
+             bool create_graph, bool is_new_grad) override {
     VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
     paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                          egr::kSlotSmallVectorSize>
diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc
index 7d9554c52eb6c..4d7d1aa2d8a3d 100644
--- a/paddle/fluid/eager/utils.cc
+++ b/paddle/fluid/eager/utils.cc
@@ -27,7 +27,7 @@
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/variable.h"
 
-PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true,
+PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, false,
                             "retain grad for all tensor");
 
 namespace egr {
diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h
index c6389e998315c..783afcc1e2c73 100644
--- a/paddle/fluid/eager/utils.h
+++ b/paddle/fluid/eager/utils.h
@@ -18,7 +18,6 @@
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
-
 #include "paddle/phi/api/all.h"
 
 namespace egr {
@@ -161,10 +160,11 @@ class EagerUtils {
     if (require_any_grad && autograd_meta) {
       PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() &&
                             egr::egr_utils_api::IsLeafTensor(target),
-                        false, paddle::platform::errors::InvalidArgument(
-                                   "Leaf Var (%s) that doesn't stop gradient "
-                                   "can't use inplace strategy.",
-                                   target.name()));
+                        false,
+                        paddle::platform::errors::InvalidArgument(
+                            "Leaf Var (%s) that doesn't stop gradient "
+                            "can't use inplace strategy.",
+                            target.name()));
     }
   }
 
@@ -234,8 +234,8 @@ class EagerUtils {
       const paddle::experimental::Tensor& tensor);
 
   /**
-    * Fill Zero
-    * **/
+   * Fill Zero
+   * **/
   static void FillZeroForEmptyOptionalGradInput(
       std::vector<paddle::experimental::Tensor>* in_grads,
       const std::vector<GradSlotMeta>& grad_in_metas);
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index bb7f3f26463d4..5402beb49e69d 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,22 +1,30 @@
-
 #windows treat symbolic file as a real file, which is different with unix
 #We create a hidden file and compile it instead of origin source file.
 function(windows_symbolic TARGET)
   set(oneValueArgs "")
   set(multiValueArgs SRCS PATH)
-  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
   set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
   foreach(src ${windows_symbolic_SRCS})
     get_filename_component(src ${src} NAME_WE)
-    if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
-        message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
+    if(NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
+      message(
+        FATAL
+        " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file."
+      )
     endif()
 
-    file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
+    file(
+      GENERATE
+      OUTPUT ${final_path}/.${src}.cu
+      INPUT ${final_path}/${src}.cc)
 
-    add_custom_command(OUTPUT ${final_path}/.${src}.cu
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
-            COMMENT "create hidden file of ${src}.cu")
+    add_custom_command(
+      OUTPUT ${final_path}/.${src}.cu
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc"
+              "${final_path}/.${src}.cu"
+      COMMENT "create hidden file of ${src}.cu")
     add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
   endforeach()
 endfunction()
@@ -26,7 +34,7 @@ add_subdirectory(details)
 add_subdirectory(fleet)
 add_subdirectory(io)
 add_subdirectory(new_executor)
-if (WITH_CINN)
+if(WITH_CINN)
   add_subdirectory(paddle2cinn)
 endif()
 #ddim lib
@@ -34,420 +42,1101 @@ proto_library(framework_proto SRCS framework.proto)
 proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto)
 
 proto_library(op_def_proto SRCS op_def.proto DEPS framework_proto)
-cc_library(op_def_api SRCS op_def_api.cc DEPS op_def_proto boost)
-
-FILE(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt)
-FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt 
+cc_library(
+  op_def_api
+  SRCS op_def_api.cc
+  DEPS op_def_proto boost)
+
+file(GLOB OP_DEF_FILES
+     ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt)
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt
      "namespace { \n"
      "const std::unordered_map<std::string, std::string> op_def_map =  { \n")
 foreach(OP_DEF_FILE ${OP_DEF_FILES})
-    FILE(READ ${OP_DEF_FILE}  OP_DEF_CONTENT)
-    get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE)
-    FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt
-    "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n")
+  file(READ ${OP_DEF_FILE} OP_DEF_CONTENT)
+  get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE)
+  file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt
+       "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n")
 endforeach(OP_DEF_FILE)
-FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}")
+file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}")
 
 proto_library(heter_service_proto SRCS heter_service.proto)
 proto_library(data_feed_proto SRCS data_feed.proto)
 proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
-  data_feed_proto)
-
-cc_library(string_array SRCS string_array.cc DEPS utf8proc)
-
-cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
-cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
+              data_feed_proto)
+
+cc_library(
+  string_array
+  SRCS string_array.cc
+  DEPS utf8proc)
+
+cc_library(
+  data_type
+  SRCS data_type.cc
+  DEPS framework_proto ddim device_context)
+cc_test(
+  data_type_test
+  SRCS data_type_test.cc
+  DEPS data_type place tensor)
 if(WITH_GPU)
-  if (WIN32)
+  if(WIN32)
     windows_symbolic(tensor_util SRCS tensor_util.cu)
-    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context dense_tensor)
+    nv_library(
+      tensor
+      SRCS tensor.cc .tensor_util.cu
+      DEPS place memory data_type device_context dense_tensor)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler dense_tensor)
+    nv_library(
+      tensor
+      SRCS tensor.cc tensor_util.cu
+      DEPS place memory data_type device_context profiler dense_tensor)
   endif(WIN32)
 elseif(WITH_ROCM)
-  hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler dense_tensor)
+  hip_library(
+    tensor
+    SRCS tensor.cc tensor_util.cu
+    DEPS place memory data_type device_context profiler dense_tensor)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler dense_tensor)
+  cc_library(
+    tensor
+    SRCS tensor.cc tensor_util.cc
+    DEPS place memory data_type device_context profiler dense_tensor)
 endif()
 
-cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
+cc_test(
+  tensor_test
+  SRCS tensor_test.cc
+  DEPS tensor)
 if(WITH_GPU)
-  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
+  nv_test(
+    tensor_util_test
+    SRCS tensor_util_test.cc tensor_util_test.cu
+    DEPS tensor dlpack_tensor)
 elseif(WITH_ROCM)
-  hip_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
+  hip_test(
+    tensor_util_test
+    SRCS tensor_util_test.cc tensor_util_test.cu
+    DEPS tensor dlpack_tensor)
 else()
-  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor)
+  cc_test(
+    tensor_util_test
+    SRCS tensor_util_test.cc
+    DEPS tensor dlpack_tensor)
 endif()
 
-cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor)
+cc_test(
+  copy_same_tensor_test
+  SRCS copy_same_tensor_test.cc
+  DEPS tensor)
 
-cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
-cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context place memory)
+cc_test(
+  eigen_test
+  SRCS eigen_test.cc
+  DEPS tensor)
+cc_library(
+  mixed_vector
+  SRCS mixed_vector.cc
+  DEPS device_context place memory)
 
 if(WITH_GPU)
-  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
+  nv_test(
+    mixed_vector_test
+    SRCS mixed_vector_test.cc mixed_vector_test.cu
+    DEPS mixed_vector place memory device_context tensor)
 elseif(WITH_ROCM)
-  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
+  hip_test(
+    mixed_vector_test
+    SRCS mixed_vector_test.cc mixed_vector_test.cu
+    DEPS mixed_vector place memory device_context tensor)
 else()
-  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor)
+  cc_test(
+    mixed_vector_test
+    SRCS mixed_vector_test.cc
+    DEPS mixed_vector place memory device_context tensor)
 endif()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version)
+cc_library(
+  lod_tensor
+  SRCS lod_tensor.cc
+  DEPS ddim mixed_vector place tensor framework_proto version)
 
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_utils lod_tensor memory)
+cc_test(
+  lod_tensor_test
+  SRCS lod_tensor_test.cc
+  DEPS lod_utils lod_tensor memory)
 
 if(WITH_GPU)
-  nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+  nv_test(
+    lod_tensor_gpu_test
+    SRCS lod_tensor_test.cu
+    DEPS lod_tensor)
 elseif(WITH_ROCM)
-  hip_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+  hip_test(
+    lod_tensor_gpu_test
+    SRCS lod_tensor_test.cu
+    DEPS lod_tensor)
 endif()
 
-cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
-
-cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
-cc_test(reader_test SRCS reader_test.cc DEPS reader)
-
-cc_library(threadpool SRCS threadpool.cc DEPS enforce)
-cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
-
-cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows_utils framework_proto scope)
-if (WITH_GPU)
+cc_library(
+  garbage_collector
+  SRCS garbage_collector.cc
+  DEPS device_context memory gflags glog)
+
+cc_library(
+  reader
+  SRCS reader.cc
+  DEPS lod_tensor ddim)
+cc_test(
+  reader_test
+  SRCS reader_test.cc
+  DEPS reader)
+
+cc_library(
+  threadpool
+  SRCS threadpool.cc
+  DEPS enforce)
+cc_test(
+  threadpool_test
+  SRCS threadpool_test.cc
+  DEPS threadpool)
+
+cc_library(
+  var_type_traits
+  SRCS var_type_traits.cc
+  DEPS lod_tensor selected_rows_utils framework_proto scope)
+if(WITH_GPU)
   target_link_libraries(var_type_traits dynload_cuda)
 endif()
-cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
+cc_test(
+  var_type_traits_test
+  SRCS var_type_traits_test.cc
+  DEPS var_type_traits)
 
 set(BRPC_DEPS "")
 if(WITH_PSCORE)
-    set(BRPC_DEPS brpc ssl crypto)
+  set(BRPC_DEPS brpc ssl crypto)
 endif()
 if(WITH_PSLIB)
-    if(WITH_PSLIB_BRPC)
-        set(BRPC_DEPS pslib_brpc)
-    elseif(NOT WITH_HETERPS)
-        set(BRPC_DEPS brpc ssl crypto)
-    endif()
-    if (WITH_ARM_BRPC)
-        set(BRPC_DEPS arm_brpc)
-    endif()
+  if(WITH_PSLIB_BRPC)
+    set(BRPC_DEPS pslib_brpc)
+  elseif(NOT WITH_HETERPS)
+    set(BRPC_DEPS brpc ssl crypto)
+  endif()
+  if(WITH_ARM_BRPC)
+    set(BRPC_DEPS arm_brpc)
+  endif()
 endif()
 
-cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
-cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS})
-cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker)
-
-cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
-cc_test(scope_test SRCS scope_test.cc DEPS scope)
-cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
-
-cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
+cc_library(
+  scope
+  SRCS scope.cc
+  DEPS glog threadpool xxhash var_type_traits)
+cc_library(
+  device_worker
+  SRCS device_worker.cc
+  DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS})
+cc_test(
+  device_worker_test
+  SRCS device_worker_test.cc
+  DEPS device_worker)
+
+cc_library(
+  scope_pool
+  SRCS scope_pool.cc
+  DEPS scope)
+cc_test(
+  scope_test
+  SRCS scope_test.cc
+  DEPS scope)
+cc_test(
+  variable_test
+  SRCS variable_test.cc
+  DEPS tensor var_type_traits)
+
+cc_library(
+  data_device_transform
+  SRCS data_device_transform.cc
+  DEPS tensor)
 if(WITH_GPU)
-  nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry device_context math_function scope)
+  nv_test(
+    data_device_transform_test
+    SRCS data_device_transform_test.cu
+    DEPS operator op_registry device_context math_function scope)
 elseif(WITH_ROCM)
-  hip_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry device_context math_function scope)
+  hip_test(
+    data_device_transform_test
+    SRCS data_device_transform_test.cu
+    DEPS operator op_registry device_context math_function scope)
 endif()
 
 if(WITH_GPU)
-  if (WIN32)
-#windows treat symbolic file as a real file, which is different with unix
-#We create a hidden file and compile it instead of origin source file.
-      windows_symbolic(hidden_file SRCS data_type_transform.cu)
-      nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
-      add_dependencies(data_type_transform hidden_file)
+  if(WIN32)
+    #windows treat symbolic file as a real file, which is different with unix
+    #We create a hidden file and compile it instead of origin source file.
+    windows_symbolic(hidden_file SRCS data_type_transform.cu)
+    nv_library(
+      data_type_transform
+      SRCS .data_type_transform.cu
+      DEPS tensor)
+    add_dependencies(data_type_transform hidden_file)
   else()
-      nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+    nv_library(
+      data_type_transform
+      SRCS data_type_transform.cu
+      DEPS tensor)
   endif(WIN32)
-  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+  nv_test(
+    data_type_transform_test
+    SRCS data_type_transform_test.cc data_type_transform_test.cu
+    DEPS data_type_transform)
 elseif(WITH_ROCM)
-  hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
-  hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+  hip_library(
+    data_type_transform
+    SRCS data_type_transform.cu
+    DEPS tensor)
+  hip_test(
+    data_type_transform_test
+    SRCS data_type_transform_test.cc data_type_transform_test.cu
+    DEPS data_type_transform)
 else()
-  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
-  cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+  cc_library(
+    data_type_transform
+    SRCS data_type_transform.cc
+    DEPS tensor)
+  cc_test(
+    data_type_transform_test
+    SRCS data_type_transform_test.cc
+    DEPS data_type_transform)
 endif()
 
-cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
-cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
-
-cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
-        framework_proto selected_rows_utils data_device_transform data_type_transform data_layout_transform)
-
-cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce)
-cc_test(attribute_test SRCS attribute_test.cc DEPS attribute framework_proto proto_desc)
-cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
-device_context)
-
-cc_library(op_version_proto SRCS op_version_proto.cc DEPS framework_proto boost)
-
-cc_library(op_version_registry SRCS op_version_registry.cc DEPS op_version_proto framework_proto boost)
-cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
-
-cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
-cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
-cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context)
-cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto no_need_buffer_vars_inference)
-cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
-
-cc_test(no_need_buffer_vars_inference_test SRCS no_need_buffer_vars_inference_test.cc DEPS no_need_buffer_vars_inference layer)
-
-cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
-
-cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_vars_inference)
-
-cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
-
-IF(WITH_XPU)
-cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list)
-ELSE()
-cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info)
-ENDIF()
+cc_library(
+  data_layout_transform
+  SRCS data_layout_transform.cc
+  DEPS tensor math_function)
+cc_test(
+  data_layout_transform_test
+  SRCS data_layout_transform_test.cc
+  DEPS data_layout_transform)
+
+cc_library(
+  data_transform
+  SRCS data_transform.cc
+  DEPS math_function
+       tensor
+       framework_proto
+       selected_rows_utils
+       data_device_transform
+       data_type_transform
+       data_layout_transform)
+
+cc_library(
+  attribute
+  SRCS attribute.cc
+  DEPS framework_proto boost enforce)
+cc_test(
+  attribute_test
+  SRCS attribute_test.cc
+  DEPS attribute framework_proto proto_desc)
+cc_test(
+  program_desc_test
+  SRCS program_desc_test.cc
+  DEPS proto_desc device_context)
+
+cc_library(
+  op_version_proto
+  SRCS op_version_proto.cc
+  DEPS framework_proto boost)
+
+cc_library(
+  op_version_registry
+  SRCS op_version_registry.cc
+  DEPS op_version_proto framework_proto boost)
+cc_test(
+  op_version_registry_test
+  SRCS op_version_registry_test.cc
+  DEPS op_version_registry)
+
+cc_library(
+  op_proto_maker
+  SRCS op_proto_maker.cc
+  DEPS framework_proto attribute glog)
+cc_test(
+  op_proto_maker_test
+  SRCS op_proto_maker_test.cc
+  DEPS op_proto_maker)
+cc_library(
+  no_need_buffer_vars_inference
+  SRCS no_need_buffer_vars_inference.cc
+  DEPS attribute device_context)
+cc_library(
+  op_info
+  SRCS op_info.cc
+  DEPS attribute framework_proto no_need_buffer_vars_inference)
+cc_library(
+  shape_inference
+  SRCS shape_inference.cc
+  DEPS ddim attribute device_context)
+
+cc_test(
+  no_need_buffer_vars_inference_test
+  SRCS no_need_buffer_vars_inference_test.cc
+  DEPS no_need_buffer_vars_inference layer)
+
+cc_library(
+  transfer_scope_cache
+  SRCS transfer_scope_cache.cc
+  DEPS scope framework_proto device_context)
+
+cc_library(
+  unused_var_check
+  SRCS unused_var_check.cc
+  DEPS glog no_need_buffer_vars_inference)
+
+cc_library(
+  op_kernel_type
+  SRCS op_kernel_type.cc
+  DEPS device_context place)
+
+if(WITH_XPU)
+  cc_library(
+    phi_utils
+    SRCS phi_utils.cc
+    DEPS lod_tensor
+         selected_rows_utils
+         place
+         phi
+         var_type_traits
+         phi_api_utils
+         op_info
+         xpu_op_list)
+else()
+  cc_library(
+    phi_utils
+    SRCS phi_utils.cc
+    DEPS lod_tensor
+         selected_rows_utils
+         place
+         phi
+         var_type_traits
+         phi_api_utils
+         op_info)
+endif()
 
-IF(WITH_XPU)
-cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
-    phi_utils kernel_factory infershape_utils op_utils)
-ELSE()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
-    phi_utils kernel_factory infershape_utils op_utils)
-ENDIF()
+if(WITH_XPU)
+  cc_library(
+    operator
+    SRCS operator.cc
+    DEPS xpu_op_list
+         op_info
+         device_context
+         tensor
+         scope
+         glog
+         trainer_desc_proto
+         data_feed_proto
+         shape_inference
+         data_transform
+         lod_tensor
+         profiler
+         transfer_scope_cache
+         op_kernel_type
+         op_call_stack
+         unused_var_check
+         nan_inf_utils
+         phi_utils
+         kernel_factory
+         infershape_utils
+         op_utils)
+else()
+  cc_library(
+    operator
+    SRCS operator.cc
+    DEPS op_info
+         device_context
+         tensor
+         scope
+         glog
+         trainer_desc_proto
+         data_feed_proto
+         shape_inference
+         data_transform
+         lod_tensor
+         profiler
+         transfer_scope_cache
+         op_kernel_type
+         op_call_stack
+         unused_var_check
+         nan_inf_utils
+         phi_utils
+         kernel_factory
+         infershape_utils
+         op_utils)
+endif()
 
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
-cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context)
+cc_test(
+  operator_test
+  SRCS operator_test.cc
+  DEPS operator op_registry device_context)
+cc_test(
+  operator_exception_test
+  SRCS operator_exception_test.cc
+  DEPS operator op_registry device_context)
 
 cc_library(version SRCS version.cc)
-cc_test(version_test SRCS version_test.cc DEPS version)
-
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc process_mesh_desc.cc DEPS attribute shape_inference op_info operator glog version)
-
-cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
-
-cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
-cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
-
-cc_library(program_processing SRCS program_processing.cc DEPS boost proto_desc)
-cc_test(program_processing_test SRCS program_processing_test.cc DEPS proto_desc program_processing)
+cc_test(
+  version_test
+  SRCS version_test.cc
+  DEPS version)
+
+cc_library(
+  proto_desc
+  SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc process_mesh_desc.cc
+  DEPS attribute shape_inference op_info operator glog version)
+
+cc_library(
+  op_registry
+  SRCS op_registry.cc
+  DEPS op_proto_maker op_info operator glog proto_desc)
+
+cc_library(
+  op_call_stack
+  SRCS op_call_stack.cc
+  DEPS op_proto_maker enforce)
+cc_test(
+  op_call_stack_test
+  SRCS op_call_stack_test.cc
+  DEPS op_call_stack)
+
+cc_library(
+  program_processing
+  SRCS program_processing.cc
+  DEPS boost proto_desc)
+cc_test(
+  program_processing_test
+  SRCS program_processing_test.cc
+  DEPS proto_desc program_processing)
 
 if(WITH_GPU)
-  nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+  nv_test(
+    op_registry_test
+    SRCS op_registry_test.cc
+    DEPS op_registry)
 elseif(WITH_ROCM)
-  hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+  hip_test(
+    op_registry_test
+    SRCS op_registry_test.cc
+    DEPS op_registry)
 endif()
 
 if(WITH_PYTHON)
   py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
   py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
-  py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
+  py_proto_compile(distributed_strategy_py_proto SRCS
+                   distributed_strategy.proto)
   py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto)
-#Generate an empty \
-    #__init__.py to make framework_py_proto as a valid python module.
-  add_custom_target(fleet_proto_init ALL  
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py 
-  )
-  add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-  add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto ps_py_proto_init)
-  if (NOT WIN32)
-    add_custom_command(TARGET framework_py_proto POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+  # Generate an empty __init__.py so that framework_py_proto is a valid
+  # Python module.
+  add_custom_target(
+    fleet_proto_init ALL
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+            ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+    COMMAND
+      ${CMAKE_COMMAND} -E touch
+      ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py)
+  add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E
+                                                        touch __init__.py)
+  add_dependencies(
+    framework_py_proto
+    framework_py_proto_init
+    trainer_py_proto
+    distributed_strategy_py_proto
+    fleet_proto_init
+    pass_desc_py_proto
+    ps_py_proto
+    ps_py_proto_init)
+  if(NOT WIN32)
+    add_custom_command(
+      TARGET framework_py_proto
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory
+              ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
       COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
-      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMAND cp distributed_strategy_*.py
+              ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
       COMMENT "Copy generated python proto into directory paddle/fluid/proto."
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto
-      COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/fleet_executor/fleet_executor_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
-      COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.")
+    add_custom_target(
+      fleet_executor_proto_init ALL
+      DEPENDS fleet_proto_init fleet_executor_desc_py_proto
+      COMMAND
+        cp
+        ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/fleet_executor/fleet_executor_*.py
+        ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMENT
+        "Copy generated python proto into directory paddle/distributed/fleet/proto."
+    )
   else(NOT WIN32)
-    string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
-    add_custom_command(TARGET framework_py_proto POST_BUILD
-          COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-          COMMAND copy /Y *.py ${proto_dstpath}
-	  COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
-          COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-	  COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
-          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    string(REPLACE "/" "\\" proto_dstpath
+                   "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
+    string(
+      REPLACE "/" "\\" fleet_proto_dstpath
+              "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
+    add_custom_command(
+      TARGET framework_py_proto
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory
+              ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+      COMMAND copy /Y *.py ${proto_dstpath}
+      COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
+      COMMENT "Copy generated python proto into directory paddle/fluid/proto."
+      COMMENT
+        "Copy generated python proto into directory paddle/distributed/fleet/proto."
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif(NOT WIN32)
 endif()
 
-if (WITH_PSCORE)
-  add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto
-    COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
-    COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.")
+if(WITH_PSCORE)
+  add_custom_target(
+    index_dataset_proto_init ALL
+    DEPENDS fleet_proto_init index_dataset_py_proto
+    COMMAND
+      cp
+      ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py
+      ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+    COMMENT
+      "Copy generated python proto into directory paddle/distributed/fleet/proto."
+  )
 endif(WITH_PSCORE)
 
-cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
-
-cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
-cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
-
-if (TENSORRT_FOUND)
-cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper tensorrt_engine_op)
+cc_library(
+  lod_rank_table
+  SRCS lod_rank_table.cc
+  DEPS lod_tensor)
+
+cc_library(
+  feed_fetch_method
+  SRCS feed_fetch_method.cc
+  DEPS lod_tensor scope glog)
+cc_library(
+  variable_helper
+  SRCS variable_helper.cc
+  DEPS lod_tensor)
+
+if(TENSORRT_FOUND)
+  cc_library(
+    naive_executor
+    SRCS naive_executor.cc
+    DEPS op_registry
+         denormal
+         device_context
+         scope
+         framework_proto
+         glog
+         lod_rank_table
+         feed_fetch_method
+         graph_to_program_pass
+         variable_helper
+         tensorrt_engine_op)
 else()
-cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+  cc_library(
+    naive_executor
+    SRCS naive_executor.cc
+    DEPS op_registry
+         denormal
+         device_context
+         scope
+         framework_proto
+         glog
+         lod_rank_table
+         feed_fetch_method
+         graph_to_program_pass
+         variable_helper)
 endif(TENSORRT_FOUND)
 
-cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper)
+cc_library(
+  executor_gc_helper
+  SRCS executor_gc_helper.cc
+  DEPS scope
+       proto_desc
+       operator
+       garbage_collector
+       op_registry
+       while_op_helper
+       recurrent_op_helper
+       conditional_block_op_helper)
 if(WITH_DISTRIBUTE)
   if(WITH_PSLIB)
-    cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-    dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-    heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
-    ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu
-    pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-    device_context scope framework_proto trainer_desc_proto glog fs shell 
-    fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer
-    lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS}
-    graph_to_program_pass variable_helper data_feed_proto timer monitor
-    heter_service_proto fleet_executor ${BRPC_DEP})
-    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-        set(DISTRIBUTE_COMPILE_FLAGS
-                "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+    cc_library(
+      executor
+      SRCS executor.cc
+           multi_trainer.cc
+           pipeline_trainer.cc
+           dataset_factory.cc
+           dist_multi_trainer.cc
+           trainer_factory.cc
+           trainer.cc
+           data_feed_factory.cc
+           heterxpu_trainer.cc
+           data_feed.cc
+           device_worker.cc
+           hogwild_worker.cc
+           hetercpu_worker.cc
+           ps_gpu_worker.cc
+           ps_gpu_trainer.cc
+           downpour_worker.cc
+           downpour_worker_opt.cc
+           data_feed.cu
+           pull_dense_worker.cc
+           section_worker.cc
+           device_worker_factory.cc
+           data_set.cc
+      DEPS op_registry
+           device_context
+           scope
+           framework_proto
+           trainer_desc_proto
+           glog
+           fs
+           shell
+           fleet_wrapper
+           heter_wrapper
+           ps_gpu_wrapper
+           box_wrapper
+           metrics
+           lodtensor_printer
+           lod_rank_table
+           feed_fetch_method
+           collective_helper
+           ${GLOB_DISTRIBUTE_DEPS}
+           graph_to_program_pass
+           variable_helper
+           data_feed_proto
+           timer
+           monitor
+           heter_service_proto
+           fleet_executor
+           ${BRPC_DEP})
+    set(DISTRIBUTE_COMPILE_FLAGS
+        "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses"
+    )
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
     endif()
-    set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   elseif(WITH_PSCORE)
-    cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-            dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-            heterxpu_trainer.cc heter_pipeline_trainer.cc
-            data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-            downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc data_feed.cu
-            pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-            device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-            index_sampler index_wrapper sampler index_dataset_proto
-            lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
-            graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor)
-    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-        set(DISTRIBUTE_COMPILE_FLAGS
-                "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+    cc_library(
+      executor
+      SRCS executor.cc
+           multi_trainer.cc
+           pipeline_trainer.cc
+           dataset_factory.cc
+           dist_multi_trainer.cc
+           trainer_factory.cc
+           trainer.cc
+           data_feed_factory.cc
+           heterxpu_trainer.cc
+           heter_pipeline_trainer.cc
+           data_feed.cc
+           device_worker.cc
+           hogwild_worker.cc
+           hetercpu_worker.cc
+           downpour_worker.cc
+           downpour_lite_worker.cc
+           downpour_worker_opt.cc
+           data_feed.cu
+           pull_dense_worker.cc
+           section_worker.cc
+           heter_section_worker.cc
+           device_worker_factory.cc
+           data_set.cc
+      DEPS op_registry
+           device_context
+           scope
+           framework_proto
+           data_feed_proto
+           heter_service_proto
+           trainer_desc_proto
+           glog
+           index_sampler
+           index_wrapper
+           sampler
+           index_dataset_proto
+           lod_rank_table
+           fs
+           shell
+           fleet_wrapper
+           heter_wrapper
+           box_wrapper
+           metrics
+           lodtensor_printer
+           feed_fetch_method
+           graph_to_program_pass
+           variable_helper
+           timer
+           monitor
+           heter_service_proto
+           fleet
+           heter_server
+           brpc
+           fleet_executor)
+    set(DISTRIBUTE_COMPILE_FLAGS
+        "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses"
+    )
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
     endif()
-    set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(downpour_lite_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(heter_section_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      downpour_lite_worker.cc PROPERTIES COMPILE_FLAGS
+                                         ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      heter_section_worker.cc PROPERTIES COMPILE_FLAGS
+                                         ${DISTRIBUTE_COMPILE_FLAGS})
+    set_source_files_properties(
+      heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS
+                                           ${DISTRIBUTE_COMPILE_FLAGS})
   else()
-    cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-            dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-            heterxpu_trainer.cc
-            data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
-            ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu
-            pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-            device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-            lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
-            graph_to_program_pass variable_helper timer monitor fleet_executor)
+    cc_library(
+      executor
+      SRCS executor.cc
+           multi_trainer.cc
+           pipeline_trainer.cc
+           dataset_factory.cc
+           dist_multi_trainer.cc
+           trainer_factory.cc
+           trainer.cc
+           data_feed_factory.cc
+           heterxpu_trainer.cc
+           data_feed.cc
+           device_worker.cc
+           hogwild_worker.cc
+           hetercpu_worker.cc
+           ps_gpu_worker.cc
+           ps_gpu_trainer.cc
+           downpour_worker.cc
+           downpour_worker_opt.cc
+           data_feed.cu
+           pull_dense_worker.cc
+           section_worker.cc
+           device_worker_factory.cc
+           data_set.cc
+      DEPS op_registry
+           device_context
+           scope
+           framework_proto
+           data_feed_proto
+           heter_service_proto
+           trainer_desc_proto
+           glog
+           lod_rank_table
+           fs
+           shell
+           fleet_wrapper
+           heter_wrapper
+           ps_gpu_wrapper
+           box_wrapper
+           metrics
+           lodtensor_printer
+           feed_fetch_method
+           graph_to_program_pass
+           variable_helper
+           timer
+           monitor
+           fleet_executor)
   endif()
 elseif(WITH_PSLIB)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-      set(DISTRIBUTE_COMPILE_FLAGS
-              "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+  set(DISTRIBUTE_COMPILE_FLAGS
+      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
+  )
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+    set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
   endif()
-  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-  heterxpu_trainer.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
-  ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu
-  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-  lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
-  graph_to_program_pass variable_helper timer monitor fleet_executor ${BRPC_DEP})
+  set_source_files_properties(
+    executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  cc_library(
+    executor
+    SRCS executor.cc
+         multi_trainer.cc
+         pipeline_trainer.cc
+         dataset_factory.cc
+         dist_multi_trainer.cc
+         trainer_factory.cc
+         trainer.cc
+         data_feed_factory.cc
+         heterxpu_trainer.cc
+         data_feed.cc
+         device_worker.cc
+         hogwild_worker.cc
+         hetercpu_worker.cc
+         ps_gpu_worker.cc
+         ps_gpu_trainer.cc
+         downpour_worker.cc
+         downpour_worker_opt.cc
+         data_feed.cu
+         pull_dense_worker.cc
+         section_worker.cc
+         device_worker_factory.cc
+         data_set.cc
+    DEPS op_registry
+         device_context
+         scope
+         framework_proto
+         data_feed_proto
+         heter_service_proto
+         trainer_desc_proto
+         glog
+         lod_rank_table
+         fs
+         shell
+         fleet_wrapper
+         heter_wrapper
+         ps_gpu_wrapper
+         box_wrapper
+         lodtensor_printer
+         feed_fetch_method
+         graph_to_program_pass
+         variable_helper
+         timer
+         monitor
+         fleet_executor
+         ${BRPC_DEP})
 else()
-  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-  dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-  heterxpu_trainer.cc
-  data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
-  ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc data_feed.cu
-  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-  device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-  lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
-  graph_to_program_pass variable_helper timer monitor fleet_executor)
+  cc_library(
+    executor
+    SRCS executor.cc
+         multi_trainer.cc
+         pipeline_trainer.cc
+         dataset_factory.cc
+         dist_multi_trainer.cc
+         trainer_factory.cc
+         trainer.cc
+         data_feed_factory.cc
+         heterxpu_trainer.cc
+         data_feed.cc
+         device_worker.cc
+         hogwild_worker.cc
+         hetercpu_worker.cc
+         ps_gpu_worker.cc
+         ps_gpu_trainer.cc
+         downpour_worker.cc
+         downpour_worker_opt.cc
+         data_feed.cu
+         pull_dense_worker.cc
+         section_worker.cc
+         device_worker_factory.cc
+         data_set.cc
+    DEPS op_registry
+         device_context
+         scope
+         framework_proto
+         data_feed_proto
+         heter_service_proto
+         trainer_desc_proto
+         glog
+         lod_rank_table
+         fs
+         shell
+         fleet_wrapper
+         heter_wrapper
+         ps_gpu_wrapper
+         box_wrapper
+         lodtensor_printer
+         feed_fetch_method
+         graph_to_program_pass
+         variable_helper
+         timer
+         monitor
+         fleet_executor)
 endif()
 
-target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
-
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS
-        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor
-        graph build_strategy bind_threaded_ssa_graph_executor collective_helper
-        fast_threaded_ssa_graph_executor variable_helper)
-
-cc_library(executor_cache SRCS executor_cache.cc DEPS parallel_executor)
+target_link_libraries(executor while_op_helper executor_gc_helper
+                      recurrent_op_helper conditional_block_op_helper)
+
+cc_library(
+  parallel_executor
+  SRCS parallel_executor.cc
+  DEPS threaded_ssa_graph_executor
+       scope_buffered_ssa_graph_executor
+       parallel_ssa_graph_executor
+       async_ssa_graph_executor
+       graph
+       build_strategy
+       bind_threaded_ssa_graph_executor
+       collective_helper
+       fast_threaded_ssa_graph_executor
+       variable_helper)
+
+cc_library(
+  executor_cache
+  SRCS executor_cache.cc
+  DEPS parallel_executor)
 if(WITH_PSCORE)
-    get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
-    cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS
-        conditional_block_op executor gloo_wrapper ${RPC_DEPS})
-    cc_test(heter_pipeline_trainer_test SRCS heter_pipeline_trainer_test.cc DEPS
-           conditional_block_op scale_op heter_listen_and_serv_op executor heter_server gloo_wrapper eigen_function ${RPC_DEPS})
+  get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
+  cc_test(
+    dist_multi_trainer_test
+    SRCS dist_multi_trainer_test.cc
+    DEPS conditional_block_op executor gloo_wrapper ${RPC_DEPS})
+  cc_test(
+    heter_pipeline_trainer_test
+    SRCS heter_pipeline_trainer_test.cc
+    DEPS conditional_block_op
+         scale_op
+         heter_listen_and_serv_op
+         executor
+         heter_server
+         gloo_wrapper
+         eigen_function
+         ${RPC_DEPS})
 else()
-    cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS
-        conditional_block_op executor gloo_wrapper)
+  cc_test(
+    dist_multi_trainer_test
+    SRCS dist_multi_trainer_test.cc
+    DEPS conditional_block_op executor gloo_wrapper)
 endif()
-cc_library(prune SRCS prune.cc DEPS framework_proto boost)
-cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
-cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
-        proto_desc)
-cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS selected_rows)
-cc_test(selected_rows_utils_test SRCS selected_rows_utils_test.cc DEPS selected_rows_utils)
-
-cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
+cc_library(
+  prune
+  SRCS prune.cc
+  DEPS framework_proto boost)
+cc_test(
+  prune_test
+  SRCS prune_test.cc
+  DEPS op_info prune recurrent_op device_context)
+cc_test(
+  var_type_inference_test
+  SRCS var_type_inference_test.cc
+  DEPS op_registry proto_desc)
+cc_library(
+  selected_rows_utils
+  SRCS selected_rows_utils.cc
+  DEPS selected_rows)
+cc_test(
+  selected_rows_utils_test
+  SRCS selected_rows_utils_test.cc
+  DEPS selected_rows_utils)
+
+cc_test(
+  op_kernel_type_test
+  SRCS op_kernel_type_test.cc
+  DEPS place device_context framework_proto op_kernel_type)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 
-cc_test(tuple_test SRCS tuple_test.cc )
+cc_test(tuple_test SRCS tuple_test.cc)
 
 cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
 
-cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
-cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
-
-cc_library(op_compatible_info SRCS op_compatible_info.cc DEPS string_helper proto_desc)
-cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info proto_desc string_helper glog)
-
-cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
-cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
-cc_library(generator SRCS generator.cc DEPS enforce place)
-
-cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place var_type_traits phi phi_api_utils op_info shape_inference)
-cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor)
+cc_library(
+  dlpack_tensor
+  SRCS dlpack_tensor.cc
+  DEPS tensor dlpack)
+cc_test(
+  dlpack_tensor_test
+  SRCS dlpack_tensor_test.cc
+  DEPS dlpack_tensor glog)
+
+cc_library(
+  op_compatible_info
+  SRCS op_compatible_info.cc
+  DEPS string_helper proto_desc)
+cc_test(
+  op_compatible_info_test
+  SRCS op_compatible_info_test.cc
+  DEPS op_compatible_info proto_desc string_helper glog)
+
+cc_library(
+  save_load_util
+  SRCS save_load_util.cc
+  DEPS tensor scope layer)
+cc_test(
+  save_load_util_test
+  SRCS save_load_util_test.cc
+  DEPS save_load_util tensor scope layer)
+cc_library(
+  generator
+  SRCS generator.cc
+  DEPS enforce place)
+
+cc_library(
+  infershape_utils
+  SRCS infershape_utils.cc
+  DEPS lod_tensor
+       selected_rows_utils
+       attribute
+       place
+       var_type_traits
+       phi
+       phi_api_utils
+       op_info
+       shape_inference)
+cc_test(
+  infershape_utils_test
+  SRCS infershape_utils_test.cc
+  DEPS infershape_utils infermeta_utils meta_tensor)
 
 # Get the current working branch
 execute_process(
   COMMAND git rev-parse --abbrev-ref HEAD
-    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-      OUTPUT_VARIABLE PADDLE_BRANCH
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-	)
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_BRANCH
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
 
 # Get the latest abbreviated commit hash of the working branch
 execute_process(
   COMMAND git log -1 --format=%h
-    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-      OUTPUT_VARIABLE PADDLE_COMMIT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-	)
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_COMMIT
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
 
 message(STATUS "commit: ${PADDLE_COMMIT}")
 message(STATUS "branch: ${PADDLE_BRANCH}")
 
 configure_file(commit.h.in commit.h)
 
-cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api)
+cc_library(
+  custom_operator
+  SRCS custom_operator.cc
+  DEPS tensor
+       attribute
+       framework_proto
+       op_registry
+       operator
+       dynamic_loader
+       string_helper
+       phi_tensor
+       op_meta_info
+       phi_api)
 
 #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
 #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 
-set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
+set(FLUID_FRAMEWORK_MODULES
+    proto_desc
+    memory
+    lod_tensor
+    executor
+    data_feed_proto
+    layer
+    dynamic_loader
+    custom_operator)
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 
@@ -456,11 +1145,23 @@ if(WITH_TESTING AND TEST selected_rows_utils_test)
 endif()
 
 cc_test(scope_guard_test SRCS scope_guard_test.cc)
-cc_test(phi_utils_test SRCS phi_utils_test.cc DEPS phi_utils)
+cc_test(
+  phi_utils_test
+  SRCS phi_utils_test.cc
+  DEPS phi_utils)
 
 if(WITH_GPU OR WITH_ROCM)
-  cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
+  cc_library(
+    fluid_convert_utils
+    SRCS convert_utils.cc
+    DEPS data_type place gpu_info)
 else()
-  cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place)
+  cc_library(
+    fluid_convert_utils
+    SRCS convert_utils.cc
+    DEPS data_type place)
 endif()
-cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils)
+cc_test(
+  convert_utils_test
+  SRCS convert_utils_test.cc
+  DEPS fluid_convert_utils)
diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h
index d058938386343..6a8f4ff47f35d 100644
--- a/paddle/fluid/framework/archive.h
+++ b/paddle/fluid/framework/archive.h
@@ -20,6 +20,7 @@
 #endif
 
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <map>
 #include <memory>
@@ -31,6 +32,7 @@
 #include <utility>
 #include <valarray>
 #include <vector>
+
 #include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index ae3d8379bdbf7..d6cc5dc639fe6 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/async_executor.h"
+
+#include "gflags/gflags.h"
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
-
-#include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/executor_thread_worker.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h
index b0c6c8a01648f..01daf3c11187b 100644
--- a/paddle/fluid/framework/async_executor.h
+++ b/paddle/fluid/framework/async_executor.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <time.h>
+
 #include <map>
 #include <memory>
 #include <mutex>   // NOLINT
@@ -24,6 +25,7 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <typeinfo>
 #include <vector>
+
 #include "paddle/fluid/framework/data_feed.pb.h"
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/executor.h"
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
index 2164a21f3f892..b2c5bfde3aa56 100644
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
 #include <functional>
 #include <iosfwd>
 #include <string>
diff --git a/paddle/fluid/framework/attribute_test.cc b/paddle/fluid/framework/attribute_test.cc
index 27a6afb49f5e8..8a47e41d38359 100644
--- a/paddle/fluid/framework/attribute_test.cc
+++ b/paddle/fluid/framework/attribute_test.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/attribute.h"
+
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/program_desc.h"
-
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/utils/any.h"
 
 TEST(Attribute, GetAttrValueToAny) {
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 80fee94f1c85d..1eb3585fa3339 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -20,6 +20,7 @@
 #endif
 
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <condition_variable>  // NOLINT
 #include <deque>
@@ -28,6 +29,7 @@
 #include <mutex>  // NOLINT
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/expect.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/convert_utils_test.cc b/paddle/fluid/framework/convert_utils_test.cc
index 140806dfd7c5e..e3f5a4a8dcda1 100644
--- a/paddle/fluid/framework/convert_utils_test.cc
+++ b/paddle/fluid/framework/convert_utils_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/convert_utils.h"
+
 #include "gtest/gtest.h"
 
 namespace phi {
diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc
index d8c27ad280d18..d4f36be5e87e7 100644
--- a/paddle/fluid/framework/copy_same_tensor_test.cc
+++ b/paddle/fluid/framework/copy_same_tensor_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <sys/types.h>
+
 #include <random>
 
 #include "gflags/gflags.h"
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 65c41e19ac423..0130fd4b57ffa 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -867,43 +867,43 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
     bool is_double_grad = (i == 2);
 
     // GradOpDescMaker
-    info.grad_op_maker_ = [grad_op_name, grad_op_inputs, grad_op_outputs,
-                           is_double_grad](
-        const OpDesc& fwd_op,
-        const std::unordered_set<std::string>& no_grad_set,
-        std::unordered_map<std::string, std::string>* grad_to_var,
-        const std::vector<BlockDesc*>& grad_block) {
-      CustomGradOpMaker<paddle::framework::OpDesc> maker(
-          fwd_op, no_grad_set, grad_to_var, grad_block, grad_op_name,
-          grad_op_inputs, grad_op_outputs, is_double_grad);
-      return maker();
-    };
+    info.grad_op_maker_ =
+        [grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad](
+            const OpDesc& fwd_op,
+            const std::unordered_set<std::string>& no_grad_set,
+            std::unordered_map<std::string, std::string>* grad_to_var,
+            const std::vector<BlockDesc*>& grad_block) {
+          CustomGradOpMaker<paddle::framework::OpDesc> maker(
+              fwd_op, no_grad_set, grad_to_var, grad_block, grad_op_name,
+              grad_op_inputs, grad_op_outputs, is_double_grad);
+          return maker();
+        };
 
     // GradOpBaseMaker
-    info.dygraph_grad_op_maker_ = [grad_op_name, grad_op_inputs,
-                                   grad_op_outputs, is_double_grad](
-        const std::string& type,
-        const imperative::NameVarBaseMap& var_base_map_in,
-        const imperative::NameVarBaseMap& var_base_map_out,
-        const framework::AttributeMap& attrs,
-        const framework::AttributeMap& default_attrs,
-        const std::map<std::string, std::string>& inplace_map) {
-      CustomGradOpMaker<paddle::imperative::OpBase> maker(
-          type, var_base_map_in, var_base_map_out, attrs, inplace_map,
-          grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad);
-      maker.SetDygraphDefaultAttrsMap(default_attrs);
-      return maker();
-    };
+    info.dygraph_grad_op_maker_ =
+        [grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad](
+            const std::string& type,
+            const imperative::NameVarBaseMap& var_base_map_in,
+            const imperative::NameVarBaseMap& var_base_map_out,
+            const framework::AttributeMap& attrs,
+            const framework::AttributeMap& default_attrs,
+            const std::map<std::string, std::string>& inplace_map) {
+          CustomGradOpMaker<paddle::imperative::OpBase> maker(
+              type, var_base_map_in, var_base_map_out, attrs, inplace_map,
+              grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad);
+          maker.SetDygraphDefaultAttrsMap(default_attrs);
+          return maker();
+        };
 
     /* Grad op register */
     OpInfo grad_info;
 
     // Grad Op
-    grad_info.creator_ = [](
-        const std::string& type, const VariableNameMap& inputs,
-        const VariableNameMap& outputs, const AttributeMap& attrs) {
-      return new CustomOperator(type, inputs, outputs, attrs);
-    };
+    grad_info.creator_ =
+        [](const std::string& type, const VariableNameMap& inputs,
+           const VariableNameMap& outputs, const AttributeMap& attrs) {
+          return new CustomOperator(type, inputs, outputs, attrs);
+        };
 
     // Grad InferShape
     if (grad_infer_shape_fn == nullptr) {
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index 4757eb60f4361..d51707970ffe4 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -13,18 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#include "paddle/fluid/framework/phi_utils.h"
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index b63f317aae893..1808caddabccd 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #endif
 
 #include "paddle/fluid/framework/data_feed.h"
+
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #ifdef _LINUX
 #include <stdio_ext.h>
@@ -220,6 +221,7 @@ bool DataFeed::PickOneFile(std::string* filename) {
       file_idx_, platform::errors::PreconditionNotMet(
                      "You should call SetFileListIndex before PickOneFile"));
   std::unique_lock<std::mutex> lock(*mutex_for_pick_file_);
+  VLOG(4) << "filelist_ size: " << filelist_.size();
   if (*file_idx_ == filelist_.size()) {
     VLOG(3) << "DataFeed::PickOneFile no more file to pick";
     return false;
@@ -230,8 +232,9 @@ bool DataFeed::PickOneFile(std::string* filename) {
 }
 
 void DataFeed::CheckInit() {
-  PADDLE_ENFORCE_EQ(finish_init_, true, platform::errors::PreconditionNotMet(
-                                            "DataFeed initialization failed."));
+  PADDLE_ENFORCE_EQ(
+      finish_init_, true,
+      platform::errors::PreconditionNotMet("DataFeed initialization failed."));
 }
 
 void DataFeed::CheckSetFileList() {
@@ -284,6 +287,7 @@ void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
 
 template <typename T>
 bool PrivateQueueDataFeed<T>::Start() {
+  VLOG(4) << "entering PrivateQueueDataFeed<T>::Start()";
   CheckSetFileList();
   read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
   read_thread_.detach();
@@ -295,6 +299,7 @@ bool PrivateQueueDataFeed<T>::Start() {
 template <typename T>
 void PrivateQueueDataFeed<T>::ReadThread() {
 #ifdef _LINUX
+  VLOG(4) << "entering PrivateQueueDataFeed<T>::ReadThread()";
   std::string filename;
   while (PickOneFile(&filename)) {
     int err_no = 0;
@@ -356,6 +361,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
 template <typename T>
 bool InMemoryDataFeed<T>::Start() {
 #ifdef _LINUX
+  VLOG(4) << "entering InMemoryDataFeed<T>::Start()";
   this->CheckSetFileList();
   if (output_channel_->Size() == 0 && input_channel_->Size() != 0) {
     std::vector<T> data;
@@ -664,6 +670,7 @@ void MultiSlotDataFeed::Init(
 
 void MultiSlotDataFeed::ReadThread() {
 #ifdef _LINUX
+  VLOG(4) << "entering MultiSlotDataFeed::ReadThread()";
   std::string filename;
   while (PickOneFile(&filename)) {
     int err_no = 0;
@@ -831,7 +838,6 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
   } else {
     int use_slots_num = use_slots_.size();
     instance->resize(use_slots_num);
-
     const char* str = reader.get();
     std::string line = std::string(str);
 
@@ -971,10 +977,13 @@ void MultiSlotDataFeed::PutToFeedVec(
     if (feed_vec_[i] == nullptr) {
       continue;
     }
+    VLOG(4) << "MultiSlotDataFeed::PutToFeedVec i: " << i;
     const auto& type = ins_vec[i].GetType();
     const auto& offset = ins_vec[i].GetOffset();
     int total_instance = static_cast<int>(offset.back());
-
+    VLOG(4) << "total_instance: " << total_instance;
+    // platform::CPUPlace()
+    VLOG(4) << "this->place_: " << this->place_;
     if (type[0] == 'f') {  // float
       const auto& feasign = ins_vec[i].GetFloatData();
       float* tensor_ptr =
@@ -1612,9 +1621,10 @@ template class PrivateInstantDataFeed<std::vector<MultiSlotType>>;
 bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) {
   fd_ = open(filename.c_str(), O_RDONLY);
   PADDLE_ENFORCE_NE(
-      fd_, -1, platform::errors::Unavailable(
-                   "Fail to open file: %s in MultiSlotFileInstantDataFeed.",
-                   filename.c_str()));
+      fd_, -1,
+      platform::errors::Unavailable(
+          "Fail to open file: %s in MultiSlotFileInstantDataFeed.",
+          filename.c_str()));
 
   struct stat sb;
   fstat(fd_, &sb);
@@ -2175,7 +2185,7 @@ void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLine(void) {
     SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE);
     // get slotrecord object function
     auto record_func = [this, &offset, &record_vec, &old_offset](
-        std::vector<SlotRecord>& vec, int num) {
+                           std::vector<SlotRecord>& vec, int num) {
       vec.resize(num);
       if (offset + num > OBJPOOL_BLOCK_SIZE) {
         input_channel_->WriteMove(offset, &record_vec[0]);
@@ -2573,6 +2583,7 @@ void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) {
 }
 
 bool SlotRecordInMemoryDataFeed::Start() {
+  VLOG(4) << "entering SlotRecordInMemoryDataFeed::Start";
 #ifdef _LINUX
   this->CheckSetFileList();
   if (input_channel_->Size() != 0) {
@@ -2667,8 +2678,8 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
     size_t* off_start_ptr = &offsets[j * offset_cols_size];
 
     int total_instance = static_cast<int>(off_start_ptr[offset_cols_size - 1]);
-    CHECK(total_instance >= 0) << "slot idx:" << j
-                               << ", total instance:" << total_instance;
+    CHECK(total_instance >= 0)
+        << "slot idx:" << j << ", total instance:" << total_instance;
     auto& info = used_slots_info_[j];
 
     // fill slot value with default value 0
diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc
index e46e4aeb0124c..e058b19469000 100644
--- a/paddle/fluid/framework/data_feed_factory.cc
+++ b/paddle/fluid/framework/data_feed_factory.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_feed_factory.h"
 
 #include <stdlib.h>
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc
index 2cc441bbd34cb..8375ed80e8319 100644
--- a/paddle/fluid/framework/data_feed_test.cc
+++ b/paddle/fluid/framework/data_feed_test.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/data_feed.h"
+
 #include <fcntl.h>
+
 #include <chrono>  // NOLINT
 #include <fstream>
 #include <iostream>
@@ -23,6 +25,7 @@
 #include <thread>  // NOLINT
 #include <utility>
 #include <vector>
+
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc
index 0c762ab2e77e5..f89d0f969abb2 100644
--- a/paddle/fluid/framework/data_set.cc
+++ b/paddle/fluid/framework/data_set.cc
@@ -13,6 +13,7 @@
  *     limitations under the License. */
 
 #include "paddle/fluid/framework/data_set.h"
+
 #include "google/protobuf/text_format.h"
 #if (defined PADDLE_WITH_DISTRIBUTE) && (defined PADDLE_WITH_PSCORE)
 #include "paddle/fluid/distributed/index_dataset/index_sampler.h"
diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h
index 3d096eaebe344..5d961841a250b 100644
--- a/paddle/fluid/framework/data_set.h
+++ b/paddle/fluid/framework/data_set.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <ThreadPool.h>
+
 #include <fstream>
 #include <memory>
 #include <mutex>  // NOLINT
@@ -26,6 +27,7 @@
 #include <vector>
 #ifdef PADDLE_WITH_GLOO
 #include <gloo/broadcast.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
index 15cf30c1cf352..01802c11d5219 100644
--- a/paddle/fluid/framework/data_type_test.cc
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -44,8 +44,8 @@ TEST(DataType, float16) {
 
 TEST(DataType, bfloat16) {
   using paddle::framework::Tensor;
-  using paddle::platform::CPUPlace;
   using paddle::platform::bfloat16;
+  using paddle::platform::CPUPlace;
   namespace f = paddle::framework;
   f::proto::VarType::Type dtype = f::proto::VarType::BF16;
 
diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu
index 4fab3a7845489..3420298297b3f 100644
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ b/paddle/fluid/framework/data_type_transform_test.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/tensor_util.h"
 
-#include "gtest/gtest.h"
-
 TEST(DataTypeTransform, GPUTransform) {
   auto cpu_place = paddle::platform::CPUPlace();
   auto gpu_place = paddle::platform::CUDAPlace(0);
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 948eaab40b4f6..e193274ff2137 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,96 +1,284 @@
-cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
-cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
+cc_library(
+  var_handle
+  SRCS var_handle.cc
+  DEPS place framework_proto node)
+cc_library(
+  op_handle_base
+  SRCS op_handle_base.cc
+  DEPS var_handle device_context lod_tensor)
 
-cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-cc_library(fetch_async_op_handle SRCS fetch_async_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(
+  scale_loss_grad_op_handle
+  SRCS scale_loss_grad_op_handle.cc
+  DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(
+  fetch_op_handle
+  SRCS fetch_op_handle.cc
+  DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(
+  fetch_async_op_handle
+  SRCS fetch_async_op_handle.cc
+  DEPS op_handle_base scope lod_tensor ddim memory)
 
-cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry) 
-cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
-cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
+cc_library(
+  share_tensor_buffer_functor
+  SRCS share_tensor_buffer_functor.cc
+  DEPS framework_proto scope place operator op_registry)
+cc_library(
+  computation_op_handle
+  SRCS computation_op_handle.cc
+  DEPS framework_proto scope place operator op_registry)
+cc_library(
+  share_tensor_buffer_op_handle
+  SRCS share_tensor_buffer_op_handle.cc
+  DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
+cc_library(
+  rpc_op_handle
+  SRCS rpc_op_handle.cc
+  DEPS framework_proto scope place operator op_registry)
+cc_library(
+  fetch_barrier_op_handle
+  SRCS fetch_barrier_op_handle.cc
+  DEPS framework_proto scope place operator op_registry)
+cc_library(
+  multi_devices_helper
+  SRCS multi_devices_helper.cc
+  DEPS graph graph_helper)
 
-cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows_utils)
+cc_library(
+  variable_visitor
+  SRCS variable_visitor.cc
+  DEPS lod_tensor selected_rows_utils)
 
 if(WITH_PSCORE)
-    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-    set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set(DISTRIBUTE_COMPILE_FLAGS
+      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
+  )
+  set_source_files_properties(
+    reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS
+                                              ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS
+                                           ${DISTRIBUTE_COMPILE_FLAGS})
 endif()
 
-
 if(WITH_GPU)
-    nv_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place)
-    nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor)
-    nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor place device_memory_aligment)
-    nv_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
-            ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
-
-    if(WITH_DGC)
-        nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope 
-            lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle)
-    endif()
-
-    if(WITH_DISTRIBUTE)
-        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-                ddim dynload_cuda selected_rows_functor)
-    else()
-        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim dynload_cuda selected_rows_functor)
-    endif()
-    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
-    nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
+  nv_library(
+    nan_inf_utils
+    SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu
+    DEPS framework_proto scope place)
+  nv_library(
+    all_reduce_op_handle
+    SRCS all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         dynload_cuda
+         variable_visitor)
+  nv_library(
+    fused_all_reduce_op_handle
+    SRCS fused_all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         dynload_cuda
+         variable_visitor
+         place
+         device_memory_aligment)
+  nv_library(
+    grad_merge_all_reduce_op_handle
+    SRCS grad_merge_all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         dynload_cuda
+         variable_visitor
+         place
+         device_memory_aligment
+         all_reduce_op_handle
+         fused_all_reduce_op_handle)
+
+  if(WITH_DGC)
+    nv_library(
+      sparse_all_reduce_op_handle
+      SRCS sparse_all_reduce_op_handle.cc
+      DEPS op_handle_base
+           scope
+           lod_tensor
+           ddim
+           memory
+           dynload_cuda
+           variable_visitor
+           dgc
+           all_reduce_op_handle)
+  endif()
+
+  if(WITH_DISTRIBUTE)
+    nv_library(
+      reduce_op_handle
+      SRCS reduce_op_handle.cc
+      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
+           selected_rows_functor)
+  else()
+    nv_library(
+      reduce_op_handle
+      SRCS reduce_op_handle.cc
+      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
+           selected_rows_functor)
+  endif()
+  nv_library(
+    broadcast_op_handle
+    SRCS broadcast_op_handle.cc
+    DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+  nv_library(
+    fused_broadcast_op_handle
+    SRCS fused_broadcast_op_handle.cc
+    DEPS broadcast_op_handle)
 elseif(WITH_ROCM)
-    hip_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place)
-    hip_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor)
-    hip_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor place device_memory_aligment)
-    hip_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
-            ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
-
-    if(WITH_DISTRIBUTE)
-        hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-                    ddim dynload_cuda selected_rows_functor)
-    else()
-        hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-                    ddim dynload_cuda selected_rows_functor)
-    endif()
-    hip_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
-    hip_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
+  hip_library(
+    nan_inf_utils
+    SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu
+    DEPS framework_proto scope place)
+  hip_library(
+    all_reduce_op_handle
+    SRCS all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         dynload_cuda
+         variable_visitor)
+  hip_library(
+    fused_all_reduce_op_handle
+    SRCS fused_all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         dynload_cuda
+         variable_visitor
+         place
+         device_memory_aligment)
+  hip_library(
+    grad_merge_all_reduce_op_handle
+    SRCS grad_merge_all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         dynload_cuda
+         variable_visitor
+         place
+         device_memory_aligment
+         all_reduce_op_handle
+         fused_all_reduce_op_handle)
+
+  if(WITH_DISTRIBUTE)
+    hip_library(
+      reduce_op_handle
+      SRCS reduce_op_handle.cc
+      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
+           selected_rows_functor)
+  else()
+    hip_library(
+      reduce_op_handle
+      SRCS reduce_op_handle.cc
+      DEPS op_handle_base variable_visitor scope ddim dynload_cuda
+           selected_rows_functor)
+  endif()
+  hip_library(
+    broadcast_op_handle
+    SRCS broadcast_op_handle.cc
+    DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+  hip_library(
+    fused_broadcast_op_handle
+    SRCS fused_broadcast_op_handle.cc
+    DEPS broadcast_op_handle)
 else()
-    if (WITH_ASCEND_CL)
-        cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS npu_op_runner framework_proto scope place)
-    else()
-        cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place)
-    endif()
-    cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-             variable_visitor)
-    cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            variable_visitor place device_memory_aligment)
-    cc_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
-            ddim memory variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
-    if(WITH_DISTRIBUTE)
-        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-                ddim selected_rows_functor)
-    else()
-        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-            ddim selected_rows_functor)
-    endif()
-    cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-    cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
+  if(WITH_ASCEND_CL)
+    cc_library(
+      nan_inf_utils
+      SRCS nan_inf_utils_detail.cc
+      DEPS npu_op_runner framework_proto scope place)
+  else()
+    cc_library(
+      nan_inf_utils
+      SRCS nan_inf_utils_detail.cc
+      DEPS framework_proto scope place)
+  endif()
+  cc_library(
+    all_reduce_op_handle
+    SRCS all_reduce_op_handle.cc
+    DEPS op_handle_base scope lod_tensor ddim memory variable_visitor)
+  cc_library(
+    fused_all_reduce_op_handle
+    SRCS fused_all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         variable_visitor
+         place
+         device_memory_aligment)
+  cc_library(
+    grad_merge_all_reduce_op_handle
+    SRCS grad_merge_all_reduce_op_handle.cc
+    DEPS op_handle_base
+         scope
+         lod_tensor
+         ddim
+         memory
+         variable_visitor
+         place
+         device_memory_aligment
+         all_reduce_op_handle
+         fused_all_reduce_op_handle)
+  if(WITH_DISTRIBUTE)
+    cc_library(
+      reduce_op_handle
+      SRCS reduce_op_handle.cc
+      DEPS op_handle_base variable_visitor scope ddim selected_rows_functor)
+  else()
+    cc_library(
+      reduce_op_handle
+      SRCS reduce_op_handle.cc
+      DEPS op_handle_base variable_visitor scope ddim selected_rows_functor)
+  endif()
+  cc_library(
+    broadcast_op_handle
+    SRCS broadcast_op_handle.cc
+    DEPS op_handle_base scope ddim memory variable_visitor)
+  cc_library(
+    fused_broadcast_op_handle
+    SRCS fused_broadcast_op_handle.cc
+    DEPS broadcast_op_handle)
 endif()
 
-cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(
+  gather_op_handle
+  SRCS gather_op_handle.cc
+  DEPS op_handle_base scope ddim memory variable_visitor)
 
-cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows_utils reference_count_pass_helper)
+cc_library(
+  eager_deletion_op_handle
+  SRCS eager_deletion_op_handle.cc
+  DEPS lod_tensor selected_rows_utils reference_count_pass_helper)
 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
+set(SSA_GRAPH_EXECUTOR_DEPS
+    graph
+    framework_proto
     multi_devices_helper
     reference_count_pass
     eager_deletion_pass
@@ -98,60 +286,122 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
     buffer_shared_cross_op_memory_reuse_pass
     inplace_addto_op_pass
     set_reader_device_info_utils)
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
+cc_library(
+  ssa_graph_executor
+  SRCS ssa_graph_executor.cc
+  DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 
-cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
-        simple_threadpool device_context)
+cc_library(
+  threaded_ssa_graph_executor
+  SRCS threaded_ssa_graph_executor.cc
+  DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool
+       device_context)
 
-cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
+cc_library(
+  parallel_ssa_graph_executor
+  SRCS parallel_ssa_graph_executor.cc
+  DEPS threaded_ssa_graph_executor)
 
 set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor)
 
-cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS})
+cc_library(
+  async_ssa_graph_executor
+  SRCS async_ssa_graph_executor.cc
+  DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS})
 
-cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context broadcast_op_handle)
-cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context gather_op_handle)
+cc_test(
+  broadcast_op_test
+  SRCS broadcast_op_handle_test.cc
+  DEPS var_handle
+       op_handle_base
+       scope
+       ddim
+       memory
+       device_context
+       broadcast_op_handle)
+cc_test(
+  gather_op_test
+  SRCS gather_op_handle_test.cc
+  DEPS var_handle
+       op_handle_base
+       scope
+       ddim
+       memory
+       device_context
+       gather_op_handle)
 
-cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows_utils)
-cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor)
+cc_library(
+  scope_buffered_monitor
+  SRCS scope_buffered_monitor.cc
+  DEPS scope profiler selected_rows_utils)
+cc_library(
+  scope_buffered_ssa_graph_executor
+  SRCS scope_buffered_ssa_graph_executor.cc
+  DEPS ssa_graph_executor scope_buffered_monitor)
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
-cc_library(bind_threaded_ssa_graph_executor SRCS bind_threaded_ssa_graph_executor.cc
-        DEPS fetch_op_handle gflags ssa_graph_executor scope simple_threadpool device_context)
-cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
-        DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool device_context)
-cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
-
-cc_test(exception_holder_test SRCS exception_holder_test.cc )
-
-set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
-    multi_devices_graph_print_pass multi_devices_graph_check_pass
-    fuse_elewise_add_act_pass fuse_bn_act_pass fuse_bn_add_act_pass 
-    multi_batch_merge_pass 
+cc_library(
+  bind_threaded_ssa_graph_executor
+  SRCS bind_threaded_ssa_graph_executor.cc
+  DEPS fetch_op_handle gflags ssa_graph_executor scope simple_threadpool
+       device_context)
+cc_library(
+  fast_threaded_ssa_graph_executor
+  SRCS fast_threaded_ssa_graph_executor.cc
+  DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool
+       device_context)
+cc_test(
+  fused_broadcast_op_test
+  SRCS fused_broadcast_op_handle_test.cc
+  DEPS fused_broadcast_op_handle)
+
+cc_test(exception_holder_test SRCS exception_holder_test.cc)
+
+set(IR_PASS_DEPS
+    graph_viz_pass
+    multi_devices_graph_pass
+    multi_devices_graph_print_pass
+    multi_devices_graph_check_pass
+    fuse_elewise_add_act_pass
+    fuse_bn_act_pass
+    fuse_bn_add_act_pass
+    multi_batch_merge_pass
     fuse_relu_depthwise_conv_pass
     lock_free_optimize_pass
     sequential_execution_pass
     all_reduce_deps_pass
     add_reader_dependency_pass
     modify_op_lock_and_record_event_pass
-    coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
-    fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
-    sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass
-    fix_op_run_order_pass fuse_gemm_epilogue_pass)
+    coalesce_grad_tensor_pass
+    fuse_all_reduce_op_pass
+    backward_optimizer_op_deps_pass
+    fuse_adam_op_pass
+    fuse_sgd_op_pass
+    fuse_momentum_op_pass
+    sync_batch_norm_pass
+    runtime_context_cache_pass
+    graph_to_program_pass
+    fix_op_run_order_pass
+    fuse_gemm_epilogue_pass)
 
-if (WITH_CINN)
+if(WITH_CINN)
   set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass)
 endif()
 
-if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
+if(NOT APPLE
+   AND NOT WIN32
+   AND (WITH_GPU OR WITH_ROCM))
   set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
 endif()
-cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
-cc_test(build_strategy_test SRCS build_strategy_test.cc
-        DEPS build_strategy op_registry op_proto_maker graph string_helper)
+cc_library(
+  build_strategy
+  SRCS build_strategy.cc
+  DEPS pass_builder ${IR_PASS_DEPS})
+cc_test(
+  build_strategy_test
+  SRCS build_strategy_test.cc
+  DEPS build_strategy op_registry op_proto_maker graph string_helper)
 
-if (WITH_MKLDNN)
+if(WITH_MKLDNN)
   target_link_libraries(build_strategy mkldnn_placement_pass)
 endif()
diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc
index 75baf15dc5ec9..ebdf66cdde131 100644
--- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h"
+
 #include <deque>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h
index 5e973f13cc618..c907a4b4afc7c 100644
--- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h
@@ -14,12 +14,14 @@
 
 #pragma once
 #include <ThreadPool.h>
+
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
diff --git a/paddle/fluid/framework/details/bkcl_op_handle.h b/paddle/fluid/framework/details/bkcl_op_handle.h
index 1a098f06f08f9..b0c2275b3a52b 100644
--- a/paddle/fluid/framework/details/bkcl_op_handle.h
+++ b/paddle/fluid/framework/details/bkcl_op_handle.h
@@ -14,8 +14,6 @@
 
 #pragma once
 
-#include "xpu/bkcl.h"
-
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -24,6 +22,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
+#include "xpu/bkcl.h"
 
 DECLARE_bool(sync_bkcl_allreduce);
 
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index fdf74d2f769fc..9ed76c87d846c 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/build_strategy.h"
 
 #include <glog/logging.h>
+
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/ir/graph_printer.h"
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
diff --git a/paddle/fluid/framework/details/build_strategy_test.cc b/paddle/fluid/framework/details/build_strategy_test.cc
index 69af77d23fbf4..1914c1d33de01 100644
--- a/paddle/fluid/framework/details/build_strategy_test.cc
+++ b/paddle/fluid/framework/details/build_strategy_test.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/details/build_strategy.h"
+
 #include <algorithm>
 #include <memory>
 #include <string>
@@ -23,8 +25,6 @@
 #include "gtest/gtest-test-part.h"
 #include "gtest/gtest.h"
 #include "gtest/gtest_pred_impl.h"
-
-#include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"
diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc
index 5b055d7cb4d12..b440da9f1dfb4 100644
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/fluid/framework/details/cow_ptr.h"
+
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 7f51de435ba6c..57440ed9aa2f4 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <cstddef>  // for size_t
+
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index ce471d55b24a1..8b5c3c1798780 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -46,6 +46,12 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
     VLOG(10)
         << "Change thread number to 1 because the toposort order is unique";
     strategy_.num_threads_ = 1;
+    traced_ops_.clear();
+    for (auto *op_node : TopologySortOperations(*graph_)) {
+      if (op_node->IsWrappedBy<OpHandleBase>()) {
+        traced_ops_.emplace_back(&(op_node->Wrapper<OpHandleBase>()));
+      }
+    }
   }
   pool_.reset(new ::ThreadPool(strategy.num_threads_));
   for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index 4477702900a8d..19b0061571596 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -14,10 +14,12 @@
 
 #pragma once
 #include <ThreadPool.h>
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index f4ca4907d48d0..7f44e68af6b0b 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -325,9 +325,10 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
 
     PADDLE_ENFORCE_EQ(
         platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
-        true, platform::errors::InvalidArgument(
-                  "The variable '%s' at scope %d is not in the right place.",
-                  var_name, scope_idx));
+        true,
+        platform::errors::InvalidArgument(
+            "The variable '%s' at scope %d is not in the right place.",
+            var_name, scope_idx));
     grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
   }
 }
@@ -356,10 +357,11 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
     // Get element number
     int64_t len = grad_tensor.at(i).second->numel();
     PADDLE_ENFORCE_GT(
-        len, 0, platform::errors::InvalidArgument(
-                    "The size of grad tensors of fused_all_reduce_op_handle  "
-                    "must be > 0, but got %d.",
-                    len));
+        len, 0,
+        platform::errors::InvalidArgument(
+            "The size of grad tensors of fused_all_reduce_op_handle  "
+            "must be > 0, but got %d.",
+            len));
     *numel +=
         platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc
index 44b9ca90fc540..18de9f443a72f 100644
--- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h"
+
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h
index d139f8488309e..08d9c999a8a5d 100644
--- a/paddle/fluid/framework/details/graph_test_base.h
+++ b/paddle/fluid/framework/details/graph_test_base.h
@@ -18,6 +18,7 @@
 #include <iostream>
 #include <iterator>
 #include <string>
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/ir/graph.h"
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index e6790de92d054..7b93baddb4af6 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/details/nan_inf_utils_detail.h"
+
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/scope.h"
 
@@ -261,7 +262,7 @@ void CheckNanInf<paddle::platform::complex<float>>(
 }
 
 template <>
-    void CheckNanInf<paddle::platform::complex<double>>>
+    void CheckNanInf < paddle::platform::complex < double >>>
     (const paddle::platform::complex<double>* value, const size_t numel,
      int print_num, const std::string& op_type, const std::string& var_name) {
   double real_sum = 0.0;
@@ -563,8 +564,9 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
 
   if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
 
-  PADDLE_ENFORCE_LT(sum, 1.0, platform::errors::PreconditionNotMet(
-                                  "Operator %s contains Nan/Inf.", op.Type()));
+  PADDLE_ENFORCE_LT(sum, 1.0,
+                    platform::errors::PreconditionNotMet(
+                        "Operator %s contains Nan/Inf.", op.Type()));
 }
 #endif
 
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index 7cf11f7829da9..b8b5537c93cca 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -12,15 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"
-
 #include <algorithm>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
+#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"
 #include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 427b981e7cda2..213d70337648a 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -213,14 +213,14 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
         platform::errors::AlreadyExists(
             "GradOpDescMaker of %s has been registered", op_type));
 
-    info->grad_op_maker_ = [](
-        const OpDesc& fwd_op,
-        const std::unordered_set<std::string>& no_grad_set,
-        std::unordered_map<std::string, std::string>* grad_to_var,
-        const std::vector<BlockDesc*>& grad_block) {
-      T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
-      return maker();
-    };
+    info->grad_op_maker_ =
+        [](const OpDesc& fwd_op,
+           const std::unordered_set<std::string>& no_grad_set,
+           std::unordered_map<std::string, std::string>* grad_to_var,
+           const std::vector<BlockDesc*>& grad_block) {
+          T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
+          return maker();
+        };
 
     info->use_default_grad_op_desc_maker_ =
         std::is_base_of<DefaultGradOpMaker<OpDesc, true>, T>::value ||
@@ -244,17 +244,17 @@ struct OpInfoFiller<T, kGradOpBaseMaker> {
         platform::errors::AlreadyExists(
             "GradOpBaseMaker of %s has been registered", op_type));
 
-    info->dygraph_grad_op_maker_ = [](
-        const std::string& type,
-        const imperative::NameVarBaseMap& var_base_map_in,
-        const imperative::NameVarBaseMap& var_base_map_out,
-        const framework::AttributeMap& attrs,
-        const framework::AttributeMap& default_attrs,
-        const std::map<std::string, std::string>& inplace_map) {
-      T maker(type, var_base_map_in, var_base_map_out, attrs, inplace_map);
-      maker.SetDygraphDefaultAttrsMap(default_attrs);
-      return maker();
-    };
+    info->dygraph_grad_op_maker_ =
+        [](const std::string& type,
+           const imperative::NameVarBaseMap& var_base_map_in,
+           const imperative::NameVarBaseMap& var_base_map_out,
+           const framework::AttributeMap& attrs,
+           const framework::AttributeMap& default_attrs,
+           const std::map<std::string, std::string>& inplace_map) {
+          T maker(type, var_base_map_in, var_base_map_out, attrs, inplace_map);
+          maker.SetDygraphDefaultAttrsMap(default_attrs);
+          return maker();
+        };
   }
 };
 
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 936e84a6c82b9..22c27fe86f1ae 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -90,10 +90,9 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const std::vector<platform::Place> &places, ir::Graph *graph)
     // TODO(Yancey1989): Copying graphs is not safely since it deleted the
     // attrs.
-    : ParallelSSAGraphExecutor(strategy, local_scopes, local_exec_scopes,
-                               places,
-                               SeparateMultiDevicesGraph(graph,
-                                                         places.size())) {}
+    : ParallelSSAGraphExecutor(
+          strategy, local_scopes, local_exec_scopes, places,
+          SeparateMultiDevicesGraph(graph, places.size())) {}
 
 ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index d9d83efcb8e9b..88c8b1cbfb294 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "ThreadPool.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 2ae3880ab3c2c..799005e4b09bb 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -245,14 +245,15 @@ void ReduceOpHandle::RunImpl() {
         int type = platform::ToBKCLDataType(
             framework::TransToProtoVarType(lod_tensor.dtype()));
         size_t numel = static_cast<size_t>(lod_tensor.numel());
-        all_reduce_calls.emplace_back([buffer, recvbuffer, type, numel, root_id,
-                                       &bkcl_ctx] {
-          PADDLE_ENFORCE_EQ(bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer,
-                                        numel, static_cast<BKCLDataType>(type),
-                                        BKCL_ADD, root_id, nullptr),
-                            BKCL_SUCCESS, platform::errors::Unavailable(
-                                              "bkcl_all_reduce failed"));
-        });
+        all_reduce_calls.emplace_back(
+            [buffer, recvbuffer, type, numel, root_id, &bkcl_ctx] {
+              PADDLE_ENFORCE_EQ(
+                  bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer, numel,
+                              static_cast<BKCLDataType>(type), BKCL_ADD,
+                              root_id, nullptr),
+                  BKCL_SUCCESS,
+                  platform::errors::Unavailable("bkcl_all_reduce failed"));
+            });
       }
 
       WaitInputVarGenerated();
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
index 39bcf1d0f385f..35373e1a7090b 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc
index 57faf0e75ba99..bd1a4378f0729 100644
--- a/paddle/fluid/framework/details/scope_buffered_monitor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/scope_buffered_monitor.h"
+
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index ea5a3c07957bf..091224f1e59bc 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <ThreadPool.h>
+
 #include <deque>
 #include <list>
 #include <memory>
@@ -21,6 +22,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/scope_buffered_monitor.h"
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
index 7e63c5ffb9a44..28a5c31f6440f 100644
--- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
@@ -41,8 +41,9 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
       is_encoded_(is_encoded),
       nranks_(nranks) {
   // TODO(gongwb) :polish them!
-  PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
-                                          "The argument is_encoded is false."));
+  PADDLE_ENFORCE_EQ(
+      is_encoded, true,
+      platform::errors::InvalidArgument("The argument is_encoded is false."));
   VLOG(1) << "Use dgc allreduce mode"
           << ", nranks:" << nranks_;
 
@@ -193,11 +194,12 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
 
     sparse_reduce_calls.emplace_back([=] {
       platform::CUDADeviceGuard guard(dev_id);
-      PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
-                            gather_buff, k, out_tensor_buf,
-                            static_cast<int>(out_numel), nranks_, stream),
-                        true, platform::errors::Unavailable(
-                                  "Calling sparseReduce() failed."));
+      PADDLE_ENFORCE_EQ(
+          paddle::communication::dgc::sparseReduce(
+              gather_buff, k, out_tensor_buf, static_cast<int>(out_numel),
+              nranks_, stream),
+          true,
+          platform::errors::Unavailable("Calling sparseReduce() failed."));
     });
   }
 
diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc
index 880261436831d..56cd12f500168 100644
--- a/paddle/fluid/framework/device_worker.cc
+++ b/paddle/fluid/framework/device_worker.cc
@@ -190,9 +190,10 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode,
       tensor = &cpu_tensor;
     }
     if (!CheckValidOutput(tensor, batch_size)) {
-      VLOG(0) << "Note: field[" << field << "] cannot pass check, so it was "
-                                            "skipped. Maybe the dimension is "
-                                            "wrong ";
+      VLOG(0) << "Note: field[" << field
+              << "] cannot pass check, so it was "
+                 "skipped. Maybe the dimension is "
+                 "wrong ";
       continue;
     }
     for (size_t i = 0; i < batch_size; ++i) {
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index e6635a2f941cd..c973afd156085 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker_factory.h"
 
 #include <stdlib.h>
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
old mode 100644
new mode 100755
index fff78dd872c99..b3a01ae169e4e
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -120,6 +120,7 @@ message BuildStrategy {
   optional bool fix_op_run_order = 13 [ default = false ];
   optional bool allow_cuda_graph_capture = 14 [ default = false ];
   optional int32 reduce_strategy = 15 [ default = 0 ];
+  optional bool fuse_gemm_epilogue = 16 [ default = false ];
 }
 
 message ExecutionStrategy {
@@ -314,6 +315,7 @@ message DistributedStrategy {
   optional bool adam_d2sum = 36 [ default = false ];
   optional bool auto_search = 37 [ default = false ];
   optional bool heter_ccl_mode = 38 [ default = false ];
+  optional bool is_fl_ps_mode = 39 [ default = false ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index 20d08ef18aeb3..7e1f740bcc2cf 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/dlpack_tensor.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
 
diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc
index 829908bd98228..6c19cf3450dbd 100644
--- a/paddle/fluid/framework/dlpack_tensor_test.cc
+++ b/paddle/fluid/framework/dlpack_tensor_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/dlpack_tensor.h"
+
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
@@ -39,7 +40,7 @@ constexpr uint8_t GetDLDataTypeCode() {
                     : (std::is_integral<T>::value ? static_cast<uint8_t>(kDLInt)
                                                   : static_cast<uint8_t>(-1)));
 }
-}  // NOLINT
+}  // namespace
 
 template <typename T>
 void TestMain(const platform::Place &place, uint16_t lanes) {
diff --git a/paddle/fluid/framework/downpour_lite_worker.cc b/paddle/fluid/framework/downpour_lite_worker.cc
index 7344c93ef0679..8ceffe58dcf42 100644
--- a/paddle/fluid/framework/downpour_lite_worker.cc
+++ b/paddle/fluid/framework/downpour_lite_worker.cc
@@ -202,15 +202,15 @@ void DownpourLiteWorker::CopyDenseVars() {
     Variable* src_var = thread_scope_->FindVar(src_var_name);
     CHECK(src_var != nullptr) << src_var_name << " not found";  // NOLINT
     LoDTensor* src_tensor = src_var->GetMutable<LoDTensor>();
-    CHECK(src_tensor != nullptr) << src_var_name
-                                 << " tensor is null";  // NOLINT
+    CHECK(src_tensor != nullptr)
+        << src_var_name << " tensor is null";  // NOLINT
     float* src_data = src_tensor->data<float>();
 
     Variable* dest_var = thread_scope_->FindVar(dest_var_name);
     CHECK(dest_var != nullptr) << dest_var_name << " not found";  // NOLINT
     LoDTensor* dest_tensor = dest_var->GetMutable<LoDTensor>();
-    CHECK(dest_tensor != nullptr) << dest_var_name
-                                  << " tensor is null";  // NOLINT
+    CHECK(dest_tensor != nullptr)
+        << dest_var_name << " tensor is null";  // NOLINT
     float* dest_data = dest_tensor->data<float>();
 
     CHECK(src_tensor->numel() == dest_tensor->numel())
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 06c3d18af84ae..c14b48ef8a72f 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -155,8 +155,8 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
       continue;
     }
     LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
-    CHECK(tensor != nullptr) << "tensor of var "
-                             << sparse_key_names_[table_id][i] << " is null";
+    CHECK(tensor != nullptr)
+        << "tensor of var " << sparse_key_names_[table_id][i] << " is null";
 
     // skip slots which do not have embedding
     Variable* emb_var =
@@ -309,9 +309,9 @@ void DownpourWorker::AdjustInsWeight() {
   float* ins_weights = ins_weight_tensor->data<float>();
   size_t len = ins_weight_tensor->numel();  // len = batch size
   // here we assume nid_show slot only has one feasign in each instance
-  CHECK(len == nid_show_.size()) << "ins_weight size should be equal to "
-                                 << "nid_show size, " << len << " vs "
-                                 << nid_show_.size();
+  CHECK(len == nid_show_.size())
+      << "ins_weight size should be equal to "
+      << "nid_show size, " << len << " vs " << nid_show_.size();
   float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold();
   float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio();
   int64_t nid_adjw_num = 0;
@@ -326,9 +326,8 @@ void DownpourWorker::AdjustInsWeight() {
     }
     float ins_weight = 1.0;
     if (nid_show >= 0 && nid_show < nid_adjw_threshold) {
-      ins_weight = log(M_E +
-                       (nid_adjw_threshold - nid_show) / nid_adjw_threshold *
-                           nid_adjw_ratio);
+      ins_weight = log(M_E + (nid_adjw_threshold - nid_show) /
+                                 nid_adjw_threshold * nid_adjw_ratio);
       // count nid adjw insnum and weight
       ++nid_adjw_num;
       nid_adjw_weight += ins_weight;
@@ -423,15 +422,15 @@ void DownpourWorker::CopyDenseVars() {
     Variable* src_var = thread_scope_->FindVar(src_var_name);
     CHECK(src_var != nullptr) << src_var_name << " not found";  // NOLINT
     LoDTensor* src_tensor = src_var->GetMutable<LoDTensor>();
-    CHECK(src_tensor != nullptr) << src_var_name
-                                 << " tensor is null";  // NOLINT
+    CHECK(src_tensor != nullptr)
+        << src_var_name << " tensor is null";  // NOLINT
     float* src_data = src_tensor->data<float>();
 
     Variable* dest_var = thread_scope_->FindVar(dest_var_name);
     CHECK(dest_var != nullptr) << dest_var_name << " not found";  // NOLINT
     LoDTensor* dest_tensor = dest_var->GetMutable<LoDTensor>();
-    CHECK(dest_tensor != nullptr) << dest_var_name
-                                  << " tensor is null";  // NOLINT
+    CHECK(dest_tensor != nullptr)
+        << dest_var_name << " tensor is null";  // NOLINT
     float* dest_data = dest_tensor->data<float>();
 
     CHECK(src_tensor->numel() == dest_tensor->numel())
diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc
index 43d5f9ea0e8db..4e214bd36f33a 100644
--- a/paddle/fluid/framework/eigen_test.cc
+++ b/paddle/fluid/framework/eigen_test.cc
@@ -13,10 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/phi/core/ddim.h"
 
 #include <gtest/gtest.h>
 
+#include "paddle/phi/core/ddim.h"
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 06ce9712f5c52..830bbacb6398c 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/trainer_desc.pb.h"
 #include "paddle/fluid/framework/trainer_factory.h"
@@ -585,8 +587,9 @@ void Executor::RunPreparedContext(
           "Program in ExecutorPrepareContext should has feed_ops."));
   PADDLE_ENFORCE_EQ(
       has_fetch_operators(global_block, *fetch_targets, fetch_holder_name),
-      true, platform::errors::PreconditionNotMet(
-                "Program in the prepared context should has fetch_ops."));
+      true,
+      platform::errors::PreconditionNotMet(
+          "Program in the prepared context should has fetch_ops."));
 
   // map the data of feed_targets to feed_holder
   for (auto* op : global_block.AllOps()) {
diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc
index 0ab4bd5a12b06..468b3bc680af3 100644
--- a/paddle/fluid/framework/executor_cache.cc
+++ b/paddle/fluid/framework/executor_cache.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/executor_cache.h"
+
 #include "paddle/fluid/framework/op_info.h"
 
 namespace paddle {
@@ -137,6 +138,31 @@ ExecutorInfoCache &ExecutorInfoCache::Instance() {
   return g_exe_cache_info_map;
 }
 
+static PEAndGraphPair CreateExecutorInfo(  // builds {executor, graph}
+    const ProgramDesc &program_desc, const platform::Place &place,
+    int64_t start_op_index, int64_t end_op_index, framework::Scope *scope,
+    const details::BuildStrategy &build_strategy) {
+  auto execution_strategy = details::GetExecutionStrategy(place);
+  auto graph = std::make_shared<framework::ir::Graph>(
+      program_desc, start_op_index, end_op_index);  // graph over op range
+  auto parallel_executor = std::make_shared<framework::ParallelExecutor>(
+      place, scope, execution_strategy, build_strategy, graph.get());
+  parallel_executor->PrepareVariables(scope);
+  return std::make_pair(parallel_executor, graph);  // {executor, graph owner}
+}
+
+PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc &program_desc,
+                                          const platform::Place &place,
+                                          int64_t start_op_index,
+                                          int64_t end_op_index,
+                                          framework::Scope *scope) {
+  details::BuildStrategy build_strategy;  // default-constructed
+  build_strategy.fix_op_run_order_ = true;  // the only option set here
+  auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index,
+                                         end_op_index, scope, build_strategy);
+  return pe_and_graph;  // returned directly; no cache is touched here
+}
+
 CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc,
                                    const platform::Place &place,
                                    int64_t start_op_index, int64_t end_op_index,
@@ -153,21 +179,17 @@ CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc,
     }
 
     VLOG(1) << "create exe_info for " << program_id << " is_grad: " << is_grad;
-    auto execution_strategy = details::GetExecutionStrategy(place);
     auto &build_strategy = cached_exe_info.GetBuildStrategy(program_id);
 
     // 2. Construct Graph and ParallelExecutor.
-    auto graph = std::make_shared<framework::ir::Graph>(
-        program_desc, start_op_index, end_op_index);
-    auto parallel_executor = std::make_shared<framework::ParallelExecutor>(
-        place, scope, execution_strategy, build_strategy, graph.get());
-    parallel_executor->PrepareVariables(scope);
+    auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index,
+                                           end_op_index, scope, build_strategy);
 
     // 3. Insert value into cached map.
     auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad);
-    cached_value.executor_ = parallel_executor;
-    cached_value.graph_ = std::move(graph);
-    return std::make_pair(parallel_executor, /*is_new_created=*/true);
+    cached_value.executor_ = pe_and_graph.first;
+    cached_value.graph_ = pe_and_graph.second;
+    return std::make_pair(pe_and_graph.first, /*is_new_created=*/true);
   } else {
     VLOG(1) << "get exe_info from cache by: " << program_id
             << " is_grad: " << is_grad;
diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h
index 8207b56fc04f1..25c0bfab90c4a 100644
--- a/paddle/fluid/framework/executor_cache.h
+++ b/paddle/fluid/framework/executor_cache.h
@@ -127,11 +127,20 @@ class ExecutorInfoCache {
 using CacheInfo =
     std::pair<std::shared_ptr<ParallelExecutor>, bool /*is_new_created*/>;
 
+using PEAndGraphPair =  // first: executor, second: graph
+    std::pair<std::shared_ptr<ParallelExecutor>, std::shared_ptr<ir::Graph>>;
+
 CacheInfo GetExecutorInfoFromCache(const ProgramDesc& program_desc,
                                    const platform::Place& place,
                                    int64_t start_op_index, int64_t end_op_index,
                                    bool is_grad, int64_t program_id,
                                    framework::Scope* scope);
 
+PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc& program_desc,
+                                          const platform::Place& place,
+                                          int64_t start_op_index,
+                                          int64_t end_op_index,
+                                          framework::Scope* scope);  // no is_grad/program_id
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
index 06019372a7323..c6ccc2adc659f 100644
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor_thread_worker.h"
+
 #include <algorithm>
 #include <utility>
+
+#include "gflags/gflags.h"
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
-
-#include "gflags/gflags.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -616,8 +617,8 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
     int len = tensor->numel();
     CHECK(slot_dim * len == g_tensor->numel())
         << "len:" << len << " g_numel:" << g_tensor->numel();
-    CHECK(len == tensor->numel()) << "len:" << len
-                                  << "t_numel:" << tensor->numel();
+    CHECK(len == tensor->numel())
+        << "len:" << len << "t_numel:" << tensor->numel();
     int64_t* ids = tensor->data<int64_t>();
     for (auto id_idx = 0u; id_idx < len; ++id_idx) {
       if (ids[id_idx] == 0) {
@@ -626,15 +627,15 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
       }
       memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim);
       push_g[fea_idx][0] = 1.0f;
-      CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx
-                                       << " size:" << fea_info.size();
+      CHECK(fea_idx < fea_info.size())
+          << "fea_idx:" << fea_idx << " size:" << fea_info.size();
       push_g[fea_idx][1] = static_cast<float>(fea_info[fea_idx].label);
       g += slot_dim;
       fea_idx++;
     }
   }
-  CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx
-                                    << " features size:" << features.size();
+  CHECK(fea_idx == features.size())
+      << "fea_idx:" << fea_idx << " features size:" << features.size();
   CHECK_GT(features.size(), 0);
 
   std::vector<float*> push_g_vec;
@@ -701,5 +702,5 @@ void AsyncExecutorThreadWorker::check_pull_push_memory(
 }
 #endif
 
-}  // einit_modelnd namespace framework
+}  // namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
index 524922b0322e5..f4fa54d2c3a7b 100644
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
+
 #include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 096134e852833..ec3fdc49fdf1f 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 
+#include <boost/variant.hpp>
 #include <string>
 
-#include <boost/variant.hpp>
 #include "glog/logging.h"
 
 namespace phi {
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 2e9104f40cc60..3b22a4b0d5d7a 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -1,71 +1,125 @@
 if(WITH_PSLIB)
-    if(WITH_PSLIB_BRPC)
-        set(BRPC_DEPS pslib_brpc)
-    else()
-        if(NOT WITH_HETERPS)
-            set(BRPC_DEPS brpc)
-        endif()
-    endif(WITH_PSLIB_BRPC)
-    cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto proto_desc op_registry variable_helper scope ${BRPC_DEPS} pslib)
+  if(WITH_PSLIB_BRPC)
+    set(BRPC_DEPS pslib_brpc)
+  else()
+    if(NOT WITH_HETERPS)
+      set(BRPC_DEPS brpc)
+    endif()
+  endif(WITH_PSLIB_BRPC)
+  cc_library(
+    fleet_wrapper
+    SRCS fleet_wrapper.cc
+    DEPS framework_proto
+         proto_desc
+         op_registry
+         variable_helper
+         scope
+         ${BRPC_DEPS}
+         pslib)
 else()
-    cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
+  cc_library(
+    fleet_wrapper
+    SRCS fleet_wrapper.cc
+    DEPS framework_proto variable_helper scope)
 endif(WITH_PSLIB)
 
 if(WITH_HETERPS)
-    if(WITH_NCCL AND WITH_GPU)
-        nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
-        DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
-        add_subdirectory(heter_ps)
-    elseif(WITH_XPU_KP)
-        xpu_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.kps ps_gpu_wrapper.cc
-        DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
-        add_subdirectory(heter_ps)
-    elseif(WITH_RCCL)
-        hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
-        DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
-        add_subdirectory(heter_ps)
-    endif()
+  if(WITH_NCCL AND WITH_GPU)
+    nv_library(
+      ps_gpu_wrapper
+      SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+      DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
+    add_subdirectory(heter_ps)
+  elseif(WITH_XPU_KP)
+    xpu_library(
+      ps_gpu_wrapper
+      SRCS ps_gpu_wrapper.kps ps_gpu_wrapper.cc
+      DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
+    add_subdirectory(heter_ps)
+  elseif(WITH_RCCL)
+    hip_library(
+      ps_gpu_wrapper
+      SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+      DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
+    add_subdirectory(heter_ps)
+  endif()
 else()
-    cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc DEPS gloo_wrapper)
+  cc_library(
+    ps_gpu_wrapper
+    SRCS ps_gpu_wrapper.cc
+    DEPS gloo_wrapper)
 endif(WITH_HETERPS)
 
 if(WITH_NCCL OR WITH_RCCL)
-    cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
+  cc_library(
+    nccl_wrapper
+    SRCS nccl_wrapper.cc
+    DEPS framework_proto variable_helper scope)
 endif()
 if(WITH_BOX_PS)
-    if(WITH_GPU)
-        nv_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
-    endif()
-    if(WITH_ROCM)
-        hip_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
-    endif()
+  if(WITH_GPU)
+    nv_library(
+      box_wrapper
+      SRCS box_wrapper.cc box_wrapper.cu
+      DEPS framework_proto lod_tensor box_ps)
+  endif()
+  if(WITH_ROCM)
+    hip_library(
+      box_wrapper
+      SRCS box_wrapper.cc box_wrapper.cu
+      DEPS framework_proto lod_tensor box_ps)
+  endif()
 else()
-    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
+  cc_library(
+    box_wrapper
+    SRCS box_wrapper.cc
+    DEPS framework_proto lod_tensor)
 endif(WITH_BOX_PS)
 
-
 if(WITH_GLOO)
-    cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope gloo)
-    cc_library(metrics SRCS metrics.cc DEPS gloo_wrapper)
+  cc_library(
+    gloo_wrapper
+    SRCS gloo_wrapper.cc
+    DEPS framework_proto variable_helper scope gloo)
+  cc_library(
+    metrics
+    SRCS metrics.cc
+    DEPS gloo_wrapper)
 else()
-    cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
-    cc_library(metrics SRCS metrics.cc DEPS gloo_wrapper)
+  cc_library(
+    gloo_wrapper
+    SRCS gloo_wrapper.cc
+    DEPS framework_proto variable_helper scope)
+  cc_library(
+    metrics
+    SRCS metrics.cc
+    DEPS gloo_wrapper)
 endif(WITH_GLOO)
 
 if(WITH_PSLIB)
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-    set(DISTRIBUTE_COMPILE_FLAGS
-            "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
-endif()
-set_source_files_properties(heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set(DISTRIBUTE_COMPILE_FLAGS
+      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
+  )
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+    set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+  endif()
+  set_source_files_properties(
+    heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 endif()
 
-cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto
-device_context heter_service_proto ${BRPC_DEPS})
+cc_library(
+  heter_wrapper
+  SRCS heter_wrapper.cc
+  DEPS framework_proto device_context heter_service_proto ${BRPC_DEPS})
 
-cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
+cc_test(
+  test_fleet_cc
+  SRCS test_fleet.cc
+  DEPS fleet_wrapper gloo_wrapper fs shell)
 
 if(WITH_ASCEND OR WITH_ASCEND_CL)
-    cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
+  cc_library(
+    ascend_wrapper
+    SRCS ascend_wrapper.cc
+    DEPS framework_proto lod_tensor ascend_ge ascend_graph)
 endif()
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h
index d55862120116d..a4bd208959e43 100644
--- a/paddle/fluid/framework/fleet/ascend_wrapper.h
+++ b/paddle/fluid/framework/fleet/ascend_wrapper.h
@@ -22,6 +22,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include "ge/ge_api.h"
+#include "graph/attr_value.h"
+#include "graph/tensor.h"
+#include "graph/types.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -29,11 +33,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/timer.h"
 
-#include "ge/ge_api.h"
-#include "graph/attr_value.h"
-#include "graph/tensor.h"
-#include "graph/types.h"
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc
index 8564a42165961..1bb432a791e2c 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cc
+++ b/paddle/fluid/framework/fleet/box_wrapper.cc
@@ -14,10 +14,12 @@
 
 #ifdef PADDLE_WITH_BOX_PS
 #include "paddle/fluid/framework/fleet/box_wrapper.h"
+
 #include <algorithm>
 #include <ctime>
 #include <memory>
 #include <numeric>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
@@ -186,26 +188,30 @@ void BasicAucCalculator::calculate_bucket_error() {
 void BoxWrapper::FeedPass(int date,
                           const std::vector<uint64_t>& feasgin_to_box) const {
   int ret = boxps_ptr_->FeedPass(date, feasgin_to_box);
-  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                "FeedPass failed in BoxPS."));
+  PADDLE_ENFORCE_EQ(
+      ret, 0,
+      platform::errors::PreconditionNotMet("FeedPass failed in BoxPS."));
 }
 
 void BoxWrapper::BeginFeedPass(int date, boxps::PSAgentBase** agent) const {
   int ret = boxps_ptr_->BeginFeedPass(date, *agent);
-  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                "BeginFeedPass failed in BoxPS."));
+  PADDLE_ENFORCE_EQ(
+      ret, 0,
+      platform::errors::PreconditionNotMet("BeginFeedPass failed in BoxPS."));
 }
 
 void BoxWrapper::EndFeedPass(boxps::PSAgentBase* agent) const {
   int ret = boxps_ptr_->EndFeedPass(agent);
-  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                "EndFeedPass failed in BoxPS."));
+  PADDLE_ENFORCE_EQ(
+      ret, 0,
+      platform::errors::PreconditionNotMet("EndFeedPass failed in BoxPS."));
 }
 
 void BoxWrapper::BeginPass() const {
   int ret = boxps_ptr_->BeginPass();
-  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                "BeginPass failed in BoxPS."));
+  PADDLE_ENFORCE_EQ(
+      ret, 0,
+      platform::errors::PreconditionNotMet("BeginPass failed in BoxPS."));
 }
 
 void BoxWrapper::SetTestMode(bool is_test) const {
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu
index aea479ed0b214..17e59ac9104f6 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cu
+++ b/paddle/fluid/framework/fleet/box_wrapper.cu
@@ -17,6 +17,7 @@
 #include <ctime>
 #include <memory>
 #include <numeric>
+
 #include "paddle/fluid/framework/fleet/box_wrapper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
@@ -175,13 +176,13 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
 #define EXPAND_EMBED_PULL_CASE(i, ...)                                       \
   case i: {                                                                  \
     constexpr size_t ExpandDim = i;                                          \
-    PullCopy<EmbedxDim,                                                      \
-             ExpandDim><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \
-        gpu_values,                                                          \
-        reinterpret_cast<boxps::FeatureValueGpu<EmbedxDim, ExpandDim>*>(     \
-            total_values_gpu),                                               \
-        gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
-        gpu_keys);                                                           \
+    PullCopy<EmbedxDim, ExpandDim>                                           \
+        <<<(total_length + 512 - 1) / 512, 512, 0, stream>>>(                \
+            gpu_values,                                                      \
+            reinterpret_cast<boxps::FeatureValueGpu<EmbedxDim, ExpandDim>*>( \
+                total_values_gpu),                                           \
+            gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,  \
+            gpu_keys);                                                       \
   } break
 #endif
 
diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h
index b043edca138a8..dc01df221e966 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.h
+++ b/paddle/fluid/framework/fleet/box_wrapper.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include <sys/wait.h>
 #endif
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <atomic>
 #include <ctime>
@@ -36,6 +37,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -65,10 +67,12 @@ class BasicAucCalculator {
     _local_pred = 0;
   }
   void add_data(double pred, int label) {
-    PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet(
-                                     "pred should be greater than 0"));
-    PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet(
-                                     "pred should be lower than 1"));
+    PADDLE_ENFORCE_GE(
+        pred, 0.0,
+        platform::errors::PreconditionNotMet("pred should be greater than 0"));
+    PADDLE_ENFORCE_LE(
+        pred, 1.0,
+        platform::errors::PreconditionNotMet("pred should be lower than 1"));
     PADDLE_ENFORCE_EQ(
         label * label, label,
         platform::errors::PreconditionNotMet(
@@ -172,13 +176,15 @@ class AfsManager {
                                          pwd.c_str(), conf_path.c_str());
     VLOG(0) << "AFSAPI Init: user: " << user << ", pwd: " << pwd;
     int ret = _afshandler->Init(true, (com_logstatus() == 0));
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "Called AFSAPI Init Interface Failed."));
+    PADDLE_ENFORCE_EQ(ret, 0,
+                      platform::errors::PreconditionNotMet(
+                          "Called AFSAPI Init Interface Failed."));
     // Too high level will hurt the performance
     comlog_set_log_level(4);
     ret = _afshandler->Connect();
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "Called AFSAPI Connect Interface Failed"));
+    PADDLE_ENFORCE_EQ(ret, 0,
+                      platform::errors::PreconditionNotMet(
+                          "Called AFSAPI Connect Interface Failed"));
   }
   virtual ~AfsManager() {
     if (_afshandler != NULL) {
@@ -294,8 +300,9 @@ class AfsManager {
     int ret =
         PopenBidirectionalInternal(cmd.c_str(), rfp, wfp, pid, true, true);
 
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "Called PopenBidirectionalInternal Failed"));
+    PADDLE_ENFORCE_EQ(ret, 0,
+                      platform::errors::PreconditionNotMet(
+                          "Called PopenBidirectionalInternal Failed"));
     std::string filename(path);
     if (strncmp(filename.c_str(), "afs:", 4) == 0) {
       filename = filename.substr(4);
@@ -451,8 +458,9 @@ class BoxWrapper {
     std::string ret_str;
     int ret = boxps_ptr_->SaveBase(batch_model_path, xbox_model_path, ret_str,
                                    seconds_from_1970 / 86400);
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "SaveBase failed in BoxPS."));
+    PADDLE_ENFORCE_EQ(
+        ret, 0,
+        platform::errors::PreconditionNotMet("SaveBase failed in BoxPS."));
     return ret_str;
   }
 
@@ -460,8 +468,9 @@ class BoxWrapper {
     VLOG(3) << "Begin SaveDelta";
     std::string ret_str;
     int ret = boxps_ptr_->SaveDelta(xbox_model_path, ret_str);
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "SaveDelta failed in BoxPS."));
+    PADDLE_ENFORCE_EQ(
+        ret, 0,
+        platform::errors::PreconditionNotMet("SaveDelta failed in BoxPS."));
     return ret_str;
   }
 
diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h
index 6f7009f4d5143..f6f1cbfc2a08d 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_impl.h
+++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h
@@ -79,8 +79,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
     int ret = boxps_ptr_->PullSparseGPU(
         total_keys, reinterpret_cast<void*>(total_values_gpu),
         static_cast<int>(total_length), device_id);
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "PullSparseGPU failed in BoxPS."));
+    PADDLE_ENFORCE_EQ(
+        ret, 0,
+        platform::errors::PreconditionNotMet("PullSparseGPU failed in BoxPS."));
     pull_boxps_timer.Pause();
 
     VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
@@ -144,8 +145,9 @@ void BoxWrapper::PushSparseGradCase(
     int ret = boxps_ptr_->PushSparseGPU(
         total_keys, reinterpret_cast<void*>(total_grad_values_gpu),
         static_cast<int>(total_length), place.GetDeviceId());
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "PushSparseGPU failed in BoxPS."));
+    PADDLE_ENFORCE_EQ(
+        ret, 0,
+        platform::errors::PreconditionNotMet("PushSparseGPU failed in BoxPS."));
     push_boxps_timer.Pause();
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index deb2b90c93353..5c2be1e55f9ef 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <pslib.h>
 #endif
 #include <ThreadPool.h>
+
 #include <atomic>
 #include <ctime>
 #include <map>
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index d850d05d87f5c..56d0e1ec47e7e 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+
 #include "paddle/fluid/framework/io/fs.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h
index 42ae73f9b13f1..1ecaf1318b01b 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.h
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.h
@@ -214,8 +214,9 @@ class GlooWrapper {
           static_cast<void (*)(void*, const void*, const void*, size_t)>(
               &gloo::min<T>));
     } else {
-      PADDLE_ENFORCE_EQ(0, 1, paddle::platform::errors::InvalidArgument(
-                                  "AllReduce mode not known: " + mode));
+      PADDLE_ENFORCE_EQ(0, 1,
+                        paddle::platform::errors::InvalidArgument(
+                            "AllReduce mode not known: " + mode));
     }
     gloo::allreduce(opts);
 #else
diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h
index 823b60c5ef1f2..560607bd160a1 100644
--- a/paddle/fluid/framework/fleet/heter_context.h
+++ b/paddle/fluid/framework/fleet/heter_context.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_HETERPS
 
 #include <ThreadPool.h>
+
 #include <algorithm>
 #include <map>
 #include <unordered_map>
diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index d62fc1c084962..7540c6147f4b7 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -1,38 +1,96 @@
-IF(WITH_GPU)
-    SET(HETERPS_DEPS device_context)
-    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-        SET(HETERPS_DEPS ${HETERPS_DEPS} cub)
-    endif()
-    if(WITH_PSCORE)
-        get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
-        SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
-    endif()
-    nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
-    nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
-    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
-    nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
-    nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
-    if(WITH_PSCORE)
-        nv_library(graph_gpu_ps SRCS graph_gpu_ps_table_inl.cu DEPS heter_comm table hashtable_kernel)
-        nv_library(graph_sampler SRCS graph_sampler_inl.h DEPS graph_gpu_ps)
-        nv_library(graph_gpu_wrapper SRCS graph_gpu_wrapper.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps)
-        nv_test(test_cpu_query SRCS test_cpu_query.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps graph_gpu_wrapper)
-        #ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu)
-        #target_link_libraries(test_sample_rate heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS})
-        #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS})
-        #ADD_EXECUTABLE(test_cpu_query test_cpu_query.cu)
-        #target_link_libraries(test_cpu_query graph_gpu_ps)
-    endif()
-ENDIF()
-IF(WITH_XPU_KP)
-    SET(HETERPS_DEPS device_context)
-    xpu_library(heter_comm_kernel SRCS heter_comm_kernel.h heter_comm_kernel.kps feature_value.h)
-    xpu_library(hashtable_kernel SRCS hashtable.h hashtable_kernel.kps)
-    cc_library(heter_comm SRCS heter_comm.h heter_resource.cc DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
-    cc_library(heter_ps SRCS heter_ps.cc DEPS heter_comm)
-ENDIF()
-IF(WITH_ROCM)
-    hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
-    hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
-    hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
-ENDIF()
+if(WITH_GPU)
+  set(HETERPS_DEPS device_context)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+    set(HETERPS_DEPS ${HETERPS_DEPS} cub)
+  endif()
+  if(WITH_PSCORE)
+    get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
+    set(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
+  endif()
+  nv_library(
+    heter_comm_kernel
+    SRCS heter_comm_kernel.cu feature_value.h
+    DEPS ${HETERPS_DEPS})
+  nv_library(
+    hashtable_kernel
+    SRCS hashtable_kernel.cu feature_value.h
+    DEPS ${HETERPS_DEPS})
+  nv_library(
+    heter_comm
+    SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h
+         mem_pool.h
+    DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
+  nv_test(
+    test_heter_comm
+    SRCS feature_value.h
+    DEPS heter_comm)
+  nv_library(
+    heter_ps
+    SRCS heter_ps.cu
+    DEPS heter_comm)
+  if(WITH_PSCORE)
+    nv_library(
+      graph_gpu_ps
+      SRCS graph_gpu_ps_table_inl.cu
+      DEPS heter_comm table hashtable_kernel)
+    nv_library(
+      graph_sampler
+      SRCS graph_sampler_inl.h
+      DEPS graph_gpu_ps)
+    nv_library(
+      graph_gpu_wrapper
+      SRCS graph_gpu_wrapper.cu
+      DEPS heter_comm
+           table
+           heter_comm_kernel
+           hashtable_kernel
+           heter_ps
+           ${HETERPS_DEPS}
+           graph_gpu_ps)
+    nv_test(
+      test_cpu_query
+      SRCS test_cpu_query.cu
+      DEPS heter_comm
+           table
+           heter_comm_kernel
+           hashtable_kernel
+           heter_ps
+           ${HETERPS_DEPS}
+           graph_gpu_ps
+           graph_gpu_wrapper)
+    #ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu)
+    #target_link_libraries(test_sample_rate heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS})
+    #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS})
+    #ADD_EXECUTABLE(test_cpu_query test_cpu_query.cu)
+    #target_link_libraries(test_cpu_query graph_gpu_ps)
+  endif()
+endif()
+if(WITH_XPU_KP)
+  set(HETERPS_DEPS device_context)
+  xpu_library(heter_comm_kernel SRCS heter_comm_kernel.h heter_comm_kernel.kps
+                                     feature_value.h)
+  xpu_library(hashtable_kernel SRCS hashtable.h hashtable_kernel.kps)
+  cc_library(
+    heter_comm
+    SRCS heter_comm.h heter_resource.cc
+    DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
+  cc_library(
+    heter_ps
+    SRCS heter_ps.cc
+    DEPS heter_comm)
+endif()
+if(WITH_ROCM)
+  hip_library(
+    heter_comm
+    SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h
+         hashtable.h
+    DEPS cub device_context)
+  hip_test(
+    test_heter_comm
+    SRCS feature_value.h
+    DEPS heter_comm)
+  hip_library(
+    heter_ps
+    SRCS heter_ps.cu
+    DEPS heter_comm)
+endif()
diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
index 4ad32d1714f7d..da65cccb435d1 100644
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
@@ -22,6 +22,7 @@
 #define CONCURRENT_UNORDERED_MAP_CUH
 
 #include <thrust/pair.h>
+
 #include <cassert>
 #include <iostream>
 #include <iterator>
@@ -258,7 +259,7 @@ class cycle_iterator_adapter {
     return old;
   }
 
-  __host__ __device__ const cycle_iterator_adapter& operator++(int)const {
+  __host__ __device__ const cycle_iterator_adapter& operator++(int) const {
     cycle_iterator_adapter<iterator_type> old(m_begin, m_end, m_current);
     if (m_end == (m_current + 1))
       m_current = m_begin;
diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h
index 19c355c671a38..2e7588d0ac48c 100644
--- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h
+++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h
@@ -17,6 +17,7 @@
 #include <iostream>
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
@@ -284,6 +285,6 @@ struct NodeQueryResult {
   };
   ~NodeQueryResult() {}
 };
-}
-};
+}  // namespace framework
+};  // namespace paddle
 #endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
index ae57c2ebe932f..5831863f7f5c3 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
@@ -14,7 +14,9 @@
 
 #pragma once
 #include <thrust/host_vector.h>
+
 #include <chrono>
+
 #include "heter_comm.h"
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
 #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
@@ -123,7 +125,7 @@ class GpuPsGraphTable : public HeterComm<uint64_t, int64_t, int> {
   std::condition_variable cv_;
   int cpu_table_status;
 };
-}
-};
+}  // namespace framework
+};  // namespace paddle
 //#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h"
 #endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
index 72b9cae41c0fd..ab33d2a9c05bf 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
@@ -15,6 +15,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
+
 #include <functional>
 #pragma once
 #ifdef PADDLE_WITH_HETERPS
@@ -859,11 +860,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
     constexpr int TILE_SIZE = BLOCK_WARPS * 16;
     const dim3 block(WARP_SIZE, BLOCK_WARPS);
     const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE);
-    neighbor_sample_example_v2<
-        WARP_SIZE, BLOCK_WARPS,
-        TILE_SIZE><<<grid, block, 0, resource_->remote_stream(i, gpu_id)>>>(
-        graph, id_array, actual_size_array, sample_array, sample_size,
-        shard_len, default_value);
+    neighbor_sample_example_v2<WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
+        <<<grid, block, 0, resource_->remote_stream(i, gpu_id)>>>(
+            graph, id_array, actual_size_array, sample_array, sample_size,
+            shard_len, default_value);
   }
 
   for (int i = 0; i < total_gpu; ++i) {
@@ -946,12 +946,12 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
       constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16;
       const dim3 block2(WARP_SIZE_, BLOCK_WARPS_);
       const dim3 grid2((number_on_cpu + TILE_SIZE_ - 1) / TILE_SIZE_);
-      copy_buffer_ac_to_final_place<WARP_SIZE_, BLOCK_WARPS_,
-                                    TILE_SIZE_><<<grid2, block2, 0, stream>>>(
-          gpu_buffers_ptr, gpu_ac_ptr, val, actual_sample_size,
-          thrust::raw_pointer_cast(t_index.data()) + 1,
-          thrust::raw_pointer_cast(cumsum_gpu_ac.data()), number_on_cpu,
-          sample_size);
+      copy_buffer_ac_to_final_place<WARP_SIZE_, BLOCK_WARPS_, TILE_SIZE_>
+          <<<grid2, block2, 0, stream>>>(
+              gpu_buffers_ptr, gpu_ac_ptr, val, actual_sample_size,
+              thrust::raw_pointer_cast(t_index.data()) + 1,
+              thrust::raw_pointer_cast(cumsum_gpu_ac.data()), number_on_cpu,
+              sample_size);
 
       delete[] merge_buffers;
       delete[] cpu_keys;
@@ -1027,13 +1027,13 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start,
   local_begin_pos = [0,3]
   sample_size = [2,3]
   */
-  std::function<int(int, int, int, int, int&, int&)> range_check = [](
-      int x, int y, int x1, int y1, int& x2, int& y2) {
-    if (y <= x1 || x >= y1) return 0;
-    y2 = min(y, y1);
-    x2 = max(x1, x);
-    return y2 - x2;
-  };
+  std::function<int(int, int, int, int, int&, int&)> range_check =
+      [](int x, int y, int x1, int y1, int& x2, int& y2) {
+        if (y <= x1 || x >= y1) return 0;
+        y2 = min(y, y1);
+        x2 = max(x1, x);
+        return y2 - x2;
+      };
   auto graph = gpu_graph_list[gpu_id];
   if (graph.node_size == 0) {
     return result;
@@ -1106,6 +1106,6 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start,
   return result;
   */
 }
-}
-};
+}  // namespace framework
+};  // namespace paddle
 #endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
index c976bb67cb21e..43f0101009d08 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
@@ -271,5 +271,5 @@ void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) {
       ->cpu_graph_table->export_partition_files(idx, file_path);
 }
 #endif
-}
-};
+}  // namespace framework
+};  // namespace paddle
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h
index a34e752fc7ea7..d3c4dea589030 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
 #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
 namespace paddle {
@@ -73,5 +74,5 @@ class GraphGpuWrapper {
   void* graph_table;
 };
 #endif
-}
-};
+}  // namespace framework
+};  // namespace paddle
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h
index a7c043f1edf37..7cec4fcfb8311 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <time.h>
+
 #include <algorithm>
 #include <chrono>
 #include <cstdlib>
@@ -23,6 +24,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
 #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
@@ -106,7 +108,7 @@ class AllInGpuGraphSampler : public GraphSampler {
   // std::shared_ptr<std::mt19937_64> random;
   int gpu_num;
 };
-}
-};
+}  // namespace framework
+};  // namespace paddle
 #include "paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h"
 #endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h
index ad4b00b11aa39..e68612d57e259 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h
@@ -156,6 +156,6 @@ void AllInGpuGraphSampler::init(GpuPsGraphTable *g,
   this->gpu_num = g->gpu_num;
   graph_table = g->cpu_graph_table.get();
 }
-}
-};
+}  // namespace framework
+};  // namespace paddle
 #endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
index 234aa15ebf74d..112a59c8fec87 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #ifdef PADDLE_WITH_HETERPS
 #include <glog/logging.h>
+
 #include <limits>
 #include <memory>
 #include <vector>
@@ -36,6 +37,7 @@ limitations under the License. */
 #include "thrust/pair.h"
 #elif defined(__xpu__)
 #include <xpu/runtime.h>
+
 #include "xpu/kernel/cluster_header.h"
 #include "xpu/kernel/math.h"
 #include "xpu/kernel/simd.h"
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
index 57741c2c19b1c..c2e6cdc5c6993 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_HETERPS
 #include <thread>
+
 #include "paddle/fluid/framework/fleet/heter_ps/hashtable.h"
 #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
 
@@ -366,10 +367,10 @@ template class HashTable<long, long>;
 template class HashTable<long, unsigned long>;
 template class HashTable<long, unsigned int>;
 
-template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
-    cudaStream_t>(const unsigned long* d_keys,
-                  paddle::framework::FeatureValue* d_vals, size_t len,
-                  cudaStream_t stream);
+template void
+HashTable<unsigned long, paddle::framework::FeatureValue>::get<cudaStream_t>(
+    const unsigned long* d_keys, paddle::framework::FeatureValue* d_vals,
+    size_t len, cudaStream_t stream);
 
 template void
 HashTable<unsigned long, paddle::framework::FeatureValue*>::get<cudaStream_t>(
@@ -395,10 +396,10 @@ template void HashTable<unsigned long, long>::get<cudaStream_t>(
 //    const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
 //    stream);
 
-template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
-    cudaStream_t>(const unsigned long* d_keys,
-                  const paddle::framework::FeatureValue* d_vals, size_t len,
-                  cudaStream_t stream);
+template void
+HashTable<unsigned long, paddle::framework::FeatureValue>::insert<cudaStream_t>(
+    const unsigned long* d_keys, const paddle::framework::FeatureValue* d_vals,
+    size_t len, cudaStream_t stream);
 
 template void HashTable<unsigned long, paddle::framework::FeatureValue*>::
     insert<cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool,
@@ -438,21 +439,22 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
               paddle::framework::FeaturePushValue>,
     cudaStream_t>(const unsigned long* d_keys,
                   const paddle::framework::FeaturePushValue* d_grads,
-                  size_t len, Optimizer<paddle::framework::FeatureValue,
-                                        paddle::framework::FeaturePushValue>
-                                  sgd,
-                  cudaStream_t stream);
-
-template void
-HashTable<unsigned long, paddle::framework::FeatureValue*>::update<
-    Optimizer<paddle::framework::FeatureValue,
-              paddle::framework::FeaturePushValue>,
-    cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len,
+                  size_t len,
                   Optimizer<paddle::framework::FeatureValue,
                             paddle::framework::FeaturePushValue>
                       sgd,
                   cudaStream_t stream);
 
+template void HashTable<unsigned long, paddle::framework::FeatureValue*>::
+    update<Optimizer<paddle::framework::FeatureValue,
+                     paddle::framework::FeaturePushValue>,
+           cudaStream_t>(const unsigned long* d_keys, const char* d_grads,
+                         size_t len,
+                         Optimizer<paddle::framework::FeatureValue,
+                                   paddle::framework::FeaturePushValue>
+                             sgd,
+                         cudaStream_t stream);
+
 // template void HashTable<unsigned long,
 // paddle::framework::FeatureValue>::update<
 //    Optimizer<paddle::framework::FeatureValue,
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
index 815f06b0824e6..d016cdf4e09c1 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <thread>
 #include <vector>
+
 #include "cub/cub.cuh"
 #include "cub/util_allocator.cuh"
 #if defined(PADDLE_WITH_CUDA)
@@ -26,6 +27,7 @@ limitations under the License. */
 #elif defined(PADDLE_WITH_XPU_KP)
 // #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
 #include <xpu/runtime.h>
+
 #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
 #endif
 
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
index 64b177abb8638..38a4e7b7bb1a9 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 #ifdef PADDLE_WITH_HETERPS
 #include <queue>
+
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
index 94d7929b2947d..a5ee8e2ff8395 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu
@@ -294,10 +294,10 @@ template void HeterCommKernel::fill_idx<uint32_t, cudaStream_t>(
 template void HeterCommKernel::calc_shard_offset<int, cudaStream_t>(
     int* idx, int* left, int* right, long long len, int total_devs,
     const cudaStream_t& stream);
-template void HeterCommKernel::calc_shard_index<
-    unsigned long, int, cudaStream_t>(unsigned long* d_keys, long long len,
-                                      int* shard_index, int total_devs,
-                                      const cudaStream_t& stream);
+template void
+HeterCommKernel::calc_shard_index<unsigned long, int, cudaStream_t>(
+    unsigned long* d_keys, long long len, int* shard_index, int total_devs,
+    const cudaStream_t& stream);
 
 template void HeterCommKernel::calc_shard_index<long, int, cudaStream_t>(
     long* d_keys, long long len, int* shard_index, int total_devs,
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc
index 700b43f18fb96..fe8e8c86505ce 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h"
+
 #include <vector>
 
 #ifdef PADDLE_WITH_HETERPS
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
index 43b84ee5d26fb..cfe4662629415 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h"
 
 #ifdef PADDLE_WITH_HETERPS
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h
index 8449a4048b72f..83dc232bc6a3b 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
 #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h"
 #if defined(PADDLE_WITH_CUDA)
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
index 2c312e9d4d60a..fe44c81fe445f 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
 #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
index 5717f44d400a5..087877818f5fb 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
@@ -24,6 +24,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU_KP
 #include <xpu/runtime.h>  // NOLINT
+
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #endif
 
diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
index 4684b4a0bc155..82090ef4817c9 100644
--- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
+++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <curand_kernel.h>
 #endif
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
 
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu
index 3a6ed50ad8e70..72fa0282066d2 100644
--- a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
 #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu
index 62a0df9430002..621c7f5bab412 100644
--- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
 #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu
index ff3cd9d2d046d..49e9a051ec0c0 100644
--- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu
index 06c7026eb51ca..28098181b6c2a 100644
--- a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
 #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu
index affa60d022ece..a1e8f06368b07 100644
--- a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <unistd.h>
+
+#include <chrono>
 #include <condition_variable>  // NOLINT
 #include <fstream>
 #include <iomanip>
@@ -20,32 +22,30 @@
 #include <thread>  // NOLINT
 #include <unordered_set>
 #include <vector>
-#include "google/protobuf/text_format.h"
 
-#include <chrono>
+#include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/ps/service/env.h"
 #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
 #include "paddle/fluid/distributed/ps/table/common_graph_table.h"
 #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
+#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
+#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
+#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h"
+#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
+#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
+#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
-#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
-#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h"
-#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
-#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
-#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-
 using namespace paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc
index 56bc568460bbc..4225281640588 100644
--- a/paddle/fluid/framework/fleet/metrics.cc
+++ b/paddle/fluid/framework/fleet/metrics.cc
@@ -17,6 +17,7 @@
 #include <ctime>
 #include <memory>
 #include <numeric>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 
 #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
@@ -63,10 +64,12 @@ void BasicAucCalculator::add_data(const float* d_pred, const int64_t* d_label,
 }
 
 void BasicAucCalculator::add_unlock_data(double pred, int label) {
-  PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet(
-                                   "pred should be greater than 0"));
-  PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet(
-                                   "pred should be lower than 1"));
+  PADDLE_ENFORCE_GE(
+      pred, 0.0,
+      platform::errors::PreconditionNotMet("pred should be greater than 0"));
+  PADDLE_ENFORCE_LE(
+      pred, 1.0,
+      platform::errors::PreconditionNotMet("pred should be lower than 1"));
   PADDLE_ENFORCE_EQ(
       label * label, label,
       platform::errors::PreconditionNotMet(
@@ -272,10 +275,12 @@ void BasicAucCalculator::add_uid_data(const float* d_pred,
 
 void BasicAucCalculator::add_uid_unlock_data(double pred, int label,
                                              uint64_t uid) {
-  PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet(
-                                   "pred should be greater than 0"));
-  PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet(
-                                   "pred should be lower than 1"));
+  PADDLE_ENFORCE_GE(
+      pred, 0.0,
+      platform::errors::PreconditionNotMet("pred should be greater than 0"));
+  PADDLE_ENFORCE_LE(
+      pred, 1.0,
+      platform::errors::PreconditionNotMet("pred should be lower than 1"));
   PADDLE_ENFORCE_EQ(
       label * label, label,
       platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/framework/fleet/metrics.h b/paddle/fluid/framework/fleet/metrics.h
index 69b242664bb46..7c3ea1b5512f6 100644
--- a/paddle/fluid/framework/fleet/metrics.h
+++ b/paddle/fluid/framework/fleet/metrics.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <ThreadPool.h>
+
 #include <atomic>
 #include <ctime>
 #include <map>
@@ -35,6 +36,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/allreduce.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
index 488a9ef8ce78f..fbe76696114d5 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <ctime>
 #include <memory>
 #include <numeric>
+
 #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
index 0efec57e59db6..7ddc5a1f6dd66 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include <vector>
 #ifdef PADDLE_WITH_GLOO
 #include <gloo/broadcast.h>
+
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
diff --git a/paddle/fluid/framework/fleet/test_fleet.cc b/paddle/fluid/framework/fleet/test_fleet.cc
index 24f3e6bed6494..34aea9de3b1c5 100644
--- a/paddle/fluid/framework/fleet/test_fleet.cc
+++ b/paddle/fluid/framework/fleet/test_fleet.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #include "paddle/fluid/string/string_helper.h"
diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc
index b621eca35b893..e3b9fe3626ddf 100644
--- a/paddle/fluid/framework/generator.cc
+++ b/paddle/fluid/framework/generator.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/generator.h"
 
 #include <glog/logging.h>
+
 #include <memory>
 #include <utility>
 
diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h
index 35efc1bee33d5..f62e8f74d26d5 100644
--- a/paddle/fluid/framework/generator.h
+++ b/paddle/fluid/framework/generator.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <stdint.h>
+
 #include <atomic>
 #include <deque>
 #include <iostream>  // temp for debug
diff --git a/paddle/fluid/framework/gpu_utils.h b/paddle/fluid/framework/gpu_utils.h
index 37c9852a1ab1f..9c59333000e91 100644
--- a/paddle/fluid/framework/gpu_utils.h
+++ b/paddle/fluid/framework/gpu_utils.h
@@ -17,6 +17,7 @@
 #define EIGEN_USE_GPU
 
 #include <array>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
@@ -104,15 +105,17 @@ ConvertTensorIndex(int index, const Dim3& dims) {
 
 template <typename IntType, bool ceil>
 IntType CeilOrFloor(IntType x, IntType deviser) {
-  PADDLE_ENFORCE_GT(deviser, 0, platform::errors::InvalidArgument(
-                                    "deviser should be greater than 0, "
-                                    "but received is:%d",
-                                    deviser));
+  PADDLE_ENFORCE_GT(
+      deviser, 0,
+      platform::errors::InvalidArgument("deviser should be greater than 0, "
+                                        "but received is:%d",
+                                        deviser));
 
   PADDLE_ENFORCE_GT(
-      x, 0, platform::errors::InvalidArgument("input should be greater than 0, "
-                                              "but received is:%d",
-                                              x));
+      x, 0,
+      platform::errors::InvalidArgument("input should be greater than 0, "
+                                        "but received is:%d",
+                                        x));
 
   const IntType round_to_zero = x / deviser;
   const IntType inte_result = round_to_zero * deviser;
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index ebbfd446a03de..81f17be867f76 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
@@ -157,8 +158,9 @@ class GradOpDescMakerBase {
   const Attribute& GetAttr(const std::string& name) const {
     auto& map = fwd_op_.GetAttrMap();
     auto it = map.find(name);
-    PADDLE_ENFORCE_NE(it, map.end(), platform::errors::NotFound(
-                                         "Cannot find attribute (%s).", name));
+    PADDLE_ENFORCE_NE(
+        it, map.end(),
+        platform::errors::NotFound("Cannot find attribute (%s).", name));
     return it->second;
   }
 
diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc
index d0d3c2fea3b56..dc99885811c2b 100644
--- a/paddle/fluid/framework/heter_pipeline_trainer.cc
+++ b/paddle/fluid/framework/heter_pipeline_trainer.cc
@@ -32,7 +32,9 @@ using TaskQueue =
                                 std::pair<std::string, int>>>>;
 
 void HeterPipelineTrainer::ResetDataset(Dataset* dataset) {
+#ifndef PADDLE_WITH_FLPS
   if (pipeline_stage_ == 0) {
+#endif
     SetDataset(dataset);
     const std::vector<paddle::framework::DataFeed*> readers =
         dataset->GetReaders();
@@ -51,40 +53,39 @@ void HeterPipelineTrainer::ResetDataset(Dataset* dataset) {
       this_worker->SetDataFeed(readers[cnt]);
       this_worker->SetReaderPlace(place_);
     }
+#ifndef PADDLE_WITH_FLPS
   }
+#endif
 }
 
 void HeterPipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
                                       Dataset* dataset) {
+  trainer_desc_ = trainer_desc;
   thread_num_ = trainer_desc.thread_num();
   ParseDumpConfig(trainer_desc);
   SetDebug(trainer_desc.debug());
   const std::vector<paddle::framework::DataFeed*> readers =
       dataset->GetReaders();
-  VLOG(3) << "readers num: " << readers.size();
   // change thread num to readers num
   thread_num_ = readers.size();
-  VLOG(3) << "worker thread num: " << thread_num_;
+  VLOG(3) << "worker(readers) thread num: " << thread_num_;
   const auto& heter_section_params = trainer_desc.heter_section_param();
   num_pipeline_stages_ = heter_section_params.num_pipeline_stages();
   pipeline_stage_ = heter_section_params.pipeline_stage();
   num_microbatches_ = heter_section_params.num_microbatches();
   VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
-  trainer_desc_ = trainer_desc;
   trainer_id_ = trainer_desc.trainer_id();
   for (int i = 0; i < num_pipeline_stages_; ++i) {
     auto trainer_num = trainer_desc.trainers(i);
     trainers_.push_back(trainer_num);
   }
   int cpu_trainer_num = trainers_[0];
-  // int cur_stage_trainer_num = trainers_[pipeline_stage_];
-  // int global_thread_num = cpu_trainer_num * thread_num_;
-  // int previous_trainers = 0;
-  // for (int i = 0; i < pipeline_stage_; i++) previous_trainers +=
-  // trainers_[i];
-  // int stage_trainer_id =
-  //    trainer_id_ - previous_trainers;  // trainer id in current stage
-
+  VLOG(4) << "trainer_id_: " << trainer_id_;
+  VLOG(4) << "cpu_trainer_num: " << cpu_trainer_num
+          << " xpu_trainer_num: " << trainers_[1];
+#ifdef PADDLE_WITH_FLPS
+  thread_num_ = 1;
+#endif
   if (pipeline_stage_ == 0) {  // for cpu trainer
     int cnt = -1;
     int real_thread_id = trainer_id_;
@@ -103,25 +104,33 @@ void HeterPipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
       this_worker->InitRandomDumpConfig(trainer_desc);
       this_worker->SetDeviceIndex(real_thread_id);
       real_thread_id += cpu_trainer_num;
-      // if (pipeline_stage_ == 0) {
       this_worker->SetDataFeed(readers[cnt]);
-      //}
       this_worker->SetMicrobatchNum(num_microbatches_);
       this_worker->SetPipelineStageNum(num_pipeline_stages_);
       this_worker->SetPipelineStage(pipeline_stage_);
     }
-  } else {  // for heter_trainer
-    // heter trainer with thread_id == -1 is not for
-    // real training
+  } else {
+    // for heter_trainer
+    // the heter trainer with thread_id == -1 does no real training; it only
+    // runs the listen op
     workers_[-1] = DeviceWorkerFactory::CreateDeviceWorker(
         trainer_desc.device_worker_name());
     auto this_worker =
         std::dynamic_pointer_cast<paddle::framework::HeterSectionWorker>(
             workers_[-1]);
+#ifdef PADDLE_WITH_FLPS
+    this_worker->SetDebug(debug_);
+    this_worker->SetNeedDumpField(need_dump_field_);
+    this_worker->SetNeedDumpParam(need_dump_param_);
+    this_worker->SetDumpFieldVector(dump_fields_);
+    this_worker->SetDumpParamVector(dump_param_);
+    this_worker->InitRandomDumpConfig(trainer_desc);
+    this_worker->SetDataFeed(readers[0]);
+#endif
+    this_worker->SetDeviceIndex(-1);
     this_worker->SetMicrobatchNum(num_microbatches_);
     this_worker->SetPipelineStageNum(num_pipeline_stages_);
     this_worker->SetPipelineStage(pipeline_stage_);
-    this_worker->SetDeviceIndex(-1);
   }
 }
 
@@ -159,14 +168,19 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
   for (auto& worker_pair : workers_) {
     auto worker_index = worker_pair.first;
     auto device_worker = worker_pair.second;
+    VLOG(0) << "workers index in InitTrainerEnv: " << worker_index;
     auto this_worker =
         std::dynamic_pointer_cast<paddle::framework::HeterSectionWorker>(
             device_worker);
     this_worker->SetPlace(place);
     this_worker->Initialize(trainer_desc_);
+#ifdef PADDLE_WITH_FLPS
+    this_worker->SetReaderPlace(place);
+#else
     if (pipeline_stage_ == 0) {
       this_worker->SetReaderPlace(place);
     }
+#endif
     this_worker->SetRootScope(root_scope_);
     // generate mini_batch scope for every worker
     auto* minibatch_scope = &root_scope_->NewScope();
@@ -175,6 +189,7 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
     // after set micro num & mini batch scope
     this_worker->CreateMicrobatchScopes();
     (*micro_scopes_)[worker_index] = this_worker->GetMicrobatchScopes();
+    VLOG(4) << "worker_index: " << worker_index;
     (*task_queue_)[worker_index] = this_worker->GetThreadQueue();
   }
 }
@@ -182,6 +197,7 @@ void HeterPipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
 void HeterPipelineTrainer::Run() {
   VLOG(3) << "Going to run HeterPipelineTrainer::Run()";
   if (listen_ptr_ == nullptr) {
+    VLOG(3) << "listen_ptr_ is null";
     for (auto& worker_pair : workers_) {
       auto& device_worker = worker_pair.second;
       auto worker_0 =
@@ -196,10 +212,14 @@ void HeterPipelineTrainer::Run() {
   heter_server->WaitServerReady();
   heter_server->SetMiniBatchScopes(mini_scopes_);
   heter_server->SetMicroBatchScopes(micro_scopes_);
+  VLOG(4) << "heter_server SetTaskQueue";
   heter_server->SetTaskQueue(task_queue_);
+
   // main training logic
+  VLOG(3) << "pipeline_stage_ is " << pipeline_stage_;
   if (pipeline_stage_ == 0) {  // for cpu trainer
     for (auto& worker_pair : workers_) {
+      VLOG(4) << "cpu worker index : " << worker_pair.first;
       auto device_worker = worker_pair.second;
       if (!debug_) {
         threads_.push_back(
@@ -212,6 +232,7 @@ void HeterPipelineTrainer::Run() {
   } else {  // for heter worker
     // start thread_worker with thread_id = -1
     for (auto& worker_pair : workers_) {
+      VLOG(4) << "xpu worker index : " << worker_pair.first;
       auto device_worker = worker_pair.second;
       if (!debug_) {
         threads_.push_back(
@@ -252,6 +273,10 @@ void HeterPipelineTrainer::Run() {
           this_worker->SetPipelineStageNum(num_pipeline_stages_);
           this_worker->SetPipelineStage(pipeline_stage_);
           this_worker->SetPlace(place_);
+#ifdef PADDLE_WITH_FLPS
+          this_worker->SetDataFeed(workers_[-1]->device_reader_);
+          this_worker->SetReaderPlace(place_);
+#endif
           this_worker->Initialize(trainer_desc_);
           this_worker->SetRootScope(root_scope_);
 
@@ -308,5 +333,5 @@ Scope* HeterPipelineTrainer::GetWorkerScope(int thread_id) {
 }
 
 }  // end namespace framework
-}  // end namespace paddle
+}  // namespace paddle
 #endif
diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc
old mode 100644
new mode 100755
index b6759bb2e6fe6..acbfe21ecdae0
--- a/paddle/fluid/framework/heter_section_worker.cc
+++ b/paddle/fluid/framework/heter_section_worker.cc
@@ -65,6 +65,52 @@ class TrainerDesc;
 
 uint64_t HeterSectionWorker::batch_id_(0);
 
+#ifdef PADDLE_WITH_FLPS
+// FLPS build: rebuilds the section program from `desc`, creates the task queue
+// and sorts the ops of blocks 0 and 1 into forward/backward/listen ops.
+void HeterSectionWorker::Initialize(const TrainerDesc& desc) {
+  trainer_desc_ = desc;
+  fetch_config_ = desc.fetch_config();
+  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
+  program_.reset(new ProgramDesc(
+      desc.heter_section_param().section_config().program_desc()));
+  thread_queue_.reset(
+      new ::paddle::framework::BlockingQueue<std::pair<std::string, int>>());
+  VLOG(4) << "addr of thread_queue_ is: " << thread_queue_.get();
+  const bool is_first_stage = (pipeline_stage_ == 0);
+  const bool is_last_stage = (pipeline_stage_ + 1 == num_pipeline_stages_);
+
+  if (is_first_stage) {
+    VLOG(0) << "entering first stage";
+    for (auto& op_desc : program_->Block(0).AllOps()) {
+      forward_ops_.push_back(OpRegistry::CreateOp(*op_desc));
+    }
+    // Block 1: the first heter_listen_and_serv op is kept apart as listen_op_.
+    for (auto& op_desc : program_->Block(1).AllOps()) {
+      auto op = OpRegistry::CreateOp(*op_desc);
+      if (listen_op_ == nullptr && op->Type() == "heter_listen_and_serv") {
+        listen_op_ = std::move(op);
+      } else {
+        backward_ops_.push_back(std::move(op));
+      }
+    }
+  } else if (is_last_stage) {
+    VLOG(0) << "HeterSectionWorker::Initialize for the last stage";
+    for (auto& op_desc : program_->Block(0).AllOps()) {
+      auto op = OpRegistry::CreateOp(*op_desc);
+      if (listen_op_ == nullptr && op->Type() == "heter_listen_and_serv") {
+        listen_op_ = std::move(op);
+      } else {
+        forward_ops_.push_back(std::move(op));
+      }
+    }
+    // Every op of block 1 is a backward op on the last stage.
+    for (auto& op_desc : program_->Block(1).AllOps()) {
+      backward_ops_.push_back(OpRegistry::CreateOp(*op_desc));
+    }
+  }
+}
+#else
 void HeterSectionWorker::Initialize(const TrainerDesc& desc) {
   trainer_desc_ = desc;
   fetch_config_ = desc.fetch_config();
@@ -122,6 +168,7 @@ void HeterSectionWorker::Initialize(const TrainerDesc& desc) {
     }
   }
 }
+#endif
 
 void HeterSectionWorker::RunBackward(int micro_id) {
   for (size_t i = 0; i < backward_ops_.size(); i++) {
@@ -147,8 +194,11 @@ void HeterSectionWorker::RunBackward(int micro_id) {
 void HeterSectionWorker::MiniBatchBarrier() {
   // get micro id & deserialize data
   std::set<int> micro_ids;
+  VLOG(4) << "entering MiniBatchBarrier";
+  VLOG(4) << "micro_ids_.size(): " << micro_ids_.size();
   while (micro_ids.size() < micro_ids_.size()) {
     auto task = (*thread_queue_).Pop();
+    VLOG(4) << "got one task from task que in cpu worker";
     auto message_name = task.first;
     auto micro_id = task.second;
     PADDLE_ENFORCE_EQ(message_name.find("backward") != std::string::npos, true,
@@ -164,19 +214,44 @@ void HeterSectionWorker::MiniBatchBarrier() {
     RunBackward(micro_id);
     batch_num_++;
     BatchPostProcess();
+    VLOG(0) << "one task in cpu worker overed!";
   }
   micro_ids_.clear();
 }
 
-void HeterSectionWorker::RunListen() { listen_op_->Run(*root_scope_, place_); }
+void HeterSectionWorker::RunListen() {
+  VLOG(4) << ">>> run listen_op";
+  listen_op_->Run(*root_scope_, place_);
+  VLOG(4) << "<<< run listen_op over";
+}
 
 void HeterSectionWorker::RunForward(int micro_id) {
+#ifdef PADDLE_WITH_FLPS
+  BindingDataFeedMemory(micro_id);
+  if (debug_) {
+    timeline_.Start();
+  }
+  int cur_micro_batch = device_reader_->Next();
+  if (cur_micro_batch <= 0) {
+    VLOG(0) << "no more data in device_reader_";
+    epoch_finish_ = true;
+    return;
+  }
+  if (debug_) {
+    timeline_.Pause();
+    read_time_ += timeline_.ElapsedSec();
+    total_time_ += timeline_.ElapsedSec();
+    total_ins_num_ += cur_micro_batch;
+  }
+  VLOG(3) << "read a batch in thread " << thread_id_ << " micro " << micro_id;
+#else
   if (pipeline_stage_ == 0) {
     BindingDataFeedMemory(micro_id);
     if (debug_) {
       timeline_.Start();
     }
-    int cur_micro_batch = device_reader_->Next();
+    int cur_micro_batch =
+        device_reader_->Next();  // batch_size is just micro_batch_size
     if (cur_micro_batch <= 0) {
       epoch_finish_ = true;
       return;
@@ -189,6 +264,7 @@ void HeterSectionWorker::RunForward(int micro_id) {
     }
     VLOG(3) << "read a batch in thread " << thread_id_ << " micro " << micro_id;
   }
+#endif
   for (size_t i = 0; i < forward_ops_.size(); i++) {
     auto& op = forward_ops_[i];
     VLOG(3) << "Forward: start to run op " << op->Type() << " for micro-batch "
@@ -301,7 +377,7 @@ void HeterSectionWorker::Run() {
     while (!epoch_finish_) {
       // forward
       for (int i = 0; i < num_microbatches_; i++) {
-        VLOG(5) << "Run " << i << " microbatch";
+        VLOG(4) << "Run " << i << " microbatch";
         RunForward(i);
         if (epoch_finish_ == true) {
           break;
@@ -312,15 +388,19 @@ void HeterSectionWorker::Run() {
       if (micro_ids_.size() > 0) {
         MiniBatchBarrier();
       }
+      VLOG(0) << "one batch run over! micro_ids_size: " << micro_ids_.size();
     }
   } else {  // for heter worker
+    VLOG(4) << "entering heter Run...";
     auto heter_server = paddle::distributed::HeterServer::GetInstance();
     while (true) {
       if (heter_server->IsStop()) {
+        VLOG(0) << "heter_server is stopped!!";
         epoch_finish_ = true;
         break;
       }
       auto task = (*thread_queue_).Pop();
+      VLOG(4) << "got one task from task que in heter worker";
       auto message_name = task.first;
       auto micro_id = task.second;
       if (is_last_stage) {
@@ -331,6 +411,8 @@ void HeterSectionWorker::Run() {
         RunBackward(micro_id);
         batch_num_++;
         BatchPostProcess();
+        VLOG(0) << "one batch run over! micro_id: " << micro_id
+                << " batch_num: " << batch_num_;
       } else {
         if (message_name.find("forward") != std::string::npos) {
           RunForward(micro_id);
@@ -371,6 +453,7 @@ void HeterSectionWorker::BatchPostProcess() {
 }
 
 void HeterSectionWorker::TrainFiles() {
+  VLOG(4) << "entering HeterSectionWorker::TrainFiles";
   if (thread_id_ >= 0) {
     total_ins_num_ = 0;
     batch_num_ = 0;
@@ -378,9 +461,17 @@ void HeterSectionWorker::TrainFiles() {
     timeline_.Start();
     VLOG(3) << "begin section_worker TrainFiles";
     epoch_finish_ = false;
+#ifdef PADDLE_WITH_FLPS
+    if (device_reader_ == nullptr) {
+      VLOG(4) << "device_reader_ is null!!";
+    }
+    device_reader_->Start();
+#else
     if (pipeline_stage_ == 0) {
       device_reader_->Start();
     }
+#endif
+    VLOG(4) << "Run in TrainFiles:";
     while (!epoch_finish_) {
       Run();
       dev_ctx_->Wait();
@@ -428,9 +519,13 @@ void HeterSectionWorker::TrainFilesWithProfiler() {
     total_ins_num_ = 0;
     op_name_.clear();
     op_total_time_.clear();
+#ifdef PADDLE_WITH_FLPS
+    device_reader_->Start();
+#else
     if (pipeline_stage_ == 0) {
       device_reader_->Start();
     }
+#endif
     while (!epoch_finish_) {
       Run();
       dev_ctx_->Wait();
diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h
index 9d0e3c50953bd..6b115d33d2faa 100644
--- a/paddle/fluid/framework/heter_service.h
+++ b/paddle/fluid/framework/heter_service.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <unordered_map>  // NOLINT
 #include <unordered_set>  // NOLINT
 #include <vector>
+
 #include "paddle/fluid/framework/heter_service.pb.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc
index 75cc18887da9a..85e44ec44c6e3 100644
--- a/paddle/fluid/framework/hetercpu_worker.cc
+++ b/paddle/fluid/framework/hetercpu_worker.cc
@@ -311,8 +311,8 @@ void HeterCpuWorker::CollectLabelInfo(std::shared_ptr<HeterTask> task,
       continue;
     }
     LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
-    CHECK(tensor != nullptr) << "tensor of var "
-                             << sparse_key_names_[table_id][i] << " is null";
+    CHECK(tensor != nullptr)
+        << "tensor of var " << sparse_key_names_[table_id][i] << " is null";
 
     // skip slots which do not have embedding
     Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]);
@@ -465,9 +465,9 @@ void HeterCpuWorker::AdjustInsWeight(std::shared_ptr<HeterTask> task) {
   float* ins_weights = ins_weight_tensor->data<float>();
   size_t len = ins_weight_tensor->numel();  // len = batch size
   // here we assume nid_show slot only has one feasign in each instance
-  CHECK(len == nid_show_.size()) << "ins_weight size should be equal to "
-                                 << "nid_show size, " << len << " vs "
-                                 << nid_show_.size();
+  CHECK(len == nid_show_.size())
+      << "ins_weight size should be equal to "
+      << "nid_show size, " << len << " vs " << nid_show_.size();
   float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold();
   float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio();
   int64_t nid_adjw_num = 0;
@@ -482,9 +482,8 @@ void HeterCpuWorker::AdjustInsWeight(std::shared_ptr<HeterTask> task) {
     }
     float ins_weight = 1.0;
     if (nid_show >= 0 && nid_show < nid_adjw_threshold) {
-      ins_weight = log(M_E +
-                       (nid_adjw_threshold - nid_show) / nid_adjw_threshold *
-                           nid_adjw_ratio);
+      ins_weight = log(M_E + (nid_adjw_threshold - nid_show) /
+                                 nid_adjw_threshold * nid_adjw_ratio);
       // count nid adjw insnum and weight
       ++nid_adjw_num;
       nid_adjw_weight += ins_weight;
@@ -579,15 +578,15 @@ void HeterCpuWorker::CopyDenseVars() {
     Variable* src_var = thread_scope_->FindVar(src_var_name);
     CHECK(src_var != nullptr) << src_var_name << " not found";  // NOLINT
     LoDTensor* src_tensor = src_var->GetMutable<LoDTensor>();
-    CHECK(src_tensor != nullptr) << src_var_name
-                                 << " tensor is null";  // NOLINT
+    CHECK(src_tensor != nullptr)
+        << src_var_name << " tensor is null";  // NOLINT
     float* src_data = src_tensor->data<float>();
 
     Variable* dest_var = thread_scope_->FindVar(dest_var_name);
     CHECK(dest_var != nullptr) << dest_var_name << " not found";  // NOLINT
     LoDTensor* dest_tensor = dest_var->GetMutable<LoDTensor>();
-    CHECK(dest_tensor != nullptr) << dest_var_name
-                                  << " tensor is null";  // NOLINT
+    CHECK(dest_tensor != nullptr)
+        << dest_var_name << " tensor is null";  // NOLINT
     float* dest_data = dest_tensor->data<float>();
 
     CHECK(src_tensor->numel() == dest_tensor->numel())
diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc
index a4af56419a766..81c1a684959fa 100644
--- a/paddle/fluid/framework/heterxpu_trainer.cc
+++ b/paddle/fluid/framework/heterxpu_trainer.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <ctime>
 #include <string>
 #include <vector>
+
 #include "io/fs.h"
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_feed_factory.h"
diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc
index 2eeefb19a1aa8..805f992cf3e8b 100644
--- a/paddle/fluid/framework/infershape_utils_test.cc
+++ b/paddle/fluid/framework/infershape_utils_test.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
+
 #include <string>
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h
index c46a77f0b3590..93bbec251fee4 100644
--- a/paddle/fluid/framework/inplace_op_inference.h
+++ b/paddle/fluid/framework/inplace_op_inference.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
 
diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt
index 85b45f1a5bbc1..0033e825172bb 100644
--- a/paddle/fluid/framework/io/CMakeLists.txt
+++ b/paddle/fluid/framework/io/CMakeLists.txt
@@ -1,7 +1,16 @@
-cc_library(shell SRCS shell.cc DEPS string_helper glog timer enforce)
-cc_library(fs SRCS fs.cc DEPS string_helper glog boost enforce shell)
+cc_library(
+  shell
+  SRCS shell.cc
+  DEPS string_helper glog timer enforce)
+cc_library(
+  fs
+  SRCS fs.cc
+  DEPS string_helper glog boost enforce shell)
 
-cc_test(test_fs SRCS test_fs.cc DEPS fs shell)
-if (WITH_CRYPTO) 
-    add_subdirectory(crypto)
-endif (WITH_CRYPTO)
+cc_test(
+  test_fs
+  SRCS test_fs.cc
+  DEPS fs shell)
+if(WITH_CRYPTO)
+  add_subdirectory(crypto)
+endif(WITH_CRYPTO)
diff --git a/paddle/fluid/framework/io/crypto/CMakeLists.txt b/paddle/fluid/framework/io/crypto/CMakeLists.txt
index ae16353ec92ef..e2de877c39e51 100644
--- a/paddle/fluid/framework/io/crypto/CMakeLists.txt
+++ b/paddle/fluid/framework/io/crypto/CMakeLists.txt
@@ -1,3 +1,12 @@
-cc_library(paddle_crypto SRCS cipher_utils.cc cipher.cc aes_cipher.cc DEPS cryptopp enforce)
-cc_test(aes_cipher_test SRCS aes_cipher_test.cc DEPS paddle_crypto)
-cc_test(cipher_utils_test SRCS cipher_utils_test.cc DEPS paddle_crypto)
+cc_library(
+  paddle_crypto
+  SRCS cipher_utils.cc cipher.cc aes_cipher.cc
+  DEPS cryptopp enforce)
+cc_test(
+  aes_cipher_test
+  SRCS aes_cipher_test.cc
+  DEPS paddle_crypto)
+cc_test(
+  cipher_utils_test
+  SRCS cipher_utils_test.cc
+  DEPS paddle_crypto)
diff --git a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc
index 7f923f597b6de..67c758b012ad5 100644
--- a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc
+++ b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
+
 #include <cryptopp/cryptlib.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <fstream>
 #include <string>
+
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc
index eca175c020cb6..2001e8a416a1a 100644
--- a/paddle/fluid/framework/io/crypto/cipher.cc
+++ b/paddle/fluid/framework/io/crypto/cipher.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/io/crypto/cipher.h"
+
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.cc b/paddle/fluid/framework/io/crypto/cipher_utils.cc
index ee9f06b2f3eb1..b622138f7814a 100644
--- a/paddle/fluid/framework/io/crypto/cipher_utils.cc
+++ b/paddle/fluid/framework/io/crypto/cipher_utils.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 
 #include <cryptopp/osrng.h>
+
 #include <sstream>
 
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc
index 928e2ced9b195..356c919cbcbe8 100644
--- a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc
+++ b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc
@@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/io/crypto/cipher_utils.h"
+
 #include <gtest/gtest.h>
+
 #include <fstream>
 #include <string>
 
-#include "paddle/fluid/framework/io/crypto/cipher_utils.h"
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc
index b8aca886e7d60..fd602895aaed5 100644
--- a/paddle/fluid/framework/io/fs.cc
+++ b/paddle/fluid/framework/io/fs.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/io/fs.h"
 
 #include <sys/stat.h>
+
 #include <memory>
 
 #include "glog/logging.h"
diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h
index 1ebe80e943aae..088d4d97424a1 100644
--- a/paddle/fluid/framework/io/fs.h
+++ b/paddle/fluid/framework/io/fs.h
@@ -16,6 +16,7 @@
 
 #include <stdint.h>
 #include <stdio.h>
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/framework/io/test_fs.cc b/paddle/fluid/framework/io/test_fs.cc
index 49dee603200c9..adb6141fd56a1 100644
--- a/paddle/fluid/framework/io/test_fs.cc
+++ b/paddle/fluid/framework/io/test_fs.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <fstream>
+
 #include "paddle/fluid/framework/io/fs.h"
 
 #if defined _WIN32 || defined __APPLE__
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 8166c43e65db1..374b5490d5da1 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,6 +1,11 @@
-set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp)
-set(pass_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
+set(pass_file
+    ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp)
+set(pass_file_final
+    ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+file(
+  WRITE ${pass_file}
+  "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n"
+)
 file(APPEND ${pass_file} "\#pragma once\n")
 file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
 
@@ -9,54 +14,103 @@ copy_if_different(${pass_file} ${pass_file_final})
 add_subdirectory(fuse_optimizer_ops_pass)
 add_subdirectory(memory_optimize_pass)
 add_subdirectory(multi_devices_graph_pass)
-if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
-    add_subdirectory(fusion_group)
+if(NOT APPLE
+   AND NOT WIN32
+   AND (WITH_GPU OR WITH_ROCM))
+  add_subdirectory(fusion_group)
 endif()
 
 # Usage: pass_library(target inference) will append to paddle_inference_pass.h
 unset(INFER_IR_PASSES CACHE) # clear the global variable
 function(pass_library TARGET DEST)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS DIR)
-    set(targetPrefix "")
-
-    cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if(pass_library_DIR)
-        cc_library(${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry ${pass_library_DEPS})
-    else()
-        cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry ${pass_library_DEPS})
-    endif()
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS DIR)
+  set(targetPrefix "")
+
+  cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  if(pass_library_DIR)
+    cc_library(
+      ${TARGET}
+      SRCS ${pass_library_DIR}/${TARGET}.cc
+      DEPS graph_pattern_detector pass fuse_pass_base op_version_registry
+           ${pass_library_DEPS})
+  else()
+    cc_library(
+      ${TARGET}
+      SRCS ${TARGET}.cc
+      DEPS graph_pattern_detector pass fuse_pass_base op_version_registry
+           ${pass_library_DEPS})
+  endif()
 
-    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
-    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
-        if(NOT CMAKE_BUILD_TYPE STREQUAL "Release")
-            message(STATUS "add pass ${TARGET} ${DEST}")
-        endif()
-        file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
-        set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "")
+  # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
+  if(${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
+    if(NOT CMAKE_BUILD_TYPE STREQUAL "Release")
+      message(STATUS "add pass ${TARGET} ${DEST}")
     endif()
+    file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+    set(INFER_IR_PASSES
+        ${INFER_IR_PASSES} ${TARGET}
+        CACHE INTERNAL "")
+  endif()
 endfunction()
 
-cc_library(node SRCS node.cc DEPS proto_desc)
-cc_library(graph SRCS graph.cc DEPS node pretty_log)
-cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
-cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
-cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
-cc_library(cost_model SRCS cost_model.cc DEPS executor graph profiler proto_desc device_tracer)
+cc_library(
+  node
+  SRCS node.cc
+  DEPS proto_desc)
+cc_library(
+  graph
+  SRCS graph.cc
+  DEPS node pretty_log)
+cc_library(
+  graph_helper
+  SRCS graph_helper.cc
+  DEPS graph)
+cc_library(
+  pass
+  SRCS pass.cc
+  DEPS graph node graph_helper)
+cc_library(
+  graph_traits
+  SRCS graph_traits.cc
+  DEPS graph)
+cc_library(
+  cost_model
+  SRCS cost_model.cc
+  DEPS executor graph profiler proto_desc device_tracer)
 
-SET(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits)
-if (WITH_TESTING)
-    SET(GRAPH_PATTERN_DETECTOR_DEPS ${GRAPH_PATTERN_DETECTOR_DEPS} gtest)
+set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits)
+if(WITH_TESTING)
+  set(GRAPH_PATTERN_DETECTOR_DEPS ${GRAPH_PATTERN_DETECTOR_DEPS} gtest)
 endif(WITH_TESTING)
-cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS})
+cc_library(
+  graph_pattern_detector
+  SRCS graph_pattern_detector.cc
+  DEPS ${GRAPH_PATTERN_DETECTOR_DEPS})
 
-cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api pass)
-cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor)
-cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass)
-cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass)
+cc_library(
+  op_compat_sensible_pass
+  SRCS op_compat_sensible_pass.cc
+  DEPS graph_pattern_detector op_def_api pass)
+cc_library(
+  subgraph_detector
+  SRCS subgraph_detector.cc
+  DEPS graph_pattern_detector executor)
+cc_library(
+  fuse_pass_base
+  SRCS fuse_pass_base.cc
+  DEPS op_compat_sensible_pass)
+cc_library(
+  placement_pass_base
+  SRCS placement_pass_base.cc
+  DEPS pass)
 
-cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper)
+cc_library(
+  coalesce_grad_tensor_pass
+  SRCS coalesce_grad_tensor_pass.cc
+  DEPS graph graph_helper)
 
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
@@ -106,138 +160,348 @@ pass_library(generate_pass DEPS pass_desc_proto)
 target_link_libraries(generate_pass pass_desc_proto)
 
 if(WITH_TENSORRT)
-    pass_library(trt_map_matmul_to_mul_pass inference)
-    pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference)
-    pass_library(preln_skip_layernorm_fuse_pass inference)
-    pass_library(set_transformer_input_convert_pass inference)
-    pass_library(remove_padding_recover_padding_pass inference)
-    pass_library(delete_remove_padding_recover_padding_pass inference)
+  pass_library(trt_map_matmul_to_mul_pass inference)
+  pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference)
+  pass_library(trt_multihead_matmul_fuse_pass inference)
+  pass_library(trt_skip_layernorm_fuse_pass inference)
+  pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference)
+  pass_library(preln_skip_layernorm_fuse_pass inference)
+  pass_library(set_transformer_input_convert_pass inference)
+  pass_library(remove_padding_recover_padding_pass inference)
+  pass_library(delete_remove_padding_recover_padding_pass inference)
 endif()
 
 if(WITH_GPU OR WITH_ROCM)
-    pass_library(cudnn_placement_pass base DEPS placement_pass_base)
-    pass_library(embedding_eltwise_layernorm_fuse_pass inference)
+  pass_library(cudnn_placement_pass base DEPS placement_pass_base)
+  pass_library(embedding_eltwise_layernorm_fuse_pass inference)
 endif()
 
 if(WITH_MKLDNN)
-    pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn)
-    pass_library(mkldnn_inplace_pass inference DEPS mkldnn_placement_pass op_registry elementwise_add_op gelu_op activation_op softmax_op softmax DIR mkldnn)
-    pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn)
-    pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn)
-    pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
-    pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
-    pass_library(cpu_bfloat16_pass inference DIR mkldnn)
-    pass_library(fc_mkldnn_pass inference DIR mkldnn)
-    pass_library(interpolate_mkldnn_pass inference DIR mkldnn)
-    pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn)
-    pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(cpu_quantize_placement_pass base DIR mkldnn)
-    pass_library(cpu_quantize_pass inference DIR mkldnn)
-    pass_library(cpu_quantize_squash_pass inference DIR mkldnn)
-    pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(reshape_transpose_matmul_v2_mkldnn_fuse_pass inference DIR mkldnn)
-    pass_library(matmul_transpose_reshape_fuse_pass inference DIR mkldnn)
-    pass_library(matmul_v2_transpose_reshape_fuse_pass inference DIR mkldnn)
-    pass_library(batch_norm_act_fuse_pass inference DIR mkldnn)
-    pass_library(multi_gru_fuse_pass inference DIR mkldnn)
-    pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn)
-    pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn)
-    pass_library(compute_propagate_scales_mkldnn_pass inference DIR mkldnn)
+  pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn)
+  pass_library(
+    mkldnn_inplace_pass
+    inference
+    DEPS
+    mkldnn_placement_pass
+    op_registry
+    elementwise_add_op
+    gelu_op
+    activation_op
+    softmax_op
+    softmax
+    DIR
+    mkldnn)
+  pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn)
+  pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn)
+  pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
+  pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
+  pass_library(cpu_bfloat16_pass inference DIR mkldnn)
+  pass_library(fc_mkldnn_pass inference DIR mkldnn)
+  pass_library(interpolate_mkldnn_pass inference DIR mkldnn)
+  pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn)
+  pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(cpu_quantize_placement_pass base DIR mkldnn)
+  pass_library(cpu_quantize_pass inference DIR mkldnn)
+  pass_library(cpu_quantize_squash_pass inference DIR mkldnn)
+  pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn)
+  pass_library(reshape_transpose_matmul_v2_mkldnn_fuse_pass inference DIR
+               mkldnn)
+  pass_library(matmul_transpose_reshape_fuse_pass inference DIR mkldnn)
+  pass_library(matmul_v2_transpose_reshape_fuse_pass inference DIR mkldnn)
+  pass_library(batch_norm_act_fuse_pass inference DIR mkldnn)
+  pass_library(multi_gru_fuse_pass inference DIR mkldnn)
+  pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn)
+  pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn)
+  pass_library(compute_propagate_scales_mkldnn_pass inference DIR mkldnn)
 endif()
 
 if(WITH_IPU)
-    pass_library(forward_graph_extract_pass base DIR ipu)
-    pass_library(optimizer_extract_pass base DIR ipu)
-    pass_library(optimizer_state_align_pass base DIR ipu)
-    pass_library(ipu_graph_builder_pass base DIR ipu)
-    pass_library(ipu_runtime_replacer_pass base DIR ipu)
-    pass_library(inference_process_pass base DIR ipu)
-    pass_library(inference_postprocess_pass base DIR ipu)
-    pass_library(popart_canonicalization_pass base DIR ipu)
-    pass_library(ipu_inplace_pass base DIR ipu)
-    pass_library(infer_shape_pass base DIR ipu)
-    pass_library(delete_scale_op_pass base DIR ipu)
-    pass_library(avg_shard_pass base DIR ipu)
+  pass_library(forward_graph_extract_pass base DIR ipu)
+  pass_library(optimizer_extract_pass base DIR ipu)
+  pass_library(optimizer_state_align_pass base DIR ipu)
+  pass_library(ipu_graph_builder_pass base DIR ipu)
+  pass_library(ipu_runtime_replacer_pass base DIR ipu)
+  pass_library(inference_process_pass base DIR ipu)
+  pass_library(inference_postprocess_pass base DIR ipu)
+  pass_library(popart_canonicalization_pass base DIR ipu)
+  pass_library(ipu_inplace_pass base DIR ipu)
+  pass_library(infer_shape_pass base DIR ipu)
+  pass_library(delete_scale_op_pass base DIR ipu)
+  pass_library(avg_shard_pass base DIR ipu)
 endif()
 
-cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector )
-cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector )
-cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
-cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector )
-cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector )
-
-set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
-
-cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
-cc_library(pass_test_util SRCS pass_test_util.cc DEPS graph pass)
-
-cc_test(node_test SRCS node_test.cc DEPS node)
-cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
-cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
-cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
-cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
-cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry)
-cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
-cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass)
-cc_test(test_fc_fuse_pass_cc SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
-cc_test(test_fc_lstm_fuse_pass_cc SRCS fc_lstm_fuse_pass_tester.cc DEPS fc_lstm_fuse_pass framework_proto)
-cc_test(test_fc_gru_fuse_pass_cc SRCS fc_gru_fuse_pass_tester.cc DEPS fc_gru_fuse_pass framework_proto)
-cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
-cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_tester.cc DEPS seqpool_cvm_concat_fuse_pass framework_proto)
-cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto)
-cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass)
-cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
-cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass)
-cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass)
-cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass)
-cc_test(test_adaptive_pool2d_convert_global_pass SRCS adaptive_pool2d_convert_global_pass_tester.cc DEPS adaptive_pool2d_convert_global_pass)
-cc_test(test_unsqueeze2_eltwise_fuse_pass_cc SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass)
-cc_test(test_generate_pass_cc SRCS generate_pass_tester.cc DEPS generate_pass pass_desc_proto)
+cc_library(
+  fuse_bn_act_pass
+  SRCS fuse_bn_act_pass.cc
+  DEPS pass graph_pattern_detector)
+cc_library(
+  fuse_bn_add_act_pass
+  SRCS fuse_bn_add_act_pass.cc
+  DEPS pass graph_pattern_detector)
+cc_library(
+  fuse_elewise_add_act_pass
+  SRCS fuse_elewise_add_act_pass.cc
+  DEPS pass graph_pattern_detector)
+cc_library(
+  fuse_gemm_epilogue_pass
+  SRCS fuse_gemm_epilogue_pass.cc
+  DEPS pass graph_pattern_detector)
+cc_library(
+  fuse_relu_depthwise_conv_pass
+  SRCS fuse_relu_depthwise_conv_pass.cc
+  DEPS pass graph_pattern_detector)
+
+set(GLOB_PASS_LIB
+    ${PASS_LIBRARY}
+    CACHE INTERNAL "Global PASS library")
+
+cc_library(
+  pass_builder
+  SRCS pass_builder.cc
+  DEPS pass)
+cc_library(
+  pass_test_util
+  SRCS pass_test_util.cc
+  DEPS graph pass)
+
+cc_test(
+  node_test
+  SRCS node_test.cc
+  DEPS node)
+cc_test(
+  pass_test
+  SRCS pass_test.cc
+  DEPS graph pass graph_helper)
+cc_test(
+  graph_test
+  SRCS graph_test.cc
+  DEPS graph graph_helper op_registry)
+cc_test(
+  graph_helper_test
+  SRCS graph_helper_test.cc
+  DEPS graph graph_helper op_registry)
+cc_test(
+  graph_to_program_pass_test
+  SRCS graph_to_program_pass_test.cc
+  DEPS graph_to_program_pass)
+cc_test(
+  cost_model_test
+  SRCS cost_model_test.cc
+  DEPS cost_model op_registry)
+cc_test(
+  test_graph_pattern_detector
+  SRCS graph_pattern_detector_tester.cc
+  DEPS graph_pattern_detector)
+cc_test(
+  test_op_compat_sensible_pass
+  SRCS op_compat_sensible_pass_tester.cc
+  DEPS op_compat_sensible_pass)
+cc_test(
+  test_fc_fuse_pass_cc
+  SRCS fc_fuse_pass_tester.cc
+  DEPS fc_fuse_pass framework_proto)
+cc_test(
+  test_fc_lstm_fuse_pass_cc
+  SRCS fc_lstm_fuse_pass_tester.cc
+  DEPS fc_lstm_fuse_pass framework_proto)
+cc_test(
+  test_fc_gru_fuse_pass_cc
+  SRCS fc_gru_fuse_pass_tester.cc
+  DEPS fc_gru_fuse_pass framework_proto)
+cc_test(
+  test_seqpool_concat_fuse_pass
+  SRCS seqpool_concat_fuse_pass_tester.cc
+  DEPS seqpool_concat_fuse_pass framework_proto)
+cc_test(
+  test_seqpool_cvm_concat_fuse_pass
+  SRCS seqpool_cvm_concat_fuse_pass_tester.cc
+  DEPS seqpool_cvm_concat_fuse_pass framework_proto)
+cc_test(
+  test_repeated_fc_relu_fuse_pass_cc
+  SRCS repeated_fc_relu_fuse_pass_tester.cc
+  DEPS repeated_fc_relu_fuse_pass framework_proto)
+cc_test(
+  test_is_test_pass
+  SRCS is_test_pass_tester.cc
+  DEPS is_test_pass)
+cc_test(
+  test_simplify_with_basic_ops_pass
+  SRCS simplify_with_basic_ops_pass_tester.cc
+  DEPS simplify_with_basic_ops_pass)
+cc_test(
+  test_fc_elementwise_layernorm_fuse_pass_cc
+  SRCS fc_elementwise_layernorm_fuse_pass_tester.cc
+  DEPS fc_elementwise_layernorm_fuse_pass)
+cc_test(
+  test_skip_layernorm_fuse_pass
+  SRCS skip_layernorm_fuse_pass_tester.cc
+  DEPS skip_layernorm_fuse_pass)
+cc_test(
+  test_multihead_matmul_fuse_pass
+  SRCS multihead_matmul_fuse_pass_tester.cc
+  DEPS multihead_matmul_fuse_pass)
+cc_test(
+  test_conv_bn_fuse_pass_cc
+  SRCS conv_bn_fuse_pass_tester.cc
+  DEPS conv_bn_fuse_pass)
+cc_test(
+  test_adaptive_pool2d_convert_global_pass
+  SRCS adaptive_pool2d_convert_global_pass_tester.cc
+  DEPS adaptive_pool2d_convert_global_pass)
+cc_test(
+  test_unsqueeze2_eltwise_fuse_pass_cc
+  SRCS unsqueeze2_eltwise_fuse_pass_tester.cc
+  DEPS unsqueeze2_eltwise_fuse_pass)
+cc_test(
+  test_generate_pass_cc
+  SRCS generate_pass_tester.cc
+  DEPS generate_pass pass_desc_proto)
 if(WITH_GPU OR WITH_ROCM)
-    cc_test(test_embedding_eltwise_layernorm_fuse_pass SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc DEPS embedding_eltwise_layernorm_fuse_pass)
-    cc_test(test_cudnn_placement_pass SRCS cudnn_placement_pass_tester.cc DEPS cudnn_placement_pass)
+  cc_test(
+    test_embedding_eltwise_layernorm_fuse_pass
+    SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc
+    DEPS embedding_eltwise_layernorm_fuse_pass)
+  cc_test(
+    test_cudnn_placement_pass
+    SRCS cudnn_placement_pass_tester.cc
+    DEPS cudnn_placement_pass)
 endif()
 if(NOT WIN32)
-    cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+  cc_test(
+    test_sync_batch_norm_pass
+    SRCS sync_batch_norm_pass_tester.cc
+    DEPS sync_batch_norm_pass)
 endif()
-if (WITH_MKLDNN)
-    cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
-    cc_test(test_conv_bias_mkldnn_fuse_pass_cc SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
-    cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass)
-    cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass)
-    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util)
-    cc_test(test_int8_scale_calculation_mkldnn_pass SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc DEPS int8_scale_calculation_mkldnn_pass pass_test_util)
-    cc_test(test_fc_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util)
-    cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util)
-    cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util)
-    set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function)
-if (WITH_GPU OR WITH_ROCM)
+if(WITH_MKLDNN)
+  cc_test(
+    test_depthwise_conv_mkldnn_pass
+    SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc
+    DEPS depthwise_conv_mkldnn_pass)
+  cc_test(
+    test_conv_bias_mkldnn_fuse_pass_cc
+    SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+    DEPS conv_bias_mkldnn_fuse_pass naive_executor)
+  cc_test(
+    test_conv_activation_mkldnn_fuse_pass
+    SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
+    DEPS conv_activation_mkldnn_fuse_pass)
+  cc_test(
+    test_conv_concat_relu_mkldnn_fuse_pass
+    SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
+    DEPS conv_concat_relu_mkldnn_fuse_pass)
+  cc_test(
+    test_conv_elementwise_add_mkldnn_fuse_pass
+    SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+    DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util)
+  cc_test(
+    test_int8_scale_calculation_mkldnn_pass
+    SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc
+    DEPS int8_scale_calculation_mkldnn_pass pass_test_util)
+  cc_test(
+    test_fc_elementwise_add_mkldnn_fuse_pass
+    SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc
+    DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util)
+  cc_test(
+    test_fc_act_mkldnn_fuse_pass
+    SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc
+    DEPS fc_act_mkldnn_fuse_pass pass_test_util)
+  cc_test(
+    test_batch_norm_act_fuse_pass
+    SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc
+    DEPS batch_norm_act_fuse_pass pass_test_util)
+  set(TEST_CONV_BN_PASS_DEPS
+      conv_bn_fuse_pass
+      graph_to_program_pass
+      conv_op
+      conv_transpose_op
+      math_function
+      im2col
+      vol2col
+      batch_norm_op
+      gelu_op
+      activation_op
+      elementwise_add_op
+      concat_and_split
+      naive_executor
+      device_context
+      eigen_function)
+  if(WITH_GPU OR WITH_ROCM)
     set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
+  endif()
+  cc_test(
+    test_conv_batch_norm_mkldnn_fuse_pass
+    SRCS mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+    DEPS ${TEST_CONV_BN_PASS_DEPS})
+  cc_test(
+    test_scale_matmul_fuse_pass
+    SRCS mkldnn/scale_matmul_fuse_pass_tester.cc
+    DEPS scale_matmul_fuse_pass)
+  cc_test(
+    test_mkldnn_placement_pass
+    SRCS mkldnn/mkldnn_placement_pass_tester.cc
+    DEPS mkldnn_placement_pass)
+  cc_test(
+    test_mkldnn_inplace_pass
+    SRCS mkldnn/mkldnn_inplace_pass_tester.cc
+    DEPS mkldnn_inplace_pass)
+  cc_test(
+    test_compute_propagate_scales_mkldnn_pass
+    SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
+    DEPS compute_propagate_scales_mkldnn_pass naive_executor)
+  cc_test(
+    test_cpu_quantize_placement_pass
+    SRCS mkldnn/cpu_quantize_placement_pass_tester.cc
+    DEPS cpu_quantize_placement_pass)
+  cc_test(
+    test_cpu_quantize_pass
+    SRCS mkldnn/cpu_quantize_pass_tester.cc
+    DEPS cpu_quantize_pass naive_executor)
+  cc_test(
+    test_cpu_quantize_squash_pass
+    SRCS mkldnn/cpu_quantize_squash_pass_tester.cc
+    DEPS cpu_quantize_squash_pass naive_executor)
+  cc_test(
+    test_reshape_transpose_matmul_mkldnn_fuse_pass
+    SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
+    DEPS reshape_transpose_matmul_mkldnn_fuse_pass
+         reshape_transpose_matmul_v2_mkldnn_fuse_pass)
+  cc_test(
+    test_matmul_transpose_reshape_fuse_pass
+    SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc
+    DEPS matmul_transpose_reshape_fuse_pass
+         matmul_v2_transpose_reshape_fuse_pass)
+  cc_test(
+    test_shuffle_channel_mkldnn_detect_pass
+    SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
+    DEPS shuffle_channel_mkldnn_detect_pass)
+  cc_test(
+    test_cpu_bfloat16_placement_pass
+    SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc
+    DEPS cpu_bfloat16_placement_pass)
+  cc_test(
+    test_cpu_bfloat16_pass
+    SRCS mkldnn/cpu_bfloat16_pass_tester.cc
+    DEPS cpu_bfloat16_pass)
+  cc_test(
+    test_multi_gru_fuse_pass
+    SRCS mkldnn/multi_gru_fuse_pass_tester.cc
+    DEPS multi_gru_fuse_pass)
+  cc_test(
+    test_multi_gru_seq_fuse_pass
+    SRCS mkldnn/multi_gru_seq_fuse_pass_tester.cc
+    DEPS multi_gru_seq_fuse_pass)
+  set(TEST_FC_RNN_PASS_DEPS fc_gru_fuse_pass fc_lstm_fuse_pass
+                            mkldnn_placement_pass)
+  cc_test(
+    test_fc_rnn_mkldnn_fuse_pass
+    SRCS mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc
+    DEPS ${TEST_FC_RNN_PASS_DEPS})
 endif()
-    cc_test(test_conv_batch_norm_mkldnn_fuse_pass SRCS mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc DEPS ${TEST_CONV_BN_PASS_DEPS})
-    cc_test(test_scale_matmul_fuse_pass SRCS mkldnn/scale_matmul_fuse_pass_tester.cc DEPS scale_matmul_fuse_pass)
-    cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
-    cc_test(test_mkldnn_inplace_pass SRCS mkldnn/mkldnn_inplace_pass_tester.cc DEPS mkldnn_inplace_pass)
-    cc_test(test_compute_propagate_scales_mkldnn_pass SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc DEPS compute_propagate_scales_mkldnn_pass naive_executor)
-    cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
-    cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
-    cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
-    cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass reshape_transpose_matmul_v2_mkldnn_fuse_pass)
-    cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass matmul_v2_transpose_reshape_fuse_pass)
-    cc_test(test_shuffle_channel_mkldnn_detect_pass SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc DEPS shuffle_channel_mkldnn_detect_pass)
-    cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass)
-    cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass)
-    cc_test(test_multi_gru_fuse_pass SRCS mkldnn/multi_gru_fuse_pass_tester.cc DEPS multi_gru_fuse_pass)
-    cc_test(test_multi_gru_seq_fuse_pass SRCS mkldnn/multi_gru_seq_fuse_pass_tester.cc DEPS multi_gru_seq_fuse_pass)
-    set(TEST_FC_RNN_PASS_DEPS fc_gru_fuse_pass fc_lstm_fuse_pass mkldnn_placement_pass)
-    cc_test(test_fc_rnn_mkldnn_fuse_pass SRCS mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc DEPS ${TEST_FC_RNN_PASS_DEPS})
-endif ()
diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc
index 8870b68fbc5c5..e0ce58121a15e 100644
--- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc
+++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.cc b/paddle/fluid/framework/ir/add_support_int8_pass.cc
index 3a3f5c3741f4d..d38853bb96489 100644
--- a/paddle/fluid/framework/ir/add_support_int8_pass.cc
+++ b/paddle/fluid/framework/ir/add_support_int8_pass.cc
@@ -68,9 +68,8 @@ void AddSupportInt8Pass::ApplyImpl(ir::Graph* graph) const {
                    i++) {
                 if (quanted_op_desc->Output(quanted_op_desc->OutputNames()[i])
                             .size() > 0 &&
-                    input_name ==
-                        quanted_op_desc->Output(
-                            quanted_op_desc->OutputNames()[i])[0]) {
+                    input_name == quanted_op_desc->Output(
+                                      quanted_op_desc->OutputNames()[i])[0]) {
                   outscale_flag = true;
                   quanted_op_desc->SetAttr(
                       quanted_op_desc->OutputNames()[i],
diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
index 08e7c6f5b8689..910cb5801db45 100644
--- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
+++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
+
 #include <algorithm>
 #include <string>
+
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
index ae843aad7d313..710f8ef1b3759 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc
index 6086409ffd971..05c7834c9ca9b 100644
--- a/paddle/fluid/framework/ir/cost_model.cc
+++ b/paddle/fluid/framework/ir/cost_model.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/cost_model.h"
 
 #include <memory>
+
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/errors.h"
diff --git a/paddle/fluid/framework/ir/cost_model_test.cc b/paddle/fluid/framework/ir/cost_model_test.cc
index 57f3904d845c8..f5eaa2f0338cb 100644
--- a/paddle/fluid/framework/ir/cost_model_test.cc
+++ b/paddle/fluid/framework/ir/cost_model_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/cost_model.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc
index 2d270f444adbc..2711ddf92d792 100644
--- a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/cudnn_placement_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/cudnn_placement_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/operator.h"
 
diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
index 9473cc069285c..5043beef82401 100644
--- a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
@@ -11,10 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <string>
-
 #include "paddle/fluid/framework/ir/delete_dropout_op_pass.h"
 
+#include <string>
+
 namespace phi {
 class DenseTensor;
 }  // namespace phi
diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc
index 79a06572d1427..e4b6e43e5c3dc 100644
--- a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/delete_fill_constant_op_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc
index 2fc133edb7a96..a02efc0a7cef2 100644
--- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc
@@ -102,9 +102,10 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const {
         break;
       }
     }
-    PADDLE_ENFORCE_GT(arg_name.size(), 0, platform::errors::InvalidArgument(
-                                              "can not find the input %s.",
-                                              quant_dequant_op_out_name));
+    PADDLE_ENFORCE_GT(
+        arg_name.size(), 0,
+        platform::errors::InvalidArgument("can not find the input %s.",
+                                          quant_dequant_op_out_name));
     // any_op2_desc->SetAttr("enable_int8", true);
     any_op2_desc->SetAttr("bit_length", bit_length);
 
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
index 727e42629f9fa..8deaf10d200a5 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index 482e38355c59c..a34e0a5d1deae 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
 
 #include <string>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
index 46a9b2eae35db..be22ee9b2fe36 100644
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h"
-
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 1e25b21483b82..1802616c0df5b 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
+
 #include <string>
 
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
index 39b544e716079..e40759cd3fbe2 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h
index df3fbc293b78e..9ad3c28f09a2e 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h
@@ -13,9 +13,9 @@
 // limitations under the License.
 #pragma once
 
-#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index b99e607f92b5d..5b4bb98ff537c 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
+
 #include <string>
 
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h
index a313e49f0b2b6..3e47f0795738e 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h
@@ -14,9 +14,9 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
index ab66fb4a46a8a..632bb237fa219 100644
--- a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
+++ b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <memory>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
index f12273e94dddd..6a2a086704829 100644
--- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fuse_bn_act_pass.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc
index 005f006ab0478..ff4850838c51f 100644
--- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fuse_bn_add_act_pass.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
index 62f65baf33618..3feea822bc1ef 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc
index f48224cbdc24f..1c6b856d987ce 100644
--- a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc
@@ -14,7 +14,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -22,6 +24,12 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+static void GetTransposeAttrsFromOp(const OpDesc &op, bool *trans_x,
+                                    bool *trans_y) {
+  *trans_x = BOOST_GET_CONST(bool, op.GetAttr("trans_x"));
+  *trans_y = BOOST_GET_CONST(bool, op.GetAttr("trans_y"));
+}
+
 void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const {
   EpiloguePassActivationCache cache;
 
@@ -75,6 +83,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph,
     if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc))
       return;
 
+    bool trans_x, trans_y;
+    GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y);
+
     OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block());
     std::string activation = "none";
     fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue");
@@ -85,6 +96,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph,
     fused_gemm_epilogue_op_desc.SetAttr("activation", activation);
     fused_gemm_epilogue_op_desc.SetAttr("op_role",
                                         matmul_op_desc->GetAttr("op_role"));
+    fused_gemm_epilogue_op_desc.SetAttr("trans_x", trans_x);
+    fused_gemm_epilogue_op_desc.SetAttr("trans_y", trans_y);
     auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc);
 
     IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node);
@@ -154,6 +167,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd(
 
     auto activation = act_op->Op()->Type();
 
+    bool trans_x, trans_y;
+    GetTransposeAttrsFromOp(*matmul_op_desc, &trans_x, &trans_y);
+
     OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block());
     fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue");
     fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()});
@@ -163,6 +179,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd(
     fused_gemm_epilogue_op_desc.SetAttr("activation", activation);
     fused_gemm_epilogue_op_desc.SetAttr("op_role",
                                         matmul_op_desc->GetAttr("op_role"));
+    fused_gemm_epilogue_op_desc.SetAttr("trans_x", trans_x);
+    fused_gemm_epilogue_op_desc.SetAttr("trans_y", trans_y);
 
     auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc);
 
@@ -274,6 +292,9 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph,
                            matmul_grad_op_desc))
       return;
 
+    bool trans_x, trans_y;
+    GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y);
+
     OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block());
     std::string activation_grad = "none";
     fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad");
@@ -292,6 +313,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph,
                                              activation_grad);
     fused_gemm_epilogue_grad_op_desc.SetAttr(
         "op_role", matmul_grad_op_desc->GetAttr("op_role"));
+    fused_gemm_epilogue_grad_op_desc.SetAttr("trans_x", trans_x);
+    fused_gemm_epilogue_grad_op_desc.SetAttr("trans_y", trans_y);
 
     auto gemm_epilogue_grad_node =
         g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc);
@@ -394,6 +417,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd(
 
     auto activation_grad = act_grad_op->Op()->Type();
 
+    bool trans_x, trans_y;
+    GetTransposeAttrsFromOp(*matmul_grad_op_desc, &trans_x, &trans_y);
     OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block());
     fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad");
     fused_gemm_epilogue_grad_op_desc.SetInput("DOut",
@@ -410,6 +435,8 @@ ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd(
                                              activation_grad);
     fused_gemm_epilogue_grad_op_desc.SetAttr(
         "op_role", matmul_grad_op_desc->GetAttr("op_role"));
+    fused_gemm_epilogue_grad_op_desc.SetAttr("trans_x", trans_x);
+    fused_gemm_epilogue_grad_op_desc.SetAttr("trans_y", trans_y);
 
     auto gemm_epilogue_grad_node =
         g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc);
@@ -456,10 +483,6 @@ bool FuseGemmEpiloguePass::IsGemmFromLinear_(
       if (tmp_vec.size() > 0) return false;
     }
   }
-  if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) ||
-      BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y")))
-    return false;
-
   return true;
 }
 
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt
index 22876e962a033..7146e9919190d 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt
@@ -1,4 +1,16 @@
-cc_library(fuse_optimizer_op_pass SRCS fuse_optimizer_op_pass.cc DEPS graph graph_helper)
-cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc DEPS fuse_optimizer_op_pass)
-cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc DEPS fuse_optimizer_op_pass)
-cc_library(fuse_momentum_op_pass SRCS fuse_momentum_op_pass.cc DEPS fuse_optimizer_op_pass)
+cc_library(
+  fuse_optimizer_op_pass
+  SRCS fuse_optimizer_op_pass.cc
+  DEPS graph graph_helper)
+cc_library(
+  fuse_adam_op_pass
+  SRCS fuse_adam_op_pass.cc
+  DEPS fuse_optimizer_op_pass)
+cc_library(
+  fuse_sgd_op_pass
+  SRCS fuse_sgd_op_pass.cc
+  DEPS fuse_optimizer_op_pass)
+cc_library(
+  fuse_momentum_op_pass
+  SRCS fuse_momentum_op_pass.cc
+  DEPS fuse_optimizer_op_pass)
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
index 0094b674c2a17..9629b9209c4d8 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <sys/types.h>
+
 #include <string>
 
 #include "glog/logging.h"
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
index f87d31cbc409c..e290bdf99ce65 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
@@ -67,8 +67,9 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass {
           platform::errors::InvalidArgument(
               "All momentum Op's attr(use_nesterov) must be same, but there "
               "are two different value: %d, %d.",
-              use_nesterov, BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr(
-                                                      "use_nesterov"))));
+              use_nesterov,
+              BOOST_GET_CONST(bool,
+                              momentum_op->Op()->GetAttr("use_nesterov"))));
       PADDLE_ENFORCE_EQ(
           op_role,
           BOOST_GET_CONST(int, momentum_op->Op()->GetAttr(
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
index 40e1de8a523aa..e3e5221531ee0 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/phi/core/kernel_factory.h"
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
index 56ca98b566070..bcfa69ac2e7ef 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h"
+
 #include <algorithm>
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
index 78b15398cc792..7df678fbdd7e3 100644
--- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
@@ -1,14 +1,22 @@
-cc_library(code_generator
-    SRCS operation.cc code_generator.cc code_generator_helper.cc
-    DEPS graph subgraph_detector)
+cc_library(
+  code_generator
+  SRCS operation.cc code_generator.cc code_generator_helper.cc
+  DEPS graph subgraph_detector)
 if(WITH_GPU OR WITH_ROCM)
-    cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
+  cc_test(
+    test_code_generator
+    SRCS code_generator_tester.cc
+    DEPS code_generator device_code lod_tensor graph_viz_pass)
 endif()
 
-cc_library(fusion_group_pass
-    SRCS fusion_group_pass.cc elementwise_group_detector.cc
-    DEPS subgraph_detector fuse_pass_base code_generator device_code)
-cc_test(test_fusion_group_pass SRCS fusion_group_pass_tester.cc DEPS fusion_group_pass graph_viz_pass)
+cc_library(
+  fusion_group_pass
+  SRCS fusion_group_pass.cc elementwise_group_detector.cc
+  DEPS subgraph_detector fuse_pass_base code_generator device_code)
+cc_test(
+  test_fusion_group_pass
+  SRCS fusion_group_pass_tester.cc
+  DEPS fusion_group_pass graph_viz_pass)
 if(WITH_TESTING AND TEST test_code_generator)
-    set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120)
 endif()
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index 5b125030a7a77..a8a09d690239c 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/fusion_group/code_generator.h"
+
 #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
 #include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h"
 
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
index 18bd6d623b7ea..650ed965067ad 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <sstream>
 #include <string>
+
 #include "paddle/fluid/framework/ir/fusion_group/operation.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index 7b6bbf0251001..a24a9af158ec0 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <cmath>
 #include <string>
 
diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
index 6fa3044affc21..5be4091ca8b3c 100644
--- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
+++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h"
+
 #include <string>
 
 #include "paddle/fluid/framework/ir/fusion_group/operation.h"
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
index 85d34405c5e57..44df3a837f6d3 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h"
+
 #include "paddle/fluid/framework/ir/fusion_group/code_generator.h"
 #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
index db22c03a7d9c0..402fad0e84cfa 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index 2b7a3e1899c76..7d1b7bafa1365 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/fusion_group/operation.h"
+
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h
index 5a29e875aea61..1c334e70f1c30 100644
--- a/paddle/fluid/framework/ir/fusion_group/subgraph.h
+++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fusion_group/operation.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc
index 02c9d8e1c0c24..00d69c9d5d2b1 100644
--- a/paddle/fluid/framework/ir/generate_pass.cc
+++ b/paddle/fluid/framework/ir/generate_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/generate_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
 namespace paddle {
@@ -234,178 +235,183 @@ bool IsDuplicatePattern(const GraphPatternDetector::subgraph_t& subgraph,
 
 GraphPatternDetector::handle_t GetGenerateDelete(
     const PDPattern& pattern, const proto::PassDesc& pass_desc) {
-  GraphPatternDetector::handle_t handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-    if (IsDuplicatePattern(subgraph, graph)) {
-      return;
-    }
-    // `var_node_maps` record the mapping of variable to the pattern subgraph.
-    std::map<std::string, Node*> var_node_maps;
-    for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) {
-      Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var()));
-      const auto& iter = var_node_maps.find(var_map.replace_var());
-      if (var_node_maps.end() == iter) {
-        // first node is input
-        var_node_maps.insert({var_map.replace_var(), node});
-      } else {
-        // output node
-        for (Node* s_node : node->outputs) {
-          iter->second->outputs.push_back(s_node);
-          std::replace(s_node->inputs.begin(), s_node->inputs.end(), node,
-                       iter->second);
-          s_node->Op()->RenameInput(node->Name(), iter->second->Name());
+  GraphPatternDetector::handle_t handler =
+      [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+        if (IsDuplicatePattern(subgraph, graph)) {
+          return;
         }
-      }
-    }
-    // Remove nodes that are intermediate.
-    std::unordered_set<const Node*> remove_nodes;
-    for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) {
-      remove_nodes.emplace(subgraph.at(pdnode.get()));
-    }
-    for (auto iter : var_node_maps) {
-      remove_nodes.erase(iter.second);
-    }
-    GraphSafeRemoveNodes(graph, remove_nodes);
-  };
+        // `var_node_maps` record the mapping of variable to the pattern
+        // subgraph.
+        std::map<std::string, Node*> var_node_maps;
+        for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) {
+          Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var()));
+          const auto& iter = var_node_maps.find(var_map.replace_var());
+          if (var_node_maps.end() == iter) {
+            // first node is input
+            var_node_maps.insert({var_map.replace_var(), node});
+          } else {
+            // output node
+            for (Node* s_node : node->outputs) {
+              iter->second->outputs.push_back(s_node);
+              std::replace(s_node->inputs.begin(), s_node->inputs.end(), node,
+                           iter->second);
+              s_node->Op()->RenameInput(node->Name(), iter->second->Name());
+            }
+          }
+        }
+        // Remove nodes that are intermediate.
+        std::unordered_set<const Node*> remove_nodes;
+        for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) {
+          remove_nodes.emplace(subgraph.at(pdnode.get()));
+        }
+        for (auto iter : var_node_maps) {
+          remove_nodes.erase(iter.second);
+        }
+        GraphSafeRemoveNodes(graph, remove_nodes);
+      };
   return handler;
 }
 
 GraphPatternDetector::handle_t GetGenerateRewrite(
     const PDPattern& pattern, const proto::PassDesc& pass_desc) {
-  GraphPatternDetector::handle_t handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-    if (IsDuplicatePattern(subgraph, graph)) {
-      return;
-    }
-    for (const auto& condition : pass_desc.var_attr_conditions()) {
-      if (condition.has_condition_attr()) {
-        Node* node =
-            subgraph.at(pattern.RetrieveNode(condition.attr().var_name()));
-        Attribute node_attr = GetVarAttrValue(node->Var(), condition.attr());
-        Attribute condition_attr;
-        if (condition.condition_attr().role() ==
-            proto::PassDesc_RoleType_kVariable) {
-          Node* condition_node =
-              subgraph.at(pattern.RetrieveNode(condition.attr().var_name()));
-          condition_attr = GetVarAttrValue(condition_node->Var(),
-                                           condition.condition_attr());
-        } else {
-          PADDLE_THROW(
-              platform::errors::Unimplemented("Unimplemented for operation."));
-        }
-        bool check_failed = false;
-        if (condition.type() == proto::PassDesc_ConditionType_kEQ) {
-          check_failed = !(node_attr == condition_attr);
-        }
-        if (check_failed) {
-          VLOG(3) << "Check var [" << node->Name() << "] with attr ["
-                  << condition.attr().name() << "] failed, skip this pattern.";
+  GraphPatternDetector::handle_t handler =
+      [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+        if (IsDuplicatePattern(subgraph, graph)) {
           return;
         }
-      }
-    }
-    // `var_node_maps` record the mapping of variable to the pattern subgraph.
-    std::map<std::string, Node*> var_node_maps;
-    for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) {
-      Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var()));
-      var_node_maps.insert({var_map.replace_var(), node});
-    }
-    // Traverse all operators to create subgraph.
-    for (int index = 0; index < pass_desc.replace_size(); ++index) {
-      const proto::OpDesc& op = pass_desc.replace(index);
-      OpDesc op_desc;
-      std::vector<Node *> in_nodes, out_nodes;
-      op_desc.SetType(op.type());
-      // Create Nodes for inputs of current operator.
-      for (const proto::OpDesc::Var& var : op.inputs()) {
-        std::vector<std::string> arguments;
-        for (const std::string& argument : var.arguments()) {
-          // The input may be mapped on the operator of pattern subgraph.
-          Node* node = nullptr;
-          auto iter = var_node_maps.find(argument);
-          if (var_node_maps.end() == iter) {
-            VarDesc var_desc(patterns::UniqueKey(argument));
-            node = graph->CreateVarNode(&var_desc);
-            var_node_maps.insert({argument, node});
-          } else {
-            node = iter->second;
-          }
-          in_nodes.push_back(node);
-          arguments.push_back(node->Name());
-        }
-        op_desc.SetInput(var.parameter(), arguments);
-      }
-      // Create Nodes for outputs of current operator.
-      for (const proto::OpDesc::Var& var : op.outputs()) {
-        std::vector<std::string> arguments;
-        for (const std::string& argument : var.arguments()) {
-          // The output may be mapped on the operator of pattern subgraph.
-          Node* node = nullptr;
-          auto iter = var_node_maps.find(argument);
-          if (var_node_maps.end() == iter) {
-            VarDesc var_desc(patterns::UniqueKey(argument));
-            node = graph->CreateVarNode(&var_desc);
-            var_node_maps.insert({argument, node});
-          } else {
-            if (in_nodes.end() ==
-                std::find(in_nodes.begin(), in_nodes.end(), iter->second)) {
-              node = iter->second;
+        for (const auto& condition : pass_desc.var_attr_conditions()) {
+          if (condition.has_condition_attr()) {
+            Node* node =
+                subgraph.at(pattern.RetrieveNode(condition.attr().var_name()));
+            Attribute node_attr =
+                GetVarAttrValue(node->Var(), condition.attr());
+            Attribute condition_attr;
+            if (condition.condition_attr().role() ==
+                proto::PassDesc_RoleType_kVariable) {
+              Node* condition_node = subgraph.at(
+                  pattern.RetrieveNode(condition.attr().var_name()));
+              condition_attr = GetVarAttrValue(condition_node->Var(),
+                                               condition.condition_attr());
             } else {
-              node = graph->CreateVarNode(iter->second->Var());
+              PADDLE_THROW(platform::errors::Unimplemented(
+                  "Unimplemented for operation."));
+            }
+            bool check_failed = false;
+            if (condition.type() == proto::PassDesc_ConditionType_kEQ) {
+              check_failed = !(node_attr == condition_attr);
+            }
+            if (check_failed) {
+              VLOG(3) << "Check var [" << node->Name() << "] with attr ["
+                      << condition.attr().name()
+                      << "] failed, skip this pattern.";
+              return;
             }
           }
-          out_nodes.push_back(node);
-          arguments.push_back(node->Name());
         }
-        op_desc.SetOutput(var.parameter(), arguments);
-      }
-      // Set attribute for current operator.
-      for (const proto::OpDesc::Attr& attr : op.attrs()) {
-        op_desc.SetAttr(attr.name(), GetAttrValue(attr));
-      }
-      for (const auto& attr_map : pass_desc.op_attr_maps()) {
-        if (attr_map.replace_attr().op_index() == index) {
-          Attribute attr;
-          if (attr_map.pattern_attr().role() ==
-              proto::PassDesc_RoleType_kVariable) {
-            Node* condition_node = subgraph.at(
-                pattern.RetrieveNode(attr_map.pattern_attr().var_name()));
-            attr =
-                GetVarAttrValue(condition_node->Var(), attr_map.pattern_attr());
-          } else {
-            Node* condition_node = subgraph.at(pattern.RetrieveNode(
-                std::to_string(attr_map.pattern_attr().op_index())));
-            attr =
-                GetOpAttrValue(condition_node->Op(), attr_map.pattern_attr());
+        // `var_node_maps` record the mapping of variable to the pattern
+        // subgraph.
+        std::map<std::string, Node*> var_node_maps;
+        for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) {
+          Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var()));
+          var_node_maps.insert({var_map.replace_var(), node});
+        }
+        // Traverse all operators to create subgraph.
+        for (int index = 0; index < pass_desc.replace_size(); ++index) {
+          const proto::OpDesc& op = pass_desc.replace(index);
+          OpDesc op_desc;
+          std::vector<Node*> in_nodes, out_nodes;
+          op_desc.SetType(op.type());
+          // Create Nodes for inputs of current operator.
+          for (const proto::OpDesc::Var& var : op.inputs()) {
+            std::vector<std::string> arguments;
+            for (const std::string& argument : var.arguments()) {
+              // The input may be mapped on the operator of pattern subgraph.
+              Node* node = nullptr;
+              auto iter = var_node_maps.find(argument);
+              if (var_node_maps.end() == iter) {
+                VarDesc var_desc(patterns::UniqueKey(argument));
+                node = graph->CreateVarNode(&var_desc);
+                var_node_maps.insert({argument, node});
+              } else {
+                node = iter->second;
+              }
+              in_nodes.push_back(node);
+              arguments.push_back(node->Name());
+            }
+            op_desc.SetInput(var.parameter(), arguments);
+          }
+          // Create Nodes for outputs of current operator.
+          for (const proto::OpDesc::Var& var : op.outputs()) {
+            std::vector<std::string> arguments;
+            for (const std::string& argument : var.arguments()) {
+              // The output may be mapped on the operator of pattern subgraph.
+              Node* node = nullptr;
+              auto iter = var_node_maps.find(argument);
+              if (var_node_maps.end() == iter) {
+                VarDesc var_desc(patterns::UniqueKey(argument));
+                node = graph->CreateVarNode(&var_desc);
+                var_node_maps.insert({argument, node});
+              } else {
+                if (in_nodes.end() ==
+                    std::find(in_nodes.begin(), in_nodes.end(), iter->second)) {
+                  node = iter->second;
+                } else {
+                  node = graph->CreateVarNode(iter->second->Var());
+                }
+              }
+              out_nodes.push_back(node);
+              arguments.push_back(node->Name());
+            }
+            op_desc.SetOutput(var.parameter(), arguments);
+          }
+          // Set attribute for current operator.
+          for (const proto::OpDesc::Attr& attr : op.attrs()) {
+            op_desc.SetAttr(attr.name(), GetAttrValue(attr));
           }
-          if (attr_map.has_operation()) {
-            Attribute operation = GetAttrValue(attr_map.operation().value());
-            attr = boost::apply_visitor(
-                operation_visitor(attr_map.operation().type()), attr,
-                operation);
+          for (const auto& attr_map : pass_desc.op_attr_maps()) {
+            if (attr_map.replace_attr().op_index() == index) {
+              Attribute attr;
+              if (attr_map.pattern_attr().role() ==
+                  proto::PassDesc_RoleType_kVariable) {
+                Node* condition_node = subgraph.at(
+                    pattern.RetrieveNode(attr_map.pattern_attr().var_name()));
+                attr = GetVarAttrValue(condition_node->Var(),
+                                       attr_map.pattern_attr());
+              } else {
+                Node* condition_node = subgraph.at(pattern.RetrieveNode(
+                    std::to_string(attr_map.pattern_attr().op_index())));
+                attr = GetOpAttrValue(condition_node->Op(),
+                                      attr_map.pattern_attr());
+              }
+              if (attr_map.has_operation()) {
+                Attribute operation =
+                    GetAttrValue(attr_map.operation().value());
+                attr = boost::apply_visitor(
+                    operation_visitor(attr_map.operation().type()), attr,
+                    operation);
+              }
+              op_desc.SetAttr(attr_map.replace_attr().name(), attr);
+            }
+          }
+          // Create a Node for current operator.
+          Node* op_node = graph->CreateOpNode(&op_desc);
+          for (Node* node : in_nodes) {
+            IR_NODE_LINK_TO(node, op_node);
+          }
+          for (Node* node : out_nodes) {
+            IR_NODE_LINK_TO(op_node, node);
           }
-          op_desc.SetAttr(attr_map.replace_attr().name(), attr);
         }
-      }
-      // Create a Node for current operator.
-      Node* op_node = graph->CreateOpNode(&op_desc);
-      for (Node* node : in_nodes) {
-        IR_NODE_LINK_TO(node, op_node);
-      }
-      for (Node* node : out_nodes) {
-        IR_NODE_LINK_TO(op_node, node);
-      }
-    }
-    // Remove nodes that are intermediate.
-    std::unordered_set<const Node*> remove_nodes;
-    for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) {
-      remove_nodes.emplace(subgraph.at(pdnode.get()));
-    }
-    for (auto iter : var_node_maps) {
-      remove_nodes.erase(iter.second);
-    }
-    GraphSafeRemoveNodes(graph, remove_nodes);
-  };
+        // Remove nodes that are intermediate.
+        std::unordered_set<const Node*> remove_nodes;
+        for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) {
+          remove_nodes.emplace(subgraph.at(pdnode.get()));
+        }
+        for (auto iter : var_node_maps) {
+          remove_nodes.erase(iter.second);
+        }
+        GraphSafeRemoveNodes(graph, remove_nodes);
+      };
   return handler;
 }
 
diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc
index 6876dde50c157..7e98b11215a75 100644
--- a/paddle/fluid/framework/ir/generate_pass_tester.cc
+++ b/paddle/fluid/framework/ir/generate_pass_tester.cc
@@ -12,16 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/generate_pass.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/generate_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 REGISTER_GENERATE_PASS(generate_fc_fuse) {
   paddle::framework::ir::PassPairs pass_pairs;
   for (bool with_relu : {true, false}) {
     // pattern
-    SUBGRAPH_(pattern) =
-        [ subgraph = &pattern, with_relu ](VAR_(x), VAR_(y), VAR_(z)) {
+    SUBGRAPH_(pattern) = [subgraph = &pattern, with_relu](VAR_(x), VAR_(y),
+                                                          VAR_(z)) {
       VLOG(3) << "exec lambda func.";
       auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out");
       auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out");
@@ -32,8 +32,8 @@ REGISTER_GENERATE_PASS(generate_fc_fuse) {
       }
     };
     // replace
-    SUBGRAPH_(replace) =
-        [ subgraph = &replace, with_relu ](VAR_(x), VAR_(y), VAR_(z)) {
+    SUBGRAPH_(replace) = [subgraph = &replace, with_relu](VAR_(x), VAR_(y),
+                                                          VAR_(z)) {
       auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}});
       return fc.Out("Out");
     };
diff --git a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc
index ac580b99b5c95..8e58231e98681 100644
--- a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc
+++ b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc
@@ -16,9 +16,9 @@
 
 #include <cmath>
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
-
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index f5f6f3ecb855c..acf8f6ec6435b 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/ir/graph.h"
+
 #include <memory>
 
-#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/operator.h"
 
 PADDLE_DEFINE_EXPORTED_bool(convert_all_blocks, true,
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 10645f08dc3ba..40a6fbbade80e 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <gflags/gflags.h>
+
 #include <map>
 #include <memory>
 #include <string>
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index ed7aa451d134c..d4c7a607db371 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph_helper.h"
+
 #include <queue>
 #include <stack>
+
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
@@ -421,8 +423,9 @@ std::vector<ir::Node *> TopologySortGraphByDescOrder(const Graph &graph) {
            DescOrderComparator>
       adj_list = BuildOperationAdjList<DescOrderComparator>(graph);
   PADDLE_ENFORCE_EQ(HasCircleInternal<DescOrderComparator>(adj_list, nullptr),
-                    false, platform::errors::InvalidArgument(
-                               "Generated graph shouldn't contain cycle."));
+                    false,
+                    platform::errors::InvalidArgument(
+                        "Generated graph shouldn't contain cycle."));
   std::unordered_set<ir::Node *> visited;
   std::vector<ir::Node *> ret;
   for (auto adj : adj_list) {
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
index 0a2dcfed000c9..5972cd40817ac 100644
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/graph.h"
-#include "gtest/gtest.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index f7c1a68c826f0..ca5a82708c554 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/operator.h"
@@ -70,8 +71,9 @@ void PDPattern::AddEdge(PDNode *a, PDNode *b) {
       a, platform::errors::NotFound("PDNode %s is not found.", a->name()));
   PADDLE_ENFORCE_NOT_NULL(
       b, platform::errors::NotFound("PDNode %s is not found.", b->name()));
-  PADDLE_ENFORCE_NE(a, b, platform::errors::PermissionDenied(
-                              "Cannot connect the same node in the graph."));
+  PADDLE_ENFORCE_NE(a, b,
+                    platform::errors::PermissionDenied(
+                        "Cannot connect the same node in the graph."));
   edges_.emplace_back(a, b);
 }
 
@@ -2631,8 +2633,10 @@ PDNode *patterns::Bfloat16Placement::operator()(
 PDNode *patterns::OrphanedBfloat16::operator()() {
   auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
   prev_op->assert_more([&](Node *node) {
-    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
-           "float32";
+    bool data_type_is_missing = !node->Op()->HasAttr("mkldnn_data_type");
+    bool data_type_is_fp32 = node->Op()->GetAttrIfExists<std::string>(
+                                 "mkldnn_data_type") == "float32";
+    return data_type_is_missing || data_type_is_fp32;
   });
   auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput();
 
@@ -2645,8 +2649,10 @@ PDNode *patterns::OrphanedBfloat16::operator()() {
 
   auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
   next_op->assert_more([&](Node *node) {
-    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
-           "float32";
+    bool data_type_is_missing = !node->Op()->HasAttr("mkldnn_data_type");
+    bool data_type_is_fp32 = node->Op()->GetAttrIfExists<std::string>(
+                                 "mkldnn_data_type") == "float32";
+    return data_type_is_missing || data_type_is_fp32;
   });
 
   prev_op->LinksTo({prev_out});
@@ -3058,11 +3064,10 @@ PDNode *patterns::ReshapeTransposeMatmulPattern::operator()(
     transpose_out->assert_is_only_output_of_op("transpose2");
 
   auto transpose_xshape =
-      with_transpose_xshape
-          ? pattern->NewNode(transpose_xshape_repr())
-                ->AsIntermediate()
-                ->assert_is_op_output("transpose2", "XShape")
-          : nullptr;
+      with_transpose_xshape ? pattern->NewNode(transpose_xshape_repr())
+                                  ->AsIntermediate()
+                                  ->assert_is_op_output("transpose2", "XShape")
+                            : nullptr;
 
   auto matmul_out = pattern->NewNode(matmul_out_repr())
                         ->AsOutput()
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
index 5ac5a5d983992..b02b2e13edc97 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
@@ -152,12 +152,12 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
   x.mutable_pattern()->AddEdge(any_var, any_op1);
 
   int count = 0;
-  GraphPatternDetector::handle_t handle = [&](
-      const GraphPatternDetector::subgraph_t& s, Graph* g) {
-    LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
-              << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
-    count++;
-  };
+  GraphPatternDetector::handle_t handle =
+      [&](const GraphPatternDetector::subgraph_t& s, Graph* g) {
+        LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
+                  << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
+        count++;
+      };
 
   x(&graph, handle);
 
diff --git a/paddle/fluid/framework/ir/graph_printer.h b/paddle/fluid/framework/ir/graph_printer.h
index 76b07f0d65309..1b0e059f122b5 100644
--- a/paddle/fluid/framework/ir/graph_printer.h
+++ b/paddle/fluid/framework/ir/graph_printer.h
@@ -15,11 +15,13 @@
 #pragma once
 
 #include <glog/logging.h>
+
 #include <fstream>
 #include <iosfwd>
 #include <memory>
 #include <ostream>
 #include <string>
+
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 1ff67ae0fe0d9..db18a735ce2dd 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc
index 3ad591c6dff04..f57cdd9d9746c 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 
 #include <gflags/gflags.h>
+
 #include <algorithm>
 
 #include "paddle/fluid/framework/op_proto_maker.h"
diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc
index b06314563025a..36bc3e6dd781b 100644
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/graph_traits.h"
+
 #include <list>
 #include <map>
 
-#include "paddle/fluid/framework/ir/graph_traits.h"
-
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -76,21 +76,22 @@ NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
 }
 
 NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
-    : stack_(std::move(other.stack_)),
-      visited_(std::move(other.visited_)) {}
+    : stack_(std::move(other.stack_)), visited_(std::move(other.visited_)) {}
 
 NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
     : stack_(other.stack_), visited_(other.visited_) {}
 
 Node &NodesDFSIterator::operator*() {
-  PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange(
-                                               "The iterator exceeds range."));
+  PADDLE_ENFORCE_EQ(
+      stack_.empty(), false,
+      platform::errors::OutOfRange("The iterator exceeds range."));
   return *stack_.top();
 }
 
 NodesDFSIterator &NodesDFSIterator::operator++() {
-  PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange(
-                                               "The iterator exceeds range."));
+  PADDLE_ENFORCE_EQ(
+      stack_.empty(), false,
+      platform::errors::OutOfRange("The iterator exceeds range."));
   visited_.insert(stack_.top());
   auto *cur = stack_.top();
   stack_.pop();
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 7311eb4b91df8..da48d1d19b60a 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_printer.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
index 6b91ea4e360df..3d60148c170f9 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -46,42 +47,42 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const {
   scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
 
   int found_subgraph_count = 0;
-  GraphPatternDetector::handle_t handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-    Node* scale_op_var = subgraph.at(scale_op);
-    Node* scale_in_var = subgraph.at(scale_in);
-    Node* scale_out_var = subgraph.at(scale_out);
-    const std::string scale_in_name = scale_in_var->Name();
-    const std::string scale_out_name = scale_out_var->Name();
-    // Remove links in graph
-    GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
-    // Modify pre_op_desc
-    // Link pre_op directly to scale_out
-    for (auto& node : graph->Nodes()) {
-      if (node->IsOp()) {
-        auto* op_desc = node->Op();
-        auto out_vars_map = op_desc->Outputs();
-        for (auto out_var_map : out_vars_map) {
-          auto names = out_var_map.second;
-          bool reset = false;
-          for (size_t i = 0; i < names.size(); i++) {
-            if (names[i] == scale_in_name) {
-              reset = true;
-              names[i] = scale_out_name;
-              break;
+  GraphPatternDetector::handle_t handler =
+      [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+        Node* scale_op_var = subgraph.at(scale_op);
+        Node* scale_in_var = subgraph.at(scale_in);
+        Node* scale_out_var = subgraph.at(scale_out);
+        const std::string scale_in_name = scale_in_var->Name();
+        const std::string scale_out_name = scale_out_var->Name();
+        // Remove links in graph
+        GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
+        // Modify pre_op_desc
+        // Link pre_op directly to scale_out
+        for (auto& node : graph->Nodes()) {
+          if (node->IsOp()) {
+            auto* op_desc = node->Op();
+            auto out_vars_map = op_desc->Outputs();
+            for (auto out_var_map : out_vars_map) {
+              auto names = out_var_map.second;
+              bool reset = false;
+              for (size_t i = 0; i < names.size(); i++) {
+                if (names[i] == scale_in_name) {
+                  reset = true;
+                  names[i] = scale_out_name;
+                  break;
+                }
+              }
+              if (reset) {
+                op_desc->SetOutput(out_var_map.first, names);
+                op_desc->Flush();
+                IR_NODE_LINK_TO(node, scale_out_var);
+                break;
+              }
             }
           }
-          if (reset) {
-            op_desc->SetOutput(out_var_map.first, names);
-            op_desc->Flush();
-            IR_NODE_LINK_TO(node, scale_out_var);
-            break;
-          }
         }
-      }
-    }
-    found_subgraph_count++;
-  };
+        found_subgraph_count++;
+      };
 
   detector(graph, handler);
   AddStatis(found_subgraph_count);
diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc
index f1ee3c26b8f48..5c7373e1a77d8 100644
--- a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc
@@ -14,10 +14,9 @@
 
 #include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h"
 
-#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
-
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc
index ebe40c3ee204e..cbe57eae4c496 100644
--- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc
index a6b82089dc4df..df4ea7fac4b35 100644
--- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc
@@ -14,11 +14,10 @@
 
 #include "paddle/fluid/framework/ir/ipu/inference_process_pass.h"
 
-#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
-#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
-
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
+#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc
index 4da913e7176ca..12d646e153b4f 100644
--- a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h"
+
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/platform/device/ipu/ipu_backend.h"
 #include "paddle/fluid/platform/device/ipu/ipu_names.h"
diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc
index bf0667aeafe60..d2444295544b9 100644
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@@ -11,9 +11,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/is_test_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/is_test_pass.h"
 #ifdef _WIN32
 #undef FALSE
 #undef TRUE
diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
index 4b0dc4809f550..1b7b06213fe3c 100644
--- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
+
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
index 93b6396bf7f31..a72a59374f902 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@@ -29,31 +29,31 @@ class Node;
 class Graph;
 
 /*
-* Remove the sum op of all gradients of the backward op.
-* And remove the dependecies of the optimizer related to the
-* same backward op.
-*
-* Before this pass:
-*
-* forward_op1 forward_op2
-*     |            |
-*  grad_op1    grad_op2
-*        \      /
-*          \  /
-*         sum_op
-*           |
-*         sgd_op
-*
-* After this pass:
-* forward_op1 forward_op2
-*     |            |
-*  grad_op1    grad_op2
-*     |            |
-*  sgd_op1      sgd_op2
-*
-* sgd_op1 and sgd_op2 will update the same weight which holds the same
-* memory, so we could benefits from the acceleration
-*/
+ * Remove the sum op of all gradients of the backward op.
+ * And remove the dependencies of the optimizer related to the
+ * same backward op.
+ *
+ * Before this pass:
+ *
+ * forward_op1 forward_op2
+ *     |            |
+ *  grad_op1    grad_op2
+ *        \      /
+ *          \  /
+ *         sum_op
+ *           |
+ *         sgd_op
+ *
+ * After this pass:
+ * forward_op1 forward_op2
+ *     |            |
+ *  grad_op1    grad_op2
+ *     |            |
+ *  sgd_op1      sgd_op2
+ *
+ * sgd_op1 and sgd_op2 will update the same weight which holds the same
+ * memory, so we could benefit from the acceleration
+ */
 class LockFreeOptimizePass : public Pass {
  public:
   virtual ~LockFreeOptimizePass() {}
diff --git a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc
index 2335e5eee01db..a4bab58506e82 100644
--- a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc
@@ -16,9 +16,9 @@
 
 #include <cmath>
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
-
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
index 25b07ddf41414..32d02902e8643 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
@@ -1,24 +1,80 @@
-cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
-cc_library(conditional_block_op_eager_deletion_pass SRCS conditional_block_op_eager_deletion_pass.cc DEPS conditional_block_op_helper graph_helper pass computation_op_handle)
-cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
-cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle)
-cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle)
-cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
+cc_library(
+  op_graph_view
+  SRCS op_graph_view.cc
+  DEPS op_handle_base)
+cc_library(
+  conditional_block_op_eager_deletion_pass
+  SRCS conditional_block_op_eager_deletion_pass.cc
+  DEPS conditional_block_op_helper graph_helper pass computation_op_handle)
+cc_library(
+  while_op_eager_deletion_pass
+  SRCS while_op_eager_deletion_pass.cc
+  DEPS while_op_helper graph_helper pass computation_op_handle)
+cc_library(
+  recurrent_op_eager_deletion_pass
+  SRCS recurrent_op_eager_deletion_pass.cc
+  DEPS recurrent_op_helper graph_helper pass computation_op_handle)
+cc_library(
+  reference_count_pass_helper
+  SRCS reference_count_pass_helper.cc
+  DEPS garbage_collector computation_op_handle var_handle)
+cc_library(
+  reference_count_pass
+  SRCS reference_count_pass.cc
+  DEPS computation_op_handle graph graph_helper pass op_graph_view
+       reference_count_pass_helper)
 
-SET(EAGER_DELETETION_PASS_DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass conditional_block_op_eager_deletion_pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper)
-if (WITH_CINN)
-  cc_library(share_varinfo_into_cinn_pass SRCS share_varinfo_into_cinn_pass.cc DEPS pass enforce graph_helper computation_op_handle eager_deletion_op_handle cinn_compiler)
-  cc_test(share_varinfo_into_cinn_pass_test SRCS share_varinfo_into_cinn_pass_test.cc DEPS share_varinfo_into_cinn_pass parallel_executor cinn_compiler elementwise_add_op mul_op cinn_launch_op)
+set(EAGER_DELETETION_PASS_DEPS
+    computation_op_handle
+    eager_deletion_op_handle
+    graph
+    graph_helper
+    pass
+    conditional_block_op_eager_deletion_pass
+    while_op_eager_deletion_pass
+    recurrent_op_eager_deletion_pass
+    reference_count_pass_helper)
+if(WITH_CINN)
+  cc_library(
+    share_varinfo_into_cinn_pass
+    SRCS share_varinfo_into_cinn_pass.cc
+    DEPS pass enforce graph_helper computation_op_handle
+         eager_deletion_op_handle cinn_compiler)
+  cc_test(
+    share_varinfo_into_cinn_pass_test
+    SRCS share_varinfo_into_cinn_pass_test.cc
+    DEPS share_varinfo_into_cinn_pass parallel_executor cinn_compiler
+         elementwise_add_op mul_op cinn_launch_op)
   list(APPEND EAGER_DELETETION_PASS_DEPS share_varinfo_into_cinn_pass)
 endif()
 
-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS ${EAGER_DELETETION_PASS_DEPS})
+cc_library(
+  eager_deletion_pass
+  SRCS eager_deletion_pass.cc
+  DEPS ${EAGER_DELETETION_PASS_DEPS})
 
-cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle graph pass multi_devices_helper)
+cc_library(
+  memory_reuse_pass
+  SRCS memory_reuse_pass.cc
+  DEPS computation_op_handle reference_count_pass_helper
+       share_tensor_buffer_op_handle graph pass multi_devices_helper)
 
-cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass executor_gc_helper)
-cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass)
+cc_library(
+  buffer_shared_inplace_op_pass
+  SRCS buffer_shared_inplace_op_pass.cc
+  DEPS memory_reuse_pass executor_gc_helper)
+cc_library(
+  buffer_shared_cross_op_memory_reuse_pass
+  SRCS buffer_shared_cross_op_memory_reuse_pass.cc
+  DEPS memory_reuse_pass)
 
-cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
+cc_library(
+  inplace_addto_op_pass
+  SRCS inplace_addto_op_pass.cc
+  DEPS memory_reuse_pass)
 
-cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op eigen_function)
+cc_test(
+  test_reference_count_pass_last_lived_ops
+  SRCS test_reference_count_pass_last_lived_ops.cc
+  DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op
+       eigen_function)
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
index b12b84d4a491b..090673b87ed8f 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
@@ -321,13 +321,15 @@ size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween(
 }
 
 void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const {
-  PADDLE_ENFORCE_EQ(ops_.empty(), true, platform::errors::InvalidArgument(
-                                            "Ops must be initialized here."));
+  PADDLE_ENFORCE_EQ(
+      ops_.empty(), true,
+      platform::errors::InvalidArgument("Ops must be initialized here."));
   PADDLE_ENFORCE_EQ(
       op_to_idx_.empty(), true,
       platform::errors::InvalidArgument("Op to idx must be initialized here."));
-  PADDLE_ENFORCE_EQ(deps_.empty(), true, platform::errors::InvalidArgument(
-                                             "Deps must be initialized here."));
+  PADDLE_ENFORCE_EQ(
+      deps_.empty(), true,
+      platform::errors::InvalidArgument("Deps must be initialized here."));
 
   // Toposort ops
   OpGraphView graph_view(ir::FilterByNodeWrapper<OpHandleBase>(*graph_));
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
index 1ca6e989f275c..682a72c5729ac 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
@@ -166,8 +166,9 @@ static std::string GetFirstVarName(const OpDesc &op, const std::string &slot,
 static std::vector<std::vector<std::pair<std::string, std::string>>>
 GetInplaceVars(const BlockDesc &block, bool use_cuda,
                const std::vector<std::string> &skip_vars) {
-  PADDLE_ENFORCE_EQ(block.ID(), 0, platform::errors::Unimplemented(
-                                       "Inplace can only perform in block 0."));
+  PADDLE_ENFORCE_EQ(
+      block.ID(), 0,
+      platform::errors::Unimplemented("Inplace can only perform in block 0."));
   // only take block 0 gc_vars
   const auto op_gc_vars =
       GetEagerDeletionCleanVars(*block.Program(), skip_vars)[0];
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
index e89734bacec36..8d593254f90fa 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
@@ -19,6 +19,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
index d6f286afc5590..b5506dd1dcbdd 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
@@ -136,13 +136,15 @@ void OpGraphView::BreadthFirstVisit(Callback &&callback) const {
     }
   }
 
-  PADDLE_ENFORCE_EQ(num_calls, op_num, platform::errors::InvalidArgument(
-                                           "There are unvisited ops."));
+  PADDLE_ENFORCE_EQ(
+      num_calls, op_num,
+      platform::errors::InvalidArgument("There are unvisited ops."));
   PADDLE_ENFORCE_EQ(
       visited_ops.size(), op_num,
       platform::errors::InvalidArgument("There are unvisited ops."));
-  PADDLE_ENFORCE_EQ(op_deps.empty(), true, platform::errors::InvalidArgument(
-                                               "There are unvisited ops."));
+  PADDLE_ENFORCE_EQ(
+      op_deps.empty(), true,
+      platform::errors::InvalidArgument("There are unvisited ops."));
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
index 6077069ea747a..b1fdb5e2160e0 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc
@@ -26,9 +26,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+using paddle::operators::OpAndGradOpPair;
 using paddle::operators::OpVariant;
 using paddle::operators::OpVariantSet;
-using paddle::operators::OpAndGradOpPair;
 
 void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const {
   // Find all recurrent_op and recurrent_grad_op in graph
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
index 313b2cc33459e..3f88aaad57e26 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
index 88bf9e3876399..848b6e494ad67 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <memory>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc
index 4aa59d9196b1b..80f201d2d5afc 100644
--- a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc
+++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -25,9 +26,10 @@ void MixedPrecisionConfigurePass::InsertCastOps(
   VLOG(3) << "Insert the cast op before and after the kernel that does not "
              "supports fp16 precision";
 
-  auto update_cast_desc = [&](
-      framework::OpDesc& desc, const std::string& x_name,
-      const std::string& out_name, const int in_dtype, const int out_dtype) {
+  auto update_cast_desc = [&](framework::OpDesc& desc,
+                              const std::string& x_name,
+                              const std::string& out_name, const int in_dtype,
+                              const int out_dtype) {
     desc.SetType("cast");
     desc.SetInput("X", {x_name});
     desc.SetOutput("Out", {out_name});
diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc
index 9f6cd8992dcb9..62145cb6a0fb1 100644
--- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
index e13d44ac23222..b1b546f085cf8 100644
--- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
@@ -34,7 +34,7 @@ void SetBatchNormAttrs(OpDesc* bn_op, bool is_test = true,
   bn_op->SetAttr("fuse_with_relu", false);
   bn_op->SetAttr("epsilon", 0.001f);
 }
-}
+}  // namespace
 
 // ------------------------------ Test cases -----------------------------------
 
@@ -48,11 +48,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) {
   auto prog = test::BuildProgramDesc(
       {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"},
       {"scale", "bias"});
-  auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"},
-                                                     {"Scale", "scale"},
-                                                     {"Bias", "bias"},
-                                                     {"Mean", "m"},
-                                                     {"Variance", "v"}},
+  auto* bn_op = test::CreateOp(&prog, "batch_norm",
+                               {{"X", "x"},
+                                {"Scale", "scale"},
+                                {"Bias", "bias"},
+                                {"Mean", "m"},
+                                {"Variance", "v"}},
                                {{"Y", "bn_y"},
                                 {"MeanOut", "m_out"},
                                 {"VarianceOut", "var_out"},
@@ -73,11 +74,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) {
 TEST(FuseBatchNormActOneDNNPass, FuseIsTest) {
   auto prog = test::BuildProgramDesc({"x", "m", "v", "bn_y", "act_y"},
                                      {"scale", "bias"});
-  auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"},
-                                                     {"Scale", "scale"},
-                                                     {"Bias", "bias"},
-                                                     {"Mean", "m"},
-                                                     {"Variance", "v"}},
+  auto* bn_op = test::CreateOp(&prog, "batch_norm",
+                               {{"X", "x"},
+                                {"Scale", "scale"},
+                                {"Bias", "bias"},
+                                {"Mean", "m"},
+                                {"Variance", "v"}},
                                {{"Y", "bn_y"}});
   SetBatchNormAttrs(bn_op, true, false);
   test::CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false);
@@ -106,11 +108,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowTrainableStats) {
   auto prog = test::BuildProgramDesc(
       {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"},
       {"scale", "bias"});
-  auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"},
-                                                     {"Scale", "scale"},
-                                                     {"Bias", "bias"},
-                                                     {"Mean", "m"},
-                                                     {"Variance", "v"}},
+  auto* bn_op = test::CreateOp(&prog, "batch_norm",
+                               {{"X", "x"},
+                                {"Scale", "scale"},
+                                {"Bias", "bias"},
+                                {"Mean", "m"},
+                                {"Variance", "v"}},
                                {{"Y", "bn_y"},
                                 {"MeanOut", "m_out"},
                                 {"VarianceOut", "var_out"},
@@ -132,11 +135,12 @@ TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) {
   auto prog = test::BuildProgramDesc(
       {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"},
       {"scale", "bias"});
-  auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"},
-                                                     {"Scale", "scale"},
-                                                     {"Bias", "bias"},
-                                                     {"Mean", "m"},
-                                                     {"Variance", "v"}},
+  auto* bn_op = test::CreateOp(&prog, "batch_norm",
+                               {{"X", "x"},
+                                {"Scale", "scale"},
+                                {"Bias", "bias"},
+                                {"Mean", "m"},
+                                {"Variance", "v"}},
                                {{"Y", "bn_y"},
                                 {"MeanOut", "m_out"},
                                 {"VarianceOut", "var_out"},
@@ -158,11 +162,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) {
   auto prog = test::BuildProgramDesc(
       {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"},
       {"scale", "bias"});
-  auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"},
-                                                     {"Scale", "scale"},
-                                                     {"Bias", "bias"},
-                                                     {"Mean", "m"},
-                                                     {"Variance", "v"}},
+  auto* bn_op = test::CreateOp(&prog, "batch_norm",
+                               {{"X", "x"},
+                                {"Scale", "scale"},
+                                {"Bias", "bias"},
+                                {"Mean", "m"},
+                                {"Variance", "v"}},
                                {{"Y", "bn_y"},
                                 {"MeanOut", "m_out"},
                                 {"VarianceOut", "var_out"},
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
index d7d0b988b551e..e19426d01d195 100644
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
@@ -12,11 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h"
+
 #include <float.h>
+
 #include <algorithm>
 
 #include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h"
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
index b0076c1b38cd4..26fb6e4978ff5 100644
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
index 1fefab805b1d3..e3db85471766f 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
 #include <vector>
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
index a74d7443ee1fe..18e09173491da 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
@@ -23,8 +23,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 /*
-* Fuse the Conv and Elementwise_add to a ConvBiasOp.
-*/
+ * Fuse the Conv and Elementwise_add to a ConvBiasOp.
+ */
 class Graph;
 
 class ConvBiasFusePass : public FusePassBase {
@@ -38,8 +38,8 @@ class ConvBiasFusePass : public FusePassBase {
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
 /*
-* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
-*/
+ * Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
+ */
 class Conv2DTransposeBiasFusePass : public ConvBiasFusePass {
  public:
   Conv2DTransposeBiasFusePass();
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
index e9850483ebe91..0e052debaeeb2 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/platform/place.h"
 
+#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
index 6b648608ca1d2..7d165b1a38a46 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
index eebc87f5d9988..58eec79344dd5 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
@@ -188,7 +188,8 @@ class DeQuantizer final : public Quanter {
   bool IsNotPermittedName(const std::string& output_name) const override {
     std::unordered_map<std::string, std::vector<std::string>> block_list{
         {"layer_norm",
-         {"Mean", "Variance"}}};  // not used in inference in MKLDNN
+         {"Mean", "Variance"}},     // not used in inference in MKLDNN
+        {"fc", {"ResidualData"}}};  // artificial output, already dequantized
 
     std::vector<std::string> blocked_outputs{"XShape"};  // blocklist for any op
     auto op_name = op->Name();
@@ -225,7 +226,7 @@ class DeQuantizer final : public Quanter {
     return Quanter::create_quant_op(output_name, input_name);
   }
 };
-}
+}  // namespace
 using string::PrettyLogDetail;
 
 void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const {
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index a61c043b58065..452212664ec93 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
+
 #include <sstream>
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/string/pretty_log.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 912c16288c2b9..fb36365ac54ef 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"  // NOLINT
 #include <gtest/gtest.h>
+
 #include <unordered_map>
 
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"  // NOLINT
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
index 350fad2c672d4..f6e5279ed23af 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
-
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
index 06940b38ea8e0..979c601ac04c9 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
-
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc
index b7f7a8071d214..2a8a248a99faf 100644
--- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc
index 7fc8806452b88..afcd493f92f56 100644
--- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc
index 59d81cb86474d..4b158ccc5a8b0 100644
--- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc
@@ -32,7 +32,9 @@ TEST(FuseFCActOneDNNPass, ThrowUseMkldnn) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}}, false);
   test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false);
@@ -51,7 +53,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluTanh) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}});
   auto* act_op = test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}},
@@ -83,7 +87,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluErf) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}});
   auto* act_op = test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}},
@@ -115,7 +121,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluAuto) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}});
   test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false);
@@ -145,7 +153,9 @@ TEST(FuseFCActOneDNNPass, FuseWithTanh) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}});
   test::CreateOp(&prog, "tanh", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false);
@@ -175,7 +185,9 @@ TEST(FuseFCActOneDNNPass, FuseWithSigmoid) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}});
   test::CreateOp(&prog, "sigmoid", {{"Input", "fc_y"}}, {{"Out", "act_y"}},
@@ -206,7 +218,9 @@ TEST(FuseFCActOneDNNPass, FuseWithMish) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}});
   test::CreateOp(&prog, "mish", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false);
@@ -236,7 +250,9 @@ TEST(FuseFCActOneDNNPass, FuseWithHardSwish) {
       test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"});
   test::CreateOp(&prog, "fc",
                  {
-                     {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"},
+                     {"Input", "x"},
+                     {"Weights", "weights"},
+                     {"Bias", "bias"},
                  },
                  {{"Out", "fc_y"}});
   test::CreateOp(&prog, "hard_swish", {{"Input", "fc_y"}}, {{"Out", "act_y"}},
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc
index 2e62597f2ee29..60856512779ff 100644
--- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/string/pretty_log.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc
index 678a8fb4a6955..a5481f5c6f30e 100644
--- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc
@@ -129,17 +129,13 @@ void Int8ScaleCalculationMkldnnPass::ApplyImpl(ir::Graph* graph) const {
     bool has_activation =
         !conv_op->Op()->GetAttrIfExists<std::string>("fuse_activation").empty();
     float activation_scale =
-        force_fp32_output
-            ? 1.0f
-            : has_activation
-                  ? conv_op->Op()->GetAttrIfExists<float>("Scale_out")
-                  : 1.0f;
+        force_fp32_output ? 1.0f
+        : has_activation  ? conv_op->Op()->GetAttrIfExists<float>("Scale_out")
+                          : 1.0f;
     auto scale_out_data =
-        force_fp32_output
-            ? 1.0f
-            : has_activation
-                  ? 1.0f
-                  : conv_op->Op()->GetAttrIfExists<float>("Scale_out");
+        force_fp32_output ? 1.0f
+        : has_activation  ? 1.0f
+                          : conv_op->Op()->GetAttrIfExists<float>("Scale_out");
     float sum_scale =
         fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
 
diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc
index 804d04e35f690..9d3940c96644b 100644
--- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h"
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc
index 4eb532b47cb4b..1ed36e06fb19f 100644
--- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
index 34a35877a7f25..f6c99a477bcd8 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
@@ -13,8 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h"
+
 #include <paddle/fluid/string/pretty_log.h>
+
 #include <vector>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc
index ed99989cf382f..ddb9e717392e1 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc
index dcf4664d963da..6e106fa9dae5f 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h"
+
 #include <vector>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index 4236dc55d5186..06e0db4c93ea0 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <boost/logic/tribool.hpp>
 #include <random>
 #include <string>
 #include <unordered_set>
 
-#include <boost/logic/tribool.hpp>
-
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc
index c4770a322db50..1ca9e76f79d6f 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h"
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h"
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
index d2763bd6a6dc0..ae8dbceb7a64c 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h"
+
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h
index 44b6d110db82c..880630055e916 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <memory>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/pass.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
index 7df957b2c0eca..7f4e5d32536a0 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h"
-
 #include <gtest/gtest.h>
-#include <unordered_set>
 
 #include <boost/logic/tribool.hpp>
+#include <unordered_set>
 
+#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h
index 505bb2739e1d4..99a55b26e99db 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
index 4012e04f7d2af..671ad4c1c4b2f 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
-
 #include <gtest/gtest.h>
+
 #include <boost/logic/tribool.hpp>
 
+#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc
index 76a0c883c8923..73089df571765 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h"
+
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/platform/errors.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h
index 70f88104b4b52..cf53ecec9262e 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc
index 7b6681ff96784..60890336b3052 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h"
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc
index 7821501cc4b23..06125e51fb65e 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h"
+
 #include <limits>
 #include <sstream>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/platform/errors.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h
index 546a3d6570b41..af58ae2bda49c 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc
index 3738e3ebd68eb..2924401bc2e6a 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc
@@ -12,10 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h"
 #include <gtest/gtest.h>
+
 #include <initializer_list>
 
+#include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc
index 63e402cb52983..15100b23407b0 100644
--- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
 #include "paddle/fluid/framework/op_version_registry.h"
@@ -124,10 +126,11 @@ void QuantDequantMkldnnPass::CollectInputScalesFromFake(
       auto* op_desc = op_node->Op();
       const int bit_length =
           BOOST_GET_CONST(int, op_desc->GetAttr("bit_length"));
-      PADDLE_ENFORCE_EQ(bit_length, 8, platform::errors::InvalidArgument(
-                                           "Unsupported number quantization "
-                                           "bits: %d, only 8 is supported now.",
-                                           bit_length));
+      PADDLE_ENFORCE_EQ(bit_length, 8,
+                        platform::errors::InvalidArgument(
+                            "Unsupported number quantization "
+                            "bits: %d, only 8 is supported now.",
+                            bit_length));
 
       auto x_var_name = op_desc->Input("X")[0];
       auto scale_name = op_desc->Input("InScale")[0];
diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h
index a9442f707402d..5003e1878bfeb 100644
--- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
index 96f575745a3a2..05b1d419f6f4a 100644
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h"
+
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/pretty_log.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
index e688635646001..023dd6af7ee01 100644
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h"
 #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h"
-
-#include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc
index 203966dc682f5..ed57be12c78e3 100644
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h"
+
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/pretty_log.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc
index 60f844ffc80ce..09bad959eb09f 100644
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h"
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc
index bf603dc4bbcb9..a7e0f3a583441 100644
--- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h"
+
 #include <string>
 
-#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
index fe42e8f96f851..86775e20aa73c 100644
--- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <vector>
 
 #include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc
index 82d642264c2c4..cad92e3153b12 100644
--- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc
index 003a39f37d4a6..662dfb0f9d4f9 100644
--- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
 #include <vector>
+
+#include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
@@ -52,43 +53,27 @@ void MainTest(const std::string& activation_type) {
   }
 }
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithTanh) {
-  MainTest("tanh")
-}
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithTanh){MainTest("tanh")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu) {
-  MainTest("relu")
-}
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu){MainTest("relu")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithLeakyRelu) {
-  MainTest("leaky_relu")
-}
+TEST(FuseSoftplusActivationOneDNNPass,
+     FuseSoftplusWithLeakyRelu){MainTest("leaky_relu")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSwish) {
-  MainTest("swish")
-}
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSwish){MainTest("swish")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithHardswish) {
-  MainTest("hardswish")
-}
+TEST(FuseSoftplusActivationOneDNNPass,
+     FuseSoftplusWithHardswish){MainTest("hardswish")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSqrt) {
-  MainTest("sqrt")
-}
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSqrt){MainTest("sqrt")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithAbs) { MainTest("abs") }
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithAbs){MainTest("abs")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithClip) {
-  MainTest("clip")
-}
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithClip){MainTest("clip")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithGelu) {
-  MainTest("gelu")
-}
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithGelu){MainTest("gelu")}
 
-TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu6) {
-  MainTest("relu6")
-}
+TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu6){MainTest("relu6")}
 
 TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSigmoid) {
   MainTest("sigmoid")
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
index 06af5eaec13bc..b849076935afe 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/multi_batch_merge_pass.h"
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt
index fea12baf0651f..e97331bc87a45 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt
@@ -1,7 +1,17 @@
-cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper)
+cc_library(
+  modify_op_lock_and_record_event_pass
+  SRCS modify_op_lock_and_record_event_pass.cc
+  DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view
+       multi_devices_helper)
 
-cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
-cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
+cc_library(
+  multi_devices_graph_print_pass
+  SRCS multi_devices_graph_print_pass.cc
+  DEPS multi_devices_helper)
+cc_library(
+  multi_devices_graph_check_pass
+  SRCS multi_devices_graph_check_pass.cc
+  DEPS multi_devices_helper)
 
 set(ALL_REDUCE_OP_HANDLES all_reduce_op_handle)
 set(ALL_REDUCE_OP_HANDLES grad_merge_all_reduce_op_handle)
@@ -9,13 +19,46 @@ if(WITH_GPU AND WITH_DGC)
   list(APPEND ALL_REDUCE_OP_HANDLES sparse_all_reduce_op_handle)
 endif()
 
-cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${ALL_REDUCE_OP_HANDLES} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
-cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
-cc_library(set_reader_device_info_utils SRCS set_reader_device_info_utils.cc DEPS graph graph_helper pass multi_devices_graph_pass)
+cc_library(
+  multi_devices_graph_pass
+  SRCS multi_devices_graph_pass.cc
+  DEPS multi_devices_helper
+       computation_op_handle
+       scale_loss_grad_op_handle
+       rpc_op_handle
+       fetch_barrier_op_handle
+       ${ALL_REDUCE_OP_HANDLES}
+       reduce_op_handle
+       broadcast_op_handle
+       fused_broadcast_op_handle)
+cc_library(
+  sequential_execution_pass
+  SRCS sequential_execution_pass.cc
+  DEPS graph graph_helper pass)
+cc_library(
+  set_reader_device_info_utils
+  SRCS set_reader_device_info_utils.cc
+  DEPS graph graph_helper pass multi_devices_graph_pass)
 
-cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle grad_merge_all_reduce_op_handle)
-cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass)
-cc_library(backward_optimizer_op_deps_pass SRCS backward_optimizer_op_deps_pass.cc DEPS graph graph_helper pass)
-cc_library(add_reader_dependency_pass SRCS add_reader_dependency_pass.cc DEPS graph graph_helper pass)
-cc_library(fix_op_run_order_pass SRCS fix_op_run_order_pass.cc DEPS graph graph_helper multi_devices_helper pass op_handle_base eager_deletion_op_handle)
+cc_library(
+  fuse_all_reduce_op_pass
+  SRCS fuse_all_reduce_op_pass.cc
+  DEPS graph graph_helper fused_all_reduce_op_handle
+       grad_merge_all_reduce_op_handle)
+cc_library(
+  all_reduce_deps_pass
+  SRCS all_reduce_deps_pass.cc
+  DEPS all_reduce_op_handle graph graph_helper pass)
+cc_library(
+  backward_optimizer_op_deps_pass
+  SRCS backward_optimizer_op_deps_pass.cc
+  DEPS graph graph_helper pass)
+cc_library(
+  add_reader_dependency_pass
+  SRCS add_reader_dependency_pass.cc
+  DEPS graph graph_helper pass)
+cc_library(
+  fix_op_run_order_pass
+  SRCS fix_op_run_order_pass.cc
+  DEPS graph graph_helper multi_devices_helper pass op_handle_base
+       eager_deletion_op_handle)
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc
index abb1d062c96ef..b907869b4a38e 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <queue>
+
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc
index 772b4c1c915cc..55b6389768cb4 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
index 484d09fd4441d..5189f410e3c70 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <string>
+
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
index 1b6245928d377..7180c3820c71e 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
+
 #include <algorithm>
 #include <fstream>
 #include <memory>
@@ -20,6 +21,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
@@ -495,9 +497,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
                         "use_dgc=%d, use_grad_merge=%d",
                         is_encoded, is_grad_merge));
 
-  auto append_allreduce_op = [&](
-      const std::vector<Scope *> &scopes,
-      const std::vector<platform::Place> &places) -> details::OpHandleBase * {
+  auto append_allreduce_op = [&](const std::vector<Scope *> &scopes,
+                                 const std::vector<platform::Place> &places)
+      -> details::OpHandleBase * {
     if (is_encoded) {
 #if defined(PADDLE_WITH_DGC) && \
     (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
@@ -758,13 +760,14 @@ int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
           "and Parameter@Grad.",
           node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName()));
   int dev_id = GetVarDeviceID(param_grad[1]);
-  PADDLE_ENFORCE_NE(dev_id, -1, platform::errors::NotFound(
-                                    "Can not find Device ID, for NodeName:%s, "
-                                    "NodeType:%s, Param:%s, Param@Grad:%s"
-                                    "For this fault, you can consult the "
-                                    "Paddle technical personnel for answer ",
-                                    node->Name(), node->Op()->Type(),
-                                    param_grad[0], param_grad[1]));
+  PADDLE_ENFORCE_NE(
+      dev_id, -1,
+      platform::errors::NotFound("Can not find Device ID, for NodeName:%s, "
+                                 "NodeType:%s, Param:%s, Param@Grad:%s"
+                                 "For this fault, you can consult the "
+                                 "Paddle technical personnel for answer ",
+                                 node->Name(), node->Op()->Type(),
+                                 param_grad[0], param_grad[1]));
   return dev_id;
 }
 
@@ -956,10 +959,11 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
   bool insert_op = false;
   if (OpHaveRole(*node, OpRole::kRPC)) {
     int op_dev_id = CreateRPCOp(result, node);
-    PADDLE_ENFORCE_NE(op_dev_id, -1, platform::errors::InvalidArgument(
-                                         "Can not schedule the RPC operator to "
-                                         "the right place. NodeName:%s.",
-                                         node->Name()));
+    PADDLE_ENFORCE_NE(op_dev_id, -1,
+                      platform::errors::InvalidArgument(
+                          "Can not schedule the RPC operator to "
+                          "the right place. NodeName:%s.",
+                          node->Name()));
     if (node->Op()->Type() == "recv") {
       auto recv_vars_attr =
           BOOST_GET_CONST(std::vector<std::string>,
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
index c76f30016763a..7508074207768 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@@ -46,7 +46,7 @@ class NCCLContextMap;
 class BKCLContextMap;
 class BKCLCommunicator;
 #endif
-}
+}  // namespace platform
 
 namespace framework {
 class Scope;
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc
index 09ef94c0826d7..c7b6e477fd5aa 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 4a5947778056a..03d433f4db165 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -51,11 +51,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
 
   multihead_pattern();
   // Create New OpDesc
-  auto fuse_creater = [&](
-      Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
-      Node* mul1_out, Node* mul2_out, Node* eltadd0_b, Node* eltadd1_b,
-      Node* eltadd2_b, Node* eltadd_qk_b, Node* reshape2,
-      Node* reshape2_qkv_out, Node* scale, Node* scale_out) {
+  auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2,
+                          Node* mul0_out, Node* mul1_out, Node* mul2_out,
+                          Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b,
+                          Node* eltadd_qk_b, Node* reshape2,
+                          Node* reshape2_qkv_out, Node* scale,
+                          Node* scale_out) {
     auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale"));
     // auto scale_bias = BOOST_GET_CONST(float, scale->Op()->GetAttr("bias"));
     // bool after_scale =
@@ -756,13 +757,14 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
 
   multihead_pattern();
   // Create New OpDesc
-  auto fuse_creater = [&](
-      Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
-      Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w,
-      Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b,
-      Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out,
-      Node* softmax_qk, Node* eltadd0, Node* eltadd1, Node* eltadd2,
-      Node* matmul_qk, Node* reshape2_qkv) {
+  auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2,
+                          Node* mul0_out, Node* mul1_out, Node* mul2_out,
+                          Node* mul0_w, Node* mul1_w, Node* mul2_w,
+                          Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b,
+                          Node* eltadd_qk_b, Node* reshape2,
+                          Node* reshape2_qkv_out, Node* scale, Node* scale_out,
+                          Node* softmax_qk, Node* eltadd0, Node* eltadd1,
+                          Node* eltadd2, Node* matmul_qk, Node* reshape2_qkv) {
     auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale"));
 
     // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H)
@@ -1207,11 +1209,12 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph,
 
   multihead_pattern();
   // Create New OpDesc
-  auto fuse_creater = [&](
-      Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
-      Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w,
-      Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b,
-      Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) {
+  auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2,
+                          Node* mul0_out, Node* mul1_out, Node* mul2_out,
+                          Node* mul0_w, Node* mul1_w, Node* mul2_w,
+                          Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b,
+                          Node* eltadd_qk_b, Node* reshape2,
+                          Node* reshape2_qkv_out, Node* matmul_qk) {
     auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha"));
 
     // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H)
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
index b121436ee870b..858ebf68b40fa 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
@@ -9,8 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h"  // NOLINT
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h"  // NOLINT
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc
index 9c47df402bdf2..2d84162e13aa6 100644
--- a/paddle/fluid/framework/ir/node_test.cc
+++ b/paddle/fluid/framework/ir/node_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/node.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/var_desc.h"
 
diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc
index 73a8691f9e269..e309e068563e5 100644
--- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc
+++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"
+
 #include <memory>
 #include <mutex>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/op_def_api.h"
 #include "paddle/fluid/framework/op_info.h"
 
diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h
index e24294a03a28a..393a2fb9392d5 100644
--- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h
+++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <map>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/pass.h"
diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc
index 756d3c2c77096..4b106d75f1c75 100644
--- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc
+++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index 2c10a68188eb4..85eecbd014e96 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/pass.h"
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
@@ -90,9 +91,10 @@ static void MergePrograms(ProgramDesc *dst, const details::ProgramDescs &srcs,
   bool reverse = !append;
 
   auto create_var_visitor = [dst](const ProgramDesc &src) {
-    PADDLE_ENFORCE_EQ(src.Size(), 1, platform::errors::Unimplemented(
-                                         "MergePrograms can only support to "
-                                         "merge program with only one block."));
+    PADDLE_ENFORCE_EQ(
+        src.Size(), 1,
+        platform::errors::Unimplemented("MergePrograms can only support to "
+                                        "merge program with only one block."));
     const auto &src_block = src.Block(0);
     auto *dst_block = dst->MutableBlock(0);
     for (const auto *src_new_var : src_block.AllVars()) {
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
index 616ba7f1a9761..8c368a796ed10 100644
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -84,8 +84,9 @@ TEST(PassTest, TestPassAttrCheck) {
     } catch (paddle::platform::EnforceNotMet& e) {
       exception = std::string(e.what());
     }
-    std::string msg = "Invalid type for attritube test_pass_attr, expected: " +
-                      try_type + ", actual: int";
+    std::string msg =
+        "Invalid type for attritube test_pass_attr, expected: " + try_type +
+        ", actual: int";
     ASSERT_TRUE(exception.find(msg) != exception.npos);
   }
 
@@ -168,8 +169,9 @@ TEST(PassTest, TestPassAttrCheckConvertAllBlocks) {
     } catch (paddle::platform::EnforceNotMet& e) {
       exception = std::string(e.what());
     }
-    std::string msg = "Invalid type for attritube test_pass_attr, expected: " +
-                      try_type + ", actual: int";
+    std::string msg =
+        "Invalid type for attritube test_pass_attr, expected: " + try_type +
+        ", actual: int";
     ASSERT_TRUE(exception.find(msg) != exception.npos);
   }
 
diff --git a/paddle/fluid/framework/ir/pass_test_util.cc b/paddle/fluid/framework/ir/pass_test_util.cc
index 4d8965918f889..40dcb3cf1dbd8 100644
--- a/paddle/fluid/framework/ir/pass_test_util.cc
+++ b/paddle/fluid/framework/ir/pass_test_util.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/pass_test_util.h"
+
 #include <algorithm>
 #include <cstring>
 #include <exception>
@@ -23,7 +25,6 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/ir/pass_test_util.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h
index acefde9df6854..ad58e4e4a0cf4 100644
--- a/paddle/fluid/framework/ir/pass_tester_helper.h
+++ b/paddle/fluid/framework/ir/pass_tester_helper.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc
index 35ba920060779..fd1b54f8c4d37 100644
--- a/paddle/fluid/framework/ir/placement_pass_base.cc
+++ b/paddle/fluid/framework/ir/placement_pass_base.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/placement_pass_base.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc
index d6761d2e82ef3..929ffa2cadbef 100644
--- a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc
@@ -430,13 +430,15 @@ void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
   FusePassBase::Init(name_scope_, graph);
 
   bool enable_int8 = Get<bool>("enable_int8");
-  bool use_oss = Get<bool>("use_oss");
+  bool use_varseqlen = Get<bool>("use_varseqlen");
   bool with_interleaved = Get<bool>("with_interleaved");
   bool with_dynamic_shape = Get<bool>("with_dynamic_shape");
-  if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) {
+  if (!(enable_int8 && use_varseqlen && with_interleaved &&
+        with_dynamic_shape)) {
     VLOG(4) << "preln_embedding_eltwise_layernorm_fuse_pass need: use_trt, "
                "enable_int8, "
-               "use_oss, with_interleaved, with_dynamic_shape. Stop this pass, "
+               "use_varseqlen, with_interleaved, with_dynamic_shape. Stop this "
+               "pass, "
                "please reconfig.";
     return;
   }
diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc
index 978360d8f0a95..80e6c2b796798 100644
--- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc
@@ -43,8 +43,8 @@ struct PrelnSkipLayerNorm : public PatternBase {
   PATTERN_DECL_NODE(layer_norm);
   // declare variable node's name
   PATTERN_DECL_NODE(
-      elementwise_out);  // (elementwise_input_x,elementwise_input_y) ->
-                         // elementwise_out
+      elementwise_out);  // (elementwise_input_x,elementwise_input_y)
+                         // -> elementwise_out
   PATTERN_DECL_NODE(layer_norm_bias);
   PATTERN_DECL_NODE(layer_norm_scale);
   PATTERN_DECL_NODE(layer_norm_out);
@@ -109,12 +109,13 @@ void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
       graph, platform::errors::PreconditionNotMet("graph should not be null."));
   FusePassBase::Init("preln_skip_layernorm_fuse", graph);
   bool enable_int8 = Get<bool>("enable_int8");
-  bool use_oss = Get<bool>("use_oss");
+  bool use_varseqlen = Get<bool>("use_varseqlen");
   bool with_interleaved = Get<bool>("with_interleaved");
   bool with_dynamic_shape = Get<bool>("with_dynamic_shape");
-  if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) {
+  if (!(enable_int8 && use_varseqlen && with_interleaved &&
+        with_dynamic_shape)) {
     VLOG(4) << "preln_skip_layernorm_fuse_pass need: use_trt, enable_int8, "
-               "use_oss, "
+               "use_varseqlen, "
                "with_interleaved, with_dynamic_shape. Stop this pass, please "
                "reconfig. ";
     return;
diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
index 67dfe074dc075..ee9474f6fada0 100644
--- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
+++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
@@ -22,6 +22,19 @@ namespace paddle {
 namespace framework {
 namespace ir {
 namespace patterns {
+void EmbEltwiseLayernorm::operator()() {
+  // Create nodes for fused_embedding_eltwise_layernorm.
+  auto* emb_elt_layernorm_op =
+      pattern->NewNode(emb_elt_layernorm_op_repr())
+          ->assert_is_op("fused_embedding_eltwise_layernorm");
+  auto* emb_elt_layernorm_out =
+      pattern->NewNode(emb_elt_layernorm_out_repr())
+          ->assert_is_op_output("fused_embedding_eltwise_layernorm", "Out");
+
+  // Add links for fused_embedding_eltwise_layernorm op.
+  emb_elt_layernorm_op->LinksTo({emb_elt_layernorm_out});
+}
+
 void SkipLayernorm::operator()() {
   // Create nodes for skip_layernorm.
   auto* skip_layernorm_x = pattern->NewNode(skip_layernorm_x_repr())
@@ -59,16 +72,12 @@ void Fc::operator()() {
   auto* fc_input =
       pattern->NewNode(fc_input_repr())->assert_is_op_input("fc", "Input");
   auto* fc_op = pattern->NewNode(fc_op_repr())->assert_is_op("fc");
-  auto* fc_out =
-      pattern->NewNode(fc_out_repr())->assert_is_op_output("fc", "Out");
-
-  // Add links for fc op.
-  fc_op->LinksFrom({fc_input}).LinksTo({fc_out});
+  fc_op->LinksFrom({fc_input});
 }
 
 void Activation::operator()() {
   // Create nodes for activation.
-  std::unordered_set<std::string> activation_ops{"relu", "sigmoid", "tanh"};
+  std::unordered_set<std::string> activation_ops{"relu", "sigmoid", "gelu"};
   auto* activation_input = pattern->NewNode(activation_input_repr())
                                ->assert_is_ops_input(activation_ops);
   auto* activation_op =
@@ -82,6 +91,18 @@ void Activation::operator()() {
 }  // namespace patterns
 
 void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
+  bool use_varseqlen = Get<bool>("use_varseqlen");
+  std::string pos_id = Get<std::string>("tensorrt_transformer_posid");
+  std::string mask_id = Get<std::string>("tensorrt_transformer_maskid");
+
+  if (use_varseqlen && pos_id != "" && mask_id != "" &&
+      graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
+      graph->Has(framework::ir::kMultiheadMatmulPass)) {
+    VLOG(3) << "start varseqlen remove_padding_recover_padding_pass";
+  } else {
+    return;
+  }
+
   PADDLE_ENFORCE_NOT_NULL(
       graph, platform::errors::PreconditionNotMet("graph should not be null."));
   FusePassBase::Init(name_scope_, graph);
@@ -91,14 +112,14 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
   // Create an remove_padding op node
   auto insert_remove_padding_op = [&](Node* input_node, Node* op_node) {
     // create op, var in graph
-    OpDesc remove_padding;
+    OpDesc remove_padding(op_node->Op()->Block());
     std::string remove_padding_out_name =
         input_node->Name() + ".remove_padding";
-
-    VarDesc remove_padding_out(remove_padding_out_name);
-    remove_padding_out.SetDataType(input_node->Var()->GetDataType());
-    remove_padding_out.SetShape(input_node->Var()->GetShape());
-    remove_padding_out.SetPersistable(false);
+    auto* remove_padding_out =
+        op_node->Op()->Block()->Var(remove_padding_out_name);
+    remove_padding_out->SetDataType(input_node->Var()->GetDataType());
+    remove_padding_out->SetShape(input_node->Var()->GetShape());
+    remove_padding_out->SetPersistable(false);
 
     // remove_padding_op
     remove_padding.SetType("remove_padding");
@@ -110,7 +131,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
     remove_padding.SetOutput("Out", {remove_padding_out_name});
 
     auto remove_padding_op_node = graph->CreateOpNode(&remove_padding);
-    auto remove_padding_out_node = graph->CreateVarNode(&remove_padding_out);
+    auto remove_padding_out_node = graph->CreateVarNode(remove_padding_out);
 
     // replace link
     for (size_t i = 0; i < input_node->outputs.size(); ++i) {
@@ -145,13 +166,14 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
   // create an remove_padding op node
   auto insert_recover_padding_op = [&](Node* op_node, Node* out_node) {
     // create op, var in graph
-    OpDesc recover_padding;
+    OpDesc recover_padding(op_node->Op()->Block());
     std::string recover_padding_input_name =
         out_node->Name() + ".recover_padding";
-    VarDesc recover_padding_input(recover_padding_input_name);
-    recover_padding_input.SetDataType(out_node->Var()->GetDataType());
-    recover_padding_input.SetShape(out_node->Var()->GetShape());
-    recover_padding_input.SetPersistable(false);
+    auto* recover_padding_input =
+        op_node->Op()->Block()->Var(recover_padding_input_name);
+    recover_padding_input->SetDataType(out_node->Var()->GetDataType());
+    recover_padding_input->SetShape(out_node->Var()->GetShape());
+    recover_padding_input->SetPersistable(false);
 
     // recover_padding_op
     recover_padding.SetType("recover_padding");
@@ -164,7 +186,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
 
     auto recover_padding_op_node = graph->CreateOpNode(&recover_padding);
     auto recover_padding_input_node =
-        graph->CreateVarNode(&recover_padding_input);
+        graph->CreateVarNode(recover_padding_input);
 
     // replace link
     for (size_t i = 0; i < op_node->outputs.size(); ++i) {
@@ -195,39 +217,36 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
     op_node->Op()->RenameOutput(out_node->Name(), recover_padding_input_name);
   };
 
-  GraphPatternDetector gpd1;
-  patterns::SkipLayernorm skip_layernorm(gpd1.mutable_pattern(),
-                                         "remove_padding_recover_padding_pass");
-  skip_layernorm();
+  bool check_flag = true;
 
-  auto handler1 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+  GraphPatternDetector gpd0;
+  patterns::EmbEltwiseLayernorm fused_embedding_eltwise_layernorm(
+      gpd0.mutable_pattern(), "remove_padding_recover_padding_pass");
+  fused_embedding_eltwise_layernorm();
+
+  auto handler0 = [&](const GraphPatternDetector::subgraph_t& subgraph,
                       Graph* graph) {
     VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
-               "skip_layernorm";
+               "fused_embedding_eltwise_layernorm";
 
-    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_x, skip_layernorm_x,
-                              skip_layernorm);
-    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_y, skip_layernorm_y,
-                              skip_layernorm);
-    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_op, skip_layernorm_op,
-                              skip_layernorm);
-    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_out, skip_layernorm_out,
-                              skip_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(emb_elt_layernorm_op, emb_elt_layernorm_op,
+                              fused_embedding_eltwise_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(emb_elt_layernorm_out, emb_elt_layernorm_out,
+                              fused_embedding_eltwise_layernorm);
 
-    insert_remove_padding_op(skip_layernorm_x, skip_layernorm_op);
-    insert_remove_padding_op(skip_layernorm_y, skip_layernorm_op);
-    insert_recover_padding_op(skip_layernorm_op, skip_layernorm_out);
+    insert_recover_padding_op(emb_elt_layernorm_op, emb_elt_layernorm_out);
 
     found_subgraph_count++;
   };
-  gpd1(graph, handler1);
+  gpd0(graph, handler0);
 
-  GraphPatternDetector gpd2;
+  GraphPatternDetector gpd1;
   patterns::MultiheadMatmul multihead_matmul(
-      gpd2.mutable_pattern(), "remove_padding_recover_padding_pass");
+      gpd1.mutable_pattern(), "remove_padding_recover_padding_pass");
   multihead_matmul();
 
-  auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+  std::vector<int64_t> multihead_matmul_input_shape;
+  auto handler1 = [&](const GraphPatternDetector::subgraph_t& subgraph,
                       Graph* graph) {
     VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
                "multihead_matmul";
@@ -239,11 +258,57 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_out, multihead_matmul_out,
                               multihead_matmul);
 
+    multihead_matmul_input_shape = multihead_matmul_input->Var()->GetShape();
+
     insert_remove_padding_op(multihead_matmul_input, multihead_matmul_op);
     insert_recover_padding_op(multihead_matmul_op, multihead_matmul_out);
 
     found_subgraph_count++;
   };
+  gpd1(graph, handler1);
+
+  GraphPatternDetector gpd2;
+  patterns::SkipLayernorm skip_layernorm(gpd2.mutable_pattern(),
+                                         "remove_padding_recover_padding_pass");
+  skip_layernorm();
+
+  auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                      Graph* graph) {
+    VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
+               "skip_layernorm";
+
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_x, skip_layernorm_x,
+                              skip_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_y, skip_layernorm_y,
+                              skip_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_op, skip_layernorm_op,
+                              skip_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_out, skip_layernorm_out,
+                              skip_layernorm);
+
+    std::vector<int64_t> skip_layernorm_x_shape =
+        skip_layernorm_x->Var()->GetShape();
+    if (skip_layernorm_x_shape.size() != multihead_matmul_input_shape.size()) {
+      check_flag = false;
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
+    for (size_t i = 0; i < skip_layernorm_x_shape.size(); ++i) {
+      if (skip_layernorm_x_shape[i] != multihead_matmul_input_shape[i]) {
+        check_flag = false;
+      }
+    }
+    if (!check_flag) {
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
+    insert_remove_padding_op(skip_layernorm_x, skip_layernorm_op);
+    insert_remove_padding_op(skip_layernorm_y, skip_layernorm_op);
+    insert_recover_padding_op(skip_layernorm_op, skip_layernorm_out);
+    found_subgraph_count++;
+  };
   gpd2(graph, handler2);
 
   GraphPatternDetector gpd3;
@@ -257,11 +322,39 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
 
     GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, fc);
     GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fc);
 
-    insert_remove_padding_op(fc_input, fc_op);
-    insert_recover_padding_op(fc_op, fc_out);
+    std::vector<int64_t> fc_input_shape = fc_input->Var()->GetShape();
+    if ((fc_input_shape.size() != multihead_matmul_input_shape.size()) ||
+        (fc_input_shape.size() != 3)) {
+      check_flag = false;
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
+    if (fc_input_shape[0] != multihead_matmul_input_shape[0]) {
+      check_flag = false;
+    }
+    if (fc_input_shape[1] != multihead_matmul_input_shape[1]) {
+      check_flag = false;
+    }
+    if ((fc_input_shape[2] != multihead_matmul_input_shape[2]) &&
+        (fc_input_shape[2] != 4 * multihead_matmul_input_shape[2])) {
+      check_flag = false;
+    }
 
+    if (BOOST_GET_CONST(int, fc_op->Op()->GetAttr("in_num_col_dims")) != 2) {
+      check_flag = false;
+    }
+    if (!check_flag) {
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
+    fc_op->Op()->RemoveAttr("in_num_col_dims");
+    fc_op->Op()->SetAttr("in_num_col_dims", 1);
+
+    insert_remove_padding_op(fc_input, fc_op);
+    insert_recover_padding_op(fc_op, fc_op->outputs[0]);
     found_subgraph_count++;
   };
   gpd3(graph, handler3);
@@ -280,6 +373,31 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(activation_op, activation_op, activation);
     GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, activation);
 
+    std::vector<int64_t> activation_input_shape =
+        activation_input->Var()->GetShape();
+    if ((activation_input_shape.size() !=
+         multihead_matmul_input_shape.size()) ||
+        (activation_input_shape.size() != 3)) {
+      check_flag = false;
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
+    if (activation_input_shape[0] != multihead_matmul_input_shape[0]) {
+      check_flag = false;
+    }
+    if (activation_input_shape[1] != multihead_matmul_input_shape[1]) {
+      check_flag = false;
+    }
+    if ((activation_input_shape[2] != multihead_matmul_input_shape[2]) &&
+        (activation_input_shape[2] != 4 * multihead_matmul_input_shape[2])) {
+      check_flag = false;
+    }
+    if (!check_flag) {
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
     insert_remove_padding_op(activation_input, activation_op);
     insert_recover_padding_op(activation_op, activation_out);
 
diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h
index d7ccfc75c2000..7b8075644cb51 100644
--- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h
+++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h
@@ -32,6 +32,14 @@ namespace paddle {
 namespace framework {
 namespace ir {
 namespace patterns {
+struct EmbEltwiseLayernorm : public PatternBase {  // pattern: op + its output
+  EmbEltwiseLayernorm(PDPattern *pattern, const std::string &name_scope)
+      : PatternBase(pattern, name_scope, "emb_elt_layernorm") {}
+
+  void operator()();  // builds the pattern; defined out of line
+  PATTERN_DECL_NODE(emb_elt_layernorm_op);   // op node fetched by the pass
+  PATTERN_DECL_NODE(emb_elt_layernorm_out);  // presumably that op's output var
+};
 
 struct SkipLayernorm : public PatternBase {
   SkipLayernorm(PDPattern *pattern, const std::string &name_scope)
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
index a03a6f5b2c72c..a2dd846ba52d5 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h"
+
 #include <string>
 
 #include "paddle/fluid/framework/op_version_registry.h"
@@ -145,9 +146,9 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
     return x->outputs[fc_idx]->outputs[0];
   };
 
-  auto var_next_is_fc_act_repeated_n_times = [=](
-      Node* x, int repeated_times, const std::string& act_type = "relu",
-      bool check_in_has_only_one_out = true) -> bool {
+  auto var_next_is_fc_act_repeated_n_times =
+      [=](Node* x, int repeated_times, const std::string& act_type = "relu",
+          bool check_in_has_only_one_out = true) -> bool {
     for (int i = 0; i < repeated_times; ++i) {
       if (!var_next_is_fc_act(x, act_type,
                               i == 0 && check_in_has_only_one_out)) {
@@ -191,9 +192,9 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
     return nullptr;
   };
 
-  auto var_before_is_fc_act_repeated_n_times = [=](
-      Node* x, int repeated_times,
-      const std::string& act_type = "relu") -> bool {
+  auto var_before_is_fc_act_repeated_n_times = [=](Node* x, int repeated_times,
+                                                   const std::string& act_type =
+                                                       "relu") -> bool {
     for (int i = 0; i < repeated_times; ++i) {
       if (!var_before_is_fc_act(x, act_type, i == repeated_times - 1)) {
         return false;
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc
index f0ff77acf9ff8..3112b776ae5e6 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
index 778e658354f26..451e41e767dc4 100644
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
+
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
index 9fa951920f45a..2c0b142c98fbd 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
+
 #include <string>
 
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
index 2b084bd5734b9..052b0a4bdc1b8 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
@@ -44,8 +44,8 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
            is_concat_op_with_inputs(x->outputs[0], num_inputs);
   };
 
-  auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=](
-      Node* x, const std::string& type, int idx) -> bool {
+  auto is_seqpool_op_with_pootype_of_nth_input_of_concat =
+      [=](Node* x, const std::string& type, int idx) -> bool {
     bool this_is_seqpool_op =
         x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
         x->Op()->HasAttr("pooltype") &&
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
index d366803851842..e56ba9ad1e751 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
index 7200e0ac1d469..916adbbe33720 100644
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
@@ -44,11 +44,11 @@ static void GetConcatNodes(ir::Graph* graph, std::vector<Node*>* concat_nodes) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
   auto concat_op_node = BuildCVMConcatPattern(pattern);
-  GraphPatternDetector::handle_t handler = [&](
-      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-    Node* concat_op = subgraph.at(concat_op_node);
-    concat_nodes->push_back(concat_op);
-  };
+  GraphPatternDetector::handle_t handler =
+      [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+        Node* concat_op = subgraph.at(concat_op_node);
+        concat_nodes->push_back(concat_op);
+      };
   gpd(graph, handler);
 }
 }  // anonymous namespace
@@ -148,19 +148,19 @@ void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const {
     Node* cvm_input_of_cvm;
     Node* concat_out_var = concat_node->outputs[0];
 
-    GraphPatternDetector::handle_t handler = [&](
-        const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
-      Node* seqpool_in_var = subgraph.at(seqpool_in_var_node);
-      Node* seqpool_op = subgraph.at(seqpool_op_node);
-      Node* seqpool_out_var = subgraph.at(seqpool_out_var_node);
-      Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node);
-      Node* cvm_op = subgraph.at(cvm_op_node);
-      Node* cvm_out_var = subgraph.at(cvm_out_var_node);
-      cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node);
-      marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var,
-                           cvm_op, cvm_out_var, concat_node});
-      ins_to_concat[cvm_out_var->Name()] = seqpool_in_var;
-    };
+    GraphPatternDetector::handle_t handler =
+        [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+          Node* seqpool_in_var = subgraph.at(seqpool_in_var_node);
+          Node* seqpool_op = subgraph.at(seqpool_op_node);
+          Node* seqpool_out_var = subgraph.at(seqpool_out_var_node);
+          Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node);
+          Node* cvm_op = subgraph.at(cvm_op_node);
+          Node* cvm_out_var = subgraph.at(cvm_out_var_node);
+          cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node);
+          marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var,
+                               cvm_op, cvm_out_var, concat_node});
+          ins_to_concat[cvm_out_var->Name()] = seqpool_in_var;
+        };
     gpd(graph, handler);
 
     if (!ins_to_concat.empty()) {
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
index bba640cf148d1..8d8ebc955d39e 100644
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc
index 37e77bc134d3c..f177f60708773 100644
--- a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc
+++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc
@@ -21,129 +21,134 @@
 namespace paddle {
 namespace framework {
 namespace ir {
-SetTransformerInputConvertPass::SetTransformerInputConvertPass() {
-  AddOpCompat(OpCompat("elementwise_add"))
-      .AddInput("X")
-      .IsTensor()
-      .End()
-      .AddInput("Y")
-      .IsTensor()
-      .End()
-      .AddOutput("Out")
-      .IsTensor()
-      .End()
-      .AddAttr("axis")
-      .End();
-}
 namespace patterns {
 
-void SetTransformerInputConvert::operator()() {
+void SetTransformerInputConvert::operator()(const std::string &pos_id) {
   std::unordered_set<std::string> lookup_table_ops{"lookup_table",
                                                    "lookup_table_v2"};
-  // Create nodes for lookup_table1 op.
-  auto *lookup_table1_x = pattern->NewNode(lookup_table1_x_repr())
-                              ->assert_is_ops_input(lookup_table_ops, "Ids");
-  auto *lookup_table1_w = pattern->NewNode(lookup_table1_w_repr())
-                              ->assert_is_ops_input(lookup_table_ops, "W");
-  auto *lookup_table1_op =
-      pattern->NewNode(lookup_table1_repr())->assert_is_ops(lookup_table_ops);
-  auto *lookup_table1_out = pattern->NewNode(lookup_table1_out_repr())
-                                ->assert_is_ops_output(lookup_table_ops)
-                                ->AsIntermediate()
-                                ->assert_is_op_input("elementwise_add", "X");
-
-  // Create nodes for lookup_table2 op.
-  auto *lookup_table2_x = pattern->NewNode(lookup_table2_x_repr())
-                              ->assert_is_ops_input(lookup_table_ops, "Ids");
-  auto *lookup_table2_w = pattern->NewNode(lookup_table2_w_repr())
-                              ->assert_is_ops_input(lookup_table_ops, "W");
-  auto *lookup_table2_op =
-      pattern->NewNode(lookup_table2_repr())->assert_is_ops(lookup_table_ops);
-  auto *lookup_table2_out = pattern->NewNode(lookup_table2_out_repr())
-                                ->assert_is_ops_output(lookup_table_ops)
-                                ->AsIntermediate()
-                                ->assert_is_op_input("elementwise_add", "Y");
-
-  // Create nodes for elementwise_add op.
-  auto *elementwise_op =
-      pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add");
-  auto *elementwise_out = pattern->NewNode(elementwise_out_repr())
-                              ->AsOutput()
-                              ->assert_is_only_output_of_op("elementwise_add");
+  // Create nodes for lookup_table.
+  auto *lookup_table_id =
+      pattern->NewNode(lookup_table_id_repr())
+          ->assert_is_ops_input(lookup_table_ops, "Ids")
+          ->assert_more([&](Node *node) { return node->Name() == pos_id; });
+  auto *lookup_table_op =
+      pattern->NewNode(lookup_table_repr())->assert_is_ops(lookup_table_ops);
 
   // links nodes.
-  lookup_table1_op->LinksFrom({lookup_table1_x, lookup_table1_w})
-      .LinksTo({lookup_table1_out});
-  lookup_table2_op->LinksFrom({lookup_table2_x, lookup_table2_w})
-      .LinksTo({lookup_table2_out});
-  elementwise_op->LinksFrom({lookup_table1_out, lookup_table2_out})
-      .LinksTo({elementwise_out});
+  lookup_table_op->LinksFrom({lookup_table_id});
 }
 
+void MultiheadMatmulOP::operator()() {
+  // Op node: any op of type multihead_matmul.
+  auto *op_node = pattern->NewNode(multihead_matmul_repr());
+  op_node->assert_is_op("multihead_matmul");
+  // Var node: the variable produced on that op's "Out" slot.
+  auto *out_node = pattern->NewNode(multihead_matmul_out_repr());
+  out_node->assert_is_op_output("multihead_matmul", "Out");
+
+  // The output variable must be linked from the op.
+  out_node->LinksFrom({op_node});
+}
 }  // namespace patterns
 
 void SetTransformerInputConvertPass::ApplyImpl(ir::Graph *graph) const {
+  // Inserts a transformer_input_convert op fed by the `pos_id` variable and
+  // links its PosId / MaxSeqlen output nodes to every multihead_matmul op.
   PADDLE_ENFORCE_NOT_NULL(
       graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  bool with_dynamic_shape = Get<bool>("with_dynamic_shape");
+  std::string pos_id = Get<std::string>("tensorrt_transformer_posid");
+
+  if (!(graph->Has(framework::ir::kMultiheadMatmulPass) && with_dynamic_shape &&
+        (pos_id != ""))) {
+    VLOG(3) << "Transformer model need MultiheadMatmul, and "
+               "with_dynamic_shape. Stop this pass, "
+               "please reconfig.";
+    return;
+  }
   FusePassBase::Init(name_scope_, graph);
   int found_subgraph_count = 0;
-
-  GraphPatternDetector gpd;
+  // Set by handler0, read by handler1; stay null if nothing matches.
+  Node *transformer_input_convert_out0_node = nullptr;
+  Node *transformer_input_convert_out1_node = nullptr;
+  GraphPatternDetector gpd0;
   patterns::SetTransformerInputConvert fused_pattern(
-      gpd.mutable_pattern(), "transformer_input_convert_pass");
-  fused_pattern();
-
-  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
-                     Graph *graph) {
-    if (!IsCompat(subgraph, graph)) {
-      LOG(WARNING) << "transformer_input_convert_pass in op compat failed.";
-      return;
-    }
-
-    VLOG(3) << "transformer_input_convert_pass for pos_id, max_seqlen";
-
-    GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, fused_pattern);
+      gpd0.mutable_pattern(), "transformer_input_convert_pass");
+  fused_pattern(pos_id);
+  auto handler0 = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                      Graph *graph) {
+    VLOG(3)
+        << "transformer_input_convert_pass for pos_id, max_seqlen, mask_tensor";
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table_id, lookup_table_id, fused_pattern);
 
     // create op, var in graph
-    OpDesc new_desc;
+    OpDesc new_desc(lookup_table->Op()->Block());
     new_desc.SetType("transformer_input_convert");
 
     // inputs
-    new_desc.SetInput("X", {lookup_table2_x->Name()});
+    new_desc.SetInput("Input", {lookup_table_id->Name()});
 
     // outputs
-    std::vector<std::string> output_0 = {"pos_id_tensor"};
-    std::vector<std::string> output_1 = {"max_seqlen_tensor"};
-    new_desc.SetOutput("PosId", output_0);
-    new_desc.SetOutput("MaxSeqlen", output_1);
-
     std::string transformer_input_convert_out0_name = "pos_id_tensor";
     std::string transformer_input_convert_out1_name = "max_seqlen_tensor";
-    VarDesc transformer_input_convert_out0(transformer_input_convert_out0_name);
-    VarDesc transformer_input_convert_out1(transformer_input_convert_out1_name);
-    transformer_input_convert_out0.SetDataType(proto::VarType::INT32);
-    transformer_input_convert_out1.SetDataType(proto::VarType::INT32);
-    transformer_input_convert_out0.SetShape({-1});
-    transformer_input_convert_out1.SetShape({-1});
-    transformer_input_convert_out0.SetPersistable(false);
-    transformer_input_convert_out1.SetPersistable(false);
+    std::string transformer_input_convert_out2_name = "mask_tensor";
+    std::vector<std::string> output_0 = {transformer_input_convert_out0_name};
+    std::vector<std::string> output_1 = {transformer_input_convert_out1_name};
+    std::vector<std::string> output_2 = {transformer_input_convert_out2_name};
+    new_desc.SetOutput("PosId", output_0);
+    new_desc.SetOutput("MaxSeqlen", output_1);
+    new_desc.SetOutput("MaskTensor", output_2);
+
+    auto *transformer_input_convert_out0 =
+        lookup_table->Op()->Block()->Var(transformer_input_convert_out0_name);
+    auto *transformer_input_convert_out1 =
+        lookup_table->Op()->Block()->Var(transformer_input_convert_out1_name);
+    auto *transformer_input_convert_out2 =
+        lookup_table->Op()->Block()->Var(transformer_input_convert_out2_name);
+    transformer_input_convert_out0->SetDataType(proto::VarType::INT32);
+    transformer_input_convert_out1->SetDataType(proto::VarType::INT32);
+    transformer_input_convert_out2->SetDataType(proto::VarType::INT32);
+    transformer_input_convert_out0->SetShape({-1});
+    transformer_input_convert_out1->SetShape({-1});
+    transformer_input_convert_out2->SetShape({-1});
+    transformer_input_convert_out0->SetPersistable(false);
+    transformer_input_convert_out1->SetPersistable(false);
+    transformer_input_convert_out2->SetPersistable(false);
 
     auto new_op_node = graph->CreateOpNode(&new_desc);
-    auto transformer_input_convert_out0_node =
-        graph->CreateVarNode(&transformer_input_convert_out0);
+    transformer_input_convert_out0_node =
+        graph->CreateVarNode(transformer_input_convert_out0);
-    auto transformer_input_convert_out1_node =
-        graph->CreateVarNode(&transformer_input_convert_out1);
+    transformer_input_convert_out1_node =
+        graph->CreateVarNode(transformer_input_convert_out1);
+    auto transformer_input_convert_out2_node =
+        graph->CreateVarNode(transformer_input_convert_out2);
 
     // needn't create variable in scope
 
-    IR_NODE_LINK_TO(lookup_table2_x, new_op_node);
+    IR_NODE_LINK_TO(lookup_table_id, new_op_node);
     IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out0_node);
     IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out1_node);
-
+    IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out2_node);
     found_subgraph_count++;
   };
+  gpd0(graph, handler0);
+
+  GraphPatternDetector gpd1;
+  patterns::MultiheadMatmulOP multihead_matmul_pattern(
+      gpd1.mutable_pattern(), "transformer_input_convert_pass");
+  multihead_matmul_pattern();
+  auto handler1 = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                      Graph *graph) {
+    VLOG(3) << "link pos_id, max_seqlen to multihead_matmul.";
+    GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul, multihead_matmul,
+                              multihead_matmul_pattern);
+    IR_NODE_LINK_TO(transformer_input_convert_out0_node, multihead_matmul);
+    IR_NODE_LINK_TO(transformer_input_convert_out1_node, multihead_matmul);
+  };
+  // Only link when handler0 actually created the output nodes.
+  if (found_subgraph_count > 0) gpd1(graph, handler1);
 
-  gpd(graph, handler);
   AddStatis(found_subgraph_count);
 }
 
@@ -153,9 +158,3 @@ void SetTransformerInputConvertPass::ApplyImpl(ir::Graph *graph) const {
 
 REGISTER_PASS(set_transformer_input_convert_pass,
               paddle::framework::ir::SetTransformerInputConvertPass);
-REGISTER_PASS_CAPABILITY(set_transformer_input_convert_pass)
-    .AddCombination(
-        paddle::framework::compatible::OpVersionComparatorCombination()
-            .LE("lookup_table", 1)
-            .LE("lookup_table_v2", 1)
-            .LE("elementweise_add", 1));
diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h
index 5a5843e810f9a..01c9b1c854bd1 100644
--- a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h
+++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h
@@ -33,41 +33,36 @@ namespace framework {
 namespace ir {
 namespace patterns {
 
-//     in_var  emb       in_var   emb
-//       |      |          |       |
-//     lookup_table      lookup_table
-//           |                 |
-//        lkt_var           lkt_var
-//            \                /
-//             elementwise_add
-//                    |
-//               elt_out_var
+//     in_var  emb
+//       |      |
+//     lookup_table
+//           |
+//        lkt_var
+// (the pattern matches only in_var, named pos_id, and the lookup_table op)
 //
 struct SetTransformerInputConvert : public PatternBase {
   SetTransformerInputConvert(PDPattern *pattern, const std::string &name_scope)
-      : PatternBase(pattern, name_scope, "transformer_input_convert") {}
+      : PatternBase(pattern, name_scope, "transformer_input_convert_pass") {}
+  void operator()(const std::string &pos_id);  // pos_id: required Ids var name
+  // Op node: a lookup_table / lookup_table_v2 op.
+  PATTERN_DECL_NODE(lookup_table);
+  // Var node: that op's "Ids" input, whose name must equal pos_id.
+  PATTERN_DECL_NODE(lookup_table_id);
+};
 
+struct MultiheadMatmulOP : public PatternBase {  // multihead_matmul + its Out
+  MultiheadMatmulOP(PDPattern *pattern, const std::string &name_scope)
+      : PatternBase(pattern, name_scope, "transformer_input_convert_pass") {}
   void operator()();
-
   // declare operator node's name
-  PATTERN_DECL_NODE(lookup_table1);
-  PATTERN_DECL_NODE(lookup_table2);
-  PATTERN_DECL_NODE(elementwise);
-
-  // declare variable node's name
-  PATTERN_DECL_NODE(lookup_table1_x);
-  PATTERN_DECL_NODE(lookup_table1_w);
-  PATTERN_DECL_NODE(lookup_table1_out);
-  PATTERN_DECL_NODE(lookup_table2_x);
-  PATTERN_DECL_NODE(lookup_table2_w);
-  PATTERN_DECL_NODE(lookup_table2_out);
-  PATTERN_DECL_NODE(elementwise_out);
+  PATTERN_DECL_NODE(multihead_matmul);      // the multihead_matmul op node
+  PATTERN_DECL_NODE(multihead_matmul_out);  // var on that op's "Out" slot
 };
 }  // namespace patterns
 
 class SetTransformerInputConvertPass : public FusePassBase {
  public:
-  SetTransformerInputConvertPass();
+  SetTransformerInputConvertPass() {}
   virtual ~SetTransformerInputConvertPass() {}
 
  protected:
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
index bcd7bedcc43a6..9007105950b47 100644
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
+
 #include <string>
 
-#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc
index 80f387c442760..908797163d21c 100644
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc
+++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h"
-
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
index bfa14d9296b26..6bebe8de9f2e3 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -43,8 +43,8 @@ struct SkipLayerNorm : public PatternBase {
   PATTERN_DECL_NODE(layer_norm);
   // declare variable node's name
   PATTERN_DECL_NODE(
-      elementwise_out);  // (elementwise_input_x,elementwise_input_y) ->
-                         // elementwise_out
+      elementwise_out);  // (elementwise_input_x,elementwise_input_y)
+                         // -> elementwise_out
   PATTERN_DECL_NODE(layer_norm_bias);
   PATTERN_DECL_NODE(layer_norm_scale);
   PATTERN_DECL_NODE(layer_norm_out);
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
index 29be2c3cb09a7..c95fd0abd5294 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
index 7c43b02218213..a8c7150d6e3e0 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
@@ -170,8 +170,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto* matmul_xy_op = pattern->NewNode(
       [=](Node* x) {
-        return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
-                                  x->Op()->Type() == "matmul") &&
+        return x && x->IsOp() &&
+               (x->Op()->Type() == "matmul_v2" ||
+                x->Op()->Type() == "matmul") &&
                is_fusion_first_mul_out(x->outputs[0]);
       },
       name_scope + "/matmul_xy_op");
@@ -212,8 +213,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto* matmul_squared_x_y_op = pattern->NewNode(
       [=](Node* x) {
-        return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
-                                  x->Op()->Type() == "matmul") &&
+        return x && x->IsOp() &&
+               (x->Op()->Type() == "matmul_v2" ||
+                x->Op()->Type() == "matmul") &&
                is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
       },
       name_scope + "/matmul_squared_x_y_op");
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
index 94fb68506413c..78dafaa1e2f12 100644
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <string>
 
 #include "paddle/fluid/framework/ir/pass.h"
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index bda6b90386475..6802310383d37 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.cc
new file mode 100644
index 0000000000000..8f1fdb0b521dd
--- /dev/null
+++ b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.cc
@@ -0,0 +1,477 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h"
+
+#include <string>
+
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+
+// Builds a pattern var node that is input `arg` of a lookup_table(_v2) op;
+// when `is_persist` is set the node must additionally be a persistable var.
+static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name,
+                               const std::string& arg,
+                               bool is_persist = false) {
+  const std::unordered_set<std::string> lookup_ops{"lookup_table",
+                                                   "lookup_table_v2"};
+  PDNode* var = pattern->NewNode(name)->assert_is_ops_input(lookup_ops, arg);
+  return is_persist ? var->assert_is_persistable_var() : var;
+}
+// Builds an intermediate var node: a lookup_table(_v2) output that feeds
+// input `arg` of an elementwise_add op.
+static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name,
+                                   const std::string& arg) {
+  const std::unordered_set<std::string> lookup_ops{"lookup_table",
+                                                   "lookup_table_v2"};
+  PDNode* emb_out =
+      pattern->NewNode(name)->assert_is_only_output_of_ops(lookup_ops);
+  return emb_out->assert_is_op_input("elementwise_add", arg)->AsIntermediate();
+}
+// Start pattern: two lookup_table(_v2) ops, each fed ids by a "feed" op and a
+// persistable weight, whose outputs are the X and Y inputs of a single
+// elementwise_add.
+void TrtEmbedding2Eltwise1Pattern::operator()() {
+  // Inputs of the two embedding ops: ids plus persistable weights.
+  auto* ids_a = create_emb_vars(pattern, lookup_table1_x_repr(), "Ids");
+  auto* ids_b = create_emb_vars(pattern, lookup_table2_x_repr(), "Ids");
+  auto* weight_a = create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
+  auto* weight_b = create_emb_vars(pattern, lookup_table2_w_repr(), "W", true);
+
+  // Ops: the feeds producing the ids and the embedding lookups themselves.
+  const std::unordered_set<std::string> lookup_ops{"lookup_table",
+                                                   "lookup_table_v2"};
+  auto* feed_a = pattern->NewNode(feed1_repr())->assert_is_op("feed");
+  auto* feed_b = pattern->NewNode(feed2_repr())->assert_is_op("feed");
+  auto* lookup_a =
+      pattern->NewNode(lookup_table1_repr())->assert_is_ops(lookup_ops);
+  auto* lookup_b =
+      pattern->NewNode(lookup_table2_repr())->assert_is_ops(lookup_ops);
+
+  // Embedding outputs and the elementwise_add that sums them.
+  auto* emb_a = create_emb_out_vars(pattern, lookup_table1_out_repr(), "X");
+  auto* emb_b = create_emb_out_vars(pattern, lookup_table2_out_repr(), "Y");
+  auto* add_op =
+      pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add");
+  auto* add_out = pattern->NewNode(eltwise_add_out_repr())
+                      ->assert_is_op_output("elementwise_add");
+
+  // Wire the pattern edges.
+  feed_a->LinksTo({ids_a});
+  lookup_a->LinksFrom({ids_a, weight_a}).LinksTo({emb_a});
+  feed_b->LinksTo({ids_b});
+  lookup_b->LinksFrom({ids_b, weight_b}).LinksTo({emb_b});
+  add_op->LinksFrom({emb_a, emb_b}).LinksTo({add_out});
+}
+// Repeated inner pattern: one feed-driven lookup_table(_v2) whose output is
+// the Y input of an elementwise_add; the X input is itself the output of an
+// elementwise_add.
+void TrtEmbedding1Eltwise1Pattern::operator()() {
+  auto* ids = create_emb_vars(pattern, lookup_table1_x_repr(), "Ids");
+  auto* weight = create_emb_vars(pattern, lookup_table1_w_repr(), "W", true);
+  const std::unordered_set<std::string> lookup_ops{"lookup_table",
+                                                   "lookup_table_v2"};
+  auto* feed = pattern->NewNode(feed1_repr())->assert_is_op("feed");
+  auto* lookup =
+      pattern->NewNode(lookup_table1_repr())->assert_is_ops(lookup_ops);
+  auto* emb = create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y");
+  auto* add_op =
+      pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add");
+
+  // Running sum so far: produced by one elementwise_add, consumed as X here.
+  auto* add_in = pattern->NewNode(eltwise_add_in_repr())
+                     ->assert_is_op_input("elementwise_add", "X")
+                     ->assert_is_op_output("elementwise_add");
+  auto* add_out = pattern->NewNode(eltwise_add_out_repr())
+                      ->assert_is_op_output("elementwise_add");
+
+  lookup->LinksFrom({ids, weight}).LinksTo({emb});
+  feed->LinksTo({ids});
+  add_op->LinksFrom({emb, add_in}).LinksTo({add_out});
+}
+// End pattern: an elementwise_add whose output is the X input of a layer_norm
+// that also takes persistable Scale/Bias and emits Y, Mean and Variance.
+void TrtSkipLayerNorm::operator()() {
+  auto* add_op =
+      pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add");
+  auto* add_out = pattern->NewNode(eltwise_add_out_repr())
+                      ->assert_is_op_output("elementwise_add")
+                      ->assert_is_op_input("layer_norm", "X")
+                      ->AsIntermediate();
+  auto* ln_op = pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm");
+  auto* ln_out = pattern->NewNode(layer_norm_out_repr())
+                     ->assert_is_op_output("layer_norm", "Y")
+                     ->AsOutput();
+  auto* ln_bias = pattern->NewNode(layer_norm_bias_repr())
+                      ->AsInput()
+                      ->assert_is_persistable_var()
+                      ->assert_is_op_input("layer_norm", "Bias");
+  auto* ln_scale = pattern->NewNode(layer_norm_scale_repr())
+                       ->AsInput()
+                       ->assert_is_persistable_var()
+                       ->assert_is_op_input("layer_norm", "Scale");
+  auto* ln_mean = pattern->NewNode(layer_norm_mean_repr())
+                      ->AsOutput()
+                      ->assert_is_op_output("layer_norm", "Mean");
+  auto* ln_variance = pattern->NewNode(layer_norm_variance_repr())
+                          ->AsOutput()
+                          ->assert_is_op_output("layer_norm", "Variance");
+
+  add_op->LinksTo({add_out});
+  ln_op->LinksFrom({add_out, ln_bias, ln_scale})
+      .LinksTo({ln_out, ln_mean, ln_variance});
+}
+
+}  // namespace patterns
+
+// Fuses chains of the form
+//   lookup + lookup -> add [-> (lookup -> add)...] -> layer_norm
+// into fused_embedding_eltwise_layernorm ops and returns how many chains
+// were fused.
+int TrtEmbeddingEltwiseLayerNormFusePass::BuildFusion(
+    Graph* graph, const std::string& name_scope) const {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+  bool use_varseqlen = Get<bool>("use_varseqlen");
+  std::string pos_id = Get<std::string>("tensorrt_transformer_posid");
+  std::string mask_id = Get<std::string>("tensorrt_transformer_maskid");
+  std::vector<std::vector<std::pair<Node*, Node*>>> start_pattern_in_nodes;
+  std::vector<Node*> start_pattern_out_node;
+  std::vector<std::unordered_set<Node*>> start_pattern_remove_nodes;
+
+  // Stage 1: record every match of the start pattern (two lookups + add).
+  patterns::TrtEmbedding2Eltwise1Pattern start_pattern(pattern,
+                                                       name_scope + "/start");
+  start_pattern();
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_w, lookup_table2_w, start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table2, lookup_table2, start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out,
+                              start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_out, lookup_table2_out,
+                              start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, start_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, start_pattern);
+    if (!IsCompat(subgraph, graph)) {
+      LOG(WARNING) << "Pass(TrtEmbedding2Eltwise1Pattern) in op compat failed.";
+      return;
+    }
+    std::vector<std::pair<Node*, Node*>> ins;
+    ins.push_back(std::make_pair(lookup_table1_x, lookup_table1_w));
+    ins.push_back(std::make_pair(lookup_table2_x, lookup_table2_w));
+    start_pattern_in_nodes.push_back(ins);
+    start_pattern_out_node.push_back(eltwise_add_out);
+
+    start_pattern_remove_nodes.push_back(
+        {lookup_table1, lookup_table2, lookup_table1_out, lookup_table2_out,
+         eltwise_add, eltwise_add_out});
+  };
+  gpd(graph, handler);
+
+  std::vector<std::pair<Node*, Node*>> inner_pattern_ins;
+  std::vector<Node*> inner_pattern_tmp_in;
+  std::vector<Node*> inner_pattern_out;
+  std::vector<std::unordered_set<Node*>> inner_pattern_remove_nodes;
+
+  // Stage 2: record every match of the inner pattern (one lookup + add).
+  GraphPatternDetector gpd2;
+  auto* pattern2 = gpd2.mutable_pattern();
+  patterns::TrtEmbedding1Eltwise1Pattern second_pattern(pattern2,
+                                                        name_scope + "/second");
+  second_pattern();
+  auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                      Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, second_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, second_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, second_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_out, lookup_table1_out,
+                              second_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_in, eltwise_add_in, second_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, second_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, second_pattern);
+    if (!IsCompat(subgraph, graph)) {
+      LOG(WARNING) << "Pass(TrtEmbedding1Eltwise1Pattern) in op compat failed.";
+      return;
+    }
+    inner_pattern_ins.emplace_back(lookup_table1_x, lookup_table1_w);
+    inner_pattern_tmp_in.push_back(eltwise_add_in);
+    inner_pattern_out.push_back(eltwise_add_out);
+
+    inner_pattern_remove_nodes.push_back(
+        {lookup_table1, lookup_table1_out, eltwise_add, eltwise_add_out});
+  };
+  gpd2(graph, handler2);
+
+  // Stage 3: record every match of the end pattern (add -> layer_norm).
+  std::vector<Node*> end_pattern_elt_out;
+  std::vector<Node*> end_pattern_scales;
+  std::vector<Node*> end_pattern_biases;
+  std::vector<Node*> end_pattern_out;
+  std::vector<Node*> end_patter_layernorms;
+  std::vector<std::unordered_set<Node*>> end_pattern_remove_nodes;
+  GraphPatternDetector gpd3;
+  auto* pattern3 = gpd3.mutable_pattern();
+  patterns::TrtSkipLayerNorm skip_layernorm_pattern(pattern3,
+                                                    name_scope + "/third");
+  skip_layernorm_pattern();
+  auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                      Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, skip_layernorm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out,
+                              skip_layernorm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, skip_layernorm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out,
+                              skip_layernorm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias,
+                              skip_layernorm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale,
+                              skip_layernorm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean,
+                              skip_layernorm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance,
+                              skip_layernorm_pattern);
+    if (!IsCompat(subgraph, graph)) {
+      LOG(WARNING) << "Pass(TrtSkipLayerNorm) in op compat failed.";
+      return;
+    }
+    end_pattern_elt_out.push_back(eltwise_add_out);
+    end_pattern_remove_nodes.push_back(
+        {layer_norm, layer_norm_mean, layer_norm_variance});
+    end_pattern_biases.push_back(layer_norm_bias);
+    end_pattern_scales.push_back(layer_norm_scale);
+    end_pattern_out.push_back(layer_norm_out);
+    end_patter_layernorms.push_back(layer_norm);
+  };
+  gpd3(graph, handler3);
+
+  if (start_pattern_in_nodes.empty() || end_pattern_elt_out.empty()) {
+    return 0;
+  }
+  // Keep only start matches that connect, via inner matches, to an end match.
+  int fusion_count = 0;
+  // Each entry is (start index i, (end index k, inner indices js)).
+  std::vector<std::pair<size_t, std::pair<size_t, std::vector<size_t>>>>
+      fusion_ids;
+  for (size_t i = 0; i < start_pattern_in_nodes.size(); ++i) {
+    Node* tmp = start_pattern_out_node[i];
+    Node* old_tmp = nullptr;
+    // Follow the add chain; js lists the inner matches in chain order.
+    std::vector<size_t> js;
+    while (tmp != old_tmp) {
+      old_tmp = tmp;
+      for (size_t j = 0; j < inner_pattern_tmp_in.size(); ++j) {
+        if (inner_pattern_tmp_in[j] == tmp) {
+          tmp = inner_pattern_out[j];
+          js.push_back(j);
+          break;
+        }
+      }
+    }
+
+    for (size_t k = 0; k < end_pattern_elt_out.size(); ++k) {
+      if (tmp == end_pattern_elt_out[k]) {
+        fusion_ids.push_back(std::make_pair(i, std::make_pair(k, js)));
+        break;
+      }
+    }
+  }
+
+  for (size_t num = 0; num < fusion_ids.size(); ++num) {
+    const size_t i = fusion_ids[num].first;
+    const size_t k = fusion_ids[num].second.first;
+    const std::vector<size_t>& js = fusion_ids[num].second.second;
+
+    std::vector<std::string> ids;
+    std::vector<std::string> embs;
+    for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) {
+      ids.push_back(start_pattern_in_nodes[i][iter].first->Name());
+      embs.push_back(start_pattern_in_nodes[i][iter].second->Name());
+    }
+    for (size_t iter = 0; iter < js.size(); ++iter) {
+      ids.push_back(inner_pattern_ins[js[iter]].first->Name());
+      embs.push_back(inner_pattern_ins[js[iter]].second->Name());
+    }
+
+    OpDesc new_op_desc(end_patter_layernorms[k]->Op()->Block());
+    new_op_desc.SetType("fused_embedding_eltwise_layernorm");
+    new_op_desc.SetInput("Ids", ids);
+    new_op_desc.SetInput("Embs", embs);
+    new_op_desc.SetInput("WordId", {ids[0]});
+    if (use_varseqlen && pos_id != "" && mask_id != "") {
+      new_op_desc.SetInput("PosId", {pos_id});
+      new_op_desc.SetInput("MaskId", {mask_id});
+    } else {
+      new_op_desc.SetInput("PosId", {ids[1]});
+    }
+    if (ids.size() > 2) {
+      new_op_desc.SetInput("SentId", {ids[2]});
+    }
+
+    new_op_desc.SetInput("WordEmbedding", {embs[0]});
+    new_op_desc.SetInput("PosEmbedding", {embs[1]});
+    if (embs.size() > 2) {
+      new_op_desc.SetInput("SentEmbedding", {embs[2]});
+    }
+
+    new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()});
+    new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()});
+    new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()});
+    new_op_desc.SetAttr("epsilon",
+                        end_patter_layernorms[k]->Op()->GetAttr("epsilon"));
+
+    if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) {
+      new_op_desc.SetAttr("enable_int8", true);
+      new_op_desc.SetAttr(
+          "out_threshold",
+          end_patter_layernorms[k]->Op()->GetAttr("out_threshold"));
+    }
+
+    auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc);
+
+    for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) {
+      IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].first,
+                      embedding_eltwise_layernorm);
+      IR_NODE_LINK_TO(start_pattern_in_nodes[i][iter].second,
+                      embedding_eltwise_layernorm);
+    }
+    for (size_t iter = 0; iter < js.size(); ++iter) {
+      IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].first,
+                      embedding_eltwise_layernorm);
+      IR_NODE_LINK_TO(inner_pattern_ins[js[iter]].second,
+                      embedding_eltwise_layernorm);
+    }
+    IR_NODE_LINK_TO(end_pattern_biases[k], embedding_eltwise_layernorm);
+    IR_NODE_LINK_TO(end_pattern_scales[k], embedding_eltwise_layernorm);
+    IR_NODE_LINK_TO(embedding_eltwise_layernorm, end_pattern_out[k]);
+
+    // Remove unneeded nodes.
+    std::unordered_set<const Node*> marked_nodes;
+    marked_nodes.insert(start_pattern_remove_nodes[i].begin(),
+                        start_pattern_remove_nodes[i].end());
+    marked_nodes.insert(end_pattern_remove_nodes[k].begin(),
+                        end_pattern_remove_nodes[k].end());
+    for (size_t iter = 0; iter < js.size(); ++iter) {
+      marked_nodes.insert(inner_pattern_remove_nodes[js[iter]].begin(),
+                          inner_pattern_remove_nodes[js[iter]].end());
+    }
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  }
+
+  return fusion_count;
+}
+
+TrtEmbeddingEltwiseLayerNormFusePass::TrtEmbeddingEltwiseLayerNormFusePass() {
+  AddOpCompat(OpCompat("elementwise_add"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("axis")
+      .End();
+  // layer_norm: epsilon is checked against [0, 0.001], begin_norm_axis > 0.
+  AddOpCompat(OpCompat("layer_norm"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Scale")
+      .IsTensor()
+      .End()
+      .AddInput("Bias")
+      .IsTensor()
+      .End()
+      .AddOutput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Mean")
+      .IsTensor()
+      .End()
+      .AddOutput("Variance")
+      .IsTensor()
+      .End()
+      .AddAttr("epsilon")
+      .IsNumGE(0.0f)
+      .IsNumLE(0.001f)
+      .End()
+      .AddAttr("begin_norm_axis")
+      .IsNumGT(0)
+      .End();
+}
+
+// Runs the fusion when dynamic shape is enabled, then validates the varseqlen
+// configuration and tags the graph if anything was fused.
+void TrtEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
+  if (!Get<bool>("with_dynamic_shape")) {
+    VLOG(3) << "trt_embedding_eltwise_layernorm_fuse_pass need: use_varseqlen, "
+               "with_dynamic_shape. Stop this pass, "
+               "please reconfig.";
+    return;
+  }
+  FusePassBase::Init(name_scope_, graph);
+  const int fused = BuildFusion(graph, name_scope_);
+  if (fused > 0) {
+    const bool use_varseqlen = Get<bool>("use_varseqlen");
+    const auto pos_id = Get<std::string>("tensorrt_transformer_posid");
+    const auto mask_id = Get<std::string>("tensorrt_transformer_maskid");
+    // Accepted: varseqlen with pos_id and mask_id set, or all three unset.
+    const bool both_set = !pos_id.empty() && !mask_id.empty();
+    const bool both_unset = pos_id.empty() && mask_id.empty();
+    if (!((use_varseqlen && both_set) || (!use_varseqlen && both_unset))) {
+      PADDLE_THROW(
+          platform::errors::Fatal("Use transformer'varseqlen need config: "
+                                  "use_varseqlen, set pos_id, set "
+                                  "mask_id. Or not use varseqlen, do not set "
+                                  "pos_id, set mask_id. Please "
+                                  "reconfig"));
+    }
+    VLOG(3) << "start trt_embedding_eltwise_layernorm_fuse_pass";
+    graph->Set(kEmbEltwiseLayernormPass, new bool(true));
+  }
+  AddStatis(fused);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(trt_embedding_eltwise_layernorm_fuse_pass,
+              paddle::framework::ir::TrtEmbeddingEltwiseLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(trt_embedding_eltwise_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("lookup_table", 1)
+            .LE("lookup_table_v2", 1)
+            .LE("elementwise_add", 1));
diff --git a/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h
new file mode 100644
index 0000000000000..2d956a38aac3c
--- /dev/null
+++ b/paddle/fluid/framework/ir/trt_embedding_eltwise_layernorm_fuse_pass.h
@@ -0,0 +1,167 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+
+// detect start pattern.
+//
+//     in_var  emb       in_var   emb
+//       |      |          |       |
+//     lookup_table      lookup_table
+//           |                 |
+//        lkt_var           lkt_var
+//            \                /
+//             elementwise_add
+//                    |
+//               elt_out_var
+//
+struct TrtEmbedding2Eltwise1Pattern : public PatternBase {
+  TrtEmbedding2Eltwise1Pattern(PDPattern* pattern,
+                               const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "embedding2_eltwise1") {}
+
+  void operator()();
+  PATTERN_DECL_NODE(feed1);
+  PATTERN_DECL_NODE(feed2);
+  PATTERN_DECL_NODE(lookup_table1_x);
+  PATTERN_DECL_NODE(lookup_table2_x);
+  PATTERN_DECL_NODE(lookup_table1_w);
+  PATTERN_DECL_NODE(lookup_table2_w);
+  PATTERN_DECL_NODE(lookup_table1);
+  PATTERN_DECL_NODE(lookup_table2);
+  PATTERN_DECL_NODE(lookup_table1_out);
+  PATTERN_DECL_NODE(lookup_table2_out);
+  PATTERN_DECL_NODE(eltwise_add);
+  PATTERN_DECL_NODE(eltwise_add_out);
+};
+
+// detect repeated inner pattern
+//
+//    elt_out_var            in_var   emb
+//         \                   |       |
+//          \                 lookup_table
+//           \                     |
+//            \                 lkt_var
+//             \                   /
+//                elementwise_add
+//                      |
+//                 elt_out_var
+//
+struct TrtEmbedding1Eltwise1Pattern : public PatternBase {
+  TrtEmbedding1Eltwise1Pattern(PDPattern* pattern,
+                               const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "embedding1_eltwise1") {}
+  void operator()();
+  PATTERN_DECL_NODE(feed1);
+  PATTERN_DECL_NODE(lookup_table1_x);
+  PATTERN_DECL_NODE(lookup_table1_w);
+  PATTERN_DECL_NODE(lookup_table1);
+  PATTERN_DECL_NODE(lookup_table1_out);
+  PATTERN_DECL_NODE(eltwise_add_in);
+  PATTERN_DECL_NODE(eltwise_add);
+  PATTERN_DECL_NODE(eltwise_add_out);
+};
+
+// detect end pattern
+//
+//     elementwise_add
+//            |
+//       elt_out_var
+//  scale     |       bias
+//    \       |        /
+//       layer_norm
+//
+struct TrtSkipLayerNorm : public PatternBase {
+  TrtSkipLayerNorm(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "skip_layernorm") {}
+  void operator()();
+  PATTERN_DECL_NODE(eltwise_add);
+  PATTERN_DECL_NODE(eltwise_add_out);
+  PATTERN_DECL_NODE(layer_norm);
+  PATTERN_DECL_NODE(layer_norm_bias);
+  PATTERN_DECL_NODE(layer_norm_scale);
+  PATTERN_DECL_NODE(layer_norm_out);
+  // Delete the mean and var nodes in the graph.
+  PATTERN_DECL_NODE(layer_norm_mean);
+  PATTERN_DECL_NODE(layer_norm_variance);
+};
+}  // namespace patterns
+
+// The TrtEmbeddingEltwiseLayerNormFusePass detects the following pattern:
+//
+// inputs                           operator            output
+// --------------------------------------------------------------------
+// (word, weights_0)                lookup_table     ->  word_emb
+// (pos, weights_1)                 lookup_table     ->  pos_emb
+// (sent, weights_2)                lookup_table     ->  sent_emb
+// (word_emb, pos_emb)              elementwise_add  ->  elementwise_out_0
+// (elementwise_out_0, sent_emb)    elementwise_add  ->  elementwise_out_1
+// (elementwise_out_1, scale, bias) layer_norm       -> layer_norm_out
+//
+// and then convert the corresponding subgraph to:
+//
+// (word, pos, sent, weights_0, weights_1, weights_2,
+//       scale, bias)    embedding_eltwise_layernorm -> layer_norm_out
+//
+//
+//  in_var  emb_var   in_var   emb_var   in_var   emb_var      in_var   emb_var
+//    |        |        |         |        |         |           |         |
+//   lookup_table      lookup_table       lookup_table   ...    lookup_table
+//        |                 |                  |                     |
+//     lkt_var           lkt_var            lkt_var               lkt_var
+//        \                 /                  |         ...         |
+//          elementwise_add                    |                     |
+//                 \                          /                      |
+//                       elementwise_add                             |
+//                               |                                   |
+//                            elt_var                               /
+//                               \                                 /
+//                                         elementwise_add
+//                                                 |
+//                                            layer_norm
+
+class TrtEmbeddingEltwiseLayerNormFusePass : public FusePassBase {
+ public:
+  TrtEmbeddingEltwiseLayerNormFusePass();
+  virtual ~TrtEmbeddingEltwiseLayerNormFusePass() {}
+
+ protected:
+  // Checks with_dynamic_shape, runs BuildFusion and records the fusion count.
+  void ApplyImpl(Graph* graph) const override;
+  int BuildFusion(Graph* graph, const std::string& name_scope) const;
+  const std::string name_scope_{"trt_embedding_eltwise_layernorm_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc
index d3211c0841416..a6e3780fd22c9 100644
--- a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc
+++ b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc
@@ -16,9 +16,9 @@
 
 #include <cmath>
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
-
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc
new file mode 100644
index 0000000000000..2e3e957fd15f1
--- /dev/null
+++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc
@@ -0,0 +1,1549 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h"
+
+#include <string>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+
+static void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) {
+  if (op->IsOp() && op->Op()) {  // no-op unless op is an op node with Op()
+    new_var->inputs.push_back(op);  // added even if old_var is never found
+    for (size_t i = 0; i < op->outputs.size(); ++i) {
+      if (op->outputs[i] == old_var) {  // old_var's own edges are not edited
+        op->outputs[i] = new_var;  // redirect the graph edge to new_var
+        op->Op()->RenameOutput(old_var->Name(), new_var->Name());
+      }
+    }
+  }
+}
+
+static int BuildFusion(Graph* graph, const std::string& name_scope) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  // Create pattern.
+  TrtMultiHeadMatmulPattern multihead_pattern(pattern, name_scope);
+
+  multihead_pattern();
+  // Create the fused op desc; input0 and scale_out are accepted but unused.
+  auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2,
+                          Node* mul0_out, Node* mul1_out, Node* mul2_out,
+                          Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b,
+                          Node* eltadd_qk_b, Node* reshape2,
+                          Node* reshape2_qkv_out, Node* scale,
+                          Node* scale_out) {
+    auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale"));
+    // auto scale_bias = BOOST_GET_CONST(float, scale->Op()->GetAttr("bias"));
+    // bool after_scale =
+    //    BOOST_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale"));
+
+    // create multihead
+    OpDesc multihead_op_desc(mul0->Op()->Block());
+
+    // Create "K"/"Q"/"V"-prefixed copies of the three mul output var descs.
+    VarDesc k_var_desc(*mul1_out->Var());
+    k_var_desc.SetName("K" + mul1_out->Name());
+    auto* k_var_node = graph->CreateVarNode(&k_var_desc);
+
+    VarDesc q_var_desc(*mul0_out->Var());
+    q_var_desc.SetName("Q" + mul0_out->Name());
+    auto* q_var_node = graph->CreateVarNode(&q_var_desc);
+
+    VarDesc v_var_desc(*mul2_out->Var());
+    v_var_desc.SetName("V" + mul2_out->Name());
+    auto* v_var_node = graph->CreateVarNode(&v_var_desc);
+
+    auto reshape_desc = reshape2->Op();
+    int head_number =
+        BOOST_GET_CONST(std::vector<int>, reshape_desc->GetAttr("shape")).at(2);
+    // head_number is element 2 of the reshape2 "shape" attribute.
+    ReplaceOutputVar(mul0, mul0_out, q_var_node);
+    ReplaceOutputVar(mul1, mul1_out, k_var_node);
+    ReplaceOutputVar(mul2, mul2_out, v_var_node);
+
+    multihead_op_desc.SetType("multihead_matmul");
+    multihead_op_desc.SetInput("Q", {q_var_node->Name()});
+    multihead_op_desc.SetInput("K", {k_var_node->Name()});
+    multihead_op_desc.SetInput("V", {v_var_node->Name()});
+
+    multihead_op_desc.SetInput("BiasQ", {eltadd0_b->Name()});
+    multihead_op_desc.SetInput("BiasK", {eltadd1_b->Name()});
+    multihead_op_desc.SetInput("BiasV", {eltadd2_b->Name()});
+    multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()});
+
+    multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()});
+    multihead_op_desc.SetAttr("alpha", scale_attr);
+    multihead_op_desc.SetAttr("head_number", head_number);
+
+    auto* multihead = graph->CreateOpNode(&multihead_op_desc);
+    IR_NODE_LINK_TO(q_var_node, multihead);
+    IR_NODE_LINK_TO(k_var_node, multihead);
+    IR_NODE_LINK_TO(v_var_node, multihead);
+
+    IR_NODE_LINK_TO(eltadd0_b, multihead);
+    IR_NODE_LINK_TO(eltadd1_b, multihead);
+    IR_NODE_LINK_TO(eltadd2_b, multihead);
+    IR_NODE_LINK_TO(eltadd_qk_b, multihead);
+
+    IR_NODE_LINK_TO(multihead, reshape2_qkv_out);
+  };
+
+  int fusion_count{0};  // incremented once per handler invocation
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(scale, scale, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out,
+                              multihead_pattern);
+
+    // eltadd ops and their outputs are removed below; *_b bias vars are kept.
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
+                              multihead_pattern);
+
+    fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out,
+                 eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, reshape2_0,
+                 reshape2_qkv_out, scale, scale_out);
+
+    std::unordered_set<const Node*> marked_nodes(
+        {eltadd0,
+         eltadd1,
+         eltadd2,
+         eltadd0_out,
+         eltadd1_out,
+         eltadd2_out,
+         reshape2_0,
+         reshape2_1,
+         reshape2_2,
+         reshape2_0_out,
+         reshape2_1_out,
+         reshape2_2_out,
+         transpose2_0,
+         transpose2_1,
+         transpose2_2,
+         transpose2_0_out,
+         transpose2_1_out,
+         transpose2_2_out,
+         matmul_qk,
+         matmul_qk_out,
+         eltadd_qk,
+         eltadd_qk_out,
+         softmax_qk,
+         softmax_qk_out,  // dropout_qk, dropout_qk_out,
+         transpose2_qkv,
+         transpose2_qkv_out,
+         matmul_qkv,
+         matmul_qkv_out,
+         mul0_out,
+         mul1_out,
+         mul2_out,
+         reshape2_qkv,
+         scale});
+    // Remove unneeded nodes (uses the captured graph, not the g parameter).
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+PDNode* TrtMultiHeadMatmulPattern::operator()() {
+  auto* input0 = pattern->NewNode(input0_repr());
+  input0->assert_is_op_input("mul");
+
+  // First (Q) path: mul, elementwise_add, reshape2, transpose2, then scale.
+  auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("mul");
+  auto* mul0_w_var = pattern->NewNode(mul0_w_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("mul", "Y");
+  auto* mul0_out_var =
+      pattern->NewNode(mul0_out_repr())->assert_is_op_output("mul");
+
+  decltype(mul0) eltadd0;
+  decltype(mul0) eltadd0_b_var;
+  decltype(mul0) eltadd0_out_var;
+
+  mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+
+  eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add");
+  eltadd0_b_var = pattern->NewNode(eltadd0_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd0_out_var = pattern->NewNode(eltadd0_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_0 =
+      pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_0_out_var =
+      pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2");
+  reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_0 =
+      pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2");
+  auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_0_out_var->AsIntermediate()->assert_is_op_input("scale");
+
+  auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale");
+  auto* scale_out_var =
+      pattern->NewNode(scale_out_repr())->assert_is_op_output("scale");
+  scale_out_var->AsIntermediate()->assert_is_op_input("matmul");
+
+  auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul");
+  auto* matmul_qk_out_var =
+      pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul");
+  matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+
+  auto* eltadd_qk =
+      pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add");
+  auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr())
+                              ->AsInput()
+                              ->assert_is_op_input("elementwise_add", "Y");
+  auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr())
+                                ->assert_is_op_output("elementwise_add");
+  eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax");
+
+  auto* softmax_qk =
+      pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax");
+  auto* softmax_qk_out_var =
+      pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax");
+  softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matmul");
+
+  auto* matmul_qkv =
+      pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul");
+  auto* matmul_qkv_out_var =
+      pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul");
+  matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_qkv =
+      pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2");
+  auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr())
+                                     ->assert_is_op_output("transpose2");
+  transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_qkv =
+      pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2");
+  auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr())
+                                   ->assert_is_op_output("reshape2");
+  reshape2_qkv_out_var->assert_is_op_input("mul");
+
+  // Second (K) path; its transpose2 output feeds matmul_qk.
+  auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("mul");
+  auto* mul1_w_var = pattern->NewNode(mul1_w_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("mul", "Y");
+  auto* mul1_out_var =
+      pattern->NewNode(mul1_out_repr())->assert_is_op_output("mul");
+
+  decltype(mul1) eltadd1;
+  decltype(mul1) eltadd1_b_var;
+  decltype(mul1) eltadd1_out_var;
+
+  mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+  eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add");
+  eltadd1_b_var = pattern->NewNode(eltadd1_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd1_out_var = pattern->NewNode(eltadd1_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_1 =
+      pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_1_out_var =
+      pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2");
+  reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_1 =
+      pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2");
+  auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_1_out_var->AsIntermediate()->assert_is_op_input(
+      "matmul");  // link to matmul qk
+
+  // Third (V) path; its transpose2 output feeds matmul_qkv.
+  auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("mul");
+  auto* mul2_w_var = pattern->NewNode(mul2_w_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("mul", "Y");
+  auto* mul2_out_var =
+      pattern->NewNode(mul2_out_repr())->assert_is_op_output("mul");
+
+  decltype(mul2) eltadd2;
+  decltype(mul2) eltadd2_b_var;
+  decltype(mul2) eltadd2_out_var;
+
+  mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+  eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add");
+  eltadd2_b_var = pattern->NewNode(eltadd2_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd2_out_var = pattern->NewNode(eltadd2_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_2 =
+      pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_2_out_var =
+      pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2");
+  reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_2 =
+      pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2");
+  auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_2_out_var->AsIntermediate()->assert_is_op_input(
+      "matmul");  // link to matmul qkv
+
+  // Q path
+  mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var});
+  eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var});
+
+  reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var});
+  transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var});
+  scale->LinksFrom({transpose2_0_out_var}).LinksTo({scale_out_var});
+  // K path
+  mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var});
+  eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var});
+  reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var});
+  transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var});
+  // compute q*k
+  matmul_qk->LinksFrom({scale_out_var, transpose2_1_out_var})
+      .LinksTo({matmul_qk_out_var});
+  eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var})
+      .LinksTo({eltadd_qk_out_var});
+  softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var});
+  // V path
+  mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var});
+  eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var});
+  reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var});
+  transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var});
+  // compute q*k*v
+  matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var})
+      .LinksTo({matmul_qkv_out_var});
+  transpose2_qkv->LinksFrom({matmul_qkv_out_var})
+      .LinksTo({transpose2_qkv_out_var});
+  reshape2_qkv->LinksFrom({transpose2_qkv_out_var})
+      .LinksTo({reshape2_qkv_out_var});
+  // The node returned is the V-path transpose2 output.
+  return transpose2_2_out_var;
+}
+
+PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
+  std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
+  auto* input0 = pattern->NewNode(input0_repr());
+  input0->assert_is_ops_input(matmul_ops);
+
+  // First (Q) path; note that this pattern declares no scale node.
+  auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(matmul_ops);
+  auto* mul0_w_var = pattern->NewNode(mul0_w_repr())
+                         ->AsInput()
+                         ->assert_is_ops_input(matmul_ops, "Y");
+  auto* mul0_out_var =
+      pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops);
+
+  decltype(mul0) eltadd0;
+  decltype(mul0) eltadd0_b_var;
+  decltype(mul0) eltadd0_out_var;
+
+  mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+
+  eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add");
+  eltadd0_b_var = pattern->NewNode(eltadd0_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd0_out_var = pattern->NewNode(eltadd0_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_0 =
+      pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_0_out_var =
+      pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2");
+  reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_0 =
+      pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2");
+  auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops, "X");
+
+  auto* matmul_qk =
+      pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops);
+  auto* matmul_qk_out_var =
+      pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops);
+  matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+
+  auto* eltadd_qk =
+      pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add");
+  auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr())
+                              ->AsInput()
+                              ->assert_is_op_input("elementwise_add", "Y");
+  auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr())
+                                ->assert_is_op_output("elementwise_add");
+  eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax");
+
+  auto* softmax_qk =
+      pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax");
+  auto* softmax_qk_out_var =
+      pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax");
+  softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops);
+
+  auto* matmul_qkv =
+      pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops);
+  auto* matmul_qkv_out_var =
+      pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops);
+  matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_qkv =
+      pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2");
+  auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr())
+                                     ->assert_is_op_output("transpose2");
+  transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_qkv =
+      pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2");
+  auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr())
+                                   ->assert_is_op_output("reshape2");
+  reshape2_qkv_out_var->assert_is_ops_input(matmul_ops);
+  // Second (K) path; its transpose2 output is the "Y" input of matmul_qk.
+  auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(matmul_ops);
+  auto* mul1_w_var = pattern->NewNode(mul1_w_repr())
+                         ->AsInput()
+                         ->assert_is_ops_input(matmul_ops, "Y");
+  auto* mul1_out_var =
+      pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops);
+
+  decltype(mul1) eltadd1;
+  decltype(mul1) eltadd1_b_var;
+  decltype(mul1) eltadd1_out_var;
+
+  mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+  eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add");
+  eltadd1_b_var = pattern->NewNode(eltadd1_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd1_out_var = pattern->NewNode(eltadd1_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_1 =
+      pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_1_out_var =
+      pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2");
+  reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_1 =
+      pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2");
+  auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_1_out_var->AsIntermediate()->assert_is_ops_input(
+      matmul_ops, "Y");  // link to matmul qk
+
+  // Third (V) path; its transpose2 output feeds matmul_qkv.
+  auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(matmul_ops);
+  auto* mul2_w_var = pattern->NewNode(mul2_w_repr())
+                         ->AsInput()
+                         ->assert_is_ops_input(matmul_ops, "Y");
+  auto* mul2_out_var =
+      pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops);
+
+  decltype(mul2) eltadd2;
+  decltype(mul2) eltadd2_b_var;
+  decltype(mul2) eltadd2_out_var;
+
+  mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+  eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add");
+  eltadd2_b_var = pattern->NewNode(eltadd2_b_repr())
+                      ->AsInput()
+                      ->assert_is_op_input("elementwise_add", "Y");
+
+  eltadd2_out_var = pattern->NewNode(eltadd2_out_repr())
+                        ->assert_is_op_output("elementwise_add");
+  eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2");
+
+  auto* reshape2_2 =
+      pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2");
+
+  auto* reshape2_2_out_var =
+      pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2");
+  reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2");
+
+  auto* transpose2_2 =
+      pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2");
+  auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr())
+                                   ->assert_is_op_output("transpose2");
+  transpose2_2_out_var->AsIntermediate()->assert_is_ops_input(
+      matmul_ops);  // link to matmul qkv
+
+  // Q path
+  mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var});
+  eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var});
+
+  reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var});
+  transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var});
+  // K path
+  mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var});
+  eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var});
+  reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var});
+  transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var});
+  // compute q*k (the Q transpose2 output is linked directly, with no scale)
+  matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var})
+      .LinksTo({matmul_qk_out_var});
+  eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var})
+      .LinksTo({eltadd_qk_out_var});
+  softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var});
+  // V path
+  mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var});
+  eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var});
+  reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var});
+  transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var});
+  // compute q*k*v
+  matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var})
+      .LinksTo({matmul_qkv_out_var});
+  transpose2_qkv->LinksFrom({matmul_qkv_out_var})
+      .LinksTo({transpose2_qkv_out_var});
+  reshape2_qkv->LinksFrom({transpose2_qkv_out_var})
+      .LinksTo({reshape2_qkv_out_var});
+  // The node returned is the V-path transpose2 output.
+  return transpose2_2_out_var;
+}
+}  // namespace patterns
+
+void TrtMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
+  // Run patterns::BuildFusion and hand the resulting count to AddStatis.
+  int fusion_count = patterns::BuildFusion(graph, name_scope_);
+  AddStatis(fusion_count);
+}
+
+TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() {
+  AddOpCompat(OpCompat("mul"))
+      .AddInput("X")  // the shape should be (B, S, N*H)
+      .IsTensor()
+      .End()
+      .AddInput("Y")  // the shape should be (N*H, N*H)
+      .IsTensor()
+      .End()
+      .AddOutput("Out")  // the shape should be (B, S, N*H)
+      .IsTensor()
+      .End()
+      .AddAttr("x_num_col_dims")
+      .IsNumEQ(2)
+      .End()
+      .AddAttr("y_num_col_dims")
+      .IsNumEQ(1)
+      .End();
+
+  AddOpCompat(OpCompat("elementwise_add"))
+      .AddInput("X")
+      // in bias, shape is (B, S, N*H),
+      // in biasqk, shape is (B, H, S, S)
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      // in bias, shape is (N*H)
+      // in biasqk, shape is (B, H, S, S)
+      .IsTensor()
+      .End()
+      // in bias, shape is (B, S, N*H)
+      // in biasqk, shape is (B, H, S, S)
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      // in bias, it equals 2
+      // in biasqk, it equals -1 or 0
+      .AddAttr("axis")
+      .IsIntIn({2, -1, 0})
+      .End();
+
+  AddOpCompat(OpCompat("reshape2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Shape")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddInput("ShapeTensor")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddOutput("XShape")
+      .IsOptional()
+      .IsTensor()
+      .End()
+      .AddAttr("shape")  // -->(B, S, H, N)  <--(B, S, N*H)
+      .IsType<std::vector<int>>()
+      .End();
+
+  // -->: (B, S, H, N) -> (B, H, S, N)
+  // <--: (B, H, S, N) -> (B, S, H, N)
+  AddOpCompat(OpCompat("transpose2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddOutput("XShape")
+      .IsOptional()
+      .IsTensor()
+      .End()
+      .AddAttr("axis")  // {0, 2, 1, 3}
+      .IsType<std::vector<int>>()
+      .End();
+
+  AddOpCompat(OpCompat("scale"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("scale")
+      .IsType<float>()  // copied to the new op, so unconstrained.
+      .End()
+      .AddAttr("bias")
+      .IsNumEQ(0.f)
+      .End()
+      .AddAttr("bias_after_scale")  // bias is 0, so unconstrained.
+      .IsType<bool>()
+      .End();
+
+  // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S)
+  // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N)
+  AddOpCompat(OpCompat("matmul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("alpha")
+      .IsNumEQ(1.0f)
+      .End()
+      .AddAttr("transpose_X")
+      .IsBoolEQ(false)
+      .End()
+      .AddAttr("transpose_Y")  // QK(true) QKV(false)
+      .IsType<bool>()
+      .End();
+
+  AddOpCompat(OpCompat("softmax"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("axis")
+      .IsIntIn({-1, 3})  // shape is (B, H, S, S), so axis is -1 or 3
+      .End();
+}
+
+// Detects every subgraph matching patterns::TrtMultiHeadMatmulPattern in
+// `graph` and replaces it with a single `multihead_matmul` op.
+//
+// Params:
+//   graph      - graph that is rewritten in place.
+//   name_scope - name scope handed to the pattern.
+//   scope      - variable scope holding the fc weight/bias tensors. The Q
+//                weight/bias tensors are overwritten with the merged Q/K/V
+//                data and the K/V weight/bias variables are erased.
+//
+// Returns the number of subgraphs that were fused.
+int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
+                                                const std::string& name_scope,
+                                                Scope* scope) const {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  // Create pattern.
+  patterns::TrtMultiHeadMatmulPattern multihead_pattern(pattern, name_scope);
+
+  multihead_pattern();
+  // Create New OpDesc
+  // Builds the fused op for one matched subgraph: merges the three fc
+  // weights/biases, creates the `multihead_matmul` op node and links it to the
+  // nodes that survive the fusion.
+  auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2,
+                          Node* mul0_out, Node* mul1_out, Node* mul2_out,
+                          Node* mul0_w, Node* mul1_w, Node* mul2_w,
+                          Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b,
+                          Node* eltadd_qk_b, Node* reshape2,
+                          Node* reshape2_qkv_out, Node* scale, Node* scale_out,
+                          Node* softmax_qk, Node* eltadd0, Node* eltadd1,
+                          Node* eltadd2, Node* matmul_qk, Node* reshape2_qkv) {
+    // The scale op's factor is forwarded to the fused op as "alpha".
+    auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale"));
+
+    // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H)
+    // bias (B * S * 3 * N * H) + bias (3 * N * H)
+    // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H)
+    auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable<LoDTensor>();
+    auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable<LoDTensor>();
+    auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable<LoDTensor>();
+
+    auto* bq_tensor =
+        scope->FindVar(eltadd0_b->Name())->GetMutable<LoDTensor>();
+    auto* bk_tensor =
+        scope->FindVar(eltadd1_b->Name())->GetMutable<LoDTensor>();
+    auto* bv_tensor =
+        scope->FindVar(eltadd2_b->Name())->GetMutable<LoDTensor>();
+
+    auto* wq_data = wq_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* wk_data = wk_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* wv_data = wv_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bq_data = bq_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bk_data = bk_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bv_data = bv_tensor->mutable_data<float>(platform::CPUPlace());
+
+    // The combined shapes are derived from the Q tensors only.
+    // NOTE(review): this assumes the K and V tensors have the same dims as Q;
+    // nothing here verifies it.
+    auto combined_w_dims =
+        phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
+    auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]});
+
+    // reuse the mul0_w and eltadd_0_b nodes for the combined nodes.
+    auto* combined_w_desc = mul0_w->Var();
+    combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
+    combined_w_desc->SetPersistable(true);
+
+    auto* combined_bias_desc = eltadd0_b->Var();
+    combined_bias_desc->SetShape({3, bq_tensor->dims()[0]});
+    combined_bias_desc->SetPersistable(true);
+
+    // The merged weight is staged in a temporary tensor because wq_tensor is
+    // both one of the sources and the final destination.
+    framework::LoDTensor tmp_combined_w_tensor;
+    tmp_combined_w_tensor.Resize(combined_w_dims);
+    auto* tmp_combined_w_data =
+        tmp_combined_w_tensor.mutable_data<float>(platform::CPUPlace());
+
+    std::vector<float*> w_vec = {wq_data, wk_data, wv_data};
+    int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2];
+    // Combine the three fc weights together.
+    // Row i of the result is [Wq row i | Wk row i | Wv row i].
+    for (int i = 0; i < dims_h; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < dims_w; k++) {
+          int out_index = i * (3 * dims_w) + j * dims_w + k;
+          int in_index = i * dims_w + k;
+          tmp_combined_w_data[out_index] = w_vec[j][in_index];
+        }
+      }
+    }
+
+    // Resize the Q weight tensor to the combined shape and overwrite it with
+    // the merged data.
+    wq_tensor->Resize(combined_w_dims);
+    auto* new_combined_w_data =
+        wq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_w_data, tmp_combined_w_data,
+           sizeof(float) * wq_tensor->numel());
+
+    // The K/V weights now live inside the combined tensor.
+    scope->EraseVars({mul1_w->Name(), mul2_w->Name()});
+
+    // Same staging approach for the biases: the result is bq, bk, bv
+    // concatenated back to back.
+    framework::LoDTensor tmp_combined_bias_tensor;
+    tmp_combined_bias_tensor.Resize(combined_bias_dims);
+    auto* tmp_combined_bias_data =
+        tmp_combined_bias_tensor.mutable_data<float>(platform::CPUPlace());
+
+    size_t bias_size = bq_tensor->numel();
+    memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + bias_size, bk_data,
+           sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data,
+           sizeof(float) * bias_size);
+
+    bq_tensor->Resize(combined_bias_dims);
+    auto* new_combined_bias_data =
+        bq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_bias_data, tmp_combined_bias_data,
+           sizeof(float) * bq_tensor->numel());
+
+    scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()});
+
+    // The head count is read from index 2 of the reshape2 op's "shape" attr.
+    auto reshape_desc = reshape2->Op();
+    int head_number =
+        BOOST_GET_CONST(std::vector<int>, reshape_desc->GetAttr("shape")).at(2);
+
+    OpDesc multihead_op_desc(mul0->Op()->Block());
+    multihead_op_desc.SetType("multihead_matmul");
+
+    multihead_op_desc.SetInput("Input", {input0->Name()});
+    multihead_op_desc.SetInput("W", {mul0_w->Name()});
+    multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()});
+    multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()});
+
+    multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()});
+    multihead_op_desc.SetAttr("alpha", scale_attr);
+    multihead_op_desc.SetAttr("head_number", head_number);
+
+    auto* mul0_op_desc = mul0->Op();
+
+    // all mul op has same input.
+    // The presence check must be made on mul0's op desc: the fused op desc was
+    // just created above and cannot carry "Input_scale" yet.
+    if (mul0_op_desc->HasAttr("Input_scale")) {
+      multihead_op_desc.SetAttr("Input_scale",
+                                mul0_op_desc->GetAttr("Input_scale"));
+    }
+    auto* add0_op_desc = eltadd0->Op();
+    auto* add1_op_desc = eltadd1->Op();
+    auto* add2_op_desc = eltadd2->Op();
+    // The largest "out_threshold" of the three bias adds becomes
+    // "fc_out_threshold" on the fused op.
+    if (add0_op_desc->HasAttr("out_threshold")) {
+      auto out_scale0 =
+          BOOST_GET_CONST(float, add0_op_desc->GetAttr("out_threshold"));
+      auto out_scale1 =
+          BOOST_GET_CONST(float, add1_op_desc->GetAttr("out_threshold"));
+      auto out_scale2 =
+          BOOST_GET_CONST(float, add2_op_desc->GetAttr("out_threshold"));
+      auto out_scale_max = std::max(out_scale0, out_scale1);
+      out_scale_max = std::max(out_scale_max, out_scale2);
+      multihead_op_desc.SetAttr("fc_out_threshold", out_scale_max);
+    }
+
+    auto* softmax_qk_op_desc = softmax_qk->Op();
+    auto* matmul_qk_op_desc = matmul_qk->Op();
+    // When the QK matmul carries "Input_scale", flag the fused op with
+    // "qkv2context_plugin_int8" and forward the softmax "out_threshold" as
+    // "dp_probs" if present.
+    if (matmul_qk_op_desc->HasAttr("Input_scale")) {
+      multihead_op_desc.SetAttr("qkv2context_plugin_int8", true);
+      if (softmax_qk_op_desc->HasAttr("out_threshold")) {
+        auto qkv_plugin_scale = BOOST_GET_CONST(
+            float, softmax_qk_op_desc->GetAttr("out_threshold"));
+        multihead_op_desc.SetAttr("dp_probs", qkv_plugin_scale);
+      }
+    }
+    if (reshape2_qkv->Op()->HasAttr("out_threshold")) {
+      multihead_op_desc.SetAttr("out_threshold",
+                                reshape2_qkv->Op()->GetAttr("out_threshold"));
+    }
+    auto* multihead = graph->CreateOpNode(&multihead_op_desc);
+
+    // Wire the fused op to the nodes that are kept.
+    IR_NODE_LINK_TO(input0, multihead);
+    IR_NODE_LINK_TO(mul0_w, multihead);
+    IR_NODE_LINK_TO(eltadd0_b, multihead);
+    IR_NODE_LINK_TO(eltadd_qk_b, multihead);
+
+    IR_NODE_LINK_TO(multihead, reshape2_qkv_out);
+  };
+
+  int fusion_count{0};
+  // Called once per matched subgraph.
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    if (!IsCompat(subgraph, g)) {
+      LOG(WARNING)
+          << "Op compat check in trt_multihead_matmul_fuse_pass_v2 failed.";
+      return;
+    }
+    // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(scale, scale, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out,
+                              multihead_pattern);
+
+    // nodes need be removed
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
+                              multihead_pattern);
+
+    // If weights or biases in qkv's fc are shared by multiple multihead_matmul
+    // patterns, we do not support this kind of fusion, this pass will not take
+    // effect.
+    bool is_fc_params_shared =
+        mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 ||
+        mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 ||
+        eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1;
+    if (is_fc_params_shared) {
+      return;
+    }
+    fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
+                 mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
+                 reshape2_0, reshape2_qkv_out, scale, scale_out, softmax_qk,
+                 eltadd0, eltadd1, eltadd2, matmul_qk, reshape2_qkv);
+
+    // Everything in the matched subgraph except input0, mul0_w, eltadd0_b,
+    // eltadd_qk_b and reshape2_qkv_out (which the fused op is linked to).
+    // NOTE(review): scale_out is not listed here although scale is; confirm
+    // whether leaving that variable node in the graph is intended.
+    std::unordered_set<const Node*> marked_nodes({eltadd0,
+                                                  eltadd1,
+                                                  eltadd2,
+                                                  eltadd1_b,
+                                                  eltadd2_b,
+                                                  eltadd0_out,
+                                                  eltadd1_out,
+                                                  eltadd2_out,
+                                                  reshape2_0,
+                                                  reshape2_1,
+                                                  reshape2_2,
+                                                  reshape2_0_out,
+                                                  reshape2_1_out,
+                                                  reshape2_2_out,
+                                                  transpose2_0,
+                                                  transpose2_1,
+                                                  transpose2_2,
+                                                  transpose2_0_out,
+                                                  transpose2_1_out,
+                                                  transpose2_2_out,
+                                                  matmul_qk,
+                                                  matmul_qk_out,
+                                                  eltadd_qk,
+                                                  eltadd_qk_out,
+                                                  softmax_qk,
+                                                  softmax_qk_out,
+                                                  transpose2_qkv,
+                                                  transpose2_qkv_out,
+                                                  matmul_qkv,
+                                                  matmul_qkv_out,
+                                                  mul0,
+                                                  mul1,
+                                                  mul2,
+                                                  mul0_out,
+                                                  mul1_out,
+                                                  mul2_out,
+                                                  mul1_w,
+                                                  mul2_w,
+                                                  reshape2_qkv,
+                                                  scale});
+    // Remove unneeded nodes.
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+// Runs the V2 multihead-matmul fusion on `graph`.
+//
+// When at least one subgraph was fused, the varseqlen configuration is
+// validated (a Fatal error is thrown if it is inconsistent) and the graph is
+// marked with kMultiheadMatmulPass. The fusion count is reported through
+// AddStatis in every case.
+void TrtMultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
+  auto* scope = param_scope();
+  PADDLE_ENFORCE_NOT_NULL(
+      scope,
+      platform::errors::Fatal(
+          "During the multiheadMatmul pass, The scope should not be null."));
+
+  int fusion_count = BuildFusionV2(graph, name_scope_, scope);
+  if (fusion_count > 0) {
+    bool use_varseqlen = Get<bool>("use_varseqlen");
+    std::string pos_id = Get<std::string>("tensorrt_transformer_posid");
+    std::string mask_id = Get<std::string>("tensorrt_transformer_maskid");
+
+    // Only two configurations are accepted: varseqlen on with both ids set,
+    // or varseqlen off with both ids empty.
+    if (use_varseqlen && !pos_id.empty() && !mask_id.empty()) {
+      if (graph->Has(framework::ir::kEmbEltwiseLayernormPass)) {
+        VLOG(3) << "start varseqlen trt_multihead_matmul_fuse_pass_v2";
+      } else {
+        PADDLE_THROW(platform::errors::Fatal(
+            "Use transformer'varseqlen need "
+            "embedding_eltwise_layernorm_fuse_pass. please use no_varseqlen"));
+      }
+    } else if (!use_varseqlen && pos_id.empty() && mask_id.empty()) {
+      VLOG(3) << "start no_varseqlen trt_multihead_matmul_fuse_pass_v2";
+    } else {
+      // The message spells out that BOTH ids must be unset without varseqlen,
+      // matching the condition checked above.
+      PADDLE_THROW(
+          platform::errors::Fatal("Use transformer'varseqlen need config: "
+                                  "use_varseqlen, set pos_id, set "
+                                  "mask_id. Or not use varseqlen, do not set "
+                                  "pos_id, do not set mask_id. Please "
+                                  "reconfig"));
+    }
+    // Only mark the graph once. NOTE(review): presumably Graph::Set rejects an
+    // attribute that already exists (e.g. set by another multihead fuse
+    // pass) - confirm against Graph::Set.
+    if (!graph->Has(kMultiheadMatmulPass)) {
+      graph->Set(kMultiheadMatmulPass, new bool(true));
+    }
+  }
+  AddStatis(fusion_count);
+}
+
+// Registers the op-compat constraints (expected inputs, outputs and attribute
+// values) for the op types this pass deals with: mul, elementwise_add,
+// reshape2, transpose2, matmul, matmul_v2 and softmax.
+// NOTE(review): BuildFusionV3's handler in this file does not call IsCompat,
+// so it is not visible here where these constraints are consulted - confirm.
+TrtMultiHeadMatmulV3FusePass::TrtMultiHeadMatmulV3FusePass() {
+  AddOpCompat(OpCompat("mul"))
+      .AddInput("X")  // the shape should be (B, S, N*H)
+      .IsTensor()
+      .End()
+      .AddInput("Y")  // the shape should be (N*H, N*H)
+      .IsTensor()
+      .End()
+      .AddOutput("Out")  // the shape should be (B, S, N*H)
+      .IsTensor()
+      .End()
+      .AddAttr("x_num_col_dims")
+      .IsNumEQ(2)
+      .End()
+      .AddAttr("y_num_col_dims")
+      .IsNumEQ(1)
+      .End();
+
+  // elementwise_add is used both for the fc bias and for the QK bias, so the
+  // constraints below have to cover both uses.
+  AddOpCompat(OpCompat("elementwise_add"))
+      .AddInput("X")
+      // in bias, shape is (B, S, N*H),
+      // in biasqk, shape is (B, H, S, S)
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      // in bias, shape is (N*H)
+      // in biasqk, shape is (B, H, S, S)
+      .IsTensor()
+      .End()
+      // in bias, shape is (B, S, N*H)
+      // in biasqk, shape is (B, H, S, S)
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      // in bias, it is equal to 2
+      // in biasqk, it is equal to -1 or 0
+      .AddAttr("axis")
+      .IsIntIn({2, -1, 0})
+      .End();
+
+  // Shape / ShapeTensor inputs and the XShape output are optional.
+  AddOpCompat(OpCompat("reshape2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Shape")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddInput("ShapeTensor")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddOutput("XShape")
+      .IsOptional()
+      .IsTensor()
+      .End()
+      .AddAttr("shape")  // -->(B, S, H, N)  <--(B, S, N*H)
+      .IsType<std::vector<int>>()
+      .End();
+
+  // -->: (B, S, H, N) -> (B, H, S, N)
+  // <--: (B, H, S, N) -> (B, S, H, N)
+  AddOpCompat(OpCompat("transpose2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddOutput("XShape")
+      .IsOptional()
+      .IsTensor()
+      .End()
+      .AddAttr("axis")  // {0, 2, 1, 3}
+      .IsType<std::vector<int>>()
+      .End();
+
+  // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S)
+  // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N)
+  AddOpCompat(OpCompat("matmul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("alpha")
+      .IsType<float>()  // QK(anyvalue, will copy to new op) QKV(1.0)
+      .End()
+      .AddAttr("transpose_X")
+      .IsBoolEQ(false)
+      .End()
+      .AddAttr("transpose_Y")  // QK(true) QKV(false)
+      .IsType<bool>()
+      .End();
+
+  // Same constraints as matmul, using matmul_v2's attribute names
+  // (trans_x / trans_y); no alpha attribute is declared for it.
+  AddOpCompat(OpCompat("matmul_v2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("trans_x")
+      .IsBoolEQ(false)
+      .End()
+      .AddAttr("trans_y")  // QK(true) QKV(false)
+      .IsType<bool>()
+      .End();
+
+  AddOpCompat(OpCompat("softmax"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("axis")
+      .IsIntIn({-1, 3})  // shape is (B, H, S, S), so axis is -1 or 3
+      .End();
+}
+
+// Detects every subgraph matching patterns::TrtMultiHeadMatmulV3Pattern in
+// `graph` and replaces it with a single `multihead_matmul` op.
+//
+// Params:
+//   graph      - graph that is rewritten in place.
+//   name_scope - name scope handed to the pattern.
+//   scope      - variable scope holding the fc weight/bias tensors. The Q
+//                weight/bias tensors are overwritten with the merged Q/K/V
+//                data and the K/V weight/bias variables are erased.
+//
+// Returns the number of subgraphs that were fused.
+int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph,
+                                                const std::string& name_scope,
+                                                Scope* scope) const {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  // Create pattern.
+  patterns::TrtMultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope);
+
+  multihead_pattern();
+  // Create New OpDesc
+  // Builds the fused op for one matched subgraph: merges the three fc
+  // weights/biases, creates the `multihead_matmul` op node and links it to the
+  // nodes that survive the fusion.
+  auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2,
+                          Node* mul0_out, Node* mul1_out, Node* mul2_out,
+                          Node* mul0_w, Node* mul1_w, Node* mul2_w,
+                          Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b,
+                          Node* eltadd_qk_b, Node* reshape2,
+                          Node* reshape2_qkv_out, Node* matmul_qk) {
+    // The QK matmul's "alpha" is forwarded to the fused op as "alpha".
+    auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha"));
+
+    // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H)
+    // bias (B * S * 3 * N * H) + bias (3 * N * H)
+    // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H)
+    auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable<LoDTensor>();
+    auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable<LoDTensor>();
+    auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable<LoDTensor>();
+
+    auto* bq_tensor =
+        scope->FindVar(eltadd0_b->Name())->GetMutable<LoDTensor>();
+    auto* bk_tensor =
+        scope->FindVar(eltadd1_b->Name())->GetMutable<LoDTensor>();
+    auto* bv_tensor =
+        scope->FindVar(eltadd2_b->Name())->GetMutable<LoDTensor>();
+
+    auto* wq_data = wq_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* wk_data = wk_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* wv_data = wv_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bq_data = bq_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bk_data = bk_tensor->mutable_data<float>(platform::CPUPlace());
+    auto* bv_data = bv_tensor->mutable_data<float>(platform::CPUPlace());
+
+    // The combined shapes are derived from the Q tensors only.
+    // NOTE(review): this assumes the K and V tensors have the same dims as Q;
+    // nothing here verifies it.
+    auto combined_w_dims =
+        phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
+    auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]});
+
+    // reuse the mul0_w and eltadd_0_b nodes for the combined nodes.
+    auto* combined_w_desc = mul0_w->Var();
+    combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
+    combined_w_desc->SetPersistable(true);
+
+    auto* combined_bias_desc = eltadd0_b->Var();
+    combined_bias_desc->SetShape({3, bq_tensor->dims()[0]});
+    combined_bias_desc->SetPersistable(true);
+
+    // The merged weight is staged in a temporary tensor because wq_tensor is
+    // both one of the sources and the final destination.
+    framework::LoDTensor tmp_combined_w_tensor;
+    tmp_combined_w_tensor.Resize(combined_w_dims);
+    auto* tmp_combined_w_data =
+        tmp_combined_w_tensor.mutable_data<float>(platform::CPUPlace());
+
+    std::vector<float*> w_vec = {wq_data, wk_data, wv_data};
+    int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2];
+    // Combine the three fc weights together.
+    // Row i of the result is [Wq row i | Wk row i | Wv row i].
+    for (int i = 0; i < dims_h; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < dims_w; k++) {
+          int out_index = i * (3 * dims_w) + j * dims_w + k;
+          int in_index = i * dims_w + k;
+          tmp_combined_w_data[out_index] = w_vec[j][in_index];
+        }
+      }
+    }
+
+    // Resize the Q weight tensor to the combined shape and overwrite it with
+    // the merged data.
+    wq_tensor->Resize(combined_w_dims);
+    auto* new_combined_w_data =
+        wq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_w_data, tmp_combined_w_data,
+           sizeof(float) * wq_tensor->numel());
+
+    // The K/V weights now live inside the combined tensor.
+    scope->EraseVars({mul1_w->Name(), mul2_w->Name()});
+
+    // Same staging approach for the biases: the result is bq, bk, bv
+    // concatenated back to back.
+    framework::LoDTensor tmp_combined_bias_tensor;
+    tmp_combined_bias_tensor.Resize(combined_bias_dims);
+    auto* tmp_combined_bias_data =
+        tmp_combined_bias_tensor.mutable_data<float>(platform::CPUPlace());
+
+    size_t bias_size = bq_tensor->numel();
+    memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + bias_size, bk_data,
+           sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data,
+           sizeof(float) * bias_size);
+
+    bq_tensor->Resize(combined_bias_dims);
+    auto* new_combined_bias_data =
+        bq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_bias_data, tmp_combined_bias_data,
+           sizeof(float) * bq_tensor->numel());
+
+    scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()});
+
+    // The head count is read from index 2 of the reshape2 op's "shape" attr.
+    auto reshape_desc = reshape2->Op();
+    int head_number =
+        BOOST_GET_CONST(std::vector<int>, reshape_desc->GetAttr("shape")).at(2);
+
+    OpDesc multihead_op_desc(mul0->Op()->Block());
+    multihead_op_desc.SetType("multihead_matmul");
+
+    multihead_op_desc.SetInput("Input", {input0->Name()});
+    multihead_op_desc.SetInput("W", {mul0_w->Name()});
+    multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()});
+    multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()});
+
+    multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()});
+    multihead_op_desc.SetAttr("alpha", scale_attr);
+    multihead_op_desc.SetAttr("head_number", head_number);
+
+    auto* multihead = graph->CreateOpNode(&multihead_op_desc);
+
+    // Wire the fused op to the nodes that are kept.
+    IR_NODE_LINK_TO(input0, multihead);
+    IR_NODE_LINK_TO(mul0_w, multihead);
+    IR_NODE_LINK_TO(eltadd0_b, multihead);
+    IR_NODE_LINK_TO(eltadd_qk_b, multihead);
+
+    IR_NODE_LINK_TO(multihead, reshape2_qkv_out);
+  };
+
+  int fusion_count{0};
+  // Called once per matched subgraph.
+  // NOTE(review): unlike the handler in BuildFusionV2, no IsCompat(subgraph, g)
+  // check is made here - confirm whether that is intentional.
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out,
+                              multihead_pattern);
+
+    // nodes need be removed
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out,
+                              multihead_pattern);
+
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv,
+                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
+                              multihead_pattern);
+
+    // If weights or biases in qkv's fc are shared by multiple multihead_matmul
+    // patterns, we do not support this kind of fusion, this pass will not take
+    // effect.
+    bool is_fc_params_shared =
+        mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 ||
+        mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 ||
+        eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1;
+    if (is_fc_params_shared) {
+      return;
+    }
+    fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
+                 mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
+                 reshape2_0, reshape2_qkv_out, matmul_qk);
+
+    // Everything in the matched subgraph except input0, mul0_w, eltadd0_b,
+    // eltadd_qk_b and reshape2_qkv_out (which the fused op is linked to).
+    std::unordered_set<const Node*> marked_nodes({eltadd0,
+                                                  eltadd1,
+                                                  eltadd2,
+                                                  eltadd1_b,
+                                                  eltadd2_b,
+                                                  eltadd0_out,
+                                                  eltadd1_out,
+                                                  eltadd2_out,
+                                                  reshape2_0,
+                                                  reshape2_1,
+                                                  reshape2_2,
+                                                  reshape2_0_out,
+                                                  reshape2_1_out,
+                                                  reshape2_2_out,
+                                                  transpose2_0,
+                                                  transpose2_1,
+                                                  transpose2_2,
+                                                  transpose2_0_out,
+                                                  transpose2_1_out,
+                                                  transpose2_2_out,
+                                                  matmul_qk,
+                                                  matmul_qk_out,
+                                                  eltadd_qk,
+                                                  eltadd_qk_out,
+                                                  softmax_qk,
+                                                  softmax_qk_out,
+                                                  transpose2_qkv,
+                                                  transpose2_qkv_out,
+                                                  matmul_qkv,
+                                                  matmul_qkv_out,
+                                                  mul0,
+                                                  mul1,
+                                                  mul2,
+                                                  mul0_out,
+                                                  mul1_out,
+                                                  mul2_out,
+                                                  mul1_w,
+                                                  mul2_w,
+                                                  reshape2_qkv});
+    // Remove unneeded nodes.
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+// Runs the V3 multihead-matmul fusion on `graph`.
+//
+// When at least one subgraph was fused, the varseqlen configuration is
+// validated (a Fatal error is thrown if it is inconsistent) and the graph is
+// marked with kMultiheadMatmulPass. The fusion count is reported through
+// AddStatis in every case.
+void TrtMultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
+  auto* scope = param_scope();
+  PADDLE_ENFORCE_NOT_NULL(
+      scope,
+      platform::errors::Fatal(
+          "During the multiheadMatmul pass, The scope should not be null."));
+
+  int fusion_count = BuildFusionV3(graph, name_scope_, scope);
+  if (fusion_count > 0) {
+    bool use_varseqlen = Get<bool>("use_varseqlen");
+    std::string pos_id = Get<std::string>("tensorrt_transformer_posid");
+    std::string mask_id = Get<std::string>("tensorrt_transformer_maskid");
+
+    // Only two configurations are accepted: varseqlen on with both ids set,
+    // or varseqlen off with both ids empty.
+    if (use_varseqlen && !pos_id.empty() && !mask_id.empty()) {
+      if (graph->Has(framework::ir::kEmbEltwiseLayernormPass)) {
+        VLOG(3) << "start varseqlen trt_multihead_matmul_fuse_pass_v3";
+      } else {
+        PADDLE_THROW(platform::errors::Fatal(
+            "Use transformer'varseqlen need "
+            "embedding_eltwise_layernorm_fuse_pass. please use no_varseqlen"));
+      }
+    } else if (!use_varseqlen && pos_id.empty() && mask_id.empty()) {
+      VLOG(3) << "start no_varseqlen trt_multihead_matmul_fuse_pass_v3";
+    } else {
+      // The message spells out that BOTH ids must be unset without varseqlen,
+      // matching the condition checked above.
+      PADDLE_THROW(
+          platform::errors::Fatal("Use transformer'varseqlen need config: "
+                                  "use_varseqlen, set pos_id, set "
+                                  "mask_id. Or not use varseqlen, do not set "
+                                  "pos_id, do not set mask_id. Please "
+                                  "reconfig"));
+    }
+    // Only mark the graph once. NOTE(review): presumably Graph::Set rejects an
+    // attribute that already exists (e.g. set by another multihead fuse
+    // pass) - confirm against Graph::Set.
+    if (!graph->Has(kMultiheadMatmulPass)) {
+      graph->Set(kMultiheadMatmulPass, new bool(true));
+    }
+  }
+  AddStatis(fusion_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+// Register the three multihead-matmul fuse passes under their pass names.
+REGISTER_PASS(trt_multihead_matmul_fuse_pass,
+              paddle::framework::ir::TrtMultiHeadMatmulFusePass);
+
+REGISTER_PASS(trt_multihead_matmul_fuse_pass_v2,
+              paddle::framework::ir::TrtMultiHeadMatmulV2FusePass);
+REGISTER_PASS(trt_multihead_matmul_fuse_pass_v3,
+              paddle::framework::ir::TrtMultiHeadMatmulV3FusePass);
+// Op-version combinations declared for the v2 and v3 passes.
+// NOTE(review): EQ/LE presumably constrain the op version to exactly / at most
+// the given number - confirm against OpVersionComparatorCombination.
+// NOTE(review): no capability is registered here for
+// trt_multihead_matmul_fuse_pass - confirm that is intended.
+REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v2)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .LE("elementwise_add", 1)
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0)
+            .EQ("scale", 0)
+            .LE("matmul", 1)
+            .EQ("softmax", 0));
+REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v3)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("elementwise_add", 1)
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0)
+            .EQ("scale", 0)
+            .LE("matmul", 1)
+            .EQ("matmul_v2", 0)
+            .EQ("softmax", 0));
diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h
new file mode 100644
index 0000000000000..467e803b4974c
--- /dev/null
+++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.h
@@ -0,0 +1,179 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+// Pattern named "multihead_matmul"; declares the node names listed below.
+struct TrtMultiHeadMatmulPattern : public PatternBase {
+  TrtMultiHeadMatmulPattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "multihead_matmul") {}
+
+  PDNode* operator()();
+
+  // Node names; *_w, *_b and *_out look like variable nodes (TODO confirm).
+  PATTERN_DECL_NODE(input0);
+  PATTERN_DECL_NODE(mul0);
+  PATTERN_DECL_NODE(mul1);
+  PATTERN_DECL_NODE(mul2);
+  PATTERN_DECL_NODE(mul0_w);
+  PATTERN_DECL_NODE(mul1_w);
+  PATTERN_DECL_NODE(mul2_w);
+  PATTERN_DECL_NODE(mul0_out);
+  PATTERN_DECL_NODE(mul1_out);
+  PATTERN_DECL_NODE(mul2_out);
+  PATTERN_DECL_NODE(eltadd0);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd1);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd2);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd0_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd1_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd2_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd0_out);
+  PATTERN_DECL_NODE(eltadd1_out);
+  PATTERN_DECL_NODE(eltadd2_out);
+  PATTERN_DECL_NODE(reshape2_0);
+  PATTERN_DECL_NODE(reshape2_1);
+  PATTERN_DECL_NODE(reshape2_2);
+  PATTERN_DECL_NODE(reshape2_qkv);
+  PATTERN_DECL_NODE(reshape2_0_out);
+  PATTERN_DECL_NODE(reshape2_1_out);
+  PATTERN_DECL_NODE(reshape2_2_out);
+  PATTERN_DECL_NODE(reshape2_qkv_out);
+  PATTERN_DECL_NODE(transpose2_0);
+  PATTERN_DECL_NODE(transpose2_1);
+  PATTERN_DECL_NODE(transpose2_2);
+  PATTERN_DECL_NODE(transpose2_qkv);
+  PATTERN_DECL_NODE(transpose2_0_out);
+  PATTERN_DECL_NODE(transpose2_1_out);
+  PATTERN_DECL_NODE(transpose2_2_out);
+  PATTERN_DECL_NODE(transpose2_qkv_out);
+  PATTERN_DECL_NODE(scale);
+  PATTERN_DECL_NODE(scale_out);
+  PATTERN_DECL_NODE(matmul_qk);
+  PATTERN_DECL_NODE(matmul_qk_out);
+  PATTERN_DECL_NODE(eltadd_qk);
+  PATTERN_DECL_NODE(eltadd_qk_b);
+  PATTERN_DECL_NODE(eltadd_qk_out);
+  PATTERN_DECL_NODE(softmax_qk);
+  PATTERN_DECL_NODE(softmax_qk_out);
+
+  PATTERN_DECL_NODE(matmul_qkv);
+  PATTERN_DECL_NODE(matmul_qkv_out);
+};
+// Pattern named "multihead_matmul_v3"; declares no scale/scale_out nodes.
+struct TrtMultiHeadMatmulV3Pattern : public PatternBase {
+  TrtMultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "multihead_matmul_v3") {}
+
+  PDNode* operator()();
+
+  // Node names; *_w, *_b and *_out look like variable nodes (TODO confirm).
+  PATTERN_DECL_NODE(input0);
+  PATTERN_DECL_NODE(mul0);
+  PATTERN_DECL_NODE(mul1);
+  PATTERN_DECL_NODE(mul2);
+  PATTERN_DECL_NODE(mul0_w);
+  PATTERN_DECL_NODE(mul1_w);
+  PATTERN_DECL_NODE(mul2_w);
+  PATTERN_DECL_NODE(mul0_out);
+  PATTERN_DECL_NODE(mul1_out);
+  PATTERN_DECL_NODE(mul2_out);
+  PATTERN_DECL_NODE(eltadd0);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd1);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd2);    // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd0_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd1_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd2_b);  // ELEMENTWISE_ADD
+  PATTERN_DECL_NODE(eltadd0_out);
+  PATTERN_DECL_NODE(eltadd1_out);
+  PATTERN_DECL_NODE(eltadd2_out);
+  PATTERN_DECL_NODE(reshape2_0);
+  PATTERN_DECL_NODE(reshape2_1);
+  PATTERN_DECL_NODE(reshape2_2);
+  PATTERN_DECL_NODE(reshape2_qkv);
+  PATTERN_DECL_NODE(reshape2_0_out);
+  PATTERN_DECL_NODE(reshape2_1_out);
+  PATTERN_DECL_NODE(reshape2_2_out);
+  PATTERN_DECL_NODE(reshape2_qkv_out);
+  PATTERN_DECL_NODE(transpose2_0);
+  PATTERN_DECL_NODE(transpose2_1);
+  PATTERN_DECL_NODE(transpose2_2);
+  PATTERN_DECL_NODE(transpose2_qkv);
+  PATTERN_DECL_NODE(transpose2_0_out);
+  PATTERN_DECL_NODE(transpose2_1_out);
+  PATTERN_DECL_NODE(transpose2_2_out);
+  PATTERN_DECL_NODE(transpose2_qkv_out);
+  PATTERN_DECL_NODE(matmul_qk);
+  PATTERN_DECL_NODE(matmul_qk_out);
+  PATTERN_DECL_NODE(eltadd_qk);
+  PATTERN_DECL_NODE(eltadd_qk_b);
+  PATTERN_DECL_NODE(eltadd_qk_out);
+  PATTERN_DECL_NODE(softmax_qk);
+  PATTERN_DECL_NODE(softmax_qk_out);
+
+  PATTERN_DECL_NODE(matmul_qkv);
+  PATTERN_DECL_NODE(matmul_qkv_out);
+};
+
+}  // namespace patterns
+// Registered in the .cc file as trt_multihead_matmul_fuse_pass.
+class TrtMultiHeadMatmulFusePass : public FusePassBase {
+ public:
+  virtual ~TrtMultiHeadMatmulFusePass() {}
+
+ protected:
+  void ApplyImpl(Graph* graph) const override;  // pass entry point
+
+  const std::string name_scope_{"trt_multihead_matmul_fuse"};
+};
+// Registered in the .cc file as trt_multihead_matmul_fuse_pass_v2.
+class TrtMultiHeadMatmulV2FusePass : public FusePassBase {
+ public:
+  TrtMultiHeadMatmulV2FusePass();
+
+ protected:
+  void ApplyImpl(Graph* graph) const override;  // pass entry point
+
+  const std::string name_scope_{"trt_multihead_matmul_fuse_v2"};
+
+ private:
+  int BuildFusionV2(Graph* graph, const std::string& name_scope,
+                    Scope* scope) const;
+};
+// Registered in the .cc file as trt_multihead_matmul_fuse_pass_v3.
+class TrtMultiHeadMatmulV3FusePass : public FusePassBase {
+ public:
+  TrtMultiHeadMatmulV3FusePass();
+
+ protected:
+  void ApplyImpl(Graph* graph) const override;  // pass entry point
+
+  const std::string name_scope_{"trt_multihead_matmul_fuse_v3"};
+
+ private:
+  int BuildFusionV3(Graph* graph, const std::string& name_scope,
+                    Scope* scope) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
new file mode 100644
index 0000000000000..13883909435f7
--- /dev/null
+++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
@@ -0,0 +1,232 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h"
+
+#include <string>
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+// Pattern "skip_layernorm": an elementwise_add whose output feeds a layer_norm.
+struct TrtSkipLayerNorm : public PatternBase {
+  TrtSkipLayerNorm(PDPattern *pattern, const std::string &name_scope)
+      : PatternBase(pattern, name_scope, "skip_layernorm") {}
+  // Links x, y into the pattern; returns the layer_norm "Y" output node.
+  PDNode *operator()(PDNode *x, PDNode *y);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(elementwise);
+  PATTERN_DECL_NODE(layer_norm);
+  // declare variable node's name
+  PATTERN_DECL_NODE(
+      elementwise_out);  // (elementwise_input_x,elementwise_input_y)
+                         // -> elementwise_out
+  PATTERN_DECL_NODE(layer_norm_bias);
+  PATTERN_DECL_NODE(layer_norm_scale);
+  PATTERN_DECL_NODE(layer_norm_out);
+  PATTERN_DECL_NODE(layer_norm_mean);
+  PATTERN_DECL_NODE(layer_norm_variance);
+};
+// Builds elementwise_add(x, y) -> layer_norm; returns the layer_norm Y node.
+PDNode *TrtSkipLayerNorm::operator()(PDNode *x, PDNode *y) {
+  // Create nodes for elementwise add op.
+  x->assert_is_op_input("elementwise_add", "X");
+  y->assert_is_op_input("elementwise_add", "Y");
+  auto *elementwise =
+      pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add");
+  auto *elementwise_out_var =
+      pattern->NewNode(elementwise_out_repr())
+          ->AsOutput()
+          ->assert_is_only_output_of_op("elementwise_add");
+
+  // Add links for elementwise_add op.
+  elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var});
+
+  // layer_norm nodes; the add output is re-marked via AsIntermediate().
+  elementwise_out_var->AsIntermediate()->assert_is_op_input("layer_norm");
+  auto *layer_norm =
+      pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm");
+  auto *layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr())
+                                  ->AsInput()
+                                  ->assert_is_persistable_var()
+                                  ->assert_is_op_input("layer_norm", "Bias");
+  auto *layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr())
+                                   ->AsInput()
+                                   ->assert_is_persistable_var()
+                                   ->assert_is_op_input("layer_norm", "Scale");
+
+  auto *layer_norm_out_var = pattern->NewNode(layer_norm_out_repr())
+                                 ->AsOutput()
+                                 ->assert_is_op_output("layer_norm", "Y");
+  auto *layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr())
+                                  ->AsOutput()
+                                  ->assert_is_op_output("layer_norm", "Mean");
+  auto *layer_norm_variance_var =
+      pattern->NewNode(layer_norm_variance_repr())
+          ->AsOutput()
+          ->assert_is_op_output("layer_norm", "Variance");
+
+  // Add links for layer_norm op: three inputs in, all three outputs out.
+  layer_norm
+      ->LinksFrom(
+          {elementwise_out_var, layer_norm_bias_var, layer_norm_scale_var})
+      .LinksTo(
+          {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var});
+  return layer_norm_out_var;
+}
+
+}  // namespace patterns
+// Fuses each elementwise_add -> layer_norm match into one skip_layernorm op,
+// then validates the varseqlen settings if at least one match was fused.
+void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  FusePassBase::Init("skip_layernorm_fuse", graph);
+  int found_subgraph_count = 0;
+
+  GraphPatternDetector gpd;
+  auto *x = gpd.mutable_pattern()
+                ->NewNode("skip_layernorm_fuse/x")
+                ->AsInput()
+                ->assert_is_op_input("elementwise_add", "X")
+                ->assert_var_not_persistable();
+  auto *y = gpd.mutable_pattern()
+                ->NewNode("skip_layernorm_fuse/y")
+                ->AsInput()
+                ->assert_is_op_input("elementwise_add", "Y")
+                ->assert_var_not_persistable();
+  patterns::TrtSkipLayerNorm fused_pattern(gpd.mutable_pattern(),
+                                           "skip_layernorm_fuse");
+  fused_pattern(x, y);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *graph) {
+    if (subgraph.count(x) <= 0 || subgraph.count(y) <= 0) {
+      LOG(WARNING) << "The subgraph is empty.";
+      return;
+    }
+
+    if (!IsCompat(subgraph, graph)) {
+      LOG(WARNING) << "skip_layernorm pass in op compat failed.";
+      return;
+    }
+
+    VLOG(4) << "handle TrtSkipLayerNorm fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale,
+                              fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance,
+                              fused_pattern);
+
+    // Create the fused skip_layernorm op node
+    OpDesc new_desc(elementwise->Op()->Block());
+    new_desc.SetType("skip_layernorm");
+
+    // inputs
+    new_desc.SetInput("X", {subgraph.at(x)->Name()});
+    new_desc.SetInput("Y", {subgraph.at(y)->Name()});
+    new_desc.SetInput("Scale", {layer_norm_scale->Name()});
+    new_desc.SetInput("Bias", {layer_norm_bias->Name()});
+
+    if (layer_norm->Op()->HasAttr("out_threshold")) {
+      new_desc.SetAttr("enable_int8", true);
+      new_desc.SetAttr("out_threshold",
+                       layer_norm->Op()->GetAttr("out_threshold"));
+    }
+
+    // outputs (Mean/Variance are not kept; their nodes are removed below)
+    new_desc.SetOutput("Out", {layer_norm_out->Name()});
+
+    // attrs
+    new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon"));
+    new_desc.SetAttr("begin_norm_axis",
+                     layer_norm->Op()->GetAttr("begin_norm_axis"));
+
+    auto fused_node = graph->CreateOpNode(&new_desc);  // OpDesc will be copied.
+
+    std::unordered_set<const Node *> del_node_set;
+    del_node_set.insert(elementwise);
+    del_node_set.insert(layer_norm);
+    del_node_set.insert(elementwise_out);
+    del_node_set.insert(layer_norm_mean);
+    del_node_set.insert(layer_norm_variance);
+    GraphSafeRemoveNodes(graph, del_node_set);
+
+    IR_NODE_LINK_TO(subgraph.at(x), fused_node);
+    IR_NODE_LINK_TO(subgraph.at(y), fused_node);
+    IR_NODE_LINK_TO(layer_norm_scale, fused_node);
+    IR_NODE_LINK_TO(layer_norm_bias, fused_node);
+    IR_NODE_LINK_TO(fused_node, layer_norm_out);
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  if (found_subgraph_count > 0) {
+    bool use_varseqlen = Get<bool>("use_varseqlen");
+    std::string pos_id = Get<std::string>("tensorrt_transformer_posid");
+    std::string mask_id = Get<std::string>("tensorrt_transformer_maskid");
+    // Valid: all three varseqlen settings given, or none of them.
+    if (use_varseqlen && pos_id != "" && mask_id != "") {
+      if (graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
+          graph->Has(framework::ir::kMultiheadMatmulPass)) {
+        VLOG(3) << "start varseqlen trt_skip_layernorm_fuse_pass";
+      } else {
+        PADDLE_THROW(platform::errors::Fatal(
+            "Use transformer'varseqlen need "
+            "embedding_eltwise_layernorm_fuse_pass and "
+            "multihead_matmul_fuse_pass. please use no_varseqlen"));
+      }
+    } else if (!use_varseqlen && pos_id == "" && mask_id == "") {
+      VLOG(3) << "start no_varseqlen trt_skip_layernorm_fuse_pass";
+    } else {
+      PADDLE_THROW(
+          platform::errors::Fatal("Use transformer'varseqlen need config: "
+                                  "use_varseqlen, set pos_id, set "
+                                  "mask_id. Or not use varseqlen, do not set "
+                                  "pos_id or mask_id. Please "
+                                  "reconfig"));
+    }
+  }
+  AddStatis(found_subgraph_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+// Register the pass and its accepted op versions (LE/EQ constraints below).
+REGISTER_PASS(trt_skip_layernorm_fuse_pass,
+              paddle::framework::ir::TrtSkipLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(trt_skip_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("elementwise_add", 1)
+            .EQ("layer_norm", 0));
diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h
new file mode 100644
index 0000000000000..a299493efa0e9
--- /dev/null
+++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+//     |           |                            |            |
+// other_op1   other_op2                    other_op1    other_op2
+//     |           |              fuse           \          /
+//     |------elementwise_add      ->           skip_layernorm
+//                 |                                   |
+//             layer_norm                          other_op3
+//                 |                                   |
+//             other_op3
+//                 |
+class Graph;
+// Fuse pass for the rewrite drawn above; the constructor sets op constraints.
+class TrtSkipLayerNormFusePass : public FusePassBase {
+ public:
+  TrtSkipLayerNormFusePass() {  // registers compat rules for both matched ops
+    AddOpCompat(OpCompat("elementwise_add"))
+        .AddInput("X")
+        .IsTensor()
+        .End()
+        .AddInput("Y")
+        .IsTensor()
+        .End()
+        .AddOutput("Out")
+        .IsTensor()
+        .End()
+        .AddAttr("axis")
+        .IsIntIn({0, -1})
+        .End();
+
+    AddOpCompat(OpCompat("layer_norm"))
+        .AddInput("X")
+        .IsTensor()
+        .End()
+        .AddInput("Scale")
+        .IsTensor()
+        .End()
+        .AddInput("Bias")
+        .IsTensor()
+        .End()
+        .AddOutput("Y")
+        .IsTensor()
+        .End()
+        .AddOutput("Mean")
+        .IsTensor()
+        .End()
+        .AddOutput("Variance")
+        .IsTensor()
+        .End()
+        .AddAttr("epsilon")
+        .IsNumGE(0.0f)
+        .IsNumLE(0.001f)
+        .End()
+        .AddAttr("begin_norm_axis")
+        .IsNumGT(0)
+        .End();
+  }
+
+  virtual ~TrtSkipLayerNormFusePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;  // pass entry point
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc
index 067a37c611a73..3ebd61ff575e3 100644
--- a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h"
-
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc
index 20075a49749f7..19836b69ae9bf 100644
--- a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/yolo_box_fuse_pass.h"
+
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/pass.h"
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 1c5c12b3d57df..dd316a0979cc7 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <glog/logging.h>
+
 #include <memory>
 #include <string>
 #include <utility>
diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h
index 36a5c3c5d6013..7aa180ed75ce2 100644
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index a89baac3e7a10..254e70231ea4e 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/lod_tensor.h"
+
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/phi/core/lod_utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index dba3b3ff1e690..1c2740c2b2ee7 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/naive_executor.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/denormal.h"
diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc
index 2f3c3f3d06e32..763e314d226e6 100644
--- a/paddle/fluid/framework/naive_executor_test.cc
+++ b/paddle/fluid/framework/naive_executor_test.cc
@@ -13,8 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/naive_executor.h"
+
 #include <gtest/gtest.h>
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 
diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index 6046000739976..44d540769f2da 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -1,76 +1,136 @@
-set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog 
-lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
-graph_to_program_pass variable_helper timer monitor nan_inf_utils)
-
+set(INTERPRETERCORE_DEPS
+    op_registry
+    device_context
+    scope
+    framework_proto
+    data_feed_proto
+    heter_service_proto
+    trainer_desc_proto
+    glog
+    lod_rank_table
+    fs
+    shell
+    fleet_wrapper
+    heter_wrapper
+    ps_gpu_wrapper
+    box_wrapper
+    lodtensor_printer
+    feed_fetch_method
+    graph_to_program_pass
+    variable_helper
+    timer
+    monitor
+    nan_inf_utils)
 
 add_subdirectory(workqueue)
 add_subdirectory(garbage_collector)
 
-cc_library(data_transfer SRCS data_transfer.cc DEPS enforce scope glog)
-cc_library(new_executor_defs SRCS new_executor_defs.cc DEPS enforce glog scope)
-cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer)
-cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
-cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
+cc_library(
+  data_transfer
+  SRCS data_transfer.cc
+  DEPS enforce scope glog)
+cc_library(
+  new_executor_defs
+  SRCS new_executor_defs.cc
+  DEPS enforce glog scope)
+cc_library(
+  interpretercore_util
+  SRCS interpretercore_util.cc
+  DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer)
+cc_library(
+  event_manager
+  SRCS event_manager.cc
+  DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs)
+cc_library(
+  stream_analyzer
+  SRCS stream_analyzer.cc
+  DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs)
 
 if(WITH_GPU OR WITH_ROCM)
-cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_event_garbage_collector interpretercore_fast_garbage_collector stream_analyzer event_manager)
+  cc_library(
+    interpretercore
+    SRCS interpretercore.cc
+    DEPS workqueue
+         ${DEVICE_EVENT_LIBS}
+         interpretercore_util
+         interpretercore_event_garbage_collector
+         interpretercore_fast_garbage_collector
+         stream_analyzer
+         event_manager)
 else()
-cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_event_garbage_collector  stream_analyzer event_manager)
+  cc_library(
+    interpretercore
+    SRCS interpretercore.cc
+    DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util
+         interpretercore_event_garbage_collector stream_analyzer event_manager)
 endif()
 
-cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
+cc_library(
+  standalone_executor
+  SRCS standalone_executor.cc
+  DEPS interpretercore)
 
-cc_library(staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info)
+cc_library(
+  staticgraph_executor_statistics
+  SRCS executor_statistics.cc
+  DEPS enforce glog os_info)
 
 # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 # skip win32 since wget is not installed by default on windows machine.
-if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
-    add_custom_target(
-        download_program
-        COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program 
-        COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program 
-    )
-    
-    # all operators used in the program
-    set(OPS  
-        fill_constant_op
-        uniform_random_op
-        lookup_table_op
-        transpose_op
-        reshape_op
-        split_op
-        slice_op
-        concat_op
-        matmul_op
-        elementwise_add_op
-        elementwise_mul_op
-        softmax_with_cross_entropy_op
-        reduce_mean_op
-        reduce_sum_op
-        activation_op
-        sum_op
-        elementwise_max_op
-        elementwise_div_op
-        sgd_op
-        squared_l2_norm_op
-        memcpy_h2d_op
-        memcpy_d2h_op)
-    
-    # All deps of the operators above, part of GLOB_OPERATOR_DEPS.
-    set(OP_DEPS 
-        generator
-        softmax
-        selected_rows_functor
-        jit_kernel_helper
-        concat_and_split
-        cross_entropy)
+if(WITH_GPU
+   AND WITH_TESTING
+   AND NOT WIN32
+   AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+  add_custom_target(
+    download_program
+    COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program
+    COMMAND wget -nc
+            https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program)
+
+  # all operators used in the program
+  set(OPS
+      fill_constant_op
+      uniform_random_op
+      lookup_table_op
+      transpose_op
+      reshape_op
+      split_op
+      slice_op
+      concat_op
+      matmul_op
+      elementwise_add_op
+      elementwise_mul_op
+      softmax_with_cross_entropy_op
+      reduce_mean_op
+      reduce_sum_op
+      activation_op
+      sum_op
+      elementwise_max_op
+      elementwise_div_op
+      sgd_op
+      squared_l2_norm_op
+      memcpy_h2d_op
+      memcpy_d2h_op)
+
+  # All deps of the operators above, part of GLOB_OPERATOR_DEPS.
+  set(OP_DEPS generator softmax selected_rows_functor jit_kernel_helper
+              concat_and_split cross_entropy)
 
-    cc_test(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${OPS} ${OP_DEPS})
-    set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100)
+  cc_test(
+    standalone_executor_test
+    SRCS standalone_executor_test.cc
+    DEPS interpretercore
+         standalone_executor
+         operator
+         op_registry
+         executor
+         ${OPS}
+         ${OP_DEPS})
+  set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100)
 
-    add_dependencies(standalone_executor_test download_program)
-    if (WITH_PROFILER)
-        target_link_libraries(standalone_executor_test profiler)
-        add_dependencies(standalone_executor_test profiler)
-    endif()
+  add_dependencies(standalone_executor_test download_program)
+  if(WITH_PROFILER)
+    target_link_libraries(standalone_executor_test profiler)
+    add_dependencies(standalone_executor_test profiler)
+  endif()
 endif()
diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
index d0e5565139c54..171e15162fb45 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/data_transfer.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 
 namespace paddle {
@@ -276,9 +277,9 @@ std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
   // 2. Construct VariableNameMap
   VariableNameMap in_name_map = {{"X", {var_name}}};
   VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
-  int dst_place_type = platform::is_cpu_place(dst_place)
-                           ? 0
-                           : platform::is_gpu_place(dst_place) ? 1 : -1;
+  int dst_place_type = platform::is_cpu_place(dst_place)   ? 0
+                       : platform::is_gpu_place(dst_place) ? 1
+                                                           : -1;
   AttributeMap attr_map = {{"dst_place_type", dst_place_type}};
 
   // 3. Create memcpy_d2h_op or memcpy_h2d_op
diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc
index bca2264b66afc..0bfa00494d611 100644
--- a/paddle/fluid/framework/new_executor/event_manager.cc
+++ b/paddle/fluid/framework/new_executor/event_manager.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/event_manager.h"
+
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc
index fb79712d47d9e..f6afcf2f24d18 100644
--- a/paddle/fluid/framework/new_executor/executor_statistics.cc
+++ b/paddle/fluid/framework/new_executor/executor_statistics.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/executor_statistics.h"
+
 #include <fstream>
 #include <functional>
 #include <map>
@@ -21,6 +22,7 @@
 #include <set>
 #include <unordered_map>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/os_info.h"
@@ -520,7 +522,7 @@ void StatisticsEngine::MergeEvents(std::function<size_t(size_t, size_t)> merger,
 
 int StatisticsEngine::MergeInnerthreadEvents(
     std::vector<std::vector<StdEvent>>* all_evts) {
-  auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) {
+  auto merger = [&priorities = priorities_](size_t idx1, size_t idx2) {
     return priorities[idx1].innerthread_priority <=
                    priorities[idx2].innerthread_priority
                ? idx1
@@ -541,7 +543,7 @@ int StatisticsEngine::MergeInnerthreadEvents(
 
 int StatisticsEngine::MergeInterthreadEvents(
     std::vector<std::vector<StdEvent>>* all_evts) {
-  auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) {
+  auto merger = [&priorities = priorities_](size_t idx1, size_t idx2) {
     return priorities[idx1].interthread_priority <=
                    priorities[idx2].interthread_priority
                ? idx1
diff --git a/paddle/fluid/framework/new_executor/executor_statistics.h b/paddle/fluid/framework/new_executor/executor_statistics.h
index 530e9455968a8..ebe9d3a2e7925 100644
--- a/paddle/fluid/framework/new_executor/executor_statistics.h
+++ b/paddle/fluid/framework/new_executor/executor_statistics.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <memory>
+
 #include "paddle/fluid/platform/profiler/event_node.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt
index 2033eba88f9d1..359c56c561a4d 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt
@@ -1,10 +1,22 @@
-cc_library(interpretercore_garbage_collector SRCS garbage_collector.cc DEPS garbage_collector)
-cc_library(interpretercore_event_garbage_collector SRCS event_garbage_collector.cc DEPS interpretercore_garbage_collector)
+cc_library(
+  interpretercore_garbage_collector
+  SRCS garbage_collector.cc
+  DEPS garbage_collector)
+cc_library(
+  interpretercore_event_garbage_collector
+  SRCS event_garbage_collector.cc
+  DEPS interpretercore_garbage_collector)
 
 if(WITH_GPU OR WITH_ROCM)
-    if(WITH_GPU)
-        nv_library(interpretercore_fast_garbage_collector SRCS fast_garbage_collector.cc DEPS interpretercore_garbage_collector)
-    elseif(WITH_ROCM)
-        hip_library(interpretercore_fast_garbage_collector SRCS fast_garbage_collector.cc DEPS interpretercore_garbage_collector)
-    endif()
+  if(WITH_GPU)
+    nv_library(
+      interpretercore_fast_garbage_collector
+      SRCS fast_garbage_collector.cc
+      DEPS interpretercore_garbage_collector)
+  elseif(WITH_ROCM)
+    hip_library(
+      interpretercore_fast_garbage_collector
+      SRCS fast_garbage_collector.cc
+      DEPS interpretercore_garbage_collector)
+  endif()
 endif()
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
index 46c85a22dc3a3..1ae9f4223d3d9 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
@@ -110,7 +110,7 @@ void InterpreterCoreEventGarbageCollector::Free(
     const platform::DeviceContext* ctx) {
   event->Record(ctx);
   event->SetFininshed();  // Only for CPU Event
-  queue_->AddTask([ container = garbages, event = event ]() {
+  queue_->AddTask([container = garbages, event = event]() {
     while (!event->Query()) {
 #if defined(_WIN32)
       SleepEx(50, FALSE);
@@ -128,7 +128,7 @@ void InterpreterCoreEventGarbageCollector::Free(
     const platform::DeviceContext* ctx) {
   event->Record(ctx);
   event->SetFininshed();  // Only for CPU Event
-  queue_->AddTask([ container = garbage, event = event ]() {
+  queue_->AddTask([container = garbage, event = event]() {
     while (!event->Query()) {
 #if defined(_WIN32)
       SleepEx(50, FALSE);
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h
index 33954713d4e9f..57963269663d0 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h
+++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <queue>
+
 #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h"
 
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
index a20cd27539848..8e849c79bd235 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h"
+
 #include "paddle/fluid/framework/garbage_collector.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h
index 34f95eee7316d..d0159c0ca83e5 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h
+++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <queue>
+
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/device_event.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index da2fd0c8c6114..fe0c7fe072178 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/interpretercore.h"
+
 #include <unordered_set>
+
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 #include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h"
@@ -585,10 +587,12 @@ void InterpreterCore::ExecuteInstructionList(
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [
-        this, i, atomic_deps = atomic_deps.get(),
-        atomic_var_ref = atomic_var_ref.get()
-      ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); });
+      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
+                                 [this, i, atomic_deps = atomic_deps.get(),
+                                  atomic_var_ref = atomic_var_ref.get()] {
+                                   RunInstructionAsync(i, atomic_deps,
+                                                       atomic_var_ref);
+                                 });
     }
   }
 
@@ -692,10 +696,10 @@ void InterpreterCore::RunInstructionAsync(
     ready_ops.pop();
     auto& instr_node = vec_instruction_.at(instr_id);
     VLOG(5) << __func__ << " OP id:" << instr_node.Id()
-            << " name:" << instr_node.OpBase()->Type()
-            << " type:" << (instr_node.KernelType() == OpFuncType::kQueueSync
-                                ? "kQueueSync"
-                                : "kQueueAsync")
+            << " name:" << instr_node.OpBase()->Type() << " type:"
+            << (instr_node.KernelType() == OpFuncType::kQueueSync
+                    ? "kQueueSync"
+                    : "kQueueAsync")
             << " runs on " << platform::GetCurrentThreadName();
 
     auto* op = instr_node.OpBase();
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index f601a4ad28bd7..0b75964b94e91 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
+
 #include <algorithm>
 
 #include "paddle/fluid/framework/executor_gc_helper.h"
@@ -398,9 +399,10 @@ void build_op_func_list(const platform::Place& place,
       // But some OPs do have such behavior (e.g., cinn_launch OP). Here special
       // treatment for them.
       if (op_with_kernel->Type() == "cinn_launch") {
-        VLOG(6) << "OP(" << op_with_kernel->Type() << ") use scope in kernel, "
-                                                      "so pass a real scope to "
-                                                      "ExecutionContext";
+        VLOG(6) << "OP(" << op_with_kernel->Type()
+                << ") use scope in kernel, "
+                   "so pass a real scope to "
+                   "ExecutionContext";
         runtime_scope = local_scope;
       }
 
@@ -747,8 +749,9 @@ std::map<int, std::list<int>> get_downstream_map(
 std::map<int, std::list<int>> build_op_downstream_map(
     const std::vector<Instruction>& vec_instruction,
     std::vector<std::vector<bool>>* op_happens_before) {
-  auto var2min_rw_op = std::map<
-      int, std::list<int>>();  // # map from variable id to read / write op id.
+  auto var2min_rw_op =
+      std::map<int, std::list<int>>();  // # map from variable id to read /
+                                        // write op id.
   auto var2recent_write_op =
       std::map<int, int>();  // # map from variable to recent write op.
   auto op2dependences =
@@ -825,8 +828,14 @@ std::map<int, std::list<int>> build_op_downstream_map(
   // add dependences for random op, make sure that the random op is scheduled
   // sequentially
   const std::set<std::string> random_op_set = {
-      "bernoulli", "poisson", "multinomial", "gaussian_random",
-      "truncated_gaussian_random", "uniform_random", "randint", "randperm",
+      "bernoulli",
+      "poisson",
+      "multinomial",
+      "gaussian_random",
+      "truncated_gaussian_random",
+      "uniform_random",
+      "randint",
+      "randperm",
       "exponential",
       "sampling_id"
       "dropout",
@@ -846,7 +855,10 @@ std::map<int, std::list<int>> build_op_downstream_map(
   // add dependency for communication op
   auto is_comm_op = [](std::string op) -> bool {
     const std::set<std::string> special_comm_op_set = {
-        "send", "recv", "send_v2", "recv_v2",
+        "send",
+        "recv",
+        "send_v2",
+        "recv_v2",
     };
     const std::string communication_op_prefix = "c_";
     if (op.find(communication_op_prefix) != std::string::npos ||
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h
index 60ac3702f4b3c..3d5b067c18792 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.h
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.h
@@ -22,10 +22,9 @@
 
 #include <chrono>
 #include <iostream>
-#include <string>
-
 #include <map>
 #include <memory>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc
index c75a7871d63e9..1a4dd2edf2793 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.cc
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
+
 #include <map>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
 #include "paddle/phi/core/utils/rw_lock.h"
 
 // When in inference scenario, the scopes will not be written by two threads in
@@ -385,10 +386,11 @@ InterpretercoreInferShapeContext::GetOutputsVarType(
 void InterpretercoreInferShapeContext::SetOutputDim(const std::string& name,
                                                     const DDim& dim) {
   auto& vars = OutputVars(name);
-  PADDLE_ENFORCE_EQ(vars.size(), 1UL, platform::errors::InvalidArgument(
-                                          "Output(%s) should hold one element, "
-                                          "but now it holds %zu elements.",
-                                          name, vars.size()));
+  PADDLE_ENFORCE_EQ(
+      vars.size(), 1UL,
+      platform::errors::InvalidArgument("Output(%s) should hold one element, "
+                                        "but now it holds %zu elements.",
+                                        name, vars.size()));
   SetDim(vars[0], dim);
 }
 
@@ -653,8 +655,9 @@ void VariableScope::CheckExist(int id) const {
 }
 
 void VariableScope::CheckExist(const std::string& name) const {
-  PADDLE_ENFORCE_EQ(HasVar(name), true, platform::errors::NotFound(
-                                            "%s not in VariableScope.", name));
+  PADDLE_ENFORCE_EQ(
+      HasVar(name), true,
+      platform::errors::NotFound("%s not in VariableScope.", name));
 }
 
 void VariableScope::ClearListener() {
@@ -709,8 +712,9 @@ void VariableScopeListener::onClear() {}
 Instruction::Instruction(size_t id, OpFuncNode&& op_func_node,
                          const platform::DeviceContext& dev_ctx)
     : id_(id), op_func_node_(op_func_node), dev_ctx_(dev_ctx) {
-  PADDLE_ENFORCE_GE(id, 0, platform::errors::PreconditionNotMet(
-                               "Required id >= 0, but received id = %d", id));
+  PADDLE_ENFORCE_GE(id, 0,
+                    platform::errors::PreconditionNotMet(
+                        "Required id >= 0, but received id = %d", id));
 }
 
 size_t Instruction::Id() const { return id_; }
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 31315df5701e5..64332d7fc90b0 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/new_executor/standalone_executor.h"
+
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index 23bd777fae1d5..60d59899549fa 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <chrono>
 #include <iostream>
 #include <string>
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc
index fdcd19b03098c..6c689c8548b90 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/stream_analyzer.h"
+
 #include <unordered_set>
 
 namespace paddle {
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h
index 2a276c6f5097a..8a6552c6883c5 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.h
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/new_executor/new_executor_defs.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/device_event.h"
diff --git a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt
index 2690b29e01b9d..781ef9a64a253 100644
--- a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt
@@ -1,3 +1,12 @@
-cc_library(workqueue_utils SRCS workqueue_utils.cc events_waiter.cc DEPS enforce glog)
-cc_library(workqueue SRCS workqueue.cc DEPS workqueue_utils enforce glog os_info)
-cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
+cc_library(
+  workqueue_utils
+  SRCS workqueue_utils.cc events_waiter.cc
+  DEPS enforce glog)
+cc_library(
+  workqueue
+  SRCS workqueue.cc
+  DEPS workqueue_utils enforce glog os_info)
+cc_test(
+  workqueue_test
+  SRCS workqueue_test.cc
+  DEPS workqueue)
diff --git a/paddle/fluid/framework/new_executor/workqueue/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h
index 7a826c3990713..7c20e12ff1f94 100644
--- a/paddle/fluid/framework/new_executor/workqueue/event_count.h
+++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h
@@ -54,6 +54,7 @@
 #include <cstdlib>
 #include <mutex>
 #include <vector>
+
 #include "glog/logging.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc
index 346e20d811e84..dbe609427adcf 100644
--- a/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc
+++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h"
+
 #include <glog/logging.h>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.h b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h
index 9d85f4a27242c..9284ffa853a85 100644
--- a/paddle/fluid/framework/new_executor/workqueue/events_waiter.h
+++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/new_executor/workqueue/event_count.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 
diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h
index 559eb6a7490cd..20aebfba8e8f8 100644
--- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h
+++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h
@@ -12,6 +12,7 @@
 #include <atomic>
 #include <cstdlib>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/new_executor/workqueue/event_count.h"
 #include "paddle/fluid/framework/new_executor/workqueue/run_queue.h"
diff --git a/paddle/fluid/framework/new_executor/workqueue/run_queue.h b/paddle/fluid/framework/new_executor/workqueue/run_queue.h
index 2fc42cf308ab8..7644425a48491 100644
--- a/paddle/fluid/framework/new_executor/workqueue/run_queue.h
+++ b/paddle/fluid/framework/new_executor/workqueue/run_queue.h
@@ -42,6 +42,7 @@
 #include <cstdint>
 #include <mutex>
 #include <vector>
+
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 
@@ -76,9 +77,8 @@ class RunQueue {
     unsigned front = front_.load(std::memory_order_relaxed);
     Elem* e = &array_[front & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
-    if (s != kEmpty ||
-        !e->state.compare_exchange_strong(s, kBusy,
-                                          std::memory_order_acquire)) {
+    if (s != kEmpty || !e->state.compare_exchange_strong(
+                           s, kBusy, std::memory_order_acquire)) {
       return w;
     }
     front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
@@ -93,9 +93,8 @@ class RunQueue {
     unsigned front = front_.load(std::memory_order_relaxed);
     Elem* e = &array_[(front - 1) & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
-    if (s != kReady ||
-        !e->state.compare_exchange_strong(s, kBusy,
-                                          std::memory_order_acquire)) {
+    if (s != kReady || !e->state.compare_exchange_strong(
+                           s, kBusy, std::memory_order_acquire)) {
       return Work();
     }
     Work w = std::move(e->w);
@@ -112,9 +111,8 @@ class RunQueue {
     unsigned back = back_.load(std::memory_order_relaxed);
     Elem* e = &array_[(back - 1) & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
-    if (s != kEmpty ||
-        !e->state.compare_exchange_strong(s, kBusy,
-                                          std::memory_order_acquire)) {
+    if (s != kEmpty || !e->state.compare_exchange_strong(
+                           s, kBusy, std::memory_order_acquire)) {
       return w;
     }
     back = ((back - 1) & kMask2) | (back & ~kMask2);
@@ -134,9 +132,8 @@ class RunQueue {
     unsigned back = back_.load(std::memory_order_relaxed);
     Elem* e = &array_[back & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
-    if (s != kReady ||
-        !e->state.compare_exchange_strong(s, kBusy,
-                                          std::memory_order_acquire)) {
+    if (s != kReady || !e->state.compare_exchange_strong(
+                           s, kBusy, std::memory_order_acquire)) {
       return Work();
     }
     Work w = std::move(e->w);
@@ -163,9 +160,8 @@ class RunQueue {
       Elem* e = &array_[mid & kMask];
       uint8_t s = e->state.load(std::memory_order_relaxed);
       if (n == 0) {
-        if (s != kReady ||
-            !e->state.compare_exchange_strong(s, kBusy,
-                                              std::memory_order_acquire))
+        if (s != kReady || !e->state.compare_exchange_strong(
+                               s, kBusy, std::memory_order_acquire))
           continue;
         start = mid;
       } else {
diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc
index 0f0de8ef9b05d..b06c540b756da 100644
--- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc
+++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc
@@ -5,6 +5,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h"
+
 #include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -64,11 +65,8 @@ class WorkQueueImpl : public WorkQueue {
                                  platform::TracerEventType::UserDefined,
                                  10 /*level*/);
     if (tracker_ != nullptr) {
-      fn = [
-        task = std::move(fn), raii = CounterGuard<TaskTracker>(tracker_)
-      ]() mutable {
-        task();
-      };
+      fn = [task = std::move(fn),
+            raii = CounterGuard<TaskTracker>(tracker_)]() mutable { task(); };
     }
     queue_->AddTask(std::move(fn));
   }
@@ -158,11 +156,8 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function<void()> fn) {
                                10 /*level*/);
   assert(queue_idx < queues_.size());
   if (queues_options_.at(queue_idx).track_task) {
-    fn = [
-      task = std::move(fn), raii = CounterGuard<TaskTracker>(tracker_)
-    ]() mutable {
-      task();
-    };
+    fn = [task = std::move(fn),
+          raii = CounterGuard<TaskTracker>(tracker_)]() mutable { task(); };
   }
   queues_[queue_idx]->AddTask(std::move(fn));
 }
diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h
index 2c2576528fe0e..1a1900c56872d 100644
--- a/paddle/fluid/framework/new_executor/workqueue/workqueue.h
+++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <type_traits>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -118,10 +119,10 @@ class WorkQueue {
         std::bind(std::forward<F>(f), std::forward<Args>(args)...);
     std::promise<ReturnType> prom;
     std::future<ReturnType> res = prom.get_future();
-    AddTask([
-      t = std::move(task),
-      p = FakeCopyable<std::promise<ReturnType>>(std::move(prom))
-    ]() mutable { p.Get().set_value(t()); });
+    AddTask([t = std::move(task), p = FakeCopyable<std::promise<ReturnType>>(
+                                      std::move(prom))]() mutable {
+      p.Get().set_value(t());
+    });
     return res;
   }
 
@@ -158,10 +159,9 @@ class WorkQueueGroup {
         std::bind(std::forward<F>(f), std::forward<Args>(args)...);
     std::promise<ReturnType> prom;
     std::future<ReturnType> res = prom.get_future();
-    AddTask(queue_idx, [
-      t = std::move(task),
-      p = FakeCopyable<std::promise<ReturnType>>(std::move(prom))
-    ]() mutable { p.Get().set_value(t()); });
+    AddTask(queue_idx, [t = std::move(task),
+                        p = FakeCopyable<std::promise<ReturnType>>(std::move(
+                            prom))]() mutable { p.Get().set_value(t()); });
     return res;
   }
 
diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc
index 857eaead5b658..3e38d0dbbf9a3 100644
--- a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc
+++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h"
+
 #include <atomic>
 #include <thread>
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
@@ -37,10 +39,10 @@ TEST(WorkQueueUtils, TestEventsWaiter) {
 
 TEST(WorkQueue, TestSingleThreadedWorkQueue) {
   VLOG(1) << "In Test";
-  using paddle::framework::WorkQueueOptions;
-  using paddle::framework::WorkQueue;
   using paddle::framework::CreateSingleThreadedWorkQueue;
   using paddle::framework::EventsWaiter;
+  using paddle::framework::WorkQueue;
+  using paddle::framework::WorkQueueOptions;
   std::atomic<bool> finished{false};
   std::atomic<unsigned> counter{0};
   constexpr unsigned kLoopNum = 1000000;
@@ -83,10 +85,10 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) {
 
 TEST(WorkQueue, TestMultiThreadedWorkQueue) {
   VLOG(1) << "In Test";
-  using paddle::framework::WorkQueueOptions;
-  using paddle::framework::WorkQueue;
   using paddle::framework::CreateMultiThreadedWorkQueue;
   using paddle::framework::EventsWaiter;
+  using paddle::framework::WorkQueue;
+  using paddle::framework::WorkQueueOptions;
   std::atomic<bool> finished{false};
   std::atomic<unsigned> counter{0};
   constexpr unsigned kExternalLoopNum = 100;
@@ -136,10 +138,10 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) {
 }
 
 TEST(WorkQueue, TestWorkQueueGroup) {
-  using paddle::framework::WorkQueueOptions;
-  using paddle::framework::WorkQueueGroup;
   using paddle::framework::CreateWorkQueueGroup;
   using paddle::framework::EventsWaiter;
+  using paddle::framework::WorkQueueGroup;
+  using paddle::framework::WorkQueueOptions;
   std::atomic<bool> finished{false};
   std::atomic<unsigned> counter{0};
   constexpr unsigned kExternalLoopNum = 100;
diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc
index 82dcbbd509dd5..152f89d9ef0b5 100644
--- a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc
+++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
+
 #include <cstdint>
 #include <cstdlib>
 
diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h
index b6e6ede8c334f..380746c05d604 100644
--- a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h
+++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h
@@ -21,6 +21,7 @@
 #include <memory>
 #include <set>
 #include <string>
+
 #include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.cc b/paddle/fluid/framework/no_need_buffer_vars_inference.cc
index 25f64838c6d39..665c9b811faee 100644
--- a/paddle/fluid/framework/no_need_buffer_vars_inference.cc
+++ b/paddle/fluid/framework/no_need_buffer_vars_inference.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/imperative/saved_variable_wrapper_list.h"
 
diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc b/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc
index a92d52fd2e9ea..a2c7df763a7ef 100644
--- a/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc
+++ b/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/imperative/layer.h"
diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc
index 73f1409ae690e..b62f17987e651 100644
--- a/paddle/fluid/framework/op_def_api.cc
+++ b/paddle/fluid/framework/op_def_api.cc
@@ -17,6 +17,7 @@
 #define _LINUX
 #endif
 #include "paddle/fluid/framework/op_def_api.h"
+
 #include <fstream>
 #include <mutex>
 #include <string>
@@ -28,6 +29,7 @@
 #endif
 #include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/text_format.h>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/op_def.pb.h"
 
diff --git a/paddle/fluid/framework/op_def_api.h b/paddle/fluid/framework/op_def_api.h
index 1ef2254d0da36..754b76663df1a 100644
--- a/paddle/fluid/framework/op_def_api.h
+++ b/paddle/fluid/framework/op_def_api.h
@@ -21,5 +21,5 @@ namespace framework {
 const proto::OpDef& GetOpDef(const std::string& op_name);
 
 bool HasOpDef(const std::string& op_name);
-}
-}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 87d3a048d0be0..db2a411da0086 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -495,8 +495,9 @@ bool OpDesc::HasProtoAttr(const std::string &name) const {
 
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
-                                          "Attribute %s is not found.", name));
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound("Attribute %s is not found.", name));
   return static_cast<proto::AttrType>(it->second.which() - 1);
 }
 
@@ -599,8 +600,9 @@ void OpDesc::SetAttrMap(
 
 Attribute OpDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
-                                          "Attribute %s is not found.", name));
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound("Attribute %s is not found.", name));
   return it->second;
 }
 
@@ -854,10 +856,11 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
-                                     "Input(%s) should have only one value, "
-                                     "but it has %d values now.",
-                                     name, length));
+  PADDLE_ENFORCE_EQ(
+      length, 1UL,
+      platform::errors::InvalidArgument("Input(%s) should have only one value, "
+                                        "but it has %d values now.",
+                                        name, length));
   return block_.HasVarRecursive(input_names[0]);
 }
 
@@ -870,10 +873,11 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
-                                     "Output(%s) should have only one value, "
-                                     "but it has %d values now.",
-                                     name, length));
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    platform::errors::InvalidArgument(
+                        "Output(%s) should have only one value, "
+                        "but it has %d values now.",
+                        name, length));
   return block_.HasVarRecursive(output_names[0]);
 }
 
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 903ee73b2c013..51aeed2e5d734 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 namespace paddle {
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index 889b6b0c86b2f..8b77b1d260c42 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -12,11 +12,11 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
+#include "paddle/fluid/framework/op_registry.h"
+
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include "paddle/fluid/framework/op_registry.h"
-
 namespace pd = paddle::framework;
 
 namespace paddle {
@@ -58,8 +58,9 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
     AddInput("input", "input of cosine op").AsDuplicable();
     AddOutput("output", "output of cosine op").AsIntermediate();
     auto my_checker = [](int i) {
-      PADDLE_ENFORCE_EQ(i % 2, 0, platform::errors::InvalidArgument(
-                                      "'test_attr' must be even!"));
+      PADDLE_ENFORCE_EQ(
+          i % 2, 0,
+          platform::errors::InvalidArgument("'test_attr' must be even!"));
     };
     AddAttr<int>("test_attr", "a simple test attribute")
         .AddCustomChecker(my_checker);
diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h
index 9b70bb93bb967..022531d53de1c 100644
--- a/paddle/fluid/framework/op_version_proto.h
+++ b/paddle/fluid/framework/op_version_proto.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
 #include <string>
 
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc
index e66d0dc5a1f79..8f83631c272ee 100644
--- a/paddle/fluid/framework/op_version_registry_test.cc
+++ b/paddle/fluid/framework/op_version_registry_test.cc
@@ -12,10 +12,10 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include <gtest/gtest.h>
-
 #include "paddle/fluid/framework/op_version_registry.h"
 
+#include <gtest/gtest.h>
+
 namespace paddle {
 namespace framework {
 namespace compatible {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 69f14d7903c0b..7395a8e0da8e8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 
 #include <glog/logging.h>
+
 #include <sstream>
 #include <string>
 
@@ -1205,10 +1206,11 @@ bool OperatorWithKernel::SupportsMKLDNN(
     const proto::VarType::Type data_type) const {
   auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_);
   if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) {
-    VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid "
-                                       "Registered Kernels. And We don't "
-                                       "search its kernels in phi lib, "
-                                       "SupportsMKLDNN() return false.";
+    VLOG(6) << "Warning: " << type_
+            << " don't find its MKLDNN Kernel in Fluid "
+               "Registered Kernels. And We don't "
+               "search its kernels in phi lib, "
+               "SupportsMKLDNN() return false.";
     return false;
   }
   auto& op_kernels = op_kernel_iter->second;
@@ -1440,7 +1442,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 #if defined(PADDLE_WITH_XPU_KP)
         && (!is_xpu_unsupport || use_phi_xpu_kp)
 #endif
-            ) {
+    ) {
       run_phi_kernel_ = true;
     } else {
       auto& all_op_kernels = AllOpKernels();
@@ -1464,7 +1466,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 #if defined(PADDLE_WITH_XPU_KP)
           || (is_xpu_unsupport && !is_xpu_kp_support)
 #endif
-              ) {
+      ) {
         auto pt_cpu_kernel_key =
             FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this);
         pt_kernel_.reset(
@@ -2238,8 +2240,9 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
     if (arg_map_fn) {
       arg_map_fn_.reset(new phi::ArgumentMappingFn(*arg_map_fn));
     } else {
-      auto func = [this](
-          const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature {
+      auto func =
+          [this](
+              const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature {
         return phi::DefaultKernelSignatureMap::Instance().Get(type_);
       };
       arg_map_fn_.reset(new phi::ArgumentMappingFn(func));
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 2efa2e4bd8a75..dc13287b5aad3 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
@@ -38,12 +39,10 @@ limitations under the License. */
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/variant.h"
-#include "paddle/utils/flat_hash_map.h"
-
-#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/core/compat/arg_map_context.h"
 #include "paddle/phi/core/compat/op_utils.h"
 #include "paddle/phi/core/kernel_factory.h"
+#include "paddle/utils/flat_hash_map.h"
 
 namespace paddle {
 namespace framework {
@@ -610,12 +609,12 @@ class OperatorWithKernel : public OperatorBase {
 
   /* member functions for adapting to phi lib */
   /** In the Tensor calculation library, the new Kernel adopts a clearer and
-    * more streamlined design. The arguments of the Kernel and the input and
-    * output arguments registered in the original OpMaker do not match in some
-    * cases, so we use map to record the arguments required by the kernel.
-    * When selecting Kernel during Op execution, select the arguments of the
-    * original Op according to the GetExpectedPhiKernelArgs returned arguments.
-    */
+   * more streamlined design. The arguments of the Kernel and the input and
+   * output arguments registered in the original OpMaker do not match in some
+   * cases, so we use map to record the arguments required by the kernel.
+   * When selecting Kernel during Op execution, select the arguments of the
+   * original Op according to the GetExpectedPhiKernelArgs returned arguments.
+   */
   phi::KernelSignature GetExpectedPhiKernelArgs(
       const ExecutionContext& ctx) const;
 
diff --git a/paddle/fluid/framework/operator_exception_test.cc b/paddle/fluid/framework/operator_exception_test.cc
index 7b513996fb40e..0f635d170de2f 100644
--- a/paddle/fluid/framework/operator_exception_test.cc
+++ b/paddle/fluid/framework/operator_exception_test.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/operator.h"
 #include <exception>
 #include <stdexcept>
 #include <string>
 #include <utility>
+
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h
index ab812a30981f0..57d377f1389cf 100644
--- a/paddle/fluid/framework/operator_kernel_configs.h
+++ b/paddle/fluid/framework/operator_kernel_configs.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <mutex>
 #include <unordered_map>
 #include <vector>
+
 #include "glog/logging.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 24e09bcd463dc..3dda60de12ad4 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "gtest/gtest.h"
+#include "paddle/fluid/framework/operator.h"
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/init.h"
 
diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
index 75e258d14764c..7cb9cf254fb1a 100644
--- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
+++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
@@ -1,29 +1,85 @@
-cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper lod_tensor proto_desc)
-cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
-cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
-cc_library(transform_type SRCS transform_type.cc DEPS errors enforce cinn)
-cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
-cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context)
+cc_library(
+  cinn_cache_key
+  SRCS cinn_cache_key.cc
+  DEPS boost graph graph_helper lod_tensor proto_desc)
+cc_library(
+  build_cinn_pass
+  SRCS build_cinn_pass.cc
+  DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors
+       enforce)
+cc_library(
+  transform_desc
+  SRCS transform_desc.cc
+  DEPS proto_desc cinn)
+cc_library(
+  transform_type
+  SRCS transform_type.cc
+  DEPS errors enforce cinn)
+cc_library(
+  cinn_graph_symbolization
+  SRCS cinn_graph_symbolization.cc
+  DEPS lod_tensor graph transform_desc cinn)
+cc_library(
+  cinn_compiler
+  SRCS cinn_compiler.cc
+  DEPS framework_proto
+       graph
+       lod_tensor
+       cinn_cache_key
+       cinn_graph_symbolization
+       cinn
+       cinn_launch_context)
 
-if (WITH_TESTING)
-  cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn)
+if(WITH_TESTING)
+  cc_test(
+    cinn_lib_test
+    SRCS cinn_lib_test.cc
+    DEPS cinn)
   set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN")
 
-  cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key)
+  cc_test(
+    cinn_cache_key_test
+    SRCS cinn_cache_key_test.cc
+    DEPS cinn_cache_key)
   set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN")
 
-  cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler op_registry mul_op activation_op elementwise_add_op)
+  cc_test(
+    build_cinn_pass_test
+    SRCS build_cinn_pass_test.cc
+    DEPS build_cinn_pass cinn_compiler op_registry mul_op activation_op
+         elementwise_add_op)
   set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN")
 
-  cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc)
+  cc_test(
+    transform_desc_test
+    SRCS transform_desc_test.cc
+    DEPS transform_desc)
   set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN")
 
-  cc_test(transform_type_test SRCS transform_type_test.cc DEPS transform_type)
+  cc_test(
+    transform_type_test
+    SRCS transform_type_test.cc
+    DEPS transform_type)
   set_tests_properties(transform_type_test PROPERTIES LABELS "RUN_TYPE=CINN")
 
-  cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization)
-  set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN")
+  cc_test(
+    cinn_graph_symbolization_test
+    SRCS cinn_graph_symbolization_test.cc
+    DEPS cinn_graph_symbolization)
+  set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS
+                                                                "RUN_TYPE=CINN")
 
-  cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn mul_op activation_op elementwise_add_op)
+  cc_test(
+    cinn_compiler_test
+    SRCS cinn_compiler_test.cc
+    DEPS cinn_compiler
+         place
+         proto_desc
+         graph_viz_pass
+         build_cinn_pass
+         cinn
+         mul_op
+         activation_op
+         elementwise_add_op)
   set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN")
 endif()
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
index 295510cdb1cf2..a2bdd2bc4c105 100644
--- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
+++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
@@ -334,7 +334,7 @@ std::unique_ptr<Graph> CreateNewSubGraph(const GraphNodeSet& cluster,
   }
 
   GraphNodeSet need_feed_vars;
-  std::unordered_set<Node *> param_vars, output_vars;
+  std::unordered_set<Node*> param_vars, output_vars;
   // the subgraph is independently, so here we only need link
   // to the node in new subgraph, and discard the link to
   // out-graph.
@@ -386,18 +386,18 @@ std::unique_ptr<Graph> CreateNewSubGraph(const GraphNodeSet& cluster,
                subgraph.get());
   // Save lists of input variables, internal variables and output variables
   // of the cluster as attributes of the subgraph for convenience.
-  auto collect_names_fn = [](
-      const GraphNodeSet& nodes,
-      const std::unordered_set<std::string>& ignore_names) {
-    auto result = std::make_unique<std::vector<std::string>>();
-    for (auto* node : nodes) {
-      if (!node->Var() || ignore_names.count(node->Name())) {
-        continue;
-      }
-      result->emplace_back(node->Name());
-    }
-    return result;
-  };
+  auto collect_names_fn =
+      [](const GraphNodeSet& nodes,
+         const std::unordered_set<std::string>& ignore_names) {
+        auto result = std::make_unique<std::vector<std::string>>();
+        for (auto* node : nodes) {
+          if (!node->Var() || ignore_names.count(node->Name())) {
+            continue;
+          }
+          result->emplace_back(node->Name());
+        }
+        return result;
+      };
   subgraph->Set<std::vector<std::string>>(
       kInternalVars, collect_names_fn(cluster_internals, {}).release());
   subgraph->Set<std::vector<std::string>>(
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
index d593aadc02c73..e9c517af2c395 100644
--- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <string>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc
index 9b5ce876c256f..585f9edce868a 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc
@@ -100,7 +100,7 @@ size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) {
 
   // graph.Nodes() return unordered_set, here using set to avoid the same graph
   // may return different result
-  std::set<ir::Node *, bool (*)(ir::Node *, ir::Node *)> node_set(compare),
+  std::set<ir::Node*, bool (*)(ir::Node*, ir::Node*)> node_set(compare),
       output_set(compare);
   node_set.insert(graph.Nodes().begin(), graph.Nodes().end());
 
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc
index 1ebeecbff954a..24e65599018fa 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include <map>
 #include <unordered_set>
 
@@ -21,6 +22,7 @@
 #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/phi/core/ddim.h"
+// clang-format on
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
index 12f603542066f..2a6a51d73f2b8 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -51,14 +51,14 @@ namespace paddle {
 namespace framework {
 namespace paddle2cinn {
 
-using ir::Graph;
-using ir::Node;
-using inference::analysis::Dot;
 using ::cinn::auto_schedule::AutoTuner;
 using ::cinn::common::Target;
 using ::cinn::frontend::Optimize;
 using ::cinn::hlir::framework::BuildScope;
 using ::cinn::hlir::framework::GraphCompiler;
+using inference::analysis::Dot;
+using ir::Graph;
+using ir::Node;
 
 CinnCompiler* CinnCompiler::GetInstance() {
   static CinnCompiler* instance = new CinnCompiler();
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
index a38e8b4c5f674..91c559767642a 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h"
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
index 255e318c9fa69..5a84a97ee8da7 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
@@ -44,8 +44,8 @@ DECLARE_string(deny_cinn_ops);
 namespace paddle {
 namespace framework {
 namespace paddle2cinn {
-using ir::Graph;
 using ::cinn::common::Target;
+using ir::Graph;
 
 namespace {
 template <typename T, typename Alloc = std::allocator<T>>
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
index 31bf8d9b726d8..4e362057c915f 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+// clang-format off
 #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
 
 #include <algorithm>
@@ -30,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
+// clang-format on
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
index 526eb65a56ede..4155147da4b8f 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+// clang-format off
 #include <map>
 #include <string>
 #include <unordered_map>
@@ -26,6 +27,7 @@ limitations under the License. */
 
 #include "cinn/frontend/net_builder.h"
 #include "cinn/frontend/op_mapper_registry.h"
+// clang-format on
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
index c0e1ca8f0d123..8a6f92a6f45d0 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+// clang-format off
 #include "gtest/gtest.h"
 
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
+// clang-format on
 
 namespace paddle {
 namespace framework {
 namespace paddle2cinn {
 
+using ::cinn::frontend::NetBuilder;
 using ir::Graph;
 using ir::Node;
-using ::cinn::frontend::NetBuilder;
 using CinnTensor = ::cinn::hlir::framework::Tensor;
 using OpMapperContext = CinnGraphSymbolization::OpMapperContext;
 using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc;
diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc.h b/paddle/fluid/framework/paddle2cinn/transform_desc.h
index 76a4f812730df..6f0931b6d038d 100644
--- a/paddle/fluid/framework/paddle2cinn/transform_desc.h
+++ b/paddle/fluid/framework/paddle2cinn/transform_desc.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+// The headers can't be sorted by clang-format, or a compilation error occurs.
+// clang-format off
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -24,6 +26,7 @@
 #include "cinn/frontend/paddle/cpp/op_desc.h"
 #include "cinn/frontend/paddle/cpp/program_desc.h"
 #include "cinn/frontend/paddle/cpp/var_desc.h"
+// clang-format on
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc
index ba324295cad72..ae9f51c3f6790 100644
--- a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc
@@ -12,10 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include <unordered_map>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/paddle2cinn/transform_desc.h"
+// clang-format on
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/paddle2cinn/transform_type.cc b/paddle/fluid/framework/paddle2cinn/transform_type.cc
index 0e348084d254e..60502edd99acf 100644
--- a/paddle/fluid/framework/paddle2cinn/transform_type.cc
+++ b/paddle/fluid/framework/paddle2cinn/transform_type.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/paddle2cinn/transform_type.h"
+
 #include "cinn/common/type.h"
 #include "cinn/runtime/cinn_runtime.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/paddle2cinn/transform_type.h b/paddle/fluid/framework/paddle2cinn/transform_type.h
index e44960abbd98d..f0b08ba1e00a4 100644
--- a/paddle/fluid/framework/paddle2cinn/transform_type.h
+++ b/paddle/fluid/framework/paddle2cinn/transform_type.h
@@ -19,7 +19,7 @@
 struct cinn_type_t;
 namespace cinn::common {
 struct Type;
-}  // ::cinn::common
+}  // namespace cinn::common
 
 namespace paddle::framework::paddle2cinn {
 
diff --git a/paddle/fluid/framework/paddle2cinn/transform_type_test.cc b/paddle/fluid/framework/paddle2cinn/transform_type_test.cc
index 6c5d360d34cdd..4456642b3e9a0 100644
--- a/paddle/fluid/framework/paddle2cinn/transform_type_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/transform_type_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/paddle2cinn/transform_type.h"
+
 #include "cinn/common/type.h"
 #include "cinn/runtime/cinn_runtime.h"
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b088a535a1232..00d48098a13f6 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -666,8 +666,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places, scope)) {
   PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]),
-                    true, platform::errors::Unavailable(
-                              "NPU is not supported in ParallelExecutor."));
+                    true,
+                    platform::errors::Unavailable(
+                        "NPU is not supported in ParallelExecutor."));
   InitP2P(places);
   ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
                                  member_->places_.size());
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 18d0ee78ffbbc..3dc9fbcfbf312 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -42,9 +42,9 @@ namespace framework {
 
 class ParallelExecutorPrivate;
 
-using details::VariableInfo;
 using details::BuildStrategy;
 using details::ExecutionStrategy;
+using details::VariableInfo;
 namespace p = paddle::platform;
 using DeviceType = paddle::platform::DeviceType;
 
diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc
index 3eda00006f959..19f7b024b27f2 100644
--- a/paddle/fluid/framework/phi_utils.cc
+++ b/paddle/fluid/framework/phi_utils.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/phi_utils.h"
+
 #include <sstream>
 
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/phi_utils.h"
-
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h
index 785ede5c60175..535672f2e1288 100644
--- a/paddle/fluid/framework/phi_utils.h
+++ b/paddle/fluid/framework/phi_utils.h
@@ -21,11 +21,10 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
-
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/common/backend.h"
 #include "paddle/phi/core/compat/arg_map_context.h"
diff --git a/paddle/fluid/framework/phi_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc
index cbcdf24c9f32b..02eb23f8ac17b 100644
--- a/paddle/fluid/framework/phi_utils_test.cc
+++ b/paddle/fluid/framework/phi_utils_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/phi_utils.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 4a31adcca65ec..88738255af78e 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/program_desc.h"
+
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/version.h"
 
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index 4ceb0c5c82481..7e1c12f4ac5b1 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/framework/program_processing.cc b/paddle/fluid/framework/program_processing.cc
index 3bcf6f8f3855f..95b28b79dcf36 100644
--- a/paddle/fluid/framework/program_processing.cc
+++ b/paddle/fluid/framework/program_processing.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/program_processing.h"
+
 #include "paddle/fluid/framework/block_desc.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc
index 4c95f01ae569f..fbeedcc311ac7 100644
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <glog/logging.h>
 
 #include <queue>
+
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
index 64b30878150d0..5fbfda716b437 100644
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/prune.h"
 
 #include <gtest/gtest.h>
+
 #include <string>
 
 #include "paddle/fluid/framework/block_desc.h"
diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc
index aec40a5a7ebdd..c86bfbc43bfb9 100644
--- a/paddle/fluid/framework/ps_gpu_trainer.cc
+++ b/paddle/fluid/framework/ps_gpu_trainer.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <google/protobuf/text_format.h>
+
 #include <cstdlib>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
index a12079a135dbd..7a0fe65182d13 100644
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <time.h>
+
 #include "paddle/fluid/framework/device_worker.h"
 
 namespace phi {
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index b418339bf3296..27940f726dca1 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/reader.h"
+
 #include <deque>
 
 namespace paddle {
diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc
index 44488fca01c02..284965fdfe9a8 100644
--- a/paddle/fluid/framework/save_load_util.cc
+++ b/paddle/fluid/framework/save_load_util.cc
@@ -342,8 +342,9 @@ bool LoadTensorFromDisk(
     uint32_t version;
     fin.read(reinterpret_cast<char*>(&version), sizeof(version));
     CheckInStreamState(fin, sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument(
-                                       "Only version 0 tensor is supported."));
+    PADDLE_ENFORCE_EQ(version, 0U,
+                      platform::errors::InvalidArgument(
+                          "Only version 0 tensor is supported."));
     proto::VarType::TensorDesc desc;
     {
       // int32_t size
diff --git a/paddle/fluid/framework/save_load_util_test.cc b/paddle/fluid/framework/save_load_util_test.cc
index 10a34d7ce91ad..623f0f27bdaa2 100644
--- a/paddle/fluid/framework/save_load_util_test.cc
+++ b/paddle/fluid/framework/save_load_util_test.cc
@@ -11,11 +11,12 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/framework/save_load_util.h"
+
 #include <stdlib.h>
 #include <time.h>
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/save_load_util.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/scope_guard.h b/paddle/fluid/framework/scope_guard.h
index 83387842e94ef..9c741f7bfc573 100644
--- a/paddle/fluid/framework/scope_guard.h
+++ b/paddle/fluid/framework/scope_guard.h
@@ -16,6 +16,7 @@
 
 #include <type_traits>
 #include <utility>
+
 #include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
@@ -41,12 +42,12 @@ class ScopeGuard {
 #define _PADDLE_CONCAT_TOKEN(x, y) x##y
 #define PADDLE_CONCAT_TOKEN(x, y) _PADDLE_CONCAT_TOKEN(x, y)
 
-#define DEFINE_PADDLE_SCOPE_GUARD(...)                                     \
-  auto PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__) = __VA_ARGS__;    \
-  ::paddle::framework::ScopeGuard<typename std::remove_reference<decltype( \
-      PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__))>::type>           \
-      PADDLE_CONCAT_TOKEN(__scope_guard, __LINE__)(                        \
-          PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__))
+#define DEFINE_PADDLE_SCOPE_GUARD(...)                                    \
+  auto PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__) = __VA_ARGS__;   \
+  ::paddle::framework::ScopeGuard<typename std::remove_reference<         \
+      decltype(PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__))>::type> \
+  PADDLE_CONCAT_TOKEN(__scope_guard, __LINE__)(                           \
+      PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__))
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/scope_guard_test.cc b/paddle/fluid/framework/scope_guard_test.cc
index d7a7a6168a368..793b3a1652a1c 100644
--- a/paddle/fluid/framework/scope_guard_test.cc
+++ b/paddle/fluid/framework/scope_guard_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/scope_guard.h"
+
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 1f821720d64d2..7bb8550926d63 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_ASCEND_CL)
 #include <float.h>
+
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/framework/selected_rows_utils.h b/paddle/fluid/framework/selected_rows_utils.h
index 8606295c45199..9ecff5719fb91 100644
--- a/paddle/fluid/framework/selected_rows_utils.h
+++ b/paddle/fluid/framework/selected_rows_utils.h
@@ -21,10 +21,9 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/phi/core/selected_rows.h"
-
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/selected_rows.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc
index f23510c721e24..db2c6c1f991b7 100644
--- a/paddle/fluid/framework/selected_rows_utils_test.cc
+++ b/paddle/fluid/framework/selected_rows_utils_test.cc
@@ -9,11 +9,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/selected_rows_utils.h"
+
 #include <time.h>
+
 #include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/selected_rows_utils.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc
old mode 100755
new mode 100644
index 3071e6bf4cff3..f6aee9b82f2c6
--- a/paddle/fluid/framework/string_array.cc
+++ b/paddle/fluid/framework/string_array.cc
@@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/string_array.h"
+
 #include <utf8proc.h>
 
 #include <exception>
 
 #include "glog/logging.h"
-#include "paddle/fluid/framework/string_array.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 57eddf782f06b..7ad9839d79dca 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -23,15 +23,14 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/ddim.h"
-#include "paddle/phi/core/stream.h"
-
-#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/stream.h"
 
 namespace paddle {
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index f5e230773fb2f..946b119ecb39f 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -24,12 +24,13 @@ namespace framework {
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   int rank = src.dims().size();
   PADDLE_ENFORCE_GE(
-      rank, 2, platform::errors::InvalidArgument(
-                   "'ReshapeToMatrix()' is only used for flatten high rank "
-                   "tensors to matrixs. The dimensions of Tensor must be "
-                   "greater or equal than 2. "
-                   "But received dimensions of Tensor is %d",
-                   rank));
+      rank, 2,
+      platform::errors::InvalidArgument(
+          "'ReshapeToMatrix()' is only used for flatten high rank "
+          "tensors to matrixs. The dimensions of Tensor must be "
+          "greater or equal than 2. "
+          "But received dimensions of Tensor is %d",
+          rank));
   if (rank == 2) {
     return src;
   }
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 3e104807535e9..05dd41eb6ffc5 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/tensor.h"
 
 #include <gtest/gtest.h>
+
 #include <string>
 
 namespace framework = paddle::framework;
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1159280762f5a..1e25acb2c4ecb 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/tensor_util.h"
+
 #include <algorithm>
 #include <limits>
 #include <memory>
@@ -21,10 +23,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
-
 #include "paddle/phi/core/dense_tensor.h"
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -1249,10 +1249,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
      // proto buffer
     int32_t size = -1;
     is.read(reinterpret_cast<char*>(&size), sizeof(size));
-    PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable(
-                                           "Cannot read tensor desc size"));
-    PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument(
-                                   "Tensor desc size should >= 0"));
+    PADDLE_ENFORCE_EQ(
+        is.good(), true,
+        platform::errors::Unavailable("Cannot read tensor desc size"));
+    PADDLE_ENFORCE_GE(
+        size, 0,
+        platform::errors::InvalidArgument("Tensor desc size should >= 0"));
     std::unique_ptr<char[]> buf(new char[size]);
     is.read(reinterpret_cast<char*>(buf.get()), size);
     PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index 5e6e1227b1aac..2511fdf27ce69 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/tensor_util.h"
 
 #include <gtest/gtest.h>
+
 #include <cmath>
 
 namespace paddle {
@@ -254,64 +255,61 @@ TEST(TensorToVector, Tensor) {
 #endif
 }
 
-TEST(TensorToVector, Tensor_bool) {
-  {
-    paddle::framework::Tensor src;
-    bool* src_ptr =
-        src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
-    for (int i = 0; i < 3 * 3; ++i) {
-      src_ptr[i] = static_cast<bool>(i % 2);
-    }
+TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src;
+bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
+for (int i = 0; i < 3 * 3; ++i) {
+  src_ptr[i] = static_cast<bool>(i % 2);
+}
 
-    paddle::platform::CPUPlace place;
-    std::vector<bool> dst;
-    paddle::framework::TensorToVector<bool>(src, &dst);
+paddle::platform::CPUPlace place;
+std::vector<bool> dst;
+paddle::framework::TensorToVector<bool>(src, &dst);
 
-    for (int i = 0; i < 3 * 3; ++i) {
-      EXPECT_EQ(src_ptr[i], dst[i]);
-    }
-  }
+for (int i = 0; i < 3 * 3; ++i) {
+  EXPECT_EQ(src_ptr[i], dst[i]);
+}
+}  // end of the CPU-only scope
 #ifdef PADDLE_WITH_CUDA
-  {
-    std::vector<bool> src_vec = {
-        false, true, false, true, false, true, false, true, false,
-    };
-    paddle::framework::Tensor gpu_tensor;
-    paddle::platform::CUDAPlace place;
-    paddle::platform::CUDADeviceContext gpu_ctx(place);
-    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                             .GetAllocator(place, gpu_ctx.stream())
-                             .get());
-    gpu_ctx.PartialInitWithAllocator();
-    paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
-
-    std::vector<bool> dst;
-    paddle::framework::TensorToVector<bool>(gpu_tensor, gpu_ctx, &dst);
-
-    for (int i = 0; i < 3 * 3; ++i) {
-      EXPECT_EQ(src_vec[i], dst[i]);
-    }
+{
+  std::vector<bool> src_vec = {
+      false, true, false, true, false, true, false, true, false,
+  };
+  paddle::framework::Tensor gpu_tensor;
+  paddle::platform::CUDAPlace place;
+  paddle::platform::CUDADeviceContext gpu_ctx(place);
+  gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(place, gpu_ctx.stream())
+                           .get());
+  gpu_ctx.PartialInitWithAllocator();
+  paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
+
+  std::vector<bool> dst;
+  paddle::framework::TensorToVector<bool>(gpu_tensor, gpu_ctx, &dst);
+
+  for (int i = 0; i < 3 * 3; ++i) {
+    EXPECT_EQ(src_vec[i], dst[i]);
   }
+}
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
-  {
-    std::vector<bool> src_vec = {
-        false, true, false, true, false, true, false, true, false,
-    };
-    paddle::framework::Tensor npu_tensor;
-    paddle::platform::NPUPlace place(0);
-    paddle::platform::NPUDeviceContext npu_ctx(place);
-    paddle::framework::TensorFromVector<bool>(src_vec, npu_ctx, &npu_tensor);
-
-    std::vector<bool> dst;
-    paddle::framework::TensorToVector<bool>(npu_tensor, npu_ctx, &dst);
-
-    for (int i = 0; i < 3 * 3; ++i) {
-      EXPECT_EQ(src_vec[i], dst[i]);
-    }
+{
+  std::vector<bool> src_vec = {
+      false, true, false, true, false, true, false, true, false,
+  };
+  paddle::framework::Tensor npu_tensor;
+  paddle::platform::NPUPlace place(0);
+  paddle::platform::NPUDeviceContext npu_ctx(place);
+  paddle::framework::TensorFromVector<bool>(src_vec, npu_ctx, &npu_tensor);
+
+  std::vector<bool> dst;
+  paddle::framework::TensorToVector<bool>(npu_tensor, npu_ctx, &dst);
+
+  for (int i = 0; i < 3 * 3; ++i) {
+    EXPECT_EQ(src_vec[i], dst[i]);
   }
-#endif
 }
+#endif
+}  // TEST(TensorToVector, Tensor_bool)
 
 TEST(TensorFromDLPack, Tensor) {
   {
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 33533b1d10feb..b704ac4329dc8 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -43,8 +43,9 @@ void ThreadPool::Init() {
       num_threads = FLAGS_dist_threadpool_size;
       VLOG(1) << "set dist_threadpool_size to " << num_threads;
     }
-    PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
-                                          "The number of threads is 0."));
+    PADDLE_ENFORCE_GT(
+        num_threads, 0,
+        platform::errors::InvalidArgument("The number of threads is 0."));
     threadpool_.reset(new ThreadPool(num_threads));
   }
 }
diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
index 1278a0f0643f4..0b6e12967fe1b 100644
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/threadpool.h"
+
 #include <gtest/gtest.h>
+
 #include <atomic>
 
 namespace framework = paddle::framework;
diff --git a/paddle/fluid/framework/trainer.cc b/paddle/fluid/framework/trainer.cc
index b033f9a99d6d9..dc48a8f8d8f2f 100644
--- a/paddle/fluid/framework/trainer.cc
+++ b/paddle/fluid/framework/trainer.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/trainer.h"
+
 #include "io/fs.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index 1f1122d32f5c3..48ea9143d621a 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/trainer_factory.h"
 
 #include <stdlib.h>
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/framework/trainer_test.cc b/paddle/fluid/framework/trainer_test.cc
index f689679d48696..1f4a162f90616 100644
--- a/paddle/fluid/framework/trainer_test.cc
+++ b/paddle/fluid/framework/trainer_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/trainer.h"
+
 #include <gtest/gtest.h>
 
 namespace paddle {
@@ -23,5 +24,5 @@ TEST() {
   // create dataset
   // train for a while
 }
-}
-}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 0937d96ad4c20..5feedb2c3d670 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/variant.h"
 #include "paddle/utils/small_vector.h"
diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc
index 2f03dc41ce002..43c44ff525fca 100644
--- a/paddle/fluid/framework/unused_var_check.cc
+++ b/paddle/fluid/framework/unused_var_check.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/unused_var_check.h"
 
 #include <glog/logging.h>
+
 #include <string>
 
 #include "gflags/gflags.h"
diff --git a/paddle/fluid/framework/unused_var_check.h b/paddle/fluid/framework/unused_var_check.h
index 95f6917fbcde7..cc4977e439c4c 100644
--- a/paddle/fluid/framework/unused_var_check.h
+++ b/paddle/fluid/framework/unused_var_check.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <glog/logging.h>
+
 #include <string>
 #include <unordered_set>
 
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
index 0a24efd003bcf..3a3edc9b4c64e 100644
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -318,18 +318,20 @@ void VarDesc::SetAttr(const std::string &name, const Attribute &v) {
   bool valid = attr_type == proto::AttrType::INT ||
                attr_type == proto::AttrType::STRING ||
                attr_type == proto::AttrType::INTS;
-  PADDLE_ENFORCE_EQ(valid, true, platform::errors::InvalidArgument(
-                                     "The value for attr (%s) must be "
-                                     "one of list or int or string.",
-                                     name));
+  PADDLE_ENFORCE_EQ(
+      valid, true,
+      platform::errors::InvalidArgument("The value for attr (%s) must be "
+                                        "one of list or int or string.",
+                                        name));
 
   this->attrs_[name] = v;
 }
 
 Attribute VarDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
-                                          "Attribute %s is not found.", name));
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound("Attribute %s is not found.", name));
   return it->second;
 }
 
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index 5483ef01c0844..ce489a57a019e 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -245,9 +245,12 @@ TEST(InferVarType, multiple_api) {
   ASSERT_ANY_THROW(infer.SetDataTypes(&ctx, "test2_a_out", {}));
 
   ASSERT_EQ(0u, infer.GetShape(&ctx, "test2_a_out").size());
-  infer.SetShape(&ctx, "test2_a_out", {
-                                          1, 3, 3,
-                                      });
+  infer.SetShape(&ctx, "test2_a_out",
+                 {
+                     1,
+                     3,
+                     3,
+                 });
   ASSERT_EQ(3u, infer.GetShape(&ctx, "test2_a_out").size());
 
   ASSERT_EQ(0, infer.GetLoDLevel(&ctx, "test2_a_out"));
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
index 401ccb03d78d6..345928666bd52 100644
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/var_type_traits.h"
+
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/scope.h"
@@ -25,6 +26,7 @@
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
 #include <cudnn.h>
+
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
@@ -41,6 +43,8 @@
 #include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
 #endif
 
+#include "paddle/fluid/operators/cuda_graph_with_in_out.h"
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 9fe67e1dcdff3..463331494d908 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -87,6 +87,8 @@ namespace operators {
 
 class CudnnRNNCache;
 
+class CUDAGraphWithInOuts;
+
 namespace reader {
 class LoDTensorBlockingQueueHolder;
 class OrderedMultiDeviceLoDTensorBlockingQueueHolder;
@@ -189,7 +191,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
 #if defined(PADDLE_WITH_CNCL)
     cnclCliqueId,
 #endif
-    int, float, Vocab>;
+    std::vector<std::unique_ptr<operators::CUDAGraphWithInOuts>>, int, float,
+    Vocab>;
 template <typename T>
 struct VarTypeTrait {
   static_assert(VarTypeRegistry::IsRegistered<T>(), "Must be registered type");
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
index 00ae5154f83ab..4a81f66948de3 100644
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/var_type_traits.h"
+
 #include <gtest/gtest.h>
 
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
-#include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #ifdef PADDLE_WITH_CUDA
 #if defined(PADDLE_WITH_NCCL)
diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc
index 92042e4725986..c01bef79cdccd 100644
--- a/paddle/fluid/framework/version.cc
+++ b/paddle/fluid/framework/version.cc
@@ -24,7 +24,7 @@ bool IsProgramVersionSupported(int64_t version) {
    * new version. The compatibility judgment cannot be made only
    * by the version number. Please do not use this interface,
    * it may be discarded because backward compatibility.
-  */
+   */
   return true;
 }
 
@@ -33,7 +33,7 @@ bool IsTensorVersionSupported(uint32_t version) {
    * new version. The compatibility judgment cannot be made only
    * by the version number. Please do not use this interface,
    * it may be discarded because backward compatibility.
-  */
+   */
   return true;
 }
 
diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc
index ec5a340ee6ef3..7c52209981ff9 100644
--- a/paddle/fluid/framework/version_test.cc
+++ b/paddle/fluid/framework/version_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/version.h"
+
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 92af1901b71ab..eaf0a09541d77 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,65 +1,214 @@
-cc_library(imperative_flag SRCS flags.cc DEPS gflags flags)
-cc_library(var_helper SRCS var_helper.cc DEPS tensor phi_api)
-IF(WITH_XPU)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi_utils var_helper)
-ELSE()
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi_utils var_helper)
-ENDIF()
-cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper phi_api)
+cc_library(
+  imperative_flag
+  SRCS flags.cc
+  DEPS gflags flags)
+cc_library(
+  var_helper
+  SRCS var_helper.cc
+  DEPS tensor phi_api)
+if(WITH_XPU)
+  cc_library(
+    prepared_operator
+    SRCS prepared_operator.cc
+    DEPS xpu_op_list
+         proto_desc
+         operator
+         device_context
+         lod_tensor
+         selected_rows_utils
+         var_type_traits
+         op_kernel_type
+         data_transform
+         nan_inf_utils
+         phi_api
+         phi_utils
+         var_helper)
+else()
+  cc_library(
+    prepared_operator
+    SRCS prepared_operator.cc
+    DEPS proto_desc
+         operator
+         device_context
+         lod_tensor
+         selected_rows_utils
+         var_type_traits
+         op_kernel_type
+         data_transform
+         nan_inf_utils
+         phi_api
+         phi_utils
+         var_helper)
+endif()
+cc_library(
+  layer
+  SRCS layer.cc
+  DEPS prepared_operator
+       math_function
+       imperative_flag
+       variable_helper
+       op_registry
+       var_helper
+       phi_api)
 add_subdirectory(jit)
-if (WITH_GPU)
-cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info phi_gpu_info)
+if(WITH_GPU)
+  cc_library(
+    layout_autotune
+    SRCS layout_autotune.cc
+    DEPS op_info phi_gpu_info)
 else()
-cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info)
+  cc_library(
+    layout_autotune
+    SRCS layout_autotune.cc
+    DEPS op_info)
 endif()
-cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper)
-cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper layout_autotune)
-cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune)
-cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune)
-cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
+cc_library(
+  amp
+  SRCS amp_auto_cast.cc
+  DEPS layer var_helper)
+cc_library(
+  tracer
+  SRCS tracer.cc
+  DEPS layer
+       engine
+       program_desc_tracer
+       amp
+       denormal
+       garbage_collector
+       var_helper
+       layout_autotune)
+cc_library(
+  basic_engine
+  SRCS basic_engine.cc
+  DEPS layer gradient_accumulator switch_autotune)
+cc_library(
+  engine
+  SRCS basic_engine.cc partial_grad_engine.cc
+  DEPS layer gradient_accumulator switch_autotune)
+cc_library(
+  imperative_profiler
+  SRCS profiler.cc
+  DEPS flags)
 if(NOT WIN32)
-    if(WITH_NCCL OR WITH_RCCL)
-        cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows_utils tensor)
-        cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits)
-        if(WITH_NCCL)
-            nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce)
-        endif()
-        if(WITH_RCCL)
-            hip_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce)
-        endif()
-    endif()
-    if(WITH_XPU_BKCL)
-        cc_library(bkcl_context SRCS bkcl_context.cc DEPS collective_helper device_context tensor var_type_traits)
-        cc_library(reducer SRCS reducer.cc DEPS layer)
+  if(WITH_NCCL OR WITH_RCCL)
+    cc_library(
+      imperative_all_reduce
+      SRCS all_reduce.cc
+      DEPS collective_helper device_context selected_rows_utils tensor)
+    cc_library(
+      nccl_context
+      SRCS nccl_context.cc
+      DEPS collective_helper device_context imperative_all_reduce
+           var_type_traits)
+    if(WITH_NCCL)
+      nv_library(
+        reducer
+        SRCS reducer.cc reducer.cu
+        DEPS layer imperative_all_reduce)
     endif()
-    if(WITH_ASCEND_CL)
-        cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits)
-        cc_library(reducer SRCS reducer.cc DEPS layer)
+    if(WITH_RCCL)
+      hip_library(
+        reducer
+        SRCS reducer.cc reducer.cu
+        DEPS layer imperative_all_reduce)
     endif()
-    if(WITH_CNCL)
-        cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits)
-	cc_library(reducer SRCS reducer.cc DEPS layer)
-    endif()
-    if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL)
-        cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits)
-    endif()
-    cc_library(data_loader SRCS data_loader.cc DEPS enforce)
+  endif()
+  if(WITH_XPU_BKCL)
+    cc_library(
+      bkcl_context
+      SRCS bkcl_context.cc
+      DEPS collective_helper device_context tensor var_type_traits)
+    cc_library(
+      reducer
+      SRCS reducer.cc
+      DEPS layer)
+  endif()
+  if(WITH_ASCEND_CL)
+    cc_library(
+      hccl_context
+      SRCS hccl_context.cc
+      DEPS collective_helper device_context tensor var_type_traits)
+    cc_library(
+      reducer
+      SRCS reducer.cc
+      DEPS layer)
+  endif()
+  if(WITH_CNCL)
+    cc_library(
+      cncl_context
+      SRCS cncl_context.cc
+      DEPS collective_helper device_context tensor var_type_traits)
+    cc_library(
+      reducer
+      SRCS reducer.cc
+      DEPS layer)
+  endif()
+  if(WITH_NCCL
+     OR WITH_RCCL
+     OR WITH_XPU_BKCL
+     OR WITH_ASCEND_CL)
+    cc_library(
+      heter_ccl_context
+      SRCS heter_ccl_context.cc
+      DEPS collective_helper device_context tensor var_type_traits)
+  endif()
+  cc_library(
+    data_loader
+    SRCS data_loader.cc
+    DEPS enforce)
 endif(NOT WIN32)
 if(WITH_GLOO)
-    cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits)
-    if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) ))
-        cc_library(reducer SRCS reducer.cc DEPS layer)
-    endif()
+  cc_library(
+    imperative_gloo_context
+    SRCS gloo_context.cc
+    DEPS collective_helper device_context tensor var_type_traits)
+  if(WIN32
+     OR (NOT
+         (WITH_NCCL
+          OR WITH_RCCL
+          OR WITH_XPU_BKCL
+          OR WITH_ASCEND_CL
+          OR WITH_CNCL)
+        ))
+    cc_library(
+      reducer
+      SRCS reducer.cc
+      DEPS layer)
+  endif()
 endif()
 
 if(WITH_MLU)
-    SET(MLU_DEPS mlu_baseop)
+  set(MLU_DEPS mlu_baseop)
 endif()
 
 if(NOT WITH_ASCEND_CL)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS})
+  cc_library(
+    gradient_accumulator
+    SRCS gradient_accumulator.cc
+    DEPS blas
+         operator
+         lod_tensor
+         selected_rows_utils
+         selected_rows_functor
+         var_type_traits
+         layer
+         math_function
+         phi_tensor
+         ${MLU_DEPS})
 else()
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor)
+  cc_library(
+    gradient_accumulator
+    SRCS gradient_accumulator.cc
+    DEPS blas
+         operator
+         lod_tensor
+         selected_rows_utils
+         selected_rows_functor
+         var_type_traits
+         layer
+         math_function
+         npu_op_runner
+         phi_tensor)
 endif()
 
 add_subdirectory(tests)
diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc
index 436e22f00c303..f6484d5cdda08 100644
--- a/paddle/fluid/imperative/all_reduce.cc
+++ b/paddle/fluid/imperative/all_reduce.cc
@@ -15,6 +15,7 @@
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 
 #include "paddle/fluid/imperative/all_reduce.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 
 #ifdef PADDLE_WITH_NCCL
diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index 3f6863d642cc8..ff6e297ba8003 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/amp_auto_cast.h"
+
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/imperative/type_defs.h"
@@ -302,9 +304,8 @@ static inline framework::proto::VarType::Type GetPromoteType(
   // dtype of input(X)
   if (op_type == "moving_average_abs_max_scale") {
     for (const auto& pair : ins) {
-      if (pair.first == "X" &&
-          GetDataType<VarType>(pair.second.front()) ==
-              framework::proto::VarType::FP16) {
+      if (pair.first == "X" && GetDataType<VarType>(pair.second.front()) ==
+                                   framework::proto::VarType::FP16) {
         dst_type = framework::proto::VarType::FP16;
       }
     }
diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h
index 49761a8df0b6b..fcc30b2590a6c 100644
--- a/paddle/fluid/imperative/basic_engine.h
+++ b/paddle/fluid/imperative/basic_engine.h
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 
diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc
index 11abbfe7cf6a3..9990fde95ce64 100644
--- a/paddle/fluid/imperative/bkcl_context.cc
+++ b/paddle/fluid/imperative/bkcl_context.cc
@@ -14,13 +14,14 @@
 
 #if defined(PADDLE_WITH_XPU_BKCL)
 
+#include "paddle/fluid/imperative/bkcl_context.h"
+
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/imperative/bkcl_context.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -46,10 +47,11 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst,
   auto bkcl_dtype =
       platform::ToBKCLDataType(framework::TransToProtoVarType(src.dtype()));
 
-  PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), src_ptr, dst_ptr, src.numel(),
-                                    bkcl_dtype, BKCL_ADD, stream),
-                    BKCL_SUCCESS, platform::errors::PreconditionNotMet(
-                                      "BKCL all reduce failed"));
+  PADDLE_ENFORCE_EQ(
+      bkcl_all_reduce(comm->comm(), src_ptr, dst_ptr, src.numel(), bkcl_dtype,
+                      BKCL_ADD, stream),
+      BKCL_SUCCESS,
+      platform::errors::PreconditionNotMet("BKCL all reduce failed"));
 }
 /*
 Baidu Kunlun Communication Library(BKCL) is designed for multi Baidu Kunlun
diff --git a/paddle/fluid/imperative/cncl_context.cc b/paddle/fluid/imperative/cncl_context.cc
index 779b748c2d2d4..19f22e7402989 100644
--- a/paddle/fluid/imperative/cncl_context.cc
+++ b/paddle/fluid/imperative/cncl_context.cc
@@ -18,14 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
-
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/gen_comm_id_helper.h"
-#include "paddle/fluid/platform/place.h"
-
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/mlu/cncl_helper.h"
 #include "paddle/fluid/platform/device/mlu/mlu_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
@@ -184,8 +182,9 @@ paddle::platform::DeviceContext *CNCLParallelContext::GetDeviceContext(
 }
 
 void CNCLParallelContext::WaitCompute(int ring_id) {
-  PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange(
-                                    "ring id must >= 0, but got %d", ring_id));
+  PADDLE_ENFORCE_GE(
+      ring_id, 0,
+      platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id));
   PADDLE_ENFORCE_LT(ring_id, compute_events_.size(),
                     platform::errors::OutOfRange(
                         "ring id must < compute events size,"
@@ -205,8 +204,9 @@ void CNCLParallelContext::WaitCompute(int ring_id) {
 }
 
 void CNCLParallelContext::WaitComm(int ring_id) {
-  PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange(
-                                    "ring id must >= 0, but got %d", ring_id));
+  PADDLE_ENFORCE_GE(
+      ring_id, 0,
+      platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id));
   PADDLE_ENFORCE_LT(ring_id, comm_events_.size(),
                     platform::errors::OutOfRange(
                         "ring id must < comm events size,"
diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc
index c43149c9b563e..66eed2981062a 100644
--- a/paddle/fluid/imperative/data_loader.cc
+++ b/paddle/fluid/imperative/data_loader.cc
@@ -19,6 +19,7 @@
 #include <stdlib.h>
 #include <sys/wait.h>
 #include <unistd.h>
+
 #include <csignal>
 
 #include "glog/logging.h"
diff --git a/paddle/fluid/imperative/data_loader.h b/paddle/fluid/imperative/data_loader.h
index fdfa117eafe76..e66a3b9edc3ff 100644
--- a/paddle/fluid/imperative/data_loader.h
+++ b/paddle/fluid/imperative/data_loader.h
@@ -17,6 +17,7 @@
 #ifndef _WIN32
 
 #include <unistd.h>
+
 #include <cstdint>
 #include <set>
 
diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h
index 124c31df73349..fe426a76b3292 100644
--- a/paddle/fluid/imperative/execution_context.h
+++ b/paddle/fluid/imperative/execution_context.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/framework/variable.h"
diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc
index c2d668eccdaf9..df424b32fcadf 100644
--- a/paddle/fluid/imperative/flags.cc
+++ b/paddle/fluid/imperative/flags.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/flags.h"
+
 #include "paddle/fluid/platform/flags.h"
 
 PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0,
diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc
index dd34b8b619f80..c5bcab4daa9a9 100644
--- a/paddle/fluid/imperative/gloo_context.cc
+++ b/paddle/fluid/imperative/gloo_context.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/gloo_context.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #include "paddle/fluid/framework/tensor_util.h"
diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h
index 23e4e02945bf6..5e0973e7e9913 100644
--- a/paddle/fluid/imperative/gloo_context.h
+++ b/paddle/fluid/imperative/gloo_context.h
@@ -16,6 +16,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/variable.h"
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index 499cf4d8ad6d8..36e6f551dc63b 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -874,8 +874,9 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
           }
 
           PADDLE_ENFORCE_EQ(var_info.var->Var().IsType<framework::LoDTensor>(),
-                            true, platform::errors::PermissionDenied(
-                                      "Gradient var must be LoDTensor"));
+                            true,
+                            platform::errors::PermissionDenied(
+                                "Gradient var must be LoDTensor"));
           if (CurCnt() == 0) {
             MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(),
                           var_info.unchange_input);
@@ -896,9 +897,10 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
           PADDLE_ENFORCE_EQ(
               var_info.var->Var().IsType<framework::LoDTensor>() ||
                   var_info.var->Var().IsType<phi::SelectedRows>(),
-              true, platform::errors::PermissionDenied("The type of Gradient "
-                                                       "var must be LoDTensor "
-                                                       "or SelectedRows"));
+              true,
+              platform::errors::PermissionDenied("The type of Gradient "
+                                                 "var must be LoDTensor "
+                                                 "or SelectedRows"));
           if (CurCnt() == 0) {
             MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(),
                           var_info.unchange_input);
diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h
index 03f6775defc2f..382623b627623 100644
--- a/paddle/fluid/imperative/gradient_accumulator.h
+++ b/paddle/fluid/imperative/gradient_accumulator.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/imperative/hooks.h"
 #include "paddle/fluid/imperative/layer.h"
diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc
index 31d988753f23c..8fb434cbc2aee 100644
--- a/paddle/fluid/imperative/hccl_context.cc
+++ b/paddle/fluid/imperative/hccl_context.cc
@@ -13,18 +13,16 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/hccl_context.h"
-#include "paddle/fluid/framework/convert_utils.h"
 
+#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
-
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-
 namespace paddle {
 namespace framework {
 class Variable;
@@ -193,8 +191,9 @@ paddle::platform::DeviceContext *HCCLParallelContext::GetDeviceContext(
 }
 
 void HCCLParallelContext::WaitCompute(int ring_id) {
-  PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange(
-                                    "ring id must >= 0, but got %d", ring_id));
+  PADDLE_ENFORCE_GE(
+      ring_id, 0,
+      platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id));
   PADDLE_ENFORCE_LT(ring_id, compute_events_.size(),
                     platform::errors::OutOfRange(
                         "ring id must < compute events size,"
@@ -214,8 +213,9 @@ void HCCLParallelContext::WaitCompute(int ring_id) {
 }
 
 void HCCLParallelContext::WaitComm(int ring_id) {
-  PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange(
-                                    "ring id must >= 0, but got %d", ring_id));
+  PADDLE_ENFORCE_GE(
+      ring_id, 0,
+      platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id));
   PADDLE_ENFORCE_LT(ring_id, comm_events_.size(),
                     platform::errors::OutOfRange(
                         "ring id must < comm events size,"
diff --git a/paddle/fluid/imperative/infer_var_type_context.h b/paddle/fluid/imperative/infer_var_type_context.h
index 297ec840db4c0..079e180c2a70d 100644
--- a/paddle/fluid/imperative/infer_var_type_context.h
+++ b/paddle/fluid/imperative/infer_var_type_context.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/imperative/type_defs.h"
diff --git a/paddle/fluid/imperative/jit/CMakeLists.txt b/paddle/fluid/imperative/jit/CMakeLists.txt
index 66f2a9840798c..bcc1c0746b823 100644
--- a/paddle/fluid/imperative/jit/CMakeLists.txt
+++ b/paddle/fluid/imperative/jit/CMakeLists.txt
@@ -1,2 +1,8 @@
-cc_library(op_desc_meta SRCS op_desc_meta.cc DEPS proto_desc layer)
-cc_library(program_desc_tracer SRCS program_desc_tracer.cc DEPS op_desc_meta)
+cc_library(
+  op_desc_meta
+  SRCS op_desc_meta.cc
+  DEPS proto_desc layer)
+cc_library(
+  program_desc_tracer
+  SRCS program_desc_tracer.cc
+  DEPS op_desc_meta)
diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc
index 35ff262fe3d86..e0f52beb6e555 100644
--- a/paddle/fluid/imperative/jit/program_desc_tracer.cc
+++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/jit/program_desc_tracer.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 76f64ab73a64b..7357db4e2001b 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -16,7 +16,6 @@
 
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/framework/convert_utils.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/infer_var_type_context.h"
 #include "paddle/fluid/imperative/op_base.h"
@@ -284,9 +283,10 @@ std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
   PADDLE_ENFORCE_EQ(
       Var().IsInitialized() && (Var().IsType<framework::LoDTensor>() ||
                                 Var().IsType<phi::SelectedRows>()),
-      true, platform::errors::InvalidArgument(
-                "Variable is not initialized or Variable's type is not "
-                "LoDTensor or SelectedRows when getting numpy tensor"));
+      true,
+      platform::errors::InvalidArgument(
+          "Variable is not initialized or Variable's type is not "
+          "LoDTensor or SelectedRows when getting numpy tensor"));
 
   if (Var().IsType<framework::LoDTensor>()) {
     auto& src_tensor = Var().Get<framework::LoDTensor>();
diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc
index ed0526eaad316..e936505b2ae03 100644
--- a/paddle/fluid/imperative/layout_autotune.cc
+++ b/paddle/fluid/imperative/layout_autotune.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/layout_autotune.h"
+
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/imperative/layout_transformer.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
@@ -119,8 +120,9 @@ paddle::imperative::NameVarMap<VarType> AutoTuneLayout(
         LayoutAutoTune::Instance().SetDesiredLayout(DataLayout::NHWC);
         VLOG(3) << "Tune the layout from "
                 << BOOST_GET_CONST(std::string, (*attrs)["data_format"])
-                << " to " << paddle::framework::DataLayoutToString(
-                                 LayoutAutoTune::Instance().GetDesiredLayout());
+                << " to "
+                << paddle::framework::DataLayoutToString(
+                       LayoutAutoTune::Instance().GetDesiredLayout());
       } else {
         LayoutAutoTune::Instance().DisableLayoutAutoTune();
         return ins;
diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h
index df3772b826da1..2da368910e6c3 100644
--- a/paddle/fluid/imperative/layout_autotune.h
+++ b/paddle/fluid/imperative/layout_autotune.h
@@ -14,8 +14,10 @@
 
 #pragma once
 #include <glog/logging.h>
+
 #include <memory>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/phi/common/layout.h"
 
diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index e9d987cc7045f..4a0dcb1b3bbea 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -22,6 +22,7 @@
 
 #ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
+
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 
@@ -159,8 +160,9 @@ paddle::platform::DeviceContext *NCCLParallelContext::GetDeviceContext(
 }
 
 void NCCLParallelContext::WaitCompute(int ring_id) {
-  PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange(
-                                    "ring id must >= 0, but got %d", ring_id));
+  PADDLE_ENFORCE_GE(
+      ring_id, 0,
+      platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id));
   PADDLE_ENFORCE_LT(ring_id, compute_events_.size(),
                     platform::errors::OutOfRange(
                         "ring id must < compute events size,"
@@ -185,8 +187,9 @@ void NCCLParallelContext::WaitCompute(int ring_id) {
 }
 
 void NCCLParallelContext::WaitComm(int ring_id) {
-  PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange(
-                                    "ring id must >= 0, but got %d", ring_id));
+  PADDLE_ENFORCE_GE(
+      ring_id, 0,
+      platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id));
   PADDLE_ENFORCE_LT(ring_id, comm_events_.size(),
                     platform::errors::OutOfRange(
                         "ring id must < comm events size,"
diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h
index b8a616ae67d21..ba0221a1729fa 100644
--- a/paddle/fluid/imperative/op_base.h
+++ b/paddle/fluid/imperative/op_base.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/saved_variable_wrapper_list.h"
 #include "paddle/fluid/imperative/type_defs.h"
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
index f2f64d92a23fc..a4baca6f25724 100644
--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -24,6 +24,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 #include "paddle/fluid/imperative/layer.h"
diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h
index b5da39f8d4237..4ec6cdb3fcd5d 100644
--- a/paddle/fluid/imperative/partial_grad_engine.h
+++ b/paddle/fluid/imperative/partial_grad_engine.h
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index cfd3813d60d44..ac99755786359 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -258,7 +258,7 @@ PreparedOp PrepareImpl(
 #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
         && !is_xpu_unsupport
 #endif
-        ) {
+    ) {
       VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
               << " | kernel key: " << pt_kernel_key
               << " | kernel: " << phi_kernel;
@@ -306,7 +306,7 @@ PreparedOp PrepareImpl(
 #if defined(PADDLE_WITH_XPU_KP)
       || (is_xpu_unsupport && !is_xpu_kp_support)
 #endif
-          ) {
+  ) {
     if (has_phi_kernel) {
       auto pt_cpu_kernel_key =
           FallBackToCpu(expected_kernel_key, pt_kernel_key, op);
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index ccc8d64517f95..0c2d70dfe3c82 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
@@ -28,8 +29,6 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/imperative/var_helper.h"
-
-#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_context.h"
 #include "paddle/phi/core/selected_rows.h"
diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc
index 48af63056c5e3..097f62fe42258 100644
--- a/paddle/fluid/imperative/profiler.cc
+++ b/paddle/fluid/imperative/profiler.cc
@@ -18,7 +18,9 @@
 #include "gperftools/profiler.h"
 #endif
 #include <glog/logging.h>
+
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/flags.h"
 
 PADDLE_DEFINE_EXPORTED_string(
diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h
index 2d7d319203833..f5951a52d718e 100644
--- a/paddle/fluid/imperative/py_layer_fwd.h
+++ b/paddle/fluid/imperative/py_layer_fwd.h
@@ -16,12 +16,12 @@
 
 #include <string>
 #include <vector>
-#include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/imperative/prepared_operator.h"
-#include "paddle/fluid/imperative/tracer.h"
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/prepared_operator.h"
+#include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/operators/py_layer_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index c7fd2215eb42a..47d7b6366f700 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -18,13 +18,10 @@
 
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/string/string_helper.h"
-
+#include "paddle/fluid/imperative/parallel_context.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
-
-#include "paddle/fluid/imperative/parallel_context.h"
-
+#include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/core/dense_tensor.h"
 namespace paddle {
 namespace imperative {
@@ -452,8 +449,9 @@ void Reducer::InitializeDenseGroups(
                           "Tensor %s is not initialized.", var_name));
     const auto size = lod_tensor->numel();
     PADDLE_ENFORCE_GT(
-        size, 0, platform::errors::PreconditionNotMet(
-                     "The number of tensor %s's elements is 0.", var_name));
+        size, 0,
+        platform::errors::PreconditionNotMet(
+            "The number of tensor %s's elements is 0.", var_name));
     all_length += size;
 
     p_group->length_.push_back(size);
diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h
index 9fac4b41cbde0..852d8cf076acb 100644
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <ThreadPool.h>
+
 #include <algorithm>
 #include <iostream>
 #include <map>
diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt
index 09de0106ed619..5084363b9c135 100644
--- a/paddle/fluid/imperative/tests/CMakeLists.txt
+++ b/paddle/fluid/imperative/tests/CMakeLists.txt
@@ -1,26 +1,108 @@
 if(WIN32)
-    cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context)
+  cc_test(
+    nccl_context_test
+    SRCS nccl_context_test.cc
+    DEPS device_context)
 else()
-    if (WITH_GLOO AND (WITH_NCCL OR WITH_RCCL))
-        cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
-        cc_test(heter_ccl_context_test SRCS heter_ccl_context_test.cc DEPS heter_ccl_context nccl_context imperative_gloo_context gloo_context gloo_wrapper gloo fs shell)
-        #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST")
-    endif()
-    if (WITH_XPU_BKCL)
-        cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context)
-    endif()
-    if (WITH_CNCL)
-        cc_test(cncl_context_test SRCS cncl_context_test.cc DEPS cncl_context)
-    endif()
+  if(WITH_GLOO AND (WITH_NCCL OR WITH_RCCL))
+    cc_test(
+      nccl_context_test
+      SRCS nccl_context_test.cc
+      DEPS nccl_context)
+    cc_test(
+      heter_ccl_context_test
+      SRCS heter_ccl_context_test.cc
+      DEPS heter_ccl_context
+           nccl_context
+           imperative_gloo_context
+           gloo_context
+           gloo_wrapper
+           gloo
+           fs
+           shell)
+    #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST")
+  endif()
+  if(WITH_XPU_BKCL)
+    cc_test(
+      bkcl_context_test
+      SRCS bkcl_context_test.cc
+      DEPS bkcl_context)
+  endif()
+  if(WITH_CNCL)
+    cc_test(
+      cncl_context_test
+      SRCS cncl_context_test.cc
+      DEPS cncl_context)
+  endif()
 endif(WIN32)
 
-
-cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function phi_tensor phi_api phi_api_utils)
-cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy)
-cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place)
-cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy)
-cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy)
-cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op)
-if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL)
-cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy)
+cc_test(
+  test_gradient_accmulator
+  SRCS test_gradient_accmulator.cc
+  DEPS memcpy
+       selected_rows_utils
+       selected_rows_functor
+       gradient_accumulator
+       math_function
+       phi_tensor
+       phi_api
+       phi_api_utils)
+cc_test(
+  test_layer
+  SRCS test_layer.cc
+  DEPS layer
+       proto_desc
+       operator
+       op_registry
+       variable_helper
+       mul_op
+       memcpy)
+cc_test(
+  test_prepare_op
+  SRCS test_prepare_op.cc
+  DEPS prepared_operator
+       op_info
+       split_op
+       layer
+       concat_and_split
+       activation_op
+       place)
+cc_test(
+  test_tracer
+  SRCS test_tracer.cc
+  DEPS tracer
+       layer
+       proto_desc
+       operator
+       op_registry
+       variable_helper
+       mul_op
+       reduce_sum_op
+       elementwise_add_op
+       memcpy)
+cc_test(
+  test_hooks
+  SRCS test_hooks.cc
+  DEPS tracer
+       basic_engine
+       layer
+       proto_desc
+       operator
+       op_registry
+       variable_helper
+       mul_op
+       elementwise_add_op
+       memcpy)
+cc_test(
+  test_eager
+  SRCS test_eager.cc
+  DEPS tracer layer prepared_operator mul_op)
+if(WITH_NCCL
+   OR WITH_RCCL
+   OR WITH_XPU_BKCL
+   OR WITH_CNCL)
+  cc_test(
+    test_group
+    SRCS test_group.cc
+    DEPS reducer concat_and_split memcpy)
 endif()
diff --git a/paddle/fluid/imperative/tests/bkcl_context_test.cc b/paddle/fluid/imperative/tests/bkcl_context_test.cc
index 580d86b1696bc..b4d299ba829d9 100644
--- a/paddle/fluid/imperative/tests/bkcl_context_test.cc
+++ b/paddle/fluid/imperative/tests/bkcl_context_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <thread>  // NOLINT
-
 #include "paddle/fluid/imperative/bkcl_context.h"
 
+#include <thread>  // NOLINT
+
 #include "gtest/gtest.h"
 
 namespace imperative = paddle::imperative;
diff --git a/paddle/fluid/imperative/tests/cncl_context_test.cc b/paddle/fluid/imperative/tests/cncl_context_test.cc
index 1d5ee8e7fc899..1019d4eacdc9f 100644
--- a/paddle/fluid/imperative/tests/cncl_context_test.cc
+++ b/paddle/fluid/imperative/tests/cncl_context_test.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/imperative/cncl_context.h"
+
 #include <thread>  // NOLINT
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/imperative/cncl_context.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 
-#include "gtest/gtest.h"
-
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
 namespace framework = paddle::framework;
diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc
index 91f38f82ed058..67059916d0317 100644
--- a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc
+++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/imperative/heter_ccl_context.h"
+
 #include <chrono>
 #include <thread>  // NOLINT
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/imperative/heter_ccl_context.h"
-
-#include "gtest/gtest.h"
 
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc
index 9ee083626c5b8..48479e1412b4b 100644
--- a/paddle/fluid/imperative/tests/nccl_context_test.cc
+++ b/paddle/fluid/imperative/tests/nccl_context_test.cc
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/imperative/nccl_context.h"
+
 #include <thread>  // NOLINT
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/imperative/nccl_context.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 
-#include "gtest/gtest.h"
-
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
 namespace framework = paddle::framework;
diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc
index 3def103ae9aa5..1d6ec7330756f 100644
--- a/paddle/fluid/imperative/tests/test_eager.cc
+++ b/paddle/fluid/imperative/tests/test_eager.cc
@@ -88,8 +88,9 @@ TEST(test_var_helper, eager_var_helper) {
       egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32,
                                           platform::CPUPlace()));
   SetCachedValue<egr::EagerVariable>(
-      egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32,
-                                          platform::CPUPlace()),
+      egr_tensor,
+      framework::OpKernelType(framework::proto::VarType::FP32,
+                              platform::CPUPlace()),
       egr_tensor2);
   ASSERT_ANY_THROW(GetPlace<egr::EagerVariable>(egr_tensor2));
   ASSERT_ANY_THROW(SetType<egr::EagerVariable>(
diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
index 88b18a4c17620..d2e768d6ef114 100644
--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
@@ -384,7 +384,7 @@ static void TestGradientAccumulatorTestUnchangeInput(
     for (auto use_tensor2 : use_tensors) {
       /** g_accum1 && g_accum2: has not been initialized
        *    test accumulate on this graph
-      */
+       */
       auto g_var1 = std::make_shared<VariableWrapper>("g_var1");
       g_var1->SetOverridedStopGradient(false);
       auto g_accum1 = CreateAccumulator(g_var1, sort_gradient);
@@ -437,7 +437,7 @@ static void TestGradientAccumulatorTestUnchangeInput(
 
       /** g_accum3 && g_accum4: has been initialized
        *    test accumulate on previous graph
-      */
+       */
       auto var3 = create_var(use_tensor1);
       auto var_wrapper3_3 = std::make_shared<VariableWrapper>("tmp1_3");
       auto var_wrapper4_3 = std::make_shared<VariableWrapper>("tmp2_3");
diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc
index 5e674af1a08a8..0025103c53196 100644
--- a/paddle/fluid/imperative/tests/test_group.cc
+++ b/paddle/fluid/imperative/tests/test_group.cc
@@ -14,8 +14,8 @@
 
 #include <sstream>
 #include <string>
-#include "gtest/gtest.h"
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/imperative/reducer.h"
 
 namespace paddle {
diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc
index 4cda3f32fdf3f..cfda7a0cac4e9 100644
--- a/paddle/fluid/imperative/tests/test_prepare_op.cc
+++ b/paddle/fluid/imperative/tests/test_prepare_op.cc
@@ -17,9 +17,11 @@
 //
 
 #include <paddle/fluid/framework/op_registry.h>
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/imperative/prepared_operator.h"
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 350263bc5457d..2295ea4bf67c9 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -12,10 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/imperative/tracer.h"
+
 #include <map>
 #include <set>
 #include <unordered_set>
 #include <utility>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/amp_auto_cast.h"
 #include "paddle/fluid/imperative/execution_context.h"
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 4e671d52457e2..b9048c4847075 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "ThreadPool.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/imperative/amp_auto_cast.h"
diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h
index 9ce456b1103b3..91788e73fa583 100644
--- a/paddle/fluid/imperative/var_helper.h
+++ b/paddle/fluid/imperative/var_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/variable.h"
 
 namespace egr {
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 633f481df808b..109cb5d8fe07d 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -17,19 +17,20 @@ if(WITH_TESTING)
   include(tests/test.cmake) # some generic cmake function for inference
 endif()
 
-cc_library(paddle_inference_io
-    SRCS io.cc
-    DEPS paddle_framework ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+cc_library(
+  paddle_inference_io
+  SRCS io.cc
+  DEPS paddle_framework ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
 
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
 add_subdirectory(utils)
-if (TENSORRT_FOUND)
+if(TENSORRT_FOUND)
   add_subdirectory(tensorrt)
 endif()
 
-if (WITH_LITE)
+if(WITH_LITE)
   add_subdirectory(lite)
 endif()
 
@@ -42,20 +43,30 @@ add_subdirectory(api)
 
 # Create static inference library if needed
 # All static libs in inference/api
-set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
-     zero_copy_tensor reset_tensor_array
-        analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
+set(STATIC_INFERENCE_API
+    paddle_inference_api
+    analysis_predictor
+    zero_copy_tensor
+    reset_tensor_array
+    analysis_config
+    paddle_pass_builder
+    activation_functions
+    ${mkldnn_quantizer_cfg})
 
 #windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy
 if(WIN32 AND WITH_GPU)
-  cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API} ${utils_modules})
+  cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API}
+                                   ${utils_modules})
 else()
-  create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules})
+  create_static_lib(paddle_inference ${fluid_modules} ${phi_modules}
+                    ${STATIC_INFERENCE_API} ${utils_modules})
 endif()
 
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
-  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.sym")
+  set(LINK_FLAGS
+      "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.sym"
+  )
   set_target_properties(paddle_inference PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
 endif()
 
@@ -63,7 +74,7 @@ endif()
 add_subdirectory(capi_exp)
 
 if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
-    add_subdirectory(tests/api)
+  add_subdirectory(tests/api)
 endif()
 
 set(SHARED_INFERENCE_SRCS
@@ -80,43 +91,53 @@ set(SHARED_INFERENCE_SRCS
     ${PADDLE_CUSTOM_OP_SRCS})
 
 # shared inference library deps
-set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor ${utils_modules})
+set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor
+                          ${utils_modules})
 
-if (WITH_CRYPTO) 
-    set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)
-endif (WITH_CRYPTO)
+if(WITH_CRYPTO)
+  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)
+endif(WITH_CRYPTO)
 
-if (WITH_PSCORE)
-    set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service tensor_table)
-endif ()
+if(WITH_PSCORE)
+  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service
+                            tensor_table)
+endif()
 
-if (WITH_ONNXRUNTIME)
-  set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} 
-      ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc
-  )
-endif (WITH_ONNXRUNTIME)
+if(WITH_ONNXRUNTIME)
+  set(SHARED_INFERENCE_SRCS
+      ${SHARED_INFERENCE_SRCS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc)
+endif(WITH_ONNXRUNTIME)
 
 # Create shared inference library
-cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    DEPS ${SHARED_INFERENCE_DEPS})
+cc_library(
+  paddle_inference_shared SHARED
+  SRCS ${SHARED_INFERENCE_SRCS}
+  DEPS ${SHARED_INFERENCE_DEPS})
 
 get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
 target_link_libraries(paddle_inference_shared ${os_dependency_modules})
 if(WIN32)
-    target_link_libraries(paddle_inference_shared gflags)
+  target_link_libraries(paddle_inference_shared gflags)
 endif()
 
-set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME paddle_inference)
+set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME
+                                                         paddle_inference)
 if(NOT APPLE AND NOT WIN32)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
-  if (WITH_CUSTOM_DEVICE)
-    set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_custom_device.map")
+  if(WITH_CUSTOM_DEVICE)
+    set(LINK_FLAGS
+        "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_custom_device.map"
+    )
   else()
-    set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map")
+    set(LINK_FLAGS
+        "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map")
   endif()
-  set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS
+                                                           "${LINK_FLAGS}")
   # check symbol hidden
-  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+  file(
+    WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
     "execute_process(COMMAND sh -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
     " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference.so\" RESULT_VARIABLE symbol_res)\n"
     "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
@@ -126,5 +147,6 @@ if(NOT APPLE AND NOT WIN32)
     OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
     COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
     DEPENDS paddle_inference_shared)
-  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
+  add_custom_target(check_symbol ALL
+                    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 3d1a467565c84..f374c5c7cc20f 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -1,72 +1,112 @@
 unset(analysis_deps CACHE)
 set(analysis_deps # analysis_deps can be extended across the project
-        framework_proto proto_desc graph pass paddle_inference_io executor pretty_log
-        ir_pass_manager
-        CACHE INTERNAL "")
+    framework_proto
+    proto_desc
+    graph
+    pass
+    paddle_inference_io
+    executor
+    pretty_log
+    ir_pass_manager
+    CACHE INTERNAL "")
 
 add_subdirectory(ir_passes)
 add_subdirectory(passes)
 
-cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_inference_io)
+cc_library(
+  analysis_helper
+  SRCS helper.cc
+  DEPS framework_proto proto_desc graph paddle_inference_io)
 
-cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper)
+cc_library(
+  ir_pass_manager
+  SRCS ir_pass_manager.cc
+  DEPS graph pass ${INFER_IR_PASSES} analysis_helper)
 
-cc_library(argument INTERFACE SRCS argument.cc DEPS scope proto_desc)
-cc_library(analysis_pass INTERFACE SRCS analysis_pass.cc DEPS proto_desc)
+cc_library(
+  argument INTERFACE
+  SRCS argument.cc
+  DEPS scope proto_desc)
+cc_library(
+  analysis_pass INTERFACE
+  SRCS analysis_pass.cc
+  DEPS proto_desc)
 
-cc_library(analysis SRCS analyzer.cc
-  DEPS ${analysis_deps} analysis_helper
-  analysis_pass ${INFER_IR_PASSES}
-  )
+cc_library(
+  analysis
+  SRCS analyzer.cc
+  DEPS ${analysis_deps} analysis_helper analysis_pass ${INFER_IR_PASSES})
 
 function(inference_analysis_test_build TARGET)
   if(WITH_TESTING)
-     set(options "")
-     set(oneValueArgs "")
-     set(multiValueArgs SRCS EXTRA_DEPS)
-     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-     inference_base_test_build(${TARGET}
-             SRCS ${analysis_test_SRCS}
-             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS})
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS EXTRA_DEPS)
+    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    inference_base_test_build(
+      ${TARGET}
+      SRCS
+      ${analysis_test_SRCS}
+      DEPS
+      analysis
+      pass
+      ${GLOB_PASS_LIB}
+      ${analysis_test_EXTRA_DEPS})
   endif()
 endfunction()
 
 function(inference_analysis_test_run TARGET)
   if(WITH_TESTING)
-     set(options "")
-     set(oneValueArgs "")
-     set(multiValueArgs COMMAND ARGS)
-     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-     inference_base_test_run(${TARGET}
-	     COMMAND ${analysis_test_COMMAND}
-             ARGS ${analysis_test_ARGS})
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs COMMAND ARGS)
+    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    inference_base_test_run(${TARGET} COMMAND ${analysis_test_COMMAND} ARGS
+                            ${analysis_test_ARGS})
   endif()
 endfunction()
 
 function(inference_analysis_test TARGET)
   if(WITH_TESTING)
-     set(options "")
-     set(oneValueArgs "")
-     set(multiValueArgs SRCS ARGS EXTRA_DEPS)
-     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-     inference_base_test_build(${TARGET}
-             SRCS ${analysis_test_SRCS}
-             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS})
-     inference_base_test_run(${TARGET}
-	     COMMAND ${TARGET}
-             ARGS ${analysis_test_ARGS})
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS ARGS EXTRA_DEPS)
+    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+    inference_base_test_build(
+      ${TARGET}
+      SRCS
+      ${analysis_test_SRCS}
+      DEPS
+      analysis
+      pass
+      ${GLOB_PASS_LIB}
+      ${analysis_test_EXTRA_DEPS})
+    inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS
+                            ${analysis_test_ARGS})
   endif()
 endfunction(inference_analysis_test)
 
-
-if (NOT APPLE AND NOT WIN32)
-  inference_analysis_test(test_analyzer
-    SRCS analyzer_tester.cc
-    EXTRA_DEPS reset_tensor_array paddle_inference_shared
-    ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
+if(NOT APPLE AND NOT WIN32)
+  inference_analysis_test(
+    test_analyzer
+    SRCS
+    analyzer_tester.cc
+    EXTRA_DEPS
+    reset_tensor_array
+    paddle_inference_shared
+    ARGS
+    --inference_model_dir=${WORD2VEC_MODEL_DIR})
 elseif(WIN32)
-    inference_analysis_test(test_analyzer
-      SRCS analyzer_tester.cc
-      EXTRA_DEPS reset_tensor_array paddle_inference_api
-      ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR})
+  inference_analysis_test(
+    test_analyzer
+    SRCS
+    analyzer_tester.cc
+    EXTRA_DEPS
+    reset_tensor_array
+    paddle_inference_api
+    ARGS
+    --inference_model_dir=${WORD2VEC_MODEL_DIR})
 endif()
diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
index 14a1c3eea3417..a95498d82d0e6 100644
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <glog/logging.h>
+
 #include <iosfwd>
 #include <string>
 
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index be7d6ab868022..2b56f8e00d644 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/analyzer.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/analysis/passes/passes.h"
 #include "paddle/fluid/string/pretty_log.h"
 
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 4db54706285d4..95a985158e678 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -37,6 +37,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/inference/analysis/flags.h"
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 3f96fd69e4ee1..84fcd4e3c396f 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/analyzer.h"
-
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 2336fd1980d2e..07b7b37485956 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -216,8 +216,12 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
-  DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool);
   DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_transformer_posid, TensorRtTransformerPosid,
+                      std::string);
+  DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid, TensorRtTransformerMaskid,
+                      std::string);
   DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
                       TensorRtShapeRangeInfoPath, std::string);
   DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h
index 6d883f558709b..619e3461d3ea5 100644
--- a/paddle/fluid/inference/analysis/dot.h
+++ b/paddle/fluid/inference/analysis/dot.h
@@ -20,6 +20,7 @@
 #pragma once
 
 #include <glog/logging.h>
+
 #include <sstream>
 #include <string>
 #include <unordered_map>
diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc
index c785a312bf96c..0b669093a1f32 100644
--- a/paddle/fluid/inference/analysis/dot_tester.cc
+++ b/paddle/fluid/inference/analysis/dot_tester.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/dot.h"
-
 #include <gtest/gtest.h>
+
 #include <memory>
 
+#include "paddle/fluid/inference/analysis/dot.h"
+
 namespace paddle {
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 88ae61ff1fc98..f952016516184 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <sys/stat.h>
+
 #include <cstdio>
 #include <fstream>
 #include <memory>
@@ -72,8 +73,9 @@ struct DataTypeNamer {
   template <typename T>
   const std::string &repr() const {
     auto x = std::type_index(typeid(T));
-    PADDLE_ENFORCE_GT(dic_.count(x), 0, platform::errors::PreconditionNotMet(
-                                            "unknown type for representation"));
+    PADDLE_ENFORCE_GT(dic_.count(x), 0,
+                      platform::errors::PreconditionNotMet(
+                          "unknown type for representation"));
     return dic_.at(x);
   }
 
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index aafbe57e05ff2..6c74d7b738cf6 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+
 #include <map>
 #include <memory>
 #include <string>
@@ -20,6 +21,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
@@ -29,8 +31,8 @@
 namespace paddle {
 namespace inference {
 namespace analysis {
-using string::PrettyLogEndl;
 using string::PrettyLog;
+using string::PrettyLogEndl;
 using string::Style;
 
 IRPassManager::IRPassManager(Argument *argument) {
@@ -55,9 +57,13 @@ void IRPassManager::CreatePasses(Argument *argument,
   int pass_num = 0;
   for (const std::string &pass_name : passes) {
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
-    pass->Set("use_oss", new bool(argument->tensorrt_use_oss()));
+    pass->Set("use_varseqlen", new bool(argument->tensorrt_use_varseqlen()));
     pass->Set("with_interleaved",
               new bool(argument->tensorrt_with_interleaved()));
+    pass->Set("tensorrt_transformer_posid",
+              new std::string(argument->tensorrt_transformer_posid()));
+    pass->Set("tensorrt_transformer_maskid",
+              new std::string(argument->tensorrt_transformer_maskid()));
     pass->Set("disable_logs", new bool(argument->disable_logs()));
     auto precision_mode = argument->tensorrt_precision_mode();
     bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8;
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h
index 823dc8907ea53..9f9a5fc347123 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.h
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.h
@@ -27,6 +27,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
index 7faef7d391f02..a7a561b7b37a1 100644
--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
@@ -1,34 +1,63 @@
-cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector)
+cc_library(
+  subgraph_util
+  SRCS subgraph_util.cc
+  DEPS subgraph_detector)
 
-if (WITH_GPU AND TENSORRT_FOUND)
-  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils)
+if(WITH_GPU AND TENSORRT_FOUND)
+  cc_library(
+    tensorrt_subgraph_pass
+    SRCS tensorrt_subgraph_pass.cc
+    DEPS subgraph_util tensorrt_op_teller infer_io_utils)
 
-  set(analysis_deps ${analysis_deps}
-          subgraph_util tensorrt_subgraph_pass
-          CACHE INTERNAL "")
+  set(analysis_deps
+      ${analysis_deps} subgraph_util tensorrt_subgraph_pass
+      CACHE INTERNAL "")
 
-  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp)
+  set(pass_file
+      ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp
+  )
   file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
-  set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
+  set(INFER_IR_PASSES
+      ${INFER_IR_PASSES} tensorrt_subgraph_pass
+      CACHE INTERNAL "")
 endif()
 
-if (WITH_LITE) 
-  cc_library(lite_subgraph_pass SRCS lite_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util lite_op_teller)
-  set(analysis_deps ${analysis_deps} subgraph_util lite_subgraph_pass CACHE INTERNAL "")
-  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp)
+if(WITH_LITE)
+  cc_library(
+    lite_subgraph_pass
+    SRCS lite_subgraph_pass.cc
+    DEPS ${analysis_deps} subgraph_util lite_op_teller)
+  set(analysis_deps
+      ${analysis_deps} subgraph_util lite_subgraph_pass
+      CACHE INTERNAL "")
+  set(pass_file
+      ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp
+  )
   file(APPEND ${pass_file} "USE_PASS(lite_subgraph_pass);\n")
-  set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "")
-  cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog)
+  set(INFER_IR_PASSES
+      ${INFER_IR_PASSES} lite_subgraph_pass
+      CACHE INTERNAL "")
+  cc_test(
+    lite_subgraph_pass_tester
+    SRCS lite_subgraph_pass_tester.cc
+    DEPS lite_subgraph_pass gtest glog)
 endif()
 
-MESSAGE("WITH_DLNNE:${WITH_DLNNE}")
+message("WITH_DLNNE:${WITH_DLNNE}")
 if(WITH_DLNNE)
-  cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util)
-  set(analysis_deps ${analysis_deps}
-        subgraph_util dlnne_subgraph_pass
-        CACHE INTERNAL "")
+  cc_library(
+    dlnne_subgraph_pass
+    SRCS dlnne_subgraph_pass.cc
+    DEPS ${analysis_deps} subgraph_util)
+  set(analysis_deps
+      ${analysis_deps} subgraph_util dlnne_subgraph_pass
+      CACHE INTERNAL "")
 
-  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp)
+  set(pass_file
+      ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp
+  )
   file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n")
-  set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "")
+  set(INFER_IR_PASSES
+      ${INFER_IR_PASSES} dlnne_subgraph_pass
+      CACHE INTERNAL "")
 endif()
diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc
index 8f789139af9bf..b2a07722829be 100644
--- a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc
@@ -11,19 +11,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <algorithm>
-#include <map>
-#include <set>
+#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h"
 
+#include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <map>
+#include <set>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/subgraph_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h"
-#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
@@ -52,18 +52,39 @@ using framework::ir::Node;
 
 void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const {
   static std::unordered_set<std::string> teller_set{
-      "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-      "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-      "elementwise_add", "elementwise_mul", "dropout", "prelu",
-      "conv2d_transpose", "leaky_relu",
+      "mul",
+      "matmul",
+      "conv2d",
+      "pool2d",
+      "relu",
+      "softmax",
+      "sigmoid",
+      "hard_swish",
+      "depthwise_conv2d",
+      "batch_norm",
+      "concat",
+      "tanh",
+      "pad",
+      "elementwise_add",
+      "elementwise_mul",
+      "dropout",
+      "prelu",
+      "conv2d_transpose",
+      "leaky_relu",
       // "fc",
-      "shuffle_channel", "swish", "split",
+      "shuffle_channel",
+      "swish",
+      "split",
       // "instance_norm",
       "gelu",
       // "layer_norm",
       // "scale",
       // "stack",
-      "relu6", "reshape2", "transpose2", "concat", "slice",
+      "relu6",
+      "reshape2",
+      "transpose2",
+      "concat",
+      "slice",
   };
 
   framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph);
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 083fc8991192e..b5ddacd440e25 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -12,7 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
+
 #include <algorithm>
+#include <fstream>
+#include <iostream>
 #include <map>
 #include <memory>
 #include <set>
@@ -21,28 +25,22 @@
 #include <unordered_set>
 #include <vector>
 
-#include <fstream>
-#include <iostream>
-
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/subgraph_detector.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/lite/engine.h"
 #include "paddle/fluid/inference/lite/op_teller.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/subgraph_detector.h"
-#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
 #include "paddle/fluid/string/pretty_log.h"
 
-#include "paddle/fluid/inference/lite/engine.h"
-
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-using framework::ir::Node;
 using framework::ir::Agent;
-using framework::ir::SubGraphFuser;
 using framework::ir::Graph;
+using framework::ir::Node;
+using framework::ir::SubGraphFuser;
 
 namespace lite {
 
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h
index e79a64f0f72cf..198a86c185bc6 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h
@@ -14,10 +14,12 @@
 
 #pragma once
 #include <paddle/fluid/framework/ir/fuse_pass_base.h>
+
 #include <memory>
 #include <set>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
 
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc
index 90ad7ec0b4437..8c88e2869cce3 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
 #include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/inference/lite/op_teller.h"
 
@@ -29,7 +30,7 @@ void AppendLiteSubBlocks(const std::vector<framework::OpDesc*>& subgraph_ops,
                          framework::ProgramDesc* engine_program,
                          framework::ProgramDesc* host_program,
                          const int32_t host_sub_id);
-}
+}  // namespace lite
 
 TEST(LiteSubgraphPass, basic) {
   framework::ProgramDesc host_program;
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index b73eb624db85b..394ce7799e8ee 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -377,12 +377,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
                   Get<int>("workspace_size"), precision_mode, calibrator.get(),
                   Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
                   opt_input_shape, disable_trt_plugin_fp16);
-  trt_engine->SetUseOSS(Get<bool>("use_oss"));
+  trt_engine->SetUseOSS(Get<bool>("use_varseqlen"));
   trt_engine->SetWithInterleaved(Get<bool>("with_interleaved"));
+  trt_engine->SetTransformerPosid(
+      Get<std::string>("tensorrt_transformer_posid"));
+  trt_engine->SetTransformerMaskid(
+      Get<std::string>("tensorrt_transformer_maskid"));
   trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
   trt_engine->SetDLACore(Get<int>("trt_dla_core"));
   trt_engine->SetUseInspector(Get<bool>("use_inspector"));
-  trt_engine->SetWithErnie(graph->Has(framework::ir::kMultiheadMatmulPass));
+  trt_engine->SetWithErnie(
+      graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
+      graph->Has(framework::ir::kMultiheadMatmulPass));
 
   if (use_static_engine) {
     trt_engine_serialized_data = GetTrtEngineSerializedData(
diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
index a950899a8a458..17bb8b6c62ab7 100644
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -1,28 +1,55 @@
-cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
-cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
-cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass)
-cc_library(inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass)
-IF(WITH_TESTING)
-  cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass gtest)
-ELSE()
-  cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass)
-ENDIF()
-
-cc_library(analysis_passes SRCS passes.cc DEPS
+cc_library(
   ir_graph_build_pass
+  SRCS ir_graph_build_pass.cc
+  DEPS analysis_pass argument ir_pass_manager)
+cc_library(
   ir_analysis_pass
+  SRCS ir_analysis_pass.cc
+  DEPS analysis_pass argument ir_pass_manager)
+cc_library(
+  memory_optim_pass
+  SRCS memory_optimize_pass.cc
+  DEPS analysis_pass zero_copy_tensor)
+cc_library(
   ir_params_sync_among_devices_pass
+  SRCS ir_params_sync_among_devices_pass.cc
+  DEPS analysis_pass argument ir_pass_manager)
+cc_library(
+  ir_graph_to_program_pass
+  SRCS ir_graph_to_program_pass.cc
+  DEPS analysis_pass graph_to_program_pass)
+cc_library(
   adjust_cudnn_workspace_size_pass
-  memory_optim_pass
+  SRCS adjust_cudnn_workspace_size_pass.cc
+  DEPS analysis_pass graph_to_program_pass)
+cc_library(
   inference_op_replace_pass
-  ir_graph_to_program_pass
-  ir_graph_clean_pass
-)
+  SRCS inference_op_replace_pass.cc
+  DEPS analysis_pass graph_to_program_pass)
+if(WITH_TESTING)
+  cc_library(
+    ir_graph_clean_pass
+    SRCS ir_graph_clean_pass.cc
+    DEPS analysis_pass gtest)
+else()
+  cc_library(
+    ir_graph_clean_pass
+    SRCS ir_graph_clean_pass.cc
+    DEPS analysis_pass)
+endif()
+
+cc_library(
+  analysis_passes
+  SRCS passes.cc
+  DEPS ir_graph_build_pass
+       ir_analysis_pass
+       ir_params_sync_among_devices_pass
+       adjust_cudnn_workspace_size_pass
+       memory_optim_pass
+       inference_op_replace_pass
+       ir_graph_to_program_pass
+       ir_graph_clean_pass)
 
-set(analysis_deps ${analysis_deps}
-        analysis_passes
-        subgraph_detector
-        CACHE INTERNAL "")
+set(analysis_deps
+    ${analysis_deps} analysis_passes subgraph_detector
+    CACHE INTERNAL "")
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
index 34192965297a6..05bda4e75c9bd 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+
 #include <memory>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
index 2c2113c06d917..fca431b5d7779 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
index 321716b1c8a18..fca5e2563424e 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/io.h"
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
index adbde0433fad2..e7ef23e791e9d 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
index 0f3633ca6fa4b..999fb4ad8d764 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
index 613eb04497e61..5b20667d62ab6 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 614eea24a0e2e..a0c7a94cd1b30 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index 3fa417c2ea631..70620e8692cd8 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -61,7 +61,8 @@ void MemoryOptimizePass::CollectLifeCycle(
     auto reads = op_node->inputs;
     auto writes = op_node->outputs;
 
-    std::vector<Node*> requires(reads.begin(), reads.end());
+    std::vector<Node*>
+    requires(reads.begin(), reads.end());
     requires.insert(requires.end(), writes.begin(), writes.end());
 
     // Disable reuse of feed variables.
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
index 8ca5ffa2581f1..5dcd8b1059ebc 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
@@ -35,16 +35,15 @@ namespace inference {
 namespace analysis {
 
 /* Memory optimization.
-* We will perform the following operation:
-* 1. Collect all var's lifetime.
-* 2. Make reuse plan: the vars can be reused if there is no overlap(on lifetime)
-* between
-* them.
-* The final plan is a mapping table in which the key represents the original
-* name of var and the value in the table represents the current name of var.
-* 3. Perform reuse plan: Replace all var's name in the model according to the
-* mapping table.
-*/
+ * We will perform the following operation:
+ * 1. Collect all var's lifetime.
+ * 2. Make reuse plan: the vars can be reused if there is no overlap(on
+ * lifetime) between them. The final plan is a mapping table in which the key
+ * represents the original name of var and the value in the table represents the
+ * current name of var.
+ * 3. Perform reuse plan: Replace all var's name in the model according to the
+ * mapping table.
+ */
 class MemoryOptimizePass : public AnalysisPass {
  public:
   using space_table_t = std::unordered_map<std::string, size_t>;
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
index ca0b25c29d495..19aab1a948dd2 100644
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/passes.h"
+
 #include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h"
 #include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
diff --git a/paddle/fluid/inference/analysis/passes/passes.h b/paddle/fluid/inference/analysis/passes/passes.h
index 8a13091d083e5..b3b240c280c96 100644
--- a/paddle/fluid/inference/analysis/passes/passes.h
+++ b/paddle/fluid/inference/analysis/passes/passes.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index 56565c8f3f72a..6c7690a4779bf 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <gtest/gtest.h>
+
 #include <fstream>
 #include <string>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/helper.h"
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 56cc4aa755bda..e25c5e963982f 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -14,7 +14,7 @@
 #
 
 if(APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 
 add_subdirectory(details)
@@ -22,76 +22,139 @@ add_subdirectory(details)
 if(WITH_MKLDNN)
   set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
   set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc)
-  cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
-  set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE)
+  cc_library(
+    ${mkldnn_quantizer_cfg}
+    SRCS mkldnn_quantizer_config.cc
+    DEPS lod_tensor paddle_pass_builder)
+  set(mkldnn_quantizer_cfg
+      ${mkldnn_quantizer_cfg}
+      PARENT_SCOPE)
 endif()
 
-cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc)
-cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
+cc_library(
+  analysis_config
+  SRCS analysis_config.cc
+  DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer
+       utf8proc)
+cc_library(
+  paddle_infer_contrib
+  SRCS paddle_infer_contrib.cc
+  DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
-set(paddle_inference_api_deps lod_tensor scope reset_tensor_array
-    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
+set(paddle_inference_api_deps
+    lod_tensor
+    scope
+    reset_tensor_array
+    analysis_config
+    paddle_infer_contrib
+    zero_copy_tensor
+    trainer_desc_proto
+    custom_operator)
 
 if(WITH_CRYPTO)
-    list(APPEND paddle_inference_api_deps paddle_crypto)
+  list(APPEND paddle_inference_api_deps paddle_crypto)
 endif()
 
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS ${paddle_inference_api_deps})
+cc_library(
+  paddle_inference_api
+  SRCS api.cc api_impl.cc helper.cc
+  DEPS ${paddle_inference_api_deps})
 
 if(WIN32)
-    target_link_libraries(paddle_inference_api gflags)
+  target_link_libraries(paddle_inference_api gflags)
 endif()
 
-set(inference_deps ${analysis_deps} paddle_inference_api analysis naive_executor ${GLOB_PASS_LIB})
+set(inference_deps ${analysis_deps} paddle_inference_api analysis
+                   naive_executor ${GLOB_PASS_LIB})
 
 if(WITH_GPU AND TENSORRT_FOUND)
-    set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
+  set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
 endif()
 
-if (WITH_ONNXRUNTIME)
-    cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
-              zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx)
-else (WITH_ONNXRUNTIME)
-    cc_library(analysis_predictor SRCS analysis_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
-              zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)
-endif (WITH_ONNXRUNTIME)
-
-
-cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
+if(WITH_ONNXRUNTIME)
+  cc_library(
+    analysis_predictor
+    SRCS analysis_predictor.cc onnxruntime_predictor.cc resource_manager.cc
+         infer_context.cc ${mkldnn_quantizer_src}
+    DEPS ${inference_deps}
+         zero_copy_tensor
+         ir_pass_manager
+         op_compatible_info
+         infer_io_utils
+         onnxruntime
+         paddle2onnx)
+else(WITH_ONNXRUNTIME)
+  cc_library(
+    analysis_predictor
+    SRCS analysis_predictor.cc resource_manager.cc infer_context.cc
+         ${mkldnn_quantizer_src}
+    DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info
+         infer_io_utils)
+endif(WITH_ONNXRUNTIME)
+
+cc_test(
+  test_paddle_inference_api
+  SRCS api_tester.cc
+  DEPS paddle_inference_api)
 
 if(WITH_TESTING)
-  if (NOT APPLE AND NOT WIN32)
-    if (WITH_GPU)
-      inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared
-        ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR})
+  if(NOT APPLE AND NOT WIN32)
+    if(WITH_GPU)
+      inference_base_test(
+        test_api_impl
+        SRCS
+        api_impl_tester.cc
+        DEPS
+        paddle_inference_shared
+        ARGS
+        --word2vec_dirname=${WORD2VEC_MODEL_DIR}
+        --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR})
     endif()
   elseif(WIN32)
-    inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
-      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR})
+    inference_base_test(
+      test_api_impl
+      SRCS
+      api_impl_tester.cc
+      DEPS
+      ${inference_deps}
+      ARGS
+      --word2vec_dirname=${WORD2VEC_MODEL_DIR}
+      --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR})
   endif()
 
 endif()
 
-if (NOT APPLE AND NOT WIN32)
-  cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared
-          ARGS --dirname=${WORD2VEC_MODEL_DIR})
-elseif (WIN32)
-  cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
-          ARGS --dirname=${WORD2VEC_MODEL_DIR})
+if(NOT APPLE AND NOT WIN32)
+  cc_test(
+    test_analysis_predictor
+    SRCS analysis_predictor_tester.cc
+    DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
+elseif(WIN32)
+  cc_test(
+    test_analysis_predictor
+    SRCS analysis_predictor_tester.cc
+    DEPS analysis_predictor benchmark ${inference_deps} ARGS
+         --dirname=${WORD2VEC_MODEL_DIR})
 endif()
 
 if(WITH_TESTING AND WITH_MKLDNN)
-  if (NOT APPLE AND NOT WIN32)
-    cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
-  elseif (WIN32)
-    cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
-            ARGS --dirname=${WORD2VEC_MODEL_DIR})
+  if(NOT APPLE AND NOT WIN32)
+    cc_test(
+      test_mkldnn_quantizer
+      SRCS mkldnn_quantizer_tester.cc
+      DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
+  elseif(WIN32)
+    cc_test(
+      test_mkldnn_quantizer
+      SRCS mkldnn_quantizer_tester.cc
+      DEPS analysis_predictor benchmark ${inference_deps} ARGS
+           --dirname=${WORD2VEC_MODEL_DIR})
   endif()
 endif()
 
 if(WITH_TESTING AND TEST test_api_impl)
-    if(NOT APPLE)
-        set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120)
-    endif()
+  if(NOT APPLE)
+    set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120)
+  endif()
 endif()
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 735e1b7be4c1f..c23397a082860 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -15,6 +15,7 @@
 #include <sstream>
 #include <string>
 #include <tuple>
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/table_printer.h"
@@ -256,8 +257,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(trt_dla_core_);
   CP_MEMBER(trt_use_static_engine_);
   CP_MEMBER(trt_use_calib_mode_);
-  CP_MEMBER(trt_use_oss_);
+  CP_MEMBER(trt_use_varseqlen_);
   CP_MEMBER(trt_with_interleaved_);
+  CP_MEMBER(tensorrt_transformer_posid_);
+  CP_MEMBER(tensorrt_transformer_maskid_);
   CP_MEMBER(trt_tuned_dynamic_shape_);
   CP_MEMBER(trt_allow_build_at_runtime_);
   CP_MEMBER(collect_shape_range_info_);
@@ -546,7 +549,7 @@ void AnalysisConfig::Exp_DisableTensorRtOPs(
   trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
 }
 
-void AnalysisConfig::EnableTensorRtOSS() { trt_use_oss_ = true; }
+void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; }
 
 // TODO(Superjomn) refactor this, buggy.
 void AnalysisConfig::Update() {
@@ -1034,9 +1037,13 @@ std::string AnalysisConfig::Summary() {
                                                         ? shape_range_info_path_
                                                         : "false"});
 
-      os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
+      os.InsertRow(
+          {"tensorrt_use_varseqlen", trt_use_varseqlen_ ? "true" : "false"});
       os.InsertRow({"tensorrt_with_interleaved",
                     trt_with_interleaved_ ? "true" : "false"});
+      os.InsertRow({"tensorrt_transformer_posid", tensorrt_transformer_posid_});
+      os.InsertRow(
+          {"tensorrt_transformer_maskid", tensorrt_transformer_maskid_});
       os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
       if (trt_use_dla_) {
         os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});
@@ -1099,8 +1106,9 @@ LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers(
                     platform::errors::InvalidArgument(
                         "model_cache_buffer should not be empty."));
   PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token),
-                    false, platform::errors::InvalidArgument(
-                               "model_cache_token has already been set."));
+                    false,
+                    platform::errors::InvalidArgument(
+                        "model_cache_token has already been set."));
 
   nnadapter_model_cache_buffers[model_cache_token] = model_cache_buffer;
   return *this;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 09a5bbddba87c..5f9051ff2fdb9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -83,9 +83,9 @@ namespace paddle {
 
 using inference::Singleton;
 #if PADDLE_WITH_TENSORRT
-using inference::tensorrt::TRTInt8Calibrator;
 using inference::tensorrt::TRTCalibratorEngine;
 using inference::tensorrt::TRTCalibratorEngineManager;
+using inference::tensorrt::TRTInt8Calibrator;
 #endif
 
 int AnalysisPredictor::clone_num_ = 1;
@@ -853,8 +853,10 @@ void AnalysisPredictor::PrepareArgument() {
   }
 
   argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
-  argument_.SetTensorRtUseOSS(config_.trt_use_oss_);
+  argument_.SetTensorRtUseOSS(config_.trt_use_varseqlen_);
   argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_);
+  argument_.SetTensorRtTransformerPosid(config_.tensorrt_transformer_posid_);
+  argument_.SetTensorRtTransformerMaskid(config_.tensorrt_transformer_maskid_);
   argument_.SetMinInputShape(config_.min_input_shape_);
   argument_.SetMaxInputShape(config_.max_input_shape_);
   argument_.SetOptimInputShape(config_.optim_input_shape_);
@@ -1025,8 +1027,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 }
 
 template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+    const AnalysisConfig &config) {
   // TODO(NHZlX): Should add the link to the doc of
   // paddle_infer::CreatePredictor<paddle_infer::Config>
   if (config.glog_info_disabled()) {
@@ -1803,6 +1806,9 @@ USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm)
 USE_TRT_CONVERTER(preln_skip_layernorm)
 USE_TRT_CONVERTER(roll)
 USE_TRT_CONVERTER(strided_slice)
+USE_TRT_CONVERTER(transformer_input_convert)
+USE_TRT_CONVERTER(recover_padding)
+USE_TRT_CONVERTER(remove_padding)
 #endif
 
 namespace paddle_infer {
@@ -1971,6 +1977,20 @@ void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
 #endif
 }
 
+void InternalUtils::SetTransformerPosid(
+    paddle_infer::Config *c, const std::string &tensorrt_transformer_posid) {
+#ifdef PADDLE_WITH_CUDA
+  c->tensorrt_transformer_posid_ = tensorrt_transformer_posid;
+#endif
+}
+
+void InternalUtils::SetTransformerMaskid(
+    paddle_infer::Config *c, const std::string &tensorrt_transformer_maskid) {
+#ifdef PADDLE_WITH_CUDA
+  c->tensorrt_transformer_maskid_ = tensorrt_transformer_maskid;
+#endif
+}
+
 void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
 #ifdef PADDLE_WITH_CUDA
   auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index e96526730fdea..1cfdaf1a55864 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -41,7 +41,7 @@ using float16 = paddle::platform::float16;
 namespace experimental {
 class InternalUtils;
 };
-}
+}  // namespace paddle_infer
 ///
 /// \file analysis_predictor.h
 ///
@@ -55,10 +55,10 @@ class InternalUtils;
 
 namespace paddle {
 
-using inference::analysis::Argument;
-using inference::analysis::Analyzer;
-using framework::proto::ProgramDesc;
 using framework::NaiveExecutor;
+using framework::proto::ProgramDesc;
+using inference::analysis::Analyzer;
+using inference::analysis::Argument;
 
 ///
 /// \class AnalysisPredictor
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index e8a1384166aff..f16054565a7fc 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -18,7 +18,9 @@
 #endif
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <thread>  // NOLINT
+
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/api/helper.h"
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index e2befadf0a89b..9e4633774a2fc 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <sstream>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/commit.h"
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 1c4369af646af..38960aecb703b 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/inference/api/api_impl.h"
+
 #include <glog/logging.h>
+
 #include <memory>
 #include <sstream>
 #include <string>
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/place.h"
@@ -348,8 +350,9 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
 }
 
 template <>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+    const NativeConfig &config) {
   // TODO(NHZlX): Should add the link to the doc of
   // paddle_infer::CreatePredictor<paddle_infer::Config>
   VLOG(3) << "create NativePaddlePredictor";
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index b91eff4573ed0..d503d2581392a 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <glog/logging.h>
+
 #include <map>
 #include <memory>
 #include <string>
diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc
index 46724fa6b1aca..1faf46fad2be6 100644
--- a/paddle/fluid/inference/api/api_tester.cc
+++ b/paddle/fluid/inference/api/api_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <exception>
 #include <string>
 
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 547e265d2fdb5..a76ed63f10646 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -1,31 +1,33 @@
 cmake_minimum_required(VERSION 3.0)
 project(cpp_inference_demo CXX C)
-option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
-option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
-option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   ON)
-option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
-option(WITH_ONNXRUNTIME       "Compile demo with ONNXRuntime"       OFF)
+option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
+option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
+option(WITH_STATIC_LIB
+       "Compile demo with static/shared library, default use static." ON)
+option(USE_TENSORRT "Compile demo with TensorRT." OFF)
+option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF)
 
 if(NOT WITH_STATIC_LIB)
   add_definitions("-DPADDLE_WITH_SHARED_LIB")
 else()
-  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. 
+  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode.
   # Set it to empty in static library mode to avoid compilation issues.
   add_definitions("/DPD_INFER_DECL=")
 endif()
 
 macro(safe_set_static_flag)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
+  foreach(flag_var
+          CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+          CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    if(${flag_var} MATCHES "/MD")
+      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+    endif(${flag_var} MATCHES "/MD")
+  endforeach(flag_var)
 endmacro()
 
 if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+  message(
+    FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
 endif()
 if(NOT DEFINED DEMO_NAME)
   message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
@@ -47,7 +49,7 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
-if (WITH_ONNXRUNTIME)
+if(WITH_ONNXRUNTIME)
   include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
   include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
 
@@ -55,21 +57,25 @@ if (WITH_ONNXRUNTIME)
   link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
 endif()
 
-if (WIN32)
+if(WIN32)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
   option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
-  if (MSVC_STATIC_CRT)
-    if (WITH_MKL)
+  if(MSVC_STATIC_CRT)
+    if(WITH_MKL)
       set(FLAG_OPENMP "/openmp")
     endif()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4244 /wd4251 /wd4267 /wd4305")
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    set(CMAKE_C_FLAGS_DEBUG
+        "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_C_FLAGS_RELEASE
+        "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4251 /wd4267 /wd4305")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_DEBUG
+        "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_RELEASE
+        "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
     safe_set_static_flag()
-    if (WITH_STATIC_LIB)
+    if(WITH_STATIC_LIB)
       add_definitions(-DSTATIC_LIB)
     endif()
   endif()
@@ -82,42 +88,55 @@ endif()
 
 if(WITH_GPU)
   if(NOT WIN32)
-    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+    set(CUDA_LIB
+        "/usr/local/cuda/lib64/"
+        CACHE STRING "CUDA Library")
   else()
-    set(CUDA_LIB "" CACHE STRING "CUDA_LIB")
+    set(CUDA_LIB
+        ""
+        CACHE STRING "CUDA_LIB")
     if("${CUDA_LIB}" STREQUAL "")
       if(DEFINED ENV{CUDA_PATH})
         set(CUDA_LIB "$ENV{CUDA_PATH}\\lib\\x64")
       else()
-        set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64")
+        set(CUDA_LIB
+            "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64"
+        )
       endif()
     endif()
     message(STATUS "Current CUDA lib path: ${CUDA_LIB}")
   endif(NOT WIN32)
 endif()
 
-if (USE_TENSORRT AND WITH_GPU)
-  set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library")
+if(USE_TENSORRT AND WITH_GPU)
+  set(TENSORRT_ROOT
+      ""
+      CACHE STRING "The root directory of TensorRT library")
   if("${TENSORRT_ROOT}" STREQUAL "")
-      message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ")
+    message(
+      FATAL_ERROR
+        "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH "
+    )
   endif()
   set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include)
   set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib)
   file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
-  string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
-    "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)"
+               TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
   if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
-    file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS)
-    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
-      "${TENSORRT_VERSION_FILE_CONTENTS}")
+    file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h
+         TENSORRT_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)"
+                 TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
   endif()
   if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
     message(SEND_ERROR "Failed to detect TensorRT version.")
   endif()
   string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
-    TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
-  message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
-    "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+                       TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+  message(
+    STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+           "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
   include_directories("${TENSORRT_INCLUDE_DIR}")
   link_directories("${TENSORRT_LIB_DIR}")
 endif()
@@ -129,8 +148,9 @@ if(WITH_MKL)
     set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
                  ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
   else()
-    set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
-                 ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+    set(MATH_LIB
+        ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
   endif()
   set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn")
   if(EXISTS ${MKLDNN_PATH})
@@ -145,65 +165,99 @@ else()
   set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas")
   include_directories("${OPENBLAS_LIB_PATH}/include/openblas")
   if(WIN32)
-    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(MATH_LIB
+        ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
   else()
-    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(MATH_LIB
+        ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
   endif()
 endif()
 
 if(WITH_STATIC_LIB)
-  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(DEPS
+      ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}
+  )
 else()
   if(WIN32)
-    set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS
+        ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
   else()
-    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
+    set(DEPS
+        ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}
+    )
   endif()
 endif()
 
-if (WITH_ONNXRUNTIME)
+if(WITH_ONNXRUNTIME)
   if(WIN32)
-    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
+    set(DEPS
+        ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib
+        paddle2onnx)
   elseif(APPLE)
-    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
+    set(DEPS
+        ${DEPS}
+        ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib
+        paddle2onnx)
   else()
-    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
+    set(DEPS
+        ${DEPS}
+        ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0
+        paddle2onnx)
   endif()
 endif()
 
-
-if (NOT WIN32)
+if(NOT WIN32)
   set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-  set(DEPS ${DEPS}
-      ${MATH_LIB} ${MKLDNN_LIB}
-      glog gflags protobuf xxhash cryptopp utf8proc
+  set(DEPS
+      ${DEPS}
+      ${MATH_LIB}
+      ${MKLDNN_LIB}
+      glog
+      gflags
+      protobuf
+      xxhash
+      cryptopp
+      utf8proc
       ${EXTERNAL_LIB})
 else()
-  set(DEPS ${DEPS}
-      ${MATH_LIB} ${MKLDNN_LIB}
-      glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static
+  set(DEPS
+      ${DEPS}
+      ${MATH_LIB}
+      ${MKLDNN_LIB}
+      glog
+      gflags_static
+      libprotobuf
+      xxhash
+      cryptopp-static
+      utf8proc_static
       ${EXTERNAL_LIB})
   set(DEPS ${DEPS} shlwapi.lib)
 endif(NOT WIN32)
 
 if(WITH_GPU)
   if(NOT WIN32)
-    if (USE_TENSORRT)
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
+    if(USE_TENSORRT)
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
+      set(DEPS
+          ${DEPS}
+          ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
     endif()
     set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
   else()
     if(USE_TENSORRT)
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
       if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
-        set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX})
+        set(DEPS ${DEPS}
+                 ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX})
       endif()
     endif()
-    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
-    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
-    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
   endif()
 endif()
 
@@ -217,40 +271,61 @@ if(WIN32)
   endif()
 
   if(USE_TENSORRT)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${LIB_PATH}
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} ${LIB_PATH}
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${LIB_PATH})
     if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
-      add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-              COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX}
-                ${LIB_PATH})
+      add_custom_command(
+        TARGET ${DEMO_NAME}
+        POST_BUILD
+        COMMAND
+          ${CMAKE_COMMAND} -E copy
+          ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX}
+          ${LIB_PATH})
     endif()
   endif()
   if(WITH_MKL)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-          COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${LIB_PATH}
-          COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${LIB_PATH}
-          COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll  ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll
+              ${LIB_PATH}
+      COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll
+              ${LIB_PATH}
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll
+              ${LIB_PATH})
   else()
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-          COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll
+              ${LIB_PATH})
   endif()
   if(WITH_ONNXRUNTIME)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
-      ${LIB_PATH}
-    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
-      ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
+        ${LIB_PATH}
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
+        ${LIB_PATH})
   endif()
   if(NOT WITH_STATIC_LIB)
-      add_custom_command(TARGET ${DEMO_NAME} POST_BUILD 
-        COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH}
-      )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy
+              "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH})
   endif()
 endif()
diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
index ef5c08cd041eb..f9ac07a830459 100644
--- a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc
@@ -17,7 +17,9 @@ limitations under the License. */
  */
 
 #include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "utils.h"  // NOLINT
 
diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
index 9edb4ecbfd228..551b66fcaf7fa 100644
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -17,6 +17,7 @@ limitations under the License. */
  */
 
 #include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
+
 #include "gflags/gflags.h"
 #include "utils.h"  // NOLINT
 
diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
index b4f40194aa947..dfba4b8ebf6cd 100644
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -14,11 +14,13 @@
 
 #pragma once
 #include <math.h>
+
 #include <algorithm>
 #include <fstream>
 #include <iostream>
 #include <string>
 #include <vector>
+
 #include "paddle/include/paddle_inference_api.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 818444fbcb648..352efc1e63dbd 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -17,6 +17,7 @@ limitations under the License. */
  */
 
 #include <glog/logging.h>
+
 #include "gflags/gflags.h"
 #include "utils.h"  // NOLINT
 
diff --git a/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc b/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc
index 8d0538f8fa52d..b1f770066e7be 100644
--- a/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc
+++ b/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc
@@ -13,14 +13,15 @@
 // limitations under the License.
 
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <fstream>
 #include <iostream>
 #include <numeric>
 #include <string>
 #include <vector>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/include/paddle_inference_api.h"
 
 DEFINE_string(modeldir, "", "Directory of the inference model.");
diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt
index 0d7a8d57a9c5a..c1ff6ea68a2bd 100644
--- a/paddle/fluid/inference/api/details/CMakeLists.txt
+++ b/paddle/fluid/inference/api/details/CMakeLists.txt
@@ -13,13 +13,28 @@
 # limitations under the License.
 #
 
-cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope)
-if (WITH_ONNXRUNTIME)
-    cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime)
-    cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc DEPS onnxruntime)
-else (WITH_ONNXRUNTIME)
-    cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
-    cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
-endif (WITH_ONNXRUNTIME)
+cc_library(
+  reset_tensor_array
+  SRCS reset_tensor_array.cc
+  DEPS lod_tensor scope)
+if(WITH_ONNXRUNTIME)
+  cc_library(
+    zero_copy_tensor
+    SRCS zero_copy_tensor.cc
+    DEPS scope lod_tensor enforce onnxruntime)
+  cc_library(
+    zero_copy_tensor_dummy
+    SRCS zero_copy_tensor_dummy.cc
+    DEPS onnxruntime)
+else(WITH_ONNXRUNTIME)
+  cc_library(
+    zero_copy_tensor
+    SRCS zero_copy_tensor.cc
+    DEPS scope lod_tensor enforce)
+  cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
+endif(WITH_ONNXRUNTIME)
 
-cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api)
+cc_test(
+  zero_copy_tensor_test
+  SRCS zero_copy_tensor_test.cc
+  DEPS paddle_inference_api)
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index bb966dc5c6c1b..661d9def40653 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -340,8 +340,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
 #ifdef PADDLE_WITH_MKLDNN
     if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN)
       paddle::framework::innerTransDataLayoutFromMKLDNN(
-          tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls()
-                                .get_cur_paddle_data_layout(),
+          tensor->layout(),
+          paddle::platform::MKLDNNDeviceContext::tls()
+              .get_cur_paddle_data_layout(),
           *tensor, &out, paddle::platform::CPUPlace(), true);
     else
       std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
@@ -852,8 +853,9 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data,
 #ifdef PADDLE_WITH_MKLDNN
     if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN)
       paddle::framework::innerTransDataLayoutFromMKLDNN(
-          tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls()
-                                .get_cur_paddle_data_layout(),
+          tensor->layout(),
+          paddle::platform::MKLDNNDeviceContext::tls()
+              .get_cur_paddle_data_layout(),
           *tensor, &out, paddle::platform::CPUPlace(), true);
     else
       std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index b9e0e90a40316..3454c5c8fd17b 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/helper.h"
+
 #include "paddle/fluid/framework/custom_operator.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/phi/api/ext/op_meta_info.h"
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index acc52ac046815..1c58b004e6d31 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <glog/logging.h>
+
 #include <fstream>
 #if !defined(_WIN32)
 #include <sys/time.h>
@@ -377,8 +378,9 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
                       double batch_latency, int epoch = 1,
                       const framework::proto::VarType::Type data_type =
                           framework::proto::VarType::FP32) {
-  PADDLE_ENFORCE_GT(batch_size, 0, platform::errors::InvalidArgument(
-                                       "Non-positive batch size."));
+  PADDLE_ENFORCE_GT(
+      batch_size, 0,
+      platform::errors::InvalidArgument("Non-positive batch size."));
   double sample_latency = batch_latency / batch_size;
   LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
             << " ======";
diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h
index b7a8bf637d872..c2a23a7ca2ce5 100644
--- a/paddle/fluid/inference/api/infer_context.h
+++ b/paddle/fluid/inference/api/infer_context.h
@@ -25,21 +25,21 @@ class InferCPUContext : public phi::CPUContext {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class InferGPUContext : public phi::GPUContext {
  public:
-  using phi::GPUContext::SetStream;
-  using phi::GPUContext::SetEigenDevice;
   using phi::GPUContext::SetBlasHandle;
   using phi::GPUContext::SetBlasTensorCoreHandle;
   using phi::GPUContext::SetBlasTF32Handle;
   using phi::GPUContext::SetDnnHandle;
+  using phi::GPUContext::SetEigenDevice;
   using phi::GPUContext::SetSolverHandle;
   using phi::GPUContext::SetSparseHandle;
+  using phi::GPUContext::SetStream;
   // using phi::GPUContext::SetDnnWorkspaceHandle;
   using phi::GPUContext::SetComputeCapability;
+  using phi::GPUContext::SetDriverVersion;
+  using phi::GPUContext::SetMaxGridDimSize;
+  using phi::GPUContext::SetMaxThreadsPerBlock;
   using phi::GPUContext::SetMaxThreadsPerMultiProcessor;
   using phi::GPUContext::SetMultiProcessors;
-  using phi::GPUContext::SetMaxThreadsPerBlock;
-  using phi::GPUContext::SetMaxGridDimSize;
-  using phi::GPUContext::SetDriverVersion;
   using phi::GPUContext::SetRuntimeVersion;
 };
 #endif
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index 4dc80a1d75390..73096973c381c 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
+
 #include <algorithm>
 #include <limits>
 #include <map>
 #include <numeric>
 #include <unordered_map>
 #include <utility>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -33,10 +35,10 @@
 
 namespace paddle {
 
-using platform::CPUPlace;
 using framework::LoDTensor;
 using framework::Variable;
 using framework::ir::Graph;
+using platform::CPUPlace;
 using ConstEigenVectorArrayMap =
     Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
 using EigenMatrixDoubleArray =
@@ -57,8 +59,9 @@ static void check_var(const Variable* var, const std::string& var_name) {
 }
 
 static void check_tensor(const LoDTensor& tensor) {
-  PADDLE_ENFORCE_GT(tensor.dims().size(), 0, platform::errors::InvalidArgument(
-                                                 "Tensor dimension is empty."));
+  PADDLE_ENFORCE_GT(
+      tensor.dims().size(), 0,
+      platform::errors::InvalidArgument("Tensor dimension is empty."));
 }
 
 void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForRNNWeights(
@@ -531,8 +534,9 @@ AnalysisPredictor::MkldnnQuantizer::Histogram(
   PADDLE_ENFORCE_GE(max_val, min_val,
                     platform::errors::InvalidArgument(
                         "MkldnnQuantizer: To calculate Histogram, max_val (" +
-                        std::to_string(max_val) + ") must be greater or equal"
-                                                  "to min_val (" +
+                        std::to_string(max_val) +
+                        ") must be greater or equal "
+                        "to min_val (" +
                         std::to_string(min_val) + ")."));
   ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
                                         var_tensor.numel(), 1};
@@ -570,7 +574,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
 
   auto* builder = predictor_.config_.pass_builder();
   builder->SetPasses({
-      "cpu_quantize_pass", "cpu_quantize_squash_pass",
+      "cpu_quantize_pass",
+      "cpu_quantize_squash_pass",
       "int8_scale_calculation_mkldnn_pass",
   });
   if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h
index 5e7aa39de52bc..811f2941a7d14 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.h
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.h
@@ -20,6 +20,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc
index 2bee4763d4fe9..05077f8ba34cc 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 DEFINE_string(dirname, "", "dirname to tests.");
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h
index d01756e4b96b1..294a83a4335ba 100644
--- a/paddle/fluid/inference/api/onnxruntime_predictor.h
+++ b/paddle/fluid/inference/api/onnxruntime_predictor.h
@@ -18,6 +18,9 @@
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "onnxruntime_c_api.h"    // NOLINT
+#include "onnxruntime_cxx_api.h"  // NOLINT
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_compatible_info.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
@@ -27,9 +30,6 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
 #include "paddle/fluid/string/printf.h"
-
-#include "onnxruntime_c_api.h"    // NOLINT
-#include "onnxruntime_cxx_api.h"  // NOLINT
 #include "paddle2onnx/converter.h"
 
 #ifdef PADDLE_WITH_TESTING
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
index 4a702edacc903..ff8528c085009 100644
--- a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
+++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
@@ -12,16 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
-
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
+
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index af6cf88a3224f..489c32bc59d17 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -618,14 +618,14 @@ struct PD_INFER_DECL AnalysisConfig {
   /// may be more high-performance. Libnvinfer_plugin.so greater than
   /// V7.2.1 is needed.
   ///
-  void EnableTensorRtOSS();
+  void EnableVarseqlen();
 
   ///
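-  /// \brief A boolean state telling whether to use the TensorRT OSS.
+  /// \brief A boolean state telling whether TensorRT varseqlen is enabled.
   ///
-  /// \return bool Whether to use the TensorRT OSS.
+  /// \return bool Whether TensorRT varseqlen is enabled.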
   ///
-  bool tensorrt_oss_enabled() { return trt_use_oss_; }
+  bool tensorrt_varseqlen_enabled() { return trt_use_varseqlen_; }
 
   ///
   /// \brief Enable TensorRT DLA
@@ -912,11 +912,18 @@ struct PD_INFER_DECL AnalysisConfig {
   bool thread_local_stream_{false};
   bool use_gpu_fp16_{false};
   std::unordered_set<std::string> gpu_fp16_disabled_op_types_{
-      "conv2d_fusion", "conv2d", "roll", "strided_slice", "depthwise_conv2d",
-      "unfold", "generate_proposals_v2", "nearest_interp_v2",
+      "conv2d_fusion",
+      "conv2d",
+      "roll",
+      "strided_slice",
+      "depthwise_conv2d",
+      "unfold",
+      "generate_proposals_v2",
+      "nearest_interp_v2",
-      "bilinear_interp_v2"
+      "bilinear_interp_v2",
       "yolo_box",
-      "multiclass_nms3", "matrix_nms"};
+      "multiclass_nms3",
+      "matrix_nms"};
 
   bool use_cudnn_{false};
 
@@ -954,8 +961,10 @@ struct PD_INFER_DECL AnalysisConfig {
   Precision tensorrt_precision_mode_{Precision::kFloat32};
   bool trt_use_static_engine_{false};
   bool trt_use_calib_mode_{true};
-  bool trt_use_oss_{false};
+  bool trt_use_varseqlen_{false};
   bool trt_with_interleaved_{false};
+  std::string tensorrt_transformer_posid_{""};
+  std::string tensorrt_transformer_maskid_{""};
   bool trt_use_dla_{false};
   int trt_dla_core_{0};
   std::map<std::string, std::vector<int>> min_input_shape_{};
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index dc9f7debe5f2f..78af756c24b03 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -27,6 +27,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "crypto/cipher.h"
 #include "paddle_infer_declare.h"  // NOLINT
 #include "paddle_tensor.h"         // NOLINT
@@ -391,12 +392,14 @@ PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
     const ConfigT& config);
 
 template <>
-PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig& config);
+PD_INFER_DECL std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+    const NativeConfig& config);
 
 template <>
-PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config);
+PD_INFER_DECL std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+    const AnalysisConfig& config);
 
 template <>
 PD_INFER_DECL std::unique_ptr<PaddlePredictor>
@@ -435,6 +438,12 @@ class PD_INFER_DECL InternalUtils {
   static void UpdateConfigInterleaved(paddle_infer::Config* c,
                                       bool with_interleaved);
 
+  static void SetTransformerPosid(
+      paddle_infer::Config* c, const std::string& tensorrt_transformer_posid);
+
+  static void SetTransformerMaskid(
+      paddle_infer::Config* c, const std::string& tensorrt_transformer_maskid);
+
   static void SyncStream(paddle_infer::Predictor* pred);
   static void SyncStream(cudaStream_t stream);
   template <typename T>
diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc
index d27f20a93b3a4..e785e91a67139 100644
--- a/paddle/fluid/inference/api/paddle_infer_contrib.cc
+++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/paddle_infer_contrib.h"
+
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index f9ec41f6c8358..9e5b76db4ac16 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -20,6 +20,7 @@
 #include <miopen/miopen.h>
 #endif
 #include <glog/logging.h>
+
 #include <algorithm>
 #include <sstream>
 
@@ -94,25 +95,25 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "add_support_int8_pass",                 //
       // "fc_fuse_pass",                        //
       "simplify_with_basic_ops_pass",                 //
-      "embedding_eltwise_layernorm_fuse_pass",        //
+      "trt_embedding_eltwise_layernorm_fuse_pass",    //
       "preln_embedding_eltwise_layernorm_fuse_pass",  //
-      "multihead_matmul_fuse_pass_v2",                //
-      "multihead_matmul_fuse_pass_v3",                //
-      "skip_layernorm_fuse_pass",                     //
+      "trt_multihead_matmul_fuse_pass_v2",            //
+      "trt_multihead_matmul_fuse_pass_v3",            //
+      "trt_skip_layernorm_fuse_pass",                 //
       "preln_skip_layernorm_fuse_pass",               //
       // "set_transformer_input_convert_pass",           //
-      "conv_bn_fuse_pass",                 //
-      "unsqueeze2_eltwise_fuse_pass",      //
-      "trt_squeeze2_matmul_fuse_pass",     //
-      "trt_reshape2_matmul_fuse_pass",     //
-      "trt_flatten2_matmul_fuse_pass",     //
-      "trt_map_matmul_v2_to_mul_pass",     //
-      "trt_map_matmul_v2_to_matmul_pass",  //
-      "trt_map_matmul_to_mul_pass",        //
-      "fc_fuse_pass",                      //
-      "conv_elementwise_add_fuse_pass",    //
-      // "remove_padding_recover_padding_pass",          //
-      // "delete_remove_padding_recover_padding_pass",    //
+      "conv_bn_fuse_pass",                           //
+      "unsqueeze2_eltwise_fuse_pass",                //
+      "trt_squeeze2_matmul_fuse_pass",               //
+      "trt_reshape2_matmul_fuse_pass",               //
+      "trt_flatten2_matmul_fuse_pass",               //
+      "trt_map_matmul_v2_to_mul_pass",               //
+      "trt_map_matmul_v2_to_matmul_pass",            //
+      "trt_map_matmul_to_mul_pass",                  //
+      "fc_fuse_pass",                                //
+      "conv_elementwise_add_fuse_pass",              //
+      "remove_padding_recover_padding_pass",         //
+      "delete_remove_padding_recover_padding_pass",  //
       // "yolo_box_fuse_pass",      //
       "tensorrt_subgraph_pass",  //
       "conv_bn_fuse_pass",       //
@@ -348,6 +349,10 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
 void CpuPassStrategy::EnableMkldnnBfloat16() {
 #ifdef PADDLE_WITH_MKLDNN
   if (!use_mkldnn_bfloat16_) {
+    passes_.push_back("fc_mkldnn_pass");
+    passes_.push_back("fc_act_mkldnn_fuse_pass");
+    passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
+
     passes_.push_back("cpu_bfloat16_placement_pass");
     passes_.push_back("cpu_bfloat16_pass");
     passes_.push_back("cpu_quantize_squash_pass");
diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h
index c41968dc58590..24e76598e400b 100644
--- a/paddle/fluid/inference/api/resource_manager.h
+++ b/paddle/fluid/inference/api/resource_manager.h
@@ -15,6 +15,7 @@
 
 #include <functional>
 #include <memory>
+
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/backends/cpu/forwards.h"
 
diff --git a/paddle/fluid/inference/capi/CMakeLists.txt b/paddle/fluid/inference/capi/CMakeLists.txt
index 32f780122bcd6..73ba41607aae8 100644
--- a/paddle/fluid/inference/capi/CMakeLists.txt
+++ b/paddle/fluid/inference/capi/CMakeLists.txt
@@ -15,15 +15,22 @@
 
 set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc c_api.cc)
 
-cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference)
+cc_library(
+  paddle_inference_c
+  SRCS ${C_API_SRCS}
+  DEPS paddle_inference)
 
 if(NOT ON_INFER)
-    return()
+  return()
 endif()
 
 # Create inference capi shared library
-cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference)
-set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c)
+cc_library(
+  paddle_inference_c_shared SHARED
+  SRCS ${C_API_SRCS}
+  DEPS paddle_inference)
+set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME
+                                                           paddle_inference_c)
 if(WIN32)
-    target_link_libraries(paddle_inference_c_shared shlwapi.lib)
+  target_link_libraries(paddle_inference_c_shared shlwapi.lib)
 endif()
diff --git a/paddle/fluid/inference/capi/c_api.cc b/paddle/fluid/inference/capi/c_api.cc
index 07493c742c4fa..f2a9838f4bc7d 100644
--- a/paddle/fluid/inference/capi/c_api.cc
+++ b/paddle/fluid/inference/capi/c_api.cc
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/capi/c_api_internal.h b/paddle/fluid/inference/capi/c_api_internal.h
index 7e69b7210768e..11728fb9878fc 100644
--- a/paddle/fluid/inference/capi/c_api_internal.h
+++ b/paddle/fluid/inference/capi/c_api_internal.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <memory>
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc
index 9bb52ba578025..2bacc94c0d118 100644
--- a/paddle/fluid/inference/capi/pd_config.cc
+++ b/paddle/fluid/inference/capi/pd_config.cc
@@ -18,6 +18,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc
index 12d7f78e169cc..e88fbfc5a86a3 100644
--- a/paddle/fluid/inference/capi/pd_predictor.cc
+++ b/paddle/fluid/inference/capi/pd_predictor.cc
@@ -19,6 +19,7 @@
 #include <memory>
 #include <numeric>
 #include <vector>
+
 #include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
diff --git a/paddle/fluid/inference/capi/pd_tensor.cc b/paddle/fluid/inference/capi/pd_tensor.cc
index 9b1eedd7c5a81..199db92d1b0d3 100644
--- a/paddle/fluid/inference/capi/pd_tensor.cc
+++ b/paddle/fluid/inference/capi/pd_tensor.cc
@@ -17,6 +17,7 @@
 #include <cstring>
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt
index 521d24329d464..e35e14a0c0241 100644
--- a/paddle/fluid/inference/capi_exp/CMakeLists.txt
+++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt
@@ -15,15 +15,22 @@
 
 set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc)
 
-cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference)
+cc_library(
+  paddle_inference_c
+  SRCS ${C_API_SRCS}
+  DEPS paddle_inference)
 
 if(NOT ON_INFER)
-    return()
+  return()
 endif()
 
 # Create inference capi shared library
-cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference)
-set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c)
+cc_library(
+  paddle_inference_c_shared SHARED
+  SRCS ${C_API_SRCS}
+  DEPS paddle_inference)
+set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME
+                                                           paddle_inference_c)
 if(WIN32)
-    target_link_libraries(paddle_inference_c_shared shlwapi.lib)
+  target_link_libraries(paddle_inference_c_shared shlwapi.lib)
 endif()
diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc
index 2b049e992e71d..c67d6f870bdd9 100644
--- a/paddle/fluid/inference/capi_exp/lod_demo.cc
+++ b/paddle/fluid/inference/capi_exp/lod_demo.cc
@@ -27,8 +27,10 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 
 int main(int argc, char *argv[]) {
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index d7b07652babbd..4e1c5a2a0ddd0 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/capi_exp/pd_config.h"
+
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/capi_exp/pd_types.h"
 #include "paddle/fluid/inference/capi_exp/utils_internal.h"
@@ -303,13 +304,13 @@ void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num,
   config->Exp_DisableTensorRtOPs(ops_list);
 }
 
-void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) {
+void PD_ConfigEnableVarseqlen(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
-  config->EnableTensorRtOSS();
+  config->EnableVarseqlen();
 }
 PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
-  return config->tensorrt_oss_enabled();
+  return config->tensorrt_varseqlen_enabled();
 }
 
 void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config,
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
index f6b754cad213f..667843520d686 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.h
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -432,7 +432,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs(
 ///
 /// \param[in] pd_onfig config
 ///
-PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS(
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableVarseqlen(
     __pd_keep PD_Config* pd_config);
 ///
 /// \brief A boolean state telling whether to use the TensorRT OSS.
diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc
index 5ca58b0e4138b..c85dfdf522e67 100644
--- a/paddle/fluid/inference/capi_exp/pd_predictor.cc
+++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/capi_exp/pd_predictor.h"
+
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/capi_exp/pd_types.h"
 #include "paddle/fluid/inference/capi_exp/pd_utils.h"
diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc
index 9c661dea6f2bb..520cfa813f47e 100644
--- a/paddle/fluid/inference/capi_exp/pd_tensor.cc
+++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/capi_exp/pd_tensor.h"
+
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/capi_exp/pd_types.h"
 #include "paddle/fluid/inference/capi_exp/pd_utils.h"
diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc
index efca350fbaf49..7942a860c4ee8 100644
--- a/paddle/fluid/inference/capi_exp/pd_utils.cc
+++ b/paddle/fluid/inference/capi_exp/pd_utils.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/inference/capi_exp/pd_utils.h"
+
 #include <string>
 
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/capi_exp/pd_utils.h"
 #include "paddle/fluid/inference/capi_exp/utils_internal.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt b/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt
index 5b66d1de91917..fc4a3c408dfe2 100644
--- a/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt
+++ b/paddle/fluid/inference/experimental/javaapi/CMakeLists.txt
@@ -1,5 +1,6 @@
 include_directories($ENV{jni_path} $ENV{jni_sub_path} $ENV{paddle_path})
-find_library(PADDLE_INFERENCE_C libpaddle_inference_c.so HINTS $ENV{paddle_inference_lib})
+find_library(PADDLE_INFERENCE_C libpaddle_inference_c.so
+             HINTS $ENV{paddle_inference_lib})
 aux_source_directory(native JNI_SRCS)
 add_library(paddle_inference SHARED ${JNI_SRCS})
 target_link_libraries(paddle_inference ${PADDLE_INFERENCE_C})
diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp
index 593ba3cb51d8c..efea093fa245a 100644
--- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp
+++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp
@@ -13,9 +13,10 @@
 // limitations under the License.
 
 #include "com_baidu_paddle_inference_Config.h"
+
 #include <iostream>
-#include "jni_convert_util.h"  // NOLINT
 
+#include "jni_convert_util.h"  // NOLINT
 #include "pd_inference_api.h"  // NOLINT
 
 JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Config_cppConfigDestroy(
diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp
index 7eff03690ae8e..0912c2ad57a68 100644
--- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp
+++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "com_baidu_paddle_inference_Predictor.h"
+
 #include <jni.h>
+
 #include "jni_convert_util.h"  // NOLINT
 #include "pd_inference_api.h"  // NOLINT
 
diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp
index b9be4a73ac2ce..a90ae165ebd51 100644
--- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp
+++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "com_baidu_paddle_inference_Tensor.h"
+
 #include <jni.h>
+
 #include "pd_inference_api.h"  // NOLINT
 
 JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Tensor_cppTensorDestroy(
diff --git a/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h b/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h
index 0026ec2f4102c..c363559298f18 100644
--- a/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h
+++ b/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h
@@ -17,6 +17,7 @@
 
 #include <jni.h>
 #include <string.h>
+
 #include <string>
 #include <vector>
 
@@ -54,8 +55,8 @@ inline jstring cpp_string_to_jstring(JNIEnv *env, std::string str) {
                           reinterpret_cast<const jbyte *>(data));
 
   jstring encoding = env->NewStringUTF("UTF-8");
-  jstring res = (jstring)(
-      env->NewObject(strClass, strClassInitMethodID, bytes, encoding));
+  jstring res = (jstring)(env->NewObject(strClass, strClassInitMethodID, bytes,
+                                         encoding));
 
   env->DeleteLocalRef(strClass);
   env->DeleteLocalRef(encoding);
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
index 8f9f34c06b476..0aca2a1075fd3 100644
--- a/paddle/fluid/inference/goapi/config.go
+++ b/paddle/fluid/inference/goapi/config.go
@@ -500,8 +500,8 @@ func (config *Config) DisableTensorRtOPs(ops []string) {
 /// may be more high-performance. Libnvinfer_plugin.so greater than
 /// V7.2.1 is needed.
 ///
-func (config *Config) EnableTensorRtOSS() {
-	C.PD_ConfigEnableTensorRtOSS(config.c)
+func (config *Config) EnableVarseqlen() {
+	C.PD_ConfigEnableVarseqlen(config.c)
 }
 
 ///
diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go
index 297841dcbcf6c..080f2fd0135e5 100644
--- a/paddle/fluid/inference/goapi/config_test.go
+++ b/paddle/fluid/inference/goapi/config_test.go
@@ -54,7 +54,7 @@ func TestNewConfig(t *testing.T) {
 	}
 	config.SetTRTDynamicShapeInfo(minInputShape, maxInputShape, optInputShape, false)
 
-	config.EnableTensorRtOSS()
+	config.EnableVarseqlen()
 	t.Logf("TensorrtOssEnabled:%+v", config.TensorrtOssEnabled())
 
 	config.EnableTensorRtDLA(0)
@@ -138,4 +138,4 @@ func TestONNXRuntime(t *testing.T) {
 
 	config.SetCpuMathLibraryNumThreads(4)
 	t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads())
-}
\ No newline at end of file
+}
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index 317ef9d93acf3..1106ad261ec41 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt
index 6d981d007e73a..7aa010cb0066c 100644
--- a/paddle/fluid/inference/lite/CMakeLists.txt
+++ b/paddle/fluid/inference/lite/CMakeLists.txt
@@ -2,8 +2,23 @@ if(XPU_SDK_ROOT)
   set(XPU_DEPS xpuapi xpurt)
 endif()
 
-cc_library(lite_op_teller SRCS op_teller.cc DEPS ${LITE_DEPS} framework_proto device_context boost xxhash)
-cc_library(lite_engine SRCS engine.cc DEPS ${LITE_DEPS} framework_proto ${XPU_DEPS})
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy ${LITE_DEPS} framework_proto boost device_context ${XPU_DEPS})
-cc_test(test_lite_engine SRCS test_engine_lite.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
-cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)
+cc_library(
+  lite_op_teller
+  SRCS op_teller.cc
+  DEPS ${LITE_DEPS} framework_proto device_context boost xxhash)
+cc_library(
+  lite_engine
+  SRCS engine.cc
+  DEPS ${LITE_DEPS} framework_proto ${XPU_DEPS})
+cc_library(
+  lite_tensor_utils
+  SRCS tensor_utils.cc
+  DEPS memcpy ${LITE_DEPS} framework_proto boost device_context ${XPU_DEPS})
+cc_test(
+  test_lite_engine
+  SRCS test_engine_lite.cc
+  DEPS lite_engine protobuf framework_proto glog gtest analysis)
+cc_test(
+  test_lite_tensor_utils
+  SRCS test_tensor_utils.cc
+  DEPS lite_engine lite_tensor_utils)
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index cd78cfecd8635..8f8f68b170b62 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -25,6 +25,7 @@
 #endif
 
 #include "paddle/fluid/inference/lite/engine.h"
+
 #include <utility>
 
 namespace paddle {
diff --git a/paddle/fluid/inference/lite/op_teller.cc b/paddle/fluid/inference/lite/op_teller.cc
index 3a162c3fde13f..3d2ed0a5c9890 100644
--- a/paddle/fluid/inference/lite/op_teller.cc
+++ b/paddle/fluid/inference/lite/op_teller.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/inference/lite/op_teller.h"
+
 #include <map>
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/lite/engine.h"
-#include "paddle/fluid/inference/lite/op_teller.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/lite/op_teller.h b/paddle/fluid/inference/lite/op_teller.h
index b9391a98a2ee3..1a969f1293dd2 100644
--- a/paddle/fluid/inference/lite/op_teller.h
+++ b/paddle/fluid/inference/lite/op_teller.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/op_desc.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc
index eeaa128290339..f70455f18ebfd 100644
--- a/paddle/fluid/inference/lite/tensor_utils.cc
+++ b/paddle/fluid/inference/lite/tensor_utils.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/lite/tensor_utils.h"
+
 #include <functional>
 #include <map>
 #include <memory>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/lite/engine.h"
@@ -26,9 +28,9 @@ namespace inference {
 namespace lite {
 namespace utils {
 
-using paddle::lite_api::TargetType;
-using paddle::lite_api::PrecisionType;
 using paddle::lite_api::DataLayoutType;
+using paddle::lite_api::PrecisionType;
+using paddle::lite_api::TargetType;
 
 template <typename DstLoD, typename SrcLoD>
 void SetLoD(DstLoD* dst, const SrcLoD& src) {
diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc
index 85f7d3ee363a7..dee83f70ba2a2 100644
--- a/paddle/fluid/inference/lite/test_engine_lite.cc
+++ b/paddle/fluid/inference/lite/test_engine_lite.cc
@@ -14,14 +14,12 @@
 
 #include <gtest/gtest.h>
 
-#include "paddle/fluid/inference/utils/singleton.h"
-
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-
 #include "paddle/fluid/inference/lite/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/operators/lite/ut_helper.h"
 
 namespace paddle {
@@ -29,9 +27,9 @@ namespace inference {
 namespace lite {
 
 using inference::lite::AddTensorToBlockDesc;
-using paddle::inference::lite::AddFetchListToBlockDesc;
 using inference::lite::CreateTensor;
 using inference::lite::serialize_params;
+using paddle::inference::lite::AddFetchListToBlockDesc;
 
 void make_fake_model(std::string* model, std::string* param) {
   framework::ProgramDesc program;
diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc
index b0c7c7448a50e..09a6cda62b352 100644
--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/lite/tensor_utils.h"
 
@@ -21,9 +22,9 @@ namespace inference {
 namespace lite {
 namespace utils {
 
-using paddle::lite_api::TargetType;
-using paddle::lite_api::PrecisionType;
 using paddle::lite_api::DataLayoutType;
+using paddle::lite_api::PrecisionType;
+using paddle::lite_api::TargetType;
 
 TEST(LiteEngineOp, GetNativePlace) {
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index c713e3a66ac71..abd00ef9de67e 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,11 +1,27 @@
 # Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. Temporarily add paddle_inference_api dependency to solve the problem
 if(WIN32)
-    nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
+  nv_library(
+    tensorrt_engine
+    SRCS engine.cc trt_int8_calibrator.cc
+    DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost
+         paddle_inference_api)
 else()
-    nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
+  nv_library(
+    tensorrt_engine
+    SRCS engine.cc trt_int8_calibrator.cc
+    DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
 endif()
-nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
-nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
-nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
+nv_library(
+  tensorrt_op_teller
+  SRCS op_teller.cc
+  DEPS framework_proto device_context boost)
+nv_test(
+  test_tensorrt
+  SRCS test_tensorrt.cc
+  DEPS dynload_cuda device_context dynamic_loader)
+nv_test(
+  test_tensorrt_engine
+  SRCS test_engine.cc
+  DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(plugin)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 1910e2f6eb906..b27a584de2bfa 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,62 +1,70 @@
 # Add TRT tests
-nv_library(tensorrt_converter
-           SRCS matmul_op.cc
-                conv2d_op.cc
-                fc_op.cc
-                pool2d_op.cc
-                elementwise_op.cc
-                batch_norm_op.cc
-                activation_op.cc
-                unary_op.cc
-                softmax_op.cc
-                concat_op.cc
-                dropout_op.cc
-                group_norm_op.cc
-                pad_op.cc
-                split_op.cc
-                prelu_op.cc
-                leaky_relu_op.cc
-                gelu_op.cc
-                layer_norm_op.cc
-                multihead_matmul_op.cc
-                shuffle_channel_op.cc
-                swish_op.cc
-                instance_norm_op.cc
-                stack_op.cc
-                transpose_op.cc
-                flatten_op.cc
-                flatten_contiguous_range_op.cc
-                emb_eltwise_layernorm.cc
-                skip_layernorm.cc
-                scale_op.cc
-                slice_op.cc
-                hard_sigmoid_op.cc
-                hard_swish_op.cc
-                clip_op.cc
-                gather_op.cc
-                anchor_generator_op.cc
-                yolo_box_op.cc
-                yolo_box_head_op.cc
-                arg_max_op.cc
-                roi_align_op.cc
-                affine_channel_op.cc
-                multiclass_nms_op.cc
-                multiclass_nms3_op.cc
-                nearest_interp_op.cc
-                reshape_op.cc
-                reduce_op.cc
-                gather_nd_op.cc
-                tile_op.cc
-                conv3d_op.cc
-                mish_op.cc
-                nearest_interp_v2_op.cc
-                pool3d_op.cc
-                deformable_conv_op.cc
-                preln_emb_eltwise_layernorm.cc
-		strided_slice_op.cc
-                preln_skip_layernorm.cc
-		roll_op.cc
-           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
+nv_library(
+  tensorrt_converter
+  SRCS matmul_op.cc
+       conv2d_op.cc
+       fc_op.cc
+       pool2d_op.cc
+       elementwise_op.cc
+       batch_norm_op.cc
+       activation_op.cc
+       unary_op.cc
+       softmax_op.cc
+       concat_op.cc
+       dropout_op.cc
+       group_norm_op.cc
+       pad_op.cc
+       split_op.cc
+       prelu_op.cc
+       leaky_relu_op.cc
+       gelu_op.cc
+       layer_norm_op.cc
+       multihead_matmul_op.cc
+       shuffle_channel_op.cc
+       swish_op.cc
+       instance_norm_op.cc
+       stack_op.cc
+       transpose_op.cc
+       flatten_op.cc
+       flatten_contiguous_range_op.cc
+       emb_eltwise_layernorm.cc
+       skip_layernorm.cc
+       scale_op.cc
+       slice_op.cc
+       hard_sigmoid_op.cc
+       hard_swish_op.cc
+       clip_op.cc
+       gather_op.cc
+       anchor_generator_op.cc
+       yolo_box_op.cc
+       yolo_box_head_op.cc
+       arg_max_op.cc
+       roi_align_op.cc
+       affine_channel_op.cc
+       multiclass_nms_op.cc
+       multiclass_nms3_op.cc
+       nearest_interp_op.cc
+       reshape_op.cc
+       reduce_op.cc
+       gather_nd_op.cc
+       tile_op.cc
+       conv3d_op.cc
+       mish_op.cc
+       nearest_interp_v2_op.cc
+       pool3d_op.cc
+       deformable_conv_op.cc
+       preln_emb_eltwise_layernorm.cc
+       strided_slice_op.cc
+       preln_skip_layernorm.cc
+       roll_op.cc
+       transformer_input_convert_op.cc
+       remove_padding_op.cc
+       recover_padding_op.cc
+  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto
+       op_registry)
 
-nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-  paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
+nv_test(
+  test_op_converter
+  SRCS test_op_converter.cc
+  DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine
+       tensorrt_converter)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index b86351e394bd1..2ef8ec16c76df 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <NvInfer.h>
+
 #include <string>
 
 #include "glog/logging.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc
index 2bbe6ea3d2fa8..df6c601500c3b 100644
--- a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <cstdio>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 7a494860e6fa1..ffb32bab52296 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -30,23 +30,28 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-#if IS_TRT_VERSION_GE(6000)
     VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer";
 
     framework::OpDesc op_desc(op, nullptr);
     auto word_id_name = op_desc.Input("WordId").front();
-    auto pos_id_name = op_desc.Input("PosId").front();
+    auto pos_id_name = engine_->tensorrt_transformer_posid();
     engine_->Set("ernie_pos_name", new std::string(pos_id_name));
 
     auto sent_id_name = op_desc.Input("SentId").front();
+    auto mask_id_name = engine_->tensorrt_transformer_maskid();
     auto word_emb_name = op_desc.Input("WordEmbedding").front();
     auto pos_emb_name = op_desc.Input("PosEmbedding").front();
     auto sent_emb_name = op_desc.Input("SentEmbedding").front();
 
     std::vector<std::string> id_names;
     std::vector<std::string> emb_names;
+    bool flag_varseqlen = engine_->use_varseqlen() && !pos_id_name.empty() &&
+                          !mask_id_name.empty();
 
-    if (engine_->use_oss()) {
+    if (flag_varseqlen) {
+      engine_->SetITensor("word_id", engine_->GetITensor(word_id_name));
+      engine_->SetITensor("pos_id", engine_->GetITensor(pos_id_name));
+      engine_->SetITensor("mask_id", engine_->GetITensor(mask_id_name));
       id_names =
           std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
       emb_names =
@@ -106,7 +111,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
     bool enable_int8 = op_desc.HasAttr("enable_int8");
 
-    if (engine_->use_oss()) {
+    if (flag_varseqlen) {
       int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
       if (enable_int8) {
         output_fp16 = 1;
@@ -121,7 +126,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
           output_fp16, 1,
           platform::errors::InvalidArgument(
               "Only Precision::KHalf(fp16) is supported when infering "
-              "ernie(bert) model with config.EnableTensorRtOSS(). "
+              "ernie(bert) model with config.EnableVarseqlen(). "
               "But Precision::KFloat32 is setted."));
       const std::vector<nvinfer1::PluginField> fields{
           {"bert_embeddings_layernorm_beta", bias,
@@ -159,8 +164,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
       plugin_inputs.emplace_back(
           engine_->GetITensor(pos_id_name));  // cu_seqlens,
                                               // eval_placeholder_2
-      auto max_seqlen_tensor =
-          engine_->GetITensor(engine_->network()->getInput(3)->getName());
+      auto max_seqlen_tensor = engine_->GetITensor(mask_id_name);
       auto* shuffle_layer =
           TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor);
       nvinfer1::Dims shape_dim;
@@ -193,8 +197,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
         engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), out_scale);
       }
       if (engine_->with_interleaved()) {
-        VLOG(4)
-            << "fused emb_eltwise_layernorm op: use_oss and with_interleaved";
+        VLOG(4) << "fused emb_eltwise_layernorm op: use_varseqlen and "
+                   "with_interleaved";
         if (!enable_int8) {
           PADDLE_THROW(
               platform::errors::Fatal("use with_interleaved must be int8."));
@@ -229,12 +233,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
       RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name},
                                test_mode);
     }
-
-#else
-    PADDLE_THROW(platform::errors::Fatal(
-        "You are running the TRT Dynamic Shape mode, need to confirm that "
-        "your TRT version is no less than 6.0"));
-#endif
   }
 };
 
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index a631332dae360..bf3170dacc7df 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -250,8 +250,7 @@ class FcOpConverter : public OpConverter {
     }
     // If use tensorrt'oss, the x_dim and x_num_col_dims need change, and can
     // not add Shuffle layer in ernie's multihead.
-    if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 &&
-        x_dim.d[3] == 1 && x_num_col_dims == 2) {
+    if (x_dim.nbDims == 4 && x_num_col_dims == 1) {
       if (enable_int8 || support_int8) {
         // add conv1x1 layer
         nvinfer1::DimsHW nv_ksize(1, 1);
diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc
index e08f50833ed99..c293282b761d3 100644
--- a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc
@@ -50,10 +50,11 @@ class FlattenContiguousRangeOpConverter : public OpConverter {
       for (int i = 0, j = 0; i < dims; ++i) {
         if (start_axis <= i + 1 && i + 1 <= stop_axis) {
           int dim_i = input_dim.d[i];
-          PADDLE_ENFORCE_GT(dim_i, 0, platform::errors::InvalidArgument(
-                                          "flatten_contiguous_range input dim "
-                                          "should be > 0, but got %d.",
-                                          dim_i));
+          PADDLE_ENFORCE_GT(dim_i, 0,
+                            platform::errors::InvalidArgument(
+                                "flatten_contiguous_range input dim "
+                                "should be > 0, but got %d.",
+                                dim_i));
           dim_prod *= dim_i;
           if (i + 1 == stop_axis) {
             flatten_dim.d[j++] = dim_prod;
diff --git a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc
index 910a807d3626a..2a62f9009e209 100644
--- a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
index b468518fa5a3c..02e9610ea1ec4 100644
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
@@ -13,15 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
+
 #include <cuda.h>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-using platform::is_gpu_place;
 using platform::is_cpu_place;
+using platform::is_gpu_place;
 
 class DefaultIOConverter : public EngineIOConverter {
  public:
@@ -49,8 +51,9 @@ class DefaultIOConverter : public EngineIOConverter {
           out, in.data<float>(), size, cudaMemcpyHostToDevice, *stream_));
     } else if (is_gpu_place(place)) {
       PADDLE_ENFORCE_EQ(
-          0, cudaMemcpyAsync(out, in.data<float>(), size,
-                             cudaMemcpyDeviceToDevice, *stream_),
+          0,
+          cudaMemcpyAsync(out, in.data<float>(), size, cudaMemcpyDeviceToDevice,
+                          *stream_),
           platform::errors::External(
               "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error."));
     } else {
@@ -78,14 +81,16 @@ class DefaultIOConverter : public EngineIOConverter {
             "But out's memory_size = %u, max_size = %u.",
             size, max_size));
     if (is_cpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
-                                           cudaMemcpyDeviceToHost, *stream_),
+      PADDLE_ENFORCE_EQ(0,
+                        cudaMemcpyAsync(out->data<float>(), in, size,
+                                        cudaMemcpyDeviceToHost, *stream_),
                         platform::errors::External(
                             "cudaMemcpyAsync(cudaMemcpyDeviceToHost) error."));
     } else if (is_gpu_place(place)) {
       PADDLE_ENFORCE_EQ(
-          0, cudaMemcpyAsync(out->data<float>(), in, size,
-                             cudaMemcpyDeviceToDevice, *stream_),
+          0,
+          cudaMemcpyAsync(out->data<float>(), in, size,
+                          cudaMemcpyDeviceToDevice, *stream_),
           platform::errors::External(
               "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error."));
     } else {
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h
index 58c178028b8b2..3ff78a6dc7a3b 100644
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc
index a968ea2a2c484..ae39267533928 100644
--- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
index b0d67a5bf90ca..d630f7e9967a7 100644
--- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index 4b4ad01f5674a..f06554e7ebb41 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -76,12 +76,14 @@ class MultiheadMatMulOpConverter : public OpConverter {
 
     nvinfer1::ILayer* layer = nullptr;
     auto output_name = op_desc.Output("Out")[0];
-
+    bool flag_varseqlen = engine_->use_varseqlen() &&
+                          engine_->tensorrt_transformer_posid() != "" &&
+                          engine_->tensorrt_transformer_maskid() != "";
     if (engine_->with_dynamic_shape()) {
-      if (engine_->use_oss()) {
+      if (flag_varseqlen) {
         if (engine_->precision() == AnalysisConfig::Precision::kFloat32) {
           PADDLE_THROW(platform::errors::Fatal(
-              "use use_oss must be int8 or half, not float32."));
+              "use use_varseqlen must be int8 or half, not float32."));
         }
         nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
@@ -90,7 +92,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                static_cast<void*>(bias_data),
                                static_cast<int32_t>(bias_t->numel())};
         if (engine_->with_interleaved()) {
-          VLOG(4) << "fused multihead_matmul op: use_oss and with_interleaved";
+          VLOG(4) << "fused multihead_matmul op: use_varseqlen and "
+                     "with_interleaved";
           if (!op_desc.HasAttr("Input_scale")) {
             PADDLE_THROW(
                 platform::errors::Fatal("use with_interleaved must be int8."));
@@ -233,9 +236,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
                   BOOST_GET_CONST(float, op_desc.GetAttr("dp_probs")) / 127.0;
             }
           }
-
-          auto mask_tensor = engine_->GetITensor("qkv_plugin_mask");
-
           auto creator = GetPluginRegistry()->getPluginCreator(
               "CustomQKVToContextPluginDynamic", "2");
           assert(creator != nullptr);
@@ -272,18 +272,10 @@ class MultiheadMatMulOpConverter : public OpConverter {
 
           std::vector<nvinfer1::ITensor*> plugin_inputs;
           plugin_inputs.emplace_back(fc_layer->getOutput(0));
-          plugin_inputs.emplace_back(mask_tensor);
-          if (engine_->Has("ernie_pos_name")) {
-            plugin_inputs.emplace_back(engine_->GetITensor(
-                engine_->Get<std::string>("ernie_pos_name")));
-          } else {
-            plugin_inputs.emplace_back(engine_->GetITensor(
-                engine_->network()
-                    ->getInput(2)
-                    ->getName()));  // cu_seqlens, eval_placeholder_2
-          }
-          auto max_seqlen_tensor =
-              engine_->GetITensor(engine_->network()->getInput(3)->getName());
+          plugin_inputs.emplace_back(engine_->GetITensor("qkv_plugin_mask"));
+          plugin_inputs.emplace_back(engine_->GetITensor("pos_id"));
+
+          auto max_seqlen_tensor = engine_->GetITensor("mask_id");
           auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
               engine_, Shuffle,
               *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 0a99b12edc25c..077ba32ba89c1 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
@@ -268,14 +269,16 @@ class OpConverter {
           }
         }
         engine->DeclareInput(
-            input, FluidDataType2TRT(
-                       var->Proto()->type().lod_tensor().tensor().data_type()),
+            input,
+            FluidDataType2TRT(
+                var->Proto()->type().lod_tensor().tensor().data_type()),
             Vec2TRT_Dims(input_shape, input, true));
 #endif
       } else {
         engine->DeclareInput(
-            input, FluidDataType2TRT(
-                       var->Proto()->type().lod_tensor().tensor().data_type()),
+            input,
+            FluidDataType2TRT(
+                var->Proto()->type().lod_tensor().tensor().data_type()),
             Vec2TRT_Dims(var_shape, input));
       }
     }
diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
index 87fdbb71a3faf..4ee8db7c69d62 100644
--- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
@@ -32,7 +32,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
 #if IS_TRT_VERSION_GE(7000)
     VLOG(4) << "convert fluid PrelnEmbEltwiseLayerNorm op to tensorrt layer";
 
-    if (!(engine_->use_oss() && engine_->with_interleaved())) {
+    if (!(engine_->use_varseqlen() && engine_->with_interleaved())) {
       PADDLE_THROW(platform::errors::Fatal(
           "PrelnErnie: If you want to use oss, must be with interleaved"));
     }
diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc
index 8053135cc452c..1e9aec29e347a 100644
--- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc
@@ -24,7 +24,7 @@ class PrelnSkipLayerNormOpConverter : public OpConverter {
                   const framework::Scope& scope, bool test_mode) override {
 #if IS_TRT_VERSION_GE(7000)
     VLOG(4) << "convert fused preln_skip_layernorm op to tensorrt layer";
-    if (!(engine_->use_oss() && engine_->with_interleaved())) {
+    if (!(engine_->use_varseqlen() && engine_->with_interleaved())) {
       PADDLE_THROW(platform::errors::Fatal(
           "PrelnErnie: If you want to use oss, must be with interleaved"));
     }
@@ -60,7 +60,8 @@ class PrelnSkipLayerNormOpConverter : public OpConverter {
 
     nvinfer1::ILayer* layer = nullptr;
 
-    VLOG(4) << "fused preln_skip_layernorm op: use_oss and with_interleaved";
+    VLOG(4)
+        << "fused preln_skip_layernorm op: use_varseqlen and with_interleaved";
 
     auto creator = GetPluginRegistry()->getPluginCreator(
         "CustomSkipLayerNormPluginDynamic", "4");
diff --git a/paddle/fluid/inference/tensorrt/convert/recover_padding_op.cc b/paddle/fluid/inference/tensorrt/convert/recover_padding_op.cc
new file mode 100644
index 0000000000000..8f996e1d0f8bc
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/recover_padding_op.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Recover padding of transformer's output (VarSeqlen -> Padding).
+ */
+class RecoverPadding : public OpConverter {
+ public:
+  // Adds a RecoverPaddingPlugin layer for `op`. Throws a Fatal error when
+  // engine_->with_dynamic_shape() is false. `scope` is not used here.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "Recover padding of transformer'output: VarSeqlen -> Padding.";
+    if (!engine_->with_dynamic_shape()) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "recover_padding_op: If you want to use transformer, must "
+          "be with dynamic shape"));
+    }
+
+    framework::OpDesc op_desc(op, nullptr);
+    auto input_name = op_desc.Input("Input").front();
+    // Log through glog rather than printing to stdout on every conversion.
+    VLOG(3) << "input_name: " << input_name;
+
+    // Plugin inputs: the op input plus the tensors looked up on the engine
+    // by the names "pos_id" and "mask_id".
+    std::vector<nvinfer1::ITensor*> plugin_inputs;
+    plugin_inputs.push_back(engine_->GetITensor(input_name));
+    plugin_inputs.push_back(engine_->GetITensor("pos_id"));
+    plugin_inputs.push_back(engine_->GetITensor("mask_id"));
+    // Derive the count from the vector so it cannot drift from the inputs.
+    int input_num = static_cast<int>(plugin_inputs.size());
+    auto output_name = op_desc.Output("Out").front();
+
+    plugin::RecoverPaddingPlugin* plugin = new plugin::RecoverPaddingPlugin();
+    nvinfer1::ILayer* layer =
+        engine_->AddDynamicPlugin(plugin_inputs.data(), input_num, plugin);
+
+    RreplenishLayerAndOutput(layer, "recover_padding", {output_name},
+                             test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(recover_padding, RecoverPadding);
diff --git a/paddle/fluid/inference/tensorrt/convert/remove_padding_op.cc b/paddle/fluid/inference/tensorrt/convert/remove_padding_op.cc
new file mode 100644
index 0000000000000..49d5edbbd4e02
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/remove_padding_op.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Converter for the remove_padding op (Padding -> VarSeqlen).
+ */
+class RemovePadding : public OpConverter {
+ public:
+  // Adds a RemovePaddingPlugin layer for `op`; requires dynamic shape.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "Remove padding of transformer'input: Padding -> VarSeqlen";
+    if (!engine_->with_dynamic_shape()) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "remove_padding_op: If you want to use transformer, must "
+          "be with dynamic shape"));
+    }
+
+    framework::OpDesc op_desc(op, nullptr);
+    const auto in_name = op_desc.Input("Input").front();
+
+    // Gather the plugin inputs; each tensor is looked up on the engine by name.
+    std::vector<nvinfer1::ITensor*> tensors{
+        engine_->GetITensor(in_name), engine_->GetITensor("pos_id"),
+        engine_->GetITensor("word_id")};
+    const auto out_name = op_desc.Output("Out").front();
+
+    auto* padding_plugin = new plugin::RemovePaddingPlugin();
+    nvinfer1::ILayer* plugin_layer = engine_->AddDynamicPlugin(
+        tensors.data(), tensors.size(), padding_plugin);
+
+    RreplenishLayerAndOutput(plugin_layer, "remove_padding_op", {out_name},
+                             test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(remove_padding, RemovePadding);
diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
index 831e117311771..6f65e27192319 100644
--- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
@@ -52,10 +52,13 @@ class SkipLayerNormOpConverter : public OpConverter {
     bool enable_int8 = op_desc.HasAttr("enable_int8");
 
     nvinfer1::ILayer* layer = nullptr;
-
-    if (engine_->use_oss()) {
+    bool flag_varseqlen = engine_->use_varseqlen() &&
+                          engine_->tensorrt_transformer_posid() != "" &&
+                          engine_->tensorrt_transformer_maskid() != "";
+    if (flag_varseqlen) {
       if (engine_->with_interleaved()) {
-        VLOG(4) << "fused skip_layernorm op: use_oss and with_interleaved";
+        VLOG(4)
+            << "fused skip_layernorm op: use_varseqlen and with_interleaved";
         if (!enable_int8) {
           PADDLE_THROW(
               platform::errors::Fatal("use with_interleaved must be int8."));
diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
index dea9a1ec3d76d..fa6f488940365 100644
--- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h"
-#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h"
 
 namespace paddle {
 namespace inference {
@@ -74,47 +73,12 @@ class SliceOpConverter : public OpConverter {
 
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
-      if (engine_->use_oss() && engine_->with_ernie() &&
-          input_dims.nbDims == 4) {
-        std::vector<nvinfer1::ITensor*> plugin_inputs;
-        if (engine_->with_interleaved()) {
-          auto* shuffler_slice = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-          nvinfer1::Permutation transpose_embed{2, 1, 0, 3};
-          shuffler_slice->setSecondTranspose(transpose_embed);
-          engine_->SetTensorDynamicRange(shuffler_slice->getOutput(0),
-                                         out_scale);
-          shuffler_slice->setName(
-              ("SpecialSlice_interleaved: transpose: (Output: " + output_name +
-               ")")
-                  .c_str());
-          plugin_inputs.emplace_back(shuffler_slice->getOutput(0));
-        } else {
-          plugin_inputs.emplace_back(input);
-        }
-        std::string pos_name;
-        if (engine_->Has("ernie_pos_name")) {
-          pos_name = engine_->Get<std::string>("ernie_pos_name");
-        } else {
-          // hard code for compatibility
-          pos_name = engine_->network()->getInput(2)->getName();
-        }
-        plugin_inputs.emplace_back(
-            engine_->GetITensor(pos_name));  // cu_seqlens, eval_placeholder_2
-
-        // bool ban_fp16 = engine_->disable_trt_plugin_fp16();
-        plugin::SpecialSlicePluginDynamic* plugin =
-            new plugin::SpecialSlicePluginDynamic();
-        layer = engine_->AddDynamicPlugin(plugin_inputs.data(),
-                                          plugin_inputs.size(), plugin);
-      } else {
-        bool with_fp16 =
-            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
-        int decrease_axis =
-            decrease_axises.size() == 0 ? -1 : decrease_axises[0];
-        plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(
-            starts, ends, axes, decrease_axis, with_fp16);
-        layer = engine_->AddDynamicPlugin(&input, 1, plugin);
-      }
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+      int decrease_axis = decrease_axises.size() == 0 ? -1 : decrease_axises[0];
+      plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(
+          starts, ends, axes, decrease_axis, with_fp16);
+      layer = engine_->AddDynamicPlugin(&input, 1, plugin);
     } else {
       bool with_fp16 =
           engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
index 46e6c18bfb8e3..66acee964cdbc 100644
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 1ad82df41737c..7a034f2c166dd 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc
index 92e34e48bdb29..caa9e9ee2898d 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
index 6c876964297f9..b1319312adfe0 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
index a856d14144469..0b9f4a5fd84db 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
index cf37739608763..2d77b9b32db2c 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
@@ -12,6 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
index 9c6ea51fe5a35..5221843db19d8 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index 8134d389469cb..4647521dd32b0 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
index 8f91309a0a00d..a2fe32b75f3de 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
index f17e00de0eeb7..f7984dd0ab750 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc
index c84c30255fa96..d2dbb7fb5920c 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index 86cb7543d42da..35b8fe1ee6ad7 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc
index f5ab6a9924931..96b14c4e40cb0 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9bfae64fe80e3..9a4d4db3435a2 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
 #include <gtest/gtest.h>  // NOLINT
 
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
index ba35d7ddbb2f4..a8e36f827d8e3 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
index 36f13262a73d7..b917aa865d28f 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -12,7 +12,9 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 #include <gtest/gtest.h>
+
 #include <fstream>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
index f2541ff7c0b5e..d71cf051972d1 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
index 3ebb51afdf44f..b5e640ea24412 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
index 9cd5e81141598..babe682ab4e48 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
@@ -12,6 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
index 3b6a4a80044eb..1d23aeedc5a8d 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
index 7a5a886affed3..94ca6f0ed4627 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc
new file mode 100644
index 0000000000000..045a5d163ca51
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/* Converter for the transformer_input_convert op: adds a
+ * TransformerInputConvertPlugin layer (via AddDynamicPlugin) fed by the op's
+ * "Input" and passes output names pos_id_tensor / max_seqlen_tensor on. */
+class TransformerInputConvert : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "Convert Transformer Input(pos_id, max_seqlen), use "
+               "transformer_input_convert_plugin";
+    if (!engine_->with_dynamic_shape()) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "transformer_input_convert_op: If you want to use transformer, must "
+          "be with dynamic shape"));
+    }
+
+    framework::OpDesc op_desc(op, nullptr);
+    auto input_name = op_desc.Input("Input").front();
+    auto* input = engine_->GetITensor(input_name);
+    int input_num = op_desc.Input("Input").size();
+
+    // Output names are hard-coded instead of being read from the op's "PosId"
+    // and "MaxSeqlen" outputs because, per the original author's note,
+    // tensorrt_subgraph_pass will rename those tensors.
+    auto pos_id_name = "pos_id_tensor";
+    auto max_seqlen_name = "max_seqlen_tensor";
+    // NOTE(review): &input is one pointer; assumes input_num == 1 - confirm.
+    plugin::TransformerInputConvertPlugin* plugin =
+        new plugin::TransformerInputConvertPlugin();
+    nvinfer1::ILayer* layer =
+        engine_->AddDynamicPlugin(&input, input_num, plugin);
+
+    RreplenishLayerAndOutput(layer, "transformer_input_convert",
+                             {pos_id_name, max_seqlen_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(transformer_input_convert, TransformerInputConvert);
diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc
index aa3d38ebe2073..72d5cb2aeb4d3 100644
--- a/paddle/fluid/inference/tensorrt/convert/unary_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <NvInfer.h>
+
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
index 17d217dff43fd..f5ab63daa88df 100644
--- a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 00a6b2ffbf923..7f308fd3a04d5 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <NvInfer.h>
 #include <glog/logging.h>
+
 #include <string>
 
 #include "cuda_runtime_api.h"  // NOLINT
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index f781cd0cb3a8d..b28fe827156c3 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <NvInfer.h>
+
 #include <map>
 #include <memory>
 #include <mutex>  // NOLINT
@@ -151,7 +152,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
     return dims;
   }
 }
-}  // NOLINT
+}  // namespace
 
 class TRTInt8Calibrator;
 
@@ -410,14 +411,19 @@ class TensorRTEngine {
     suffix_counter += 1;
   }
 
-  void SetUseOSS(bool use_oss) { use_oss_ = use_oss; }
+  void SetUseOSS(bool use_varseqlen) { use_varseqlen_ = use_varseqlen; }
   void SetUseDLA(bool use_dla) { use_dla_ = use_dla; }
   void SetDLACore(int dla_core) { dla_core_ = dla_core; }
   void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; }
   void SetWithInterleaved(bool with_interleaved) {
     with_interleaved_ = with_interleaved;
   }
-
+  void SetTransformerPosid(std::string tensorrt_transformer_posid) {
+    tensorrt_transformer_posid_ = tensorrt_transformer_posid;
+  }
+  void SetTransformerMaskid(std::string tensorrt_transformer_maskid) {
+    tensorrt_transformer_maskid_ = tensorrt_transformer_maskid;
+  }
   void ClearWeights() {
     for (auto& weight_pair : weight_map) {
       weight_pair.second.reset(nullptr);
@@ -488,9 +494,15 @@ class TensorRTEngine {
     return ret;
   }
 
-  bool use_oss() { return use_oss_; }
+  bool use_varseqlen() { return use_varseqlen_; }
   bool with_ernie() { return with_ernie_; }
   bool with_interleaved() { return with_interleaved_; }
+  std::string tensorrt_transformer_posid() {
+    return tensorrt_transformer_posid_;
+  }
+  std::string tensorrt_transformer_maskid() {
+    return tensorrt_transformer_maskid_;
+  }
   bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; }
   bool with_dynamic_shape() { return with_dynamic_shape_; }
   AnalysisConfig::Precision precision() { return precision_; }
@@ -612,11 +624,13 @@ class TensorRTEngine {
   ShapeMapType max_input_shape_;
   ShapeMapType optim_input_shape_;
   bool disable_trt_plugin_fp16_{false};
-  bool use_oss_{false};
+  bool use_varseqlen_{false};
   bool use_dla_{false};
   int dla_core_{0};
   bool with_ernie_{false};
   bool with_interleaved_{false};
+  std::string tensorrt_transformer_posid_;
+  std::string tensorrt_transformer_maskid_;
   nvinfer1::ILogger& logger_;
 
   // max data size for the buffers.
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index b8051d8610442..e283000cdace5 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -17,9 +17,11 @@
 #include <NvInfer.h>
 #include <cuda.h>
 #include <glog/logging.h>
+
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/dynload/tensorrt.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 690bc173c77cf..dc7c77bc66acf 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+
 #include <bitset>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/data_layout.h"
 
@@ -125,7 +127,10 @@ struct SimpleOpTypeSetTeller : public Teller {
       "strided_slice",
       "fused_preln_embedding_eltwise_layernorm",
       "roll",
-      "preln_skip_layernorm"};
+      "preln_skip_layernorm",
+      "transformer_input_convert",
+      "recover_padding",
+      "remove_padding"};
   std::unordered_set<std::string> teller_set{
       "mul",
       "matmul",
@@ -194,7 +199,10 @@ struct SimpleOpTypeSetTeller : public Teller {
       "fused_preln_embedding_eltwise_layernorm",
       "preln_skip_layernorm",
       "roll",
-      "multiclass_nms3"};
+      "multiclass_nms3",
+      "transformer_input_convert",
+      "recover_padding",
+      "remove_padding"};
 };
 
 bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h
index 0a0cbeae51b02..40f1a0055c78b 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index ff6a1cd60f720..0377c82838bdd 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1,20 +1,35 @@
-nv_library(tensorrt_plugin
-           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
-           prelu_op_plugin.cu gelu_op_plugin.cu
-           pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu
-           instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu
-           qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu
-           hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu
-           anchor_generator_op_plugin.cu
-           yolo_box_op_plugin.cu
-           yolo_box_head_op_plugin.cu
-           roi_align_op_plugin.cu
-           gather_nd_op_plugin.cu
-           mish_op_plugin.cu
-           pool3d_op_plugin.cu
-           deformable_conv_op_plugin.cu
-           matmul_op_int8_plugin.cu
-	   DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
+nv_library(
+  tensorrt_plugin
+  SRCS trt_plugin.cc
+       split_op_plugin.cu
+       elementwise_op_plugin.cu
+       prelu_op_plugin.cu
+       gelu_op_plugin.cu
+       pool_op_plugin.cu
+       swish_op_plugin.cu
+       layer_norm_op_plugin.cu
+       instance_norm_op_plugin.cu
+       emb_eltwise_layernorm_plugin.cu
+       qkv_to_context_plugin.cu
+       skip_layernorm_op_plugin.cu
+       slice_op_plugin.cu
+       hard_swish_op_plugin.cu
+       stack_op_plugin.cu
+       anchor_generator_op_plugin.cu
+       yolo_box_op_plugin.cu
+       yolo_box_head_op_plugin.cu
+       roi_align_op_plugin.cu
+       gather_nd_op_plugin.cu
+       mish_op_plugin.cu
+       pool3d_op_plugin.cu
+       deformable_conv_op_plugin.cu
+       matmul_op_int8_plugin.cu
+       transformer_input_convert_plugin.cu
+       remove_padding_plugin.cu
+       recover_padding_plugin.cu
+  DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
 
-nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
-  paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin)
+nv_test(
+  test_split_plugin
+  SRCS test_split_plugin.cc
+  DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin)
diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu
index e5584f2658067..a339f880ac388 100644
--- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu
@@ -14,6 +14,7 @@
 
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+
 #include <algorithm>
 #include <cassert>
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index 6128f8f0e4134..7ea664ded66f2 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+
 #include <algorithm>
 #include <cstdio>
 
@@ -88,9 +89,10 @@ DeformableConvPlugin::DeformableConvPlugin(
   dilations_.insert(dilations_.end(), dilations.cbegin(), dilations.cend());
   PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT ||
                         data_type_ == nvinfer1::DataType::kHALF,
-                    true, platform::errors::InvalidArgument(
-                              "The DeformableConv TRT Plugin's input type "
-                              "should be float or half."));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The DeformableConv TRT Plugin's input type "
+                        "should be float or half."));
   PADDLE_ENFORCE_EQ(
       paddings_.size(), strides_.size(),
       platform::errors::InvalidArgument(
@@ -124,9 +126,10 @@ DeformableConvPlugin::DeformableConvPlugin(
   output_dim_.insert(output_dim_.end(), output_dim.cbegin(), output_dim.cend());
   PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT ||
                         data_type_ == nvinfer1::DataType::kHALF,
-                    true, platform::errors::InvalidArgument(
-                              "The DeformableConv TRT Plugin's input type "
-                              "should be float or half."));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The DeformableConv TRT Plugin's input type "
+                        "should be float or half."));
   PADDLE_ENFORCE_EQ(
       paddings_.size(), strides_.size(),
       platform::errors::InvalidArgument(
@@ -363,13 +366,11 @@ __global__ void ModulatedDeformableIm2colGpuKernel<float>(
     const float* data_im_ptr =
         data_im + (b_col * num_channels + c_im) * height * width;
     const float* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const float* data_mask_ptr =
-        data_mask +
-        (b_col * deformable_group + deformable_group_index) * kernel_h *
-            kernel_w * height_col * width_col;
+        data_mask + (b_col * deformable_group + deformable_group_index) *
+                        kernel_h * kernel_w * height_col * width_col;
 
     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
@@ -432,13 +433,11 @@ __global__ void ModulatedDeformableIm2colGpuKernel<half>(
     const half* data_im_ptr =
         data_im + (b_col * num_channels + c_im) * height * width;
     const half* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const half* data_mask_ptr =
-        data_mask +
-        (b_col * deformable_group + deformable_group_index) * kernel_h *
-            kernel_w * height_col * width_col;
+        data_mask + (b_col * deformable_group + deformable_group_index) *
+                        kernel_h * kernel_w * height_col * width_col;
 
     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
index 1070a88cee737..5f4abee2838f7 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <glog/logging.h>
+
 #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
 
 namespace paddle {
@@ -67,14 +68,16 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
 
 nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
     int index, const nvinfer1::Dims *input_dims, int num_inputs) TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "There is only one output in TRT elementwise "
-                                  "op plugin, but got output index: %d.",
-                                  index));
-  PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
-                                       "There are 2 inputs in TRT elementwise "
-                                       "op plugin, but got input number: %d.",
-                                       num_inputs));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "There is only one output in TRT elementwise "
+                        "op plugin, but got output index: %d.",
+                        index));
+  PADDLE_ENFORCE_EQ(
+      num_inputs, 2,
+      platform::errors::InvalidArgument("There are 2 inputs in TRT elementwise "
+                                        "op plugin, but got input number: %d.",
+                                        num_inputs));
   PADDLE_ENFORCE_NOT_NULL(
       input_dims,
       platform::errors::InvalidArgument(
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
index aa1ab5389a572..51fc1bebd90be 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 82f4420a2a04c..6c7530cdc1f05 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include <stdio.h>
+
 #include <cassert>
 #include <cub/cub.cuh>  // NOLINT
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -253,10 +255,11 @@ nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
   PADDLE_ENFORCE_EQ(
-      index, 0, platform::errors::InvalidArgument(
-                    "The EmbEltwiseLayernorm Plugin only has one input, so the "
-                    "index value should be 0, but get %d.",
-                    index));
+      index, 0,
+      platform::errors::InvalidArgument(
+          "The EmbEltwiseLayernorm Plugin only has one input, so the "
+          "index value should be 0, but get %d.",
+          index));
   if (with_fp16_)
     return nvinfer1::DataType::kHALF;
   else
diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
index 841fb2f6fe399..f27b66b03f544 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
@@ -15,9 +15,11 @@
 #pragma once
 
 #include <thrust/device_vector.h>
+
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
index 08b259e0f952e..cba1bb04c3654 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
@@ -15,6 +15,7 @@
 #include <cassert>
 #include <cstring>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -112,15 +113,15 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
     const float* input = static_cast<const float*>(inputs[0]);
     float* output = static_cast<float*>(outputs[0]);
-    gelu_kernel<float, block_size><<<grid_size, block_size, 0, stream>>>(
-        kA, num, input, output);
+    gelu_kernel<float, block_size>
+        <<<grid_size, block_size, 0, stream>>>(kA, num, input, output);
   } else if (type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
     const half* input = static_cast<const half*>(inputs[0]);
     half* output = static_cast<half*>(outputs[0]);
-    no_exact_gelu_kernel<half,
-                         block_size><<<grid_size, block_size, 0, stream>>>(
-        kAT, kBT, kCT, num, input, output);
+    no_exact_gelu_kernel<half, block_size>
+        <<<grid_size, block_size, 0, stream>>>(kAT, kBT, kCT, num, input,
+                                               output);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The Gelu TRT Plugin's input type should be float or half."));
@@ -170,10 +171,11 @@ bool GeluPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType GeluPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType* input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Gelu Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Gelu Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   return input_types[0];
 }
 
@@ -192,15 +194,15 @@ int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
     const float* input = static_cast<const float*>(inputs[0]);
     float* output = static_cast<float*>(outputs[0]);
-    gelu_kernel<float, block_size><<<grid_size, block_size, 0, stream>>>(
-        kA, num, input, output);
+    gelu_kernel<float, block_size>
+        <<<grid_size, block_size, 0, stream>>>(kA, num, input, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
     const half* input = static_cast<const half*>(inputs[0]);
     half* output = static_cast<half*>(outputs[0]);
-    no_exact_gelu_kernel<half,
-                         block_size><<<grid_size, block_size, 0, stream>>>(
-        kAT, kBT, kCT, num, input, output);
+    no_exact_gelu_kernel<half, block_size>
+        <<<grid_size, block_size, 0, stream>>>(kAT, kBT, kCT, num, input,
+                                               output);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The Gelu TRT Plugin's input type should be float or half."));
diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
index 7efdd2798b264..8436ccad78a2c 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
@@ -14,9 +14,11 @@
 
 #pragma once
 #include <stdio.h>
+
 #include <cassert>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu
index 9872b1ff8d957..05ed76bd3c983 100644
--- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu
@@ -14,6 +14,7 @@
 
 #include <cassert>
 #include <cstring>
+
 #include "paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h
index 475c908c13bbf..b1e693799bd77 100644
--- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h
@@ -14,9 +14,11 @@
 
 #pragma once
 #include <stdio.h>
+
 #include <cassert>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
index 03686aefc1370..9acd688f707a3 100644
--- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include <stdio.h>
+
 #include <cassert>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
index 67d44184a76d0..16e2a284d4bf2 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include <stdio.h>
+
 #include <cassert>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
 #include "paddle/phi/kernels/layer_norm_kernel.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
index 9e8ce30283373..42dfa2b8aa02b 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
@@ -17,6 +17,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h
index be8f1c418fc7f..9ca6ff29240d4 100644
--- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 #include <cassert>
-
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu
index 6e268e7b0b330..f655d23e62810 100644
--- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <cstring>
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h"
 
@@ -38,11 +39,12 @@ bool MishPlugin::supportsFormat(
 nvinfer1::Dims MishPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims* in_dims,
                                                int nb_inputs) TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument(
-                                      "We expect [number of inputs] == 1"
-                                      "in TRT Mish op plugin, but got "
-                                      "[number of inputs] = %d.",
-                                      nb_inputs));
+  PADDLE_ENFORCE_EQ(
+      nb_inputs, 1,
+      platform::errors::InvalidArgument("We expect [number of inputs] == 1"
+                                        "in TRT Mish op plugin, but got "
+                                        "[number of inputs] = %d.",
+                                        nb_inputs));
   PADDLE_ENFORCE_LT(index, this->getNbOutputs(),
                     platform::errors::InvalidArgument(
                         "We expect [index] < [number of outputs]"
@@ -123,14 +125,14 @@ int MishPlugin::enqueue(int batchSize, const void* const* inputs,
     VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32";
     const float* input = static_cast<const float*>(inputs[0]);
     float* output = static_cast<float*>(outputs[0]);
-    mish_kernel<float><<<grid_size, block_size, 0, stream>>>(threshold_, num,
-                                                             input, output);
+    mish_kernel<float>
+        <<<grid_size, block_size, 0, stream>>>(threshold_, num, input, output);
   } else if (type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16";
     const half* input = static_cast<const half*>(inputs[0]);
     half* output = static_cast<half*>(outputs[0]);
-    mish_kernel<half><<<grid_size, block_size, 0, stream>>>(threshold_, num,
-                                                            input, output);
+    mish_kernel<half>
+        <<<grid_size, block_size, 0, stream>>>(threshold_, num, input, output);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The Mish TRT Plugin's input type should be float or half."));
@@ -192,10 +194,11 @@ bool MishPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType MishPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType* input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Mish Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Mish Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   return input_types[0];
 }
 
@@ -214,14 +217,14 @@ int MishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
     VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32";
     const float* input = static_cast<const float*>(inputs[0]);
     float* output = static_cast<float*>(outputs[0]);
-    mish_kernel<float><<<grid_size, block_size, 0, stream>>>(threshold_, num,
-                                                             input, output);
+    mish_kernel<float>
+        <<<grid_size, block_size, 0, stream>>>(threshold_, num, input, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16";
     const half* input = static_cast<const half*>(inputs[0]);
     half* output = static_cast<half*>(outputs[0]);
-    mish_kernel<half><<<grid_size, block_size, 0, stream>>>(threshold_, num,
-                                                            input, output);
+    mish_kernel<half>
+        <<<grid_size, block_size, 0, stream>>>(threshold_, num, input, output);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The Mish TRT Plugin's input type should be float or half."));
diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h
index 75390666ea097..fdef7b93f32fd 100644
--- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h
@@ -14,8 +14,10 @@
 
 #pragma once
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
index 5596a89a083fe..40cb2b88e711c 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu
@@ -70,10 +70,11 @@ nvinfer1::Dims Pool3DPlugin::getOutputDimensions(
                         "The Pool3D Plugin only has one input, so the nbInputs "
                         "value should be 1, but get %d.",
                         nbInputs));
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Pool3D Plugin only has one input, so "
-                                  "the index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Pool3D Plugin only has one input, so "
+                        "the index value should be 0, but get %d.",
+                        index));
   PADDLE_ENFORCE_EQ(inputDims[0].nbDims, 4,
                     platform::errors::InvalidArgument(
                         "The Pool3D Plugin only has four Dimensions, so the "
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h
index 7c9a8625d70f3..d54ce067e5ef3 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h
@@ -14,9 +14,11 @@
 
 #pragma once
 #include <stdio.h>
+
 #include <cassert>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
index 9bfe98d759d8e..80f7e349dac4a 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
@@ -240,10 +240,11 @@ bool PoolPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType PoolPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Pool Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Pool Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true,
                     platform::errors::InvalidArgument(
                         "The input type should be half or float"));
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h
index d1bf2cd02e84f..155d69cc45784 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h
@@ -14,9 +14,11 @@
 
 #pragma once
 #include <stdio.h>
+
 #include <cassert>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index 1ea2b8b5f6ec4..72c1d546e9a2e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -144,10 +144,11 @@ bool PReluPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType PReluPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The PRelu Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The PRelu Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true,
                     platform::errors::InvalidArgument(
                         "The input type should be half or float"));
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
index e0a77de6f5491..0025e1ee5b436 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
@@ -17,9 +17,9 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index e2f1aab9b6460..d3da5d7225d33 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include <stdio.h>
+
 #include <cassert>
 #include <cub/cub.cuh>  // NOLINT
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -103,8 +105,8 @@ inline void TransposeQKV(const int batch, const int seq_len,
                       platform::errors::InvalidArgument(
                           "head_num (%d) * head_size (%d) should <= %d",
                           head_num, head_size, 1024));
-    TransposeQkvKernel<float><<<grid, block, 0, stream>>>(head_size, input,
-                                                          output);
+    TransposeQkvKernel<float>
+        <<<grid, block, 0, stream>>>(head_size, input, output);
   }
 }
 
@@ -142,8 +144,8 @@ inline void TransposeQKV(const int batch, const int seq_len,
                       platform::errors::InvalidArgument(
                           "head_num (%d) * head_size (%d) should <= %d",
                           head_num, head_size, 1024));
-    TransposeQkvKernel<half><<<grid, block, 0, stream>>>(head_size, input,
-                                                         output);
+    TransposeQkvKernel<half>
+        <<<grid, block, 0, stream>>>(head_size, input, output);
   }
 }
 
@@ -218,10 +220,11 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
   PADDLE_ENFORCE_EQ(
-      index, 0, platform::errors::InvalidArgument(
-                    "The EmbEltwiseLayernorm Plugin only has one input, so the "
-                    "index value should be 0, but get %d.",
-                    index));
+      index, 0,
+      platform::errors::InvalidArgument(
+          "The EmbEltwiseLayernorm Plugin only has one input, so the "
+          "index value should be 0, but get %d.",
+          index));
   return input_types[0];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
new file mode 100644
index 0000000000000..515e01f40538c
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
@@ -0,0 +1,120 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+__global__ void RecoverPaddingKernel(const float* input0, const int32_t* input1,
+                                     float* output) {
+  // input1[b]..input1[b + 1] brackets the packed rows of sequence b = blockIdx.x;
+  // padded slots (blockIdx.y at or past that row count) are written as zero.
+  const int32_t first_row = input1[blockIdx.x];
+  const int32_t row_count = input1[blockIdx.x + 1] - first_row;
+  const unsigned int row_width = gridDim.z * blockDim.x;
+  const unsigned int column = blockIdx.z * blockDim.x + threadIdx.x;
+  const int padded_row = blockIdx.x * gridDim.y + blockIdx.y;
+  const bool is_real = blockIdx.y < row_count;
+  // The ternary leaves input0 unread for padded slots.
+  output[padded_row * row_width + column] =
+      is_real ? input0[(first_row + blockIdx.y) * row_width + column] : 0;
+}
+
+nvinfer1::DataType RecoverPaddingPlugin::getOutputDataType(
+    int index, const nvinfer1::DataType* input_types,
+    int nb_inputs) const TRT_NOEXCEPT {
+  return *input_types;  // output type mirrors input 0; index/nb_inputs unused
+}
+
+nvinfer1::DimsExprs RecoverPaddingPlugin::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
+  // Result is {inputs[1].d[0] - 1, inputs[2].d[1], inputs[0].d[1]}.
+  nvinfer1::DimsExprs shape{};
+  shape.nbDims = 3;
+  shape.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUB,
+                                     *inputs[1].d[0], *exprBuilder.constant(1));
+  shape.d[1] = inputs[2].d[1];
+  shape.d[2] = inputs[0].d[1];
+  return shape;
+}
+
+bool RecoverPaddingPlugin::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(nbInputs, 3,
+                    platform::errors::InvalidArgument("Must have 3 inputs, "
+                                                      "but got %d input(s). ",
+                                                      nbInputs));
+  PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(),
+                    platform::errors::InvalidArgument("Must have 1 output, "
+                                                      "but got %d output(s). ",
+                                                      nbOutputs));
+  // Every slot must use the linear layout. Slot 1 must be INT32; every other
+  // slot (inputs 0 and 2 and the output) must be FP32.
+  // NOTE(review): a disabled draft here also admitted FP16 (linear) and INT8
+  // (kCHW32) tensors; neither is accepted by the check below.
+  const nvinfer1::PluginTensorDesc& slot = inOut[pos];
+  if (slot.format != nvinfer1::TensorFormat::kLINEAR) {
+    return false;
+  }
+  const nvinfer1::DataType wanted = (pos == 1)
+                                        ? nvinfer1::DataType::kINT32
+                                        : nvinfer1::DataType::kFLOAT;
+  return slot.type == wanted;
+}
+
+void RecoverPaddingPlugin::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* outputs,
+    int nbOutputs) TRT_NOEXCEPT {}  // no-op: all arguments are ignored
+
+void RecoverPaddingPlugin::attachToContext(
+    cudnnContext* cudnnContext, cublasContext* cublasContext,
+    nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}  // no-op
+
+void RecoverPaddingPlugin::detachFromContext() TRT_NOEXCEPT {}  // no-op
+
+void RecoverPaddingPlugin::terminate() TRT_NOEXCEPT {}  // no-op
+
+int RecoverPaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                                  const nvinfer1::PluginTensorDesc* outputDesc,
+                                  const void* const* inputs,
+                                  void* const* outputs, void* workspace,
+                                  cudaStream_t stream) TRT_NOEXCEPT {
+  // Returns 0 if the kernel launch succeeded, 1 otherwise.
+  const float* input0 = static_cast<const float*>(inputs[0]);
+  const int32_t* input1 = static_cast<const int32_t*>(inputs[1]);  // pos ids
+  float* output = static_cast<float*>(outputs[0]);
+  const int32_t row_width = inputDesc[0].dims.d[1];
+  // The kernel treats gridDim.z * blockDim.x as the row width, so the block
+  // size has to divide it exactly. Halve from 256 until it does; a plain
+  // row_width / 256 would silently skip the trailing row_width % 256 columns.
+  int32_t num_threads = 256;
+  while (num_threads > 1 && row_width % num_threads != 0) num_threads /= 2;
+  // Grid x: sequence count (pos-id length - 1); y: inputDesc[2].dims.d[1].
+  const dim3 num_blocks(inputDesc[1].dims.d[0] - 1, inputDesc[2].dims.d[1],
+                        row_width / num_threads);
+  RecoverPaddingKernel<<<num_blocks, num_threads, 0, stream>>>(input0, input1,
+                                                               output);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h
new file mode 100644
index 0000000000000..71b576610e25c
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h
@@ -0,0 +1,133 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cassert>
+#include <string>
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class RecoverPaddingPlugin : public DynamicPluginTensorRT {  // no data members
+ public:
+  RecoverPaddingPlugin() {}
+  // Deserialization constructor; both arguments are ignored (empty body).
+  RecoverPaddingPlugin(void const* serial_data, size_t serial_length) {}
+  // Returns a newly allocated, default-constructed plugin.
+  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
+    RecoverPaddingPlugin* ptr = new RecoverPaddingPlugin();
+    return ptr;
+  }
+  // Same string as RecoverPaddingPluginCreator::getPluginName().
+  const char* getPluginType() const TRT_NOEXCEPT override {
+    return "recover_padding_plugin";
+  }
+
+  int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
+  // NOTE(review): the next two hooks are declared without `override`; confirm.
+  int initialize() TRT_NOEXCEPT { return 0; }
+  void terminate() TRT_NOEXCEPT;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs,
+                                 int nbOutputs) TRT_NOEXCEPT override;
+
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* outputs,
+                       int nbOutputs) TRT_NOEXCEPT override;
+  // Requests no workspace: always returns 0.
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const TRT_NOEXCEPT override {
+    return 0;
+  }
+
+  void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
+                       nvinfer1::IGpuAllocator* gpuAllocator)
+      TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) TRT_NOEXCEPT override;
+  nvinfer1::DataType getOutputDataType(
+      int index, const nvinfer1::DataType* inputTypes,
+      int nbInputs) const TRT_NOEXCEPT override;
+  // Deletes this object; it must not be used after the call.
+  void destroy() TRT_NOEXCEPT override { delete this; }
+
+ protected:
+  size_t getSerializationSize() const TRT_NOEXCEPT override { return 0; }
+  // Writes nothing, consistent with getSerializationSize() returning 0.
+  void serialize(void* buffer) const TRT_NOEXCEPT override {}
+};
+
+class RecoverPaddingPluginCreator : public nvinfer1::IPluginCreator {
+ public:
+  RecoverPaddingPluginCreator() {}
+  const char* getPluginName() const TRT_NOEXCEPT override {
+    return "recover_padding_plugin";
+  }
+  const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
+  // The collection is initialized to {0, nullptr}, i.e. it lists no fields.
+  const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
+    return &field_collection_;
+  }
+  // Always returns nullptr; both arguments are ignored.
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* plugin_field)
+      TRT_NOEXCEPT override {
+    return nullptr;
+  }
+  // Allocates a RecoverPaddingPlugin from the serialized data and returns it.
+  nvinfer1::IPluginV2* deserializePlugin(
+      const char* name, void const* serial_data,
+      size_t serial_length) TRT_NOEXCEPT override {
+    RecoverPaddingPlugin* obj =
+        new RecoverPaddingPlugin(serial_data, serial_length);
+    obj->setPluginNamespace(name);  // NOTE(review): name used as namespace?
+    return obj;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const TRT_NOEXCEPT override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;  // never read or written inside this class
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+};
+REGISTER_TRT_PLUGIN_V2(RecoverPaddingPluginCreator);
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
new file mode 100644
index 0000000000000..84e36a4d5f638
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
@@ -0,0 +1,118 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+__global__ void RemovePaddingKernel(const float* input0, const int32_t* input1,
+                                    float* output) {
+  // Copies padded row (x, y) to packed row input1[x] + y; padding is skipped.
+  const int32_t first_row = input1[blockIdx.x];
+  if (!(blockIdx.y < input1[blockIdx.x + 1] - first_row)) return;
+  const unsigned int row_width = gridDim.z * blockDim.x;
+  const unsigned int column = blockIdx.z * blockDim.x + threadIdx.x;
+  const int padded_row = blockIdx.x * gridDim.y + blockIdx.y;
+  output[(first_row + blockIdx.y) * row_width + column] =
+      input0[padded_row * row_width + column];
+}
+
+nvinfer1::DataType RemovePaddingPlugin::getOutputDataType(
+    int index, const nvinfer1::DataType* input_types,
+    int nb_inputs) const TRT_NOEXCEPT {
+  return *input_types;  // output type mirrors input 0; index/nb_inputs unused
+}
+
+nvinfer1::DimsExprs RemovePaddingPlugin::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
+  // Result is {inputs[2].d[0], inputs[0].d[2], 1, 1}.
+  nvinfer1::DimsExprs shape{};
+  shape.nbDims = 4;
+  shape.d[0] = inputs[2].d[0];
+  shape.d[1] = inputs[0].d[2];
+  for (int axis = 2; axis < 4; ++axis) shape.d[axis] = exprBuilder.constant(1);
+
+  return shape;
+}
+
+bool RemovePaddingPlugin::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(nbInputs, 3,
+                    platform::errors::InvalidArgument("Must have 3 inputs, "
+                                                      "but got %d input(s). ",
+                                                      nbInputs));
+  PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(),
+                    platform::errors::InvalidArgument("Must have 1 output, "
+                                                      "but got %d output(s). ",
+                                                      nbOutputs));
+  // Every slot must use the linear layout. Slots 1 and 2 must be INT32; the
+  // remaining slots (input 0 and the output) must be FP32.
+  // NOTE(review): a disabled draft here also admitted FP16 (linear) and INT8
+  // (kCHW32) tensors; neither is accepted by the check below.
+  const nvinfer1::PluginTensorDesc& slot = inOut[pos];
+  if (slot.format != nvinfer1::TensorFormat::kLINEAR) {
+    return false;
+  }
+  const bool int_slot = (pos == 1 || pos == 2);
+  const nvinfer1::DataType wanted =
+      int_slot ? nvinfer1::DataType::kINT32 : nvinfer1::DataType::kFLOAT;
+  return slot.type == wanted;
+}
+
+void RemovePaddingPlugin::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* outputs,
+    int nbOutputs) TRT_NOEXCEPT {}  // no-op: all arguments are ignored
+
+void RemovePaddingPlugin::attachToContext(
+    cudnnContext* cudnnContext, cublasContext* cublasContext,
+    nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}  // no-op
+
+void RemovePaddingPlugin::detachFromContext() TRT_NOEXCEPT {}  // no-op
+
+void RemovePaddingPlugin::terminate() TRT_NOEXCEPT {}  // no-op
+
+int RemovePaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                                 const nvinfer1::PluginTensorDesc* outputDesc,
+                                 const void* const* inputs,
+                                 void* const* outputs, void* workspace,
+                                 cudaStream_t stream) TRT_NOEXCEPT {
+  // Returns 0 if the kernel launch succeeded, 1 otherwise.
+  const float* input0 = static_cast<const float*>(inputs[0]);
+  const int32_t* input1 = static_cast<const int32_t*>(inputs[1]);  // pos ids
+  float* output = static_cast<float*>(outputs[0]);
+
+  // Grid x/y walk dims d[0] and d[1] of input 0; z tiles each d[2]-wide row.
+  const nvinfer1::Dims& dims = inputDesc[0].dims;
+  const int32_t row_width = dims.d[2];
+  // The kernel treats gridDim.z * blockDim.x as the row width, so the block
+  // size has to divide it exactly. Halve from 256 until it does; a plain
+  // row_width / 256 would silently skip the trailing row_width % 256 columns.
+  int32_t num_threads = 256;
+  while (num_threads > 1 && row_width % num_threads != 0) num_threads /= 2;
+  const dim3 num_blocks(dims.d[0], dims.d[1], row_width / num_threads);
+  RemovePaddingKernel<<<num_blocks, num_threads, 0, stream>>>(input0, input1,
+                                                              output);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h
new file mode 100644
index 0000000000000..89fda3dd775c1
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h
@@ -0,0 +1,133 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cassert>
+#include <string>
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class RemovePaddingPlugin : public DynamicPluginTensorRT {
+  // TensorRT dynamic-shape plugin of type "remove_padding_plugin". It has one
+  // output, declares no data members of its own and serializes to zero bytes.
+ public:
+  RemovePaddingPlugin() {}
+  // Nothing is serialized (getSerializationSize() == 0), so nothing is read.
+  RemovePaddingPlugin(void const* serial_data, size_t serial_length) {}
+
+  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
+    return new RemovePaddingPlugin();
+  }
+
+  const char* getPluginType() const TRT_NOEXCEPT override {
+    return "remove_padding_plugin";
+  }
+
+  int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
+
+  int initialize() TRT_NOEXCEPT override { return 0; }
+  void terminate() TRT_NOEXCEPT override;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs,
+                                 int nbOutputs) TRT_NOEXCEPT override;
+
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* outputs,
+                       int nbOutputs) TRT_NOEXCEPT override;
+
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const TRT_NOEXCEPT override {
+    return 0;
+  }
+
+  void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
+                       nvinfer1::IGpuAllocator* gpuAllocator)
+      TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) TRT_NOEXCEPT override;
+  nvinfer1::DataType getOutputDataType(
+      int index, const nvinfer1::DataType* inputTypes,
+      int nbInputs) const TRT_NOEXCEPT override;
+
+  void destroy() TRT_NOEXCEPT override { delete this; }
+
+ protected:
+  size_t getSerializationSize() const TRT_NOEXCEPT override { return 0; }
+  void serialize(void* buffer) const TRT_NOEXCEPT override {}
+};
+
+class RemovePaddingPluginCreator : public nvinfer1::IPluginCreator {
+  // Creator for "remove_padding_plugin" v1; createPlugin() returns nullptr.
+ public:
+  RemovePaddingPluginCreator() {}
+  const char* getPluginName() const TRT_NOEXCEPT override {
+    return "remove_padding_plugin";
+  }
+  const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* plugin_field)
+      TRT_NOEXCEPT override {
+    return nullptr;
+  }
+
+  // NOTE(review): `name` is installed as the plugin *namespace* -- confirm.
+  nvinfer1::IPluginV2* deserializePlugin(
+      const char* name, void const* serial_data,
+      size_t serial_length) TRT_NOEXCEPT override {
+    auto* plugin = new RemovePaddingPlugin(serial_data, serial_length);
+    plugin->setPluginNamespace(name);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
+    plugin_namespace_ = lib_namespace;
+  }
+  const char* getPluginNamespace() const TRT_NOEXCEPT override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;  // never read or written inside this class
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+};
+REGISTER_TRT_PLUGIN_V2(RemovePaddingPluginCreator);
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
index 7dc31fb44719a..7eded9e823e2e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
@@ -14,6 +14,7 @@
 
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+
 #include <algorithm>
 
 #include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h"
@@ -281,13 +282,12 @@ int RoiAlignPluginDynamic::enqueue_impl(
         width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch,
         aligned_, static_cast<OutT*>(outputs[0]));
   } else {
-    GPUROIAlignOpt<
-        T, OutT,
-        false><<<blocks, threads, width * height * sizeof(T), stream>>>(
-        output_size, static_cast<const T*>(inputs[0]),
-        static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
-        width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch,
-        aligned_, static_cast<OutT*>(outputs[0]));
+    GPUROIAlignOpt<T, OutT, false>
+        <<<blocks, threads, width * height * sizeof(T), stream>>>(
+            output_size, static_cast<const T*>(inputs[0]),
+            static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
+            width, pooled_height_, pooled_width_, sampling_ratio_,
+            rois_num / batch, aligned_, static_cast<OutT*>(outputs[0]));
   }
 
   return cudaGetLastError() != cudaSuccess;
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
index fb14749f3d1db..e1527f85088ad 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -14,9 +14,11 @@
 
 #include <cuda_runtime.h>
 #include <stdio.h>
+
 #include <cassert>
 #include <cub/cub.cuh>  // NOLINT
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
@@ -105,8 +107,9 @@ nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType(
                         index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
                      input_types[0] == nvinfer1::DataType::kHALF),
-                    true, platform::errors::InvalidArgument(
-                              "The input type should be half or float"));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The input type should be half or float"));
   return input_types[0];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
index 0a6d24f90722e..ad426204d5aa1 100644
--- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
@@ -14,9 +14,11 @@
 
 #include <cuda_runtime.h>
 #include <stdio.h>
+
 #include <cassert>
 #include <cub/cub.cuh>  // NOLINT
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h"
 
@@ -301,14 +303,16 @@ bool SlicePluginDynamic::supportsFormatCombination(
 nvinfer1::DataType SlicePluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Slice Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Slice Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
                      input_types[0] == nvinfer1::DataType::kHALF),
-                    true, platform::errors::InvalidArgument(
-                              "The input type should be half or float"));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The input type should be half or float"));
   return input_types[0];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
deleted file mode 100644
index 324e9c0392c93..0000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
+++ /dev/null
@@ -1,197 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cassert>
-#include <cstring>
-#include <vector>
-#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-#if IS_TRT_VERSION_GE(6000)
-SpecialSlicePluginDynamic::SpecialSlicePluginDynamic() {}
-
-SpecialSlicePluginDynamic::SpecialSlicePluginDynamic(void const* serial_data,
-                                                     size_t serial_length) {}
-
-SpecialSlicePluginDynamic::~SpecialSlicePluginDynamic() {}
-
-nvinfer1::IPluginV2DynamicExt* SpecialSlicePluginDynamic::clone() const
-    TRT_NOEXCEPT {
-  return new SpecialSlicePluginDynamic();
-}
-
-const char* SpecialSlicePluginDynamic::getPluginType() const TRT_NOEXCEPT {
-  return "special_slice_plugin";
-}
-
-int SpecialSlicePluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; }
-
-int SpecialSlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; }
-
-size_t SpecialSlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT {
-  size_t serialize_size = 0;
-  return serialize_size;
-}
-
-void SpecialSlicePluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT {}
-
-nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions(
-    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
-    nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT {
-  nvinfer1::DimsExprs output(inputs[0]);
-  output.nbDims++;
-  for (int i = output.nbDims - 1; i > 1; i--) {
-    output.d[i] = inputs[0].d[i - 1];
-  }
-  auto one = expr_builder.constant(1);
-  output.d[1] = one;
-  output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB,
-                                       *inputs[1].d[0], *one);
-  // remove padding 1
-  output.nbDims -= 2;
-
-  return output;
-}
-
-void SpecialSlicePluginDynamic::configurePlugin(
-    const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
-    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT {}
-
-size_t SpecialSlicePluginDynamic::getWorkspaceSize(
-    const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
-    const nvinfer1::PluginTensorDesc* outputs,
-    int nbOutputs) const TRT_NOEXCEPT {
-  return 0;
-}
-
-void SpecialSlicePluginDynamic::destroy() TRT_NOEXCEPT { delete this; }
-
-void SpecialSlicePluginDynamic::terminate() TRT_NOEXCEPT {}
-
-bool SpecialSlicePluginDynamic::supportsFormatCombination(
-    int pos, const nvinfer1::PluginTensorDesc* desc, int nb_inputs,
-    int nb_outputs) TRT_NOEXCEPT {
-  if (pos == 0)  // slice tensor
-    return (desc[pos].type == nvinfer1::DataType::kHALF &&
-            desc[pos].format ==
-                nvinfer1::TensorFormat::kLINEAR);  // || desc[pos].type ==
-  // nvinfer1::DataType::kFLOAT);
-
-  if (pos == 1)  // cu_seqlen
-    return (desc[pos].type == nvinfer1::DataType::kINT32 &&
-            desc[pos].format == nvinfer1::TensorFormat::kLINEAR);
-
-  return (desc[pos].type == nvinfer1::DataType::kHALF &&
-          desc[pos].format ==
-              nvinfer1::TensorFormat::kLINEAR);  // || desc[pos].type ==
-  // nvinfer1::DataType::kFLOAT);
-}
-
-nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType(
-    int index, const nvinfer1::DataType* input_types,
-    int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The index should be equal to 0"));
-  return input_types[0];
-}
-
-template <typename T>
-__global__ void SpecialSliceKernel(const T* slice_input,
-                                   const int32_t* cu_seqlens, T* output) {
-  const int hidden = blockDim.x * gridDim.x;
-  const int hidden_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const int batch_id = blockIdx.y;
-
-  output[batch_id * hidden + hidden_id] =
-      slice_input[cu_seqlens[batch_id] * hidden + hidden_id];
-}
-
-int SpecialSlicePluginDynamic::enqueue(
-    const nvinfer1::PluginTensorDesc* input_desc,
-    const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
-    void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
-  auto input_dims = input_desc[0].dims;  // (sum(S), hidden, 1, 1)
-  auto out_dims = output_desc[0].dims;   // (batch, hidden, 1, 1)
-
-  PADDLE_ENFORCE_EQ(
-      input_desc[0].type, nvinfer1::DataType::kHALF,
-      platform::errors::InvalidArgument("Type of input should be half."));
-
-  const int32_t hidden = input_dims.d[1];
-  PADDLE_ENFORCE_EQ(hidden % 128, 0, platform::errors::InvalidArgument(
-                                         "hidden should be multiple of 128."));
-
-  constexpr int num_threads = 128;
-  const half* slice_input = static_cast<const half*>(inputs[0]);
-  const int32_t* cu_seqlens = static_cast<const int32_t*>(inputs[1]);
-  half* output = static_cast<half*>(outputs[0]);
-
-  const int32_t num_blocks_x = hidden / num_threads;
-  const int32_t num_blocks_y = out_dims.d[0];         // batchs
-  const dim3 num_blocks(num_blocks_x, num_blocks_y);  // blocks
-
-  SpecialSliceKernel<<<num_blocks, num_threads, 0, stream>>>(
-      slice_input, cu_seqlens, output);
-  return cudaGetLastError() != cudaSuccess;
-}
-
-SpecialSlicePluginDynamicCreator::SpecialSlicePluginDynamicCreator() {}
-
-const char* SpecialSlicePluginDynamicCreator::getPluginName() const
-    TRT_NOEXCEPT {
-  return "special_slice_plugin";
-}
-
-const char* SpecialSlicePluginDynamicCreator::getPluginVersion() const
-    TRT_NOEXCEPT {
-  return "1";
-}
-
-const nvinfer1::PluginFieldCollection*
-SpecialSlicePluginDynamicCreator::getFieldNames() TRT_NOEXCEPT {
-  return &field_collection_;
-}
-
-nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::createPlugin(
-    const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT {
-  return new SpecialSlicePluginDynamic();
-}
-
-nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::deserializePlugin(
-    const char* name, const void* serial_data,
-    size_t serial_length) TRT_NOEXCEPT {
-  auto plugin = new SpecialSlicePluginDynamic(serial_data, serial_length);
-  return plugin;
-}
-
-void SpecialSlicePluginDynamicCreator::setPluginNamespace(
-    const char* lib_namespace) TRT_NOEXCEPT {
-  plugin_namespace_ = lib_namespace;
-}
-
-const char* SpecialSlicePluginDynamicCreator::getPluginNamespace() const
-    TRT_NOEXCEPT {
-  return plugin_namespace_.c_str();
-}
-
-#endif
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h
deleted file mode 100644
index c3521e4ed6371..0000000000000
--- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <stdio.h>
-#include <cassert>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
-
-#if IS_TRT_VERSION_GE(6000)
-class SpecialSlicePluginDynamic : public DynamicPluginTensorRT {
- public:
-  SpecialSlicePluginDynamic();
-  SpecialSlicePluginDynamic(void const* serial_data, size_t serial_length);
-  ~SpecialSlicePluginDynamic();
-  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
-  nvinfer1::DimsExprs getOutputDimensions(
-      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
-      nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;
-  bool supportsFormatCombination(int pos,
-                                 const nvinfer1::PluginTensorDesc* inOut,
-                                 int nbInputs,
-                                 int nbOutputs) TRT_NOEXCEPT override;
-  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
-                       int nbInputs,
-                       const nvinfer1::DynamicPluginTensorDesc* out,
-                       int nbOutputs) TRT_NOEXCEPT override;
-  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
-                          int nbInputs,
-                          const nvinfer1::PluginTensorDesc* outputs,
-                          int nbOutputs) const TRT_NOEXCEPT override;
-  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
-              const nvinfer1::PluginTensorDesc* outputDesc,
-              const void* const* inputs, void* const* outputs, void* workspace,
-              cudaStream_t stream) TRT_NOEXCEPT override;
-
-  nvinfer1::DataType getOutputDataType(
-      int index, const nvinfer1::DataType* inputTypes,
-      int nbInputs) const TRT_NOEXCEPT override;
-
-  const char* getPluginType() const TRT_NOEXCEPT override;
-  int getNbOutputs() const TRT_NOEXCEPT override;
-  int initialize() TRT_NOEXCEPT override;
-  void terminate() TRT_NOEXCEPT override;
-  size_t getSerializationSize() const TRT_NOEXCEPT override;
-  void serialize(void* buffer) const TRT_NOEXCEPT override;
-  void destroy() TRT_NOEXCEPT override;
-
- private:
-  int axis_;
-  int num_stack_;
-};
-
-class SpecialSlicePluginDynamicCreator : public nvinfer1::IPluginCreator {
- public:
-  SpecialSlicePluginDynamicCreator();
-  const char* getPluginName() const TRT_NOEXCEPT override;
-  const char* getPluginVersion() const TRT_NOEXCEPT override;
-  const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2* createPlugin(const char* name,
-                                    const nvinfer1::PluginFieldCollection* fc)
-      TRT_NOEXCEPT override;
-  nvinfer1::IPluginV2* deserializePlugin(
-      const char* name, const void* serial_data,
-      size_t serial_length) TRT_NOEXCEPT override;
-  void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override;
-  const char* getPluginNamespace() const TRT_NOEXCEPT override;
-
- private:
-  std::string plugin_namespace_;
-  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
-  std::vector<nvinfer1::PluginField> plugin_attributes_;
-};
-REGISTER_TRT_PLUGIN_V2(SpecialSlicePluginDynamicCreator);
-#endif
-
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index ec4fcca6d74d0..1cfc9fade7b15 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <cuda_fp16.h>
+
 #include <algorithm>
+
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 7a41fe1d1eef2..49f028493ee87 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -15,9 +15,11 @@
 #pragma once
 
 #include <thrust/device_vector.h>
+
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu
index 74a6c3cdf3e4e..1c6dae78b387d 100644
--- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu
@@ -15,6 +15,7 @@
 #include <cassert>
 #include <cstring>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h"
 
 namespace paddle {
@@ -128,8 +129,9 @@ bool StackPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType StackPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType* input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The index should be equal to 0"));
+  PADDLE_ENFORCE_EQ(
+      index, 0,
+      platform::errors::InvalidArgument("The index should be equal to 0"));
   return input_types[0];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h
index 965c53e269877..12beafdadb316 100644
--- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h
@@ -14,9 +14,11 @@
 
 #pragma once
 #include <stdio.h>
+
 #include <cassert>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu
index 2c2fad74b9a2d..1992dd57d68fe 100644
--- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include <stdio.h>
+
 #include <cassert>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"
 
@@ -181,10 +183,11 @@ bool SwishPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType SwishPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Swish Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Swish Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   return input_types[0];
 }
 
@@ -203,8 +206,8 @@ int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
     VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32";
     const float *input = static_cast<const float *>(inputs[0]);
     float *output = static_cast<float *>(outputs[0]);
-    swish_kernel<float><<<blocks, threads, 0, stream>>>(num, input, output,
-                                                        beta_);
+    swish_kernel<float>
+        <<<blocks, threads, 0, stream>>>(num, input, output, beta_);
   } else if (input_type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Swish-->fp16";
     const half *input = static_cast<const half *>(inputs[0]);
diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
index 46f585e655746..9cb680da5a95d 100644
--- a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.cu
new file mode 100644
index 0000000000000..a7fff02781609
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.cu
@@ -0,0 +1,110 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+__global__ void TransformerInputConvertKernel(const int64_t* input,
+                                              int32_t* output0) {
+  // Block b publishes output0[b + 1]: hits (input[r][t] == t) in rows 0..b.
+  __shared__ int32_t hits;
+  if (threadIdx.x == 0) hits = 0;
+  __syncthreads();  // the counter must be zeroed before any atomicAdd
+  for (unsigned int row = 0; row <= blockIdx.x; ++row) {
+    const unsigned int idx = row * blockDim.x + threadIdx.x;
+    if (threadIdx.x == static_cast<int>(input[idx])) atomicAdd(&hits, 1);
+  }
+  __syncthreads();  // all contributions land before the total is published
+  if (threadIdx.x == 0) output0[blockIdx.x + 1] = hits;
+  if (blockIdx.x == 0 && threadIdx.x == 0) output0[0] = 0;
+}
+
+nvinfer1::DataType TransformerInputConvertPlugin::getOutputDataType(
+    int index, const nvinfer1::DataType* input_types,
+    int nb_inputs) const TRT_NOEXCEPT {
+  return nvinfer1::DataType::kINT32;  // same type for every output index
+}
+
+nvinfer1::DimsExprs TransformerInputConvertPlugin::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
+  nvinfer1::DimsExprs result{};  // every output is 1-D
+  result.nbDims = 1;
+  if (outputIndex != 0) {  // MaxSeqlen: inputs[0].d[1] entries
+    result.d[0] = inputs[0].d[1];
+    return result;
+  }
+  const auto& rows = *inputs[0].d[0];  // PosId: inputs[0].d[0] + 1 entries
+  result.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUM,
+                                      rows, *exprBuilder.constant(1));
+  return result;
+}
+
+bool TransformerInputConvertPlugin::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(nbInputs, 1,
+                    platform::errors::InvalidArgument("Must have 1 inputs, "
+                                                      "but got %d input(s). ",
+                                                      nbInputs));
+  PADDLE_ENFORCE_EQ(nbOutputs, getNbOutputs(),
+                    platform::errors::InvalidArgument("Must have 2 output, "
+                                                      "but got %d output(s). ",
+                                                      nbOutputs));
+  // Every slot must be linear; slot 0 (the input) may hold any data type,
+  // while every other slot (the outputs) must additionally be int32.
+  const nvinfer1::PluginTensorDesc& desc = inOut[pos];
+  const bool linear = desc.format == nvinfer1::TensorFormat::kLINEAR;
+  if (pos == 0) return linear;
+  return desc.type == nvinfer1::DataType::kINT32 && linear;
+}
+
+void TransformerInputConvertPlugin::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* outputs,
+    int nbOutputs) TRT_NOEXCEPT {}  // no-op: all arguments are ignored
+
+void TransformerInputConvertPlugin::attachToContext(
+    cudnnContext* cudnnContext, cublasContext* cublasContext,
+    nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}  // no-op
+
+void TransformerInputConvertPlugin::detachFromContext() TRT_NOEXCEPT {}  // noop
+
+void TransformerInputConvertPlugin::terminate() TRT_NOEXCEPT {}  // no-op
+
+int TransformerInputConvertPlugin::enqueue(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
+  // Launch geometry comes from input 0: d[0] blocks of d[1] threads each.
+  const auto& shape = inputDesc[0].dims;
+  const int32_t grid = shape.d[0];   // batch size
+  const int32_t block = shape.d[1];  // max sequence length
+
+  // Only outputs[0] (PosId) is passed on; outputs[1] (MaxSeqlen) is not used.
+  const auto* src = static_cast<const int64_t*>(inputs[0]);
+  auto* pos_id = static_cast<int32_t*>(outputs[0]);
+
+  TransformerInputConvertKernel<<<grid, block, 0, stream>>>(src, pos_id);
+  return cudaGetLastError() != cudaSuccess;  // non-zero on a launch error
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h
new file mode 100644
index 0000000000000..92aa0c48a49ce
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cassert>
+#include <string>
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+// TensorRT dynamic-shape plugin for the transformer input conversion kernel.
+// It reports two outputs and declares no data members of its own: the
+// serialization size is 0 and serialize() writes nothing.
+class TransformerInputConvertPlugin : public DynamicPluginTensorRT {
+ public:
+  TransformerInputConvertPlugin() {}
+
+  // Deserializing constructor. The buffer is ignored, which matches
+  // serialize() writing nothing.
+  TransformerInputConvertPlugin(void const* serial_data, size_t serial_length) {
+  }
+
+  // Returns a new default-constructed plugin; this class declares no data
+  // members that would need copying. The caller owns the returned object.
+  nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
+    TransformerInputConvertPlugin* ptr = new TransformerInputConvertPlugin();
+    return ptr;
+  }
+
+  // Must match TransformerInputConvertPluginCreator::getPluginName().
+  const char* getPluginType() const TRT_NOEXCEPT override {
+    return "transformer_input_convert_plugin";
+  }
+
+  int getNbOutputs() const TRT_NOEXCEPT override { return 2; }
+
+  // `override` added so these two are checked against the base class like
+  // every other virtual in this class.
+  int initialize() TRT_NOEXCEPT override { return 0; }
+  void terminate() TRT_NOEXCEPT override;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs,
+                                 int nbOutputs) TRT_NOEXCEPT override;
+
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* outputs,
+                       int nbOutputs) TRT_NOEXCEPT override;
+
+  // No scratch memory is requested.
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const TRT_NOEXCEPT override {
+    return 0;
+  }
+
+  void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
+                       nvinfer1::IGpuAllocator* gpuAllocator)
+      TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) TRT_NOEXCEPT override;
+  nvinfer1::DataType getOutputDataType(
+      int index, const nvinfer1::DataType* inputTypes,
+      int nbInputs) const TRT_NOEXCEPT override;
+
+  // The object deletes itself; it must have been allocated with `new`.
+  void destroy() TRT_NOEXCEPT override { delete this; }
+
+ protected:
+  // Nothing is serialized: the size is 0 and serialize() is empty.
+  size_t getSerializationSize() const TRT_NOEXCEPT override { return 0; }
+
+  void serialize(void* buffer) const TRT_NOEXCEPT override {}
+};
+
+// Plugin creator for TransformerInputConvertPlugin. It reports the name
+// "transformer_input_convert_plugin", version "1" and an empty field
+// collection; plugins are only produced through deserializePlugin().
+class TransformerInputConvertPluginCreator : public nvinfer1::IPluginCreator {
+ public:
+  TransformerInputConvertPluginCreator() {}
+  const char* getPluginName() const TRT_NOEXCEPT override {
+    return "transformer_input_convert_plugin";
+  }
+  const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
+
+  // The collection is initialised below with zero fields.
+  const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override {
+    return &field_collection_;
+  }
+
+  // Always returns nullptr; both arguments are ignored.
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* plugin_field)
+      TRT_NOEXCEPT override {
+    return nullptr;
+  }
+
+  // Builds a plugin from the serialized buffer and returns it to the caller.
+  // NOTE(review): `name` is what gets passed to setPluginNamespace(), not
+  // plugin_namespace_ -- confirm this is intended.
+  nvinfer1::IPluginV2* deserializePlugin(
+      const char* name, void const* serial_data,
+      size_t serial_length) TRT_NOEXCEPT override {
+    TransformerInputConvertPlugin* obj =
+        new TransformerInputConvertPlugin(serial_data, serial_length);
+    obj->setPluginNamespace(name);
+    return obj;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  // The returned pointer refers to plugin_namespace_'s internal buffer.
+  const char* getPluginNamespace() const TRT_NOEXCEPT override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  // Not referenced by any method of this class.
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+};
+REGISTER_TRT_PLUGIN_V2(TransformerInputConvertPluginCreator);
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
index 9210cd48d078b..a1316384cd491 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <NvInfer.h>
+
 #include <cstring>
 #include <string>
 #include <unordered_map>
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
index 16751c764bd03..cf9c66f0eb3fc 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <type_traits>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h
index 2094dbfc9db4b..7116093ae36e6 100644
--- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
index 2f5b75c102004..70f36ec34b708 100644
--- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc
+++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <cuda_runtime_api.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include "NvInfer.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/dynload/tensorrt.h"
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
index c84cb45b7ecba..35c776b9e532c 100644
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
+++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
@@ -16,6 +16,7 @@
 
 #include <NvInfer.h>
 #include <cuda_runtime_api.h>
+
 #include <atomic>
 #include <memory>
 #include <mutex>  // NOLINT
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index fc85f83661889..307af84fa367e 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,409 +1,592 @@
-if (NOT APPLE AND NOT WIN32)
-    set(INFERENCE_EXTRA_DEPS paddle_inference_shared)
+# Choose INFERENCE_EXTRA_DEPS, which the helpers below pass as EXTRA_DEPS:
+# paddle_inference_shared when the platform is neither Apple nor Windows,
+# the individual targets otherwise.
+if(NOT APPLE AND NOT WIN32)
+  set(INFERENCE_EXTRA_DEPS paddle_inference_shared)
 else()
-    set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_inference_io ir_pass_manager analysis_predictor benchmark)
+  set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_inference_io
+                           ir_pass_manager analysis_predictor benchmark)
 endif()
 
+# With GPU and TensorRT found, append analysis and ${analysis_deps}.
 if(WITH_GPU AND TENSORRT_FOUND)
-    set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps})
+  set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps})
 endif()
 
+# Call inference_download_and_uncompress for ${data_file} (passing
+# ${check_sum}) unless ${install_dir}/<last path component of data_file>
+# already exists.
 function(download_data install_dir data_file check_sum)
-    string(REGEX MATCH "[^/\\]+$" file_name ${data_file})
-    if (NOT EXISTS ${install_dir}/${file_name})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file} ${check_sum})
-    endif()
+  string(REGEX MATCH "[^/\\]+$" file_name ${data_file})
+  if(NOT EXISTS ${install_dir}/${file_name})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}
+                                      ${data_file} ${check_sum})
+  endif()
 endfunction()
 
+# Same existence check as download_data, but calls
+# inference_download_and_uncompress_without_verify and takes no checksum.
 function(download_data_without_verify install_dir data_file)
-    string(REGEX MATCH "[^/\\]+$" file_name ${data_file})
-    if (NOT EXISTS ${install_dir}/${file_name})
-        inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL} ${data_file})
-    endif()
+  string(REGEX MATCH "[^/\\]+$" file_name ${data_file})
+  if(NOT EXISTS ${install_dir}/${file_name})
+    inference_download_and_uncompress_without_verify(
+      ${install_dir} ${INFERENCE_URL} ${data_file})
+  endif()
 endfunction()
 
+# Call inference_download_and_uncompress with ${INFERENCE_URL}/int8 for
+# ${data_file} (passing ${check_sum}) unless ${install_dir}/${data_file}
+# already exists.
 function(download_int8_data install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8
+                                      ${data_file} ${check_sum})
+  endif()
 endfunction()
 
+# Checksum-free variant of download_int8_data: uses the *_without_verify
+# download helper with ${INFERENCE_URL}/int8.
 function(download_int8_data_without_verify install_dir data_file)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8 ${data_file})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress_without_verify(
+      ${install_dir} ${INFERENCE_URL}/int8 ${data_file})
+  endif()
 endfunction()
 
+# Call inference_download_and_uncompress with ${INFERENCE_URL}/bfloat16 for
+# ${data_file} (passing ${check_sum}) unless ${install_dir}/${data_file}
+# already exists.
 function(download_bfloat16_data install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16
+                                      ${data_file} ${check_sum})
+  endif()
 endfunction()
 
+# Checksum-free variant of download_bfloat16_data: uses the *_without_verify
+# download helper with ${INFERENCE_URL}/bfloat16.
 function(download_bfloat16_data_without_verify install_dir data_file)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress_without_verify(
+      ${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file})
+  endif()
 endfunction()
 
+# Call inference_download_and_uncompress with ${INFERENCE_URL}/gru for
+# ${data_file} (passing ${check_sum}) unless ${install_dir}/${data_file}
+# already exists.
 function(download_GRU_data install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru
+                                      ${data_file} ${check_sum})
+  endif()
 endfunction()
 
+# Checksum-free variant of download_GRU_data: uses the *_without_verify
+# download helper with ${INFERENCE_URL}/gru.
 function(download_GRU_data_without_verify install_dir data_file)
-    if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/gru ${data_file})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress_without_verify(
+      ${install_dir} ${INFERENCE_URL}/gru ${data_file})
+  endif()
 endfunction()
 
+# Call inference_download_and_uncompress with ${INFERENCE_URL}/int8/QAT_models
+# for ${data_file} (passing ${check_sum}) unless ${install_dir}/${data_file}
+# already exists.
 function(download_quant_data install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(
+      ${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
+  endif()
 endfunction()
 
+# Checksum-free variant of download_quant_data: uses the *_without_verify
+# download helper with ${INFERENCE_URL}/int8/QAT_models.
 function(download_quant_data_without_verify install_dir data_file)
-    if (NOT EXISTS ${install_dir}/${data_file})
-	    inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress_without_verify(
+      ${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
+  endif()
 endfunction()
 
-function(download_model_and_data install_dir model_name model_check_sum data_name data_check_sum)
-    download_data(${install_dir} ${model_name} ${model_check_sum}) 
-    download_data(${install_dir} ${data_name} ${data_check_sum})
+# Call download_data twice: once for ${model_name} and once for ${data_name},
+# each with its own checksum.
+function(download_model_and_data install_dir model_name model_check_sum
+         data_name data_check_sum)
+  download_data(${install_dir} ${model_name} ${model_check_sum})
+  download_data(${install_dir} ${data_name} ${data_check_sum})
 endfunction()
 
-function(download_model_and_data_without_verify install_dir model_name data_name)
-    download_data_without_verify(${install_dir} ${model_name}) 
-    download_data_without_verify(${install_dir} ${data_name})
+# Call download_data_without_verify twice: once for ${model_name} and once
+# for ${data_name}.
+function(download_model_and_data_without_verify install_dir model_name
+         data_name)
+  download_data_without_verify(${install_dir} ${model_name})
+  download_data_without_verify(${install_dir} ${data_name})
 endfunction()
 
+# Thin wrapper: forwards all three arguments to download_data.
 function(download_result install_dir result_name check_sum)
-    download_data(${install_dir} ${result_name} ${check_sum})
+  download_data(${install_dir} ${result_name} ${check_sum})
 endfunction()
 
+# Thin wrapper: forwards both arguments to download_data_without_verify.
 function(download_result_without_verify install_dir result_name)
-    download_data_without_verify(${install_dir} ${result_name})
+  download_data_without_verify(${install_dir} ${result_name})
 endfunction()
 
+# Wrap inference_analysis_test for ${target}: ${filename} is the source,
+# INFERENCE_EXTRA_DEPS the extra dependencies, and the model, data and
+# reference-result paths are all taken from ${install_dir}.
 function(inference_analysis_api_test target install_dir filename)
-    inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt --refer_result=${install_dir}/result.txt)
+  inference_analysis_test(
+    ${target}
+    SRCS
+    ${filename}
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${install_dir}/model
+    --infer_data=${install_dir}/data.txt
+    --refer_result=${install_dir}/result.txt)
 endfunction()
 
+# Like inference_analysis_api_test, with the extra fixed arguments
+# --accuracy=0.8, --batch_size=5 and --enable_int8=true.
 function(inference_analysis_api_int8_test target install_dir filename)
-    inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${install_dir}/model
-             --infer_data=${install_dir}/data.txt
-             --refer_result=${install_dir}/result.txt
-             --accuracy=0.8
-             --batch_size=5
-             --enable_int8=true)
+  inference_analysis_test(
+    ${target}
+    SRCS
+    ${filename}
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${install_dir}/model
+    --infer_data=${install_dir}/data.txt
+    --refer_result=${install_dir}/result.txt
+    --accuracy=0.8
+    --batch_size=5
+    --enable_int8=true)
 endfunction()
 
-function(inference_multiple_models_analysis_api_test target install_dir filename)
-    inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${install_dir}/mobilenet_v2_models/1 --infer_model2=${install_dir}/mobilenet_v2_models/xx --infer_model3=${install_dir}/mobilenet_v2_models/3)
+# Wrap inference_analysis_test with three model paths under
+# ${install_dir}/mobilenet_v2_models.
+# NOTE(review): --infer_model2 points at .../xx while the other two use 1 and
+# 3 -- presumably deliberate; confirm against the tester source.
+function(inference_multiple_models_analysis_api_test target install_dir
+         filename)
+  inference_analysis_test(
+    ${target}
+    SRCS
+    ${filename}
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${install_dir}/mobilenet_v2_models/1
+    --infer_model2=${install_dir}/mobilenet_v2_models/xx
+    --infer_model3=${install_dir}/mobilenet_v2_models/3)
 endfunction()
 
+# Wrap inference_analysis_test_build with ${filename} as the source and
+# INFERENCE_EXTRA_DEPS as the extra dependencies.
 function(inference_analysis_api_test_build TARGET_NAME filename)
-	inference_analysis_test_build(${TARGET_NAME} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS})
+  inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS
+                                ${INFERENCE_EXTRA_DEPS})
 endfunction()
 
-function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir data_path)
-	inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --infer_data=${data_path}
-             --warmup_batch_size=${WARMUP_BATCH_SIZE}
-             --batch_size=50
-             --enable_int8=true
-             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
-	     --iterations=2)
+# Wrap inference_analysis_test_run for an int8 run of ${test_binary}.
+# WARMUP_BATCH_SIZE and CPU_NUM_THREADS_ON_CI are not parameters; they are
+# read from the calling scope.
+function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir
+         data_path)
+  inference_analysis_test_run(
+    ${TARGET_NAME}
+    COMMAND
+    ${test_binary}
+    ARGS
+    --infer_model=${model_dir}/model
+    --infer_data=${data_path}
+    --warmup_batch_size=${WARMUP_BATCH_SIZE}
+    --batch_size=50
+    --enable_int8=true
+    --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --iterations=2)
 endfunction()
 
-function(inference_analysis_api_int8_test_run_custom_warmup_batch_size TARGET_NAME test_binary model_dir data_path warmup_batch_size)
-    set(WARMUP_BATCH_SIZE ${warmup_batch_size})
-    inference_analysis_api_int8_test_run(${TARGET_NAME} ${test_binary} ${model_dir} ${data_path})
+# Set WARMUP_BATCH_SIZE in this function's scope, then call
+# inference_analysis_api_int8_test_run, which reads that variable.
+function(inference_analysis_api_int8_test_run_custom_warmup_batch_size
+         TARGET_NAME test_binary model_dir data_path warmup_batch_size)
+  set(WARMUP_BATCH_SIZE ${warmup_batch_size})
+  inference_analysis_api_int8_test_run(${TARGET_NAME} ${test_binary}
+                                       ${model_dir} ${data_path})
 endfunction()
 
-function(inference_analysis_api_bfloat16_test_run TARGET_NAME test_binary model_dir data_path)
-	inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --infer_data=${data_path}
-             --batch_size=50
-             --enable_bf16=true
-             --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
-	     --iterations=2)
+# Wrap inference_analysis_test_run for a bf16 run of ${test_binary}. The
+# thread count is passed as --paddle_num_threads here, unlike the
+# --cpu_num_threads used by the int8 helper.
+function(inference_analysis_api_bfloat16_test_run TARGET_NAME test_binary
+         model_dir data_path)
+  inference_analysis_test_run(
+    ${TARGET_NAME}
+    COMMAND
+    ${test_binary}
+    ARGS
+    --infer_model=${model_dir}/model
+    --infer_data=${data_path}
+    --batch_size=50
+    --enable_bf16=true
+    --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --iterations=2)
 endfunction()
 
-function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_binary model_dir data_path)
-	inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --infer_data=${data_path}
-             --warmup_batch_size=10
-             --batch_size=300
-             --enable_int8=true
-             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
-	     --iterations=1)
+# Wrap inference_analysis_test_run for an int8 run with fixed settings:
+# warmup batch size 10, batch size 300 and a single iteration.
+function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME
+         test_binary model_dir data_path)
+  inference_analysis_test_run(
+    ${TARGET_NAME}
+    COMMAND
+    ${test_binary}
+    ARGS
+    --infer_model=${model_dir}/model
+    --infer_data=${data_path}
+    --warmup_batch_size=10
+    --batch_size=300
+    --enable_int8=true
+    --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --iterations=1)
 endfunction()
 
+# Wrap inference_analysis_test_build; the body makes the same call as
+# inference_analysis_api_test_build.
 function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename)
-	inference_analysis_test_build(${TARGET_NAME} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS})
+  inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS
+                                ${INFERENCE_EXTRA_DEPS})
 endfunction()
 
-function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary model_dir disable_fc)
-    inference_analysis_test_run(${TARGET_NAME}
-	COMMAND ${test_binary}
-        ARGS --infer_model=${model_dir}/model
-             --disable_mkldnn_fc=${disable_fc}) 
+# Wrap inference_analysis_test_run with only a model path; ${disable_fc} is
+# forwarded as --disable_mkldnn_fc.
+function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary
+         model_dir disable_fc)
+  inference_analysis_test_run(
+    ${TARGET_NAME} COMMAND ${test_binary} ARGS --infer_model=${model_dir}/model
+    --disable_mkldnn_fc=${disable_fc})
 endfunction()
 
-function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path enable_quant_int8)
-    inference_analysis_test_run(${TARGET_NAME}
-    COMMAND ${test_binary}
-        ARGS --fp32_model=${fp32_model_dir}
-             --int8_model=${int8_model_dir}
-             --infer_data=${data_path}
-             --batch_size=50
-             --enable_int8=true
-             --enable_quant_int8=${enable_quant_int8}
-             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
-             --with_accuracy_layer=false
-             --iterations=2)
+# Wrap inference_analysis_test_run with separate fp32 and int8 model
+# directories; ${enable_quant_int8} is forwarded as --enable_quant_int8.
+function(
+  inference_analysis_api_quant_test_run
+  TARGET_NAME
+  test_binary
+  fp32_model_dir
+  int8_model_dir
+  data_path
+  enable_quant_int8)
+  inference_analysis_test_run(
+    ${TARGET_NAME}
+    COMMAND
+    ${test_binary}
+    ARGS
+    --fp32_model=${fp32_model_dir}
+    --int8_model=${int8_model_dir}
+    --infer_data=${data_path}
+    --batch_size=50
+    --enable_int8=true
+    --enable_quant_int8=${enable_quant_int8}
+    --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --with_accuracy_layer=false
+    --iterations=2)
 endfunction()
 
-function(inference_analysis_api_lexical_test_run TARGET_NAME test_binary infer_model data_path)
-    inference_analysis_test_run(${TARGET_NAME}
-    COMMAND ${test_binary}
-        ARGS --infer_model=${infer_model}
-             --infer_data=${data_path}
-             --batch_size=50
-             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
-             --with_accuracy_layer=true
-             --use_analysis=true
-             --iterations=2)
+# Wrap inference_analysis_test_run with the accuracy layer and analysis
+# enabled, batch size 50 and two iterations. ${infer_model} is passed as is,
+# with no "/model" suffix appended.
+function(inference_analysis_api_lexical_test_run TARGET_NAME test_binary
+         infer_model data_path)
+  inference_analysis_test_run(
+    ${TARGET_NAME}
+    COMMAND
+    ${test_binary}
+    ARGS
+    --infer_model=${infer_model}
+    --infer_data=${data_path}
+    --batch_size=50
+    --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --with_accuracy_layer=true
+    --use_analysis=true
+    --iterations=2)
 endfunction()
 
-function(inference_analysis_api_lexical_bfloat16_test_run TARGET_NAME test_binary infer_model data_path)
-    inference_analysis_test_run(${TARGET_NAME}
-    COMMAND ${test_binary}
-        ARGS --infer_model=${infer_model}
-             --infer_data=${data_path}
-             --batch_size=50
-             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
-             --with_accuracy_layer=true
-             --use_analysis=true
-             --enable_bf16=true
-             --iterations=2)
+# Same arguments as inference_analysis_api_lexical_test_run plus
+# --enable_bf16=true.
+function(inference_analysis_api_lexical_bfloat16_test_run TARGET_NAME
+         test_binary infer_model data_path)
+  inference_analysis_test_run(
+    ${TARGET_NAME}
+    COMMAND
+    ${test_binary}
+    ARGS
+    --infer_model=${infer_model}
+    --infer_data=${data_path}
+    --batch_size=50
+    --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --with_accuracy_layer=true
+    --use_analysis=true
+    --enable_bf16=true
+    --iterations=2)
 endfunction()
 
-function(inference_analysis_api_lexical_int8_test_run TARGET_NAME test_binary infer_model data_path fuse_multi_gru)
-    inference_analysis_test_run(${TARGET_NAME}
-    COMMAND ${test_binary}
-        ARGS --infer_model=${infer_model}
-             --infer_data=${data_path}
-             --batch_size=100
-             --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
-             --with_accuracy_layer=true
-             --use_analysis=true
-             --enable_int8=true
-             --quantized_accuracy=0.01
-             --fuse_multi_gru=${fuse_multi_gru}
-             --iterations=4)
+# Int8 variant of the lexical run: batch size 100, four iterations,
+# --quantized_accuracy=0.01, and ${fuse_multi_gru} forwarded as
+# --fuse_multi_gru.
+function(inference_analysis_api_lexical_int8_test_run TARGET_NAME test_binary
+         infer_model data_path fuse_multi_gru)
+  inference_analysis_test_run(
+    ${TARGET_NAME}
+    COMMAND
+    ${test_binary}
+    ARGS
+    --infer_model=${infer_model}
+    --infer_data=${data_path}
+    --batch_size=100
+    --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --with_accuracy_layer=true
+    --use_analysis=true
+    --enable_int8=true
+    --quantized_accuracy=0.01
+    --fuse_multi_gru=${fuse_multi_gru}
+    --iterations=4)
 endfunction()
 
-function(preprocess_data2bin_test_run target py_script_source data_dir output_file)
-	py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source}
-	        ARGS --data_dir=${data_dir}
-		     --output_file=${output_file}
-		     --local)
+# Wrap py_test for a script located in the current source directory, passing
+# --data_dir, --output_file and --local.
+function(preprocess_data2bin_test_run target py_script_source data_dir
+         output_file)
+  py_test(${target}
+          SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source} ARGS
+               --data_dir=${data_dir} --output_file=${output_file} --local)
 endfunction()
 
+# The RNN1 and seq_pool1 analyzer tests are only set up when not on Apple and
+# WITH_MKLML is on; otherwise one warning per test group is emitted.
 if(NOT APPLE AND WITH_MKLML)
-    # RNN1
-    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
-    download_model_and_data_without_verify(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
-    
-    # seq_pool1
-    set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
-    download_model_and_data_without_verify(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc)
-    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc)
-    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc)
-    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc)
-    inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc)
-    if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
-        set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120)
-    endif()
+  # RNN1
+  set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
+  download_model_and_data_without_verify(
+    ${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz")
+  inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR}
+                              analyzer_rnn1_tester.cc)
+
+  # seq_pool1
+  # All five seq_pool1 tests below share the same install directory.
+  set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
+  download_model_and_data_without_verify(
+    ${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz"
+    "seq_pool1_data.txt.tar.gz")
+  inference_analysis_api_test(
+    test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR}
+    analyzer_seq_pool1_compare_determine_tester.cc)
+  inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR}
+                              analyzer_seq_pool1_compare_tester.cc)
+  inference_analysis_api_test(
+    test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR}
+    analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc)
+  inference_analysis_api_test(
+    test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR}
+    analyzer_seq_pool1_fuse_statis_tester.cc)
+  inference_analysis_api_test(
+    test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR}
+    analyzer_seq_pool1_profile_tester.cc)
+  # Give each seq_pool1 test a 120 second timeout, except on Windows or when
+  # the CI_SKIP_CPP_TEST environment variable is "ON".
+  if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+    set_tests_properties(test_analyzer_seq_pool1_compare_determine
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT
+                                                                        120)
+    set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120)
+  endif()
 else()
-    # TODO: fix this test on MACOS and OPENBLAS, the reason is that
-    # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
-    message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1")
-    message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1")
+  # TODO: fix this test on MACOS and OPENBLAS, the reason is that
+  # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
+  message(
+    WARNING
+      "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1"
+  )
+  message(
+    WARNING
+      "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1"
+  )
 endif()
 
-
 # RNN2
+# Download the rnn2 model and data without checksum verification, then set up
+# test_analyzer_rnn2 from analyzer_rnn2_tester.cc.
 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
-download_model_and_data_without_verify(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
+download_model_and_data_without_verify(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz"
+                                       "rnn2_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR}
+                            analyzer_rnn2_tester.cc)
 
 # TODO(luotao, Superjom) Disable DAM test, temporarily fix
 # https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
 # After inference framework refactor, will reopen it.
 # normal DAM
+# Only the download is active here; the test call below is commented out.
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
-download_model_and_data_without_verify(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
+download_model_and_data_without_verify(${DAM_INSTALL_DIR} "DAM_model.tar.gz"
+                                       "DAM_data.txt.tar.gz")
 #inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator)
 
 # small DAM
+# Calls inference_analysis_test directly because no --refer_result argument
+# is passed for this test.
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
-download_model_and_data_without_verify(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt)
-
-#save model 
-inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} analyzer_save_model_tester.cc)
+download_model_and_data_without_verify(
+  ${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
+inference_analysis_test(
+  test_analyzer_small_dam
+  SRCS
+  analyzer_dam_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${DAM_SMALL_INSTALL_DIR}/model
+  --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt)
+
+# save model (uses the small DAM install directory set above)
+inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR}
+                            analyzer_save_model_tester.cc)
 
 # chinese_ner
+# Download the model and data without checksum verification, then set up
+# test_analyzer_ner from analyzer_ner_tester.cc.
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
-download_model_and_data_without_verify(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc)
+download_model_and_data_without_verify(
+  ${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz"
+  "chinese_ner-data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR}
+                            analyzer_ner_tester.cc)
 
 # lac
 set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
-download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" 419ca6eb85f57a01bfe173591910aec5 "lac_data.txt.tar.gz" 9983539cd6b34fbdc411e43422776bfd)
-inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
+download_model_and_data(
+  ${LAC_INSTALL_DIR} "lac_model.tar.gz" 419ca6eb85f57a01bfe173591910aec5
+  "lac_data.txt.tar.gz" 9983539cd6b34fbdc411e43422776bfd)
+inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR}
+                            analyzer_lac_tester.cc)
 
 # Pyramid DNN
 set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn")
-download_model_and_data_without_verify(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc)
+download_model_and_data_without_verify(
+  ${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz"
+  "PyramidDNN_data.txt.tar.gz")
+inference_analysis_api_test(
+  test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR}
+  analyzer_pyramid_dnn_tester.cc)
 
 # Ernie
 set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie")
-download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1)
-download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62)
-if (WITH_GPU)
-    inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc)
+download_model_and_data(
+  ${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6
+  "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1)
+download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz"
+                73beea65abda2edb61c1662cd3180c62)
+if(WITH_GPU)
+  inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR}
+                              analyzer_ernie_tester.cc)
 endif()
-inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc)
+inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR}
+                                 analyzer_ernie_int8_tester.cc)
 
 # Ernie large
 set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large")
-download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" edb2113eec93783cad56ed76d47ba57f)
-download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" 1facda98eef1085dc9d435ebf3f23a73)
-inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-    ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true)
-if(NOT WIN32 AND NOT APPLE AND TEST test_analyzer_ernie_large)
-    set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
+download_model_and_data(
+  ${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz"
+  af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz"
+  edb2113eec93783cad56ed76d47ba57f)
+download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz"
+                1facda98eef1085dc9d435ebf3f23a73)
+inference_analysis_test(
+  test_analyzer_ernie_large
+  SRCS
+  analyzer_ernie_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${ERNIE_INSTALL_DIR}/model
+  --infer_data=${ERNIE_INSTALL_DIR}/data.txt
+  --refer_result=${ERNIE_INSTALL_DIR}/result.txt
+  --ernie_large=true)
+if(NOT WIN32
+   AND NOT APPLE
+   AND TEST test_analyzer_ernie_large)
+  set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS
+                                                            "RUN_TYPE=NIGHTLY")
 endif()
-if (WIN32 AND TEST test_analyzer_ernie_large)
-    set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200)
+if(WIN32 AND TEST test_analyzer_ernie_large)
+  set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200)
 endif()
 
 # text_classification
-set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
-download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" 3f0f440313ca50e26184e65ffd5809ab "text_classification_data.txt.tar.gz" 36ae620020cc3377f45ed330dd36238f)
-inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
+set(TEXT_CLASSIFICATION_INSTALL_DIR
+    "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
+download_model_and_data(
+  ${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz"
+  3f0f440313ca50e26184e65ffd5809ab "text_classification_data.txt.tar.gz"
+  36ae620020cc3377f45ed330dd36238f)
+inference_analysis_api_test(
+  test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR}
+  analyzer_text_classification_tester.cc)
 
 # seq_conv1
 set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
-download_model_and_data_without_verify(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
+download_model_and_data_without_verify(
+  ${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR}
+                            analyzer_seq_conv1_tester.cc)
 
 # transformer, the dataset only works on batch_size=8 now
 set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
-download_model_and_data_without_verify(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc 
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
-       --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
-inference_analysis_test(test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc 
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
-       --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
-inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc 
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
-       --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+download_model_and_data_without_verify(
+  ${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz"
+  "temp/transformer_data.txt.tar.gz")
+inference_analysis_test(
+  test_analyzer_transformer
+  SRCS
+  analyzer_transformer_compare_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${TRANSFORMER_INSTALL_DIR}/model
+  --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt
+  --batch_size=8
+  --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(
+  test_analyzer_transformer_fuse
+  SRCS
+  analyzer_transformer_fuse_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${TRANSFORMER_INSTALL_DIR}/model
+  --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt
+  --batch_size=8
+  --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(
+  test_analyzer_transformer_profile
+  SRCS
+  analyzer_transformer_profile_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${TRANSFORMER_INSTALL_DIR}/model
+  --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt
+  --batch_size=8
+  --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
 
 # VIT-OCR
 set(VIT_OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/vit")
-if (NOT EXISTS ${VIT_OCR_INSTALL_DIR}/vit_ocr.tgz)
-    inference_download_and_uncompress_without_verify(${VIT_OCR_INSTALL_DIR} ${INFERENCE_URL} "ocr/vit_ocr.tgz")
+if(NOT EXISTS ${VIT_OCR_INSTALL_DIR}/vit_ocr.tgz)
+  inference_download_and_uncompress_without_verify(
+    ${VIT_OCR_INSTALL_DIR} ${INFERENCE_URL} "ocr/vit_ocr.tgz")
 endif()
-inference_analysis_test(test_analyzer_vit_ocr SRCS analyzer_vit_ocr_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr/model --infer_data=${VIT_OCR_INSTALL_DIR}/vit_ocr/datavit.txt)
+inference_analysis_test(
+  test_analyzer_vit_ocr
+  SRCS
+  analyzer_vit_ocr_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr/model
+  --infer_data=${VIT_OCR_INSTALL_DIR}/vit_ocr/datavit.txt)
 
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
-if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz)
-    inference_download_and_uncompress_without_verify(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz")
+if(NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz)
+  inference_download_and_uncompress_without_verify(
+    ${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/"
+    "inference-vis-demos/ocr.tar.gz")
 endif()
-inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR}
+                            analyzer_vis_tester.cc)
 
 # densebox
 set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox")
 download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz")
-inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc 
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt 
-       --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
+inference_analysis_test(
+  test_analyzer_detect_functional_mkldnn
+  SRCS
+  analyzer_detect_functional_mkldnn_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${DENSEBOX_INSTALL_DIR}/model
+  --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
+  --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
 
 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
-if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz)
-    inference_download_and_uncompress_without_verify(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz")
+if(NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz)
+  inference_download_and_uncompress_without_verify(
+    ${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/"
+    "inference-vis-demos/mobilenet.tar.gz")
 endif()
-inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
+inference_analysis_api_test(test_analyzer_mobilenet_transpose
+                            ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
 
 ### Image classification tests with fake data
 set(IMG_CLASS_TEST_APP "test_analyzer_image_classification")
 set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc")
 
 # build test binary to be used in subsequent tests
-inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} ${IMG_CLASS_TEST_APP_SRC})
+inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP}
+                                                 ${IMG_CLASS_TEST_APP_SRC})
 
 # googlenet
 set(GOOGLENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/googlenet")
 download_data_without_verify(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz")
-inference_analysis_api_test_with_fake_data_run(test_analyzer_googlenet ${IMG_CLASS_TEST_APP}
-	${GOOGLENET_MODEL_DIR} false)
+inference_analysis_api_test_with_fake_data_run(
+  test_analyzer_googlenet ${IMG_CLASS_TEST_APP} ${GOOGLENET_MODEL_DIR} false)
 
 # resnet50
 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
 download_data_without_verify(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz")
-inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP}
-	${RESNET50_MODEL_DIR} true)
-if (WIN32)
-    set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 200)
+inference_analysis_api_test_with_fake_data_run(
+  test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} ${RESNET50_MODEL_DIR} true)
+if(WIN32)
+  set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 200)
 endif()
 
-
 # mobilenet with depthwise_conv op
-set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv")
+set(MOBILENET_MODEL_DIR
+    "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv")
 download_data_without_verify(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz")
-inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP}
-	${MOBILENET_MODEL_DIR} false)
+inference_analysis_api_test_with_fake_data_run(
+  test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP}
+  ${MOBILENET_MODEL_DIR} false)
 
 if(WITH_MKLDNN)
 
@@ -418,97 +601,135 @@ if(WITH_MKLDNN)
   set(IMAGENET_DATA_ARCHIVE "imagenet_val_100_tail.tar.gz")
   set(IMAGENET_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/imagenet")
   set(IMAGENET_DATA_PATH "${IMAGENET_DATA_DIR}/data.bin")
-  download_int8_data_without_verify(${IMAGENET_DATA_DIR} ${IMAGENET_DATA_ARCHIVE})
+  download_int8_data_without_verify(${IMAGENET_DATA_DIR}
+                                    ${IMAGENET_DATA_ARCHIVE})
 
   # build test binary to be used in subsequent tests
   set(INT8_IMG_CLASS_TEST_APP "test_analyzer_int8_image_classification")
-  set(INT8_IMG_CLASS_TEST_APP_SRC "analyzer_int8_image_classification_tester.cc")
-  inference_analysis_api_test_build(${INT8_IMG_CLASS_TEST_APP} ${INT8_IMG_CLASS_TEST_APP_SRC})
+  set(INT8_IMG_CLASS_TEST_APP_SRC
+      "analyzer_int8_image_classification_tester.cc")
+  inference_analysis_api_test_build(${INT8_IMG_CLASS_TEST_APP}
+                                    ${INT8_IMG_CLASS_TEST_APP_SRC})
 
   # resnet50 int8
   set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
-  download_int8_data_without_verify(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH})
+  download_int8_data_without_verify(${INT8_RESNET50_MODEL_DIR}
+                                    "resnet50_int8_model.tar.gz")
+  inference_analysis_api_int8_test_run(
+    test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP}
+    ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH})
 
   # mobilenetv1 int8
   set(INT8_MOBILENETV1_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1")
-  download_int8_data_without_verify(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
+  download_int8_data_without_verify(${INT8_MOBILENETV1_MODEL_DIR}
+                                    "mobilenetv1_int8_model.tar.gz")
+  inference_analysis_api_int8_test_run(
+    test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP}
+    ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH})
+
   # mobilenetv2 int8
   set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2")
-  download_int8_data_without_verify(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
+  download_int8_data_without_verify(${INT8_MOBILENETV2_MODEL_DIR}
+                                    "mobilenet_v2_int8_model.tar.gz")
+  inference_analysis_api_int8_test_run(
+    test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP}
+    ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH})
+
   # resnet101 int8
   # TODO(grygielski) Enable after MKL-DNN 1.0 merge
   set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101")
-  download_int8_data_without_verify(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" )
-#   inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
+  download_int8_data_without_verify(${INT8_RESNET101_MODEL_DIR}
+                                    "Res101_int8_model.tar.gz")
+  #   inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH})
+
   # vgg16 int8
   # TODO(grygielski) Enable after MKL-DNN 1.0 merge
   set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16")
-  download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" )
-#  inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH})
- 
+  download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR}
+                                    "VGG16_int8_model.tar.gz")
+  #  inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH})
+
   # vgg19 int8
   # TODO(grygielski) Enable after MKL-DNN 1.0 merge
   set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19")
-  download_int8_data_without_verify(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" )
-#   inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH})
+  download_int8_data_without_verify(${INT8_VGG19_MODEL_DIR}
+                                    "VGG19_int8_model.tar.gz")
+  #   inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH})
 
   # googlenet int8
   set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet")
-  download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" )
-  inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10)
-
- # mobilenetv3_large_x1_0 int8
- set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large")
- set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar")
- if (NOT EXISTS ${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME})
-    inference_download_and_uncompress_without_verify(${INT8_MOBILENETV3_LARGE_MODEL_DIR} "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/" ${INT8_MOBILENETV3_FILE_NAME})
- endif()
- inference_analysis_test_run(test_analyzer_int8_mobilenetv3_large 
-    COMMAND ${INT8_IMG_CLASS_TEST_APP} 
-    ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer
-        --infer_data=${IMAGENET_DATA_PATH} 
-        --warmup_batch_size=50
-        --batch_size=1
-        --enable_int8=true 
-        --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} 
-        --iterations=100
-        --with_accuracy_layer=false)
+  download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR}
+                                    "GoogleNet_int8_model.tar.gz")
+  inference_analysis_api_int8_test_run_custom_warmup_batch_size(
+    test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP}
+    ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10)
+
+  # mobilenetv3_large_x1_0 int8
+  set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large")
+  set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar")
+  if(NOT EXISTS
+     ${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME})
+    inference_download_and_uncompress_without_verify(
+      ${INT8_MOBILENETV3_LARGE_MODEL_DIR}
+      "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/"
+      ${INT8_MOBILENETV3_FILE_NAME})
+  endif()
+  inference_analysis_test_run(
+    test_analyzer_int8_mobilenetv3_large
+    COMMAND
+    ${INT8_IMG_CLASS_TEST_APP}
+    ARGS
+    --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer
+    --infer_data=${IMAGENET_DATA_PATH}
+    --warmup_batch_size=50
+    --batch_size=1
+    --enable_int8=true
+    --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --iterations=100
+    --with_accuracy_layer=false)
 
   ### BFLOAT16 tests
 
   # build test binary to be used in subsequent tests
   set(BF16_IMG_CLASS_TEST_APP "test_analyzer_bfloat16_image_classification")
-  set(BF16_IMG_CLASS_TEST_APP_SRC "analyzer_bfloat16_image_classification_tester.cc")
-  inference_analysis_api_test_build(${BF16_IMG_CLASS_TEST_APP} ${BF16_IMG_CLASS_TEST_APP_SRC})
+  set(BF16_IMG_CLASS_TEST_APP_SRC
+      "analyzer_bfloat16_image_classification_tester.cc")
+  inference_analysis_api_test_build(${BF16_IMG_CLASS_TEST_APP}
+                                    ${BF16_IMG_CLASS_TEST_APP_SRC})
 
   # resnet50 bfloat16
-  inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_resnet50 ${BF16_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH})
-  
+  inference_analysis_api_bfloat16_test_run(
+    test_analyzer_bfloat16_resnet50 ${BF16_IMG_CLASS_TEST_APP}
+    ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH})
+
   # googlenet bfloat16
-  inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_googlenet ${BF16_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH})
+  inference_analysis_api_bfloat16_test_run(
+    test_analyzer_bfloat16_googlenet ${BF16_IMG_CLASS_TEST_APP}
+    ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH})
 
   # mobilenetv1 bfloat16
-  inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv1 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH})
+  inference_analysis_api_bfloat16_test_run(
+    test_analyzer_bfloat16_mobilenetv1 ${BF16_IMG_CLASS_TEST_APP}
+    ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH})
 
   # mobilenetv2 bfloat16
-  inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH})
-
-  # mobilenetv3_large 
-  inference_analysis_test_run(test_analyzer_bfloat16_mobilenetv3_large
-  COMMAND ${BF16_IMG_CLASS_TEST_APP}
-      ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer
-        --infer_data=${IMAGENET_DATA_PATH} 
-        --batch_size=1
-        --enable_bf16=true
-        --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
-        --iterations=100
-        --with_accuracy_layer=false)
+  inference_analysis_api_bfloat16_test_run(
+    test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP}
+    ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH})
+
+  # mobilenetv3_large bfloat16
+  inference_analysis_test_run(
+    test_analyzer_bfloat16_mobilenetv3_large
+    COMMAND
+    ${BF16_IMG_CLASS_TEST_APP}
+    ARGS
+    --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer
+    --infer_data=${IMAGENET_DATA_PATH}
+    --batch_size=1
+    --enable_bf16=true
+    --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
+    --iterations=100
+    --with_accuracy_layer=false)
 
   ### Object detection models
   set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin")
@@ -516,21 +737,25 @@ if(WITH_MKLDNN)
   set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc")
 
   # download dataset if necessary
-  download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz")
-
+  download_int8_data_without_verify(${INT8_DATA_DIR}
+                                    "pascalvoc_val_head_300.tar.gz")
 
   # build test binary to be used in subsequent tests
-  inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC})
+  inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP}
+                                    ${INT8_OBJ_DETECT_TEST_APP_SRC})
 
   # mobilenet-ssd int8
   set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd")
-  download_int8_data_without_verify(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
-  inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
+  download_int8_data_without_verify(${INT8_MOBILENET_SSD_MODEL_DIR}
+                                    "mobilenet_ssd_int8_model.tar.gz")
+  inference_analysis_api_object_dection_int8_test_run(
+    test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP}
+    ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
 
   ### Lexcial analysis GRU model
   set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
-  download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_data.tar.gz")
-  download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
+  download_gru_data_without_verify("${GRU_PATH}" "GRU_eval_data.tar.gz")
+  download_gru_data_without_verify("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
   set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
   set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2")
   set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
@@ -539,266 +764,497 @@ if(WITH_MKLDNN)
   # build test binary to be used in subsequent tests
   inference_analysis_api_test_build(${LEXICAL_TEST_APP} ${LEXICAL_TEST_APP_SRC})
   # run lexcial analysis test
-  inference_analysis_api_lexical_test_run(test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH})
+  inference_analysis_api_lexical_test_run(
+    test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH}
+    ${GRU_DATA_PATH})
   # run bfloat16 lexical analysis test
-  inference_analysis_api_lexical_bfloat16_test_run(test_analyzer_lexical_gru_bfloat16 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH})
+  inference_analysis_api_lexical_bfloat16_test_run(
+    test_analyzer_lexical_gru_bfloat16 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH}
+    ${GRU_DATA_PATH})
   # run post-training quantization lexical analysis test
-  inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} false)
-  # run post-training quantization lexical analysis test with multi_gru fuse 
-  inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8_multi_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} true)
+  inference_analysis_api_lexical_int8_test_run(
+    test_analyzer_lexical_gru_int8 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH}
+    ${GRU_DATA_PATH} false)
+  # run post-training quantization lexical analysis test with multi_gru fuse
+  inference_analysis_api_lexical_int8_test_run(
+    test_analyzer_lexical_gru_int8_multi_gru ${LEXICAL_TEST_APP}
+    ${GRU_MODEL_PATH} ${GRU_DATA_PATH} true)
 
   ### optimized FP32 vs. Quant INT8 tests
-  
+
   set(QUANT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
   set(QUANT_IMG_CLASS_TEST_APP "test_analyzer_quant_image_classification")
-  set(QUANT_IMG_CLASS_TEST_APP_SRC "analyzer_quant_image_classification_tester.cc")
+  set(QUANT_IMG_CLASS_TEST_APP_SRC
+      "analyzer_quant_image_classification_tester.cc")
 
   # build test binary to be used in subsequent tests
-  inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC})
+  inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP}
+                                    ${QUANT_IMG_CLASS_TEST_APP_SRC})
 
   # MobileNetV1 FP32 vs. Quant INT8
   # The FP32 model should already be downloaded for slim Quant unit tests on Linux
   set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2")
-  set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8")
+  set(QUANT2_INT8_MobileNetV1_MODEL_DIR
+      "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8")
   if(NOT LINUX)
-      download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz")
+    download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR}
+                                       "MobileNet_qat_perf.tar.gz")
   endif(NOT LINUX)
-  download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz")
-  inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH} false)
+  download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR}
+                                     "MobileNet_qat_perf_int8.tar.gz")
+  inference_analysis_api_quant_test_run(
+    test_analyzer_quant_performance_benchmark
+    ${QUANT_IMG_CLASS_TEST_APP}
+    ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float
+    ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8
+    ${IMAGENET_DATA_PATH}
+    false)
 
   # Quant2 MobileNetV1
-  inference_analysis_api_quant_test_run(test_analyzer_quant2_mobilenetv1_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${IMAGENET_DATA_PATH} true)
+  inference_analysis_api_quant_test_run(
+    test_analyzer_quant2_mobilenetv1_mkldnn
+    ${QUANT_IMG_CLASS_TEST_APP}
+    ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float
+    ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float
+    ${IMAGENET_DATA_PATH}
+    true)
 
   # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
   # with weight scales in `fake_channel_wise_dequantize_max_abs` operators
-  set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_DATA_DIR}/ResNet50_quant2_channelwise")
-  set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz")
+  set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR
+      "${QUANT_DATA_DIR}/ResNet50_quant2_channelwise")
+  set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE
+      "ResNet50_qat_channelwise.tar.gz")
   if(NOT LINUX)
-      download_quant_data_without_verify(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE})
+    download_quant_data_without_verify(
+      ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}
+      ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE})
   endif(NOT LINUX)
-  set(QUANT2_RESNET50_MODEL ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise)
-  inference_analysis_api_quant_test_run(test_analyzer_quant2_resnet50_channelwise_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_RESNET50_MODEL} ${QUANT2_RESNET50_MODEL} ${IMAGENET_DATA_PATH} true)
+  set(QUANT2_RESNET50_MODEL
+      ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise)
+  inference_analysis_api_quant_test_run(
+    test_analyzer_quant2_resnet50_channelwise_mkldnn
+    ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_RESNET50_MODEL}
+    ${QUANT2_RESNET50_MODEL} ${IMAGENET_DATA_PATH} true)
 
   ### Other tests
- 
+
   # MKLDNN quantizer config
   set(MKLDNN_QUANTIZER_CONFIG_TEST_APP "test_mkldnn_quantizer_config")
   set(MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC "mkldnn_quantizer_config_tester.cc")
-  inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP} ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC})
-  inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP})
+  inference_analysis_api_test_build(${MKLDNN_QUANTIZER_CONFIG_TEST_APP}
+                                    ${MKLDNN_QUANTIZER_CONFIG_TEST_APP_SRC})
+  inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND
+                              ${MKLDNN_QUANTIZER_CONFIG_TEST_APP})
 
   # preprocess data2bin imagenet
-    download_int8_data_without_verify(${INT8_DATA_DIR} "imagenet_small.tar.gz")
-    set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small")
-    set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin")
-    preprocess_data2bin_test_run(preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE})
-    
+  download_int8_data_without_verify(${INT8_DATA_DIR} "imagenet_small.tar.gz")
+  set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small")
+  set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin")
+  preprocess_data2bin_test_run(
+    preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py"
+    ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE})
+
   # preprocess data2bin pascalvoc
   download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
   set(PASCALVOC_SMALL_DATA_DIR "${INT8_DATA_DIR}/pascalvoc_small")
   set(PASCALVOC_SMALL_OUTPUT_FILE "pascalvoc_small.bin")
-  preprocess_data2bin_test_run(preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE})
+  preprocess_data2bin_test_run(
+    preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py"
+    ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE})
 
 endif()
 
 # bert, max_len=20, embedding_dim=128
 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
-download_model_and_data_without_verify(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
-if (WITH_GPU)
-    inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc)
+download_model_and_data_without_verify(
+  ${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
+if(WITH_GPU)
+  inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR}
+                              analyzer_bert_tester.cc)
 endif()
 
 # multiple models prediction
 set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction")
-download_data_without_verify(${MMP_INSTALL_DIR} PaddleInference/mobilenet_v2_models.tar.gz) 
-inference_multiple_models_analysis_api_test(test_analyzer_multi_model_prediction ${MMP_INSTALL_DIR} analyzer_mmp_tester.cc)
+download_data_without_verify(${MMP_INSTALL_DIR}
+                             PaddleInference/mobilenet_v2_models.tar.gz)
+inference_multiple_models_analysis_api_test(
+  test_analyzer_multi_model_prediction ${MMP_INSTALL_DIR}
+  analyzer_mmp_tester.cc)
 
 if(WITH_GPU AND TENSORRT_FOUND)
-    set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models")
-    if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz)
-        inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz" 3dcccdc38b549b6b1b4089723757bd98)
-    endif()
-    set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test")
-    if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz)
-        inference_download_and_uncompress_without_verify(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz")
-    endif()
-    inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_resnet50_test SRCS trt_resnet50_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_resnext_test SRCS trt_resnext_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_cascade_rcnn_test SRCS trt_cascade_rcnn_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(trt_split_converter_test SRCS trt_split_converter_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/)
-    inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-    inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
-            
-    set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
-    if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz)
-        inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz")
-    endif()
-    inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR})
-
-    set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware")
-    if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz)
-        inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz")
-    endif()
-    inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR})
-
-    set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic")
-    if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz)
-        inference_download_and_uncompress_without_verify(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz")
-    endif()
-
-    set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu")
-    if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz)
-        inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz" 2a5e8791e47b221b4f782151d76da9c6)
-    endif()
-    inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${TRT_MODEL_INSTALL_DIR})
-
-    set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test")
-    if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz)
-        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz" 5fa371efa75706becbaad79195d2ca68)
-    endif()
-
-    inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)
-
-    set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune")
-    if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz)
-        inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz" 77b56dc73ff0cf44ddb1ce9ca0b0f471)
-    endif()
-
-    inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune)
-
-    if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz)
-        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz" 833d73fc6a7f7e1ee4a1fd6419209e55)
-    endif()
-
-    inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
-
-    if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz)
-        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz" c5ff2d0cad79953ffbf2b8b9e2fae6e4)
-    endif()
-
-    inference_analysis_test(test_trt_dynamic_shape_ernie_fp16_ser_deser SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized)
+  set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models")
+  if(NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz)
+    inference_download_and_uncompress(
+      ${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test
+      "trt_inference_test_models.tar.gz" 3dcccdc38b549b6b1b4089723757bd98)
+  endif()
+  set(TEST_SPLIT_CONVERTER_MODEL
+      "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test")
+  if(NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz)
+    inference_download_and_uncompress_without_verify(
+      ${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test
+      "split_converter.tgz")
+  endif()
+  inference_analysis_test(
+    trt_mobilenet_test
+    SRCS
+    trt_mobilenet_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    trt_resnet50_test
+    SRCS
+    trt_resnet50_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    trt_resnext_test
+    SRCS
+    trt_resnext_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    trt_fc_prelu_test
+    SRCS
+    trt_fc_prelu_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    trt_cascade_rcnn_test
+    SRCS
+    trt_cascade_rcnn_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    trt_split_converter_test
+    SRCS
+    trt_split_converter_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/)
+  inference_analysis_test(
+    test_analyzer_capi_exp_gpu
+    SRCS
+    analyzer_capi_exp_gpu_tester.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    paddle_inference_c
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    test_analyzer_capi_exp_xpu
+    SRCS
+    analyzer_capi_exp_xpu_tester.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    paddle_inference_c
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+
+  set(TRT_MODEL_QUANT_RESNET_DIR
+      "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model")
+  if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz)
+    inference_download_and_uncompress_without_verify(
+      ${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test
+      "small_quant_model.tgz")
+  endif()
+  inference_analysis_test(
+    trt_quant_int8_test
+    SRCS
+    trt_quant_int8_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_QUANT_RESNET_DIR})
+
+  set(TRT_MODEL_QUANT_YOLOV3_DIR
+      "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware")
+  if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz)
+    inference_download_and_uncompress_without_verify(
+      ${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test
+      "yolov3_r50_quant_aware.tgz")
+  endif()
+  inference_analysis_test(
+    trt_quant_int8_yolov3_r50_test
+    SRCS
+    trt_quant_int8_yolov3_r50_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR})
+
+  set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic")
+  if(NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz)
+    inference_download_and_uncompress_without_verify(
+      ${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test
+      "complex_model_dynamic2.tar.gz")
+  endif()
+
+  set(TEST_TRT_DYNAMIC_MODEL
+      "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu")
+  if(NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz)
+    inference_download_and_uncompress(
+      ${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test
+      "conv_bn_swish_split_gelu.tar.gz" 2a5e8791e47b221b4f782151d76da9c6)
+  endif()
+  inference_analysis_test(
+    trt_dynamic_shape_test
+    SRCS
+    trt_dynamic_shape_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR})
+
+  set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test")
+  if(NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz)
+    inference_download_and_uncompress(
+      ${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test
+      "ernie_model_4.tar.gz" 5fa371efa75706becbaad79195d2ca68)
+  endif()
+
+  inference_analysis_test(
+    test_trt_dynamic_shape_ernie
+    SRCS
+    trt_dynamic_shape_ernie_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)
+
+  set(TEST_TRT_TRANSFORMER_PRUNE_MODEL
+      "${TRT_MODEL_INSTALL_DIR}/transformer_prune")
+  if(NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz)
+    inference_download_and_uncompress(
+      ${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test
+      "transformer_prune.tar.gz" 77b56dc73ff0cf44ddb1ce9ca0b0f471)
+  endif()
+
+  inference_analysis_test(
+    test_trt_dynamic_shape_transformer_prune
+    SRCS
+    trt_dynamic_shape_transformer_prune_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune)
+
+  if(NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz)
+    inference_download_and_uncompress(
+      ${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test
+      "ernie_model_4_unserialized.tgz" 833d73fc6a7f7e1ee4a1fd6419209e55)
+  endif()
+
+  inference_analysis_test(
+    test_trt_dynamic_shape_ernie_ser_deser
+    SRCS
+    trt_dynamic_shape_ernie_serialize_deserialize_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
+
+  if(NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz)
+    inference_download_and_uncompress(
+      ${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test
+      "ernie_model_4_fp16_unserialized.tgz" c5ff2d0cad79953ffbf2b8b9e2fae6e4)
+  endif()
+
+  inference_analysis_test(
+    test_trt_dynamic_shape_ernie_fp16_ser_deser
+    SRCS
+    trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized)
 
 endif()
 
 set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
 download_data_without_verify(${LITE_MODEL_INSTALL_DIR} "mul_model_fp32.tgz")
 
-inference_analysis_test(lite_mul_model_test SRCS lite_mul_model_test.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${LITE_MODEL_INSTALL_DIR})
-inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${RESNET50_MODEL_DIR})
-
-inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-        ARGS --infer_model=${RESNET50_MODEL_DIR}/model)
-
-inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-        ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model)
-
-inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-        ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model)
-
-if (NOT APPLE AND NOT WIN32)
-    inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-            ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model)
+inference_analysis_test(
+  lite_mul_model_test
+  SRCS
+  lite_mul_model_test.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${LITE_MODEL_INSTALL_DIR})
+inference_analysis_test(
+  lite_resnet50_test
+  SRCS
+  lite_resnet50_test.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${RESNET50_MODEL_DIR})
+
+inference_analysis_test(
+  test_analyzer_capi_exp
+  SRCS
+  analyzer_capi_exp_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  paddle_inference_c
+  ARGS
+  --infer_model=${RESNET50_MODEL_DIR}/model)
+
+inference_analysis_test(
+  test_analyzer_capi_exp_pd_config
+  SRCS
+  analyzer_capi_exp_pd_config_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  paddle_inference_c
+  ARGS
+  --infer_model=${MOBILENET_INSTALL_DIR}/model)
+
+inference_analysis_test(
+  test_analyzer_capi_exp_pd_tensor
+  SRCS
+  analyzer_capi_exp_pd_tensor_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  paddle_inference_c
+  ARGS
+  --infer_model=${MOBILENET_INSTALL_DIR}/model)
+
+if(NOT APPLE AND NOT WIN32)
+  inference_analysis_test(
+    test_analyzer_capi_exp_pd_threads
+    SRCS
+    analyzer_capi_exp_pd_threads_tester.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    paddle_inference_c
+    ARGS
+    --infer_model=${MOBILENET_INSTALL_DIR}/model)
 endif()
-inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${OCR_INSTALL_DIR}/model)        
-
-if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
-    inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-            ARGS --infer_model=${OCR_INSTALL_DIR}/model)
+inference_analysis_test(
+  test_analyzer_zerocopytensor_tensor
+  SRCS
+  analyzer_zerocopy_tensor_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${OCR_INSTALL_DIR}/model)
+
+if(WITH_DISTRIBUTE
+   AND WITH_PSCORE
+   AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+  inference_analysis_test(
+    test_analyzer_dist_model
+    SRCS
+    analyzer_dist_model_tester.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${OCR_INSTALL_DIR}/model)
 endif()
 
-inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-            ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt)    
-            
+inference_analysis_test(
+  test_analyzer_paddletensor_tensor
+  SRCS
+  analyzer_paddle_tensor_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  ARGS
+  --infer_model=${OCR_INSTALL_DIR}/model
+  --infer_data=${OCR_INSTALL_DIR}/data.txt
+  --refer_result=${OCR_INSTALL_DIR}/result.txt)
+
 if(WITH_MKLDNN)
-  inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc
-            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-            ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model)
+  inference_analysis_test(
+    test_analyzer_capi_exp_int
+    SRCS
+    analyzer_capi_exp_int_tester.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    paddle_inference_c
+    ARGS
+    --infer_model=${INT8_DATA_DIR}/resnet50/model)
 endif()
 
-inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc 
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c
-        ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model)
+inference_analysis_test(
+  test_analyzer_capi_exp_ner
+  SRCS
+  analyzer_capi_exp_ner_tester.cc
+  EXTRA_DEPS
+  ${INFERENCE_EXTRA_DEPS}
+  paddle_inference_c
+  ARGS
+  --infer_model=${CHINESE_NER_INSTALL_DIR}/model)
 
 if(WITH_GPU)
-    inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${RESNET50_MODEL_DIR})
-
-    inference_analysis_test(paddle_infer_api_copy_tensor_tester SRCS paddle_infer_api_copy_tensor_tester.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${RESNET50_MODEL_DIR})
-    set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT 30)
+  inference_analysis_test(
+    paddle_infer_api_test
+    SRCS
+    paddle_infer_api_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${RESNET50_MODEL_DIR})
+
+  inference_analysis_test(
+    paddle_infer_api_copy_tensor_tester
+    SRCS
+    paddle_infer_api_copy_tensor_tester.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${RESNET50_MODEL_DIR})
+  set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT
+                                                                      30)
 endif()
 
-cc_test(paddle_infer_api_errors_test SRCS paddle_infer_api_errors_tester.cc DEPS paddle_inference_api)
+cc_test(
+  paddle_infer_api_errors_test
+  SRCS paddle_infer_api_errors_tester.cc
+  DEPS paddle_inference_api)
 
 if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
-    return()
+  return()
 endif()
 
 if(WITH_GPU AND TENSORRT_FOUND)
-    set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300)
-    set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300)
-    set_tests_properties(trt_resnet50_test PROPERTIES TIMEOUT 300)
-    set_tests_properties(trt_cascade_rcnn_test PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_trt_dynamic_shape_ernie_ser_deser PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_trt_dynamic_shape_ernie_fp16_ser_deser PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 300)
+  set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300)
+  set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300)
+  set_tests_properties(trt_resnet50_test PROPERTIES TIMEOUT 300)
+  set_tests_properties(trt_cascade_rcnn_test PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_trt_dynamic_shape_ernie_ser_deser PROPERTIES TIMEOUT
+                                                                         300)
+  set_tests_properties(test_trt_dynamic_shape_ernie_fp16_ser_deser
+                       PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 300)
 endif()
 
 if(WITH_MKLDNN)
-    set_tests_properties(test_analyzer_int8_resnet50 PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_int8_mobilenet_ssd PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_quant2_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_quant2_resnet50_channelwise_mkldnn PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_int8_resnet50 PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_int8_mobilenet_ssd PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_quant_performance_benchmark
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT
+                                                                       120)
+  set_tests_properties(test_analyzer_quant2_mobilenetv1_mkldnn
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_quant2_resnet50_channelwise_mkldnn
+                       PROPERTIES TIMEOUT 120)
 endif()
 
 set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120)
@@ -809,45 +1265,74 @@ set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120)
 set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120)
 set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120)
 set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120)
-set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120)
-if (WITH_GPU)
-    set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120)
+set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT
+                                                                       120)
+if(WITH_GPU)
+  set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120)
 endif()
 if(WITH_GPU AND TENSORRT_FOUND)
-    set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120)
-    if(WITH_MKLDNN)
-        set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120)
-    endif()
+  set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120)
+  if(WITH_MKLDNN)
+    set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120)
+  endif()
 endif()
 if(ON_INFER OR WITH_GPU)
-    set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120)
 endif()
 
-if (WITH_IPU)
-    #word2vec sample
-    set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model")
-    inference_analysis_test(ipu_word2vec_sample SRCS ipu_word2vec_sample.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${WORD2VEC_INSTALL_DIR})
-
-    # ERNIE
-    set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie")
-    inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR} ipu_ernie_test.cc
-            ARGS --warmup=true --repeat=10)
-    inference_analysis_api_test(ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc
-            ARGS --warmup=true --repeat=10)
-
-    # Resnet50
-    set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
-    inference_analysis_test(ipu_resnet50_test SRCS ipu_resnet50_test.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10)
-    inference_analysis_test(ipu_resnet50_fp16_test SRCS ipu_resnet50_fp16_test.cc
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10)
-
-    # Only support Resnet50 and Ernie currently
-    inference_analysis_api_test(ipu_multi_model_profile SRCS ipu_multi_model_profile.cc
-        ARGS --model_name="Resnet50" --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10)
+if(WITH_IPU)
+  #word2vec sample
+  set(WORD2VEC_INSTALL_DIR
+      "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model")
+  inference_analysis_test(
+    ipu_word2vec_sample
+    SRCS
+    ipu_word2vec_sample.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${WORD2VEC_INSTALL_DIR})
+
+  # ERNIE
+  set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie")
+  inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR}
+                              ipu_ernie_test.cc ARGS --warmup=true --repeat=10)
+  inference_analysis_api_test(
+    ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc ARGS
+    --warmup=true --repeat=10)
+
+  # Resnet50
+  set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
+  inference_analysis_test(
+    ipu_resnet50_test
+    SRCS
+    ipu_resnet50_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${RESNET50_MODEL_DIR}
+    --warmup=true
+    --repeat=10)
+  inference_analysis_test(
+    ipu_resnet50_fp16_test
+    SRCS
+    ipu_resnet50_fp16_test.cc
+    EXTRA_DEPS
+    ${INFERENCE_EXTRA_DEPS}
+    ARGS
+    --infer_model=${RESNET50_MODEL_DIR}
+    --warmup=true
+    --repeat=10)
+
+  # Only support Resnet50 and Ernie currently
+  inference_analysis_api_test(
+    ipu_multi_model_profile
+    SRCS
+    ipu_multi_model_profile.cc
+    ARGS
+    --model_name="Resnet50"
+    --infer_model=${RESNET50_MODEL_DIR}
+    --warmup=true
+    --repeat=10)
 endif()
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc
index dcda34c64da5d..ae838955adc02 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
@@ -65,7 +67,7 @@ TEST(PD_Config, gpu_interface) {
                                   &min_shape_ptr, &max_shape_ptr,
                                   &opt_shape_ptr, FALSE);
   PD_ConfigDisableTensorRtOPs(config, 1, &ops_name);
-  PD_ConfigEnableTensorRtOSS(config);
+  PD_ConfigEnableVarseqlen(config);
   bool oss_enabled = PD_ConfigTensorRtOssEnabled(config);
   EXPECT_TRUE(oss_enabled);
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc
index d3a15cb285772..dfcf5fda4763e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc
index 4369cd78dfa37..db5406b8ef6af 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc
@@ -15,8 +15,10 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
index a341ffd7a081c..8b094e8a6cb9b 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc
index f4017fc5a7f34..33685e6a96060 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc
@@ -15,11 +15,13 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc
index 8951c446b1f83..f59b337d6afe5 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc
@@ -15,11 +15,13 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc
index a84c19de25516..347f0e6e2532a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
index c60e0a25f28c0..524d39854debe 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc
index c0c8ff083de57..cf8582ee778e9 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc
index bf0576f9f93b1..b74f51af980db 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc
@@ -15,8 +15,10 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
index a9c24c4503f9f..d0cd55e918e65 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
@@ -15,11 +15,13 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -69,8 +71,9 @@ void PD_run() {
   PD_DeletePaddleTensor(input);
   int size;
   const int* out_shape = PD_GetPaddleTensorShape(out_data, &size);
-  PADDLE_ENFORCE_EQ(size, 2, paddle::platform::errors::InvalidArgument(
-                                 "The Output shape's size is NOT match."));
+  PADDLE_ENFORCE_EQ(size, 2,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Output shape's size is NOT match."));
   std::vector<int> ref_outshape_size({9, 6});
   for (int i = 0; i < 2; ++i) {
     PADDLE_ENFORCE_EQ(out_shape[i], ref_outshape_size[i],
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index 0b2be0076fdb1..4ff3e27f420be 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
index 33a67d8140575..e6a6a8c1037a0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 820bbf0701778..e3bdb98ec522b 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <vector>
+
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
index 384bef8a4b439..c21785f7ce7a3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
index 5333f0052d742..166bdc621c198 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 DEFINE_string(infer_shape, "", "data shape file");
diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
index af0a51e4ddbb4..cf3380d0406d0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 DEFINE_bool(disable_mkldnn_fc, false, "Disable usage of MKL-DNN's FC op");
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
index d11b5f0c218f2..c6d266ceb21eb 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
index 57ab1b00908b1..18990dba3148e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index bd3a1d737afb1..2b69a15e26a8a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -148,8 +148,9 @@ TEST(Analyzer_LAC, profile) {
                           "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
     size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-    PADDLE_ENFORCE_GE(size, batch1_size, paddle::platform::errors::Fatal(
-                                             "The size of batch is invaild."));
+    PADDLE_ENFORCE_GE(
+        size, batch1_size,
+        paddle::platform::errors::Fatal("The size of batch is invaild."));
     int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < batch1_size; ++i) {
       EXPECT_EQ(pdata[i], lac_ref_data[i]);
diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
index 141e60513eb95..7e754ad93bc3d 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
@@ -211,18 +212,15 @@ std::vector<double> Lexical_Test(
       }
     }
     // nums_infer, nums_label, nums_correct
-    auto precision =
-        acc_sum[0]
-            ? static_cast<double>(acc_sum[2]) / static_cast<double>(acc_sum[0])
-            : 0;
-    auto recall =
-        acc_sum[1]
-            ? static_cast<double>(acc_sum[2]) / static_cast<double>(acc_sum[1])
-            : 0;
-    auto f1_score =
-        acc_sum[2]
-            ? static_cast<float>(2 * precision * recall) / (precision + recall)
-            : 0;
+    auto precision = acc_sum[0] ? static_cast<double>(acc_sum[2]) /
+                                      static_cast<double>(acc_sum[0])
+                                : 0;
+    auto recall = acc_sum[1] ? static_cast<double>(acc_sum[2]) /
+                                   static_cast<double>(acc_sum[1])
+                             : 0;
+    auto f1_score = acc_sum[2] ? static_cast<float>(2 * precision * recall) /
+                                     (precision + recall)
+                               : 0;
 
     LOG(INFO) << "Precision:  " << std::fixed << std::setw(6)
               << std::setprecision(5) << precision;
diff --git a/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc
index 4a5ec95934a9a..43fed05db133c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <random>
+
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
-#include <random>
-
 // Here add missing commands
 DEFINE_string(infer_model2, "", "model path");
 DEFINE_string(infer_model3, "", "model path");
@@ -96,8 +96,9 @@ void compare(bool use_mkldnn = false) {
       xx_output.begin(), xx_output.end(), xx2_output.begin(),
       [](const float& l, const float& r) { return fabs(l - r) < 1e-4; });
 
-  PADDLE_ENFORCE_EQ(result, true, paddle::platform::errors::Fatal(
-                                      "Results of model run independently "
+  PADDLE_ENFORCE_EQ(
+      result, true,
+      paddle::platform::errors::Fatal("Results of model run independently "
                                       "differs from results of the same model "
                                       "run as a sequence of models"));
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc
index 2eb75c4dc5369..2c02b87ba2be4 100644
--- a/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc
@@ -16,9 +16,8 @@
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc
index 4bb59f3c8df42..1618ba575a26e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index 978aaf1c6a32d..883d946dff54e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -47,8 +47,9 @@ struct DataRecord {
       num_lines++;
       std::vector<std::string> data;
       split(line, '\t', &data);
-      PADDLE_ENFORCE_GT(data.size(), 4, paddle::platform::errors::Fatal(
-                                            "The size of data is invaild."));
+      PADDLE_ENFORCE_GT(
+          data.size(), 4,
+          paddle::platform::errors::Fatal("The size of data is invaild."));
       // load title1 data
       std::vector<int64_t> title1_data;
       split_to_int64(data[0], ' ', &title1_data);
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
index 8f0778b83e52e..1ef5e81e18a38 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
index 099ff1f31a759..5a78d36276cb9 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
index 1fbcbf1a3f427..30cea4f69bdd0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
index d33b11c389a09..15f4b3a3a5bf0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
index 0ccd95f2a176d..063d29abee9a2 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
index 5d7f7c290f6a2..ef00c0209738e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
index e43456ed8322e..a384c75e0bb45 100644
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
@@ -15,6 +15,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index faa15fc4f0a17..0a43d166e93cf 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc
index 029f2f0421d15..08f26bae37bea 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc
index e1ee1b196e4d3..d8ba615c8ed77 100644
--- a/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc
@@ -16,9 +16,8 @@
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index b952b62f13ed6..6ef3eb95dd222 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <ostream>
 #include <string>
+
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc
index 1d69069da0716..38cf475d3da6f 100644
--- a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc
+++ b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <cmath>
 
 #include "gflags/gflags.h"
diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc
index 5fde8e6a5e1e6..cbfe8229d31a1 100644
--- a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <cmath>
 
 #include "gflags/gflags.h"
diff --git a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc
index d38c5c3416351..a0e36e9779da8 100644
--- a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc
+++ b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc
@@ -31,8 +31,8 @@ limitations under the License. */
 DEFINE_string(infer_model, "", "Directory of the inference model.");
 
 using paddle_infer::Config;
-using paddle_infer::Predictor;
 using paddle_infer::CreatePredictor;
+using paddle_infer::Predictor;
 
 void inference(std::string model_path, bool use_ipu,
                std::vector<float> *out_data) {
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index 9211ea246a5c5..1adbf0ec7a552 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -14,11 +14,12 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <cmath>
 #include <mutex>   // NOLINT
 #include <thread>  // NOLINT
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
index 59bbaa2b78fb0..169d0b9987d79 100644
--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <cmath>
 
 #include "gflags/gflags.h"
diff --git a/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc b/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc
index 4a2527a217f8b..d972945db7d8c 100644
--- a/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc
+++ b/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <iostream>
+
 #include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
@@ -90,9 +91,10 @@ TEST(Mkldnn_quantizer_config, configuration) {
 
   PADDLE_ENFORCE_EQ(
       cfg.mkldnn_quantizer_config()->scale_algo("conv2d", "Input"),
-      conv2d_scale_algo, platform::errors::InvalidArgument(
-                             "Scale algorithm got from config differs with the "
-                             "one set previously."));
+      conv2d_scale_algo,
+      platform::errors::InvalidArgument(
+          "Scale algorithm got from config differs with the "
+          "one set previously."));
 
   PADDLE_ENFORCE_EQ(
       cfg.mkldnn_quantizer_config()->scale_algo("unknown", "unknown"),
diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
index 2be69781c4e60..38bcb7645abb5 100644
--- a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
+
 #include <cstring>
 #include <numeric>
+
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "paddle/fluid/inference/api/paddle_infer_contrib.h"
diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc
index c5a0746c4d760..ab82c82b1e3b3 100644
--- a/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc
@@ -15,7 +15,6 @@
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc
index 88ebd85c79a13..8cbc410eb5ff3 100644
--- a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc
+++ b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc
@@ -15,10 +15,11 @@ limitations under the License. */
 #include <cuda_runtime.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <cstring>
 #include <numeric>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle_infer {
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index f2df018f4978a..d7784a909afd4 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -1081,7 +1081,7 @@ static bool CompareTensor(const framework::LoDTensor &a,
 }
 
 void ConvertFP32toFP16(paddle::PaddleTensor &tensor  // NOLINT
-                       ) {
+) {
   int num = 1;
   for (auto dim : tensor.shape) {
     num *= dim;
@@ -1101,7 +1101,7 @@ void ConvertFP32toFP16(paddle::PaddleTensor &tensor  // NOLINT
 }
 
 void ConvertFP16toFP32(paddle::PaddleTensor &tensor  // NOLINT
-                       ) {
+) {
   int num = 1;
   for (auto dim : tensor.shape) {
     num *= dim;
diff --git a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
index a1f31c3108ba5..ab059496ad8a7 100644
--- a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
index 7e9f71c8b3c0c..b0c4c13dbbc63 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc
index 209dd90c48070..f269432d4da1e 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
index 5ae14576dfeb0..3ca62afba1d05 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
@@ -24,8 +24,8 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <vector>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
index 1058a5b5ec6b8..977c6856f8c08 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
@@ -210,7 +210,11 @@ std::shared_ptr<paddle_infer::Predictor> InitPredictor() {
   config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                 opt_input_shape);
   // erinie varlen must be used with oss
-  config.EnableTensorRtOSS();
+  config.EnableVarseqlen();
+  paddle_infer::experimental::InternalUtils::SetTransformerPosid(&config,
+                                                                 input_name2);
+  paddle_infer::experimental::InternalUtils::SetTransformerMaskid(&config,
+                                                                  input_name3);
 
   return paddle_infer::CreatePredictor(config);
 }
@@ -222,13 +226,78 @@ void run(paddle_infer::Predictor* predictor, std::vector<float>* out_data) {
 
   int32_t i1[run_seq_len] = {
       // sentence 1
-      1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4,
-      134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44,
-      486, 218, 1140, 279, 12043, 2,
+      1,
+      3558,
+      4,
+      75,
+      491,
+      89,
+      340,
+      313,
+      93,
+      4,
+      255,
+      10,
+      75,
+      321,
+      4095,
+      1902,
+      4,
+      134,
+      49,
+      75,
+      311,
+      14,
+      44,
+      178,
+      543,
+      15,
+      12043,
+      2,
+      75,
+      201,
+      340,
+      9,
+      14,
+      44,
+      486,
+      218,
+      1140,
+      279,
+      12043,
+      2,
       // sentence 2
-      101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029,
-      102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 9536, 2075, 1996,
-      2117, 3072, 2234, 2046, 2486, 1012, 102,
+      101,
+      2054,
+      2234,
+      2046,
+      2486,
+      2044,
+      1996,
+      2047,
+      4552,
+      2001,
+      9536,
+      1029,
+      102,
+      2004,
+      1997,
+      2008,
+      2154,
+      1010,
+      1996,
+      2047,
+      4552,
+      9536,
+      2075,
+      1996,
+      2117,
+      3072,
+      2234,
+      2046,
+      2486,
+      1012,
+      102,
   };
   int32_t i2[run_seq_len] = {
       // sentence 1
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
index ccdf237ffa54d..4b22bba2bcc97 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
index 2d7aa72a036fd..a238e62fc7cc0 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc
index c0be194493112..93d4a88383c33 100644
--- a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc b/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc
index ceb8b99774e48..243be1d33193c 100644
--- a/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
index a87bf7b085bd8..bcf8a23b9b922 100644
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc
index ca25967b59a6a..3a884abe88889 100644
--- a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc
@@ -14,9 +14,10 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <numeric>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc b/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc
index 1fa24dddead88..d9e1e3f8c9e8a 100644
--- a/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc
@@ -11,9 +11,10 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <numeric>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
index 2975967e0c0de..cdc6586f1272b 100644
--- a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_resnext_test.cc b/paddle/fluid/inference/tests/api/trt_resnext_test.cc
index b525a1b706858..374074957c870 100644
--- a/paddle/fluid/inference/tests/api/trt_resnext_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_resnext_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_split_converter_test.cc b/paddle/fluid/inference/tests/api/trt_split_converter_test.cc
index c00b36b520bcd..0726db28343bc 100644
--- a/paddle/fluid/inference/tests/api/trt_split_converter_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_split_converter_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/api/trt_test_helper.h b/paddle/fluid/inference/tests/api/trt_test_helper.h
index aaa285b2fc2c9..cadf996e071d8 100644
--- a/paddle/fluid/inference/tests/api/trt_test_helper.h
+++ b/paddle/fluid/inference/tests/api/trt_test_helper.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <dirent.h>
+
 #include <string>
 #include <vector>
 
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
index ad7ef0c04ce67..5aef30bf335c3 100644
--- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
@@ -1,32 +1,34 @@
 cmake_minimum_required(VERSION 3.0)
 project(cpp_inference_demo CXX C)
-option(WITH_MKL        "Compile demo with MKL/OpenBlas support, default use MKL."       ON)
-option(WITH_GPU        "Compile demo with GPU/CPU, default use CPU."                    OFF)
-option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   OFF)
-option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
-option(WITH_GTEST "Compile demo with GTEST"   OFF)
-option(WITH_ONNXRUNTIME       "Compile demo with ONNXRuntime"       OFF)
+option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
+option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
+option(WITH_STATIC_LIB
+       "Compile demo with static/shared library, default use static." OFF)
+option(USE_TENSORRT "Compile demo with TensorRT." OFF)
+option(WITH_GTEST "Compile demo with GTEST" OFF)
+option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF)
 
 if(NOT WITH_STATIC_LIB)
   add_definitions("-DPADDLE_WITH_SHARED_LIB")
 else()
-  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. 
+  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode.
   # Set it to empty in static library mode to avoid compilation issues.
   add_definitions("/DPD_INFER_DECL=")
 endif()
 
 macro(safe_set_static_flag)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
+  foreach(flag_var
+          CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+          CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    if(${flag_var} MATCHES "/MD")
+      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+    endif(${flag_var} MATCHES "/MD")
+  endforeach(flag_var)
 endmacro()
 
 if(NOT DEFINED PADDLE_LIB)
-  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+  message(
+    FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
 endif()
 if(NOT DEFINED DEMO_NAME)
   message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
@@ -46,7 +48,7 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
 link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
-if (WITH_ONNXRUNTIME)
+if(WITH_ONNXRUNTIME)
   include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include")
   include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include")
 
@@ -54,21 +56,25 @@ if (WITH_ONNXRUNTIME)
   link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib")
 endif()
 
-if (WIN32)
+if(WIN32)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
   option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
-  if (MSVC_STATIC_CRT)
-    if (WITH_MKL)
+  if(MSVC_STATIC_CRT)
+    if(WITH_MKL)
       set(FLAG_OPENMP "/openmp")
     endif()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4244 /wd4530")
-    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
-    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    set(CMAKE_C_FLAGS_DEBUG
+        "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_C_FLAGS_RELEASE
+        "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4530")
-    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
-    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_DEBUG
+        "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_RELEASE
+        "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
     safe_set_static_flag()
-    if (WITH_STATIC_LIB)
+    if(WITH_STATIC_LIB)
       add_definitions(-DSTATIC_LIB)
     endif()
   endif()
@@ -81,60 +87,75 @@ endif()
 
 if(WITH_GPU)
   if(NOT WIN32)
-    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+    set(CUDA_LIB
+        "/usr/local/cuda/lib64/"
+        CACHE STRING "CUDA Library")
   else()
-    set(CUDA_LIB "" CACHE STRING "CUDA_LIB")
+    set(CUDA_LIB
+        ""
+        CACHE STRING "CUDA_LIB")
     if("${CUDA_LIB}" STREQUAL "")
       if(DEFINED ENV{CUDA_PATH})
         set(CUDA_LIB "$ENV{CUDA_PATH}\\lib\\x64")
       else()
-        set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64")
+        set(CUDA_LIB
+            "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64"
+        )
       endif()
     endif()
     message(STATUS "Current CUDA lib path: ${CUDA_LIB}")
   endif(NOT WIN32)
 endif()
 
-if (USE_TENSORRT AND WITH_GPU)
-  set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library")
+if(USE_TENSORRT AND WITH_GPU)
+  set(TENSORRT_ROOT
+      ""
+      CACHE STRING "The root directory of TensorRT library")
   if("${TENSORRT_ROOT}" STREQUAL "")
-      message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ")
+    message(
+      FATAL_ERROR
+        "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH "
+    )
   endif()
   set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include)
   set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib)
   file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
-  string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
-    "${TENSORRT_VERSION_FILE_CONTENTS}")
-  string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION
-    "${TENSORRT_VERSION_FILE_CONTENTS}")
-  string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION
-    "${TENSORRT_VERSION_FILE_CONTENTS}")
-  string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION
-    "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)"
+               TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)"
+               TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)"
+               TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+  string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)"
+               TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
   if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
-    file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS)
-    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
-      "${TENSORRT_VERSION_FILE_CONTENTS}")
-    string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION
-      "${TENSORRT_VERSION_FILE_CONTENTS}")
-    string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION
-      "${TENSORRT_VERSION_FILE_CONTENTS}")
-    string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION
-      "${TENSORRT_VERSION_FILE_CONTENTS}")
+    file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h
+         TENSORRT_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)"
+                 TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)"
+                 TENSORRT_MINOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)"
+                 TENSORRT_PATCH_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)"
+                 TENSORRT_BUILD_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}")
   endif()
   if("${TENSORRT_MAJOR_VERSION}" STREQUAL "")
     message(SEND_ERROR "Failed to detect TensorRT version.")
   endif()
   string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
-    TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+                       TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
   string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1"
-    TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}")
+                       TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}")
   string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1"
-    TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}")
+                       TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}")
   string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1"
-    TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}")
-  message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
-    "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} ")
+                       TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}")
+  message(
+    STATUS
+      "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+      "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} "
+  )
   include_directories("${TENSORRT_INCLUDE_DIR}")
   link_directories("${TENSORRT_LIB_DIR}")
   add_compile_definitions(NV_TENSORRT_MAJOR=${TENSORRT_MAJOR_VERSION})
@@ -150,8 +171,9 @@ if(WITH_MKL)
     set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
                  ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
   else()
-    set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
-                 ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+    set(MATH_LIB
+        ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
   endif()
   set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn")
   if(EXISTS ${MKLDNN_PATH})
@@ -166,63 +188,97 @@ else()
   set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas")
   include_directories("${OPENBLAS_LIB_PATH}/include/openblas")
   if(WIN32)
-    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(MATH_LIB
+        ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
   else()
-    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(MATH_LIB
+        ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
   endif()
 endif()
 
 if(WITH_STATIC_LIB)
-  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(DEPS
+      ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}
+  )
 else()
   if(WIN32)
-    set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS
+        ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
   else()
-    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
+    set(DEPS
+        ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}
+    )
   endif()
 endif()
 
-if (WITH_ONNXRUNTIME)
+if(WITH_ONNXRUNTIME)
   if(WIN32)
-    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx)
+    set(DEPS
+        ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib
+        paddle2onnx)
   elseif(APPLE)
-    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx)
+    set(DEPS
+        ${DEPS}
+        ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib
+        paddle2onnx)
   else()
-    set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx)
+    set(DEPS
+        ${DEPS}
+        ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0
+        paddle2onnx)
   endif()
 endif()
 
-if (NOT WIN32)
+if(NOT WIN32)
   set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-  set(DEPS ${DEPS}
-      ${MATH_LIB} ${MKLDNN_LIB}
-      glog gflags protobuf xxhash cryptopp
+  set(DEPS
+      ${DEPS}
+      ${MATH_LIB}
+      ${MKLDNN_LIB}
+      glog
+      gflags
+      protobuf
+      xxhash
+      cryptopp
       ${EXTERNAL_LIB})
 else()
-  set(DEPS ${DEPS}
-      ${MATH_LIB} ${MKLDNN_LIB}
-      glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB})
+  set(DEPS
+      ${DEPS}
+      ${MATH_LIB}
+      ${MKLDNN_LIB}
+      glog
+      gflags_static
+      libprotobuf
+      xxhash
+      cryptopp-static
+      ${EXTERNAL_LIB})
   set(DEPS ${DEPS} shlwapi.lib)
 endif(NOT WIN32)
 
 if(WITH_GPU)
   if(NOT WIN32)
-    if (USE_TENSORRT)
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
+    if(USE_TENSORRT)
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
+      set(DEPS
+          ${DEPS}
+          ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
     endif()
     set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
   else()
     if(USE_TENSORRT)
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
-      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS}
+               ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
       if(${TENSORRT_MAJOR_VERSION} EQUAL 7)
-        set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX})
+        set(DEPS ${DEPS}
+                 ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX})
       endif()
     endif()
-    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
-    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
-    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
   endif()
 endif()
 
@@ -237,11 +293,14 @@ if(WITH_GTEST)
   include(GNUInstallDirs)
   include_directories(${GTEST_INSTALL_DIR}/include)
   add_dependencies(${DEMO_NAME} thirdparty_gtest)
-  IF(WIN32)
+  if(WIN32)
     target_link_libraries(${DEMO_NAME} ${GTEST_LIBRARIES})
-  ELSE()
-    target_link_libraries(${DEMO_NAME} ${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest${CMAKE_STATIC_LIBRARY_SUFFIX})
-  ENDIF(WIN32)
+  else()
+    target_link_libraries(
+      ${DEMO_NAME}
+      ${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest${CMAKE_STATIC_LIBRARY_SUFFIX}
+    )
+  endif(WIN32)
 endif()
 if(WIN32)
   if("${CMAKE_GENERATOR}" MATCHES "Ninja")
@@ -251,41 +310,62 @@ if(WIN32)
   endif()
 
   if(USE_TENSORRT)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${LIB_PATH}
-            COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
-              ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} ${LIB_PATH}
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
+        ${LIB_PATH})
     if(${TENSORRT_MAJOR_VERSION} EQUAL 7)
-      add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-              COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX}
-                ${LIB_PATH})
+      add_custom_command(
+        TARGET ${DEMO_NAME}
+        POST_BUILD
+        COMMAND
+          ${CMAKE_COMMAND} -E copy
+          ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX}
+          ${LIB_PATH})
     endif()
   endif()
   if(WITH_MKL)
     message("LIB_PATH IS ${LIB_PATH}")
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-          COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${LIB_PATH}
-          COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${LIB_PATH}
-          COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll  ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll
+              ${LIB_PATH}
+      COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll
+              ${LIB_PATH}
+      COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll
+              ${LIB_PATH})
   else()
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-          COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll
+              ${LIB_PATH})
   endif()
   if(WITH_ONNXRUNTIME)
-    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
-      ${LIB_PATH}
-    COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
-      ${LIB_PATH}
-    )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll
+        ${LIB_PATH}
+      COMMAND
+        ${CMAKE_COMMAND} -E copy
+        ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll
+        ${LIB_PATH})
   endif()
   if(NOT WITH_STATIC_LIB)
-      add_custom_command(TARGET ${DEMO_NAME} POST_BUILD 
-        COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH}
-      )
+    add_custom_command(
+      TARGET ${DEMO_NAME}
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy
+              "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${LIB_PATH})
   endif()
 endif()
diff --git a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake
index b38984314ec85..49b0a04197d12 100644
--- a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake
+++ b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake
@@ -1,43 +1,50 @@
 find_package(Git REQUIRED)
 message("${CMAKE_BUILD_TYPE}")
-SET(GTEST_PREFIX_DIR    ${CMAKE_CURRENT_BINARY_DIR}/gtest)
-SET(GTEST_SOURCE_DIR    ${CMAKE_CURRENT_BINARY_DIR}/gtest/src/extern_gtest)
-SET(GTEST_INSTALL_DIR   ${CMAKE_CURRENT_BINARY_DIR}/install/gtest)
-SET(GTEST_INCLUDE_DIR   "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
-set(GTEST_REPOSITORY     https://github.com/google/googletest.git)
-set(GTEST_TAG            release-1.8.1)
-INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
-IF(WIN32)
-    # if use CMAKE_INSTALL_LIBDIR, the path of lib actually is install/gtest/lib/gtest.lib but GTEST_LIBRARIES 
-    # is install/gtest/gtest.lib
-    set(GTEST_LIBRARIES
-        "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
-    set(GTEST_MAIN_LIBRARIES
-        "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
-ELSE()
-    set(GTEST_LIBRARIES
-        "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
-    set(GTEST_MAIN_LIBRARIES
-        "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
-ENDIF(WIN32)
+set(GTEST_PREFIX_DIR ${CMAKE_CURRENT_BINARY_DIR}/gtest)
+set(GTEST_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gtest/src/extern_gtest)
+set(GTEST_INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/install/gtest)
+set(GTEST_INCLUDE_DIR
+    "${GTEST_INSTALL_DIR}/include"
+    CACHE PATH "gtest include directory." FORCE)
+set(GTEST_REPOSITORY https://github.com/google/googletest.git)
+set(GTEST_TAG release-1.8.1)
+include_directories(${GTEST_INCLUDE_DIR})
+if(WIN32)
+  # On Windows the library is actually installed at install/gtest/lib/gtest.lib, but with
+  # CMAKE_INSTALL_LIBDIR GTEST_LIBRARIES would be install/gtest/gtest.lib, so use lib/ directly.
+  set(GTEST_LIBRARIES
+      "${GTEST_INSTALL_DIR}/lib/gtest.lib"
+      CACHE FILEPATH "gtest libraries." FORCE)
+  set(GTEST_MAIN_LIBRARIES
+      "${GTEST_INSTALL_DIR}/lib/gtest_main.lib"
+      CACHE FILEPATH "gtest main libraries." FORCE)
+else()
+  set(GTEST_LIBRARIES
+      "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a"
+      CACHE FILEPATH "gtest libraries." FORCE)
+  set(GTEST_MAIN_LIBRARIES
+      "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a"
+      CACHE FILEPATH "gtest main libraries." FORCE)
+endif(WIN32)
 ExternalProject_Add(
-    extern_gtest
-    PREFIX gtest
-    GIT_REPOSITORY ${GTEST_REPOSITORY}
-    GIT_TAG ${GTEST_TAG}
-    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
-    UPDATE_COMMAND  ""
-    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-               -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-               -DCMAKE_BUILD_TYPE:STRING=Release
-    BUILD_BYPRODUCTS ${GTEST_LIBRARIES}
-    BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES}
-)
+  extern_gtest
+  PREFIX gtest
+  GIT_REPOSITORY ${GTEST_REPOSITORY}
+  GIT_TAG ${GTEST_TAG}
+  DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+             -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+             -DCMAKE_BUILD_TYPE:STRING=Release
+  BUILD_BYPRODUCTS ${GTEST_LIBRARIES}
+  BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES})
 
-ADD_LIBRARY(thirdparty_gtest STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET thirdparty_gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
-ADD_DEPENDENCIES(thirdparty_gtest extern_gtest)
+add_library(thirdparty_gtest STATIC IMPORTED GLOBAL)
+set_property(TARGET thirdparty_gtest PROPERTY IMPORTED_LOCATION
+                                              ${GTEST_LIBRARIES})
+add_dependencies(thirdparty_gtest extern_gtest)
 
-ADD_LIBRARY(thirdparty_gtest_main STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET thirdparty_gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
-ADD_DEPENDENCIES(thirdparty_gtest_main extern_gtest)
+add_library(thirdparty_gtest_main STATIC IMPORTED GLOBAL)
+set_property(TARGET thirdparty_gtest_main PROPERTY IMPORTED_LOCATION
+                                                   ${GTEST_MAIN_LIBRARIES})
+add_dependencies(thirdparty_gtest_main extern_gtest)
diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc
index 4e924e3197965..53edc554ebaf8 100644
--- a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc
+++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc
@@ -68,7 +68,7 @@ std::shared_ptr<Predictor> InitPredictor() {
   config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                 opt_input_shape);
   // erinie varlen must be used with oss
-  config.EnableTensorRtOSS();
+  config.EnableVarseqlen();
 
   return CreatePredictor(config);
 }
diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h
index a5c8c52402180..8737afa809933 100644
--- a/paddle/fluid/inference/tests/infer_ut/test_suite.h
+++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <math.h>
+
 #include <algorithm>
 #include <deque>
 #include <fstream>
@@ -26,7 +27,6 @@
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/include/paddle_inference_api.h"
 
 namespace paddle {
@@ -64,7 +64,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor,
                             int repeat_times = 2) {
   // prepare input tensor
   auto input_names = predictor->GetInputNames();
-  for (const auto & [ key, value ] : *input_data_map) {
+  for (const auto &[key, value] : *input_data_map) {
     switch (value.type) {
       case paddle::PaddleDType::INT64: {
         std::vector<int64_t> input_value =
@@ -150,7 +150,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor,
 void CompareRecord(std::map<std::string, Record> *truth_output_data,
                    std::map<std::string, Record> *infer_output_data,
                    float epislon = 1e-5) {
-  for (const auto & [ key, value ] : *infer_output_data) {
+  for (const auto &[key, value] : *infer_output_data) {
     auto truth_record = (*truth_output_data)[key];
     VLOG(1) << "output name: " << key;
     size_t numel = value.data.size() / sizeof(float);
@@ -190,7 +190,7 @@ double SingleThreadProfile(paddle_infer::Predictor *predictor,
                            int repeat_times = 2) {
   // prepare input tensor
   auto input_names = predictor->GetInputNames();
-  for (const auto & [ key, value ] : *input_data_map) {
+  for (const auto &[key, value] : *input_data_map) {
     switch (value.type) {
       case paddle::PaddleDType::INT64: {
         std::vector<int64_t> input_value =
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 6b6c0cd22f03b..d4b3ebdaa0b7f 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -1,26 +1,33 @@
 include(ExternalProject)
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
-set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
-    "A path setting inference demo download directories.")
-set(CPU_NUM_THREADS_ON_CI 4 CACHE STRING "Run multi-threads on CI to reduce CI time.")
-set(WARMUP_BATCH_SIZE 100 CACHE STRING "Default warmup_batch_size.")
+set(INFERENCE_URL
+    "http://paddle-inference-dist.bj.bcebos.com"
+    CACHE STRING "inference download url")
+set(INFERENCE_DEMO_INSTALL_DIR
+    "${THIRD_PARTY_PATH}/inference_demo"
+    CACHE STRING "A path setting inference demo download directories.")
+set(CPU_NUM_THREADS_ON_CI
+    4
+    CACHE STRING "Run multi-threads on CI to reduce CI time.")
+set(WARMUP_BATCH_SIZE
+    100
+    CACHE STRING "Default warmup_batch_size.")
 
 function(inference_download INSTALL_DIR URL FILENAME)
   message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
   string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
   ExternalProject_Add(
-      extern_inference_download_${FILENAME_EX}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX                ${INSTALL_DIR}
-      URL                   ${URL}/${FILENAME}
-      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
-      DOWNLOAD_DIR          ${INSTALL_DIR}
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ""
-  )
+    extern_inference_download_${FILENAME_EX}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${INSTALL_DIR}
+    URL ${URL}/${FILENAME}
+    DOWNLOAD_COMMAND wget --no-check-certificate -q -O
+                     ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
+    DOWNLOAD_DIR ${INSTALL_DIR}
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND "")
 endfunction()
 
 function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM)
@@ -30,93 +37,101 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM)
   set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}")
   set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
   ExternalProject_Add(
-      ${EXTERNAL_PROJECT_NAME}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX                ${INSTALL_DIR}
-      URL                   ${URL}/${FILENAME}
-      URL_HASH              MD5=${CHECK_SUM}
-      DOWNLOAD_DIR          ${INSTALL_DIR}
-      DOWNLOAD_NO_EXTRACT   1
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
-                            ${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME}
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ""
-  )
+    ${EXTERNAL_PROJECT_NAME}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${INSTALL_DIR}
+    URL ${URL}/${FILENAME}
+    URL_HASH MD5=${CHECK_SUM}
+    DOWNLOAD_DIR ${INSTALL_DIR}
+    DOWNLOAD_NO_EXTRACT 1
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} ${CMAKE_COMMAND} -E
+                  tar xzf ${DOWNLOAD_NAME}
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND "")
 endfunction()
 
-function(inference_download_and_uncompress_without_verify INSTALL_DIR URL FILENAME)
+function(inference_download_and_uncompress_without_verify INSTALL_DIR URL
+         FILENAME)
   message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
   string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME})
   string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME})
   set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}")
   set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
   ExternalProject_Add(
-      ${EXTERNAL_PROJECT_NAME}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX                ${INSTALL_DIR}
-      URL                   ${URL}/${FILENAME}
-      DOWNLOAD_DIR          ${INSTALL_DIR}
-      DOWNLOAD_NO_EXTRACT   1
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
-                            ${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME}
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ""
-  )
+    ${EXTERNAL_PROJECT_NAME}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${INSTALL_DIR}
+    URL ${URL}/${FILENAME}
+    DOWNLOAD_DIR ${INSTALL_DIR}
+    DOWNLOAD_NO_EXTRACT 1
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} ${CMAKE_COMMAND} -E
+                  tar xzf ${DOWNLOAD_NAME}
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND "")
 endfunction()
 
 set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
 if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz)
-  inference_download_and_uncompress_without_verify(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
+  inference_download_and_uncompress_without_verify(
+    ${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
 endif()
 set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
 
-set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet")
-if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz)
-  inference_download_and_uncompress_without_verify(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz")
+set(IMG_CLS_RESNET_INSTALL_DIR
+    "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet")
+if(NOT EXISTS
+   ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz
+)
+  inference_download_and_uncompress_without_verify(
+    ${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL}
+    "image_classification_resnet.inference.model.tgz")
 endif()
-set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model")
+set(IMG_CLS_RESNET_MODEL_DIR
+    "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model")
 
 if(WITH_ONNXRUNTIME)
   set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2")
   if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz)
-    inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz")
+    inference_download_and_uncompress_without_verify(
+      ${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL}
+      "MobileNetV2.inference.model.tar.gz")
   endif()
   set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2")
 endif()
 
-function (inference_base_test_build TARGET)
-   set(options "")
-   set(oneValueArgs "")
-   set(multiValueArgs SRCS DEPS)
-   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-   cc_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS})
+function(inference_base_test_build TARGET)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(base_test "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  cc_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS})
 endfunction()
 
-function (inference_base_test_run TARGET)
-   set(options "")
-   set(oneValueArgs "")
-   set(multiValueArgs COMMAND ARGS)
-   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-   if(WITH_GPU)
-       set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
-   endif()
-   cc_test_run(${TARGET} COMMAND ${base_test_COMMAND} ARGS ${mem_opt} ${base_test_ARGS})
+function(inference_base_test_run TARGET)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs COMMAND ARGS)
+  cmake_parse_arguments(base_test "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  if(WITH_GPU)
+    set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
+  endif()
+  cc_test_run(${TARGET} COMMAND ${base_test_COMMAND} ARGS ${mem_opt}
+              ${base_test_ARGS})
 endfunction()
 
-function (inference_base_test TARGET)
-   set(options "")
-   set(oneValueArgs "")
-   set(multiValueArgs SRCS ARGS DEPS)
-   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-   inference_base_test_build(${TARGET}
-	   SRCS ${base_test_SRCS}
-	   DEPS ${base_test_DEPS})
-   inference_base_test_run(${TARGET}
-	   COMMAND ${TARGET}
-	   ARGS ${base_test_ARGS})
+function(inference_base_test TARGET)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS ARGS DEPS)
+  cmake_parse_arguments(base_test "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  inference_base_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS
+                            ${base_test_DEPS})
+  inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS ${base_test_ARGS})
 endfunction()
-
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
index 9a495194a8ac1..a32a61842a5ec 100644
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -1,8 +1,23 @@
-cc_library(benchmark SRCS benchmark.cc DEPS enforce)
-cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor shape_range_info_proto)
-cc_test(infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils)
+cc_library(
+  benchmark
+  SRCS benchmark.cc
+  DEPS enforce)
+cc_test(
+  test_benchmark
+  SRCS benchmark_tester.cc
+  DEPS benchmark)
+cc_library(
+  infer_io_utils
+  SRCS io_utils.cc
+  DEPS paddle_inference_api lod_tensor shape_range_info_proto)
+cc_test(
+  infer_io_utils_tester
+  SRCS io_utils_tester.cc
+  DEPS infer_io_utils)
 cc_library(table_printer SRCS table_printer.cc)
-cc_test(test_table_printer SRCS table_printer_tester.cc DEPS table_printer)
+cc_test(
+  test_table_printer
+  SRCS table_printer_tester.cc
+  DEPS table_printer)
 
 proto_library(shape_range_info_proto SRCS shape_range_info.proto)
diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc
index 0c48c2db9b691..8f7614cb10a44 100644
--- a/paddle/fluid/inference/utils/benchmark_tester.cc
+++ b/paddle/fluid/inference/utils/benchmark_tester.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/utils/benchmark.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/inference/utils/benchmark.h"
+
 using namespace paddle::inference;  // NOLINT
 TEST(Benchmark, basic) {
   Benchmark benchmark;
diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc
index 87331e1978f95..425c67d2fd240 100644
--- a/paddle/fluid/inference/utils/io_utils.cc
+++ b/paddle/fluid/inference/utils/io_utils.cc
@@ -158,8 +158,9 @@ void SerializePDTensorsToFile(const std::string &path,
 void DeserializePDTensorsToFile(const std::string &path,
                                 std::vector<PaddleTensor> *tensors) {
   bool is_present = analysis::FileExists(path);
-  PADDLE_ENFORCE_EQ(is_present, true, platform::errors::InvalidArgument(
-                                          "Cannot open %s to read", path));
+  PADDLE_ENFORCE_EQ(
+      is_present, true,
+      platform::errors::InvalidArgument("Cannot open %s to read", path));
   std::ifstream fin(path, std::ios::binary);
   DeserializePDTensorsToStream(fin, tensors);
   fin.close();
diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/paddle/fluid/inference/utils/io_utils_tester.cc
index ffd97232652fd..e8ebb72acc322 100644
--- a/paddle/fluid/inference/utils/io_utils_tester.cc
+++ b/paddle/fluid/inference/utils/io_utils_tester.cc
@@ -12,11 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/utils/io_utils.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <utility>
+
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/utils/io_utils.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
index 6828924c300fd..5fccd3458a1d0 100644
--- a/paddle/fluid/inference/utils/singleton.h
+++ b/paddle/fluid/inference/utils/singleton.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc
index f56d2527d730c..fc482807b2854 100644
--- a/paddle/fluid/inference/utils/table_printer_tester.cc
+++ b/paddle/fluid/inference/utils/table_printer_tester.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/utils/table_printer.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/inference/utils/table_printer.h"
+
 namespace paddle {
 namespace inference {}  // namespace inference
 }  // namespace paddle
@@ -43,7 +44,7 @@ TEST(table_printer, output) {
   table.InsertRow({"trt_precision", "fp32"});
   table.InsertRow({"enable_dynamic_shape", "true"});
   table.InsertRow({"DisableTensorRtOPs", "{}"});
-  table.InsertRow({"EnableTensorRtOSS", "ON"});
+  table.InsertRow({"EnableVarseqlen", "ON"});
   table.InsertRow({"tensorrt_dla_enabled", "ON"});
   table.InsetDivider();
 
diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
index 53e7993945586..1f72482eef777 100644
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -1,57 +1,89 @@
 add_subdirectory(detail)
 add_subdirectory(allocation)
 
-if (WITH_MKLDNN)
-    set(MKLDNN_CTX_DEPS mkldnn)
-else ()
-    set(MKLDNN_CTX_DEPS)
+if(WITH_MKLDNN)
+  set(MKLDNN_CTX_DEPS mkldnn)
+else()
+  set(MKLDNN_CTX_DEPS)
 endif()
 
-cc_library(malloc SRCS malloc.cc DEPS
-    place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
-cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
-cc_library(stats SRCS stats.cc DEPS enforce)
+cc_library(
+  malloc
+  SRCS malloc.cc
+  DEPS place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
+cc_library(
+  memcpy
+  SRCS memcpy.cc
+  DEPS place device_context)
+cc_library(
+  stats
+  SRCS stats.cc
+  DEPS enforce)
 cc_library(memory DEPS malloc memcpy stats)
 
-cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory)
-cc_test(stats_test SRCS stats_test.cc DEPS stats)
-
-if (WITH_GPU)
-    nv_test(malloc_test
-            SRCS malloc_test.cu
-            DEPS device_context malloc)
-    nv_test(stream_safe_cuda_alloc_test
-            SRCS stream_safe_cuda_alloc_test.cu
-            DEPS malloc cuda_graph_with_memory_pool)
-    nv_test(cuda_managed_memory_test
-            SRCS cuda_managed_memory_test.cu
-            DEPS malloc gpu_info place)
-    
-    if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
-        set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES 
-                             ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth")
-    endif()
+cc_test(
+  memory_stats_test
+  SRCS memory_stats_test.cc
+  DEPS memory)
+cc_test(
+  stats_test
+  SRCS stats_test.cc
+  DEPS stats)
+
+if(WITH_GPU)
+  nv_test(
+    malloc_test
+    SRCS malloc_test.cu
+    DEPS device_context malloc)
+  nv_test(
+    stream_safe_cuda_alloc_test
+    SRCS stream_safe_cuda_alloc_test.cu
+    DEPS malloc cuda_graph_with_memory_pool)
+  nv_test(
+    cuda_managed_memory_test
+    SRCS cuda_managed_memory_test.cu
+    DEPS malloc gpu_info place)
+
+  if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
+    set_tests_properties(
+      stream_safe_cuda_alloc_test
+      PROPERTIES
+        ENVIRONMENT
+        "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth"
+    )
+  endif()
 endif()
 
-if (WITH_ROCM)
-    hip_test(malloc_test
-            SRCS malloc_test.cu
-            DEPS device_context malloc)
-    hip_test(cuda_managed_memory_test
-            SRCS cuda_managed_memory_test.cu
-            DEPS malloc gpu_info place)
+if(WITH_ROCM)
+  hip_test(
+    malloc_test
+    SRCS malloc_test.cu
+    DEPS device_context malloc)
+  hip_test(
+    cuda_managed_memory_test
+    SRCS cuda_managed_memory_test.cu
+    DEPS malloc gpu_info place)
 endif()
 
 if(WITH_TESTING AND TEST cuda_managed_memory_test)
-set_tests_properties(cuda_managed_memory_test PROPERTIES
-                     ENVIRONMENT "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
-                     TIMEOUT 50)
+  set_tests_properties(
+    cuda_managed_memory_test
+    PROPERTIES
+      ENVIRONMENT
+      "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
+      TIMEOUT 50)
 endif()
 
-if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")       
-  nv_test(get_base_ptr_test SRCS get_base_ptr_test.cu DEPS malloc gpu_info)
-  set_tests_properties(get_base_ptr_test PROPERTIES 
-                       ENVIRONMENT "FLAGS_allocator_strategy=auto_growth;
+if(WITH_GPU
+   AND WITH_TESTING
+   AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+  nv_test(
+    get_base_ptr_test
+    SRCS get_base_ptr_test.cu
+    DEPS malloc gpu_info)
+  set_tests_properties(
+    get_base_ptr_test
+    PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth;
                                     FLAGS_use_stream_safe_cuda_allocator=true;")
 endif()
 
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 5af13f76b36bd..109afd06f4df1 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -1,137 +1,264 @@
-cc_library(allocator SRCS allocator.cc DEPS place stats)
-cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
-cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
-cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler)
-cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator)
-cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator)
-
-if (WITH_MKLDNN)
+cc_library(
+  allocator
+  SRCS allocator.cc
+  DEPS place stats)
+cc_library(
+  cpu_allocator
+  SRCS cpu_allocator.cc
+  DEPS allocator)
+cc_library(
+  locked_allocator
+  SRCS locked_allocator.cc
+  DEPS allocator)
+cc_library(
+  buffered_allocator
+  SRCS buffered_allocator.cc
+  DEPS allocator)
+cc_library(
+  best_fit_allocator
+  SRCS best_fit_allocator.cc
+  DEPS allocator)
+cc_library(
+  naive_best_fit_allocator
+  SRCS naive_best_fit_allocator.cc
+  DEPS allocator buddy_allocator profiler)
+cc_test(
+  naive_best_fit_allocator_test
+  SRCS naive_best_fit_allocator_test.cc
+  DEPS naive_best_fit_allocator)
+cc_test(
+  buffered_allocator_test
+  SRCS buffered_allocator_test.cc
+  DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator)
+
+if(WITH_MKLDNN)
   set(MKLDNN_CTX_DEPS mkldnn)
-else ()
+else()
   set(MKLDNN_CTX_DEPS)
 endif()
 
-if (WITH_GPU)
-  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
-  nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
-  nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
-  nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
+if(WITH_GPU)
+  nv_library(
+    cuda_allocator
+    SRCS cuda_allocator.cc
+    DEPS allocator cuda_device_guard stats)
+  nv_library(
+    cuda_managed_allocator
+    SRCS cuda_managed_allocator.cc
+    DEPS allocator cuda_device_guard gpu_info)
+  nv_library(
+    pinned_allocator
+    SRCS pinned_allocator.cc
+    DEPS allocator)
+  nv_library(
+    stream_safe_cuda_allocator
+    SRCS stream_safe_cuda_allocator.cc
+    DEPS allocator cuda_graph)
+  nv_library(
+    thread_local_allocator
+    SRCS thread_local_allocator.cc
+    DEPS allocator)
 
-  cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
+  cc_test(
+    thread_local_allocator_test
+    SRCS thread_local_allocator_test.cc
+    DEPS thread_local_allocator)
   if(CUDA_VERSION GREATER_EQUAL 10.2)
-    nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda)
+    nv_library(
+      cuda_virtual_mem_allocator
+      SRCS cuda_virtual_mem_allocator.cc
+      DEPS dynload_cuda)
   endif()
 endif()
 
-if (WITH_ROCM)
-  hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
-  hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
-  hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-  hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
-  hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
-  
-  cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
+if(WITH_ROCM)
+  hip_library(
+    cuda_allocator
+    SRCS cuda_allocator.cc
+    DEPS allocator cuda_device_guard stats)
+  hip_library(
+    cuda_managed_allocator
+    SRCS cuda_managed_allocator.cc
+    DEPS allocator cuda_device_guard gpu_info)
+  hip_library(
+    pinned_allocator
+    SRCS pinned_allocator.cc
+    DEPS allocator)
+  hip_library(
+    stream_safe_cuda_allocator
+    SRCS stream_safe_cuda_allocator.cc
+    DEPS allocator)
+  hip_library(
+    thread_local_allocator
+    SRCS thread_local_allocator.cc
+    DEPS allocator)
+
+  cc_test(
+    thread_local_allocator_test
+    SRCS thread_local_allocator_test.cc
+    DEPS thread_local_allocator)
 endif()
 
-if (WITH_ASCEND_CL)
-  cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
-  cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info)
+if(WITH_ASCEND_CL)
+  cc_library(
+    npu_allocator
+    SRCS npu_allocator.cc
+    DEPS allocator npu_info)
+  cc_library(
+    npu_pinned_allocator
+    SRCS npu_pinned_allocator.cc
+    DEPS allocator npu_info)
 endif()
 
-cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
+cc_library(
+  retry_allocator
+  SRCS retry_allocator.cc
+  DEPS allocator)
 
-if (WITH_GPU OR WITH_ROCM)
-    set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
-    if(CUDA_VERSION GREATER_EQUAL 10.2)
-      list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
-    endif()
+if(WITH_GPU OR WITH_ROCM)
+  set(AllocatorFacadeDeps
+      gpu_info
+      cuda_allocator
+      cuda_managed_allocator
+      pinned_allocator
+      cuda_device_guard
+      thread_local_allocator
+      stream_safe_cuda_allocator
+      device_context)
+  if(CUDA_VERSION GREATER_EQUAL 10.2)
+    list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
+  endif()
 elseif(WITH_XPU)
-    set(AllocatorFacadeDeps xpu_info)
+  set(AllocatorFacadeDeps xpu_info)
 elseif(WITH_IPU)
-    set(AllocatorFacadeDeps ipu_info)
+  set(AllocatorFacadeDeps ipu_info)
 elseif(WITH_ASCEND)
-    set(AllocatorFacadeDeps ascend_npu_info)
-else ()
-    set(AllocatorFacadeDeps)
+  set(AllocatorFacadeDeps ascend_npu_info)
+else()
+  set(AllocatorFacadeDeps)
 endif()
 
-if (WITH_CUSTOM_DEVICE)
-  cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager)
+if(WITH_CUSTOM_DEVICE)
+  cc_library(
+    custom_allocator
+    SRCS custom_allocator.cc
+    DEPS allocator device_manager)
   set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator)
 endif()
 
-if (WITH_GPU)
-    nv_test(best_fit_allocator_test
-            SRCS best_fit_allocator_test.cc
-                best_fit_allocator_test.cu
-            DEPS best_fit_allocator
-                locked_allocator
-                cpu_allocator
-                cuda_allocator
-                device_context
-                memcpy)
-elseif (WITH_ROCM)
-    hip_test(best_fit_allocator_test
-            SRCS best_fit_allocator_test.cc
-                best_fit_allocator_test.cu
-            DEPS best_fit_allocator
-                locked_allocator
-                cpu_allocator
-                cuda_allocator
-                device_context
-                memcpy)
+if(WITH_GPU)
+  nv_test(
+    best_fit_allocator_test
+    SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu
+    DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator
+         device_context memcpy)
+elseif(WITH_ROCM)
+  hip_test(
+    best_fit_allocator_test
+    SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu
+    DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator
+         device_context memcpy)
 else()
-    cc_test(best_fit_allocator_test
-            SRCS best_fit_allocator_test.cc
-            DEPS best_fit_allocator
-                locked_allocator
-                cpu_allocator)
+  cc_test(
+    best_fit_allocator_test
+    SRCS best_fit_allocator_test.cc
+    DEPS best_fit_allocator locked_allocator cpu_allocator)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator)
+list(
+  APPEND
+  AllocatorFacadeDeps
+  cpu_allocator
+  locked_allocator
+  aligned_allocator
+  retry_allocator
+  buffered_allocator
+  naive_best_fit_allocator
+  auto_growth_best_fit_allocator
+  virtual_memory_auto_growth_best_fit_allocator
+  best_fit_allocator)
 
-if (WITH_ASCEND_CL)
-    list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
+if(WITH_ASCEND_CL)
+  list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
 endif()
 
+cc_library(
+  aligned_allocator
+  SRCS aligned_allocator.cc
+  DEPS allocator)
+cc_test(
+  test_aligned_allocator
+  SRCS test_aligned_allocator.cc
+  DEPS aligned_allocator)
+cc_library(
+  allocator_strategy
+  SRCS allocator_strategy.cc
+  DEPS gflags ${AllocatorFacadeDeps})
+cc_library(
+  allocator_facade
+  SRCS allocator_facade.cc
+  DEPS allocator_strategy stats)
 
-cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
-cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats)
-
-if (WITH_GPU)
+if(WITH_GPU)
   target_link_libraries(allocator_facade cuda_graph)
 endif()
 
-cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
-if (WITH_TESTING)
-  if ((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test)
+cc_test(
+  retry_allocator_test
+  SRCS retry_allocator_test.cc
+  DEPS retry_allocator locked_allocator cpu_allocator)
+if(WITH_TESTING)
+  if((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test)
     target_link_libraries(retry_allocator_test cuda_allocator)
   endif()
 
-  if (TEST retry_allocator_test)
-    set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  if(TEST retry_allocator_test)
+    set_tests_properties(retry_allocator_test PROPERTIES LABELS
+                                                         "RUN_TYPE=EXCLUSIVE")
   endif()
 endif()
 
-cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
+cc_test(
+  allocator_facade_abs_flags_test
+  SRCS allocator_facade_abs_flags_test.cc
+  DEPS allocator_facade)
 
-cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
+cc_test(
+  allocator_facade_frac_flags_test
+  SRCS allocator_facade_frac_flags_test.cc
+  DEPS allocator_facade)
 
-cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags)
-cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
-cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
+cc_library(
+  auto_growth_best_fit_allocator
+  SRCS auto_growth_best_fit_allocator.cc
+  DEPS allocator aligned_allocator flags)
+cc_test(
+  auto_growth_best_fit_allocator_facade_test
+  SRCS auto_growth_best_fit_allocator_facade_test.cc
+  DEPS cpu_allocator auto_growth_best_fit_allocator)
+cc_test(
+  auto_growth_best_fit_allocator_test
+  SRCS auto_growth_best_fit_allocator_test.cc
+  DEPS auto_growth_best_fit_allocator)
 
-cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
+cc_library(
+  virtual_memory_auto_growth_best_fit_allocator
+  SRCS virtual_memory_auto_growth_best_fit_allocator.cc
+  DEPS allocator aligned_allocator)
 
 if(NOT WIN32)
-  cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
-  cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
-  if (WITH_GPU)
-    cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator)
+  cc_library(
+    mmap_allocator
+    SRCS mmap_allocator.cc
+    DEPS allocator)
+  cc_test(
+    mmap_allocator_test
+    SRCS mmap_allocator_test.cc
+    DEPS mmap_allocator allocator)
+  if(WITH_GPU)
+    cc_library(
+      cuda_ipc_allocator
+      SRCS cuda_ipc_allocator.cc
+      DEPS allocator)
   endif()
 endif(NOT WIN32)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 46e1a500e4870..d72af70657a29 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -28,6 +28,7 @@
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include <shared_mutex>
+
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
 #include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
@@ -123,6 +124,8 @@ class CUDAGraphAllocator
       : underlying_allocator_(allocator) {}
 
  public:
+  ~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; }
+
   static std::shared_ptr<Allocator> Create(
       const std::shared_ptr<Allocator>& allocator) {
     return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
@@ -973,7 +976,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(IsCUDAGraphCapturing())) {
-    auto id = platform::CUDAGraph::CapturingID();
+    auto id = platform::CUDAGraph::CapturingPoolID();
     auto iter = cuda_graph_map_.find(id);
     PADDLE_ENFORCE_NE(
         iter, cuda_graph_map_.end(),
@@ -1116,7 +1119,7 @@ void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
 }
 
 #ifdef PADDLE_WITH_CUDA
-void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
+void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) {
   PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
                     platform::errors::InvalidArgument(
                         "CUDA Graph is only supported when the "
@@ -1124,23 +1127,32 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
                         "FLAGS_allocator_strategy=\"%s\"",
                         FLAGS_allocator_strategy));
   auto& allocator = cuda_graph_map_[id];
-  PADDLE_ENFORCE_EQ(
-      allocator.get(), nullptr,
-      platform::errors::InvalidArgument(
-          "The memory pool of the CUDA Graph with ID %d have been prepared.",
-          id));
-  allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-
-  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
+  auto& ref_cnt = cuda_graph_ref_cnt_[id];
+  if (allocator.get() == nullptr) {
+    allocator.reset(
+        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+    VLOG(10) << "Create memory pool for CUDA Graph with memory ID " << id;
+  } else {
+    VLOG(10) << "Use created memory pool for CUDA Graph with memory ID " << id;
+  }
+  ++ref_cnt;
 }
 
-void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-  auto iter = cuda_graph_map_.find(id);
-  PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
+void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
+  auto ref_cnt_iter = cuda_graph_ref_cnt_.find(id);
+  PADDLE_ENFORCE_NE(ref_cnt_iter, cuda_graph_ref_cnt_.end(),
                     platform::errors::InvalidArgument(
-                        "Cannot find CUDA Graph with ID = %d", id));
-  cuda_graph_map_.erase(iter);
-  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
+                        "Cannot find CUDA Graph with memory ID = %d", id));
+  auto& ref_cnt = ref_cnt_iter->second;
+  --ref_cnt;
+  if (ref_cnt == 0) {
+    cuda_graph_map_.erase(id);
+    cuda_graph_ref_cnt_.erase(ref_cnt_iter);
+    VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id;
+  } else {
+    VLOG(10) << "Decrease memory pool ID " << id << " reference count to be "
+             << ref_cnt;
+  }
 }
 #endif
 #endif
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 1dea50edccf2e..a37c11c0c048b 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <memory>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
@@ -89,8 +90,8 @@ class AllocatorFacade {
 #endif
 
 #ifdef PADDLE_WITH_CUDA
-  void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
-  void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
+  void PrepareMemoryPoolForCUDAGraph(int64_t id);
+  void RemoveMemoryPoolOfCUDAGraph(int64_t id);
 #endif
 
   // TODO(yy): Allocate a Copy-On-Write allocation?
@@ -98,8 +99,9 @@ class AllocatorFacade {
   AllocatorFacade();
   AllocatorFacadePrivate* m_;
 #ifdef PADDLE_WITH_CUDA
-  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
+  std::unordered_map<int64_t, std::unique_ptr<AllocatorFacadePrivate>>
       cuda_graph_map_;
+  std::unordered_map<int64_t, int64_t> cuda_graph_ref_cnt_;
 #endif
 };
 
diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
index fca07ba8e2511..d3f16ec628660 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index 782062283e985..d460480bc734f 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -16,6 +16,7 @@
 
 #include <algorithm>
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
index 4469673b305bf..70c43145cc85d 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <condition_variable>  // NOLINT
 #include <mutex>               // NOLINT
 #include <random>
 #include <thread>  // NOLINT
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
index 8d2f6e07a2901..441e80dfa4f8d 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <cstdlib>
-
-#include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 
+#include <cstdlib>
+
 #include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
 
 DECLARE_bool(free_idle_chunk);
 DECLARE_bool(free_when_no_cache_hit);
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 4cfe3997d89a9..c93645bf7a00d 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+
 #include <cmath>
 
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index 69cb7c2708f9d..64ee632c3879a 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <stdint.h>
+
 #include <array>
 #include <list>
 #include <map>
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 62a2dd78128bb..de6cac63e9ddb 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -24,6 +24,7 @@
 #endif
 
 #include <string>
+
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h
index 522b1d623e83b..f3df30827417d 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc
index b2f24d5aed1eb..dff93736a6e70 100644
--- a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc
@@ -15,15 +15,16 @@
 #ifndef _WIN32
 
 #include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
 
 #include <fcntl.h>
 #include <stdlib.h>
 #include <sys/mman.h>
+
 #include <random>
 #include <string>
 
 #include "glog/logging.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
index 0c83d4d36634e..ac62b10c0e07a 100644
--- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
@@ -24,6 +24,7 @@
 #endif
 
 #include <string>
+
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
index a235b3871b3e6..9494141615f34 100644
--- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
@@ -18,6 +18,7 @@
 #endif
 
 #include <string>
+
 #include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
index e7b296e6a5a11..ff26a96a0e101 100644
--- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
@@ -16,10 +16,12 @@
 
 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc
index e53d7b1cc766a..2cd969e2bd17f 100644
--- a/paddle/fluid/memory/allocation/custom_allocator.cc
+++ b/paddle/fluid/memory/allocation/custom_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/custom_allocator.h"
+
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/memory/allocation/custom_allocator.h b/paddle/fluid/memory/allocation/custom_allocator.h
index 0f34bc156c872..b10f840f60d94 100644
--- a/paddle/fluid/memory/allocation/custom_allocator.h
+++ b/paddle/fluid/memory/allocation/custom_allocator.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc
index 25c2235cce853..6fd87fb6a7748 100644
--- a/paddle/fluid/memory/allocation/mmap_allocator.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator.cc
@@ -19,6 +19,7 @@
 #include <fcntl.h>
 #include <stdlib.h>
 #include <sys/mman.h>
+
 #include <random>
 #include <string>
 
@@ -217,9 +218,9 @@ std::shared_ptr<MemoryMapWriterAllocation> AllocateMemoryMapWriterAllocation(
   const std::string &ipc_name = GetIPCName();
   int flags = O_RDWR | O_CREAT;
   int fd = shm_open(ipc_name.c_str(), flags, 0600);
-  PADDLE_ENFORCE_NE(
-      fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
-                                            ipc_name.c_str()));
+  PADDLE_ENFORCE_NE(fd, -1,
+                    platform::errors::Unavailable(
+                        "File descriptor %s open failed", ipc_name.c_str()));
   PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0,
                     platform::errors::Unavailable(
                         "Fruncate a file to a specified length failed!"));
@@ -239,9 +240,9 @@ std::shared_ptr<MemoryMapReaderAllocation> RebuildMemoryMapReaderAllocation(
   flags &= ~O_CREAT;
 
   int fd = shm_open(ipc_name.c_str(), flags, 0600);
-  PADDLE_ENFORCE_NE(
-      fd, -1, platform::errors::Unavailable("File descriptor %s open failed",
-                                            ipc_name.c_str()));
+  PADDLE_ENFORCE_NE(fd, -1,
+                    platform::errors::Unavailable(
+                        "File descriptor %s open failed", ipc_name.c_str()));
   void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
   PADDLE_ENFORCE_NE(ptr, MAP_FAILED,
                     platform::errors::Unavailable(
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 5efbfce7fedd6..7cc95de83101b 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -24,7 +24,6 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
-
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"
 #include "paddle/phi/common/place.h"
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
index 05db0d7341aca..3d6500d0f5642 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <stdint.h>
+
 #include <algorithm>
 #include <mutex>  // NOLINT
 #include <unordered_map>
diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc
index d69663f636e32..1c277c5db84d6 100644
--- a/paddle/fluid/memory/allocation/npu_allocator.cc
+++ b/paddle/fluid/memory/allocation/npu_allocator.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/npu_allocator.h"
+
 #include <string>
+
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h
index ff55ba70c520f..04832c6fd9b63 100644
--- a/paddle/fluid/memory/allocation/npu_allocator.h
+++ b/paddle/fluid/memory/allocation/npu_allocator.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 5e5aea6dab2cc..ad11d81875231 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
+
 #include "paddle/fluid/memory/stats.h"
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index d6074975720c5..2914da4f6361c 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -44,8 +44,9 @@ void RetryAllocator::FreeImpl(phi::Allocation* allocation) {
   size_t size = allocation->size();
   underlying_allocator_->Free(allocation);
   if (UNLIKELY(waited_allocate_size_)) {
-    VLOG(10) << "Free " << size << " bytes and notify all waited threads, "
-                                   "where waited_allocate_size_ = "
+    VLOG(10) << "Free " << size
+             << " bytes and notify all waited threads, "
+                "where waited_allocate_size_ = "
              << waited_allocate_size_;
     cv_.notify_all();
   }
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index cb593f5ab74c7..e7370036cee36 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 
 #include <thread>  // NOLINT
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 80877cb670ba9..81a87ef07b592 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index 32d3896e66bbf..ac4b7c790c950 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -17,6 +17,7 @@
 #include <list>
 #include <map>
 #include <set>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc
index c5378d9f59c3d..74c83149b4cb5 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
+
 #include <condition_variable>  // NOLINT
 #include <thread>              // NOLINT
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/malloc.h"
 
diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc
index c8b4e980566d0..07ad149a3078d 100644
--- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
+
 #include <mutex>
 
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
-#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/buffer.h b/paddle/fluid/memory/buffer.h
index 99b25ca289ce1..f42b5262e3422 100644
--- a/paddle/fluid/memory/buffer.h
+++ b/paddle/fluid/memory/buffer.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <type_traits>
+
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index a039cd8f41860..afe5c0dba0f3b 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -1,47 +1,78 @@
 include(ExternalProject)
 
-cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
+cc_library(
+  memory_block
+  SRCS memory_block.cc memory_block_desc.cc meta_cache.cc
+  DEPS place)
 
 if(WITH_GPU)
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
+  nv_library(
+    system_allocator
+    SRCS system_allocator.cc
+    DEPS gflags cpu_info gpu_info place)
 elseif(WITH_ROCM)
-  hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
+  hip_library(
+    system_allocator
+    SRCS system_allocator.cc
+    DEPS gflags cpu_info gpu_info place)
 elseif(${WITH_ASCEND_CL})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
+  cc_library(
+    system_allocator
+    SRCS system_allocator.cc
+    DEPS gflags cpu_info npu_info place)
 elseif(WITH_MLU)
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info mlu_info place)
+  cc_library(
+    system_allocator
+    SRCS system_allocator.cc
+    DEPS gflags cpu_info mlu_info place)
 else()
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
+  cc_library(
+    system_allocator
+    SRCS system_allocator.cc
+    DEPS gflags cpu_info place)
 endif()
 
-cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
+cc_test(
+  system_allocator_test
+  SRCS system_allocator_test.cc
+  DEPS system_allocator)
 
-cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
+cc_library(
+  buddy_allocator
+  SRCS buddy_allocator.cc
+  DEPS memory_block system_allocator glog)
 
-cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)
+cc_test(
+  buddy_allocator_test
+  SRCS buddy_allocator_test.cc
+  DEPS buddy_allocator)
 
-FUNCTION(file_download_and_uncompress URL NAME)
-  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
-  SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME} PARENT_SCOPE)
+function(file_download_and_uncompress URL NAME)
+  message(STATUS "Download dependence[${NAME}] from ${URL}")
+  set(${NAME}_INCLUDE_DIR
+      ${THIRD_PARTY_PATH}/${NAME}
+      PARENT_SCOPE)
   ExternalProject_Add(
-      extern_download_${NAME}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX                ${THIRD_PARTY_PATH}/${NAME}
-      URL                   ${URL}
-      DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}
-      SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}
-      DOWNLOAD_NO_PROGRESS  1
-      CONFIGURE_COMMAND     ""
-      BUILD_COMMAND         ""
-      UPDATE_COMMAND        ""
-      INSTALL_COMMAND       ""
-    )
-  set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE)
-ENDFUNCTION()
+    extern_download_${NAME}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${THIRD_PARTY_PATH}/${NAME}
+    URL ${URL}
+    DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}
+    SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND "")
+  set(third_party_deps
+      ${third_party_deps} extern_download_${NAME}
+      PARENT_SCOPE)
+endfunction()
 
 if(WITH_TESTING)
   if(TEST buddy_allocator_test)
-    set_tests_properties(buddy_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+    set_tests_properties(buddy_allocator_test PROPERTIES LABELS
+                                                         "RUN_TYPE=EXCLUSIVE")
   endif()
   set(URL "https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar")
   file_download_and_uncompress(URL "buddy_allocator")
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index e1077d66c54ec..244445d59b829 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -168,8 +168,9 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The index should be 0, index is %d", index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The index should be 0, index is %d", index));
   PADDLE_ENFORCE_GE(gpu_alloc_size_, size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
@@ -223,8 +224,9 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
 
 void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   gpuError_t err;
-  PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
-                                  "The index should be 1, but got %d", index));
+  PADDLE_ENFORCE_EQ(index, 1,
+                    platform::errors::InvalidArgument(
+                        "The index should be 1, but got %d", index));
 
   PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size,
                     platform::errors::InvalidArgument(
@@ -310,8 +312,9 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
 
 void NPUAllocator::Free(void* p, size_t size, size_t index) {
   VLOG(4) << "Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The index should be 0, index is %d", index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The index should be 0, index is %d", index));
   PADDLE_ENFORCE_GE(npu_alloc_size_, size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
@@ -355,8 +358,9 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
 
 void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
   aclError err;
-  PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
-                                  "The index should be 1, but got %d", index));
+  PADDLE_ENFORCE_EQ(index, 1,
+                    platform::errors::InvalidArgument(
+                        "The index should be 1, but got %d", index));
 
   PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
                     platform::errors::InvalidArgument(
@@ -425,8 +429,9 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
 }
 
 void MLUAllocator::Free(void* p, size_t size, size_t index) {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The index should be 0, index is %d", index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The index should be 0, index is %d", index));
   PADDLE_ENFORCE_GE(mlu_alloc_size_, size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
@@ -469,8 +474,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
 
 void CustomAllocator::Free(void* p, size_t size, size_t index) {
   VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The index should be 0, index is %d", index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The index should be 0, index is %d", index));
   PADDLE_ENFORCE_GE(plug_alloc_size, size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index f6ff6282a614a..18c2e278f99c5 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stddef.h>  // for size_t
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/memory/get_base_ptr_test.cu b/paddle/fluid/memory/get_base_ptr_test.cu
index 188d2f5f420cf..c8928bda0c937 100644
--- a/paddle/fluid/memory/get_base_ptr_test.cu
+++ b/paddle/fluid/memory/get_base_ptr_test.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <random>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 796bdcf0ec2f6..a7d0fa9781f77 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -24,9 +24,9 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-using phi::Allocation;
-using allocation::Allocator;
 using allocation::AllocationPtr;
+using allocation::Allocator;
+using phi::Allocation;
 
 extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size);
diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc
index b2fc602e401ed..081f0d3d78c13 100644
--- a/paddle/fluid/memory/memory_stats_test.cc
+++ b/paddle/fluid/memory/memory_stats_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/memory/memory.h"
 #include <algorithm>
 #include <vector>
+
 #include "gtest/gtest.h"
+#include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu
index 837c964e2ad32..e5958615d0184 100644
--- a/paddle/fluid/memory/pinned_memory_test.cu
+++ b/paddle/fluid/memory/pinned_memory_test.cu
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <gtest/gtest.h>
+
 #include <unordered_map>
 
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
-
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h
index bb6a3cca6644c..a30ee161e1c08 100644
--- a/paddle/fluid/memory/stats.h
+++ b/paddle/fluid/memory/stats.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <atomic>
 #include <map>
 #include <string>
+
 #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
@@ -149,15 +150,16 @@ void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
 #define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
   DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
 
-#define HOST_MEMORY_STAT_FUNC(item, id, func, ...)                           \
-  [&] {                                                                      \
-    PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange(           \
-                                 "Only support device id 0 for host memory " \
-                                 "stats, not support device id: %d",         \
-                                 id));                                       \
-    return paddle::memory::Stat<                                             \
-               paddle::memory::HostMemoryStat##item##0>::GetInstance()       \
-        ->func(__VA_ARGS__);                                                 \
+#define HOST_MEMORY_STAT_FUNC(item, id, func, ...)                     \
+  [&] {                                                                \
+    PADDLE_ENFORCE_EQ(id, 0,                                           \
+                      paddle::platform::errors::OutOfRange(            \
+                          "Only support device id 0 for host memory "  \
+                          "stats, not support device id: %d",          \
+                          id));                                        \
+    return paddle::memory::Stat<                                       \
+               paddle::memory::HostMemoryStat##item##0>::GetInstance() \
+        ->func(__VA_ARGS__);                                           \
   }()
 
 #define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \
diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc
index bcaba8e91080f..73a6b921ca8a4 100644
--- a/paddle/fluid/memory/stats_test.cc
+++ b/paddle/fluid/memory/stats_test.cc
@@ -13,11 +13,13 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/stats.h"
+
 #include <condition_variable>
 #include <mutex>
 #include <string>
 #include <thread>
 #include <vector>
+
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
index 3bf873bcfc231..5b5350c34fb6f 100644
--- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
+++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -25,6 +25,7 @@
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #endif
 
@@ -47,9 +48,9 @@ __global__ void add_kernel(int *x, int *y, int n) {
 void CheckMemLeak(const platform::CUDAPlace &place) {
   uint64_t cuda_malloc_size =
       platform::RecordedGpuMallocSize(place.GetDeviceId());
-  ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
-                                 << " bytes memory that not released yet,"
-                                 << " there may be a memory leak problem";
+  ASSERT_EQ(cuda_malloc_size, 0)
+      << "Found " << cuda_malloc_size << " bytes memory that not released yet,"
+      << " there may be a memory leak problem";
 }
 
 TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 3112d0d8205a8..b2fd59b47454e 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -107,6 +107,7 @@ register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combin
         recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
+target_link_libraries(run_program_op cuda_graph_with_memory_pool)
 op_library(quantize_linear_op DEPS cast_kernel)
 op_library(save_combine_op DEPS string_array)
 op_library(load_combine_op DEPS string_array)
diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc
index b9517e1cc863c..86b60da341e63 100644
--- a/paddle/fluid/operators/abs_op.cc
+++ b/paddle/fluid/operators/abs_op.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc
index b4a97e24cf292..b9d5e5fbe5ebc 100644
--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
@@ -20,8 +20,8 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
 using platform::ActivationDescriptor;
-using platform::TensorDescriptor;
 using platform::CUDADeviceContext;
+using platform::TensorDescriptor;
 
 #ifdef PADDLE_WITH_HIP
 #define GPUDNN_ACTIVATION_RELU miopenActivationRELU
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 6905f3d79546e..e500992e1b5a5 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1454,18 +1454,19 @@ namespace plat = paddle::platform;
   REGISTER_OPERATOR(KERNEL_TYPE##_grad, ops::ActivationOpGrad,              \
                     ops::ActivationGradOpInplaceInferer);
 
-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor,        \
-                                       grad_functor)                      \
-  REGISTER_OP_CPU_KERNEL(                                                 \
-      act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
-                                      ops::functor<float>>,               \
-      ops::ActivationKernel<paddle::platform::CPUDeviceContext,           \
-                            ops::functor<double>>);                       \
-  REGISTER_OP_CPU_KERNEL(                                                 \
-      act_type##_grad,                                                    \
-      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
-                                ops::grad_functor<float>>,                \
-      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor,  \
+                                       grad_functor)                \
+  REGISTER_OP_CPU_KERNEL(                                           \
+      act_type,                                                     \
+      ops::ActivationKernel<paddle::platform::CPUDeviceContext,     \
+                            ops::functor<float>>,                   \
+      ops::ActivationKernel<paddle::platform::CPUDeviceContext,     \
+                            ops::functor<double>>);                 \
+  REGISTER_OP_CPU_KERNEL(                                           \
+      act_type##_grad,                                              \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
+                                ops::grad_functor<float>>,          \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
                                 ops::grad_functor<double>>);
 
 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
@@ -1781,21 +1782,18 @@ REGISTER_OP_VERSION(hard_shrink)
                 "((x < -threshold) + (x > threshold)); after checkpoint: out = "
                 "x * (((x < -threshold) + (x > threshold)) > 0)"));
 
-REGISTER_OP_VERSION(softplus)
-    .AddCheckpoint(
-        R"ROC(add new attributes [beta] and [threshold], and the formula is changed to "
+REGISTER_OP_VERSION(softplus).AddCheckpoint(
+    R"ROC(add new attributes [beta] and [threshold], and the formula is changed to "
          " softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ \\text{For numerical"
          " stability, the implementation reverts to the linear function when: beta * x > threshold.})ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr("beta", "The beta value of the new formula", 1.0f)
-            .NewAttr("threshold", "The threshold value of the new formula",
-                     20.0f));
-
-REGISTER_OP_VERSION(mish)
-    .AddCheckpoint(
-        R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "use_mkldnn", "(bool, default false) Only used in mkldnn kernel",
-            false));
+    paddle::framework::compatible::OpVersionDesc()
+        .NewAttr("beta", "The beta value of the new formula", 1.0f)
+        .NewAttr("threshold", "The threshold value of the new formula", 20.0f));
+
+REGISTER_OP_VERSION(mish).AddCheckpoint(
+    R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC",
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "use_mkldnn", "(bool, default false) Only used in mkldnn kernel",
+        false));
 
 /* ========================================================================== */
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 5f3916a65e792..81f5e24abfed5 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -12,19 +12,20 @@ limitations under the License. */
 
 #pragma once
 #include <glog/logging.h>
+
 #include <algorithm>
+#include <cmath>
 #include <memory>
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include <vector>
-
-#include <cmath>
 #ifndef _USE_MATH_DEFINES
 #define _USE_MATH_DEFINES
 #endif
 
 #include <type_traits>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -362,9 +363,8 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) =
-        dout *
-        ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
-            .template cast<T>();
+        dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
+                   .template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() {
diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
index e950f952c24e6..4127e4b1b103b 100644
--- a/paddle/fluid/operators/activation_op_xpu.cc
+++ b/paddle/fluid/operators/activation_op_xpu.cc
@@ -253,8 +253,9 @@ struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
     PADDLE_ENFORCE_EQ(threshold, 6.0f,
                       platform::errors::External(
                           "Not support threshold [%f] in XPU", threshold));
-    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
-                                       "Not support scale [%f] in XPU", scale));
+    PADDLE_ENFORCE_EQ(
+        scale, 6.0f,
+        platform::errors::External("Not support scale [%f] in XPU", scale));
     PADDLE_ENFORCE_EQ(
         offset, 3.0f,
         platform::errors::External("Not support offset [%f] in XPU", offset));
@@ -273,8 +274,9 @@ struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
     PADDLE_ENFORCE_EQ(threshold, 6.0f,
                       platform::errors::External(
                           "Not support threshold [%f] in XPU", threshold));
-    PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External(
-                                       "Not support scale [%f] in XPU", scale));
+    PADDLE_ENFORCE_EQ(
+        scale, 6.0f,
+        platform::errors::External("Not support scale [%f] in XPU", scale));
     PADDLE_ENFORCE_EQ(
         offset, 3.0f,
         platform::errors::External("Not support offset [%f] in XPU", offset));
@@ -377,10 +379,12 @@ struct XPUPowGradFunctor : public BaseActivationFunctor<T> {
     auto x_dims = phi::vectorize<int>(x->dims());
     auto dy_dims = phi::vectorize<int>(dOut->dims());
     auto dx_dims = phi::vectorize<int>(dX->dims());
-    PADDLE_ENFORCE_EQ(x_dims, dy_dims, platform::errors::PreconditionNotMet(
-                                           "x_dims should match dy_dims."));
-    PADDLE_ENFORCE_EQ(x_dims, dx_dims, platform::errors::PreconditionNotMet(
-                                           "x_dims should match dx_dims."));
+    PADDLE_ENFORCE_EQ(
+        x_dims, dy_dims,
+        platform::errors::PreconditionNotMet("x_dims should match dy_dims."));
+    PADDLE_ENFORCE_EQ(
+        x_dims, dx_dims,
+        platform::errors::PreconditionNotMet("x_dims should match dx_dims."));
     float pow_factor = ctx.Attr<float>("factor");
 
     auto xpu_context =
diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
index e5fcd270eb8b8..4d2c23e2bb440 100644
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/add_position_encoding_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc
index 716a2e40179e4..d0f0a6ae0c679 100644
--- a/paddle/fluid/operators/addmm_op.cc
+++ b/paddle/fluid/operators/addmm_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
index 1b584fc557849..cd6798be2b2ed 100644
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
index cf4041f721af2..87a71130b85bf 100644
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -81,13 +81,13 @@ class AffineChannelCUDAKernel : public framework::OpKernel<T> {
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     grid = std::min(std::max(max_threads / block, 1), grid);
     if (layout == framework::DataLayout::kNCHW) {
-      KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
-                          true><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_d, scale_d, bias_d, C, HxW, num, y_d);
+      KeAffineChannelCUDA<T, framework::DataLayout::kNCHW, true>
+          <<<grid, block, 0, dev_ctx.stream()>>>(x_d, scale_d, bias_d, C, HxW,
+                                                 num, y_d);
     } else {
-      KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
-                          true><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_d, scale_d, bias_d, C, HxW, num, y_d);
+      KeAffineChannelCUDA<T, framework::DataLayout::kNHWC, true>
+          <<<grid, block, 0, dev_ctx.stream()>>>(x_d, scale_d, bias_d, C, HxW,
+                                                 num, y_d);
     }
   }
 };
@@ -169,29 +169,29 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
     if (layout == framework::DataLayout::kNCHW) {
       if (dscale && dbias) {
         const T* x_d = x->data<T>();
-        AffineChannelScaleBiasGradientCUDAKernel<
-            T, block, framework::DataLayout::kNCHW><<<grid2, block, 0,
-                                                      dev_ctx.stream()>>>(
-            dy_d, x_d, N, C, HxW, ds_d, db_d);
+        AffineChannelScaleBiasGradientCUDAKernel<T, block,
+                                                 framework::DataLayout::kNCHW>
+            <<<grid2, block, 0, dev_ctx.stream()>>>(dy_d, x_d, N, C, HxW, ds_d,
+                                                    db_d);
       }
       if (dx) {
-        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
-                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
-            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW, false>
+            <<<grid1, block, 0, dev_ctx.stream()>>>(dy_d, s_d, nullptr, C, HxW,
+                                                    num, dx_d);
       }
     } else {
       if (dscale && dbias) {
         const T* x_d = x->data<T>();
-        AffineChannelScaleBiasGradientCUDAKernel<
-            T, block, framework::DataLayout::kNHWC><<<grid2, block, 0,
-                                                      dev_ctx.stream()>>>(
-            dy_d, x_d, N, C, HxW, ds_d, db_d);
+        AffineChannelScaleBiasGradientCUDAKernel<T, block,
+                                                 framework::DataLayout::kNHWC>
+            <<<grid2, block, 0, dev_ctx.stream()>>>(dy_d, x_d, N, C, HxW, ds_d,
+                                                    db_d);
       }
 
       if (dx) {
-        KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
-                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
-            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+        KeAffineChannelCUDA<T, framework::DataLayout::kNHWC, false>
+            <<<grid1, block, 0, dev_ctx.stream()>>>(dy_d, s_d, nullptr, C, HxW,
+                                                    num, dx_d);
       }
     }
   }
diff --git a/paddle/fluid/operators/affine_channel_op_xpu.cc b/paddle/fluid/operators/affine_channel_op_xpu.cc
index db3eedea7ca67..4de233b184aed 100644
--- a/paddle/fluid/operators/affine_channel_op_xpu.cc
+++ b/paddle/fluid/operators/affine_channel_op_xpu.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
index 31801b14564d3..6fca4afabd9cc 100644
--- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
@@ -65,8 +65,9 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         platform::dynload::cudnnSpatialTfGridGeneratorForward(
             handle, cudnn_st_desc, theta_data, output_data),
-        0, platform::errors::Fatal("Some errors has occurred "
-                                   "during forward computation in cudnn."));
+        0,
+        platform::errors::Fatal("Some errors has occurred "
+                                "during forward computation in cudnn."));
   }
 };
 
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
index e311d21bb54d3..d7a49a965a0ee 100644
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/affine_grid_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu
index eeb4b3bc8a760..29a540bdc2ce5 100644
--- a/paddle/fluid/operators/affine_grid_op.cu
+++ b/paddle/fluid/operators/affine_grid_op.cu
@@ -42,8 +42,8 @@ struct Linspace<paddle::platform::CUDADeviceContext, T> {
     auto stream = ctx.cuda_device_context().stream();
     int block = 512;
     int grid = (count + block - 1) / block;
-    LinspaceKernel<T><<<grid, block, 0, stream>>>(start, slice, count,
-                                                  number_data);
+    LinspaceKernel<T>
+        <<<grid, block, 0, stream>>>(start, slice, count, number_data);
   }
 };
 
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
index 21540de2b640e..cbf70b9135be2 100644
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt
index 2ea8bbcbc61df..cbedb02f86836 100644
--- a/paddle/fluid/operators/amp/CMakeLists.txt
+++ b/paddle/fluid/operators/amp/CMakeLists.txt
@@ -1,10 +1,14 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/amp.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/amp.
+  include(unity_build_rule.cmake)
 endif()
 register_operators()
 
 if(WITH_ASCEND_CL)
-    cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor)
+  cc_test(
+    check_finite_and_unscale_op_npu_test
+    SRCS check_finite_and_unscale_op_npu_test.cc
+    DEPS op_registry check_finite_and_unscale_op scope device_context enforce
+         executor)
 endif()
diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc
index 68f6e3b2f3bd0..78bacc3016178 100644
--- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc
+++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <cmath>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu
index 2f6977b9e2da2..7771902c02b1f 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu
@@ -143,10 +143,10 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
     int blocks_per_grid =
         (total_num + elements_per_block - 1) / elements_per_block;
     VLOG(3) << "launch kernel";
-    CheckFiniteAndUnscale<
-        T, MPDType><<<blocks_per_grid, threads_per_block,
-                      (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>(
-        d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs);
+    CheckFiniteAndUnscale<T, MPDType>
+        <<<blocks_per_grid, threads_per_block, (xs_size + 1) * sizeof(int64_t),
+           dev_ctx.stream()>>>(d_xs, inverse_scale_v, xs_size, d_starts,
+                               found_inf_data, d_outs);
     VLOG(3) << "finish kernel";
   }
 };
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
index 2862d9230768c..46572579e081c 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <cstdlib>
 #include <memory>
 #include <random>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
index 30266d3eec0e0..1d3e5e5162ca9 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
@@ -65,13 +65,15 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
         int r = xpu::isfinite(dev_ctx.x_context(),
                               reinterpret_cast<const XPUTyp*>(x->data<T>()),
                               is_finite.data<bool>(), x->numel());
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(isfinite) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
-        r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast<const bool*>(
-                                                      is_finite.data<bool>()),
-                             is_finite.data<bool>(), x->numel());
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API(isfinite) return wrong "
+                                       "value[%d %s]",
+                                       r, XPUAPIErrorMsg[r]));
+        r = xpu::logical_not(
+            dev_ctx.x_context(),
+            reinterpret_cast<const bool*>(is_finite.data<bool>()),
+            is_finite.data<bool>(), x->numel());
         PADDLE_ENFORCE_EQ(
             r, XPU_SUCCESS,
             platform::errors::External("XPU API(logical_not) return wrong "
@@ -79,10 +81,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
                                        r, XPUAPIErrorMsg[r]));
         r = xpu::any(dev_ctx.x_context(), is_finite.data<bool>(),
                      found_inf_data, x->numel());
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(any) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API(any) return wrong "
+                                       "value[%d %s]",
+                                       r, XPUAPIErrorMsg[r]));
         if (dev_ctx.x_context()->xpu_stream) {
           dev_ctx.Wait();
         }
@@ -106,36 +109,40 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
         int r = xpu::cast_v2(dev_ctx.x_context(),
                              reinterpret_cast<const float16*>(x->data<T>()),
                              float_x.data<MPDType>(), x->numel());
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(cast_v2) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API(cast_v2) return wrong "
+                                       "value[%d %s]",
+                                       r, XPUAPIErrorMsg[r]));
 
         r = xpu::scale(dev_ctx.x_context(), float_x.data<MPDType>(),
                        float_out.data<MPDType>(), x->numel(), false,
                        inverse_scale, 0.0);
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(scale) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API(scale) return wrong "
+                                       "value[%d %s]",
+                                       r, XPUAPIErrorMsg[r]));
 
         r = xpu::cast_v2(dev_ctx.x_context(), float_out.data<MPDType>(),
                          reinterpret_cast<float16*>(out->data<T>()),
                          out->numel());
 
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(cast_v2) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API(cast_v2) return wrong "
+                                       "value[%d %s]",
+                                       r, XPUAPIErrorMsg[r]));
       } else {
         int r = xpu::scale(dev_ctx.x_context(),
                            reinterpret_cast<const XPUTyp*>(x->data<T>()),
                            reinterpret_cast<XPUTyp*>(out->data<T>()),
                            x->numel(), false, inverse_scale, 0.0);
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(scale) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API(scale) return wrong "
+                                       "value[%d %s]",
+                                       r, XPUAPIErrorMsg[r]));
       }
     }
     if (dev_ctx.x_context()->xpu_stream) {
diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc
index e5a2d93e32fe2..c102bd2ba47bd 100644
--- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc
+++ b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <cmath>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc
index 8109a1ff43ff2..0c1187616503b 100644
--- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc
+++ b/paddle/fluid/operators/amp/get_float_status_op_npu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <cmath>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
diff --git a/paddle/fluid/operators/amp/unity_build_rule.cmake b/paddle/fluid/operators/amp/unity_build_rule.cmake
index bfdab0cd9623c..fa460e33c8068 100644
--- a/paddle/fluid/operators/amp/unity_build_rule.cmake
+++ b/paddle/fluid/operators/amp/unity_build_rule.cmake
@@ -4,9 +4,7 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    check_finite_and_unscale_op.cc
-    update_loss_scaling_op.cc)
-register_unity_group(cu
-    check_finite_and_unscale_op.cu
-    update_loss_scaling_op.cu)
+register_unity_group(cc check_finite_and_unscale_op.cc
+                     update_loss_scaling_op.cc)
+register_unity_group(cu check_finite_and_unscale_op.cu
+                     update_loss_scaling_op.cu)
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
index 8354650df0237..baf742b0b404b 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
+
 #include <cstring>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
index 43f8f84578c70..81f986434411c 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h
index 41eb94247f593..f4c6b6f1f7d8d 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.h
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h
@@ -19,6 +19,7 @@
 #endif  // PADDLE_WITH_CUDA && __NVCC__
 #include <cmath>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
index f9a93a47ff2be..da7e23c4620ba 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
 #include <cmath>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 DECLARE_int32(min_loss_scaling);
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
index fe03d93f4480f..8f57e00fe1117 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
 #include <cstring>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -59,10 +60,11 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
         r = xpu::constant(dev_ctx.x_context(),
                           reinterpret_cast<XPUTyp*>(out_data), num,
                           XPUTyp(0.0));
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(constant) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
+        PADDLE_ENFORCE_EQ(
+            r, XPU_SUCCESS,
+            platform::errors::External("XPU API(constant) return wrong "
+                                       "value[%d %s]",
+                                       r, XPUAPIErrorMsg[r]));
       }
     }
     const bool stop_update = ctx.Attr<bool>("stop_update");
diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h
index 116a8053db3ed..ace345465dc25 100644
--- a/paddle/fluid/operators/angle_op.h
+++ b/paddle/fluid/operators/angle_op.h
@@ -17,11 +17,11 @@
 #define _USE_MATH_DEFINES
 #endif
 #include <cmath>
-#include "paddle/phi/kernels/funcs/complex_functors.h"
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
index c5e4188ca2d6f..63fd27a1edf7a 100644
--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
@@ -28,20 +27,18 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ArgMaxInferShapeFunctor);
 
-REGISTER_OP_VERSION(arg_max)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(arg_max).AddCheckpoint(
+    R"ROC(
               Upgrade argmax add a new attribute [flatten] and modify the attribute of dtype)ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr("flatten",
-                     "In order to compute the argmax over the flattened array "
-                     "when the "
-                     "argument `axis` in python API is None.",
-                     false)
-            .ModifyAttr(
-                "dtype",
-                "Change the default value of dtype from -1 to 3"
-                ", means return the int64 indices directly. The rearse why "
-                "changing the default value is that the int64 value in "
-                "VarType is 3 in the frameworke.proto.",
-                3));
+    paddle::framework::compatible::OpVersionDesc()
+        .NewAttr("flatten",
+                 "In order to compute the argmax over the flattened array "
+                 "when the "
+                 "argument `axis` in python API is None.",
+                 false)
+        .ModifyAttr("dtype",
+                    "Change the default value of dtype from -1 to 3"
+                    ", means return the int64 indices directly. The rearse why "
+                    "changing the default value is that the int64 value in "
+                    "VarType is 3 in the frameworke.proto.",
+                    3));
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
index 585341beea12c..194a3070bf683 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include <type_traits>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
index fb3abd01af8c3..c995d56cf6b09 100644
--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -27,20 +27,18 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ArgMinInferShapeFunctor);
 
-REGISTER_OP_VERSION(arg_min)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(arg_min).AddCheckpoint(
+    R"ROC(
               Upgrade argmin add a new attribute [flatten] and modify the attribute of dtype)ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr("flatten",
-                     "In order to compute the argmin over the flattened array "
-                     "when the "
-                     "argument `axis` in python API is None.",
-                     false)
-            .ModifyAttr(
-                "dtype",
-                "Change the default value of dtype from -1 to 3"
-                ", means return the int64 indices directly. The rearse why "
-                "changing the default value is that the int64 value in "
-                "VarType is 3 in the frameworke.proto.",
-                3));
+    paddle::framework::compatible::OpVersionDesc()
+        .NewAttr("flatten",
+                 "In order to compute the argmin over the flattened array "
+                 "when the "
+                 "argument `axis` in python API is None.",
+                 false)
+        .ModifyAttr("dtype",
+                    "Change the default value of dtype from -1 to 3"
+                    ", means return the int64 indices directly. The rearse why "
+                    "changing the default value is that the int64 value in "
+                    "VarType is 3 in the frameworke.proto.",
+                    3));
diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
index af44a77c8131d..0cc3b695aef93 100644
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 1db3592b1cfab..f0824695a060f 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <paddle/fluid/operators/math/concat_and_split.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/lod_utils.h"
diff --git a/paddle/fluid/operators/ascend_trigger_op.h b/paddle/fluid/operators/ascend_trigger_op.h
index eaa79da2ba8ee..d1eaa00c2a3e0 100644
--- a/paddle/fluid/operators/ascend_trigger_op.h
+++ b/paddle/fluid/operators/ascend_trigger_op.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_ASCEND
 #include "paddle/fluid/framework/fleet/ascend_wrapper.h"
diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc
index b95be3096f071..7d03982f6ad03 100644
--- a/paddle/fluid/operators/assign_op_xpu.cc
+++ b/paddle/fluid/operators/assign_op_xpu.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/assign_op.h"
-
 #include <string>
 
+#include "paddle/fluid/operators/assign_op.h"
+
 namespace paddle {
 namespace framework {
 class OpDesc;
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index bf7d609370a8d..22db7d9e982c2 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
+
 #include <string>
+
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
@@ -62,8 +64,9 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
           "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D));
 
   auto b_dims = ctx->GetInputDim("LSTMBias");
-  PADDLE_ENFORCE_EQ(b_dims.size(), 2, platform::errors::InvalidArgument(
-                                          "Input(LSTMBias)'s rank must be 2."));
+  PADDLE_ENFORCE_EQ(
+      b_dims.size(), 2,
+      platform::errors::InvalidArgument("Input(LSTMBias)'s rank must be 2."));
   PADDLE_ENFORCE_EQ(b_dims[0], 1,
                     platform::errors::InvalidArgument(
                         "LSTMBias dims should be 1 x %d.", 4 * D));
@@ -72,11 +75,13 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                         "LSTMBias dims should be 1 x %d.", 4 * D));
 
   auto c_dims = ctx->GetInputDim("C0");
-  PADDLE_ENFORCE_EQ(c_dims.size(), 2, platform::errors::InvalidArgument(
-                                          "Input(C0)'s rank must be 2."));
+  PADDLE_ENFORCE_EQ(
+      c_dims.size(), 2,
+      platform::errors::InvalidArgument("Input(C0)'s rank must be 2."));
   if (ctx->IsRuntime()) {
-    PADDLE_ENFORCE_EQ(c_dims[1], D, platform::errors::InvalidArgument(
-                                        "C0 dims should be N x %d.", D));
+    PADDLE_ENFORCE_EQ(
+        c_dims[1], D,
+        platform::errors::InvalidArgument("C0 dims should be N x %d.", D));
   }
 
   if (ctx->HasInput("H0")) {
@@ -126,10 +131,12 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       platform::errors::InvalidArgument(
                           "Input(AttentionScalar)'s rank must be 2."));
-    PADDLE_ENFORCE_EQ(dims[0], 1, platform::errors::InvalidArgument(
-                                      "AttentionScalar shapes must be 1 * 1."));
-    PADDLE_ENFORCE_EQ(dims[1], 1, platform::errors::InvalidArgument(
-                                      "AttentionScalar shapes must be 1 * 1."));
+    PADDLE_ENFORCE_EQ(dims[0], 1,
+                      platform::errors::InvalidArgument(
+                          "AttentionScalar shapes must be 1 * 1."));
+    PADDLE_ENFORCE_EQ(dims[1], 1,
+                      platform::errors::InvalidArgument(
+                          "AttentionScalar shapes must be 1 * 1."));
   }
 
   if (ctx->HasInput("AttentionScalarBias")) {
@@ -332,14 +339,15 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
       int len = x_lod[0][i + 1] - x_lod[0][i];
       max_seq_len = max_seq_len < len ? len : max_seq_len;
     }
-    PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, platform::errors::InvalidArgument(
-                                             "Input(X)'s lod size must be 1."));
+    PADDLE_ENFORCE_EQ(
+        x_lod.size(), 1UL,
+        platform::errors::InvalidArgument("Input(X)'s lod size must be 1."));
     PADDLE_ENFORCE_EQ(
         c0->dims()[0], N,
         platform::errors::InvalidArgument("C0 dims should be %d x %d.", N, D));
     fc_out->Resize({max_seq_len, 1});
 
-    std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
+    std::function<void(const int, const T*, T*)> act_gate, act_cell, act_cand;
     auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
     auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
     auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
index 289dda56b19df..de6eca3903f88 100644
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc
index 952625bcb6e46..2d2deae69a783 100644
--- a/paddle/fluid/operators/batch_fc_op.cc
+++ b/paddle/fluid/operators/batch_fc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/batch_fc_op.h"
+
 #include <string>
 
 namespace paddle {
@@ -42,8 +43,9 @@ class BatchFCOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(input_dims.size(), 3,
                       platform::errors::InvalidArgument(
                           "Input of BatchFCOp should have 3D."));
-    PADDLE_ENFORCE_EQ(w_dims.size(), 3, platform::errors::InvalidArgument(
-                                            "W of BatchFCOp should have 3D."));
+    PADDLE_ENFORCE_EQ(
+        w_dims.size(), 3,
+        platform::errors::InvalidArgument("W of BatchFCOp should have 3D."));
     PADDLE_ENFORCE_EQ(
         input_dims[0], w_dims[0],
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu
index ddedf0172be82..5843acb4fdd0c 100644
--- a/paddle/fluid/operators/batch_fc_op.cu
+++ b/paddle/fluid/operators/batch_fc_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 2663a08101157..67384338d764e 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/batch_norm_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/data_layout.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -167,10 +169,11 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType(
       bn_param_type,
       framework::TransToProtoVarType(ctx.Input<Tensor>("Mean")->dtype()),
       platform::errors::InvalidArgument("Mean input should be of float type"));
-  PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                       ctx.Input<Tensor>("Variance")->dtype()),
-                    platform::errors::InvalidArgument(
-                        "Variance input should be of float type"));
+  PADDLE_ENFORCE_EQ(
+      bn_param_type,
+      framework::TransToProtoVarType(ctx.Input<Tensor>("Variance")->dtype()),
+      platform::errors::InvalidArgument(
+          "Variance input should be of float type"));
 
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   framework::LibraryType library = framework::LibraryType::kPlain;
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
index d274e8d2c006d..b82b49e5cd58e 100644
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/layout_utils.h"
diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc
index 6507890a8b5dc..6dff315aa6a21 100644
--- a/paddle/fluid/operators/batch_norm_op_mlu.cc
+++ b/paddle/fluid/operators/batch_norm_op_mlu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc
index ae03ecbcb16a0..725b7f3848f4a 100644
--- a/paddle/fluid/operators/batch_norm_op_npu.cc
+++ b/paddle/fluid/operators/batch_norm_op_npu.cc
@@ -113,8 +113,9 @@ class NPUBatchNormOpKernel : public framework::OpKernel<T> {
       runner_reduce.Run(stream);
 
       const auto &runner_update = NpuOpRunner(
-          "BNTrainingUpdate", {x_tensor, sum, square_sum, *scale, *bias,
-                               *running_mean, *running_var},
+          "BNTrainingUpdate",
+          {x_tensor, sum, square_sum, *scale, *bias, *running_mean,
+           *running_var},
           {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance},
           {{"factor", momentum}, {"epsilon", epsilon}});
       runner_update.Run(stream);
@@ -216,10 +217,11 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel<T> {
                         {dx_tensor}, {{"epsilon", epsilon}});
         runner_infer.Run(stream);
       } else {
-        const auto &runner_reduce = NpuOpRunner(
-            "BNTrainingReduceGrad", {dy_tensor, x_tensor, *d_scale, *d_bias,
-                                     *scale, *saved_mean, *saved_inv_variance},
-            {dx_tensor}, {{"epsilon", epsilon}});
+        const auto &runner_reduce =
+            NpuOpRunner("BNTrainingReduceGrad",
+                        {dy_tensor, x_tensor, *d_scale, *d_bias, *scale,
+                         *saved_mean, *saved_inv_variance},
+                        {dx_tensor}, {{"epsilon", epsilon}});
         runner_reduce.Run(stream);
       }
     }
diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc
index 0893324c602a8..3ade2f36ad89f 100644
--- a/paddle/fluid/operators/batch_norm_op_xpu.cc
+++ b/paddle/fluid/operators/batch_norm_op_xpu.cc
@@ -13,10 +13,11 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/batch_norm_op.h"
 #include <iterator>
 #include <vector>
 
+#include "paddle/fluid/operators/batch_norm_op.h"
+
 namespace paddle {
 namespace operators {
 
@@ -128,8 +129,9 @@ static int calculate_inv_BN_Y(xpu::Context *ctx, T *x, const T *scale,
                               const T *bias, const T *mean, const T *variance,
                               const int N, const int C, const int M,
                               const T *y) {
-  PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
-                              "X and Y should be inplaced in inplace mode"));
+  PADDLE_ENFORCE_EQ(x, y,
+                    platform::errors::InvalidArgument(
+                        "X and Y should be inplaced in inplace mode"));
   std::vector<int> tensor_shape_vec({N, C, M});
   std::vector<int> array_shape_vec({1, C, 1});
   // y - bias
@@ -207,8 +209,9 @@ class BatchNormGradXPUKernel : public framework::OpKernel<T> {
       is_inplace = false;
       if (d_x) {
         PADDLE_ENFORCE_NE(
-            d_x, d_y, platform::errors::InvalidArgument(
-                          "X@GRAD and Y@GRAD inplaced in non-inplace mode"));
+            d_x, d_y,
+            platform::errors::InvalidArgument(
+                "X@GRAD and Y@GRAD inplaced in non-inplace mode"));
       }
     }
 
@@ -275,11 +278,12 @@ class BatchNormGradXPUKernel : public framework::OpKernel<T> {
         int r1 =
             calculate_inv_var(dev_ctx.x_context(), global_var->data<float>(),
                               epsilon, C, epsilon_data, global_inv_std_data);
-        PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External(
-                                               "XPU API(batch_norm_grad "
-                                               "calculate_inv_var function) "
-                                               "return wrong value[%d %s]",
-                                               r1, XPUAPIErrorMsg[r1]));
+        PADDLE_ENFORCE_EQ(
+            r1, XPU_SUCCESS,
+            platform::errors::External("XPU API(batch_norm_grad "
+                                       "calculate_inv_var function) "
+                                       "return wrong value[%d %s]",
+                                       r1, XPUAPIErrorMsg[r1]));
       }
       auto px = *x;
       auto *inv_std_data =
@@ -290,11 +294,12 @@ class BatchNormGradXPUKernel : public framework::OpKernel<T> {
           dev_ctx.x_context(), px.mutable_data<T>(ctx.GetPlace()),
           scale->data<float>(), bias->data<float>(), mean_data, inv_std_data, N,
           C, H * W, x->data<T>());
-      PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External(
-                                             "XPU API(batch_norm_grad "
-                                             "calculate_inv_BN_Y function) "
-                                             "return wrong value[%d %s]",
-                                             r2, XPUAPIErrorMsg[r2]));
+      PADDLE_ENFORCE_EQ(
+          r2, XPU_SUCCESS,
+          platform::errors::External("XPU API(batch_norm_grad "
+                                     "calculate_inv_BN_Y function) "
+                                     "return wrong value[%d %s]",
+                                     r2, XPUAPIErrorMsg[r2]));
     }
 
     int r3;
@@ -319,10 +324,11 @@ class BatchNormGradXPUKernel : public framework::OpKernel<T> {
           scale_data, batch_mean->data<float>(), batch_inv_std->data<float>(),
           d_scale_data, d_bias_data, is_nchw);
     }
-    PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External(
-                                           "XPU API(batch_norm_grad) return "
-                                           "wrong value[%d %s]",
-                                           r3, XPUAPIErrorMsg[r3]));
+    PADDLE_ENFORCE_EQ(
+        r3, XPU_SUCCESS,
+        platform::errors::External("XPU API(batch_norm_grad) return "
+                                   "wrong value[%d %s]",
+                                   r3, XPUAPIErrorMsg[r3]));
   }
 };
 
diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
index facb4cd82542b..1cc6e36467767 100644
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 3fae65c50177b..0e3e32666a832 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/beam_search_decode_op.h"
+
 #include <string>
 
 #include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/operators/beam_search_decode_op.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc
index cf32e40742441..6f70136b2d213 100644
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
@@ -103,11 +103,9 @@ TEST(BeamSearchDecodeOp, Backtrace) {
                                 std::vector<int>{1, 1, 3, 5}, &ids, &scores);
   paddle::test::GenerateExample(
       std::vector<size_t>{0, 2, 4},
-      std::vector<size_t>{0, 0, 0, 2,
-                          2},  // the branchs of the first source sentence
-                               // are pruned since finished
-      std::vector<int>{5, 1},
-      &ids, &scores);
+      std::vector<size_t>{0, 0, 0, 2, 2},  // the branches of the first source
+                                           // sentence are pruned since finished
+      std::vector<int>{5, 1}, &ids, &scores);
 
   ASSERT_EQ(ids.size(), 5UL);
   ASSERT_EQ(scores.size(), 5UL);
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index 887d28f5875e3..90b6359f447ef 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc
index 4ef9476eee5d3..15aca070221b0 100644
--- a/paddle/fluid/operators/beam_search_op.cu.cc
+++ b/paddle/fluid/operators/beam_search_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/beam_search_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/beam_search_op_npu.cc b/paddle/fluid/operators/beam_search_op_npu.cc
index cae3d0e55fc5d..f5fa0ac026d57 100644
--- a/paddle/fluid/operators/beam_search_op_npu.cc
+++ b/paddle/fluid/operators/beam_search_op_npu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/beam_search_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/beam_search_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt
index e5023d8eb354a..e05011eaf6b3a 100644
--- a/paddle/fluid/operators/benchmark/CMakeLists.txt
+++ b/paddle/fluid/operators/benchmark/CMakeLists.txt
@@ -1,3 +1,14 @@
-cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
-        DEPS memory timer framework_proto proto_desc lod_tensor op_registry
-        device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} eigen_function)
+cc_test(
+  op_tester
+  SRCS op_tester.cc op_tester_config.cc
+  DEPS memory
+       timer
+       framework_proto
+       proto_desc
+       lod_tensor
+       op_registry
+       device_context
+       scope
+       ${GLOB_OP_LIB}
+       ${GLOB_OPERATOR_DEPS}
+       eigen_function)
diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
index 4b1593b1f8b40..fc01eef8058c3 100644
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/benchmark/op_tester.h"
+
 #include <fstream>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_info.h"
diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h
index 6acd42c8675cb..217fbe2653e3d 100644
--- a/paddle/fluid/operators/benchmark/op_tester.h
+++ b/paddle/fluid/operators/benchmark/op_tester.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/benchmark/op_tester_config.h"
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc
index e9477798858d1..d7a055ede1b73 100644
--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/benchmark/op_tester_config.h"
+
 #include <fstream>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc
index 675566504c211..124441093d3a5 100644
--- a/paddle/fluid/operators/bilateral_slice_op.cc
+++ b/paddle/fluid/operators/bilateral_slice_op.cc
@@ -10,9 +10,11 @@
    limitations under the License. */
 
 #include "paddle/fluid/operators/bilateral_slice_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu
index e7bf6d212dcf1..f20debdf0b815 100644
--- a/paddle/fluid/operators/bilateral_slice_op.cu
+++ b/paddle/fluid/operators/bilateral_slice_op.cu
@@ -11,6 +11,7 @@
 
 #include <algorithm>
 #include <string>
+
 #include "paddle/fluid/operators/bilateral_slice_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
@@ -167,11 +168,11 @@ class BilateralSliceOpCUDAKernel : public framework::OpKernel<T> {
     platform::GpuLaunchConfig config =
         platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), total_count);
 
-    BilateralSliceCudaForwardKernel<
-        T><<<config.block_per_grid, config.thread_per_block, 0,
-             ctx.cuda_device_context().stream()>>>(
-        output_data, grid_data, guide_data, input_data, grid_sizes, has_offset,
-        total_count, output_dims[1]);
+    BilateralSliceCudaForwardKernel<T>
+        <<<config.block_per_grid, config.thread_per_block, 0,
+           ctx.cuda_device_context().stream()>>>(
+            output_data, grid_data, guide_data, input_data, grid_sizes,
+            has_offset, total_count, output_dims[1]);
   }
 };
 
@@ -475,29 +476,29 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
     platform::GpuLaunchConfig config =
         platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
 
-    BilateralSliceCudaGridGradKernel<
-        T><<<config.block_per_grid, config.thread_per_block, 0,
-             ctx.cuda_device_context().stream()>>>(
-        grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
-        has_offset, grid_count, output_chans);
+    BilateralSliceCudaGridGradKernel<T>
+        <<<config.block_per_grid, config.thread_per_block, 0,
+           ctx.cuda_device_context().stream()>>>(
+            grid_grad_data, output_grad_data, guide_data, input_data,
+            grid_sizes, has_offset, grid_count, output_chans);
 
     config =
         platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count);
 
-    BilateralSliceCudaGuideGradKernel<
-        T><<<config.block_per_grid, config.thread_per_block, 0,
-             ctx.cuda_device_context().stream()>>>(
-        guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
-        grid_sizes, has_offset, guide_count, output_chans);
+    BilateralSliceCudaGuideGradKernel<T>
+        <<<config.block_per_grid, config.thread_per_block, 0,
+           ctx.cuda_device_context().stream()>>>(
+            guide_grad_data, output_grad_data, grid_data, guide_data,
+            input_data, grid_sizes, has_offset, guide_count, output_chans);
 
     config =
         platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count);
 
-    BilateralSliceCudaInputGradKernel<
-        T><<<config.block_per_grid, config.thread_per_block, 0,
-             ctx.cuda_device_context().stream()>>>(
-        input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes,
-        has_offset, input_count, output_chans);
+    BilateralSliceCudaInputGradKernel<T>
+        <<<config.block_per_grid, config.thread_per_block, 0,
+           ctx.cuda_device_context().stream()>>>(
+            input_grad_data, output_grad_data, grid_data, guide_data,
+            grid_sizes, has_offset, input_count, output_chans);
   }
 };
 
diff --git a/paddle/fluid/operators/bilateral_slice_op.h b/paddle/fluid/operators/bilateral_slice_op.h
index a388f4763ec68..66783f151ea06 100644
--- a/paddle/fluid/operators/bilateral_slice_op.h
+++ b/paddle/fluid/operators/bilateral_slice_op.h
@@ -12,6 +12,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/hostdevice.h"
 
diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc
index 6b5f4755d771e..16066c1a13e41 100644
--- a/paddle/fluid/operators/bmm_op.cc
+++ b/paddle/fluid/operators/bmm_op.cc
@@ -13,6 +13,7 @@
  *     limitations under the License. */
 
 #include "paddle/fluid/operators/bmm_op.h"
+
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/bmm_op.h b/paddle/fluid/operators/bmm_op.h
index 3fecb55caaeea..271a74a44442c 100644
--- a/paddle/fluid/operators/bmm_op.h
+++ b/paddle/fluid/operators/bmm_op.h
@@ -18,6 +18,7 @@
 #include <algorithm>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/bmm_op_xpu.cc b/paddle/fluid/operators/bmm_op_xpu.cc
index cc18558027982..348f25d46b4c5 100644
--- a/paddle/fluid/operators/bmm_op_xpu.cc
+++ b/paddle/fluid/operators/bmm_op_xpu.cc
@@ -16,8 +16,8 @@
 
 #include <string>
 #include <vector>
-#include "paddle/fluid/operators/matmul_v2_op.h"
 
+#include "paddle/fluid/operators/matmul_v2_op.h"
 #include "paddle/fluid/operators/xpu_api_wrapper.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index bbe4bb08adf27..afa7aee445043 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/bpr_loss_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
index 993bc0fccf07d..fd6df2c159470 100644
--- a/paddle/fluid/operators/bpr_loss_op.h
+++ b/paddle/fluid/operators/bpr_loss_op.h
@@ -61,8 +61,9 @@ class BprLossOpKernel : public framework::OpKernel<T> {
     const int64_t* label_data = labels->data<int64_t>();
     for (int i = 0; i < step_size; ++i) {
       int lbl_pos = label_data[i];
-      PADDLE_ENFORCE_GE(lbl_pos, 0, platform::errors::InvalidArgument(
-                                        "label data %d is illegal.", lbl_pos));
+      PADDLE_ENFORCE_GE(lbl_pos, 0,
+                        platform::errors::InvalidArgument(
+                            "label data %d is illegal.", lbl_pos));
       PADDLE_ENFORCE_LT(lbl_pos, class_num,
                         platform::errors::InvalidArgument(
                             "label data %d is illegal.", lbl_pos));
diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc
index 1063a8b799215..53146417f2149 100644
--- a/paddle/fluid/operators/broadcast_tensors_op.cc
+++ b/paddle/fluid/operators/broadcast_tensors_op.cc
@@ -20,8 +20,8 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-using framework::Tensor;
 using framework::DDim;
+using framework::Tensor;
 
 class BroadcastTensorsOp : public framework::OperatorWithKernel {
  public:
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index 76e0f23df2168..f0146994c1f7e 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cast_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
index 034cb47fab189..2f222d23e7cba 100644
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
-
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/kernels/cast_kernel.h"
 
diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc
index 64324d9772b47..8551d799cc39b 100644
--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -19,9 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/platform/float16.h"
-#include "xpu/refactor/math.h"
-
 #include "paddle/phi/kernels/cast_kernel.h"
+#include "xpu/refactor/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc
index cd1aa9d9c841a..add0bf966d933 100644
--- a/paddle/fluid/operators/center_loss_op.cc
+++ b/paddle/fluid/operators/center_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/center_loss_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu
index 549bb5ae75aff..b46feeae64bd4 100644
--- a/paddle/fluid/operators/center_loss_op.cu
+++ b/paddle/fluid/operators/center_loss_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <iostream>
+
 #include "paddle/fluid/operators/center_loss_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h
index ed266e9ac7dc5..18769fed37ba9 100644
--- a/paddle/fluid/operators/center_loss_op.h
+++ b/paddle/fluid/operators/center_loss_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cstring>
 #include <limits>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc
index dfb0ad96b0be2..83bdaa2de7db1 100644
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/chunk_eval_op.h"
+
 #include <string>
 #include <vector>
 
@@ -55,11 +56,12 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           (inference_dim.size() == 3 && inference_dim[2] == 1) ||
               inference_dim.size() == 2,
-          true, platform::errors::InvalidArgument(
-                    "when Input(SeqLength) is provided, Input(Inference) "
-                    "should be of dim 3 (batch_size, bucket, 1) or dim 2 "
-                    "(batch_size, bucket), but received [%s].",
-                    inference_dim));
+          true,
+          platform::errors::InvalidArgument(
+              "when Input(SeqLength) is provided, Input(Inference) "
+              "should be of dim 3 (batch_size, bucket, 1) or dim 2 "
+              "(batch_size, bucket), but received [%s].",
+              inference_dim));
       auto seq_length_dim = ctx->GetInputDim("SeqLength");
       PADDLE_ENFORCE_LE(seq_length_dim.size(), 2,
                         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt
index 862a0d04fbdfe..f2a4201fd960d 100644
--- a/paddle/fluid/operators/cinn/CMakeLists.txt
+++ b/paddle/fluid/operators/cinn/CMakeLists.txt
@@ -1,19 +1,67 @@
 include(operators)
 
-cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
-cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn)
+cc_library(
+  cinn_op_helper
+  SRCS cinn_op_helper.cc
+  DEPS operator device_context)
+cc_library(
+  cinn_launch_context
+  SRCS cinn_launch_context.cc
+  DEPS ddim
+       lod_tensor
+       scope
+       proto_desc
+       graph
+       build_strategy
+       device_context
+       parallel_executor
+       transform_type
+       cinn)
 
-SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
+set(CINN_OP_DEPS
+    parallel_executor
+    string_helper
+    variable_helper
+    cinn
+    cinn_compiler
+    cinn_op_helper
+    cinn_launch_context
+    transform_type)
 register_operators(DEPS ${CINN_OP_DEPS})
 
-if (WITH_TESTING)
-  cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope proto_desc graph cinn_launch_context cinn_instruction_run_op cinn)
-  set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN")
+if(WITH_TESTING)
+  cc_test(
+    cinn_launch_context_test
+    SRCS cinn_launch_context_test.cc
+    DEPS ddim
+         lod_tensor
+         scope
+         proto_desc
+         graph
+         cinn_launch_context
+         cinn_instruction_run_op
+         cinn)
+  set_tests_properties(cinn_launch_context_test PROPERTIES LABELS
+                                                           "RUN_TYPE=CINN")
 
-  SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda")
-  cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags)
-  set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}")
+  set(CINN_RUN_ENVIRONMENT
+      "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda"
+  )
+  cc_test(
+    cinn_launch_op_test
+    SRCS cinn_launch_op_test.cc
+    DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op
+         elementwise_add_op gflags)
+  set_tests_properties(
+    cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
+                                   "${CINN_RUN_ENVIRONMENT}")
 
-  cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op)
-  set_tests_properties(cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}")
+  cc_test(
+    cinn_instruction_run_op_test
+    SRCS cinn_instruction_run_op_test.cc
+    DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op
+         elementwise_add_op)
+  set_tests_properties(
+    cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT
+                                            "${CINN_RUN_ENVIRONMENT}")
 endif()
diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
index 0903c53e5ecac..be9829dd43b17 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h"
+
 #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
 #include "paddle/fluid/operators/cinn/cinn_launch_context.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -48,12 +49,12 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel {
 
  protected:
   /* [Why use single type kernel]:
-  *
-  * Whether the kernel data type is int, float or other type,
-  * which has no effect on its execution logic, so directly
-  * specified a data type here.
-  *
-  */
+   *
+   * Whether the kernel data type is int, float or other type,
+   * which has no effect on its execution logic, so directly
+   * specified a data type here.
+   *
+   */
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(framework::proto::VarType::FP32,
diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc
index ea72f6c53745a..afa350ef116c4 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h
index 81c2d23d3f149..13483d78f49b6 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h
@@ -18,6 +18,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "cinn/hlir/framework/graph_compiler.h"
 #include "cinn/hlir/framework/instruction.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
index 68bc3a0eb5c53..cbfab3090c0ad 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <stdlib.h>
+
 #include <string>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc
index a660d59fb4c0f..6b70efee86f57 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/cinn/cinn_launch_context.h"
+
 #include <algorithm>
 #include <functional>
 #include <utility>
 #include <vector>
+
 #include "cinn/hlir/framework/graph_compiler.h"
 #include "cinn/hlir/framework/instruction.h"
 #include "cinn/hlir/framework/scope.h"
@@ -43,13 +45,13 @@
 namespace paddle {
 namespace operators::details {
 
-using framework::Scope;
 using framework::LoDTensor;
 using framework::ParallelExecutor;
+using framework::Scope;
 using CinnInstruction = ::cinn::hlir::framework::Instruction;
 using CinnRuntimeProgram = ::cinn::hlir::framework::Program;
-using framework::paddle2cinn::Name2VarInfoMap;
 using framework::paddle2cinn::kMemOptVarInfoFromMainGraph;
+using framework::paddle2cinn::Name2VarInfoMap;
 
 CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
                                      const CinnCompiledObject& compiled_obj)
diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h
index ed5e4383d83d2..0bbbcc8b03177 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.h
@@ -20,6 +20,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc
index ecbfbf2f92ebf..cd4465d355f35 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cinn/cinn_launch_context.h"
+
 #include <memory>
 #include <set>
 #include <utility>
+
 #include "cinn/auto_schedule/auto_tuner.h"
 #include "cinn/common/target.h"
 #include "cinn/common/type.h"
@@ -38,11 +40,11 @@ USE_OP(cinn_instruction_run);
 namespace paddle {
 namespace operators::details {
 
+using framework::LoDTensor;
 using framework::OpDesc;
+using framework::ParallelExecutor;
 using framework::ProgramDesc;
-using framework::LoDTensor;
 using framework::ir::Graph;
-using framework::ParallelExecutor;
 using framework::paddle2cinn::Name2VarInfoMap;
 using CinnShape = ::cinn::hlir::framework::Shape;
 using CinnInstruction = ::cinn::hlir::framework::Instruction;
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc
index 0a9b66bc92c15..3b0198613dbdb 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/cinn/cinn_launch_op.h"
+
 #include <functional>
 #include <vector>
+
 #include "cinn/hlir/framework/graph_compiler.h"
 #include "cinn/runtime/cinn_runtime.h"
 #include "cinn/runtime/flags.h"
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc
index 9dfd53834e937..fb5a48ca3d0b4 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cinn/cinn_launch_op.h"
+
 #include "paddle/fluid/framework/operator.h"
 
 /* see [Why use single type kernel] */
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h
index f40b788dfb5b3..62c79faafec72 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -77,16 +77,16 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
     std::map<std::string, const LoDTensor*> inputs_name2tensor;
     std::vector<std::string> input_x_variable_names;
     std::vector<std::string> input_no_need_buffer_variable_names;
-    auto add_name2tensor_fn = [&inputs_name2tensor](
-        const std::vector<std::string>& variable_names,
-        const std::vector<const LoDTensor*>& tensors) {
-      std::transform(
-          variable_names.begin(), variable_names.end(), tensors.begin(),
-          std::inserter(inputs_name2tensor, inputs_name2tensor.end()),
-          [](const std::string& name, const LoDTensor* tensor) {
-            return std::make_pair(name, tensor);
-          });
-    };
+    auto add_name2tensor_fn =
+        [&inputs_name2tensor](const std::vector<std::string>& variable_names,
+                              const std::vector<const LoDTensor*>& tensors) {
+          std::transform(
+              variable_names.begin(), variable_names.end(), tensors.begin(),
+              std::inserter(inputs_name2tensor, inputs_name2tensor.end()),
+              [](const std::string& name, const LoDTensor* tensor) {
+                return std::make_pair(name, tensor);
+              });
+        };
 
     auto input_x_tensors = ctx.MultiInput<LoDTensor>(kX);
     if (!input_x_tensors.empty()) {
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
index b0bd043f43247..9ed9fad36a3d7 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cinn/cinn_launch_op.h"
+
 #include <stdlib.h>
+
 #include <mutex>
 #include <random>
 #include <string>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc
index 3fb9c822c77c4..26fee2d9e577c 100644
--- a/paddle/fluid/operators/cinn/cinn_op_helper.cc
+++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/device_context.h"
 
diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h
index e542134b94689..55ee3789c0a82 100644
--- a/paddle/fluid/operators/cinn/cinn_op_helper.h
+++ b/paddle/fluid/operators/cinn/cinn_op_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/operator.h"
 
 // We define some common names or utility functions
diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h
index 9720a5309fa6e..4e06882279bee 100644
--- a/paddle/fluid/operators/cinn/test_helper.h
+++ b/paddle/fluid/operators/cinn/test_helper.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <random>
 #include <string>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu
index a23cf2815d8fe..7192b415c27ec 100644
--- a/paddle/fluid/operators/class_center_sample_op.cu
+++ b/paddle/fluid/operators/class_center_sample_op.cu
@@ -15,17 +15,20 @@
 #ifdef PADDLE_WITH_HIP
 #include <hiprand.h>
 #include <hiprand_kernel.h>
+
 #include <hipcub/hipcub.hpp>
 typedef hiprandState curandState;
 namespace cub = hipcub;
 #else
 #include <curand.h>
 #include <curand_kernel.h>
+
 #include <cub/cub.cuh>
 #endif
 
 #include <iterator>
 #include <random>
+
 #include "paddle/fluid/operators/class_center_sample_op.h"
 #include "paddle/phi/api/include/tensor.h"
 
diff --git a/paddle/fluid/operators/class_center_sample_op.h b/paddle/fluid/operators/class_center_sample_op.h
index 24ce9ace3bf11..8f12e90e18539 100644
--- a/paddle/fluid/operators/class_center_sample_op.h
+++ b/paddle/fluid/operators/class_center_sample_op.h
@@ -16,6 +16,7 @@
 #include <map>
 #include <set>
 #include <vector>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h
index 8822fffd326e1..379cd4c665314 100644
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
@@ -114,10 +114,11 @@ class ClipByNormOp : public framework::OperatorWithKernel {
                           "Output(Out) of ClipByNormOp should not be null. "
                           "Please check if it is created correctly."));
     auto max_norm = ctx->Attrs().Get<float>("max_norm");
-    PADDLE_ENFORCE_GT(max_norm, 0, platform::errors::InvalidArgument(
-                                       "max_norm should be greater than 0. "
-                                       "Received max_norm is %f.",
-                                       max_norm));
+    PADDLE_ENFORCE_GT(
+        max_norm, 0,
+        platform::errors::InvalidArgument("max_norm should be greater than 0. "
+                                          "Received max_norm is %f.",
+                                          max_norm));
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
diff --git a/paddle/fluid/operators/clip_by_norm_op_xpu.cc b/paddle/fluid/operators/clip_by_norm_op_xpu.cc
index 7c91f06a8d722..62c2608f11c4c 100644
--- a/paddle/fluid/operators/clip_by_norm_op_xpu.cc
+++ b/paddle/fluid/operators/clip_by_norm_op_xpu.cc
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/clip_by_norm_op.h"
 #include <vector>
 
+#include "paddle/fluid/operators/clip_by_norm_op.h"
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
index 6e898d31663fa..46eb9448d9d6b 100644
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
@@ -179,14 +180,13 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer,
                   ops::ClipDoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::ClipDoubleGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_VERSION(clip)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(clip).AddCheckpoint(
+    R"ROC(
               Upgrade clip add a new input [Min])ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewInput("Min",
-                      "Pass the mix, min value as input, not attribute. Min is "
-                      "dispensable.")
-            .NewInput("Max",
-                      "Pass the mix, min value as input, not attribute. Max is "
-                      "dispensable."));
+    paddle::framework::compatible::OpVersionDesc()
+        .NewInput("Min",
+                  "Pass the mix, min value as input, not attribute. Min is "
+                  "dispensable.")
+        .NewInput("Max",
+                  "Pass the mix, min value as input, not attribute. Max is "
+                  "dispensable."));
diff --git a/paddle/fluid/operators/clip_op_xpu.cc b/paddle/fluid/operators/clip_op_xpu.cc
index c551312837274..a99e5d2506fad 100644
--- a/paddle/fluid/operators/clip_op_xpu.cc
+++ b/paddle/fluid/operators/clip_op_xpu.cc
@@ -61,10 +61,11 @@ class ClipXPUKernel : public framework::OpKernel<T> {
     auto out_data = reinterpret_cast<XPUDataType*>(out->data<T>());
     int r = xpu::clip_v2(dev_ctx.x_context(), x_data, out_data, x->numel(), min,
                          max);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                          "XPU API(clip_v2) return wrong "
-                                          "value[%d %s]",
-                                          r, XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(clip_v2) return wrong "
+                                   "value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
 
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index aa5a38e4dbf08..af15ca2acb7f4 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -14,6 +14,7 @@
 
 #include <sstream>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -265,11 +266,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           ->ShareDataWith(fused_tensor->Slice(
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
-      len = use_align
-                ? platform::Alignment(len * size_of_dtype, context.GetPlace(),
-                                      align_size) /
-                      size_of_dtype
-                : len;
+      len = use_align ? platform::Alignment(len * size_of_dtype,
+                                            context.GetPlace(), align_size) /
+                            size_of_dtype
+                      : len;
       ss << "output(" << out_var_names[i] << ")  dim:(" << dim << ")"
          << " address: " << out_tensors[i]->data() << " len: " << len << ", ";
       offset += len;
@@ -304,12 +304,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           size, 0,
           platform::errors::InvalidArgument(
               "The number of tensor `%s`'s elements is 0.", var_names[i]));
-      auto len =
-          use_align
-              ? platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
-                                    place, align_size) /
-                    size_of_dtype
-              : static_cast<size_t>(size);
+      auto len = use_align ? platform::Alignment(
+                                 static_cast<size_t>(size) * size_of_dtype,
+                                 place, align_size) /
+                                 size_of_dtype
+                           : static_cast<size_t>(size);
       const void *ptr =
           lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
       VLOG(4) << size << " " << len;
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 89c573d2dcb71..c94b0c93eb34a 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -2,72 +2,154 @@ include(operators)
 
 set(COLLECTIVE_DEPS "")
 
-set(COLLECTIVE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+set(COLLECTIVE_COMPILE_FLAGS
+    "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
+)
 
-file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+file(
+  GLOB OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*_op.cc")
 list(REMOVE_DUPLICATES OPS)
 
 foreach(src ${OPS})
-    set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
+  set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS
+                                                ${COLLECTIVE_COMPILE_FLAGS})
 endforeach()
 
-register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS})
+register_operators(
+  EXCLUDES
+  c_gen_bkcl_id_op
+  gen_bkcl_id_op
+  c_gen_nccl_id_op
+  gen_nccl_id_op
+  c_gen_hccl_id_op
+  gen_hccl_id_op
+  c_gen_cncl_id_op
+  DEPS
+  ${COLLECTIVE_DEPS})
 
 if(WITH_NCCL OR WITH_RCCL)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
-    op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
-    op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
+  op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
+  op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
 
 if(WITH_GLOO)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
 endif()
 
 if(WITH_XPU_BKCL)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
-    op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
-    op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+  op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
+  op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
 
 if(WITH_CNCL)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
-    op_library(c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS})
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+  op_library(c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
 
 if(WITH_ASCEND_CL)
-    cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper)
-    op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
-    op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
+  cc_library(
+    gen_hccl_id_op_helper
+    SRCS gen_hccl_id_op_helper.cc
+    DEPS dynload_warpctc dynamic_loader scope)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper
+                      gen_hccl_id_op_helper)
+  op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
+  op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
 
-set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
-set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
+set(OPERATOR_DEPS
+    ${OPERATOR_DEPS} ${COLLECTIVE_DEPS}
+    PARENT_SCOPE)
+set(GLOB_COLLECTIVE_DEPS
+    ${COLLECTIVE_DEPS}
+    CACHE INTERNAL "collective dependency")
 
 if(WITH_ASCEND_CL)
-    set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper
-        gen_hccl_id_op op_registry ascend_hccl flags
-        dynamic_loader dynload_warpctc scope device_context enforce executor)
-    cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc
-        DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc
-        DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc
-        DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc
-        DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
-            DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
-        DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc
-        DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
-        DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(checknumeric SRCS checknumeric_npu_test.cc
-        DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-    cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
-        DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
-    cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
-        DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
+  set(COMMON_TEST_DEPS_FOR_HCOM
+      c_comm_init_hccl_op
+      c_gen_hccl_id_op
+      gen_hccl_id_op_helper
+      gen_hccl_id_op
+      op_registry
+      ascend_hccl
+      flags
+      dynamic_loader
+      dynload_warpctc
+      scope
+      device_context
+      enforce
+      executor)
+  cc_test(
+    c_broadcast_op_npu_test
+    SRCS c_broadcast_op_npu_test.cc
+    DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    c_allreduce_sum_op_npu_test
+    SRCS c_allreduce_sum_op_npu_test.cc
+    DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    c_reducescatter_op_npu_test
+    SRCS c_reducescatter_op_npu_test.cc
+    DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    c_allgather_op_npu_test
+    SRCS c_allgather_op_npu_test.cc
+    DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    c_reduce_sum_op_npu_test
+    SRCS c_reduce_sum_op_npu_test.cc
+    DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    c_allreduce_max_op_npu_test
+    SRCS c_allreduce_max_op_npu_test.cc
+    DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    send_v2_op_npu_test
+    SRCS send_v2_op_npu_test.cc
+    DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    recv_v2_op_npu_test
+    SRCS recv_v2_op_npu_test.cc
+    DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    checknumeric
+    SRCS checknumeric_npu_test.cc
+    DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+  cc_test(
+    c_sync_comm_stream_op_npu_test
+    SRCS c_sync_comm_stream_op_npu_test.cc
+    DEPS op_registry
+         c_broadcast_op
+         c_comm_init_hccl_op
+         c_sync_comm_stream_op
+         c_gen_hccl_id_op
+         gen_hccl_id_op_helper
+         ${COLLECTIVE_DEPS}
+         ascend_hccl
+         dynamic_loader
+         dynload_warpctc
+         scope
+         device_context
+         enforce
+         executor)
+  cc_test(
+    c_sync_calc_stream_op_npu_test
+    SRCS c_sync_calc_stream_op_npu_test.cc
+    DEPS op_registry
+         elementwise_add_op
+         c_sync_calc_stream_op
+         c_gen_hccl_id_op
+         gen_hccl_id_op_helper
+         ${COLLECTIVE_DEPS}
+         ascend_hccl
+         dynamic_loader
+         dynload_warpctc
+         scope
+         device_context
+         enforce
+         executor)
 endif()
diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc
index 63b135a74cf4b..53843104dc5fd 100644
--- a/paddle/fluid/operators/collective/allreduce_op.cc
+++ b/paddle/fluid/operators/collective/allreduce_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/collective/allreduce_op.h"
+
 #include <future>  // NOLINT
 #include <ostream>
 
-#include "paddle/fluid/operators/collective/allreduce_op.h"
-
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc
index 0e0ea72208488..bb498047a50b0 100644
--- a/paddle/fluid/operators/collective/alltoall_op.cu.cc
+++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc
@@ -91,6 +91,9 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(alltoall, ops::AllToAllOpCUDAKernel<float>,
                         ops::AllToAllOpCUDAKernel<double>,
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+                        ops::AllToAllOpCUDAKernel<plat::bfloat16>,
+#endif
                         ops::AllToAllOpCUDAKernel<int>,
                         ops::AllToAllOpCUDAKernel<int64_t>,
                         ops::AllToAllOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h
index 6df4d24c0edf9..88333f36413b8 100644
--- a/paddle/fluid/operators/collective/barrier_op.h
+++ b/paddle/fluid/operators/collective/barrier_op.h
@@ -25,6 +25,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/barrier.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
diff --git a/paddle/fluid/operators/collective/broadcast_op.cc b/paddle/fluid/operators/collective/broadcast_op.cc
index 61e27887b68c7..071b0350de6d2 100644
--- a/paddle/fluid/operators/collective/broadcast_op.cc
+++ b/paddle/fluid/operators/collective/broadcast_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <ostream>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc
index c4e779698ccca..f20ec75a97006 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cc
@@ -26,8 +26,9 @@ class CAllGatherOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllGather");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "AllGather");
     int nranks = ctx->Attrs().Get<int>("nranks");
-    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
-                                     "The value of nranks should be >=2."));
+    PADDLE_ENFORCE_GE(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The value of nranks should be >=2."));
     framework::DDim dim = ctx->GetInputDim("X");
     dim[0] = dim[0] * nranks;
     if (dim[0] < 0) dim[0] = -1;
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
index 0d97ffa96dc5c..62ed916d6e08c 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -90,6 +90,9 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(c_allgather, ops::CAllGatherOpCUDAKernel<float>,
                         ops::CAllGatherOpCUDAKernel<double>,
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+                        ops::CAllGatherOpCUDAKernel<plat::bfloat16>,
+#endif
                         ops::CAllGatherOpCUDAKernel<int>,
                         ops::CAllGatherOpCUDAKernel<int64_t>,
                         ops::CAllGatherOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h
index aa2040a2693b2..7f8c7b2f50e7c 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.h
+++ b/paddle/fluid/operators/collective/c_allgather_op.h
@@ -25,6 +25,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/allgather.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc
index 5339293da0fe2..f9ffdea790807 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-
 #include <memory>
 
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
index 7206dd01bcaa3..087f6b879c328 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -17,23 +17,22 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
index 0946ad8aca65e..5c2d6981bad03 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -17,23 +17,22 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 404f7c017ac41..61cf4cf5b7f5f 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -41,6 +41,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/allreduce.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
@@ -335,10 +336,11 @@ class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
             "Invalid reduce type: %d", red_type));
     }
 
-    PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel,
-                                      dtype, bkcl_red_type, stream),
-                      BKCL_SUCCESS, platform::errors::PreconditionNotMet(
-                                        "BKCL all reduce failed"));
+    PADDLE_ENFORCE_EQ(
+        bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, dtype,
+                        bkcl_red_type, stream),
+        BKCL_SUCCESS,
+        platform::errors::PreconditionNotMet("BKCL all reduce failed"));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should be compiled with XPU."));
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
index 8fe7fce21e465..565633c2e7b2d 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
@@ -19,6 +19,9 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     c_allreduce_sum, ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::bfloat16>,
+#endif
     ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
     ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
     ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.kps
similarity index 58%
rename from paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc
rename to paddle/fluid/operators/collective/c_allreduce_sum_op.kps
index d23572e6d670b..3230d2c9ec331 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.kps
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifdef PADDLE_WITH_XPU_KP
+
+// Please do not modify the following block, which undefines CUDA macros.
+#if defined(__CUDA_ARCH__)
+#undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+#undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+#undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+#undef __NVCC__
+#endif
+
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_XPU_KERNEL(c_allreduce_sum,
-                       ops::CAllReduceOpXPUKernel<ops::kRedSum, float>)
+REGISTER_OP_KERNEL(c_allreduce_sum, KP, plat::XPUPlace,
+                   ops::CAllReduceOpXPUKernel<ops::kRedSum, float>);
+
+#endif
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
index 61e5f27903477..4c76d094bafa5 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -17,20 +17,19 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
index eeae16a0d71f3..478dc85914964 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@@ -98,6 +98,9 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(c_broadcast, ops::CBroadcastOpCUDAKernel<float>,
                         ops::CBroadcastOpCUDAKernel<double>,
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+                        ops::CBroadcastOpCUDAKernel<plat::bfloat16>,
+#endif
                         ops::CBroadcastOpCUDAKernel<int>,
                         ops::CBroadcastOpCUDAKernel<int64_t>,
                         ops::CBroadcastOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h
index eb4acb9a369fc..394ea45efbb7d 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.h
+++ b/paddle/fluid/operators/collective/c_broadcast_op.h
@@ -24,6 +24,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/broadcast.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
index cf4d6a28744b3..e383e78c5dddc 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -17,20 +17,19 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
index 5820bd318d8bc..c9605f4d1b268 100644
--- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
@@ -15,13 +15,17 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/fluid/framework/threadpool.h"
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
 
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {
 class InferShapeContext;
@@ -48,9 +52,9 @@ class CCommInitAllOp : public framework::OperatorBase {
 
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "CCommInitAllOp can run on gpu place only"));
+    // PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true,
+    //                   platform::errors::PreconditionNotMet(
+    //                       "CCommInitAllOp can run on gpu place only"));
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     std::vector<int> devices = Attr<std::vector<int>>("devices");
@@ -61,9 +65,52 @@ class CCommInitAllOp : public framework::OperatorBase {
     int rid = Attr<int>("ring_id");
 
     platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, rid);
+
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    std::vector<int> devices = Attr<std::vector<int>>("devices");
+    int ring_id = Attr<int>("ring_id");
+
+    if (devices.empty()) {
+      int count = platform::GetXPUDeviceCount();
+      for (int i = 0; i < count; ++i) {
+        devices.push_back(i);
+      }
+    }
+
+    if (devices.size() > 1) {
+      std::vector<platform::Place> place_list_;
+      for (size_t i = 0; i < devices.size(); ++i) {
+        auto p = platform::XPUPlace(devices[i]);
+        place_list_.push_back(p);
+      }
+
+      // Create pthreads that run bkcl_init_rank on every device.
+      auto ptr = new platform::BKCLContextMap(place_list_);
+      ptr->init();
+
+      for (size_t i = 0; i < devices.size(); ++i) {
+        platform::BKCLCommContext::Instance().AssignBKCLComm(
+            ptr->contexts_.at(devices[i]).comm_, devices.size(), devices[i],
+            devices[i], ring_id);
+
+        VLOG(0) << "bkcl communicator of rank " << devices[i] << " in ring "
+                << ring_id << " has been created on device " << devices[i];
+
+        // TODO(WorgenZhang): release comm_map_ when the process exits.
+        // std::call_once(once_flag_, []() {
+        //   std::atexit([]() {
+        //   platform::BKCLCommContext::Instance().ReleaseBKCLComms(); });
+        // });
+      }
+
+      VLOG(0) << "done bkcl_init_rank on all devices";
+    } else {
+      VLOG(0)
+          << "bkcl_init_rank doesn't support on one device, skip init process";
+    }
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
+        "PaddlePaddle should compile with GPU or XPU."));
 #endif
   }
 };
diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc
index 86c966378ccb6..3ea24f6e654f0 100644
--- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <rccl.h>
 #endif
 #include <stdint.h>
+
 #include <ostream>
 #include <string>
 
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index 82d3b1b1dbfea..a41d4293c90e4 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -71,8 +71,9 @@ class CCommInitOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(
         platform::is_gpu_place(place) || platform::is_xpu_place(place) ||
             platform::is_mlu_place(place),
-        true, platform::errors::PreconditionNotMet(
-                  "CCommInitOp can run on gpu or xpu or mlu place only."));
+        true,
+        platform::errors::PreconditionNotMet(
+            "CCommInitOp can run on gpu or xpu or mlu place only."));
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
@@ -97,18 +98,9 @@ class CCommInitOp : public framework::OperatorBase {
     if (Attr<int>("device_id") >= 0) {
       device_id = Attr<int>("device_id");
     }
-
-#if defined(PADDLE_WITH_XPU_BKCL) && defined(PADDLE_WITH_HETERPS) && \
-    defined(PADDLE_WITH_PSLIB)
-    // XPUPS rank_id only equals 0, so replace rank_id with device_id
-    CommContext::Instance().CreateComm(comm_id, nranks, device_id, device_id,
-                                       rid);
-#else
     int rank_id = Attr<int>("rank");
     CommContext::Instance().CreateComm(comm_id, nranks, rank_id, device_id,
                                        rid);
-#endif
-
 #endif
   }
 };
diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc
index 551fde2116258..155db23a0391a 100644
--- a/paddle/fluid/operators/collective/c_concat_op.cc
+++ b/paddle/fluid/operators/collective/c_concat_op.cc
@@ -27,17 +27,19 @@ class CConcatOp : public framework::OperatorWithKernel {
     int nranks = ctx->Attrs().Get<int>("nranks");
     int rank = ctx->Attrs().Get<int>("rank");
     int ring_id = ctx->Attrs().Get<int>("ring_id");
-    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
-                                     "The number of ranks (%d) for c_concat "
-                                     "must be greater than 1.",
-                                     nranks));
+    PADDLE_ENFORCE_GE(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The number of ranks (%d) for c_concat "
+                          "must be greater than 1.",
+                          nranks));
     PADDLE_ENFORCE_GE(
         ring_id, 0,
         platform::errors::InvalidArgument(
             "The ring_id (%d) for c_concat must be non-negative.", ring_id));
     PADDLE_ENFORCE_GE(
-        rank, 0, platform::errors::InvalidArgument(
-                     "The rank (%d) for c_concat must be non-negative.", rank));
+        rank, 0,
+        platform::errors::InvalidArgument(
+            "The rank (%d) for c_concat must be non-negative.", rank));
     PADDLE_ENFORCE_LT(rank, nranks,
                       platform::errors::InvalidArgument(
                           "The value of rank (%d) for c_concat must "
diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc
index d3d9db0e5f87e..98df6c8688e74 100644
--- a/paddle/fluid/operators/collective/c_concat_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/collective/c_concat_op.h"
+
 #include <vector>
 
-#include "paddle/fluid/operators/collective/c_concat_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/phi/api/include/tensor.h"
 
diff --git a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc
index ec174ad0e56bc..3bd7e3ceffa2a 100644
--- a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc
@@ -21,9 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc b/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc
index 7e65fba571800..d2e85171a4a40 100644
--- a/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <cncl.h>
+
 #include <string>
 
 #include "paddle/fluid/framework/op_proto_maker.h"
@@ -21,9 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
index 6eec385388090..3f81eab7bc2c4 100644
--- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
@@ -19,12 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
-#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
index d392beb3a4834..d4f1fe1c18297 100644
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@@ -20,9 +20,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/place.h"
-
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h
index 4e9edb53730c2..5399a4aacbe2c 100644
--- a/paddle/fluid/operators/collective/c_reduce_op.h
+++ b/paddle/fluid/operators/collective/c_reduce_op.h
@@ -40,6 +40,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/reduce.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
@@ -261,10 +262,11 @@ class CReduceOpXPUKernel : public framework::OpKernel<T> {
             "Invalid reduce type: %d", red_type));
     }
 
-    PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel,
-                                  dtype, bkcl_red_type, root, stream),
-                      BKCL_SUCCESS, platform::errors::PreconditionNotMet(
-                                        "BKCL all reduce failed"));
+    PADDLE_ENFORCE_EQ(
+        bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, dtype,
+                    bkcl_red_type, root, stream),
+        BKCL_SUCCESS,
+        platform::errors::PreconditionNotMet("BKCL all reduce failed"));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should be compiled with XPU."));
@@ -319,9 +321,10 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> {
         break;
 
       default:
-        PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument(
-                                           "red_type must be one of kRedSum, "
-                                           "kRedMax, kRedMin, kRedProd."));
+        PADDLE_ENFORCE_EQ(true, false,
+                          platform::errors::InvalidArgument(
+                              "red_type must be one of kRedSum, "
+                              "kRedMax, kRedMin, kRedProd."));
     }
 
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
index c4e410d04da5f..3bd55ea370465 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -17,20 +17,19 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
index 9b05e940d4f60..fda192c45e779 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
@@ -76,6 +76,9 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel<float>,
                         ops::CReduceScatterOpCUDAKernel<double>,
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+                        ops::CReduceScatterOpCUDAKernel<plat::bfloat16>,
+#endif
                         ops::CReduceScatterOpCUDAKernel<int>,
                         ops::CReduceScatterOpCUDAKernel<int64_t>,
                         ops::CReduceScatterOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index 8b498787c69db..16437d4769eb0 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -17,23 +17,22 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h
index 71a5f488ebc11..ee07d7663b2ec 100644
--- a/paddle/fluid/operators/collective/c_scatter_op.h
+++ b/paddle/fluid/operators/collective/c_scatter_op.h
@@ -24,6 +24,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_GLOO)
 #include <gloo/scatter.h>
+
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
 
diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
index 4c9fb14842489..71216538a4e12 100644
--- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -373,15 +373,15 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
     const int end_index = start_index + D;
 
     if (label_type == framework::proto::VarType::INT32) {
-      MaskLabelByIndexGrad<T,
-                           int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          logit_grad_2d.data<T>(), loss_grad->data<T>(),
-          labels->data<int32_t>(), start_index, end_index, N, D);
+      MaskLabelByIndexGrad<T, int32_t>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(
+              logit_grad_2d.data<T>(), loss_grad->data<T>(),
+              labels->data<int32_t>(), start_index, end_index, N, D);
     } else if (label_type == framework::proto::VarType::INT64) {
-      MaskLabelByIndexGrad<T,
-                           int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          logit_grad_2d.data<T>(), loss_grad->data<T>(),
-          labels->data<int64_t>(), start_index, end_index, N, D);
+      MaskLabelByIndexGrad<T, int64_t>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(
+              logit_grad_2d.data<T>(), loss_grad->data<T>(),
+              labels->data<int64_t>(), start_index, end_index, N, D);
     }
   }
 };
diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc
index 37ec989f3f981..32f3ff9eab10d 100644
--- a/paddle/fluid/operators/collective/c_split_op.cc
+++ b/paddle/fluid/operators/collective/c_split_op.cc
@@ -27,17 +27,19 @@ class CSplitOp : public framework::OperatorWithKernel {
     int nranks = ctx->Attrs().Get<int>("nranks");
     int rank = ctx->Attrs().Get<int>("rank");
     int ring_id = ctx->Attrs().Get<int>("ring_id");
-    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
-                                     "The number of ranks (%d) for c_split "
-                                     "must be greater than 1.",
-                                     nranks));
+    PADDLE_ENFORCE_GE(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The number of ranks (%d) for c_split "
+                          "must be greater than 1.",
+                          nranks));
     PADDLE_ENFORCE_GE(
         ring_id, 0,
         platform::errors::InvalidArgument(
             "The ring_id (%d) for c_split must be non-negative.", ring_id));
     PADDLE_ENFORCE_GE(
-        rank, 0, platform::errors::InvalidArgument(
-                     "The rank (%d) for c_split must be non-negative.", rank));
+        rank, 0,
+        platform::errors::InvalidArgument(
+            "The rank (%d) for c_split must be non-negative.", rank));
     PADDLE_ENFORCE_LT(rank, nranks,
                       platform::errors::InvalidArgument(
                           "The value of rank (%d) for c_split must "
diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu
index a0c4182468f07..1dce4ce04b56f 100644
--- a/paddle/fluid/operators/collective/c_split_op.cu
+++ b/paddle/fluid/operators/collective/c_split_op.cu
@@ -59,10 +59,11 @@ class CSplitOpCUDAKernel : public framework::OpKernel<T> {
     int rank = ctx.Attr<int>("rank");
     auto place = ctx.GetPlace();
 
-    PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet(
-                                   "The value of rank (%d) for c_split must be "
-                                   "greater than or equal to 0.",
-                                   rank));
+    PADDLE_ENFORCE_GE(rank, 0,
+                      platform::errors::PreconditionNotMet(
+                          "The value of rank (%d) for c_split must be "
+                          "greater than or equal to 0.",
+                          rank));
     PADDLE_ENFORCE_GE(nranks, 2,
                       platform::errors::PreconditionNotMet(
                           "The value of nranks (%d) for c_split must be "
diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
index 6ad22ff8b19eb..bf7434686b97a 100644
--- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
@@ -23,7 +23,6 @@ class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor) Dependency of the variable need to sync");
     AddComment(R"DOC(
 CSyncCalcStream Operator
-
 Call calculation stream synchronization.
 )DOC");
   }
diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.kps b/paddle/fluid/operators/collective/c_sync_calc_stream_op.kps
new file mode 100644
index 0000000000000..65126f416c4aa
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.kps
@@ -0,0 +1,42 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU_KP
+
+// Please do not modify the following block, which undefines CUDA macros.
+#if defined(__CUDA_ARCH__)
+#undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+#undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+#undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+#undef __NVCC__
+#endif
+
+#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_KERNEL(c_sync_calc_stream, KP, plat::XPUPlace,
+                   ops::CSyncCalcStreamKernel<float>);
+
+#endif
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
index 5a9a00aa8e4d2..a3717459a2dac 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
@@ -11,25 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
-#endif
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-#if defined(PADDLE_WITH_CNCL)
-#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
-#endif
-
-#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
+#include "paddle/fluid/operators/collective/c_sync_comm_stream_op.h"
 
 namespace paddle {
 namespace operators {
@@ -58,62 +40,11 @@ class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
     AddComment(R"DOC(
 CSyncCommStream Operator
-
 Call communication stream synchronization.
 )DOC");
   }
 };
 
-template <typename T>
-class CSyncCommStreamKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto place = ctx.GetPlace();
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto stream =
-        platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
-
-    platform::GpuStreamSync(stream);
-
-#elif defined(PADDLE_WITH_ASCEND_CL)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync comm stream op can run on npu place only for "
-                          "now, but we got %s, please check the environment.",
-                          place.DebugString()));
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto stream =
-        platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
-    platform::NPUStreamSync(stream);
-
-#elif defined(PADDLE_WITH_CNCL)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync stream op can run on mlu place only for now."));
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto stream =
-        platform::CNCLCommContext::Instance().Get(ring_id, place)->stream();
-    platform::MLUStreamSync(stream);
-#elif defined(PADDLE_WITH_XPU_BKCL)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync stream op can run on xpu place only for now."));
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto comm_dev_ctx = platform::BKCLCommContext::Instance()
-                            .Get(ring_id, place)
-                            ->dev_context();
-    comm_dev_ctx->Wait();
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
-#endif
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -127,5 +58,3 @@ REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
 REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
 
 REGISTER_OP_MLU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
-
-REGISTER_OP_XPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel<float>);
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h
new file mode 100644
index 0000000000000..f9dec9303742c
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Guard against multiple inclusion of this shared kernel header.
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#endif
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#endif
+
+#if defined(PADDLE_WITH_CNCL)
+#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
+#endif
+
+#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+// Kernel of the `c_sync_comm_stream` op.
+//
+// Looks up the communication context registered for the "ring_id" attribute
+// on the current place and synchronizes on it: the NCCL/RCCL, HCCL and CNCL
+// branches sync the context's stream, while the BKCL branch waits on the
+// context's device context. Exactly one branch is compiled in, chosen by the
+// first matching PADDLE_WITH_* macro; if none is defined, Compute() throws
+// PreconditionNotMet. The ASCEND_CL, CNCL and XPU_BKCL branches additionally
+// require the place to be an NPU, MLU or XPU place respectively.
+template <typename T>
+class CSyncCommStreamKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    auto place = ctx.GetPlace();
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto stream =
+        platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
+
+    platform::GpuStreamSync(stream);
+
+#elif defined(PADDLE_WITH_ASCEND_CL)
+    auto place = ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "Sync comm stream op can run on npu place only for "
+                          "now, but we got %s, please check the environment.",
+                          place.DebugString()));
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto stream =
+        platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
+    platform::NPUStreamSync(stream);
+
+#elif defined(PADDLE_WITH_CNCL)
+    auto place = ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "Sync stream op can run on mlu place only for now."));
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto stream =
+        platform::CNCLCommContext::Instance().Get(ring_id, place)->stream();
+    platform::MLUStreamSync(stream);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    auto place = ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "Sync stream op can run on xpu place only for now."));
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto comm_dev_ctx = platform::BKCLCommContext::Instance()
+                            .Get(ring_id, place)
+                            ->dev_context();
+    comm_dev_ctx->Wait();
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.kps b/paddle/fluid/operators/collective/c_sync_comm_stream_op.kps
new file mode 100644
index 0000000000000..bfac7bf5c5b92
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.kps
@@ -0,0 +1,42 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU_KP  // everything below is compiled only for XPU KP
+
+// Please do not modify the following code (it undefines CUDA compiler macros).
+#if defined(__CUDA_ARCH__)
+#undef __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC__)
+#undef __CUDACC__
+#endif
+
+#if defined(__CUDA__)
+#undef __CUDA__
+#endif
+
+#if defined(__NVCC__)
+#undef __NVCC__
+#endif
+
+#include "paddle/fluid/operators/collective/c_sync_comm_stream_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_KERNEL(c_sync_comm_stream, KP, plat::XPUPlace,
+                   ops::CSyncCommStreamKernel<float>);
+
+#endif  // PADDLE_WITH_XPU_KP
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
index 133085ad3f3b0..91b89486c6a4b 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -26,11 +26,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
index 36c6f4fadd0fc..b99ac3816352c 100644
--- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -17,21 +17,20 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <cmath>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc
index 1ce8938356895..f60030cec7628 100644
--- a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc
+++ b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc
@@ -24,11 +24,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gen_comm_id_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
 
-#include "paddle/fluid/platform/gen_comm_id_helper.h"
-
 namespace paddle {
 namespace operators {
 
@@ -69,9 +68,10 @@ class GenBKCLIdOp : public framework::OperatorBase {
     int trainer_id = Attr<int>("trainer_id");
     std::string endpoint = trainers[trainer_id];
 
-    PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument(
-                                         "trainer_id %d is less than 0. Its "
-                                         "valid range is [0, trainer_size)"));
+    PADDLE_ENFORCE_GE(
+        trainer_id, 0,
+        platform::errors::InvalidArgument("trainer_id %d is less than 0. Its "
+                                          "valid range is [0, trainer_size)"));
     PADDLE_ENFORCE_LT(
         trainer_id, static_cast<int>(trainers.size()),
         platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc
index 3d78082f12fc9..e0809459be109 100644
--- a/paddle/fluid/operators/collective/gen_hccl_id_op.cc
+++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc
@@ -21,14 +21,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
 
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-
 namespace paddle {
 namespace operators {
 
@@ -48,9 +47,10 @@ class GenHCCLIdOp : public framework::OperatorBase {
     int trainer_id = Attr<int>("trainer_id");
     std::string endpoint = trainers[trainer_id];
 
-    PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument(
-                                         "trainer_id %d is less than 0. Its "
-                                         "valid range is [0, trainer_size)"));
+    PADDLE_ENFORCE_GE(
+        trainer_id, 0,
+        platform::errors::InvalidArgument("trainer_id %d is less than 0. Its "
+                                          "valid range is [0, trainer_size)"));
     PADDLE_ENFORCE_LT(
         trainer_id, static_cast<int>(trainers.size()),
         platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
index ad50ac367508b..ba573509bd18a 100644
--- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
+++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+
 #include <arpa/inet.h>
 #include <netdb.h>
 #include <netinet/in.h>
diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc
index 7a5b6b5f429b2..1e23f38c13ad0 100644
--- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc
@@ -70,9 +70,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
     int trainer_id = Attr<int>("trainer_id");
     std::string endpoint = trainers[trainer_id];
 
-    PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument(
-                                         "trainer_id %d is less than 0. Its "
-                                         "valid range is [0, trainer_size)"));
+    PADDLE_ENFORCE_GE(
+        trainer_id, 0,
+        platform::errors::InvalidArgument("trainer_id %d is less than 0. Its "
+                                          "valid range is [0, trainer_size)"));
     PADDLE_ENFORCE_LT(
         trainer_id, static_cast<int>(trainers.size()),
         platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cc b/paddle/fluid/operators/collective/partial_allgather_op.cc
index bef2ff94d6308..6783d2f0b4593 100644
--- a/paddle/fluid/operators/collective/partial_allgather_op.cc
+++ b/paddle/fluid/operators/collective/partial_allgather_op.cc
@@ -26,8 +26,9 @@ class PartialAllGatherOp : public framework::OperatorWithKernel {
     int nranks = ctx->Attrs().Get<int>("nranks");
     int rank = ctx->Attrs().Get<int>("rank");
 
-    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
-                                     "The value of nranks should be >=2."));
+    PADDLE_ENFORCE_GE(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The value of nranks should be >=2."));
     PADDLE_ENFORCE_EQ(
         (rank >= 0 && rank < nranks), true,
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc
index 0314bb7d5de1d..c727161d10179 100644
--- a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc
+++ b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/collective/partial_allgather_op.h"
 #include <memory>
 
+#include "paddle/fluid/operators/collective/partial_allgather_op.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 
diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc
index 99b2169180c77..df59f49cb3a60 100644
--- a/paddle/fluid/operators/collective/partial_recv_op.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/collective/partial_recv_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/collective/partial_recv_op_npu.cc b/paddle/fluid/operators/collective/partial_recv_op_npu.cc
index f14ce5f81f905..4704ab7683cf3 100644
--- a/paddle/fluid/operators/collective/partial_recv_op_npu.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op_npu.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/collective/partial_recv_op.h"
-
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 
@@ -55,8 +54,9 @@ class PartialRecvOpASCENDKernel : public framework::OpKernel<T> {
     int nranks = comm->nranks();
     int peer = ctx.Attr<int>("peer");
 
-    PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument(
-                                     "The nranks must be 2, but (%d)", nranks));
+    PADDLE_ENFORCE_EQ(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The nranks must be 2, but (%d)", nranks));
 
     int root = peer;
 
diff --git a/paddle/fluid/operators/collective/partial_send_op_npu.cc b/paddle/fluid/operators/collective/partial_send_op_npu.cc
index 31c74fcc196be..8f53bd8fc5f6a 100644
--- a/paddle/fluid/operators/collective/partial_send_op_npu.cc
+++ b/paddle/fluid/operators/collective/partial_send_op_npu.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/collective/send_v2_op.h"
-
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 
@@ -52,8 +51,9 @@ class PartialSendOpASCENDKernel : public framework::OpKernel<T> {
     int nranks = comm->nranks();
     int rank = comm->rank();
 
-    PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument(
-                                     "The nranks must be 2, but (%d)", nranks));
+    PADDLE_ENFORCE_EQ(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The nranks must be 2, but (%d)", nranks));
 
     int root = rank;
 
diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc
index 494665544f0d3..15da47e713bb9 100644
--- a/paddle/fluid/operators/collective/recv_v2_op.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/collective/recv_v2_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc
index f7a2e198db938..67c30438869b1 100644
--- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc
@@ -224,6 +224,9 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel<float>,
                         ops::RecvOpV2CUDAKernel<double>,
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+                        ops::RecvOpV2CUDAKernel<plat::bfloat16>,
+#endif
                         ops::RecvOpV2CUDAKernel<int>,
                         ops::RecvOpV2CUDAKernel<int64_t>,
                         ops::RecvOpV2CUDAKernel<int8_t>,
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
index c31f1210f0422..9aa1ab788693d 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -61,8 +61,9 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
     int nranks = comm->nranks();
     int peer = ctx.Attr<int>("peer");
 
-    PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument(
-                                     "The nranks must be 2, but (%d)", nranks));
+    PADDLE_ENFORCE_EQ(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The nranks must be 2, but (%d)", nranks));
 
     int root = peer;
 
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index 6e02d36215697..0022b6bf39ddf 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -17,20 +17,19 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/operators/collective/recv_v2_op.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc
index 8878b7c3449b9..cfb3a11513a21 100644
--- a/paddle/fluid/operators/collective/send_v2_op.cu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc
@@ -197,6 +197,9 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(send_v2, ops::SendOpV2CUDAKernel<float>,
                         ops::SendOpV2CUDAKernel<double>,
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+                        ops::SendOpV2CUDAKernel<plat::bfloat16>,
+#endif
                         ops::SendOpV2CUDAKernel<int>,
                         ops::SendOpV2CUDAKernel<int64_t>,
                         ops::SendOpV2CUDAKernel<int8_t>,
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
index 882630467a012..ee34026cb28b2 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -60,8 +60,9 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
     int nranks = comm->nranks();
     int rank = comm->rank();
 
-    PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument(
-                                     "The nranks must be 2, but (%d)", nranks));
+    PADDLE_ENFORCE_EQ(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The nranks must be 2, but (%d)", nranks));
 
     int root = rank;
 
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
index 57e3dd53cc774..9784e6ddc1537 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -17,19 +17,19 @@ limitations under the License. */
 #endif
 
 #include <stdio.h>
+
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
-#include "gtest/gtest.h"
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/operators/collective/send_v2_op.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc
index 1d187451c6858..8bd60c77c46cf 100644
--- a/paddle/fluid/operators/common_infer_shape_functions.cc
+++ b/paddle/fluid/operators/common_infer_shape_functions.cc
@@ -61,12 +61,13 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims,
     PADDLE_ENFORCE_EQ(
         x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 ||
             y_dims_array[i] <= 1,
-        true, platform::errors::InvalidArgument(
-                  "Broadcast dimension mismatch. Operands could "
-                  "not be broadcast together with the shape of X = [%s] and "
-                  "the shape of Y = [%s]. Received [%d] in X is not equal to "
-                  "[%d] in Y at i:%d.",
-                  x_dims, y_dims, x_dims_array[i], y_dims_array[i], i));
+        true,
+        platform::errors::InvalidArgument(
+            "Broadcast dimension mismatch. Operands could "
+            "not be broadcast together with the shape of X = [%s] and "
+            "the shape of Y = [%s]. Received [%d] in X is not equal to "
+            "[%d] in Y at i:%d.",
+            x_dims, y_dims, x_dims_array[i], y_dims_array[i], i));
     if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) ||
         (x_dims_array[i] == 1 && y_dims_array[i] == 1)) {
       out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
diff --git a/paddle/fluid/operators/complex_op.cc b/paddle/fluid/operators/complex_op.cc
index 7241c92258eea..d358f5765f9e8 100644
--- a/paddle/fluid/operators/complex_op.cc
+++ b/paddle/fluid/operators/complex_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/complex_op.h"
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc
index 763f936ec9c48..92b48fe8b06c7 100644
--- a/paddle/fluid/operators/complex_view_op.cc
+++ b/paddle/fluid/operators/complex_view_op.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/operators/complex_view_op.cu b/paddle/fluid/operators/complex_view_op.cu
index 261881cb8d256..b62c0470dd6ba 100644
--- a/paddle/fluid/operators/complex_view_op.cu
+++ b/paddle/fluid/operators/complex_view_op.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/complex_view_op.h"
-
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/complex_view_op.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index a467f2dbee7c9..599fbcce39ff3 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -15,11 +15,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 
 #include <paddle/fluid/platform/complex.h>
+
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/infershape_utils.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index 50aca54c12dec..746e0e7a056fe 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
-
 #include "paddle/phi/kernels/concat_kernel.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 
diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc
index 63f4ec46599ba..3d927af96e1b7 100644
--- a/paddle/fluid/operators/concat_op_mlu.cc
+++ b/paddle/fluid/operators/concat_op_mlu.cc
@@ -74,6 +74,74 @@ class ConcatMLUKernel : public framework::OpKernel<T> {
                     output_desc.get(), GetBasePtr(out));
   }
 };
+
+// Gradient kernel of `concat` for MLU: splits the gradient of Out along the
+// concat axis into the gradients of the X inputs via MLUCnnl::Split.
+template <typename T>
+class ConcatGradMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
+    auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
+    auto outs =
+        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
+    auto axis = ctx.Attr<int>("axis");
+    int split_num = ins.size();
+
+    PADDLE_ENFORCE_NOT_NULL(ins[0],
+                            platform::errors::NotFound(
+                                "The first input tensor is not initialized."));
+
+    // An "AxisTensor" input, when present, overrides the "axis" attribute.
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
+      axis = GetDataFromTensor<int>(axis_tensor)[0];
+    }
+
+    // Resolve the axis against the rank of the first input (presumably maps a
+    // negative axis to its non-negative equivalent -- confirm in ComputeAxis).
+    axis = ComputeAxis(static_cast<int64_t>(axis),
+                       static_cast<int64_t>(ins[0]->dims().size()));
+    PADDLE_ENFORCE_GE(axis, 0,
+                      platform::errors::InvalidArgument(
+                          "concat_grad: axis should be larger than or "
+                          "equal to 0, but received axis is %d.",
+                          axis));
+    PADDLE_ENFORCE_LT(
+        axis, out_grad->dims().size(),
+        platform::errors::InvalidArgument(
+            "concat_grad: axis should be less than ins[0]->dims()! "
+            "But received axis is %d, while ins[0]->dims() "
+            "size is %d.",
+            axis, out_grad->dims().size()));
+    // Collect a descriptor and data pointer for every output gradient that
+    // is requested (name is not kEmptyVarName) and has a non-zero numel.
+    // NOTE(review): a skipped output appends nullptr to outputs_vec but nothing
+    // to descs_vec, so descs_vec may hold fewer than split_num entries while
+    // Split is still passed split_num -- confirm this case cannot occur.
+    std::vector<void*> outputs_vec;
+    std::vector<MLUCnnlTensorDesc> output_descs;
+    std::vector<cnnlTensorDescriptor_t> descs_vec;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      if (out_var_names[j] != framework::kEmptyVarName &&
+          outs[j]->numel() != 0UL) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j]));
+        descs_vec.push_back(output_descs.back().get());
+        outputs_vec.push_back(GetBasePtr(outs[j]));
+      } else {
+        outputs_vec.push_back(nullptr);
+      }
+    }
+
+    MLUCnnlTensorDesc out_grad_desc(*out_grad);
+    MLUCnnl::Split(ctx, static_cast<int>(split_num), static_cast<int>(axis),
+                   out_grad_desc.get(), GetBasePtr(out_grad), descs_vec.data(),
+                   outputs_vec.data());
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
@@ -84,3 +152,9 @@ REGISTER_OP_MLU_KERNEL(concat, ops::ConcatMLUKernel<float>,
                        ops::ConcatMLUKernel<int64_t>,
                        ops::ConcatMLUKernel<bool>, ops::ConcatMLUKernel<int>,
                        ops::ConcatMLUKernel<uint8_t>);
+REGISTER_OP_MLU_KERNEL(concat_grad, ops::ConcatGradMLUKernel<float>,
+                       ops::ConcatGradMLUKernel<paddle::platform::float16>,
+                       ops::ConcatGradMLUKernel<int64_t>,
+                       ops::ConcatGradMLUKernel<bool>,
+                       ops::ConcatGradMLUKernel<int>,
+                       ops::ConcatGradMLUKernel<uint8_t>);
diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc
index ba35098bbac10..fcbfc6f7a2b3c 100644
--- a/paddle/fluid/operators/concat_op_xpu.cc
+++ b/paddle/fluid/operators/concat_op_xpu.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/concat_op.h"
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
+#include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/platform/device/xpu/xpu_header.h"
 #include "paddle/phi/core/lod_utils.h"
 
 namespace paddle {
@@ -33,17 +33,19 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<framework::LoDTensor>("X");
     framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
     int axis = ctx.Attr<int>("axis");
-    PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument(
-                                           "The input should not be null."));
+    PADDLE_ENFORCE_NE(
+        ins[0], nullptr,
+        platform::errors::InvalidArgument("The input should not be null."));
     PADDLE_ENFORCE_NE(ctx.HasInput("AxisTensor"), true,
                       platform::errors::InvalidArgument(
                           "XPU donot surpport AxisTensor for now"));
     axis = ComputeAxis(static_cast<int64_t>(axis),
                        static_cast<int64_t>(ins[0]->dims().size()));
-    PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument(
-                                   "concat: axis should be larger than or "
-                                   "equal to 0, but received axis is %d.",
-                                   axis));
+    PADDLE_ENFORCE_GE(axis, 0,
+                      platform::errors::InvalidArgument(
+                          "concat: axis should be larger than or "
+                          "equal to 0, but received axis is %d.",
+                          axis));
     PADDLE_ENFORCE_LT(axis, ins[0]->dims().size(),
                       platform::errors::InvalidArgument(
                           "concat: axis should be less than ins[0]->dims()!"
@@ -94,8 +96,9 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
       }
     }
 
-    PADDLE_ENFORCE_GT(xdims_list.size(), 0, platform::errors::InvalidArgument(
-                                                "No tensor need concat"));
+    PADDLE_ENFORCE_GT(
+        xdims_list.size(), 0,
+        platform::errors::InvalidArgument("No tensor need concat"));
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
     int r = xpu::concat<XPUType>(dev_ctx.x_context(), ptrs,
@@ -129,8 +132,9 @@ class ConcatGradXPUKernel : public framework::OpKernel<T> {
         }
       }
     }
-    PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument(
-                                           "The input should not be null."));
+    PADDLE_ENFORCE_NE(
+        ins[0], nullptr,
+        platform::errors::InvalidArgument("The input should not be null."));
     auto axis = ctx.Attr<int>("axis");
     if (ctx.HasInput("AxisTensor")) {
       auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
@@ -149,10 +153,11 @@ class ConcatGradXPUKernel : public framework::OpKernel<T> {
         ptrs[j] = nullptr;
       }
     }
-    PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument(
-                                   "concat_grad: axis should be larger than or "
-                                   "equal to 0, but received axis is %d.",
-                                   axis));
+    PADDLE_ENFORCE_GE(axis, 0,
+                      platform::errors::InvalidArgument(
+                          "concat_grad: axis should be larger than or "
+                          "equal to 0, but received axis is %d.",
+                          axis));
     PADDLE_ENFORCE_LT(
         axis, out_grad->dims().size(),
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc
index cbec1182f20b8..0c294b60482e4 100644
--- a/paddle/fluid/operators/conj_op.cc
+++ b/paddle/fluid/operators/conj_op.cc
@@ -74,8 +74,9 @@ REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker,
                   ConjInferShapeFunctor);
 
 REGISTER_OP_CPU_KERNEL(
-    conj, ops::ConjKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::complex<float>>,
+    conj,
+    ops::ConjKernel<paddle::platform::CPUDeviceContext,
+                    paddle::platform::complex<float>>,
     ops::ConjKernel<paddle::platform::CPUDeviceContext,
                     paddle::platform::complex<double>>,
     ops::ConjKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/conj_op.cu b/paddle/fluid/operators/conj_op.cu
index d04024d70a8ea..548508636ca26 100644
--- a/paddle/fluid/operators/conj_op.cu
+++ b/paddle/fluid/operators/conj_op.cu
@@ -17,8 +17,9 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    conj, ops::ConjKernel<paddle::platform::CUDADeviceContext,
-                          paddle::platform::complex<float>>,
+    conj,
+    ops::ConjKernel<paddle::platform::CUDADeviceContext,
+                    paddle::platform::complex<float>>,
     ops::ConjKernel<paddle::platform::CUDADeviceContext,
                     paddle::platform::complex<double>>,
     ops::ConjKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index 0c18522fa32ea..193c5c4505641 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,24 +1,51 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/controlflow.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/controlflow.
+  include(unity_build_rule.cmake)
 endif()
 register_operators(EXCLUDES conditional_block_op DEPS naive_executor)
 
-cc_library(conditional_block_op SRCS conditional_block_op.cc DEPS executor)
-cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc)
-cc_library(conditional_block_op_helper SRCS conditional_block_op_helper.cc DEPS operator op_variant conditional_block_op)
-cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op)
-cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) 
+cc_library(
+  conditional_block_op
+  SRCS conditional_block_op.cc
+  DEPS executor)
+cc_library(
+  op_variant
+  SRCS op_variant.cc
+  DEPS operator proto_desc)
+cc_library(
+  conditional_block_op_helper
+  SRCS conditional_block_op_helper.cc
+  DEPS operator op_variant conditional_block_op)
+cc_library(
+  recurrent_op_helper
+  SRCS recurrent_op_helper.cc
+  DEPS operator op_variant recurrent_op)
+cc_library(
+  while_op_helper
+  SRCS while_op_helper.cc
+  DEPS operator op_variant)
 
-cc_test(conditional_block_op_test SRCS conditional_block_op_test.cc DEPS conditional_block_op executor)
+cc_test(
+  conditional_block_op_test
+  SRCS conditional_block_op_test.cc
+  DEPS conditional_block_op executor)
 
 if(WITH_UNITY_BUILD)
-    target_link_libraries(paddle_operators_controlflow_unity conditional_block_op)
+  target_link_libraries(paddle_operators_controlflow_unity conditional_block_op)
 else()
-    target_link_libraries(conditional_block_infer_op conditional_block_op)
+  target_link_libraries(conditional_block_infer_op conditional_block_op)
 endif()
 
-file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n")
-file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n")
-file(APPEND ${pybind_file} "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n")
+file(
+  APPEND ${pybind_file}
+  "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n"
+)
+file(
+  APPEND ${pybind_file}
+  "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n"
+)
+file(
+  APPEND ${pybind_file}
+  "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n"
+)
diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc
index 4dcbbc8568ff1..19865f9a9fb71 100644
--- a/paddle/fluid/operators/controlflow/bitwise_op.cc
+++ b/paddle/fluid/operators/controlflow/bitwise_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 72d81d8c3fdf2..21fc69eb019d3 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -80,14 +80,12 @@ class CompareOp : public framework::OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle
 
-#define REGISTER_COMPARE_OP_VERSION(op_type)                               \
-  REGISTER_OP_VERSION(op_type)                                             \
-      .AddCheckpoint(                                                      \
-          R"ROC(Upgrade compare ops, add a new attribute [force_cpu])ROC", \
-          paddle::framework::compatible::OpVersionDesc().ModifyAttr(       \
-              "force_cpu",                                                 \
-              "In order to force fill output variable to gpu memory.",     \
-              false));
+#define REGISTER_COMPARE_OP_VERSION(op_type)                           \
+  REGISTER_OP_VERSION(op_type).AddCheckpoint(                          \
+      R"ROC(Upgrade compare ops, add a new attribute [force_cpu])ROC", \
+      paddle::framework::compatible::OpVersionDesc().ModifyAttr(       \
+          "force_cpu",                                                 \
+          "In order to force fill output variable to gpu memory.", false));
 
 #define REGISTER_COMPARE_OP(op_type, _equation)                          \
   struct _##op_type##Comment {                                           \
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h
index c024e4a12cd47..c1d13ffdf1295 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.h
@@ -68,10 +68,11 @@ class ConditionalOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(framework::TransToProtoVarType(ips[0]->dtype()) ==
                               framework::proto::VarType::BOOL &&
                           ips[0]->numel() == 1,
-                      true, platform::errors::InvalidArgument(
-                                "condition input's data type should be bool, "
-                                "numel should be 1, actual numel is %d",
-                                ips[0]->numel()));
+                      true,
+                      platform::errors::InvalidArgument(
+                          "condition input's data type should be bool, "
+                          "numel should be 1, actual numel is %d",
+                          ips[0]->numel()));
     bool res = false;
     if (platform::is_gpu_place(ips[0]->place())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc
index 111ca9c63c634..369a1ffedc419 100644
--- a/paddle/fluid/operators/controlflow/fetch_op.cc
+++ b/paddle/fluid/operators/controlflow/fetch_op.cc
@@ -35,10 +35,11 @@ static void DataCopy(const framework::LoDTensor &src_item,
       // as params are not a subject to paddle's data_format
       VLOG(4) << "innerTransDataLayoutFromMKLDNN";
       framework::innerTransDataLayoutFromMKLDNN(
-          src_item.layout(), fetch_var_name == framework::GradVarName("Filter")
-                                 ? framework::DataLayout::kNCHW
-                                 : paddle::platform::MKLDNNDeviceContext::tls()
-                                       .get_cur_paddle_data_layout(),
+          src_item.layout(),
+          fetch_var_name == framework::GradVarName("Filter")
+              ? framework::DataLayout::kNCHW
+              : paddle::platform::MKLDNNDeviceContext::tls()
+                    .get_cur_paddle_data_layout(),
           src_item, &out, platform::CPUPlace());
       paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item);
     } else {
@@ -92,11 +93,12 @@ class FetchOp : public framework::OperatorBase {
 
     int col = Attr<int>("col");
     PADDLE_ENFORCE_GE(
-        col, 0, platform::errors::InvalidArgument(
-                    "Expected the column index (the attribute 'col' of "
-                    "operator 'Fetch') of current fetching variable to be "
-                    "no less than 0. But received column index = %d.",
-                    col));
+        col, 0,
+        platform::errors::InvalidArgument(
+            "Expected the column index (the attribute 'col' of "
+            "operator 'Fetch') of current fetching variable to be "
+            "no less than 0. But received column index = %d.",
+            col));
 
     VLOG(3) << "Fetch variable " << fetch_var_name << " to variable "
             << out_name << "'s " << col << " column.";
diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
index caa67139a9b95..29d6eb1b2d44c 100644
--- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc
+++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
@@ -42,10 +42,11 @@ static void DeepCopy(const framework::LoDTensor &src_item,
       // Convert to desired Paddle layout, apart from grads of filter
       // as params are not a subject to paddle's data_format
       framework::innerTransDataLayoutFromMKLDNN(
-          src_item.layout(), fetch_var_name == framework::GradVarName("Filter")
-                                 ? framework::DataLayout::kNCHW
-                                 : paddle::platform::MKLDNNDeviceContext::tls()
-                                       .get_cur_paddle_data_layout(),
+          src_item.layout(),
+          fetch_var_name == framework::GradVarName("Filter")
+              ? framework::DataLayout::kNCHW
+              : paddle::platform::MKLDNNDeviceContext::tls()
+                    .get_cur_paddle_data_layout(),
           src_item, &out, platform::CPUPlace());
       paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item);
     } else {
@@ -123,11 +124,12 @@ class FetchV2Kernel {
 
     int col = ctx.Attr<int>("col");
     PADDLE_ENFORCE_GE(
-        col, 0, platform::errors::InvalidArgument(
-                    "Expected the column index (the attribute 'col' of "
-                    "operator 'Fetch') of current fetching variable to be "
-                    "no less than 0. But received column index = %d.",
-                    col));
+        col, 0,
+        platform::errors::InvalidArgument(
+            "Expected the column index (the attribute 'col' of "
+            "operator 'Fetch') of current fetching variable to be "
+            "no less than 0. But received column index = %d.",
+            col));
 
     auto *fetch_list = out_var->GetMutable<framework::FetchList>();
 
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
index 55bd4879ab794..7f3b004004136 100644
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -62,9 +62,10 @@ class GetPlacesOp : public framework::OperatorBase {
       device_count =
           is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
     }
-    PADDLE_ENFORCE_NE(device_count, 0UL, platform::errors::InvalidArgument(
-                                             "Cannot indicate %s device count",
-                                             is_gpu ? "GPU" : "CPU"));
+    PADDLE_ENFORCE_NE(
+        device_count, 0UL,
+        platform::errors::InvalidArgument("Cannot indicate %s device count",
+                                          is_gpu ? "GPU" : "CPU"));
 
     auto out_var_name = Output("Out");
     auto &places = *(GET_DATA_SAFELY(scope.FindVar(out_var_name), "Output",
diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc
index 4d11cb5ff74e6..a9c28f48ef739 100644
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h
index cc1f36a875f77..57d44b6793966 100644
--- a/paddle/fluid/operators/controlflow/op_variant.h
+++ b/paddle/fluid/operators/controlflow/op_variant.h
@@ -50,8 +50,9 @@ class OpVariant {
   const AttrType &Attr(const std::string &name) const {
     auto &attrs = Attrs();
     auto it = attrs.find(name);
-    PADDLE_ENFORCE_NE(it, attrs.end(), platform::errors::NotFound(
-                                           "Cannot find attribute %s.", name));
+    PADDLE_ENFORCE_NE(
+        it, attrs.end(),
+        platform::errors::NotFound("Cannot find attribute %s.", name));
     return BOOST_GET_CONST(AttrType, it->second);
   }
 
diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
index 43913cae6b3c2..62cd2fc3376d5 100644
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/controlflow/unity_build_rule.cmake b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
index 690a332d20b4c..594ae3a36cf1d 100644
--- a/paddle/fluid/operators/controlflow/unity_build_rule.cmake
+++ b/paddle/fluid/operators/controlflow/unity_build_rule.cmake
@@ -4,20 +4,18 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    compare_all_op.cc
-    compare_op.cc
-    conditional_block_infer_op.cc
-    feed_op.cc
-    fetch_op.cc
-    fetch_v2_op.cc
-    get_places_op.cc
-    logical_op.cc
-    bitwise_op.cc
-    tensor_array_read_write_op.cc
-    while_op.cc)
-register_unity_group(cu
-    logical_op.cu
-    bitwise_op.cu
-    compare_op.cu
-    compare_all_op.cu)
+register_unity_group(
+  cc
+  compare_all_op.cc
+  compare_op.cc
+  conditional_block_infer_op.cc
+  feed_op.cc
+  fetch_op.cc
+  fetch_v2_op.cc
+  get_places_op.cc
+  logical_op.cc
+  bitwise_op.cc
+  tensor_array_read_write_op.cc
+  while_op.cc)
+register_unity_group(cu logical_op.cu bitwise_op.cu compare_op.cu
+                     compare_all_op.cu)
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index d8daa25f31be8..a551bad8eb10e 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -45,7 +45,7 @@ static std::string GetSkipEagerDeletionVarsDebugString(
   }
   return str;
 }
-}  // NOLINT
+}  // namespace
 
 class WhileOp : public framework::OperatorBase {
  public:
@@ -375,10 +375,11 @@ class WhileGradOp : public framework::OperatorBase {
           PADDLE_ENFORCE_EQ(
               var->IsType<framework::LoDTensorArray>() ||
                   var->IsType<LoDTensor>(),
-              true, platform::errors::InvalidArgument(
-                        "Currently the type of var only can be LoDTensorArray, "
-                        "or LoDTensor, but the received var[%s] is %s.",
-                        inside_grad_name, framework::ToTypeName(var->Type())));
+              true,
+              platform::errors::InvalidArgument(
+                  "Currently the type of var only can be LoDTensorArray, "
+                  "or LoDTensor, but the received var[%s] is %s.",
+                  inside_grad_name, framework::ToTypeName(var->Type())));
 
           if ((var_iter == outside_og_names.end()) &&
               var->IsType<LoDTensor>()) {
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
index 63b273fdbb8bd..2b2001be6bfff 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 
 #include <string>
+
 #include "paddle/fluid/string/string_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h
index 9e1a323fc9f3d..f141c9eb08766 100644
--- a/paddle/fluid/operators/conv_base_helper.h
+++ b/paddle/fluid/operators/conv_base_helper.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/conv_search_cache.h"
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h
index af67d857e0eb7..3d704c8be30e4 100644
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <functional>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index f084862b419d5..28ca2feeec53b 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -19,15 +19,13 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/op_version_registry.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
-#include "paddle/fluid/platform/cudnn_workspace_helper.h"
-
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
@@ -864,16 +862,15 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad,
                   ops::Conv3DDoubleGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad);
 
-REGISTER_OP_VERSION(conv2d)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(conv2d).AddCheckpoint(
+    R"ROC(
       Upgrade conv2d, add a new attribute [use_addto].
     )ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "use_addto",
-            "In order to support new feature (inplace addto strategy) for "
-            "gradient accumulation.",
-            false));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "use_addto",
+        "In order to support new feature (inplace addto strategy) for "
+        "gradient accumulation.",
+        false));
 
 REGISTER_OP_VERSION(depthwise_conv2d)
     .AddCheckpoint(
@@ -886,13 +883,12 @@ REGISTER_OP_VERSION(depthwise_conv2d)
             "gradient accumulation.",
             false));
 
-REGISTER_OP_VERSION(conv3d)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(conv3d).AddCheckpoint(
+    R"ROC(
       Upgrade conv3d, add a new attribute [use_addto].
     )ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "use_addto",
-            "In order to support new feature (inplace addto strategy) for "
-            "gradient accumulation.",
-            false));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "use_addto",
+        "In order to support new feature (inplace addto strategy) for "
+        "gradient accumulation.",
+        false));
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 58f2eeee256db..644a827b48821 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/layout_utils.h"
diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc
index 3ace825e7b80d..15a5aa737ae7e 100644
--- a/paddle/fluid/operators/conv_op_npu.cc
+++ b/paddle/fluid/operators/conv_op_npu.cc
@@ -130,12 +130,12 @@ class DepthwiseConvNPUKernel : public framework::OpKernel<T> {
         "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}});
     runner_trans.Run(stream);
 
-    const auto& runner =
-        NpuOpRunner("DepthwiseConv2D", {input_tensor, transformed_filter},
-                    {output_tensor}, {{"strides", strides},
-                                      {"dilations", dilations},
-                                      {"pads", padding},
-                                      {"data_format", data_format}});
+    const auto& runner = NpuOpRunner(
+        "DepthwiseConv2D", {input_tensor, transformed_filter}, {output_tensor},
+        {{"strides", strides},
+         {"dilations", dilations},
+         {"pads", padding},
+         {"data_format", data_format}});
     runner.Run(stream);
   }
 };
@@ -392,14 +392,15 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
         filter_grad_fp32.ShareDataWith(*filter_grad);
       }
 
-      const auto& runner = NpuOpRunner(
-          "Conv2DBackpropFilterD", {input_tensor, output_grad_tensor},
-          {filter_grad_fp32}, {{"filter_size", filter_shape_vec},
-                               {"strides", strides_vec},
-                               {"pads", paddings},
-                               {"dilations", dilations_vec},
-                               {"groups", groups},
-                               {"data_format", data_format}});
+      const auto& runner =
+          NpuOpRunner("Conv2DBackpropFilterD",
+                      {input_tensor, output_grad_tensor}, {filter_grad_fp32},
+                      {{"filter_size", filter_shape_vec},
+                       {"strides", strides_vec},
+                       {"pads", paddings},
+                       {"dilations", dilations_vec},
+                       {"groups", groups},
+                       {"data_format", data_format}});
       runner.Run(stream);
 
       if (framework::TransToProtoVarType(input->dtype()) ==
@@ -418,12 +419,13 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
       }
       const auto& runner =
           NpuOpRunner("Conv2DBackpropInputD", {*filter, output_grad_tensor},
-                      {input_grad_tensor}, {{"input_size", input_shape_vec},
-                                            {"strides", strides_vec},
-                                            {"pads", paddings},
-                                            {"dilations", dilations_vec},
-                                            {"groups", groups},
-                                            {"data_format", data_format}});
+                      {input_grad_tensor},
+                      {{"input_size", input_shape_vec},
+                       {"strides", strides_vec},
+                       {"pads", paddings},
+                       {"dilations", dilations_vec},
+                       {"groups", groups},
+                       {"data_format", data_format}});
       runner.Run(stream);
     }
   }
@@ -452,11 +454,12 @@ class NPUConv3dKernel : public framework::OpKernel<T> {
                           "= [%s]",
                           data_format));
 
-    PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented(
-                                     "the groups must be 1 in "
-                                     "the npu kernel of conv3d, but got groups "
-                                     "= [%d]",
-                                     groups));
+    PADDLE_ENFORCE_EQ(groups, 1,
+                      platform::errors::Unimplemented(
+                          "the groups must be 1 in "
+                          "the npu kernel of conv3d, but got groups "
+                          "= [%d]",
+                          groups));
 
     output->mutable_data<T>(ctx.GetPlace());
 
@@ -537,11 +540,12 @@ class NPUConv3dGradKernel : public framework::OpKernel<T> {
                           "= [%s]",
                           data_format));
 
-    PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented(
-                                     "the groups must be 1 in "
-                                     "the npu kernel of conv3d, but got groups "
-                                     "= [%d]",
-                                     groups));
+    PADDLE_ENFORCE_EQ(groups, 1,
+                      platform::errors::Unimplemented(
+                          "the groups must be 1 in "
+                          "the npu kernel of conv3d, but got groups "
+                          "= [%d]",
+                          groups));
 
     auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
     auto input_tensor =
@@ -593,14 +597,15 @@ class NPUConv3dGradKernel : public framework::OpKernel<T> {
       filter_grad_tensor.ShareDataWith(*filter_grad);
       filter_grad_tensor.set_layout(DataLayout::kNCDHW);
 
-      const auto& runner = NpuOpRunner(
-          "Conv3DBackpropFilterD", {input_tensor, output_grad_tensor},
-          {filter_grad_tensor}, {{"filter_size", filter_shape_vec},
-                                 {"strides", strides_vec},
-                                 {"pads", paddings},
-                                 {"dilations", dilations_vec},
-                                 {"groups", groups},
-                                 {"data_format", data_format}});
+      const auto& runner =
+          NpuOpRunner("Conv3DBackpropFilterD",
+                      {input_tensor, output_grad_tensor}, {filter_grad_tensor},
+                      {{"filter_size", filter_shape_vec},
+                       {"strides", strides_vec},
+                       {"pads", paddings},
+                       {"dilations", dilations_vec},
+                       {"groups", groups},
+                       {"data_format", data_format}});
       runner.Run(stream);
     }
 
@@ -613,14 +618,15 @@ class NPUConv3dGradKernel : public framework::OpKernel<T> {
       input_grad_tensor.ShareDataWith(*input_grad);
       input_grad_tensor.set_layout(DataLayout::kNCDHW);
 
-      const auto& runner = NpuOpRunner(
-          "Conv3DBackpropInputD", {filter_tensor, output_grad_tensor},
-          {input_grad_tensor}, {{"input_size", input_shape_vec},
-                                {"strides", strides_vec},
-                                {"pads", paddings},
-                                {"dilations", dilations_vec},
-                                {"groups", groups},
-                                {"data_format", data_format}});
+      const auto& runner =
+          NpuOpRunner("Conv3DBackpropInputD",
+                      {filter_tensor, output_grad_tensor}, {input_grad_tensor},
+                      {{"input_size", input_shape_vec},
+                       {"strides", strides_vec},
+                       {"pads", paddings},
+                       {"dilations", dilations_vec},
+                       {"groups", groups},
+                       {"data_format", data_format}});
       runner.Run(stream);
     }
   }
diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc
index cc5c20d392809..d66eefc694691 100644
--- a/paddle/fluid/operators/conv_op_xpu.cc
+++ b/paddle/fluid/operators/conv_op_xpu.cc
@@ -8,10 +8,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/conv_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #ifdef PADDLE_WITH_XPU
 namespace paddle {
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
index e7af908eba2c5..e996021ed843e 100644
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_shift_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/eigen.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index fe76fc3aebbc1..8b60c67f92e5e 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc
index 050ede78f72cf..c07be5a3fdbf1 100644
--- a/paddle/fluid/operators/conv_transpose_op_npu.cc
+++ b/paddle/fluid/operators/conv_transpose_op_npu.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/conv_transpose_op.h"
-
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/conv_transpose_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 
@@ -90,9 +89,9 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel<T> {
     auto output_dim_vec = phi::vectorize(output_tensor.dims());
 
     auto stream = ctx.template device_context<NPUDeviceContext>().stream();
-    const auto& runner =
-        NpuOpRunner("Conv2DTransposeD", {input_tensor, *filter},
-                    {output_tensor}, {{"input_size", output_dim_vec},
+    const auto& runner = NpuOpRunner("Conv2DTransposeD",
+                                     {input_tensor, *filter}, {output_tensor},
+                                     {{"input_size", output_dim_vec},
                                       {"strides", strides},
                                       {"dilations", dilations},
                                       {"output_padding", output_padding},
@@ -167,14 +166,15 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
     auto stream = ctx.template device_context<NPUDeviceContext>().stream();
     if (filter_grad) {
       filter_grad->mutable_data<T>(ctx.GetPlace());
-      const auto& runner = NpuOpRunner(
-          "Conv2DBackpropFilterD", {output_grad_tensor, input_tensor},
-          {*filter_grad}, {{"filter_size", phi::vectorize<int>(filter_dims)},
-                           {"strides", strides_vec},
-                           {"pads", paddings},
-                           {"dilations", dilations_vec},
-                           {"groups", groups},
-                           {"data_format", data_format}});
+      const auto& runner =
+          NpuOpRunner("Conv2DBackpropFilterD",
+                      {output_grad_tensor, input_tensor}, {*filter_grad},
+                      {{"filter_size", phi::vectorize<int>(filter_dims)},
+                       {"strides", strides_vec},
+                       {"pads", paddings},
+                       {"dilations", dilations_vec},
+                       {"groups", groups},
+                       {"data_format", data_format}});
       runner.Run(stream);
     }
     if (input_grad) {
@@ -184,13 +184,13 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
       if (channel_last) {
         input_grad_tensor.set_layout(DataLayout::kNHWC);
       }
-      const auto& runner =
-          NpuOpRunner("Conv2D", {output_grad_tensor, *filter},
-                      {input_grad_tensor}, {{"strides", strides_vec},
-                                            {"pads", paddings},
-                                            {"dilations", dilations_vec},
-                                            {"groups", groups},
-                                            {"data_format", data_format}});
+      const auto& runner = NpuOpRunner("Conv2D", {output_grad_tensor, *filter},
+                                       {input_grad_tensor},
+                                       {{"strides", strides_vec},
+                                        {"pads", paddings},
+                                        {"dilations", dilations_vec},
+                                        {"groups", groups},
+                                        {"data_format", data_format}});
       runner.Run(stream);
     }
   }
diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc
index b8bd3c4f00608..ae25c57784f02 100644
--- a/paddle/fluid/operators/conv_transpose_op_xpu.cc
+++ b/paddle/fluid/operators/conv_transpose_op_xpu.cc
@@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/conv_transpose_op.h"
-
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/conv_transpose_op.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 
diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc
index 62e0f311d15d0..21258958549ae 100644
--- a/paddle/fluid/operators/correlation_op.cc
+++ b/paddle/fluid/operators/correlation_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu
index f488cc12e642b..f9dd9ab98a308 100644
--- a/paddle/fluid/operators/correlation_op.cu
+++ b/paddle/fluid/operators/correlation_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 #ifdef __HIPCC__
@@ -227,11 +228,11 @@ class CorrelationCUDAKernel : public framework::OpKernel<T> {
     dim3 threadsPerBlock(THREADS_PER_BLOCK);
     dim3 totalBlocksCorr(N, OH, OW);
 
-    correlation_forward<
-        T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
-        output->data<T>(), OC, OH, OW, rinput1.data<T>(), C, H, W,
-        rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1,
-        stride2);
+    correlation_forward<T>
+        <<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
+            output->data<T>(), OC, OH, OW, rinput1.data<T>(), C, H, W,
+            rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1,
+            stride2);
   }
 };
 
@@ -472,19 +473,19 @@ class CorrelationCUDAGradKernel : public framework::OpKernel<T> {
     dim3 totalBlocksCorr(H, W, C);
 
     for (int n = 0; n < N; n++) {
-      correlation_backward_input1<
-          T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
-          n, grad_input1->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH,
-          GOW, rinput2.data<T>(), pad_size, kernel_size, max_displacement,
-          stride1, stride2);
+      correlation_backward_input1<T>
+          <<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
+              n, grad_input1->data<T>(), C, H, W, grad_output->data<T>(), GOC,
+              GOH, GOW, rinput2.data<T>(), pad_size, kernel_size,
+              max_displacement, stride1, stride2);
     }
 
     for (int n = 0; n < N; n++) {
-      correlation_backward_input2<
-          T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
-          n, grad_input2->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH,
-          GOW, rinput1.data<T>(), pad_size, kernel_size, max_displacement,
-          stride1, stride2);
+      correlation_backward_input2<T>
+          <<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
+              n, grad_input2->data<T>(), C, H, W, grad_output->data<T>(), GOC,
+              GOH, GOW, rinput1.data<T>(), pad_size, kernel_size,
+              max_displacement, stride1, stride2);
     }
   }
 };
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index d41ceafba1a1b..4c0c5596e5d1b 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cos_sim_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index 6d3e6e34c3b8e..fa080b7a4b466 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -158,11 +158,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
         PADDLE_ENFORCE_EQ(
             (label_dims.size() == 2UL && label_dims[1] == 1) ||
                 label_dims.size() == 1UL,
-            true, platform::errors::InvalidArgument(
-                      "The Input(Label) should be a 2-D tensor with last "
-                      "dimension fixed to 1 or a 1-D tensor. But received: "
-                      "input rank %u, input shape [%s].",
-                      label_dims.size(), label_dims));
+            true,
+            platform::errors::InvalidArgument(
+                "The Input(Label) should be a 2-D tensor with last "
+                "dimension fixed to 1 or a 1-D tensor. But received: "
+                "input rank %u, input shape [%s].",
+                label_dims.size(), label_dims));
       }
       if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) {
         PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 6b11ff69c3056..8b40abf3debe4 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <limits>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/jit/kernels.h"
@@ -22,8 +23,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::LoDTensor;
 using framework::LoD;
+using framework::LoDTensor;
 using framework::Tensor;
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index 9de5bc6ea3636..2e0a054fa122b 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/crop_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index 5ac28fafb09b9..49e1d6ab5842a 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
@@ -171,17 +172,19 @@ class CropGradKernel : public framework::OpKernel<T> {
     size_t rank =
         context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
     PADDLE_ENFORCE_GE(
-        rank, 1, platform::errors::InvalidArgument(
-                     "The number of dimensions of the input 'Out@GRAD' for "
-                     "CropGrad must be greater than or equal "
-                     "to 1, but the value received is %d.",
-                     rank));
+        rank, 1,
+        platform::errors::InvalidArgument(
+            "The number of dimensions of the input 'Out@GRAD' for "
+            "CropGrad must be greater than or equal "
+            "to 1, but the value received is %d.",
+            rank));
     PADDLE_ENFORCE_LE(
-        rank, 6, platform::errors::InvalidArgument(
-                     "The number of dimensions of the input 'Out@GRAD' for "
-                     "CropGrad must be less than or equal "
-                     "to 6, but the value received is %d.",
-                     rank));
+        rank, 6,
+        platform::errors::InvalidArgument(
+            "The number of dimensions of the input 'Out@GRAD' for "
+            "CropGrad must be less than or equal "
+            "to 6, but the value received is %d.",
+            rank));
     switch (rank) {
       case 1:
         CropGradFunction<DeviceContext, T, 1>(context);
diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc
index 0e53bbb5d189f..a9a94e2c948b9 100644
--- a/paddle/fluid/operators/crop_tensor_op.cc
+++ b/paddle/fluid/operators/crop_tensor_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/crop_tensor_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h
index 409458037a204..851d007896d7e 100644
--- a/paddle/fluid/operators/crop_tensor_op.h
+++ b/paddle/fluid/operators/crop_tensor_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
@@ -72,11 +73,12 @@ static framework::DDim ValidateShape(const std::vector<int> shape,
                             "The value (%d) of the %uth element for shape of "
                             "Op(crop_tensor) should not be zero.",
                             shape[i], i));
-      PADDLE_ENFORCE_EQ(shape[i], -1, platform::errors::InvalidArgument(
-                                          "When the value (%d) of the %uth "
-                                          "element for shape of Op(crop_tensor)"
-                                          " is negative, only -1 is supported.",
-                                          shape[i], i));
+      PADDLE_ENFORCE_EQ(shape[i], -1,
+                        platform::errors::InvalidArgument(
+                            "When the value (%d) of the %uth "
+                            "element for shape of Op(crop_tensor)"
+                            " is negative, only -1 is supported.",
+                            shape[i], i));
       output_shape[i] = in_dims[i] - offsets[i];
     } else {
       output_shape[i] = static_cast<int64_t>(shape[i]);
@@ -226,11 +228,12 @@ class CropTensorKernel : public framework::OpKernel<T> {
             "value received is %d.",
             rank));
     PADDLE_ENFORCE_LE(
-        rank, 6, platform::errors::InvalidArgument(
-                     "The number of dimensions of the input 'x' for "
-                     "Op(crop_tensor) must be less than or equal to 6, but the "
-                     "value received is %d.",
-                     rank));
+        rank, 6,
+        platform::errors::InvalidArgument(
+            "The number of dimensions of the input 'x' for "
+            "Op(crop_tensor) must be less than or equal to 6, but the "
+            "value received is %d.",
+            rank));
     switch (rank) {
       case 1:
         CropTensorFunction<DeviceContext, T, 1>(context);
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 4f5912c81baef..a880584f4cfe7 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc
index 674b75625d198..977d84e1e47c8 100644
--- a/paddle/fluid/operators/cross_op.cc
+++ b/paddle/fluid/operators/cross_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
@@ -21,8 +22,8 @@
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
 using framework::DDim;
+using framework::Tensor;
 const int kDefaultDim = framework::DDim::kMaxRank;
 
 class CrossOp : public framework::OperatorWithKernel {
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
index ba90c677570c5..10ec5a6bdd140 100644
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -15,7 +15,9 @@ limitations under the License. */
 #include <stdio.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+
 #include <vector>
+
 #include "paddle/fluid/operators/ctc_align_op.h"
 
 namespace paddle {
@@ -92,10 +94,10 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
       auto* output_length = ctx.Output<LoDTensor>("OutputLength");
       T* output_length_data =
           output_length->mutable_data<T>({input_dims[0], 1}, ctx.GetPlace());
-      PaddingMergeAndDelCudaKernel<
-          T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
-          input_dims[1], tokens, input_length_data, blank, merge_repeated,
-          padding_value, input_dims[0], output_data, output_length_data);
+      PaddingMergeAndDelCudaKernel<T>
+          <<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
+              input_dims[1], tokens, input_length_data, blank, merge_repeated,
+              padding_value, input_dims[0], output_data, output_length_data);
     } else {
       const size_t level = 0;
       auto input_lod = framework::ToAbsOffset(input->lod());
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
index c561974b0c976..9e189a9fb6356 100644
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <string.h>
+
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/cuda_graph_with_in_out.h b/paddle/fluid/operators/cuda_graph_with_in_out.h
new file mode 100644
index 0000000000000..e7a943aee4d36
--- /dev/null
+++ b/paddle/fluid/operators/cuda_graph_with_in_out.h
@@ -0,0 +1,156 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/tensor.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+#ifdef PADDLE_WITH_CUDA
+class CUDAGraphWithInOuts {  // captured CUDA graph + its in/out tensor slots
+ public:
+  template <typename Callable>  // callable(in_ptrs) must return output ptrs
+  CUDAGraphWithInOuts(Callable &&callable, platform::CUDAPlace place,
+                      const std::vector<const framework::Tensor *> &in_ptrs,
+                      cudaStreamCaptureMode mode, int64_t pool_id) {
+    in_indices_.resize(in_ptrs.size());
+    ins_.reserve(in_ptrs.size());
+    int64_t valid_in_idx = 0;  // next free position in ins_
+    for (size_t i = 0; i < in_ptrs.size(); ++i) {
+      if (in_ptrs[i] == nullptr) {
+        in_indices_[i] = -1;  // -1 marks an absent (null) input slot
+      } else {
+        in_indices_[i] = (valid_in_idx++);
+        ins_.push_back(*in_ptrs[i]);  // Tensor copy; Run() writes into it
+      }
+    }
+
+    platform::BeginCUDAGraphCapture(place, mode, pool_id);
+    auto out_ptrs = callable(in_ptrs);  // invoked inside the capture region
+    graph_ = platform::EndCUDAGraphCapture();
+    graph_->Replay();  // NOTE(review): presumably capture alone does not run it
+
+    out_indices_.resize(out_ptrs.size());
+    outs_.reserve(out_ptrs.size());
+    int64_t valid_out_idx = 0;  // next free position in outs_
+    for (size_t i = 0; i < out_ptrs.size(); ++i) {
+      if (out_ptrs[i] == nullptr) {
+        out_indices_[i] = -1;  // -1 marks an absent (null) output slot
+      } else {
+        out_indices_[i] = (valid_out_idx++);
+        outs_.push_back(*out_ptrs[i]);  // Tensor copy; see GetOutputs()
+      }
+    }
+  }
+
+  void Run(const std::vector<const framework::Tensor *> &ins) {
+    PADDLE_ENFORCE_EQ(
+        ins.size(), in_indices_.size(),  // same slot count as at capture time
+        phi::errors::InvalidArgument("The input number does not match."));
+    for (size_t i = 0; i < in_indices_.size(); ++i) {
+      if (in_indices_[i] >= 0) {  // skip slots that were null at capture time
+        auto *dst = &ins_[in_indices_[i]];
+        framework::TensorCopy(*ins[i], dst->place(), dst);  // ins[i] unchecked
+      }
+    }
+    graph_->Replay();
+  }
+
+  std::vector<framework::Tensor *> GetOutputs() {
+    std::vector<framework::Tensor *> outs(out_indices_.size());  // all nullptr
+    for (size_t i = 0; i < out_indices_.size(); ++i) {
+      if (out_indices_[i] >= 0) {
+        outs[i] = &outs_[out_indices_[i]];  // points into this object's outs_
+      }
+    }
+    return outs;
+  }
+
+  int64_t PoolID() const { return graph_->PoolID(); }
+
+ private:
+  std::unique_ptr<platform::CUDAGraph> graph_;
+  std::vector<framework::Tensor> ins_;   // copies of the non-null inputs
+  std::vector<framework::Tensor> outs_;  // copies of the non-null outputs
+  std::vector<int64_t> in_indices_;      // slot -> index in ins_, or -1
+  std::vector<int64_t> out_indices_;     // slot -> index in outs_, or -1
+};
+
+template <typename Callable>  // callable(ctx) is run during capture
+static std::unique_ptr<CUDAGraphWithInOuts> CaptureCUDAGraph(
+    Callable &&callable, const framework::ExecutionContext &ctx,
+    const std::vector<std::string> &input_names,
+    const std::vector<std::string> &output_names, cudaStreamCaptureMode mode,
+    int64_t pool_id) {
+  std::vector<const framework::Tensor *> inputs;  // flattened over input_names
+  for (const auto &name : input_names) {
+    auto input_tensors = ctx.MultiInput<framework::Tensor>(name);
+    inputs.insert(inputs.end(), input_tensors.begin(), input_tensors.end());
+  }
+
+  auto func = [&](const std::vector<const framework::Tensor *> &inputs) {
+    callable(ctx);  // lambda parameter `inputs` is unused here
+    std::vector<framework::Tensor *> outputs;  // flattened over output_names
+    for (const auto &name : output_names) {
+      auto output_tensors = ctx.MultiOutput<framework::Tensor>(name);
+      outputs.insert(outputs.end(), output_tensors.begin(),
+                     output_tensors.end());
+    }
+    return outputs;
+  };
+
+  return std::make_unique<CUDAGraphWithInOuts>(func, ctx.GetPlace(), inputs,
+                                               mode, pool_id);
+}
+
+static void ExecuteCUDAGraph(const framework::ExecutionContext &ctx,
+                             const std::vector<std::string> &input_names,
+                             const std::vector<std::string> &output_names,
+                             CUDAGraphWithInOuts *graph) {
+  std::vector<const framework::Tensor *> inputs;  // flattened over input_names
+  for (const auto &name : input_names) {
+    auto input_tensors = ctx.MultiInput<framework::Tensor>(name);
+    inputs.insert(inputs.end(), input_tensors.begin(), input_tensors.end());
+  }
+
+  graph->Run(inputs);  // copy inputs into the graph's tensors, then replay
+  auto outputs = graph->GetOutputs();
+
+  size_t idx = 0;  // position in `outputs`, advanced across all output names
+  for (const auto &name : output_names) {
+    auto output_tensors = ctx.MultiOutput<framework::Tensor>(name);
+    for (auto *out_t : output_tensors) {
+      if (outputs[idx] != nullptr) {
+        *out_t = *outputs[idx];  // neither idx nor out_t is checked here
+      } else {
+        PADDLE_ENFORCE_EQ(
+            out_t, nullptr,  // null at capture => must be null now too
+            phi::errors::InvalidArgument(
+                "The %d-th output variable should be nullptr.", idx));
+      }
+      ++idx;
+    }
+  }
+}
+#else
+class CUDAGraphWithInOuts {};  // empty placeholder for non-CUDA builds
+#endif
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h
index 5451cf815cae3..da8284b4f2e43 100644
--- a/paddle/fluid/operators/cudnn_lstm_cache.h
+++ b/paddle/fluid/operators/cudnn_lstm_cache.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index ccb0062fcc723..9ff4f796995c0 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h
index 6c059257b94e8..e2159a09c120c 100644
--- a/paddle/fluid/operators/cudnn_rnn_cache.h
+++ b/paddle/fluid/operators/cudnn_rnn_cache.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 11633fb0b8703..dbb703e7e874d 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -86,13 +86,12 @@ REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker,
                   ops::CumsumGradMaker<paddle::imperative::OpBase>,
                   CumsumInferShapeFunctor);
 
-REGISTER_OP_VERSION(cumsum)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(cumsum).AddCheckpoint(
+    R"ROC(
       Upgrade cumsum add a new attribute [flatten].
     )ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "flatten",
-            "In order to compute the cumsum over the flattened array when the "
-            "argument `axis` in python API is None.",
-            false));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "flatten",
+        "In order to compute the cumsum over the flattened array when the "
+        "argument `axis` in python API is None.",
+        false));
diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc
index e909906da7baa..912167cec5af7 100644
--- a/paddle/fluid/operators/cvm_op.cc
+++ b/paddle/fluid/operators/cvm_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cvm_op.h"
+
 #include <memory>
+
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc
index 137de2d5af985..8287654949e70 100644
--- a/paddle/fluid/operators/data_norm_op.cc
+++ b/paddle/fluid/operators/data_norm_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/data_norm_op.h"
+
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/data_layout.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -163,10 +165,11 @@ class DataNormOp : public framework::OperatorWithKernel {
                       OperatorWithKernel::IndicateVarDataType(ctx, "BatchSum"),
                       platform::errors::InvalidArgument(
                           "BatchSum input should be of float type"));
-    PADDLE_ENFORCE_EQ(dn_param_type, OperatorWithKernel::IndicateVarDataType(
-                                         ctx, "BatchSquareSum"),
-                      platform::errors::InvalidArgument(
-                          "BatchSquareSum input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        dn_param_type,
+        OperatorWithKernel::IndicateVarDataType(ctx, "BatchSquareSum"),
+        platform::errors::InvalidArgument(
+            "BatchSquareSum input should be of float type"));
 
     bool enable_scale_and_shift = ctx.Attr<bool>("enable_scale_and_shift");
     if (enable_scale_and_shift) {
@@ -277,8 +280,9 @@ class DataNormKernel<platform::CPUDeviceContext, T>
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument(
-                                            "The Input dim size should be 2"));
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument("The Input dim size should be 2"));
     const int N = x_dims[0];
     const int C =
         (data_layout == DataLayout::kNCHW ? x_dims[1]
@@ -515,8 +519,9 @@ class DataNormGradKernel<platform::CPUDeviceContext, T>
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument(
-                                            "The Input dim size should be 2"));
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument("The Input dim size should be 2"));
     const int N = x_dims[0];
     const int C =
         (data_layout == DataLayout::kNCHW ? x_dims[1]
@@ -757,10 +762,9 @@ REGISTER_OP_CPU_KERNEL(
     data_norm_grad,
     ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_VERSION(data_norm)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(data_norm).AddCheckpoint(
+    R"ROC(
               upgrad data_norm op by adding scale_w to support scale and shift.)ROC",
-        paddle::framework::compatible::OpVersionDesc().NewInput(
-            "scale_w",
-            "scale_w is used to do scale duirng data_norm like batchnorm "));
+    paddle::framework::compatible::OpVersionDesc().NewInput(
+        "scale_w",
+        "scale_w is used to do scale duirng data_norm like batchnorm "));
diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu
index 28a7922120139..21c7d7d4bf496 100644
--- a/paddle/fluid/operators/data_norm_op.cu
+++ b/paddle/fluid/operators/data_norm_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
@@ -100,8 +101,9 @@ class DataNormKernel<platform::CUDADeviceContext, T>
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
     // Align with CPU version, but should we add this restriction?
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::PreconditionNotMet(
-                                            "The Input dim size should be 2"));
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2,
+        platform::errors::PreconditionNotMet("The Input dim size should be 2"));
     const int N = x_dims[0];
     const int C = x_dims[1];
     const T *batch_size_in = ctx.Input<Tensor>("BatchSize")->data<T>();
@@ -143,8 +145,9 @@ class DataNormGradKernel<platform::CUDADeviceContext, T>
 
     const auto &x_dims = x->dims();
     // Align with CPU version, but should we add this restriction?
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::PreconditionNotMet(
-                                            "The Input dim size should be 2"));
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2,
+        platform::errors::PreconditionNotMet("The Input dim size should be 2"));
     const int N = x_dims[0];
     const int C = x_dims[1];
 
diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu
index de6b35bc9cd0a..a257afc50f955 100644
--- a/paddle/fluid/operators/decode_jpeg_op.cu
+++ b/paddle/fluid/operators/decode_jpeg_op.cu
@@ -15,6 +15,7 @@
 #if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP)
 
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/dynload/nvjpeg.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc
index 1b76aca1e660e..b54c8a81abd64 100644
--- a/paddle/fluid/operators/deformable_conv_op.cc
+++ b/paddle/fluid/operators/deformable_conv_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/deformable_conv_op_xpu.cc b/paddle/fluid/operators/deformable_conv_op_xpu.cc
index 240e5658956dd..d977cfe844a6a 100644
--- a/paddle/fluid/operators/deformable_conv_op_xpu.cc
+++ b/paddle/fluid/operators/deformable_conv_op_xpu.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
@@ -169,28 +170,32 @@ class DeformableConvGradXPUKernel : public framework::OpKernel<T> {
     const float* offset_ptr = offset.data<float>();
     const float* mask_ptr = mask.data<float>();
     if (dx_data == nullptr) {
-      PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&dx_data),
-                                   input->numel() * sizeof(T)),
-                        XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                         "XPU has no enough memory"));
+      PADDLE_ENFORCE_EQ(
+          xpu_malloc(reinterpret_cast<void**>(&dx_data),
+                     input->numel() * sizeof(T)),
+          XPU_SUCCESS,
+          platform::errors::ResourceExhausted("XPU has no enough memory"));
     }
     if (dw_data == nullptr) {
-      PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&dw_data),
-                                   filter.numel() * sizeof(T)),
-                        XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                         "XPU has no enough memory"));
+      PADDLE_ENFORCE_EQ(
+          xpu_malloc(reinterpret_cast<void**>(&dw_data),
+                     filter.numel() * sizeof(T)),
+          XPU_SUCCESS,
+          platform::errors::ResourceExhausted("XPU has no enough memory"));
     }
     if (doffset_data == nullptr) {
-      PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&doffset_data),
-                                   offset.numel() * sizeof(T)),
-                        XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                         "XPU has no enough memory"));
+      PADDLE_ENFORCE_EQ(
+          xpu_malloc(reinterpret_cast<void**>(&doffset_data),
+                     offset.numel() * sizeof(T)),
+          XPU_SUCCESS,
+          platform::errors::ResourceExhausted("XPU has no enough memory"));
     }
     if (dmask_data == nullptr) {
-      PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&dmask_data),
-                                   mask.numel() * sizeof(T)),
-                        XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                         "XPU has no enough memory"));
+      PADDLE_ENFORCE_EQ(
+          xpu_malloc(reinterpret_cast<void**>(&dmask_data),
+                     mask.numel() * sizeof(T)),
+          XPU_SUCCESS,
+          platform::errors::ResourceExhausted("XPU has no enough memory"));
     }
 
     int input_dim = input->numel() / input->dims()[0];
@@ -207,10 +212,11 @@ class DeformableConvGradXPUKernel : public framework::OpKernel<T> {
     int f = filter.dims()[0];
 
     T* filter_grad_tmp = nullptr;
-    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&filter_grad_tmp),
-                                 filter_grad->numel() * sizeof(T)),
-                      XPU_SUCCESS, platform::errors::ResourceExhausted(
-                                       "XPU has no enough memory"));
+    PADDLE_ENFORCE_EQ(
+        xpu_malloc(reinterpret_cast<void**>(&filter_grad_tmp),
+                   filter_grad->numel() * sizeof(T)),
+        XPU_SUCCESS,
+        platform::errors::ResourceExhausted("XPU has no enough memory"));
 
     // set zeros for d_table_data
     const int zero = 0;
diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc
index 0ec95cb54bae8..2da561c868516 100644
--- a/paddle/fluid/operators/deformable_conv_v1_op.cc
+++ b/paddle/fluid/operators/deformable_conv_v1_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc
index 7e7cdbd8d178c..a989e3f9217c0 100644
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
+
 #include <iostream>
 #include <memory>
 #include <vector>
+
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
 namespace paddle {
@@ -165,11 +167,12 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel {
     auto part_width = part_size[1];
     auto sample_per_part = ctx->Attrs().Get<int>("sample_per_part");
     auto trans_std = ctx->Attrs().Get<float>("trans_std");
-    PADDLE_ENFORCE_GE(trans_std, 0., platform::errors::InvalidArgument(
-                                         "Input(trans_std) should not be lower "
-                                         "than 0.0, but received trans_std "
-                                         "is:%f",
-                                         trans_std));
+    PADDLE_ENFORCE_GE(trans_std, 0.,
+                      platform::errors::InvalidArgument(
+                          "Input(trans_std) should not be lower "
+                          "than 0.0, but received trans_std "
+                          "is:%f",
+                          trans_std));
     PADDLE_ENFORCE_GE(
         input_dims[1], output_channels,
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu
index 873950b2d2f65..174f045c1605c 100644
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu
@@ -23,10 +23,12 @@
 
 #pragma once
 #include <stdio.h>
+
 #include <algorithm>
 #include <iostream>
 #include <limits>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h
index 3deabce54ed0b..6ff6ab20df2fb 100644
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.h
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h
@@ -25,6 +25,7 @@
 #include <algorithm>
 #include <iostream>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc
index 876bd1199ad3b..2bed296efd77a 100644
--- a/paddle/fluid/operators/dequantize_op.cc
+++ b/paddle/fluid/operators/dequantize_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dequantize_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -47,8 +48,8 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker);
 
 REGISTER_OP_VERSION(dequantize)
-    .AddCheckpoint(
-        R"ROC( Add a new attribute [Shift])ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "Shift", "Dequantize data to uint8 if provided non-zero value.",
-            0.0f));
+    .AddCheckpoint(R"ROC( Add a new attribute [Shift])ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewAttr(
+                       "Shift",
+                       "Dequantize data to uint8 if provided non-zero value.",
+                       0.0f));
diff --git a/paddle/fluid/operators/dequantize_op.h b/paddle/fluid/operators/dequantize_op.h
index 75c27a06c210f..ea7a08c8f3684 100644
--- a/paddle/fluid/operators/dequantize_op.h
+++ b/paddle/fluid/operators/dequantize_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/dequeue_op.cc b/paddle/fluid/operators/dequeue_op.cc
index fb5d53dacf0ed..1a6286b0a3289 100644
--- a/paddle/fluid/operators/dequeue_op.cc
+++ b/paddle/fluid/operators/dequeue_op.cc
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index f10c801919999..6e5ea3e8aa721 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -1,96 +1,129 @@
 set(LOCAL_DETECTION_LIBS)
 
 function(detection_library TARGET_NAME)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    set(options "")
-    set(common_deps op_registry)
-    set(pybind_flag 0)
-    cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-    set(srcs)
-    # filter cuda source file when not build with cuda/rocm
-    foreach(src ${detection_library_SRCS})
-      if (NOT WITH_GPU AND NOT WITH_ROCM)
-        if(${src} MATCHES ".*\\.cc$")
-          list(APPEND srcs ${src})
-        endif()
-      else()
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  set(common_deps op_registry)
+  set(pybind_flag 0)
+  cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  set(srcs)
+  # filter out cuda source files when not building with cuda/rocm
+  foreach(src ${detection_library_SRCS})
+    if(NOT WITH_GPU AND NOT WITH_ROCM)
+      if(${src} MATCHES ".*\\.cc$")
         list(APPEND srcs ${src})
       endif()
-    endforeach()
-    
-    op_library(${TARGET_NAME} SRCS ${srcs} DEPS ${common_deps} ${detection_library_DEPS})
+    else()
+      list(APPEND srcs ${src})
+    endif()
+  endforeach()
+
+  op_library(${TARGET_NAME} SRCS ${srcs} DEPS ${common_deps}
+             ${detection_library_DEPS})
 
-    set(LOCAL_DETECTION_LIBS
-            ${TARGET_NAME}
-            ${LOCAL_DETECTION_LIBS}
-        PARENT_SCOPE)
+  set(LOCAL_DETECTION_LIBS
+      ${TARGET_NAME} ${LOCAL_DETECTION_LIBS}
+      PARENT_SCOPE)
 endfunction()
 
-if (WITH_ASCEND_CL)
-    detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc)
-    detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc)
+if(WITH_ASCEND_CL)
+  detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu
+                    box_coder_op_npu.cc)
+  detection_library(density_prior_box_op SRCS density_prior_box_op.cc
+                    density_prior_box_op.cu density_prior_box_op_npu.cc)
 else()
-    detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
-    detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
+  detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
+  detection_library(density_prior_box_op SRCS density_prior_box_op.cc
+                    density_prior_box_op.cu)
 endif()
 
 if(WITH_XPU)
-  detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc)
+  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
+                    iou_similarity_op_xpu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_xpu.cc)
+  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc
+                    generate_proposals_v2_op_xpu.cc)
 elseif(WITH_ASCEND_CL)
-  detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc)
+  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
+                    iou_similarity_op_npu.cc)
+  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu
+                    prior_box_op_npu.cc)
 else()
-  detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu)
+  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
+                    iou_similarity_op.cu)
   detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
+  # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 endif()
 
 detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
-anchor_generator_op.cu)
-detection_library(target_assign_op SRCS target_assign_op.cc
-target_assign_op.cu)
+                  anchor_generator_op.cu)
+detection_library(target_assign_op SRCS target_assign_op.cc target_assign_op.cu)
 detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
-polygon_box_transform_op.cu)
+                  polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
-detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
+detection_library(generate_proposal_labels_op SRCS
+                  generate_proposal_labels_op.cc)
 detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS gpc)
 detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
 detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
 detection_library(yolo_box_op SRCS yolo_box_op.cc)
-detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
-detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
-detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
+detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
+                  box_decoder_and_assign_op.cu)
+detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
+                  sigmoid_focal_loss_op.cu)
+detection_library(retinanet_detection_output_op SRCS
+                  retinanet_detection_output_op.cc)
 detection_library(nms_op SRCS nms_op.cc nms_op.cu)
 
 if(WITH_GPU OR WITH_ROCM)
   set(TMPDEPS memory)
   if(WITH_GPU)
-    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-        set(TMPDEPS memory cub)
+    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      set(TMPDEPS memory cub)
     endif()
   endif()
-  detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
-  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op.cu DEPS ${TMPDEPS})
-  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
-  detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
+  detection_library(generate_proposals_op SRCS generate_proposals_op.cc
+                    generate_proposals_op.cu DEPS ${TMPDEPS})
+  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc
+                    generate_proposals_v2_op.cu DEPS ${TMPDEPS})
+  detection_library(
+    distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc
+    distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
+  detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc
+                    collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
 else()
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
-  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
-  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
+  if(NOT WITH_XPU)
+    detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
+  endif()
+  detection_library(distribute_fpn_proposals_op SRCS
+                    distribute_fpn_proposals_op.cc)
   detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc)
 endif()
 
-detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
+detection_library(
+  roi_perspective_transform_op SRCS roi_perspective_transform_op.cc
+  roi_perspective_transform_op.cu)
 #Export local libraries to parent
 # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
 
-cc_library(mask_util SRCS mask_util.cc DEPS memory)
-cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util)
-cc_library(gpc SRCS gpc.cc DEPS op_registry)
-detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util)
+cc_library(
+  mask_util
+  SRCS mask_util.cc
+  DEPS memory)
+cc_test(
+  mask_util_test
+  SRCS mask_util_test.cc
+  DEPS memory mask_util)
+cc_library(
+  gpc
+  SRCS gpc.cc
+  DEPS op_registry)
+detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS
+                  mask_util)
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h
index 0bcb56d7aa8d5..b3d490ac0b512 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.h
+++ b/paddle/fluid/operators/detection/anchor_generator_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
index 7bbbbe7f40ecc..b9b9b0b0c0dbf 100644
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -122,8 +123,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes,
       inter_h = std::max(y_max - y_min + 1, zero);
       inter_area = inter_w * inter_h;
       overlaps_et(i, j) =
-          (inter_area == 0.) ? 0 : inter_area /
-                                       (r_box_area + c_box_area - inter_area);
+          (inter_area == 0.)
+              ? 0
+              : inter_area / (r_box_area + c_box_area - inter_area);
     }
   }
 }
diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc
index 73f0607fdde7f..08d688a149543 100644
--- a/paddle/fluid/operators/detection/box_clip_op.cc
+++ b/paddle/fluid/operators/detection/box_clip_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/box_clip_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu
index 65f2a5590716d..672b9a5db95d2 100644
--- a/paddle/fluid/operators/detection/box_clip_op.cu
+++ b/paddle/fluid/operators/detection/box_clip_op.cu
@@ -12,6 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/box_clip_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h
index 13ba7894d6009..4bcc81dbf9865 100644
--- a/paddle/fluid/operators/detection/box_clip_op.h
+++ b/paddle/fluid/operators/detection/box_clip_op.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
index 69d829e0021f3..461dcb7f39ab5 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/box_coder_op.h"
+
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu
index 22dc606df9df5..b7dee412ee319 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cu
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/detection/box_coder_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
index a626f790fac90..6ddfd71765390 100644
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
index d3565f87f33bb..7eed920fb3d55 100644
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
+++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
@@ -13,6 +13,7 @@ limitations under the License. */
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
index 92c9ab34aa454..b1b8c3ba2da84 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License.*/
 
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 860fdd01794cc..bea6fb1748858 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -18,6 +18,7 @@ namespace cub = hipcub;
 #endif
 
 #include <paddle/fluid/memory/allocation/allocator.h>
+
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
index e5ae9a6ccbda5..973cbc6ec1658 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
@@ -20,6 +20,7 @@ limitations under the License.*/
 #include <numeric>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index adc2723acbf70..0912ce9016031 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/operators/detection/prior_box_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
index 4e514e62f4081..e382586ec666c 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 7ad25e003b491..5adf1469ec2f9 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -21,6 +21,7 @@ namespace cub = hipcub;
 #endif
 
 #include <paddle/fluid/memory/allocation/allocator.h>
+
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
index 5479e08c2a5ef..85db2437ee550 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <cstring>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
index c9cc4e722071c..da86502f78c35 100644
--- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <math.h>
+
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index cbf17048400bf..bc528060355f0 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <math.h>
+
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index d6130823271f0..a6d2d8a2a0172 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cstring>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index 5fb7973fd89e4..20efb1fa6ca92 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #include <paddle/fluid/memory/allocation/allocator.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
index 1f1802574c5b8..b8b6118058fa2 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cstring>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
index 005309e8ee577..deb7f3a41df1f 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #include <paddle/fluid/memory/allocation/allocator.h>
 #include <stdio.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc
new file mode 100644
index 0000000000000..28c94668ba7c5
--- /dev/null
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc
@@ -0,0 +1,370 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include <paddle/fluid/memory/allocation/allocator.h>
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+namespace {
+// Computes, on the host, the indices that order `value` by descending score
+// and copies them into `index_out` on dev_ctx's place. If pre_nms_top_n is in
+// (0, value.numel()), only the best pre_nms_top_n indices are kept.
+template <typename T>
+static void SortDescending(const platform::XPUDeviceContext &dev_ctx,
+                           const Tensor &value, Tensor *index_out,
+                           int pre_nms_top_n) {
+  auto place = dev_ctx.GetPlace();
+  auto cpu_place = platform::CPUPlace();
+  const int64_t total = value.numel();
+  // Stage the scores in host memory; the ordering is computed by the CPU.
+  Tensor scores_slice_cpu;
+  scores_slice_cpu.Resize({total});
+  auto *scores_slice_cpu_data = scores_slice_cpu.mutable_data<T>(cpu_place);
+  memory::Copy(cpu_place, scores_slice_cpu_data, place, value.data<T>(),
+               sizeof(T) * total);
+  // Sort index
+  Tensor index_t;
+  int *index = index_t.mutable_data<int>({total}, cpu_place);
+  for (int i = 0; i < total; ++i) {
+    index[i] = i;
+  }
+  auto compare = [scores_slice_cpu_data](int i, int j) {
+    return scores_slice_cpu_data[i] > scores_slice_cpu_data[j];
+  };
+  if (pre_nms_top_n <= 0 || pre_nms_top_n >= total) {
+    std::sort(index, index + total, compare);
+  } else {
+    // Select the top-n candidates first, then order only those.
+    std::nth_element(index, index + pre_nms_top_n, index + total, compare);
+    std::sort(index, index + pre_nms_top_n, compare);
+    index_t.Resize({pre_nms_top_n});
+  }
+
+  // `index` holds int values, so the byte count uses sizeof(int).
+  int *idx_out = index_out->mutable_data<int>({index_t.numel()}, place);
+  memory::Copy(place, idx_out, cpu_place, index, sizeof(int) * index_t.numel());
+}
+
+// Builds the proposals of a single image. Candidates are picked by
+// SortDescending (top pre_nms_top_n scores), decoded with xpu::box_decoder,
+// filtered with xpu::remove_small_boxes and, when nms_thresh > 0, reduced by
+// xpu::nms to at most post_nms_top_n boxes (if positive). Returns {proposals,
+// scores}, or one all-zero box/score if the filter keeps nothing.
+template <typename T>
+static std::pair<Tensor, Tensor> ProposalForOneImage(
+    const platform::XPUDeviceContext &dev_ctx, const Tensor &im_shape,
+    const Tensor &anchors, const Tensor &variances,
+    const Tensor &bbox_deltas,  // [M, 4]
+    const Tensor &scores,       // [N, 1]
+    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
+    float eta, bool pixel_offset) {  // NOTE: eta is not read below
+  // 1. pre nms
+  Tensor index_sort;
+  SortDescending<T>(dev_ctx, scores, &index_sort, pre_nms_top_n);
+  // Gather the rows picked by index_sort from every per-anchor input.
+  Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
+  scores_sel.mutable_data<T>({index_sort.numel(), 1}, dev_ctx.GetPlace());
+  bbox_sel.mutable_data<T>({index_sort.numel(), 4}, dev_ctx.GetPlace());
+  anchor_sel.mutable_data<T>({index_sort.numel(), 4}, dev_ctx.GetPlace());
+  var_sel.mutable_data<T>({index_sort.numel(), 4}, dev_ctx.GetPlace());
+  int r = xpu::gather<T>(dev_ctx.x_context(), scores.data<T>(),
+                         index_sort.data<int>(), scores_sel.data<T>(),
+                         {static_cast<int>(scores.numel()), 1},
+                         index_sort.numel(), 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+  r = xpu::gather<T>(dev_ctx.x_context(), bbox_deltas.data<T>(),
+                     index_sort.data<int>(), bbox_sel.data<T>(),
+                     {static_cast<int>(bbox_deltas.numel()) / 4, 4},
+                     index_sort.numel(), 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+  r = xpu::gather<T>(dev_ctx.x_context(), anchors.data<T>(),
+                     index_sort.data<int>(), anchor_sel.data<T>(),
+                     {static_cast<int>(anchors.numel()) / 4, 4},
+                     index_sort.numel(), 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+  r = xpu::gather<T>(dev_ctx.x_context(), variances.data<T>(),
+                     index_sort.data<int>(), var_sel.data<T>(),
+                     {static_cast<int>(variances.numel()) / 4, 4},
+                     index_sort.numel(), 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+
+  int num = scores.numel();
+  int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
+                                                                : pre_nms_top_n;
+  scores_sel.Resize({pre_nms_num, 1});
+  index_sort.Resize({pre_nms_num, 1});
+
+  // 2. box decode and clipping
+  Tensor proposals;
+  proposals.mutable_data<T>({pre_nms_num, 4}, dev_ctx.GetPlace());
+  r = xpu::box_decoder<T>(dev_ctx.x_context(), anchor_sel.data<T>(),
+                          var_sel.data<T>(), bbox_sel.data<T>(),
+                          proposals.data<T>(), pre_nms_num, !pixel_offset, true,
+                          im_shape.data<T>());
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(box_decoder) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+
+  // 3. filter
+  Tensor keep_index, keep_num_t;
+  keep_index.mutable_data<int>({pre_nms_num}, dev_ctx.GetPlace());
+  keep_num_t.mutable_data<int>({1}, dev_ctx.GetPlace());
+  min_size = std::max(min_size, 1.0f);
+  r = xpu::remove_small_boxes<T>(dev_ctx.x_context(), proposals.data<T>(),
+                                 im_shape.data<T>(), keep_index.data<int>(),
+                                 keep_num_t.data<int>(), pre_nms_num, min_size,
+                                 false, pixel_offset);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
+                                        "XPU API(remove_small_boxes) return "
+                                        "wrong value[%d %s]",
+                                        r, XPUAPIErrorMsg[r]));
+  // The kept count is written on the device; read it back to size outputs.
+  int keep_num = 0;
+  memory::Copy(platform::CPUPlace(), &keep_num, dev_ctx.GetPlace(),
+               keep_num_t.data<int>(), sizeof(int));
+  keep_index.Resize({keep_num});
+
+  Tensor scores_filter, proposals_filter;
+  // Handle the case when there is no keep index left
+  if (keep_num == 0) {
+    phi::funcs::SetConstant<platform::XPUDeviceContext, T> set_zero;
+    proposals_filter.mutable_data<T>({1, 4}, dev_ctx.GetPlace());
+    scores_filter.mutable_data<T>({1, 1}, dev_ctx.GetPlace());
+    set_zero(dev_ctx, &proposals_filter, static_cast<T>(0));
+    set_zero(dev_ctx, &scores_filter, static_cast<T>(0));
+    return std::make_pair(proposals_filter, scores_filter);
+  }
+  proposals_filter.mutable_data<T>({keep_num, 4}, dev_ctx.GetPlace());
+  scores_filter.mutable_data<T>({keep_num, 1}, dev_ctx.GetPlace());
+  r = xpu::gather<T>(dev_ctx.x_context(), proposals.data<T>(),
+                     keep_index.data<int>(), proposals_filter.data<T>(),
+                     {pre_nms_num, 4}, keep_num, 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+
+  r = xpu::gather<T>(dev_ctx.x_context(), scores_sel.data<T>(),
+                     keep_index.data<int>(), scores_filter.data<T>(),
+                     {pre_nms_num, 1}, keep_num, 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+  // A non-positive threshold skips NMS: return the filtered boxes as they are.
+  if (nms_thresh <= 0) {
+    if (dev_ctx.x_context()->xpu_stream) {
+      dev_ctx.Wait();
+    }
+    return std::make_pair(proposals_filter, scores_filter);
+  }
+
+  // 4. nms
+  int nms_keep_num = 0;
+  r = xpu::nms<T>(dev_ctx.x_context(), proposals_filter.data<T>(), nullptr,
+                  keep_index.data<int>(), 1, 1, keep_num, -1, nms_thresh, -1, 0,
+                  &nms_keep_num, pixel_offset);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(nms) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+  if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) {
+    keep_index.Resize({post_nms_top_n});
+  } else {
+    keep_index.Resize({nms_keep_num});
+  }
+
+  Tensor scores_nms, proposals_nms;
+  proposals_nms.mutable_data<T>({keep_index.numel(), 4}, dev_ctx.GetPlace());
+  scores_nms.mutable_data<T>({keep_index.numel(), 1}, dev_ctx.GetPlace());
+  r = xpu::gather<T>(dev_ctx.x_context(), proposals_filter.data<T>(),
+                     keep_index.data<int>(), proposals_nms.data<T>(),
+                     {keep_num, 4}, keep_index.numel(), 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+  r = xpu::gather<T>(dev_ctx.x_context(), scores_filter.data<T>(),
+                     keep_index.data<int>(), scores_nms.data<T>(),
+                     {keep_num, 1}, keep_index.numel(), 0);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External("XPU API(gather) return "
+                                               "wrong value[%d %s]",
+                                               r, XPUAPIErrorMsg[r]));
+  if (dev_ctx.x_context()->xpu_stream) {
+    dev_ctx.Wait();
+  }
+  return std::make_pair(proposals_nms, scores_nms);
+}
+}  // namespace
+
+// XPU kernel of generate_proposals_v2. Transposes Scores and BboxDeltas from
+// {N, C, H, W} to {N, H, W, C}, runs ProposalForOneImage on every image and
+// packs the results into RpnRois / RpnRoiProbs (plus optional RpnRoisNum).
+template <typename DeviceContext, typename T>
+class XPUGenerateProposalsV2Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *scores = context.Input<Tensor>("Scores");
+    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
+    auto *im_shape = context.Input<Tensor>("ImShape");
+    auto anchors = GET_DATA_SAFELY(context.Input<Tensor>("Anchors"), "Input",
+                                   "Anchors", "GenerateProposals");
+    auto variances = GET_DATA_SAFELY(context.Input<Tensor>("Variances"),
+                                     "Input", "Variances", "GenerateProposals");
+
+    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
+    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
+
+    int pre_nms_top_n = context.Attr<int>("pre_nms_topN");
+    int post_nms_top_n = context.Attr<int>("post_nms_topN");
+    float nms_thresh = context.Attr<float>("nms_thresh");
+    float min_size = context.Attr<float>("min_size");
+    float eta = context.Attr<float>("eta");
+    bool pixel_offset = context.Attr<bool>("pixel_offset");
+    PADDLE_ENFORCE_GE(eta, 1.,
+                      platform::errors::InvalidArgument(
+                          "Not support adaptive NMS. The attribute 'eta' "
+                          "should not less than 1. But received eta=[%f]",
+                          eta));
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto scores_dim = scores->dims();
+    // the shape of bbox score
+    int num = scores_dim[0];
+    int c_score = scores_dim[1];
+    int h_score = scores_dim[2];
+    int w_score = scores_dim[3];
+
+    auto bbox_dim = bbox_deltas->dims();
+    int c_bbox = bbox_dim[1];
+    int h_bbox = bbox_dim[2];
+    int w_bbox = bbox_dim[3];
+    // Channel-last copies of both inputs (axis order 0, 2, 3, 1).
+    Tensor bbox_deltas_swap, scores_swap;
+    bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
+                                     dev_ctx.GetPlace());
+    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
+                                dev_ctx.GetPlace());
+    std::vector<int> axis = {0, 2, 3, 1};
+    int r = xpu::transpose<T>(dev_ctx.x_context(), bbox_deltas->data<T>(),
+                              bbox_deltas_swap.data<T>(),
+                              {num, c_bbox, h_bbox, w_bbox}, axis);
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External("XPU API(transpose) return "
+                                                 "wrong value[%d %s]",
+                                                 r, XPUAPIErrorMsg[r]));
+    r = xpu::transpose<T>(dev_ctx.x_context(), scores->data<T>(),
+                          scores_swap.data<T>(),
+                          {num, c_score, h_score, w_score}, axis);
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External("XPU API(transpose) return "
+                                                 "wrong value[%d %s]",
+                                                 r, XPUAPIErrorMsg[r]));
+
+    anchors.Resize({anchors.numel() / 4, 4});
+    variances.Resize({variances.numel() / 4, 4});
+
+    // output: allocated for every candidate box, shrunk after the loop
+    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
+                              context.GetPlace());
+    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
+
+    T *rpn_rois_data = rpn_rois->data<T>();
+    T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
+    auto place = dev_ctx.GetPlace();
+    auto cpu_place = platform::CPUPlace();
+
+    int num_proposals = 0;
+    std::vector<size_t> offset(1, 0);
+    std::vector<int> tmp_num;
+
+    for (int64_t i = 0; i < num; ++i) {
+      Tensor im_shape_slice = im_shape->Slice(i, i + 1);
+      Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
+      Tensor scores_slice = scores_swap.Slice(i, i + 1);
+
+      bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
+      scores_slice.Resize({h_score * w_score * c_score, 1});
+
+      std::pair<Tensor, Tensor> box_score_pair = ProposalForOneImage<T>(
+          dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice,
+          scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size,
+          eta, pixel_offset);
+      Tensor &proposals = box_score_pair.first;
+      Tensor &probs = box_score_pair.second;
+      // Append this image's results to the packed outputs at the running offset.
+      memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
+                   proposals.data<T>(), sizeof(T) * proposals.numel());
+      memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
+                   probs.data<T>(), sizeof(T) * probs.numel());
+      if (dev_ctx.x_context()->xpu_stream) {
+        dev_ctx.Wait();
+      }
+      num_proposals += proposals.dims()[0];
+      offset.emplace_back(num_proposals);
+      tmp_num.push_back(proposals.dims()[0]);
+    }
+    if (context.HasOutput("RpnRoisNum")) {
+      auto *rpn_rois_num = context.Output<Tensor>("RpnRoisNum");
+      rpn_rois_num->mutable_data<int>({num}, context.GetPlace());
+      int *num_data = rpn_rois_num->data<int>();
+      memory::Copy(place, num_data, cpu_place, tmp_num.data(),
+                   sizeof(int) * num);
+      rpn_rois_num->Resize({num});
+    }
+    framework::LoD lod;
+    lod.emplace_back(offset);
+    rpn_rois->set_lod(lod);
+    rpn_roi_probs->set_lod(lod);
+    rpn_rois->Resize({num_proposals, 4});
+    rpn_roi_probs->Resize({num_proposals, 1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(generate_proposals_v2,
+                       ops::XPUGenerateProposalsV2Kernel<
+                           paddle::platform::XPUDeviceContext, float>);
+
+#endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc
index 6b1b0cd8b3578..4dea559d8e466 100644
--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
@@ -24,6 +24,7 @@
  **/
 
 #include "paddle/fluid/operators/detection/gpc.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace gpc {
diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc
index 8cc0ebcab61f7..3f8bc8674186d 100644
--- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc
+++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 limitations under the License. */
 
 #include <glog/logging.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/nms_util.h"
 
@@ -51,16 +52,17 @@ class LocalityAwareNMSOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 ||
               box_dims[2] == 24 || box_dims[2] == 32,
-          true, platform::errors::InvalidArgument(
-                    "The last dimension of Input(BBoxes) must be 4 or 8, "
-                    "represents the layout of coordinate "
-                    "[xmin, ymin, xmax, ymax] or "
-                    "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
-                    "8 points: [xi, yi] i= 1,2,...,8 or "
-                    "12 points: [xi, yi] i= 1,2,...,12 or "
-                    "16 points: [xi, yi] i= 1,2,...,16. "
-                    "But received %d.",
-                    box_dims[2]));
+          true,
+          platform::errors::InvalidArgument(
+              "The last dimension of Input(BBoxes) must be 4 or 8, "
+              "represents the layout of coordinate "
+              "[xmin, ymin, xmax, ymax] or "
+              "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
+              "8 points: [xi, yi] i= 1,2,...,8 or "
+              "12 points: [xi, yi] i= 1,2,...,12 or "
+              "16 points: [xi, yi] i= 1,2,...,16. "
+              "But received %d.",
+              box_dims[2]));
       PADDLE_ENFORCE_EQ(
           box_dims[1], score_dims[2],
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc
index e06218cfe569f..41505ee84286a 100644
--- a/paddle/fluid/operators/detection/mask_util.cc
+++ b/paddle/fluid/operators/detection/mask_util.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/mask_util.h"
+
 #include <math.h>
 #include <stdlib.h>
+
 #include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h
index 4e0ea54f6d89f..25b03a11f7db0 100644
--- a/paddle/fluid/operators/detection/mask_util.h
+++ b/paddle/fluid/operators/detection/mask_util.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <stdint.h>
+
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc
index de904e9474639..68f7a6db6488e 100644
--- a/paddle/fluid/operators/detection/mask_util_test.cc
+++ b/paddle/fluid/operators/detection/mask_util_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/mask_util.h"
+
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc
index 3353739b01bf6..5eee52dfbc704 100644
--- a/paddle/fluid/operators/detection/matrix_nms_op.cc
+++ b/paddle/fluid/operators/detection/matrix_nms_op.cc
@@ -405,7 +405,6 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(matrix_nms, ops::MatrixNMSKernel<float>,
                        ops::MatrixNMSKernel<double>);
 REGISTER_OP_VERSION(matrix_nms)
-    .AddCheckpoint(
-        R"ROC(Upgrade matrix_nms: add a new output [RoisNum].)ROC",
-        paddle::framework::compatible::OpVersionDesc().NewOutput(
-            "RoisNum", "The number of RoIs in each image."));
+    .AddCheckpoint(R"ROC(Upgrade matrix_nms: add a new output [RoisNum].)ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewOutput(
+                       "RoisNum", "The number of RoIs in each image."));
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index 83cf6e5fd30f6..f603a501f4b78 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 limitations under the License. */
 
 #include <glog/logging.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/nms_util.h"
 
@@ -55,18 +56,19 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
                             ". But received rank = %d",
                             box_dims.size()));
       if (score_size == 3) {
-        PADDLE_ENFORCE_EQ(
-            box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 ||
-                box_dims[2] == 24 || box_dims[2] == 32,
-            true, platform::errors::InvalidArgument(
-                      "The last dimension of Input"
-                      "(BBoxes) must be 4 or 8, "
-                      "represents the layout of coordinate "
-                      "[xmin, ymin, xmax, ymax] or "
-                      "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
-                      "8 points: [xi, yi] i= 1,2,...,8 or "
-                      "12 points: [xi, yi] i= 1,2,...,12 or "
-                      "16 points: [xi, yi] i= 1,2,...,16"));
+        PADDLE_ENFORCE_EQ(box_dims[2] == 4 || box_dims[2] == 8 ||
+                              box_dims[2] == 16 || box_dims[2] == 24 ||
+                              box_dims[2] == 32,
+                          true,
+                          platform::errors::InvalidArgument(
+                              "The last dimension of Input"
+                              "(BBoxes) must be 4 or 8, "
+                              "represents the layout of coordinate "
+                              "[xmin, ymin, xmax, ymax] or "
+                              "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
+                              "8 points: [xi, yi] i= 1,2,...,8 or "
+                              "12 points: [xi, yi] i= 1,2,...,12 or "
+                              "16 points: [xi, yi] i= 1,2,...,16"));
         PADDLE_ENFORCE_EQ(
             box_dims[1], score_dims[2],
             platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/detection/nms_op.cc b/paddle/fluid/operators/detection/nms_op.cc
index f6dc44eb5fc2d..34a92efa68a63 100644
--- a/paddle/fluid/operators/detection/nms_op.cc
+++ b/paddle/fluid/operators/detection/nms_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/nms_op.h"
+
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/nms_op.cu b/paddle/fluid/operators/detection/nms_op.cu
index b6027e67d6ced..4f62c735c265a 100644
--- a/paddle/fluid/operators/detection/nms_op.cu
+++ b/paddle/fluid/operators/detection/nms_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/operators/detection/nms_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 
diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h
index 0e448d42fc2ed..7a6565ac760f1 100644
--- a/paddle/fluid/operators/detection/nms_util.h
+++ b/paddle/fluid/operators/detection/nms_util.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <algorithm>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/operators/detection/poly_util.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc
index 1af2c95c6cf52..6aa81bf1b39f7 100644
--- a/paddle/fluid/operators/detection/poly_util.cc
+++ b/paddle/fluid/operators/detection/poly_util.cc
@@ -16,13 +16,14 @@ limitations under the License. */
 #define POLY_UTIL_CC_
 
 #include "paddle/fluid/operators/detection/poly_util.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
-using gpc::gpc_polygon_clip;
 using gpc::gpc_free_polygon;
+using gpc::gpc_polygon_clip;
 
 template <class T>
 void Array2PointVec(const T*& box, const size_t box_size,
diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h
index f07baf72d9ff0..cc37f00008d33 100644
--- a/paddle/fluid/operators/detection/poly_util.h
+++ b/paddle/fluid/operators/detection/poly_util.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #define POLY_UTIL_H_
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/gpc.h"
 
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
index 4000994beb541..889bc8354bc41 100644
--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc
index bc46ec0b65639..4e49a6ed8521e 100644
--- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc
+++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 limitations under the License. */
 
 #include <glog/logging.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -66,23 +67,26 @@ class RetinanetDetectionOutputOp : public framework::OperatorWithKernel {
     auto im_info_dims = ctx->GetInputDim("ImInfo");
 
     const size_t b_n = bboxes_dims.size();
-    PADDLE_ENFORCE_GT(b_n, 0, platform::errors::InvalidArgument(
-                                  "The number of Variables in Input(BBoxes) "
-                                  "should be greater than 0, "
-                                  "but received number is:%d.",
-                                  b_n));
+    PADDLE_ENFORCE_GT(b_n, 0,
+                      platform::errors::InvalidArgument(
+                          "The number of Variables in Input(BBoxes) "
+                          "should be greater than 0, "
+                          "but received number is:%d.",
+                          b_n));
     const size_t s_n = scores_dims.size();
-    PADDLE_ENFORCE_GT(s_n, 0, platform::errors::InvalidArgument(
-                                  "The number of Variables in Input(Scores) "
-                                  "should be greater than 0, "
-                                  "but received number is:%d.",
-                                  s_n));
+    PADDLE_ENFORCE_GT(s_n, 0,
+                      platform::errors::InvalidArgument(
+                          "The number of Variables in Input(Scores) "
+                          "should be greater than 0, "
+                          "but received number is:%d.",
+                          s_n));
     const size_t a_n = anchors_dims.size();
-    PADDLE_ENFORCE_GT(a_n, 0, platform::errors::InvalidArgument(
-                                  "The number of Variables in Input(Anchors) "
-                                  "should be greater than 0, "
-                                  "but received number is:%d.",
-                                  a_n));
+    PADDLE_ENFORCE_GT(a_n, 0,
+                      platform::errors::InvalidArgument(
+                          "The number of Variables in Input(Anchors) "
+                          "should be greater than 0, "
+                          "but received number is:%d.",
+                          a_n));
     auto bbox_dims = bboxes_dims[0];
     auto score_dims = scores_dims[0];
     auto anchor_dims = anchors_dims[0];
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
index 353d17a6e09f2..eb6d6c6db9284 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -40,8 +41,8 @@ bool GT(T a, T b) {
 }
 
 /*
-*check if (x, y) is in the boundary of roi
-*/
+ *check if (x, y) is in the boundary of roi
+ */
 template <typename T>
 bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
   for (int i = 0; i < 4; i++) {
@@ -431,10 +432,9 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
               T matrix[9];
               get_transform_matrix<T>(transformed_width, transformed_height,
                                       roi_x, roi_y, matrix);
-              const T* out_grad_ptr = out_grad_data +
-                                      (roi_idx * channels + c) *
-                                          transformed_height *
-                                          transformed_width;
+              const T* out_grad_ptr = out_grad_data + (roi_idx * channels + c) *
+                                                          transformed_height *
+                                                          transformed_width;
               for (int out_h = 0; out_h < transformed_height; ++out_h) {
                 for (int out_w = 0; out_w < transformed_width; ++out_w) {
                   T src_w;
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
index 515a4bbac59c2..1bff79606d44b 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 
 namespace paddle {
 namespace operators {
@@ -56,8 +57,8 @@ __device__ T min(T a, T b) {
 }
 
 /*
-* check if (x, y) is in the boundary of roi
-*/
+ * check if (x, y) is in the boundary of roi
+ */
 template <typename T>
 __device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
   for (int i = 0; i < 4; i++) {
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index e96c0bbc27290..b636decdfbff3 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <random>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc
index 8526f1762cdc9..31f3dab81fef6 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
index 518295958630c..fcb7ec1fbfee0 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <limits>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc
index 35e389090175f..ae7dfe0dd6685 100644
--- a/paddle/fluid/operators/detection/yolo_box_op.cc
+++ b/paddle/fluid/operators/detection/yolo_box_op.cc
@@ -36,10 +36,11 @@ class YoloBoxOp : public framework::OperatorWithKernel {
     auto iou_aware = ctx->Attrs().Get<bool>("iou_aware");
     auto iou_aware_factor = ctx->Attrs().Get<float>("iou_aware_factor");
 
-    PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument(
-                                           "Input(X) should be a 4-D tensor."
-                                           "But received X dimension(%s)",
-                                           dim_x.size()));
+    PADDLE_ENFORCE_EQ(
+        dim_x.size(), 4,
+        platform::errors::InvalidArgument("Input(X) should be a 4-D tensor."
+                                          "But received X dimension(%s)",
+                                          dim_x.size()));
     if (iou_aware) {
       PADDLE_ENFORCE_EQ(
           dim_x[1], anchor_num * (6 + class_num),
@@ -245,11 +246,10 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     YoloBoxInferShapeFunctor);
 
-REGISTER_OP_VERSION(yolo_box)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(yolo_box).AddCheckpoint(
+    R"ROC(
       Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor].
     )ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr("iou_aware", "Whether use iou aware", false)
-            .NewAttr("iou_aware_factor", "iou aware factor", 0.5f));
+    paddle::framework::compatible::OpVersionDesc()
+        .NewAttr("iou_aware", "Whether use iou aware", false)
+        .NewAttr("iou_aware_factor", "iou aware factor", 0.5f));
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
index 21044734ca801..2170fd0639fcb 100644
--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -10,6 +10,7 @@
    limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/type_defs.h"
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index 588967f0832a9..aa4695cc97556 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection_map_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index 4dd41837f060e..a034572a0c481 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc
index 6959b5cf81106..ec5a51bbffa59 100644
--- a/paddle/fluid/operators/determinant_op.cc
+++ b/paddle/fluid/operators/determinant_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/determinant_op.h"
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/backward.h"
diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h
index 702ff3bfd87b0..d4c05b631e3bb 100644
--- a/paddle/fluid/operators/determinant_op.h
+++ b/paddle/fluid/operators/determinant_op.h
@@ -18,6 +18,7 @@
 #include <algorithm>
 #include <cmath>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/for_range.h"
diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
index 85a29271b13b5..f60380f047591 100644
--- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc
@@ -10,10 +10,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <string>
-
 #include "paddle/fluid/operators/dgc_clip_by_norm_op.h"
 
+#include <string>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc
index 5fe66fa38a83b..95d3f75de9a02 100644
--- a/paddle/fluid/operators/dgc_op.cc
+++ b/paddle/fluid/operators/dgc_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dgc_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h
index b1bf5e2778167..91093f67e0536 100644
--- a/paddle/fluid/operators/dgc_op.h
+++ b/paddle/fluid/operators/dgc_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
-#include "dgc/dgc.h"
 
+#include "dgc/dgc.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
@@ -118,10 +118,12 @@ class DGCOpKernel : public framework::OpKernel<T> {
         1 - get_period_sparcity(
                 sparsity, static_cast<float>(*current_step - rampup_begin_step),
                 rampup_step);
-    PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument(
-                                      "DGC sparsity ratio must >= 0"));
-    PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument(
-                                      "DGC sparsity ratio must < 1"));
+    PADDLE_ENFORCE_GE(
+        ratio, 0.0,
+        platform::errors::InvalidArgument("DGC sparsity ratio must >= 0"));
+    PADDLE_ENFORCE_LT(
+        ratio, 1.0,
+        platform::errors::InvalidArgument("DGC sparsity ratio must < 1"));
     int k = static_cast<int>(g->numel() * ratio);
 
     VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu
index 7e3ab6be664cb..a9d92fdf634a7 100644
--- a/paddle/fluid/operators/diag_embed_op.cu
+++ b/paddle/fluid/operators/diag_embed_op.cu
@@ -14,6 +14,7 @@
 
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/diag_embed_op.h"
 
diff --git a/paddle/fluid/operators/diag_embed_op.h b/paddle/fluid/operators/diag_embed_op.h
index a5621be3baa27..b07047996d513 100644
--- a/paddle/fluid/operators/diag_embed_op.h
+++ b/paddle/fluid/operators/diag_embed_op.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/for_range.h"
diff --git a/paddle/fluid/operators/dirichlet_op.h b/paddle/fluid/operators/dirichlet_op.h
index 540acad423aa3..658688816eb8f 100644
--- a/paddle/fluid/operators/dirichlet_op.h
+++ b/paddle/fluid/operators/dirichlet_op.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <cmath>
 #include <random>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/for_range.h"
 
diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc
index 55b2484941293..6f897bff75c24 100644
--- a/paddle/fluid/operators/dist_op.cc
+++ b/paddle/fluid/operators/dist_op.cc
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt
index 4fe9cf214eaa7..11347f0f94e5c 100644
--- a/paddle/fluid/operators/dlnne/CMakeLists.txt
+++ b/paddle/fluid/operators/dlnne/CMakeLists.txt
@@ -1,39 +1,30 @@
 # compile flags
-set(DLNNE_FLAGS
-  -Wno-error=non-virtual-dtor
-  -Wno-error=unused-variable
-  -Wno-error=attributes
-  ${fsanitize}
-)
+set(DLNNE_FLAGS -Wno-error=non-virtual-dtor -Wno-error=unused-variable
+                -Wno-error=attributes ${fsanitize})
 foreach(flag ${DLNNE_FLAGS})
   safe_set_cflag(CMAKE_C_FLAGS ${flag})
   safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
 endforeach()
 
-
 # add nne
-find_path(DLNNE_INCLUDE_DIR dlnne.h
-  PATHS
-  $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include
-  NO_DEFAULT_PATH
-)
-
-find_library(DLNNE_LIB libdlnne.so
-  PATHS
-  $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne
-  NO_DEFAULT_PATH
-)
+find_path(
+  DLNNE_INCLUDE_DIR dlnne.h
+  PATHS $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include
+  NO_DEFAULT_PATH)
 
-find_path(CUDA_INCLUDE_DIR cuda.h
-  $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include
-)
+find_library(
+  DLNNE_LIB libdlnne.so
+  PATHS $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne
+  NO_DEFAULT_PATH)
 
-find_library(CURT_LIB libcurt.so
-  PATHS
-  $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib
-  NO_DEFAULT_PATH
-)
+find_path(CUDA_INCLUDE_DIR cuda.h
+          $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include)
 
+find_library(
+  CURT_LIB libcurt.so
+  PATHS $ENV{SOFTWARE_BUILD_DIR}
+        $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib
+  NO_DEFAULT_PATH)
 
 message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR})
 message("DLNNE_LIB: "${DLNNE_LIB})
@@ -43,7 +34,15 @@ message("CURT_LIB: "${CURT_LIB})
 include_directories("${DLNNE_INCLUDE_DIR}")
 include_directories("${CUDA_INCLUDE_DIR}")
 
-op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost device_context op_registry scope)
+op_library(
+  dlnne_engine_op
+  DEPS
+  ${GLOB_OPERATOR_DEPS}
+  framework_proto
+  boost
+  device_context
+  op_registry
+  scope)
 
 #message("PYBIND_FILE:${pybind_file}")
 #file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(dlnne_engine);\n")
@@ -51,4 +50,7 @@ op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost devi
 
 target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB})
 
-cc_test(test_dlnne_engine_op SRCS dlnne_engine_op_test.cc DEPS dlnne_engine_op analysis)
+cc_test(
+  test_dlnne_engine_op
+  SRCS dlnne_engine_op_test.cc
+  DEPS dlnne_engine_op analysis)
diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h
index 6b2622366fedc..857f295326b94 100644
--- a/paddle/fluid/operators/dlnne/dlnne_engine_op.h
+++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #pragma once
+#include <assert.h>
 #include <cuda.h>          // NOTLINT
 #include <cuda_runtime.h>  // NOTLINT
 #include <dlnne.h>         // NOTLINT
 
-#include <assert.h>
 #include <ctime>
 #include <fstream>
 #include <iostream>
@@ -128,11 +128,13 @@ class DlnneEngineOp : public framework::OperatorBase {
              << ".onnx";
 
     builder = dl::nne::CreateInferBuilder();
-    PADDLE_ENFORCE_NE(builder, nullptr, platform::errors::Unavailable(
-                                            "nne create builder failed"));
+    PADDLE_ENFORCE_NE(
+        builder, nullptr,
+        platform::errors::Unavailable("nne create builder failed"));
     parser = dl::nne::CreateParser();
-    PADDLE_ENFORCE_NE(parser, nullptr, platform::errors::Unavailable(
-                                           "nne create parser failed"));
+    PADDLE_ENFORCE_NE(
+        parser, nullptr,
+        platform::errors::Unavailable("nne create parser failed"));
 
     network = builder->CreateNetwork();
 
diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc
index 611366f6c5b8a..8e1d7fe5d815a 100644
--- a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc
+++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/dlnne/dlnne_engine_op.h"
+
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_desc.h"
diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
index 6af8c925ff580..c40f6c0bbaea0 100644
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -19,11 +19,13 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <curand_kernel.h>
+
 #include "paddle/fluid/platform/dynload/curand.h"
 #endif
 #ifdef PADDLE_WITH_HIP
 #include <hip/hip_runtime.h>
 #include <hiprand_kernel.h>
+
 #include "paddle/fluid/platform/dynload/hiprand.h"
 #endif
 
@@ -34,6 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/dropout_impl_util.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/functors.h"
@@ -195,9 +198,11 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
     size_t main_offset =
         size / (block_size * kVecSize) * (block_size * kVecSize);
 
-    VectorizedRandomGenerator<T, uint8_t><<<grid_size, block_size, 0, stream>>>(
-        size, seed_data, dropout_prob, x_data, mask_data, y_data,
-        upscale_in_train, increment, main_offset);
+    PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(
+        !is_fix_seed, (VectorizedRandomGenerator<T, uint8_t>), grid_size,
+        block_size, 0, stream, offset, KERNEL_PARAMS.As<uint64_t>(1),
+        KERNEL_PARAMS.As<uint64_t>(7), size, seed_data, dropout_prob, x_data,
+        mask_data, y_data, upscale_in_train, increment, main_offset);
   } else {
     if (upscale_in_train) {
 // todo: can y share with data with x directly?
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 8d033ea3194b9..9426efa494208 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/binary.h"
diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc
index 851f26ee0e717..24de99d6d8f85 100644
--- a/paddle/fluid/operators/dropout_op_xpu.cc
+++ b/paddle/fluid/operators/dropout_op_xpu.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 namespace paddle {
diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc
index db8a107290eb6..8127895569f6f 100644
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
@@ -37,12 +37,13 @@ class EditDistanceOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           hyp_dims.size() == 2 && ref_dims.size() == 2 &&
               hyp_dims[0] == ref_dims[0],
-          true, platform::errors::InvalidArgument(
-                    "Input(Hyps) and Input(Refs) must be 2-D Tensors with "
-                    "identical first dimension. But received Input(Hyps): "
-                    "input rank %u, input shape [%s]; received Input(Refs): "
-                    "input rank %u, input shape [%s]",
-                    hyp_dims.size(), hyp_dims, ref_dims.size(), ref_dims));
+          true,
+          platform::errors::InvalidArgument(
+              "Input(Hyps) and Input(Refs) must be 2-D Tensors with "
+              "identical first dimension. But received Input(Hyps): "
+              "input rank %u, input shape [%s]; received Input(Refs): "
+              "input rank %u, input shape [%s]",
+              hyp_dims.size(), hyp_dims, ref_dims.size(), ref_dims));
       PADDLE_ENFORCE_EQ(
           hyp_length_dims[0] == ref_length_dims[0] &&
               hyp_length_dims[0] == hyp_dims[0],
diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu
index 49ac7183ff3b0..eb208c559cef6 100644
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/edit_distance_op.h"
diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h
index ef290c2eff2be..101e3a90b80d3 100644
--- a/paddle/fluid/operators/edit_distance_op.h
+++ b/paddle/fluid/operators/edit_distance_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc
index 6f1737dba819c..5239248d82f1f 100644
--- a/paddle/fluid/operators/eig_op.cc
+++ b/paddle/fluid/operators/eig_op.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/eig_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -32,10 +34,11 @@ class EigOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     int rank = x_dims.size();
-    PADDLE_ENFORCE_GE(rank, 2, platform::errors::InvalidArgument(
-                                   "Expects input tensor x to be not less than "
-                                   "2 dimentions, but got dimention %d",
-                                   rank));
+    PADDLE_ENFORCE_GE(rank, 2,
+                      platform::errors::InvalidArgument(
+                          "Expects input tensor x to be not less than "
+                          "2 dimentions, but got dimention %d",
+                          rank));
     PADDLE_ENFORCE_EQ(x_dims[rank - 2], x_dims[rank - 1],
                       platform::errors::InvalidArgument(
                           "The input matrix must be a square matrix, "
diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h
index fe898a6c41c2a..0f9afae8267bf 100644
--- a/paddle/fluid/operators/eig_op.h
+++ b/paddle/fluid/operators/eig_op.h
@@ -15,8 +15,10 @@
 #pragma once
 
 #include <math.h>
+
 #include <algorithm>
 #include <complex>
+
 #include "paddle/fluid/operators/math/matrix_solve.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
diff --git a/paddle/fluid/operators/eigvals_op.cc b/paddle/fluid/operators/eigvals_op.cc
index 2ef591dd26a06..177dc684662f5 100644
--- a/paddle/fluid/operators/eigvals_op.cc
+++ b/paddle/fluid/operators/eigvals_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/eigvals_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h
index 4627acc0d07de..d75b33e0857bc 100644
--- a/paddle/fluid/operators/eigvals_op.h
+++ b/paddle/fluid/operators/eigvals_op.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
@@ -71,14 +72,16 @@ static void SpiltBatchSquareMatrix(const Tensor& input,
 }
 
 static void CheckLapackEigResult(const int info, const std::string& name) {
-  PADDLE_ENFORCE_LE(info, 0, platform::errors::PreconditionNotMet(
-                                 "The QR algorithm failed to compute all the "
-                                 "eigenvalues in function %s.",
-                                 name.c_str()));
+  PADDLE_ENFORCE_LE(info, 0,
+                    platform::errors::PreconditionNotMet(
+                        "The QR algorithm failed to compute all the "
+                        "eigenvalues in function %s.",
+                        name.c_str()));
   PADDLE_ENFORCE_GE(
-      info, 0, platform::errors::InvalidArgument(
-                   "The %d-th argument has an illegal value in function %s.",
-                   -info, name.c_str()));
+      info, 0,
+      platform::errors::InvalidArgument(
+          "The %d-th argument has an illegal value in function %s.", -info,
+          name.c_str()));
 }
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc
index 6da0045443ccc..7fc19d6913f83 100644
--- a/paddle/fluid/operators/einsum_op.cc
+++ b/paddle/fluid/operators/einsum_op.cc
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/elementwise/CMakeLists.txt b/paddle/fluid/operators/elementwise/CMakeLists.txt
index 216a3f79d6f92..25b34a2c0a2c3 100644
--- a/paddle/fluid/operators/elementwise/CMakeLists.txt
+++ b/paddle/fluid/operators/elementwise/CMakeLists.txt
@@ -1,14 +1,32 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/elementwise.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/elementwise.
+  include(unity_build_rule.cmake)
 endif()
 register_operators(DEPS op_version_registry)
 
-cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
-cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor)
-cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
+cc_test(
+  test_elementwise_add_op_inplace
+  SRCS test_elementwise_add_op_inplace.cc
+  DEPS op_registry elementwise_add_op scope device_context enforce executor)
+cc_test(
+  test_elementwise_div_grad_grad
+  SRCS test_elementwise_div_grad_grad.cc
+  DEPS op_registry elementwise_div_op scope device_context enforce executor)
+cc_test(
+  test_elementwise_add_grad_grad
+  SRCS test_elementwise_add_grad_grad.cc
+  DEPS op_registry elementwise_add_op scope device_context enforce executor)
 
 if(WITH_ASCEND_CL)
-cc_test(elementwise_op_npu_test SRCS elementwise_op_npu_test.cc DEPS op_registry elementwise_add_op elementwise_sub_op scope device_context enforce executor)
+  cc_test(
+    elementwise_op_npu_test
+    SRCS elementwise_op_npu_test.cc
+    DEPS op_registry
+         elementwise_add_op
+         elementwise_sub_op
+         scope
+         device_context
+         enforce
+         executor)
 endif()
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
index 22a5de4c60941..9c1a84ba8b67f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include <memory>
 #include <string>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc
index 13fd9b81a8765..e0523a26ee3ce 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index e9adb9abdb528..b3363862d5f97 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc
index e003a43b5c56b..ebdebb2f4852a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h
index ff1e12103be91..8c230c5f47bf6 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mlu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h
@@ -16,6 +16,7 @@
 
 #ifdef PADDLE_WITH_MLU
 #include <vector>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc
index 156589384c0dd..19d28301ffb83 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc
@@ -15,11 +15,11 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-
 #include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
index 45b6f7cb39194..253014a79817a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
+
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/platform/complex.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index e2dd0e36d400a..39045bf0d5904 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/platform/cpu_info.h"
-
 #include "paddle/phi/kernels/elementwise_kernel.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 80b07721f0b4d..476b891bb419d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -28,7 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_functor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/transform.h"
-
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/cpu/elementwise_grad.h"
@@ -60,14 +59,14 @@ namespace paddle {
 namespace operators {
 
 /*
-*  Pack input and output tensors into respective vectors with
-*  consideration of varible X`s class type.
-*  Input variable X is supported to be whether LoDTensor or
-*  SelectedRows class type in this package function, once X
-*  was SelectedRows type, a valid pointer x_for_selectedrows
-*  is excepted to be passed in from op kernel for acquisition
-*  of the valid address of LoDTensor created ahead in the function.
-*/
+ *  Pack input and output tensors into respective vectors with
+ *  consideration of variable X's class type.
+ *  Input variable X may be of either LoDTensor or
+ *  SelectedRows class type in this package function; if X
+ *  is of SelectedRows type, a valid pointer x_for_selectedrows
+ *  is expected to be passed in from the op kernel for acquisition
+ *  of the valid address of the LoDTensor created ahead in the function.
+ */
 template <typename OutT>
 int PackTensorsIntoVector(const framework::ExecutionContext &ctx,
                           std::vector<const framework::Tensor *> *ins,
@@ -327,10 +326,11 @@ static void FusedElemwiseAndActBroadcast1CUDA(gpuStream_t stream, const T *x,
                                               T *intermediate_out) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, w);
   int gird_size = h;
-  FusedElemwiseAndActBroadcast1CUDAKernel<
-      T, CompoundFunctor, BcastY, KeepIntermediateOut,
-      SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
-      x, y, h, w, compound_functor, out, intermediate_out);
+  FusedElemwiseAndActBroadcast1CUDAKernel<T, CompoundFunctor, BcastY,
+                                          KeepIntermediateOut,
+                                          SameShapeOfIntermediateOutAndOut>
+      <<<gird_size, block_size, 0, stream>>>(x, y, h, w, compound_functor, out,
+                                             intermediate_out);
 }
 
 template <typename T, typename CompoundFunctor, bool BcastY,
@@ -385,10 +385,11 @@ static void FusedElemwiseAndActBroadcast2CUDA(gpuStream_t stream, const T *x,
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
   int gird_size = n;
 
-  FusedElemwiseAndActBroadcast2CUDAKernel<
-      T, CompoundFunctor, BcastY, KeepIntermediateOut,
-      SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
-      x, y, compound_functor, pre, n, post, out, intermediate_out);
+  FusedElemwiseAndActBroadcast2CUDAKernel<T, CompoundFunctor, BcastY,
+                                          KeepIntermediateOut,
+                                          SameShapeOfIntermediateOutAndOut>
+      <<<gird_size, block_size, 0, stream>>>(x, y, compound_functor, pre, n,
+                                             post, out, intermediate_out);
 }
 
 #endif
@@ -544,8 +545,9 @@ void FusedElemwiseAndActGradComputeNoBroadcast(
       out->data<T>(), dout->data<T>(), dx_op, dy_op, dintermediate_op,
       dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
       dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-      dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                               ctx.GetPlace())});
+      dintermediate == nullptr
+          ? nullptr
+          : dintermediate->mutable_data<T>(ctx.GetPlace())});
 }
 
 template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
@@ -605,12 +607,11 @@ static void FusedElemwiseAndActGradBroadcast1CPU(
         }
       }
       if (d_intermediate != nullptr) {
-        T tmp = UseIntermediateOut
-                    ? dintermediate_op.UseIntermediateOut(
-                          x_val, intermediate_out[tmp_out_idx], out[offset],
-                          dout[offset])
-                    : dintermediate_op.Recompute(x_val, y_val, out[offset],
-                                                 dout[i]);
+        T tmp = UseIntermediateOut ? dintermediate_op.UseIntermediateOut(
+                                         x_val, intermediate_out[tmp_out_idx],
+                                         out[offset], dout[offset])
+                                   : dintermediate_op.Recompute(
+                                         x_val, y_val, out[offset], dout[i]);
         if (SameShapeOfIntermediateOutAndOut) {
           d_intermediate[tmp_out_idx] = tmp;
         } else {
@@ -686,12 +687,11 @@ static void FusedElemwiseAndActGradBroadcast2CPU(
           }
         }
         if (d_intermediate != nullptr) {
-          T tmp = UseIntermediateOut
-                      ? dintermediate_op.UseIntermediateOut(
-                            x_val, intermediate_out[tmp_out_idx], out[offset],
-                            dout[offset])
-                      : dintermediate_op.Recompute(x_val, y_val, out[offset],
-                                                   dout[i]);
+          T tmp = UseIntermediateOut ? dintermediate_op.UseIntermediateOut(
+                                           x_val, intermediate_out[tmp_out_idx],
+                                           out[offset], dout[offset])
+                                     : dintermediate_op.Recompute(
+                                           x_val, y_val, out[offset], dout[i]);
           if (SameShapeOfIntermediateOutAndOut) {
             d_intermediate[tmp_out_idx] = tmp;
           } else {
@@ -835,11 +835,12 @@ static void FusedElemwiseAndActGradBroadcast1CUDA(
   int theory_block = (w + BLOCK_X - 1) / BLOCK_X;
   dim3 grids(std::min(theory_block, max_blocks));
 
-  FusedElemwiseAndActGradBroadcast1CUDAKernel<
-      T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY,
-      SameShapeOfIntermediateOutAndOut><<<grids, blocks, 0, stream>>>(
-      x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dintermediate_op,
-      dx, dy, d_intermediate);
+  FusedElemwiseAndActGradBroadcast1CUDAKernel<T, DX_OP, DY_OP, DIntermediate_OP,
+                                              UseIntermediateOut, BcastY,
+                                              SameShapeOfIntermediateOutAndOut>
+      <<<grids, blocks, 0, stream>>>(x, y, intermediate_out, out, dout, h, w,
+                                     dx_op, dy_op, dintermediate_op, dx, dy,
+                                     d_intermediate);
 }
 
 template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
@@ -899,12 +900,11 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
       }
     }
     if (d_intermediate != nullptr) {
-      T tmp = UseIntermediateOut
-                  ? dintermediate_op.UseIntermediateOut(
-                        y_val, intermediate_out[tmp_out_idx], out[offset],
-                        dout[offset])
-                  : dintermediate_op.Recompute(x_val, y_val, out[offset],
-                                               dout[offset]);
+      T tmp = UseIntermediateOut ? dintermediate_op.UseIntermediateOut(
+                                       y_val, intermediate_out[tmp_out_idx],
+                                       out[offset], dout[offset])
+                                 : dintermediate_op.Recompute(
+                                       x_val, y_val, out[offset], dout[offset]);
       if (SameShapeOfIntermediateOutAndOut) {
         d_intermediate[tmp_out_idx] = tmp;
       } else {
@@ -951,11 +951,12 @@ static void FusedElemwiseAndActGradBroadcast2CUDA(
     T *dintermediate) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
   int gird_size = n;
-  FusedElemwiseAndActGradBroadcast2CUDAKernel<
-      T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY,
-      SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
-      x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op,
-      dintermediate_op, dx, dy, dintermediate);
+  FusedElemwiseAndActGradBroadcast2CUDAKernel<T, DX_OP, DY_OP, DIntermediate_OP,
+                                              UseIntermediateOut, BcastY,
+                                              SameShapeOfIntermediateOutAndOut>
+      <<<gird_size, block_size, 0, stream>>>(
+          x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op,
+          dintermediate_op, dx, dy, dintermediate);
 }
 #endif
 
@@ -995,8 +996,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
           out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, dintermediate_op,
           dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
           dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
+          dintermediate == nullptr
+              ? nullptr
+              : dintermediate->mutable_data<T>(ctx.GetPlace()));
 #endif
     } else {
       FusedElemwiseAndActGradBroadcast1CPU<T, DX_OP, DY_OP, DIntermediate_OP,
@@ -1007,8 +1009,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
           out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, dintermediate_op,
           dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
           dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
+          dintermediate == nullptr
+              ? nullptr
+              : dintermediate->mutable_data<T>(ctx.GetPlace()));
     }
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
@@ -1022,8 +1025,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
           dintermediate_op,
           dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
           dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
+          dintermediate == nullptr
+              ? nullptr
+              : dintermediate->mutable_data<T>(ctx.GetPlace()));
 #endif
     } else {
       FusedElemwiseAndActGradBroadcast2CPU<T, DX_OP, DY_OP, DIntermediate_OP,
@@ -1035,8 +1039,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
           dintermediate_op,
           dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
           dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
-          dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
-                                                   ctx.GetPlace()));
+          dintermediate == nullptr
+              ? nullptr
+              : dintermediate->mutable_data<T>(ctx.GetPlace()));
     }
   }
 }
diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h
index db5c94b9d1a6e..3f38450581ec8 100644
--- a/paddle/fluid/operators/elementwise/elementwise_xpu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <tuple>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
 #include "xpu/refactor/math.h"
@@ -32,8 +33,9 @@ void XPUElementwise(
                       const std::vector<int>&, const std::vector<int>&)>
         func) {
   auto x_var = ctx.InputVar("X");
-  PADDLE_ENFORCE_NE(x_var, nullptr, platform::errors::InvalidArgument(
-                                        "Cannot get input Variable X"));
+  PADDLE_ENFORCE_NE(
+      x_var, nullptr,
+      platform::errors::InvalidArgument("Cannot get input Variable X"));
   PADDLE_ENFORCE_EQ(
       x_var->IsType<framework::LoDTensor>(), true,
       platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
index 3cecc52a3c481..f647bd91d5f3d 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc
@@ -18,6 +18,7 @@
 #include <random>
 #include <string>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -72,11 +73,12 @@ class TestElementwiseDivGradGradWithoutDout
 
   std::unique_ptr<framework::OperatorBase> CreateTestOp() override {
     auto op = framework::OpRegistry::CreateOp(
-        this->op_type_, {{"Y", {"Y"}},
-                         {"Out", {"Out"}},
-                         {"DDX", {"DDX"}},
-                         {"DDY", {"DDY"}},
-                         {"DX", {"DX"}}},
+        this->op_type_,
+        {{"Y", {"Y"}},
+         {"Out", {"Out"}},
+         {"DDX", {"DDX"}},
+         {"DDY", {"DDY"}},
+         {"DX", {"DX"}}},
         {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}},
         {{"use_mkldnn", false}, {"axis", 0}});
     return op;
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
index 05f87e5465abe..7defe4e5793ab 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
+++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
@@ -21,6 +21,7 @@
 #include <random>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/elementwise/unity_build_rule.cmake b/paddle/fluid/operators/elementwise/unity_build_rule.cmake
index ea001fe438545..060c990ea8712 100644
--- a/paddle/fluid/operators/elementwise/unity_build_rule.cmake
+++ b/paddle/fluid/operators/elementwise/unity_build_rule.cmake
@@ -4,25 +4,27 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    elementwise_add_op.cc
-    mkldnn/elementwise_add_mkldnn_op.cc
-    elementwise_div_op.cc
-    elementwise_floordiv_op.cc
-    elementwise_max_op.cc
-    elementwise_min_op.cc
-    elementwise_mod_op.cc
-    elementwise_mul_op.cc
-    mkldnn/elementwise_mul_mkldnn_op.cc
-    elementwise_pow_op.cc
-    elementwise_sub_op.cc)
-register_unity_group(cu
-    elementwise_add_op.cu
-    elementwise_div_op.cu
-    elementwise_floordiv_op.cu
-    elementwise_max_op.cu
-    elementwise_min_op.cu
-    elementwise_mod_op.cu
-    elementwise_mul_op.cu
-    elementwise_pow_op.cu
-    elementwise_sub_op.cu)
+register_unity_group(
+  cc
+  elementwise_add_op.cc
+  mkldnn/elementwise_add_mkldnn_op.cc
+  elementwise_div_op.cc
+  elementwise_floordiv_op.cc
+  elementwise_max_op.cc
+  elementwise_min_op.cc
+  elementwise_mod_op.cc
+  elementwise_mul_op.cc
+  mkldnn/elementwise_mul_mkldnn_op.cc
+  elementwise_pow_op.cc
+  elementwise_sub_op.cc)
+register_unity_group(
+  cu
+  elementwise_add_op.cu
+  elementwise_div_op.cu
+  elementwise_floordiv_op.cu
+  elementwise_max_op.cu
+  elementwise_min_op.cu
+  elementwise_mod_op.cu
+  elementwise_mul_op.cu
+  elementwise_pow_op.cu
+  elementwise_sub_op.cu)
diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc
index 9e0e4e7fe1c6d..0f6c308b211bf 100644
--- a/paddle/fluid/operators/empty_op.cc
+++ b/paddle/fluid/operators/empty_op.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc
index 093c4d8f7930e..cace8b5fdffa7 100644
--- a/paddle/fluid/operators/expand_as_op.cc
+++ b/paddle/fluid/operators/expand_as_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_as_op.h"
+
 #include <memory>
 #include <vector>
 
diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc
old mode 100755
new mode 100644
index 9361edd43bf15..8cdab4c5e1a41
--- a/paddle/fluid/operators/expand_as_v2_op.cc
+++ b/paddle/fluid/operators/expand_as_v2_op.cc
@@ -10,8 +10,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_as_v2_op.h"
+
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/infermeta/binary.h"
@@ -107,7 +109,6 @@ REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp,
                   ops::ExpandAsV2GradNoNeedBufVarsInferer);
 
 REGISTER_OP_VERSION(expand_as_v2)
-    .AddCheckpoint(
-        R"ROC(fix expand_as_v2 and add new input [Y])ROC",
-        paddle::framework::compatible::OpVersionDesc().NewInput(
-            "Y", "Expand X according to the shape of Y"));
+    .AddCheckpoint(R"ROC(fix expand_as_v2 and add new input [Y])ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewInput(
+                       "Y", "Expand X according to the shape of Y"));
diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc
index 67d95e1240022..28fd922d77b81 100644
--- a/paddle/fluid/operators/expand_as_v2_op_npu.cc
+++ b/paddle/fluid/operators/expand_as_v2_op_npu.cc
@@ -30,10 +30,11 @@ class ExpandAsV2NPUKernel : public framework::OpKernel<T> {
                           "expand_as_v2 op must be greater than or equal to "
                           "the rank (%d) of the input 'x'.",
                           target_rank, rank));
-    PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument(
-                                   "The rank (%d) of the input 'x' for "
-                                   "expand_as_v2 op must be positive.",
-                                   rank));
+    PADDLE_ENFORCE_GE(
+        rank, 1,
+        platform::errors::InvalidArgument("The rank (%d) of the input 'x' for "
+                                          "expand_as_v2 op must be positive.",
+                                          rank));
     PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED,
                       platform::errors::InvalidArgument(
                           "The rank (%d) of the input 'target_tensor' for "
diff --git a/paddle/fluid/operators/expand_as_v2_op_xpu.cc b/paddle/fluid/operators/expand_as_v2_op_xpu.cc
index 0912b280aa6c7..fc3d77f3cc82c 100644
--- a/paddle/fluid/operators/expand_as_v2_op_xpu.cc
+++ b/paddle/fluid/operators/expand_as_v2_op_xpu.cc
@@ -33,10 +33,11 @@ class ExpandAsV2XPUKernel : public framework::OpKernel<T> {
                           "expand_as_v2 op must be greater than or equal to "
                           "the rank (%d) of the input 'x'.",
                           target_rank, rank));
-    PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument(
-                                   "The rank (%d) of the input 'x' for "
-                                   "expand_as_v2 op must be positive.",
-                                   rank));
+    PADDLE_ENFORCE_GE(
+        rank, 1,
+        platform::errors::InvalidArgument("The rank (%d) of the input 'x' for "
+                                          "expand_as_v2 op must be positive.",
+                                          rank));
     PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED,
                       platform::errors::InvalidArgument(
                           "The rank (%d) of the input 'target_tensor' for "
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index e45761112d4bd..04cdbd5a60615 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
index 05cd893b057af..880adad743fa3 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -213,12 +213,13 @@ class ExpandGradKernel : public framework::OpKernel<T> {
       framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
                             out0);
     } else {
-      PADDLE_ENFORCE_GE(dims, 1, platform::errors::InvalidArgument(
-                                     "The number of dimensions of the input "
-                                     "'Out@GRAD' for Op(expand_grad)"
-                                     " must be greater than or equal to 1, but "
-                                     "the value received is %d.",
-                                     dims));
+      PADDLE_ENFORCE_GE(dims, 1,
+                        platform::errors::InvalidArgument(
+                            "The number of dimensions of the input "
+                            "'Out@GRAD' for Op(expand_grad)"
+                            " must be greater than or equal to 1, but "
+                            "the value received is %d.",
+                            dims));
       PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED,
                         platform::errors::InvalidArgument(
                             "The number of dimensions of the input 'Out@GRAD' "
diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc
index 292f706cb186b..6aeea745911aa 100644
--- a/paddle/fluid/operators/expand_v2_op.cc
+++ b/paddle/fluid/operators/expand_v2_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_v2_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc
index c9fe19fd091da..c64bdabf59964 100644
--- a/paddle/fluid/operators/expand_v2_op_npu.cc
+++ b/paddle/fluid/operators/expand_v2_op_npu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/expand_v2_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/expand_v2_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/expand_v2_op_xpu.cc b/paddle/fluid/operators/expand_v2_op_xpu.cc
index cb2165c4e922e..3d010c964bcfd 100644
--- a/paddle/fluid/operators/expand_v2_op_xpu.cc
+++ b/paddle/fluid/operators/expand_v2_op_xpu.cc
@@ -13,8 +13,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/expand_v2_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/expand_v2_op.h"
 
 namespace paddle {
 namespace operators {
@@ -110,10 +110,11 @@ class ExpandV2XPUKernel : public framework::OpKernel<T> {
       r = xpu::broadcast<XPUType>(dev_ctx.x_context(), x_data, out_data,
                                   x_shape, out_shape);
     }
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                          "XPU API(broadcast) return wrong "
-                                          "value[%d %s] in ExpandV2XPUKernel.",
-                                          r, XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(broadcast) return wrong "
+                                   "value[%d %s] in ExpandV2XPUKernel.",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
 
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 8172f441e64a4..5a3a1cf53deb1 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fake_dequantize_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu.h b/paddle/fluid/operators/fake_dequantize_op.cu.h
index 9859dd4607c15..50f772ec45de8 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu.h
+++ b/paddle/fluid/operators/fake_dequantize_op.cu.h
@@ -119,10 +119,10 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
         quant_stride *= in_dims[i];
       }
 
-      DequantizeOneScaleQuantAxisN<
-          T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-          in_data, scale_factor, max_range, num, in_dims[quant_axis],
-          quant_stride, out_data);
+      DequantizeOneScaleQuantAxisN<T>
+          <<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+              in_data, scale_factor, max_range, num, in_dims[quant_axis],
+              quant_stride, out_data);
     } else if (scale_num == 2) {
       // Not need to consider quant_axis
       int num = in->numel();
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
index aad2c2c7d985a..e623a638922d5 100644
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/ddim.h"
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index ac72f23d46ea8..855c78d2998bd 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fake_quantize_op.h"
+
 #include <algorithm>
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/transform.h"
@@ -832,7 +834,7 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale)
             "Delete output in order to make the inference model not "
             "save moving_average_abs_max_scale operator. This will "
             "make the quantitative model be correctly applied in inference."))
-    .AddCheckpoint(
-        R"ROC(Incompatible upgrade of output [Out])ROC",
-        paddle::framework::compatible::OpVersionDesc().NewOutput(
-            "Out", "In order to support dygraph qat, add output again."));
+    .AddCheckpoint(R"ROC(Incompatible upgrade of output [Out])ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewOutput(
+                       "Out",
+                       "In order to support dygraph qat, add output again."));
diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h
index a6130c272d72b..580521183cbdc 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu.h
+++ b/paddle/fluid/operators/fake_quantize_op.cu.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #endif  // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_
 
 #include <string>
+
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/fake_quantize_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
@@ -80,10 +81,10 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
 
     framework::Tensor max;
     T* max_data = max.mutable_data<T>(phi::make_ddim({grid}), ctx.GetPlace());
-    FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
-        in, num, max_data);
-    FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
-        max_data, grid, out);
+    FindAbsMaxKernel<T>
+        <<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(in, num, max_data);
+    FindAbsMaxKernel<T>
+        <<<1, block, 1024 * sizeof(T), ctx.stream()>>>(max_data, grid, out);
   }
 };
 
@@ -176,9 +177,9 @@ struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
       int cout = in_dims[0];
       int grid = cout;
       int block = 1024;
-      FindChannelAbsMaxKernelQuantAxis0<
-          T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
-          in_data, num, cout, out_abs_max);
+      FindChannelAbsMaxKernelQuantAxis0<T>
+          <<<grid, block, block * sizeof(T), ctx.stream()>>>(in_data, num, cout,
+                                                             out_abs_max);
     } else if (quant_axis == 1) {
       int cin = in_dims[0];
       int cout = in_dims[1];
@@ -193,17 +194,17 @@ struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
 
       for (int i = 0; i < cin / max_threads; i++) {
         int block = max_threads;
-        FindChannelAbsMaxKernelQuantAxis1<
-            T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
-            in_data, num, cin, cout, out_abs_max);
+        FindChannelAbsMaxKernelQuantAxis1<T>
+            <<<grid, block, block * sizeof(T), ctx.stream()>>>(
+                in_data, num, cin, cout, out_abs_max);
         in_data += num / cin;
       }
 
       int block = cin % max_threads;
       if (block > 0) {
-        FindChannelAbsMaxKernelQuantAxis1<
-            T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
-            in_data, num, in_dims[0], in_dims[1], out_abs_max);
+        FindChannelAbsMaxKernelQuantAxis1<T>
+            <<<grid, block, block * sizeof(T), ctx.stream()>>>(
+                in_data, num, in_dims[0], in_dims[1], out_abs_max);
       }
     }
   }
@@ -549,16 +550,16 @@ struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
     if (quant_axis == 0) {
       int grid = in_dims[0];
       int block = 1024;
-      ChannelClipAndQuantDequantKernelQuantAxis0<
-          T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
-                                               num, in_dims[0], out_data);
+      ChannelClipAndQuantDequantKernelQuantAxis0<T>
+          <<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt, num,
+                                             in_dims[0], out_data);
     } else if (quant_axis == 1) {
       int grid = in_dims[0] * in_dims[1];
       int block = 1024;
 
-      ChannelClipAndQuantDequantKernelQuantAxis1<
-          T><<<grid, block, 0, ctx.stream()>>>(
-          in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
+      ChannelClipAndQuantDequantKernelQuantAxis1<T>
+          <<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt, num,
+                                             in_dims[0], in_dims[1], out_data);
     }
   }
 };
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index dc3f081cc9eab..182db11ed847d 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 6e646f0d4bf26..68ef8f3c2be11 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fc_op.h"
+
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index 47c7128603587..1c76c2c36b84e 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
 
diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc
index ec4ba6e926c41..a07fbe5a7a550 100644
--- a/paddle/fluid/operators/fill_any_like_op_xpu.cc
+++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/phi/kernels/full_kernel.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 07593a70f05b7..d6726b99813e6 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fill_constant_op.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index edd8613ba525d..a121eb8cc841b 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -84,9 +84,10 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
       const auto &dev_ctx =
           ctx.template device_context<paddle::platform::NPUDeviceContext>();
       auto op_func = [&shape, &value](
-          const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
-          const NPUAttributeMap &attrs,
-          const platform::NPUDeviceContext &dev_ctx) {
+                         const std::vector<Tensor> &inputs,
+                         const std::vector<Tensor> &outputs,
+                         const NPUAttributeMap &attrs,
+                         const platform::NPUDeviceContext &dev_ctx) {
         Tensor tensor_value;
         tensor_value.mutable_data<uint8_t>({1}, dev_ctx.GetPlace());
         FillNpuTensorWithConstant<uint8_t>(&tensor_value,
diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.h b/paddle/fluid/operators/fill_diagonal_tensor_op.h
index ebb980b66af85..5bee72f526815 100644
--- a/paddle/fluid/operators/fill_diagonal_tensor_op.h
+++ b/paddle/fluid/operators/fill_diagonal_tensor_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index 521ddd4ec12b3..e934b794f8ba7 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fill_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h
index c5cbffbf5c695..7f7e0f2b31aa0 100644
--- a/paddle/fluid/operators/fill_op.h
+++ b/paddle/fluid/operators/fill_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
 #include <vector>
 
-#include <algorithm>
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc
index 2d340829332c8..518d8414c5092 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fill_zeros_like_op.h"
+
 #include "paddle/fluid/platform/complex.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
index 4cb0887c1f326..91809b8cd11bd 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fill_zeros_like_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc
index 02ea2d59ae307..cb1e3083320b4 100644
--- a/paddle/fluid/operators/filter_by_instag_op.cc
+++ b/paddle/fluid/operators/filter_by_instag_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/operators/filter_by_instag_op.h"
 
 #include <memory>
+
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/var_type_inference.h"
 
diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu
index 7870efba4e7a1..75680a61b30eb 100644
--- a/paddle/fluid/operators/filter_by_instag_op.cu
+++ b/paddle/fluid/operators/filter_by_instag_op.cu
@@ -20,6 +20,7 @@
 
 #include <thrust/copy.h>
 #include <thrust/device_vector.h>
+
 #include <cstring>
 #include <random>
 #include <string>
@@ -30,11 +31,10 @@
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/filter_by_instag_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 
-#include "paddle/fluid/operators/filter_by_instag_op.h"
-
 #if defined(PADDLE_WITH_CUDA)
 namespace cg = cooperative_groups;
 #endif
@@ -277,7 +277,7 @@ __global__ void filter_copy_fuse_kernel(
         T* dst = out_data + output_start_idx * x1_embed_size;
         const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size;
         const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size;
-        for (const T *j = src_start; j != src_end; dst++, j++) {
+        for (const T* j = src_start; j != src_end; dst++, j++) {
           *dst = *j;
         }
       }
@@ -306,7 +306,7 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread,
     const T* src_end =
         out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size;
 
-    for (const T *j = src_start; j != src_end; dst++, j++) {
+    for (const T* j = src_start; j != src_end; dst++, j++) {
       *dst = *j;
     }
   }
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
index 3abc980ceaafc..6172fef9b4bba 100644
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ b/paddle/fluid/operators/filter_by_instag_op.h
@@ -20,6 +20,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/mixed_vector.h"
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
index d1ac573b84461..2e767c3705188 100644
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/flatten_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index cacd30cad8a94..6a91cd8b9414a 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc
index e1ee1a86a2f96..b00cbf5c4fc26 100644
--- a/paddle/fluid/operators/flip_op.cc
+++ b/paddle/fluid/operators/flip_op.cc
@@ -93,10 +93,9 @@ REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType,
                   FlipInferShapeFunctor);
 
 /* ==========================  register checkpoint ===========================*/
-REGISTER_OP_VERSION(flip)
-    .AddCheckpoint(
-        R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr("axis", "The added attr 'axis' doesn't set default value.",
-                     paddle::none)
-            .DeleteAttr("dims", "The attr 'dims' is deleted."));
+REGISTER_OP_VERSION(flip).AddCheckpoint(
+    R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC",
+    paddle::framework::compatible::OpVersionDesc()
+        .NewAttr("axis", "The added attr 'axis' doesn't set default value.",
+                 paddle::none)
+        .DeleteAttr("dims", "The attr 'dims' is deleted."));
diff --git a/paddle/fluid/operators/fold_op.h b/paddle/fluid/operators/fold_op.h
index c0aa47a0b4fcc..fd1a7558b7127 100644
--- a/paddle/fluid/operators/fold_op.h
+++ b/paddle/fluid/operators/fold_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc
index 2ff9beb36f284..00c98cae10e1c 100644
--- a/paddle/fluid/operators/frame_op.cc
+++ b/paddle/fluid/operators/frame_op.cc
@@ -33,10 +33,11 @@ class FrameOp : public framework::OperatorWithKernel {
     const int x_rank = x_dims.size();
 
     PADDLE_ENFORCE_GE(
-        x_rank, 1, platform::errors::InvalidArgument(
-                       "Input(X) of FrameOp should be a tensor which contains "
-                       "at least 1 dimension, but got rank %s.",
-                       x_rank));
+        x_rank, 1,
+        platform::errors::InvalidArgument(
+            "Input(X) of FrameOp should be a tensor which contains "
+            "at least 1 dimension, but got rank %s.",
+            x_rank));
     PADDLE_ENFORCE_GT(hop_length, 0,
                       platform::errors::InvalidArgument(
                           "Attribute(hop_length) of FrameOp should be greater "
diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc
index f00ec6a1e140c..16ce2b43bf4e1 100644
--- a/paddle/fluid/operators/fsp_op.cc
+++ b/paddle/fluid/operators/fsp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fsp_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index e23891d899de6..4ffb96d3c51bc 100755
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -1,97 +1,149 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/fused.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/fused.
+  include(unity_build_rule.cmake)
 endif()
-register_operators(EXCLUDES
-    fused_bn_activation_op
-    conv_fusion_op
-    fusion_transpose_flatten_concat_op
-    fusion_conv_inception_op
-    fused_fc_elementwise_layernorm_op
-    multihead_matmul_op
-    skip_layernorm_op
-    yolo_box_head_op
-    yolo_box_post_op
-    fused_embedding_eltwise_layernorm_op
-    fusion_group_op
-    fusion_gru_op
-    fusion_lstm_op
-    fused_bn_add_activation_op
-    fused_attention_op
-    fused_transformer_op
-    fused_feedforward_op
-    fused_multi_transformer_op
-    fused_bias_dropout_residual_layer_norm_op
-    resnet_unit_op
-    fused_gemm_epilogue_op
-    fused_gate_attention_op)
+register_operators(
+  EXCLUDES
+  fused_bn_activation_op
+  conv_fusion_op
+  fusion_transpose_flatten_concat_op
+  fusion_conv_inception_op
+  fused_fc_elementwise_layernorm_op
+  multihead_matmul_op
+  skip_layernorm_op
+  yolo_box_head_op
+  yolo_box_post_op
+  fused_embedding_eltwise_layernorm_op
+  fusion_group_op
+  fusion_gru_op
+  fusion_lstm_op
+  fused_bn_add_activation_op
+  fused_attention_op
+  fused_transformer_op
+  fused_feedforward_op
+  fused_multi_transformer_op
+  fused_bias_dropout_residual_layer_norm_op
+  resnet_unit_op
+  fused_gemm_epilogue_op
+  fused_gate_attention_op)
 
 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
 op_library(fusion_lstm_op)
 
+if(WITH_GPU OR WITH_ROCM)
+  # fused_bn_activation_op needs cudnn 7.4.1 above
+  # HIP not support bn act fuse in MIOPEN
+  if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401))
+    op_library(fused_bn_activation_op)
+  endif()
+  # conv_fusion_op needs cudnn 7 above
+  if(NOT ${CUDNN_VERSION} VERSION_LESS 7100)
+    op_library(conv_fusion_op)
+  endif()
+  # fusion_transpose_flatten_concat_op
+  # HIP not support cudnnTransformTensor
+  if(NOT WITH_ROCM)
+    op_library(fusion_transpose_flatten_concat_op)
+  endif()
+  # fusion_conv_inception_op needs cudnn 7 above
+  # HIP not support cudnnConvolutionBiasActivationForward
+  if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100))
+    op_library(fusion_conv_inception_op)
+  endif()
+  # fused_fc_elementwise_layernorm_op
+  op_library(fused_fc_elementwise_layernorm_op)
+  # multihead_matmul_op
+  op_library(multihead_matmul_op)
+  op_library(skip_layernorm_op)
+  op_library(yolo_box_head_op)
+  op_library(yolo_box_post_op)
+  op_library(fused_embedding_eltwise_layernorm_op)
+  op_library(fused_gate_attention_op)
+  # fusion_group
+  if(NOT APPLE AND NOT WIN32)
+    op_library(fusion_group_op DEPS device_code)
+    cc_test(
+      test_fusion_group_op
+      SRCS fusion_group_op_test.cc
+      DEPS fusion_group_op)
+  endif()
+  # fused_bn_add_activation
+  # HIP not support bn act fuse in MIOPEN
+  if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401))
+    op_library(fused_bn_add_activation_op)
+  endif()
+  # fused_dropout
+  # only support CUDA
+  if(NOT WITH_ROCM)
+    nv_test(
+      test_fused_residual_dropout_bias
+      SRCS fused_residual_dropout_bias_test.cu
+      DEPS tensor
+           op_registry
+           dropout_op
+           layer_norm_op
+           device_context
+           generator
+           memory)
+    nv_test(
+      test_fused_dropout_act_bias
+      SRCS fused_dropout_act_bias_test.cu
+      DEPS tensor
+           op_registry
+           dropout_op
+           layer_norm_op
+           device_context
+           generator
+           memory)
+    nv_test(
+      test_fused_layernorm_residual_dropout_bias
+      SRCS fused_layernorm_residual_dropout_bias_test.cu
+      DEPS tensor
+           op_registry
+           dropout_op
+           layer_norm_op
+           device_context
+           generator
+           memory)
 
-if (WITH_GPU OR WITH_ROCM)
-    # fused_bn_activation_op needs cudnn 7.4.1 above
-    # HIP not support bn act fuse in MIOPEN
-    if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401))
-        op_library(fused_bn_activation_op)
-    endif()
-    # conv_fusion_op needs cudnn 7 above
-    if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
-        op_library(conv_fusion_op)
-    endif()
-    # fusion_transpose_flatten_concat_op
-    # HIP not support cudnnTransformTensor
-    if(NOT WITH_ROCM)
-        op_library(fusion_transpose_flatten_concat_op)
-    endif()
-    # fusion_conv_inception_op needs cudnn 7 above
-    # HIP not support cudnnConvolutionBiasActivationForward
-    if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100))
-        op_library(fusion_conv_inception_op)
-    endif()
-    # fused_fc_elementwise_layernorm_op
-    op_library(fused_fc_elementwise_layernorm_op)
-    # multihead_matmul_op
-    op_library(multihead_matmul_op)
-    op_library(skip_layernorm_op)
-    op_library(yolo_box_head_op)
-    op_library(yolo_box_post_op)
-    op_library(fused_embedding_eltwise_layernorm_op)
-    op_library(fused_gate_attention_op)
-    # fusion_group
-    if(NOT APPLE AND NOT WIN32)
-        op_library(fusion_group_op DEPS device_code)
-        cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op)
-    endif()
-    # fused_bn_add_activation
-    # HIP not support bn act fuse in MIOPEN
-    if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401))
-        op_library(fused_bn_add_activation_op)
-    endif()
-    # fused_dropout
-    # only support CUDA
-    if(NOT WITH_ROCM)
-        nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
-        nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
-        nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
+    op_library(fused_feedforward_op)
+    # fused_attention_op
+    op_library(fused_attention_op)
+    op_library(fused_multi_transformer_op)
+    op_library(fused_bias_dropout_residual_layer_norm_op)
+  endif()
+  # resnet_unit needs cudnn 8.0 above
+  if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000))
+    op_library(resnet_unit_op)
+    cc_test(
+      test_cudnn_norm_conv
+      SRCS cudnn_norm_conv_test.cc
+      DEPS conv_op
+           blas
+           im2col
+           vol2col
+           depthwise_conv
+           eigen_function
+           tensor
+           op_registry
+           device_context
+           generator
+           memory)
+    cc_test(
+      test_cudnn_bn_add_relu
+      SRCS cudnn_bn_add_relu_test.cc
+      DEPS batch_norm_op
+           fused_bn_add_activation_op
+           tensor
+           op_registry
+           device_context
+           generator
+           memory)
+  endif()
 
-        op_library(fused_feedforward_op)
-        # fused_attention_op
-        op_library(fused_attention_op)
-        op_library(fused_multi_transformer_op)
-        op_library(fused_bias_dropout_residual_layer_norm_op)
-    endif()
-    # resnet_unit needs cudnn 8.0 above
-    if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000))
-        op_library(resnet_unit_op)
-        cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory)
-        cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory)
-    endif()
-
-    if (CUDA_VERSION GREATER_EQUAL 11.6)
-        op_library(fused_gemm_epilogue_op)
-    endif()
+  if(CUDA_VERSION GREATER_EQUAL 11.6)
+    op_library(fused_gemm_epilogue_op)
+  endif()
 endif()
diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h
index 43491a9faf18c..b960b83597973 100644
--- a/paddle/fluid/operators/fused/attention_layer_norm.h
+++ b/paddle/fluid/operators/fused/attention_layer_norm.h
@@ -38,11 +38,10 @@ class AttnLayerNorm {
     auto stream = dev_ctx_.stream();
 
     switch (GetDesiredBlockDim(feature_size_)) {
-      FIXED_BLOCK_DIM_CASE(
-          LayerNormForward<T, LayerNormParamType<T>,
-                           kBlockDim><<<batch_size_, kBlockDim, 0, stream>>>(
-              x_data, scale_data, bias_data, y_data, mean_data, var_data,
-              epsilon_, feature_size_));
+      FIXED_BLOCK_DIM_CASE(LayerNormForward<T, LayerNormParamType<T>, kBlockDim>
+                           <<<batch_size_, kBlockDim, 0, stream>>>(
+                               x_data, scale_data, bias_data, y_data, mean_data,
+                               var_data, epsilon_, feature_size_));
       default:
         PADDLE_THROW(platform::errors::InvalidArgument(
             "Feature_size must be larger than 1"));
diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h
index b059223eaf6e7..feac0f7953027 100644
--- a/paddle/fluid/operators/fused/attn_bias_add.cu.h
+++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h
@@ -120,24 +120,24 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
   auto stream = ctx.stream();
   switch (vec_size) {
     case 4: {
-      BroadcastKernelBinary<T, T, 2, 4,
-                            data_per_thread><<<blocks, threads, 0, stream>>>(
-          in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid,
-          func);
+      BroadcastKernelBinary<T, T, 2, 4, data_per_thread>
+          <<<blocks, threads, 0, stream>>>(in0, in1, out, use_broadcast, numel,
+                                           configlists, main_tid, tail_tid,
+                                           func);
       break;
     }
     case 2: {
-      BroadcastKernelBinary<T, T, 2, 2,
-                            data_per_thread><<<blocks, threads, 0, stream>>>(
-          in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid,
-          func);
+      BroadcastKernelBinary<T, T, 2, 2, data_per_thread>
+          <<<blocks, threads, 0, stream>>>(in0, in1, out, use_broadcast, numel,
+                                           configlists, main_tid, tail_tid,
+                                           func);
       break;
     }
     case 1: {
-      BroadcastKernelBinary<T, T, 2, 1,
-                            data_per_thread><<<blocks, threads, 0, stream>>>(
-          in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid,
-          func);
+      BroadcastKernelBinary<T, T, 2, 1, data_per_thread>
+          <<<blocks, threads, 0, stream>>>(in0, in1, out, use_broadcast, numel,
+                                           configlists, main_tid, tail_tid,
+                                           func);
       break;
     }
     default: {
@@ -176,8 +176,8 @@ void Launch1DColumnReduce(gpuStream_t stream, const int max_threads,
   const int block = 256;
   const int max_blocks = std::max(max_threads / block, 1);
   const int grid = std::min(left_num, max_blocks);
-  Compute1DColumnReduceKernel<T, block><<<grid, block, 0, stream>>>(
-      reduce_num, left_num, d_out, d_bias);
+  Compute1DColumnReduceKernel<T, block>
+      <<<grid, block, 0, stream>>>(reduce_num, left_num, d_out, d_bias);
 }
 
 void SetConfigForColumnReduce(const int max_threads, const int reduce_num,
@@ -273,8 +273,8 @@ void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx,
   const auto& stream = dev_ctx.stream();
 
   if (!should_reduce_again) {
-    BiasAddBwSinglePassKernel<T><<<grid, block, 0, stream>>>(d_out, reduce_num,
-                                                             left_num, d_bias);
+    BiasAddBwSinglePassKernel<T>
+        <<<grid, block, 0, stream>>>(d_out, reduce_num, left_num, d_bias);
   } else {
     framework::Tensor tmp_sum;
     tmp_sum.Resize({grid.y, left_num});
diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h
index 304aad16ad0c6..a85b2f99bb157 100644
--- a/paddle/fluid/operators/fused/attn_gemm.h
+++ b/paddle/fluid/operators/fused/attn_gemm.h
@@ -14,12 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc
index 671e94061cb5c..490d92880c9a8 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cc
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index 8191c85f2a120..9ca9f8aaf743f 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <array>
+
 #include "paddle/fluid/framework/conv_search_cache.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
index 516b10fa021c1..09fa3a247e64b 100644
--- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
+++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
@@ -182,19 +182,20 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx,
   std::string data_layout = "NHWC";
   attrs.insert({"data_layout", data_layout});
 
-  auto op = framework::OpRegistry::CreateOp(
-      "batch_norm", {{"X", {"X"}},
-                     {"Scale", {"Scale"}},
-                     {"Bias", {"Bias"}},
-                     {"Mean", {"Mean"}},
-                     {"Variance", {"Variance"}}},
-      {{"Y", {"Y"}},
-       {"MeanOut", {"Mean"}},
-       {"VarianceOut", {"Variance"}},
-       {"SavedMean", {"SavedMean"}},
-       {"SavedVariance", {"SavedVariance"}},
-       {"ReserveSpace", {"ReserveSpace"}}},
-      attrs);
+  auto op =
+      framework::OpRegistry::CreateOp("batch_norm",
+                                      {{"X", {"X"}},
+                                       {"Scale", {"Scale"}},
+                                       {"Bias", {"Bias"}},
+                                       {"Mean", {"Mean"}},
+                                       {"Variance", {"Variance"}}},
+                                      {{"Y", {"Y"}},
+                                       {"MeanOut", {"Mean"}},
+                                       {"VarianceOut", {"Variance"}},
+                                       {"SavedMean", {"SavedMean"}},
+                                       {"SavedVariance", {"SavedVariance"}},
+                                       {"ReserveSpace", {"ReserveSpace"}}},
+                                      attrs);
   op->Run(scope, ctx.GetPlace());
 
   paddle::framework::TensorCopySync(*y, platform::CPUPlace(), cpu_y);
@@ -314,8 +315,9 @@ void ComputeFusedBNAddReluBackward(
   attrs.insert({"epsilon", epsilon});
   attrs.insert({"act_type", act_type});
 
-  auto op = framework::OpRegistry::CreateOp(
-      "fused_bn_add_activation_grad", {{"X", {"X"}},
+  auto op =
+      framework::OpRegistry::CreateOp("fused_bn_add_activation_grad",
+                                      {{"X", {"X"}},
                                        {"Y", {"Y"}},
                                        {"Y@GRAD", {"Y@GRAD"}},
                                        {"Scale", {"Scale"}},
@@ -323,11 +325,11 @@ void ComputeFusedBNAddReluBackward(
                                        {"SavedMean", {"SavedMean"}},
                                        {"SavedVariance", {"SavedVariance"}},
                                        {"ReserveSpace", {"ReserveSpace"}}},
-      {{"X@GRAD", {"X@GRAD"}},
-       {"Z@GRAD", {"Z@GRAD"}},
-       {"Scale@GRAD", {"Scale@GRAD"}},
-       {"Bias@GRAD", {"Bias@GRAD"}}},
-      attrs);
+                                      {{"X@GRAD", {"X@GRAD"}},
+                                       {"Z@GRAD", {"Z@GRAD"}},
+                                       {"Scale@GRAD", {"Scale@GRAD"}},
+                                       {"Bias@GRAD", {"Bias@GRAD"}}},
+                                      attrs);
   op->Run(scope, ctx.GetPlace());
 
   paddle::framework::TensorCopySync(*dx, platform::CPUPlace(), cpu_dx);
diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h
index 13fad0b7cbb3d..a8f700c21199f 100644
--- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h
+++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
index 5881322007add..f4443bba3fdb2 100644
--- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
@@ -167,9 +167,10 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx,
   attrs.insert({"workspace_size_MB", 512});
 
   auto op = framework::OpRegistry::CreateOp(
-      "conv2d_grad", {{"Input", {"Input"}},
-                      {"Filter", {"Filter"}},
-                      {"Output@GRAD", {"Output@GRAD"}}},
+      "conv2d_grad",
+      {{"Input", {"Input"}},
+       {"Filter", {"Filter"}},
+       {"Output@GRAD", {"Output@GRAD"}}},
       {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}},
       attrs);
   op->Run(scope, ctx.GetPlace());
diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h
index 38f9aff226ea9..ce95b0a320c66 100644
--- a/paddle/fluid/operators/fused/fmha_ref.h
+++ b/paddle/fluid/operators/fused/fmha_ref.h
@@ -186,8 +186,9 @@ class FMHARef {
     if (dropout_param_.dropout_prob_) {
       DropoutFwGPUKernelDriver<T>(
           static_cast<const phi::GPUContext&>(dev_ctx_),
-          dropout_param_.is_test_, static_cast<const std::string>(
-                                       dropout_param_.dropout_implementation_),
+          dropout_param_.is_test_,
+          static_cast<const std::string>(
+              dropout_param_.dropout_implementation_),
           dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_,
           dropout_param_.is_fix_seed_, dropout_param_.seed_val_,
           static_cast<const Tensor&>(*softmax_out_tensor), dropout_param_.seed_,
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc
index a1adec9641a6e..06ede8e2c7bdd 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cc
+++ b/paddle/fluid/operators/fused/fused_attention_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -88,12 +89,13 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     // y: qkv's weight: [3, num_head, dim_head, dim_embed]
     auto x_dim = ctx->GetInputDim("X");
     auto y_dim = ctx->GetInputDim("QKVW");
-    PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument(
-                                           "The dimensions of x must be 3"
-                                           "(batch_size, seq_len, dim_embed),"
-                                           "but received dimensions of"
-                                           "Input is [%d]",
-                                           x_dim.size()));
+    PADDLE_ENFORCE_EQ(
+        x_dim.size(), 3,
+        platform::errors::InvalidArgument("The dimensions of x must be 3"
+                                          "(batch_size, seq_len, dim_embed),"
+                                          "but received dimensions of"
+                                          "Input is [%d]",
+                                          x_dim.size()));
     PADDLE_ENFORCE_EQ(y_dim.size(), 4,
                       platform::errors::InvalidArgument(
                           "The dimensions of qkv_weight must be 4"
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index f25bd53992894..73fdd29fd62c3 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -13,21 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cuda_fp16.h>
+
 #include <cub/cub.cuh>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/fused/attention_layer_norm.h"
+#include "paddle/fluid/operators/fused/attn_gemm.h"
+#include "paddle/fluid/operators/fused/fmha_ref.h"
+#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#include "paddle/fluid/operators/fused/attention_layer_norm.h"
-#include "paddle/fluid/operators/fused/attn_gemm.h"
-#include "paddle/fluid/operators/fused/fmha_ref.h"
-#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
-
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
@@ -463,11 +463,13 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
       auto *bias_dropout_residual_out_data =
           bias_dropout_residual_out->data<T>();
       auto *d_ln_2_scale_data =
-          (d_ln_2_scale == nullptr ? nullptr : d_ln_2_scale->mutable_data<U>(
-                                                   ctx.GetPlace()));
+          (d_ln_2_scale == nullptr
+               ? nullptr
+               : d_ln_2_scale->mutable_data<U>(ctx.GetPlace()));
       auto *d_ln_2_bias_data =
-          (d_ln_2_bias == nullptr ? nullptr : d_ln_2_bias->mutable_data<U>(
-                                                  ctx.GetPlace()));
+          (d_ln_2_bias == nullptr
+               ? nullptr
+               : d_ln_2_bias->mutable_data<U>(ctx.GetPlace()));
       auto *d_bias_dropout_residual_out_data =
           d_bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
 
diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
index 781f51d70ec66..56f9afdbe9090 100644
--- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
+++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
index 71a2c9728cc6b..35a48611a74f1 100644
--- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
+++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cuda_fp16.h>
+
 #include <cub/cub.cuh>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc
index 1b3521f14962a..464856003f03f 100644
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_bn_activation_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -70,20 +72,22 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const {
 
   const auto x_dims = ctx->GetInputDim("X");
 
-  PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::PreconditionNotMet(
-                                          "ShapeError: the dimension of input "
-                                          "X must greater than or equal to 2."
-                                          "But received: the shape of input X "
-                                          "= [%s], the dimension of input X ="
-                                          "[%d]",
-                                          x_dims, x_dims.size()));
-  PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::PreconditionNotMet(
-                                          "ShapeError: the dimension of input "
-                                          "X must smaller than or equal to 5."
-                                          "But received: the shape of input X "
-                                          "= [%s], the dimension of input X ="
-                                          "[%d]",
-                                          x_dims, x_dims.size()));
+  PADDLE_ENFORCE_GE(
+      x_dims.size(), 2,
+      platform::errors::PreconditionNotMet("ShapeError: the dimension of input "
+                                           "X must greater than or equal to 2."
+                                           "But received: the shape of input X "
+                                           "= [%s], the dimension of input X ="
+                                           "[%d]",
+                                           x_dims, x_dims.size()));
+  PADDLE_ENFORCE_LE(
+      x_dims.size(), 5,
+      platform::errors::PreconditionNotMet("ShapeError: the dimension of input "
+                                           "X must smaller than or equal to 5."
+                                           "But received: the shape of input X "
+                                           "= [%s], the dimension of input X ="
+                                           "[%d]",
+                                           x_dims, x_dims.size()));
 
   const int64_t C = x_dims[x_dims.size() - 1];
 
@@ -140,22 +144,26 @@ framework::OpKernelType FusedBatchNormActOp::GetExpectedKernelType(
   if (input_data_type == framework::proto::VarType::FP64) {
     bn_param_type = framework::proto::VarType::FP64;
   }
-  PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                       ctx.Input<Tensor>("Scale")->dtype()),
-                    platform::errors::PreconditionNotMet(
-                        "Scale input should be of float type"));
-  PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                       ctx.Input<Tensor>("Bias")->dtype()),
-                    platform::errors::PreconditionNotMet(
-                        "Bias input should be of float type"));
-  PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                       ctx.Input<Tensor>("Mean")->dtype()),
-                    platform::errors::PreconditionNotMet(
-                        "Mean input should be of float type"));
-  PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                       ctx.Input<Tensor>("Variance")->dtype()),
-                    platform::errors::PreconditionNotMet(
-                        "Variance input should be of float type"));
+  PADDLE_ENFORCE_EQ(
+      bn_param_type,
+      framework::TransToProtoVarType(ctx.Input<Tensor>("Scale")->dtype()),
+      platform::errors::PreconditionNotMet(
+          "Scale input should be of float type"));
+  PADDLE_ENFORCE_EQ(
+      bn_param_type,
+      framework::TransToProtoVarType(ctx.Input<Tensor>("Bias")->dtype()),
+      platform::errors::PreconditionNotMet(
+          "Bias input should be of float type"));
+  PADDLE_ENFORCE_EQ(
+      bn_param_type,
+      framework::TransToProtoVarType(ctx.Input<Tensor>("Mean")->dtype()),
+      platform::errors::PreconditionNotMet(
+          "Mean input should be of float type"));
+  PADDLE_ENFORCE_EQ(
+      bn_param_type,
+      framework::TransToProtoVarType(ctx.Input<Tensor>("Variance")->dtype()),
+      platform::errors::PreconditionNotMet(
+          "Variance input should be of float type"));
 
   framework::LibraryType library = framework::LibraryType::kPlain;
   framework::DataLayout layout = framework::DataLayout::kAnyLayout;
diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu
index 9e709c9a01a1c..0ebe21dfc6059 100644
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu
@@ -16,6 +16,7 @@
 #include <cfloat>
 #include <string>
 #include <vector>
+
 #include "cub/cub.cuh"
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/activation_op.h"
@@ -181,8 +182,9 @@ class FusedBatchNormActKernel<platform::CUDADeviceContext, T>
                 ctx.GetPlace()),
             variance_out->template mutable_data<BatchNormParamType<T>>(
                 ctx.GetPlace()),
-            epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
-                         ctx.GetPlace()),
+            epsilon,
+            saved_mean->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
             saved_variance->template mutable_data<BatchNormParamType<T>>(
                 ctx.GetPlace()),
             activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
@@ -343,10 +345,12 @@ class FusedBatchNormActGradKernel<platform::CUDADeviceContext, T>
             /*dBnScaleBiasDesc=*/bn_param_desc_,
             /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
             /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
-            /*dBnScaleData=*/d_scale
-                ->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
-            /*dBnBiasData=*/d_bias
-                ->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+            /*dBnScaleData=*/
+            d_scale->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            /*dBnBiasData=*/
+            d_bias->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
             /*epsilon=*/epsilon,
             /*savedMean=*/saved_mean_data,
             /*savedInvVariance=*/saved_var_data,
diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h
index b8404e4c6553f..da9bca4fc22f7 100644
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
index d667fafb83594..5d06ac19f9e1c 100644
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -52,20 +54,22 @@ void FusedBatchNormAddActOp::InferShape(
                         "of input X = [%s], and the shape of "
                         "input Y = [%s]",
                         x_dims, z_dims));
-  PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument(
-                                          "ShapeError: the dimensions of input "
-                                          "must greater than or equal to 2."
-                                          "But received: the shape of input "
-                                          "= [%s], the dimension of input = "
-                                          "[%d]",
-                                          x_dims, x_dims.size()));
-  PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument(
-                                          "ShapeError: the dimensions of input "
-                                          "must smaller than or equal to 5."
-                                          "But received: the shape of input "
-                                          "= [%s], the dimension of input = "
-                                          "[%d]",
-                                          x_dims, x_dims.size()));
+  PADDLE_ENFORCE_GE(
+      x_dims.size(), 2,
+      platform::errors::InvalidArgument("ShapeError: the dimensions of input "
+                                        "must greater than or equal to 2."
+                                        "But received: the shape of input "
+                                        "= [%s], the dimension of input = "
+                                        "[%d]",
+                                        x_dims, x_dims.size()));
+  PADDLE_ENFORCE_LE(
+      x_dims.size(), 5,
+      platform::errors::InvalidArgument("ShapeError: the dimensions of input "
+                                        "must smaller than or equal to 5."
+                                        "But received: the shape of input "
+                                        "= [%s], the dimension of input = "
+                                        "[%d]",
+                                        x_dims, x_dims.size()));
 
   const int64_t C = x_dims[x_dims.size() - 1];
 
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
index 421c1bacb6633..2f7fc6160122d 100644
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
@@ -16,6 +16,7 @@
 #include <cfloat>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
@@ -160,8 +161,9 @@ class FusedBatchNormAddActKernel<platform::CUDADeviceContext, T>
                 ctx.GetPlace()),
             variance_out->template mutable_data<BatchNormParamType<T>>(
                 ctx.GetPlace()),
-            epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
-                         ctx.GetPlace()),
+            epsilon,
+            saved_mean->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
             saved_variance->template mutable_data<BatchNormParamType<T>>(
                 ctx.GetPlace()),
             activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
index d5e5ae9bda642..07d2e4564b692 100644
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h
old mode 100755
new mode 100644
index 9f5a1bad047b4..f7af7deff5376
--- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h
+++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h
@@ -109,15 +109,15 @@ void LaunchDropoutActBias(Functor act_functor, const uint64_t seed,
   const int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
   const auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size);
   if (cols % VecSize == 0) {
-    FusedDropoutActBias<T, MaskType, VecSize, Functor><<<
-        config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-        act_functor, seed, rows, cols, increment, dropout_prob,
-        is_upscale_in_train, is_test, src, bias, dst, mask_data);
+    FusedDropoutActBias<T, MaskType, VecSize, Functor>
+        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+            act_functor, seed, rows, cols, increment, dropout_prob,
+            is_upscale_in_train, is_test, src, bias, dst, mask_data);
   } else {
-    FusedDropoutActBias<T, MaskType, 1, Functor><<<
-        config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-        act_functor, seed, rows, cols, increment, dropout_prob,
-        is_upscale_in_train, is_test, src, bias, dst, mask_data);
+    FusedDropoutActBias<T, MaskType, 1, Functor>
+        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+            act_functor, seed, rows, cols, increment, dropout_prob,
+            is_upscale_in_train, is_test, src, bias, dst, mask_data);
   }
 }
 
@@ -231,28 +231,28 @@ void LaunchDropoutActBiasGrad(Functor act_functor, const T *dout,
     dim3 block_dim(threads, 128, 1);
     dim3 grid_dim(blocks, 1, 1);
     if (cols % VecSize == 0) {
-      FusedDropoutActBiasGrad<
-          T, MaskType, 8, 128, VecSize,
-          Functor><<<grid_dim, block_dim, 0, ctx.stream()>>>(
-          act_functor, dout, mask, src, bias, factor, rows, cols, dx, dbias);
+      FusedDropoutActBiasGrad<T, MaskType, 8, 128, VecSize, Functor>
+          <<<grid_dim, block_dim, 0, ctx.stream()>>>(act_functor, dout, mask,
+                                                     src, bias, factor, rows,
+                                                     cols, dx, dbias);
     } else {
-      FusedDropoutActBiasGrad<
-          T, MaskType, 8, 128, 1,
-          Functor><<<grid_dim, block_dim, 0, ctx.stream()>>>(
-          act_functor, dout, mask, src, bias, factor, rows, cols, dx, dbias);
+      FusedDropoutActBiasGrad<T, MaskType, 8, 128, 1, Functor>
+          <<<grid_dim, block_dim, 0, ctx.stream()>>>(act_functor, dout, mask,
+                                                     src, bias, factor, rows,
+                                                     cols, dx, dbias);
     }
   } else {
     const uint64_t n = rows * cols;
     platform::GpuLaunchConfig config =
         platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size);
     if (n % VecSize == 0) {
-      FusedDropoutActGrad<T, MaskType, VecSize, Functor><<<
-          config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-          act_functor, dout, mask, src, factor, n, dx);
+      FusedDropoutActGrad<T, MaskType, VecSize, Functor>
+          <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+              act_functor, dout, mask, src, factor, n, dx);
     } else {
-      FusedDropoutActGrad<T, MaskType, 1, Functor><<<
-          config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-          act_functor, dout, mask, src, factor, n, dx);
+      FusedDropoutActGrad<T, MaskType, 1, Functor>
+          <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+              act_functor, dout, mask, src, factor, n, dx);
     }
   }
 }
diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h
index c352f08ec2ba7..6dc1c446bd7d5 100644
--- a/paddle/fluid/operators/fused/fused_dropout_helper.h
+++ b/paddle/fluid/operators/fused/fused_dropout_helper.h
@@ -30,7 +30,7 @@ namespace operators {
  * The DropoutParam will be used in the fused_dropout_act_bias,
  * fused_residual_dropout_bias(pre_layer_norm=ture) or
  * fused_layernorm_residual_dropout_bias(pre_layer_norm=false).
-*/
+ */
 struct DropoutParam {
   uint64_t seed;
   float dropout_prob;
@@ -232,8 +232,8 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
     using U = LayerNormParamType<T>;
     switch (GetDesiredBlockDim(this->cols_)) {
       FIXED_BLOCK_DIM_CASE(
-          LayerNormForward<
-              T, U, kBlockDim><<<this->rows_, kBlockDim, 0, ctx.stream()>>>(
+          LayerNormForward<T, U, kBlockDim>
+          <<<this->rows_, kBlockDim, 0, ctx.stream()>>>(
               src, gamma, beta, out, mean, variance, epsilon_, this->cols_));
     }
   }
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
index 3e69bf0806756..a43562b297228 100644
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
+
 #include <memory>
 #include <unordered_set>
 
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
index 5404cdeab01e0..3ce54968355a5 100644
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
@@ -412,8 +413,9 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto in_y = ctx.Input<framework::Tensor>("Y");
-    PADDLE_ENFORCE_NE(in_y, nullptr, platform::errors::InvalidArgument(
-                                         "Input(Y) should not be nullptr."));
+    PADDLE_ENFORCE_NE(
+        in_y, nullptr,
+        platform::errors::InvalidArgument("Input(Y) should not be nullptr."));
     auto in_out = ctx.Input<framework::Tensor>("Out");
     PADDLE_ENFORCE_NE(
         in_out, nullptr,
@@ -449,15 +451,17 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
                             " so the number of 'Out' should be two."));
     } else {
       if (!InputXCanBeAbsent(functor_list)) {
-        PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument(
-                                             "Input(X) should not be null."));
+        PADDLE_ENFORCE_NE(
+            in_x, nullptr,
+            platform::errors::InvalidArgument("Input(X) should not be null."));
       }
     }
 
     // Get in_x
     if (ctx.HasInput("X")) {
-      PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument(
-                                           "Input(X) should not be null."));
+      PADDLE_ENFORCE_NE(
+          in_x, nullptr,
+          platform::errors::InvalidArgument("Input(X) should not be null."));
     } else {
       // If functor_list contains elementwise_add, the backward doesn't use
       // in_x, in_y and in_out.
diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc
index 6746b3b8e8489..951189269c748 100644
--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/errors.h"
 
diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
index 13f1c6808aef2..f0cb2edb670ec 100644
--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index 7308f30779248..625bfe36e3864 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h"
+
 #include <string>
+
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
@@ -100,10 +102,11 @@ void FusedEmbeddingFCLSTMOp::InferShape(
       platform::errors::InvalidArgument(
           "The rank of Input(Bias) should be 2, but received value is:%d.",
           b_dims.size()));
-  PADDLE_ENFORCE_EQ(b_dims[0], 1, platform::errors::InvalidArgument(
-                                      "The first dimension of Input(Bias) "
-                                      "should be 1, but received value is:%d.",
-                                      b_dims[0]));
+  PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                    platform::errors::InvalidArgument(
+                        "The first dimension of Input(Bias) "
+                        "should be 1, but received value is:%d.",
+                        b_dims[0]));
   PADDLE_ENFORCE_EQ(
       b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
       platform::errors::InvalidArgument(
@@ -237,21 +240,21 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
 template <typename T>
 class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
  public:
-#define INIT_VEC_FUNC                                                          \
-  std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
-  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
-  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
-  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
-  if (platform::MayIUse(platform::avx)) {                                      \
-    phi::funcs::VecActivations<T, platform::avx> act_functor;                  \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
-  } else {                                                                     \
-    phi::funcs::VecActivations<T, platform::isa_any> act_functor;              \
-    act_gate = act_functor(act_gate_str);                                      \
-    act_cell = act_functor(act_cell_str);                                      \
-    act_cand = act_functor(act_cand_str);                                      \
+#define INIT_VEC_FUNC                                                        \
+  std::function<void(const int, const T*, T*)> act_gate, act_cell, act_cand; \
+  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");             \
+  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");             \
+  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");        \
+  if (platform::MayIUse(platform::avx)) {                                    \
+    phi::funcs::VecActivations<T, platform::avx> act_functor;                \
+    act_gate = act_functor(act_gate_str);                                    \
+    act_cell = act_functor(act_cell_str);                                    \
+    act_cand = act_functor(act_cand_str);                                    \
+  } else {                                                                   \
+    phi::funcs::VecActivations<T, platform::isa_any> act_functor;            \
+    act_gate = act_functor(act_gate_str);                                    \
+    act_cell = act_functor(act_cell_str);                                    \
+    act_cand = act_functor(act_cand_str);                                    \
   }
 
 #define INIT_BASE_INPUT_OUTPUT                        \
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
index ec3a76e316ecd..cb3bf5857750f 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/var_type_inference.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
index 04d3730a77d4d..2c0184fea463e 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -179,22 +179,20 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     if (with_relu) {
       switch (platform::RoundToPowerOfTwo(N)) {
         CUDA_LAUNCH_KERNEL_HELPER(
-            InplaceAddReluAddLayerNormKernel<
-                T, true,
-                kPowerOfTwoDim><<<std::max(max_threads / kPowerOfTwoDim, 1),
-                                  kPowerOfTwoDim, 0, dev_ctx.stream()>>>(
-                y_data, bias_0_data, bias_1_data, scale_data, out_data,
-                mean_data, variance_data, M, N, epsilon));
+            InplaceAddReluAddLayerNormKernel<T, true, kPowerOfTwoDim>
+            <<<std::max(max_threads / kPowerOfTwoDim, 1), kPowerOfTwoDim, 0,
+               dev_ctx.stream()>>>(y_data, bias_0_data, bias_1_data, scale_data,
+                                   out_data, mean_data, variance_data, M, N,
+                                   epsilon));
       }
     } else {
       switch (platform::RoundToPowerOfTwo(N)) {
         CUDA_LAUNCH_KERNEL_HELPER(
-            InplaceAddReluAddLayerNormKernel<
-                T, false,
-                kPowerOfTwoDim><<<std::max(max_threads / kPowerOfTwoDim, 1),
-                                  kPowerOfTwoDim, 0, dev_ctx.stream()>>>(
-                y_data, bias_0_data, bias_1_data, scale_data, out_data,
-                mean_data, variance_data, M, N, epsilon));
+            InplaceAddReluAddLayerNormKernel<T, false, kPowerOfTwoDim>
+            <<<std::max(max_threads / kPowerOfTwoDim, 1), kPowerOfTwoDim, 0,
+               dev_ctx.stream()>>>(y_data, bias_0_data, bias_1_data, scale_data,
+                                   out_data, mean_data, variance_data, M, N,
+                                   epsilon));
       }
     }
   }
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index 8e15232acda90..d3cc1b9127670 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/matmul_v2_op.h"
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 2eb9885286dab..675ec29da67c8 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -14,11 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/matmul_v2_op.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
+#include "paddle/fluid/operators/matmul_v2_op.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
@@ -387,20 +386,19 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
         !pre_layer_norm ? context.Input<framework::Tensor>("Ln2Bias") : nullptr;
 
     auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* d_ln1_scale = pre_layer_norm
-                            ? context.Output<framework::Tensor>(
-                                  framework::GradVarName("Ln1Scale"))
-                            : nullptr;
-    auto* d_ln1_bias = pre_layer_norm
-                           ? context.Output<framework::Tensor>(
-                                 framework::GradVarName("Ln1Bias"))
-                           : nullptr;
-    auto* d_ln2_scale =
-        pre_layer_norm ? nullptr : context.Output<framework::Tensor>(
-                                       framework::GradVarName("Ln2Scale"));
-    auto* d_ln2_bias =
-        pre_layer_norm ? nullptr : context.Output<framework::Tensor>(
-                                       framework::GradVarName("Ln2Bias"));
+    auto* d_ln1_scale = pre_layer_norm ? context.Output<framework::Tensor>(
+                                             framework::GradVarName("Ln1Scale"))
+                                       : nullptr;
+    auto* d_ln1_bias = pre_layer_norm ? context.Output<framework::Tensor>(
+                                            framework::GradVarName("Ln1Bias"))
+                                      : nullptr;
+    auto* d_ln2_scale = pre_layer_norm
+                            ? nullptr
+                            : context.Output<framework::Tensor>(
+                                  framework::GradVarName("Ln2Scale"));
+    auto* d_ln2_bias = pre_layer_norm ? nullptr
+                                      : context.Output<framework::Tensor>(
+                                            framework::GradVarName("Ln2Bias"));
     auto* d_linear1_weight = context.Output<framework::Tensor>(
         framework::GradVarName("Linear1Weight"));
     auto* d_linear1_bias = context.Output<framework::Tensor>(
diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc
index ba9dbd82e3dcc..0bbeabd5fc9cb 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc
+++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
index b1badf72557ae..8f375a22cc023 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
@@ -374,9 +374,9 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
         v_transpose_out, qkv_transpose_out, softmax_out, fmha_out, &config);
 
     // 3. Gating Linear
-    Tensor *fmha_or_gate_out =
-        !has_gating ? fmha_out : ComputeGatingLinearForward<T>(ctx, config,
-                                                               query, fmha_out);
+    Tensor *fmha_or_gate_out = !has_gating ? fmha_out
+                                           : ComputeGatingLinearForward<T>(
+                                                 ctx, config, query, fmha_out);
 
     // 4. Output Linear
     ComputeOutputLinearForward<T>(ctx, config, fmha_or_gate_out);
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
index 4c4e3661e6d6e..978daa3be85e9 100644
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
@@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -208,6 +210,9 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
 
+    auto trans_x = ctx->Attrs().Get<bool>("trans_x");
+    auto trans_y = ctx->Attrs().Get<bool>("trans_y");
+
     PADDLE_ENFORCE_GE(
         dout_dims.size(), 2,
         platform::errors::InvalidArgument(
@@ -242,14 +247,14 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel {
     auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1);
 
     PADDLE_ENFORCE_EQ(
-        dout_mat_dims[1], y_dims[1],
+        dout_mat_dims[1], trans_y ? y_dims[0] : y_dims[1],
         platform::errors::InvalidArgument(
             "The last dimension of DOut should be equal with Y's last"
             "dimension. But received DOut[-1] = [%d], Y[1] = [%d].",
             dout_mat_dims[1], y_dims[1]));
 
     PADDLE_ENFORCE_EQ(
-        dout_mat_dims[0], x_mat_dims[0],
+        dout_mat_dims[0], trans_x ? x_mat_dims[1] : x_mat_dims[0],
         platform::errors::InvalidArgument(
             "The first dimension of DOut should be equal with X's first"
             "dimension. But received DOut[0] = [%d], Y[0] = [%d].",
@@ -288,7 +293,7 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel {
 
     if (ctx->HasOutput("DBias")) {
       std::vector<int64_t> dbias_dims;
-      dbias_dims.push_back(y_dims[1]);
+      dbias_dims.push_back(trans_y ? y_dims[0] : y_dims[1]);
       ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims));
     }
   }
@@ -323,6 +328,20 @@ class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("DBias",
               "The output grad tensor to bias of Out = (Act(X) * Y) + bias.")
         .AsDispensable();
+    AddAttr<bool>(
+        "trans_x",
+        R"DOC((bool, default false), Whether to transpose input tensor X
+    or not. The input tensor X could be more than two dimensions. When
+    set trans_x=true, it would fully reverse X. For instance: X with shape
+    [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "trans_y",
+        R"DOC((bool, default false), Whether to transpose input tensor Y
+    or not. The input tensor Y should be two-dimensional. When
+    set trans_y=true, it would transpose Y. For instance: Y with shape
+    [d0, d1] -> [d1, d0].)DOC")
+        .SetDefault(false);
 
     AddAttr<std::string>(
         "activation_grad",
@@ -343,11 +362,39 @@ X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3]
   }
 };
 
+template <typename T>  // registered with framework::OpDesc, imperative::OpBase
+class FusedGemmEpilogueOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+  // Apply() fills in the "<forward type>_grad" op from the forward op.
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    const auto& act_type = this->template Attr<std::string>("activation");
+    PADDLE_ENFORCE_EQ(  // only activation == "none" is accepted here
+        act_type, "none",
+        phi::errors::InvalidArgument("The activation should be none."));
+    // The grad op reads the forward inputs X, Y and the gradient of Out.
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Y", this->Input("Y"));
+    op->SetInput("DOut", this->OutputGrad("Out"));
+    // It writes the gradients of the forward inputs X, Y and Bias.
+    op->SetOutput("DX", this->InputGrad("X"));
+    op->SetOutput("DY", this->InputGrad("Y"));
+    op->SetOutput("DBias", this->InputGrad("Bias"));
+    // The attribute map of this maker is copied to the grad op unchanged.
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp,
-                  ops::FusedGemmEpilogueOpMaker)
+REGISTER_OPERATOR(
+    fused_gemm_epilogue, ops::FusedGemmEpilogueOp,
+    ops::FusedGemmEpilogueOpMaker,
+    ops::FusedGemmEpilogueOpGradMaker<paddle::framework::OpDesc>,
+    ops::FusedGemmEpilogueOpGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp,
-                  ops::FusedGemmEpilogueGradOpMaker)
+                  ops::FusedGemmEpilogueGradOpMaker);
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
index 9bf3d1a485efc..407cd2b974def 100644
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/framework/scope_guard.h"
 #include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
 #include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/float16.h"
@@ -41,6 +42,8 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
     bool trans_y = ctx.Attr<bool>("trans_y");
 
     std::string activation = ctx.Attr<std::string>("activation");
+    VLOG(10) << "trans_x = " << trans_x << " , trans_y = " << trans_y
+             << " , activation = " << activation;
     bool enable_auxiliary = reserve_space == nullptr ? false : true;
 
     out->mutable_data<T>(ctx.GetPlace());
@@ -48,6 +51,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
 
     auto x_mat_dims =
         phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1);
+    // (M * K) * (K * N)
     int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0];
     int64_t K = trans_y ? y->dims()[1] : y->dims()[0];
     int64_t N = trans_y ? y->dims()[0] : y->dims()[1];
@@ -106,10 +110,11 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
           platform::dynload::cublasLtMatmulDescSetAttribute(
               operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER,
               &aux_data, sizeof(aux_data)));
+      int64_t aux_ld = N;
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
-              operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N,
-              sizeof(N)));
+              operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &aux_ld,
+              sizeof(aux_ld)));
     }
 
     cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL;
@@ -129,8 +134,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
         &out_desc, mat_type, N, M, N));
 
     cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
-    size_t workspace_size = 4 * 1024 * 1024;
-
+    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;  // 4 MiB
     cudaStream_t stream = dev_ctx.stream();
     memory::allocation::AllocationPtr workspace =
         memory::Alloc(dev_ctx, workspace_size);
@@ -149,13 +153,13 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
     const auto* y_data = y->data<T>();
     const auto* x_data = x->data<T>();
 
-    cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(
+    auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(
         lt_handle, operation_desc, y_desc, x_desc, out_desc, alpha, beta,
         y_data, x_data, out_data, stream, workspace->ptr(), workspace_size);
 
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
         lt_handle, operation_desc, alpha, y_data, y_desc, x_data, x_desc, beta,
-        out_data, out_desc, out_data, out_desc, &algo, workspace->ptr(),
+        out_data, out_desc, out_data, out_desc, algo, workspace->ptr(),
         workspace_size, stream));
 
     PADDLE_ENFORCE_GPU_SUCCESS(
@@ -191,12 +195,94 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
   }
 };
 
+enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 };  // X, Y, DOut input
+
+template <bool TransX, bool TransY>  // operands/transposes of dX = A*B, dY = A*B
+struct FusedGEMMGradTrait;
+
+template <>
+struct FusedGEMMGradTrait<false, false> {  // matmul: Z = X * Y, dZ = DOut
+  static constexpr auto kXGradA = FusedGEMMGradInType::kDZ;  // dX = dZ * Y^T
+  static constexpr auto kXGradB = FusedGEMMGradInType::kDY;
+  static constexpr auto kXGradATrans = false;
+  static constexpr auto kXGradBTrans = true;
+
+  static constexpr auto kYGradA = FusedGEMMGradInType::kDX;  // dY = X^T * dZ
+  static constexpr auto kYGradB = FusedGEMMGradInType::kDZ;
+  static constexpr auto kYGradATrans = true;
+  static constexpr auto kYGradBTrans = false;
+};
+
+template <>
+struct FusedGEMMGradTrait<true, false> {  // matmul: Z = X^T * Y, dZ = DOut
+  static constexpr auto kXGradA = FusedGEMMGradInType::kDY;  // dX = Y * dZ^T
+  static constexpr auto kXGradB = FusedGEMMGradInType::kDZ;
+  static constexpr auto kXGradATrans = false;
+  static constexpr auto kXGradBTrans = true;
+
+  static constexpr auto kYGradA = FusedGEMMGradInType::kDX;  // dY = X * dZ
+  static constexpr auto kYGradB = FusedGEMMGradInType::kDZ;
+  static constexpr auto kYGradATrans = false;
+  static constexpr auto kYGradBTrans = false;
+};
+
+template <>
+struct FusedGEMMGradTrait<false, true> {  // matmul: Z = X * Y^T, dZ = DOut
+  static constexpr auto kXGradA = FusedGEMMGradInType::kDZ;  // dX = dZ * Y
+  static constexpr auto kXGradB = FusedGEMMGradInType::kDY;
+  static constexpr auto kXGradATrans = false;
+  static constexpr auto kXGradBTrans = false;
+
+  static constexpr auto kYGradA = FusedGEMMGradInType::kDZ;  // dY = dZ^T * X
+  static constexpr auto kYGradB = FusedGEMMGradInType::kDX;
+  static constexpr auto kYGradATrans = true;
+  static constexpr auto kYGradBTrans = false;
+};
+
+template <>
+struct FusedGEMMGradTrait<true, true> {  // matmul: Z = X^T * Y^T, dZ = DOut
+  static constexpr auto kXGradA = FusedGEMMGradInType::kDY;  // dX = Y^T * dZ^T
+  static constexpr auto kXGradB = FusedGEMMGradInType::kDZ;
+  static constexpr auto kXGradATrans = true;
+  static constexpr auto kXGradBTrans = true;
+
+  static constexpr auto kYGradA = FusedGEMMGradInType::kDZ;  // dY = dZ^T * X^T
+  static constexpr auto kYGradB = FusedGEMMGradInType::kDX;
+  static constexpr auto kYGradATrans = true;
+  static constexpr auto kYGradBTrans = true;
+};
+
+static constexpr auto BoolToCuBlasEnum(bool transpose) {  // flag -> cuBLAS op
+  return transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
+}
+
 template <typename DeviceContext, typename T>
 class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    bool transpose_x = ctx.Attr<bool>("trans_x");
+    bool transpose_y = ctx.Attr<bool>("trans_y");
 
+    if (transpose_x) {  // runtime attrs pick the matching ComputeImpl<,>
+      if (transpose_y) {
+        ComputeImpl<true, true>(ctx);
+      } else {
+        ComputeImpl<true, false>(ctx);
+      }
+    } else {
+      if (transpose_y) {
+        ComputeImpl<false, true>(ctx);
+      } else {
+        ComputeImpl<false, false>(ctx);
+      }
+    }
+  }
+
+ private:
+  template <bool TransX, bool TransY>
+  static void ComputeImpl(const framework::ExecutionContext& ctx) {
+    using Trait = FusedGEMMGradTrait<TransX, TransY>;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     const Tensor* dout = ctx.Input<Tensor>("DOut");
     const Tensor* x = ctx.Input<Tensor>("X");
     const Tensor* y = ctx.Input<Tensor>("Y");
@@ -208,13 +294,18 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
 
     std::string activation_grad = ctx.Attr<std::string>("activation_grad");
 
-    auto dout_mat_dims =
-        phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1);
-    auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1);
+    VLOG(10) << "trans_x = " << TransX << " , trans_y = " << TransY
+             << " , activation_grad = " << activation_grad;
+
+    auto x_mat_dims =
+        phi::flatten_to_2d(x->dims(), TransX ? 1 : x->dims().size() - 1);
+
+    // (M * K) * (K * N)
+    int64_t M = TransX ? x_mat_dims[1] : x_mat_dims[0];
+    int64_t K = TransY ? y->dims()[1] : y->dims()[0];
+    int64_t N = TransY ? y->dims()[0] : y->dims()[1];
 
-    int64_t M = x_mat_dims[0];
-    int64_t K = y->dims()[0];
-    int64_t N = y->dims()[1];
+    VLOG(10) << "M = " << M << " , K = " << K << " , N = " << N;
 
     cudaDataType_t mat_type = CUDA_R_32F;
     cudaDataType_t scale_type = CUDA_R_32F;
@@ -229,7 +320,8 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
     }
 
     cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
-    size_t workspace_size = 4 * 1024 * 1024;
+    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;  // 4 MiB
+    const cublasLtMatmulAlgo_t* algo = nullptr;
     cudaStream_t stream = dev_ctx.stream();
 
     double alpha64 = 1.0, beta64 = 0.0;
@@ -243,24 +335,81 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
       beta = &beta32;
     }
 
-    cublasOperation_t trans_dout = CUBLAS_OP_N;
-    cublasLtMatrixLayout_t dout_desc = NULL;
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
-        &dout_desc, mat_type, N, M, N));
+    cublasLtMatrixLayout_t dout_desc = nullptr, dout_trans_desc = nullptr;
+    cublasLtMatrixLayout_t x_desc = nullptr, x_trans_desc = nullptr;
+    cublasLtMatrixLayout_t y_desc = nullptr, y_trans_desc = nullptr;
+    cublasLtMatrixLayout_t dx_desc = nullptr, dy_desc = nullptr;
+    cublasLtMatmulDesc_t dx_operation_desc = nullptr,
+                         dy_operation_desc = nullptr;
+
+    DEFINE_PADDLE_SCOPE_GUARD([&] {
+      auto descs = {dout_desc, dout_trans_desc, x_desc,  x_trans_desc,
+                    y_desc,    y_trans_desc,    dx_desc, dy_desc};
+      for (auto desc : descs) {
+        if (desc) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              platform::dynload::cublasLtMatrixLayoutDestroy(desc));
+        }
+      }
 
+      if (dx_operation_desc) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescDestroy(dx_operation_desc));
+      }
+
+      if (dy_operation_desc) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescDestroy(dy_operation_desc));
+      }
+    });
+
+    auto x_row = TransX ? K : M;
+    auto x_col = TransX ? M : K;
+    auto y_row = TransY ? N : K;
+    auto y_col = TransY ? K : N;
+    auto z_row = TransX ? N : M;
+    auto z_col = TransX ? M : N;
+
+    // dx = func(dout, y)
     if (dx) {
-      cublasLtMatmulDesc_t dx_operation_desc = NULL;
+      constexpr auto kXGradAIsDZ = (Trait::kXGradA == FusedGEMMGradInType::kDZ);
+      cublasLtMatrixLayout_t *dx_dout_desc, *dx_y_desc;
+
+      if (TransX) {
+        dx_dout_desc = &dout_trans_desc;
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatrixLayoutCreate(
+                dx_dout_desc, mat_type, z_row, z_col, z_row));
+      } else {
+        dx_dout_desc = &dout_desc;
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatrixLayoutCreate(
+                dx_dout_desc, mat_type, z_col, z_row, z_col));
+      }
+
+      dx_y_desc = &y_trans_desc;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          dx_y_desc, mat_type, y_col, y_row, y_col));
+
+      auto& a_desc = kXGradAIsDZ ? (*dx_dout_desc) : (*dx_y_desc);
+      auto& b_desc = kXGradAIsDZ ? (*dx_y_desc) : (*dx_dout_desc);
+      auto a_trans = BoolToCuBlasEnum(Trait::kXGradATrans);
+      auto b_trans = BoolToCuBlasEnum(Trait::kXGradBTrans);
+
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &dx_desc, mat_type, x_col, x_row, x_col));
+
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
           &dx_operation_desc, compute_type, scale_type));
-      cublasOperation_t trans_y = CUBLAS_OP_T;
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
-              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout,
-              sizeof(trans_dout)));
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans,
+              sizeof(a_trans)));
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
-              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y,
-              sizeof(trans_y)));
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans,
+              sizeof(b_trans)));
+
       cublasLtEpilogue_t epiloque_func_for_dx =
           get_epilogue_type_(activation_grad);
       PADDLE_ENFORCE_GPU_SUCCESS(
@@ -274,105 +423,116 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
             platform::dynload::cublasLtMatmulDescSetAttribute(
                 dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER,
                 &aux_data, sizeof(aux_data)));
+        int64_t aux_ld = TransX ? M : K;
         PADDLE_ENFORCE_GPU_SUCCESS(
             platform::dynload::cublasLtMatmulDescSetAttribute(
-                dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &K,
-                sizeof(K)));
+                dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD,
+                &aux_ld, sizeof(aux_ld)));
       }
 
-      cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL;
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
-          &y_desc, mat_type, N, K, N));
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
-          &dx_desc, mat_type, K, M, K));
-
-      memory::allocation::AllocationPtr dx_workspace =
-          memory::Alloc(dev_ctx, workspace_size);
+      auto dx_workspace = memory::Alloc(dev_ctx, workspace_size);
 
-      dx->mutable_data<T>(ctx.GetPlace());
-      auto* dx_data = dx->data<T>();
+      auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
       const auto* y_data = y->data<T>();
       const auto* dout_data = dout->data<T>();
+      const auto* a_data = kXGradAIsDZ ? dout_data : y_data;
+      const auto* b_data = kXGradAIsDZ ? y_data : dout_data;
 
-      cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(
-          lt_handle, dx_operation_desc, y_desc, dout_desc, dx_desc, alpha, beta,
-          y_data, dout_data, dx_data, stream, dx_workspace->ptr(),
-          workspace_size);
+      auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(
+          lt_handle, dx_operation_desc, b_desc, a_desc, dx_desc, alpha, beta,
+          b_data, a_data, dx_data, stream, dx_workspace->ptr(), workspace_size);
 
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
-          lt_handle, dx_operation_desc, alpha, y->data<T>(), y_desc,
-          dout->data<T>(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc,
-          &algo, dx_workspace->ptr(), workspace_size, stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatmulDescDestroy(dx_operation_desc));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatrixLayoutDestroy(y_desc));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatrixLayoutDestroy(dx_desc));
+          lt_handle, dx_operation_desc, alpha, b_data, b_desc, a_data, a_desc,
+          beta, dx_data, dx_desc, dx_data, dx_desc, algo, dx_workspace->ptr(),
+          workspace_size, stream));
     }
 
+    // dy = func(dout, x)
     if (dy) {
-      cublasLtMatmulDesc_t dy_operation_desc = NULL;
+      constexpr auto kYGradAIsDZ = (Trait::kYGradA == FusedGEMMGradInType::kDZ);
+
+      cublasLtMatrixLayout_t *dy_dout_desc = nullptr, *dy_x_desc = nullptr;
+      if (TransX) {
+        dy_dout_desc = &dout_trans_desc;
+        if (dout_trans_desc == nullptr) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              platform::dynload::cublasLtMatrixLayoutCreate(
+                  dy_dout_desc, mat_type, z_row, z_col, z_row));
+        }
+      } else {
+        dy_dout_desc = &dout_desc;
+        if (dout_desc == nullptr) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              platform::dynload::cublasLtMatrixLayoutCreate(
+                  dy_dout_desc, mat_type, z_col, z_row, z_col));
+        }
+      }
+
+      dy_x_desc = &x_trans_desc;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          dy_x_desc, mat_type, x_col, x_row, x_col));
+
+      auto& a_desc = kYGradAIsDZ ? (*dy_dout_desc) : (*dy_x_desc);
+      auto& b_desc = kYGradAIsDZ ? (*dy_x_desc) : (*dy_dout_desc);
+      auto a_trans = BoolToCuBlasEnum(Trait::kYGradATrans);
+      auto b_trans = BoolToCuBlasEnum(Trait::kYGradBTrans);
+
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &dy_desc, mat_type, y_col, y_row, y_col));
+
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
           &dy_operation_desc, compute_type, scale_type));
-      cublasOperation_t trans_x = CUBLAS_OP_T;
+
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
-              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout,
-              sizeof(trans_dout)));
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans,
+              sizeof(a_trans)));
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
-              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x,
-              sizeof(trans_x)));
-      cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr
-                                                    ? CUBLASLT_EPILOGUE_DEFAULT
-                                                    : CUBLASLT_EPILOGUE_BGRADA;
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans,
+              sizeof(b_trans)));
+
+      cublasLtEpilogue_t epiloque_func_for_dy;
+      if (dbias == nullptr) {
+        epiloque_func_for_dy = CUBLASLT_EPILOGUE_DEFAULT;
+      } else {
+        if (TransY) {
+          epiloque_func_for_dy = CUBLASLT_EPILOGUE_BGRADB;
+        } else {
+          epiloque_func_for_dy = CUBLASLT_EPILOGUE_BGRADA;
+        }
+      }
+
       PADDLE_ENFORCE_GPU_SUCCESS(
           platform::dynload::cublasLtMatmulDescSetAttribute(
               dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE,
               &epiloque_func_for_dy, sizeof(epiloque_func_for_dy)));
 
       if (dbias) {
-        dbias->mutable_data<T>(ctx.GetPlace());
-        auto* dbias_data = dbias->data<T>();
+        auto* dbias_data = dbias->mutable_data<T>(ctx.GetPlace());
         PADDLE_ENFORCE_GPU_SUCCESS(
             platform::dynload::cublasLtMatmulDescSetAttribute(
                 dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER,
                 &dbias_data, sizeof(dbias_data)));
       }
 
-      cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL;
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
-          &x_desc, mat_type, K, M, K));
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
-          &dy_desc, mat_type, N, K, N));
-
-      memory::allocation::AllocationPtr dy_workspace =
-          memory::Alloc(dev_ctx, workspace_size);
-
-      dy->mutable_data<T>(ctx.GetPlace());
-      auto* dy_data = dy->data<T>();
+      auto dy_workspace = memory::Alloc(dev_ctx, workspace_size);
+      auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
       const auto* dout_data = dout->data<T>();
       const auto* x_data = x->data<T>();
+      const auto* a_data = kYGradAIsDZ ? dout_data : x_data;
+      const auto* b_data = kYGradAIsDZ ? x_data : dout_data;
 
-      cublasLtMatmulAlgo_t algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(
-          lt_handle, dy_operation_desc, dout_desc, x_desc, dy_desc, alpha, beta,
-          dout_data, x_data, dy_data, stream, dy_workspace->ptr(),
-          workspace_size);
+      auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(
+          lt_handle, dy_operation_desc, b_desc, a_desc, dy_desc, alpha, beta,
+          b_data, a_data, dy_data, stream, dy_workspace->ptr(), workspace_size);
 
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
-          lt_handle, dy_operation_desc, alpha, dout_data, dout_desc, x_data,
-          x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, &algo,
-          dy_workspace->ptr(), workspace_size, stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatmulDescDestroy(dy_operation_desc));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatrixLayoutDestroy(x_desc));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatrixLayoutDestroy(dy_desc));
+          lt_handle, dy_operation_desc, alpha, b_data, b_desc, a_data, a_desc,
+          beta, dy_data, dy_desc, dy_data, dy_desc, algo, dy_workspace->ptr(),
+          workspace_size, stream));
     }
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cublasLtMatrixLayoutDestroy(dout_desc));
   }
 
  private:
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h
index c90a6966fe0a8..b00bdfe5660a9 100644
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h
@@ -16,12 +16,16 @@ limitations under the License. */
 #pragma once
 
 #include <cuda_runtime_api.h>
+
 #include <algorithm>
 #include <mutex>
 #include <unordered_map>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/dynload/cublasLt.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/utils/optional.h"
 
 DECLARE_int64(cublaslt_exhaustive_search_times);
 
@@ -39,12 +43,14 @@ class GemmEpilogueAlgoCache {
   GemmEpilogueAlgoCache(GemmEpilogueAlgoCache const &) = delete;
   void operator=(GemmEpilogueAlgoCache const &) = delete;
 
-  cublasLtMatmulAlgo_t GetGemmAlgo(
+  cublasLtMatmulAlgo_t *GetGemmAlgo(
       cublasLtHandle_t lt_handle, cublasLtMatmulDesc_t op_desc,
       cublasLtMatrixLayout_t a_desc, cublasLtMatrixLayout_t b_desc,
       cublasLtMatrixLayout_t c_desc, const void *alpha, const void *beta,
       const void *a, const void *b, void *c, cudaStream_t stream,
       void *workspace, size_t workspace_size) {
+    if (search_times_ <= 0) return nullptr;
+
     int64_t seed = 0;
     std::hash<int64_t> hash_fn;
 
@@ -54,132 +60,108 @@ class GemmEpilogueAlgoCache {
     HashMatrixLayoutDesc_(c_desc, &seed, hash_fn);
 
     cublasLtMatmulAlgo_t ret;
-    auto it = map_.end();
-    bool have_found = false;
     {
       std::lock_guard<std::mutex> lock(cache_mutex_);
-      it = map_.find(seed);
-
+      auto it = map_.find(seed);
       if (it != map_.end()) {
-        ret = it->second;
-        have_found = true;
+        return &(it->second);
       }
     }
 
-    if (!have_found) {
-      cublasLtMatmulPreference_t preference;
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatmulPreferenceCreate(&preference));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatmulPreferenceSetAttribute(
-              preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-              &workspace_size, sizeof(workspace_size)));
-
-      int returned_results = 0;
-      cublasLtMatmulHeuristicResult_t heuristic_results[requested_algo_count_] =
-          {0};
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatmulAlgoGetHeuristic(
-              lt_handle, op_desc, a_desc, b_desc, c_desc, c_desc, preference,
-              requested_algo_count_, heuristic_results, &returned_results));
-
-      PADDLE_ENFORCE_GT(
-          returned_results, 0,
-          platform::errors::Unavailable("No GEMM epilogue algorithm support!"));
-
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cublasLtMatmulPreferenceDestroy(preference));
-
-      if (search_times_ > 0) {
-        int best_algo_idx = -1;
-        float best_algo_time = 0;
-
-        // Run 100 times for warmup
-        int warmup_algo_idx = 0;
-        for (int t = 0; t < 100; t++) {
-          cublasStatus_t status = platform::dynload::cublasLtMatmul(
-              lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc,
-              c, c_desc, &heuristic_results[warmup_algo_idx].algo, workspace,
-              workspace_size, stream);
-          if (status != CUBLAS_STATUS_SUCCESS) {
-            t = -1;
-            warmup_algo_idx += 1;
-            if (warmup_algo_idx == requested_algo_count_) {
-              PADDLE_THROW(platform::errors::Unavailable(
-                  "No GEMM epilogue algorithm support!"));
-            }
-          }
-        }
+    cublasLtMatmulPreference_t preference;
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulPreferenceCreate(&preference));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulPreferenceSetAttribute(
+            preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+            &workspace_size, sizeof(workspace_size)));
 
-        cudaEvent_t start_event, stop_event;
-        PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event));
-        PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event));
-
-        for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) {
-          float curr_time = 0;
-          for (int check_idx = 0; check_idx < search_times_; check_idx++) {
-            float time = 0;
-            PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream));
-
-            cublasStatus_t status = platform::dynload::cublasLtMatmul(
-                lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c,
-                c_desc, c, c_desc, &heuristic_results[algo_idx].algo, workspace,
-                workspace_size, stream);
-
-            PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream));
-            PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(stop_event));
-            PADDLE_ENFORCE_GPU_SUCCESS(
-                cudaEventElapsedTime(&time, start_event, stop_event));
-            curr_time += time;
-            if (status != CUBLAS_STATUS_SUCCESS) {
-              curr_time = 3.40282e+038;  // Max Value of float
-              break;
-            }
-          }
-
-          curr_time = curr_time / search_times_;
-          if (curr_time < best_algo_time || algo_idx == 0) {
-            best_algo_idx = algo_idx;
-            best_algo_time = curr_time;
-          }
-        }
+    int returned_results = 0;
+    std::vector<cublasLtMatmulHeuristicResult_t> heuristic_results(
+        requested_algo_count_);
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulAlgoGetHeuristic(
+            lt_handle, op_desc, a_desc, b_desc, c_desc, c_desc, preference,
+            requested_algo_count_, heuristic_results.data(),
+            &returned_results));
 
-        PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(start_event));
-        PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(stop_event));
+    PADDLE_ENFORCE_GT(
+        returned_results, 0,
+        platform::errors::Unavailable("No GEMM epilogue algorithm support!"));
 
-        if (best_algo_idx == -1) {
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::cublasLtMatmulPreferenceDestroy(preference));
+
+    int best_algo_idx = -1;
+    float best_algo_time = 0;
+
+    // Run 100 times for warmup
+    int warmup_algo_idx = 0;
+    for (int t = 0; t < 100; t++) {
+      cublasStatus_t status = platform::dynload::cublasLtMatmul(
+          lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, c,
+          c_desc, &heuristic_results[warmup_algo_idx].algo, workspace,
+          workspace_size, stream);
+      if (status != CUBLAS_STATUS_SUCCESS) {
+        t = -1;
+        warmup_algo_idx += 1;
+        if (warmup_algo_idx == requested_algo_count_) {
           PADDLE_THROW(platform::errors::Unavailable(
               "No GEMM epilogue algorithm support!"));
         }
+      }
+    }
 
-        ret = heuristic_results[best_algo_idx].algo;
-      } else {
-        int decided_algo_idx = -1;
-        for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) {
-          cublasStatus_t status = platform::dynload::cublasLtMatmul(
-              lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc,
-              c, c_desc, &heuristic_results[algo_idx].algo, workspace,
-              workspace_size, stream);
-          if (status == CUBLAS_STATUS_SUCCESS) {
-            decided_algo_idx = algo_idx;
-            break;
-          }
-        }
-        if (decided_algo_idx == -1) {
-          PADDLE_THROW(platform::errors::Unavailable(
-              "No GEMM epilogue algorithm support!"));
+    cudaEvent_t start_event, stop_event;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event));
+
+    for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) {
+      float curr_time = 0;
+      for (int check_idx = 0; check_idx < search_times_; check_idx++) {
+        float time = 0;
+        PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream));
+
+        cublasStatus_t status = platform::dynload::cublasLtMatmul(
+            lt_handle, op_desc, alpha, a, a_desc, b, b_desc, beta, c, c_desc, c,
+            c_desc, &heuristic_results[algo_idx].algo, workspace,
+            workspace_size, stream);
+
+        PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream));
+        PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(stop_event));
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            cudaEventElapsedTime(&time, start_event, stop_event));
+        curr_time += time;
+        if (status != CUBLAS_STATUS_SUCCESS) {
+          curr_time = 3.40282e+038;  // Max Value of float
+          break;
         }
-        ret = heuristic_results[decided_algo_idx].algo;
       }
 
-      std::lock_guard<std::mutex> lock(cache_mutex_);
-      map_[seed] = ret;
+      curr_time = curr_time / search_times_;
+      if (curr_time < best_algo_time || algo_idx == 0) {
+        best_algo_idx = algo_idx;
+        best_algo_time = curr_time;
+      }
+    }
+
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(start_event));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(stop_event));
+
+    if (best_algo_idx == -1) {
+      PADDLE_THROW(
+          platform::errors::Unavailable("No GEMM epilogue algorithm support!"));
     }
 
-    VLOG(4) << "Search time:" << search_times_ << ", Is hash-key (" << seed
-            << ") found in GemmEpilogueAlgoCache? " << have_found;
+    ret = heuristic_results[best_algo_idx].algo;
+
+    VLOG(4) << "Search time:" << search_times_ << ", hash-key (" << seed
+            << ") not found in GemmEpilogueAlgoCache";
 
-    return ret;
+    std::lock_guard<std::mutex> lock(cache_mutex_);
+    auto &algo_in_map = map_[seed];
+    algo_in_map = ret;
+    return &algo_in_map;
   }
 
  private:
diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
index 866de8e04a9bc..f72f73438c0a2 100644
--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
@@ -441,11 +441,10 @@ void LaunchLayernormResidualDropoutBias(
     // call layernorm forward
     switch (GetDesiredBlockDim(cols)) {
       FIXED_BLOCK_DIM_CASE(
-          LayerNormForward<
-              T, U, kBlockDim,
-              ScaleBiasWithSameTypeX><<<rows, kBlockDim, 0, ctx.stream()>>>(
-              dst, scale, layernorm_bias, layernorm_dst, mean, var, epsilon,
-              cols));
+          LayerNormForward<T, U, kBlockDim, ScaleBiasWithSameTypeX>
+          <<<rows, kBlockDim, 0, ctx.stream()>>>(dst, scale, layernorm_bias,
+                                                 layernorm_dst, mean, var,
+                                                 epsilon, cols));
       default:
         PADDLE_THROW(platform::errors::InvalidArgument(
             "Product from begin_norm_axis to end must be larger than 1"));
@@ -468,21 +467,25 @@ void LaunchLayernormResidualDropoutBias(
         static_cast<int>(std::ceil(rows / static_cast<float>(ROWS_PER_CTA))); \
     fused_fast_ln_fwd_kernel<                                                 \
         T, U, LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, uint8_t,     \
-        VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG,                             \
-        cols><<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(                    \
-        rows, cols, seed, dropout_prob, is_upscale_in_train, is_test,         \
-        increment, epsilon, src, residual, bias, scale, layernorm_bias,       \
-        mask_data, mean, var, dst, layernorm_dst);                            \
+        VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, cols>                       \
+        <<<grid, THREADS_PER_CTA, 0, ctx.stream()>>>(                         \
+            rows, cols, seed, dropout_prob, is_upscale_in_train, is_test,     \
+            increment, epsilon, src, residual, bias, scale, layernorm_bias,   \
+            mask_data, mean, var, dst, layernorm_dst);                        \
   } break
 
 #define LAUNCH_FUSED_FAST_LN_KERNEL       \
   LAUNCH_FUSED_FAST_LN_KERNEL_BASE(768);  \
   LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1024); \
+  LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1280); \
+  LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1536); \
+  LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1792); \
+  LAUNCH_FUSED_FAST_LN_KERNEL_BASE(2048); \
   LAUNCH_FUSED_FAST_LN_KERNEL_BASE(4096)
 
   bool can_call_fast_ln_kernel = false;
-  if ((cols == 768 || cols == 1024 || cols == 4096) && scale != nullptr &&
-      layernorm_bias != nullptr) {
+  if (((cols >= 768 && cols <= 2048 && cols % 256 == 0) || cols == 4096) &&
+      scale != nullptr && layernorm_bias != nullptr) {
     can_call_fast_ln_kernel = true;
   }
   VLOG(6) << "can_call_fast_ln_kernel = " << can_call_fast_ln_kernel;
@@ -490,12 +493,11 @@ void LaunchLayernormResidualDropoutBias(
   const int VecSize = MAX_CACHE_BYTES / sizeof(T);
   if (cols % VecSize != 0) {
     int blockDim = GetDesiredBlockDim(cols);
-    FusedLayernormResidualDropoutBias<
-        T, uint8_t, 1, U,
-        ScaleBiasWithSameTypeX><<<rows, blockDim, 0, ctx.stream()>>>(
-        rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment,
-        epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst,
-        layernorm_dst, mean, var);
+    FusedLayernormResidualDropoutBias<T, uint8_t, 1, U, ScaleBiasWithSameTypeX>
+        <<<rows, blockDim, 0, ctx.stream()>>>(
+            rows, cols, seed, dropout_prob, is_upscale_in_train, is_test,
+            increment, epsilon, src, residual, bias, scale, layernorm_bias,
+            mask_data, dst, layernorm_dst, mean, var);
   } else {
     if (can_call_fast_ln_kernel) {
       switch (cols) {
@@ -508,12 +510,12 @@ void LaunchLayernormResidualDropoutBias(
       }
     } else {
       int blockDim = GetDesiredBlockDim(cols / VecSize);
-      FusedLayernormResidualDropoutBias<
-          T, uint8_t, VecSize, U,
-          ScaleBiasWithSameTypeX><<<rows, blockDim, 0, ctx.stream()>>>(
-          rows, cols, seed, dropout_prob, is_upscale_in_train, is_test,
-          increment, epsilon, src, residual, bias, scale, layernorm_bias,
-          mask_data, dst, layernorm_dst, mean, var);
+      FusedLayernormResidualDropoutBias<T, uint8_t, VecSize, U,
+                                        ScaleBiasWithSameTypeX>
+          <<<rows, blockDim, 0, ctx.stream()>>>(
+              rows, cols, seed, dropout_prob, is_upscale_in_train, is_test,
+              increment, epsilon, src, residual, bias, scale, layernorm_bias,
+              mask_data, dst, layernorm_dst, mean, var);
     }
   }
 }
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
index 98602e4edd0a2..63627db49d6fa 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -62,12 +63,13 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel {
     // y: qkv's weight: [3, num_head, dim_head, dim_embed]
     auto x_dim = ctx->GetInputDim("X");
     auto y_dim = ctx->GetInputsDim("QKVW")[0];
-    PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument(
-                                           "The dimensions of x must be 3"
-                                           "(batch_size, seq_len, dim_embed),"
-                                           "but received dimensions of"
-                                           "Input is [%d]",
-                                           x_dim.size()));
+    PADDLE_ENFORCE_EQ(
+        x_dim.size(), 3,
+        platform::errors::InvalidArgument("The dimensions of x must be 3"
+                                          "(batch_size, seq_len, dim_embed),"
+                                          "but received dimensions of"
+                                          "Input is [%d]",
+                                          x_dim.size()));
     PADDLE_ENFORCE_EQ(y_dim.size(), 4,
                       platform::errors::InvalidArgument(
                           "The dimensions of qkv_weight must be 4"
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
index fe93d323c59bc..814827d95b6bd 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
@@ -16,6 +16,9 @@ limitations under the License. */
 // https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu
 // We add License in the head.
 
+// Sorting headers with clang-format may cause compile errors or test failures,
+// see https://github.com/PaddlePaddle/Paddle/pull/42840/
+// clang-format off
 #include <cuda_fp16.h>
 #include <float.h>
 #include <cub/cub.cuh>
@@ -35,6 +38,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
+// clang-format on
 
 namespace paddle {
 namespace operators {
@@ -529,10 +533,10 @@ inline __device__ void zero(T &dst) {  // NOLINT
   dst = tmp.raw;
 }
 
-template <typename T, int Dh, int THREADS_PER_KEY, int THREADS_PER_VALUE,
-          int THREADS_PER_BLOCK>
+template <typename T, int Dh, int Dh_MAX, int THREADS_PER_KEY,
+          int THREADS_PER_VALUE, int THREADS_PER_BLOCK>
 __global__ void masked_multihead_attention_kernel(
-    Masked_multihead_attention_params<T> params) {
+    Masked_multihead_attention_params<T> params, int pad_active_groups) {
 #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
 
   static_assert(Dh % THREADS_PER_KEY == 0, "");
@@ -560,11 +564,12 @@ __global__ void masked_multihead_attention_kernel(
   const int tid = threadIdx.x;
 
   float qk_max = -FLT_MAX;
+  float qk = 0;
 
   // qkv [B, S=1, 3, num_head, head_dim]
   int qkv_base_offset = bi * 3 * params.num_head * Dh + hi * Dh;
 
-  using Qk_vec = typename Qk_vec_<T, Dh>::Type;
+  using Qk_vec = typename Qk_vec_<T, Dh_MAX>::Type;
   constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T);
   static_assert(Dh % QK_VEC_SIZE == 0 && Dh / QK_VEC_SIZE <= WARP_SIZE, "");
   constexpr int QK_VECS_PER_WARP = Dh / QK_VEC_SIZE;
@@ -605,18 +610,18 @@ __global__ void masked_multihead_attention_kernel(
                  params.timestep * QK_ELTS_IN_16B + ci;
     *reinterpret_cast<Qk_vec *>(&params.cache_kv[offset]) = k;
 
-    float qk = dot<Qk_vec, Qk_vec>(q, k);
-#pragma unroll
-    for (int mask = QK_VECS_PER_WARP / 2; mask >= 1; mask /= 2) {
-      qk += __shfl_xor_sync(shfl_mask(QK_VECS_PER_WARP), qk, mask);
+    qk = dot<Qk_vec, Qk_vec>(q, k);
+  }
+  if (tid < WARP_SIZE) {
+    for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+      qk += __shfl_xor_sync(uint32_t(-1), qk, mask);
     }
-
-    qk *= params.inv_sqrt_dh;
     if (tid == 0) {
       // NOTE(wangxi): mask must be 0.0
       // T mask = params.attn_mask[
       //    bi * (params.timestep + 1) + params.timestep];
       // qk += static_cast<float>(mask);
+      qk *= params.inv_sqrt_dh;
       qk_max = qk;
       qk_smem[params.timestep] = qk;
     }
@@ -746,16 +751,18 @@ __global__ void masked_multihead_attention_kernel(
   zero(out);
 
   constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE;
-  for (int ti = vo; ti < params.timestep; ti += V_PER_ITER) {
-    V_vec v = *reinterpret_cast<const V_vec *>(&v_cache[ti * Dh]);
+  if (vo < V_PER_ITER) {
+    for (int ti = vo; ti < params.timestep; ti += V_PER_ITER) {
+      V_vec v = *reinterpret_cast<const V_vec *>(&v_cache[ti * Dh]);
 #if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS)
-    float logit = logits_smem[ti];
-    out = fma(logit, cast_to_float(v), out);
+      float logit = logits_smem[ti];
+      out = fma(logit, cast_to_float(v), out);
 #else
-    T logit = logits_smem[ti];
-    // Update the partial sums.
-    out = fma(logit, v, out);
+      T logit = logits_smem[ti];
+      // Update the partial sums.
+      out = fma(logit, v, out);
 #endif
+    }
   }
 
 #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER
@@ -784,8 +791,12 @@ __global__ void masked_multihead_attention_kernel(
 
   __syncthreads();
 
+  if (vo < pad_active_groups / 2) {
+    zero(*reinterpret_cast<V_vec *>(&out_smem[vo * Dh + vi]));
+  }
 #pragma unroll
-  for (int active_groups = V_PER_ITER; active_groups >= 2; active_groups /= 2) {
+  for (int active_groups = pad_active_groups; active_groups >= 2;
+       active_groups /= 2) {
     int midpoint = active_groups / 2;
 
     if (vo >= midpoint && vo < active_groups) {
@@ -830,7 +841,7 @@ __global__ void masked_multihead_attention_kernel(
 template <typename T>
 inline size_t smem_size_in_bytes(
     const Masked_multihead_attention_params<T> &params, int dim_head,
-    int threads_per_value, int threads_per_block) {
+    int threads_per_value, int threads_per_block, int pad_active_groups) {
   size_t qk_sz = div_up(params.timestep + 1, 4) * 16;
   size_t logits_sz = 0;
 
@@ -841,31 +852,33 @@ inline size_t smem_size_in_bytes(
 #endif
   size_t softmax_sz = qk_sz + logits_sz;
 
-  int rows_per_red = threads_per_block / threads_per_value;
+  int rows_per_red = pad_active_groups;
   size_t red_sz = rows_per_red * dim_head * sizeof(T) / 2;
 
   return max(softmax_sz, red_sz);
 }
 
-#define MMHA_LAUNCH_KERNEL(T, Dh, THDS_PER_KEY, THDS_PER_VALUE,          \
-                           THDS_PER_BLOCK, stream)                       \
-  size_t smem_sz =                                                       \
-      smem_size_in_bytes<T>(params, Dh, THDS_PER_VALUE, THDS_PER_BLOCK); \
-  dim3 grid(params.num_head, params.batch_size);                         \
-  masked_multihead_attention_kernel<                                     \
-      T, Dh, THDS_PER_KEY, THDS_PER_VALUE,                               \
-      THDS_PER_BLOCK><<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(params)
-
-template <typename T, int Dh>
+#define MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE,        \
+                           THDS_PER_BLOCK, stream)                             \
+  int pad_active_groups =                                                      \
+      1 << static_cast<int>(ceil(std::log2(THDS_PER_BLOCK / THDS_PER_VALUE))); \
+  size_t smem_sz = smem_size_in_bytes<T>(params, Dh, THDS_PER_VALUE,           \
+                                         THDS_PER_BLOCK, pad_active_groups);   \
+  dim3 grid(params.num_head, params.batch_size);                               \
+  masked_multihead_attention_kernel<T, Dh, Dh_MAX, THDS_PER_KEY,               \
+                                    THDS_PER_VALUE, THDS_PER_BLOCK>            \
+      <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(params, pad_active_groups)
+
+template <typename T, int Dh, int Dh_MAX>
 void fmha_launch_kernel(const Masked_multihead_attention_params<T> &params,
                         const cudaStream_t &stream) {
   constexpr int THREADS_PER_VALUE = Dh * sizeof(T) / 16;
   if (params.timestep < 32) {
-    MMHA_LAUNCH_KERNEL(T, Dh, 4, THREADS_PER_VALUE, 64, stream);
+    MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, stream);
   } else if (params.timestep < 2048) {
-    MMHA_LAUNCH_KERNEL(T, Dh, 2, THREADS_PER_VALUE, 128, stream);
+    MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, stream);
   } else {
-    MMHA_LAUNCH_KERNEL(T, Dh, 1, THREADS_PER_VALUE, 256, stream);
+    MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, stream);
   }
 }
 
@@ -890,18 +903,21 @@ void fmha(const platform::CUDADeviceContext &dev_ctx, const Tensor &qkv_tensor,
 
   switch (dim_head) {
     case 32:
-      fmha_launch_kernel<T, 32>(params, dev_ctx.stream());
+      fmha_launch_kernel<T, 32, 32>(params, dev_ctx.stream());
       break;
     case 64:
-      fmha_launch_kernel<T, 64>(params, dev_ctx.stream());
+      fmha_launch_kernel<T, 64, 64>(params, dev_ctx.stream());
+      break;
+    case 96:
+      fmha_launch_kernel<T, 96, 128>(params, dev_ctx.stream());
       break;
     case 128:
-      fmha_launch_kernel<T, 128>(params, dev_ctx.stream());
+      fmha_launch_kernel<T, 128, 128>(params, dev_ctx.stream());
       break;
     default:
       PADDLE_THROW(platform::errors::Unimplemented(
           "dim_head = %d is unsupport, only support "
-          "dim_head = 32, 64 or 128 for now.",
+          "dim_head = 32, 64, 96 or 128 for now.",
           dim_head));
   }
 }
diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
index 1d3085a013f81..0cc31e6fc3255 100644
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
@@ -153,16 +153,15 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols,
   const int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
   auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size);
   if (cols % VecSize == 0) {
-    FusedResidualDropoutBias<T, uint8_t, VecSize><<<
-        config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-        rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual,
-        bias, mask_data, dst, increment, is_test);
+    FusedResidualDropoutBias<T, uint8_t, VecSize>
+        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+            rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual,
+            bias, mask_data, dst, increment, is_test);
   } else {
-    FusedResidualDropoutBias<
-        T, uint8_t,
-        1><<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-        rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual,
-        bias, mask_data, dst, increment, is_test);
+    FusedResidualDropoutBias<T, uint8_t, 1>
+        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+            rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual,
+            bias, mask_data, dst, increment, is_test);
   }
 }
 
@@ -263,27 +262,26 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask,
     dim3 block_dim(threads, 128, 1);
     dim3 grid_dim(blocks, 1, 1);
     if (cols % VecSize == 0) {
-      FusedResidualDropoutBiasGrad<
-          T, MaskType, 8, 128,
-          VecSize><<<grid_dim, block_dim, 0, ctx.stream()>>>(
-          dout, mask, factor, rows, cols, dx, dbias);
+      FusedResidualDropoutBiasGrad<T, MaskType, 8, 128, VecSize>
+          <<<grid_dim, block_dim, 0, ctx.stream()>>>(dout, mask, factor, rows,
+                                                     cols, dx, dbias);
     } else {
-      FusedResidualDropoutBiasGrad<T, MaskType, 8, 128,
-                                   1><<<grid_dim, block_dim, 0, ctx.stream()>>>(
-          dout, mask, factor, rows, cols, dx, dbias);
+      FusedResidualDropoutBiasGrad<T, MaskType, 8, 128, 1>
+          <<<grid_dim, block_dim, 0, ctx.stream()>>>(dout, mask, factor, rows,
+                                                     cols, dx, dbias);
     }
   } else {
     const uint64_t n = rows * cols;
     platform::GpuLaunchConfig config =
         platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size);
     if (n % VecSize == 0) {
-      FusedResidualDropoutGrad<T, MaskType, VecSize><<<
-          config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-          dout, mask, factor, n, dx);
+      FusedResidualDropoutGrad<T, MaskType, VecSize>
+          <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+              dout, mask, factor, n, dx);
     } else {
-      FusedResidualDropoutGrad<T, MaskType, 1><<<
-          config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
-          dout, mask, factor, n, dx);
+      FusedResidualDropoutGrad<T, MaskType, 1>
+          <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+              dout, mask, factor, n, dx);
     }
   }
 }
diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc
index 23b82ac5d966f..e316f58b3f759 100644
--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h"
+
 #include <string>
 namespace paddle {
 namespace operators {
@@ -34,9 +35,10 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         cvm_dims.size(), 2UL,
         platform::errors::InvalidArgument("Input(CVM)'s rank should be 2."));
-    PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL, platform::errors::InvalidArgument(
-                                            "The 2nd dimension of "
-                                            "Input(CVM) should be 2."));
+    PADDLE_ENFORCE_EQ(
+        cvm_dims[1], 2UL,
+        platform::errors::InvalidArgument("The 2nd dimension of "
+                                          "Input(CVM) should be 2."));
 
     auto ins_dims = ctx->GetInputsDim("X");
     const int cvm_offset = ctx->Attrs().Get<int>("cvm_offset");
diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
index 3770a536a8fcf..2b6b7d4934539 100644
--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <string>
+
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h
index 6042772adb054..e3bc424f25910 100644
--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h
index 11f1011dec3a2..4c00f778ced3f 100644
--- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h
+++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h
@@ -114,10 +114,9 @@ __global__ void FusedSoftmaxMaskVecKernel(T* dst, const T* src, const T* mask,
   }
 }
 
-#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS)                    \
-  FusedSoftmaxMaskVecKernel<T, VEC_SIZE,                           \
-                            ELEMENTS><<<grid, block, 0, stream>>>( \
-      dst, src, mask, seq_len)
+#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS)    \
+  FusedSoftmaxMaskVecKernel<T, VEC_SIZE, ELEMENTS> \
+      <<<grid, block, 0, stream>>>(dst, src, mask, seq_len)
 
 // FIXME(wangxi): It is found that the performance of VEC_SIZE=2 is better
 //  than that of =4 and =8. Further analysis of the kernel is needed later.
diff --git a/paddle/fluid/operators/fused/fused_transformer_op.cc b/paddle/fluid/operators/fused/fused_transformer_op.cc
index 9e5fc42fc76dd..d11171eb2d086 100644
--- a/paddle/fluid/operators/fused/fused_transformer_op.cc
+++ b/paddle/fluid/operators/fused/fused_transformer_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fused_transformer_op.h"
+
 #include <string>
 
 namespace paddle {
@@ -157,5 +158,5 @@ void FusedMHA<T>::ComputeForward(T* output, T* softmax_mask) {}
 template <typename T>
 void FusedMHA<T>::ComputeBackward(const T* grad_output, T* softmax_mask,
                                   T* grad_x) {}
-}
-}
\ No newline at end of file
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fused_transformer_op.h b/paddle/fluid/operators/fused/fused_transformer_op.h
index 2d2d390d243e5..a2d5862abf06a 100644
--- a/paddle/fluid/operators/fused/fused_transformer_op.h
+++ b/paddle/fluid/operators/fused/fused_transformer_op.h
@@ -151,5 +151,5 @@ class FusedTransformerEncoderLayer {
 
   std::string act_method;
 };
-}
-}
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
index eeeb004003c9c..802cd18e1db24 100644
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
+++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -35,8 +36,9 @@ class ConvInceptionFusionOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         in_dims.size(), 4,
         platform::errors::InvalidArgument("Conv intput should be 4-D tensor."));
-    PADDLE_ENFORCE_EQ(w_dims.size(), 4, platform::errors::InvalidArgument(
-                                            "There should be 4 filters."));
+    PADDLE_ENFORCE_EQ(
+        w_dims.size(), 4,
+        platform::errors::InvalidArgument("There should be 4 filters."));
     PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1],
                       platform::errors::InvalidArgument(
                           "Invalid fileter channel number %d, which should be "
diff --git a/paddle/fluid/operators/fused/fusion_group_op.cu.cc b/paddle/fluid/operators/fused/fusion_group_op.cu.cc
index 94949f5633116..c592bbe7d3e9a 100644
--- a/paddle/fluid/operators/fused/fusion_group_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_group_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_group_op.h"
+
 #include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/fused/fusion_group_op.h b/paddle/fluid/operators/fused/fusion_group_op.h
index 5e5f2c60ffbd4..f71355b85d96a 100644
--- a/paddle/fluid/operators/fused/fusion_group_op.h
+++ b/paddle/fluid/operators/fused/fusion_group_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_code.h"
 
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index afbd5380a8301..fd05155bc2cef 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
+
 #include <cstring>  // for memcpy
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
index 3dada660aeffe..f2e6f099b4b58 100644
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_lstm_op.h"
+
 #include <string>
+
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
index bed5125b99583..c9d6d42efac24 100644
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -13,8 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/jit/kernels.h"
 
 namespace paddle {
@@ -24,10 +26,11 @@ void FusionRepeatedFCReluOp::InferShape(
     framework::InferShapeContext* ctx) const {
   OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusionRepeatedFCRelu");
   auto sz = ctx->Inputs("W").size();
-  PADDLE_ENFORCE_GT(sz, 1UL, platform::errors::InvalidArgument(
-                                 "Inputs(W) of FusionRepeatedFCReluOp should "
-                                 "be greater than 1, but received value is %d.",
-                                 sz));
+  PADDLE_ENFORCE_GT(sz, 1UL,
+                    platform::errors::InvalidArgument(
+                        "Inputs(W) of FusionRepeatedFCReluOp should "
+                        "be greater than 1, but received value is %d.",
+                        sz));
   PADDLE_ENFORCE_EQ(
       ctx->Inputs("Bias").size(), sz,
       platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
index ee28a54805653..b99b53de9c4d6 100644
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h"
+
 #include <algorithm>  // for min, max
 #include <string>
+
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
 
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index 58613173ad212..7341d1f864d93 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h"
+
 #include <string>
+
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
@@ -48,8 +50,9 @@ void FusionSeqExpandConcatFCOp::InferShape(
   for (size_t i = 1; i < ins_dims.size(); ++i) {
     sum += ins_dims[i][1];
   }
-  PADDLE_ENFORCE_EQ(sum, w_dims[0], platform::errors::InvalidArgument(
-                                        "FC height should be sum of all inputs "
+  PADDLE_ENFORCE_EQ(
+      sum, w_dims[0],
+      platform::errors::InvalidArgument("FC height should be sum of all inputs "
                                         "width, but received FC height is: %d, "
                                         "sum of all inputs width is: %d.",
                                         w_dims[0], sum));
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
index e574d67e3982c..1d487ef3dabc1 100644
--- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
@@ -13,8 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/jit/kernels.h"
 
 namespace paddle {
@@ -29,17 +31,19 @@ void FusionSeqPoolConcatOp::InferShape(
                         ctx->Inputs("X").size()));
   OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FusionSeqPoolConcat");
   int axis = ctx->Attrs().Get<int>("axis");
-  PADDLE_ENFORCE_EQ(axis, 1, platform::errors::InvalidArgument(
-                                 "FusionSeqPoolConcatOp only supports concat "
-                                 "axis=1 yet, but received axis value is %d",
-                                 axis));
+  PADDLE_ENFORCE_EQ(axis, 1,
+                    platform::errors::InvalidArgument(
+                        "FusionSeqPoolConcatOp only supports concat "
+                        "axis=1 yet, but received axis value is %d",
+                        axis));
 
   auto ins_dims = ctx->GetInputsDim("X");
   const size_t n = ins_dims.size();
-  PADDLE_ENFORCE_GT(n, 0UL, platform::errors::InvalidArgument(
-                                "Input tensors count should be greater than 0, "
-                                "but received value is %d.",
-                                n));
+  PADDLE_ENFORCE_GT(n, 0UL,
+                    platform::errors::InvalidArgument(
+                        "Input tensors count should be greater than 0, "
+                        "but received value is %d.",
+                        n));
   if (n == 1) {
     LOG(WARNING) << "Only have one input, may waste memory";
   }
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
index c74cc504840d3..d29bc00b5459e 100644
--- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
@@ -13,8 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/jit/kernels.h"
 
 namespace paddle {
@@ -31,20 +33,23 @@ void FusionSeqPoolCVMConcatOp::InferShape(
       paddle::platform::errors::InvalidArgument(
           "Output(Out) of FusionSeqPoolCVMConcatOp should not be null."));
   int axis = ctx->Attrs().Get<int>("axis");
-  PADDLE_ENFORCE_EQ(axis, 1, paddle::platform::errors::InvalidArgument(
-                                 "FusionSeqPoolCVMConcatOp only supports "
-                                 "concat axis=1 yet, but received %d.",
-                                 axis));
+  PADDLE_ENFORCE_EQ(axis, 1,
+                    paddle::platform::errors::InvalidArgument(
+                        "FusionSeqPoolCVMConcatOp only supports "
+                        "concat axis=1 yet, but received %d.",
+                        axis));
   bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
-  PADDLE_ENFORCE_EQ(use_cvm, true, paddle::platform::errors::InvalidArgument(
-                                       "FusionSeqPoolCVMConcatOp only supports "
-                                       "use_cvm is true yet, but received %d.",
-                                       use_cvm));
+  PADDLE_ENFORCE_EQ(use_cvm, true,
+                    paddle::platform::errors::InvalidArgument(
+                        "FusionSeqPoolCVMConcatOp only supports "
+                        "use_cvm is true yet, but received %d.",
+                        use_cvm));
 
   auto ins_dims = ctx->GetInputsDim("X");
   const size_t n = ins_dims.size();
-  PADDLE_ENFORCE_GT(n, 0UL, paddle::platform::errors::InvalidArgument(
-                                "Input tensors count should > 0."));
+  PADDLE_ENFORCE_GT(n, 0UL,
+                    paddle::platform::errors::InvalidArgument(
+                        "Input tensors count should > 0."));
   if (n == 1) {
     LOG(WARNING) << "Only have one input, may waste memory";
   }
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
index 870f72b8c7f0d..047fefc1eeb07 100644
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -13,8 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/jit/kernels.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
index 954cd7cc7a40b..bf8e9818e545f 100644
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
index 786f5b4e07798..eb29859d8d15b 100644
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
index 66e6c00da2db8..52140c0ca46ee 100644
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/phi/core/ddim.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
index 0ffc4c91b851c..c9956dcdd2010 100644
--- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <initializer_list>
 #include <iostream>
 #include <memory>
+
 #include "dnnl.hpp"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/operator.h"
@@ -31,8 +32,8 @@ using paddle::platform::CPUDeviceContext;
 using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
-using platform::to_void_cast;
 using phi::vectorize;
+using platform::to_void_cast;
 using Direction = dnnl::rnn_direction;
 
 namespace {
diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc
index e7d697767fcac..ad0cc0bd1cf86 100644
--- a/paddle/fluid/operators/fused/multi_gru_op.cc
+++ b/paddle/fluid/operators/fused/multi_gru_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cstring>  // for memcpy
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
diff --git a/paddle/fluid/operators/fused/multi_gru_op.h b/paddle/fluid/operators/fused/multi_gru_op.h
index ebd3faf44a84b..8b064c8754f5e 100644
--- a/paddle/fluid/operators/fused/multi_gru_op.h
+++ b/paddle/fluid/operators/fused/multi_gru_op.h
@@ -19,9 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using framework::ExecutionContext;
 using framework::LoDTensor;
 using framework::Tensor;
-using framework::ExecutionContext;
 
 class MultiGRUOp : public framework::OperatorWithKernel {
  public:
diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cc b/paddle/fluid/operators/fused/multihead_matmul_op.cc
index 8f2c04d5afe12..79b886c37297c 100644
--- a/paddle/fluid/operators/fused/multihead_matmul_op.cc
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/errors.h"
 
diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu
index f0e05659c9294..301553467165a 100644
--- a/paddle/fluid/operators/fused/multihead_matmul_op.cu
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
@@ -105,8 +107,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size,
                       platform::errors::InvalidArgument(
                           "head_num (%d) * head_size (%d) should <= %d",
                           head_num, head_size, 1024 * 4));
-    TransposeQkvKernel<float4><<<grid, block, 0, stream>>>(h, input4, bias4,
-                                                           output4);
+    TransposeQkvKernel<float4>
+        <<<grid, block, 0, stream>>>(h, input4, bias4, output4);
   } else if (head_size % 2 == 0 && scratch_size % 2 == 0) {
     const int h = head_size / 2;
     const float2 *input2 = reinterpret_cast<const float2 *>(input);
@@ -118,8 +120,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size,
                       platform::errors::InvalidArgument(
                           "head_num (%d) * head_size (%d) should <= %d",
                           head_num, head_size, 1024 * 2));
-    TransposeQkvKernel<float2><<<grid, block, 0, stream>>>(h, input2, bias2,
-                                                           output2);
+    TransposeQkvKernel<float2>
+        <<<grid, block, 0, stream>>>(h, input2, bias2, output2);
   } else {
     const dim3 block(head_size, head_num, 1);
     // limit head_size * head_num to max block size(1024).
@@ -127,8 +129,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size,
                       platform::errors::InvalidArgument(
                           "head_num (%d) * head_size (%d) should <= %d",
                           head_num, head_size, 1024));
-    TransposeQkvKernel<float><<<grid, block, 0, stream>>>(head_size, input,
-                                                          bias, output);
+    TransposeQkvKernel<float>
+        <<<grid, block, 0, stream>>>(head_size, input, bias, output);
   }
 }
 
diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc
index 6f4246aadd903..d5860fe9cf12b 100644
--- a/paddle/fluid/operators/fused/resnet_unit_op.cc
+++ b/paddle/fluid/operators/fused/resnet_unit_op.cc
@@ -115,13 +115,14 @@ class ResNetUnitOp : public framework::OperatorWithKernel {
       bn_param_shape = {1, 1, 1, bn_param_shape[0]};
     }
     framework::DDim bn_param_dims = phi::make_ddim(bn_param_shape);
-    PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument(
-                                            "The dimensions of input "
-                                            "must equal to 4."
-                                            "But received: the shape of input "
-                                            "= [%s], the dimension of input = "
-                                            "[%d]",
-                                            x_dims, x_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 4,
+        platform::errors::InvalidArgument("The dimensions of input "
+                                          "must equal to 4."
+                                          "But received: the shape of input "
+                                          "= [%s], the dimension of input = "
+                                          "[%d]",
+                                          x_dims, x_dims.size()));
     PADDLE_ENFORCE_EQ(w_dims.size(), 4,
                       platform::errors::InvalidArgument(
                           "The dimensions of filter "
@@ -180,14 +181,16 @@ class ResNetUnitOp : public framework::OperatorWithKernel {
     // and var tensors should be float when input tensor's dtype is float16.
     auto bn_param_type = framework::proto::VarType::FP32;
 
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                         ctx.Input<Tensor>("ScaleX")->dtype()),
-                      platform::errors::InvalidArgument(
-                          "Scale input should be of float type"));
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                         ctx.Input<Tensor>("BiasX")->dtype()),
-                      platform::errors::InvalidArgument(
-                          "Bias input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        bn_param_type,
+        framework::TransToProtoVarType(ctx.Input<Tensor>("ScaleX")->dtype()),
+        platform::errors::InvalidArgument(
+            "Scale input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        bn_param_type,
+        framework::TransToProtoVarType(ctx.Input<Tensor>("BiasX")->dtype()),
+        platform::errors::InvalidArgument(
+            "Bias input should be of float type"));
     framework::LibraryType library = framework::LibraryType::kPlain;
     framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cc b/paddle/fluid/operators/fused/skip_layernorm_op.cc
index 442f359c0dac5..6ac6f51e4ce47 100644
--- a/paddle/fluid/operators/fused/skip_layernorm_op.cc
+++ b/paddle/fluid/operators/fused/skip_layernorm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/errors.h"
 
diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu
index e755ea33755ca..66a164ff31bea 100644
--- a/paddle/fluid/operators/fused/skip_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
diff --git a/paddle/fluid/operators/fused/unity_build_rule.cmake b/paddle/fluid/operators/fused/unity_build_rule.cmake
index c428b7456bb20..8605cd3cdae85 100644
--- a/paddle/fluid/operators/fused/unity_build_rule.cmake
+++ b/paddle/fluid/operators/fused/unity_build_rule.cmake
@@ -4,16 +4,17 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    fused_elemwise_activation_op.cc
-    fused_embedding_fc_lstm_op.cc
-    fused_embedding_seq_pool_op.cc
-    fusion_lstm_op.cc
-    fusion_repeated_fc_relu_op.cc
-    fusion_seqconv_eltadd_relu_op.cc
-    fusion_seqexpand_concat_fc_op.cc
-    fusion_seqpool_concat_op.cc
-    fusion_squared_mat_sub_op.cc
-    multi_gru_op.cc
-    mkldnn/multi_gru_mkldnn_op.cc
-    fusion_seqpool_cvm_concat_op.cc)
+register_unity_group(
+  cc
+  fused_elemwise_activation_op.cc
+  fused_embedding_fc_lstm_op.cc
+  fused_embedding_seq_pool_op.cc
+  fusion_lstm_op.cc
+  fusion_repeated_fc_relu_op.cc
+  fusion_seqconv_eltadd_relu_op.cc
+  fusion_seqexpand_concat_fc_op.cc
+  fusion_seqpool_concat_op.cc
+  fusion_squared_mat_sub_op.cc
+  multi_gru_op.cc
+  mkldnn/multi_gru_mkldnn_op.cc
+  fusion_seqpool_cvm_concat_op.cc)
diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cc b/paddle/fluid/operators/fused_softmax_mask_op.cc
index a41380028338a..a33070d94b919 100644
--- a/paddle/fluid/operators/fused_softmax_mask_op.cc
+++ b/paddle/fluid/operators/fused_softmax_mask_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/fused_softmax_mask_op.h"
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu
index c4ab4de8a64cb..b68a6907d7a65 100644
--- a/paddle/fluid/operators/fused_softmax_mask_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_op.cu
@@ -40,6 +40,7 @@ limitations under the License. */
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
+
 #include <algorithm>
 #include <string>
 
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
index c737ba361e0f2..eefca7b6ab564 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
@@ -11,6 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h"
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index d4c5b8877056f..4ee90eb318496 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -39,6 +39,7 @@ limitations under the License. */
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
+
 #include <algorithm>
 #include <string>
 
@@ -395,49 +396,49 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel<T> {
 
     switch (pow2_index) {
       case 5:  // 32
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 5><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 5>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 6:  // 64
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 6><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 6>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 7:  // 128
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 7><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 7>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 8:  // 256
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 8><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 8>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 9:  // 512
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 9><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 9>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 10:  // 1024
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 10><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                   key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 10>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 11:  // 2048
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 11><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                   key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 11>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 12:  // 4096
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 12><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                   key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 12>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       case 13:  // 8192
-        SoftmaxMaskFuseUpperTriangleGPUKernel<
-            T, 13><<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
-                                                   key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGPUKernel<T, 13>
+            <<<blocks, threads, 0, stream>>>(x_data, y_data, batch_count,
+                                             key_seq_len);
         break;
       default:
         break;
@@ -483,58 +484,58 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
 
     switch (pow2_index) {
       case 5:  // 32
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 5><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                  softmax_rst_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 5>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 6:  // 64
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 6><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                  softmax_rst_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 6>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 7:  // 128
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 7><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                  softmax_rst_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 7>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 8:  // 256
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 8><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                  softmax_rst_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 8>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 9:  // 512
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 9><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                  softmax_rst_data, batch_count,
-                                                  key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 9>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 10:  // 1024
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 10><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                   softmax_rst_data,
-                                                   batch_count, key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 10>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 11:  // 2048
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 11><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                   softmax_rst_data,
-                                                   batch_count, key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 11>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 12:  // 4096
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 12><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                   softmax_rst_data,
-                                                   batch_count, key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 12>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       case 13:  // 8192
-        SoftmaxMaskFuseUpperTriangleGradGPUKernel<
-            T, 13><<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
-                                                   softmax_rst_data,
-                                                   batch_count, key_seq_len);
+        SoftmaxMaskFuseUpperTriangleGradGPUKernel<T, 13>
+            <<<blocks, threads, 0, stream>>>(grad_y_data, grad_x_data,
+                                             softmax_rst_data, batch_count,
+                                             key_seq_len);
         break;
       default:
         break;
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 9f2b48a24b447..d44dd324d6ccb 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -153,7 +153,7 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
                   ops::GatherGradNoNeedBufferVarInferer,
                   GatherGradInferShapeFunctor);
 
-REGISTER_OP_VERSION(gather)
-    .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
-                   paddle::framework::compatible::OpVersionDesc().NewInput(
-                       "Axis", "Specify the axis of gather operation."));
+REGISTER_OP_VERSION(gather).AddCheckpoint(
+    R"ROC(upgrad gather, add a new input [Axis])ROC",
+    paddle::framework::compatible::OpVersionDesc().NewInput(
+        "Axis", "Specify the axis of gather operation."));
diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc
index 6c691aa14ae77..327eec2a6ca74 100644
--- a/paddle/fluid/operators/gather_op_xpu.cc
+++ b/paddle/fluid/operators/gather_op_xpu.cc
@@ -38,9 +38,20 @@ class GatherOpXPUKernel : public framework::OpKernel<T> {
     auto *x = ctx.Input<Tensor>("X");
     auto *index = ctx.Input<Tensor>("Index");
     auto *output = ctx.Output<Tensor>("Out");
+
+    int axis = ctx.Attr<int>("axis");
     if (ctx.HasInput("Axis")) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Now, it doesn't support XPU with Axis."));
+      Tensor cpu_axis;
+      const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
+      framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis);
+      const auto &axis_type = axis_tensor->dtype();
+      if (framework::TransToProtoVarType(axis_type) ==
+          framework::proto::VarType::INT32) {
+        axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
+      } else if (framework::TransToProtoVarType(axis_type) ==
+                 framework::proto::VarType::INT64) {
+        axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
+      }
     }
 
     output->mutable_data<T>(ctx.GetPlace());
@@ -72,13 +83,13 @@ class GatherOpXPUKernel : public framework::OpKernel<T> {
       r = xpu::gather<XPUType, int>(
           dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
           index->data<int>(), reinterpret_cast<XPUType *>(output->data<T>()),
-          xshape, index->dims()[0], 0);
+          xshape, index->dims()[0], axis);
     } else {
       r = xpu::gather<XPUType, int64_t>(
           dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
           index->data<int64_t>(),
           reinterpret_cast<XPUType *>(output->data<T>()), xshape,
-          index->dims()[0], 0);
+          index->dims()[0], axis);
     }
     PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
                       platform::errors::External(
@@ -102,9 +113,19 @@ class GatherGradOpXPUKernel : public framework::OpKernel<T> {
     auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
 
+    int axis = ctx.Attr<int>("axis");
     if (ctx.HasInput("Axis")) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Now, it doesn't support XPU with Axis."));
+      Tensor cpu_axis;
+      const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
+      framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis);
+      const auto &axis_type = axis_tensor->dtype();
+      if (framework::TransToProtoVarType(axis_type) ==
+          framework::proto::VarType::INT32) {
+        axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
+      } else if (framework::TransToProtoVarType(axis_type) ==
+                 framework::proto::VarType::INT64) {
+        axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
+      }
     }
     if (dout->numel() == 0) {
       return;
@@ -139,7 +160,7 @@ class GatherGradOpXPUKernel : public framework::OpKernel<T> {
           dev_ctx.x_context(),
           reinterpret_cast<const XPUType *>(dout->data<T>()),
           index->data<int>(), reinterpret_cast<XPUType *>(dx->data<T>()),
-          xshape, index->dims()[0], 0, overwrite);
+          xshape, index->dims()[0], axis, overwrite);
     } else {
       xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
       int *index_int_ptr_l3 =
@@ -147,16 +168,17 @@ class GatherGradOpXPUKernel : public framework::OpKernel<T> {
       r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
                                          index->data<int64_t>(),
                                          index_int_ptr_l3, index->numel());
-      PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                            "XPU API(cast_v2) return wrong "
-                                            "value[%d %s]",
-                                            r, XPUAPIErrorMsg[r]));
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External("XPU API(cast_v2) return wrong "
+                                     "value[%d %s]",
+                                     r, XPUAPIErrorMsg[r]));
 
       r = xpu::gather_grad<XPUType, int>(
           dev_ctx.x_context(),
           reinterpret_cast<const XPUType *>(dout->data<T>()), index_int_ptr_l3,
           reinterpret_cast<XPUType *>(dx->data<T>()), xshape, index->dims()[0],
-          0, overwrite);
+          axis, overwrite);
     }
     PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
                       platform::errors::External(
diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu
index f97eb3d5e9d9a..6c4a7a01f3fbb 100644
--- a/paddle/fluid/operators/gather_scatter_kernel.cu
+++ b/paddle/fluid/operators/gather_scatter_kernel.cu
@@ -132,10 +132,11 @@ struct gpu_gather_scatter_functor {
     int64_t grid = (n + block - 1) / block;
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    GatherScatterGPUKernel<tensor_t, index_t, func_t,
-                           is_scatter_like><<<grid, block, 0, stream>>>(
-        self_data, dim, index_data, src_data, inner_dim_size, select_dim_size,
-        replaced_select_dim_size, outer_dim_size, index_size, reduce_op);
+    GatherScatterGPUKernel<tensor_t, index_t, func_t, is_scatter_like>
+        <<<grid, block, 0, stream>>>(self_data, dim, index_data, src_data,
+                                     inner_dim_size, select_dim_size,
+                                     replaced_select_dim_size, outer_dim_size,
+                                     index_size, reduce_op);
   }
 };  // struct gpu_gather_scatter_functor
 
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index c962dd065234f..676143bf01145 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/phi/kernels/funcs/gather.h"
+
 #include <gtest/gtest.h>
 
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/kernels/funcs/gather.h"
 
 TEST(Gather, GatherData) {
   paddle::framework::Tensor* src = new paddle::framework::Tensor();
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index deac932d59b80..1e89091b202de 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <thrust/random.h>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/gaussian_random_op_xpu.cc b/paddle/fluid/operators/gaussian_random_op_xpu.cc
index 5a1ac46f615d2..2ffc90fbd8c20 100644
--- a/paddle/fluid/operators/gaussian_random_op_xpu.cc
+++ b/paddle/fluid/operators/gaussian_random_op_xpu.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include <random>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc
index 3be2606bfc939..080ceaa45e343 100644
--- a/paddle/fluid/operators/gelu_op.cc
+++ b/paddle/fluid/operators/gelu_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc
index 559d2448ad945..408638f7d2cfc 100644
--- a/paddle/fluid/operators/gelu_op_xpu.cc
+++ b/paddle/fluid/operators/gelu_op_xpu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cc b/paddle/fluid/operators/graph_khop_sampler_op.cc
index c83ee25840605..edf7d20c6d5c8 100644
--- a/paddle/fluid/operators/graph_khop_sampler_op.cc
+++ b/paddle/fluid/operators/graph_khop_sampler_op.cc
@@ -19,10 +19,11 @@ namespace operators {
 
 void InputShapeCheck(const framework::DDim& dims, std::string tensor_name) {
   if (dims.size() == 2) {
-    PADDLE_ENFORCE_EQ(dims[1], 1, platform::errors::InvalidArgument(
-                                      "The last dim of %s should be 1 when it "
-                                      "is 2D, but we get %d",
-                                      tensor_name, dims[1]));
+    PADDLE_ENFORCE_EQ(dims[1], 1,
+                      platform::errors::InvalidArgument(
+                          "The last dim of %s should be 1 when it "
+                          "is 2D, but we get %d",
+                          tensor_name, dims[1]));
   } else {
     PADDLE_ENFORCE_EQ(
         dims.size(), 1,
diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu
index df977b43512a0..a63fdc89e24b2 100644
--- a/paddle/fluid/operators/graph_khop_sampler_op.cu
+++ b/paddle/fluid/operators/graph_khop_sampler_op.cu
@@ -26,6 +26,7 @@ limitations under the License. */
 #include <thrust/sort.h>
 #include <thrust/transform.h>
 #include <thrust/unique.h>
+
 #include <ostream>
 
 #ifdef PADDLE_WITH_HIP
@@ -217,15 +218,16 @@ void SampleNeighbors(const framework::ExecutionContext& ctx, const T* src,
   constexpr int TILE_SIZE = BLOCK_WARPS * 16;
   const dim3 block(WARP_SIZE, BLOCK_WARPS);
   const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE);
-  GraphSampleNeighborsCUDAKernel<T, BLOCK_WARPS, TILE_SIZE><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx.device_context())
-          .stream()>>>(
-      0, k, bs, thrust::raw_pointer_cast(inputs->data()), src, dst_count,
-      src_eids, thrust::raw_pointer_cast(outputs->data()),
-      thrust::raw_pointer_cast(outputs_eids->data()),
-      thrust::raw_pointer_cast(output_ptr.data()),
-      thrust::raw_pointer_cast(output_idxs.data()), return_eids);
+  GraphSampleNeighborsCUDAKernel<T, BLOCK_WARPS, TILE_SIZE>
+      <<<grid, block, 0,
+         reinterpret_cast<const platform::CUDADeviceContext&>(
+             ctx.device_context())
+             .stream()>>>(
+          0, k, bs, thrust::raw_pointer_cast(inputs->data()), src, dst_count,
+          src_eids, thrust::raw_pointer_cast(outputs->data()),
+          thrust::raw_pointer_cast(outputs_eids->data()),
+          thrust::raw_pointer_cast(output_ptr.data()),
+          thrust::raw_pointer_cast(output_idxs.data()), return_eids);
 
   // 5. Get inputs = outputs - inputs:
   if (!is_last_layer) {
@@ -264,19 +266,19 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
   int grid_tmp = (num_input + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
   // 1. Insert data into keys and values.
-  BuildHashTable<
-      T><<<grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                               ctx.device_context())
-                               .stream()>>>(
+  BuildHashTable<T><<<grid, block, 0,
+                      reinterpret_cast<const platform::CUDADeviceContext&>(
+                          ctx.device_context())
+                          .stream()>>>(
       input, num_input, len_hashtable, thrust::raw_pointer_cast(keys->data()),
       thrust::raw_pointer_cast(key_index->data()));
 
   // 2. Get item index count.
   thrust::device_vector<int> item_count(num_input + 1, 0);
-  GetItemIndexCount<
-      T><<<grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                               ctx.device_context())
-                               .stream()>>>(
+  GetItemIndexCount<T><<<grid, block, 0,
+                         reinterpret_cast<const platform::CUDADeviceContext&>(
+                             ctx.device_context())
+                             .stream()>>>(
       input, thrust::raw_pointer_cast(item_count.data()), num_input,
       len_hashtable, thrust::raw_pointer_cast(keys->data()),
       thrust::raw_pointer_cast(key_index->data()));
@@ -287,16 +289,16 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input,
   unique_items->resize(total_unique_items);
 
   // 3. Get unique items.
-  FillUniqueItems<
-      T><<<grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                               ctx.device_context())
-                               .stream()>>>(
-      input, num_input, len_hashtable,
-      thrust::raw_pointer_cast(unique_items->data()),
-      thrust::raw_pointer_cast(item_count.data()),
-      thrust::raw_pointer_cast(keys->data()),
-      thrust::raw_pointer_cast(values->data()),
-      thrust::raw_pointer_cast(key_index->data()));
+  FillUniqueItems<T>
+      <<<grid, block, 0,
+         reinterpret_cast<const platform::CUDADeviceContext&>(
+             ctx.device_context())
+             .stream()>>>(input, num_input, len_hashtable,
+                          thrust::raw_pointer_cast(unique_items->data()),
+                          thrust::raw_pointer_cast(item_count.data()),
+                          thrust::raw_pointer_cast(keys->data()),
+                          thrust::raw_pointer_cast(values->data()),
+                          thrust::raw_pointer_cast(key_index->data()));
 }
 
 template <typename T>
@@ -337,23 +339,23 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
   int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (outputs->size() + block - 1) / block;
   int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
-  ReindexSrcOutput<
-      T><<<grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                               ctx.device_context())
-                               .stream()>>>(
+  ReindexSrcOutput<T><<<grid, block, 0,
+                        reinterpret_cast<const platform::CUDADeviceContext&>(
+                            ctx.device_context())
+                            .stream()>>>(
       thrust::raw_pointer_cast(outputs->data()), outputs->size(), size,
       thrust::raw_pointer_cast(keys.data()),
       thrust::raw_pointer_cast(values.data()));
 
   int grid_ = (bs + block - 1) / block;
-  ReindexInputNodes<T><<<grid_, block, 0,
-                         reinterpret_cast<const platform::CUDADeviceContext&>(
-                             ctx.device_context())
-                             .stream()>>>(
-      thrust::raw_pointer_cast(orig_nodes->data()), bs,
-      thrust::raw_pointer_cast(reindex_nodes->data()), size,
-      thrust::raw_pointer_cast(keys.data()),
-      thrust::raw_pointer_cast(values.data()));
+  ReindexInputNodes<T>
+      <<<grid_, block, 0,
+         reinterpret_cast<const platform::CUDADeviceContext&>(
+             ctx.device_context())
+             .stream()>>>(thrust::raw_pointer_cast(orig_nodes->data()), bs,
+                          thrust::raw_pointer_cast(reindex_nodes->data()), size,
+                          thrust::raw_pointer_cast(keys.data()),
+                          thrust::raw_pointer_cast(values.data()));
 }
 
 template <typename DeviceContext, typename T>
@@ -532,15 +534,16 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel<T> {
     const dim3 block(WARP_SIZE, BLOCK_WARPS);
     const dim3 grid((unique_dst_size + TILE_SIZE - 1) / TILE_SIZE);
 
-    GetDstEdgeCUDAKernel<T, BLOCK_WARPS, TILE_SIZE><<<
-        grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                            ctx.device_context())
-                            .stream()>>>(
-        unique_dst_size,
-        thrust::raw_pointer_cast(unique_dst_merge_reindex.data()),
-        thrust::raw_pointer_cast(dst_sample_counts_merge.data()),
-        thrust::raw_pointer_cast(dst_ptr.data()),
-        thrust::raw_pointer_cast(dst_merge.data()));
+    GetDstEdgeCUDAKernel<T, BLOCK_WARPS, TILE_SIZE>
+        <<<grid, block, 0,
+           reinterpret_cast<const platform::CUDADeviceContext&>(
+               ctx.device_context())
+               .stream()>>>(
+            unique_dst_size,
+            thrust::raw_pointer_cast(unique_dst_merge_reindex.data()),
+            thrust::raw_pointer_cast(dst_sample_counts_merge.data()),
+            thrust::raw_pointer_cast(dst_ptr.data()),
+            thrust::raw_pointer_cast(dst_merge.data()));
 
     // 8. Give operator's outputs.
     auto* out_src = ctx.Output<Tensor>("Out_Src");
diff --git a/paddle/fluid/operators/graph_khop_sampler_op.h b/paddle/fluid/operators/graph_khop_sampler_op.h
index d7121cb549370..1005a6ab11cc0 100644
--- a/paddle/fluid/operators/graph_khop_sampler_op.h
+++ b/paddle/fluid/operators/graph_khop_sampler_op.h
@@ -15,10 +15,12 @@ limitations under the License. */
 #pragma once
 
 #include <stdlib.h>
+
 #include <numeric>
 #include <random>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
index 4331523d26edc..4d989ed1f2ec0 100644
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/group_norm_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -234,7 +235,6 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-DECLARE_INPLACE_OP_INFERER(GroupNormInplaceInferer, {"X", "Y"});
 DECLARE_INPLACE_OP_INFERER(GroupNormGradInplaceInferer,
                            {framework::GradVarName("Y"),
                             framework::GradVarName("X")});
@@ -256,8 +256,7 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
                   ops::GroupNormOpInferVarType,
                   ops::GroupNormGradMaker<paddle::framework::OpDesc>,
-                  ops::GroupNormGradMaker<paddle::imperative::OpBase>,
-                  ops::GroupNormInplaceInferer);
+                  ops::GroupNormGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp,
                   ops::GroupNormGradInplaceInferer);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index bb8031b0cc4e6..84eb2fbc7d31f 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -322,9 +322,9 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
         ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
             x_data, mean_data, temp_var_data, size);
       } else {
-        VectorizedGetMeanAndVarNCHW<
-            T, AccT, vec_size><<<grids, blocks, 0, dev_ctx.stream()>>>(
-            x_data, mean_data, temp_var_data, size);
+        VectorizedGetMeanAndVarNCHW<T, AccT, vec_size>
+            <<<grids, blocks, 0, dev_ctx.stream()>>>(x_data, mean_data,
+                                                     temp_var_data, size);
       }
     } else {
       set_zero(dev_ctx, mean, static_cast<T>(0));
@@ -613,16 +613,16 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
       }
       block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
       dim3 blocks(block_size_nchw);
-      ScalarGetDsDbCUDAKernel<
-          T><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
-          imsize, x_data, dy_data, ds_data, db_data);
+      ScalarGetDsDbCUDAKernel<T>
+          <<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
+              imsize, x_data, dy_data, ds_data, db_data);
 
       if (d_scale || d_bias) {
         const int block = 256;
-        GetScaleBiasGradientCUDAKernel<
-            T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>(
-            x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data,
-            db_data, d_scale_data, d_bias_data);
+        GetScaleBiasGradientCUDAKernel<T>
+            <<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>(
+                x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data,
+                db_data, d_scale_data, d_bias_data);
       }
 
       if (d_x_data != nullptr) {
@@ -639,10 +639,10 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
         T* p2_data = p2.data<T>();
         T* p3_data = p3.data<T>();
 
-        GetBackwardParamsCUDAKernel<T, block_dims><<<
-            dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>(
-            imsize, groups, group_size, epsilon, mean_data, var_data,
-            scale_data, ds_data, db_data, p1_data, p2_data, p3_data);
+        GetBackwardParamsCUDAKernel<T, block_dims>
+            <<<dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>(
+                imsize, groups, group_size, epsilon, mean_data, var_data,
+                scale_data, ds_data, db_data, p1_data, p2_data, p3_data);
         GetXGradientCUDAKernel<T><<<grid, threads, 0, dev_ctx.stream()>>>(
             imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data,
             dy_data, d_x_data);
diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h
index 2d80ab89471fc..28a3ad2a8e1ee 100644
--- a/paddle/fluid/operators/group_norm_op.h
+++ b/paddle/fluid/operators/group_norm_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <array>
 #include <numeric>
 #include <string>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc
index 8de8647186ed3..dfc509941bc2d 100644
--- a/paddle/fluid/operators/group_norm_op_npu.cc
+++ b/paddle/fluid/operators/group_norm_op_npu.cc
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/group_norm_op.h"
 #include <vector>
+
+#include "paddle/fluid/operators/group_norm_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 58cbdfda34799..21ad5914c5d4d 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
+
 #include <memory>
 #include <string>
+
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h"
 #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 852655034c8c2..4cc6c65983fe9 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
index 8998c51f0df62..b6d9ef50f83e8 100644
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/gru_unit_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 291f5f4ad2673..2dd1515919b3b 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -77,9 +77,9 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     // calculate unactivated gate outputs
     if (bias) {
       auto b = framework::EigenMatrix<T>::From(*bias);
-      g.device(place) = x +
-                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
-                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
+      g.device(place) =
+          x + b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
+                  .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
     } else {
       g.device(place) = x;
     }
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
index cce80518354d7..f72fe9282abb6 100644
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/hinge_loss_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc
index ccddec2779515..2fafd18621528 100644
--- a/paddle/fluid/operators/huber_loss_op_xpu.cc
+++ b/paddle/fluid/operators/huber_loss_op_xpu.cc
@@ -39,10 +39,11 @@ class HuberLossXPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::XPUDeviceContext>();
     int r = xpu::huber_loss<T>(dev_ctx.x_context(), in0_data, in1_data,
                                residual_data, out_data, in0->numel(), 1, delta);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                          "XPU API(huber_loss) return wrong "
-                                          "value[%d %s]",
-                                          r, XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(huber_loss) return wrong "
+                                   "value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
 
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index d248857b8f42f..107384742bbdd 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/im2sequence_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index b0c4b9b4a99a5..218161fd00aaa 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h
index bb26e2f445e70..d8417e42e1bf7 100644
--- a/paddle/fluid/operators/index_impl.cu.h
+++ b/paddle/fluid/operators/index_impl.cu.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/random.h>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -73,16 +74,16 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) {
   size_t main_offset = (numel / (vec_size * block)) * vec_size * block;
   switch (vec_size) {
     case 4:
-      VectorizedIndexKernel<T, Functor, 4><<<grid, block, 0, stream>>>(
-          out_data, numel, main_offset, func);
+      VectorizedIndexKernel<T, Functor, 4>
+          <<<grid, block, 0, stream>>>(out_data, numel, main_offset, func);
       break;
     case 2:
-      VectorizedIndexKernel<T, Functor, 2><<<grid, block, 0, stream>>>(
-          out_data, numel, main_offset, func);
+      VectorizedIndexKernel<T, Functor, 2>
+          <<<grid, block, 0, stream>>>(out_data, numel, main_offset, func);
       break;
     case 1:
-      VectorizedIndexKernel<T, Functor, 1><<<grid, block, 0, stream>>>(
-          out_data, numel, main_offset, func);
+      VectorizedIndexKernel<T, Functor, 1>
+          <<<grid, block, 0, stream>>>(out_data, numel, main_offset, func);
       break;
     default: {
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc
index d17c6368c7537..15fc0f6d14fe4 100644
--- a/paddle/fluid/operators/index_sample_op.cc
+++ b/paddle/fluid/operators/index_sample_op.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/platform/enforce.h"
 
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/binary.h"
 namespace paddle {
diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h
index 684829be2697c..c82aaab0fe1c2 100644
--- a/paddle/fluid/operators/index_select_op.h
+++ b/paddle/fluid/operators/index_select_op.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc
index d420d0319bfe4..6cb8d664d8022 100644
--- a/paddle/fluid/operators/inplace_abn_op.cc
+++ b/paddle/fluid/operators/inplace_abn_op.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/inplace_abn_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/phi/kernels/batch_norm_grad_kernel.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
@@ -38,18 +40,21 @@ class InplaceABNOp : public paddle::operators::BatchNormOp {
     if (input_data_type == framework::proto::VarType::FP64) {
       bn_param_type = framework::proto::VarType::FP64;
     }
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                         ctx.Input<Tensor>("Scale")->dtype()),
-                      platform::errors::InvalidArgument(
-                          "Scale input should be of float type"));
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                         ctx.Input<Tensor>("Bias")->dtype()),
-                      platform::errors::InvalidArgument(
-                          "Bias input should be of float type"));
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType(
-                                         ctx.Input<Tensor>("Mean")->dtype()),
-                      platform::errors::InvalidArgument(
-                          "Mean input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        bn_param_type,
+        framework::TransToProtoVarType(ctx.Input<Tensor>("Scale")->dtype()),
+        platform::errors::InvalidArgument(
+            "Scale input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        bn_param_type,
+        framework::TransToProtoVarType(ctx.Input<Tensor>("Bias")->dtype()),
+        platform::errors::InvalidArgument(
+            "Bias input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        bn_param_type,
+        framework::TransToProtoVarType(ctx.Input<Tensor>("Mean")->dtype()),
+        platform::errors::InvalidArgument(
+            "Mean input should be of float type"));
     PADDLE_ENFORCE_EQ(
         bn_param_type,
         framework::TransToProtoVarType(ctx.Input<Tensor>("Variance")->dtype()),
@@ -209,8 +214,9 @@ class InplaceABNKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* x = ctx.Input<Tensor>("X");
     auto* y = ctx.Output<Tensor>("Y");
-    PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
-                                "X and Y not inplaced in inplace mode"));
+    PADDLE_ENFORCE_EQ(x, y,
+                      platform::errors::InvalidArgument(
+                          "X and Y not inplaced in inplace mode"));
     auto activation =
         GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu
index 6476023fcd20e..7245629e565e9 100644
--- a/paddle/fluid/operators/inplace_abn_op.cu
+++ b/paddle/fluid/operators/inplace_abn_op.cu
@@ -28,8 +28,9 @@ class InplaceABNKernel
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* y = ctx.Output<Tensor>("Y");
     auto* x = ctx.Input<Tensor>("X");
-    PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
-                                "X and Y not inplaced in inplace mode"));
+    PADDLE_ENFORCE_EQ(x, y,
+                      platform::errors::InvalidArgument(
+                          "X and Y not inplaced in inplace mode"));
     auto activation =
         GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h
index 942404978584d..275209911d18e 100644
--- a/paddle/fluid/operators/inplace_abn_op.h
+++ b/paddle/fluid/operators/inplace_abn_op.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc
index de92de453a354..21ccf777051c2 100644
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ b/paddle/fluid/operators/instance_norm_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/instance_norm_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
@@ -38,16 +40,18 @@ framework::OpKernelType InstanceNormOp::GetExpectedKernelType(
     in_param_type = framework::proto::VarType::FP64;
   }
   if (ctx.HasInput("Scale")) {
-    PADDLE_ENFORCE_EQ(in_param_type, framework::TransToProtoVarType(
-                                         ctx.Input<Tensor>("Scale")->dtype()),
-                      platform::errors::InvalidArgument(
-                          "Scale input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        in_param_type,
+        framework::TransToProtoVarType(ctx.Input<Tensor>("Scale")->dtype()),
+        platform::errors::InvalidArgument(
+            "Scale input should be of float type"));
   }
   if (ctx.HasInput("Bias")) {
-    PADDLE_ENFORCE_EQ(in_param_type, framework::TransToProtoVarType(
-                                         ctx.Input<Tensor>("Bias")->dtype()),
-                      platform::errors::InvalidArgument(
-                          "Bias input should be of float type"));
+    PADDLE_ENFORCE_EQ(
+        in_param_type,
+        framework::TransToProtoVarType(ctx.Input<Tensor>("Bias")->dtype()),
+        platform::errors::InvalidArgument(
+            "Bias input should be of float type"));
   }
 
   return framework::OpKernelType(input_data_type, ctx.GetPlace());
diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h
index 265e4acef0d7a..3f99cdf10c64b 100644
--- a/paddle/fluid/operators/instance_norm_op.h
+++ b/paddle/fluid/operators/instance_norm_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index fda168c94e1e0..3c746d7c08a1a 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -10,9 +10,11 @@
    limitations under the License. */
 
 #include "paddle/fluid/operators/interpolate_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -112,11 +114,12 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
 
   PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method ||
                         "bicubic" == interp_method,
-                    true, platform::errors::InvalidArgument(
-                              "Interpolation method can only be \"bilinear\" "
-                              "or \"nearest\" or \"bicubic\" when "
-                              "Input(X) dimension is 4, but got method is %s.",
-                              interp_method));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "Interpolation method can only be \"bilinear\" "
+                        "or \"nearest\" or \"bicubic\" when "
+                        "Input(X) dimension is 4, but got method is %s.",
+                        interp_method));
   const DataLayout data_layout = framework::StringToDataLayout(
       ctx->Attrs().Get<std::string>("data_layout"));
 
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
index 8a63c9a394638..729eba43d7264 100644
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -11,6 +11,7 @@
 
 #include <algorithm>
 #include <string>
+
 #include "paddle/fluid/operators/interpolate_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
@@ -860,9 +861,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx,
       out_w = size_data[0];
     }
   }
-  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                  "out_w in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0,
+                    platform::errors::InvalidArgument(
+                        "out_w in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
   framework::DDim dim_out;
   if (data_layout == DataLayout::kNCHW) {
     dim_out = {n, c, out_w};
@@ -942,12 +944,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
       out_w = size_data[1];
     }
   }
-  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
-                                  "out_h in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
-  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                  "out_w in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0,
+                    platform::errors::InvalidArgument(
+                        "out_h in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0,
+                    platform::errors::InvalidArgument(
+                        "out_w in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
 
   framework::DDim dim_out;
   if (data_layout == DataLayout::kNCHW) {
@@ -984,21 +988,21 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
       platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
 
   if ("nearest" == interp_method) {
-    KeNearestNeighborInterpFw<
-        T><<<config.block_per_grid, config.thread_per_block, 0,
-             ctx.cuda_device_context().stream()>>>(
-        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-        out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+    KeNearestNeighborInterpFw<T>
+        <<<config.block_per_grid, config.thread_per_block, 0,
+           ctx.cuda_device_context().stream()>>>(
+            input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+            out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
   } else if ("bilinear" == interp_method) {
     KeBilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
                             ctx.cuda_device_context().stream()>>>(
         input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
         out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
   } else if ("bicubic" == interp_method) {
-    KeBicubicInterpFw<T><<<config.block_per_grid, 512, 0,
-                           ctx.cuda_device_context().stream()>>>(
-        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-        out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+    KeBicubicInterpFw<T>
+        <<<config.block_per_grid, 512, 0, ctx.cuda_device_context().stream()>>>(
+            input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+            out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
   }
 }
 
@@ -1051,15 +1055,18 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
       out_w = size_data[2];
     }
   }
-  PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument(
-                                  "out_d in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
-  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
-                                  "out_h in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
-  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                  "out_w in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_d, 0,
+                    platform::errors::InvalidArgument(
+                        "out_d in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0,
+                    platform::errors::InvalidArgument(
+                        "out_h in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0,
+                    platform::errors::InvalidArgument(
+                        "out_w in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
 
   framework::DDim dim_out;
   if (data_layout == DataLayout::kNCHW) {
@@ -1271,11 +1278,11 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
       platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);
 
   if ("nearest" == interp_method) {
-    KeNearestNeighborInterpBw<
-        T><<<config.block_per_grid, config.thread_per_block, 0,
-             ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
-        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+    KeNearestNeighborInterpBw<T>
+        <<<config.block_per_grid, config.thread_per_block, 0,
+           ctx.cuda_device_context().stream()>>>(
+            input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+            out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
   } else if ("bilinear" == interp_method) {
     KeBilinearInterpBw<T><<<config.block_per_grid, config.thread_per_block, 0,
                             ctx.cuda_device_context().stream()>>>(
@@ -1283,10 +1290,10 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
         n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
         data_layout);
   } else if ("bicubic" == interp_method) {
-    KeBicubicInterpBw<T><<<config.block_per_grid, 512, 0,
-                           ctx.cuda_device_context().stream()>>>(
-        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
-        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
+    KeBicubicInterpBw<T>
+        <<<config.block_per_grid, 512, 0, ctx.cuda_device_context().stream()>>>(
+            input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
+            out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
   }
 }
 
diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h
index 57b5eb553cc4c..18caed22b4855 100644
--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
@@ -13,6 +13,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -808,9 +809,10 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx,
       out_w = out_size_data[0];
     }
   }
-  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                  "out_w in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0,
+                    platform::errors::InvalidArgument(
+                        "out_w in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
   framework::DDim dim_out;
   if (data_layout == DataLayout::kNCHW) {
     dim_out = {n, c, out_w};
@@ -876,12 +878,14 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx,
       out_w = out_size_data[1];
     }
   }
-  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
-                                  "out_h in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
-  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                  "out_w in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0,
+                    platform::errors::InvalidArgument(
+                        "out_h in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0,
+                    platform::errors::InvalidArgument(
+                        "out_w in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
   framework::DDim dim_out;
   if (data_layout == DataLayout::kNCHW) {
     dim_out = {n, c, out_h, out_w};
@@ -964,15 +968,18 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx,
       out_w = out_size_data[2];
     }
   }
-  PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument(
-                                  "out_d in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
-  PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
-                                  "out_h in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
-  PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                  "out_w in Attr(out_shape) of Op(interpolate) "
-                                  "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_d, 0,
+                    platform::errors::InvalidArgument(
+                        "out_d in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_h, 0,
+                    platform::errors::InvalidArgument(
+                        "out_h in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
+  PADDLE_ENFORCE_GT(out_w, 0,
+                    platform::errors::InvalidArgument(
+                        "out_w in Attr(out_shape) of Op(interpolate) "
+                        "should be greater than 0."));
 
   framework::DDim dim_out;
   if (data_layout == DataLayout::kNCHW) {
diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc
old mode 100755
new mode 100644
index f83f149b87c31..0cbac393af504
--- a/paddle/fluid/operators/interpolate_op_npu.cc
+++ b/paddle/fluid/operators/interpolate_op_npu.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/interpolate_op.h"
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/operators/interpolate_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/interpolate_op_xpu.cc b/paddle/fluid/operators/interpolate_op_xpu.cc
index 9576dc8452463..09780505ac2ce 100644
--- a/paddle/fluid/operators/interpolate_op_xpu.cc
+++ b/paddle/fluid/operators/interpolate_op_xpu.cc
@@ -111,14 +111,16 @@ class InterpolateXPUKernel : public framework::OpKernel<T> {
         out_w = out_size_data[1];
       }
     }
-    PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
-                                    "out_h in Attr(out_shape) of "
-                                    "Op(interpolate) "
-                                    "should be greater than 0."));
-    PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                    "out_w in Attr(out_shape) of "
-                                    "Op(interpolate) "
-                                    "should be greater than 0."));
+    PADDLE_ENFORCE_GT(
+        out_h, 0,
+        platform::errors::InvalidArgument("out_h in Attr(out_shape) of "
+                                          "Op(interpolate) "
+                                          "should be greater than 0."));
+    PADDLE_ENFORCE_GT(
+        out_w, 0,
+        platform::errors::InvalidArgument("out_w in Attr(out_shape) of "
+                                          "Op(interpolate) "
+                                          "should be greater than 0."));
     framework::DDim dim_out;
     if (data_layout == DataLayout::kNCHW) {
       dim_out = {n, c, out_h, out_w};
diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc
index d0d7b7694fc3a..6bac35ee1d455 100644
--- a/paddle/fluid/operators/interpolate_v2_op.cc
+++ b/paddle/fluid/operators/interpolate_v2_op.cc
@@ -40,10 +40,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) {
   const DataLayout data_layout = framework::StringToDataLayout(
       ctx->Attrs().Get<std::string>("data_layout"));
   for (int i = 0; i < dim_x.size(); ++i) {
-    PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument(
-                                       "The shape of input(x) should be larged "
-                                       "than 0, bug received shape[%d] is %d ",
-                                       i, dim_x[i]));
+    PADDLE_ENFORCE_NE(dim_x[i], 0,
+                      platform::errors::InvalidArgument(
+                          "The shape of input(x) should be larged "
+                          "than 0, bug received shape[%d] is %d ",
+                          i, dim_x[i]));
   }
   if (ctx->HasInputs("SizeTensor")) {
     // top prority size
@@ -144,10 +145,11 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
       ctx->Attrs().Get<std::string>("data_layout"));
 
   for (int i = 0; i < dim_x.size(); ++i) {
-    PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument(
-                                       "The shape of input(x) should be larged "
-                                       "than 0, bug received shape[%d] is %d ",
-                                       i, dim_x[i]));
+    PADDLE_ENFORCE_NE(dim_x[i], 0,
+                      platform::errors::InvalidArgument(
+                          "The shape of input(x) should be larged "
+                          "than 0, bug received shape[%d] is %d ",
+                          i, dim_x[i]));
   }
 
   if (ctx->HasInputs("SizeTensor")) {
@@ -263,10 +265,11 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
       ctx->Attrs().Get<std::string>("data_layout"));
 
   for (int i = 0; i < dim_x.size(); ++i) {
-    PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument(
-                                       "The shape of input(x) should be larged "
-                                       "than 0, bug received shape[%d] is %d ",
-                                       i, dim_x[i]));
+    PADDLE_ENFORCE_NE(dim_x[i], 0,
+                      platform::errors::InvalidArgument(
+                          "The shape of input(x) should be larged "
+                          "than 0, bug received shape[%d] is %d ",
+                          i, dim_x[i]));
   }
 
   if (ctx->HasInputs("SizeTensor")) {
diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc
index 615b5ea142b58..97f39aa490264 100644
--- a/paddle/fluid/operators/interpolate_v2_op_npu.cc
+++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
-
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/kernels/funcs/interpolate_function.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc
index 9cbfc95158348..9d52c9a865ea7 100644
--- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc
+++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc
@@ -114,14 +114,16 @@ class InterpolateV2XPUKernel : public framework::OpKernel<T> {
         out_w = out_size_data[1];
       }
     }
-    PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument(
-                                    "out_h in Attr(out_shape) of "
-                                    "Op(interpolate) "
-                                    "should be greater than 0."));
-    PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument(
-                                    "out_w in Attr(out_shape) of "
-                                    "Op(interpolate) "
-                                    "should be greater than 0."));
+    PADDLE_ENFORCE_GT(
+        out_h, 0,
+        platform::errors::InvalidArgument("out_h in Attr(out_shape) of "
+                                          "Op(interpolate) "
+                                          "should be greater than 0."));
+    PADDLE_ENFORCE_GT(
+        out_w, 0,
+        platform::errors::InvalidArgument("out_w in Attr(out_shape) of "
+                                          "Op(interpolate) "
+                                          "should be greater than 0."));
     framework::DDim dim_out;
     if (data_layout == DataLayout::kNCHW) {
       dim_out = {n, c, out_h, out_w};
diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc
index f5b817a0e11fa..c4f3fbb2ca772 100644
--- a/paddle/fluid/operators/inverse_op.cc
+++ b/paddle/fluid/operators/inverse_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/inverse_op.h"
+
 #include <string>
 #include <unordered_map>
 
diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
index 2e770f9852569..456c1c2d44f3e 100644
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
@@ -120,15 +120,16 @@ namespace ops = paddle::operators;
       paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, \
       paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
 
-#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor)                      \
-  REGISTER_OP_CPU_KERNEL(                                                   \
-      op_type, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int, \
-                                   ops::functor>,                           \
-      ops::OverflowKernel<paddle::platform::CPUDeviceContext, int64_t,      \
-                          ops::functor>,                                    \
-      ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,        \
-                          ops::functor>,                                    \
-      ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,       \
+#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor)                 \
+  REGISTER_OP_CPU_KERNEL(                                              \
+      op_type,                                                         \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, int,     \
+                          ops::functor>,                               \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, int64_t, \
+                          ops::functor>,                               \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,   \
+                          ops::functor>,                               \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,  \
                           ops::functor>);
 
 REGISTER_OP_MAKER(isinf, "isinf(X)");
diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu
index e233e37136490..d1437d5b44d6f 100644
--- a/paddle/fluid/operators/isfinite_op.cu
+++ b/paddle/fluid/operators/isfinite_op.cu
@@ -18,8 +18,9 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
-    isinf, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,
-                               ops::InfinityFunctor>,
+    isinf,
+    ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,
+                        ops::InfinityFunctor>,
     ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,
                         ops::InfinityFunctor>,
     ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,
@@ -38,8 +39,9 @@ REGISTER_OP_CUDA_KERNEL(isnan,
                                             plat::float16, ops::NANFunctor>);
 
 REGISTER_OP_CUDA_KERNEL(
-    isfinite, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,
-                                  ops::IsfiniteFunctor>,
+    isfinite,
+    ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,
+                        ops::IsfiniteFunctor>,
     ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,
                         ops::IsfiniteFunctor>,
     ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,
diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt
index 080e7f7d5e859..a6f10e5fbdab7 100644
--- a/paddle/fluid/operators/jit/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/CMakeLists.txt
@@ -1,16 +1,25 @@
-
 set(jit_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h.tmp)
 set(jit_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/operators/jit/kernels.h)
-file(WRITE  ${jit_file} "// Generated by the paddle/fluid/operators/jit/CMakeLists.txt.  DO NOT EDIT!\n\n")
+file(
+  WRITE ${jit_file}
+  "// Generated by the paddle/fluid/operators/jit/CMakeLists.txt.  DO NOT EDIT!\n\n"
+)
 file(APPEND ${jit_file} "\#pragma once\n")
 file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
-file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
+file(APPEND ${jit_file}
+     "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
 
 set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
 
-file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+file(
+  GLOB jit_kernel_cc_srcs
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*.cc")
 list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
-cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
+cc_library(
+  jit_kernel_base
+  SRCS ${jit_kernel_cc_srcs}
+  DEPS ${JIT_KERNEL_DEPS})
 
 copy_if_different(${jit_file} ${jit_file_final})
 
@@ -18,14 +27,27 @@ copy_if_different(${jit_file} ${jit_file_final})
 add_subdirectory(refer)
 add_subdirectory(more)
 if(WITH_XBYAK)
-    add_subdirectory(gen)
+  add_subdirectory(gen)
 endif()
 
-cc_library(jit_kernel_helper INTERFACE SRCS ${jit_kernel_cc_srcs} DEPS jit_kernel_base ${JIT_KERNEL_DEPS})
-cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
+cc_library(
+  jit_kernel_helper INTERFACE
+  SRCS ${jit_kernel_cc_srcs}
+  DEPS jit_kernel_base ${JIT_KERNEL_DEPS})
+cc_test(
+  jit_kernel_test
+  SRCS test.cc
+  DEPS jit_kernel_helper)
 if(NOT WIN32)
-    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor)
+  cc_binary(
+    jit_kernel_benchmark
+    SRCS
+    benchmark.cc
+    DEPS
+    jit_kernel_helper
+    device_tracer
+    tensor)
 endif()
 if(WITH_TESTING AND TEST jit_kernel_test)
-    set_tests_properties(jit_kernel_test PROPERTIES TIMEOUT 120)
+  set_tests_properties(jit_kernel_test PROPERTIES TIMEOUT 120)
 endif()
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index a8e441a96717d..3103a286772ce 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -324,8 +324,9 @@ void BenchKernelSgd() {
             "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.",
             static_cast<size_t>(upper - lower), (n - 1)));
     PADDLE_ENFORCE_GT(
-        n, 0, paddle::platform::errors::InvalidArgument(
-                  "The Sgd size should be larger than 0. But the n is %d.", n));
+        n, 0,
+        paddle::platform::errors::InvalidArgument(
+            "The Sgd size should be larger than 0. But the n is %d.", n));
     std::vector<int64_t> all, out;
     for (int i = 0; i < n; ++i) {
       all.push_back(i);
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index ab8829b7baf5f..60e29ea81d5eb 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -1,38 +1,45 @@
+file(
+  GLOB jitcode_cc_srcs
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*.cc")
 
-file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-
-cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
+cc_library(
+  jit_kernel_jitcode
+  SRCS ${jitcode_cc_srcs}
+  DEPS jit_kernel_base xbyak)
+set(JIT_KERNEL_DEPS
+    ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode
+    PARENT_SCOPE)
 
 function(USE_JITKERNEL_GEN TARGET)
-    file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
+  file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
 endfunction()
 
 # use gen jitcode kernel by name
-USE_JITKERNEL_GEN(kMatMul)
-USE_JITKERNEL_GEN(kVMul)
-USE_JITKERNEL_GEN(kVAdd)
-USE_JITKERNEL_GEN(kVSub)
-USE_JITKERNEL_GEN(kVAddRelu)
-USE_JITKERNEL_GEN(kVScal)
-USE_JITKERNEL_GEN(kVAddBias)
-USE_JITKERNEL_GEN(kVRelu)
-USE_JITKERNEL_GEN(kVSquare)
-USE_JITKERNEL_GEN(kVIdentity)
-USE_JITKERNEL_GEN(kVExp)
-USE_JITKERNEL_GEN(kVSigmoid)
-USE_JITKERNEL_GEN(kVTanh)
-USE_JITKERNEL_GEN(kLSTMCtHt)
-USE_JITKERNEL_GEN(kLSTMC1H1)
-USE_JITKERNEL_GEN(kGRUH1)
-USE_JITKERNEL_GEN(kGRUHtPart1)
-USE_JITKERNEL_GEN(kGRUHtPart2)
-USE_JITKERNEL_GEN(kNCHW16CMulNC)
-USE_JITKERNEL_GEN(kSeqPool)
-USE_JITKERNEL_GEN(kHMax)
-USE_JITKERNEL_GEN(kHSum)
-USE_JITKERNEL_GEN(kEmbSeqPool)
-USE_JITKERNEL_GEN(kAdam)
-USE_JITKERNEL_GEN(kAdamW)
-USE_JITKERNEL_GEN(kSgd)
-USE_JITKERNEL_GEN(kVBroadcast)
+use_jitkernel_gen(kMatMul)
+use_jitkernel_gen(kVMul)
+use_jitkernel_gen(kVAdd)
+use_jitkernel_gen(kVSub)
+use_jitkernel_gen(kVAddRelu)
+use_jitkernel_gen(kVScal)
+use_jitkernel_gen(kVAddBias)
+use_jitkernel_gen(kVRelu)
+use_jitkernel_gen(kVSquare)
+use_jitkernel_gen(kVIdentity)
+use_jitkernel_gen(kVExp)
+use_jitkernel_gen(kVSigmoid)
+use_jitkernel_gen(kVTanh)
+use_jitkernel_gen(kLSTMCtHt)
+use_jitkernel_gen(kLSTMC1H1)
+use_jitkernel_gen(kGRUH1)
+use_jitkernel_gen(kGRUHtPart1)
+use_jitkernel_gen(kGRUHtPart2)
+use_jitkernel_gen(kNCHW16CMulNC)
+use_jitkernel_gen(kSeqPool)
+use_jitkernel_gen(kHMax)
+use_jitkernel_gen(kHSum)
+use_jitkernel_gen(kEmbSeqPool)
+use_jitkernel_gen(kAdam)
+use_jitkernel_gen(kAdamW)
+use_jitkernel_gen(kSgd)
+use_jitkernel_gen(kVBroadcast)
diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc
index 677e9979399c5..5a73e3c56d511 100644
--- a/paddle/fluid/operators/jit/gen/act.cc
+++ b/paddle/fluid/operators/jit/gen/act.cc
@@ -122,9 +122,8 @@ bool VTanhCreator::CanBeUsed(const int& d) const {
 }
 
 size_t VReluCreator::CodeSize(const int& d) const {
-  return 96 /* init size */ +
-         (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
-             8 /* average bytes for each instruction */;
+  return 96 /* init size */ + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
+                                  8 /* average bytes for each instruction */;
 }
 
 size_t VSquareCreator::CodeSize(const int& d) const {
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index bd84368a57388..24434c5993bbb 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <type_traits>
+
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc
index 3b2139c9ed025..9c859229c5a88 100644
--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@@ -122,20 +122,23 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
   std::unique_ptr<GenBase> CreateJitCode(
       const matmul_attr_t& attr) const override {
     PADDLE_ENFORCE_GT(
-        attr.m, 0, platform::errors::InvalidArgument(
-                       "The attribute m (first matrix's row) of MatMul should "
-                       "be larger than 0. But it is %d.",
-                       attr.m));
+        attr.m, 0,
+        platform::errors::InvalidArgument(
+            "The attribute m (first matrix's row) of MatMul should "
+            "be larger than 0. But it is %d.",
+            attr.m));
     PADDLE_ENFORCE_GT(
-        attr.n, 0, platform::errors::InvalidArgument(
-                       "The attribute n (first matrix's col) of MatMul should "
-                       "be larger than 0. But it is %d.",
-                       attr.n));
+        attr.n, 0,
+        platform::errors::InvalidArgument(
+            "The attribute n (first matrix's col) of MatMul should "
+            "be larger than 0. But it is %d.",
+            attr.n));
     PADDLE_ENFORCE_GT(
-        attr.k, 0, platform::errors::InvalidArgument(
-                       "The attribute k (second matrix's col) of MatMul should "
-                       "be larger than 0. But it is %d.",
-                       attr.k));
+        attr.k, 0,
+        platform::errors::InvalidArgument(
+            "The attribute k (second matrix's col) of MatMul should "
+            "be larger than 0. But it is %d.",
+            attr.k));
     return make_unique<MatMulJitCode>(attr, CodeSize(attr));
   }
 };
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
index eb7328d7e069c..af62632634024 100644
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <stdlib.h>  // for malloc and free
+
 #include <string>
 #include <vector>
 
@@ -33,10 +34,11 @@ class MatMulJitCode : public JitCode {
                          size_t code_size = 256 * 1024,
                          void* code_ptr = nullptr)
       : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
-    PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented(
-                                 "Jitcode of matmul only support m==1 (first "
-                                 "matrix's row) now. But m is %d.",
-                                 m_));
+    PADDLE_ENFORCE_EQ(m_, 1,
+                      platform::errors::Unimplemented(
+                          "Jitcode of matmul only support m==1 (first "
+                          "matrix's row) now. But m is %d.",
+                          m_));
     this->genCode();
   }
 
diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc
index 52fdf04f3f677..4788050a14cd7 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
@@ -62,22 +62,23 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
     return platform::MayIUse(platform::avx);
   }
   size_t CodeSize(const seq_pool_attr_t& attr) const override {
-    return 96 +
-           ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
-                4 /* load, mul and save */ +
-            256) *
-               16;
+    return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
+                     4 /* load, mul and save */
+                 + 256) *
+                    16;
   }
   std::unique_ptr<GenBase> CreateJitCode(
       const seq_pool_attr_t& attr) const override {
-    PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument(
-                                     "The attribute width of SeqPool should "
-                                     "be larger than 0. But it is %d.",
-                                     attr.w));
-    PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument(
-                                     "The attribute height of SeqPool should "
-                                     "be larger than 0. But it is %d.",
-                                     attr.h));
+    PADDLE_ENFORCE_GT(attr.w, 0,
+                      platform::errors::InvalidArgument(
+                          "The attribute width of SeqPool should "
+                          "be larger than 0. But it is %d.",
+                          attr.w));
+    PADDLE_ENFORCE_GT(attr.h, 0,
+                      platform::errors::InvalidArgument(
+                          "The attribute height of SeqPool should "
+                          "be larger than 0. But it is %d.",
+                          attr.h));
     return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
   }
 };
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
index 5baafa11cfea0..2a3c347c16a25 100644
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/operators/jit/gen_base.h"
 
 #include <fstream>
+
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"  // for posix_memalign
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index c22a7f3ec9292..761c52b7d7c79 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -17,8 +17,8 @@
 #include <memory>  // for unique_ptr
 #include <string>
 #include <vector>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 DECLARE_bool(dump_jitcode);
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
index 46da6fba2e98a..07d69658632a6 100644
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -13,7 +13,9 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/helper.h"
+
 #include <numeric>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -112,10 +114,11 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
   int block, rest;
   const auto groups = packed_groups(n, k, &block, &rest);
   std::for_each(groups.begin(), groups.end(), [&](int i) {
-    PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument(
-                                "Each element of groups should be larger than "
-                                "0. However the element: %d doesn't satify.",
-                                i));
+    PADDLE_ENFORCE_GT(i, 0,
+                      platform::errors::InvalidArgument(
+                          "Each element of groups should be larger than "
+                          "0. However the element: %d doesn't satify.",
+                          i));
   });
   int sum = std::accumulate(groups.begin(), groups.end(), 0);
   std::memset(dst, 0, k * sum * block * sizeof(float));
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
index 9a48d9c3c8d6c..0389828b49537 100644
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <cstdint>
+
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/platform/macros.h"
 
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
index 4f652002bc745..528aec9ace1d3 100644
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/kernel_key.h"
+
 #include <xxhash.h>  // XXH64: 13.8 GB/s
 
 namespace paddle {
diff --git a/paddle/fluid/operators/jit/more/CMakeLists.txt b/paddle/fluid/operators/jit/more/CMakeLists.txt
index fa503356baa73..0851ca065b53d 100644
--- a/paddle/fluid/operators/jit/more/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/CMakeLists.txt
@@ -1,17 +1,18 @@
-
 function(USE_JITKERNEL_MORE TARGET TYPE)
-    file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
+  file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
 endfunction()
 
 if(WITH_MKLML)
-    add_subdirectory(mkl)
+  add_subdirectory(mkl)
 endif()
 
 if(WITH_AVX)
-    add_subdirectory(intrinsic)
+  add_subdirectory(intrinsic)
 endif()
 
 # mix should be last
 add_subdirectory(mix)
 
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE)
+set(JIT_KERNEL_DEPS
+    ${JIT_KERNEL_DEPS}
+    PARENT_SCOPE)
diff --git a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
index 468937a4f6b27..c6222c9b29b3b 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
@@ -1,9 +1,16 @@
+file(
+  GLOB jit_kernel_cc_intrinsic
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*.cc")
+cc_library(
+  jit_kernel_intrinsic
+  SRCS ${jit_kernel_cc_intrinsic}
+  DEPS jit_kernel_base)
 
-file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
+set(JIT_KERNEL_DEPS
+    ${JIT_KERNEL_DEPS} jit_kernel_intrinsic
+    PARENT_SCOPE)
 
 # use mkl kernels by name and type
-USE_JITKERNEL_MORE(kCRFDecoding, intrinsic)
-USE_JITKERNEL_MORE(kLayerNorm, intrinsic)
+use_jitkernel_more(kCRFDecoding, intrinsic)
+use_jitkernel_more(kLayerNorm, intrinsic)
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
index 7e1f7ab8bf8b0..f11a690523bf8 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
@@ -13,7 +13,9 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h"
+
 #include <limits>
+
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
index 61d8c50c56825..ef8fe6963c045 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
@@ -13,7 +13,9 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/more/intrinsic/layer_norm.h"
+
 #include <limits>
+
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
index dd039d2915296..b5bc6c8457577 100644
--- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
@@ -1,15 +1,21 @@
+file(
+  GLOB jit_kernel_mix_cc
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*.cc")
+cc_library(
+  jit_kernel_mix
+  SRCS ${jit_kernel_mix_cc}
+  DEPS jit_kernel_base)
 
+set(JIT_KERNEL_DEPS
+    ${JIT_KERNEL_DEPS} jit_kernel_mix
+    PARENT_SCOPE)
 
-file(GLOB jit_kernel_mix_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE)
-
-USE_JITKERNEL_MORE(kVSigmoid, mix)
-USE_JITKERNEL_MORE(kVTanh, mix)
-USE_JITKERNEL_MORE(kLSTMCtHt, mix)
-USE_JITKERNEL_MORE(kLSTMC1H1, mix)
-USE_JITKERNEL_MORE(kGRUH1, mix)
-USE_JITKERNEL_MORE(kGRUHtPart1, mix)
-USE_JITKERNEL_MORE(kGRUHtPart2, mix)
-USE_JITKERNEL_MORE(kSoftmax, mix)
+use_jitkernel_more(kVSigmoid, mix)
+use_jitkernel_more(kVTanh, mix)
+use_jitkernel_more(kLSTMCtHt, mix)
+use_jitkernel_more(kLSTMC1H1, mix)
+use_jitkernel_more(kGRUH1, mix)
+use_jitkernel_more(kGRUHtPart1, mix)
+use_jitkernel_more(kGRUHtPart2, mix)
+use_jitkernel_more(kSoftmax, mix)
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
index a4459cee5b8a3..f0008d4152f53 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/more/mix/mix.h"
+
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/jit/registry.h"
 
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
index 56f1a62ad4e06..609ddd3c284c8 100644
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -1,20 +1,24 @@
-
-cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml)
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE)
+cc_library(
+  jit_kernel_mkl
+  SRCS mkl.cc
+  DEPS jit_kernel_base dynload_mklml)
+set(JIT_KERNEL_DEPS
+    ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl
+    PARENT_SCOPE)
 
 # use mkl kernels by name and type
-USE_JITKERNEL_MORE(kMatMul, mkl)
-USE_JITKERNEL_MORE(kVMul, mkl)
-USE_JITKERNEL_MORE(kVAdd, mkl)
-USE_JITKERNEL_MORE(kVScal, mkl)
-USE_JITKERNEL_MORE(kStrideScal, mkl)
-USE_JITKERNEL_MORE(kVExp, mkl)
-USE_JITKERNEL_MORE(kVSquare, mkl)
-USE_JITKERNEL_MORE(kVCopy, mkl)
-USE_JITKERNEL_MORE(kVSigmoid, mkl)
-USE_JITKERNEL_MORE(kVTanh, mkl)
-USE_JITKERNEL_MORE(kSeqPool, mkl)
-USE_JITKERNEL_MORE(kSoftmax, mkl)
-USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
-USE_JITKERNEL_MORE(kSgd, mkl)
-USE_JITKERNEL_MORE(kVBroadcast, mkl)
+use_jitkernel_more(kMatMul, mkl)
+use_jitkernel_more(kVMul, mkl)
+use_jitkernel_more(kVAdd, mkl)
+use_jitkernel_more(kVScal, mkl)
+use_jitkernel_more(kStrideScal, mkl)
+use_jitkernel_more(kVExp, mkl)
+use_jitkernel_more(kVSquare, mkl)
+use_jitkernel_more(kVCopy, mkl)
+use_jitkernel_more(kVSigmoid, mkl)
+use_jitkernel_more(kVTanh, mkl)
+use_jitkernel_more(kSeqPool, mkl)
+use_jitkernel_more(kSoftmax, mkl)
+use_jitkernel_more(kEmbSeqPool, mkl)
+use_jitkernel_more(kSgd, mkl)
+use_jitkernel_more(kVBroadcast, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index 75ebddb125989..16bf045aa6671 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/more/mkl/mkl.h"
+
 #include "paddle/fluid/operators/jit/refer/refer.h"
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index 5f3c29ad5efb8..ad04b4618cb41 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -117,10 +117,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
             "The idx shoud be lower than the attribute table_height of "
             "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
             i, idx[i], attr->table_height));
-    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
-                                     "The idx shoud be equal to or larger than "
-                                     "the 0. But %dth of idx is %d.",
-                                     i, idx[i]));
+    PADDLE_ENFORCE_GE(idx[i], 0,
+                      platform::errors::InvalidArgument(
+                          "The idx shoud be equal to or larger than "
+                          "the 0. But %dth of idx is %d.",
+                          i, idx[i]));
   };
 
   for (int64_t w = 0; w != attr->index_width; ++w) {
@@ -204,11 +205,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
                             "less than the attribute. But %dth of rows "
                             "is %d and grad_width is %d.",
                             i, h_idx, attr->param_height));
-      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
-                                      "The rows of Sgd should be "
-                                      "larger than 0. But %dth of rows "
-                                      "is %d.",
-                                      i, h_idx));
+      PADDLE_ENFORCE_GE(
+          h_idx, 0,
+          platform::errors::InvalidArgument("The rows of Sgd should be "
+                                            "larger than 0. But %dth of rows "
+                                            "is %d.",
+                                            i, h_idx));
       VAXPY(scalar, grad + i * width, out + h_idx * width, width);
     }
   } else {
@@ -220,11 +222,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
                             "less than the attribute. But %dth of rows "
                             "is %d and grad_width is %d.",
                             i, h_idx, attr->param_height));
-      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
-                                      "The rows of Sgd should be "
-                                      "larger than 0. But %dth of rows "
-                                      "is %d.",
-                                      i, h_idx));
+      PADDLE_ENFORCE_GE(
+          h_idx, 0,
+          platform::errors::InvalidArgument("The rows of Sgd should be "
+                                            "larger than 0. But %dth of rows "
+                                            "is %d.",
+                                            i, h_idx));
       VScal(&scalar, grad + i * width, out + h_idx * width, width);
       VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
            width);
diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt
index a1ee4508f7241..5ef93f989df31 100644
--- a/paddle/fluid/operators/jit/refer/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt
@@ -1,42 +1,46 @@
-
-cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base)
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE)
+cc_library(
+  jit_kernel_refer
+  SRCS refer.cc
+  DEPS jit_kernel_base)
+set(JIT_KERNEL_DEPS
+    ${JIT_KERNEL_DEPS} jit_kernel_refer
+    PARENT_SCOPE)
 
 function(USE_JITKERNEL_REFER TARGET)
-    file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
+  file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
 endfunction()
 
 # use refer kernel by name
-USE_JITKERNEL_REFER(kVMul)
-USE_JITKERNEL_REFER(kVAdd)
-USE_JITKERNEL_REFER(kVAddRelu)
-USE_JITKERNEL_REFER(kVSub)
-USE_JITKERNEL_REFER(kVScal)
-USE_JITKERNEL_REFER(kStrideScal)
-USE_JITKERNEL_REFER(kVAddBias)
-USE_JITKERNEL_REFER(kVCopy)
-USE_JITKERNEL_REFER(kVRelu)
-USE_JITKERNEL_REFER(kVIdentity)
-USE_JITKERNEL_REFER(kVExp)
-USE_JITKERNEL_REFER(kVSigmoid)
-USE_JITKERNEL_REFER(kVTanh)
-USE_JITKERNEL_REFER(kLSTMCtHt)
-USE_JITKERNEL_REFER(kLSTMC1H1)
-USE_JITKERNEL_REFER(kGRUH1)
-USE_JITKERNEL_REFER(kGRUHtPart1)
-USE_JITKERNEL_REFER(kGRUHtPart2)
-USE_JITKERNEL_REFER(kCRFDecoding)
-USE_JITKERNEL_REFER(kLayerNorm)
-USE_JITKERNEL_REFER(kNCHW16CMulNC)
-USE_JITKERNEL_REFER(kSeqPool)
-USE_JITKERNEL_REFER(kMatMul)
-USE_JITKERNEL_REFER(kVSquare)
-USE_JITKERNEL_REFER(kHSum)
-USE_JITKERNEL_REFER(kHMax)
-USE_JITKERNEL_REFER(kStrideASum)
-USE_JITKERNEL_REFER(kSoftmax)
-USE_JITKERNEL_REFER(kEmbSeqPool)
-USE_JITKERNEL_REFER(kAdam)
-USE_JITKERNEL_REFER(kAdamW)
-USE_JITKERNEL_REFER(kSgd)
-USE_JITKERNEL_REFER(kVBroadcast)
+use_jitkernel_refer(kVMul)
+use_jitkernel_refer(kVAdd)
+use_jitkernel_refer(kVAddRelu)
+use_jitkernel_refer(kVSub)
+use_jitkernel_refer(kVScal)
+use_jitkernel_refer(kStrideScal)
+use_jitkernel_refer(kVAddBias)
+use_jitkernel_refer(kVCopy)
+use_jitkernel_refer(kVRelu)
+use_jitkernel_refer(kVIdentity)
+use_jitkernel_refer(kVExp)
+use_jitkernel_refer(kVSigmoid)
+use_jitkernel_refer(kVTanh)
+use_jitkernel_refer(kLSTMCtHt)
+use_jitkernel_refer(kLSTMC1H1)
+use_jitkernel_refer(kGRUH1)
+use_jitkernel_refer(kGRUHtPart1)
+use_jitkernel_refer(kGRUHtPart2)
+use_jitkernel_refer(kCRFDecoding)
+use_jitkernel_refer(kLayerNorm)
+use_jitkernel_refer(kNCHW16CMulNC)
+use_jitkernel_refer(kSeqPool)
+use_jitkernel_refer(kMatMul)
+use_jitkernel_refer(kVSquare)
+use_jitkernel_refer(kHSum)
+use_jitkernel_refer(kHMax)
+use_jitkernel_refer(kStrideASum)
+use_jitkernel_refer(kSoftmax)
+use_jitkernel_refer(kEmbSeqPool)
+use_jitkernel_refer(kAdam)
+use_jitkernel_refer(kAdamW)
+use_jitkernel_refer(kSgd)
+use_jitkernel_refer(kVBroadcast)
diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc
index 779d4c172b83c..9919f2d46dd8b 100644
--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ b/paddle/fluid/operators/jit/refer/refer.cc
@@ -13,6 +13,7 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/refer/refer.h"
+
 #include "paddle/fluid/operators/jit/registry.h"
 
 namespace refer = paddle::operators::jit::refer;
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index 79b2e174efc16..3f1e5b3235b25 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -481,10 +481,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
             "The idx shoud be lower than the attribute table_height of "
             "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
             i, idx[i], attr->table_height));
-    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
-                                     "The idx shoud be equal to or larger than "
-                                     "the 0. But %dth of idx is %d.",
-                                     i, idx[i]));
+    PADDLE_ENFORCE_GE(idx[i], 0,
+                      platform::errors::InvalidArgument(
+                          "The idx shoud be equal to or larger than "
+                          "the 0. But %dth of idx is %d.",
+                          i, idx[i]));
   };
 
   for (int64_t w = 0; w != attr->index_width; ++w) {
@@ -539,11 +540,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
                           "less than the attribute. But %dth of rows "
                           "is %d and grad_width is %d.",
                           i, h_idx, attr->param_height));
-    PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
-                                    "The rows of Sgd should be "
-                                    "larger than 0. But %dth of rows "
-                                    "is %d.",
-                                    i, h_idx));
+    PADDLE_ENFORCE_GE(
+        h_idx, 0,
+        platform::errors::InvalidArgument("The rows of Sgd should be "
+                                          "larger than 0. But %dth of rows "
+                                          "is %d.",
+                                          i, h_idx));
     for (int64_t j = 0; j < attr->grad_width; ++j) {
       out[h_idx * attr->grad_width + j] =
           param[h_idx * attr->grad_width + j] -
diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h
index 567a903236979..15d5e605b01bb 100644
--- a/paddle/fluid/operators/jit/registry.h
+++ b/paddle/fluid/operators/jit/registry.h
@@ -18,6 +18,7 @@
 #include <tuple>
 #include <type_traits>
 #include <utility>  // for std::move
+
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/operators/jit/kernel_pool.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index 74f2d62c64da9..27e816248ab38 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -286,48 +286,48 @@ void TestKernelLSTM() {
             ref(&step, &attr);
             VLOG(10) << attr;
 
-            auto verifier = [](
-                const typename KernelTuple::func_type tgt,
-                const std::vector<T>& xsrc, const std::vector<T>& wp,
-                const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
-                const std::vector<T>& ht_ref,
-                const typename KernelTuple::attr_type& attr) {
-              EXPECT_TRUE(tgt != nullptr);
-              EXPECT_EQ(ct_ref.size(), ht_ref.size());
-              EXPECT_EQ(ct_1.size(), ht_ref.size());
-              EXPECT_EQ(xsrc.size(), 4 * ht_ref.size());
-              EXPECT_EQ(wp.size(), 3 * ht_ref.size());
-
-              // x could be changed after compute, so copy to save src
-              int d = ht_ref.size();
-              std::vector<T> x(xsrc.size()), ct(ct_ref.size()),
-                  ht(ht_ref.size());
-              std::vector<T> checked(2 * d);
-              std::copy(xsrc.begin(), xsrc.end(), x.begin());
-
-              const T* ct_1_data = ct_1.data();
-              const T* wp_data = wp.data();
-              const T* ct_ref_data = ct_ref.data();
-              const T* ht_ref_data = ht_ref.data();
-              T* x_data = x.data();
-              T* ct_data = ct.data();
-              T* ht_data = ht.data();
-              T* checked_data = checked.data();
-
-              jit::lstm_t step;
-              step.gates = x_data;
-              step.ct_1 = ct_1_data;
-              step.ct = ct_data;
-              step.ht = ht_data;
-              if (attr.use_peephole) {
-                step.wp = wp_data;
-                step.checked = checked_data;
-              }
-
-              tgt(&step, &attr);
-              ExpectEQ<T>(ct_data, ct_ref_data, d);
-              ExpectEQ<T>(ht_data, ht_ref_data, d);
-            };
+            auto verifier =
+                [](const typename KernelTuple::func_type tgt,
+                   const std::vector<T>& xsrc, const std::vector<T>& wp,
+                   const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
+                   const std::vector<T>& ht_ref,
+                   const typename KernelTuple::attr_type& attr) {
+                  EXPECT_TRUE(tgt != nullptr);
+                  EXPECT_EQ(ct_ref.size(), ht_ref.size());
+                  EXPECT_EQ(ct_1.size(), ht_ref.size());
+                  EXPECT_EQ(xsrc.size(), 4 * ht_ref.size());
+                  EXPECT_EQ(wp.size(), 3 * ht_ref.size());
+
+                  // x could be changed after compute, so copy to save src
+                  int d = ht_ref.size();
+                  std::vector<T> x(xsrc.size()), ct(ct_ref.size()),
+                      ht(ht_ref.size());
+                  std::vector<T> checked(2 * d);
+                  std::copy(xsrc.begin(), xsrc.end(), x.begin());
+
+                  const T* ct_1_data = ct_1.data();
+                  const T* wp_data = wp.data();
+                  const T* ct_ref_data = ct_ref.data();
+                  const T* ht_ref_data = ht_ref.data();
+                  T* x_data = x.data();
+                  T* ct_data = ct.data();
+                  T* ht_data = ht.data();
+                  T* checked_data = checked.data();
+
+                  jit::lstm_t step;
+                  step.gates = x_data;
+                  step.ct_1 = ct_1_data;
+                  step.ct = ct_data;
+                  step.ht = ht_data;
+                  if (attr.use_peephole) {
+                    step.wp = wp_data;
+                    step.checked = checked_data;
+                  }
+
+                  tgt(&step, &attr);
+                  ExpectEQ<T>(ct_data, ct_ref_data, d);
+                  ExpectEQ<T>(ht_data, ht_ref_data, d);
+                };
             TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, wp, ct_1,
                                                  ct_ref, ht_ref, attr);
           }
@@ -484,41 +484,42 @@ void TestKernelLayerNorm() {
         ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
             left, epsilon, right);
 
-        auto verifier = [](
-            const typename KernelTuple::func_type tgt, const std::vector<T>& x_,
-            const std::vector<T>& outref_, const std::vector<T>& mean_,
-            const std::vector<T>& var_, const std::vector<T>& scale,
-            const std::vector<T>& bias, const int& left, const float& epsilon,
-            const typename KernelTuple::attr_type& right) {
-          EXPECT_TRUE(tgt != nullptr);
-          std::vector<T> outtgt(outref_.size());
-          std::vector<T> x(x_.size());
-          std::vector<T> mean(mean_.size());
-          std::vector<T> var(var_.size());
-          std::vector<T> outref(outref_.size());
-          std::copy(x_.begin(), x_.end(), x.begin());
-          std::copy(mean_.begin(), mean_.end(), mean.begin());
-          std::copy(var_.begin(), var_.end(), var.begin());
-          std::copy(outref_.begin(), outref_.end(), outref.begin());
-
-          EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
-          EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
-          EXPECT_EQ(mean.size(), static_cast<size_t>(left));
-          EXPECT_EQ(var.size(), static_cast<size_t>(left));
-          EXPECT_EQ(scale.size(), static_cast<size_t>(right));
-          EXPECT_EQ(bias.size(), static_cast<size_t>(right));
-
-          const T* scale_data = scale.data();
-          const T* bias_data = bias.data();
-          T* x_data = x.data();
-          T* mean_data = mean.data();
-          T* var_data = var.data();
-          T* outref_data = outref.data();
-          T* outtgt_data = outtgt.data();
-          tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data,
-              left, epsilon, right);
-          ExpectEQ<T>(outtgt_data, outref_data, left * right);
-        };
+        auto verifier =
+            [](const typename KernelTuple::func_type tgt,
+               const std::vector<T>& x_, const std::vector<T>& outref_,
+               const std::vector<T>& mean_, const std::vector<T>& var_,
+               const std::vector<T>& scale, const std::vector<T>& bias,
+               const int& left, const float& epsilon,
+               const typename KernelTuple::attr_type& right) {
+              EXPECT_TRUE(tgt != nullptr);
+              std::vector<T> outtgt(outref_.size());
+              std::vector<T> x(x_.size());
+              std::vector<T> mean(mean_.size());
+              std::vector<T> var(var_.size());
+              std::vector<T> outref(outref_.size());
+              std::copy(x_.begin(), x_.end(), x.begin());
+              std::copy(mean_.begin(), mean_.end(), mean.begin());
+              std::copy(var_.begin(), var_.end(), var.begin());
+              std::copy(outref_.begin(), outref_.end(), outref.begin());
+
+              EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
+              EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
+              EXPECT_EQ(mean.size(), static_cast<size_t>(left));
+              EXPECT_EQ(var.size(), static_cast<size_t>(left));
+              EXPECT_EQ(scale.size(), static_cast<size_t>(right));
+              EXPECT_EQ(bias.size(), static_cast<size_t>(right));
+
+              const T* scale_data = scale.data();
+              const T* bias_data = bias.data();
+              T* x_data = x.data();
+              T* mean_data = mean.data();
+              T* var_data = var.data();
+              T* outref_data = outref.data();
+              T* outtgt_data = outtgt.data();
+              tgt(x_data, outtgt_data, mean_data, var_data, scale_data,
+                  bias_data, left, epsilon, right);
+              ExpectEQ<T>(outtgt_data, outref_data, left * right);
+            };
         TestAllImpls<KernelTuple, PlaceType>(right, verifier, x, outref, mean,
                                              var, scale, bias, left, epsilon,
                                              right);
@@ -548,11 +549,12 @@ void TestKernelCRFDecoding() {
       ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
           trackref.data(), tag_num);
 
-      auto verifier = [](
-          const typename KernelTuple::func_type tgt, const int& seq_len,
-          const std::vector<T>& x, const std::vector<T>& w,
-          const std::vector<T>& alpharef, const std::vector<int>& trackref,
-          const typename KernelTuple::attr_type& tag_num) {
+      auto verifier = [](const typename KernelTuple::func_type tgt,
+                         const int& seq_len, const std::vector<T>& x,
+                         const std::vector<T>& w,
+                         const std::vector<T>& alpharef,
+                         const std::vector<int>& trackref,
+                         const typename KernelTuple::attr_type& tag_num) {
         constexpr int state_trans_base_idx = 2;
         EXPECT_TRUE(tgt != nullptr);
         EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
@@ -878,12 +880,13 @@ void TestKernelAdam() {
       mom2.data(), param.data(), mom1_out.data(), mom2_out.data(),
       param_out.data());
 
-  auto verifier = [](
-      const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps,
-      int64_t numel, const std::vector<T>& grad, const std::vector<T>& mom1,
-      const std::vector<T>& mom2, const std::vector<T>& param,
-      const std::vector<T>& ref_mom1_out, const std::vector<T>& ref_mom2_out,
-      const std::vector<T>& ref_param_out) {
+  auto verifier = [](const typename KernelTuple::func_type tgt, T beta1,
+                     T beta2, T lr, T eps, int64_t numel,
+                     const std::vector<T>& grad, const std::vector<T>& mom1,
+                     const std::vector<T>& mom2, const std::vector<T>& param,
+                     const std::vector<T>& ref_mom1_out,
+                     const std::vector<T>& ref_mom2_out,
+                     const std::vector<T>& ref_param_out) {
     EXPECT_TRUE(tgt != nullptr);
     EXPECT_EQ(param.size(), static_cast<size_t>(numel));
     EXPECT_EQ(grad.size(), static_cast<size_t>(numel));
@@ -944,30 +947,31 @@ void TestKernelAdamW() {
       grad.data(), mom1.data(), mom2.data(), param.data(), mom1_out.data(),
       mom2_out.data(), param_out.data());
 
-  auto verifier = [](
-      const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps,
-      T old_lr, T lr_ratio, T coeff, int64_t numel, const std::vector<T>& grad,
-      const std::vector<T>& mom1, const std::vector<T>& mom2,
-      const std::vector<T>& param, const std::vector<T>& ref_mom1_out,
-      const std::vector<T>& ref_mom2_out, const std::vector<T>& ref_param_out) {
-    EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(param.size(), static_cast<size_t>(numel));
-    EXPECT_EQ(grad.size(), static_cast<size_t>(numel));
-    EXPECT_EQ(mom1.size(), static_cast<size_t>(numel));
-    EXPECT_EQ(mom2.size(), static_cast<size_t>(numel));
-
-    std::vector<T> jit_mom1_out(ref_mom1_out.size());
-    std::vector<T> jit_mom2_out(ref_mom2_out.size());
-    std::vector<T> jit_param_out(ref_param_out.size());
-
-    tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(),
-        mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(),
-        jit_mom2_out.data(), jit_param_out.data());
-
-    ExpectEQ<T>(ref_mom1_out.data(), jit_mom1_out.data(), numel);
-    ExpectEQ<T>(ref_mom2_out.data(), jit_mom2_out.data(), numel);
-    ExpectEQ<T>(ref_param_out.data(), jit_param_out.data(), numel);
-  };
+  auto verifier =
+      [](const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr,
+         T eps, T old_lr, T lr_ratio, T coeff, int64_t numel,
+         const std::vector<T>& grad, const std::vector<T>& mom1,
+         const std::vector<T>& mom2, const std::vector<T>& param,
+         const std::vector<T>& ref_mom1_out, const std::vector<T>& ref_mom2_out,
+         const std::vector<T>& ref_param_out) {
+        EXPECT_TRUE(tgt != nullptr);
+        EXPECT_EQ(param.size(), static_cast<size_t>(numel));
+        EXPECT_EQ(grad.size(), static_cast<size_t>(numel));
+        EXPECT_EQ(mom1.size(), static_cast<size_t>(numel));
+        EXPECT_EQ(mom2.size(), static_cast<size_t>(numel));
+
+        std::vector<T> jit_mom1_out(ref_mom1_out.size());
+        std::vector<T> jit_mom2_out(ref_mom2_out.size());
+        std::vector<T> jit_param_out(ref_param_out.size());
+
+        tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(),
+            mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(),
+            jit_mom2_out.data(), jit_param_out.data());
+
+        ExpectEQ<T>(ref_mom1_out.data(), jit_mom1_out.data(), numel);
+        ExpectEQ<T>(ref_mom2_out.data(), jit_mom2_out.data(), numel);
+        ExpectEQ<T>(ref_param_out.data(), jit_param_out.data(), numel);
+      };
 
   TestAllImpls<KernelTuple, PlaceType>(
       1, verifier, beta1, beta2, learning_rate, eps, old_lr, lr_ratio, coeff,
@@ -988,8 +992,9 @@ void TestKernelSgd() {
                           "and n-1 is %d.",
                           static_cast<size_t>(upper - lower), n - 1));
     PADDLE_ENFORCE_GT(
-        n, 0, paddle::platform::errors::InvalidArgument(
-                  "The Sgd size should be larger than 0. But the n is %d.", n));
+        n, 0,
+        paddle::platform::errors::InvalidArgument(
+            "The Sgd size should be larger than 0. But the n is %d.", n));
     std::vector<int64_t> all, out;
     for (int i = 0; i < n; ++i) {
       all.push_back(i);
@@ -1031,11 +1036,12 @@ void TestKernelSgd() {
                       grad_w);
         }
 
-        auto verifier = [](
-            const typename KernelTuple::func_type tgt, const T lr,
-            const std::vector<T>& param, const std::vector<T>& grad,
-            const std::vector<int64_t>& rows, const std::vector<T>& oref,
-            const typename KernelTuple::attr_type& attr) {
+        auto verifier = [](const typename KernelTuple::func_type tgt,
+                           const T lr, const std::vector<T>& param,
+                           const std::vector<T>& grad,
+                           const std::vector<int64_t>& rows,
+                           const std::vector<T>& oref,
+                           const typename KernelTuple::attr_type& attr) {
           EXPECT_TRUE(tgt != nullptr);
           EXPECT_EQ(param.size(),
                     static_cast<size_t>(attr.param_height * attr.param_width));
diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
index 169befc88f28d..82de4c82d1121 100644
--- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
@@ -19,4 +19,4 @@ namespace paddle {
 namespace operators {
 namespace kernel_primitives = phi::kps;
 }
-}
+}  // namespace paddle
diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc
index 67c1942ea0b41..8597c21b3ec97 100644
--- a/paddle/fluid/operators/kldiv_loss_op.cc
+++ b/paddle/fluid/operators/kldiv_loss_op.cc
@@ -11,6 +11,7 @@
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/binary.h"
diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc
index eac181489aa9d..41499f3f7bf8b 100644
--- a/paddle/fluid/operators/kldiv_loss_op_npu.cc
+++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the Licnse. */
 
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc
index 4c679d3026386..1ff9ab796e9d9 100644
--- a/paddle/fluid/operators/kthvalue_op.cc
+++ b/paddle/fluid/operators/kthvalue_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
index ddd0554add510..7a6a28a33c13c 100644
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/l1_norm_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index 7e07610db2875..e14e61006478e 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h
index 5b5ddddaafb24..ac20a5962f394 100644
--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -36,8 +36,6 @@ using CudnnDataType = platform::CudnnDataType<T>;
 template <typename T>
 using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
-#define LN_NUM_COLS 1024
-
 inline static int GetDesiredBlockDim(int64_t block_dim) {
 #ifdef __HIPCC__
   const int kMaxBlockDim = 256;
@@ -183,11 +181,12 @@ template <typename T, typename U, typename ScaleT = U, int VecSize = 8,
           int ROWS_PER_CTA = WARPS_M,
           int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize,
           int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA>
-__global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
+__global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
     int rows, int cols, const float epsilon, const T *__restrict__ x_ptr,
     const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr,
     U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr,
     T *__restrict__ y_ptr) {
+  __shared__ U smem[WARPS_M * WARPS_N];
   using Vec = phi::AlignedVector<T, VecSize>;
   using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
 
@@ -210,12 +209,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
     col += THREADS_PER_ROW;
   }
 
-  constexpr U rn = 1.f / U(LN_NUM_COLS);
+  constexpr U rn = 1.f / U(ELTS_PER_ROW);
   for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) {
     Vec x[LDGS];
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
-      phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
+      phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]);
       col += THREADS_PER_ROW;
     }
     U xf[LDGS * VecSize];
@@ -235,6 +234,23 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
     for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
       mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it);
     }
+    if (WARPS_N > 1) {
+      if (lane == 0) {
+        smem[warp_m * WARPS_N + warp_n] = mu_local;
+      }
+      __syncthreads();
+      if (tidx == 0) {
+        mu_local = 0.f;
+#pragma unroll
+        for (int it = 0; it < WARPS_N; ++it) {
+          mu_local += smem[warp_m * WARPS_N + it];
+        }
+        smem[warp_m] = mu_local;
+      }
+      __syncthreads();
+      mu_local = smem[warp_m];
+    }
+
     mu_local *= rn;
     if (lane == 0) {
       mean_out_ptr[row] = mu_local;
@@ -254,6 +270,24 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
     for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
       var_local += __shfl_xor_sync(uint32_t(-1), var_local, it);
     }
+
+    if (WARPS_N > 1) {
+      if (lane == 0) {
+        smem[warp_m * WARPS_N + warp_n] = var_local;
+      }
+      __syncthreads();
+      if (tidx == 0) {
+        var_local = 0.f;
+#pragma unroll
+        for (int it = 0; it < WARPS_N; ++it) {
+          var_local += smem[warp_m * WARPS_N + it];
+        }
+        smem[warp_m] = var_local;
+      }
+      __syncthreads();
+      var_local = smem[warp_m];
+    }
+
     // Note: to assure if it is right for double
     U rsigma = rsqrtf(var_local * rn + epsilon);
     if (lane == 0) {
@@ -277,7 +311,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
 
 #pragma unroll
     for (int it = 0, col = c; it < LDGS; it++) {
-      phi::Store<T, VecSize>(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize);
+      phi::Store<T, VecSize>(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize);
       col += THREADS_PER_ROW;
     }
   }
@@ -416,10 +450,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
   const int r = bidx * ROWS_PER_CTA + warp_m;
   const int c = warp_n * THREADS_PER_WARP + lane;
 
-  static_assert(LN_NUM_COLS == THREADS_PER_ROW * LDGS * VecSize, "");
+  static_assert(ELTS_PER_ROW == THREADS_PER_ROW * LDGS * VecSize, "");
 
   // smem for column reduction
-  __shared__ U smem_[ROWS_PER_CTA * LN_NUM_COLS];
+  __shared__ U smem_[ROWS_PER_CTA * ELTS_PER_ROW];
 
   U dgamma_sum[LDGS * VecSize];
   U dbeta_sum[LDGS * VecSize];
@@ -434,7 +468,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
   U *sum_loss2_shared = &smem_sum_loss2[warp_m * WARPS_N];
 
   // step-1: compute dx and local results of dscale and dbias
-  constexpr float rn = 1.f / static_cast<float>(LN_NUM_COLS);
+  constexpr float rn = 1.f / static_cast<float>(ELTS_PER_ROW);
   Vec_scale gamma[LDGS];
   int col = c;
 #pragma unroll
@@ -452,12 +486,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
     int col = c;
 #pragma unroll
     for (int it = 0; it < LDGS; it++) {
-      phi::Load<T, VecSize>(dout_ptr + row * LN_NUM_COLS + col * VecSize,
+      phi::Load<T, VecSize>(dout_ptr + row * ELTS_PER_ROW + col * VecSize,
                             &dout[it]);
-      phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
+      phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]);
       if (isFusedDropoutResidualLn) {
         phi::Load<MaskType, VecSize>(
-            mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]);
+            mask_ptr + row * ELTS_PER_ROW + col * VecSize, &mask_vec[it]);
       }
 
       col += THREADS_PER_ROW;
@@ -551,10 +585,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
     col = c;
 #pragma unroll
     for (int it = 0; it < LDGS; it++) {
-      phi::Store<T, VecSize>(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize);
+      phi::Store<T, VecSize>(x[it],
+                             dx_ptr + row * ELTS_PER_ROW + col * VecSize);
       if (isFusedDropoutResidualLn) {
         phi::Store<T, VecSize>(
-            dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize);
+            dout[it], d_dropout_src_ptr + row * ELTS_PER_ROW + col * VecSize);
       }
       col += THREADS_PER_ROW;
     }
@@ -562,12 +597,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
 
   // step-2: column reduction of dscale and dbias for each thread block.
   // each block's sum: [4 * 1024] -> [1 * 1024]
-  enum { NUM_RES = LN_NUM_COLS / THREADS_PER_CTA };  // 1024/128 = 8
-  static_assert(NUM_RES * THREADS_PER_CTA == LN_NUM_COLS, "");
+  enum { NUM_RES = ELTS_PER_ROW / THREADS_PER_CTA };  // 1024/128 = 8
+  static_assert(NUM_RES * THREADS_PER_CTA == ELTS_PER_ROW, "");
 
   U *smem_write;
 
-  smem_write = &smem_[warp_m * LN_NUM_COLS + tid_r * VecSize];  // [4 * 1024]
+  smem_write = &smem_[warp_m * ELTS_PER_ROW + tid_r * VecSize];  // [4 * 1024]
 #pragma unroll
   for (int it = 0; it < LDGS; it++) {
 #pragma unroll
@@ -583,12 +618,12 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
   for (int it = 0; it < ROWS_PER_CTA; it++) {
     for (int jt = 0; jt < NUM_RES; jt++) {
       cta_dbeta_sum[jt] +=
-          smem_[it * LN_NUM_COLS + tidx + jt * THREADS_PER_CTA];
+          smem_[it * ELTS_PER_ROW + tidx + jt * THREADS_PER_CTA];
     }
   }
   __syncthreads();
 
-  smem_write = &smem_[warp_m * LN_NUM_COLS + tid_r * VecSize];
+  smem_write = &smem_[warp_m * ELTS_PER_ROW + tid_r * VecSize];
 #pragma unroll
   for (int it = 0; it < LDGS; it++) {
 #pragma unroll
@@ -603,19 +638,19 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
   for (int it = 0; it < ROWS_PER_CTA; it++) {
     for (int jt = 0; jt < NUM_RES; jt++) {
       cta_dgamma_sum[jt] +=
-          smem_[it * LN_NUM_COLS + tidx + jt * THREADS_PER_CTA];
+          smem_[it * ELTS_PER_ROW + tidx + jt * THREADS_PER_CTA];
     }
   }
 
   // the shape of results：(#blocks, 1024)
   U *dgamma_part =
-      static_cast<U *>(dgamma_temp_ptr) + bidx * LN_NUM_COLS + tidx;
+      static_cast<U *>(dgamma_temp_ptr) + bidx * ELTS_PER_ROW + tidx;
   for (int jt = 0; jt < NUM_RES; jt++) {
     *dgamma_part = cta_dgamma_sum[jt];
     dgamma_part += THREADS_PER_CTA;
   }
 
-  U *dbeta_part = static_cast<U *>(dbeta_temp_ptr) + bidx * LN_NUM_COLS + tidx;
+  U *dbeta_part = static_cast<U *>(dbeta_temp_ptr) + bidx * ELTS_PER_ROW + tidx;
   for (int jt = 0; jt < NUM_RES; jt++) {
     *dbeta_part = cta_dbeta_sum[jt];
     dbeta_part += THREADS_PER_CTA;
@@ -626,7 +661,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
  * output is [1, 1024].
  * #blocks: 32
  * #threads: 512
-*/
+ */
 // todo(@limin29): to think if there are better impl strategies
 template <
     typename U, typename ScaleT = U, int VecSize = 1, int WARPS_M = 16,
@@ -640,7 +675,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
     const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_,
     ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) {
   using Vec = phi::AlignedVector<U, VecSize>;
-  static_assert(VEC_COLS == LN_NUM_COLS / VecSize, "");
+  static_assert(VEC_COLS == ELTS_PER_ROW / VecSize, "");
 
   const int tidx = threadIdx.x;
   const int bidx = blockIdx.x;
@@ -656,8 +691,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
   __shared__ U smem_space[(WARPS_M - 1) * THREADS_PER_ROW * VecSize];
 
   for (int col = c; col < VEC_COLS; col += gridDim.x * THREADS_PER_ROW) {
-    const U *dg_part_ptr = (dg_part_) + r * LN_NUM_COLS + col * VecSize;
-    const U *db_part_ptr = (db_part_) + r * LN_NUM_COLS + col * VecSize;
+    const U *dg_part_ptr = (dg_part_) + r * ELTS_PER_ROW + col * VecSize;
+    const U *db_part_ptr = (db_part_) + r * ELTS_PER_ROW + col * VecSize;
 
     U dg_sum[VecSize];
     U db_sum[VecSize];
@@ -669,8 +704,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
       Vec db;
       phi::Load<U, VecSize>(dg_part_ptr, &dg);
       phi::Load<U, VecSize>(db_part_ptr, &db);
-      dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
-      db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
+      dg_part_ptr += ROWS_PER_CTA * ELTS_PER_ROW;
+      db_part_ptr += ROWS_PER_CTA * ELTS_PER_ROW;
 
 #pragma unroll
       for (int jt = 0; jt < VecSize; jt++) {
@@ -748,16 +783,16 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
 }
 
 /* This function support two kinds of computations (only for float and fp16
-* type):
-*
-* Case-1: compute layer_norm_grad for layernorm op by setting mask_ptr and
-* d_dropout_src_ptr to nullptr. Here, d_x_ptr returns the grad of layernorm
-* input.
-*
-* Case-2: compute layer_norm_grad + residual_grad + dropout_grad for
-* fused_dropout_residual_layernorm op. Here, dx_ptr returns residual_grad.
-*
-*/
+ * type):
+ *
+ * Case-1: compute layer_norm_grad for layernorm op by setting mask_ptr and
+ * d_dropout_src_ptr to nullptr. Here, d_x_ptr returns the grad of layernorm
+ * input.
+ *
+ * Case-2: compute layer_norm_grad + residual_grad + dropout_grad for
+ * fused_dropout_residual_layernorm op. Here, dx_ptr returns residual_grad.
+ *
+ */
 template <typename T, typename U, typename ScaleT = U,
           typename MaskType = uint8_t>
 void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows,
@@ -804,19 +839,19 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows,
             "To compute fused_dropout_residual_ln grad, d_dropout_src_ptr "
             "can't be null"));
       }
-      fused_ln_bwd_1024_kernel<
-          true, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N,
-          BYTES_PER_LDG><<<gridx, THREADS_PER_CTA, 0, stream>>>(
-          rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr,
-          dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor,
-          d_dropout_src_ptr);
+      fused_ln_bwd_1024_kernel<true, T, U, ScaleT, MaskType, VecSize, WARPS_M,
+                               WARPS_N, BYTES_PER_LDG>
+          <<<gridx, THREADS_PER_CTA, 0, stream>>>(
+              rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr,
+              dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor,
+              d_dropout_src_ptr);
 
     } else {
-      fused_ln_bwd_1024_kernel<
-          false, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N,
-          BYTES_PER_LDG><<<gridx, THREADS_PER_CTA, 0, stream>>>(
-          rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr,
-          dscale_temp_ptr, dbias_temp_ptr, dx_ptr);
+      fused_ln_bwd_1024_kernel<false, T, U, ScaleT, MaskType, VecSize, WARPS_M,
+                               WARPS_N, BYTES_PER_LDG>
+          <<<gridx, THREADS_PER_CTA, 0, stream>>>(
+              rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr,
+              dscale_temp_ptr, dbias_temp_ptr, dx_ptr);
     }
     const int WARPS_M_2 = 16;
     const int WARPS_N_2 = 1;
@@ -838,10 +873,10 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows,
       PADDLE_THROW(platform::errors::InvalidArgument(
           "Only support float and fp16 type"));
     } else {
-      ln_bwd_1024_final_kernel<
-          U, ScaleT, VecSize_2, WARPS_M_2, WARPS_N_2,
-          BYTES_PER_LDG_2><<<gridx_2, THREADS_PER_CTA_2, 0, stream>>>(
-          gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr);
+      ln_bwd_1024_final_kernel<U, ScaleT, VecSize_2, WARPS_M_2, WARPS_N_2,
+                               BYTES_PER_LDG_2>
+          <<<gridx_2, THREADS_PER_CTA_2, 0, stream>>>(
+              gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr);
     }
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
@@ -1352,16 +1387,17 @@ static void LayerNormBackward(
   if (gradient_flag == 0) return;
 
   if (batch_size == 1) {
-    LayerNormBackwardWhenBatchSizeIsOne<T, U, ScaleBiasWithSameTypeX><<<
-        (feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0,
-        stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon,
-                  feature_size);
+    LayerNormBackwardWhenBatchSizeIsOne<T, U, ScaleBiasWithSameTypeX>
+        <<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0,
+           stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon,
+                     feature_size);
 
     if (d_x != nullptr) {
       switch (GetDesiredBlockDim(feature_size)) {
-        FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX<
-                             T, U, kBlockDim><<<1, kBlockDim, 0, stream>>>(
-            x, d_x, mean, var, epsilon, feature_size));
+        FIXED_BLOCK_DIM_CASE(
+            LayerNormBackwardPostProcessToCalculateDX<T, U, kBlockDim>
+            <<<1, kBlockDim, 0, stream>>>(x, d_x, mean, var, epsilon,
+                                          feature_size));
       }
     }
     return;
@@ -1373,9 +1409,9 @@ static void LayerNormBackward(
       switch (block_dim) {
         FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
             feature_size, kMaxBlockNum,
-            LayerNormBackwardGradientScaleOrBias<
-                T, U, kBlockDim, false, false,
-                ScaleBiasWithSameTypeX><<<block_num, kBlockDim, 0, stream>>>(
+            LayerNormBackwardGradientScaleOrBias<T, U, kBlockDim, false, false,
+                                                 ScaleBiasWithSameTypeX>
+            <<<block_num, kBlockDim, 0, stream>>>(
                 x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
                 batch_size, feature_size, col_offset));
       }
@@ -1384,9 +1420,9 @@ static void LayerNormBackward(
       switch (block_dim) {
         FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
             feature_size, kMaxBlockNum,
-            LayerNormBackwardGradientScaleOrBias<
-                T, U, kBlockDim, false, true,
-                ScaleBiasWithSameTypeX><<<block_num, kBlockDim, 0, stream>>>(
+            LayerNormBackwardGradientScaleOrBias<T, U, kBlockDim, false, true,
+                                                 ScaleBiasWithSameTypeX>
+            <<<block_num, kBlockDim, 0, stream>>>(
                 x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
                 batch_size, feature_size, col_offset));
       }
@@ -1395,9 +1431,9 @@ static void LayerNormBackward(
       switch (block_dim) {
         FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
             feature_size, kMaxBlockNum,
-            LayerNormBackwardGradientAll<
-                T, U, kBlockDim, false,
-                ScaleBiasWithSameTypeX><<<block_num, kBlockDim, 0, stream>>>(
+            LayerNormBackwardGradientAll<T, U, kBlockDim, false,
+                                         ScaleBiasWithSameTypeX>
+            <<<block_num, kBlockDim, 0, stream>>>(
                 x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
                 batch_size, feature_size, col_offset));
       }
@@ -1405,9 +1441,9 @@ static void LayerNormBackward(
     case 4:  // d_x != nullptr, d_scale == nullptr, d_bias == nullptr
       switch (GetDesiredBlockDim(feature_size)) {
         FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardGradientOnlyDX<
-                T, U, kBlockDim,
-                ScaleBiasWithSameTypeX><<<batch_size, kBlockDim, 0, stream>>>(
+            LayerNormBackwardGradientOnlyDX<T, U, kBlockDim,
+                                            ScaleBiasWithSameTypeX>
+            <<<batch_size, kBlockDim, 0, stream>>>(
                 x, d_y, d_x, mean, var, scale, epsilon, feature_size));
       }
       break;
@@ -1415,34 +1451,34 @@ static void LayerNormBackward(
       switch (block_dim) {
         FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
             feature_size, kMaxBlockNum,
-            LayerNormBackwardGradientScaleOrBias<
-                T, U, kBlockDim, true, false,
-                ScaleBiasWithSameTypeX><<<block_num, kBlockDim, 0, stream>>>(
+            LayerNormBackwardGradientScaleOrBias<T, U, kBlockDim, true, false,
+                                                 ScaleBiasWithSameTypeX>
+            <<<block_num, kBlockDim, 0, stream>>>(
                 x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
                 batch_size, feature_size, col_offset));
       }
       switch (GetDesiredBlockDim(feature_size)) {
         FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardPostProcessToCalculateDX<
-                T, U, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-                x, d_x, mean, var, epsilon, feature_size));
+            LayerNormBackwardPostProcessToCalculateDX<T, U, kBlockDim>
+            <<<batch_size, kBlockDim, 0, stream>>>(x, d_x, mean, var, epsilon,
+                                                   feature_size));
       }
       break;
     case 6:  // d_x != nullptr, d_scale != nullptr, d_bias == nullptr
       switch (block_dim) {
         FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(
             feature_size, kMaxBlockNum,
-            LayerNormBackwardGradientScaleOrBias<
-                T, U, kBlockDim, true, true,
-                ScaleBiasWithSameTypeX><<<block_num, kBlockDim, 0, stream>>>(
+            LayerNormBackwardGradientScaleOrBias<T, U, kBlockDim, true, true,
+                                                 ScaleBiasWithSameTypeX>
+            <<<block_num, kBlockDim, 0, stream>>>(
                 x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon,
                 batch_size, feature_size, col_offset));
       }
       switch (GetDesiredBlockDim(feature_size)) {
         FIXED_BLOCK_DIM_CASE(
-            LayerNormBackwardPostProcessToCalculateDX<
-                T, U, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-                x, d_x, mean, var, epsilon, feature_size));
+            LayerNormBackwardPostProcessToCalculateDX<T, U, kBlockDim>
+            <<<batch_size, kBlockDim, 0, stream>>>(x, d_x, mean, var, epsilon,
+                                                   feature_size));
       }
       break;
     case 7:  // d_x != nullptr, d_scale != nullptr, d_bias != nullptr
@@ -1476,29 +1512,30 @@ static void LayerNormBackward(
         U *part_grad_gamma = reinterpret_cast<U *>(part_grad_gamma_ptr->ptr());
         U *part_grad_beta = reinterpret_cast<U *>(part_grad_beta_ptr->ptr());
 
-        LayerNormBackwardPartGradGammaBeta<
-            T, U, BDIMX2, BDIMY2, VPT><<<blocks2, threads2, 0, stream>>>(
-            d_y, x, batch_size, feature_size, mean, var, epsilon,
-            part_grad_gamma,
-            part_grad_beta);  // compute part_grad_gamma, beta
+        LayerNormBackwardPartGradGammaBeta<T, U, BDIMX2, BDIMY2, VPT>
+            <<<blocks2, threads2, 0, stream>>>(
+                d_y, x, batch_size, feature_size, mean, var, epsilon,
+                part_grad_gamma,
+                part_grad_beta);  // compute part_grad_gamma, beta
 
         constexpr int BDIMX3 = 32;
         constexpr int BDIMY3 = 8;
         dim3 threads3(BDIMX3, BDIMY3, 1);
         const dim3 blocks3((feature_size + BDIMX2 - 1) / BDIMX2, 1, 1);
-        LayerNormBackwardSumGradGammaBeta<
-            T, U, BDIMX3, BDIMY3,
-            ScaleBiasWithSameTypeX><<<blocks3, threads3, 0, stream>>>(
-            part_grad_gamma, part_grad_beta, part_size, batch_size,
-            feature_size, d_scale, d_bias);
+        LayerNormBackwardSumGradGammaBeta<T, U, BDIMX3, BDIMY3,
+                                          ScaleBiasWithSameTypeX>
+            <<<blocks3, threads3, 0, stream>>>(part_grad_gamma, part_grad_beta,
+                                               part_size, batch_size,
+                                               feature_size, d_scale, d_bias);
 
         constexpr int BDIMX1 = 32;
         constexpr int BDIMY1 = 4;
         dim3 threads1(BDIMX1, BDIMY1, 1);
-        LayerNormBackwardComputeGradInput<
-            T, U, BDIMX1, BDIMY1,
-            ScaleBiasWithSameTypeX><<<batch_size, threads1, 0, stream>>>(
-            d_y, x, batch_size, feature_size, mean, var, epsilon, scale, d_x);
+        LayerNormBackwardComputeGradInput<T, U, BDIMX1, BDIMY1,
+                                          ScaleBiasWithSameTypeX>
+            <<<batch_size, threads1, 0, stream>>>(d_y, x, batch_size,
+                                                  feature_size, mean, var,
+                                                  epsilon, scale, d_x);
 #ifdef PADDLE_WITH_CUDA
       }
 #endif
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index 224ab748dab6c..3d1e563ef1aca 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 #ifdef PADDLE_WITH_MKLDNN
diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc
index 3b21a55f8df0d..a27952c57f7fa 100644
--- a/paddle/fluid/operators/layer_norm_op_xpu.cc
+++ b/paddle/fluid/operators/layer_norm_op_xpu.cc
@@ -88,8 +88,9 @@ class LayerNormGradXPUKernel : public framework::OpKernel<T> {
     auto* dscale_data =
         (dscale == nullptr ? nullptr
                            : dscale->mutable_data<float>(ctx.GetPlace()));
-    auto* dbias_data = (dbias == nullptr ? nullptr : dbias->mutable_data<float>(
-                                                         ctx.GetPlace()));
+    auto* dbias_data =
+        (dbias == nullptr ? nullptr
+                          : dbias->mutable_data<float>(ctx.GetPlace()));
     auto* dx_data =
         (dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()));
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h
index e304f33d0455a..f058afdb4adc3 100644
--- a/paddle/fluid/operators/layout_utils.h
+++ b/paddle/fluid/operators/layout_utils.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index 7308363b9fe0d..a6ef87d43e2d4 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -28,9 +28,10 @@ static inline T NormalizeL1(T* x, size_t len) {
   // Right now, we just bet that sum won't be zero. If this really happens, we
   // will figure out what should be done then.
   PADDLE_ENFORCE_GT(
-      sum, 0., platform::errors::InvalidArgument(
-                   "The unnormalized probabilities of all possible unfinished "
-                   "sequences must be greater than 0."));
+      sum, 0.,
+      platform::errors::InvalidArgument(
+          "The unnormalized probabilities of all possible unfinished "
+          "sequences must be greater than 0."));
   T s = 1. / sum;
   for (size_t i = 0; i < len; ++i) x[i] *= s;
   return sum;
@@ -44,8 +45,8 @@ struct ScalarMul {
   T scalar;
 };
 
-using framework::LoDTensor;
 using framework::LoD;
+using framework::LoDTensor;
 using framework::Tensor;
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
index e9375be1706eb..5e451d99dbc85 100644
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -77,10 +77,9 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     LinspaceInferShapeFunctor);
 
-REGISTER_OP_VERSION(linspace)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(linspace).AddCheckpoint(
+    R"ROC(
       Upgrade linspace to add a new attribute [dtype].
     )ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "dtype", "In order to change output data type ", 5));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "dtype", "In order to change output data type ", 5));
diff --git a/paddle/fluid/operators/lite/CMakeLists.txt b/paddle/fluid/operators/lite/CMakeLists.txt
index 5bb7892590848..3955c6e322b0e 100644
--- a/paddle/fluid/operators/lite/CMakeLists.txt
+++ b/paddle/fluid/operators/lite/CMakeLists.txt
@@ -1,2 +1,5 @@
 op_library(lite_engine_op DEPS lite_engine lite_tensor_utils)
-cc_test(test_lite_engine_op SRCS lite_engine_op_test.cc DEPS lite_engine_op analysis)
+cc_test(
+  test_lite_engine_op
+  SRCS lite_engine_op_test.cc
+  DEPS lite_engine_op analysis)
diff --git a/paddle/fluid/operators/lite/lite_engine_op.cc b/paddle/fluid/operators/lite/lite_engine_op.cc
index 7a879c1e21642..0ec1c55f7abee 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.cc
+++ b/paddle/fluid/operators/lite/lite_engine_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lite/lite_engine_op.h"
+
 #include <string>
 #include <vector>
 
diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h
index 5d2a1683d381b..240f6b06325f4 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -26,11 +26,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-
 #include "paddle/fluid/inference/lite/engine.h"
 #include "paddle/fluid/inference/lite/tensor_utils.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc
index 01583cea31222..c38386365f3dc 100644
--- a/paddle/fluid/operators/lite/lite_engine_op_test.cc
+++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc
@@ -12,6 +12,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License. */
 
+#include "paddle/fluid/operators/lite/lite_engine_op.h"
+
 #include <gtest/gtest.h>
 
 #include "paddle/fluid/framework/block_desc.h"
@@ -19,13 +21,12 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/operators/lite/lite_engine_op.h"
 #include "paddle/fluid/operators/lite/ut_helper.h"
 
 USE_NO_KERNEL_OP(lite_engine)
 
-using paddle::inference::lite::AddTensorToBlockDesc;
 using paddle::inference::lite::AddFetchListToBlockDesc;
+using paddle::inference::lite::AddTensorToBlockDesc;
 using paddle::inference::lite::CreateTensor;
 using paddle::inference::lite::serialize_params;
 namespace paddle {
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index 374bfa73f2187..94797b08ade80 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/load_combine_op.h"
+
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/operators/load_combine_op.h"
-
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index ba19aee9b8d76..196792707ebbd 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <string>
-
 #include "paddle/fluid/operators/load_op.h"
 
+#include <string>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 5616309683365..616aad2b97691 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lod_reset_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h
index 642c8bcd9ae49..f6f7155f37c3a 100644
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index 883e3597d8a31..11edbc84a19d9 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc
index f103a69707a21..99ccad1ca76a5 100644
--- a/paddle/fluid/operators/log_loss_op_npu.cc
+++ b/paddle/fluid/operators/log_loss_op_npu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <cmath>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc
index fee1f56ebdcf2..1ba0a0f3b3d7e 100644
--- a/paddle/fluid/operators/log_loss_op_xpu.cc
+++ b/paddle/fluid/operators/log_loss_op_xpu.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include <memory>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 namespace paddle {
diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc
index da38f906b9bd3..95ebeedaf797e 100644
--- a/paddle/fluid/operators/log_softmax_op.cc
+++ b/paddle/fluid/operators/log_softmax_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 29079b8b1385d..c519e0845f750 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -113,26 +113,22 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
     dim3 grids(8, 1);
 #ifdef PADDLE_WITH_HIP
     if (padding_idx == -1)
-      LookupTable<
-          T, 64, 4, 8,
-          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+      LookupTable<T, 64, 4, 8, false>
+          <<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+              output, table, ids, N, K, D, padding_idx);
     else
-      LookupTable<
-          T, 64, 4, 8,
-          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+      LookupTable<T, 64, 4, 8, true>
+          <<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+              output, table, ids, N, K, D, padding_idx);
 #else
     if (padding_idx == -1)
-      LookupTable<
-          T, 128, 8, 8,
-          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+      LookupTable<T, 128, 8, 8, false>
+          <<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+              output, table, ids, N, K, D, padding_idx);
     else
-      LookupTable<
-          T, 128, 8, 8,
-          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+      LookupTable<T, 128, 8, 8, true>
+          <<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+              output, table, ids, N, K, D, padding_idx);
 #endif  // PADDLE_WITH_HIP
   }
 };
diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc
index 48ae080783d11..65aeca1e49928 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_v2_op.h"
 
 #include <memory>
+
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index c2df6dff5b53c..c47ea64e24c42 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <iostream>
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
diff --git a/paddle/fluid/operators/lookup_table_v2_op_xpu.cc b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc
index 521d3ab571efd..223bf2cc8678b 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_xpu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/lookup_table_v2_op.h"
 #include <memory>
+
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/var_type_inference.h"
+#include "paddle/fluid/operators/lookup_table_v2_op.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #ifdef PADDLE_WITH_XPU
 namespace paddle {
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 88d70d9bb7dae..17c5f08c66c94 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lrn_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_MKLDNN
@@ -174,20 +176,23 @@ class LRNOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasOutput("MidOut"), "Output", "MidOut", "LRN");
 
     auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(x_dim.size(), 4, platform::errors::InvalidArgument(
-                                           "Input(input) rank should be 4, "
-                                           "but received input rank (%d) != 4",
-                                           x_dim.size()));
+    PADDLE_ENFORCE_EQ(
+        x_dim.size(), 4,
+        platform::errors::InvalidArgument("Input(input) rank should be 4, "
+                                          "but received input rank (%d) != 4",
+                                          x_dim.size()));
 
     int n = ctx->Attrs().Get<int>("n");
-    PADDLE_ENFORCE_GT(n, 0UL, platform::errors::InvalidArgument(
-                                  "Argument(n) should be positive, "
-                                  "but received n(%d) not greater than 0",
-                                  n));
-    PADDLE_ENFORCE_EQ(n % 2, 1UL, platform::errors::InvalidArgument(
-                                      "Argument(n) should be odd value, "
-                                      "but received n(%d) is not an odd value",
-                                      n));
+    PADDLE_ENFORCE_GT(n, 0UL,
+                      platform::errors::InvalidArgument(
+                          "Argument(n) should be positive, "
+                          "but received n(%d) not greater than 0",
+                          n));
+    PADDLE_ENFORCE_EQ(n % 2, 1UL,
+                      platform::errors::InvalidArgument(
+                          "Argument(n) should be odd value, "
+                          "but received n(%d) is not an odd value",
+                          n));
 
     ctx->SetOutputDim("Out", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h
index f2d72d0740573..671055caa16f1 100644
--- a/paddle/fluid/operators/lrn_op.h
+++ b/paddle/fluid/operators/lrn_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -68,18 +69,21 @@ class LRNKernel : public framework::OpKernel<T> {
     T beta = ctx.Attr<float>("beta");
     T k = ctx.Attr<float>("k");
 
-    PADDLE_ENFORCE_GE(alpha, 0UL, platform::errors::InvalidArgument(
-                                      "Argument(alpha) should >= 0.0, "
-                                      "but received alpha(%d) less than 0",
-                                      alpha));
-    PADDLE_ENFORCE_GE(beta, 0UL, platform::errors::InvalidArgument(
-                                     "Argument(beta) should >= 0.0, "
-                                     "but received beta(%d) less than 0",
-                                     beta));
-    PADDLE_ENFORCE_GE(k, 0UL, platform::errors::InvalidArgument(
-                                  "Argument(k) should >= 0.0, "
-                                  "but received k(%d) less than 0",
-                                  k));
+    PADDLE_ENFORCE_GE(
+        alpha, 0UL,
+        platform::errors::InvalidArgument("Argument(alpha) should >= 0.0, "
+                                          "but received alpha(%d) less than 0",
+                                          alpha));
+    PADDLE_ENFORCE_GE(
+        beta, 0UL,
+        platform::errors::InvalidArgument("Argument(beta) should >= 0.0, "
+                                          "but received beta(%d) less than 0",
+                                          beta));
+    PADDLE_ENFORCE_GE(
+        k, 0UL,
+        platform::errors::InvalidArgument("Argument(k) should >= 0.0, "
+                                          "but received k(%d) less than 0",
+                                          k));
 
     LRNFunctor<DeviceContext, T> f;
     f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta, data_layout);
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index 2ec9de3e3bbfc..21a0fce289348 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstm_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 4ec3072a96d44..1e1aaf3ea5328 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
@@ -272,9 +273,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
 
     phi::funcs::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
 
-    auto ToBatch = [&batch_gate, &to_batch](
-        const DeviceContext& ctx, const framework::LoDTensor& src,
-        const framework::DDim& dims, framework::LoDTensor& dst) {
+    auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx,
+                                            const framework::LoDTensor& src,
+                                            const framework::DDim& dims,
+                                            framework::LoDTensor& dst) {
       dst.mutable_data<T>(dims, ctx.GetPlace());
       dst.set_lod(batch_gate->lod());
       to_batch(ctx, src, &dst, false);
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
index 917482589fcf3..235a4bd689b23 100644
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstm_unit_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index 562f7755591fd..7ecf294433ead 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstmp_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index 5d24c0b70d347..5e68259852c28 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
@@ -371,9 +372,10 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
 
     phi::funcs::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
 
-    auto ToBatch = [&batch_gate, &to_batch](
-        const DeviceContext& ctx, const framework::LoDTensor& src,
-        const framework::DDim& dims, framework::LoDTensor& dst) {
+    auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx,
+                                            const framework::LoDTensor& src,
+                                            const framework::DDim& dims,
+                                            framework::LoDTensor& dst) {
       dst.mutable_data<T>(dims, ctx.GetPlace());
       dst.set_lod(batch_gate->lod());
       to_batch(ctx, src, &dst, false);
diff --git a/paddle/fluid/operators/lstsq_op.cc b/paddle/fluid/operators/lstsq_op.cc
index f060125620f5a..e093e4d8c01a6 100644
--- a/paddle/fluid/operators/lstsq_op.cc
+++ b/paddle/fluid/operators/lstsq_op.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/lstsq_op.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -139,4 +141,4 @@ REGISTER_OPERATOR(lstsq, ops::LstsqOp, ops::LstsqOpMaker)
 
 REGISTER_OP_CPU_KERNEL(
     lstsq, ops::LstsqCPUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LstsqCPUKernel<paddle::platform::CPUDeviceContext, double>);
\ No newline at end of file
+    ops::LstsqCPUKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu
index 10e2867bf2953..53c78fef7b5d4 100644
--- a/paddle/fluid/operators/lstsq_op.cu
+++ b/paddle/fluid/operators/lstsq_op.cu
@@ -17,6 +17,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/operators/lstsq_op.h"
 #include "paddle/fluid/operators/qr_op.h"
diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h
index 520722dafcbea..7955b3b7df9a3 100644
--- a/paddle/fluid/operators/lstsq_op.h
+++ b/paddle/fluid/operators/lstsq_op.h
@@ -15,8 +15,10 @@
 #pragma once
 
 #include <math.h>
+
 #include <algorithm>
 #include <complex>
+
 #include "paddle/fluid/operators/eig_op.h"
 #include "paddle/fluid/operators/math/eigen_values_vectors.h"
 #include "paddle/fluid/operators/math/matrix_solve.h"
diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc
index fc8673181c467..0894323015e68 100644
--- a/paddle/fluid/operators/lu_op.cc
+++ b/paddle/fluid/operators/lu_op.cc
@@ -45,8 +45,9 @@ class LUOp : public framework::OperatorWithKernel {
     bool pivots = context->Attrs().Get<bool>("pivots");
     auto x_dims = context->GetInputDim("X");
     int x_rank = x_dims.size();
-    PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument(
-                                     "the rank of input must greater than 2"));
+    PADDLE_ENFORCE_GE(x_rank, 2,
+                      platform::errors::InvalidArgument(
+                          "the rank of input must greater than 2"));
     context->SetOutputDim("Out", x_dims);
     int m = x_dims[x_rank - 1];
     int n = x_dims[x_rank - 2];
diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc
index e38a4703f64ee..e3b4263b4ff68 100644
--- a/paddle/fluid/operators/lu_unpack_op.cc
+++ b/paddle/fluid/operators/lu_unpack_op.cc
@@ -53,8 +53,9 @@ class LU_UnpackOp : public framework::OperatorWithKernel {
 
     auto x_dims = context->GetInputDim("X");
     int x_rank = x_dims.size();
-    PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument(
-                                     "the rank of input must greater than 2"));
+    PADDLE_ENFORCE_GE(x_rank, 2,
+                      platform::errors::InvalidArgument(
+                          "the rank of input must greater than 2"));
 
     // context->SetOutputDim("Out", x_dims);
     int m = x_dims[x_rank - 1];
diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu
index a2e34d98461e0..1cef3705973e7 100644
--- a/paddle/fluid/operators/margin_cross_entropy_op.cu
+++ b/paddle/fluid/operators/margin_cross_entropy_op.cu
@@ -20,16 +20,19 @@ namespace cub = hipcub;
 #endif
 
 #include <vector>
+
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/margin_cross_entropy_op.h"
 #include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -63,19 +66,34 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place,
   framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device);
   int* num_classes_per_device_ptr = num_classes_per_device.data<int>();
 
-  const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
-  // use global calculate stream
-  const auto calcu_stream =
-      static_cast<platform::CUDADeviceContext*>(
-          platform::DeviceContextPool::Instance().Get(place))
-          ->stream();
-
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-      num_classes_per_device_ptr, num_classes_per_device_ptr,
-      num_classes_per_device.numel(),
-      platform::ToNCCLDataType(
-          framework::TransToProtoVarType(num_classes_per_device.dtype())),
-      ncclSum, comm->comm(), calcu_stream));
+  auto map = distributed::ProcessGroupMapFromGid::getInstance();
+  if (map->has(rid)) {
+    // Use ProcessGroup
+    distributed::ProcessGroup* pg = map->get(rid);
+    std::vector<phi::DenseTensor> in_tensor;
+    std::vector<phi::DenseTensor> out_tensor;
+    in_tensor.push_back(num_classes_per_device);
+    out_tensor.push_back(num_classes_per_device);
+
+    distributed::AllreduceOptions opts;
+    opts.reduce_op = distributed::ReduceOp::SUM;
+    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    task->Wait();
+  } else {
+    const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    // use global calculate stream
+    const auto calcu_stream =
+        static_cast<platform::CUDADeviceContext*>(
+            platform::DeviceContextPool::Instance().Get(place))
+            ->stream();
+
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+        num_classes_per_device_ptr, num_classes_per_device_ptr,
+        num_classes_per_device.numel(),
+        platform::ToNCCLDataType(
+            framework::TransToProtoVarType(num_classes_per_device.dtype())),
+        ncclSum, comm->comm(), calcu_stream));
+  }
 
   auto class_interval_ptr =
       class_interval->mutable_data<int>({nranks + 1}, place);
@@ -228,14 +246,21 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     platform::NCCLComm* comm;
+    distributed::ProcessGroup* pg = nullptr;
     gpuStream_t stream;
     if (nranks > 1) {
-      comm = platform::NCCLCommContext::Instance().Get(rid, place);
-
-      // use global calculate stream
-      stream = static_cast<platform::CUDADeviceContext*>(
-                   platform::DeviceContextPool::Instance().Get(place))
-                   ->stream();
+      auto map = distributed::ProcessGroupMapFromGid::getInstance();
+      if (map->has(rid)) {
+        // Use ProcessGroup
+        pg = map->get(rid);
+      } else {
+        comm = platform::NCCLCommContext::Instance().Get(rid, place);
+
+        // use global calculate stream
+        stream = static_cast<platform::CUDADeviceContext*>(
+                     platform::DeviceContextPool::Instance().Get(place))
+                     ->stream();
+      }
     }
 #endif
 
@@ -274,16 +299,16 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     // save match_logits, used for gradient computation.
     if (label_type == framework::proto::VarType::INT32) {
       typedef int32_t LabelT;
-      AddMarginToPositiveLogitsKernel<
-          T><<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
-          logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3, rank,
-          nranks, N, D, class_interval.data<int>());
+      AddMarginToPositiveLogitsKernel<T>
+          <<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
+              logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3,
+              rank, nranks, N, D, class_interval.data<int>());
     } else if (label_type == framework::proto::VarType::INT64) {
       typedef int64_t LabelT;
-      AddMarginToPositiveLogitsKernel<
-          T><<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
-          logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3, rank,
-          nranks, N, D, class_interval.data<int>());
+      AddMarginToPositiveLogitsKernel<T>
+          <<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
+              logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3,
+              rank, nranks, N, D, class_interval.data<int>());
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "margin_cross_entropy label type noly support int32 and int64, "
@@ -306,11 +331,23 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nranks > 1) {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-          logits_max_buff, logits_max_buff, logits_max.numel(),
-          platform::ToNCCLDataType(
-              framework::TransToProtoVarType(logits_max.dtype())),
-          ncclMax, comm->comm(), stream));
+      if (pg) {
+        std::vector<phi::DenseTensor> in_tensor;
+        std::vector<phi::DenseTensor> out_tensor;
+        in_tensor.push_back(logits_max);
+        out_tensor.push_back(logits_max);
+
+        distributed::AllreduceOptions opts;
+        opts.reduce_op = distributed::ReduceOp::MAX;
+        auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+        task->Wait();
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            logits_max_buff, logits_max_buff, logits_max.numel(),
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(logits_max.dtype())),
+            ncclMax, comm->comm(), stream));
+      }
     }
 #endif
 
@@ -329,18 +366,30 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nranks > 1) {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-          sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
-          platform::ToNCCLDataType(
-              framework::TransToProtoVarType(sum_exp_logits.dtype())),
-          ncclSum, comm->comm(), stream));
+      if (pg) {
+        std::vector<phi::DenseTensor> in_tensor;
+        std::vector<phi::DenseTensor> out_tensor;
+        in_tensor.push_back(sum_exp_logits);
+        out_tensor.push_back(sum_exp_logits);
+
+        distributed::AllreduceOptions opts;
+        opts.reduce_op = distributed::ReduceOp::SUM;
+        auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+        task->Wait();
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(sum_exp_logits.dtype())),
+            ncclSum, comm->comm(), stream));
+      }
     }
 #endif
 
     // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max)))
-    LogitsMinusLogSumKernel<
-        T><<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
-        logits_ptr, sum_exp_logits_buff, N, D);
+    LogitsMinusLogSumKernel<T>
+        <<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
+            logits_ptr, sum_exp_logits_buff, N, D);
 
     // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit -
     // logit_max))))
@@ -349,25 +398,37 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
         dev_ctx, loss, static_cast<T>(0.0));
     if (label_type == framework::proto::VarType::INT32) {
       typedef int32_t LabelT;
-      HardLabelSoftmaxWithCrossEntropyKernel<
-          T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
-          class_interval.data<int>());
+      HardLabelSoftmaxWithCrossEntropyKernel<T, LabelT>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(
+              loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
+              class_interval.data<int>());
     } else if (label_type == framework::proto::VarType::INT64) {
       typedef int64_t LabelT;
-      HardLabelSoftmaxWithCrossEntropyKernel<
-          T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
-          class_interval.data<int>());
+      HardLabelSoftmaxWithCrossEntropyKernel<T, LabelT>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(
+              loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
+              class_interval.data<int>());
     }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nranks > 1) {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-          loss_ptr, loss_ptr, loss->numel(),
-          platform::ToNCCLDataType(
-              framework::TransToProtoVarType(loss->dtype())),
-          ncclSum, comm->comm(), stream));
+      if (pg) {
+        std::vector<phi::DenseTensor> in_tensor;
+        std::vector<phi::DenseTensor> out_tensor;
+        in_tensor.push_back(*loss);
+        out_tensor.push_back(*loss);
+
+        distributed::AllreduceOptions opts;
+        opts.reduce_op = distributed::ReduceOp::SUM;
+        auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+        task->Wait();
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            loss_ptr, loss_ptr, loss->numel(),
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(loss->dtype())),
+            ncclSum, comm->comm(), stream));
+      }
     }
 #endif
   }
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index b4ff8b6d8dcf5..31055002993ed 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/margin_rank_loss_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu
index cfa5c6dc7a918..fe61aefe0bb3a 100644
--- a/paddle/fluid/operators/marker_op.cu
+++ b/paddle/fluid/operators/marker_op.cu
@@ -48,8 +48,8 @@ class MarkerOpCUDAKernel : public framework::OpKernel<T> {
         "MarkerCUDA", "marker_" + marker_role + "_" + marker_pos,
         platform::TracerEventType::OperatorInner, 1,
         platform::EventRole::kInnerOp);
-    SimpleMarkerKernel<T><<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp,
-                                                          32);
+    SimpleMarkerKernel<T>
+        <<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, 32);
   }
 };
 
diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc
index d32ab65509e5e..2ae4fbdbe103f 100644
--- a/paddle/fluid/operators/match_matrix_tensor_op.cc
+++ b/paddle/fluid/operators/match_matrix_tensor_op.cc
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/match_matrix_tensor_op.h"
+
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <memory>
 #include <vector>
 
-#include "paddle/fluid/operators/match_matrix_tensor_op.h"
 #include "paddle/fluid/operators/search_compute.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h
index d4b9e35bccedc..47281fb0280f0 100644
--- a/paddle/fluid/operators/math.h
+++ b/paddle/fluid/operators/math.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
+#include "math.h"  // NOLINT
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 
-#include "math.h"  // NOLINT
-
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 913ce07ec673c..ac538cfbd5c68 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,14 +1,17 @@
-if (WITH_ASCEND_CL)
-  cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner)
+if(WITH_ASCEND_CL)
+  cc_library(
+    beam_search_npu
+    SRCS beam_search_npu.cc
+    DEPS npu_op_runner)
 endif()
 
 # please add new math_library in alphabetical order
-if (WITH_ASCEND_CL)
-math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
-elseif (WITH_MLU)
-math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
+if(WITH_ASCEND_CL)
+  math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
+elseif(WITH_MLU)
+  math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
 else()
-math_library(concat_and_split DEPS concat_and_split_functor)
+  math_library(concat_and_split DEPS concat_and_split_functor)
 endif()
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
@@ -22,23 +25,30 @@ math_library(sampler DEPS generator)
 math_library(maxouting)
 
 if(WITH_MKLDNN)
-    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler mixed_vector)
+  math_library(
+    selected_rows_functor
+    DEPS
+    selected_rows_utils
+    math_function
+    blas
+    mkldnn_axpy_handler
+    mixed_vector)
 else()
-    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mixed_vector)
+  math_library(selected_rows_functor DEPS selected_rows_utils math_function
+               blas mixed_vector)
 endif()
 
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function jit_kernel_helper)
-if (WITH_ASCEND_CL)
-    math_library(beam_search DEPS math_function beam_search_npu)
+if(WITH_ASCEND_CL)
+  math_library(beam_search DEPS math_function beam_search_npu)
 else()
-    math_library(beam_search DEPS math_function)
+  math_library(beam_search DEPS math_function)
 endif()
 math_library(matrix_bit_code)
 
-
 math_library(unpooling)
 math_library(vol2col)
 math_library(prelu)
@@ -46,28 +56,58 @@ math_library(bert_encoder_functor)
 math_library(tree2col DEPS math_function)
 math_library(matrix_solve)
 
-cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
-cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
-cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
-cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
-cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
-cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
+cc_test(
+  selected_rows_functor_test
+  SRCS selected_rows_functor_test.cc
+  DEPS selected_rows_functor)
+cc_test(
+  im2col_test
+  SRCS im2col_test.cc
+  DEPS im2col)
+cc_test(
+  vol2col_test
+  SRCS vol2col_test.cc
+  DEPS vol2col)
+cc_test(
+  sequence_padding_test
+  SRCS sequence_padding_test.cc
+  DEPS sequence_padding)
+cc_test(
+  sequence_pooling_test
+  SRCS sequence_pooling_test.cc
+  DEPS sequence_pooling)
+cc_test(
+  beam_search_test
+  SRCS beam_search_test.cc
+  DEPS beam_search)
 if(WITH_GPU)
-    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
+  nv_test(
+    selected_rows_functor_gpu_test
+    SRCS selected_rows_functor_test.cu.cc
+    DEPS selected_rows_functor math_function)
 endif()
 if(WITH_ROCM)
-    hip_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
+  hip_test(
+    selected_rows_functor_gpu_test
+    SRCS selected_rows_functor_test.cu.cc
+    DEPS selected_rows_functor math_function)
 endif()
-cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
+cc_test(
+  concat_test
+  SRCS concat_test.cc
+  DEPS concat_and_split)
 
 if(WITH_GPU AND (NOT WITH_ROCM))
-#currenty not yet support ROCM 
-#the generic conversion APIs of dense and sparse are only supported after cuda11.2
-    if((NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2)) 
-        cc_test(cusparse_conversion_api_test SRCS cusparse_conversion_api_test.cc DEPS tensor)
-    endif()
+  #currenty not yet support ROCM
+  #the generic conversion APIs of dense and sparse are only supported after cuda11.2
+  if((NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2))
+    cc_test(
+      cusparse_conversion_api_test
+      SRCS cusparse_conversion_api_test.cc
+      DEPS tensor)
+  endif()
 endif()
 
 if(WITH_TESTING AND TEST im2col_test)
-    set_tests_properties(im2col_test PROPERTIES TIMEOUT 120)
+  set_tests_properties(im2col_test PROPERTIES TIMEOUT 120)
 endif()
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index 486979aa0a8b3..7a21f2f64978d 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -348,11 +348,10 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
     float* selected_scores_data =
         selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
     int* parent_idx_data =
-        parent_idx
-            ? parent_idx->mutable_data<int>(
-                  {static_cast<int64_t>(num_seqs * beam_size)},
-                  context.GetPlace())
-            : nullptr;
+        parent_idx ? parent_idx->mutable_data<int>(
+                         {static_cast<int64_t>(num_seqs * beam_size)},
+                         context.GetPlace())
+                   : nullptr;
 
     framework::LoD selected_lod(2);
     selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
@@ -369,8 +368,8 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
                             static_cast<int>(beam_size));
       switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) {
         CUDA_LAUNCH_KERNEL_HELPER(
-            BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
-                1, kMaxThreadsPerSeq, 0, context.stream()>>>(
+            BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq>
+            <<<1, kMaxThreadsPerSeq, 0, context.stream()>>>(
                 selected_ids_data, selected_scores_data, parent_idx_data,
                 selected_offsets, pre_ids_data, pre_scores_data, ids_data,
                 scores_data, seq_length, static_cast<int>(seq_width),
@@ -387,8 +386,8 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
                             static_cast<int>(beam_size));
       switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) {
         CUDA_LAUNCH_KERNEL_HELPER(
-            BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
-                1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
+            BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs>
+            <<<1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
                 selected_ids_data, selected_scores_data, parent_idx_data,
                 selected_offsets, pre_ids_data, pre_scores_data, ids_data,
                 scores_data, seq_offsets, static_cast<int>(num_seqs),
diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h
index 4474e7ea52aff..c0d39aa2d8fa9 100644
--- a/paddle/fluid/operators/math/beam_search.h
+++ b/paddle/fluid/operators/math/beam_search.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc
index b0547ef9d956c..7cf4c867db7a3 100644
--- a/paddle/fluid/operators/math/beam_search_test.cc
+++ b/paddle/fluid/operators/math/beam_search_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/beam_search.h"
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu
index 0cdad6beeb9f6..4aba6f3c0b9e9 100644
--- a/paddle/fluid/operators/math/bert_encoder_functor.cu
+++ b/paddle/fluid/operators/math/bert_encoder_functor.cu
@@ -815,23 +815,23 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
     const int threads = 256;
     if (hidden % 2 == 0) {
       if (std::is_same<T, float>::value) {
-        SkipLayerNormKernel2<float, float2,
-                             threads><<<block, threads, 0, stream>>>(
-            num, hidden / 2, reinterpret_cast<const float2 *>(input1),
-            reinterpret_cast<const float2 *>(input2),
-            reinterpret_cast<float2 *>(output),
-            reinterpret_cast<const float2 *>(scale),
-            reinterpret_cast<const float2 *>(bias), eps);
+        SkipLayerNormKernel2<float, float2, threads>
+            <<<block, threads, 0, stream>>>(
+                num, hidden / 2, reinterpret_cast<const float2 *>(input1),
+                reinterpret_cast<const float2 *>(input2),
+                reinterpret_cast<float2 *>(output),
+                reinterpret_cast<const float2 *>(scale),
+                reinterpret_cast<const float2 *>(bias), eps);
 // HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
 #ifndef __HIPCC__
       } else if (std::is_same<T, __half>::value) {
-        SkipLayerNormKernel2<__half, __half2,
-                             threads><<<block, threads, 0, stream>>>(
-            num, hidden / 2, reinterpret_cast<const __half2 *>(input1),
-            reinterpret_cast<const __half2 *>(input2),
-            reinterpret_cast<__half2 *>(output),
-            reinterpret_cast<const float2 *>(scale),
-            reinterpret_cast<const float2 *>(bias), eps);
+        SkipLayerNormKernel2<__half, __half2, threads>
+            <<<block, threads, 0, stream>>>(
+                num, hidden / 2, reinterpret_cast<const __half2 *>(input1),
+                reinterpret_cast<const __half2 *>(input2),
+                reinterpret_cast<__half2 *>(output),
+                reinterpret_cast<const float2 *>(scale),
+                reinterpret_cast<const float2 *>(bias), eps);
 #endif
       } else {
         assert(false);
diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h
index 683606ec73383..fd40ac540bfdc 100644
--- a/paddle/fluid/operators/math/bert_encoder_functor.h
+++ b/paddle/fluid/operators/math/bert_encoder_functor.h
@@ -17,10 +17,12 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+
 #include <cub/cub.cuh>  // NOLINT
 #endif
 #ifdef PADDLE_WITH_HIP
 #include <hip/hip_runtime.h>
+
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
diff --git a/paddle/fluid/operators/math/bloomfilter.h b/paddle/fluid/operators/math/bloomfilter.h
index fa3d37ed5f41e..f16fdd135b5a4 100644
--- a/paddle/fluid/operators/math/bloomfilter.h
+++ b/paddle/fluid/operators/math/bloomfilter.h
@@ -16,11 +16,9 @@ limitations under the License. */
 #define BLOOMFILTER_MAGIC_NUM_NEW 17070416
 
 #include <inttypes.h>
-#include <stdlib.h>
-
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-
 #include <unistd.h>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index e51631385eb75..1ea8cafd25e08 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/concat_and_split.h"
-
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h
index b5b0aae23ac87..3b6a12e24023e 100644
--- a/paddle/fluid/operators/math/concat_and_split.h
+++ b/paddle/fluid/operators/math/concat_and_split.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index de358bf623e61..542dcda963aea 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -119,13 +119,13 @@ void ConcatCase1(DeviceContext* context) {
 }
 
 /**
-  * case 2:
-  *    inputs:
-  *        t_a.shape: [2, 3, 4]
-  *        t_b.shape: [2, 4, 4]
-  *    output:
-  *        out.shape: [2, 7, 4]
-  */
+ * case 2:
+ *    inputs:
+ *        t_a.shape: [2, 3, 4]
+ *        t_b.shape: [2, 4, 4]
+ *    output:
+ *        out.shape: [2, 7, 4]
+ */
 template <typename DeviceContext, typename Place>
 void ConcatCase2(DeviceContext* context) {
   paddle::framework::Tensor input_a_cpu;
@@ -222,13 +222,13 @@ void ConcatCase2(DeviceContext* context) {
 }
 
 /**
-  * case 3:
-  *    inputs:
-  *        t_a.shape: [2, 3, 5]
-  *        t_b.shape: [2, 3, 4]
-  *    output:
-  *        out.shape: [2, 3, 9]
-  */
+ * case 3:
+ *    inputs:
+ *        t_a.shape: [2, 3, 5]
+ *        t_b.shape: [2, 3, 4]
+ *    output:
+ *        out.shape: [2, 3, 9]
+ */
 template <typename DeviceContext, typename Place>
 void ConcatCase3(DeviceContext* context) {
   paddle::framework::Tensor input_a_cpu;
@@ -326,14 +326,14 @@ void ConcatCase3(DeviceContext* context) {
 }
 
 /**
-  * case 4:
-  *    inputs:
-  *        axis = 1
-  *        t_a.shape: [2, 3, 4]
-  *        t_b.shape: [2, 3, 4]
-  *    output:
-  *        out.shape: [2, 6, 4]
-  */
+ * case 4:
+ *    inputs:
+ *        axis = 1
+ *        t_a.shape: [2, 3, 4]
+ *        t_b.shape: [2, 3, 4]
+ *    output:
+ *        out.shape: [2, 6, 4]
+ */
 template <typename DeviceContext, typename Place>
 void ConcatCase4(DeviceContext* context) {
   paddle::framework::Tensor input_a_cpu;
diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
index cb2f59182c111..a2b83f998566f 100644
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/cross_entropy.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index da7340e4eb0b3..e562816d6dab6 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <limits>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/phi/core/hostdevice.h"
diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h
index 1ade2190bb96e..22ce162a44ce0 100644
--- a/paddle/fluid/operators/math/eigen_values_vectors.h
+++ b/paddle/fluid/operators/math/eigen_values_vectors.h
@@ -42,9 +42,10 @@ static void CheckEighResult(const int batch, const int info) {
           "tridiagonal form did not converge to zero",
           batch, info));
   PADDLE_ENFORCE_GE(
-      info, 0, platform::errors::PreconditionNotMet(
-                   "For batch [%d]: the [%d] argument had an illegal value",
-                   batch, info));
+      info, 0,
+      platform::errors::PreconditionNotMet(
+          "For batch [%d]: the [%d] argument had an illegal value", batch,
+          info));
 }
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu
index 9b03895cdef25..946a1477c3b6a 100644
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
 #include "paddle/fluid/operators/math/detail/gru_kernel.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
@@ -36,35 +37,35 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
           int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
           threads = dim3(tiled_size, 1);
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruGate<
-              T, tiled_size><<<grid, threads, 0, stream>>>(
-              value.gate_value, value.prev_out_value, value.gate_weight,
-              value.reset_output_value, frame_size, active_gate);
+          detail::KeFastCollectiveGruGate<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(
+                  value.gate_value, value.prev_out_value, value.gate_weight,
+                  value.reset_output_value, frame_size, active_gate);
 
           frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruOut<
-              T, tiled_size><<<grid, threads, 0, stream>>>(
-              value.state_weight, value.prev_out_value, value.output_value,
-              value.gate_value, value.reset_output_value, frame_size,
-              active_node, origin_mode);
+          detail::KeFastCollectiveGruOut<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(
+                  value.state_weight, value.prev_out_value, value.output_value,
+                  value.gate_value, value.reset_output_value, frame_size,
+                  active_node, origin_mode);
         } else {
           constexpr int tiled_size = 16;
           int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
           threads = dim3(tiled_size, 1);
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruGate<
-              T, tiled_size><<<grid, threads, 0, stream>>>(
-              value.gate_value, value.prev_out_value, value.gate_weight,
-              value.reset_output_value, frame_size, active_gate);
+          detail::KeFastCollectiveGruGate<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(
+                  value.gate_value, value.prev_out_value, value.gate_weight,
+                  value.reset_output_value, frame_size, active_gate);
 
           frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruOut<
-              T, tiled_size><<<grid, threads, 0, stream>>>(
-              value.state_weight, value.prev_out_value, value.output_value,
-              value.gate_value, value.reset_output_value, frame_size,
-              active_node, origin_mode);
+          detail::KeFastCollectiveGruOut<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(
+                  value.state_weight, value.prev_out_value, value.output_value,
+                  value.gate_value, value.reset_output_value, frame_size,
+                  active_node, origin_mode);
         }
         return;
       } else {
@@ -86,18 +87,18 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
 
     if (batch_size == 1) {
       detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* is_batch= */ false,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gate_value,
-          value.reset_output_value, value.prev_out_value, frame_size,
-          batch_size, active_gate);
+                                      /* is_batch= */ false, T>
+          <<<grid, threads, 0, stream>>>(
+              detail::forward::gru_resetOutput<T>(), value.gate_value,
+              value.reset_output_value, value.prev_out_value, frame_size,
+              batch_size, active_gate);
     } else {
       detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* is_batch= */ true,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gate_value,
-          value.reset_output_value, value.prev_out_value, frame_size,
-          batch_size, active_gate);
+                                      /* is_batch= */ true, T>
+          <<<grid, threads, 0, stream>>>(
+              detail::forward::gru_resetOutput<T>(), value.gate_value,
+              value.reset_output_value, value.prev_out_value, frame_size,
+              batch_size, active_gate);
     }
 
     if (value.prev_out_value) {
@@ -109,18 +110,18 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
 
     if (batch_size == 1) {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* is_batch= */ false,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gate_value,
-          value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node, origin_mode);
+                                      /* is_batch= */ false, T>
+          <<<grid, threads, 0, stream>>>(detail::forward::gru_finalOutput<T>(),
+                                         value.gate_value, value.prev_out_value,
+                                         value.output_value, frame_size,
+                                         batch_size, active_node, origin_mode);
     } else {
       detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* is_batch= */ true,
-                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gate_value,
-          value.prev_out_value, value.output_value, frame_size, batch_size,
-          active_node, origin_mode);
+                                      /* is_batch= */ true, T>
+          <<<grid, threads, 0, stream>>>(detail::forward::gru_finalOutput<T>(),
+                                         value.gate_value, value.prev_out_value,
+                                         value.output_value, frame_size,
+                                         batch_size, active_node, origin_mode);
     }
   }
 };
@@ -147,19 +148,21 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
     }
 
     if (batch_size == 1) {
-      detail::KeGruBackwardStateGrad<
-          detail::backward::gru_stateGrad<T>,
-          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
+      detail::KeGruBackwardStateGrad<detail::backward::gru_stateGrad<T>,
+                                     /* is_batch= */ false>
+          <<<grid, threads, 0, stream>>>(
+              detail::backward::gru_stateGrad<T>(), value.gate_value,
+              grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+              grad.output_grad, frame_size, batch_size, active_node,
+              origin_mode);
     } else {
-      detail::KeGruBackwardStateGrad<
-          detail::backward::gru_stateGrad<T>,
-          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
+      detail::KeGruBackwardStateGrad<detail::backward::gru_stateGrad<T>,
+                                     /* is_batch= */ true>
+          <<<grid, threads, 0, stream>>>(
+              detail::backward::gru_stateGrad<T>(), value.gate_value,
+              grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+              grad.output_grad, frame_size, batch_size, active_node,
+              origin_mode);
     }
 
     auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
@@ -179,19 +182,19 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
     }
 
     if (batch_size == 1) {
-      detail::KeGruBackwardResetGrad<
-          detail::backward::gru_resetGrad<T>,
-          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.reset_output_grad, frame_size, batch_size, active_gate);
+      detail::KeGruBackwardResetGrad<detail::backward::gru_resetGrad<T>,
+                                     /* is_batch= */ false>
+          <<<grid, threads, 0, stream>>>(
+              detail::backward::gru_resetGrad<T>(), value.gate_value,
+              grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+              grad.reset_output_grad, frame_size, batch_size, active_gate);
     } else {
-      detail::KeGruBackwardResetGrad<
-          detail::backward::gru_resetGrad<T>,
-          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gate_value,
-          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          grad.reset_output_grad, frame_size, batch_size, active_gate);
+      detail::KeGruBackwardResetGrad<detail::backward::gru_resetGrad<T>,
+                                     /* is_batch= */ true>
+          <<<grid, threads, 0, stream>>>(
+              detail::backward::gru_resetGrad<T>(), value.gate_value,
+              grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+              grad.reset_output_grad, frame_size, batch_size, active_gate);
     }
 
     if (grad.prev_out_grad && value.prev_out_value) {
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 8fc6c52122abf..1f5f575c7c350 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -111,16 +111,18 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, DeviceContext,
                        ((dilation[0] * (filter_height - 1) + 1))) /
                               stride[0] +
                           1,
-                      col_height, platform::errors::InvalidArgument(
-                                      "Output_height and padding(padding_up, "
-                                      "padding_down) are inconsistent."));
+                      col_height,
+                      platform::errors::InvalidArgument(
+                          "Output_height and padding(padding_up, "
+                          "padding_down) are inconsistent."));
     PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
                        ((dilation[1] * (filter_width - 1) + 1))) /
                               stride[1] +
                           1,
-                      col_width, platform::errors::InvalidArgument(
-                                     "Output_height and padding(padding_up, "
-                                     "padding_down) are inconsistent."));
+                      col_width,
+                      platform::errors::InvalidArgument(
+                          "Output_height and padding(padding_up, "
+                          "padding_down) are inconsistent."));
 
     int channels_col = im_channels * filter_height * filter_width;
 
@@ -275,9 +277,10 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF, DeviceContext,
 
     PADDLE_ENFORCE_EQ(
         (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height, platform::errors::InvalidArgument(
-                        "Output_height and padding(padding_up, padding_down) "
-                        "are inconsistent."));
+        col_height,
+        platform::errors::InvalidArgument(
+            "Output_height and padding(padding_up, padding_down) "
+            "are inconsistent."));
     PADDLE_ENFORCE_EQ(
         (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
         col_width,
diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
index 09253a495cd5c..0c48547002fc5 100644
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
@@ -220,16 +221,18 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, DeviceContext,
                        (dilation[0] * (filter_height - 1) + 1)) /
                               stride[0] +
                           1,
-                      col_height, platform::errors::InvalidArgument(
-                                      "Output_height and padding(padding_up, "
-                                      "padding_down) are inconsistent."));
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width, platform::errors::InvalidArgument(
-                                     "col_width and padding(padding_left, "
-                                     "padding_right) are inconsistent."));
+                      col_height,
+                      platform::errors::InvalidArgument(
+                          "Output_height and padding(padding_up, "
+                          "padding_down) are inconsistent."));
+    PADDLE_ENFORCE_EQ(
+        (im_width + padding[1] + padding[3] -
+         (dilation[1] * (filter_width - 1) + 1)) /
+                stride[1] +
+            1,
+        col_width,
+        platform::errors::InvalidArgument("col_width and padding(padding_left, "
+                                          "padding_right) are inconsistent."));
 
     size_t num_kernels = im_channels * im_height * im_width;
 
@@ -430,16 +433,18 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF, DeviceContext,
                        (dilation[0] * (filter_height - 1) + 1)) /
                               stride[0] +
                           1,
-                      col_height, platform::errors::InvalidArgument(
-                                      "Output_height and padding(padding_up, "
-                                      "padding_down) are inconsistent."));
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width, platform::errors::InvalidArgument(
-                                     "col_width and padding(padding_left, "
-                                     "padding_right) are inconsistent."));
+                      col_height,
+                      platform::errors::InvalidArgument(
+                          "Output_height and padding(padding_up, "
+                          "padding_down) are inconsistent."));
+    PADDLE_ENFORCE_EQ(
+        (im_width + padding[1] + padding[3] -
+         (dilation[1] * (filter_width - 1) + 1)) /
+                stride[1] +
+            1,
+        col_width,
+        platform::errors::InvalidArgument("col_width and padding(padding_left, "
+                                          "padding_right) are inconsistent."));
 
     int block_dim_x = 0;
     int block_dim_y = 0;
diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h
index 3865443170481..2a81637d7a815 100644
--- a/paddle/fluid/operators/math/im2col.h
+++ b/paddle/fluid/operators/math/im2col.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
index 01f1e220e65d9..f3755653f28d4 100644
--- a/paddle/fluid/operators/math/im2col_cfo_cpu.h
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index 0e4032986cf0c..ff766cfad2cb1 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
+
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h
index b77e23450360c..bd170b674042d 100644
--- a/paddle/fluid/operators/math/inclusive_scan.h
+++ b/paddle/fluid/operators/math/inclusive_scan.h
@@ -24,6 +24,7 @@ namespace cub = hipcub;
 
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/reverse_iterator.h>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -196,15 +197,15 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
   grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
   dim3 thread_dims(kThreadNumX, kThreadNumY);
   if (reverse) {
-    InclusiveScanInnerDimCUDAKernel<
-        T, BinaryOp, kThreadNumX, kThreadNumY,
-        /*kReverse=*/true><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
-        x, y, outer_dim, inner_dim, init, op);
+    InclusiveScanInnerDimCUDAKernel<T, BinaryOp, kThreadNumX, kThreadNumY,
+                                    /*kReverse=*/true>
+        <<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(x, y, outer_dim,
+                                                         inner_dim, init, op);
   } else {
-    InclusiveScanInnerDimCUDAKernel<
-        T, BinaryOp, kThreadNumX, kThreadNumY,
-        /*kReverse=*/false><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
-        x, y, outer_dim, inner_dim, init, op);
+    InclusiveScanInnerDimCUDAKernel<T, BinaryOp, kThreadNumX, kThreadNumY,
+                                    /*kReverse=*/false>
+        <<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(x, y, outer_dim,
+                                                         inner_dim, init, op);
   }
 }
 
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 083d6967ff03a..a3c1d23e89b37 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <memory>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/platform/bfloat16.h"
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 359552a0717a0..1d6afa50cc930 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc
index 7b239b8166644..f2b083b833701 100644
--- a/paddle/fluid/operators/math/matrix_solve.cc
+++ b/paddle/fluid/operators/math/matrix_solve.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/matrix_solve.h"
+
 #include "Eigen/Core"
 #include "Eigen/LU"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc
index 737196dde1dfc..59c8c07e6e186 100644
--- a/paddle/fluid/operators/math/matrix_solve.cu.cc
+++ b/paddle/fluid/operators/math/matrix_solve.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/matrix_solve.h"
+
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/solve_op.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h
index 415d0c6dd8e0c..cecc3517934c7 100644
--- a/paddle/fluid/operators/math/matrix_solve.h
+++ b/paddle/fluid/operators/math/matrix_solve.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "Eigen/Core"
 #include "Eigen/LU"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu
index f86eb103449f6..1ae0c709e4da9 100644
--- a/paddle/fluid/operators/math/sample_prob.cu
+++ b/paddle/fluid/operators/math/sample_prob.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <thrust/random.h>
 #include <thrust/sort.h>
+
 #include <iostream>
 #include <vector>
 
diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc
index 5f1cd25941614..d645e1994f101 100644
--- a/paddle/fluid/operators/math/sampler.cc
+++ b/paddle/fluid/operators/math/sampler.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 
 #include <glog/logging.h>
+
 #include "paddle/fluid/framework/generator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index e4b033b6c5857..7689c31838d33 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index db5c66d319701..edcb21cb56a25 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -163,10 +163,10 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     dim3 threads(block_size, 1);
     dim3 grid(in1_rows.size(), 1);
     paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
-        in1_row_numel);
+    SelectedRowsAddTensorKernel<T, block_size>
+        <<<grid, threads, 0, context.stream()>>>(
+            in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
+            in1_row_numel);
 
     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -223,10 +223,10 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
     dim3 threads(block_size, 1);
     dim3 grid(in1_rows.size(), 1);
     paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
-        in1_row_numel);
+    SelectedRowsAddTensorKernel<T, block_size>
+        <<<grid, threads, 0, context.stream()>>>(
+            in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data,
+            in1_row_numel);
 
     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -343,10 +343,10 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     dim3 threads(block_size, 1);
     dim3 grid(in1_rows.size(), 1);
     paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
-        in1_row_numel);
+    SelectedRowsAddToTensorKernel<T, block_size>
+        <<<grid, threads, 0, context.stream()>>>(
+            in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
+            in1_row_numel);
   }
 };
 
@@ -380,10 +380,10 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
     dim3 threads(block_size, 1);
     dim3 grid(in1_rows.size(), 1);
     paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<
-        T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
-        in1_row_numel);
+    SelectedRowsAddToTensorKernel<T, block_size>
+        <<<grid, threads, 0, context.stream()>>>(
+            in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data,
+            in1_row_numel);
   }
 };
 
@@ -695,9 +695,9 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
 
     dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
     dim3 grid(in1_rows.size(), 1);
-    UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
-        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
-                                              op, in2_data, in1_row_numel);
+    UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS>
+        <<<grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
+                                                 op, in2_data, in1_row_numel);
   }
 };
 }  // namespace scatter
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
index e0e28f93f367e..e6358cda274f6 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
@@ -457,8 +457,9 @@ TEST(selected_rows_functor, cpu_sum_to) {
   paddle::operators::math::SelectedRowsSumTo<paddle::platform::CPUDeviceContext,
                                              float>
       sum_to_functor;
-  sum_to_functor(ctx, std::vector<phi::SelectedRows*>(
-                          {selected_rows1.get(), selected_rows2.get()}),
+  sum_to_functor(ctx,
+                 std::vector<phi::SelectedRows*>(
+                     {selected_rows1.get(), selected_rows2.get()}),
                  std::vector<int64_t>({0, in1_value->numel()}), output.get());
   auto out_height = output->height();
   EXPECT_EQ(out_height, height);
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
index 0912a964792a8..6e1d0bb367050 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+
 #include "gtest/gtest.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
index 35ba8c1d118a8..97e276fff02d7 100644
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_padding.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 
 namespace phi {
diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
index 9aca6ad0f5a2f..ef7981858a96d 100644
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/operators/math/sequence_padding.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h
index 956a4ff6a2d45..687c64fc23e5d 100644
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -64,13 +65,14 @@ inline static void CheckDims(const framework::DDim& seq_tensor_dims,
   PADDLE_ENFORCE_EQ(
       seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
           seq_tensor_dims.size() == pad_tensor_dims.size(),
-      true, platform::errors::InvalidArgument(
-                "pad_tensor's rank should be 1 greater than seq_tensor's "
-                "rank, or be equal with it. The pad_tensor's rank is %ld, "
-                "expected the seq_tensor's rank is %ld or %ld, but got %ld. "
-                "Please check the input value.",
-                pad_tensor_dims.size(), pad_tensor_dims.size(),
-                pad_tensor_dims.size() - 1, seq_tensor_dims.size()));
+      true,
+      platform::errors::InvalidArgument(
+          "pad_tensor's rank should be 1 greater than seq_tensor's "
+          "rank, or be equal with it. The pad_tensor's rank is %ld, "
+          "expected the seq_tensor's rank is %ld or %ld, but got %ld. "
+          "Please check the input value.",
+          pad_tensor_dims.size(), pad_tensor_dims.size(),
+          pad_tensor_dims.size() - 1, seq_tensor_dims.size()));
 }
 
 /*
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index 8312d7cd9b72b..9abe9e598881a 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/math/sequence_pooling.h"
+
 #include <string>
 
 #include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/sequence_pooling.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index fa7b043153851..217b29e1b6b18 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <string>
+
 #include "paddle/fluid/operators/math/sequence_pooling.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/macros.h"
@@ -170,41 +171,41 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
     paddle::framework::MixVector<size_t> mix_vector(&lod);
     if (pooltype == "MAX") {
-      sequence_pool_kernel<
-          T, MaxPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          MaxPoolFunctor<T>(), input.data<T>(), pad_value,
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), index->data<int>());
+      sequence_pool_kernel<T, MaxPoolFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              MaxPoolFunctor<T>(), input.data<T>(), pad_value,
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              output->mutable_data<T>(context.GetPlace()), index->data<int>());
     } else if (pooltype == "AVERAGE") {
-      sequence_pool_kernel<
-          T, AvgPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          AvgPoolFunctor<T>(), input.data<T>(), pad_value,
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_kernel<T, AvgPoolFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              AvgPoolFunctor<T>(), input.data<T>(), pad_value,
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              output->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "SUM") {
-      sequence_pool_kernel<
-          T, SumPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SumPoolFunctor<T>(), input.data<T>(), pad_value,
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_kernel<T, SumPoolFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              SumPoolFunctor<T>(), input.data<T>(), pad_value,
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              output->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "SQRT") {
-      sequence_pool_kernel<
-          T, SqrtPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SqrtPoolFunctor<T>(), input.data<T>(), pad_value,
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_kernel<T, SqrtPoolFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              SqrtPoolFunctor<T>(), input.data<T>(), pad_value,
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              output->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "LAST") {
-      sequence_pool_kernel<
-          T, LastPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          LastPoolFunctor<T>(), input.data<T>(), pad_value,
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_kernel<T, LastPoolFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              LastPoolFunctor<T>(), input.data<T>(), pad_value,
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              output->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "FIRST") {
-      sequence_pool_kernel<
-          T, FirstPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          FirstPoolFunctor<T>(), input.data<T>(), pad_value,
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          output->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_kernel<T, FirstPoolFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              FirstPoolFunctor<T>(), input.data<T>(), pad_value,
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              output->mutable_data<T>(context.GetPlace()), nullptr);
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "unsupported pooling pooltype: %s. Only support \"MAX\", "
@@ -338,41 +339,41 @@ class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(std::max(static_cast<int>(lod.size()) - 1, 1), 1);
     paddle::framework::MixVector<size_t> mix_vector(&lod);
     if (pooltype == "MAX") {
-      sequence_pool_grad_kernel<
-          T, MaxPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          MaxPoolGradFunctor<T>(), out_grad.data<T>(),
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), index->data<int>());
+      sequence_pool_grad_kernel<T, MaxPoolGradFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              MaxPoolGradFunctor<T>(), out_grad.data<T>(),
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              in_grad->mutable_data<T>(context.GetPlace()), index->data<int>());
     } else if (pooltype == "AVERAGE") {
-      sequence_pool_grad_kernel<
-          T, AvgPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          AvgPoolGradFunctor<T>(), out_grad.data<T>(),
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_grad_kernel<T, AvgPoolGradFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              AvgPoolGradFunctor<T>(), out_grad.data<T>(),
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              in_grad->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "SUM") {
-      sequence_pool_grad_kernel<
-          T, SumPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SumPoolGradFunctor<T>(), out_grad.data<T>(),
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_grad_kernel<T, SumPoolGradFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              SumPoolGradFunctor<T>(), out_grad.data<T>(),
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              in_grad->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "SQRT") {
-      sequence_pool_grad_kernel<
-          T, SqrtPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          SqrtPoolGradFunctor<T>(), out_grad.data<T>(),
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_grad_kernel<T, SqrtPoolGradFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              SqrtPoolGradFunctor<T>(), out_grad.data<T>(),
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              in_grad->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "LAST") {
-      sequence_pool_grad_kernel<
-          T, LastPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          LastPoolGradFunctor<T>(), out_grad.data<T>(),
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_grad_kernel<T, LastPoolGradFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              LastPoolGradFunctor<T>(), out_grad.data<T>(),
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              in_grad->mutable_data<T>(context.GetPlace()), nullptr);
     } else if (pooltype == "FIRST") {
-      sequence_pool_grad_kernel<
-          T, FirstPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
-          FirstPoolGradFunctor<T>(), out_grad.data<T>(),
-          mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
-          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+      sequence_pool_grad_kernel<T, FirstPoolGradFunctor<T>>
+          <<<grid, threads, 0, context.stream()>>>(
+              FirstPoolGradFunctor<T>(), out_grad.data<T>(),
+              mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
+              in_grad->mutable_data<T>(context.GetPlace()), nullptr);
 
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h
index 847d0bca951a7..f5b6701b46ef4 100644
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc
index 815d221e2556a..6d9c75f955041 100644
--- a/paddle/fluid/operators/math/sequence_pooling_test.cc
+++ b/paddle/fluid/operators/math/sequence_pooling_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_pooling.h"
+
 #include <gtest/gtest.h>
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc
index bc8832a1bbc56..8f954e068c048 100644
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_scale.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 
 namespace phi {
diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu
index 253a67c2c8cbe..c0b97497cc7bf 100644
--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
@@ -53,10 +53,10 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
         seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
         seq_width);
 #else
-    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
-        num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
-        seq_width);
+    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
+        <<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
+            seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
+            seq_width);
 #endif
     mix_vector.CopyToCPU();
   }
@@ -82,10 +82,10 @@ class ScaleLoDTensorFunctor<phi::GPUContext, T> {
         seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
         seq_width);
 #else
-    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
-        num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
-        seq_width);
+    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS>
+        <<<num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
+            seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales,
+            seq_width);
 #endif
     mix_vector.CopyToCPU();
   }
diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc
index c855cb763a97b..adea86a6c5a87 100644
--- a/paddle/fluid/operators/math/softmax.cc
+++ b/paddle/fluid/operators/math/softmax.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/softmax.h"
+
 #include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 69642c8194221..33da631d27b14 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/jit/kernels.h"
@@ -66,34 +67,32 @@ class SoftmaxEigen {
     if (num_remain == 1) {
       // axis == -1, axis and class in same dimension, calculate along
       // class dimension directly for higher performance
-      softmax.device(*context.eigen_device()) = (logits -
-                                                 logits.maximum(along_axis)
-                                                     .eval()
-                                                     .reshape(batch_by_one)
-                                                     .broadcast(one_by_class))
-                                                    .unaryExpr(ValueClip<T>());
+      softmax.device(*context.eigen_device()) =
+          (logits - logits.maximum(along_axis)
+                        .eval()
+                        .reshape(batch_by_one)
+                        .broadcast(one_by_class))
+              .unaryExpr(ValueClip<T>());
     } else {
       // axis != -1, class dimension split into (axis, remain), max and sum
       // should be calculated along axis dimension
       softmax.device(*context.eigen_device()) =
-          (logits.reshape(batch_axis_remain) -
-           logits.reshape(batch_axis_remain)
-               .maximum(along_axis)
-               .eval()
-               .reshape(batch_one_remain)
-               .broadcast(one_axis_one)
-               .reshape(batch_classes))
+          (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain)
+                                                   .maximum(along_axis)
+                                                   .eval()
+                                                   .reshape(batch_one_remain)
+                                                   .broadcast(one_axis_one)
+                                                   .reshape(batch_classes))
               .unaryExpr(ValueClip<T>());
     }
 
     softmax.device(*context.eigen_device()) = softmax.exp();
     softmax.device(*context.eigen_device()) =
-        (softmax *
-         softmax.reshape(batch_axis_remain)
-             .sum(along_axis)
-             .inverse()
-             .eval()
-             .broadcast(one_axis));
+        (softmax * softmax.reshape(batch_axis_remain)
+                       .sum(along_axis)
+                       .inverse()
+                       .eval()
+                       .broadcast(one_axis));
   }
 };
 
@@ -128,31 +127,28 @@ class SoftmaxEigen<DeviceContext, platform::float16, is_test> {
       // axis == -1, axis and class in same dimension, calculate along
       // class dimension directly for higher performance
       softmax.device(*context.eigen_device()) =
-          (logits -
-           logits.maximum(along_axis)
-               .reshape(batch_by_one)
-               .broadcast(one_by_class))
+          (logits - logits.maximum(along_axis)
+                        .reshape(batch_by_one)
+                        .broadcast(one_by_class))
               .unaryExpr(ValueClip<platform::float16>());
     } else {
       // axis != -1, class dimension split into (axis, remain), max and sum
       // should be calculated along axis dimension
       softmax.device(*context.eigen_device()) =
-          (logits.reshape(batch_axis_remain) -
-           logits.reshape(batch_axis_remain)
-               .maximum(along_axis)
-               .reshape(batch_one_remain)
-               .broadcast(one_axis_one)
-               .reshape(batch_classes))
+          (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain)
+                                                   .maximum(along_axis)
+                                                   .reshape(batch_one_remain)
+                                                   .broadcast(one_axis_one)
+                                                   .reshape(batch_classes))
               .unaryExpr(ValueClip<platform::float16>());
     }
 
     softmax.device(*context.eigen_device()) = softmax.exp();
     softmax.device(*context.eigen_device()) =
-        (softmax *
-         softmax.reshape(batch_axis_remain)
-             .sum(along_axis)
-             .inverse()
-             .broadcast(one_axis));
+        (softmax * softmax.reshape(batch_axis_remain)
+                       .sum(along_axis)
+                       .inverse()
+                       .broadcast(one_axis));
   }
 };
 
@@ -187,31 +183,28 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16, is_test> {
       // axis == -1, axis and class in same dimension, calculate along
       // class dimension directly for higher performance
       softmax.device(*context.eigen_device()) =
-          (logits -
-           logits.maximum(along_axis)
-               .reshape(batch_by_one)
-               .broadcast(one_by_class))
+          (logits - logits.maximum(along_axis)
+                        .reshape(batch_by_one)
+                        .broadcast(one_by_class))
               .unaryExpr(ValueClip<platform::bfloat16>());
     } else {
       // axis != -1, class dimension split into (axis, remain), max and sum
       // should be calculated along axis dimension
       softmax.device(*context.eigen_device()) =
-          (logits.reshape(batch_axis_remain) -
-           logits.reshape(batch_axis_remain)
-               .maximum(along_axis)
-               .reshape(batch_one_remain)
-               .broadcast(one_axis_one)
-               .reshape(batch_classes))
+          (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain)
+                                                   .maximum(along_axis)
+                                                   .reshape(batch_one_remain)
+                                                   .broadcast(one_axis_one)
+                                                   .reshape(batch_classes))
               .unaryExpr(ValueClip<platform::bfloat16>());
     }
 
     softmax.device(*context.eigen_device()) = softmax.exp();
     softmax.device(*context.eigen_device()) =
-        (softmax *
-         softmax.reshape(batch_axis_remain)
-             .sum(along_axis)
-             .inverse()
-             .broadcast(one_axis));
+        (softmax * softmax.reshape(batch_axis_remain)
+                       .sum(along_axis)
+                       .inverse()
+                       .broadcast(one_axis));
   }
 };
 
diff --git a/paddle/fluid/operators/math/sparse_impl.cu.h b/paddle/fluid/operators/math/sparse_impl.cu.h
index dd2d256dd73b2..03f94ed573604 100644
--- a/paddle/fluid/operators/math/sparse_impl.cu.h
+++ b/paddle/fluid/operators/math/sparse_impl.cu.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/dynload/cusparse.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc
index cd1fa13001ce2..8ad0a17c27ea9 100644
--- a/paddle/fluid/operators/math/tree2col.cc
+++ b/paddle/fluid/operators/math/tree2col.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/math/tree2col.h"
+
 #include <deque>
 #include <stack>
 
diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu
index bdaab212ab170..c8bba20a423e5 100644
--- a/paddle/fluid/operators/math/tree2col.cu
+++ b/paddle/fluid/operators/math/tree2col.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <stack>
+
 #include "paddle/fluid/operators/math/tree2col.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h
index 88104b858ba01..df4b233a763d7 100644
--- a/paddle/fluid/operators/math/tree2col.h
+++ b/paddle/fluid/operators/math/tree2col.h
@@ -17,6 +17,7 @@
 #include <array>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu
index fb61a36a8e1a7..d8581d731e82b 100644
--- a/paddle/fluid/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h
index 3122828b2eeba..cddcb0af467dc 100644
--- a/paddle/fluid/operators/math/vol2col.h
+++ b/paddle/fluid/operators/math/vol2col.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc
index 210cf10d8879d..4889817cd9eac 100644
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/vol2col.h"
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 9d381e1f22b5f..2c16774e324a7 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include <algorithm>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
@@ -258,13 +259,14 @@ class MatMulGradKernel : public framework::OpKernel<T> {
       MatMul(context, a, trans_a, b, trans_b, out);
     } else {
       auto &ctx = context.template device_context<DeviceContext>();
-      MatMul(context, is_fold_init_dims_a
-                          ? FoldInitDims(a)
-                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
-             trans_a, is_fold_init_dims_b
-                          ? FoldInitDims(b)
-                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
-             trans_b, out);
+      MatMul(
+          context,
+          is_fold_init_dims_a ? FoldInitDims(a)
+                              : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
+          trans_a,
+          is_fold_init_dims_b ? FoldInitDims(b)
+                              : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
+          trans_b, out);
     }
   }
 
@@ -425,13 +427,14 @@ class MatMulDoubleGradKernel : public framework::OpKernel<T> {
       MatMul(context, a, trans_a, b, trans_b, flag, out);
     } else {
       auto &ctx = context.template device_context<DeviceContext>();
-      MatMul(context, is_fold_init_dims_a
-                          ? FoldInitDims(a)
-                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
-             trans_a, is_fold_init_dims_b
-                          ? FoldInitDims(b)
-                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
-             trans_b, flag, out);
+      MatMul(
+          context,
+          is_fold_init_dims_a ? FoldInitDims(a)
+                              : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
+          trans_a,
+          is_fold_init_dims_b ? FoldInitDims(b)
+                              : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
+          trans_b, flag, out);
     }
   }
 
@@ -602,12 +605,13 @@ class MatMulOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
               mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0,
-          true, platform::errors::InvalidArgument(
-                    "The batch size of the two matrices should be equal, or "
-                    "at least one is zero.\n"
-                    "But received X's shape: %s, Y's shape: %s.",
-                    DumpMatrixShape(mat_dim_x).c_str(),
-                    DumpMatrixShape(mat_dim_y).c_str()));
+          true,
+          platform::errors::InvalidArgument(
+              "The batch size of the two matrices should be equal, or "
+              "at least one is zero.\n"
+              "But received X's shape: %s, Y's shape: %s.",
+              DumpMatrixShape(mat_dim_x).c_str(),
+              DumpMatrixShape(mat_dim_y).c_str()));
     }
     int64_t dim_out_y = mat_dim_y.width_;
 #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
@@ -996,13 +1000,12 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MatMulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
 #endif
 
-REGISTER_OP_VERSION(matmul)
-    .AddCheckpoint(
-        R"ROC(Register matmul for adding the attribute of
+REGISTER_OP_VERSION(matmul).AddCheckpoint(
+    R"ROC(Register matmul for adding the attribute of
        fused_reshape_Y)ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "fused_reshape_Y",
-            "In order to support the function of fused the input Y "
-            " and input X into the input X when "
-            "using the operator of matmul, and get raw shape of input Y.",
-            std::vector<int>{}));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "fused_reshape_Y",
+        "In order to support the function of fused the input Y "
+        " and input X into the input X when "
+        "using the operator of matmul, and get raw shape of input Y.",
+        std::vector<int>{}));
diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc
index 80d4492e04981..3477715d6d3de 100644
--- a/paddle/fluid/operators/matmul_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_op_xpu.cc
@@ -315,14 +315,15 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
       MatMul(context, a, trans_a, b, trans_b, out);
     } else {
       auto &dev_ctx = context.template device_context<DeviceContext>();
-      MatMul(
-          context, is_fold_init_dims_a
-                       ? FoldInitDims(a)
-                       : XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, a),
-          trans_a, is_fold_init_dims_b
-                       ? FoldInitDims(b)
-                       : XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, b),
-          trans_b, out);
+      MatMul(context,
+             is_fold_init_dims_a
+                 ? FoldInitDims(a)
+                 : XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, a),
+             trans_a,
+             is_fold_init_dims_b
+                 ? FoldInitDims(b)
+                 : XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, b),
+             trans_b, out);
     }
   }
 
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
index 162ebdafec1cb..168a3dbfeaac1 100644
--- a/paddle/fluid/operators/matmul_v2_op.cc
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/matmul_v2_op.h"
+
 #include <string>
 #include <vector>
 
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
index 34a8e97af2e1c..b47cdf6e8cb0d 100644
--- a/paddle/fluid/operators/matmul_v2_op.h
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <functional>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/dot_op.h"
diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc
index 87df75ac46504..f85e714ce9555 100644
--- a/paddle/fluid/operators/matmul_v2_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc
@@ -14,10 +14,10 @@
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/matmul_v2_op.h"
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/operators/matmul_v2_op.h"
 #include "paddle/fluid/operators/xpu_api_wrapper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc
index 56f65340ea999..ffbb8538d947a 100644
--- a/paddle/fluid/operators/matrix_power_op.cc
+++ b/paddle/fluid/operators/matrix_power_op.cc
@@ -14,6 +14,7 @@
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc
index e7d08b6597360..fddfaa3526a07 100644
--- a/paddle/fluid/operators/matrix_rank_op.cc
+++ b/paddle/fluid/operators/matrix_rank_op.cc
@@ -14,6 +14,7 @@
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/phi/kernels/funcs/compare_functors.h"
diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h
index 9fa00e60e0550..1cf9f4433bc2c 100644
--- a/paddle/fluid/operators/mean_iou_op.h
+++ b/paddle/fluid/operators/mean_iou_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc
index ef96fe2f03ba4..811b138c8d10d 100644
--- a/paddle/fluid/operators/mean_op_xpu.cc
+++ b/paddle/fluid/operators/mean_op_xpu.cc
@@ -56,8 +56,9 @@ class MeanGradXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(OG->numel(), 1, platform::errors::InvalidArgument(
-                                          "Mean Gradient should be scalar"));
+    PADDLE_ENFORCE_EQ(
+        OG->numel(), 1,
+        platform::errors::InvalidArgument("Mean Gradient should be scalar"));
     auto IG = context.Output<Tensor>(framework::GradVarName("X"));
     IG->mutable_data<T>(context.GetPlace());
     auto& dev_ctx = context.template device_context<DeviceContext>();
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index e2b86bd0e3b92..0d4c2f7b3b4b0 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/phi/core/lod_utils.h"
 
 namespace phi {
diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc
index ea223ad1b3231..cfb8aa1f8a76e 100644
--- a/paddle/fluid/operators/merge_selected_rows_op.cc
+++ b/paddle/fluid/operators/merge_selected_rows_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/merge_selected_rows_op.h"
+
 #include <unordered_map>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h
index 4c87a4a641194..d0f18b22b2797 100644
--- a/paddle/fluid/operators/merge_selected_rows_op.h
+++ b/paddle/fluid/operators/merge_selected_rows_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 
diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc
index 5a6862f380da1..cc57a25a1fb34 100644
--- a/paddle/fluid/operators/meshgrid_op.cc
+++ b/paddle/fluid/operators/meshgrid_op.cc
@@ -16,10 +16,9 @@
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-
-#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 
diff --git a/paddle/fluid/operators/metrics/CMakeLists.txt b/paddle/fluid/operators/metrics/CMakeLists.txt
index 101939dde2c01..b968dbf288ee2 100644
--- a/paddle/fluid/operators/metrics/CMakeLists.txt
+++ b/paddle/fluid/operators/metrics/CMakeLists.txt
@@ -1,6 +1,6 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/metrics.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/metrics.
+  include(unity_build_rule.cmake)
 endif()
 register_operators()
diff --git a/paddle/fluid/operators/metrics/unity_build_rule.cmake b/paddle/fluid/operators/metrics/unity_build_rule.cmake
index fcb690a7b6a85..58acbc3b1e62f 100644
--- a/paddle/fluid/operators/metrics/unity_build_rule.cmake
+++ b/paddle/fluid/operators/metrics/unity_build_rule.cmake
@@ -4,10 +4,5 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    accuracy_op.cc
-    auc_op.cc
-    precision_recall_op.cc)
-register_unity_group(cu
-    accuracy_op.cu
-    auc_op.cu)
+register_unity_group(cc accuracy_op.cc auc_op.cc precision_recall_op.cc)
+register_unity_group(cu accuracy_op.cu auc_op.cu)
diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h
index c307218baa406..045f917de7016 100644
--- a/paddle/fluid/operators/miopen_lstm_cache.h
+++ b/paddle/fluid/operators/miopen_lstm_cache.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h
index 38cea39abd5de..438163cd77eaa 100644
--- a/paddle/fluid/operators/miopen_rnn_cache.h
+++ b/paddle/fluid/operators/miopen_rnn_cache.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt
index ce95ec560c25e..f40286ad5d8a2 100644
--- a/paddle/fluid/operators/mkldnn/CMakeLists.txt
+++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt
@@ -1 +1,4 @@
-cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce)
+cc_library(
+  mkldnn_axpy_handler
+  SRCS axpy_handler.cc
+  DEPS place device_context enforce)
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 393247644c2e8..db74b24b405ed 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -30,11 +30,11 @@ class MKLDNNDeviceContext;
 namespace paddle {
 namespace operators {
 
-using framework::DataLayout;
-using framework::Tensor;
 using dnnl::memory;
 using dnnl::primitive;
 using dnnl::stream;
+using framework::DataLayout;
+using framework::Tensor;
 using platform::GetMKLDNNFormat;
 using platform::MKLDNNDeviceContext;
 using platform::to_void_cast;
diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc
index ee630fe186a24..80f74195d8e3c 100644
--- a/paddle/fluid/operators/mkldnn/axpy_handler.cc
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+
 #include <cinttypes>
 #include <memory>
 #include <string>
 #include <vector>
 
 #include "dnnl.hpp"
-#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -82,7 +83,7 @@ static void naive_axpy(int n, T alpha, const T *x, T *y) {
   }
 }
 
-}  // anonnymouse namespace
+}  // namespace
 
 template <typename T>
 class OneDNNAXPYHandler<T>::Impl {
diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake
index f48a5d822f8dc..49f08622265d0 100644
--- a/paddle/fluid/operators/mkldnn/caching_tests.cmake
+++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake
@@ -1,6 +1,20 @@
-set(TEST_MKLDNN_CACHING_DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce)
-if (WITH_GPU OR WITH_ROCM)
+set(TEST_MKLDNN_CACHING_DEPS
+    op_registry
+    elementwise_mul_op
+    elementwise_add_op
+    activation_op
+    softmax_op
+    conv_op
+    im2col
+    vol2col
+    softmax
+    scope
+    device_context
+    enforce)
+if(WITH_GPU OR WITH_ROCM)
   set(TEST_MKLDNN_CACHING_DEPS ${TEST_MKLDNN_CACHING_DEPS} depthwise_conv)
 endif()
-cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS ${TEST_MKLDNN_CACHING_DEPS})
-
+cc_test(
+  test_mkldnn_caching
+  SRCS mkldnn/test_mkldnn_caching.cc
+  DEPS ${TEST_MKLDNN_CACHING_DEPS})
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 5095fa067193a..0881baa6f8eea 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/operators/concat_op.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -21,13 +22,13 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::DataLayout;
-using framework::Tensor;
-using framework::LoDTensor;
+using dnnl::concat;
 using dnnl::memory;
 using dnnl::primitive;
-using dnnl::concat;
 using dnnl::stream;
+using framework::DataLayout;
+using framework::LoDTensor;
+using framework::Tensor;
 using platform::to_void_cast;
 
 template <typename T>
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index fba17d303f282..65092e059f4af 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -203,8 +203,9 @@ class ConvMKLDNNHandlerT
       dnnl::memory::desc src_md, weights_md;
       if (platform::is_int8<T>()) {
         src_md = platform::MKLDNNMemDesc(
-            src_tz, framework::ToMKLDNNDataType(
-                        framework::TransToProtoVarType(input->dtype())),
+            src_tz,
+            framework::ToMKLDNNDataType(
+                framework::TransToProtoVarType(input->dtype())),
             chosen_memory_format);
         weights_md = platform::MKLDNNMemDesc(
             weights_tz, dnnl::memory::data_type::s8, chosen_memory_format);
@@ -459,13 +460,12 @@ class ConvMKLDNNHandlerT
     auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
     bool is_multi_channel = scale_weights_data.size() > 1;
     bool has_activation = !ctx.Attr<std::string>("fuse_activation").empty();
-    float activation_scale =
-        force_fp32_output ? 1.0f : has_activation ? ctx.Attr<float>("Scale_out")
-                                                  : 1.0f;
-    auto scale_out_data =
-        force_fp32_output ? 1.0f : has_activation
-                                       ? 1.0f
-                                       : ctx.Attr<float>("Scale_out");
+    float activation_scale = force_fp32_output ? 1.0f
+                             : has_activation  ? ctx.Attr<float>("Scale_out")
+                                               : 1.0f;
+    auto scale_out_data = force_fp32_output ? 1.0f
+                          : has_activation  ? 1.0f
+                                            : ctx.Attr<float>("Scale_out");
     float sum_scale =
         fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
     int count =
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 747e4603d7fe7..e507b2429b7d9 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -28,8 +28,8 @@ using dnnl::primitive;
 using dnnl::reorder;
 using platform::to_void_cast;
 using Tensor = framework::Tensor;
-using framework::DataLayout;
 using dnnl::stream;
+using framework::DataLayout;
 using platform::GetMKLDNNFormat;
 
 template <typename T>
diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
index 91dccbee0aef2..035add5fd834d 100644
--- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
@@ -18,11 +18,11 @@ limitations under the License. */
 
 namespace {
 
-using paddle::framework::Tensor;
-using phi::vectorize;
-using paddle::framework::GradVarName;
 using paddle::framework::ExecutionContext;
+using paddle::framework::GradVarName;
+using paddle::framework::Tensor;
 using paddle::platform::MKLDNNDeviceContext;
+using phi::vectorize;
 
 template <typename T>
 class ExpandMKLDNNKernel : public paddle::framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 4078d012fce90..5cbcad5d965a4 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -31,19 +31,19 @@ class MKLDNNDeviceContext;
 namespace paddle {
 namespace operators {
 
+using dnnl::inner_product_forward;
+using dnnl::memory;
+using dnnl::primitive;
+using dnnl::prop_kind;
+using dnnl::stream;
 using framework::DataLayout;
-using framework::Tensor;
-using framework::LoDTensor;
 using framework::DDim;
 using framework::ExecutionContext;
+using framework::LoDTensor;
+using framework::Tensor;
+using platform::GetMKLDNNFormat;
 using platform::MKLDNNDeviceContext;
 using platform::to_void_cast;
-using platform::GetMKLDNNFormat;
-using dnnl::memory;
-using dnnl::inner_product_forward;
-using dnnl::primitive;
-using dnnl::stream;
-using dnnl::prop_kind;
 
 template <typename T_in, typename T_w, typename T_out>
 class FCPrimitiveFactory {
diff --git a/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake b/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake
index c03ce74df7d64..18893e22ec85b 100644
--- a/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake
+++ b/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake
@@ -1,2 +1,12 @@
-cc_test(test_mkldnn_op_inplace SRCS mkldnn/test_mkldnn_op_inplace.cc DEPS op_registry elementwise_add_op activation_op softmax_op softmax scope device_context enforce executor)
-
+cc_test(
+  test_mkldnn_op_inplace
+  SRCS mkldnn/test_mkldnn_op_inplace.cc
+  DEPS op_registry
+       elementwise_add_op
+       activation_op
+       softmax_op
+       softmax
+       scope
+       device_context
+       enforce
+       executor)
diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
index 37d6c07290312..a53a30b737dc4 100644
--- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
@@ -19,12 +19,12 @@
 namespace paddle {
 namespace operators {
 
-using framework::DataLayout;
 using dnnl::memory;
 using dnnl::primitive;
 using dnnl::reorder;
-using dnnl::stream;
 using dnnl::resampling_forward;
+using dnnl::stream;
+using framework::DataLayout;
 using platform::GetMKLDNNFormat;
 using platform::to_void_cast;
 
@@ -114,9 +114,10 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {
 
     PADDLE_ENFORCE_GT(std::all_of(out_dims.begin(), out_dims.end(),
                                   [](int i) { return i > 0; }),
-                      0, platform::errors::InvalidArgument(
-                             "out_d, out_h, out_w of Op(interpolate) "
-                             "should be greater than 0."));
+                      0,
+                      platform::errors::InvalidArgument(
+                          "out_d, out_h, out_w of Op(interpolate) "
+                          "should be greater than 0."));
 
     const std::vector<int64_t> nc_dims = {in_dims[0], in_dims[1]};
     out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end());
diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
index e9abe84e67980..8921db6cbcef9 100644
--- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
@@ -13,19 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h"
+
 #include <tuple>
+
 #include "paddle/fluid/framework/convert_utils.h"
 
 using dnnl::memory;
 using dnnl::primitive;
 using paddle::framework::DataLayout;
 using paddle::framework::ExecutionContext;
-using phi::vectorize;
 using paddle::platform::GetMKLDNNFormat;
-using paddle::platform::MKLDNNFormatForSize;
 using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNFormatForSize;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::to_void_cast;
+using phi::vectorize;
 using Tensor = paddle::framework::Tensor;
 
 namespace {
diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h
index 583dcd04018b2..07cb2173a7ec5 100644
--- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h
+++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h
@@ -22,8 +22,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using platform::MKLDNNDeviceContext;
 using framework::ExecutionContext;
+using platform::MKLDNNDeviceContext;
 using Tensor = framework::Tensor;
 
 template <typename T>
diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
index 6e7ba59cf1ad8..424faf30d3a9f 100644
--- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc
@@ -20,8 +20,8 @@ using dnnl::memory;
 using dnnl::primitive;
 using paddle::framework::DataLayout;
 using paddle::framework::ExecutionContext;
-using paddle::platform::MatMulV2MKLDNNHandler;
 using paddle::platform::GetMKLDNNFormat;
+using paddle::platform::MatMulV2MKLDNNHandler;
 using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::to_void_cast;
@@ -206,11 +206,12 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel<T> {
         PADDLE_ENFORCE_EQ(
             x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] == 1 ||
                 y_bd_dims[i] == 1,
-            true, paddle::platform::errors::InvalidArgument(
-                      "Tensor dimensions are incorrect for broadcasting."
-                      "Dimensions in X and Y must be same or equal to 1, but "
-                      "received x_dim[%d]=%d and y_dims[%d]= %d",
-                      i, x_bd_dims[i], i, y_bd_dims[i]));
+            true,
+            paddle::platform::errors::InvalidArgument(
+                "Tensor dimensions are incorrect for broadcasting."
+                "Dimensions in X and Y must be same or equal to 1, but "
+                "received x_dim[%d]=%d and y_dims[%d]= %d",
+                i, x_bd_dims[i], i, y_bd_dims[i]));
         out_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]);
       }
       out->Resize(phi::make_ddim(out_dims));
diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
index 8bad3e86b2934..4c94bc3f3ad57 100644
--- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
+++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake
@@ -1 +1,14 @@
-cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op crop_op activation_op pooling transpose_op scope device_context enforce executor)
+cc_test(
+  test_mkldnn_op_nhwc
+  SRCS mkldnn/test_mkldnn_op_nhwc.cc
+  DEPS op_registry
+       pool_op
+       shape_op
+       crop_op
+       activation_op
+       pooling
+       transpose_op
+       scope
+       device_context
+       enforce
+       executor)
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index 77763531c8296..dbf3adcdad07d 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -20,14 +20,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::DataLayout;
-using framework::Tensor;
 using dnnl::memory;
 using dnnl::pooling_backward;
 using dnnl::pooling_forward;
 using dnnl::primitive;
 using dnnl::reorder;
 using dnnl::stream;
+using framework::DataLayout;
+using framework::Tensor;
 using platform::to_void_cast;
 
 template <typename T>
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 8cbe46bee481a..8f3a3e8ba65e7 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -27,8 +27,8 @@ using dnnl::primitive;
 using dnnl::reorder;
 using platform::to_void_cast;
 using Tensor = framework::Tensor;
-using framework::DataLayout;
 using dnnl::stream;
+using framework::DataLayout;
 using platform::GetMKLDNNFormat;
 
 template <typename T>
diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
index 9a7ac6d505522..778a33f27af0a 100644
--- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
@@ -46,10 +46,12 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
     bool with_shift = shift_in != 0.0f || shift_out != 0.0f;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE_NE(scale_in, 0.0f, platform::errors::InvalidArgument(
-                                          "Scale of input cannot be 0.0"));
-    PADDLE_ENFORCE_NE(scale_out, 0.0f, platform::errors::InvalidArgument(
-                                           "Scale of output cannot be 0.0"));
+    PADDLE_ENFORCE_NE(
+        scale_in, 0.0f,
+        platform::errors::InvalidArgument("Scale of input cannot be 0.0"));
+    PADDLE_ENFORCE_NE(
+        scale_out, 0.0f,
+        platform::errors::InvalidArgument("Scale of output cannot be 0.0"));
     if (shift_in != 0.0f) {
       PADDLE_ENFORCE_EQ(
           framework::TransToProtoVarType(input->dtype()),
diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
index a21034d48baaa..f1c5153240ee2 100644
--- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
@@ -31,8 +31,8 @@ namespace paddle {
 namespace operators {
 
 using paddle::framework::LoDTensor;
-using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
+using platform::to_void_cast;
 
 static std::vector<int> extract_shape(
     const std::vector<const Tensor*>& list_new_shape_tensor) {
diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
index 28a00be5fa47e..798fe51901df0 100644
--- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
@@ -17,13 +17,13 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::DataLayout;
-using framework::Tensor;
-using framework::LoDTensor;
+using dnnl::concat;
 using dnnl::memory;
 using dnnl::primitive;
-using dnnl::concat;
 using dnnl::stream;
+using framework::DataLayout;
+using framework::LoDTensor;
+using framework::Tensor;
 using platform::to_void_cast;
 
 template <typename T>
diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
index de21c2687bd44..b564602fdaada 100644
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
@@ -116,8 +116,9 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const auto& mkldnn_engine = dev_ctx.GetEngine();
     auto in_vars = ctx.MultiInputVar("X");
 
-    PADDLE_ENFORCE_NE(in_vars.empty(), true, platform::errors::InvalidArgument(
-                                                 "Input variable is empty."));
+    PADDLE_ENFORCE_NE(
+        in_vars.empty(), true,
+        platform::errors::InvalidArgument("Input variable is empty."));
     auto& input0 = in_vars[0]->Get<LoDTensor>();
     LoDTensor* output = ctx.Output<LoDTensor>("Out");
 
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
index b5fb0c54c7812..1e04cc8a8a525 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
@@ -16,6 +16,7 @@
 #include <map>
 #include <random>
 #include <string>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -121,8 +122,9 @@ void RunOperator(const platform::Place &place, const std::string &op_type,
   auto op =
       num_inputs[op_type] > 1
           ? framework::OpRegistry::CreateOp(
-                op_type, {{first_input_var_name, {first_input}},
-                          {second_input_var_name, {"x1"}}},
+                op_type,
+                {{first_input_var_name, {first_input}},
+                 {second_input_var_name, {"x1"}}},
                 {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}})
           : framework::OpRegistry::CreateOp(
                 op_type, {{first_input_var_name, {first_input}}},
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
index 4090d5ffca801..a1acf3706c590 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
@@ -16,6 +16,7 @@
 #include <cstdlib>
 #include <memory>
 #include <random>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
index b9866ba8c3647..f4b79a0216332 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
@@ -16,6 +16,7 @@
 #include <cstdlib>
 #include <memory>
 #include <random>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index ee9922773147c..13f9dba9eeb8f 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -66,7 +66,7 @@ class TransposeMKLDNNHandler {
  protected:
   dnnl::memory::desc Axis2MemoryDesc(std::vector<int64_t>& nchw_tz,  // NOLINT
                                      std::vector<int>& axis          // NOLINT
-                                     ) {
+  ) {
     size_t ndims = axis.size();
 
     std::vector<int64_t> strides(ndims);
diff --git a/paddle/fluid/operators/mlu/CMakeLists.txt b/paddle/fluid/operators/mlu/CMakeLists.txt
index efd6aeb8eeb1c..c383edecaac91 100644
--- a/paddle/fluid/operators/mlu/CMakeLists.txt
+++ b/paddle/fluid/operators/mlu/CMakeLists.txt
@@ -1,5 +1,10 @@
-
-IF(WITH_MLU)
-    cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib device_context)
-    cc_test(activation_op_mlu_test SRCS activation_op_mlu_test.cc DEPS op_registry activation_op scope device_context executor)
-ENDIF()
+if(WITH_MLU)
+  cc_library(
+    mlu_baseop
+    SRCS mlu_baseop.cc
+    DEPS neuware_lib device_context)
+  cc_test(
+    activation_op_mlu_test
+    SRCS activation_op_mlu_test.cc
+    DEPS op_registry activation_op scope device_context executor)
+endif()
diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc
index 9d3b8e2407fbf..1ff27454013e1 100644
--- a/paddle/fluid/operators/mlu/mlu_baseop.cc
+++ b/paddle/fluid/operators/mlu/mlu_baseop.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc
index 9c16ccb138f7d..d946f177545b4 100644
--- a/paddle/fluid/operators/mode_op.cc
+++ b/paddle/fluid/operators/mode_op.cc
@@ -13,10 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-
-#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index 9a53c7162ff6d..4216ee097be52 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/modified_huber_loss_op.h"
+
 #include <memory>
 
 namespace paddle {
@@ -29,10 +30,11 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
 
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument(
-                                            "Input(input) rank should be 2, "
-                                            "but received input rank(%d) != 2",
-                                            x_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument("Input(input) rank should be 2, "
+                                          "but received input rank(%d) != 2",
+                                          x_dims.size()));
 
     if (ctx->IsRuntime() ||
         (phi::product(x_dims) > 0 && phi::product(y_dims) > 0)) {
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu
index 8f1894b5af0a1..ad34a54a9bf29 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <thrust/for_each.h>
 #include <thrust/host_vector.h>
 #include <thrust/tuple.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/modified_huber_loss_op.h"
 #include "paddle/phi/core/hostdevice.h"
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index ef04d5582d3c0..b31935cefc235 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc
index 7410b3b607c82..9f52dc8559d42 100644
--- a/paddle/fluid/operators/mul_op_xpu.cc
+++ b/paddle/fluid/operators/mul_op_xpu.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/xpu_api_wrapper.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index 4e6ad35e612b7..72243b408f4be 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 
diff --git a/paddle/fluid/operators/nanmedian_op.cc b/paddle/fluid/operators/nanmedian_op.cc
index 23a497bdb1d3d..63bfea650ac00 100644
--- a/paddle/fluid/operators/nanmedian_op.cc
+++ b/paddle/fluid/operators/nanmedian_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
index b3d53f0d39020..218d53aa6303a 100644
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
@@ -1,24 +1,38 @@
-if (NOT (WITH_NCCL OR WITH_RCCL))
+if(NOT (WITH_NCCL OR WITH_RCCL))
   return()
 endif()
 
 if(WITH_GPU AND NOT WIN32)
-  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
+  nv_library(
+    nccl_common
+    SRCS nccl_gpu_common.cc
+    DEPS device_context operator)
 endif()
 
 if(WITH_ROCM AND NOT WIN32)
-  hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
+  hip_library(
+    nccl_common
+    SRCS nccl_gpu_common.cc
+    DEPS device_context operator)
 endif()
 
 if(WITH_GPU OR WITH_ROCM)
-    op_library(nccl_op DEPS nccl_common)
-    set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
+  op_library(nccl_op DEPS nccl_common)
+  set(OPERATOR_DEPS
+      ${OPERATOR_DEPS} nccl_common
+      PARENT_SCOPE)
 endif()
 
 if(WITH_GPU AND NOT WIN32)
-    nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+  nv_test(
+    nccl_op_test
+    SRCS nccl_op_test.cu.cc
+    DEPS nccl_op gpu_info device_context)
 endif()
 
 if(WITH_ROCM AND NOT WIN32)
-    hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+  hip_test(
+    nccl_op_test
+    SRCS nccl_op_test.cu.cc
+    DEPS nccl_op gpu_info device_context)
 endif()
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
index bcbc96ea1b6d1..8a0112fa11d80 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -25,7 +25,7 @@ size_t last_num_gpus = -1;
 // TODO(panyx0718): Need to decide whether Paddle supports parallel
 // runs with different number GPUs. If true, current solution is not enough.
 std::mutex comm_mu;
-}
+}  // namespace
 
 int Communicator::GetCommId(int device_id) const {
   std::lock_guard<std::mutex> guard(comm_mu);
diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc
index 65c3447ff23ee..b99800ecd64be 100644
--- a/paddle/fluid/operators/nccl/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc
@@ -19,9 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using framework::LoDTensor;
 using framework::Tensor;
 using platform::Communicator;
-using framework::LoDTensor;
 
 template <typename Type>
 class NCCLTypeWrapper;
diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
index 80144c6f25894..21649bfcd378f 100644
--- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <memory>
 #include <mutex>   // NOLINT
 #include <thread>  // NOLINT
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index c8af241559429..38c9b809eb6e4 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -15,11 +15,13 @@ limitations under the License. */
 #pragma once
 
 #include <math.h>
+
 #include <iterator>
 #include <random>
 #include <set>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc
index 8f14bc10d5094..d3cbec495fdb5 100644
--- a/paddle/fluid/operators/nll_loss_op.cc
+++ b/paddle/fluid/operators/nll_loss_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/backward.h"
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index 51daccce0e882..0a1f647627a9a 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/unary.h"
diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h
index 0ed1f2719de25..18ae152a689e8 100644
--- a/paddle/fluid/operators/norm_utils.cu.h
+++ b/paddle/fluid/operators/norm_utils.cu.h
@@ -450,27 +450,27 @@ void NormDoubleGradFunctor(const DeviceContext &ctx,
     set_constant(ctx, dX, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDXWithGlobal<
-            T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
-            dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
-            dx_data);
+        DoubleGradComputeDXWithGlobal<T, DataLayout::kNHWC>
+            <<<grid1, block, 0, ctx.stream()>>>(dy_data, ddscale_data,
+                                                variance_data, epsilon, C,
+                                                sample_size, num, dx_data);
       } else {
-        DoubleGradComputeDXWithGlobal<
-            T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
-            dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
-            dx_data);
+        DoubleGradComputeDXWithGlobal<T, DataLayout::kNCHW>
+            <<<grid1, block, 0, ctx.stream()>>>(dy_data, ddscale_data,
+                                                variance_data, epsilon, C,
+                                                sample_size, num, dx_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDX<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
-            x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
-            ddscale_data, N, C, sample_size, epsilon, dx_data);
+        DoubleGradComputeDX<T, block, DataLayout::kNHWC>
+            <<<grid, block, 0, ctx.stream()>>>(
+                x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
+                ddscale_data, N, C, sample_size, epsilon, dx_data);
       } else {
-        DoubleGradComputeDX<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
-            x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
-            ddscale_data, N, C, sample_size, epsilon, dx_data);
+        DoubleGradComputeDX<T, block, DataLayout::kNCHW>
+            <<<grid, block, 0, ctx.stream()>>>(
+                x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
+                ddscale_data, N, C, sample_size, epsilon, dx_data);
       }
     }
   }
@@ -479,27 +479,27 @@ void NormDoubleGradFunctor(const DeviceContext &ctx,
     set_constant(ctx, dScale, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDScaleWithGlobal<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
-            ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
-            dscale_data);
+        DoubleGradComputeDScaleWithGlobal<T, block, DataLayout::kNHWC>
+            <<<grid, block, 0, ctx.stream()>>>(ddx_data, variance_data, dy_data,
+                                               epsilon, N, C, sample_size,
+                                               dscale_data);
       } else {
-        DoubleGradComputeDScaleWithGlobal<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
-            ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
-            dscale_data);
+        DoubleGradComputeDScaleWithGlobal<T, block, DataLayout::kNCHW>
+            <<<grid, block, 0, ctx.stream()>>>(ddx_data, variance_data, dy_data,
+                                               epsilon, N, C, sample_size,
+                                               dscale_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDScale<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
-            x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
-            sample_size, epsilon, dscale_data);
+        DoubleGradComputeDScale<T, block, DataLayout::kNHWC>
+            <<<grid, block, 0, ctx.stream()>>>(
+                x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
+                sample_size, epsilon, dscale_data);
       } else {
-        DoubleGradComputeDScale<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
-            x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
-            sample_size, epsilon, dscale_data);
+        DoubleGradComputeDScale<T, block, DataLayout::kNCHW>
+            <<<grid, block, 0, ctx.stream()>>>(
+                x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
+                sample_size, epsilon, dscale_data);
       }
     }
   }
@@ -508,27 +508,29 @@ void NormDoubleGradFunctor(const DeviceContext &ctx,
     set_constant(ctx, ddY, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDDYWithGlobal<
-            T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
-            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
-            ddscale_data, epsilon, C, sample_size, num, ddy_data);
+        DoubleGradComputeDDYWithGlobal<T, DataLayout::kNHWC>
+            <<<grid1, block, 0, ctx.stream()>>>(
+                ddx_data, scale_data, mean_data, variance_data, x_data,
+                ddbias_data, ddscale_data, epsilon, C, sample_size, num,
+                ddy_data);
       } else {
-        DoubleGradComputeDDYWithGlobal<
-            T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
-            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
-            ddscale_data, epsilon, C, sample_size, num, ddy_data);
+        DoubleGradComputeDDYWithGlobal<T, DataLayout::kNCHW>
+            <<<grid1, block, 0, ctx.stream()>>>(
+                ddx_data, scale_data, mean_data, variance_data, x_data,
+                ddbias_data, ddscale_data, epsilon, C, sample_size, num,
+                ddy_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDDY<
-            T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
-            x_data, mean_data, variance_data, ddscale_data, ddbias_data,
-            ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
+        DoubleGradComputeDDY<T, block, DataLayout::kNHWC>
+            <<<grid, block, 0, ctx.stream()>>>(
+                x_data, mean_data, variance_data, ddscale_data, ddbias_data,
+                ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
       } else {
-        DoubleGradComputeDDY<
-            T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
-            x_data, mean_data, variance_data, ddscale_data, ddbias_data,
-            ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
+        DoubleGradComputeDDY<T, block, DataLayout::kNCHW>
+            <<<grid, block, 0, ctx.stream()>>>(
+                x_data, mean_data, variance_data, ddscale_data, ddbias_data,
+                ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
       }
     }
   }
diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h
index fee06fe5dd4fa..363702459221d 100644
--- a/paddle/fluid/operators/norm_utils.h
+++ b/paddle/fluid/operators/norm_utils.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <memory>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu
index 923d89c24853f..2fc180fe678e9 100644
--- a/paddle/fluid/operators/number_count_op.cu
+++ b/paddle/fluid/operators/number_count_op.cu
@@ -97,13 +97,13 @@ class NumberCountOpCUDAKernel : public framework::OpKernel<T> {
     auto out_data = number_count->mutable_data<T>(out_dims, place);
     const T* gate_data = numbers->data<T>();
 
-    initialize_zero_kernel<
-        T><<<GET_BLOCKS(upper_range), CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
-        out_data, upper_range);
+    initialize_zero_kernel<T>
+        <<<GET_BLOCKS(upper_range), CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
+            out_data, upper_range);
 
-    NumberCount<
-        T><<<CEIL(upper_range, PERTHREAD_EXPERTS), 256, 0, dev_ctx.stream()>>>(
-        gate_data, out_data, batch_size, upper_range);
+    NumberCount<T>
+        <<<CEIL(upper_range, PERTHREAD_EXPERTS), 256, 0, dev_ctx.stream()>>>(
+            gate_data, out_data, batch_size, upper_range);
   }
 };
 
diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
index 64323e588c628..e6b6320898fb1 100644
--- a/paddle/fluid/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/one_hot_op.h"
+
 #include <string>
 #include <vector>
 
diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc
index 24b506ebf8a06..4e11cbb38883b 100644
--- a/paddle/fluid/operators/one_hot_op_npu.cc
+++ b/paddle/fluid/operators/one_hot_op_npu.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/one_hot_op.h"
-
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc
index 122b6a8a80aac..cb7b9963bbdf3 100644
--- a/paddle/fluid/operators/one_hot_v2_op.cc
+++ b/paddle/fluid/operators/one_hot_v2_op.cc
@@ -14,6 +14,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc
index e5702a37bb2b4..dcf098f105c21 100644
--- a/paddle/fluid/operators/one_hot_v2_op_npu.cc
+++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/optimizers/CMakeLists.txt b/paddle/fluid/operators/optimizers/CMakeLists.txt
index 6989447fc04fd..7a27dda735c4a 100644
--- a/paddle/fluid/operators/optimizers/CMakeLists.txt
+++ b/paddle/fluid/operators/optimizers/CMakeLists.txt
@@ -1,6 +1,6 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/optimizers.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/optimizers.
+  include(unity_build_rule.cmake)
 endif()
 register_operators()
diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc
index 91bad1430615f..64f22cced3baf 100644
--- a/paddle/fluid/operators/optimizers/adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/adagrad_op.cc
@@ -15,13 +15,12 @@ limitations under the License. */
 #include <cmath>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc
index 1ea91f6ebfa3e..e13805f694bd6 100644
--- a/paddle/fluid/operators/optimizers/adam_op_npu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc
@@ -183,16 +183,25 @@ class AdamNPUKernel : public framework::OpKernel<T> {
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
-    const auto& runner =
-        NpuOpRunner("ApplyAdamD",
-                    {
-                        *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr,
-                        *beta1_tensor, *beta2_tensor, *epsilon_tensor, *grad,
-                    },
-                    {
-                        *param_out, *mom1_out, *mom2_out,
-                    },
-                    {});
+    const auto& runner = NpuOpRunner("ApplyAdamD",
+                                     {
+                                         *param,
+                                         *mom1,
+                                         *mom2,
+                                         *beta1_pow,
+                                         *beta2_pow,
+                                         *lr,
+                                         *beta1_tensor,
+                                         *beta2_tensor,
+                                         *epsilon_tensor,
+                                         *grad,
+                                     },
+                                     {
+                                         *param_out,
+                                         *mom1_out,
+                                         *mom2_out,
+                                     },
+                                     {});
     runner.Run(stream);
 
     // NOTE(zhiqiu): ApplyAdamD updates params inplace, so
diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
index 6ea0b2054cdea..37467c7ba9614 100644
--- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
@@ -306,8 +306,9 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
       }
       xpu_wait(dev_ctx.x_context()->xpu_stream);
     } else {
-      PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
-                                  "Variable type not supported by adam_op"));
+      PADDLE_ENFORCE_EQ(1, 2,
+                        platform::errors::InvalidArgument(
+                            "Variable type not supported by adam_op"));
     }
   }
 };
diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc
index e2670625d4e50..43e9dc0cae8ef 100644
--- a/paddle/fluid/operators/optimizers/adamw_op.cc
+++ b/paddle/fluid/operators/optimizers/adamw_op.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/optimizers/adam_op.h"
-
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/operators/optimizers/adam_op.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 
diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc
index d86d2bd2ffb4a..57a6b744fd6db 100644
--- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc
@@ -205,8 +205,9 @@ class AdamwOpXPUKernel : public framework::OpKernel<T> {
         }
       }
     } else {
-      PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
-                                  "Variable type not supported by adamw_op"));
+      PADDLE_ENFORCE_EQ(1, 2,
+                        platform::errors::InvalidArgument(
+                            "Variable type not supported by adamw_op"));
     }
   }
 };
diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h
index a3fbb0e59e24e..eb031ae0c933a 100644
--- a/paddle/fluid/operators/optimizers/cast_with_ptr.h
+++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h
@@ -43,9 +43,9 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x,
   in_arr[0] = reinterpret_cast<const _ptr_ char *>(x);
   phi::Array<_ptr_ OutT *, 1> out_arr;
   out_arr[0] = y;
-  phi::funcs::VectorizedElementwiseKernel<
-      OutT, FunctorT, 1, 1, VecSize><<<block, thread, 0, stream>>>(
-      in_arr, out_arr, n, main_offset, FunctorT());
+  phi::funcs::VectorizedElementwiseKernel<OutT, FunctorT, 1, 1, VecSize>
+      <<<block, thread, 0, stream>>>(in_arr, out_arr, n, main_offset,
+                                     FunctorT());
 }
 
 }  // namespace details
diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
index 7f0b2b7d064ed..40ac044e6475e 100644
--- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <string>
-
 #include "paddle/fluid/operators/optimizers/dgc_momentum_op.h"
 
+#include <string>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
index 3688b8067c231..7cbc52f4235de 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
@@ -83,10 +83,12 @@ static void GetParamGradShardInfo(const std::vector<ParamGradInfo> &infos,
   VLOG(10) << "start_size = " << start_size << " , end_size = " << end_size;
 
   if (infos.empty()) {
-    PADDLE_ENFORCE_EQ(start_size, 0, platform::errors::InvalidArgument(
-                                         "start_size should be 0."));
-    PADDLE_ENFORCE_EQ(end_size, 0, platform::errors::InvalidArgument(
-                                       "end_size should be 0."));
+    PADDLE_ENFORCE_EQ(
+        start_size, 0,
+        platform::errors::InvalidArgument("start_size should be 0."));
+    PADDLE_ENFORCE_EQ(
+        end_size, 0,
+        platform::errors::InvalidArgument("end_size should be 0."));
     *start_idx = 0;
     *end_idx = 0;
     *start_numel_offset = 0;
@@ -104,15 +106,17 @@ static void GetParamGradShardInfo(const std::vector<ParamGradInfo> &infos,
       infos.begin());
   if (i == n || infos[i].numel_offset != start_size) {
     PADDLE_ENFORCE_GT(
-        i, 0, platform::errors::InvalidArgument(
-                  "Cannot find suitable sharding which is between [%d, %d)",
-                  start_size, end_size));
+        i, 0,
+        platform::errors::InvalidArgument(
+            "Cannot find suitable sharding which is between [%d, %d)",
+            start_size, end_size));
     --i;
   }
   PADDLE_ENFORCE_LT(
-      i, n, platform::errors::InvalidArgument(
-                "Cannot find suitable sharding which is between [%d, %d)",
-                start_size, end_size));
+      i, n,
+      platform::errors::InvalidArgument(
+          "Cannot find suitable sharding which is between [%d, %d)", start_size,
+          end_size));
   *start_idx = i;
   *start_numel_offset = start_size - infos[i].numel_offset;
   auto j = static_cast<size_t>(
@@ -450,8 +454,9 @@ class DistributedFusedLambInitOpKernel<platform::CUDADeviceContext, T>
                       platform::errors::InvalidArgument(
                           "The attr(alignment) should be the power of 2."));
     PADDLE_ENFORCE_GE(
-        rank, 0, platform::errors::InvalidArgument(
-                     "The attr(rank) should be equal to or larger than 0."));
+        rank, 0,
+        platform::errors::InvalidArgument(
+            "The attr(rank) should be equal to or larger than 0."));
     PADDLE_ENFORCE_LT(
         rank, nranks,
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index c857c6de4d093..eb354ef6d7576 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <cmath>
+
 #include "paddle/fluid/memory/buffer.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/cast_with_ptr.h"
@@ -32,6 +33,7 @@
 
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+
 #include "math.h"  // NOLINT
 namespace cub = hipcub;
 #endif
@@ -190,9 +192,8 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
   PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL);
 #undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL
 
-  MultiTensorL2NormReduceAgainCUDAKernel<
-      MT, OutT, kBlockDim><<<n, kBlockDim, 0, stream>>>(tmp_out_ptr, y,
-                                                        max_chunk_num);
+  MultiTensorL2NormReduceAgainCUDAKernel<MT, OutT, kBlockDim>
+      <<<n, kBlockDim, 0, stream>>>(tmp_out_ptr, y, max_chunk_num);
 }
 
 template <int LogLevel>
@@ -508,14 +509,14 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv(
                                       "Output(Step) cannot be nullptr."));
   }
 
-#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL                         \
-  do {                                                                    \
-    UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize><<<     \
-        config.block_per_grid, config.thread_per_block, 0, stream>>>(     \
-        param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p,    \
-        beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step, \
-        weight_decay, weight_decay_end_numel, beta1, beta2, epsilon,      \
-        max_global_grad_norm, numel, rescale_grad);                       \
+#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL                             \
+  do {                                                                        \
+    UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize>            \
+        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(      \
+            param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p,    \
+            beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step, \
+            weight_decay, weight_decay_end_numel, beta1, beta2, epsilon,      \
+            max_global_grad_norm, numel, rescale_grad);                       \
   } while (0)
 
   PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL);
@@ -705,8 +706,9 @@ static void MultiTensorUpdateLambParamAndBetaPows(
     PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument(
                                           "Beta2Pow should not be nullptr."));
   } else {
-    PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument(
-                                             "Beta2Pow should be nullptr."));
+    PADDLE_ENFORCE_EQ(
+        beta2pow, nullptr,
+        platform::errors::InvalidArgument("Beta2Pow should be nullptr."));
   }
 
   const int block_dim = 512;
@@ -744,21 +746,21 @@ static void MultiTensorUpdateLambParamAndBetaPows(
                     betapow_helper);                                           \
   } while (0)
 
-#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE        \
-  do {                                                              \
-    auto callback = [&](                                            \
-        const MultiTensorLauncher<kNumTensor, kNumChunk> &launcher, \
-        int launch_n) {                                             \
-      if (has_beta_pow && launch_n == 0) {                          \
-        PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true);          \
-        beta1pow = nullptr;                                         \
-        beta2pow = nullptr;                                         \
-      } else {                                                      \
-        PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false);         \
-      }                                                             \
-    };                                                              \
-    MultiTensorApplyWithCallback<kNumTensor, kNumChunk>(            \
-        stream, offsets, n, chunk_size, block_dim, callback);       \
+#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE            \
+  do {                                                                  \
+    auto callback =                                                     \
+        [&](const MultiTensorLauncher<kNumTensor, kNumChunk> &launcher, \
+            int launch_n) {                                             \
+          if (has_beta_pow && launch_n == 0) {                          \
+            PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true);          \
+            beta1pow = nullptr;                                         \
+            beta2pow = nullptr;                                         \
+          } else {                                                      \
+            PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false);         \
+          }                                                             \
+        };                                                              \
+    MultiTensorApplyWithCallback<kNumTensor, kNumChunk>(                \
+        stream, offsets, n, chunk_size, block_dim, callback);           \
   } while (0)
 
   PD_VEC_LAUNCH_KERNEL(vec_size,
@@ -793,11 +795,11 @@ static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx,
   int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0));
   auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size);
 
-#define PD_LAMB_VEC_SCALE_KERNEL_CASE                                          \
-  do {                                                                         \
-    ScaleCUDAKernel<T1, T2, kVecSize><<<config.block_per_grid,                 \
-                                        config.thread_per_block, 0, stream>>>( \
-        x, scale, y, n);                                                       \
+#define PD_LAMB_VEC_SCALE_KERNEL_CASE                                    \
+  do {                                                                   \
+    ScaleCUDAKernel<T1, T2, kVecSize>                                    \
+        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>( \
+            x, scale, y, n);                                             \
   } while (0)
 
   PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE);
@@ -1015,7 +1017,7 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel,
   if (fp32_numel > 0) {
     fp32_has_nan_inf = reinterpret_cast<bool *>(nan_inf_flag + 1);
     cub::TransformInputIterator<bool, IsNanInfFunctor<float>, const float *>
-    iter(fp32_grad, IsNanInfFunctor<float>());
+        iter(fp32_grad, IsNanInfFunctor<float>());
     CubDeviceReduce(iter, fp32_has_nan_inf, fp32_numel, OrFunctor(), false,
                     stream, cub_tmp_buffer);
   }
@@ -1082,11 +1084,11 @@ static void LaunchElementwiseAddWithCastKernel(
                GetChunkedVecSize(z, 0));
   auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size);
 
-#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL                            \
-  do {                                                                        \
-    ElementwiseAddWithCastCUDAKernel<T1, T2, T3, kVecSize><<<                 \
-        config.block_per_grid, config.thread_per_block, 0, stream>>>(x, y, z, \
-                                                                     n);      \
+#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL                             \
+  do {                                                                         \
+    ElementwiseAddWithCastCUDAKernel<T1, T2, T3, kVecSize>                     \
+        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(x, y,  \
+                                                                        z, n); \
   } while (0)
 
   PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL);
@@ -1445,10 +1447,10 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
         if (is_grad_scaled_by_nranks) {
           clip_scale *= num_devices;
         }
-        CalcGradNormClipBeforeAllReduceScale<
-            float, platform::float16><<<1, 1, 0, stream>>>(
-            global_scale, max_global_grad_norm, fp32_square_grad_norm,
-            fp32_scale, fp16_scale, clip_scale);
+        CalcGradNormClipBeforeAllReduceScale<float, platform::float16>
+            <<<1, 1, 0, stream>>>(global_scale, max_global_grad_norm,
+                                  fp32_square_grad_norm, fp32_scale, fp16_scale,
+                                  clip_scale);
         if (fp32_scale) {
           VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place);
         } else {
@@ -1567,11 +1569,12 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
                         fp16_partial_fused_offsets, fp16_local_param_num,
                         param_square_norm + fp16_local_start_idx);
     } else {
-      MultiTensorL2Norm(
-          place, stream, fp16_param + fused_offsets[fp16_local_start_idx] -
-                             fused_offsets[fp32_global_param_num],
-          fused_offsets + fp16_local_start_idx, fp16_local_param_num,
-          param_square_norm + fp16_local_start_idx);
+      MultiTensorL2Norm(place, stream,
+                        fp16_param + fused_offsets[fp16_local_start_idx] -
+                            fused_offsets[fp32_global_param_num],
+                        fused_offsets + fp16_local_start_idx,
+                        fp16_local_param_num,
+                        param_square_norm + fp16_local_start_idx);
     }
 
     MultiTensorL2Norm(place, stream, trust_ratio_div,
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h
index 688a7f1ad8435..69a853c5d1846 100644
--- a/paddle/fluid/operators/optimizers/dpsgd_op.h
+++ b/paddle/fluid/operators/optimizers/dpsgd_op.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <stdlib.h>
+
 #include <iostream>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -108,9 +110,8 @@ class DpsgdOpKernel : public framework::OpKernel<T> {
 
     // update parameters
     for (int64_t i = 0; i < grad->numel(); ++i) {
-      out_data[i] =
-          param_data[i] -
-          lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size);
+      out_data[i] = param_data[i] - lr[0] * (grad_data[i] / scale +
+                                             gaussian_noise / batch_size);
     }
     // CCS16 - Deep Learning with Differential Privacy.
     // [https://arxiv.org/abs/1607.00133]
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h
index 596ed05df3ffd..73fd7ceb67b0e 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.h
+++ b/paddle/fluid/operators/optimizers/ftrl_op.h
@@ -74,9 +74,8 @@ class SparseFTRLFunctor {
       l_acc_out_[j] += g - (std::sqrt(new_acc) - std::sqrt(s_acc)) / lr * p;
     } else {
       l_acc_out_[j] +=
-          g -
-          (std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) / lr *
-              p;
+          g - (std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) /
+                  lr * p;
     }
 
     auto l_acc = l_acc_out_[j];
diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc
index 48ceba3695f83..fb2a78d28edfc 100644
--- a/paddle/fluid/operators/optimizers/lamb_op.cc
+++ b/paddle/fluid/operators/optimizers/lamb_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/optimizers/lamb_op.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
@@ -227,13 +229,12 @@ REGISTER_OP_CPU_KERNEL(
     ops::LambOpKernel<paddle::platform::CPUDeviceContext, double>);
 
 /* ==========================  register checkpoint ===========================*/
-REGISTER_OP_VERSION(lamb)
-    .AddCheckpoint(
-        R"ROC(Upgrade lamb, add two new outputs [Beta1PowOut] and [Beta2PowOut].)ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewInput("Beta1PowOut",
-                      "The Output beta1 power accumulator. 'Beta1PowOut' is "
-                      "dispensable.")
-            .NewInput("Beta2PowOut",
-                      "The Output beta2 power accumulator. 'Beta2PowOut' is "
-                      "dispensable."));
+REGISTER_OP_VERSION(lamb).AddCheckpoint(
+    R"ROC(Upgrade lamb, add two new outputs [Beta1PowOut] and [Beta2PowOut].)ROC",
+    paddle::framework::compatible::OpVersionDesc()
+        .NewInput("Beta1PowOut",
+                  "The Output beta1 power accumulator. 'Beta1PowOut' is "
+                  "dispensable.")
+        .NewInput("Beta2PowOut",
+                  "The Output beta2 power accumulator. 'Beta2PowOut' is "
+                  "dispensable."));
diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu
index b46fa19ea1352..a9f880fdbb67d 100644
--- a/paddle/fluid/operators/optimizers/lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/lamb_op.cu
@@ -16,7 +16,8 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    lamb, ops::LambOpKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::float16>,
+    lamb,
+    ops::LambOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>,
     ops::LambOpKernel<paddle::platform::CUDADeviceContext, float>,
     ops::LambOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h
index 45acf2b3e4834..2956ff204679e 100644
--- a/paddle/fluid/operators/optimizers/lamb_op.h
+++ b/paddle/fluid/operators/optimizers/lamb_op.h
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
+
 #include <Eigen/Dense>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/buffer.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
@@ -655,10 +657,10 @@ class LambOpKernel : public framework::OpKernel<T> {
     // TODO(zengjinle): remove the following Eigen operations when
     // *skip_update == true.
     memory::Buffer buffer(dev_ctx.GetPlace());
-    math::SquaredL2Norm(
-        dev_ctx, reinterpret_cast<const MT*>(IsMultiPrecision ? master_param_ptr
-                                                              : param_ptr),
-        p_norm_ptr, numel, &buffer);
+    math::SquaredL2Norm(dev_ctx,
+                        reinterpret_cast<const MT*>(
+                            IsMultiPrecision ? master_param_ptr : param_ptr),
+                        p_norm_ptr, numel, &buffer);
     math::SquaredL2Norm(dev_ctx, trust_ratio_div_ptr, trust_ratio_div_norm_ptr,
                         numel, &buffer);
 
@@ -675,12 +677,12 @@ class LambOpKernel : public framework::OpKernel<T> {
 #define CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(__should_update_beta_pow)         \
   do {                                                                       \
     LambParamUpateFunctor<T, MT, IsMultiPrecision, __should_update_beta_pow> \
-    param_update_functor(                                                    \
-        lr.template data<MT>(), static_cast<const T*>(param_ptr),            \
-        static_cast<const MT*>(master_param_ptr), p_norm_ptr,                \
-        trust_ratio_div_ptr, trust_ratio_div_norm_ptr,                       \
-        static_cast<T*>(param_out_ptr),                                      \
-        static_cast<MT*>(master_param_out_ptr), skip_update_flag);           \
+        param_update_functor(                                                \
+            lr.template data<MT>(), static_cast<const T*>(param_ptr),        \
+            static_cast<const MT*>(master_param_ptr), p_norm_ptr,            \
+            trust_ratio_div_ptr, trust_ratio_div_norm_ptr,                   \
+            static_cast<T*>(param_out_ptr),                                  \
+            static_cast<MT*>(master_param_out_ptr), skip_update_flag);       \
     if (__should_update_beta_pow) {                                          \
       param_update_functor.SetBetaPows(beta1_pow_ptr, beta2_pow_ptr,         \
                                        beta1_pow_out_ptr, beta2_pow_out_ptr, \
diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
index 7aa5783a01bfd..ef224382cd091 100644
--- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/optimizers/lamb_op.h"
 #include "gflags/gflags.h"
+#include "paddle/fluid/operators/optimizers/lamb_op.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
index 5b883a11e5733..553ac69edcac7 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -129,8 +129,9 @@ __device__ inline void VectorizeLarsUpdate(
   for (int i = tid + tail_offset; i < numel; i += grid_stride) {
     MT grad_val = static_cast<MT>(grad[i]) * rescale_grad;
     MT param_val = param[i];
-    MT velocity_tmp = Fma(velocity[i], mu, local_lr * Fma(lars_weight_decay,
-                                                          param_val, grad_val));
+    MT velocity_tmp =
+        Fma(velocity[i], mu,
+            local_lr * Fma(lars_weight_decay, param_val, grad_val));
     MT param_tmp = param_val - velocity_tmp;
     param_out[i] = static_cast<T>(param_tmp);
     velocity_out[i] = velocity_tmp;
@@ -314,10 +315,10 @@ inline void SeparatedLarsMomentumOpCUDAKernel(
     const MT rescale_grad, const int64_t numel, const MT* master_param_data,
     MT* master_out_data, const bool is_amp) {
   LarsThreadConfig<T> lars_thread_config(numel);
-  L2NormKernel<T, MT><<<lars_thread_config.grid_for_norm, LARS_BLOCK_SIZE, 0,
-                        cuda_ctx.stream()>>>(
-      param_data, grad_data, p_buffer, g_buffer, numel,
-      lars_thread_config.repeat_times, rescale_grad);
+  L2NormKernel<T, MT>
+      <<<lars_thread_config.grid_for_norm, LARS_BLOCK_SIZE, 0,
+         cuda_ctx.stream()>>>(param_data, grad_data, p_buffer, g_buffer, numel,
+                              lars_thread_config.repeat_times, rescale_grad);
 
   MomentumLarsKernel<T, MT><<<lars_thread_config.grid_for_lars, LARS_BLOCK_SIZE,
                               0, cuda_ctx.stream()>>>(
diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
index 54ead6d3df7f0..280c0930e91d5 100644
--- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
+++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc
index 5fad5eca9affc..d405500d60768 100644
--- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc
+++ b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/optimizers/merged_momentum_op.h"
-
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
 
@@ -151,10 +150,11 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel<T> {
       framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
       framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
       // NOTE: ApplyMomentum will change the input
-      const auto& runner = NpuOpRunner(
-          "ApplyMomentum", {*param_out, *velocity_out, *learning_rate,
-                            regularized_grad, mu_tensor},
-          {*param_out}, {{"use_nesterov", use_nesterov}});
+      const auto& runner =
+          NpuOpRunner("ApplyMomentum",
+                      {*param_out, *velocity_out, *learning_rate,
+                       regularized_grad, mu_tensor},
+                      {*param_out}, {{"use_nesterov", use_nesterov}});
       runner.Run(dev_ctx.stream());
     }
   }
diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc
index 50d2c946f3afe..94fb4c156ef5f 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/optimizers/momentum_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
@@ -109,28 +110,26 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ops::MomentumOpInferVarType);
 
-REGISTER_OP_VERSION(momentum)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(momentum).AddCheckpoint(
+    R"ROC(
       Upgrade momentum add 4 attributes [regularization_method, regularization_coeff,
       multi_precision, rescale_grad].
     )ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewInput("MasterParam", "FP32 master weight for AMP.")
-            .NewOutput("MasterParamOut",
-                       "The updated FP32 master weight for AMP. "
-                       "It shared memory with Input(MasterParam).")
-            .NewAttr("regularization_method",
-                     "(string) regularization_method, right now only support "
-                     "l2decay or none",
-                     std::string(""))
-            .NewAttr("regularization_coeff", "(float) regularization_coeff",
-                     0.0f)
-            .NewAttr(
-                "multi_precision",
-                "(bool) Whether to use multi-precision during weight updating.",
-                false)
-            .NewAttr("rescale_grad",
-                     "(float) Multiply the gradient with `rescale_grad`"
-                     "before updating. Often choose to be `1.0/batch_size`.",
-                     1.0f));
+    paddle::framework::compatible::OpVersionDesc()
+        .NewInput("MasterParam", "FP32 master weight for AMP.")
+        .NewOutput("MasterParamOut",
+                   "The updated FP32 master weight for AMP. "
+                   "It shared memory with Input(MasterParam).")
+        .NewAttr("regularization_method",
+                 "(string) regularization_method, right now only support "
+                 "l2decay or none",
+                 std::string(""))
+        .NewAttr("regularization_coeff", "(float) regularization_coeff", 0.0f)
+        .NewAttr(
+            "multi_precision",
+            "(bool) Whether to use multi-precision during weight updating.",
+            false)
+        .NewAttr("rescale_grad",
+                 "(float) Multiply the gradient with `rescale_grad`"
+                 "before updating. Often choose to be `1.0/batch_size`.",
+                 1.0f));
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index 017f33d7458fc..2f6a9758a2cf5 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc
index b8fa81b2e7123..417f89410cf88 100644
--- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/optimizers/momentum_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/optimizers/momentum_op.h"
 #include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
 
 namespace paddle {
@@ -77,8 +77,9 @@ class MLUMomentumOpKernel : public framework::OpKernel<T> {
                              GetBasePtr(learning_rate), GetBasePtr(&mu_tensor),
                              GetBasePtr(param_out), GetBasePtr(velocity_out));
     } else if (grad_var->IsType<phi::SelectedRows>()) {
-      PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied(
-                                         "Unsupport SparseMomentum"));
+      PADDLE_ENFORCE_EQ(
+          false, true,
+          platform::errors::PermissionDenied("Unsupport SparseMomentum"));
     } else {
       PADDLE_ENFORCE_EQ(false, true,
                         platform::errors::PermissionDenied(
diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc
index 2d73766b97364..d3ffeb18be7b9 100644
--- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/optimizers/momentum_op.h"
-
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
@@ -70,14 +69,16 @@ class NPUMomentumOpKernel : public framework::OpKernel<T> {
       framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
       framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
       // NOTE: ApplyMomentum will change the input
-      const auto& runner = NpuOpRunner(
-          "ApplyMomentum", {*param_out, *velocity_out, *learning_rate,
-                            regularized_grad, mu_tensor},
-          {*param_out}, {{"use_nesterov", use_nesterov}});
+      const auto& runner =
+          NpuOpRunner("ApplyMomentum",
+                      {*param_out, *velocity_out, *learning_rate,
+                       regularized_grad, mu_tensor},
+                      {*param_out}, {{"use_nesterov", use_nesterov}});
       runner.Run(dev_ctx.stream());
     } else if (grad_var->IsType<phi::SelectedRows>()) {
-      PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied(
-                                         "Unsupport SparseMomentum"));
+      PADDLE_ENFORCE_EQ(
+          false, true,
+          platform::errors::PermissionDenied("Unsupport SparseMomentum"));
     } else {
       PADDLE_ENFORCE_EQ(false, true,
                         platform::errors::PermissionDenied(
diff --git a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc
index 6897213c91a34..749d38f315e00 100644
--- a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include <string>
+
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 namespace paddle {
diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h
index 179e8f452545c..98850aa816bdc 100644
--- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h
+++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <cstdint>
+
 #include "math.h"  // NOLINT
 
 namespace paddle {
@@ -108,11 +109,11 @@ class MultiTensorLauncher {
         stream_(stream) {}
 
   template <typename Functor, typename... Args>
-  void Launch(Functor &&functor, Args &&... args) const {
-    MultiTensorApplyCUDAKernel<
-        Functor, MaxTensorNumPerLaunch,
-        MaxChunkNumPerLaunch><<<chunk_id_, block_dim_, 0, stream_>>>(
-        functor, meta_, chunk_size_, args...);
+  void Launch(Functor &&functor, Args &&...args) const {
+    MultiTensorApplyCUDAKernel<Functor, MaxTensorNumPerLaunch,
+                               MaxChunkNumPerLaunch>
+        <<<chunk_id_, block_dim_, 0, stream_>>>(functor, meta_, chunk_size_,
+                                                args...);
   }
 
  private:
@@ -189,7 +190,7 @@ template <typename Functor, int MaxTensorNumPerLaunch, int MaxChunkNumPerLaunch,
           typename... Args>
 static void MultiTensorApply(Functor functor, gpuStream_t stream,
                              const int *offsets, int n, int chunk_size,
-                             int block_dim, Args &&... args) {
+                             int block_dim, Args &&...args) {
   auto callback = [&](const MultiTensorLauncher<MaxTensorNumPerLaunch,
                                                 MaxChunkNumPerLaunch> &launcher,
                       int i) { launcher.Launch(functor, args...); };
diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc
index 6893e5d6b9b2c..5eeeb7353072e 100644
--- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc
+++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
 
diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h
index 74cf762745077..353d8777a84ab 100644
--- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h
+++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h
@@ -47,9 +47,8 @@ struct Pow2DecayWithLinearWarmupFunctor {
       auto new_lr = static_cast<double>(step) / warmup_steps_ * base_lr_;
       *lr_ = static_cast<T>(new_lr);
     } else if (step < total_steps_) {
-      auto factor = 1 -
-                    static_cast<double>(step - warmup_steps_) /
-                        (total_steps_ - warmup_steps_);
+      auto factor = 1 - static_cast<double>(step - warmup_steps_) /
+                            (total_steps_ - warmup_steps_);
       auto new_lr =
           static_cast<double>(base_lr_ - end_lr_) * (factor * factor) + end_lr_;
       *lr_ = static_cast<T>(new_lr);
@@ -76,9 +75,10 @@ class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel<T> {
     auto *lr_out = ctx.Output<framework::Tensor>("LearningRateOut");
     auto *step_out = ctx.Output<framework::Tensor>("StepOut");
     PADDLE_ENFORCE_EQ(
-        lr, lr_out, platform::errors::InvalidArgument("Input(LearningRate) and "
-                                                      "Output(LearningRateOut) "
-                                                      "must be the same."));
+        lr, lr_out,
+        platform::errors::InvalidArgument("Input(LearningRate) and "
+                                          "Output(LearningRateOut) "
+                                          "must be the same."));
     PADDLE_ENFORCE_NOT_NULL(lr,
                             platform::errors::InvalidArgument(
                                 "Input(LearingRate) should not be nullptr."));
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc
index b3458724482e9..874e21cc6ccbf 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cc
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc
index b53d51686cfd7..7f4810ea4207a 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include <gflags/gflags.h>
+
 #include <iostream>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index a2af131cb505e..b5822fd5c446e 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <string>
-
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
+
+#include <string>
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -76,10 +76,11 @@ class SGDOpInferVarType : public framework::VarTypeInference {
     auto in_var_type = ctx->GetInputType("Param");
     PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
                           in_var_type == framework::proto::VarType::LOD_TENSOR,
-                      true, platform::errors::InvalidArgument(
-                                "The input Var's type should be LoDtensor or "
-                                "SelectedRows, but the received type is %s",
-                                in_var_type));
+                      true,
+                      platform::errors::InvalidArgument(
+                          "The input Var's type should be LoDtensor or "
+                          "SelectedRows, but the received type is %s",
+                          in_var_type));
 
     ctx->SetOutputType("ParamOut", in_var_type, framework::ALL_ELEMENTS);
   }
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu
index 222244a2fd1e3..ba2e84a6a789d 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cu
+++ b/paddle/fluid/operators/optimizers/sgd_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
@@ -106,11 +107,11 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
       int block = 512;
       int grid = (param->numel() + block - 1) / block;
 
-      SGDKernelMT<
-          T, MPDType><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
-          param->data<T>(), grad->data<T>(), learning_rate->data<T>(),
-          param->numel(), param_out->mutable_data<T>(ctx.GetPlace()),
-          master_in_data, master_out_data);
+      SGDKernelMT<T, MPDType>
+          <<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+              param->data<T>(), grad->data<T>(), learning_rate->data<T>(),
+              param->numel(), param_out->mutable_data<T>(ctx.GetPlace()),
+              master_in_data, master_out_data);
 
     } else if (grad_var->IsType<phi::SelectedRows>()) {
       // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc
index e7c03be95cae1..7203357db10e3 100644
--- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/optimizers/sgd_op.h"
 #include <string>
+
+#include "paddle/fluid/operators/optimizers/sgd_op.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc
index c38545df17311..0c4fa916f4331 100644
--- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/optimizers/sparse_momentum_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h
index 08b2d3764feba..296a3d5b88975 100644
--- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h
+++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
index 769bb781d6e72..61e63ad9a6e61 100644
--- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake
+++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
@@ -4,32 +4,34 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    ftrl_op.cc
-    lars_momentum_op.cc
-    momentum_op.cc
-    sgd_op.cc
-    proximal_adagrad_op.cc
-    adagrad_op.cc
-    adam_op.cc
-    adamax_op.cc
-    dgc_momentum_op.cc
-    proximal_gd_op.cc
-    decayed_adagrad_op.cc
-    adadelta_op.cc
-    lamb_op.cc
-    dpsgd_op.cc
-    rmsprop_op.cc)
-register_unity_group(cu
-    ftrl_op.cu
-    lars_momentum_op.cu
-    momentum_op.cu
-    sgd_op.cu
-    proximal_adagrad_op.cu
-    adagrad_op.cu
-    adam_op.cu
-    adamax_op.cu
-    decayed_adagrad_op.cu
-    adadelta_op.cu
-    lamb_op.cu
-    rmsprop_op.cu)
+register_unity_group(
+  cc
+  ftrl_op.cc
+  lars_momentum_op.cc
+  momentum_op.cc
+  sgd_op.cc
+  proximal_adagrad_op.cc
+  adagrad_op.cc
+  adam_op.cc
+  adamax_op.cc
+  dgc_momentum_op.cc
+  proximal_gd_op.cc
+  decayed_adagrad_op.cc
+  adadelta_op.cc
+  lamb_op.cc
+  dpsgd_op.cc
+  rmsprop_op.cc)
+register_unity_group(
+  cu
+  ftrl_op.cu
+  lars_momentum_op.cu
+  momentum_op.cu
+  sgd_op.cu
+  proximal_adagrad_op.cu
+  adagrad_op.cu
+  adam_op.cu
+  adamax_op.cu
+  decayed_adagrad_op.cu
+  adadelta_op.cu
+  lamb_op.cu
+  rmsprop_op.cu)
diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc
index c7c8ebf562b4d..21254521fa912 100644
--- a/paddle/fluid/operators/p_norm_op.cc
+++ b/paddle/fluid/operators/p_norm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
@@ -124,11 +125,10 @@ REGISTER_OPERATOR(p_norm, ops::PnormOp, ops::PnormOpMaker,
                   PNormInferShapeFunctor);
 REGISTER_OPERATOR(p_norm_grad, ops::PnormOpGrad, PNormGradInferShapeFunctor);
 
-REGISTER_OP_VERSION(p_norm)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(p_norm).AddCheckpoint(
+    R"ROC(
         Upgrade p_norm, add 1 attribute [asvector].
       )ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "asvector",
-            "Compute as vector when axis is None and input is matrix", false));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "asvector", "Compute as vector when axis is None and input is matrix",
+        false));
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
index 38fa3316a6e27..6d27433512e90 100644
--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu
index 80931fea90f9c..b7f9977f3edb7 100644
--- a/paddle/fluid/operators/pad2d_op.cu
+++ b/paddle/fluid/operators/pad2d_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc
index e4952a243262b..b7a638d7ce930 100644
--- a/paddle/fluid/operators/pad3d_op.cc
+++ b/paddle/fluid/operators/pad3d_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/unary.h"
diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc
index 087b8ecba6e1f..61a2120e1e43e 100644
--- a/paddle/fluid/operators/pad_constant_like_op.cc
+++ b/paddle/fluid/operators/pad_constant_like_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/pad_constant_like_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h
index 0aedd800e1a23..cc7c39d12cd1a 100644
--- a/paddle/fluid/operators/pad_constant_like_op.h
+++ b/paddle/fluid/operators/pad_constant_like_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index dc162ae5782f2..eaf343dde0f0f 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/complex.h"
diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc
index fedadc7581e71..e0e6ec31e41e0 100644
--- a/paddle/fluid/operators/partial_concat_op.cc
+++ b/paddle/fluid/operators/partial_concat_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/partial_concat_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
@@ -93,8 +94,9 @@ class PartialConcatOp : public framework::OperatorWithKernel {
         break;
       }
     }
-    PADDLE_ENFORCE_EQ(flag, 1, platform::errors::InvalidArgument(
-                                   "All Inputs of PartialSum OP are Empty!"));
+    PADDLE_ENFORCE_EQ(flag, 1,
+                      platform::errors::InvalidArgument(
+                          "All Inputs of PartialSum OP are Empty!"));
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu
index 322e84ae8b9c2..d36a73037151d 100644
--- a/paddle/fluid/operators/partial_concat_op.cu
+++ b/paddle/fluid/operators/partial_concat_op.cu
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/partial_concat_op.h"
diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h
index 20a6639e23301..b12cb0a0293e7 100644
--- a/paddle/fluid/operators/partial_concat_op.h
+++ b/paddle/fluid/operators/partial_concat_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc
index 72630998d4337..a3ce78054acde 100644
--- a/paddle/fluid/operators/partial_sum_op.cc
+++ b/paddle/fluid/operators/partial_sum_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/partial_sum_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
@@ -96,8 +97,9 @@ class PartialSumOp : public framework::OperatorWithKernel {
       }
     }
 
-    PADDLE_ENFORCE_EQ(flag, 1, platform::errors::InvalidArgument(
-                                   "All Inputs of PartialSum OP are Empty!"));
+    PADDLE_ENFORCE_EQ(flag, 1,
+                      platform::errors::InvalidArgument(
+                          "All Inputs of PartialSum OP are Empty!"));
     return framework::OpKernelType(input_data_type, platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu
index 63d140d6769b8..b363483fe6945 100644
--- a/paddle/fluid/operators/partial_sum_op.cu
+++ b/paddle/fluid/operators/partial_sum_op.cu
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/partial_sum_op.h"
diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h
index d9c6fd758f44c..21c16ed2f6227 100644
--- a/paddle/fluid/operators/partial_sum_op.h
+++ b/paddle/fluid/operators/partial_sum_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc
index 1724aedbe9b24..026a1749c39d0 100644
--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ b/paddle/fluid/operators/pixel_shuffle_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc
index d5896c4105932..b964d8fe116e9 100644
--- a/paddle/fluid/operators/poisson_op.cc
+++ b/paddle/fluid/operators/poisson_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 44f3d8090e565..30ead84d1a987 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -15,13 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/pool_op.h"
 
 #include <unordered_map>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/backward.h"
 #include "paddle/phi/infermeta/unary.h"
-
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc
index f178a966e1e08..d2ec4089f9da9 100644
--- a/paddle/fluid/operators/pool_op_xpu.cc
+++ b/paddle/fluid/operators/pool_op_xpu.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <unordered_map>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
 
@@ -112,11 +113,12 @@ class PoolGradXPUKernel : public framework::OpKernel<T> {
     bool exclusive = context.Attr<bool>("exclusive");
     bool adaptive = context.Attr<bool>("adaptive");
     const int* index_data = nullptr;
-    PADDLE_ENFORCE_EQ(ksize.size(), 2, platform::errors::InvalidArgument(
-                                           "The Pool2d XPU OP only support 2 "
-                                           "dimension pooling!, but received "
-                                           "%d-dimension pool kernel size",
-                                           ksize.size()));
+    PADDLE_ENFORCE_EQ(
+        ksize.size(), 2,
+        platform::errors::InvalidArgument("The Pool2d XPU OP only support 2 "
+                                          "dimension pooling!, but received "
+                                          "%d-dimension pool kernel size",
+                                          ksize.size()));
     PADDLE_ENFORCE_EQ(!adaptive || (ksize[0] * ksize[1] == 1), true,
                       platform::errors::InvalidArgument(
                           "The Pool2d XPU OP does not support (adaptive == "
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index e0341f4a4b471..8619cc28d50d3 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
index cbe58644f5381..02273b7943ae2 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/positive_negative_pair_op.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -41,11 +42,12 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
           ctx->HasInput("AccumulatePositivePair") &&
               ctx->HasInput("AccumulateNegativePair") &&
               ctx->HasInput("AccumulateNeutralPair"),
-          true, platform::errors::InvalidArgument(
-                    "All optional inputs(AccumulatePositivePair, "
-                    "AccumulateNegativePair, AccumulateNeutralPair) of "
-                    "PositiveNegativePairOp are required if one of them "
-                    "is specified."));
+          true,
+          platform::errors::InvalidArgument(
+              "All optional inputs(AccumulatePositivePair, "
+              "AccumulateNegativePair, AccumulateNeutralPair) of "
+              "PositiveNegativePairOp are required if one of them "
+              "is specified."));
       PADDLE_ENFORCE_EQ(
           ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h
index a47deb18b6fcc..972258350bf19 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.h
+++ b/paddle/fluid/operators/positive_negative_pair_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index de35f67405810..50dc9d6429af0 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt
index a58ee6dc1f7ba..d29933bc1964a 100644
--- a/paddle/fluid/operators/prim_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt
@@ -1,11 +1,11 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/prim_ops.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/prim_ops.
+  include(unity_build_rule.cmake)
 endif()
 register_operators()
 
-SET(PRIM_OP_SRCS
+set(PRIM_OP_SRCS
     reshape_p_op.cc
     broadcast_p_op.cc
     reduce_p_op.cc
@@ -25,4 +25,7 @@ SET(PRIM_OP_SRCS
     matmul_p_op.cc
     fill_constant_p_op.cc)
 
-cc_test(prim_op_test SRCS prim_op_test.cc ${PRIM_OP_SRCS} DEPS op_registry)
+cc_test(
+  prim_op_test
+  SRCS prim_op_test.cc ${PRIM_OP_SRCS}
+  DEPS op_registry)
diff --git a/paddle/fluid/operators/prim_ops/prim_op_test.cc b/paddle/fluid/operators/prim_ops/prim_op_test.cc
index 2d65149d130bb..e5b84d00f1f28 100644
--- a/paddle/fluid/operators/prim_ops/prim_op_test.cc
+++ b/paddle/fluid/operators/prim_ops/prim_op_test.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 
diff --git a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake
index 5d6a732272b9b..74b04d234fcde 100644
--- a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake
+++ b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake
@@ -1,20 +1,20 @@
-register_unity_group(cc
-    reshape_p_op.cc
-    broadcast_p_op.cc
-    reduce_p_op.cc
-    transpose_p_op.cc
-    split_p_op.cc
-    concat_p_op.cc
-    slice_select_p_op.cc
-    slice_assign_p_op.cc
-    gather_p_op.cc
-    scatter_add_p_op.cc
-    add_p_op.cc
-    sub_p_op.cc
-    mul_p_op.cc
-    div_p_op.cc
-    sqrt_p_op.cc
-    tanh_p_op.cc
-    matmul_p_op.cc
-    fill_constant_p_op.cc
-    )
+register_unity_group(
+  cc
+  reshape_p_op.cc
+  broadcast_p_op.cc
+  reduce_p_op.cc
+  transpose_p_op.cc
+  split_p_op.cc
+  concat_p_op.cc
+  slice_select_p_op.cc
+  slice_assign_p_op.cc
+  gather_p_op.cc
+  scatter_add_p_op.cc
+  add_p_op.cc
+  sub_p_op.cc
+  mul_p_op.cc
+  div_p_op.cc
+  sqrt_p_op.cc
+  tanh_p_op.cc
+  matmul_p_op.cc
+  fill_constant_p_op.cc)
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index 4dd4114d378e9..16d6185e87e15 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -178,10 +178,8 @@ REGISTER_OPERATOR(print, ops::PrintOp, ops::PrintOpProtoAndCheckMaker,
                   ops::PrintOpGradientMaker<paddle::imperative::OpBase>,
                   ops::PrintOpInferShape, ops::PrintOpVarTypeInference);
 
-REGISTER_OP_VERSION(print)
-    .AddCheckpoint(
-        R"ROC(Upgrade print add a new attribute [print_tensor_layout] to "
+REGISTER_OP_VERSION(print).AddCheckpoint(
+    R"ROC(Upgrade print add a new attribute [print_tensor_layout] to "
              "contorl whether to print tensor's layout.)ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "print_tensor_layout", "Whether to print the tensor's layout.",
-            true));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "print_tensor_layout", "Whether to print the tensor's layout.", true));
diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc
index f03a392bfc736..51bd079849a52 100644
--- a/paddle/fluid/operators/prroi_pool_op.cc
+++ b/paddle/fluid/operators/prroi_pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/prroi_pool_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h
index 0fdccc729adde..8431d945749f3 100644
--- a/paddle/fluid/operators/prroi_pool_op.h
+++ b/paddle/fluid/operators/prroi_pool_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu
index 6a2ed6592e7fe..2e729f94dc8f3 100644
--- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu
+++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu
@@ -79,9 +79,10 @@ class PruneGateByCapacityFunctor {
     int blocks = NumBlocks(batch_size);
     int threads = kNumCUDAThreads;
 
-    prune_gate_by_capacity_kernel<T1,
-                                  T2><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        gate_idx_data, new_gate_idx_data_, expert_count_out_data, batch_size);
+    prune_gate_by_capacity_kernel<T1, T2>
+        <<<blocks, threads, 0, dev_ctx.stream()>>>(
+            gate_idx_data, new_gate_idx_data_, expert_count_out_data,
+            batch_size);
   }
 
  private:
diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt
index de0ee481aa6e7..04407ea117d17 100755
--- a/paddle/fluid/operators/pscore/CMakeLists.txt
+++ b/paddle/fluid/operators/pscore/CMakeLists.txt
@@ -1,49 +1,152 @@
-if (WITH_PSLIB)
-    return()
+if(WITH_PSLIB)
+  return()
 endif()
 
 include(operators)
 
 set(DISTRIBUTE_DEPS "")
 
-if (WITH_ARM_BRPC)
-    list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc arm_brpc gflags glog snappy device_context)
+if(WITH_ARM_BRPC)
+  list(
+    APPEND
+    DISTRIBUTE_DEPS
+    executor
+    fleet
+    ps_service
+    brpc_utils
+    heter_server
+    heter_client
+    ps_framework_proto
+    framework_proto
+    sendrecv_rpc
+    arm_brpc
+    gflags
+    glog
+    snappy
+    device_context)
 else()
-    list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy device_context)
+  list(
+    APPEND
+    DISTRIBUTE_DEPS
+    executor
+    fleet
+    ps_service
+    brpc_utils
+    heter_server
+    heter_client
+    ps_framework_proto
+    framework_proto
+    sendrecv_rpc
+    brpc
+    leveldb
+    ssl
+    crypto
+    protobuf
+    gflags
+    glog
+    zlib
+    snappy
+    device_context)
 endif()
 
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses")
+set(DISTRIBUTE_COMPILE_FLAGS
+    "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses"
+)
 
-if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-    set(DISTRIBUTE_COMPILE_FLAGS
-            "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+  set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
 endif()
 
-file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+file(
+  GLOB OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*_op.cc")
 list(REMOVE_DUPLICATES OPS)
 
-foreach (src ${OPS})
-    set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-endforeach ()
+foreach(src ${OPS})
+  set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS
+                                                ${DISTRIBUTE_COMPILE_FLAGS})
+endforeach()
 
 register_operators(DEPS ${DISTRIBUTE_DEPS})
 
-set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)
+set(OPERATOR_DEPS
+    ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS}
+    PARENT_SCOPE)
 
-set_source_files_properties(heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op eigen_function)
+set_source_files_properties(
+  heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  heter_server_test
+  SRCS heter_server_test.cc
+  DEPS ${RPC_DEPS}
+       ${DISTRIBUTE_DEPS}
+       executor
+       scope
+       proto_desc
+       scale_op
+       eigen_function)
 
-set_source_files_properties(send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(send_and_recv_cpu_test SRCS send_and_recv_op_cpu_test.cc DEPS executor scope proto_desc scale_op send_and_recv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function)
+set_source_files_properties(
+  send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS
+                                          ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  send_and_recv_cpu_test
+  SRCS send_and_recv_op_cpu_test.cc
+  DEPS executor
+       scope
+       proto_desc
+       scale_op
+       send_and_recv_op
+       ${RPC_DEPS}
+       ${DISTRIBUTE_DEPS}
+       eigen_function)
 
-set_source_files_properties(send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(send_and_recv_gpu_test SRCS send_and_recv_op_gpu_test.cc DEPS executor scope proto_desc scale_op send_and_recv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function)
+set_source_files_properties(
+  send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS
+                                          ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  send_and_recv_gpu_test
+  SRCS send_and_recv_op_gpu_test.cc
+  DEPS executor
+       scope
+       proto_desc
+       scale_op
+       send_and_recv_op
+       ${RPC_DEPS}
+       ${DISTRIBUTE_DEPS}
+       eigen_function)
 
-set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function)
+set_source_files_properties(
+  heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS
+                                             ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(
+  heter_listen_and_server_test
+  SRCS heter_listen_and_server_test.cc
+  DEPS executor
+       scope
+       proto_desc
+       scale_op
+       heter_listen_and_serv_op
+       ${RPC_DEPS}
+       ${DISTRIBUTE_DEPS}
+       eigen_function)
 
 #set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 #cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function)
 
-set_source_files_properties(switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_binary(switch_server_test SRCS switch_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function)
+set_source_files_properties(
+  switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_binary(
+  switch_server_test
+  SRCS
+  switch_server_test.cc
+  DEPS
+  executor
+  scope
+  proto_desc
+  scale_op
+  heter_listen_and_serv_op
+  ${RPC_DEPS}
+  ${DISTRIBUTE_DEPS}
+  eigen_function)
diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
index f101e509d936f..d09b1c7aa068e 100644
--- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
@@ -9,11 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
+
 #include <algorithm>
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
index c2717c19b2d8e..c9390aa42a656 100644
--- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
+++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
@@ -13,6 +13,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc
index 9868a6257924e..701b6250445bd 100644
--- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc
+++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc
@@ -9,11 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h"
+
 #include <algorithm>
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h
index 6d3faae6a2d09..7c361dfd1a7dc 100644
--- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h
+++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h
@@ -13,6 +13,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
index 457e37744d316..5d77851b72a24 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12,
@@ -92,8 +93,9 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const {
     auto blkid = block_list[i];
     auto it = message_to_block_id.find_value(blkid);
     heter_server_->RegisterServiceHandler(
-        it->first, [&](const MultiVarMsg *request, MultiVarMsg *response,
-                       brpc::Controller *cntl) -> int {
+        it->first,
+        [&](const MultiVarMsg *request, MultiVarMsg *response,
+            brpc::Controller *cntl) -> int {
           return send_and_recv_variable_handler_->Handle(request, response,
                                                          cntl);
         });
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h
old mode 100755
new mode 100644
index 3ecff083b00c7..29cc041d68216
--- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h
+++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
 #include <atomic>
 #include <memory>
 #include <set>
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
index ab2fcba51062f..da57660a74d39 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include <stdlib.h>
 #include <unistd.h>
-#include <string>
-#include <thread>  // NOLINT
 
 #include <random>
 #include <sstream>
+#include <string>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps/service/heter_client.h"
diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc
index d4ee00d10a50b..db647dfaf238b 100644
--- a/paddle/fluid/operators/pscore/heter_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_server_test.cc
@@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/distributed/ps/service/heter_server.h"
+
 #include <stdlib.h>
-#include <memory>
-#include <string>
-#include <thread>  // NOLINT
 
+#include <memory>
 #include <random>
 #include <sstream>
+#include <string>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps/service/heter_client.h"
-#include "paddle/fluid/distributed/ps/service/heter_server.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace framework = paddle::framework;
@@ -181,13 +182,15 @@ void StartSendAndRecvServer(std::string endpoint) {
   heter_server_ptr_->SetEndPoint(endpoint);
   LOG(INFO) << "before HeterServer::RegisterServiceHandler";
   heter_server_ptr_->RegisterServiceHandler(
-      in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response,
-                       brpc::Controller* cntl) -> int {
+      in_var_name,
+      [&](const MultiVarMsg* request, MultiVarMsg* response,
+          brpc::Controller* cntl) -> int {
         return b_req_handler->Handle(request, response, cntl);
       });
   heter_server_ptr_->RegisterServiceHandler(
-      in_var_name2, [&](const MultiVarMsg* request, MultiVarMsg* response,
-                        brpc::Controller* cntl) -> int {
+      in_var_name2,
+      [&](const MultiVarMsg* request, MultiVarMsg* response,
+          brpc::Controller* cntl) -> int {
         return b_req_handler->Handle(request, response, cntl);
       });
 
diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
old mode 100755
new mode 100644
index 7c25d38d1ebad..a21d11ee1b19e
--- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
+++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
@@ -14,12 +14,13 @@ limitations under the License. */
 
 #if defined PADDLE_WITH_PSCORE
 #include <stdlib.h>
+
 #include <memory>
+#include <random>
+#include <sstream>
 #include <string>
 #include <thread>  // NOLINT
 
-#include <random>
-#include <sstream>
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps/service/heter_client.h"
 #include "paddle/fluid/distributed/ps/service/heter_server.h"
@@ -158,8 +159,9 @@ void StartSendAndRecvServer(std::string endpoint) {
   b_rpc_service->SetEndPoint(endpoint);
   LOG(INFO) << "before HeterServer::RegisterServiceHandler";
   b_rpc_service->RegisterServiceHandler(
-      in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response,
-                       brpc::Controller* cntl) -> int {
+      in_var_name,
+      [&](const MultiVarMsg* request, MultiVarMsg* response,
+          brpc::Controller* cntl) -> int {
         return b_req_handler->Handle(request, response, cntl);
       });
 
diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
index 4054846460b07..c8e24c77734f8 100644
--- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
+++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
@@ -15,12 +15,12 @@ limitations under the License. */
 #if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSCORE)
 
 #include <stdlib.h>
-#include <memory>
-#include <string>
-#include <thread>  // NOLINT
 
+#include <memory>
 #include <random>
 #include <sstream>
+#include <string>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps/service/heter_client.h"
@@ -178,8 +178,9 @@ void StartSendAndRecvServer(std::string endpoint) {
   b_rpc_service2->SetEndPoint(endpoint);
   LOG(INFO) << "before HeterServer::RegisterServiceHandler";
   b_rpc_service2->RegisterServiceHandler(
-      in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response,
-                       brpc::Controller* cntl) -> int {
+      in_var_name,
+      [&](const MultiVarMsg* request, MultiVarMsg* response,
+          brpc::Controller* cntl) -> int {
         return b_req_handler->Handle(request, response, cntl);
       });
 
diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h
index 559c7eed84e6f..f803b57b187f8 100644
--- a/paddle/fluid/operators/pull_box_extended_sparse_op.h
+++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/box_wrapper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h
index abfdb62ec34ac..58e1172552135 100644
--- a/paddle/fluid/operators/pull_gpups_sparse_op.h
+++ b/paddle/fluid/operators/pull_gpups_sparse_op.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/pull_sparse_op.cc b/paddle/fluid/operators/pull_sparse_op.cc
index fb83746de19ec..57d361b7a77bb 100644
--- a/paddle/fluid/operators/pull_sparse_op.cc
+++ b/paddle/fluid/operators/pull_sparse_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/pull_sparse_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/pull_sparse_op.h b/paddle/fluid/operators/pull_sparse_op.h
index 2498adc141cd7..e3f0f88ce5552 100644
--- a/paddle/fluid/operators/pull_sparse_op.h
+++ b/paddle/fluid/operators/pull_sparse_op.h
@@ -16,6 +16,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/pull_sparse_v2_op.cc b/paddle/fluid/operators/pull_sparse_v2_op.cc
index f5f2e728e38c0..a8fc84b9c2b73 100644
--- a/paddle/fluid/operators/pull_sparse_v2_op.cc
+++ b/paddle/fluid/operators/pull_sparse_v2_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/pull_sparse_v2_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/pull_sparse_v2_op.h b/paddle/fluid/operators/pull_sparse_v2_op.h
index 29337cc2d94b4..c24d0a4f338e7 100644
--- a/paddle/fluid/operators/pull_sparse_v2_op.h
+++ b/paddle/fluid/operators/pull_sparse_v2_op.h
@@ -16,6 +16,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/operators/push_dense_op.cc b/paddle/fluid/operators/push_dense_op.cc
index 5b9f05bd126b8..5284a1a61e5ef 100644
--- a/paddle/fluid/operators/push_dense_op.cc
+++ b/paddle/fluid/operators/push_dense_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/push_dense_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/push_dense_op.h b/paddle/fluid/operators/push_dense_op.h
index 592ef5ff72a65..c8f98a1ea9e5d 100644
--- a/paddle/fluid/operators/push_dense_op.h
+++ b/paddle/fluid/operators/push_dense_op.h
@@ -16,6 +16,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc
index f676348bc0af2..de46357e497fd 100644
--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -20,6 +20,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc
index 14c9e8b0c260f..db8f315366a7b 100644
--- a/paddle/fluid/operators/py_layer_op.cc
+++ b/paddle/fluid/operators/py_layer_op.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <vector>
-
 #include "paddle/fluid/operators/py_layer_op.h"
 
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h
index 6625a4a1a753c..ea048ee9e5948 100644
--- a/paddle/fluid/operators/py_layer_op.h
+++ b/paddle/fluid/operators/py_layer_op.h
@@ -18,6 +18,7 @@
 #include <functional>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/python_headers.h"
 
diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc
index 4b0ade99154a1..6650037e4d2f4 100644
--- a/paddle/fluid/operators/pyramid_hash_op.cc
+++ b/paddle/fluid/operators/pyramid_hash_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <xxhash.h>
+
 #include <algorithm>
 #include <cmath>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/search_compute.h"
@@ -216,9 +218,8 @@ class CPUPyramidHashOPKernel : public framework::OpKernel<T> {
   bool should_use_term(math::bloomfilter* _filter,
                        math::bloomfilter* _black_filter, const float* word_repr,
                        int len) const {
-    return (!_filter ||
-            1 == math::bloomfilter_get(_filter, word_repr,
-                                       len * sizeof(float))) &&
+    return (!_filter || 1 == math::bloomfilter_get(_filter, word_repr,
+                                                   len * sizeof(float))) &&
            (!_black_filter ||
             0 == math::bloomfilter_get(_black_filter, word_repr,
                                        len * sizeof(float)));
diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc
index 02d5e5f03f02e..55cab539c4d4e 100644
--- a/paddle/fluid/operators/qr_op.cc
+++ b/paddle/fluid/operators/qr_op.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/qr_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/phi/core/ddim.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu
index a57a8d5cf8b7f..695b90e9319e4 100644
--- a/paddle/fluid/operators/qr_op.cu
+++ b/paddle/fluid/operators/qr_op.cu
@@ -16,8 +16,10 @@ limitations under the License. */
 // HIP not support cusolver
 
 #include <thrust/device_vector.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/qr_op.h"
 #include "paddle/fluid/platform/dynload/cusolver.h"
@@ -43,8 +45,9 @@ class QrGPUKernel : public framework::OpKernel<T> {
     std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
 
     auto numel = x.numel();
-    PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
-                                    "The input of QR is empty."));
+    PADDLE_ENFORCE_GT(
+        numel, 0,
+        platform::errors::PreconditionNotMet("The input of QR is empty."));
     auto x_dims = x.dims();
     int x_rank = x_dims.size();
     int m = x_dims[x_rank - 2];
diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h
index 5ef02d8942797..760b2efd21f6b 100644
--- a/paddle/fluid/operators/qr_op.h
+++ b/paddle/fluid/operators/qr_op.h
@@ -16,6 +16,7 @@
 
 #include <Eigen/Dense>
 #include <cstdarg>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/svd_helper.h"
@@ -89,11 +90,11 @@ class QrGradKernel : public framework::OpKernel<T> {
     }
 
     // m >= n case
-    auto m_gt_n_case = [](
-        const framework::ExecutionContext& ctx,
-        math::DeviceIndependenceTensorOperations<DeviceContext, T>& dito,
-        const Tensor& dQ, const Tensor& dR, const Tensor& A, const Tensor& Q,
-        const Tensor& R) -> framework::Tensor {
+    auto m_gt_n_case =
+        [](const framework::ExecutionContext& ctx,
+           math::DeviceIndependenceTensorOperations<DeviceContext, T>& dito,
+           const Tensor& dQ, const Tensor& dR, const Tensor& A, const Tensor& Q,
+           const Tensor& R) -> framework::Tensor {
       // Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable
       // Programming Tensor Networks.
       // https://arxiv.org/abs/1903.09650 Section 3. QR factorization
diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc
index 4039f0e9d07e1..edd2a06a50001 100644
--- a/paddle/fluid/operators/quantize_linear_op.cc
+++ b/paddle/fluid/operators/quantize_linear_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/quantize_linear_op.h"
+
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/transform.h"
diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu
index 6c7e430f51126..6e3e39562c719 100644
--- a/paddle/fluid/operators/quantize_linear_op.cu
+++ b/paddle/fluid/operators/quantize_linear_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/fake_dequantize_op.cu.h"
 #include "paddle/fluid/operators/fake_quantize_op.cu.h"
@@ -46,10 +47,10 @@ struct ChannelDequantizeFunctorV2<platform::CUDADeviceContext, T> {
       quant_stride *= in_dims[i];
     }
 
-    DequantizeOneScaleQuantAxisN<
-        T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-        in_data, scale_factor, max_range, num, in_dims[quant_axis],
-        quant_stride, out_data);
+    DequantizeOneScaleQuantAxisN<T>
+        <<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+            in_data, scale_factor, max_range, num, in_dims[quant_axis],
+            quant_stride, out_data);
   }
 };
 
diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h
index e20b99e85f0b3..df1a93ba638ae 100644
--- a/paddle/fluid/operators/quantize_linear_op.h
+++ b/paddle/fluid/operators/quantize_linear_op.h
@@ -13,6 +13,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/malloc.h"
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
index 951951253c47a..62ec77bc2240f 100644
--- a/paddle/fluid/operators/quantize_op.cc
+++ b/paddle/fluid/operators/quantize_op.cc
@@ -13,6 +13,7 @@
  *     limitations under the License. */
 
 #include "paddle/fluid/operators/quantize_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -57,13 +58,13 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker);
 
 REGISTER_OP_VERSION(quantize)
-    .AddCheckpoint(
-        R"ROC( Add a new attribute [bfloat16])ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "bfloat16", "If true, float32 input is converted to bfloat16",
-            false))
-    .AddCheckpoint(
-        R"ROC( Add a new attribute [Shift])ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "Shift", "Quantize data to uint8 if provided non-zero value.",
-            0.0f));
+    .AddCheckpoint(R"ROC( Add a new attribute [bfloat16])ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewAttr(
+                       "bfloat16",
+                       "If true, float32 input is converted to bfloat16",
+                       false))
+    .AddCheckpoint(R"ROC( Add a new attribute [Shift])ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewAttr(
+                       "Shift",
+                       "Quantize data to uint8 if provided non-zero value.",
+                       0.0f));
diff --git a/paddle/fluid/operators/quantize_op.h b/paddle/fluid/operators/quantize_op.h
index 091306e4637c7..dd1b3c42fb5f9 100644
--- a/paddle/fluid/operators/quantize_op.h
+++ b/paddle/fluid/operators/quantize_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/queue_generator_op.cc b/paddle/fluid/operators/queue_generator_op.cc
index e2174b9346e1e..3683fbd075db2 100644
--- a/paddle/fluid/operators/queue_generator_op.cc
+++ b/paddle/fluid/operators/queue_generator_op.cc
@@ -43,9 +43,10 @@ class QueueGeneratorOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
     std::vector<std::string> names = Attr<std::vector<std::string>>("names");
-    PADDLE_ENFORCE_GT(names.size(), 0, platform::errors::InvalidArgument(
-                                           "The attribute 'names' for "
-                                           "Op(queue_generator) must be set."));
+    PADDLE_ENFORCE_GT(
+        names.size(), 0,
+        platform::errors::InvalidArgument("The attribute 'names' for "
+                                          "Op(queue_generator) must be set."));
 
     int capacity = Attr<int>("capacity");
     PADDLE_ENFORCE_GT(capacity, 0,
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
index 2928c3b502781..cfda710bd7745 100644
--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu
index fec65518a9d48..471cfb40e6167 100644
--- a/paddle/fluid/operators/random_routing_op.cu
+++ b/paddle/fluid/operators/random_routing_op.cu
@@ -71,9 +71,9 @@ class RandomRoutingOpCUDAKernel : public framework::OpKernel<T> {
     auto topk_idx_data = topk_idx->data<int64_t>();
     auto out_data = out->data<int64_t>();
 
-    random_routing_kernel<
-        T><<<GET_BLOCKS(num_idx), CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
-        out_data, num_idx, N, D, prob_data, topk_idx_data, topk_value_data);
+    random_routing_kernel<T>
+        <<<GET_BLOCKS(num_idx), CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
+            out_data, num_idx, N, D, prob_data, topk_idx_data, topk_value_data);
   }
 };
 
diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc
index 1b28ab3c133f7..aed1f2b0ed102 100644
--- a/paddle/fluid/operators/randperm_op.cc
+++ b/paddle/fluid/operators/randperm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
@@ -29,10 +30,11 @@ class RandpermOp : public framework::OperatorWithKernel {
                           "The output(Out) of randperm op must not be null."));
     int n = ctx->Attrs().Get<int>("n");
     PADDLE_ENFORCE_GT(
-        n, 0, platform::errors::InvalidArgument(
-                  "The input 'n' of randperm op should be greater than 0. "
-                  "But received %d.",
-                  n));
+        n, 0,
+        platform::errors::InvalidArgument(
+            "The input 'n' of randperm op should be greater than 0. "
+            "But received %d.",
+            n));
 
     ctx->SetOutputDim("Out", phi::make_ddim({n}));
   }
diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc
index a16c0d905a555..c9f6121101601 100644
--- a/paddle/fluid/operators/randperm_op_npu.cc
+++ b/paddle/fluid/operators/randperm_op_npu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/randperm_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/randperm_op.h"
 
 template <typename T>
 using kernel =
diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc
index 80fdb2ce6c345..215f83698186c 100644
--- a/paddle/fluid/operators/range_op.cc
+++ b/paddle/fluid/operators/range_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/range_op.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h
index 8924b23ce5cf8..e2fd16dd629ad 100644
--- a/paddle/fluid/operators/range_op.h
+++ b/paddle/fluid/operators/range_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <functional>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -22,13 +23,15 @@ namespace operators {
 
 template <typename T>
 void GetSize(T start, T end, T step, int64_t* size) {
-  PADDLE_ENFORCE_NE(step, 0, platform::errors::InvalidArgument(
-                                 "The step of range op should not be 0."));
+  PADDLE_ENFORCE_NE(step, 0,
+                    platform::errors::InvalidArgument(
+                        "The step of range op should not be 0."));
 
   if (start < end) {
     PADDLE_ENFORCE_GT(
-        step, 0, platform::errors::InvalidArgument(
-                     "The step should be greater than 0 while start < end."));
+        step, 0,
+        platform::errors::InvalidArgument(
+            "The step should be greater than 0 while start < end."));
   }
 
   if (start > end) {
diff --git a/paddle/fluid/operators/range_op_xpu.cc b/paddle/fluid/operators/range_op_xpu.cc
index 6672968de3a02..bfc0d27f7ca26 100644
--- a/paddle/fluid/operators/range_op_xpu.cc
+++ b/paddle/fluid/operators/range_op_xpu.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/range_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/range_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc
index e5332da6475d7..89bdeb57b5fdf 100644
--- a/paddle/fluid/operators/rank_attention_op.cc
+++ b/paddle/fluid/operators/rank_attention_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/rank_attention_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu
index 9b3a1e5637115..61d723c27f7e5 100644
--- a/paddle/fluid/operators/rank_attention_op.cu
+++ b/paddle/fluid/operators/rank_attention_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/rank_attention.cu.h"
 #include "paddle/fluid/operators/rank_attention_op.h"
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 7e06b45943cdc..9dd59de98d553 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -1,26 +1,36 @@
 include(operators)
 
-cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
+cc_library(
+  reader_op_registry
+  SRCS reader_op_registry.cc
+  DEPS operator op_registry reader)
 set(LOCAL_READER_LIBS)
 
 function(reader_library TARGET_NAME)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    set(options "")
-    set(common_deps reader_op_registry)
-    cmake_parse_arguments(reader_library "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN})
-    op_library(${TARGET_NAME} SRCS ${reader_library_SRCS} DEPS ${common_deps} ${reader_library_DEPS})
-    set(LOCAL_READER_LIBS
-            ${TARGET_NAME}
-            ${LOCAL_READER_LIBS}
-        PARENT_SCOPE)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  set(common_deps reader_op_registry)
+  cmake_parse_arguments(reader_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  op_library(${TARGET_NAME} SRCS ${reader_library_SRCS} DEPS ${common_deps}
+             ${reader_library_DEPS})
+  set(LOCAL_READER_LIBS
+      ${TARGET_NAME} ${LOCAL_READER_LIBS}
+      PARENT_SCOPE)
 endfunction()
 
-cc_library(py_reader SRCS py_reader.cc DEPS reader)
-cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
+cc_library(
+  py_reader
+  SRCS py_reader.cc
+  DEPS reader)
+cc_library(
+  buffered_reader
+  SRCS buffered_reader.cc
+  DEPS reader simple_threadpool)
 
-reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
+reader_library(create_double_buffer_reader_op SRCS
+               create_double_buffer_reader_op.cc DEPS buffered_reader)
 reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
 
 op_library(read_op DEPS py_reader buffered_reader)
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index f126070a7eb96..38c45ca2803ff 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -161,9 +161,10 @@ class BlockingQueue {
 
  private:
   inline void EnforceNotKilled() {
-    PADDLE_ENFORCE_NE(killed_, true, platform::errors::Fatal(
-                                         "Blocking queue is killed because the "
-                                         "data reader raises an exception."));
+    PADDLE_ENFORCE_NE(
+        killed_, true,
+        platform::errors::Fatal("Blocking queue is killed because the "
+                                "data reader raises an exception."));
   }
 
  private:
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index db0f5758d2f53..193f6c29724b7 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reader/buffered_reader.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/profiler.h"
diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
index 86fbddc0ec2cf..b83d085284175 100644
--- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reader/ctr_reader.h"
-
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 8557ef950b3e9..8b2809b286cfe 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -322,9 +322,10 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
   framework::Executor executor(place);
   auto *block = Attr<framework::BlockDesc *>(kStepBlock);
   auto *program = block->Program();
-  auto ctx = executor.Prepare(
-      *program, block->ID(), Attr<std::vector<std::string>>(
-                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);
+  auto ctx =
+      executor.Prepare(*program, block->ID(),
+                       Attr<std::vector<std::string>>(
+                           kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);
 
   for (size_t step_id = 0; step_id < seq_len; ++step_id) {
     size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
@@ -387,19 +388,19 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
     //   outside::output[seq_offset: seq_offset + 1] = inside::output
     executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
     if (step_id > 0) {
-      LinkTensorWithCallback(scope, Outputs(kInputGrads), cur_scope,
-                             GradVarLists(Inputs(kInputs)),
-                             [&](const framework::LoDTensor &src_tensor,
-                                 framework::LoDTensor *dst_tensor) {
-                               if (src_tensor.memory_size() ==
-                                   0) {  // Inside Gradient is not created.
-                                 return;
-                               }
-                               framework::Tensor src_slice =
-                                   src_tensor.Slice(seq_offset, seq_offset + 1);
-                               dst_tensor->ShareDataWith(src_slice);
-                             },
-                             true /*is_backward*/);
+      LinkTensorWithCallback(
+          scope, Outputs(kInputGrads), cur_scope, GradVarLists(Inputs(kInputs)),
+          [&](const framework::LoDTensor &src_tensor,
+              framework::LoDTensor *dst_tensor) {
+            if (src_tensor.memory_size() ==
+                0) {  // Inside Gradient is not created.
+              return;
+            }
+            framework::Tensor src_slice =
+                src_tensor.Slice(seq_offset, seq_offset + 1);
+            dst_tensor->ShareDataWith(src_slice);
+          },
+          true /*is_backward*/);
     }
 
     VLOG(5) << "Recurrent memory linking finished ";
@@ -604,7 +605,8 @@ if reverse is True
       |          |          |         |
       v          v          v         v
       o          o          o         o
-)DOC").SetDefault(false);
+)DOC")
+        .SetDefault(false);
     AddAttr<bool>(RecurrentBase::kIsTrain, "").SetDefault(true);
     AddAttr<std::vector<std::string>>(RecurrentBase::kSkipEagerDeletionVars,
                                       "Vars that would skip eager deletion."
@@ -663,14 +665,16 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
           ctx->Attrs()
               .Get<std::vector<std::string>>(RecurrentBase::kExStates)
               .size(),
-          0, platform::errors::InvalidArgument("The Attr(%s) should be empty.",
-                                               RecurrentBase::kExStates));
+          0,
+          platform::errors::InvalidArgument("The Attr(%s) should be empty.",
+                                            RecurrentBase::kExStates));
       PADDLE_ENFORCE_EQ(
           ctx->Attrs()
               .Get<std::vector<std::string>>(RecurrentBase::kStates)
               .size(),
-          0, platform::errors::InvalidArgument("The Attr(%s) should be empty.",
-                                               RecurrentBase::kStates));
+          0,
+          platform::errors::InvalidArgument("The Attr(%s) should be empty.",
+                                            RecurrentBase::kStates));
     }
 
     PADDLE_ENFORCE_EQ(
@@ -702,9 +706,10 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
     if (ctx->HasInputs(RecurrentBase::kParameters)) {
       PADDLE_ENFORCE_EQ(
           ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)),
-          true, platform::errors::InvalidArgument(
-                    "The output of(%s) should not be empty.",
-                    framework::GradVarName(RecurrentBase::kParameters)));
+          true,
+          platform::errors::InvalidArgument(
+              "The output of(%s) should not be empty.",
+              framework::GradVarName(RecurrentBase::kParameters)));
       ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters),
                          ctx->GetInputsDim(RecurrentBase::kParameters));
     }
diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
index 9a2abfd93d066..7c2f91999e964 100644
--- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
@@ -1,30 +1,42 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/reduce_ops.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/reduce_ops.
+  include(unity_build_rule.cmake)
 endif()
 if(WITH_GPU)
-    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-        register_operators(DEPS cub)
-    else()
-        register_operators()
-    endif()
-else()
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+    register_operators(DEPS cub)
+  else()
     register_operators()
+  endif()
+else()
+  register_operators()
 endif()
 
 if(WITH_GPU)
-    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-	nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
-    else()
-	nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
-    endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+    nv_test(
+      check_reduce_rank_test
+      SRCS check_reduce_rank_test.cu
+      DEPS tensor cub)
+  else()
+    nv_test(
+      check_reduce_rank_test
+      SRCS check_reduce_rank_test.cu
+      DEPS tensor)
+  endif()
 endif()
 
 if(WITH_ROCM)
-    hip_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
+  hip_test(
+    check_reduce_rank_test
+    SRCS check_reduce_rank_test.cu
+    DEPS tensor)
 endif()
 
 if(WITH_ASCEND_CL)
-    cc_test(reduce_any_op_npu_test SRCS reduce_any_op_npu_test.cc DEPS op_registry reduce_any_op scope device_context enforce executor)
+  cc_test(
+    reduce_any_op_npu_test
+    SRCS reduce_any_op_npu_test.cc
+    DEPS op_registry reduce_any_op scope device_context enforce executor)
 endif()
diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
index 83a21a919dcaa..063f7ca041a86 100644
--- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
+++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
index 0602c73db6bbc..4128d51559c30 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc
index c5bc66e23ce8a..29587faa48005 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc
@@ -16,16 +16,18 @@
 
 REGISTER_REDUCE_OP(reduce_amax);
 REGISTER_OP_CPU_KERNEL(
-    reduce_amax, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                   ops::MaxFunctor>,
+    reduce_amax,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                      ops::MaxFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
                       ops::MaxFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::MaxFunctor>);
 REGISTER_OP_CPU_KERNEL(
-    reduce_amax_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
+    reduce_amax_grad,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,
+                          ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
                           ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
index 27f2e2b70c681..18c846bc2b469 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu
@@ -15,8 +15,9 @@
 #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
-    reduce_amax_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
+    reduce_amax_grad,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,
+                          ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                           ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc
index 027bf8ea00a9b..8069e526f1adc 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc
@@ -16,16 +16,18 @@
 
 REGISTER_REDUCE_OP(reduce_amin);
 REGISTER_OP_CPU_KERNEL(
-    reduce_amin, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                   ops::MinFunctor>,
+    reduce_amin,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                      ops::MinFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
                       ops::MinFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::MinFunctor>);
 REGISTER_OP_CPU_KERNEL(
-    reduce_amin_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
+    reduce_amin_grad,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,
+                          ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
                           ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
index a296c4c5d6fa1..c7a26049634ce 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu
@@ -15,8 +15,9 @@
 #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
-    reduce_amin_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
+    reduce_amin_grad,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,
+                          ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                           ops::AMaxOrAMinGradFunctor>,
     ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
index 15812778e0023..4cc3239ea68fb 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
index 1c1269a08dbdc..eb62c84fd5aef 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
@@ -15,6 +15,7 @@
 #ifdef PADDLE_WITH_XPU
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index dc41979defb93..8ce115ce66921 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
+
 #include <memory>
 #include <string>
 #include <utility>
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
index 715dcb25c209f..111537f64558c 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
@@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_npu.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
index 5e5b04d57b002..f6d8aa1318234 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
index b21e41c5b8548..a2048004615b7 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -21,7 +21,6 @@
 #include <vector>
 
 #include "paddle/fluid/framework/tensor.h"
-
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 namespace paddle {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 76641698ead67..322ef1fdff67a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <set>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/cast_op.h"
@@ -484,8 +485,9 @@ class ReduceOp : public framework::OperatorWithKernel {
           platform::is_gpu_place(ctx.GetPlace()) ||
               platform::is_npu_place(ctx.GetPlace()) ||
               platform::is_mlu_place(ctx.GetPlace()),
-          true, platform::errors::InvalidArgument(
-                    "float16 can only be used on GPU or NPU or MLU place"));
+          true,
+          platform::errors::InvalidArgument(
+              "float16 can only be used on GPU or NPU or MLU place"));
     }
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h
index c144e65cbf647..a9d5863558cf7 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h
index 95dda354cae7d..96e496217d04f 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h
@@ -17,6 +17,7 @@
 #ifdef PADDLE_WITH_MLU
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
index 324fd369e82b5..f9ae575e801b9 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
@@ -20,6 +20,7 @@
 #include <set>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
index 7a5c86c35c6a2..f50cfd0417aaf 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
@@ -15,6 +15,7 @@
 #ifdef PADDLE_WITH_XPU
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake
index c4f32a8d25764..f5c1af004f34f 100644
--- a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake
+++ b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake
@@ -4,18 +4,16 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    reduce_all_op.cc
-    reduce_any_op.cc
-    reduce_prod_op.cc
-    reduce_sum_op.cc)
-register_unity_group(cu
-    reduce_all_op.cu
-    reduce_any_op.cu
-    reduce_prod_op.cu
-    reduce_prod_op.part.cu
-    reduce_sum_op.cu
-    reduce_sum_op.part.cu)
+register_unity_group(cc reduce_all_op.cc reduce_any_op.cc reduce_prod_op.cc
+                     reduce_sum_op.cc)
+register_unity_group(
+  cu
+  reduce_all_op.cu
+  reduce_any_op.cu
+  reduce_prod_op.cu
+  reduce_prod_op.part.cu
+  reduce_sum_op.cu
+  reduce_sum_op.part.cu)
 # The following groups are to make better use of `/MP` which MSVC's parallel
 # compilation instruction when compiling in Unity Build.
 register_unity_group(cu frobenius_norm_op.cu)
diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu
index e40bd147b9925..028f5a7f51567 100644
--- a/paddle/fluid/operators/renorm_op.cu
+++ b/paddle/fluid/operators/renorm_op.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/renorm_op.h"
-
 #include <algorithm>
 #include <cstdio>
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
+#include "paddle/fluid/operators/renorm_op.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 
@@ -107,10 +106,10 @@ __global__ void RenormGradKernelFunc2(const T* x_data, const T* dout_data,
   __syncthreads();
   if (i < size) {
     dx_data[i] = dim_value[dim_index] * dout_data[i];
-    dx_data[i] = dx_data[i] +
-                 weight_derivative[dim_index] * dim_power_sum[dim_index] *
-                     pow(abs(x_data[i]), T(p - 1.0)) *
-                     (x_data[i] >= 0 ? 1 : -1);
+    dx_data[i] = dx_data[i] + weight_derivative[dim_index] *
+                                  dim_power_sum[dim_index] *
+                                  pow(abs(x_data[i]), T(p - 1.0)) *
+                                  (x_data[i] >= 0 ? 1 : -1);
   }
 }
 
diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc
index d6f9df5d79e60..daa45bf78f27d 100644
--- a/paddle/fluid/operators/repeat_interleave_op.cc
+++ b/paddle/fluid/operators/repeat_interleave_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/repeat_interleave_op.h"
+
 #include <memory>
 
 namespace paddle {
@@ -51,11 +52,12 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           repeats_dim.size() == 1 ||
               (repeats_dim.size() == 2 && repeats_dim[1] == 1),
-          true, platform::errors::InvalidArgument(
-                    "The 'shape' of Input(RepeatsTensor) must be 1-D tensor. "
-                    "But received: the 'shape' of Input(Index) is [%s], "
-                    "the dimension of Input(Index) is [%d].",
-                    repeats_dim, repeats_dim.size()));
+          true,
+          platform::errors::InvalidArgument(
+              "The 'shape' of Input(RepeatsTensor) must be 1-D tensor. "
+              "But received: the 'shape' of Input(Index) is [%s], "
+              "the dimension of Input(Index) is [%d].",
+              repeats_dim, repeats_dim.size()));
 
       PADDLE_ENFORCE_EQ(repeats_dim[0] != 0, true,
                         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/repeat_interleave_op.cu b/paddle/fluid/operators/repeat_interleave_op.cu
index 5f48a4a94ac99..2b8464d5bf6bf 100644
--- a/paddle/fluid/operators/repeat_interleave_op.cu
+++ b/paddle/fluid/operators/repeat_interleave_op.cu
@@ -127,10 +127,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel<T> {
         int64_t size = output_dim[dim];
         int64_t delta = input_dim[dim] - size;
 
-        index_select_cuda_kernel<T, int64_t><<<
-            (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-            PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data,
-                                                  numel, stride, size, delta);
+        index_select_cuda_kernel<T, int64_t>
+            <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+               PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+                in_data, out_data, index_data, numel, stride, size, delta);
       } else {
         RepeatsTensor2IndexTensor<DeviceContext, int>(*repeats_tensor, &index);
 
@@ -143,10 +143,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel<T> {
         int64_t size = output_dim[dim];
         int64_t delta = input_dim[dim] - size;
 
-        index_select_cuda_kernel<T, int><<<
-            (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-            PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data,
-                                                  numel, stride, size, delta);
+        index_select_cuda_kernel<T, int>
+            <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+               PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+                in_data, out_data, index_data, numel, stride, size, delta);
       }
     } else if (repeats > 0) {
       int64_t index_size = in->dims()[dim] * repeats;
@@ -169,10 +169,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel<T> {
       int64_t delta = input_dim[dim] - size;
 
       const int* index_data = index.data<int>();
-      index_select_cuda_kernel<T, int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                                             PADDLE_CUDA_NUM_THREADS,
-                                         PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-          in_data, out_data, index_data, numel, stride, size, delta);
+      index_select_cuda_kernel<T, int>
+          <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+              in_data, out_data, index_data, numel, stride, size, delta);
       platform::GpuStreamSync(stream);
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
@@ -206,9 +206,9 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel<T> {
     auto stream =
         context.template device_context<platform::CUDADeviceContext>().stream();
 
-    index_select_grad_init<
-        T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
+    index_select_grad_init<T>
+        <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+           PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
 
     int repeats = context.Attr<int>("Repeats");
     framework::LoDTensor index;
@@ -237,22 +237,24 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel<T> {
         int64_t index_nums = index.numel();
 
         const int64_t* index_data = index.data<int64_t>();
-        index_select_grad_cuda_kernel<T, int64_t><<<
-            (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-            PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-            output_grad_data, in_grad_data, index_data, index_nums, out_nums,
-            stride, size, delta);
+        index_select_grad_cuda_kernel<T, int64_t>
+            <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) /
+                   PADDLE_CUDA_NUM_THREADS,
+               PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+                output_grad_data, in_grad_data, index_data, index_nums,
+                out_nums, stride, size, delta);
         platform::GpuStreamSync(stream);
       } else {
         RepeatsTensor2IndexTensor<DeviceContext, int>(*repeats_tensor, &index);
         int64_t index_nums = index.numel();
 
         const int* index_data = index.data<int>();
-        index_select_grad_cuda_kernel<T, int><<<
-            (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-            PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-            output_grad_data, in_grad_data, index_data, index_nums, out_nums,
-            stride, size, delta);
+        index_select_grad_cuda_kernel<T, int>
+            <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) /
+                   PADDLE_CUDA_NUM_THREADS,
+               PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+                output_grad_data, in_grad_data, index_data, index_nums,
+                out_nums, stride, size, delta);
         platform::GpuStreamSync(stream);
       }
     } else if (repeats > 0) {
@@ -268,11 +270,11 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel<T> {
 
       const int* index_data = index.data<int>();
       int64_t index_nums = index.numel();
-      index_select_grad_cuda_kernel<T, int><<<
-          (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
-                                                index_data, index_nums,
-                                                out_nums, stride, size, delta);
+      index_select_grad_cuda_kernel<T, int>
+          <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+              output_grad_data, in_grad_data, index_data, index_nums, out_nums,
+              stride, size, delta);
       platform::GpuStreamSync(stream);
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/repeat_interleave_op.h b/paddle/fluid/operators/repeat_interleave_op.h
index 68b66bd534ca8..f8e39fdc90762 100644
--- a/paddle/fluid/operators/repeat_interleave_op.h
+++ b/paddle/fluid/operators/repeat_interleave_op.h
@@ -14,11 +14,11 @@
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/index_select_op.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-
-#include "paddle/fluid/operators/index_select_op.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc
index 2d87ae91fbe60..d9345c1145ba6 100644
--- a/paddle/fluid/operators/requantize_op.cc
+++ b/paddle/fluid/operators/requantize_op.cc
@@ -13,6 +13,7 @@
  *     limitations under the License. */
 
 #include "paddle/fluid/operators/requantize_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
diff --git a/paddle/fluid/operators/requantize_op.h b/paddle/fluid/operators/requantize_op.h
index c2b154db11dc7..8166aa98f076f 100644
--- a/paddle/fluid/operators/requantize_op.h
+++ b/paddle/fluid/operators/requantize_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/rnn_op.cc b/paddle/fluid/operators/rnn_op.cc
index caf90219935de..d3c6ee7c1e1a8 100644
--- a/paddle/fluid/operators/rnn_op.cc
+++ b/paddle/fluid/operators/rnn_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index bf78b6a696559..db84387e6cfa7 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc
index 7be1c19012099..18938d7183200 100644
--- a/paddle/fluid/operators/roi_align_op_xpu.cc
+++ b/paddle/fluid/operators/roi_align_op_xpu.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 12e33d56c0020..e47145535a389 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 07a6117d71119..9c66566fdfd89 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -12,9 +12,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/row_conv_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu
index c5794948aaec6..b1cabb018b9e0 100644
--- a/paddle/fluid/operators/row_conv_op.cu
+++ b/paddle/fluid/operators/row_conv_op.cu
@@ -344,9 +344,9 @@ class RowConvKernel<platform::CUDADeviceContext, T>
       dim3 block_dim = dim3(32, 32);
       dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
       int mem_per_block = (future_context * block_dim.x) * sizeof(T);
-      RowConvForwardSharedMemory<
-          T><<<grid_dim, block_dim, mem_per_block, stream>>>(
-          in, weight, num_sequence, input_dim, future_context, idx, out);
+      RowConvForwardSharedMemory<T>
+          <<<grid_dim, block_dim, mem_per_block, stream>>>(
+              in, weight, num_sequence, input_dim, future_context, idx, out);
     } else {
       dim3 block_dim = dim3(32, 32);
       dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
@@ -413,10 +413,10 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
             (block_y * block_x + block_y * (block_x + future_context - 1) +
              future_context * block_y) *
             sizeof(T);
-        RowConvGradFilterImproved<
-            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
-            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
-            idx, dfilter);
+        RowConvGradFilterImproved<T>
+            <<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+                in, dout, num_sequence, input_dim, future_context, block_x,
+                block_y, idx, dfilter);
       } else {
         dim3 block_dim = dim3(32, 32);
         dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
@@ -424,10 +424,10 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
         int block_y = block_dim.y;
         int mem_per_block =
             (block_x * block_y * 2) * sizeof(T);  // For 2 arrays of size 32x32
-        RowConvGradFilter<
-            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
-            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
-            idx, dfilter);
+        RowConvGradFilter<T>
+            <<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+                in, dout, num_sequence, input_dim, future_context, block_x,
+                block_y, idx, dfilter);
       }
     }
 
@@ -437,9 +437,10 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
         dim3 block_dim = dim3(32, 32);
         dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
         int mem_per_block = (future_context * block_dim.x) * sizeof(T);
-        RowConvGradInputSharedMemory<
-            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
-            dout, weights, num_sequence, input_dim, future_context, idx, din);
+        RowConvGradInputSharedMemory<T>
+            <<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+                dout, weights, num_sequence, input_dim, future_context, idx,
+                din);
       } else {
         dim3 block_dim = dim3(32, 32);
         dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc
index c543a088e9d7f..558c77b5b9220 100644
--- a/paddle/fluid/operators/rrelu_op.cc
+++ b/paddle/fluid/operators/rrelu_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/infermeta/unary.h"
diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc
index ec62feb07bc80..38c92de4523d5 100644
--- a/paddle/fluid/operators/run_program_op.cc
+++ b/paddle/fluid/operators/run_program_op.cc
@@ -90,6 +90,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
               "computes double grad.")
         .AsDuplicable()
         .AsDispensable();
+    AddOutput("CUDAGraph", "The output CUDA Graph when use_cuda_graph=True.")
+        .AsDispensable();
     AddAttr<BlockDesc*>("global_block",
                         "(BlockDesc *)"
                         "The global block of executed program desc.");
@@ -107,6 +109,13 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
         "program_id",
         "(int64_t)"
         "The unique hash id used as cache key for ExecutorInfoCache.");
+    AddAttr<std::string>("cuda_graph_capture_mode",
+                         "(str, default '') The CUDA Graph capture mode. "
+                         "Default '' means no CUDA Graph capturing.")
+        .SetDefault("");
+    AddAttr<int64_t>("cuda_graph_pool_id",
+                     "(int64_t, default 0) The CUDA Graph memory pool ID.")
+        .SetDefault(0);
     AddComment(R"DOC(
 RunProgram operator.
 
@@ -191,6 +200,9 @@ class RunProgramGradOpMaker : public framework::SingleGradOpMaker<T> {
     grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     grad_op->SetInput("OutScope", this->Output("OutScope"));
     grad_op->SetInput("DOut", this->Output("DOut"));
+    if (this->HasOutput("CUDAGraph")) {
+      grad_op->SetInput("CUDAGraph", this->Output("CUDAGraph"));
+    }
     grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
 
     auto block_desc =
diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h
index fbc52480c8266..bfd33efe833d2 100644
--- a/paddle/fluid/operators/run_program_op.h
+++ b/paddle/fluid/operators/run_program_op.h
@@ -34,6 +34,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/operators/cuda_graph_with_in_out.h"
+#endif
 
 DECLARE_bool(use_mkldnn);
 
@@ -96,11 +99,12 @@ static void CheckOutputVarStatus(const Variable &src_var,
             var_name,
             platform::demangle(framework::ToTypeName(src_var.Type()))));
     PADDLE_ENFORCE_EQ(src_var.Get<phi::SelectedRows>().value().IsInitialized(),
-                      true, platform::errors::InvalidArgument(
-                                "The tensor in output variable %s get from "
-                                "RunProgram(Grad)Op's "
-                                "internal scope is not initialized.",
-                                var_name));
+                      true,
+                      platform::errors::InvalidArgument(
+                          "The tensor in output variable %s get from "
+                          "RunProgram(Grad)Op's "
+                          "internal scope is not initialized.",
+                          var_name));
 
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
@@ -167,13 +171,84 @@ static void ShareVarsFromScope(const std::vector<Variable *> &vars,
   }
 }
 
+#ifdef PADDLE_WITH_CUDA
+// Converts the textual capture mode into the CUDA runtime enumerator:
+//   "global" / "thread_local" / "relaxed" select the matching
+//   cudaStreamCaptureMode value; any other string raises InvalidArgument.
+static cudaStreamCaptureMode StringToCUDAGraphCaptureMode(
+    const std::string &mode) {
+  if (mode == "global") return cudaStreamCaptureModeGlobal;
+  if (mode == "thread_local") return cudaStreamCaptureModeThreadLocal;
+  if (mode == "relaxed") return cudaStreamCaptureModeRelaxed;
+
+  // No supported spelling matched.
+  PADDLE_THROW(phi::errors::InvalidArgument(
+      "Unsupported CUDA Graph capture mode %s", mode));
+}
+#endif
+
 }  // namespace details
 
 template <typename DeviceContext, typename T>
 class RunProgramOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto &capture_mode = ctx.Attr<std::string>("cuda_graph_capture_mode");
+    auto is_test = ctx.Attr<bool>("is_test");
+    if (capture_mode.empty()) {  // no CUDA Graph requested: run normally
+      ComputeImpl(ctx, is_test, false);
+      return;
+    }
+    // Otherwise capture a CUDA Graph on first use and execute it afterwards.
+#ifdef PADDLE_WITH_CUDA
+    auto mode = details::StringToCUDAGraphCaptureMode(capture_mode);
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        phi::errors::InvalidArgument("The cuda_graph_capture_mode is only "
+                                     "valid when using NVIDIA GPU."));
+    auto *graph_var = ctx.OutputVar("CUDAGraph");
+    PADDLE_ENFORCE_NOT_NULL(
+        graph_var,
+        phi::errors::InvalidArgument("Output(CUDAGraph) must exist when "
+                                     "cuda_graph_capture_mode is valid."));
+    using GraphVecType = std::vector<std::unique_ptr<CUDAGraphWithInOuts>>;
+    auto &inner_graphs = *(graph_var->GetMutable<GraphVecType>());
+    inner_graphs.resize(std::max<size_t>(3, inner_graphs.size()));
+    size_t graph_idx = is_test ? 0 : 1;  // slot 0 if is_test, slot 1 otherwise
+    if (inner_graphs[graph_idx].get() == nullptr) {
+      int64_t pool_id;  // reuse the sibling slot's pool when it exists
+      if (inner_graphs[1 - graph_idx].get() != nullptr) {
+        pool_id = inner_graphs[1 - graph_idx]->PoolID();
+      } else {
+        pool_id = ctx.Attr<int64_t>("cuda_graph_pool_id");
+      }
+      // ComputeImpl is invoked through the callable given to CaptureCUDAGraph.
+      framework::PEAndGraphPair pe_and_graph;
+      auto callable = [this, is_test, &pe_and_graph](
+                          const framework::ExecutionContext &exe_ctx) {
+        pe_and_graph = ComputeImpl(exe_ctx, is_test, true);
+      };
+      inner_graphs[graph_idx] = CaptureCUDAGraph(
+          callable, ctx, {"X"}, {"Out", "DOut"}, mode, pool_id);
+      VLOG(10) << "Capture Forward CUDA Graph";
+    } else {
+      VLOG(10) << "Run Forward CUDA Graph directly";
+      ExecuteCUDAGraph(ctx, {"X"}, {"Out", "DOut"},
+                       inner_graphs[graph_idx].get());
+    }
+#else
+    PADDLE_THROW(
+        phi::errors::InvalidArgument("The cuda_graph_capture_mode is only "
+                                     "valid when using NVIDIA GPU."));
+#endif  // PADDLE_WITH_CUDA
+  }
+
+ private:
+  framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx,
+                                        bool is_test,
+                                        bool use_cuda_graph) const {
     VLOG(2) << "RunProgramOpKernel Compute";
+    framework::PEAndGraphPair pe_and_graph;
     // Step 1. prepare inputs, outputs, attrs
     auto &input_vars = ctx.MultiInputVar("X");
     auto &param_vars = ctx.MultiInputVar("Params");
@@ -192,7 +267,6 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
 
     auto start_op_index = ctx.Attr<int64_t>("start_op_index");
     auto end_op_index = ctx.Attr<int64_t>("end_op_index");
-    auto is_test = ctx.Attr<bool>("is_test");
     auto program_id = ctx.Attr<int64_t>("program_id");
 
     // NOTE(chenweihang): In order not to add new variable type, use vector
@@ -223,15 +297,29 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
 
     if (end_op_index > start_op_index) {
       auto *program = global_block->Program();
-      auto cache_info = framework::GetExecutorInfoFromCache(
-          *program, ctx.GetPlace(), start_op_index, end_op_index,
-          /*is_grad=*/false, program_id, &scope);
-      auto &parallel_executor = cache_info.first;
+      bool is_new_created;
+      if (use_cuda_graph) {
+        pe_and_graph = framework::CreateFixOrderExecutorInfo(
+            *program, ctx.GetPlace(), start_op_index, end_op_index, &scope);
+        is_new_created = true;
+      } else {
+        auto cache_info = framework::GetExecutorInfoFromCache(
+            *program, ctx.GetPlace(), start_op_index, end_op_index,
+            /*is_grad=*/false, program_id, &scope);
+        pe_and_graph.first = cache_info.first;
+        is_new_created = cache_info.second;
+      }
+
+      auto &parallel_executor = pe_and_graph.first;
+
       // all out_vars are skip_eager_var
+      std::vector<std::string> tmp_vars;
       auto &skip_eager_delete_vars =
-          framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
-              program_id, false);
-      if (cache_info.second /*is_new_created*/) {
+          use_cuda_graph
+              ? tmp_vars
+              : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
+                    program_id, false);
+      if (is_new_created) {
         parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_var_names);
         skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
                                       output_var_names.begin(),
@@ -263,6 +351,7 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_MKLDNN
     if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace());
 #endif
+    return pe_and_graph;
   }
 };
 
@@ -270,14 +359,68 @@ template <typename DeviceContext, typename T>
 class RunProgramGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    // Without a capture mode run the backward program directly; otherwise
+    // capture it into a CUDA Graph on first use and execute that afterwards.
+    const auto &capture_mode = ctx.Attr<std::string>("cuda_graph_capture_mode");
+    if (capture_mode.empty()) {
+      ComputeImpl(ctx, false);
+      return;
+    }
+#ifdef PADDLE_WITH_CUDA
+    auto mode = details::StringToCUDAGraphCaptureMode(capture_mode);
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        phi::errors::InvalidArgument("The cuda_graph_capture_mode is only "
+                                     "valid when using NVIDIA GPU."));
+    auto *graph_var =
+        const_cast<framework::Variable *>(ctx.InputVar("CUDAGraph"));
+    PADDLE_ENFORCE_NOT_NULL(
+        graph_var,
+        phi::errors::InvalidArgument("Input(CUDAGraph) must exist when "
+                                     "cuda_graph_capture_mode is valid."));
+    using GraphVecType = std::vector<std::unique_ptr<CUDAGraphWithInOuts>>;
+    auto &inner_graphs = *(graph_var->GetMutable<GraphVecType>());
+    const size_t graph_idx = 2;  // slots 0/1 belong to the forward kernel
+    if (inner_graphs[graph_idx].get() == nullptr) {
+      framework::PEAndGraphPair pe_and_graph;
+      auto callable =
+          [this, &pe_and_graph](const framework::ExecutionContext &exe_ctx) {
+            pe_and_graph = ComputeImpl(exe_ctx, true);
+          };
+      int64_t pool_id = inner_graphs[0].get() != nullptr
+                            ? inner_graphs[0]->PoolID()
+                            : inner_graphs[1]->PoolID();  // assumed non-null
+      inner_graphs[graph_idx] =
+          CaptureCUDAGraph(callable, ctx, {framework::GradVarName("Out")},
+                           {framework::GradVarName("X")}, mode, pool_id);
+      VLOG(10) << "Capture Backward CUDA Graph";
+    } else {
+      ExecuteCUDAGraph(ctx, {framework::GradVarName("Out")},
+                       {framework::GradVarName("X")},
+                       inner_graphs[graph_idx].get());
+      VLOG(10) << "Run Backward CUDA Graph directly";
+    }
+#else
+    PADDLE_THROW(
+        phi::errors::InvalidArgument("The cuda_graph_capture_mode is only "
+                                     "valid when using NVIDIA GPU."));
+#endif
+  }
+
+ private:
+  framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx,
+                                        bool use_cuda_graph) const {
     VLOG(2) << "RunProgramGradOpKernel Compute";
+    framework::PEAndGraphPair pe_and_graph;
     // Step 1. prepare inputs and outputs
     auto &output_grad_vars = ctx.MultiInputVar(framework::GradVarName("Out"));
     auto input_grad_vars = ctx.MultiOutputVar(framework::GradVarName("X"));
     auto param_grad_vars = ctx.MultiOutputVar(framework::GradVarName("Params"));
 
     // if all output vars are set to stop_gradient, grad op no need to executed
-    if (input_grad_vars.empty() && param_grad_vars.empty()) return;
+    if (input_grad_vars.empty() && param_grad_vars.empty()) {
+      return pe_and_graph;
+    }
 
     auto output_grad_var_names = ctx.InputNames(framework::GradVarName("Out"));
     // NOTE: after PR22939 [Add double grad] merged, the grad op maker's
@@ -321,15 +464,27 @@ class RunProgramGradOpKernel : public framework::OpKernel<T> {
     if (end_op_index > start_op_index) {
       // Step 2. prepare executor and scope
       auto *program = global_block->Program();
-      auto cache_info = framework::GetExecutorInfoFromCache(
-          *program, ctx.GetPlace(), start_op_index, end_op_index,
-          /*is_grad*/ true, program_id, &scope);
-      auto &parallel_executor = cache_info.first;
+      bool is_new_created;
+      if (use_cuda_graph) {
+        pe_and_graph = framework::CreateFixOrderExecutorInfo(
+            *program, ctx.GetPlace(), start_op_index, end_op_index, &scope);
+        is_new_created = true;
+      } else {
+        auto cache_info = framework::GetExecutorInfoFromCache(
+            *program, ctx.GetPlace(), start_op_index, end_op_index,
+            /*is_grad*/ true, program_id, &scope);
+        pe_and_graph.first = cache_info.first;
+        is_new_created = cache_info.second;
+      }
 
+      auto &parallel_executor = pe_and_graph.first;
+      std::vector<std::string> tmp_vars;
       auto &skip_eager_delete_vars =
-          framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
-              program_id, true);
-      if (cache_info.second /*is_new_created*/) {
+          use_cuda_graph
+              ? tmp_vars
+              : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
+                    program_id, true);
+      if (is_new_created) {
         parallel_executor->SkipMemoryReuse(/*scope_idx=*/0,
                                            output_grad_var_names);
 
@@ -360,6 +515,7 @@ class RunProgramGradOpKernel : public framework::OpKernel<T> {
     global_inner_scope->DeleteScope(&scope);
     VLOG(2) << "The number of sub scopes after backward: "
             << global_inner_scope->kids().size();
+    return pe_and_graph;
   }
 };
 
diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc
index e02c7ade9a11a..a80d527fd5c38 100644
--- a/paddle/fluid/operators/sample_logits_op.cc
+++ b/paddle/fluid/operators/sample_logits_op.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/sample_logits_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/operators/math/sample_prob.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu
index 273010e5443f8..7eff9429244fc 100644
--- a/paddle/fluid/operators/sample_logits_op.cu
+++ b/paddle/fluid/operators/sample_logits_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -146,9 +147,9 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
     int threads = 512;
     size_t size = batch_size * num_true;
     int grid = (size + threads - 1) / threads;
-    GPUSetLabel<
-        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-        size, num_true, sampled_labels_data);
+    GPUSetLabel<T>
+        <<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+            size, num_true, sampled_labels_data);
 
     if (use_customized_samples) {
       const Tensor* customized_samples =
@@ -190,17 +191,17 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
 
     size = batch_size * num_take;
     grid = (size + threads - 1) / threads;
-    GPUTakeAlongD1<
-        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
-        p_value);
+    GPUTakeAlongD1<T>
+        <<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+            size, batch_size, array_slice_size, idx_slice_size, p_array,
+            p_index, p_value);
 
     if (remove_accidental_hits) {
       const size_t size = batch_size * (num_true + num_samples);
       int grid = (size + threads - 1) / threads;
-      gpu_compute_remove_accidental_hits<
-          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-          size, num_true, idx_slice_size, p_index, p_value);
+      gpu_compute_remove_accidental_hits<T>
+          <<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+              size, num_true, idx_slice_size, p_index, p_value);
     }
 
     // subtracted sampled logits with logQ(y|x)
@@ -246,10 +247,10 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
     const size_t size = batch_size;
     int grid = (size + threads - 1) / threads;
 
-    GPUPutAlongD1<
-        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
-        p_value);
+    GPUPutAlongD1<T>
+        <<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+            size, batch_size, array_slice_size, idx_slice_size, p_array,
+            p_index, p_value);
   }
 };
 
diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h
index ae741ae321292..815a2897d5d20 100644
--- a/paddle/fluid/operators/sample_logits_op.h
+++ b/paddle/fluid/operators/sample_logits_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index 7fe6623dcca14..23aa88459cec1 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <string>
-
 #include "paddle/fluid/operators/save_combine_op.h"
 
+#include <string>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h
index 8b8e27b79b96b..a419e862501f6 100644
--- a/paddle/fluid/operators/save_combine_op.h
+++ b/paddle/fluid/operators/save_combine_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
 #include <fstream>
 #include <numeric>
 #include <sstream>
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 493f5081ee42b..797321efd6c45 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <iostream>
 #include <string>
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/bfloat16.h"
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index d819c172e4a9d..02774c6b72aca 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/save_op.h"
+
 #include <stdint.h>
+
 #include <fstream>
 #include <numeric>
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/operators/save_op.h"
-
 namespace paddle {
 namespace operators {
 class SaveOp : public framework::OperatorWithKernel {
diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h
index e4ca1423afaea..64aca1ab6b71f 100644
--- a/paddle/fluid/operators/save_op.h
+++ b/paddle/fluid/operators/save_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
 #include <fstream>
 #include <numeric>
 #include <string>
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index cbf2b9152079e..ebc4c6441489c 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index 40f5699a29b35..fdc98d084ed07 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include <string>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/scale_kernel.h"
 
diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc
index 0ae0e1500c166..0cfc3a77aadb2 100644
--- a/paddle/fluid/operators/scatter_nd_add_op.cc
+++ b/paddle/fluid/operators/scatter_nd_add_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/ddim.h"
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 5f6b04cf59e0e..a2e8071e01353 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/ddim.h"
diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc
index 07dd2f2d85fe9..3ab084b660a0a 100644
--- a/paddle/fluid/operators/scatter_op_xpu.cc
+++ b/paddle/fluid/operators/scatter_op_xpu.cc
@@ -56,11 +56,12 @@ class ScatterOpXPUKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         index->dims().size() == 1 ||
             (index->dims().size() == 2 && index->dims()[1] == 1),
-        true, platform::errors::InvalidArgument(
-                  "index's shape is error, "
-                  "expect index'dims shape is 1 or 2 and index.dims[1] is 1"
-                  "but got index'dims shape is %d",
-                  index->dims().size()));
+        true,
+        platform::errors::InvalidArgument(
+            "index's shape is error, "
+            "expect index'dims shape is 1 or 2 and index.dims[1] is 1"
+            "but got index'dims shape is %d",
+            index->dims().size()));
 
     int index_size = static_cast<int>(index->dims()[0]);
     auto x_dims = x->dims();
diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc
index 837ccae0284f5..7cad6dcab7ca3 100644
--- a/paddle/fluid/operators/seed_op.cc
+++ b/paddle/fluid/operators/seed_op.cc
@@ -74,13 +74,12 @@ REGISTER_OP_CPU_KERNEL(
     seed, ops::CPUSeedKernel<paddle::platform::CPUDeviceContext, int>);
 
 /* ==========================  register checkpoint ===========================*/
-REGISTER_OP_VERSION(seed)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(seed).AddCheckpoint(
+    R"ROC(
              Upgrade seed add a new attribute [force_cpu])ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "force_cpu",
-            "If true, Force fill output variable to cpu."
-            "memory. Otherwise, fill output variable to the running "
-            "device",
-            false));
+    paddle::framework::compatible::OpVersionDesc().NewAttr(
+        "force_cpu",
+        "If true, Force fill output variable to cpu."
+        "memory. Otherwise, fill output variable to the running "
+        "device",
+        false));
diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc
index 9d4c8532a82c0..92010e8afc058 100644
--- a/paddle/fluid/operators/segment_pool_op.cc
+++ b/paddle/fluid/operators/segment_pool_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/sequence_ops/CMakeLists.txt b/paddle/fluid/operators/sequence_ops/CMakeLists.txt
index 0ca88409f4126..fe36afd96c5e8 100644
--- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/sequence_ops/CMakeLists.txt
@@ -1,6 +1,6 @@
 include(operators)
 if(WITH_UNITY_BUILD)
-    # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops.
-    include(unity_build_rule.cmake)
+  # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops.
+  include(unity_build_rule.cmake)
 endif()
 register_operators()
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
index f6523255e2438..0f17ff1e1b7bc 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h"
+
 #include <memory>
 #include <vector>
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
index d58a2da29c941..4856e38011bae 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h
index 1b8525febe2d4..f27e6535d3199 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h
@@ -16,6 +16,7 @@
 
 #include <utility>
 #include <vector>
+
 #include "boost/optional.hpp"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
@@ -49,7 +50,7 @@ inline framework::LoD ConcatLoD(const Container &xs,
 
 template <typename T, typename... ARGS>
 inline std::vector<std::reference_wrapper<T>> GetDataVectorSafely(
-    const std::vector<T *> &vec, ARGS &&... args) {
+    const std::vector<T *> &vec, ARGS &&...args) {
   std::vector<std::reference_wrapper<T>> result;
   result.reserve(vec.size());
   for (auto *ptr : vec) {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
index 62fa5bc26aca2..1935a62621de4 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/context_project.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
index 23c6a0133e1ed..ef440a580f913 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
@@ -54,10 +54,12 @@ class SequenceConvXPUKernel : public framework::OpKernel<T> {
 
     int up_pad = std::max(0, -context_start);
     int down_pad = std::max(0, context_start + context_length - 1);
-    PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument(
-                                     "Only support up_pad equal 2."));
-    PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument(
-                                       "Only support down_pad equal 2."));
+    PADDLE_ENFORCE_EQ(
+        up_pad, 2,
+        platform::errors::InvalidArgument("Only support up_pad equal 2."));
+    PADDLE_ENFORCE_EQ(
+        down_pad, 2,
+        platform::errors::InvalidArgument("Only support down_pad equal 2."));
 
     auto xpu_context =
         context.template device_context<DeviceContext>().x_context();
@@ -75,8 +77,9 @@ class SequenceConvXPUKernel : public framework::OpKernel<T> {
     // If batch size set to 256, the lod is {0, batch[0] - 0,
     // batch[1] - batch [0], ..., batch[255] - batch[254]},
     // so the lod_size will be 257.
-    PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument(
-                                         "Only support batch size <= 256."));
+    PADDLE_ENFORCE_LE(
+        lod_size, 257,
+        platform::errors::InvalidArgument("Only support batch size <= 256."));
 
     std::vector<int> cpu_lodx(lod_size);
     for (int i = 0; i < lod_size; i++) {
@@ -155,15 +158,18 @@ class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
 
     int up_pad = std::max(0, -context_start);
     int down_pad = std::max(0, context_start + context_length - 1);
-    PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument(
-                                     "Only support up_pad equal 2."));
-    PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument(
-                                       "Only support down_pad equal 2."));
+    PADDLE_ENFORCE_EQ(
+        up_pad, 2,
+        platform::errors::InvalidArgument("Only support up_pad equal 2."));
+    PADDLE_ENFORCE_EQ(
+        down_pad, 2,
+        platform::errors::InvalidArgument("Only support down_pad equal 2."));
 
     auto lod_level_0 = in->lod()[0];
     int lod_size = lod_level_0.size();
-    PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument(
-                                         "Only support batch size <= 256."));
+    PADDLE_ENFORCE_LE(
+        lod_size, 257,
+        platform::errors::InvalidArgument("Only support batch size <= 256."));
 
     std::vector<int> cpu_lodx(lod_size);
     for (int i = 0; i < lod_size; i++) {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
index 9591f3e8b5bbf..0f47e8a9c2a98 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu
@@ -14,6 +14,7 @@
 
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+
 #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
index 79503d9714f5b..552a8283b3671 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
+
 #include <vector>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
index 12d3eee65da70..a87c327922425 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+
 #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h
index ed98b694b2754..8d10ee508a22d 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
index 494c8e3ab74a0..01e9835270cac 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
index 7e1a06b9eca5b..5cc4ecdd12aa3 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
index 117fa504ff354..5abe6df09e52d 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <numeric>  // std::iota
 #include <sstream>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
index e4f2c1b2b8fd1..4817b003a2870 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
+
 #include <memory>
 
 namespace paddle {
@@ -64,10 +65,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           ref_level == -1 ||
               (ref_level >= 0 && ref_level < static_cast<int>(y_lod.size())),
-          true, platform::errors::InvalidArgument(
-                    "Invlid `ref_level`, which should be either equal to -1 "
-                    "or in [0, %d), but received `ref_level` = %u.",
-                    y_lod.size(), ref_level));
+          true,
+          platform::errors::InvalidArgument(
+              "Invlid `ref_level`, which should be either equal to -1 "
+              "or in [0, %d), but received `ref_level` = %u.",
+              y_lod.size(), ref_level));
 
       if (ref_level == -1) ref_level = y_lod.size() - 1;
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
index 7b7bc5183bf1f..90f911c438bc9 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
index f22b424b30735..060a3e7cab332 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
index 5d0e1d0194edd..7d018e764bdc9 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
index 3aaa2828d5bfb..d4022e80d8000 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
index 01990ebb73291..af42285158bcb 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h"
+
 #include <memory>
 #include <string>
 
@@ -30,11 +31,12 @@ class SequencePoolOp : public framework::OperatorWithKernel {
     if (!ctx->IsRuntime()) {
       // Check the lod_level for compile-time.
       auto in_lod_level = ctx->GetLoDLevel("X");
-      PADDLE_ENFORCE_GT(in_lod_level, 0, platform::errors::InvalidArgument(
-                                             "The LoD level of Input(X) should "
-                                             "be larger than 0, but received: "
-                                             "lod level %u.",
-                                             in_lod_level));
+      PADDLE_ENFORCE_GT(
+          in_lod_level, 0,
+          platform::errors::InvalidArgument("The LoD level of Input(X) should "
+                                            "be larger than 0, but received: "
+                                            "lod level %u.",
+                                            in_lod_level));
       ctx->SetLoDLevel("Out", in_lod_level - 1);
     }
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
index 4d981e0187aca..96d02e6d2e542 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/sequence_pooling.h"
@@ -38,9 +39,10 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto lod = in->lod();
     auto lod_level = lod.size();
     // InferShape by lod
-    PADDLE_ENFORCE_GT(lod_level, 0, platform::errors::InvalidArgument(
-                                        "Input(X) Tensor of SequencePoolOp "
-                                        "does not contain LoD information."));
+    PADDLE_ENFORCE_GT(
+        lod_level, 0,
+        platform::errors::InvalidArgument("Input(X) Tensor of SequencePoolOp "
+                                          "does not contain LoD information."));
     PADDLE_ENFORCE_LE(lod_level, 2UL,
                       platform::errors::InvalidArgument(
                           "The lod level of input shall be no more than 2."
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc
index 980879db4d06e..3a62bc554df2c 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h"
+
 #include <memory>
+
 #include "paddle/phi/core/ddim.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h
index 90a17d713cf29..85282bf23b48c 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <memory>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/funcs/algorithm.h"
diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
index 25c12ab565a14..6fa151af4e117 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
index 06fb444740fee..fdb24892e09a2 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
index e7585f7ab0487..e3f8d16a7ade9 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
index c91c59dbfee99..0d91832948dc8 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
@@ -134,10 +134,10 @@ struct SequenceSoftmaxFunctor<platform::CUDADeviceContext, T> {
     dim3 block_size(thread_x);
     dim3 grid_size(max_blocks);
     paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
-    sequence_softmax_kernel<
-        T, kThreadsPerBlock><<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), mixv_ref_lod.CUDAData(context.GetPlace()), height,
-        out->mutable_data<T>(context.GetPlace()));
+    sequence_softmax_kernel<T, kThreadsPerBlock>
+        <<<grid_size, block_size, 0, context.stream()>>>(
+            x.data<T>(), mixv_ref_lod.CUDAData(context.GetPlace()), height,
+            out->mutable_data<T>(context.GetPlace()));
   }
 };
 
@@ -158,11 +158,11 @@ struct SequenceSoftmaxGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size(max_blocks);
 
     paddle::framework::MixVector<size_t> mixv_ref_lod(&ref_lod);
-    sequence_softmax_grad_kernel<
-        T, kThreadsPerBlock><<<grid_size, block_size, 0, context.stream()>>>(
-        dout.data<T>(), out.data<T>(),
-        mixv_ref_lod.CUDAData(context.GetPlace()), height,
-        dx->mutable_data<T>(context.GetPlace()));
+    sequence_softmax_grad_kernel<T, kThreadsPerBlock>
+        <<<grid_size, block_size, 0, context.stream()>>>(
+            dout.data<T>(), out.data<T>(),
+            mixv_ref_lod.CUDAData(context.GetPlace()), height,
+            dx->mutable_data<T>(context.GetPlace()));
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
index bacdd7e4ccb74..b1d5ec8e9c65e 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h"
+
 #include <memory>
 #include <string>
 
@@ -44,8 +45,9 @@ class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel {
     auto topks = attr.Get<std::vector<int>>("topks");
     auto num_k = topks.size();
     PADDLE_ENFORCE_GT(
-        num_k, 0, platform::errors::InvalidArgument(
-                      "Expected topks.size() > 0, but received %zu.", num_k));
+        num_k, 0,
+        platform::errors::InvalidArgument(
+            "Expected topks.size() > 0, but received %zu.", num_k));
 
     auto row_dim = ctx->GetInputDim("ROW");
     auto row_shape_0 = row_dim[0];
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
index 47180f123fa78..b5ee43387b35e 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc
index 180d14cfada31..636be3b2f6ca7 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h"
+
 #include <memory>
 #include <string>
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
index d96dc91f3bc16..d643ef860c3ca 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"
diff --git a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
index 9ccc4432df5cd..9a87e27b24197 100644
--- a/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
+++ b/paddle/fluid/operators/sequence_ops/unity_build_rule.cmake
@@ -4,36 +4,38 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    sequence_concat_op.cc
-    sequence_conv_op.cc
-    sequence_enumerate_op.cc
-    sequence_erase_op.cc
-    sequence_expand_op.cc
-    sequence_mask_op.cc
-    sequence_pad_op.cc
-    sequence_pool_op.cc
-    sequence_expand_as_op.cc
-    sequence_reshape_op.cc
-    sequence_reverse_op.cc
-    sequence_scatter_op.cc
-    sequence_slice_op.cc
-    sequence_softmax_op.cc
-    sequence_topk_avg_pooling_op.cc
-    sequence_unpad_op.cc
-    sequence_concat_op.cu.cc
-    sequence_conv_op.cu.cc)
-register_unity_group(cu
-    sequence_enumerate_op.cu
-    sequence_erase_op.cu
-    sequence_expand_op.cu
-    sequence_mask_op.cu
-    sequence_pad_op.cu
-    sequence_pool_op.cu
-    sequence_expand_as_op.cu
-    sequence_reshape_op.cu
-    sequence_reverse_op.cu
-    sequence_slice_op.cu
-    sequence_softmax_cudnn_op.cu.cc
-    sequence_softmax_op.cu
-    sequence_unpad_op.cu)
+register_unity_group(
+  cc
+  sequence_concat_op.cc
+  sequence_conv_op.cc
+  sequence_enumerate_op.cc
+  sequence_erase_op.cc
+  sequence_expand_op.cc
+  sequence_mask_op.cc
+  sequence_pad_op.cc
+  sequence_pool_op.cc
+  sequence_expand_as_op.cc
+  sequence_reshape_op.cc
+  sequence_reverse_op.cc
+  sequence_scatter_op.cc
+  sequence_slice_op.cc
+  sequence_softmax_op.cc
+  sequence_topk_avg_pooling_op.cc
+  sequence_unpad_op.cc
+  sequence_concat_op.cu.cc
+  sequence_conv_op.cu.cc)
+register_unity_group(
+  cu
+  sequence_enumerate_op.cu
+  sequence_erase_op.cu
+  sequence_expand_op.cu
+  sequence_mask_op.cu
+  sequence_pad_op.cu
+  sequence_pool_op.cu
+  sequence_expand_as_op.cu
+  sequence_reshape_op.cu
+  sequence_reverse_op.cu
+  sequence_slice_op.cu
+  sequence_softmax_cudnn_op.cu.cc
+  sequence_softmax_op.cu
+  sequence_unpad_op.cu)
diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc
index 73655bcb18500..4adedf09aa354 100644
--- a/paddle/fluid/operators/set_value_op.cc
+++ b/paddle/fluid/operators/set_value_op.cc
@@ -18,7 +18,6 @@
 
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc
index daa033f9dc66d..2231eb212a2bc 100644
--- a/paddle/fluid/operators/set_value_op_npu.cc
+++ b/paddle/fluid/operators/set_value_op_npu.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/set_value_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
-
 #include "paddle/phi/kernels/funcs/slice_utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index 9001ce5d51dec..38482f7b55edf 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc
index a62d1b434e764..d4c7d937d4b7b 100644
--- a/paddle/fluid/operators/shape_op_xpu.cc
+++ b/paddle/fluid/operators/shape_op_xpu.cc
@@ -11,6 +11,7 @@
 
 #ifdef PADDLE_WITH_XPU
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h
index 1d0abf14f577e..1b564c3bef09f 100644
--- a/paddle/fluid/operators/share_buffer_op.h
+++ b/paddle/fluid/operators/share_buffer_op.h
@@ -27,8 +27,9 @@ class ShareBufferOpKernel : public framework::OpKernel<T> {
     const auto inputs = ctx.MultiInput<framework::Tensor>("X");
     auto outputs = ctx.MultiOutput<framework::Tensor>("Out");
     size_t n = inputs.size();
-    PADDLE_ENFORCE_EQ(n, outputs.size(), platform::errors::PermissionDenied(
-                                             "Variable number not match."));
+    PADDLE_ENFORCE_EQ(
+        n, outputs.size(),
+        platform::errors::PermissionDenied("Variable number not match."));
     const auto &share_dims_and_dtype =
         ctx.Attr<std::vector<bool>>("share_dims_and_dtype");
     if (!share_dims_and_dtype.empty()) {
diff --git a/paddle/fluid/operators/share_data_op.cc b/paddle/fluid/operators/share_data_op.cc
index 6fcc29e900261..63e8cb648e84b 100644
--- a/paddle/fluid/operators/share_data_op.cc
+++ b/paddle/fluid/operators/share_data_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/share_data_op.h"
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -31,8 +32,9 @@ class ShareDataOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         in_type == framework::proto::VarType::LOD_TENSOR ||
             in_type == framework::proto::VarType::SELECTED_ROWS,
-        true, platform::errors::InvalidArgument(
-                  "Type of Variable[X] must be LoDTensor or SelectedRows!"));
+        true,
+        platform::errors::InvalidArgument(
+            "Type of Variable[X] must be LoDTensor or SelectedRows!"));
     PADDLE_ENFORCE_EQ(
         in_type, out_type,
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index 1a3666ad82368..7388144dda320 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/array_operator.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/phi/core/lod_utils.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc
index 45f7ab278a3c1..e338b48a4ccaa 100644
--- a/paddle/fluid/operators/shuffle_batch_op.cc
+++ b/paddle/fluid/operators/shuffle_batch_op.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/shuffle_batch_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/var_type_inference.h"
 
diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h
index 2708b4a392d17..f56832f959919 100644
--- a/paddle/fluid/operators/shuffle_batch_op.h
+++ b/paddle/fluid/operators/shuffle_batch_op.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc
index 70fddc9b04712..c43d456e94e47 100644
--- a/paddle/fluid/operators/shuffle_channel_op.cc
+++ b/paddle/fluid/operators/shuffle_channel_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/shuffle_channel_op.h"
+
 #include <memory>
 #include <string>
 
@@ -61,8 +62,9 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("group", "the number of groups.")
         .SetDefault(1)
         .AddCustomChecker([](const int& group) {
-          PADDLE_ENFORCE_GE(group, 1, platform::errors::InvalidArgument(
-                                          "group should be larger than 0."));
+          PADDLE_ENFORCE_GE(group, 1,
+                            platform::errors::InvalidArgument(
+                                "group should be larger than 0."));
         });
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu
index 582d1ea0f26af..d3f6224594be3 100644
--- a/paddle/fluid/operators/shuffle_channel_op.cu
+++ b/paddle/fluid/operators/shuffle_channel_op.cu
@@ -67,10 +67,10 @@ class ShuffleChannelOpCUDAKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
 
-    ShuffleChannel<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        count, feature_map_size, output_data, input_data, group_row,
-        group_column, sp_sz);
+    ShuffleChannel<T>
+        <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            count, feature_map_size, output_data, input_data, group_row,
+            group_column, sp_sz);
   }
 };
 
@@ -103,10 +103,10 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel<T> {
     int threads = kNumCUDAThreads;
     int count = num * group_column * group_row * sp_sz;
 
-    ShuffleChannel<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        count, feature_map_size, input_grad_data, output_grad_data, group_row,
-        group_column, sp_sz);
+    ShuffleChannel<T>
+        <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            count, feature_map_size, input_grad_data, output_grad_data,
+            group_row, group_column, sp_sz);
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h
index aeaac486f3f2b..409acdfdff7ba 100644
--- a/paddle/fluid/operators/shuffle_channel_op.h
+++ b/paddle/fluid/operators/shuffle_channel_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 016ff54645b02..0cf1296fce650 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h
index 4fa4d772aa3a9..17ea30277b85d 100644
--- a/paddle/fluid/operators/similarity_focus_op.h
+++ b/paddle/fluid/operators/similarity_focus_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cstring>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -67,16 +68,16 @@ class SimilarityFocusKernel : public framework::OpKernel<T> {
 
     std::vector<std::pair<T, int64_t>> array(array_size);
 
-    bool (*cmp)(std::pair<T, int64_t>, std::pair<T, int64_t>) = [](
-        std::pair<T, int64_t> x, std::pair<T, int64_t> y) {
-      return x.first > y.first;
-    };
+    bool (*cmp)(std::pair<T, int64_t>, std::pair<T, int64_t>) =
+        [](std::pair<T, int64_t> x, std::pair<T, int64_t> y) {
+          return x.first > y.first;
+        };
 
-    int64_t (*compute_index)(int64_t*, int, int, int, int) = [](
-        int64_t* dim, int d1, int d2, int d3, int d4) {
-      return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] +
-             d3 * dim[3] + d4;
-    };
+    int64_t (*compute_index)(int64_t*, int, int, int, int) =
+        [](int64_t* dim, int d1, int d2, int d3, int d4) {
+          return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] +
+                 d3 * dim[3] + d4;
+        };
 
     PADDLE_ENFORCE_GT(
         axis, 0,
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index c6432d00e9de1..a815e12d061cf 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/slice_op.h"
+
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/phi/kernels/funcs/slice_utils.h"
 
 namespace paddle {
@@ -85,8 +87,9 @@ class SliceOp : public framework::OperatorWithKernel {
     }
     if (ctx->HasInputs("EndsTensorList")) {
       ends_size = ctx->Inputs("EndsTensorList").size();
-      PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument(
-                                          "EndsTensorList size can't be zero"));
+      PADDLE_ENFORCE_GT(ends_size, 0,
+                        platform::errors::InvalidArgument(
+                            "EndsTensorList size can't be zero"));
     }
 
     if (!ctx->HasInput("StartsTensor")) {
diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h
index a9a98b46d5eb7..f18ffef3f5834 100644
--- a/paddle/fluid/operators/slice_op.h
+++ b/paddle/fluid/operators/slice_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <algorithm>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/utils.h"
diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc
index 43322e4b2e75b..7645232ec0cbc 100644
--- a/paddle/fluid/operators/slice_op_mlu.cc
+++ b/paddle/fluid/operators/slice_op_mlu.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/slice_op.h"
-
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/slice_op.h"
 #include "paddle/phi/kernels/funcs/slice_utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc
index 0d0d9ab19df30..3441453430e5c 100644
--- a/paddle/fluid/operators/slice_op_npu.cc
+++ b/paddle/fluid/operators/slice_op_npu.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/slice_op.h"
-
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/kernels/funcs/slice_utils.h"
 
diff --git a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc
index 6ac1027b0ce19..8f2dfd38d491b 100644
--- a/paddle/fluid/operators/slice_op_xpu.cc
+++ b/paddle/fluid/operators/slice_op_xpu.cc
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/slice_op.h"
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/operators/slice_op.h"
 #include "xpu/refactor/math.h"
 
 namespace paddle {
@@ -53,8 +54,9 @@ class SliceXPUKernel : public framework::OpKernel<T> {
       start = std::max(start, 0);
       end = std::max(end, 0);
       end = std::min(end, dim_value);
-      PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument(
-                                        "end should greater than start"));
+      PADDLE_ENFORCE_GT(
+          end, start,
+          platform::errors::InvalidArgument("end should greater than start"));
       starts[i] = start;
       ends[i] = end;
     }
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index c0318d344aef3..05204354d0912 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/smooth_l1_loss_op.h"
+
 #include <memory>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc
index 136ea68ac9efe..bdc46abff2ad2 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/smooth_l1_loss_op.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/smooth_l1_loss_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 3840b99dd176d..7304467833a90 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -61,8 +61,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) ||
                             platform::is_xpu_place(ctx.GetPlace()),
-                        true, platform::errors::InvalidArgument(
-                                  "float16 can only be used on GPU/XPU place"));
+                        true,
+                        platform::errors::InvalidArgument(
+                            "float16 can only be used on GPU/XPU place"));
     }
 #endif
 
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
index c07467a9b0ba3..4b55f5af09dc6 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc
@@ -44,8 +44,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
     Tensor* loss = context.Output<Tensor>("Loss");
     const int rank = logits->dims().size();
     const int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
-    PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument(
-                                          "axis should == rank - 1"));
+    PADDLE_ENFORCE_EQ(
+        axis, rank - 1,
+        platform::errors::InvalidArgument("axis should == rank - 1"));
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
     const int n = phi::funcs::SizeToAxis(axis, logits->dims());
@@ -140,8 +141,9 @@ class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel<T> {
 
     const int rank = logit_grad->dims().size();
     const int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
-    PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument(
-                                          "axis should == rank - 1"));
+    PADDLE_ENFORCE_EQ(
+        axis, rank - 1,
+        platform::errors::InvalidArgument("axis should == rank - 1"));
     const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims());
     const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims());
 
diff --git a/paddle/fluid/operators/solve_op.cc b/paddle/fluid/operators/solve_op.cc
index 57302ae034271..4d23f1ce20945 100644
--- a/paddle/fluid/operators/solve_op.cc
+++ b/paddle/fluid/operators/solve_op.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/solve_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/phi/core/ddim.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h
index 7f3a574866604..928fbf755d7f7 100644
--- a/paddle/fluid/operators/solve_op.h
+++ b/paddle/fluid/operators/solve_op.h
@@ -92,9 +92,10 @@ static framework::DDim GetOutputShapeUnsqueeze(
   for (int axis : unsqz_dims) {
     int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
     // Vaildity Check: the axis bound
-    PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument(
-                                  "The insert dimension value should "
-                                  "not be less than 0"));
+    PADDLE_ENFORCE_GE(
+        cur, 0,
+        platform::errors::InvalidArgument("The insert dimension value should "
+                                          "not be less than 0"));
     PADDLE_ENFORCE_LE(cur, cur_output_size,
                       platform::errors::InvalidArgument(
                           "The insert dimension value shoule not be larger "
diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc
index 013467396b3a6..6a6972f3293e4 100644
--- a/paddle/fluid/operators/space_to_depth_op.cc
+++ b/paddle/fluid/operators/space_to_depth_op.cc
@@ -38,8 +38,9 @@ class SpaceToDepthOp : public framework::OperatorWithKernel {
                        "Output(Out) of SpaceToDepthOp should not be null."));
 
     auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument(
-                                            "input should be a 4D tensor"));
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 4,
+        platform::errors::InvalidArgument("input should be a 4D tensor"));
     auto blocksize = ctx->Attrs().Get<int64_t>("blocksize");
 
     PADDLE_ENFORCE_GT(blocksize, 1,
diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc
index a6534543a6515..14d1ffe3f11b0 100644
--- a/paddle/fluid/operators/sparse_attention_op.cc
+++ b/paddle/fluid/operators/sparse_attention_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu
index 49f8263ab289a..2949642d2f3dd 100644
--- a/paddle/fluid/operators/sparse_attention_op.cu
+++ b/paddle/fluid/operators/sparse_attention_op.cu
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <math.h>
+
 #include <limits>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #if defined(PADDLE_WITH_CUDA)
@@ -90,17 +92,15 @@ __global__ void BlockSparseSoftmaxForward(T* softmax, const T* src, T scale,
       if (cur_block_col < cur_block_nnz) {
         // read kp mask
         T cur_kp_mask;
-        if ((kp_mask != nullptr) &&
-            std::abs(kp_mask[colindex[cur_block_col]]) <
-                std::numeric_limits<T>::epsilon()) {
+        if ((kp_mask != nullptr) && std::abs(kp_mask[colindex[cur_block_col]]) <
+                                        std::numeric_limits<T>::epsilon()) {
           cur_kp_mask = -std::numeric_limits<T>::infinity();
         } else {
           cur_kp_mask = 0;
         }
         // do mask operation
-        if ((attnptr != nullptr) &&
-            std::abs(attnptr[colindex[cur_block_col]]) <
-                std::numeric_limits<T>::epsilon()) {
+        if ((attnptr != nullptr) && std::abs(attnptr[colindex[cur_block_col]]) <
+                                        std::numeric_limits<T>::epsilon()) {
           srcdata[cur_reg_index] =
               -std::numeric_limits<T>::infinity() * scale + cur_kp_mask;
         } else {
@@ -280,37 +280,37 @@ void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx,
   T scaling = static_cast<T>(1.0) / sqrt(static_cast<T>(num_cols));
 
   if (num_cols <= 4) {
-    BlockSparseSoftmaxBackward<T, block_size, 4><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 4>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else if (num_cols > 4 && num_cols <= 8) {
-    BlockSparseSoftmaxBackward<T, block_size, 8><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 8>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else if (num_cols > 8 && num_cols <= 16) {
-    BlockSparseSoftmaxBackward<T, block_size, 16><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 16>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else if (num_cols > 16 && num_cols <= 32) {
-    BlockSparseSoftmaxBackward<T, block_size, 32><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 32>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else if (num_cols > 32 && num_cols <= 64) {
-    BlockSparseSoftmaxBackward<T, block_size, 64><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 64>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else if (num_cols > 64 && num_cols <= 128) {
-    BlockSparseSoftmaxBackward<T, block_size, 128><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 128>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else if (num_cols > 128 && num_cols <= 256) {
-    BlockSparseSoftmaxBackward<T, block_size, 256><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 256>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else if (num_cols > 256 && num_cols <= 512) {
-    BlockSparseSoftmaxBackward<T, block_size, 512><<<grid, blocks>>>(
-        dx_data, dout_data, out_data, scaling, offset_data, columns_data,
-        num_rows);
+    BlockSparseSoftmaxBackward<T, block_size, 512>
+        <<<grid, blocks>>>(dx_data, dout_data, out_data, scaling, offset_data,
+                           columns_data, num_rows);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The head_dim of query in sparse_attention op should less or equal "
diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h
index ee75c96c23a9f..765b9a4dbfae6 100644
--- a/paddle/fluid/operators/spectral_norm_op.h
+++ b/paddle/fluid/operators/spectral_norm_op.h
@@ -11,6 +11,7 @@
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc
index 0270f7e0576c8..cd2053b4ef083 100644
--- a/paddle/fluid/operators/spectral_op.cc
+++ b/paddle/fluid/operators/spectral_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/spectral_op.h"
+
 #include "paddle/fluid/operators/spectral_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h
index 71b54caf5ee79..4900e88fbe18f 100644
--- a/paddle/fluid/operators/spectral_op.h
+++ b/paddle/fluid/operators/spectral_op.h
@@ -18,6 +18,7 @@
 #include <numeric>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index dc20952903ab2..6c60c1a17e017 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/split_op.h"
+
 #include <string>
 
 #include "paddle/fluid/framework/infershape_utils.h"
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index cf44f341b2b64..143e1d72868a1 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/phi/kernels/split_kernel.h"
diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc
index adc3ea14e32d6..0d438854673cb 100644
--- a/paddle/fluid/operators/split_op_mlu.cc
+++ b/paddle/fluid/operators/split_op_mlu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/split_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/split_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/split_op_xpu.cc b/paddle/fluid/operators/split_op_xpu.cc
index 8f02d8157b202..b24d0a70b05b0 100644
--- a/paddle/fluid/operators/split_op_xpu.cc
+++ b/paddle/fluid/operators/split_op_xpu.cc
@@ -12,9 +12,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/split_op.h"
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/operators/split_op.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
index b1e0127f4cf91..05230399b300a 100644
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/spp_op.h"
+
 #include <string>
 #include <vector>
 namespace paddle {
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
index aa944cfcfbb17..cd81ade1f9d81 100644
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
diff --git a/paddle/fluid/operators/squeeze_op_mlu.cc b/paddle/fluid/operators/squeeze_op_mlu.cc
new file mode 100644
index 0000000000000..d492846b41c11
--- /dev/null
+++ b/paddle/fluid/operators/squeeze_op_mlu.cc
@@ -0,0 +1,61 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_MLU
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/operators/squeeze_op.h"
+#include "paddle/fluid/platform/device/mlu/device_context.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(
+    squeeze, ops::SqueezeKernel<plat::MLUDeviceContext, float>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, double>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, int>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
+
+REGISTER_OP_MLU_KERNEL(
+    squeeze_grad, ops::SqueezeGradKernel<plat::MLUDeviceContext, float>,
+    ops::SqueezeGradKernel<plat::MLUDeviceContext, double>,
+    ops::SqueezeGradKernel<plat::MLUDeviceContext, plat::float16>,
+    ops::SqueezeGradKernel<plat::MLUDeviceContext, bool>,
+    ops::SqueezeGradKernel<plat::MLUDeviceContext, int>,
+    ops::SqueezeGradKernel<plat::MLUDeviceContext, uint8_t>,
+    ops::SqueezeGradKernel<plat::MLUDeviceContext, int8_t>,
+    ops::SqueezeGradKernel<plat::MLUDeviceContext, int64_t>);
+
+REGISTER_OP_MLU_KERNEL(
+    squeeze2, ops::SqueezeKernel<plat::MLUDeviceContext, float>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, double>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, int>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
+    ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
+
+REGISTER_OP_MLU_KERNEL(
+    squeeze2_grad, ops::Squeeze2GradKernel<plat::MLUDeviceContext, float>,
+    ops::Squeeze2GradKernel<plat::MLUDeviceContext, double>,
+    ops::Squeeze2GradKernel<plat::MLUDeviceContext, plat::float16>,
+    ops::Squeeze2GradKernel<plat::MLUDeviceContext, bool>,
+    ops::Squeeze2GradKernel<plat::MLUDeviceContext, int>,
+    ops::Squeeze2GradKernel<plat::MLUDeviceContext, uint8_t>,
+    ops::Squeeze2GradKernel<plat::MLUDeviceContext, int8_t>,
+    ops::Squeeze2GradKernel<plat::MLUDeviceContext, int64_t>);
+#endif
diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc
index 6fc80ca379f3f..6b0a0657afba8 100644
--- a/paddle/fluid/operators/stack_op.cc
+++ b/paddle/fluid/operators/stack_op.cc
@@ -14,6 +14,7 @@
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc
index 9d4ef0ffa20e2..c3e6e333e4cf4 100644
--- a/paddle/fluid/operators/stack_op_npu.cc
+++ b/paddle/fluid/operators/stack_op_npu.cc
@@ -30,8 +30,9 @@ class StackNPUKernel : public framework::OpKernel<T> {
     if (axis < 0) axis += (x[0]->dims().size() + 1);
     int num = static_cast<int>(x.size());
 
-    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
-                                  "number of input Tensor <= 0"));
+    PADDLE_ENFORCE_GT(
+        num, 0,
+        platform::errors::InvalidArgument("number of input Tensor <= 0"));
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
@@ -59,8 +60,9 @@ class StackGradNPUKernel : public framework::OpKernel<T> {
     if (axis < 0) axis += dy->dims().size();
     int num = dy->dims()[axis];
 
-    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
-                                  "number of input Tensor <= 0"));
+    PADDLE_ENFORCE_GT(
+        num, 0,
+        platform::errors::InvalidArgument("number of input Tensor <= 0"));
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc
index baaa2b4884ce3..925fcc08615ac 100644
--- a/paddle/fluid/operators/stack_op_xpu.cc
+++ b/paddle/fluid/operators/stack_op_xpu.cc
@@ -15,6 +15,7 @@
 #ifdef PADDLE_WITH_XPU
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/concat_op.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc
index 7d4103ddf3859..36e867417291c 100644
--- a/paddle/fluid/operators/stft_op.cc
+++ b/paddle/fluid/operators/stft_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/stft_op.h"
+
 #include "paddle/fluid/operators/spectral_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h
index e75c59232bcae..cc17ed9a43cc1 100644
--- a/paddle/fluid/operators/stft_op.h
+++ b/paddle/fluid/operators/stft_op.h
@@ -17,7 +17,6 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
-
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/frame_op.h"
 #include "paddle/fluid/operators/spectral_op.h"
diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc
index b142b8f099b89..80952e9b5560c 100644
--- a/paddle/fluid/operators/strided_slice_op_npu.cc
+++ b/paddle/fluid/operators/strided_slice_op_npu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/funcs/strided_slice.h"
 #include "paddle/fluid/operators/slice_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/kernels/funcs/strided_slice.h"
 
 namespace paddle {
 namespace operators {
@@ -186,14 +186,16 @@ class StridedSliceNPUKernel : public framework::OpKernel<T> {
     out->Resize(out_dims);
     out->mutable_data<T>(place);
 
-    const auto& runner = NpuOpRunner(
-        "StridedSlice", {*in, starts_indices_tensor, ends_indices_tensor,
-                         strides_indices_tensor},
-        {*out}, {{"begin_mask", 0},
-                 {"end_mask", 0},
-                 {"ellipsis_mask", 0},
-                 {"new_axis_mask", 0},
-                 {"shrink_axis_mask", 0}});
+    const auto& runner =
+        NpuOpRunner("StridedSlice",
+                    {*in, starts_indices_tensor, ends_indices_tensor,
+                     strides_indices_tensor},
+                    {*out},
+                    {{"begin_mask", 0},
+                     {"end_mask", 0},
+                     {"ellipsis_mask", 0},
+                     {"new_axis_mask", 0},
+                     {"shrink_axis_mask", 0}});
     runner.Run(stream);
 
     if (need_reverse) {
diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc
index 42047021b408a..9e4089680f420 100644
--- a/paddle/fluid/operators/string/faster_tokenizer_op.cc
+++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc
@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/string/faster_tokenizer_op.h"
+
 #include <utf8proc.h>
 
 #include <algorithm>
+#include <boost/algorithm/string.hpp>
 #include <chrono>
 #include <codecvt>
 #include <fstream>
@@ -22,10 +25,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 
-#include <boost/algorithm/string.hpp>
-
 #include "paddle/fluid/framework/string_array.h"
-#include "paddle/fluid/operators/string/faster_tokenizer_op.h"
 
 namespace paddle {
 namespace operators {
@@ -38,12 +38,11 @@ using std::ifstream;
 using std::int64_t;
 using std::min;
 using std::runtime_error;
-using std::unordered_map;
-using std::unordered_set;
 using std::shared_ptr;
 using std::size_t;
-using std::int64_t;
 using std::string;
+using std::unordered_map;
+using std::unordered_set;
 using std::vector;
 using std::wstring;
 
diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h
index 446be3a1999fc..a6b8bfea59c47 100644
--- a/paddle/fluid/operators/string/faster_tokenizer_op.h
+++ b/paddle/fluid/operators/string/faster_tokenizer_op.h
@@ -26,15 +26,14 @@ namespace operators {
 
 using std::endl;
 using std::int64_t;
+using std::shared_ptr;
 using std::size_t;
 using std::string;
-using std::shared_ptr;
-using std::vector;
 using std::unordered_map;
 using std::unordered_set;
 using std::vector;
-using std::wstring;
 using std::wcout;
+using std::wstring;
 
 inline bool IsControl(const wchar_t& ch);
 inline bool IsChineseChar(const wchar_t& ch);
diff --git a/paddle/fluid/operators/string/unity_build_rule.cmake b/paddle/fluid/operators/string/unity_build_rule.cmake
index a4b209d2df13e..90922407ec712 100644
--- a/paddle/fluid/operators/string/unity_build_rule.cmake
+++ b/paddle/fluid/operators/string/unity_build_rule.cmake
@@ -4,5 +4,4 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    faster_tokenizer_op.cc)
\ No newline at end of file
+register_unity_group(cc faster_tokenizer_op.cc)
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 51040544fac34..bc6997e36ebf7 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -36,9 +36,8 @@ class SumOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "sum");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sum");
 
-    if (ctx->IsRuntime() &&
-        ctx->GetOutputsVarType("Out")[0] ==
-            framework::proto::VarType::LOD_TENSOR_ARRAY) {
+    if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] ==
+                                framework::proto::VarType::LOD_TENSOR_ARRAY) {
       return;  // skip runtime infershape when is tensor array;
     }
 
@@ -47,11 +46,12 @@ class SumOp : public framework::OperatorWithKernel {
 
     auto N = x_dims.size();
     PADDLE_ENFORCE_GT(
-        N, 0, platform::errors::InvalidArgument(
-                  "The input tensor X's dimensions of SumOp "
-                  "should be larger than 0. But received X's dimensions %d, "
-                  "X's shape = [%s].",
-                  N, &x_dims));
+        N, 0,
+        platform::errors::InvalidArgument(
+            "The input tensor X's dimensions of SumOp "
+            "should be larger than 0. But received X's dimensions %d, "
+            "X's shape = [%s].",
+            N, &x_dims));
     if (N == 1) {
       VLOG(3) << "Warning: SumOp have only one input, may waste memory";
     }
@@ -115,8 +115,9 @@ class SumOp : public framework::OperatorWithKernel {
     framework::LibraryType library{framework::LibraryType::kPlain};
     framework::DataLayout layout{framework::DataLayout::kAnyLayout};
 
-    PADDLE_ENFORCE_GT(x_vars.size(), 0, platform::errors::InvalidArgument(
-                                            "Input[X] should not be empty"));
+    PADDLE_ENFORCE_GT(
+        x_vars.size(), 0,
+        platform::errors::InvalidArgument("Input[X] should not be empty"));
 
     PADDLE_ENFORCE_NOT_NULL(
         x_vars[0], platform::errors::NotFound(
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index 8c6c083cde880..3bf249425c2ce 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/sum_op.h"
@@ -205,8 +206,8 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
           reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
 
       ComputeKernelParameter(length);
-      SumSelectedRowsCUDAKernel<T><<<grids, blocks, 0, stream>>>(
-          sr_in_out_array_data, length, rows);
+      SumSelectedRowsCUDAKernel<T>
+          <<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
       dst_write = true;
     }
   }
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 3c51b3398be4e..8c1e3a3dbf191 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/sum_op_mlu.cc b/paddle/fluid/operators/sum_op_mlu.cc
index 179c038e83716..68e31c364b64b 100644
--- a/paddle/fluid/operators/sum_op_mlu.cc
+++ b/paddle/fluid/operators/sum_op_mlu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/sum_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/sum_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc
index 5899591549eac..a1cdaddd11b42 100644
--- a/paddle/fluid/operators/sum_op_xpu.cc
+++ b/paddle/fluid/operators/sum_op_xpu.cc
@@ -11,8 +11,9 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/sum_op.h"
 #include <vector>
+
+#include "paddle/fluid/operators/sum_op.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h
index 166f49999d552..468c658e5e640 100644
--- a/paddle/fluid/operators/svd_helper.h
+++ b/paddle/fluid/operators/svd_helper.h
@@ -15,9 +15,11 @@
 #pragma once
 
 #include <Eigen/src/Core/util/Constants.h>
+
 #include <Eigen/Dense>
 #include <Eigen/SVD>
 #include <iostream>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/diag_op.h"
@@ -101,20 +103,22 @@ struct RealMulComplexFunctor {
   // y: complex number (c+0j) pretend to be a real number
   // out: complex number (ac+bcj)
   inline HOSTDEVICE T operator()(T x, T y) {
-    PADDLE_ENFORCE_LT(y.imag, 1e-6, platform::errors::InvalidArgument(
-                                        "The image part of y must to be 0"
-                                        "but got [%d]",
-                                        y.imag));
+    PADDLE_ENFORCE_LT(
+        y.imag, 1e-6,
+        platform::errors::InvalidArgument("The image part of y must to be 0"
+                                          "but got [%d]",
+                                          y.imag));
     return platform::complex<phi::dtype::Real<T>>(x.real * y.real,
                                                   x.imag * y.real);
   }
 };
 
 static std::vector<int> GetBroadcastShape(InTensors ins) {
-  PADDLE_ENFORCE_EQ(ins.size(), 2, platform::errors::InvalidArgument(
-                                       "GetBroadcastShape Receive 2 tensors"
-                                       "but got [%d]",
-                                       ins.size()));
+  PADDLE_ENFORCE_EQ(
+      ins.size(), 2,
+      platform::errors::InvalidArgument("GetBroadcastShape Receive 2 tensors"
+                                        "but got [%d]",
+                                        ins.size()));
   auto x_dim = ins[0]->dims();
   auto y_dim = ins[1]->dims();
   std::vector<int> broadcast_shape =
@@ -596,8 +600,9 @@ struct DeviceIndependenceTensorOperations {
     attrs["lower"] = lower;
     NameInTensorMap inputs({{"X", {&x}}});
     int x_rank = x.dims().size();
-    PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument(
-                                     "Rank must be at least 2."));
+    PADDLE_ENFORCE_GE(
+        x_rank, 2,
+        platform::errors::InvalidArgument("Rank must be at least 2."));
     std::vector<int> out_shape = phi::vectorize<int>(x.dims());
     return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape);
   }
diff --git a/paddle/fluid/operators/svd_op.cc b/paddle/fluid/operators/svd_op.cc
index 3ca7320114a8a..e68b013d2fb62 100644
--- a/paddle/fluid/operators/svd_op.cc
+++ b/paddle/fluid/operators/svd_op.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/svd_op.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/phi/core/ddim.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu
index e987589e83c19..317ea7c5363b9 100644
--- a/paddle/fluid/operators/svd_op.cu
+++ b/paddle/fluid/operators/svd_op.cu
@@ -16,8 +16,10 @@ limitations under the License. */
 // HIP not support cusolver
 
 #include <thrust/device_vector.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/svd_op.h"
 #include "paddle/fluid/platform/dynload/cusolver.h"
diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h
index 42a847206a3cb..1008a69e6de0f 100644
--- a/paddle/fluid/operators/svd_op.h
+++ b/paddle/fluid/operators/svd_op.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <cstdarg>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/svd_helper.h"
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h
index 17c96544988b6..9818aa3651baf 100644
--- a/paddle/fluid/operators/sync_batch_norm_op.cu.h
+++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h
@@ -137,7 +137,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
                           const float momentum, const bool is_test,
                           const bool use_global_stats
 
-                          ) {
+) {
   const auto &x_dims = x->dims();
   PADDLE_ENFORCE_GE(x_dims.size(), 2,
                     platform::errors::InvalidArgument(
@@ -178,13 +178,11 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
     const int threads = 256;
     int grid = std::min(C, (max_threads + threads - 1) / threads);
     if (layout == framework::DataLayout::kNCHW) {
-      KeLocalStats<T, threads,
-                   framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
-          x_d, N, H * W * D, C, stats);
+      KeLocalStats<T, threads, framework::DataLayout::kNCHW>
+          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
     } else {
-      KeLocalStats<T, threads,
-                   framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
-          x_d, N, H * W * D, C, stats);
+      KeLocalStats<T, threads, framework::DataLayout::kNHWC>
+          <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
     }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -221,13 +219,13 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
 
   int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
   if (layout == framework::DataLayout::kNCHW) {
-    KeNormAffine<T, framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
-        x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
-        y_d);
+    KeNormAffine<T, framework::DataLayout::kNCHW>
+        <<<grid2, block, 0, stream>>>(x_d, s_d, b_d, mean_data, var_data,
+                                      epsilon, C, H * W * D, x_numel, y_d);
   } else {
-    KeNormAffine<T, framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
-        x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
-        y_d);
+    KeNormAffine<T, framework::DataLayout::kNHWC>
+        <<<grid2, block, 0, stream>>>(x_d, s_d, b_d, mean_data, var_data,
+                                      epsilon, C, H * W * D, x_numel, y_d);
   }
 }
 
@@ -436,30 +434,30 @@ void SyncBatchNormGradFunctor(
 
   if (is_inplace) {
     if (layout == framework::DataLayout::kNCHW) {
-      KeBNRestoreData<
-          T, framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
-          px.mutable_data<T>(ctx.GetPlace()),
-          scale->data<BatchNormParamType<T>>(),
-          bias->data<BatchNormParamType<T>>(), saved_mean, saved_inv_var,
-          epsilon, C, H * W * D, x_numel, x->data<T>());
+      KeBNRestoreData<T, framework::DataLayout::kNCHW>
+          <<<grid2, block, 0, stream>>>(px.mutable_data<T>(ctx.GetPlace()),
+                                        scale->data<BatchNormParamType<T>>(),
+                                        bias->data<BatchNormParamType<T>>(),
+                                        saved_mean, saved_inv_var, epsilon, C,
+                                        H * W * D, x_numel, x->data<T>());
     } else {
-      KeBNRestoreData<
-          T, framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
-          px.mutable_data<T>(ctx.GetPlace()),
-          scale->data<BatchNormParamType<T>>(),
-          bias->data<BatchNormParamType<T>>(), saved_mean, saved_inv_var,
-          epsilon, C, H * W * D, x_numel, x->data<T>());
+      KeBNRestoreData<T, framework::DataLayout::kNHWC>
+          <<<grid2, block, 0, stream>>>(px.mutable_data<T>(ctx.GetPlace()),
+                                        scale->data<BatchNormParamType<T>>(),
+                                        bias->data<BatchNormParamType<T>>(),
+                                        saved_mean, saved_inv_var, epsilon, C,
+                                        H * W * D, x_numel, x->data<T>());
     }
   }
 
   if (layout == framework::DataLayout::kNCHW) {
-    KeBackwardLocalStats<
-        T, threads, framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
-        dy_d, x_d, saved_mean, N, fsize, C, stats);
+    KeBackwardLocalStats<T, threads, framework::DataLayout::kNCHW>
+        <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, N, fsize, C,
+                                       stats);
   } else {
-    KeBackwardLocalStats<
-        T, threads, framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
-        dy_d, x_d, saved_mean, N, fsize, C, stats);
+    KeBackwardLocalStats<T, threads, framework::DataLayout::kNHWC>
+        <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, N, fsize, C,
+                                       stats);
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
@@ -476,35 +474,33 @@ void SyncBatchNormGradFunctor(
 
   if (layout == framework::DataLayout::kNCHW) {
     if (d_scale && d_bias) {
-      KeBNBackwardScaleBias<
-          T, threads,
-          framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
-          dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
-          d_scale->data<BatchNormParamType<T>>(),
-          d_bias->data<BatchNormParamType<T>>());
+      KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNCHW>
+          <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, saved_inv_var,
+                                         epsilon, N, C, fsize,
+                                         d_scale->data<BatchNormParamType<T>>(),
+                                         d_bias->data<BatchNormParamType<T>>());
     }
     if (d_x) {
-      KeBNBackwardData<
-          T, framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
-          dy_d, x_d, scale->data<BatchNormParamType<T>>(), saved_mean,
-          saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize,
-          x->numel(), d_x->data<T>());
+      KeBNBackwardData<T, framework::DataLayout::kNCHW>
+          <<<grid2, block, 0, stream>>>(
+              dy_d, x_d, scale->data<BatchNormParamType<T>>(), saved_mean,
+              saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize,
+              x->numel(), d_x->data<T>());
     }
   } else {
     if (d_scale && d_bias) {
-      KeBNBackwardScaleBias<
-          T, threads,
-          framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
-          dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
-          d_scale->data<BatchNormParamType<T>>(),
-          d_bias->data<BatchNormParamType<T>>());
+      KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNHWC>
+          <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, saved_inv_var,
+                                         epsilon, N, C, fsize,
+                                         d_scale->data<BatchNormParamType<T>>(),
+                                         d_bias->data<BatchNormParamType<T>>());
     }
     if (d_x) {
-      KeBNBackwardData<
-          T, framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
-          dy_d, x_d, scale->data<BatchNormParamType<T>>(), saved_mean,
-          saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize,
-          x->numel(), d_x->data<T>());
+      KeBNBackwardData<T, framework::DataLayout::kNHWC>
+          <<<grid2, block, 0, stream>>>(
+              dy_d, x_d, scale->data<BatchNormParamType<T>>(), saved_mean,
+              saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize,
+              x->numel(), d_x->data<T>());
     }
   }
 }
diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc
index b5632f4fe4a84..604f8f97a3f41 100644
--- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc
+++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc
@@ -566,8 +566,9 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel<T> {
       paddle::framework::TensorToVector(
           device_count_tensor, ctx.device_context(), &device_count_vec);
       device_counts = device_count_vec[0];
-      PADDLE_ENFORCE_GE(device_counts, 2, platform::errors::PreconditionNotMet(
-                                              "device_counts should >= 2."));
+      PADDLE_ENFORCE_GE(
+          device_counts, 2,
+          platform::errors::PreconditionNotMet("device_counts should >= 2."));
     }
 
     // cacl var_ref
diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc
index a60fc537e3216..a7a218972ecf9 100644
--- a/paddle/fluid/operators/tdm_child_op.cc
+++ b/paddle/fluid/operators/tdm_child_op.cc
@@ -13,7 +13,9 @@
  limitations under the License. */
 
 #include "paddle/fluid/operators/tdm_child_op.h"
+
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h
index e437975320cc5..c39d8260a8b36 100644
--- a/paddle/fluid/operators/tdm_child_op.h
+++ b/paddle/fluid/operators/tdm_child_op.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc
index 6aad72a0d9cbe..68d079e679302 100644
--- a/paddle/fluid/operators/tdm_sampler_op.cc
+++ b/paddle/fluid/operators/tdm_sampler_op.cc
@@ -13,7 +13,9 @@
  limitations under the License. */
 
 #include "paddle/fluid/operators/tdm_sampler_op.h"
+
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h
index bf752a9c8ad78..c3ed90ae68ebd 100644
--- a/paddle/fluid/operators/tdm_sampler_op.h
+++ b/paddle/fluid/operators/tdm_sampler_op.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc
index 3bdb9cb972fc6..12d0f288d97c9 100644
--- a/paddle/fluid/operators/temporal_shift_op.cc
+++ b/paddle/fluid/operators/temporal_shift_op.cc
@@ -10,12 +10,13 @@
    limitations under the License. */
 
 #include "paddle/fluid/operators/temporal_shift_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/op_registry.h"
 
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu
index 1d7aeec142ff0..f8e642cdb897c 100644
--- a/paddle/fluid/operators/temporal_shift_op.cu
+++ b/paddle/fluid/operators/temporal_shift_op.cu
@@ -179,13 +179,13 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel<T> {
     grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
 
     if (data_layout == DataLayout::kNCHW) {
-      KeTemporalShiftFwNCHW<
-          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
+      KeTemporalShiftFwNCHW<T>
+          <<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+              input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2);
     } else {
-      KeTemporalShiftFwNHWC<
-          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
+      KeTemporalShiftFwNHWC<T>
+          <<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+              input_data, output_data, ntchw, tchw, chw, t, c, c1, c2);
     }
   }
 };
@@ -233,13 +233,15 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel<T> {
     grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid);
 
     if (data_layout == DataLayout::kNCHW) {
-      KeTemporalShiftBwNCHW<
-          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2);
+      KeTemporalShiftBwNCHW<T>
+          <<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+              output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1,
+              c2);
     } else {
-      KeTemporalShiftBwNHWC<
-          T><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2);
+      KeTemporalShiftBwNHWC<T>
+          <<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+              output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1,
+              c2);
     }
   }
 };
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
index 95ae32fa2ea6f..41d1fc2356e4b 100644
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -96,10 +96,11 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
         *scope.FindVar(Output("OutIndex"))->GetMutable<framework::LoDTensor>();
 
     const size_t n = inx.size();
-    PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument(
-                                "Input tensorarray size should > 0,"
-                                "but the received is %d",
-                                n));
+    PADDLE_ENFORCE_GT(
+        n, 0,
+        platform::errors::InvalidArgument("Input tensorarray size should > 0,"
+                                          "but the received is %d",
+                                          n));
 
     std::string base_name = Inputs("X")[0];
     std::vector<std::string> names;
@@ -235,10 +236,11 @@ class LoDTensorArray2TensorGradOp : public framework::OperatorBase {
 
     auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     const size_t n = inx.size();
-    PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument(
-                                "Input tensorarray size should > 0, "
-                                "but the received is: %d. ",
-                                n));
+    PADDLE_ENFORCE_GT(
+        n, 0,
+        platform::errors::InvalidArgument("Input tensorarray size should > 0, "
+                                          "but the received is: %d. ",
+                                          n));
 
     std::string base_name = Inputs("X")[0];
     std::vector<std::string> names;
diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc
index ef46ee25156e5..8f02bc870e2fb 100644
--- a/paddle/fluid/operators/tensor_formatter.cc
+++ b/paddle/fluid/operators/tensor_formatter.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/operators/tensor_formatter.h"
 
 #include <string>
+
 #include "paddle/fluid/framework/convert_utils.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/tensor_to_string.h b/paddle/fluid/operators/tensor_to_string.h
index bd9e7f6219b4a..c1ca1dff9ffe7 100644
--- a/paddle/fluid/operators/tensor_to_string.h
+++ b/paddle/fluid/operators/tensor_to_string.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <sstream>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -56,7 +57,7 @@ static std::vector<T> ToVector(const framework::Tensor &src) {
 }
 
 template <typename... Args>
-static std::string FlattenToString(Args &&... args) {
+static std::string FlattenToString(Args &&...args) {
   const auto &vec = ToVector(std::forward<Args>(args)...);
   return "[" + string::join_strings(vec, ',') + "]";
 }
diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt
index a7f18245ab9e9..e0fed2804a9b7 100644
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
@@ -1,4 +1,6 @@
-op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter infer_io_utils analysis_helper)
-nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-  DEPS tensorrt_engine_op
-  analysis)
+op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter
+           infer_io_utils analysis_helper)
+nv_test(
+  test_tensorrt_engine_op
+  SRCS tensorrt_engine_op_test.cc
+  DEPS tensorrt_engine_op analysis)
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 0a71875d8931e..1e5ce6fa3e80c 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -52,9 +52,9 @@ namespace operators {
 
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
-using inference::tensorrt::TRTInt8Calibrator;
 using inference::tensorrt::TRTCalibratorEngine;
 using inference::tensorrt::TRTCalibratorEngineManager;
+using inference::tensorrt::TRTInt8Calibrator;
 
 static void RuntimeStaticShapeCheck(std::vector<int64_t> runtime_input_shape,
                                     std::vector<int64_t> model_input_shape) {
@@ -111,10 +111,10 @@ static void RuntimeDynamicShapeCheck(
   //         "TRT engine runtime input %s dims size(%d) inconsistent "
   //         "with the dynamic shape size(%d)",
   //         x, runtime_input_shape.size(), min_input_shape.size()));
-  auto is_input_shape_valid = [&](
-      const std::vector<int32_t> &runtime_input_shape,
-      const std::vector<int32_t> &min_input_shape,
-      const std::vector<int32_t> &max_input_shape) -> bool {
+  auto is_input_shape_valid =
+      [&](const std::vector<int32_t> &runtime_input_shape,
+          const std::vector<int32_t> &min_input_shape,
+          const std::vector<int32_t> &max_input_shape) -> bool {
     for (size_t i = 0; i < runtime_input_shape.size(); i++) {
       if (runtime_input_shape[i] <= max_input_shape[i] &&
           runtime_input_shape[i] >= min_input_shape[i]) {
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 243ae757277a8..c4278cfeb58c5 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
+
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_desc.h"
diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc
index cea6b458aec78..ee2d38fea7033 100644
--- a/paddle/fluid/operators/tile_op_npu.cc
+++ b/paddle/fluid/operators/tile_op_npu.cc
@@ -27,10 +27,11 @@ class TileNPUKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto rank = context.Input<Tensor>("X")->dims().size();
     PADDLE_ENFORCE_GE(
-        rank, 1, platform::errors::InvalidArgument(
-                     "The rank of the input 'x' for tile op must be a positive "
-                     "integer, but the value received is %d.",
-                     rank));
+        rank, 1,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'x' for tile op must be a positive "
+            "integer, but the value received is %d.",
+            rank));
     PADDLE_ENFORCE_LE(
         rank, MAX_RANK_SUPPORTED,
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc
index 598377587d6f7..a0ce4a2bebeb7 100644
--- a/paddle/fluid/operators/tile_op_xpu.cc
+++ b/paddle/fluid/operators/tile_op_xpu.cc
@@ -25,10 +25,11 @@ class TileXPUKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto rank = context.Input<Tensor>("X")->dims().size();
     PADDLE_ENFORCE_GE(
-        rank, 1, platform::errors::InvalidArgument(
-                     "The rank of the input 'x' for tile op must be a positive "
-                     "integer, but the value received is %d.",
-                     rank));
+        rank, 1,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'x' for tile op must be a positive "
+            "integer, but the value received is %d.",
+            rank));
     PADDLE_ENFORCE_LE(
         rank, MAX_RANK_SUPPORTED,
         platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h
index 848ab1cb556e0..a7981c86c450c 100644
--- a/paddle/fluid/operators/top_k_function_cuda.h
+++ b/paddle/fluid/operators/top_k_function_cuda.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <stdio.h>
+
 #include <cstdio>
 #include <vector>
 #ifdef __NVCC__
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index cce5ad2631733..d8fc129588a03 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/top_k_op.h"
+
 #include <memory>
 
 namespace paddle {
@@ -39,8 +40,9 @@ class TopkOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GE(k, 1,
                       platform::errors::InvalidArgument(
                           "Attribute k must be >= 1, but got k is %d.", k));
-    PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument(
-                                                "input must have >= 1d shape"));
+    PADDLE_ENFORCE_GE(
+        input_dims.size(), 1,
+        platform::errors::InvalidArgument("input must have >= 1d shape"));
 
     if (ctx->IsRuntime()) {
       PADDLE_ENFORCE_GE(
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 30a5a802a5360..fc8f08ca4805a 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -96,8 +96,8 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
     switch (GetDesiredBlockDim(input_width)) {
       FIXED_BLOCK_DIM(
-          KeMatrixTopK<T, 5,
-                       kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+          KeMatrixTopK<T, 5, kBlockDim>
+          <<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
               output_data, k, indices_data, input_data, input_width,
               input_width, static_cast<int>(k), gridx, input_height));
       default:
@@ -133,8 +133,8 @@ class TopkOpGradCUDAKernel : public framework::OpKernel<T> {
     int gridx = row < kMaxHeight ? row : kMaxHeight;
     switch (GetDesiredBlockDim(col)) {
       FIXED_BLOCK_DIM(
-          AssignGrad<T, 5,
-                     kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+          AssignGrad<T, 5, kBlockDim>
+          <<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
               x_grad_data, indices_data, out_grad_data, row, col, k));
       default:
         PADDLE_THROW(
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index aad2f096a536e..9d933eb5c47ed 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <iostream>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc
index 102902bdaaaaf..16b2ac9807e83 100644
--- a/paddle/fluid/operators/top_k_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_op_mlu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/top_k_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
index 04e4d88b008e0..051cb9611bab1 100644
--- a/paddle/fluid/operators/top_k_v2_op_npu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_npu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc
index c6c0fa3c0019e..36ad2d74869c6 100644
--- a/paddle/fluid/operators/trace_op.cc
+++ b/paddle/fluid/operators/trace_op.cc
@@ -118,19 +118,16 @@ REGISTER_OPERATOR(trace_grad, ops::TraceGradOp,
                   ops::TraceGradNoNeedBufferVarsInferer);
 
 /* ==========================  register checkpoint ===========================*/
-REGISTER_OP_VERSION(trace)
-    .AddCheckpoint(
-        R"ROC(Upgrade trace add a new attribute [axis2])ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr("axis1",
-                     "The added attribute 'axis1' is not yet registered.",
-                     std::vector<float>{0.0f})
-            .NewAttr("axis2",
-                     "The added attribute 'axis2' is not yet registered.",
-                     std::vector<float>{1.0f})
-            .DeleteAttr("dim1",
-                        "The attribute 'dim1' is not recommend according to "
-                        "the specification 2.0.")
-            .DeleteAttr("dim2",
-                        "The attribute 'dim2' is not recommend according to "
-                        "the specification 2.0."));
+REGISTER_OP_VERSION(trace).AddCheckpoint(
+    R"ROC(Upgrade trace add a new attribute [axis2])ROC",
+    paddle::framework::compatible::OpVersionDesc()
+        .NewAttr("axis1", "The added attribute 'axis1' is not yet registered.",
+                 std::vector<float>{0.0f})
+        .NewAttr("axis2", "The added attribute 'axis2' is not yet registered.",
+                 std::vector<float>{1.0f})
+        .DeleteAttr("dim1",
+                    "The attribute 'dim1' is not recommend according to "
+                    "the specification 2.0.")
+        .DeleteAttr("dim2",
+                    "The attribute 'dim2' is not recommend according to "
+                    "the specification 2.0."));
diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc
index f26bcdca4a7b3..3b55631900d30 100644
--- a/paddle/fluid/operators/transfer_layout_op.cc
+++ b/paddle/fluid/operators/transfer_layout_op.cc
@@ -146,7 +146,7 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float,
                                ops::TransferLayoutKernel);
 REGISTER_OP_VERSION(transfer_layout)
-    .AddCheckpoint(
-        R"ROC(refine transfer_layout, add src_layout attribute)ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "src_layout", "(int, the layout of the input tensor", -1));
+    .AddCheckpoint(R"ROC(refine transfer_layout, add src_layout attribute)ROC",
+                   paddle::framework::compatible::OpVersionDesc().NewAttr(
+                       "src_layout", "(int, the layout of the input tensor",
+                       -1));
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index a45d32b34b983..4eceb69e8ce45 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/transpose_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h
index a31ac28c9910c..40a967b11f7a9 100644
--- a/paddle/fluid/operators/transpose_op.cu.h
+++ b/paddle/fluid/operators/transpose_op.cu.h
@@ -96,12 +96,15 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims,
   int x = threadIdx.x;
 
   Dim3 output_dims = {
-      input_dims[0], input_dims[2], input_dims[1],
+      input_dims[0],
+      input_dims[2],
+      input_dims[1],
   };
 
   // Align dim to Tiles
   Dim3 tile_aligned_input_dim = {
-      input_dims[0], (input_dims[1] + TileX - 1) / TileX,
+      input_dims[0],
+      (input_dims[1] + TileX - 1) / TileX,
       (input_dims[2] + TileY - 1) / TileY,
   };
 
@@ -111,7 +114,8 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims,
 
   // Compute real index align to tile:0, 32, 64...
   Index3 block_tile_index_in_input = {
-      input_block_tile_index[0], input_block_tile_index[1] * TileX,
+      input_block_tile_index[0],
+      input_block_tile_index[1] * TileX,
       input_block_tile_index[2] * TileY,
   };
 
@@ -165,12 +169,14 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims,
 
   // Store sm value back to out
   Index3 output_block_tile_index = {
-      input_block_tile_index[0], input_block_tile_index[2],
+      input_block_tile_index[0],
+      input_block_tile_index[2],
       input_block_tile_index[1],
   };
 
   Index3 block_tile_index_in_output = {
-      output_block_tile_index[0], output_block_tile_index[1] * TileY,
+      output_block_tile_index[0],
+      output_block_tile_index[1] * TileY,
       output_block_tile_index[2] * TileX,
   };
 
@@ -265,15 +271,13 @@ void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i,
                                       T* output) {
   constexpr int NumThreads = tile_long;
   if (tile_size_i <= tile_long && tile_size_j <= tile_short) {
-    TilingSwapDim1And2<
-        T, NumThreads, tile_long,
-        tile_short><<<total_tiles_count, NumThreads, 0, d.stream()>>>(
-        input, input_dims, output);
+    TilingSwapDim1And2<T, NumThreads, tile_long, tile_short>
+        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
+                                                           output);
   } else {
-    TilingSwapDim1And2<
-        T, NumThreads, tile_short,
-        tile_long><<<total_tiles_count, NumThreads, 0, d.stream()>>>(
-        input, input_dims, output);
+    TilingSwapDim1And2<T, NumThreads, tile_short, tile_long>
+        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
+                                                           output);
   }
 }
 
@@ -392,10 +396,10 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input,
     // data may not aligned to tile, so some threads wasted, we need
     // to find least wasted threads, which means we need to find tile
     // can split input properly, in another words: num_wasted_threads=0.
-    int num_wasted_threads = input_long_edge -
-                             framework::CeilOrFloor<int, false>(
-                                 input_long_edge, proposed_tile_long_edge) *
-                                 proposed_tile_long_edge;
+    int num_wasted_threads =
+        input_long_edge - framework::CeilOrFloor<int, false>(
+                              input_long_edge, proposed_tile_long_edge) *
+                              proposed_tile_long_edge;
 
     int num_full_tiles = framework::CeilOrFloor<int, false>(
         input_long_edge, proposed_tile_long_edge);
@@ -499,10 +503,9 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
     int total_tiles_count =
         input_dims_aligned[0] * input_dims_aligned[1] * input_dims_aligned[2];
 
-    TilingSwapDim1And2<
-        T, kNumThreads, kTileSize,
-        kTileSize><<<total_tiles_count, kNumThreads, 0, d.stream()>>>(
-        input, input_dims, output);
+    TilingSwapDim1And2<T, kNumThreads, kTileSize, kTileSize>
+        <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
+                                                            output);
 
   } else if (narrow_tile) {
     // If input shape is like Rect, such as 2X100, use Narrow tile size.
@@ -513,9 +516,9 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input,
     // If input shape is small, such as 8X8, just do simple copy
     int total_elements = input_dims[0] * input_dims[1] * input_dims[2];
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements);
-    TransposeSimpleKernel<T, 0, 2, 1><<<
-        config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>(
-        total_elements, input, input_dims, output);
+    TransposeSimpleKernel<T, 0, 2, 1>
+        <<<config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>(
+            total_elements, input, input_dims, output);
   }
 }
 
@@ -543,9 +546,9 @@ struct SwapDim0And2InTranspose {
     size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2];
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size);
 
-    TransposeSimpleKernel<T, 2, 1, 0><<<
-        config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>(
-        total_size, in, input_dims, out);
+    TransposeSimpleKernel<T, 2, 1, 0>
+        <<<config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>(
+            total_size, in, input_dims, out);
   }
 };
 
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index a9e4876cc82a4..891aa312f69ff 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/transpose_op_mlu.cc b/paddle/fluid/operators/transpose_op_mlu.cc
index 40cb22bab50ec..38f6114e48d3f 100644
--- a/paddle/fluid/operators/transpose_op_mlu.cc
+++ b/paddle/fluid/operators/transpose_op_mlu.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/transpose_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/transpose_op_xpu.cc b/paddle/fluid/operators/transpose_op_xpu.cc
index 00a43c74d8736..32b303238ab81 100644
--- a/paddle/fluid/operators/transpose_op_xpu.cc
+++ b/paddle/fluid/operators/transpose_op_xpu.cc
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/transpose_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h
index afe5379dc3f2a..8c479076175dd 100644
--- a/paddle/fluid/operators/tree_conv_op.h
+++ b/paddle/fluid/operators/tree_conv_op.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <iostream>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/tree2col.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/fluid/operators/tril_indices_op.cc b/paddle/fluid/operators/tril_indices_op.cc
index be42f53dd2344..63b5c1a2431ce 100644
--- a/paddle/fluid/operators/tril_indices_op.cc
+++ b/paddle/fluid/operators/tril_indices_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc
index b941fa3d03ae1..8ca83ed881099 100644
--- a/paddle/fluid/operators/tril_triu_op.cc
+++ b/paddle/fluid/operators/tril_triu_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
index dc5a66dce16d6..21e2061e73b6c 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
+
 #include <limits>
 #include <random>
 #include <vector>
@@ -19,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
 #include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
index 261d9cee2d5cd..363d909d84dcf 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc
index 803b61fbe813f..45a4b6a3bab7e 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc
@@ -14,11 +14,12 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
 #include <limits>
 #include <random>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/unbind_op.cc b/paddle/fluid/operators/unbind_op.cc
index f2fc08308c6b3..739fc98f3f086 100644
--- a/paddle/fluid/operators/unbind_op.cc
+++ b/paddle/fluid/operators/unbind_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/unbind_op.h"
+
 #include <string>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h
index 6e35f262de420..8e6cd391578c7 100644
--- a/paddle/fluid/operators/unbind_op.h
+++ b/paddle/fluid/operators/unbind_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index 3e27402c86947..a988c6843893c 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -16,10 +16,12 @@
 #include <algorithm>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/random.h>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
@@ -102,8 +104,9 @@ inline std::vector<int64_t> GetNewDataFromShapeTensorList(
           "Expected dtype of ShapeTensorList of %d-th must be int32, int64. "
           "But got "
           "unsupport dtype: %s.",
-          i, paddle::framework::DataTypeToString(
-                 framework::TransToProtoVarType(tensor->dtype()))));
+          i,
+          paddle::framework::DataTypeToString(
+              framework::TransToProtoVarType(tensor->dtype()))));
     }
   }
 
diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc
index 2c5f13f5a9307..fdf1252eb0ded 100644
--- a/paddle/fluid/operators/uniform_random_op_mlu.cc
+++ b/paddle/fluid/operators/uniform_random_op_mlu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/uniform_random_op.h"
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/uniform_random_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc
index ae2adf834194d..23d0f61c2bd1d 100644
--- a/paddle/fluid/operators/uniform_random_op_xpu.cc
+++ b/paddle/fluid/operators/uniform_random_op_xpu.cc
@@ -14,11 +14,12 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/uniform_random_op.h"
 #include <string>
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/uniform_random_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc
index 24ef3a85ee2ce..567f7bac34be7 100644
--- a/paddle/fluid/operators/unique_consecutive_op.cc
+++ b/paddle/fluid/operators/unique_consecutive_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/unique_consecutive_op.h"
+
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/fluid/operators/unique_consecutive_op.cu
index fbffb01ed19b6..9db14e82b25b1 100644
--- a/paddle/fluid/operators/unique_consecutive_op.cu
+++ b/paddle/fluid/operators/unique_consecutive_op.cu
@@ -18,8 +18,10 @@ limitations under the License. */
 #include <thrust/scatter.h>
 #include <thrust/sequence.h>
 #include <thrust/unique.h>
+
 #include <iostream>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor_util.h"            // TensorToVector()
 #include "paddle/fluid/operators/unique_consecutive_op.h"  // TransComute()
 
diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h
index b31c2aa67a587..4dc1871b5d140 100644
--- a/paddle/fluid/operators/unique_consecutive_op.h
+++ b/paddle/fluid/operators/unique_consecutive_op.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/transpose_op.h"
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc
index 5c103e088b559..fbbd562c1b8a2 100644
--- a/paddle/fluid/operators/unique_op.cc
+++ b/paddle/fluid/operators/unique_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/unique_op.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h
index 01439d2182464..d59e6590a88f3 100644
--- a/paddle/fluid/operators/unique_op.h
+++ b/paddle/fluid/operators/unique_op.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/transpose_op.h"
diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h
index af8bfe813a6b0..227fdef222432 100644
--- a/paddle/fluid/operators/unique_with_counts_op.h
+++ b/paddle/fluid/operators/unique_with_counts_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/unique_op.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 1be8f3387dbad..62aa990ca7bc8 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -4,533 +4,569 @@
 # Generally, the combination rules in this file do not need to be modified.
 # If there are some redefined error in compiling with the source file which
 # in combination rule, you can remove the source file from the following rules.
-register_unity_group(cc
-    abs_op.cc
-    add_position_encoding_op.cc
-    addmm_op.cc
-    affine_channel_op.cc
-    affine_grid_op.cc
-    allclose_op.cc
-    argsort_op.cc
-    array_to_lod_tensor_op.cc
-    assert_op.cc
-    assign_op.cc
-    assign_value_op.cc
-    attention_lstm_op.cc
-    average_accumulates_op.cc
-    batch_fc_op.cc
-    bce_loss_op.cc
-    beam_search_op.cc
-    beam_search_decode_op.cc
-    bernoulli_op.cc
-    bilateral_slice_op.cc)
-register_unity_group(cc
-    mkldnn/batch_norm_mkldnn_op.cc
-    bilinear_tensor_product_op.cc
-    bmm_op.cc
-    bpr_loss_op.cc
-    cast_op.cc
-    mkldnn/cast_mkldnn_op.cc
-    cholesky_op.cc
-    chunk_eval_op.cc
-    clip_by_norm_op.cc
-    clip_op.cc
-    coalesce_tensor_op.cc
-    mkldnn/activation_mkldnn_op.cc
-    mkldnn/interpolate_mkldnn_op.cc
-    mkldnn/pool_mkldnn_op.cc
-    mkldnn/softmax_mkldnn_op.cc)
-register_unity_group(cc
-    center_loss_op.cc
-    mkldnn/concat_mkldnn_op.cc
-    mkldnn/conv_mkldnn_op.cc
-    mkldnn/conv_transpose_mkldnn_op.cc
-    correlation_op.cc
-    cos_sim_op.cc
-    crf_decoding_op.cc
-    crop_op.cc
-    ascend_trigger_op.cc
-    conj_op.cc
-    imag_op.cc
-    kldiv_loss_op.cc
-    memcpy_op.cc)
-register_unity_group(cc
-    cross_entropy_op.cc
-    cross_op.cc
-    ctc_align_op.cc
-    cudnn_lstm_op.cc
-    cumsum_op.cc
-    cvm_op.cc
-    data_norm_op.cc
-    deformable_conv_op.cc
-    deformable_conv_v1_op.cc
-    deformable_psroi_pooling_op.cc
-    delete_var_op.cc
-    dequantize_abs_max_op.cc
-    dequantize_op.cc
-    mkldnn/dequantize_mkldnn_op.cc)
-register_unity_group(cc
-    dequeue_op.cc
-    detection_map_op.cc
-    dgc_clip_by_norm_op.cc
-    diag_embed_op.cc
-    diag_op.cc
-    diag_v2_op.cc
-    dot_op.cc
-    edit_distance_op.cc
-    empty_op.cc
-    enqueue_op.cc
-    erf_op.cc
-    py_func_op.cc
-    real_op.cc
-    sync_batch_norm_op.cc
-    top_k_op.cc
-    conv_op.cc
-    conv_transpose_op.cc
-    gru_unit_op.cc)
-register_unity_group(cc
-    expand_v2_op.cc
-    fake_dequantize_op.cc
-    fc_op.cc
-    mkldnn/fc_mkldnn_op.cc
-    fill_any_like_op.cc
-    fill_constant_batch_size_like_op.cc
-    fill_constant_op.cc
-    fill_op.cc
-    fill_zeros_like_op.cc
-    filter_by_instag_op.cc)
-register_unity_group(cc
-    flatten_op.cc
-    flip_op.cc
-    fsp_op.cc
-    gather_nd_op.cc
-    gather_op.cc
-    gather_tree_op.cc
-    gaussian_random_batch_size_like_op.cc
-    gaussian_random_op.cc
-    mkldnn/gaussian_random_mkldnn_op.cc
-    group_norm_op.cc gru_op.cc)
-register_unity_group(cc
-    hash_op.cc
-    hierarchical_sigmoid_op.cc
-    hinge_loss_op.cc
-    histogram_op.cc
-    huber_loss_op.cc
-    im2sequence_op.cc
-    increment_op.cc
-    index_sample_op.cc
-    index_select_op.cc
-    interpolate_op.cc
-    isfinite_v2_op.cc)
-register_unity_group(cc
-    inplace_abn_op.cc
-    interpolate_v2_op.cc
-    inverse_op.cc
-    is_empty_op.cc
-    isfinite_op.cc
-    kron_op.cc
-    l1_norm_op.cc
-    label_smooth_op.cc
-    layer_norm_op.cc
-    mkldnn/layer_norm_mkldnn_op.cc
-    mkldnn/layer_norm_mkldnn_op.cc
-    linspace_op.cc
-    load_combine_op.cc
-    load_op.cc)
-register_unity_group(cc
-    lod_array_length_op.cc
-    lod_rank_table_op.cc
-    lod_reset_op.cc
-    lod_tensor_to_array_op.cc
-    log_softmax_op.cc
-    lookup_table_dequant_op.cc
-    lrn_op.cc
-    mkldnn/lrn_mkldnn_op.cc
-    lstm_unit_op.cc
-    lstmp_op.cc)
-register_unity_group(cc
-    log_loss_op.cc
-    lookup_table_v2_op.cc
-    margin_rank_loss_op.cc
-    masked_select_op.cc
-    match_matrix_tensor_op.cc
-    matmul_op.cc
-    mkldnn/matmul_mkldnn_op.cc
-    max_sequence_len_op.cc
-    maxout_op.cc
-    merge_lod_tensor_op.cc
-    merge_selected_rows_op.cc
-    meshgrid_op.cc)
-register_unity_group(cc
-    concat_op.cc
-    conv_shift_op.cc
-    dequantize_log_op.cc
-    dropout_op.cc
-    expand_op.cc
-    fake_quantize_op.cc
-    gelu_op.cc
-    get_tensor_from_selected_rows_op.cc
-    lookup_table_op.cc
-    matmul_v2_op.cc)
-register_unity_group(cc
-    mean_iou_op.cc
-    mean_op.cc
-    minus_op.cc
-    mish_op.cc
-    mul_op.cc
-    multinomial_op.cc
-    multiplex_op.cc
-    mv_op.cc
-    nce_op.cc
-    nll_loss_op.cc
-    norm_op.cc
-    one_hot_op.cc
-    one_hot_v2_op.cc
-    pad2d_op.cc
-    pad3d_op.cc
-    pad_constant_like_op.cc
-    pad_op.cc)
-register_unity_group(cc
-    modified_huber_loss_op.cc
-    partial_sum_op.cc
-    pixel_shuffle_op.cc
-    pool_op.cc
-    pool_with_index_op.cc
-    positive_negative_pair_op.cc
-    prelu_op.cc
-    print_op.cc
-    prroi_pool_op.cc
-    psroi_pool_op.cc
-    pull_box_extended_sparse_op.cc
-    pull_box_sparse_op.cc
-    pull_sparse_op.cc
-    pull_sparse_v2_op.cc)
-register_unity_group(cc
-    push_dense_op.cc
-    quantize_op.cc
-    mkldnn/quantize_mkldnn_op.cc
-    queue_generator_op.cc
-    randint_op.cc
-    random_crop_op.cc
-    randperm_op.cc
-    range_op.cc
-    rank_attention_op.cc
-    rank_loss_op.cc
-    recurrent_op.cc
-    reorder_lod_tensor_by_rank_op.cc
-    requantize_op.cc
-    mkldnn/requantize_mkldnn_op.cc
-    reshape_op.cc
-    reverse_op.cc)
-register_unity_group(cc
-    rnn_memory_helper_op.cc
-    roi_align_op.cc
-    roll_op.cc
-    run_program_op.cc
-    sample_logits_op.cc
-    sampling_id_op.cc
-    save_combine_op.cc
-    save_op.cc
-    scale_op.cc
-    mkldnn/scale_mkldnn_op.cc
-    scatter_nd_add_op.cc
-    scatter_op.cc
-    seed_op.cc
-    select_input_op.cc
-    select_output_op.cc)
-register_unity_group(cc
-    roi_pool_op.cc
-    selu_op.cc
-    shape_op.cc
-    shard_index_op.cc
-    shrink_rnn_memory_op.cc
-    shuffle_batch_op.cc
-    shuffle_channel_op.cc
-    sigmoid_cross_entropy_with_logits_op.cc
-    sign_op.cc
-    similarity_focus_op.cc
-    size_op.cc
-    slice_op.cc
-    softmax_op.cc)
-register_unity_group(cc
-    space_to_depth_op.cc
-    spectral_norm_op.cc
-    split_lod_tensor_op.cc
-    split_op.cc
-    split_selected_rows_op.cc
-    spp_op.cc
-    squared_l2_norm_op.cc
-    squeeze_op.cc
-    stack_op.cc
-    strided_slice_op.cc
-    sum_op.cc
-    mkldnn/sum_mkldnn_op.cc
-    tdm_child_op.cc
-    tdm_sampler_op.cc
-    teacher_student_sigmoid_loss_op.cc
-    temporal_shift_op.cc)
-register_unity_group(cc
-    row_conv_op.cc
-    tensor_array_to_tensor_op.cc
-    tile_op.cc
-    top_k_v2_op.cc
-    trace_op.cc
-    transpose_op.cc
-    mkldnn/transpose_mkldnn_op.cc
-    tree_conv_op.cc
-    tril_triu_op.cc
-    truncated_gaussian_random_op.cc
-    unbind_op.cc
-    unfold_op.cc)
-register_unity_group(cc
-    smooth_l1_loss_op.cc
-    uniform_random_batch_size_like_op.cc
-    uniform_random_op.cc
-    unique_op.cc
-    unique_with_counts_op.cc
-    unpool_op.cc
-    unsqueeze_op.cc
-    unstack_op.cc
-    var_conv_2d_op.cc
-    where_index_op.cc
-    where_op.cc)
-register_unity_group(cc
-    affine_grid_cudnn_op.cu.cc
-    beam_search_op.cu.cc
-    cudnn_lstm_op.cu.cc
-    empty_op.cu.cc
-    fc_op.cu.cc
-    fill_constant_batch_size_like_op.cu.cc
-    fill_constant_op.cu.cc
-    fill_op.cu.cc
-    fill_zeros_like_op.cu.cc
-    flatten_op.cu.cc
-    grid_sampler_cudnn_op.cu.cc
-    gru_op.cu.cc
-    inverse_op.cu.cc
-    is_empty_op.cu.cc
-    maxout_op.cu.cc
-    mul_op.cu.cc
-    concat_op.cu.cc
-    mul_op.cu.cc
-    pool_op.cu.cc
-    pool_cudnn_op.cu.cc
-    pool_with_index_op.cu.cc
-    run_program_op.cu.cc
-    softmax_op.cu.cc
-    softmax_cudnn_op.cu.cc
-    spp_op.cu.cc
-    squeeze_op.cu.cc
-    unbind_op.cu.cc
-    unpool_op.cu.cc
-    unsqueeze_op.cu.cc)
-register_unity_group(cc
-    arg_max_op.cc
-    arg_min_op.cc
-    squared_l2_distance_op.cc)
-register_unity_group(cc
-    linear_chain_crf_op.cc
-    lstm_op.cc
-    partial_concat_op.cc
-    pyramid_hash_op.cc
-    recurrent_op.cc
-    run_program_op.cc
-    softmax_with_cross_entropy_op.cc
-    warpctc_op.cc)
-register_unity_group(cc
-    conv_op.cu.cc
-    lstm_op.cu.cc
-    rnn_op.cu.cc
-    split_op.cu.cc
-    activation_cudnn_op.cu.cc
-    assign_value_op.cu.cc
-    merge_selected_rows_op.cu.cc
-    run_program_op.cu.cc
-    warpctc_op.cu.cc)
-register_unity_group(cu
-    addmm_op.cu
-    affine_channel_op.cu
-    allclose_op.cu
-    assign_value_op.cu
-    bce_loss_op.cu
-    bernoulli_op.cu
-    bilateral_slice_op.cu
-    batch_norm_op.cu)
-register_unity_group(cu
-    bilinear_tensor_product_op.cu
-    bmm_op.cu
-    cast_op.cu
-    cholesky_op.cu
-    clip_by_norm_op.cu
-    clip_op.cu
-    conv_cudnn_op.cu
-    affine_grid_op.cu)
-register_unity_group(cu
-    center_loss_op.cu
-    conv_op.cu
-    conv_transpose_cudnn_op.cu
-    conv_transpose_op.cu
-    cos_sim_op.cu
-    crop_op.cu
-    average_accumulates_op.cu
-    conj_op.cu
-    correlation_op.cu)
-register_unity_group(cu
-    cross_entropy_op.cu
-    cross_op.cu
-    ctc_align_op.cu
-    cumsum_op.cu
-    cvm_op.cu
-    data_norm_op.cu
-    deformable_conv_op.cu
-    deformable_conv_v1_op.cu
-    dequantize_abs_max_op.cu)
-register_unity_group(cu
-    dgc_clip_by_norm_op.cu
-    diag_embed_op.cu
-    diag_op.cu
-    diag_v2_op.cu
-    edit_distance_op.cu
-    erf_op.cu
-    meshgrid_op.cu
-    imag_op.cu)
-register_unity_group(cu
-    expand_v2_op.cu
-    fake_dequantize_op.cu
-    fill_any_like_op.cu)
-register_unity_group(cu
-    flip_op.cu
-    fsp_op.cu
-    gather_nd_op.cu
-    gather_op.cu
-    gather_tree_op.cu
-    gaussian_random_op.cu
-    grid_sampler_op.cu
-    group_norm_op.cu)
-register_unity_group(cu
-    hinge_loss_op.cu
-    histogram_op.cu
-    huber_loss_op.cu
-    im2sequence_op.cu
-    increment_op.cu
-    index_sample_op.cu
-    index_select_op.cu
-    interpolate_op.cu
-    isfinite_v2_op.cu)
-register_unity_group(cu
-    inplace_abn_op.cu
-    interpolate_v2_op.cu
-    isfinite_op.cu
-    l1_norm_op.cu
-    label_smooth_op.cu
-    linspace_op.cu
-    load_combine_op.cu
-    load_op.cu)
-register_unity_group(cu
-    lod_reset_op.cu
-    log_softmax_op.cu
-    lrn_op.cu
-    lstm_unit_op.cu
-    dot_op.cu
-    psroi_pool_op.cu
-    rank_loss_op.cu
-    real_op.cu)
-register_unity_group(cu
-    log_loss_op.cu
-    lookup_table_v2_op.cu
-    margin_rank_loss_op.cu
-    masked_select_op.cu
-    merge_selected_rows_op.cu
-    lstmp_op.cu
-    shuffle_channel_op.cu
-    softmax_cudnn_op.cu
-    squared_l2_distance_op.cu)
-register_unity_group(cu
-    conv_shift_op.cu
-    dequantize_log_op.cu
-    dropout_op.cu
-    fake_quantize_op.cu
-    gelu_op.cu
-    lookup_table_op.cu
-    sigmoid_cross_entropy_with_logits_op.cu
-    softmax_with_cross_entropy_op.cu)
-register_unity_group(cu
-    mean_iou_op.cu
-    mean_op.cu
-    minus_op.cu
-    mish_op.cu
-    multinomial_op.cu
-    multiplex_op.cu
-    mv_op.cu
-    nll_loss_op.cu
-    norm_op.cu
-    one_hot_op.cu
-    pad2d_op.cu
-    pad3d_op.cu
-    pad_constant_like_op.cu
-    pad_op.cu)
-register_unity_group(cu
-    partial_sum_op.cu
-    pixel_shuffle_op.cu
-    prelu_op.cu
-    prroi_pool_op.cu
-    pull_box_extended_sparse_op.cu
-    pull_box_sparse_op.cu)
-register_unity_group(cu
-    randint_op.cu
-    random_crop_op.cu
-    randperm_op.cu
-    range_op.cu
-    reverse_op.cu
-    partial_concat_op.cu
-    kldiv_loss_op.cu
-    instance_norm_op.cu)
-register_unity_group(cu
-    roi_align_op.cu
-    roll_op.cu
-    sample_logits_op.cu
-    sampling_id_op.cu
-    save_combine_op.cu
-    save_op.cu
-    scale_op.cu
-    scatter_nd_add_op.cu
-    scatter_op.cu
-    seed_op.cu)
-register_unity_group(cu
-    roi_pool_op.cu
-    selu_op.cu
-    shape_op.cu
-    shard_index_op.cu
-    sign_op.cu
-    size_op.cu
-    slice_op.cu)
-register_unity_group(cu
-    space_to_depth_op.cu
-    spectral_norm_op.cu
-    split_op.cu
-    split_selected_rows_op.cu
-    squared_l2_norm_op.cu
-    sum_op.cu
-    temporal_shift_op.cu
-    arg_max_op.cu)
-register_unity_group(cu
-    row_conv_op.cu
-    tree_conv_op.cu
-    tril_triu_op.cu
-    truncated_gaussian_random_op.cu
-    unfold_op.cu
-    arg_min_op.cu
-    crop_tensor_op.cu)
-register_unity_group(cu
-    smooth_l1_loss_op.cu
-    uniform_random_op.cu
-    unstack_op.cu
-    where_index_op.cu
-    where_op.cu
-    layer_norm_op.cu)
-register_unity_group(cu
-    expand_as_op.cu
-    stack_op.cu)
+register_unity_group(
+  cc
+  abs_op.cc
+  add_position_encoding_op.cc
+  addmm_op.cc
+  affine_channel_op.cc
+  affine_grid_op.cc
+  allclose_op.cc
+  argsort_op.cc
+  array_to_lod_tensor_op.cc
+  assert_op.cc
+  assign_op.cc
+  assign_value_op.cc
+  attention_lstm_op.cc
+  average_accumulates_op.cc
+  batch_fc_op.cc
+  bce_loss_op.cc
+  beam_search_op.cc
+  beam_search_decode_op.cc
+  bernoulli_op.cc
+  bilateral_slice_op.cc)
+register_unity_group(
+  cc
+  mkldnn/batch_norm_mkldnn_op.cc
+  bilinear_tensor_product_op.cc
+  bmm_op.cc
+  bpr_loss_op.cc
+  cast_op.cc
+  mkldnn/cast_mkldnn_op.cc
+  cholesky_op.cc
+  chunk_eval_op.cc
+  clip_by_norm_op.cc
+  clip_op.cc
+  coalesce_tensor_op.cc
+  mkldnn/activation_mkldnn_op.cc
+  mkldnn/interpolate_mkldnn_op.cc
+  mkldnn/pool_mkldnn_op.cc
+  mkldnn/softmax_mkldnn_op.cc)
+register_unity_group(
+  cc
+  center_loss_op.cc
+  mkldnn/concat_mkldnn_op.cc
+  mkldnn/conv_mkldnn_op.cc
+  mkldnn/conv_transpose_mkldnn_op.cc
+  correlation_op.cc
+  cos_sim_op.cc
+  crf_decoding_op.cc
+  crop_op.cc
+  ascend_trigger_op.cc
+  conj_op.cc
+  imag_op.cc
+  kldiv_loss_op.cc
+  memcpy_op.cc)
+register_unity_group(
+  cc
+  cross_entropy_op.cc
+  cross_op.cc
+  ctc_align_op.cc
+  cudnn_lstm_op.cc
+  cumsum_op.cc
+  cvm_op.cc
+  data_norm_op.cc
+  deformable_conv_op.cc
+  deformable_conv_v1_op.cc
+  deformable_psroi_pooling_op.cc
+  delete_var_op.cc
+  dequantize_abs_max_op.cc
+  dequantize_op.cc
+  mkldnn/dequantize_mkldnn_op.cc)
+register_unity_group(
+  cc
+  dequeue_op.cc
+  detection_map_op.cc
+  dgc_clip_by_norm_op.cc
+  diag_embed_op.cc
+  diag_op.cc
+  diag_v2_op.cc
+  dot_op.cc
+  edit_distance_op.cc
+  empty_op.cc
+  enqueue_op.cc
+  erf_op.cc
+  py_func_op.cc
+  real_op.cc
+  sync_batch_norm_op.cc
+  top_k_op.cc
+  conv_op.cc
+  conv_transpose_op.cc
+  gru_unit_op.cc)
+register_unity_group(
+  cc
+  expand_v2_op.cc
+  fake_dequantize_op.cc
+  fc_op.cc
+  mkldnn/fc_mkldnn_op.cc
+  fill_any_like_op.cc
+  fill_constant_batch_size_like_op.cc
+  fill_constant_op.cc
+  fill_op.cc
+  fill_zeros_like_op.cc
+  filter_by_instag_op.cc)
+register_unity_group(
+  cc
+  flatten_op.cc
+  flip_op.cc
+  fsp_op.cc
+  gather_nd_op.cc
+  gather_op.cc
+  gather_tree_op.cc
+  gaussian_random_batch_size_like_op.cc
+  gaussian_random_op.cc
+  mkldnn/gaussian_random_mkldnn_op.cc
+  group_norm_op.cc
+  gru_op.cc)
+register_unity_group(
+  cc
+  hash_op.cc
+  hierarchical_sigmoid_op.cc
+  hinge_loss_op.cc
+  histogram_op.cc
+  huber_loss_op.cc
+  im2sequence_op.cc
+  increment_op.cc
+  index_sample_op.cc
+  index_select_op.cc
+  interpolate_op.cc
+  isfinite_v2_op.cc)
+register_unity_group(
+  cc
+  inplace_abn_op.cc
+  interpolate_v2_op.cc
+  inverse_op.cc
+  is_empty_op.cc
+  isfinite_op.cc
+  kron_op.cc
+  l1_norm_op.cc
+  label_smooth_op.cc
+  layer_norm_op.cc
+  mkldnn/layer_norm_mkldnn_op.cc
+  mkldnn/layer_norm_mkldnn_op.cc
+  linspace_op.cc
+  load_combine_op.cc
+  load_op.cc)
+register_unity_group(
+  cc
+  lod_array_length_op.cc
+  lod_rank_table_op.cc
+  lod_reset_op.cc
+  lod_tensor_to_array_op.cc
+  log_softmax_op.cc
+  lookup_table_dequant_op.cc
+  lrn_op.cc
+  mkldnn/lrn_mkldnn_op.cc
+  lstm_unit_op.cc
+  lstmp_op.cc)
+register_unity_group(
+  cc
+  log_loss_op.cc
+  lookup_table_v2_op.cc
+  margin_rank_loss_op.cc
+  masked_select_op.cc
+  match_matrix_tensor_op.cc
+  matmul_op.cc
+  mkldnn/matmul_mkldnn_op.cc
+  max_sequence_len_op.cc
+  maxout_op.cc
+  merge_lod_tensor_op.cc
+  merge_selected_rows_op.cc
+  meshgrid_op.cc)
+register_unity_group(
+  cc
+  concat_op.cc
+  conv_shift_op.cc
+  dequantize_log_op.cc
+  dropout_op.cc
+  expand_op.cc
+  fake_quantize_op.cc
+  gelu_op.cc
+  get_tensor_from_selected_rows_op.cc
+  lookup_table_op.cc
+  matmul_v2_op.cc)
+register_unity_group(
+  cc
+  mean_iou_op.cc
+  mean_op.cc
+  minus_op.cc
+  mish_op.cc
+  mul_op.cc
+  multinomial_op.cc
+  multiplex_op.cc
+  mv_op.cc
+  nce_op.cc
+  nll_loss_op.cc
+  norm_op.cc
+  one_hot_op.cc
+  one_hot_v2_op.cc
+  pad2d_op.cc
+  pad3d_op.cc
+  pad_constant_like_op.cc
+  pad_op.cc)
+register_unity_group(
+  cc
+  modified_huber_loss_op.cc
+  partial_sum_op.cc
+  pixel_shuffle_op.cc
+  pool_op.cc
+  pool_with_index_op.cc
+  positive_negative_pair_op.cc
+  prelu_op.cc
+  print_op.cc
+  prroi_pool_op.cc
+  psroi_pool_op.cc
+  pull_box_extended_sparse_op.cc
+  pull_box_sparse_op.cc
+  pull_sparse_op.cc
+  pull_sparse_v2_op.cc)
+register_unity_group(
+  cc
+  push_dense_op.cc
+  quantize_op.cc
+  mkldnn/quantize_mkldnn_op.cc
+  queue_generator_op.cc
+  randint_op.cc
+  random_crop_op.cc
+  randperm_op.cc
+  range_op.cc
+  rank_attention_op.cc
+  rank_loss_op.cc
+  recurrent_op.cc
+  reorder_lod_tensor_by_rank_op.cc
+  requantize_op.cc
+  mkldnn/requantize_mkldnn_op.cc
+  reshape_op.cc
+  reverse_op.cc)
+register_unity_group(
+  cc
+  rnn_memory_helper_op.cc
+  roi_align_op.cc
+  roll_op.cc
+  run_program_op.cc
+  sample_logits_op.cc
+  sampling_id_op.cc
+  save_combine_op.cc
+  save_op.cc
+  scale_op.cc
+  mkldnn/scale_mkldnn_op.cc
+  scatter_nd_add_op.cc
+  scatter_op.cc
+  seed_op.cc
+  select_input_op.cc
+  select_output_op.cc)
+register_unity_group(
+  cc
+  roi_pool_op.cc
+  selu_op.cc
+  shape_op.cc
+  shard_index_op.cc
+  shrink_rnn_memory_op.cc
+  shuffle_batch_op.cc
+  shuffle_channel_op.cc
+  sigmoid_cross_entropy_with_logits_op.cc
+  sign_op.cc
+  similarity_focus_op.cc
+  size_op.cc
+  slice_op.cc
+  softmax_op.cc)
+register_unity_group(
+  cc
+  space_to_depth_op.cc
+  spectral_norm_op.cc
+  split_lod_tensor_op.cc
+  split_op.cc
+  split_selected_rows_op.cc
+  spp_op.cc
+  squared_l2_norm_op.cc
+  squeeze_op.cc
+  stack_op.cc
+  strided_slice_op.cc
+  sum_op.cc
+  mkldnn/sum_mkldnn_op.cc
+  tdm_child_op.cc
+  tdm_sampler_op.cc
+  teacher_student_sigmoid_loss_op.cc
+  temporal_shift_op.cc)
+register_unity_group(
+  cc
+  row_conv_op.cc
+  tensor_array_to_tensor_op.cc
+  tile_op.cc
+  top_k_v2_op.cc
+  trace_op.cc
+  transpose_op.cc
+  mkldnn/transpose_mkldnn_op.cc
+  tree_conv_op.cc
+  tril_triu_op.cc
+  truncated_gaussian_random_op.cc
+  unbind_op.cc
+  unfold_op.cc)
+register_unity_group(
+  cc
+  smooth_l1_loss_op.cc
+  uniform_random_batch_size_like_op.cc
+  uniform_random_op.cc
+  unique_op.cc
+  unique_with_counts_op.cc
+  unpool_op.cc
+  unsqueeze_op.cc
+  unstack_op.cc
+  var_conv_2d_op.cc
+  where_index_op.cc
+  where_op.cc)
+register_unity_group(
+  cc
+  affine_grid_cudnn_op.cu.cc
+  beam_search_op.cu.cc
+  cudnn_lstm_op.cu.cc
+  empty_op.cu.cc
+  fc_op.cu.cc
+  fill_constant_batch_size_like_op.cu.cc
+  fill_constant_op.cu.cc
+  fill_op.cu.cc
+  fill_zeros_like_op.cu.cc
+  flatten_op.cu.cc
+  grid_sampler_cudnn_op.cu.cc
+  gru_op.cu.cc
+  inverse_op.cu.cc
+  is_empty_op.cu.cc
+  maxout_op.cu.cc
+  mul_op.cu.cc
+  concat_op.cu.cc
+  mul_op.cu.cc
+  pool_op.cu.cc
+  pool_cudnn_op.cu.cc
+  pool_with_index_op.cu.cc
+  run_program_op.cu.cc
+  softmax_op.cu.cc
+  softmax_cudnn_op.cu.cc
+  spp_op.cu.cc
+  squeeze_op.cu.cc
+  unbind_op.cu.cc
+  unpool_op.cu.cc
+  unsqueeze_op.cu.cc)
+register_unity_group(cc arg_max_op.cc arg_min_op.cc squared_l2_distance_op.cc)
+register_unity_group(
+  cc
+  linear_chain_crf_op.cc
+  lstm_op.cc
+  partial_concat_op.cc
+  pyramid_hash_op.cc
+  recurrent_op.cc
+  run_program_op.cc
+  softmax_with_cross_entropy_op.cc
+  warpctc_op.cc)
+register_unity_group(
+  cc
+  conv_op.cu.cc
+  lstm_op.cu.cc
+  rnn_op.cu.cc
+  split_op.cu.cc
+  activation_cudnn_op.cu.cc
+  assign_value_op.cu.cc
+  merge_selected_rows_op.cu.cc
+  run_program_op.cu.cc
+  warpctc_op.cu.cc)
+register_unity_group(
+  cu
+  addmm_op.cu
+  affine_channel_op.cu
+  allclose_op.cu
+  assign_value_op.cu
+  bce_loss_op.cu
+  bernoulli_op.cu
+  bilateral_slice_op.cu
+  batch_norm_op.cu)
+register_unity_group(
+  cu
+  bilinear_tensor_product_op.cu
+  bmm_op.cu
+  cast_op.cu
+  cholesky_op.cu
+  clip_by_norm_op.cu
+  clip_op.cu
+  conv_cudnn_op.cu
+  affine_grid_op.cu)
+register_unity_group(
+  cu
+  center_loss_op.cu
+  conv_op.cu
+  conv_transpose_cudnn_op.cu
+  conv_transpose_op.cu
+  cos_sim_op.cu
+  crop_op.cu
+  average_accumulates_op.cu
+  conj_op.cu
+  correlation_op.cu)
+register_unity_group(
+  cu
+  cross_entropy_op.cu
+  cross_op.cu
+  ctc_align_op.cu
+  cumsum_op.cu
+  cvm_op.cu
+  data_norm_op.cu
+  deformable_conv_op.cu
+  deformable_conv_v1_op.cu
+  dequantize_abs_max_op.cu)
+register_unity_group(
+  cu
+  dgc_clip_by_norm_op.cu
+  diag_embed_op.cu
+  diag_op.cu
+  diag_v2_op.cu
+  edit_distance_op.cu
+  erf_op.cu
+  meshgrid_op.cu
+  imag_op.cu)
+register_unity_group(cu expand_v2_op.cu fake_dequantize_op.cu
+                     fill_any_like_op.cu)
+register_unity_group(
+  cu
+  flip_op.cu
+  fsp_op.cu
+  gather_nd_op.cu
+  gather_op.cu
+  gather_tree_op.cu
+  gaussian_random_op.cu
+  grid_sampler_op.cu
+  group_norm_op.cu)
+register_unity_group(
+  cu
+  hinge_loss_op.cu
+  histogram_op.cu
+  huber_loss_op.cu
+  im2sequence_op.cu
+  increment_op.cu
+  index_sample_op.cu
+  index_select_op.cu
+  interpolate_op.cu
+  isfinite_v2_op.cu)
+register_unity_group(
+  cu
+  inplace_abn_op.cu
+  interpolate_v2_op.cu
+  isfinite_op.cu
+  l1_norm_op.cu
+  label_smooth_op.cu
+  linspace_op.cu
+  load_combine_op.cu
+  load_op.cu)
+register_unity_group(
+  cu
+  lod_reset_op.cu
+  log_softmax_op.cu
+  lrn_op.cu
+  lstm_unit_op.cu
+  dot_op.cu
+  psroi_pool_op.cu
+  rank_loss_op.cu
+  real_op.cu)
+register_unity_group(
+  cu
+  log_loss_op.cu
+  lookup_table_v2_op.cu
+  margin_rank_loss_op.cu
+  masked_select_op.cu
+  merge_selected_rows_op.cu
+  lstmp_op.cu
+  shuffle_channel_op.cu
+  softmax_cudnn_op.cu
+  squared_l2_distance_op.cu)
+register_unity_group(
+  cu
+  conv_shift_op.cu
+  dequantize_log_op.cu
+  dropout_op.cu
+  fake_quantize_op.cu
+  gelu_op.cu
+  lookup_table_op.cu
+  sigmoid_cross_entropy_with_logits_op.cu
+  softmax_with_cross_entropy_op.cu)
+register_unity_group(
+  cu
+  mean_iou_op.cu
+  mean_op.cu
+  minus_op.cu
+  mish_op.cu
+  multinomial_op.cu
+  multiplex_op.cu
+  mv_op.cu
+  nll_loss_op.cu
+  norm_op.cu
+  one_hot_op.cu
+  pad2d_op.cu
+  pad3d_op.cu
+  pad_constant_like_op.cu
+  pad_op.cu)
+register_unity_group(
+  cu
+  partial_sum_op.cu
+  pixel_shuffle_op.cu
+  prelu_op.cu
+  prroi_pool_op.cu
+  pull_box_extended_sparse_op.cu
+  pull_box_sparse_op.cu)
+register_unity_group(
+  cu
+  randint_op.cu
+  random_crop_op.cu
+  randperm_op.cu
+  range_op.cu
+  reverse_op.cu
+  partial_concat_op.cu
+  kldiv_loss_op.cu
+  instance_norm_op.cu)
+register_unity_group(
+  cu
+  roi_align_op.cu
+  roll_op.cu
+  sample_logits_op.cu
+  sampling_id_op.cu
+  save_combine_op.cu
+  save_op.cu
+  scale_op.cu
+  scatter_nd_add_op.cu
+  scatter_op.cu
+  seed_op.cu)
+register_unity_group(
+  cu
+  roi_pool_op.cu
+  selu_op.cu
+  shape_op.cu
+  shard_index_op.cu
+  sign_op.cu
+  size_op.cu
+  slice_op.cu)
+register_unity_group(
+  cu
+  space_to_depth_op.cu
+  spectral_norm_op.cu
+  split_op.cu
+  split_selected_rows_op.cu
+  squared_l2_norm_op.cu
+  sum_op.cu
+  temporal_shift_op.cu
+  arg_max_op.cu)
+register_unity_group(
+  cu
+  row_conv_op.cu
+  tree_conv_op.cu
+  tril_triu_op.cu
+  truncated_gaussian_random_op.cu
+  unfold_op.cu
+  arg_min_op.cu
+  crop_tensor_op.cu)
+register_unity_group(
+  cu
+  smooth_l1_loss_op.cu
+  uniform_random_op.cu
+  unstack_op.cu
+  where_index_op.cu
+  where_op.cu
+  layer_norm_op.cu)
+register_unity_group(cu expand_as_op.cu stack_op.cu)
 # The following groups are to make better use of `/MP` which MSVC's parallel
 # compilation instruction when compiling in Unity Build.
 register_unity_group(cu activation_op.cu)
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 36e9d894541b0..b18c4e4de4475 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/unpool_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h
index 35aeb4e0d610e..062008f95ea3c 100644
--- a/paddle/fluid/operators/unpool_op.h
+++ b/paddle/fluid/operators/unpool_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/unpooling.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 445e8cd468bf3..82edcd5a9fcf7 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -101,9 +101,10 @@ class UnsqueezeOp : public framework::OperatorWithKernel {
     for (int axis : unsqz_dims) {
       int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
       // Vaildity Check: the axis bound
-      PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument(
-                                    "The insert dimension value should "
-                                    "not be less than 0"));
+      PADDLE_ENFORCE_GE(
+          cur, 0,
+          platform::errors::InvalidArgument("The insert dimension value should "
+                                            "not be less than 0"));
       PADDLE_ENFORCE_LE(cur, cur_output_size,
                         platform::errors::InvalidArgument(
                             "The insert dimension value shoud not be larger "
diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h
index f6112fb59c122..86038aced3846 100644
--- a/paddle/fluid/operators/unsqueeze_op.h
+++ b/paddle/fluid/operators/unsqueeze_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -72,9 +73,10 @@ class UnsqueezeKernel : public framework::OpKernel<T> {
     for (int axis : unsqz_dims) {
       int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
       // Vaildity Check: the axis bound
-      PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument(
-                                    "The insert dimension value should "
-                                    "not be less than 0"));
+      PADDLE_ENFORCE_GE(
+          cur, 0,
+          platform::errors::InvalidArgument("The insert dimension value should "
+                                            "not be less than 0"));
       PADDLE_ENFORCE_LE(cur, cur_output_size,
                         platform::errors::InvalidArgument(
                             "The insert dimension value shoule not be larger "
diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc
index 8c8684bf4b035..df2325f5dc523 100644
--- a/paddle/fluid/operators/unstack_op.cc
+++ b/paddle/fluid/operators/unstack_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/for_range.h"
diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h
index d84f7b165fd99..009e883ccb642 100644
--- a/paddle/fluid/operators/utils.h
+++ b/paddle/fluid/operators/utils.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <paddle/fluid/framework/operator.h>
+
 #include <string>
 #include <vector>
 
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
index 3dffa0be2e28a..977cd99984ca0 100644
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/var_conv_2d_op.h"
+
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/platform/dynload/mklml.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 24d39c25cf335..247ff43b8a047 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,229 +1,448 @@
-proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
+proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto
+              simple_threadpool)
 if(WITH_GPU)
   proto_library(external_error_proto SRCS external_error.proto)
 endif(WITH_GPU)
-if (WITH_PYTHON)
+if(WITH_PYTHON)
   py_proto_compile(profiler_py_proto SRCS profiler.proto)
-  add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+  add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E
+                                                       touch __init__.py)
   add_dependencies(profiler_py_proto profiler_py_proto_init)
 
-  if (NOT WIN32)
-    add_custom_command(TARGET profiler_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
-        COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
-        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  if(NOT WIN32)
+    add_custom_command(
+      TARGET profiler_py_proto
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory
+              ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+      COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+      COMMENT
+        "Copy generated python proto into directory paddle/fluid/proto/profiler."
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else(NOT WIN32)
-    string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
-    add_custom_command(TARGET profiler_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
-        COMMAND copy /Y *.py ${proto_dstpath}
-        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    string(REPLACE "/" "\\" proto_dstpath
+                   "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
+    add_custom_command(
+      TARGET profiler_py_proto
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E make_directory
+              ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+      COMMAND copy /Y *.py ${proto_dstpath}
+      COMMENT
+        "Copy generated python proto into directory paddle/fluid/proto/profiler."
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif(NOT WIN32)
 endif()
 
-cc_library(flags SRCS flags.cc DEPS gflags boost)
-cc_library(denormal SRCS denormal.cc DEPS)
+cc_library(
+  flags
+  SRCS flags.cc
+  DEPS gflags boost)
+cc_library(
+  denormal
+  SRCS denormal.cc
+  DEPS)
 
-cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
+cc_test(
+  errors_test
+  SRCS errors_test.cc
+  DEPS errors enforce)
 
 set(enforce_deps flags errors boost flags phi_enforce)
 if(WITH_GPU)
   set(enforce_deps ${enforce_deps} external_error_proto)
 endif()
 
-cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps})
+cc_library(
+  enforce INTERFACE
+  SRCS enforce.cc
+  DEPS ${enforce_deps})
 cc_library(monitor SRCS monitor.cc)
-cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
+cc_test(
+  enforce_test
+  SRCS enforce_test.cc
+  DEPS stringpiece enforce)
 
 set(CPU_INFO_DEPS gflags glog enforce)
-IF(WITH_XBYAK)
-    list(APPEND CPU_INFO_DEPS xbyak)
-ENDIF()
-cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
-cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
-cc_library(os_info SRCS os_info.cc DEPS enforce)
-cc_test(os_info_test SRCS os_info_test.cc DEPS os_info)
-
-IF(WITH_GPU)
-    nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
-ELSE()
-    cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade)
-ENDIF()
-
-cc_library(place SRCS place.cc DEPS enforce boost phi_place)
-cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-
-IF(WITH_MKLDNN)
-    set(MKLDNN_CTX_DEPS mkldnn)
-ELSE()
-    set(MKLDNN_CTX_DEPS)
-ENDIF()
+if(WITH_XBYAK)
+  list(APPEND CPU_INFO_DEPS xbyak)
+endif()
+cc_library(
+  cpu_info
+  SRCS cpu_info.cc
+  DEPS ${CPU_INFO_DEPS})
+cc_test(
+  cpu_info_test
+  SRCS cpu_info_test.cc
+  DEPS cpu_info)
+cc_library(
+  os_info
+  SRCS os_info.cc
+  DEPS enforce)
+cc_test(
+  os_info_test
+  SRCS os_info_test.cc
+  DEPS os_info)
+
+if(WITH_GPU)
+  nv_library(
+    cuda_graph_with_memory_pool
+    SRCS cuda_graph_with_memory_pool.cc
+    DEPS device_context allocator_facade cuda_graph)
+else()
+  cc_library(
+    cuda_graph_with_memory_pool
+    SRCS cuda_graph_with_memory_pool.cc
+    DEPS device_context allocator_facade)
+endif()
+
+cc_library(
+  place
+  SRCS place.cc
+  DEPS enforce boost phi_place)
+cc_test(
+  place_test
+  SRCS place_test.cc
+  DEPS place glog gflags)
+
+if(WITH_MKLDNN)
+  set(MKLDNN_CTX_DEPS mkldnn)
+else()
+  set(MKLDNN_CTX_DEPS)
+endif()
 
 add_subdirectory(device)
 add_subdirectory(dynload)
 add_subdirectory(stream)
 
-cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
-cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
+cc_library(
+  cpu_helper
+  SRCS cpu_helper.cc
+  DEPS cblas enforce)
+cc_test(
+  cpu_helper_test
+  SRCS cpu_helper_test.cc
+  DEPS cpu_helper)
 
 set(dgc_deps "")
-IF(WITH_DGC)
-    set(dgc_deps dgc)
-ENDIF()
-
-IF(WITH_GPU OR WITH_ROCM)
-    set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
-ENDIF()
-
-IF(WITH_IPU)
-    set(IPU_CTX_DEPS ipu_info)
-ELSE()
-    set(IPU_CTX_DEPS)
-ENDIF(WITH_IPU)
-
-IF(WITH_ASCEND_CL)
-    set(NPU_CTX_DEPS npu_stream npu_info)
-ENDIF()
-
-IF(WITH_MLU)
-    set(MLU_CTX_DEPS mlu_device_context)
-ENDIF()
-
-IF(WITH_ASCEND_CL OR WITH_MLU)
-cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
-ENDIF()
-
-IF(WITH_GPU)
-    nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
-ENDIF()
-IF(WITH_ROCM)
-    hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
-ENDIF()
-
-IF(WITH_GPU OR WITH_ROCM)
+if(WITH_DGC)
+  set(dgc_deps dgc)
+endif()
+
+if(WITH_GPU OR WITH_ROCM)
+  set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
+endif()
+
+if(WITH_IPU)
+  set(IPU_CTX_DEPS ipu_info)
+else()
+  set(IPU_CTX_DEPS)
+endif(WITH_IPU)
+
+if(WITH_ASCEND_CL)
+  set(NPU_CTX_DEPS npu_stream npu_info)
+endif()
+
+if(WITH_MLU)
+  set(MLU_CTX_DEPS mlu_device_context)
+endif()
+
+if(WITH_ASCEND_CL OR WITH_MLU)
+  cc_library(
+    stream_callback_manager
+    SRCS stream_callback_manager.cc
+    DEPS simple_threadpool enforce)
+endif()
+
+if(WITH_GPU)
+  nv_library(
+    stream_callback_manager
+    SRCS stream_callback_manager.cc
+    DEPS simple_threadpool enforce)
+endif()
+if(WITH_ROCM)
+  hip_library(
+    stream_callback_manager
+    SRCS stream_callback_manager.cc
+    DEPS simple_threadpool enforce)
+endif()
+
+if(WITH_GPU OR WITH_ROCM)
   set(STREAM_CALLBACK_DEPS stream_callback_manager)
-ELSEIF(WITH_ASCEND_CL)
+elseif(WITH_ASCEND_CL)
   set(STREAM_CALLBACK_DEPS stream_callback_manager)
-ELSE()
+else()
   set(STREAM_CALLBACK_DEPS)
-ENDIF()
+endif()
 
 if(WITH_GLOO)
-    cc_library(gloo_context SRCS gloo_context.cc DEPS framework_proto gloo_wrapper enforce)
+  cc_library(
+    gloo_context
+    SRCS gloo_context.cc
+    DEPS framework_proto gloo_wrapper enforce)
 endif()
 
-cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
+cc_library(
+  cudnn_workspace_helper
+  SRCS cudnn_workspace_helper.cc
+  DEPS boost)
 
 # separate init from device_context to avoid cycle dependencies
-cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool)
+cc_library(
+  init
+  SRCS init.cc
+  DEPS device_context custom_kernel context_pool)
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
-    place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
-    ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator)
+cc_library(
+  device_context
+  SRCS device_context.cc
+  DEPS simple_threadpool
+       malloc
+       xxhash
+       ${STREAM_CALLBACK_DEPS}
+       place
+       phi_place
+       eigen3
+       stringpiece
+       cpu_helper
+       cpu_info
+       framework_proto
+       ${IPU_CTX_DEPS}
+       ${GPU_CTX_DEPS}
+       ${NPU_CTX_DEPS}
+       ${MKLDNN_CTX_DEPS}
+       ${dgc_deps}
+       dlpack
+       cudnn_workspace_helper
+       ${XPU_CTX_DEPS}
+       ${MLU_CTX_DEPS}
+       eigen3
+       cpu_context
+       generator)
 if(WITH_XPU)
   target_link_libraries(device_context xpu_context xpu_resource_pool)
 endif()
 
-cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
+cc_library(
+  collective_helper
+  SRCS collective_helper.cc gen_comm_id_helper.cc
+  DEPS framework_proto device_context enforce)
 if(WITH_ASCEND_CL)
-    target_link_libraries(collective_helper npu_collective_helper)
+  target_link_libraries(collective_helper npu_collective_helper)
 endif()
 
 if(WITH_CNCL)
-    target_link_libraries(collective_helper mlu_collective_helper)
+  target_link_libraries(collective_helper mlu_collective_helper)
 endif()
 
 if(WITH_GPU OR WITH_ROCM)
-    target_link_libraries(device_context gpu_info gpu_context phi_gpu_info)
-    target_link_libraries(device_context gpu_resource_pool)
+  target_link_libraries(device_context gpu_info gpu_context phi_gpu_info)
+  target_link_libraries(device_context gpu_resource_pool)
 endif()
-if (WITH_CUSTOM_DEVICE)
-    target_link_libraries(device_context custom_context)
+if(WITH_CUSTOM_DEVICE)
+  target_link_libraries(device_context custom_context)
 endif()
 if(WITH_ASCEND_CL)
-    target_link_libraries(device_context npu_resource_pool)
+  target_link_libraries(device_context npu_resource_pool)
 endif()
 
 if(WITH_MLU)
-    target_link_libraries(device_context mlu_resource_pool)
+  target_link_libraries(device_context mlu_resource_pool)
 endif()
 
 if(WITH_CUSTOM_DEVICE)
-    target_link_libraries(device_context custom_context)
+  target_link_libraries(device_context custom_context)
 endif()
 
-cc_test(init_test SRCS init_test.cc DEPS device_context)
+cc_test(
+  init_test
+  SRCS init_test.cc
+  DEPS device_context)
 
 # Manage all device event library
 set(DEVICE_EVENT_LIBS)
-cc_library(device_event_base SRCS device_event_base.cc DEPS place enforce device_context op_registry)
-set(DEVICE_EVENT_LIBS  device_event_base CACHE INTERNAL "device event libs")
-
+cc_library(
+  device_event_base
+  SRCS device_event_base.cc
+  DEPS place enforce device_context op_registry)
+set(DEVICE_EVENT_LIBS
+    device_event_base
+    CACHE INTERNAL "device event libs")
 
 if(WITH_GPU)
-  nv_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base)
-  set(DEVICE_EVENT_LIBS  device_event_gpu CACHE INTERNAL "device event libs")
-  nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu)
-
-  nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
-  nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+  nv_library(
+    device_event_gpu
+    SRCS device_event_gpu.cc
+    DEPS device_event_base)
+  set(DEVICE_EVENT_LIBS
+      device_event_gpu
+      CACHE INTERNAL "device event libs")
+  nv_test(
+    device_event_test
+    SRCS device_event_test.cc
+    DEPS device_event_gpu)
+
+  nv_test(
+    device_context_test
+    SRCS device_context_test.cu
+    DEPS device_context gpu_info)
+  nv_test(
+    transform_test
+    SRCS transform_test.cu
+    DEPS memory place device_context)
 endif()
 
 if(WITH_ROCM)
-  hip_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base)
-  set(DEVICE_EVENT_LIBS  device_event_gpu CACHE INTERNAL "device event libs")
-  hip_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu)
-
-  hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
-  hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+  hip_library(
+    device_event_gpu
+    SRCS device_event_gpu.cc
+    DEPS device_event_base)
+  set(DEVICE_EVENT_LIBS
+      device_event_gpu
+      CACHE INTERNAL "device event libs")
+  hip_test(
+    device_event_test
+    SRCS device_event_test.cc
+    DEPS device_event_gpu)
+
+  hip_test(
+    device_context_test
+    SRCS device_context_test.cu
+    DEPS device_context gpu_info)
+  hip_test(
+    transform_test
+    SRCS transform_test.cu
+    DEPS memory place device_context)
 endif()
 
 cc_library(timer SRCS timer.cc)
-cc_test(timer_test SRCS timer_test.cc DEPS timer)
-
-cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto)
-cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer)
+cc_test(
+  timer_test
+  SRCS timer_test.cc
+  DEPS timer)
+
+cc_library(
+  lodtensor_printer
+  SRCS lodtensor_printer.cc
+  DEPS ddim
+       place
+       tensor
+       scope
+       lod_tensor
+       variable_helper
+       framework_proto)
+cc_test(
+  lodtensor_printer_test
+  SRCS lodtensor_printer_test.cc
+  DEPS lodtensor_printer)
 
 add_subdirectory(profiler)
 
-cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
+cc_library(
+  device_tracer
+  SRCS device_tracer.cc
+  DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)
-  nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats)
-  nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
+  nv_library(
+    profiler
+    SRCS profiler.cc profiler.cu
+    DEPS os_info
+         device_tracer
+         gpu_info
+         enforce
+         dynload_cuda
+         new_profiler
+         stats)
+  nv_library(
+    device_memory_aligment
+    SRCS device_memory_aligment.cc
+    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
-  hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats)
-  hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
+  hip_library(
+    profiler
+    SRCS profiler.cc profiler.cu
+    DEPS os_info device_tracer gpu_info enforce new_profiler stats)
+  hip_library(
+    device_memory_aligment
+    SRCS device_memory_aligment.cc
+    DEPS cpu_info gpu_info place)
 else()
-  cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats)
-  cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
+  cc_library(
+    profiler
+    SRCS profiler.cc
+    DEPS os_info device_tracer enforce new_profiler stats)
+  cc_library(
+    device_memory_aligment
+    SRCS device_memory_aligment.cc
+    DEPS cpu_info place)
 endif()
 
-cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
-cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
-cc_test(complex_test SRCS complex_test.cc DEPS lod_tensor)
+cc_test(
+  profiler_test
+  SRCS profiler_test.cc
+  DEPS profiler)
+cc_test(
+  float16_test
+  SRCS float16_test.cc
+  DEPS lod_tensor)
+cc_test(
+  bfloat16_test
+  SRCS bfloat16_test.cc
+  DEPS lod_tensor)
+cc_test(
+  complex_test
+  SRCS complex_test.cc
+  DEPS lod_tensor)
 
-IF(WITH_GPU)
-  nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
-  nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor)
-  nv_test(complex_gpu_test SRCS complex_test.cu DEPS lod_tensor)
-  nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
-  nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
-ENDIF()
+if(WITH_GPU)
+  nv_test(
+    float16_gpu_test
+    SRCS float16_test.cu
+    DEPS lod_tensor)
+  nv_test(
+    bfloat16_gpu_test
+    SRCS bfloat16_test.cu
+    DEPS lod_tensor)
+  nv_test(
+    complex_gpu_test
+    SRCS complex_test.cu
+    DEPS lod_tensor)
+  nv_test(
+    test_limit_gpu_memory
+    SRCS test_limit_gpu_memory.cu
+    DEPS gpu_info flags)
+  nv_library(
+    cuda_device_guard
+    SRCS cuda_device_guard.cc
+    DEPS gpu_info)
+endif()
 
-IF(WITH_ROCM)
-  hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
-  hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
-  hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
-ENDIF()
+if(WITH_ROCM)
+  hip_test(
+    float16_gpu_test
+    SRCS float16_test.cu
+    DEPS lod_tensor)
+  hip_test(
+    test_limit_gpu_memory
+    SRCS test_limit_gpu_memory.cu
+    DEPS gpu_info flags)
+  hip_library(
+    cuda_device_guard
+    SRCS cuda_device_guard.cc
+    DEPS gpu_info)
+endif()
 
 if(NOT APPLE AND NOT WIN32)
-  cc_library(device_code SRCS device_code.cc DEPS device_context)
+  cc_library(
+    device_code
+    SRCS device_code.cc
+    DEPS device_context)
   if(WITH_GPU OR WITH_ROCM)
-    cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
+    cc_test(
+      device_code_test
+      SRCS device_code_test.cc
+      DEPS device_code lod_tensor)
   endif()
 endif()
diff --git a/paddle/fluid/platform/aligned_vector.h b/paddle/fluid/platform/aligned_vector.h
index 6d48917ba1f6d..b42ae15405e7f 100644
--- a/paddle/fluid/platform/aligned_vector.h
+++ b/paddle/fluid/platform/aligned_vector.h
@@ -43,11 +43,11 @@ HOSTDEVICE inline void Store(const AlignedVector<T, Size>& vec, T* addr) {
 }
 
 /*
-* Only the address of input data is the multiplier of 1,2,4, vectorized load
-* with corresponding multiplier-value is possible. Moreover, the maximum length
-* of vectorized load is 128 bits once. Hence, valid length of vectorized load
-* shall be determined under both former constraints.
-*/
+ * Only the address of input data is the multiplier of 1,2,4, vectorized load
+ * with corresponding multiplier-value is possible. Moreover, the maximum length
+ * of vectorized load is 128 bits once. Hence, valid length of vectorized load
+ * shall be determined under both former constraints.
+ */
 template <typename T>
 int GetVectorizedSize(const T* pointer) {
   constexpr int max_load_bits = 128;
@@ -58,11 +58,11 @@ int GetVectorizedSize(const T* pointer) {
   constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value;  // NOLINT
   if (address % vec8 == 0) {
     /*
-    * Currently, decide to deal with no more than 4 data once while adopting
-    * vectorization load/store, if performance test shows that dealing with
-    * 8 data once in vectorization load/store does get optimized, return code
-    * below can be changed into " return std::min(8, valid_vec_size); " .
-    */
+     * Currently, decide to deal with no more than 4 data once while adopting
+     * vectorization load/store, if performance test shows that dealing with
+     * 8 data once in vectorization load/store does get optimized, return code
+     * below can be changed into " return std::min(8, valid_vec_size); " .
+     */
     return std::min(4, valid_vec_size);
   } else if (address % vec4 == 0) {
     return std::min(4, valid_vec_size);
diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc
index 794c1ff684c8d..f824716ab9224 100644
--- a/paddle/fluid/platform/bfloat16_test.cc
+++ b/paddle/fluid/platform/bfloat16_test.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/bfloat16.h"
+
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu
index 391b91487fa8a..c5f38cf94eedb 100644
--- a/paddle/fluid/platform/bfloat16_test.cu
+++ b/paddle/fluid/platform/bfloat16_test.cu
@@ -17,7 +17,9 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <iostream>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 
 #if defined(PADDLE_CUDA_BF16)
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index d05de900e5e77..8f0e4204772f8 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/collective_helper.h"
+
 #include <utility>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
diff --git a/paddle/fluid/platform/complex_test.cc b/paddle/fluid/platform/complex_test.cc
index c7ded7587172e..3547631064d39 100644
--- a/paddle/fluid/platform/complex_test.cc
+++ b/paddle/fluid/platform/complex_test.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/complex.h"
+
 #include <complex>
+
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu
index 08ec75878b827..b814bcde6841f 100644
--- a/paddle/fluid/platform/complex_test.cu
+++ b/paddle/fluid/platform/complex_test.cu
@@ -18,6 +18,7 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <thrust/complex.h>
+
 #include <bitset>
 #include <iostream>
 
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index e486044486571..c32af3b37a409 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #endif  // _WIN32
 
 #include <algorithm>
+
 #include "paddle/fluid/platform/flags.h"
 
 DECLARE_double(fraction_of_cpu_memory_to_use);
diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc
index 4804d3f6ed301..4ef2a9709a59d 100644
--- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc
+++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc
@@ -13,26 +13,37 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device_context.h"
 
+DECLARE_bool(use_stream_safe_cuda_allocator);
+
 namespace paddle {
 namespace platform {
 
 #ifdef PADDLE_WITH_CUDA
 void BeginCUDAGraphCapture(platform::CUDAPlace place,
-                           cudaStreamCaptureMode mode) {
+                           cudaStreamCaptureMode mode, int64_t pool_id) {
   auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
   dev_ctx->cudnn_workspace_handle().ResetWorkspace();
 
   auto stream = dev_ctx->stream();
   CUDAGraph::BeginCapture(place, stream, mode);
-  auto id = CUDAGraph::CapturingID();
+
+  auto old_value = FLAGS_use_stream_safe_cuda_allocator;
+  if (old_value) {
+    FLAGS_use_stream_safe_cuda_allocator = false;
+  }
+  pool_id = CUDAGraph::SetMemoryPoolID(pool_id);
   memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph(
-      id);
-  AddResetCallbackIfCapturingCUDAGraph([id] {
+      pool_id);
+  if (old_value) {
+    FLAGS_use_stream_safe_cuda_allocator = true;
+  }
+  AddResetCallbackIfCapturingCUDAGraph([pool_id] {
     memory::allocation::AllocatorFacade::Instance().RemoveMemoryPoolOfCUDAGraph(
-        id);
+        pool_id);
   });
 }
 
diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h
index 7a9e1a3a1419c..b8831126be052 100644
--- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h
+++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h
@@ -23,10 +23,51 @@
 namespace paddle {
 namespace platform {
 
+#ifdef PADDLE_WITH_CUDA
+#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(                                   \
+    __cond, __kernel_func, __grid, __block, __sm_size, __stream, __seed_inc,  \
+    __seed_expr, __offset_expr, ...)                                          \
+  do { /* When capturing: record a seed-setting callback. Then launch. */     \
+    if (::paddle::platform::CUDAGraph::IsThisThreadCapturing() && (__cond)) { \
+      using __Helper =                                                        \
+          ::paddle::platform::IsSameKernelHelper<decltype(&__kernel_func),    \
+                                                 &__kernel_func>;             \
+      auto *dev_ctx =                                                         \
+          ::paddle::platform::DeviceContextPool::Instance().GetByPlace(       \
+              ::paddle::platform::CUDAGraph::CapturingPlace());               \
+      auto __set_seed_func =                                                  \
+          [=](::paddle::platform::CUDAKernelParams *__params,                 \
+              bool __check_only) -> bool {                                    \
+        if (__check_only) {  /* matching pass: same kernel and args? */       \
+          return __params->func() == &__kernel_func &&                        \
+                 __Helper::Compare(*__params, __VA_ARGS__);                   \
+        }                                                                     \
+        auto &KERNEL_PARAMS = *__params; /* presumably read by the exprs */   \
+        uint64_t __seed, __offset;                                            \
+        ::paddle::operators::GetSeedDataAndIncrement(                         \
+            *dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset);     \
+        __seed_expr = static_cast<decltype(__seed_expr)>(__seed);             \
+        __offset_expr = static_cast<decltype(__offset_expr)>(__offset);       \
+        return true;                                                          \
+      };                                                                      \
+      ::paddle::platform::CUDAGraph::RecordRandomKernelInfo(__set_seed_func); \
+    }                                                                         \
+    __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__);     \
+  } while (0)
+#else  // !PADDLE_WITH_CUDA: plain kernel launch, nothing is recorded
+#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(                                  \
+    __cond, __kernel_func, __grid, __block, __sm_size, __stream, __seed_inc, \
+    __seed_expr, __offset_expr, ...)                                         \
+  do {                                                                       \
+    __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__);    \
+  } while (0)
+#endif  // PADDLE_WITH_CUDA
+
 // NOTE: These APIs are not thread-safe.
 #ifdef PADDLE_WITH_CUDA
 void BeginCUDAGraphCapture(platform::CUDAPlace place,
-                           cudaStreamCaptureMode mode);
+                           cudaStreamCaptureMode mode,
+                           int64_t pool_id = CUDAGraph::kInvalidPoolID);
 std::unique_ptr<CUDAGraph> EndCUDAGraphCapture();
 #endif
 
diff --git a/paddle/fluid/platform/denormal.cc b/paddle/fluid/platform/denormal.cc
index 4af156d1577dd..4cfb082544322 100644
--- a/paddle/fluid/platform/denormal.cc
+++ b/paddle/fluid/platform/denormal.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/denormal.h"
+
 #include <tuple>
 #include <utility>
 
diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt
index cbf3fdd263b48..62745883023cb 100644
--- a/paddle/fluid/platform/device/CMakeLists.txt
+++ b/paddle/fluid/platform/device/CMakeLists.txt
@@ -1,27 +1,26 @@
-
 set(DEV_LIBS custom_device)
 
 # GPU
-IF(WITH_GPU OR WITH_ROCM)
+if(WITH_GPU OR WITH_ROCM)
   add_subdirectory(gpu)
-ENDIF()
+endif()
 
 # XPU
-IF(WITH_XPU)
+if(WITH_XPU)
   add_subdirectory(xpu)
-ENDIF()
+endif()
 
 # NPU
-IF(WITH_ASCEND OR WITH_ASCEND_CL)
+if(WITH_ASCEND OR WITH_ASCEND_CL)
   add_subdirectory(npu)
-ENDIF()
+endif()
 
 # IPU
-IF(WITH_IPU)
+if(WITH_IPU)
   add_subdirectory(ipu)
-ENDIF()
+endif()
 
 # MLU
-IF(WITH_MLU)
+if(WITH_MLU)
   add_subdirectory(mlu)
-ENDIF()
+endif()
diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt
index f7c13ec7ed5ed..66120f55f7cdc 100644
--- a/paddle/fluid/platform/device/gpu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt
@@ -1,15 +1,30 @@
-IF(WITH_GPU)
-    add_subdirectory(cuda)
-    nv_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda)
+if(WITH_GPU)
+  add_subdirectory(cuda)
+  nv_library(
+    gpu_info
+    SRCS gpu_info.cc
+    DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda)
 
-    nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
-    nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
-ELSEIF(WITH_ROCM)
-    add_subdirectory(rocm)
-    hip_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda)
+  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
+  nv_test(
+    cudnn_desc_test
+    SRCS cudnn_desc_test.cc
+    DEPS dynload_cuda)
+elseif(WITH_ROCM)
+  add_subdirectory(rocm)
+  hip_library(
+    gpu_info
+    SRCS gpu_info.cc
+    DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda)
 
-    hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
-    hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
-ENDIF()
+  hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
+  hip_test(
+    cudnn_desc_test
+    SRCS cudnn_desc_test.cc
+    DEPS dynload_cuda)
+endif()
 
-cc_library(gpu_resource_pool SRCS gpu_resource_pool.cc DEPS gpu_info)
+cc_library(
+  gpu_resource_pool
+  SRCS gpu_resource_pool.cc
+  DEPS gpu_info)
diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
index 85050038d5a83..da9121550e07a 100644
--- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
+++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
@@ -1,4 +1,13 @@
-nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
-nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
+nv_library(
+  cuda_graph
+  SRCS cuda_graph.cc
+  DEPS enforce allocator_facade)
+nv_library(
+  cuda_profiler
+  SRCS cuda_profiler.cc
+  DEPS enforce)
 
-nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda phi)
+nv_test(
+  cudnn_helper_test
+  SRCS cudnn_helper_test.cc
+  DEPS dynload_cuda phi)
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
index 8ee3b118c32f2..c5a515ce43611 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
@@ -14,12 +14,79 @@
 
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
 
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+
 namespace paddle {
 namespace platform {
 
 std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
 paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
 
+static std::vector<cudaGraphNode_t> ToposortCUDAGraph(cudaGraph_t graph) {
+  size_t num_nodes;  // Topologically sorts `graph`; first fetch its nodes.
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes));
+  std::vector<cudaGraphNode_t> nodes(num_nodes);
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      cudaGraphGetNodes(graph, nodes.data(), &num_nodes));
+  // Same two-call pattern for edges: query the count, then fill the arrays.
+  size_t num_edges;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges));
+  std::vector<cudaGraphNode_t> from(num_edges), to(num_edges);
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      cudaGraphGetEdges(graph, from.data(), to.data(), &num_edges));
+  // Edge i runs from[i] -> to[i]; build predecessor and successor sets.
+  std::unordered_map<cudaGraphNode_t, std::unordered_set<cudaGraphNode_t>>
+      in_edges, out_edges;
+  for (auto node : nodes) {
+    in_edges[node];  // operator[] creates an empty entry for every node,
+    out_edges[node];  // so nodes without edges are still present below.
+  }
+
+  for (size_t i = 0; i < num_edges; ++i) {
+    in_edges[to[i]].insert(from[i]);
+    out_edges[from[i]].insert(to[i]);
+  }
+  // Start from nodes without predecessors (hash-map iteration order).
+  std::queue<cudaGraphNode_t> q;
+  for (const auto &pair : in_edges) {
+    if (pair.second.empty()) {
+      q.push(pair.first);
+    }
+  }
+  // Reuse `nodes` as the output; num_nodes still holds the original count.
+  nodes.clear();
+  while (!q.empty()) {
+    auto cur = q.front();
+    q.pop();
+    nodes.push_back(cur);
+    // Release successors: enqueue any whose predecessors are all emitted.
+    for (auto out_node : out_edges.at(cur)) {
+      auto &in_nodes = in_edges.at(out_node);
+      in_nodes.erase(cur);
+      if (in_nodes.empty()) {
+        q.push(out_node);
+      }
+    }
+  }
+  PADDLE_ENFORCE_EQ(  // fails if some node was never emitted
+      nodes.size(), num_nodes,
+      phi::errors::InvalidArgument("Toposort error, this may be a bug."));
+  return nodes;
+}
+
+CUDAGraphID CUDAGraph::UniqueID() {
+  static std::atomic<CUDAGraphID> id;  // one counter shared by all calls
+  return id.fetch_add(1);  // fetch_add yields the value before the increment
+}
+
+int64_t CUDAGraph::UniqueMemoryPoolID() {
+  static std::atomic<int64_t> id(CUDAGraph::kDefaultPoolID + 1);
+  return id.fetch_add(1);  // first id handed out is kDefaultPoolID + 1
+}
+
 void CUDAGraph::Reset() {
   if (is_reset_) return;
 #if CUDA_VERSION >= 10010
@@ -46,9 +113,16 @@ void CUDAGraph::Replay() {
   PADDLE_ENFORCE_EQ(is_reset_, false,
                     errors::PermissionDenied(
                         "Cannot replay the CUDA Graph after reset is called."));
-  for (auto exec_graph : exec_graphs_) {
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphLaunch(exec_graph, stream_));
+  size_t n = exec_graphs_.size();
+  for (size_t i = 0; i < n; ++i) {
+    if (!is_first_run_) {
+      for (auto &hook : pre_hooks_[i]) {
+        hook(exec_graphs_[i]);
+      }
+    }
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphLaunch(exec_graphs_[i], stream_));
   }
+  is_first_run_ = false;
 #endif
 }
 
@@ -72,7 +146,8 @@ void CUDAGraph::BeginSegmentCapture() {
                     platform::errors::PermissionDenied(
                         "CUDA Graph should not be invalidated."));
   VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_
-           << ", segment id " << capturing_graph_->graphs_.size();
+           << ", segment id " << capturing_graph_->graphs_.size()
+           << ", memory pool id " << capturing_graph_->pool_id_;
 #endif
 }
 
@@ -112,15 +187,57 @@ void CUDAGraph::EndSegmentCapture() {
   if (num_nodes == 0) {
     PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph));
     VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_
-             << ", segment id " << capturing_graph_->graphs_.size();
+             << ", segment id " << capturing_graph_->graphs_.size()
+             << ", memory pool id " << capturing_graph_->pool_id_;
     return;
   }
 
+  auto sorted_nodes = ToposortCUDAGraph(graph);
+  capturing_graph_->pre_hooks_.emplace_back();
+  std::unordered_set<cudaGraphNode_t> visited;
+  VLOG(10) << "SetSeedFunc number : "
+           << capturing_graph_->set_seed_funcs_.size();
+  for (const auto &set_seed_func : capturing_graph_->set_seed_funcs_) {
+    bool found = false;
+    for (auto node : sorted_nodes) {
+      if (visited.count(node) > 0) continue;
+      cudaGraphNodeType type;
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphNodeGetType(node, &type));
+      if (type == cudaGraphNodeTypeKernel) {
+        cudaKernelNodeParams params;
+        auto err = cudaGraphKernelNodeGetParams(node, &params);
+        if (err == cudaErrorInvalidDeviceFunction) {
+          continue;
+        } else {
+          PADDLE_ENFORCE_GPU_SUCCESS(err);
+        }
+        CUDAKernelParams kernel_params(&params);
+        if (set_seed_func(&kernel_params, true)) {
+          capturing_graph_->pre_hooks_.back().push_back(
+              [set_seed_func, node, params](cudaGraphExec_t exec_graph) {
+                CUDAKernelParams kernel_params(&params);
+                set_seed_func(&kernel_params, false);
+                PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphExecKernelNodeSetParams(
+                    exec_graph, node, &params));
+              });
+          visited.insert(node);
+          found = true;
+          break;
+        }
+      }
+    }
+    PADDLE_ENFORCE_EQ(found, true,
+                      phi::errors::InvalidArgument(
+                          "Cannot find the corresponding random CUDA kernel."));
+  }
+  capturing_graph_->set_seed_funcs_.clear();
+
   cudaGraphExec_t exec_graph;
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0));
   VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_
-           << ", segment id " << capturing_graph_->graphs_.size();
+           << ", segment id " << capturing_graph_->graphs_.size()
+           << ", memory pool id " << capturing_graph_->pool_id_;
   capturing_graph_->graphs_.emplace_back(graph);
   capturing_graph_->exec_graphs_.emplace_back(exec_graph);
 #endif
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
index ca1e7abb375cb..b3704fc628adc 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
@@ -20,10 +20,10 @@
 #include <mutex>
 #include <thread>
 #include <vector>
+
 #include "cuda.h"          // NOLINT
 #include "cuda_runtime.h"  // NOLINT
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
-
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
@@ -32,6 +32,69 @@
 namespace paddle {
 namespace platform {
 
+template <typename T>  // True iff x and y have identical object bytes.
+static bool IsBitwiseEqual(const T &x, const T &y) {
+  return std::memcmp(&x, &y, sizeof(T)) == 0;  // includes any padding bytes
+}
+
+class CUDAKernelParams {  // Non-owning wrapper of cudaKernelNodeParams.
+ public:
+  explicit CUDAKernelParams(const cudaKernelNodeParams *params)
+      : params_(params) {}
+  // Kernel function pointer stored in the wrapped parameters.
+  const void *func() const { return params_->func; }
+  // Views kernel argument `idx` as a T; no type or bounds checking is done.
+  template <typename T>
+  T &As(size_t idx) const {
+    return *reinterpret_cast<T *>(params_->kernelParams[idx]);
+  }
+
+ private:
+  const cudaKernelNodeParams *params_;  // not owned; must outlive this object
+};
+
+template <typename F, F f>  // F: kernel function-pointer type, f: the kernel
+struct IsSameKernelHelper;
+// Checks recorded kernel arguments against the arguments of a kernel call.
+template <typename Return, typename... FuncArgs,
+          Return (*kernel_fn)(FuncArgs...)>
+struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
+ private:
+  using FuncArgsTuple = decltype(std::make_tuple(std::declval<FuncArgs>()...));
+  // Compares argument IDX bit by bit, then recurses on IDX + 1.
+  template <typename TupleT, size_t IDX, bool IsEnd /*=false*/>
+  struct Impl {
+    static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
+      using CompareT = typename std::tuple_element<IDX, FuncArgsTuple>::type;
+      if (!IsBitwiseEqual<CompareT>(params.As<CompareT>(IDX),
+                                    std::get<IDX>(args))) {
+        return false;
+      }
+
+      constexpr auto NewIsEnd = (IDX + 1 == std::tuple_size<TupleT>::value);
+      return Impl<TupleT, IDX + 1, NewIsEnd>::Compare(params, args);
+    }
+  };
+  // Recursion terminator: no arguments left to compare.
+  template <typename TupleT, size_t IDX>
+  struct Impl<TupleT, IDX, true> {
+    static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
+      return true;
+    }
+  };
+  // True iff each recorded argument is bitwise equal to the matching arg.
+ public:
+  template <typename... Args>
+  static bool Compare(const CUDAKernelParams &params, Args... args) {
+    constexpr auto kNumArgs = sizeof...(FuncArgs);
+    static_assert(kNumArgs == sizeof...(Args), "Argument number not match");
+
+    auto args_tuple = std::make_tuple(args...);
+    using TupleT = typename std::decay<decltype(args_tuple)>::type;
+    return Impl<TupleT, 0, kNumArgs == 0>::Compare(params, args_tuple);
+  }
+};
+
 #if CUDA_VERSION >= 10010
 static void ThrowErrorIfNotSupportCUDAGraph() {}
 #else
@@ -61,10 +124,35 @@ class CUDAGraph {
   }
 
  public:
+  static constexpr int64_t kDefaultPoolID = 0;
+  static constexpr int64_t kInvalidPoolID = -1;
+
   ~CUDAGraph() { Reset(); }
 
   CUDAGraphID ID() const { return id_; }
 
+  static int64_t SetMemoryPoolID(int64_t pool_id) {
+    auto &pool_id_ = capturing_graph_->pool_id_;  // requires non-null graph
+    PADDLE_ENFORCE_EQ(  // the pool id may be assigned only once per graph
+        pool_id_, kInvalidPoolID,
+        phi::errors::InvalidArgument("Cannot reset memory pool id twice, the "
+                                     "former memory pool id is %d.",
+                                     pool_id_));
+    if (pool_id <= kInvalidPoolID) {  // no valid id given: generate a new one
+      pool_id_ = UniqueMemoryPoolID();
+    } else {  // caller-supplied id; already >= kDefaultPoolID at this point
+      PADDLE_ENFORCE_GE(
+          pool_id, kDefaultPoolID,
+          phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id));
+      pool_id_ = pool_id;
+    }
+    return pool_id_;  // the id now stored on the capturing graph
+  }
+
+  int64_t PoolID() const { return pool_id_; }  // memory pool id of this graph
+  // Pool id of the graph being captured; capturing_graph_ must be non-null.
+  static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; }
+
   void Replay();
 
   void Reset();
@@ -120,12 +208,17 @@ class CUDAGraph {
     }
   }
 
- private:
-  static CUDAGraphID UniqueID() {
-    static std::atomic<CUDAGraphID> id;
-    return id.fetch_add(1);
+  using SetSeedFunc = std::function<bool(CUDAKernelParams *, bool)>;
+  static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) {
+    std::lock_guard<std::mutex> guard(capturing_graph_->func_mtx_);
+    capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func));
   }
 
+  static int64_t UniqueMemoryPoolID();
+
+ private:
+  static CUDAGraphID UniqueID();
+
  private:
 #if CUDA_VERSION >= 10010
   std::vector<cudaGraph_t> graphs_;
@@ -135,10 +228,17 @@ class CUDAGraph {
   cudaStream_t stream_{nullptr};
   platform::CUDAPlace place_;
   CUDAGraphID id_;
+  int64_t pool_id_{kInvalidPoolID};
   std::vector<std::function<void()>> callbacks_;
   bool is_reset_{false};
   std::mutex mtx_;
 
+  std::vector<SetSeedFunc> set_seed_funcs_;
+  std::vector<std::vector<std::function<void(cudaGraphExec_t)>>> pre_hooks_;
+  std::mutex func_mtx_;
+
+  bool is_first_run_{true};
+
   static paddle::optional<std::thread::id> capturing_thread_id_;
   static std::unique_ptr<CUDAGraph> capturing_graph_;
 };
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
index a32db3a9921e3..7185d2356aae5 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
@@ -68,7 +68,7 @@ namespace platform {
  *      }
  *    }
  *
-*/
+ */
 
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)            \
   int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc
index 851d0d18c604c..86c72769eb56e 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc
+++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc
@@ -15,13 +15,13 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES
 #define GOOGLE_GLOG_DLL_DECL
 
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+
 TEST(CudnnHelper, ScopedTensorDescriptor) {
-  using paddle::platform::ScopedTensorDescriptor;
   using paddle::platform::DataLayout;
+  using paddle::platform::ScopedTensorDescriptor;
 
   ScopedTensorDescriptor tensor_desc;
   std::vector<int> shape = {2, 4, 6, 6};
@@ -65,8 +65,8 @@ TEST(CudnnHelper, ScopedTensorDescriptor) {
 }
 
 TEST(CudnnHelper, ScopedFilterDescriptor) {
-  using paddle::platform::ScopedFilterDescriptor;
   using paddle::platform::DataLayout;
+  using paddle::platform::ScopedFilterDescriptor;
 
   ScopedFilterDescriptor filter_desc;
   std::vector<int> shape = {2, 3, 3};
@@ -129,8 +129,8 @@ TEST(CudnnHelper, ScopedConvolutionDescriptor) {
 }
 
 TEST(CudnnHelper, ScopedPoolingDescriptor) {
-  using paddle::platform::ScopedPoolingDescriptor;
   using paddle::platform::PoolingMode;
+  using paddle::platform::ScopedPoolingDescriptor;
 
   ScopedPoolingDescriptor pool_desc;
   std::vector<int> src_kernel = {2, 2, 5};
diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu
index ab8bb2cad8c51..28c0e0ef9acf8 100644
--- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu
+++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <algorithm>
 #include <iostream>
 #ifdef _WIN32
@@ -22,13 +23,12 @@
 
 #define PADDLE_CUDA_FP16
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
-
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 
 template <typename T>
 __global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
diff --git a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc
index 8ea30027e8ade..2e58e71cc2c06 100644
--- a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc
+++ b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 8c04e935134c7..6b302d2449da5 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <mutex>
 #include <set>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
@@ -100,8 +101,9 @@ static size_t GpuAllocSize(bool realloc) {
   size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                            : FLAGS_initial_gpu_memory_in_mb;
   size_t alloc_bytes =
-      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
-                                           FLAGS_fraction_of_gpu_memory_to_use);
+      (flag_mb > 0ul
+           ? flag_mb << 20
+           : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
   PADDLE_ENFORCE_GE(
       available_to_alloc, alloc_bytes,
       platform::errors::ResourceExhausted("Not enough available GPU memory."));
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h
index 94b47cca948e6..3a97797c98260 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.h
+++ b/paddle/fluid/platform/device/gpu/gpu_info.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 #include <stddef.h>
+
 #include <array>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
index 80d60ca95bf6f..5cacdfcb12f03 100644
--- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h
+++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
@@ -25,9 +25,11 @@
 #endif
 
 #include <stddef.h>
+
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/platform/device_context.h"
 
 #ifdef __HIPCC__
@@ -93,9 +95,9 @@ struct GpuLaunchConfig {
 };
 
 /* According to NVIDIA, if number of threads per block is 64/128/256/512,
-  * cuda performs better. And number of blocks should be greater (at least
-  * 2x~4x) than number of SMs. Hence, SM count is took into account within
-  * this function to determine the right number of threads per block. */
+ * cuda performs better. And number of blocks should be greater (at least
+ * 2x~4x) than number of SMs. Hence, SM count is taken into account within
+ * this function to determine the right number of threads per block. */
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
     const platform::CUDADeviceContext& context, int64_t numel,
     int vec_size = 1) {
@@ -143,14 +145,16 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(
 
 inline GpuLaunchConfig GetGpuLaunchConfig2D(
     const platform::CUDADeviceContext& context, int x_dim, int y_dim) {
-  PADDLE_ENFORCE_GT(x_dim, 0, platform::errors::InvalidArgument(
-                                  "x dim number should greater than 0,"
-                                  " but received value is: %d",
-                                  x_dim));
-  PADDLE_ENFORCE_GT(y_dim, 0, platform::errors::InvalidArgument(
-                                  "y dim number should greater than 0,"
-                                  " but received value is: %d",
-                                  y_dim));
+  PADDLE_ENFORCE_GT(
+      x_dim, 0,
+      platform::errors::InvalidArgument("x dim number should greater than 0,"
+                                        " but received value is: %d",
+                                        x_dim));
+  PADDLE_ENFORCE_GT(
+      y_dim, 0,
+      platform::errors::InvalidArgument("y dim number should greater than 0,"
+                                        " but received value is: %d",
+                                        y_dim));
 
   const int kThreadsPerBlock = 256;
   int block_cols = (std::min)(x_dim, kThreadsPerBlock);
diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h
index 803674779e756..a0e9d459721fd 100644
--- a/paddle/fluid/platform/device/gpu/gpu_primitives.h
+++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <hip/hip_runtime.h>
 #endif
 #include <stdio.h>
+
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc
index 2c55eb972b765..56fdb0da34057 100644
--- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc
@@ -14,6 +14,7 @@
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
+
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h
index d0b48eca5021b..2cadd55d2dc77 100644
--- a/paddle/fluid/platform/device/gpu/gpu_types.h
+++ b/paddle/fluid/platform/device/gpu/gpu_types.h
@@ -19,11 +19,13 @@
 
 #ifdef PADDLE_WITH_HIP
 #include <hip/hip_runtime.h>
+
 #include "paddle/fluid/platform/dynload/miopen.h"
 #include "paddle/fluid/platform/dynload/rocblas.h"
 
 #else
 #include <cuda_runtime.h>
+
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index 61ea0fd3cd293..b9e612b98def9 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -16,6 +16,7 @@
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <stdio.h>
+
 #include <memory>
 #include <string>
 #include <thread>  // NOLINT
@@ -31,6 +32,8 @@
 #ifdef PADDLE_WITH_RCCL
 #include "paddle/fluid/platform/dynload/rccl.h"
 #endif
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -52,6 +55,10 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
     return ncclFloat16;
   } else if (type == framework::proto::VarType::INT8) {
     return ncclInt8;
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+  } else if (type == framework::proto::VarType::BF16) {
+    return ncclBfloat16;
+#endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "This datatype in nccl is not supported."));
@@ -69,6 +76,10 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) {
     return ncclInt64;
   } else if (type == experimental::DataType::FLOAT16) {
     return ncclFloat16;
+#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+  } else if (type == experimental::DataType::BFLOAT16) {
+    return ncclBfloat16;
+#endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "This datatype in nccl is not supported."));
@@ -254,7 +265,7 @@ class NCCLCommunicator {
    *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
    *create a new nccl comm for sync_batch_norm_op. And these codes should be
    *polished with a unified nccl management.
-  */
+   */
   NCCLContextMap *GetSyncBatchNormCtx(
       framework::Scope *scope, const std::vector<platform::Place> &places) {
     auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt
index 988807258c123..070312adbc2e6 100644
--- a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt
+++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt
@@ -1 +1,4 @@
-hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
+hip_test(
+  miopen_helper_test
+  SRCS miopen_helper_test.cc
+  DEPS dynload_cuda)
diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc
index 13cf52dc2c6a3..e99fc7f37a8f8 100644
--- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc
+++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc
@@ -15,13 +15,13 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES
 #define GOOGLE_GLOG_DLL_DECL
 
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+
 TEST(MIOpenHelper, ScopedTensorDescriptor) {
-  using paddle::platform::ScopedTensorDescriptor;
   using paddle::platform::DataLayout;
+  using paddle::platform::ScopedTensorDescriptor;
 
   ScopedTensorDescriptor tensor_desc;
   std::vector<int> shape = {2, 4, 6, 6};
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
index a0f3fb0f73ba5..c0f6f173a798a 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
@@ -65,7 +65,7 @@ namespace platform {
  *      }
  *    }
  *
-*/
+ */
 
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                     \
   int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt
index 7712ede8fd210..29f2a2955e0c2 100644
--- a/paddle/fluid/platform/device/ipu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt
@@ -1,35 +1,42 @@
 if(WITH_IPU)
   set(paddle_ipu_handler ${CMAKE_CURRENT_BINARY_DIR}/paddle_ipu_handler.h.tmp)
   set(paddle_ipu_handler_final ${CMAKE_CURRENT_BINARY_DIR}/paddle_ipu_handler.h)
-  file(WRITE ${paddle_ipu_handler} "// Auto generated from CMake. DO NOT EDIT!\n\n")
+  file(WRITE ${paddle_ipu_handler}
+       "// Auto generated from CMake. DO NOT EDIT!\n\n")
   file(APPEND ${paddle_ipu_handler} "\#pragma once\n")
-  file(APPEND ${paddle_ipu_handler} "\#include \"paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h\"\n\n")
-  file(GLOB POPART_CANONICALIZATION_SRC ${CMAKE_CURRENT_SOURCE_DIR}/popart_canonicalization/*.cc)
+  file(
+    APPEND ${paddle_ipu_handler}
+    "\#include \"paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h\"\n\n"
+  )
+  file(GLOB POPART_CANONICALIZATION_SRC
+       ${CMAKE_CURRENT_SOURCE_DIR}/popart_canonicalization/*.cc)
   copy_if_different(${paddle_ipu_handler} ${paddle_ipu_handler_final})
 
   foreach(file_path ${POPART_CANONICALIZATION_SRC})
     file(READ ${file_path} file_content)
-    string(REGEX MATCHALL "(REGISTER_HANDLER)(\\()([A-Za-z0-9_]+)(,)" op_handlers ${file_content})
+    string(REGEX MATCHALL "(REGISTER_HANDLER)(\\()([A-Za-z0-9_]+)(,)"
+                 op_handlers ${file_content})
     string(REPLACE "REGISTER_HANDLER(" "" op_handlers "${op_handlers}")
     string(REPLACE "," "" op_handlers "${op_handlers}")
     foreach(op_handler ${op_handlers})
       file(APPEND ${paddle_ipu_handler} "USE_HANDLER(${op_handler});\n")
     endforeach()
   endforeach()
-  
-  set(IPU_BACKEND_SRC
-    "ipu_strategy.cc"
-    "ipu_executor.cc"
-    "ipu_compiler.cc"
-    "ipu_backend.cc"
-    "ipu_utils.cc"
-  )
-  set(IPU_INFO_SRC
-    "ipu_info.cc"
-    "ipu_device.cc"
-  )
 
-  cc_library(popart_canonicalization SRCS ${POPART_CANONICALIZATION_SRC} DEPS graph)
-  cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper popdist popart_canonicalization)
-  cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce)
+  set(IPU_BACKEND_SRC "ipu_strategy.cc" "ipu_executor.cc" "ipu_compiler.cc"
+                      "ipu_backend.cc" "ipu_utils.cc")
+  set(IPU_INFO_SRC "ipu_info.cc" "ipu_device.cc")
+
+  cc_library(
+    popart_canonicalization
+    SRCS ${POPART_CANONICALIZATION_SRC}
+    DEPS graph)
+  cc_library(
+    ipu_backend
+    SRCS ${IPU_BACKEND_SRC}
+    DEPS popart-only graph graph_helper popdist popart_canonicalization)
+  cc_library(
+    ipu_info
+    SRCS ${IPU_INFO_SRC}
+    DEPS popart-only enforce)
 endif()
diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc
index 2d0381cb8b3ea..f6de526c90090 100644
--- a/paddle/fluid/platform/device/ipu/ipu_device.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_device.cc
@@ -45,9 +45,10 @@ int GetNumDevices() {
   }
   int num_devices =
       popart::DeviceManager::createDeviceManager().enumerateDevices().size();
-  PADDLE_ENFORCE_GT(num_devices, 0, platform::errors::Unavailable(
-                                        "Do not found any IPU devices, please "
-                                        "make sure Poplar sdk is enabled"));
+  PADDLE_ENFORCE_GT(
+      num_devices, 0,
+      platform::errors::Unavailable("Do not found any IPU devices, please "
+                                    "make sure Poplar sdk is enabled"));
   return num_devices;
 }
 
diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc
index d490334ee33f5..30c9bc2094a8a 100644
--- a/paddle/fluid/platform/device/ipu/ipu_executor.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc
@@ -243,7 +243,8 @@ void Executor::AcquireDevice() {
     VLOG(10) << "Create IPU model device...";
     std::map<std::string, std::string> deviceOpts{
         {
-            "numIPUs", std::to_string(ipu_strategy_->num_ipus),
+            "numIPUs",
+            std::to_string(ipu_strategy_->num_ipus),
         },
         {"ipuVersion", "ipu2"},
     };
@@ -254,7 +255,8 @@ void Executor::AcquireDevice() {
     VLOG(10) << "Create offline device...";
     std::map<std::string, std::string> deviceOpts{
         {
-            "numIPUs", std::to_string(ipu_strategy_->num_ipus),
+            "numIPUs",
+            std::to_string(ipu_strategy_->num_ipus),
         },
         {"ipuVersion", "ipu2"},
     };
diff --git a/paddle/fluid/platform/device/ipu/ipu_info.h b/paddle/fluid/platform/device/ipu/ipu_info.h
index fe7076e0b50b6..06ef070ed65ea 100644
--- a/paddle/fluid/platform/device/ipu/ipu_info.h
+++ b/paddle/fluid/platform/device/ipu/ipu_info.h
@@ -13,6 +13,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_IPU
 #include <memory>
 #include <vector>
+
 #include "glog/logging.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h
index da08c76fb90d1..0e17a485afb01 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.h
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <popart/patterns/patterns.hpp>
 #include <popart/sessionoptions.hpp>
 #include <popart/tensorlocation.hpp>
+
 #include "paddle/fluid/platform/device/ipu/ipu_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -143,10 +144,11 @@ class IpuStrategy {
       std::map<std::string, std::function<void(ValueType)>> &options,  // NOLINT
       const std::string &type_str) {
     auto it = options.find(key);
-    PADDLE_ENFORCE_NE(it, options.end(), platform::errors::InvalidArgument(
-                                             "Cannot find option: %s, type: %s "
-                                             "when setting IpuStrategy options",
-                                             key, type_str));
+    PADDLE_ENFORCE_NE(
+        it, options.end(),
+        platform::errors::InvalidArgument("Cannot find option: %s, type: %s "
+                                          "when setting IpuStrategy options",
+                                          key, type_str));
     it->second(value);
   }
 
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc
index 254e566567424..1d5fe8c329f11 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc
@@ -57,14 +57,14 @@ Node *gelu_handler(Graph *graph, Node *node) {
                              {{"value", std::vector<float>{1.4142135623730951}},
                               {"dims", std::vector<int64_t>{1}},
                               {"dtype", GetOutputVarDType(node)}});
-    auto zero_point_five =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0.5}},
-                                          {"dims", std::vector<int64_t>{1}},
-                                          {"dtype", GetOutputVarDType(node)}});
-    auto one =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{1}},
-                                          {"dims", std::vector<int64_t>{1}},
-                                          {"dtype", GetOutputVarDType(node)}});
+    auto zero_point_five = CreateConst(graph, node, {}, {},
+                                       {{"value", std::vector<float>{0.5}},
+                                        {"dims", std::vector<int64_t>{1}},
+                                        {"dtype", GetOutputVarDType(node)}});
+    auto one = CreateConst(graph, node, {}, {},
+                           {{"value", std::vector<float>{1}},
+                            {"dims", std::vector<int64_t>{1}},
+                            {"dtype", GetOutputVarDType(node)}});
     auto div =
         CreateBaseOp(graph, node, "popart_div",
                      {GetInputVarNode("X", node), sqrt2->outputs[0]}, {}, {});
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
index af72f84c9d771..9b91abc4a67af 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
@@ -44,9 +44,10 @@ Node *pow_handler(Graph *graph, Node *node) {
         MakeConstAttrMapFromValue<float>(value_, {1}, GetOutputVarDType(node));
 
     auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
-    return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node),
-                                                    new_node_const->outputs[0]},
-                        node->outputs);
+    return CreateBaseOp(
+        graph, node, "popart_pow",
+        {GetInputVarNode("X", node), new_node_const->outputs[0]},
+        node->outputs);
   }
 }
 
@@ -380,10 +381,10 @@ Node *cumsum_handler(Graph *graph, Node *node) {
   auto reverse = BOOST_GET_CONST(bool, op->GetAttr("reverse"));
   int64_t popart_reverse = 1 ? reverse : 0;
   auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
-  auto axis_node =
-      CreateConst(graph, node, {}, {}, {{"value", std::vector<int64_t>{axis}},
-                                        {"dims", std::vector<int64_t>{1}},
-                                        {"dtype", ONNXDataType::INT64}});
+  auto axis_node = CreateConst(graph, node, {}, {},
+                               {{"value", std::vector<int64_t>{axis}},
+                                {"dims", std::vector<int64_t>{1}},
+                                {"dtype", ONNXDataType::INT64}});
   return CreateBaseOp(
       graph, node, "popart_cumsum",
       {GetInputVarNode("X", node), axis_node->outputs[0]},
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
index 2e9913f58efbb..bce6bac88e204 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
@@ -35,20 +35,20 @@ Node *conv2d_handler(Graph *graph, Node *node) {
   auto stride_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
   auto stride = std::vector<int64_t>{stride_.begin(), stride_.end()};
   if (!op->Input("Bias").empty()) {
-    return CreateConv(
-        graph, node,
-        {
-            GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
-            GetInputVarNode("Bias", node),
-        },
-        node->outputs, dilations, group_, {}, pads, stride);
+    return CreateConv(graph, node,
+                      {
+                          GetInputVarNode("Input", node),
+                          GetInputVarNode("Filter", node),
+                          GetInputVarNode("Bias", node),
+                      },
+                      node->outputs, dilations, group_, {}, pads, stride);
   } else {
-    return CreateConv(
-        graph, node,
-        {
-            GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
-        },
-        node->outputs, dilations, group_, {}, pads, stride);
+    return CreateConv(graph, node,
+                      {
+                          GetInputVarNode("Input", node),
+                          GetInputVarNode("Filter", node),
+                      },
+                      node->outputs, dilations, group_, {}, pads, stride);
   }
 }
 
@@ -148,15 +148,16 @@ Node *pool2d_handler(Graph *graph, Node *node) {
     auto dilations = std::vector<int64_t>{};
     int64_t storage_order = 0;
     return CreateBaseOp(graph, node, "popart_maxpool", node->inputs,
-                        node->outputs, {
-                                           {"num_outputs", num_outputs},
-                                           {"kernel_shape", kernel_shape},
-                                           {"ceil_mode", ceil_mode},
-                                           {"dilations", dilations},
-                                           {"pads", pads},
-                                           {"storage_order", storage_order},
-                                           {"strides", strides},
-                                       });
+                        node->outputs,
+                        {
+                            {"num_outputs", num_outputs},
+                            {"kernel_shape", kernel_shape},
+                            {"ceil_mode", ceil_mode},
+                            {"dilations", dilations},
+                            {"pads", pads},
+                            {"storage_order", storage_order},
+                            {"strides", strides},
+                        });
   } else if (pooling_type == "avg") {
     int64_t count_include_pad = 0;
     return CreateBaseOp(graph, node, "popart_averagepool", node->inputs,
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
index 0525bb66f1618..b51d923bfcf5c 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
@@ -173,8 +173,9 @@ Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
 Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
                            const std::vector<Node *> &inputs,
                            const std::vector<Node *> &outputs, int64_t axis) {
-  PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument(
-                                          "Softmax op only support one input"));
+  PADDLE_ENFORCE_EQ(
+      inputs.size(), 1,
+      platform::errors::InvalidArgument("Softmax op only support one input"));
   auto x_shape = inputs[0]->Var()->GetShape();
   int x_rank = x_shape.size();
   if (axis < 0) {
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc
index aec89a1cf0d82..77ce2f3166914 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc
@@ -69,10 +69,10 @@ Node *topk_handler(Graph *graph, Node *node) {
     var_k = GetInputVarNode("K", node);
   } else {
     auto k = BOOST_GET_CONST(int, op->GetAttr("k"));
-    auto *op_k =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<int64_t>{k}},
-                                          {"dims", std::vector<int64_t>{1}},
-                                          {"dtype", ONNXDataType::INT64}});
+    auto *op_k = CreateConst(graph, node, {}, {},
+                             {{"value", std::vector<int64_t>{k}},
+                              {"dims", std::vector<int64_t>{1}},
+                              {"dtype", ONNXDataType::INT64}});
     var_k = op_k->outputs[0];
   }
 
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
index 00926ee7a0b25..bf32744d5a542 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
@@ -61,7 +61,9 @@ Node *fill_constant_handler(Graph *graph, Node *node) {
   }
   return CreateConst(graph, node, node->inputs, node->outputs,
                      AttributeMap{
-                         {"value", value}, {"dims", dims}, {"dtype", dtype},
+                         {"value", value},
+                         {"dims", dims},
+                         {"dtype", dtype},
                      });
 }
 
@@ -76,13 +78,14 @@ Node *gaussian_random_handler(Graph *graph, Node *node) {
   auto seed_ = BOOST_GET_CONST(int, op->GetAttr("seed"));
   auto seed = static_cast<float>(seed_);
   return CreateBaseOp(graph, node, "popart_randomnormal", node->inputs,
-                      node->outputs, {
-                                         {"shape", shape},
-                                         {"dtype", dtype},
-                                         {"mean", mean},
-                                         {"scale", scale},
-                                         {"seed", seed},
-                                     });
+                      node->outputs,
+                      {
+                          {"shape", shape},
+                          {"dtype", dtype},
+                          {"mean", mean},
+                          {"scale", scale},
+                          {"seed", seed},
+                      });
 }
 
 Node *uniform_random_handler(Graph *graph, Node *node) {
@@ -96,13 +99,14 @@ Node *uniform_random_handler(Graph *graph, Node *node) {
   auto seed_ = BOOST_GET_CONST(int, op->GetAttr("seed"));
   auto seed = static_cast<float>(seed_);
   return CreateBaseOp(graph, node, "popart_randomuniform", node->inputs,
-                      node->outputs, {
-                                         {"shape", shape},
-                                         {"dtype", dtype},
-                                         {"high", high},
-                                         {"low", low},
-                                         {"seed", seed},
-                                     });
+                      node->outputs,
+                      {
+                          {"shape", shape},
+                          {"dtype", dtype},
+                          {"high", high},
+                          {"low", low},
+                          {"seed", seed},
+                      });
 }
 
 Node *transpose_handler(Graph *graph, Node *node) {
@@ -204,32 +208,33 @@ Node *lookup_table_op_handler(Graph *graph, Node *node,
   if (padding_idx_ >= 0 && padding_idx_ < table_size_) {
     std::vector<float> const_value_(emb_size_, 0);
     std::vector<int64_t> const_shape_{1, emb_size_};
-    auto concat_const =
-        CreateConst(graph, node, {}, {}, {{"value", const_value_},
-                                          {"dims", const_shape_},
-                                          {"dtype", GetOutputVarDType(node)}});
-    auto axes =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<int64_t>{0}},
-                                          {"dims", std::vector<int64_t>{1}},
-                                          {"dtype", ONNXDataType::INT64}});
-    auto step =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<int64_t>{1}},
-                                          {"dims", std::vector<int64_t>{1}},
-                                          {"dtype", ONNXDataType::INT64}});
-
-    auto left_start =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<int64_t>{0}},
-                                          {"dims", std::vector<int64_t>{1}},
-                                          {"dtype", ONNXDataType::INT64}});
+    auto concat_const = CreateConst(graph, node, {}, {},
+                                    {{"value", const_value_},
+                                     {"dims", const_shape_},
+                                     {"dtype", GetOutputVarDType(node)}});
+    auto axes = CreateConst(graph, node, {}, {},
+                            {{"value", std::vector<int64_t>{0}},
+                             {"dims", std::vector<int64_t>{1}},
+                             {"dtype", ONNXDataType::INT64}});
+    auto step = CreateConst(graph, node, {}, {},
+                            {{"value", std::vector<int64_t>{1}},
+                             {"dims", std::vector<int64_t>{1}},
+                             {"dtype", ONNXDataType::INT64}});
+
+    auto left_start = CreateConst(graph, node, {}, {},
+                                  {{"value", std::vector<int64_t>{0}},
+                                   {"dims", std::vector<int64_t>{1}},
+                                   {"dtype", ONNXDataType::INT64}});
     auto left_end = CreateConst(graph, node, {}, {},
                                 {{"value", std::vector<int64_t>{padding_idx_}},
                                  {"dims", std::vector<int64_t>{1}},
                                  {"dtype", ONNXDataType::INT64}});
 
-    auto right_start = CreateConst(
-        graph, node, {}, {}, {{"value", std::vector<int64_t>{padding_idx_ + 1}},
-                              {"dims", std::vector<int64_t>{1}},
-                              {"dtype", ONNXDataType::INT64}});
+    auto right_start =
+        CreateConst(graph, node, {}, {},
+                    {{"value", std::vector<int64_t>{padding_idx_ + 1}},
+                     {"dims", std::vector<int64_t>{1}},
+                     {"dtype", ONNXDataType::INT64}});
     auto right_end = CreateConst(graph, node, {}, {},
                                  {{"value", std::vector<int64_t>{table_size_}},
                                   {"dims", std::vector<int64_t>{1}},
@@ -471,7 +476,9 @@ Node *assign_value_handler(Graph *graph, Node *node) {
   }
   return CreateConst(graph, node, node->inputs, node->outputs,
                      AttributeMap{
-                         {"value", values}, {"dims", dims}, {"dtype", dtype},
+                         {"value", values},
+                         {"dims", dims},
+                         {"dtype", dtype},
                      });
 }
 
@@ -529,10 +536,10 @@ Node *one_hot_handler(Graph *graph, Node *node) {
                                     {{"value", std::vector<int64_t>{depth}},
                                      {"dims", std::vector<int64_t>{1}},
                                      {"dtype", ONNXDataType::INT64}});
-    auto value_tensor =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0, 1}},
-                                          {"dims", std::vector<int64_t>{2}},
-                                          {"dtype", ONNXDataType::FLOAT}});
+    auto value_tensor = CreateConst(graph, node, {}, {},
+                                    {{"value", std::vector<float>{0, 1}},
+                                     {"dims", std::vector<int64_t>{2}},
+                                     {"dtype", ONNXDataType::FLOAT}});
     return CreateBaseOp(graph, node, "popart_onehot",
                         {GetInputVarNode("X", node), depth_tensor->outputs[0],
                          value_tensor->outputs[0]},
@@ -550,21 +557,21 @@ Node *one_hot_v2_handler(Graph *graph, Node *node) {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Do not support allow_out_of_range=True"));
   } else {
-    auto depth_tensor =
-        CreateConst(graph, node, {}, {}, {{"value", std::vector<int>{depth}},
-                                          {"dims", std::vector<int64_t>{1}},
-                                          {"dtype", ONNXDataType::INT32}});
+    auto depth_tensor = CreateConst(graph, node, {}, {},
+                                    {{"value", std::vector<int>{depth}},
+                                     {"dims", std::vector<int64_t>{1}},
+                                     {"dtype", ONNXDataType::INT32}});
     Node *value_tensor = nullptr;
     if (GetOutputVarNode("Out", node)->Var()->GetDataType() == VarType::FP16) {
-      value_tensor =
-          CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0, 1}},
-                                            {"dims", std::vector<int64_t>{2}},
-                                            {"dtype", ONNXDataType::FLOAT16}});
+      value_tensor = CreateConst(graph, node, {}, {},
+                                 {{"value", std::vector<float>{0, 1}},
+                                  {"dims", std::vector<int64_t>{2}},
+                                  {"dtype", ONNXDataType::FLOAT16}});
     } else {
-      value_tensor =
-          CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0, 1}},
-                                            {"dims", std::vector<int64_t>{2}},
-                                            {"dtype", ONNXDataType::FLOAT}});
+      value_tensor = CreateConst(graph, node, {}, {},
+                                 {{"value", std::vector<float>{0, 1}},
+                                  {"dims", std::vector<int64_t>{2}},
+                                  {"dtype", ONNXDataType::FLOAT}});
     }
 
     return CreateBaseOp(graph, node, "popart_onehot",
diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt
index 1f3a7670849c2..08b33c9b58f06 100644
--- a/paddle/fluid/platform/device/mlu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt
@@ -1,12 +1,32 @@
-
 if(NOT WITH_MLU)
-    return()
+  return()
 endif()
 
-cc_test(mlu_enforce_test SRCS enforce_test.cc DEPS stringpiece)
-cc_library(mlu_info SRCS mlu_info.cc DEPS enforce glog monitor neuware_lib)
-cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS})
-cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream)
-cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context)
-cc_library(mlu_collective_helper SRCS mlu_collective_helper.cc DEPS mlu_stream mlu_info)
-cc_library(mlu_resource_pool SRCS mlu_resource_pool.cc DEPS mlu_info)
+cc_test(
+  mlu_enforce_test
+  SRCS enforce_test.cc
+  DEPS stringpiece)
+cc_library(
+  mlu_info
+  SRCS mlu_info.cc
+  DEPS enforce glog monitor neuware_lib)
+cc_library(
+  mlu_stream
+  SRCS mlu_stream.cc
+  DEPS boost mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS})
+cc_library(
+  mlu_device_context
+  SRCS device_context.cc
+  DEPS mlu_stream)
+cc_test(
+  mlu_device_context_test
+  SRCS device_context_test.cc
+  DEPS mlu_device_context)
+cc_library(
+  mlu_collective_helper
+  SRCS mlu_collective_helper.cc
+  DEPS mlu_stream mlu_info)
+cc_library(
+  mlu_resource_pool
+  SRCS mlu_resource_pool.cc
+  DEPS mlu_info)
diff --git a/paddle/fluid/platform/device/mlu/cncl_helper.h b/paddle/fluid/platform/device/mlu/cncl_helper.h
index 2f9bed0142641..634e420d5ce53 100644
--- a/paddle/fluid/platform/device/mlu/cncl_helper.h
+++ b/paddle/fluid/platform/device/mlu/cncl_helper.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CNCL
 #include <cncl.h>
-
 #include <stdio.h>
+
 #include <memory>
 #include <string>
 #include <thread>  // NOLINT
diff --git a/paddle/fluid/platform/device/mlu/device_context.h b/paddle/fluid/platform/device/mlu/device_context.h
index 120916b4f5c56..d607b1e12f5a7 100644
--- a/paddle/fluid/platform/device/mlu/device_context.h
+++ b/paddle/fluid/platform/device/mlu/device_context.h
@@ -12,6 +12,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_MLU
 #include <mutex>
+
 #include "paddle/fluid/platform/device/mlu/enforce.h"
 #include "paddle/fluid/platform/device/mlu/mlu_stream.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/platform/device/mlu/device_context_test.cc b/paddle/fluid/platform/device/mlu/device_context_test.cc
index 5caaa9dec1e4b..41f79c7092ea4 100644
--- a/paddle/fluid/platform/device/mlu/device_context_test.cc
+++ b/paddle/fluid/platform/device/mlu/device_context_test.cc
@@ -20,9 +20,9 @@ limitations under the License. */
 
 TEST(Device, Init) {
   using paddle::platform::DeviceContext;
+  using paddle::platform::MLUContext;
   using paddle::platform::MLUDeviceContext;
   using paddle::platform::MLUPlace;
-  using paddle::platform::MLUContext;
 
   int count = paddle::platform::GetMLUDeviceCount();
   for (int i = 0; i < count; i++) {
@@ -34,9 +34,9 @@ TEST(Device, Init) {
 }
 
 TEST(Device, MLUDeviceContext) {
+  using paddle::mluCnnlHandle;
   using paddle::platform::MLUDeviceContext;
   using paddle::platform::MLUPlace;
-  using paddle::mluCnnlHandle;
 
   int count = paddle::platform::GetMLUDeviceCount();
   for (int i = 0; i < count; i++) {
@@ -48,9 +48,9 @@ TEST(Device, MLUDeviceContext) {
 }
 
 TEST(Device, MLUStream) {
+  using paddle::mluStream;
   using paddle::platform::MLUDeviceContext;
   using paddle::platform::MLUPlace;
-  using paddle::mluStream;
 
   int count = paddle::platform::GetMLUDeviceCount();
   for (int i = 0; i < count; i++) {
@@ -62,11 +62,11 @@ TEST(Device, MLUStream) {
 }
 
 TEST(Device, DeviceContextPool) {
+  using paddle::platform::CPUPlace;
   using paddle::platform::DeviceContextPool;
   using paddle::platform::MLUDeviceContext;
-  using paddle::platform::Place;
-  using paddle::platform::CPUPlace;
   using paddle::platform::MLUPlace;
+  using paddle::platform::Place;
 
   DeviceContextPool& pool = DeviceContextPool::Instance();
   auto cpu_dev_ctx1 = pool.Get(CPUPlace());
diff --git a/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc
index 7708267c1bc72..4051caac1c800 100644
--- a/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc
+++ b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #if defined(PADDLE_WITH_CNCL)
 #include <utility>
+
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/mlu/enforce.h"
 
diff --git a/paddle/fluid/platform/device/mlu/mlu_info.cc b/paddle/fluid/platform/device/mlu/mlu_info.cc
index 7cad99bf5d22d..e3672707210fb 100644
--- a/paddle/fluid/platform/device/mlu/mlu_info.cc
+++ b/paddle/fluid/platform/device/mlu/mlu_info.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device/mlu/mlu_info.h"
+
 #include <mutex>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/mlu/enforce.h"
@@ -187,8 +189,9 @@ static size_t MLUAllocSize(bool realloc) {
   size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                            : FLAGS_initial_gpu_memory_in_mb;
   size_t alloc_bytes =
-      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
-                                           FLAGS_fraction_of_gpu_memory_to_use);
+      (flag_mb > 0ul
+           ? flag_mb << 20
+           : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
   PADDLE_ENFORCE_GE(
       available_to_alloc, alloc_bytes,
       platform::errors::ResourceExhausted("Not enough available MLU memory."));
diff --git a/paddle/fluid/platform/device/mlu/mlu_stream.cc b/paddle/fluid/platform/device/mlu/mlu_stream.cc
index 7a27a49250a1e..f570cc77e5a97 100644
--- a/paddle/fluid/platform/device/mlu/mlu_stream.cc
+++ b/paddle/fluid/platform/device/mlu/mlu_stream.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device/mlu/mlu_stream.h"
+
 #include "paddle/fluid/platform/device/mlu/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/npu/CMakeLists.txt b/paddle/fluid/platform/device/npu/CMakeLists.txt
index 52db36d131ec2..9015a76e9cd5a 100644
--- a/paddle/fluid/platform/device/npu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/npu/CMakeLists.txt
@@ -3,13 +3,31 @@
 add_subdirectory(dynload)
 
 if(WITH_ASCEND)
-  cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
+  cc_library(
+    ascend_npu_info
+    SRCS ascend_npu_info.cc
+    DEPS gflags glog enforce atlas_acl)
 endif()
 
 if(WITH_ASCEND_CL)
-  cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor ascendcl acl_op_compiler)
-  cc_library(npu_resource_pool SRCS npu_resource_pool.cc DEPS npu_info)
-  cc_library(npu_stream SRCS npu_stream.cc DEPS enforce boost stream_callback_manager)
-  cc_library(npu_collective_helper SRCS npu_collective_helper.cc DEPS npu_stream npu_info data_type)
-  cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info)
+  cc_library(
+    npu_info
+    SRCS npu_info.cc
+    DEPS gflags glog enforce monitor ascendcl acl_op_compiler)
+  cc_library(
+    npu_resource_pool
+    SRCS npu_resource_pool.cc
+    DEPS npu_info)
+  cc_library(
+    npu_stream
+    SRCS npu_stream.cc
+    DEPS enforce boost stream_callback_manager)
+  cc_library(
+    npu_collective_helper
+    SRCS npu_collective_helper.cc
+    DEPS npu_stream npu_info data_type)
+  cc_library(
+    npu_op_runner
+    SRCS npu_op_runner.cc
+    DEPS operator npu_info)
 endif()
diff --git a/paddle/fluid/platform/device/npu/ascend_npu_info.cc b/paddle/fluid/platform/device/npu/ascend_npu_info.cc
index c100b2d0a1740..a9204ac3fca50 100644
--- a/paddle/fluid/platform/device/npu/ascend_npu_info.cc
+++ b/paddle/fluid/platform/device/npu/ascend_npu_info.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/device/npu/ascend_npu_info.h"
+
 #include <glog/logging.h>
+
 #include "acl/acl_rt.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt b/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt
index 7232d51a602b3..9f36942524bf3 100644
--- a/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/device/npu/dynload/CMakeLists.txt
@@ -1,3 +1,6 @@
 if(WITH_ASCEND_CL)
-  cc_library(npu_hccl SRCS hccl.cc DEPS dynamic_loader warpctc)
+  cc_library(
+    npu_hccl
+    SRCS hccl.cc
+    DEPS dynamic_loader warpctc)
 endif()
diff --git a/paddle/fluid/platform/device/npu/dynload/hccl.h b/paddle/fluid/platform/device/npu/dynload/hccl.h
index 3d7587bfa266b..ae140dd295067 100644
--- a/paddle/fluid/platform/device/npu/dynload/hccl.h
+++ b/paddle/fluid/platform/device/npu/dynload/hccl.h
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include <hccl/hccl.h>
 #include <hccl/hccl_types.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
diff --git a/paddle/fluid/platform/device/npu/enforce_npu.h b/paddle/fluid/platform/device/npu/enforce_npu.h
index 3887ee4866af8..243926868631d 100644
--- a/paddle/fluid/platform/device/npu/enforce_npu.h
+++ b/paddle/fluid/platform/device/npu/enforce_npu.h
@@ -17,10 +17,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_ASCEND_CL
 #include <string>
 
-#include "paddle/fluid/platform/enforce.h"
-
 #include "acl/acl.h"
 #include "hccl/hccl_types.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h
index 134ec04030d75..107fe5989ddba 100644
--- a/paddle/fluid/platform/device/npu/hccl_helper.h
+++ b/paddle/fluid/platform/device/npu/hccl_helper.h
@@ -17,6 +17,7 @@
 #ifdef PADDLE_WITH_ASCEND_CL
 
 #include <stdio.h>
+
 #include <memory>
 #include <string>
 #include <thread>  // NOLINT
@@ -24,11 +25,10 @@
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
-#include "paddle/fluid/platform/device/npu/enforce_npu.h"
-
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/dynload/hccl.h"
+#include "paddle/fluid/platform/device/npu/enforce_npu.h"
 #include "paddle/fluid/platform/float16.h"
 
 #define HCCL_ID_VARNAME "HCCLID"
diff --git a/paddle/fluid/platform/device/npu/npu_collective_helper.cc b/paddle/fluid/platform/device/npu/npu_collective_helper.cc
index cdec3519a23f3..77528fe19fcb4 100644
--- a/paddle/fluid/platform/device/npu/npu_collective_helper.cc
+++ b/paddle/fluid/platform/device/npu/npu_collective_helper.cc
@@ -14,6 +14,7 @@
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include <utility>
+
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
 
diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc
index b5516944b750e..2688c88f55773 100644
--- a/paddle/fluid/platform/device/npu/npu_info.cc
+++ b/paddle/fluid/platform/device/npu/npu_info.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device/npu/npu_info.h"
+
 #include <algorithm>
 #include <cstdlib>
 #include <memory>
 
 #include "gflags/gflags.h"
-
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/monitor.h"
@@ -153,8 +153,9 @@ static size_t NPUAllocSize(bool realloc) {
   size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                            : FLAGS_initial_gpu_memory_in_mb;
   size_t alloc_bytes =
-      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
-                                           FLAGS_fraction_of_gpu_memory_to_use);
+      (flag_mb > 0ul
+           ? flag_mb << 20
+           : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
   PADDLE_ENFORCE_GE(
       available_to_alloc, alloc_bytes,
       platform::errors::ResourceExhausted("Not enough available NPU memory."));
diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc
index 72169ae303b4c..d38443acca3a3 100644
--- a/paddle/fluid/platform/device/npu/npu_op_runner.cc
+++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 
 #include "acl/acl.h"
 #include "acl/acl_op_compiler.h"
-
 #include "paddle/fluid/framework/framework.pb.h"
 
 DECLARE_string(npu_precision_mode);
diff --git a/paddle/fluid/platform/device/npu/npu_resource_pool.cc b/paddle/fluid/platform/device/npu/npu_resource_pool.cc
index d837e90c3c42c..e7c302289dbfe 100644
--- a/paddle/fluid/platform/device/npu/npu_resource_pool.cc
+++ b/paddle/fluid/platform/device/npu/npu_resource_pool.cc
@@ -14,6 +14,7 @@
 
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_resource_pool.h"
+
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/npu/npu_stream.cc b/paddle/fluid/platform/device/npu/npu_stream.cc
index 0b15a0d937e82..55a73146815c9 100644
--- a/paddle/fluid/platform/device/npu/npu_stream.cc
+++ b/paddle/fluid/platform/device/npu/npu_stream.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device/npu/npu_stream.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt
index 3399fff087f8d..19656bf1cce64 100644
--- a/paddle/fluid/platform/device/xpu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt
@@ -2,11 +2,32 @@ if(NOT WITH_XPU)
   return()
 endif()
 
-set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl)
+set(XPU_CTX_DEPS
+    xpulib
+    ssl
+    crypto
+    rt
+    z
+    resolv
+    dl)
 
-
-cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info)
-cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type)
-cc_library(xpu_resource_pool SRCS xpu_resource_pool.cc DEPS xpu_info)
+cc_library(
+  xpu_info
+  SRCS xpu_info.cc
+  DEPS gflags
+       glog
+       enforce
+       xpulib
+       device_context
+       place
+       phi_xpu_info)
+cc_library(
+  xpu_op_list
+  SRCS xpu_op_list.cc
+  DEPS gflags glog enforce xpulib device_context op_kernel_type)
+cc_library(
+  xpu_resource_pool
+  SRCS xpu_resource_pool.cc
+  DEPS xpu_info)
 
 add_subdirectory(tests)
diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h
index 24fd8b5faa4e9..a7a3e4f060529 100644
--- a/paddle/fluid/platform/device/xpu/bkcl_helper.h
+++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <stdio.h>
+
 #include <memory>
 #include <string>
 #include <thread>  // NOLINT
@@ -217,7 +218,7 @@ class BKCLCommunicator {
    *bkcl_all_reduce
    *parallelly. So create a new bkcl comm for sync_batch_norm_op. And these
    *codes should be polished with a unified bkcl management.
-  */
+   */
   BKCLContextMap *GetSyncBatchNormCtx(
       framework::Scope *scope, const std::vector<platform::Place> &places) {
     auto *bkcl_id_var = scope->FindVar(BKCL_ID_VARNAME);
diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h
index c55d91c301550..77d14aa712e70 100644
--- a/paddle/fluid/platform/device/xpu/enforce_xpu.h
+++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
-
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt b/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt
index 6d98fefcf8317..e51896df6159a 100644
--- a/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt
+++ b/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt
@@ -1 +1,4 @@
-cc_test(enforce_xpu_test SRCS enforce_xpu_test.cc DEPS stringpiece)
+cc_test(
+  enforce_xpu_test
+  SRCS enforce_xpu_test.cc
+  DEPS stringpiece)
diff --git a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc
index 8cba98f3fb352..0b528c3999e07 100644
--- a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc
+++ b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
+
 #include "gtest/gtest.h"
 
 template <typename T>
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 99f8e5ace9c00..b94d0353e5dd5 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -38,6 +38,11 @@ XPUOpMap& get_kl2_ops() {
       {"argsort", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                                 pOpKernelType(vartype::INT64, XPUPlace()),
                                 pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                               pOpKernelType(vartype::FP64, XPUPlace()),
+                               pOpKernelType(vartype::INT32, XPUPlace()),
+                               pOpKernelType(vartype::INT64, XPUPlace()),
+                               pOpKernelType(vartype::BOOL, XPUPlace())})},
       {"assign_value",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"batch_norm_grad",
@@ -209,6 +214,8 @@ XPUOpMap& get_kl2_ops() {
                                   pOpKernelType(vartype::FP16, XPUPlace())})},
       {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                              pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"generate_proposals_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"greater_equal",
        XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
                      pOpKernelType(vartype::INT32, XPUPlace()),
diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc
index cdd7ee7f806e9..dbc8ed4a51aaf 100644
--- a/paddle/fluid/platform/device/xpu/xpu_info.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_info.cc
@@ -13,14 +13,13 @@ limitations under the License. */
 #include <algorithm>
 #include <cstdlib>
 #include <string>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device/xpu/enforce_xpu.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
-
 #include "paddle/phi/backends/xpu/xpu_info.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h
index 38b4defadc6c3..2dd0f3275309e 100644
--- a/paddle/fluid/platform/device/xpu/xpu_info.h
+++ b/paddle/fluid/platform/device/xpu/xpu_info.h
@@ -12,6 +12,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 #include <vector>
+
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/xpu/xpu_info.h"
 #include "xpu/runtime.h"
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
index 778c18146d64d..452f388f03dcf 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -113,6 +113,12 @@ XPUOpMap& get_kp_ops() {
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"c_sync_calc_stream",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"c_sync_comm_stream",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"c_allreduce_sum",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
   };
 
   return s_xpu_kp_kernels;
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 0738514336201..8ace4d1a32c50 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
+
 #include <mutex>
 #include <string>
 #include <unordered_set>
@@ -17,7 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/xpu/xpu2_op_list.h"
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h"
-#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc
index a4226dabf9d52..4ee32ad5a03cd 100644
--- a/paddle/fluid/platform/device_code.cc
+++ b/paddle/fluid/platform/device_code.cc
@@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/device_code.h"
+
 #include <sys/stat.h>
+
 #include <algorithm>
 #include <set>
 #include <utility>
 
-#include "paddle/fluid/platform/device_code.h"
 #include "paddle/fluid/platform/enforce.h"
 
 DECLARE_string(cuda_dir);
diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc
index 7da8c56138543..cb2649686ec02 100644
--- a/paddle/fluid/platform/device_code_test.cc
+++ b/paddle/fluid/platform/device_code_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device_code.h"
+
 #include <utility>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/init.h"
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 09a29c3429cba..0bd606257f541 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
+
 #include <functional>
 #include <memory>
 #include <set>
+
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index a63d41405f1b2..d0dae706ba572 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -21,13 +21,12 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/custom/custom_context.h"
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/core/device_context.h"
-
-#include "paddle/fluid/memory/malloc.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index 08a04a9565af7..2db29dc11ada0 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -11,18 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/device_context.h"
-
 #include <vector>
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/platform/device_context.h"
 
 TEST(Device, Init) {
-  using paddle::platform::DeviceContext;
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
+  using paddle::platform::DeviceContext;
 
   int count = paddle::platform::GetGPUDeviceCount();
   for (int i = 0; i < count; i++) {
@@ -94,11 +93,11 @@ TEST(Device, CUDADeviceContext) {
 }
 
 TEST(Device, DeviceContextPool) {
-  using paddle::platform::DeviceContextPool;
-  using paddle::platform::CUDADeviceContext;
-  using paddle::platform::Place;
   using paddle::platform::CPUPlace;
+  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
+  using paddle::platform::DeviceContextPool;
+  using paddle::platform::Place;
 
   DeviceContextPool& pool = DeviceContextPool::Instance();
   auto cpu_dev_ctx1 = pool.Get(CPUPlace());
diff --git a/paddle/fluid/platform/device_context_xpu_test.cc b/paddle/fluid/platform/device_context_xpu_test.cc
index 3de2e3957a990..50cb0f98d334f 100644
--- a/paddle/fluid/platform/device_context_xpu_test.cc
+++ b/paddle/fluid/platform/device_context_xpu_test.cc
@@ -11,12 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/device_context.h"
-
 #include <vector>
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/platform/device_context.h"
 
 TEST(Device, Init) {
   using paddle::platform::DeviceContext;
@@ -33,10 +32,10 @@ TEST(Device, Init) {
 }
 
 TEST(Device, DeviceContextPool) {
+  using paddle::platform::CPUPlace;
   using paddle::platform::DeviceContextPool;
-  using paddle::platform::XPUDeviceContext;
   using paddle::platform::Place;
-  using paddle::platform::CPUPlace;
+  using paddle::platform::XPUDeviceContext;
   using paddle::platform::XPUPlace;
 
   DeviceContextPool& pool = DeviceContextPool::Instance();
diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h
index 463329d32c936..82d93dee3989f 100644
--- a/paddle/fluid/platform/device_event.h
+++ b/paddle/fluid/platform/device_event.h
@@ -23,8 +23,8 @@
  *  for USE_PASS from pass_library.
  */
 
-using ::paddle::platform::kCUDA;
 using ::paddle::platform::kCPU;
+using ::paddle::platform::kCUDA;
 
 USE_EVENT(kCPU)
 USE_EVENT_WAIT(kCPU, kCPU)
diff --git a/paddle/fluid/platform/device_event_base.cc b/paddle/fluid/platform/device_event_base.cc
index 67fad3857f2c1..374de7d923f30 100644
--- a/paddle/fluid/platform/device_event_base.cc
+++ b/paddle/fluid/platform/device_event_base.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/device_event_base.h"
+
 #include "paddle/fluid/platform/device_event_cpu.h"
 #include "paddle/fluid/platform/event.h"
 
diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h
index 8fe5ef9fcb107..4e751aa6d133a 100644
--- a/paddle/fluid/platform/device_event_base.h
+++ b/paddle/fluid/platform/device_event_base.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <memory>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/platform/device_event_cpu.h b/paddle/fluid/platform/device_event_cpu.h
index 6e2bf4c7ad135..1620dffdabd51 100644
--- a/paddle/fluid/platform/device_event_cpu.h
+++ b/paddle/fluid/platform/device_event_cpu.h
@@ -16,6 +16,7 @@
 #include <atomic>
 #include <condition_variable>
 #include <mutex>
+
 #include "paddle/fluid/platform/device_event_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc
index f42ccc5a1db54..f176d1a0d5dbd 100644
--- a/paddle/fluid/platform/device_event_gpu.cc
+++ b/paddle/fluid/platform/device_event_gpu.cc
@@ -101,8 +101,8 @@ void EventResetCUDA(const DeviceEvent* event) {
 }  // namespace platform
 }  // namespace paddle
 
-using ::paddle::platform::kCUDA;
 using ::paddle::platform::kCPU;
+using ::paddle::platform::kCUDA;
 REGISTER_EVENT_CREATE_FUNCTION(kCUDA, paddle::platform::DeviceEventCreateCUDA)
 REGISTER_EVENT_RECORD_FUNCTION(kCUDA, paddle::platform::DeviceEventRecordCUDA)
 REGISTER_EVENT_QUERY_FUNCTION(kCUDA, paddle::platform::DeviceEventQueryCUDA)
diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc
index d9f744b26256b..92fe7c02bd0bd 100644
--- a/paddle/fluid/platform/device_event_test.cc
+++ b/paddle/fluid/platform/device_event_test.cc
@@ -13,15 +13,16 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/device_event.h"
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/place.h"
 
-using ::paddle::platform::kCUDA;
 using ::paddle::platform::kCPU;
+using ::paddle::platform::kCUDA;
 
-using paddle::platform::DeviceEvent;
 using paddle::platform::DeviceContextPool;
+using paddle::platform::DeviceEvent;
 
 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 73847ce24aa72..fa345ed31cbb2 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/device_tracer.h"
+
 #include <deque>
 #include <forward_list>
 #include <fstream>
@@ -20,7 +22,6 @@ limitations under the License. */
 #include <thread>  // NOLINT
 
 #include "glog/logging.h"
-#include "paddle/fluid/platform/device_tracer.h"
 
 DECLARE_bool(enable_host_event_recorder_hook);
 
@@ -255,7 +256,9 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
             }
             break;
           }
-          default: { break; }
+          default: {
+            break;
+          }
         }
       } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
         // Seems not an error in this case.
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 1f95e12127104..bba0ad35e0216 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,55 +1,89 @@
-cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce phi_dynamic_loader)
+cc_library(
+  dynamic_loader
+  SRCS dynamic_loader.cc
+  DEPS glog gflags enforce phi_dynamic_loader)
 
-list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc  nvtx.cc cufft.cc)
+list(
+  APPEND
+  CUDA_SRCS
+  cublas.cc
+  cublasLt.cc
+  cudnn.cc
+  curand.cc
+  cusolver.cc
+  cusparse.cc
+  nvtx.cc
+  cufft.cc)
 
-if (NOT WITH_NV_JETSON)
-    list(APPEND CUDA_SRCS nvjpeg.cc)
+if(NOT WITH_NV_JETSON)
+  list(APPEND CUDA_SRCS nvjpeg.cc)
 endif()
 
-if (WITH_ROCM)
+if(WITH_ROCM)
   list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
 endif()
 
 # There is no macOS version of NCCL.
 # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows.
-if (NOT APPLE)
-    list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
-  if (WITH_NCCL)
+if(NOT APPLE)
+  list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
+  if(WITH_NCCL)
     list(APPEND CUDA_SRCS nccl.cc)
   endif()
-  if (WITH_ROCM)
+  if(WITH_ROCM)
     list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
-    if (WITH_RCCL)
+    if(WITH_RCCL)
       list(APPEND HIP_SRCS rccl.cc)
     endif()
   endif()
 endif()
 
-if (TENSORRT_FOUND)
+if(TENSORRT_FOUND)
   list(APPEND CUDA_SRCS tensorrt.cc)
 endif()
 
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
-if (CUPTI_FOUND)
-    list(APPEND CUDA_SRCS cupti.cc)
+if(CUPTI_FOUND)
+  list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 if(WITH_ROCM)
-  hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader phi_dynload_cuda)
-  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc)
-elseif (WITH_ASCEND_CL)
-  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl phi_dynload_warpctc)
+  hip_library(
+    dynload_cuda
+    SRCS ${HIP_SRCS}
+    DEPS dynamic_loader phi_dynload_cuda)
+  cc_library(
+    dynload_warpctc
+    SRCS warpctc.cc
+    DEPS dynamic_loader warpctc phi_dynload_warpctc)
+elseif(WITH_ASCEND_CL)
+  cc_library(
+    dynload_warpctc
+    SRCS warpctc.cc
+    DEPS dynamic_loader warpctc npu_hccl phi_dynload_warpctc)
 else()
-  nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader phi_dynload_cuda)
-  cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc)
+  nv_library(
+    dynload_cuda
+    SRCS ${CUDA_SRCS}
+    DEPS dynamic_loader phi_dynload_cuda)
+  cc_library(
+    dynload_warpctc
+    SRCS warpctc.cc
+    DEPS dynamic_loader warpctc phi_dynload_warpctc)
 endif()
-if (WITH_MKLML)
-    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml phi_dynload_mklml)
+if(WITH_MKLML)
+  cc_library(
+    dynload_mklml
+    SRCS mklml.cc
+    DEPS dynamic_loader mklml phi_dynload_mklml)
 endif()
 
 # TODO(TJ): add iomp, mkldnn?
 
-if (MKL_FOUND AND WITH_ONEMKL)
+if(MKL_FOUND AND WITH_ONEMKL)
   message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
-  cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader phi_dynload_mklrt)
+  cc_library(
+    dynload_mklrt
+    SRCS mklrt.cc
+    DEPS dynamic_loader phi_dynload_mklrt)
   target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE})
 endif()
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index d7d43cecc25dd..496b253dff5b3 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cublasXt.h>
 #include <cublas_v2.h>
 #include <cuda.h>
+
 #include <mutex>  // NOLINT
 #include <type_traits>
 
diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h
index 5157cfdad2e59..3a1d28072c591 100644
--- a/paddle/fluid/platform/dynload/cublasLt.h
+++ b/paddle/fluid/platform/dynload/cublasLt.h
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include <cublasLt.h>
 #include <cuda.h>
+
 #include <mutex>  // NOLINT
 #include <type_traits>
 
diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc
index a0f9647f08934..c6851594b803b 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.cc
+++ b/paddle/fluid/platform/dynload/cuda_driver.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cuda_driver.h"
+
 #include "paddle/phi/backends/dynload/cuda_driver.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
index f5550e9f9fe39..b696ffc1a3be8 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cuda.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/cuda_driver.h"
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index 553792d3bbf25..05cacb74c8673 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cudnn.h"
+
 #include "paddle/phi/backends/dynload/cudnn.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index bf4bb08a696ed..9af1e8065c49d 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
 #include <glog/logging.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/cudnn.h"
diff --git a/paddle/fluid/platform/dynload/cufft.cc b/paddle/fluid/platform/dynload/cufft.cc
index 1996ab16167f1..6a06c4bdb6ac4 100644
--- a/paddle/fluid/platform/dynload/cufft.cc
+++ b/paddle/fluid/platform/dynload/cufft.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cufft.h"
+
 #include "paddle/phi/backends/dynload/cufft.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/cufft.h b/paddle/fluid/platform/dynload/cufft.h
index 6c3a0992d758d..d79603a5a01fc 100644
--- a/paddle/fluid/platform/dynload/cufft.h
+++ b/paddle/fluid/platform/dynload/cufft.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cufft.h>
 #include <cufftXt.h>
 #include <glog/logging.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/cufft.h"
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index 854e5a7b9f04a..8e08785f20925 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cuda.h>
 #include <cuda_occupancy.h>
 #include <cupti.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/cupti.h"
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 1fdd9240284dc..f4065a196d3c4 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <curand.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/curand.h"
diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h
index 212c350ebb288..854de23150cad 100644
--- a/paddle/fluid/platform/dynload/cusolver.h
+++ b/paddle/fluid/platform/dynload/cusolver.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <cuda.h>
 #include <cusolverDn.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/cusolver.h"
diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h
index b4b9352167829..925852bb4158b 100644
--- a/paddle/fluid/platform/dynload/cusparse.h
+++ b/paddle/fluid/platform/dynload/cusparse.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <cuda.h>
 #include <cusparse.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/cusparse.h"
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 5ce63b244efde..2f24e1b87daba 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "gflags/gflags.h"
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 
diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc
index 6c4a4bfd0dedc..d9bb3fd2c4214 100644
--- a/paddle/fluid/platform/dynload/hiprtc.cc
+++ b/paddle/fluid/platform/dynload/hiprtc.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/hiprtc.h"
+
 #include "paddle/phi/backends/dynload/hiprtc.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h
index 851dadbac63d2..f27d5d808f77b 100644
--- a/paddle/fluid/platform/dynload/hiprtc.h
+++ b/paddle/fluid/platform/dynload/hiprtc.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <hip/hiprtc.h>
+
 #include <mutex>  // NOLINT
+
 #include "paddle/phi/backends/dynload/hiprtc.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc
index 9660188b68d4f..15012531b4c9f 100644
--- a/paddle/fluid/platform/dynload/miopen.cc
+++ b/paddle/fluid/platform/dynload/miopen.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/miopen.h"
+
 #include "paddle/phi/backends/dynload/cudnn.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h
index b99cd5ebb6e15..20b92b170511c 100644
--- a/paddle/fluid/platform/dynload/miopen.h
+++ b/paddle/fluid/platform/dynload/miopen.h
@@ -14,10 +14,11 @@ limitations under the License. */
 
 #pragma once
 #include <glog/logging.h>
-
 #include <miopen/miopen.h>
 #include <miopen/version.h>
+
 #include <mutex>  // NOLINT
+
 #include "paddle/phi/backends/dynload/miopen.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 1c7d0c17a0fc8..78cae9a082153 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <mkl.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/mklml.h"
diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h
index 334b98a1c3d5a..e1a2bedfa8e2c 100644
--- a/paddle/fluid/platform/dynload/mklrt.h
+++ b/paddle/fluid/platform/dynload/mklrt.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <mkl_dfti.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index a38d1d4272e39..c2052719dd56c 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <nccl.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/nccl.h"
diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h
index 8aaf672fe67b9..026a3b6488606 100644
--- a/paddle/fluid/platform/dynload/nvjpeg.h
+++ b/paddle/fluid/platform/dynload/nvjpeg.h
@@ -12,6 +12,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 #include <nvjpeg.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/nvjpeg.h"
diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc
index a032299827742..242aa912ad838 100644
--- a/paddle/fluid/platform/dynload/nvrtc.cc
+++ b/paddle/fluid/platform/dynload/nvrtc.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/nvrtc.h"
+
 #include "paddle/phi/backends/dynload/nvrtc.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index 5ca8860c5acbe..e03235e116f25 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <nvrtc.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/nvrtc.h"
diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h
index 3f974eca1d00b..c3dc9e31df354 100644
--- a/paddle/fluid/platform/dynload/nvtx.h
+++ b/paddle/fluid/platform/dynload/nvtx.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifndef _WIN32
 #include <cuda.h>
 #include <nvToolsExt.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/nvtx.h"
diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h
index 7bb4992c89cb9..2f874bb59f593 100644
--- a/paddle/fluid/platform/dynload/rccl.h
+++ b/paddle/fluid/platform/dynload/rccl.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <rccl.h>
 
 #include <mutex>  // NOLINT
+
 #include "paddle/phi/backends/dynload/rccl.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h
index 04f4fdd9506da..5cec6fb48798b 100644
--- a/paddle/fluid/platform/dynload/rocblas.h
+++ b/paddle/fluid/platform/dynload/rocblas.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <hip/hip_runtime.h>
 #include <rocblas.h>
+
 #include <mutex>  // NOLINT
 #include <type_traits>
 
diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc
index 088129f3f8d02..4fa20c5c4bbb8 100644
--- a/paddle/fluid/platform/dynload/rocm_driver.cc
+++ b/paddle/fluid/platform/dynload/rocm_driver.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/rocm_driver.h"
+
 #include "paddle/phi/backends/dynload/rocm_driver.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h
index 5a902239fefd4..5c8e18611c40a 100644
--- a/paddle/fluid/platform/dynload/rocm_driver.h
+++ b/paddle/fluid/platform/dynload/rocm_driver.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <hip/hip_runtime.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/rocm_driver.h"
diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc
index 8153877b7bbb8..8d700faac0c14 100644
--- a/paddle/fluid/platform/dynload/tensorrt.cc
+++ b/paddle/fluid/platform/dynload/tensorrt.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/tensorrt.h"
+
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 772a7750fe90d..1106eef455957 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -36,6 +36,7 @@ limitations under the License. */
 #include <cusparse.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+
 #include "paddle/fluid/platform/external_error.pb.h"
 #endif  // PADDLE_WITH_CUDA
 
@@ -77,6 +78,7 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/cusolver.h"
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 #include <error.h>
+
 #include "paddle/phi/backends/dynload/nccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA
@@ -88,6 +90,7 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/rocblas.h"
 #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
 #include <error.h>  // NOLINT
+
 #include "paddle/phi/backends/dynload/rccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_HIP
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index b9e4239299169..771c4853f6f24 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -63,19 +63,22 @@ TEST(ENFORCE, FAILED) {
 TEST(ENFORCE, NO_ARG_OK) {
   int a = 2;
   int b = 2;
-  PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_EQ tests failed."));
+  PADDLE_ENFORCE_EQ(
+      a, b,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_EQ tests failed."));
   // test enforce with extra message.
-  PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
-                              "Some %s wrong in PADDLE_ENFORCE_EQ.", "info"));
+  PADDLE_ENFORCE_EQ(a, b,
+                    paddle::platform::errors::Unavailable(
+                        "Some %s wrong in PADDLE_ENFORCE_EQ.", "info"));
 }
 
 TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
   int a = 2;
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument(
-                                    "The result is not equal correct result."));
+    PADDLE_ENFORCE_EQ(a, 1 + 3,
+                      paddle::platform::errors::InvalidArgument(
+                          "The result is not equal correct result."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -89,8 +92,9 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
   int a = 2;
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument(
-                                    "The result is not equal correct result."));
+    PADDLE_ENFORCE_EQ(a, 1 + 3,
+                      paddle::platform::errors::InvalidArgument(
+                          "The result is not equal correct result."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -102,10 +106,12 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
 }
 
 TEST(ENFORCE_NE, OK) {
-  PADDLE_ENFORCE_NE(1, 2, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_NE tests failed."));
-  PADDLE_ENFORCE_NE(1.0, 2UL, paddle::platform::errors::Unavailable(
-                                  "PADDLE_ENFORCE_NE tests failed."));
+  PADDLE_ENFORCE_NE(
+      1, 2,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_NE tests failed."));
+  PADDLE_ENFORCE_NE(
+      1.0, 2UL,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_NE tests failed."));
 }
 TEST(ENFORCE_NE, FAIL) {
   bool caught_exception = false;
@@ -125,14 +131,16 @@ TEST(ENFORCE_NE, FAIL) {
 }
 
 TEST(ENFORCE_GT, OK) {
-  PADDLE_ENFORCE_GT(2, 1, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_GT tests failed."));
+  PADDLE_ENFORCE_GT(
+      2, 1,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GT tests failed."));
 }
 TEST(ENFORCE_GT, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument(
-                                "Expected 1 > 2, but received 1:1 <= 2:2."));
+    PADDLE_ENFORCE_GT(1, 2,
+                      paddle::platform::errors::InvalidArgument(
+                          "Expected 1 > 2, but received 1:1 <= 2:2."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -143,18 +151,22 @@ TEST(ENFORCE_GT, FAIL) {
 }
 
 TEST(ENFORCE_GE, OK) {
-  PADDLE_ENFORCE_GE(2, 2, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_GE tests failed."));
-  PADDLE_ENFORCE_GE(3, 2, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_GE tests failed."));
-  PADDLE_ENFORCE_GE(3.21, 2.0, paddle::platform::errors::Unavailable(
-                                   "PADDLE_ENFORCE_GE tests failed."));
+  PADDLE_ENFORCE_GE(
+      2, 2,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed."));
+  PADDLE_ENFORCE_GE(
+      3, 2,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed."));
+  PADDLE_ENFORCE_GE(
+      3.21, 2.0,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed."));
 }
 TEST(ENFORCE_GE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GE(1, 2, paddle::platform::errors::InvalidArgument(
-                                "Expected 1 >= 2, but received 1:1 < 2:2."));
+    PADDLE_ENFORCE_GE(1, 2,
+                      paddle::platform::errors::InvalidArgument(
+                          "Expected 1 >= 2, but received 1:1 < 2:2."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -165,22 +177,28 @@ TEST(ENFORCE_GE, FAIL) {
 }
 
 TEST(ENFORCE_LE, OK) {
-  PADDLE_ENFORCE_LE(1, 1, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_LE tests failed."));
-  PADDLE_ENFORCE_LE(1UL, 1UL, paddle::platform::errors::Unavailable(
-                                  "PADDLE_ENFORCE_LE tests failed."));
-  PADDLE_ENFORCE_LE(2, 3, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_LE tests failed."));
-  PADDLE_ENFORCE_LE(2UL, 3UL, paddle::platform::errors::Unavailable(
-                                  "PADDLE_ENFORCE_LE tests failed."));
-  PADDLE_ENFORCE_LE(2.0, 3.2, paddle::platform::errors::Unavailable(
-                                  "PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(
+      1, 1,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(
+      1UL, 1UL,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(
+      2, 3,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(
+      2UL, 3UL,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(
+      2.0, 3.2,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed."));
 }
 TEST(ENFORCE_LE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument(
-                                "Expected 1 > 2, but received 1:1 <= 2:2."));
+    PADDLE_ENFORCE_GT(1, 2,
+                      paddle::platform::errors::InvalidArgument(
+                          "Expected 1 > 2, but received 1:1 <= 2:2."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -191,12 +209,15 @@ TEST(ENFORCE_LE, FAIL) {
 }
 
 TEST(ENFORCE_LT, OK) {
-  PADDLE_ENFORCE_LT(3, 10, paddle::platform::errors::Unavailable(
-                               "PADDLE_ENFORCE_LT tests failed."));
-  PADDLE_ENFORCE_LT(2UL, 3UL, paddle::platform::errors::Unavailable(
-                                  "PADDLE_ENFORCE_LT tests failed."));
-  PADDLE_ENFORCE_LT(2, 3, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_LT tests failed."));
+  PADDLE_ENFORCE_LT(
+      3, 10,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed."));
+  PADDLE_ENFORCE_LT(
+      2UL, 3UL,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed."));
+  PADDLE_ENFORCE_LT(
+      2, 3,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed."));
 }
 TEST(ENFORCE_LT, FAIL) {
   bool caught_exception = false;
@@ -263,16 +284,18 @@ std::ostream& operator<<(std::ostream& os, const Dims& d) {
 
 TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
   Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}};
-  PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
-                              "PADDLE_ENFORCE_EQ tests failed."));
+  PADDLE_ENFORCE_EQ(
+      a, b,
+      paddle::platform::errors::Unavailable("PADDLE_ENFORCE_EQ tests failed."));
 }
 
 TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
-                                "PADDLE_ENFORCE_EQ tests failed."));
+    PADDLE_ENFORCE_EQ(a, b,
+                      paddle::platform::errors::Unavailable(
+                          "PADDLE_ENFORCE_EQ tests failed."));
   } catch (paddle::platform::EnforceNotMet&) {
     caught_exception = true;
   }
@@ -481,10 +504,12 @@ TEST(enforce, cannot_to_string_type) {
                 "int can be converted to string");
   CannotToStringType obj1(3), obj2(4), obj3(3);
 
-  PADDLE_ENFORCE_NE(obj1, obj2, paddle::platform::errors::InvalidArgument(
-                                    "Object 1 is not equal to Object 2"));
-  PADDLE_ENFORCE_EQ(obj1, obj3, paddle::platform::errors::InvalidArgument(
-                                    "Object 1 is equal to Object 3"));
+  PADDLE_ENFORCE_NE(obj1, obj2,
+                    paddle::platform::errors::InvalidArgument(
+                        "Object 1 is not equal to Object 2"));
+  PADDLE_ENFORCE_EQ(obj1, obj3,
+                    paddle::platform::errors::InvalidArgument(
+                        "Object 1 is equal to Object 3"));
 
   std::string msg = "Compare obj1 with obj2";
   try {
diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h
index 57f5b3a7c9374..758af3e2d9137 100644
--- a/paddle/fluid/platform/errors.h
+++ b/paddle/fluid/platform/errors.h
@@ -18,5 +18,5 @@ namespace paddle {
 namespace platform {
 namespace errors = ::phi::errors;
 using error = ::phi::ErrorCode;
-}
-}
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/errors_test.cc b/paddle/fluid/platform/errors_test.cc
index 712b67a654c40..8b11c1d2d2492 100644
--- a/paddle/fluid/platform/errors_test.cc
+++ b/paddle/fluid/platform/errors_test.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/errors.h"
+
 #include <string>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/errors.h"
 
 using namespace paddle::platform::errors;  // NOLINT
 
diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h
index bef551078b332..f2a150c301216 100644
--- a/paddle/fluid/platform/fast_divmod.h
+++ b/paddle/fluid/platform/fast_divmod.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cstdint>
+
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 
 #define INT_BITS 32
diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h
index b9d78c2e9dc39..0a38d61293978 100644
--- a/paddle/fluid/platform/flags.h
+++ b/paddle/fluid/platform/flags.h
@@ -18,6 +18,7 @@
 #include <map>
 #include <string>
 #include <type_traits>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/variant.h"
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 2c00854e082eb..dc7fdc6b443d9 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+
 #include <bitset>
 #include <iostream>
 
diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc
index bbec743d26f3b..45ca4a6f27765 100644
--- a/paddle/fluid/platform/gen_comm_id_helper.cc
+++ b/paddle/fluid/platform/gen_comm_id_helper.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <netinet/in.h>
 #include <stdlib.h>
 #include <sys/socket.h>
+
 #include <algorithm>
 #include <string>
 #include <thread>  // NOLINT
diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc
index 5301dd307590b..bc5bd274bf8a7 100644
--- a/paddle/fluid/platform/init_test.cc
+++ b/paddle/fluid/platform/init_test.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/init.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
 #ifdef PADDLE_WITH_MLU
diff --git a/paddle/fluid/platform/lock_guard_ptr.h b/paddle/fluid/platform/lock_guard_ptr.h
index bff24e74a7070..66d6e446d3f16 100644
--- a/paddle/fluid/platform/lock_guard_ptr.h
+++ b/paddle/fluid/platform/lock_guard_ptr.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <stdint.h>
+
 #include <memory>
 #include <mutex>  // NOLINT
 namespace paddle {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 5476d244f6035..382f96e83bfce 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -1061,16 +1061,18 @@ static void SetDstMemoryQuantized(
   const size_t dst_dims = dst_tz.size();
   MKLDNNMemoryFormat dst_fmt;
 
-  PADDLE_ENFORCE_LE(dst_dims, 5, platform::errors::InvalidArgument(
-                                     "Dst memory for quantization can not have "
-                                     "dims > 5. But received dst_dims is %d.",
-                                     dst_dims));
+  PADDLE_ENFORCE_LE(dst_dims, 5,
+                    platform::errors::InvalidArgument(
+                        "Dst memory for quantization can not have "
+                        "dims > 5. But received dst_dims is %d.",
+                        dst_dims));
   dst_fmt = platform::MKLDNNFormatForSize(dst_dims, output_format);
 
-  auto tmp_dst_md = platform::MKLDNNMemDesc(
-      {dst_tz}, paddle::framework::ToMKLDNNDataType(
-                    framework::DataTypeTrait<T>::DataType()),
-      dst_fmt);
+  auto tmp_dst_md =
+      platform::MKLDNNMemDesc({dst_tz},
+                              paddle::framework::ToMKLDNNDataType(
+                                  framework::DataTypeTrait<T>::DataType()),
+                              dst_fmt);
   dst_md.reset(new dnnl::memory::desc(tmp_dst_md));
   dst_memory.reset(
       new dnnl::memory(*dst_md, engine, to_void_cast<T>(output_data)));
diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h
index dc9abaf36d825..e7612f6dcb6cd 100644
--- a/paddle/fluid/platform/monitor.h
+++ b/paddle/fluid/platform/monitor.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <stdio.h>
+
 #include <atomic>
 #include <memory>
 #include <mutex>  // NOLINT
diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc
index 36dd7891d5518..694f701b5ad9b 100644
--- a/paddle/fluid/platform/os_info.cc
+++ b/paddle/fluid/platform/os_info.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/os_info.h"
+
 #include <functional>
 #include <sstream>
 #include <thread>
diff --git a/paddle/fluid/platform/os_info_test.cc b/paddle/fluid/platform/os_info_test.cc
index b3311f1d19e63..149da6ba27aea 100644
--- a/paddle/fluid/platform/os_info_test.cc
+++ b/paddle/fluid/platform/os_info_test.cc
@@ -12,13 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/platform/os_info.h"
+
 #include <thread>
+
 #include "gtest/gtest.h"
 
 TEST(ThreadInfo, TestThreadIdUtils) {
-  using paddle::platform::GetCurrentThreadStdId;
-  using paddle::platform::GetCurrentThreadId;
   using paddle::platform::GetAllThreadIds;
+  using paddle::platform::GetCurrentThreadId;
+  using paddle::platform::GetCurrentThreadStdId;
   EXPECT_EQ(std::hash<std::thread::id>()(std::this_thread::get_id()),
             GetCurrentThreadId().std_tid);
   auto ids = GetAllThreadIds();
@@ -26,10 +28,10 @@ TEST(ThreadInfo, TestThreadIdUtils) {
 }
 
 TEST(ThreadInfo, TestThreadNameUtils) {
-  using paddle::platform::GetCurrentThreadStdId;
+  using paddle::platform::GetAllThreadNames;
   using paddle::platform::GetCurrentThreadName;
+  using paddle::platform::GetCurrentThreadStdId;
   using paddle::platform::SetCurrentThreadName;
-  using paddle::platform::GetAllThreadNames;
   SetCurrentThreadName("MainThread");
   EXPECT_FALSE(SetCurrentThreadName("MainThread"));
   auto names = GetAllThreadNames();
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 75abf36e676d0..c573650f1791f 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/profiler.h"
+
 #include <mutex>  // NOLINT
 #include <random>
 #include <sstream>
@@ -20,7 +22,6 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler/common_event.h"
 #include "paddle/fluid/platform/profiler/host_event_recorder.h"
 #include "paddle/fluid/platform/profiler/host_tracer.h"
diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt
index 084bc44dbc78b..ea3111b73613a 100644
--- a/paddle/fluid/platform/profiler/CMakeLists.txt
+++ b/paddle/fluid/platform/profiler/CMakeLists.txt
@@ -1,14 +1,52 @@
-cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
-cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
+cc_library(
+  host_tracer
+  SRCS host_tracer.cc
+  DEPS enforce)
+cc_library(
+  cuda_tracer
+  SRCS cuda_tracer.cc cupti_data_process.cc
+  DEPS workqueue_utils enforce glog)
 add_subdirectory(mlu)
-cc_library(event_node SRCS event_node.cc DEPS enforce)
-cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
+cc_library(
+  event_node
+  SRCS event_node.cc
+  DEPS enforce)
+cc_library(
+  profiler_utils
+  SRCS utils.cc
+  DEPS enforce glog)
 add_subdirectory(dump)
-cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils)
-cc_library(event_bind SRCS event_python.cc DEPS profiler_logger)
-cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
-cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer)
-cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger)
-cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
-cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind)
-cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler)
+cc_library(
+  profiler_logger
+  SRCS chrometracing_logger.cc dump/serialization_logger.cc
+       dump/deserialization_reader.cc
+  DEPS nodetreeproto event_node profiler_utils)
+cc_library(
+  event_bind
+  SRCS event_python.cc
+  DEPS profiler_logger)
+cc_library(
+  cpu_utilization
+  SRCS cpu_utilization.cc
+  DEPS cpu_info os_info enforce glog)
+cc_library(
+  new_profiler
+  SRCS profiler.cc
+  DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind
+       mlu_tracer)
+cc_test(
+  test_event_node
+  SRCS test_event_node.cc
+  DEPS event_node profiler_logger)
+cc_test(
+  test_extra_info
+  SRCS test_extra_info.cc
+  DEPS profiler_utils)
+cc_test(
+  test_serialization_logger
+  SRCS dump/test_serialization_logger.cc
+  DEPS event_bind)
+cc_test(
+  new_profiler_test
+  SRCS profiler_test.cc
+  DEPS new_profiler)
diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc
index 4ee95a530fb43..f728a820bd73c 100644
--- a/paddle/fluid/platform/profiler/chrometracing_logger.cc
+++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/platform/profiler/chrometracing_logger.h"
+
 #include <cstdio>
 #include <ctime>
 #include <limits>
 
 #include "glog/logging.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/profiler/chrometracing_logger.h"
 #include "paddle/fluid/platform/profiler/event_node.h"
 #include "paddle/fluid/platform/profiler/utils.h"
 
@@ -304,9 +304,10 @@ void ChromeTracingLogger::HandleTypeKernel(
   blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
                                      kernel_info.grid_z) /
                   device_property.multiProcessorCount;
-  warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y *
-                                  kernel_info.block_z) /
-                 threads_per_warp;
+  warps_per_sm =
+      blocks_per_sm *
+      (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
+      threads_per_warp;
   occupancy = CalculateEstOccupancy(
       device_node.DeviceId(), kernel_info.registers_per_thread,
       kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory,
diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h
index 8977ab748c63a..12d98d1ef0c63 100644
--- a/paddle/fluid/platform/profiler/chrometracing_logger.h
+++ b/paddle/fluid/platform/profiler/chrometracing_logger.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <set>
 #include <unordered_map>
 #include <utility>
+
 #include "paddle/fluid/platform/profiler/output_logger.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h
index cfdc3be110a5b..8fe3b15052306 100644
--- a/paddle/fluid/platform/profiler/common_event.h
+++ b/paddle/fluid/platform/profiler/common_event.h
@@ -17,6 +17,7 @@
 #include <cstring>
 #include <functional>
 #include <string>
+
 #include "paddle/fluid/platform/event.h"  // import EventRole, TODO(TIEXING): remove later
 #include "paddle/fluid/platform/profiler/trace_event.h"
 
diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc
index d507153d3f5b4..4319841c8a93b 100644
--- a/paddle/fluid/platform/profiler/cpu_utilization.cc
+++ b/paddle/fluid/platform/profiler/cpu_utilization.cc
@@ -54,12 +54,13 @@ void CpuUtilization::RecordBeginTimeInfo() {
   if (stat_file != nullptr) {
     char temp_str[200];
     uint64_t temp_lu;
-    int retval = fscanf(
-        stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
-                   "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
-        temp_str, &system_tms_start_.tms_utime, &nice_time_start_,
-        &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_,
-        &softirq_start_, &steal_start_, &temp_lu, &temp_lu);
+    int retval =
+        fscanf(stat_file,
+               "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
+               "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
+               temp_str, &system_tms_start_.tms_utime, &nice_time_start_,
+               &system_tms_start_.tms_stime, &idle_start_, &iowait_start_,
+               &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu);
     if (retval != 11) {
       LOG(WARNING)
           << "Failed to read cpu utilization information at record beginning."
@@ -87,12 +88,13 @@ void CpuUtilization::RecordEndTimeInfo() {
   if (stat_file != nullptr) {
     char temp_str[200];
     uint64_t temp_lu;
-    int retval = fscanf(
-        stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
-                   "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
-        temp_str, &system_tms_end_.tms_utime, &nice_time_end_,
-        &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_,
-        &softirq_end_, &steal_end_, &temp_lu, &temp_lu);
+    int retval =
+        fscanf(stat_file,
+               "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
+               "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
+               temp_str, &system_tms_end_.tms_utime, &nice_time_end_,
+               &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_,
+               &softirq_end_, &steal_end_, &temp_lu, &temp_lu);
 
     if (retval != 11) {
       LOG(WARNING)
diff --git a/paddle/fluid/platform/profiler/cpu_utilization.h b/paddle/fluid/platform/profiler/cpu_utilization.h
index 7b05a6302cdb0..aa25ae5a43c10 100644
--- a/paddle/fluid/platform/profiler/cpu_utilization.h
+++ b/paddle/fluid/platform/profiler/cpu_utilization.h
@@ -15,8 +15,10 @@
 #pragma once
 
 #include <stdio.h>
+
 #include <cinttypes>
 #include <cstdint>
+
 #include "glog/logging.h"
 #ifdef _MSC_VER
 #include <windows.h>
diff --git a/paddle/fluid/platform/profiler/cuda_tracer.cc b/paddle/fluid/platform/profiler/cuda_tracer.cc
index 2d3e354dc271a..9e32f7bbf19ee 100644
--- a/paddle/fluid/platform/profiler/cuda_tracer.cc
+++ b/paddle/fluid/platform/profiler/cuda_tracer.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/profiler/cuda_tracer.h"
+
 #include <string>
 #include <unordered_map>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
 #include "paddle/fluid/platform/os_info.h"
diff --git a/paddle/fluid/platform/profiler/cuda_tracer.h b/paddle/fluid/platform/profiler/cuda_tracer.h
index 20a60521266a2..36c5ab4eb5546 100644
--- a/paddle/fluid/platform/profiler/cuda_tracer.h
+++ b/paddle/fluid/platform/profiler/cuda_tracer.h
@@ -17,6 +17,7 @@
 #include <cstdint>
 #include <mutex>
 #include <vector>
+
 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/profiler/tracer_base.h"
diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc
index da12dccb74924..7cb8b597dcdd0 100644
--- a/paddle/fluid/platform/profiler/cupti_data_process.cc
+++ b/paddle/fluid/platform/profiler/cupti_data_process.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/profiler/cupti_data_process.h"
+
 #include <cstdio>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/os_info.h"
 
diff --git a/paddle/fluid/platform/profiler/cupti_data_process.h b/paddle/fluid/platform/profiler/cupti_data_process.h
index 01b2e72ade4e2..7b80046473456 100644
--- a/paddle/fluid/platform/profiler/cupti_data_process.h
+++ b/paddle/fluid/platform/profiler/cupti_data_process.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <unordered_map>
+
 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/profiler/trace_event_collector.h"
 
diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
index de3411579d3e9..82363fcff6349 100644
--- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
+++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
@@ -9,7 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
+
 #include <cstring>
+
 #include "paddle/fluid/platform/profiler/extra_info.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc
index 73021f4362af5..b8afe2af0e776 100644
--- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc
+++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc
@@ -9,9 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "glog/logging.h"
-
 #include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
+
+#include "glog/logging.h"
 #include "paddle/fluid/platform/profiler/event_node.h"
 #include "paddle/fluid/platform/profiler/extra_info.h"
 #include "paddle/fluid/platform/profiler/utils.h"
diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
index d294bfee58c2b..5253ecc505dbb 100644
--- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
+++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
@@ -13,26 +13,25 @@
 // limitations under the License.
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
 #include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
 #include "paddle/fluid/platform/profiler/event_node.h"
 #include "paddle/fluid/platform/profiler/event_python.h"
 
-using paddle::platform::SerializationLogger;
-using paddle::platform::DeserializationReader;
-using paddle::platform::NodeTrees;
-using paddle::platform::HostTraceEventNode;
 using paddle::platform::CudaRuntimeTraceEventNode;
+using paddle::platform::DeserializationReader;
+using paddle::platform::DeviceTraceEvent;
 using paddle::platform::DeviceTraceEventNode;
 using paddle::platform::HostTraceEvent;
-using paddle::platform::RuntimeTraceEvent;
-using paddle::platform::DeviceTraceEvent;
-using paddle::platform::TracerEventType;
+using paddle::platform::HostTraceEventNode;
 using paddle::platform::KernelEventInfo;
 using paddle::platform::MemcpyEventInfo;
 using paddle::platform::MemsetEventInfo;
+using paddle::platform::NodeTrees;
 using paddle::platform::ProfilerResult;
+using paddle::platform::RuntimeTraceEvent;
+using paddle::platform::SerializationLogger;
+using paddle::platform::TracerEventType;
 
 TEST(SerializationLoggerTest, dump_case0) {
   std::list<HostTraceEvent> host_events;
diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc
index b909fb5f25aa7..e1af63ad8909c 100644
--- a/paddle/fluid/platform/profiler/event_node.cc
+++ b/paddle/fluid/platform/profiler/event_node.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler/event_node.h"
 
 #include <limits.h>
+
 #include <algorithm>
 #include <deque>
 #include <set>
diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc
index 5c42c8e8bf61e..abde62c6b1444 100644
--- a/paddle/fluid/platform/profiler/event_python.cc
+++ b/paddle/fluid/platform/profiler/event_python.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler/event_python.h"
+
 #include "paddle/fluid/platform/profiler/chrometracing_logger.h"
 #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
 #include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h
index fcaba9a43ca93..fd81c15f92ad7 100644
--- a/paddle/fluid/platform/profiler/event_tracing.h
+++ b/paddle/fluid/platform/profiler/event_tracing.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/profiler/trace_event.h"
 
@@ -70,10 +71,11 @@ class RecordEvent {
    * @param level: Used to filter events, works like glog VLOG(level).
    * RecordEvent will works if HostTraceLevel >= level.
    */
-  explicit RecordEvent(const char* name, const TracerEventType type =
-                                             TracerEventType::UserDefined,
-                       uint32_t level = kDefaultTraceLevel,
-                       const EventRole role = EventRole::kOrdinary);
+  explicit RecordEvent(
+      const char* name,
+      const TracerEventType type = TracerEventType::UserDefined,
+      uint32_t level = kDefaultTraceLevel,
+      const EventRole role = EventRole::kOrdinary);
 
   RecordEvent(const std::string& name, const std::string& attr,
               const TracerEventType type = TracerEventType::UserDefined,
diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h
index afd4135246556..1359c3b85a096 100644
--- a/paddle/fluid/platform/profiler/host_event_recorder.h
+++ b/paddle/fluid/platform/profiler/host_event_recorder.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <type_traits>
 #include <vector>
+
 #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/os_info.h"
@@ -58,7 +59,7 @@ class EventContainer {
  public:
   // Record an event
   template <typename... Args>
-  void Record(Args &&... args) {
+  void Record(Args &&...args) {
     DoRecord(ContainsStdString<Args...>(), std::forward<Args>(args)...);
   }
 
@@ -112,7 +113,7 @@ class EventContainer {
 
   // Record an event with string arguments
   template <typename... Args>
-  void DoRecord(std::true_type, Args &&... args) {
+  void DoRecord(std::true_type, Args &&...args) {
     auto *storage = GetEventStorage();
     std::function<void *(size_t)> allocator = [this](size_t size) {
       return GetStrBufFromArena(size);
@@ -122,7 +123,7 @@ class EventContainer {
 
   // Record an event without any string argument
   template <typename... Args>
-  void DoRecord(std::false_type, Args &&... args) {
+  void DoRecord(std::false_type, Args &&...args) {
     auto *storage = GetEventStorage();
     new (storage) EventType(std::forward<Args>(args)...);
   }
@@ -199,7 +200,7 @@ class ThreadEventRecorder {
  public:
   // Forward call to EventContainer::Record
   template <typename... Args>
-  void RecordEvent(Args &&... args) {
+  void RecordEvent(Args &&...args) {
     base_evt_cntr_.Record(std::forward<Args>(args)...);
   }
 
@@ -237,7 +238,7 @@ class HostEventRecorder {
   // Do your best to avoid using 'std::string' as the argument type.
   // It will cause deep-copy to harm performance.
   template <typename... Args>
-  void RecordEvent(Args &&... args) {
+  void RecordEvent(Args &&...args) {
     GetThreadLocalRecorder()->RecordEvent(std::forward<Args>(args)...);
   }
 
diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc
index b7eb53331b793..8a36a3a8bab44 100644
--- a/paddle/fluid/platform/profiler/host_tracer.cc
+++ b/paddle/fluid/platform/profiler/host_tracer.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/profiler/host_tracer.h"
+
 #include "glog/logging.h"
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/profiler/common_event.h"
diff --git a/paddle/fluid/platform/profiler/mlu/CMakeLists.txt b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt
index 01b3757ea6912..d510edb0457db 100644
--- a/paddle/fluid/platform/profiler/mlu/CMakeLists.txt
+++ b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt
@@ -2,4 +2,7 @@ if(WITH_MLU)
   set(MLU_INFO mlu_info)
 endif()
 
-cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO})
+cc_library(
+  mlu_tracer
+  SRCS mlu_tracer.cc cnpapi_data_process.cc
+  DEPS workqueue_utils enforce glog ${MLU_INFO})
diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
index 36abf77279d06..7afdb5eb2a352 100644
--- a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
+++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
+
 #include <cstdio>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/os_info.h"
 
diff --git a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc
index 2d719a8bbfdcb..bbaafa3faa60a 100644
--- a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc
+++ b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
+
 #include <string>
 #include <unordered_map>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
 #include "paddle/fluid/platform/os_info.h"
diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc
index a417eda1509e5..8bcf856c01ab6 100644
--- a/paddle/fluid/platform/profiler/profiler.cc
+++ b/paddle/fluid/platform/profiler/profiler.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/profiler/profiler.h"
+
 #include "glog/logging.h"
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h
index ea346a4fb748d..65a3bcc02d857 100644
--- a/paddle/fluid/platform/profiler/profiler.h
+++ b/paddle/fluid/platform/profiler/profiler.h
@@ -20,6 +20,7 @@
 #include <functional>
 #include <list>
 #include <memory>
+
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/profiler/cpu_utilization.h"
 #include "paddle/fluid/platform/profiler/event_node.h"
diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc
index f2c867ffff217..1f1fbcb71ecd5 100644
--- a/paddle/fluid/platform/profiler/profiler_test.cc
+++ b/paddle/fluid/platform/profiler/profiler_test.cc
@@ -14,6 +14,7 @@
 
 #include <set>
 #include <string>
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #ifdef PADDLE_WITH_CUDA
@@ -27,11 +28,11 @@
 #include "paddle/fluid/platform/profiler/profiler.h"
 
 TEST(ProfilerTest, TestHostTracer) {
-  using paddle::platform::ProfilerOptions;
   using paddle::platform::Profiler;
+  using paddle::platform::ProfilerOptions;
+  using paddle::platform::ProfilerResult;
   using paddle::platform::RecordInstantEvent;
   using paddle::platform::TracerEventType;
-  using paddle::platform::ProfilerResult;
   ProfilerOptions options;
   options.trace_level = 2;
   options.trace_switch = 3;
@@ -58,8 +59,8 @@ TEST(ProfilerTest, TestHostTracer) {
 }
 
 TEST(ProfilerTest, TestCudaTracer) {
-  using paddle::platform::ProfilerOptions;
   using paddle::platform::Profiler;
+  using paddle::platform::ProfilerOptions;
   using paddle::platform::ProfilerResult;
   ProfilerOptions options;
   options.trace_level = 0;
diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc
index b8d1306ad076c..23ad917b57d0e 100644
--- a/paddle/fluid/platform/profiler/test_event_node.cc
+++ b/paddle/fluid/platform/profiler/test_event_node.cc
@@ -13,22 +13,21 @@
 // limitations under the License.
 
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/platform/profiler/chrometracing_logger.h"
 #include "paddle/fluid/platform/profiler/event_node.h"
 
 using paddle::platform::ChromeTracingLogger;
-using paddle::platform::NodeTrees;
-using paddle::platform::HostTraceEventNode;
 using paddle::platform::CudaRuntimeTraceEventNode;
+using paddle::platform::DeviceTraceEvent;
 using paddle::platform::DeviceTraceEventNode;
 using paddle::platform::HostTraceEvent;
-using paddle::platform::RuntimeTraceEvent;
-using paddle::platform::DeviceTraceEvent;
-using paddle::platform::TracerEventType;
+using paddle::platform::HostTraceEventNode;
 using paddle::platform::KernelEventInfo;
 using paddle::platform::MemcpyEventInfo;
 using paddle::platform::MemsetEventInfo;
+using paddle::platform::NodeTrees;
+using paddle::platform::RuntimeTraceEvent;
+using paddle::platform::TracerEventType;
 TEST(NodeTreesTest, LogMe_case0) {
   std::list<HostTraceEvent> host_events;
   std::list<RuntimeTraceEvent> runtime_events;
@@ -194,8 +193,10 @@ TEST(NodeTreesTest, HandleTrees_case0) {
   }
   std::function<void(HostTraceEventNode*)> host_event_node_handle(
       [&](HostTraceEventNode* a) { logger.LogHostTraceEventNode(*a); });
-  std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle([&](
-      CudaRuntimeTraceEventNode* a) { logger.LogRuntimeTraceEventNode(*a); });
+  std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle(
+      [&](CudaRuntimeTraceEventNode* a) {
+        logger.LogRuntimeTraceEventNode(*a);
+      });
   std::function<void(DeviceTraceEventNode*)> device_event_node_handle(
       [&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); });
   tree.HandleTrees(host_event_node_handle, runtime_event_node_handle,
diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h
index 5f2bc9dc90db9..d1593bc1bfcd7 100644
--- a/paddle/fluid/platform/profiler/trace_event_collector.h
+++ b/paddle/fluid/platform/profiler/trace_event_collector.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <list>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/platform/profiler/trace_event.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h
index 06d1636c4617c..433fd0b825a11 100644
--- a/paddle/fluid/platform/profiler/utils.h
+++ b/paddle/fluid/platform/profiler/utils.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <ctime>
 #include <string>
+
 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/os_info.h"
@@ -26,8 +27,9 @@ template <typename... Args>
 std::string string_format(const std::string& format, Args... args) {
   int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) +
                1;  // Extra space for '\0'
-  PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal(
-                                   "Error during profiler data formatting."));
+  PADDLE_ENFORCE_GE(
+      size_s, 0,
+      platform::errors::Fatal("Error during profiler data formatting."));
   auto size = static_cast<size_t>(size_s);
   auto buf = std::make_unique<char[]>(size);
   std::snprintf(buf.get(), size, format.c_str(), args...);
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index f64e05504aa3f..ae856044f8fc5 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -390,8 +390,8 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
             index++;
           }
           if (split_pos == -1 && !main_thread_event_name.count(rit->name())) {
-            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
-                         rit->name();
+            event_name = "thread" + std::to_string(rit->thread_id()) +
+                         "::" + rit->name();
           } else {
             if (!main_thread_event_name.count(rit->name())) {
               event_name =
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index e9f84a49246f7..18d4b4dc83478 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -36,24 +36,24 @@ TEST(Event, CpuElapsedTime) {
 
 TEST(RecordEvent, RecordEvent) {
   using paddle::platform::Event;
+  using paddle::platform::EventRole;
+  using paddle::platform::EventSortingKey;
   using paddle::platform::EventType;
-  using paddle::platform::RecordEvent;
-  using paddle::platform::PushEvent;
   using paddle::platform::PopEvent;
   using paddle::platform::ProfilerState;
-  using paddle::platform::EventSortingKey;
-  using paddle::platform::EventRole;
+  using paddle::platform::PushEvent;
+  using paddle::platform::RecordEvent;
 
   ProfilerState state = ProfilerState::kCPU;
   EnableProfiler(state);
 
   /* Usage 1:
-  *  PushEvent(evt_name);
-  *  ...
-  *  code to be analyzed
-  *  ...
-  * PopEvent(evt_name);
-  */
+   *  PushEvent(evt_name);
+   *  ...
+   *  code to be analyzed
+   *  ...
+   * PopEvent(evt_name);
+   */
   LOG(INFO) << "Usage 1: PushEvent & PopEvent";
   for (int loop = 0; loop < 3; ++loop) {
     for (int i = 1; i < 5; ++i) {
diff --git a/paddle/fluid/platform/resource_pool.h b/paddle/fluid/platform/resource_pool.h
index f01d006d5b273..737001a50abbf 100644
--- a/paddle/fluid/platform/resource_pool.h
+++ b/paddle/fluid/platform/resource_pool.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt
index 6a825e9077c0a..25d2874ca04d2 100644
--- a/paddle/fluid/platform/stream/CMakeLists.txt
+++ b/paddle/fluid/platform/stream/CMakeLists.txt
@@ -1,3 +1,6 @@
-IF(WITH_GPU OR WITH_ROCM)
-    cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost eigen3 ${MKLDNN_CTX_DEPS})
-ENDIF()
+if(WITH_GPU OR WITH_ROCM)
+  cc_library(
+    cuda_stream
+    SRCS cuda_stream.cc
+    DEPS enforce boost eigen3 ${MKLDNN_CTX_DEPS})
+endif()
diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc
index e3e735d03aba1..d7f60e4019d2e 100644
--- a/paddle/fluid/platform/stream/cuda_stream.cc
+++ b/paddle/fluid/platform/stream/cuda_stream.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/stream/cuda_stream.h"
+
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index 6fa326d57bc67..bb9a405798b63 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/stream_callback_manager.h"
+
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
index 6f714a677033b..32c759d01026c 100644
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
+
 #include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
 #endif
 
diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
index 2e7b8b402f69a..1caa2e8770772 100644
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
@@ -38,10 +39,10 @@ class Multiply {
 using paddle::memory::Alloc;
 using paddle::memory::Copy;
 
-using paddle::platform::CPUPlace;
-using paddle::platform::CUDAPlace;
 using paddle::platform::CPUDeviceContext;
+using paddle::platform::CPUPlace;
 using paddle::platform::CUDADeviceContext;
+using paddle::platform::CUDAPlace;
 
 using paddle::platform::Transform;
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 90a86aaf31f26..bf74d1184322c 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,46 +1,82 @@
-set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper metrics prune
-  feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
-  analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
-  gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator
-  cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store new_profiler)
-
-if (WITH_PSCORE)
+set(PYBIND_DEPS
+    init
+    pybind
+    python
+    proto_desc
+    memory
+    executor
+    fleet_wrapper
+    box_wrapper
+    metrics
+    prune
+    feed_fetch_method
+    pass
+    generate_pass
+    pass_builder
+    parallel_executor
+    profiler
+    layer
+    tracer
+    engine
+    scope_pool
+    analysis_predictor
+    imperative_profiler
+    imperative_flag
+    save_load_util
+    dlpack_tensor
+    device_context
+    gloo_wrapper
+    infer_io_utils
+    heter_wrapper
+    generator
+    op_version_registry
+    ps_gpu_wrapper
+    custom_operator
+    cost_model
+    cuda_graph_with_memory_pool
+    fleet_executor
+    global_utils
+    phi_utils
+    tcp_store
+    new_profiler)
+
+if(WITH_PSCORE)
   set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
   set(PYBIND_DEPS ${PYBIND_DEPS} graph_py_service)
-  if (WITH_HETERPS)
+  if(WITH_HETERPS)
     set(PYBIND_DEPS ${PYBIND_DEPS} graph_gpu_wrapper)
   endif()
 endif()
-if (WITH_GPU OR WITH_ROCM)
+if(WITH_GPU OR WITH_ROCM)
   set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
   set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
 endif()
 
-if (WITH_GPU)
+if(WITH_GPU)
   set(PYBIND_DEPS ${PYBIND_DEPS} cuda_profiler)
 endif()
-if (WITH_IPU)
+if(WITH_IPU)
   set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info)
 endif()
 
-if (WITH_NCCL OR WITH_RCCL)
+if(WITH_NCCL OR WITH_RCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
 endif()
 
-if (WITH_XPU_BKCL)
+if(WITH_XPU_BKCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
   set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context)
   set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context)
 endif()
 
-if (WITH_ASCEND_CL)
+if(WITH_ASCEND_CL)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
   set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context)
   set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context)
 endif()
 
-if (WITH_CNCL)
+if(WITH_CNCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
   set(PYBIND_DEPS ${PYBIND_DEPS} cncl_context)
 endif()
@@ -48,10 +84,10 @@ endif()
 if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
   set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
-  if (WITH_GPU)
+  if(WITH_GPU)
     set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator)
   endif()
-  if (WITH_NCCL OR WITH_RCCL)
+  if(WITH_NCCL OR WITH_RCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
     set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context)
   endif()
@@ -63,45 +99,45 @@ if(WITH_PYTHON)
 endif()
 
 set(PYBIND_SRCS
-  pybind.cc
-  exception.cc
-  protobuf.cc
-  const_value.cc
-  global_value_getter_setter.cc
-  reader_py.cc
-  fleet_wrapper_py.cc
-  heter_wrapper_py.cc
-  ps_gpu_wrapper_py.cc
-  gloo_wrapper_py.cc
-  box_helper_py.cc
-  metrics_py.cc
-  data_set_py.cc
-  imperative.cc
-  ir.cc
-  bind_cost_model.cc
-  bind_fleet_executor.cc
-  inference_api.cc
-  compatible.cc
-  io.cc
-  generator_py.cc
-  communication.cc
-  cuda_streams_py.cc)
+    pybind.cc
+    exception.cc
+    protobuf.cc
+    const_value.cc
+    global_value_getter_setter.cc
+    reader_py.cc
+    fleet_wrapper_py.cc
+    heter_wrapper_py.cc
+    ps_gpu_wrapper_py.cc
+    gloo_wrapper_py.cc
+    box_helper_py.cc
+    metrics_py.cc
+    data_set_py.cc
+    imperative.cc
+    ir.cc
+    bind_cost_model.cc
+    bind_fleet_executor.cc
+    inference_api.cc
+    compatible.cc
+    io.cc
+    generator_py.cc
+    communication.cc
+    cuda_streams_py.cc)
 
 if(NOT ON_INFER)
-  set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
-  if (WITH_NCCL)
-    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
-    if (WITH_PSCORE)
-      set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter)
+  set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
+  if(WITH_NCCL)
+    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
+    if(WITH_PSCORE)
+      set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter)
     endif()
   endif()
-  if (WITH_GLOO)
-    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
+  if(WITH_GLOO)
+    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
   endif()
   if(WITH_ASCEND_CL)
-    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl)
-    if (WITH_PSCORE)
-      set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter)
+    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl)
+    if(WITH_PSCORE)
+      set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter)
     endif()
   endif()
   set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc)
@@ -119,45 +155,69 @@ if(WITH_GLOO)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
 endif(WITH_GLOO)
 
-if (WITH_CRYPTO)
+if(WITH_CRYPTO)
   set(PYBIND_DEPS ${PYBIND_DEPS} paddle_crypto)
   set(PYBIND_SRCS ${PYBIND_SRCS} crypto.cc)
-endif (WITH_CRYPTO)
-
-if (WITH_PSLIB)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-      set(DISTRIBUTE_COMPILE_FLAGS
-              "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+endif(WITH_CRYPTO)
+
+if(WITH_PSLIB)
+  set(DISTRIBUTE_COMPILE_FLAGS
+      "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result"
+  )
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+    set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
   endif()
-  set_source_files_properties(heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 endif(WITH_PSLIB)
-if (WITH_PSCORE)
-  if (WITH_ARM_BRPC)
-    set(DISTRIBUTE_COMPILE_FLAGS "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result")
+if(WITH_PSCORE)
+  if(WITH_ARM_BRPC)
+    set(DISTRIBUTE_COMPILE_FLAGS
+        "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result"
+    )
   else()
-    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result")
+    set(DISTRIBUTE_COMPILE_FLAGS
+        "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result"
+    )
   endif()
-  set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(
+    fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   list(APPEND PYBIND_DEPS fleet communicator index_wrapper index_sampler)
   list(APPEND PYBIND_SRCS fleet_py.cc)
 endif()
 
-if (WITH_NCCL OR WITH_RCCL)
+if(WITH_NCCL OR WITH_RCCL)
   list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
 endif()
 
 if(WITH_PYTHON)
   # generate op pybind functions automatically for dygraph.
-  if (WITH_ASCEND_CL)
-    set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag ascend_wrapper)
+  if(WITH_ASCEND_CL)
+    set(OP_FUNCTION_GENERETOR_DEPS
+        pybind
+        proto_desc
+        executor
+        layer
+        tracer
+        engine
+        imperative_profiler
+        imperative_flag
+        ascend_wrapper)
   else()
-    set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
+    set(OP_FUNCTION_GENERETOR_DEPS
+        pybind
+        proto_desc
+        executor
+        layer
+        tracer
+        engine
+        imperative_profiler
+        imperative_flag)
   endif()
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
 
-  if (WITH_NCCL OR WITH_RCCL)
+  if(WITH_NCCL OR WITH_RCCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
   endif()
 
@@ -176,13 +236,15 @@ if(WITH_PYTHON)
   add_executable(op_function_generator op_function_generator.cc)
   target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
   add_executable(eager_op_function_generator eager_op_function_generator.cc)
-  target_link_libraries(eager_op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
+  target_link_libraries(eager_op_function_generator
+                        ${OP_FUNCTION_GENERETOR_DEPS})
   if(NOT WIN32)
     add_executable(kernel_signature_generator kernel_signature_generator.cc)
-    target_link_libraries(kernel_signature_generator ${OP_FUNCTION_GENERETOR_DEPS})
+    target_link_libraries(kernel_signature_generator
+                          ${OP_FUNCTION_GENERETOR_DEPS})
   endif()
 
-  get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(op_function_generator ${os_dependency_modules})
   target_link_libraries(eager_op_function_generator ${os_dependency_modules})
   if(WITH_ROCM)
@@ -193,11 +255,13 @@ if(WITH_PYTHON)
 
   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)
-  set(eager_impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h)
+  set(eager_impl_file
+      ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h)
   set(tmp_eager_impl_file ${eager_impl_file}.tmp)
 
   set(OP_IMPL_DEPS op_function_generator)
-  set(EAGER_OP_IMPL_DEPS eager_op_function_generator eager_final_state_python_c_codegen)
+  set(EAGER_OP_IMPL_DEPS eager_op_function_generator
+                         eager_final_state_python_c_codegen)
 
   if(WIN32)
     if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
@@ -206,81 +270,103 @@ if(WITH_PYTHON)
       set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
     endif()
 
-    file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat ""
-    "set build_times=1\n"
-    ":retry\n"
-    "ECHO op_function_generator run %build_times% time\n"
-    "taskkill /f /im op_function_generator.exe 2>NUL\n"
-    "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n"
-    "if %ERRORLEVEL% NEQ 0 (\n"
-    "    set /a build_times=%build_times%+1\n"
-    "    if %build_times% GEQ 10 (\n"
-    "        exit /b 1\n"
-    "    ) else (\n"
-    "        goto :retry\n"
-    "    )\n"
-    ")\n"
-    "exit /b 0")
-
-    file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat ""
-    "set build_times=1\n"
-    ":retry\n"
-    "ECHO eager_op_function_generator run %build_times% time\n"
-    "taskkill /f /im eager_op_function_generator.exe 2>NUL\n"
-    "${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n"
-    "if %ERRORLEVEL% NEQ 0 (\n"
-    "    set /a build_times=%build_times%+1\n"
-    "    if %build_times% GEQ 10 (\n"
-    "        exit /b 1\n"
-    "    ) else (\n"
-    "        goto :retry\n"
-    "    )\n"
-    ")\n"
-    "exit /b 0")
+    file(
+      WRITE
+      ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
+      ""
+      "set build_times=1\n"
+      ":retry\n"
+      "ECHO op_function_generator run %build_times% time\n"
+      "taskkill /f /im op_function_generator.exe 2>NUL\n"
+      "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n"
+      "if %ERRORLEVEL% NEQ 0 (\n"
+      "    set /a build_times=%build_times%+1\n"
+      "    if %build_times% GEQ 10 (\n"
+      "        exit /b 1\n"
+      "    ) else (\n"
+      "        goto :retry\n"
+      "    )\n"
+      ")\n"
+      "exit /b 0")
+
+    file(
+      WRITE
+      ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat
+      ""
+      "set build_times=1\n"
+      ":retry\n"
+      "ECHO eager_op_function_generator run %build_times% time\n"
+      "taskkill /f /im eager_op_function_generator.exe 2>NUL\n"
+      "${op_impl_path}/eager_op_function_generator.exe ${tmp_eager_impl_file}\n"
+      "if %ERRORLEVEL% NEQ 0 (\n"
+      "    set /a build_times=%build_times%+1\n"
+      "    if %build_times% GEQ 10 (\n"
+      "        exit /b 1\n"
+      "    ) else (\n"
+      "        goto :retry\n"
+      "    )\n"
+      ")\n"
+      "exit /b 0")
 
     if(${CBLAS_PROVIDER} STREQUAL MKLML)
-      ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/libiomp5md.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_impl_path}
+      add_custom_command(
+        OUTPUT ${op_impl_path}/libiomp5md.dll
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB}
+                ${op_impl_path}
         DEPENDS mklml)
       list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll)
       list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll)
     else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
-      ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/openblas.dll
+      add_custom_command(
+        OUTPUT ${op_impl_path}/openblas.dll
         COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path}
         DEPENDS extern_openblas)
       list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll)
       list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/openblas.dll)
     endif()
     if(WITH_MKLDNN)
-      ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/mkldnn.dll
+      add_custom_command(
+        OUTPUT ${op_impl_path}/mkldnn.dll
         COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_impl_path}
         DEPENDS mkldnn)
-        list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
-        list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
+      list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
+      list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll)
     endif()
     if(WITH_ONNXRUNTIME)
-      ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR}
+      add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll
+        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB}
+                ${CMAKE_CURRENT_BINARY_DIR}
         DEPENDS paddle2onnx)
       list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll)
-      list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll)
+      list(APPEND EAGER_OP_IMPL_DEPS
+           ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll)
 
-      ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll
-        COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} 
+      add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll
+        COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB}
+                ${CMAKE_CURRENT_BINARY_DIR}
         DEPENDS onnxruntime)
       list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll)
-      list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll)
+      list(APPEND EAGER_OP_IMPL_DEPS
+           ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime.dll)
     endif()
 
-    add_custom_command(OUTPUT ${impl_file}
-      COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
+    add_custom_command(
+      OUTPUT ${impl_file}
+      COMMAND
+        ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file}
+              ${impl_file}
       COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
       DEPENDS ${OP_IMPL_DEPS})
     if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-      add_custom_command(OUTPUT ${eager_impl_file}
-        COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat
-        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file}
+      add_custom_command(
+        OUTPUT ${eager_impl_file}
+        COMMAND
+          ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file}
+                ${eager_impl_file}
         COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}"
         DEPENDS ${EAGER_OP_IMPL_DEPS})
     endif()
@@ -290,79 +376,120 @@ if(WITH_PYTHON)
     # LD_LIBRARY_PATH. This is different with Windows platformm, which search
     # *.dll in current directory automatically.
     if(WITH_ONNXRUNTIME)
-      if (APPLE)
-        set(PADDLE2ONNX_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib)
-        set(ONNXRUNTIME_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.dylib)
+      if(APPLE)
+        set(PADDLE2ONNX_PYBIND_OUT
+            ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.dylib)
+        set(ONNXRUNTIME_PYBIND_OUT
+            ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.dylib)
       else()
-        set(PADDLE2ONNX_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so)
-        set(ONNXRUNTIME_PYBIND_OUT ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so)
+        set(PADDLE2ONNX_PYBIND_OUT
+            ${CMAKE_CURRENT_BINARY_DIR}/libpaddle2onnx.so)
+        set(ONNXRUNTIME_PYBIND_OUT
+            ${CMAKE_CURRENT_BINARY_DIR}/libonnxruntime.so)
       endif()
 
-      ADD_CUSTOM_COMMAND(OUTPUT ${PADDLE2ONNX_PYBIND_OUT}
-        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} ${CMAKE_CURRENT_BINARY_DIR}
+      add_custom_command(
+        OUTPUT ${PADDLE2ONNX_PYBIND_OUT}
+        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB}
+                ${CMAKE_CURRENT_BINARY_DIR}
         DEPENDS paddle2onnx)
       list(APPEND OP_IMPL_DEPS ${PADDLE2ONNX_PYBIND_OUT})
       list(APPEND EAGER_OP_IMPL_DEPS ${PADDLE2ONNX_PYBIND_OUT})
 
-      ADD_CUSTOM_COMMAND(OUTPUT ${ONNXRUNTIME_PYBIND_OUT}
-        COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_LIB} ${CMAKE_CURRENT_BINARY_DIR} 
+      add_custom_command(
+        OUTPUT ${ONNXRUNTIME_PYBIND_OUT}
+        COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_LIB}
+                ${CMAKE_CURRENT_BINARY_DIR}
         DEPENDS onnxruntime)
       list(APPEND OP_IMPL_DEPS ${ONNXRUNTIME_PYBIND_OUT})
       list(APPEND EAGER_OP_IMPL_DEPS ${ONNXRUNTIME_PYBIND_OUT})
     endif()
 
     if(WITH_MKLML)
-      ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR}
+      add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB}
+                ${CMAKE_CURRENT_BINARY_DIR}
         DEPENDS mklml)
       list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so)
       list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so)
     endif()
     if(WITH_MKLDNN)
-      ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0
-        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR}
+      add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB}
+                ${CMAKE_CURRENT_BINARY_DIR}
         DEPENDS mkldnn)
       list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
       list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0)
     endif()
-    add_custom_command(OUTPUT ${impl_file}
-          COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
-              "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator"
-              "${tmp_impl_file}"
-          COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file}
-          COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
-          DEPENDS ${OP_IMPL_DEPS}
-          VERBATIM)
+    add_custom_command(
+      OUTPUT ${impl_file}
+      COMMAND
+        ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
+        "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}"
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file}
+              ${impl_file}
+      COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}"
+      DEPENDS ${OP_IMPL_DEPS}
+      VERBATIM)
     if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-      add_custom_command(OUTPUT ${eager_impl_file}
-            COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
-                "${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator"
-                "${tmp_eager_impl_file}"
-            COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file}
-            COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}"
-            DEPENDS ${EAGER_OP_IMPL_DEPS}
-            VERBATIM)
-      endif()
+      add_custom_command(
+        OUTPUT ${eager_impl_file}
+        COMMAND
+          ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:."
+          "${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator"
+          "${tmp_eager_impl_file}"
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file}
+                ${eager_impl_file}
+        COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}"
+        DEPENDS ${EAGER_OP_IMPL_DEPS}
+        VERBATIM)
+    endif()
   endif(WIN32)
   add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file})
   if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file})
+    add_custom_target(eager_op_function_generator_cmd ALL
+                      DEPENDS ${eager_impl_file})
   endif()
 
-  list(APPEND PYBIND_DEPS interpretercore standalone_executor staticgraph_executor_statistics)
-  cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS})
+  list(APPEND PYBIND_DEPS interpretercore standalone_executor
+       staticgraph_executor_statistics)
+  cc_library(
+    op_function_common
+    SRCS op_function_common.cc
+    DEPS ${PYBIND_DEPS})
   list(APPEND PYBIND_DEPS op_function_common)
 
   if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
-    cc_library(paddle_eager
-    SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc eager_py_layer.cc
-    DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node py_layer_node global_utils utils python custom_operator custom_operator_node)
+    cc_library(
+      paddle_eager
+      SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc
+           eager_utils.cc eager_py_layer.cc
+      DEPS eager_api
+           autograd_meta
+           backward
+           grad_node_info
+           phi
+           op_function_common
+           final_dygraph_function
+           final_dygraph_node
+           dygraph_function
+           dygraph_node
+           accumulation_node
+           py_layer_node
+           global_utils
+           utils
+           python
+           custom_operator
+           custom_operator_node)
     add_dependencies(paddle_eager eager_codegen)
     add_dependencies(paddle_eager eager_op_function_generator_cmd)
     list(APPEND PYBIND_DEPS paddle_eager)
   endif()
 
-  cc_library(paddle_pybind SHARED
+  cc_library(
+    paddle_pybind SHARED
     SRCS ${PYBIND_SRCS}
     DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB})
 
@@ -374,7 +501,7 @@ if(WITH_PYTHON)
     target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
   endif()
 
-  get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
   add_dependencies(paddle_pybind op_function_generator_cmd)
 endif(WITH_PYTHON)
diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc
index fdf3a12a81fb2..8c1eb2c1b9003 100644
--- a/paddle/fluid/pybind/ascend_wrapper_py.cc
+++ b/paddle/fluid/pybind/ascend_wrapper_py.cc
@@ -26,11 +26,13 @@ limitations under the License. */
 #include <ge/ge_api.h>
 #include <graph/attr_value.h>
 #include <graph/operator_factory.h>
+
 #include <map>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/fleet/ascend_wrapper.h"
 #include "paddle/fluid/platform/device/npu/ascend_npu_info.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -78,8 +80,9 @@ ge::Status ge_initialize(
   py::gil_scoped_release release;
   auto init_options = convert_map(options);
   ge::Status res = ge::GEInitialize(init_options);
-  PADDLE_ENFORCE_EQ(res, ge::SUCCESS, platform::errors::Fatal(
-                                          "ge initialize not success:%d", res));
+  PADDLE_ENFORCE_EQ(
+      res, ge::SUCCESS,
+      platform::errors::Fatal("ge initialize not success:%d", res));
   py::gil_scoped_acquire acquire;
   return res;
 }
@@ -253,7 +256,7 @@ void BindAscendGraph(py::module *m) {
         return std::unique_ptr<ge::Session>(
             new ge::Session(convert_map(options)));
       }))
-      .def("add_graph", (ge::Status (Session::*)(uint32_t, const Graph &)) &
+      .def("add_graph", (ge::Status(Session::*)(uint32_t, const Graph &)) &
                             Session::AddGraph)
       .def("add_graph",
            [](Session &ss, uint32_t index, const Graph &graph,
@@ -261,14 +264,15 @@ void BindAscendGraph(py::module *m) {
              return ss.AddGraph(index, graph, convert_map(options));
            })
       .def("remove_graph", &Session::RemoveGraph)
-      .def("run_graph",
-           [](Session &ss, uint32_t graphId,
-              const std::vector<Tensor> &inputs) -> py::tuple {
-             std::vector<Tensor> outputs;
-             ge::Status res = ss.RunGraph(graphId, inputs, outputs);
-             return py::make_tuple(outputs, res);
-           },
-           py::call_guard<py::gil_scoped_release>())
+      .def(
+          "run_graph",
+          [](Session &ss, uint32_t graphId,
+             const std::vector<Tensor> &inputs) -> py::tuple {
+            std::vector<Tensor> outputs;
+            ge::Status res = ss.RunGraph(graphId, inputs, outputs);
+            return py::make_tuple(outputs, res);
+          },
+          py::call_guard<py::gil_scoped_release>())
       .def("build_graph", &Session::BuildGraph)
       .def("run_graph_async", &Session::RunGraphAsync)
 #ifdef PADDLE_WITH_ASCEND_STRING
@@ -385,7 +389,7 @@ void BindAscendGraph(py::module *m) {
            })
 #ifdef PADDLE_WITH_ASCEND_STRING
       .def("get_input_desc",
-           (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc)
+           (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetInputDesc)
       .def("get_input_desc",
            [](Operator &op, const std::string &name) {
              return op.GetInputDescByName(name.c_str());
@@ -420,7 +424,7 @@ void BindAscendGraph(py::module *m) {
              return op.GetOutputDescByName(name.c_str());
            })
       .def("get_output_desc",
-           (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc)
+           (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetOutputDesc)
       .def("update_output_desc",
            static_cast<ge::graphStatus (ge::Operator::*)(  // NOLINT
                const char *, const TensorDesc &)>(&Operator::UpdateOutputDesc))
@@ -779,19 +783,18 @@ void BindAscendGraph(py::module *m) {
       .def("get_tensor_desc", &Tensor::GetTensorDesc)
       // .def("set_data", (graphStatus(Tensor::*)(std::vector<uint8_t> &&)) &
       // Tensor::SetData)
-      .def("set_data", (graphStatus (Tensor::*)(const std::vector<uint8_t> &)) &
+      .def("set_data", (graphStatus(Tensor::*)(const std::vector<uint8_t> &)) &
                            Tensor::SetData)
       .def("set_data",
-           (graphStatus (Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData)
+           (graphStatus(Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData)
 #ifdef PADDLE_WITH_ASCEND_STRING
-      .def("set_data",
-           (graphStatus (Tensor::*)(const char *)) & Tensor::SetData)
+      .def("set_data", (graphStatus(Tensor::*)(const char *)) & Tensor::SetData)
 #else
       .def("set_data",
            (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData)
 #endif
       .def("set_data",
-           (graphStatus (Tensor::*)(const std::vector<AscendString> &)) &
+           (graphStatus(Tensor::*)(const std::vector<AscendString> &)) &
                Tensor::SetData)
 
       .def("get_data",
@@ -813,8 +816,9 @@ void BindAscendGraph(py::module *m) {
       .def(py::init<Shape, Format, DataType>(), py::arg("shape"),
            py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT)
       .def(py::init<const TensorDesc &>())
-      .def("update", (void (TensorDesc::*)(const Shape &, Format, DataType)) &
-                         TensorDesc::Update,
+      .def("update",
+           (void(TensorDesc::*)(const Shape &, Format, DataType)) &
+               TensorDesc::Update,
            py::arg("shape"), py::arg("format") = FORMAT_ND,
            py::arg("dt") = DT_FLOAT)
       .def("set_shape", &TensorDesc::SetShape)
diff --git a/paddle/fluid/pybind/bind_cost_model.cc b/paddle/fluid/pybind/bind_cost_model.cc
index a4a40f1fd02c9..ef2fe0dd3d446 100644
--- a/paddle/fluid/pybind/bind_cost_model.cc
+++ b/paddle/fluid/pybind/bind_cost_model.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/pybind/bind_cost_model.h"
 
 #include <pybind11/stl.h>
+
 #include "paddle/fluid/framework/ir/cost_model.h"
 #include "paddle/fluid/framework/program_desc.h"
 
diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc
index 8491d1e224930..6bd032037443e 100644
--- a/paddle/fluid/pybind/bind_fleet_executor.cc
+++ b/paddle/fluid/pybind/bind_fleet_executor.cc
@@ -13,10 +13,13 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/bind_fleet_executor.h"
+
 #include <pybind11/numpy.h>
 #include <pybind11/stl.h>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/distributed/fleet_executor/dist_model.h"
 #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h"
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
@@ -62,13 +65,13 @@ struct npy_format_descriptor<paddle::platform::float16> {
 namespace paddle {
 namespace pybind {
 
-using paddle::distributed::FleetExecutor;
-using paddle::distributed::TaskNode;
-using paddle::distributed::DistModelConfig;
 using paddle::distributed::DistModel;
+using paddle::distributed::DistModelConfig;
 using paddle::distributed::DistModelDataBuf;
-using paddle::distributed::DistModelTensor;
 using paddle::distributed::DistModelDataType;
+using paddle::distributed::DistModelTensor;
+using paddle::distributed::FleetExecutor;
+using paddle::distributed::TaskNode;
 using paddle::framework::OpDesc;
 using paddle::framework::ProgramDesc;
 
@@ -217,33 +220,34 @@ void BindFleetExecutor(py::module* m) {
       .def("reset", &DistModelDataBufReset<float>)
       .def("reset", &DistModelDataBufReset<paddle::platform::float16>)
       .def("length", &DistModelDataBuf::length)
-      .def("tolist", [](DistModelDataBuf& self,
-                        const std::string& dtype) -> py::list {
-        py::list l;
-        if (dtype == "int32") {
-          auto* data = static_cast<int32_t*>(self.data());
-          auto size = self.length() / sizeof(int32_t);
-          l = py::cast(std::vector<int32_t>(data, data + size));
-        } else if (dtype == "int64") {
-          auto* data = static_cast<int64_t*>(self.data());
-          auto size = self.length() / sizeof(int64_t);
-          l = py::cast(std::vector<int64_t>(data, data + size));
-        } else if (dtype == "float32") {
-          auto* data = static_cast<float*>(self.data());
-          auto size = self.length() / sizeof(float);
-          l = py::cast(std::vector<float>(data, data + size));
-        } else if (dtype == "float16") {
-          auto* data = static_cast<paddle::platform::float16*>(self.data());
-          auto size = self.length() / sizeof(paddle::platform::float16);
-          l = py::cast(
-              std::vector<paddle::platform::float16>(data, data + size));
-        } else {
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "Unsupported data type. Now only supports INT32, INT64, "
-              "FLOAT16 and FLOAT32."));
-        }
-        return l;
-      });
+      .def("tolist",  // typed copy of the buffer as a Python list
+           [](DistModelDataBuf& self, const std::string& dtype) -> py::list {
+             py::list l;  // assigned by whichever dtype branch matches
+             if (dtype == "int32") {
+               auto* data = static_cast<int32_t*>(self.data());
+               auto size = self.length() / sizeof(int32_t);  // element count
+               l = py::cast(std::vector<int32_t>(data, data + size));
+             } else if (dtype == "int64") {
+               auto* data = static_cast<int64_t*>(self.data());
+               auto size = self.length() / sizeof(int64_t);
+               l = py::cast(std::vector<int64_t>(data, data + size));
+             } else if (dtype == "float32") {
+               auto* data = static_cast<float*>(self.data());
+               auto size = self.length() / sizeof(float);
+               l = py::cast(std::vector<float>(data, data + size));
+             } else if (dtype == "float16") {
+               auto* data =
+                   static_cast<paddle::platform::float16*>(self.data());
+               auto size = self.length() / sizeof(paddle::platform::float16);
+               l = py::cast(
+                   std::vector<paddle::platform::float16>(data, data + size));
+             } else {  // any other dtype string is rejected
+               PADDLE_THROW(platform::errors::Unimplemented(
+                   "Unsupported data type. Now only supports INT32, INT64, "
+                   "FLOAT16 and FLOAT32."));
+             }
+             return l;
+           });
 
   py::class_<DistModelTensor>(*m, "DistModelTensor")
       .def(py::init<>())
diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc
index aef02d65b4dbd..418804df02879 100644
--- a/paddle/fluid/pybind/communication.cc
+++ b/paddle/fluid/pybind/communication.cc
@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/pybind/communication.h"
+
 #include <Python.h>
 #include <pybind11/chrono.h>
 #include <pybind11/complex.h>
 #include <pybind11/functional.h>
 #include <pybind11/stl.h>
+
 #include <chrono>
 #include <string>
 
 #include "paddle/fluid/distributed/store/tcp_store.h"
-#include "paddle/fluid/pybind/communication.h"
 
 namespace py = pybind11;
 
@@ -35,22 +37,24 @@ void BindTCPStore(py::module *m) {
       py::class_<distributed::Store, std::shared_ptr<distributed::Store>>(
           *m, "Store")
           .def(py::init<>())
-          .def("set",
-               [](distributed::Store &self, const std::string &key,
-                  const std::string &value) {
-                 std::vector<uint8_t> data(value.begin(), value.end());
-                 self.set(key, data);
-               },
-               py::arg("key"), py::arg("value"),
-               py::call_guard<py::gil_scoped_release>())
-          .def("get",
-               [](distributed::Store &self,
-                  const std::string &key) -> py::bytes {
-                 auto data = self.get(key);
-                 return py::bytes(reinterpret_cast<char *>(data.data()),
-                                  data.size());
-               },
-               py::arg("key"), py::call_guard<py::gil_scoped_release>())
+          .def(
+              "set",
+              [](distributed::Store &self, const std::string &key,
+                 const std::string &value) {
+                std::vector<uint8_t> data(value.begin(), value.end());
+                self.set(key, data);
+              },
+              py::arg("key"), py::arg("value"),
+              py::call_guard<py::gil_scoped_release>())
+          .def(
+              "get",
+              [](distributed::Store &self,
+                 const std::string &key) -> py::bytes {
+                auto data = self.get(key);
+                return py::bytes(reinterpret_cast<char *>(data.data()),
+                                 data.size());
+              },
+              py::arg("key"), py::call_guard<py::gil_scoped_release>())
           .def("add", &distributed::Store::add,
                py::call_guard<py::gil_scoped_release>())
           .def("wait", &distributed::Store::wait,
diff --git a/paddle/fluid/pybind/communicator_py.cc b/paddle/fluid/pybind/communicator_py.cc
index 723d7f3197230..0cb5aa6ef7023 100644
--- a/paddle/fluid/pybind/communicator_py.cc
+++ b/paddle/fluid/pybind/communicator_py.cc
@@ -15,16 +15,17 @@ limitations under the License. */
 #include "paddle/fluid/pybind/communicator_py.h"
 
 #include <Python.h>
+
 #include <map>
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/program_desc.h"
-#include "pybind11/pybind11.h"
 
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/distributed/communicator.h"
 #include "paddle/fluid/operators/distributed/large_scale_kv.h"
 #include "paddle/fluid/operators/distributed/ps/service/communicator/communicator_common.h"
+#include "pybind11/pybind11.h"
 
 namespace py = pybind11;
 
diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc
index cfe87a86cf0e5..013d0cc0c6068 100644
--- a/paddle/fluid/pybind/compatible.cc
+++ b/paddle/fluid/pybind/compatible.cc
@@ -13,23 +13,25 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/compatible.h"
+
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 
 namespace py = pybind11;
 
-using paddle::framework::compatible::OpAttrVariantT;
-using paddle::framework::compatible::OpUpdateInfo;
 using paddle::framework::compatible::OpAttrInfo;
-using paddle::framework::compatible::OpInputOutputInfo;
+using paddle::framework::compatible::OpAttrVariantT;
 using paddle::framework::compatible::OpBugfixInfo;
-using paddle::framework::compatible::OpUpdateType;
-using paddle::framework::compatible::OpUpdateBase;
-using paddle::framework::compatible::OpVersionDesc;
 using paddle::framework::compatible::OpCheckpoint;
+using paddle::framework::compatible::OpInputOutputInfo;
+using paddle::framework::compatible::OpUpdateBase;
+using paddle::framework::compatible::OpUpdateInfo;
+using paddle::framework::compatible::OpUpdateType;
 using paddle::framework::compatible::OpVersion;
+using paddle::framework::compatible::OpVersionDesc;
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 8b48d0b4e44ca..89a3904d0003f 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/const_value.h"
+
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/pybind/crypto.cc b/paddle/fluid/pybind/crypto.cc
index 8fbf395bf18a6..07a9e4021cee7 100644
--- a/paddle/fluid/pybind/crypto.cc
+++ b/paddle/fluid/pybind/crypto.cc
@@ -97,11 +97,12 @@ void BindAESCipher(py::module* m) {
 void BindCipherFactory(py::module* m) {
   py::class_<CipherFactory>(*m, "CipherFactory")
       .def(py::init<>())
-      .def_static("create_cipher",
-                  [](const std::string& config_file) {
-                    return CipherFactory::CreateCipher(config_file);
-                  },
-                  py::arg("config_file") = std::string());
+      .def_static(
+          "create_cipher",  // forwards to CipherFactory::CreateCipher
+          [](const std::string& config_file) {
+            return CipherFactory::CreateCipher(config_file);
+          },
+          py::arg("config_file") = std::string());  // default: empty string
 }
 
 void BindCipherUtils(py::module* m) {
diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc
index 64c145c94f99d..54080d5e09615 100644
--- a/paddle/fluid/pybind/cuda_streams_py.cc
+++ b/paddle/fluid/pybind/cuda_streams_py.cc
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/pybind/cuda_streams_py.h"
+
 #include <string>
 #include <vector>
 
 #include "paddle/fluid/platform/device_event_base.h"
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
-#include "paddle/fluid/pybind/cuda_streams_py.h"
 
 namespace py = pybind11;
 
@@ -28,29 +29,31 @@ void BindCudaStream(py::module *m_ptr) {
   auto &m = *m_ptr;
 
   // Bind Methods
-  m.def("_get_current_stream",
-        [](int deviceId) {
+  m.def(
+      "_get_current_stream",
+      [](int deviceId) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-          return paddle::platform::stream::get_current_stream(deviceId);
+        return paddle::platform::stream::get_current_stream(deviceId);
 #else
-          PADDLE_THROW(platform::errors::Unavailable(
-              "Paddle is not compiled with CUDA. Cannot visit cuda current"
-              "stream."));
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Paddle is not compiled with CUDA. Cannot visit cuda current "
+            "stream."));  // literals concatenate; keep the space above
 #endif
-        },
-        py::return_value_policy::reference);
+      },
+      py::return_value_policy::reference);
 
-  m.def("_set_current_stream",
-        [](paddle::platform::stream::CUDAStream &stream) {
+  m.def(
+      "_set_current_stream",
+      [](paddle::platform::stream::CUDAStream &stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-          return paddle::platform::stream::set_current_stream(&stream);
+        return paddle::platform::stream::set_current_stream(&stream);
 #else
-          PADDLE_THROW(platform::errors::Unavailable(
-              "Paddle is not compiled with CUDA. Cannot set cuda current "
-              "stream."));
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Paddle is not compiled with CUDA. Cannot set cuda current "
+            "stream."));
 #endif
-        },
-        py::return_value_policy::reference);
+      },
+      py::return_value_policy::reference);
 
   m.def("_device_synchronize", [](int device_id) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -94,12 +97,13 @@ void BindCudaStream(py::module *m_ptr) {
 
   )DOC")
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      .def("wait_event",
-           [](paddle::platform::stream::CUDAStream &self,
-              paddle::platform::CudaEvent &event) {
-             self.WaitEvent(event.GetRawCudaEvent());
-           },
-           R"DOC(
+      .def(
+          "wait_event",
+          [](paddle::platform::stream::CUDAStream &self,
+             paddle::platform::CudaEvent &event) {
+            self.WaitEvent(event.GetRawCudaEvent());
+          },
+          R"DOC(
       Makes all future work submitted to stream wait for all work captured in event.
 
       Parameters:
@@ -115,15 +119,16 @@ void BindCudaStream(py::module *m_ptr) {
           s.wait_event(event)
 
            )DOC")
-      .def("wait_stream",
-           [](paddle::platform::stream::CUDAStream &self,
-              paddle::platform::stream::CUDAStream &stream) {
-             paddle::platform::CudaEvent event;
-             event.Record(stream.raw_stream());
-
-             self.WaitEvent(event.GetRawCudaEvent());
-           },
-           R"DOC(
+      .def(
+          "wait_stream",
+          [](paddle::platform::stream::CUDAStream &self,
+             paddle::platform::stream::CUDAStream &stream) {
+            paddle::platform::CudaEvent event;
+            event.Record(stream.raw_stream());
+
+            self.WaitEvent(event.GetRawCudaEvent());
+          },
+          R"DOC(
       Synchronizes with the given stream.
 
       Parameters:
@@ -139,11 +144,12 @@ void BindCudaStream(py::module *m_ptr) {
             s1.wait_stream(s2)
 
            )DOC")
-      .def("query",
-           [](paddle::platform::stream::CUDAStream &self) {
-             return self.Query();
-           },
-           R"DOC(
+      .def(
+          "query",
+          [](paddle::platform::stream::CUDAStream &self) {
+            return self.Query();
+          },
+          R"DOC(
       Return the status whether if all operations in stream have completed.
 
       Returns: A boolean value.
@@ -157,11 +163,12 @@ void BindCudaStream(py::module *m_ptr) {
             is_done = s.query()
 
            )DOC")
-      .def("synchronize",
-           [](paddle::platform::stream::CUDAStream &self) {
-             self.Synchronize();
-           },
-           R"DOC(
+      .def(
+          "synchronize",
+          [](paddle::platform::stream::CUDAStream &self) {
+            self.Synchronize();
+          },
+          R"DOC(
       Waits for stream tasks to complete.
 
       Examples:
@@ -173,16 +180,17 @@ void BindCudaStream(py::module *m_ptr) {
             s.synchronize()
 
            )DOC")
-      .def("record_event",
-           [](paddle::platform::stream::CUDAStream &self,
-              paddle::platform::CudaEvent *event) {
-             if (event == nullptr) {
-               event = new paddle::platform::CudaEvent();
-             }
-             event->Record(self.raw_stream());
-             return event;
-           },
-           R"DOC(
+      .def(
+          "record_event",
+          [](paddle::platform::stream::CUDAStream &self,
+             paddle::platform::CudaEvent *event) {
+            if (event == nullptr) {
+              event = new paddle::platform::CudaEvent();
+            }
+            event->Record(self.raw_stream());
+            return event;
+          },
+          R"DOC(
       Record a CUDA event in the stream.
 
       Parameters:
@@ -201,7 +209,7 @@ void BindCudaStream(py::module *m_ptr) {
             event = s.record_event()
 
            )DOC",
-           py::arg("event") = nullptr)
+          py::arg("event") = nullptr)
       .def_property_readonly(
           "cuda_stream",
           [](paddle::platform::stream::CUDAStream &self) {
@@ -225,32 +233,33 @@ void BindCudaStream(py::module *m_ptr) {
 
            )DOC")
 #endif
-      .def("__init__",
-           [](paddle::platform::stream::CUDAStream &self,
-              platform::CUDAPlace *device, int priority) {
+      .def(
+          "__init__",
+          [](paddle::platform::stream::CUDAStream &self,
+             platform::CUDAPlace *device, int priority) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-             if (priority != 1 && priority != 2) {
-               PADDLE_THROW(platform::errors::InvalidArgument(
-                   "Priority should be 1(high) or 2(normal) "));
-             }
-             auto prio = paddle::platform::stream::Priority(priority);
-             auto stream_flag =
-                 paddle::platform::stream::StreamFlag::kStreamNonBlocking;
-
-             if (device == nullptr) {
-               int curr_device_id = platform::GetCurrentDeviceId();
-               auto device_tmp = platform::CUDAPlace(curr_device_id);
-               device = &device_tmp;
-             }
-
-             new (&self) paddle::platform::stream::CUDAStream(*device, prio,
-                                                              stream_flag);
+            if (priority != 1 && priority != 2) {
+              PADDLE_THROW(platform::errors::InvalidArgument(
+                  "Priority should be 1(high) or 2(normal) "));
+            }
+            auto prio = paddle::platform::stream::Priority(priority);
+            auto stream_flag =
+                paddle::platform::stream::StreamFlag::kStreamNonBlocking;
+
+            // Copy the place by value: a pointer to an `if`-scoped
+            // temporary would dangle by the time it is dereferenced below.
+            platform::CUDAPlace place =
+                device != nullptr
+                    ? *device
+                    : platform::CUDAPlace(platform::GetCurrentDeviceId());
+            new (&self) paddle::platform::stream::CUDAStream(place, prio,
+                                                             stream_flag);
 #else
             PADDLE_THROW(platform::errors::Unavailable(
         "Class CUDAStream can only be initialized on the GPU platform."));
 #endif
-           },
-           py::arg("device") = nullptr, py::arg("priority") = 2)
+          },
+          py::arg("device") = nullptr, py::arg("priority") = 2)
       .def(
           "__init__",
           [](paddle::platform::stream::CUDAStream &self, int device,
@@ -315,15 +324,16 @@ void BindCudaStream(py::module *m_ptr) {
 
   )DOC")
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      .def("record",
-           [](paddle::platform::CudaEvent &self,
-              paddle::platform::stream::CUDAStream *stream) {
-             if (stream == nullptr) {
-               stream = paddle::platform::stream::get_current_stream(-1);
-             }
-             self.Record(stream->raw_stream());
-           },
-           R"DOC(
+      .def(
+          "record",
+          [](paddle::platform::CudaEvent &self,
+             paddle::platform::stream::CUDAStream *stream) {
+            if (stream == nullptr) {
+              stream = paddle::platform::stream::get_current_stream(-1);
+            }
+            self.Record(stream->raw_stream());
+          },
+          R"DOC(
           Records the event in the given stream.
 
           Parameters:
@@ -338,10 +348,11 @@ void BindCudaStream(py::module *m_ptr) {
               event.record()
     
         )DOC",
-           py::arg("stream") = nullptr)
-      .def("query",
-           [](paddle::platform::CudaEvent &self) { return self.Query(); },
-           R"DOC(
+          py::arg("stream") = nullptr)
+      .def(
+          "query",
+          [](paddle::platform::CudaEvent &self) { return self.Query(); },
+          R"DOC(
           Queries the event's status.
 
           Returns: A boolean which indicates all work currently captured by the event has been completed.
@@ -355,8 +366,9 @@ void BindCudaStream(py::module *m_ptr) {
                 is_done = event.query()
 
            )DOC")
-      .def("synchronize",
-           [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, R"DOC(
+      .def(
+          "synchronize",
+          [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, R"DOC(
             Waits for an event to complete.
 
             Examples:
@@ -369,22 +381,23 @@ void BindCudaStream(py::module *m_ptr) {
 
            )DOC")
 #endif
-      .def("__init__",
-           [](paddle::platform::CudaEvent &self, bool enable_timing,
-              bool blocking, bool interprocess) {
+      .def(
+          "__init__",
+          [](paddle::platform::CudaEvent &self, bool enable_timing,
+             bool blocking, bool interprocess) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-             unsigned int flags = platform::GenerateDeviceEventFlag(
-                 enable_timing, blocking, interprocess);
-             new (&self) paddle::platform::CudaEvent(flags);
+            unsigned int flags = platform::GenerateDeviceEventFlag(
+                enable_timing, blocking, interprocess);
+            new (&self) paddle::platform::CudaEvent(flags);
 #else
-             PADDLE_THROW(platform::errors::Unavailable(
-                 "Class CUDAEvent can only be initialized on the GPU "
-                 "platform."));
+            PADDLE_THROW(platform::errors::Unavailable(
+                "Class CUDAEvent can only be initialized on the GPU "
+                "platform."));
 
 #endif
-           },
-           py::arg("enable_timing") = false, py::arg("blocking") = false,
-           py::arg("interprocess") = false);
+          },
+          py::arg("enable_timing") = false, py::arg("blocking") = false,
+          py::arg("interprocess") = false);
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc
index 5e2274cb65138..700bd458a58eb 100644
--- a/paddle/fluid/pybind/data_set_py.cc
+++ b/paddle/fluid/pybind/data_set_py.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/async_executor.h"
diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc
index 6636fc8aca51d..3d1a81da6f382 100644
--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -109,132 +109,141 @@ void BindDistributed(py::module *m) {
           .def("rank", &distributed::ProcessGroup::GetRank)
           .def("size", &distributed::ProcessGroup::GetSize)
           .def("name", &distributed::ProcessGroup::GetBackendName)
-          .def("allreduce",
-               [](distributed::ProcessGroup &self, py::handle py_tensor,
-                  distributed::ReduceOp op) {
-                 auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
-                 distributed::AllreduceOptions opts;
-                 opts.reduce_op = op;
-                 auto dense =
-                     std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
-                 std::vector<phi::DenseTensor> tensors = {*dense};
-                 return self.AllReduce(tensors, tensors, opts);
-               },
-               py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM,
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("broadcast",
-               [](distributed::ProcessGroup &self, py::handle py_tensor,
-                  int source_rank) {
-                 auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
-                 distributed::BroadcastOptions opts;
-                 opts.source_rank = source_rank;
-                 auto dense =
-                     std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
-                 std::vector<phi::DenseTensor> tensors = {*dense};
-                 return self.Broadcast(tensors, tensors, opts);
-               },
-               py::arg("tensor"), py::arg("source_rank"),
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("barrier",
-               [](distributed::ProcessGroup &self, std::vector<int> place_ids) {
-                 distributed::BarrierOptions opts;
-                 opts.place_ids = place_ids;
-                 return self.Barrier(opts);
-               },
-               py::arg("place_ids") = std::vector<int>{},
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("send",
-               [](distributed::ProcessGroup &self, py::handle py_tensor,
-                  int dst) {
-                 auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
-                 auto dense =
-                     std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
-                 std::vector<phi::DenseTensor> tensors = {*dense};
-                 return self.Send(tensors, dst);
-               },
-               py::arg("tensor"), py::arg("dst"),
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("recv",
-               [](distributed::ProcessGroup &self, py::handle py_tensor,
-                  int src) {
-                 auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
-                 auto dense =
-                     std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
-                 std::vector<phi::DenseTensor> tensors = {*dense};
-                 return self.Recv(tensors, src);
-               },
-               py::arg("tensor"), py::arg("src"),
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("all_gather",
-               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
-                  py::handle py_out_tensor) {
-                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                 auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                     in_tensor.impl());
-                 auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                     out_tensor.impl());
-                 std::vector<phi::DenseTensor> in_tensors = {*in_dense};
-                 std::vector<phi::DenseTensor> out_tensors = {*out_dense};
-                 return self.AllGather(in_tensors, out_tensors);
-               },
-               py::arg("in"), py::arg("out"),
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("alltoall",
-               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
-                  py::handle py_out_tensor) {
-                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                 auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                     in_tensor.impl());
-                 auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                     out_tensor.impl());
-                 std::vector<phi::DenseTensor> in_tensors = {*in_dense};
-                 std::vector<phi::DenseTensor> out_tensors = {*out_dense};
-                 return self.AllToAll(in_tensors, out_tensors);
-               },
-               py::arg("in"), py::arg("out"),
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("reduce",
-               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
-                  int dst, distributed::ReduceOp op) {
-                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                 distributed::ReduceOptions opts;
-                 opts.reduce_op = op;
-                 opts.root_rank = dst;
-                 auto dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                     in_tensor.impl());
-                 std::vector<phi::DenseTensor> tensors = {*dense};
-                 return self.Reduce(tensors, tensors, opts);
-               },
-               py::arg("tensor"), py::arg("dst"),
-               py::arg("op") = distributed::ReduceOp::SUM,
-               py::call_guard<py::gil_scoped_release>())
-
-          .def("scatter",
-               [](distributed::ProcessGroup &self, py::handle py_in_tensor,
-                  py::handle py_out_tensor, int src) {
-                 auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                 distributed::ScatterOptions opts;
-                 opts.root_rank = src;
-                 auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                     in_tensor.impl());
-                 auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                     out_tensor.impl());
-                 std::vector<phi::DenseTensor> in_tensors = {*in_dense};
-                 std::vector<phi::DenseTensor> out_tensors = {*out_dense};
-                 return self.Scatter(in_tensors, out_tensors, opts);
-               },
-               py::arg("in"), py::arg("out"), py::arg("src"),
-               py::call_guard<py::gil_scoped_release>());
+          .def(
+              "allreduce",
+              [](distributed::ProcessGroup &self, py::handle py_tensor,
+                 distributed::ReduceOp op) {
+                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
+                distributed::AllreduceOptions opts;
+                opts.reduce_op = op;
+                auto dense =
+                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+                std::vector<phi::DenseTensor> tensors = {*dense};
+                return self.AllReduce(tensors, tensors, opts);
+              },
+              py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM,
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "broadcast",
+              [](distributed::ProcessGroup &self, py::handle py_tensor,
+                 int source_rank) {
+                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
+                distributed::BroadcastOptions opts;
+                opts.source_rank = source_rank;
+                auto dense =
+                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+                std::vector<phi::DenseTensor> tensors = {*dense};
+                return self.Broadcast(tensors, tensors, opts);
+              },
+              py::arg("tensor"), py::arg("source_rank"),
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "barrier",
+              [](distributed::ProcessGroup &self, std::vector<int> place_ids) {
+                distributed::BarrierOptions opts;
+                opts.place_ids = place_ids;
+                return self.Barrier(opts);
+              },
+              py::arg("place_ids") = std::vector<int>{},
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "send",
+              [](distributed::ProcessGroup &self, py::handle py_tensor,
+                 int dst) {
+                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
+                auto dense =
+                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+                std::vector<phi::DenseTensor> tensors = {*dense};
+                return self.Send(tensors, dst);
+              },
+              py::arg("tensor"), py::arg("dst"),
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "recv",
+              [](distributed::ProcessGroup &self, py::handle py_tensor,
+                 int src) {
+                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
+                auto dense =
+                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+                std::vector<phi::DenseTensor> tensors = {*dense};
+                return self.Recv(tensors, src);
+              },
+              py::arg("tensor"), py::arg("src"),
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "all_gather",
+              [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                 py::handle py_out_tensor) {
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                std::vector<phi::DenseTensor> in_tensors = {*in_dense};
+                std::vector<phi::DenseTensor> out_tensors = {*out_dense};
+                return self.AllGather(in_tensors, out_tensors);
+              },
+              py::arg("in"), py::arg("out"),
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "alltoall",
+              [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                 py::handle py_out_tensor) {
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                std::vector<phi::DenseTensor> in_tensors = {*in_dense};
+                std::vector<phi::DenseTensor> out_tensors = {*out_dense};
+                return self.AllToAll(in_tensors, out_tensors);
+              },
+              py::arg("in"), py::arg("out"),
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "reduce",
+              [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                 int dst, distributed::ReduceOp op) {
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                distributed::ReduceOptions opts;
+                opts.reduce_op = op;
+                opts.root_rank = dst;
+                auto dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                std::vector<phi::DenseTensor> tensors = {*dense};
+                return self.Reduce(tensors, tensors, opts);
+              },
+              py::arg("tensor"), py::arg("dst"),
+              py::arg("op") = distributed::ReduceOp::SUM,
+              py::call_guard<py::gil_scoped_release>())
+
+          .def(
+              "scatter",
+              [](distributed::ProcessGroup &self, py::handle py_in_tensor,
+                 py::handle py_out_tensor, int src) {
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                distributed::ScatterOptions opts;
+                opts.root_rank = src;
+                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                std::vector<phi::DenseTensor> in_tensors = {*in_dense};
+                std::vector<phi::DenseTensor> out_tensors = {*out_dense};
+                return self.Scatter(in_tensors, out_tensors, opts);
+              },
+              py::arg("in"), py::arg("out"), py::arg("src"),
+              py::call_guard<py::gil_scoped_release>());
 
 #if defined(PADDLE_WITH_NCCL)
   py::class_<distributed::ProcessGroupNCCL,
@@ -316,29 +325,31 @@ void BindDistributed(py::module *m) {
                   &ProcessGroupGloo::createDefaultDevice);
 #endif
 
-  m->def("eager_assign_group_by_size",
-         [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
-            std::vector<size_t> group_size_limits,
-            std::vector<int64_t> tensor_indices) {
-           auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
-           return distributed::Eager_AssignGroupBySize(
-               tensors, is_sparse_gradient, group_size_limits, tensor_indices);
-         },
-         py::arg("tensors"), py::arg("is_sparse_gradient"),
-         py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
-         py::arg("tensor_indices") = std::vector<int64_t>{},
-         py::call_guard<py::gil_scoped_release>());
+  m->def(
+      "eager_assign_group_by_size",
+      [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
+         std::vector<size_t> group_size_limits,
+         std::vector<int64_t> tensor_indices) {
+        auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
+        return distributed::Eager_AssignGroupBySize(
+            tensors, is_sparse_gradient, group_size_limits, tensor_indices);
+      },
+      py::arg("tensors"), py::arg("is_sparse_gradient"),
+      py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
+      py::arg("tensor_indices") = std::vector<int64_t>{},
+      py::call_guard<py::gil_scoped_release>());
 
   py::class_<distributed::EagerReducer,
              std::shared_ptr<distributed::EagerReducer>>(*m, "EagerReducer",
                                                          R"DOC()DOC")
       .def(py::init(&CreateEagerReducer))
-      .def("prepare_for_backward",
-           [](distributed::EagerReducer &self, py::handle py_tensors) {
-             auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
-             self.PrepareForBackward(params);
-           },
-           py::arg("tensors"), py::call_guard<py::gil_scoped_release>());
+      .def(
+          "prepare_for_backward",
+          [](distributed::EagerReducer &self, py::handle py_tensors) {
+            auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
+            self.PrepareForBackward(params);
+          },
+          py::arg("tensors"), py::call_guard<py::gil_scoped_release>());
 }
 
 }  // end namespace pybind
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index c1b26ee0b792d..f9325d1b9ca53 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 // disable numpy compile error
+#include "paddle/fluid/pybind/eager.h"
+
 #include <Python.h>
 
 #include <string>
@@ -22,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
@@ -488,45 +489,45 @@ void AutoInitStringTensorByStringTensor(
 }
 
 /** We should have init function with signature:
-   * 1.
-   * def __init__ ()
-   * 2.
-   * def __init__ (
-   * ** dtype: paddle::framework::proto::VarType::Type,
-   * ** dims: vector<int>,
-   * ** name: std::string,
-   * ** type: paddle::framework::proto::VarType::LodTensor,
-   * ** persistable: bool)
-   * 3. (multi-place)
-   * (should have at least one parameter, one parameter equals to case 4, zero
-   * parameter equals to case 1)
-   * def __init__ (
-   * ** value: ndarray,
-   * ** place: paddle::platform::Place,
-   * ** persistable: bool,
-   * ** zero_copy: bool,
-   * ** name: std::string,
-   * ** stop_gradient: bool)
-   * 4.
-   * def __init__ (
-   * ** value: ndarray)
-   * 5.
-   * def __init__ (
-   * ** tensor: Tensor)
-   * 6. (multi-place)
-   * (should have at least one parameter, one parameter equals to case 5, zero
-   * parameter equals to case 1.)
-   * def __init__ (
-   * ** tensor: Tensor,
-   * ** place: paddle::platform::Place,
-   * ** name: std::string)
-   * 7. (multi-place) (should have at least one parameter, one parameter similar
-   * to case 5, zero parameter equals to case 1.)
-   * def __init__ (
-   * ** tensor: FrameworkTensor,
-   * ** place: paddle::platform::Place,
-   * ** name: std::string)
-   *  **/
+ * 1.
+ * def __init__ ()
+ * 2.
+ * def __init__ (
+ * ** dtype: paddle::framework::proto::VarType::Type,
+ * ** dims: vector<int>,
+ * ** name: std::string,
+ * ** type: paddle::framework::proto::VarType::LodTensor,
+ * ** persistable: bool)
+ * 3. (multi-place)
+ * (should have at least one parameter; with one parameter this is case 4,
+ * with zero parameters it is case 1)
+ * def __init__ (
+ * ** value: ndarray,
+ * ** place: paddle::platform::Place,
+ * ** persistable: bool,
+ * ** zero_copy: bool,
+ * ** name: std::string,
+ * ** stop_gradient: bool)
+ * 4.
+ * def __init__ (
+ * ** value: ndarray)
+ * 5.
+ * def __init__ (
+ * ** tensor: Tensor)
+ * 6. (multi-place)
+ * (should have at least one parameter; with one parameter this is case 5,
+ * with zero parameters it is case 1.)
+ * def __init__ (
+ * ** tensor: Tensor,
+ * ** place: paddle::platform::Place,
+ * ** name: std::string)
+ * 7. (multi-place) (should have at least one parameter, one parameter similar
+ * to case 5, zero parameter equals to case 1.)
+ * def __init__ (
+ * ** tensor: FrameworkTensor,
+ * ** place: paddle::platform::Place,
+ * ** name: std::string)
+ *  **/
 int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
   EAGER_TRY
   // set a flag to record use kwargs or not
@@ -828,37 +829,37 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
 }
 
 /** We should have init function with signature:
-   * 1.
-   * def __init__ ()
-   *
-   * 2.
-   * def __init__ (
-   * ** dims: vector<int>,
-   * ** name: std::string)
-   *
-   * 3.
-   * (should have at least one parameter, one parameter equals to case 4, zero
-   * parameter equals to case 1)
-   * def __init__ (
-   * ** value: ndarray,
-   * ** zero_copy: bool,
-   * ** name: std::string)
-   *
-   * 4.
-   * def __init__ (
-   * ** value: ndarray)
-   *
-   * 5.
-   * def __init__ (
-   * ** tensor: Tensor)
-   *
-   * 6.
-   * (should have at least one parameter, one parameter equals to case 5, zero
-   * parameter equals to case 1.)
-   * def __init__ (
-   * ** tensor: Tensor,
-   * ** name: std::string)
-   * **/
+ * 1.
+ * def __init__ ()
+ *
+ * 2.
+ * def __init__ (
+ * ** dims: vector<int>,
+ * ** name: std::string)
+ *
+ * 3.
+ * (should have at least one parameter; with one parameter this is case 4,
+ * with zero parameters it is case 1)
+ * def __init__ (
+ * ** value: ndarray,
+ * ** zero_copy: bool,
+ * ** name: std::string)
+ *
+ * 4.
+ * def __init__ (
+ * ** value: ndarray)
+ *
+ * 5.
+ * def __init__ (
+ * ** tensor: Tensor)
+ *
+ * 6.
+ * (should have at least one parameter; with one parameter this is case 5,
+ * with zero parameters it is case 1.)
+ * def __init__ (
+ * ** tensor: Tensor,
+ * ** name: std::string)
+ * **/
 int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
   // set a flag to record use kwargs or not
   bool flag_kwargs = false;
@@ -916,8 +917,9 @@ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
       // case 1
       VLOG(6) << "Calling case1's string initializer.";
       EmptyStringTensorInitializer(
-          py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName(
-                             "generated_string_tensor"),
+          py_tensor_ptr,
+          egr::Controller::Instance().GenerateUniqueName(
+              "generated_string_tensor"),
           egr::Controller::Instance().GetExpectedPlace());
       return 0;
     } else {
diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h
index a3eac7ab47043..db2b438c3bd94 100644
--- a/paddle/fluid/pybind/eager.h
+++ b/paddle/fluid/pybind/eager.h
@@ -11,11 +11,11 @@ limitations under the License. */
 #pragma once
 
 #include <Python.h>
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
 
 #include "paddle/fluid/eager/pylayer/py_layer_node.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h
index 99ec4212918de..df4920a5e690f 100644
--- a/paddle/fluid/pybind/eager_custom_python_api.h
+++ b/paddle/fluid/pybind/eager_custom_python_api.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <iostream>
+
 #include "paddle/phi/core/enforce.h"
 
 static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
@@ -27,7 +28,8 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args,
         GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false);
     auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true);
     framework::AttributeMap attrs;
-    ConstructAttrMapFromPyArgs("run_program", args, 5, PyTuple_GET_SIZE(args),
+    // TODO(zengjinle): support CUDA Graph on eager mode
+    ConstructAttrMapFromPyArgs("run_program", args, 6, PyTuple_GET_SIZE(args),
                                attrs);
 
     tstate = PyEval_SaveThread();
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 628e808ef99ac..c75ac0b52c52c 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -20,9 +20,6 @@ typedef SSIZE_T ssize_t;
 #include <string>
 #include <vector>
 
-#include "pybind11/numpy.h"
-#include "pybind11/pybind11.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/autograd_meta.h"
@@ -51,6 +48,8 @@ typedef SSIZE_T ssize_t;
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index b54f4e1416c35..ab6b8edd52eae 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -21,9 +21,6 @@ typedef SSIZE_T ssize_t;
 #include <unordered_map>
 #include <vector>
 
-#include "pybind11/numpy.h"
-#include "pybind11/pybind11.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
@@ -47,12 +44,15 @@ typedef SSIZE_T ssize_t;
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
 #include "pybind11/detail/internals.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/framework/python_headers.h"
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace pybind {
@@ -518,7 +518,10 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args,
     } else if (grad->is_dense_tensor()) {
       if (grad->initialized()) {
         if (set_to_zero) {
-          grad->set_impl(paddle::experimental::zeros_like(*grad).impl());
+          auto* grad_t = static_cast<phi::DenseTensor*>(grad->impl().get());
+          auto* dev_ctx =
+              platform::DeviceContextPool::Instance().Get(grad_t->place());
+          phi::funcs::set_constant(*dev_ctx, grad_t, 0.0);
           if (is_leaf) {
             std::static_pointer_cast<egr::GradNodeAccumulation>(
                 egr::EagerUtils::grad_node(self->tensor))
@@ -555,13 +558,26 @@ static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args,
                        "Please check if you have manually cleared"
                        "the grad inside autograd_meta"));
     if (grad->initialized()) {
-      grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl());
+      if (grad->is_dense_tensor()) {
+        auto* t = static_cast<phi::DenseTensor*>(grad->impl().get());
+        auto* dev_ctx = platform::DeviceContextPool::Instance().Get(t->place());
+        phi::funcs::set_constant(*dev_ctx, t, 0.0);
+      } else {
+        grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl());
+      }
     }
   } else {
     auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor);
     if (meta->MutableGrad()->initialized()) {
-      meta->MutableGrad()->set_impl(
-          paddle::experimental::zeros_like(*(meta->MutableGrad())).impl());
+      if (meta->MutableGrad()->is_dense_tensor()) {
+        auto* t =
+            static_cast<phi::DenseTensor*>(meta->MutableGrad()->impl().get());
+        auto* dev_ctx = platform::DeviceContextPool::Instance().Get(t->place());
+        phi::funcs::set_constant(*dev_ctx, t, 0.0);
+      } else {
+        meta->MutableGrad()->set_impl(
+            paddle::experimental::zeros_like(*(meta->MutableGrad())).impl());
+      }
     }
   }
 
@@ -990,10 +1006,11 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self,
       PADDLE_ENFORCE_EQ(
           egr::egr_utils_api::IsLeafTensor(self->tensor) &&
               !egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient(),
-          false, platform::errors::InvalidArgument(
-                     "Leaf Tensor (%s) that doesn't stop gradient can't use "
-                     "inplace strategy.",
-                     self->tensor.name()));
+          false,
+          platform::errors::InvalidArgument(
+              "Leaf Tensor (%s) that doesn't stop gradient can't use "
+              "inplace strategy.",
+              self->tensor.name()));
     }
 
     paddle::experimental::Tensor value_tensor;
@@ -1215,9 +1232,10 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args,
                         "Only can register backward hook for leaf Tensor."));
   PADDLE_ENFORCE_EQ(
       !egr::EagerUtils::unsafe_autograd_meta(self->tensor)->StopGradient(),
-      true, platform::errors::InvalidArgument(
-                "Cannot register backward hook on a Tensor that stop "
-                "gradient."));
+      true,
+      platform::errors::InvalidArgument(
+          "Cannot register backward hook on a Tensor that stop "
+          "gradient."));
   PADDLE_ENFORCE(
       grad_node.get() != nullptr,
       paddle::platform::errors::Fatal("Detected NULL grad_node,"
@@ -1650,8 +1668,8 @@ PyMethodDef variable_methods[] = {
      (PyCFunction)(void (*)(void))tensor_method__is_initialized,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"_is_dense_tensor_hold_allocation",
-     (PyCFunction)(
-         void (*)(void))tensor_method__is_dense_tensor_hold_allocation,
+     (PyCFunction)(void (*)(
+         void))tensor_method__is_dense_tensor_hold_allocation,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to,
      METH_VARARGS | METH_KEYWORDS, NULL},
@@ -1776,8 +1794,8 @@ PyMethodDef string_tensor_variable_methods[] = {
      (PyCFunction)(void (*)(void))tensor_method__is_initialized,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"_is_string_tensor_hold_allocation",
-     (PyCFunction)(
-         void (*)(void))tensor_method__is_string_tensor_hold_allocation,
+     (PyCFunction)(void (*)(
+         void))tensor_method__is_string_tensor_hold_allocation,
      METH_VARARGS | METH_KEYWORDS, NULL},
     // TODO(zhoushunjie): Need to add _copy_to, copy_ for StringTensor.
     {NULL, NULL, 0, NULL}};
diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc
index b546aa2d76bcd..f58f3ce94537e 100644
--- a/paddle/fluid/pybind/eager_op_function_generator.cc
+++ b/paddle/fluid/pybind/eager_op_function_generator.cc
@@ -486,7 +486,8 @@ int main(int argc, char* argv[]) {
       "\"paddle/fluid/pybind/op_function_common.h\"",
       "\"paddle/fluid/eager/api/generated/fluid_generated/"
       "dygraph_forward_api.h\"",
-      "\"paddle/fluid/pybind/exception.h\"", "<Python.h>"};
+      "\"paddle/fluid/pybind/exception.h\"",
+      "<Python.h>"};
 
   std::ofstream out(argv[1], std::ios::out);
 
diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc
index 47a5309d691f5..a0cef6388c13f 100644
--- a/paddle/fluid/pybind/eager_py_layer.cc
+++ b/paddle/fluid/pybind/eager_py_layer.cc
@@ -16,8 +16,6 @@ limitations under the License. */
 #include <vector>
 
 #pragma GCC diagnostic ignored "-Wattributes"
-#include "pybind11/pytypes.h"
-
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/autograd_meta.h"
@@ -34,6 +32,7 @@ limitations under the License. */
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "pybind11/detail/internals.h"
+#include "pybind11/pytypes.h"
 #pragma GCC diagnostic ignored "-Wwrite-strings"
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
 
@@ -323,10 +322,11 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args,
           egr::EagerUtils::autograd_meta(dirty_tensor);
       PADDLE_ENFORCE_EQ(!dirty_tensor_autograd_meta->StopGradient() &&
                             egr::egr_utils_api::IsLeafTensor(*dirty_tensor),
-                        false, paddle::platform::errors::InvalidArgument(
-                                   "Leaf Var (%s) that doesn't stop gradient "
-                                   "can't use inplace strategy.",
-                                   dirty_tensor->name()));
+                        false,
+                        paddle::platform::errors::InvalidArgument(
+                            "Leaf Var (%s) that doesn't stop gradient "
+                            "can't use inplace strategy.",
+                            dirty_tensor->name()));
       dirty_tensor->bump_inplace_version();
       VLOG(3) << "Tensor(" << dirty_tensor->name()
               << ") uses Inplace Strategy.";
@@ -466,16 +466,19 @@ PyMethodDef pylayer_methods[] = {
      METH_O, NULL},
     {NULL, NULL, 0, NULL}};
 
-struct PyGetSetDef pylayer_properties[]{
-    {"container", (getter)tensor_properties_get_container,
-     (setter)tensor_properties_set_container, nullptr, nullptr},
-    {"non_differentiable", (getter)tensor_properties_get_non_differentiable,
-     (setter)tensor_properties_set_non_differentiable, nullptr, nullptr},
-    {"dirty_tensors", (getter)tensor_properties_get_dirty_tensors,
-     (setter)tensor_properties_set_dirty_tensors, nullptr, nullptr},
-    {"materialize_grads", nullptr,
-     (setter)tensor_properties_set_materialize_grads, nullptr, nullptr},
-    {nullptr, nullptr, nullptr, nullptr, nullptr}};
+struct PyGetSetDef pylayer_properties[] {
+  {"container", (getter)tensor_properties_get_container,
+   (setter)tensor_properties_set_container, nullptr, nullptr},
+      {"non_differentiable", (getter)tensor_properties_get_non_differentiable,
+       (setter)tensor_properties_set_non_differentiable, nullptr, nullptr},
+      {"dirty_tensors", (getter)tensor_properties_get_dirty_tensors,
+       (setter)tensor_properties_set_dirty_tensors, nullptr, nullptr},
+      {"materialize_grads", nullptr,
+       (setter)tensor_properties_set_materialize_grads, nullptr, nullptr},
+  {
+    nullptr, nullptr, nullptr, nullptr, nullptr
+  }
+};
 
 void BindEagerPyLayer(PyObject* module) {
   auto heap_type = reinterpret_cast<PyHeapTypeObject*>(
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index efa0fe2cb582e..9bcac35037d04 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -14,6 +14,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+// clang-format will try to move eager_utils.h in front of other headers
+// according to Google C++ style, and that causes compilation problems.
+// clang-format off
 #include "paddle/fluid/eager/api/all.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/framework/convert_utils.h"
@@ -31,6 +34,7 @@ limitations under the License. */
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
+// clang-format on
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 7f94f6c90e5a0..beab99877bd79 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -16,12 +16,12 @@ typedef SSIZE_T ssize_t;
 #endif
 
 #include <Python.h>
+
 #include "paddle/phi/common/backend.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
-
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 namespace paddle {
@@ -112,8 +112,9 @@ struct TupleTensorResult {
                   PyObject* args, ssize_t arg_idx) {
     TupleTensorResult<Tuple, N - 1>::Run(out, result, value_idx, args, arg_idx);
     if (N - 1 == value_idx) {
-      PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out),
-                                                 value_idx, args, arg_idx));
+      PyTuple_SET_ITEM(
+          result, N - 1,
+          ToPyObject(std::get<N - 1>(out), value_idx, args, arg_idx));
     } else {
       PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out)));
     }
diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc
index 4f25a6f1a5ca8..934a9ef97fb15 100644
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/exception.h"
+
 #include "paddle/phi/api/ext/exception.h"
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc
index 4ffb513671c56..25f2c91002844 100644
--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
@@ -18,8 +18,6 @@ limitations under the License. */
 #undef _XOPEN_SOURCE
 #endif
 
-#include "paddle/fluid/pybind/fleet_py.h"
-
 #include <map>
 #include <memory>
 #include <string>
@@ -35,17 +33,18 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h"
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
+#include "paddle/fluid/pybind/fleet_py.h"
 
 namespace py = pybind11;
 using paddle::distributed::CommContext;
 using paddle::distributed::Communicator;
+using paddle::distributed::FeatureNode;
 using paddle::distributed::FleetWrapper;
-using paddle::distributed::HeterClient;
-using paddle::distributed::GraphPyService;
 using paddle::distributed::GraphNode;
-using paddle::distributed::GraphPyServer;
 using paddle::distributed::GraphPyClient;
-using paddle::distributed::FeatureNode;
+using paddle::distributed::GraphPyServer;
+using paddle::distributed::GraphPyService;
+using paddle::distributed::HeterClient;
 
 namespace paddle {
 namespace pybind {
@@ -246,13 +245,13 @@ void BindGraphPyClient(py::module* m) {
       .def("bind_local_server", &GraphPyClient::bind_local_server);
 }
 
-using paddle::distributed::TreeIndex;
-using paddle::distributed::IndexWrapper;
 using paddle::distributed::IndexNode;
+using paddle::distributed::IndexWrapper;
+using paddle::distributed::TreeIndex;
 #ifdef PADDLE_WITH_HETERPS
 using paddle::framework::GraphGpuWrapper;
-using paddle::framework::NeighborSampleResult;
 using paddle::framework::NeighborSampleQuery;
+using paddle::framework::NeighborSampleResult;
 using paddle::framework::NodeQueryResult;
 #endif
 
diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc
index af1c3da727d41..0e1d4cd76add2 100644
--- a/paddle/fluid/pybind/fleet_wrapper_py.cc
+++ b/paddle/fluid/pybind/fleet_wrapper_py.cc
@@ -46,10 +46,10 @@ void BindFleetWrapper(py::module* m) {
       .def("push_dense", &framework::FleetWrapper::PushDenseVarsSync)
       .def("pull_dense", &framework::FleetWrapper::PullDenseVarsSync)
       .def("init_server", &framework::FleetWrapper::InitServer)
-      .def("run_server", (uint64_t (framework::FleetWrapper::*)(void)) &
+      .def("run_server", (uint64_t(framework::FleetWrapper::*)(void)) &
                              framework::FleetWrapper::RunServer)
-      .def("run_server", (uint64_t (framework::FleetWrapper::*)(  // NOLINT
-                             const std::string&, uint32_t)) &     // NOLINT
+      .def("run_server", (uint64_t(framework::FleetWrapper::*)(  // NOLINT
+                             const std::string&, uint32_t)) &    // NOLINT
                              framework::FleetWrapper::RunServer)
       .def("init_worker", &framework::FleetWrapper::InitWorker)
       .def("init_model", &framework::FleetWrapper::PushDenseParamSync)
diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc
index 6bb85da8c466f..e456526f8441c 100644
--- a/paddle/fluid/pybind/generator_py.cc
+++ b/paddle/fluid/pybind/generator_py.cc
@@ -8,9 +8,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/phi/core/generator.h"
 #include <fcntl.h>
 
+#include "paddle/phi/core/generator.h"
+
 #ifdef _POSIX_C_SOURCE
 #undef _POSIX_C_SOURCE
 #endif
diff --git a/paddle/fluid/pybind/gloo_context_py.cc b/paddle/fluid/pybind/gloo_context_py.cc
index 2314ceac76e5b..b4ee1bcd02bd7 100644
--- a/paddle/fluid/pybind/gloo_context_py.cc
+++ b/paddle/fluid/pybind/gloo_context_py.cc
@@ -43,13 +43,14 @@ void BindGlooContext(py::module *m) {
   py::class_<platform::GlooParallelStrategy> gloo_parallel_strategy(
       *m, "GlooParallelStrategy", "");
   gloo_parallel_strategy.def(py::init())
-      .def_property("rank_num",
-                    [](const platform::GlooParallelStrategy &self) {
-                      return self.rank_num;
-                    },
-                    [](platform::GlooParallelStrategy &self, int nranks) {
-                      self.rank_num = nranks;
-                    })
+      .def_property(
+          "rank_num",
+          [](const platform::GlooParallelStrategy &self) {
+            return self.rank_num;
+          },
+          [](platform::GlooParallelStrategy &self, int nranks) {
+            self.rank_num = nranks;
+          })
       .def_property(
           "rank",
           [](const platform::GlooParallelStrategy &self) { return self.rank; },
@@ -62,20 +63,22 @@ void BindGlooContext(py::module *m) {
           [](platform::GlooParallelStrategy &self, const std::string &iface) {
             self.iface = iface;
           })
-      .def_property("init_seconds",
-                    [](const platform::GlooParallelStrategy &self) {
-                      return self.init_seconds;
-                    },
-                    [](platform::GlooParallelStrategy &self, int init_seconds) {
-                      self.init_seconds = init_seconds;
-                    })
-      .def_property("run_seconds",
-                    [](const platform::GlooParallelStrategy &self) {
-                      return self.run_seconds;
-                    },
-                    [](platform::GlooParallelStrategy &self, int run_seconds) {
-                      self.run_seconds = run_seconds;
-                    })
+      .def_property(
+          "init_seconds",
+          [](const platform::GlooParallelStrategy &self) {
+            return self.init_seconds;
+          },
+          [](platform::GlooParallelStrategy &self, int init_seconds) {
+            self.init_seconds = init_seconds;
+          })
+      .def_property(
+          "run_seconds",
+          [](const platform::GlooParallelStrategy &self) {
+            return self.run_seconds;
+          },
+          [](platform::GlooParallelStrategy &self, int run_seconds) {
+            self.run_seconds = run_seconds;
+          })
       .def_property(
           "ip_address",
           [](const platform::GlooParallelStrategy &self) {
@@ -83,13 +86,14 @@ void BindGlooContext(py::module *m) {
           },
           [](platform::GlooParallelStrategy &self,
              const std::string &ip_address) { self.ip_address = ip_address; })
-      .def_property("ip_port",
-                    [](const platform::GlooParallelStrategy &self) {
-                      return self.ip_port;
-                    },
-                    [](platform::GlooParallelStrategy &self, int ip_port) {
-                      self.ip_port = ip_port;
-                    });
+      .def_property(
+          "ip_port",
+          [](const platform::GlooParallelStrategy &self) {
+            return self.ip_port;
+          },
+          [](platform::GlooParallelStrategy &self, int ip_port) {
+            self.ip_port = ip_port;
+          });
 
   py::class_<platform::GlooParallelContext> gloo_ctx(*m, "GlooParallelContext");
   gloo_ctx.def(py::init<const platform::GlooParallelStrategy &>())
diff --git a/paddle/fluid/pybind/gloo_context_py.h b/paddle/fluid/pybind/gloo_context_py.h
index 89bd183097b75..51f736ed060ce 100644
--- a/paddle/fluid/pybind/gloo_context_py.h
+++ b/paddle/fluid/pybind/gloo_context_py.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <Python.h>
+
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index d24c0355c2493..3de6c64617ddd 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -159,10 +159,9 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
 // only initialize varbase, but not its tensor.
 static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name,
                             bool persistable = false, int stop_gradient = -1) {
-  auto name_ = name == ""
-                   ? imperative::GetCurrentTracer()->GenerateUniqueName(
-                         "generated_tensor")
-                   : name;
+  auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName(
+                                "generated_tensor")
+                          : name;
 
   VLOG(5) << "Init Tensor as: / name: " << name_
           << " / persistable: " << persistable
@@ -274,10 +273,9 @@ static void InitVarBaseFromTensorWithArgDefault(imperative::VarBase *self,
                                                 const std::string &name) {
   VLOG(4) << "Init VarBase";
   auto place = imperative::GetCurrentTracer()->ExpectedPlace();
-  auto name_ = name == ""
-                   ? imperative::GetCurrentTracer()->GenerateUniqueName(
-                         "generated_tensor")
-                   : name;
+  auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName(
+                                "generated_tensor")
+                          : name;
   new (self) imperative::VarBase(name_);
   self->SetPersistable(false);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
@@ -299,10 +297,9 @@ static void InitVarBaseFromTensorWithArg(imperative::VarBase *self,
                                          const P &place,
                                          const std::string &name) {
   VLOG(4) << "Init VarBase";
-  auto name_ = name == ""
-                   ? imperative::GetCurrentTracer()->GenerateUniqueName(
-                         "generated_tensor")
-                   : name;
+  auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName(
+                                "generated_tensor")
+                          : name;
   new (self) imperative::VarBase(name_);
   self->SetPersistable(false);
   self->SetType(framework::proto::VarType::LOD_TENSOR);
@@ -556,38 +553,39 @@ void BindImperative(py::module *m_ptr) {
       },
       py::return_value_policy::take_ownership);
 
-  m.def("_array_to_share_memory_tensor",
-        [](py::object &obj) {
-          // 1. cast to python array
-          auto array = obj.cast<py::array>();
-          PADDLE_ENFORCE_NE(
-              string::Sprintf("%s", array.dtype()).compare("object"), 0,
-              platform::errors::InvalidArgument(
-                  "Faild to convert input data to a regular ndarray.\n  * "
-                  "Usually this means the input data contains nested "
-                  "lists with different lengths.\n  * Check the reader "
-                  "function passed to 'set_(sample/sample_list/batch)"
-                  "_generator' to locate the data causes this issue."));
-          // 2. construcct LoDTensor
-          framework::LoDTensor t;
-          SetTensorFromPyArray<platform::CPUPlace>(&t, array,
-                                                   platform::CPUPlace(), true);
-          // 3. allocate shared memory
-          void *data_ptr = t.data();
-          size_t data_size = t.numel() * framework::DataTypeSize(t.dtype());
-          auto shared_writer_holder =
-              memory::allocation::AllocateMemoryMapWriterAllocation(data_size);
-          // 4. maintain mmap fd set & backup ipc_name
-          const std::string &ipc_name = shared_writer_holder->ipc_name();
-          memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
-          // 5. copy data & reset holder
-          memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(),
-                       platform::CPUPlace(), data_ptr, data_size);
-          t.ResetHolder(shared_writer_holder);
-
-          return t;
-        },
-        py::return_value_policy::take_ownership);
+  m.def(
+      "_array_to_share_memory_tensor",
+      [](py::object &obj) {
+        // 1. cast to python array
+        auto array = obj.cast<py::array>();
+        PADDLE_ENFORCE_NE(
+            string::Sprintf("%s", array.dtype()).compare("object"), 0,
+            platform::errors::InvalidArgument(
+                "Faild to convert input data to a regular ndarray.\n  * "
+                "Usually this means the input data contains nested "
+                "lists with different lengths.\n  * Check the reader "
+                "function passed to 'set_(sample/sample_list/batch)"
+                "_generator' to locate the data causes this issue."));
+        // 2. construct LoDTensor
+        framework::LoDTensor t;
+        SetTensorFromPyArray<platform::CPUPlace>(&t, array,
+                                                 platform::CPUPlace(), true);
+        // 3. allocate shared memory
+        void *data_ptr = t.data();
+        size_t data_size = t.numel() * framework::DataTypeSize(t.dtype());
+        auto shared_writer_holder =
+            memory::allocation::AllocateMemoryMapWriterAllocation(data_size);
+        // 4. maintain mmap fd set & backup ipc_name
+        const std::string &ipc_name = shared_writer_holder->ipc_name();
+        memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
+        // 5. copy data & reset holder
+        memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(),
+                     platform::CPUPlace(), data_ptr, data_size);
+        t.ResetHolder(shared_writer_holder);
+
+        return t;
+      },
+      py::return_value_policy::take_ownership);
 
   m.def("_remove_tensor_list_mmap_fds", [](py::list &tensor_list) {
     for (size_t i = 0; i < tensor_list.size(); ++i) {
@@ -1089,31 +1087,32 @@ void BindImperative(py::module *m_ptr) {
                      self.Name()));
              return var->CurrentInplaceVersion();
            })
-      .def("_bump_inplace_version",
-           [](std::shared_ptr<imperative::VarBase> &self) {
-             // NOTE(liym27): _bump_inplace_version is only used for inplace
-             // operation
-             self->BumpInplaceVersion();
-           },
-           R"DOC(
+      .def(
+          "_bump_inplace_version",
+          [](std::shared_ptr<imperative::VarBase> &self) {
+            // NOTE(liym27): _bump_inplace_version is only used for inplace
+            // operation
+            self->BumpInplaceVersion();
+          },
+          R"DOC(
         **Notes**:
             **This API is ONLY available in Dygraph mode.**
             **This is a very low level API. Users should not use it directly. **
          Bump the version whenever the Tensor is modified through an inplace operation.
             )DOC")
-      .def("numpy",
+      .def(
+          "numpy",
 
-           [](imperative::VarBase &self) -> py::array {
-             const auto &tensor =
-                 self.MutableVar()->Get<framework::LoDTensor>();
-             PADDLE_ENFORCE_EQ(
-                 tensor.IsInitialized(), true,
-                 platform::errors::InvalidArgument(
-                     "Tensor of %s is Empty, please check if it has no data.",
-                     self.Name()));
-             return TensorToPyArray(tensor, true);
-           },
-           R"DOC(
+          [](imperative::VarBase &self) -> py::array {
+            const auto &tensor = self.MutableVar()->Get<framework::LoDTensor>();
+            PADDLE_ENFORCE_EQ(
+                tensor.IsInitialized(), true,
+                platform::errors::InvalidArgument(
+                    "Tensor of %s is Empty, please check if it has no data.",
+                    self.Name()));
+            return TensorToPyArray(tensor, true);
+          },
+          R"DOC(
         Returns a numpy array shows the value of current Tensor.
         
         Returns:
@@ -1133,68 +1132,69 @@ void BindImperative(py::module *m_ptr) {
                 x = linear(data)
                 print(x.numpy())
        )DOC")
-      .def("detach",
-           [](const imperative::VarBase
-                  &self) -> std::shared_ptr<imperative::VarBase> {
-             PADDLE_ENFORCE_EQ(
-                 self.Var().IsInitialized(), true,
-                 platform::errors::InvalidArgument(
-                     "Tensor %s has not been initialized!", self.Name()));
+      .def(
+          "detach",
+          [](const imperative::VarBase &self)
+              -> std::shared_ptr<imperative::VarBase> {
+            PADDLE_ENFORCE_EQ(
+                self.Var().IsInitialized(), true,
+                platform::errors::InvalidArgument(
+                    "Tensor %s has not been initialized!", self.Name()));
 
-             PADDLE_ENFORCE_EQ(
-                 self.Var().IsType<framework::LoDTensor>() ||
-                     self.Var().IsType<phi::SelectedRows>(),
-                 true,
-                 platform::errors::InvalidArgument(
-                     "Type of Tensor[%s] must be LoDTensor or SelectedRows!",
-                     self.Name()));
+            PADDLE_ENFORCE_EQ(
+                self.Var().IsType<framework::LoDTensor>() ||
+                    self.Var().IsType<phi::SelectedRows>(),
+                true,
+                platform::errors::InvalidArgument(
+                    "Type of Tensor[%s] must be LoDTensor or SelectedRows!",
+                    self.Name()));
 
-             auto detach_var = std::make_shared<imperative::VarBase>(
-                 true, "detach_" + self.Name());
+            auto detach_var = std::make_shared<imperative::VarBase>(
+                true, "detach_" + self.Name());
 
-             detach_var->SetPersistable(self.Persistable());
-             detach_var->SetType(self.Type());
-             detach_var->SetDataType(self.DataType());
+            detach_var->SetPersistable(self.Persistable());
+            detach_var->SetType(self.Type());
+            detach_var->SetDataType(self.DataType());
 
-             if (self.Var().IsType<framework::LoDTensor>()) {
-               const auto &origin_tensor =
-                   self.Var().Get<framework::LoDTensor>();
-               PADDLE_ENFORCE_EQ(
-                   origin_tensor.IsInitialized(), true,
-                   platform::errors::InvalidArgument(
-                       "Tensor %s has not been initialized!", self.Name()));
-
-               auto *detach_tensor =
-                   detach_var->MutableVar()->GetMutable<framework::LoDTensor>();
-               detach_tensor->ShareDataWith(origin_tensor);
-               // NOTE(liym27): Call ShareInplaceVersionCounterWith to share the
-               // same TensorInplaceVersion, which is used to check whether
-               // inplace
-               // operations are correct.
-               detach_tensor->ShareInplaceVersionCounterWith(origin_tensor);
-             } else {
-               const auto &origin_selected_rows =
-                   self.Var().Get<phi::SelectedRows>();
-               PADDLE_ENFORCE_EQ(
-                   origin_selected_rows.value().IsInitialized(), true,
-                   platform::errors::InvalidArgument(
-                       "Tensor %s has not been initialized!", self.Name()));
-
-               auto *detach_selected_rows =
-                   detach_var->MutableVar()->GetMutable<phi::SelectedRows>();
-               detach_selected_rows->set_height(origin_selected_rows.height());
-               detach_selected_rows->set_rows(origin_selected_rows.rows());
-               detach_selected_rows->mutable_value()->ShareDataWith(
-                   origin_selected_rows.value());
-               detach_selected_rows->mutable_value()
-                   ->ShareInplaceVersionCounterWith(
-                       origin_selected_rows.value());
-             }
-             VLOG(3) << "The detached Tensor(" << detach_var->Name()
-                     << ") share data with " << self.Name();
-             return detach_var;
-           },
-           py::return_value_policy::take_ownership, R"DOC(
+            if (self.Var().IsType<framework::LoDTensor>()) {
+              const auto &origin_tensor =
+                  self.Var().Get<framework::LoDTensor>();
+              PADDLE_ENFORCE_EQ(
+                  origin_tensor.IsInitialized(), true,
+                  platform::errors::InvalidArgument(
+                      "Tensor %s has not been initialized!", self.Name()));
+
+              auto *detach_tensor =
+                  detach_var->MutableVar()->GetMutable<framework::LoDTensor>();
+              detach_tensor->ShareDataWith(origin_tensor);
+              // NOTE(liym27): Call ShareInplaceVersionCounterWith to share the
+              // same TensorInplaceVersion, which is used to check whether
+              // inplace
+              // operations are correct.
+              detach_tensor->ShareInplaceVersionCounterWith(origin_tensor);
+            } else {
+              const auto &origin_selected_rows =
+                  self.Var().Get<phi::SelectedRows>();
+              PADDLE_ENFORCE_EQ(
+                  origin_selected_rows.value().IsInitialized(), true,
+                  platform::errors::InvalidArgument(
+                      "Tensor %s has not been initialized!", self.Name()));
+
+              auto *detach_selected_rows =
+                  detach_var->MutableVar()->GetMutable<phi::SelectedRows>();
+              detach_selected_rows->set_height(origin_selected_rows.height());
+              detach_selected_rows->set_rows(origin_selected_rows.rows());
+              detach_selected_rows->mutable_value()->ShareDataWith(
+                  origin_selected_rows.value());
+              detach_selected_rows->mutable_value()
+                  ->ShareInplaceVersionCounterWith(
+                      origin_selected_rows.value());
+            }
+            VLOG(3) << "The detached Tensor(" << detach_var->Name()
+                    << ") share data with " << self.Name();
+            return detach_var;
+          },
+          py::return_value_policy::take_ownership, R"DOC(
 
         Returns a new Tensor, detached from the current graph.
         It will share data with origin Tensor and always doesn't have a Tensor copy.
@@ -1256,23 +1256,23 @@ void BindImperative(py::module *m_ptr) {
       .def("_gradient_set_empty", &imperative::VarBase::_GradientSetEmpty,
            py::arg("set_is_empty") = true)
       .def("_is_gradient_set_empty", &imperative::VarBase::_IsGradientSetEmpty)
-      .def("clone",
-           [](std::shared_ptr<imperative::VarBase> &self) {
-             const auto &tensor = self->Var().Get<framework::LoDTensor>();
-             PADDLE_ENFORCE_EQ(
-                 tensor.IsInitialized(), true,
-                 platform::errors::InvalidArgument(
-                     "%s has not been initialized", self->Name()));
-             auto tracer = imperative::GetCurrentTracer();
-             auto new_var = std::make_shared<imperative::VarBase>(
-                 true, tracer->GenerateUniqueName(self->Name() + "_clone"));
-             framework::AttributeMap attrs;
-             imperative::NameVarBaseMap ins = {{"X", {self}}};
-             imperative::NameVarBaseMap outs = {{"Out", {new_var}}};
-             tracer->TraceOp("assign", ins, outs, attrs);
-             return new_var;
-           },
-           py::return_value_policy::copy, R"DOC(
+      .def(
+          "clone",
+          [](std::shared_ptr<imperative::VarBase> &self) {
+            const auto &tensor = self->Var().Get<framework::LoDTensor>();
+            PADDLE_ENFORCE_EQ(tensor.IsInitialized(), true,
+                              platform::errors::InvalidArgument(
+                                  "%s has not been initialized", self->Name()));
+            auto tracer = imperative::GetCurrentTracer();
+            auto new_var = std::make_shared<imperative::VarBase>(
+                true, tracer->GenerateUniqueName(self->Name() + "_clone"));
+            framework::AttributeMap attrs;
+            imperative::NameVarBaseMap ins = {{"X", {self}}};
+            imperative::NameVarBaseMap outs = {{"Out", {new_var}}};
+            tracer->TraceOp("assign", ins, outs, attrs);
+            return new_var;
+          },
+          py::return_value_policy::copy, R"DOC(
 
         Returns a new Tensor, which is clone of origin Tensor, and it remains in the current graph.
         It will always have a Tensor copy.
@@ -1305,11 +1305,12 @@ void BindImperative(py::module *m_ptr) {
               print(x.grad)          # None
        )DOC")
       .def("_grad_name", &imperative::VarBase::GradVarName)
-      .def("_grad_value",
-           [](imperative::VarBase &self) {
-             return self.MutableGradVar()->Get<framework::LoDTensor>();
-           },
-           py::return_value_policy::reference)
+      .def(
+          "_grad_value",
+          [](imperative::VarBase &self) {
+            return self.MutableGradVar()->Get<framework::LoDTensor>();
+          },
+          py::return_value_policy::reference)
       .def("_set_grad_type",
            [](imperative::VarBase &self, framework::proto::VarType::Type type) {
              self.MutableGradVarBase()->SetType(type);
@@ -1337,26 +1338,27 @@ void BindImperative(py::module *m_ptr) {
                }
              }
            })
-      .def("_grad_ivar",
-           [](const imperative::VarBase &self) {
-             auto &grad_var = self.GradVarBase();
-
-             if (grad_var && grad_var->Var().IsInitialized()) {
-               auto *tensor =
-                   grad_var->MutableVar()->IsType<framework::LoDTensor>()
-                       ? grad_var->MutableVar()
-                             ->GetMutable<framework::LoDTensor>()
-                       : grad_var->MutableVar()
-                             ->GetMutable<phi::SelectedRows>()
-                             ->mutable_value();
-
-               if (tensor->IsInitialized()) {
-                 return grad_var;
-               }
-             }
-             return std::shared_ptr<imperative::VarBase>(nullptr);
-           },
-           py::return_value_policy::copy)
+      .def(
+          "_grad_ivar",
+          [](const imperative::VarBase &self) {
+            auto &grad_var = self.GradVarBase();
+
+            if (grad_var && grad_var->Var().IsInitialized()) {
+              auto *tensor =
+                  grad_var->MutableVar()->IsType<framework::LoDTensor>()
+                      ? grad_var->MutableVar()
+                            ->GetMutable<framework::LoDTensor>()
+                      : grad_var->MutableVar()
+                            ->GetMutable<phi::SelectedRows>()
+                            ->mutable_value();
+
+              if (tensor->IsInitialized()) {
+                return grad_var;
+              }
+            }
+            return std::shared_ptr<imperative::VarBase>(nullptr);
+          },
+          py::return_value_policy::copy)
       .def("_set_grad_ivar",
            [](imperative::VarBase &self, imperative::VarBase &grad) {
              self.SetGradVarBase(grad);
@@ -1365,13 +1367,14 @@ void BindImperative(py::module *m_ptr) {
            [](imperative::VarBase &self) {
              return self.Var().IsType<phi::SelectedRows>();
            })
-      .def("_allreduce",
-           [](imperative::VarBase &self,
-              const imperative::ParallelStrategy &strategy) {
-             if (strategy.nranks_ > 1) {
+      .def(
+          "_allreduce",
+          [](imperative::VarBase &self,
+             const imperative::ParallelStrategy &strategy) {
+            if (strategy.nranks_ > 1) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #if NCCL_VERSION_CODE >= 2212
-               imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
+              imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
 #else
                if (!self.Var().IsType<phi::SelectedRows>()) {
                  imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
@@ -1388,9 +1391,9 @@ void BindImperative(py::module *m_ptr) {
                    "Imperative allreduce is not supported when paddle is "
                    "not compiled with NCCL."));
 #endif  // PADDLE_WITH_NCCL or PADDLE_WITH_RCCL
-             }
-           },
-           py::call_guard<py::gil_scoped_release>())
+            }
+          },
+          py::call_guard<py::gil_scoped_release>())
       .def("_register_grad_hook",
            [](imperative::VarBase &self, const py::handle &hook) {
              PADDLE_ENFORCE_EQ(
@@ -1425,22 +1428,23 @@ void BindImperative(py::module *m_ptr) {
                    std::make_shared<std::function<void()>>(py_func));
              }
            })
-      .def("_register_backward_hook",
-           [](imperative::VarBase &self, const py::handle &hook) {
-             PADDLE_ENFORCE_EQ(
-                 self.IsLeaf(), true,
-                 platform::errors::InvalidArgument(
-                     "Only can register backward hook for leaf Tensor."));
-             PADDLE_ENFORCE_EQ(
-                 !self.OverridedStopGradient() && self.HasGradVar(), true,
-                 platform::errors::InvalidArgument(
-                     "Cannot register backward hook on a Tensor that stop "
-                     "gradient or without gradient."));
-             auto py_func = PyObjectCast<std::function<void()>>(hook.ptr());
-             self.GradVarBase()->AddVoidHook(
-                 std::make_shared<std::function<void()>>(py_func));
-           },
-           R"DOC(
+      .def(
+          "_register_backward_hook",
+          [](imperative::VarBase &self, const py::handle &hook) {
+            PADDLE_ENFORCE_EQ(
+                self.IsLeaf(), true,
+                platform::errors::InvalidArgument(
+                    "Only can register backward hook for leaf Tensor."));
+            PADDLE_ENFORCE_EQ(
+                !self.OverridedStopGradient() && self.HasGradVar(), true,
+                platform::errors::InvalidArgument(
+                    "Cannot register backward hook on a Tensor that stop "
+                    "gradient or without gradient."));
+            auto py_func = PyObjectCast<std::function<void()>>(hook.ptr());
+            self.GradVarBase()->AddVoidHook(
+                std::make_shared<std::function<void()>>(py_func));
+          },
+          R"DOC(
              Registers a backward hook for current Tensor.
 
              This hook will be called every time the gradient of current Tensor has been fully calculated.
@@ -1461,17 +1465,18 @@ void BindImperative(py::module *m_ptr) {
              Returns:
                  None
            )DOC")
-      .def("cpu",
-           [](const std::shared_ptr<imperative::VarBase> &self) {
-             if (platform::is_cpu_place(self->Place())) {
-               return self;
-             } else {
-               auto new_var = self->NewVarBase(platform::CPUPlace(), true);
-               new_var->SetOverridedStopGradient(self->OverridedStopGradient());
-               return new_var;
-             }
-           },
-           R"DOC(
+      .def(
+          "cpu",
+          [](const std::shared_ptr<imperative::VarBase> &self) {
+            if (platform::is_cpu_place(self->Place())) {
+              return self;
+            } else {
+              auto new_var = self->NewVarBase(platform::CPUPlace(), true);
+              new_var->SetOverridedStopGradient(self->OverridedStopGradient());
+              return new_var;
+            }
+          },
+          R"DOC(
         Returns a copy of this Tensor in CPU memory.
 
         If this Tensor is already in CPU memory, then no copy is performed and the original Tensor is returned.
@@ -1487,24 +1492,25 @@ void BindImperative(py::module *m_ptr) {
               print(y.place)    # CPUPlace
 
               )DOC")
-      .def("pin_memory",
-           [](const std::shared_ptr<imperative::VarBase> &self) {
+      .def(
+          "pin_memory",
+          [](const std::shared_ptr<imperative::VarBase> &self) {
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
-             PADDLE_THROW(platform::errors::PermissionDenied(
-                 "Cannot copy this Tensor to pinned memory in CPU version "
-                 "Paddle, "
-                 "Please recompile or reinstall Paddle with CUDA support."));
+            PADDLE_THROW(platform::errors::PermissionDenied(
+                "Cannot copy this Tensor to pinned memory in CPU version "
+                "Paddle, "
+                "Please recompile or reinstall Paddle with CUDA support."));
 #endif
-             if (platform::is_cuda_pinned_place(self->Place())) {
-               return self;
-             } else {
-               auto new_var =
-                   self->NewVarBase(platform::CUDAPinnedPlace(), true);
-               new_var->SetOverridedStopGradient(self->OverridedStopGradient());
-               return new_var;
-             }
-           },
-           R"DOC(
+            if (platform::is_cuda_pinned_place(self->Place())) {
+              return self;
+            } else {
+              auto new_var =
+                  self->NewVarBase(platform::CUDAPinnedPlace(), true);
+              new_var->SetOverridedStopGradient(self->OverridedStopGradient());
+              return new_var;
+            }
+          },
+          R"DOC(
         Returns a copy of this Tensor in pin memory.
 
         If this Tensor is already in pin memory, then no copy is performed and the original Tensor is returned.
@@ -1520,13 +1526,14 @@ void BindImperative(py::module *m_ptr) {
               print(y.place)      # CUDAPinnedPlace
 
       )DOC")
-      .def("cuda",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              py::handle &handle, bool blocking) {
+      .def(
+          "cuda",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             py::handle &handle, bool blocking) {
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
-             PADDLE_THROW(platform::errors::PermissionDenied(
-                 "Cannot copy this Tensor to GPU in CPU version Paddle, "
-                 "Please recompile or reinstall Paddle with CUDA support."));
+            PADDLE_THROW(platform::errors::PermissionDenied(
+                "Cannot copy this Tensor to GPU in CPU version Paddle, "
+                "Please recompile or reinstall Paddle with CUDA support."));
 #else
              int device_count = platform::GetGPUDeviceCount();
              int device_id = 0;
@@ -1563,8 +1570,8 @@ void BindImperative(py::module *m_ptr) {
                return new_var;
              }
 #endif
-           },
-           py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC(
+          },
+          py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC(
         Returns a copy of this Tensor in GPU memory.
 
         If this Tensor is already in GPU memory and device_id is default, 
@@ -1592,49 +1599,51 @@ void BindImperative(py::module *m_ptr) {
               y = x.cuda(1)
               print(y.place)        # CUDAPlace(1)
        )DOC")
-      .def("_share_memory",
-           [](const std::shared_ptr<imperative::VarBase> &self) {
+      .def(
+          "_share_memory",
+          [](const std::shared_ptr<imperative::VarBase> &self) {
 #ifndef _WIN32
-             PADDLE_ENFORCE_EQ(
-                 platform::is_cpu_place(self->Place()), true,
-                 platform::errors::InvalidArgument(
-                     "Sharing memory only support CPU Tensor currently"));
-             // 1. get LoDTensor
-             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
-             // 2. allocate shared memory
-             void *data_ptr = t->data();
-             size_t data_size =
-                 t->numel() * framework::SizeOfType(
-                                  framework::TransToProtoVarType(t->dtype()));
-             auto shared_writer_holder =
-                 memory::allocation::AllocateMemoryMapWriterAllocation(
-                     data_size);
-             // 3. maintain mmap fd set & backup ipc_name
-             const std::string &ipc_name = shared_writer_holder->ipc_name();
-             memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
-             // 4. copy data & reset holder
-             memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(),
-                          platform::CPUPlace(), data_ptr, data_size);
-             t->ResetHolder(shared_writer_holder);
-             return *t;
+            PADDLE_ENFORCE_EQ(
+                platform::is_cpu_place(self->Place()), true,
+                platform::errors::InvalidArgument(
+                    "Sharing memory only support CPU Tensor currently"));
+            // 1. get LoDTensor
+            auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+            // 2. allocate shared memory
+            void *data_ptr = t->data();
+            size_t data_size =
+                t->numel() * framework::SizeOfType(
+                                 framework::TransToProtoVarType(t->dtype()));
+            auto shared_writer_holder =
+                memory::allocation::AllocateMemoryMapWriterAllocation(
+                    data_size);
+            // 3. maintain mmap fd set & backup ipc_name
+            const std::string &ipc_name = shared_writer_holder->ipc_name();
+            memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
+            // 4. copy data & reset holder
+            memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(),
+                         platform::CPUPlace(), data_ptr, data_size);
+            t->ResetHolder(shared_writer_holder);
+            return *t;
 #else
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Sharing memory in Windows OS is not supported currently"));
 #endif
-           },
-           py::return_value_policy::reference)
+          },
+          py::return_value_policy::reference)
 #if defined(PADDLE_WITH_CUDA)
-      .def("_uva",
-           [](const std::shared_ptr<imperative::VarBase> &self, int device_id) {
-             PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true,
-                               platform::errors::InvalidArgument(
-                                   "Unified virtual addressing only support "
-                                   "CPU Tensor currently."));
-             auto *self_tensor =
-                 self->MutableVar()->GetMutable<framework::LoDTensor>();
-             tensor_uva(self_tensor, device_id);
-           },
-           py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC(
+      .def(
+          "_uva",
+          [](const std::shared_ptr<imperative::VarBase> &self, int device_id) {
+            PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true,
+                              platform::errors::InvalidArgument(
+                                  "Unified virtual addressing only support "
+                                  "CPU Tensor currently."));
+            auto *self_tensor =
+                self->MutableVar()->GetMutable<framework::LoDTensor>();
+            tensor_uva(self_tensor, device_id);
+          },
+          py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC(
         Returns self tensor with the UVA(unified virtual addressing).
 
         Args:
@@ -1651,86 +1660,94 @@ void BindImperative(py::module *m_ptr) {
        )DOC")
 #endif
       .def("copy_", &imperative::VarBase::CopyFrom)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::CPUPlace &place, bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to
-             // copy data from the tensor of self to the tensor of new varbase,
-             // we need to ensure that the varbase self is not destructed until
-             // the GpuCopyAsync is completed. Otherwise, the memory may be
-             // freed
-             // when varbase self is destructed.
-             // To do that, we increase the reference count of self by 1 and
-             // add a cuda event to wait the GpuCopyAsync's completion.
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::CUDAPinnedPlace &place, bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::XPUPlace &place, bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::CUDAPlace &place, bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::NPUPlace &place, bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::MLUPlace &place, bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::Place &place, bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("value", [](imperative::VarBase &self) { return self.MutableVar(); },
-           py::return_value_policy::reference)
+      .def(
+          "_copy_to",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             const platform::CPUPlace &place, bool blocking) {
+            auto new_var = self->NewVarBase(place, blocking);
+            // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to
+            // copy data from the tensor of self to the tensor of new varbase,
+            // we need to ensure that the varbase self is not destructed until
+            // the GpuCopyAsync is completed. Otherwise, the memory may be
+            // freed
+            // when varbase self is destructed.
+            // To do that, we increase the reference count of self by 1 and
+            // add a cuda event to wait the GpuCopyAsync's completion.
+            if (!blocking) {
+              IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+            }
+            return new_var;
+          },
+          py::return_value_policy::copy)
+      .def(
+          "_copy_to",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             const platform::CUDAPinnedPlace &place, bool blocking) {
+            auto new_var = self->NewVarBase(place, blocking);
+            if (!blocking) {
+              IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+            }
+            return new_var;
+          },
+          py::return_value_policy::copy)
+      .def(
+          "_copy_to",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             const platform::XPUPlace &place, bool blocking) {
+            auto new_var = self->NewVarBase(place, blocking);
+            if (!blocking) {
+              IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+            }
+            return new_var;
+          },
+          py::return_value_policy::copy)
+      .def(
+          "_copy_to",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             const platform::CUDAPlace &place, bool blocking) {
+            auto new_var = self->NewVarBase(place, blocking);
+            if (!blocking) {
+              IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+            }
+            return new_var;
+          },
+          py::return_value_policy::copy)
+      .def(
+          "_copy_to",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             const platform::NPUPlace &place, bool blocking) {
+            auto new_var = self->NewVarBase(place, blocking);
+            if (!blocking) {
+              IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+            }
+            return new_var;
+          },
+          py::return_value_policy::copy)
+      .def(
+          "_copy_to",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             const platform::MLUPlace &place, bool blocking) {
+            auto new_var = self->NewVarBase(place, blocking);
+            if (!blocking) {
+              IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+            }
+            return new_var;
+          },
+          py::return_value_policy::copy)
+      .def(
+          "_copy_to",
+          [](const std::shared_ptr<imperative::VarBase> &self,
+             const platform::Place &place, bool blocking) {
+            auto new_var = self->NewVarBase(place, blocking);
+            if (!blocking) {
+              IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+            }
+            return new_var;
+          },
+          py::return_value_policy::copy)
+      .def(
+          "value", [](imperative::VarBase &self) { return self.MutableVar(); },
+          py::return_value_policy::reference)
       .def("_clear",
            [](const std::shared_ptr<imperative::VarBase> &self) {
              auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
@@ -1842,39 +1859,28 @@ void BindImperative(py::module *m_ptr) {
                     &imperative::VarBase::SetOverridedStopGradient)
       .def_property("persistable", &imperative::VarBase::Persistable,
                     &imperative::VarBase::SetPersistable)
-      .def_property_readonly("shape",
-                             [](imperative::VarBase &self) {
-                               if (self.Var().IsType<framework::LoDTensor>()) {
-                                 return phi::vectorize<int>(
-                                     self.Var()
-                                         .Get<framework::LoDTensor>()
-                                         .dims());
-                               } else if (self.Var()
-                                              .IsType<phi::SelectedRows>()) {
-                                 return phi::vectorize<int>(
-                                     self.Var()
-                                         .Get<phi::SelectedRows>()
-                                         .value()
-                                         .dims());
-                               } else if (self.Var()
-                                              .IsType<framework::Strings>()) {
-                                 return std::vector<int>{static_cast<int>(
-                                     self.Var()
-                                         .Get<framework::Strings>()
-                                         .size())};
-                               } else if (self.Var()
-                                              .IsType<framework::Vocab>()) {
-                                 return std::vector<int>{static_cast<int>(
-                                     self.Var()
-                                         .Get<framework::Vocab>()
-                                         .size())};
-                               } else {
-                                 VLOG(2) << "It is meaningless to get shape of "
-                                            "variable type "
-                                         << GetTypeName(self);
-                                 return std::vector<int>();
-                               }
-                             })
+      .def_property_readonly(
+          "shape",
+          [](imperative::VarBase &self) {
+            if (self.Var().IsType<framework::LoDTensor>()) {
+              return phi::vectorize<int>(
+                  self.Var().Get<framework::LoDTensor>().dims());
+            } else if (self.Var().IsType<phi::SelectedRows>()) {
+              return phi::vectorize<int>(
+                  self.Var().Get<phi::SelectedRows>().value().dims());
+            } else if (self.Var().IsType<framework::Strings>()) {
+              return std::vector<int>{static_cast<int>(
+                  self.Var().Get<framework::Strings>().size())};
+            } else if (self.Var().IsType<framework::Vocab>()) {
+              return std::vector<int>{
+                  static_cast<int>(self.Var().Get<framework::Vocab>().size())};
+            } else {
+              VLOG(2) << "It is meaningless to get shape of "
+                         "variable type "
+                      << GetTypeName(self);
+              return std::vector<int>();
+            }
+          })
       .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf,
                              R"DOC(
       Whether a Tensor is leaf Tensor.
@@ -2157,13 +2163,14 @@ void BindImperative(py::module *m_ptr) {
           [](imperative::ParallelStrategy &self, int nranks) {
             self.nranks_ = nranks;
           })
-      .def_property("local_rank",
-                    [](const imperative::ParallelStrategy &self) {
-                      return self.local_rank_;
-                    },
-                    [](imperative::ParallelStrategy &self, int local_rank) {
-                      self.local_rank_ = local_rank;
-                    })
+      .def_property(
+          "local_rank",
+          [](const imperative::ParallelStrategy &self) {
+            return self.local_rank_;
+          },
+          [](imperative::ParallelStrategy &self, int local_rank) {
+            self.local_rank_ = local_rank;
+          })
       .def_property(
           "trainer_endpoints",
           [](const imperative::ParallelStrategy &self) {
@@ -2172,12 +2179,14 @@ void BindImperative(py::module *m_ptr) {
           [](imperative::ParallelStrategy &self, std::vector<std::string> eps) {
             self.trainer_endpoints_ = eps;
           })
-      .def_property("current_endpoint",
-                    [](const imperative::ParallelStrategy &self) {
-                      return self.current_endpoint_;
-                    },
-                    [](imperative::ParallelStrategy &self,
-                       const std::string &ep) { self.current_endpoint_ = ep; })
+      .def_property(
+          "current_endpoint",
+          [](const imperative::ParallelStrategy &self) {
+            return self.current_endpoint_;
+          },
+          [](imperative::ParallelStrategy &self, const std::string &ep) {
+            self.current_endpoint_ = ep;
+          })
       .def_property(
           "nrings",
           [](const imperative::ParallelStrategy &self) { return self.nrings_; },
@@ -2224,9 +2233,9 @@ void BindImperative(py::module *m_ptr) {
       },
       py::call_guard<py::gil_scoped_release>());
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||     \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_CNCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||          \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
   py::class_<imperative::ParallelContext,
              std::shared_ptr<imperative::ParallelContext>>(m,
                                                            "ParallelContext");
@@ -2359,43 +2368,44 @@ void BindImperative(py::module *m_ptr) {
         });
 
 #if defined(PADDLE_WITH_CUDA)
-  m.def("to_uva_tensor",
-        [](const py::object &obj, int device_id) {
-          const auto &tracer = imperative::GetCurrentTracer();
-          auto new_tensor = std::shared_ptr<imperative::VarBase>(
-              new imperative::VarBase(tracer->GenerateUniqueName()));
-          auto array = obj.cast<py::array>();
-          if (py::isinstance<py::array_t<int32_t>>(array)) {
-            SetUVATensorFromPyArray<int32_t>(new_tensor, array, device_id);
-          } else if (py::isinstance<py::array_t<int64_t>>(array)) {
-            SetUVATensorFromPyArray<int64_t>(new_tensor, array, device_id);
-          } else if (py::isinstance<py::array_t<float>>(array)) {
-            SetUVATensorFromPyArray<float>(new_tensor, array, device_id);
-          } else if (py::isinstance<py::array_t<double>>(array)) {
-            SetUVATensorFromPyArray<double>(new_tensor, array, device_id);
-          } else if (py::isinstance<py::array_t<int8_t>>(array)) {
-            SetUVATensorFromPyArray<int8_t>(new_tensor, array, device_id);
-          } else if (py::isinstance<py::array_t<int16_t>>(array)) {
-            SetUVATensorFromPyArray<int16_t>(new_tensor, array, device_id);
-          } else if (py::isinstance<py::array_t<paddle::platform::float16>>(
-                         array)) {
-            SetUVATensorFromPyArray<paddle::platform::float16>(
-                new_tensor, array, device_id);
-          } else if (py::isinstance<py::array_t<bool>>(array)) {
-            SetUVATensorFromPyArray<bool>(new_tensor, array, device_id);
-          } else {
-            // obj may be any type, obj.cast<py::array>() may be failed,
-            // then the array.dtype will be string of unknown meaning.
-            PADDLE_THROW(platform::errors::InvalidArgument(
-                "Input object type error or incompatible array data type. "
-                "tensor.set() supports array with bool, float16, float32, "
-                "float64, int8, int16, int32, int64,"
-                "please check your input or input array data type."));
-          }
-          return new_tensor;
-        },
-        py::arg("obj"), py::arg("device_id") = 0,
-        py::return_value_policy::reference, R"DOC(
+  m.def(
+      "to_uva_tensor",
+      [](const py::object &obj, int device_id) {
+        const auto &tracer = imperative::GetCurrentTracer();
+        auto new_tensor = std::shared_ptr<imperative::VarBase>(
+            new imperative::VarBase(tracer->GenerateUniqueName()));
+        auto array = obj.cast<py::array>();
+        if (py::isinstance<py::array_t<int32_t>>(array)) {
+          SetUVATensorFromPyArray<int32_t>(new_tensor, array, device_id);
+        } else if (py::isinstance<py::array_t<int64_t>>(array)) {
+          SetUVATensorFromPyArray<int64_t>(new_tensor, array, device_id);
+        } else if (py::isinstance<py::array_t<float>>(array)) {
+          SetUVATensorFromPyArray<float>(new_tensor, array, device_id);
+        } else if (py::isinstance<py::array_t<double>>(array)) {
+          SetUVATensorFromPyArray<double>(new_tensor, array, device_id);
+        } else if (py::isinstance<py::array_t<int8_t>>(array)) {
+          SetUVATensorFromPyArray<int8_t>(new_tensor, array, device_id);
+        } else if (py::isinstance<py::array_t<int16_t>>(array)) {
+          SetUVATensorFromPyArray<int16_t>(new_tensor, array, device_id);
+        } else if (py::isinstance<py::array_t<paddle::platform::float16>>(
+                       array)) {
+          SetUVATensorFromPyArray<paddle::platform::float16>(new_tensor, array,
+                                                             device_id);
+        } else if (py::isinstance<py::array_t<bool>>(array)) {
+          SetUVATensorFromPyArray<bool>(new_tensor, array, device_id);
+        } else {
+          // obj may be any type, obj.cast<py::array>() may be failed,
+          // then the array.dtype will be string of unknown meaning.
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Input object type error or incompatible array data type. "
+              "tensor.set() supports array with bool, float16, float32, "
+              "float64, int8, int16, int32, int64,"
+              "please check your input or input array data type."));
+        }
+        return new_tensor;
+      },
+      py::arg("obj"), py::arg("device_id") = 0,
+      py::return_value_policy::reference, R"DOC(
   Returns tensor with the UVA(unified virtual addressing) created from numpy array.
 
   Args:
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
index 0e3e98512d60f..91b9294421529 100644
--- a/paddle/fluid/pybind/imperative.h
+++ b/paddle/fluid/pybind/imperative.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <Python.h>
+
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 944781484076b..d6ffbf010016a 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/inference_api.h"
+
 #include <pybind11/numpy.h>
 #include <pybind11/stl.h>
+
 #include <cstring>
 #include <functional>
 #include <iostream>
@@ -26,6 +28,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_infer_contrib.h"
@@ -75,8 +78,8 @@ using paddle::AnalysisPredictor;
 using paddle::NativeConfig;
 using paddle::NativePaddlePredictor;
 using paddle::PaddleBuf;
-using paddle::PaddleDType;
 using paddle::PaddleDataLayout;
+using paddle::PaddleDType;
 using paddle::PaddlePassBuilder;
 using paddle::PaddlePlace;
 using paddle::PaddlePredictor;
@@ -379,13 +382,13 @@ void BindInferenceApi(py::module *m) {
          &paddle::CreatePaddlePredictor<AnalysisConfig>, py::arg("config"));
   m->def("create_paddle_predictor",
          &paddle::CreatePaddlePredictor<NativeConfig>, py::arg("config"));
-  m->def("create_predictor", [](const paddle_infer::Config &config)
-                                 -> std::unique_ptr<paddle_infer::Predictor> {
-                                   auto pred =
-                                       std::unique_ptr<paddle_infer::Predictor>(
-                                           new paddle_infer::Predictor(config));
-                                   return pred;
-                                 });
+  m->def("create_predictor",
+         [](const paddle_infer::Config &config)
+             -> std::unique_ptr<paddle_infer::Predictor> {
+           auto pred = std::unique_ptr<paddle_infer::Predictor>(
+               new paddle_infer::Predictor(config));
+           return pred;
+         });
   m->def("copy_tensor", &CopyPaddleInferTensor);
   m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
   m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes);
@@ -578,11 +581,11 @@ void BindAnalysisConfig(py::module *m) {
       .def(py::init<const std::string &>())
       .def(py::init<const std::string &, const std::string &>())
       .def("summary", &AnalysisConfig::Summary)
-      .def("set_model", (void (AnalysisConfig::*)(const std::string &)) &
-                            AnalysisConfig::SetModel)
-      .def("set_model", (void (AnalysisConfig::*)(const std::string &,
-                                                  const std::string &)) &
+      .def("set_model", (void(AnalysisConfig::*)(const std::string &)) &
                             AnalysisConfig::SetModel)
+      .def("set_model",
+           (void(AnalysisConfig::*)(const std::string &, const std::string &)) &
+               AnalysisConfig::SetModel)
       .def("set_prog_file", &AnalysisConfig::SetProgFile)
       .def("set_params_file", &AnalysisConfig::SetParamsFile)
       .def("model_dir", &AnalysisConfig::model_dir)
@@ -657,8 +660,9 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("disable_trt_plugin_fp16") = false)
       .def("tensorrt_dynamic_shape_enabled",
            &AnalysisConfig::tensorrt_dynamic_shape_enabled)
-      .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS)
-      .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
+      .def("enable_tensorrt_varseqlen", &AnalysisConfig::EnableVarseqlen)
+      .def("tensorrt_varseqlen_enabled",
+           &AnalysisConfig::tensorrt_varseqlen_enabled)
       .def("collect_shape_range_info", &AnalysisConfig::CollectShapeRangeInfo)
       .def("shape_range_info_path", &AnalysisConfig::shape_range_info_path)
       .def("shape_range_info_collected",
@@ -715,11 +719,12 @@ void BindAnalysisConfig(py::module *m) {
            [](AnalysisConfig &self, const std::string &pass) {
              self.pass_builder()->DeletePass(pass);
            })
-      .def("pass_builder",
-           [](AnalysisConfig &self) {
-             return dynamic_cast<PaddlePassBuilder *>(self.pass_builder());
-           },
-           py::return_value_policy::reference)
+      .def(
+          "pass_builder",
+          [](AnalysisConfig &self) {
+            return dynamic_cast<PaddlePassBuilder *>(self.pass_builder());
+          },
+          py::return_value_policy::reference)
       .def("nnadapter", &AnalysisConfig::NNAdapter)
       .def("set_dist_config", &AnalysisConfig::SetDistConfig)
       .def("dist_config", &AnalysisConfig::dist_config);
diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc
index a7222abf45c50..c880696242126 100644
--- a/paddle/fluid/pybind/io.cc
+++ b/paddle/fluid/pybind/io.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/io.h"
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/pybind/io.h b/paddle/fluid/pybind/io.h
index 942c93deccf99..7f10306e919e9 100644
--- a/paddle/fluid/pybind/io.h
+++ b/paddle/fluid/pybind/io.h
@@ -20,6 +20,7 @@ typedef SSIZE_T ssize_t;
 #endif
 
 #include <Python.h>
+
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 
 namespace paddle {
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index ecbacd37d5666..ef005ee8b10fc 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/ir.h"
+
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -31,18 +33,18 @@
 #include "pybind11/stl.h"
 
 namespace py = pybind11;
-using paddle::framework::ir::Graph;
-using paddle::framework::ir::Node;
-using paddle::framework::ir::NodeComp;
-using paddle::framework::ir::GraphSafeRemoveNodes;
-using paddle::framework::ir::HasCircle;
-using paddle::framework::ir::GraphNum;
-using paddle::framework::ir::TopologySortOperations;
-using paddle::framework::ir::BuildOperationAdjList;
 using paddle::framework::OpDesc;
 using paddle::framework::ProgramDesc;
 using paddle::framework::Scope;
 using paddle::framework::VarDesc;
+using paddle::framework::ir::BuildOperationAdjList;
+using paddle::framework::ir::Graph;
+using paddle::framework::ir::GraphNum;
+using paddle::framework::ir::GraphSafeRemoveNodes;
+using paddle::framework::ir::HasCircle;
+using paddle::framework::ir::Node;
+using paddle::framework::ir::NodeComp;
+using paddle::framework::ir::TopologySortOperations;
 using pybind11::return_value_policy;
 
 namespace paddle {
@@ -104,16 +106,18 @@ void BindGraph(py::module *m) {
            })
       .def("erase", &Graph::Erase)
       .def("nodes", &Graph::Nodes, return_value_policy::reference)
-      .def("create_var_node",
-           [](Graph &self, VarDesc &var_desc) {
-             return self.CreateVarNode(&var_desc);
-           },
-           return_value_policy::reference)
-      .def("create_op_node",
-           [](Graph &self, OpDesc &op_desc) {
-             return self.CreateOpNode(&op_desc);
-           },
-           return_value_policy::reference)
+      .def(
+          "create_var_node",
+          [](Graph &self, VarDesc &var_desc) {
+            return self.CreateVarNode(&var_desc);
+          },
+          return_value_policy::reference)
+      .def(
+          "create_op_node",
+          [](Graph &self, OpDesc &op_desc) {
+            return self.CreateOpNode(&op_desc);
+          },
+          return_value_policy::reference)
       .def("create_control_dep_var", &Graph::CreateControlDepVar,
            return_value_policy::reference)
       .def("create_empty_node", &Graph::CreateEmptyNode,
diff --git a/paddle/fluid/pybind/ir.h b/paddle/fluid/pybind/ir.h
index 2cc1459bbe0fe..ad2d6aa11bfef 100644
--- a/paddle/fluid/pybind/ir.h
+++ b/paddle/fluid/pybind/ir.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <pybind11/pybind11.h>
+
 #include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc
index 0e9c08cff2859..a3c6fa14765aa 100644
--- a/paddle/fluid/pybind/op_function_common.cc
+++ b/paddle/fluid/pybind/op_function_common.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/pybind/op_function_common.h"
+
 #include <pybind11/chrono.h>
 #include <pybind11/complex.h>
 #include <pybind11/functional.h>
@@ -28,7 +30,6 @@
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/pybind/imperative.h"
-#include "paddle/fluid/pybind/op_function_common.h"
 
 namespace py = pybind11;
 namespace paddle {
@@ -640,10 +641,11 @@ void CastPyArg2AttrBlock(PyObject* obj,
 void ConstructAttrMapFromPyArgs(
     const std::string& op_type, PyObject* args, ssize_t attr_start,
     ssize_t attr_end, paddle::framework::AttributeMap& attrs) {  // NOLINT
-  PADDLE_ENFORCE_EQ(
-      (attr_end - attr_start) % 2, 0,
-      platform::errors::InvalidArgument(
-          "The number of arguments for attributes should be even."));
+  PADDLE_ENFORCE_EQ((attr_end - attr_start) % 2, 0,
+                    platform::errors::InvalidArgument(
+                        "The number of arguments for attributes should be even "
+                        "but attr_start = %d, attr_end = %d.",
+                        attr_start, attr_end));
 
   auto attr_type_map = &(OpAttrTypeMap::Instance().Map()[op_type]);
 
diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h
index 972e8aafab758..a6fd06f5d7059 100644
--- a/paddle/fluid/pybind/op_function_generator.h
+++ b/paddle/fluid/pybind/op_function_generator.h
@@ -182,7 +182,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
     {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
     {"rnn", {"DropoutState", "Reserve", "Out", "State"}},
-    {"run_program", {"DOut"}},
+    {"run_program", {"DOut", "CUDAGraph"}},
     {"adam",
      {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
       "MasterParamOut"}},
@@ -267,7 +267,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
     {"moving_average_abs_max_scale",
      {"Out", "OutScale", "OutAccum", "OutState"}},
     {"rnn", {"DropoutState"}},
-    {"run_program", {"Out", "DOut", "OutScope"}},
+    {"run_program", {"Out", "DOut", "OutScope", "CUDAGraph"}},
     {"clear_float_status", {"FloatStatusOut"}},
     {"get_float_status", {"FloatStatusOut"}},
     {"assign", {"Out"}},
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 66bf8c95179af..329b3b83337dc 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -76,11 +76,12 @@ void BindProgramDesc(pybind11::module *m) {
                  platform::errors::InvalidArgument(
                      "Failed to parse ProgramDesc from binary string."));
            })
-      .def("_set_version",
-           [](pd::ProgramDesc &self, int64_t version) {
-             return self.SetVersion(version);
-           },
-           pybind11::arg("version") = pd::kCurProgramVersion)
+      .def(
+          "_set_version",
+          [](pd::ProgramDesc &self, int64_t version) {
+            return self.SetVersion(version);
+          },
+          pybind11::arg("version") = pd::kCurProgramVersion)
       .def("_version",
            [](pd::ProgramDesc &self) -> int64_t { return self.Version(); })
       .def("get_op_deps", [](const framework::ProgramDesc &program) {
@@ -113,18 +114,20 @@ void BindBlockDesc(pybind11::module *m) {
       .def("_insert_op", &pd::BlockDesc::InsertOp,
            pybind11::return_value_policy::reference)
       .def("_remove_op", &pd::BlockDesc::RemoveOp)
-      .def("var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.Var(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("has_var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.HasVar(name);
-           },
-           pybind11::return_value_policy::reference)
+      .def(
+          "var",
+          [](pd::BlockDesc &self, pybind11::bytes byte_name) {
+            std::string name = byte_name;
+            return self.Var(name);
+          },
+          pybind11::return_value_policy::reference)
+      .def(
+          "has_var",
+          [](pd::BlockDesc &self, pybind11::bytes byte_name) {
+            std::string name = byte_name;
+            return self.HasVar(name);
+          },
+          pybind11::return_value_policy::reference)
       .def("_rename_var",
            [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
               const pybind11::bytes &byte_name_new) {
@@ -137,24 +140,27 @@ void BindBlockDesc(pybind11::module *m) {
              std::string name = byte_name;
              return self.HasVarRecursive(name);
            })
-      .def("find_var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.FindVar(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("find_var_recursive",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.FindVarRecursive(name);
-           },
-           pybind11::return_value_policy::reference)
-      .def("_remove_var",
-           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
-             std::string name = byte_name;
-             return self.RemoveVar(name);
-           },
-           pybind11::return_value_policy::reference)
+      .def(
+          "find_var",
+          [](pd::BlockDesc &self, pybind11::bytes byte_name) {
+            std::string name = byte_name;
+            return self.FindVar(name);
+          },
+          pybind11::return_value_policy::reference)
+      .def(
+          "find_var_recursive",
+          [](pd::BlockDesc &self, pybind11::bytes byte_name) {
+            std::string name = byte_name;
+            return self.FindVarRecursive(name);
+          },
+          pybind11::return_value_policy::reference)
+      .def(
+          "_remove_var",
+          [](pd::BlockDesc &self, pybind11::bytes byte_name) {
+            std::string name = byte_name;
+            return self.RemoveVar(name);
+          },
+          pybind11::return_value_policy::reference)
       .def("all_vars", &pd::BlockDesc::AllVars,
            pybind11::return_value_policy::reference)
       .def("op_size", &pd::BlockDesc::OpSize)
@@ -258,8 +264,9 @@ void BindOpDesc(pybind11::module *m) {
 
   pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
   op_desc
-      .def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); },
-           pybind11::return_value_policy::reference)
+      .def(
+          "__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); },
+          pybind11::return_value_policy::reference)
       .def("copy_from", &pd::OpDesc::CopyFrom)
       .def("type", &pd::OpDesc::Type)
       .def("set_type", &pd::OpDesc::SetType)
@@ -304,8 +311,9 @@ void BindOpDesc(pybind11::module *m) {
       .def("infer_var_type", &pd::OpDesc::InferVarType)
       .def("set_is_target", &pd::OpDesc::SetIsTarget)
       .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
-      .def("block", [](pd::OpDesc &self) { return self.Block(); },
-           pybind11::return_value_policy::reference)
+      .def(
+          "block", [](pd::OpDesc &self) { return self.Block(); },
+          pybind11::return_value_policy::reference)
       .def("id", &pd::OpDesc::Id)
       .def("original_id", &pd::OpDesc::OriginalId)
       .def("set_original_id", &pd::OpDesc::SetOriginalId)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 0e1271c1fe07f..cba7d03623516 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -604,6 +604,8 @@ PYBIND11_MODULE(core_noavx, m) {
                         place, static_cast<cudaStreamCaptureMode>(mode));
                   })
       .def_static("end_capture", &platform::EndCUDAGraphCapture)
+      .def_static("gen_new_memory_pool_id",
+                  &platform::CUDAGraph::UniqueMemoryPoolID)
       .def("replay", &platform::CUDAGraph::Replay)
       .def("reset", &platform::CUDAGraph::Reset)
       .def("print_to_dot_files", &platform::CUDAGraph::PrintToDotFiles);
@@ -691,56 +693,56 @@ PYBIND11_MODULE(core_noavx, m) {
   m.def("_get_use_default_grad_op_desc_maker_ops",
         [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); });
 
-  m.def("_get_all_register_op_kernels",
-        [](const std::string &lib) {
-          std::unordered_map<std::string, std::vector<std::string>>
-              all_kernels_info;
-          if (lib == "fluid" || lib == "all") {
-            auto &all_kernels =
-                paddle::framework::OperatorWithKernel::AllOpKernels();
-
-            for (auto &kernel_pair : all_kernels) {
-              auto op_type = kernel_pair.first;
-              std::vector<std::string> kernel_types;
-              for (auto &info_pair : kernel_pair.second) {
-                paddle::framework::OpKernelType kernel_type = info_pair.first;
-                kernel_types.emplace_back(
-                    paddle::framework::KernelTypeToString(kernel_type));
-              }
-              all_kernels_info.emplace(op_type, kernel_types);
+  m.def(
+      "_get_all_register_op_kernels",
+      [](const std::string &lib) {
+        std::unordered_map<std::string, std::vector<std::string>>
+            all_kernels_info;
+        if (lib == "fluid" || lib == "all") {
+          auto &all_kernels =
+              paddle::framework::OperatorWithKernel::AllOpKernels();
+
+          for (auto &kernel_pair : all_kernels) {
+            auto op_type = kernel_pair.first;
+            std::vector<std::string> kernel_types;
+            for (auto &info_pair : kernel_pair.second) {
+              paddle::framework::OpKernelType kernel_type = info_pair.first;
+              kernel_types.emplace_back(
+                  paddle::framework::KernelTypeToString(kernel_type));
             }
+            all_kernels_info.emplace(op_type, kernel_types);
           }
-          if (lib == "phi" || lib == "all") {
-            auto phi_kernels = phi::KernelFactory::Instance().kernels();
-            for (auto &kernel_pair : phi_kernels) {
-              auto op_type = phi::TransToFluidOpName(kernel_pair.first);
-              std::vector<std::string> kernel_types;
-              for (auto &info_pair : kernel_pair.second) {
-                framework::OpKernelType kernel_type =
-                    framework::TransPhiKernelKeyToOpKernelType(info_pair.first);
-                auto kernel_type_str =
-                    framework::KernelTypeToString(kernel_type);
-                if (all_kernels_info.count(op_type)) {
-                  if (std::find(all_kernels_info[op_type].begin(),
-                                all_kernels_info[op_type].end(),
-                                kernel_type_str) ==
-                      all_kernels_info[op_type].end()) {
-                    all_kernels_info[op_type].emplace_back(kernel_type_str);
-                  }
-                } else {
-                  kernel_types.emplace_back(kernel_type_str);
+        }
+        if (lib == "phi" || lib == "all") {
+          auto phi_kernels = phi::KernelFactory::Instance().kernels();
+          for (auto &kernel_pair : phi_kernels) {
+            auto op_type = phi::TransToFluidOpName(kernel_pair.first);
+            std::vector<std::string> kernel_types;
+            for (auto &info_pair : kernel_pair.second) {
+              framework::OpKernelType kernel_type =
+                  framework::TransPhiKernelKeyToOpKernelType(info_pair.first);
+              auto kernel_type_str = framework::KernelTypeToString(kernel_type);
+              if (all_kernels_info.count(op_type)) {
+                if (std::find(all_kernels_info[op_type].begin(),
+                              all_kernels_info[op_type].end(),
+                              kernel_type_str) ==
+                    all_kernels_info[op_type].end()) {
+                  all_kernels_info[op_type].emplace_back(kernel_type_str);
                 }
-              }
-              if (!kernel_types.empty()) {
-                all_kernels_info.emplace(op_type, kernel_types);
+              } else {
+                kernel_types.emplace_back(kernel_type_str);
               }
             }
+            if (!kernel_types.empty()) {
+              all_kernels_info.emplace(op_type, kernel_types);
+            }
           }
+        }
 
-          return all_kernels_info;
-        },
-        py::arg("lib") = "all",
-        R"DOC(
+        return all_kernels_info;
+      },
+      py::arg("lib") = "all",
+      R"DOC(
            Return the registered kernels in paddle.
 
            Args:
@@ -1009,9 +1011,10 @@ PYBIND11_MODULE(core_noavx, m) {
                 t.set(np.ndarray([5, 30]), fluid.CPUPlace())
           )DOC")
 
-      .def("shape",
-           [](framework::Tensor &self) { return vectorize(self.dims()); },
-           R"DOC(
+      .def(
+          "shape",
+          [](framework::Tensor &self) { return vectorize(self.dims()); },
+          R"DOC(
            Return the shape of Tensor.
 
            Returns:
@@ -1099,20 +1102,21 @@ PYBIND11_MODULE(core_noavx, m) {
       // avoid misuse.
       // The discussion is here:
       // https://github.com/PaddlePaddle/Paddle/issues/10855
-      .def("set_lod",
-           [](framework::Tensor &self,
-              const std::vector<std::vector<size_t>> &lod) {
-             // the input lod is offset-based level-of-detail info
-             LoD new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_lod, vectorize(self.dims()).front()), true,
-                 platform::errors::InvalidArgument(
-                     "The provided LoD is invalid, the LoD is %s", new_lod));
-             self.set_lod(new_lod);
-           },
-           py::arg("lod"), R"DOC(
+      .def(
+          "set_lod",
+          [](framework::Tensor &self,
+             const std::vector<std::vector<size_t>> &lod) {
+            // the input lod is offset-based level-of-detail info
+            LoD new_lod;
+            new_lod.reserve(lod.size());
+            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+            PADDLE_ENFORCE_EQ(
+                CheckLoD(new_lod, vectorize(self.dims()).front()), true,
+                platform::errors::InvalidArgument(
+                    "The provided LoD is invalid, the LoD is %s", new_lod));
+            self.set_lod(new_lod);
+          },
+          py::arg("lod"), R"DOC(
            Set LoD of the Tensor.
 
            Args:
@@ -1132,28 +1136,29 @@ PYBIND11_MODULE(core_noavx, m) {
                  t.set_lod([[0, 2, 5]])
                  print(t.lod()) # [[0, 2, 5]]
            )DOC")
-      .def("set_recursive_sequence_lengths",
-           [](framework::Tensor &self, const std::vector<std::vector<size_t>>
-                                           &recursive_sequence_lengths) {
-             // the input recursive_sequence_lengths is length-based
-             // level-of-detail info
-             LoD new_lod;
-             new_lod.reserve(recursive_sequence_lengths.size());
-             std::copy(recursive_sequence_lengths.begin(),
-                       recursive_sequence_lengths.end(),
-                       std::back_inserter(new_lod));
-             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
-             PADDLE_ENFORCE_EQ(
-                 CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true,
-                 platform::errors::InvalidArgument(
-                     "The provided recursive_sequence_lengths info is "
-                     "invalid, "
-                     "the LoD converted by recursive_sequence_lengths is "
-                     "%s",
-                     new_lod));
-             self.set_lod(new_offset_lod);
-           },
-           py::arg("recursive_sequence_lengths"), R"DOC(
+      .def(
+          "set_recursive_sequence_lengths",
+          [](framework::Tensor &self, const std::vector<std::vector<size_t>>
+                                          &recursive_sequence_lengths) {
+            // the input recursive_sequence_lengths is length-based
+            // level-of-detail info
+            LoD new_lod;
+            new_lod.reserve(recursive_sequence_lengths.size());
+            std::copy(recursive_sequence_lengths.begin(),
+                      recursive_sequence_lengths.end(),
+                      std::back_inserter(new_lod));
+            LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
+            PADDLE_ENFORCE_EQ(
+                CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true,
+                platform::errors::InvalidArgument(
+                    "The provided recursive_sequence_lengths info is "
+                    "invalid, "
+                    "the LoD converted by recursive_sequence_lengths is "
+                    "%s",
+                    new_lod));
+            self.set_lod(new_offset_lod);
+          },
+          py::arg("recursive_sequence_lengths"), R"DOC(
            Set LoD of the Tensor according to recursive sequence lengths.
 
            For example, if recursive_sequence_lengths=[[2, 3]], which means
@@ -1178,16 +1183,17 @@ PYBIND11_MODULE(core_noavx, m) {
                  print(t.recursive_sequence_lengths())  # [[2, 3]]
                  print(t.lod())  # [[0, 2, 5]]
            )DOC")
-      .def("lod",
-           [](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
-             // output the offset-based lod info
-             LoD lod = self.lod();
-             std::vector<std::vector<size_t>> new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return new_lod;
-           },
-           R"DOC(
+      .def(
+          "lod",
+          [](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
+            // output the offset-based lod info
+            LoD lod = self.lod();
+            std::vector<std::vector<size_t>> new_lod;
+            new_lod.reserve(lod.size());
+            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+            return new_lod;
+          },
+          R"DOC(
            Return the LoD of the Tensor.
 
            Returns:
@@ -1205,16 +1211,17 @@ PYBIND11_MODULE(core_noavx, m) {
                  print(t.lod()) # [[0, 2, 5]]
            )DOC")
       // Set above comments of set_lod.
-      .def("recursive_sequence_lengths",
-           [](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
-             // output the length-based lod info
-             LoD lod = phi::ConvertToLengthBasedLoD(self.lod());
-             std::vector<std::vector<size_t>> new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return new_lod;
-           },
-           R"DOC(
+      .def(
+          "recursive_sequence_lengths",
+          [](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
+            // output the length-based lod info
+            LoD lod = phi::ConvertToLengthBasedLoD(self.lod());
+            std::vector<std::vector<size_t>> new_lod;
+            new_lod.reserve(lod.size());
+            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+            return new_lod;
+          },
+          R"DOC(
            Return the recursive sequence lengths corresponding to of the LodD 
            of the Tensor.
 
@@ -1232,13 +1239,14 @@ PYBIND11_MODULE(core_noavx, m) {
                  t.set_recursive_sequence_lengths([[2, 3]])
                  print(t.recursive_sequence_lengths()) # [[2, 3]]
            )DOC")
-      .def("has_valid_recursive_sequence_lengths",
-           [](framework::Tensor &self) -> bool {
-             // Check that the lod info is valid and match the outermost
-             // dimension of the Tensor data
-             return CheckLoD(self.lod(), vectorize(self.dims()).front());
-           },
-           R"DOC(
+      .def(
+          "has_valid_recursive_sequence_lengths",
+          [](framework::Tensor &self) -> bool {
+            // Check that the lod info is valid and match the outermost
+            // dimension of the Tensor data
+            return CheckLoD(self.lod(), vectorize(self.dims()).front());
+          },
+          R"DOC(
            Check whether the LoD of the Tensor is valid.
 
            Returns:
@@ -1622,9 +1630,10 @@ PYBIND11_MODULE(core_noavx, m) {
               const int64_t &height) {
              new (&instance) phi::SelectedRows(rows, height);
            })
-      .def("get_tensor",
-           [](phi::SelectedRows &self) { return self.mutable_value(); },
-           py::return_value_policy::reference)
+      .def(
+          "get_tensor",
+          [](phi::SelectedRows &self) { return self.mutable_value(); },
+          py::return_value_policy::reference)
       .def("numel",
            [](phi::SelectedRows &self) -> int64_t {
              return self.value().numel();
@@ -1666,11 +1675,12 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("get_float",
            [](const Variable &var) -> float { return var.Get<float>(); })
-      .def("get_tensor",
-           [](Variable &self) -> LoDTensor * {
-             return self.GetMutable<LoDTensor>();
-           },
-           py::return_value_policy::reference)
+      .def(
+          "get_tensor",
+          [](Variable &self) -> LoDTensor * {
+            return self.GetMutable<LoDTensor>();
+          },
+          py::return_value_policy::reference)
       .def("get_bytes",
            [](Variable &self) {
              return py::bytes(*self.GetMutable<std::string>());
@@ -1681,53 +1691,60 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("set_vocab", [](Variable &self,
                            Vocab vocab) { *self.GetMutable<Vocab>() = vocab; })
-      .def("get_string_tensor",
-           [](Variable &self) { return self.GetMutable<Strings>(); },
-           py::return_value_policy::reference)
-      .def("get_map_tensor",
-           [](Variable &self) { return self.GetMutable<Vocab>(); },
-           py::return_value_policy::reference)
-      .def("get_lod_rank_table",
-           [](Variable &self) { return self.GetMutable<LoDRankTable>(); },
-           py::return_value_policy::reference)
-      .def("get_selected_rows",
-           [](Variable &self) -> phi::SelectedRows * {
-             return self.GetMutable<phi::SelectedRows>();
-           },
-           py::return_value_policy::reference)
-      .def("get_lod_tensor_array",
-           [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
-           py::return_value_policy::reference)
-      .def("get_fetch_list",
-           [](Variable &self) { return self.GetMutable<FetchList>(); },
-           py::return_value_policy::reference)
+      .def(
+          "get_string_tensor",
+          [](Variable &self) { return self.GetMutable<Strings>(); },
+          py::return_value_policy::reference)
+      .def(
+          "get_map_tensor",
+          [](Variable &self) { return self.GetMutable<Vocab>(); },
+          py::return_value_policy::reference)
+      .def(
+          "get_lod_rank_table",
+          [](Variable &self) { return self.GetMutable<LoDRankTable>(); },
+          py::return_value_policy::reference)
+      .def(
+          "get_selected_rows",
+          [](Variable &self) -> phi::SelectedRows * {
+            return self.GetMutable<phi::SelectedRows>();
+          },
+          py::return_value_policy::reference)
+      .def(
+          "get_lod_tensor_array",
+          [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
+          py::return_value_policy::reference)
+      .def(
+          "get_fetch_list",
+          [](Variable &self) { return self.GetMutable<FetchList>(); },
+          py::return_value_policy::reference)
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-      .def("get_communicator",
-           [](Variable &self) -> platform::Communicator * {
-             return self.GetMutable<platform::Communicator>();
-           },
-           py::return_value_policy::reference)
+      .def(
+          "get_communicator",
+          [](Variable &self) -> platform::Communicator * {
+            return self.GetMutable<platform::Communicator>();
+          },
+          py::return_value_policy::reference)
 #endif
-      .def("get_reader",
-           [](Variable &self) -> framework::ReaderHolder * {
-             PADDLE_ENFORCE_EQ(
-                 self.IsType<framework::ReaderHolder>(), true,
-                 platform::errors::InvalidArgument(
-                     "The variable is not type of ReaderHolder."));
-             return self.GetMutable<framework::ReaderHolder>();
-           },
-           py::return_value_policy::reference)
-      .def("get_scope",
-           [](Variable &self) -> Scope * {
-             auto scope_vec =
-                 self.GetMutable<std::vector<framework::Scope *>>();
-             PADDLE_ENFORCE_GT(
-                 scope_vec->size(), 0,
-                 platform::errors::InvalidArgument(
-                     "The size of scope_vec should be greater than 0"));
-             return scope_vec->front();
-           },
-           py::return_value_policy::reference)
+      .def(
+          "get_reader",
+          [](Variable &self) -> framework::ReaderHolder * {
+            PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(), true,
+                              platform::errors::InvalidArgument(
+                                  "The variable is not type of ReaderHolder."));
+            return self.GetMutable<framework::ReaderHolder>();
+          },
+          py::return_value_policy::reference)
+      .def(
+          "get_scope",
+          [](Variable &self) -> Scope * {
+            auto scope_vec = self.GetMutable<std::vector<framework::Scope *>>();
+            PADDLE_ENFORCE_GT(
+                scope_vec->size(), 0,
+                platform::errors::InvalidArgument(
+                    "The size of scope_vec should be greater than 0"));
+            return scope_vec->front();
+          },
+          py::return_value_policy::reference)
       .def("set_scope", [](Variable &self, Scope &scope) {
         auto scope_vec = self.GetMutable<std::vector<framework::Scope *>>();
         scope_vec->emplace_back(&scope);
@@ -1760,12 +1777,13 @@ All parameter, weight, gradient are variables in Paddle.
   _Scope
       .def("_remove_from_pool",
            [](Scope &self) { ScopePool::Instance().Remove(&self); })
-      .def("var",
-           [](Scope &self, const std::string &name) -> Variable * {
-             return self.Var(name);
-           },
-           py::arg("name"),
-           R"DOC(
+      .def(
+          "var",
+          [](Scope &self, const std::string &name) -> Variable * {
+            return self.Var(name);
+          },
+          py::arg("name"),
+          R"DOC(
            Find or create variable named :code:`name` in the current scope.
 
            If the variable named :code:`name` does not exist in the
@@ -1778,7 +1796,7 @@ All parameter, weight, gradient are variables in Paddle.
            Returns:
                out (core.Variable): the found or created variable.
            )DOC",
-           py::return_value_policy::reference)
+          py::return_value_policy::reference)
       .def("find_var", &Scope::FindVar, py::arg("name"),
            R"DOC(
            Find variable named :code:`name` in the current scope or
@@ -1804,33 +1822,35 @@ All parameter, weight, gradient are variables in Paddle.
                None
            )DOC",
            py::return_value_policy::reference)
-      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
-           R"DOC(
+      .def(
+          "new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
+          R"DOC(
            Create a new sub-scope of the current scope.
 
            Returns:
                out (core._Scope): the created sub-scope.
            )DOC",
-           py::return_value_policy::reference)
+          py::return_value_policy::reference)
       .def("drop_kids", &Scope::DropKids,
            R"DOC(
            Delete all sub-scopes of the current scope.
            )DOC")
       .def("_kids", &Scope::kids);
 
-  m.def("Scope",
-        []() -> Scope * {
-          auto *s = new Scope();
-          ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
-          return s;
-        },
-        R"DOC(
+  m.def(
+      "Scope",
+      []() -> Scope * {
+        auto *s = new Scope();
+        ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
+        return s;
+      },
+      R"DOC(
         Create a new scope.
 
         Returns:
             out (core._Scope): the created scope.
         )DOC",
-        py::return_value_policy::reference);
+      py::return_value_policy::reference);
 
   //! @note: Be careful! PyBind will return std::string as an unicode, not
   //! Python str. If you want a str object, you should cast them in Python.
@@ -1917,11 +1937,12 @@ All parameter, weight, gradient are variables in Paddle.
     return std::make_tuple(ProgramDesc(pruned_desc),
                            pruned_origin_block_id_map);
   });
-  m.def("prune_backward",
-        [](const framework::ProgramDesc &program) {
-          return PruneBackward(program);
-        },
-        R"DOC(
+  m.def(
+      "prune_backward",
+      [](const framework::ProgramDesc &program) {
+        return PruneBackward(program);
+      },
+      R"DOC(
              Prune the backward part of a program, mostly called in
              program.clone(for_test=True).
               
@@ -2788,8 +2809,8 @@ All parameter, weight, gradient are variables in Paddle.
       .def("outputs",
            [](const OperatorBase &op)
                -> std::map<std::string, std::vector<std::string>> {
-                 return op.Outputs();
-               })
+             return op.Outputs();
+           })
       .def("output_vars",
            [](const OperatorBase &op) { return op.OutputVars(true); })
       .def("inputs", [](const OperatorBase &op) { return op.Inputs(); })
@@ -2804,11 +2825,12 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<framework::TrainerBase, std::shared_ptr<framework::TrainerBase>>(
       m, "TrainerBase")
-      .def("get_worker_scope",
-           [](TrainerBase &self, int thread_id) -> Scope * {
-             return self.GetWorkerScope(thread_id);
-           },
-           py::return_value_policy::reference)
+      .def(
+          "get_worker_scope",
+          [](TrainerBase &self, int thread_id) -> Scope * {
+            return self.GetWorkerScope(thread_id);
+          },
+          py::return_value_policy::reference)
       .def("finalize", &TrainerBase::Finalize)
       .def("ResetDataset", &TrainerBase::ResetDataset);
 
@@ -3008,21 +3030,23 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("device_memory_stat_current_value",
         memory::DeviceMemoryStatCurrentValue);
   m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
-  m.def("run_cmd",
-        [](const std::string &cmd, int time_out = -1,
-           int sleep_inter = -1) -> const std::string {
-          return paddle::framework::shell_get_command_output(cmd, time_out,
-                                                             sleep_inter);
-        },
-        py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1);
-  m.def("shell_execute_cmd",
-        [](const std::string &cmd, int time_out = 0, int sleep_inter = 0,
-           bool redirect_stderr = false) -> std::vector<std::string> {
-          return paddle::framework::shell_execute_cmd(
-              cmd, time_out, sleep_inter, redirect_stderr);
-        },
-        py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
-        py::arg("redirect_stderr") = false);
+  m.def(
+      "run_cmd",
+      [](const std::string &cmd, int time_out = -1,
+         int sleep_inter = -1) -> const std::string {
+        return paddle::framework::shell_get_command_output(cmd, time_out,
+                                                           sleep_inter);
+      },
+      py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1);
+  m.def(
+      "shell_execute_cmd",
+      [](const std::string &cmd, int time_out = 0, int sleep_inter = 0,
+         bool redirect_stderr = false) -> std::vector<std::string> {
+        return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter,
+                                                    redirect_stderr);
+      },
+      py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
+      py::arg("redirect_stderr") = false);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
@@ -3090,9 +3114,10 @@ All parameter, weight, gradient are variables in Paddle.
   pylodtensorarray
       .def("__init__",
            [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); })
-      .def("__getitem__",
-           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
-           py::return_value_policy::reference)
+      .def(
+          "__getitem__",
+          [](LoDTensorArray &self, size_t i) { return &self.at(i); },
+          py::return_value_policy::reference)
       .def("__len__", [](LoDTensorArray &self) { return self.size(); })
       .def("__setitem__",
            [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
@@ -3103,13 +3128,14 @@ All parameter, weight, gradient are variables in Paddle.
              self[i].ShareDataWith(t);
              self[i].set_lod(t.lod());
            })
-      .def("append",
-           [](LoDTensorArray &self, const LoDTensor &t) {
-             self.emplace_back();
-             self.back().ShareDataWith(t);
-             self.back().set_lod(t.lod());
-           },
-           py::arg("tensor"), R"DOC(
+      .def(
+          "append",
+          [](LoDTensorArray &self, const LoDTensor &t) {
+            self.emplace_back();
+            self.back().ShareDataWith(t);
+            self.back().set_lod(t.lod());
+          },
+          py::arg("tensor"), R"DOC(
              Append a LoDensor to LoDTensorArray.
               
              Args:
@@ -3129,89 +3155,94 @@ All parameter, weight, gradient are variables in Paddle.
                    t.set(np.ndarray([5, 30]), fluid.CPUPlace())
                    arr.append(t)
            )DOC")
-      .def("_move_to_list",
-           [](LoDTensorArray &self) -> py::list {
-             py::list res(self.size());
-             for (size_t i = 0; i < self.size(); ++i) {
-               res[i] = py::cast(std::move(self[i]));
-             }
-             self.clear();
-             return res;
-           },
-           py::return_value_policy::take_ownership);
+      .def(
+          "_move_to_list",
+          [](LoDTensorArray &self) -> py::list {
+            py::list res(self.size());
+            for (size_t i = 0; i < self.size(); ++i) {
+              res[i] = py::cast(std::move(self[i]));
+            }
+            self.clear();
+            return res;
+          },
+          py::return_value_policy::take_ownership);
 
   py::class_<FetchList>(m, "FetchList", R"DOC( FetchList is a
         vector of boost::variant<LoDTensor, LoDTensorArray>.
         )DOC")
-      .def("_move_to_list",
-           [](FetchList &self) -> py::list {
-             py::list res(self.size());
-             for (size_t i = 0; i < self.size(); ++i) {
-               if (data_is_lod_tensor(self[i])) {
-                 auto &data = BOOST_GET(LoDTensor, self[i]);
-                 res[i] = py::cast(std::move(data));
-               } else {
-                 auto &data = BOOST_GET(LoDTensorArray, self[i]);
-                 py::list tmp(data.size());
-                 for (size_t j = 0; j < data.size(); ++j) {
-                   tmp[j] = py::cast(std::move(data[j]));
-                 }
-                 res[i] = std::move(tmp);
-               }
-             }
-             self.clear();
-             return res;
-           },
-           py::return_value_policy::take_ownership)
+      .def(
+          "_move_to_list",
+          [](FetchList &self) -> py::list {
+            py::list res(self.size());
+            for (size_t i = 0; i < self.size(); ++i) {
+              if (data_is_lod_tensor(self[i])) {
+                auto &data = BOOST_GET(LoDTensor, self[i]);
+                res[i] = py::cast(std::move(data));
+              } else {
+                auto &data = BOOST_GET(LoDTensorArray, self[i]);
+                py::list tmp(data.size());
+                for (size_t j = 0; j < data.size(); ++j) {
+                  tmp[j] = py::cast(std::move(data[j]));
+                }
+                res[i] = std::move(tmp);
+              }
+            }
+            self.clear();
+            return res;
+          },
+          py::return_value_policy::take_ownership)
 
-      .def("append",
-           [](FetchList &self, const LoDTensor &t) {
-             self.emplace_back();
-             auto &lod_tensor = BOOST_GET(LoDTensor, self.back());
-             lod_tensor.ShareDataWith(t);
-             lod_tensor.set_lod(t.lod());
-           },
-           py::arg("var"))
-
-      .def("append",
-           [](FetchList &self, const LoDTensorArray &t) {
-             self.emplace_back();
-             auto &lod_tensor_array = BOOST_GET(LoDTensorArray, self.back());
-             for (size_t i = 0; i < t.size(); ++i) {
-               lod_tensor_array[i].ShareDataWith(t[i]);
-               lod_tensor_array[i].set_lod(t[i].lod());
-             }
-           },
-           py::arg("var"));
+      .def(
+          "append",
+          [](FetchList &self, const LoDTensor &t) {
+            self.emplace_back();
+            auto &lod_tensor = BOOST_GET(LoDTensor, self.back());
+            lod_tensor.ShareDataWith(t);
+            lod_tensor.set_lod(t.lod());
+          },
+          py::arg("var"))
+
+      .def(
+          "append",
+          [](FetchList &self, const LoDTensorArray &t) {
+            self.emplace_back();
+            auto &lod_tensor_array = BOOST_GET(LoDTensorArray, self.back());
+            for (size_t i = 0; i < t.size(); ++i) {
+              lod_tensor_array[i].ShareDataWith(t[i]);
+              lod_tensor_array[i].set_lod(t[i].lod());
+            }
+          },
+          py::arg("var"));
 
   py::class_<FetchUnmergedList>(m, "FetchUnmergedList", R"DOC(
         FetchUnmergedList is 2-D array of FetchType(boost::variant(LoDTensor, LoDTensorArray)).
         )DOC")
-      .def("_move_to_list",
-           [](FetchUnmergedList &self) -> py::list {
-             py::list res(self.size());
-             for (size_t i = 0; i < self.size(); ++i) {
-               py::list tmp(self[i].size());
-               for (size_t j = 0; j < self[i].size(); ++j) {
-                 if (data_is_lod_tensor(self[i][j])) {
-                   auto &var = BOOST_GET(LoDTensor, self[i][j]);
-                   tmp[j] = py::cast(std::move(var));
-                 } else {
-                   auto &var = BOOST_GET(LoDTensorArray, self[i][j]);
-                   py::list tmp_array(var.size());
-                   for (size_t k = 0; k < var.size(); ++k) {
-                     tmp_array[k] = std::move(var[k]);
-                   }
-                   tmp[j] = std::move(tmp_array);
-                 }
-               }
-               res[i] = std::move(tmp);
-               self[i].clear();
-             }
-             self.clear();
-             return res;
-           },
-           py::return_value_policy::take_ownership);
+      .def(
+          "_move_to_list",
+          [](FetchUnmergedList &self) -> py::list {
+            py::list res(self.size());
+            for (size_t i = 0; i < self.size(); ++i) {
+              py::list tmp(self[i].size());
+              for (size_t j = 0; j < self[i].size(); ++j) {
+                if (data_is_lod_tensor(self[i][j])) {
+                  auto &var = BOOST_GET(LoDTensor, self[i][j]);
+                  tmp[j] = py::cast(std::move(var));
+                } else {
+                  auto &var = BOOST_GET(LoDTensorArray, self[i][j]);
+                  py::list tmp_array(var.size());
+                  for (size_t k = 0; k < var.size(); ++k) {
+                    tmp_array[k] = std::move(var[k]);
+                  }
+                  tmp[j] = std::move(tmp_array);
+                }
+              }
+              res[i] = std::move(tmp);
+              self[i].clear();
+            }
+            self.clear();
+            return res;
+          },
+          py::return_value_policy::take_ownership);
 
   m.def("op_support_gpu", OpSupportGPU);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -3225,11 +3256,12 @@ All parameter, weight, gradient are variables in Paddle.
     }
     platform::EmptyCache();
   });
-  m.def("get_device_properties",
-        [](int id) -> const gpuDeviceProp & {
-          return platform::GetDeviceProperties(id);
-        },
-        py::return_value_policy::copy);
+  m.def(
+      "get_device_properties",
+      [](int id) -> const gpuDeviceProp & {
+        return platform::GetDeviceProperties(id);
+      },
+      py::return_value_policy::copy);
 
   py::class_<gpuDeviceProp>(m, "_gpuDeviceProperties")
       .def_property_readonly(
@@ -3407,15 +3439,16 @@ All parameter, weight, gradient are variables in Paddle.
              profiler->Prepare();
            })
       .def("start", &paddle::platform::Profiler::Start)
-      .def("stop",
-           [](paddle::platform::Profiler *profiler) {
-             platform::DisableHostEventRecorder();
-             auto result = profiler->Stop();
-             framework::StaticGraphExecutorPerfStatistics(
-                 result->GetNodeTrees());
-             return result;
-           },
-           py::return_value_policy::automatic_reference);
+      .def(
+          "stop",
+          [](paddle::platform::Profiler *profiler) {
+            platform::DisableHostEventRecorder();
+            auto result = profiler->Stop();
+            framework::StaticGraphExecutorPerfStatistics(
+                result->GetNodeTrees());
+            return result;
+          },
+          py::return_value_policy::automatic_reference);
 
   py::class_<paddle::platform::ProfilerOptions>(m, "ProfilerOptions")
       .def(py::init<>())
@@ -3664,11 +3697,12 @@ All parameter, weight, gradient are variables in Paddle.
           },
           R"DOC(This config that the this is distributed training with parameter server
               )DOC")
-      .def_property("_dry_run",
-                    [](const ExecutionStrategy &self) { return self.dry_run_; },
-                    [](ExecutionStrategy &self, bool dry_run) {
-                      self.dry_run_ = dry_run;
-                    });
+      .def_property(
+          "_dry_run",
+          [](const ExecutionStrategy &self) { return self.dry_run_; },
+          [](ExecutionStrategy &self, bool dry_run) {
+            self.dry_run_ = dry_run;
+          });
 
   exec_strategy.def_property(
       "use_experimental_executor",
@@ -3916,11 +3950,12 @@ All parameter, weight, gradient are variables in Paddle.
              const std::vector<std::string> &trainers_endpoints) {
             self.trainers_endpoints_ = trainers_endpoints;
           })
-      .def_property("trainer_id",
-                    [](const BuildStrategy &self) { return self.trainer_id_; },
-                    [](BuildStrategy &self, int trainer_id) {
-                      self.trainer_id_ = trainer_id;
-                    })
+      .def_property(
+          "trainer_id",
+          [](const BuildStrategy &self) { return self.trainer_id_; },
+          [](BuildStrategy &self, int trainer_id) {
+            self.trainer_id_ = trainer_id;
+          })
       .def_property(
           "nccl_comm_num",
           [](const BuildStrategy &self) { return self.nccl_comm_num_; },
@@ -3933,20 +3968,22 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self, int bkcl_comm_num) {
             self.bkcl_comm_num_ = bkcl_comm_num;
           })
-      .def_property("use_hierarchical_allreduce",
-                    [](const BuildStrategy &self) {
-                      return self.use_hierarchical_allreduce_;
-                    },
-                    [](BuildStrategy &self, bool use) {
-                      self.use_hierarchical_allreduce_ = use;
-                    })
-      .def_property("hierarchical_allreduce_inter_nranks",
-                    [](const BuildStrategy &self) {
-                      return self.hierarchical_allreduce_inter_nranks_;
-                    },
-                    [](BuildStrategy &self, int nranks) {
-                      self.hierarchical_allreduce_inter_nranks_ = nranks;
-                    })
+      .def_property(
+          "use_hierarchical_allreduce",
+          [](const BuildStrategy &self) {
+            return self.use_hierarchical_allreduce_;
+          },
+          [](BuildStrategy &self, bool use) {
+            self.use_hierarchical_allreduce_ = use;
+          })
+      .def_property(
+          "hierarchical_allreduce_inter_nranks",
+          [](const BuildStrategy &self) {
+            return self.hierarchical_allreduce_inter_nranks_;
+          },
+          [](BuildStrategy &self, int nranks) {
+            self.hierarchical_allreduce_inter_nranks_ = nranks;
+          })
 
       .def_property(
           "fuse_elewise_add_act_ops",
@@ -4105,19 +4142,20 @@ All parameter, weight, gradient are variables in Paddle.
                         build_strategy = static.BuildStrategy()
                         build_strategy.fuse_relu_depthwise_conv = True
           )DOC")
-      .def_property("fuse_broadcast_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_broadcast_ops_ == true ||
-                             self.fuse_broadcast_ops_ == paddle::none;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      PADDLE_ENFORCE_NE(self.IsFinalized(), true,
-                                        platform::errors::PreconditionNotMet(
-                                            "BuildStrategy has been finlaized, "
-                                            "cannot be configured again."));
-                      self.fuse_broadcast_ops_ = b;
-                    },
-                    R"DOC((bool, optional): fuse_broadcast_op indicates whether
+      .def_property(
+          "fuse_broadcast_ops",
+          [](const BuildStrategy &self) {
+            return self.fuse_broadcast_ops_ == true ||
+                   self.fuse_broadcast_ops_ == paddle::none;
+          },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_NE(self.IsFinalized(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy has been finlaized, "
+                                  "cannot be configured again."));
+            self.fuse_broadcast_ops_ = b;
+          },
+          R"DOC((bool, optional): fuse_broadcast_op indicates whether
                       to fuse the broadcast ops. Note that, in Reduce mode,
                       fusing broadcast ops may make the program faster. Because
                       fusing broadcast OP equals delaying the execution of all
@@ -4135,18 +4173,19 @@ All parameter, weight, gradient are variables in Paddle.
                               build_strategy = static.BuildStrategy()
                               build_strategy.fuse_broadcast_ops = True
                     )DOC")
-      .def_property("fuse_all_optimizer_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_all_optimizer_ops_ == true ||
-                             self.fuse_all_optimizer_ops_ == paddle::none;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      PADDLE_ENFORCE_NE(self.IsFinalized(), true,
-                                        platform::errors::PreconditionNotMet(
-                                            "BuildStrategy has been finlaized, "
-                                            "cannot be configured again."));
-                      self.fuse_all_optimizer_ops_ = b;
-                    })
+      .def_property(
+          "fuse_all_optimizer_ops",
+          [](const BuildStrategy &self) {
+            return self.fuse_all_optimizer_ops_ == true ||
+                   self.fuse_all_optimizer_ops_ == paddle::none;
+          },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_NE(self.IsFinalized(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy has been finlaized, "
+                                  "cannot be configured again."));
+            self.fuse_all_optimizer_ops_ = b;
+          })
       .def_property(
           "sync_batch_norm",
           [](const BuildStrategy &self) { return self.sync_batch_norm_; },
@@ -4229,9 +4268,10 @@ All parameter, weight, gradient are variables in Paddle.
             self.is_distribution_ = b;
 #endif
           })
-      .def_property("async_mode",
-                    [](const BuildStrategy &self) { return self.async_mode_; },
-                    [](BuildStrategy &self, bool b) { self.async_mode_ = b; })
+      .def_property(
+          "async_mode",
+          [](const BuildStrategy &self) { return self.async_mode_; },
+          [](BuildStrategy &self, bool b) { self.async_mode_ = b; })
       .def_property(
           "enable_inplace",
           [](const BuildStrategy &self) { return self.enable_inplace_; },
@@ -4247,13 +4287,14 @@ All parameter, weight, gradient are variables in Paddle.
                    self.fuse_all_reduce_ops_ == paddle::none;
           },
           [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
-      .def_property("enable_backward_optimizer_op_deps",
-                    [](const BuildStrategy &self) {
-                      return self.enable_backward_optimizer_op_deps_;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      self.enable_backward_optimizer_op_deps_ = b;
-                    })
+      .def_property(
+          "enable_backward_optimizer_op_deps",
+          [](const BuildStrategy &self) {
+            return self.enable_backward_optimizer_op_deps_;
+          },
+          [](BuildStrategy &self, bool b) {
+            self.enable_backward_optimizer_op_deps_ = b;
+          })
       .def_property(
           "cache_runtime_context",
           [](const BuildStrategy &self) { return self.cache_runtime_context_; },
@@ -4273,24 +4314,26 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self, bool fix_op_run_order) {
             self.fix_op_run_order_ = fix_op_run_order;
           })
-      .def_property("allow_cuda_graph_capture",
-                    [](const BuildStrategy &self) {
-                      return self.allow_cuda_graph_capture_;
-                    },
-                    [](BuildStrategy &self, bool allow_cuda_graph_capture) {
-                      self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
-                    })
+      .def_property(
+          "allow_cuda_graph_capture",
+          [](const BuildStrategy &self) {
+            return self.allow_cuda_graph_capture_;
+          },
+          [](BuildStrategy &self, bool allow_cuda_graph_capture) {
+            self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
+          })
       .def("_copy",
            [](const BuildStrategy &self) {
              auto new_bs = self;
              new_bs.ClearFinalized();
              return new_bs;
            })
-      .def("_finalize_strategy_and_create_passes",
-           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
-             return self.CreatePassesFromStrategy(true);
-           },
-           R"DOC(Allow user to customized passes. Normally model-specific
+      .def(
+          "_finalize_strategy_and_create_passes",
+          [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
+            return self.CreatePassesFromStrategy(true);
+          },
+          R"DOC(Allow user to customized passes. Normally model-specific
                 optimization passes should be defined in this way. BuildStrategy
                 cannot be updated after being finalized.)DOC");
 
@@ -4308,11 +4351,12 @@ All parameter, weight, gradient are variables in Paddle.
       // We still cannot get local_scope from this vector, since the element
       // of vec<Scope*> will be freed by Python GC. We can only return Scope*
       // one by one and mark them as reference.
-      .def("local_scopes",
-           [](ParallelExecutor &self) -> std::vector<Scope *> * {
-             return &self.GetLocalScopes();
-           },
-           py::return_value_policy::reference)
+      .def(
+          "local_scopes",
+          [](ParallelExecutor &self) -> std::vector<Scope *> * {
+            return &self.GetLocalScopes();
+          },
+          py::return_value_policy::reference)
       .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes)
       .def("_need_create_local_exe_scopes",
            &ParallelExecutor::NeedCreateLocalExeScope)
@@ -4344,12 +4388,13 @@ All parameter, weight, gradient are variables in Paddle.
              std::unique_ptr<platform::ipu::IpuBackend, py::nodelete>>(
       m, "IpuBackend")
       // manage IpuBackend in C++
-      .def("get_instance",
-           []() {
-             return std::unique_ptr<platform::ipu::IpuBackend, py::nodelete>(
-                 platform::ipu::IpuBackend::GetInstance());
-           },
-           py::return_value_policy::reference)
+      .def(
+          "get_instance",
+          []() {
+            return std::unique_ptr<platform::ipu::IpuBackend, py::nodelete>(
+                platform::ipu::IpuBackend::GetInstance());
+          },
+          py::return_value_policy::reference)
       .def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost)
       .def("detach", &platform::ipu::IpuBackend::Detach)
       .def("reset", &platform::ipu::IpuBackend::Reset)
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index e0aab0dd06ecb..3e779ba41c0eb 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/reader_py.h"
+
 #include <exception>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "Python.h"
 #include "boost/optional.hpp"
 #include "gflags/gflags.h"
@@ -337,32 +339,33 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) {
            py::call_guard<py::gil_scoped_release>())
       .def("read_next_list", &ReaderType::ReadNextList,
            py::call_guard<py::gil_scoped_release>())
-      .def("read_next_var_list",
-           [](ReaderType &self) {
-             auto result_list = self.ReadNextList();
-             auto &tensor_list = result_list[0];
-             std::vector<std::shared_ptr<imperative::VarBase>> var_list;
-             var_list.reserve(tensor_list.size());
-             auto func = [](framework::LoDTensor &lod_tensor) {
-               std::string act_name =
-                   imperative::GetCurrentTracer()->GenerateUniqueName(
-                       "generated_var");
-               auto new_var = std::make_shared<imperative::VarBase>(act_name);
-               new_var->SetPersistable(false);
-               new_var->SetType(framework::proto::VarType::LOD_TENSOR);
-               new_var->SetDataType(
-                   framework::TransToProtoVarType(lod_tensor.dtype()));
-               auto *tensor =
-                   new_var->MutableVar()->GetMutable<framework::LoDTensor>();
-               *tensor = std::move(lod_tensor);
-               return new_var;
-             };
-             for (auto &tensor : tensor_list) {
-               var_list.emplace_back(func(tensor));
-             }
-             return var_list;
-           },
-           py::call_guard<py::gil_scoped_release>())
+      .def(
+          "read_next_var_list",
+          [](ReaderType &self) {
+            auto result_list = self.ReadNextList();
+            auto &tensor_list = result_list[0];
+            std::vector<std::shared_ptr<imperative::VarBase>> var_list;
+            var_list.reserve(tensor_list.size());
+            auto func = [](framework::LoDTensor &lod_tensor) {
+              std::string act_name =
+                  imperative::GetCurrentTracer()->GenerateUniqueName(
+                      "generated_var");
+              auto new_var = std::make_shared<imperative::VarBase>(act_name);
+              new_var->SetPersistable(false);
+              new_var->SetType(framework::proto::VarType::LOD_TENSOR);
+              new_var->SetDataType(
+                  framework::TransToProtoVarType(lod_tensor.dtype()));
+              auto *tensor =
+                  new_var->MutableVar()->GetMutable<framework::LoDTensor>();
+              *tensor = std::move(lod_tensor);
+              return new_var;
+            };
+            for (auto &tensor : tensor_list) {
+              var_list.emplace_back(func(tensor));
+            }
+            return var_list;
+          },
+          py::call_guard<py::gil_scoped_release>())
       .def("reset", &ReaderType::Reset,
            py::call_guard<py::gil_scoped_release>())
       .def("shutdown", &ReaderType::Shutdown,
@@ -372,34 +375,35 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) {
 void BindReader(py::module *module) {
   auto &m = *module;
 
-  m.def("diff_tensor_shape", [](const framework::LoDTensor &tensor,
-                                const framework::VarDesc &var_desc,
-                                size_t num_places) -> py::object {
-    auto diff = DiffTensorShapeWithVarDesc(tensor, var_desc, num_places);
-    if (diff) {
-      return py::cast(std::move(diff.get()));
-    } else {
-      return py::cast(nullptr);
-    }
-  });
-
-  m.def("init_lod_tensor_blocking_queue",
-        [](framework::Variable &var, size_t capacity,
-           bool is_ordered) -> py::object {
-          VLOG(1) << "init_lod_tensor_blocking_queue";
-          if (is_ordered) {
-            auto *holder = var.GetMutable<
-                reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder>();
-            holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
-            return py::cast(holder->GetQueue());
-          } else {
-            auto *holder =
-                var.GetMutable<reader::LoDTensorBlockingQueueHolder>();
-            holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
-            return py::cast(holder->GetQueue());
-          }
-        },
-        py::return_value_policy::copy);
+  m.def(
+      "diff_tensor_shape",
+      [](const framework::LoDTensor &tensor, const framework::VarDesc &var_desc,
+         size_t num_places) -> py::object {
+        auto diff = DiffTensorShapeWithVarDesc(tensor, var_desc, num_places);
+        if (diff) {
+          return py::cast(std::move(diff.get()));
+        } else {
+          return py::cast(nullptr);
+        }
+      });
+
+  m.def(
+      "init_lod_tensor_blocking_queue",
+      [](framework::Variable &var, size_t capacity,
+         bool is_ordered) -> py::object {
+        VLOG(1) << "init_lod_tensor_blocking_queue";
+        if (is_ordered) {
+          auto *holder = var.GetMutable<
+              reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder>();
+          holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
+          return py::cast(holder->GetQueue());
+        } else {
+          auto *holder = var.GetMutable<reader::LoDTensorBlockingQueueHolder>();
+          holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode);
+          return py::cast(holder->GetQueue());
+        }
+      },
+      py::return_value_policy::copy);
 
   py::class_<framework::ReaderHolder>(m, "Reader", "")
       .def("start", &framework::ReaderHolder::Start)
@@ -408,12 +412,13 @@ void BindReader(py::module *module) {
   py::class_<reader::LoDTensorBlockingQueue,
              std::shared_ptr<reader::LoDTensorBlockingQueue>>(
       m, "LoDTensorBlockingQueue", "")
-      .def("push",
-           [](reader::LoDTensorBlockingQueue &self,
-              const std::vector<framework::LoDTensor> &lod_tensor_vec) {
-             return self.Push(lod_tensor_vec);
-           },
-           py::call_guard<py::gil_scoped_release>())
+      .def(
+          "push",
+          [](reader::LoDTensorBlockingQueue &self,
+             const std::vector<framework::LoDTensor> &lod_tensor_vec) {
+            return self.Push(lod_tensor_vec);
+          },
+          py::call_guard<py::gil_scoped_release>())
       .def("size", &reader::LoDTensorBlockingQueue::Size)
       .def("capacity", &reader::LoDTensorBlockingQueue::Cap)
       .def("close", &reader::LoDTensorBlockingQueue::Close)
@@ -424,12 +429,13 @@ void BindReader(py::module *module) {
   py::class_<reader::OrderedMultiDeviceLoDTensorBlockingQueue,
              std::shared_ptr<reader::OrderedMultiDeviceLoDTensorBlockingQueue>>(
       m, "OrderedMultiDeviceLoDTensorBlockingQueue", "")
-      .def("push",
-           [](reader::OrderedMultiDeviceLoDTensorBlockingQueue &self,
-              const std::vector<framework::LoDTensor> &lod_tensor_vec) {
-             return self.Push(lod_tensor_vec);
-           },
-           py::call_guard<py::gil_scoped_release>())
+      .def(
+          "push",
+          [](reader::OrderedMultiDeviceLoDTensorBlockingQueue &self,
+             const std::vector<framework::LoDTensor> &lod_tensor_vec) {
+            return self.Push(lod_tensor_vec);
+          },
+          py::call_guard<py::gil_scoped_release>())
       .def("size", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Size)
       .def("capacity", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Cap)
       .def("close", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Close)
@@ -444,19 +450,20 @@ void BindReader(py::module *module) {
   BindMultiDeviceReader<reader::OrderedMultiDeviceLoDTensorBlockingQueue>(
       module, "OrderedMultiDeviceFeedReader");
 
-  m.def("create_py_reader",
-        [](const std::shared_ptr<reader::LoDTensorBlockingQueue> &queue,
-           const std::vector<std::string> &names,
-           const std::vector<std::vector<int>> &shapes,
-           const std::vector<framework::proto::VarType::Type> &dtypes,
-           const std::vector<bool> &need_check_feed,
-           const std::vector<platform::Place> &dst_places,
-           bool use_double_buffer, bool drop_last, bool pin_memory) {
-          return new MultiDeviceFeedReader<reader::LoDTensorBlockingQueue>(
-              queue, names, shapes, dtypes, need_check_feed, dst_places,
-              use_double_buffer, drop_last, pin_memory);
-        },
-        py::return_value_policy::take_ownership);
+  m.def(
+      "create_py_reader",
+      [](const std::shared_ptr<reader::LoDTensorBlockingQueue> &queue,
+         const std::vector<std::string> &names,
+         const std::vector<std::vector<int>> &shapes,
+         const std::vector<framework::proto::VarType::Type> &dtypes,
+         const std::vector<bool> &need_check_feed,
+         const std::vector<platform::Place> &dst_places, bool use_double_buffer,
+         bool drop_last, bool pin_memory) {
+        return new MultiDeviceFeedReader<reader::LoDTensorBlockingQueue>(
+            queue, names, shapes, dtypes, need_check_feed, dst_places,
+            use_double_buffer, drop_last, pin_memory);
+      },
+      py::return_value_policy::take_ownership);
 
   m.def(
       "create_py_reader",
diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h
index add332abd30ea..109f3e5705b60 100644
--- a/paddle/fluid/pybind/slice_utils.h
+++ b/paddle/fluid/pybind/slice_utils.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <Python.h>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/scope_guard.h"
 #include "paddle/fluid/operators/utils.h"
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 63b36bd917390..ed7ce64032b0e 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -15,12 +15,14 @@ limitations under the License. */
 #pragma once
 
 #include <Python.h>
+
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
diff --git a/paddle/fluid/pybind/uva_utils.h b/paddle/fluid/pybind/uva_utils.h
index 94f55769b7356..3ea3d7ee1a742 100644
--- a/paddle/fluid/pybind/uva_utils.h
+++ b/paddle/fluid/pybind/uva_utils.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <Python.h>
+
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h
index 45fe89e8b5b14..d161b2a912fca 100644
--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
@@ -17,6 +17,6 @@
 #include <sstream>
 #include <string>
 #include <utility>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/utils/string/pretty_log.h"
diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt
index e5f224bf6ad99..3846acbde4819 100644
--- a/paddle/infrt/CMakeLists.txt
+++ b/paddle/infrt/CMakeLists.txt
@@ -1,10 +1,10 @@
-if (NOT WITH_INFRT)
-    return()
+if(NOT WITH_INFRT)
+  return()
 endif()
 
-option(INFRT_WITH_PHI  "Compile INFRT with PHI"    ON)
-option(INFRT_WITH_GPU  "Compile INFRT with GPU"    OFF)
-option(INFRT_WITH_TRT  "Compile INFRT with TensorRT"    OFF)
+option(INFRT_WITH_PHI "Compile INFRT with PHI" ON)
+option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF)
+option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF)
 
 #TODO(xiaowei) remove fluid
 include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
@@ -13,13 +13,13 @@ if(WITH_GPU)
   set(INFRT_WITH_GPU ON)
 endif()
 
-if (INFRT_WITH_PHI)
+if(INFRT_WITH_PHI)
   add_definitions("-DINFRT_WITH_PHI")
 
   # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later.
-  if (INFRT_WITH_GPU)
+  if(INFRT_WITH_GPU)
     add_definitions("-DINFRT_WITH_GPU")
-    if (INFRT_WITH_TRT)
+    if(INFRT_WITH_TRT)
       add_definitions("-DINFRT_WITH_TRT")
     endif()
   endif()
@@ -32,8 +32,8 @@ foreach(flag ${INFRT_FLAGS})
   safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
 endforeach()
 
-set(INFRT_SOURCE_DIR "${PADDLE_SOURCE_DIR}/paddle/infrt" )
-set(INFRT_BINARY_DIR "${PADDLE_BINARY_DIR}/paddle/infrt" )
+set(INFRT_SOURCE_DIR "${PADDLE_SOURCE_DIR}/paddle/infrt")
+set(INFRT_BINARY_DIR "${PADDLE_BINARY_DIR}/paddle/infrt")
 set(INFRT_TEST_TARGETS CACHE INTERNAL "")
 include(infrt_lib)
 
@@ -41,21 +41,29 @@ set(infrt_src CACHE INTERNAL "" FORCE)
 
 # Gather headers for library publish.
 function(core_gather_headers)
-    file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h)
-
-    foreach(header ${includes})
-        set(core_includes "${core_includes};${header}" CACHE INTERNAL "")
-    endforeach()
+  file(
+    GLOB includes
+    LIST_DIRECTORIES false
+    RELATIVE ${CMAKE_SOURCE_DIR}
+    *.h)
+
+  foreach(header ${includes})
+    set(core_includes
+        "${core_includes};${header}"
+        CACHE INTERNAL "")
+  endforeach()
 endfunction()
 
 function(gather_srcs SRC_GROUP)
-    set(options)
-    set(oneValueArgs)
-    set(multiValueArgs "SRCS")
-    cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN})
-    foreach(cpp ${prefix_SRCS})
-        set(${SRC_GROUP} "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}" CACHE INTERNAL "")
-    endforeach()
+  set(options)
+  set(oneValueArgs)
+  set(multiValueArgs "SRCS")
+  cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN})
+  foreach(cpp ${prefix_SRCS})
+    set(${SRC_GROUP}
+        "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}"
+        CACHE INTERNAL "")
+  endforeach()
 endfunction()
 
 # This method is similar to the global cc_test, but discard the huge amount default dependencies those are
@@ -65,28 +73,36 @@ function(cc_test_tiny TARGET_NAME)
     set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
-    cmake_parse_arguments(cc_test_tiny "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(cc_test_tiny "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_tiny_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${cc_test_tiny_DEPS} ${os_dependency_modules} infrt_gtest_main gtest )
-    add_dependencies(${TARGET_NAME} ${cc_test_tiny_DEPS} infrt_gtest_main gtest extern_gtest)
+    target_link_libraries(${TARGET_NAME} ${cc_test_tiny_DEPS}
+                          ${os_dependency_modules} infrt_gtest_main gtest)
+    add_dependencies(${TARGET_NAME} ${cc_test_tiny_DEPS} infrt_gtest_main gtest
+                     extern_gtest)
 
-    add_test(NAME ${TARGET_NAME}
+    add_test(
+      NAME ${TARGET_NAME}
       COMMAND ${TARGET_NAME} "${cc_test_tiny_ARGS}"
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    if (${cc_test_tiny_SERIAL})
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if(${cc_test_tiny_SERIAL})
       set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
-    set(INFRT_TEST_TARGETS ${INFRT_TEST_TARGETS} ${TARGET_NAME} CACHE INTERNAL "")
+    set(INFRT_TEST_TARGETS
+        ${INFRT_TEST_TARGETS} ${TARGET_NAME}
+        CACHE INTERNAL "")
   endif()
 
 endfunction()
 
-if (WITH_TESTING)
-    cc_library(infrt_gtest_main SRCS gtest_main.cc DEPS gtest glog gflags)
+if(WITH_TESTING)
+  cc_library(
+    infrt_gtest_main
+    SRCS gtest_main.cc
+    DEPS gtest glog gflags)
 endif()
 
-
 add_subdirectory(api)
 add_subdirectory(backends)
 add_subdirectory(common)
@@ -99,27 +115,24 @@ add_subdirectory(external_kernels)
 add_subdirectory(paddle)
 add_subdirectory(tests)
 
-
 # MLIR td file generations
-set(infrt_mlir_incs
-        basic_kernels_inc
-        test_kernels_inc
-        tensor_shape_inc
-        dense_tensor_inc
-        pd_extra_ops_inc
-        trt_ops_inc
-        )
-
-if (INFRT_WITH_PHI)
-    set(phi_libs phi)
-    set(infrt_mlir_incs ${infrt_mlir_incs}
-        MLIRinfrt_phi_tensorIncGen
-        MLIRinfrt_phi_baseIncGen
-        )
+set(infrt_mlir_incs basic_kernels_inc test_kernels_inc tensor_shape_inc
+                    dense_tensor_inc pd_extra_ops_inc trt_ops_inc)
+
+if(INFRT_WITH_PHI)
+  set(phi_libs phi)
+  set(infrt_mlir_incs ${infrt_mlir_incs} MLIRinfrt_phi_tensorIncGen
+                      MLIRinfrt_phi_baseIncGen)
 endif()
 
-cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive)
-cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto)
+cc_library(
+  infrt SHARED
+  SRCS ${infrt_src}
+  DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive)
+cc_library(
+  infrt_static
+  SRCS ${infrt_src}
+  DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto)
 add_dependencies(infrt ${infrt_mlir_incs} mlir-headers)
 
 add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS})
diff --git a/paddle/infrt/api/CMakeLists.txt b/paddle/infrt/api/CMakeLists.txt
index 6d4604edee6a0..2d88af7d5b5c8 100644
--- a/paddle/infrt/api/CMakeLists.txt
+++ b/paddle/infrt/api/CMakeLists.txt
@@ -1,9 +1,8 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    infrt_api.cc
-    )
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc.in ${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc)
+gather_srcs(infrt_src SRCS infrt_api.cc)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc)
 
 # Disable temporarily for the external-kernel's mkldnn is outdate
 cc_test_tiny(test_infrt_api SRCS infrt_api_test.cc DEPS infrt ${MLIR_IR_LIBS})
diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc
index f0bf46567a5bf..2f4bbd5df352c 100644
--- a/paddle/infrt/api/infrt_api.cc
+++ b/paddle/infrt/api/infrt_api.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include "paddle/infrt/api/infrt_api.h"
 
 #include <llvm/ADT/SmallVector.h>
@@ -61,6 +62,7 @@
 #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
 #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
 #endif
+// clang-format on
 
 using namespace infrt::host_context;  // NOLINT
 using namespace infrt::tensor;        // NOLINT
diff --git a/paddle/infrt/backends/CMakeLists.txt b/paddle/infrt/backends/CMakeLists.txt
index b639f89292568..606fec5d92dae 100644
--- a/paddle/infrt/backends/CMakeLists.txt
+++ b/paddle/infrt/backends/CMakeLists.txt
@@ -1,3 +1,5 @@
-if (INFRT_WITH_PHI AND WITH_GPU AND WITH_TENSORRT)
+if(INFRT_WITH_PHI
+   AND WITH_GPU
+   AND WITH_TENSORRT)
   add_subdirectory(tensorrt)
 endif()
diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h
index 2af1fab100821..880d1f03d8766 100644
--- a/paddle/infrt/backends/host/phi_context.h
+++ b/paddle/infrt/backends/host/phi_context.h
@@ -35,12 +35,12 @@ class CpuPhiContext : public ::phi::CPUContext {
 class GpuPhiContext : public ::phi::GPUContext {
  public:
   using Base = ::phi::GPUContext;
-  using ::phi::GPUContext::SetStream;
-  using ::phi::GPUContext::SetEigenDevice;
   using ::phi::GPUContext::SetBlasHandle;
   using ::phi::GPUContext::SetDnnHandle;
+  using ::phi::GPUContext::SetEigenDevice;
   using ::phi::GPUContext::SetSolverHandle;
   using ::phi::GPUContext::SetSparseHandle;
+  using ::phi::GPUContext::SetStream;
 };
 
 }  // namespace backends
diff --git a/paddle/infrt/backends/tensorrt/CMakeLists.txt b/paddle/infrt/backends/tensorrt/CMakeLists.txt
index 672515ea4b7f8..9a9db6b737c10 100644
--- a/paddle/infrt/backends/tensorrt/CMakeLists.txt
+++ b/paddle/infrt/backends/tensorrt/CMakeLists.txt
@@ -4,4 +4,11 @@ core_gather_headers()
 
 gather_srcs(infrt_src SRCS trt_engine.cc)
 
-cc_test_tiny(test_infrt_trt SRCS test_trt_engine.cc DEPS infrt phi_dynload_cuda tensorrt_converter)
+cc_test_tiny(
+  test_infrt_trt
+  SRCS
+  test_trt_engine.cc
+  DEPS
+  infrt
+  phi_dynload_cuda
+  tensorrt_converter)
diff --git a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu
index 5a53777c8e30f..f3e2fe35074a6 100644
--- a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu
+++ b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu
@@ -199,8 +199,8 @@ bool PoolPlugin::isOutputBroadcastAcrossBatch(int32_t outputIndex,
   return false;
 }
 
-bool PoolPlugin::canBroadcastInputAcrossBatch(int32_t inputIndex) const
-    noexcept {
+bool PoolPlugin::canBroadcastInputAcrossBatch(
+    int32_t inputIndex) const noexcept {
   return false;
 }
 
diff --git a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h
index 0da1d15845330..34189f95438bf 100644
--- a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h
+++ b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h
@@ -114,10 +114,10 @@ class PoolPlugin : public nvinfer1::IPluginV2IOExt {
   char const* getPluginNamespace() const noexcept override;
 
   // IPluginV2Ext methods
-  nvinfer1::DataType getOutputDataType(int32_t index,
-                                       nvinfer1::DataType const* inputTypes,
-                                       int32_t nbInputs) const
-      noexcept override;
+  nvinfer1::DataType getOutputDataType(
+      int32_t index,
+      nvinfer1::DataType const* inputTypes,
+      int32_t nbInputs) const noexcept override;
   bool isOutputBroadcastAcrossBatch(int32_t outputIndex,
                                     bool const* inputIsBroadcasted,
                                     int32_t nbInputs) const noexcept override;
diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc
index 89dd3b0dc7abf..7e081362f9c62 100644
--- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc
+++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <math.h>
-
 #include <NvInfer.h>
 #include <NvInferRuntime.h>
 #include <NvInferRuntimeCommon.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <math.h>
+
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc
index a2d4954618986..a539078e4af4d 100644
--- a/paddle/infrt/backends/tensorrt/trt_engine.cc
+++ b/paddle/infrt/backends/tensorrt/trt_engine.cc
@@ -18,6 +18,7 @@
 #include <NvInferRuntime.h>
 #include <NvInferRuntimeCommon.h>
 #include <glog/logging.h>
+
 #include "paddle/phi/backends/dynload/tensorrt.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/ddim.h"
diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h
index 41d11a7111709..44f36a84cb5dc 100644
--- a/paddle/infrt/backends/tensorrt/trt_engine.h
+++ b/paddle/infrt/backends/tensorrt/trt_engine.h
@@ -17,6 +17,7 @@
 
 #include <NvInfer.h>
 #include <NvInferRuntime.h>
+
 #include "paddle/infrt/backends/tensorrt/trt_options.h"
 #include "paddle/infrt/backends/tensorrt/trt_utils.h"
 #include "paddle/phi/backends/dynload/tensorrt.h"
diff --git a/paddle/infrt/backends/tensorrt/trt_options.h b/paddle/infrt/backends/tensorrt/trt_options.h
index d5190f5e6220e..b4e36da2058ed 100644
--- a/paddle/infrt/backends/tensorrt/trt_options.h
+++ b/paddle/infrt/backends/tensorrt/trt_options.h
@@ -15,12 +15,12 @@
 
 #pragma once
 
+#include <NvInfer.h>
+
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include <NvInfer.h>
-
 namespace infrt {
 namespace backends {
 namespace tensorrt {
diff --git a/paddle/infrt/common/CMakeLists.txt b/paddle/infrt/common/CMakeLists.txt
index 931e3e42307eb..c77f099aef4a4 100644
--- a/paddle/infrt/common/CMakeLists.txt
+++ b/paddle/infrt/common/CMakeLists.txt
@@ -1,14 +1,17 @@
 core_gather_headers()
-set(core_includes "${core_includes};infrt/common/dtype.def" CACHE INTERNAL "")
+set(core_includes
+    "${core_includes};infrt/common/dtype.def"
+    CACHE INTERNAL "")
 
-gather_srcs(infrt_src SRCS
-    dtype.cc
-    global.cc
-    target.cc
-    type.cc
-    shared.cc
-    object.cc
-    string.cc
-    buffer.cc
-    memory.cc
-    )
+gather_srcs(
+  infrt_src
+  SRCS
+  dtype.cc
+  global.cc
+  target.cc
+  type.cc
+  shared.cc
+  object.cc
+  string.cc
+  buffer.cc
+  memory.cc)
diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h
index e6586cb3a3c60..2d7735d525244 100644
--- a/paddle/infrt/common/global.h
+++ b/paddle/infrt/common/global.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <mlir/IR/MLIRContext.h>
+
 #include "paddle/infrt/tensor/dense_host_tensor.h"
 
 namespace infrt {
diff --git a/paddle/infrt/common/memory.h b/paddle/infrt/common/memory.h
index 678529b8b785c..643b21477615d 100644
--- a/paddle/infrt/common/memory.h
+++ b/paddle/infrt/common/memory.h
@@ -15,9 +15,9 @@
 #pragma once
 
 #include <glog/logging.h>
-#include <unordered_map>
 
 #include <memory>
+#include <unordered_map>
 
 #include "paddle/infrt/common/macros.h"
 #include "paddle/infrt/common/target.h"
diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt
index cf3906c32e559..33206dbd56b6e 100644
--- a/paddle/infrt/dialect/CMakeLists.txt
+++ b/paddle/infrt/dialect/CMakeLists.txt
@@ -1,13 +1,14 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    dialect.cc
-    init_dialects.cc
-    tensor_shape.cc
-    dense_tensor.cc
-    mlir_loader.cc
-    diagnostic_utils.cc
-    )
+gather_srcs(
+  infrt_src
+  SRCS
+  dialect.cc
+  init_dialects.cc
+  tensor_shape.cc
+  dense_tensor.cc
+  mlir_loader.cc
+  diagnostic_utils.cc)
 
 mlir_tablegen_on(tensor_shape DIALECT ts)
 mlir_tablegen_on(dense_tensor DIALECT dt)
@@ -18,12 +19,13 @@ target_link_libraries(infrtopt infrt)
 
 add_executable(print-ir print_ir.cc)
 target_link_libraries(print-ir infrt ${mlir_libs})
-cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt
+             ${MLIR_IR_LIBS})
 
 add_subdirectory(infrt)
 add_subdirectory(pd)
 add_subdirectory(tensorrt)
 
-if (INFRT_WITH_PHI)
-    add_subdirectory(phi)
+if(INFRT_WITH_PHI)
+  add_subdirectory(phi)
 endif()
diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h
index 7fbd1e8a4efe1..8dec818a80a27 100644
--- a/paddle/infrt/dialect/dense_tensor.h
+++ b/paddle/infrt/dialect/dense_tensor.h
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #pragma once
+
+// clang-format off
 #include <mlir/IR/Dialect.h>
 #include <mlir/IR/OpDefinition.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>
@@ -25,3 +27,4 @@
 
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/dense_tensor.hpp.inc"
+// clang-format on
diff --git a/paddle/infrt/dialect/diagnostic_utils.cc b/paddle/infrt/dialect/diagnostic_utils.cc
index 4151001067ecb..8785ce69b8e8f 100644
--- a/paddle/infrt/dialect/diagnostic_utils.cc
+++ b/paddle/infrt/dialect/diagnostic_utils.cc
@@ -15,6 +15,7 @@
 #include "paddle/infrt/dialect/diagnostic_utils.h"
 
 #include <llvm/Support/raw_ostream.h>
+
 #include <string>
 
 namespace infrt {
diff --git a/paddle/infrt/dialect/infrt/common/CMakeLists.txt b/paddle/infrt/dialect/infrt/common/CMakeLists.txt
index f693c82b5060e..593030be0a5bd 100644
--- a/paddle/infrt/dialect/infrt/common/CMakeLists.txt
+++ b/paddle/infrt/dialect/infrt/common/CMakeLists.txt
@@ -1,6 +1,3 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    types.cc
-    utils.cc
-    )
+gather_srcs(infrt_src SRCS types.cc utils.cc)
diff --git a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt
index 7c009bdb267e6..103c603e765c3 100644
--- a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt
+++ b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt
@@ -1,10 +1,6 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    infrt_dialect.cc
-    basic_kernels.cc
-    test_kernels.cc
-    )
+gather_srcs(infrt_src SRCS infrt_dialect.cc basic_kernels.cc test_kernels.cc)
 
 add_mlir_dialect(infrt_ops infrt)
 
diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
index c4f20cb4d35c5..0e3a10270cde2 100644
--- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
+++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 
 #include <llvm/ADT/TypeSwitch.h>
@@ -60,6 +61,7 @@ void InfrtDialect::initialize() {
 #include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc"
       >();
 }
+// clang-format on
 
 /// Parse a type registered to this dialect.
 mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.h b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h
index e2e9b9348eb46..5a7c45b320547 100644
--- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.h
+++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h
@@ -23,8 +23,8 @@
 #include <mlir/IR/OpDefinition.h>
 #include <mlir/Interfaces/InferTypeOpInterface.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>
-#include "paddle/infrt/dialect/infrt/common/types.h"
 
+#include "paddle/infrt/dialect/infrt/common/types.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.h.inc"
 #define GET_TYPEDEF_CLASSES
 #include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.h.inc"
diff --git a/paddle/infrt/dialect/infrt/pass/CMakeLists.txt b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt
index ab06c00d143a7..7fa0ee1c716c9 100644
--- a/paddle/infrt/dialect/infrt/pass/CMakeLists.txt
+++ b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt
@@ -1,8 +1,5 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    infrt_op_fuse_pass.cc
-    infrt_weights_unfold_pass.cc
-    )
+gather_srcs(infrt_src SRCS infrt_op_fuse_pass.cc infrt_weights_unfold_pass.cc)
 
 mlir_add_rewriter(infrt_op_fuse)
diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
index 63be5ca909563..309e0f8b94040 100644
--- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
+++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
 
 #include <mlir/Transforms/GreedyPatternRewriteDriver.h>
+
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 namespace {
diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc
index 8da34bd404be6..c204f9ea62669 100644
--- a/paddle/infrt/dialect/init_dialects.cc
+++ b/paddle/infrt/dialect/init_dialects.cc
@@ -19,12 +19,10 @@
 #include "paddle/infrt/dialect/dense_tensor.h"
 #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
-
 #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
 #include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
-
 #include "paddle/infrt/dialect/tensor_shape.h"
 #include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 
diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc
index 19b8cba12df86..ab533a25c4173 100644
--- a/paddle/infrt/dialect/mlir_loader.cc
+++ b/paddle/infrt/dialect/mlir_loader.cc
@@ -20,10 +20,10 @@
 #include <mlir/IR/Diagnostics.h>
 #include <mlir/IR/OperationSupport.h>
 #include <mlir/Parser.h>
-#include <unordered_map>
 
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h
index 5e50ad9e5a271..b4faba8068e44 100644
--- a/paddle/infrt/dialect/mlir_loader.h
+++ b/paddle/infrt/dialect/mlir_loader.h
@@ -16,9 +16,9 @@
 
 #include <glog/logging.h>
 #include <mlir/IR/BuiltinOps.h>
-#include <string>
 
 #include <memory>
+#include <string>
 
 namespace infrt {
 namespace dialect {
diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc
index 2006530958f0b..e57666ffca080 100644
--- a/paddle/infrt/dialect/opt.cc
+++ b/paddle/infrt/dialect/opt.cc
@@ -14,6 +14,7 @@
 
 #include <mlir/Support/MlirOptMain.h>
 #include <mlir/Transforms/Passes.h>
+
 #include "paddle/infrt/dialect/init_dialects.h"
 
 int main(int argc, char **argv) {
diff --git a/paddle/infrt/dialect/pd/common/CMakeLists.txt b/paddle/infrt/dialect/pd/common/CMakeLists.txt
index ee1b0d4c30deb..d253a84755713 100644
--- a/paddle/infrt/dialect/pd/common/CMakeLists.txt
+++ b/paddle/infrt/dialect/pd/common/CMakeLists.txt
@@ -1,4 +1,3 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    )
+gather_srcs(infrt_src SRCS)
diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt
index 8aacfc97623c0..7c1c99a97a02a 100644
--- a/paddle/infrt/dialect/pd/ir/CMakeLists.txt
+++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt
@@ -1,7 +1,5 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    pd_ops.cc
-    )
+gather_srcs(infrt_src SRCS pd_ops.cc)
 add_mlir_dialect(pd_ops pd)
 mlir_tablegen_on(pd_extra_ops)
diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt
index 827df597b76e2..be87052794ebc 100644
--- a/paddle/infrt/dialect/pd/pass/CMakeLists.txt
+++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt
@@ -1,8 +1,5 @@
-
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    pd_op_fuse_pass.cc
-    )
+gather_srcs(infrt_src SRCS pd_op_fuse_pass.cc)
 
 mlir_add_rewriter(pd_op_fuse)
diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc
index 8bdf957db27d8..c9247abe695ae 100644
--- a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc
+++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc
@@ -14,6 +14,7 @@
 #include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h"  // NOLINT
 
 #include <mlir/Transforms/GreedyPatternRewriteDriver.h>
+
 #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 
 namespace {
diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt
index 67f6bb8a2d7bb..f07c6f70fb609 100644
--- a/paddle/infrt/dialect/phi/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/CMakeLists.txt
@@ -1,5 +1,5 @@
-if (NOT INFRT_WITH_PHI)
-    return()
+if(NOT INFRT_WITH_PHI)
+  return()
 endif()
 
 add_subdirectory(ir)
@@ -8,5 +8,4 @@ add_subdirectory(pass)
 add_executable(phi-exec phi_exec.cc)
 target_link_libraries(phi-exec infrt)
 
-gather_srcs(infrt_src SRCS
-    data_type.cc)
+gather_srcs(infrt_src SRCS data_type.cc)
diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt
index 0497b9832118f..e038da564be1a 100644
--- a/paddle/infrt/dialect/phi/ir/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt
@@ -6,7 +6,4 @@ add_mlir_dialect(phi_gpu_kernels phi_gpu)
 
 #mlir_tablegen_on(infrt_phi_tensor)
 
-gather_srcs(infrt_src SRCS
-    phi_base.cc 
-    infrt_phi_tensor.cc
-    phi_kernels.cc)
+gather_srcs(infrt_src SRCS phi_base.cc infrt_phi_tensor.cc phi_kernels.cc)
diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h
index 9a92558daab03..f7358db5bf356 100644
--- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h
+++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+// clang-format off
 #include <mlir/Dialect/Traits.h>
 #include <mlir/IR/Attributes.h>
 #include <mlir/IR/Builders.h>
@@ -37,3 +38,4 @@
 // NOLINT
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc"
+// clang-format on
diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc
index 1bd6068d3fb96..39a23529ac3d1 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.cc
+++ b/paddle/infrt/dialect/phi/ir/phi_base.cc
@@ -21,6 +21,7 @@
 #include <mlir/IR/MLIRContext.h>
 #include <mlir/IR/TypeUtilities.h>
 #include <mlir/IR/Types.h>
+
 #include "paddle/infrt/common/global.h"
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc"
diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h
index 64cd08cc05ed4..2cbdef5af906e 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.h
+++ b/paddle/infrt/dialect/phi/ir/phi_base.h
@@ -18,8 +18,8 @@
 #include <mlir/Interfaces/SideEffectInterfaces.h>
 
 #include <string>
-#include "paddle/infrt/dialect/infrt/common/types.h"
 
+#include "paddle/infrt/dialect/infrt/common/types.h"
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc"
 
 #define GET_OP_CLASSES
diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.cc b/paddle/infrt/dialect/phi/ir/phi_kernels.cc
index c7a837b83fc24..69c3f96339117 100644
--- a/paddle/infrt/dialect/phi/ir/phi_kernels.cc
+++ b/paddle/infrt/dialect/phi/ir/phi_kernels.cc
@@ -13,12 +13,12 @@
 // limitations under the License.
 
 #include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
+
 #include <mlir/IR/BuiltinTypes.h>
 
 #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.cpp.inc"
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc"  // NOLINT
-
 #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.cpp.inc"
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc"  // NOLINT
diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h
index 4f8b41852cc67..9321ebb148f86 100644
--- a/paddle/infrt/dialect/phi/ir/phi_kernels.h
+++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h
@@ -32,11 +32,9 @@
 #include "paddle/infrt/dialect/dense_tensor.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
-
 #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc"
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.h.inc"
-
 #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.h.inc"
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.h.inc"
diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt
index dc60ecf63fe2e..e664e05f9dde7 100644
--- a/paddle/infrt/dialect/phi/pass/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt
@@ -1,9 +1,9 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    proto_arg_map_context.cc
-    phi_op_convert_pass.cc
-    kernel_op_desc.cc
-   )
+gather_srcs(infrt_src SRCS proto_arg_map_context.cc phi_op_convert_pass.cc
+            kernel_op_desc.cc)
 
-cc_test(test_kernel_op_desc SRCS kernel_op_desc_test.cc DEPS infrt)
+cc_test(
+  test_kernel_op_desc
+  SRCS kernel_op_desc_test.cc
+  DEPS infrt)
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
index 9425a290142da..ff870a06752e5 100644
--- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
+
 #include <glog/logging.h>
+
 #include "paddle/infrt/dialect/phi/data_type.h"
 #include "paddle/phi/core/type_defs.h"
 #include "paddle/phi/kernels/declarations.h"
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
index cdc8f7cbff553..4385d3c941727 100644
--- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/infrt/dialect/infrt/common/types.h"
 
 namespace infrt {
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc
index bd5f0799a60d5..24af0ea437875 100644
--- a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <vector>
 
 #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
 #include "paddle/phi/kernels/declarations.h"
+// clang-format on
 
 namespace infrt {
 
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
index 862c9ae4ee5af..f4de56b42a683 100644
--- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
+++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
@@ -20,6 +20,7 @@
 #include <mlir/IR/Builders.h>
 #include <mlir/IR/Operation.h>
 #include <mlir/IR/OperationSupport.h>
+
 #include <list>
 #include <unordered_set>
 #include <vector>
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h
index a0e74426a4097..9748e1679d3f1 100644
--- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h
+++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <mlir/Pass/Pass.h>
+
 #include "paddle/infrt/dialect/infrt/common/types.h"
 
 namespace infrt {
diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
index 7cb2651ccf6a2..30bde83cd8199 100644
--- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
+++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <mlir/IR/Operation.h>
+
 #include <unordered_map>
+
 #include "paddle/infrt/dialect/pd/common/pd_ops_info.h"
 #include "paddle/phi/core/compat/arg_map_context.h"
 
diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc
index a2808a00cb67d..0aae8cc93377d 100644
--- a/paddle/infrt/dialect/phi/phi_exec.cc
+++ b/paddle/infrt/dialect/phi/phi_exec.cc
@@ -41,7 +41,9 @@ bool parse_inputs(int argc,
       *params_file_name = argv[2];
       return true;
     }
-    default: { return false; }
+    default: {
+      return false;
+    }
   }
 }
 
diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc
index b118a5f7a9caf..a240cebe73655 100644
--- a/paddle/infrt/dialect/print_ir.cc
+++ b/paddle/infrt/dialect/print_ir.cc
@@ -28,6 +28,7 @@
 #include <mlir/Pass/PassManager.h>
 #include <mlir/Support/LogicalResult.h>
 #include <mlir/Transforms/Passes.h>
+
 #include <iostream>
 
 #include "paddle/infrt/common/global.h"
@@ -74,8 +75,8 @@ void printOperation(mlir::Operation *op, int indent) {
   if (!op->getAttrs().empty()) {
     printIndent(indent) << op->getAttrs().size() << " attributes:\n";
     for (mlir::NamedAttribute attr : op->getAttrs()) {
-      printIndent(indent + 1) << "- {" << attr.getName() << " : "
-                              << attr.getValue() << "}\n";
+      printIndent(indent + 1)
+          << "- {" << attr.getName() << " : " << attr.getValue() << "}\n";
     }
   }
 
diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc
index 92c03818264ee..9a825224f1d30 100644
--- a/paddle/infrt/dialect/tensor_shape.cc
+++ b/paddle/infrt/dialect/tensor_shape.cc
@@ -66,5 +66,4 @@ void TensorShapeDialect::printType(mlir::Type type,
 
 #define GET_OP_CLASSES
 #include "paddle/infrt/dialect/tensor_shape.cpp.inc"  // NOLINT
-
 #include "paddle/infrt/dialect/tensor_shape_dialect.cpp.inc"
diff --git a/paddle/infrt/dialect/tensorrt/CMakeLists.txt b/paddle/infrt/dialect/tensorrt/CMakeLists.txt
index 5b62b78e4dab1..68c6da2746433 100755
--- a/paddle/infrt/dialect/tensorrt/CMakeLists.txt
+++ b/paddle/infrt/dialect/tensorrt/CMakeLists.txt
@@ -1,13 +1,14 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    trt_ops.cc
-    trt_op_converter_pass.cc
-    trt_op_teller_pass.cc
-    trt_graph_fuse_pass.cc
-    trt_graph_split_pass.cc
-    trt_type_convert_pass.cc
-    )
+gather_srcs(
+  infrt_src
+  SRCS
+  trt_ops.cc
+  trt_op_converter_pass.cc
+  trt_op_teller_pass.cc
+  trt_graph_fuse_pass.cc
+  trt_graph_split_pass.cc
+  trt_type_convert_pass.cc)
 mlir_tablegen_on(trt_ops)
 mlir_add_rewriter(pd_lower_to_trt)
 
diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h
index 2a242ca285ba8..2dcd86486f51d 100644
--- a/paddle/infrt/dialect/tensorrt/convert.h
+++ b/paddle/infrt/dialect/tensorrt/convert.h
@@ -20,6 +20,7 @@
 #include <mlir/IR/BuiltinAttributes.h>
 #include <mlir/IR/PatternMatch.h>
 #include <mlir/Transforms/DialectConversion.h>
+
 #include "paddle/infrt/dialect/infrt/common/types.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc
index dcb84ceb50edf..899e71f1c990f 100644
--- a/paddle/infrt/dialect/tensorrt/trt_exec.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc
@@ -11,10 +11,14 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+// clang-format off
 #include <llvm/Support/CommandLine.h>
 #include <mlir/Pass/PassManager.h>
+
 #include <iostream>
 #include <string>
+
 #include "paddle/infrt/common/global.h"
 #include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h"
 #include "paddle/infrt/dialect/mlir_loader.h"
@@ -44,6 +48,7 @@
 #endif
 
 #include <mlir/Transforms/Passes.h>
+// clang-format on
 
 int main(int argc, char** argv) {
   static llvm::cl::opt<std::string> input_file(
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
index bbe9a76e87b00..7109fc772ec86 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <llvm/ADT/SetVector.h>
 #include <mlir/Analysis/SliceAnalysis.h>
 #include <mlir/IR/Builders.h>
+
 #include <list>
 #include <unordered_set>
 #include <vector>
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
index d5ce871edd1a3..d74fe3e5e9c2f 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
 
 #include <mlir/IR/Builders.h>
+
 #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 
 namespace infrt {
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
index d7b917385cf14..35b869fb30788 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
@@ -16,6 +16,7 @@
 
 #include <llvm/Support/Casting.h>
 #include <mlir/IR/Builders.h>
+
 #include "paddle/infrt/dialect/dense_tensor.h"
 #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc
index 415a78a6967ab..161fbbbcc65a5 100644
--- a/paddle/infrt/dialect/tensorrt/trt_ops.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc
@@ -11,6 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+// clang-format off
 #include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 #include <mlir/IR/DialectImplementation.h>
 #include <mlir/IR/Matchers.h>
@@ -24,6 +26,7 @@
 #include "paddle/infrt/dialect/dense_tensor.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
+// clang-format on
 
 namespace infrt {
 namespace trt {
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h
index 76768037dbdb3..e851c26c43c8c 100644
--- a/paddle/infrt/dialect/tensorrt/trt_ops.h
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.h
@@ -28,6 +28,7 @@
 #include <mlir/Interfaces/InferTypeOpInterface.h>
 #include <mlir/Interfaces/LoopLikeInterface.h>
 #include <mlir/Interfaces/SideEffectInterfaces.h>
+
 #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 #include "paddle/infrt/dialect/pd/ir/pd_ops.h"
diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc
index 35c81d0230161..1cb7c4155b987 100644
--- a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h"
 
 #include <glog/logging.h>
+
 #include <set>
 
 #include "llvm/ADT/StringRef.h"
diff --git a/paddle/infrt/external_kernels/CMakeLists.txt b/paddle/infrt/external_kernels/CMakeLists.txt
index 9e90c1896c79f..96cfe2b73d8cd 100644
--- a/paddle/infrt/external_kernels/CMakeLists.txt
+++ b/paddle/infrt/external_kernels/CMakeLists.txt
@@ -8,6 +8,8 @@ set(external_kernels_lib "${CMAKE_CURRENT_BINARY_DIR}/libexternal_kernels.so")
 message(STATUS "basic_mlir: ${basic_mlir}")
 message(STATUS "external_kernels_lib: ${external_kernels_lib}")
 add_test(
-    NAME run_and_check_external_kernels
-    COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}"
+  NAME run_and_check_external_kernels
+  COMMAND
+    sh -c
+    "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}"
 )
diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt
index 14cbea70ca841..2901a282cda7d 100644
--- a/paddle/infrt/host_context/CMakeLists.txt
+++ b/paddle/infrt/host_context/CMakeLists.txt
@@ -1,26 +1,33 @@
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    kernel_frame.cc
-    kernel_registry.cc
-    value.cc
-    kernel_utils.cc
-    symbol_table.cc
-    op_executable.cc
-    core_runtime.cc
-    mlir_to_runtime_translate.cc
-    function.cc
-    mlir_function_executable.cc
-    mlir_program_executor.cc
-    paddle_mlir.cc
-    )
+gather_srcs(
+  infrt_src
+  SRCS
+  kernel_frame.cc
+  kernel_registry.cc
+  value.cc
+  kernel_utils.cc
+  symbol_table.cc
+  op_executable.cc
+  core_runtime.cc
+  mlir_to_runtime_translate.cc
+  function.cc
+  mlir_function_executable.cc
+  mlir_program_executor.cc
+  paddle_mlir.cc)
 
-cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS})
-cc_test_tiny(test_infrt_kernel_utils SRCS kernel_utils_test.cc DEPS infrt ${MLIR_IR_LIBS})
-cc_test_tiny(test_infrt_kernel_registry SRCS kernel_registry_test.cc DEPS infrt ${MLIR_IR_LIBS})
-cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${MLIR_IR_LIBS})
-cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS})
-cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt
+             ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_kernel_utils SRCS kernel_utils_test.cc DEPS infrt
+             ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_kernel_registry SRCS kernel_registry_test.cc DEPS infrt
+             ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt
+             ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt
+             ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS
+             mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS})
 
 add_executable(paddle-mlir-convert paddle_mlir_converter.cc)
 target_link_libraries(paddle-mlir-convert infrt ${MLIR_IR_LIBS})
diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc
index e3917bd07d242..3dbb0b41c9fb8 100644
--- a/paddle/infrt/host_context/core_runtime.cc
+++ b/paddle/infrt/host_context/core_runtime.cc
@@ -14,9 +14,8 @@
 
 #include "paddle/infrt/host_context/core_runtime.h"
 
-#include <unordered_map>
-
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "paddle/infrt/host_context/kernel_registry.h"
diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h
index acb6a66cac630..585369e249b22 100644
--- a/paddle/infrt/host_context/core_runtime.h
+++ b/paddle/infrt/host_context/core_runtime.h
@@ -46,7 +46,7 @@ class CoreRuntime : public std::enable_shared_from_this<CoreRuntime> {
 
   //! Get the results of the execution.
   llvm::SmallVector<ValueRef, 4>  //
-      GetResults(llvm::ArrayRef<std::string> arg_names);
+  GetResults(llvm::ArrayRef<std::string> arg_names);
 
   std::shared_ptr<CoreRuntime> getptr() {
     return std::shared_ptr<CoreRuntime>(this);
diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc
index 5693e973a3f98..2518056ba9d29 100644
--- a/paddle/infrt/host_context/kernel_registry.cc
+++ b/paddle/infrt/host_context/kernel_registry.cc
@@ -39,8 +39,8 @@ const std::vector<const char *> &KernelRegistry::GetAttrNameList(
 void KernelRegistry::AddKernel(const std::string &key,
                                KernelImplementation fn,
                                const std::vector<const char *> &attr_order) {
-  CHECK(!impl_->data.count(key)) << "kernel [" << key
-                                 << "] is registered twice";
+  CHECK(!impl_->data.count(key))
+      << "kernel [" << key << "] is registered twice";
   impl_->data.emplace(
       key, std::make_pair([fn]() { return fn; }, std::move(attr_order)));
 }
@@ -48,8 +48,8 @@ void KernelRegistry::AddKernel(const std::string &key,
 void KernelRegistry::AddKernel(const std::string &key,
                                KernelLauncher fn,
                                const std::vector<const char *> &attr_order) {
-  CHECK(!impl_->data.count(key)) << "kernel [" << key
-                                 << "] is registered twice";
+  CHECK(!impl_->data.count(key))
+      << "kernel [" << key << "] is registered twice";
   impl_->data.emplace(key,
                       std::make_pair(std::move(fn), std::move(attr_order)));
 }
diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc
index 6ad51a02bda29..1ae7cdc742afd 100644
--- a/paddle/infrt/host_context/mlir_exec.cc
+++ b/paddle/infrt/host_context/mlir_exec.cc
@@ -14,6 +14,7 @@
 
 #include <llvm/Support/CommandLine.h>
 #include <mlir/Pass/PassManager.h>
+
 #include <iostream>
 #include <string>
 
diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h
index c2ccb90640b21..7808c460457aa 100644
--- a/paddle/infrt/host_context/mlir_program_executor.h
+++ b/paddle/infrt/host_context/mlir_program_executor.h
@@ -19,10 +19,10 @@
 #include <mlir/IR/BuiltinTypes.h>
 #include <mlir/IR/Diagnostics.h>
 #include <mlir/IR/OperationSupport.h>
-#include <unordered_map>
 
 #include <memory>
 #include <string>
+#include <unordered_map>
 
 #include "paddle/infrt/host_context/core_runtime.h"
 #include "paddle/infrt/host_context/kernel_registry.h"
diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc
index 05bb28b7c5613..9292e593a708f 100644
--- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc
+++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/infrt/host_context/mlir_to_runtime_translate.h"
 
+#include <glog/logging.h>
 #include <llvm/Support/SourceMgr.h>
 #include <mlir/Dialect/StandardOps/IR/Ops.h>
 #include <mlir/IR/BuiltinAttributes.h>
@@ -23,7 +24,6 @@
 #include <mlir/IR/OperationSupport.h>
 #include <mlir/Parser.h>
 
-#include <glog/logging.h>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -591,8 +591,8 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op,
   {
     // lookup the callee function
     auto it = table.find(callee_name.getValue().str());
-    CHECK(it != table.end()) << "can't find function ["
-                             << callee_name.getValue().str() << "]";
+    CHECK(it != table.end())
+        << "can't find function [" << callee_name.getValue().str() << "]";
     auto* function =
         impl_->cur_op->CreateFunctionExecutable(it->second, &impl_->func_defs);
     impl_->cur_op->AppendAttribute(new Value(function));
diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc
index 4d588a9c2b523..b53dc0545c72f 100644
--- a/paddle/infrt/host_context/op_executable.cc
+++ b/paddle/infrt/host_context/op_executable.cc
@@ -15,6 +15,7 @@
 #include "paddle/infrt/host_context/op_executable.h"
 
 #include <mlir/IR/BuiltinOps.h>
+
 #include <string>
 #include <unordered_set>
 
diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h
index 550f6ab6349ed..b80b99fd41405 100644
--- a/paddle/infrt/host_context/op_executable.h
+++ b/paddle/infrt/host_context/op_executable.h
@@ -16,6 +16,7 @@
 #include <llvm/ADT/ArrayRef.h>
 #include <mlir/IR/BuiltinTypes.h>
 #include <mlir/IR/Region.h>
+
 #include <memory>
 #include <string>
 #include <unordered_map>
diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h
index 57bdc1b48578b..629181cca3d6e 100644
--- a/paddle/infrt/host_context/paddle_mlir.h
+++ b/paddle/infrt/host_context/paddle_mlir.h
@@ -20,6 +20,7 @@
 #include <mlir/IR/Builders.h>
 #include <mlir/IR/BuiltinOps.h>
 #include <mlir/IR/MLIRContext.h>
+
 #include <fstream>
 #include <iostream>
 #include <string>
diff --git a/paddle/infrt/host_context/paddle_mlir_converter.cc b/paddle/infrt/host_context/paddle_mlir_converter.cc
index a2808a00cb67d..0aae8cc93377d 100644
--- a/paddle/infrt/host_context/paddle_mlir_converter.cc
+++ b/paddle/infrt/host_context/paddle_mlir_converter.cc
@@ -41,7 +41,9 @@ bool parse_inputs(int argc,
       *params_file_name = argv[2];
       return true;
     }
-    default: { return false; }
+    default: {
+      return false;
+    }
   }
 }
 
diff --git a/paddle/infrt/host_context/symbol_table.h b/paddle/infrt/host_context/symbol_table.h
index 805215a78ce0d..8c79c78c690e8 100644
--- a/paddle/infrt/host_context/symbol_table.h
+++ b/paddle/infrt/host_context/symbol_table.h
@@ -14,9 +14,8 @@
 
 #pragma once
 
-#include <unordered_map>
-
 #include <memory>
+#include <unordered_map>
 
 #include "paddle/infrt/host_context/value.h"
 
diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h
index 1834cb4c0db05..af785c13349fd 100644
--- a/paddle/infrt/host_context/value.h
+++ b/paddle/infrt/host_context/value.h
@@ -159,15 +159,15 @@ class Value : public common::Object {
 
   template <typename T>
   const T& get() const {
-    CHECK(data.template is<T>()) << "typeid: " << data.index()
-                                 << " != " << ValueVariantType::IndexOf<T>;
+    CHECK(data.template is<T>())
+        << "typeid: " << data.index() << " != " << ValueVariantType::IndexOf<T>;
     return data.get<T>();
   }
 
   template <typename T>
   T& get() {
-    CHECK(data.template is<T>()) << "typeid: " << data.index()
-                                 << " != " << ValueVariantType::IndexOf<T>;
+    CHECK(data.template is<T>())
+        << "typeid: " << data.index() << " != " << ValueVariantType::IndexOf<T>;
     return data.get<T>();
   }
 
diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt
index f20344f6f6b84..6a18047885d48 100644
--- a/paddle/infrt/kernel/CMakeLists.txt
+++ b/paddle/infrt/kernel/CMakeLists.txt
@@ -3,11 +3,12 @@ add_subdirectory(tensorrt)
 
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    basic_kernels.cc
-    # phi_kernels.cc
-    test_kernels.cc
-    tensor_shape_kernels.cc
-    tensor_kernels.cc
-    control_flow_kernels.cc
-    )
+gather_srcs(
+  infrt_src
+  SRCS
+  basic_kernels.cc
+  # phi_kernels.cc
+  test_kernels.cc
+  tensor_shape_kernels.cc
+  tensor_kernels.cc
+  control_flow_kernels.cc)
diff --git a/paddle/infrt/kernel/phi/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt
index 22a59ab2faf8c..92e4a49cd849c 100644
--- a/paddle/infrt/kernel/phi/CMakeLists.txt
+++ b/paddle/infrt/kernel/phi/CMakeLists.txt
@@ -1,34 +1,39 @@
-if (NOT INFRT_WITH_PHI)
-    return()
+if(NOT INFRT_WITH_PHI)
+  return()
 endif()
 
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    registry.cc
-    dense_tensor_kernels.cc
-    context_kernels.cc
-)
+gather_srcs(infrt_src SRCS registry.cc dense_tensor_kernels.cc
+            context_kernels.cc)
 
-set(infrt_register_phi_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc)
-set(infrt_register_phi_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_phi_kernel_function.sh)
-set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h)
-set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc)
+set(infrt_register_phi_kernels_gen_source_file
+    ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
+)
+set(infrt_register_phi_kernels_gen_file
+    ${CMAKE_SOURCE_DIR}/tools/infrt/get_phi_kernel_function.sh)
+set(wrapped_infermeta_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h)
+set(wrapped_infermeta_source_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc)
 
 add_custom_command(
-        OUTPUT ${infrt_register_phi_kernels_gen_source_file}
-        COMMAND bash ${infrt_register_phi_kernels_gen_file}
-        DEPENDS wrapped_infermeta
-        VERBATIM)
-add_custom_target(infrt_register_phi_kernel
-        COMMAND bash ${infrt_register_phi_kernels_gen_file}
-        DEPENDS wrapped_infermeta
-        COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}"
-        VERBATIM)
+  OUTPUT ${infrt_register_phi_kernels_gen_source_file}
+  COMMAND bash ${infrt_register_phi_kernels_gen_file}
+  DEPENDS wrapped_infermeta
+  VERBATIM)
+add_custom_target(
+  infrt_register_phi_kernel
+  COMMAND bash ${infrt_register_phi_kernels_gen_file}
+  DEPENDS wrapped_infermeta
+  COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}"
+  VERBATIM)
 
-cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc
-        infershaped/infershaped_kernel_launchers.cc
-        DEPS phi wrapped_infermeta)
+cc_library(
+  infrt_naive
+  SRCS infershaped/infershaped_kernel_launcher.cc
+       infershaped/infershaped_kernel_launchers.cc
+  DEPS phi wrapped_infermeta)
 
 cc_test_tiny(test_infrt_infershape_launchers SRCS
-infershaped/infershape_launchers_test.cc DEPS infrt)
+             infershaped/infershape_launchers_test.cc DEPS infrt)
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
index 95e25b243f3ab..8c49f47e7d873 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
+
 #include <memory>
+
 #include "llvm/Support/ErrorHandling.h"
 #include "paddle/infrt/backends/host/phi_allocator.h"
 #include "paddle/infrt/common/string.h"
diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc
index 2e40261f27386..cb9640451f9b2 100644
--- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc
+++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h"
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/meta_tensor.h"
 
diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h
index 277c4ad6b7afc..531d77ba952aa 100644
--- a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h
+++ b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <type_traits>
+
 #include "paddle/infrt/tensor/dense_host_tensor.h"
 #include "paddle/phi/core/dense_tensor.h"
 
diff --git a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h
index d87027847202b..bac25e0f437d8 100644
--- a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h
+++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <llvm/ADT/SmallVector.h>
+
 #include <iostream>
 
 #include "paddle/infrt/backends/host/phi_context.h"
diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt
index cd35fccbe2aa3..2cb595f7ba4f7 100644
--- a/paddle/infrt/kernel/tensorrt/CMakeLists.txt
+++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt
@@ -1,10 +1,10 @@
-if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT))
+if(NOT
+   (INFRT_WITH_PHI
+    AND INFRT_WITH_GPU
+    AND INFRT_WITH_TRT))
   return()
 endif()
 
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    registry.cc
-    trt_kernels.cc
-)
+gather_srcs(infrt_src SRCS registry.cc trt_kernels.cc)
diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
index c0f5ebb4a7657..0ea68f2e835f7 100644
--- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc
+++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
 #include <string>
 #include <unordered_set>
@@ -36,6 +37,7 @@
 #include "paddle/infrt/host_context/symbol_table.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
+// clang-format on
 
 namespace infrt {
 namespace kernel {
diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h
index bf23bd45c1341..bf41c124a299b 100644
--- a/paddle/infrt/kernel/tensorrt/trt_kernels.h
+++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h
@@ -19,7 +19,6 @@
 #include <utility>
 
 #include "mlir/IR/Operation.h"
-
 #include "paddle/infrt/backends/tensorrt/trt_engine.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc
index bcf475d1bc09d..e00afa4b7901a 100644
--- a/paddle/infrt/kernel/test_kernels.cc
+++ b/paddle/infrt/kernel/test_kernels.cc
@@ -92,11 +92,11 @@ class BenchmarkStats {
     std::sort(run_times_walltime_.begin(), run_times_walltime_.end());
     std::sort(run_times_cpu_.begin(), run_times_cpu_.end());
 
-    auto percentile = [](
-        double p, const std::vector<std::chrono::nanoseconds> &run_times) {
-      assert(p >= 0.0 && p <= 1.0);
-      return run_times[run_times.size() * p];
-    };
+    auto percentile =
+        [](double p, const std::vector<std::chrono::nanoseconds> &run_times) {
+          assert(p >= 0.0 && p <= 1.0);
+          return run_times[run_times.size() * p];
+        };
 
     // BM: prefix is added to make grepping results from lit output easier.
     std::string prefix;
diff --git a/paddle/infrt/paddle/CMakeLists.txt b/paddle/infrt/paddle/CMakeLists.txt
index 21c117535fe70..5f894626f8015 100644
--- a/paddle/infrt/paddle/CMakeLists.txt
+++ b/paddle/infrt/paddle/CMakeLists.txt
@@ -5,14 +5,16 @@ add_subdirectory(pb)
 
 core_gather_headers()
 
-gather_srcs(infrt_src SRCS
-    model_parser.cc
-    scope.cc
-    tensor.cc
-    )
+gather_srcs(infrt_src SRCS model_parser.cc scope.cc tensor.cc)
 
-file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h)
+file(
+  GLOB includes
+  LIST_DIRECTORIES false
+  RELATIVE ${CMAKE_SOURCE_DIR}
+  *.h)
 
 foreach(header ${includes})
-  set(core_includes "${core_includes};${header}" CACHE INTERNAL "")
+  set(core_includes
+      "${core_includes};${header}"
+      CACHE INTERNAL "")
 endforeach()
diff --git a/paddle/infrt/paddle/cpp/CMakeLists.txt b/paddle/infrt/paddle/cpp/CMakeLists.txt
index 8b48603bddf8e..9947747108494 100644
--- a/paddle/infrt/paddle/cpp/CMakeLists.txt
+++ b/paddle/infrt/paddle/cpp/CMakeLists.txt
@@ -1,5 +1,11 @@
-file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h)
+file(
+  GLOB includes
+  LIST_DIRECTORIES false
+  RELATIVE ${CMAKE_SOURCE_DIR}
+  *.h)
 
 foreach(header ${includes})
-  set(core_includes "${core_includes};${header}" CACHE INTERNAL "")
+  set(core_includes
+      "${core_includes};${header}"
+      CACHE INTERNAL "")
 endforeach()
diff --git a/paddle/infrt/paddle/pb/CMakeLists.txt b/paddle/infrt/paddle/pb/CMakeLists.txt
index b3491cfe13618..3614201a95f65 100644
--- a/paddle/infrt/paddle/pb/CMakeLists.txt
+++ b/paddle/infrt/paddle/pb/CMakeLists.txt
@@ -1,12 +1,13 @@
-gather_srcs(infrt_src SRCS
-    var_desc.cc
-    op_desc.cc
-    block_desc.cc
-    program_desc.cc
-    )
+gather_srcs(infrt_src SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc)
 
-file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h)
+file(
+  GLOB includes
+  LIST_DIRECTORIES false
+  RELATIVE ${CMAKE_SOURCE_DIR}
+  *.h)
 
 foreach(header ${includes})
-  set(core_includes "${core_includes};${header}" CACHE INTERNAL "")
+  set(core_includes
+      "${core_includes};${header}"
+      CACHE INTERNAL "")
 endforeach()
diff --git a/paddle/infrt/paddle/scope.h b/paddle/infrt/paddle/scope.h
index 4ebf846374c6f..1f81d0914dfc6 100644
--- a/paddle/infrt/paddle/scope.h
+++ b/paddle/infrt/paddle/scope.h
@@ -13,10 +13,9 @@
 // limitations under the License.
 
 #pragma once
-#include <unordered_map>
-
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "paddle/infrt/common/macros.h"
diff --git a/paddle/infrt/support/type_traits.h b/paddle/infrt/support/type_traits.h
index 341dabb7c1c4a..33a42fe37eaa6 100644
--- a/paddle/infrt/support/type_traits.h
+++ b/paddle/infrt/support/type_traits.h
@@ -115,7 +115,8 @@ struct nonesuch {
 
 template <class Default,
           class AlwaysVoid,
-          template <class...> class Op,
+          template <class...>
+          class Op,
           class... Args>
 struct detector : std::false_type {
   using value_t = std::false_type;
diff --git a/paddle/infrt/tensor/CMakeLists.txt b/paddle/infrt/tensor/CMakeLists.txt
index 95d4090a9a3f7..b1c3149276c59 100644
--- a/paddle/infrt/tensor/CMakeLists.txt
+++ b/paddle/infrt/tensor/CMakeLists.txt
@@ -2,13 +2,14 @@ core_gather_headers()
 
 add_subdirectory(phi)
 
-gather_srcs(infrt_src SRCS
+gather_srcs(
+  infrt_src
+  SRCS
   tensor_map.cc
   tensor_metadata.cc
   dense_tensor_view.cc
   dense_host_tensor.cc
-  tensor_shape.cc
-  )
+  tensor_shape.cc)
 
 # set(tensor_map_mlir "${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/tensor_map.mlir")
 # set(external_kernels_lib "${CMAKE_BINARY_DIR}/paddle/libexternal_kernels.so")
diff --git a/paddle/infrt/tensor/phi/CMakeLists.txt b/paddle/infrt/tensor/phi/CMakeLists.txt
index 97e26661266e9..94658e223e287 100644
--- a/paddle/infrt/tensor/phi/CMakeLists.txt
+++ b/paddle/infrt/tensor/phi/CMakeLists.txt
@@ -1,3 +1 @@
-gather_srcs(infrt_src SRCS
-  tensor_map.cc
-)
+gather_srcs(infrt_src SRCS tensor_map.cc)
diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt
index a720ad824794e..22e5e232d5485 100644
--- a/paddle/infrt/tests/CMakeLists.txt
+++ b/paddle/infrt/tests/CMakeLists.txt
@@ -1,11 +1,21 @@
 cc_test_tiny(test_abs_model SRCS models/test_abs.cc DEPS infrt ${MLIR_IR_LIBS})
 
-configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py")
+configure_file(lit.cfg.py.in
+               "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py")
 
-add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\""
+add_test(
+  NAME test_infrt_by_lit
+  COMMAND
+    sh -c
+    "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\""
     DEPENDS infrtopt infrtexec)
 
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/resnet50.mlir)
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir.in
+  ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensorrt/disabled_linear.mlir)
diff --git a/paddle/infrt/tests/models/test_abs.cc b/paddle/infrt/tests/models/test_abs.cc
index 89bbe78ffe27a..aa5a2c6945b47 100644
--- a/paddle/infrt/tests/models/test_abs.cc
+++ b/paddle/infrt/tests/models/test_abs.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include <gtest/gtest.h>
 #include <llvm/Support/CommandLine.h>
 #include <mlir/Pass/PassManager.h>
@@ -49,6 +50,7 @@
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
 #include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
+// clang-format on
 
 static llvm::cl::list<std::string> cl_shared_libs(  // NOLINT
     "shared_libs",
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 58ad42ddd1ff8..7f3dd1ddc38fb 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -23,14 +23,33 @@ add_subdirectory(tools)
 add_subdirectory(tests)
 
 # make an unity target for compile deps
-set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar api_int_array)
+set(PHI_DEPS
+    convert_utils
+    dense_tensor
+    phi_context
+    kernel_factory
+    kernel_context
+    arg_map_context
+    infermeta
+    lod_utils
+    op_compat_infos
+    sparse_csr_tensor
+    sparse_coo_tensor
+    string_tensor
+    api_scalar
+    api_int_array)
 get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
 set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
 
 create_dummy_static_lib(phi LIBS ${PHI_DEPS} LIMIT 100)
 
-set(phi_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file")
-file(WRITE ${phi_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n")
+set(phi_extension_header_file
+    ${CMAKE_CURRENT_SOURCE_DIR}/extension.h
+    CACHE INTERNAL "phi/extension.h file")
+file(
+  WRITE ${phi_extension_header_file}
+  "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n"
+)
 
 # generate inner headers include dir for users
 generate_unify_header(backends)
diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt
index d575759db32ee..b1d97cbc7fa2c 100644
--- a/paddle/phi/api/CMakeLists.txt
+++ b/paddle/phi/api/CMakeLists.txt
@@ -1,2 +1,6 @@
 add_subdirectory(lib)
-cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api strings_api)
+cc_library(
+  phi_api
+  SRCS all.cc
+  DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api
+       strings_api)
diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h
index a9475db800816..fa19714dde7db 100644
--- a/paddle/phi/api/ext/op_meta_info.h
+++ b/paddle/phi/api/ext/op_meta_info.h
@@ -317,25 +317,24 @@ using InferShapeFunc = std::vector<std::vector<int64_t>> (*)(
     const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
     const std::vector<paddle::any>& attrs);
 
-#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type)            \
-  template <typename... Tail>                                               \
-  struct InferShapeCallHelper<input_type, Tail...> {                        \
-    template <int in_idx,                                                   \
-              int vec_in_idx,                                               \
-              int attr_idx,                                                 \
-              typename... PreviousArgs>                                     \
-    static Return InferShape(                                               \
-        const std::vector<std::vector<int64_t>>& input_shapes,              \
-        const std::vector<std::vector<std::vector<int64_t>>>&               \
-            vec_input_shapes,                                               \
-        const std::vector<paddle::any>& attrs,                              \
-        const PreviousArgs&... pargs) {                                     \
-      input_type arg = input_shapes[in_idx];                                \
-      return InferShapeCallHelper<Tail...>::template InferShape<in_idx + 1, \
-                                                                vec_in_idx, \
-                                                                attr_idx>(  \
-          input_shapes, vec_input_shapes, attrs, pargs..., arg);            \
-    }                                                                       \
+#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type)     \
+  template <typename... Tail>                                        \
+  struct InferShapeCallHelper<input_type, Tail...> {                 \
+    template <int in_idx,                                            \
+              int vec_in_idx,                                        \
+              int attr_idx,                                          \
+              typename... PreviousArgs>                              \
+    static Return InferShape(                                        \
+        const std::vector<std::vector<int64_t>>& input_shapes,       \
+        const std::vector<std::vector<std::vector<int64_t>>>&        \
+            vec_input_shapes,                                        \
+        const std::vector<paddle::any>& attrs,                       \
+        const PreviousArgs&... pargs) {                              \
+      input_type arg = input_shapes[in_idx];                         \
+      return InferShapeCallHelper<Tail...>::                         \
+          template InferShape<in_idx + 1, vec_in_idx, attr_idx>(     \
+              input_shapes, vec_input_shapes, attrs, pargs..., arg); \
+    }                                                                \
   }
 
 #define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type)    \
@@ -397,10 +396,8 @@ struct InferShapeFuncImpl<Return (*)(Args...), impl_fn> {
       const std::vector<std::vector<int64_t>>& input_shapes,
       const std::vector<std::vector<std::vector<int64_t>>>& vec_input_shapes,
       const std::vector<paddle::any>& attrs) {
-    return InferShapeCallHelper<Args..., TypeTag<int>>::template InferShape<0,
-                                                                            0,
-                                                                            0>(
-        input_shapes, vec_input_shapes, attrs);
+    return InferShapeCallHelper<Args..., TypeTag<int>>::
+        template InferShape<0, 0, 0>(input_shapes, vec_input_shapes, attrs);
   }
 
  private:
@@ -482,20 +479,19 @@ using InferDtypeFunc = std::vector<DataType> (*)(
     }                                                                        \
   }
 
-#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type)            \
-  template <typename... Tail>                                                \
-  struct InferDtypeCallHelper<input_type, Tail...> {                         \
-    template <int in_idx, int vec_in_idx, typename... PreviousArgs>          \
-    static Return InferDtype(                                                \
-        const std::vector<DataType>& input_dtypes,                           \
-        const std::vector<std::vector<DataType>>& vec_input_dtypes,          \
-        const PreviousArgs&... pargs) {                                      \
-      input_type arg = vec_input_dtypes[vec_in_idx];                         \
-      return InferDtypeCallHelper<Tail...>::template InferDtype<in_idx,      \
-                                                                vec_in_idx + \
-                                                                    1>(      \
-          input_dtypes, vec_input_dtypes, pargs..., arg);                    \
-    }                                                                        \
+#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type)   \
+  template <typename... Tail>                                       \
+  struct InferDtypeCallHelper<input_type, Tail...> {                \
+    template <int in_idx, int vec_in_idx, typename... PreviousArgs> \
+    static Return InferDtype(                                       \
+        const std::vector<DataType>& input_dtypes,                  \
+        const std::vector<std::vector<DataType>>& vec_input_dtypes, \
+        const PreviousArgs&... pargs) {                             \
+      input_type arg = vec_input_dtypes[vec_in_idx];                \
+      return InferDtypeCallHelper<Tail...>::                        \
+          template InferDtype<in_idx, vec_in_idx + 1>(              \
+              input_dtypes, vec_input_dtypes, pargs..., arg);       \
+    }                                                               \
   }
 
 template <typename F, F f>
diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt
index 004ed8de520d9..a1c6989555f20 100644
--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -1,11 +1,20 @@
 add_subdirectory(utils)
 
-if (WITH_GPU)
-  nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce)
-elseif (WITH_ROCM)
-  hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce)
+if(WITH_GPU)
+  nv_library(
+    phi_tensor_raw
+    SRCS tensor.cc
+    DEPS tensor_base dense_tensor phi_api_utils phi_enforce)
+elseif(WITH_ROCM)
+  hip_library(
+    phi_tensor_raw
+    SRCS tensor.cc
+    DEPS tensor_base dense_tensor phi_api_utils phi_enforce)
 else()
-  cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils phi_enforce)
+  cc_library(
+    phi_tensor_raw
+    SRCS tensor.cc
+    DEPS tensor_base dense_tensor phi_api_utils phi_enforce)
 endif()
 
 set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py)
@@ -13,71 +22,94 @@ set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py)
 # forward api file
 set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
 set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
-set(new_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml)
+set(new_api_yaml_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml)
 set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/api.h)
 set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/api.cc)
 set(api_header_file_tmp ${api_header_file}.tmp)
 set(api_source_file_tmp ${api_source_file}.tmp)
 
 # backward api file
-set(bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py)
-set(bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml)
-set(new_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml)
-set(bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/backward_api.h)
+set(bw_api_gen_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py)
+set(bw_api_yaml_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml)
+set(new_bw_api_yaml_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml)
+set(bw_api_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/backward_api.h)
 set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc)
 set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
 set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
 
 # dygraph(intermediate) api file
-set(im_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/intermediate_api_gen.py)
-set(dygraph_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.h)
-set(dygraph_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.cc)
+set(im_api_gen_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/intermediate_api_gen.py)
+set(dygraph_api_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.h)
+set(dygraph_api_source_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.cc)
 set(dygraph_api_header_file_tmp ${dygraph_api_header_file}.tmp)
 set(dygraph_api_source_file_tmp ${dygraph_api_source_file}.tmp)
 
 # sparse api file
-set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py)
-set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml)
-set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h)
+set(sparse_api_gen_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py)
+set(sparse_api_yaml_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml)
+set(sparse_api_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h)
 set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc)
 set(sparse_api_header_file_tmp ${sparse_api_header_file}.tmp)
 set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp)
 
 # sparse bw api file
-set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py)
-set(sparse_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml)
-set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h)
-set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc)
+set(sparse_bw_api_gen_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py)
+set(sparse_bw_api_yaml_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml)
+set(sparse_bw_api_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h)
+set(sparse_bw_api_source_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc)
 set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp)
 set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp)
 
 # strings api file
-set(strings_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py)
-set(strings_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml)
-set(strings_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h)
-set(strings_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/strings_api.cc)
+set(strings_api_gen_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py)
+set(strings_api_yaml_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml)
+set(strings_api_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h)
+set(strings_api_source_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/strings_api.cc)
 set(strings_api_header_file_tmp ${strings_api_header_file}.tmp)
 set(strings_api_source_file_tmp ${strings_api_source_file}.tmp)
 
 # wrapped infermeta file
-set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
-set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h)
-set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc)
+set(wrapped_infermeta_gen_file
+    ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
+set(wrapped_infermeta_header_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h)
+set(wrapped_infermeta_source_file
+    ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc)
 
-if (NOT PYTHON_EXECUTABLE)
+if(NOT PYTHON_EXECUTABLE)
   find_package(PythonInterp REQUIRED)
 endif()
 
 # install extra dependencies
-execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} -m pip install -U pyyaml jinja2
-)
+execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install -U pyyaml jinja2)
 
 # parse apis
 set(parsed_api_dir ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/parsed_apis)
-set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc)
-set(generated_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc)
-message("parse api yamls: 
+set(generated_op_path
+    ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc)
+set(generated_argument_mapping_path
+    ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc)
+message(
+  "parse api yamls:
 - ${api_yaml_file}
 - ${new_api_yaml_file}
 - ${bw_api_yaml_file}
@@ -85,24 +117,18 @@ message("parse api yamls:
 execute_process(
   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen
   COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_api_dir}
-  COMMAND ${PYTHON_EXECUTABLE} parse_api.py 
-      --api_yaml_path ./api.yaml 
-      --output_path ./parsed_apis/api.parsed.yaml
-  COMMAND ${PYTHON_EXECUTABLE} parse_api.py 
-      --api_yaml_path ./new_api.yaml 
-      --output_path ./parsed_apis/new_api.parsed.yaml
-  COMMAND ${PYTHON_EXECUTABLE} parse_api.py
-      --api_yaml_path ./backward.yaml
-      --output_path ./parsed_apis/backward_api.parsed.yaml
-      --backward
-  COMMAND ${PYTHON_EXECUTABLE} parse_api.py
-      --api_yaml_path ./new_backward.yaml
-      --output_path ./parsed_apis/new_backward_api.parsed.yaml
-      --backward
-  RESULTS_VARIABLE _results
-)
+  COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./api.yaml
+          --output_path ./parsed_apis/api.parsed.yaml
+  COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./new_api.yaml
+          --output_path ./parsed_apis/new_api.parsed.yaml
+  COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./backward.yaml
+          --output_path ./parsed_apis/backward_api.parsed.yaml --backward
+  COMMAND
+    ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./new_backward.yaml
+    --output_path ./parsed_apis/new_backward_api.parsed.yaml --backward
+    RESULTS_VARIABLE _results)
 foreach(_result in ${_results})
-  if (${_result})
+  if(${_result})
     message(FATAL_ERROR "api yaml parsing failed, exiting.")
   endif()
 endforeach()
@@ -113,52 +139,67 @@ message("validate api yaml:
 - ${parsed_api_dir}/new_backward_api.parsed.yaml")
 execute_process(
   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen
-  COMMAND ${PYTHON_EXECUTABLE} cross_validate.py
-      --forward_yaml_paths ./parsed_apis/api.parsed.yaml ./parsed_apis/new_api.parsed.yaml 
-      --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml ./parsed_apis/new_backward_api.parsed.yaml
-  RESULT_VARIABLE _result
-)
-if (${_result}) 
-    message(FATAL_ERROR "api validation failed, exiting." )
+  COMMAND
+    ${PYTHON_EXECUTABLE} cross_validate.py --forward_yaml_paths
+    ./parsed_apis/api.parsed.yaml ./parsed_apis/new_api.parsed.yaml
+    --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml
+    ./parsed_apis/new_backward_api.parsed.yaml
+  RESULT_VARIABLE _result)
+if(${_result})
+  message(FATAL_ERROR "api validation failed, exiting.")
 endif()
 
 # code generation for op, op makers, and argument mapping functions
-message("create or remove auto-geneated operators: ${generated_op_path}.tmp
-create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp")
+message(
+  "create or remove auto-geneated operators: ${generated_op_path}.tmp
+create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp"
+)
 execute_process(
   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen
-  COMMAND ${PYTHON_EXECUTABLE} generate_op.py
-      --api_yaml_path ./parsed_apis/new_api.parsed.yaml
-      --backward_api_yaml_path ./parsed_apis/new_backward_api.parsed.yaml
-      --output_op_path "${generated_op_path}.tmp"
-      --output_arg_map_path "${generated_argument_mapping_path}.tmp"
-  RESULT_VARIABLE _result
-)
-if (${_result})
-    message(FATAL_ERROR "operator codegen failed, exiting." )
+  COMMAND
+    ${PYTHON_EXECUTABLE} generate_op.py --api_yaml_path
+    ./parsed_apis/new_api.parsed.yaml --backward_api_yaml_path
+    ./parsed_apis/new_backward_api.parsed.yaml --output_op_path
+    "${generated_op_path}.tmp" --output_arg_map_path
+    "${generated_argument_mapping_path}.tmp"
+  RESULT_VARIABLE _result)
+if(${_result})
+  message(FATAL_ERROR "operator codegen failed, exiting.")
 endif()
 
-
 if(EXISTS "${generated_op_path}.tmp" AND EXISTS "${generated_op_path}")
-  execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${generated_op_path}.tmp" "${generated_op_path}")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                          "${generated_op_path}.tmp" "${generated_op_path}")
   message("copy if different ${generated_op_path}.tmp ${generated_op_path}")
 elseif(EXISTS "${generated_op_path}.tmp")
-  execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" "${generated_op_path}")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp"
+                          "${generated_op_path}")
   message("copy ${generated_op_path}.tmp ${generated_op_path}")
 else()
   execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_op_path}")
   message("remove ${generated_op_path}")
 endif()
 
-
-if(EXISTS "${generated_argument_mapping_path}.tmp" AND EXISTS "${generated_argument_mapping_path}")
-  execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}")
-  message("copy if different ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}")
+if(EXISTS "${generated_argument_mapping_path}.tmp"
+   AND EXISTS "${generated_argument_mapping_path}")
+  execute_process(
+    COMMAND
+      ${CMAKE_COMMAND} -E copy_if_different
+      "${generated_argument_mapping_path}.tmp"
+      "${generated_argument_mapping_path}")
+  message(
+    "copy if different ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}"
+  )
 elseif(EXISTS "${generated_argument_mapping_path}.tmp")
-  execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}")
-  message("copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}")
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp"
+            "${generated_argument_mapping_path}")
+  message(
+    "copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}"
+  )
 else()
-  execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_argument_mapping_path}")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f
+                          "${generated_argument_mapping_path}")
   message("remove ${generated_argument_mapping_path}")
 endif()
 
@@ -166,26 +207,31 @@ endif()
 add_custom_command(
   OUTPUT ${api_header_file} ${api_source_file}
   COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
-  COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file}
-                 --api_yaml_path ${api_yaml_file} ${new_api_yaml_file}
-                 --api_header_path ${api_header_file_tmp}
-                 --api_header_path ${api_header_file_tmp}
-                 --api_source_path ${api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file}
+    ${new_api_yaml_file} --api_header_path ${api_header_file_tmp}
+    --api_header_path ${api_header_file_tmp} --api_source_path
+    ${api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp}
+          ${api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp}
+          ${api_source_file}
   COMMENT "copy_if_different ${api_header_file} ${api_source_file}"
   DEPENDS ${api_yaml_file} ${api_gen_file} ${api_gen_base}
   VERBATIM)
 
 # generate backward api
 add_custom_command(
-  OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} ${bw_api_source_file_tmp}
-  COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file}
-                 --backward_yaml_path ${bw_api_yaml_file} ${new_bw_api_yaml_file}
-                 --backward_header_path ${bw_api_header_file_tmp}
-                 --backward_source_path ${bw_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp} ${bw_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} ${bw_api_source_file}
+  OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp}
+         ${bw_api_source_file_tmp}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path
+    ${bw_api_yaml_file} ${new_bw_api_yaml_file} --backward_header_path
+    ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp}
+          ${bw_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp}
+          ${bw_api_source_file}
   COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}"
   DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
   VERBATIM)
@@ -193,82 +239,177 @@ add_custom_command(
 # generate sparse api
 add_custom_command(
   OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
-  COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file}
-                 --api_yaml_path ${sparse_api_yaml_file}
-                 --api_header_path ${sparse_api_header_file_tmp}
-                 --api_source_path ${sparse_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file}
-  COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
-  DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} ${api_gen_file}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} --api_yaml_path
+    ${sparse_api_yaml_file} --api_header_path ${sparse_api_header_file_tmp}
+    --api_source_path ${sparse_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp}
+          ${sparse_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp}
+          ${sparse_api_source_file}
+  COMMENT
+    "copy_if_different ${sparse_api_header_file} ${sparse_api_source_file}"
+  DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base}
+          ${api_gen_file}
   VERBATIM)
 
 # generate backward sparse api
 add_custom_command(
   OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file}
-  COMMAND ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file}
-                 --api_yaml_path ${sparse_bw_api_yaml_file}
-                 --api_header_path ${sparse_bw_api_header_file_tmp}
-                 --api_source_path ${sparse_bw_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp} ${sparse_bw_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp} ${sparse_bw_api_source_file}
-  COMMENT "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}"
-  DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} --api_yaml_path
+    ${sparse_bw_api_yaml_file} --api_header_path
+    ${sparse_bw_api_header_file_tmp} --api_source_path
+    ${sparse_bw_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp}
+          ${sparse_bw_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp}
+          ${sparse_bw_api_source_file}
+  COMMENT
+    "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_api_source_file}"
+  DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base}
+          ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
   VERBATIM)
 
 # generate strings api
 add_custom_command(
   OUTPUT ${strings_api_header_file} ${strings_api_source_file}
-  COMMAND ${PYTHON_EXECUTABLE} ${strings_api_gen_file}
-                 --api_yaml_path ${strings_api_yaml_file}
-                 --api_header_path ${strings_api_header_file_tmp}
-                 --api_source_path ${strings_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp} ${strings_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp} ${strings_api_source_file}
-  COMMENT "copy_if_different ${strings_api_header_file} ${strings_strings_api_source_file}"
-  DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base} ${api_gen_file}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${strings_api_gen_file} --api_yaml_path
+    ${strings_api_yaml_file} --api_header_path ${strings_api_header_file_tmp}
+    --api_source_path ${strings_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp}
+          ${strings_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp}
+          ${strings_api_source_file}
+  COMMENT
+    "copy_if_different ${strings_api_header_file} ${strings_api_source_file}"
+  DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base}
+          ${api_gen_file}
   VERBATIM)
 
 # generate dygraph(intermediate) api
 add_custom_command(
   OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file}
-  COMMAND ${PYTHON_EXECUTABLE} ${im_api_gen_file}
-                 --api_yaml_path ${api_yaml_file} ${new_api_yaml_file}
-                 --sparse_api_yaml_path ${sparse_api_yaml_file}
-                 --dygraph_api_header_path ${dygraph_api_header_file_tmp}
-                 --dygraph_api_source_path ${dygraph_api_source_file_tmp}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_header_file_tmp} ${dygraph_api_header_file}
-  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp} ${dygraph_api_source_file}
-  DEPENDS ${api_yaml_file} ${sparse_api_yaml_file} ${im_api_gen_file} ${api_gen_base} ${api_gen_file}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${im_api_gen_file} --api_yaml_path ${api_yaml_file}
+    ${new_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file}
+    --dygraph_api_header_path ${dygraph_api_header_file_tmp}
+    --dygraph_api_source_path ${dygraph_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_header_file_tmp}
+          ${dygraph_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dygraph_api_source_file_tmp}
+          ${dygraph_api_source_file}
+  DEPENDS ${api_yaml_file} ${sparse_api_yaml_file} ${im_api_gen_file}
+          ${api_gen_base} ${api_gen_file}
   VERBATIM)
 
 # generate wrapped infermeta
 add_custom_command(
   OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
-  COMMAND ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file}
-                 --api_yaml_path ${api_yaml_file} ${new_api_yaml_file}
-                 --wrapped_infermeta_header_path ${wrapped_infermeta_header_file}
-                 --wrapped_infermeta_source_path ${wrapped_infermeta_source_file}
+  COMMAND
+    ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path
+    ${api_yaml_file} ${new_api_yaml_file} --wrapped_infermeta_header_path
+    ${wrapped_infermeta_header_file} --wrapped_infermeta_source_path
+    ${wrapped_infermeta_source_file}
   DEPENDS ${api_yaml_file} ${wrapped_infermeta_gen_file} ${api_gen_base}
   VERBATIM)
 
-cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw)
-cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi)
-cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place)
+cc_library(
+  op_meta_info
+  SRCS op_meta_info.cc
+  DEPS phi_tensor_raw)
+cc_library(
+  wrapped_infermeta
+  SRCS ${wrapped_infermeta_source_file}
+  DEPS phi)
+cc_library(
+  context_pool
+  SRCS context_pool.cc
+  DEPS phi_context phi_enforce place)
 
-cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool)
-cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
-cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor)
-cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform)
-cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy)
+cc_library(
+  kernel_dispatch
+  SRCS kernel_dispatch.cc
+  DEPS phi_tensor_raw phi_context kernel_factory context_pool)
+cc_library(
+  api_gen_utils
+  SRCS api_gen_utils.cc
+  DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
+cc_library(
+  phi_data_transform
+  SRCS data_transform.cc
+  DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor)
+cc_library(
+  api_custom_impl
+  SRCS api_custom_impl.cc
+  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta
+       phi_data_transform)
+cc_library(
+  sparse_api_custom_impl
+  SRCS sparse_api_custom_impl.cc
+  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform
+       tensor_copy)
 
-cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl)
-cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils)
-cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
-cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
-cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api)
-cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
-cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api)
-cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils)
-cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy)
-cc_library(api_int_array SRCS int_array.cc DEPS tensor_copy)
+cc_library(
+  phi_function_api
+  SRCS ${api_source_file}
+  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform
+       api_custom_impl)
+cc_library(
+  phi_bw_function_api
+  SRCS ${bw_api_source_file}
+  DEPS phi_tensor_raw
+       phi
+       kernel_dispatch
+       api_gen_utils
+       backward_infermeta
+       phi_data_transform
+       phi_function_api
+       api_custom_impl
+       global_utils)
+cc_library(
+  sparse_api
+  SRCS ${sparse_api_source_file}
+  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
+cc_library(
+  sparse_bw_api
+  SRCS ${sparse_bw_api_source_file}
+  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api
+       sparse_api_custom_impl)
+cc_library(
+  phi_dygraph_api
+  SRCS ${dygraph_api_source_file}
+  DEPS phi_tensor_raw
+       phi
+       kernel_dispatch
+       api_gen_utils
+       phi_data_transform
+       phi_function_api
+       sparse_api)
+cc_library(
+  strings_api
+  SRCS ${strings_api_source_file}
+  DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
+cc_library(
+  phi_tensor
+  SRCS tensor_method.cc
+  DEPS phi_tensor_raw
+       phi_function_api
+       api_gen_utils
+       kernel_dispatch
+       infermeta
+       sparse_api
+       strings_api)
+cc_library(
+  tensor_copy
+  SRCS tensor_copy.cc
+  DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils)
+cc_library(
+  api_scalar
+  SRCS scalar.cc
+  DEPS tensor_copy)
+cc_library(
+  api_int_array
+  SRCS int_array.cc
+  DEPS tensor_copy)
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index 3ef7763d57e8b..5ca7f2b51edd2 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/api/lib/api_custom_impl.h"
 
+#include "glog/logging.h"
 #include "paddle/phi/api/lib/api_gen_utils.h"
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
@@ -28,8 +29,6 @@ limitations under the License. */
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/infermeta/unary.h"
 
-#include "glog/logging.h"
-
 namespace paddle {
 namespace experimental {
 
diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h
index 2aa4f969221d9..93f8f05b74b75 100644
--- a/paddle/phi/api/lib/backend_set.h
+++ b/paddle/phi/api/lib/backend_set.h
@@ -32,8 +32,9 @@ class BackendSet final {
  public:
   constexpr BackendSet() : bitset_(0) {}
   explicit constexpr BackendSet(Backend b)
-      : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast<uint8_t>(b) -
-                                                       1)) {}
+      : bitset_(b == Backend::UNDEFINED
+                    ? 0
+                    : 1ULL << (static_cast<uint8_t>(b) - 1)) {}
 
   inline uint64_t bitset() const { return bitset_; }
 
diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index 12f7b8bba5870..4803616812cd0 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+// clang-format off
 #include "paddle/phi/api/lib/data_transform.h"
 
 #include "paddle/phi/api/lib/kernel_dispatch.h"
@@ -23,6 +24,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/transfer_layout_kernel.h"
 
 #include "paddle/fluid/framework/tensor_util.h"
+// clang-format on
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc
index 71ba8eaae2d36..0b93c96e7f81d 100644
--- a/paddle/phi/api/lib/sparse_api_custom_impl.cc
+++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/sparse_api_custom_impl.h"
 
 #include <memory>
+
 #include "glog/logging.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index a340c0fed10d8..74364d5ab0373 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+// clang-format off
 #include "paddle/phi/api/include/tensor.h"
 
 #include <memory>
@@ -34,6 +35,7 @@ limitations under the License. */
 #include "paddle/phi/core/tensor_utils.h"
 
 #include "paddle/fluid/platform/stream/cuda_stream.h"
+// clang-format on
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc
index 85de3601fd96a..5f8c2ed71e939 100644
--- a/paddle/phi/api/lib/tensor_copy.cc
+++ b/paddle/phi/api/lib/tensor_copy.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/api/lib/tensor_copy.h"
+
 #include "paddle/phi/api/lib/api_gen_utils.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/core/compat/convert_utils.h"
diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc
index 5285392b4a6ac..fbeeb3332eadb 100644
--- a/paddle/phi/api/lib/tensor_method.cc
+++ b/paddle/phi/api/lib/tensor_method.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+// clang-format off
 #include "paddle/phi/api/include/tensor.h"
 
 #include "paddle/phi/common/int_array.h"
@@ -22,6 +23,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/api_gen_utils.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/infermeta/unary.h"
+// clang-format on
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt
index 0e1cd0cb83fd4..ef99a1586285e 100644
--- a/paddle/phi/api/lib/utils/CMakeLists.txt
+++ b/paddle/phi/api/lib/utils/CMakeLists.txt
@@ -1,2 +1,13 @@
-cc_library(phi_api_utils SRCS tensor_utils.cc DEPS
-tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor int_array scalar)
+cc_library(
+  phi_api_utils
+  SRCS tensor_utils.cc
+  DEPS tensor_base
+       convert_utils
+       dense_tensor
+       lod_tensor
+       selected_rows_utils
+       place
+       var_type_traits
+       string_tensor
+       int_array
+       scalar)
diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h
index 36a0901bbe980..f930f5b11f64f 100644
--- a/paddle/phi/api/lib/utils/tensor_utils.h
+++ b/paddle/phi/api/lib/utils/tensor_utils.h
@@ -18,7 +18,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/variable.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt
index 5f61615554645..c981b625192da 100644
--- a/paddle/phi/backends/CMakeLists.txt
+++ b/paddle/phi/backends/CMakeLists.txt
@@ -12,7 +12,10 @@ if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
 
-cc_library(phi_context SRCS all_context.cc DEPS device_context cpu_context)
+cc_library(
+  phi_context
+  SRCS all_context.cc
+  DEPS device_context cpu_context)
 
 if(WITH_XPU)
   add_dependencies(phi_context xpu_context)
@@ -24,11 +27,31 @@ endif()
 
 if(WITH_CUSTOM_DEVICE)
   add_dependencies(phi_context custom_context)
-  cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place)
-  cc_library(device_guard SRCS device_guard.cc DEPS enforce place)
-  cc_library(stream SRCS stream.cc DEPS callback_manager)
-  cc_library(event SRCS event.cc DEPS enforce place)
-  cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags)
-  cc_library(device_manager SRCS device_manager.cc DEPS custom_device)
-  set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library")
+  cc_library(
+    callback_manager
+    SRCS callback_manager.cc
+    DEPS enforce place)
+  cc_library(
+    device_guard
+    SRCS device_guard.cc
+    DEPS enforce place)
+  cc_library(
+    stream
+    SRCS stream.cc
+    DEPS callback_manager)
+  cc_library(
+    event
+    SRCS event.cc
+    DEPS enforce place)
+  cc_library(
+    device_base
+    SRCS device_base.cc
+    DEPS stream event callback_manager device_guard device_context flags)
+  cc_library(
+    device_manager
+    SRCS device_manager.cc
+    DEPS custom_device)
+  set(GLOB_DEV_LIB
+      device_manager custom_device
+      CACHE INTERNAL "Global DEV library")
 endif()
diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc
index 4a958ef73bfc6..295f70fc65cd7 100644
--- a/paddle/phi/backends/callback_manager.cc
+++ b/paddle/phi/backends/callback_manager.cc
@@ -13,11 +13,12 @@
 // limitations under the License.
 
 #include "paddle/phi/backends/callback_manager.h"
-#include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/fluid/platform/enforce.h"
 
 #include <ThreadPool.h>
 
+#include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/fluid/platform/enforce.h"
+
 namespace phi {
 
 CallbackManager::CallbackManager(stream::Stream *stream)
diff --git a/paddle/phi/backends/cpu/CMakeLists.txt b/paddle/phi/backends/cpu/CMakeLists.txt
index 82ea42566fc1f..e32aa17758b2b 100644
--- a/paddle/phi/backends/cpu/CMakeLists.txt
+++ b/paddle/phi/backends/cpu/CMakeLists.txt
@@ -1,6 +1,12 @@
 if(WITH_MKLDNN)
   # TODO(wilber): support mkldnn context.
-  cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context mkldnn eigen3)
+  cc_library(
+    cpu_context
+    SRCS cpu_context.cc
+    DEPS phi_device_context mkldnn eigen3)
 else()
-  cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context eigen3)
+  cc_library(
+    cpu_context
+    SRCS cpu_context.cc
+    DEPS phi_device_context eigen3)
 endif()
diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt
index 5b46afb4ce9ee..d8ed6706eba22 100644
--- a/paddle/phi/backends/custom/CMakeLists.txt
+++ b/paddle/phi/backends/custom/CMakeLists.txt
@@ -1,5 +1,14 @@
-if (WITH_CUSTOM_DEVICE)
-  cc_library(custom_context SRCS custom_context.cc DEPS phi_device_context device_manager)
-  cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context)
-  cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context)
+if(WITH_CUSTOM_DEVICE)
+  cc_library(
+    custom_context
+    SRCS custom_context.cc
+    DEPS phi_device_context device_manager)
+  cc_library(
+    custom_device
+    SRCS custom_device.cc
+    DEPS device_base device_context)
+  cc_test(
+    custom_device_test
+    SRCS custom_device_test.cc
+    DEPS device_manager device_context)
 endif()
diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h
index 37b0ee21219b5..57be8534fa954 100644
--- a/paddle/phi/backends/custom/custom_context.h
+++ b/paddle/phi/backends/custom/custom_context.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/device_context.h"
 
diff --git a/paddle/phi/backends/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc
index 53b88f9b4ac79..51fa74b4dc5f3 100644
--- a/paddle/phi/backends/custom/custom_device_test.cc
+++ b/paddle/phi/backends/custom/custom_device_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <string>
 
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc
index b72c6efd51f2c..e57653702c538 100644
--- a/paddle/phi/backends/device_base.cc
+++ b/paddle/phi/backends/device_base.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/backends/device_base.h"
+
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "paddle/phi/core/enforce.h"
@@ -214,8 +215,9 @@ size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) {
   size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                            : FLAGS_initial_gpu_memory_in_mb;
   size_t alloc_bytes =
-      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
-                                           FLAGS_fraction_of_gpu_memory_to_use);
+      (flag_mb > 0ul
+           ? flag_mb << 20
+           : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
   PADDLE_ENFORCE_GE(available_to_alloc,
                     alloc_bytes,
                     phi::errors::ResourceExhausted(
diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h
index 749d8d323b62d..ff58f4f35fd32 100644
--- a/paddle/phi/backends/device_ext.h
+++ b/paddle/phi/backends/device_ext.h
@@ -34,7 +34,9 @@ typedef enum {
   C_INTERNAL_ERROR  // plugin error
 } C_Status;
 
-typedef struct C_Device_st { int id; } * C_Device;
+typedef struct C_Device_st {
+  int id;
+} * C_Device;
 
 typedef struct C_Stream_st* C_Stream;
 
diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h
index 18d51687ef121..56d99ba43bdd1 100644
--- a/paddle/phi/backends/device_manager.h
+++ b/paddle/phi/backends/device_manager.h
@@ -19,11 +19,10 @@
 
 #include "paddle/phi/backends/device_base.h"
 #include "paddle/phi/backends/device_ext.h"
+#include "paddle/phi/backends/dynload/port.h"
 #include "paddle/phi/backends/event.h"
 #include "paddle/phi/backends/stream.h"
 #include "paddle/phi/common/place.h"
-
-#include "paddle/phi/backends/dynload/port.h"
 #include "paddle/phi/core/utils/rw_lock.h"
 
 namespace phi {
diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt
index bc5ef3cd5c078..91dbafe0cd38d 100644
--- a/paddle/phi/backends/dynload/CMakeLists.txt
+++ b/paddle/phi/backends/dynload/CMakeLists.txt
@@ -1,57 +1,94 @@
-cc_library(phi_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags)
+cc_library(
+  phi_dynamic_loader
+  SRCS dynamic_loader.cc
+  DEPS enforce glog gflags)
 
-list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc  nvtx.cc cufft.cc)
+list(
+  APPEND
+  CUDA_SRCS
+  cublas.cc
+  cublasLt.cc
+  cudnn.cc
+  curand.cc
+  cusolver.cc
+  cusparse.cc
+  nvtx.cc
+  cufft.cc)
 
-if (NOT WITH_NV_JETSON)
+if(NOT WITH_NV_JETSON)
   list(APPEND CUDA_SRCS nvjpeg.cc)
 endif()
 
-if (WITH_ROCM)
+if(WITH_ROCM)
   list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
 endif()
 
 # There is no macOS version of NCCL.
 # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows.
-if (NOT APPLE)
+if(NOT APPLE)
   list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
-  if (WITH_NCCL)
+  if(WITH_NCCL)
     list(APPEND CUDA_SRCS nccl.cc)
   endif()
-  if (WITH_ROCM)
+  if(WITH_ROCM)
     list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
-    if (WITH_RCCL)
+    if(WITH_RCCL)
       list(APPEND HIP_SRCS rccl.cc)
     endif()
   endif()
 endif()
 
-if (TENSORRT_FOUND)
+if(TENSORRT_FOUND)
   list(APPEND CUDA_SRCS tensorrt.cc)
 endif()
 
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
-if (CUPTI_FOUND)
+if(CUPTI_FOUND)
   list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 if(WITH_ROCM)
-  hip_library(phi_dynload_cuda SRCS ${HIP_SRCS} DEPS phi_dynamic_loader)
-  cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc)
-elseif (WITH_ASCEND_CL)
-  cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc npu_hccl)
+  hip_library(
+    phi_dynload_cuda
+    SRCS ${HIP_SRCS}
+    DEPS phi_dynamic_loader)
+  cc_library(
+    phi_dynload_warpctc
+    SRCS warpctc.cc
+    DEPS phi_dynamic_loader warpctc)
+elseif(WITH_ASCEND_CL)
+  cc_library(
+    phi_dynload_warpctc
+    SRCS warpctc.cc
+    DEPS phi_dynamic_loader warpctc npu_hccl)
 else()
-  nv_library(phi_dynload_cuda SRCS ${CUDA_SRCS} DEPS phi_dynamic_loader)
-  cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc)
+  nv_library(
+    phi_dynload_cuda
+    SRCS ${CUDA_SRCS}
+    DEPS phi_dynamic_loader)
+  cc_library(
+    phi_dynload_warpctc
+    SRCS warpctc.cc
+    DEPS phi_dynamic_loader warpctc)
 endif()
-if (WITH_MKLML)
-  cc_library(phi_dynload_mklml SRCS mklml.cc DEPS phi_dynamic_loader mklml)
+if(WITH_MKLML)
+  cc_library(
+    phi_dynload_mklml
+    SRCS mklml.cc
+    DEPS phi_dynamic_loader mklml)
 endif()
 
-cc_library(phi_dynload_lapack SRCS lapack.cc DEPS phi_dynamic_loader)
+cc_library(
+  phi_dynload_lapack
+  SRCS lapack.cc
+  DEPS phi_dynamic_loader)
 add_dependencies(phi_dynload_lapack extern_lapack)
 # TODO(TJ): add iomp, mkldnn?
 
-if (MKL_FOUND AND WITH_ONEMKL)
+if(MKL_FOUND AND WITH_ONEMKL)
   message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
-  cc_library(phi_dynload_mklrt SRCS mklrt.cc DEPS phi_dynamic_loader)
+  cc_library(
+    phi_dynload_mklrt
+    SRCS mklrt.cc
+    DEPS phi_dynamic_loader)
   target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE})
 endif()
diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h
index ee0696fb4b218..308ae2accef14 100644
--- a/paddle/phi/backends/dynload/cublas.h
+++ b/paddle/phi/backends/dynload/cublas.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cublasXt.h>
 #include <cublas_v2.h>
 #include <cuda.h>
+
 #include <mutex>  // NOLINT
 #include <type_traits>
 
diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h
index 4c7ac9c3f21c4..1e2a20ebdf440 100644
--- a/paddle/phi/backends/dynload/cublasLt.h
+++ b/paddle/phi/backends/dynload/cublasLt.h
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include <cublasLt.h>
 #include <cuda.h>
+
 #include <mutex>  // NOLINT
 #include <type_traits>
 
diff --git a/paddle/phi/backends/dynload/cuda_driver.h b/paddle/phi/backends/dynload/cuda_driver.h
index f4ea70a81b91f..f743a33a1866f 100644
--- a/paddle/phi/backends/dynload/cuda_driver.h
+++ b/paddle/phi/backends/dynload/cuda_driver.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cuda.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc
index 02d626d5f98f9..8aa3b623273d7 100644
--- a/paddle/phi/backends/dynload/cudnn.cc
+++ b/paddle/phi/backends/dynload/cudnn.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cudnn.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace phi {
diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h
index a3afb98e3e636..7b9004308e95b 100644
--- a/paddle/phi/backends/dynload/cudnn.h
+++ b/paddle/phi/backends/dynload/cudnn.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc
index 596a68c1ed6aa..5a7080032d28d 100644
--- a/paddle/phi/backends/dynload/cufft.cc
+++ b/paddle/phi/backends/dynload/cufft.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cufft.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace phi {
diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h
index 4697e335477ec..a27d7c3ab1eee 100644
--- a/paddle/phi/backends/dynload/cufft.h
+++ b/paddle/phi/backends/dynload/cufft.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cufft.h>
 #include <cufftXt.h>
 #include <glog/logging.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h
index a526fbfd92639..22e21b78f4f2e 100644
--- a/paddle/phi/backends/dynload/cupti.h
+++ b/paddle/phi/backends/dynload/cupti.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cuda.h>
 #include <cuda_occupancy.h>
 #include <cupti.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/curand.h b/paddle/phi/backends/dynload/curand.h
index 875403b03bb81..f3c4496dc4d39 100644
--- a/paddle/phi/backends/dynload/curand.h
+++ b/paddle/phi/backends/dynload/curand.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <curand.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h
index 40e5f183dc035..1354e31055480 100644
--- a/paddle/phi/backends/dynload/cusolver.h
+++ b/paddle/phi/backends/dynload/cusolver.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <cuda.h>
 #include <cusolverDn.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h
index 8f7d54d55dbc4..a7e305f98d49a 100644
--- a/paddle/phi/backends/dynload/cusparse.h
+++ b/paddle/phi/backends/dynload/cusparse.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <cuda.h>
 #include <cusparse.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/hiprand.h b/paddle/phi/backends/dynload/hiprand.h
index ccaf02d93047a..3e9502dd94d91 100644
--- a/paddle/phi/backends/dynload/hiprand.h
+++ b/paddle/phi/backends/dynload/hiprand.h
@@ -16,9 +16,9 @@ limitations under the License. */
 #include <hiprand.h>
 
 #include <mutex>  // NOLINT
-#include "paddle/phi/backends/dynload/port.h"
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
+#include "paddle/phi/backends/dynload/port.h"
 
 namespace phi {
 namespace dynload {
diff --git a/paddle/phi/backends/dynload/hiprtc.h b/paddle/phi/backends/dynload/hiprtc.h
index 0404aad559394..75dd88f87bd3a 100644
--- a/paddle/phi/backends/dynload/hiprtc.h
+++ b/paddle/phi/backends/dynload/hiprtc.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <hip/hiprtc.h>
+
 #include <mutex>  // NOLINT
+
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include "paddle/phi/backends/dynload/port.h"
 
diff --git a/paddle/phi/backends/dynload/lapack.cc b/paddle/phi/backends/dynload/lapack.cc
index bb03beabd4ffc..9719da9775146 100644
--- a/paddle/phi/backends/dynload/lapack.cc
+++ b/paddle/phi/backends/dynload/lapack.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/backends/dynload/lapack.h"
+
 #include <mutex>
 
 namespace phi {
diff --git a/paddle/phi/backends/dynload/lapack.h b/paddle/phi/backends/dynload/lapack.h
index c81c66c69282f..f0e1e9ad7a4c0 100644
--- a/paddle/phi/backends/dynload/lapack.h
+++ b/paddle/phi/backends/dynload/lapack.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <complex>
 #include <mutex>
+
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include "paddle/phi/backends/dynload/port.h"
diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc
index e7916873ccfde..9c58da1d6ff1a 100644
--- a/paddle/phi/backends/dynload/miopen.cc
+++ b/paddle/phi/backends/dynload/miopen.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/backends/dynload/miopen.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace phi {
diff --git a/paddle/phi/backends/dynload/miopen.h b/paddle/phi/backends/dynload/miopen.h
index eb14bfe8ec543..eeaf8028ec312 100644
--- a/paddle/phi/backends/dynload/miopen.h
+++ b/paddle/phi/backends/dynload/miopen.h
@@ -14,10 +14,11 @@ limitations under the License. */
 
 #pragma once
 #include <glog/logging.h>
-
 #include <miopen/miopen.h>
 #include <miopen/version.h>
+
 #include <mutex>  // NOLINT
+
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include "paddle/phi/backends/dynload/port.h"
 
diff --git a/paddle/phi/backends/dynload/mklml.h b/paddle/phi/backends/dynload/mklml.h
index 5f5520a831eb1..0f0c31f8064df 100644
--- a/paddle/phi/backends/dynload/mklml.h
+++ b/paddle/phi/backends/dynload/mklml.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <mkl.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/mklrt.h b/paddle/phi/backends/dynload/mklrt.h
index 8638d83d025bd..0267fb69a5932 100644
--- a/paddle/phi/backends/dynload/mklrt.h
+++ b/paddle/phi/backends/dynload/mklrt.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <mkl_dfti.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/nccl.h b/paddle/phi/backends/dynload/nccl.h
index b04ef0f0651eb..6c73c562caa69 100644
--- a/paddle/phi/backends/dynload/nccl.h
+++ b/paddle/phi/backends/dynload/nccl.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <nccl.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h
index 13bb8a5698f15..6e71e6b582c05 100644
--- a/paddle/phi/backends/dynload/nvjpeg.h
+++ b/paddle/phi/backends/dynload/nvjpeg.h
@@ -12,6 +12,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 #include <nvjpeg.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/nvrtc.h b/paddle/phi/backends/dynload/nvrtc.h
index 516ca7686d253..9244e9487b250 100644
--- a/paddle/phi/backends/dynload/nvrtc.h
+++ b/paddle/phi/backends/dynload/nvrtc.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <nvrtc.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h
index e9fd32668dc80..a9a166b289e33 100644
--- a/paddle/phi/backends/dynload/nvtx.h
+++ b/paddle/phi/backends/dynload/nvtx.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifndef _WIN32
 #include <cuda.h>
 #include <nvToolsExt.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/port.h b/paddle/phi/backends/dynload/port.h
index 981e5f5af644e..d380993c9b67a 100644
--- a/paddle/phi/backends/dynload/port.h
+++ b/paddle/phi/backends/dynload/port.h
@@ -28,6 +28,7 @@
 #include <dlfcn.h>  // dladdr
 #include <sys/stat.h>
 #include <sys/time.h>
+
 #include <algorithm>  // std::accumulate
 #else
 #ifndef NOMINMAX
@@ -40,6 +41,7 @@
 #include <stdio.h>
 #include <windows.h>
 #include <winsock.h>
+
 #include <numeric>  // std::accumulate in msvc
 #ifndef S_ISDIR     // windows port for sys/stat.h
 #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h
index 4472684962832..2da35dc2df2db 100644
--- a/paddle/phi/backends/dynload/rccl.h
+++ b/paddle/phi/backends/dynload/rccl.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <rccl.h>
 
 #include <mutex>  // NOLINT
+
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include "paddle/phi/backends/dynload/port.h"
 
diff --git a/paddle/phi/backends/dynload/rocblas.h b/paddle/phi/backends/dynload/rocblas.h
index 18061b192e465..a9804b3d82a7d 100644
--- a/paddle/phi/backends/dynload/rocblas.h
+++ b/paddle/phi/backends/dynload/rocblas.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <hip/hip_runtime.h>
 #include <rocblas.h>
+
 #include <mutex>  // NOLINT
 #include <type_traits>
 
diff --git a/paddle/phi/backends/dynload/rocm_driver.h b/paddle/phi/backends/dynload/rocm_driver.h
index 59e35b787a599..4e456db44c904 100644
--- a/paddle/phi/backends/dynload/rocm_driver.h
+++ b/paddle/phi/backends/dynload/rocm_driver.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <hip/hip_runtime.h>
+
 #include <mutex>  // NOLINT
 
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
diff --git a/paddle/phi/backends/dynload/tensorrt.cc b/paddle/phi/backends/dynload/tensorrt.cc
index cc3b4e0146088..4552570102025 100644
--- a/paddle/phi/backends/dynload/tensorrt.cc
+++ b/paddle/phi/backends/dynload/tensorrt.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/phi/backends/dynload/tensorrt.h"
+
 #include <string>
 
 namespace phi {
diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc
index a474536f865c1..43077d280f360 100644
--- a/paddle/phi/backends/event.cc
+++ b/paddle/phi/backends/event.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/backends/event.h"
+
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/phi/backends/device_guard.h"
 #include "paddle/phi/backends/stream.h"
diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt
index ebe8f1ca4c101..6d9f2de67d530 100644
--- a/paddle/phi/backends/gpu/CMakeLists.txt
+++ b/paddle/phi/backends/gpu/CMakeLists.txt
@@ -1,10 +1,22 @@
 if(WITH_GPU)
   add_subdirectory(cuda)
-  nv_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_cuda_info gflags glog enforce phi_dynload_cuda)
+  nv_library(
+    phi_gpu_info
+    SRCS gpu_info.cc
+    DEPS phi_cuda_info gflags glog enforce phi_dynload_cuda)
 elseif(WITH_ROCM)
   add_subdirectory(rocm)
-  hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda)
+  hip_library(
+    phi_gpu_info
+    SRCS gpu_info.cc
+    DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda)
 endif()
 
-cc_library(gpu_resources SRCS gpu_resources.cc DEPS phi_device_context phi_gpu_info)
-cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3 gpu_resources)
+cc_library(
+  gpu_resources
+  SRCS gpu_resources.cc
+  DEPS phi_device_context phi_gpu_info)
+cc_library(
+  gpu_context
+  SRCS gpu_context.cc
+  DEPS phi_device_context phi_gpu_info eigen3 gpu_resources)
diff --git a/paddle/phi/backends/gpu/cuda/CMakeLists.txt b/paddle/phi/backends/gpu/cuda/CMakeLists.txt
index a3393f97d7559..9765f5dc03b5a 100644
--- a/paddle/phi/backends/gpu/cuda/CMakeLists.txt
+++ b/paddle/phi/backends/gpu/cuda/CMakeLists.txt
@@ -1 +1,4 @@
-nv_library(phi_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce phi_dynload_cuda)
+nv_library(
+  phi_cuda_info
+  SRCS cuda_info.cc
+  DEPS gflags glog enforce phi_dynload_cuda)
diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
index 08670832c775f..c62addfd257ab 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_helper.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h
@@ -60,7 +60,7 @@ namespace gpu {
  *      }
  *    }
  *
-*/
+ */
 
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)            \
   int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index e8c264b884fe3..f51f287ee4a08 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 #include <mutex>
 
 #include "glog/logging.h"
-
 #include "paddle/phi/api/ext/exception.h"
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h
index db9f287041dfb..5246155131dbe 100644
--- a/paddle/phi/backends/gpu/gpu_context.h
+++ b/paddle/phi/backends/gpu/gpu_context.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <array>
 #include <functional>
 #include <mutex>
+
 #include "paddle/phi/backends/gpu/forwards.h"
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/backends/gpu/gpu_helper.h"
diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h
index 443830acf4793..323565c000a1c 100644
--- a/paddle/phi/backends/gpu/gpu_info.h
+++ b/paddle/phi/backends/gpu/gpu_info.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 #include <stddef.h>
+
 #include <array>
 #include <string>
 #include <vector>
diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h
index 888b44632ea28..2dd1431ff58bb 100644
--- a/paddle/phi/backends/gpu/gpu_launch_config.h
+++ b/paddle/phi/backends/gpu/gpu_launch_config.h
@@ -25,9 +25,11 @@
 #endif
 
 #include <stddef.h>
+
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/enforce.h"
 
@@ -95,9 +97,9 @@ struct GpuLaunchConfig {
 };
 
 /* According to NVIDIA, if number of threads per block is 64/128/256/512,
-  * cuda performs better. And number of blocks should be greater (at least
-  * 2x~4x) than number of SMs. Hence, SM count is took into account within
-  * this function to determine the right number of threads per block. */
+ * cuda performs better. And number of blocks should be greater (at least
+ * 2x~4x) than number of SMs. Hence, SM count is took into account within
+ * this function to determine the right number of threads per block. */
 inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context,
                                             int64_t numel,
                                             int vec_size = 1) {
diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h
index 07ccb6215409a..7bec5eebf5886 100644
--- a/paddle/phi/backends/gpu/gpu_resources.h
+++ b/paddle/phi/backends/gpu/gpu_resources.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <array>
+
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/common/place.h"
 
diff --git a/paddle/phi/backends/gpu/rocm/CMakeLists.txt b/paddle/phi/backends/gpu/rocm/CMakeLists.txt
index 257e4cc8afbcf..730aad5d2fd2b 100644
--- a/paddle/phi/backends/gpu/rocm/CMakeLists.txt
+++ b/paddle/phi/backends/gpu/rocm/CMakeLists.txt
@@ -1 +1,4 @@
-hip_library(phi_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce phi_dynload_cuda)
+hip_library(
+  phi_rocm_info
+  SRCS rocm_info.cc
+  DEPS gflags glog enforce phi_dynload_cuda)
diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h
index 2d75b6ea4cb71..14e9ca660bdf9 100644
--- a/paddle/phi/backends/gpu/rocm/rocm_helper.h
+++ b/paddle/phi/backends/gpu/rocm/rocm_helper.h
@@ -60,7 +60,7 @@ namespace gpu {
  *      }
  *    }
  *
-*/
+ */
 
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                     \
   int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \
diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc
index 23e58d34b2572..b89d5a3c1624f 100644
--- a/paddle/phi/backends/gpu/rocm/rocm_info.cc
+++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <array>
+
 #include "paddle/phi/backends/gpu/gpu_info.h"
 
 // TODO(phi): remove fluid headers.
diff --git a/paddle/phi/backends/stream.cc b/paddle/phi/backends/stream.cc
index 30939f31fcc3c..f8b15bdbd9e63 100644
--- a/paddle/phi/backends/stream.cc
+++ b/paddle/phi/backends/stream.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/backends/stream.h"
+
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/phi/backends/device_guard.h"
 #include "paddle/phi/backends/event.h"
diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt
index 4d885757bb1a6..861b57956ba8e 100644
--- a/paddle/phi/backends/xpu/CMakeLists.txt
+++ b/paddle/phi/backends/xpu/CMakeLists.txt
@@ -1,2 +1,8 @@
-cc_library(phi_xpu_info SRCS xpu_info.cc DEPS enforce xpulib phi_place)
-cc_library(xpu_context SRCS xpu_context.cc DEPS phi_device_context phi_xpu_info)
+cc_library(
+  phi_xpu_info
+  SRCS xpu_info.cc
+  DEPS enforce xpulib phi_place)
+cc_library(
+  xpu_context
+  SRCS xpu_context.cc
+  DEPS phi_device_context phi_xpu_info)
diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h
index 29b048ead852d..30095e3a0074a 100644
--- a/paddle/phi/backends/xpu/enforce_xpu.h
+++ b/paddle/phi/backends/xpu/enforce_xpu.h
@@ -14,11 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/backends/xpu/xpu_header.h"
 #include "xpu/bkcl.h"
 
-#include "paddle/fluid/platform/enforce.h"
-
 namespace phi {
 namespace backends {
 namespace xpu {
diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc
index 7cc9eb44bc488..dbff88c0a2709 100644
--- a/paddle/phi/backends/xpu/xpu_context.cc
+++ b/paddle/phi/backends/xpu/xpu_context.cc
@@ -18,7 +18,6 @@
 
 #include "paddle/phi/api/ext/exception.h"
 #include "paddle/phi/common/place.h"
-
 #include "xpu/runtime.h"
 #include "xpu/runtime_ex.h"
 #include "xpu/xdnn.h"
@@ -86,8 +85,8 @@ struct XPUContext::Impl {
   void Init() {
     owned_ = true;
     backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
-    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
-                            << static_cast<int>(place_.device);
+    LOG_FIRST_N(WARNING, 1)
+        << "Please NOTE: xpu device: " << static_cast<int>(place_.device);
     context_ = xpu::create_context();
     xpu_version_ = backends::xpu::get_xpu_version(place_.device);
     SetL3Cache();
diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h
index b87489c567cab..d39b3c9cc1ff7 100644
--- a/paddle/phi/backends/xpu/xpu_context.h
+++ b/paddle/phi/backends/xpu/xpu_context.h
@@ -15,12 +15,12 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
-#include "paddle/phi/backends/xpu/forwards.h"
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/core/device_context.h"
 
+#include "paddle/phi/backends/xpu/forwards.h"
 #include "paddle/phi/backends/xpu/xpu_header.h"
 #include "paddle/phi/backends/xpu/xpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/device_context.h"
 
 namespace xpu = baidu::xpu::api;
 
diff --git a/paddle/phi/backends/xpu/xpu_header.h b/paddle/phi/backends/xpu/xpu_header.h
index 5337f78c64207..1fe6f6d07796f 100644
--- a/paddle/phi/backends/xpu/xpu_header.h
+++ b/paddle/phi/backends/xpu/xpu_header.h
@@ -22,7 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
-
 #include "xpu/runtime.h"
 #include "xpu/runtime_ex.h"
 #include "xpu/xdnn.h"
diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h
index b1056cdc4b14b..9d5f073eaa8e6 100644
--- a/paddle/phi/backends/xpu/xpu_info.h
+++ b/paddle/phi/backends/xpu/xpu_info.h
@@ -12,6 +12,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/phi/common/place.h"
 
 namespace phi {
diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt
index b1ca4d1f8a8c6..d9266bd06d278 100644
--- a/paddle/phi/common/CMakeLists.txt
+++ b/paddle/phi/common/CMakeLists.txt
@@ -1,3 +1,9 @@
 cc_library(phi_place SRCS place.cc)
-cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor)
-cc_library(int_array SRCS int_array.cc DEPS phi_enforce tensor)
+cc_library(
+  scalar
+  SRCS scalar.cc
+  DEPS phi_enforce tensor)
+cc_library(
+  int_array
+  SRCS int_array.cc
+  DEPS phi_enforce tensor)
diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h
index 1792cb9370673..ef9b425048298 100644
--- a/paddle/phi/common/data_type.h
+++ b/paddle/phi/common/data_type.h
@@ -14,11 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/api/ext/exception.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
-
-#include "paddle/phi/api/ext/exception.h"
 #include "paddle/phi/common/pstring.h"
 
 namespace paddle {
diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc
index daed2b6625a9e..81701ee010ca2 100644
--- a/paddle/phi/common/int_array.cc
+++ b/paddle/phi/common/int_array.cc
@@ -14,9 +14,8 @@ limitations under the License. */
 
 #include "paddle/phi/common/int_array.h"
 
-#include "paddle/phi/common/place.h"
-
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/common/place.h"
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc
index 667d0a32b93da..c15a17651b18b 100644
--- a/paddle/phi/common/place.cc
+++ b/paddle/phi/common/place.cc
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "glog/logging.h"
-
 #include "paddle/phi/api/ext/exception.h"
 
 namespace phi {
diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc
index 41f1c9541823d..2954af086ac4c 100644
--- a/paddle/phi/common/scalar.cc
+++ b/paddle/phi/common/scalar.cc
@@ -14,11 +14,10 @@ limitations under the License. */
 
 #include "paddle/phi/common/scalar.h"
 
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
-
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
 namespace paddle {
 namespace experimental {
 
diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt
index 41f654bfc8f30..8b180a2c2aeff 100644
--- a/paddle/phi/core/CMakeLists.txt
+++ b/paddle/phi/core/CMakeLists.txt
@@ -6,30 +6,78 @@ set(phi_enforce_deps errors flags)
 if(WITH_GPU)
   set(phi_enforce_deps ${phi_enforce_deps} external_error_proto)
 endif()
-cc_library(phi_enforce SRCS enforce.cc DEPS ${phi_enforce_deps})
+cc_library(
+  phi_enforce
+  SRCS enforce.cc
+  DEPS ${phi_enforce_deps})
 
-cc_library(kernel_factory SRCS kernel_factory.cc DEPS phi_enforce fluid_convert_utils)
-cc_library(kernel_context SRCS kernel_context.cc DEPS phi_enforce phi_context)
+cc_library(
+  kernel_factory
+  SRCS kernel_factory.cc
+  DEPS phi_enforce fluid_convert_utils)
+cc_library(
+  kernel_context
+  SRCS kernel_context.cc
+  DEPS phi_enforce phi_context)
 
-cc_library(ddim SRCS ddim.cc DEPS phi_enforce)
-cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce)
-cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce)
-cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce)
+cc_library(
+  ddim
+  SRCS ddim.cc
+  DEPS phi_enforce)
+cc_library(
+  tensor_base
+  SRCS tensor_base.cc allocator.cc
+  DEPS phi_enforce)
+cc_library(
+  tensor_meta
+  SRCS tensor_meta.cc
+  DEPS phi_enforce)
+cc_library(
+  lod_utils
+  SRCS lod_utils.cc
+  DEPS phi_enforce)
 
-cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils fluid_convert_utils tensor_meta tensor_base)
-cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base)
-cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base)
-cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta tensor_base)
+cc_library(
+  dense_tensor
+  SRCS dense_tensor.cc dense_tensor_impl.cc
+  DEPS convert_utils fluid_convert_utils tensor_meta tensor_base)
+cc_library(
+  sparse_coo_tensor
+  SRCS sparse_coo_tensor.cc
+  DEPS tensor_meta tensor_base)
+cc_library(
+  sparse_csr_tensor
+  SRCS sparse_csr_tensor.cc
+  DEPS dense_tensor tensor_base)
+cc_library(
+  string_tensor
+  SRCS string_tensor.cc
+  DEPS convert_utils tensor_meta tensor_base)
 
-cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
-cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
-cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy)
-cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
+cc_library(
+  meta_tensor
+  SRCS meta_tensor.cc
+  DEPS tensor_base tensor_meta dense_tensor)
+cc_library(
+  infermeta_utils
+  SRCS infermeta_utils.cc
+  DEPS meta_tensor)
+cc_library(
+  selected_rows
+  SRCS selected_rows_impl.cc selected_rows.cc
+  DEPS tensor_base dense_tensor phi_enforce ddim memcpy)
+cc_library(
+  phi_device_context
+  SRCS device_context.cc
+  DEPS dense_tensor selected_rows)
 
-cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory)
+cc_library(
+  custom_kernel
+  SRCS custom_kernel.cc
+  DEPS kernel_factory)
 
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
-    add_dependencies(dense_tensor mkldnn)
-    add_dependencies(tensor_base mkldnn)
+  add_dependencies(dense_tensor mkldnn)
+  add_dependencies(tensor_base mkldnn)
 endif()
diff --git a/paddle/phi/core/compat/CMakeLists.txt b/paddle/phi/core/compat/CMakeLists.txt
index 3423e380970df..3fd9b74255c1d 100644
--- a/paddle/phi/core/compat/CMakeLists.txt
+++ b/paddle/phi/core/compat/CMakeLists.txt
@@ -1,5 +1,11 @@
-cc_library(arg_map_context SRCS arg_map_context.cc DEPS phi_enforce)
-cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce)
+cc_library(
+  arg_map_context
+  SRCS arg_map_context.cc
+  DEPS phi_enforce)
+cc_library(
+  op_utils
+  SRCS op_utils.cc
+  DEPS arg_map_context enforce)
 
 set(convert_utils_deps data_type place op_utils)
 
@@ -13,4 +19,7 @@ endif()
 if(WITH_CUSTOM_DEVICE)
   set(convert_utils_deps ${convert_utils_deps} device_manager)
 endif()
-cc_library(convert_utils SRCS convert_utils.cc DEPS ${convert_utils_deps})
+cc_library(
+  convert_utils
+  SRCS convert_utils.cc
+  DEPS ${convert_utils_deps})
diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h
index 8eb6524e79c0f..ae3b8924ece69 100644
--- a/paddle/phi/core/compat/op_utils.h
+++ b/paddle/phi/core/compat/op_utils.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <unordered_set>
 
 #include "glog/logging.h"
-
 #include "paddle/phi/core/compat/arg_map_context.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h
index dd13081ddafff..794d7051aee58 100644
--- a/paddle/phi/core/ddim.h
+++ b/paddle/phi/core/ddim.h
@@ -238,10 +238,10 @@ int arity(const DDim& ddim);
 std::ostream& operator<<(std::ostream&, const DDim&);
 
 /**
-* \brief Flatten dim to 3d
-* e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6})
-*       flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30}
-*/
+ * \brief Flatten dim to 3d
+ * e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6})
+ *       flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30}
+ */
 DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims);
 
 // Reshape a tensor to a matrix. The matrix's first dimension(column length)
diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h
index 06d3e435bc110..09098705b11e4 100644
--- a/paddle/phi/core/dense_tensor.h
+++ b/paddle/phi/core/dense_tensor.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/phi/core/tensor_meta.h"
 
 /* @jim19930609: Move to MKLDNN_Tensor in the future
-    */
+ */
 #ifdef PADDLE_WITH_MKLDNN
 #include "dnnl.hpp"
 #endif
diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc
index 8c97b6bf223fb..a59b910b7e006 100644
--- a/paddle/phi/core/dense_tensor_impl.cc
+++ b/paddle/phi/core/dense_tensor_impl.cc
@@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/core/dense_tensor.h"
-
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
-
 #include "paddle/phi/core/compat/convert_utils.h"
-
-#include "paddle/fluid/memory/malloc.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_utils.h"
diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc
index 0f5f22b5bd1f4..ce57f4f627baa 100644
--- a/paddle/phi/core/device_context.cc
+++ b/paddle/phi/core/device_context.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/core/device_context.h"
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/selected_rows.h"
diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h
index d7c2c777ca632..45e4fbf64dc04 100644
--- a/paddle/phi/core/device_context.h
+++ b/paddle/phi/core/device_context.h
@@ -75,17 +75,17 @@ class PADDLE_API DeviceContext {
   void SetHostAllocator(const Allocator*);
 
   /**
-  * @brief Set the zero-size Allocator object.
-  *
-  * @param allocator
-  */
+   * @brief Set the zero-size Allocator object.
+   *
+   * @param allocator
+   */
   void SetZeroAllocator(const Allocator*);
 
   /**
-  * @brief Set the zero-size Allocator object.
-  *
-  * @param allocator
-  */
+   * @brief Set the zero-size Allocator object.
+   *
+   * @param allocator
+   */
   void SetPinnedAllocator(const Allocator*);
 
   /**
@@ -135,10 +135,10 @@ class PADDLE_API DeviceContext {
   virtual void Wait() const {}
 
   /**
-  * @brief Set the generator for special op.
-  *
-  * @param Generator
-  */
+   * @brief Set the generator for special op.
+   *
+   * @param Generator
+   */
   void SetGenerator(Generator*);
   /**
    * @brief Get the generator object.
@@ -148,10 +148,10 @@ class PADDLE_API DeviceContext {
   Generator* GetGenerator() const;
 
   /**
-  * @brief Set the host generator for special op.
-  *
-  * @param Generator
-  */
+   * @brief Set the host generator for special op.
+   *
+   * @param Generator
+   */
   void SetHostGenerator(Generator*);
   /**
    * @brief Get the host generator object.
diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc
index ae6b0135b3222..91e0316ff7558 100644
--- a/paddle/phi/core/enforce.cc
+++ b/paddle/phi/core/enforce.cc
@@ -14,13 +14,12 @@ limitations under the License. */
 
 #include "paddle/phi/core/enforce.h"
 
+#include <boost/variant.hpp>
 #include <map>
 #include <memory>
 #include <unordered_map>
 #include <vector>
 
-#include <boost/variant.hpp>
-
 // <boost/variant.hpp> is not suitable to be placed in the header file,
 // it will introduce a large number of unnecessary includes, and these type
 // declarations that depend on boost are also not suitable for the phi header
diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h
index 0869df143235f..decebbe66a538 100644
--- a/paddle/phi/core/hostdevice.h
+++ b/paddle/phi/core/hostdevice.h
@@ -20,6 +20,7 @@
 
 #if defined(__xpu__)
 #include <xpu/runtime.h>
+
 #include "xpu/kernel/cluster_header.h"
 #include "xpu/kernel/debug.h"
 #include "xpu/kernel/math.h"
diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc
index d479147f06ba1..d864544e10dd8 100644
--- a/paddle/phi/core/kernel_factory.cc
+++ b/paddle/phi/core/kernel_factory.cc
@@ -15,7 +15,6 @@
 #include "paddle/phi/core/kernel_factory.h"
 
 #include "glog/logging.h"
-
 #include "paddle/phi/core/enforce.h"
 
 namespace phi {
diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h
index 41e1e2b53a9e9..65f655d50375c 100644
--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -22,13 +22,12 @@
 #include <vector>
 
 #include "paddle/phi/core/custom_kernel.h"
+#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_factory.h"
 #include "paddle/phi/core/kernel_utils.h"
 #include "paddle/phi/core/macros.h"
 #include "paddle/phi/core/type_defs.h"
 
-#include "paddle/phi/core/enforce.h"
-
 namespace phi {
 
 #define BACKEND(arg__) phi::Backend::arg__
@@ -58,16 +57,13 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
     for (auto arg_type : args_type) {
       if (arg_type == std::type_index(typeid(const CPUContext&))
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-          ||
-          arg_type == std::type_index(typeid(const GPUContext&))) {
+          || arg_type == std::type_index(typeid(const GPUContext&))) {
 #elif defined(PADDLE_WITH_XPU)
-          ||
-          arg_type == std::type_index(typeid(const XPUContext&))) {
+          || arg_type == std::type_index(typeid(const XPUContext&))) {
 #elif defined(PADDLE_WITH_CUSTOM_DEVICE)
-          ||
-          arg_type == std::type_index(typeid(const CustomContext&))) {
+          || arg_type == std::type_index(typeid(const CustomContext&))) {
 #else
-              ) {
+      ) {
 #endif
         // do nothing, skip context arg now
       } else if (arg_type == std::type_index(typeid(const DenseTensor&))) {
@@ -420,93 +416,93 @@ struct KernelRegistrar {
   PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N)                             \
   (meta_kernel_fn, backend, context, __VA_ARGS__)
 
-#define _PD_KERNEL_INSTANTIATION_1(              \
-    meta_kernel_fn, backend, context, cpp_dtype) \
-  template decltype(                             \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>
-#define _PD_KERNEL_INSTANTIATION_2(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_1(                                       \
+#define _PD_KERNEL_INSTANTIATION_1(                     \
+    meta_kernel_fn, backend, context, cpp_dtype)        \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>
+#define _PD_KERNEL_INSTANTIATION_2(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_1(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_3(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_2(                                       \
+#define _PD_KERNEL_INSTANTIATION_3(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_2(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_4(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_3(                                       \
+#define _PD_KERNEL_INSTANTIATION_4(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_3(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_5(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_4(                                       \
+#define _PD_KERNEL_INSTANTIATION_5(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_4(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_6(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_5(                                       \
+#define _PD_KERNEL_INSTANTIATION_6(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_5(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_7(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_6(                                       \
+#define _PD_KERNEL_INSTANTIATION_7(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_6(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_8(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_7(                                       \
+#define _PD_KERNEL_INSTANTIATION_8(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_7(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_9(                                           \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_8(                                       \
+#define _PD_KERNEL_INSTANTIATION_9(                     \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_8(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_10(                                          \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_9(                                       \
+#define _PD_KERNEL_INSTANTIATION_10(                    \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_9(                 \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_11(                                          \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_10(                                      \
+#define _PD_KERNEL_INSTANTIATION_11(                    \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_10(                \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_12(                                          \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_11(                                      \
+#define _PD_KERNEL_INSTANTIATION_12(                    \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_11(                \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_13(                                          \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_12(                                      \
+#define _PD_KERNEL_INSTANTIATION_13(                    \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_12(                \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_14(                                          \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_13(                                      \
+#define _PD_KERNEL_INSTANTIATION_14(                    \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_13(                \
       meta_kernel_fn, backend, context, __VA_ARGS__))
-#define _PD_KERNEL_INSTANTIATION_15(                                          \
-    meta_kernel_fn, backend, context, cpp_dtype, ...)                         \
-  template decltype(                                                          \
-      meta_kernel_fn<cpp_dtype, context>) meta_kernel_fn<cpp_dtype, context>; \
-  PD_EXPAND(_PD_KERNEL_INSTANTIATION_14(                                      \
+#define _PD_KERNEL_INSTANTIATION_15(                    \
+    meta_kernel_fn, backend, context, cpp_dtype, ...)   \
+  template decltype(meta_kernel_fn<cpp_dtype, context>) \
+      meta_kernel_fn<cpp_dtype, context>;               \
+  PD_EXPAND(_PD_KERNEL_INSTANTIATION_14(                \
       meta_kernel_fn, backend, context, __VA_ARGS__))
 
 #define PD_KERNEL_REGISTRAR_INIT(reg_type,                   \
@@ -569,8 +565,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -592,8 +588,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -623,8 +619,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -654,8 +650,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -685,8 +681,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -716,8 +712,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -747,8 +743,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -778,8 +774,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -809,8 +805,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -840,8 +836,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -871,8 +867,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -902,8 +898,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -933,8 +929,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -964,8 +960,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
@@ -995,8 +991,8 @@ struct KernelRegistrar {
       #backend,                                                               \
       DATALAYOUT(layout),                                                     \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(),           \
-      ::phi::KernelArgsParseFunctor<decltype(                                 \
-          &meta_kernel_fn<cpp_dtype, context>)>::Parse,                       \
+      ::phi::KernelArgsParseFunctor<                                          \
+          decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,              \
       args_def_fn,                                                            \
       PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),                         \
       PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));               \
diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h
index d4765d1c4c3b4..3b5fd0247a484 100644
--- a/paddle/phi/core/kernel_utils.h
+++ b/paddle/phi/core/kernel_utils.h
@@ -233,9 +233,8 @@ template <typename Return,
           Return (*kernel_fn)(DevCtx, Args...)>
 struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
   static void Compute(KernelContext* ctx) {
-    KernelCallHelper<DevCtx,
-                     Args...,
-                     TypeTag<int>>::template Compute<0, 0, 0, 0>(ctx);
+    KernelCallHelper<DevCtx, Args..., TypeTag<int>>::
+        template Compute<0, 0, 0, 0>(ctx);
   }
 
   static void VariadicCompute(const DeviceContext& dev_ctx, Args... args) {
diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h
index d277f32d8ea9a..271759161868b 100644
--- a/paddle/phi/core/meta_tensor.h
+++ b/paddle/phi/core/meta_tensor.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "glog/logging.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
@@ -21,8 +22,6 @@ limitations under the License. */
 #include "paddle/phi/core/tensor_base.h"
 #include "paddle/phi/core/tensor_meta.h"
 
-#include "glog/logging.h"
-
 namespace phi {
 
 // TODO(chenweihang): add other flags if needed
diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc
index 0a4e0d6191510..20cbf3dffcb16 100644
--- a/paddle/phi/core/string_tensor.cc
+++ b/paddle/phi/core/string_tensor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/core/string_tensor.h"
+
 #include "paddle/fluid/memory/malloc.h"
 
 namespace phi {
diff --git a/paddle/phi/core/tensor_base.cc b/paddle/phi/core/tensor_base.cc
index 1b3628906af09..718bf09ff7eb9 100644
--- a/paddle/phi/core/tensor_base.cc
+++ b/paddle/phi/core/tensor_base.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/core/tensor_base.h"
+
 #include "paddle/phi/core/utils/type_registry.h"
 
 namespace phi {}
diff --git a/paddle/phi/core/utils/intrusive_ptr.h b/paddle/phi/core/utils/intrusive_ptr.h
index 2b7580192539f..e2e6cb7060d05 100644
--- a/paddle/phi/core/utils/intrusive_ptr.h
+++ b/paddle/phi/core/utils/intrusive_ptr.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <utility>
+
 #include "glog/logging.h"
 #include "paddle/phi/core/enforce.h"
 
diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt
index 1a19fd003222d..92b64ab4e666a 100644
--- a/paddle/phi/infermeta/CMakeLists.txt
+++ b/paddle/phi/infermeta/CMakeLists.txt
@@ -1,3 +1,9 @@
-cc_library(infermeta SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils)
-cc_library(backward_infermeta SRCS backward.cc DEPS meta_tensor convert_utils)
+cc_library(
+  infermeta
+  SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc
+  DEPS convert_utils meta_tensor infermeta_utils)
+cc_library(
+  backward_infermeta
+  SRCS backward.cc
+  DEPS meta_tensor convert_utils)
 add_subdirectory(strings)
diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index 521eb03fd770f..f59ea5549bd71 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -313,10 +313,10 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out,
 }
 
 void InstanceNormGradInferMeta(const MetaTensor& x,
-                               const MetaTensor& y_grad,
                                const MetaTensor& scale,
                                const MetaTensor& saved_mean,
                                const MetaTensor& saved_variance,
+                               const MetaTensor& y_grad,
                                float epsilon,
                                MetaTensor* x_grad,
                                MetaTensor* scale_grad,
diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h
index 93e2d4c43bc3f..0e7ed640d8ffb 100644
--- a/paddle/phi/infermeta/backward.h
+++ b/paddle/phi/infermeta/backward.h
@@ -145,10 +145,10 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out,
                                 MetaTensor* dx);
 
 void InstanceNormGradInferMeta(const MetaTensor& x,
-                               const MetaTensor& y_grad,
                                const MetaTensor& scale,
                                const MetaTensor& saved_mean,
                                const MetaTensor& saved_variance,
+                               const MetaTensor& y_grad,
                                float epsilon,
                                MetaTensor* x_grad,
                                MetaTensor* scale_grad,
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index a8d5ad564fe9b..f10fc54795ddb 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 63f0d0c1eeb28..61c57981f94b5 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/infermeta/multiary.h"
+
 #include <vector>
+
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/phi/infermeta/strings/CMakeLists.txt b/paddle/phi/infermeta/strings/CMakeLists.txt
index 3e1a947728f51..c2f891fe712eb 100644
--- a/paddle/phi/infermeta/strings/CMakeLists.txt
+++ b/paddle/phi/infermeta/strings/CMakeLists.txt
@@ -1 +1,4 @@
-cc_library(string_infermeta SRCS nullary.cc unary.cc DEPS convert_utils infermeta_utils)
+cc_library(
+  string_infermeta
+  SRCS nullary.cc unary.cc
+  DEPS convert_utils infermeta_utils)
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index 3c2888cee58c7..d84cc9e6d75af 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/infermeta/ternary.h"
+
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
 
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 437c55c840f1a..67795c2a8aa6e 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -1,7 +1,14 @@
-set(kernel_declare_file ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp CACHE INTERNAL "declarations.h file")
-set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h)
-file(WRITE ${kernel_declare_file} "// Generated by the paddle/phi/kernels/CMakeLists.txt.  DO NOT EDIT!\n\n#pragma once\n\n")
-file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h\"\n\n")
+set(kernel_declare_file
+    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp
+    CACHE INTERNAL "declarations.h file")
+set(kernel_declare_file_final
+    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h)
+file(
+  WRITE ${kernel_declare_file}
+  "// Generated by the paddle/phi/kernels/CMakeLists.txt.  DO NOT EDIT!\n\n#pragma once\n\n"
+)
+file(APPEND ${kernel_declare_file}
+     "#include \"paddle/phi/core/kernel_registry.h\"\n\n")
 
 # phi functors and functions called by kernels
 add_subdirectory(funcs)
@@ -13,8 +20,25 @@ add_subdirectory(autotune)
 set_property(GLOBAL PROPERTY PHI_KERNELS "")
 
 # [ 1. Common kernel compilation dependencies ]
-set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel)
-set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor)
+set(COMMON_KERNEL_DEPS
+    dense_tensor
+    sparse_coo_tensor
+    sparse_csr_tensor
+    kernel_context
+    kernel_factory
+    arg_map_context
+    convert_utils
+    lod_utils
+    custom_kernel)
+set(COMMON_KERNEL_DEPS
+    ${COMMON_KERNEL_DEPS}
+    eigen_function
+    blas
+    math_function
+    im2col
+    vol2col
+    concat_and_split_functor
+    selected_rows_functor)
 # remove this dep after removing fluid deps on tensor creation
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils)
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
@@ -30,50 +54,105 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
 # Some kernels depend on some targets that are not commonly used.
 # These targets are not suitable for common dependencies.
 # In this case, you need to manually generate them here.
-set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel conv_transpose_kernel conv_transpose_grad_kernel)
-set(MANUAL_BUILD_KERNELS ${AUTOTUNE_KERNELS} cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel
-    gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
-    matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
-    put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
-    softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
-    triangular_solve_grad_kernel determinant_grad_kernel reduce_sum_kernel reduce_mean_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel)
+set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel
+                     conv_transpose_kernel conv_transpose_grad_kernel)
+set(MANUAL_BUILD_KERNELS
+    ${AUTOTUNE_KERNELS}
+    cross_entropy_kernel
+    adam_kernel
+    adamw_kernel
+    deformable_conv_kernel
+    deformable_conv_grad_kernel
+    eigh_kernel
+    gumbel_softmax_kernel
+    gumbel_softmax_grad_kernel
+    hierarchical_sigmoid_kernel
+    hierarchical_sigmoid_grad_kernel
+    matrix_power_kernel
+    matrix_power_grad_kernel
+    maxout_kernel
+    maxout_grad_kernel
+    pool_kernel
+    put_along_axis_kernel
+    put_along_axis_grad_kernel
+    segment_pool_kernel
+    segment_pool_grad_kernel
+    softmax_kernel
+    softmax_grad_kernel
+    take_along_axis_kernel
+    take_along_axis_grad_kernel
+    triangular_solve_grad_kernel
+    determinant_grad_kernel
+    reduce_sum_kernel
+    reduce_mean_kernel
+    rnn_kernel
+    rnn_grad_kernel
+    warpctc_kernel
+    warpctc_grad_kernel)
 foreach(src ${AUTOTUNE_KERNELS})
   kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune)
 endforeach()
-kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper)
+kernel_library(
+  adam_kernel
+  DEPS
+  gflags
+  glog
+  flags
+  ${COMMON_KERNEL_DEPS}
+  selected_rows_functor
+  threadpool
+  jit_kernel_helper)
 kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel)
-kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy)
-kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
-kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
-kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
+kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax
+               cross_entropy)
+kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS}
+               deformable_conv_functor)
+kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               deformable_conv_functor)
+kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               matrix_inverse)
 kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
-kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
-kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
+kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS}
+               matrix_bit_code)
+kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               matrix_bit_code)
 kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(reduce_sum_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel)
 kernel_library(reduce_mean_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel)
 kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
-kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
+kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               matrix_inverse)
 kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting)
 kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting)
 kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling)
-kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
-kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
+kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS}
+               gather_scatter_kernel)
+kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               gather_scatter_kernel)
 kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling)
-kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling)
+kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               segment_pooling)
 kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
-kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
-kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
-kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce)
-kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute)
-kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute)
-kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale)
-kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale)
+kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS}
+               gather_scatter_kernel)
+kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               gather_scatter_kernel)
+kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               matrix_reduce)
+kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor
+               lstm_compute gru_compute)
+kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               concat_and_split_functor lstm_compute gru_compute)
+kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc
+               sequence_padding sequence_scale)
+kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS}
+               phi_dynload_warpctc sequence_padding sequence_scale)
 
 # 4. auto parse and build kernel targets by cmake
-register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} )
+register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS
+                 ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS})
 
 # phi sparse kernels
 add_subdirectory(sparse)
diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc
index 2349bf990acd3..3d8e4db08bba1 100644
--- a/paddle/phi/kernels/assign_kernel.cc
+++ b/paddle/phi/kernels/assign_kernel.cc
@@ -14,12 +14,11 @@
 
 #include "paddle/phi/kernels/assign_kernel.h"
 
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/utils/optional.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
-
 namespace phi {
 
 template <typename Context>
diff --git a/paddle/phi/kernels/auc_kernel.h b/paddle/phi/kernels/auc_kernel.h
index acbd17c7801e2..f58c3ce112bd7 100644
--- a/paddle/phi/kernels/auc_kernel.h
+++ b/paddle/phi/kernels/auc_kernel.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 
diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt
index 63dc22459446f..a7a6c2f8e4dc0 100644
--- a/paddle/phi/kernels/autotune/CMakeLists.txt
+++ b/paddle/phi/kernels/autotune/CMakeLists.txt
@@ -1,12 +1,33 @@
-if (WITH_GPU)
-  nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
-  nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest)
-elseif (WITH_ROCM)
-  hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
-  hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest)
+if(WITH_GPU)
+  nv_test(
+    gpu_timer_test
+    SRCS gpu_timer_test.cu
+    DEPS gtest)
+  nv_test(
+    auto_tune_test
+    SRCS auto_tune_test.cu
+    DEPS gtest)
+elseif(WITH_ROCM)
+  hip_test(
+    gpu_timer_test
+    SRCS gpu_timer_test.cu
+    DEPS gtest)
+  hip_test(
+    auto_tune_test
+    SRCS auto_tune_test.cu
+    DEPS gtest)
 endif()
 
-cc_library(cache SRCS cache.cc DEPS boost)
-cc_library(switch_autotune SRCS switch_autotune.cc DEPS cache flags)
+cc_library(
+  cache
+  SRCS cache.cc
+  DEPS boost)
+cc_library(
+  switch_autotune
+  SRCS switch_autotune.cc
+  DEPS cache flags)
 
-cc_test(cache_test SRCS cache_test.cc DEPS gtest cache)
+cc_test(
+  cache_test
+  SRCS cache_test.cc
+  DEPS gtest cache)
diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h
index eaf325dad7500..e18b854cf34b3 100644
--- a/paddle/phi/kernels/autotune/auto_tune_base.h
+++ b/paddle/phi/kernels/autotune/auto_tune_base.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <type_traits>
+
 #include "glog/logging.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/kernels/autotune/gpu_timer.h"
diff --git a/paddle/phi/kernels/autotune/auto_tune_test.cu b/paddle/phi/kernels/autotune/auto_tune_test.cu
index f477cd1219331..c3918b8ebe59d 100644
--- a/paddle/phi/kernels/autotune/auto_tune_test.cu
+++ b/paddle/phi/kernels/autotune/auto_tune_test.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include "glog/logging.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/all_context.h"
@@ -66,8 +67,8 @@ float Algo(const phi::GPUContext& ctx,
                      N);
 #else
   VLOG(3) << "Vecsize is " << Vecsize;
-  VecSumTest<float, Vecsize><<<blocks, threads, 0, ctx.stream()>>>(
-      d_in_data, d_out_data, N);
+  VecSumTest<float, Vecsize>
+      <<<blocks, threads, 0, ctx.stream()>>>(d_in_data, d_out_data, N);
 #endif
   return Vecsize;
 }
diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc
index ef2cbe633d496..5e2c9e1c742ff 100644
--- a/paddle/phi/kernels/autotune/cache.cc
+++ b/paddle/phi/kernels/autotune/cache.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/autotune/cache.h"
+
 #include <iomanip>
+
 #include "glog/logging.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h
index 37c5d134e8a61..9d7f57e96e373 100644
--- a/paddle/phi/kernels/autotune/cache.h
+++ b/paddle/phi/kernels/autotune/cache.h
@@ -19,6 +19,7 @@
 #include <numeric>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"
diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc
index f99f8bfc8b821..53574c3d0c9ac 100644
--- a/paddle/phi/kernels/autotune/cache_test.cc
+++ b/paddle/phi/kernels/autotune/cache_test.cc
@@ -13,9 +13,12 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/autotune/cache.h"
+
 #include <gtest/gtest.h>
+
 #include <cmath>
 #include <functional>
+
 #include "glog/logging.h"
 
 enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 };
diff --git a/paddle/phi/kernels/autotune/gpu_timer_test.cu b/paddle/phi/kernels/autotune/gpu_timer_test.cu
index b6eb345885f30..d24508dfa2064 100644
--- a/paddle/phi/kernels/autotune/gpu_timer_test.cu
+++ b/paddle/phi/kernels/autotune/gpu_timer_test.cu
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <functional>
+
 #include "glog/logging.h"
 #include "paddle/phi/kernels/autotune/gpu_timer.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
diff --git a/paddle/phi/kernels/autotune/switch_autotune.h b/paddle/phi/kernels/autotune/switch_autotune.h
index 1793940542d47..de638ac4eda75 100644
--- a/paddle/phi/kernels/autotune/switch_autotune.h
+++ b/paddle/phi/kernels/autotune/switch_autotune.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <cmath>
+
 #include "paddle/phi/kernels/autotune/cache.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h
index 3de2f69f452db..afbb0c78ca981 100644
--- a/paddle/phi/kernels/batch_norm_grad_kernel.h
+++ b/paddle/phi/kernels/batch_norm_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
index 5d24f6684a48f..79d5b8a445b48 100644
--- a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
+++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h
index 22b5201b6900d..dccaebcf41ffe 100644
--- a/paddle/phi/kernels/broadcast_tensors_kernel.h
+++ b/paddle/phi/kernels/broadcast_tensors_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/channel_shuffle_grad_kernel.h b/paddle/phi/kernels/channel_shuffle_grad_kernel.h
index ac89f3336bc76..d75d887d0fcd8 100644
--- a/paddle/phi/kernels/channel_shuffle_grad_kernel.h
+++ b/paddle/phi/kernels/channel_shuffle_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/channel_shuffle_kernel.h b/paddle/phi/kernels/channel_shuffle_kernel.h
index 12de25606dd96..c15e06fb552bf 100644
--- a/paddle/phi/kernels/channel_shuffle_kernel.h
+++ b/paddle/phi/kernels/channel_shuffle_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/conv_kernel.cc b/paddle/phi/kernels/conv_kernel.cc
index 7268384f401a1..542a4ec8a61c8 100644
--- a/paddle/phi/kernels/conv_kernel.cc
+++ b/paddle/phi/kernels/conv_kernel.cc
@@ -14,9 +14,8 @@
 
 #include "paddle/phi/kernels/conv_kernel.h"
 
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
 
@@ -41,8 +40,8 @@ void ConvInferKernel(const Context& dev_ctx,
                          dilations,
                          data_format,
                          /*use_addto=*/false,
-                         /*workspace_size_MB=*/paddle::platform::
-                             GetDefaultConvWorkspaceSizeLimitMB(),
+                         /*workspace_size_MB=*/
+                         paddle::platform::GetDefaultConvWorkspaceSizeLimitMB(),
                          /*exhaustive_search=*/false,
                          out);
 }
diff --git a/paddle/phi/kernels/conv_transpose_grad_kernel.h b/paddle/phi/kernels/conv_transpose_grad_kernel.h
index 2b1c0c1a934cf..00d5fb51f01ee 100644
--- a/paddle/phi/kernels/conv_transpose_grad_kernel.h
+++ b/paddle/phi/kernels/conv_transpose_grad_kernel.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/conv_transpose_kernel.h b/paddle/phi/kernels/conv_transpose_kernel.h
index de56f13ddf73e..e39617e0e7c0c 100644
--- a/paddle/phi/kernels/conv_transpose_kernel.h
+++ b/paddle/phi/kernels/conv_transpose_kernel.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc
index 9f89fc27a7167..a10e0eed64aec 100644
--- a/paddle/phi/kernels/cpu/abs_kernel.cc
+++ b/paddle/phi/kernels/cpu/abs_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/abs_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc
index 6ff8a1f755897..17246de35db22 100644
--- a/paddle/phi/kernels/cpu/accuracy_kernel.cc
+++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/accuracy_kernel.h"
 
 #include <algorithm>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index 165627839a308..bd3e16d54dcad 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/activation_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
diff --git a/paddle/phi/kernels/cpu/adagrad_kernel.cc b/paddle/phi/kernels/cpu/adagrad_kernel.cc
index fcd89caf7fa29..d6867deff4c15 100644
--- a/paddle/phi/kernels/cpu/adagrad_kernel.cc
+++ b/paddle/phi/kernels/cpu/adagrad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/adagrad_kernel.h"
+
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc
index f95ddc5621e9a..c6a512aa95cb1 100644
--- a/paddle/phi/kernels/cpu/allclose_kernel.cc
+++ b/paddle/phi/kernels/cpu/allclose_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/allclose_kernel.h"
 
 #include <cmath>
+
 #include "glog/logging.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/arange_kernel.cc b/paddle/phi/kernels/cpu/arange_kernel.cc
index 478251b0d3b6a..7f7e555423176 100644
--- a/paddle/phi/kernels/cpu/arange_kernel.cc
+++ b/paddle/phi/kernels/cpu/arange_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/arange_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/range_function.h"
diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc
index 7a519aab0ad71..3bc8c853a7b42 100644
--- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(atan2_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc
index df6f5f59ac005..4cb96ad8b6c6c 100644
--- a/paddle/phi/kernels/cpu/atan2_kernel.cc
+++ b/paddle/phi/kernels/cpu/atan2_kernel.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
 
 PD_REGISTER_KERNEL(atan2,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
index 366a08e59fee3..beda276c8ef3a 100644
--- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
-
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/gpu/batch_norm_utils.h"
 
diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc
index 743128e8dea99..cb8af06b540f8 100644
--- a/paddle/phi/kernels/cpu/batch_norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc
@@ -13,12 +13,12 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/batch_norm_kernel.h"
+
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 
-#include "paddle/fluid/framework/tensor_util.h"
-
 namespace phi {
 
 template <typename T>
diff --git a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc
index 6859451e8be32..fc91af3ff71bc 100644
--- a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/bce_loss_grad_kernel.h"
 
 #include <algorithm>  // for max
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc
index 76b9793651484..9d62fabcbe736 100644
--- a/paddle/phi/kernels/cpu/bce_loss_kernel.cc
+++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/bce_loss_kernel.h"
 
 #include <algorithm>  // for max
+
 #include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/bernoulli_kernel.cc b/paddle/phi/kernels/cpu/bernoulli_kernel.cc
index 09c07d9ec9dea..6bf548154a404 100644
--- a/paddle/phi/kernels/cpu/bernoulli_kernel.cc
+++ b/paddle/phi/kernels/cpu/bernoulli_kernel.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/bernoulli_kernel.h"
+
 #include <random>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc
index 2268212316af6..ef7e8a981c520 100644
--- a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h"
-#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(bilinear_tensor_product_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc
index 25bc5913865a0..d822656418261 100644
--- a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc
+++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/bilinear_tensor_product_kernel.h"
-#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h"
 
 PD_REGISTER_KERNEL(bilinear_tensor_product,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
index 0869cd62024dc..413638e177222 100644
--- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
 
 #include <vector>
+
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
index 4cb6db8769271..3ad26164d7d8d 100644
--- a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
+++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/broadcast_tensors_kernel.h"
-#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
 
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
 
 PD_REGISTER_KERNEL(broadcast_tensors,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc
index 2132f0d5ae86c..8abfa173fd06d 100644
--- a/paddle/phi/kernels/cpu/cast_kernel.cc
+++ b/paddle/phi/kernels/cpu/cast_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/cast_kernel.h"
-#include "paddle/phi/kernels/cpu/cast_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
+#include "paddle/phi/kernels/cpu/cast_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc
index fcc91b2191673..e95b454dbf900 100644
--- a/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/channel_shuffle_grad_kernel.h"
-#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(channel_shuffle_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc
index 95d19ec6a7746..0bac82e779c21 100644
--- a/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc
+++ b/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/channel_shuffle_kernel.h"
-#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h"
 
 PD_REGISTER_KERNEL(channel_shuffle,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc
index b6f5dd29ba2b7..612d10994cb17 100644
--- a/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(cholesky_solve_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc
index 02597560a7f51..11cb66f88c1f6 100644
--- a/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc
+++ b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
+#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/clip_grad_kernel.cc b/paddle/phi/kernels/cpu/clip_grad_kernel.cc
index bccdc0746d51c..89a14af10d16c 100644
--- a/paddle/phi/kernels/cpu/clip_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/clip_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/clip_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/clip_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/clip_kernel.cc b/paddle/phi/kernels/cpu/clip_kernel.cc
index 5fd9aea966f8d..bcbb85279277e 100644
--- a/paddle/phi/kernels/cpu/clip_kernel.cc
+++ b/paddle/phi/kernels/cpu/clip_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/clip_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/clip_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc
index 9006325a521ec..694b44c16d80e 100644
--- a/paddle/phi/kernels/cpu/compare_kernel.cc
+++ b/paddle/phi/kernels/cpu/compare_kernel.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/compare_kernel.h"
-#include "paddle/phi/kernels/impl/compare_kernel_impl.h"
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/impl/compare_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc
index 5c1d50f5bf27d..11b7a05834607 100644
--- a/paddle/phi/kernels/cpu/complex_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_grad_kernel.h"
-#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h"
 
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(real_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc
index 859d5a84527a2..bef0b7b747a42 100644
--- a/paddle/phi/kernels/cpu/complex_kernel.cc
+++ b/paddle/phi/kernels/cpu/complex_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/impl/complex_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/complex_kernel_impl.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/phi/common/complex.h"
diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc
index 4538ccf9433f9..3289c8f5c84d6 100644
--- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/conv_grad_grad_kernel.h"
-#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
 
 namespace phi {
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc
index 2d8a9bf1de733..880837dd7cd61 100644
--- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/conv_grad_kernel.h"
-#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc
index e0b4ee7d5776f..ec3253194930b 100644
--- a/paddle/phi/kernels/cpu/conv_kernel.cc
+++ b/paddle/phi/kernels/cpu/conv_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/conv_kernel.h"
-#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
 
 namespace phi {
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc
index 8d0749500695c..17fe44dea3f65 100644
--- a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
-#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc
index b4cacc850938e..ad9a5933f2809 100644
--- a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc
+++ b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/conv_transpose_kernel.h"
-#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc
index c684fb416eaab..bd3eb3eb754c3 100644
--- a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc
+++ b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/cross_entropy_kernel.h"
 
+#include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
@@ -21,8 +22,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/softmax_kernel.h"
 
-#include "paddle/fluid/operators/math/cross_entropy.h"
-
 namespace phi {
 
 template <typename T>
diff --git a/paddle/phi/kernels/cpu/cross_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_grad_kernel.cc
index 390420008e6ea..8dddc6f6e4e95 100644
--- a/paddle/phi/kernels/cpu/cross_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/cross_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/cross_grad_kernel.h"
-#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(cross_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc
index a63f33174eacd..1f3a8fe5a3879 100644
--- a/paddle/phi/kernels/cpu/cross_kernel.cc
+++ b/paddle/phi/kernels/cpu/cross_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/cross_kernel.h"
-#include "paddle/phi/kernels/impl/cross_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/cross_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     cross, CPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc
index aea338027f5bb..4ecf092918418 100644
--- a/paddle/phi/kernels/cpu/cumprod_kernel.cc
+++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc
@@ -16,6 +16,7 @@
 
 #include <cstdint>
 #include <type_traits>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index f64b1d3291f5e..a4d43ef8fbe89 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -58,10 +58,9 @@ inline void ModulatedDeformableCol2imCPUKernel(
     int w_in = w_out * stride_w - pad_w;
     int h_in = h_out * stride_h - pad_h;
 
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
+    const T* data_offset_ptr =
+        data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const int data_offset_h_ptr =
         ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
     const int data_offset_w_ptr =
@@ -75,9 +74,9 @@ inline void ModulatedDeformableCol2imCPUKernel(
 
     T cur_top_grad = data_col[thread];
     if (data_mask) {
-      const T* data_mask_ptr = data_mask +
-                               (b * deformable_group + deformable_group_index) *
-                                   kernel_h * kernel_w * height_col * width_col;
+      const T* data_mask_ptr =
+          data_mask + (b * deformable_group + deformable_group_index) *
+                          kernel_h * kernel_w * height_col * width_col;
       const T mask = data_mask_ptr[data_mask_hw_ptr];
       cur_top_grad *= mask;
     }
@@ -180,23 +179,20 @@ void ModulatedDeformableCol2imCoordCPUKernel(
     const int deformable_group_index = c / (2 * kernel_h * kernel_w);
     const int col_step = kernel_h * kernel_w;
     int cnt = 0;
-    const T* data_col_ptr = data_col +
-                            deformable_group_index *
-                                channel_per_deformable_group * batch_size *
-                                width_col * height_col;
-    const T* data_im_ptr = data_im +
-                           (b * deformable_group + deformable_group_index) *
-                               channel_per_deformable_group / kernel_h /
-                               kernel_w * height * width;
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
+    const T* data_col_ptr = data_col + deformable_group_index *
+                                           channel_per_deformable_group *
+                                           batch_size * width_col * height_col;
+    const T* data_im_ptr =
+        data_im + (b * deformable_group + deformable_group_index) *
+                      channel_per_deformable_group / kernel_h / kernel_w *
+                      height * width;
+    const T* data_offset_ptr =
+        data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const T* data_mask_ptr =
         data_mask
-            ? data_mask +
-                  (b * deformable_group + deformable_group_index) * kernel_h *
-                      kernel_w * height_col * width_col
+            ? data_mask + (b * deformable_group + deformable_group_index) *
+                              kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
     const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc
index c56b225e2a753..616ea753ef1ba 100644
--- a/paddle/phi/kernels/cpu/diag_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/diag_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc
index c3c290b4fe91e..5671e70c96e0a 100644
--- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/diagonal_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/diagonal.h"
diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc
index df17b458e1166..8ea5826ba25f7 100644
--- a/paddle/phi/kernels/cpu/diagonal_kernel.cc
+++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/diagonal_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/diagonal.h"
diff --git a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc
index da1b5ae556609..dc7fcaf6f92be 100644
--- a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/digamma_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/cpu/digamma_kernel.cc b/paddle/phi/kernels/cpu/digamma_kernel.cc
index ee120a29b6061..80cbda4b7a9fc 100644
--- a/paddle/phi/kernels/cpu/digamma_kernel.cc
+++ b/paddle/phi/kernels/cpu/digamma_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/digamma_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/cpu/dist_grad_kernel.cc b/paddle/phi/kernels/cpu/dist_grad_kernel.cc
index 2b7f8f98f9473..c1aaa2adf7563 100644
--- a/paddle/phi/kernels/cpu/dist_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/dist_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/dist_grad_kernel.h"
-#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/dist_kernel.cc b/paddle/phi/kernels/cpu/dist_kernel.cc
index ccf3d4be83230..0c7b5db64b38f 100644
--- a/paddle/phi/kernels/cpu/dist_kernel.cc
+++ b/paddle/phi/kernels/cpu/dist_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/dist_kernel.h"
-#include "paddle/phi/kernels/impl/dist_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/dist_kernel_impl.h"
 
 PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/dot_grad_kernel.cc b/paddle/phi/kernels/cpu/dot_grad_kernel.cc
index a2abdb7c00900..883b77802217b 100644
--- a/paddle/phi/kernels/cpu/dot_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/dot_grad_kernel.cc
@@ -13,12 +13,11 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/dot_grad_kernel.h"
-#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(dot_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc
index b77a6c55b1471..db95656421884 100644
--- a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/dropout_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc
index fa12e505e4209..d9c02eff0106f 100644
--- a/paddle/phi/kernels/cpu/dropout_kernel.cc
+++ b/paddle/phi/kernels/cpu/dropout_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/dropout_kernel.h"
+
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc
index 5135778db56c5..db533416d2748 100644
--- a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/eigh_grad_kernel.h"
-#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h"
 
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(eigh_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/eigh_kernel.cc b/paddle/phi/kernels/cpu/eigh_kernel.cc
index 92fd20ca9b825..0f0a10c837792 100644
--- a/paddle/phi/kernels/cpu/eigh_kernel.cc
+++ b/paddle/phi/kernels/cpu/eigh_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/eigh_kernel.h"
-#include "paddle/phi/kernels/funcs/values_vectors_functor.h"
 
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/values_vectors_functor.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc
index 8968542b3e0b8..401d2fd158a5d 100644
--- a/paddle/phi/kernels/cpu/einsum_kernel.cc
+++ b/paddle/phi/kernels/cpu/einsum_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/einsum_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/einsum_impl.h"
diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h
index 0f67df661136d..255dae7da014d 100644
--- a/paddle/phi/kernels/cpu/elementwise.h
+++ b/paddle/phi/kernels/cpu/elementwise.h
@@ -16,10 +16,9 @@ limitations under the License. */
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
-
-#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc
index 5019b9f570628..b5e28ab39e5a6 100644
--- a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc
index d380621818b35..15fe92c929194 100644
--- a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc
index 286b0d0ffaad9..f090ddd5bbe9a 100644
--- a/paddle/phi/kernels/cpu/elementwise_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc
index 2424a5330109c..349150373844b 100644
--- a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc
index 0e97852ac33e1..a013309233d47 100644
--- a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
index 21b3e6da8d9ef..fabb4e83d52f7 100644
--- a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/embedding_grad_kernel.h"
-#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/embedding_kernel.cc b/paddle/phi/kernels/cpu/embedding_kernel.cc
index 76cc3814b0567..0430f7a005221 100644
--- a/paddle/phi/kernels/cpu/embedding_kernel.cc
+++ b/paddle/phi/kernels/cpu/embedding_kernel.cc
@@ -13,12 +13,12 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/embedding_kernel.h"
-#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/erf_grad_kernel.cc b/paddle/phi/kernels/cpu/erf_grad_kernel.cc
index 3c1cd0df1531a..ae0b218bc0be3 100644
--- a/paddle/phi/kernels/cpu/erf_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/erf_grad_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/erf_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/erf_kernel.cc b/paddle/phi/kernels/cpu/erf_kernel.cc
index 05ce4cab7fcef..ace9775c0b869 100644
--- a/paddle/phi/kernels/cpu/erf_kernel.cc
+++ b/paddle/phi/kernels/cpu/erf_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/erf_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc
index b1fe4f026ab07..2d363189936b0 100644
--- a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/erfinv_grad_kernel.h"
-#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     erfinv_grad, CPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc
index 4f3a740f9d9be..f298cc358d662 100644
--- a/paddle/phi/kernels/cpu/erfinv_kernel.cc
+++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/erfinv_kernel.h"
-#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h"
 
 PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc
index 6eafe9aa49dfe..c57e3a87281e0 100644
--- a/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/expand_as_grad_kernel.h"
-#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(expand_as_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/expand_as_kernel.cc b/paddle/phi/kernels/cpu/expand_as_kernel.cc
index 697ea138097ee..4ec28ef8413cc 100644
--- a/paddle/phi/kernels/cpu/expand_as_kernel.cc
+++ b/paddle/phi/kernels/cpu/expand_as_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/expand_as_kernel.h"
-#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
 
 PD_REGISTER_KERNEL(expand_as,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/expand_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_grad_kernel.cc
index 4799a6aa7afdf..5cbbf253b747d 100644
--- a/paddle/phi/kernels/cpu/expand_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/expand_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/expand_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/cpu/expand_kernel.cc b/paddle/phi/kernels/cpu/expand_kernel.cc
index 077048976729f..2df833d0f9c30 100644
--- a/paddle/phi/kernels/cpu/expand_kernel.cc
+++ b/paddle/phi/kernels/cpu/expand_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/expand_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/cpu/eye_kernel.cc b/paddle/phi/kernels/cpu/eye_kernel.cc
index a0d0f2c439096..ef3489d3fae0d 100644
--- a/paddle/phi/kernels/cpu/eye_kernel.cc
+++ b/paddle/phi/kernels/cpu/eye_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/eye_kernel.h"
-#include "paddle/phi/kernels/impl/eye_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/eye_kernel_impl.h"
 
 PD_REGISTER_KERNEL(eye,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc
index 338be9e252da3..5434296be4dbe 100644
--- a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
-#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(frobenius_norm_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc
index 77509b953bf39..56444ddad8d8b 100644
--- a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/frobenius_norm_kernel.h"
-#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     frobenius_norm, CPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc
index 0b76425a659a0..ceb2312b53a0b 100644
--- a/paddle/phi/kernels/cpu/full_kernel.cc
+++ b/paddle/phi/kernels/cpu/full_kernel.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 
diff --git a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc
index b375a7ec4691c..88a288afd318e 100644
--- a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/gather_nd_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
diff --git a/paddle/phi/kernels/cpu/gather_nd_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_kernel.cc
index aa32d036934e8..8ae866a1c8add 100644
--- a/paddle/phi/kernels/cpu/gather_nd_kernel.cc
+++ b/paddle/phi/kernels/cpu/gather_nd_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/gather_nd_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/gather.h"
diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc
index 25fb870d851f6..6f3cac6c4aa10 100644
--- a/paddle/phi/kernels/cpu/gather_tree_kernel.cc
+++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/gather_tree_kernel.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc
index 348d24b534e3e..c600149cbbacc 100644
--- a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc
+++ b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc
@@ -14,11 +14,10 @@
 
 #include "paddle/phi/kernels/gaussian_random_kernel.h"
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-#include "paddle/fluid/framework/generator.h"
-
 namespace phi {
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc
index d7af220574565..4d23470aa4e9e 100644
--- a/paddle/phi/kernels/cpu/gelu_kernel.cc
+++ b/paddle/phi/kernels/cpu/gelu_kernel.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/gelu_kernel.h"
+
 #include <algorithm>
 #include <cmath>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc
index c0a88f3222717..428bcb031704c 100644
--- a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/graph_reindex_kernel.h"
+
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/phi/kernels/graph_reindex_kernel.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
@@ -59,11 +59,15 @@ void GraphReindexKernel(const Context& dev_ctx,
     src[i] = node_map[node];
   }
   // Reindex Dst
+  // Support multi-type edge reindexing: count holds bs entries per edge type.
+  int num_edge_types = count.dims()[0] / bs;
   int cnt = 0;
-  for (int i = 0; i < bs; i++) {
-    for (int j = 0; j < count_data[i]; j++) {
-      T node = x_data[i];
-      dst[cnt++] = node_map[node];
+  for (int i = 0; i < num_edge_types; i++) {
+    for (int j = 0; j < bs; j++) {
+      for (int k = 0; k < count_data[i * bs + j]; k++) {
+        T node = x_data[j];
+        dst[cnt++] = node_map[node];
+      }
     }
   }
 
diff --git a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc
index 70aac053417b8..1ef5373d6310b 100644
--- a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <vector>
-
 #include "paddle/phi/kernels/graph_sample_neighbors_kernel.h"
 
+#include <vector>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
index 6ea65d005c1ad..ad04bd258e141 100644
--- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
@@ -13,12 +13,12 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/graph_send_recv_grad_kernel.h"
-#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h"
 
 #include <algorithm>
 #include <vector>
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
index 8f71ba12cc4fa..e4034230c7866 100644
--- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/graph_send_recv_kernel.h"
-#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h"
 
 #include <algorithm>
 #include <set>
@@ -22,6 +21,7 @@
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc
index 923cb8424115e..32fa0d5aafefe 100644
--- a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc
@@ -73,8 +73,9 @@ static inline void ClipWithMask(const CPUContext& ctx,
                          .cwiseMin(static_cast<T>(max_val));
       auto in_bound = (clipped == reflected).template cast<T>();
       grid_scale_t.device(place) =
-          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
-                          (is_neg != one_more_flip).template cast<T>()) *
+          grid_scale_t *
+          ((is_neg == one_more_flip).template cast<T>() -
+           (is_neg != one_more_flip).template cast<T>()) *
           in_bound;
       grid_slice_t.device(place) = clipped;
     }
diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc
index a4c131e72b59a..832df98e0f3f6 100644
--- a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h"
-#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(gumbel_softmax_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc
index eb406665c5f4f..7638ca3aa7ee6 100644
--- a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc
+++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/gumbel_softmax_kernel.h"
-#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h"
 
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc
index 82b88f868d8a7..d9c41508efde0 100644
--- a/paddle/phi/kernels/cpu/histogram_kernel.cc
+++ b/paddle/phi/kernels/cpu/histogram_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/histogram_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc
index 654f2c9400af0..b52a587070af6 100644
--- a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/huber_loss_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc
index 702c0589057af..2c4d8941ab87b 100644
--- a/paddle/phi/kernels/cpu/huber_loss_kernel.cc
+++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/huber_loss_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
index d060e8c9b2837..fe8ca4e432e21 100644
--- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/index_sample_grad_kernel.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc
index b895e4aa7c0e7..faa6953704e80 100644
--- a/paddle/phi/kernels/cpu/index_sample_kernel.cc
+++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/index_sample_kernel.h"
+
 #include <cmath>
 #include <fstream>
 #include <set>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc
index 340d2907a7909..45ef003410926 100644
--- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -42,10 +43,10 @@ using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
 template <typename T, typename Context>
 void InstanceNormGradKernel(const Context& dev_ctx,
                             const DenseTensor& x,
-                            const DenseTensor& d_y,
                             const paddle::optional<DenseTensor>& scale,
                             const DenseTensor& saved_mean,
                             const DenseTensor& saved_variance,
+                            const DenseTensor& d_y,
                             float epsilon,
                             DenseTensor* d_x,
                             DenseTensor* d_scale,
@@ -142,12 +143,11 @@ void InstanceNormGradKernel(const Context& dev_ctx,
   dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
                           inv_var_arr.broadcast(bcast) *
                           (dy_arr - dy_mean -
-                           tmp *
-                               (dy_arr * tmp)
-                                   .mean(mean_rdims)
-                                   .reshape(NxC_shape)
-                                   .eval()
-                                   .broadcast(bcast));
+                           tmp * (dy_arr * tmp)
+                                     .mean(mean_rdims)
+                                     .reshape(NxC_shape)
+                                     .eval()
+                                     .broadcast(bcast));
 }
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc
index 5eac473effa0e..4deced5499ecb 100644
--- a/paddle/phi/kernels/cpu/instance_norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc
index d4e13aa3b24fe..edd41b2c7a31d 100644
--- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/interpolate_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc
index 633c6ba093e42..dca21494b3ee9 100644
--- a/paddle/phi/kernels/cpu/isclose_kernel.cc
+++ b/paddle/phi/kernels/cpu/isclose_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/isclose_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/isclose_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc
index f9399d38d711f..9f6e2573e33e5 100644
--- a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/kldiv_loss_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc
index c462b8ec32c89..ecb1915cf420e 100644
--- a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc
+++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/kldiv_loss_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc
index 74664fb270b2d..1a900b4bc2aff 100644
--- a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/label_smooth_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc
index af9548e8186bc..cdeed73310d24 100644
--- a/paddle/phi/kernels/cpu/label_smooth_kernel.cc
+++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/label_smooth_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc
index a30f54fd4b60e..081a32b4f245b 100644
--- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/layer_norm_grad_kernel.h"
+
 #include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/funcs/layer_norm_util.h"
 #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc
index 52722468e16bd..dbc3da0ca15ac 100644
--- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/layer_norm_kernel.h"
+
 #include "paddle/phi/kernels/cpu/elementwise.h"
 #include "paddle/phi/kernels/funcs/layer_norm_util.h"
 #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
diff --git a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc
index d74919011ec5d..ae98cb9d03aee 100644
--- a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/lerp_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/lerp_kernel.cc b/paddle/phi/kernels/cpu/lerp_kernel.cc
index 7adfc35bfa321..d02e706d8d600 100644
--- a/paddle/phi/kernels/cpu/lerp_kernel.cc
+++ b/paddle/phi/kernels/cpu/lerp_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/lerp_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/lerp_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc
index 116fa3f8d3f6a..a87c01214a93d 100644
--- a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/lgamma_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc
index f849322174d29..4979ad0b30bcd 100644
--- a/paddle/phi/kernels/cpu/lgamma_kernel.cc
+++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/lgamma_kernel.h"
 
 #include <unsupported/Eigen/SpecialFunctions>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc
index 5f344b9cc3fe0..d3e5e90fd17a3 100644
--- a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc
@@ -55,10 +55,9 @@ struct LogSoftmaxGradFunctor {
     Eigen::DSizes<int, 2> one_axis(1, axis_dim);
 
     dx.device(*context.eigen_device()) =
-        dy -
-        (y.exp()) * (dy.reshape(batch_axis_remain)
-                         .sum(along_class)
-                         .broadcast(one_axis));
+        dy - (y.exp()) * (dy.reshape(batch_axis_remain)
+                              .sum(along_class)
+                              .broadcast(one_axis));
   }
 };
 
diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc
index 241742378cc5d..510eb7a6ca97a 100644
--- a/paddle/phi/kernels/cpu/log_softmax_kernel.cc
+++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc
@@ -72,34 +72,31 @@ struct LogSoftmaxFunctor {
       // axis == -1, axis and class in same dimension, calculate along
       // class dimension directly for higher performance
       log_softmax.device(*context.eigen_device()) =
-          (logits -
-           logits.maximum(along_axis)
-               .eval()
-               .reshape(batch_by_one)
-               .broadcast(one_by_class))
+          (logits - logits.maximum(along_axis)
+                        .eval()
+                        .reshape(batch_by_one)
+                        .broadcast(one_by_class))
               .unaryExpr(ValueClip<T>());
     } else {
       // axis != -1, class dimension split into (axis, remain), max and sum
       // should be calculated along axis dimension
       log_softmax.device(*context.eigen_device()) =
-          (logits.reshape(batch_axis_remain) -
-           logits.reshape(batch_axis_remain)
-               .maximum(along_axis)
-               .eval()
-               .reshape(batch_one_remain)
-               .broadcast(one_axis_one)
-               .reshape(batch_classes))
+          (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain)
+                                                   .maximum(along_axis)
+                                                   .eval()
+                                                   .reshape(batch_one_remain)
+                                                   .broadcast(one_axis_one)
+                                                   .reshape(batch_classes))
               .unaryExpr(ValueClip<T>());
     }
 
     log_softmax.device(*context.eigen_device()) =
-        log_softmax -
-        log_softmax.exp()
-            .eval()
-            .reshape(batch_axis_remain)
-            .sum(along_axis)
-            .log()
-            .broadcast(one_axis);
+        log_softmax - log_softmax.exp()
+                          .eval()
+                          .reshape(batch_axis_remain)
+                          .sum(along_axis)
+                          .log()
+                          .broadcast(one_axis);
   }
 };
 
diff --git a/paddle/phi/kernels/cpu/logsumexp_kernel.cc b/paddle/phi/kernels/cpu/logsumexp_kernel.cc
index 06e0b30a9ca65..f1fecdfbe9e66 100644
--- a/paddle/phi/kernels/cpu/logsumexp_kernel.cc
+++ b/paddle/phi/kernels/cpu/logsumexp_kernel.cc
@@ -16,7 +16,6 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/impl/logsumexp_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc
index aba519ff04849..e3cd8fff8a50e 100644
--- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(matmul_grad,
diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc
index 8aa25c0da07d9..c75a50130db76 100644
--- a/paddle/phi/kernels/cpu/matmul_kernel.cc
+++ b/paddle/phi/kernels/cpu/matmul_kernel.cc
@@ -15,9 +15,8 @@ limitations under the License. */
 #include "paddle/phi/kernels/matmul_kernel.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/matmul_kernel_impl.h"
 
 PD_REGISTER_KERNEL(matmul,
diff --git a/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc
index ae3b4d2b45582..0f60f8da71a8b 100644
--- a/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/matrix_power_grad_kernel.h"
-#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(matrix_power_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/matrix_power_kernel.cc b/paddle/phi/kernels/cpu/matrix_power_kernel.cc
index f40e1e616f526..08ee7cbc865df 100644
--- a/paddle/phi/kernels/cpu/matrix_power_kernel.cc
+++ b/paddle/phi/kernels/cpu/matrix_power_kernel.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/matrix_power_kernel.h"
-#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     matrix_power, CPU, ALL_LAYOUT, phi::MatrixPowerKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc
index 5e13abe8aed2c..f56bd3d6dbe8a 100644
--- a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc
+++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/matrix_rank_kernel.h"
-#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
 
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
index 3bfc07319e98d..af9b7728389ba 100644
--- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
+++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
@@ -16,6 +16,7 @@
 
 #include <Eigen/Dense>
 #include <Eigen/SVD>
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/elementwise_multiply_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
diff --git a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc
index 429344a362b1c..dad4e96b5a8b1 100644
--- a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     maxout_grad, CPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/maxout_kernel.cc b/paddle/phi/kernels/cpu/maxout_kernel.cc
index e7cd3ab07ff59..cc1d21d310b1f 100644
--- a/paddle/phi/kernels/cpu/maxout_kernel.cc
+++ b/paddle/phi/kernels/cpu/maxout_kernel.cc
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/maxout_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/maxout_kernel_impl.h"
 
 PD_REGISTER_KERNEL(maxout, CPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc
index 159d109255381..5b43fb02b5117 100644
--- a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/meshgrid_grad_kernel.h"
-#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(meshgrid_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/meshgrid_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_kernel.cc
index c201103b3dac4..35e43f7bbc85e 100644
--- a/paddle/phi/kernels/cpu/meshgrid_kernel.cc
+++ b/paddle/phi/kernels/cpu/meshgrid_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/meshgrid_kernel.h"
-#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h"
 
 PD_REGISTER_KERNEL(meshgrid,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/momentum_kernel.cc b/paddle/phi/kernels/cpu/momentum_kernel.cc
index 63cc5592ef422..7a4ea9f19e5c2 100644
--- a/paddle/phi/kernels/cpu/momentum_kernel.cc
+++ b/paddle/phi/kernels/cpu/momentum_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/momentum_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc
index 2cd75404be821..f6b07584ce44e 100644
--- a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/multi_dot_grad_kernel.h"
-#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     multi_dot_grad, CPU, ALL_LAYOUT, phi::MultiDotGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/multi_dot_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_kernel.cc
index a4249a98e46dd..00cf425a038a1 100644
--- a/paddle/phi/kernels/cpu/multi_dot_kernel.cc
+++ b/paddle/phi/kernels/cpu/multi_dot_kernel.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/multi_dot_kernel.h"
-#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     multi_dot, CPU, ALL_LAYOUT, phi::MultiDotKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc
index f5a426e93db2c..12ba6dadde304 100644
--- a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc
@@ -15,7 +15,6 @@
 #include "paddle/phi/kernels/multiplex_grad_kernel.h"
 
 #include "paddle/fluid/memory/memcpy.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/cpu/mv_kernel.cc b/paddle/phi/kernels/cpu/mv_kernel.cc
index 7f76ddda6dde5..408eda34e1c00 100644
--- a/paddle/phi/kernels/cpu/mv_kernel.cc
+++ b/paddle/phi/kernels/cpu/mv_kernel.cc
@@ -16,7 +16,6 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/impl/mv_kernel_impl.h"
 
 PD_REGISTER_KERNEL(mv, CPU, ALL_LAYOUT, phi::MvKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc
index 156124c214895..f8639a0d10fee 100644
--- a/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/nanmedian_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc
index ed38405c9179f..03d7fe304be3e 100644
--- a/paddle/phi/kernels/cpu/nanmedian_kernel.cc
+++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/nanmedian_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"
 #include "paddle/phi/kernels/top_k_kernel.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
index dd2b09ee39acb..9048e87d04989 100644
--- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc
index 92cb6a1ad17de..c966e91a9a6e9 100644
--- a/paddle/phi/kernels/cpu/nll_loss_kernel.cc
+++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/nll_loss_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc
index bd05e2c4c6ec1..92ca51b499c7a 100644
--- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc
@@ -13,15 +13,13 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/norm_grad_kernel.h"
-#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#include "paddle/phi/kernels/funcs/eigen/common.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/funcs/common_shape.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 namespace phi {
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc
index 50906d9c3bb94..f69d03b66b1b5 100644
--- a/paddle/phi/kernels/cpu/norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/norm_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/norm_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
diff --git a/paddle/phi/kernels/cpu/one_hot_kernel.cc b/paddle/phi/kernels/cpu/one_hot_kernel.cc
index fc7979e41d938..f408c9f036152 100644
--- a/paddle/phi/kernels/cpu/one_hot_kernel.cc
+++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/one_hot_kernel.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc
index 44ab050408653..32905ab087883 100644
--- a/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/p_norm_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/cpu/p_norm_kernel.cc b/paddle/phi/kernels/cpu/p_norm_kernel.cc
index 9da7fdbb297c2..597939953b277 100644
--- a/paddle/phi/kernels/cpu/p_norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/p_norm_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/p_norm_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc
index b32065d4f0a14..0e2bfd04b620e 100644
--- a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h"
-#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(pixel_shuffle_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc
index 80f8fa7b50efb..44dcb8b59f77c 100644
--- a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc
+++ b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/pixel_shuffle_kernel.h"
-#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     pixel_shuffle, CPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc
index ef61fca35957e..cbcbf1e129d20 100644
--- a/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/pixel_unshuffle_grad_kernel.h"
-#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(pixel_unshuffle_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc
index 9f4bc747f3209..837378972c69a 100644
--- a/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc
+++ b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/pixel_unshuffle_kernel.h"
-#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h"
 
 PD_REGISTER_KERNEL(pixel_unshuffle,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/poisson_kernel.cc b/paddle/phi/kernels/cpu/poisson_kernel.cc
index 6a3e32c2f0785..8ba1afe229eee 100644
--- a/paddle/phi/kernels/cpu/poisson_kernel.cc
+++ b/paddle/phi/kernels/cpu/poisson_kernel.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/poisson_kernel.h"
+
 #include <random>
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/poisson_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc
index bb97694d8fc38..68cd57c52277b 100644
--- a/paddle/phi/kernels/cpu/pool_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc
@@ -14,9 +14,8 @@
 
 #include "paddle/phi/kernels/pool_grad_kernel.h"
 
-#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc
index 1d57e282c3c8a..3d3880692c0c8 100644
--- a/paddle/phi/kernels/cpu/pool_kernel.cc
+++ b/paddle/phi/kernels/cpu/pool_kernel.cc
@@ -14,9 +14,8 @@
 
 #include "paddle/phi/kernels/pool_kernel.h"
 
-#include "paddle/phi/kernels/impl/pool_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pool_kernel_impl.h"
 
 PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {}
 PD_REGISTER_KERNEL(max_pool2d_with_index,
diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc
index b68c3ad545d33..202baddd713a4 100644
--- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/psroi_pool_grad_kernel.h"
 
 #include <algorithm>
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc
index 4f7925ad00f5a..82eff70b75643 100644
--- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc
+++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/psroi_pool_kernel.h"
 
 #include <algorithm>
+
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc
index b0e82cedb6b8b..6a5551d95571b 100644
--- a/paddle/phi/kernels/cpu/qr_kernel.cc
+++ b/paddle/phi/kernels/cpu/qr_kernel.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <Eigen/Dense>
-
 #include "paddle/phi/kernels/qr_kernel.h"
 
+#include <Eigen/Dense>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h
index 35395dccca1af..dad288cff2c1a 100644
--- a/paddle/phi/kernels/cpu/reduce.h
+++ b/paddle/phi/kernels/cpu/reduce.h
@@ -17,10 +17,9 @@
 #include <set>
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/cast_kernel.h"
-
-#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 // See Note [ Why still include the fluid headers? ]
diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
index 66ae5e02ffc75..abc18b1c578a8 100644
--- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
@@ -111,4 +111,3 @@ PD_REGISTER_KERNEL(sum_grad,
                    int64_t,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
-
diff --git a/paddle/phi/kernels/cpu/rmsprop_kernel.cc b/paddle/phi/kernels/cpu/rmsprop_kernel.cc
index fa1e1a2eed345..1d60823d75949 100644
--- a/paddle/phi/kernels/cpu/rmsprop_kernel.cc
+++ b/paddle/phi/kernels/cpu/rmsprop_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/rmsprop_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h
index ab6f98ffcd5d6..911814647d6c0 100644
--- a/paddle/phi/kernels/cpu/rnn_functor.h
+++ b/paddle/phi/kernels/cpu/rnn_functor.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/operators/utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
@@ -21,9 +23,6 @@
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/utils.h"
-
 namespace phi {
 
 #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR)       \
@@ -252,9 +251,12 @@ inline std::vector<DenseTensor> Unbind(const DenseTensor& in) {
 }
 
 template <typename CellType,
-          template <typename, typename> class LayerT,
-          template <typename, typename> class SingleLayerT,
-          template <typename, typename> class BidirLayerT,
+          template <typename, typename>
+          class LayerT,
+          template <typename, typename>
+          class SingleLayerT,
+          template <typename, typename>
+          class BidirLayerT,
           typename T,
           typename Context>
 void RnnFunc(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc
index 4dd1894320af7..1cd4add7d50e6 100644
--- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc
@@ -16,7 +16,6 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/cpu/rnn_functor.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
@@ -962,8 +961,10 @@ void dropout_cpu_grad_function_inplace(const CPUContext& dev_ctx,
 }
 
 template <typename GradCellType,
-          template <typename, typename> class SingleGradLayerT,
-          template <typename, typename> class BidirGradLayerT,
+          template <typename, typename>
+          class SingleGradLayerT,
+          template <typename, typename>
+          class BidirGradLayerT,
           typename T>
 void RnnGradFunc(const CPUContext& dev_ctx,
                  const DenseTensor& x,
diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc
index 80c521918ed07..e2e784b2943cc 100644
--- a/paddle/phi/kernels/cpu/rnn_kernel.cc
+++ b/paddle/phi/kernels/cpu/rnn_kernel.cc
@@ -49,7 +49,8 @@ struct Cell {
 };
 
 template <typename T,
-          template <typename> class EigenActivationFunctor,
+          template <typename>
+          class EigenActivationFunctor,
           funcs::detail::ActivationType act_type>
 struct SimpleRNNCell : Cell<T> {
   void operator()(const CPUContext* dev_ctx,
diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc
index cd779b72e7a84..cf0dc47f47bd3 100644
--- a/paddle/phi/kernels/cpu/roi_align_kernel.cc
+++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc
@@ -79,16 +79,12 @@ std::vector<OffsetsAndRatios<T>> GetIndexesAndRatios(
     for (std::size_t px = 0; px < pooled_width; px++) {
       for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) {
         // calculate x of sample points
-        auto y =
-            roi_ymin +
-            bin_h * (py +
-                     static_cast<T>(iy + .5f) / static_cast<T>(roi_bin_grid_h));
+        auto y = roi_ymin + bin_h * (py + static_cast<T>(iy + .5f) /
+                                              static_cast<T>(roi_bin_grid_h));
         for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) {
           // calculate x of sample points
-          auto x = roi_xmin +
-                   bin_w * (px +
-                            static_cast<T>(ix + .5f) /
-                                static_cast<T>(roi_bin_grid_w));
+          auto x = roi_xmin + bin_w * (px + static_cast<T>(ix + .5f) /
+                                                static_cast<T>(roi_bin_grid_w));
 
           // deal with elements out of map
           if (y < -1.0 || y > height || x < -1.0 || x > width) {
diff --git a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc
index 62fd58704c4fe..f09015f24a136 100644
--- a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/scatter_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/cpu/scatter_kernel.cc b/paddle/phi/kernels/cpu/scatter_kernel.cc
index d48ceaf29a08c..7032c3bb5a335 100644
--- a/paddle/phi/kernels/cpu/scatter_kernel.cc
+++ b/paddle/phi/kernels/cpu/scatter_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/scatter_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc
index cc143ba8d0e45..7c3665c5d2e2e 100644
--- a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/scatter_nd_add_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc
index 04ae10f5e8b5d..31e2f4c716122 100644
--- a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc
+++ b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/scatter_nd_add_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
index a5c9dc4c55e49..744ec7805fa60 100644
--- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/segment_pool_grad_kernel.h"
-#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(segment_pool_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc
index ad76a7a86bcb2..541ccd3436548 100644
--- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc
+++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/segment_pool_kernel.h"
-#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h"
 
 PD_REGISTER_KERNEL(segment_pool,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/selu_grad_kernel.cc b/paddle/phi/kernels/cpu/selu_grad_kernel.cc
index 32101b1913282..9f83e39a363d3 100644
--- a/paddle/phi/kernels/cpu/selu_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/selu_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/selu_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/sgd_kernel.cc b/paddle/phi/kernels/cpu/sgd_kernel.cc
index 214fd82bef358..055c44d38e4b2 100644
--- a/paddle/phi/kernels/cpu/sgd_kernel.cc
+++ b/paddle/phi/kernels/cpu/sgd_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/sgd_kernel.h"
+
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc
index 5fe11ffbd6d5c..9ded252c5c592 100644
--- a/paddle/phi/kernels/cpu/sign_kernel.cc
+++ b/paddle/phi/kernels/cpu/sign_kernel.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sign_kernel.h"
-#include "paddle/phi/kernels/impl/sign_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/sign_kernel_impl.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/phi/common/bfloat16.h"
diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc
index 71ebf9cdc09f7..ca8373b84889d 100644
--- a/paddle/phi/kernels/cpu/size_kernel.cc
+++ b/paddle/phi/kernels/cpu/size_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/size_kernel.h"
-#include "paddle/phi/kernels/impl/size_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/size_kernel_impl.h"
 
 PD_REGISTER_KERNEL(size,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/slice_grad_kernel.cc b/paddle/phi/kernels/cpu/slice_grad_kernel.cc
index 5c2cb3ea80e87..7e3efd217511f 100644
--- a/paddle/phi/kernels/cpu/slice_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/slice_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/slice_grad_kernel.h"
-#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(slice_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/slice_kernel.cc b/paddle/phi/kernels/cpu/slice_kernel.cc
index 736540609dd72..0f2fe98a85323 100644
--- a/paddle/phi/kernels/cpu/slice_kernel.cc
+++ b/paddle/phi/kernels/cpu/slice_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/slice_kernel.h"
-#include "paddle/phi/kernels/impl/slice_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/slice_kernel_impl.h"
 
 PD_REGISTER_KERNEL(slice,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc
index d78477073ad03..d296aba66503b 100644
--- a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc
@@ -13,12 +13,12 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h"
-#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc
index c0f95d03888b8..cfdccb5c8d9ba 100644
--- a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc
+++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/embedding_kernel.h"
-#include "paddle/phi/kernels/funcs/embedding_util.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/embedding_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc
index 56d872922490a..288cdd235aede 100644
--- a/paddle/phi/kernels/cpu/split_kernel.cc
+++ b/paddle/phi/kernels/cpu/split_kernel.cc
@@ -17,7 +17,6 @@
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
index 400f7e8783932..2aff156819748 100644
--- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
index 6721117992dd5..29be487131964 100644
--- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
+++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/temporal_shift_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
index 9dbcf575f33c1..dee69222e6dc0 100644
--- a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/transpose_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc
index 14aca258a2c71..660254fef86f6 100644
--- a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(tril_triu_grad,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc
index a3d20e55e21fb..f3599bb92b97b 100644
--- a/paddle/phi/kernels/cpu/tril_triu_kernel.cc
+++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
 
 PD_REGISTER_KERNEL(tril_triu,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc
index 4d85dd609e2d1..24fc389256222 100644
--- a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/trunc_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/kernels/cpu/trunc_kernel.cc b/paddle/phi/kernels/cpu/trunc_kernel.cc
index babae6ce7c931..5fe33ec6a4b2e 100644
--- a/paddle/phi/kernels/cpu/trunc_kernel.cc
+++ b/paddle/phi/kernels/cpu/trunc_kernel.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/trunc_kernel.h"
+
 #include <math.h>
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/trunc_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc
index c97005dd84547..6ba4ba49b9af9 100644
--- a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/unfold_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/unfold_kernel.cc b/paddle/phi/kernels/cpu/unfold_kernel.cc
index e38d8acd09820..f15201542e6c1 100644
--- a/paddle/phi/kernels/cpu/unfold_kernel.cc
+++ b/paddle/phi/kernels/cpu/unfold_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/unfold_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unfold_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc
index c95a8f4ded6dc..a09812363f1d8 100644
--- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc
+++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/uniform_random_kernel.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc
index 853b401315d22..834f05f73e228 100644
--- a/paddle/phi/kernels/cpu/unique_kernel.cc
+++ b/paddle/phi/kernels/cpu/unique_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/unique_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
diff --git a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc
index 9c2dce808dca7..c494cbc965eff 100644
--- a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/unstack_grad_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/unstack_kernel.cc b/paddle/phi/kernels/cpu/unstack_kernel.cc
index 3d233e9ec405f..4bc8d1b2c93b2 100644
--- a/paddle/phi/kernels/cpu/unstack_kernel.cc
+++ b/paddle/phi/kernels/cpu/unstack_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/unstack_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
diff --git a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc
index fab49f5416048..c98a098aa0e6f 100644
--- a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc
+++ b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc
@@ -109,7 +109,8 @@ struct Gather {
 };
 
 template <typename Context,
-          template <typename InT, typename OutT> typename CompareFunctor,
+          template <typename InT, typename OutT>
+          typename CompareFunctor,
           typename T>
 struct GetMask {
   void operator()(const Context& dev_ctx,
@@ -122,7 +123,8 @@ struct GetMask {
 };
 
 template <typename Context,
-          template <typename T> typename BinaryFunctor,
+          template <typename T>
+          typename BinaryFunctor,
           typename T>
 struct BinaryOperation {
   void operator()(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc b/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc
index 0b29336335481..7d70d825250ee 100644
--- a/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/warpctc_grad_kernel.h"
-#include "paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     warpctc_grad, CPU, ALL_LAYOUT, phi::WarpctcGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/warpctc_kernel.cc b/paddle/phi/kernels/cpu/warpctc_kernel.cc
index 4b87202c11e92..239c6cb0cbe04 100644
--- a/paddle/phi/kernels/cpu/warpctc_kernel.cc
+++ b/paddle/phi/kernels/cpu/warpctc_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/warpctc_kernel.h"
-#include "paddle/phi/kernels/impl/warpctc_kernel_impl.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/warpctc_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     warpctc, CPU, ALL_LAYOUT, phi::WarpctcKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/yolo_box_kernel.cc b/paddle/phi/kernels/cpu/yolo_box_kernel.cc
index a83bc019fc3af..6b882ad289512 100644
--- a/paddle/phi/kernels/cpu/yolo_box_kernel.cc
+++ b/paddle/phi/kernels/cpu/yolo_box_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/yolo_box_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/yolo_box_util.h"
diff --git a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc
index 383009229f9a1..655106e9cb44d 100644
--- a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/yolov3_loss_grad_kernel.h"
+
 #include <algorithm>
 #include <vector>
 
-#include "paddle/phi/kernels/yolov3_loss_grad_kernel.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cpu/yolov3_loss_functor.h"
diff --git a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc
index 8a190ab25a7b2..75b2e3c5c4a0e 100644
--- a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc
+++ b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/yolov3_loss_kernel.h"
+
 #include <algorithm>
 #include <vector>
 
-#include "paddle/phi/kernels/yolov3_loss_kernel.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cpu/yolov3_loss_functor.h"
diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h
index b3cb17b28e07f..7610cad31e327 100644
--- a/paddle/phi/kernels/cumprod_grad_kernel.h
+++ b/paddle/phi/kernels/cumprod_grad_kernel.h
@@ -25,4 +25,4 @@ void CumprodGradKernel(const Context& dev_ctx,
                        const DenseTensor& dout,
                        int dim,
                        DenseTensor* dx);
-}  // phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h
index 96d76cb0f4370..bb8b1427b30c4 100644
--- a/paddle/phi/kernels/cumprod_kernel.h
+++ b/paddle/phi/kernels/cumprod_kernel.h
@@ -23,4 +23,4 @@ void CumprodKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int dim,
                    DenseTensor* out);
-}  // phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/diagonal_kernel.h b/paddle/phi/kernels/diagonal_kernel.h
index 7cf7282307a4b..10afd7dbe920a 100644
--- a/paddle/phi/kernels/diagonal_kernel.h
+++ b/paddle/phi/kernels/diagonal_kernel.h
@@ -25,4 +25,4 @@ void DiagonalKernel(const Context& dev_ctx,
                     int axis1,
                     int axis2,
                     DenseTensor* out);
-}  // phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h
index ae5346080d30d..abd8634518d2c 100644
--- a/paddle/phi/kernels/digamma_grad_kernel.h
+++ b/paddle/phi/kernels/digamma_grad_kernel.h
@@ -24,4 +24,4 @@ void DigammaGradKernel(const Context& ctx,
                        const DenseTensor& out_grad,
                        DenseTensor* x_grad);
 
-}  // namepsace phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h
index ce25f2e148e96..3cf1eae67cc3e 100644
--- a/paddle/phi/kernels/digamma_kernel.h
+++ b/paddle/phi/kernels/digamma_kernel.h
@@ -21,4 +21,4 @@ namespace phi {
 template <typename T, typename Context>
 void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out);
 
-}  // namepsace phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc
index 06d258a8a4e80..d8cf0bd2ef90d 100644
--- a/paddle/phi/kernels/empty_kernel.cc
+++ b/paddle/phi/kernels/empty_kernel.cc
@@ -14,9 +14,8 @@
 #include "paddle/phi/kernels/empty_kernel.h"
 
 #include "paddle/phi/backends/all_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/expand_kernel.h b/paddle/phi/kernels/expand_kernel.h
index 3b44c46e4dd7c..930240db6ccca 100644
--- a/paddle/phi/kernels/expand_kernel.h
+++ b/paddle/phi/kernels/expand_kernel.h
@@ -26,4 +26,4 @@ void ExpandKernel(const Context& ctx,
                   const IntArray& shape,
                   DenseTensor* out);
 
-}  // namepsace phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc
index 83f96c1f9f521..54279fca6e429 100644
--- a/paddle/phi/kernels/flatten_grad_kernel.cc
+++ b/paddle/phi/kernels/flatten_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/flatten_grad_kernel.h"
+
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc
index f304e7706add4..dd000896073c7 100644
--- a/paddle/phi/kernels/flatten_kernel.cc
+++ b/paddle/phi/kernels/flatten_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/flatten_kernel.h"
+
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/infermeta/unary.h"
diff --git a/paddle/phi/kernels/frobenius_norm_grad_kernel.h b/paddle/phi/kernels/frobenius_norm_grad_kernel.h
index cfe8192d1a69b..65db8dd9e0a10 100644
--- a/paddle/phi/kernels/frobenius_norm_grad_kernel.h
+++ b/paddle/phi/kernels/frobenius_norm_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/frobenius_norm_kernel.h b/paddle/phi/kernels/frobenius_norm_kernel.h
index f5f37ee0c0fa5..30122cb416094 100644
--- a/paddle/phi/kernels/frobenius_norm_kernel.h
+++ b/paddle/phi/kernels/frobenius_norm_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h
index d5785f2eedafa..228e862a09c79 100644
--- a/paddle/phi/kernels/full_kernel.h
+++ b/paddle/phi/kernels/full_kernel.h
@@ -19,7 +19,6 @@
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
-
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index f80117ccec799..f481821a7bfcc 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -15,14 +15,14 @@
 #pragma once
 
 #include <glog/logging.h>
+
 #include <algorithm>
+#include <cmath>
 #include <memory>
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include <vector>
-
-#include <cmath>
 #ifndef _USE_MATH_DEFINES
 #define _USE_MATH_DEFINES
 #endif
@@ -986,9 +986,9 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
             typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
-                       .template cast<T>();
+    dx.device(d) =
+        dout * ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
+                   .template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -1054,11 +1054,10 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
           GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
       auto ddout = EigenVector<T>::Flatten(
           GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-      ddout.device(*d) =
-          ddx *
-          ((x > static_cast<T>(0)).template cast<T>() +
-           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
-              .template cast<T>();
+      ddout.device(*d) = ddx * ((x > static_cast<T>(0)).template cast<T>() +
+                                static_cast<T>(alpha) *
+                                    (x <= static_cast<T>(0)).template cast<T>())
+                                   .template cast<T>();
     }
   }
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -1290,11 +1289,10 @@ struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
     if (ddOut) {
       auto ddout = EigenVector<T>::Flatten(
           GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((x > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) * x.exp() *
-                              (x <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
+      ddout.device(*d) = ddx * ((x > static_cast<T>(0)).template cast<T>() +
+                                static_cast<T>(alpha) * x.exp() *
+                                    (x <= static_cast<T>(0)).template cast<T>())
+                                   .template cast<T>();
     }
   }
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -1980,11 +1978,10 @@ struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
     if (ddOut) {
       auto ddout = EigenVector<T>::Flatten(
           GET_DATA_SAFELY(ddOut, "Output", "DDOut", "CELUGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((x > static_cast<T>(0)).template cast<T>() +
-                          (x / static_cast<T>(alpha)).exp() *
-                              (x <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
+      ddout.device(*d) = ddx * ((x > static_cast<T>(0)).template cast<T>() +
+                                (x / static_cast<T>(alpha)).exp() *
+                                    (x <= static_cast<T>(0)).template cast<T>())
+                                   .template cast<T>();
     }
   }
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
diff --git a/paddle/phi/kernels/funcs/adam_functors.h b/paddle/phi/kernels/funcs/adam_functors.h
index 2f706f0ef1c36..b14ee7f072e4e 100644
--- a/paddle/phi/kernels/funcs/adam_functors.h
+++ b/paddle/phi/kernels/funcs/adam_functors.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
+
 #include <Eigen/Dense>
 
 #include "paddle/phi/kernels/funcs/algorithm.h"
@@ -169,9 +170,8 @@ class AdamFunctor<T, CPUAdam> {
 
     moment1_out = beta1_ * mom1 + (1 - beta1_) * g;
     moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g;
-    param_out = param -
-                lr * (moment1_out /
-                      (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow)));
+    param_out = param - lr * (moment1_out / (moment2_out.sqrt() +
+                                             epsilon_ * sqrt(1 - beta2_pow)));
   }
 };
 
diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h
index 14a9560b841fa..70f75d5352ac5 100644
--- a/paddle/phi/kernels/funcs/aligned_vector.h
+++ b/paddle/phi/kernels/funcs/aligned_vector.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/phi/core/hostdevice.h"
 #if defined(__xpu__)
 #define CHAR_BIT 8
@@ -45,11 +46,11 @@ HOSTDEVICE inline void Store(const AlignedVector<T, Size>& vec, T* addr) {
 }
 
 /*
-* Only the address of input data is the multiplier of 1,2,4, vectorized load
-* with corresponding multiplier-value is possible. Moreover, the maximum length
-* of vectorized load is 128 bits once. Hence, valid length of vectorized load
-* shall be determined under both former constraints.
-*/
+ * Only the address of input data is the multiplier of 1,2,4, vectorized load
+ * with corresponding multiplier-value is possible. Moreover, the maximum length
+ * of vectorized load is 128 bits once. Hence, valid length of vectorized load
+ * shall be determined under both former constraints.
+ */
 template <typename T>
 int GetVectorizedSize(const T* pointer) {
   constexpr int max_load_bits = 128;
@@ -60,11 +61,11 @@ int GetVectorizedSize(const T* pointer) {
   constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value;  // NOLINT
   if (address % vec8 == 0) {
     /*
-    * Currently, decide to deal with no more than 4 data once while adopting
-    * vectorization load/store, if performance test shows that dealing with
-    * 8 data once in vectorization load/store does get optimized, return code
-    * below can be changed into " return std::min(8, valid_vec_size); " .
-    */
+     * Currently, decide to deal with no more than 4 data once while adopting
+     * vectorization load/store, if performance test shows that dealing with
+     * 8 data once in vectorization load/store does get optimized, return code
+     * below can be changed into " return std::min(8, valid_vec_size); " .
+     */
     return std::min(4, valid_vec_size);
   } else if (address % vec4 == 0) {
     return std::min(4, valid_vec_size);
diff --git a/paddle/phi/kernels/funcs/blas/CMakeLists.txt b/paddle/phi/kernels/funcs/blas/CMakeLists.txt
index cb054cc76e1d7..732114f2a6e80 100644
--- a/paddle/phi/kernels/funcs/blas/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/blas/CMakeLists.txt
@@ -1 +1,4 @@
-cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
+cc_library(
+  blas
+  SRCS blas.cc
+  DEPS cblas framework_proto device_context)
diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
index e2b16a1eb7ff1..3e197a18f96b9 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
-#include "paddle/fluid/platform/dynload/cublas.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_bool(gemm_use_half_precision_compute_type);
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index ecdfa7abcfd42..88b87c07c7615 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -456,21 +456,16 @@ void LaunchBroadcastKernel(
                     read_lens * gpu_config.GetBlockSize();
   int tail_tid = numel % (read_lens * gpu_config.GetBlockSize());
 #endif
-  VectorizedBroadcastKernel<InT,
-                            OutT,
-                            Functor,
-                            Arity,
-                            NumOuts,
-                            VecSize><<<blocks, threads, 0, stream>>>(
-      ins_data,
-      outs_data,
-      use_broadcast,
-      numel,
-      configs,
-      main_offset,
-      tail_tid,
-      read_lens,
-      func);
+  VectorizedBroadcastKernel<InT, OutT, Functor, Arity, NumOuts, VecSize>
+      <<<blocks, threads, 0, stream>>>(ins_data,
+                                       outs_data,
+                                       use_broadcast,
+                                       numel,
+                                       configs,
+                                       main_offset,
+                                       tail_tid,
+                                       read_lens,
+                                       func);
 }
 
 template <ElementwiseType ET,
@@ -589,10 +584,9 @@ void BroadcastKernel(const KPDevice &ctx,
     dims_size.emplace_back(in->dims().size());
   }
 
-  axis = axis == -1
-             ? *std::max_element(dims_size.begin(), dims_size.end()) -
-                   *std::min_element(dims_size.begin(), dims_size.end())
-             : axis;
+  axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) -
+                          *std::min_element(dims_size.begin(), dims_size.end())
+                    : axis;
   BroadcastKernelForDifferentVecSize<ET, InT, OutT, Functor, NumOuts>(
       ctx, ins, outs, axis, func);
 }
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
index 06be592dd9375..5abaf6c2ffa87 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
-
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 
 namespace phi {
 namespace funcs {
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index ea256e93bba75..48858fa59390e 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -60,14 +60,12 @@ inline void ModulatedDeformableIm2colCPUKernel(
     const T* data_im_ptr =
         data_im + (b_col * num_channels + c_im) * height * width;
     const T* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const T* data_mask_ptr =
         data_mask
-            ? data_mask +
-                  (b_col * deformable_group + deformable_group_index) *
-                      kernel_h * kernel_w * height_col * width_col
+            ? data_mask + (b_col * deformable_group + deformable_group_index) *
+                              kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
     for (int i = 0; i < kernel_h; ++i) {
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
index 8bfb46c6636e9..bebea5dcb74ca 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
 
 namespace phi {
 namespace funcs {
@@ -70,14 +69,12 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
     const T* data_im_ptr =
         data_im + (b_col * num_channels + c_im) * height * width;
     const T* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const T* data_mask_ptr =
         data_mask
-            ? data_mask +
-                  (b_col * deformable_group + deformable_group_index) *
-                      kernel_h * kernel_w * height_col * width_col
+            ? data_mask + (b_col * deformable_group + deformable_group_index) *
+                              kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
     for (int i = 0; i < kernel_h; ++i) {
@@ -129,28 +126,28 @@ void ModulatedDeformableIm2col(const Context& dev_ctx,
   int blocks = NumBlocks(num_kernels);
   int threads = kNumCUDAThreads;
 
-  ModulatedDeformableIm2colGpuKernel<
-      T><<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
-                                                   data_im,
-                                                   data_offset,
-                                                   data_mask,
-                                                   im_shape[1],
-                                                   im_shape[2],
-                                                   filter_shape[2],
-                                                   filter_shape[3],
-                                                   paddings[0],
-                                                   paddings[1],
-                                                   strides[0],
-                                                   strides[1],
-                                                   dilations[0],
-                                                   dilations[1],
-                                                   channel_per_deformable_group,
-                                                   col_shape[1],
-                                                   im_shape[0],
-                                                   deformable_groups,
-                                                   col_shape[2],
-                                                   col_shape[3],
-                                                   data_col);
+  ModulatedDeformableIm2colGpuKernel<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
+                                                 data_im,
+                                                 data_offset,
+                                                 data_mask,
+                                                 im_shape[1],
+                                                 im_shape[2],
+                                                 filter_shape[2],
+                                                 filter_shape[3],
+                                                 paddings[0],
+                                                 paddings[1],
+                                                 strides[0],
+                                                 strides[1],
+                                                 dilations[0],
+                                                 dilations[1],
+                                                 channel_per_deformable_group,
+                                                 col_shape[1],
+                                                 im_shape[0],
+                                                 deformable_groups,
+                                                 col_shape[2],
+                                                 col_shape[3],
+                                                 data_col);
 }
 
 template void ModulatedDeformableIm2col(
diff --git a/paddle/phi/kernels/funcs/detail/activation_functions.h b/paddle/phi/kernels/funcs/detail/activation_functions.h
index 475557f164210..d41dca33f7571 100644
--- a/paddle/phi/kernels/funcs/detail/activation_functions.h
+++ b/paddle/phi/kernels/funcs/detail/activation_functions.h
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <math.h>
+
 #include <stdexcept>
 #include <string>
+
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/core/hostdevice.h"
 
diff --git a/paddle/phi/kernels/funcs/detail/avx_mathfun.h b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
index e5e7388d51dff..75e4922648c20 100644
--- a/paddle/phi/kernels/funcs/detail/avx_mathfun.h
+++ b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
@@ -356,11 +356,11 @@ v8sf sin256_ps(v8sf x) {  // any x
   /* scale by 4/Pi */
   y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
 
-/*
-  Here we start a series of integer operations, which are in the
-  realm of AVX2.
-  If we don't have AVX, let's perform them using SSE2 directives
-*/
+  /*
+    Here we start a series of integer operations, which are in the
+    realm of AVX2.
+    If we don't have AVX, let's perform them using SSE2 directives
+  */
 
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
diff --git a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h
index 0016bfb64c96e..0fdf490c5534d 100644
--- a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
index 6657417beac8d..93232d8f7f434 100644
--- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
+
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h
index db53fc4576daa..9e2aef1940619 100644
--- a/paddle/phi/kernels/funcs/detail/gru_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
+
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
 
diff --git a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h
index ed8e749f7fdad..02fddc57b313a 100644
--- a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
index 6d4c430d9e648..5d06dddd9645b 100644
--- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
@@ -249,27 +249,27 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context,
   if (batch_size == 1) {
     KeLstmForward<T,
                   Op,
-                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                  /* is_batch= */ false>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   } else {
     KeLstmForward<T,
                   Op,
-                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                  /* is_batch= */ true>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   }
 }
 
@@ -303,29 +303,29 @@ void gpu_lstm_backward(const paddle::platform::DeviceContext& context,
   if (batch_size == 1) {
     KeLstmBackward<T,
                    Op,
-                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        grad,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                   /* is_batch= */ false>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       grad,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   } else {
     KeLstmBackward<T,
                    Op,
-                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        grad,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                   /* is_batch= */ true>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       grad,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   }
 }
 
diff --git a/paddle/phi/kernels/funcs/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h
index 8b42926412525..0846f05a0c2c5 100644
--- a/paddle/phi/kernels/funcs/detail/lstm_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <type_traits>
+
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
 
diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h
index 19a93970d090a..81525cb25449e 100644
--- a/paddle/phi/kernels/funcs/diagonal.h
+++ b/paddle/phi/kernels/funcs/diagonal.h
@@ -17,6 +17,7 @@
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 #endif
 
diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h
index 68e986c334ecb..0e6b3a3f9d733 100644
--- a/paddle/phi/kernels/funcs/distribution_helper.h
+++ b/paddle/phi/kernels/funcs/distribution_helper.h
@@ -319,10 +319,9 @@ void distribution_and_transform(const GPUContext &ctx,
   uint64_t seed = seed_offset.first;
   uint64_t offset = seed_offset.second;
 
-  DistributionKernel<T,
-                     DistOp,
-                     TransformOp><<<grid_size, block_size, 0, ctx.stream()>>>(
-      size, seed, offset, dist, trans, out_data, total_thread);
+  DistributionKernel<T, DistOp, TransformOp>
+      <<<grid_size, block_size, 0, ctx.stream()>>>(
+          size, seed, offset, dist, trans, out_data, total_thread);
 }
 
 #endif
diff --git a/paddle/phi/kernels/funcs/eigen/CMakeLists.txt b/paddle/phi/kernels/funcs/eigen/CMakeLists.txt
index 8b64e35b93526..de771f12fbfe2 100644
--- a/paddle/phi/kernels/funcs/eigen/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/eigen/CMakeLists.txt
@@ -1,9 +1,24 @@
-file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu")
+file(
+  GLOB EIGEN_CC_SOURCES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*.cc")
+file(
+  GLOB EIGEN_CU_SOURCES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "*.cu")
 if(WITH_GPU)
-  nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
+  nv_library(
+    eigen_function
+    SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES}
+    DEPS eigen3)
 elseif(WITH_ROCM)
-  hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3)
+  hip_library(
+    eigen_function
+    SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES}
+    DEPS eigen3)
 else()
-  cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3)
+  cc_library(
+    eigen_function
+    SRCS ${EIGEN_CC_SOURCES}
+    DEPS eigen3)
 endif()
diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h
index fbb9d8e3d2ef5..c724564417b19 100644
--- a/paddle/phi/kernels/funcs/eigen/extensions.h
+++ b/paddle/phi/kernels/funcs/eigen/extensions.h
@@ -20,7 +20,6 @@
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
-
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h
index 1093bdfa726c8..71dfbc206a191 100644
--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -494,7 +494,7 @@ template <template <int Index, int VecSize> typename Func,
           int Begin = 0>
 struct Unroller {
   template <typename... Args>
-  static HOSTDEVICE inline void step(Args &&... args) {
+  static HOSTDEVICE inline void step(Args &&...args) {
     Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
     Unroller<Func, VecSize, End, Begin + 1>::step(args...);
   }
@@ -503,7 +503,7 @@ struct Unroller {
 template <template <int Index, int VecSize> typename Func, int VecSize, int End>
 struct Unroller<Func, VecSize, End, End> {
   template <typename... Args>
-  static HOSTDEVICE inline void step(Args &&... args) {}
+  static HOSTDEVICE inline void step(Args &&...args) {}
 };
 
 template <int Index, int VecSize>
@@ -818,23 +818,18 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int grid_size = 8;
   auto stream = ctx.x_context()->xpu_stream;
   int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
-  VectorizedElementwiseKernel<OutT,
-                              Functor,
-                              Arity,
-                              NumOuts,
-                              VecSize><<<grid_size, block_size, 0, stream>>>(
-      ins_data, outs_data, numel, main_offset, func);
+  VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
+      <<<grid_size, block_size, 0, stream>>>(
+          ins_data, outs_data, numel, main_offset, func);
 #else
   auto gpu_config =
       phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
   int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
                     gpu_config.GetBlockSize();
   auto stream = ctx.stream();
-  VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize><<<
-      gpu_config.block_per_grid,
-      gpu_config.thread_per_block,
-      0,
-      stream>>>(ins_data, outs_data, numel, main_offset, func);
+  VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
+      <<<gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
+          ins_data, outs_data, numel, main_offset, func);
 #endif
 }
 
diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h
index 70b11bc8c90b2..e30ab8716b608 100644
--- a/paddle/phi/kernels/funcs/elementwise_functor.h
+++ b/paddle/phi/kernels/funcs/elementwise_functor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/phi/core/hostdevice.h"
 #if defined(__xpu__)
 #include <xpu/runtime.h>
+
 #include "xpu/kernel/math_xpu2.h"  //pow()
 #endif
 
diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index 7508d8ee8cdc8..9ca21b967a414 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -1314,8 +1314,9 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
     }
   };
 
-  auto FastBroadCastAllCUDAF = [&](
-      const std::vector<int> &broadcast_pos, int max_dim, bool is_x_large) {
+  auto FastBroadCastAllCUDAF = [&](const std::vector<int> &broadcast_pos,
+                                   int max_dim,
+                                   bool is_x_large) {
     int axis = broadcast_pos[0];
     int pre = std::accumulate(
         out_dims_array, out_dims_array + axis, 1, std::multiplies<int>());
@@ -1361,85 +1362,85 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
         dy_data);
   };
 
-  auto FastBroadCastOneCUDAF = [&](
-      const std::vector<int> &broadcast_pos, int max_dim, bool is_x) {
-    int axis = broadcast_pos[0];
-    int pre = std::accumulate(
-        out_dims_array, out_dims_array + axis, 1, std::multiplies<int>());
-    int mid = out_dims_array[axis];
-    int post = std::accumulate(out_dims_array + axis + 1,
-                               out_dims_array + max_dim,
-                               1,
-                               std::multiplies<int>());
-
-    int k_pre;
-    int k_mid;
-    int k_post;
-
-    if (is_x) {
-      k_pre = std::accumulate(
-          y_dims_array, y_dims_array + axis, 1, std::multiplies<int>());
-      k_mid = y_dims_array[axis];
-      k_post = std::accumulate(y_dims_array + axis + 1,
-                               y_dims_array + max_dim,
-                               1,
-                               std::multiplies<int>());
-      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
-      dim3 grid_size = dim3(pre * post);
-      paddle::platform::LimitGridDim(ctx, &grid_size);
-      // we need to calc y offset with blockid, so do x_pre/y_pre to get left
-      // size.
-      if (k_pre != pre) k_pre = pre / k_pre;
-
-      FastCommonGradBroadcastOneCUDAKernel<<<grid_size,
-                                             block_size,
-                                             0,
-                                             stream>>>(x_data,
-                                                       y_data,
-                                                       out_data,
-                                                       dout_data,
-                                                       pre,
-                                                       mid,
-                                                       post,
-                                                       k_pre,
-                                                       k_mid,
-                                                       k_post,
-                                                       true,
-                                                       dx_op,
-                                                       dx_data);
-    } else {
-      k_pre = std::accumulate(
-          x_dims_array, x_dims_array + axis, 1, std::multiplies<int>());
-      k_mid = x_dims_array[axis];
-      k_post = std::accumulate(x_dims_array + axis + 1,
-                               x_dims_array + max_dim,
-                               1,
-                               std::multiplies<int>());
-      int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
-      dim3 grid_size = dim3(pre * post);
-      paddle::platform::LimitGridDim(ctx, &grid_size);
-      if (k_pre != pre) k_pre = pre / k_pre;
-
-      FastCommonGradBroadcastOneCUDAKernel<<<grid_size,
-                                             block_size,
-                                             0,
-                                             stream>>>(x_data,
-                                                       y_data,
-                                                       out_data,
-                                                       dout_data,
-                                                       pre,
-                                                       mid,
-                                                       post,
-                                                       k_pre,
-                                                       k_mid,
-                                                       k_post,
-                                                       false,
-                                                       dy_op,
-                                                       dy_data);
-    }
-    VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid
-            << " post:" << post;
-  };
+  auto FastBroadCastOneCUDAF =
+      [&](const std::vector<int> &broadcast_pos, int max_dim, bool is_x) {
+        int axis = broadcast_pos[0];
+        int pre = std::accumulate(
+            out_dims_array, out_dims_array + axis, 1, std::multiplies<int>());
+        int mid = out_dims_array[axis];
+        int post = std::accumulate(out_dims_array + axis + 1,
+                                   out_dims_array + max_dim,
+                                   1,
+                                   std::multiplies<int>());
+
+        int k_pre;
+        int k_mid;
+        int k_post;
+
+        if (is_x) {
+          k_pre = std::accumulate(
+              y_dims_array, y_dims_array + axis, 1, std::multiplies<int>());
+          k_mid = y_dims_array[axis];
+          k_post = std::accumulate(y_dims_array + axis + 1,
+                                   y_dims_array + max_dim,
+                                   1,
+                                   std::multiplies<int>());
+          int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
+          dim3 grid_size = dim3(pre * post);
+          paddle::platform::LimitGridDim(ctx, &grid_size);
+          // we need to calc y offset with blockid, so do x_pre/y_pre to get
+          // left size.
+          if (k_pre != pre) k_pre = pre / k_pre;
+
+          FastCommonGradBroadcastOneCUDAKernel<<<grid_size,
+                                                 block_size,
+                                                 0,
+                                                 stream>>>(x_data,
+                                                           y_data,
+                                                           out_data,
+                                                           dout_data,
+                                                           pre,
+                                                           mid,
+                                                           post,
+                                                           k_pre,
+                                                           k_mid,
+                                                           k_post,
+                                                           true,
+                                                           dx_op,
+                                                           dx_data);
+        } else {
+          k_pre = std::accumulate(
+              x_dims_array, x_dims_array + axis, 1, std::multiplies<int>());
+          k_mid = x_dims_array[axis];
+          k_post = std::accumulate(x_dims_array + axis + 1,
+                                   x_dims_array + max_dim,
+                                   1,
+                                   std::multiplies<int>());
+          int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid);
+          dim3 grid_size = dim3(pre * post);
+          paddle::platform::LimitGridDim(ctx, &grid_size);
+          if (k_pre != pre) k_pre = pre / k_pre;
+
+          FastCommonGradBroadcastOneCUDAKernel<<<grid_size,
+                                                 block_size,
+                                                 0,
+                                                 stream>>>(x_data,
+                                                           y_data,
+                                                           out_data,
+                                                           dout_data,
+                                                           pre,
+                                                           mid,
+                                                           post,
+                                                           k_pre,
+                                                           k_mid,
+                                                           k_post,
+                                                           false,
+                                                           dy_op,
+                                                           dy_data);
+        }
+        VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid
+                << " post:" << post;
+      };
 
   // do fast elementwise if: 1. only one input need to do broadcast, we can
   // fallback
@@ -1571,23 +1572,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                          x_dims_order.data(),
                          bytes,
                          ctx.stream());
-    CommonGradBroadcastCUDAKernel<
-        T,
-        DX_OP,
-        Tout><<<x_blocks, x_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
-                                                           y_strides_array_gpu,
-                                                           out_dims_array_gpu,
-                                                           x_strides_order_gpu,
-                                                           x_dims_order_gpu,
-                                                           x_data,
-                                                           y_data,
-                                                           out_data,
-                                                           dout_data,
-                                                           dx_data,
-                                                           out_size,
-                                                           max_dim,
-                                                           x_threads,
-                                                           dx_op);
+    CommonGradBroadcastCUDAKernel<T, DX_OP, Tout>
+        <<<x_blocks, x_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
+                                                      y_strides_array_gpu,
+                                                      out_dims_array_gpu,
+                                                      x_strides_order_gpu,
+                                                      x_dims_order_gpu,
+                                                      x_data,
+                                                      y_data,
+                                                      out_data,
+                                                      dout_data,
+                                                      dx_data,
+                                                      out_size,
+                                                      max_dim,
+                                                      x_threads,
+                                                      dx_op);
   }
   if (dy) {
     auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes);
@@ -1608,23 +1607,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                          y_dims_order.data(),
                          bytes,
                          ctx.stream());
-    CommonGradBroadcastCUDAKernel<
-        T,
-        DY_OP,
-        Tout><<<y_blocks, y_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
-                                                           y_strides_array_gpu,
-                                                           out_dims_array_gpu,
-                                                           y_strides_order_gpu,
-                                                           y_dims_order_gpu,
-                                                           x_data,
-                                                           y_data,
-                                                           out_data,
-                                                           dout_data,
-                                                           dy_data,
-                                                           out_size,
-                                                           max_dim,
-                                                           y_threads,
-                                                           dy_op);
+    CommonGradBroadcastCUDAKernel<T, DY_OP, Tout>
+        <<<y_blocks, y_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
+                                                      y_strides_array_gpu,
+                                                      out_dims_array_gpu,
+                                                      y_strides_order_gpu,
+                                                      y_dims_order_gpu,
+                                                      x_data,
+                                                      y_data,
+                                                      out_data,
+                                                      dout_data,
+                                                      dy_data,
+                                                      out_size,
+                                                      max_dim,
+                                                      y_threads,
+                                                      dy_op);
   }
 }
 
diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc
index e14f8522c969a..0fb38c971abf5 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cc
+++ b/paddle/phi/kernels/funcs/fc_functor.cc
@@ -78,15 +78,14 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
         errors::PermissionDenied("When bias is NULL, relu can not be true."));
     return;
   }
-  auto compute = relu
-                     ? paddle::operators::jit::KernelFuncs<
-                           paddle::operators::jit::VAddReluTuple<T>,
-                           paddle::platform::CPUPlace>::Cache()
-                           .At(N)
-                     : paddle::operators::jit::KernelFuncs<
-                           paddle::operators::jit::VAddTuple<T>,
-                           paddle::platform::CPUPlace>::Cache()
-                           .At(N);
+  auto compute = relu ? paddle::operators::jit::KernelFuncs<
+                            paddle::operators::jit::VAddReluTuple<T>,
+                            paddle::platform::CPUPlace>::Cache()
+                            .At(N)
+                      : paddle::operators::jit::KernelFuncs<
+                            paddle::operators::jit::VAddTuple<T>,
+                            paddle::platform::CPUPlace>::Cache()
+                            .At(N);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
index a26f0edcab272..b441ad581793d 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cu
+++ b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
@@ -126,15 +127,11 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
     const int threads = 256;
     const int blocks = M;
     if (relu) {
-      InplaceAddReluKernel<T,
-                           true,
-                           threads><<<blocks, threads, 0, context.stream()>>>(
-          N, B, Y);
+      InplaceAddReluKernel<T, true, threads>
+          <<<blocks, threads, 0, context.stream()>>>(N, B, Y);
     } else {
-      InplaceAddReluKernel<T,
-                           false,
-                           threads><<<blocks, threads, 0, context.stream()>>>(
-          N, B, Y);
+      InplaceAddReluKernel<T, false, threads>
+          <<<blocks, threads, 0, context.stream()>>>(N, B, Y);
     }
   }
 }
diff --git a/paddle/phi/kernels/funcs/fc_functor.h b/paddle/phi/kernels/funcs/fc_functor.h
index 3c759acb194b0..e5ed0d709cdc9 100644
--- a/paddle/phi/kernels/funcs/fc_functor.h
+++ b/paddle/phi/kernels/funcs/fc_functor.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/platform/device_context.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h
index 617d249308cda..fbffd0c1e2b8a 100644
--- a/paddle/phi/kernels/funcs/gather.cu.h
+++ b/paddle/phi/kernels/funcs/gather.cu.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/memory/memcpy.h"
 // TODO(paddle-dev): move gpu_primitives.h to phi
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
@@ -260,17 +261,16 @@ void GatherV2CUDAFunction(const DenseTensor* input,
 
   auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
   auto stream = ctx.stream();
-  GatherGPUKernel<
-      T,
-      U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-      input_data,
-      index_data,
-      out_data,
-      outer_dim_size,
-      inner_dim_size,
-      index_size,
-      index_dim_size,
-      out_size);
+  GatherGPUKernel<T, U>
+      <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
+          input_data,
+          index_data,
+          out_data,
+          outer_dim_size,
+          inner_dim_size,
+          index_size,
+          index_dim_size,
+          out_size);
 }
 
 template <typename T, typename U>
@@ -306,17 +306,16 @@ void GatherV2GradCUDAFunction(const DenseTensor* input,
 
   auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, input_size);
   auto stream = ctx.stream();
-  GatherGradGPUKernel<
-      T,
-      U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-      input_data,
-      index_data,
-      out_data,
-      outer_dim_size,
-      inner_dim_size,
-      input_index_dim_size,
-      out_index_dim_size,
-      input_size);
+  GatherGradGPUKernel<T, U>
+      <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
+          input_data,
+          index_data,
+          out_data,
+          outer_dim_size,
+          inner_dim_size,
+          input_index_dim_size,
+          out_index_dim_size,
+          input_size);
 }
 
 }  // namespace funcs
diff --git a/paddle/phi/kernels/funcs/gather.h b/paddle/phi/kernels/funcs/gather.h
index 740042c999aa9..094bc46cb6f45 100644
--- a/paddle/phi/kernels/funcs/gather.h
+++ b/paddle/phi/kernels/funcs/gather.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <memory.h>
+
 #include <cstring>
 #include <vector>
 
diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu
index 7666206b7f7f4..bbc3fdaeeacec 100644
--- a/paddle/phi/kernels/funcs/gru_compute.cu
+++ b/paddle/phi/kernels/funcs/gru_compute.cu
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <paddle/fluid/platform/device_context.h>
+
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h"
 #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
@@ -37,57 +38,49 @@ struct GRUUnitFunctor<paddle::platform::CUDADeviceContext, T> {
           int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
           threads = dim3(tiled_size, 1);
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruGate<
-              T,
-              tiled_size><<<grid, threads, 0, stream>>>(
-              value.gate_value,
-              value.prev_out_value,
-              value.gate_weight,
-              value.reset_output_value,
-              frame_size,
-              active_gate);
+          detail::KeFastCollectiveGruGate<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(value.gate_value,
+                                             value.prev_out_value,
+                                             value.gate_weight,
+                                             value.reset_output_value,
+                                             frame_size,
+                                             active_gate);
 
           frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruOut<
-              T,
-              tiled_size><<<grid, threads, 0, stream>>>(
-              value.state_weight,
-              value.prev_out_value,
-              value.output_value,
-              value.gate_value,
-              value.reset_output_value,
-              frame_size,
-              active_node,
-              origin_mode);
+          detail::KeFastCollectiveGruOut<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(value.state_weight,
+                                             value.prev_out_value,
+                                             value.output_value,
+                                             value.gate_value,
+                                             value.reset_output_value,
+                                             frame_size,
+                                             active_node,
+                                             origin_mode);
         } else {
           constexpr int tiled_size = 16;
           int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
           threads = dim3(tiled_size, 1);
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruGate<
-              T,
-              tiled_size><<<grid, threads, 0, stream>>>(
-              value.gate_value,
-              value.prev_out_value,
-              value.gate_weight,
-              value.reset_output_value,
-              frame_size,
-              active_gate);
+          detail::KeFastCollectiveGruGate<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(value.gate_value,
+                                             value.prev_out_value,
+                                             value.gate_weight,
+                                             value.reset_output_value,
+                                             frame_size,
+                                             active_gate);
 
           frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
           grid = dim3(frame_blocks, 1);
-          detail::KeFastCollectiveGruOut<
-              T,
-              tiled_size><<<grid, threads, 0, stream>>>(
-              value.state_weight,
-              value.prev_out_value,
-              value.output_value,
-              value.gate_value,
-              value.reset_output_value,
-              frame_size,
-              active_node,
-              origin_mode);
+          detail::KeFastCollectiveGruOut<T, tiled_size>
+              <<<grid, threads, 0, stream>>>(value.state_weight,
+                                             value.prev_out_value,
+                                             value.output_value,
+                                             value.gate_value,
+                                             value.reset_output_value,
+                                             frame_size,
+                                             active_node,
+                                             origin_mode);
         }
         return;
       } else {
diff --git a/paddle/phi/kernels/funcs/inclusive_scan.h b/paddle/phi/kernels/funcs/inclusive_scan.h
index b285c5bdbbfc0..0f97b244bf0ba 100644
--- a/paddle/phi/kernels/funcs/inclusive_scan.h
+++ b/paddle/phi/kernels/funcs/inclusive_scan.h
@@ -24,6 +24,7 @@ namespace cub = hipcub;
 
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/reverse_iterator.h>
+
 #include "paddle/phi/common/type_traits.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
@@ -214,21 +215,21 @@ static void InclusiveScanInnerDim(const T *x,
   grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
   dim3 thread_dims(kThreadNumX, kThreadNumY);
   if (reverse) {
-    InclusiveScanInnerDimCUDAKernel<
-        T,
-        BinaryOp,
-        kThreadNumX,
-        kThreadNumY,
-        /*kReverse=*/true><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
-        x, y, outer_dim, inner_dim, init, op);
+    InclusiveScanInnerDimCUDAKernel<T,
+                                    BinaryOp,
+                                    kThreadNumX,
+                                    kThreadNumY,
+                                    /*kReverse=*/true>
+        <<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
+            x, y, outer_dim, inner_dim, init, op);
   } else {
-    InclusiveScanInnerDimCUDAKernel<
-        T,
-        BinaryOp,
-        kThreadNumX,
-        kThreadNumY,
-        /*kReverse=*/false><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
-        x, y, outer_dim, inner_dim, init, op);
+    InclusiveScanInnerDimCUDAKernel<T,
+                                    BinaryOp,
+                                    kThreadNumX,
+                                    kThreadNumY,
+                                    /*kReverse=*/false>
+        <<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
+            x, y, outer_dim, inner_dim, init, op);
   }
 }
 
diff --git a/paddle/phi/kernels/funcs/index_impl.cu.h b/paddle/phi/kernels/funcs/index_impl.cu.h
index ccb70fe25ddce..f90380bef70bd 100644
--- a/paddle/phi/kernels/funcs/index_impl.cu.h
+++ b/paddle/phi/kernels/funcs/index_impl.cu.h
@@ -71,16 +71,16 @@ void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) {
   size_t main_offset = (numel / (vec_size * block)) * vec_size * block;
   switch (vec_size) {
     case 4:
-      VectorizedIndexKernel<T, Functor, 4><<<grid, block, 0, stream>>>(
-          out_data, numel, main_offset, func);
+      VectorizedIndexKernel<T, Functor, 4>
+          <<<grid, block, 0, stream>>>(out_data, numel, main_offset, func);
       break;
     case 2:
-      VectorizedIndexKernel<T, Functor, 2><<<grid, block, 0, stream>>>(
-          out_data, numel, main_offset, func);
+      VectorizedIndexKernel<T, Functor, 2>
+          <<<grid, block, 0, stream>>>(out_data, numel, main_offset, func);
       break;
     case 1:
-      VectorizedIndexKernel<T, Functor, 1><<<grid, block, 0, stream>>>(
-          out_data, numel, main_offset, func);
+      VectorizedIndexKernel<T, Functor, 1>
+          <<<grid, block, 0, stream>>>(out_data, numel, main_offset, func);
       break;
     default: {
       PADDLE_THROW(phi::errors::Unimplemented(
diff --git a/paddle/phi/kernels/funcs/lapack/lapack_function.cc b/paddle/phi/kernels/funcs/lapack/lapack_function.cc
index 0f887dce4b4da..247bb52153c3e 100644
--- a/paddle/phi/kernels/funcs/lapack/lapack_function.cc
+++ b/paddle/phi/kernels/funcs/lapack/lapack_function.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
+
 #include "paddle/phi/backends/dynload/lapack.h"
 #include "paddle/phi/common/complex.h"
 
diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc
index afa2214f5b9df..25f222546656f 100644
--- a/paddle/phi/kernels/funcs/math_function.cc
+++ b/paddle/phi/kernels/funcs/math_function.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <memory>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu
index df2af82d551ee..42ba0ba7113e3 100644
--- a/paddle/phi/kernels/funcs/math_function.cu
+++ b/paddle/phi/kernels/funcs/math_function.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -227,14 +228,14 @@ struct TransposeNormal<phi::GPUContext, T> {
                          : (1 << static_cast<int>(std::log2(elements)));
     int grid_size = elements / block_size;
     grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
-    TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
-        in_ptr,
-        out_ptr,
-        elements,
-        in_stride_ptr,
-        out_stride_ptr,
-        axis_ptr,
-        rank);
+    TransposeNormalKernel<T>
+        <<<grid_size, block_size, 0, context.stream()>>>(in_ptr,
+                                                         out_ptr,
+                                                         elements,
+                                                         in_stride_ptr,
+                                                         out_stride_ptr,
+                                                         axis_ptr,
+                                                         rank);
   }
 };
 
diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h
index 1638d03e50f95..7c337e6c0dba9 100644
--- a/paddle/phi/kernels/funcs/math_function_impl.h
+++ b/paddle/phi/kernels/funcs/math_function_impl.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
index 686b8405bf750..eef355e688435 100644
--- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc
@@ -14,10 +14,9 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/funcs/matrix_inverse.h"
 
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
 
 namespace phi {
 namespace funcs {
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h
index 1c6756f1720a2..f0cd265a54648 100644
--- a/paddle/phi/kernels/funcs/matrix_inverse.h
+++ b/paddle/phi/kernels/funcs/matrix_inverse.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "Eigen/Core"
 #include "Eigen/LU"
-
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
diff --git a/paddle/phi/kernels/funcs/padding.h b/paddle/phi/kernels/funcs/padding.h
index e2c4e766b605b..d6faa5f824c0d 100644
--- a/paddle/phi/kernels/funcs/padding.h
+++ b/paddle/phi/kernels/funcs/padding.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <utility>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/funcs/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc
index 10c88b9798c6f..acc9a9c095cd4 100644
--- a/paddle/phi/kernels/funcs/pooling.cc
+++ b/paddle/phi/kernels/funcs/pooling.cc
@@ -16,18 +16,19 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 
 namespace phi {
 namespace funcs {
 
 /*
-* Tensors are in NCHW or NHWC format.
-* Ksize, strides are two elements. These two elements represent height
-* and width, respectively.
-* Paddings are four elements. These four elements represent height_up,
-* height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCHW or NHWC format.
+ * Ksize, strides are two elements. These two elements represent height
+ * and width, respectively.
+ * Paddings are four elements. These four elements represent height_up,
+ * height_down, width_left and width_right, respectively.
+ */
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<CPUContext, PoolProcess, T> {
  public:
@@ -248,12 +249,12 @@ class Pool2dFunctor<CPUContext, PoolProcess, T> {
 };
 
 /*
-* tensors are in NCHW or NHWC format.
-* Ksize, strides are two elements. These two elements represent height
-* and width, respectively.
-* Paddings are four elements. These four elements represent height_up,
-* height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCHW or NHWC format.
+ * Ksize, strides are two elements. These two elements represent height
+ * and width, respectively.
+ * Paddings are four elements. These four elements represent height_up,
+ * height_down, width_left and width_right, respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool2dGradFunctor<CPUContext, PoolProcess, T> {
  public:
@@ -492,12 +493,12 @@ class Pool2dGradFunctor<CPUContext, PoolProcess, T> {
 };
 
 /*
-* Tensors are in NCHW or NHWC format.
-* Ksize, strides are two elements. These two elements represent height
-* and width, respectively.
-* Paddings are four elements. These four elements represent height_up,
-* height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCHW or NHWC format.
+ * Ksize, strides are two elements. These two elements represent height
+ * and width, respectively.
+ * Paddings are four elements. These four elements represent height_up,
+ * height_down, width_left and width_right, respectively.
+ */
 template <class T>
 class MaxPool2dGradFunctor<CPUContext, T> {
  public:
@@ -682,13 +683,13 @@ template class Pool2dGradFunctor<CPUContext, MaxPoolGrad<double>, double>;
 template class Pool2dGradFunctor<CPUContext, AvgPoolGrad<double>, double>;
 
 /*
-* Tensors are in NCDHW or NDHWC format.
-* Ksize, strides, paddings are three elements. These three elements represent
-* depth, height and width, respectively.
-* Paddings are six elements. These six elements represent depth_forth,
-* depth_back,
-* height_up, height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCDHW or NDHWC format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ * Paddings are six elements. These six elements represent depth_forth,
+ * depth_back, height_up, height_down, width_left and width_right,
+ * respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool3dFunctor<CPUContext, PoolProcess, T> {
  public:
@@ -981,13 +982,13 @@ class Pool3dFunctor<CPUContext, PoolProcess, T> {
 };
 
 /*
-* Tensors are in NCDHW or NDHWC format.
-* Ksize, strides, paddings are three elements. These three elements represent
-* depth, height and width, respectively.
-* Paddings are six elements. These six elements represent depth_forth,
-* depth_back,
-* height_up, height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCDHW or NDHWC format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ * Paddings are six elements. These six elements represent depth_forth,
+ * depth_back, height_up, height_down, width_left and width_right,
+ * respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool3dGradFunctor<CPUContext, PoolProcess, T> {
  public:
@@ -1298,13 +1299,13 @@ class Pool3dGradFunctor<CPUContext, PoolProcess, T> {
 };
 
 /*
-* Tensors are in NCDHW or NDHWC format.
-* Ksize, strides, paddings are three elements. These three elements represent
-* depth, height and width, respectively.
-* Paddings are six elements. These six elements represent depth_forth,
-* depth_back,
-* height_up, height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCDHW or NDHWC format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ * Paddings are six elements. These six elements represent depth_forth,
+ * depth_back, height_up, height_down, width_left and width_right,
+ * respectively.
+ */
 template <class T>
 class MaxPool3dGradFunctor<CPUContext, T> {
  public:
diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu
index b0e68abc08a57..6e4fc414afd4a 100644
--- a/paddle/phi/kernels/funcs/pooling.cu
+++ b/paddle/phi/kernels/funcs/pooling.cu
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/funcs/pooling.h"
-
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
 
 namespace phi {
 namespace funcs {
@@ -468,25 +468,25 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
 
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
-    KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        input_channels,
-        input_height,
-        input_width,
-        output_height,
-        output_width,
-        ksize_height,
-        ksize_width,
-        stride_height,
-        stride_width,
-        padding_height,
-        padding_width,
-        pool_divmods,
-        pool_process,
-        exclusive,
-        adaptive,
-        output_data);
+    KernelPool2D<PoolProcess, T>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 input_channels,
+                                                 input_height,
+                                                 input_width,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_height,
+                                                 padding_width,
+                                                 pool_divmods,
+                                                 pool_process,
+                                                 exclusive,
+                                                 adaptive,
+                                                 output_data);
   }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
@@ -535,26 +535,26 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
 
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
-    KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        input_channels,
-        input_height,
-        input_width,
-        output_height,
-        output_width,
-        ksize_height,
-        ksize_width,
-        stride_height,
-        stride_width,
-        padding_height,
-        padding_width,
-        pool_divmods,
-        pool_process,
-        exclusive,
-        adaptive,
-        output_data,
-        channel_last);
+    KernelPool2D<PoolProcess, T>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 input_channels,
+                                                 input_height,
+                                                 input_width,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_height,
+                                                 padding_width,
+                                                 pool_divmods,
+                                                 pool_process,
+                                                 exclusive,
+                                                 adaptive,
+                                                 output_data,
+                                                 channel_last);
   }
 };
 /*
@@ -748,24 +748,24 @@ class MaxPool2dGradFunctor<phi::GPUContext, T> {
 
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
-    KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        output_data,
-        output_grad_data,
-        input_channels,
-        input_height,
-        input_width,
-        output_height,
-        output_width,
-        ksize_height,
-        ksize_width,
-        stride_height,
-        stride_width,
-        padding_height,
-        padding_width,
-        input_grad_data,
-        pool_divmods);
+    KernelMaxPool2DGrad<T>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 output_data,
+                                                 output_grad_data,
+                                                 input_channels,
+                                                 input_height,
+                                                 input_width,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_height,
+                                                 padding_width,
+                                                 input_grad_data,
+                                                 pool_divmods);
   }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
@@ -812,25 +812,25 @@ class MaxPool2dGradFunctor<phi::GPUContext, T> {
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
 
-    KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        output_data,
-        output_grad_data,
-        input_channels,
-        input_height,
-        input_width,
-        output_height,
-        output_width,
-        ksize_height,
-        ksize_width,
-        stride_height,
-        stride_width,
-        padding_height,
-        padding_width,
-        input_grad_data,
-        pool_divmods,
-        channel_last);
+    KernelMaxPool2DGrad<T>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 output_data,
+                                                 output_grad_data,
+                                                 input_channels,
+                                                 input_height,
+                                                 input_width,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_height,
+                                                 padding_width,
+                                                 input_grad_data,
+                                                 pool_divmods,
+                                                 channel_last);
   }
 };
 
@@ -1299,29 +1299,29 @@ class Pool3dFunctor<phi::GPUContext, PoolProcess, T> {
     dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
-    KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        input_channels,
-        input_depth,
-        input_height,
-        input_width,
-        output_depth,
-        output_height,
-        output_width,
-        ksize_depth,
-        ksize_height,
-        ksize_width,
-        stride_depth,
-        stride_height,
-        stride_width,
-        padding_depth,
-        padding_height,
-        padding_width,
-        pool_process,
-        exclusive,
-        adaptive,
-        output_data);
+    KernelPool3D<PoolProcess, T>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 input_channels,
+                                                 input_depth,
+                                                 input_height,
+                                                 input_width,
+                                                 output_depth,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_depth,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_depth,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_depth,
+                                                 padding_height,
+                                                 padding_width,
+                                                 pool_process,
+                                                 exclusive,
+                                                 adaptive,
+                                                 output_data);
   }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
@@ -1375,30 +1375,30 @@ class Pool3dFunctor<phi::GPUContext, PoolProcess, T> {
     dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
-    KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        input_channels,
-        input_depth,
-        input_height,
-        input_width,
-        output_depth,
-        output_height,
-        output_width,
-        ksize_depth,
-        ksize_height,
-        ksize_width,
-        stride_depth,
-        stride_height,
-        stride_width,
-        padding_depth,
-        padding_height,
-        padding_width,
-        pool_process,
-        exclusive,
-        adaptive,
-        output_data,
-        channel_last);
+    KernelPool3D<PoolProcess, T>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 input_channels,
+                                                 input_depth,
+                                                 input_height,
+                                                 input_width,
+                                                 output_depth,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_depth,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_depth,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_depth,
+                                                 padding_height,
+                                                 padding_width,
+                                                 pool_process,
+                                                 exclusive,
+                                                 adaptive,
+                                                 output_data,
+                                                 channel_last);
   }
 };
 
@@ -1454,31 +1454,31 @@ class Pool3dGradFunctor<phi::GPUContext, PoolProcess, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelPool3DGrad<T, PoolProcess><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        output_data,
-        output_grad_data,
-        input_channels,
-        input_depth,
-        input_height,
-        input_width,
-        output_depth,
-        output_height,
-        output_width,
-        ksize_depth,
-        ksize_height,
-        ksize_width,
-        stride_depth,
-        stride_height,
-        stride_width,
-        padding_depth,
-        padding_height,
-        padding_width,
-        pool_process,
-        exclusive,
-        adaptive,
-        input_grad_data);
+    KernelPool3DGrad<T, PoolProcess>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 output_data,
+                                                 output_grad_data,
+                                                 input_channels,
+                                                 input_depth,
+                                                 input_height,
+                                                 input_width,
+                                                 output_depth,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_depth,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_depth,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_depth,
+                                                 padding_height,
+                                                 padding_width,
+                                                 pool_process,
+                                                 exclusive,
+                                                 adaptive,
+                                                 input_grad_data);
   }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
@@ -1608,28 +1608,28 @@ class MaxPool3dGradFunctor<phi::GPUContext, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        output_data,
-        output_grad_data,
-        input_channels,
-        input_depth,
-        input_height,
-        input_width,
-        output_depth,
-        output_height,
-        output_width,
-        ksize_depth,
-        ksize_height,
-        ksize_width,
-        stride_depth,
-        stride_height,
-        stride_width,
-        padding_depth,
-        padding_height,
-        padding_width,
-        input_grad_data);
+    KernelMaxPool3DGrad<T>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 output_data,
+                                                 output_grad_data,
+                                                 input_channels,
+                                                 input_depth,
+                                                 input_height,
+                                                 input_width,
+                                                 output_depth,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_depth,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_depth,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_depth,
+                                                 padding_height,
+                                                 padding_width,
+                                                 input_grad_data);
   }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
@@ -1915,24 +1915,24 @@ class MaxPool2dWithIndexFunctor<phi::GPUContext, T1, T2> {
 
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
-    KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        input_channels,
-        input_height,
-        input_width,
-        output_height,
-        output_width,
-        ksize_height,
-        ksize_width,
-        stride_height,
-        stride_width,
-        padding_height,
-        padding_width,
-        adaptive,
-        output_data,
-        mask_data,
-        pool_divmods);
+    KernelMaxPool2dWithIdx<T1, T2>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 input_channels,
+                                                 input_height,
+                                                 input_width,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_height,
+                                                 padding_width,
+                                                 adaptive,
+                                                 output_data,
+                                                 mask_data,
+                                                 pool_divmods);
   }
 };
 
@@ -1976,24 +1976,24 @@ class MaxPool2dWithIndexGradFunctor<phi::GPUContext, T1, T2> {
 
     auto pool_divmods =
         FastDivModForPooling(input_channels, input_width, input_height);
-    KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        output_grad_data,
-        mask_data,
-        input_channels,
-        input_height,
-        input_width,
-        output_height,
-        output_width,
-        ksize_height,
-        ksize_width,
-        stride_height,
-        stride_width,
-        padding_height,
-        padding_width,
-        adaptive,
-        input_grad_data,
-        pool_divmods);
+    KernelMaxPool2DWithIdxGrad<T1, T2>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 output_grad_data,
+                                                 mask_data,
+                                                 input_channels,
+                                                 input_height,
+                                                 input_width,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_height,
+                                                 padding_width,
+                                                 adaptive,
+                                                 input_grad_data,
+                                                 pool_divmods);
   }
 };
 
@@ -2212,28 +2212,28 @@ class MaxPool3dWithIndexFunctor<phi::GPUContext, T1, T2> {
     dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        input_data,
-        input_channels,
-        input_depth,
-        input_height,
-        input_width,
-        output_depth,
-        output_height,
-        output_width,
-        ksize_depth,
-        ksize_height,
-        ksize_width,
-        stride_depth,
-        stride_height,
-        stride_width,
-        padding_depth,
-        padding_height,
-        padding_width,
-        adaptive,
-        output_data,
-        mask_data);
+    KernelMaxPool3DWithIdx<T1, T2>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 input_data,
+                                                 input_channels,
+                                                 input_depth,
+                                                 input_height,
+                                                 input_width,
+                                                 output_depth,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_depth,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_depth,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_depth,
+                                                 padding_height,
+                                                 padding_width,
+                                                 adaptive,
+                                                 output_data,
+                                                 mask_data);
   }
 };
 
@@ -2281,28 +2281,28 @@ class MaxPool3dWithIndexGradFunctor<phi::GPUContext, T1, T2> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelMaxPool3DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
-        nthreads,
-        output_grad_data,
-        mask_data,
-        input_channels,
-        input_depth,
-        input_height,
-        input_width,
-        output_depth,
-        output_height,
-        output_width,
-        ksize_depth,
-        ksize_height,
-        ksize_width,
-        stride_depth,
-        stride_height,
-        stride_width,
-        padding_depth,
-        padding_height,
-        padding_width,
-        adaptive,
-        input_grad_data);
+    KernelMaxPool3DWithIdxGrad<T1, T2>
+        <<<grid, threads, 0, context.stream()>>>(nthreads,
+                                                 output_grad_data,
+                                                 mask_data,
+                                                 input_channels,
+                                                 input_depth,
+                                                 input_height,
+                                                 input_width,
+                                                 output_depth,
+                                                 output_height,
+                                                 output_width,
+                                                 ksize_depth,
+                                                 ksize_height,
+                                                 ksize_width,
+                                                 stride_depth,
+                                                 stride_height,
+                                                 stride_width,
+                                                 padding_depth,
+                                                 padding_height,
+                                                 padding_width,
+                                                 adaptive,
+                                                 input_grad_data);
   }
 };
 
diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h
index fa285dc69d1ca..0eebfc856851e 100644
--- a/paddle/phi/kernels/funcs/pooling.h
+++ b/paddle/phi/kernels/funcs/pooling.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/platform/macros.h"  // import FLT_MAX
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
index df14b0a21f24d..5c74751b348c0 100644
--- a/paddle/phi/kernels/funcs/reduce_function.h
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -895,24 +895,20 @@ static void LaunchReduceKernel(const Tx* x_data,
     auto grid_num = config.grid;
     auto block_num = config.block;
 #endif
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    OneDimIndexCal><<<grid_num, block_num, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim,
-        is_mean && (!config.should_reduce_again));
+    ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp, OneDimIndexCal>
+        <<<grid_num, block_num, 0, stream>>>(
+            x_data,
+            config.output_data,
+            reducer,
+            transform,
+            init,
+            config.reduce_num,
+            config.left_num,
+            config.reduce_last_dim,
+            reduce_index_calculator,
+            left_index_calculator,
+            dim,
+            is_mean && (!config.should_reduce_again));
 
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -939,24 +935,20 @@ static void LaunchReduceKernel(const Tx* x_data,
     auto grid_num = config.grid;
     auto block_num = config.block;
 #endif
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    IndexCalculator><<<grid_num, block_num, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim,
-        is_mean && (!config.should_reduce_again));
+    ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp, IndexCalculator>
+        <<<grid_num, block_num, 0, stream>>>(
+            x_data,
+            config.output_data,
+            reducer,
+            transform,
+            init,
+            config.reduce_num,
+            config.left_num,
+            config.reduce_last_dim,
+            reduce_index_calculator,
+            left_index_calculator,
+            dim,
+            is_mean && (!config.should_reduce_again));
   }
 
   if (config.should_reduce_again) {
@@ -982,30 +974,31 @@ static void LaunchReduceKernel(const Tx* x_data,
     auto grid_size = grid;
     auto block_size = block;
 #endif
-    ReduceHigherDimKernel<
-        Ty,
-        Ty,
-        MPType,
-        ReduceOp,
-        kps::IdentityFunctor<Ty, MPType>><<<grid_size, block_size, 0, stream>>>(
-        config.output_data,
-        y_data,
-        reducer,
-        kps::IdentityFunctor<Ty, MPType>(),
-        init,
-        config.grid.y,
-        config.left_num,
-        config.grid.y,
-        dim,
-        config.reduce_num,
-        is_mean);
+    ReduceHigherDimKernel<Ty,
+                          Ty,
+                          MPType,
+                          ReduceOp,
+                          kps::IdentityFunctor<Ty, MPType>>
+        <<<grid_size, block_size, 0, stream>>>(
+            config.output_data,
+            y_data,
+            reducer,
+            kps::IdentityFunctor<Ty, MPType>(),
+            init,
+            config.grid.y,
+            config.left_num,
+            config.grid.y,
+            dim,
+            config.reduce_num,
+            is_mean);
   }
 }
 
 #if !defined(PADDLE_WITH_XPU_KP)
 template <typename Tx,
           typename Ty,
-          template <typename> class ReduceOp,
+          template <typename>
+          class ReduceOp,
           typename TransformOp>
 static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
                                void>::type
@@ -1044,7 +1037,8 @@ CubTensorReduceImpl(const Tx* x_data,
 
 template <typename Tx,
           typename Ty,
-          template <typename> class ReduceOp,
+          template <typename>
+          class ReduceOp,
           typename TransformOp>
 static typename std::enable_if<std::is_same<Tx, phi::dtype::float16>::value,
                                void>::type
@@ -1061,7 +1055,8 @@ CubTensorReduceImpl(const Tx* x_data,
 
 template <typename Tx,
           typename Ty,
-          template <typename> class ReduceOp,
+          template <typename>
+          class ReduceOp,
           typename TransformOp>
 void ReduceKernel(const KPDevice& dev_ctx,
                   const phi::DenseTensor& x,
@@ -1146,22 +1141,19 @@ void ReduceKernel(const KPDevice& dev_ctx,
     auto grid_num = config.grid;
     auto block_num = config.block;
 #endif
-    ReduceHigherDimKernel<Tx,
-                          Ty,
-                          MPType,
-                          ReduceOp<MPType>,
-                          TransformOp><<<grid_num, block_num, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        reducer.initial(),
-        config.reduce_num,
-        config.left_num,
-        config.blocking_size,
-        dim,
-        config.reduce_num,
-        is_mean && (!config.should_reduce_again));
+    ReduceHigherDimKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>
+        <<<grid_num, block_num, 0, stream>>>(
+            x_data,
+            config.output_data,
+            reducer,
+            transform,
+            reducer.initial(),
+            config.reduce_num,
+            config.left_num,
+            config.blocking_size,
+            dim,
+            config.reduce_num,
+            is_mean && (!config.should_reduce_again));
 
     if (config.should_reduce_again) {
       dim3 block = dim3(config.block.x, 1, 1);
@@ -1177,24 +1169,23 @@ void ReduceKernel(const KPDevice& dev_ctx,
       auto grid_size = grid;
       auto block_size = block;
 #endif
-      ReduceHigherDimKernel<
-          Ty,
-          Ty,
-          MPType,
-          ReduceOp<MPType>,
-          kps::IdentityFunctor<Ty,
-                               MPType>><<<grid_size, block_size, 0, stream>>>(
-          config.output_data,
-          y_data,
-          reducer,
-          kps::IdentityFunctor<Ty, MPType>(config.grid.y),
-          reducer.initial(),
-          config.grid.y,
-          config.left_num,
-          config.grid.y,
-          dim2,
-          config.reduce_num,
-          is_mean);
+      ReduceHigherDimKernel<Ty,
+                            Ty,
+                            MPType,
+                            ReduceOp<MPType>,
+                            kps::IdentityFunctor<Ty, MPType>>
+          <<<grid_size, block_size, 0, stream>>>(
+              config.output_data,
+              y_data,
+              reducer,
+              kps::IdentityFunctor<Ty, MPType>(config.grid.y),
+              reducer.initial(),
+              config.grid.y,
+              config.left_num,
+              config.grid.y,
+              dim2,
+              config.reduce_num,
+              is_mean);
     }
     return;
   }
diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h
index 87083af3bc6a2..e10ae3951ae2f 100644
--- a/paddle/phi/kernels/funcs/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/scatter.cu.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/common/place.h"
@@ -232,14 +233,14 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx,
   dim3 grid = dim3((n + block - 1) / block);
   paddle::platform::LimitGridDim(ctx, &grid);
 
-  ScatterNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
-      p_update,
-      p_index,
-      p_output,
-      g_output_dims,
-      remain_numel,
-      slice_size,
-      end_size);
+  ScatterNdCUDAKernel<T, IndexT>
+      <<<grid, block, 0, ctx.stream()>>>(p_update,
+                                         p_index,
+                                         p_output,
+                                         g_output_dims,
+                                         remain_numel,
+                                         slice_size,
+                                         end_size);
 }
 
 }  // namespace funcs
diff --git a/paddle/phi/kernels/funcs/scatter.h b/paddle/phi/kernels/funcs/scatter.h
index 5d15c955a7f21..0b381e5710651 100644
--- a/paddle/phi/kernels/funcs/scatter.h
+++ b/paddle/phi/kernels/funcs/scatter.h
@@ -27,7 +27,7 @@ namespace phi {
 namespace funcs {
 
 /**
-  * Return the updated array pointer, use blas or eigen lib to optimize time
+ * Return the updated array pointer, use blas or eigen lib to optimize time
  * cost
  */
 template <typename T, typename IndexT = int>
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu
index 687cccb1f64f9..1012ca413ed60 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cu
+++ b/paddle/phi/kernels/funcs/segment_pooling.cu
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/funcs/segment_pooling.h"
-
 #include <algorithm>
 
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
@@ -21,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/segment_pooling.h"
 
 namespace phi {
 namespace funcs {
@@ -281,18 +280,16 @@ void SegmentPoolCUDAGradFunctor(const phi::GPUContext& ctx,
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
   if (pooltype == "MAX" || pooltype == "MIN") {
-    SegmentIndexGradKernel<T,
-                           Index,
-                           ArrangeHelper<Index>><<<config.block_per_grid.x,
-                                                   config.thread_per_block.x,
-                                                   0,
-                                                   ctx.stream()>>>(
-        segment_ids.data<Index>(),
-        input.data<T>(),
-        output.data<T>(),
-        out_grad.data<T>(),
-        in_grad->data<T>(),
-        h);
+    SegmentIndexGradKernel<T, Index, ArrangeHelper<Index>>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           ctx.stream()>>>(segment_ids.data<Index>(),
+                           input.data<T>(),
+                           output.data<T>(),
+                           out_grad.data<T>(),
+                           in_grad->data<T>(),
+                           h);
   } else {
     PADDLE_THROW(phi::errors::InvalidArgument(
         "Unsupported segment pooling grad operation, Only MAX, MIN "
@@ -333,14 +330,14 @@ class SegmentPoolFunctor<phi::GPUContext, T, IndexT> {
           (input_length_size + DimTileSize - 1) / DimTileSize;
       auto config =
           phi::backends::gpu::GetGpuLaunchConfig1D(ctx, total_stripe_count);
-      SegmentSumIdsKernel<T, IndexT, IndexT(8)><<<config.block_per_grid.x,
-                                                  config.thread_per_block.x,
-                                                  0,
-                                                  ctx.stream()>>>(
-          segment_ids.data<IndexT>(),
-          summed_ids->data<T>(),
-          input_length_size,
-          total_stripe_count);
+      SegmentSumIdsKernel<T, IndexT, IndexT(8)>
+          <<<config.block_per_grid.x,
+             config.thread_per_block.x,
+             0,
+             ctx.stream()>>>(segment_ids.data<IndexT>(),
+                             summed_ids->data<T>(),
+                             input_length_size,
+                             total_stripe_count);
     }
 
     auto h = ArrangeHelper<IndexT>(
@@ -348,57 +345,51 @@ class SegmentPoolFunctor<phi::GPUContext, T, IndexT> {
     auto config =
         phi::backends::gpu::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
     if (pooltype == "MEAN") {
-      SegmentMeanKernel<T, IndexT, IndexT(8)><<<config.block_per_grid.x,
-                                                config.thread_per_block.x,
-                                                0,
-                                                ctx.stream()>>>(
-          segment_ids.data<IndexT>(),
-          input.data<T>(),
-          output->data<T>(),
-          summed_ids->data<T>(),
-          h.input_length_size,
-          h.inner_dim_size,
-          h.output_length_size,
-          h.total_stripe_count);
+      SegmentMeanKernel<T, IndexT, IndexT(8)>
+          <<<config.block_per_grid.x,
+             config.thread_per_block.x,
+             0,
+             ctx.stream()>>>(segment_ids.data<IndexT>(),
+                             input.data<T>(),
+                             output->data<T>(),
+                             summed_ids->data<T>(),
+                             h.input_length_size,
+                             h.inner_dim_size,
+                             h.output_length_size,
+                             h.total_stripe_count);
     } else if (pooltype == "SUM") {
       SumPool<T> pool;
-      SegmentOpsKernel<T,
-                       IndexT,
-                       ArrangeHelper<IndexT>,
-                       SumPool<T>><<<config.block_per_grid.x,
-                                     config.thread_per_block.x,
-                                     0,
-                                     ctx.stream()>>>(segment_ids.data<IndexT>(),
-                                                     input.data<T>(),
-                                                     output->data<T>(),
-                                                     h,
-                                                     pool);
+      SegmentOpsKernel<T, IndexT, ArrangeHelper<IndexT>, SumPool<T>>
+          <<<config.block_per_grid.x,
+             config.thread_per_block.x,
+             0,
+             ctx.stream()>>>(segment_ids.data<IndexT>(),
+                             input.data<T>(),
+                             output->data<T>(),
+                             h,
+                             pool);
     } else if (pooltype == "MAX") {
       MaxPool<T> pool;
-      SegmentOpsKernel<T,
-                       IndexT,
-                       ArrangeHelper<IndexT>,
-                       MaxPool<T>><<<config.block_per_grid.x,
-                                     config.thread_per_block.x,
-                                     0,
-                                     ctx.stream()>>>(segment_ids.data<IndexT>(),
-                                                     input.data<T>(),
-                                                     output->data<T>(),
-                                                     h,
-                                                     pool);
+      SegmentOpsKernel<T, IndexT, ArrangeHelper<IndexT>, MaxPool<T>>
+          <<<config.block_per_grid.x,
+             config.thread_per_block.x,
+             0,
+             ctx.stream()>>>(segment_ids.data<IndexT>(),
+                             input.data<T>(),
+                             output->data<T>(),
+                             h,
+                             pool);
     } else if (pooltype == "MIN") {
       MinPool<T> pool;
-      SegmentOpsKernel<T,
-                       IndexT,
-                       ArrangeHelper<IndexT>,
-                       MinPool<T>><<<config.block_per_grid.x,
-                                     config.thread_per_block.x,
-                                     0,
-                                     ctx.stream()>>>(segment_ids.data<IndexT>(),
-                                                     input.data<T>(),
-                                                     output->data<T>(),
-                                                     h,
-                                                     pool);
+      SegmentOpsKernel<T, IndexT, ArrangeHelper<IndexT>, MinPool<T>>
+          <<<config.block_per_grid.x,
+             config.thread_per_block.x,
+             0,
+             ctx.stream()>>>(segment_ids.data<IndexT>(),
+                             input.data<T>(),
+                             output->data<T>(),
+                             h,
+                             pool);
     } else {
       PADDLE_THROW(phi::errors::InvalidArgument(
           "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
diff --git a/paddle/phi/kernels/funcs/segment_pooling.h b/paddle/phi/kernels/funcs/segment_pooling.h
index 09da9eb304773..5432330abc708 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.h
+++ b/paddle/phi/kernels/funcs/segment_pooling.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h
index 193b9f614c9d5..a036f27cc2b80 100644
--- a/paddle/phi/kernels/funcs/select_impl.cu.h
+++ b/paddle/phi/kernels/funcs/select_impl.cu.h
@@ -25,6 +25,7 @@ namespace cub = hipcub;
 #endif
 
 #include <algorithm>
+
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
@@ -39,12 +40,12 @@ namespace funcs {
 using Mode = kps::details::ReduceMode;
 
 /*
-* Count how many of the data being processed by the current block are true
-* 1. Load data from global memory and cast from bool to int64_t
-* 2. Get result of this thread according to thread reduce
-* 3. Get result of this block according to block reduce
-* 4. first block store 0 and current result
-*/
+ * Count how many of the data being processed by the current block are true
+ * 1. Load data from global memory and cast from bool to int64_t
+ * 2. Get result of this thread according to thread reduce
+ * 3. Get result of this block according to block reduce
+ * 4. first block store 0 and current result
+ */
 template <typename T>
 struct NonZeroFunctor {
   HOSTDEVICE NonZeroFunctor() {}
@@ -110,10 +111,10 @@ __global__ void GetBlockCountKernel(const InT *in,
 }
 
 /*
-* Get block num prefix us one block, VecSize must be 2
-* 1. Each thread load 2 data : threadIdx.x and threadIdx.x + blockDimx.x
-* 2. Cumsum limitation is blockDim.x must be less than 512
-*/
+ * Get block num prefix using one block, VecSize must be 2
+ * 1. Each thread loads 2 data : threadIdx.x and threadIdx.x + blockDim.x
+ * 2. Cumsum limitation is blockDim.x must be less than 512
+ */
 
 template <typename InT,
           typename OutT,
@@ -248,8 +249,8 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
 };
 
 /**
-* Get mask's index if mask == true
-*/
+ * Get mask's index if mask == true
+ */
 template <typename InT,
           typename MT,
           typename OutT,
@@ -257,14 +258,13 @@ template <typename InT,
           int VecSize,
           int MaskData,
           int IsBoundary>  // SelectType = 1 Mask_select else where_index
-__device__ void
-SelectKernelImpl(OutT *out,
-                 const MT *mask,
-                 const InT *in,
-                 Functor func,
-                 int num,
-                 int data_offset,
-                 int store_rank) {
+__device__ void SelectKernelImpl(OutT *out,
+                                 const MT *mask,
+                                 const InT *in,
+                                 Functor func,
+                                 int num,
+                                 int data_offset,
+                                 int store_rank) {
   const int kCVecSize = 2;
   // each thread cumsum 2 data
   using IdT = int64_t;
@@ -418,8 +418,8 @@ void SelectKernel(const KPDevice &dev_ctx,
   DenseTensor count_mem = phi::Empty<CT, KPDevice>(dev_ctx, dims_array);
   CT *count_data = count_mem.data<CT>();
   // 1.3 launch CountKernl
-  GetBlockCountKernel<MT, CT, kVecSize><<<grid, block, 0, stream>>>(
-      cond_data, count_data, numel, main_offset);
+  GetBlockCountKernel<MT, CT, kVecSize>
+      <<<grid, block, 0, stream>>>(cond_data, count_data, numel, main_offset);
   // 2.1 alloc cumsum data for CoutBlock prefix
   DenseTensor cumsum_mem = phi::Empty<CT, KPDevice>(dev_ctx, dims_array);
   CT *cumsum_data = cumsum_mem.data<CT>();
@@ -454,20 +454,15 @@ void SelectKernel(const KPDevice &dev_ctx,
   auto out_data = out->mutable_data<OutT>(cuda_place);
   // 3.2 get true data's index according to cond_data and cumsum_data
   if (total_true_num <= 0) return;
-  SelectKernel<MT,
-               InT,
-               CT,
-               OutT,
-               Functor,
-               kVecSize,
-               SelectData><<<grid, block, 0, stream>>>(out_data,
-                                                       cond_data,
-                                                       in_data_ptr,
-                                                       cumsum_data,
-                                                       func,
-                                                       numel,
-                                                       main_offset,
-                                                       rank);
+  SelectKernel<MT, InT, CT, OutT, Functor, kVecSize, SelectData>
+      <<<grid, block, 0, stream>>>(out_data,
+                                   cond_data,
+                                   in_data_ptr,
+                                   cumsum_data,
+                                   func,
+                                   numel,
+                                   main_offset,
+                                   rank);
 }
 
 }  // namespace funcs
diff --git a/paddle/phi/kernels/funcs/sequence2batch.h b/paddle/phi/kernels/funcs/sequence2batch.h
index e7c387fb99b9c..ed3a50d883dc0 100644
--- a/paddle/phi/kernels/funcs/sequence2batch.h
+++ b/paddle/phi/kernels/funcs/sequence2batch.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/phi/kernels/funcs/slice_utils.h b/paddle/phi/kernels/funcs/slice_utils.h
index 0c956248fd9ef..e26a6543789d7 100644
--- a/paddle/phi/kernels/funcs/slice_utils.h
+++ b/paddle/phi/kernels/funcs/slice_utils.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <paddle/phi/core/ddim.h>
+
 #include <string>
 #include <vector>
 
diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.h
index ca212e4366ec4..9a031b8cc12ca 100644
--- a/paddle/phi/kernels/funcs/sparse/flatten_indices.h
+++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+
 #include "paddle/phi/core/ddim.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h
index 9ed7cef12a148..b9568f1df716d 100644
--- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h
@@ -27,7 +27,7 @@ namespace sparse {
  * rulebook_len: the length of rulebook
  * channels: the output channel size
  * out: the outputs
-**/
+ **/
 template <typename T>
 __global__ void ScatterKernel(const T* input,
                               const int* unique_value,
diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu
index 5c424316a83df..1364a1cd3fa2c 100644
--- a/paddle/phi/kernels/gpu/abs_kernel.cu
+++ b/paddle/phi/kernels/gpu/abs_kernel.cu
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu
index 5eecfce093248..d5b50800f53fa 100644
--- a/paddle/phi/kernels/gpu/accuracy_kernel.cu
+++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/accuracy_kernel.h"
-
 #include <thrust/execution_policy.h>
 #include <thrust/reduce.h>
+
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/accuracy_kernel.h"
 
 namespace phi {
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
@@ -94,15 +94,14 @@ void AccuracyRawKernel(const Context& dev_ctx,
     return;
   }
 
-  AccuracyCudaKernel<
-      PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-      num_samples,
-      infer_width,
-      indices_data,
-      label_data,
-      correct_data,
-      accuracy_data,
-      total_data);
+  AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS>
+      <<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(num_samples,
+                                                  infer_width,
+                                                  indices_data,
+                                                  label_data,
+                                                  correct_data,
+                                                  accuracy_data,
+                                                  total_data);
 }
 }  // namespace phi
 
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index 1479fd494435d..8d7b3e8dde681 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/activation_grad_kernel.h"
-
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/activation_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/impl/activation_grad_impl.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-
 namespace phi {
 
 template <typename T, typename Context, typename Functor>
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index 8db31c5ed5b79..05ec5dfb840c0 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/activation_kernel.h"
-
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/activation_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/impl/activation_grad_impl.h"
 #include "paddle/phi/kernels/impl/activation_impl.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-
 namespace phi {
 
 template <typename T, typename Context, typename Functor>
diff --git a/paddle/phi/kernels/gpu/adadelta_kernel.cu b/paddle/phi/kernels/gpu/adadelta_kernel.cu
index 7516a277a746f..15c6ad18a9680 100644
--- a/paddle/phi/kernels/gpu/adadelta_kernel.cu
+++ b/paddle/phi/kernels/gpu/adadelta_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/adadelta_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/adadelta_kernel.h"
 #include "paddle/phi/kernels/impl/adadelta_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu
index 0e037eb808ceb..381dde09be7d4 100644
--- a/paddle/phi/kernels/gpu/adagrad_kernel.cu
+++ b/paddle/phi/kernels/gpu/adagrad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/adagrad_kernel.h"
-
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/adagrad_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/impl/adagrad_kernel_impl.h"
 
@@ -106,19 +105,18 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
     dim3 threads(block_size, 1);
     dim3 grid2(1, merge_rows.size());
     paddle::framework::MixVector<int64_t> mixv_merge_rows(&merge_rows);
-    SparseAdagradFunctorKernel<
-        T,
-        256><<<grid2,
-               threads,
-               0,
-               reinterpret_cast<const phi::GPUContext&>(context).stream()>>>(
-        grad_merge_data,
-        mixv_merge_rows.CUDAMutableData(context.GetPlace()),
-        lr,
-        param_data,
-        moment_data,
-        grad_width,
-        epsilon);
+    SparseAdagradFunctorKernel<T, 256>
+        <<<grid2,
+           threads,
+           0,
+           reinterpret_cast<const phi::GPUContext&>(context).stream()>>>(
+            grad_merge_data,
+            mixv_merge_rows.CUDAMutableData(context.GetPlace()),
+            lr,
+            param_data,
+            moment_data,
+            grad_width,
+            epsilon);
     mixv_merge_rows.CopyToCPU();
   }
 };
diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu
index 449aaae1a4be4..edeeb64f5db5f 100644
--- a/paddle/phi/kernels/gpu/adam_kernel.cu
+++ b/paddle/phi/kernels/gpu/adam_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/adam_kernel.h"
-
 #include <math.h>  // for sqrt in CPU and CUDA
+
 #include <vector>
 
 #include "paddle/fluid/framework/tensor_util.h"
@@ -23,6 +22,7 @@
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/adam_kernel.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/adam_functors.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu
index 0817c531318c3..d9171c80fa9d8 100644
--- a/paddle/phi/kernels/gpu/adamax_kernel.cu
+++ b/paddle/phi/kernels/gpu/adamax_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/adamax_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/adamax_kernel.h"
 #include "paddle/phi/kernels/impl/adamax_kernel_impl.h"
 
 PD_REGISTER_KERNEL(adamax, GPU, ALL_LAYOUT, phi::AdamaxKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu
index 0fff142567a5e..7c00cd13d6b26 100644
--- a/paddle/phi/kernels/gpu/adamw_kernel.cu
+++ b/paddle/phi/kernels/gpu/adamw_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/adamw_kernel.h"
-
 #include <math.h>  // for sqrt in CPU and CUDA
+
 #include <vector>
 
 #include "paddle/fluid/framework/tensor_util.h"
@@ -23,6 +22,7 @@
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/adamw_kernel.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/adam_functors.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu
index 87636631a9b95..d109ec49cfdb9 100644
--- a/paddle/phi/kernels/gpu/add_n_kernel.cu
+++ b/paddle/phi/kernels/gpu/add_n_kernel.cu
@@ -12,15 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/add_n_kernel.h"
-
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/add_n_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/memory/memcpy.h"
-
 namespace phi {
 
 #define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
diff --git a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu
index 65978da1374e4..be71f619aa17e 100644
--- a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/addmm_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/addmm_grad_kernel.h"
 #include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
diff --git a/paddle/phi/kernels/gpu/addmm_kernel.cu b/paddle/phi/kernels/gpu/addmm_kernel.cu
index 7b589ce20acca..e30d664068e17 100644
--- a/paddle/phi/kernels/gpu/addmm_kernel.cu
+++ b/paddle/phi/kernels/gpu/addmm_kernel.cu
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/addmm_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/addmm_kernel.h"
 #include "paddle/phi/kernels/impl/addmm_kernel_impl.h"
 
 PD_REGISTER_KERNEL(addmm, GPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu
index 8abc6b272c511..cfa809f60c2bc 100644
--- a/paddle/phi/kernels/gpu/allclose_kernel.cu
+++ b/paddle/phi/kernels/gpu/allclose_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/allclose_kernel.h"
-
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/allclose_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
index 9ea0d7c5393c3..99456147e5fed 100644
--- a/paddle/phi/kernels/gpu/arange_kernel.cu
+++ b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/arange_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/arange_kernel.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/range_function.h"
 
diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
index 385ddb5e521a2..101e01df52108 100644
--- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
+++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/arg_min_max_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/arg_min_max_kernel.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 
@@ -27,6 +26,7 @@
 namespace cub = hipcub;
 #endif
 #include <limits>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/phi/core/ddim.h"
 
@@ -121,33 +121,27 @@ void ComputeFullArg(const phi::GPUContext& dev_ctx,
 
   if (typeid(Reducer) == typeid(cub::ArgMax)) {
     switch (ComputeBlockSize(width)) {
-      FIXED_BLOCK_DIM_CASE(
-          ArgCUDAKernel<T,
-                        IndType,
-                        Reducer,
-                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
-              height,
-              width,
-              post,
-              Reducer(),
-              std::numeric_limits<T>::lowest(),
-              in_data,
-              out_data));
+      FIXED_BLOCK_DIM_CASE(ArgCUDAKernel<T, IndType, Reducer, kBlockDim>
+                           <<<grid_size, kBlockDim, 0, cu_stream>>>(
+                               height,
+                               width,
+                               post,
+                               Reducer(),
+                               std::numeric_limits<T>::lowest(),
+                               in_data,
+                               out_data));
     }
   } else {
     switch (ComputeBlockSize(width)) {
-      FIXED_BLOCK_DIM_CASE(
-          ArgCUDAKernel<T,
-                        IndType,
-                        Reducer,
-                        kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
-              height,
-              width,
-              post,
-              Reducer(),
-              std::numeric_limits<T>::max(),
-              in_data,
-              out_data));
+      FIXED_BLOCK_DIM_CASE(ArgCUDAKernel<T, IndType, Reducer, kBlockDim>
+                           <<<grid_size, kBlockDim, 0, cu_stream>>>(
+                               height,
+                               width,
+                               post,
+                               Reducer(),
+                               std::numeric_limits<T>::max(),
+                               in_data,
+                               out_data));
     }
   }
 }
diff --git a/paddle/phi/kernels/gpu/argsort_grad_kernel.cu b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu
index 15bca474f58c3..a2d149cb2e438 100644
--- a/paddle/phi/kernels/gpu/argsort_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/argsort_kernel.h"
-
 #include <thrust/copy.h>
 #include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
+
+#include "paddle/phi/kernels/argsort_kernel.h"
 #ifdef __NVCC__
 #include "cub/cub.cuh"
 #endif
diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu
index 6a9c1e275998b..6e8d47d9a57f8 100644
--- a/paddle/phi/kernels/gpu/argsort_kernel.cu
+++ b/paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/argsort_kernel.h"
-
 #include <thrust/copy.h>
 #include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
+
+#include "paddle/phi/kernels/argsort_kernel.h"
 #ifdef __NVCC__
 #include "cub/cub.cuh"
 #endif
diff --git a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu
index 6652d242de5ce..7e68610af1d54 100644
--- a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(atan2_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/atan2_kernel.cu b/paddle/phi/kernels/gpu/atan2_kernel.cu
index dd0bba177defe..887c11c7e7ff1 100644
--- a/paddle/phi/kernels/gpu/atan2_kernel.cu
+++ b/paddle/phi/kernels/gpu/atan2_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/atan2_kernel_impl.h"
 
 PD_REGISTER_KERNEL(atan2,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu
index 5a1bb9874fe19..ef55fb9a5e345 100644
--- a/paddle/phi/kernels/gpu/auc_kernel.cu
+++ b/paddle/phi/kernels/gpu/auc_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/auc_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/auc_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
index c08fa4eb260d4..6de239182c15b 100644
--- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -12,21 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/layout_utils.h"
+#include "paddle/fluid/operators/norm_utils.cu.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/flags.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
-
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
-#include "paddle/fluid/operators/norm_utils.cu.h"
 #include "paddle/phi/kernels/funcs/norm_utils.h"
-
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/operators/layout_utils.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#include "paddle/fluid/platform/flags.h"
 #include "paddle/phi/kernels/gpu/batch_norm_utils.h"
 
 #ifdef __HIPCC__
@@ -595,8 +591,8 @@ void BatchNormGradRawKernel(const Context &ctx,
               /*dBnScaleBiasDesc=*/bn_param_desc_,
               /*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
               /*bnBiasData=*/nullptr,
-              /*dBnScaleData=*/ctx.template Alloc<BatchNormParamType<T>>(
-                  d_scale),
+              /*dBnScaleData=*/
+              ctx.template Alloc<BatchNormParamType<T>>(d_scale),
               /*dBnBiasData=*/ctx.template Alloc<BatchNormParamType<T>>(d_bias),
               /*epsilon=*/epsilon,
               /*savedMean=*/saved_mean_data,
@@ -604,44 +600,42 @@ void BatchNormGradRawKernel(const Context &ctx,
               /*activationDesc=*/nullptr,
               /*workspace=*/workspace_ptr,
               /*workSpaceSizeInBytes=*/workspace_size,
-              /*reserveSpace=*/const_cast<uint8_t *>(
-                  reserve_space->template data<uint8_t>()),
+              /*reserveSpace=*/
+              const_cast<uint8_t *>(reserve_space->template data<uint8_t>()),
               /*reserveSpaceSizeInBytes=*/reserve_space_size));
 #endif  // CUDNN_VERSION_MIN(7, 4, 1)
       if (!called) {
 #ifdef PADDLE_WITH_HIP
         if (compute_format == DataLayout::kNCHW) {
-          BNBackward<T,
-                     block,
-                     DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
-              transformed_d_y.template data<T>(),
-              transformed_x.template data<T>(),
-              scale.template data<BatchNormParamType<T>>(),
-              saved_mean_data,
-              saved_var_data,
-              C,
-              N,
-              H * W * D,
-              epsilon,
-              transformed_d_x.template data<T>(),
-              ctx.template Alloc<BatchNormParamType<T>>(d_scale),
-              ctx.template Alloc<BatchNormParamType<T>>(d_bias));
+          BNBackward<T, block, DataLayout::kNCHW>
+              <<<grid2, block, 0, ctx.stream()>>>(
+                  transformed_d_y.template data<T>(),
+                  transformed_x.template data<T>(),
+                  scale.template data<BatchNormParamType<T>>(),
+                  saved_mean_data,
+                  saved_var_data,
+                  C,
+                  N,
+                  H * W * D,
+                  epsilon,
+                  transformed_d_x.template data<T>(),
+                  ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+                  ctx.template Alloc<BatchNormParamType<T>>(d_bias));
         } else {
-          BNBackward<T,
-                     block,
-                     DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
-              transformed_d_y.template data<T>(),
-              transformed_x.template data<T>(),
-              scale.template data<BatchNormParamType<T>>(),
-              saved_mean_data,
-              saved_var_data,
-              C,
-              N,
-              H * W * D,
-              epsilon,
-              transformed_d_x.template data<T>(),
-              ctx.template Alloc<BatchNormParamType<T>>(d_scale),
-              ctx.template Alloc<BatchNormParamType<T>>(d_bias));
+          BNBackward<T, block, DataLayout::kNHWC>
+              <<<grid2, block, 0, ctx.stream()>>>(
+                  transformed_d_y.template data<T>(),
+                  transformed_x.template data<T>(),
+                  scale.template data<BatchNormParamType<T>>(),
+                  saved_mean_data,
+                  saved_var_data,
+                  C,
+                  N,
+                  H * W * D,
+                  epsilon,
+                  transformed_d_x.template data<T>(),
+                  ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+                  ctx.template Alloc<BatchNormParamType<T>>(d_bias));
         }
 
 // TODO(wangran16): wait for MIOpen to improve the performance of BN
@@ -693,67 +687,59 @@ void BatchNormGradRawKernel(const Context &ctx,
       // This branch call CUDA kernels
       if (compute_format == DataLayout::kNCHW) {
         if (d_x) {
-          BNBackwardData<
-              T,
-              block,
-              phi::DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
-              d_y->data<T>(),
-              scale.data<BatchNormParamType<T>>(),
-              saved_mean_data,
-              x.data<T>(),
-              saved_var_data,
-              C,
-              N,
-              H * W * D,
-              d_x->data<T>());
+          BNBackwardData<T, block, phi::DataLayout::kNCHW>
+              <<<grid2, block, 0, ctx.stream()>>>(
+                  d_y->data<T>(),
+                  scale.data<BatchNormParamType<T>>(),
+                  saved_mean_data,
+                  x.data<T>(),
+                  saved_var_data,
+                  C,
+                  N,
+                  H * W * D,
+                  d_x->data<T>());
         }
         if (d_scale && d_bias) {
-          KeBNBackwardScaleBias<
-              T,
-              block,
-              phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
-              d_y->data<T>(),
-              x.data<T>(),
-              saved_mean_data,
-              saved_var_data,
-              epsilon,
-              N,
-              C,
-              H * W * D,
-              d_scale->data<BatchNormParamType<T>>(),
-              d_bias->data<BatchNormParamType<T>>());
+          KeBNBackwardScaleBias<T, block, phi::DataLayout::kNCHW>
+              <<<grid2, block, 0, stream>>>(
+                  d_y->data<T>(),
+                  x.data<T>(),
+                  saved_mean_data,
+                  saved_var_data,
+                  epsilon,
+                  N,
+                  C,
+                  H * W * D,
+                  d_scale->data<BatchNormParamType<T>>(),
+                  d_bias->data<BatchNormParamType<T>>());
         }
       } else {
         if (d_x) {
-          BNBackwardData<
-              T,
-              block,
-              phi::DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
-              d_y->data<T>(),
-              scale.data<BatchNormParamType<T>>(),
-              saved_mean_data,
-              x.data<T>(),
-              saved_var_data,
-              C,
-              N,
-              H * W * D,
-              d_x->data<T>());
+          BNBackwardData<T, block, phi::DataLayout::kNHWC>
+              <<<grid2, block, 0, ctx.stream()>>>(
+                  d_y->data<T>(),
+                  scale.data<BatchNormParamType<T>>(),
+                  saved_mean_data,
+                  x.data<T>(),
+                  saved_var_data,
+                  C,
+                  N,
+                  H * W * D,
+                  d_x->data<T>());
         }
         if (d_scale && d_bias) {
-          KeBNBackwardScaleBias<
-              T,
-              block,
-              phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
-              d_y->data<T>(),
-              x.data<T>(),
-              saved_mean_data,
-              saved_var_data,
-              epsilon,
-              N,
-              C,
-              H * W * D,
-              d_scale->data<BatchNormParamType<T>>(),
-              d_bias->data<BatchNormParamType<T>>());
+          KeBNBackwardScaleBias<T, block, phi::DataLayout::kNHWC>
+              <<<grid2, block, 0, stream>>>(
+                  d_y->data<T>(),
+                  x.data<T>(),
+                  saved_mean_data,
+                  saved_var_data,
+                  epsilon,
+                  N,
+                  C,
+                  H * W * D,
+                  d_scale->data<BatchNormParamType<T>>(),
+                  d_bias->data<BatchNormParamType<T>>());
         }
       }
     }
@@ -802,61 +788,55 @@ void BatchNormGradRawKernel(const Context &ctx,
 
     if (compute_format == DataLayout::kNCHW) {
       if (d_x) {
-        KeBNBackwardData<T,
-                         phi::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
-            d_y->data<T>(),
-            scale.data<BatchNormParamType<T>>(),
-            running_var_data,
-            epsilon,
-            C,
-            H * W,
-            num,
-            d_x->data<T>());
+        KeBNBackwardData<T, phi::DataLayout::kNCHW>
+            <<<grid1, block, 0, stream>>>(d_y->data<T>(),
+                                          scale.data<BatchNormParamType<T>>(),
+                                          running_var_data,
+                                          epsilon,
+                                          C,
+                                          H * W,
+                                          num,
+                                          d_x->data<T>());
       }
       if (d_scale && d_bias) {
-        KeBNBackwardScaleBias<
-            T,
-            block,
-            phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
-            d_y->data<T>(),
-            x.data<T>(),
-            running_mean_data,
-            running_var_data,
-            epsilon,
-            N,
-            C,
-            H * W * D,
-            d_scale->data<BatchNormParamType<T>>(),
-            d_bias->data<BatchNormParamType<T>>());
+        KeBNBackwardScaleBias<T, block, phi::DataLayout::kNCHW>
+            <<<grid2, block, 0, stream>>>(
+                d_y->data<T>(),
+                x.data<T>(),
+                running_mean_data,
+                running_var_data,
+                epsilon,
+                N,
+                C,
+                H * W * D,
+                d_scale->data<BatchNormParamType<T>>(),
+                d_bias->data<BatchNormParamType<T>>());
       }
     } else {
       if (d_x) {
-        KeBNBackwardData<T,
-                         phi::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
-            d_y->data<T>(),
-            scale.data<BatchNormParamType<T>>(),
-            running_var_data,
-            epsilon,
-            C,
-            H * W,
-            num,
-            d_x->data<T>());
+        KeBNBackwardData<T, phi::DataLayout::kNHWC>
+            <<<grid1, block, 0, stream>>>(d_y->data<T>(),
+                                          scale.data<BatchNormParamType<T>>(),
+                                          running_var_data,
+                                          epsilon,
+                                          C,
+                                          H * W,
+                                          num,
+                                          d_x->data<T>());
       }
       if (d_scale && d_bias) {
-        KeBNBackwardScaleBias<
-            T,
-            block,
-            phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
-            d_y->data<T>(),
-            x.data<T>(),
-            running_mean_data,
-            running_var_data,
-            epsilon,
-            N,
-            C,
-            H * W * D,
-            d_scale->data<BatchNormParamType<T>>(),
-            d_bias->data<BatchNormParamType<T>>());
+        KeBNBackwardScaleBias<T, block, phi::DataLayout::kNHWC>
+            <<<grid2, block, 0, stream>>>(
+                d_y->data<T>(),
+                x.data<T>(),
+                running_mean_data,
+                running_var_data,
+                epsilon,
+                N,
+                C,
+                H * W * D,
+                d_scale->data<BatchNormParamType<T>>(),
+                d_bias->data<BatchNormParamType<T>>());
       }
     }
   }
diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
index e2aeec723628c..26b28d501869c 100644
--- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -20,20 +20,17 @@
 namespace cub = hipcub;
 #endif
 
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/layout_utils.h"
+#include "paddle/fluid/operators/norm_utils.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/flags.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
-
-#include "paddle/fluid/operators/norm_utils.cu.h"
 #include "paddle/phi/kernels/funcs/norm_utils.h"
-
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/operators/layout_utils.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#include "paddle/fluid/platform/flags.h"
 #include "paddle/phi/kernels/gpu/batch_norm_utils.h"
 
 #ifdef __HIPCC__
@@ -353,33 +350,31 @@ void BatchNormKernel(const Context &ctx,
     const int block_size = 256;
     const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
     if (compute_format == DataLayout::kNCHW) {
-      BNForwardInference<
-          T,
-          DataLayout::kNCHW><<<grid_size, block_size, 0, ctx.stream()>>>(
-          transformed_x.template data<T>(),
-          est_mean->template data<BatchNormParamType<T>>(),
-          est_var->template data<BatchNormParamType<T>>(),
-          scale.template data<BatchNormParamType<T>>(),
-          bias.template data<BatchNormParamType<T>>(),
-          C,
-          N,
-          H * W * D,
-          epsilon,
-          transformed_y.template data<T>());
+      BNForwardInference<T, DataLayout::kNCHW>
+          <<<grid_size, block_size, 0, ctx.stream()>>>(
+              transformed_x.template data<T>(),
+              est_mean->template data<BatchNormParamType<T>>(),
+              est_var->template data<BatchNormParamType<T>>(),
+              scale.template data<BatchNormParamType<T>>(),
+              bias.template data<BatchNormParamType<T>>(),
+              C,
+              N,
+              H * W * D,
+              epsilon,
+              transformed_y.template data<T>());
     } else {
-      BNForwardInference<
-          T,
-          DataLayout::kNHWC><<<grid_size, block_size, 0, ctx.stream()>>>(
-          transformed_x.template data<T>(),
-          est_mean->template data<BatchNormParamType<T>>(),
-          est_var->template data<BatchNormParamType<T>>(),
-          scale.template data<BatchNormParamType<T>>(),
-          bias.template data<BatchNormParamType<T>>(),
-          C,
-          N,
-          H * W * D,
-          epsilon,
-          transformed_y.template data<T>());
+      BNForwardInference<T, DataLayout::kNHWC>
+          <<<grid_size, block_size, 0, ctx.stream()>>>(
+              transformed_x.template data<T>(),
+              est_mean->template data<BatchNormParamType<T>>(),
+              est_var->template data<BatchNormParamType<T>>(),
+              scale.template data<BatchNormParamType<T>>(),
+              bias.template data<BatchNormParamType<T>>(),
+              C,
+              N,
+              H * W * D,
+              epsilon,
+              transformed_y.template data<T>());
     }
 // TODO(wangran16): wait for MIOpen to improve the performance of BN
 // PADDLE_ENFORCE_GPU_SUCCESS(
@@ -539,41 +534,37 @@ void BatchNormKernel(const Context &ctx,
         const int max_blocks = std::max(max_threads / block, 1);
         const int grid = std::min(C, max_blocks);
         if (compute_format == DataLayout::kNCHW) {
-          BNForwardTraining<
-              T,
-              block,
-              DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
-              transformed_x.template data<T>(),
-              scale.template data<BatchNormParamType<T>>(),
-              bias.template data<BatchNormParamType<T>>(),
-              C,
-              N,
-              H * W * D,
-              epsilon,
-              this_factor,
-              transformed_y.template data<T>(),
-              mean_out->template data<BatchNormParamType<T>>(),
-              variance_out->template data<BatchNormParamType<T>>(),
-              saved_mean->template data<BatchNormParamType<T>>(),
-              saved_variance->template data<BatchNormParamType<T>>());
+          BNForwardTraining<T, block, DataLayout::kNCHW>
+              <<<grid, block, 0, ctx.stream()>>>(
+                  transformed_x.template data<T>(),
+                  scale.template data<BatchNormParamType<T>>(),
+                  bias.template data<BatchNormParamType<T>>(),
+                  C,
+                  N,
+                  H * W * D,
+                  epsilon,
+                  this_factor,
+                  transformed_y.template data<T>(),
+                  mean_out->template data<BatchNormParamType<T>>(),
+                  variance_out->template data<BatchNormParamType<T>>(),
+                  saved_mean->template data<BatchNormParamType<T>>(),
+                  saved_variance->template data<BatchNormParamType<T>>());
         } else {
-          BNForwardTraining<
-              T,
-              block,
-              DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
-              transformed_x.template data<T>(),
-              scale.template data<BatchNormParamType<T>>(),
-              bias.template data<BatchNormParamType<T>>(),
-              C,
-              N,
-              H * W * D,
-              epsilon,
-              this_factor,
-              transformed_y.template data<T>(),
-              mean_out->template data<BatchNormParamType<T>>(),
-              variance_out->template data<BatchNormParamType<T>>(),
-              saved_mean->template data<BatchNormParamType<T>>(),
-              saved_variance->template data<BatchNormParamType<T>>());
+          BNForwardTraining<T, block, DataLayout::kNHWC>
+              <<<grid, block, 0, ctx.stream()>>>(
+                  transformed_x.template data<T>(),
+                  scale.template data<BatchNormParamType<T>>(),
+                  bias.template data<BatchNormParamType<T>>(),
+                  C,
+                  N,
+                  H * W * D,
+                  epsilon,
+                  this_factor,
+                  transformed_y.template data<T>(),
+                  mean_out->template data<BatchNormParamType<T>>(),
+                  variance_out->template data<BatchNormParamType<T>>(),
+                  saved_mean->template data<BatchNormParamType<T>>(),
+                  saved_variance->template data<BatchNormParamType<T>>());
         }
 // TODO(wangran16): wait for MIOpen to improve the performance of BN
 // PADDLE_ENFORCE_GPU_SUCCESS(
diff --git a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu
index 94eabac4d1306..b9f1680726de8 100644
--- a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/bce_loss_grad_kernel.h"
-
 #include <algorithm>
 #include <vector>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/bce_loss_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu
index b190bce474280..bc2d278049ce9 100644
--- a/paddle/phi/kernels/gpu/bce_loss_kernel.cu
+++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/bce_loss_kernel.h"
-
 #include <algorithm>
 #include <vector>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/bce_loss_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 
diff --git a/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu b/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu
index f4f69ee83eea1..6186e90d54ce1 100644
--- a/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h"
 #include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h"
 
-#include "paddle/phi/core/kernel_registry.h"
-
 PD_REGISTER_KERNEL(bilinear_tensor_product_grad,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu b/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu
index b81b842cedba2..b23e9ccfcc8ea 100644
--- a/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu
+++ b/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bilinear_tensor_product_kernel.h"
 #include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h"
 
-#include "paddle/phi/core/kernel_registry.h"
-
 PD_REGISTER_KERNEL(bilinear_tensor_product,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu
index 8e60b31c3706b..257c708ee5314 100644
--- a/paddle/phi/kernels/gpu/bincount_kernel.cu
+++ b/paddle/phi/kernels/gpu/bincount_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/bincount_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/bincount_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -108,11 +107,9 @@ void BincountCUDAInner(const Context& dev_ctx,
     int64_t* output_data = dev_ctx.template Alloc<int64_t>(output);
     phi::funcs::SetConstant<Context, int64_t>()(dev_ctx, output, 0L);
 
-    KernelBincount<T, InputT, int64_t><<<GET_BLOCKS(input_numel),
-                                         PADDLE_CUDA_NUM_THREADS,
-                                         0,
-                                         stream>>>(
-        input_data, input_numel, has_weights, weights_data, output_data);
+    KernelBincount<T, InputT, int64_t>
+        <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+            input_data, input_numel, has_weights, weights_data, output_data);
   } else {
     const auto& weights_type =
         paddle::framework::TransToProtoVarType(weights->dtype());
@@ -122,20 +119,16 @@ void BincountCUDAInner(const Context& dev_ctx,
       phi::funcs::SetConstant<Context, float>()(
           dev_ctx, output, static_cast<float>(0));
 
-      KernelBincount<T, InputT, float><<<GET_BLOCKS(input_numel),
-                                         PADDLE_CUDA_NUM_THREADS,
-                                         0,
-                                         stream>>>(
-          input_data, input_numel, has_weights, weights_data, output_data);
+      KernelBincount<T, InputT, float>
+          <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+              input_data, input_numel, has_weights, weights_data, output_data);
     } else {
       double* output_data = dev_ctx.template Alloc<double>(output);
       phi::funcs::SetConstant<Context, double>()(
           dev_ctx, output, static_cast<double>(0));
-      KernelBincount<T, InputT, double><<<GET_BLOCKS(input_numel),
-                                          PADDLE_CUDA_NUM_THREADS,
-                                          0,
-                                          stream>>>(
-          input_data, input_numel, has_weights, weights_data, output_data);
+      KernelBincount<T, InputT, double>
+          <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+              input_data, input_numel, has_weights, weights_data, output_data);
     }
   }
 }
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
index d4850b74477d2..eb70ef9ee76f4 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
-
 #include <vector>
+
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
index aa45bd3c43891..5c87c9fc90783 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
-#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
-
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
+#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h"
 
 PD_REGISTER_KERNEL(broadcast_tensors,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/cast_grad_kernel.cu b/paddle/phi/kernels/gpu/cast_grad_kernel.cu
index f4b610301583c..0029e6e954e86 100644
--- a/paddle/phi/kernels/gpu/cast_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cast_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cast_grad_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
+#include "paddle/phi/kernels/cast_grad_kernel.h"
 #include "paddle/phi/kernels/gpu/cast_impl.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu
index a879dc3bafd74..b2b42482ad357 100644
--- a/paddle/phi/kernels/gpu/cast_kernel.cu
+++ b/paddle/phi/kernels/gpu/cast_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cast_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
+#include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/gpu/cast_impl.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu
index 63d3d4a554f81..5fc67daf44a4f 100644
--- a/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/channel_shuffle_grad_kernel.h"
-#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/channel_shuffle_grad_kernel.h"
+#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(channel_shuffle_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu b/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu
index f85cb4aafd1dc..adecd2f960036 100644
--- a/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu
+++ b/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/channel_shuffle_kernel.h"
-#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/channel_shuffle_kernel.h"
+#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h"
 
 PD_REGISTER_KERNEL(channel_shuffle,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu b/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu
index 9165e8ea4147f..0cff8de529428 100644
--- a/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cholesky_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cholesky_grad_kernel.h"
 #include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu
index 22ea87d83e8db..ec9ac7545d23c 100644
--- a/paddle/phi/kernels/gpu/cholesky_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu
@@ -15,16 +15,17 @@ limitations under the License. */
 #ifndef PADDLE_WITH_HIP
 // HIP not support cusolver
 
-#include "paddle/phi/kernels/cholesky_kernel.h"
-
 #include <thrust/device_vector.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/backends/dynload/cusolver.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cholesky_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/cholesky_solve_grad_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_grad_kernel.cu
index 82b1282cc36dc..9be20c8025226 100644
--- a/paddle/phi/kernels/gpu/cholesky_solve_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_solve_grad_kernel.cu
@@ -15,10 +15,9 @@
 #ifndef PADDLE_WITH_HIP
 // backward reuse forward, HIP not support forward
 
-#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(cholesky_solve_grad,  // cuda_only
                    GPU,
diff --git a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
index f1c91f3824780..f74f4bf3814f3 100644
--- a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
@@ -15,14 +15,13 @@
 #ifndef PADDLE_WITH_HIP
 // HIP not support cusolver
 
-#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h"
-
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/backends/dynload/cusolver.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
+#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/clip_grad_kernel.cu b/paddle/phi/kernels/gpu/clip_grad_kernel.cu
index 4566e8468ec16..bab7dd41aee7c 100644
--- a/paddle/phi/kernels/gpu/clip_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/clip_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/clip_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/clip_grad_kernel.h"
 #include "paddle/phi/kernels/impl/clip_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(clip_grad,
diff --git a/paddle/phi/kernels/gpu/clip_kernel.cu b/paddle/phi/kernels/gpu/clip_kernel.cu
index 9e0050db7fdbf..9295b8b37a01f 100644
--- a/paddle/phi/kernels/gpu/clip_kernel.cu
+++ b/paddle/phi/kernels/gpu/clip_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/clip_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/clip_kernel.h"
 #include "paddle/phi/kernels/impl/clip_kernel_impl.h"
 
 PD_REGISTER_KERNEL(clip,
diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
index ad694445d1874..b1a4c984eed71 100644
--- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/complex_grad_kernel.h"
-#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h"
-
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/complex_grad_kernel.h"
+#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(imag_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu
index e03e079581a9b..ae53aa9510228 100644
--- a/paddle/phi/kernels/gpu/complex_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/impl/complex_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/impl/complex_kernel_impl.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/phi/common/complex.h"
diff --git a/paddle/phi/kernels/gpu/concat_grad_kernel.cu b/paddle/phi/kernels/gpu/concat_grad_kernel.cu
index 2445978daca46..6b980c1a03335 100644
--- a/paddle/phi/kernels/gpu/concat_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_grad_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/concat_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/concat_grad_kernel.h"
 #include "paddle/phi/kernels/impl/concat_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(concat_grad,
diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu
index accb1cc3d77e3..9582110c621cd 100644
--- a/paddle/phi/kernels/gpu/concat_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/concat_kernel.h"
-
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
@@ -22,6 +20,7 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/lod_utils.h"
+#include "paddle/phi/kernels/concat_kernel.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 
diff --git a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu
index 6449a193a082e..f2669ebe04b2a 100644
--- a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
-#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu
index 677ec4a0620af..5fae327c2a93f 100644
--- a/paddle/phi/kernels/gpu/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/conv_grad_kernel.h"
-#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu
index 680ee4426af06..3aa406af4c2ae 100644
--- a/paddle/phi/kernels/gpu/conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/conv_kernel.h"
-#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_kernel.h"
+#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
index e583e13650aeb..ee140a529b7fb 100644
--- a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
@@ -12,15 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
-#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
-
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/gpu/depthwise_conv.h"
+#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
index b7d34a5baf3df..4f8aae09a7385 100644
--- a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
@@ -12,15 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/conv_transpose_kernel.h"
-#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
-
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/gpu/depthwise_conv.h"
+#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu
index 16eff5b26e38a..c917ce7548903 100644
--- a/paddle/phi/kernels/gpu/copy_kernel.cu
+++ b/paddle/phi/kernels/gpu/copy_kernel.cu
@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/copy_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/malloc.h"
diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
index c66daf4fe64e6..94d91cbcbbd28 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
@@ -22,6 +22,10 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
+#include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
@@ -31,11 +35,6 @@ namespace cub = hipcub;
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 
-#include "paddle/fluid/operators/math/cross_entropy.h"
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
 namespace phi {
 
 template <typename T>
@@ -195,19 +194,19 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
       logits_grad_2d.Resize({n, d});
       int grid = (n * remain + block - 1) / block;
       const auto* label_data = label.data<LabelT>();
-      HardLabelCrossEntropyGradientKernel<T,
-                                          LabelT><<<grid, block, 0, stream>>>(
-          logit_grad_data, label_data, n, d, remain, ignore_index);
+      HardLabelCrossEntropyGradientKernel<T, LabelT>
+          <<<grid, block, 0, stream>>>(
+              logit_grad_data, label_data, n, d, remain, ignore_index);
       int num = n * d;
       grid = (num + block - 1) / block;
-      ScaleCrossEntropyGradient<T, LabelT><<<grid, block, 0, stream>>>(
-          logit_grad_data,
-          loss_grad_data,
-          num,
-          d,
-          remain,
-          label_data,
-          ignore_index);
+      ScaleCrossEntropyGradient<T, LabelT>
+          <<<grid, block, 0, stream>>>(logit_grad_data,
+                                       loss_grad_data,
+                                       num,
+                                       d,
+                                       remain,
+                                       label_data,
+                                       ignore_index);
     }
 
     return;
@@ -224,15 +223,15 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
     const T* softmax_data = softmax.data<T>();
     const auto* label_data = label.data<LabelT>();
     int grid = (n * d + block - 1) / block;
-    SoftmaxWithCrossEntropyGradHardLabel<T><<<grid, block, 0, stream>>>(
-        logit_grad_data,
-        loss_grad_data,
-        softmax_data,
-        label_data,
-        n,
-        d / remain,
-        remain,
-        ignore_index);
+    SoftmaxWithCrossEntropyGradHardLabel<T>
+        <<<grid, block, 0, stream>>>(logit_grad_data,
+                                     loss_grad_data,
+                                     softmax_data,
+                                     label_data,
+                                     n,
+                                     d / remain,
+                                     remain,
+                                     ignore_index);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
index 1908c78060483..75a4658ee7dad 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
@@ -22,6 +22,10 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
+#include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
@@ -31,11 +35,6 @@ namespace cub = hipcub;
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 
-#include "paddle/fluid/operators/math/cross_entropy.h"
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-
 namespace phi {
 
 #define ALIGN_BYTES 16
@@ -704,13 +703,11 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss,
   }
 }
 
-#define SOFTMAX_WARP_FORWARD_SOFT_CASE(Log2Elements, VecT, AccT)               \
-  case Log2Elements:                                                           \
-    WarpSoftmaxForwardSoftLabel<T,                                             \
-                                VecT,                                          \
-                                AccT,                                          \
-                                Log2Elements><<<blocks, threads, 0, stream>>>( \
-        loss, softmax, src, label, batch_size, stride, element_count);         \
+#define SOFTMAX_WARP_FORWARD_SOFT_CASE(Log2Elements, VecT, AccT)           \
+  case Log2Elements:                                                       \
+    WarpSoftmaxForwardSoftLabel<T, VecT, AccT, Log2Elements>               \
+        <<<blocks, threads, 0, stream>>>(                                  \
+            loss, softmax, src, label, batch_size, stride, element_count); \
     break;
 
 /*
@@ -1104,23 +1101,17 @@ __global__ void WarpSoftmaxForward(T* loss,
   }
 }
 
-#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT)  \
-  case Log2Elements:                                                 \
-    WarpSoftmaxForward<T,                                            \
-                       LabelT,                                       \
-                       VecT,                                         \
-                       AccT,                                         \
-                       Log2Elements,                                 \
-                       mode,                                         \
-                       IgnoreIndex><<<blocks, threads, 0, stream>>>( \
-        loss,                                                        \
-        softmax,                                                     \
-        src,                                                         \
-        label,                                                       \
-        batch_size,                                                  \
-        stride,                                                      \
-        element_count,                                               \
-        ignore_index);                                               \
+#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT)            \
+  case Log2Elements:                                                           \
+    WarpSoftmaxForward<T, LabelT, VecT, AccT, Log2Elements, mode, IgnoreIndex> \
+        <<<blocks, threads, 0, stream>>>(loss,                                 \
+                                         softmax,                              \
+                                         src,                                  \
+                                         label,                                \
+                                         batch_size,                           \
+                                         stride,                               \
+                                         element_count,                        \
+                                         ignore_index);                        \
     break;
 
 /*
@@ -1189,12 +1180,9 @@ void LaunchVectorizedSoftmaxForward(T* loss,
   block_size = std::max(block_size, kps::details::kWarpSize);
   dim3 grids(high_dim);
   dim3 blocks(block_size);
-  VectorizedSoftmaxForward<T,
-                           AccT,
-                           LabelT,
-                           vec_size,
-                           IgnoreIndex><<<grids, blocks, 0, stream>>>(
-      loss, softmax, logits, label, high_dim, mid_dim, ignore_index);
+  VectorizedSoftmaxForward<T, AccT, LabelT, vec_size, IgnoreIndex>
+      <<<grids, blocks, 0, stream>>>(
+          loss, softmax, logits, label, high_dim, mid_dim, ignore_index);
 }
 
 /*
@@ -1281,10 +1269,9 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx,
     int threads = 128;
     int blocks = (N * dim * D + threads - 1) / threads;
     // compute cross entropy, input is log softmax
-    CrossEntropyExpHardLabel<T,
-                             LabelT,
-                             IgnoreIndex><<<blocks, threads, 0, stream>>>(
-        loss_data, softmax_data, labels_data, N, dim, D, ignore_index);
+    CrossEntropyExpHardLabel<T, LabelT, IgnoreIndex>
+        <<<blocks, threads, 0, stream>>>(
+            loss_data, softmax_data, labels_data, N, dim, D, ignore_index);
   }
 }
 
@@ -1366,44 +1353,38 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
       int blocks = (n * d + kBatchPerBlock - 1) / kBatchPerBlock;
       dim3 threads(kThreadPerBlock / kBatchPerBlock, kBatchPerBlock, 1);
 
-      CrossEntropySoftLabel<T,
-                            T,
-                            false><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          loss_data,
-          NULL,
-          logits_data,
-          labels_data,
-          n,
-          axis_dim,
-          d / axis_dim,
-          kDimLog2);
+      CrossEntropySoftLabel<T, T, false>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(loss_data,
+                                                     NULL,
+                                                     logits_data,
+                                                     labels_data,
+                                                     n,
+                                                     axis_dim,
+                                                     d / axis_dim,
+                                                     kDimLog2);
     } else {  // HardLabel
       auto* logits_data = softmax->data<T>();
       auto* labels_data = labels.data<LabelT>();
       int threads = 128;
       int blocks = (n * d / axis_dim + threads - 1) / threads;
       if (ignore_index >= 0 && ignore_index < axis_dim) {
-        CrossEntropyHardLabel<T,
-                              LabelT,
-                              true><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            loss_data,
-            logits_data,
-            labels_data,
-            n,
-            axis_dim,
-            d / axis_dim,
-            ignore_index);
+        CrossEntropyHardLabel<T, LabelT, true>
+            <<<blocks, threads, 0, dev_ctx.stream()>>>(loss_data,
+                                                       logits_data,
+                                                       labels_data,
+                                                       n,
+                                                       axis_dim,
+                                                       d / axis_dim,
+                                                       ignore_index);
       } else {
-        CrossEntropyHardLabel<T,
-                              LabelT,
-                              false><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            loss_data,
-            logits_data,
-            labels_data,
-            n,
-            axis_dim,
-            d / axis_dim,
-            ignore_index);
+        CrossEntropyHardLabel<T, LabelT, false>
+            <<<blocks, threads, 0, dev_ctx.stream()>>>(loss_data,
+                                                       logits_data,
+                                                       labels_data,
+                                                       n,
+                                                       axis_dim,
+                                                       d / axis_dim,
+                                                       ignore_index);
       }
     }
 
diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu
index 1bb0d42dad81a..1f83f05f81c77 100644
--- a/paddle/phi/kernels/gpu/cross_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cross_grad_kernel.h"
-#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cross_grad_kernel.h"
+#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(cross_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu
index aa944f8291674..4f3e5f0ca8c5d 100644
--- a/paddle/phi/kernels/gpu/cross_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cross_kernel.h"
-#include "paddle/phi/kernels/impl/cross_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cross_kernel.h"
+#include "paddle/phi/kernels/impl/cross_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     cross, GPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
index 6e8712462928d..bbae4fd130c7a 100644
--- a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cumprod_grad_kernel.h"
-
 #include <thrust/transform.h>
+
 #include "paddle/fluid/operators/math/inclusive_scan.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cumprod_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/cumprod.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
diff --git a/paddle/phi/kernels/gpu/cumprod_kernel.cu b/paddle/phi/kernels/gpu/cumprod_kernel.cu
index 1bbf8972a2479..86aef50ac321c 100644
--- a/paddle/phi/kernels/gpu/cumprod_kernel.cu
+++ b/paddle/phi/kernels/gpu/cumprod_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cumprod_kernel.h"
-
 #include "paddle/fluid/operators/math/inclusive_scan.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cumprod_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/cumprod.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu
index 13975ddd3ef89..ed131e0ff5413 100644
--- a/paddle/phi/kernels/gpu/cumsum_kernel.cu
+++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/cumsum_kernel.h"
-
 #include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
+
+#include "paddle/phi/kernels/cumsum_kernel.h"
 #ifdef __NVCC__
 #include <cub/cub.cuh>
 #endif
@@ -302,13 +302,13 @@ void CumsumKernel(const Context& dev_ctx,
         out_data, in_data, outer_size, inner_size, scan_size, exclusive);
 
   } else {
-    BlockScanKernel<T, 128, 4><<<scan_grid, 128, 0, dev_ctx.stream()>>>(
-        next_out_data,
-        next_in_data,
-        outer_size,
-        inner_size,
-        scan_size,
-        exclusive);
+    BlockScanKernel<T, 128, 4>
+        <<<scan_grid, 128, 0, dev_ctx.stream()>>>(next_out_data,
+                                                  next_in_data,
+                                                  outer_size,
+                                                  inner_size,
+                                                  scan_size,
+                                                  exclusive);
   }
   swap_ptr(next_in_data, next_out_data);
   if (reverse) {
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 265d123dfeaf2..d80a4b8cc4cde 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
 #include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h"
 
 namespace phi {
@@ -69,10 +68,9 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
     int w_in = w_out * stride_w - pad_w;
     int h_in = h_out * stride_h - pad_h;
 
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
+    const T* data_offset_ptr =
+        data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const int data_offset_h_ptr =
         ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
     const int data_offset_w_ptr =
@@ -86,9 +84,9 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
 
     T cur_top_grad = data_col[thread];
     if (data_mask) {
-      const T* data_mask_ptr = data_mask +
-                               (b * deformable_group + deformable_group_index) *
-                                   kernel_h * kernel_w * height_col * width_col;
+      const T* data_mask_ptr =
+          data_mask + (b * deformable_group + deformable_group_index) *
+                          kernel_h * kernel_w * height_col * width_col;
       const T mask = data_mask_ptr[data_mask_hw_ptr];
       cur_top_grad *= mask;
     }
@@ -134,28 +132,28 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
   int blocks = NumBlocks(num_kernels);
   int threads = kNumCUDAThreads;
 
-  ModulatedDeformableCol2imGpuKernel<
-      T><<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
-                                                   data_col,
-                                                   data_offset,
-                                                   data_mask,
-                                                   im_shape[0],
-                                                   im_shape[1],
-                                                   im_shape[2],
-                                                   kernel_shape[2],
-                                                   kernel_shape[3],
-                                                   pad[0],
-                                                   pad[1],
-                                                   stride[0],
-                                                   stride[1],
-                                                   dilation[0],
-                                                   dilation[1],
-                                                   channel_per_deformable_group,
-                                                   col_shape[1],
-                                                   deformable_group,
-                                                   col_shape[2],
-                                                   col_shape[3],
-                                                   grad_im);
+  ModulatedDeformableCol2imGpuKernel<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
+                                                 data_col,
+                                                 data_offset,
+                                                 data_mask,
+                                                 im_shape[0],
+                                                 im_shape[1],
+                                                 im_shape[2],
+                                                 kernel_shape[2],
+                                                 kernel_shape[3],
+                                                 pad[0],
+                                                 pad[1],
+                                                 stride[0],
+                                                 stride[1],
+                                                 dilation[0],
+                                                 dilation[1],
+                                                 channel_per_deformable_group,
+                                                 col_shape[1],
+                                                 deformable_group,
+                                                 col_shape[2],
+                                                 col_shape[3],
+                                                 grad_im);
 }
 
 template <typename T>
@@ -196,23 +194,20 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
     const int deformable_group_index = c / (2 * kernel_h * kernel_w);
     const int col_step = kernel_h * kernel_w;
     int cnt = 0;
-    const T* data_col_ptr = data_col +
-                            deformable_group_index *
-                                channel_per_deformable_group * batch_size *
-                                width_col * height_col;
-    const T* data_im_ptr = data_im +
-                           (b * deformable_group + deformable_group_index) *
-                               channel_per_deformable_group / kernel_h /
-                               kernel_w * height * width;
-    const T* data_offset_ptr = data_offset +
-                               (b * deformable_group + deformable_group_index) *
-                                   2 * kernel_h * kernel_w * height_col *
-                                   width_col;
+    const T* data_col_ptr = data_col + deformable_group_index *
+                                           channel_per_deformable_group *
+                                           batch_size * width_col * height_col;
+    const T* data_im_ptr =
+        data_im + (b * deformable_group + deformable_group_index) *
+                      channel_per_deformable_group / kernel_h / kernel_w *
+                      height * width;
+    const T* data_offset_ptr =
+        data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const T* data_mask_ptr =
         data_mask
-            ? data_mask +
-                  (b * deformable_group + deformable_group_index) * kernel_h *
-                      kernel_w * height_col * width_col
+            ? data_mask + (b * deformable_group + deformable_group_index) *
+                              kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
     const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
@@ -301,32 +296,32 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
   int blocks = NumBlocks(num_kernels);
   int threads = kNumCUDAThreads;
 
-  ModulatedDeformableCol2imCoordGpuKernel<
-      T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-      num_kernels,
-      data_col,
-      data_im,
-      data_offset,
-      data_mask,
-      im_shape[0],
-      im_shape[1],
-      im_shape[2],
-      kernel_shape[2],
-      kernel_shape[3],
-      paddings[0],
-      paddings[1],
-      strides[0],
-      strides[1],
-      dilations[0],
-      dilations[1],
-      channel_per_deformable_group,
-      col_shape[1],
-      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
-      deformable_groups,
-      col_shape[2],
-      col_shape[3],
-      grad_offset,
-      grad_mask);
+  ModulatedDeformableCol2imCoordGpuKernel<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(
+          num_kernels,
+          data_col,
+          data_im,
+          data_offset,
+          data_mask,
+          im_shape[0],
+          im_shape[1],
+          im_shape[2],
+          kernel_shape[2],
+          kernel_shape[3],
+          paddings[0],
+          paddings[1],
+          strides[0],
+          strides[1],
+          dilations[0],
+          dilations[1],
+          channel_per_deformable_group,
+          col_shape[1],
+          2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
+          deformable_groups,
+          col_shape[2],
+          col_shape[3],
+          grad_offset,
+          grad_mask);
 }
 
 template <typename T>
@@ -351,9 +346,9 @@ void FilterGradAddup(const Context& dev_ctx,
                      const int width,
                      const T* dweight_3d,
                      T* filter_grad) {
-  FilterGradAddupGpuKernel<
-      T><<<NumBlocks(nthreads), kNumCUDAThreads, 0, dev_ctx.stream()>>>(
-      nthreads, n, height, width, dweight_3d, filter_grad);
+  FilterGradAddupGpuKernel<T>
+      <<<NumBlocks(nthreads), kNumCUDAThreads, 0, dev_ctx.stream()>>>(
+          nthreads, n, height, width, dweight_3d, filter_grad);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
index 2476dcbafb984..17a7b3265ca3e 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/deformable_conv_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/deformable_conv_kernel.h"
 #include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h"
 
 PD_REGISTER_KERNEL(deformable_conv,
diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h
index 5270a4b2fdb8d..8586c56c560dc 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv.h
+++ b/paddle/phi/kernels/gpu/depthwise_conv.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/hostdevice.h"
@@ -1249,73 +1250,71 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
 #endif
     int grid_size = (nums_output + block_size - 1) / block_size;
 
-#define check_case(c_filter_multiplier, c_stride, c_filter)               \
-  if (c_filter_multiplier == 0 ||                                         \
-      filter_multiplier == c_filter_multiplier &&                         \
-          stride_height == stride_width && stride_height == c_stride &&   \
-          (ksize_height == ksize_width && ksize_height == c_filter ||     \
-           c_filter == -1)) {                                             \
-    if (c_filter == -1) {                                                 \
-      threads.x = block_size;                                             \
-      grid.x = grid_size;                                                 \
-      threads.y = threads.z = grid.y = grid.z = 1;                        \
-    }                                                                     \
-    if (data_layout != DataLayout::kNHWC) {                               \
-      KernelDepthwiseConvSp<                                              \
-          T,                                                              \
-          c_filter_multiplier,                                            \
-          c_stride,                                                       \
-          c_filter,                                                       \
-          DataLayout::kNCHW,                                              \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-          input_data,                                                     \
-          filter_data,                                                    \
-          batch_size,                                                     \
-          output_channels,                                                \
-          output_height,                                                  \
-          output_width,                                                   \
-          input_channels,                                                 \
-          input_height,                                                   \
-          input_width,                                                    \
-          filter_multiplier,                                              \
-          ksize_height,                                                   \
-          ksize_width,                                                    \
-          stride_height,                                                  \
-          stride_width,                                                   \
-          padding_height,                                                 \
-          padding_width,                                                  \
-          dilate_height,                                                  \
-          dilate_width,                                                   \
-          output_data);                                                   \
-    } else {                                                              \
-      KernelDepthwiseConvSp<                                              \
-          T,                                                              \
-          c_filter_multiplier,                                            \
-          c_stride,                                                       \
-          c_filter,                                                       \
-          DataLayout::kNHWC,                                              \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-          input_data,                                                     \
-          filter_data,                                                    \
-          batch_size,                                                     \
-          output_channels,                                                \
-          output_height,                                                  \
-          output_width,                                                   \
-          input_channels,                                                 \
-          input_height,                                                   \
-          input_width,                                                    \
-          filter_multiplier,                                              \
-          ksize_height,                                                   \
-          ksize_width,                                                    \
-          stride_height,                                                  \
-          stride_width,                                                   \
-          padding_height,                                                 \
-          padding_width,                                                  \
-          dilate_height,                                                  \
-          dilate_width,                                                   \
-          output_data);                                                   \
-    }                                                                     \
-    return;                                                               \
+#define check_case(c_filter_multiplier, c_stride, c_filter)             \
+  if (c_filter_multiplier == 0 ||                                       \
+      filter_multiplier == c_filter_multiplier &&                       \
+          stride_height == stride_width && stride_height == c_stride && \
+          (ksize_height == ksize_width && ksize_height == c_filter ||   \
+           c_filter == -1)) {                                           \
+    if (c_filter == -1) {                                               \
+      threads.x = block_size;                                           \
+      grid.x = grid_size;                                               \
+      threads.y = threads.z = grid.y = grid.z = 1;                      \
+    }                                                                   \
+    if (data_layout != DataLayout::kNHWC) {                             \
+      KernelDepthwiseConvSp<T,                                          \
+                            c_filter_multiplier,                        \
+                            c_stride,                                   \
+                            c_filter,                                   \
+                            DataLayout::kNCHW,                          \
+                            fuse_relu_before_conv>                      \
+          <<<grid, threads, 0, context.stream()>>>(input_data,          \
+                                                   filter_data,         \
+                                                   batch_size,          \
+                                                   output_channels,     \
+                                                   output_height,       \
+                                                   output_width,        \
+                                                   input_channels,      \
+                                                   input_height,        \
+                                                   input_width,         \
+                                                   filter_multiplier,   \
+                                                   ksize_height,        \
+                                                   ksize_width,         \
+                                                   stride_height,       \
+                                                   stride_width,        \
+                                                   padding_height,      \
+                                                   padding_width,       \
+                                                   dilate_height,       \
+                                                   dilate_width,        \
+                                                   output_data);        \
+    } else {                                                            \
+      KernelDepthwiseConvSp<T,                                          \
+                            c_filter_multiplier,                        \
+                            c_stride,                                   \
+                            c_filter,                                   \
+                            DataLayout::kNHWC,                          \
+                            fuse_relu_before_conv>                      \
+          <<<grid, threads, 0, context.stream()>>>(input_data,          \
+                                                   filter_data,         \
+                                                   batch_size,          \
+                                                   output_channels,     \
+                                                   output_height,       \
+                                                   output_width,        \
+                                                   input_channels,      \
+                                                   input_height,        \
+                                                   input_width,         \
+                                                   filter_multiplier,   \
+                                                   ksize_height,        \
+                                                   ksize_width,         \
+                                                   stride_height,       \
+                                                   stride_width,        \
+                                                   padding_height,      \
+                                                   padding_width,       \
+                                                   dilate_height,       \
+                                                   dilate_width,        \
+                                                   output_data);        \
+    }                                                                   \
+    return;                                                             \
   }
     check_case(1, 1, 3);
     check_case(1, 1, 5);
@@ -1417,70 +1416,68 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
     }
     int filter_multiplier = output_channels / input_channels;
 
-#define check_case(c_filter_multiplier, c_stride, c_filter)               \
-  if (c_filter_multiplier == 0 ||                                         \
-      filter_multiplier == c_filter_multiplier &&                         \
-          stride_height == stride_width && stride_height == c_stride &&   \
-          (ksize_height == ksize_width && ksize_height == c_filter ||     \
-           c_filter == -1)) {                                             \
-    if (data_layout != DataLayout::kNHWC) {                               \
-      KernelDepthwiseConvInputGradSp<                                     \
-          T,                                                              \
-          c_filter_multiplier,                                            \
-          c_stride,                                                       \
-          c_filter,                                                       \
-          DataLayout::kNCHW,                                              \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-          input_data,                                                     \
-          output_grad_data,                                               \
-          filter_data,                                                    \
-          batch_size,                                                     \
-          output_channels,                                                \
-          output_height,                                                  \
-          output_width,                                                   \
-          input_channels,                                                 \
-          input_height,                                                   \
-          input_width,                                                    \
-          filter_multiplier,                                              \
-          ksize_height,                                                   \
-          ksize_width,                                                    \
-          stride_height,                                                  \
-          stride_width,                                                   \
-          padding_height,                                                 \
-          padding_width,                                                  \
-          dilate_height,                                                  \
-          dilate_width,                                                   \
-          input_grad_data);                                               \
-    } else {                                                              \
-      KernelDepthwiseConvInputGradSp<                                     \
-          T,                                                              \
-          c_filter_multiplier,                                            \
-          c_stride,                                                       \
-          c_filter,                                                       \
-          DataLayout::kNHWC,                                              \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
-          input_data,                                                     \
-          output_grad_data,                                               \
-          filter_data,                                                    \
-          batch_size,                                                     \
-          output_channels,                                                \
-          output_height,                                                  \
-          output_width,                                                   \
-          input_channels,                                                 \
-          input_height,                                                   \
-          input_width,                                                    \
-          filter_multiplier,                                              \
-          ksize_height,                                                   \
-          ksize_width,                                                    \
-          stride_height,                                                  \
-          stride_width,                                                   \
-          padding_height,                                                 \
-          padding_width,                                                  \
-          dilate_height,                                                  \
-          dilate_width,                                                   \
-          input_grad_data);                                               \
-    }                                                                     \
-    return;                                                               \
+#define check_case(c_filter_multiplier, c_stride, c_filter)             \
+  if (c_filter_multiplier == 0 ||                                       \
+      filter_multiplier == c_filter_multiplier &&                       \
+          stride_height == stride_width && stride_height == c_stride && \
+          (ksize_height == ksize_width && ksize_height == c_filter ||   \
+           c_filter == -1)) {                                           \
+    if (data_layout != DataLayout::kNHWC) {                             \
+      KernelDepthwiseConvInputGradSp<T,                                 \
+                                     c_filter_multiplier,               \
+                                     c_stride,                          \
+                                     c_filter,                          \
+                                     DataLayout::kNCHW,                 \
+                                     fuse_relu_before_conv>             \
+          <<<grid, threads, 0, context.stream()>>>(input_data,          \
+                                                   output_grad_data,    \
+                                                   filter_data,         \
+                                                   batch_size,          \
+                                                   output_channels,     \
+                                                   output_height,       \
+                                                   output_width,        \
+                                                   input_channels,      \
+                                                   input_height,        \
+                                                   input_width,         \
+                                                   filter_multiplier,   \
+                                                   ksize_height,        \
+                                                   ksize_width,         \
+                                                   stride_height,       \
+                                                   stride_width,        \
+                                                   padding_height,      \
+                                                   padding_width,       \
+                                                   dilate_height,       \
+                                                   dilate_width,        \
+                                                   input_grad_data);    \
+    } else {                                                            \
+      KernelDepthwiseConvInputGradSp<T,                                 \
+                                     c_filter_multiplier,               \
+                                     c_stride,                          \
+                                     c_filter,                          \
+                                     DataLayout::kNHWC,                 \
+                                     fuse_relu_before_conv>             \
+          <<<grid, threads, 0, context.stream()>>>(input_data,          \
+                                                   output_grad_data,    \
+                                                   filter_data,         \
+                                                   batch_size,          \
+                                                   output_channels,     \
+                                                   output_height,       \
+                                                   output_width,        \
+                                                   input_channels,      \
+                                                   input_height,        \
+                                                   input_width,         \
+                                                   filter_multiplier,   \
+                                                   ksize_height,        \
+                                                   ksize_width,         \
+                                                   stride_height,       \
+                                                   stride_width,        \
+                                                   padding_height,      \
+                                                   padding_width,       \
+                                                   dilate_height,       \
+                                                   dilate_width,        \
+                                                   input_grad_data);    \
+    }                                                                   \
+    return;                                                             \
   }
     check_case(1, 1, 3);
     check_case(1, 1, 5);
@@ -1574,32 +1571,31 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
           (ksize_height == ksize_width && ksize_height == c_filter ||          \
            c_filter == -1)) {                                                  \
     if (data_layout != DataLayout::kNHWC) {                                    \
-      KernelDepthwiseConvFilterGradSp<                                         \
-          T,                                                                   \
-          c_filter_multiplier,                                                 \
-          c_stride,                                                            \
-          c_filter,                                                            \
-          DataLayout::kNCHW,                                                   \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-          output_grad_data,                                                    \
-          input_data,                                                          \
-          batch_size,                                                          \
-          output_channels,                                                     \
-          output_height,                                                       \
-          output_width,                                                        \
-          input_channels,                                                      \
-          input_height,                                                        \
-          input_width,                                                         \
-          filter_multiplier,                                                   \
-          ksize_height,                                                        \
-          ksize_width,                                                         \
-          stride_height,                                                       \
-          stride_width,                                                        \
-          padding_height,                                                      \
-          padding_width,                                                       \
-          dilate_height,                                                       \
-          dilate_width,                                                        \
-          filter_grad_data);                                                   \
+      KernelDepthwiseConvFilterGradSp<T,                                       \
+                                      c_filter_multiplier,                     \
+                                      c_stride,                                \
+                                      c_filter,                                \
+                                      DataLayout::kNCHW,                       \
+                                      fuse_relu_before_conv>                   \
+          <<<grid, threads, 0, context.stream()>>>(output_grad_data,           \
+                                                   input_data,                 \
+                                                   batch_size,                 \
+                                                   output_channels,            \
+                                                   output_height,              \
+                                                   output_width,               \
+                                                   input_channels,             \
+                                                   input_height,               \
+                                                   input_width,                \
+                                                   filter_multiplier,          \
+                                                   ksize_height,               \
+                                                   ksize_width,                \
+                                                   stride_height,              \
+                                                   stride_width,               \
+                                                   padding_height,             \
+                                                   padding_width,              \
+                                                   dilate_height,              \
+                                                   dilate_width,               \
+                                                   filter_grad_data);          \
     } else {                                                                   \
       framework::Tensor filter_grad_hwc;                                       \
       if (c_filter != -1) {                                                    \
@@ -1624,32 +1620,31 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
         grid = dim3(ksize_width * ksize_height, output_height, batch_size);    \
         threads = dim3(std::min(output_channels, block_size), blocks, 1);      \
       }                                                                        \
-      KernelDepthwiseConvFilterGradSp<                                         \
-          T,                                                                   \
-          c_filter_multiplier,                                                 \
-          c_stride,                                                            \
-          c_filter,                                                            \
-          DataLayout::kNHWC,                                                   \
-          fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
-          output_grad_data,                                                    \
-          input_data,                                                          \
-          batch_size,                                                          \
-          output_channels,                                                     \
-          output_height,                                                       \
-          output_width,                                                        \
-          input_channels,                                                      \
-          input_height,                                                        \
-          input_width,                                                         \
-          filter_multiplier,                                                   \
-          ksize_height,                                                        \
-          ksize_width,                                                         \
-          stride_height,                                                       \
-          stride_width,                                                        \
-          padding_height,                                                      \
-          padding_width,                                                       \
-          dilate_height,                                                       \
-          dilate_width,                                                        \
-          filter_grad_data);                                                   \
+      KernelDepthwiseConvFilterGradSp<T,                                       \
+                                      c_filter_multiplier,                     \
+                                      c_stride,                                \
+                                      c_filter,                                \
+                                      DataLayout::kNHWC,                       \
+                                      fuse_relu_before_conv>                   \
+          <<<grid, threads, 0, context.stream()>>>(output_grad_data,           \
+                                                   input_data,                 \
+                                                   batch_size,                 \
+                                                   output_channels,            \
+                                                   output_height,              \
+                                                   output_width,               \
+                                                   input_channels,             \
+                                                   input_height,               \
+                                                   input_width,                \
+                                                   filter_multiplier,          \
+                                                   ksize_height,               \
+                                                   ksize_width,                \
+                                                   stride_height,              \
+                                                   stride_width,               \
+                                                   padding_height,             \
+                                                   padding_width,              \
+                                                   dilate_height,              \
+                                                   dilate_width,               \
+                                                   filter_grad_data);          \
       if (c_filter != -1) {                                                    \
         std::vector<int> perm_axis({2, 3, 0, 1});                              \
         phi::funcs::TransposeNormal<phi::GPUContext, T> trans;                 \
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
index c50ceae33fc79..7310883e59508 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
@@ -12,15 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/operators/conv_op.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "paddle/fluid/operators/conv_op.h"
-
-#include "paddle/phi/kernels/gpu/depthwise_conv.h"
-
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/gpu/depthwise_conv.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu
index cce12a87fac72..267a3b5e3fa58 100644
--- a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/determinant_grad_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/determinant_grad_kernel.h"
 #include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(determinant_grad,
diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu
index 2518408387395..b2dddf1fdb83f 100644
--- a/paddle/phi/kernels/gpu/determinant_kernel.cu
+++ b/paddle/phi/kernels/gpu/determinant_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/determinant_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/determinant_kernel.h"
 #include "paddle/phi/kernels/impl/determinant_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
diff --git a/paddle/phi/kernels/gpu/diag_grad_kernel.cu b/paddle/phi/kernels/gpu/diag_grad_kernel.cu
index 65bf837e6cf8a..5a579ecc27b7f 100644
--- a/paddle/phi/kernels/gpu/diag_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/diag_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/diag_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/diag_kernel.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
@@ -85,16 +84,16 @@ void DiagGradKernel(const Context& dev_ctx,
           (offset >= 0 ? offset * dout_stride_1 : -offset * dout_stride_0);
 
       std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
-      ExtractDiagonalKernel<T><<<std::get<1>(block_grid_size),
-                                 std::get<0>(block_grid_size),
-                                 0,
-                                 dev_ctx.stream()>>>(
-          dout_data,
-          dx_data,
-          start,
-          dx_length,
-          dout_stride_0 + dout_stride_1,
-          dx_stride);
+      ExtractDiagonalKernel<T>
+          <<<std::get<1>(block_grid_size),
+             std::get<0>(block_grid_size),
+             0,
+             dev_ctx.stream()>>>(dout_data,
+                                 dx_data,
+                                 start,
+                                 dx_length,
+                                 dout_stride_0 + dout_stride_1,
+                                 dx_stride);
     }
   } else {
     phi::funcs::SetConstant<Context, T> set_padding_value;
diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu
index 95d3d3365d91b..fd63084ecb35a 100644
--- a/paddle/phi/kernels/gpu/diag_kernel.cu
+++ b/paddle/phi/kernels/gpu/diag_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/diag_kernel.h"
-
 #include <algorithm>
 #include <tuple>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/diag_kernel.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/phi/kernels/gpu/dist_grad_kernel.cu b/paddle/phi/kernels/gpu/dist_grad_kernel.cu
index c458f8cce3e0a..cdfec8d4afd71 100644
--- a/paddle/phi/kernels/gpu/dist_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/dist_grad_kernel.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/dist_grad_kernel.h"
-#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/dist_grad_kernel.h"
+#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h"
 
 #ifdef PADDLE_WITH_HIP
 PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float) {}
diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu
index 87e75e02754a8..d5ed172161275 100644
--- a/paddle/phi/kernels/gpu/dist_kernel.cu
+++ b/paddle/phi/kernels/gpu/dist_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/dist_kernel.h"
-#include "paddle/phi/kernels/impl/dist_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/dist_kernel.h"
+#include "paddle/phi/kernels/impl/dist_kernel_impl.h"
 
 #ifdef PADDLE_WITH_HIP
 // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
diff --git a/paddle/phi/kernels/gpu/dot_grad_kernel.cu b/paddle/phi/kernels/gpu/dot_grad_kernel.cu
index 7defc0304e511..c299e11b1f992 100644
--- a/paddle/phi/kernels/gpu/dot_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/dot_grad_kernel.cu
@@ -12,13 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/dot_grad_kernel.h"
-#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/dot_grad_kernel.h"
+#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(dot_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu
index 4442396f6c9dd..f947ef310eb22 100644
--- a/paddle/phi/kernels/gpu/dot_kernel.cu
+++ b/paddle/phi/kernels/gpu/dot_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/dot_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/dot_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
diff --git a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
index 94d4942a41878..b27029fe863fa 100644
--- a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu
@@ -13,10 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/dropout_impl.cu.h"
-#include "paddle/phi/kernels/dropout_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/dropout_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu
index fae0e8cb25b5c..8ae3dd25cc8f6 100644
--- a/paddle/phi/kernels/gpu/dropout_kernel.cu
+++ b/paddle/phi/kernels/gpu/dropout_kernel.cu
@@ -13,10 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/dropout_impl.cu.h"
-#include "paddle/phi/kernels/dropout_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/dropout_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu
index 5e33966055ea0..51ff66b4a9ce7 100644
--- a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/eigh_grad_kernel.h"
-#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/eigh_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(eigh_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/eigh_kernel.cu b/paddle/phi/kernels/gpu/eigh_kernel.cu
index 4ff3b371b6a01..8c3ebf5f12b4b 100644
--- a/paddle/phi/kernels/gpu/eigh_kernel.cu
+++ b/paddle/phi/kernels/gpu/eigh_kernel.cu
@@ -15,11 +15,10 @@
 #ifndef PADDLE_WITH_HIP
 // HIP not support cusolver
 
-#include "paddle/phi/kernels/eigh_kernel.h"
-#include "paddle/phi/kernels/funcs/values_vectors_functor.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/eigh_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/values_vectors_functor.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu
index 950f811475c99..a8464be3bb3c6 100644
--- a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/einsum_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/einsum_kernel.h"
 #include "paddle/phi/kernels/impl/einsum_grad_impl.h"
 
 PD_REGISTER_KERNEL(einsum_grad,
diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu
index d1f4c6590387a..088538908261f 100644
--- a/paddle/phi/kernels/gpu/einsum_kernel.cu
+++ b/paddle/phi/kernels/gpu/einsum_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/einsum_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/einsum_kernel.h"
 #include "paddle/phi/kernels/impl/einsum_impl.h"
 
 PD_REGISTER_KERNEL(einsum,
diff --git a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu
index 517fbcba158b8..4739d8cb3341f 100644
--- a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/elementwise_add_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/elementwise_add_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu
index 57bf6da4060d3..5ef0752d6c8ba 100644
--- a/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
index e5432b5f9187c..9c1ced3c1bd11 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad.h
+++ b/paddle/phi/kernels/gpu/elementwise_grad.h
@@ -216,13 +216,13 @@ void ElementwiseAddGrad(const GPUContext &ctx,
         dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
                  PREDEFINED_BLOCK_SIZE,
              1);
-    SimpleElemwiseAddGradCUDAKernel<
-        T><<<grid_size, block_size, 0, ctx.stream()>>>(
-        dout.data<T>(),
-        size,
-        vec_size,
-        dx->mutable_data<T>(ctx.GetPlace()),
-        dy->mutable_data<T>(ctx.GetPlace()));
+    SimpleElemwiseAddGradCUDAKernel<T>
+        <<<grid_size, block_size, 0, ctx.stream()>>>(
+            dout.data<T>(),
+            size,
+            vec_size,
+            dx->mutable_data<T>(ctx.GetPlace()),
+            dy->mutable_data<T>(ctx.GetPlace()));
   } else {
     VLOG(4) << "Special case when dy_data is the same as dout_data, "
                "and dx_data is the same as dout_data, do not need "
@@ -291,9 +291,12 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
         auto size = dy->numel();
         dim3 grid_size =
             dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
-        SimpleElemwiseSubGradCUDAKernel<
-            T><<<grid_size, block_size, 0, ctx.stream()>>>(
-            dout.data<T>(), size, nullptr, dy->mutable_data<T>(ctx.GetPlace()));
+        SimpleElemwiseSubGradCUDAKernel<T>
+            <<<grid_size, block_size, 0, ctx.stream()>>>(
+                dout.data<T>(),
+                size,
+                nullptr,
+                dy->mutable_data<T>(ctx.GetPlace()));
       }
     } else {
       std::vector<int> reduce_dims =
@@ -316,12 +319,12 @@ void elementwise_sub_grad(const GPUContext &ctx,
   auto size = x.numel();
   dim3 grid_size =
       dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
-  SimpleElemwiseSubGradCUDAKernel<
-      T><<<grid_size, block_size, 0, ctx.stream()>>>(
-      dout.data<T>(),
-      size,
-      dx->mutable_data<T>(ctx.GetPlace()),
-      dy->mutable_data<T>(ctx.GetPlace()));
+  SimpleElemwiseSubGradCUDAKernel<T>
+      <<<grid_size, block_size, 0, ctx.stream()>>>(
+          dout.data<T>(),
+          size,
+          dx->mutable_data<T>(ctx.GetPlace()),
+          dy->mutable_data<T>(ctx.GetPlace()));
 }
 /*
 ******************************
diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
index 3e7430fd84eaf..8dc0917fef575 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/elementwise_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/elementwise_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu
index 3442d7f028539..0201854e533dd 100644
--- a/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu
index 45e19b9838405..2edf7a132ed7d 100644
--- a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/elementwise_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/elementwise_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
index 47b1b304f5ec9..d01674719edc7 100644
--- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -12,17 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/embedding_grad_kernel.h"
-#include "paddle/phi/kernels/funcs/embedding_util.h"
-
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/embedding_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
-
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/embedding_kernel.cu b/paddle/phi/kernels/gpu/embedding_kernel.cu
index 14a40abefffd2..c0516d00899f4 100644
--- a/paddle/phi/kernels/gpu/embedding_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/embedding_kernel.h"
-#include "paddle/phi/kernels/funcs/embedding_util.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/embedding_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu
index 50fbfddf0432e..078632fb4e647 100644
--- a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/erfinv_grad_kernel.h"
-#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/erfinv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     erfinv_grad, GPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/erfinv_kernel.cu b/paddle/phi/kernels/gpu/erfinv_kernel.cu
index 10df0bdf5603c..3296bb3dbb167 100644
--- a/paddle/phi/kernels/gpu/erfinv_kernel.cu
+++ b/paddle/phi/kernels/gpu/erfinv_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/erfinv_kernel.h"
-#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/erfinv_kernel.h"
+#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h"
 
 PD_REGISTER_KERNEL(erfinv, GPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
index 273851cfd8b34..387708af05bea 100644
--- a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/expand_as_grad_kernel.h"
-#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/expand_as_grad_kernel.h"
+#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(expand_as_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/expand_as_kernel.cu b/paddle/phi/kernels/gpu/expand_as_kernel.cu
index 0972eebeabf18..68e683127baf2 100644
--- a/paddle/phi/kernels/gpu/expand_as_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_as_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/expand_as_kernel.h"
-#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/expand_as_kernel.h"
+#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
 
 PD_REGISTER_KERNEL(expand_as,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/eye_kernel.cu b/paddle/phi/kernels/gpu/eye_kernel.cu
index 069310b0d1562..d720feb9030c6 100644
--- a/paddle/phi/kernels/gpu/eye_kernel.cu
+++ b/paddle/phi/kernels/gpu/eye_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/eye_kernel.h"
-#include "paddle/phi/kernels/impl/eye_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/eye_kernel.h"
+#include "paddle/phi/kernels/impl/eye_kernel_impl.h"
 
 PD_REGISTER_KERNEL(eye,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu
index 668d673bd3269..519a57f32871d 100644
--- a/paddle/phi/kernels/gpu/flip_kernel.cu
+++ b/paddle/phi/kernels/gpu/flip_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/flip_kernel.h"
-
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/flip_kernel.h"
 
 namespace phi {
 
@@ -115,15 +114,15 @@ void FlipKernel(const Context& dev_ctx,
                        bytes,
                        dev_ctx.stream());
 
-  flip_cuda_kernel<T><<<dim_grid, dim_block, 0, dev_ctx.stream()>>>(
-      N,
-      in_data,
-      out_data,
-      x_shape_array_gpu,
-      x_strides_array_gpu,
-      flip_dims_array_gpu,
-      flip_dims_size,
-      total_dims);
+  flip_cuda_kernel<T>
+      <<<dim_grid, dim_block, 0, dev_ctx.stream()>>>(N,
+                                                     in_data,
+                                                     out_data,
+                                                     x_shape_array_gpu,
+                                                     x_strides_array_gpu,
+                                                     flip_dims_array_gpu,
+                                                     flip_dims_size,
+                                                     total_dims);
 }
 }  // namespace phi
 
diff --git a/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu
index 221bf1cb4c68c..9011bb8c5d2eb 100644
--- a/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
 #include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h"
 
-#include "paddle/phi/core/kernel_registry.h"
-
 PD_REGISTER_KERNEL(frobenius_norm_grad,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
index 012237165b739..b921d2d640330 100644
--- a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/frobenius_norm_kernel.h"
 #include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
 
-#include "paddle/phi/core/kernel_registry.h"
-
 PD_REGISTER_KERNEL(
     frobenius_norm, GPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu
index 50e57a46317e3..b823bb6aa6706 100644
--- a/paddle/phi/kernels/gpu/full_kernel.cu
+++ b/paddle/phi/kernels/gpu/full_kernel.cu
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/full_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu
index 04149a2f9ee41..6965c2b0c244e 100644
--- a/paddle/phi/kernels/gpu/gather_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/gather_kernel.h"
-
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/scatter.cu.h"
+#include "paddle/phi/kernels/gather_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/gather_kernel.cu b/paddle/phi/kernels/gpu/gather_kernel.cu
index 7e0c6cc168564..ffec56073e6c7 100644
--- a/paddle/phi/kernels/gpu/gather_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/gather_kernel.h"
-
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/gather_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu
index 2906b81cb4009..0172933d1be60 100644
--- a/paddle/phi/kernels/gpu/gather_tree_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/gather_tree_kernel.h"
-
 #include <algorithm>
+
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gather_tree_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu
index b80634357d62f..0494c38b213a2 100644
--- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu
@@ -12,17 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/gaussian_random_kernel.h"
-
 #include <thrust/random.h>
+
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/index_impl.cu.h"
-
-#include "paddle/fluid/framework/generator.h"
+#include "paddle/phi/kernels/gaussian_random_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h
index 2b9be7c615435..247e107b77440 100644
--- a/paddle/phi/kernels/gpu/gelu_funcs.h
+++ b/paddle/phi/kernels/gpu/gelu_funcs.h
@@ -48,8 +48,9 @@ template <bool FastMode>
 static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) {
   auto tanh_out =
       FP32FastTanh<FastMode>(0.79788456f * x * (1.0f + 0.044715f * x * x));
-  auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) *
-                         (0.79788456f + 0.1070322243f * x * x)) +
+  auto tmp = 0.5f * x *
+                 ((1.0f - tanh_out * tanh_out) *
+                  (0.79788456f + 0.1070322243f * x * x)) +
              0.5f * (1.0f + tanh_out);
   return tmp * y_g;
 }
@@ -115,9 +116,8 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
       block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
       VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block          \
                << " , thread = " << thread;                                   \
-      FP16FastGeluFwdCUDAKernel<                                              \
-          __vec_size,                                                         \
-          __use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(x, y, n);  \
+      FP16FastGeluFwdCUDAKernel<__vec_size, __use_fast_math>                  \
+          <<<block, thread, 0, dev_ctx.stream()>>>(x, y, n);                  \
       return true;                                                            \
     }                                                                         \
   } while (0)
@@ -154,10 +154,8 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
       block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
       VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block          \
                << " , thread = " << thread;                                   \
-      FP16FastGeluBwdCUDAKernel<                                              \
-          __vec_size,                                                         \
-          __use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(           \
-          x, y_g, x_g, n);                                                    \
+      FP16FastGeluBwdCUDAKernel<__vec_size, __use_fast_math>                  \
+          <<<block, thread, 0, dev_ctx.stream()>>>(x, y_g, x_g, n);           \
       return true;                                                            \
     }                                                                         \
   } while (0)
diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
index 1f33d5c901f29..7ed2b6b71fb5e 100644
--- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format off
 #include "paddle/phi/kernels/gelu_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -20,6 +21,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/gpu/gelu_funcs.h"
+// clang-format on
 
 DECLARE_bool(use_fast_math);
 
diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu
index 00dc58df0d826..509a5ccf4d177 100644
--- a/paddle/phi/kernels/gpu/gelu_kernel.cu
+++ b/paddle/phi/kernels/gpu/gelu_kernel.cu
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// clang-format will try to sort headers according to Google C++ style,
+// and that causes compilation problems.
+// clang-format off
 #include "paddle/phi/kernels/gelu_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -20,6 +23,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/gpu/gelu_funcs.h"
+// clang-format on
 
 DECLARE_bool(use_fast_math);
 
diff --git a/paddle/phi/kernels/gpu/graph_reindex_funcs.h b/paddle/phi/kernels/gpu/graph_reindex_funcs.h
index ea4f67e9d47e3..0a6d6a549a730 100644
--- a/paddle/phi/kernels/gpu/graph_reindex_funcs.h
+++ b/paddle/phi/kernels/gpu/graph_reindex_funcs.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/graph_reindex_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/graph_reindex_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
index 9869d5a517bcb..c6be5231fe276 100644
--- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
@@ -18,11 +18,10 @@
 #include <thrust/scan.h>
 #include <thrust/sequence.h>
 
-#include "paddle/phi/kernels/gpu/graph_reindex_funcs.h"
-#include "paddle/phi/kernels/graph_reindex_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/graph_reindex_funcs.h"
+#include "paddle/phi/kernels/graph_reindex_kernel.h"
 
 namespace phi {
 
@@ -92,8 +91,8 @@ void FillBufferHashTable(const Context& dev_ctx,
   int grid_tmp = (num_input + block - 1) / block;
   int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
   // Insert data.
-  BuildHashTable<T><<<grid, block, 0, dev_ctx.stream()>>>(
-      input, num_input, key_index);
+  BuildHashTable<T>
+      <<<grid, block, 0, dev_ctx.stream()>>>(input, num_input, key_index);
 
   // Get item index count.
   thrust::device_vector<int> item_count(num_input + 1, 0);
@@ -331,26 +330,36 @@ void GraphReindexKernel(const Context& dev_ctx,
   }
 
   // Get reindex dst edge.
+  // Support reindexing of multiple edge types.
+  int num_ac_count = count.dims()[0];
+  int num_edge_types = num_ac_count / bs;
   thrust::device_vector<int> unique_dst_reindex(bs);
   thrust::sequence(unique_dst_reindex.begin(), unique_dst_reindex.end());
-  thrust::device_vector<int> dst_ptr(bs);
-  thrust::exclusive_scan(count_data, count_data + bs, dst_ptr.begin());
   constexpr int BLOCK_WARPS = 128 / WARP_SIZE;
   constexpr int TILE_SIZE = BLOCK_WARPS * 16;
   const dim3 block(WARP_SIZE, BLOCK_WARPS);
   const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE);
-
   reindex_dst->Resize({num_edges});
   T* reindex_dst_data = dev_ctx.template Alloc<T>(reindex_dst);
-
-  GetDstEdgeCUDAKernel<T,
-                       BLOCK_WARPS,
-                       TILE_SIZE><<<grid, block, 0, dev_ctx.stream()>>>(
-      bs,
-      thrust::raw_pointer_cast(unique_dst_reindex.data()),
-      count_data,
-      thrust::raw_pointer_cast(dst_ptr.data()),
-      reindex_dst_data);
+  int begin = 0;
+  for (int i = 0; i < num_edge_types; i++) {
+    thrust::device_vector<int> dst_ptr(bs);
+    thrust::exclusive_scan(
+        count_data + i * bs, count_data + (i + 1) * bs, dst_ptr.begin());
+
+    GetDstEdgeCUDAKernel<T, BLOCK_WARPS, TILE_SIZE>
+        <<<grid, block, 0, dev_ctx.stream()>>>(
+            bs,
+            thrust::raw_pointer_cast(unique_dst_reindex.data()),
+            count_data + i * bs,
+            thrust::raw_pointer_cast(dst_ptr.data()),
+            reindex_dst_data + begin);
+
+    int count_i =
+        thrust::reduce(thrust::device_pointer_cast(count_data) + i * bs,
+                       thrust::device_pointer_cast(count_data) + (i + 1) * bs);
+    begin += count_i;
+  }
 
   out_nodes->Resize({static_cast<int>(unique_nodes.size())});
   T* out_nodes_data = dev_ctx.template Alloc<T>(out_nodes);
diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
index 174495dad34b2..c1e9184b222e9 100644
--- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
@@ -27,11 +27,10 @@
 #include <curand_kernel.h>
 #endif
 
-#include "paddle/phi/kernels/graph_sample_neighbors_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/graph_sample_neighbors_kernel.h"
 
 namespace phi {
 
@@ -175,21 +174,19 @@ void SampleNeighbors(const Context& dev_ctx,
   constexpr int TILE_SIZE = BLOCK_WARPS * 16;
   const dim3 block(WARP_SIZE, BLOCK_WARPS);
   const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE);
-  SampleKernel<T,
-               WARP_SIZE,
-               BLOCK_WARPS,
-               TILE_SIZE><<<grid, block, 0, dev_ctx.stream()>>>(
-      0,
-      sample_size,
-      bs,
-      thrust::raw_pointer_cast(input),
-      row,
-      col_ptr,
-      eids,
-      thrust::raw_pointer_cast(output),
-      thrust::raw_pointer_cast(output_eids),
-      thrust::raw_pointer_cast(output_ptr.data()),
-      return_eids);
+  SampleKernel<T, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
+      <<<grid, block, 0, dev_ctx.stream()>>>(
+          0,
+          sample_size,
+          bs,
+          thrust::raw_pointer_cast(input),
+          row,
+          col_ptr,
+          eids,
+          thrust::raw_pointer_cast(output),
+          thrust::raw_pointer_cast(output_eids),
+          thrust::raw_pointer_cast(output_ptr.data()),
+          return_eids);
 }
 
 template <typename T, int WARP_SIZE, int BLOCK_WARPS, int TILE_SIZE>
@@ -327,27 +324,27 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx,
   const dim3 block(WARP_SIZE, BLOCK_WARPS);
   const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE);
 
-  FisherYatesSampleKernel<T,
-                          WARP_SIZE,
-                          BLOCK_WARPS,
-                          TILE_SIZE><<<grid, block, 0, dev_ctx.stream()>>>(
-      0, sample_size, bs, thrust::raw_pointer_cast(input), perm_data, col_ptr);
-
-  GatherEdge<T,
-             WARP_SIZE,
-             BLOCK_WARPS,
-             TILE_SIZE><<<grid, block, 0, dev_ctx.stream()>>>(
-      sample_size,
-      bs,
-      thrust::raw_pointer_cast(input),
-      row,
-      col_ptr,
-      eids,
-      thrust::raw_pointer_cast(output),
-      thrust::raw_pointer_cast(output_eids),
-      thrust::raw_pointer_cast(output_ptr.data()),
-      perm_data,
-      return_eids);
+  FisherYatesSampleKernel<T, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
+      <<<grid, block, 0, dev_ctx.stream()>>>(0,
+                                             sample_size,
+                                             bs,
+                                             thrust::raw_pointer_cast(input),
+                                             perm_data,
+                                             col_ptr);
+
+  GatherEdge<T, WARP_SIZE, BLOCK_WARPS, TILE_SIZE>
+      <<<grid, block, 0, dev_ctx.stream()>>>(
+          sample_size,
+          bs,
+          thrust::raw_pointer_cast(input),
+          row,
+          col_ptr,
+          eids,
+          thrust::raw_pointer_cast(output),
+          thrust::raw_pointer_cast(output_eids),
+          thrust::raw_pointer_cast(output_ptr.data()),
+          perm_data,
+          return_eids);
 }
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
index 1eab521170bc5..a93603ae18f1c 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
+++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
@@ -13,16 +13,16 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/phi/kernels/graph_send_recv_kernel.h"
-
 #include <thrust/device_vector.h>
 #include <thrust/fill.h>
+
 #include <algorithm>
 #include <vector>
 
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/graph_send_recv_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
index 8743b4e8a7408..b00d993164674 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
@@ -12,15 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
-#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h"
-
 #include <algorithm>
 #include <vector>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
+#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h"
 
 namespace phi {
 
@@ -75,12 +74,9 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
   int64_t input_size = src_dims[0];
   if (pool_type == "SUM") {
     GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<
-        T,
-        IndexT,
-        GraphSendRecvSumCUDAFunctor<T,
-                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
-        p_src, d_index, s_index, p_output, index_size, slice_size, functor);
+    GraphSendRecvCUDAKernel<T, IndexT, GraphSendRecvSumCUDAFunctor<T, IndexT>>
+        <<<grid, block, 0, ctx.stream()>>>(
+            p_src, d_index, s_index, p_output, index_size, slice_size, functor);
   } else if (pool_type == "MEAN") {
     const int32_t* s_count = dst_count->data<int32_t>();
     ManipulateMeanGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
@@ -88,15 +84,15 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
   } else if (pool_type == "MAX" || pool_type == "MIN") {
     const T* ptr_input = x.data<T>();
     const T* ptr_output = out->data<T>();
-    ManipulateMinMaxGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
-        p_src,
-        d_index,
-        s_index,
-        p_output,
-        index_size,
-        slice_size,
-        ptr_input,
-        ptr_output);
+    ManipulateMinMaxGradCUDAKernel<T, IndexT>
+        <<<grid, block, 0, ctx.stream()>>>(p_src,
+                                           d_index,
+                                           s_index,
+                                           p_output,
+                                           index_size,
+                                           slice_size,
+                                           ptr_input,
+                                           ptr_output);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
index 2826c071d6ec3..446a2361aedd8 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
-#include "paddle/phi/kernels/graph_send_recv_kernel.h"
-
 #include <thrust/device_vector.h>
 #include <thrust/fill.h>
+
 #include <algorithm>
 #include <vector>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
+#include "paddle/phi/kernels/graph_send_recv_kernel.h"
 
 namespace phi {
 
@@ -93,20 +93,14 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
   int64_t input_size = src_dims[0];
   if (pool_type == "SUM") {
     GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<
-        T,
-        IndexT,
-        GraphSendRecvSumCUDAFunctor<T,
-                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
-        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+    GraphSendRecvCUDAKernel<T, IndexT, GraphSendRecvSumCUDAFunctor<T, IndexT>>
+        <<<grid, block, 0, ctx.stream()>>>(
+            p_src, s_index, d_index, p_output, index_size, slice_size, functor);
   } else if (pool_type == "MAX") {
     GraphSendRecvMaxCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<
-        T,
-        IndexT,
-        GraphSendRecvMaxCUDAFunctor<T,
-                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
-        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+    GraphSendRecvCUDAKernel<T, IndexT, GraphSendRecvMaxCUDAFunctor<T, IndexT>>
+        <<<grid, block, 0, ctx.stream()>>>(
+            p_src, s_index, d_index, p_output, index_size, slice_size, functor);
 
     if (out_size > 0) {
       input_size = out_size;
@@ -118,12 +112,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
         p_output, input_size, slice_size);
   } else if (pool_type == "MIN") {
     GraphSendRecvMinCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<
-        T,
-        IndexT,
-        GraphSendRecvMinCUDAFunctor<T,
-                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
-        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+    GraphSendRecvCUDAKernel<T, IndexT, GraphSendRecvMinCUDAFunctor<T, IndexT>>
+        <<<grid, block, 0, ctx.stream()>>>(
+            p_src, s_index, d_index, p_output, index_size, slice_size, functor);
 
     if (out_size > 0) {
       input_size = out_size;
@@ -135,12 +126,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
         p_output, input_size, slice_size);
   } else if (pool_type == "MEAN") {
     GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
-    GraphSendRecvCUDAKernel<
-        T,
-        IndexT,
-        GraphSendRecvSumCUDAFunctor<T,
-                                    IndexT>><<<grid, block, 0, ctx.stream()>>>(
-        p_src, s_index, d_index, p_output, index_size, slice_size, functor);
+    GraphSendRecvCUDAKernel<T, IndexT, GraphSendRecvSumCUDAFunctor<T, IndexT>>
+        <<<grid, block, 0, ctx.stream()>>>(
+            p_src, s_index, d_index, p_output, index_size, slice_size, functor);
 
     ctx.template Alloc<int32_t>(dst_count);
     int32_t* p_dst_count = dst_count->data<int32_t>();
diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu
index 457a348be832b..40633fed348a5 100644
--- a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu
@@ -12,16 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
-
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/gpu/grid_sample_utils.h"
-
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
 
 namespace phi {
 
@@ -295,23 +293,23 @@ void GridSampleGradKernel(const Context& dev_ctx,
   auto cu_stream = dev_ctx.stream();
   backends::gpu::GpuLaunchConfig config =
       backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
-  GridSamplerCudaBackwardKernel<
-      T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
-      count,
-      out_grad.data<T>(),
-      x.data<T>(),
-      grid.data<T>(),
-      n,
-      c,
-      out_h,
-      out_w,
-      in_h,
-      in_w,
-      x_grad->data<T>(),
-      grid_grad_data,
-      enum_mode,
-      enum_padding_mode,
-      align_corners);
+  GridSamplerCudaBackwardKernel<T>
+      <<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
+          count,
+          out_grad.data<T>(),
+          x.data<T>(),
+          grid.data<T>(),
+          n,
+          c,
+          out_h,
+          out_w,
+          in_h,
+          in_w,
+          x_grad->data<T>(),
+          grid_grad_data,
+          enum_mode,
+          enum_padding_mode,
+          align_corners);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/grid_sample_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_kernel.cu
index f611b46911c4f..4a5d567caa1ab 100644
--- a/paddle/phi/kernels/gpu/grid_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/grid_sample_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/grid_sample_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/grid_sample_utils.h"
+#include "paddle/phi/kernels/grid_sample_kernel.h"
 
 namespace phi {
 
@@ -210,21 +209,21 @@ void GridSampleKernel(const Context& dev_ctx,
   auto cu_stream = dev_ctx.stream();
   backends::gpu::GpuLaunchConfig config =
       backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
-  GridSampleCudaKernel<
-      T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
-      count,
-      n,
-      c,
-      out_h,
-      out_w,
-      in_h,
-      in_w,
-      x.data<T>(),
-      grid.data<T>(),
-      output_data,
-      enum_mode,
-      enum_padding_mode,
-      align_corners);
+  GridSampleCudaKernel<T>
+      <<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
+          count,
+          n,
+          c,
+          out_h,
+          out_w,
+          in_h,
+          in_w,
+          x.data<T>(),
+          grid.data<T>(),
+          output_data,
+          enum_mode,
+          enum_padding_mode,
+          align_corners);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu
index a28a7512f4986..71d9859c8a3b3 100644
--- a/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h"
 #include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h"
 
-#include "paddle/phi/core/kernel_registry.h"
-
 PD_REGISTER_KERNEL(gumbel_softmax_grad,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
index c0e557f09bcc9..d68c77de02fa4 100644
--- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
+++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/gumbel_softmax_kernel.h"
-#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
+#include "paddle/phi/kernels/gumbel_softmax_kernel.h"
+#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 #ifdef __NVCC__
@@ -105,14 +104,14 @@ struct OneHotGenerator<GPUContext, T> {
     ctx.template Alloc<T>(&input_tensor);
     paddle::framework::TensorCopy(*out, ctx.GetPlace(), &input_tensor);
     funcs::set_constant(ctx, out, 0.0);
-    OneHotCUDAKernel<T,
-                     thread_size><<<block_size, thread_size, 0, ctx.stream()>>>(
-        height,
-        size_from_axis / size_out_axis,
-        size_out_axis,
-        std::numeric_limits<T>::lowest(),
-        input_tensor.data<T>(),
-        out->data<T>());
+    OneHotCUDAKernel<T, thread_size>
+        <<<block_size, thread_size, 0, ctx.stream()>>>(
+            height,
+            size_from_axis / size_out_axis,
+            size_out_axis,
+            std::numeric_limits<T>::lowest(),
+            input_tensor.data<T>(),
+            out->data<T>());
   }
 };
 
diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu
index c5eb5220537cd..2950aef15ca93 100644
--- a/paddle/phi/kernels/gpu/histogram_kernel.cu
+++ b/paddle/phi/kernels/gpu/histogram_kernel.cu
@@ -12,17 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/histogram_kernel.h"
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/histogram_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/increment_kernel.cu b/paddle/phi/kernels/gpu/increment_kernel.cu
index b3c3127191148..8e7cb6e8b3c05 100644
--- a/paddle/phi/kernels/gpu/increment_kernel.cu
+++ b/paddle/phi/kernels/gpu/increment_kernel.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/increment_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/increment_kernel_impl.h"
+#include "paddle/phi/kernels/increment_kernel.h"
 
 PD_REGISTER_KERNEL(increment,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
index c8c025c7fc18f..b763f05531d00 100644
--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
@@ -12,16 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/index_sample_grad_kernel.h"
-
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/index_sample_grad_kernel.h"
 
 namespace phi {
 
@@ -107,24 +107,24 @@ void IndexSampleGradKernel(const Context& ctx,
 
   if (index_type == DataType::INT64) {
     const int64_t* index_data = index.data<int64_t>();
-    IndexSampleGrad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
-        index_data,
-        input_grad_data,
-        output_grad_data,
-        index_length,
-        input_length,
-        batch_size,
-        same_data_in_index_row);
+    IndexSampleGrad<T, int64_t>
+        <<<grid_dim, block_dim, 0, stream>>>(index_data,
+                                             input_grad_data,
+                                             output_grad_data,
+                                             index_length,
+                                             input_length,
+                                             batch_size,
+                                             same_data_in_index_row);
   } else if (index_type == DataType::INT32) {
     const int* index_data = index.data<int>();
-    IndexSampleGrad<T, int><<<grid_dim, block_dim, 0, stream>>>(
-        index_data,
-        input_grad_data,
-        output_grad_data,
-        index_length,
-        input_length,
-        batch_size,
-        same_data_in_index_row);
+    IndexSampleGrad<T, int>
+        <<<grid_dim, block_dim, 0, stream>>>(index_data,
+                                             input_grad_data,
+                                             output_grad_data,
+                                             index_length,
+                                             input_length,
+                                             batch_size,
+                                             same_data_in_index_row);
   }
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu
index 0eca473a565a8..702c955cd7e01 100644
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/index_sample_kernel.h"
-
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/index_sample_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
index 84094f4c1ee5a..75f74e4afdf1f 100644
--- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/index_select_grad_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/index_select_grad_kernel.h"
 
 DECLARE_bool(cudnn_deterministic);
 
@@ -100,27 +99,26 @@ void IndexSelectGradKernel(const Context& ctx,
 
   if (index_type == phi::DataType::INT64) {
     const int64_t* index_data = index.data<int64_t>();
-    index_select_grad_cuda_kernel<T,
-                                  int64_t><<<grid_dim, block_dim, 0, stream>>>(
-        output_grad_data,
-        in_grad_data,
-        index_data,
-        index_nums,
-        out_nums,
-        stride,
-        size,
-        delta);
+    index_select_grad_cuda_kernel<T, int64_t>
+        <<<grid_dim, block_dim, 0, stream>>>(output_grad_data,
+                                             in_grad_data,
+                                             index_data,
+                                             index_nums,
+                                             out_nums,
+                                             stride,
+                                             size,
+                                             delta);
   } else {
     const int* index_data = index.data<int>();
-    index_select_grad_cuda_kernel<T, int><<<grid_dim, block_dim, 0, stream>>>(
-        output_grad_data,
-        in_grad_data,
-        index_data,
-        index_nums,
-        out_nums,
-        stride,
-        size,
-        delta);
+    index_select_grad_cuda_kernel<T, int>
+        <<<grid_dim, block_dim, 0, stream>>>(output_grad_data,
+                                             in_grad_data,
+                                             index_data,
+                                             index_nums,
+                                             out_nums,
+                                             stride,
+                                             size,
+                                             delta);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu
index 0a6ac69cef098..d2a2bff075b19 100644
--- a/paddle/phi/kernels/gpu/index_select_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_select_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/index_select_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/index_select_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
index b72acc7073383..bdc81b59f1433 100644
--- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -21,6 +19,7 @@
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/norm_utils.h"
 #include "paddle/phi/kernels/gpu/instance_norm_utils.h"
+#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
 
 namespace phi {
 template <typename T, int BlockDim>
@@ -274,10 +273,10 @@ __global__ void DoubleGradComputeDScale(const T *x,
   if (ddx != nullptr) {
     T dscale_tmp = 0;
     for (int i = beg_idx; i < end_idx; i += BlockDim) {
-      dscale_tmp +=
-          ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size -
-                              dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) *
-                                  var_val * var_val / sample_size);
+      dscale_tmp += ddx[i] * var_val *
+                    (dy[i] - dy_sum_val / sample_size -
+                     dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * var_val *
+                         var_val / sample_size);
     }
     dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
     if (threadIdx.x == 0) {
@@ -290,10 +289,10 @@ __global__ void DoubleGradComputeDScale(const T *x,
 template <typename T, typename Context>
 void InstanceNormGradKernel(const Context &dev_ctx,
                             const DenseTensor &x,
-                            const DenseTensor &d_y,
                             const paddle::optional<DenseTensor> &scale,
                             const DenseTensor &saved_mean,
                             const DenseTensor &saved_variance,
+                            const DenseTensor &d_y,
                             float epsilon_f,
                             DenseTensor *d_x,
                             DenseTensor *d_scale,
@@ -563,18 +562,18 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
   if (dx) {
     T *dx_data = dev_ctx.template Alloc<T>(dx);
     set_zero(dev_ctx, dx, static_cast<T>(0));
-    DoubleGradComputeDX<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-        x_data,
-        mean_data,
-        variance_data,
-        ddx_data,
-        dy_data,
-        scale_data,
-        ddscale_data,
-        C,
-        sample_size,
-        epsilon,
-        dx_data);
+    DoubleGradComputeDX<T, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddx_data,
+                                               dy_data,
+                                               scale_data,
+                                               ddscale_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               dx_data);
   }
   if (dscale) {
     DenseTensor dscale_tmp;
@@ -585,34 +584,34 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx,
 
     T *dscale_data = dev_ctx.template Alloc<T>(dscale);
     set_zero(dev_ctx, dscale, static_cast<T>(0));
-    DoubleGradComputeDScale<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-        x_data,
-        mean_data,
-        variance_data,
-        ddx_data,
-        dy_data,
-        C,
-        sample_size,
-        epsilon,
-        dscale_tmp_data);
+    DoubleGradComputeDScale<T, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddx_data,
+                                               dy_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               dscale_tmp_data);
     add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
         dscale_tmp.data<T>(), dscale->data<T>(), N, C);
   }
   if (ddy) {
     T *ddy_data = dev_ctx.template Alloc<T>(ddy);
     set_zero(dev_ctx, ddy, static_cast<T>(0));
-    DoubleGradComputeDDY<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
-        x_data,
-        mean_data,
-        variance_data,
-        ddscale_data,
-        ddbias_data,
-        ddx_data,
-        scale_data,
-        C,
-        sample_size,
-        epsilon,
-        ddy_data);
+    DoubleGradComputeDDY<T, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddscale_data,
+                                               ddbias_data,
+                                               ddx_data,
+                                               scale_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               ddy_data);
   }
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu
index b729223689809..c7696c2dab874 100644
--- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/instance_norm_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/norm_utils.h"
 #include "paddle/phi/kernels/gpu/instance_norm_utils.h"
+#include "paddle/phi/kernels/instance_norm_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
index cd0f4e1493e5c..4b27e5dd35916 100644
--- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/interpolate_grad_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -23,6 +21,7 @@
 #include "paddle/phi/kernels/funcs/interpolate_function.h"
 #include "paddle/phi/kernels/funcs/math_cuda_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/interpolate_grad_kernel.h"
 
 namespace phi {
 
@@ -1059,21 +1058,21 @@ static void Interpolate2DCUDABwd(
     } else if (!optimize_flag & is_nchw) {
       const int num_kernels = n * c * out_h * out_w;
       const int num_threads = std::min(dev_ctx.GetMaxThreadsPerBlock(), 1024);
-      KeBilinearInterpNCHWBw<
-          T><<<backends::gpu::DivUp(num_kernels, num_threads),
-               num_threads,
-               0,
-               dev_ctx.stream()>>>(input_grad_data,
-                                   in_h,
-                                   in_w,
-                                   out_h,
-                                   out_w,
-                                   n,
-                                   c,
-                                   ratio_h,
-                                   ratio_w,
-                                   output_grad_data,
-                                   align_type_value);
+      KeBilinearInterpNCHWBw<T>
+          <<<backends::gpu::DivUp(num_kernels, num_threads),
+             num_threads,
+             0,
+             dev_ctx.stream()>>>(input_grad_data,
+                                 in_h,
+                                 in_w,
+                                 out_h,
+                                 out_w,
+                                 n,
+                                 c,
+                                 ratio_h,
+                                 ratio_w,
+                                 output_grad_data,
+                                 align_type_value);
     } else {
       int64_t cw = c * out_w;
       auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw);
@@ -1100,23 +1099,23 @@ static void Interpolate2DCUDABwd(
 #else
     constexpr int thread_per_block = 512;
 #endif
-    KeBicubicInterpBw<
-        T><<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
-        input_grad_data,
-        in_h,
-        in_w,
-        n,
-        in_chw,
-        output_grad_data,
-        out_h,
-        out_w,
-        n,
-        out_chw,
-        c,
-        ratio_h,
-        ratio_w,
-        align_corners,
-        data_layout);
+    KeBicubicInterpBw<T>
+        <<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
+            input_grad_data,
+            in_h,
+            in_w,
+            n,
+            in_chw,
+            output_grad_data,
+            out_h,
+            out_w,
+            n,
+            out_chw,
+            c,
+            ratio_h,
+            ratio_w,
+            align_corners,
+            data_layout);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu
index 3bd59c807103c..108449c52ade3 100644
--- a/paddle/phi/kernels/gpu/interpolate_kernel.cu
+++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu
@@ -12,17 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/interpolate_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
-#include "paddle/phi/backends/gpu/gpu_launch_config.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/interpolate_function.h"
+#include "paddle/phi/kernels/interpolate_kernel.h"
 
 namespace phi {
 using paddle::platform::FastDivMod;
@@ -949,23 +947,23 @@ static void Interpolate2DCUDAFwd(
     } else {
       int64_t cw = c * out_w;
       auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw);
-      KeBilinearInterpFw<
-          T><<<config.block_per_grid, thread_num, 0, dev_ctx.stream()>>>(
-          input_data,
-          in_h,
-          in_w,
-          n,
-          in_chw,
-          output_data,
-          out_h,
-          out_w,
-          n,
-          out_chw,
-          c,
-          ratio_h,
-          ratio_w,
-          align_type_value,
-          interp_divmods);
+      KeBilinearInterpFw<T>
+          <<<config.block_per_grid, thread_num, 0, dev_ctx.stream()>>>(
+              input_data,
+              in_h,
+              in_w,
+              n,
+              in_chw,
+              output_data,
+              out_h,
+              out_w,
+              n,
+              out_chw,
+              c,
+              ratio_h,
+              ratio_w,
+              align_type_value,
+              interp_divmods);
     }
   } else if ("bicubic" == interp_method) {
 #ifdef __HIPCC__
@@ -973,23 +971,23 @@ static void Interpolate2DCUDAFwd(
 #else
     constexpr int thread_per_block = 512;
 #endif
-    KeBicubicInterpFw<
-        T><<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
-        input_data,
-        in_h,
-        in_w,
-        n,
-        in_chw,
-        output_data,
-        out_h,
-        out_w,
-        n,
-        out_chw,
-        c,
-        ratio_h,
-        ratio_w,
-        align_corners,
-        data_layout);
+    KeBicubicInterpFw<T>
+        <<<config.block_per_grid, thread_per_block, 0, dev_ctx.stream()>>>(
+            input_data,
+            in_h,
+            in_w,
+            n,
+            in_chw,
+            output_data,
+            out_h,
+            out_w,
+            n,
+            out_chw,
+            c,
+            ratio_h,
+            ratio_w,
+            align_corners,
+            data_layout);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/isclose_kernel.cu b/paddle/phi/kernels/gpu/isclose_kernel.cu
index 34774ec715c48..20540521510b5 100644
--- a/paddle/phi/kernels/gpu/isclose_kernel.cu
+++ b/paddle/phi/kernels/gpu/isclose_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/isclose_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/isclose_kernel_impl.h"
+#include "paddle/phi/kernels/isclose_kernel.h"
 
 PD_REGISTER_KERNEL(
     isclose, GPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu
index 8ca53f021f054..de3f6bc3f40c6 100644
--- a/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h"
+#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h"
 PD_REGISTER_KERNEL(
     kldiv_loss_grad, GPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) {
 }
diff --git a/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu
index 9388ac7071c31..adaf6963bb83a 100644
--- a/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu
+++ b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/kldiv_loss_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h"
+#include "paddle/phi/kernels/kldiv_loss_kernel.h"
 PD_REGISTER_KERNEL(
     kldiv_loss, GPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/kron_grad_kernel.cu b/paddle/phi/kernels/gpu/kron_grad_kernel.cu
index 13ef2adaab3f3..4f4e329ed84f3 100644
--- a/paddle/phi/kernels/gpu/kron_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/kron_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/kron_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h"
+#include "paddle/phi/kernels/kron_grad_kernel.h"
 
 PD_REGISTER_KERNEL(kron_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/kron_kernel.cu b/paddle/phi/kernels/gpu/kron_kernel.cu
index a2124fd5af7d7..3d2b1573e890e 100644
--- a/paddle/phi/kernels/gpu/kron_kernel.cu
+++ b/paddle/phi/kernels/gpu/kron_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/kron_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/kron_kernel_impl.h"
+#include "paddle/phi/kernels/kron_kernel.h"
 
 PD_REGISTER_KERNEL(kron,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu
index bcd370a72d91d..4b317e3b9424d 100644
--- a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
-
 #include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
 
 namespace phi {
 static int getBlockSize(int col) {
@@ -53,9 +52,9 @@ void KthvalueGradKernel(const Context& dev_ctx,
   int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
   const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
   int grid_size = std::min(max_blocks, pre);
-  paddle::operators::AssignGradWithAxis<
-      T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
-      out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
+  paddle::operators::AssignGradWithAxis<T>
+      <<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+          out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu
index 4218e153ec29b..bd2b16fb378e2 100644
--- a/paddle/phi/kernels/gpu/kthvalue_kernel.cu
+++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/kthvalue_kernel.h"
-
 #include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/kthvalue_kernel.h"
 
 namespace phi {
 inline int getBlockSize(int col) {
@@ -55,9 +54,9 @@ bool SortKthvalue(const phi::GPUContext& dev_ctx,
   unsigned int grid_size = num_rows < maxGridDimX
                                ? static_cast<unsigned int>(num_rows)
                                : maxGridDimX;
-  paddle::operators::InitIndex<
-      int64_t><<<grid_size, block_size, 0, cu_stream>>>(
-      input_indices.data<int64_t>(), num_rows, num_cols);
+  paddle::operators::InitIndex<int64_t>
+      <<<grid_size, block_size, 0, cu_stream>>>(
+          input_indices.data<int64_t>(), num_rows, num_cols);
   cub::CountingInputIterator<int64_t> counting_iter(0);
   cub::TransformInputIterator<int64_t,
                               paddle::operators::SegmentOffsetIter,
diff --git a/paddle/phi/kernels/gpu/label_smooth_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_kernel.cu
index bf7ac939eb389..2bcb0ce5f3ada 100644
--- a/paddle/phi/kernels/gpu/label_smooth_kernel.cu
+++ b/paddle/phi/kernels/gpu/label_smooth_kernel.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <vector>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
index 961937441e1cf..5a399361aaac9 100644
--- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/layer_norm_grad_kernel.h"
-
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/layer_norm_util.h"
+#include "paddle/phi/kernels/layer_norm_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
index 72127042c16e0..10aeba339cb04 100644
--- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/layer_norm_kernel.h"
-
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/layer_norm_util.h"
+#include "paddle/phi/kernels/layer_norm_kernel.h"
 
 namespace phi {
 
@@ -37,11 +36,10 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
   int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
   int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
   switch (paddle::operators::GetDesiredBlockDim(feature_size)) {
-    FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward<
-                         T,
-                         T,
-                         kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-        input, scale, bias, output, mean, variance, eps, feature_size));
+    FIXED_BLOCK_DIM_CASE(
+        paddle::operators::LayerNormForward<T, T, kBlockDim>
+        <<<batch_size, kBlockDim, 0, stream>>>(
+            input, scale, bias, output, mean, variance, eps, feature_size));
     default:
       PADDLE_THROW(phi::errors::InvalidArgument(
           "Product from begin_norm_axis to end in layer_norm must be larger "
@@ -84,7 +82,7 @@ void LayerNormKernel(const Context &dev_ctx,
       PADDLE_ENFORCE_EQ(
           scale->dtype(),
           bias->dtype(),
-          phi::errors::InvalidArgument("Thie Scale and Bias of layer_norm op "
+          phi::errors::InvalidArgument("The Scale and Bias of layer_norm op "
                                        "should have the same data type."));
     }
   } else {
@@ -108,22 +106,18 @@ void LayerNormKernel(const Context &dev_ctx,
 #define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
   do {                                                                     \
     switch (paddle::operators::GetDesiredBlockDim(feature_size)) {         \
-      FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward<            \
-                           T,                                              \
-                           U,                                              \
-                           kBlockDim,                                      \
-                           IsScaleBiasSameDTypeWithX><<<batch_size,        \
-                                                        kBlockDim,         \
-                                                        0,                 \
-                                                        stream>>>(         \
-          x_data,                                                          \
-          static_cast<const ScaleBiasT *>(void_scale_data),                \
-          static_cast<const ScaleBiasT *>(void_bias_data),                 \
-          y_data,                                                          \
-          mean_data,                                                       \
-          var_data,                                                        \
-          epsilon,                                                         \
-          feature_size));                                                  \
+      FIXED_BLOCK_DIM_CASE(                                                \
+          paddle::operators::                                              \
+              LayerNormForward<T, U, kBlockDim, IsScaleBiasSameDTypeWithX> \
+          <<<batch_size, kBlockDim, 0, stream>>>(                          \
+              x_data,                                                      \
+              static_cast<const ScaleBiasT *>(void_scale_data),            \
+              static_cast<const ScaleBiasT *>(void_bias_data),             \
+              y_data,                                                      \
+              mean_data,                                                   \
+              var_data,                                                    \
+              epsilon,                                                     \
+              feature_size));                                              \
       default:                                                             \
         PADDLE_THROW(phi::errors::InvalidArgument(                         \
             "Product from begin_norm_axis to end must be larger than 1")); \
@@ -131,59 +125,75 @@ void LayerNormKernel(const Context &dev_ctx,
     }                                                                      \
   } while (0)
 
+#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, feature_size)          \
+  case (feature_size): {                                                     \
+    constexpr int WARPS_N = feature_size < 1024 ? 1 : (feature_size / 1024); \
+    constexpr int WARPS_M = 4 / WARPS_N;                                     \
+    const int THREADS_PER_WARP = 32;                                         \
+    const int BYTES_PER_LDG = 16;                                            \
+    const int VecSize = BYTES_PER_LDG / sizeof(T);                           \
+    const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;        \
+    const int ROWS_PER_CTA = WARPS_M;                                        \
+    const int grid =                                                         \
+        static_cast<int>((batch_size + ROWS_PER_CTA - 1) / ROWS_PER_CTA);    \
+    paddle::operators::fast_ln_fwd_kernel<T,                                 \
+                                          U,                                 \
+                                          ScaleT,                            \
+                                          VecSize,                           \
+                                          WARPS_M,                           \
+                                          WARPS_N,                           \
+                                          BYTES_PER_LDG>                     \
+        <<<grid, THREADS_PER_CTA, 0, stream>>>(                              \
+            batch_size,                                                      \
+            feature_size,                                                    \
+            epsilon,                                                         \
+            x_data,                                                          \
+            static_cast<const ScaleT *>(void_scale_data),                    \
+            static_cast<const ScaleT *>(void_bias_data),                     \
+            mean_data,                                                       \
+            var_data,                                                        \
+            y_data);                                                         \
+  } break
+
+#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD(ScaleT)       \
+  PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 768);  \
+  PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1024); \
+  PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1280); \
+  PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1536); \
+  PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1792); \
+  PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 2048); \
+  PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 4096)
+
 #ifdef PADDLE_WITH_CUDA
-  bool can_call_1024_kernel = false;
-  if (feature_size == 1024 && scale != nullptr && bias != nullptr) {
-    can_call_1024_kernel = true;
+  bool can_call_fast_kernel = false;
+  if (((feature_size >= 768 && feature_size <= 2048 &&
+        feature_size % 256 == 0) || feature_size == 4096) &&
+      scale != nullptr && bias != nullptr) {
+    // NOTE(review): fast kernel is force-disabled; flip to true to re-enable.
+    can_call_fast_kernel = false;
+  }
-  if (can_call_1024_kernel) {
-    const int WARPS_M = 4;
-    const int WARPS_N = 1;
-    const int THREADS_PER_WARP = 32;
-    const int BYTES_PER_LDG = 16;
-    const int VecSize = BYTES_PER_LDG / sizeof(T);
-
-    const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;
-    const int ROWS_PER_CTA = WARPS_M;
-
-    const int grid = static_cast<int>(
-        std::ceil(batch_size / static_cast<float>(ROWS_PER_CTA)));
+
+  if (can_call_fast_kernel) {
     if (is_scale_bias_same_dtype_with_x) {
-      paddle::operators::ln_fwd_1024_kernel<
-          T,
-          U,
-          T,
-          VecSize,
-          WARPS_M,
-          WARPS_N,
-          BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
-          batch_size,
-          feature_size,
-          epsilon,
-          x_data,
-          static_cast<const T *>(void_scale_data),
-          static_cast<const T *>(void_bias_data),
-          mean_data,
-          var_data,
-          y_data);
+      switch (feature_size) {
+        PADDLE_LAUNCH_FAST_LAYERNORM_FWD(T);
+        default:
+          PADDLE_THROW(phi::errors::InvalidArgument(
+              "Only when feature_size is from 256 to 4096 and is divisible by "
+              "256 is supported "
+              "now"));
+          break;
+      }
     } else {
-      paddle::operators::ln_fwd_1024_kernel<
-          T,
-          U,
-          U,
-          VecSize,
-          WARPS_M,
-          WARPS_N,
-          BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
-          batch_size,
-          feature_size,
-          epsilon,
-          x_data,
-          static_cast<const U *>(void_scale_data),
-          static_cast<const U *>(void_bias_data),
-          mean_data,
-          var_data,
-          y_data);
+      switch (feature_size) {
+        PADDLE_LAUNCH_FAST_LAYERNORM_FWD(U);
+        default:
+          PADDLE_THROW(phi::errors::InvalidArgument(
+              "Only when feature_size is from 256 to 4096 and is divisible by "
+              "256 is supported "
+              "now"));
+          break;
+      }
     }
   } else {
 #endif
@@ -197,6 +207,8 @@ void LayerNormKernel(const Context &dev_ctx,
 #endif
 
 #undef PADDLE_LAUNCH_LAYERNORM_FWD
+#undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE
+#undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu
index 3e4cd21a658f1..c7e82b8cd7e2d 100644
--- a/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/lgamma_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h"
+#include "paddle/phi/kernels/lgamma_grad_kernel.h"
 PD_REGISTER_KERNEL(
     lgamma_grad, GPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/lgamma_kernel.cu b/paddle/phi/kernels/gpu/lgamma_kernel.cu
index e94d67f4ce324..050002f055573 100644
--- a/paddle/phi/kernels/gpu/lgamma_kernel.cu
+++ b/paddle/phi/kernels/gpu/lgamma_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/lgamma_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/lgamma_kernel.h"
 
 namespace phi {
 template <typename T>
diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu
index 66a3f833d276a..f16347cd3b6bf 100644
--- a/paddle/phi/kernels/gpu/linspace_kernel.cu
+++ b/paddle/phi/kernels/gpu/linspace_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/linspace_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/linspace_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu
index 3bb256ad0326f..bc14bd8f3c7df 100644
--- a/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/log_loss_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h"
+#include "paddle/phi/kernels/log_loss_grad_kernel.h"
 
 PD_REGISTER_KERNEL(
     log_loss_grad, GPU, ALL_LAYOUT, phi::LogLossGradKernel, float) {}
diff --git a/paddle/phi/kernels/gpu/log_loss_kernel.cu b/paddle/phi/kernels/gpu/log_loss_kernel.cu
index 0934520ea4ad1..e7982b0b6fdf1 100644
--- a/paddle/phi/kernels/gpu/log_loss_kernel.cu
+++ b/paddle/phi/kernels/gpu/log_loss_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/log_loss_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/log_loss_kernel_impl.h"
+#include "paddle/phi/kernels/log_loss_kernel.h"
 
 PD_REGISTER_KERNEL(log_loss, GPU, ALL_LAYOUT, phi::LogLossKernel, float) {}
diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
index f7b282536558d..78d1261df6f25 100644
--- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/log_softmax_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
+#include "paddle/phi/kernels/log_softmax_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu
index d7e34c6c14e7a..b73bd6d6a9d9e 100644
--- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu
+++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/log_softmax_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
+#include "paddle/phi/kernels/log_softmax_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/logspace_kernel.cu b/paddle/phi/kernels/gpu/logspace_kernel.cu
index f47b7d35cdcda..673e8f0432015 100644
--- a/paddle/phi/kernels/gpu/logspace_kernel.cu
+++ b/paddle/phi/kernels/gpu/logspace_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/logspace_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/data_type_transform.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/logspace_kernel.h"
 
 namespace phi {
 
@@ -90,8 +89,8 @@ void LogspaceKernel(const Context& ctx,
     LogspaceKernelInner<T><<<grid, block, 0, stream>>>(
         start_data, stop_data, step, base_data, num, out_data);
   } else {
-    LogspaceSpecialKernel<T><<<grid, block, 0, stream>>>(
-        start_data, base_data, out_data);
+    LogspaceSpecialKernel<T>
+        <<<grid, block, 0, stream>>>(start_data, base_data, out_data);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu
index 490b3e9404561..a5555bf7b5901 100644
--- a/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/logsumexp_grad_kernel.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h"
+#include "paddle/phi/kernels/logsumexp_grad_kernel.h"
 
 PD_REGISTER_KERNEL(
     logsumexp_grad, GPU, ALL_LAYOUT, phi::LogsumexpGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu
index 0f07a39ab113a..c7c23fc307fe9 100644
--- a/paddle/phi/kernels/gpu/logsumexp_kernel.cu
+++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu
@@ -12,12 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/logsumexp_kernel.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/impl/logsumexp_kernel_impl.h"
+#include "paddle/phi/kernels/logsumexp_kernel.h"
 
 PD_REGISTER_KERNEL(
     logsumexp, GPU, ALL_LAYOUT, phi::LogsumexpKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu
index 8986c97583e20..b443ae6b8fb5e 100644
--- a/paddle/phi/kernels/gpu/masked_select_kernel.cu
+++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu
@@ -17,11 +17,10 @@
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
 
-#include "paddle/phi/kernels/funcs/select_impl.cu.h"
-#include "paddle/phi/kernels/masked_select_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/select_impl.cu.h"
+#include "paddle/phi/kernels/masked_select_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
index 9c80d5e151c1c..b6c13360cd404 100644
--- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/matmul_grad_kernel.h"
-
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h"
+#include "paddle/phi/kernels/matmul_grad_kernel.h"
 
 PD_REGISTER_KERNEL(matmul_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu
index 20c9a5229aaa6..32d70ae0763f0 100644
--- a/paddle/phi/kernels/gpu/matmul_kernel.cu
+++ b/paddle/phi/kernels/gpu/matmul_kernel.cu
@@ -12,13 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/matmul_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/matmul_kernel_impl.h"
+#include "paddle/phi/kernels/matmul_kernel.h"
 
 PD_REGISTER_KERNEL(matmul,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu b/paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu
index 25a9de8f8bed4..3739d7f2eeddd 100644
--- a/paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/matrix_power_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h"
+#include "paddle/phi/kernels/matrix_power_grad_kernel.h"
 
 PD_REGISTER_KERNEL(matrix_power_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/matrix_power_kernel.cu b/paddle/phi/kernels/gpu/matrix_power_kernel.cu
index d7ae7d8a3f745..f474090f9db09 100644
--- a/paddle/phi/kernels/gpu/matrix_power_kernel.cu
+++ b/paddle/phi/kernels/gpu/matrix_power_kernel.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/matrix_power_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h"
+#include "paddle/phi/kernels/matrix_power_kernel.h"
 
 PD_REGISTER_KERNEL(
     matrix_power, GPU, ALL_LAYOUT, phi::MatrixPowerKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu
index 9b889a9b4c006..8d69e6d896a52 100644
--- a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu
+++ b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu
@@ -15,11 +15,10 @@
 #ifndef PADDLE_WITH_HIP
 // HIP not support cusolver
 
-#include "paddle/phi/kernels/matrix_rank_kernel.h"
-#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/matrix_rank_kernel.h"
+#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
index 84768866cc9e7..f3030d7f6cd76 100644
--- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
+++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
@@ -15,10 +15,9 @@
 #ifndef PADDLE_WITH_HIP
 // HIP not support cusolver
 
-#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
-
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/backends/dynload/cusolver.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -28,6 +27,7 @@
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/compare_functors.h"
 #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
+#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"
 #include "paddle/phi/kernels/reduce_max_kernel.h"
 #include "paddle/phi/kernels/reduce_sum_kernel.h"
 
diff --git a/paddle/phi/kernels/gpu/maxout_grad_kernel.cu b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu
index 86ff09fd74b06..a405f38523a75 100644
--- a/paddle/phi/kernels/gpu/maxout_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(
     maxout_grad, GPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/maxout_kernel.cu b/paddle/phi/kernels/gpu/maxout_kernel.cu
index 88776a49f19b2..e5407a4925c84 100644
--- a/paddle/phi/kernels/gpu/maxout_kernel.cu
+++ b/paddle/phi/kernels/gpu/maxout_kernel.cu
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/maxout_kernel_impl.h"
-
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/maxout_kernel_impl.h"
 
 PD_REGISTER_KERNEL(maxout, GPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/mean_all_grad_kernel.cu b/paddle/phi/kernels/gpu/mean_all_grad_kernel.cu
index 83d4e3a57735f..b1a12b436e265 100644
--- a/paddle/phi/kernels/gpu/mean_all_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/mean_all_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/mean_all_kernel.h"
-
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/mean_all_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/mean_all_kernel.cu b/paddle/phi/kernels/gpu/mean_all_kernel.cu
index 799865be26e24..d87b738f4e717 100644
--- a/paddle/phi/kernels/gpu/mean_all_kernel.cu
+++ b/paddle/phi/kernels/gpu/mean_all_kernel.cu
@@ -12,14 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/mean_all_kernel.h"
-
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "paddle/phi/kernels/mean_all_kernel.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 
-#include "paddle/fluid/memory/memcpy.h"
-
 namespace phi {
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc
index 37f2c40143b65..80cf88b3ceb7f 100644
--- a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/meshgrid_grad_kernel.h"
-#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(meshgrid_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc
index 9d52d1e115de9..c863550979444 100644
--- a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/meshgrid_kernel.h"
-#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h"
 
 PD_REGISTER_KERNEL(meshgrid,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu
index 43502621c2d3a..77235c1da39cb 100644
--- a/paddle/phi/kernels/gpu/mode_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/mode_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/mode.h"
+#include "paddle/phi/kernels/mode_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu
index 629b9722cd6bc..ee255f10ebc5c 100644
--- a/paddle/phi/kernels/gpu/mode_kernel.cu
+++ b/paddle/phi/kernels/gpu/mode_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/mode_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/mode.h"
+#include "paddle/phi/kernels/mode_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/momentum_kernel.cu b/paddle/phi/kernels/gpu/momentum_kernel.cu
index 5a4f5d33e6165..5e00e074fe8f5 100644
--- a/paddle/phi/kernels/gpu/momentum_kernel.cu
+++ b/paddle/phi/kernels/gpu/momentum_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/momentum_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
+#include "paddle/phi/kernels/momentum_kernel.h"
 
 PD_REGISTER_KERNEL(momentum,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu
index 6761d945e952e..61aeff9f3c708 100644
--- a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu
@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
-#include "paddle/phi/kernels/multi_dot_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
+#include "paddle/phi/kernels/multi_dot_grad_kernel.h"
 
 using float16 = phi::dtype::float16;
 
diff --git a/paddle/phi/kernels/gpu/multi_dot_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_kernel.cu
index 60b1fce5ddd89..e890c03c34577 100644
--- a/paddle/phi/kernels/gpu/multi_dot_kernel.cu
+++ b/paddle/phi/kernels/gpu/multi_dot_kernel.cu
@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
-#include "paddle/phi/kernels/multi_dot_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
+#include "paddle/phi/kernels/multi_dot_kernel.h"
 
 using float16 = phi::dtype::float16;
 
diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu
index 21a506a840cc7..a4fba88d2e037 100644
--- a/paddle/phi/kernels/gpu/multinomial_kernel.cu
+++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu
@@ -236,12 +236,12 @@ void MultinomialKernel(const Context& dev_ctx,
   int block_size = num_categories < 512 ? num_categories : 512;
   dim3 block_norm(block_size);
   dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1);
-  NormalizeProbability<T><<<grid_norm, block_norm, 0, dev_ctx.stream()>>>(
-      norm_probs_data,
-      in_data,
-      sum_rows_data,
-      num_distributions,
-      num_categories);
+  NormalizeProbability<T>
+      <<<grid_norm, block_norm, 0, dev_ctx.stream()>>>(norm_probs_data,
+                                                       in_data,
+                                                       sum_rows_data,
+                                                       num_distributions,
+                                                       num_categories);
 
   // Get cumulative probability of each distribution. It's the same function
   // of ``cumsum`` op.
@@ -277,15 +277,15 @@ void MultinomialKernel(const Context& dev_ctx,
   uint64_t increment = curand4_loop_times * 4;
   auto seed_offset = gen_cuda->IncrementOffset(increment);
 
-  sampleMultinomialWithReplacement<T><<<grid, block, 0, dev_ctx.stream()>>>(
-      num_samples,
-      out_data,
-      num_distributions,
-      num_categories,
-      cumulative_probs_data,
-      norm_probs_data,
-      seed_offset.first,
-      seed_offset.second);
+  sampleMultinomialWithReplacement<T>
+      <<<grid, block, 0, dev_ctx.stream()>>>(num_samples,
+                                             out_data,
+                                             num_distributions,
+                                             num_categories,
+                                             cumulative_probs_data,
+                                             norm_probs_data,
+                                             seed_offset.first,
+                                             seed_offset.second);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu
index 21576ab608d26..35258280f04da 100644
--- a/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/multiplex_grad_kernel.h"
-
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/multiplex_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/multiplex_kernel.cu b/paddle/phi/kernels/gpu/multiplex_kernel.cu
index 743448a468666..e1fbd7abdc4af 100644
--- a/paddle/phi/kernels/gpu/multiplex_kernel.cu
+++ b/paddle/phi/kernels/gpu/multiplex_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/multiplex_kernel.h"
-
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/multiplex_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/mv_grad_kernel.cu b/paddle/phi/kernels/gpu/mv_grad_kernel.cu
index 9eb8cd375ebd6..58788492a741c 100644
--- a/paddle/phi/kernels/gpu/mv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/mv_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/mv_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/mv_grad_kernel.h"
 
 namespace phi {
 
@@ -58,9 +57,9 @@ void MvGradKernel(const Context &dev_ctx,
   if (dx) {
     T *dx_data = dev_ctx.template Alloc<T>(dx);
 
-    MVGradDxCUDAKernel<
-        T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
-        m, n, dout_data, vec_data, dx_data);
+    MVGradDxCUDAKernel<T>
+        <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
+            m, n, dout_data, vec_data, dx_data);
   }
 
   if (dvec) {
diff --git a/paddle/phi/kernels/gpu/mv_kernel.cu b/paddle/phi/kernels/gpu/mv_kernel.cu
index 1faba5a62d2cd..82122723258de 100644
--- a/paddle/phi/kernels/gpu/mv_kernel.cu
+++ b/paddle/phi/kernels/gpu/mv_kernel.cu
@@ -12,11 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/mv_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/impl/mv_kernel_impl.h"
+#include "paddle/phi/kernels/mv_kernel.h"
 
 PD_REGISTER_KERNEL(mv, GPU, ALL_LAYOUT, phi::MvKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
index a7cd49c0e53f3..d373e3bd9f3ee 100644
--- a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu
@@ -18,6 +18,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h"
 #include "paddle/phi/kernels/nanmedian_grad_kernel.h"
 
 namespace phi {
@@ -72,9 +73,9 @@ void CalcMedianGradKernel(const Context& dev_ctx,
   int64_t pre_dim = numel / stride;
 
   T div_factor = static_cast<T>(2.0);
-  KernelNanmedianGrad<
-      T><<<GET_BLOCKS(pre_dim), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-      x_ptr, m_ptr, out_grad_ptr, x_grad_ptr, stride, pre_dim, div_factor);
+  KernelNanmedianGrad<T>
+      <<<GET_BLOCKS(pre_dim), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+          x_ptr, m_ptr, out_grad_ptr, x_grad_ptr, stride, pre_dim, div_factor);
 }
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu
index 5975e2748997e..5ebf8637bfe22 100644
--- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu
+++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu
@@ -18,6 +18,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"
 #include "paddle/phi/kernels/nanmedian_kernel.h"
 #include "paddle/phi/kernels/top_k_kernel.h"
 
@@ -216,30 +217,30 @@ void ProcessMedianKernel(const Context& dev_ctx,
   T div_factor = static_cast<T>(2.0);
   T nan_val = std::numeric_limits<T>::quiet_NaN();
   if (should_ignore_nan) {
-    CalcNanmedianKernel<
-        T><<<GET_BLOCKS(pre_dim), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        sort_out_ptr,
-        sort_indices_ptr,
-        nan_counts_ptr,
-        m_ptr,
-        o_ptr,
-        is_ori_odd,
-        pre_dim,
-        max_valid_num,
-        stride,
-        div_factor,
-        nan_val);
+    CalcNanmedianKernel<T>
+        <<<GET_BLOCKS(pre_dim), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+            sort_out_ptr,
+            sort_indices_ptr,
+            nan_counts_ptr,
+            m_ptr,
+            o_ptr,
+            is_ori_odd,
+            pre_dim,
+            max_valid_num,
+            stride,
+            div_factor,
+            nan_val);
   } else {
-    CalcMedianKernel<
-        T><<<GET_BLOCKS(pre_dim), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        sort_out_ptr,
-        sort_indices_ptr,
-        m_ptr,
-        o_ptr,
-        div_factor,
-        is_ori_odd,
-        pre_dim,
-        sort_k);
+    CalcMedianKernel<T>
+        <<<GET_BLOCKS(pre_dim), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+            sort_out_ptr,
+            sort_indices_ptr,
+            m_ptr,
+            o_ptr,
+            div_factor,
+            is_ori_odd,
+            pre_dim,
+            sort_k);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/nll_loss.h b/paddle/phi/kernels/gpu/nll_loss.h
index a457264498feb..bb47a2f06f4c3 100644
--- a/paddle/phi/kernels/gpu/nll_loss.h
+++ b/paddle/phi/kernels/gpu/nll_loss.h
@@ -14,9 +14,11 @@
 
 #pragma once
 #include <thrust/functional.h>
+
 #include <algorithm>
 #include <functional>
 #include <string>
+
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu
index 407f33c40089c..7b356826f5d8c 100644
--- a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/nll_loss_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/nll_loss.h"
+#include "paddle/phi/kernels/nll_loss_grad_kernel.h"
 
 namespace phi {
 template <typename T, typename Context>
@@ -49,25 +48,25 @@ void NllLossGradKernel(const Context& dev_ctx,
     int blocks = NumBlocks(batch_size);
     int threads = kNumCUDAThreads;
     if (reduction == "none") {
-      GPUNLLLossBackward1D_no_reduce<
-          T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
-                                                       label_data,
-                                                       weight_data,
-                                                       dout_data,
-                                                       batch_size,
-                                                       n_classes,
-                                                       ignore_index);
+      GPUNLLLossBackward1D_no_reduce<T>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
+                                                     label_data,
+                                                     weight_data,
+                                                     dout_data,
+                                                     batch_size,
+                                                     n_classes,
+                                                     ignore_index);
     } else {
-      GPUNLLLossBackward1D_with_reduce<T><<<1, NTHREADS, 0, dev_ctx.stream()>>>(
-          dx_data,
-          total_weight_data,
-          label_data,
-          weight_data,
-          dout_data,
-          batch_size,
-          n_classes,
-          size_average,
-          ignore_index);
+      GPUNLLLossBackward1D_with_reduce<T>
+          <<<1, NTHREADS, 0, dev_ctx.stream()>>>(dx_data,
+                                                 total_weight_data,
+                                                 label_data,
+                                                 weight_data,
+                                                 dout_data,
+                                                 batch_size,
+                                                 n_classes,
+                                                 size_average,
+                                                 ignore_index);
     }
   } else if (x_dims.size() == 4) {
     const auto in_dim2 = x_dims[2];
@@ -78,32 +77,32 @@ void NllLossGradKernel(const Context& dev_ctx,
     int blocks = NumBlocks(out_numel);
     int threads = kNumCUDAThreads;
     if (reduction == "none") {
-      GPUNLLLossBackward2D_no_reduce<
-          T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
-                                                       label_data,
-                                                       weight_data,
-                                                       dout_data,
-                                                       batch_size,
-                                                       n_classes,
-                                                       in_dim2,
-                                                       in_dim3,
-                                                       ignore_index);
+      GPUNLLLossBackward2D_no_reduce<T>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
+                                                     label_data,
+                                                     weight_data,
+                                                     dout_data,
+                                                     batch_size,
+                                                     n_classes,
+                                                     in_dim2,
+                                                     in_dim3,
+                                                     ignore_index);
     } else {
       int blocks_per_sample = NumBlocks(map_size) / 128;
       blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
       int total_blocks = blocks_per_sample * batch_size;
-      GPUNLLLossBackward2D_with_reduce<
-          T><<<total_blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
-                                                             total_weight_data,
-                                                             label_data,
-                                                             weight_data,
-                                                             dout_data,
-                                                             batch_size,
-                                                             n_classes,
-                                                             map_size,
-                                                             blocks_per_sample,
-                                                             size_average,
-                                                             ignore_index);
+      GPUNLLLossBackward2D_with_reduce<T>
+          <<<total_blocks, threads, 0, dev_ctx.stream()>>>(dx_data,
+                                                           total_weight_data,
+                                                           label_data,
+                                                           weight_data,
+                                                           dout_data,
+                                                           batch_size,
+                                                           n_classes,
+                                                           map_size,
+                                                           blocks_per_sample,
+                                                           size_average,
+                                                           ignore_index);
     }
   }
 }
diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu
index 99a8b10b11b5c..bdb110fa92948 100644
--- a/paddle/phi/kernels/gpu/nll_loss_kernel.cu
+++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/nll_loss_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/nll_loss.h"
+#include "paddle/phi/kernels/nll_loss_kernel.h"
 
 namespace phi {
 
@@ -49,25 +48,25 @@ void NllLossRawKernel(const Context& dev_ctx,
     int blocks = NumBlocks(batch_size);
     int threads = kNumCUDAThreads;
     if (reduction == "none") {
-      GPUNLLLossForward1D_no_reduce<
-          T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data,
-                                                       x_data,
-                                                       label_data,
-                                                       weight_data,
-                                                       batch_size,
-                                                       n_classes,
-                                                       ignore_index);
+      GPUNLLLossForward1D_no_reduce<T>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(out_data,
+                                                     x_data,
+                                                     label_data,
+                                                     weight_data,
+                                                     batch_size,
+                                                     n_classes,
+                                                     ignore_index);
     } else {
-      GPUNLLLossForward1D_with_reduce<T><<<1, NTHREADS, 0, dev_ctx.stream()>>>(
-          out_data,
-          total_weight_data,
-          x_data,
-          label_data,
-          weight_data,
-          batch_size,
-          n_classes,
-          size_average,
-          ignore_index);
+      GPUNLLLossForward1D_with_reduce<T>
+          <<<1, NTHREADS, 0, dev_ctx.stream()>>>(out_data,
+                                                 total_weight_data,
+                                                 x_data,
+                                                 label_data,
+                                                 weight_data,
+                                                 batch_size,
+                                                 n_classes,
+                                                 size_average,
+                                                 ignore_index);
     }
   } else if (x_dims.size() == 4) {
     const auto in_dim2 = x_dims[2];
@@ -77,34 +76,34 @@ void NllLossRawKernel(const Context& dev_ctx,
     int blocks = NumBlocks(out_numel);
     int threads = kNumCUDAThreads;
     if (reduction == "none") {
-      GPUNLLLossForward2D_no_reduce<
-          T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data,
-                                                       x_data,
-                                                       label_data,
-                                                       weight_data,
-                                                       batch_size,
-                                                       n_classes,
-                                                       in_dim2,
-                                                       in_dim3,
-                                                       ignore_index);
+      GPUNLLLossForward2D_no_reduce<T>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(out_data,
+                                                     x_data,
+                                                     label_data,
+                                                     weight_data,
+                                                     batch_size,
+                                                     n_classes,
+                                                     in_dim2,
+                                                     in_dim3,
+                                                     ignore_index);
     } else {
       int blocks_per_sample = NumBlocks(map_size) / 128;
       blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
       int total_blocks = blocks_per_sample * batch_size;
-      GPUNLLLossForward2D_with_reduce<
-          T><<<total_blocks, threads, 0, dev_ctx.stream()>>>(out_data,
-                                                             total_weight_data,
-                                                             x_data,
-                                                             label_data,
-                                                             weight_data,
-                                                             batch_size,
-                                                             n_classes,
-                                                             map_size,
-                                                             blocks_per_sample,
-                                                             ignore_index);
+      GPUNLLLossForward2D_with_reduce<T>
+          <<<total_blocks, threads, 0, dev_ctx.stream()>>>(out_data,
+                                                           total_weight_data,
+                                                           x_data,
+                                                           label_data,
+                                                           weight_data,
+                                                           batch_size,
+                                                           n_classes,
+                                                           map_size,
+                                                           blocks_per_sample,
+                                                           ignore_index);
       if (size_average) {
-        GPUNLLLossForward2D_size_average<T><<<1, 1, 0, dev_ctx.stream()>>>(
-            out_data, total_weight_data);
+        GPUNLLLossForward2D_size_average<T>
+            <<<1, 1, 0, dev_ctx.stream()>>>(out_data, total_weight_data);
       }
     }
   }
diff --git a/paddle/phi/kernels/gpu/norm_grad_kernel.cu b/paddle/phi/kernels/gpu/norm_grad_kernel.cu
index 43a08b0603e65..388e7b889a193 100644
--- a/paddle/phi/kernels/gpu/norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/norm_grad_kernel.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+
 #include "paddle/phi/kernels/norm_grad_kernel.h"
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -22,11 +23,9 @@
 namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/phi/common/bfloat16.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/funcs/common_shape.h"
 
 namespace phi {
@@ -105,8 +104,8 @@ void NormGradKernel(const Context& ctx,
   int max_threads = ctx.GetMaxPhysicalThreadCount();
   const int max_blocks = std::max(max_threads / block, 1);
   int grid = std::min(max_blocks, pre * post);
-  NormalizeGradient<T, block><<<grid, block, 0, ctx.stream()>>>(
-      x_data, x_norm, dy, pre, n, post, dx);
+  NormalizeGradient<T, block>
+      <<<grid, block, 0, ctx.stream()>>>(x_data, x_norm, dy, pre, n, post, dx);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/norm_kernel.cu b/paddle/phi/kernels/gpu/norm_kernel.cu
index 274f91b8dd661..2877069a22679 100644
--- a/paddle/phi/kernels/gpu/norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/norm_kernel.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+
 #include "paddle/phi/kernels/norm_kernel.h"
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -22,11 +23,9 @@
 namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/phi/common/float16.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/funcs/common_shape.h"
 
 namespace phi {
@@ -118,8 +117,8 @@ void NormKernel(const Context& ctx,
   int max_threads = ctx.GetMaxPhysicalThreadCount();
   const int max_blocks = std::max(max_threads / block, 1);
   int grid = std::min(max_blocks, pre * post);
-  Normalize<T, block><<<grid, block, 0, ctx.stream()>>>(
-      x_ptr, pre, n, post, eps, y, norm_ptr);
+  Normalize<T, block>
+      <<<grid, block, 0, ctx.stream()>>>(x_ptr, pre, n, post, eps, y, norm_ptr);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu
index 2ae9e9333ecb5..adc87b049eefe 100644
--- a/paddle/phi/kernels/gpu/one_hot_kernel.cu
+++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/one_hot_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/one_hot_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
index fdfed25b3dda8..9305f30939fad 100644
--- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/p_norm_grad_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/reduce_grad_functions.h"
+#include "paddle/phi/kernels/p_norm_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu
index 80ef97d9cf88c..12038fa22437b 100644
--- a/paddle/phi/kernels/gpu/p_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/p_norm_kernel.h"
-
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/p_norm_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
index 8f4af0a450890..8832bf6a3a4cb 100644
--- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/pad3d_grad_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/pad3d_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu
index d1b1d70667673..eb8dfa5276708 100644
--- a/paddle/phi/kernels/gpu/pad3d_kernel.cu
+++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/pad3d_kernel.h"
-
 #include <algorithm>
 
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
@@ -21,6 +19,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/pad3d_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/pad_grad_kernel.cu b/paddle/phi/kernels/gpu/pad_grad_kernel.cu
index a25472d122b83..c5e2e077e4158 100644
--- a/paddle/phi/kernels/gpu/pad_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pad_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/pad_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
+#include "paddle/phi/kernels/pad_grad_kernel.h"
 
 PD_REGISTER_KERNEL(pad_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu
index 6b82cbc67485b..6634d863fc14e 100644
--- a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h"
-#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h"
+#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h"
 
 PD_REGISTER_KERNEL(pixel_shuffle_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu
index 25b240c6c1a3b..8ceb1b7001161 100644
--- a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu
+++ b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h"
-#include "paddle/phi/kernels/pixel_shuffle_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h"
+#include "paddle/phi/kernels/pixel_shuffle_kernel.h"
 
 PD_REGISTER_KERNEL(
     pixel_shuffle, GPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu
index 9cbbc5072aa25..f36c0e4517443 100644
--- a/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h"
-#include "paddle/phi/kernels/pixel_unshuffle_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h"
+#include "paddle/phi/kernels/pixel_unshuffle_grad_kernel.h"
 
 PD_REGISTER_KERNEL(pixel_unshuffle_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu
index ca2e520ffde10..54d29ab7b13ec 100644
--- a/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu
+++ b/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h"
-#include "paddle/phi/kernels/pixel_unshuffle_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h"
+#include "paddle/phi/kernels/pixel_unshuffle_kernel.h"
 
 PD_REGISTER_KERNEL(pixel_unshuffle,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu
index a5ab6a1ccd49f..832f9ad31188f 100644
--- a/paddle/phi/kernels/gpu/pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu
@@ -12,12 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/pool_grad_kernel.h"
-
-#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h"
-
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h"
+#include "paddle/phi/kernels/pool_grad_kernel.h"
 
 PD_REGISTER_KERNEL(pool2d_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu
index e8641395bef92..ac3718cfb80a2 100644
--- a/paddle/phi/kernels/gpu/pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/pool_kernel.cu
@@ -12,12 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/pool_kernel.h"
-
-#include "paddle/phi/kernels/impl/pool_kernel_impl.h"
-
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/pool_kernel_impl.h"
+#include "paddle/phi/kernels/pool_kernel.h"
 
 PD_REGISTER_KERNEL(pool2d,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h
index 76ee9439a2050..efb22bfadfc92 100644
--- a/paddle/phi/kernels/gpu/prelu_funcs.h
+++ b/paddle/phi/kernels/gpu/prelu_funcs.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu
index 013ad1974a8fb..57d1838e90475 100644
--- a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/prelu_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/prelu_funcs.h"
+#include "paddle/phi/kernels/prelu_grad_kernel.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 
 namespace phi {
@@ -82,18 +81,18 @@ class PreluOpGradFunctor {
     size_t channel =
         mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1];
 
-    PReluOpGradKernel<
-        T><<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
-        x,
-        alpha,
-        out_grad,
-        x_grad,
-        alpha_grad,
-        channel,
-        plane_size,
-        spatial_size,
-        numel,
-        mode);
+    PReluOpGradKernel<T>
+        <<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
+            x,
+            alpha,
+            out_grad,
+            x_grad,
+            alpha_grad,
+            channel,
+            plane_size,
+            spatial_size,
+            numel,
+            mode);
   }
 };
 
diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu
index c4730768982bb..ad87012485e76 100644
--- a/paddle/phi/kernels/gpu/prelu_kernel.cu
+++ b/paddle/phi/kernels/gpu/prelu_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/prelu_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/prelu_funcs.h"
+#include "paddle/phi/kernels/prelu_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu
index 45e4730e173fe..8b58340efd5ed 100644
--- a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu
@@ -12,16 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/psroi_pool_kernel.h"
-
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/psroi_pool_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu
index f296d0d20743e..d392ae7432fd6 100644
--- a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/psroi_pool_kernel.h"
-
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/psroi_pool_kernel.h"
 
 namespace phi {
 
@@ -207,19 +207,19 @@ void PsroiPoolKernel(const Context& ctx,
   int threads = kNumCUDAThreads;
 
   // call cuda kernel function
-  GPUPSROIPoolForward<T><<<blocks, threads, 0, ctx.stream()>>>(
-      output_size,
-      x.data<T>(),
-      rois.data<T>(),
-      spatial_scale,
-      input_channels,
-      height,
-      width,
-      output_channels,
-      pooled_height,
-      pooled_width,
-      rois_batch_id_list_gpu.data<int>(),
-      ctx.template Alloc<T>(out));
+  GPUPSROIPoolForward<T>
+      <<<blocks, threads, 0, ctx.stream()>>>(output_size,
+                                             x.data<T>(),
+                                             rois.data<T>(),
+                                             spatial_scale,
+                                             input_channels,
+                                             height,
+                                             width,
+                                             output_channels,
+                                             pooled_height,
+                                             pooled_width,
+                                             rois_batch_id_list_gpu.data<int>(),
+                                             ctx.template Alloc<T>(out));
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
index f553da361f1fe..209dd07d950c6 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/put_along_axis_grad_kernel.h"
-
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/put_along_axis_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
index d363c0c28364c..b52f2ae0ce199 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/put_along_axis_kernel.h"
-
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/put_along_axis_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu
index 90eaea6a0868c..0882114e84672 100644
--- a/paddle/phi/kernels/gpu/randint_kernel.cu
+++ b/paddle/phi/kernels/gpu/randint_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/randint_kernel.h"
-
 #include <random>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
+#include "paddle/phi/kernels/randint_kernel.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/memcpy.h"
diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu
index 94f063512c06f..d1c8265f2faf0 100644
--- a/paddle/phi/kernels/gpu/randperm_kernel.cu
+++ b/paddle/phi/kernels/gpu/randperm_kernel.cu
@@ -16,10 +16,12 @@
 
 #ifdef __NVCC__
 #include <curand_kernel.h>
+
 #include "cub/cub.cuh"
 #endif
 #ifdef __HIPCC__
 #include <hiprand_kernel.h>
+
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h
index 7f6ecef80879f..bb914defbe892 100644
--- a/paddle/phi/kernels/gpu/reduce.h
+++ b/paddle/phi/kernels/gpu/reduce.h
@@ -24,8 +24,10 @@
 namespace phi {
 
 template <typename T,
-          template <typename> class ReduceOp,
-          template <typename, typename> class TransformOp>
+          template <typename>
+          class ReduceOp,
+          template <typename, typename>
+          class TransformOp>
 void Reduce(const KPDevice& dev_ctx,
             const DenseTensor& x,
             bool reduce_all,
diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/gpu/reduce_any_kernel.cu
index 39c8cbe442cbd..25f73c64a5417 100644
--- a/paddle/phi/kernels/gpu/reduce_any_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_any_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_any_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_any_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu
index b4ff277b5026c..7ce58bf8b2b5f 100644
--- a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_max_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h"
+#include "paddle/phi/kernels/reduce_max_grad_kernel.h"
 
 PD_REGISTER_KERNEL(max_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu
index 50564a339ddc0..57a86c63bfc7f 100644
--- a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_mean_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/reduce_grad.h"
+#include "paddle/phi/kernels/reduce_mean_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu
index ea1d377c45976..16914860491a3 100644
--- a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_min_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h"
+#include "paddle/phi/kernels/reduce_min_grad_kernel.h"
 
 PD_REGISTER_KERNEL(min_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu
index 08444cf95d6c6..25f5ea33fbf7f 100644
--- a/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_prod_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h"
+#include "paddle/phi/kernels/reduce_prod_grad_kernel.h"
 
 PD_REGISTER_KERNEL(prod_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu
index db4ace1a02271..4ae1dcfeba0a1 100644
--- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_prod_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_prod_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
index 8b111641cfa40..f5d75b621c0be 100644
--- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/reduce_grad.h"
+#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/rmsprop_kernel.cu b/paddle/phi/kernels/gpu/rmsprop_kernel.cu
index 071c09ea67578..c49910e88b51a 100644
--- a/paddle/phi/kernels/gpu/rmsprop_kernel.cu
+++ b/paddle/phi/kernels/gpu/rmsprop_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/rmsprop_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h"
+#include "paddle/phi/kernels/rmsprop_kernel.h"
 
 PD_REGISTER_KERNEL(
     rmsprop, GPU, ALL_LAYOUT, phi::RmspropDenseKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h
index 68d8b2e5eef0e..fb8e07b8f14b1 100644
--- a/paddle/phi/kernels/gpu/rnn_functor.h
+++ b/paddle/phi/kernels/gpu/rnn_functor.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/core/dense_tensor.h"
-
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc
index 98c2f618e7868..fe0446323739f 100644
--- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc
@@ -14,15 +14,13 @@
 
 #include "paddle/phi/kernels/rnn_grad_kernel.h"
 
+#include "paddle/fluid/operators/utils.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/gpu/rnn_functor.h"
 
-#include "paddle/fluid/operators/utils.h"
-
 namespace phi {
 
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc
index 5a19d5b89f0e3..0eb74303f41b4 100644
--- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc
@@ -14,15 +14,13 @@
 
 #include "paddle/phi/kernels/rnn_kernel.h"
 
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/operators/utils.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/gpu/rnn_functor.h"
 
-#include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/operators/utils.h"
-
 namespace phi {
 
 template <typename T>
diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
index 9f9ea6753402b..cfb9033bd9caa 100644
--- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
@@ -12,17 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/roi_align_grad_kernel.h"
-
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/roi_align_grad_kernel.h"
 
 namespace phi {
 
@@ -236,21 +234,21 @@ void RoiAlignGradKernel(const Context& dev_ctx,
   int threads = kNumCUDAThreads;
 
   if (output_grad_size > 0) {
-    GPURoiAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        output_grad_size,
-        boxes.data<T>(),
-        out_grad.data<T>(),
-        rois_num,
-        spatial_scale,
-        channels,
-        height,
-        width,
-        pooled_height,
-        pooled_width,
-        sampling_ratio,
-        roi_id_data,
-        dx->data<T>(),
-        aligned);
+    GPURoiAlignBackward<T>
+        <<<blocks, threads, 0, dev_ctx.stream()>>>(output_grad_size,
+                                                   boxes.data<T>(),
+                                                   out_grad.data<T>(),
+                                                   rois_num,
+                                                   spatial_scale,
+                                                   channels,
+                                                   height,
+                                                   width,
+                                                   pooled_height,
+                                                   pooled_width,
+                                                   sampling_ratio,
+                                                   roi_id_data,
+                                                   dx->data<T>(),
+                                                   aligned);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu
index fc24179ed3d26..0bf96a729fca0 100644
--- a/paddle/phi/kernels/gpu/roi_align_kernel.cu
+++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu
@@ -12,14 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/roi_align_kernel.h"
-
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "paddle/fluid/memory/memory.h"
+#include "paddle/phi/kernels/roi_align_kernel.h"
 
 namespace phi {
 
@@ -232,20 +230,20 @@ void RoiAlignKernel(const Context& dev_ctx,
   int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
   paddle::memory::Copy(
       gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
-  GPURoiAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-      output_size,
-      x.data<T>(),
-      boxes.data<T>(),
-      spatial_scale,
-      channels,
-      height,
-      width,
-      pooled_height,
-      pooled_width,
-      sampling_ratio,
-      roi_id_data,
-      dev_ctx.template Alloc<T>(out),
-      aligned);
+  GPURoiAlignForward<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(output_size,
+                                                 x.data<T>(),
+                                                 boxes.data<T>(),
+                                                 spatial_scale,
+                                                 channels,
+                                                 height,
+                                                 width,
+                                                 pooled_height,
+                                                 pooled_width,
+                                                 sampling_ratio,
+                                                 roi_id_data,
+                                                 dev_ctx.template Alloc<T>(out),
+                                                 aligned);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu
index 1a5af93c562bf..f66d6633b9ea0 100644
--- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu
@@ -12,16 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
-
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
 
 namespace phi {
 
@@ -139,20 +137,20 @@ void RoiPoolGradKernel(const Context& dev_ctx,
     int threads = kNumCUDAThreads;
 
     if (output_grad_size > 0) {
-      GPURoiPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          output_grad_size,
-          boxes.data<T>(),
-          out_grad.data<T>(),
-          arg_max.data<int64_t>(),
-          rois_num,
-          spatial_scale,
-          channels,
-          height,
-          width,
-          pooled_height,
-          pooled_width,
-          roi_id_data,
-          dx->data<T>());
+      GPURoiPoolBackward<T>
+          <<<blocks, threads, 0, dev_ctx.stream()>>>(output_grad_size,
+                                                     boxes.data<T>(),
+                                                     out_grad.data<T>(),
+                                                     arg_max.data<int64_t>(),
+                                                     rois_num,
+                                                     spatial_scale,
+                                                     channels,
+                                                     height,
+                                                     width,
+                                                     pooled_height,
+                                                     pooled_width,
+                                                     roi_id_data,
+                                                     dx->data<T>());
     }
   }
 }
diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu
index 32ea6223c9c2a..4d3576f0c2f09 100644
--- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu
@@ -12,14 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/roi_pool_kernel.h"
-
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "paddle/fluid/memory/memory.h"
+#include "paddle/phi/kernels/roi_pool_kernel.h"
 
 namespace phi {
 
@@ -62,18 +60,18 @@ __global__ void GPURoiPoolForward(const int nthreads,
     int box_width = max(box_end_w - box_start_w + 1, 1);
     int box_height = max(box_end_h - box_start_h + 1, 1);
 
-    int hstart = static_cast<int>(floor(static_cast<double>(ph) *
-                                        static_cast<double>(box_height) /
-                                        static_cast<double>(pooled_height)));
-    int wstart = static_cast<int>(floor(static_cast<double>(pw) *
-                                        static_cast<double>(box_width) /
-                                        static_cast<double>(pooled_width)));
-    int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
-                                     static_cast<double>(box_height) /
-                                     static_cast<double>(pooled_height)));
-    int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
-                                     static_cast<double>(box_width) /
-                                     static_cast<double>(pooled_width)));
+    int hstart = static_cast<int>(
+        floor(static_cast<double>(ph) * static_cast<double>(box_height) /
+              static_cast<double>(pooled_height)));
+    int wstart = static_cast<int>(
+        floor(static_cast<double>(pw) * static_cast<double>(box_width) /
+              static_cast<double>(pooled_width)));
+    int hend = static_cast<int>(
+        ceil(static_cast<double>(ph + 1) * static_cast<double>(box_height) /
+             static_cast<double>(pooled_height)));
+    int wend = static_cast<int>(
+        ceil(static_cast<double>(pw + 1) * static_cast<double>(box_width) /
+             static_cast<double>(pooled_width)));
     hstart = min(max(hstart + box_start_h, 0), height);
     hend = min(max(hend + box_start_h, 0), height);
     wstart = min(max(wstart + box_start_w, 0), width);
@@ -197,19 +195,19 @@ void RoiPoolKernel(const Context& dev_ctx,
   T* output_data = dev_ctx.template Alloc<T>(out);
   int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
 
-  GPURoiPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-      output_size,
-      x.data<T>(),
-      boxes.data<T>(),
-      spatial_scale,
-      channels,
-      height,
-      width,
-      pooled_height,
-      pooled_width,
-      box_id_data,
-      output_data,
-      arg_max_data);
+  GPURoiPoolForward<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(output_size,
+                                                 x.data<T>(),
+                                                 boxes.data<T>(),
+                                                 spatial_scale,
+                                                 channels,
+                                                 height,
+                                                 width,
+                                                 pooled_height,
+                                                 pooled_width,
+                                                 box_id_data,
+                                                 output_data,
+                                                 arg_max_data);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu
index 82e0fa72ab076..fc5d4ff538643 100644
--- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/roll_grad_kernel.h"
-
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
+#include "paddle/phi/kernels/roll_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu
index 5d3584e4f44c1..8b137e1a5aa0b 100644
--- a/paddle/phi/kernels/gpu/roll_kernel.cu
+++ b/paddle/phi/kernels/gpu/roll_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/roll_kernel.h"
-
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/array.h"
 #include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
+#include "paddle/phi/kernels/roll_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h
index abe3ee470b4bc..823164f3fbc52 100644
--- a/paddle/phi/kernels/gpu/roll_kernel_impl.h
+++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h
@@ -49,23 +49,22 @@ __global__ void RollCudaKernel(const T* input,
   output[output_idx] = input[idx];
 }
 
-#define CALL_ROLL_CUDA_KERNEL(N)                                              \
-  case N: {                                                                   \
-    phi::Array<int64_t, N> _strides;                                          \
-    phi::Array<int64_t, N> _shifts;                                           \
-    phi::Array<int64_t, N> _sizes;                                            \
-    for (size_t idx = 0; idx < N; ++idx) {                                    \
-      _strides[idx] = strides[idx];                                           \
-      _shifts[idx] = shifts_data[idx];                                        \
-      _sizes[idx] = sizes[idx];                                               \
-    }                                                                         \
-    RollCudaKernel<                                                           \
-        T,                                                                    \
-        N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
-             PADDLE_CUDA_NUM_THREADS,                                         \
-             0,                                                               \
-             stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes);  \
-    break;                                                                    \
+#define CALL_ROLL_CUDA_KERNEL(N)                                            \
+  case N: {                                                                 \
+    phi::Array<int64_t, N> _strides;                                        \
+    phi::Array<int64_t, N> _shifts;                                         \
+    phi::Array<int64_t, N> _sizes;                                          \
+    for (size_t idx = 0; idx < N; ++idx) {                                  \
+      _strides[idx] = strides[idx];                                         \
+      _shifts[idx] = shifts_data[idx];                                      \
+      _sizes[idx] = sizes[idx];                                             \
+    }                                                                       \
+    RollCudaKernel<T, N>                                                    \
+        <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
+           PADDLE_CUDA_NUM_THREADS,                                         \
+           0,                                                               \
+           stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes);  \
+    break;                                                                  \
   }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu
index 44dc31ed5d926..e2ebfc2ca027d 100644
--- a/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/rrelu_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
@@ -21,6 +19,7 @@
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/prelu_funcs.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
+#include "paddle/phi/kernels/rrelu_grad_kernel.h"
 
 namespace phi {
 
@@ -48,9 +47,9 @@ class RReluOpGradFunctor {
                   const T* out_grad,
                   T* x_grad,
                   int numel) {
-    RReluOpGradKernel<
-        T><<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
-        x, noise, out_grad, x_grad, numel);
+    RReluOpGradKernel<T>
+        <<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
+            x, noise, out_grad, x_grad, numel);
   }
 };
 
diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu
index 6f96a697b2f2d..52b28bf37f02e 100644
--- a/paddle/phi/kernels/gpu/scale_kernel.cu
+++ b/paddle/phi/kernels/gpu/scale_kernel.cu
@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/scale_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/searchsorted_kernel.cu b/paddle/phi/kernels/gpu/searchsorted_kernel.cu
index 4a2ce2241c22d..5976c14d9a9ea 100644
--- a/paddle/phi/kernels/gpu/searchsorted_kernel.cu
+++ b/paddle/phi/kernels/gpu/searchsorted_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/searchsorted_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h"
+#include "paddle/phi/kernels/searchsorted_kernel.h"
 
 PD_REGISTER_KERNEL(searchsorted,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
index 9d1769e18b4b8..5f636ea7f9dd6 100644
--- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h"
-#include "paddle/phi/kernels/segment_pool_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h"
+#include "paddle/phi/kernels/segment_pool_grad_kernel.h"
 
 PD_REGISTER_KERNEL(segment_pool_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
index 3128e534166ac..4f24cf518d614 100644
--- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h"
-#include "paddle/phi/kernels/segment_pool_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h"
+#include "paddle/phi/kernels/segment_pool_kernel.h"
 
 PD_REGISTER_KERNEL(segment_pool,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/selu_grad_kernel.cu b/paddle/phi/kernels/gpu/selu_grad_kernel.cu
index 0ed299413c172..c6dffc33ae6c8 100644
--- a/paddle/phi/kernels/gpu/selu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/selu_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/selu_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h"
+#include "paddle/phi/kernels/selu_grad_kernel.h"
 
 PD_REGISTER_KERNEL(
     selu_grad, GPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/selu_kernel.cu b/paddle/phi/kernels/gpu/selu_kernel.cu
index 99303d8c18a97..57be1087b44c6 100644
--- a/paddle/phi/kernels/gpu/selu_kernel.cu
+++ b/paddle/phi/kernels/gpu/selu_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/selu_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/selu_kernel_impl.h"
+#include "paddle/phi/kernels/selu_kernel.h"
 
 PD_REGISTER_KERNEL(selu, GPU, ALL_LAYOUT, phi::SeluKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu
index 7eed96699e720..aafb374af63e2 100644
--- a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/set_value_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h"
+#include "paddle/phi/kernels/set_value_grad_kernel.h"
 
 PD_REGISTER_KERNEL(set_value_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/set_value_kernel.cu b/paddle/phi/kernels/gpu/set_value_kernel.cu
index f788da010b682..b744cfc768e8d 100644
--- a/paddle/phi/kernels/gpu/set_value_kernel.cu
+++ b/paddle/phi/kernels/gpu/set_value_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/set_value_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/set_value_kernel_impl.h"
+#include "paddle/phi/kernels/set_value_kernel.h"
 
 PD_REGISTER_KERNEL(set_value,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu
index d71112a2f2884..6d27843f13881 100644
--- a/paddle/phi/kernels/gpu/sgd_kernel.cu
+++ b/paddle/phi/kernels/gpu/sgd_kernel.cu
@@ -12,15 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/sgd_kernel.h"
-
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/phi/backends/gpu/gpu_helper.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_helper.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/sgd_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/shard_index_kernel.cu b/paddle/phi/kernels/gpu/shard_index_kernel.cu
index 0bd7b93f68928..b20c229a89fd1 100644
--- a/paddle/phi/kernels/gpu/shard_index_kernel.cu
+++ b/paddle/phi/kernels/gpu/shard_index_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/shard_index_kernel.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/shard_index_kernel.h"
 
 namespace phi {
 
@@ -85,12 +84,12 @@ void ShardIndexKernel(const Context& dev_ctx,
   auto* out_data = dev_ctx.template Alloc<T>(out);
   int64_t numel = in.numel();
   auto stream = dev_ctx.stream();
-  ShardIndexInner<
-      T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-           PADDLE_CUDA_NUM_THREADS,
-           0,
-           stream>>>(
-      in_data, out_data, numel, index_num, nshards, shard_id, ignore_value);
+  ShardIndexInner<T>
+      <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+         PADDLE_CUDA_NUM_THREADS,
+         0,
+         stream>>>(
+          in_data, out_data, numel, index_num, nshards, shard_id, ignore_value);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
index 6f9cda83a9a98..c300b6d3f3daa 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <algorithm>
+
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
index f61cd2c39674e..8425f71cc2653 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
-
 #include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
index b0e9efe5bbafe..245ac95eeac67 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
-
 #include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/sign_kernel.cu.cc b/paddle/phi/kernels/gpu/sign_kernel.cu.cc
index 1fe17a7a227ec..37f10243dc596 100644
--- a/paddle/phi/kernels/gpu/sign_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/sign_kernel.cu.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sign_kernel.h"
-#include "paddle/phi/kernels/impl/sign_kernel_impl.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/sign_kernel_impl.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/phi/common/float16.h"
diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu
index 7051fb78c7587..3ca1a1d6b7603 100644
--- a/paddle/phi/kernels/gpu/size_kernel.cu
+++ b/paddle/phi/kernels/gpu/size_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/size_kernel_impl.h"
-#include "paddle/phi/kernels/size_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/size_kernel_impl.h"
+#include "paddle/phi/kernels/size_kernel.h"
 
 PD_REGISTER_KERNEL(size,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc
index 2769f5cc65d71..a6db80abaee17 100644
--- a/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/slice_grad_kernel.h"
-#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(slice_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/slice_kernel.cu.cc b/paddle/phi/kernels/gpu/slice_kernel.cu.cc
index 0fa61962c9eb0..8743163b220b9 100644
--- a/paddle/phi/kernels/gpu/slice_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/slice_kernel.cu.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/slice_kernel.h"
-#include "paddle/phi/kernels/impl/slice_kernel_impl.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/slice_kernel_impl.h"
 
 PD_REGISTER_KERNEL(slice,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu
index 04052e0dfc39a..fe213e923a92f 100644
--- a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/softmax_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h"
+#include "paddle/phi/kernels/softmax_grad_kernel.h"
 
 PD_REGISTER_KERNEL(softmax_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu
index 4a02f438c7e7e..9415e6b2bad47 100644
--- a/paddle/phi/kernels/gpu/softmax_kernel.cu
+++ b/paddle/phi/kernels/gpu/softmax_kernel.cu
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/softmax_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
+#include "paddle/phi/kernels/softmax_kernel.h"
 
 PD_REGISTER_KERNEL(softmax,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu
index 73b64ce970319..9a854378fb54f 100644
--- a/paddle/phi/kernels/gpu/split_kernel.cu
+++ b/paddle/phi/kernels/gpu/split_kernel.cu
@@ -12,13 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/split_kernel.h"
-
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/split_kernel.h"
 namespace phi {
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu b/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
index c5a243f45bd97..8b2d00c5170d0 100644
--- a/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/squeeze_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
+#include "paddle/phi/kernels/squeeze_grad_kernel.h"
 
 PD_REGISTER_KERNEL(squeeze_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/squeeze_kernel.cu b/paddle/phi/kernels/gpu/squeeze_kernel.cu
index ae15e210a02e7..6088e384c2e6b 100644
--- a/paddle/phi/kernels/gpu/squeeze_kernel.cu
+++ b/paddle/phi/kernels/gpu/squeeze_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/squeeze_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
+#include "paddle/phi/kernels/squeeze_kernel.h"
 
 PD_REGISTER_KERNEL(squeeze,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/stack_grad_kernel.cu b/paddle/phi/kernels/gpu/stack_grad_kernel.cu
index 9b754e22692af..a24b48e0cf2c2 100644
--- a/paddle/phi/kernels/gpu/stack_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/stack_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/stack_grad_kernel.h"
-
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/stack_grad_kernel.h"
 
 namespace phi {
 
@@ -105,27 +104,27 @@ void StackGradKernel(const Context& dev_ctx,
       dev_ctx, dy_pre * split_dim * dy_suf);
 
   if (out.numel() < std::numeric_limits<int32_t>::max()) {
-    UnStackHelperCUDAKernel<T, int32_t><<<config.block_per_grid.x,
-                                          config.thread_per_block.x,
-                                          0,
-                                          dev_ctx.stream()>>>(
-        dy_data,
-        dy_pre,
-        split_dim,
-        dy_suf,
-        split_dim,
-        reinterpret_cast<T**>(tmp_out_data->ptr()));
+    UnStackHelperCUDAKernel<T, int32_t>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(dy_data,
+                               dy_pre,
+                               split_dim,
+                               dy_suf,
+                               split_dim,
+                               reinterpret_cast<T**>(tmp_out_data->ptr()));
   } else {
-    UnStackHelperCUDAKernel<T, int64_t><<<config.block_per_grid.x,
-                                          config.thread_per_block.x,
-                                          0,
-                                          dev_ctx.stream()>>>(
-        dy_data,
-        dy_pre,
-        split_dim,
-        dy_suf,
-        split_dim,
-        reinterpret_cast<T**>(tmp_out_data->ptr()));
+    UnStackHelperCUDAKernel<T, int64_t>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(dy_data,
+                               dy_pre,
+                               split_dim,
+                               dy_suf,
+                               split_dim,
+                               reinterpret_cast<T**>(tmp_out_data->ptr()));
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu
index cc7d136c95293..54bee95356ed3 100644
--- a/paddle/phi/kernels/gpu/stack_kernel.cu
+++ b/paddle/phi/kernels/gpu/stack_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/stack_kernel.h"
-
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/stack_kernel.h"
 
 namespace phi {
 
@@ -77,25 +76,25 @@ void StackKernel(const Context& dev_ctx,
       phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row);
 
   if (out->numel() < std::numeric_limits<int32_t>::max()) {
-    StackCUDAKernel<T, int32_t><<<config.block_per_grid,
-                                  config.thread_per_block,
-                                  0,
-                                  dev_ctx.stream()>>>(
-        reinterpret_cast<T**>(tmp_x_data->ptr()),
-        x_col,
-        x_row,
-        out_col,
-        y_data);
+    StackCUDAKernel<T, int32_t>
+        <<<config.block_per_grid,
+           config.thread_per_block,
+           0,
+           dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
+                               x_col,
+                               x_row,
+                               out_col,
+                               y_data);
   } else {
-    StackCUDAKernel<T, int64_t><<<config.block_per_grid,
-                                  config.thread_per_block,
-                                  0,
-                                  dev_ctx.stream()>>>(
-        reinterpret_cast<T**>(tmp_x_data->ptr()),
-        x_col,
-        x_row,
-        out_col,
-        y_data);
+    StackCUDAKernel<T, int64_t>
+        <<<config.block_per_grid,
+           config.thread_per_block,
+           0,
+           dev_ctx.stream()>>>(reinterpret_cast<T**>(tmp_x_data->ptr()),
+                               x_col,
+                               x_row,
+                               out_col,
+                               y_data);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu b/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu
index 90d9f1d986577..cae05b67fefbe 100644
--- a/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/strided_slice_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h"
+#include "paddle/phi/kernels/strided_slice_grad_kernel.h"
 
 PD_REGISTER_KERNEL(strided_slice_raw_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/strided_slice_kernel.cu b/paddle/phi/kernels/gpu/strided_slice_kernel.cu
index 716150ff47dea..fa7f5532f55fe 100644
--- a/paddle/phi/kernels/gpu/strided_slice_kernel.cu
+++ b/paddle/phi/kernels/gpu/strided_slice_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/strided_slice_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/strided_slice_kernel_impl.h"
+#include "paddle/phi/kernels/strided_slice_kernel.h"
 
 PD_REGISTER_KERNEL(strided_slice_raw,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
index e09cfd370a4f0..61363ce8c49eb 100644
--- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/take_along_axis_grad_kernel.h"
-
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/take_along_axis_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
index 9665a917d9dc4..cb567ef991527 100644
--- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
+++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/take_along_axis_kernel.h"
-
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/gather_scatter_kernel.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/take_along_axis_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/tile_grad_kernel.cu b/paddle/phi/kernels/gpu/tile_grad_kernel.cu
index c092609e623d3..32e78bbdb7cc7 100644
--- a/paddle/phi/kernels/gpu/tile_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/tile_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/tile_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/tile_grad_kernel_impl.h"
+#include "paddle/phi/kernels/tile_grad_kernel.h"
 
 PD_REGISTER_KERNEL(tile_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu
index 990877a8445cb..40ee38f73b382 100644
--- a/paddle/phi/kernels/gpu/tile_kernel.cu
+++ b/paddle/phi/kernels/gpu/tile_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/tile_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/tile_kernel_impl.h"
+#include "paddle/phi/kernels/tile_kernel.h"
 
 PD_REGISTER_KERNEL(tile,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu
index 32c5fc0006f4c..1db05ed18b0b9 100644
--- a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/top_k_grad_kernel.h"
-
 #include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/top_k_grad_kernel.h"
 
 namespace phi {
 
@@ -71,9 +70,9 @@ void TopkGradKernel(const Context& dev_ctx,
   int grid_size = std::min(max_blocks, pre);
 
   // lanuch the cuda kernel to assign the grad
-  ops::AssignGradWithAxis<
-      T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
-      out_grad_data, indices_data, x_grad_data, pre, post, n, k);
+  ops::AssignGradWithAxis<T>
+      <<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+          out_grad_data, indices_data, x_grad_data, pre, post, n, k);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu
index 8262023826b32..b160ad1d1c5dd 100644
--- a/paddle/phi/kernels/gpu/top_k_kernel.cu
+++ b/paddle/phi/kernels/gpu/top_k_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/top_k_kernel.h"
-
 #include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/top_k_kernel.h"
 
 namespace phi {
 
@@ -102,25 +101,23 @@ void TopkKernel(const Context& dev_ctx,
       // 1. Gather TopK, but without sorting
       constexpr int max_num_threads = 1024;
       if (largest) {
-        ops::RadixTopK<
-            T,
-            true><<<input_height, max_num_threads, 0, dev_ctx.stream()>>>(
-            input_data,
-            k,
-            input_height,
-            input_width,
-            output_data,
-            indices_data);
+        ops::RadixTopK<T, true>
+            <<<input_height, max_num_threads, 0, dev_ctx.stream()>>>(
+                input_data,
+                k,
+                input_height,
+                input_width,
+                output_data,
+                indices_data);
       } else {
-        ops::RadixTopK<
-            T,
-            false><<<input_height, max_num_threads, 0, dev_ctx.stream()>>>(
-            input_data,
-            k,
-            input_height,
-            input_width,
-            output_data,
-            indices_data);
+        ops::RadixTopK<T, false>
+            <<<input_height, max_num_threads, 0, dev_ctx.stream()>>>(
+                input_data,
+                k,
+                input_height,
+                input_width,
+                output_data,
+                indices_data);
       }
       // 2. Sort if needed
       if (sorted) {
@@ -165,35 +162,31 @@ void TopkKernel(const Context& dev_ctx,
     int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
     switch (ops::GetDesiredBlockDim(input_width)) {
 #ifdef PADDLE_WITH_HIP
-      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
-                      T,
-                      20,
-                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-          output_data,
-          k,
-          indices_data,
-          input_data,
-          input_width,
-          input_width,
-          static_cast<int>(k),
-          gridx,
-          input_height,
-          largest));
+      FIXED_BLOCK_DIM(
+          ops::KeMatrixTopK<T, 20, kBlockDim>
+          <<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(output_data,
+                                                      k,
+                                                      indices_data,
+                                                      input_data,
+                                                      input_width,
+                                                      input_width,
+                                                      static_cast<int>(k),
+                                                      gridx,
+                                                      input_height,
+                                                      largest));
 #else
-      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
-                      T,
-                      5,
-                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-          output_data,
-          k,
-          indices_data,
-          input_data,
-          input_width,
-          input_width,
-          static_cast<int>(k),
-          gridx,
-          input_height,
-          largest));
+      FIXED_BLOCK_DIM(
+          ops::KeMatrixTopK<T, 5, kBlockDim>
+          <<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(output_data,
+                                                      k,
+                                                      indices_data,
+                                                      input_data,
+                                                      input_width,
+                                                      input_width,
+                                                      static_cast<int>(k),
+                                                      gridx,
+                                                      input_height,
+                                                      largest));
 #endif
       default:
         PADDLE_THROW(errors::Fatal(
@@ -271,35 +264,31 @@ void TopkKernel(const Context& dev_ctx,
     int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
     switch (ops::GetDesiredBlockDim(input_width)) {
 #ifdef PADDLE_WITH_HIP
-      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
-                      T,
-                      20,
-                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-          trans_out.data<T>(),
-          k,
-          trans_ind.data<int64_t>(),
-          trans_input.data<T>(),
-          input_width,
-          input_width,
-          static_cast<int>(k),
-          gridx,
-          input_height,
-          largest));
+      FIXED_BLOCK_DIM(
+          ops::KeMatrixTopK<T, 20, kBlockDim>
+          <<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(trans_out.data<T>(),
+                                                      k,
+                                                      trans_ind.data<int64_t>(),
+                                                      trans_input.data<T>(),
+                                                      input_width,
+                                                      input_width,
+                                                      static_cast<int>(k),
+                                                      gridx,
+                                                      input_height,
+                                                      largest));
 #else
-      FIXED_BLOCK_DIM(ops::KeMatrixTopK<
-                      T,
-                      5,
-                      kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-          trans_out.data<T>(),
-          k,
-          trans_ind.data<int64_t>(),
-          trans_input.data<T>(),
-          input_width,
-          input_width,
-          static_cast<int>(k),
-          gridx,
-          input_height,
-          largest));
+      FIXED_BLOCK_DIM(
+          ops::KeMatrixTopK<T, 5, kBlockDim>
+          <<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(trans_out.data<T>(),
+                                                      k,
+                                                      trans_ind.data<int64_t>(),
+                                                      trans_input.data<T>(),
+                                                      input_width,
+                                                      input_width,
+                                                      static_cast<int>(k),
+                                                      gridx,
+                                                      input_height,
+                                                      largest));
 #endif
       default:
         PADDLE_THROW(errors::Fatal(
diff --git a/paddle/phi/kernels/gpu/trace_grad_kernel.cu b/paddle/phi/kernels/gpu/trace_grad_kernel.cu
index 6692c1e19b033..5b2575f48b87e 100644
--- a/paddle/phi/kernels/gpu/trace_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/trace_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/trace_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/trace_grad_kernel_impl.h"
+#include "paddle/phi/kernels/trace_grad_kernel.h"
 
 PD_REGISTER_KERNEL(trace_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu
index 4a749c5b3347d..7f22c14c6b609 100644
--- a/paddle/phi/kernels/gpu/trace_kernel.cu
+++ b/paddle/phi/kernels/gpu/trace_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/trace_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/diagonal.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "paddle/phi/kernels/trace_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
index 0687dc0c200a8..1439b2aa2ab86 100644
--- a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h"
-#include "paddle/phi/kernels/transpose_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h"
+#include "paddle/phi/kernels/transpose_grad_kernel.h"
 
 PD_REGISTER_KERNEL(transpose_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu
index 203f10e4ddd47..d7daaff2d413b 100644
--- a/paddle/phi/kernels/gpu/transpose_kernel.cu
+++ b/paddle/phi/kernels/gpu/transpose_kernel.cu
@@ -14,16 +14,15 @@
 
 #include <vector>
 
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/transpose_kernel.h"
-
 #include "paddle/fluid/framework/gpu_utils.h"
 #include "paddle/fluid/operators/transpose_op.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
 
 namespace phi {
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu
index a48afeb2c796b..22629582bed87 100644
--- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu
+++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/triangular_solve_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -21,6 +19,7 @@
 #include "paddle/phi/kernels/expand_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
+#include "paddle/phi/kernels/triangular_solve_kernel.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/allocation/allocator.h"
diff --git a/paddle/phi/kernels/gpu/tril_indices_kernel.cu b/paddle/phi/kernels/gpu/tril_indices_kernel.cu
index be83f28451166..7068ab1775f16 100644
--- a/paddle/phi/kernels/gpu/tril_indices_kernel.cu
+++ b/paddle/phi/kernels/gpu/tril_indices_kernel.cu
@@ -12,13 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/tril_indices_kernel.h"
-
 #include <algorithm>
 #include <tuple>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/tril_indices_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
index bc3ef1bc623bb..3271b38ae8726 100644
--- a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
 
 PD_REGISTER_KERNEL(tril_triu_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
index 8c48edf9eff25..65dcca70584b8 100644
--- a/paddle/phi/kernels/gpu/tril_triu_kernel.cu
+++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
 
 PD_REGISTER_KERNEL(tril_triu,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
index 33ecb4d6eb544..7c59b39890f46 100644
--- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h"
-
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
+
 #include <limits>
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/unbind_kernel.cu b/paddle/phi/kernels/gpu/unbind_kernel.cu
index 8a7aa8f6033ab..1efc3a1094da2 100644
--- a/paddle/phi/kernels/gpu/unbind_kernel.cu
+++ b/paddle/phi/kernels/gpu/unbind_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/unbind_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unbind_kernel_impl.h"
+#include "paddle/phi/kernels/unbind_kernel.h"
 
 PD_REGISTER_KERNEL(unbind,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu
index 68e61b7328971..b149110bddce6 100644
--- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/uniform_random_kernel.h"
-
 #include <thrust/random.h>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/index_impl.cu.h"
+#include "paddle/phi/kernels/uniform_random_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu
index c09730ba76a91..fe9304d0e150e 100644
--- a/paddle/phi/kernels/gpu/unique_kernel.cu
+++ b/paddle/phi/kernels/gpu/unique_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/unique_kernel.h"
-
 #include <thrust/adjacent_difference.h>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
@@ -21,13 +19,16 @@
 #include <thrust/scatter.h>
 #include <thrust/sequence.h>
 #include <thrust/unique.h>
+
 #include <iostream>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor_util.h"  // TensorToVector()
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/unique_functor.h"
+#include "paddle/phi/kernels/unique_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu
index 6c3a2066f0f2d..6bd3f30779426 100644
--- a/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
+#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
 
 PD_REGISTER_KERNEL(unsqueeze_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu
index 86b4462254637..78f1fafda9bcc 100644
--- a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu
+++ b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/unsqueeze_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
+#include "paddle/phi/kernels/unsqueeze_kernel.h"
 
 PD_REGISTER_KERNEL(unsqueeze,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/unstack_grad_kernel.cu b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu
index b7c349de0df32..6245e8258f8bb 100644
--- a/paddle/phi/kernels/gpu/unstack_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/unstack_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h"
+#include "paddle/phi/kernels/unstack_grad_kernel.h"
 
 PD_REGISTER_KERNEL(unstack_grad,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/unstack_kernel.cu b/paddle/phi/kernels/gpu/unstack_kernel.cu
index f147f4c0f0edf..dde6d926b7ca0 100644
--- a/paddle/phi/kernels/gpu/unstack_kernel.cu
+++ b/paddle/phi/kernels/gpu/unstack_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/unstack_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/unstack_kernel_impl.h"
+#include "paddle/phi/kernels/unstack_kernel.h"
 
 PD_REGISTER_KERNEL(unstack,
                    GPU,
diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
index 25d6d46c20b9f..dc04c69ec70d5 100644
--- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
+++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
@@ -80,7 +80,8 @@ int64_t ComputeBlockSize(int64_t col) {
 }
 
 template <typename Context,
-          template <typename T> typename BinaryFunctor,
+          template <typename T>
+          typename BinaryFunctor,
           typename T>
 struct BinaryOperation {
   void operator()(const Context& dev_ctx,
@@ -89,15 +90,15 @@ struct BinaryOperation {
                   DenseTensor* output) {
     std::vector<const DenseTensor*> ins{&lhs, &rhs};
     std::vector<DenseTensor*> outs{output};
-    paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
-                                                   T,
-                                                   T>(
-        dev_ctx, ins, &outs, -1, BinaryFunctor<T>());
+    paddle::operators::
+        LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+            dev_ctx, ins, &outs, -1, BinaryFunctor<T>());
   }
 };
 
 template <typename Context,
-          template <typename InT, typename OutT> typename CompareFunctor,
+          template <typename InT, typename OutT>
+          typename CompareFunctor,
           typename T>
 struct GetMask {
   void operator()(const Context& dev_ctx,
@@ -188,9 +189,8 @@ struct Argmax {
     T* out_data = out->data<T>();
     switch (ComputeBlockSize(width)) {
       FIXED_BLOCK_DIM_CASE(
-          ArgmaxCUDAKernel<T,
-                           IndType,
-                           kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
+          ArgmaxCUDAKernel<T, IndType, kBlockDim>
+          <<<grid_size, kBlockDim, 0, cu_stream>>>(
               height, width, post, in_data, out_idx_data, out_data));
     }
   }
@@ -206,15 +206,13 @@ struct GetMaxValue {
     dev_ctx.template Alloc<T>(&out_data);
     switch (ComputeBlockSize(input.numel())) {
       FIXED_BLOCK_DIM_CASE(
-          ArgmaxCUDAKernel<T,
-                           T,
-                           kBlockDim><<<1, kBlockDim, 0, dev_ctx.stream()>>>(
-              1,
-              input.numel(),
-              1,
-              input.data<int64_t>(),
-              nullptr,
-              out_data.data<int64_t>()));
+          ArgmaxCUDAKernel<T, T, kBlockDim>
+          <<<1, kBlockDim, 0, dev_ctx.stream()>>>(1,
+                                                  input.numel(),
+                                                  1,
+                                                  input.data<int64_t>(),
+                                                  nullptr,
+                                                  out_data.data<int64_t>()));
     }
     DenseTensor max_value_tensor;
     phi::Copy(dev_ctx, out_data, phi::CPUPlace(), false, &max_value_tensor);
diff --git a/paddle/phi/kernels/gpu/warpctc_grad_kernel.cu b/paddle/phi/kernels/gpu/warpctc_grad_kernel.cu
index 612b03555c6f1..97a75bf684911 100644
--- a/paddle/phi/kernels/gpu/warpctc_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/warpctc_grad_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/warpctc_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h"
+#include "paddle/phi/kernels/warpctc_grad_kernel.h"
 
 PD_REGISTER_KERNEL(
     warpctc_grad, GPU, ALL_LAYOUT, phi::WarpctcGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/warpctc_kernel.cu b/paddle/phi/kernels/gpu/warpctc_kernel.cu
index 3379322f3dfd8..8d93f24b65914 100644
--- a/paddle/phi/kernels/gpu/warpctc_kernel.cu
+++ b/paddle/phi/kernels/gpu/warpctc_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/warpctc_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/warpctc_kernel_impl.h"
+#include "paddle/phi/kernels/warpctc_kernel.h"
 
 PD_REGISTER_KERNEL(
     warpctc, GPU, ALL_LAYOUT, phi::WarpctcKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/where_grad_kernel.cu b/paddle/phi/kernels/gpu/where_grad_kernel.cu
index 14cc1d311321d..2be698c3455d5 100644
--- a/paddle/phi/kernels/gpu/where_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/where_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/where_grad_kernel.h"
 
 namespace phi {
 
@@ -50,9 +49,9 @@ void WhereGradKernel(const Context& ctx,
 
   auto stream = ctx.stream();
   auto config = backends::gpu::GetGpuLaunchConfig1D(ctx, numel);
-  WhereGradCUDAKernel<
-      T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
-      numel, dout, cond_data, dx, dy);
+  WhereGradCUDAKernel<T>
+      <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
+          numel, dout, cond_data, dx, dy);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu
index 3ff73ce8b3bab..c16859c52b22a 100644
--- a/paddle/phi/kernels/gpu/where_index_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_index_kernel.cu
@@ -20,13 +20,12 @@
 namespace cub = hipcub;
 #endif
 
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/funcs/select_impl.cu.h"
-#include "paddle/phi/kernels/where_index_kernel.h"
-
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/select_impl.cu.h"
+#include "paddle/phi/kernels/where_index_kernel.h"
 
 namespace phi {
 template <typename MaskT, typename IndexT, typename OutT>
diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu
index 441be02b99efa..c623b6ec8b721 100644
--- a/paddle/phi/kernels/gpu/where_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_kernel.cu
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/where_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/where_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
index 53e4c39d8bcee..e5552f28f88e3 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
@@ -12,13 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
-
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "paddle/fluid/framework/eigen.h"
+#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
 #ifdef PADDLE_WITH_HIP
 #include "paddle/fluid/operators/conv_miopen_helper.h"
 #else
@@ -28,16 +26,13 @@
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
-#include "paddle/phi/kernels/funcs/padding.h"
-
-#include "paddle/phi/kernels/cpu/conv_util.h"
-#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
-
-#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
-
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 9d4acb95ea48a..80100ba8ff44d 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -12,14 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/conv_grad_kernel.h"
-
-#include "paddle/phi/core/dense_tensor.h"
-
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "paddle/fluid/framework/eigen.h"
+#include "paddle/phi/kernels/conv_grad_kernel.h"
 #ifdef PADDLE_WITH_HIP
 #include "paddle/fluid/operators/conv_miopen_helper.h"
 #else
@@ -29,16 +26,13 @@
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
-#include "paddle/phi/kernels/funcs/padding.h"
-
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/batch_norm_utils.h"
-
+#include "paddle/phi/kernels/funcs/padding.h"
 #include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
 
-#include "paddle/phi/common/bfloat16.h"
-#include "paddle/phi/common/float16.h"
-
 namespace phi {
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index 3d3ab7b7a4e94..c746c4db9dce8 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/conv_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_kernel.h"
 
 #ifdef PADDLE_WITH_HIP
 #include "paddle/fluid/operators/conv_miopen_helper.h"
@@ -27,16 +26,13 @@
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
-#include "paddle/phi/kernels/funcs/padding.h"
-
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/batch_norm_utils.h"
-
+#include "paddle/phi/kernels/funcs/padding.h"
 #include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
 
-#include "paddle/phi/common/bfloat16.h"
-#include "paddle/phi/common/float16.h"
-
 namespace phi {
 
 template <typename T, typename Context>
@@ -334,9 +330,9 @@ void ConvCudnnKernel(const Context& ctx,
   paddle::operators::ScalingParamType<T> alpha = 1.0f;
   paddle::operators::ScalingParamType<T> beta = 0.0f;
 
-// NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
-// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
-// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
+  // NOTE(zhiqiu): inplace addto is not supported in double grad yet.
+  // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+  // VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
 
 #ifdef PADDLE_WITH_HIP
   workspace_handle.RunFunc(
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
index 601ac43eeefd3..6d5a0dd5e0b7e 100644
--- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
-
 #include <algorithm>
+
 #include "paddle/phi/backends/dynload/cudnn.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/batch_norm_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
index ce02a00162b57..67a2f381d76f4 100644
--- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/conv_transpose_kernel.h"
-
 #include <algorithm>
+
 #include "paddle/phi/backends/dynload/cudnn.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/padding.h"
 #include "paddle/phi/kernels/funcs/slice.h"
diff --git a/paddle/phi/kernels/gpudnn/pool_gpudnn.h b/paddle/phi/kernels/gpudnn/pool_gpudnn.h
index 0cf2c991464fc..69fd51b7f0ddc 100644
--- a/paddle/phi/kernels/gpudnn/pool_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/pool_gpudnn.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
index b731d03347024..919a2a2193a4f 100644
--- a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
@@ -12,15 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/pool_grad_kernel.h"
-
-#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h"
+#include "paddle/phi/kernels/pool_grad_kernel.h"
 #include "paddle/phi/kernels/pool_kernel.h"
 
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu
index d8f965667758b..53e3d7c9426c0 100644
--- a/paddle/phi/kernels/gpudnn/pool_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/pool_kernel.h"
-
-#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h"
-
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h"
+#include "paddle/phi/kernels/pool_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index 58781e8c6e491..ca3574de77170 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -493,14 +493,11 @@ __global__ void WarpSoftmaxBackward(T* dst,
   }
 }
 
-#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, AccT)                      \
-  case Log2Elements:                                                       \
-    WarpSoftmaxForward<T,                                                  \
-                       VecT,                                               \
-                       AccT,                                               \
-                       Log2Elements,                                       \
-                       LogMode><<<blocks, threads, 0, dev_ctx.stream()>>>( \
-        dst, src, batch_size, stride, element_count);                      \
+#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, AccT)        \
+  case Log2Elements:                                         \
+    WarpSoftmaxForward<T, VecT, AccT, Log2Elements, LogMode> \
+        <<<blocks, threads, 0, dev_ctx.stream()>>>(          \
+            dst, src, batch_size, stride, element_count);    \
     break;
 
 /*
@@ -533,14 +530,11 @@ void SwitchWarpSoftmaxForward(const int blocks,
   }
 }
 
-#define SOFTMAX_WARP_BACKWARD_CASE(Log2Elements, AccT)                      \
-  case Log2Elements:                                                        \
-    WarpSoftmaxBackward<T,                                                  \
-                        VecT,                                               \
-                        AccT,                                               \
-                        Log2Elements,                                       \
-                        LogMode><<<blocks, threads, 0, dev_ctx.stream()>>>( \
-        dst, grad, src, batch_size, stride, element_count);                 \
+#define SOFTMAX_WARP_BACKWARD_CASE(Log2Elements, AccT)          \
+  case Log2Elements:                                            \
+    WarpSoftmaxBackward<T, VecT, AccT, Log2Elements, LogMode>   \
+        <<<blocks, threads, 0, dev_ctx.stream()>>>(             \
+            dst, grad, src, batch_size, stride, element_count); \
     break;
 
 /*
@@ -621,7 +615,8 @@ static void GetLaunchConfig(
 
 template <typename T,
           typename AccT,
-          template <typename, typename> class Functor>
+          template <typename, typename>
+          class Functor>
 __global__ void NormalSoftmaxForward(
     T* output, const T* input, int high_dim, int mid_dim, int low_dim) {
   using kMode = kps::details::ReduceMode;
@@ -668,7 +663,8 @@ __global__ void NormalSoftmaxForward(
 
 template <typename T,
           typename AccT,
-          template <typename, typename> class Functor,
+          template <typename, typename>
+          class Functor,
           bool LogMode>
 __global__ void NormalSoftmaxBackward(T* input_grad,
                                       const T* output_grad,
@@ -726,17 +722,13 @@ void LaunchNormalSoftmaxForward(const GPUContext& dev_ctx,
   dim3 grid, block;
   GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block);
   if (LogMode) {
-    NormalSoftmaxForward<
-        T,
-        AccT,
-        LogSoftmaxForwardFunctor><<<grid, block, 0, dev_ctx.stream()>>>(
-        output_data, input_data, high_dim, mid_dim, low_dim);
+    NormalSoftmaxForward<T, AccT, LogSoftmaxForwardFunctor>
+        <<<grid, block, 0, dev_ctx.stream()>>>(
+            output_data, input_data, high_dim, mid_dim, low_dim);
   } else {
-    NormalSoftmaxForward<
-        T,
-        AccT,
-        SoftmaxForwardFunctor><<<grid, block, 0, dev_ctx.stream()>>>(
-        output_data, input_data, high_dim, mid_dim, low_dim);
+    NormalSoftmaxForward<T, AccT, SoftmaxForwardFunctor>
+        <<<grid, block, 0, dev_ctx.stream()>>>(
+            output_data, input_data, high_dim, mid_dim, low_dim);
   }
 }
 
@@ -752,27 +744,21 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
   dim3 grid, block;
   GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block);
   if (LogMode) {
-    NormalSoftmaxBackward<T,
-                          AccT,
-                          LogSoftmaxBackwardFunctor,
-                          LogMode><<<grid, block, 0, dev_ctx.stream()>>>(
-        input_grad_data,
-        output_grad_data,
-        output_data,
-        high_dim,
-        mid_dim,
-        low_dim);
+    NormalSoftmaxBackward<T, AccT, LogSoftmaxBackwardFunctor, LogMode>
+        <<<grid, block, 0, dev_ctx.stream()>>>(input_grad_data,
+                                               output_grad_data,
+                                               output_data,
+                                               high_dim,
+                                               mid_dim,
+                                               low_dim);
   } else {
-    NormalSoftmaxBackward<T,
-                          AccT,
-                          SoftmaxBackwardFunctor,
-                          LogMode><<<grid, block, 0, dev_ctx.stream()>>>(
-        input_grad_data,
-        output_grad_data,
-        output_data,
-        high_dim,
-        mid_dim,
-        low_dim);
+    NormalSoftmaxBackward<T, AccT, SoftmaxBackwardFunctor, LogMode>
+        <<<grid, block, 0, dev_ctx.stream()>>>(input_grad_data,
+                                               output_grad_data,
+                                               output_data,
+                                               high_dim,
+                                               mid_dim,
+                                               low_dim);
   }
 }
 
diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu
index 45ab645d37367..343cba311ae46 100644
--- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/softmax_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
+#include "paddle/phi/kernels/softmax_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu
index 37175c427ffe1..b71f39722c98d 100644
--- a/paddle/phi/kernels/gpudnn/softmax_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/softmax_kernel.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/softmax_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
+#include "paddle/phi/kernels/softmax_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
index fbb6db358a476..1379e0f542a72 100644
--- a/paddle/phi/kernels/graph_send_recv_grad_kernel.h
+++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/utils/optional.h"
 
diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h
index 51768fbc18f01..8f635225b75a4 100644
--- a/paddle/phi/kernels/graph_send_recv_kernel.h
+++ b/paddle/phi/kernels/graph_send_recv_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h
index 80dba29e76cbd..58471eb3c8fc6 100644
--- a/paddle/phi/kernels/impl/activation_grad_impl.h
+++ b/paddle/phi/kernels/impl/activation_grad_impl.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 
-#include "paddle/fluid/platform/device_context.h"
-
 namespace phi {
 
 template <typename T, typename Context, typename Functor>
diff --git a/paddle/phi/kernels/impl/activation_impl.h b/paddle/phi/kernels/impl/activation_impl.h
index 1a62c4e06b557..721179372012d 100644
--- a/paddle/phi/kernels/impl/activation_impl.h
+++ b/paddle/phi/kernels/impl/activation_impl.h
@@ -14,12 +14,11 @@
 
 #pragma once
 
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
-#include "paddle/fluid/platform/device_context.h"
-
 namespace phi {
 
 #define ToString(x) #x
diff --git a/paddle/phi/kernels/impl/adagrad_kernel_impl.h b/paddle/phi/kernels/impl/adagrad_kernel_impl.h
index ca9fedaf158d6..1b64da5283c25 100644
--- a/paddle/phi/kernels/impl/adagrad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/adagrad_kernel_impl.h
@@ -14,9 +14,8 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/adagrad_kernel.h"
-
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/phi/kernels/adagrad_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h
index 9956f07bf0b98..bd775110f3a9e 100644
--- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/addmm_grad_kernel.h"
-
 #include <type_traits>
+
+#include "paddle/phi/kernels/addmm_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h
index 3286e31f68923..41f3f4b39c98a 100644
--- a/paddle/phi/kernels/impl/addmm_kernel_impl.h
+++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/addmm_kernel.h"
-
 #include <type_traits>
+
+#include "paddle/phi/kernels/addmm_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
diff --git a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h
index 0eff1378f41de..8b3ced7387ae7 100644
--- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/atan2_grad_kernel.h"
-
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/atan2_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h
index 7653032f2113c..e80256b7254cc 100644
--- a/paddle/phi/kernels/impl/atan2_kernel_impl.h
+++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/atan2_kernel.h"
-
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/atan2_kernel.h"
 
 namespace phi {
 template <typename T>
diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
index d7167704a4824..0e8b4c216fa93 100644
--- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
+++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
@@ -14,11 +14,11 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
-
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h
index b8df86cc69344..7ffd69e16ee85 100644
--- a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h
@@ -14,9 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/cholesky_grad_kernel.h"
-
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/cholesky_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
index 371644e6434a4..f68a3e596299f 100644
--- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/phi/kernels/cholesky_solve_grad_kernel.h"
-
 #include "paddle/phi/kernels/cholesky_solve_kernel.h"
 #include "paddle/phi/kernels/complex_kernel.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
index c039d11635ba2..1cc8acc21f352 100644
--- a/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
+++ b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/phi/kernels/cholesky_solve_kernel.h"
-
 #include "paddle/phi/kernels/complex_kernel.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
diff --git a/paddle/phi/kernels/impl/clip_grad_kernel_impl.h b/paddle/phi/kernels/impl/clip_grad_kernel_impl.h
index 7ce86492327ba..0e6fc199610d2 100644
--- a/paddle/phi/kernels/impl/clip_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/clip_grad_kernel_impl.h
@@ -14,13 +14,11 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/clip_kernel.h"
-
-#include "paddle/phi/backends/all_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/clip_kernel.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #endif
diff --git a/paddle/phi/kernels/impl/clip_kernel_impl.h b/paddle/phi/kernels/impl/clip_kernel_impl.h
index 17c04c31a598a..dc916eb2af819 100644
--- a/paddle/phi/kernels/impl/clip_kernel_impl.h
+++ b/paddle/phi/kernels/impl/clip_kernel_impl.h
@@ -14,13 +14,11 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/clip_kernel.h"
-
-#include "paddle/phi/backends/all_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/clip_kernel.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #endif
diff --git a/paddle/phi/kernels/impl/compare_kernel_impl.h b/paddle/phi/kernels/impl/compare_kernel_impl.h
index 4390c1f8e661c..2a8b858856c07 100644
--- a/paddle/phi/kernels/impl/compare_kernel_impl.h
+++ b/paddle/phi/kernels/impl/compare_kernel_impl.h
@@ -14,9 +14,8 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/compare_kernel.h"
-
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/compare_kernel.h"
 #include "paddle/phi/kernels/funcs/compare_functors.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
index e89920340ff18..6d169354cb4c3 100644
--- a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
@@ -13,9 +13,8 @@
 // limitations under the License.
 #pragma once
 
-#include "paddle/phi/kernels/concat_grad_kernel.h"
-
 #include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/concat_grad_kernel.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 
diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h
index 5cf59fe01920a..132eda7596f6a 100644
--- a/paddle/phi/kernels/impl/conv_cudnn_impl.h
+++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h
@@ -14,12 +14,10 @@
 
 #pragma once
 
-#include "paddle/phi/core/dense_tensor.h"
-
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "paddle/fluid/framework/eigen.h"
 #ifdef PADDLE_WITH_HIP
 #include "paddle/fluid/operators/conv_miopen_helper.h"
 #else
@@ -27,13 +25,12 @@
 #endif
 
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
-#include "paddle/phi/kernels/funcs/padding.h"
-
-#include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/padding.h"
 
 DECLARE_bool(cudnn_deterministic);
 DECLARE_int64(conv_workspace_size_limit);
diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
index d4fd952a67001..3fbaf2b2d4629 100644
--- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
@@ -14,12 +14,11 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
-
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
diff --git a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h
index ee2faf761fe32..a76545716af97 100644
--- a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h
@@ -14,12 +14,11 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/conv_transpose_kernel.h"
-
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h
index ab1c33d50a456..d9c3333fc24cb 100644
--- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h
@@ -14,9 +14,8 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/determinant_grad_kernel.h"
-
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/determinant_grad_kernel.h"
 #include "paddle/phi/kernels/elementwise_multiply_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h
index f3a611b89c95c..18fb152b28968 100644
--- a/paddle/phi/kernels/impl/determinant_kernel_impl.h
+++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h
@@ -14,17 +14,15 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/determinant_kernel.h"
-
 #include <Eigen/Dense>
 #include <Eigen/LU>
 #include <algorithm>
 #include <cmath>
 #include <vector>
 
-#include "paddle/phi/core/enforce.h"
-
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/determinant_kernel.h"
 
 namespace phi {
 namespace detail {
diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
index 92550de1800e1..49046dfa4d20d 100644
--- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <unsupported/Eigen/SpecialFunctions>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 
diff --git a/paddle/phi/kernels/impl/digamma_kernel_impl.h b/paddle/phi/kernels/impl/digamma_kernel_impl.h
index 8994979e64d70..4547806a38ddb 100644
--- a/paddle/phi/kernels/impl/digamma_kernel_impl.h
+++ b/paddle/phi/kernels/impl/digamma_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <unsupported/Eigen/SpecialFunctions>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 
diff --git a/paddle/phi/kernels/impl/dist_kernel_impl.h b/paddle/phi/kernels/impl/dist_kernel_impl.h
index 397fc1b922433..c4ee7cec34750 100644
--- a/paddle/phi/kernels/impl/dist_kernel_impl.h
+++ b/paddle/phi/kernels/impl/dist_kernel_impl.h
@@ -15,8 +15,10 @@
 #pragma once
 
 #include <math.h>
+
 #include <algorithm>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h
index ae2e31085027f..52d28e481b0e9 100644
--- a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h
@@ -14,13 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
 #include "paddle/phi/kernels/complex_kernel.h"
-
-#include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h
index bfbd6e0c51cfc..f3521c81ce46b 100644
--- a/paddle/phi/kernels/impl/einsum_impl.h
+++ b/paddle/phi/kernels/impl/einsum_impl.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <set>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/reduce_sum_kernel.h"
diff --git a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
index 220de197c8593..96cf08af9634f 100644
--- a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
-
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/reduce_grad.h"
 
diff --git a/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h
index 8577a4e3c6345..d1de47e128e57 100644
--- a/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h
+++ b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h
@@ -14,9 +14,8 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/frobenius_norm_kernel.h"
-
 #include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/frobenius_norm_kernel.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h
index 2517d84898727..655634e319924 100644
--- a/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h
+++ b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <random>
+
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h
index 1ae90960ef445..aef55201a2b12 100644
--- a/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h
index ecd23bbfc1c45..851a78b07413e 100644
--- a/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h
+++ b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <string>
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h
index 9ef6c61fd60fb..fd1c1dbc8d666 100644
--- a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <unsupported/Eigen/SpecialFunctions>
+
 #include "paddle/phi/kernels/funcs/for_range.h"
 namespace phi {
 template <typename T>
diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
index 5641e7a8274f3..be32f85fe99a4 100644
--- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
@@ -14,17 +14,15 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h"
 #include "paddle/phi/kernels/impl/matmul_kernel_impl.h"
 
-#include "paddle/phi/kernels/cpu/reduce.h"
-#include "paddle/phi/kernels/funcs/reduce_functor.h"
-
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
-
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "paddle/phi/kernels/gpu/reduce.h"
 #endif
diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h
index 3201923e1b2c6..99257ce4a6adf 100644
--- a/paddle/phi/kernels/impl/matmul_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h
@@ -14,11 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 
-#include "paddle/phi/core/dense_tensor.h"
-
 namespace phi {
 
 static void GetBroadcastFromDims(const int x_ndim,
diff --git a/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h
index 546ea74674281..4d551b3d82282 100644
--- a/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/maxout_grad_kernel.h"
-
 #include "paddle/fluid/operators/math/maxouting.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/maxout_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/maxout_kernel_impl.h b/paddle/phi/kernels/impl/maxout_kernel_impl.h
index da8c259ebf217..529534d11c8d4 100644
--- a/paddle/phi/kernels/impl/maxout_kernel_impl.h
+++ b/paddle/phi/kernels/impl/maxout_kernel_impl.h
@@ -14,9 +14,8 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/maxout_kernel.h"
-
 #include "paddle/fluid/operators/math/maxouting.h"
+#include "paddle/phi/kernels/maxout_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h b/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h
index b31fc5ac348fb..386bb1b47ef6d 100644
--- a/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/meshgrid_grad_kernel.h"
-
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/meshgrid_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/meshgrid_kernel_impl.h b/paddle/phi/kernels/impl/meshgrid_kernel_impl.h
index 9167cab978a19..e5e7f785b8127 100644
--- a/paddle/phi/kernels/impl/meshgrid_kernel_impl.h
+++ b/paddle/phi/kernels/impl/meshgrid_kernel_impl.h
@@ -14,13 +14,12 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/meshgrid_kernel.h"
-
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/meshgrid_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h
index 825a3b9d56990..93e5e957fd4dc 100644
--- a/paddle/phi/kernels/impl/momentum_kernel_impl.h
+++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h
@@ -14,14 +14,13 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/momentum_kernel.h"
-
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/kernels/funcs/algorithm.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/momentum_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h b/paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h
new file mode 100644
index 0000000000000..8a30082ac366e
--- /dev/null
+++ b/paddle/phi/kernels/impl/nanmedian_grad_kernel_impl.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/nanmedian_grad_kernel.h"
+
+namespace phi {
+// Resizes `input` to (kept dims, axes dims) and runs TransCompute into `x`.
+template <typename T, typename Context>
+void PostprocessMedianGradKernel(const Context& dev_ctx,
+                                 DenseTensor* input,
+                                 const IntArray& raw_axes,
+                                 DenseTensor* x) {
+  auto input_dim = input->dims();
+  auto rank = input_dim.size();
+  // Normalize negative axes by adding rank; other values are left as-is.
+  std::vector<int64_t> axes = raw_axes.GetData();
+  int64_t axes_size = static_cast<int>(axes.size());
+  for (int64_t i = 0; i < axes_size; i++) {
+    if (axes[i] < 0) {
+      axes[i] += rank;
+    }
+  }
+  // trans_back[i]: slot of dim i when non-axes dims precede axes dims.
+  std::vector<int> trans_back;
+  std::vector<int> reshape_back;  // input dims listed in that same order
+  trans_back.reserve(rank);  // redundant: resize() follows
+  trans_back.resize(rank);
+  // First pass: dims NOT in axes take slots 0.. in ascending dim order.
+  int offset = 0;
+  for (int64_t i = 0; i < rank; i++) {
+    if (std::find(axes.begin(), axes.end(), i) == axes.end()) {
+      reshape_back.push_back(input_dim[i]);
+      trans_back[i] = offset;
+      offset += 1;
+    }
+  }
+  // Second pass: dims in axes take the remaining slots, same ordering.
+  for (int64_t i = 0; i < rank; i++) {
+    if (std::find(axes.begin(), axes.end(), i) != axes.end()) {
+      trans_back[i] = offset;
+      reshape_back.push_back(input_dim[i]);
+      offset += 1;
+    }
+  }
+  // Mutates input's dims. No Alloc on x here; presumably the caller's job.
+  input->Resize(make_ddim(reshape_back));
+  funcs::TransCompute<Context, T>(
+      static_cast<int>(trans_back.size()), dev_ctx, *input, x, trans_back);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/nanmedian_kernel_impl.h b/paddle/phi/kernels/impl/nanmedian_kernel_impl.h
new file mode 100644
index 0000000000000..0d3585eb1ce06
--- /dev/null
+++ b/paddle/phi/kernels/impl/nanmedian_kernel_impl.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/nanmedian_kernel.h"
+
+namespace phi {
+// Builds x from input with the raw_axes dims moved last and merged into one.
+template <typename T, typename Context>
+void PreprocessMedianKernel(const Context& dev_ctx,
+                            const DenseTensor& input,
+                            const IntArray& raw_axes,
+                            DenseTensor* x) {
+  auto input_dim = input.dims();
+  auto rank = input_dim.size();
+  std::vector<int> perm;
+  std::vector<int64_t> reshape;
+  // Normalize negative axes by adding rank; other values are left as-is.
+  std::vector<int64_t> axes = raw_axes.GetData();
+  int64_t axes_size = static_cast<int>(axes.size());
+  for (int64_t i = 0; i < axes_size; i++) {
+    if (axes[i] < 0) {
+      axes[i] += rank;
+    }
+  }
+  // perm/reshape start with the dims NOT in axes, in ascending order.
+  for (int64_t i = 0; i < rank; i++) {
+    if (std::find(axes.begin(), axes.end(), i) == axes.end()) {
+      perm.push_back(i);
+      reshape.push_back(input_dim[i]);
+    }
+  }
+  // Then perm gets the axes dims; their sizes are multiplied together.
+  int64_t post_numel = 1;
+  for (int64_t i = 0; i < rank; i++) {
+    if (std::find(axes.begin(), axes.end(), i) != axes.end()) {
+      perm.push_back(i);
+      post_numel *= input_dim[i];
+    }
+  }
+  reshape.push_back(post_numel);
+  // trans_dim: input dims reordered by perm, used as x's shape for the call.
+  DDim trans_dim(input_dim);
+  int ndims = perm.size();
+  for (int i = 0; i < ndims; i++) {
+    trans_dim[i] = input_dim[perm[i]];
+  }
+  x->Resize(trans_dim);
+  dev_ctx.template Alloc<T>(x);
+  funcs::TransCompute<Context, T>(ndims, dev_ctx, input, x, perm);
+  // Final shape: kept dims followed by one dim of size post_numel.
+  x->Resize(make_ddim(reshape));
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/pad_kernel_impl.h b/paddle/phi/kernels/impl/pad_kernel_impl.h
index 8e3ebb0dfe03b..c4ff32f8b3272 100644
--- a/paddle/phi/kernels/impl/pad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pad_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <utility>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/padding.h"
 namespace phi {
diff --git a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h
index f71f6cd990aa1..4e4091328628e 100644
--- a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h
index c5e41b4902951..4a0ebc148acef 100644
--- a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
index 7fe89ce34c8b5..a816deaeb04bd 100644
--- a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
@@ -14,11 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/pool_grad_kernel.h"
-
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/pool_grad_kernel.h"
 #include "paddle/phi/kernels/pool_kernel.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h
index 665d02fd0173e..fb93fc1ce666a 100644
--- a/paddle/phi/kernels/impl/pool_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pool_kernel_impl.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/pool_kernel.h"
-
 #include <algorithm>
+
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/pool_kernel.h"
 
 #if defined(__HIPCC__) || defined(__NVCC__)
 #include "paddle/phi/kernels/funcs/reduce_function.h"
diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
index 9051ef6845966..83dd4a2b576bb 100644
--- a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/reduce_max_grad_kernel.h"
-
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/reduce_grad.h"
+#include "paddle/phi/kernels/reduce_max_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
index 53bd0b7d57f1e..592b5309cd970 100644
--- a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/reduce_min_grad_kernel.h"
-
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/reduce_grad.h"
+#include "paddle/phi/kernels/reduce_min_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
index 3bf103b0fda9c..69775281a259c 100644
--- a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/reduce_prod_grad_kernel.h"
-
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/reduce_grad.h"
+#include "paddle/phi/kernels/reduce_prod_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/reverse_kernel_impl.h b/paddle/phi/kernels/impl/reverse_kernel_impl.h
index acdd46a086583..16ee333f83fa9 100644
--- a/paddle/phi/kernels/impl/reverse_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reverse_kernel_impl.h
@@ -14,10 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/reverse_kernel.h"
-
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/reverse_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h
index 1954c5f20db3e..a01d4ba3aea39 100644
--- a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h
+++ b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h
@@ -16,12 +16,11 @@
 
 #include <math.h>
 
-#include "paddle/phi/kernels/rmsprop_kernel.h"
-
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/phi/kernels/funcs/algorithm.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/rmsprop_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h
index bd0ba26b99a43..e75c3e980ef69 100644
--- a/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
index 8a6df37ab3e35..c1671a1b37adf 100644
--- a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
+++ b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/impl/selu_grad_kernel_impl.h b/paddle/phi/kernels/impl/selu_grad_kernel_impl.h
index d09c87b0a4ed2..4f6550b9bec24 100644
--- a/paddle/phi/kernels/impl/selu_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/selu_grad_kernel_impl.h
@@ -13,9 +13,8 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/phi/kernels/impl/selu_kernel_impl.h"
-
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/impl/selu_kernel_impl.h"
 
 namespace phi {
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/impl/selu_kernel_impl.h b/paddle/phi/kernels/impl/selu_kernel_impl.h
index 888bac42bfd91..288f7bb9b793e 100644
--- a/paddle/phi/kernels/impl/selu_kernel_impl.h
+++ b/paddle/phi/kernels/impl/selu_kernel_impl.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
index 96660c7084be6..40543645b01d1 100644
--- a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
@@ -16,7 +16,6 @@
 
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/core/dense_tensor.h"
-
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h
index 229dcf671f993..4859a7348e5be 100644
--- a/paddle/phi/kernels/impl/set_value_kernel_impl.h
+++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h
@@ -17,7 +17,6 @@
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
-
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
diff --git a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
index a5c67a32553c9..1a6d64ee58a4d 100644
--- a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
@@ -14,11 +14,10 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/slice_grad_kernel.h"
-
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/slice_utils.h"
+#include "paddle/phi/kernels/slice_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h b/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h
index 915bf16a92df1..19df20c0d7cb6 100644
--- a/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h
@@ -14,10 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/softmax_grad_kernel.h"
-
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
+#include "paddle/phi/kernels/softmax_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/softmax_kernel_impl.h b/paddle/phi/kernels/impl/softmax_kernel_impl.h
index 7aa43fdb7f270..aa0ebf2570c66 100644
--- a/paddle/phi/kernels/impl/softmax_kernel_impl.h
+++ b/paddle/phi/kernels/impl/softmax_kernel_impl.h
@@ -14,10 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/kernels/softmax_kernel.h"
-
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
+#include "paddle/phi/kernels/softmax_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h b/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h
index 95780682c98dd..f8b604ef1179b 100644
--- a/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h
@@ -13,9 +13,8 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/phi/kernels/strided_slice_grad_kernel.h"
-
 #include "paddle/phi/kernels/funcs/strided_slice.h"
+#include "paddle/phi/kernels/strided_slice_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/strided_slice_kernel_impl.h b/paddle/phi/kernels/impl/strided_slice_kernel_impl.h
index 81e6d5056267a..5d6c3d8992cb4 100644
--- a/paddle/phi/kernels/impl/strided_slice_kernel_impl.h
+++ b/paddle/phi/kernels/impl/strided_slice_kernel_impl.h
@@ -13,9 +13,8 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/phi/kernels/strided_slice_kernel.h"
-
 #include "paddle/phi/kernels/funcs/strided_slice.h"
+#include "paddle/phi/kernels/strided_slice_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
index 30f2d5a05cdc0..3ea75b036a5a2 100644
--- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
@@ -14,8 +14,6 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/triangular_solve_grad_kernel.h"
-
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
@@ -24,6 +22,7 @@
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/matrix_reduce.h"
 #include "paddle/phi/kernels/funcs/tril_triu_compute.h"
+#include "paddle/phi/kernels/triangular_solve_grad_kernel.h"
 #include "paddle/phi/kernels/triangular_solve_kernel.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h
index dcc7224b5075c..91dbde04aca1f 100644
--- a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/tril_triu_grad_kernel.h"
-
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/tril_triu_compute.h"
+#include "paddle/phi/kernels/tril_triu_grad_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h
index 959169d87cefd..24c032893c3fb 100644
--- a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h
+++ b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/tril_triu_kernel.h"
-
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/tril_triu_compute.h"
+#include "paddle/phi/kernels/tril_triu_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h
index 0724cffdd4448..66fa2a4dc04f5 100644
--- a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/impl/unfold_kernel_impl.h b/paddle/phi/kernels/impl/unfold_kernel_impl.h
index 4526d1c3dcd7d..3b75e149f48e2 100644
--- a/paddle/phi/kernels/impl/unfold_kernel_impl.h
+++ b/paddle/phi/kernels/impl/unfold_kernel_impl.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h
index be7e4ce3e3488..2a661a3fd3853 100644
--- a/paddle/phi/kernels/instance_norm_grad_kernel.h
+++ b/paddle/phi/kernels/instance_norm_grad_kernel.h
@@ -21,10 +21,10 @@ namespace phi {
 template <typename T, typename Context>
 void InstanceNormGradKernel(const Context& dev_ctx,
                             const DenseTensor& x,
-                            const DenseTensor& y_grad,
                             const paddle::optional<DenseTensor>& scale,
                             const DenseTensor& saved_mean,
                             const DenseTensor& saved_variance,
+                            const DenseTensor& y_grad,
                             float epsilon,
                             DenseTensor* x_grad,
                             DenseTensor* scale_grad,
diff --git a/paddle/phi/kernels/kldiv_loss_kernel.h b/paddle/phi/kernels/kldiv_loss_kernel.h
index 103780ab74728..7c6cc231c9480 100644
--- a/paddle/phi/kernels/kldiv_loss_kernel.h
+++ b/paddle/phi/kernels/kldiv_loss_kernel.h
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu
index c315ce2fa9dde..0b0990627f0be 100644
--- a/paddle/phi/kernels/kps/compare_kernel.cu
+++ b/paddle/phi/kernels/kps/compare_kernel.cu
@@ -20,7 +20,9 @@
 #include "paddle/phi/backends/xpu/xpu_context.h"
 #else
 #include <thrust/fill.h>
+
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/compare_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
diff --git a/paddle/phi/kernels/kps/reduce_all_kernel.cu b/paddle/phi/kernels/kps/reduce_all_kernel.cu
index dc6355a213ffb..0459acd982269 100644
--- a/paddle/phi/kernels/kps/reduce_all_kernel.cu
+++ b/paddle/phi/kernels/kps/reduce_all_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_all_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_all_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/kps/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu
index dd63b05bda1fb..bc997c6c4e3b6 100644
--- a/paddle/phi/kernels/kps/reduce_max_kernel.cu
+++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_max_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_max_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/kps/reduce_mean_kernel.cu b/paddle/phi/kernels/kps/reduce_mean_kernel.cu
index 8e4a65df12263..c4ecd4380c306 100644
--- a/paddle/phi/kernels/kps/reduce_mean_kernel.cu
+++ b/paddle/phi/kernels/kps/reduce_mean_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_mean_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_mean_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/kps/reduce_min_kernel.cu b/paddle/phi/kernels/kps/reduce_min_kernel.cu
index 59d69c29decdf..6fea48b588abb 100644
--- a/paddle/phi/kernels/kps/reduce_min_kernel.cu
+++ b/paddle/phi/kernels/kps/reduce_min_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_min_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_min_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/kps/reduce_sum_kernel.cu b/paddle/phi/kernels/kps/reduce_sum_kernel.cu
index e800e4685ec04..f219abd3348a6 100644
--- a/paddle/phi/kernels/kps/reduce_sum_kernel.cu
+++ b/paddle/phi/kernels/kps/reduce_sum_kernel.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_sum_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/reduce_sum_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/masked_select_grad_kernel.h b/paddle/phi/kernels/masked_select_grad_kernel.h
index db7d105093d2a..f8aa06024c58e 100644
--- a/paddle/phi/kernels/masked_select_grad_kernel.h
+++ b/paddle/phi/kernels/masked_select_grad_kernel.h
@@ -24,4 +24,4 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
                             const DenseTensor& out_grad,
                             DenseTensor* x_grad);
 
-}  // namspace phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/masked_select_kernel.h b/paddle/phi/kernels/masked_select_kernel.h
index 471f650690d36..d6fef8569d751 100644
--- a/paddle/phi/kernels/masked_select_kernel.h
+++ b/paddle/phi/kernels/masked_select_kernel.h
@@ -23,4 +23,4 @@ void MaskedSelectKernel(const Context& dev_ctx,
                         const DenseTensor& mask,
                         DenseTensor* out);
 
-}  // namspace phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/matmul_kernel.h b/paddle/phi/kernels/matmul_kernel.h
index a4c4971499fdf..7f4de8d5792ac 100644
--- a/paddle/phi/kernels/matmul_kernel.h
+++ b/paddle/phi/kernels/matmul_kernel.h
@@ -16,7 +16,6 @@
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/infermeta/binary.h"
-
 #include "paddle/phi/kernels/empty_kernel.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/mv_kernel.h b/paddle/phi/kernels/mv_kernel.h
index ab4f0b82794ab..df4626f4d49d4 100644
--- a/paddle/phi/kernels/mv_kernel.h
+++ b/paddle/phi/kernels/mv_kernel.h
@@ -24,4 +24,4 @@ void MvKernel(const Context& ctx,
               const DenseTensor& vec,
               DenseTensor* out);
 
-}  // namepsace phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/nanmedian_grad_kernel.h b/paddle/phi/kernels/nanmedian_grad_kernel.h
index dc7321c1aa751..e8fb01b7060a7 100644
--- a/paddle/phi/kernels/nanmedian_grad_kernel.h
+++ b/paddle/phi/kernels/nanmedian_grad_kernel.h
@@ -13,55 +13,12 @@
 // limitations under the License.
 
 #pragma once
+
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace phi {
 
-template <typename T, typename Context>
-void PostprocessMedianGradKernel(const Context& dev_ctx,
-                                 DenseTensor* input,
-                                 const IntArray& raw_axes,
-                                 DenseTensor* x) {
-  auto input_dim = input->dims();
-  auto rank = input_dim.size();
-
-  std::vector<int64_t> axes = raw_axes.GetData();
-  int64_t axes_size = static_cast<int>(axes.size());
-  for (int64_t i = 0; i < axes_size; i++) {
-    if (axes[i] < 0) {
-      axes[i] += rank;
-    }
-  }
-
-  std::vector<int> trans_back;
-  std::vector<int> reshape_back;
-  trans_back.reserve(rank);
-  trans_back.resize(rank);
-
-  int offset = 0;
-  for (int64_t i = 0; i < rank; i++) {
-    if (std::find(axes.begin(), axes.end(), i) == axes.end()) {
-      reshape_back.push_back(input_dim[i]);
-      trans_back[i] = offset;
-      offset += 1;
-    }
-  }
-
-  for (int64_t i = 0; i < rank; i++) {
-    if (std::find(axes.begin(), axes.end(), i) != axes.end()) {
-      trans_back[i] = offset;
-      reshape_back.push_back(input_dim[i]);
-      offset += 1;
-    }
-  }
-
-  input->Resize(make_ddim(reshape_back));
-  funcs::TransCompute<Context, T>(
-      static_cast<int>(trans_back.size()), dev_ctx, *input, x, trans_back);
-}
-
 template <typename T, typename Context>
 void NanmedianGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
diff --git a/paddle/phi/kernels/nanmedian_kernel.h b/paddle/phi/kernels/nanmedian_kernel.h
index 374f420381bdc..4bb382a443144 100644
--- a/paddle/phi/kernels/nanmedian_kernel.h
+++ b/paddle/phi/kernels/nanmedian_kernel.h
@@ -13,58 +13,12 @@
 // limitations under the License.
 
 #pragma once
+
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace phi {
 
-template <typename T, typename Context>
-void PreprocessMedianKernel(const Context& dev_ctx,
-                            const DenseTensor& input,
-                            const IntArray& raw_axes,
-                            DenseTensor* x) {
-  auto input_dim = input.dims();
-  auto rank = input_dim.size();
-  std::vector<int> perm;
-  std::vector<int64_t> reshape;
-
-  std::vector<int64_t> axes = raw_axes.GetData();
-  int64_t axes_size = static_cast<int>(axes.size());
-  for (int64_t i = 0; i < axes_size; i++) {
-    if (axes[i] < 0) {
-      axes[i] += rank;
-    }
-  }
-
-  for (int64_t i = 0; i < rank; i++) {
-    if (std::find(axes.begin(), axes.end(), i) == axes.end()) {
-      perm.push_back(i);
-      reshape.push_back(input_dim[i]);
-    }
-  }
-
-  int64_t post_numel = 1;
-  for (int64_t i = 0; i < rank; i++) {
-    if (std::find(axes.begin(), axes.end(), i) != axes.end()) {
-      perm.push_back(i);
-      post_numel *= input_dim[i];
-    }
-  }
-  reshape.push_back(post_numel);
-
-  DDim trans_dim(input_dim);
-  int ndims = perm.size();
-  for (int i = 0; i < ndims; i++) {
-    trans_dim[i] = input_dim[perm[i]];
-  }
-  x->Resize(trans_dim);
-  dev_ctx.template Alloc<T>(x);
-  funcs::TransCompute<Context, T>(ndims, dev_ctx, input, x, perm);
-
-  x->Resize(make_ddim(reshape));
-}
-
 template <typename T, typename Context>
 void NanmedianKernel(const Context& dev_ctx,
                      const DenseTensor& x,
diff --git a/paddle/phi/kernels/pixel_shuffle_grad_kernel.h b/paddle/phi/kernels/pixel_shuffle_grad_kernel.h
index be57de5da4053..c42731d354b5a 100644
--- a/paddle/phi/kernels/pixel_shuffle_grad_kernel.h
+++ b/paddle/phi/kernels/pixel_shuffle_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/pixel_shuffle_kernel.h b/paddle/phi/kernels/pixel_shuffle_kernel.h
index 18b9ab9c21fdc..bf7c9f07224a0 100644
--- a/paddle/phi/kernels/pixel_shuffle_kernel.h
+++ b/paddle/phi/kernels/pixel_shuffle_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/pixel_unshuffle_grad_kernel.h b/paddle/phi/kernels/pixel_unshuffle_grad_kernel.h
index 868633e56be50..43919d9e63f94 100644
--- a/paddle/phi/kernels/pixel_unshuffle_grad_kernel.h
+++ b/paddle/phi/kernels/pixel_unshuffle_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/pixel_unshuffle_kernel.h b/paddle/phi/kernels/pixel_unshuffle_kernel.h
index 179e2b6639f9e..f91326d384c39 100644
--- a/paddle/phi/kernels/pixel_unshuffle_kernel.h
+++ b/paddle/phi/kernels/pixel_unshuffle_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/pool_grad_kernel.h b/paddle/phi/kernels/pool_grad_kernel.h
index 0658dc22c823b..d26bee2eb2c34 100644
--- a/paddle/phi/kernels/pool_grad_kernel.h
+++ b/paddle/phi/kernels/pool_grad_kernel.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/pool_kernel.h b/paddle/phi/kernels/pool_kernel.h
index 348af02181517..b9a4c830fa5bd 100644
--- a/paddle/phi/kernels/pool_kernel.h
+++ b/paddle/phi/kernels/pool_kernel.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
index fabc6c0d13e7c..b5df98671f0b0 100644
--- a/paddle/phi/kernels/primitive/compute_primitives.h
+++ b/paddle/phi/kernels/primitive/compute_primitives.h
@@ -487,28 +487,28 @@ __device__ __forceinline__ void Reduce(T* out,
 }
 
 /*
-* @brief Fill register with a constant according to OpFunc
-*
-* @template paraments
-* InT: The data type of in1 and in2.
-* OutT: The data type of out.
-* NX: The number of data columns loaded by each thread.
-* NY: The number of data rows loaded by each thread.
-* BlockSize: Identifies the current device thread index method. Currently only
-* GPU was supported.
-* OpFunc: Compute functor which has an operator() as following
-*     template <typename InT>
-*     struct XxxFunctor {
-*       HOSTDEVICE InT operator()()
-* const {
-*         return a;
-*       }
-*     };
-*
-* @param
-* out: The register pointer of out, the size is NX * NY.
-* compute: Compute function which was declared like OpFunc<InT>().
-*/
+ * @brief Fill register with a constant according to OpFunc
+ *
+ * @template parameters
+ * InT: The data type of in1 and in2.
+ * OutT: The data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. Currently only
+ * GPU was supported.
+ * OpFunc: Compute functor which has an operator() as following
+ *     template <typename InT>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE InT operator()()
+ * const {
+ *         return a;
+ *       }
+ *     };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * compute: Compute function which was declared like OpFunc<InT>().
+ */
 template <typename InT,
           typename OutT,
           int NX,
@@ -523,31 +523,31 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
 }
 
 /*
-* @brief Get ReturnsCount random data fromm compute according to state, state
-* can be curandStatePhilox4_32_10_t, hiprandStatePhilox4_32_10_t which has beed
-* initialized.
-*
-* @template paraments
-* StateType: the type of state, can be curandStatePhilox4_32_10_t or
-* hiprandStatePhilox4_32_10_t.
-* OutT: the type of out register.
-* ReturnsCount: The number of random data generated by OpFunc.
-* BlockSize: Identifies the current device thread index method. Currently only
-* GPU was supported.
-* OpFunc: Compute functor which has an operator() as following
-*     template <typename T>
-*     struct XxxFunctor {
-*       HOSTDEVICE InT operator()(StateType state)
-* const {
-*         return ranomd(state);  // Returns ReturnsCount random numbers with
-* data type T
-*       }
-*     };
-*
-* @param
-* out: The register pointer of out, the size is NX * NY.
-* compute: Compute function which was declared like OpFunc<T>().
-*/
+ * @brief Get ReturnsCount random data from compute according to state, state
+ * can be curandStatePhilox4_32_10_t, hiprandStatePhilox4_32_10_t which has been
+ * initialized.
+ *
+ * @template parameters
+ * StateType: the type of state, can be curandStatePhilox4_32_10_t or
+ * hiprandStatePhilox4_32_10_t.
+ * OutT: the type of out register.
+ * ReturnsCount: The number of random data generated by OpFunc.
+ * BlockSize: Identifies the current device thread index method. Currently only
+ * GPU was supported.
+ * OpFunc: Compute functor which has an operator() as following
+ *     template <typename T>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE InT operator()(StateType state)
+ * const {
+ *         return random(state);  // Returns ReturnsCount random numbers with
+ * data type T
+ *       }
+ *     };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * compute: Compute function which was declared like OpFunc<T>().
+ */
 
 template <typename StateType,
           typename OutT,
@@ -565,28 +565,28 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
 }
 
 /*
-* @brief Complete the prefix and in the block, each thread calculates 2 data,
-* the size of out and in is 2, and BlockDim.x must be less then 512.
-*
-* @template paraments
-* InT: the type of input register.
-* OutT: the type of out register.
-* BlockSize: Identifies the current device thread index method. Currently only
-* GPU was supported.
-* OpFunc: Compute functor which has an operator() as following
-*     template <typename T>
-*     struct XxxFunctor {
-*       HOSTDEVICE InT operator()(T a, T b)
-* const {
-*         return a + b;
-*       }
-*     };
-*
-* @param
-* out: The register pointer of out, the size is 2;
-* in: The register pointer of input, the size is 2;
-* compute: Compute function which was declared like OpFunc<T>().
-*/
+ * @brief Compute the prefix sum in the block, each thread calculates 2 data,
+ * the size of out and in is 2, and BlockDim.x must be less than 512.
+ *
+ * @template parameters
+ * InT: the type of input register.
+ * OutT: the type of out register.
+ * BlockSize: Identifies the current device thread index method. Currently only
+ * GPU was supported.
+ * OpFunc: Compute functor which has an operator() as following
+ *     template <typename T>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE InT operator()(T a, T b)
+ * const {
+ *         return a + b;
+ *       }
+ *     };
+ *
+ * @param
+ * out: The register pointer of out, the size is 2;
+ * in: The register pointer of input, the size is 2;
+ * compute: Compute function which was declared like OpFunc<T>().
+ */
 
 #define SHARED_SIZE_LIMIT 512
 template <typename InT, typename OutT, int BlockSize, class OpFunc>
@@ -626,22 +626,22 @@ __device__ __forceinline__ void Cumsum(OutT* out,
 #undef SHARED_SIZE_LIMIT
 
 /*
-* @brief Sort data in this block, each thread calculates 2 data, the size of out
-* and in is 2, and BlockDim.x must be less then 512.
-*
-* @template paraments
-* InT: the type of input register.
-* OutT: the type of out register.
-* BlockSize: Identifies the current device thread index method. Currently only
-* GPU was supported.
-*
-* @param
-* out: The register pointer of out, the size is 2.
-* in: The register pointer of input, the size is 2.
-* num: The num of this block
-* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles
-* sorted in escending.
-*/
+ * @brief Sort data in this block, each thread calculates 2 data, the size of
+ * out and in is 2, and BlockDim.x must be less than 512.
+ *
+ * @template parameters
+ * InT: the type of input register.
+ * OutT: the type of out register.
+ * BlockSize: Identifies the current device thread index method. Currently only
+ * GPU was supported.
+ *
+ * @param
+ * out: The register pointer of out, the size is 2.
+ * in: The register pointer of input, the size is 2.
+ * num: The num of this block
+ * monotonic_type: if monotonic_type = 1 then sorted in ascending order, else
+ * sorted in descending.
+ */
 #define SHARED_SIZE_LIMIT 1024
 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
 // larger than blockDim.x * 2
@@ -682,25 +682,25 @@ __device__ __forceinline__ void Sort(OutT* out,
 }
 
 /*
-* @brief Sort data with data_index in this block, each thread calculates 2 data,
-* the size of out and in is 2, and BlockDim.x must be less then 512.
-*
-* @template paraments
-* InT: The type of input register.
-* OutT: The type of out register.
-* IndexType: The type of index.
-* BlockSize: Identifies the current device thread index method. Currently only
-* GPU was supported.
-*
-* @param
-* out: The register pointer of out, the size is 2.
-* out_index: The register pointer of out_index, the size is 2.
-* in: The register pointer of input, the size is 2.
-* in_index: The register pointer of in_index, the size is 2.
-* num: The num of this block.
-* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles
-* sorted in escending.
-*/
+ * @brief Sort data with data_index in this block, each thread calculates 2
+ * data, the size of out and in is 2, and BlockDim.x must be less than 512.
+ *
+ * @template parameters
+ * InT: The type of input register.
+ * OutT: The type of out register.
+ * IndexType: The type of index.
+ * BlockSize: Identifies the current device thread index method. Currently only
+ * GPU was supported.
+ *
+ * @param
+ * out: The register pointer of out, the size is 2.
+ * out_index: The register pointer of out_index, the size is 2.
+ * in: The register pointer of input, the size is 2.
+ * in_index: The register pointer of in_index, the size is 2.
+ * num: The num of this block.
+ * monotonic_type: if monotonic_type = 1 then sorted in ascending order, else
+ * sorted in descending.
+ */
 template <typename InT, typename OutT, typename IndexType, int BlockSize>
 __device__ __forceinline__ void Sort(OutT* out,
                                      IndexType* out_index,
diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
index eb45def836edc..6ec05ee505443 100644
--- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
@@ -361,28 +361,28 @@ __device__ __forceinline__ void Reduce(T* out,
 }
 
 /*
-* @brief Fill register with a constant according to OpFunc
-*
-* @template paraments
-* InT: The data type of in1 and in2.
-* OutT: The data type of out.
-* NX: The number of data columns loaded by each thread.
-* NY: The number of data rows loaded by each thread.
-* BlockSize: Identifies the current device thread index method. For xpu,
-* core_id() is used as the index.
-* OpFunc: Compute functor which has an operator() as following
-*     template <typename InT>
-*     struct XxxFunctor {
-*       HOSTDEVICE InT operator()()
-* const {
-*         return a;
-*       }
-*     };
-*
-* @param
-* out: The register pointer of out, the size is NX * NY.
-* compute: Compute function which was declared like OpFunc<InT>().
-*/
+ * @brief Fill register with a constant according to OpFunc
+ *
+ * @template parameters
+ * InT: The data type of in1 and in2.
+ * OutT: The data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * OpFunc: Compute functor which has an operator() as following
+ *     template <typename InT>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE InT operator()()
+ * const {
+ *         return a;
+ *       }
+ *     };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * compute: Compute function which was declared like OpFunc<InT>().
+ */
 template <typename InT,
           typename OutT,
           int NX,
diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc
index 129a69d4e4e0f..35f85ba86aa34 100644
--- a/paddle/phi/kernels/reshape_grad_kernel.cc
+++ b/paddle/phi/kernels/reshape_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/reshape_grad_kernel.h"
+
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc
index efcad999b447d..a723ea19d3456 100644
--- a/paddle/phi/kernels/reshape_kernel.cc
+++ b/paddle/phi/kernels/reshape_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/reshape_kernel.h"
+
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/infermeta/unary.h"
diff --git a/paddle/phi/kernels/selected_rows/CMakeLists.txt b/paddle/phi/kernels/selected_rows/CMakeLists.txt
index c6fb621ffc075..520536d82352a 100644
--- a/paddle/phi/kernels/selected_rows/CMakeLists.txt
+++ b/paddle/phi/kernels/selected_rows/CMakeLists.txt
@@ -1,3 +1,13 @@
-
-set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows selected_rows_functor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel)
+set(SELECTED_ROWS_KERNEL_DEPS
+    dense_tensor
+    selected_rows
+    selected_rows_functor
+    sparse_csr_tensor
+    kernel_context
+    kernel_factory
+    arg_map_context
+    convert_utils
+    lod_utils
+    math_function
+    custom_kernel)
 register_kernels(DEPS ${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows")
diff --git a/paddle/phi/kernels/selected_rows/activation_kernel.cc b/paddle/phi/kernels/selected_rows/activation_kernel.cc
index 438a080a635f0..4a27d0763a235 100644
--- a/paddle/phi/kernels/selected_rows/activation_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/activation_kernel.cc
@@ -14,12 +14,10 @@
 
 #include "paddle/phi/kernels/selected_rows/activation_kernel.h"
 
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/activation_kernel.h"
-
 #include "paddle/phi/backends/cpu/cpu_context.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/activation_kernel.h"
 
 namespace phi {
 namespace sr {
diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc
index 03cd7fed411f3..14987bc61b159 100644
--- a/paddle/phi/kernels/selected_rows/full_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/full_kernel.cc
@@ -18,11 +18,10 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #endif
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/full_kernel.h"
-
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 
 namespace phi {
 namespace sr {
diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu
index 18b6da818a1f3..ec9fed6e3d9eb 100644
--- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu
+++ b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
-
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -23,6 +21,7 @@
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/adam_functors.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
 
 namespace phi {
 namespace sr {
@@ -208,28 +207,28 @@ void AdamDenseParamSparseGradKernel(
     int ndim = param.numel();
     int blocks = (ndim + threads - 1) / threads;
 
-    SparseAdamCUDAKernelREG<T,
-                            MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        beta1_,
-        beta2_,
-        epsilon_,
-        *beta1_pow.data<MPDType>(),
-        *beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad_data,
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        rows,
-        row_numel,
-        grad_merge.rows().size(),
-        lazy_mode,
-        ndim);
+    SparseAdamCUDAKernelREG<T, MPDType>
+        <<<blocks, threads, 0, dev_ctx.stream()>>>(
+            beta1_,
+            beta2_,
+            epsilon_,
+            *beta1_pow.data<MPDType>(),
+            *beta2_pow.data<MPDType>(),
+            moment1.data<MPDType>(),
+            dev_ctx.template Alloc<MPDType>(moment1_out),
+            moment2.data<MPDType>(),
+            dev_ctx.template Alloc<MPDType>(moment2_out),
+            learning_rate.data<MPDType>(),
+            grad_data,
+            param.data<T>(),
+            dev_ctx.template Alloc<T>(param_out),
+            master_in_data,
+            master_out_data,
+            rows,
+            row_numel,
+            grad_merge.rows().size(),
+            lazy_mode,
+            ndim);
     if (!use_global_beta_pow) {
       // Update with cpu
       dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
index 182c4390b1722..35a349a277d74 100644
--- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
+++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/selected_rows/adamw_kernel.h"
-
 #include <math.h>  // for sqrt in CPU and CUDA
+
 #include <vector>
 
 #include "paddle/fluid/framework/tensor_util.h"
@@ -26,6 +25,7 @@
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/adam_functors.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/selected_rows/adamw_kernel.h"
 
 namespace phi {
 namespace sr {
@@ -230,30 +230,30 @@ void AdamwDenseParamSparseGradKernel(
     int ndim = param.numel();
     int blocks = (ndim + threads - 1) / threads;
 
-    SparseAdamWCUDAKernelREG<T,
-                             MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        beta1_,
-        beta2_,
-        epsilon_,
-        coeff_,
-        lr_ratio_,
-        *beta1_pow.data<MPDType>(),
-        *beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad_data,
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        rows,
-        row_numel,
-        grad_merge.rows().size(),
-        lazy_mode,
-        ndim);
+    SparseAdamWCUDAKernelREG<T, MPDType>
+        <<<blocks, threads, 0, dev_ctx.stream()>>>(
+            beta1_,
+            beta2_,
+            epsilon_,
+            coeff_,
+            lr_ratio_,
+            *beta1_pow.data<MPDType>(),
+            *beta2_pow.data<MPDType>(),
+            moment1.data<MPDType>(),
+            dev_ctx.template Alloc<MPDType>(moment1_out),
+            moment2.data<MPDType>(),
+            dev_ctx.template Alloc<MPDType>(moment2_out),
+            learning_rate.data<MPDType>(),
+            grad_data,
+            param.data<T>(),
+            dev_ctx.template Alloc<T>(param_out),
+            master_in_data,
+            master_out_data,
+            rows,
+            row_numel,
+            grad_merge.rows().size(),
+            lazy_mode,
+            ndim);
     if (!use_global_beta_pow) {
       // Update with cpu
       dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
diff --git a/paddle/phi/kernels/selected_rows/gpu/clip_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/clip_kernel.cu
index a8d659559e19e..b9f4febb3b48d 100644
--- a/paddle/phi/kernels/selected_rows/gpu/clip_kernel.cu
+++ b/paddle/phi/kernels/selected_rows/gpu/clip_kernel.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/selected_rows/clip_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/selected_rows/clip_kernel.h"
 #include "paddle/phi/kernels/selected_rows/impl/clip_kernel_impl.h"
 
 PD_REGISTER_KERNEL(clip_sr,
diff --git a/paddle/phi/kernels/selected_rows/impl/clip_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/clip_kernel_impl.h
index 1d95e633b93a6..c39d386746773 100644
--- a/paddle/phi/kernels/selected_rows/impl/clip_kernel_impl.h
+++ b/paddle/phi/kernels/selected_rows/impl/clip_kernel_impl.h
@@ -14,13 +14,12 @@
 
 #pragma once
 
-#include "paddle/phi/kernels/selected_rows/clip_kernel.h"
-
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/kernels/selected_rows/clip_kernel.h"
 
 namespace phi {
 namespace sr {
diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc
index 67126d82042b2..575bcc0d09fd6 100644
--- a/paddle/phi/kernels/selected_rows/shape_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/selected_rows/shape_kernel.h"
+
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
diff --git a/paddle/phi/kernels/selu_kernel.h b/paddle/phi/kernels/selu_kernel.h
index cd5d27e98ccc1..b8130d2398691 100644
--- a/paddle/phi/kernels/selu_kernel.h
+++ b/paddle/phi/kernels/selu_kernel.h
@@ -25,4 +25,4 @@ void SeluKernel(const Context& dev_ctx,
                 float scale,
                 float alpha,
                 DenseTensor* out);
-}  // phi
+}  // namespace phi
diff --git a/paddle/phi/kernels/shape_kernel.cc b/paddle/phi/kernels/shape_kernel.cc
index f87b5014c1207..ea48ea6171e6c 100644
--- a/paddle/phi/kernels/shape_kernel.cc
+++ b/paddle/phi/kernels/shape_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/shape_kernel.h"
+
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/kernels/sparse/CMakeLists.txt b/paddle/phi/kernels/sparse/CMakeLists.txt
index 479d530429498..6c5e7dee4cb35 100644
--- a/paddle/phi/kernels/sparse/CMakeLists.txt
+++ b/paddle/phi/kernels/sparse/CMakeLists.txt
@@ -1,3 +1,13 @@
-
-set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel copy_kernel)
+set(SPARSE_KERNEL_DEPS
+    dense_tensor
+    sparse_coo_tensor
+    sparse_csr_tensor
+    kernel_context
+    kernel_factory
+    arg_map_context
+    convert_utils
+    lod_utils
+    math_function
+    custom_kernel
+    copy_kernel)
 register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse")
diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc
index 22c5e14b35f56..9d1f71afceb5e 100644
--- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/coalesced_kernel.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/funcs/sparse/flatten_indices.h"
diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
index 34337db558c8a..5a981fb8df350 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
+
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
index d133464ab853c..1b95de890deeb 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/sparse/cpu/convolution.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/sparse/cpu/convolution.h"
 
 namespace phi {
 namespace sparse {
@@ -25,7 +25,7 @@ namespace sparse {
  * x: (N, D, H, W, C)
  * kernel: (D, H, W, C, OC)
  * out: (N, D, H, W, OC)
-**/
+ **/
 template <typename T, typename IntT = int>
 void Conv3dCPUKernel(const CPUContext& dev_ctx,
                      const SparseCooTensor& x,
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc
index 0e5714b174361..37579ae85640d 100644
--- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"
+
+#include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -22,8 +24,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/sparse/flatten_indices.h"
 
-#include "paddle/phi/api/ext/dispatch.h"
-
 namespace phi {
 namespace sparse {
 
@@ -73,7 +73,7 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx,
  * @brief Filter the DenseTensor x by the
  * mask.non_zero_indices() and output a SparseCooTensor
  * x and mask must have the same shape.
-**/
+ **/
 template <typename T, typename Context>
 void SparseMaskKernel(const Context& dev_ctx,
                       const DenseTensor& x,
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
index 71a0095395552..fdf8e5aa7ebf2 100644
--- a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
index 28211a1cda347..7655913374dbd 100644
--- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/core/visit_type.h"
@@ -27,7 +28,7 @@ namespace sparse {
  * x: (N, D, H, W, C)
  * kernel: (D, H, W, C, OC)
  * out: (N, D, H, W, OC)
-**/
+ **/
 template <typename T, typename IntT = int>
 void MaxPoolCPUKernel(const CPUContext& dev_ctx,
                       const SparseCooTensor& x,
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
index 2301d31d7a6c2..28b1b3368ed42 100644
--- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
+
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu
index b2e7884580c74..7d9e566916add 100644
--- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu
@@ -133,17 +133,15 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx,
 
   // 5. scatter the values
   config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1);
-  phi::funcs::sparse::ScatterKernel<T><<<config.block_per_grid,
-                                         config.thread_per_block,
-                                         0,
-                                         dev_ctx.stream()>>>(
-      x_values_ptr,
-      public_indexs.data<int>(),
-      values_indexs_ptr,
-      out_nnz,
-      nnz,
-      stride,
-      out_values.data<T>());
+  phi::funcs::sparse::ScatterKernel<T>
+      <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
+          x_values_ptr,
+          public_indexs.data<int>(),
+          values_indexs_ptr,
+          out_nnz,
+          nnz,
+          stride,
+          out_values.data<T>());
 
   // 6. convert index to coordinate
   Dim<DDim::kMaxRank> const_dims;
diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
index fcbb3c60183eb..24a7387d4fe19 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h
+++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
@@ -45,7 +45,7 @@ using Dims4D = phi::funcs::sparse::Dims4D;
  * output: the outputs
  * index_size: the size of indices
  * slice_size: slice size corresponding to each index, here is the channel size
-**/
+ **/
 template <typename T, typename IndexT = int>
 __global__ void GatherKernel(const T* params,
                              const IndexT* indices,
@@ -115,7 +115,7 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx,
  * out_dims: indicates the output dims
  * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
  * rulebook_out_indexs: the output index in rulebook
-**/
+ **/
 template <typename T>
 __global__ void UpdateIndexKernel(const T* unique_keys,
                                   const int* unique_values,
@@ -198,7 +198,7 @@ __global__ void UpdateOutIndexAndCounterAfterLowerBound(
  * rulebook: the rulebook to save the kernel index, input index and output index
  * counter: save the number of times each location in the kernel participates in
  *the caculation
-**/
+ **/
 template <typename T>
 __global__ void ProductRuleBookKernel(const T* x_indices,
                                       const Dims4D x_dims,
@@ -421,8 +421,8 @@ int ProductRuleBook(const Context& dev_ctx,
                                 rulebook_ptr,
                                 rulebook_ptr + 3 * rulebook_len,
                                 -1);
-    phi::funcs::sparse::DistanceKernel<IntT><<<1, 1, 0, dev_ctx.stream()>>>(
-        rulebook_ptr, last, bound_ptr);
+    phi::funcs::sparse::DistanceKernel<IntT>
+        <<<1, 1, 0, dev_ctx.stream()>>>(rulebook_ptr, last, bound_ptr);
     phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
                                        bound_ptr,
                                        sizeof(IntT),
@@ -525,18 +525,18 @@ int ProductRuleBook(const Context& dev_ctx,
 
     config =
         phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
-    UpdateIndexKernel<IntT><<<config.block_per_grid.x,
-                              config.thread_per_block.x,
-                              0,
-                              dev_ctx.stream()>>>(
-        unique_key_ptr,
-        unique_value_ptr,
-        out_index_ptr,
-        out_non_zero_num,
-        rulebook_len,
-        d_out_dims,
-        out_indices_ptr,
-        rulebook_ptr + 2 * rulebook_len);
+    UpdateIndexKernel<IntT>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(unique_key_ptr,
+                               unique_value_ptr,
+                               out_index_ptr,
+                               out_non_zero_num,
+                               rulebook_len,
+                               d_out_dims,
+                               out_indices_ptr,
+                               rulebook_ptr + 2 * rulebook_len);
     out->SetMember(out_indices, out_values, out_dims, true);
   } else {
     DenseTensor out_indices =
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
index c19bf67be2611..d83d064418eec 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
@@ -150,15 +150,15 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx,
 
   config = phi::backends::gpu::GetGpuLaunchConfig1D(
       dev_ctx, rulebook_len * out_channels, 1);
-  GatherKernel<T, IntT><<<config.block_per_grid.x,
-                          config.thread_per_block.x,
-                          0,
-                          dev_ctx.stream()>>>(
-      out_grad.non_zero_elements().data<T>(),
-      rulebook_ptr + rulebook_len * 2,
-      out_grad_features_ptr,
-      rulebook_len,
-      out_channels);
+  GatherKernel<T, IntT>
+      <<<config.block_per_grid.x,
+         config.thread_per_block.x,
+         0,
+         dev_ctx.stream()>>>(out_grad.non_zero_elements().data<T>(),
+                             rulebook_ptr + rulebook_len * 2,
+                             out_grad_features_ptr,
+                             rulebook_len,
+                             out_channels);
 
   const T* kernel_ptr = kernel.data<T>();
   for (int i = 0; i < kernel_size; i++) {
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
index 30f0482a0cc36..c3b6c8c6abcc8 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
@@ -157,37 +157,37 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx,
     set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
     config =
         phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1);
-    phi::funcs::ScatterCUDAKernel<T, IntT><<<config.block_per_grid,
-                                             config.thread_per_block,
-                                             0,
-                                             dev_ctx.stream()>>>(
-        out_features_ptr,
-        rulebook_ptr + 2 * n,
-        out_values_ptr,
-        n,
-        out_channels,
-        false);
+    phi::funcs::ScatterCUDAKernel<T, IntT>
+        <<<config.block_per_grid,
+           config.thread_per_block,
+           0,
+           dev_ctx.stream()>>>(out_features_ptr,
+                               rulebook_ptr + 2 * n,
+                               out_values_ptr,
+                               n,
+                               out_channels,
+                               false);
   } else {
     config = phi::backends::gpu::GetGpuLaunchConfig1D(
         dev_ctx, out->nnz() * out_channels, 1);
-    phi::funcs::sparse::ScatterKernel<T><<<config.block_per_grid.x,
-                                           config.thread_per_block.x,
-                                           0,
-                                           dev_ctx.stream()>>>(
-        out_features_ptr,
-        unique_value.data<int>(),
-        out_index.data<int>(),
-        out->nnz(),
-        n,
-        out_channels,
-        out_values_ptr);
+    phi::funcs::sparse::ScatterKernel<T>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(out_features_ptr,
+                               unique_value.data<int>(),
+                               out_index.data<int>(),
+                               out->nnz(),
+                               n,
+                               out_channels,
+                               out_values_ptr);
   }
 }
 /**
  * x: (N, D, H, W, C)
  * kernel: (D, H, W, C, OC)
  * out: (N, D, H, W, OC)
-**/
+ **/
 template <typename T, typename Context>
 void Conv3dKernel(const Context& dev_ctx,
                   const SparseCooTensor& x,
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu
index 81c63c48ebff2..cbbdc122f616f 100644
--- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu
@@ -108,7 +108,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx,
  * @brief Filter the DenseTensor x by the
  * mask.non_zero_indices() and output a SparseCooTensor
  * x and mask must have the same shape.
-**/
+ **/
 template <typename T, typename Context>
 void SparseMaskKernel(const Context& dev_ctx,
                       const DenseTensor& x,
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
index c22e67eef6712..694fe667c879e 100644
--- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
@@ -24,6 +22,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
 #include "paddle/phi/kernels/funcs/sparse/convolution.h"
+#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
 
 namespace phi {
 namespace sparse {
@@ -105,18 +104,18 @@ void MaxPoolGradGPUKernel(const GPUContext& dev_ctx,
 
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
         dev_ctx, counter[i] * in_channels, 1);
-    MaxPoolGradCudaKernel<T, IntT><<<config.block_per_grid.x,
-                                     config.thread_per_block.x,
-                                     0,
-                                     dev_ctx.stream()>>>(
-        in_features_ptr,
-        out_features_ptr,
-        out_grad_ptr,
-        rulebook_ptr + offsets[i] + rulebook_len,
-        counter[i],
-        rulebook_len,
-        in_channels,
-        x_grad_ptr);
+    MaxPoolGradCudaKernel<T, IntT>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(in_features_ptr,
+                               out_features_ptr,
+                               out_grad_ptr,
+                               rulebook_ptr + offsets[i] + rulebook_len,
+                               counter[i],
+                               rulebook_len,
+                               in_channels,
+                               x_grad_ptr);
   }
 }
 
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
index e3eb7aa24331d..534afbd0f1421 100644
--- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
 #include "paddle/phi/kernels/funcs/sparse/convolution.h"
 #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
+#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
 
 namespace phi {
 namespace sparse {
@@ -46,7 +45,7 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
  * x: (N, D, H, W, C)
  * kernel: (D, H, W, C, OC)
  * out: (N, D, H, W, OC)
-**/
+ **/
 template <typename T, typename IntT = int>
 void MaxPoolGPUKernel(const GPUContext& dev_ctx,
                       const SparseCooTensor& x,
@@ -113,16 +112,16 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx,
 
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
         dev_ctx, counter[i] * in_channels, 1);
-    MaxPoolCudaKernel<T, IntT><<<config.block_per_grid.x,
-                                 config.thread_per_block.x,
-                                 0,
-                                 dev_ctx.stream()>>>(
-        in_features_ptr,
-        rulebook_ptr + offsets[i] + rulebook_len,
-        counter[i],
-        rulebook_len,
-        in_channels,
-        out_features_ptr);
+    MaxPoolCudaKernel<T, IntT>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(in_features_ptr,
+                               rulebook_ptr + offsets[i] + rulebook_len,
+                               counter[i],
+                               rulebook_len,
+                               in_channels,
+                               out_features_ptr);
   }
 }
 
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu
index b208e70e04046..38553d1fe1d7a 100644
--- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/remove.h>
 
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -22,8 +23,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/sparse/common_shape.h"
 #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
 
-#include "paddle/fluid/platform/enforce.h"
-
 namespace phi {
 namespace sparse {
 
@@ -526,17 +525,17 @@ void SparseCooToDenseKernel(const Context& dev_ctx,
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
 
-  KernelSparseCooToDense<T, int64_t><<<config.block_per_grid.x,
-                                       config.thread_per_block.x,
-                                       0,
-                                       dev_ctx.stream()>>>(
-      indices.data<int64_t>(),
-      d_sparse_offsets.data<int64_t>(),
-      x_data,
-      out_data,
-      non_zero_num,
-      base_offset,
-      sparse_dim);
+  KernelSparseCooToDense<T, int64_t>
+      <<<config.block_per_grid.x,
+         config.thread_per_block.x,
+         0,
+         dev_ctx.stream()>>>(indices.data<int64_t>(),
+                             d_sparse_offsets.data<int64_t>(),
+                             x_data,
+                             out_data,
+                             non_zero_num,
+                             base_offset,
+                             sparse_dim);
 }
 
 }  // namespace sparse
diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
index 15d78692f4f35..69677be34b231 100644
--- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h"
+
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"
 
diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h
index 6baac241426c7..1a426472c025b 100644
--- a/paddle/phi/kernels/split_kernel.h
+++ b/paddle/phi/kernels/split_kernel.h
@@ -14,10 +14,9 @@
 
 #pragma once
 
-#include "paddle/phi/core/dense_tensor.h"
-
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 
diff --git a/paddle/phi/kernels/strings/CMakeLists.txt b/paddle/phi/kernels/strings/CMakeLists.txt
index 54eeeb290e1bc..7cbba08e16189 100644
--- a/paddle/phi/kernels/strings/CMakeLists.txt
+++ b/paddle/phi/kernels/strings/CMakeLists.txt
@@ -3,10 +3,23 @@ if(WITH_GPU OR WITH_ROCM)
   add_subdirectory(gpu)
 endif()
 
-cc_library(unicode SRCS unicode.cc DEPS utf8proc)
+cc_library(
+  unicode
+  SRCS unicode.cc
+  DEPS utf8proc)
 set_property(GLOBAL PROPERTY STRING_KERNELS "")
 
-set(STRING_KERNEL_DEPS dense_tensor string_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel)
+set(STRING_KERNEL_DEPS
+    dense_tensor
+    string_tensor
+    sparse_coo_tensor
+    sparse_csr_tensor
+    kernel_context
+    kernel_factory
+    arg_map_context
+    convert_utils
+    lod_utils
+    custom_kernel)
 set(STRING_KERNEL_DEPS ${STRING_KERNEL_DEPS} eigen_function blas math_function)
 # remove this dep after removing fluid deps on tensor creation
 set(STRING_KERNEL_DEPS ${STRING_KERNEL_DEPS} phi_api_utils)
diff --git a/paddle/phi/kernels/strings/case_utils.h b/paddle/phi/kernels/strings/case_utils.h
index 2c30102a5a607..66744c6915bc6 100644
--- a/paddle/phi/kernels/strings/case_utils.h
+++ b/paddle/phi/kernels/strings/case_utils.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #endif
 
diff --git a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc
index 41889f9cc5ed7..efd69c6e2f901 100644
--- a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc
+++ b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/strings/strings_copy_kernel.h"
-#include "paddle/phi/core/kernel_registry.h"
 
 #include "glog/logging.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
 namespace strings {
diff --git a/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu b/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu
index 5cb4d21ec9906..7a2d61f29f7cb 100644
--- a/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu
+++ b/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu
@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/strings/strings_copy_kernel.h"
-
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/backends/gpu/gpu_helper.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/pstring.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
-
-#include "paddle/phi/backends/all_context.h"
-#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/strings/gpu/copy_utils.h"
+#include "paddle/phi/kernels/strings/strings_copy_kernel.h"
 
 using pstring = ::phi::dtype::pstring;
 
diff --git a/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu b/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu
index 53916def37bda..05d868f4db831 100644
--- a/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu
+++ b/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu
@@ -9,12 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/kernels/strings/strings_lower_upper_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/pstring.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/strings/strings_lower_upper_kernel.h"
 #include "paddle/phi/kernels/strings/unicode.h"
 
 using pstring = ::phi::dtype::pstring;
@@ -44,9 +43,8 @@ struct AsciiCaseConverter<phi::GPUContext, CharConverter> {
     dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
     dim3 grid_size =
         dim3((num + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
-    StringCaseConvertCUDAKernel<
-        CharConverter><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-        out, in, num);
+    StringCaseConvertCUDAKernel<CharConverter>
+        <<<grid_size, block_size, 0, dev_ctx.stream()>>>(out, in, num);
   }
 };
 
diff --git a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h
index db6c267a8586d..36486bc3ec686 100644
--- a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h
+++ b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h
@@ -93,7 +93,8 @@ struct AsciiCaseConverter {
 };
 
 template <typename DeviceContext,
-          template <typename DeviceContextT> typename CharConverter>
+          template <typename DeviceContextT>
+          typename CharConverter>
 struct UTF8CaseConverter {
   void operator()(const DeviceContext& dev_ctx,
                   const pstring* in,
diff --git a/paddle/phi/kernels/strings/unicode.cc b/paddle/phi/kernels/strings/unicode.cc
index bca75c08bce94..9f636809de876 100644
--- a/paddle/phi/kernels/strings/unicode.cc
+++ b/paddle/phi/kernels/strings/unicode.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/strings/unicode.h"
+
 #include <utf8proc.h>
+
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/kernels/strings/unicode_flag.h"
 
diff --git a/paddle/phi/kernels/strings/unicode.h b/paddle/phi/kernels/strings/unicode.h
index f6c5248faeb2e..45e41b72d086c 100644
--- a/paddle/phi/kernels/strings/unicode.h
+++ b/paddle/phi/kernels/strings/unicode.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <cstring>
 #include <memory>
+
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/macros.h"
 
diff --git a/paddle/phi/kernels/transpose_grad_kernel.h b/paddle/phi/kernels/transpose_grad_kernel.h
index 33d4ca7e3c6c2..e224da81a25d0 100644
--- a/paddle/phi/kernels/transpose_grad_kernel.h
+++ b/paddle/phi/kernels/transpose_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/ops/compat/CMakeLists.txt b/paddle/phi/ops/compat/CMakeLists.txt
index baae70903c6a4..34ded6653cf48 100644
--- a/paddle/phi/ops/compat/CMakeLists.txt
+++ b/paddle/phi/ops/compat/CMakeLists.txt
@@ -1,7 +1,13 @@
-set(op_utils_header ${PADDLE_BINARY_DIR}/paddle/phi/ops/compat/signatures.h.tmp CACHE INTERNAL "op_args_fns.cc file")
-set(op_utils_header_final ${PADDLE_BINARY_DIR}/paddle/phi/ops/compat/signatures.h)
-file(WRITE ${op_utils_header} "// Generated by the paddle/phi/ops/compat/CMakeLists.txt.  DO NOT EDIT!\n\n")
-file(APPEND ${op_utils_header} "#include \"paddle/phi/core/compat/op_utils.h\"\n\n")
+set(op_utils_header
+    ${PADDLE_BINARY_DIR}/paddle/phi/ops/compat/signatures.h.tmp
+    CACHE INTERNAL "op_args_fns.cc file")
+set(op_utils_header_final
+    ${PADDLE_BINARY_DIR}/paddle/phi/ops/compat/signatures.h)
+file(
+  WRITE ${op_utils_header}
+  "// Generated by the paddle/phi/ops/compat/CMakeLists.txt.  DO NOT EDIT!\n\n")
+file(APPEND ${op_utils_header}
+     "#include \"paddle/phi/core/compat/op_utils.h\"\n\n")
 
 # Automatically generate the registration code of all arg map functions
 # and compile the corresponding target to avoid frequent code conflicts
diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc
index 2b490078512b1..6ccf120979887 100644
--- a/paddle/phi/ops/compat/instance_norm_sig.cc
+++ b/paddle/phi/ops/compat/instance_norm_sig.cc
@@ -27,7 +27,7 @@ KernelSignature InstanceNormOpArgumentMapping(
 KernelSignature InstanceNormGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature("instance_norm_grad",
-                         {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"},
+                         {"X", "Scale", "SavedMean", "SavedVariance", "Y@GRAD"},
                          {"epsilon"},
                          {"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
 }
diff --git a/paddle/phi/ops/compat/matrix_rank_sig.cc b/paddle/phi/ops/compat/matrix_rank_sig.cc
index 40dc29579b401..bb884e9c3499b 100644
--- a/paddle/phi/ops/compat/matrix_rank_sig.cc
+++ b/paddle/phi/ops/compat/matrix_rank_sig.cc
@@ -27,7 +27,9 @@ KernelSignature MatrixRankOpArgumentMapping(const ArgumentMappingContext& ctx) {
     return KernelSignature("matrix_rank",
                            {"X"},
                            {
-                               "tol", "use_default_tol", "hermitian",
+                               "tol",
+                               "use_default_tol",
+                               "hermitian",
                            },
                            {"Out"});
   }
diff --git a/paddle/phi/ops/compat/segment_pool_sig.cc b/paddle/phi/ops/compat/segment_pool_sig.cc
index db07343f9ad84..62b2b08f4c186 100644
--- a/paddle/phi/ops/compat/segment_pool_sig.cc
+++ b/paddle/phi/ops/compat/segment_pool_sig.cc
@@ -20,7 +20,11 @@ KernelSignature SegmentPoolGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature("segment_pool_grad",
                          {
-                             "X", "SegmentIds", "Out", "SummedIds", "Out@GRAD",
+                             "X",
+                             "SegmentIds",
+                             "Out",
+                             "SummedIds",
+                             "Out@GRAD",
                          },
                          {"pooltype"},
                          {"X@GRAD"});
diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt
index 2333f82d626c4..a337e4ee4bd1b 100644
--- a/paddle/phi/tests/api/CMakeLists.txt
+++ b/paddle/phi/tests/api/CMakeLists.txt
@@ -1,33 +1,111 @@
 if(WITH_GPU)
-  nv_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog)
+  nv_test(
+    test_phi_tensor
+    SRCS test_pten_tensor.cc
+    DEPS phi_tensor glog)
 elseif(WITH_ROCM)
-  hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog)
+  hip_test(
+    test_phi_tensor
+    SRCS test_pten_tensor.cc
+    DEPS phi_tensor glog)
 else()
-  cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor glog)
+  cc_test(
+    test_phi_tensor
+    SRCS test_pten_tensor.cc
+    DEPS phi_tensor glog)
 endif()
 
-cc_test(test_phi_exception SRCS test_pten_exception.cc DEPS gtest)
+cc_test(
+  test_phi_exception
+  SRCS test_pten_exception.cc
+  DEPS gtest)
 
 set(COMMON_API_TEST_DEPS phi_tensor phi_api phi_api_utils)
-cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar)
-cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_embedding_api SRCS test_embedding_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar)
-cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_split_api SRCS test_split_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_data_transform SRCS test_data_transform.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_strings_empty_api SRCS test_strings_empty_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_strings_lower_upper_api SRCS test_strings_lower_upper_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_mean_api
+  SRCS test_mean_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_dot_api
+  SRCS test_dot_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_matmul_api
+  SRCS test_matmul_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_empty_api
+  SRCS test_empty_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_fill_api
+  SRCS test_fill_api.cc
+  DEPS ${COMMON_API_TEST_DEPS} api_scalar)
+cc_test(
+  test_elementwise_api
+  SRCS test_elementwise_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_embedding_api
+  SRCS test_embedding_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_cast_api
+  SRCS test_cast_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_reshape_api
+  SRCS test_reshape_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_to_api
+  SRCS test_to_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_slice_api
+  SRCS test_slice_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_sum_api
+  SRCS test_sum_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_scale_api
+  SRCS test_scale_api.cc
+  DEPS ${COMMON_API_TEST_DEPS} api_scalar)
+cc_test(
+  test_scale_benchmark
+  SRCS test_scale_benchmark.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_conj_api
+  SRCS test_conj_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_concat_api
+  SRCS test_concat_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_split_api
+  SRCS test_split_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_data_transform
+  SRCS test_data_transform.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_sparse_utils_api
+  SRCS test_sparse_utils_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_sparse_conv_api
+  SRCS test_sparse_conv_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_strings_empty_api
+  SRCS test_strings_empty_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
+cc_test(
+  test_strings_lower_upper_api
+  SRCS test_strings_lower_upper_api.cc
+  DEPS ${COMMON_API_TEST_DEPS})
diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h
index 16143fb11e0ff..322f7b27abdb1 100644
--- a/paddle/phi/tests/api/scale_api.h
+++ b/paddle/phi/tests/api/scale_api.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "glog/logging.h"
-
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
diff --git a/paddle/phi/tests/api/test_cast_api.cc b/paddle/phi/tests/api/test_cast_api.cc
index 5448fb9d42470..b627cc3528341 100644
--- a/paddle/phi/tests/api/test_cast_api.cc
+++ b/paddle/phi/tests/api/test_cast_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_concat_api.cc b/paddle/phi/tests/api/test_concat_api.cc
index 824b72b97ac12..d271e1cc5dddc 100644
--- a/paddle/phi/tests/api/test_concat_api.cc
+++ b/paddle/phi/tests/api/test_concat_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_conj_api.cc b/paddle/phi/tests/api/test_conj_api.cc
index 62a588dff1280..ea8791f2181b4 100644
--- a/paddle/phi/tests/api/test_conj_api.cc
+++ b/paddle/phi/tests/api/test_conj_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc
index 21d5eef4098c0..7e8204ea6c7a2 100644
--- a/paddle/phi/tests/api/test_data_transform.cc
+++ b/paddle/phi/tests/api/test_data_transform.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
diff --git a/paddle/phi/tests/api/test_dot_api.cc b/paddle/phi/tests/api/test_dot_api.cc
index 3fcd4e8a01d12..39ba6c666c605 100644
--- a/paddle/phi/tests/api/test_dot_api.cc
+++ b/paddle/phi/tests/api/test_dot_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_elementwise_api.cc b/paddle/phi/tests/api/test_elementwise_api.cc
index fb4c68a87cb25..f9c10e8c801c1 100644
--- a/paddle/phi/tests/api/test_elementwise_api.cc
+++ b/paddle/phi/tests/api/test_elementwise_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_embedding_api.cc b/paddle/phi/tests/api/test_embedding_api.cc
index 6ccd382786bd1..a590bf2ce6220 100644
--- a/paddle/phi/tests/api/test_embedding_api.cc
+++ b/paddle/phi/tests/api/test_embedding_api.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/backward/backward_api.h"
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/tests/api/test_empty_api.cc b/paddle/phi/tests/api/test_empty_api.cc
index 48adbe1bd2682..6363247427159 100644
--- a/paddle/phi/tests/api/test_empty_api.cc
+++ b/paddle/phi/tests/api/test_empty_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_fill_api.cc b/paddle/phi/tests/api/test_fill_api.cc
index 523fa895d147e..cae56fd663445 100644
--- a/paddle/phi/tests/api/test_fill_api.cc
+++ b/paddle/phi/tests/api/test_fill_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc
index 0d4ec7bd4f592..c54c5398280e1 100644
--- a/paddle/phi/tests/api/test_matmul_api.cc
+++ b/paddle/phi/tests/api/test_matmul_api.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/backward/backward_api.h"
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/tests/api/test_mean_api.cc b/paddle/phi/tests/api/test_mean_api.cc
index af47f2cd7714a..717423c8419dd 100644
--- a/paddle/phi/tests/api/test_mean_api.cc
+++ b/paddle/phi/tests/api/test_mean_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_pten_exception.cc b/paddle/phi/tests/api/test_pten_exception.cc
index 837438104876a..92c44684f3ae5 100644
--- a/paddle/phi/tests/api/test_pten_exception.cc
+++ b/paddle/phi/tests/api/test_pten_exception.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include <iostream>
 #include <string>
+
 #include "gtest/gtest.h"
 #include "paddle/phi/api/ext/exception.h"
 
diff --git a/paddle/phi/tests/api/test_reshape_api.cc b/paddle/phi/tests/api/test_reshape_api.cc
index 4a857e2d1dcda..46b73778bc5f7 100644
--- a/paddle/phi/tests/api/test_reshape_api.cc
+++ b/paddle/phi/tests/api/test_reshape_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_scale_api.cc b/paddle/phi/tests/api/test_scale_api.cc
index 5f1e118946675..2795ebcf28611 100644
--- a/paddle/phi/tests/api/test_scale_api.cc
+++ b/paddle/phi/tests/api/test_scale_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc
index e2870a780aeae..dbada896bafd4 100644
--- a/paddle/phi/tests/api/test_scale_benchmark.cc
+++ b/paddle/phi/tests/api/test_scale_benchmark.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_slice_api.cc b/paddle/phi/tests/api/test_slice_api.cc
index ee2ade0229f1f..46245c45f1e40 100644
--- a/paddle/phi/tests/api/test_slice_api.cc
+++ b/paddle/phi/tests/api/test_slice_api.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc
index c00113389adb7..bbdb2f70d7fd3 100644
--- a/paddle/phi/tests/api/test_sparse_conv_api.cc
+++ b/paddle/phi/tests/api/test_sparse_conv_api.cc
@@ -13,12 +13,11 @@ the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/include/sparse_api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc
index da66334ced78a..e02017555111c 100644
--- a/paddle/phi/tests/api/test_sparse_utils_api.cc
+++ b/paddle/phi/tests/api/test_sparse_utils_api.cc
@@ -13,12 +13,11 @@ the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/include/sparse_api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_split_api.cc b/paddle/phi/tests/api/test_split_api.cc
index 1b84e7793cf6a..64dab30a6a7cc 100644
--- a/paddle/phi/tests/api/test_split_api.cc
+++ b/paddle/phi/tests/api/test_split_api.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_strings_empty_api.cc b/paddle/phi/tests/api/test_strings_empty_api.cc
index 5f7e373a712d7..3286498c2c098 100644
--- a/paddle/phi/tests/api/test_strings_empty_api.cc
+++ b/paddle/phi/tests/api/test_strings_empty_api.cc
@@ -13,6 +13,7 @@ the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/strings_api.h"
diff --git a/paddle/phi/tests/api/test_strings_lower_upper_api.cc b/paddle/phi/tests/api/test_strings_lower_upper_api.cc
index ed911298bdebb..c8abae1836f33 100644
--- a/paddle/phi/tests/api/test_strings_lower_upper_api.cc
+++ b/paddle/phi/tests/api/test_strings_lower_upper_api.cc
@@ -13,6 +13,7 @@ the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/strings_api.h"
diff --git a/paddle/phi/tests/api/test_sum_api.cc b/paddle/phi/tests/api/test_sum_api.cc
index 9781d70d2b913..935435162aac0 100644
--- a/paddle/phi/tests/api/test_sum_api.cc
+++ b/paddle/phi/tests/api/test_sum_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/api/test_to_api.cc b/paddle/phi/tests/api/test_to_api.cc
index 4e8755be0c773..dcf433482516f 100644
--- a/paddle/phi/tests/api/test_to_api.cc
+++ b/paddle/phi/tests/api/test_to_api.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt
index 150336a1ed694..3499489541d1c 100644
--- a/paddle/phi/tests/common/CMakeLists.txt
+++ b/paddle/phi/tests/common/CMakeLists.txt
@@ -1,11 +1,32 @@
-cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest)
-cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest)
-cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest)
-cc_test(phi_test_place SRCS test_place.cc DEPS phi_place)
-cc_test(phi_test_int_array SRCS test_int_array.cc DEPS int_array api_int_array phi phi_api)
-if (WITH_GPU)
-    nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar)
+cc_test(
+  phi_test_backend
+  SRCS test_backend.cc
+  DEPS gtest)
+cc_test(
+  phi_test_data_layout
+  SRCS test_data_layout.cc
+  DEPS gtest)
+cc_test(
+  phi_test_data_type
+  SRCS test_data_type.cc
+  DEPS gtest)
+cc_test(
+  phi_test_place
+  SRCS test_place.cc
+  DEPS phi_place)
+cc_test(
+  phi_test_int_array
+  SRCS test_int_array.cc
+  DEPS int_array api_int_array phi phi_api)
+if(WITH_GPU)
+  nv_test(
+    phi_test_scalar
+    SRCS test_scalar.cu
+    DEPS scalar api_scalar)
 endif()
 if(WITH_ROCM)
-    hip_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar)
+  hip_test(
+    phi_test_scalar
+    SRCS test_scalar.cu
+    DEPS scalar api_scalar)
 endif()
diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc
index f93394f31df90..c1550e31fae88 100644
--- a/paddle/phi/tests/common/test_backend.cc
+++ b/paddle/phi/tests/common/test_backend.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <iostream>
 
 #include "paddle/phi/api/ext/exception.h"
diff --git a/paddle/phi/tests/common/test_data_layout.cc b/paddle/phi/tests/common/test_data_layout.cc
index b5b6ed119be6e..3a53e25f92b2c 100644
--- a/paddle/phi/tests/common/test_data_layout.cc
+++ b/paddle/phi/tests/common/test_data_layout.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <iostream>
 #include <sstream>
 
diff --git a/paddle/phi/tests/common/test_data_type.cc b/paddle/phi/tests/common/test_data_type.cc
index 517e2ee2ff839..4d3d1de64924d 100644
--- a/paddle/phi/tests/common/test_data_type.cc
+++ b/paddle/phi/tests/common/test_data_type.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <iostream>
 #include <sstream>
 
diff --git a/paddle/phi/tests/common/test_int_array.cc b/paddle/phi/tests/common/test_int_array.cc
index a6278ee4a34fc..30ad7cdd74c59 100644
--- a/paddle/phi/tests/common/test_int_array.cc
+++ b/paddle/phi/tests/common/test_int_array.cc
@@ -12,17 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "gtest/gtest.h"
 #include "paddle/phi/api/include/api.h"
-
 #include "paddle/phi/api/include/context_pool.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/int_array.h"
-#include "paddle/phi/kernels/full_kernel.h"
-
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "gtest/gtest.h"
+#include "paddle/phi/kernels/full_kernel.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
diff --git a/paddle/phi/tests/common/test_place.cc b/paddle/phi/tests/common/test_place.cc
index ed2eb7126ed28..8b1dfc60acf5a 100644
--- a/paddle/phi/tests/common/test_place.cc
+++ b/paddle/phi/tests/common/test_place.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/common/place.h"
-
 #include <map>  // NOLINT
+
 #include "gtest/gtest.h"
+#include "paddle/phi/common/place.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu
index 6b0caa175dc04..89b41ef1e583f 100644
--- a/paddle/phi/tests/common/test_scalar.cu
+++ b/paddle/phi/tests/common/test_scalar.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <map>  // NOLINT
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/include/tensor.h"
diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt
index 7d2fd90e6bb7b..57a55963d5c66 100644
--- a/paddle/phi/tests/core/CMakeLists.txt
+++ b/paddle/phi/tests/core/CMakeLists.txt
@@ -1,27 +1,66 @@
-cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel scalar)
-cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor)
+cc_test(
+  test_custom_kernel
+  SRCS test_custom_kernel.cc
+  DEPS custom_kernel scalar)
+cc_test(
+  test_dense_tensor
+  SRCS test_dense_tensor.cc
+  DEPS dense_tensor)
 cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
 cc_test(test_type_info SRCS test_type_info.cc)
-cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel)
-cc_test(test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc DEPS dense_tensor sparse_coo_tensor)
-cc_test(test_sparse_csr_tensor SRCS test_sparse_csr_tensor.cc DEPS dense_tensor sparse_csr_tensor)
-cc_test(test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos)
-cc_test(test_phi_device_context SRCS test_device_context.cc DEPS phi_context cpu_context)
-cc_test(test_meta_fn_utils SRCS test_meta_fn_utils.cc DEPS dense_tensor wrapped_infermeta infermeta infermeta_utils)
+cc_test(
+  test_kernel_factory
+  SRCS test_kernel_factory.cc
+  DEPS kernel_factory scale_kernel)
+cc_test(
+  test_sparse_coo_tensor
+  SRCS test_sparse_coo_tensor.cc
+  DEPS dense_tensor sparse_coo_tensor)
+cc_test(
+  test_sparse_csr_tensor
+  SRCS test_sparse_csr_tensor.cc
+  DEPS dense_tensor sparse_csr_tensor)
+cc_test(
+  test_op_utils
+  SRCS test_op_utils.cc
+  DEPS op_compat_infos)
+cc_test(
+  test_phi_device_context
+  SRCS test_device_context.cc
+  DEPS phi_context cpu_context)
+cc_test(
+  test_meta_fn_utils
+  SRCS test_meta_fn_utils.cc
+  DEPS dense_tensor wrapped_infermeta infermeta infermeta_utils)
 
-cc_test(test_ddim SRCS test_ddim.cc DEPS ddim)
+cc_test(
+  test_ddim
+  SRCS test_ddim.cc
+  DEPS ddim)
 if(WITH_GPU)
-  nv_test(test_dim SRCS test_dim.cu DEPS ddim)
+  nv_test(
+    test_dim
+    SRCS test_dim.cu
+    DEPS ddim)
 elseif(WITH_ROCM)
-  hip_test(test_dim SRCS test_dim.cu DEPS ddim)
+  hip_test(
+    test_dim
+    SRCS test_dim.cu
+    DEPS ddim)
 endif()
 
-cc_test(selected_rows_test SRCS test_selected_rows.cc DEPS selected_rows)
+cc_test(
+  selected_rows_test
+  SRCS test_selected_rows.cc
+  DEPS selected_rows)
 if(WITH_TESTING AND TEST selected_rows_test)
   set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
 endif()
-if (NOT WIN32)
-cc_test(test_rw_lock SRCS test_rw_lock.cc)
-endif (NOT WIN32)
-cc_test(test_string_tensor SRCS test_string_tensor.cc DEPS string_tensor)
+if(NOT WIN32)
+  cc_test(test_rw_lock SRCS test_rw_lock.cc)
+endif(NOT WIN32)
+cc_test(
+  test_string_tensor
+  SRCS test_string_tensor.cc
+  DEPS string_tensor)
 cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
diff --git a/paddle/phi/tests/core/test_dense_tensor.cc b/paddle/phi/tests/core/test_dense_tensor.cc
index 42814317b9c83..f6a3e3fa41348 100644
--- a/paddle/phi/tests/core/test_dense_tensor.cc
+++ b/paddle/phi/tests/core/test_dense_tensor.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gtest/gtest.h"
-
 #include "glog/logging.h"
+#include "gtest/gtest.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/tests/core/allocator.h"
 
diff --git a/paddle/phi/tests/core/test_dim.cu b/paddle/phi/tests/core/test_dim.cu
index 5d8919d8c5494..2a449191367b4 100644
--- a/paddle/phi/tests/core/test_dim.cu
+++ b/paddle/phi/tests/core/test_dim.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <thrust/device_vector.h>
+
 #include <sstream>
 
 #include "gtest/gtest.h"
diff --git a/paddle/phi/tests/core/test_intrusive_ptr.cc b/paddle/phi/tests/core/test_intrusive_ptr.cc
index d9d6008f17b98..e0888f89ce6c6 100644
--- a/paddle/phi/tests/core/test_intrusive_ptr.cc
+++ b/paddle/phi/tests/core/test_intrusive_ptr.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <vector>
 
 #include "gtest/gtest.h"
-
 #include "paddle/phi/core/utils/intrusive_ptr.h"
 #include "paddle/phi/core/utils/intrusive_ref_counter.h"
 
diff --git a/paddle/phi/tests/core/test_kernel_factory.cc b/paddle/phi/tests/core/test_kernel_factory.cc
index 490d4967eeba2..44ea9fba1191b 100644
--- a/paddle/phi/tests/core/test_kernel_factory.cc
+++ b/paddle/phi/tests/core/test_kernel_factory.cc
@@ -15,13 +15,12 @@ limitations under the License. */
 #include <iostream>
 #include <sstream>
 
+#include "gtest/gtest.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_factory.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-#include "gtest/gtest.h"
-
 PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
 
 namespace phi {
diff --git a/paddle/phi/tests/core/test_rw_lock.cc b/paddle/phi/tests/core/test_rw_lock.cc
index 7a9f72cb2bb94..59d1aed2c3dbf 100644
--- a/paddle/phi/tests/core/test_rw_lock.cc
+++ b/paddle/phi/tests/core/test_rw_lock.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/phi/core/utils/rw_lock.h"
-
 #include <gtest/gtest.h>  // NOLINT
-#include <thread>         // NOLINT
+
+#include <thread>  // NOLINT
+
+#include "paddle/phi/core/utils/rw_lock.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/core/test_selected_rows.cc b/paddle/phi/tests/core/test_selected_rows.cc
index b6229eda60402..793737a6fb4f7 100644
--- a/paddle/phi/tests/core/test_selected_rows.cc
+++ b/paddle/phi/tests/core/test_selected_rows.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <time.h>
+
 #include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
diff --git a/paddle/phi/tests/core/test_sparse_coo_tensor.cc b/paddle/phi/tests/core/test_sparse_coo_tensor.cc
index 5e7642bbfdcb0..e9ee1dde6b2a5 100644
--- a/paddle/phi/tests/core/test_sparse_coo_tensor.cc
+++ b/paddle/phi/tests/core/test_sparse_coo_tensor.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gtest/gtest.h"
-
 #include "glog/logging.h"
+#include "gtest/gtest.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/tests/core/allocator.h"
diff --git a/paddle/phi/tests/core/test_sparse_csr_tensor.cc b/paddle/phi/tests/core/test_sparse_csr_tensor.cc
index 1f9d48364a9fc..7fad7bac399cd 100644
--- a/paddle/phi/tests/core/test_sparse_csr_tensor.cc
+++ b/paddle/phi/tests/core/test_sparse_csr_tensor.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gtest/gtest.h"
-
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
diff --git a/paddle/phi/tests/core/test_string_tensor.cc b/paddle/phi/tests/core/test_string_tensor.cc
index 7a3ad7ffb3aaf..53bf51d38c0b0 100644
--- a/paddle/phi/tests/core/test_string_tensor.cc
+++ b/paddle/phi/tests/core/test_string_tensor.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <sstream>
 #include <string>
 #include <utility>
-#include "gtest/gtest.h"
 
+#include "gtest/gtest.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/pstring.h"
diff --git a/paddle/phi/tests/core/test_type_info.cc b/paddle/phi/tests/core/test_type_info.cc
index 1bb2aeb2b7ab9..6d023268c7b80 100644
--- a/paddle/phi/tests/core/test_type_info.cc
+++ b/paddle/phi/tests/core/test_type_info.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gtest/gtest.h"
-
 #include "paddle/phi/core/utils/type_registry.h"
 
 namespace phi {
diff --git a/paddle/phi/tests/core/unroll_array_ops_test.cc b/paddle/phi/tests/core/unroll_array_ops_test.cc
index ddac4608f7e69..03a4beb374e60 100644
--- a/paddle/phi/tests/core/unroll_array_ops_test.cc
+++ b/paddle/phi/tests/core/unroll_array_ops_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/core/utils/unroll_array_ops.h"
 
 #include <gtest/gtest.h>
+
 #include <array>
 
 namespace phi {
diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt
index a02e4f3d57aa3..b7d53b31bc3ba 100644
--- a/paddle/phi/tests/kernels/CMakeLists.txt
+++ b/paddle/phi/tests/kernels/CMakeLists.txt
@@ -1,43 +1,127 @@
-cc_test(test_copy_dev_api SRCS test_copy_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_dot_dev_api SRCS test_dot_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_creation_dev_api SRCS test_creation_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_matmul_dev_api SRCS test_matmul_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_cast_dev_api SRCS test_cast_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_sparse_conv3d_dev_api SRCS test_sparse_conv3d_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_sparse_pool_dev_api SRCS test_sparse_pool_dev_api.cc DEPS phi phi_api_utils)
-cc_test(test_sparse_activation_dev_api SRCS test_sparse_activation_dev_api.cc DEPS phi phi_api_utils)
+cc_test(
+  test_copy_dev_api
+  SRCS test_copy_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_dot_dev_api
+  SRCS test_dot_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_creation_dev_api
+  SRCS test_creation_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_flatten_dev_api
+  SRCS test_flatten_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_matmul_dev_api
+  SRCS test_matmul_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_mean_dev_api
+  SRCS test_mean_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_scale_dev_api
+  SRCS test_scale_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_cast_dev_api
+  SRCS test_cast_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_elementwise_dev_api
+  SRCS test_elementwise_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_reshape_dev_api
+  SRCS test_reshape_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_sum_dev_api
+  SRCS test_sum_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_conj_dev_api
+  SRCS test_conj_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_concat_dev_api
+  SRCS test_concat_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_split_dev_api
+  SRCS test_split_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_sparse_utils_dev_api
+  SRCS test_sparse_utils_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_sparse_conv3d_dev_api
+  SRCS test_sparse_conv3d_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_sparse_pool_dev_api
+  SRCS test_sparse_pool_dev_api.cc
+  DEPS phi phi_api_utils)
+cc_test(
+  test_sparse_activation_dev_api
+  SRCS test_sparse_activation_dev_api.cc
+  DEPS phi phi_api_utils)
 
-cc_test(test_math_function SRCS test_math_function.cc DEPS math_function)
+cc_test(
+  test_math_function
+  SRCS test_math_function.cc
+  DEPS math_function)
 if(WITH_GPU)
-    nv_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function)
+  nv_test(
+    test_math_function_gpu
+    SRCS test_math_function.cu
+    DEPS math_function)
 endif()
 if(WITH_ROCM)
-    hip_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function)
+  hip_test(
+    test_math_function_gpu
+    SRCS test_math_function.cu
+    DEPS math_function)
 endif()
 
-cc_test(test_cpu_vec SRCS test_cpu_vec.cc DEPS blas cpu_info)
+cc_test(
+  test_cpu_vec
+  SRCS test_cpu_vec.cc
+  DEPS blas cpu_info)
 
 # For String Kernels
-cc_test(test_strings_lower_upper_dev_api SRCS test_strings_lower_upper_dev_api.cc DEPS phi phi_api_utils)
-IF(WITH_GPU)
-  nv_test(test_strings_lower_upper_dev_gpu_api SRCS test_strings_lower_upper_dev_api.cu DEPS phi phi_api_utils)
-ELSEIF(WITH_ROCM)
-  hip_test(test_strings_lower_upper_dev_gpu_api SRCS test_strings_lower_upper_dev_api.cu DEPS phi phi_api_utils)
-ENDIF()
+cc_test(
+  test_strings_lower_upper_dev_api
+  SRCS test_strings_lower_upper_dev_api.cc
+  DEPS phi phi_api_utils)
+if(WITH_GPU)
+  nv_test(
+    test_strings_lower_upper_dev_gpu_api
+    SRCS test_strings_lower_upper_dev_api.cu
+    DEPS phi phi_api_utils)
+elseif(WITH_ROCM)
+  hip_test(
+    test_strings_lower_upper_dev_gpu_api
+    SRCS test_strings_lower_upper_dev_api.cu
+    DEPS phi phi_api_utils)
+endif()
 
-cc_test(test_strings_copy_dev_api SRCS test_strings_copy_dev_api.cc DEPS phi phi_api_utils)
-IF(WITH_GPU)
-  nv_test(test_strings_copy_dev_gpu_api SRCS test_strings_copy_dev_api.cu DEPS phi phi_api_utils)
-ELSEIF(WITH_ROCM)
-  hip_test(test_strings_copy_dev_gpu_api SRCS test_strings_copy_dev_api.cu DEPS phi phi_api_utils)
-ENDIF()
+cc_test(
+  test_strings_copy_dev_api
+  SRCS test_strings_copy_dev_api.cc
+  DEPS phi phi_api_utils)
+if(WITH_GPU)
+  nv_test(
+    test_strings_copy_dev_gpu_api
+    SRCS test_strings_copy_dev_api.cu
+    DEPS phi phi_api_utils)
+elseif(WITH_ROCM)
+  hip_test(
+    test_strings_copy_dev_gpu_api
+    SRCS test_strings_copy_dev_api.cu
+    DEPS phi phi_api_utils)
+endif()
diff --git a/paddle/phi/tests/kernels/test_cast_dev_api.cc b/paddle/phi/tests/kernels/test_cast_dev_api.cc
index 957b949347125..179e44f0f0f12 100644
--- a/paddle/phi/tests/kernels/test_cast_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_cast_dev_api.cc
@@ -14,16 +14,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/kernels/cast_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cast_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_concat_dev_api.cc b/paddle/phi/tests/kernels/test_concat_dev_api.cc
index 7f954085f601c..0dd58b1bba938 100644
--- a/paddle/phi/tests/kernels/test_concat_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_concat_dev_api.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/kernels/concat_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/concat_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_conj_dev_api.cc b/paddle/phi/tests/kernels/test_conj_dev_api.cc
index 3d2a69df2f971..5ac676ffcbcae 100644
--- a/paddle/phi/tests/kernels/test_conj_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_conj_dev_api.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/kernels/complex_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/complex_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc
index 460d85f83133f..9eba14ebc81a8 100644
--- a/paddle/phi/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/copy_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc
index 8c2c8642ab900..2dcd8739991f8 100644
--- a/paddle/phi/tests/kernels/test_creation_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/full_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_dot_dev_api.cc b/paddle/phi/tests/kernels/test_dot_dev_api.cc
index 457e39525931d..de20907cadf44 100644
--- a/paddle/phi/tests/kernels/test_dot_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_dot_dev_api.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/kernels/dot_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/dot_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc
index 36b200d4d4494..63f8b86a534ed 100644
--- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/elementwise_add_kernel.h"
 #include "paddle/phi/kernels/elementwise_divide_kernel.h"
 #include "paddle/phi/kernels/elementwise_multiply_kernel.h"
 #include "paddle/phi/kernels/elementwise_subtract_kernel.h"
 
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 namespace phi {
 namespace tests {
 
diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc
index e3f2e8b57e3df..23ee9869c0e51 100644
--- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/kernels/flatten_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/flatten_kernel.h"
 
 PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
 
diff --git a/paddle/phi/tests/kernels/test_matmul_dev_api.cc b/paddle/phi/tests/kernels/test_matmul_dev_api.cc
index b8e201d7dc58a..f25acaf9bcc3f 100644
--- a/paddle/phi/tests/kernels/test_matmul_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_matmul_dev_api.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/kernels/matmul_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/matmul_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc
index 92fc7f3c92a98..6f3f91a7dbe56 100644
--- a/paddle/phi/tests/kernels/test_mean_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/kernels/reduce_mean_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/reduce_mean_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc
index 7de039372fa9c..f0f521d57dbd8 100644
--- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/kernels/reshape_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/reshape_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_scale_dev_api.cc b/paddle/phi/tests/kernels/test_scale_dev_api.cc
index c4c80ce79af7c..eff18bdeecaab 100644
--- a/paddle/phi/tests/kernels/test_scale_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_scale_dev_api.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/kernels/scale_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc
index 05781156cd1d6..d1c464e4b1c9d 100644
--- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/place.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/activation_grad_kernel.h"
 #include "paddle/phi/kernels/activation_kernel.h"
diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
index 9fb0e5692645d..b7d56cb0d2b06 100644
--- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
 #include "paddle/phi/kernels/sparse/convolution_kernel.h"
 
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 namespace phi {
 namespace tests {
 
diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
index 8f7288d70d7d0..5640da399f4e5 100644
--- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
 #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
 
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/phi/api/lib/utils/allocator.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 namespace phi {
 namespace tests {
 
@@ -264,7 +264,22 @@ TEST(DEV_API, sparse_maxpool) {
   std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
   std::vector<float> features = {1, 2, 3};
   std::vector<int> out_indices = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      1,
+      1,
+      0,
+      1,
+      0,
+      1,
   };
   std::vector<float> out_features = {2, 2, 3, 3};
   std::vector<float> x_grad = {0, 4, 6};
@@ -330,7 +345,22 @@ TEST(DEV_API, sparse_maxpool_channel) {
   std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
   std::vector<float> features = {1, 1, 2, 2, 3, 3};
   std::vector<int> out_indices = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      1,
+      1,
+      0,
+      1,
+      0,
+      1,
   };
   std::vector<float> out_features = {2, 2, 2, 2, 3, 3, 3, 3};
   std::vector<float> x_grad = {0, 0, 4, 4, 6, 6};
@@ -364,7 +394,22 @@ TEST(DEV_API, sparse_maxpool3d) {
   std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
   std::vector<float> features = {1, 1, 2, 2, 3, 3};
   std::vector<int> out_indices = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      1,
+      1,
+      0,
+      1,
+      0,
+      1,
   };
   std::vector<float> out_features = {2, 2, 2, 2, 3, 3, 3, 3};
   std::vector<float> x_grad = {0, 0, 4, 4, 6, 6};
diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
index 93728ad31b0d6..0c1a7bbb3d806 100644
--- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
@@ -13,18 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <memory>
 
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/kernels/copy_kernel.h"
-#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
-
-#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
 
 namespace phi {
 namespace tests {
diff --git a/paddle/phi/tests/kernels/test_split_dev_api.cc b/paddle/phi/tests/kernels/test_split_dev_api.cc
index d5160933c1fa0..a358fcdf28db0 100644
--- a/paddle/phi/tests/kernels/test_split_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_split_dev_api.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/kernels/split_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/split_kernel.h"
 namespace phi {
 namespace tests {
 
diff --git a/paddle/phi/tests/kernels/test_strings_copy_dev_api.cc b/paddle/phi/tests/kernels/test_strings_copy_dev_api.cc
index 3984cae52d4cc..6cf75260be771 100644
--- a/paddle/phi/tests/kernels/test_strings_copy_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_strings_copy_dev_api.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <algorithm>
 #include <memory>
 #include <string>
+
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/pstring.h"
diff --git a/paddle/phi/tests/kernels/test_strings_copy_dev_api.cu b/paddle/phi/tests/kernels/test_strings_copy_dev_api.cu
index f04f66f50aafa..6a1af65aaa966 100644
--- a/paddle/phi/tests/kernels/test_strings_copy_dev_api.cu
+++ b/paddle/phi/tests/kernels/test_strings_copy_dev_api.cu
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <algorithm>
 #include <memory>
 #include <string>
+
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/pstring.h"
diff --git a/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cc b/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cc
index 57353d386dc6e..7da4ac19baffd 100644
--- a/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cc
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+
 #include <algorithm>
 #include <memory>
 #include <string>
 
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/pstring.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/string_tensor.h"
 #include "paddle/phi/kernels/strings/strings_empty_kernel.h"
 #include "paddle/phi/kernels/strings/strings_lower_upper_kernel.h"
-
-#include "paddle/phi/backends/all_context.h"
 namespace phi {
 namespace tests {
 
diff --git a/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cu b/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cu
index 3b4bff001436c..a04da1a12d2d7 100644
--- a/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cu
+++ b/paddle/phi/tests/kernels/test_strings_lower_upper_dev_api.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include <stdio.h>
+
 #include <algorithm>
 #include <memory>
 #include <string>
@@ -40,8 +41,8 @@ namespace tests {
 namespace framework = paddle::framework;
 using DDim = phi::DDim;
 using pstring = ::phi::dtype::pstring;
-using phi::GPUPlace;
 using phi::CPUPlace;
+using phi::GPUPlace;
 
 TEST(DEV_API, strings_cast_convert) {
   auto gpu0 = GPUPlace();
diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc
index 9e889ab4ea4f6..2cd677373f4ef 100644
--- a/paddle/phi/tests/kernels/test_sum_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <memory>
 
-#include "paddle/phi/kernels/reduce_sum_kernel.h"
+#include <memory>
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/reduce_sum_kernel.h"
 namespace phi {
 namespace tests {
 
diff --git a/paddle/phi/tests/ops/CMakeLists.txt b/paddle/phi/tests/ops/CMakeLists.txt
index 58ad327669621..634af80f05a1f 100644
--- a/paddle/phi/tests/ops/CMakeLists.txt
+++ b/paddle/phi/tests/ops/CMakeLists.txt
@@ -1 +1,4 @@
-cc_test(test_op_signature SRCS test_op_signature.cc DEPS op_utils)
+cc_test(
+  test_op_signature
+  SRCS test_op_signature.cc
+  DEPS op_utils)
diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc
index 4379dfd7cc4af..204b7f359a6b4 100644
--- a/paddle/phi/tests/ops/test_op_signature.cc
+++ b/paddle/phi/tests/ops/test_op_signature.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/tests/ops/test_op_signature.h"
 
 #include <gtest/gtest.h>
+
 #include <memory>
 #include <unordered_set>
 
diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h
index 1535f40b70072..745f263208fc2 100644
--- a/paddle/phi/tests/ops/test_op_signature.h
+++ b/paddle/phi/tests/ops/test_op_signature.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <gtest/gtest.h>
+
 #include <memory>
 #include <unordered_map>
 #include <unordered_set>
diff --git a/paddle/phi/tools/CMakeLists.txt b/paddle/phi/tools/CMakeLists.txt
index 5693a46d97721..d1df5ec327546 100644
--- a/paddle/phi/tools/CMakeLists.txt
+++ b/paddle/phi/tools/CMakeLists.txt
@@ -1,8 +1,8 @@
 add_executable(print_pten_kernels print_pten_kernels.cc)
 target_link_libraries(print_pten_kernels phi phi_api_utils)
 if(WIN32)
-    target_link_libraries(print_pten_kernels shlwapi.lib)
+  target_link_libraries(print_pten_kernels shlwapi.lib)
 endif()
 if(WITH_ROCM)
-    target_link_libraries(print_pten_kernels ${ROCM_HIPRTC_LIB})
+  target_link_libraries(print_pten_kernels ${ROCM_HIPRTC_LIB})
 endif()
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
index 68cb5a19f99ab..ced98cc643e83 100644
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -1,7 +1,13 @@
-configure_file(submit_local.sh.in
-    paddle
-    @ONLY)
+configure_file(submit_local.sh.in paddle @ONLY)
 
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin
-        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
+install(
+  FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle
+  DESTINATION bin
+  PERMISSIONS
+    OWNER_EXECUTE
+    OWNER_WRITE
+    OWNER_READ
+    GROUP_EXECUTE
+    GROUP_READ
+    WORLD_EXECUTE
+    WORLD_READ)
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index efd2de5621604..b3862ea6b3232 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -360,6 +360,10 @@ function check_style() {
     # pre-commit use python3.8.0 
     OLD_PATH=$PATH
     export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH}
+    
+    if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then
+        pip install pre-commit==2.17.0
+    fi
 
     pre-commit install
     clang-format --version
@@ -477,10 +481,10 @@ EOF
 }
 
 function cmake_gen_and_build() {
-    startTime_s=`date +%s`
+    startTime_s=$(date +%s)  # epoch seconds at start; replaced below by startTime_firstBuild when that is set
     cmake_gen $1
     build $2
-    endTime_s=`date +%s`
+    endTime_s=$(date +%s)  # epoch seconds at end; the difference is reported as "Build Time"
     [ -n "$startTime_firstBuild" ] && startTime_s=$startTime_firstBuild
     echo "Build Time: $[ $endTime_s - $startTime_s ]s"
     echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
@@ -1127,7 +1131,6 @@ function check_diff_file_for_coverage() {
     diff_h_file=$(git diff --name-status test develop | awk '$1 != "D" {print $2}' | grep '\.h$' | awk -F "/" '{printf "%s,",$NF}')
     diff_cc_file=$(git diff --name-status test develop | awk '$1 != "D" {print $2}' | grep -E '\.(cc|c)$' | awk -F "/" '{printf "%s,",$NF}')
     diff_py_file=$(git diff --name-status test develop | grep '\.py$' | awk '$1 != "D" {printf "%s,",$2}')
-
     export PADDLE_GIT_DIFF_H_FILE=${diff_h_file%*,}
     export PADDLE_GIT_DIFF_CC_FILE=${diff_cc_file%*,}
     export PADDLE_GIT_DIFF_PY_FILE=${diff_py_file%*,}
@@ -1563,6 +1566,10 @@ set +x
         card_test "$exclusive_tests_medium_parallel" -1 4                  # run cases exclusively, in this cases would be run with 2/4/8 GPUs
         card_test "$exclusive_tests_non_parallel" -1 2                # run cases exclusively, in this cases would be run with 2/4/8 GPUs
         exclu_ut_endTime_s=`date +%s`
+        
+        echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" 
+        echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multi_ut_endTime_s - $multi_ut_startTime_s ]s" 
+        echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $exclu_ut_endTime_s - $exclu_ut_startTime_s ]s" 
 
         echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
         echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multi_ut_endTime_s - $multi_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
@@ -1684,6 +1691,68 @@ set -ex
     fi
 }
 
+function classify_case_by_cardNum() {  # Bucket ctest cases by RUN_TYPE label into regex lists and save them to a file
+    cd ${PADDLE_ROOT}/build
+    test_cases=$(ctest -N -V) # verbose listing of all test cases; the loop below scans it for RUN_TYPE labels
+    single_card_tests='^job$'                           # regex alternation of cases that take a single GPU
+    multiple_card_tests='^job$'
+    exclusive_card_tests='^job$'
+    nightly_tests='^job$'
+
+    is_exclusive=''           # non-empty when the last scanned label line contained RUN_TYPE=EXCLUSIVE
+    is_multicard=''           # non-empty when the case is a multiple-GPU (RUN_TYPE=DIST or test_dist_) case
+    is_nightly=''             # non-empty when the last scanned label line contained a NIGHTLY run type
+set +x
+    while read -r line; do
+        if [[ "$line" == "" ]]; then
+            continue
+        fi
+            read matchstr <<< $(echo "$line"|grep -oEi 'Test[ \t]+#')
+            if [[ "$matchstr" == "" ]]; then
+                # Every line without a "Test #" marker is scanned for labels here and overwrites all three flags.
+                # RUN_TYPE=EXCLUSIVE means the case runs exclusively.
+                # RUN_TYPE=DIST means the case takes two GPUs at runtime.
+                # RUN_TYPE=NIGHTLY, RUN_TYPE=DIST:NIGHTLY or RUN_TYPE=EXCLUSIVE:NIGHTLY means the case ONLY runs at night.
+                read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE")
+                read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST")
+                read is_nightly <<< $(echo "$line"|grep -oEi "RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY")
+                continue
+            fi
+            read testcase <<< $(echo "$line"|grep -oEi "\w+$")
+            # With NIGHTLY_MODE unset or OFF, nightly-only cases are recorded in nightly_case and not classified further.
+            if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
+                echo $testcase" will only run at night."
+                nightly_tests="$nightly_tests|^$testcase$" 
+                echo "$testcase" >> ${PADDLE_ROOT}/build/nightly_case
+                continue
+            fi
+
+            if [[ "$is_multicard" == "" ]]; then
+                # trick: treat any case whose name contains "test_dist_" as a dist case (meant to run on 2 GPUs)
+                read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_")
+            fi
+            if [[ "$is_exclusive" != "" ]]; then
+                exclusive_card_tests="$exclusive_card_tests|^$testcase$"
+            elif [[ "$is_multicard" != "" ]]; then
+                multiple_card_tests="$multiple_card_tests|^$testcase$"
+            else
+                single_card_tests="$single_card_tests|^$testcase$"
+            fi
+            is_exclusive=''  # clear per-case state before reading the next line
+            is_multicard=''
+            is_nightly=''
+            matchstr=''
+            testcase=''
+    done <<< "$test_cases"; 
+set -x
+    rm -rf ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt
+    touch ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt
+    echo 'single_card_tests: '$single_card_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt
+    echo 'multiple_card_tests: '$multiple_card_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt
+    echo 'exclusive_card_tests: '$exclusive_card_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt
+    echo 'nightly_tests: '$nightly_tests >> ${PADDLE_ROOT}/build/classify_case_by_cardNum.txt
+}
+
 function show_ut_retry_result() {
     if [ "$SYSTEM" == "Darwin" ]; then
         exec_retry_threshold_count=10
@@ -1921,8 +1990,15 @@ set -x
     #generate ut file map
     python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT}
 
+
+    wait;
+    #classify_case_by_cardNum
+    classify_case_by_cardNum    
+    
     #generate ut mem map
-    python ${PADDLE_ROOT}/tools/get_ut_mem_map.py $tmp_dir 
+    python ${PADDLE_ROOT}/tools/get_ut_mem_map.py $tmp_dir
+    python ${PADDLE_ROOT}/tools/final_ut_parallel_rule.py ${PADDLE_ROOT}
+    
 }
 
 function get_failedUts_precise_map_file {
@@ -2288,6 +2364,220 @@ set -ex
     fi   
 }
 
+function parallel_test_base_gpu_test() {  # Run the grouped case files through card_test, then retry failed tests
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running unit tests in parallel way ...
+    ========================================
+EOF
+
+        # Pre-checks: abort on duplicated test names, then run newly added tests with a 15 s timeout.
+set -x
+        # Export TEST_NUM_PERCENT_CASES so that trt_convert tests run 15% of their cases.
+        export TEST_NUM_PERCENT_CASES=0.15
+        precison_cases=""  # NOTE(review): not read anywhere in this function
+        bash $PADDLE_ROOT/tools/check_added_ut.sh
+        if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
+            python3.7 $PADDLE_ROOT/tools/get_pr_ut.py
+        fi
+        if [ -a "$PADDLE_ROOT/duplicate_ut" ];then
+            duplicate_uts=$(cat $PADDLE_ROOT/duplicate_ut|sed -e 's/\r//g')
+            if [[ "$duplicate_uts" != "" ]];then
+                set +x
+                echo "========================================"
+                echo "The new unit test has the same name as the existing unit test"
+                cat "$PADDLE_ROOT/duplicate_ut"
+                echo "========================================"
+                exit 102;
+                set -x  # NOTE(review): unreachable, follows exit 102
+            fi
+        fi
+        if [ -a "$PADDLE_ROOT/added_ut" ];then
+            added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$
+            env CUDA_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$?
+            ctest -R "(${added_uts})" -L "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error_1=$?
+            if [ "$added_ut_error" != 0 ] && [ "$added_ut_error_1" != 0 ];then  # fails only when BOTH runs above returned non-zero
+                echo "========================================"
+                echo "Added UT should not exceed 15 seconds"
+                echo "========================================"
+                exit 8;
+            fi
+        fi
+set +x
+        EXIT_CODE=0;
+        wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/CTestCostData.txt --no-check-certificate
+        mkdir -p ${PADDLE_ROOT}/build/Testing/Temporary/
+        cp -r ${PADDLE_ROOT}/build/CTestCostData.txt ${PADDLE_ROOT}/build/Testing/Temporary/
+        # NOTE(review): the cp above reads from ${PADDLE_ROOT}/build, so wget presumably runs with that as the cwd - confirm
+        ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' > all_ut_list
+        get_quickly_disable_ut||disable_ut_quickly='disable_ut'    # if the helper returns non-zero, fall back to 'disable_ut'
+        test_cases=$(ctest -N -V) # get all test cases
+        # NOTE(review): the *_new / no_parallel_case_file inputs read below are presumably produced by this script - confirm
+        python ${PADDLE_ROOT}/tools/group_case_for_parallel.py ${PADDLE_ROOT}
+        # Each phase below passes one input line per card_test call and logs the elapsed seconds to stdout and build_summary.txt.
+        single_ut_mem_0_startTime_s=`date +%s`
+        while read line
+        do
+            card_test "$line" 1 4
+        done < $PADDLE_ROOT/tools/single_card_tests_mem0_new
+        single_ut_mem_0_endTime_s=`date +%s`
+        echo "ipipe_log_param_1_mem_0_TestCases_Total_Time: $[ $single_ut_mem_0_endTime_s - $single_ut_mem_0_startTime_s ]s" 
+        echo "ipipe_log_param_1_mem_0_TestCases_Total_Time: $[ $single_ut_mem_0_endTime_s - $single_ut_mem_0_startTime_s ]s"  >> ${PADDLE_ROOT}/build/build_summary.txt
+
+        single_ut_startTime_s=`date +%s`
+        while read line
+        do
+            num=$[(`echo $line | awk -F"$" '{print NF-1}'`-1)/6]  # (count of '$' in the line - 1) / 6; raised to 1 below when 0
+            if [ $num -eq 0 ]; then
+                num=1
+            fi
+            card_test "$line" 1 $num
+        done < $PADDLE_ROOT/tools/single_card_tests_new
+        single_ut_endTime_s=`date +%s`
+        echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" 
+        echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s"   >> ${PADDLE_ROOT}/build/build_summary.txt
+
+        multiple_ut_mem_0_startTime_s=`date +%s`
+        while read line
+        do
+            card_test "$line" 2 4
+        done < $PADDLE_ROOT/tools/multiple_card_tests_mem0_new
+        multiple_ut_mem_0_endTime_s=`date +%s`
+        echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $[ $multiple_ut_mem_0_endTime_s - $multiple_ut_mem_0_startTime_s ]s" 
+        echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $[ $multiple_ut_mem_0_endTime_s - $multiple_ut_mem_0_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+        multiple_ut_startTime_s=`date +%s`
+        while read line
+        do
+            num=$[(`echo $line | awk -F"$" '{print NF-1}'`-1)/6]
+            if [ $num -eq 0 ]; then
+                num=1
+            fi
+            card_test "$line" 2 $num 
+
+        done < $PADDLE_ROOT/tools/multiple_card_tests_new
+        multiple_ut_endTime_s=`date +%s`
+        echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multiple_ut_endTime_s - $multiple_ut_startTime_s ]s" 
+        echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multiple_ut_endTime_s - $multiple_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+
+        exclusive_ut_mem_0_startTime_s=`date +%s`
+        while read line
+        do
+            card_test "$line" -1 4
+        done < $PADDLE_ROOT/tools/exclusive_card_tests_mem0_new
+        exclusive_ut_mem_0_endTime_s=`date +%s`
+        echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $[ $exclusive_ut_mem_0_endTime_s - $exclusive_ut_mem_0_startTime_s ]s" 
+        echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $[ $exclusive_ut_mem_0_endTime_s - $exclusive_ut_mem_0_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+
+        exclusive_ut_startTime_s=`date +%s`
+        while read line
+        do
+            num=$[(`echo $line | awk -F"$" '{print NF-1}'`-1)/6]
+            if [ $num -eq 0 ]; then
+                num=1
+            fi
+            card_test "$line" -1 $num 
+        done < $PADDLE_ROOT/tools/exclusive_card_tests_new
+        exclusive_ut_endTime_s=`date +%s`
+        echo "ipipe_log_param_-1_TestCases_Total_Time: $[ $exclusive_ut_endTime_s - $exclusive_ut_startTime_s ]s"
+        echo "ipipe_log_param_-1_TestCases_Total_Time: $[ $exclusive_ut_endTime_s - $exclusive_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+        
+        noparallel_ut_startTime_s=`date +%s`
+        while read line
+        do
+            card_test "$line" -1 2
+        done < $PADDLE_ROOT/tools/no_parallel_case_file
+        noparallel_ut_endTime_s=`date +%s`
+        echo "ipipe_log_param_noparallel_TestCases_Total_Time: $[ $noparallel_ut_endTime_s - $noparallel_ut_startTime_s ]s"
+        echo "ipipe_log_param_noparallel_TestCases_Total_Time: $[ $noparallel_ut_endTime_s - $noparallel_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt   
+        ### retry: up to $retry_time rounds; the first round needs < 120 failed tests, the second < 30, otherwise stop
+        collect_failed_tests
+        rm -f $tmp_dir/*
+        exec_times=0
+        retry_unittests_record=''
+        retry_time=4
+        exec_time_array=('first' 'second' 'third' 'fourth')
+        parallel_failed_tests_exec_retry_threshold=120
+        exec_retry_threshold=30
+        is_retry_execuate=0
+        rerun_ut_startTime_s=`date +%s`
+        if [ -n "$failed_test_lists" ];then
+            if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then
+                bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists"    # print logs for timed-out tests that were killed by ctest
+            fi
+            read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+            need_retry_ut_arr=(${need_retry_ut_str})
+            need_retry_ut_count=${#need_retry_ut_arr[@]}
+            read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+            while ( [ $exec_times -lt $retry_time ] )
+                do
+                    if [[ "${exec_times}" == "0" ]] ;then
+                        if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    elif [[ "${exec_times}" == "1" ]] ;then
+                        read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                        need_retry_ut_arr=(${need_retry_ut_str})
+                        need_retry_ut_count=${#need_retry_ut_arr[@]} 
+                        if [ $need_retry_ut_count -lt $exec_retry_threshold ];then
+                            is_retry_execuate=0
+                        else
+                            is_retry_execuate=1
+                        fi
+                    fi
+                    if [[ "$is_retry_execuate" == "0" ]];then
+                        set +e
+                        retry_unittests_record="$retry_unittests_record$failed_test_lists"
+                        failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+                        set -e
+                        if [[ "${exec_times}" == "1" ]] || [[ "${exec_times}" == "2" ]];then
+                            if [[ "${failed_test_lists}" == "" ]];then
+                                break
+                            else
+                                read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' )
+                            fi
+                        fi
+                        echo "========================================="
+                        echo "This is the ${exec_time_array[$exec_times]} time to re-run"
+                        echo "========================================="
+                        echo "The following unittest will be re-run:"
+                        echo "${retry_unittests}"                    
+                        for line in ${retry_unittests[@]} ;
+                            do
+                                if [[ "$retry_cases" == "" ]]; then
+                                    retry_cases="^$line$"
+                                else
+                                    retry_cases="$retry_cases|^$line$"
+                                fi
+                            done
+
+                        if [[ "$retry_cases" != "" ]]; then
+                            card_test "$retry_cases" -1 2
+                        fi
+                        exec_times=$[$exec_times+1]
+                        failed_test_lists=''
+                        collect_failed_tests
+                        rm -f $tmp_dir/*
+                        retry_cases=''
+                    else 
+                        break
+                    fi 
+                done
+            retry_unittests_record="$retry_unittests_record$failed_test_lists"
+        fi
+        rerun_ut_endTime_s=`date +%s`
+        echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s"
+        echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
+        cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/
+        if [[ "$EXIT_CODE" != "0" ]]; then
+            show_ut_retry_result
+        fi
+set -ex
+    fi
+}
+
 function parallel_test_base_ipu() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu
@@ -2424,7 +2714,7 @@ function parallel_test() {
     if [ "$WITH_CINN" == "ON" ];then
         parallel_test_base_cinn
     elif [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then
-        parallel_test_base_gpu
+        parallel_test_base_gpu_test
     elif [ "$WITH_XPU" == "ON" ];then
         parallel_test_base_xpu
     elif [ "$WITH_ASCEND_CL" == "ON" ];then
@@ -3127,7 +3417,6 @@ function main() {
         parallel_test
         ;;
       cicheck_coverage)
-        check_approvals_of_unittest 1
         check_diff_file_for_coverage
         cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
         enable_unused_var_check
@@ -3136,13 +3425,11 @@ function main() {
         check_change_of_unittest ${PYTHON_ABI:-""}
         ;;
       cpu_cicheck_coverage)
-        check_approvals_of_unittest 1
         check_diff_file_for_coverage
         cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
         enable_unused_var_check
         ;;
       gpu_cicheck_coverage)
-        check_approvals_of_unittest 1
         parallel_test
         check_coverage
         check_change_of_unittest ${PYTHON_ABI:-""}
@@ -3294,6 +3581,10 @@ function main() {
         # only test trt convert.
         trt_convert_test
         ;;
+      classify_case_by_cardNum)
+        # only classify test cases by card number
+        classify_case_by_cardNum
+        ;;
       *)
         print_usage
         exit 1
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index f5cfd14e6b84c..7b02aef22e8dc 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -1,11 +1,21 @@
 # for paddle test case
 
 if(WITH_TESTING)
-  set(paddle_gtest_main_deps device_context gtest gflags init memory phi_utils proto_desc)
+  set(paddle_gtest_main_deps
+      device_context
+      gtest
+      gflags
+      init
+      memory
+      phi_utils
+      proto_desc)
 
-  if (WITH_GPU OR WITH_ROCM)
+  if(WITH_GPU OR WITH_ROCM)
     list(APPEND paddle_gtest_main_deps gpu_info)
   endif()
 
-  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS ${paddle_gtest_main_deps})
+  cc_library(
+    paddle_gtest_main
+    SRCS paddle_gtest_main.cc
+    DEPS ${paddle_gtest_main_deps})
 endif()
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 7669c06b2c2b7..a428b176d67ed 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -1,5 +1,14 @@
 add_subdirectory(string)
 
-cc_test(array_ref_test SRCS array_ref_test.cc DEPS gtest gflags)
-cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags)
-cc_test(variant_test SRCS variant_test.cc DEPS gtest)
+cc_test(
+  array_ref_test
+  SRCS array_ref_test.cc
+  DEPS gtest gflags)
+cc_test(
+  small_vector_test
+  SRCS small_vector_test.cc
+  DEPS gtest gflags)
+cc_test(
+  variant_test
+  SRCS variant_test.cc
+  DEPS gtest)
diff --git a/paddle/utils/flat_hash_map.h b/paddle/utils/flat_hash_map.h
index 64a75fffa5767..56318ab90e6c8 100644
--- a/paddle/utils/flat_hash_map.h
+++ b/paddle/utils/flat_hash_map.h
@@ -36,11 +36,11 @@ struct functor_storage : Functor {
   functor_storage() = default;
   functor_storage(const Functor &functor) : Functor(functor) {}
   template <typename... Args>
-  Result operator()(Args &&... args) {
+  Result operator()(Args &&...args) {
     return static_cast<Functor &>(*this)(std::forward<Args>(args)...);
   }
   template <typename... Args>
-  Result operator()(Args &&... args) const {
+  Result operator()(Args &&...args) const {
     return static_cast<const Functor &>(*this)(std::forward<Args>(args)...);
   }
 };
@@ -136,7 +136,7 @@ struct sherwood_v3_entry {
   bool is_empty() const { return distance_from_desired < 0; }
   bool is_at_desired_position() const { return distance_from_desired <= 0; }
   template <typename... Args>
-  void emplace(int8_t distance, Args &&... args) {
+  void emplace(int8_t distance, Args &&...args) {
     new (std::addressof(value)) T(std::forward<Args>(args)...);
     distance_from_desired = distance;
   }
@@ -317,9 +317,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
   }
   sherwood_v3_table(sherwood_v3_table &&other,
                     const ArgumentAlloc &alloc) noexcept
-      : EntryAlloc(alloc),
-        Hasher(std::move(other)),
-        Equal(std::move(other)) {
+      : EntryAlloc(alloc), Hasher(std::move(other)), Equal(std::move(other)) {
     swap_pointers(other);
   }
   sherwood_v3_table &operator=(const sherwood_v3_table &other) {
@@ -476,7 +474,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
   }
 
   template <typename Key, typename... Args>
-  std::pair<iterator, bool> emplace(Key &&key, Args &&... args) {
+  std::pair<iterator, bool> emplace(Key &&key, Args &&...args) {
     size_t index =
         hash_policy.index_for_hash(hash_object(key), num_slots_minus_one);
     EntryPointer current_entry = entries + ptrdiff_t(index);
@@ -499,7 +497,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
     return emplace(std::move(value));
   }
   template <typename... Args>
-  iterator emplace_hint(const_iterator, Args &&... args) {
+  iterator emplace_hint(const_iterator, Args &&...args) {
     return emplace(std::forward<Args>(args)...).first;
   }
   iterator insert(const_iterator, const value_type &value) {
@@ -702,7 +700,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
   emplace_new_key(int8_t distance_from_desired,
                   EntryPointer current_entry,
                   Key &&key,
-                  Args &&... args) {
+                  Args &&...args) {
     using std::swap;
     if (num_slots_minus_one == 0 || distance_from_desired == max_lookups ||
         num_elements + 1 >
@@ -789,7 +787,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
     }
   };
 };
-}
+}  // namespace detailv3
 
 struct prime_number_hash_policy {
   static size_t mod0(size_t) { return 0llu; }
@@ -1708,7 +1706,7 @@ class flat_hash_set
   flat_hash_set() {}
 
   template <typename... Args>
-  std::pair<typename Table::iterator, bool> emplace(Args &&... args) {
+  std::pair<typename Table::iterator, bool> emplace(Args &&...args) {
     return Table::emplace(T(std::forward<Args>(args)...));
   }
   std::pair<typename Table::iterator, bool> emplace(const key_type &arg) {
diff --git a/paddle/utils/none.h b/paddle/utils/none.h
index d2da8f26a118f..965855f556226 100644
--- a/paddle/utils/none.h
+++ b/paddle/utils/none.h
@@ -21,7 +21,7 @@ namespace paddle {
 
 namespace detail {
 struct none_helper {};
-}
+}  // namespace detail
 
 typedef int detail::none_helper::*none_t;
 
diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h
index 2b5a657f4d42e..ec245500551c8 100644
--- a/paddle/utils/optional.h
+++ b/paddle/utils/optional.h
@@ -38,7 +38,7 @@ template <class T, class Factory>
 void construct(Factory const& factory, void* address) {
   factory.template apply<T>(address);
 }
-}
+}  // namespace paddle_optional_detail
 
 template <typename T>
 class optional;
diff --git a/paddle/utils/small_vector.h b/paddle/utils/small_vector.h
index 27db9ae18822a..5a8abdb4d2386 100644
--- a/paddle/utils/small_vector.h
+++ b/paddle/utils/small_vector.h
@@ -52,8 +52,7 @@ class iterator_range {
   template <typename Container>
   iterator_range(Container &&c)
       // TODO: Consider ADL/non-member begin/end calls.
-      : begin_iterator(c.begin()),
-        end_iterator(c.end()) {}
+      : begin_iterator(c.begin()), end_iterator(c.end()) {}
   iterator_range(IteratorT begin_iterator, IteratorT end_iterator)
       : begin_iterator(std::move(begin_iterator)),
         end_iterator(std::move(end_iterator)) {}
@@ -132,10 +131,8 @@ class small_vector_base {
 };
 
 template <class T>
-using SmallVectorSizeType =
-    typename std::conditional<sizeof(T) < 4 && sizeof(void *) >= 8,
-                              uint64_t,
-                              uint32_t>::type;
+using SmallVectorSizeType = typename std::
+    conditional<sizeof(T) < 4 && sizeof(void *) >= 8, uint64_t, uint32_t>::type;
 
 /// Figure out the offset of the first element.
 template <class T, typename = void>
@@ -296,8 +293,8 @@ class small_vector_template_common
   using Base::size;
 
   // forward iterator creation methods.
-  iterator begin() { return (iterator) this->BeginX; }
-  const_iterator begin() const { return (const_iterator) this->BeginX; }
+  iterator begin() { return (iterator)this->BeginX; }
+  const_iterator begin() const { return (const_iterator)this->BeginX; }
   iterator end() { return begin() + size(); }
   const_iterator end() const { return begin() + size(); }
 
@@ -451,7 +448,7 @@ class small_vector_template_base : public small_vector_template_common<T> {
   }
 
   template <typename... ArgTypes>
-  T &growAndEmplaceBack(ArgTypes &&... Args) {
+  T &growAndEmplaceBack(ArgTypes &&...Args) {
     // Grow manually in case one of Args is an internal reference.
     size_t NewCapacity;
     T *NewElts = mallocForGrow(0, NewCapacity);
@@ -599,7 +596,7 @@ class small_vector_template_base<T, true>
   }
 
   template <typename... ArgTypes>
-  T &growAndEmplaceBack(ArgTypes &&... Args) {
+  T &growAndEmplaceBack(ArgTypes &&...Args) {
     // Use push_back with a copy in case Args has an internal reference,
     // side-stepping reference invalidation problems without losing the realloc
     // optimization.
@@ -972,7 +969,7 @@ class small_vector_impl : public small_vector_template_base<T> {
   }
 
   template <typename... ArgTypes>
-  reference emplace_back(ArgTypes &&... Args) {
+  reference emplace_back(ArgTypes &&...Args) {
     if (this->size() >= this->capacity())
       return this->growAndEmplaceBack(std::forward<ArgTypes>(Args)...);
 
@@ -1359,7 +1356,7 @@ struct Struct16B {
 struct Struct32B {
   alignas(32) void *X;
 };
-}
+}  // namespace
 static_assert(sizeof(small_vector<void *, 0>) ==
                   sizeof(unsigned) * 2 + sizeof(void *),
               "wasted space in small_vector size 0");
diff --git a/paddle/utils/string/CMakeLists.txt b/paddle/utils/string/CMakeLists.txt
index db3cb542ba374..3e35da9d62d73 100644
--- a/paddle/utils/string/CMakeLists.txt
+++ b/paddle/utils/string/CMakeLists.txt
@@ -1,8 +1,26 @@
-cc_library(stringpiece SRCS piece.cc DEPS flags)
-cc_library(pretty_log SRCS pretty_log.cc DEPS flags)
-cc_library(string_helper SRCS string_helper.cc DEPS flags)
-cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece gflags)
-cc_test(stringprintf_test SRCS printf_test.cc DEPS gflags)
+cc_library(
+  stringpiece
+  SRCS piece.cc
+  DEPS flags)
+cc_library(
+  pretty_log
+  SRCS pretty_log.cc
+  DEPS flags)
+cc_library(
+  string_helper
+  SRCS string_helper.cc
+  DEPS flags)
+cc_test(
+  stringpiece_test
+  SRCS piece_test.cc
+  DEPS stringpiece gflags)
+cc_test(
+  stringprintf_test
+  SRCS printf_test.cc
+  DEPS gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
 cc_test(split_test SRCS split_test.cc)
-cc_test(string_helper_test SRCS string_helper_test.cc DEPS string_helper)
+cc_test(
+  string_helper_test
+  SRCS string_helper_test.cc
+  DEPS string_helper)
diff --git a/paddle/utils/string/piece.cc b/paddle/utils/string/piece.cc
index 305ac85a5320e..ae62f53378f0b 100644
--- a/paddle/utils/string/piece.cc
+++ b/paddle/utils/string/piece.cc
@@ -15,6 +15,7 @@
 #include "paddle/utils/string/piece.h"
 
 #include <string.h>
+
 #include <algorithm>
 #define CHAR_POINTER_CMP(a, b) \
   do {                         \
@@ -71,8 +72,9 @@ bool operator<=(Piece x, Piece y) { return Compare(x, y) <= 0; }
 bool operator>=(Piece x, Piece y) { return Compare(x, y) >= 0; }
 
 bool HasPrefix(Piece s, Piece x) {
-  return !x.len() ? true : ((s.len() >= x.len()) &&
-                            (memcmp(s.data(), x.data(), x.len()) == 0));
+  return !x.len() ? true
+                  : ((s.len() >= x.len()) &&
+                     (memcmp(s.data(), x.data(), x.len()) == 0));
 }
 
 bool HasSuffix(Piece s, Piece x) {
diff --git a/paddle/utils/string/pretty_log.cc b/paddle/utils/string/pretty_log.cc
index b014c6de20d85..90d3a6c1c4cd3 100644
--- a/paddle/utils/string/pretty_log.cc
+++ b/paddle/utils/string/pretty_log.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/utils/string/pretty_log.h"
+
 #include "gflags/gflags.h"
 
 DEFINE_bool(color, true, "Whether to turn on pretty log");
diff --git a/paddle/utils/string/pretty_log.h b/paddle/utils/string/pretty_log.h
index 9a8038f3a8bef..9de7ce24abd72 100644
--- a/paddle/utils/string/pretty_log.h
+++ b/paddle/utils/string/pretty_log.h
@@ -17,8 +17,8 @@
 #include <sstream>
 #include <string>
 #include <utility>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/utils/string/printf.h"
 
 DECLARE_bool(color);
@@ -59,30 +59,30 @@ struct Style {
 template <typename... Args>
 static void PrettyLogEndl(const std::string &style,
                           const char *fmt,
-                          const Args &... args) {
+                          const Args &...args) {
   std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
 }
 template <typename... Args>
 static void PrettyLog(const std::string &style,
                       const char *fmt,
-                      const Args &... args) {
+                      const Args &...args) {
   std::cerr << style << Sprintf(fmt, args...) << reset();
 }
 
 template <typename... Args>
-static void PrettyLogInfo(const char *fmt, const Args &... args) {
+static void PrettyLogInfo(const char *fmt, const Args &...args) {
   PrettyLogEndl(Style::info(), fmt, args...);
 }
 template <typename... Args>
-static void PrettyLogDetail(const char *fmt, const Args &... args) {
+static void PrettyLogDetail(const char *fmt, const Args &...args) {
   PrettyLogEndl(Style::detail(), fmt, args...);
 }
 template <typename... Args>
-static void PrettyLogH1(const char *fmt, const Args &... args) {
+static void PrettyLogH1(const char *fmt, const Args &...args) {
   PrettyLogEndl(Style::H1(), fmt, args...);
 }
 template <typename... Args>
-static void PrettyLogH2(const char *fmt, const Args &... args) {
+static void PrettyLogH2(const char *fmt, const Args &...args) {
   PrettyLogEndl(Style::H2(), fmt, args...);
 }
 
diff --git a/paddle/utils/string/string_helper.cc b/paddle/utils/string/string_helper.cc
index 37b9e9ce4e513..2b694de4b5834 100644
--- a/paddle/utils/string/string_helper.cc
+++ b/paddle/utils/string/string_helper.cc
@@ -16,6 +16,7 @@
 
 #include <ctype.h>
 #include <stdio.h>
+
 #include <cstring>
 #include <string>
 
diff --git a/paddle/utils/string/string_helper.h b/paddle/utils/string/string_helper.h
index e6cb2e90b8fa1..f34ae49fcfa15 100644
--- a/paddle/utils/string/string_helper.h
+++ b/paddle/utils/string/string_helper.h
@@ -17,6 +17,7 @@
 #include <assert.h>
 #include <ctype.h>
 #include <stdio.h>
+
 #include <cstring>
 #include <sstream>
 #include <string>
diff --git a/paddle/utils/string/tinyformat/tinyformat.h b/paddle/utils/string/tinyformat/tinyformat.h
index 4e46cbc26b638..f9c55fe1835fd 100644
--- a/paddle/utils/string/tinyformat/tinyformat.h
+++ b/paddle/utils/string/tinyformat/tinyformat.h
@@ -846,7 +846,7 @@ template <int N>
 class FormatListN : public FormatList {
  public:
   template <typename... Args>
-  FormatListN(const Args &... args)  // NOLINT
+  FormatListN(const Args &...args)  // NOLINT
       : FormatList(&m_formatterStore[0], N),
         m_formatterStore{FormatArg(args)...} {
     static_assert(sizeof...(args) == N, "Number of args must be N");
@@ -875,7 +875,7 @@ class FormatListN<0> : public FormatList {
 ///
 ///   FormatListRef formatList = makeFormatList( /*...*/ );
 template <typename... Args>
-detail::FormatListN<sizeof...(Args)> makeFormatList(const Args &... args) {
+detail::FormatListN<sizeof...(Args)> makeFormatList(const Args &...args) {
   return detail::FormatListN<sizeof...(args)>(args...);
 }  // NOLINT
 
@@ -889,14 +889,14 @@ inline void vformat(std::ostream &out, const char *fmt, FormatListRef list) {
 
 /// Format list of arguments to the stream according to given format string.
 template <typename... Args>
-void format(std::ostream &out, const char *fmt, const Args &... args) {
+void format(std::ostream &out, const char *fmt, const Args &...args) {
   vformat(out, fmt, makeFormatList(args...));
 }
 
 /// Format list of arguments according to the given format string and return
 /// the result as a string.
 template <typename... Args>
-std::string format(const char *fmt, const Args &... args) {
+std::string format(const char *fmt, const Args &...args) {
   std::ostringstream oss;
   format(oss, fmt, args...);
   return oss.str();
@@ -904,12 +904,12 @@ std::string format(const char *fmt, const Args &... args) {
 
 /// Format list of arguments to std::cout, according to the given format string
 template <typename... Args>
-void printf(const char *fmt, const Args &... args) {
+void printf(const char *fmt, const Args &...args) {
   format(std::cout, fmt, args...);
 }
 
 template <typename... Args>
-void printfln(const char *fmt, const Args &... args) {
+void printfln(const char *fmt, const Args &...args) {
   format(std::cout, fmt, args...);
   std::cout << '\n';
 }
diff --git a/paddle/utils/string/to_string_test.cc b/paddle/utils/string/to_string_test.cc
index 778ba8bb113a2..740e4435fc3b7 100644
--- a/paddle/utils/string/to_string_test.cc
+++ b/paddle/utils/string/to_string_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/string/to_string.h"
+
 #include <gtest/gtest.h>
 
 constexpr char kOutputString[] = "User Defined Output";
diff --git a/paddle/utils/variant.h b/paddle/utils/variant.h
index 4348abc9cbff0..a6822920dd705 100644
--- a/paddle/utils/variant.h
+++ b/paddle/utils/variant.h
@@ -429,10 +429,10 @@ inline constexpr remove_reference_t<T> &&move(T &&t) noexcept {
 }
 
 #ifdef MPARK_INTEGER_SEQUENCE
-using std::integer_sequence;
 using std::index_sequence;
-using std::make_index_sequence;
 using std::index_sequence_for;
+using std::integer_sequence;
+using std::make_index_sequence;
 #else
 template <typename T, T... Is>
 struct integer_sequence {
@@ -597,14 +597,14 @@ struct Invoke;
 template <>
 struct Invoke<true /* pmf */, 0 /* is_base_of */> {
   template <typename R, typename T, typename Arg, typename... Args>
-  inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&... args)
+  inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&...args)
       MPARK_RETURN((lib::forward<Arg>(arg).*pmf)(lib::forward<Args>(args)...))
 };
 
 template <>
 struct Invoke<true /* pmf */, 1 /* is_reference_wrapper */> {
   template <typename R, typename T, typename Arg, typename... Args>
-  inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&... args)
+  inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&...args)
       MPARK_RETURN((lib::forward<Arg>(arg).get().*
                     pmf)(lib::forward<Args>(args)...))
 };
@@ -612,7 +612,7 @@ struct Invoke<true /* pmf */, 1 /* is_reference_wrapper */> {
 template <>
 struct Invoke<true /* pmf */, 2 /* otherwise */> {
   template <typename R, typename T, typename Arg, typename... Args>
-  inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&... args)
+  inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&...args)
       MPARK_RETURN(((*lib::forward<Arg>(arg)).*
                     pmf)(lib::forward<Args>(args)...))
 };
@@ -639,27 +639,29 @@ struct Invoke<false /* pmo */, 2 /* otherwise */> {
 };
 
 template <typename R, typename T, typename Arg, typename... Args>
-inline constexpr auto invoke(R T::*f, Arg &&arg, Args &&... args) MPARK_RETURN(
-    Invoke<std::is_function<R>::value,
-           (std::is_base_of<T, lib::decay_t<Arg>>::value
-                ? 0
-                : is_reference_wrapper<lib::decay_t<Arg>>::value ? 1 : 2)>::
-        invoke(f, lib::forward<Arg>(arg), lib::forward<Args>(args)...))
+inline constexpr auto invoke(R T::*f, Arg &&arg, Args &&...args)
+    MPARK_RETURN(Invoke<std::is_function<R>::value,
+                        (std::is_base_of<T, lib::decay_t<Arg>>::value ? 0
+                         : is_reference_wrapper<lib::decay_t<Arg>>::value
+                             ? 1
+                             : 2)>::invoke(f,
+                                           lib::forward<Arg>(arg),
+                                           lib::forward<Args>(args)...))
 
 #ifdef _MSC_VER
 #pragma warning(push)
 #pragma warning(disable : 4100)
 #endif
-    template <typename F, typename... Args>
-    inline constexpr auto invoke(F &&f, Args &&... args)
-        MPARK_RETURN(lib::forward<F>(f)(lib::forward<Args>(args)...))
+        template <typename F, typename... Args>
+        inline constexpr auto invoke(F &&f, Args &&...args)
+            MPARK_RETURN(lib::forward<F>(f)(lib::forward<Args>(args)...))
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif
 }  // namespace detail
 
 template <typename F, typename... Args>
-inline constexpr auto invoke(F &&f, Args &&... args)
+inline constexpr auto invoke(F &&f, Args &&...args)
     MPARK_RETURN(detail::invoke(lib::forward<F>(f),
                                 lib::forward<Args>(args)...))
 
@@ -842,10 +844,10 @@ using type_pack_element_t = typename type_pack_element<I, Ts...>::type;
 #endif
 
 #ifdef MPARK_TRIVIALITY_TYPE_TRAITS
-using std::is_trivially_copy_constructible;
-using std::is_trivially_move_constructible;
 using std::is_trivially_copy_assignable;
+using std::is_trivially_copy_constructible;
 using std::is_trivially_move_assignable;
+using std::is_trivially_move_constructible;
 #else
 template <typename T>
 struct is_trivially_copy_constructible
@@ -1049,12 +1051,14 @@ struct valueless_t {};
 enum class Trait { TriviallyAvailable, Available, Unavailable };
 
 template <typename T,
-          template <typename> class IsTriviallyAvailable,
-          template <typename> class IsAvailable>
+          template <typename>
+          class IsTriviallyAvailable,
+          template <typename>
+          class IsAvailable>
 inline constexpr Trait trait() {
-  return IsTriviallyAvailable<T>::value
-             ? Trait::TriviallyAvailable
-             : IsAvailable<T>::value ? Trait::Available : Trait::Unavailable;
+  return IsTriviallyAvailable<T>::value ? Trait::TriviallyAvailable
+         : IsAvailable<T>::value        ? Trait::Available
+                                        : Trait::Unavailable;
 }
 
 #ifdef MPARK_CPP14_CONSTEXPR
@@ -1195,7 +1199,7 @@ struct base {
 
     template <typename Visitor, typename... Alts>
     inline static constexpr DECLTYPE_AUTO invoke(Visitor &&visitor,
-                                                 Alts &&... alts)
+                                                 Alts &&...alts)
         DECLTYPE_AUTO_RETURN(lib::invoke(lib::forward<Visitor>(visitor),
                                          lib::forward<Alts>(alts)...))
   };
@@ -1230,12 +1234,12 @@ struct base {
   struct dispatcher<true, R, ITs...> {
     template <std::size_t B, typename F>
     MPARK_ALWAYS_INLINE static constexpr R dispatch(
-        F &&f, typename ITs::type &&... visited_vs) {
+        F &&f, typename ITs::type &&...visited_vs) {
       using Expected = R;
-      using Actual = decltype(
-          lib::invoke(lib::forward<F>(f),
-                      access::base::get_alt<ITs::value>(
-                          lib::forward<typename ITs::type>(visited_vs))...));
+      using Actual = decltype(lib::invoke(
+          lib::forward<F>(f),
+          access::base::get_alt<ITs::value>(
+              lib::forward<typename ITs::type>(visited_vs))...));
       return visit_return_type_check<Expected, Actual>::invoke(
           lib::forward<F>(f),
           access::base::get_alt<ITs::value>(
@@ -1244,7 +1248,7 @@ struct base {
 
     template <std::size_t B, typename F, typename V, typename... Vs>
     MPARK_ALWAYS_INLINE static constexpr R dispatch(
-        F &&f, typename ITs::type &&... visited_vs, V &&v, Vs &&... vs) {
+        F &&f, typename ITs::type &&...visited_vs, V &&v, Vs &&...vs) {
 #define MPARK_DISPATCH(I)                                                   \
   dispatcher<(I < lib::decay_t<V>::size()),                                 \
              R,                                                             \
@@ -1336,11 +1340,11 @@ struct base {
     }
 
     template <std::size_t I, typename F, typename... Vs>
-    MPARK_ALWAYS_INLINE static constexpr R dispatch_case(F &&f, Vs &&... vs) {
+    MPARK_ALWAYS_INLINE static constexpr R dispatch_case(F &&f, Vs &&...vs) {
       using Expected = R;
-      using Actual = decltype(
-          lib::invoke(lib::forward<F>(f),
-                      access::base::get_alt<I>(lib::forward<Vs>(vs))...));
+      using Actual = decltype(lib::invoke(
+          lib::forward<F>(f),
+          access::base::get_alt<I>(lib::forward<Vs>(vs))...));
       return visit_return_type_check<Expected, Actual>::invoke(
           lib::forward<F>(f),
           access::base::get_alt<I>(lib::forward<Vs>(vs))...);
@@ -1350,7 +1354,7 @@ struct base {
     MPARK_ALWAYS_INLINE static constexpr R dispatch_at(std::size_t index,
                                                        F &&f,
                                                        V &&v,
-                                                       Vs &&... vs) {
+                                                       Vs &&...vs) {
       static_assert(lib::all<(lib::decay_t<V>::size() ==
                               lib::decay_t<Vs>::size())...>::value,
                     "all of the variants must be the same size.");
@@ -1449,7 +1453,7 @@ struct base {
 
   template <typename F, typename... Fs>
   inline static constexpr lib::array<lib::decay_t<F>, sizeof...(Fs) + 1>
-  make_farray(F &&f, Fs &&... fs) {
+  make_farray(F &&f, Fs &&...fs) {
     return {{lib::forward<F>(f), lib::forward<Fs>(fs)...}};
   }
 
@@ -1457,11 +1461,11 @@ struct base {
   struct make_fmatrix_impl {
     template <std::size_t... Is>
     inline static constexpr dispatch_result_t<F, Vs...> dispatch(F &&f,
-                                                                 Vs &&... vs) {
+                                                                 Vs &&...vs) {
       using Expected = dispatch_result_t<F, Vs...>;
-      using Actual = decltype(
-          lib::invoke(lib::forward<F>(f),
-                      access::base::get_alt<Is>(lib::forward<Vs>(vs))...));
+      using Actual = decltype(lib::invoke(
+          lib::forward<F>(f),
+          access::base::get_alt<Is>(lib::forward<Vs>(vs))...));
       return visit_return_type_check<Expected, Actual>::invoke(
           lib::forward<F>(f),
           access::base::get_alt<Is>(lib::forward<Vs>(vs))...);
@@ -1515,11 +1519,11 @@ struct base {
   struct make_fdiagonal_impl {
     template <std::size_t I>
     inline static constexpr dispatch_result_t<F, Vs...> dispatch(F &&f,
-                                                                 Vs &&... vs) {
+                                                                 Vs &&...vs) {
       using Expected = dispatch_result_t<F, Vs...>;
-      using Actual = decltype(
-          lib::invoke(lib::forward<F>(f),
-                      access::base::get_alt<I>(lib::forward<Vs>(vs))...));
+      using Actual = decltype(lib::invoke(
+          lib::forward<F>(f),
+          access::base::get_alt<I>(lib::forward<Vs>(vs))...));
       return visit_return_type_check<Expected, Actual>::invoke(
           lib::forward<F>(f),
           access::base::get_alt<I>(lib::forward<Vs>(vs))...);
@@ -1571,8 +1575,7 @@ constexpr fdiagonal_t<F, Vs...> fdiagonal<F, Vs...>::value;
 
 struct alt {
   template <typename Visitor, typename... Vs>
-  inline static constexpr DECLTYPE_AUTO visit_alt(Visitor &&visitor,
-                                                  Vs &&... vs)
+  inline static constexpr DECLTYPE_AUTO visit_alt(Visitor &&visitor, Vs &&...vs)
 #ifdef MPARK_VARIANT_SWITCH_VISIT
       DECLTYPE_AUTO_RETURN(
           base::dispatcher<true,
@@ -1597,7 +1600,7 @@ struct alt {
 
           template <typename Visitor, typename... Vs>
           inline static constexpr DECLTYPE_AUTO
-      visit_alt_at(std::size_t index, Visitor &&visitor, Vs &&... vs)
+      visit_alt_at(std::size_t index, Visitor &&visitor, Vs &&...vs)
 #ifdef MPARK_VARIANT_SWITCH_VISIT
           DECLTYPE_AUTO_RETURN(
               base::dispatcher<
@@ -1640,7 +1643,7 @@ struct variant {
                   "`visit` requires the visitor to be exhaustive.");
 
     inline static constexpr DECLTYPE_AUTO invoke(Visitor &&visitor,
-                                                 Values &&... values)
+                                                 Values &&...values)
         DECLTYPE_AUTO_RETURN(lib::invoke(lib::forward<Visitor>(visitor),
                                          lib::forward<Values>(values)...))
   };
@@ -1650,7 +1653,7 @@ struct variant {
     Visitor &&visitor_;
 
     template <typename... Alts>
-    inline constexpr DECLTYPE_AUTO operator()(Alts &&... alts) const
+    inline constexpr DECLTYPE_AUTO operator()(Alts &&...alts) const
         DECLTYPE_AUTO_RETURN(visit_exhaustiveness_check<
                              Visitor,
                              decltype((lib::forward<Alts>(alts).value))...>::
@@ -1665,13 +1668,13 @@ struct variant {
           public
       : template <typename Visitor, typename... Vs>
         inline static constexpr DECLTYPE_AUTO
-        visit_alt(Visitor &&visitor, Vs &&... vs)
+        visit_alt(Visitor &&visitor, Vs &&...vs)
             DECLTYPE_AUTO_RETURN(alt::visit_alt(lib::forward<Visitor>(visitor),
                                                 lib::forward<Vs>(vs).impl_...))
 
                 template <typename Visitor, typename... Vs>
                 inline static constexpr DECLTYPE_AUTO
-        visit_alt_at(std::size_t index, Visitor &&visitor, Vs &&... vs)
+        visit_alt_at(std::size_t index, Visitor &&visitor, Vs &&...vs)
             DECLTYPE_AUTO_RETURN(
                 alt::visit_alt_at(index,
                                   lib::forward<Visitor>(visitor),
@@ -1679,13 +1682,13 @@ struct variant {
 
                 template <typename Visitor, typename... Vs>
                 inline static constexpr DECLTYPE_AUTO
-        visit_value(Visitor &&visitor, Vs &&... vs) DECLTYPE_AUTO_RETURN(
+        visit_value(Visitor &&visitor, Vs &&...vs) DECLTYPE_AUTO_RETURN(
             visit_alt(make_value_visitor(lib::forward<Visitor>(visitor)),
                       lib::forward<Vs>(vs)...))
 
             template <typename Visitor, typename... Vs>
             inline static constexpr DECLTYPE_AUTO
-        visit_value_at(std::size_t index, Visitor &&visitor, Vs &&... vs)
+        visit_value_at(std::size_t index, Visitor &&visitor, Vs &&...vs)
             DECLTYPE_AUTO_RETURN(
                 visit_alt_at(index,
                              make_value_visitor(lib::forward<Visitor>(visitor)),
@@ -1703,7 +1706,7 @@ struct alt {
 #pragma warning(disable : 4244)
 #endif
   template <typename... Args>
-  inline explicit constexpr alt(in_place_t, Args &&... args)
+  inline explicit constexpr alt(in_place_t, Args &&...args)
       : value(lib::forward<Args>(args)...) {}
 #ifdef _MSC_VER
 #pragma warning(pop)
@@ -1727,12 +1730,12 @@ union recursive_union<DestructibleTrait, Index> {};
                                                                            \
     template <typename... Args>                                            \
     inline explicit constexpr recursive_union(in_place_index_t<0>,         \
-                                              Args &&... args)             \
+                                              Args &&...args)              \
         : head_(in_place_t{}, lib::forward<Args>(args)...) {}              \
                                                                            \
     template <std::size_t I, typename... Args>                             \
     inline explicit constexpr recursive_union(in_place_index_t<I>,         \
-                                              Args &&... args)             \
+                                              Args &&...args)              \
         : tail_(in_place_index_t<I - 1>{}, lib::forward<Args>(args)...) {} \
                                                                            \
     recursive_union(const recursive_union &) = default;                    \
@@ -1765,11 +1768,10 @@ template <Trait DestructibleTrait, typename... Ts>
 class base {
  public:
   inline explicit constexpr base(valueless_t tag) noexcept
-      : data_(tag),
-        index_(static_cast<index_t>(-1)) {}
+      : data_(tag), index_(static_cast<index_t>(-1)) {}
 
   template <std::size_t I, typename... Args>
-  inline explicit constexpr base(in_place_index_t<I>, Args &&... args)
+  inline explicit constexpr base(in_place_index_t<I>, Args &&...args)
       : data_(in_place_index_t<I>{}, lib::forward<Args>(args)...), index_(I) {}
 
   inline constexpr bool valueless_by_exception() const noexcept {
@@ -1823,9 +1825,9 @@ struct dtor {
 #if !defined(_MSC_VER) || _MSC_VER >= 1910
 #define MPARK_INHERITING_CTOR(type, base) using base::base;
 #else
-#define MPARK_INHERITING_CTOR(type, base)         \
-  template <typename... Args>                     \
-  inline explicit constexpr type(Args &&... args) \
+#define MPARK_INHERITING_CTOR(type, base)        \
+  template <typename... Args>                    \
+  inline explicit constexpr type(Args &&...args) \
       : base(lib::forward<Args>(args)...) {}
 #endif
 
@@ -1851,19 +1853,21 @@ class destructor;
     destroy                                                               \
   }
 
-MPARK_VARIANT_DESTRUCTOR(Trait::TriviallyAvailable, ~destructor() = default;
-                         , inline void destroy() noexcept {
-                           this->index_ = static_cast<index_t>(-1);
-                         });
+MPARK_VARIANT_DESTRUCTOR(
+    Trait::TriviallyAvailable, ~destructor() = default;
+    , inline void destroy() noexcept {
+      this->index_ = static_cast<index_t>(-1);
+    });
 
-MPARK_VARIANT_DESTRUCTOR(Trait::Available,
-                         ~destructor() { destroy(); },
-                         inline void destroy() noexcept {
-                           if (!this->valueless_by_exception()) {
-                             visitation::alt::visit_alt(dtor{}, *this);
-                           }
-                           this->index_ = static_cast<index_t>(-1);
-                         });
+MPARK_VARIANT_DESTRUCTOR(
+    Trait::Available,
+    ~destructor() { destroy(); },
+    inline void destroy() noexcept {
+      if (!this->valueless_by_exception()) {
+        visitation::alt::visit_alt(dtor{}, *this);
+      }
+      this->index_ = static_cast<index_t>(-1);
+    });
 
 MPARK_VARIANT_DESTRUCTOR(Trait::Unavailable, ~destructor() = delete;
                          , inline void destroy() noexcept = delete;);
@@ -1889,7 +1893,7 @@ class constructor : public destructor<Traits> {
 #endif
 
   template <std::size_t I, typename T, typename... Args>
-  inline static T &construct_alt(alt<I, T> &a, Args &&... args) {
+  inline static T &construct_alt(alt<I, T> &a, Args &&...args) {
     auto *result = ::new (static_cast<void *>(lib::addressof(a)))
         alt<I, T>(in_place_t{}, lib::forward<Args>(args)...);
     return result->value;
@@ -1976,11 +1980,11 @@ MPARK_VARIANT_COPY_CONSTRUCTOR(
     Trait::TriviallyAvailable,
     copy_constructor(const copy_constructor &that) = default;);
 
-MPARK_VARIANT_COPY_CONSTRUCTOR(Trait::Available,
-                               copy_constructor(const copy_constructor &that)
-                               : copy_constructor(valueless_t{}) {
-                                 this->generic_construct(*this, that);
-                               });
+MPARK_VARIANT_COPY_CONSTRUCTOR(
+    Trait::Available, copy_constructor(const copy_constructor &that)
+    : copy_constructor(valueless_t{}) {
+      this->generic_construct(*this, that);
+    });
 
 MPARK_VARIANT_COPY_CONSTRUCTOR(
     Trait::Unavailable, copy_constructor(const copy_constructor &) = delete;);
@@ -1996,7 +2000,7 @@ class assignment : public copy_constructor<Traits> {
   using super::operator=;
 
   template <std::size_t I, typename... Args>
-  inline /* auto & */ auto emplace(Args &&... args)
+  inline /* auto & */ auto emplace(Args &&...args)
       -> decltype(this->construct_alt(access::base::get_alt<I>(*this),
                                       lib::forward<Args>(args)...)) {
     this->destroy();
@@ -2161,18 +2165,19 @@ class impl : public copy_assignment<traits<Ts...>> {
     if (this->valueless_by_exception() && that.valueless_by_exception()) {
       // do nothing.
     } else if (this->index() == that.index()) {
-      visitation::alt::visit_alt_at(this->index(),
+      visitation::alt::visit_alt_at(
+          this->index(),
 #ifdef MPARK_GENERIC_LAMBDAS
-                                    [](auto &this_alt, auto &that_alt) {
-                                      using std::swap;
-                                      swap(this_alt.value, that_alt.value);
-                                    }
+          [](auto &this_alt, auto &that_alt) {
+            using std::swap;
+            swap(this_alt.value, that_alt.value);
+          }
 #else
-                                    swapper {}
+          swapper {}
 #endif
-                                    ,
-                                    *this,
-                                    that);
+          ,
+          *this,
+          that);
     } else {
       impl *lhs = this;
       impl *rhs = lib::addressof(that);
@@ -2275,7 +2280,7 @@ struct is_in_place_type : std::false_type {};
 template <typename T>
 struct is_in_place_type<in_place_type_t<T>> : std::true_type {};
 
-}  // detail
+}  // namespace detail
 
 template <typename... Ts>
 class variant {
@@ -2321,8 +2326,7 @@ class variant {
             lib::enable_if_t<std::is_constructible<T, Args...>::value, int> = 0>
   inline explicit constexpr variant(
       in_place_index_t<I>,
-      Args
-          &&... args) noexcept(std::is_nothrow_constructible<T, Args...>::value)
+      Args &&...args) noexcept(std::is_nothrow_constructible<T, Args...>::value)
       : impl_(in_place_index_t<I>{}, lib::forward<Args>(args)...) {}
 
   template <
@@ -2336,11 +2340,11 @@ class variant {
   inline explicit constexpr variant(
       in_place_index_t<I>,
       std::initializer_list<Up> il,
-      Args &&... args) noexcept(std::
-                                    is_nothrow_constructible<
-                                        T,
-                                        std::initializer_list<Up> &,
-                                        Args...>::value)
+      Args &&...args) noexcept(std::
+                                   is_nothrow_constructible<
+                                       T,
+                                       std::initializer_list<Up> &,
+                                       Args...>::value)
       : impl_(in_place_index_t<I>{}, il, lib::forward<Args>(args)...) {}
 
   template <typename T,
@@ -2349,8 +2353,7 @@ class variant {
             lib::enable_if_t<std::is_constructible<T, Args...>::value, int> = 0>
   inline explicit constexpr variant(
       in_place_type_t<T>,
-      Args
-          &&... args) noexcept(std::is_nothrow_constructible<T, Args...>::value)
+      Args &&...args) noexcept(std::is_nothrow_constructible<T, Args...>::value)
       : impl_(in_place_index_t<I>{}, lib::forward<Args>(args)...) {}
 
   template <
@@ -2364,11 +2367,11 @@ class variant {
   inline explicit constexpr variant(
       in_place_type_t<T>,
       std::initializer_list<Up> il,
-      Args &&... args) noexcept(std::
-                                    is_nothrow_constructible<
-                                        T,
-                                        std::initializer_list<Up> &,
-                                        Args...>::value)
+      Args &&...args) noexcept(std::
+                                   is_nothrow_constructible<
+                                       T,
+                                       std::initializer_list<Up> &,
+                                       Args...>::value)
       : impl_(in_place_index_t<I>{}, il, lib::forward<Args>(args)...) {}
 
   ~variant() = default;
@@ -2395,7 +2398,7 @@ class variant {
             typename... Args,
             typename T = lib::type_pack_element_t<I, Ts...>,
             lib::enable_if_t<std::is_constructible<T, Args...>::value, int> = 0>
-  inline T &emplace(Args &&... args) {
+  inline T &emplace(Args &&...args) {
     return impl_.template emplace<I>(lib::forward<Args>(args)...);
   }
 
@@ -2407,7 +2410,7 @@ class variant {
       lib::enable_if_t<
           std::is_constructible<T, std::initializer_list<Up> &, Args...>::value,
           int> = 0>
-  inline T &emplace(std::initializer_list<Up> il, Args &&... args) {
+  inline T &emplace(std::initializer_list<Up> il, Args &&...args) {
     return impl_.template emplace<I>(il, lib::forward<Args>(args)...);
   }
 
@@ -2415,7 +2418,7 @@ class variant {
             typename... Args,
             std::size_t I = detail::find_index_sfinae<T, Ts...>::value,
             lib::enable_if_t<std::is_constructible<T, Args...>::value, int> = 0>
-  inline T &emplace(Args &&... args) {
+  inline T &emplace(Args &&...args) {
     return impl_.template emplace<I>(lib::forward<Args>(args)...);
   }
 
@@ -2427,7 +2430,7 @@ class variant {
       lib::enable_if_t<
           std::is_constructible<T, std::initializer_list<Up> &, Args...>::value,
           int> = 0>
-  inline T &emplace(std::initializer_list<Up> il, Args &&... args) {
+  inline T &emplace(std::initializer_list<Up> il, Args &&...args) {
     return impl_.template emplace<I>(il, lib::forward<Args>(args)...);
   }
 
@@ -2534,10 +2537,10 @@ inline constexpr const T &&get(const variant<Ts...> &&v) {
 namespace detail {
 
 template <std::size_t I, typename V>
-inline constexpr /* auto * */ AUTO generic_get_if(V *v) noexcept AUTO_RETURN(
-    v &&holds_alternative<I>(*v)
-        ? lib::addressof(access::variant::get_alt<I>(*v).value)
-        : nullptr)
+inline constexpr /* auto * */ AUTO generic_get_if(V *v) noexcept
+    AUTO_RETURN(v &&holds_alternative<I>(*v)
+                    ? lib::addressof(access::variant::get_alt<I>(*v).value)
+                    : nullptr)
 
 }  // namespace detail
 
@@ -2720,7 +2723,7 @@ inline constexpr bool all(std::initializer_list<bool> bs) {
 }  // namespace detail
 
 template <typename Visitor, typename... Vs>
-inline constexpr decltype(auto) visit(Visitor &&visitor, Vs &&... vs) {
+inline constexpr decltype(auto) visit(Visitor &&visitor, Vs &&...vs) {
   return (detail::all(
               lib::array<bool, sizeof...(Vs)>{!vs.valueless_by_exception()...})
               ? (void)0
@@ -2744,7 +2747,7 @@ inline constexpr bool all(const lib::array<bool, N> &bs) {
 }  // namespace detail
 
 template <typename Visitor, typename... Vs>
-inline constexpr DECLTYPE_AUTO visit(Visitor &&visitor, Vs &&... vs)
+inline constexpr DECLTYPE_AUTO visit(Visitor &&visitor, Vs &&...vs)
     DECLTYPE_AUTO_RETURN(
         (detail::all(lib::array<bool, sizeof...(Vs)>{
              {!vs.valueless_by_exception()...}})
diff --git a/paddle/utils/variant_test.cc b/paddle/utils/variant_test.cc
index e690269d801c1..ef4a6cf8cd89c 100644
--- a/paddle/utils/variant_test.cc
+++ b/paddle/utils/variant_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/utils/variant.h"
+
 #include "gtest/gtest.h"
 #include "paddle/phi/core/enforce.h"
 
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index fdcd560658146..0c1089b1fd440 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,23 +1,21 @@
 file(GLOB UTILS_PY_FILES . ./paddle/legacy/utils/*.py)
 file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py)
-set(PY_FILES paddle/__init__.py
-  ${UTILS_PY_FILES}
-  ${FLUID_PY_FILES})
+set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES})
 
 if(WITH_GPU)
-  SET(PACKAGE_NAME "paddlepaddle-gpu")
+  set(PACKAGE_NAME "paddlepaddle-gpu")
 elseif(WITH_MLU)
-  SET(PACKAGE_NAME "paddlepaddle-mlu")
+  set(PACKAGE_NAME "paddlepaddle-mlu")
 elseif(WITH_ROCM)
-  SET(PACKAGE_NAME "paddlepaddle-rocm")
+  set(PACKAGE_NAME "paddlepaddle-rocm")
 elseif(WITH_ASCEND_CL)
-  SET(PACKAGE_NAME "paddlepaddle-npu")
+  set(PACKAGE_NAME "paddlepaddle-npu")
 elseif(WITH_XPU)
-  SET(PACKAGE_NAME "paddlepaddle-xpu")
+  set(PACKAGE_NAME "paddlepaddle-xpu")
 elseif(WITH_IPU)
-  SET(PACKAGE_NAME "paddlepaddle-ipu")
+  set(PACKAGE_NAME "paddlepaddle-ipu")
 else()
-  SET(PACKAGE_NAME "paddlepaddle")
+  set(PACKAGE_NAME "paddlepaddle")
 endif()
 
 set(SETUP_LOG_FILE "setup.py.log")
@@ -26,7 +24,9 @@ set(FLUID_CORE_NAME "core")
 if(WITH_AVX AND AVX_FOUND)
   set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_avx")
   if(NOT DEFINED NOAVX_CORE_FILE OR NOAVX_CORE_FILE STREQUAL "")
-    message(STATUS "MESSAGE: This is just a message for publishing release.
+    message(
+      STATUS
+        "MESSAGE: This is just a message for publishing release.
       You are building AVX version without NOAVX core.
       So the wheel package may fail on NOAVX machine.
       You can add -DNOAVX_CORE_FILE=/path/to/your/core_noavx.* in cmake command
@@ -44,29 +44,31 @@ else()
 endif()
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
-    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
+               ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/)
 
-IF(WIN32)
-    # Python would use the .pyd by default under Windows series platform
-    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.pyd)
-    set(FLUID_CORE_LIB ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.lib)
-    
-    add_custom_command(OUTPUT ${FLUID_CORE}
-      COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-      COMMAND cmake -E copy $<TARGET_LINKER_FILE:paddle_pybind> ${FLUID_CORE_LIB}
-      DEPENDS paddle_pybind)
-
-    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)  
-ELSE()
-    set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so)
-    add_custom_command(OUTPUT ${FLUID_CORE}
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
-        DEPENDS paddle_pybind)
-
-    set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so)
-ENDIF()
+if(WIN32)
+  # Python would use the .pyd by default under Windows series platform
+  set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.pyd)
+  set(FLUID_CORE_LIB ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.lib)
+
+  add_custom_command(
+    OUTPUT ${FLUID_CORE}
+    COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+    COMMAND cmake -E copy $<TARGET_LINKER_FILE:paddle_pybind> ${FLUID_CORE_LIB}
+    DEPENDS paddle_pybind)
+
+  set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd)
+else()
+  set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so)
+  add_custom_command(
+    OUTPUT ${FLUID_CORE}
+    COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
+    DEPENDS paddle_pybind)
+
+  set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so)
+endif()
 
 set(FLUID_CORE_DEPS ${FLUID_CORE})
 
@@ -75,40 +77,55 @@ if(HAS_NOAVX_CORE AND EXISTS "${NOAVX_CORE_FILE}")
   get_filename_component(NOAVX_CORE_EXT ${NOAVX_CORE_FILE} EXT)
   if(WIN32)
     if(NOT NOAVX_CORE_EXT STREQUAL ".pyd")
-      message(FATAL_ERROR "Wrong file ${NOAVX_CORE_NAME}, the ext does not match windows *.pyd!")
+      message(
+        FATAL_ERROR
+          "Wrong file ${NOAVX_CORE_NAME}, the ext does not match windows *.pyd!"
+      )
     endif()
   else()
     if(NOT NOAVX_CORE_EXT STREQUAL ".so")
-      message(FATAL_ERROR "Wrong file ${NOAVX_CORE_NAME}, the ext does not match *.so!")
+      message(
+        FATAL_ERROR
+          "Wrong file ${NOAVX_CORE_NAME}, the ext does not match *.so!")
     endif()
   endif()
-  add_custom_command(OUTPUT ${FLUID_NOAVX_CORE}
-    COMMAND cmake -E copy ${NOAVX_CORE_FILE} ${FLUID_NOAVX_CORE} DEPENDS paddle_pybind)
+  add_custom_command(
+    OUTPUT ${FLUID_NOAVX_CORE}
+    COMMAND cmake -E copy ${NOAVX_CORE_FILE} ${FLUID_NOAVX_CORE}
+    DEPENDS paddle_pybind)
   list(APPEND FLUID_CORE_DEPS ${FLUID_NOAVX_CORE})
 endif()
 
 add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS})
 
-IF(WIN32)
-  add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/
-    COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+if(WIN32)
+  add_custom_command(
+    OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND
+      ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle
+      ${PADDLE_BINARY_DIR}/python/paddle/
+    COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py
+            bdist_wheel
     COMMENT "Packing whl packages------>>>"
-    DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto pass_desc_py_proto ${PY_FILES})
-ELSE(WIN32)
-  add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto
+            profiler_py_proto pass_desc_py_proto ${PY_FILES})
+else(WIN32)
+  add_custom_command(
+    OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND touch stub.cc
     COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMENT "Packing whl packages------>>>"
-    DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto pass_desc_py_proto ${PY_FILES})
-ENDIF()
+    DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto
+            profiler_py_proto pass_desc_py_proto ${PY_FILES})
+endif()
 
-add_custom_target(paddle_python ALL DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
+add_custom_target(paddle_python ALL
+                  DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
-if (WITH_TESTING)
+if(WITH_TESTING)
   add_subdirectory(paddle/reader/tests)
   add_subdirectory(paddle/dataset/tests)
   add_subdirectory(paddle/tests)
@@ -117,8 +134,7 @@ if (WITH_TESTING)
   add_subdirectory(paddle/fluid/contrib/slim/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
-    DESTINATION opt/paddle/share/wheels
-)
+        DESTINATION opt/paddle/share/wheels)
 
 if(APPLE)
   find_program(INSTALL_NAME_TOOL_EXECUTABLE install_name_tool)
@@ -126,10 +142,13 @@ if(APPLE)
     message(FATAL_ERROR "install_name_tool not found, please check.\n")
   endif()
 endif()
-if(LINUX AND NOT WITH_SW AND NOT WITH_ARM)
+if(LINUX
+   AND NOT WITH_SW
+   AND NOT WITH_ARM)
   find_program(PATCHELF_EXECUTABLE patchelf)
   if(NOT PATCHELF_EXECUTABLE)
-    message(FATAL_ERROR "patchelf not found, please install it.\n"
-            "For Ubuntu, the command is: apt-get install -y patchelf.")
+    message(
+      FATAL_ERROR "patchelf not found, please install it.\n"
+                  "For Ubuntu, the command is: apt-get install -y patchelf.")
   endif()
 endif()
diff --git a/python/paddle/_C_ops.py b/python/paddle/_C_ops.py
index 2bcaa5478e574..e8f89c739c953 100644
--- a/python/paddle/_C_ops.py
+++ b/python/paddle/_C_ops.py
@@ -14,6 +14,7 @@
 
 from paddle.fluid import core
 from .fluid import framework
+
 __all__ = []
 
 _already_switch_to_eager_ = False
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 930918e967eed..75ec75cc43100 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -24,6 +24,7 @@
 from .batch import batch  # noqa: F401
 from .framework import monkey_patch_variable
 from .framework import monkey_patch_math_varbase
+
 monkey_patch_variable()
 monkey_patch_math_varbase()
 
@@ -52,6 +53,7 @@
     Tensor = framework.core.eager.Tensor
 else:
     from .framework import VarBase as Tensor  # noqa: F401
+
 Tensor.__qualname__ = 'Tensor'  # noqa: F401
 import paddle.compat  # noqa: F401
 import paddle.distributed  # noqa: F401
@@ -75,7 +77,6 @@
 import paddle.reader  # noqa: F401
 import paddle.static  # noqa: F401
 import paddle.vision  # noqa: F401
-import paddle.sparse  # noqa: F401
 
 from .tensor.attribute import is_complex  # noqa: F401
 from .tensor.attribute import is_integer  # noqa: F401
@@ -373,272 +374,272 @@
 disable_static()
 
 __all__ = [  # noqa
-           'dtype',
-           'uint8',
-           'int8',
-           'int16',
-           'int32',
-           'int64',
-           'float16',
-           'float32',
-           'float64',
-           'bfloat16',
-           'bool',
-           'complex64',
-           'complex128',
-           'addmm',
-           'allclose',
-           'isclose',
-           't',
-           'add',
-           'subtract',
-           'diag',
-           'diagflat',
-           'isnan',
-           'scatter_nd_add',
-           'unstack',
-           'get_default_dtype',
-           'save',
-           'multinomial',
-           'get_cuda_rng_state',
-           'rank',
-           'empty_like',
-           'eye',
-           'cumsum',
-           'cumprod',
-           'logit',
-           'sign',
-           'is_empty',
-           'equal',
-           'equal_all',
-           'is_tensor',
-           'is_complex',
-           'is_integer',
-           'cross',
-           'where',
-           'log1p',
-           'cos',
-           'tan',
-           'mean',
-           'mode',
-           'mv',
-           'in_dynamic_mode',
-           'min',
-           'amin',
-           'any',
-           'slice',
-           'normal',
-           'logsumexp',
-           'full',
-           'unsqueeze',
-           'unsqueeze_',
-           'argmax',
-           'Model',
-           'summary',
-           'flops',
-           'sort',
-           'searchsorted',
-           'split',
-           'logical_and',
-           'full_like',
-           'less_than',
-           'kron',
-           'clip',
-           'Tensor',
-           'crop',
-           'ParamAttr',
-           'stanh',
-           'randint',
-           'randint_like',
-           'assign',
-           'gather',
-           'scale',
-           'zeros',
-           'rsqrt',
-           'squeeze',
-           'squeeze_',
-           'to_tensor',
-           'gather_nd',
-           'isinf',
-           'uniform',
-           'floor_divide',
-           'remainder',
-           'floor_mod',
-           'roll',
-           'batch',
-           'max',
-           'amax',
-           'logical_or',
-           'bitwise_and',
-           'bitwise_or',
-           'bitwise_xor',
-           'bitwise_not',
-           'mm',
-           'flip',
-           'rot90',
-           'bincount',
-           'histogram',
-           'multiplex',
-           'CUDAPlace',
-           'NPUPlace',
-           'empty',
-           'shape',
-           'real',
-           'imag',
-           'is_floating_point',
-           'complex',
-           'reciprocal',
-           'rand',
-           'less_equal',
-           'triu',
-           'sin',
-           'dist',
-           'unbind',
-           'meshgrid',
-           'arange',
-           'load',
-           'numel',
-           'median',
-           'nanmedian',
-           'quantile',
-           'nanquantile',
-           'no_grad',
-           'set_grad_enabled',
-           'is_grad_enabled',
-           'mod',
-           'abs',
-           'tril',
-           'pow',
-           'zeros_like',
-           'maximum',
-           'topk',
-           'index_select',
-           'CPUPlace',
-           'matmul',
-           'seed',
-           'acos',
-           'logical_xor',
-           'exp',
-           'expm1',
-           'bernoulli',
-           'poisson',
-           'sinh',
-           'round',
-           'DataParallel',
-           'argmin',
-           'prod',
-           'broadcast_shape',
-           'conj',
-           'neg',
-           'lgamma',
-           'lerp',
-           'erfinv',
-           'inner',
-           'outer',
-           'square',
-           'divide',
-           'ceil',
-           'atan',
-           'atan2',
-           'rad2deg',
-           'deg2rad',
-           'gcd',
-           'lcm',
-           'expand',
-           'broadcast_to',
-           'ones_like',
-           'index_sample',
-           'cast',
-           'grad',
-           'all',
-           'ones',
-           'not_equal',
-           'sum',
-           'nansum',
-           'nanmean',
-           'tile',
-           'greater_equal',
-           'isfinite',
-           'create_parameter',
-           'dot',
-           'increment',
-           'erf',
-           'bmm',
-           'chunk',
-           'tolist',
-           'tensordot',
-           'greater_than',
-           'shard_index',
-           'argsort',
-           'tanh',
-           'tanh_',
-           'transpose',
-           'randn',
-           'strided_slice',
-           'unique',
-           'unique_consecutive',
-           'set_cuda_rng_state',
-           'set_printoptions',
-           'std',
-           'flatten',
-           'asin',
-           'multiply',
-           'disable_static',
-           'masked_select',
-           'var',
-           'trace',
-           'enable_static',
-           'scatter_nd',
-           'set_default_dtype',
-           'disable_signal_handler',
-           'expand_as',
-           'stack',
-           'sqrt',
-           'randperm',
-           'linspace',
-           'logspace',
-           'reshape',
-           'reshape_',
-           'reverse',
-           'nonzero',
-           'CUDAPinnedPlace',
-           'logical_not',
-           'add_n',
-           'minimum',
-           'scatter',
-           'scatter_',
-           'floor',
-           'cosh',
-           'log',
-           'log2',
-           'log10',
-           'concat',
-           'check_shape',
-           'trunc',
-           'frac',
-           'digamma',
-           'standard_normal',
-           'diagonal',
-           'broadcast_tensors',
-           'einsum',
-           'set_flags',
-           'get_flags',
-           'asinh',
-           'acosh',
-           'atanh',
-           'as_complex',
-           'as_real',
-           'diff',
-           'angle',
-           'fmax',
-           'fmin',
-           'moveaxis',
-           'repeat_interleave',
-           'clone',
-           'kthvalue',
-           'renorm',
-           'take_along_axis',
-           'put_along_axis',
-           'heaviside',
-           'tril_indices',
+    'dtype',
+    'uint8',
+    'int8',
+    'int16',
+    'int32',
+    'int64',
+    'float16',
+    'float32',
+    'float64',
+    'bfloat16',
+    'bool',
+    'complex64',
+    'complex128',
+    'addmm',
+    'allclose',
+    'isclose',
+    't',
+    'add',
+    'subtract',
+    'diag',
+    'diagflat',
+    'isnan',
+    'scatter_nd_add',
+    'unstack',
+    'get_default_dtype',
+    'save',
+    'multinomial',
+    'get_cuda_rng_state',
+    'rank',
+    'empty_like',
+    'eye',
+    'cumsum',
+    'cumprod',
+    'logit',
+    'sign',
+    'is_empty',
+    'equal',
+    'equal_all',
+    'is_tensor',
+    'is_complex',
+    'is_integer',
+    'cross',
+    'where',
+    'log1p',
+    'cos',
+    'tan',
+    'mean',
+    'mode',
+    'mv',
+    'in_dynamic_mode',
+    'min',
+    'amin',
+    'any',
+    'slice',
+    'normal',
+    'logsumexp',
+    'full',
+    'unsqueeze',
+    'unsqueeze_',
+    'argmax',
+    'Model',
+    'summary',
+    'flops',
+    'sort',
+    'searchsorted',
+    'split',
+    'logical_and',
+    'full_like',
+    'less_than',
+    'kron',
+    'clip',
+    'Tensor',
+    'crop',
+    'ParamAttr',
+    'stanh',
+    'randint',
+    'randint_like',
+    'assign',
+    'gather',
+    'scale',
+    'zeros',
+    'rsqrt',
+    'squeeze',
+    'squeeze_',
+    'to_tensor',
+    'gather_nd',
+    'isinf',
+    'uniform',
+    'floor_divide',
+    'remainder',
+    'floor_mod',
+    'roll',
+    'batch',
+    'max',
+    'amax',
+    'logical_or',
+    'bitwise_and',
+    'bitwise_or',
+    'bitwise_xor',
+    'bitwise_not',
+    'mm',
+    'flip',
+    'rot90',
+    'bincount',
+    'histogram',
+    'multiplex',
+    'CUDAPlace',
+    'NPUPlace',
+    'empty',
+    'shape',
+    'real',
+    'imag',
+    'is_floating_point',
+    'complex',
+    'reciprocal',
+    'rand',
+    'less_equal',
+    'triu',
+    'sin',
+    'dist',
+    'unbind',
+    'meshgrid',
+    'arange',
+    'load',
+    'numel',
+    'median',
+    'nanmedian',
+    'quantile',
+    'nanquantile',
+    'no_grad',
+    'set_grad_enabled',
+    'is_grad_enabled',
+    'mod',
+    'abs',
+    'tril',
+    'pow',
+    'zeros_like',
+    'maximum',
+    'topk',
+    'index_select',
+    'CPUPlace',
+    'matmul',
+    'seed',
+    'acos',
+    'logical_xor',
+    'exp',
+    'expm1',
+    'bernoulli',
+    'poisson',
+    'sinh',
+    'round',
+    'DataParallel',
+    'argmin',
+    'prod',
+    'broadcast_shape',
+    'conj',
+    'neg',
+    'lgamma',
+    'lerp',
+    'erfinv',
+    'inner',
+    'outer',
+    'square',
+    'divide',
+    'ceil',
+    'atan',
+    'atan2',
+    'rad2deg',
+    'deg2rad',
+    'gcd',
+    'lcm',
+    'expand',
+    'broadcast_to',
+    'ones_like',
+    'index_sample',
+    'cast',
+    'grad',
+    'all',
+    'ones',
+    'not_equal',
+    'sum',
+    'nansum',
+    'nanmean',
+    'tile',
+    'greater_equal',
+    'isfinite',
+    'create_parameter',
+    'dot',
+    'increment',
+    'erf',
+    'bmm',
+    'chunk',
+    'tolist',
+    'tensordot',
+    'greater_than',
+    'shard_index',
+    'argsort',
+    'tanh',
+    'tanh_',
+    'transpose',
+    'randn',
+    'strided_slice',
+    'unique',
+    'unique_consecutive',
+    'set_cuda_rng_state',
+    'set_printoptions',
+    'std',
+    'flatten',
+    'asin',
+    'multiply',
+    'disable_static',
+    'masked_select',
+    'var',
+    'trace',
+    'enable_static',
+    'scatter_nd',
+    'set_default_dtype',
+    'disable_signal_handler',
+    'expand_as',
+    'stack',
+    'sqrt',
+    'randperm',
+    'linspace',
+    'logspace',
+    'reshape',
+    'reshape_',
+    'reverse',
+    'nonzero',
+    'CUDAPinnedPlace',
+    'logical_not',
+    'add_n',
+    'minimum',
+    'scatter',
+    'scatter_',
+    'floor',
+    'cosh',
+    'log',
+    'log2',
+    'log10',
+    'concat',
+    'check_shape',
+    'trunc',
+    'frac',
+    'digamma',
+    'standard_normal',
+    'diagonal',
+    'broadcast_tensors',
+    'einsum',
+    'set_flags',
+    'get_flags',
+    'asinh',
+    'acosh',
+    'atanh',
+    'as_complex',
+    'as_real',
+    'diff',
+    'angle',
+    'fmax',
+    'fmin',
+    'moveaxis',
+    'repeat_interleave',
+    'clone',
+    'kthvalue',
+    'renorm',
+    'take_along_axis',
+    'put_along_axis',
+    'heaviside',
+    'tril_indices',
 ]
diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
index ca08ce196a983..46582b1770b46 100644
--- a/python/paddle/amp/grad_scaler.py
+++ b/python/paddle/amp/grad_scaler.py
@@ -83,10 +83,10 @@ def __init__(self,
                  incr_every_n_steps=1000,
                  decr_every_n_nan_or_inf=2,
                  use_dynamic_loss_scaling=True):
-        super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio,
-                                         decr_ratio, incr_every_n_steps,
-                                         decr_every_n_nan_or_inf,
-                                         use_dynamic_loss_scaling)
+        super(GradScaler,
+              self).__init__(enable, init_loss_scaling, incr_ratio, decr_ratio,
+                             incr_every_n_steps, decr_every_n_nan_or_inf,
+                             use_dynamic_loss_scaling)
 
     def scale(self, var):
         """
diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py
index f36cdafa46491..d2c2beadf386b 100644
--- a/python/paddle/autograd/backward_mode.py
+++ b/python/paddle/autograd/backward_mode.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,6 +16,7 @@
 from paddle.fluid import framework
 from paddle.fluid.backward import gradients_with_optimizer
 import paddle
+
 __all__ = []
 
 
@@ -81,14 +82,16 @@ def check_tensors(in_out_list, name):
         if isinstance(in_out_list, (list, tuple)):
             assert len(in_out_list) > 0, "{} connot be empyt".format(name)
             for each_var in in_out_list:
-                assert isinstance(each_var, (
-                    paddle.Tensor, core.eager.Tensor
-                )), "Elements of {} must be paddle.Tensor".format(name)
+                assert isinstance(
+                    each_var,
+                    (paddle.Tensor, core.eager.Tensor
+                     )), "Elements of {} must be paddle.Tensor".format(name)
             return in_out_list
         else:
-            assert isinstance(in_out_list, (
-                paddle.Tensor, core.eager.Tensor
-            )), "{} must be Tensor or list of Tensor".format(name)
+            assert isinstance(
+                in_out_list,
+                (paddle.Tensor, core.eager.Tensor
+                 )), "{} must be Tensor or list of Tensor".format(name)
             return [in_out_list]
 
     tensors = check_tensors(tensors, "tensors")
diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py
index 93142c9112fd0..8dda4811d1b26 100644
--- a/python/paddle/autograd/functional.py
+++ b/python/paddle/autograd/functional.py
@@ -139,7 +139,7 @@ def _double_backward_trick(ys, xs, v):
     """Double backward trick for computing ``jvp`` by ``vjp``
     see details: https://j-towns.github.io/2017/06/12/A-new-trick.html
     """
-    # The value of ys_grad is not important, it can be any random value in 
+    # The value of ys_grad is not important, it can be any random value in
     # theory, but it's required to set stop_gradient=False.
     ys_grad = _zeros_like_with_grad(ys)
     xs_grad = _grad(ys, xs, ys_grad)
@@ -302,10 +302,11 @@ def reducer(x):
     """
 
     def __init__(self, func, xs, is_batched=False):
+
         def _jac_func(*xs):
             jac = Jacobian(func, xs, is_batched=is_batched)
-            if (is_batched and jac.shape[1] != 1) or (not is_batched and
-                                                      jac.shape[0] != 1):
+            if (is_batched and jac.shape[1] != 1) or (not is_batched
+                                                      and jac.shape[0] != 1):
                 raise RuntimeError(
                     "The function given to Hessian shoud return as single element Tensor or batched single element Tensor."
                 )
@@ -362,18 +363,18 @@ def _lazy_axis(self):
 
     def _lazy_indexes(self, indexes):
         idx = indexes[self._lazy_axis]
-        return (idx, ) if isinstance(
-            idx, int) else tuple(range(idx.start, idx.stop, idx.step))
+        return (idx, ) if isinstance(idx, int) else tuple(
+            range(idx.start, idx.stop, idx.step))
 
     def _flatten(self, xs):
         raise NotImplementedError
 
     def _shifted_indexes(self, indexes, lazy_axis_size=0):
         idx = indexes[self._lazy_axis]
-        shifted_lazy_axis_idx = 0 if isinstance(
-            idx, int) else slice(0, lazy_axis_size, 1)
-        return indexes[:self._lazy_axis] + (shifted_lazy_axis_idx,
-                                            ) + indexes[self._lazy_axis + 1:]
+        shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice(
+            0, lazy_axis_size, 1)
+        return indexes[:self._lazy_axis] + (
+            shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:]
 
     def __getitem__(self, indexes):
         indexes = _multi_index(indexes, self.shape)
@@ -381,8 +382,8 @@ def __getitem__(self, indexes):
         if isinstance(indexes[self._lazy_axis], int):
             other_indexes = indexes[:self._lazy_axis] + \
                 indexes[self._lazy_axis+1:]
-            return self._cached_evaluate(indexes[self._lazy_axis])[
-                other_indexes]
+            return self._cached_evaluate(
+                indexes[self._lazy_axis])[other_indexes]
         lazy_indexes = self._lazy_indexes(indexes)
         part_jac = paddle.stack(
             [self._cached_evaluate(i) for i in lazy_indexes],
@@ -424,7 +425,8 @@ def _flatten(self, xs):
     def _evaluate(self, row_index):
         return self._flatten(_grad(
             self._flatten_ys[row_index],
-            self._xs, ))
+            self._xs,
+        ))
 
 
 class _JacobianBatchLast(_Jacobian):
@@ -508,8 +510,8 @@ def _multi_index(indexes, shape):
     positive_indexes = []
     for i, index in enumerate(indexes):
         if isinstance(index, slice):
-            index = slice(index.start or 0, index.stop or shape[i],
-                          index.step or 1)
+            index = slice(index.start or 0, index.stop or shape[i], index.step
+                          or 1)
             positive_indexes.append(
                 slice(
                     index.start + shape[i] if index.start < 0 else index.start,
@@ -530,9 +532,8 @@ def _as_tensors(xs):
 
 def _stack_tensor_or_return_none(origin_list):
     assert len(origin_list) > 0, "Can't not stack an empty list"
-    return paddle.stack(
-        origin_list, axis=0) if isinstance(
-            origin_list[0], paddle.fluid.framework.Variable) else None
+    return paddle.stack(origin_list, axis=0) if isinstance(
+        origin_list[0], paddle.fluid.framework.Variable) else None
 
 
 def _replace_none_with_zero_tensor(xs, refs):
@@ -809,23 +810,20 @@ def func(x, y):
     fin_size = len(inputs)
     fout_size = len(outputs)
     flat_outputs = tuple(
-        paddle.reshape(
-            output, shape=[-1]) for output in outputs)
+        paddle.reshape(output, shape=[-1]) for output in outputs)
     jacobian = tuple()
     for i, flat_output in enumerate(flat_outputs):
         jac_i = list([] for _ in range(fin_size))
         for k in range(len(flat_output)):
-            row_k = paddle.grad(
-                flat_output[k],
-                inputs,
-                create_graph=create_graph,
-                retain_graph=True,
-                allow_unused=allow_unused)
+            row_k = paddle.grad(flat_output[k],
+                                inputs,
+                                create_graph=create_graph,
+                                retain_graph=True,
+                                allow_unused=allow_unused)
             for j in range(fin_size):
                 jac_i[j].append(
-                    paddle.reshape(
-                        row_k[j], shape=[-1])
-                    if isinstance(row_k[j], paddle.Tensor) else None)
+                    paddle.reshape(row_k[j], shape=[-1]) if isinstance(
+                        row_k[j], paddle.Tensor) else None)
         jacobian += (tuple(
             _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), )
     if fin_size == 1 and fout_size == 1:
@@ -957,25 +955,22 @@ def func(x, y):
     fin_size = len(inputs)
     fout_size = len(outputs)
     flat_outputs = tuple(
-        paddle.reshape(
-            output, shape=[batch_size, -1]) for output in outputs)
+        paddle.reshape(output, shape=[batch_size, -1]) for output in outputs)
     jacobian = tuple()
     for i, flat_output in enumerate(flat_outputs):
         jac_i = list([] for _ in range(fin_size))
         for k in range(flat_output.shape[1]):
 
-            row_k = paddle.grad(
-                flat_output[:, k],
-                inputs,
-                create_graph=create_graph,
-                retain_graph=True,
-                allow_unused=allow_unused)
+            row_k = paddle.grad(flat_output[:, k],
+                                inputs,
+                                create_graph=create_graph,
+                                retain_graph=True,
+                                allow_unused=allow_unused)
 
             for j in range(fin_size):
                 jac_i[j].append(
-                    paddle.reshape(
-                        row_k[j], shape=[-1])
-                    if isinstance(row_k[j], paddle.Tensor) else None)
+                    paddle.reshape(row_k[j], shape=[-1]) if isinstance(
+                        row_k[j], paddle.Tensor) else None)
         jacobian += (tuple(
             _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), )
     if fin_size == 1 and fout_size == 1:
@@ -1119,18 +1114,19 @@ def func(x, y):
     ], "The function to compute batched Hessian matrix should return a Tensor of shape [batch_size, 1]"
 
     def jac_func(*ins):
-        grad_inputs = paddle.grad(
-            outputs,
-            ins,
-            create_graph=True,
-            retain_graph=True,
-            allow_unused=allow_unused)
+        grad_inputs = paddle.grad(outputs,
+                                  ins,
+                                  create_graph=True,
+                                  retain_graph=True,
+                                  allow_unused=allow_unused)
         return tuple(
             _replace_none_with_zero_tensor(grad_inputs[i], inputs[i])
             for i in range(len(inputs)))
 
-    return batch_jacobian(
-        jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused)
+    return batch_jacobian(jac_func,
+                          inputs,
+                          create_graph=create_graph,
+                          allow_unused=allow_unused)
 
 
 @framework.dygraph_only
@@ -1245,18 +1241,19 @@ def func(x, y):
     ], "The function to compute Hessian matrix should return a Tensor with a single element"
 
     def jac_func(*ins):
-        grad_inputs = paddle.grad(
-            outputs,
-            ins,
-            create_graph=True,
-            retain_graph=True,
-            allow_unused=allow_unused)
+        grad_inputs = paddle.grad(outputs,
+                                  ins,
+                                  create_graph=True,
+                                  retain_graph=True,
+                                  allow_unused=allow_unused)
         return tuple(
             _replace_none_with_zero_tensor(grad_inputs[i], inputs[i])
             for i in range(len(inputs)))
 
-    return jacobian(
-        jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused)
+    return jacobian(jac_func,
+                    inputs,
+                    create_graph=create_graph,
+                    allow_unused=allow_unused)
 
 
 def vhp(func, inputs, v=None, create_graph=False, allow_unused=False):
diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py
index 0fb90b334f8e5..64946268bd7c9 100644
--- a/python/paddle/autograd/py_layer.py
+++ b/python/paddle/autograd/py_layer.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 from paddle.fluid.dygraph.amp.auto_cast import amp_state
 from paddle.amp.auto_cast import auto_cast
 from paddle.fluid import core
+
 __all__ = []
 
 
@@ -123,7 +124,9 @@ def backward(ctx, dy):
 
 
 def with_mateclass(meta, *bases):
+
     class impl(meta):
+
         def __new__(cls, name, temp_bases, attrs):
             return meta(name, bases, attrs)
 
@@ -131,6 +134,7 @@ def __new__(cls, name, temp_bases, attrs):
 
 
 class CPyLayer(object):
+
     @classmethod
     @dygraph_only
     def apply(cls, *args, **kwargs):
@@ -178,6 +182,7 @@ def backward(ctx, dy):
 
 
 class PyLayerBackward(PyLayerContext):
+
     def backward(self, *args, **kwargs):
         with paddle.fluid.dygraph.guard():
             with paddle.fluid.dygraph.no_grad():
@@ -192,6 +197,7 @@ def backward(self, *args, **kwargs):
 
 
 class LayerMeta(type):
+
     def __init__(cls, name, bases, attrs):
         cls._backward_function = type(name + '_backward', (PyLayerBackward, ),
                                       {"_forward_cls": cls})
@@ -330,6 +336,7 @@ def backward(ctx, dy):
 
 
 class EagerPyLayerContext(object):
+
     def save_for_backward(self, *tensors):
         """
         Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.
@@ -494,11 +501,13 @@ def backward(ctx, grad, grad2):
 
 
 class EagerPyLayerBackward(core.eager.PyLayer, EagerPyLayerContext):
+
     def backward(self, *args):
         return self._forward_cls.backward(self, *args)
 
 
 class EagerPyLayerMeta(type):
+
     def __init__(cls, name, bases, attrs):
         cls._backward_function = type(name + '_backward',
                                       (EagerPyLayerBackward, ),
@@ -510,6 +519,7 @@ def __init__(cls, name, bases, attrs):
 class EagerPyLayer(
         with_mateclass(EagerPyLayerMeta, core.eager.PyLayer,
                        EagerPyLayerContext)):
+
     @staticmethod
     def forward(ctx, *args, **kwargs):
         """
@@ -590,6 +600,7 @@ def backward(ctx, dy):
 
 
 def once_differentiable(backward):
+
     def wrapper(ctx, *args):
         with paddle.fluid.dygraph.no_grad():
             outputs = backward(ctx, *args)
diff --git a/python/paddle/callbacks.py b/python/paddle/callbacks.py
index 08fab3e0adb5e..46f69aae1bbfa 100644
--- a/python/paddle/callbacks.py
+++ b/python/paddle/callbacks.py
@@ -21,11 +21,6 @@
 from .hapi.callbacks import ReduceLROnPlateau  # noqa: F401
 
 __all__ = [  #noqa
-    'Callback',
-    'ProgBarLogger',
-    'ModelCheckpoint',
-    'VisualDL',
-    'LRScheduler',
-    'EarlyStopping',
-    'ReduceLROnPlateau'
+    'Callback', 'ProgBarLogger', 'ModelCheckpoint', 'VisualDL', 'LRScheduler',
+    'EarlyStopping', 'ReduceLROnPlateau'
 ]
diff --git a/python/paddle/cost_model/__init__.py b/python/paddle/cost_model/__init__.py
index 65f2533032ae3..e6907128642c6 100644
--- a/python/paddle/cost_model/__init__.py
+++ b/python/paddle/cost_model/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,4 +13,5 @@
 # limitations under the License.
 
 from .cost_model import CostModel  # noqa: F401
+
 __all__ = ['CostModel']
diff --git a/python/paddle/cost_model/cost_model.py b/python/paddle/cost_model/cost_model.py
index e6a87468a1172..a59ff31a683a4 100644
--- a/python/paddle/cost_model/cost_model.py
+++ b/python/paddle/cost_model/cost_model.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class CostModel():
+
     def __init__(self):
         pass
 
@@ -29,10 +30,11 @@ def build_program(self):
 
         main_program = static.Program()
         startup_program = static.Program()
-        with static.program_guard(
-                main_program=main_program, startup_program=startup_program):
-            data = paddle.static.data(
-                name='X', shape=[None, 1], dtype='float32')
+        with static.program_guard(main_program=main_program,
+                                  startup_program=startup_program):
+            data = paddle.static.data(name='X',
+                                      shape=[None, 1],
+                                      dtype='float32')
             hidden = paddle.static.nn.fc(data, 10)
             loss = paddle.mean(hidden)
             paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
@@ -59,8 +61,8 @@ def profile_measure(self,
         cost_data = cost_model.ProfileMeasure(device)
 
     def static_cost_data(self):
-        static_cost_data_path = os.path.join(
-            os.path.dirname(__file__), "static_op_benchmark.json")
+        static_cost_data_path = os.path.join(os.path.dirname(__file__),
+                                             "static_op_benchmark.json")
         with open(static_cost_data_path, 'r') as load_f:
             load_dict = json.load(load_f)
         self._static_cost_data = load_dict
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
index b33f1314f623d..9c4f4adccd26e 100644
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -47,10 +47,11 @@
 
 
 def reader_creator(filename, sub_name, cycle=False):
+
     def read_batch(batch):
         data = batch[six.b('data')]
-        labels = batch.get(
-            six.b('labels'), batch.get(six.b('fine_labels'), None))
+        labels = batch.get(six.b('labels'), batch.get(six.b('fine_labels'),
+                                                      None))
         assert labels is not None
         for sample, label in six.moves.zip(data, labels):
             yield (sample / 255.0).astype(numpy.float32), int(label)
@@ -129,10 +130,10 @@ def train10(cycle=False):
     :return: Training reader creator
     :rtype: callable
     """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch',
-        cycle=cycle)
+    return reader_creator(paddle.dataset.common.download(
+        CIFAR10_URL, 'cifar', CIFAR10_MD5),
+                          'data_batch',
+                          cycle=cycle)
 
 
 @deprecated(
@@ -152,10 +153,10 @@ def test10(cycle=False):
     :return: Test reader creator.
     :rtype: callable
     """
-    return reader_creator(
-        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch',
-        cycle=cycle)
+    return reader_creator(paddle.dataset.common.download(
+        CIFAR10_URL, 'cifar', CIFAR10_MD5),
+                          'test_batch',
+                          cycle=cycle)
 
 
 @deprecated(
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index 71f469b92e4a7..5a10fe120ea7d 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -64,9 +64,9 @@ def download(url, module_name, md5sum, save_name=None):
     if not os.path.exists(dirname):
         os.makedirs(dirname)
 
-    filename = os.path.join(dirname,
-                            url.split('/')[-1]
-                            if save_name is None else save_name)
+    filename = os.path.join(
+        dirname,
+        url.split('/')[-1] if save_name is None else save_name)
 
     if os.path.exists(filename) and md5file(filename) == md5sum:
         return filename
@@ -79,8 +79,9 @@ def download(url, module_name, md5sum, save_name=None):
         if retry < retry_limit:
             retry += 1
         else:
-            raise RuntimeError("Cannot download {0} within retry limit {1}".
-                               format(url, retry_limit))
+            raise RuntimeError(
+                "Cannot download {0} within retry limit {1}".format(
+                    url, retry_limit))
         sys.stderr.write("Cache file %s not found, downloading %s \n" %
                          (filename, url))
         sys.stderr.write("Begin to download\n")
@@ -98,8 +99,8 @@ def download(url, module_name, md5sum, save_name=None):
                     total_iter = total_length / chunk_size + 1
                     log_interval = total_iter // 20 if total_iter > 20 else 1
                     log_index = 0
-                    bar = paddle.hapi.progressbar.ProgressBar(
-                        total_iter, name='item')
+                    bar = paddle.hapi.progressbar.ProgressBar(total_iter,
+                                                              name='item')
                     for data in r.iter_content(chunk_size=chunk_size):
                         f.write(data)
                         log_index += 1
@@ -121,9 +122,8 @@ def fetch_all():
     ]:
         if "fetch" in dir(
                 importlib.import_module("paddle.dataset.%s" % module_name)):
-            getattr(
-                importlib.import_module("paddle.dataset.%s" % module_name),
-                "fetch")()
+            getattr(importlib.import_module("paddle.dataset.%s" % module_name),
+                    "fetch")()
 
 
 def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
@@ -206,5 +206,5 @@ def _check_exists_and_download(path, url, md5, module_name, download=True):
     if download:
         return paddle.dataset.common.download(url, module_name, md5)
     else:
-        raise ValueError('{} not exists and auto download disabled'.format(
-            path))
+        raise ValueError(
+            '{} not exists and auto download disabled'.format(path))
diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py
index f09163ea424b0..eb43eaf742e11 100644
--- a/python/paddle/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -152,6 +152,7 @@ def reader_creator(corpus_reader,
                    word_dict=None,
                    predicate_dict=None,
                    label_dict=None):
+
     def reader():
         for sentence, predicate, labels in corpus_reader():
 
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 8ca948b49bc4a..04b3a4cfc1754 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -73,8 +73,11 @@ def default_mapper(is_train, sample):
     '''
     img, label = sample
     img = load_image_bytes(img)
-    img = simple_transform(
-        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
+    img = simple_transform(img,
+                           256,
+                           224,
+                           is_train,
+                           mean=[103.94, 116.78, 123.68])
     return img.flatten().astype('float32'), label
 
 
@@ -164,15 +167,14 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     :return: train data reader
     :rtype: callable
     '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TRAIN_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
+    return reader_creator(download(DATA_URL, 'flowers', DATA_MD5),
+                          download(LABEL_URL, 'flowers', LABEL_MD5),
+                          download(SETID_URL, 'flowers', SETID_MD5),
+                          TRAIN_FLAG,
+                          mapper,
+                          buffered_size,
+                          use_xmap,
+                          cycle=cycle)
 
 
 @deprecated(
@@ -198,15 +200,14 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     :return: test data reader
     :rtype: callable
     '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TEST_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
+    return reader_creator(download(DATA_URL, 'flowers', DATA_MD5),
+                          download(LABEL_URL, 'flowers', LABEL_MD5),
+                          download(SETID_URL, 'flowers', SETID_MD5),
+                          TEST_FLAG,
+                          mapper,
+                          buffered_size,
+                          use_xmap,
+                          cycle=cycle)
 
 
 @deprecated(
@@ -230,11 +231,10 @@ def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     :return: test data reader
     :rtype: callable
     '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
-        buffered_size, use_xmap)
+    return reader_creator(download(DATA_URL, 'flowers', DATA_MD5),
+                          download(LABEL_URL, 'flowers', LABEL_MD5),
+                          download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG,
+                          mapper, buffered_size, use_xmap)
 
 
 def fetch():
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index a094529edf575..ae0d7d95b11a8 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -45,10 +45,9 @@
     # will be the C++ execubable on Windows
     if sys.platform == 'win32' and 'python.exe' not in interpreter:
         interpreter = sys.exec_prefix + os.sep + 'python.exe'
-    import_cv2_proc = subprocess.Popen(
-        [interpreter, "-c", "import cv2"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE)
+    import_cv2_proc = subprocess.Popen([interpreter, "-c", "import cv2"],
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE)
     out, err = import_cv2_proc.communicate()
     retcode = import_cv2_proc.poll()
     if retcode != 0:
@@ -123,10 +122,9 @@ def batch_images_from_tar(data_file,
                 output = {}
                 output['label'] = labels
                 output['data'] = data
-                pickle.dump(
-                    output,
-                    open('%s/batch_%d' % (out_path, file_id), 'wb'),
-                    protocol=2)
+                pickle.dump(output,
+                            open('%s/batch_%d' % (out_path, file_id), 'wb'),
+                            protocol=2)
                 file_id += 1
                 data = []
                 labels = []
@@ -134,8 +132,9 @@ def batch_images_from_tar(data_file,
         output = {}
         output['label'] = labels
         output['data'] = data
-        pickle.dump(
-            output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
+        pickle.dump(output,
+                    open('%s/batch_%d' % (out_path, file_id), 'wb'),
+                    protocol=2)
 
     with open(meta_file, 'a') as meta:
         for file in os.listdir(out_path):
diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py
index 961d238b0ad41..b45cf4f6474bf 100644
--- a/python/paddle/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -51,9 +51,9 @@ def tokenize(pattern):
         while tf != None:
             if bool(pattern.match(tf.name)):
                 # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip(six.b(
-                    "\n\r")).translate(
-                        None, six.b(string.punctuation)).lower().split()
+                yield tarf.extractfile(tf).read().rstrip(
+                    six.b("\n\r")).translate(None, six.b(
+                        string.punctuation)).lower().split()
             tf = tarf.next()
 
 
@@ -117,9 +117,8 @@ def train(word_idx):
     :return: Training reader creator
     :rtype: callable
     """
-    return reader_creator(
-        re.compile(r"aclImdb/train/pos/.*\.txt$"),
-        re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx)
+    return reader_creator(re.compile(r"aclImdb/train/pos/.*\.txt$"),
+                          re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx)
 
 
 @deprecated(
@@ -139,9 +138,8 @@ def test(word_idx):
     :return: Test reader creator
     :rtype: callable
     """
-    return reader_creator(
-        re.compile(r"aclImdb/test/pos/.*\.txt$"),
-        re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx)
+    return reader_creator(re.compile(r"aclImdb/test/pos/.*\.txt$"),
+                          re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx)
 
 
 @deprecated(
diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py
index 85fe011fa143a..fa6b1d7493bed 100644
--- a/python/paddle/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -83,6 +83,7 @@ def build_dict(min_word_freq=50):
 
 
 def reader_creator(filename, word_idx, n, data_type):
+
     def reader():
         with tarfile.open(
                 paddle.dataset.common.download(
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
index 02cdd30708392..5c81d5d25cf80 100644
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -41,6 +41,7 @@
 
 
 def reader_creator(image_filename, label_filename, buffer_size):
+
     def reader():
         with gzip.GzipFile(image_filename, 'rb') as image_file:
             img_buf = image_file.read()
@@ -61,8 +62,8 @@ def reader():
                 offset_lab = 0
                 # label file : 8B
                 magic_byte_lab = '>II'
-                magic_lab, label_num = struct.unpack_from(magic_byte_lab,
-                                                          lab_buf, offset_lab)
+                magic_lab, label_num = struct.unpack_from(
+                    magic_byte_lab, lab_buf, offset_lab)
                 offset_lab += struct.calcsize(magic_byte_lab)
 
                 while True:
@@ -76,8 +77,9 @@ def reader():
                     fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
                     images_temp = struct.unpack_from(fmt_images, img_buf,
                                                      offset_img)
-                    images = numpy.reshape(images_temp, (
-                        buffer_size, rows * cols)).astype('float32')
+                    images = numpy.reshape(
+                        images_temp,
+                        (buffer_size, rows * cols)).astype('float32')
                     offset_img += struct.calcsize(fmt_images)
 
                     images = images / 255.0
diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py
index 9af06e088ca87..ccf9a95436b16 100644
--- a/python/paddle/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -89,8 +89,8 @@ def value(self):
 
     def __str__(self):
         return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
-            self.index, "M"
-            if self.is_male else "F", age_table[self.age], self.job_id)
+            self.index, "M" if self.is_male else "F", age_table[self.age],
+            self.job_id)
 
     def __repr__(self):
         return str(self)
@@ -142,8 +142,10 @@ def __initialize_meta_info__():
                     for line in user_file:
                         line = cpt.to_text(line, encoding='latin')
                         uid, gender, age, job, _ = line.strip().split("::")
-                        USER_INFO[int(uid)] = UserInfo(
-                            index=uid, gender=gender, age=age, job_id=job)
+                        USER_INFO[int(uid)] = UserInfo(index=uid,
+                                                       gender=gender,
+                                                       age=age,
+                                                       job_id=job)
     return fn
 
 
diff --git a/python/paddle/dataset/tests/cifar_test.py b/python/paddle/dataset/tests/cifar_test.py
index 54dff6b40cf3c..7de9f06db60ea 100644
--- a/python/paddle/dataset/tests/cifar_test.py
+++ b/python/paddle/dataset/tests/cifar_test.py
@@ -21,6 +21,7 @@
 
 
 class TestCIFAR(unittest.TestCase):
+
     def check_reader(self, reader):
         sum = 0
         label = 0
diff --git a/python/paddle/dataset/tests/flowers_test.py b/python/paddle/dataset/tests/flowers_test.py
index 256c116b7cff6..14a8917ec7145 100644
--- a/python/paddle/dataset/tests/flowers_test.py
+++ b/python/paddle/dataset/tests/flowers_test.py
@@ -21,6 +21,7 @@
 
 
 class TestFlowers(unittest.TestCase):
+
     def check_reader(self, reader):
         sum = 0
         label = 0
diff --git a/python/paddle/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py
index 5556274211fc3..7c0b186a2d93a 100644
--- a/python/paddle/dataset/tests/imikolov_test.py
+++ b/python/paddle/dataset/tests/imikolov_test.py
@@ -23,6 +23,7 @@
 
 
 class TestMikolov(unittest.TestCase):
+
     def check_reader(self, reader, n):
         for l in reader():
             self.assertEqual(len(l), n)
diff --git a/python/paddle/dataset/tests/mnist_test.py b/python/paddle/dataset/tests/mnist_test.py
index 238b58244e147..f878329b0ff93 100644
--- a/python/paddle/dataset/tests/mnist_test.py
+++ b/python/paddle/dataset/tests/mnist_test.py
@@ -21,6 +21,7 @@
 
 
 class TestMNIST(unittest.TestCase):
+
     def check_reader(self, reader):
         sum = 0
         label = 0
diff --git a/python/paddle/dataset/tests/test_image.py b/python/paddle/dataset/tests/test_image.py
index 259939d62f641..af4d697edf27e 100644
--- a/python/paddle/dataset/tests/test_image.py
+++ b/python/paddle/dataset/tests/test_image.py
@@ -23,6 +23,7 @@
 
 
 class Image(unittest.TestCase):
+
     def test_resize_flip_chw(self):
         # resize
         im = image.load_image('cat.jpg')
diff --git a/python/paddle/dataset/tests/voc2012_test.py b/python/paddle/dataset/tests/voc2012_test.py
index 21c24e6df823f..7a6fd7150ffe9 100644
--- a/python/paddle/dataset/tests/voc2012_test.py
+++ b/python/paddle/dataset/tests/voc2012_test.py
@@ -21,6 +21,7 @@
 
 
 class TestVOC(unittest.TestCase):
+
     def check_reader(self, reader):
         sum = 0
         label = 0
diff --git a/python/paddle/dataset/tests/wmt16_test.py b/python/paddle/dataset/tests/wmt16_test.py
index 68a9819c8f335..b75924fe65f44 100644
--- a/python/paddle/dataset/tests/wmt16_test.py
+++ b/python/paddle/dataset/tests/wmt16_test.py
@@ -21,6 +21,7 @@
 
 
 class TestWMT16(unittest.TestCase):
+
     def checkout_one_sample(self, sample):
         # train data has 3 field: source language word indices,
         # target language word indices, and target next word indices.
@@ -38,22 +39,22 @@ def checkout_one_sample(self, sample):
 
     def test_train(self):
         for idx, sample in enumerate(
-                paddle.dataset.wmt16.train(
-                    src_dict_size=100000, trg_dict_size=100000)()):
+                paddle.dataset.wmt16.train(src_dict_size=100000,
+                                           trg_dict_size=100000)()):
             if idx >= 10: break
             self.checkout_one_sample(sample)
 
     def test_test(self):
         for idx, sample in enumerate(
-                paddle.dataset.wmt16.test(
-                    src_dict_size=1000, trg_dict_size=1000)()):
+                paddle.dataset.wmt16.test(src_dict_size=1000,
+                                          trg_dict_size=1000)()):
             if idx >= 10: break
             self.checkout_one_sample(sample)
 
     def test_val(self):
         for idx, sample in enumerate(
-                paddle.dataset.wmt16.validation(
-                    src_dict_size=1000, trg_dict_size=1000)()):
+                paddle.dataset.wmt16.validation(src_dict_size=1000,
+                                                trg_dict_size=1000)()):
             if idx >= 10: break
             self.checkout_one_sample(sample)
 
diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py
index dea2dfc8c9818..ae72c8e88ea0d 100644
--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -73,8 +73,8 @@ def load_data(filename, feature_num=14, ratio=0.8):
 
     data = np.fromfile(filename, sep=' ')
     data = data.reshape(data.shape[0] // feature_num, feature_num)
-    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
-        axis=0) / data.shape[0]
+    maximums, minimums, avgs = data.max(axis=0), data.min(
+        axis=0), data.sum(axis=0) / data.shape[0]
     # if you want to print the distribution of input data, you could use function of feature_range
     #feature_range(maximums[:-1], minimums[:-1])
     for i in six.moves.range(feature_num - 1):
@@ -135,8 +135,10 @@ def reader():
 
 
 def fluid_model():
-    parameter_tar = paddle.dataset.common.download(
-        FLUID_URL_MODEL, 'uci_housing', FLUID_MD5_MODEL, 'fit_a_line.fluid.tar')
+    parameter_tar = paddle.dataset.common.download(FLUID_URL_MODEL,
+                                                   'uci_housing',
+                                                   FLUID_MD5_MODEL,
+                                                   'fit_a_line.fluid.tar')
 
     tar = tarfile.TarFile(parameter_tar, mode='r')
     dirpath = tempfile.mkdtemp()
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
index 9f8abb2c4bfe9..bb0a77b4f20d5 100644
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -50,6 +50,7 @@
 
 
 def __read_to_dict(tar_file, dict_size):
+
     def __to_dict(fd, size):
         out_dict = dict()
         for line_count, line in enumerate(fd):
@@ -76,6 +77,7 @@ def __to_dict(fd, size):
 
 
 def reader_creator(tar_file, file_name, dict_size):
+
     def reader():
         src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
         with tarfile.open(tar_file, mode='r') as f:
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index f313da98f0abc..80e35d9fde952 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -68,9 +68,9 @@ def __build_dict(tar_file, dict_size, save_path, lang):
         fout.write(
             cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)))
         for idx, word in enumerate(
-                sorted(
-                    six.iteritems(word_dict), key=lambda x: x[1],
-                    reverse=True)):
+                sorted(six.iteritems(word_dict),
+                       key=lambda x: x[1],
+                       reverse=True)):
             if idx + 3 == dict_size: break
             fout.write(cpt.to_bytes(word[0]))
             fout.write(cpt.to_bytes('\n'))
@@ -79,8 +79,8 @@ def __build_dict(tar_file, dict_size, save_path, lang):
 def __load_dict(tar_file, dict_size, lang, reverse=False):
     dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
                              "wmt16/%s_%d.dict" % (lang, dict_size))
-    if not os.path.exists(dict_path) or (
-            len(open(dict_path, "rb").readlines()) != dict_size):
+    if not os.path.exists(dict_path) or (len(open(dict_path, "rb").readlines())
+                                         != dict_size):
         __build_dict(tar_file, dict_size, dict_path, lang)
 
     word_dict = {}
@@ -94,14 +94,15 @@ def __load_dict(tar_file, dict_size, lang, reverse=False):
 
 
 def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
-    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
-                                        TOTAL_DE_WORDS))
-    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
-                                        TOTAL_EN_WORDS))
+    src_dict_size = min(
+        src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else TOTAL_DE_WORDS))
+    trg_dict_size = min(
+        trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else TOTAL_EN_WORDS))
     return src_dict_size, trg_dict_size
 
 
 def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
+
     def reader():
         src_dict = __load_dict(tar_file, src_dict_size, src_lang)
         trg_dict = __load_dict(tar_file, trg_dict_size,
@@ -124,9 +125,9 @@ def reader():
                 if len(line_split) != 2:
                     continue
                 src_words = line_split[src_col].split()
-                src_ids = [start_id] + [
-                    src_dict.get(w, unk_id) for w in src_words
-                ] + [end_id]
+                src_ids = [start_id
+                           ] + [src_dict.get(w, unk_id)
+                                for w in src_words] + [end_id]
 
                 trg_words = line_split[trg_col].split()
                 trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
@@ -184,13 +185,12 @@ def train(src_dict_size, trg_dict_size, src_lang="en"):
     src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
                                                    src_lang)
 
-    return reader_creator(
-        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                "wmt16.tar.gz"),
-        file_name="wmt16/train",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
+    return reader_creator(tar_file=paddle.dataset.common.download(
+        DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz"),
+                          file_name="wmt16/train",
+                          src_dict_size=src_dict_size,
+                          trg_dict_size=trg_dict_size,
+                          src_lang=src_lang)
 
 
 @deprecated(
@@ -238,13 +238,12 @@ def test(src_dict_size, trg_dict_size, src_lang="en"):
     src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
                                                    src_lang)
 
-    return reader_creator(
-        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                "wmt16.tar.gz"),
-        file_name="wmt16/test",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
+    return reader_creator(tar_file=paddle.dataset.common.download(
+        DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz"),
+                          file_name="wmt16/test",
+                          src_dict_size=src_dict_size,
+                          trg_dict_size=trg_dict_size,
+                          src_lang=src_lang)
 
 
 @deprecated(
@@ -290,13 +289,12 @@ def validation(src_dict_size, trg_dict_size, src_lang="en"):
     src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
                                                    src_lang)
 
-    return reader_creator(
-        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                "wmt16.tar.gz"),
-        file_name="wmt16/val",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
+    return reader_creator(tar_file=paddle.dataset.common.download(
+        DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz"),
+                          file_name="wmt16/val",
+                          src_dict_size=src_dict_size,
+                          trg_dict_size=trg_dict_size,
+                          src_lang=src_lang)
 
 
 @deprecated(
diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py
index 89e0ae49fc48f..929a1c2d77fb4 100644
--- a/python/paddle/device/__init__.py
+++ b/python/paddle/device/__init__.py
@@ -1,18 +1,18 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the functions to manipulate devices 
+# TODO: define the functions to manipulate devices
 import re
 import os
 from paddle.fluid import core
diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py
index 8cb4f5f765611..d867f07122992 100644
--- a/python/paddle/device/cuda/__init__.py
+++ b/python/paddle/device/cuda/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -178,8 +178,8 @@ def extract_cuda_device_id(device, op_name):
         else:
             raise ValueError(
                 "The current string {} is not expected. Because {} only support string which is like 'gpu:x'. "
-                "Please input appropriate string again!".format(device,
-                                                                op_name))
+                "Please input appropriate string again!".format(
+                    device, op_name))
     else:
         raise ValueError(
             "The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. "
diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py
index 29e1b2694a699..c6554d78fb86a 100644
--- a/python/paddle/device/cuda/graphs.py
+++ b/python/paddle/device/cuda/graphs.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,15 +17,24 @@
 
 if is_compiled_with_cuda() and not is_compiled_with_rocm():
     from paddle.fluid.core import CUDAGraph as CoreCUDAGraph
+
+    def is_cuda_graph_supported():
+        return True
 else:
     CoreCUDAGraph = None
 
+    def is_cuda_graph_supported():
+        return False
+
+
+ALL_MODES = ["global", "thread_local", "relaxed"]
+
 
 class CUDAGraph:
+
     def __init__(self, place=None, mode="thread_local"):
         assert CoreCUDAGraph is not None, "CUDA Graph is only supported on PaddlePaddle compiled with NVIDIA GPU."
 
-        ALL_MODES = ["global", "thread_local", "relaxed"]
         self._graph = None
         if place is None:
             device_id = int(os.environ.get('FLAGS_selected_gpus', 0))
@@ -53,5 +62,27 @@ def print_to_dot_files(self, dirname, flags=None):
         assert os.path.isdir(
             dirname), "The dirname {} should be a directory".format(dirname)
         if flags is None:
-            flags = 2047  # only all information. It can be any integer inside [1, 2048)  
+            flags = 2047  # only all information. It can be any integer inside [1, 2048)
         self._graph.print_to_dot_files(dirname, flags)
+
+
+def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"):
+    assert mode in ALL_MODES
+    from paddle.jit import to_static
+    from paddle.nn import Layer
+    new_function = to_static(function)
+    if isinstance(function, Layer):
+        mock_func = new_function.forward
+    else:
+        mock_func = new_function
+    mock_func._cuda_graph_capture_mode = mode
+    if memory_pool == "default":
+        mock_func._cuda_graph_pool_id = 0
+    elif memory_pool == "new":
+        mock_func._cuda_graph_pool_id = CoreCUDAGraph.gen_new_memory_pool_id()
+    else:
+        if isinstance(memory_pool, Layer):
+            mock_func._cuda_graph_pool_id = memory_pool.forward._cuda_graph_pool_id
+        else:
+            mock_func._cuda_graph_pool_id = memory_pool._cuda_graph_pool_id
+    return new_function
diff --git a/python/paddle/device/cuda/streams.py b/python/paddle/device/cuda/streams.py
index 4efe500503409..d25355056e8d5 100644
--- a/python/paddle/device/cuda/streams.py
+++ b/python/paddle/device/cuda/streams.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index 50e4f7285b169..003a14799c53e 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -59,33 +59,33 @@
 from .sharding import *  # noqa: F401
 
 __all__ = [  # noqa
-      "spawn",
-      "launch",
-      "scatter",
-      "broadcast",
-      "ParallelEnv",
-      "new_group",
-      "init_parallel_env",
-      "gloo_init_parallel_env",
-      "gloo_barrier",
-      "gloo_release",
-      "QueueDataset",
-      "split",
-      "CountFilterEntry",
-      "ShowClickEntry",
-      "get_world_size",
-      "get_group",
-      "all_gather",
-      "InMemoryDataset",
-      "barrier",
-      "all_reduce",
-      "alltoall",
-      "send",
-      "reduce",
-      "recv",
-      "ReduceOp",
-      "wait",
-      "get_rank",
-      "ProbabilityEntry",
-      "ParallelMode",
+    "spawn",
+    "launch",
+    "scatter",
+    "broadcast",
+    "ParallelEnv",
+    "new_group",
+    "init_parallel_env",
+    "gloo_init_parallel_env",
+    "gloo_barrier",
+    "gloo_release",
+    "QueueDataset",
+    "split",
+    "CountFilterEntry",
+    "ShowClickEntry",
+    "get_world_size",
+    "get_group",
+    "all_gather",
+    "InMemoryDataset",
+    "barrier",
+    "all_reduce",
+    "alltoall",
+    "send",
+    "reduce",
+    "recv",
+    "ReduceOp",
+    "wait",
+    "get_rank",
+    "ProbabilityEntry",
+    "ParallelMode",
 ]
diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py
index 3685729cb6c29..e70b29dbe3931 100644
--- a/python/paddle/distributed/auto_parallel/cluster.py
+++ b/python/paddle/distributed/auto_parallel/cluster.py
@@ -50,14 +50,14 @@ def __init__(self, global_id, local_id, machine):
         self._local_id = local_id
         self._machine = machine
         self._type = None
-        # Different device have different models, such as 
+        # Different device have different models, such as
         # "Tesla V100-SXM2-32GB" and "A100-SXM4-40GB" etc.
         self._model = None
         # Double precision GFLOPS
         self._dp_gflops = None
         # Single precision GFLOPS
         self._sp_gflops = None
-        # Memory is stored by GB 
+        # Memory is stored by GB
         self._memory = None
 
     @property
@@ -144,9 +144,9 @@ def __init__(self, source, target):
         self._src = source
         self._tgt = target
         self._type = None
-        # bandwidth is stored by GB/s 
+        # bandwidth is stored by GB/s
         self._bandwidth = None
-        # latency is stored by millisecond 
+        # latency is stored by millisecond
         self._latency = None
         self._hop = None
 
@@ -210,6 +210,7 @@ def __repr__(self):
 
 
 class Machine:
+
     def __init__(self, id):
         self._id = id
         self._hostname = None
@@ -290,6 +291,7 @@ def __repr__(self):
 
 
 class AlphaLatency:
+
     def __init__(self, alpha_latency):
         assert isinstance(alpha_latency, dict)
         self._base = alpha_latency.get("base", None)
diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py
index 465c450c0b076..19a5b001abbb7 100644
--- a/python/paddle/distributed/auto_parallel/completion.py
+++ b/python/paddle/distributed/auto_parallel/completion.py
@@ -137,6 +137,7 @@ def _validate_dims_mapping(dims_mapping, process_mesh):
 
 
 class Completer:
+
     def __init__(self, dist_context):
         assert dist_context is not None
         self._dist_context = dist_context
@@ -248,8 +249,8 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True):
                                 tensor_desc.name(), compatible_dims_mapping)
                             changed = True
             # Find the most compatible implemenetations from the distributed operator
-            op_dist_impls = find_compatible_distributed_operator_impls(
-                dist_op, fwd=True)
+            op_dist_impls = find_compatible_distributed_operator_impls(dist_op,
+                                                                       fwd=True)
             if op_dist_impls is not None:
                 not_compatible = True
                 backup_op_dist_attr = copy.deepcopy(op_dist_attr)
@@ -451,6 +452,7 @@ def _update_process_mesh_by_nearest(self, op_node, nearest_op_node):
                     tensor_dist_attr.process_mesh = compatible_process_mesh
 
     def _update_process_mesh_for_specials(self):
+
         def _find_nearest_tensor_node_before(nodes, idx, var_name):
             for node in reversed(nodes[:idx]):
                 if node.is_var() and node.var() is not None \
@@ -694,8 +696,8 @@ def _update_process_mesh(self):
         # Step 2.2: set the process meshes of ops by the nearest op node after the first op node
         if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes):
             return None
-        for idx, op_node in enumerate(ordered_op_nodes[
-                idx_of_first_op_node_has_process_mesh + 1:]):
+        for idx, op_node in enumerate(
+                ordered_op_nodes[idx_of_first_op_node_has_process_mesh + 1:]):
             original_idx = idx_of_first_op_node_has_process_mesh + idx + 1
             nearest_op_node = ordered_op_nodes[original_idx - 1]
             nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph(
@@ -831,9 +833,9 @@ def _get_op_by_id(ops, id):
             if grad_op.desc.original_id(
             ) in dist_op_context.grad_op_id_to_op_id:
                 # TODO support the case where one forward op corresponding to multiple xxx_grad op
-                forward_op = _get_op_by_id(ops,
-                                           dist_op_context.grad_op_id_to_op_id[
-                                               grad_op.desc.original_id()])
+                forward_op = _get_op_by_id(
+                    ops, dist_op_context.grad_op_id_to_op_id[
+                        grad_op.desc.original_id()])
                 assert forward_op is not None
 
                 fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program(
@@ -862,8 +864,8 @@ def _get_op_by_id(ops, id):
                                 input_name)
                     assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format(
                         input_name)
-                    grad_op_dist_attr.set_input_dims_mapping(input_name,
-                                                             ref_dims_mapping)
+                    grad_op_dist_attr.set_input_dims_mapping(
+                        input_name, ref_dims_mapping)
 
                 for output_name in grad_op.output_arg_names:
                     assert output_name in grad_var_to_var[appended_grad_times]
@@ -878,8 +880,8 @@ def _get_op_by_id(ops, id):
                     self._dist_context.set_tensor_dist_attr_for_program(
                         output_var, tensor_dist_attr)
                     # op
-                    grad_op_dist_attr.set_output_dims_mapping(output_name,
-                                                              ref_dims_mapping)
+                    grad_op_dist_attr.set_output_dims_mapping(
+                        output_name, ref_dims_mapping)
 
                 self._dist_context.set_op_dist_attr_for_program(
                     grad_op, grad_op_dist_attr)
@@ -934,10 +936,10 @@ def _get_op_by_id(ops, id):
                     # op
                     grad_op_dist_attr = OperatorDistributedAttribute()
                     grad_op_dist_attr.process_mesh = ref_process_mesh
-                    grad_op_dist_attr.set_input_dims_mapping(ref_var_name,
-                                                             ref_dims_mapping)
-                    grad_op_dist_attr.set_output_dims_mapping(output_var_name,
-                                                              ref_dims_mapping)
+                    grad_op_dist_attr.set_input_dims_mapping(
+                        ref_var_name, ref_dims_mapping)
+                    grad_op_dist_attr.set_output_dims_mapping(
+                        output_var_name, ref_dims_mapping)
 
                 elif grad_op.type in ['shape', 'fill_constant']:
                     continue
@@ -977,8 +979,8 @@ def _get_op_by_id(ops, id):
         first_backward_op_idx = -1
         for idx, op in enumerate(serial_main_program.global_block().ops):
             if int(op.attr('op_role')) == int(
-                    int(core.op_proto_and_checker_maker.OpRole.Backward) | int(
-                        core.op_proto_and_checker_maker.OpRole.Loss)):
+                    int(core.op_proto_and_checker_maker.OpRole.Backward)
+                    | int(core.op_proto_and_checker_maker.OpRole.Loss)):
                 assert op.type == "fill_constant"
                 first_backward_op_idx = idx
                 break
@@ -1025,8 +1027,8 @@ def _get_op_by_id(ops, id):
                 op_dist_attr.process_mesh = process_mesh
                 op_dist_attr.set_output_dims_mapping(grad_var.name,
                                                      dims_mapping)
-                self._dist_context.set_op_dist_attr_for_program(ops[idx],
-                                                                op_dist_attr)
+                self._dist_context.set_op_dist_attr_for_program(
+                    ops[idx], op_dist_attr)
                 continue
 
             # complete the annotation of grad op (xxx_grad op or sum op)
@@ -1035,9 +1037,10 @@ def _get_op_by_id(ops, id):
             if grad_op.desc.original_id(
             ) in dist_op_context.grad_op_id_to_op_id:
                 # TODO support the case where one forward op corresponding to multiple xxx_grad op
-                forward_op = _get_op_by_id(ops[:first_backward_op_idx],
-                                           dist_op_context.grad_op_id_to_op_id[
-                                               grad_op.desc.original_id()])
+                forward_op = _get_op_by_id(
+                    ops[:first_backward_op_idx],
+                    dist_op_context.grad_op_id_to_op_id[
+                        grad_op.desc.original_id()])
                 assert forward_op is not None
 
                 if grad_op.type == "concat" and forward_op.type == "split":
@@ -1060,8 +1063,8 @@ def _get_op_by_id(ops, id):
                     self._dist_context.set_tensor_dist_attr_for_program(
                         output_var, output_var_dist_attr)
 
-                    grad_op_dist_attr.set_output_dims_mapping(output_var.name,
-                                                              ref_dims_mapping)
+                    grad_op_dist_attr.set_output_dims_mapping(
+                        output_var.name, ref_dims_mapping)
                     grad_op_dist_attr.process_mesh = ref_mesh
                     self._dist_context.set_op_dist_attr_for_program(
                         grad_op, grad_op_dist_attr)
@@ -1095,8 +1098,8 @@ def _get_op_by_id(ops, id):
                                 input_name)
                     assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format(
                         input_name)
-                    grad_op_dist_attr.set_input_dims_mapping(input_name,
-                                                             ref_dims_mapping)
+                    grad_op_dist_attr.set_input_dims_mapping(
+                        input_name, ref_dims_mapping)
 
                 for output_name in grad_op.output_arg_names:
                     assert output_name in grad_var_to_var
@@ -1111,8 +1114,8 @@ def _get_op_by_id(ops, id):
                     self._dist_context.set_tensor_dist_attr_for_program(
                         output_var, tensor_dist_attr)
                     # op
-                    grad_op_dist_attr.set_output_dims_mapping(output_name,
-                                                              ref_dims_mapping)
+                    grad_op_dist_attr.set_output_dims_mapping(
+                        output_name, ref_dims_mapping)
 
                 grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type
                 grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx
@@ -1170,10 +1173,10 @@ def _get_op_by_id(ops, id):
                     # op
                     grad_op_dist_attr = OperatorDistributedAttribute()
                     grad_op_dist_attr.process_mesh = ref_process_mesh
-                    grad_op_dist_attr.set_input_dims_mapping(ref_var_name,
-                                                             ref_dims_mapping)
-                    grad_op_dist_attr.set_output_dims_mapping(output_var_name,
-                                                              ref_dims_mapping)
+                    grad_op_dist_attr.set_input_dims_mapping(
+                        ref_var_name, ref_dims_mapping)
+                    grad_op_dist_attr.set_output_dims_mapping(
+                        output_var_name, ref_dims_mapping)
 
                 else:
                     raise ValueError("got unexpect op [{}]".format(
@@ -1186,7 +1189,7 @@ def complete_update_annotation(self, serial_main_program):
         """Complete the annotation of vars and ops in the update phase for parallel program."""
 
         # Notice: serial_main_program is actually a dist_main_program of current rank,
-        # and must be passed into this function. 
+        # and must be passed into this function.
         # TODO: We should fix this behavior.
 
         ops = list(serial_main_program.global_block().ops)
@@ -1223,10 +1226,10 @@ def complete_update_annotation(self, serial_main_program):
                         op, op_dist_attr)
 
                 if "Grad" in op.input_names and "Param" in ops[idx].input_names:
-                    assert len(op.input(
-                        "Param")) == 1, "Only support one-to-one now."
-                    assert len(op.input(
-                        "Grad")) == 1, "Only support one-to-one now."
+                    assert len(
+                        op.input("Param")) == 1, "Only support one-to-one now."
+                    assert len(
+                        op.input("Grad")) == 1, "Only support one-to-one now."
                     param = vars[op.input("Param")[0]]
                     grad_var = vars[op.input("Grad")[0]]
 
@@ -1245,12 +1248,12 @@ def complete_update_annotation(self, serial_main_program):
                                                         ref_dims_mapping)
                     op_dist_attr.set_input_dims_mapping(param.name,
                                                         ref_dims_mapping)
-                    op_dist_attr.set_output_dims_mapping(param.name,
-                                                         ref_dims_mapping)
+                    op_dist_attr.set_output_dims_mapping(
+                        param.name, ref_dims_mapping)
                     learning_var = vars[op.input("LearningRate")[0]]
                     op_dist_attr.set_input_dims_mapping(learning_var.name, [-1])
-                    op_dist_attr.set_output_dims_mapping(learning_var.name,
-                                                         [-1])
+                    op_dist_attr.set_output_dims_mapping(
+                        learning_var.name, [-1])
 
                     if not learning_rate_completed:
                         learning_rate_completed = True
@@ -1275,10 +1278,10 @@ def complete_update_annotation(self, serial_main_program):
 
                         if "Beta1Pow" in input_name or "Beta2Pow" in input_name:
                             input_var_attr.dims_mapping = [-1]
-                            op_dist_attr.set_input_dims_mapping(input_var.name,
-                                                                [-1])
-                            op_dist_attr.set_output_dims_mapping(input_var.name,
-                                                                 [-1])
+                            op_dist_attr.set_input_dims_mapping(
+                                input_var.name, [-1])
+                            op_dist_attr.set_output_dims_mapping(
+                                input_var.name, [-1])
                         else:
                             assert "Moment" in input_name
                             input_var_attr.dims_mapping = ref_dims_mapping
@@ -1307,7 +1310,7 @@ def complete_prim_annotation(self, serial_main_program=None):
         if serial_main_program is None:
             serial_main_program = self._dist_context.serial_main_program
         else:
-            self._dist_context.serial_main_program = serial_main_program
+            self._dist_context._serial_main_program = serial_main_program
 
         import time
 
diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/converter.py
index 2ea200c7d6f81..69292ab1827e8 100644
--- a/python/paddle/distributed/auto_parallel/converter.py
+++ b/python/paddle/distributed/auto_parallel/converter.py
@@ -133,8 +133,9 @@ def convert(self, strict=True):
                 tensors_dict[tensor_name] = Converter.merge_and_slice(
                     tensor_list, pre_dist_attr, cur_dist_attr)
             except ValueError as err:
-                raise ValueError("Fail to convert tensor '{}'. "
-                                 .format(str(tensor_name)) + str(err))
+                raise ValueError(
+                    "Fail to convert tensor '{}'. ".format(str(tensor_name)) +
+                    str(err))
 
         for tensor_name in self._pre_strategy:
             if tensor_name not in self._cur_strategy:
@@ -150,17 +151,17 @@ def convert(self, strict=True):
         tensor_not_in_cur = set(tensor_not_in_cur) - set(tensor_match_with_cur)
         if tensor_not_in_pre:
             warnings.warn(
-                "tensors [{}] are not found in last training strategy."
-                .format(str(tensor_not_in_pre)))
+                "tensors [{}] are not found in last training strategy.".format(
+                    str(tensor_not_in_pre)))
         if tensor_not_in_cur:
             warnings.warn(
-                "tensors [{}] are not found in current training strategy."
-                .format(str(tensor_not_in_cur)))
+                "tensors [{}] are not found in current training strategy.".
+                format(str(tensor_not_in_cur)))
         if tensor_not_in_ckpt:
             warnings.warn(
                 "tensors [{}] are found in pre_strategy, but are not found"
-                "in checkpoint files, please check your checkpoint files."
-                .format(str(tensor_not_in_ckpt)))
+                "in checkpoint files, please check your checkpoint files.".
+                format(str(tensor_not_in_ckpt)))
 
         return tensors_dict
 
@@ -360,8 +361,9 @@ def split(complete_tensor, partition_index_list, length):
         """
         sliced_tensor_list = []
         axis = len(complete_tensor.shape) - length
-        sliced_tensor = np.split(
-            complete_tensor, partition_index_list[axis], axis=axis)
+        sliced_tensor = np.split(complete_tensor,
+                                 partition_index_list[axis],
+                                 axis=axis)
         if length == 1:
             return sliced_tensor
         for tensor in sliced_tensor:
diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py
index 763f78c510615..4455d6f66483b 100644
--- a/python/paddle/distributed/auto_parallel/cost/base_cost.py
+++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py
@@ -85,8 +85,8 @@ def _parse_op_to_desc(op, dist_context=None):
 def parse_to_desc(op=None, dist_op=None, dist_context=None):
     desc = None
     if op is None and dist_op is not None and dist_context is not None:
-        desc = _parse_op_to_desc(
-            op=dist_op.serial_op, dist_context=dist_context)
+        desc = _parse_op_to_desc(op=dist_op.serial_op,
+                                 dist_context=dist_context)
     elif op is not None and dist_op is None and dist_context is None:
         desc = _parse_op_to_desc(op)
 
@@ -94,6 +94,7 @@ def parse_to_desc(op=None, dist_op=None, dist_context=None):
 
 
 def parse_desc_to_str(desc):
+
     def _parse_dtype(dtype):
         dtype_str = ""
         if dtype == paddle.float32:
@@ -248,10 +249,10 @@ def get_max_beta(self, ranks):
         else:
             for i in range(len(ranks)):
                 for j in range(i + 1, len(ranks)):
-                    forward_order_beta = self.cluster.get_beta(ranks[i],
-                                                               ranks[j])
-                    backward_order_beta = self.cluster.get_beta(ranks[j],
-                                                                ranks[i])
+                    forward_order_beta = self.cluster.get_beta(
+                        ranks[i], ranks[j])
+                    backward_order_beta = self.cluster.get_beta(
+                        ranks[j], ranks[i])
                     beta = forward_order_beta if forward_order_beta > backward_order_beta else backward_order_beta
                     if max_beta == None:
                         max_beta = beta
@@ -275,6 +276,7 @@ def get_hops(self, ranks):
 
 
 class Cost:
+
     def __init__(self, time=0, memory=0, flops=0):
         self.time = time
         self.memory = memory
@@ -338,6 +340,7 @@ def __sub__(self, rhs):
 
 
 class OpCost:
+
     def __init__(self, op=None, op_desc=None):
         self._op = op
         self._op_desc = op_desc
@@ -462,8 +465,8 @@ def comm_count(self):
             elif dtype == paddle.float16:
                 factor = 2
             else:
-                raise TypeError("This dtype {} is not supported now".format(
-                    dtype))
+                raise TypeError(
+                    "This dtype {} is not supported now".format(dtype))
             comm_count = reduce(lambda x, y: x * y, shape) * factor
             self._comm_count = comm_count
 
@@ -506,8 +509,9 @@ def group_ranks(self):
     def _check_comm_op_type(cls):
         if cls.OP_TYPE != "COMM":
             if cls.OP_TYPE not in COMM_OP_TYPE:
-                raise TypeError("Please Check op type in {}, but got {}.".
-                                format(COMM_OP_TYPE, cls.OP_TYPE))
+                raise TypeError(
+                    "Please Check op type in {}, but got {}.".format(
+                        COMM_OP_TYPE, cls.OP_TYPE))
 
 
 class CompOpCost(OpCost):
@@ -523,8 +527,9 @@ def __init__(self, op=None, op_desc=None, cluster=None):
     def _check_comp_op_type(cls):
         if cls.OP_TYPE != "COMP":
             if cls.OP_TYPE in NON_COMP_TYPE:
-                raise TypeError("Please Check op type not in {}, but got {}.".
-                                format(NON_COMP_TYPE, cls.OP_TYPE))
+                raise TypeError(
+                    "Please Check op type not in {}, but got {}.".format(
+                        NON_COMP_TYPE, cls.OP_TYPE))
 
 
 def register_op_cost(cls):
diff --git a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py
index a32fdf1824e62..0f92bcc8facf2 100644
--- a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py
+++ b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py
@@ -22,8 +22,9 @@ class AllreduceSumOpCost(CommOpCost):
     OP_TYPE = "c_allreduce_sum"
 
     def __init__(self, op=None, op_desc=None, comm_context=None):
-        super(AllreduceSumOpCost, self).__init__(
-            op=op, op_desc=op_desc, comm_context=comm_context)
+        super(AllreduceSumOpCost, self).__init__(op=op,
+                                                 op_desc=op_desc,
+                                                 comm_context=comm_context)
 
     def calc_time(self):
         # use tree if cross machine and use ring if in a single machine
@@ -38,20 +39,20 @@ def calc_time(self):
 
     def calc_time_ring(self):
         alpha = self.comm_context.base_ring
-        alpha += 2 * (
-            self.rank_count - self.machine_count) * self.comm_context.intra_ring
+        alpha += 2 * (self.rank_count -
+                      self.machine_count) * self.comm_context.intra_ring
         alpha += 2 * (self.machine_count - 1) * (
             self.comm_context.inter_ring + self.hops * self.comm_context.switch)
         beta = self.comm_context.get_max_beta(self.group_ranks)
-        time = alpha + 2 * (self.rank_count - 1
-                            ) / self.rank_count * self.comm_count * beta
+        time = alpha + 2 * (self.rank_count -
+                            1) / self.rank_count * self.comm_count * beta
 
         return time
 
     def calc_time_tree(self):
         alpha = self.comm_context.base_tree
-        alpha += 2 * (self.rank_count / self.machine_count - 1
-                      ) * self.comm_context.intra_tree
+        alpha += 2 * (self.rank_count / self.machine_count -
+                      1) * self.comm_context.intra_tree
         alpha += math.log2(self.machine_count) * (
             self.comm_context.inter_tree + self.hops * self.comm_context.switch)
         beta = self.comm_context.get_max_beta(self.group_ranks)
@@ -66,8 +67,9 @@ class AllgatherOpCost(CommOpCost):
     OP_TYPE = "c_allgather"
 
     def __init__(self, op=None, op_desc=None, comm_context=None):
-        super(AllgatherOpCost, self).__init__(
-            op=op, op_desc=op_desc, comm_context=comm_context)
+        super(AllgatherOpCost, self).__init__(op=op,
+                                              op_desc=op_desc,
+                                              comm_context=comm_context)
 
     def calc_time(self):
         time = self.calc_time_ring()
@@ -75,13 +77,13 @@ def calc_time(self):
 
     def calc_time_ring(self):
         alpha = self.comm_context.base_ring
-        alpha += (
-            self.rank_count - self.machine_count) * self.comm_context.intra_ring
+        alpha += (self.rank_count -
+                  self.machine_count) * self.comm_context.intra_ring
         alpha += (self.machine_count - 1) * (
             self.comm_context.inter_ring + self.hops * self.comm_context.switch)
         beta = self.comm_context.get_max_beta(self.group_ranks)
-        time = alpha + (self.rank_count - 1
-                        ) / self.rank_count * self.comm_count * beta
+        time = alpha + (self.rank_count -
+                        1) / self.rank_count * self.comm_count * beta
         return time
 
 
@@ -90,8 +92,9 @@ class BroadcastOpCost(CommOpCost):
     OP_TYPE = "c_broadcast"
 
     def __init__(self, op=None, op_desc=None, comm_context=None):
-        super(BroadcastOpCost, self).__init__(
-            op=op, op_desc=op_desc, comm_context=comm_context)
+        super(BroadcastOpCost, self).__init__(op=op,
+                                              op_desc=op_desc,
+                                              comm_context=comm_context)
 
     def calc_time(self):
         time = self.calc_time_ring()
@@ -114,8 +117,9 @@ class IdentityOpCost(CommOpCost):
     OP_TYPE = "c_identity"
 
     def __init__(self, op=None, op_desc=None, comm_context=None):
-        super(IdentityOpCost, self).__init__(
-            op=op, op_desc=op_desc, comm_context=comm_context)
+        super(IdentityOpCost, self).__init__(op=op,
+                                             op_desc=op_desc,
+                                             comm_context=comm_context)
 
     def calc_time(self):
         return 0
@@ -126,8 +130,9 @@ class RecvOpCost(CommOpCost):
     OP_TYPE = "recv_v2"
 
     def __init__(self, op=None, op_desc=None, comm_context=None):
-        super(RecvOpCost, self).__init__(
-            op=op, op_desc=op_desc, comm_context=comm_context)
+        super(RecvOpCost, self).__init__(op=op,
+                                         op_desc=op_desc,
+                                         comm_context=comm_context)
 
     def calc_time(self):
         alpha = self.comm_context.base_ring
@@ -145,8 +150,9 @@ class SendOpCost(CommOpCost):
     OP_TYPE = "send_v2"
 
     def __init__(self, op=None, op_desc=None, comm_context=None):
-        super(SendOpCost, self).__init__(
-            op=op, op_desc=op_desc, comm_context=comm_context)
+        super(SendOpCost, self).__init__(op=op,
+                                         op_desc=op_desc,
+                                         comm_context=comm_context)
 
     def calc_time(self):
         alpha = self.comm_context.base_ring
diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py
index 8958c4bf905c2..6556a1110d222 100644
--- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py
+++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py
@@ -20,8 +20,9 @@ class AssignOpCost(CompOpCost):
     OP_TYPE = "assign"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(AssignOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(AssignOpCost, self).__init__(op=op,
+                                           op_desc=op_desc,
+                                           cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -38,8 +39,9 @@ class AssignValueOpCost(CompOpCost):
     OP_TYPE = "assign_value"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(AssignValueOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(AssignValueOpCost, self).__init__(op=op,
+                                                op_desc=op_desc,
+                                                cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -56,8 +58,9 @@ class BeamSearchOpCost(CompOpCost):
     OP_TYPE = "beam_search"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(BeamSearchOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(BeamSearchOpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -74,8 +77,9 @@ class BeamSearchDecodeOpCost(CompOpCost):
     OP_TYPE = "beam_search_decode"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(BeamSearchDecodeOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(BeamSearchDecodeOpCost, self).__init__(op=op,
+                                                     op_desc=op_desc,
+                                                     cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -92,8 +96,9 @@ class CastOpCost(CompOpCost):
     OP_TYPE = "cast"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(CastOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(CastOpCost, self).__init__(op=op,
+                                         op_desc=op_desc,
+                                         cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -110,8 +115,9 @@ class ConcatOpCost(CompOpCost):
     OP_TYPE = "concat"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ConcatOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ConcatOpCost, self).__init__(op=op,
+                                           op_desc=op_desc,
+                                           cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -128,8 +134,9 @@ class ElementwiseAddOpCost(CompOpCost):
     OP_TYPE = "elementwise_add"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseAddOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseAddOpCost, self).__init__(op=op,
+                                                   op_desc=op_desc,
+                                                   cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -146,8 +153,9 @@ class ElementwiseAddGradOpCost(CompOpCost):
     OP_TYPE = "elementwise_add_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseAddGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseAddGradOpCost, self).__init__(op=op,
+                                                       op_desc=op_desc,
+                                                       cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -164,8 +172,9 @@ class ElementwiseDivOpCost(CompOpCost):
     OP_TYPE = "elementwise_div"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseDivOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseDivOpCost, self).__init__(op=op,
+                                                   op_desc=op_desc,
+                                                   cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -182,8 +191,9 @@ class ElementwiseDivGradOpCost(CompOpCost):
     OP_TYPE = "elementwise_div_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseDivGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseDivGradOpCost, self).__init__(op=op,
+                                                       op_desc=op_desc,
+                                                       cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -200,8 +210,9 @@ class ElementwiseMulOpCost(CompOpCost):
     OP_TYPE = "elementwise_mul"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseMulOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseMulOpCost, self).__init__(op=op,
+                                                   op_desc=op_desc,
+                                                   cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -218,8 +229,9 @@ class ElementwiseMulGradOpCost(CompOpCost):
     OP_TYPE = "elementwise_mul_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseMulGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseMulGradOpCost, self).__init__(op=op,
+                                                       op_desc=op_desc,
+                                                       cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -236,8 +248,9 @@ class ElementwiseSubOpCost(CompOpCost):
     OP_TYPE = "elementwise_sub"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseSubOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseSubOpCost, self).__init__(op=op,
+                                                   op_desc=op_desc,
+                                                   cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -254,8 +267,9 @@ class ElementwiseSubGradOpCost(CompOpCost):
     OP_TYPE = "elementwise_sub_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ElementwiseSubGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ElementwiseSubGradOpCost, self).__init__(op=op,
+                                                       op_desc=op_desc,
+                                                       cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -272,8 +286,9 @@ class EmbeddingOpCost(CompOpCost):
     OP_TYPE = "c_embedding"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(EmbeddingOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(EmbeddingOpCost, self).__init__(op=op,
+                                              op_desc=op_desc,
+                                              cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -290,8 +305,9 @@ class EmbeddingGradOpCost(CompOpCost):
     OP_TYPE = "c_embedding_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(EmbeddingGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(EmbeddingGradOpCost, self).__init__(op=op,
+                                                  op_desc=op_desc,
+                                                  cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -308,8 +324,9 @@ class FillConstantOpCost(CompOpCost):
     OP_TYPE = "fill_constant"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(FillConstantOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(FillConstantOpCost, self).__init__(op=op,
+                                                 op_desc=op_desc,
+                                                 cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -326,8 +343,9 @@ class FillConstantBatchSizeLikeOpCost(CompOpCost):
     OP_TYPE = "fill_constant_batch_size_like"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(FillConstantBatchSizeLikeOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(FillConstantBatchSizeLikeOpCost, self).__init__(op=op,
+                                                              op_desc=op_desc,
+                                                              cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -344,8 +362,8 @@ class FillConstantBatchSizeLikeGradOpCost(CompOpCost):
     OP_TYPE = "fill_constant_batch_size_like_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(FillConstantBatchSizeLikeGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(FillConstantBatchSizeLikeGradOpCost,
+              self).__init__(op=op, op_desc=op_desc, cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -362,8 +380,9 @@ class GatherOpCost(CompOpCost):
     OP_TYPE = "gather"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(GatherOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(GatherOpCost, self).__init__(op=op,
+                                           op_desc=op_desc,
+                                           cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -380,8 +399,9 @@ class GeluOpCost(CompOpCost):
     OP_TYPE = "gelu"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(GeluOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(GeluOpCost, self).__init__(op=op,
+                                         op_desc=op_desc,
+                                         cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -398,8 +418,9 @@ class GeluGradOpCost(CompOpCost):
     OP_TYPE = "gelu_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(GeluGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(GeluGradOpCost, self).__init__(op=op,
+                                             op_desc=op_desc,
+                                             cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -416,8 +437,9 @@ class GreaterEqualOpCost(CompOpCost):
     OP_TYPE = "greater_equal"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(GreaterEqualOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(GreaterEqualOpCost, self).__init__(op=op,
+                                                 op_desc=op_desc,
+                                                 cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -434,8 +456,9 @@ class IncrementOpCost(CompOpCost):
     OP_TYPE = "increment"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(IncrementOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(IncrementOpCost, self).__init__(op=op,
+                                              op_desc=op_desc,
+                                              cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -448,8 +471,9 @@ class IsEmptyOpCost(CompOpCost):
     OP_TYPE = "is_empty"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(IsEmptyOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(IsEmptyOpCost, self).__init__(op=op,
+                                            op_desc=op_desc,
+                                            cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -462,8 +486,9 @@ class LayerNormOpCost(CompOpCost):
     OP_TYPE = "layer_norm"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LayerNormOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LayerNormOpCost, self).__init__(op=op,
+                                              op_desc=op_desc,
+                                              cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -480,8 +505,9 @@ class LayerNormGradOpCost(CompOpCost):
     OP_TYPE = "layer_norm_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LayerNormGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LayerNormGradOpCost, self).__init__(op=op,
+                                                  op_desc=op_desc,
+                                                  cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -498,8 +524,9 @@ class LessThanOpCost(CompOpCost):
     OP_TYPE = "less_than"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LessThanOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LessThanOpCost, self).__init__(op=op,
+                                             op_desc=op_desc,
+                                             cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -516,8 +543,9 @@ class LogicalNotOpCost(CompOpCost):
     OP_TYPE = "logical_not"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LogicalNotOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LogicalNotOpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -534,8 +562,9 @@ class LogicalAndOpCost(CompOpCost):
     OP_TYPE = "logical_and"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LogicalAndOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LogicalAndOpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -552,8 +581,9 @@ class LodResetOpCost(CompOpCost):
     OP_TYPE = "lod_reset"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LodResetOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LodResetOpCost, self).__init__(op=op,
+                                             op_desc=op_desc,
+                                             cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -587,8 +617,9 @@ class LookupTableV2OpCost(CompOpCost):
     OP_TYPE = "lookup_table_v2"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LookupTableV2OpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LookupTableV2OpCost, self).__init__(op=op,
+                                                  op_desc=op_desc,
+                                                  cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -605,8 +636,9 @@ class LookupTableV2GradOpCost(CompOpCost):
     OP_TYPE = "lookup_table_v2_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(LookupTableV2GradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(LookupTableV2GradOpCost, self).__init__(op=op,
+                                                      op_desc=op_desc,
+                                                      cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -623,8 +655,9 @@ class MatmulOpCost(CompOpCost):
     OP_TYPE = "matmul"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(MatmulOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(MatmulOpCost, self).__init__(op=op,
+                                           op_desc=op_desc,
+                                           cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -641,8 +674,9 @@ class MatmulGradOpCost(CompOpCost):
     OP_TYPE = "matmul_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(MatmulGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(MatmulGradOpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -659,8 +693,9 @@ class MatmulV2OpCost(CompOpCost):
     OP_TYPE = "matmul_v2"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(MatmulV2OpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(MatmulV2OpCost, self).__init__(op=op,
+                                             op_desc=op_desc,
+                                             cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -677,8 +712,9 @@ class MatmulV2GradOpCost(CompOpCost):
     OP_TYPE = "matmul_v2_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(MatmulV2GradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(MatmulV2GradOpCost, self).__init__(op=op,
+                                                 op_desc=op_desc,
+                                                 cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -695,8 +731,9 @@ class MemcpyOpCost(CompOpCost):
     OP_TYPE = "memcpy"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(MemcpyOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(MemcpyOpCost, self).__init__(op=op,
+                                           op_desc=op_desc,
+                                           cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -730,8 +767,9 @@ class MulGradOpCost(CompOpCost):
     OP_TYPE = "mul_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(MulGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(MulGradOpCost, self).__init__(op=op,
+                                            op_desc=op_desc,
+                                            cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -748,8 +786,9 @@ class OneHotOpCost(CompOpCost):
     OP_TYPE = "one_hot"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(OneHotOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(OneHotOpCost, self).__init__(op=op,
+                                           op_desc=op_desc,
+                                           cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -766,8 +805,9 @@ class ReadFromArrayOpCost(CompOpCost):
     OP_TYPE = "read_from_array"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ReadFromArrayOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ReadFromArrayOpCost, self).__init__(op=op,
+                                                  op_desc=op_desc,
+                                                  cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -784,8 +824,9 @@ class ReduceSumOpCost(CompOpCost):
     OP_TYPE = "reduce_sum"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ReduceSumOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ReduceSumOpCost, self).__init__(op=op,
+                                              op_desc=op_desc,
+                                              cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -802,8 +843,9 @@ class ReduceSumGradOpCost(CompOpCost):
     OP_TYPE = "reduce_sum_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ReduceSumGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ReduceSumGradOpCost, self).__init__(op=op,
+                                                  op_desc=op_desc,
+                                                  cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -820,8 +862,9 @@ class Reshape2OpCost(CompOpCost):
     OP_TYPE = "reshape2"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(Reshape2OpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(Reshape2OpCost, self).__init__(op=op,
+                                             op_desc=op_desc,
+                                             cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -838,8 +881,9 @@ class Reshape2GradOpCost(CompOpCost):
     OP_TYPE = "reshape2_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(Reshape2GradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(Reshape2GradOpCost, self).__init__(op=op,
+                                                 op_desc=op_desc,
+                                                 cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -856,8 +900,9 @@ class ReduceMeanOpCost(CompOpCost):
     OP_TYPE = "reduce_mean"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ReduceMeanOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ReduceMeanOpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -874,8 +919,9 @@ class ReduceMeanGradOpCost(CompOpCost):
     OP_TYPE = "reduce_mean_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ReduceMeanGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ReduceMeanGradOpCost, self).__init__(op=op,
+                                                   op_desc=op_desc,
+                                                   cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -892,8 +938,9 @@ class SamplingIdOpCost(CompOpCost):
     OP_TYPE = "sampling_id"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SamplingIdOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SamplingIdOpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -910,8 +957,9 @@ class ScaleOpCost(CompOpCost):
     OP_TYPE = "scale"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(ScaleOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(ScaleOpCost, self).__init__(op=op,
+                                          op_desc=op_desc,
+                                          cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -928,8 +976,9 @@ class SliceOpCost(CompOpCost):
     OP_TYPE = "slice"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SliceOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SliceOpCost, self).__init__(op=op,
+                                          op_desc=op_desc,
+                                          cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -946,8 +995,9 @@ class SoftmaxOpCost(CompOpCost):
     OP_TYPE = "softmax"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SoftmaxOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SoftmaxOpCost, self).__init__(op=op,
+                                            op_desc=op_desc,
+                                            cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -964,8 +1014,9 @@ class SoftmaxGradOpCost(CompOpCost):
     OP_TYPE = "softmax_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SoftmaxGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SoftmaxGradOpCost, self).__init__(op=op,
+                                                op_desc=op_desc,
+                                                cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -982,8 +1033,9 @@ class SoftmaxWithCrossEntropyOpCost(CompOpCost):
     OP_TYPE = "softmax_with_cross_entropy"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SoftmaxWithCrossEntropyOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SoftmaxWithCrossEntropyOpCost, self).__init__(op=op,
+                                                            op_desc=op_desc,
+                                                            cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1000,8 +1052,9 @@ class SoftmaxWithCrossEntropyGradOpCost(CompOpCost):
     OP_TYPE = "softmax_with_cross_entropy_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SoftmaxWithCrossEntropyGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SoftmaxWithCrossEntropyGradOpCost, self).__init__(op=op,
+                                                                op_desc=op_desc,
+                                                                cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1018,8 +1071,9 @@ class SplitOpCost(CompOpCost):
     OP_TYPE = "split"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SplitOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SplitOpCost, self).__init__(op=op,
+                                          op_desc=op_desc,
+                                          cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1036,8 +1090,9 @@ class Squeeze2OpCost(CompOpCost):
     OP_TYPE = "squeeze2"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(Squeeze2OpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(Squeeze2OpCost, self).__init__(op=op,
+                                             op_desc=op_desc,
+                                             cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1054,8 +1109,9 @@ class SquareOpCost(CompOpCost):
     OP_TYPE = "square"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SquareOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SquareOpCost, self).__init__(op=op,
+                                           op_desc=op_desc,
+                                           cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1072,8 +1128,9 @@ class SquareGradOpCost(CompOpCost):
     OP_TYPE = "square_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(SquareGradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(SquareGradOpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1107,8 +1164,9 @@ class TopKOpCost(CompOpCost):
     OP_TYPE = "top_k"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(TopKOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(TopKOpCost, self).__init__(op=op,
+                                         op_desc=op_desc,
+                                         cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1125,8 +1183,9 @@ class Transpose2OpCost(CompOpCost):
     OP_TYPE = "transpose2"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(Transpose2OpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(Transpose2OpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1143,8 +1202,9 @@ class Transpose2GradOpCost(CompOpCost):
     OP_TYPE = "transpose2_grad"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(Transpose2GradOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(Transpose2GradOpCost, self).__init__(op=op,
+                                                   op_desc=op_desc,
+                                                   cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1161,8 +1221,9 @@ class Unsqueeze2OpCost(CompOpCost):
     OP_TYPE = "unsqueeze2"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(Unsqueeze2OpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(Unsqueeze2OpCost, self).__init__(op=op,
+                                               op_desc=op_desc,
+                                               cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
@@ -1179,8 +1240,9 @@ class WriteToArrayOpCost(CompOpCost):
     OP_TYPE = "write_to_array"
 
     def __init__(self, op=None, op_desc=None, cluster=None):
-        super(WriteToArrayOpCost, self).__init__(
-            op=op, op_desc=op_desc, cluster=cluster)
+        super(WriteToArrayOpCost, self).__init__(op=op,
+                                                 op_desc=op_desc,
+                                                 cluster=cluster)
 
     # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided
     def calc_flops(self):
diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py
index 7bd535af8be97..5a1aeec2d9ffe 100644
--- a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py
+++ b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py
@@ -14,6 +14,7 @@
 
 
 class CostEstimator:
+
     def __init__(self,
                  program,
                  cluster=None,
diff --git a/python/paddle/distributed/auto_parallel/cost/tensor_cost.py b/python/paddle/distributed/auto_parallel/cost/tensor_cost.py
index 2db1c06d5960b..9741020da6512 100644
--- a/python/paddle/distributed/auto_parallel/cost/tensor_cost.py
+++ b/python/paddle/distributed/auto_parallel/cost/tensor_cost.py
@@ -22,6 +22,7 @@
 
 
 class TensorCost:
+
     def __init__(self, tensor=None, dist_tensor=None, shape=None, dtype=None):
         self._check_args(tensor, dist_tensor, shape, dtype)
         self._tensor = tensor
@@ -59,20 +60,20 @@ def _check_args(self, tensor, dist_tensor, shape, dtype):
             assert (tensor is None and shape is None)
             if not isinstance(dist_tensor, DistributedTensor):
                 raise TypeError(
-                    "Please check dist_tensor type is DistributedTensor, but got {}".
-                    format(type(dist_tensor)))
+                    "Please check dist_tensor type is DistributedTensor, but got {}"
+                    .format(type(dist_tensor)))
 
         elif shape is not None:
-            assert (tensor is None and dist_tensor is None and
-                    dtype is not None)
+            assert (tensor is None and dist_tensor is None
+                    and dtype is not None)
             if not isinstance(shape, (list, set)):
                 raise TypeError(
                     "Please check shape type is list or set, but got {}".format(
                         type(shape)))
 
         elif dtype is not None:
-            assert (tensor is None and dist_tensor is None and
-                    shape is not None)
+            assert (tensor is None and dist_tensor is None
+                    and shape is not None)
 
     @property
     def cost(self):
diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/cost_model.py
index b72c044428f6c..e35fae57caec6 100644
--- a/python/paddle/distributed/auto_parallel/cost_model.py
+++ b/python/paddle/distributed/auto_parallel/cost_model.py
@@ -37,6 +37,7 @@ class CostNodeType(Enum):
 
 
 class Cost(object):
+
     def __init__(self):
         self.runtime = None
         self.static_mem = None
@@ -51,6 +52,7 @@ class CostModelMode(Enum):
 
 
 class CostNode(object):
+
     def __init__(self, node, node_type, id=None):
         self.id = id
         self.node = node
@@ -71,6 +73,7 @@ def cost(self, cost):
 
 
 class MergedOpsCostNode(CostNode):
+
     def __init__(self, node_type, id=None, base_node_list=None, is_bwd=False):
         super(MergedOpsCostNode, self).__init__(None, node_type, id)
         self.node_list = base_node_list
@@ -78,6 +81,7 @@ def __init__(self, node_type, id=None, base_node_list=None, is_bwd=False):
 
 
 class CommOpCostNode(CostNode):
+
     def __init__(self,
                  node,
                  node_type,
@@ -118,6 +122,7 @@ def init_comm_cost(self, cluster=None):
 
 
 class TensorCostNode(CostNode):
+
     def __init__(self,
                  node,
                  node_type,
@@ -159,6 +164,7 @@ def get_size(self):
 
 
 class CompOpCostNode(CostNode):
+
     def __init__(self, node, node_type, id=None, is_bwd=False, is_optim=False):
         super(CompOpCostNode, self).__init__(node, node_type, id)
         self.is_bwd = is_bwd
@@ -174,6 +180,7 @@ def init_comp_cost(self, cost_data):
 
 
 class PipeEvent(object):
+
     def __init__(self, stage_id, event_name, duration, start_time=-1):
         self.stage_id = stage_id
         self.name = event_name
@@ -183,6 +190,7 @@ def __init__(self, stage_id, event_name, duration, start_time=-1):
 
 
 class CostModel(object):
+
     def __init__(self,
                  mode=CostModelMode.BENCHMARKING,
                  cluster=None,
@@ -261,8 +269,8 @@ def _parse_sub_program(self, program, nodes, graph, cost_data, sub_idx):
                 op_node = CommOpCostNode(op, CostNodeType.COMMUNICATION, op_id,
                                          is_bwd)
             else:
-                is_bwd = (int(op.attr('op_role')) == int(OpRole.Backward)
-                          ) or "@GRAD" in op.input_arg_names
+                is_bwd = (int(op.attr('op_role')) == int(
+                    OpRole.Backward)) or "@GRAD" in op.input_arg_names
                 is_optim = 'LearningRate' in op.input_names
                 op_node = CompOpCostNode(op, CostNodeType.COMPUTATION, op_id,
                                          is_bwd, is_optim)
@@ -310,11 +318,10 @@ def _parse_sub_program(self, program, nodes, graph, cost_data, sub_idx):
 
                         write_op_cnt += 1
                         new_var_id = node_id + '_write_{}'.format(write_op_cnt)
-                        new_var = TensorCostNode(
-                            node.node,
-                            CostNodeType.VARIABLE,
-                            new_var_id,
-                            shared_node_id=node_id)
+                        new_var = TensorCostNode(node.node,
+                                                 CostNodeType.VARIABLE,
+                                                 new_var_id,
+                                                 shared_node_id=node_id)
 
                         graph[new_var_id] = [[], []]
                         graph[pred_id][SUCC].append(new_var_id)
@@ -341,8 +348,8 @@ def parse_program(self, distributed_program):
             self.runtime_graph.append({})
             self._parse_sub_program(
                 sub_prog, self.nodes[sub_idx], self.origin_graph[sub_idx],
-                self.cost_data[0 if self.rank2pp is None else self.rank2pp[
-                    sub_idx]], sub_idx)
+                self.cost_data[0 if self.rank2pp is None else self.
+                               rank2pp[sub_idx]], sub_idx)
         return self.nodes
 
     def _find_succ_op(self, node_id, sub_idx=0):
@@ -417,11 +424,10 @@ def _merge_node(self, to_merge_node_list, merge_type='linear', nodes=None):
                         merge_type))
         merged_node_id = 'merged_' + str(len(nodes))
         is_bwd = to_merge_node_list[0].is_bwd
-        merged_node = MergedOpsCostNode(
-            CostNodeType.MERGED,
-            id=merged_node_id,
-            base_node_list=nodes_list,
-            is_bwd=is_bwd)
+        merged_node = MergedOpsCostNode(CostNodeType.MERGED,
+                                        id=merged_node_id,
+                                        base_node_list=nodes_list,
+                                        is_bwd=is_bwd)
         merged_node.cost = node_cost
         return merged_node_id, merged_node
 
@@ -435,10 +441,12 @@ def merge_linear(self):
         '''
         cnt = 0
         for sub_idx in range(self.total_rank):
-            cnt += self._merge_linear(
-                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False)
-            cnt += self._merge_linear(
-                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True)
+            cnt += self._merge_linear(self.nodes[sub_idx],
+                                      self.runtime_graph[sub_idx],
+                                      is_bwd=False)
+            cnt += self._merge_linear(self.nodes[sub_idx],
+                                      self.runtime_graph[sub_idx],
+                                      is_bwd=True)
         return cnt
 
     def merge_branch(self):
@@ -454,10 +462,12 @@ def merge_branch(self):
         '''
         cnt = 0
         for sub_idx in range(self.total_rank):
-            cnt += self._merge_branch(
-                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False)
-            cnt += self._merge_branch(
-                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True)
+            cnt += self._merge_branch(self.nodes[sub_idx],
+                                      self.runtime_graph[sub_idx],
+                                      is_bwd=False)
+            cnt += self._merge_branch(self.nodes[sub_idx],
+                                      self.runtime_graph[sub_idx],
+                                      is_bwd=True)
         return cnt
 
     def _merge_linear(self, nodes, runtime_graph, is_bwd=False):
@@ -482,8 +492,8 @@ def _merge_linear(self, nodes, runtime_graph, is_bwd=False):
                 # delete edges and add new edges
                 succ = None
                 try:
-                    runtime_graph[merged_node_id][SUCC] = copy.deepcopy(edges[
-                        SUCC])
+                    runtime_graph[merged_node_id][SUCC] = copy.deepcopy(
+                        edges[SUCC])
 
                     if len(runtime_graph[pred_id][SUCC]) > 1:
                         # predecessor has more than 1 successor
@@ -558,8 +568,8 @@ def _merge_branch(self, nodes, runtime_graph, is_bwd=False):
 
                 to_merge = True
                 try:
-                    if len(edges[SUCC]) < 1 or len(runtime_graph[edges[SUCC][0]]
-                                                   [SUCC]) < 1:
+                    if len(edges[SUCC]) < 1 or len(
+                            runtime_graph[edges[SUCC][0]][SUCC]) < 1:
                         continue
                 except:
                     continue
@@ -596,6 +606,7 @@ def _merge_branch(self, nodes, runtime_graph, is_bwd=False):
         return reduct_cnt
 
     def get_runtime_cost(self):
+
         def get_node_cost(node):
             node_cost = node.cost + self.opcall_overhead
             if isinstance(node, MergedOpsCostNode):
@@ -660,8 +671,8 @@ def _simulate_mem(self, nodes, origin_graph):
                     static_mem += size
                 cur_mem += size
             edges = sim_graph[node_id]
-            if not (node.type == CostNodeType.VARIABLE and
-                    node.node.persistable):
+            if not (node.type == CostNodeType.VARIABLE
+                    and node.node.persistable):
                 for succ_id in edges[SUCC]:
                     sim_graph[succ_id][PRED].remove(node_id)
                     if len(sim_graph[succ_id][PRED]) == 0:
@@ -670,8 +681,8 @@ def _simulate_mem(self, nodes, origin_graph):
                 pred = nodes
                 if pred.type == CostNodeType.VARIABLE:
                     sim_graph[pred_id][SUCC].remove(node_id)
-                    if len(sim_graph[pred_id][
-                            SUCC]) == 0 and not pred.node.persistable:
+                    if len(sim_graph[pred_id]
+                           [SUCC]) == 0 and not pred.node.persistable:
                         cur_mem -= pred.get_size()
         return static_mem, cur_mem, top_mem
 
@@ -703,18 +714,16 @@ def _simulate_pipeline(self):
                     event_list.append(e)
                     if stid != stage_num - 1:
                         q.put(
-                            PipeEvent(
-                                stid + 1,
-                                'fwd',
-                                self.fwd_time[stid + 1],
-                                start_time=e.e_time))
+                            PipeEvent(stid + 1,
+                                      'fwd',
+                                      self.fwd_time[stid + 1],
+                                      start_time=e.e_time))
                     else:
                         q.put(
-                            PipeEvent(
-                                stid,
-                                'bwd',
-                                self.bwd_time[stid],
-                                start_time=e.e_time))
+                            PipeEvent(stid,
+                                      'bwd',
+                                      self.bwd_time[stid],
+                                      start_time=e.e_time))
                     fwd_cnt[stid] -= 1
                     global_time[stid] = e.e_time
                 else:
@@ -725,20 +734,18 @@ def _simulate_pipeline(self):
                 event_list.append(e)
                 if stid != 0:
                     q.put(
-                        PipeEvent(
-                            stid - 1,
-                            'bwd',
-                            self.bwd_time[stid - 1],
-                            start_time=e.e_time))
+                        PipeEvent(stid - 1,
+                                  'bwd',
+                                  self.bwd_time[stid - 1],
+                                  start_time=e.e_time))
                 fwd_cnt[stid] += 1
                 bwd_cnt[stid] -= 1
                 if bwd_cnt[stid] == 0:
                     q.put(
-                        PipeEvent(
-                            stid,
-                            'optim',
-                            self.optim_time[stid],
-                            start_time=e.e_time))
+                        PipeEvent(stid,
+                                  'optim',
+                                  self.optim_time[stid],
+                                  start_time=e.e_time))
                 global_time[stid] = e.e_time
             elif e.name == 'optim':
                 e.s_time = max(global_time[stid], e.s_time)
@@ -792,11 +799,10 @@ def estimate_cost(distributed_program, cluster, pipeline_config,
     """
     # the following line is left for now, cluster model will be involved in the future
     assert cluster is None, "For now, cluster remains None"
-    cm_ctx = CostModel(
-        cluster=cluster,
-        batch_size=batch_size,
-        standalone_cost_data=standalone_cost_data,
-        pipeline_config=pipeline_config)
+    cm_ctx = CostModel(cluster=cluster,
+                       batch_size=batch_size,
+                       standalone_cost_data=standalone_cost_data,
+                       pipeline_config=pipeline_config)
     cm_ctx.init(distributed_program)
     cost = cm_ctx.get_cost()
     return cost
diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py
index 3dbdb79f48541..9bbc4de6bddf0 100644
--- a/python/paddle/distributed/auto_parallel/dist_attribute.py
+++ b/python/paddle/distributed/auto_parallel/dist_attribute.py
@@ -51,6 +51,7 @@ def append_op_output_suffix(name):
 
 
 class TensorDistributedAttribute:
+
     def __init__(self):
         # The process mesh of distributed operator attribute must is the same as
         # the process meshes of all input and output distributed attributed
@@ -123,8 +124,8 @@ def init(self, dist_attr):
                             key, dist_attr)
         elif isinstance(dist_attr, TensorDistributedAttribute):
             for key in get_tensor_dist_attr_field_keys():
-                field_property = TensorDistributedAttribute.__dict__.get(key,
-                                                                         None)
+                field_property = TensorDistributedAttribute.__dict__.get(
+                    key, None)
                 if field_property:
                     field_property.fset(self, field_property.fget(dist_attr))
                 else:
@@ -192,6 +193,7 @@ def __str__(self):
 
 
 class OperatorDistributedAttribute:
+
     def __init__(self):
         self._process_mesh = None
         self._op_type = None
@@ -356,8 +358,8 @@ def init(self, dist_attr):
                     tensor_name, dist_attr.get_output_dist_attr(tensor_name))
             self._is_annotated = copy.deepcopy(dist_attr._is_annotated)
             for key in get_op_dist_attr_field_keys():
-                field_property = OperatorDistributedAttribute.__dict__.get(key,
-                                                                           None)
+                field_property = OperatorDistributedAttribute.__dict__.get(
+                    key, None)
                 if field_property:
                     field_property.fset(self, field_property.fget(dist_attr))
                 else:
diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py
index 6a38b53cf2c10..bf4f66e7c1b6b 100644
--- a/python/paddle/distributed/auto_parallel/dist_context.py
+++ b/python/paddle/distributed/auto_parallel/dist_context.py
@@ -203,8 +203,8 @@ def _backup_serial_info(self, mode):
             self._serial_main_program.clone())
         self._backup_serial_startup_program_stack.append(
             self._serial_startup_program.clone())
-        self._backup_pass_context_stack.append(
-            copy.deepcopy(self._pass_context))
+        self._backup_pass_context_stack.append(copy.deepcopy(
+            self._pass_context))
         self._backup_block_state_stack.append(copy.deepcopy(self._block_state))
 
     def _backup_dist_info(self, mode):
@@ -343,8 +343,12 @@ def initialize(self):
                 self._serial_startup_program = self._original_serial_startup_program
             if not self._serial_loss:
                 if isinstance(self._original_serial_loss, list):
-                    assert len(self._original_serial_loss) == 1
-                    self._serial_loss = self._original_serial_loss[0]
+                    if len(self._original_serial_loss) == 1:
+                        self._serial_loss = self._original_serial_loss[0]
+                    elif len(self._original_serial_loss) == 0:
+                        self._serial_loss = self._original_serial_loss
+                    else:
+                        raise ValueError("multi loss vars are not supported.")
                 else:
                     self._serial_loss = self._original_serial_loss
             if not self._serial_optimizer:
@@ -394,8 +398,8 @@ def get_dist_tensor_for_program(self, serial_tensor):
             return dist_tensor
         else:
             serial_tensor_id = serial_tensor.desc.original_id()
-            dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id,
-                                                             None)
+            dist_tensor = self._dist_tensors_for_program.get(
+                serial_tensor_id, None)
             if dist_tensor:
                 return dist_tensor
             else:
@@ -434,8 +438,8 @@ def get_tensor_dist_attr_for_program(self, serial_tensor):
             return dist_tensor.dist_attr
         else:
             serial_tensor_id = serial_tensor.desc.original_id()
-            dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id,
-                                                             None)
+            dist_tensor = self._dist_tensors_for_program.get(
+                serial_tensor_id, None)
             if dist_tensor:
                 return dist_tensor.dist_attr
             else:
@@ -544,6 +548,7 @@ def _init_dist_attr_for_program(self, no_default=False):
             self._dist_ops_for_program)
 
     def _order_nodes_by_program_order(self):
+
         def _contains(nodes, target_node):
             for node in nodes:
                 if _node_id(node) == _node_id(target_node):
@@ -715,8 +720,8 @@ def copy_dist_attr_from_graph_to_program(self):
         # here we just set there process_mesh to the first one.
         for orphan_node in self._serial_orphan_tensor_nodes:
             serial_tensor_id = orphan_node.var().id()
-            dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id,
-                                                             None)
+            dist_tensor = self._dist_tensors_for_program.get(
+                serial_tensor_id, None)
             if dist_tensor:
                 dist_tensor.dist_attr.process_mesh = self._process_meshes[0]
             else:
@@ -803,11 +808,10 @@ def validate_dist_attr_for_program(self):
                 assert dist_tensor is not None, \
                     "Tensor {} does not have a distributed attribute.".format(
                         dist_tensor.serial_tensor.name)
-                if (dist_tensor is not None) and (
-                        not dist_tensor.validate_dist_attr()):
+                if (dist_tensor
+                        is not None) and (not dist_tensor.validate_dist_attr()):
                     assert False, "Tensor {} (id: {}, original_id: {}) has a wrong distributed attributes {}.".format(
-                        dist_tensor.serial_tensor.name,
-                        dist_tensor.desc.id(),
+                        dist_tensor.serial_tensor.name, dist_tensor.desc.id(),
                         dist_tensor.desc.original_id(), dist_tensor.dist_attr)
             for op in block.ops:
                 dist_op = self.get_dist_op_for_program(op)
@@ -816,8 +820,7 @@ def validate_dist_attr_for_program(self):
                         dist_op.serial_op.type)
                 if (dist_op is not None) and (not dist_op.validate_dist_attr()):
                     assert False, "Operator {} (id: {}, original_id: {}) has a wrong distributed attributes {} .".format(
-                        dist_op.serial_op.type,
-                        dist_op.serial_op.desc.id(),
+                        dist_op.serial_op.type, dist_op.serial_op.desc.id(),
                         dist_op.serial_op.desc.original_id(), dist_op.dist_attr)
         return True
 
@@ -943,6 +946,7 @@ def prepare_context(self, src_op):
 
 
 class BlockState(object):
+
     def __init__(self):
         self.nblock = 0
         self.forward_indices = []
diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py
index aa315db5292de..03cc340fecd56 100644
--- a/python/paddle/distributed/auto_parallel/dist_loader.py
+++ b/python/paddle/distributed/auto_parallel/dist_loader.py
@@ -21,6 +21,7 @@
 
 
 class DistributedDataLoader(metaclass=abc.ABCMeta):
+
     def __init__(self,
                  dataset,
                  batch_size=1,
@@ -47,6 +48,7 @@ def __next__(self):
 
 
 class NonIterableGeneratorLoader(DistributedDataLoader):
+
     def __init__(self,
                  dataset,
                  feed_list,
@@ -63,9 +65,10 @@ def __init__(self,
         self.dp_world_size = 1 if data_parallel_world_size is None else data_parallel_world_size
         self.dp_rank = 0 if data_parallel_rank is None else data_parallel_rank
 
-        super(NonIterableGeneratorLoader, self).__init__(
-            dataset, batch_size, epochs, data_parallel_world_size,
-            data_parallel_rank, drop_last)
+        super(NonIterableGeneratorLoader,
+              self).__init__(dataset, batch_size, epochs,
+                             data_parallel_world_size, data_parallel_rank,
+                             drop_last)
         self._inner_dataloader = self._create_inner_dataloader()
         self._steps = self._infer_steps()
 
@@ -96,6 +99,7 @@ def _infer_steps(self):
         return steps_per_epoch
 
     def _create_inner_dataloader(self):
+
         def sample_data_generator():
             batch_data = None
             for step, data in enumerate(self.dataset):
diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py
index a2c2748a8cea3..d48804b71fc3e 100644
--- a/python/paddle/distributed/auto_parallel/dist_op.py
+++ b/python/paddle/distributed/auto_parallel/dist_op.py
@@ -26,6 +26,7 @@
 
 
 class DistributedOperator:
+
     def __init__(self, serial_op, dist_attr=None):
         self._serial_op = serial_op
         self._serial_inputs = {}
@@ -248,6 +249,7 @@ def __deepcopy__(self, memo):
 
 
 class DistributedModule:
+
     def __init__(self, serial_module, dist_attr=None):
         self._serial_module = serial_module
         self._dist_attr = dist_attr
diff --git a/python/paddle/distributed/auto_parallel/dist_saver.py b/python/paddle/distributed/auto_parallel/dist_saver.py
index 261b18a56ec63..c3dad9e287386 100644
--- a/python/paddle/distributed/auto_parallel/dist_saver.py
+++ b/python/paddle/distributed/auto_parallel/dist_saver.py
@@ -53,6 +53,7 @@ def _process_path(path):
 
 
 class DistributedSaver:
+
     def __init__(self):
         self._logger = get_logger(logging.INFO)
 
@@ -114,8 +115,8 @@ def load(self,
                               param_file):
                 param_file_list.append(os.path.join(dirname, param_file))
         param_file_list.sort()
-        self._logger.info("Load distributed attribute file: {}".format(
-            param_file_list))
+        self._logger.info(
+            "Load distributed attribute file: {}".format(param_file_list))
         param_dict = {}
         for param_file in param_file_list:
             with open(param_file, 'rb') as f:
@@ -131,11 +132,11 @@ def load(self,
         for dist_attr_file in os.listdir(dirname):
             if check_filename('{}(.*)_dist(.*).pdattr'.format(filename),
                               dist_attr_file):
-                dist_attr_file_list.append(
-                    os.path.join(dirname, dist_attr_file))
+                dist_attr_file_list.append(os.path.join(dirname,
+                                                        dist_attr_file))
         dist_attr_file_list.sort()
-        self._logger.info("Load distributed attribute file: {}".format(
-            dist_attr_file_list))
+        self._logger.info(
+            "Load distributed attribute file: {}".format(dist_attr_file_list))
         pre_dist_attr = {}
         for dist_attr_file in dist_attr_file_list:
             with open(dist_attr_file, 'rb') as f:
@@ -206,12 +207,11 @@ def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs):
         # NOTE: `paddle.static.save_inference_model` does not support subblock.
         dist_filename = filename + "_dist" + str(rank_id)
         dist_path = os.path.join(dirname, dist_filename)
-        paddle.static.save_inference_model(
-            dist_path,
-            dist_feed_vars,
-            dist_fetch_vars,
-            exe,
-            program=dist_main_prog)
+        paddle.static.save_inference_model(dist_path,
+                                           dist_feed_vars,
+                                           dist_fetch_vars,
+                                           exe,
+                                           program=dist_main_prog)
 
     def _save_rank_mapping(self, dirname):
         path = os.path.join(dirname, 'rank_mapping.csv')
diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py
index e3f06da275182..b6228f5ad0e38 100644
--- a/python/paddle/distributed/auto_parallel/dist_tensor.py
+++ b/python/paddle/distributed/auto_parallel/dist_tensor.py
@@ -40,26 +40,26 @@ def _validate_sizes_and_dist_attr(sizes,
                                       processes,
                                       rank=None,
                                       shard_sizes=None):
-        if not (isinstance(sizes, (list, tuple)) and
-                all(map(lambda x: isinstance(x, int) and x >= 0, sizes))):
+        if not (isinstance(sizes, (list, tuple))
+                and all(map(lambda x: isinstance(x, int) and x >= 0, sizes))):
             raise ValueError(
-                "The sizes must be list or tuple and item in sizes must be non-negative integer, but got {}".
-                format(sizes))
+                "The sizes must be list or tuple and item in sizes must be non-negative integer, but got {}"
+                .format(sizes))
         if not (isinstance(dims_mapping, (list, tuple)) and all(
                 map(lambda x: isinstance(x, int) and x >= -1, dims_mapping))):
             raise ValueError(
-                "The dims_mapping must be list or tuple and item in dims_mapping must >= -1, but got {}".
-                format(dims_mapping))
-        if not (isinstance(processes, (list, tuple)) and
-                all(map(lambda x: isinstance(x, int) and x >= 0, processes))):
+                "The dims_mapping must be list or tuple and item in dims_mapping must >= -1, but got {}"
+                .format(dims_mapping))
+        if not (isinstance(processes, (list, tuple)) and all(
+                map(lambda x: isinstance(x, int) and x >= 0, processes))):
             raise ValueError(
-                "The processes must be list or tuple and item in processes must be integer, but got {}".
-                format(processes))
-        if not (isinstance(topology, (list, tuple)) and
-                all(map(lambda x: isinstance(x, int) and x > 0, topology))):
+                "The processes must be list or tuple and item in processes must be integer, but got {}"
+                .format(processes))
+        if not (isinstance(topology, (list, tuple))
+                and all(map(lambda x: isinstance(x, int) and x > 0, topology))):
             raise ValueError(
-                "The topology must be list or tuple and item in topology must be non-negative integer, but got {}".
-                format(topology))
+                "The topology must be list or tuple and item in topology must be non-negative integer, but got {}"
+                .format(topology))
         if rank is not None and not (isinstance(rank, int) and rank >= 0):
             raise ValueError("The rank must >= 0, but got {}".format(rank))
 
@@ -74,8 +74,10 @@ def get_local_sizes(global_sizes,
                         processes,
                         rank=None,
                         shard_sizes=None):
-        DistributedTensor._validate_sizes_and_dist_attr(
-            global_sizes, dims_mapping, topology, processes, rank, shard_sizes)
+        DistributedTensor._validate_sizes_and_dist_attr(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes, rank,
+                                                        shard_sizes)
 
         local_sizes = []
         # for even sharding, the local sizes of every rank are equal
@@ -97,8 +99,10 @@ def get_local_offsets(global_sizes,
                           processes,
                           rank,
                           shard_sizes=None):
-        local_sizes = DistributedTensor.get_local_sizes(
-            global_sizes, dims_mapping, topology, processes, rank, shard_sizes)
+        local_sizes = DistributedTensor.get_local_sizes(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes, rank,
+                                                        shard_sizes)
         local_offsets = []
         rank_relatvie = processes.index(rank)
         coordinate = _linear_idx2coordinate(topology, rank_relatvie)
@@ -118,8 +122,10 @@ def get_global_sizes(local_sizes,
                          processes,
                          rank=None,
                          shard_sizes=None):
-        DistributedTensor._validate_sizes_and_dist_attr(
-            local_sizes, dims_mapping, topology, processes, rank, shard_sizes)
+        DistributedTensor._validate_sizes_and_dist_attr(local_sizes,
+                                                        dims_mapping, topology,
+                                                        processes, rank,
+                                                        shard_sizes)
         global_sizes = []
         for idx, item in enumerate(local_sizes):
             if dims_mapping[idx] == -1:
@@ -137,8 +143,10 @@ def get_local_shard(global_sizes,
                         shard_sizes=None):
         local_offsets = DistributedTensor.get_local_offsets(
             global_sizes, dims_mapping, topology, processes, rank, shard_sizes)
-        local_sizes = DistributedTensor.get_local_sizes(
-            global_sizes, dims_mapping, topology, processes, rank, shard_sizes)
+        local_sizes = DistributedTensor.get_local_sizes(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes, rank,
+                                                        shard_sizes)
         assert len(local_sizes) == len(
             local_offsets
         ), "The length of local_sizes must be equal to local_offsets, but got {} and {}.".format(
diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py
index ab9391cf66fdb..fb12ae4971ae1 100644
--- a/python/paddle/distributed/auto_parallel/engine.py
+++ b/python/paddle/distributed/auto_parallel/engine.py
@@ -19,7 +19,7 @@
 import paddle
 import paddle.distributed.auto_parallel as auto
 
-from paddle import fluid
+from paddle import fluid, static
 from paddle.io import Dataset
 from paddle.metric import Metric
 from paddle.static import InputSpec
@@ -48,6 +48,7 @@
 
 
 class Engine:
+
     def __init__(self,
                  model=None,
                  inputs_spec=None,
@@ -71,8 +72,8 @@ def __init__(self,
         self._logger = get_logger(logging.INFO)
 
         self._default_strategy = None
-        self._orig_main_prog = fluid.default_main_program()
-        self._orig_startup_prog = fluid.default_startup_program()
+        self._orig_main_prog = static.default_main_program()
+        self._orig_startup_prog = static.default_startup_program()
         self._orig_dist_context = get_default_distributed_context()
         self._dist_contexts = {}
         self._serial_main_progs = {}
@@ -87,29 +88,133 @@ def prepare(self,
                 loss=None,
                 gradient_scale=True,
                 metrics=None,
-                mode='train',
                 all_ranks=False):
+        if optimizer and not isinstance(
+                optimizer,
+            (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer)):
+            raise TypeError(
+                    "'optimizer' must be object of class `paddle.optimizer.Optimizer`" \
+                        " or `paddle.fluid.optimizer.Optimizer`."
+                )
         self._optimizer = optimizer
-        # TODO: check loss type
+
+        if loss and not isinstance(loss,
+                                   paddle.nn.Layer) and not callable(loss):
+            raise TypeError(
+                "'loss' must be sub classes of `paddle.nn.Layer` or any callable function."
+            )
         self._loss = loss
+
+        metrics = metrics or []
+        for metric in to_list(metrics):
+            assert isinstance(metric, Metric), \
+                "{} is not sub class of Metric".format(
+                    metric.__class__.__name__)
         self._metrics = to_list(metrics)
-        self._mode = mode
         self._gradient_scale = gradient_scale
+
+        self._planned_mode = None
+        self._modes = ['train', 'eval', 'predict']
         # Build forward program
-        self._build(mode)
-        # Do the planning process
-        planner = Planner(mode, self._dist_contexts[mode])
-        planner.plan()
+        self._build()
+
+        # Do auto parallel process
+        for mode in self._modes:
+            # Do the planning process
+            self._plan(mode)
+            # Do the parallel process
+            self._parallel(mode, all_ranks)
+            # Init comm and startup program
+            self._initialize(mode)
+
+    def _build(self):
+        for mode in self._modes:
+            serial_main_prog = self._serial_main_progs.get(mode, None)
+            if serial_main_prog is not None:
+                return
+
+            losses = []
+            metrics = []
+            serial_main_prog = self._orig_main_prog.clone()
+            serial_startup_prog = self._orig_startup_prog.clone()
+            with static.program_guard(serial_main_prog, serial_startup_prog):
+                inputs_spec = self.inputs_spec
+                labels_spec = self.labels_spec if self.labels_spec else []
+                inputs = [s._create_feed_layer() for s in inputs_spec]
+                labels = [s._create_feed_layer() for s in labels_spec]
+                outputs = to_list(self.model(*inputs))
+                if mode != "predict" and self._loss:
+                    losses = to_list(self._loss(*(outputs + labels)))
+
+                if mode != "predict":
+                    for metric in self._metrics:
+                        metrics.extend(
+                            to_list(metric.compute(*(outputs + labels))))
+
+            default_ctx = get_default_distributed_context()
+            if not default_ctx.has_annotation or self._default_strategy:
+                inputs = [self._set_data_parallel(var) for var in inputs]
+                labels = [self._set_data_parallel(var) for var in labels]
+
+            # self._feed_vars[mode] = {"inputs": inputs, "labels": labels}
+            feed_vars = {"inputs": inputs, "labels": labels}
+
+            # self._fetch_vars[mode] = {
+            #     "outputs": flatten(outputs),
+            #     "loss": losses,
+            #     "metrics": metrics
+            # }
+            fetch_vars = {
+                "outputs": flatten(outputs),
+                "loss": losses,
+                "metrics": metrics
+            }
+
+            self._dist_contexts[mode] = DistributedContext(
+                serial_main_prog, serial_startup_prog, self._optimizer, losses,
+                feed_vars, fetch_vars, self.cluster, self.strategy)
+            self._dist_contexts[mode].gradient_scale = self._gradient_scale
+
+    def _plan(self, mode):
+        if self._planned_mode is None:
+            self._planned_mode = mode
+        else:
+            self._init_dist_context(mode)
+
+        self.planner = Planner(mode, self._dist_contexts[mode])
+        self.planner.plan()
+
+    def _parallel(self, mode, all_ranks):
         # Parallelize program based on the planner's results
         # For now, the completer has to be passed to the planner,
         # because we may use it to complete the annotation of the backwarkward and update.
-        parallelizer = Parallelizer(mode, planner.completer,
+        parallelizer = Parallelizer(mode, self.planner.completer,
                                     self._dist_contexts[mode])
         if not all_ranks:
             parallelizer.parallel(self._cur_rank)
         else:
             parallelizer.parallel_all()
-        # Get the current content from the distributed context 
+
+    def _init_dist_context(self, mode):
+        # Init self._dist_contexts[mode] from the first planned dist_context so
+        # that the train/eval/predict modes share the same parallel strategy.
+        dist_context = self._dist_contexts[mode]
+        origin_main_prog = dist_context._original_serial_main_program
+        ref_mode = self._planned_mode
+        ref_dist_context = self._dist_contexts[ref_mode]
+        ref_origin_main_prog = ref_dist_context._original_serial_main_program
+        ref_blocks = ref_origin_main_prog.blocks
+        for ib, block in enumerate(origin_main_prog.blocks):
+            for iop, op in enumerate(block.ops):
+                ref_op = ref_blocks[ib].ops[iop]
+                assert op.type == ref_op.type, \
+                    "'{}' mode op '{}' is different with '{}' op '{}'. ".format(mode, op.type, ref_mode, ref_op.type)
+                ref_op_dist_attr = ref_dist_context.get_op_dist_attr_for_program(
+                    ref_op)
+                dist_context.set_op_dist_attr_for_program(op, ref_op_dist_attr)
+
+    def _initialize(self, mode):
+        # Get the current content from the distributed context
         self._serial_main_progs[mode] = self._dist_contexts[
             mode].serial_main_program
         self._serial_startup_progs[mode] = self._dist_contexts[
@@ -120,52 +225,7 @@ def prepare(self,
             mode].dist_startup_programs
         self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars
         self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars
-        # Init comm and startup program
-        self._initialize(mode)
-
-    def _build(self, mode):
-        serial_main_prog = self._serial_main_progs.get(mode, None)
-        if serial_main_prog is not None:
-            return
-
-        losses = []
-        metrics = []
-        serial_main_prog = self._orig_main_prog.clone()
-        serial_startup_prog = self._orig_startup_prog.clone()
-        with fluid.program_guard(serial_main_prog, serial_startup_prog):
-            inputs_spec = self.inputs_spec
-            labels_spec = self.labels_spec if self.labels_spec else []
-            inputs = [s._create_feed_layer() for s in inputs_spec]
-            labels = [s._create_feed_layer() for s in labels_spec]
-            outputs = to_list(self.model(*inputs))
-            if mode != "predict" and self._loss:
-                losses = to_list(self._loss(*(outputs + labels)))
-
-        default_ctx = get_default_distributed_context()
-        if not default_ctx.has_annotation or self._default_strategy:
-            inputs = [self._set_data_parallel(var) for var in inputs]
-            labels = [self._set_data_parallel(var) for var in labels]
-
-        # self._feed_vars[mode] = {"inputs": inputs, "labels": labels}
-        feed_vars = {"inputs": inputs, "labels": labels}
-
-        # self._fetch_vars[mode] = {
-        #     "outputs": flatten(outputs),
-        #     "loss": losses,
-        #     "metrics": metrics
-        # }
-        fetch_vars = {
-            "outputs": flatten(outputs),
-            "loss": losses,
-            "metrics": metrics
-        }
-
-        self._dist_contexts[mode] = DistributedContext(
-            serial_main_prog, serial_startup_prog, self._optimizer, losses,
-            feed_vars, fetch_vars, self.cluster, self.strategy)
-        self._dist_contexts[mode].gradient_scale = self._gradient_scale
 
-    def _initialize(self, mode):
         if self._nranks > 1:
             # Traverse different rank programs and traverse each op of them,
             # instantiate communication by process_mapping.
@@ -203,7 +263,7 @@ def fit(self,
         # TODO: evaluate after training
         self.mode = 'train'
         assert self.mode in self._dist_main_progs, \
-            "train model is not ready, please call `engine.prepare(mode='train')` first."
+            "train model is not ready, please call `engine.prepare()` first."
         train_dataloader = self._create_dataloader(train_data, batch_size,
                                                    epochs, steps_per_epoch)
 
@@ -227,16 +287,19 @@ def evaluate(self,
                  return_numpy=True):
         self.mode = 'eval'
         assert self.mode in self._dist_main_progs, \
-            "eval model is not ready, please call `engine.prepare(mode='eval')` first."
+            "eval model is not ready, please call `engine.prepare()` first."
         eval_dataloader = self._create_dataloader(eval_data, batch_size)
 
-        outputs = []
         for step, data in enumerate(eval_dataloader):
-            logs, outs = self._eval_step(data, use_program_cache, return_numpy)
-            outputs.append(outs)
-            predict_logs = {"eval_" + name: val for name, val in logs.items()}
-            self._logger.info(predict_logs)
-        return outputs
+            eval_logs = dict()
+            outs = self._eval_step(data, use_program_cache, return_numpy)
+            eval_logs["eval_loss"] = outs[0] if len(outs) > 0 else []
+            for metric in self._metrics:
+                results = metric.accumulate()
+                for i, res in enumerate(to_list(results)):
+                    eval_logs["eval_" + metric.name()[i]] = res
+            self._logger.info(eval_logs)
+        return eval_logs
 
     def predict(self,
                 test_data,
@@ -245,7 +308,7 @@ def predict(self,
                 return_numpy=True):
         self.mode = 'predict'
         assert self.mode in self._dist_main_progs, \
-            "predict model is not ready, please call `engine.prepare(mode='predict')` first."
+            "predict model is not ready, please call `engine.prepare()` first."
         test_dataloader = self._create_dataloader(test_data, batch_size)
 
         outputs = []
@@ -262,57 +325,53 @@ def predict(self,
 
     def _train_step(self, data, use_program_cache=False, return_numpy=True):
         logs = {}
-        dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank]
-        fetch_var = self._fetch_vars[self.mode]["loss"][0]
-        if fetch_var.name not in dist_main_prog.global_block().vars:
-            loss = self._executor.run(dist_main_prog,
-                                      use_program_cache=use_program_cache)
-            logs["loss"] = None
-        else:
-            loss = self._executor.run(dist_main_prog,
-                                      fetch_list=to_list(fetch_var),
-                                      use_program_cache=use_program_cache,
-                                      return_numpy=return_numpy)
-            logs["loss"] = loss
+        fetch_vars = self._fetch_vars[self.mode]["loss"]
+        fetch_list = self._fetch_list(fetch_vars)
+
+        loss = self._executor.run(self.main_program,
+                                  fetch_list=fetch_list,
+                                  use_program_cache=use_program_cache,
+                                  return_numpy=return_numpy)
+        logs["loss"] = loss
         return logs, loss
 
     def _eval_step(self, data, use_program_cache=False, return_numpy=True):
         logs = {}
-        dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank]
-        fetch_var = self._fetch_vars[self.mode]["loss"][0]
-
-        if fetch_var.name not in dist_main_prog.global_block().vars:
-            outs = self._executor.run(dist_main_prog,
-                                      use_program_cache=use_program_cache)
-            logs["loss"] = outs
-        else:
-            outs = self._executor.run(dist_main_prog,
-                                      fetch_list=fetch_var,
-                                      use_program_cache=use_program_cache,
-                                      return_numpy=return_numpy)
-            logs["loss"] = outs
-        return logs, outs
+        metrics = self._fetch_vars[self.mode]["metrics"]
+        losses = self._fetch_vars[self.mode]["loss"]
+        fetch_loss = self._fetch_list(losses)
+        fetch_metrics = self._fetch_list(metrics)
+        fetch_list = fetch_loss + fetch_metrics
+
+        res = self._executor.run(self.main_program,
+                                 fetch_list=fetch_list,
+                                 use_program_cache=use_program_cache,
+                                 return_numpy=return_numpy)
+        if not res[len(fetch_loss):]:
+            return res[:len(fetch_loss)]
+        for metric in self._metrics:
+            metric.update(*res[len(fetch_loss):])
+        return res[:len(fetch_loss)]
 
     def _predict_step(self, data, use_program_cache=False, return_numpy=True):
         logs = {}
-        dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank]
-        fetch_var = []
-        for var in self._fetch_vars[self.mode]["outputs"]:
-            if var.name in dist_main_prog.global_block().vars:
-                fetch_var.append(var)
-
-        if fetch_var is []:
-            outs = self._executor.run(dist_main_prog,
-                                      use_program_cache=use_program_cache)
-            logs["pred"] = outs
-        else:
-            outs = self._executor.run(dist_main_prog,
-                                      fetch_list=fetch_var,
-                                      use_program_cache=use_program_cache,
-                                      return_numpy=return_numpy)
-            logs["pred"] = outs
+        fetch_vars = self._fetch_vars[self.mode]["outputs"]
+        fetch_list = self._fetch_list(fetch_vars)
+
+        outs = self._executor.run(self.main_program,
+                                  fetch_list=fetch_list,
+                                  use_program_cache=use_program_cache,
+                                  return_numpy=return_numpy)
+        logs["pred"] = outs
         return logs, outs
 
+    def _fetch_list(self, fetch_vars):
+        fetch_list = []
+        for var in fetch_vars:
+            if var.name in self.main_program.global_block().vars:
+                fetch_list.append(var.name)
+        return fetch_list
+
     def _create_dataloader(self,
                            dataset,
                            batch_size,
@@ -323,15 +382,17 @@ def _create_dataloader(self,
         dist_context = self._dist_contexts[self.mode]
         dist_main_block = dist_main_prog.global_block()
 
-        # get feed_list from dist_program
+        # NOTE: Get feed_list from dist_program, then insert the dataloader op
+        # with the sharded var shape. Because predict_program does not contain
+        # the labels var, we filter the dataset's values by the length of feed_list.
         inputs_var = self._feed_vars[self.mode]["inputs"]
         labels_var = self._feed_vars[self.mode]["labels"]
         feed_list = []
         for var in inputs_var + labels_var:
             if var.name in dist_main_block.vars:
                 feed_list.append(dist_main_block.vars[var.name])
-        dp_world_size, dp_rank = self._get_data_parallel_info(feed_list[0],
-                                                              dist_context)
+        dp_world_size, dp_rank = self._get_data_parallel_info(
+            feed_list[0], dist_context)
 
         # remove the first three ops if multi run fit/evaluate/predict
         op_size = len(dist_main_block.ops)
@@ -342,7 +403,7 @@ def _create_dataloader(self,
 
         # insert read op at the end of program
         places = paddle.static.cuda_places()
-        with fluid.program_guard(dist_main_prog, dist_startup_prog):
+        with static.program_guard(dist_main_prog, dist_startup_prog):
             dataloader = NonIterableGeneratorLoader(
                 dataset,
                 feed_list,
@@ -359,8 +420,9 @@ def _create_dataloader(self,
             op = dist_main_block.ops[new_op_size - 1]
             new_op_desc = dist_main_block.desc._prepend_op()
             new_op_desc.copy_from(op.desc)
-            new_op = Operator(
-                dist_main_block, new_op_desc, type=new_op_desc.type())
+            new_op = Operator(dist_main_block,
+                              new_op_desc,
+                              type=new_op_desc.type())
             dist_main_block.ops.insert(0, new_op)
             dist_op = DistributedOperator(new_op)
             dist_context.add_dist_op_for_program(dist_op)
@@ -383,21 +445,21 @@ def _validate_spec(self, specs):
     def _set_data_parallel(self, var):
         if self._nranks == 1:
             self._default_strategy = 'serial'
-            auto.shard_tensor(
-                var,
-                dist_attr={
-                    "process_mesh": [0],
-                    "dims_mapping": [-1 for _ in range(len(var.shape))]
-                })
+            auto.shard_tensor(var,
+                              dist_attr={
+                                  "process_mesh": [0],
+                                  "dims_mapping":
+                                  [-1 for _ in range(len(var.shape))]
+                              })
         else:
             self._default_strategy = 'dp'
-            auto.shard_tensor(
-                var,
-                dist_attr={
-                    "process_mesh": list(range(self._nranks)),
-                    "dims_mapping":
-                    [0] + [-1 for _ in range(len(var.shape) - 1)]
-                })
+            auto.shard_tensor(var,
+                              dist_attr={
+                                  "process_mesh":
+                                  list(range(self._nranks)),
+                                  "dims_mapping":
+                                  [0] + [-1 for _ in range(len(var.shape) - 1)]
+                              })
 
         return var
 
@@ -433,22 +495,20 @@ def save(self, path, training=True, mode=None):
             serial_program = self._serial_main_progs["train"]
             dist_main_prog = self._dist_main_progs["train"][self._cur_rank]
             dist_context = self._dist_contexts["train"]
-            self._saver.save(
-                path,
-                serial_program=serial_program,
-                dist_main_program=dist_main_prog,
-                dist_context=dist_context)
+            self._saver.save(path,
+                             serial_program=serial_program,
+                             dist_main_program=dist_main_prog,
+                             dist_context=dist_context)
         else:
             assert mode, "Please set the 'mode' you want to save."
             feed_vars = self._feed_vars[mode]['inputs']
             fetch_vars = self._fetch_vars[mode]['outputs']
             dist_main_prog = self._dist_main_progs[mode][self._cur_rank]
-            self._saver.save_inference_model(
-                path,
-                feed_vars,
-                fetch_vars,
-                self._executor,
-                program=dist_main_prog)
+            self._saver.save_inference_model(path,
+                                             feed_vars,
+                                             fetch_vars,
+                                             self._executor,
+                                             program=dist_main_prog)
 
     def load(self, path, strict=True, load_optimizer=True, mode=None):
         if not mode:
@@ -468,10 +528,6 @@ def mode(self):
     def mode(self, mode):
         self._mode = mode
 
-    @property
-    def metrics(self):
-        return self._metrics
-
     @property
     def main_program(self):
         return self._dist_main_progs[self.mode][self._cur_rank]
diff --git a/python/paddle/distributed/auto_parallel/graph.py b/python/paddle/distributed/auto_parallel/graph.py
index 14856e390709e..de6505071abfe 100644
--- a/python/paddle/distributed/auto_parallel/graph.py
+++ b/python/paddle/distributed/auto_parallel/graph.py
@@ -14,6 +14,7 @@
 
 
 class Node:
+
     def __init__(self, id, **attrs):
         # Each node must has a unique id
         self._id = id
@@ -47,6 +48,7 @@ def __str__(self):
 
 
 class Edge:
+
     def __init__(self, src_id, tgt_id, **attrs):
         # The id of source node in an Edge
         self._src_id = src_id
@@ -88,6 +90,7 @@ def __str__(self):
 
 
 class Graph:
+
     def __init__(self, **attrs):
         # _nodes is dict for storing the nodes of the graph.
         # The key of this dict is the node id.
diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py
index f5d9c32d33eb3..da76ae8127192 100644
--- a/python/paddle/distributed/auto_parallel/mapper.py
+++ b/python/paddle/distributed/auto_parallel/mapper.py
@@ -171,8 +171,9 @@ def build_process_graph(distributed_program):
             src_info, src_rank)
         graph.add_node(src_rank, resource_requirements=resource_requirements)
         for tgt_rank, comm_requirements in comm_requirements_to_ranks.items():
-            graph.add_edge(
-                src_rank, tgt_rank, comm_requirements=comm_requirements)
+            graph.add_edge(src_rank,
+                           tgt_rank,
+                           comm_requirements=comm_requirements)
     return graph
 
 
@@ -192,8 +193,9 @@ def build_cluster_graph(cluster):
             else:
                 graph.nodes[device.global_id]["occupied"] = False
         for link in machine.links.values():
-            graph.add_edge(
-                link.source.global_id, link.target.global_id, link=link)
+            graph.add_edge(link.source.global_id,
+                           link.target.global_id,
+                           link=link)
     return graph
 
 
@@ -233,8 +235,8 @@ def select_unvisited_rank_node(rank_node_list):
             device_type = cur_rank_node["resource_requirements"]["device_type"]
             cur_device_node = None
             for device_node in cluster_graph.nodes.values():
-                if (device_node["device"].type == device_type) and (
-                        not device_node["occupied"]):
+                if (device_node["device"].type
+                        == device_type) and (not device_node["occupied"]):
                     device_node["occupied"] = True
                     cur_rank_node["visited"] = True
                     cur_rank_node["device"] = device_node["device"]
@@ -257,8 +259,8 @@ def select_unvisited_rank_node(rank_node_list):
             nbr_device_edges.sort(key=sort_by_comm_bandwidth)
 
             for nbr_rank_edge in nbr_rank_edges:
-                src_rank_node = process_graph.nodes[nbr_rank_edge.src_id][
-                    "visited"]
+                src_rank_node = process_graph.nodes[
+                    nbr_rank_edge.src_id]["visited"]
                 if src_rank_node:
                     continue
                 device_type = src_rank_node["resource_requirements"][
diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py
index 6b3c655f293bd..2e86f97d7a26d 100644
--- a/python/paddle/distributed/auto_parallel/operators/common.py
+++ b/python/paddle/distributed/auto_parallel/operators/common.py
@@ -32,6 +32,7 @@ def is_elementwise_op(op_type):
 
 
 class DistributedOperatorImplContainer:
+
     def __init__(self, op_type):
         self._type = op_type
         self._impls = []
@@ -81,6 +82,7 @@ def get_compatible_impls(self, dist_op):
 
 
 class DistributedOperatorImpl(abc.ABC):
+
     def __init__(self, name):
         self._name = name
         self._type = None
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py
index 79a86169d5a12..0a4bfb1213d46 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py
@@ -30,6 +30,7 @@
 
 
 class DistributedCheckFiniteAndUnscale(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedCheckFiniteAndUnscale, self).__init__(op_type)
 
@@ -39,6 +40,7 @@ def __init__(self, op_type):
 
 
 class DistributedCheckFiniteAndUnscaleImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedCheckFiniteAndUnscaleImpl, self).__init__(name)
         self._name = name
@@ -122,41 +124,37 @@ def backward(ctx, *args, **kwargs):
         group = new_process_group(world_process_group.ranks)
 
         inf_var = main_block.var(kwargs['FoundInfinite'][0])
-        inf_var_int32 = main_block.create_var(
-            name=inf_var.name + "@cast_int32",
-            shape=inf_var.shape,
-            dtype=core.VarDesc.VarType.INT32)
+        inf_var_int32 = main_block.create_var(name=inf_var.name + "@cast_int32",
+                                              shape=inf_var.shape,
+                                              dtype=core.VarDesc.VarType.INT32)
         set_var_dist_attr(
             ctx, inf_var_int32,
             ctx.get_tensor_dist_attr_for_program(inf_var).dims_mapping,
             ctx.get_tensor_dist_attr_for_program(inf_var).process_mesh)
-        cast_op1 = main_block.append_op(
-            type='cast',
-            inputs={'X': inf_var},
-            outputs={'Out': inf_var_int32},
-            attrs={
-                "in_dtype": inf_var.dtype,
-                "out_dtype": inf_var_int32.dtype,
-                OP_ROLE_KEY: OpRole.Backward
-            })
-        allreduce_op = main_block.append_op(
-            type='c_allreduce_max',
-            inputs={'X': inf_var_int32},
-            outputs={'Out': inf_var_int32},
-            attrs={
-                'ring_id': group.id,
-                'use_calc_stream': True,
-                OP_ROLE_KEY: OpRole.Backward
-            })
-        cast_op2 = main_block.append_op(
-            type='cast',
-            inputs={'X': inf_var_int32},
-            outputs={'Out': inf_var},
-            attrs={
-                "in_dtype": inf_var_int32.dtype,
-                "out_dtype": inf_var.dtype,
-                OP_ROLE_KEY: OpRole.Backward
-            })
+        cast_op1 = main_block.append_op(type='cast',
+                                        inputs={'X': inf_var},
+                                        outputs={'Out': inf_var_int32},
+                                        attrs={
+                                            "in_dtype": inf_var.dtype,
+                                            "out_dtype": inf_var_int32.dtype,
+                                            OP_ROLE_KEY: OpRole.Backward
+                                        })
+        allreduce_op = main_block.append_op(type='c_allreduce_max',
+                                            inputs={'X': inf_var_int32},
+                                            outputs={'Out': inf_var_int32},
+                                            attrs={
+                                                'ring_id': group.id,
+                                                'use_calc_stream': True,
+                                                OP_ROLE_KEY: OpRole.Backward
+                                            })
+        cast_op2 = main_block.append_op(type='cast',
+                                        inputs={'X': inf_var_int32},
+                                        outputs={'Out': inf_var},
+                                        attrs={
+                                            "in_dtype": inf_var_int32.dtype,
+                                            "out_dtype": inf_var.dtype,
+                                            OP_ROLE_KEY: OpRole.Backward
+                                        })
         main_block._sync_with_cpp()
 
         for op in [cast_op1, allreduce_op, cast_op2]:
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py
index 78f30422e742f..a2b1b7826d51f 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_default.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py
@@ -47,28 +47,26 @@ def prim_operator_data_parallel_functor(ctx, src_op):
         ctx.synced_gradient.add(var_name)
         sync_group = new_process_group(ctx.data_parallel_group)
 
-        allreduce_op = main_block.append_op(
-            type='c_allreduce_sum',
-            inputs={'X': [var_name]},
-            outputs={'Out': [var_name]},
-            attrs={
-                'ring_id': sync_group.id,
-                'use_calc_stream': True,
-                OP_ROLE_KEY: OpRole.Backward
-            })
+        allreduce_op = main_block.append_op(type='c_allreduce_sum',
+                                            inputs={'X': [var_name]},
+                                            outputs={'Out': [var_name]},
+                                            attrs={
+                                                'ring_id': sync_group.id,
+                                                'use_calc_stream': True,
+                                                OP_ROLE_KEY: OpRole.Backward
+                                            })
 
         param = ctx.grads_params[var_name]
         startup_block = dist_op_context.startup_block
-        new_op = startup_block.append_op(
-            type='c_broadcast',
-            inputs={'X': [param]},
-            outputs={'Out': [param]},
-            attrs={
-                'ring_id': sync_group.id,
-                'root': 0,
-                'use_calc_stream': True,
-                OP_ROLE_KEY: OpRole.Forward
-            })
+        new_op = startup_block.append_op(type='c_broadcast',
+                                         inputs={'X': [param]},
+                                         outputs={'Out': [param]},
+                                         attrs={
+                                             'ring_id': sync_group.id,
+                                             'root': 0,
+                                             'use_calc_stream': True,
+                                             OP_ROLE_KEY: OpRole.Forward
+                                         })
 
         grad_var = main_block.var(var_name)
         dims_mapping = ctx.get_tensor_dist_attr_for_program(
@@ -85,6 +83,7 @@ def prim_operator_data_parallel_functor(ctx, src_op):
 
 
 class DistributedDefault(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedDefault, self).__init__(op_type)
 
@@ -94,6 +93,7 @@ def __init__(self, op_type):
 
 # Replicated Default
 class DistributedDefaultImpl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedDefaultImpl0, self).__init__(name)
         self._forward_implemented = True
@@ -277,8 +277,8 @@ def update_dims_mapping(self, dist_op):
                 batch_dim_mappings.append(dims_mapping[1])
         for arg_name in op_desc.output_arg_names():
             if op_desc.type() == "fill_zeros_like":
-                input_tensor = dist_op.get_serial_input(op_desc.input_arg_names(
-                )[0])
+                input_tensor = dist_op.get_serial_input(
+                    op_desc.input_arg_names()[0])
                 if input_tensor.is_parameter:
                     continue
             serial_tensor = dist_op.get_serial_output(arg_name)
@@ -316,8 +316,8 @@ def update_dims_mapping(self, dist_op):
                     changed = True
         for arg_name in op_desc.output_arg_names():
             if op_desc.type() == "fill_zeros_like":
-                input_tensor = dist_op.get_serial_input(op_desc.input_arg_names(
-                )[0])
+                input_tensor = dist_op.get_serial_input(
+                    op_desc.input_arg_names()[0])
                 if input_tensor.is_parameter:
                     continue
             if op_desc.type() in ["shape", "slice"]:
@@ -363,7 +363,7 @@ def forward(ctx, *args, **kwargs):
                 output_name)
 
         # replicate op in dist program
-        dist_op_desc = main_block.append_op(type='nop').desc
+        dist_op_desc = main_block.desc.append_op()
         dist_op_desc.copy_from(src_op.desc)
         set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx)
         for input_name in src_op.desc.input_names():
@@ -371,6 +371,8 @@ def forward(ctx, *args, **kwargs):
         for output_name in src_op.desc.output_names():
             dist_op_desc.set_output(output_name, kwargs[output_name])
 
+        main_block._sync_with_cpp()
+
         # data parallel synchronization for primtive operators
         from paddle.incubate.autograd import prim_enabled
         if prim_enabled():
@@ -407,16 +409,19 @@ def forward(ctx, *args, **kwargs):
                                                       axis, rank_id)
                         sync_group = new_process_group(group_ranks)
 
-                        new_op = startup_block.append_op(
-                            type='c_broadcast',
-                            inputs={'X': param},
-                            outputs={'Out': param},
-                            attrs={
-                                'ring_id': sync_group.id,
-                                'root': 0,
-                                'use_calc_stream': True,
-                                OP_ROLE_KEY: OpRole.Forward
-                            })
+                        new_op = startup_block.append_op(type='c_broadcast',
+                                                         inputs={'X': param},
+                                                         outputs={'Out': param},
+                                                         attrs={
+                                                             'ring_id':
+                                                             sync_group.id,
+                                                             'root':
+                                                             0,
+                                                             'use_calc_stream':
+                                                             True,
+                                                             OP_ROLE_KEY:
+                                                             OpRole.Forward
+                                                         })
 
                         # set distributed attribute
                         op_attr = OperatorDistributedAttribute()
@@ -426,6 +431,8 @@ def forward(ctx, *args, **kwargs):
                         op_attr.set_input_dims_mapping(param.name, dims_mapping)
                         ctx.set_op_dist_attr_for_program(new_op, op_attr)
 
+                startup_block._sync_with_cpp()
+
     @staticmethod
     def backward(ctx, *args, **kwargs):
 
@@ -480,8 +487,8 @@ def backward(ctx, *args, **kwargs):
 
                     # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
                     if rank_id not in process_mesh.processes:
-                        rank_id = _get_corresponding_rank(ctx, process_mesh,
-                                                          rank_id)
+                        rank_id = _get_corresponding_rank(
+                            ctx, process_mesh, rank_id)
 
                     mesh_shape = process_mesh.topology
                     batch_size_axis = var_dim_mapping[0]
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py
index 78589afc498ee..02f2741d884f9 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py
@@ -35,6 +35,7 @@
 
 
 class DistributedElementwise(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedElementwise, self).__init__(op_type)
 
@@ -45,6 +46,7 @@ def __init__(self, op_type):
 
 # Replicated Elementwise
 class DistributedElementwiseImpl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedElementwiseImpl0, self).__init__(name)
         self._forward_implemented = False
@@ -208,8 +210,8 @@ def update_dims_mapping(self, dist_op):
                     changed = True
             else:
                 if compatible_dims_mapping != input_dims_mapping_dict[arg_name]:
-                    op_dist_attr.set_input_dims_mapping(arg_name,
-                                                        compatible_dims_mapping)
+                    op_dist_attr.set_input_dims_mapping(
+                        arg_name, compatible_dims_mapping)
                     changed = True
 
         for arg_name in output_arg_names:
@@ -222,12 +224,11 @@ def update_dims_mapping(self, dist_op):
                                output_dims_mapping_lens[arg_name]) + i
                     new_dims_mapping[i] = compatible_dims_mapping[new_idx]
                 if new_dims_mapping != output_dims_mapping_dict[arg_name]:
-                    op_dist_attr.set_output_dims_mapping(arg_name,
-                                                         new_dims_mapping)
+                    op_dist_attr.set_output_dims_mapping(
+                        arg_name, new_dims_mapping)
                     changed = True
             else:
-                if compatible_dims_mapping != output_dims_mapping_dict[
-                        arg_name]:
+                if compatible_dims_mapping != output_dims_mapping_dict[arg_name]:
                     op_dist_attr.set_output_dims_mapping(
                         arg_name, compatible_dims_mapping)
                     changed = True
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
index ae6397391ac12..2272400e60ddf 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
@@ -34,6 +34,7 @@
 
 
 class DistributedEmbedding(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedEmbedding, self).__init__(op_type)
 
@@ -46,6 +47,7 @@ def __init__(self, op_type):
 
 # RowParallel
 class DistributedEmbeddingImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedEmbeddingImpl, self).__init__(name)
         self._forward_implemented = True
@@ -58,8 +60,8 @@ def is_input_compatible(self, dist_op):
         w_name = op_desc.input('W')[0]
         ids_dims_mapping = op_dist_attr.get_input_dims_mapping(ids_name)
         w_dims_mapping = op_dist_attr.get_input_dims_mapping(w_name)
-        if is_dim_replicate(w_dims_mapping[-2]) or is_dim_shard(w_dims_mapping[
-                -1]):
+        if is_dim_replicate(w_dims_mapping[-2]) or is_dim_shard(
+                w_dims_mapping[-1]):
             return False
         # Other dimensions must be replicate except the batch dimension
         for mapping in ids_dims_mapping[1:]:
@@ -215,8 +217,10 @@ def forward(ctx, *args, **kwargs):
 
         c_embedding_op = main_block.append_op(
             type='c_embedding',
-            inputs={'Ids': [Ids_var],
-                    'W': [Weight_var]},
+            inputs={
+                'Ids': [Ids_var],
+                'W': [Weight_var]
+            },
             outputs={'Out': [intermediate_var_0]},
             attrs={"start_index": relative_idx})
         if intermediate_var_0.shape != ref_shape:
@@ -295,16 +299,15 @@ def forward(ctx, *args, **kwargs):
                                                   rank_id)
                     sync_group = new_process_group(group_ranks)
 
-                    startup_block.append_op(
-                        type='c_broadcast',
-                        inputs={'X': param},
-                        outputs={'Out': param},
-                        attrs={
-                            'ring_id': sync_group.id,
-                            'root': 0,
-                            'use_calc_stream': True,
-                            OP_ROLE_KEY: OpRole.Forward
-                        })
+                    startup_block.append_op(type='c_broadcast',
+                                            inputs={'X': param},
+                                            outputs={'Out': param},
+                                            attrs={
+                                                'ring_id': sync_group.id,
+                                                'root': 0,
+                                                'use_calc_stream': True,
+                                                OP_ROLE_KEY: OpRole.Forward
+                                            })
             startup_block._sync_with_cpp()
 
     @staticmethod
@@ -440,21 +443,21 @@ def backward(ctx, *args, **kwargs):
 
         if need_gradient_allreduce:
             W_Grad_var = main_block.var(kwargs['W@GRAD'][0])
-            allreduce_op = main_block.append_op(
-                type='c_allreduce_sum',
-                inputs={'X': [W_Grad_var]},
-                outputs={'Out': [W_Grad_var]},
-                attrs={
-                    'ring_id': dp_group.id,
-                    'use_calc_stream': True,
-                    OP_ROLE_KEY: OpRole.Backward
-                })
-            scale_op = main_block.append_op(
-                type='scale',
-                inputs={'X': W_Grad_var},
-                outputs={'Out': W_Grad_var},
-                attrs={'scale': 1.0 / dp_degree,
-                       OP_ROLE_KEY: OpRole.Backward})
+            allreduce_op = main_block.append_op(type='c_allreduce_sum',
+                                                inputs={'X': [W_Grad_var]},
+                                                outputs={'Out': [W_Grad_var]},
+                                                attrs={
+                                                    'ring_id': dp_group.id,
+                                                    'use_calc_stream': True,
+                                                    OP_ROLE_KEY: OpRole.Backward
+                                                })
+            scale_op = main_block.append_op(type='scale',
+                                            inputs={'X': W_Grad_var},
+                                            outputs={'Out': W_Grad_var},
+                                            attrs={
+                                                'scale': 1.0 / dp_degree,
+                                                OP_ROLE_KEY: OpRole.Backward
+                                            })
             main_block._sync_with_cpp()
 
             dims_mapping = ctx.get_tensor_dist_attr_for_program(
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py
index 80ac019e83035..763e47802b333 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py
@@ -31,6 +31,7 @@
 
 
 class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedFillConstantBatchSizeLike, self).__init__(op_type)
 
@@ -40,6 +41,7 @@ def __init__(self, op_type):
 
 
 class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedFillConstantBatchSizeLikeImpl0, self).__init__(name)
         self._forward_implemented = True
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py b/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py
index bc3992ec03d4b..23519647d3398 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,6 +27,7 @@
 
 
 class DistributedFusedAttention(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedFusedAttention, self).__init__(op_type)
 
@@ -36,6 +37,7 @@ def __init__(self, op_type):
 
 
 class DistributedFusedAttentionImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedFusedAttentionImpl, self).__init__(name)
         self._forward_implemented = True
@@ -60,8 +62,8 @@ def is_input_compatible(self, dist_op):
         for mapping in x_dims_mapping[1:-1]:
             if is_dim_shard(mapping):
                 return False
-        if len(qkv_w_dims_mapping) != 4 or is_dim_replicate(qkv_w_dims_mapping[
-                head_axis]):
+        if len(qkv_w_dims_mapping) != 4 or is_dim_replicate(
+                qkv_w_dims_mapping[head_axis]):
             return False
         if len(qkv_bias_dims_mapping) != 3 or is_dim_replicate(
                 qkv_bias_dims_mapping[head_axis]):
@@ -91,7 +93,7 @@ def is_output_compatible(self, dist_op):
         op_desc = dist_op.serial_op.desc
         op_dist_attr = dist_op.dist_attr
 
-        # none of output should be sharded 
+        # none of output should be sharded
         for out_name in op_desc.output_names():
             out = op_desc.output(out_name)[0]
             out_dims_mapping = op_dist_attr.get_output_dims_mapping(out)
@@ -152,8 +154,8 @@ def forward(ctx, *args, **kwargs):
         # infer logic comm presentation
         head_axis = 1
         qkv_w = src_op.input('QKVW')[0]
-        qkv_w_col_dim_mapping = op_dist_attr.get_input_dims_mapping(qkv_w)[
-            head_axis]
+        qkv_w_col_dim_mapping = op_dist_attr.get_input_dims_mapping(
+            qkv_w)[head_axis]
         assert qkv_w_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
             qkv_w_col_dim_mapping)
         process_mesh_shape = op_dist_attr.process_mesh.topology
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py b/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py
index 76f526adbbfaa..50735cf285754 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,6 +27,7 @@
 
 
 class DistributedFusedFeedForward(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedFusedFeedForward, self).__init__(op_type)
 
@@ -36,6 +37,7 @@ def __init__(self, op_type):
 
 
 class DistributedFusedFeedForwardImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedFusedFeedForwardImpl, self).__init__(name)
         self._forward_implemented = True
@@ -82,7 +84,7 @@ def is_output_compatible(self, dist_op):
         op_desc = dist_op.serial_op.desc
         op_dist_attr = dist_op.dist_attr
 
-        # none of output should be sharded 
+        # none of output should be sharded
         for out_name in op_desc.output_names():
             out = op_desc.output(out_name)[0]
             out_dims_mapping = op_dist_attr.get_output_dims_mapping(out)
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
index 69e1c866de691..427932a77fbcd 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
@@ -169,13 +169,13 @@ def _is_auto_compatible_for_matmul(dist_op):
     # NOTE: Partition is not supported if matmul op has trans.
     if op_desc.type() == "matmul_v2":
         if op_desc.attr('trans_x') or op_desc.attr('trans_y'):
-            if x_dims_mapping[-2:] != [-1, -1] or y_dims_mapping[
-                    -2:] != [-1, -1]:
+            if x_dims_mapping[-2:] != [-1, -1
+                                       ] or y_dims_mapping[-2:] != [-1, -1]:
                 return False
     elif op_desc.type() == "matmul":
         if op_desc.attr('transpose_X') or op_desc.attr('transpose_Y'):
-            if x_dims_mapping[-2:] != [-1, -1] or y_dims_mapping[
-                    -2:] != [-1, -1]:
+            if x_dims_mapping[-2:] != [-1, -1
+                                       ] or y_dims_mapping[-2:] != [-1, -1]:
                 return False
 
     # Deal with dim > 2 and take care of broadcasting
@@ -197,8 +197,8 @@ def _is_auto_compatible_for_matmul(dist_op):
         for i in range(out_dims_mapping_len - 2):
             broadcast_out_dims_mapping.append(out_dims_mapping[i])
 
-        is_same = ((broadcast_x_dims_mapping == broadcast_y_dims_mapping) and
-                   (broadcast_x_dims_mapping == broadcast_out_dims_mapping))
+        is_same = ((broadcast_x_dims_mapping == broadcast_y_dims_mapping)
+                   and (broadcast_x_dims_mapping == broadcast_out_dims_mapping))
         if not is_same:
             return False
 
@@ -307,8 +307,9 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
             ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
                                                  out_grad_dist_attr)
 
-            group_ranks = _get_comm_group(
-                process_mesh_group, process_mesh_shape, parallel_axis, rank_id)
+            group_ranks = _get_comm_group(process_mesh_group,
+                                          process_mesh_shape, parallel_axis,
+                                          rank_id)
             group = new_process_group(group_ranks)
             c_identity_op = main_block.append_op(
                 type='c_identity',
@@ -325,8 +326,9 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
                                      'linear')
             check_dtype(intermediate_var_0.dtype, 'dtype',
                         ['float16', 'float32', 'float64'], 'linear')
-            set_comm_op_dist_attr_for_program(
-                c_identity_op, dist_attr.process_mesh, out_grad_dist_attr, ctx)
+            set_comm_op_dist_attr_for_program(c_identity_op,
+                                              dist_attr.process_mesh,
+                                              out_grad_dist_attr, ctx)
 
             new_kwargs = copy.deepcopy(kwargs)
             new_kwargs['Out@GRAD'] = [intermediate_var_0.name]
@@ -404,21 +406,21 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
 
     if need_gradient_allreduce and is_parameter_related(Y_var.name, main_block):
         Y_Grad_var = main_block.var(kwargs['Y@GRAD'][0])
-        allreduce_op = main_block.append_op(
-            type='c_allreduce_sum',
-            inputs={'X': [Y_Grad_var]},
-            outputs={'Out': [Y_Grad_var]},
-            attrs={
-                'ring_id': dp_group.id,
-                'use_calc_stream': True,
-                OP_ROLE_KEY: OpRole.Backward
-            })
-        scale_op = main_block.append_op(
-            type='scale',
-            inputs={'X': Y_Grad_var},
-            outputs={'Out': Y_Grad_var},
-            attrs={'scale': 1.0 / dp_degree,
-                   OP_ROLE_KEY: OpRole.Backward})
+        allreduce_op = main_block.append_op(type='c_allreduce_sum',
+                                            inputs={'X': [Y_Grad_var]},
+                                            outputs={'Out': [Y_Grad_var]},
+                                            attrs={
+                                                'ring_id': dp_group.id,
+                                                'use_calc_stream': True,
+                                                OP_ROLE_KEY: OpRole.Backward
+                                            })
+        scale_op = main_block.append_op(type='scale',
+                                        inputs={'X': Y_Grad_var},
+                                        outputs={'Out': Y_Grad_var},
+                                        attrs={
+                                            'scale': 1.0 / dp_degree,
+                                            OP_ROLE_KEY: OpRole.Backward
+                                        })
         main_block._sync_with_cpp()
 
         dims_mapping = ctx.get_tensor_dist_attr_for_program(
@@ -451,20 +453,20 @@ def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id):
                                           process_mesh.topology, axis, rank_id)
             sync_group = new_process_group(group_ranks)
 
-            startup_block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': sync_group.id,
-                    'root': 0,
-                    'use_calc_stream': True,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            startup_block.append_op(type='c_broadcast',
+                                    inputs={'X': param},
+                                    outputs={'Out': param},
+                                    attrs={
+                                        'ring_id': sync_group.id,
+                                        'root': 0,
+                                        'use_calc_stream': True,
+                                        OP_ROLE_KEY: OpRole.Forward
+                                    })
     startup_block._sync_with_cpp()
 
 
 class DistributedMatmul(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedMatmul, self).__init__(op_type)
 
@@ -474,6 +476,7 @@ def __init__(self, op_type):
 
 # ColumnParallel
 class DistributedMatmulImpl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMatmulImpl0, self).__init__(name)
         self._forward_implemented = True
@@ -488,8 +491,8 @@ def is_input_compatible(self, dist_op):
         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
         if is_dim_shard(x_dims_mapping[-1]):
             return False
-        if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(y_dims_mapping[
-                -1]):
+        if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(
+                y_dims_mapping[-1]):
             return False
         for mapping in x_dims_mapping[1:-1]:
             if is_dim_shard(mapping):
@@ -628,8 +631,10 @@ def forward(ctx, *args, **kwargs):
             'alpha': 1,
         }
         inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]}
-        matmul_op = main_block.append_op(
-            type='matmul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs)
+        matmul_op = main_block.append_op(type='matmul',
+                                         inputs=inputs,
+                                         outputs={'Out': Out_var},
+                                         attrs=attrs)
         if Out_var.shape != ref_shape_out:
             Out_var.desc.set_shape(ref_shape_out)
 
@@ -695,6 +700,7 @@ def backward(ctx, *args, **kwargs):
 
 # RowParallel
 class DistributedMatmulImpl1(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMatmulImpl1, self).__init__(name)
         self._forward_implemented = True
@@ -709,8 +715,8 @@ def is_input_compatible(self, dist_op):
         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
         if is_dim_replicate(x_dims_mapping[-1]):
             return False
-        if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-                -1]):
+        if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(
+                y_dims_mapping[-1]):
             return False
         # Other dimensions must be replicate except the batch dimension
         for mapping in x_dims_mapping[1:-1]:
@@ -833,11 +839,10 @@ def forward(ctx, *args, **kwargs):
         ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
                                              out_var_dist_attr)
 
-        matmul_op = main_block.append_op(
-            type='matmul',
-            inputs=inputs,
-            outputs={'Out': intermediate_var_0},
-            attrs=attrs)
+        matmul_op = main_block.append_op(type='matmul',
+                                         inputs=inputs,
+                                         outputs={'Out': intermediate_var_0},
+                                         attrs=attrs)
         if intermediate_var_0.shape != ref_shape:
             intermediate_var_0.desc.set_shape(ref_shape)
 
@@ -905,6 +910,7 @@ def backward(ctx, *args, **kwargs):
 
 # ReplicateParallel
 class DistributedMatmulImpl2(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMatmulImpl2, self).__init__(name)
 
@@ -918,14 +924,14 @@ def is_input_compatible(self, dist_op):
 
         if is_dim_shard(x_dims_mapping[-1]):
             return False
-        if is_valid_list_index(x_dims_mapping,
-                               -2) and is_dim_shard(x_dims_mapping[-2]):
+        if is_valid_list_index(x_dims_mapping, -2) and is_dim_shard(
+                x_dims_mapping[-2]):
             return False
 
         if is_dim_shard(y_dims_mapping[-1]):
             return False
-        if is_valid_list_index(y_dims_mapping,
-                               -2) and is_dim_shard(y_dims_mapping[-2]):
+        if is_valid_list_index(y_dims_mapping, -2) and is_dim_shard(
+                y_dims_mapping[-2]):
             return False
 
         return True
@@ -938,8 +944,8 @@ def is_output_compatible(self, dist_op):
 
         if is_dim_shard(out_dims_mapping[-1]):
             return False
-        if is_valid_list_index(out_dims_mapping,
-                               -2) and is_dim_shard(out_dims_mapping[-2]):
+        if is_valid_list_index(out_dims_mapping, -2) and is_dim_shard(
+                out_dims_mapping[-2]):
             return False
 
         return True
@@ -979,6 +985,7 @@ def backward(ctx, *args, **kwargs):
 
 
 class DistributedMatmulV2(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedMatmulV2, self).__init__(op_type)
 
@@ -988,6 +995,7 @@ def __init__(self, op_type):
 
 # ColumnParallel
 class DistributedMatmulV2Impl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMatmulV2Impl0, self).__init__(name)
         self._forward_implemented = True
@@ -1002,8 +1010,8 @@ def is_input_compatible(self, dist_op):
         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
         if is_dim_shard(x_dims_mapping[-1]):
             return False
-        if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(y_dims_mapping[
-                -1]):
+        if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(
+                y_dims_mapping[-1]):
             return False
         for mapping in x_dims_mapping[1:-1]:
             if is_dim_shard(mapping):
@@ -1139,11 +1147,10 @@ def forward(ctx, *args, **kwargs):
                     ['float16', 'float32', 'float64'], 'linear')
         attrs = {'trans_x': False, 'trans_y': False}
         inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]}
-        matmul_v2_op = main_block.append_op(
-            type='matmul_v2',
-            inputs=inputs,
-            outputs={'Out': Out_var},
-            attrs=attrs)
+        matmul_v2_op = main_block.append_op(type='matmul_v2',
+                                            inputs=inputs,
+                                            outputs={'Out': Out_var},
+                                            attrs=attrs)
         if Out_var.shape != ref_shape_out:
             Out_var.desc.set_shape(ref_shape_out)
 
@@ -1177,14 +1184,14 @@ def forward(ctx, *args, **kwargs):
                     input_varname)
                 assert input_dist_attr is not None, "dist_attr is {}".format(
                     op_dist_attr)
-                matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
-                                                          input_dist_attr)
+                matmulv2_op_dist_attr.set_input_dist_attr(
+                    input_varname, input_dist_attr)
             else:
                 input_var = main_block.var(input_varname)
                 tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(
                     input_var)
-                matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
-                                                          tensor_dist_attr)
+                matmulv2_op_dist_attr.set_input_dist_attr(
+                    input_varname, tensor_dist_attr)
         for output_varname in matmul_v2_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
             assert output_dist_attr is not None, "dist_attr is {}".format(
@@ -1205,6 +1212,7 @@ def backward(ctx, *args, **kwargs):
 
 # RowParallel
 class DistributedMatmulV2Impl1(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMatmulV2Impl1, self).__init__(name)
         self._forward_implemented = True
@@ -1219,8 +1227,8 @@ def is_input_compatible(self, dist_op):
         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
         if is_dim_replicate(x_dims_mapping[-1]):
             return False
-        if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-                -1]):
+        if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(
+                y_dims_mapping[-1]):
             return False
         # Other dimensions must be replicate except the batch dimension
         for mapping in x_dims_mapping[1:-1]:
@@ -1339,11 +1347,10 @@ def forward(ctx, *args, **kwargs):
         ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
                                              out_var_dist_attr)
 
-        matmul_v2_op = main_block.append_op(
-            type='matmul_v2',
-            inputs=inputs,
-            outputs={'Out': intermediate_var_0},
-            attrs=attrs)
+        matmul_v2_op = main_block.append_op(type='matmul_v2',
+                                            inputs=inputs,
+                                            outputs={'Out': intermediate_var_0},
+                                            attrs=attrs)
         if intermediate_var_0.shape != ref_shape:
             intermediate_var_0.desc.set_shape(ref_shape)
 
@@ -1411,6 +1418,7 @@ def backward(ctx, *args, **kwargs):
 
 # ReplicateParallel
 class DistributedMatmulV2Impl2(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMatmulV2Impl2, self).__init__(name)
 
@@ -1424,14 +1432,14 @@ def is_input_compatible(self, dist_op):
 
         if is_dim_shard(x_dims_mapping[-1]):
             return False
-        if is_valid_list_index(x_dims_mapping,
-                               -2) and is_dim_shard(x_dims_mapping[-2]):
+        if is_valid_list_index(x_dims_mapping, -2) and is_dim_shard(
+                x_dims_mapping[-2]):
             return False
 
         if is_dim_shard(y_dims_mapping[-1]):
             return False
-        if is_valid_list_index(y_dims_mapping,
-                               -2) and is_dim_shard(y_dims_mapping[-2]):
+        if is_valid_list_index(y_dims_mapping, -2) and is_dim_shard(
+                y_dims_mapping[-2]):
             return False
         return True
 
@@ -1445,8 +1453,8 @@ def is_output_compatible(self, dist_op):
 
         if is_dim_shard(out_dims_mapping[-1]):
             return False
-        if is_valid_list_index(out_dims_mapping,
-                               -2) and is_dim_shard(out_dims_mapping[-2]):
+        if is_valid_list_index(out_dims_mapping, -2) and is_dim_shard(
+                out_dims_mapping[-2]):
             return False
 
         return True
@@ -1486,6 +1494,7 @@ def backward(ctx, *args, **kwargs):
 
 
 class DistributedMul(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedMul, self).__init__(op_type)
 
@@ -1495,6 +1504,7 @@ def __init__(self, op_type):
 
 # ColumnParallel
 class DistributedMulImpl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMulImpl0, self).__init__(name)
         self._forward_implemented = True
@@ -1509,8 +1519,8 @@ def is_input_compatible(self, dist_op):
         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
         if is_dim_shard(x_dims_mapping[-1]):
             return False
-        if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(y_dims_mapping[
-                -1]):
+        if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(
+                y_dims_mapping[-1]):
             return False
         for mapping in x_dims_mapping[1:-1]:
             if is_dim_shard(mapping):
@@ -1650,8 +1660,10 @@ def forward(ctx, *args, **kwargs):
             "y_num_col_dims": src_op.desc.attr("y_num_col_dims")
         }
         inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]}
-        mul_op = main_block.append_op(
-            type='mul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs)
+        mul_op = main_block.append_op(type='mul',
+                                      inputs=inputs,
+                                      outputs={'Out': Out_var},
+                                      attrs=attrs)
         if Out_var.shape != ref_shape_out:
             Out_var.desc.set_shape(ref_shape_out)
 
@@ -1685,14 +1697,14 @@ def forward(ctx, *args, **kwargs):
                     input_varname)
                 assert input_dist_attr is not None, "dist_attr is {}".format(
                     op_dist_attr)
-                matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
-                                                          input_dist_attr)
+                matmulv2_op_dist_attr.set_input_dist_attr(
+                    input_varname, input_dist_attr)
             else:
                 input_var = main_block.var(input_varname)
                 tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(
                     input_var)
-                matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
-                                                          tensor_dist_attr)
+                matmulv2_op_dist_attr.set_input_dist_attr(
+                    input_varname, tensor_dist_attr)
         for output_varname in mul_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
             assert output_dist_attr is not None, "dist_attr is {}".format(
@@ -1713,6 +1725,7 @@ def backward(ctx, *args, **kwargs):
 
 # RowParallel
 class DistributedMulImpl1(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMulImpl1, self).__init__(name)
         self._forward_implemented = True
@@ -1727,8 +1740,8 @@ def is_input_compatible(self, dist_op):
         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
         if is_dim_replicate(x_dims_mapping[-1]):
             return False
-        if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-                -1]):
+        if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(
+                y_dims_mapping[-1]):
             return False
         # Other dimensions must be replicate except the batch dimension
         for mapping in x_dims_mapping[1:-1]:
@@ -1851,11 +1864,10 @@ def forward(ctx, *args, **kwargs):
         ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
                                              out_var_dist_attr)
 
-        mul_op = main_block.append_op(
-            type='mul',
-            inputs=inputs,
-            outputs={'Out': intermediate_var_0},
-            attrs=attrs)
+        mul_op = main_block.append_op(type='mul',
+                                      inputs=inputs,
+                                      outputs={'Out': intermediate_var_0},
+                                      attrs=attrs)
         if intermediate_var_0.shape != ref_shape:
             intermediate_var_0.desc.set_shape(ref_shape)
 
@@ -1923,6 +1935,7 @@ def backward(ctx, *args, **kwargs):
 
 # ReplicateParallel
 class DistributedMulImpl2(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedMulImpl2, self).__init__(name)
 
@@ -1936,13 +1949,13 @@ def is_input_compatible(self, dist_op):
 
         if is_dim_shard(x_dims_mapping[-1]):
             return False
-        if is_valid_list_index(x_dims_mapping,
-                               -2) and is_dim_shard(x_dims_mapping[-2]):
+        if is_valid_list_index(x_dims_mapping, -2) and is_dim_shard(
+                x_dims_mapping[-2]):
             return False
         if is_dim_shard(y_dims_mapping[-1]):
             return False
-        if is_valid_list_index(y_dims_mapping,
-                               -2) and is_dim_shard(y_dims_mapping[-2]):
+        if is_valid_list_index(y_dims_mapping, -2) and is_dim_shard(
+                y_dims_mapping[-2]):
             return False
         return True
 
@@ -1956,8 +1969,8 @@ def is_output_compatible(self, dist_op):
 
         if is_dim_shard(out_dims_mapping[-1]):
             return False
-        if is_valid_list_index(out_dims_mapping,
-                               -2) and is_dim_shard(out_dims_mapping[-2]):
+        if is_valid_list_index(out_dims_mapping, -2) and is_dim_shard(
+                out_dims_mapping[-2]):
             return False
 
         return True
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py
index 4d52e5a94beb1..4629e4bef930e 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,6 +34,7 @@
 
 
 class DistributedPNorm(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedPNorm, self).__init__(op_type)
 
@@ -52,19 +53,21 @@ def _insert_fill_constant_op(block, op_role):
     attrs['value'] = int("1")
     attrs['dtype'] = out.dtype
     attrs['op_role'] = op_role
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant')
-    fill_constant_op = block.append_op(
-        type='fill_constant',
-        inputs=inputs,
-        outputs={'Out': [out]},
-        attrs=attrs)
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=[0],
+                                  op_type='fill_constant')
+    fill_constant_op = block.append_op(type='fill_constant',
+                                       inputs=inputs,
+                                       outputs={'Out': [out]},
+                                       attrs=attrs)
     out.stop_gradient = True
     return out, fill_constant_op
 
 
 # Row Parallel
 class DistributedPNormImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedPNormImpl, self).__init__(name)
         self._forward_implemented = True
@@ -193,15 +196,14 @@ def forward(ctx, *args, **kwargs):
         # set fill_constant op dist_attr
         constant_op_dist_attr = OperatorDistributedAttribute()
         constant_op_dist_attr.process_mesh = ref_process_mesh
-        constant_op_dist_attr.set_output_dims_mapping(fill_constant_out.name,
-                                                      constant_out_dims_mapping)
+        constant_op_dist_attr.set_output_dims_mapping(
+            fill_constant_out.name, constant_out_dims_mapping)
         ctx.set_op_dist_attr_for_program(fill_constant_op,
                                          constant_op_dist_attr)
-        barrier_op = main_block.append_op(
-            type='barrier',
-            inputs={'X': [fill_constant_out]},
-            outputs={'Out': [fill_constant_out]},
-            attrs={'ring_id': group.id})
+        barrier_op = main_block.append_op(type='barrier',
+                                          inputs={'X': [fill_constant_out]},
+                                          outputs={'Out': [fill_constant_out]},
+                                          attrs={'ring_id': group.id})
         # set barrier op dist attr
         set_comm_op_dist_attr_for_program(barrier_op, ref_process_mesh,
                                           constant_out_dist_attr, ctx)
@@ -223,16 +225,16 @@ def forward(ctx, *args, **kwargs):
         ]
         ctx.set_tensor_dist_attr_for_program(allgather_out,
                                              allgather_out_dist_attr)
-        c_allgather_op = main_block.append_op(
-            type='c_allgather',
-            inputs={'X': [X_var]},
-            outputs={'Out': [allgather_out]},
-            attrs={
-                'ring_id': group.id,
-                'use_calc_stream': True,
-                'nranks': group.nranks,
-                'op_role': src_op.attr('op_role')
-            })
+        c_allgather_op = main_block.append_op(type='c_allgather',
+                                              inputs={'X': [X_var]},
+                                              outputs={'Out': [allgather_out]},
+                                              attrs={
+                                                  'ring_id': group.id,
+                                                  'use_calc_stream': True,
+                                                  'nranks': group.nranks,
+                                                  'op_role':
+                                                  src_op.attr('op_role')
+                                              })
         # set c_allgather op dist_attr
         allgather_op_dist_attr = OperatorDistributedAttribute()
         allgather_op_dist_attr.process_mesh = op_dist_attr.process_mesh
@@ -344,11 +346,10 @@ def backward(ctx, *args, **kwargs):
             "infer_flags": infer_flags,
             "op_role": backward_op.attr('op_role')
         }
-        slice_op = main_block.append_op(
-            type='slice',
-            inputs={'Input': [new_X_grad]},
-            outputs={'Out': [X_grad_var]},
-            attrs=attrs)
+        slice_op = main_block.append_op(type='slice',
+                                        inputs={'Input': [new_X_grad]},
+                                        outputs={'Out': [X_grad_var]},
+                                        attrs=attrs)
         X_grad_var_dims_mapping = op_dist_attr.get_output_dims_mapping(
             X_grad_var.name)
         slice_op_dist_attr = OperatorDistributedAttribute()
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py
index 755dcab4be34f..6d750562c96d9 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py
@@ -34,6 +34,7 @@
 
 
 class DistributedReducePrimtive(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedReducePrimtive, self).__init__(op_type)
 
@@ -44,6 +45,7 @@ def __init__(self, op_type):
 
 # Batch Dimension Reduce Primitive
 class DistributedReducePrimtiveImpl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedReducePrimtiveImpl0, self).__init__(name)
         self._forward_implemented = True
@@ -107,26 +109,26 @@ def forward(ctx, *args, **kwargs):
                 output_name)
 
         # replicate op in dist program
-        dist_op_desc = main_block.append_op(type='nop').desc
+        dist_op_desc = main_block.desc.append_op()
         dist_op_desc.copy_from(src_op.desc)
         set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx)
         for input_name in src_op.desc.input_names():
             dist_op_desc.set_input(input_name, kwargs[input_name])
         for output_name in src_op.desc.output_names():
             dist_op_desc.set_output(output_name, kwargs[output_name])
+        main_block._sync_with_cpp()
 
         # batch dimension synchronization
         var_name = src_op.output_arg_names[0]
         sync_group = new_process_group(ctx.data_parallel_group)
-        allreduce_op = main_block.append_op(
-            type='c_allreduce_sum',
-            inputs={'X': [var_name]},
-            outputs={'Out': [var_name]},
-            attrs={
-                'ring_id': sync_group.id,
-                'use_calc_stream': True,
-                OP_ROLE_KEY: OpRole.Forward
-            })
+        allreduce_op = main_block.append_op(type='c_allreduce_sum',
+                                            inputs={'X': [var_name]},
+                                            outputs={'Out': [var_name]},
+                                            attrs={
+                                                'ring_id': sync_group.id,
+                                                'use_calc_stream': True,
+                                                OP_ROLE_KEY: OpRole.Forward
+                                            })
 
         # dist attr
         var = main_block.var(var_name)
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py
index da6ad933fd514..47a783a5f6d71 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py
@@ -31,6 +31,7 @@
 
 
 class DistributedReshape2(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedReshape2, self).__init__(op_type)
 
@@ -39,6 +40,7 @@ def __init__(self, op_type):
 
 
 class DistributedReshapeImpl0(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedReshapeImpl0, self).__init__(name)
         self._forward_implemented = True
@@ -171,8 +173,8 @@ def forward(ctx, *args, **kwargs):
         for idx, axis in enumerate(dim_mapping):
             if axis >= 0:
                 if len(shape_list) > idx:
-                    shape_list[idx] = shape_list[idx] // process_mesh_shape[
-                        axis]
+                    shape_list[
+                        idx] = shape_list[idx] // process_mesh_shape[axis]
 
         # create op
         new_op_desc = main_block.desc.append_op()
@@ -193,6 +195,7 @@ def backward(ctx, *args, **kwargs):
 
 
 class DistributedReshapeImpl1(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedReshapeImpl1, self).__init__(name)
         self._forward_implemented = True
@@ -328,8 +331,8 @@ def forward(ctx, *args, **kwargs):
         for idx, axis in enumerate(dim_mapping):
             if axis >= 0:
                 if len(shape_list) > idx:
-                    shape_list[idx] = shape_list[idx] // process_mesh_shape[
-                        axis]
+                    shape_list[
+                        idx] = shape_list[idx] // process_mesh_shape[axis]
 
         # create op
         new_op_desc = main_block.desc.append_op()
@@ -350,6 +353,7 @@ def backward(ctx, *args, **kwargs):
 
 
 class DistributedReshapeImpl2(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedReshapeImpl2, self).__init__(name)
         self._forward_implemented = True
@@ -478,8 +482,8 @@ def forward(ctx, *args, **kwargs):
         for idx, axis in enumerate(out_dim_mapping):
             if axis >= 0:
                 if len(shape_list) > idx:
-                    shape_list[idx] = shape_list[idx] // process_mesh_shape[
-                        axis]
+                    shape_list[
+                        idx] = shape_list[idx] // process_mesh_shape[axis]
 
         # create op
         new_op_desc = main_block.desc.append_op()
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_slice.py b/python/paddle/distributed/auto_parallel/operators/dist_slice.py
index e3da47fd172ea..a37421ce61247 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_slice.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_slice.py
@@ -23,6 +23,7 @@
 
 
 class DistributedSlice(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedSlice, self).__init__(op_type)
 
@@ -31,6 +32,7 @@ def __init__(self, op_type):
 
 
 class DistributedSliceImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedSliceImpl, self).__init__(name)
         self._forward_implemented = True
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py
index f78f1c58dbf07..afcdea4f0455d 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py
@@ -26,6 +26,7 @@
 
 
 class DistributedSoftmax(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedSoftmax, self).__init__(op_type)
 
@@ -34,6 +35,7 @@ def __init__(self, op_type):
 
 
 class DistributedSoftmaxImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedSoftmaxImpl, self).__init__(name)
         self._forward_implemented = False
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_split.py b/python/paddle/distributed/auto_parallel/operators/dist_split.py
index 289da80e1a7ae..8f89020b53ca4 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_split.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_split.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class DistributedSplit(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedSplit, self).__init__(op_type)
 
@@ -33,6 +34,7 @@ def __init__(self, op_type):
 
 
 class DistributedSplitImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedSplitImpl, self).__init__(name)
         self._forward_implemented = True
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py
index e6a96fb795ef8..0dc4177399e74 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py
@@ -26,6 +26,7 @@
 
 
 class DistributedTranspose2(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedTranspose2, self).__init__(op_type)
 
@@ -35,6 +36,7 @@ def __init__(self, op_type):
 
 
 class DistributedTranspose2Impl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedTranspose2Impl, self).__init__(name)
         self._forward_implemented = False
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py
index 4ea2e0a884716..9666f882200e5 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py
@@ -20,6 +20,7 @@
 
 
 class DistributedUpdateLossScaling(DistributedOperatorImplContainer):
+
     def __init__(self, op_type):
         super(DistributedUpdateLossScaling, self).__init__(op_type)
 
@@ -29,6 +30,7 @@ def __init__(self, op_type):
 
 
 class DistributedUpdateLossScalingImpl(DistributedOperatorImpl):
+
     def __init__(self, name):
         super(DistributedUpdateLossScalingImpl, self).__init__(name)
         self._name = name
diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py
index 2ea1223c6f2f3..1ad85598101a6 100644
--- a/python/paddle/distributed/auto_parallel/parallelizer.py
+++ b/python/paddle/distributed/auto_parallel/parallelizer.py
@@ -108,8 +108,8 @@ def _apply_pre_optimization_passes(self, main_program, startup_program,
             if config["use_pure_fp16"]:
                 config["base_opt"] = self._optimizer
                 auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config)
-                auto_parallel_fp16_pass.apply(
-                    [main_program], [startup_program], self._pass_context)
+                auto_parallel_fp16_pass.apply([main_program], [startup_program],
+                                              self._pass_context)
             else:
                 auto_parallel_amp_pass = new_pass("auto_parallel_amp", config)
                 auto_parallel_amp_pass.apply([main_program], [startup_program],
@@ -123,8 +123,9 @@ def _apply_pre_optimization_passes(self, main_program, startup_program,
             config["loss"] = loss
             auto_parallel_recompute_pass = new_pass("auto_parallel_recompute",
                                                     config)
-            auto_parallel_recompute_pass.apply(
-                [main_program], [startup_program], self._pass_context)
+            auto_parallel_recompute_pass.apply([main_program],
+                                               [startup_program],
+                                               self._pass_context)
 
     def _generate_backward(self, main_program, startup_program, loss,
                            parameter_list, no_grad_set, callbacks):
@@ -144,10 +145,10 @@ def _generate_backward(self, main_program, startup_program, loss,
     def _apply_optimize(self, main_program, startup_program, params_grads):
 
         with program_guard(main_program, startup_program):
-            optimize_ops = copy.deepcopy(self._optimizer).apply_gradients(
-                params_grads)
+            optimize_ops = copy.deepcopy(
+                self._optimizer).apply_gradients(params_grads)
 
-        # update completion 
+        # update completion
         self._completer = Completer(self._dist_context)
         self._completer.complete_update_annotation(main_program)
 
@@ -163,8 +164,8 @@ def _apply_post_optimization_passes(self, main_program, startup_program,
             config["global_rank"] = rank
             auto_parallel_sharding_pass = new_pass("auto_parallel_sharding",
                                                    config)
-            auto_parallel_sharding_pass.apply(
-                [main_program], [startup_program], self._pass_context)
+            auto_parallel_sharding_pass.apply([main_program], [startup_program],
+                                              self._pass_context)
 
         if self._dist_strategy.gradient_merge:
             config = copy.deepcopy(self._dist_strategy.gradient_merge_configs)
@@ -172,8 +173,9 @@ def _apply_post_optimization_passes(self, main_program, startup_program,
             config["params_grads"] = params_grads
             auto_parallel_gradient_merge_pass = new_pass(
                 "auto_parallel_gradient_merge_pass", config)
-            auto_parallel_gradient_merge_pass.apply(
-                [main_program], [startup_program], self._pass_context)
+            auto_parallel_gradient_merge_pass.apply([main_program],
+                                                    [startup_program],
+                                                    self._pass_context)
 
     def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False):
         completed_main_program = None
@@ -181,7 +183,7 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False):
         serial_startup_program = self._startup_program.clone()
         serial_loss = serial_main_program.global_block().var(self._loss.name)
 
-        # generating serial 
+        # generating serial
         if dist_context is None:
             # Annotation completion
             self._dist_context = DistributedContext()
@@ -205,15 +207,16 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False):
         self._apply_pre_optimization_passes(completed_main_program,
                                             serial_startup_program, serial_loss,
                                             params_grads, self._no_grad_set)
-        # Logical partition 
+        # Logical partition
         partitioner = Partitioner(self._dist_context, rank)
         dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition(
             completed_main_program, serial_startup_program, params_grads)
 
         # TODO refactor the placement of optimizer
         # generate optimize program
-        dist_optimize_ops = self._apply_optimize(
-            dist_main_prog, dist_startup_prog, dist_params_grads)
+        dist_optimize_ops = self._apply_optimize(dist_main_prog,
+                                                 dist_startup_prog,
+                                                 dist_params_grads)
 
         set_grad_var_shape(dist_main_prog, self._dist_context)
 
@@ -258,14 +261,17 @@ def parallelize(self,
             # auto search
             if self._dist_strategy.auto_search:
                 logging.info("Start searching dist attr.")
-                serial_program_info = SerialProgramInfo(
-                    self._main_program, self._startup_program, self._loss,
-                    self._optimizer, self._cluster)
-                planner = Planner(
-                    serial_program_info,
-                    self,
-                    algorithm_config={"name": "mcmc",
-                                      "max_search_times": 5})
+                serial_program_info = SerialProgramInfo(self._main_program,
+                                                        self._startup_program,
+                                                        self._loss,
+                                                        self._optimizer,
+                                                        self._cluster)
+                planner = Planner(serial_program_info,
+                                  self,
+                                  algorithm_config={
+                                      "name": "mcmc",
+                                      "max_search_times": 5
+                                  })
                 dist_context, _ = planner.search()
                 logging.info("End searching dist attr.")
 
@@ -325,8 +331,8 @@ def parallelize(self,
             else:
                 coverage_args = []
             new_cmd_args = "-m paddle.distributed.fleet.launch" + " " + rank_mapping_args + " " + original_cmd_args
-            new_cmd = [sys.executable, "-u"] + coverage_args + shlex.split(
-                new_cmd_args)
+            new_cmd = [sys.executable, "-u"
+                       ] + coverage_args + shlex.split(new_cmd_args)
             new_process = subprocess.Popen(new_cmd)
             new_process.wait()
             assert new_process.returncode == 0, \
@@ -368,13 +374,12 @@ def parallelize(self,
                         self._loss,
                         self._optimizer,
                         cluster=self._cluster)
-                    planner = Planner(
-                        serial_program_info,
-                        self,
-                        algorithm_config={
-                            "name": "mcmc",
-                            "max_search_times": 5
-                        })
+                    planner = Planner(serial_program_info,
+                                      self,
+                                      algorithm_config={
+                                          "name": "mcmc",
+                                          "max_search_times": 5
+                                      })
                     dist_context, _ = planner.search()
 
             # rebuild g_process_group
diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py
index 218513323dffb..ce543988ea4e1 100644
--- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py
+++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py
@@ -31,6 +31,7 @@
 
 
 class Parallelizer:
+
     def __init__(self, mode, completer, dist_context):
         self._mode = mode
         self._completer = completer
@@ -54,8 +55,9 @@ def parallel(self, rank):
         if self._mode == "train" and serial_optimizer:
             # Generate backward
             serial_loss = self._dist_context.serial_loss
-            params_grads = self._generate_backward(
-                serial_main_program, serial_startup_program, serial_loss)
+            params_grads = self._generate_backward(serial_main_program,
+                                                   serial_startup_program,
+                                                   serial_loss)
             # Apply pre optimization passes
             self._apply_pre_optimization(serial_main_program,
                                          serial_startup_program, serial_loss,
@@ -78,8 +80,9 @@ def parallel(self, rank):
                                           rank, dist_params_grads)
         else:
             # Apply pre optimization passes
-            self._apply_pre_optimization(
-                serial_main_program, serial_startup_program, None, None, None)
+            self._apply_pre_optimization(serial_main_program,
+                                         serial_startup_program, None, None,
+                                         None)
             # Do logical partition
             partitioner = Partitioner(self._dist_context, rank)
             dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition(
@@ -128,8 +131,8 @@ def _apply_pre_optimization(self, main_program, startup_program, loss,
             if config["use_pure_fp16"]:
                 config["base_opt"] = optimizer
                 auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config)
-                auto_parallel_fp16_pass.apply(
-                    [main_program], [startup_program], self._pass_context)
+                auto_parallel_fp16_pass.apply([main_program], [startup_program],
+                                              self._pass_context)
             else:
                 auto_parallel_amp_pass = new_pass("auto_parallel_amp", config)
                 auto_parallel_amp_pass.apply([main_program], [startup_program],
@@ -143,8 +146,9 @@ def _apply_pre_optimization(self, main_program, startup_program, loss,
             config["loss"] = loss
             auto_parallel_recompute_pass = new_pass("auto_parallel_recompute",
                                                     config)
-            auto_parallel_recompute_pass.apply(
-                [main_program], [startup_program], self._dist_context)
+            auto_parallel_recompute_pass.apply([main_program],
+                                               [startup_program],
+                                               self._dist_context)
 
     def _apply_post_optimization(self, main_program, startup_program, rank,
                                  params_grads):
@@ -157,8 +161,8 @@ def _apply_post_optimization(self, main_program, startup_program, rank,
             config["global_rank"] = rank
             auto_parallel_sharding_pass = new_pass("auto_parallel_sharding",
                                                    config)
-            auto_parallel_sharding_pass.apply(
-                [main_program], [startup_program], self._dist_context)
+            auto_parallel_sharding_pass.apply([main_program], [startup_program],
+                                              self._dist_context)
 
         if self._strategy.gradient_merge:
             config = copy.deepcopy(self._strategy.gradient_merge_configs)
@@ -166,5 +170,6 @@ def _apply_post_optimization(self, main_program, startup_program, rank,
             config["params_grads"] = params_grads
             auto_parallel_gradient_merge_pass = new_pass(
                 "auto_parallel_gradient_merge_pass", config)
-            auto_parallel_gradient_merge_pass.apply(
-                [main_program], [startup_program], self._dist_context)
+            auto_parallel_gradient_merge_pass.apply([main_program],
+                                                    [startup_program],
+                                                    self._dist_context)
diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py
index ce686fd6a5683..9056ab34fa711 100644
--- a/python/paddle/distributed/auto_parallel/partitioner.py
+++ b/python/paddle/distributed/auto_parallel/partitioner.py
@@ -25,7 +25,7 @@
 from .dist_attribute import OperatorDistributedAttribute
 from .process_group import new_process_group
 from .utils import set_dist_op_desc_original_id
-from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op
+from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op, is_optimize_op
 from .operators.common import BACKWARD_ONLY_DIST_OPS
 
 __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"]
@@ -187,8 +187,8 @@ def partition_main_program(self, serial_main_program, params_and_grads):
                 dist_g = None
             else:
                 assert g.name in self._serial2dist_varname_mapping
-                dist_g = self._get_dist_var_by_serial_var(g,
-                                                          partitioned_main_prog)
+                dist_g = self._get_dist_var_by_serial_var(
+                    g, partitioned_main_prog)
             partitioned_params_and_grads.append((dist_p, dist_g))
 
         return partitioned_main_prog, partitioned_params_and_grads
@@ -211,15 +211,15 @@ def partition_block(self, ref_block, target_block):
         forward_op_id2forward_op = {}
         for idx in range(len(serial_ops)):
             if idx <= last_fwd_op_idx:
-                forward_op_id2forward_op[serial_ops[idx].desc.original_id(
-                )] = serial_ops[idx]
+                forward_op_id2forward_op[
+                    serial_ops[idx].desc.original_id()] = serial_ops[idx]
 
         appended_grad_times = 0
         # partiiton
         for idx, op in enumerate(serial_ops):
 
-            if is_backward_op(op) and (is_forward_op(serial_ops[idx - 1]) or
-                                       is_loss_op(serial_ops[idx - 1])):
+            if is_backward_op(op) and (is_forward_op(serial_ops[idx - 1])
+                                       or is_loss_op(serial_ops[idx - 1])):
                 appended_grad_times += 1
 
             # partititon input variables
@@ -263,15 +263,15 @@ def partition_block(self, ref_block, target_block):
                 dist_op_backward_impl.backward(
                     self._dist_context, **kinputs, **koutputs,
                     **{"grad_var_to_var": grad_var_to_var})
-            elif int(op.attr('op_role')) == 2:
+            elif is_optimize_op(op):
                 kinputs, koutputs = dist_op_context.prepare_context(op)
                 dist_op_impl = get_distributed_operator_impl_container(
                     "default").get_impl(0)
                 dist_op_impl.backward(self._dist_context, **kinputs, **koutputs)
             else:
                 raise NotImplementedError(
-                    "partitioner only support forward op and backward op, but got {}".
-                    format(str(op)))
+                    "partitioner only support forward and backward, optimize ops, but got {}"
+                    .format(str(op)))
 
     def _is_valid_annotated_program(self, program):
 
@@ -338,35 +338,33 @@ def _partition_parameter(dist_context, src_var, dst_block, dst_varname,
     copied_kwargs['do_model_average'] = src_var.do_model_average
     copied_kwargs['need_clip'] = src_var.need_clip
 
-    param = Parameter(
-        block=dst_block,
-        type=src_var.type,
-        name=dst_varname,
-        shape=dst_shape,
-        dtype=src_var.dtype,
-        lod_level=src_var.lod_level,
-        error_clip=src_var.error_clip,
-        stop_gradient=src_var.stop_gradient,
-        is_data=src_var.is_data,
-        belong_to_optimizer=src_var.belong_to_optimizer,
-        **copied_kwargs)
+    param = Parameter(block=dst_block,
+                      type=src_var.type,
+                      name=dst_varname,
+                      shape=dst_shape,
+                      dtype=src_var.dtype,
+                      lod_level=src_var.lod_level,
+                      error_clip=src_var.error_clip,
+                      stop_gradient=src_var.stop_gradient,
+                      is_data=src_var.is_data,
+                      belong_to_optimizer=src_var.belong_to_optimizer,
+                      **copied_kwargs)
 
     return param
 
 
 def _partition_intermediate_var(dist_context, src_var, dst_block, dst_varname,
                                 dst_shape):
-    var = dst_block.create_var(
-        type=src_var.type,
-        name=dst_varname,
-        shape=dst_shape,
-        dtype=src_var.dtype,
-        lod_level=src_var.lod_level,
-        persistable=src_var.persistable,
-        error_clip=src_var.error_clip,
-        stop_gradient=src_var.stop_gradient,
-        is_data=src_var.is_data,
-        belong_to_optimizer=src_var.belong_to_optimizer)
+    var = dst_block.create_var(type=src_var.type,
+                               name=dst_varname,
+                               shape=dst_shape,
+                               dtype=src_var.dtype,
+                               lod_level=src_var.lod_level,
+                               persistable=src_var.persistable,
+                               error_clip=src_var.error_clip,
+                               stop_gradient=src_var.stop_gradient,
+                               is_data=src_var.is_data,
+                               belong_to_optimizer=src_var.belong_to_optimizer)
 
     return var
 
@@ -380,11 +378,10 @@ def _partition_var(dist_context, src_block, dst_block, src_varname,
 
     if src_var.type in __not_shape_var_type__:
         persist = getattr(src_var, 'persistable', False)
-        new_var = dst_block.create_var(
-            type=src_var.type,
-            name=dst_varname,
-            persistable=persist,
-            stop_gradient=True)
+        new_var = dst_block.create_var(type=src_var.type,
+                                       name=dst_varname,
+                                       persistable=persist,
+                                       stop_gradient=True)
         target_shape = None
     else:
         dist_attr = dist_context.get_tensor_dist_attr_for_program(src_var)
@@ -394,8 +391,9 @@ def _partition_var(dist_context, src_block, dst_block, src_varname,
             new_var = _partition_parameter(dist_context, src_var, dst_block,
                                            dst_varname, target_shape)
         else:
-            new_var = _partition_intermediate_var(
-                dist_context, src_var, dst_block, dst_varname, target_shape)
+            new_var = _partition_intermediate_var(dist_context, src_var,
+                                                  dst_block, dst_varname,
+                                                  target_shape)
 
     dist_attr = copy.deepcopy(
         dist_context.get_tensor_dist_attr_for_program(src_var))
diff --git a/python/paddle/distributed/auto_parallel/planner.py b/python/paddle/distributed/auto_parallel/planner.py
index b97c09bd59da8..701fd78a7e8b9 100755
--- a/python/paddle/distributed/auto_parallel/planner.py
+++ b/python/paddle/distributed/auto_parallel/planner.py
@@ -41,6 +41,7 @@
 
 
 class PlanFilter:
+
     @staticmethod
     def check_dims_mapping_for_tensor(process_mesh_topology, tensor_shape,
                                       dims_mapping):
@@ -82,7 +83,7 @@ def check_dims_mapping_for_op(op, op_dist_attr, vars):
 
     @staticmethod
     def check_dims_mapping_for_special_op(op, op_dist_attr, vars):
-        # NOTE: Those ops has some partition limits, and will be solved when corresponding dist op implemented in the future. 
+        # NOTE: Those ops has some partition limits, and will be solved when corresponding dist op implemented in the future.
         if op.type == "elementwise_add" or op.type == 'layer_norm' or op.type == "softmax_with_cross_entropy":
             for name in op.input_arg_names:
                 for item in op_dist_attr.get_input_dims_mapping(name):
@@ -188,8 +189,7 @@ def _enum_valid_dist_attr_for_op(program, op, process_mesh):
         for var_name in chain(op.input_arg_names, op.output_arg_names):
             visited = [
                 False
-                for _ in range(
-                    len(list(range(-1, len(process_mesh.topology)))))
+                for _ in range(len(list(range(-1, len(process_mesh.topology)))))
             ]
             depth = 0
             path = []
@@ -213,13 +213,12 @@ def _enum_valid_dist_attr_for_op(program, op, process_mesh):
                     op_dist_attr.set_input_dims_mapping(var_names[idx],
                                                         dims_mapping)
                 elif var_names[idx] in op.output_arg_names:
-                    op_dist_attr.set_output_dims_mapping(var_names[idx],
-                                                         dims_mapping)
+                    op_dist_attr.set_output_dims_mapping(
+                        var_names[idx], dims_mapping)
                 else:
                     raise ValueError(
                         "The {varname} is not input or output of op {op}.".
-                        format(
-                            varname='var_names[idx]', op='op'))
+                        format(varname='var_names[idx]', op='op'))
 
             dist_op = DistributedOperator(op, op_dist_attr)
             if dist_op_impl_container is None:
@@ -339,16 +338,16 @@ def enum_valid_dist_attr_for_program(program,
                         op_dist_attr.set_input_dims_mapping(var_name, [])
                     else:
                         dims_mapping = [-1 for i in vars[var_name].shape]
-                        op_dist_attr.set_input_dims_mapping(var_name,
-                                                            dims_mapping)
+                        op_dist_attr.set_input_dims_mapping(
+                            var_name, dims_mapping)
 
                 for var_name in op.output_arg_names:
                     if var_name in PlanSpace.special_vars:
                         op_dist_attr.set_output_dims_mapping(var_name, [])
                     else:
                         dims_mapping = [-1 for i in vars[var_name].shape]
-                        op_dist_attr.set_output_dims_mapping(var_name,
-                                                             dims_mapping)
+                        op_dist_attr.set_output_dims_mapping(
+                            var_name, dims_mapping)
                 op_valid_dist_attrs = [op_dist_attr]
                 pipeline_stage = 0 if pipeline_stage != -1 else pipeline_stage
             else:
@@ -357,13 +356,15 @@ def enum_valid_dist_attr_for_program(program,
 
             assert op_valid_dist_attrs is not None, "Enumerate {} valid distributed attribute failed.".format(
                 op)
-            valid_dist_attr_dict[op.desc.id(
-            )] = [op_valid_dist_attrs, pipeline_stage]
+            valid_dist_attr_dict[op.desc.id()] = [
+                op_valid_dist_attrs, pipeline_stage
+            ]
 
         return valid_dist_attr_dict, pipeline_process_meshes, global_process_mesh
 
 
 class SearchAlgorithm:
+
     def __init__(self, name):
         self._name = name
 
@@ -376,6 +377,7 @@ def search(self):
 
 
 class MCMC(SearchAlgorithm):
+
     def __init__(self, serial_program_info, parallelizer, max_search_times=5):
         super(MCMC, self).__init__("mcmc")
         self._serial_program_info = serial_program_info
@@ -426,7 +428,8 @@ def make_special_op_unshard(self, op, ops, vars, dist_context,
                             break
                     if not has_changed:
                         raise ValueError(
-                            "Change softmax_with_cross_entropy dist attr failed")
+                            "Change softmax_with_cross_entropy dist attr failed"
+                        )
 
     def init_program(self, valid_dist_attr_dict, program,
                      pipeline_process_meshes, global_process_mesh):
@@ -443,8 +446,8 @@ def init_program(self, valid_dist_attr_dict, program,
             for var_name in op.input_arg_names:
                 if var_name == "lod_tensor_blocking_queue_0":
                     continue
-                if new_dist_context.get_tensor_dist_attr_for_program(vars[
-                        var_name]) is None:
+                if new_dist_context.get_tensor_dist_attr_for_program(
+                        vars[var_name]) is None:
                     tensor_dist_attr = TensorDistributedAttribute()
                     tensor_dist_attr.process_mesh = init_op_dist_attr.process_mesh
                     tensor_dist_attr.dims_mapping = init_op_dist_attr.get_input_dims_mapping(
@@ -498,12 +501,11 @@ def estimate_searched_strategy_cost(self,
         standalone_cost_data = get_standalone_cost_data(all_dist_main_program)
 
         # cost model does not support cluster argument
-        cost = estimate_cost(
-            all_dist_main_program,
-            cluster=None,
-            pipeline_config=pipeline_config,
-            standalone_cost_data=standalone_cost_data,
-            batch_size=microbatch_size)
+        cost = estimate_cost(all_dist_main_program,
+                             cluster=None,
+                             pipeline_config=pipeline_config,
+                             standalone_cost_data=standalone_cost_data,
+                             batch_size=microbatch_size)
 
         return cost
 
@@ -515,8 +517,8 @@ def set_tensor_dist_attr(self, op, op_dist_attr, vars, dist_context):
             tensor_dist_attr.process_mesh = process_mesh
             tensor_dist_attr.dims_mapping = op_dist_attr.get_output_dims_mapping(
                 var_name)
-            dist_context.set_tensor_dist_attr_for_program(vars[var_name],
-                                                          tensor_dist_attr)
+            dist_context.set_tensor_dist_attr_for_program(
+                vars[var_name], tensor_dist_attr)
 
         # set input tensor distributed attribute if input is data or parameter
         for var_name in op.input_arg_names:
@@ -526,19 +528,19 @@ def set_tensor_dist_attr(self, op, op_dist_attr, vars, dist_context):
                 tensor_dist_attr.process_mesh = process_mesh
                 tensor_dist_attr.dims_mapping = op_dist_attr.get_input_dims_mapping(
                     var_name)
-                dist_context.set_tensor_dist_attr_for_program(vars[var_name],
-                                                              tensor_dist_attr)
+                dist_context.set_tensor_dist_attr_for_program(
+                    vars[var_name], tensor_dist_attr)
 
     def change_process_mesh(self, op, changed_process_mesh, vars, dist_context):
         dist_context.get_op_dist_attr_for_program(
             op).process_mesh = changed_process_mesh
         for var_name in op.output_arg_names:
-            dist_context.get_tensor_dist_attr_for_program(vars[
-                var_name]).process_mesh = changed_process_mesh
+            dist_context.get_tensor_dist_attr_for_program(
+                vars[var_name]).process_mesh = changed_process_mesh
         for var_name in op.input_arg_names:
             if vars[var_name].is_parameter or vars[var_name].is_data:
-                dist_context.get_tensor_dist_attr_for_program(vars[
-                    var_name]).process_mesh = changed_process_mesh
+                dist_context.get_tensor_dist_attr_for_program(
+                    vars[var_name]).process_mesh = changed_process_mesh
 
     def search_once(self,
                     program,
@@ -561,8 +563,8 @@ def search_once(self,
         pipeline_stage = valid_dist_attr_dict[selected_op.desc.id()][1]
         random_selected_dist_attr_idx = np.random.randint(
             len(op_valid_dist_attr_list))
-        selected_op_dist_attr = copy.deepcopy(op_valid_dist_attr_list[
-            random_selected_dist_attr_idx])
+        selected_op_dist_attr = copy.deepcopy(
+            op_valid_dist_attr_list[random_selected_dist_attr_idx])
 
         start_idx = ops[0].desc.id()
         if pipeline_stage > -1:
@@ -604,8 +606,8 @@ def search_once(self,
                             selected_op_process_mesh, vars, new_dist_context)
 
                     # change the selected op stage and output dist attr
-                    new_valid_dist_attr_dict[selected_op.desc.id()][
-                        1] = changed_stage
+                    new_valid_dist_attr_dict[
+                        selected_op.desc.id()][1] = changed_stage
                     new_process_mesh = pipeline_process_meshes[changed_stage]
                     selected_op_dist_attr.process_mesh = new_process_mesh
                     for op_dist_attr in new_valid_dist_attr_dict[
@@ -621,17 +623,17 @@ def search_once(self,
                     # change the pre op stage
                     for idx in range(random_selected_op_idx - 1, -1, -1):
                         stage = new_valid_dist_attr_dict[ops[idx].desc.id()][1]
-                        valid_dist_attr_list = new_valid_dist_attr_dict[ops[
-                            idx].desc.id()][0]
+                        valid_dist_attr_list = new_valid_dist_attr_dict[
+                            ops[idx].desc.id()][0]
                         new_process_mesh = pipeline_process_meshes[
                             changed_stage]
                         if stage == changed_stage + 1:
-                            new_valid_dist_attr_dict[ops[idx].desc.id()][
-                                1] = changed_stage
+                            new_valid_dist_attr_dict[
+                                ops[idx].desc.id()][1] = changed_stage
                             for op_dist_attr in valid_dist_attr_list:
                                 op_dist_attr.process_mesh = new_process_mesh
-                            new_dist_context.get_op_dist_attr_for_program(ops[
-                                idx]).process_mesh = new_process_mesh
+                            new_dist_context.get_op_dist_attr_for_program(
+                                ops[idx]).process_mesh = new_process_mesh
                             # change process mesh of the output and input tensor
                             self.change_process_mesh(ops[idx], new_process_mesh,
                                                      vars, new_dist_context)
@@ -665,8 +667,8 @@ def search_once(self,
                             selected_op_process_mesh, vars, new_dist_context)
 
                     # change the selected op stage and output tensor dist attr
-                    new_valid_dist_attr_dict[selected_op.desc.id()][
-                        1] = changed_stage
+                    new_valid_dist_attr_dict[
+                        selected_op.desc.id()][1] = changed_stage
                     new_process_mesh = pipeline_process_meshes[changed_stage]
                     selected_op_dist_attr.process_mesh = new_process_mesh
                     for op_dist_attr in new_valid_dist_attr_dict[
@@ -681,26 +683,26 @@ def search_once(self,
                     # change the next op stage
                     for idx in range(random_selected_op_idx + 1, len(ops)):
                         stage = new_valid_dist_attr_dict[ops[idx].desc.id()][1]
-                        valid_dist_attr_list = new_valid_dist_attr_dict[ops[
-                            idx].desc.id()][0]
+                        valid_dist_attr_list = new_valid_dist_attr_dict[
+                            ops[idx].desc.id()][0]
                         new_process_mesh = pipeline_process_meshes[
                             changed_stage]
                         if stage == changed_stage - 1:
-                            new_valid_dist_attr_dict[ops[idx].desc.id()][
-                                1] = changed_stage
+                            new_valid_dist_attr_dict[
+                                ops[idx].desc.id()][1] = changed_stage
                             for op_dist_attr in valid_dist_attr_list:
                                 op_dist_attr.process_mesh = new_process_mesh
 
-                            new_dist_context.get_op_dist_attr_for_program(ops[
-                                idx]).process_mesh = new_process_mesh
+                            new_dist_context.get_op_dist_attr_for_program(
+                                ops[idx]).process_mesh = new_process_mesh
                             # change the output tensor dist attr
                             self.change_process_mesh(ops[idx], new_process_mesh,
                                                      vars, new_dist_context)
                         else:
                             break
         else:
-            new_dist_context.set_op_dist_attr_for_program(selected_op,
-                                                          selected_op_dist_attr)
+            new_dist_context.set_op_dist_attr_for_program(
+                selected_op, selected_op_dist_attr)
             self.set_tensor_dist_attr(selected_op, selected_op_dist_attr, vars,
                                       new_dist_context)
 
@@ -759,15 +761,16 @@ def search(self):
                   format(process_mesh_topology))
             valid_dist_attr_dict, pipeline_process_meshes, global_process_mesh = PlanSpace.enum_valid_dist_attr_for_program(
                 train_program, process_mesh_topology, True)
-            init_dist_context = self.init_program(
-                valid_dist_attr_dict, train_program, pipeline_process_meshes,
-                global_process_mesh)
-            best_dist_context, cost = self._search_core(valid_dist_attr_dict,
-                                                        init_dist_context,
-                                                        pipeline_process_meshes)
+            init_dist_context = self.init_program(valid_dist_attr_dict,
+                                                  train_program,
+                                                  pipeline_process_meshes,
+                                                  global_process_mesh)
+            best_dist_context, cost = self._search_core(
+                valid_dist_attr_dict, init_dist_context,
+                pipeline_process_meshes)
             print(
-                "MCMC search: the min cost is {} in the process mesh {} with pipeline mode.".
-                format(cost, process_mesh_topology))
+                "MCMC search: the min cost is {} in the process mesh {} with pipeline mode."
+                .format(cost, process_mesh_topology))
             best_dist_context._dist_op_context = DistributedOperatorContext()
             pipeline_min_cost = cost if pipeline_min_cost is None else pipeline_min_cost
             searched_pipeline_dist_context = best_dist_context if searched_pipeline_dist_context is None else searched_pipeline_dist_context
@@ -785,15 +788,16 @@ def search(self):
                   format(process_mesh_topology))
             valid_dist_attr_dict, pipeline_process_meshes, global_process_mesh = PlanSpace.enum_valid_dist_attr_for_program(
                 train_program, process_mesh_topology, False)
-            init_dist_context = self.init_program(
-                valid_dist_attr_dict, train_program, pipeline_process_meshes,
-                global_process_mesh)
-            best_dist_context, cost = self._search_core(valid_dist_attr_dict,
-                                                        init_dist_context,
-                                                        pipeline_process_meshes)
+            init_dist_context = self.init_program(valid_dist_attr_dict,
+                                                  train_program,
+                                                  pipeline_process_meshes,
+                                                  global_process_mesh)
+            best_dist_context, cost = self._search_core(
+                valid_dist_attr_dict, init_dist_context,
+                pipeline_process_meshes)
             print(
-                "MCMC search: the min cost is {} in the process mesh {} without pipeline mode.".
-                format(cost, process_mesh_topology))
+                "MCMC search: the min cost is {} in the process mesh {} without pipeline mode."
+                .format(cost, process_mesh_topology))
             best_dist_context._dist_op_context = DistributedOperatorContext()
             non_pipeline_min_cost = cost if non_pipeline_min_cost is None else non_pipeline_min_cost
             searched_non_pipeline_dist_context = best_dist_context if searched_non_pipeline_dist_context is None else searched_non_pipeline_dist_context
@@ -817,13 +821,16 @@ def search(self):
             pg0.add_ranks(process_mesh.processes)
         end_time = time.time()
         print(
-            "End MCMC searching: the min cost is {} and the search time is {}s.".
-            format(min_cost, end_time - start_time))
+            "End MCMC searching: the min cost is {} and the search time is {}s."
+            .format(min_cost, end_time - start_time))
         return searched_dist_context, min_cost
 
 
 class Planner:
-    def __init__(self, serial_program_info, parallelizer,
+
+    def __init__(self,
+                 serial_program_info,
+                 parallelizer,
                  algorithm_config=None):
         self._serial_program_info = serial_program_info
         self._parallelizer = parallelizer
diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py
index 3625a25d74e0e..77496ed3e6d20 100755
--- a/python/paddle/distributed/auto_parallel/planner_v2.py
+++ b/python/paddle/distributed/auto_parallel/planner_v2.py
@@ -20,6 +20,7 @@
 
 
 class Planner:
+
     def __init__(self, mode, dist_context):
         self._mode = mode
         self._dist_context = dist_context
diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py
index d1b6e57ddc123..d583dcb32eb22 100644
--- a/python/paddle/distributed/auto_parallel/process_group.py
+++ b/python/paddle/distributed/auto_parallel/process_group.py
@@ -40,7 +40,7 @@ def get_world_process_group():
 
 def new_process_group(ranks):
     global _g_process_group_map
-    # A key constructed from ranks is used for avoiding duplication 
+    # A key constructed from ranks is used for avoiding duplication
     new_key = ''.join(map(str, sorted(ranks)))
     for pg_id, pg in _g_process_group_map.items():
         cur_key = ''.join(map(str, sorted(pg.ranks)))
@@ -57,12 +57,13 @@ def new_process_group(ranks):
 
 
 # This implementation refers to lots of Paddle/python/paddle/distributed/collective.py,
-# Fleet also has a collective helper which uses ops to initialize communication in 
+# Fleet also has a collective helper which uses ops to initialize communication in
 # Paddle/python/paddle/distributed/fleet/meta_optimizers/common.py. We use the first one
-# because it seems simple. This should be enhanced to manage the process membership and 
-# the instantiation process in a more general way. In the future, the process group may 
+# because it seems simple. This should be enhanced to manage the process membership and
+# the instantiation process in a more general way. In the future, the process group may
 # handle the communication implementation choice.
 class ProcessGroup:
+
     def __init__(self, group_id, ranks):
         if group_id == 0 and get_process_group(0) is not None:
             assert group_id != 0, "Process group id 0 is reserved for all ranks."
@@ -129,7 +130,7 @@ def instantiate(self):
             else:
                 assert False, ("No CUDA device found")
 
-        # TODO(shenliang03): This is a temporary solution to solve the problem of 
+        # TODO(shenliang03): This is a temporary solution to solve the problem of
         # hang caused by cross-creation of new_group
         tmp = paddle.to_tensor(
             [1], dtype="int32") if _non_static_mode() else fill_constant(
@@ -156,6 +157,6 @@ def __str__(self):
 
 
 # Note that Process group 0 is reserved for representing all ranks.
-# At the beginning, group 0 is empty and new ranks will be added automatically. 
+# At the beginning, group 0 is empty and new ranks will be added automatically.
 _g_process_group_map = {}
 _g_process_group_map[0] = ProcessGroup(0, [])
diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py
index f95951a3bad73..f751087e29eb0 100644
--- a/python/paddle/distributed/auto_parallel/process_mesh.py
+++ b/python/paddle/distributed/auto_parallel/process_mesh.py
@@ -97,7 +97,7 @@ def __init__(self, mesh):
         from .dist_context import get_default_distributed_context
         default_dist_cxt = get_default_distributed_context()
         default_dist_cxt.add_process_mesh(self)
-        # Add new processes to process group 0 
+        # Add new processes to process group 0
         from .process_group import get_process_group
         pg0 = get_process_group(0)
         pg0.add_ranks(self.processes)
diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py
index 7481ec736f09e..8fb3814221897 100644
--- a/python/paddle/distributed/auto_parallel/reshard.py
+++ b/python/paddle/distributed/auto_parallel/reshard.py
@@ -27,7 +27,7 @@
 from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
 from .process_group import new_process_group, ProcessGroup, _g_process_group_map
 
-# NOTE: If op in _g_special_ops, it will not be resharded. 
+# NOTE: If op in _g_special_ops, it will not be resharded.
 _g_special_ops = ['check_finite_and_unscale', 'update_loss_scaling']
 
 
@@ -195,34 +195,32 @@ class Inserter:
     def insert_send_op(block, idx, tensor, dst, op_role):
         """Insert send op into block at the given index."""
         op_type = 'send_v2'
-        block._insert_op(
-            idx,
-            type=op_type,
-            inputs={'X': [tensor]},
-            attrs={
-                'ring_id': 0,
-                'peer': dst,
-                'use_calc_stream': True,
-                'op_role': op_role
-            })
+        block._insert_op(idx,
+                         type=op_type,
+                         inputs={'X': [tensor]},
+                         attrs={
+                             'ring_id': 0,
+                             'peer': dst,
+                             'use_calc_stream': True,
+                             'op_role': op_role
+                         })
 
     @staticmethod
     def insert_recv_op(block, idx, tensor, src, op_role):
         """Insert recv op into block at the given index."""
         op_type = 'recv_v2'
-        block._insert_op(
-            idx,
-            type=op_type,
-            inputs={'X': [tensor]},
-            outputs={'Out': [tensor]},
-            attrs={
-                'ring_id': 0,
-                'peer': src,
-                'out_shape': tensor.shape,
-                'dtype': tensor.dtype,
-                'use_calc_stream': True,
-                'op_role': op_role
-            })
+        block._insert_op(idx,
+                         type=op_type,
+                         inputs={'X': [tensor]},
+                         outputs={'Out': [tensor]},
+                         attrs={
+                             'ring_id': 0,
+                             'peer': src,
+                             'out_shape': tensor.shape,
+                             'dtype': tensor.dtype,
+                             'use_calc_stream': True,
+                             'op_role': op_role
+                         })
 
     @staticmethod
     def insert_concat_op(block, idx, tensors, axis, op_role):
@@ -235,12 +233,11 @@ def insert_concat_op(block, idx, tensors, axis, op_role):
         with paddle.static.program_guard(block.program):
             out = helper.create_variable_for_type_inference(
                 dtype=helper.input_dtype())
-        block._insert_op(
-            idx,
-            type='concat',
-            inputs=inputs,
-            outputs={'Out': [out]},
-            attrs=attrs)
+        block._insert_op(idx,
+                         type='concat',
+                         inputs=inputs,
+                         outputs={'Out': [out]},
+                         attrs=attrs)
         return out
 
     @staticmethod
@@ -257,14 +254,14 @@ def insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name,
             'op_role': op_role
         }
         helper = LayerHelper('slice', **locals())
-        out = block.create_var(
-            name=new_var_name, dtype=tensor.dtype, type=tensor.type)
-        block._insert_op(
-            idx,
-            type="slice",
-            inputs=inputs,
-            outputs={'Out': [out]},
-            attrs=attrs)
+        out = block.create_var(name=new_var_name,
+                               dtype=tensor.dtype,
+                               type=tensor.type)
+        block._insert_op(idx,
+                         type="slice",
+                         inputs=inputs,
+                         outputs={'Out': [out]},
+                         attrs=attrs)
         return out
 
     @staticmethod
@@ -279,12 +276,11 @@ def insert_split_op(block, idx, tensor, num_or_sections, op_role):
                 helper.create_variable_for_type_inference(
                     dtype=helper.input_dtype()) for i in range(num_or_sections)
             ]
-        block._insert_op(
-            idx,
-            type="split",
-            inputs=inputs,
-            outputs={'Out': outs},
-            attrs=attrs)
+        block._insert_op(idx,
+                         type="split",
+                         inputs=inputs,
+                         outputs={'Out': outs},
+                         attrs=attrs)
         return outs
 
     @staticmethod
@@ -299,14 +295,15 @@ def insert_fill_constant_op(block, idx, op_role):
         attrs['value'] = int("1")
         attrs['dtype'] = out.dtype
         attrs['op_role'] = op_role
-        utils.get_shape_tensor_inputs(
-            inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant')
-        block._insert_op(
-            idx,
-            type='fill_constant',
-            inputs=inputs,
-            outputs={'Out': [out]},
-            attrs=attrs)
+        utils.get_shape_tensor_inputs(inputs=inputs,
+                                      attrs=attrs,
+                                      shape=[0],
+                                      op_type='fill_constant')
+        block._insert_op(idx,
+                         type='fill_constant',
+                         inputs=inputs,
+                         outputs={'Out': [out]},
+                         attrs=attrs)
         out.stop_gradient = True
         return out
 
@@ -320,29 +317,27 @@ def insert_allgather_op(block, idx, tensor, ranks, op_role):
         # instant process group before insert allgather op.
         if not group.is_instantiate():
             # insert fill_constant op
-            fill_constant_out = Inserter.insert_fill_constant_op(block, idx,
-                                                                 op_role)
+            fill_constant_out = Inserter.insert_fill_constant_op(
+                block, idx, op_role)
             fill_constant_out.stop_gradient = True
 
             # insert c_allreduce_sum op
-            block._insert_op(
-                idx + 1,
-                type="c_allreduce_sum",
-                inputs={'X': [fill_constant_out]},
-                outputs={'Out': [fill_constant_out]},
-                attrs={
-                    'ring_id': 0,
-                    'use_calc_stream': True,
-                    'op_role': op_role
-                })
+            block._insert_op(idx + 1,
+                             type="c_allreduce_sum",
+                             inputs={'X': [fill_constant_out]},
+                             outputs={'Out': [fill_constant_out]},
+                             attrs={
+                                 'ring_id': 0,
+                                 'use_calc_stream': True,
+                                 'op_role': op_role
+                             })
 
             # insert c_sync_calc_stream op
-            block._insert_op(
-                idx + 2,
-                type="c_sync_calc_stream",
-                inputs={'X': [fill_constant_out]},
-                outputs={'Out': [fill_constant_out]},
-                attrs={'op_role': op_role})
+            block._insert_op(idx + 2,
+                             type="c_sync_calc_stream",
+                             inputs={'X': [fill_constant_out]},
+                             outputs={'Out': [fill_constant_out]},
+                             attrs={'op_role': op_role})
             idx_offset = 3
 
         # insert c_allgather op
@@ -351,22 +346,22 @@ def insert_allgather_op(block, idx, tensor, ranks, op_role):
         with paddle.static.program_guard(block.program):
             allgather_out = helper.create_variable_for_type_inference(
                 dtype=tensor.dtype)
-        block._insert_op(
-            idx + idx_offset,
-            type=op_type,
-            inputs={'X': [tensor]},
-            outputs={'Out': [allgather_out]},
-            attrs={
-                'ring_id': group.id,
-                'use_calc_stream': True,
-                'nranks': group.nranks,
-                'op_role': op_role
-            })
+        block._insert_op(idx + idx_offset,
+                         type=op_type,
+                         inputs={'X': [tensor]},
+                         outputs={'Out': [allgather_out]},
+                         attrs={
+                             'ring_id': group.id,
+                             'use_calc_stream': True,
+                             'nranks': group.nranks,
+                             'op_role': op_role
+                         })
         idx_offset += 1
 
         # insert split op
-        split_out = Inserter.insert_split_op(
-            block, idx + idx_offset, allgather_out, group.nranks, op_role)
+        split_out = Inserter.insert_split_op(block, idx + idx_offset,
+                                             allgather_out, group.nranks,
+                                             op_role)
         idx_offset += 1
         tensor_list.extend(split_out)
         return tensor_list, idx_offset
@@ -740,12 +735,12 @@ def compute_concat_info(partition_index_x, partition_index_y):
         for idx, item in enumerate(partition_index_x):
             if item != partition_index_y[idx]:
                 differ_count += 1
-                if item[1] == partition_index_y[idx][0] and item[
-                        0] < partition_index_y[idx][1]:
+                if item[1] == partition_index_y[idx][
+                        0] and item[0] < partition_index_y[idx][1]:
                     concat_axis = idx
                     new_partition.append([item[0], partition_index_y[idx][1]])
-                elif item[0] == partition_index_y[idx][1] and item[
-                        1] > partition_index_y[idx][0]:
+                elif item[0] == partition_index_y[idx][
+                        1] and item[1] > partition_index_y[idx][0]:
                     first_order = 1
                     concat_axis = idx
                     new_partition.append([partition_index_y[idx][0], item[1]])
@@ -839,8 +834,8 @@ def change_while_op_input_and_output(auto_parallel_main_prog, dist_context):
     def is_overlapped(self, shape_x, shape_y):
         """Judge whether two partitions intersect on the specified dimension."""
         overlapped = False
-        if (shape_y[0] <= shape_x[0] < shape_y[1]) or (
-                shape_x[0] <= shape_y[0] < shape_x[1]):
+        if (shape_y[0] <= shape_x[0] < shape_y[1]) or (shape_x[0] <= shape_y[0]
+                                                       < shape_x[1]):
             overlapped = True
         return overlapped
 
@@ -986,9 +981,10 @@ def get_op_process_meshes(self, op):
         dist_op = self.dist_context.get_dist_op_for_program(op)
         op_process_mesh = dist_op.dist_attr.process_mesh
         for process_mesh in self.dist_context.process_meshes:
-            if set(process_mesh.processes) & (
-                    set(op_process_mesh.processes)
-            ) and len(process_mesh.processes) <= len(op_process_mesh.processes):
+            if set(process_mesh.processes) & (set(
+                    op_process_mesh.processes)) and len(
+                        process_mesh.processes) <= len(
+                            op_process_mesh.processes):
                 process_meshes.append(process_mesh)
 
         # it means the process mesh is not a union when process meshes is null
@@ -1085,9 +1081,8 @@ def find_op_desc_seq(self, dist_tensor, dist_op, actual_process_mesh):
                         process_list[index].append(source_process)
                         has_used[index].append(False)
                     else:
-                        partition_process_mapping_list.append([
-                            source_partition_index, [source_process], [False]
-                        ])
+                        partition_process_mapping_list.append(
+                            [source_partition_index, [source_process], [False]])
 
             for target_process in target_process_group:
                 has_sent = []
@@ -1152,8 +1147,8 @@ def find_op_desc_seq(self, dist_tensor, dist_op, actual_process_mesh):
                 slices_axes = []
                 concatenated_partition_index = partition_index_list[0]
                 for idx, item in enumerate(concatenated_partition_index):
-                    slice_starts.append(target_partition_index[idx][0] - item[
-                        0])
+                    slice_starts.append(target_partition_index[idx][0] -
+                                        item[0])
                     slice_ends.append(target_partition_index[idx][1] - item[0])
                     slices_axes.append(idx)
                 op_desc_seq[target_process].append(
@@ -1170,8 +1165,9 @@ def find_op_desc_seq(self, dist_tensor, dist_op, actual_process_mesh):
                     source_process_shape, source_process_group)
                 if source_partition_index not in partition_index_list:
                     partition_index_list.append(source_partition_index)
-                    process_index.append(
-                        [[source_process, ], source_partition_index])
+                    process_index.append([[
+                        source_process,
+                    ], source_partition_index])
                 else:
                     process_index[partition_index_list.index(
                         source_partition_index)][0].append(source_process)
@@ -1195,8 +1191,9 @@ def find_op_desc_seq(self, dist_tensor, dist_op, actual_process_mesh):
                         slice_ends.append(item[1])
                         slices_axes.append(idx)
 
-                    slice_op_desc = SliceOpDesc(
-                        starts=slice_starts, ends=slice_ends, axes=slices_axes)
+                    slice_op_desc = SliceOpDesc(starts=slice_starts,
+                                                ends=slice_ends,
+                                                axes=slices_axes)
                     op_desc_seq[process] = [AllGatherOpDesc(group=group),
                                             ConcatOpDesc(partition_index_list=all_partition_index_list), slice_op_desc] \
                         if len(group) > 1 else [slice_op_desc]
@@ -1227,9 +1224,8 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op,
             if isinstance(op_desc, AllGatherOpDesc):  # noqa: F401
                 if var_name not in self.has_allgather.keys():
                     self.has_allgather[var_name] = []
-                if not self.has_allgather[
-                        var_name] or op_desc.group not in list(
-                            map(lambda x: x[0], self.has_allgather[var_name])):
+                if not self.has_allgather[var_name] or op_desc.group not in list(
+                        map(lambda x: x[0], self.has_allgather[var_name])):
                     tensor_list, idx_offset = Inserter.insert_allgather_op(
                         block, idx, source_tensor, op_desc.group,
                         reshard_op.attr('op_role'))
@@ -1317,11 +1313,11 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op,
                     target_tensor, tensor_attr)
 
                 if op.type == "while":
-                    # var_reshard_mapping means the while op input need be changed to 
+                    # var_reshard_mapping means the while op input need be changed to
                     if "var_reshard_mapping" not in Resharder.while_block_info[
                             op.attr("sub_block").id].keys():
-                        Resharder.while_block_info[op.attr("sub_block").id][
-                            "var_reshard_mapping"] = {}
+                        Resharder.while_block_info[op.attr(
+                            "sub_block").id]["var_reshard_mapping"] = {}
                     Resharder.while_block_info[op.attr("sub_block").id][
                         "var_reshard_mapping"][var_name] = target_tensor.name
 
@@ -1370,8 +1366,8 @@ def reshard(self):
                                     op_dist_attr.set_input_dims_mapping(
                                         var_reshard_mapping[var_name],
                                         dims_mapping)
-                                    op_dist_attr.set_input_dist_attr(var_name,
-                                                                     None)
+                                    op_dist_attr.set_input_dist_attr(
+                                        var_name, None)
 
                         # the outputs also need to be renamed when the output name is the same with input name
                         for var_name in op.output_arg_names:
@@ -1388,8 +1384,8 @@ def reshard(self):
                                     op_dist_attr.set_output_dims_mapping(
                                         var_reshard_mapping[var_name],
                                         dims_mapping)
-                                    op_dist_attr.set_output_dist_attr(var_name,
-                                                                      None)
+                                    op_dist_attr.set_output_dist_attr(
+                                        var_name, None)
 
             idx = 0
             while idx < len(block.ops):
@@ -1412,10 +1408,10 @@ def reshard(self):
                         assert process_meshes
                         if op.attr("sub_block"
                                    ).id not in Resharder.while_block_info:
-                            Resharder.while_block_info[op.attr("sub_block")
-                                                       .id] = {}
-                        Resharder.while_block_info[op.attr("sub_block").id][
-                            "op_id"] = op.desc.id()
+                            Resharder.while_block_info[op.attr(
+                                "sub_block").id] = {}
+                        Resharder.while_block_info[op.attr(
+                            "sub_block").id]["op_id"] = op.desc.id()
                         Resharder.while_block_info[op.attr("sub_block").id][
                             "actual_process_mesh"] = self.get_while_op_actual_process_mesh(
                                 op)
@@ -1476,13 +1472,13 @@ def reshard(self):
                                 recv_rank = dist_tensor.dist_attr.process_mesh.processes[
                                     index]
                                 if self.rank_id == item:
-                                    Inserter.insert_send_op(block, idx + 1, var,
-                                                            recv_rank,
-                                                            op.attr('op_role'))
+                                    Inserter.insert_send_op(
+                                        block, idx + 1, var, recv_rank,
+                                        op.attr('op_role'))
                                 if self.rank_id == recv_rank:
-                                    Inserter.insert_recv_op(block, idx + 1, var,
-                                                            item,
-                                                            op.attr('op_role'))
+                                    Inserter.insert_recv_op(
+                                        block, idx + 1, var, item,
+                                        op.attr('op_role'))
                             cur_op_count = len(block.ops)
                             idx_offset = idx_offset + cur_op_count - pre_op_count
                             pre_op_count = cur_op_count
diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py
index ba61843831a25..de3c9cb84295b 100644
--- a/python/paddle/distributed/auto_parallel/tuner/recorder.py
+++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Notice that the following codes are modified from KerasTuner for a different purpose. 
+# Notice that the following codes are modified from KerasTuner for a different purpose.
 # Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/metrics_tracking.py.
 
 import numpy as np
diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/tuner/storable.py
index 63e5eba77f15c..18a0669d62286 100644
--- a/python/paddle/distributed/auto_parallel/tuner/storable.py
+++ b/python/paddle/distributed/auto_parallel/tuner/storable.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Notice that the following codes are modified from KerasTuner for a different purpose. 
+# Notice that the following codes are modified from KerasTuner for a different purpose.
 # Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/metrics_tracking.py.
 
 import json
 
 
 class Storable(object):
+
     def get_state(self):
         raise NotImplementedError
 
diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py
index 1cda82f1edec9..78139cbd58b33 100644
--- a/python/paddle/distributed/auto_parallel/tuner/trial.py
+++ b/python/paddle/distributed/auto_parallel/tuner/trial.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Notice that the following codes are modified from KerasTuner to implement our own tuner. 
+# Notice that the following codes are modified from KerasTuner to implement our own tuner.
 # Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/trial.py.
 
 import hashlib
@@ -33,7 +33,10 @@ class TrialStatus:
 
 
 class Trial(Storable):
-    def __init__(self, tunable_space, trial_id=None,
+
+    def __init__(self,
+                 tunable_space,
+                 trial_id=None,
                  status=TrialStatus.RUNNING):
         self._id = _generate_trial_id() if trial_id is None else trial_id
         self._space = tunable_space
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py
index 2838a01958433..93ae25c9c4dd1 100644
--- a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py
+++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Notice that the following codes are modified from KerasTuner to implement our own tuner. 
+# Notice that the following codes are modified from KerasTuner to implement our own tuner.
 # Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/hyperparameters.py.
 
 import collections
@@ -103,13 +103,19 @@ def choice(self, name, values, default=None):
         return self._retrieve(tv)
 
     def int_range(self, name, start, stop, step=1, default=None):
-        tv = IntRange(
-            name=name, start=start, stop=stop, step=step, default=default)
+        tv = IntRange(name=name,
+                      start=start,
+                      stop=stop,
+                      step=step,
+                      default=default)
         return self._retrieve(tv)
 
     def float_range(self, name, start, stop, step=None, default=None):
-        tv = FloatRange(
-            name=name, start=start, stop=stop, step=step, default=default)
+        tv = FloatRange(name=name,
+                        start=start,
+                        stop=stop,
+                        step=step,
+                        default=default)
         return self._retrieve(tv)
 
     def get_state(self):
@@ -118,7 +124,8 @@ def get_state(self):
                 "class_name": v.__class__.__name__,
                 "state": v.get_state()
             } for v in self._variables.values()],
-            "values": dict((k, v) for (k, v) in self.values.items())
+            "values":
+            dict((k, v) for (k, v) in self.values.items())
         }
 
     @classmethod
@@ -138,8 +145,8 @@ def _deserialize_tunable_variable(state):
     if isinstance(state, classes):
         return state
 
-    if (not isinstance(state, dict) or "class_name" not in state or
-            "state" not in state):
+    if (not isinstance(state, dict) or "class_name" not in state
+            or "state" not in state):
         raise ValueError(
             "Expect state to be a python dict containing class_name and state as keys, but found {}"
             .format(state))
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
index 19f118fdde77a..424b6b74bb154 100644
--- a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
+++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Notice that the following codes are modified from KerasTuner to implement our own tuner. 
+# Notice that the following codes are modified from KerasTuner to implement our own tuner.
 # Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/hyperparameters.py.
 
 import numpy as np
@@ -49,8 +49,8 @@ def __init__(self, name, default):
         self.name = name
         if not isinstance(default, (str, int, float, bool)):
             raise ValueError(
-                "Fixed must be an str, int, float or bool, but found {}"
-                .format(default))
+                "Fixed must be an str, int, float or bool, but found {}".format(
+                    default))
         self._default = default
 
     def random(self, seed=None):
@@ -76,11 +76,12 @@ def random(self, seed=None):
         return rng.choice((True, False))
 
     def __repr__(self):
-        return 'Boolean(name: "{}", default: {})'.format(self.name,
-                                                         self.default)
+        return 'Boolean(name: "{}", default: {})'.format(
+            self.name, self.default)
 
 
 class Choice(TunableVariable):
+
     def __init__(self, name, values, default=None):
         super(Choice, self).__init__(name=name, default=default)
 
@@ -114,8 +115,8 @@ def __init__(self, name, values, default=None):
 
         if default is not None and default not in values:
             raise ValueError(
-                "The default value should be one of the choices {}, but found {}".
-                format(values, default))
+                "The default value should be one of the choices {}, but found {}"
+                .format(values, default))
         self._default = default
 
     @property
diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py
index 42d90b0d4d619..c0f6f90c6afc9 100644
--- a/python/paddle/distributed/auto_parallel/utils.py
+++ b/python/paddle/distributed/auto_parallel/utils.py
@@ -327,8 +327,8 @@ def _get_corresponding_rank(dist_context, target_mesh, rank):
     # assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format(
     #     rank)
     if coordinate is not None:
-        return target_mesh.processes[_coordinate2linear_idx(mesh.topology,
-                                                            coordinate)]
+        return target_mesh.processes[_coordinate2linear_idx(
+            mesh.topology, coordinate)]
     else:
         return target_mesh.processes[0]
 
@@ -381,8 +381,8 @@ def _update_addition_info(addition_info):
             if item not in ["epoch", "batch", "batch_size"]:
                 raise ValueError(
                     "The key of 'addition_info' should be one of the "
-                    "['epoch', 'batch', 'batch_size'], but got '{}'."
-                    .format(str(item)))
+                    "['epoch', 'batch', 'batch_size'], but got '{}'.".format(
+                        str(item)))
             if not isinstance(value, int):
                 raise ValueError(
                     "The value of 'addition_info' should be 'int', "
@@ -401,8 +401,8 @@ def _check_valid_path(file_path):
                 raise TypeError("The type of file path should be 'str', "
                                 "but got '{}'.".format(str(type(file))))
             if not os.path.exists(file):
-                raise ValueError("The file path '{}' does not exist."
-                                 .format(file))
+                raise ValueError(
+                    "The file path '{}' does not exist.".format(file))
         return file_path
     else:
         raise TypeError("The type of file path should be 'list', "
@@ -580,8 +580,9 @@ def load_checkpoint_into_program(checkpoint_path,
     all_cur_dist_attr = get_dist_attr(program, dist_context)
     all_param_dict = all_state_dict_info["model"]
     addition_info = all_state_dict_info["addition_info"]
-    sliced_param_dict = merge_and_slice_parameter(
-        all_param_dict, all_pre_dist_attr, all_cur_dist_attr)
+    sliced_param_dict = merge_and_slice_parameter(all_param_dict,
+                                                  all_pre_dist_attr,
+                                                  all_cur_dist_attr)
     load_parameter_into_program(sliced_param_dict, program)
 
     return addition_info
@@ -613,8 +614,8 @@ def _save_distributed_attribute(program, dist_attr_path, dist_context):
         "world_size": paddle.distributed.get_world_size()
     }
     paddle.save(dist_attr_dict, dist_attr_name)
-    logging.info("Already saved distributed attribute to '{}'.".format(
-        dist_attr_path))
+    logging.info(
+        "Already saved distributed attribute to '{}'.".format(dist_attr_path))
 
 
 def _load_distributed_attribute(dist_attr_path):
@@ -715,8 +716,8 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr):
     for name, value in dist_param_dict.items():
         if not isinstance(name, str):
             raise TypeError("The key of 'dist_param_dict' is parameter's name, "
-                            "and its type should be 'str', but got {}."
-                            .format(str(type(name))))
+                            "and its type should be 'str', but got {}.".format(
+                                str(type(name))))
         if not isinstance(value, list) or not all(
                 isinstance(v, np.ndarray) for v in value):
             raise TypeError(
@@ -748,16 +749,16 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr):
         pre_dims_mapping = pre_attr["dims_mapping"]
         cur_dims_mapping = cur_attr["dims_mapping"]
         if len(set(pre_dims_mapping)) > 1 or -1 not in pre_dims_mapping:
-            complete_param = _merge_parameter_with_dist_attr(pre_param,
-                                                             pre_attr)
+            complete_param = _merge_parameter_with_dist_attr(
+                pre_param, pre_attr)
             dist_param_dict[var_name] = complete_param
         else:
             complete_param = pre_param[0]
             dist_param_dict[var_name] = complete_param
 
         if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping:
-            sliced_param = _slice_parameter_with_dist_attr(complete_param,
-                                                           cur_attr)
+            sliced_param = _slice_parameter_with_dist_attr(
+                complete_param, cur_attr)
             dist_param_dict[var_name] = sliced_param
 
     for var_name in pre_dist_attr:
@@ -766,12 +767,13 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr):
             dist_param_dict.pop(var_name)
 
     if param_not_in_pre:
-        warnings.warn("Parameters '{}' are not found in last training process."
-                      .format(str(param_not_in_pre)))
+        warnings.warn(
+            "Parameters '{}' are not found in last training process.".format(
+                str(param_not_in_pre)))
     if param_not_in_cur:
         warnings.warn(
-            "Parameters '{}' are not found in current training process."
-            .format(str(param_not_in_cur)))
+            "Parameters '{}' are not found in current training process.".format(
+                str(param_not_in_cur)))
 
     return dist_param_dict
 
@@ -784,8 +786,9 @@ def _merge_parameter_with_dist_attr(param_list, dist_attr):
     process_shape = dist_attr["process_shape"]
     process_group = dist_attr["process_group"]
     # get the complete shape of the parameter
-    complete_shape = Resharder.compute_complete_shape(
-        param_list[0].shape, process_shape, dims_mapping)
+    complete_shape = Resharder.compute_complete_shape(param_list[0].shape,
+                                                      process_shape,
+                                                      dims_mapping)
     # merge the parameter with dist_attr
     partition_param_list = []
     merged_partiton = []
@@ -818,8 +821,9 @@ def _slice_parameter_with_dist_attr(param, dist_attr):
                                          len(partition_index_list))
     # get the current parameter's index in sliced_param_list
     rank_id = paddle.distributed.get_rank()
-    sliced_param_index = _get_sliced_param_index(
-        rank_id, param.shape, dims_mapping, process_shape, process_group)
+    sliced_param_index = _get_sliced_param_index(rank_id, param.shape,
+                                                 dims_mapping, process_shape,
+                                                 process_group)
     sliced_param = sliced_param_list[sliced_param_index]
     return sliced_param
 
@@ -899,8 +903,9 @@ def _slice_parameter(complete_param, partition_index_list, length):
     """
     sliced_param_list = []
     axis = len(complete_param.shape) - length
-    sliced_param = np.split(
-        complete_param, partition_index_list[axis], axis=axis)
+    sliced_param = np.split(complete_param,
+                            partition_index_list[axis],
+                            axis=axis)
     if length == 1:
         return sliced_param
     for param in sliced_param:
@@ -938,8 +943,10 @@ def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape,
     """
     from .reshard import Resharder
 
-    partition_index = Resharder.compute_partition_index(
-        rank, complete_shape, dims_mapping, process_shape, process_group)
+    partition_index = Resharder.compute_partition_index(rank, complete_shape,
+                                                        dims_mapping,
+                                                        process_shape,
+                                                        process_group)
     sliced_param_index = 0
     for i, shape in enumerate(complete_shape):
         if dims_mapping[i] == -1:
@@ -1090,8 +1097,8 @@ def is_forward_op(op):
     ref_role1 = int(core.op_proto_and_checker_maker.OpRole.Forward)
     ref_role2 = int(core.op_proto_and_checker_maker.OpRole.Loss)
     op_role = int(op.attr('op_role'))
-    return OP_ROLE_KEY in op.attr_names and (op_role == ref_role1 or
-                                             op_role == ref_role2)
+    return OP_ROLE_KEY in op.attr_names and (op_role == ref_role1
+                                             or op_role == ref_role2)
 
 
 def is_backward_op(op):
@@ -1099,6 +1106,11 @@ def is_backward_op(op):
             int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Backward)
 
 
+def is_optimize_op(op):
+    return OP_ROLE_KEY in op.attr_names and \
+            int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize)
+
+
 def is_loss_op(op):
     return OP_ROLE_KEY in op.attr_names and \
         int(op.all_attrs()[OP_ROLE_KEY]) == (int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Loss))
@@ -1112,8 +1124,8 @@ def get_loss_op(block):
     loss_ops = []
     for op in block.ops:
         if is_loss_op(op):
-            assert len(op.desc.output_arg_names(
-            )) == 1, "loss op should only output loss var"
+            assert len(op.desc.output_arg_names()
+                       ) == 1, "loss op should only output loss var"
             loss_ops.append(op)
 
     assert len(loss_ops) == 1, "num of loss op is not equal to one"
@@ -1132,8 +1144,8 @@ def set_var_dist_attr(dist_context, var, dims_mapping, process_mesh, **kwargs):
     return tensor_dist_attr
 
 
-def naive_set_dist_op_attr_for_program_by_mesh_and_mapping(new_op, process_mesh,
-                                                           ref_mapping, ctx):
+def naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
+        new_op, process_mesh, ref_mapping, ctx):
     assert process_mesh is not None
     assert ref_mapping is not None
 
@@ -1304,6 +1316,7 @@ def get_all_distributed_main_program(serial_program_info, dist_context,
 
 
 class SerialProgramInfo:
+
     def __init__(self,
                  train_program,
                  satrtup_program,
@@ -1338,6 +1351,7 @@ def cluster(self):
 
 
 def get_standalone_cost_data(distributed_programs):
+
     def _compute_runtime(op_cost, op, vars):
         runtime = 0
         try:
@@ -1356,8 +1370,8 @@ def _compute_runtime(op_cost, op, vars):
                 shape_left_boundary = info.find("[")
                 shape_right_boundary = info.find("]")
                 assert shape_left_boundary > 0 and shape_right_boundary > 0 and shape_right_boundary > shape_left_boundary, "Get shape failed."
-                shape = info[shape_left_boundary + 1:
-                             shape_right_boundary].split(",")
+                shape = info[shape_left_boundary +
+                             1:shape_right_boundary].split(",")
                 shape = list(map(lambda x: int(x.strip()), shape))
                 dtype_factor = 1
                 total_static_input_size += reduce(lambda x, y: x * y, shape)
@@ -1399,20 +1413,21 @@ def _compute_runtime(op_cost, op, vars):
             if op.type in not_enum_ops:
                 cost_data[op.desc.id()] = runtime
                 continue
-            dtype = str(vars[op.input_arg_names[0]]
-                        .dtype) if op.input_arg_names else "float32"
+            dtype = str(vars[op.input_arg_names[0]].dtype
+                        ) if op.input_arg_names else "float32"
             if int(op.attr('op_role')) == int(OpRole.Backward):
                 if "_grad" in op.type:
                     forward_op_name = op.type[:-5]
                     if forward_op_name in OP_NAME_MAPPING.keys():
                         forward_op_name = OP_NAME_MAPPING[forward_op_name]
-                    op_cost = cost_model.get_static_op_time(
-                        forward_op_name, forward=False, dtype=dtype)
+                    op_cost = cost_model.get_static_op_time(forward_op_name,
+                                                            forward=False,
+                                                            dtype=dtype)
                     if op_cost:
                         runtime = _compute_runtime(op_cost, op, vars)
                     else:
-                        op_cost = cost_model.get_static_op_time(
-                            forward_op_name, dtype=dtype)
+                        op_cost = cost_model.get_static_op_time(forward_op_name,
+                                                                dtype=dtype)
                         if op_cost:
                             runtime = 2 * _compute_runtime(op_cost, op, vars)
             elif int(op.attr('op_role')) == int(OpRole.Forward):
diff --git a/python/paddle/distributed/cloud_utils.py b/python/paddle/distributed/cloud_utils.py
index 34e55bf164673..a8eedb96a3ecd 100644
--- a/python/paddle/distributed/cloud_utils.py
+++ b/python/paddle/distributed/cloud_utils.py
@@ -66,8 +66,8 @@ def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_devices):
 
                 if paddle_ports_num >= len(
                         selected_devices) and paddle_port != args_port:
-                    logger.warning("Use Cloud specified port:{}.".format(
-                        paddle_port))
+                    logger.warning(
+                        "Use Cloud specified port:{}.".format(paddle_port))
                     started_port = paddle_port
 
             except Exception as e:
@@ -87,12 +87,13 @@ def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_devices):
         trainer_endpoints = []
         assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori)
         for i in range(num_nodes):
-            trainer_endpoints.append(trainer_endpoints_ori[
-                i * paddle_ports_num:(i + 1) * paddle_ports_num])
+            trainer_endpoints.append(
+                trainer_endpoints_ori[i * paddle_ports_num:(i + 1) *
+                                      paddle_ports_num])
 
     logger.debug("parsed from args: node_ips:{} \
-        node_ip:{} node_rank:{} trainer_endpoints:{}"
-                 .format(node_ips, node_ip, node_rank, trainer_endpoints))
+        node_ip:{} node_rank:{} trainer_endpoints:{}".format(
+        node_ips, node_ip, node_rank, trainer_endpoints))
 
     cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints,
                                selected_devices)
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 5f481bd0dca41..1fd5bde1a5468 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -158,8 +158,9 @@ def _get_group_map():
     global _group_map
     if not _group_map:
         genv = _get_global_env()
-        _group_map[0] = Group(
-            genv.rank, genv.world_size, ranks=list(range(genv.world_size)))
+        _group_map[0] = Group(genv.rank,
+                              genv.world_size,
+                              ranks=list(range(genv.world_size)))
     return _group_map
 
 
@@ -264,20 +265,19 @@ def _new_process_group_impl(backend,
             cluster_id - 1]
         global_rank = cluster_offset + rank
         global_world_size = cluster_size_cumsum[-1]
-        pg = core.ProcessGroupHeter(
-            store,
-            rank=global_rank,
-            world_size=global_world_size,
-            place=place,
-            gid=group_id,
-            local_rank=rank,
-            local_size=world_size,
-            gloo_rank=cluster_id,
-            gloo_size=len(cluster_size),
-            with_switch=True,
-            switch_endpoint=switch_ep,
-            src_rank=src_rank,
-            dst_rank=dst_rank)
+        pg = core.ProcessGroupHeter(store,
+                                    rank=global_rank,
+                                    world_size=global_world_size,
+                                    place=place,
+                                    gid=group_id,
+                                    local_rank=rank,
+                                    local_size=world_size,
+                                    gloo_rank=cluster_id,
+                                    gloo_size=len(cluster_size),
+                                    with_switch=True,
+                                    switch_endpoint=switch_ep,
+                                    src_rank=src_rank,
+                                    dst_rank=dst_rank)
 
     return pg
 
@@ -323,11 +323,10 @@ def barrier(group=None):
     if not isinstance(ring_id, int):
         raise ValueError("The type of 'group' for barrier must be int.")
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [temp]},
-        outputs={'Out': [temp]},
-        attrs={'ring_id': ring_id})
+    helper.append_op(type=op_type,
+                     inputs={'X': [temp]},
+                     outputs={'Out': [temp]},
+                     attrs={'ring_id': ring_id})
 
 
 # _custom_gid provides a way for users to
@@ -386,16 +385,15 @@ def new_group(ranks=None, backend=None):
             rank = 0 if backend == 'heter' else ranks.index(global_rank)
             src_rank = ranks[0] if backend == 'heter' else None
             dst_rank = ranks[1] if backend == 'heter' else None
-            pg = _new_process_group_impl(
-                backend,
-                _default_store,
-                rank,
-                size,
-                group_name,
-                pg_options=None,
-                group_id=gid,
-                src_rank=src_rank,
-                dst_rank=dst_rank)
+            pg = _new_process_group_impl(backend,
+                                         _default_store,
+                                         rank,
+                                         size,
+                                         group_name,
+                                         pg_options=None,
+                                         group_id=gid,
+                                         src_rank=src_rank,
+                                         dst_rank=dst_rank)
         else:
             rank = -1
             pg = None
@@ -403,11 +401,9 @@ def new_group(ranks=None, backend=None):
         _group_map_by_name[group_name] = group
         _group_map[gid] = group
 
-        # TODO(shenliang03): This is a temporary solution to solve the problem of 
+        # TODO(shenliang03): This is a temporary solution to solve the problem of
         # hang caused by tcp
-        tmp = paddle.to_tensor([1], dtype="int32")
-        paddle.distributed.all_reduce(tmp, group=group, use_calc_stream=True)
-        paddle.distributed.wait(tmp)
+        paddle.distributed.barrier(group=group)
         return group
 
     if not backend:
@@ -456,7 +452,7 @@ def new_group(ranks=None, backend=None):
         else:
             return gp
 
-    # TODO(shenliang03): This is a temporary solution to solve the problem of 
+    # TODO(shenliang03): This is a temporary solution to solve the problem of
     # hang caused by cross-creation of new_group
     tmp = paddle.to_tensor(
         [1], dtype="int32") if _non_static_mode() else fill_constant(
@@ -514,7 +510,8 @@ def _sync_calc_stream(tensor):
     helper.append_op(
         type=op_type,
         inputs={'X': [tensor]},
-        outputs={'Out': [tensor]}, )
+        outputs={'Out': [tensor]},
+    )
 
 
 def _sync_comm_stream(tensor, ring_id=0):
@@ -529,7 +526,8 @@ def _sync_comm_stream(tensor, ring_id=0):
         type=op_type,
         inputs={'X': [tensor]},
         outputs={'Out': [tensor]},
-        attrs={'ring_id': ring_id}, )
+        attrs={'ring_id': ring_id},
+    )
 
 
 def broadcast(tensor, src, group=None, use_calc_stream=True):
@@ -607,15 +605,14 @@ def broadcast(tensor, src, group=None, use_calc_stream=True):
         'broadcast')
 
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [tensor]},
-        outputs={'Out': [tensor]},
-        attrs={
-            'root': gsrc,
-            'use_calc_stream': use_calc_stream,
-            'ring_id': ring_id,
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': [tensor]},
+                     outputs={'Out': [tensor]},
+                     attrs={
+                         'root': gsrc,
+                         'use_calc_stream': use_calc_stream,
+                         'ring_id': ring_id,
+                     })
 
 
 def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True):
@@ -715,12 +712,13 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True):
     if not isinstance(ring_id, int):
         raise ValueError("The type of 'ring_id' for all_reduce should be int.")
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [tensor]},
-        outputs={'Out': [tensor]},
-        attrs={'ring_id': ring_id,
-               'use_calc_stream': use_calc_stream})
+    helper.append_op(type=op_type,
+                     inputs={'X': [tensor]},
+                     outputs={'Out': [tensor]},
+                     attrs={
+                         'ring_id': ring_id,
+                         'use_calc_stream': use_calc_stream
+                     })
 
 
 def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True):
@@ -829,15 +827,14 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True):
         op_type = 'c_reduce_prod'
 
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [tensor]},
-        outputs={'Out': [tensor]},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream,
-            'root_id': gdst,
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': [tensor]},
+                     outputs={'Out': [tensor]},
+                     attrs={
+                         'ring_id': ring_id,
+                         'use_calc_stream': use_calc_stream,
+                         'root_id': gdst,
+                     })
 
 
 def all_gather(tensor_list, tensor, group=None, use_calc_stream=True):
@@ -927,15 +924,14 @@ def all_gather(tensor_list, tensor, group=None, use_calc_stream=True):
         check_variable_and_dtype(
             tensor, 'tensor',
             ['float16', 'float32', 'float64', 'int32', 'int64'], 'all_gather')
-        helper.append_op(
-            type=op_type,
-            inputs={'X': [tensor]},
-            outputs={'Out': [out]},
-            attrs={
-                'ring_id': ring_id,
-                'use_calc_stream': use_calc_stream,
-                'nranks': nranks
-            })
+        helper.append_op(type=op_type,
+                         inputs={'X': [tensor]},
+                         outputs={'Out': [out]},
+                         attrs={
+                             'ring_id': ring_id,
+                             'use_calc_stream': use_calc_stream,
+                             'nranks': nranks
+                         })
 
     tensor_list.extend(paddle.split(out, nranks, 0))
 
@@ -1028,16 +1024,15 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True):
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         'scatter')
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [temp]},
-        outputs={'Out': [tensor]},
-        attrs={
-            'ring_id': ring_id,
-            'root': gsrc,
-            'use_calc_stream': use_calc_stream,
-            'nranks': nranks,
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': [temp]},
+                     outputs={'Out': [tensor]},
+                     attrs={
+                         'ring_id': ring_id,
+                         'root': gsrc,
+                         'use_calc_stream': use_calc_stream,
+                         'nranks': nranks,
+                     })
 
 
 def _c_identity(tensor, group=None):
@@ -1067,15 +1062,14 @@ def _c_identity(tensor, group=None):
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         '_c_identity')
 
-    helper.append_op(
-        type=op_type,
-        inputs={'X': tensor},
-        outputs={'Out': out},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': True,
-            'use_model_parallel': True,
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': tensor},
+                     outputs={'Out': out},
+                     attrs={
+                         'ring_id': ring_id,
+                         'use_calc_stream': True,
+                         'use_model_parallel': True,
+                     })
     return out
 
 
@@ -1113,17 +1107,16 @@ def _c_concat(tensor, group=None):
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         '_c_concat')
 
-    helper.append_op(
-        type=op_type,
-        inputs={'X': tensor},
-        outputs={'Out': out},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': True,
-            'use_model_parallel': True,
-            'nranks': nranks,
-            'rank': rank
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': tensor},
+                     outputs={'Out': out},
+                     attrs={
+                         'ring_id': ring_id,
+                         'use_calc_stream': True,
+                         'use_model_parallel': True,
+                         'nranks': nranks,
+                         'rank': rank
+                     })
     return out
 
 
@@ -1161,17 +1154,16 @@ def _c_split(tensor, group=None):
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         '_c_split')
 
-    helper.append_op(
-        type=op_type,
-        inputs={'X': tensor},
-        outputs={'Out': out},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': True,
-            'rank': rank,
-            'nranks': nranks,
-            'use_model_parallel': True,
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': tensor},
+                     outputs={'Out': out},
+                     attrs={
+                         'ring_id': ring_id,
+                         'use_calc_stream': True,
+                         'rank': rank,
+                         'nranks': nranks,
+                         'use_model_parallel': True,
+                     })
     return out
 
 
@@ -1192,13 +1184,15 @@ def _mp_allreduce(tensor,
         from paddle.autograd import EagerPyLayer
 
         class mp_allreduce_eager(EagerPyLayer):
+
             @staticmethod
             def forward(ctx, tensor, use_calc_stream, ring_id,
                         use_model_parallel):
                 ctx.ring_id = ring_id
-                return _C_ops.c_allreduce_sum_(
-                    tensor, 'use_calc_stream', use_calc_stream, 'ring_id',
-                    ring_id, "use_model_parallel", use_model_parallel)
+                return _C_ops.c_allreduce_sum_(tensor, 'use_calc_stream',
+                                               use_calc_stream, 'ring_id',
+                                               ring_id, "use_model_parallel",
+                                               use_model_parallel)
 
             @staticmethod
             def backward(ctx, dy):
@@ -1211,9 +1205,10 @@ def backward(ctx, dy):
 
     elif _in_legacy_dygraph():
         if op == ReduceOp.SUM:
-            return _C_ops.c_allreduce_sum_(
-                tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id,
-                "use_model_parallel", use_model_parallel)
+            return _C_ops.c_allreduce_sum_(tensor, 'use_calc_stream',
+                                           use_calc_stream, 'ring_id', ring_id,
+                                           "use_model_parallel",
+                                           use_model_parallel)
         else:
             raise ValueError("Unknown parameter: {}.".format(op))
 
@@ -1225,15 +1220,14 @@ def backward(ctx, dy):
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         op_type)
 
-    helper.append_op(
-        type=op_type,
-        inputs={'X': tensor},
-        outputs={'Out': out},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream,
-            'use_model_parallel': use_model_parallel,
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': tensor},
+                     outputs={'Out': out},
+                     attrs={
+                         'ring_id': ring_id,
+                         'use_calc_stream': use_calc_stream,
+                         'use_model_parallel': use_model_parallel,
+                     })
     return out
 
 
@@ -1259,12 +1253,13 @@ def _c_lookup_table(table, index, start_index=0, name=None):
     dtype = helper.input_dtype(input_param_name='table')
     check_variable_and_dtype(index, 'input', ['int32', 'int64'], op_type)
     tmp = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='c_embedding',
-        inputs={'Ids': index,
-                'W': table},
-        outputs={'Out': tmp},
-        attrs={"start_index": start_index})
+    helper.append_op(type='c_embedding',
+                     inputs={
+                         'Ids': index,
+                         'W': table
+                     },
+                     outputs={'Out': tmp},
+                     attrs={"start_index": start_index})
     return tmp
 
 
@@ -1283,21 +1278,21 @@ def __init__(self,
         self._dtype = self._helper.get_default_dtype()
         self._weight_attr = weight_attr
         self._bias_attr = bias_attr
-        self.weight = self.create_parameter(
-            shape=[in_features, out_features],
-            attr=self._weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
-        self.bias = self.create_parameter(
-            shape=[out_features],
-            attr=self._bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self.weight = self.create_parameter(shape=[in_features, out_features],
+                                            attr=self._weight_attr,
+                                            dtype=self._dtype,
+                                            is_bias=False)
+        self.bias = self.create_parameter(shape=[out_features],
+                                          attr=self._bias_attr,
+                                          dtype=self._dtype,
+                                          is_bias=True)
         self.name = name
 
     def forward(self, input):
-        out = _linear(
-            x=input, weight=self.weight, bias=self.bias, name=self.name)
+        out = _linear(x=input,
+                      weight=self.weight,
+                      bias=self.bias,
+                      name=self.name)
         return out
 
     def extra_repr(self):
@@ -1342,13 +1337,16 @@ def _c_softmax_with_cross_entropy(logits,
     helper = LayerHelper('c_softmax_with_cross_entropy', **locals())
     softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
     loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    helper.append_op(
-        type='c_softmax_with_cross_entropy',
-        inputs={'Logits': logits,
-                'Label': label},
-        outputs={'Softmax': softmax,
-                 'Loss': loss},
-        attrs=attrs)
+    helper.append_op(type='c_softmax_with_cross_entropy',
+                     inputs={
+                         'Logits': logits,
+                         'Label': label
+                     },
+                     outputs={
+                         'Softmax': softmax,
+                         'Loss': loss
+                     },
+                     attrs=attrs)
 
     if return_softmax:
         return loss, softmax
@@ -1364,8 +1362,9 @@ def _linear(x, weight, bias=None, name=None):
         pre_bias = _varbase_creator(dtype=x.dtype)
         _C_ops.matmul(x, weight, pre_bias, 'transpose_X', False, 'transpose_Y',
                       False, "alpha", 1)
-        return dygraph_utils._append_bias_in_dygraph(
-            pre_bias, bias, axis=len(x.shape) - 1)
+        return dygraph_utils._append_bias_in_dygraph(pre_bias,
+                                                     bias,
+                                                     axis=len(x.shape) - 1)
     else:
         helper = LayerHelper('linear', **locals())
         dtype = x.dtype
@@ -1383,16 +1382,19 @@ def _linear(x, weight, bias=None, name=None):
             'alpha': 1,
         }
         tmp = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type='matmul_v2', inputs=inputs, outputs={'Out': tmp}, attrs=attrs)
+        helper.append_op(type='matmul_v2',
+                         inputs=inputs,
+                         outputs={'Out': tmp},
+                         attrs=attrs)
         if bias is not None:
             res = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [tmp],
-                        'Y': [bias]},
-                outputs={'Out': [res]},
-                attrs={'axis': len(x.shape) - 1})
+            helper.append_op(type='elementwise_add',
+                             inputs={
+                                 'X': [tmp],
+                                 'Y': [bias]
+                             },
+                             outputs={'Out': [res]},
+                             attrs={'axis': len(x.shape) - 1})
         else:
             res = tmp
         return res
@@ -1441,12 +1443,11 @@ def _parallel_linear(x,
     else:
         x = _c_identity(x, group=group)
 
-    linear = paddle.nn.Linear(
-        num_rows,
-        num_cols,
-        weight_attr=param_attr,
-        bias_attr=bias_attr,
-        name=name)
+    linear = paddle.nn.Linear(num_rows,
+                              num_cols,
+                              weight_attr=param_attr,
+                              bias_attr=bias_attr,
+                              name=name)
 
     # NOTE: npu linear function use matmul_v2 but linear use matmul
     linear_function = _linear if core.is_compiled_with_npu()\
@@ -1479,29 +1480,27 @@ def _parallel_linear(x,
         is_data=False,
         need_check_feed=linear_out.desc.need_check_feed())
     if axis == 0:
-        main_block.append_op(
-            type='c_allreduce_sum',
-            inputs={'X': linear_out},
-            outputs={'Out': out},
-            attrs={
-                'ring_id': ring_id,
-                'use_calc_stream': True,
-                'use_model_parallel': True
-            })
+        main_block.append_op(type='c_allreduce_sum',
+                             inputs={'X': linear_out},
+                             outputs={'Out': out},
+                             attrs={
+                                 'ring_id': ring_id,
+                                 'use_calc_stream': True,
+                                 'use_model_parallel': True
+                             })
         if linear.bias is not None:
             out = out + linear.bias
     else:
-        main_block.append_op(
-            type='c_concat',
-            inputs={'X': linear_out},
-            outputs={'Out': out},
-            attrs={
-                'rank': inner_rank,
-                'ring_id': ring_id,
-                'nranks': nranks,
-                'use_calc_stream': True,
-                'use_model_parallel': True
-            })
+        main_block.append_op(type='c_concat',
+                             inputs={'X': linear_out},
+                             outputs={'Out': out},
+                             attrs={
+                                 'rank': inner_rank,
+                                 'ring_id': ring_id,
+                                 'nranks': nranks,
+                                 'use_calc_stream': True,
+                                 'use_model_parallel': True
+                             })
     return out
 
 
@@ -1529,12 +1528,17 @@ def _parallel_embedding(x,
     dtype = helper.get_default_dtype()
     size = [per_part_size, origin_size[1]]
 
-    weight = helper.create_parameter(
-        attr=param_attr, shape=size, dtype=dtype, is_bias=False)
+    weight = helper.create_parameter(attr=param_attr,
+                                     shape=size,
+                                     dtype=dtype,
+                                     is_bias=False)
 
     if num_partitions == 1:
-        return paddle.nn.functional.embedding(
-            x, weight=weight, padding_idx=None, sparse=False, name=name)
+        return paddle.nn.functional.embedding(x,
+                                              weight=weight,
+                                              padding_idx=None,
+                                              sparse=False,
+                                              name=name)
 
     startup_block = paddle.static.default_startup_program().global_block()
     main_block = paddle.static.default_main_program().global_block()
@@ -1543,11 +1547,10 @@ def _parallel_embedding(x,
 
     output_parallel = paddle.distributed.collective._c_lookup_table(
         weight, x, start_index=vocab_start_index, name=name)
-    out = paddle.distributed.collective._mp_allreduce(
-        output_parallel,
-        group=group,
-        use_calc_stream=True,
-        use_model_parallel=True)
+    out = paddle.distributed.collective._mp_allreduce(output_parallel,
+                                                      group=group,
+                                                      use_calc_stream=True,
+                                                      use_model_parallel=True)
     return out
 
 
@@ -1678,9 +1681,10 @@ def split(x,
                 num_partitions=2)
 
     """
-    assert isinstance(size, (list, tuple)), (
-        "The type of size for "
-        "paddle.distributed.split must be list or tuple.")
+    assert isinstance(
+        size,
+        (list, tuple)), ("The type of size for "
+                         "paddle.distributed.split must be list or tuple.")
     assert len(size) == 2, ("Number of elements in size of "
                             "paddle.distributed.split must be two.")
     assert isinstance(operation, str), ("The type of operation for "
@@ -1716,23 +1720,22 @@ def split(x,
             "but received vocabulary={} num_partitions={}".format(size[0], num_partitions)
 
         per_part_size = size[0] // num_partitions
-        emb_out = _parallel_embedding(
-            x,
-            per_part_size,
-            size,
-            weight_attr,
-            inner_rank,
-            num_partitions,
-            name,
-            group=None)
+        emb_out = _parallel_embedding(x,
+                                      per_part_size,
+                                      size,
+                                      weight_attr,
+                                      inner_rank,
+                                      num_partitions,
+                                      name,
+                                      group=None)
         return emb_out
     else:
         should_split = False
         if axis == 0:
             assert size[0] % num_partitions == 0, (
                 "Number of rows of the weight for linear ({}) must be"
-                " divisible by num_partitions ({})".format(size[0],
-                                                           num_partitions))
+                " divisible by num_partitions ({})".format(
+                    size[0], num_partitions))
             per_part_size = size[0] // num_partitions
             linear_size = (per_part_size, size[1])
             if x.shape[-1] == size[0]: should_split = True
@@ -1740,27 +1743,26 @@ def split(x,
         elif axis == 1:
             assert size[1] % num_partitions == 0, (
                 "Number of column of the weight for linear ({}) must be"
-                " divisible by num_partitions ({})".format(size[1],
-                                                           num_partitions))
+                " divisible by num_partitions ({})".format(
+                    size[1], num_partitions))
             per_part_size = size[1] // num_partitions
             linear_size = (size[0], per_part_size)
         else:
             raise ValueError("The value of axis must be 0 or 1, but the value "
                              "given is {}.".format(axis))
 
-        linear_out = _parallel_linear(
-            x,
-            linear_size[0],
-            linear_size[1],
-            axis,
-            weight_attr,
-            bias_attr,
-            gather_out,
-            inner_rank,
-            num_partitions,
-            should_split,
-            name=name,
-            group=None)
+        linear_out = _parallel_linear(x,
+                                      linear_size[0],
+                                      linear_size[1],
+                                      axis,
+                                      weight_attr,
+                                      bias_attr,
+                                      gather_out,
+                                      inner_rank,
+                                      num_partitions,
+                                      should_split,
+                                      name=name,
+                                      group=None)
         return linear_out
 
 
@@ -1855,14 +1857,13 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True):
         if len(out_tensor_list) != 0:
             raise ValueError("The 'out_tensor_list' for all_to_all "
                              "must be an empty list.")
-        helper.append_op(
-            type=op_type,
-            inputs={'X': [temp]},
-            outputs={'Out': [out]},
-            attrs={
-                'ring_id': ring_id,
-                'use_calc_stream': use_calc_stream,
-            })
+        helper.append_op(type=op_type,
+                         inputs={'X': [temp]},
+                         outputs={'Out': [out]},
+                         attrs={
+                             'ring_id': ring_id,
+                             'use_calc_stream': use_calc_stream,
+                         })
     out_tensor_list.extend(paddle.split(out, nranks, 0))
 
 
@@ -1919,14 +1920,13 @@ def send(tensor, dst=0, group=None, use_calc_stream=True):
         'send')
 
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [tensor]},
-        attrs={
-            'ring_id': ring_id,
-            'peer': dst,
-            'use_calc_stream': use_calc_stream,
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': [tensor]},
+                     attrs={
+                         'ring_id': ring_id,
+                         'peer': dst,
+                         'use_calc_stream': use_calc_stream,
+                     })
 
 
 def recv(tensor, src=0, group=None, use_calc_stream=True):
@@ -1982,13 +1982,12 @@ def recv(tensor, src=0, group=None, use_calc_stream=True):
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         'recv')
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        outputs={'Out': [tensor]},
-        attrs={
-            'ring_id': ring_id,
-            'peer': src,
-            'out_shape': tensor.shape,
-            'dtype': tensor.dtype,
-            'use_calc_stream': use_calc_stream,
-        })
+    helper.append_op(type=op_type,
+                     outputs={'Out': [tensor]},
+                     attrs={
+                         'ring_id': ring_id,
+                         'peer': src,
+                         'out_shape': tensor.shape,
+                         'dtype': tensor.dtype,
+                         'use_calc_stream': use_calc_stream,
+                     })
diff --git a/python/paddle/distributed/elastic.py b/python/paddle/distributed/elastic.py
index 52f36a227f1c8..933550b75ad5b 100644
--- a/python/paddle/distributed/elastic.py
+++ b/python/paddle/distributed/elastic.py
@@ -18,6 +18,7 @@
 
 
 class Command(object):
+
     def __init__(self, server, name):
         import etcd3
 
@@ -47,8 +48,9 @@ def close(self):
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Elastic Command')
-    parser.add_argument(
-        "--elastic_server", type=str, help="etcd server host:port")
+    parser.add_argument("--elastic_server",
+                        type=str,
+                        help="etcd server host:port")
     parser.add_argument("--job_id", type=str, help="job unique id")
     parser.add_argument(
         "--np",
diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index ef0fff8283361..8c0394c9944fa 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -30,17 +30,11 @@
 from .base.topology import CommunicateTopology
 from .base.topology import HybridCommunicateGroup  # noqa: F401
 
-__all__ = [ #noqa
-      "CommunicateTopology",
-      "UtilBase",
-      "HybridCommunicateGroup",
-      "MultiSlotStringDataGenerator",
-      "UserDefinedRoleMaker",
-      "DistributedStrategy",
-      "Role",
-      "MultiSlotDataGenerator",
-      "PaddleCloudRoleMaker",
-      "Fleet"
+__all__ = [  #noqa
+    "CommunicateTopology", "UtilBase", "HybridCommunicateGroup",
+    "MultiSlotStringDataGenerator", "UserDefinedRoleMaker",
+    "DistributedStrategy", "Role", "MultiSlotDataGenerator",
+    "PaddleCloudRoleMaker", "Fleet"
 ]
 
 fleet = Fleet()
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
old mode 100644
new mode 100755
index 414edb9b66d8d..902854a7c7279
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -26,6 +26,7 @@
 
 
 def __non_auto_func_called__(func):
+
     def __impl__(*args, **kwargs):
         global non_auto_func_called
         non_auto_func_called = False
@@ -317,8 +318,8 @@ def a_sync(self, flag):
             self.a_sync_configs = {"k_steps": 0}
         else:
             raise ValueError(
-                "The type of `flag` is invalid, expected type is bool, but received {}".
-                format(type(flag)))
+                "The type of `flag` is invalid, expected type is bool, but received {}"
+                .format(type(flag)))
 
     @property
     def a_sync_configs(self):
@@ -429,8 +430,8 @@ def adam_d2sum(self, flag):
             self.strategy.adam_d2sum = flag
         else:
             raise ValueError(
-                "The type of `flag` is invalid, expected type is bool, but received {}".
-                format(type(flag)))
+                "The type of `flag` is invalid, expected type is bool, but received {}"
+                .format(type(flag)))
 
     @trainer_desc_configs.setter
     @is_strict_auto
@@ -492,8 +493,8 @@ def set_table_config(msg, config_name, configs, index=0):
                             data = getattr(msg, field.name).add()
                             set_table_config(data, name, configs, i)
                     else:
-                        set_table_config(
-                            getattr(msg, field.name), name, configs)
+                        set_table_config(getattr(msg, field.name), name,
+                                         configs)
                 else:
                     # print("not message:", name)
                     if name not in configs:
@@ -1022,7 +1023,8 @@ def find_unused_parameters(self, flag):
             self.strategy.find_unused_parameters = flag
         else:
             print(
-                "WARNING: find_unused_parameters should have value of bool type")
+                "WARNING: find_unused_parameters should have value of bool type"
+            )
 
     @property
     def _fuse_grad_size_in_TFLOPS(self):
@@ -1297,7 +1299,8 @@ def fuse_grad_size_in_num(self, num):
             self.strategy.fuse_grad_size_in_num = num
         else:
             print(
-                "WARNING: fuse_grad_size_in_num should have value of int32 type")
+                "WARNING: fuse_grad_size_in_num should have value of int32 type"
+            )
 
     @property
     def pipeline(self):
@@ -1318,6 +1321,18 @@ def pipeline(self):
         """
         return self.strategy.pipeline
 
+    @property
+    def is_fl_ps_mode(self):
+        return self.strategy.is_fl_ps_mode  # getter: returns the flag as stored on self.strategy, unmodified
+
+    @is_fl_ps_mode.setter
+    @is_strict_auto  # NOTE(review): decorator is defined outside this hunk - confirm how it wraps setters
+    def is_fl_ps_mode(self, flag):
+        if isinstance(flag, bool):  # only bool instances are written through to the strategy
+            self.strategy.is_fl_ps_mode = flag
+        else:
+            print("WARNING: is_fl_ps_mode should have value of bool type")  # non-bool: nothing is stored and nothing is raised here
+
     @pipeline.setter
     @is_strict_auto
     def pipeline(self, flag):
@@ -2113,8 +2128,8 @@ def __repr__(self):
         length = max_k + max_v + spacing
 
         h1_format = "    " + "|{{:^{}s}}|\n".format(length)
-        h2_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " *
-                                                               spacing, max_v)
+        h2_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(
+            max_k, " " * spacing, max_v)
 
         border = "    +" + "".join(["="] * length) + "+"
         line = "    +" + "".join(["-"] * length) + "+"
@@ -2146,17 +2161,17 @@ def __repr__(self):
                             config_fields = my_configs.DESCRIPTOR.fields
                             for ff in config_fields:
                                 if isinstance(
-                                        getattr(my_configs, ff.name),
-                                        google.protobuf.pyext._message.
-                                        RepeatedScalarContainer):
+                                        getattr(my_configs,
+                                                ff.name), google.protobuf.pyext.
+                                        _message.RepeatedScalarContainer):
                                     values = getattr(my_configs, ff.name)
                                     for i, v in enumerate(values):
                                         if i == 0:
-                                            draws += h2_format.format(ff.name,
-                                                                      str(v))
+                                            draws += h2_format.format(
+                                                ff.name, str(v))
                                         else:
-                                            draws += h2_format.format("",
-                                                                      str(v))
+                                            draws += h2_format.format(
+                                                "", str(v))
                                 else:
                                     draws += h2_format.format(
                                         ff.name,
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index a1c967ab0639c..762b961da53ba 100755
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -46,6 +46,7 @@
 
 
 class _RecomputeModelWrapper(paddle.nn.Layer):
+
     def __init__(self, model, segments=2, preserve_rng_state=True):
         super(_RecomputeModelWrapper, self).__init__()
         assert isinstance(model, paddle.nn.Sequential), (
@@ -58,6 +59,7 @@ def __init__(self, model, segments=2, preserve_rng_state=True):
         self._segment_size = len(self._layers) // segments
 
     def _run_func(self, begin, end):
+
         def do_run(input):
             for i in range(begin, end):
                 input = self._layers[i](input)
@@ -91,10 +93,10 @@ def apply_ir_passes(main_program, startup_program, config):
     fuse_all_reduce = config._user_defined_strategy.fuse_all_reduce_ops
     if fuse_all_reduce and build_strategy.fuse_all_optimizer_ops:
         # FIXME(zjl): currently, fuse_all_optimizer_ops
-        # have conflict with fuse_all_reduce_ops because 
-        # RawProgramOptimizer also inserts coalesce_tensor 
-        # into program. These two procedures may conflict  
-        # in which vars are to be fused. 
+        # have conflict with fuse_all_reduce_ops because
+        # RawProgramOptimizer also inserts coalesce_tensor
+        # into program. These two procedures may conflict
+        # in which vars are to be fused.
         warnings.warn(
             'Currently, the fuse_all_optimizer_ops pass has conflict with fuse_all_reduce_ops pass. Disable the fuse_all_optimizer_ops pass temporarily.'
         )
@@ -105,6 +107,7 @@ def apply_ir_passes(main_program, startup_program, config):
 
 
 def _inited_runtime_handler_(func):
+
     def __impl__(*args, **kwargs):
         cls = args[0]
 
@@ -117,6 +120,7 @@ def __impl__(*args, **kwargs):
 
 
 def _is_non_distributed_check_(func):
+
     def __impl__(*args, **kwargs):
         cls = args[0]
 
@@ -275,8 +279,8 @@ def init(self, role_maker=None, is_collective=False, strategy=None):
                 self._is_collective = role_maker._is_collective
             else:
                 raise ValueError(
-                    "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
-                    format(type(role_maker)))
+                    "`role_maker` should be subclass of `RoleMakerBase`, but got {}"
+                    .format(type(role_maker)))
         self._role_maker._generate_role()
 
         import paddle.distributed.fleet as fleet
@@ -352,8 +356,8 @@ def init(self, role_maker=None, is_collective=False, strategy=None):
 
             if use_tensor_parallel:
                 tensor_parallel_configs = self._user_defined_strategy.tensor_parallel_configs
-                mp_degree_tensor_parallel = int(tensor_parallel_configs[
-                    'tensor_parallel_degree'])
+                mp_degree_tensor_parallel = int(
+                    tensor_parallel_configs['tensor_parallel_degree'])
 
             if use_sharding and use_tensor_parallel:
                 assert mp_degree_sharding == mp_degree_tensor_parallel
@@ -773,14 +777,18 @@ def save(self, dirname, feed=[], fetch=[], **configs):
                 for name in fetch_var_names
             ]
 
-            self._runtime_handle._save_inference_model(
-                executor, dirname, feeded_var_names, fetch_vars, None, True, 0)
+            self._runtime_handle._save_inference_model(executor, dirname,
+                                                       feeded_var_names,
+                                                       fetch_vars, None, True,
+                                                       0)
         else:
             increment_mode = 0
             if "mode" in configs:
                 increment_mode = int(configs["mode"])
-            self._runtime_handle._save_persistables(
-                executor, dirname, main_program=None, mode=increment_mode)
+            self._runtime_handle._save_persistables(executor,
+                                                    dirname,
+                                                    main_program=None,
+                                                    mode=increment_mode)
 
     @is_non_distributed_check
     @inited_runtime_handler
@@ -815,9 +823,10 @@ def save_inference_model(self,
         #     "'save_inference_model' is a deprecated, will be deleted after v2.2.0, Please use fleet.save instead."
         # )
 
-        self._runtime_handle._save_inference_model(
-            executor, dirname, feeded_var_names, target_vars, main_program,
-            export_for_deployment, mode)
+        self._runtime_handle._save_inference_model(executor, dirname,
+                                                   feeded_var_names,
+                                                   target_vars, main_program,
+                                                   export_for_deployment, mode)
 
     @is_non_distributed_check
     @inited_runtime_handler
@@ -1000,12 +1009,11 @@ def forward(self, x):
             amp_enable = True
             amp_level = "O2" if strategy.amp_configs['use_pure_fp16'] else "O1"
             if amp_level.upper() == "O2":
-                model = paddle.amp.decorate(
-                    models=model,
-                    optimizers=None,
-                    level="O2",
-                    master_weight=None,
-                    save_dtype=None)
+                model = paddle.amp.decorate(models=model,
+                                            optimizers=None,
+                                            level="O2",
+                                            master_weight=None,
+                                            save_dtype=None)
             init_loss_scaling = strategy.amp_configs['init_loss_scaling']
             incr_ratio = strategy.amp_configs['incr_ratio']
             decr_ratio = strategy.amp_configs['decr_ratio']
@@ -1040,8 +1048,9 @@ def forward(self, x):
             return distributed_model
 
         if self._hcg.get_parallel_mode() == ParallelMode.SHARDING_PARALLEL:
-            model = ShardingParallel(
-                model, self._hcg, strategy=self._user_defined_strategy)
+            model = ShardingParallel(model,
+                                     self._hcg,
+                                     strategy=self._user_defined_strategy)
         elif self._hcg.get_parallel_mode() == ParallelMode.DATA_PARALLEL:
 
             # NOTE (JZ-LIANG) init parameters broadcast within sharding group
@@ -1060,11 +1069,13 @@ def forward(self, x):
                 find_unused_parameters=self._user_defined_strategy.
                 find_unused_parameters)
         elif self._hcg.get_parallel_mode() == ParallelMode.TENSOR_PARALLEL:
-            model = TensorParallel(
-                model, self._hcg, strategy=self._user_defined_strategy)
+            model = TensorParallel(model,
+                                   self._hcg,
+                                   strategy=self._user_defined_strategy)
         elif self._hcg.get_parallel_mode() == ParallelMode.PIPELINE_PARALLEL:
-            model = PipelineParallel(
-                model, self._hcg, strategy=self._user_defined_strategy)
+            model = PipelineParallel(model,
+                                     self._hcg,
+                                     strategy=self._user_defined_strategy)
 
         return model
 
@@ -1630,8 +1641,10 @@ def _minimize_impl(self,
                 self.origin_main_program).with_data_parallel(
                     loss_name=loss.name, share_vars_from=None)
             loss.block.program._graph = compiled_program
-            return self.user_defined_optimizer.minimize(
-                loss, startup_program, parameter_list, no_grad_set=no_grad_set)
+            return self.user_defined_optimizer.minimize(loss,
+                                                        startup_program,
+                                                        parameter_list,
+                                                        no_grad_set=no_grad_set)
 
         if meta_optimizer:
             # print("before minimize program id:", id(loss.block.program))
@@ -1765,6 +1778,7 @@ def _minimize_losses_impl(self,
 
     @dygraph_only
     def distributed_scaler(self, scaler):
+
         def unscale_method(self, optimizer):
             if not self._enable:
                 return
@@ -1789,13 +1803,13 @@ def unscale_method(self, optimizer):
                 ]
                 param_grads_fp16 = [
                     param._grad_ivar() for param in optimizer._parameter_list
-                    if (param._grad_ivar() is not None) and (param._grad_ivar(
-                    ).dtype == core.VarDesc.VarType.FP16)
+                    if (param._grad_ivar() is not None) and (
+                        param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
                 ]
                 param_grads_fp32 = [
                     param._grad_ivar() for param in optimizer._parameter_list
-                    if (param._grad_ivar() is not None) and (param._grad_ivar(
-                    ).dtype == core.VarDesc.VarType.FP32)
+                    if (param._grad_ivar() is not None) and (
+                        param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
                 ]
             temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
             temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
@@ -1811,11 +1825,12 @@ def unscale_method(self, optimizer):
             self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
             is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
 
-            # TODO(shenliang03) Since dp allreduce in the optimizer is 
-            # after the gradscaler, check_finite needs to synchronize global 
+            # TODO(shenliang03) Since dp allreduce in the optimizer is
+            # after the gradscaler, check_finite needs to synchronize global
             # information. In the future, we should use check_group to speed.
-            paddle.distributed.all_reduce(
-                is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)
+            paddle.distributed.all_reduce(is_found_inf,
+                                          op=paddle.distributed.ReduceOp.MAX,
+                                          group=None)
             self._found_inf = is_found_inf.numpy()[0]
 
         # Only tensor_parallel and pipeline_parallel need to modify scaler
diff --git a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
index 322989099c856..c2a3e4047b399 100755
--- a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
+++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py
@@ -26,6 +26,7 @@
 
 
 class MetaOptimizerFactory(object):
+
     def __init__(self):
         pass
 
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 860f7a52f39b8..36155bbf1a260 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -118,6 +118,7 @@ def init(self,
         self._http_server = http_server
 
     def _init_fs(self, fs_path, prefix):
+
         def init(rank, nodes, role):
             gloo = fluid.core.Gloo()
             gloo.set_rank(rank)
@@ -145,6 +146,7 @@ def init(rank, nodes, role):
             self._nodes_comm = gloo
 
     def _init_dfs(self, dfs_name, dfs_ugi, dfs_path, prefix):
+
         def init(rank, nodes, role):
             gloo = fluid.core.Gloo()
             gloo.set_rank(rank)
@@ -172,6 +174,7 @@ def init(rank, nodes, role):
             self._nodes_comm = gloo
 
     def _init_http(self, ip, port, prefix, start_http_server, http_server_d):
+
         def __start_kv_server(http_server_d, size_d):
             print("start http_server: {}, {}".format(port, size_d))
             from paddle.distributed.fleet.utils.http_server import KVServer
@@ -185,13 +188,15 @@ def __start_kv_server(http_server_d, size_d):
 
         def init_kv_server(http_server_d):
             worker_key = prefix + '_' + 'worker'
-            size_d = {worker_key: self._worker_num, }
+            size_d = {
+                worker_key: self._worker_num,
+            }
             print("worker_key:{}, size: {}".format(worker_key, size_d))
 
             http_server_d["running"] = True
             # child process for http server
-            _http_server = Process(
-                target=__start_kv_server, args=(http_server_d, size_d))
+            _http_server = Process(target=__start_kv_server,
+                                   args=(http_server_d, size_d))
             _http_server.daemon = True
             # set running status to True
             # start child process
@@ -224,7 +229,7 @@ def init(rank, nodes, role):
             self._worker_comm = gloo
         # TODO (sandyhouse): initialize gloo for server and all
 
-        # the closing of kv server may cause gloo init failure 
+        # the closing of kv server may cause gloo init failure
         # since it depend on the full mesh connection
         # e.g. 0 connected with 1,2,3 while 2-3 not connected yet
         # TODO(kuizhiqing)
@@ -517,6 +522,7 @@ def _barrier(self, comm_world):
 
 
 class PaddleCloudRoleMaker(RoleMakerBase):
+
     def __init__(self, is_collective=False, **kwargs):
         super(PaddleCloudRoleMaker, self).__init__()
         self._is_collective = is_collective
@@ -525,7 +531,7 @@ def __init__(self, is_collective=False, **kwargs):
         self._kwargs = kwargs
         self._role_is_generated = False
 
-        # for heterps  
+        # for heterps
         self._stage_id = 1
         self._stage_num = 1
         self._next_heter_trainer_endpoints = []
@@ -652,8 +658,8 @@ def _server_num(self):
         """
         if not self._role_is_generated:
             self._generate_role()
-        return len(self._get_pserver_endpoints(
-        )) if self._get_pserver_endpoints() is not None else 0
+        return len(self._get_pserver_endpoints()
+                   ) if self._get_pserver_endpoints() is not None else 0
 
     def _node_num(self):
         """
@@ -814,8 +820,8 @@ def _ps_env(self):
 
         if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
             raise ValueError(
-                "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
-                format(training_role))
+                "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment."
+                .format(training_role))
 
         # For Heter Parameter Server env setting
         next_heter_trainer_eplist = os.getenv(
@@ -832,8 +838,8 @@ def _ps_env(self):
 
             if previous_heter_trainer_eplist == "":
                 assert training_role in (
-                    "TRAINER", "PSERVER"
-                ), "training_role should be trainer or pserver"
+                    "TRAINER",
+                    "PSERVER"), "training_role should be trainer or pserver"
             else:
                 try:
                     self._previous_heter_trainer_endpoints = previous_heter_trainer_eplist.split(
@@ -896,7 +902,8 @@ def _ps_env(self):
                 self._stage_num = os.getenv("STAGE_NUM", None)
                 if self._stage_num == None:
                     raise ValueError(
-                        "Can not find STAGE_NUM, please check your environment.")
+                        "Can not find STAGE_NUM, please check your environment."
+                    )
                 self._stage_num = int(self._stage_num)
                 self._stage_trainers = os.getenv("PADDLE_STAGE_TRAINERS_NUM",
                                                  None)
@@ -955,8 +962,8 @@ def _ps_env(self):
                     "Can not find HETER_DEVICE_TYPE, please check your environment."
                 )
             assert self._heter_trainer_device_type in (
-                "cpu", "gpu", "xpu"
-            ), "HETER_DEVICE_TYPE should be cpu,gpu or xpu"
+                "cpu", "gpu",
+                "xpu"), "HETER_DEVICE_TYPE should be cpu,gpu or xpu"
             if self._heter_trainer_device_type == "gpu":
                 heter_device_id = os.getenv("FLAGS_selected_gpus", "0")
                 self._heter_trainer_device = ":".join(
@@ -1068,14 +1075,13 @@ def _gloo_init(self):
         print("Gloo init with {}: need_init_all: {}, args: {}".format(
             type, need_init_all, kwargs))
 
-        self._gloo.init(
-            rendezvous=rendezvous_type,
-            role=self._role,
-            role_id=self._role_id(),
-            worker_num=self._worker_num(),
-            server_num=self._server_num(),
-            need_init_all=need_init_all,
-            kwargs=kwargs)
+        self._gloo.init(rendezvous=rendezvous_type,
+                        role=self._role,
+                        role_id=self._role_id(),
+                        worker_num=self._worker_num(),
+                        server_num=self._server_num(),
+                        need_init_all=need_init_all,
+                        kwargs=kwargs)
 
         if rendezvous_type == Gloo.RENDEZVOUS.HTTP:
             http_server_d['running'] = False
@@ -1095,9 +1101,11 @@ def _generate_role(self):
 
 
 class UserDefinedRoleMaker(PaddleCloudRoleMaker):
+
     def __init__(self, is_collective=False, init_gloo=False, **kwargs):
-        super(UserDefinedRoleMaker, self).__init__(
-            is_collective=is_collective, init_gloo=init_gloo, **kwargs)
+        super(UserDefinedRoleMaker, self).__init__(is_collective=is_collective,
+                                                   init_gloo=init_gloo,
+                                                   **kwargs)
         self._init_gloo = init_gloo
 
     def _user_defined_ps_env(self):
diff --git a/python/paddle/distributed/fleet/base/runtime_factory.py b/python/paddle/distributed/fleet/base/runtime_factory.py
index b162a9fea6837..79dac6716cb26 100644
--- a/python/paddle/distributed/fleet/base/runtime_factory.py
+++ b/python/paddle/distributed/fleet/base/runtime_factory.py
@@ -19,6 +19,7 @@
 
 
 class RuntimeFactory(object):
+
     def __init__(self):
         pass
 
diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py
index b90e5b2bff7bf..823061f903543 100644
--- a/python/paddle/distributed/fleet/base/strategy_compiler.py
+++ b/python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -107,6 +107,7 @@ def maximum_path_len_algo(optimizer_list):
 
 
 class StrategyCompilerBase(object):
+
     def __init__(self):
         pass
 
@@ -192,15 +193,14 @@ def generate_optimizer(self, loss, role_maker, optimizer,
             self._meta_optimizers = [] if meta_optimizers is None else meta_optimizers
             self._graph_optimizers = [] if graph_optimizers is None else graph_optimizers
 
-            return_meta = None if meta_optimizers == None else meta_optimizers[
-                0]
+            return_meta = None if meta_optimizers == None else meta_optimizers[0]
             return_graph = None if graph_optimizers == None else graph_optimizers[
                 0]
 
             if meta_optimizers == None or graph_optimizers == None:
                 return return_meta, return_graph
 
-            # do heuristic filter here, if any meta optimizer in graph optimizers is in 
+            # do heuristic filter here, if any meta optimizer in graph optimizers is in
             # any meta optimizers' black list, set return_graph to None
             need_graph_opt = True
             for graph_opt in graph_optimizers:
diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py
index ef34fd144a703..aef9c85adfb5c 100644
--- a/python/paddle/distributed/fleet/base/topology.py
+++ b/python/paddle/distributed/fleet/base/topology.py
@@ -50,6 +50,7 @@ class ParallelMode(object):
 
 
 class CommunicateTopology(object):
+
     def __init__(self,
                  hybrid_group_names=["data", "pipe", "sharding", "model"],
                  dims=[1, 1, 1, 1]):
@@ -131,6 +132,7 @@ def get_rank_from_stage(self, global_rank, **kwargs):
 
 
 class HybridCommunicateGroup(object):
+
     def __init__(self, topology):
         self.nranks = paddle.distributed.get_world_size()
         self.global_rank = paddle.distributed.get_rank()
@@ -189,7 +191,7 @@ def __init__(self, topology):
 
     def get_parallel_mode(self):
         # there are four modes : DataParallel / TensorParallel / PipelineParallel / ShardingParallel
-        # NOTE when sharding conjugates with other parallel, sharding should act like a optimizer and 
+        # NOTE when sharding conjugates with other parallel, sharding should act like a optimizer and
         # adding its parallel logic within that parallelism
         # when use sharding alone, it should have its own parallelism for its parallel logic
         # TODO modify 3 others parallel to support sharding
@@ -349,8 +351,9 @@ def get_check_parallel_group(self):
         return self._check_comm_group
 
     def get_rank_from_stage(self, stage_id, **kwargs):
-        return self._topo.get_rank_from_stage(
-            self.global_rank, pipe=stage_id, **kwargs)
+        return self._topo.get_rank_from_stage(self.global_rank,
+                                              pipe=stage_id,
+                                              **kwargs)
 
 
 class _CommunicateGroup(object):
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
old mode 100644
new mode 100755
index de101cd74c4e8..6705eb36bf348
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -32,6 +32,7 @@
 
 
 class UtilFactory(object):
+
     def _create_util(self, context=None):
         util = UtilBase()
         if context is not None and "valid_strategy" in context:
@@ -42,6 +43,7 @@ def _create_util(self, context=None):
 
 
 class UtilBase(object):
+
     def __init__(self):
         self.role_maker = None
         self.dist_strategy = None
@@ -204,6 +206,26 @@ def _broadcast(self):
     def _scatter(self):
         pass
 
+    def get_heter_file_shard(self, files):
+        """Return one contiguous shard of `files`, picked via the role maker.
+
+        `files` is cut into `_worker_num()` shards, the first
+        `len(files) % _worker_num()` of them one file longer, and shard
+        `_worker_index() - _worker_num()` is returned. Non-lists: TypeError.
+        """
+        if not isinstance(files, list):
+            raise TypeError("files should be a list of file need to be read.")
+        trainers = self.role_maker._worker_num()
+        trainer_id = self.role_maker._worker_index() - trainers
+        # divmod keeps the arithmetic in exact integers (no float division).
+        blocksize, remainder = divmod(len(files), trainers)
+        shards, begin = [], 0
+        for i in range(trainers):
+            size = blocksize + (1 if i < remainder else 0)
+            shards.append(files[begin:begin + size])
+            begin += size
+        return shards[trainer_id]
+
     def get_file_shard(self, files):
         """
         Split files before distributed training, and return filelist assigned to the current trainer.
@@ -301,6 +323,7 @@ def _save_program(self, program, model_filename='__model__', is_text=False):
                 f.write(program.desc.serialize_to_string())
 
     def _load_program(self, path, is_text):
+
         def load_program_binary(path):
             """load program from binary string file"""
             with open(path, "rb") as f:
@@ -324,8 +347,8 @@ def load_program_text(path):
     def _program_type_trans(self, prog_dir, prog_fn, is_text):
         prog = self._load_program(os.path.join(prog_dir, prog_fn), is_text)
         prog_out_fn = prog_fn + ".bin" if is_text else prog_fn + ".pbtxt"
-        self._save_program(prog,
-                           os.path.join(prog_dir, prog_out_fn), 1 - is_text)
+        self._save_program(prog, os.path.join(prog_dir, prog_out_fn),
+                           1 - is_text)
         return prog_out_fn
 
     def _visualize_graphviz(self, program, output_dir, output_filename):
@@ -334,11 +357,10 @@ def _visualize_graphviz(self, program, output_dir, output_filename):
         pdf_path = os.path.join(output_dir, output_filename + '.pdf')
         debugger.draw_block_graphviz(block, path=dot_path)
         cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path]
-        p = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        p = subprocess.Popen(cmd,
+                             stdin=subprocess.PIPE,
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE)
         p.wait()
 
     def _proto_check(self, config):
@@ -375,14 +397,16 @@ def _proto_check(self, config):
                 continue
             if var.shape != train_prog_var.shape or var.dtype != train_prog_var.dtype:
                 print(
-                    "variable: {} not match. in pruned program shape: {} dtype:{}, in train program shape: {} dtype: {}".
-                    format(var_name, var.shape, var.dtype, train_prog_var.shape,
-                           train_prog_var.dtype))
+                    "variable: {} not match. in pruned program shape: {} dtype:{}, in train program shape: {} dtype: {}"
+                    .format(var_name, var.shape, var.dtype,
+                            train_prog_var.shape, train_prog_var.dtype))
                 is_match = False
         return is_match
 
     def _params_check(self, config):
+
         def feed_gen(batch_size, feeded_vars_dims, feeded_vars_filelist):
+
             def reader(batch_size, fn, dim):
                 data = []
                 if isinstance(dim, list) or isinstance(dim, tuple):
@@ -435,8 +459,8 @@ def check_not_expected_ops(prog, not_expected_op_types):
         not_expected_op_types = check_not_expected_ops(prog, ["lookup_table"])
         if len(not_expected_op_types) > 0:
             print(
-                "find op type '{}' in program, please check if your program is pruned correctly !".
-                format(list(not_expected_op_types)))
+                "find op type '{}' in program, please check if your program is pruned correctly !"
+                .format(list(not_expected_op_types)))
             return False
 
         place = fluid.CPUPlace()
@@ -461,8 +485,8 @@ def check_not_expected_ops(prog, not_expected_op_types):
                 if new_shape != orig_shape:
                     raise RuntimeError(
                         "Shape not matching: the Program requires a parameter with a shape of ({}), "
-                        "while the loaded parameter (namely [ {} ]) has a shape of  ({}).".
-                        format(orig_shape, each_var.name, new_shape))
+                        "while the loaded parameter (namely [ {} ]) has a shape of  ({})."
+                        .format(orig_shape, each_var.name, new_shape))
 
             # check feed/fetch vars in program and config
             feed_config = config.feed_config
@@ -476,8 +500,8 @@ def check_not_expected_ops(prog, not_expected_op_types):
             feed_name_list = feed_target_names
             if feed_config.feeded_vars_names is not None and feed_target_names != feed_config.feeded_vars_names:
                 print(
-                    "warning! feed vars in program and config are diff: feed in program: {}. feed in config {}.".
-                    format(feed_target_names, feed_config.feeded_vars_names))
+                    "warning! feed vars in program and config are diff: feed in program: {}. feed in config {}."
+                    .format(feed_target_names, feed_config.feeded_vars_names))
                 feed_name_list = feed_config.feeded_vars_names
                 # remove feed op in inference_program. new feed op will be added in exe.run
                 global_block = inference_program.global_block()
@@ -490,8 +514,8 @@ def check_not_expected_ops(prog, not_expected_op_types):
                     global_block._remove_op(index)
             if fetch_config.fetch_vars_names is not None and fetch_targets_names != fetch_config.fetch_vars_names:
                 print(
-                    "warning! fetch vars in program and config are diff: fetch in program: {}. fetch in config {}.".
-                    format(fetch_targets_names, fetch_config.fetch_vars_names))
+                    "warning! fetch vars in program and config are diff: fetch in program: {}. fetch in config {}."
+                    .format(fetch_targets_names, fetch_config.fetch_vars_names))
                 fetch_list = [
                     inference_program.global_block().var(i)
                     for i in fetch_config.fetch_vars_names
@@ -527,9 +551,9 @@ def check_not_expected_ops(prog, not_expected_op_types):
                 var_shape = var.shape[1:]
                 if tensor_shape != var_shape:
                     raise RuntimeError(
-                        "feed variable '{}' shape not match. infer program  shape: {}. feed tensor shape: {}".
-                        format(feed_config.feeded_vars_names[i], var_shape,
-                               tensor_shape))
+                        "feed variable '{}' shape not match. infer program  shape: {}. feed tensor shape: {}"
+                        .format(feed_config.feeded_vars_names[i], var_shape,
+                                tensor_shape))
 
             if not feed_config.feeded_vars_filelist:
                 print("generate random feed vars.")
@@ -539,20 +563,19 @@ def check_not_expected_ops(prog, not_expected_op_types):
                     # create fake feed tensor. if lod_level > 1, should create_lod_tensor()
                     if var.lod_level == 0:
                         feed_tensors.append(
-                            np.array(
-                                np.random.random(
-                                    tuple([config.batch_size] + list(
-                                        feed_config.feeded_vars_dims[i]))),
-                                dtype=feed_config.feeded_vars_types[i]))
+                            np.array(np.random.random(
+                                tuple([config.batch_size] +
+                                      list(feed_config.feeded_vars_dims[i]))),
+                                     dtype=feed_config.feeded_vars_types[i]))
                     elif var.lod_level == 1:
-                        t = np.array(
-                            np.random.random(
-                                tuple([config.batch_size] + list(
-                                    feed_config.feeded_vars_dims[i]))),
-                            dtype=feed_config.feeded_vars_types[i])
+                        t = np.array(np.random.random(
+                            tuple([config.batch_size] +
+                                  list(feed_config.feeded_vars_dims[i]))),
+                                     dtype=feed_config.feeded_vars_types[i])
                         feed_tensors.append(
-                            fluid.create_lod_tensor(t, [[1] * config.batch_size
-                                                        ], place))
+                            fluid.create_lod_tensor(t,
+                                                    [[1] * config.batch_size],
+                                                    place))
                     else:
                         raise RuntimeError(
                             "vars with lod_level >= 2 is not supported now in this infer program check tool."
diff --git a/python/paddle/distributed/fleet/cloud_utils.py b/python/paddle/distributed/fleet/cloud_utils.py
index 0b1169e442263..3b3097bfaa4f0 100644
--- a/python/paddle/distributed/fleet/cloud_utils.py
+++ b/python/paddle/distributed/fleet/cloud_utils.py
@@ -61,8 +61,8 @@ def get_cloud_cluster(args_node_ips,
 
                 if paddle_ports_num >= len(
                         devices_per_proc) and paddle_port != args_port:
-                    logger.warning("Use Cloud specified port:{}.".format(
-                        paddle_port))
+                    logger.warning(
+                        "Use Cloud specified port:{}.".format(paddle_port))
                     started_port = paddle_port
 
             except Exception as e:
@@ -82,12 +82,13 @@ def get_cloud_cluster(args_node_ips,
         trainer_endpoints = []
         assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori)
         for i in range(num_nodes):
-            trainer_endpoints.append(trainer_endpoints_ori[
-                i * paddle_ports_num:(i + 1) * paddle_ports_num])
+            trainer_endpoints.append(
+                trainer_endpoints_ori[i * paddle_ports_num:(i + 1) *
+                                      paddle_ports_num])
 
     logger.debug("parsed from args: node_ips:{} \
-        node_ip:{} node_rank:{} trainer_endpoints:{}"
-                 .format(node_ips, node_ip, node_rank, trainer_endpoints))
+        node_ip:{} node_rank:{} trainer_endpoints:{}".format(
+        node_ips, node_ip, node_rank, trainer_endpoints))
 
     cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints,
                                device_mode, devices_per_proc)
diff --git a/python/paddle/distributed/fleet/data_generator/data_generator.py b/python/paddle/distributed/fleet/data_generator/data_generator.py
index cceb81838c1d2..47d9e4cc8ef0d 100644
--- a/python/paddle/distributed/fleet/data_generator/data_generator.py
+++ b/python/paddle/distributed/fleet/data_generator/data_generator.py
@@ -237,6 +237,7 @@ def local_iter():
 # add more generalized DataGenerator that can adapt user-defined slot
 # for example, [(name, float_list), (name, str_list), (name, int_list)]
 class MultiSlotStringDataGenerator(DataGenerator):
+
     def _gen_str(self, line):
         '''
         Further processing the output of the process() function rewritten by
@@ -281,6 +282,7 @@ def _gen_str(self, line):
 
 
 class MultiSlotDataGenerator(DataGenerator):
+
     def _gen_str(self, line):
         '''
         Further processing the output of the process() function rewritten by
@@ -338,8 +340,8 @@ def _gen_str(self, line):
                 for elem in elements:
                     if isinstance(elem, float):
                         self._proto_info[-1] = (name, "float")
-                    elif not isinstance(elem, int) and not isinstance(elem,
-                                                                      long):
+                    elif not isinstance(elem, int) and not isinstance(
+                            elem, long):
                         raise ValueError(
                             "the type of element%s must be in int or float" %
                             type(elem))
@@ -347,7 +349,8 @@ def _gen_str(self, line):
         else:
             if len(line) != len(self._proto_info):
                 raise ValueError(
-                    "the complete field set of two given line are inconsistent.")
+                    "the complete field set of two given line are inconsistent."
+                )
             for index, item in enumerate(line):
                 name, elements = item
                 if not isinstance(name, str):
@@ -370,8 +373,8 @@ def _gen_str(self, line):
                     if self._proto_info[index][1] != "float":
                         if isinstance(elem, float):
                             self._proto_info[index] = (name, "float")
-                        elif not isinstance(elem, int) and not isinstance(elem,
-                                                                          long):
+                        elif not isinstance(elem, int) and not isinstance(
+                                elem, long):
                             raise ValueError(
                                 "the type of element%s must be in int or float"
                                 % type(elem))
diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py
index 235f4ece62d00..2983457b8a7b7 100644
--- a/python/paddle/distributed/fleet/dataset/dataset.py
+++ b/python/paddle/distributed/fleet/dataset/dataset.py
@@ -322,10 +322,10 @@ def _check_use_var_with_data_generator(self, var_list, data_generator_class,
                                 "Please check if var's type in data_generator is correct."
                                 % (ele[0], "float", ele[1]))
 
-                        if (var_list[i].dtype == core.VarDesc.VarType.INT64 or
-                                var_list[i].dtype == core.VarDesc.VarType.INT32
-                            ) and not all(
-                                isinstance(ele, int) for ele in ele[1]):
+                        if (var_list[i].dtype == core.VarDesc.VarType.INT64
+                                or var_list[i].dtype
+                                == core.VarDesc.VarType.INT32) and not all(
+                                    isinstance(ele, int) for ele in ele[1]):
                             raise TypeError(
                                 "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-int value, which is %s \n"
                                 "Please check if order of var_list and data_generator are aligned. \n"
@@ -583,15 +583,14 @@ def init(self, **kwargs):
         pipe_command = kwargs.get("pipe_command", "cat")
         download_cmd = kwargs.get("download_cmd", "cat")
 
-        super(InMemoryDataset, self).init(
-            batch_size=batch_size,
-            thread_num=thread_num,
-            use_var=use_var,
-            pipe_command=pipe_command,
-            input_type=input_type,
-            fs_name=fs_name,
-            fs_ugi=fs_ugi,
-            download_cmd=download_cmd)
+        super(InMemoryDataset, self).init(batch_size=batch_size,
+                                          thread_num=thread_num,
+                                          use_var=use_var,
+                                          pipe_command=pipe_command,
+                                          input_type=input_type,
+                                          fs_name=fs_name,
+                                          fs_ugi=fs_ugi,
+                                          download_cmd=download_cmd)
 
         data_feed_type = kwargs.get("data_feed_type",
                                     "MultiSlotInMemoryDataFeed")
@@ -779,8 +778,9 @@ def _set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
 
     def _generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
                                       consume_thread_num, shard_num):
-        self.dataset.generate_local_tables_unlock(
-            table_id, fea_dim, read_thread_num, consume_thread_num, shard_num)
+        self.dataset.generate_local_tables_unlock(table_id, fea_dim,
+                                                  read_thread_num,
+                                                  consume_thread_num, shard_num)
 
     def set_date(self, date):
         """
diff --git a/python/paddle/distributed/fleet/dataset/index_dataset.py b/python/paddle/distributed/fleet/dataset/index_dataset.py
index c4c424fe2dc7e..8b5a9c5a45bf6 100644
--- a/python/paddle/distributed/fleet/dataset/index_dataset.py
+++ b/python/paddle/distributed/fleet/dataset/index_dataset.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,11 +17,13 @@
 
 
 class Index(object):
+
     def __init__(self, name):
         self._name = name
 
 
 class TreeIndex(Index):
+
     def __init__(self, name, path):
         super(TreeIndex, self).__init__(name)
         self._wrapper = core.IndexWrapper()
diff --git a/python/paddle/distributed/fleet/elastic/__init__.py b/python/paddle/distributed/fleet/elastic/__init__.py
index 503d2966a80e7..b80a66c6f01d0 100644
--- a/python/paddle/distributed/fleet/elastic/__init__.py
+++ b/python/paddle/distributed/fleet/elastic/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/fleet/elastic/collective.py b/python/paddle/distributed/fleet/elastic/collective.py
index de350e15d35c0..f27987571d8d2 100644
--- a/python/paddle/distributed/fleet/elastic/collective.py
+++ b/python/paddle/distributed/fleet/elastic/collective.py
@@ -23,6 +23,7 @@
 
 
 class CollectiveLauncher(LauncherInterface):
+
     def __init__(self, args):
         self.args = args
         self.procs = []
diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py
index 1716e332c8286..e0a6bd81c8ee8 100644
--- a/python/paddle/distributed/fleet/elastic/manager.py
+++ b/python/paddle/distributed/fleet/elastic/manager.py
@@ -59,6 +59,7 @@ class ElasticStatus:
 
 
 class LauncherInterface(object):
+
     def __init__(self, args):
         self.args = args
         self.procs = []
@@ -109,8 +110,8 @@ def _check_procs(self):
                     return ret
                 logger.error("ABORT!!! ABORT!!! ABORT!!!")
                 logger.error(
-                    "ERROR rank {} error with exit code {}, check log for detail.".
-                    format(p.rank, ret))
+                    "ERROR rank {} error with exit code {}, check log for detail."
+                    .format(p.rank, ret))
                 result = ret
         if not alive and result is None:
             return 0
@@ -128,6 +129,7 @@ def watch(self):
 
 
 class ElasticManager(object):
+
     def __init__(self, args, etcd_client):
 
         self.args = args
@@ -238,12 +240,13 @@ def host_call_back(event):
             ]
             self.hosts = list(set(self.hosts)) if self.hosts else self.hosts
             logger.info(
-                f"host_call_back curr_host={self.curr_host}, hosts:{self.hosts}")
+                f"host_call_back curr_host={self.curr_host}, hosts:{self.hosts}"
+            )
             self.need_sync = True
             self.elastic_startup_time = None
 
-        host_watch = self.etcd.add_watch_prefix_callback(self.node_prefix,
-                                                         host_call_back)
+        host_watch = self.etcd.add_watch_prefix_callback(
+            self.node_prefix, host_call_back)
         host_lease = self.etcd.lease(elastic_ttl)
 
         # register etcd lease heartbeat
@@ -267,13 +270,15 @@ def lease_heartbeat():
                                       six.b(self.curr_host),
                                       lease=host_lease)
                 except Exception as e:
-                    logger.error("[lease_heartbeat] internal error:{} {}".
-                                 format(e, traceback.format_exc()))
+                    logger.error(
+                        "[lease_heartbeat] internal error:{} {}".format(
+                            e, traceback.format_exc()))
                     break
                 time.sleep(elastic_ttl / 3)
 
-        keepalived_thread = threading.Thread(
-            name='lease_heartbeat', target=lease_heartbeat, daemon=True)
+        keepalived_thread = threading.Thread(name='lease_heartbeat',
+                                             target=lease_heartbeat,
+                                             daemon=True)
         keepalived_thread.start()
 
         self.etcd.put(self.host_path, six.b(self.curr_host), lease=host_lease)
@@ -300,7 +305,7 @@ def endpoints_call_back(event):
     def _host_to_endpoints(self,
                            ip_port_list: list,
                            devices_per_proc: list,
-                           start_port: int=6170) -> str:
+                           start_port: int = 6170) -> str:
         endpoint_list = []
         for ip_port in ip_port_list:
             endpoints = ip_port.split(":")
@@ -343,12 +348,11 @@ def pre_hook(self):
             return
         logger.info("execute pre_hook...")
         current_env = copy.copy(os.environ.copy())
-        out, err = subprocess.Popen(
-            self.args.elastic_pre_hook,
-            env=current_env,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            shell=True).communicate()
+        out, err = subprocess.Popen(self.args.elastic_pre_hook,
+                                    env=current_env,
+                                    stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE,
+                                    shell=True).communicate()
         if err:
             logger.warn("pre_hook exec failed")
         else:
@@ -390,7 +394,7 @@ def _completed(self):
 
         return int(self.etcd.get(self.prefix)[0]) == 1
 
-    def _match(self, host_list: list=None):
+    def _match(self, host_list: list = None):
         if host_list:
             self.hosts = host_list
         else:
@@ -449,7 +453,7 @@ def _update_fault_tolrance(self):
             logger.info("update env PADDLE_TRAINERS {} ".format(self.trainers))
             return
 
-        # fault tolerance 
+        # fault tolerance
         idx = self.hosts.index(self.curr_host)
 
         # swap if self.host not in the right position
@@ -490,7 +494,7 @@ def _update_elastic_scale_in(self):
         )
 
         # If scale in node from the first of the rank list, you need to minimize the movement of the rank
-        # eg: 
+        # eg:
         #   the source trainers is:10.10.10.0,10.10.10.1,10.10.10.2,10.10.10.3
         #   10.10.10.0 is removed
         #   the new trainers is:10.10.10.3,10.10.10.1,10.10.10.2
@@ -557,8 +561,8 @@ def wait(self):
                 logger.info('ready with hosts {}'.format(self.hosts))
                 self._update_hosts()
                 return
-            logger.info('not ready for np {} with hosts {}'.format(self.np,
-                                                                   self.hosts))
+            logger.info('not ready for np {} with hosts {}'.format(
+                self.np, self.hosts))
             idx += 1
             time.sleep(2)
         return
diff --git a/python/paddle/distributed/fleet/fleet_executor_utils.py b/python/paddle/distributed/fleet/fleet_executor_utils.py
index 67b4b5e8fe216..0e3a95397e363 100644
--- a/python/paddle/distributed/fleet/fleet_executor_utils.py
+++ b/python/paddle/distributed/fleet/fleet_executor_utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -54,11 +54,10 @@ def __init__(self,
         if ops is not None:
             assert role is not None and task_id is not None, \
                 "If init task node with ops, should provide `role` and `task_id`."
-            self.node = core.TaskNode(role, ops, cur_rank,
-                                      int(task_id), max_run_times,
-                                      max_slot_times)
-            print("Creating task node by ops. The role is:",
-                  self.role(), "and the id is:", self.task_id())
+            self.node = core.TaskNode(role, ops, cur_rank, int(task_id),
+                                      max_run_times, max_slot_times)
+            print("Creating task node by ops. The role is:", self.role(),
+                  "and the id is:", self.task_id())
         else:
             self.program = program
             self.node = core.TaskNode(program.desc, cur_rank, max_run_times,
@@ -218,39 +217,35 @@ def run1f1b(program, cur_rank, max_run_times, dist_opt, nrank):
     # Create task nodes.
     # The lr_sched and opt should be 'amplifier interceptor.
     # The fwd and bwd should be 'compute interceptor'.
-    lr_task_node = TaskNode(
-        cur_rank=cur_rank,
-        max_run_times=max_run_times,
-        max_slot_times=max_slot_times,
-        role=int(OpRole.Optimize.LRSched),
-        ops=lr_ops,
-        task_id=int(cur_rank * num_of_functionality + 0),
-        node_type="Amplifier")
+    lr_task_node = TaskNode(cur_rank=cur_rank,
+                            max_run_times=max_run_times,
+                            max_slot_times=max_slot_times,
+                            role=int(OpRole.Optimize.LRSched),
+                            ops=lr_ops,
+                            task_id=int(cur_rank * num_of_functionality + 0),
+                            node_type="Amplifier")
     lr_task_node.set_run_pre_steps(max_run_times)
-    fwd_task_node = TaskNode(
-        cur_rank=cur_rank,
-        max_run_times=max_run_times,
-        max_slot_times=max_slot_times,
-        role=int(OpRole.Forward),
-        ops=fwd_ops,
-        task_id=int(cur_rank * num_of_functionality + 1),
-        node_type="Compute")
-    bwd_task_node = TaskNode(
-        cur_rank=cur_rank,
-        max_run_times=max_run_times,
-        max_slot_times=max_slot_times,
-        role=int(OpRole.Backward),
-        ops=bwd_ops,
-        task_id=int(cur_rank * num_of_functionality + 2),
-        node_type="Compute")
-    opt_task_node = TaskNode(
-        cur_rank=cur_rank,
-        max_run_times=max_run_times,
-        max_slot_times=max_slot_times,
-        role=int(OpRole.Optimize),
-        ops=opt_ops,
-        task_id=int(cur_rank * num_of_functionality + 3),
-        node_type="Amplifier")
+    fwd_task_node = TaskNode(cur_rank=cur_rank,
+                             max_run_times=max_run_times,
+                             max_slot_times=max_slot_times,
+                             role=int(OpRole.Forward),
+                             ops=fwd_ops,
+                             task_id=int(cur_rank * num_of_functionality + 1),
+                             node_type="Compute")
+    bwd_task_node = TaskNode(cur_rank=cur_rank,
+                             max_run_times=max_run_times,
+                             max_slot_times=max_slot_times,
+                             role=int(OpRole.Backward),
+                             ops=bwd_ops,
+                             task_id=int(cur_rank * num_of_functionality + 2),
+                             node_type="Compute")
+    opt_task_node = TaskNode(cur_rank=cur_rank,
+                             max_run_times=max_run_times,
+                             max_slot_times=max_slot_times,
+                             role=int(OpRole.Optimize),
+                             ops=opt_ops,
+                             task_id=int(cur_rank * num_of_functionality + 3),
+                             node_type="Amplifier")
     opt_task_node.set_run_pre_steps(max_run_times)
     opt_task_node.set_run_at_offset(max_run_times - 1)
     task_nodes = [lr_task_node, fwd_task_node, bwd_task_node, opt_task_node]
@@ -318,8 +313,10 @@ def origin(program, cur_rank):
         task_id_to_rank (dict): a fake dict, since there is no upstream or downstream, this dict won't be used
     """
     print("fleet executor will use python side origin scheduler.")
-    task_node = TaskNode(
-        program=program, cur_rank=cur_rank, max_run_times=1, max_slot_times=1)
+    task_node = TaskNode(program=program,
+                         cur_rank=cur_rank,
+                         max_run_times=1,
+                         max_slot_times=1)
     task_node.set_type("Compute")
     task_id = task_node.task_id()
     task_id_to_rank = {task_id: cur_rank}
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 343cca7f4f0d3..583043c186abf 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -166,13 +166,12 @@ def _parse_args():
         )
         base_group.add_argument("--selected_mlus", dest="mlus")
 
-    base_group.add_argument(
-        "training_script",
-        type=str,
-        help="The full path to the single GPU training "
-        "program/script to be launched in parallel, "
-        "followed by all the arguments for the "
-        "training script")
+    base_group.add_argument("training_script",
+                            type=str,
+                            help="The full path to the single GPU training "
+                            "program/script to be launched in parallel, "
+                            "followed by all the arguments for the "
+                            "training script")
 
     base_group.add_argument('training_script_args', nargs=REMAINDER)
 
@@ -204,10 +203,14 @@ def _parse_args():
 
     ps_group = parser.add_argument_group("Parameter-Server Parameters")
     # for parameter server
-    ps_group.add_argument(
-        "--servers", type=str, default="", help="User defined servers ip:port")
-    ps_group.add_argument(
-        "--workers", type=str, default="", help="User defined workers ip:port")
+    ps_group.add_argument("--servers",
+                          type=str,
+                          default="",
+                          help="User defined servers ip:port")
+    ps_group.add_argument("--workers",
+                          type=str,
+                          default="",
+                          help="User defined workers ip:port")
     ps_group.add_argument(
         "--heter_workers",
         type=str,
@@ -221,26 +224,30 @@ def _parse_args():
 
     ps_group.add_argument("--worker_num", type=int, help="number of workers")
     ps_group.add_argument("--server_num", type=int, help="number of servers")
-    ps_group.add_argument(
-        "--heter_worker_num",
-        type=str,
-        help="number of heter_workers in each stage 1;2;3")
+    ps_group.add_argument("--heter_worker_num",
+                          type=str,
+                          help="number of heter_workers in each stage 1;2;3")
     ps_group.add_argument("--http_port", type=int, help="Gloo http Port")
 
     # parameter elastic mode
     elastic_group = parser.add_argument_group("Elastic Parameters")
-    elastic_group.add_argument(
-        "--elastic_server", type=str, help="etcd server host:port")
-    elastic_group.add_argument(
-        "--elastic_pre_hook", type=str, help="elastic pre_hook shell cmd")
+    elastic_group.add_argument("--elastic_server",
+                               type=str,
+                               help="etcd server host:port")
+    elastic_group.add_argument("--elastic_pre_hook",
+                               type=str,
+                               help="elastic pre_hook shell cmd")
 
     elastic_group.add_argument("--job_id", type=str, help="job unique id")
     elastic_group.add_argument("--np", type=int, help="job pod/node number")
     elastic_group.add_argument("--scale", type=int, default=0, help="scale np")
-    elastic_group.add_argument(
-        "--host", type=str, help="bind host, default to POD_IP env")
-    elastic_group.add_argument(
-        "--force", type=bool, default=False, help="update np force")
+    elastic_group.add_argument("--host",
+                               type=str,
+                               help="bind host, default to POD_IP env")
+    elastic_group.add_argument("--force",
+                               type=bool,
+                               default=False,
+                               help="update np force")
 
     known_args, _ = parser.parse_known_args()
     return known_args
@@ -351,15 +358,16 @@ def get_cluster_info(args):
             cluster, pod = launch_utils.get_mapped_cluster_from_args_with_rank_mapping(
                 args, device_mode)
     elif cloud_utils.use_paddlecloud() and trainers_num != 1:
-        cluster, pod = cloud_utils.get_cloud_cluster(
-            args.ips, device_mode, devices_per_proc, start_port)
+        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, device_mode,
+                                                     devices_per_proc,
+                                                     start_port)
         logger.debug("get cluster from cloud:{}".format(cluster))
     elif device_mode == DeviceMode.ASCEND_NPU:
         # for ascend
-        cluster, pod = ascend_utils.get_cloud_cluster(
-            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
-            device_mode=device_mode,
-            start_port=start_port)
+        cluster, pod = ascend_utils.get_cloud_cluster(rank_table_file=os.getenv(
+            "RANK_TABLE_FILE", None),
+                                                      device_mode=device_mode,
+                                                      start_port=start_port)
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
         cluster, pod = get_cluster_from_args(args, device_mode,
@@ -383,13 +391,12 @@ def launch_collective(args):
     cluster, pod = get_cluster_info(args)
     global_envs = get_global_envs(args, tmp_dir)
 
-    procs = start_local_trainers(
-        cluster,
-        pod,
-        training_script=args.training_script,
-        training_script_args=args.training_script_args,
-        log_dir=args.log_dir,
-        envs=global_envs)
+    procs = start_local_trainers(cluster,
+                                 pod,
+                                 training_script=args.training_script,
+                                 training_script_args=args.training_script_args,
+                                 log_dir=args.log_dir,
+                                 envs=global_envs)
 
     for idx, proc in enumerate(procs):
         print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx))
@@ -492,16 +499,17 @@ def which_distributed_mode(args):
 
     if len(has_ps_args) > 0:
         logger.info(
-            "Run parameter-sever mode. pserver arguments:{}, accelerators count:{}".
-            format(has_ps_args, accelerators))
+            "Run parameter-sever mode. pserver arguments:{}, accelerators count:{}"
+            .format(has_ps_args, accelerators))
         has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args))
         if len(has_ps_heter_args) > 0:
             return DistributeMode.PS_HETER
         else:
             return DistributeMode.PS
     elif len(has_collective_args) > 0:
-        logger.info("Run collective mode. gpu arguments:{}, cuda count:{}".
-                    format(has_collective_args, accelerators))
+        logger.info(
+            "Run collective mode. gpu arguments:{}, cuda count:{}".format(
+                has_collective_args, accelerators))
         return DistributeMode.COLLECTIVE
     else:
         if not fluid.core.is_compiled_with_cuda(
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 2dec58c753853..e10709416f819 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -33,6 +33,7 @@
 import paddle.fluid as fluid
 from distutils.util import strtobool
 import paddle.utils.cpp_extension.extension_utils as utils
+
 logger = logging.getLogger("root")
 logger.propagate = False
 
@@ -61,6 +62,7 @@ class DeviceMode():
 
 
 class Cluster(object):
+
     def __init__(self, hdfs):
         self.job_server = None
         self.pods = []
@@ -130,6 +132,7 @@ def get_pod_by_id(self, pod_id):
 
 
 class JobServer(object):
+
     def __init__(self):
         self.endpoint = None
 
@@ -144,6 +147,7 @@ def __ne__(self, j):
 
 
 class Trainer(object):
+
     def __init__(self):
         self.accelerators = []
         self.endpoint = None
@@ -176,6 +180,7 @@ def rank(self):
 
 
 class Pod(object):
+
     def __init__(self):
         self.rank = None
         self.id = None
@@ -191,10 +196,10 @@ def __init__(self):
     def __str__(self):
         return "rank:{} id:{} addr:{} port:{} visible_accelerator:{} trainers:{} servers:{} \
             workers:{} heter_workers:{}".format(
-            self.rank, self.id, self.addr, self.port, self.accelerators, [
-                str(t) for t in self.trainers
-            ], [str(s) for s in self.servers], [str(w) for w in self.workers],
-            [str(h) for h in self.heter_workers])
+            self.rank, self.id, self.addr, self.port, self.accelerators,
+            [str(t) for t in self.trainers], [str(s) for s in self.servers],
+            [str(w)
+             for w in self.workers], [str(h) for h in self.heter_workers])
 
     def __eq__(self, pod):
         if self.rank != pod.rank or \
@@ -367,15 +372,15 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
         args = parser.parse_args()
     """
     type = strtobool if type == bool else type
-    argparser.add_argument(
-        "--" + argname,
-        default=default,
-        type=type,
-        help=help + ' Default: %(default)s.',
-        **kwargs)
+    argparser.add_argument("--" + argname,
+                           default=default,
+                           type=type,
+                           help=help + ' Default: %(default)s.',
+                           **kwargs)
 
 
 def find_free_ports(num):
+
     def __free_port():
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
             # Note(wangxi): Close the connection with a TCP RST instead
@@ -424,8 +429,8 @@ def pretty_print_envs(envs, header=None):
     for k, v in envs.items():
         max_k = max(max_k, len(k))
 
-    h_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * spacing,
-                                                          max_v)
+    h_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(
+        max_k, " " * spacing, max_v)
     l_format = "    " + "|{{:>{}s}}{{}}{{:^{}s}}|\n".format(max_k, max_v)
     length = max_k + max_v + spacing
 
@@ -457,6 +462,7 @@ def pretty_print_envs(envs, header=None):
 
 
 class TrainerProc(object):
+
     def __init__(self):
         self.proc = None
         self.log_fn = None
@@ -502,14 +508,20 @@ def start_local_trainers(cluster,
     procs = []
     for idx, t in enumerate(pod.trainers):
         proc_env = {
-            "PADDLE_TRAINER_ID": "%d" % t.rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
-            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
-            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
-            "PADDLE_RANK_IN_NODE": str(idx),
+            "PADDLE_TRAINER_ID":
+            "%d" % t.rank,
+            "PADDLE_CURRENT_ENDPOINT":
+            "%s" % t.endpoint,
+            "PADDLE_TRAINERS_NUM":
+            "%d" % cluster.trainers_nranks(),
+            "PADDLE_TRAINER_ENDPOINTS":
+            ",".join(cluster.trainers_endpoints()),
+            "PADDLE_RANK_IN_NODE":
+            str(idx),
             "PADDLE_LOCAL_DEVICE_IDS":
             ",".join([str(acc) for acc in t.accelerators]),
-            "PADDLE_WORLD_DEVICE_IDS": ",".join(res),
+            "PADDLE_WORLD_DEVICE_IDS":
+            ",".join(res),
         }
 
         # The following three environnement variables are used for auto mapping
@@ -527,8 +539,8 @@ def start_local_trainers(cluster,
             proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
 
-        elif len(t.
-                 accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU:
+        elif len(t.accelerators
+                 ) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU:
             proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
         elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.MLU:
@@ -558,8 +570,8 @@ def start_local_trainers(cluster,
             logger.info("Local start {} processes. First process distributed "
                         "environment info (Only For Debug): {}".format(
                             len(pod.trainers),
-                            pretty_print_envs(proc_env, ("Distributed Envs",
-                                                         "Value"))))
+                            pretty_print_envs(proc_env,
+                                              ("Distributed Envs", "Value"))))
             logger.info(
                 "details about PADDLE_TRAINER_ENDPOINTS can be found in "
                 "{}/endpoints.log, and detail running logs maybe found in "
@@ -578,8 +590,11 @@ def start_local_trainers(cluster,
                 fn = open("%s/prelaunchlog.%d" % (log_dir, idx), "a")
             else:
                 fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
-            proc = subprocess.Popen(
-                cmd, env=current_env, stdout=fn, stderr=fn, preexec_fn=pre_fn)
+            proc = subprocess.Popen(cmd,
+                                    env=current_env,
+                                    stdout=fn,
+                                    stderr=fn,
+                                    preexec_fn=pre_fn)
         else:
             proc = subprocess.Popen(cmd, env=current_env, preexec_fn=pre_fn)
 
@@ -638,14 +653,14 @@ def watch_local_trainers(procs, nranks):
         return
     except SystemExit:
         logger.error(
-            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
-            format(nranks, error_rank))
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log."
+            .format(nranks, error_rank))
         terminate_local_procs(procs)
         return
     except:
         logger.error(
-            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
-            format(nranks, error_rank))
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log."
+            .format(nranks, error_rank))
         terminate_local_procs(procs)
         return
 
@@ -941,8 +956,9 @@ def get_custom_endpoints(origin_endpoints, offset=0):
 #        pretty_print_envs(environs)))
 
 
-def get_mapped_cluster_without_rank_mapping(
-        node_ips, node_ip, trainer_endpoints, device_mode, node_ranks):
+def get_mapped_cluster_without_rank_mapping(node_ips, node_ip,
+                                            trainer_endpoints, device_mode,
+                                            node_ranks):
     assert type(trainer_endpoints) is list, "trainer_endpoints must be list"
     assert device_mode == DeviceMode.GPU, \
         "Only support get mapped cluster for gpu now."
@@ -1000,8 +1016,9 @@ def get_mapped_cluster_from_args_without_rank_mapping(args, device_mode):
         "ranks length should be equal to ips length."
 
     logger.debug("parsed from args: node_ips:{} node_ip:{} "
-                 "node_rank:{} node_ranks:{}".format(
-                     node_ips, node_ip, node_rank, node_ranks[node_rank]))
+                 "node_rank:{} node_ranks:{}".format(node_ips, node_ip,
+                                                     node_rank,
+                                                     node_ranks[node_rank]))
 
     # NOTE: there are different number of global mapped ranks on each node.
     free_ports = []
@@ -1011,23 +1028,22 @@ def get_mapped_cluster_from_args_without_rank_mapping(args, device_mode):
         if os.environ.get('PADDLE_PORT') is not None:
             start_port = int(os.getenv("PADDLE_PORT", ""))
             free_ports = [
-                x
-                for x in range(start_port, start_port + len(node_ranks[
-                    node_rank]))
+                x for x in range(start_port, start_port +
+                                 len(node_ranks[node_rank]))
             ]
         elif os.environ.get('FLAGS_START_PORT') is not None:
             start_port = int(os.environ.get('FLAGS_START_PORT'))
             free_ports = [
-                x
-                for x in range(start_port, start_port + len(node_ranks[
-                    node_rank]))
+                x for x in range(start_port, start_port +
+                                 len(node_ranks[node_rank]))
             ]
         else:
             free_ports = find_free_ports(len(node_ranks[node_rank]))
         trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
 
-    return get_mapped_cluster_without_rank_mapping(
-        node_ips, node_ip, trainer_endpoints, device_mode, node_ranks)
+    return get_mapped_cluster_without_rank_mapping(node_ips, node_ip,
+                                                   trainer_endpoints,
+                                                   device_mode, node_ranks)
 
 
 def get_mapped_cluster_with_rank_mapping(node_ips, node_ip, trainer_endpoints,
@@ -1066,8 +1082,8 @@ def get_relative_gpu_id(gpu_id):
                 ranks_per_node[i])]
             assert len(local_device_ids) == 1, \
                 "Only support one process to one device mapping"
-            trainer.accelerators.append(
-                get_relative_gpu_id(local_device_ids[0]))
+            trainer.accelerators.append(get_relative_gpu_id(
+                local_device_ids[0]))
             trainer.endpoint = "%s" % (cur_node_endpoints[i])
             trainer.rank = ranks_per_node[i]
             pod.trainers.append(trainer)
@@ -1121,8 +1137,9 @@ def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode):
         "ranks length should be equal to ips length."
 
     logger.debug("parsed from args: node_ips:{} node_ip:{} "
-                 "node_rank:{} node_ranks:{}".format(
-                     node_ips, node_ip, node_rank, node_ranks[node_rank]))
+                 "node_rank:{} node_ranks:{}".format(node_ips, node_ip,
+                                                     node_rank,
+                                                     node_ranks[node_rank]))
 
     # NOTE: there are different number of global mapped ranks on each node.
     free_ports = []
@@ -1132,16 +1149,14 @@ def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode):
         if os.environ.get('PADDLE_PORT') is not None:
             start_port = int(os.getenv("PADDLE_PORT", ""))
             free_ports = [
-                x
-                for x in range(start_port, start_port + len(node_ranks[
-                    node_rank]))
+                x for x in range(start_port, start_port +
+                                 len(node_ranks[node_rank]))
             ]
         elif os.environ.get('FLAGS_START_PORT') is not None:
             start_port = int(os.environ.get('FLAGS_START_PORT'))
             free_ports = [
-                x
-                for x in range(start_port, start_port + len(node_ranks[
-                    node_rank]))
+                x for x in range(start_port, start_port +
+                                 len(node_ranks[node_rank]))
             ]
         else:
             free_ports = find_free_ports(len(node_ranks[node_rank]))
@@ -1153,6 +1168,7 @@ def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode):
 
 
 class ParameterServerLauncher(object):
+
     def __init__(self, args, distribute_mode):
         self.args = args
         self.distribute_mode = distribute_mode
@@ -1234,8 +1250,9 @@ def get_role_endpoints(self, args):
                 # create endpoints str
                 worker_endpoints = []
                 for i in range(self.worker_num):
-                    worker_endpoints.append(":".join((worker_endpoints_ips[
-                        i], str(worker_endpoints_port[i]))))
+                    worker_endpoints.append(":".join(
+                        (worker_endpoints_ips[i],
+                         str(worker_endpoints_port[i]))))
                 self.worker_endpoints = ",".join(worker_endpoints)
             else:
                 self.worker_endpoints = args.workers
@@ -1287,13 +1304,14 @@ def get_role_endpoints(self, args):
                         if 1 in heter_worker_endpoints_len:
                             # if no port value in heter_worker_endpoint, will set default port values.
                             heter_worker_endpoints_port = get_ports(
-                                len(heter_worker_endpoints_ips), self.worker_num
-                                + self.server_num + self.heter_worker_num)
+                                len(heter_worker_endpoints_ips),
+                                self.worker_num + self.server_num +
+                                self.heter_worker_num)
                             new_heter_worker_endpoints = []
                             for j in range(len(heter_worker_endpoints_ips)):
-                                new_heter_worker_endpoints.append(":".join((
-                                    heter_worker_endpoints_ips[j], str(
-                                        heter_worker_endpoints_port[j]))))
+                                new_heter_worker_endpoints.append(":".join(
+                                    (heter_worker_endpoints_ips[j],
+                                     str(heter_worker_endpoints_port[j]))))
                             ip_port_list = ",".join(new_heter_worker_endpoints)
                         else:
                             ip_port_list = ",".join(heter_worker_endpoints)
@@ -1307,9 +1325,9 @@ def get_role_endpoints(self, args):
                 else:
                     for i in range(len(self.stage_heter_trainer_num)):
                         heter_trainer_num = self.stage_heter_trainer_num[i]
-                        ports = get_ports(heter_trainer_num,
-                                          self.server_num + self.worker_num +
-                                          self.heter_worker_num)
+                        ports = get_ports(
+                            heter_trainer_num, self.server_num +
+                            self.worker_num + self.heter_worker_num)
                         ip_port_list = ",".join(
                             ["127.0.0.1:" + str(x) for x in ports])
                         self.stage_heter_map[i + 2] = ip_port_list
@@ -1344,9 +1362,9 @@ def get_role_endpoints(self, args):
 
                         new_heter_worker_endpoints = []
                         for j in range(len(heter_worker_endpoints_ips)):
-                            new_heter_worker_endpoints.append(":".join((
-                                heter_worker_endpoints_ips[j], str(
-                                    heter_worker_endpoints_port[j]))))
+                            new_heter_worker_endpoints.append(":".join(
+                                (heter_worker_endpoints_ips[j],
+                                 str(heter_worker_endpoints_port[j]))))
                         ip_port_list = ",".join(new_heter_worker_endpoints)
                     else:
                         ip_port_list = ",".join(heter_worker_endpoints)
@@ -1480,8 +1498,8 @@ def start_ps(self):
             self.start_pod_heter_worker(self.args, pod)
 
         logger.info(
-            "Please check servers, workers and heter_worker logs in {}/workerlog.*, {}/serverlog.* and {}/heterlog.*".
-            format(self.args.log_dir, self.args.log_dir, self.args.log_dir))
+            "Please check servers, workers and heter_worker logs in {}/workerlog.*, {}/serverlog.* and {}/heterlog.*"
+            .format(self.args.log_dir, self.args.log_dir, self.args.log_dir))
 
         # 4. wait for finish training
         if len(self.procs["worker"]) > 0:
@@ -1536,8 +1554,7 @@ def start_pod_server(self, args, pod):
                     "TRAINING_ROLE": "PSERVER",
                     "PADDLE_TRAINERS_NUM": str(self.worker_num),
                     "POD_IP": cur_server.endpoint.split(":")[0],
-                    "PADDLE_WITH_GLOO":
-                    str(os.getenv("PADDLE_WITH_GLOO", "0")),
+                    "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                     "PADDLE_GLOO_RENDEZVOUS": "3",
                     "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                     "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port
@@ -1550,8 +1567,7 @@ def start_pod_server(self, args, pod):
                     "TRAINING_ROLE": "PSERVER",
                     "PADDLE_TRAINERS_NUM": str(self.worker_num),
                     "POD_IP": cur_server.endpoint.split(":")[0],
-                    "PADDLE_WITH_GLOO":
-                    str(os.getenv("PADDLE_WITH_GLOO", "0")),
+                    "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                     "PADDLE_GLOO_RENDEZVOUS": "3",
                     "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                     "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port
@@ -1567,15 +1583,17 @@ def start_pod_server(self, args, pod):
                     "Local server start {} processes. First process distributed "
                     "environment info (Only For Debug): {}".format(
                         len(pod.servers),
-                        pretty_print_envs(proc_env, ("Distributed Envs", "Value"
-                                                     ))))
+                        pretty_print_envs(proc_env,
+                                          ("Distributed Envs", "Value"))))
 
             if args.log_dir is not None:
                 os.system("mkdir -p {}".format(args.log_dir))
                 fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w")
                 self.log_fns["server"].append(fn)
-                proc = subprocess.Popen(
-                    cmd, env=current_env, stdout=fn, stderr=fn)
+                proc = subprocess.Popen(cmd,
+                                        env=current_env,
+                                        stdout=fn,
+                                        stderr=fn)
             else:
                 proc = subprocess.Popen(cmd, env=current_env)
 
@@ -1605,35 +1623,54 @@ def start_pod_worker(self, args, pod):
             device_list = [str(x) for x in range(0, heter_device_num)]
 
         for idx, cur_worker in enumerate(pod.workers):
-            device_id = "0" if heter_device_num == 0 else str(device_list[(
-                idx) % heter_device_num])
+            device_id = "0" if heter_device_num == 0 else str(
+                device_list[(idx) % heter_device_num])
             if self.distribute_mode == DistributeMode.PS_HETER:
                 proc_env = {
-                    "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints,
-                    "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints,
-                    "PADDLE_TRAINERS_NUM": str(self.worker_num),
-                    "PADDLE_STAGE_TRAINERS_NUM": str(self.stage_trainer_num),
-                    "STAGE_ID": "1",
-                    "STAGE_NUM": str(self.stage_num),
-                    "PADDLE_PREVIOUS_HETER_TRAINER_IP_PORT_LIST": "",
+                    "PADDLE_PSERVERS_IP_PORT_LIST":
+                    self.server_endpoints,
+                    "PADDLE_TRAINER_ENDPOINTS":
+                    self.worker_endpoints,
+                    "PADDLE_TRAINERS_NUM":
+                    str(self.worker_num),
+                    "PADDLE_STAGE_TRAINERS_NUM":
+                    str(self.stage_trainer_num),
+                    "STAGE_ID":
+                    "1",
+                    "STAGE_NUM":
+                    str(self.stage_num),
+                    "PADDLE_PREVIOUS_HETER_TRAINER_IP_PORT_LIST":
+                    "",
                     "PADDLE_NEXT_HETER_TRAINER_IP_PORT_LIST":
                     self.stage_heter_map[2],
                     "PADDLE_ALL_HETER_TRAINER_IP_PORT_LIST":
                     self.heter_worker_endpoints,
-                    "HETER_DEVICE_TYPE": self.stage_device_map[1],
-                    "TRAINING_ROLE": "TRAINER",
-                    "POD_IP": cur_worker.endpoint.split(":")[0],
-                    "PADDLE_PORT": cur_worker.endpoint.split(":")[1],
-                    "PADDLE_TRAINER_ID": str(cur_worker.rank),
+                    "HETER_DEVICE_TYPE":
+                    self.stage_device_map[1],
+                    "TRAINING_ROLE":
+                    "TRAINER",
+                    "POD_IP":
+                    cur_worker.endpoint.split(":")[0],
+                    "PADDLE_PORT":
+                    cur_worker.endpoint.split(":")[1],
+                    "PADDLE_TRAINER_ID":
+                    str(cur_worker.rank),
                     "PADDLE_WITH_GLOO":
                     str(os.getenv("PADDLE_WITH_GLOO", "0")),
-                    "PADDLE_GLOO_RENDEZVOUS": "3",
-                    "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
-                    "FLAGS_selected_gpus": "0",
-                    "FLAGS_selected_xpus": "0",
-                    "CUDA_VISIBLE_DEVICES": device_id,
-                    "XPU_VISIBLE_DEVICES": device_id,
-                    "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port
+                    "PADDLE_GLOO_RENDEZVOUS":
+                    "3",
+                    "PADDLE_GLOO_FS_PATH":
+                    self.gloo_rendezvous_dir,
+                    "FLAGS_selected_gpus":
+                    "0",
+                    "FLAGS_selected_xpus":
+                    "0",
+                    "CUDA_VISIBLE_DEVICES":
+                    device_id,
+                    "XPU_VISIBLE_DEVICES":
+                    device_id,
+                    "PADDLE_GLOO_HTTP_ENDPOINT":
+                    self.http_port
                 }
             else:
                 proc_env = {
@@ -1644,8 +1681,7 @@ def start_pod_worker(self, args, pod):
                     "POD_IP": cur_worker.endpoint.split(":")[0],
                     "PADDLE_PORT": cur_worker.endpoint.split(":")[1],
                     "PADDLE_TRAINER_ID": str(cur_worker.rank),
-                    "PADDLE_WITH_GLOO":
-                    str(os.getenv("PADDLE_WITH_GLOO", "0")),
+                    "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
                     "PADDLE_GLOO_RENDEZVOUS": "3",
                     "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
                     "FLAGS_selected_gpus": "0",
@@ -1665,15 +1701,17 @@ def start_pod_worker(self, args, pod):
                     "Local worker start {} processes. First process distributed "
                     "environment info (Only For Debug): {}".format(
                         len(pod.workers),
-                        pretty_print_envs(proc_env, ("Distributed Envs", "Value"
-                                                     ))))
+                        pretty_print_envs(proc_env,
+                                          ("Distributed Envs", "Value"))))
 
             if args.log_dir is not None:
                 os.system("mkdir -p {}".format(args.log_dir))
                 fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
                 self.log_fns["worker"].append(fn)
-                proc = subprocess.Popen(
-                    cmd, env=current_env, stdout=fn, stderr=fn)
+                proc = subprocess.Popen(cmd,
+                                        env=current_env,
+                                        stdout=fn,
+                                        stderr=fn)
             else:
                 proc = subprocess.Popen(cmd, env=current_env)
 
@@ -1703,12 +1741,14 @@ def start_pod_heter_worker(self, args, pod):
             device_list = [str(x) for x in range(0, heter_device_num)]
 
         for idx, cur_heter_worker in enumerate(pod.heter_workers):
-            device_id = "0" if heter_device_num == 0 else str(device_list[(
-                idx) % heter_device_num])
+            device_id = "0" if heter_device_num == 0 else str(
+                device_list[(idx) % heter_device_num])
             stage_id = cur_heter_worker.stage
             proc_env = {
-                "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints,
-                "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints,
+                "PADDLE_PSERVERS_IP_PORT_LIST":
+                self.server_endpoints,
+                "PADDLE_TRAINER_ENDPOINTS":
+                self.worker_endpoints,
                 "PADDLE_NEXT_HETER_TRAINER_IP_PORT_LIST":
                 self.stage_heter_map[stage_id + 1]
                 if stage_id <= self.stage_num - 1 else "",
@@ -1716,22 +1756,38 @@ def start_pod_heter_worker(self, args, pod):
                 self.stage_heter_map[stage_id - 1],
                 "PADDLE_ALL_HETER_TRAINER_IP_PORT_LIST":
                 self.heter_worker_endpoints,
-                "HETER_DEVICE_TYPE": self.stage_device_map[stage_id],
-                "STAGE_ID": str(stage_id),
-                "STAGE_NUM": str(self.stage_num),
-                "PADDLE_PORT": cur_heter_worker.endpoint.split(":")[1],
-                "TRAINING_ROLE": "HETER_TRAINER",
-                "PADDLE_TRAINERS_NUM": str(self.worker_num),
-                "PADDLE_STAGE_TRAINERS_NUM": str(self.stage_trainer_num),
-                "POD_IP": cur_heter_worker.endpoint.split(":")[0],
-                "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "0")),
-                "PADDLE_GLOO_RENDEZVOUS": "3",
-                "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir,
-                "FLAGS_selected_gpus": "0",
-                "FLAGS_selected_xpus": "0",
-                "CUDA_VISIBLE_DEVICES": device_id,
-                "XPU_VISIBLE_DEVICES": device_id,
-                "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port
+                "HETER_DEVICE_TYPE":
+                self.stage_device_map[stage_id],
+                "STAGE_ID":
+                str(stage_id),
+                "STAGE_NUM":
+                str(self.stage_num),
+                "PADDLE_PORT":
+                cur_heter_worker.endpoint.split(":")[1],
+                "TRAINING_ROLE":
+                "HETER_TRAINER",
+                "PADDLE_TRAINERS_NUM":
+                str(self.worker_num),
+                "PADDLE_STAGE_TRAINERS_NUM":
+                str(self.stage_trainer_num),
+                "POD_IP":
+                cur_heter_worker.endpoint.split(":")[0],
+                "PADDLE_WITH_GLOO":
+                str(os.getenv("PADDLE_WITH_GLOO", "0")),
+                "PADDLE_GLOO_RENDEZVOUS":
+                "3",
+                "PADDLE_GLOO_FS_PATH":
+                self.gloo_rendezvous_dir,
+                "FLAGS_selected_gpus":
+                "0",
+                "FLAGS_selected_xpus":
+                "0",
+                "CUDA_VISIBLE_DEVICES":
+                device_id,
+                "XPU_VISIBLE_DEVICES":
+                device_id,
+                "PADDLE_GLOO_HTTP_ENDPOINT":
+                self.http_port
             }
             current_env.update(proc_env)
 
@@ -1744,15 +1800,17 @@ def start_pod_heter_worker(self, args, pod):
                     "Local heter_worker start {} processes. First process distributed "
                     "environment info (Only For Debug): {}".format(
                         len(pod.heter_workers),
-                        pretty_print_envs(proc_env, ("Distributed Envs", "Value"
-                                                     ))))
+                        pretty_print_envs(proc_env,
+                                          ("Distributed Envs", "Value"))))
 
             if args.log_dir is not None:
                 os.system("mkdir -p {}".format(args.log_dir))
                 fn = open("%s/heterlog.%d" % (args.log_dir, idx), "w")
                 self.log_fns["heter_worker"].append(fn)
-                proc = subprocess.Popen(
-                    cmd, env=current_env, stdout=fn, stderr=fn)
+                proc = subprocess.Popen(cmd,
+                                        env=current_env,
+                                        stdout=fn,
+                                        stderr=fn)
             else:
                 proc = subprocess.Popen(cmd, env=current_env)
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index e3a781424e6d5..78a53ccdba55e 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -18,6 +18,7 @@
 
 
 class AMPOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(AMPOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -33,8 +34,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(AMPOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(AMPOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
     def _init_wrapped_opt(self):
         if self.wrapped_opt is not None:
@@ -103,8 +105,9 @@ def apply_gradients(self, params_grads):
         return self.wrapped_opt.apply_gradients(params_grads=params_grads)
 
     def apply_optimize(self, loss, startup_program, params_grads):
-        return self.wrapped_opt.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        return self.wrapped_opt.apply_optimize(loss,
+                                               startup_program=startup_program,
+                                               params_grads=params_grads)
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py
index b9a7651e44909..185a92b8d94d3 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
index 6282ac7b50983..96d83ff4d39f0 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -28,6 +28,7 @@
 
 
 class AscendIRParser(object):
+
     def __init__(self, auto_dp=False, world_rank_size=1):
         self.graph_idx = 0
         self.hcom_endpoints = {}
@@ -45,13 +46,13 @@ def _construct_input_map(self, input_varlist):
                 ret_map[var.name] = ge_input
                 ge_in_operator.append(ge_input)
             else:  # param, learning ...
-                ge_input = core.GEOperatorFactory.create_operator(var.name,
-                                                                  "Variable")
-                ge_input.update_output_desc("y",
-                                            core.GETensorDesc(
-                                                core.GEShape(var.shape),
-                                                core.GEFormat.FORMAT_ND,
-                                                core.GEDataType.DT_FLOAT))
+                ge_input = core.GEOperatorFactory.create_operator(
+                    var.name, "Variable")
+                ge_input.update_output_desc(
+                    "y",
+                    core.GETensorDesc(core.GEShape(var.shape),
+                                      core.GEFormat.FORMAT_ND,
+                                      core.GEDataType.DT_FLOAT))
                 ret_map[var.name] = ge_input
         return ge_in_operator, ret_map
 
@@ -70,7 +71,7 @@ def parse_op(self, op):
             nccl_id = op.output_arg_names[0]
 
             # c_gen_nccl_id operator splits endpoints into local endpoint and other_endpoints
-            # we should combine these together to produce world_rank_ids 
+            # we should combine these together to produce world_rank_ids
             self.hcom_endpoints[nccl_id] = other_endpoints[:]
             self.hcom_endpoints[nccl_id].insert(rank, endpoint)
 
@@ -79,8 +80,8 @@ def parse_op(self, op):
         elif op.type == 'c_comm_init':
             nccl_id = op.input_arg_names[0]
             nranks = op.attr("nranks")
-            assert nranks == len(self.hcom_endpoints[
-                nccl_id]), "nranks doesn't match endpoint count"
+            assert nranks == len(self.hcom_endpoints[nccl_id]
+                                 ), "nranks doesn't match endpoint count"
             rank = op.attr("rank")
             ring_id = op.attr("ring_id")
 
@@ -90,8 +91,9 @@ def parse_op(self, op):
                 for endpoint in self.hcom_endpoints[nccl_id]
             ]
             self.groups_to_create.append(
-                HcomGroupConfig(
-                    name=group_name, nranks=nranks, rank_ids=global_rank_ids))
+                HcomGroupConfig(name=group_name,
+                                nranks=nranks,
+                                rank_ids=global_rank_ids))
             print("append to create group: %s, with rank_ids: %s" %
                   (group_name, global_rank_ids))
         elif op.type in ascend_parser.registerd_op:
@@ -121,8 +123,8 @@ def _parse_program(self,
 
         ge_in_operator, self.var2geop = self._construct_input_map(input_varlist)
 
-        self.parser_factory = ascend_parser.AscendParserFactory(graph,
-                                                                self.var2geop)
+        self.parser_factory = ascend_parser.AscendParserFactory(
+            graph, self.var2geop)
         for i, curop in list(enumerate(block.ops)):
             self.parse_op(curop)
 
@@ -151,11 +153,10 @@ def _parse_program(self,
 
         input_varlist = [var for var in input_varlist if var.is_data]
 
-        block.append_op(
-            type="ascend_trigger",
-            inputs={"FeedList": input_varlist},
-            outputs={"FetchList": fetch_list},
-            attrs={'graph_idx': self.graph_idx})
+        block.append_op(type="ascend_trigger",
+                        inputs={"FeedList": input_varlist},
+                        outputs={"FetchList": fetch_list},
+                        attrs={'graph_idx': self.graph_idx})
         self.graph_idx += 1
         return graph
 
@@ -170,10 +171,10 @@ def parse_program(self, startup_program, main_program, input_varlist,
 
             from paddle.distributed import fleet
             self.groups_to_create.append(
-                HcomGroupConfig(
-                    name="hcom_group_0",
-                    nranks=fleet.world_size(),
-                    rank_ids=[x for x in range(fleet.world_size())]))
+                HcomGroupConfig(name="hcom_group_0",
+                                nranks=fleet.world_size(),
+                                rank_ids=[x
+                                          for x in range(fleet.world_size())]))
 
         return startup_graph, main_graph
 
@@ -181,6 +182,7 @@ def parse_program(self, startup_program, main_program, input_varlist,
 # AscendOptimizer is a wrapper for basic optimizer now
 # We will make it part of fleet meta_optimizer in the future
 class AscendOptimizer(Optimizer):
+
     def __init__(self, optimizer, fetch_list=[]):
         self.inner_opt = optimizer
         self.fetch_list = fetch_list
@@ -220,8 +222,8 @@ def minimize(self,
                  precision_mode="must_keep_origin_dtype"):
         minimized = None
         if self.inner_opt:
-            minimized = self.inner_opt.minimize(
-                loss, startup_program=startup_program)
+            minimized = self.inner_opt.minimize(loss,
+                                                startup_program=startup_program)
 
         self.ascend_instance = core.AscendInstance()
 
@@ -252,8 +254,8 @@ def minimize(self,
         self.ascend_instance.init_global_resources()
 
         main_block = loss.block
-        self.parser = AscendIRParser(
-            auto_dp=auto_dp, world_rank_size=fleet.world_size())
+        self.parser = AscendIRParser(auto_dp=auto_dp,
+                                     world_rank_size=fleet.world_size())
 
         input_varlist = self._get_input_varlist(main_block.program)
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py
index 3a52041dc7e2c..99c5100b70e1a 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py
@@ -20,94 +20,94 @@
 
 __all__ = []
 
-registerd_op = {## forwards
-                "elementwise_add": "AddParser",
-                "matmul": "MatMulParser",
-                "mul": "MulParser",
-                "relu": "ReluParser",
-                "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser",
-                "shape": "ShapeParser",
-                "fill_constant": "FillConstantParser",
-                "reduce_sum": "ReduceSumParser",
-                "elementwise_mul": "DotMulParser",
-                "elementwise_div": "DotDivParser",
-                "elementwise_pow": "DotPowParser",
-                "elementwise_max": "MaxParser",
-                "elementwise_min": "MinParser",
-                "elementwise_sub": "DotSubParser",
-                "pow": "PowParser",
-                "gelu": "GeluParser",
-                "sqrt": "SqrtParser",
-                "log": "LogParser",
-                "sum": "SumParser",
-                "logical_not": "LogicalNotParser",
-                "gather": "GatherParser",
-                "scatter": "ScatterParser",
-                "cast": "CastParser",
-                "tanh": "TanhParser",
-                "stack": "StackParser",
-                "square": "SquareParser",
-                "unsqueeze2": "UnSqueezeParser",
-                "assign": "AssignParser",
-                "softmax": "SoftMaxParser",
-                "reshape2": "ReshapeParser",
-                "transpose2": "TransposeParser",
-                "layer_norm": "LayerNormParser",
-                "less_than": "LessParser",
-                "mean": "MeanParser",
-                "scale": "ScaleParser",
-                "slice": "SliceParser",
-                "top_k": "TopkParser",
-                "accuracy": "AccuracyParser",
-                #"increment": "IncrementParser",
-                "lookup_table": "LookupTableParser",
-                "truncated_gaussian_random": "TruncatedNormalParser",
-                "c_allgather": "AllGatherParser",
-                "c_allreduce_sum": "AllReduceSumParser",
-                "c_allreduce_max": "AllReduceMaxParser",
-                "c_broadcast": "BroadcastParser",
-                "c_reduce_scatter": "ReduceScatterParser",
-                "c_send": "SendParser",
-                "c_receive": "ReceiveParser",
-                "uniform_random": "UniformRandomParser",
-                "range": "RangeParser",
-                "equal": "EqualParser",
-                "expand": "ExpandParser",
-                "squeeze2": "SqueezeParser",
-
-
-                ## backwords
-                "matmul_grad": "MatMulGradParser",
-                "mul_grad": "MulGradParser",
-                "relu_grad": "ReluGradParser",
-                "reduce_sum_grad": "ReduceSumGradParser",
-                "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser",
-                "tanh_grad":"TanhGradParser",
-                "log_grad":"LogGradParser",
-                "pow_grad": "PowGradParser",
-                "sqrt_grad": "SqrtGradParser",
-                "gelu_grad": "GeluGradParser",
-                "mean_grad": "MeanGradParser",
-                'lookup_table_grad': "LookUpTableGradParser",
-                "elementwise_mul_grad": "DotMulGradParser",
-                "elementwise_add_grad": "DotAddGradParser",
-                "elementwise_div_grad": "DotDivGradParser",
-                "softmax_grad": "SoftmaxGradParser",
-                "slice_grad": "SliceGradParser",
-                "reshape2_grad": "ReshapeGradParser",
-                "gather_grad": "GatherGradParser",
-                "transpose2_grad": "TransposeGradParser",
-                "layer_norm_grad": "LayerNormGradParser",
-
-                ## opt
-                "sgd": "SGDParser",
-                #"adam": "AdamParser",
-                }
+registerd_op = {  ## forwards
+    "elementwise_add": "AddParser",
+    "matmul": "MatMulParser",
+    "mul": "MulParser",
+    "relu": "ReluParser",
+    "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser",
+    "shape": "ShapeParser",
+    "fill_constant": "FillConstantParser",
+    "reduce_sum": "ReduceSumParser",
+    "elementwise_mul": "DotMulParser",
+    "elementwise_div": "DotDivParser",
+    "elementwise_pow": "DotPowParser",
+    "elementwise_max": "MaxParser",
+    "elementwise_min": "MinParser",
+    "elementwise_sub": "DotSubParser",
+    "pow": "PowParser",
+    "gelu": "GeluParser",
+    "sqrt": "SqrtParser",
+    "log": "LogParser",
+    "sum": "SumParser",
+    "logical_not": "LogicalNotParser",
+    "gather": "GatherParser",
+    "scatter": "ScatterParser",
+    "cast": "CastParser",
+    "tanh": "TanhParser",
+    "stack": "StackParser",
+    "square": "SquareParser",
+    "unsqueeze2": "UnSqueezeParser",
+    "assign": "AssignParser",
+    "softmax": "SoftMaxParser",
+    "reshape2": "ReshapeParser",
+    "transpose2": "TransposeParser",
+    "layer_norm": "LayerNormParser",
+    "less_than": "LessParser",
+    "mean": "MeanParser",
+    "scale": "ScaleParser",
+    "slice": "SliceParser",
+    "top_k": "TopkParser",
+    "accuracy": "AccuracyParser",
+    #"increment": "IncrementParser",
+    "lookup_table": "LookupTableParser",
+    "truncated_gaussian_random": "TruncatedNormalParser",
+    "c_allgather": "AllGatherParser",
+    "c_allreduce_sum": "AllReduceSumParser",
+    "c_allreduce_max": "AllReduceMaxParser",
+    "c_broadcast": "BroadcastParser",
+    "c_reduce_scatter": "ReduceScatterParser",
+    "c_send": "SendParser",
+    "c_receive": "ReceiveParser",
+    "uniform_random": "UniformRandomParser",
+    "range": "RangeParser",
+    "equal": "EqualParser",
+    "expand": "ExpandParser",
+    "squeeze2": "SqueezeParser",
+
+    ## backwards
+    "matmul_grad": "MatMulGradParser",
+    "mul_grad": "MulGradParser",
+    "relu_grad": "ReluGradParser",
+    "reduce_sum_grad": "ReduceSumGradParser",
+    "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser",
+    "tanh_grad": "TanhGradParser",
+    "log_grad": "LogGradParser",
+    "pow_grad": "PowGradParser",
+    "sqrt_grad": "SqrtGradParser",
+    "gelu_grad": "GeluGradParser",
+    "mean_grad": "MeanGradParser",
+    'lookup_table_grad': "LookUpTableGradParser",
+    "elementwise_mul_grad": "DotMulGradParser",
+    "elementwise_add_grad": "DotAddGradParser",
+    "elementwise_div_grad": "DotDivGradParser",
+    "softmax_grad": "SoftmaxGradParser",
+    "slice_grad": "SliceGradParser",
+    "reshape2_grad": "ReshapeGradParser",
+    "gather_grad": "GatherGradParser",
+    "transpose2_grad": "TransposeGradParser",
+    "layer_norm_grad": "LayerNormGradParser",
+
+    ## opt
+    "sgd": "SGDParser",
+    #"adam": "AdamParser",
+}
 global_cnt = -1
 global_input_cnt = -1
 
 
 class AscendHelper(object):
+
     def __init__(self):
         self.dtype2ge_map = {
             0: core.GEDataType.DT_BOOL,
@@ -141,6 +141,7 @@ def dtype2np(self, index):
 
 
 class AscendParserFactory(object):
+
     def __init__(self, graph, var2geop):
         self.graph = graph
         self.var2geop = var2geop
@@ -154,6 +155,7 @@ def create_parse(self, parser_class):
 
 
 class AscendParserBase(object):
+
     def __init__(self, graph, var2geop):
         self.graph = graph
         self.var2geop = var2geop
@@ -177,11 +179,11 @@ def update_output(self, geop_list, index_list):
                 assert len(arguments) == len(
                     index_list[output_id]
                 ), "Parser[%s]'s %dth argument number[%d] is not equal to paddle's number[%d]" % (
-                    self.parser_name, output_id, len(index_list[output_id]),
-                    len(arguments))
+                    self.parser_name, output_id, len(
+                        index_list[output_id]), len(arguments))
                 for i in range(len(arguments)):
-                    self.var2geop[arguments[i]] = geop_list[index_list[
-                        output_id][i]]
+                    self.var2geop[arguments[i]] = geop_list[
+                        index_list[output_id][i]]
 
         for geop in geop_list:
             self.graph.add_op(geop)
@@ -206,22 +208,22 @@ def _accumulated_op_id(self):
         return name
 
     def _create_ge_tensor(self, shape, dtype, value):
-        tensor_desc = core.GETensorDesc(
-            core.GEShape(shape), core.GEFormat.FORMAT_ND,
-            self.ascend_helper.dtype2ge(dtype))
+        tensor_desc = core.GETensorDesc(core.GEShape(shape),
+                                        core.GEFormat.FORMAT_ND,
+                                        self.ascend_helper.dtype2ge(dtype))
         tensor = core.GETensor(tensor_desc)
 
-        data = (value * np.ones((
-            shape))).reshape(shape).astype(self.ascend_helper.dtype2np(dtype))
+        data = (value * np.ones(
+            (shape))).reshape(shape).astype(self.ascend_helper.dtype2np(dtype))
         buf = data.tobytes()
         data_8 = np.frombuffer(buf, dtype=np.uint8)
         tensor.set_data(data_8)
         return tensor
 
     def _get_ge_tensor(self, shape, dtype, value_list):
-        tensor_desc = core.GETensorDesc(
-            core.GEShape(shape), core.GEFormat.FORMAT_ND,
-            self.ascend_helper.dtype2ge(dtype))
+        tensor_desc = core.GETensorDesc(core.GEShape(shape),
+                                        core.GEFormat.FORMAT_ND,
+                                        self.ascend_helper.dtype2ge(dtype))
         tensor = core.GETensor(tensor_desc)
 
         data = np.array(value_list).reshape(shape).astype(
@@ -244,20 +246,20 @@ def _get_variable(self, shape, dtype, tensor):
 
         var = core.GEOperatorFactory.create_operator(
             "variable" + self._accumulated_op_id(), "Variable")
-        var.update_output_desc("y",
-                               core.GETensorDesc(
-                                   core.GEShape(shape), core.GEFormat.FORMAT_ND,
-                                   type))
+        var.update_output_desc(
+            "y",
+            core.GETensorDesc(core.GEShape(shape), core.GEFormat.FORMAT_ND,
+                              type))
         assign = core.GEOperatorFactory.create_operator(
-            "assign" + self._accumulated_op_id(), "Assign").set_input(
-                "value", tensor).set_input("ref", var)
+            "assign" + self._accumulated_op_id(),
+            "Assign").set_input("value", tensor).set_input("ref", var)
 
         return assign
 
     def _create_shape_tensor(self):
-        tensor_desc = core.GETensorDesc(
-            core.GEShape([2]), core.GEFormat.FORMAT_ND,
-            core.GEDataType.DT_INT32)
+        tensor_desc = core.GETensorDesc(core.GEShape([2]),
+                                        core.GEFormat.FORMAT_ND,
+                                        core.GEDataType.DT_INT32)
         tensor = core.GETensor(tensor_desc)
 
         data = np.ones((2)).astype("int32").reshape([2])
@@ -269,14 +271,16 @@ def _create_shape_tensor(self):
 
     def _get_GEtensor_shape(self, tensor):
         tensor_shape = core.GEOperatorFactory.create_operator(
-            "shape" + self._accumulated_op_id(), "Shape").set_input("x", tensor)
+            "shape" + self._accumulated_op_id(),
+            "Shape").set_input("x", tensor)
         tensor_shape = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", tensor_shape).set_attr_int32("dst_type", 0)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", tensor_shape).set_attr_int32("dst_type", 0)
         return tensor_shape
 
 
 class AddParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(AddParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_add"
@@ -291,6 +295,7 @@ def _apply(self):
 
 
 class DotSubParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(DotSubParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_sub"
@@ -305,6 +310,7 @@ def _apply(self):
 
 
 class DotMulParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(DotMulParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_mul"
@@ -319,6 +325,7 @@ def _apply(self):
 
 
 class DotDivParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(DotDivParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_div"
@@ -333,6 +340,7 @@ def _apply(self):
 
 
 class DotPowParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(DotPowParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_pow"
@@ -347,6 +355,7 @@ def _apply(self):
 
 
 class LessParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LessParser, self).__init__(graph, var2geop)
         self.parser_name = "less_than"
@@ -361,6 +370,7 @@ def _apply(self):
 
 
 class MaxParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MaxParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_max"
@@ -375,6 +385,7 @@ def _apply(self):
 
 
 class MinParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MinParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_min"
@@ -390,6 +401,7 @@ def _apply(self):
 
 ## cal
 class LogParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LogParser, self).__init__(graph, var2geop)
         self.parser_name = "log"
@@ -402,6 +414,7 @@ def _apply(self):
 
 
 class SqrtParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SqrtParser, self).__init__(graph, var2geop)
         self.parser_name = "sqrt"
@@ -414,6 +427,7 @@ def _apply(self):
 
 
 class PowParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(PowParser, self).__init__(graph, var2geop)
         self.parser_name = "pow"
@@ -424,12 +438,14 @@ def _apply(self):
         pow_value = core.GEOperatorFactory.create_operator(
             "pow" + self._accumulated_op_id(),
             "Power").set_input("x", x).set_attr_float(
-                "power", factor).set_attr_float("scale", 1.0).set_attr_float(
-                    "shift", 0.0)
+                "power",
+                factor).set_attr_float("scale",
+                                       1.0).set_attr_float("shift", 0.0)
         return [pow_value], [[0]]
 
 
 class SquareParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SquareParser, self).__init__(graph, var2geop)
         self.parser_name = "square"
@@ -442,6 +458,7 @@ def _apply(self):
 
 
 class SumParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SumParser, self).__init__(graph, var2geop)
         self.parser_name = "sum"
@@ -464,6 +481,7 @@ def _apply(self):
 
 
 class LogicalNotParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LogicalNotParser, self).__init__(graph, var2geop)
         self.parser_name = "logical_not"
@@ -477,6 +495,7 @@ def _apply(self):
 
 
 class MeanParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MeanParser, self).__init__(graph, var2geop)
         self.parser_name = "mean"
@@ -484,13 +503,14 @@ def __init__(self, graph, var2geop):
     def _apply(self):
         x = self._get_ge_input(self.op.input_arg_names[0])
         mean = core.GEOperatorFactory.create_operator(
-            "mean" + self._accumulated_op_id(),
-            "ReduceMeanD").set_input("x", x).set_attr_bool(
-                "keep_dims", False).set_attr_vec_int32("axes", [])
+            "mean" + self._accumulated_op_id(), "ReduceMeanD").set_input(
+                "x", x).set_attr_bool("keep_dims",
+                                      False).set_attr_vec_int32("axes", [])
         return [mean], [[0]]
 
 
 class ReduceSumParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReduceSumParser, self).__init__(graph, var2geop)
         self.parser_name = "reduce_sum"
@@ -515,18 +535,19 @@ def _apply(self):
 #        super(IncrementParser, self).__init__(graph, var2geop)
 #        self.parser_name = "increment"
 #
-#    def _apply(self): 
+#    def _apply(self):
 #        x = self._get_ge_input(self.op.input_arg_names[0])
 #        step = self.op.attr("step") #self._get_ge_input(self.op.input_arg_names[1])
 #        print("step: ", step)
-#            
+#
 #        increment = core.GEOperatorFactory.create_operator("adds" + self._accumulated_op_id(), "Adds").set_input("x", x).set_attr_float("value", step) #set_input("x2", bias)
-#        
+#
 #        return [increment]
 
 
 ## matrix cal
 class MatMulParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MatMulParser, self).__init__(graph, var2geop)
         self.parser_name = "matmul"
@@ -550,14 +571,15 @@ def _apply(self):
             matmul = core.GEOperatorFactory.create_operator(
                 "matmul" + self._accumulated_op_id(),
                 "MatMul").set_input("x1", x).set_input("x2", y).set_attr_bool(
-                    "transpose_x1", transpose_x).set_attr_bool("transpose_x2",
-                                                               transpose_y)
+                    "transpose_x1",
+                    transpose_x).set_attr_bool("transpose_x2", transpose_y)
         else:
             assert False, "not support"
         return [matmul], [[0]]
 
 
 class MulParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MulParser, self).__init__(graph, var2geop)
         self.parser_name = "mul"
@@ -580,8 +602,9 @@ def _apply(self):
                     "flatten" + self._accumulated_op_id(),
                     "Flatten").set_input("x", x)
                 matmul = core.GEOperatorFactory.create_operator(
-                    "mul" + self._accumulated_op_id(), "MatMul").set_input(
-                        "x1", flatten_x1, 0).set_input("x2", y, 0)
+                    "mul" + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", flatten_x1,
+                                        0).set_input("x2", y, 0)
             else:
                 assert False, "not support"
         else:
@@ -592,26 +615,27 @@ def _apply(self):
                     "FlattenV2").set_input("x", x).set_attr_int32(
                         "axis", 0).set_attr_int32("end_axis", 1)
                 matmul_m = core.GEOperatorFactory.create_operator(
-                    "mul" + self._accumulated_op_id(), "MatMul").set_input(
-                        "x1", flatten_x1, 0).set_input("x2", y, 0)
+                    "mul" + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", flatten_x1,
+                                        0).set_input("x2", y, 0)
                 matmul_transpose = core.GEOperatorFactory.create_operator(
                     "transpose" + self._accumulated_op_id(),
-                    "TransposeD").set_input(
-                        "x", matmul_m).set_attr_vec_int32("perm", [1, 0])
+                    "TransposeD").set_input("x", matmul_m).set_attr_vec_int32(
+                        "perm", [1, 0])
                 tensor = self._create_ge_tensor(
                     [3], 2, [shape_x2[1], shape_x1[0], shape_x1[1]])
                 const_shape = core.GEOperatorFactory.create_operator(
                     "shape" + self._accumulated_op_id(),
                     "Const").set_attr_tensor("value", tensor)
                 reshape_matmul = core.GEOperatorFactory.create_operator(
-                    "reshape" + self._accumulated_op_id(), "Reshape").set_input(
-                        "x", matmul_transpose).set_input(
-                            "shape", const_shape).set_attr_int32("axis", 0)
+                    "reshape" + self._accumulated_op_id(),
+                    "Reshape").set_input("x", matmul_transpose).set_input(
+                        "shape", const_shape).set_attr_int32("axis", 0)
                 matmul = core.GEOperatorFactory.create_operator(
                     "transpose" + self._accumulated_op_id(),
-                    "TransposeD").set_input(
-                        "x",
-                        reshape_matmul).set_attr_vec_int32("perm", [1, 2, 0])
+                    "TransposeD").set_input("x",
+                                            reshape_matmul).set_attr_vec_int32(
+                                                "perm", [1, 2, 0])
             else:
                 assert False, "not support"
 
@@ -619,6 +643,7 @@ def _apply(self):
 
 
 class LayerNormParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LayerNormParser, self).__init__(graph, var2geop)
         self.parser_name = "layer_norm"
@@ -639,7 +664,8 @@ def _apply(self):
                                      scale).set_input("shape", shape_tensor)
         bias_expand = core.GEOperatorFactory.create_operator(
             "broadcast_to_d" + self._accumulated_op_id(),
-            "BroadcastTo").set_input("x", bias).set_input("shape", shape_tensor)
+            "BroadcastTo").set_input("x",
+                                     bias).set_input("shape", shape_tensor)
         layer_norm = core.GEOperatorFactory.create_operator(
             "layer_norm" + self._accumulated_op_id(),
             "LayerNorm").set_input("x", x).set_input(
@@ -652,19 +678,23 @@ def _apply(self):
         cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str(
             x_dtype)] == 0 else 1
         y = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", layer_norm, 0).set_attr_int32("dst_type", cast_dtype)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", layer_norm,
+                              0).set_attr_int32("dst_type", cast_dtype)
         mean = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", layer_norm, 1).set_attr_int32("dst_type", cast_dtype)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", layer_norm,
+                              1).set_attr_int32("dst_type", cast_dtype)
         variance = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", layer_norm, 2).set_attr_int32("dst_type", cast_dtype)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", layer_norm,
+                              2).set_attr_int32("dst_type", cast_dtype)
         return [y, mean, variance], [[1], [2], [0]]
 
 
 ## activate function
 class ReluParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReluParser, self).__init__(graph, var2geop)
         self.parser_name = "relu"
@@ -677,6 +707,7 @@ def _apply(self):
 
 
 class GeluParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(GeluParser, self).__init__(graph, var2geop)
         self.parser_name = "gelu"
@@ -689,6 +720,7 @@ def _apply(self):
 
 
 class TanhParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(TanhParser, self).__init__(graph, var2geop)
         self.parser_name = "tanh"
@@ -702,6 +734,7 @@ def _apply(self):
 
 ## loss function
 class SoftmaxWithCrossEntropyParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SoftmaxWithCrossEntropyParser, self).__init__(graph, var2geop)
         self.parser_name = "softmax_with_cross_entropy"
@@ -715,8 +748,8 @@ def _apply(self):
             "softmax" + self._accumulated_op_id(),
             "SoftmaxV2").set_input("x", logits)
         label = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", label).set_attr_int32("dst_type", 3)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", label).set_attr_int32("dst_type", 3)
 
         tensoron = self._create_ge_tensor([1], 5, 1)
         on = core.GEOperatorFactory.create_operator(
@@ -729,19 +762,23 @@ def _apply(self):
         self._mark_as_input(on)
         self._mark_as_input(off)
         onehot = core.GEOperatorFactory.create_operator(
-            "onehot" + self._accumulated_op_id(), "OneHotD").set_input(
-                "x", label).set_input("on_value", on).set_input(
-                    "off_value", off).set_attr_int32("depth", cls_num)
+            "onehot" + self._accumulated_op_id(),
+            "OneHotD").set_input("x",
+                                 label).set_input("on_value", on).set_input(
+                                     "off_value",
+                                     off).set_attr_int32("depth", cls_num)
         squeeze = core.GEOperatorFactory.create_operator(
-            "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot)
+            "mul" + self._accumulated_op_id(),
+            "Squeeze").set_input("x", onehot)
 
         loss_all = core.GEOperatorFactory.create_operator(
             "loss" + self._accumulated_op_id(),
-            "SoftmaxCrossEntropyWithLogits").set_input(
-                "features", logits).set_input("labels", squeeze)
+            "SoftmaxCrossEntropyWithLogits").set_input("features",
+                                                       logits).set_input(
+                                                           "labels", squeeze)
         loss = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", loss_all, 0).set_attr_int32("dst_type", 0)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", loss_all, 0).set_attr_int32("dst_type", 0)
         loss_expand = core.GEOperatorFactory.create_operator(
             "unsqueeze" + self._accumulated_op_id(),
             "Unsqueeze").set_input("x", loss).set_attr_vec_int32("axes", [1])
@@ -749,6 +786,7 @@ def _apply(self):
 
 
 class SoftMaxParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SoftMaxParser, self).__init__(graph, var2geop)
         self.parser_name = "softmax"
@@ -758,13 +796,15 @@ def _apply(self):
         axes = self.op.attr("axis")
 
         softmax = core.GEOperatorFactory.create_operator(
-            "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input(
-                "x", logits).set_attr_vec_int32("axes", [axes])
+            "softmax" + self._accumulated_op_id(),
+            "SoftmaxV2").set_input("x",
+                                   logits).set_attr_vec_int32("axes", [axes])
         return [softmax], [[0]]
 
 
-## general 
+## general
 class ShapeParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ShapeParser, self).__init__(graph, var2geop)
         self.parser_name = "shape"
@@ -777,6 +817,7 @@ def _apply(self):
 
 
 class FillConstantParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(FillConstantParser, self).__init__(graph, var2geop)
         self.parser_name = "fill_constant"
@@ -796,19 +837,19 @@ def _apply(self):
             #      (self.op.output('Out')[0]))
             var = core.GEOperatorFactory.create_operator(
                 self.op.output('Out')[0], "Variable")
-            var.update_output_desc("y",
-                                   core.GETensorDesc(
-                                       core.GEShape(shape),
-                                       core.GEFormat.FORMAT_ND,
-                                       core.GEDataType.DT_FLOAT))
+            var.update_output_desc(
+                "y",
+                core.GETensorDesc(core.GEShape(shape), core.GEFormat.FORMAT_ND,
+                                  core.GEDataType.DT_FLOAT))
             assign = core.GEOperatorFactory.create_operator(
-                "assign" + self._accumulated_op_id(), "Assign").set_input(
-                    "value", const).set_input("ref", var)
+                "assign" + self._accumulated_op_id(),
+                "Assign").set_input("value", const).set_input("ref", var)
             return [const], [[0]]
         return [const], [[0]]
 
 
 class TruncatedNormalParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(TruncatedNormalParser, self).__init__(graph, var2geop)
         self.parser_name = "truncated_gaussian_random"
@@ -850,11 +891,11 @@ def _apply(self):
         truncated_normal = core.GEOperatorFactory.create_operator(
             "truncated_normal" + self._accumulated_op_id(),
             "ParameterizedTruncatedNormal").set_input(
-                "shape", shape_tensor).set_input(
-                    "means", mean_tensor).set_input(
-                        "stdevs", std_tensor).set_input(
-                            "min", min_tensor).set_input(
-                                "max", max_tensor).set_attr_int32("seed", 0)
+                "shape",
+                shape_tensor).set_input("means", mean_tensor).set_input(
+                    "stdevs",
+                    std_tensor).set_input("min", min_tensor).set_input(
+                        "max", max_tensor).set_attr_int32("seed", 0)
 
         ## wirte the output of truncatedNormal from startup_program to main_program
         if self.op.block.var(self.op.output('Out')[0]).persistable:
@@ -862,14 +903,14 @@ def _apply(self):
             #      (self.op.output('Out')[0]))
             var = core.GEOperatorFactory.create_operator(
                 self.op.output('Out')[0], "Variable")
-            var.update_output_desc("y",
-                                   core.GETensorDesc(
-                                       core.GEShape(shape),
-                                       core.GEFormat.FORMAT_ND,
-                                       core.GEDataType.DT_FLOAT))
+            var.update_output_desc(
+                "y",
+                core.GETensorDesc(core.GEShape(shape), core.GEFormat.FORMAT_ND,
+                                  core.GEDataType.DT_FLOAT))
             assign = core.GEOperatorFactory.create_operator(
-                "assign" + self._accumulated_op_id(), "Assign").set_input(
-                    "value", truncated_normal).set_input("ref", var)
+                "assign" + self._accumulated_op_id(),
+                "Assign").set_input("value",
+                                    truncated_normal).set_input("ref", var)
             return [
                 shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor,
                 truncated_normal
@@ -882,6 +923,7 @@ def _apply(self):
 
 
 class GatherParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(GatherParser, self).__init__(graph, var2geop)
         self.parser_name = "gather"
@@ -892,13 +934,15 @@ def _apply(self):
         clo = self.op.block.var(self.op.input_arg_names[1]).shape[-1]
 
         gather = core.GEOperatorFactory.create_operator(
-            "gather" + self._accumulated_op_id(), "Gather").set_input(
-                "x", x).set_input("indices", index).set_attr_bool(
-                    "validate_indices", True)
+            "gather" + self._accumulated_op_id(),
+            "Gather").set_input("x", x).set_input("indices",
+                                                  index).set_attr_bool(
+                                                      "validate_indices", True)
         return [gather], [[0]]
 
 
 class ScatterParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ScatterParser, self).__init__(graph, var2geop)
         self.parser_name = "scatter"
@@ -912,24 +956,24 @@ def _apply(self):
 
         if len(index_shape) == 1:
             index = core.GEOperatorFactory.create_operator(
-                "unsqueeze" + self.getid(), "Unsqueeze").set_input(
-                    "x", index).set_attr_vec_int32("axes", [1])
+                "unsqueeze" + self.getid(),
+                "Unsqueeze").set_input("x",
+                                       index).set_attr_vec_int32("axes", [1])
         if not overwrite:
             scatter_value = core.GEOperatorFactory.create_operator(
                 "scatter" + self._accumulated_op_id(),
-                "TensorScatterAdd").set_input(
-                    "x", x).set_input("indices", index).set_input("updates",
-                                                                  updates)
+                "TensorScatterAdd").set_input("x", x).set_input(
+                    "indices", index).set_input("updates", updates)
         else:
             scatter_value = core.GEOperatorFactory.create_operator(
                 "scatter" + self._accumulated_op_id(),
-                "TensorScatterUpdate").set_input(
-                    "x", x).set_input("indices", index).set_input("updates",
-                                                                  updates)
+                "TensorScatterUpdate").set_input("x", x).set_input(
+                    "indices", index).set_input("updates", updates)
         return [x, index, updates, scatter_value], [[-1]]
 
 
 class CastParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(CastParser, self).__init__(graph, var2geop)
         self.parser_name = "cast"
@@ -938,12 +982,13 @@ def _apply(self):
         x = self._get_ge_input(self.op.input_arg_names[0])
         dtype = self.op.attr("out_dtype")
         cast = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", x).set_attr_int32("dst_type", dtype)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", x).set_attr_int32("dst_type", dtype)
         return [cast], [[0]]
 
 
 class AssignParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(AssignParser, self).__init__(graph, var2geop)
         self.parser_name = "assign"
@@ -952,12 +997,13 @@ def _apply(self):
         const = self._get_ge_input(self.op.input_arg_names[0])
         var = self._get_ge_input(self.op.input_arg_names[1])
         assign = core.GEOperatorFactory.create_operator(
-            "assign" + self._accumulated_op_id(), "Assign").set_input(
-                "value", const).set_input("ref", var)
+            "assign" + self._accumulated_op_id(),
+            "Assign").set_input("value", const).set_input("ref", var)
         return [assign], [[0]]
 
 
 class ScaleParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ScaleParser, self).__init__(graph, var2geop)
         self.parser_name = "scale"
@@ -970,22 +1016,26 @@ def _apply(self):
 
         if bias_after_scale:
             scale_value = core.GEOperatorFactory.create_operator(
-                "scale" + self._accumulated_op_id(), "Power").set_input(
-                    "x", x).set_attr_float("power", 1.0).set_attr_float(
-                        "scale", scale).set_attr_float("shift", bias)
+                "scale" + self._accumulated_op_id(),
+                "Power").set_input("x", x).set_attr_float(
+                    "power",
+                    1.0).set_attr_float("scale",
+                                        scale).set_attr_float("shift", bias)
         else:
             x_add_bias = core.GEOperatorFactory.create_operator(
-                "adds" + self._accumulated_op_id(), "Adds").set_input(
-                    "x", x).set_attr_float("value", bias)
+                "adds" + self._accumulated_op_id(),
+                "Adds").set_input("x", x).set_attr_float("value", bias)
             scale_value = core.GEOperatorFactory.create_operator(
-                "scale" + self._accumulated_op_id(), "Power").set_input(
-                    "x",
-                    x_add_bias).set_attr_float("power", 1.0).set_attr_float(
-                        "scale", scale).set_attr_float("shift", 0.0)
+                "scale" + self._accumulated_op_id(),
+                "Power").set_input("x", x_add_bias).set_attr_float(
+                    "power",
+                    1.0).set_attr_float("scale",
+                                        scale).set_attr_float("shift", 0.0)
         return [scale_value], [[0]]
 
 
 class SliceParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SliceParser, self).__init__(graph, var2geop)
         self.parser_name = "slice"
@@ -1014,14 +1064,15 @@ def _apply(self):
         assert len(axes_cor) == len(starts_cor) == len(
             ends_cor), "the three fields must have same size"
         slice_value = core.GEOperatorFactory.create_operator(
-            "slice" + self._accumulated_op_id(), "SliceD").set_input(
-                "x", x).set_attr_vec_int32(
-                    "offsets", starts_cor).set_attr_vec_int32("size", size)
+            "slice" + self._accumulated_op_id(),
+            "SliceD").set_input("x", x).set_attr_vec_int32(
+                "offsets", starts_cor).set_attr_vec_int32("size", size)
 
         return [slice_value], [[0]]
 
 
 class ReshapeParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReshapeParser, self).__init__(graph, var2geop)
         self.parser_name = "reshape2"
@@ -1047,9 +1098,10 @@ def _apply(self):
             "shape" + self._accumulated_op_id(),
             "Const").set_attr_tensor("value", tensor)
         reshape = core.GEOperatorFactory.create_operator(
-            "reshape" + self._accumulated_op_id(), "Reshape").set_input(
-                "x",
-                x).set_input("shape", const_shape).set_attr_int32("axis", 0)
+            "reshape" + self._accumulated_op_id(),
+            "Reshape").set_input("x", x).set_input("shape",
+                                                   const_shape).set_attr_int32(
+                                                       "axis", 0)
         x_shape = core.GEOperatorFactory.create_operator(
             "shape" + self._accumulated_op_id(), "Shape").set_input("x", x)
 
@@ -1057,6 +1109,7 @@ def _apply(self):
 
 
 class TransposeParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(TransposeParser, self).__init__(graph, var2geop)
         self.parser_name = "transpose2"
@@ -1065,8 +1118,8 @@ def _apply(self):
         x = self._get_ge_input(self.op.input_arg_names[0])
         perm = self.op.attr("axis")
         transpose = core.GEOperatorFactory.create_operator(
-            "transpose" + self._accumulated_op_id(), "TransposeD").set_input(
-                "x", x).set_attr_vec_int32("perm", perm)
+            "transpose" + self._accumulated_op_id(),
+            "TransposeD").set_input("x", x).set_attr_vec_int32("perm", perm)
         x_shape = core.GEOperatorFactory.create_operator(
             "shape" + self._accumulated_op_id(), "Shape").set_input("x", x)
 
@@ -1074,6 +1127,7 @@ def _apply(self):
 
 
 class AccuracyParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(AccuracyParser, self).__init__(graph, var2geop)
         self.parser_name = "accuracy"
@@ -1084,40 +1138,41 @@ def _apply(self):
         logits = self._get_ge_input(self.op.input_arg_names[2])
 
         pred = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", pred).set_attr_int32("dst_type", 3)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", pred).set_attr_int32("dst_type", 3)
         label = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", label).set_attr_int32("dst_type", 3)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", label).set_attr_int32("dst_type", 3)
         equal = core.GEOperatorFactory.create_operator(
-            "equal" + self._accumulated_op_id(), "Equal").set_input(
-                "x1", pred).set_input("x2", label)
+            "equal" + self._accumulated_op_id(),
+            "Equal").set_input("x1", pred).set_input("x2", label)
         cast = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", equal).set_attr_int32("dst_type", 0)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", equal).set_attr_int32("dst_type", 0)
         acc = core.GEOperatorFactory.create_operator(
             "mean" + self._accumulated_op_id(), "ReduceMeanD").set_input(
-                "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32(
-                    "axes", [])
+                "x", cast).set_attr_bool("keep_dims",
+                                         False).set_attr_vec_int32("axes", [])
         correct = core.GEOperatorFactory.create_operator(
             "sum" + self._accumulated_op_id(), "ReduceSumD").set_input(
-                "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32(
-                    "axes", [])
+                "x", cast).set_attr_bool("keep_dims",
+                                         False).set_attr_vec_int32("axes", [])
         ones_tensor = core.GEOperatorFactory.create_operator(
             "oneslike" + self._accumulated_op_id(),
             "OnesLike").set_input("x", label)
         ones_tensor = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", ones_tensor).set_attr_int32("dst_type", 0)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", ones_tensor).set_attr_int32("dst_type", 0)
         total = core.GEOperatorFactory.create_operator(
-            "sum" + self._accumulated_op_id(), "ReduceSumD").set_input(
-                "x", ones_tensor).set_attr_bool(
-                    "keep_dims", False).set_attr_vec_int32("axes", [])
+            "sum" + self._accumulated_op_id(),
+            "ReduceSumD").set_input("x", ones_tensor).set_attr_bool(
+                "keep_dims", False).set_attr_vec_int32("axes", [])
 
         return [acc, correct, total], [[0], [1], [2]]
 
 
 class TopkParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(TopkParser, self).__init__(graph, var2geop)
         self.parser_name = "top_k"
@@ -1137,15 +1192,16 @@ def _apply(self):
             "topk" + self._accumulated_op_id(),
             "TopK").set_input("x", cast_x).set_input("k", const_k)
         value = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", topk, 0).set_attr_int32("dst_type", 0)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", topk, 0).set_attr_int32("dst_type", 0)
         index = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", topk, 1).set_attr_int32("dst_type", 0)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", topk, 1).set_attr_int32("dst_type", 0)
         return [value, index], [[1], [0]]
 
 
 class LookupTableParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LookupTableParser, self).__init__(graph, var2geop)
         self.parser_name = "lookup_table"
@@ -1155,15 +1211,16 @@ def _apply(self):
         w = self._get_ge_input(self.op.input_arg_names[1])
 
         ids_squeeze = core.GEOperatorFactory.create_operator(
-            "squeeze" + self._accumulated_op_id(), "Squeeze").set_input(
-                "x", ids).set_attr_vec_int32("axes", [-1])
+            "squeeze" + self._accumulated_op_id(),
+            "Squeeze").set_input("x", ids).set_attr_vec_int32("axes", [-1])
         out = core.GEOperatorFactory.create_operator(
-            "lookup" + self._accumulated_op_id(), "Gather").set_input(
-                "x", w).set_input("indices", ids_squeeze)
+            "lookup" + self._accumulated_op_id(),
+            "Gather").set_input("x", w).set_input("indices", ids_squeeze)
         return [out], [[0]]
 
 
 class StackParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(StackParser, self).__init__(graph, var2geop)
         self.parser_name = "stack"
@@ -1172,8 +1229,8 @@ def _apply(self):
         tiles = len(self.op.input_arg_names)
         data_x_lst = []
         for index in range(tiles):
-            data_x_lst.append(
-                self._get_ge_input(self.op.input_arg_names[index]))
+            data_x_lst.append(self._get_ge_input(
+                self.op.input_arg_names[index]))
         axis = self.op.attr("axis")
 
         data_x = data_x_lst[0]
@@ -1186,14 +1243,16 @@ def _apply(self):
             "ExpandDims").set_input("x", data_x).set_input("axis", tensor_axis)
 
         stack = core.GEOperatorFactory.create_operator(
-            "stack" + self._accumulated_op_id(),
-            "TileWithAxis").set_input("x", expand).set_attr_int32(
-                "axis", axis).set_attr_int32("tiles", tiles)
+            "stack" + self._accumulated_op_id(), "TileWithAxis").set_input(
+                "x",
+                expand).set_attr_int32("axis",
+                                       axis).set_attr_int32("tiles", tiles)
 
         return [stack], [[0]]
 
 
 class UnSqueezeParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(UnSqueezeParser, self).__init__(graph, var2geop)
         self.parser_name = "unsqueeze2"
@@ -1206,12 +1265,14 @@ def _apply(self):
             "unsqueeze" + self._accumulated_op_id(),
             "Unsqueeze").set_input("x", x).set_attr_vec_int32("axes", axes)
         shape = core.GEOperatorFactory.create_operator(
-            "shape" + self._accumulated_op_id(), "Shape").set_input("x", output)
+            "shape" + self._accumulated_op_id(),
+            "Shape").set_input("x", output)
         return [shape, output], [[1], [0]]
 
 
 ## parallel
 class AllGatherParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(AllGatherParser, self).__init__(graph, var2geop)
         self.parser_name = "c_allgather"
@@ -1222,13 +1283,14 @@ def _apply(self):
         group = self.op.attr("group")
 
         allgather = core.GEOperatorFactory.create_operator(
-            "allgather" + self._accumulated_op_id(), "HcomAllGather").set_input(
-                "x", x).set_attr_int32(
-                    "rank_size", rank_size).set_attr_string("group", group)
+            "allgather" + self._accumulated_op_id(),
+            "HcomAllGather").set_input("x", x).set_attr_int32(
+                "rank_size", rank_size).set_attr_string("group", group)
         return [allgather], [[0]]
 
 
 class AllReduceParser(AscendParserBase):
+
     def __init__(self, graph, var2geop, reduction):
         super(AllReduceParser, self).__init__(graph, var2geop)
         self.parser_name = "c_allreduce_" + reduction
@@ -1243,9 +1305,9 @@ def _apply(self):
         fusion_id = None  #self.op.attr("fusion_id")
 
         allreduce = core.GEOperatorFactory.create_operator(
-            "allreduce" + self._accumulated_op_id(), "HcomAllReduce").set_input(
-                "x", x).set_attr_string(
-                    "reduction", reduction).set_attr_string("group", group)
+            "allreduce" + self._accumulated_op_id(),
+            "HcomAllReduce").set_input("x", x).set_attr_string(
+                "reduction", reduction).set_attr_string("group", group)
         if fusion is not None:
             allreduce.set_attr_int32("fusion", fusion)
 
@@ -1255,16 +1317,19 @@ def _apply(self):
 
 
 class AllReduceSumParser(AllReduceParser):
+
     def __init__(self, graph, var2geop):
         super(AllReduceSumParser, self).__init__(graph, var2geop, 'sum')
 
 
 class AllReduceMaxParser(AllReduceParser):
+
     def __init__(self, graph, var2geop):
         super(AllReduceMaxParser, self).__init__(graph, var2geop, 'max')
 
 
 class BroadcastParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(BroadcastParser, self).__init__(graph, var2geop)
         self.parser_name = "c_broadcast"
@@ -1275,13 +1340,14 @@ def _apply(self):
         group = self.op.attr("group")
 
         broadcast = core.GEOperatorFactory.create_operator(
-            "broadcast" + self._accumulated_op_id(), "HcomBroadcast").set_input(
-                "x", x).set_attr_int32(
-                    "root_rank", root_rank).set_attr_string("group", group)
+            "broadcast" + self._accumulated_op_id(),
+            "HcomBroadcast").set_input("x", x).set_attr_int32(
+                "root_rank", root_rank).set_attr_string("group", group)
         return [broadcast], [[0]]
 
 
 class ReduceScatterParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReduceScatterParser, self).__init__(graph, var2geop)
         self.parser_name = "c_reduce_scatter"
@@ -1295,12 +1361,14 @@ def _apply(self):
         reduce_scatter = core.GEOperatorFactory.create_operator(
             "reducescatter" + self._accumulated_op_id(),
             "HcomReduceScatter").set_input("x", x).set_attr_string(
-                "reduction", reduction).set_attr_string(
-                    "group", group).set_attr_int32("rank_size", rank_size)
+                "reduction",
+                reduction).set_attr_string("group", group).set_attr_int32(
+                    "rank_size", rank_size)
         return [reduce_scatter], [[0]]
 
 
 class SendParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SendParser, self).__init__(graph, var2geop)
         self.parser_name = "c_send"
@@ -1319,6 +1387,7 @@ def _apply(self):
 
 
 class ReceiveParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReceiveParser, self).__init__(graph, var2geop)
         self.parser_name = "c_receive"
@@ -1332,15 +1401,18 @@ def _apply(self):
         dtype = self.op.attr("dtype")
 
         receive = core.GEOperatorFactory.create_operator(
-            "receive" + self._accumulated_op_id(), "HcomReceive").set_input(
-                "x", x).set_attr_int32("sr_tag", sr_tag).set_attr_int32(
-                    "src_rank", src_rank).set_attr_string(
-                        "group", group).set_attr_vec_int32(
-                            "shape", shape).set_attr_int32("dtype", dtype)
+            "receive" + self._accumulated_op_id(),
+            "HcomReceive").set_input("x", x).set_attr_int32(
+                "sr_tag",
+                sr_tag).set_attr_int32("src_rank", src_rank).set_attr_string(
+                    "group", group).set_attr_vec_int32("shape",
+                                                       shape).set_attr_int32(
+                                                           "dtype", dtype)
         return [receive], [[0]]
 
 
 class RangeParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(RangeParser, self).__init__(graph, var2geop)
         self.parser_name = "range"
@@ -1361,6 +1433,7 @@ def _apply(self):
 
 
 class UniformRandomParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(UniformRandomParser, self).__init__(graph, var2geop)
         self.parser_name = "uniform_random"
@@ -1390,14 +1463,17 @@ def _apply(self):
         scale = max_v - min_v
 
         scale_value = core.GEOperatorFactory.create_operator(
-            "scale" + self._accumulated_op_id(), "Power").set_input(
-                "x", ge_ur).set_attr_float("power", 1.0).set_attr_float(
-                    "scale", scale).set_attr_float("shift", min_v)
+            "scale" + self._accumulated_op_id(),
+            "Power").set_input("x", ge_ur).set_attr_float(
+                "power",
+                1.0).set_attr_float("scale",
+                                    scale).set_attr_float("shift", min_v)
 
         return [scale_value], [[0]]
 
 
 class EqualParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(EqualParser, self).__init__(graph, var2geop)
         self.parser_name = "equal"
@@ -1413,6 +1489,7 @@ def _apply(self):
 
 
 class ExpandParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ExpandParser, self).__init__(graph, var2geop)
         self.parser_name = "expand"
@@ -1434,6 +1511,7 @@ def _apply(self):
 
 
 class SqueezeParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SqueezeParser, self).__init__(graph, var2geop)
         self.parser_name = "squeeze2"
@@ -1461,6 +1539,7 @@ def _apply(self):
 #****************************************************************#
 ## grad
 class ReduceSumGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReduceSumGradParser, self).__init__(graph, var2geop)
         self.parser_name = "reduce_sum_grad"
@@ -1487,6 +1566,7 @@ def _apply(self):
 
 
 class MatMulGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MatMulGradParser, self).__init__(graph, var2geop)
         self.parser_name = "matmul_grad"
@@ -1507,56 +1587,60 @@ def _apply(self):
                 x_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "BatchMatMul").set_input("x1", out_grad).set_input(
-                        "x2", y).set_attr_bool(
-                            "adj_x1", False).set_attr_bool("adj_x2", False)
+                        "x2",
+                        y).set_attr_bool("adj_x1",
+                                         False).set_attr_bool("adj_x2", False)
                 y_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "BatchMatMul").set_input("x1", out_grad).set_input(
-                        "x2", x).set_attr_bool(
-                            "adj_x1", True).set_attr_bool("adj_x2", False)
+                        "x2",
+                        x).set_attr_bool("adj_x1",
+                                         True).set_attr_bool("adj_x2", False)
             else:
                 x_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "BatchMatMul").set_input("x1", out_grad).set_input(
-                        "x2", y).set_attr_bool(
-                            "adj_x1", False).set_attr_bool("adj_x2", True)
+                        "x2",
+                        y).set_attr_bool("adj_x1",
+                                         False).set_attr_bool("adj_x2", True)
                 y_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
-                    "BatchMatMul").set_input("x1", x).set_input(
-                        "x2", out_grad).set_attr_bool(
+                    "BatchMatMul").set_input(
+                        "x1", x).set_input("x2", out_grad).set_attr_bool(
                             "adj_x1", True).set_attr_bool("adj_x2", False)
         else:
             if transpose_y:
                 x_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "MatMul").set_input("x1", out_grad).set_input(
-                        "x2", y).set_attr_bool(
-                            "transpose_x1", False).set_attr_bool("transpose_x2",
-                                                                 False)
+                        "x2", y).set_attr_bool("transpose_x1",
+                                               False).set_attr_bool(
+                                                   "transpose_x2", False)
                 y_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "MatMul").set_input("x1", out_grad).set_input(
-                        "x2", x).set_attr_bool(
-                            "transpose_x1", True).set_attr_bool("transpose_x2",
-                                                                False)
+                        "x2", x).set_attr_bool("transpose_x1",
+                                               True).set_attr_bool(
+                                                   "transpose_x2", False)
             else:
                 x_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "MatMul").set_input("x1", out_grad).set_input(
-                        "x2", y).set_attr_bool(
-                            "transpose_x1", False).set_attr_bool("transpose_x2",
-                                                                 True)
+                        "x2", y).set_attr_bool("transpose_x1",
+                                               False).set_attr_bool(
+                                                   "transpose_x2", True)
                 y_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "MatMul").set_input("x1", x).set_input(
-                        "x2", out_grad).set_attr_bool(
-                            "transpose_x1", True).set_attr_bool("transpose_x2",
-                                                                False)
+                        "x2", out_grad).set_attr_bool("transpose_x1",
+                                                      True).set_attr_bool(
+                                                          "transpose_x2", False)
 
         return [x_grad, y_grad], [[0], [1]]
 
 
 class MulGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MulGradParser, self).__init__(graph, var2geop)
         self.parser_name = "mul_grad"
@@ -1577,25 +1661,25 @@ def _apply(self):
                 x_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "MatMul").set_input("x1", out_grad).set_input(
-                        "x2", y).set_attr_bool(
-                            "transpose_x1", False).set_attr_bool("transpose_x2",
-                                                                 True)
+                        "x2", y).set_attr_bool("transpose_x1",
+                                               False).set_attr_bool(
+                                                   "transpose_x2", True)
                 y_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "MatMul").set_input("x1", x).set_input(
-                        "x2", out_grad).set_attr_bool(
-                            "transpose_x1", True).set_attr_bool("transpose_x2",
-                                                                False)
+                        "x2", out_grad).set_attr_bool("transpose_x1",
+                                                      True).set_attr_bool(
+                                                          "transpose_x2", False)
             elif len(shape_x) == 3 and len(shape_y) == 2:
                 flatten_x = core.GEOperatorFactory.create_operator(
                     "flatten" + self._accumulated_op_id(),
                     "Flatten").set_input("x", x)
                 x_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
-                    "MatMul").set_input(
-                        "x1", out_grad).set_input("x2", y).set_attr_bool(
-                            "transpose_x1",
-                            False).set_attr_bool("transpose_x2", True)
+                    "MatMul").set_input("x1", out_grad).set_input(
+                        "x2", y).set_attr_bool("transpose_x1",
+                                               False).set_attr_bool(
+                                                   "transpose_x2", True)
                 if len(shape_out_grad) == 2:
                     x_grad = core.GEOperatorFactory.create_operator(
                         "unsqueeze" + self._accumulated_op_id(),
@@ -1604,11 +1688,10 @@ def _apply(self):
 
                 y_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
-                    "MatMul").set_input(
-                        "x1",
-                        flatten_x).set_input("x2", out_grad).set_attr_bool(
-                            "transpose_x1",
-                            True).set_attr_bool("transpose_x2", False)
+                    "MatMul").set_input("x1", flatten_x).set_input(
+                        "x2", out_grad).set_attr_bool("transpose_x1",
+                                                      True).set_attr_bool(
+                                                          "transpose_x2", False)
         else:
             if len(shape_x) == 3 and len(shape_y) == 2:
                 assert x_num_col_dims == 2, "only support 2"
@@ -1632,8 +1715,9 @@ def _apply(self):
                 x_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "BatchMatMul").set_input("x1", out_grad).set_input(
-                        "x2", y_stack).set_attr_bool(
-                            "adj_x1", False).set_attr_bool("adj_x2", True)
+                        "x2", y_stack).set_attr_bool("adj_x1",
+                                                     False).set_attr_bool(
+                                                         "adj_x2", True)
                 y_grad = core.GEOperatorFactory.create_operator(
                     self.parser_name + self._accumulated_op_id(),
                     "MatMul").set_input("x1", flatten_x).set_input(
@@ -1645,6 +1729,7 @@ def _apply(self):
 
 
 class ReluGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReluGradParser, self).__init__(graph, var2geop)
         self.parser_name = "relu_grad"
@@ -1653,12 +1738,14 @@ def _apply(self):
         out = self._get_ge_input(self.op.input_arg_names[0])
         out_grad = self._get_ge_input(self.op.input_arg_names[1])
         relu_grad = core.GEOperatorFactory.create_operator(
-            self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input(
-                "gradients", out_grad).set_input("features", out)
+            self.parser_name + self._accumulated_op_id(),
+            "ReluGrad").set_input("gradients",
+                                  out_grad).set_input("features", out)
         return [relu_grad], [[0]]
 
 
 class SoftmaxWithCrossEntropyGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop)
         self.parser_name = "softmax_with_cross_entropy_grad"
@@ -1685,18 +1772,20 @@ def _apply(self):
         self._mark_as_input(off)
 
         label = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", label).set_attr_int32("dst_type", 3)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", label).set_attr_int32("dst_type", 3)
         onehot = core.GEOperatorFactory.create_operator(
-            "onehot" + self._accumulated_op_id(), "OneHotD").set_input(
-                "x", label).set_input("on_value", on).set_input(
-                    "off_value", off).set_attr_int32("depth", cls_num)
+            "onehot" + self._accumulated_op_id(),
+            "OneHotD").set_input("x",
+                                 label).set_input("on_value", on).set_input(
+                                     "off_value",
+                                     off).set_attr_int32("depth", cls_num)
         squeeze = core.GEOperatorFactory.create_operator(
             "suqeeze" + self._accumulated_op_id(),
             "Squeeze").set_input("x", onehot)
         sub = core.GEOperatorFactory.create_operator(
-            "sub" + self._accumulated_op_id(), "Sub").set_input(
-                "x1", softmax).set_input("x2", squeeze)
+            "sub" + self._accumulated_op_id(),
+            "Sub").set_input("x1", softmax).set_input("x2", squeeze)
         grad = core.GEOperatorFactory.create_operator(
             "mul" + self._accumulated_op_id(),
             "Mul").set_input("x1", loss_grad).set_input("x2", sub)
@@ -1705,6 +1794,7 @@ def _apply(self):
 
 
 class DotMulGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(DotMulGradParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_mul_grad"
@@ -1725,6 +1815,7 @@ def _apply(self):
 
 
 class DotAddGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(DotAddGradParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_add_grad"
@@ -1769,6 +1860,7 @@ def _apply(self):
 
 
 class DotDivGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(DotDivGradParser, self).__init__(graph, var2geop)
         self.parser_name = "elementwise_div_grad"
@@ -1780,39 +1872,40 @@ def _apply(self):
         y = self._get_ge_input(self.op.input_arg_names[3])
 
         y_power = core.GEOperatorFactory.create_operator(
-            "power" + self._accumulated_op_id(), "Power").set_input(
-                "x", y).set_attr_float("power", -1)
+            "power" + self._accumulated_op_id(),
+            "Power").set_input("x", y).set_attr_float("power", -1)
 
         tensor_zeros = core.GEOperatorFactory.create_operator(
             "zeroslike" + self._accumulated_op_id(),
             "ZerosLike").set_input("x", x)
         x_zero = core.GEOperatorFactory.create_operator(
-            "equal" + self._accumulated_op_id(), "Equal").set_input(
-                "x1", x).set_input("x2", tensor_zeros)
+            "equal" + self._accumulated_op_id(),
+            "Equal").set_input("x1", x).set_input("x2", tensor_zeros)
         x_nozero = core.GEOperatorFactory.create_operator(
             "logical_not" + self._accumulated_op_id(),
             "LogicalNot").set_input("x", x_zero)
         x_nozero_f = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", x_nozero).set_attr_int32("dst_type", 0)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", x_nozero).set_attr_int32("dst_type", 0)
         x_grad_w = core.GEOperatorFactory.create_operator(
-            "mul" + self._accumulated_op_id(), "Mul").set_input(
-                "x1", x_nozero_f).set_input("x2", y_power)
+            "mul" + self._accumulated_op_id(),
+            "Mul").set_input("x1", x_nozero_f).set_input("x2", y_power)
         x_grad = core.GEOperatorFactory.create_operator(
             self.parser_name + self._accumulated_op_id(),
             "Mul").set_input("x1", x_grad_w).set_input("x2", out_grad)
 
         y_grad_w = core.GEOperatorFactory.create_operator(
-            "mul" + self._accumulated_op_id(), "Mul").set_input(
-                "x1", out).set_input("x2", y_power)
+            "mul" + self._accumulated_op_id(),
+            "Mul").set_input("x1", out).set_input("x2", y_power)
         y_grad = core.GEOperatorFactory.create_operator(
-            "mul" + self._accumulated_op_id(), "Mul").set_input(
-                "x1", y_grad_w).set_input("x2", out_grad)
+            "mul" + self._accumulated_op_id(),
+            "Mul").set_input("x1", y_grad_w).set_input("x2", out_grad)
 
         return [x_grad, y_grad], [[0], [1]]
 
 
 class SoftmaxGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SoftmaxGradParser, self).__init__(graph, var2geop)
         self.parser_name = "softmax_grad"
@@ -1823,12 +1916,13 @@ def _apply(self):
 
         x_grad = core.GEOperatorFactory.create_operator(
             self.parser_name + self._accumulated_op_id(),
-            "SoftmaxGrad").set_input("softmax", out).set_input("grad_softmax",
-                                                               out_grad)
+            "SoftmaxGrad").set_input("softmax",
+                                     out).set_input("grad_softmax", out_grad)
         return [x_grad], [[0]]
 
 
 class ReshapeGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(ReshapeGradParser, self).__init__(graph, var2geop)
         self.parser_name = "reshape2_grad"
@@ -1846,13 +1940,14 @@ def _apply(self):
             "shape" + self._accumulated_op_id(),
             "Const").set_attr_tensor("value", tensor)
         x_grad = core.GEOperatorFactory.create_operator(
-            "reshape" + self._accumulated_op_id(), "Reshape").set_input(
-                "x", out_grad).set_input("shape", const_shape)
+            "reshape" + self._accumulated_op_id(),
+            "Reshape").set_input("x", out_grad).set_input("shape", const_shape)
 
         return [x_grad], [[0]]
 
 
 class GatherGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(GatherGradParser, self).__init__(graph, var2geop)
         self.parser_name = "gather_grad"
@@ -1868,8 +1963,9 @@ def _apply(self):
 
         if len(index_shape) == 1:
             index = core.GEOperatorFactory.create_operator(
-                "unsqueeze" + self._accumulated_op_id(), "Unsqueeze").set_input(
-                    "x", index).set_attr_vec_int32("axes", [1])
+                "unsqueeze" + self._accumulated_op_id(),
+                "Unsqueeze").set_input("x",
+                                       index).set_attr_vec_int32("axes", [1])
 
         tensor_zeros = core.GEOperatorFactory.create_operator(
             "zeroslike" + self._accumulated_op_id(),
@@ -1883,6 +1979,7 @@ def _apply(self):
 
 
 class TransposeGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(TransposeGradParser, self).__init__(graph, var2geop)
         self.parser_name = "transpose2_grad"
@@ -1897,13 +1994,15 @@ def _apply(self):
         assert list(map(lambda x: out_grad_shape[x], perm)) == list(x_shape)
 
         x_grad = core.GEOperatorFactory.create_operator(
-            "transpose" + self._accumulated_op_id(), "TransposeD").set_input(
-                "x", out_grad).set_attr_vec_int32("perm", perm)
+            "transpose" + self._accumulated_op_id(),
+            "TransposeD").set_input("x",
+                                    out_grad).set_attr_vec_int32("perm", perm)
 
         return [x_grad], [[0]]
 
 
 class LayerNormGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LayerNormGradParser, self).__init__(graph, var2geop)
         self.parser_name = "layer_norm_grad"
@@ -1920,25 +2019,30 @@ def _apply(self):
         x_grad = core.GEOperatorFactory.create_operator(
             self.parser_name + self._accumulated_op_id(),
             "LayerNormGrad").set_input("dy", out_grad).set_input(
-                "x", x).set_input("variance", variance).set_input(
-                    "mean", mean).set_input("gamma", scale)
+                "x", x).set_input("variance",
+                                  variance).set_input("mean", mean).set_input(
+                                      "gamma", scale)
 
         cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str(
             x_dtype)] == 0 else 1
         out_x_grad = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", x_grad, 0).set_attr_int32("dst_type", cast_dtype)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", x_grad,
+                              0).set_attr_int32("dst_type", cast_dtype)
         out_scale_grad = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", x_grad, 1).set_attr_int32("dst_type", cast_dtype)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", x_grad,
+                              1).set_attr_int32("dst_type", cast_dtype)
         out_bias_grad = core.GEOperatorFactory.create_operator(
-            "cast" + self._accumulated_op_id(), "Cast").set_input(
-                "x", x_grad, 2).set_attr_int32("dst_type", cast_dtype)
+            "cast" + self._accumulated_op_id(),
+            "Cast").set_input("x", x_grad,
+                              2).set_attr_int32("dst_type", cast_dtype)
 
         return [out_x_grad, out_scale_grad, out_bias_grad], [[2], [1], [0]]
 
 
 class TanhGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(TanhGradParser, self).__init__(graph, var2geop)
         self.parser_name = 'tanh_grad'
@@ -1954,6 +2058,7 @@ def _apply(self):
 
 
 class LogGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LogGradParser, self).__init__(graph, var2geop)
         self.parser_name = 'log_grad'
@@ -1968,6 +2073,7 @@ def _apply(self):
 
 
 class SqrtGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SqrtGradParser, self).__init__(graph, var2geop)
         self.parser_name = "sqrt_grad"
@@ -1982,6 +2088,7 @@ def _apply(self):
 
 
 class PowGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(PowGradParser, self).__init__(graph, var2geop)
         self.parser_name = "pow_grad"
@@ -2000,15 +2107,15 @@ def _apply(self):
             "Const").set_attr_tensor("value", factor_scale)
         factor_tensor = core.GEOperatorFactory.create_operator(
             "broadcast_to_d" + self._accumulated_op_id(),
-            "BroadcastTo").set_input(
-                "x", factor_scale).set_input("shape", shape_tensor)
+            "BroadcastTo").set_input("x", factor_scale).set_input(
+                "shape", shape_tensor)
 
         x_power = core.GEOperatorFactory.create_operator(
-            "x_power" + self._accumulated_op_id(), "Power").set_input(
-                "x", x).set_attr_float("power", factor - 1)
+            "x_power" + self._accumulated_op_id(),
+            "Power").set_input("x", x).set_attr_float("power", factor - 1)
         x_power_mul_factor = core.GEOperatorFactory.create_operator(
-            "x_power_mul_factor" + self._accumulated_op_id(), "Mul").set_input(
-                "x1", x).set_input("x2", factor_tensor)
+            "x_power_mul_factor" + self._accumulated_op_id(),
+            "Mul").set_input("x1", x).set_input("x2", factor_tensor)
         x_power_mul_factor_grad = core.GEOperatorFactory.create_operator(
             "x_power_mul_factor_grad" + self._accumulated_op_id(),
             "Mul").set_input("x1", x_power_mul_factor).set_input("x2", grad)
@@ -2017,6 +2124,7 @@ def _apply(self):
 
 
 class GeluGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(GeluGradParser, self).__init__(graph, var2geop)
         self.parser_name = "gelu_grad"
@@ -2028,13 +2136,15 @@ def _apply(self):
         y = core.GEOperatorFactory.create_operator(
             "gelu" + self._accumulated_op_id(), "Gelu").set_input("x", x)
         gelu_grad = core.GEOperatorFactory.create_operator(
-            "gelu_grad" + self._accumulated_op_id(), "GeluGrad").set_input(
-                "x", x).set_input("dy", grad).set_input("y", y)
+            "gelu_grad" + self._accumulated_op_id(),
+            "GeluGrad").set_input("x", x).set_input("dy",
+                                                    grad).set_input("y", y)
 
         return [gelu_grad], [[0]]
 
 
 class MeanGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(MeanGradParser, self).__init__(graph, var2geop)
         self.parser_name = "mean_grad"
@@ -2047,12 +2157,12 @@ def _apply(self):
             "one_tensor" + self._accumulated_op_id(),
             "OnesLike").set_input("x", x)
         sum = core.GEOperatorFactory.create_operator(
-            "mean" + self._accumulated_op_id(), "ReduceSumD").set_input(
-                "x", ones_tensor).set_attr_bool(
-                    "keep_dims", False).set_attr_vec_int32("axes", [])
+            "mean" + self._accumulated_op_id(),
+            "ReduceSumD").set_input("x", ones_tensor).set_attr_bool(
+                "keep_dims", False).set_attr_vec_int32("axes", [])
         mean = core.GEOperatorFactory.create_operator(
-            "x_power" + self._accumulated_op_id(), "Power").set_input(
-                "x", sum).set_attr_float("power", -1)
+            "x_power" + self._accumulated_op_id(),
+            "Power").set_input("x", sum).set_attr_float("power", -1)
 
         mean_grad = core.GEOperatorFactory.create_operator(
             "mean_grad" + self._accumulated_op_id(),
@@ -2062,6 +2172,7 @@ def _apply(self):
 
 
 class SliceGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SliceGradParser, self).__init__(graph, var2geop)
         self.parser_name = "slice_grad"
@@ -2100,6 +2211,7 @@ def _apply(self):
 
 
 class LookUpTableGradParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(LookUpTableGradParser, self).__init__(graph, var2geop)
         self.parser_name = "lookup_table_grad"
@@ -2115,26 +2227,26 @@ def _apply(self):
 
         ids_flatten = core.GEOperatorFactory.create_operator(
             "flatten" + self._accumulated_op_id(), "FlattenV2").set_input(
-                "x",
-                ids).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1)
+                "x", ids).set_attr_int32("axis",
+                                         0).set_attr_int32("end_axis", 1)
         grad_flatten = core.GEOperatorFactory.create_operator(
             "flatten" + self._accumulated_op_id(), "FlattenV2").set_input(
-                "x",
-                grad).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1)
+                "x", grad).set_attr_int32("axis",
+                                          0).set_attr_int32("end_axis", 1)
 
         tensor_zeros = core.GEOperatorFactory.create_operator(
             "zeroslike" + self._accumulated_op_id(),
             "ZerosLike").set_input("x", embedding)
         embedding_grad = core.GEOperatorFactory.create_operator(
             "scatteradd" + self._accumulated_op_id(),
-            "TensorScatterAdd").set_input(
-                "x", tensor_zeros).set_input("indices", ids_flatten).set_input(
-                    "updates", grad_flatten)
+            "TensorScatterAdd").set_input("x", tensor_zeros).set_input(
+                "indices", ids_flatten).set_input("updates", grad_flatten)
 
         return [embedding_grad], [[0]]
 
 
 class SGDParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(SGDParser, self).__init__(graph, var2geop)
         self.parser_name = "sgd"
@@ -2151,6 +2263,7 @@ def _apply(self):
 
 
 class AdamParser(AscendParserBase):
+
     def __init__(self, graph, var2geop):
         super(AdamParser, self).__init__(graph, var2geop)
         self.parser_name = "adam"
@@ -2168,23 +2281,26 @@ def _apply(self):
         epsilon = self.op.attr('epsilon')
 
         beta1 = core.GEOperatorFactory.create_operator(
-            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
-                "value", self._create_ge_tensor([1], 5, beta1))
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value",
+                                     self._create_ge_tensor([1], 5, beta1))
         beta2 = core.GEOperatorFactory.create_operator(
-            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
-                "value", self._create_ge_tensor([1], 5, beta2))
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value",
+                                     self._create_ge_tensor([1], 5, beta2))
         epsilon = core.GEOperatorFactory.create_operator(
-            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
-                "value", self._create_ge_tensor([1], 5, epsilon))
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value",
+                                     self._create_ge_tensor([1], 5, epsilon))
 
         adam = core.GEOperatorFactory.create_operator(
             "adam" + self._accumulated_op_id(),
             "ApplyAdam").set_input("var", param).set_input(
                 "m", moment1).set_input("v", moment2).set_input(
                     "beta1_power", beta1_power).set_input(
-                        "beta2_power", beta2_power).set_input(
-                            "lr", lr).set_input("beta1", beta1).set_input(
-                                "beta2", beta2).set_input(
-                                    "epsilon", epsilon).set_input("grad", grad)
+                        "beta2_power",
+                        beta2_power).set_input("lr", lr).set_input(
+                            "beta1", beta1).set_input("beta2", beta2).set_input(
+                                "epsilon", epsilon).set_input("grad", grad)
 
         return [adam], [[0]]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py
index ea9cb1c62bfec..2047c3172c260 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py
@@ -19,6 +19,7 @@
 
 
 class ASPOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(ASPOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -32,8 +33,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(ASPOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(ASPOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
     def _can_apply(self):
         if not self.role_maker._is_collective:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index a44607d13aafc..4c0cc90102587 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -50,6 +50,7 @@ def is_optimizer_op(op):
 
 
 class CollectiveHelper(object):
+
     def __init__(self, role_maker, nrings=1, wait_port=True):
         self.nrings = nrings
         self.wait_port = wait_port
@@ -63,9 +64,10 @@ def update_startup_program(self, startup_program=None):
         endpoints = self.role_maker._get_trainer_endpoints()
         current_endpoint = endpoints[self.role_maker._worker_index()]
         for ring_id in range(self.nrings):
-            self._init_communicator(
-                self.startup_program, current_endpoint, endpoints,
-                self.role_maker._worker_index(), ring_id, self.wait_port)
+            self._init_communicator(self.startup_program,
+                                    current_endpoint, endpoints,
+                                    self.role_maker._worker_index(), ring_id,
+                                    self.wait_port)
         self._broadcast_params()
 
     def _init_communicator(self,
@@ -88,36 +90,32 @@ def _init_communicator(self,
             wait_server_ready(other_endpoints)
 
         def _add_sync_by_allreduce(block):
-            sync_var = block.create_var(
-                name=unique_name.generate('sync_var'),
-                dtype=core.VarDesc.VarType.INT32,
-                persistable=False,
-                stop_gradient=True)
-            block.append_op(
-                type='fill_constant',
-                inputs={},
-                outputs={'Out': [sync_var]},
-                attrs={
-                    'shape': [1],
-                    'dtype': sync_var.dtype,
-                    'value': 1,
-                    'force_cpu': False,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-            block.append_op(
-                type='c_allreduce_sum',
-                inputs={'X': [sync_var]},
-                outputs={'Out': [sync_var]},
-                attrs={
-                    'ring_id': global_ring_id,
-                    'use_calc_stream': True,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-            block.append_op(
-                type='c_sync_calc_stream',
-                inputs={'X': sync_var},
-                outputs={'Out': sync_var},
-                attrs={OP_ROLE_KEY: OpRole.Forward})
+            sync_var = block.create_var(name=unique_name.generate('sync_var'),
+                                        dtype=core.VarDesc.VarType.INT32,
+                                        persistable=False,
+                                        stop_gradient=True)
+            block.append_op(type='fill_constant',
+                            inputs={},
+                            outputs={'Out': [sync_var]},
+                            attrs={
+                                'shape': [1],
+                                'dtype': sync_var.dtype,
+                                'value': 1,
+                                'force_cpu': False,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
+            block.append_op(type='c_allreduce_sum',
+                            inputs={'X': [sync_var]},
+                            outputs={'Out': [sync_var]},
+                            attrs={
+                                'ring_id': global_ring_id,
+                                'use_calc_stream': True,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
+            block.append_op(type='c_sync_calc_stream',
+                            inputs={'X': sync_var},
+                            outputs={'Out': sync_var},
+                            attrs={OP_ROLE_KEY: OpRole.Forward})
 
         block = program.global_block()
         if current_endpoint is None:
@@ -126,77 +124,71 @@ def _add_sync_by_allreduce(block):
             _add_sync_by_allreduce(block)
             return
 
-        comm_id_var = block.create_var(
-            name=unique_name.generate('comm_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
+        comm_id_var = block.create_var(name=unique_name.generate('comm_id'),
+                                       persistable=True,
+                                       type=core.VarDesc.VarType.RAW)
         if core.is_compiled_with_cuda():
-            block.append_op(
-                type='c_gen_nccl_id',
-                inputs={},
-                outputs={'Out': comm_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    'ring_id': ring_id,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-            block.append_op(
-                type='c_comm_init',
-                inputs={'X': comm_id_var},
-                outputs={},
-                attrs={
-                    'nranks': nranks,
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            block.append_op(type='c_gen_nccl_id',
+                            inputs={},
+                            outputs={'Out': comm_id_var},
+                            attrs={
+                                'rank': rank,
+                                'endpoint': current_endpoint,
+                                'other_endpoints': other_endpoints,
+                                'ring_id': ring_id,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
+            block.append_op(type='c_comm_init',
+                            inputs={'X': comm_id_var},
+                            outputs={},
+                            attrs={
+                                'nranks': nranks,
+                                'rank': rank,
+                                'ring_id': ring_id,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
         elif core.is_compiled_with_xpu():
-            block.append_op(
-                type='c_gen_bkcl_id',
-                inputs={},
-                outputs={'Out': comm_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    'ring_id': ring_id,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-            block.append_op(
-                type='c_comm_init',
-                inputs={'X': comm_id_var},
-                outputs={},
-                attrs={
-                    'nranks': nranks,
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            block.append_op(type='c_gen_bkcl_id',
+                            inputs={},
+                            outputs={'Out': comm_id_var},
+                            attrs={
+                                'rank': rank,
+                                'endpoint': current_endpoint,
+                                'other_endpoints': other_endpoints,
+                                'ring_id': ring_id,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
+            block.append_op(type='c_comm_init',
+                            inputs={'X': comm_id_var},
+                            outputs={},
+                            attrs={
+                                'nranks': nranks,
+                                'rank': rank,
+                                'ring_id': ring_id,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
         elif core.is_compiled_with_npu():
-            block.append_op(
-                type='c_gen_hccl_id',
-                inputs={},
-                outputs={'Out': comm_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    'ring_id': ring_id,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-            block.append_op(
-                type='c_comm_init_hccl',
-                inputs={'X': comm_id_var},
-                outputs={},
-                attrs={
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    'device_id': int(os.getenv("FLAGS_selected_npus")),
-                    'rank_ids': nranks,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            block.append_op(type='c_gen_hccl_id',
+                            inputs={},
+                            outputs={'Out': comm_id_var},
+                            attrs={
+                                'rank': rank,
+                                'endpoint': current_endpoint,
+                                'other_endpoints': other_endpoints,
+                                'ring_id': ring_id,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
+            block.append_op(type='c_comm_init_hccl',
+                            inputs={'X': comm_id_var},
+                            outputs={},
+                            attrs={
+                                'rank': rank,
+                                'ring_id': ring_id,
+                                'device_id':
+                                int(os.getenv("FLAGS_selected_npus")),
+                                'rank_ids': nranks,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
         else:
             raise ValueError(
                 "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu."
@@ -217,20 +209,20 @@ def _broadcast_params(self):
                 continue
 
             ring_id = (ring_id + 1) % self.nrings
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            block.append_op(type='c_broadcast',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                'root': 0,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
 
         for ring_id in range(self.nrings):
-            block.append_op(
-                type='c_sync_comm_stream',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={'ring_id': ring_id,
-                       OP_ROLE_KEY: OpRole.Forward})
+            block.append_op(type='c_sync_comm_stream',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index b035f179317ac..d25cf9680236f 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -19,6 +19,7 @@
 
 
 class DGCOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(DGCOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -29,8 +30,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(DGCOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(DGCOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
     def _init_dgc_opt(self):
         if self.dgc_opt is not None:
@@ -102,8 +104,9 @@ def apply_gradients(self, params_grads):
 
     def apply_optimize(self, loss, startup_program, params_grads):
         self._init_dgc_opt()
-        return self.dgc_opt.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        return self.dgc_opt.apply_optimize(loss,
+                                           startup_program=startup_program,
+                                           params_grads=params_grads)
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
index d487f35324df9..8a6ec33b39b73 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
@@ -35,7 +35,7 @@ class DygraphShardingOptimizer(object):
 
     """
 
-    # TODO (JZ-LIANG) 
+    # TODO (JZ-LIANG)
     # TO support following featrues in future:
     # 1. fused update parameter sync
     # 2. parameters_groups
@@ -124,7 +124,7 @@ def _map_param_to_rank(self):
     def _buid_inner_optimizer(self):
         # we rely on the inner opt to determine whether a parameter is stop_gradient or not:
         # create moment
-        # update related ops: clip, regular, opt  
+        # update related ops: clip, regular, opt
         self._inner_optimizer = self._inner_optimizer_class(
             parameters=self._rank2params[self._sharding_rank],
             **self._inner_optimizer_kargs)
@@ -142,8 +142,8 @@ def _sharding_sync_parameters(self):
                 for param in params:
                     paddle.distributed.broadcast(
                         param,
-                        # the collective API need src rank to be the global rank id 
-                        # instead of the relative logic rank id within group 
+                        # the collective API need src rank to be the global rank id
+                        # instead of the relative logic rank id within group
                         src=self._hcg.get_sharding_parallel_group().ranks[rank],
                         group=self._hcg.get_sharding_parallel_group(),
                         use_calc_stream=True)
@@ -160,13 +160,13 @@ def minimize(self,
                  parameters=None,
                  no_grad_set=None):
 
-        # NOTE in dygraph mode, the only different between step and minimize is that minimize 
+        # NOTE in dygraph mode, the only different between step and minimize is that minimize
         # allow user to customize the parameters for updating on each step
 
         input_param_names = set([param.name for param in parameters])
         parameters = list(
-            filter(lambda x: x.name in input_param_names, self._rank2params[
-                self._sharding_rank]))
+            filter(lambda x: x.name in input_param_names,
+                   self._rank2params[self._sharding_rank]))
         result = self._inner_optimizer.minimize(loss, startup_program,
                                                 parameters, no_grad_set)
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
index 7d930c5a69c47..641bc25e5c59e 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py
@@ -28,6 +28,7 @@
 
 
 class HybridParallelGradScaler:
+
     def __init__(self, scaler, hcg):
         self._scaler = scaler
         self._hcg = hcg
@@ -70,11 +71,12 @@ def _unscale(self, optimizer):
         # allreduce_max found_inf in check_group
         if not self._use_dp_mode:
             self._found_inf = paddle.cast(self._found_inf, dtype="int32")
-            # TODO(shenliang03) Since the minimize call in the optimizer is 
-            # after the gradscaler, check_finite needs to synchronize global 
+            # TODO(shenliang03) Since the minimize call in the optimizer is
+            # after the gradscaler, check_finite needs to synchronize global
             # information. In the future, we should use check_group
-            paddle.distributed.all_reduce(
-                self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None)
+            paddle.distributed.all_reduce(self._found_inf,
+                                          op=paddle.distributed.ReduceOp.MAX,
+                                          group=None)
             self._found_inf = paddle.cast(self._found_inf, dtype="bool")
 
     def __getattr__(self, item):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
index 50bf8a2f9c7c5..14daba5ee330e 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -43,6 +43,7 @@ def _obtain_optimizer_parameters_list(optimizer):
 
 
 class HybridParallelClipGrad:
+
     def __init__(self, clip, hcg):
         self._clip = clip
         self._hcg = hcg
@@ -67,8 +68,8 @@ def _dygraph_clip(self, params_grads):
             sum_square = layers.reduce_sum(square)
 
             not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
-                hasattr(p, 'is_firstly_shared') and
-                getattr(p, 'is_firstly_shared', True))
+                hasattr(p, 'is_firstly_shared')
+                and getattr(p, 'is_firstly_shared', True))
 
             if not_shared_enable:
                 if p.is_distributed:
@@ -88,19 +89,19 @@ def _dygraph_clip(self, params_grads):
         else:
             global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16)
             global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16)
-            global_norm_dist_fp16 = paddle.cast(
-                global_norm_dist_fp16, dtype=paddle.float32)
+            global_norm_dist_fp16 = paddle.cast(global_norm_dist_fp16,
+                                                dtype=paddle.float32)
 
         # global norm of non-distributed FP16 params_and_grads
         if len(sum_square_not_dist_fp16) == 0:
-            global_norm_not_dist_fp16 = paddle.to_tensor(
-                [0.], dtype=paddle.float32)
+            global_norm_not_dist_fp16 = paddle.to_tensor([0.],
+                                                         dtype=paddle.float32)
         else:
             global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16)
             global_norm_not_dist_fp16 = layers.reduce_sum(
                 global_norm_not_dist_fp16)
-            global_norm_not_dist_fp16 = paddle.cast(
-                global_norm_not_dist_fp16, dtype=paddle.float32)
+            global_norm_not_dist_fp16 = paddle.cast(global_norm_not_dist_fp16,
+                                                    dtype=paddle.float32)
 
         # global norm of distributed FP32 params_and_grads
         global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len(
@@ -110,9 +111,9 @@ def _dygraph_clip(self, params_grads):
 
         # global norm of non-distributed FP32 params_and_grads
         global_norm_not_dist_fp32 = layers.concat(
-            sum_square_not_dist_fp32) if len(
-                sum_square_not_dist_fp32) != 0 else paddle.to_tensor(
-                    [0.], dtype=paddle.float32)
+            sum_square_not_dist_fp32
+        ) if len(sum_square_not_dist_fp32) != 0 else paddle.to_tensor(
+            [0.], dtype=paddle.float32)
         global_norm_not_dist_fp32 = layers.reduce_sum(global_norm_not_dist_fp32)
 
         global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32
@@ -140,12 +141,13 @@ def _dygraph_clip(self, params_grads):
         global_norm_var_fp32 = layers.sqrt(global_norm_var_dist +
                                            global_norm_var_not_dist)
 
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm)
-        clip_var = layers.elementwise_div(
-            x=max_global_norm,
-            y=layers.elementwise_max(
-                x=global_norm_var_fp32, y=max_global_norm))
+        max_global_norm = layers.fill_constant(shape=[1],
+                                               dtype=global_norm_var_fp32.dtype,
+                                               value=self.clip_norm)
+        clip_var = layers.elementwise_div(x=max_global_norm,
+                                          y=layers.elementwise_max(
+                                              x=global_norm_var_fp32,
+                                              y=max_global_norm))
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
         for p, g in params_grads:
             if g is None:
@@ -179,12 +181,12 @@ def __init__(self, optimizer, hcg, strategy):
 
         self._need_dp = (self._hcg.get_data_parallel_world_size() > 1)
 
-        # NOTE(shenliang03): Because of the pure DataParallel mode, the gradient synchronization 
-        # is achieved through reducer, so there is no need to call fuse_allreduce in optimizer. 
+        # NOTE(shenliang03): Because of the pure DataParallel mode, the gradient synchronization
+        # is achieved through reducer, so there is no need to call fuse_allreduce in optimizer.
         self._dp_enable = not self._use_dp_mode and self._need_dp
 
-        self._sharding_enable = (
-            self._hcg.get_sharding_parallel_world_size() > 1)
+        self._sharding_enable = (self._hcg.get_sharding_parallel_world_size() >
+                                 1)
 
         if isinstance(self._inner_opt._grad_clip,
                       ClipGradByGlobalNorm) and not self._use_dp_mode:
@@ -224,7 +226,7 @@ def minimize(self,
                  parameters=None,
                  no_grad_set=None):
 
-        # minimize does not support parameters in the form of param_group, 
+        # minimize does not support parameters in the form of param_group,
         # so no need use _obtain_optimizer_parameters_list
         parameter_list = parameters if parameters \
             else self._inner_opt._parameter_list
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
index fb43b89e1a623..3359e63b1deff 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
@@ -57,7 +57,7 @@ class ShardingOptimizerStage2(Optimizer):
 
     """
 
-    # TODO (Baibaifan) 
+    # TODO (Baibaifan)
     # Feature Notes:
     # 1. Unified memory for parameters and parameters.grad to InternalStorage.
     # 2. Support the segmentation of optimizer parameters and partial updating of parameters.
@@ -97,8 +97,8 @@ def __init__(self,
                 filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                        self._local_params))) > 0
 
-        self.group = new_group(_get_global_group()
-                               .ranks) if group is None else group
+        self.group = new_group(
+            _get_global_group().ranks) if group is None else group
 
         self.world_size = self.group.nranks
         self.rank = self.group.rank
@@ -122,8 +122,8 @@ def __init__(self,
                 for item in self._optim._param_groups:
                     if "grad_clip" in item.keys():
                         item["grad_clip"] = ShardingClipGrad(
-                            self._optim._grad_clip,
-                            paddle.get_device(), self.group)
+                            self._optim._grad_clip, paddle.get_device(),
+                            self.group)
 
         if offload:
             assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16"
@@ -147,11 +147,10 @@ def _sync_params_and_buffers(self):
         """
 
         for p in self._local_params:
-            broadcast(
-                p,
-                src=self._global_root_rank,
-                group=self.group,
-                use_calc_stream=True)
+            broadcast(p,
+                      src=self._global_root_rank,
+                      group=self.group,
+                      use_calc_stream=True)
 
         # Multi stream operation will be supported later
         wait(tensor=p, group=self.group, use_calc_stream=True)
@@ -224,8 +223,9 @@ def dtype_rank_params(self):
             # Assign the parameters of each rank according to the type
             for param in self._local_params:
                 if param.dtype not in self._dtype_rank_params.keys():
-                    self._dtype_rank_params[
-                        param.dtype] = [[] for _ in range(self.world_size)]
+                    self._dtype_rank_params[param.dtype] = [
+                        [] for _ in range(self.world_size)
+                    ]
                 self._dtype_rank_params[param.dtype][self.param2rank[
                     param.name]].append(param)
 
@@ -379,8 +379,9 @@ def step(self):
             dev_id = int(paddle.get_device().split(":")[1])
             for param in self._local_params:
                 if param.name in self._master_params.keys():
-                    param.set_value(self._master_params[param.name].cuda(dev_id)
-                                    .cast(dtype=param.dtype))
+                    param.set_value(
+                        self._master_params[param.name].cuda(dev_id).cast(
+                            dtype=param.dtype))
         else:
             self._optim.step()
 
@@ -411,14 +412,12 @@ def _broadcast_params(self):
         # Exchange all the shards with the other ranks
         for dtype_per_rank in self.param_storages.values():
             for dst_rank, internal_storage in dtype_per_rank.items():
-                broadcast(
-                    tensor=internal_storage.buffer,
-                    src=self.group.ranks[dst_rank],
-                    group=self.group,
-                    use_calc_stream=True)
+                broadcast(tensor=internal_storage.buffer,
+                          src=self.group.ranks[dst_rank],
+                          group=self.group,
+                          use_calc_stream=True)
 
             # Multi stream operation will be supported later
-            wait(
-                tensor=internal_storage.buffer,
-                group=self.group,
-                use_calc_stream=True)
+            wait(tensor=internal_storage.buffer,
+                 group=self.group,
+                 use_calc_stream=True)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
index f636a31375785..93857461b26d2 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
@@ -18,6 +18,7 @@
 
 
 class FP16AllReduceOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(FP16AllReduceOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -35,8 +36,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(FP16AllReduceOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(FP16AllReduceOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
     def _can_apply(self):
         if not self.role_maker._is_collective:
@@ -82,22 +84,23 @@ def fp16_compression(param_and_grads):
             else:
                 op._remove_attr(op_maker.kOpRoleVarAttrName())
 
-            new_grad = block.create_var(
-                name=unique_name.generate(grad.name + ".cast_fp16"),
-                dtype=core.VarDesc.VarType.FP16,
-                persistable=False,
-                stop_gradient=True)
+            new_grad = block.create_var(name=unique_name.generate(grad.name +
+                                                                  ".cast_fp16"),
+                                        dtype=core.VarDesc.VarType.FP16,
+                                        persistable=False,
+                                        stop_gradient=True)
 
             with block.program._backward_role_guard():
-                cast_op = block.append_op(
-                    type="cast",
-                    inputs={"X": grad},
-                    outputs={"Out": new_grad},
-                    attrs={
-                        "in_dtype": core.VarDesc.VarType.FP32,
-                        "out_dtype": core.VarDesc.VarType.FP16
-                    },
-                    stop_gradient=True)
+                cast_op = block.append_op(type="cast",
+                                          inputs={"X": grad},
+                                          outputs={"Out": new_grad},
+                                          attrs={
+                                              "in_dtype":
+                                              core.VarDesc.VarType.FP32,
+                                              "out_dtype":
+                                              core.VarDesc.VarType.FP16
+                                          },
+                                          stop_gradient=True)
 
                 backward = op_maker.OpRole.Backward
                 cast_op._set_attr(op_maker.kOpRoleAttrName(), backward)
@@ -119,30 +122,30 @@ def fp16_compression(param_and_grads):
                 continue
 
             block = grad.block
-            new_grad = block.create_var(
-                name=unique_name.generate(grad.name + ".cast_fp32"),
-                dtype=core.VarDesc.VarType.FP32,
-                persistable=False,
-                stop_gradient=True)
+            new_grad = block.create_var(name=unique_name.generate(grad.name +
+                                                                  ".cast_fp32"),
+                                        dtype=core.VarDesc.VarType.FP32,
+                                        persistable=False,
+                                        stop_gradient=True)
 
             with block.program._optimized_guard(
                 [param, grad]), framework.name_scope('fp16_allreduce'):
-                cast_op = block.append_op(
-                    type="cast",
-                    inputs={"X": grad},
-                    outputs={"Out": new_grad},
-                    attrs={
-                        "in_dtype": core.VarDesc.VarType.FP16,
-                        "out_dtype": core.VarDesc.VarType.FP32
-                    },
-                    stop_gradient=True)
+                cast_op = block.append_op(type="cast",
+                                          inputs={"X": grad},
+                                          outputs={"Out": new_grad},
+                                          attrs={
+                                              "in_dtype":
+                                              core.VarDesc.VarType.FP16,
+                                              "out_dtype":
+                                              core.VarDesc.VarType.FP32
+                                          },
+                                          stop_gradient=True)
             ret_param_and_grads.append((param, new_grad))
 
         return ret_param_and_grads
 
     def apply_optimize(self, loss, startup_program, params_grads):
         new_params_grads = self.fp16_compression(params_grads)
-        return self.inner_opt.apply_optimize(
-            loss,
-            startup_program=startup_program,
-            params_grads=new_params_grads)
+        return self.inner_opt.apply_optimize(loss,
+                                             startup_program=startup_program,
+                                             params_grads=new_params_grads)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index 949ef3e5f3a78..10175f8936a70 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -18,6 +18,7 @@
 
 
 class GradientMergeOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(GradientMergeOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -33,8 +34,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(GradientMergeOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(GradientMergeOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
     def _init_wrapped_opt(self):
         config = self.user_defined_strategy.gradient_merge_configs
diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
index 0fd7db56de54f..8f42553048fec 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -24,6 +24,7 @@
 
 
 class GraphExecutionOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(GraphExecutionOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -94,9 +95,12 @@ def _setup_nccl_op(self, startup_program, main_program, build_strategy):
                 inputs={},
                 outputs={"NCCLID": comm_id_var},
                 attrs={
-                    "trainers": trainer_endpoints,
-                    "trainer_id": trainer_id,
-                    "nccl_comm_num": build_strategy.nccl_comm_num,
+                    "trainers":
+                    trainer_endpoints,
+                    "trainer_id":
+                    trainer_id,
+                    "nccl_comm_num":
+                    build_strategy.nccl_comm_num,
                     "use_hierarchical_allreduce":
                     build_strategy.use_hierarchical_allreduce,
                     "hierarchical_allreduce_inter_ranks":
@@ -120,9 +124,12 @@ def _setup_nccl_op(self, startup_program, main_program, build_strategy):
                 inputs={},
                 outputs={"BKCLID": comm_id_var},
                 attrs={
-                    "trainers": trainer_endpoints,
-                    "trainer_id": trainer_id,
-                    "nccl_comm_num": build_strategy.nccl_comm_num,
+                    "trainers":
+                    trainer_endpoints,
+                    "trainer_id":
+                    trainer_id,
+                    "nccl_comm_num":
+                    build_strategy.nccl_comm_num,
                     "use_hierarchical_allreduce":
                     build_strategy.use_hierarchical_allreduce,
                     "hierarchical_allreduce_inter_ranks":
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
index 6d2474d9352f8..3dc5bed03aeac 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -20,6 +20,7 @@
 
 
 class LambOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(LambOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -30,8 +31,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(LambOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(LambOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
         opt = self.inner_opt
         if not isinstance(opt, AdamOptimizer):
@@ -70,8 +72,8 @@ def _can_apply(self):
         if self.user_defined_strategy.lamb:
             if not isinstance(self.inner_opt, AdamOptimizer):
                 logging.warn(
-                    "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.".
-                    format(self.inner_opt.type))
+                    "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}."
+                    .format(self.inner_opt.type))
                 return False
             return True
         return False
@@ -101,8 +103,9 @@ def apply_gradients(self, params_grads):
         return self.lamb_opt.apply_gradients(params_grads=params_grads)
 
     def apply_optimize(self, loss, startup_program, params_grads):
-        return self.lamb_opt.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        return self.lamb_opt.apply_optimize(loss,
+                                            startup_program=startup_program,
+                                            params_grads=params_grads)
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index e1bf3722c191d..44f8fe473e2f9 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -19,6 +19,7 @@
 
 
 class LarsOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(LarsOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -29,8 +30,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(LarsOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(LarsOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
         opt = self.inner_opt
         if not isinstance(opt, Momentum):
@@ -57,8 +59,8 @@ def _can_apply(self):
         if self.user_defined_strategy.lars:
             if not isinstance(self.inner_opt, Momentum):
                 logging.warn(
-                    "lars need the inner optimizer to be Momentum optimizer but got {}.".
-                    format(self.inner_opt.type))
+                    "lars need the inner optimizer to be Momentum optimizer but got {}."
+                    .format(self.inner_opt.type))
                 return False
             return True
         return False
@@ -88,8 +90,9 @@ def apply_gradients(self, params_grads):
         return self.lars_opt.apply_gradients(params_grads=params_grads)
 
     def apply_optimize(self, loss, startup_program, params_grads):
-        return self.lars_opt.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        return self.lars_opt.apply_optimize(loss,
+                                            startup_program=startup_program,
+                                            params_grads=params_grads)
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 9052111d22c2e..eb170dedb0b72 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -24,6 +24,7 @@
 
 
 class LocalSGDOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(LocalSGDOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -70,12 +71,11 @@ def create_snapshot_vars(self, program):
 
         p2s = []
         for param in non_dist_params:
-            snapshot = block.create_var(
-                name=self.snapshot_name(param.name),
-                shape=param.shape,
-                persistable=True,
-                stop_gradient=True,
-                dtype=param.dtype)
+            snapshot = block.create_var(name=self.snapshot_name(param.name),
+                                        shape=param.shape,
+                                        persistable=True,
+                                        stop_gradient=True,
+                                        dtype=param.dtype)
             p2s.append([param, snapshot])
         return p2s
 
@@ -89,8 +89,8 @@ def minimize_impl(self,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        minimized = self.inner_opt.minimize(
-            loss, startup_program=startup_program)
+        minimized = self.inner_opt.minimize(loss,
+                                            startup_program=startup_program)
 
         k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
         begin_step_value = self.user_defined_strategy.localsgd_configs[
@@ -109,82 +109,78 @@ def minimize_impl(self,
         p2s = self.create_snapshot_vars(main_block.program)
         with program_guard(main_block.program, startup_program):
             step = layers.autoincreased_step_counter(begin=1)
-            k_steps = layers.create_global_var(
-                name="k_steps",
-                shape=[1],
-                value=k_steps_value,
-                dtype='int64',
-                persistable=True)
-
-            begin_step = layers.create_global_var(
-                name="begin_step",
-                shape=[1],
-                value=begin_step_value,
-                dtype='int64',
-                persistable=True)
-
-            last_step = layers.create_global_var(
-                name="last_step",
-                shape=[1],
-                value=begin_step_value,
-                dtype='int64',
-                persistable=True)
+            k_steps = layers.create_global_var(name="k_steps",
+                                               shape=[1],
+                                               value=k_steps_value,
+                                               dtype='int64',
+                                               persistable=True)
+
+            begin_step = layers.create_global_var(name="begin_step",
+                                                  shape=[1],
+                                                  value=begin_step_value,
+                                                  dtype='int64',
+                                                  persistable=True)
+
+            last_step = layers.create_global_var(name="last_step",
+                                                 shape=[1],
+                                                 value=begin_step_value,
+                                                 dtype='int64',
+                                                 persistable=True)
 
             def communicate():
                 sub_block = default_main_program().current_block()
                 ring_id = -1
                 for param, snapshot in p2s:
-                    sub_block.append_op(
-                        type='elementwise_sub',
-                        inputs={'X': [snapshot],
-                                'Y': [param]},
-                        outputs={'Out': [param]},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
-                    sub_block.append_op(
-                        type='c_sync_calc_stream',
-                        inputs={'X': param},
-                        outputs={'Out': param},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='elementwise_sub',
+                                        inputs={
+                                            'X': [snapshot],
+                                            'Y': [param]
+                                        },
+                                        outputs={'Out': [param]},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='c_sync_calc_stream',
+                                        inputs={'X': param},
+                                        outputs={'Out': param},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                     ring_id = (ring_id + 1) % self.nrings
-                    sub_block.append_op(
-                        type='c_allreduce_sum',
-                        inputs={'X': [param]},
-                        outputs={'Out': [param]},
-                        attrs={
-                            'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
+                    sub_block.append_op(type='c_allreduce_sum',
+                                        inputs={'X': [param]},
+                                        outputs={'Out': [param]},
+                                        attrs={
+                                            'ring_id': ring_id,
+                                            OP_ROLE_KEY: OpRole.Optimize
+                                        })
 
                 for ring_id in range(self.nrings):
-                    sub_block.append_op(
-                        type='c_sync_comm_stream',
-                        inputs={'X': param},
-                        outputs={'Out': param},
-                        attrs={
-                            'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
+                    sub_block.append_op(type='c_sync_comm_stream',
+                                        inputs={'X': param},
+                                        outputs={'Out': param},
+                                        attrs={
+                                            'ring_id': ring_id,
+                                            OP_ROLE_KEY: OpRole.Optimize
+                                        })
 
                 for param, snapshot in p2s:
-                    sub_block.append_op(
-                        type='scale',
-                        inputs={'X': [param]},
-                        outputs={'Out': [param]},
-                        attrs={
-                            'scale': 1.0 / self.role_maker._worker_num(),
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
-                    sub_block.append_op(
-                        type='elementwise_sub',
-                        inputs={'X': [snapshot],
-                                'Y': [param]},
-                        outputs={'Out': [param]},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
-                    sub_block.append_op(
-                        type='assign',
-                        inputs={'X': [param]},
-                        outputs={'Out': [snapshot]},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='scale',
+                                        inputs={'X': [param]},
+                                        outputs={'Out': [param]},
+                                        attrs={
+                                            'scale':
+                                            1.0 / self.role_maker._worker_num(),
+                                            OP_ROLE_KEY:
+                                            OpRole.Optimize
+                                        })
+                    sub_block.append_op(type='elementwise_sub',
+                                        inputs={
+                                            'X': [snapshot],
+                                            'Y': [param]
+                                        },
+                                        outputs={'Out': [param]},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='assign',
+                                        inputs={'X': [param]},
+                                        outputs={'Out': [snapshot]},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                 layers.assign(step, last_step)
 
             def begin_localsgd():
@@ -195,6 +191,7 @@ def begin_localsgd():
 
 
 class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -243,12 +240,11 @@ def create_snapshot_vars(self, program):
 
         p2s = []
         for param in non_dist_params:
-            snapshot = block.create_var(
-                name=self.snapshot_name(param.name),
-                shape=param.shape,
-                persistable=True,
-                stop_gradient=True,
-                dtype=param.dtype)
+            snapshot = block.create_var(name=self.snapshot_name(param.name),
+                                        shape=param.shape,
+                                        persistable=True,
+                                        stop_gradient=True,
+                                        dtype=param.dtype)
             p2s.append([param, snapshot])
         return p2s
 
@@ -258,37 +254,35 @@ def init_snapshot_vars(self, startup_program, param2snapshot):
                 layers.assign(param, snapshot)
 
     def _generate_avg_loss(self, program_block, loss, avg_loss):
-        program_block.append_op(
-            type='c_allreduce_sum',
-            inputs={'X': [loss]},
-            outputs={'Out': [avg_loss]},
-            attrs={
-                'ring_id': 0,
-                OP_ROLE_KEY: OpRole.Optimize,
-                'use_calc_stream': True
-            })
-        program_block.append_op(
-            type='c_sync_calc_stream',
-            inputs={'X': [avg_loss]},
-            outputs={'Out': [avg_loss]},
-            attrs={OP_ROLE_KEY: OpRole.Optimize})
-
-        program_block.append_op(
-            type='scale',
-            inputs={'X': [avg_loss]},
-            outputs={'Out': [avg_loss]},
-            attrs={
-                'scale': 1.0 / self.role_maker._worker_num(),
-                OP_ROLE_KEY: OpRole.Optimize
-            })
+        program_block.append_op(type='c_allreduce_sum',
+                                inputs={'X': [loss]},
+                                outputs={'Out': [avg_loss]},
+                                attrs={
+                                    'ring_id': 0,
+                                    OP_ROLE_KEY: OpRole.Optimize,
+                                    'use_calc_stream': True
+                                })
+        program_block.append_op(type='c_sync_calc_stream',
+                                inputs={'X': [avg_loss]},
+                                outputs={'Out': [avg_loss]},
+                                attrs={OP_ROLE_KEY: OpRole.Optimize})
+
+        program_block.append_op(type='scale',
+                                inputs={'X': [avg_loss]},
+                                outputs={'Out': [avg_loss]},
+                                attrs={
+                                    'scale':
+                                    1.0 / self.role_maker._worker_num(),
+                                    OP_ROLE_KEY: OpRole.Optimize
+                                })
 
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        minimized = self.inner_opt.minimize(
-            loss, startup_program=startup_program)
+        minimized = self.inner_opt.minimize(loss,
+                                            startup_program=startup_program)
 
         init_k_steps = self.user_defined_strategy.adaptive_localsgd_configs[
             'init_k_steps']
@@ -309,47 +303,41 @@ def minimize_impl(self,
         with program_guard(main_block.program, startup_program):
             step = layers.autoincreased_step_counter(begin=1)
 
-            k_steps = layers.create_global_var(
-                name="k_steps",
-                shape=[1],
-                value=int(init_k_steps),
-                dtype='int64',
-                persistable=True)
-
-            begin_step = layers.create_global_var(
-                name="begin_step",
-                shape=[1],
-                value=int(begin_step_value),
-                dtype='int64',
-                persistable=True)
-
-            last_step = layers.create_global_var(
-                name="last_step",
-                shape=[1],
-                value=int(0),
-                dtype='int64',
-                persistable=True)
-
-            avg_loss = layers.create_global_var(
-                name="avg_loss",
-                shape=[1],
-                value=float(0),
-                dtype=loss.dtype,
-                persistable=True)
-
-            lr_0 = layers.create_global_var(
-                name="lr_0",
-                shape=[1],
-                value=float(0),
-                dtype='float32',
-                persistable=True)
-
-            loss_0 = layers.create_global_var(
-                name="loss_0",
-                shape=[1],
-                value=float(0),
-                dtype='float32',
-                persistable=True)
+            k_steps = layers.create_global_var(name="k_steps",
+                                               shape=[1],
+                                               value=int(init_k_steps),
+                                               dtype='int64',
+                                               persistable=True)
+
+            begin_step = layers.create_global_var(name="begin_step",
+                                                  shape=[1],
+                                                  value=int(begin_step_value),
+                                                  dtype='int64',
+                                                  persistable=True)
+
+            last_step = layers.create_global_var(name="last_step",
+                                                 shape=[1],
+                                                 value=int(0),
+                                                 dtype='int64',
+                                                 persistable=True)
+
+            avg_loss = layers.create_global_var(name="avg_loss",
+                                                shape=[1],
+                                                value=float(0),
+                                                dtype=loss.dtype,
+                                                persistable=True)
+
+            lr_0 = layers.create_global_var(name="lr_0",
+                                            shape=[1],
+                                            value=float(0),
+                                            dtype='float32',
+                                            persistable=True)
+
+            loss_0 = layers.create_global_var(name="loss_0",
+                                              shape=[1],
+                                              value=float(0),
+                                              dtype='float32',
+                                              persistable=True)
 
             global_lr = self.inner_opt._global_learning_rate()
 
@@ -364,75 +352,75 @@ def communicate():
                 sub_block = default_main_program().current_block()
                 ring_id = -1
                 for param, snapshot in p2s:
-                    sub_block.append_op(
-                        type='elementwise_sub',
-                        inputs={'X': [snapshot],
-                                'Y': [param]},
-                        outputs={'Out': [param]},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
-                    sub_block.append_op(
-                        type='c_sync_calc_stream',
-                        inputs={'X': param},
-                        outputs={'Out': param},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='elementwise_sub',
+                                        inputs={
+                                            'X': [snapshot],
+                                            'Y': [param]
+                                        },
+                                        outputs={'Out': [param]},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='c_sync_calc_stream',
+                                        inputs={'X': param},
+                                        outputs={'Out': param},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                     ring_id = (ring_id + 1) % self.nrings
-                    sub_block.append_op(
-                        type='c_allreduce_sum',
-                        inputs={'X': [param]},
-                        outputs={'Out': [param]},
-                        attrs={
-                            'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
+                    sub_block.append_op(type='c_allreduce_sum',
+                                        inputs={'X': [param]},
+                                        outputs={'Out': [param]},
+                                        attrs={
+                                            'ring_id': ring_id,
+                                            OP_ROLE_KEY: OpRole.Optimize
+                                        })
 
                 for ring_id in range(self.nrings):
-                    sub_block.append_op(
-                        type='c_sync_comm_stream',
-                        inputs={'X': param},
-                        outputs={'Out': param},
-                        attrs={
-                            'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
+                    sub_block.append_op(type='c_sync_comm_stream',
+                                        inputs={'X': param},
+                                        outputs={'Out': param},
+                                        attrs={
+                                            'ring_id': ring_id,
+                                            OP_ROLE_KEY: OpRole.Optimize
+                                        })
 
                 for param, snapshot in p2s:
-                    sub_block.append_op(
-                        type='scale',
-                        inputs={'X': [param]},
-                        outputs={'Out': [param]},
-                        attrs={
-                            'scale': 1.0 / self.role_maker._worker_num(),
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
-                    sub_block.append_op(
-                        type='elementwise_sub',
-                        inputs={'X': [snapshot],
-                                'Y': [param]},
-                        outputs={'Out': [param]},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
-                    sub_block.append_op(
-                        type='assign',
-                        inputs={'X': [param]},
-                        outputs={'Out': [snapshot]},
-                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='scale',
+                                        inputs={'X': [param]},
+                                        outputs={'Out': [param]},
+                                        attrs={
+                                            'scale':
+                                            1.0 / self.role_maker._worker_num(),
+                                            OP_ROLE_KEY:
+                                            OpRole.Optimize
+                                        })
+                    sub_block.append_op(type='elementwise_sub',
+                                        inputs={
+                                            'X': [snapshot],
+                                            'Y': [param]
+                                        },
+                                        outputs={'Out': [param]},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(type='assign',
+                                        inputs={'X': [param]},
+                                        outputs={'Out': [snapshot]},
+                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                 layers.assign(step, last_step)
 
             def communicate_avg_loss():
                 communicate()
                 self._generate_avg_loss(main_block, loss, avg_loss)
-                next_local_steps = layers.cast(
-                    layers.ceil(
-                        layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
-                                    float(init_k_steps))),
-                    dtype='int64')
-                max_local_steps = layers.fill_constant(
-                    shape=[1], dtype='int64', value=16)
-                min_local_steps = layers.fill_constant(
-                    shape=[1], dtype='int64', value=1)
-                next_local_steps = layers.elementwise_min(next_local_steps,
-                                                          max_local_steps)
-                next_local_steps = layers.elementwise_max(next_local_steps,
-                                                          min_local_steps)
+                next_local_steps = layers.cast(layers.ceil(
+                    layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
+                                float(init_k_steps))),
+                                               dtype='int64')
+                max_local_steps = layers.fill_constant(shape=[1],
+                                                       dtype='int64',
+                                                       value=16)
+                min_local_steps = layers.fill_constant(shape=[1],
+                                                       dtype='int64',
+                                                       value=1)
+                next_local_steps = layers.elementwise_min(
+                    next_local_steps, max_local_steps)
+                next_local_steps = layers.elementwise_max(
+                    next_local_steps, min_local_steps)
                 layers.assign(next_local_steps, k_steps)
 
             def begin_localsgd():
diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index 3bbaa055c5e59..35e11221b6f63 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -18,6 +18,7 @@
 
 
 class MetaOptimizerBase(Optimizer):
+
     def __init__(self, optimizer):
         self.inner_opt = optimizer
         self._learning_rate = self.inner_opt._learning_rate
@@ -47,12 +48,14 @@ def _can_update(self, optimizer):
         return False
 
     def _disable_strategy(self, dist_strategy):
-        raise NotImplementedError("you should implement disable strategy in {}".
-                                  format(type(self).__name__))
+        raise NotImplementedError(
+            "you should implement disable strategy in {}".format(
+                type(self).__name__))
 
     def _enable_strategy(self, dist_strategy, context=None):
-        raise NotImplementedError("you should implement enable strategy in {}".
-                                  format(type(self).__name__))
+        raise NotImplementedError(
+            "you should implement enable strategy in {}".format(
+                type(self).__name__))
 
     def apply_gradients(self, params_grads):
         return self.inner_opt.apply_gradients(params_grads=params_grads)
@@ -67,22 +70,23 @@ def backward(self,
                                        no_grad_set, callbacks)
 
     def apply_optimize(self, loss, startup_program, params_grads):
-        return self.inner_opt.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        return self.inner_opt.apply_optimize(loss,
+                                             startup_program=startup_program,
+                                             params_grads=params_grads)
 
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        params_grads = self.backward(loss,
+                                     startup_program=startup_program,
+                                     parameter_list=parameter_list,
+                                     no_grad_set=no_grad_set)
 
-        optimize_ops = self.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        optimize_ops = self.apply_optimize(loss,
+                                           startup_program=startup_program,
+                                           params_grads=params_grads)
 
         return optimize_ops, params_grads
 
@@ -91,6 +95,7 @@ def minimize(self,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
-        optimize_ops, params_grads = self.minimize_impl(
-            loss, startup_program, parameter_list, no_grad_set)
+        optimize_ops, params_grads = self.minimize_impl(loss, startup_program,
+                                                        parameter_list,
+                                                        no_grad_set)
         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
index ba2a0e84c7ab6..41a5da0d31505 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
@@ -19,6 +19,7 @@
 
 
 class ParameterServerGraphOptimizer(ParameterServerOptimizer):
+
     def __init__(self, optimizer):
         super(ParameterServerGraphOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -59,11 +60,10 @@ def _try_to_compile(self, main_program, loss):
 
         self._compiled_program = compiler.CompiledProgram(main_program)
 
-        self._compiled_program.with_data_parallel(
-            loss_name=loss.name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy,
-            share_vars_from=None)
+        self._compiled_program.with_data_parallel(loss_name=loss.name,
+                                                  build_strategy=build_strategy,
+                                                  exec_strategy=exec_strategy,
+                                                  share_vars_from=None)
 
         return self._compiled_program
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index aec2436522300..c04215d45656c 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -24,6 +24,7 @@
 
 
 class ParameterServerOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(ParameterServerOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -32,8 +33,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(ParameterServerOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(ParameterServerOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
         #self.micro_batch_size = user_defined_strategy.pipeline_configs[
         #    'micro_batch_size']
@@ -107,8 +109,8 @@ def _build_trainer_programs(self, compiled_config):
             if not use_ps_gpu:
                 _main = worker.delete_optimizer_pass(_main, compiled_config)
                 _main = worker.append_send_ops_pass(_main, compiled_config)
-                _startup = worker.delete_extra_optimizes_pass(_startup,
-                                                              compiled_config)
+                _startup = worker.delete_extra_optimizes_pass(
+                    _startup, compiled_config)
 
                 # for startup program
             _startup = worker.fake_init_ops_pass(_startup, compiled_config)
@@ -117,13 +119,12 @@ def _build_trainer_programs(self, compiled_config):
                 from paddle.fluid.transpiler.collective import SingleProcessMultiThread
                 t = SingleProcessMultiThread()
                 env = self.get_dist_env()
-                t.transpile(
-                    startup_program=_startup,
-                    main_program=_main,
-                    rank=env["trainer_id"],
-                    endpoints=env["trainer_endpoints"],
-                    current_endpoint=env['current_endpoint'],
-                    wait_port=False)
+                t.transpile(startup_program=_startup,
+                            main_program=_main,
+                            rank=env["trainer_id"],
+                            endpoints=env["trainer_endpoints"],
+                            current_endpoint=env['current_endpoint'],
+                            wait_port=False)
 
             compiled_config.set_origin_ps_main_program(_main)
             compiled_config.set_origin_ps_startup_program(_startup)
@@ -138,8 +139,8 @@ def _build_trainer_programs(self, compiled_config):
                         _main, compiled_config, stage_id, device)
                 else:
                     # for default worker
-                    _main = heter_worker.split_trainer_ops_pass(_main,
-                                                                compiled_config)
+                    _main = heter_worker.split_trainer_ops_pass(
+                        _main, compiled_config)
         else:
             _main = worker.append_send_ops_pass(_main, compiled_config)
             _startup = _startup
@@ -202,28 +203,29 @@ def _build_pserver_programs(self, compiled_config):
                                                       compiled_config, True)
 
             if not compiled_config.is_sync_mode():
-                _main = server.delete_unused_in_main_pass(_main,
-                                                          compiled_config)
+                _main = server.delete_unused_in_main_pass(
+                    _main, compiled_config)
 
-            _startup = server.delete_unused_in_startup_pass(_startup, _main,
-                                                            compiled_config)
+            _startup = server.delete_unused_in_startup_pass(
+                _startup, _main, compiled_config)
         else:
             _main = server.add_listen_and_serv_pass(_main, compiled_config)
             _main = server.add_rpc_global_flags_pass(_main, compiled_config)
             _main = server.add_geo_optimizer_pass(_main, compiled_config)
             _startup = server.build_pserver_startup_program_pass(
                 _startup, _main, compiled_config)
-            _startup = server.delete_unused_in_startup_pass(_startup, _main,
-                                                            compiled_config)
+            _startup = server.delete_unused_in_startup_pass(
+                _startup, _main, compiled_config)
 
         return _main, _startup
 
     def _can_apply_geo(self, dist_strategy, program):
+
         def get_sys_free_mem():
             plat = platform.system()
             if platform.system() == "Darwin":
-                vm = subprocess.Popen(
-                    ['vm_stat'], stdout=subprocess.PIPE).communicate()[0]
+                vm = subprocess.Popen(['vm_stat'],
+                                      stdout=subprocess.PIPE).communicate()[0]
                 # Process vm_stat
                 vmLines = vm.split('\n')
                 sep = re.compile(r':[\s]+')
@@ -231,8 +233,8 @@ def get_sys_free_mem():
                 for row in range(1, len(vmLines) - 2):
                     rowText = vmLines[row].strip()
                     rowElements = sep.split(rowText)
-                    vmStats[(rowElements[0]
-                             )] = int(rowElements[1].strip(r'\.')) * 4096
+                    vmStats[(rowElements[0])] = int(
+                        rowElements[1].strip(r'\.')) * 4096
                 return vmStats["Pages free"]
             elif platform.system() == "Linux":
                 mems = {}
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 2988865887a92..d3f461850b8a1 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -26,6 +26,7 @@
 
 
 class PipelineOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(PipelineOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -33,15 +34,18 @@ def __init__(self, optimizer):
             "RecomputeOptimizer",
             "AMPOptimizer",
         ]
-        self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
+        self.meta_optimizers_black_list = [
+            "GraphExecutionOptimizer",
+        ]
         self.global_ring_id = 1
         self.dp_ring_id = 2
         self.start_pipeline_ring_id = 20  # Just a magic number
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(PipelineOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(PipelineOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
         self.micro_batch_size = user_defined_strategy.pipeline_configs[
             'micro_batch_size']
         self.num_microbatches = user_defined_strategy.pipeline_configs[
@@ -85,23 +89,23 @@ def _broadcast_params(self, ring_id):
             if param.is_distributed:
                 continue
 
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            block.append_op(type='c_broadcast',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                'root': 0,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
 
         if not param: return  # no parameter on this device
-        block.append_op(
-            type='c_sync_comm_stream',
-            inputs={'X': param},
-            outputs={'Out': param},
-            attrs={'ring_id': ring_id,
-                   OP_ROLE_KEY: OpRole.Forward})
+        block.append_op(type='c_sync_comm_stream',
+                        inputs={'X': param},
+                        outputs={'Out': param},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Forward
+                        })
 
     def _get_process_group_info(self):
         # global ring info
@@ -123,10 +127,12 @@ def _init_process_group(self, pipeline_pair, pipeline_ring_map):
         self._get_process_group_info()
         collective_helper = CollectiveHelper(self.role_maker, wait_port=False)
         # Create global ring for all gpus (ring_id = 0)
-        collective_helper._init_communicator(
-            self.startup_program, self.current_endpoint, self.global_endpoints,
-            self.global_rank, self.global_ring_id, True, self.global_ring_id,
-            True)
+        collective_helper._init_communicator(self.startup_program,
+                                             self.current_endpoint,
+                                             self.global_endpoints,
+                                             self.global_rank,
+                                             self.global_ring_id, True,
+                                             self.global_ring_id, True)
         # Create pipeline rings
         if self.inner_parallelism > 1:
             pipeline_id = self.rank // self.inner_parallelism
@@ -147,10 +153,12 @@ def _init_process_group(self, pipeline_pair, pipeline_ring_map):
                 ]
                 pipeline_rank = 0 if self.rank == first_node else 1
                 pipeline_nranks = 2
-                collective_helper._init_communicator(
-                    self.startup_program, self.current_endpoint,
-                    pipeline_endpoints, pipeline_rank, ring_id, False,
-                    self.global_ring_id, True)
+                collective_helper._init_communicator(self.startup_program,
+                                                     self.current_endpoint,
+                                                     pipeline_endpoints,
+                                                     pipeline_rank, ring_id,
+                                                     False, self.global_ring_id,
+                                                     True)
 
         # Create dp rings
         if self.pipeline_num > 1:
@@ -215,15 +223,14 @@ def _insert_loss_grad_ops(self, loss, pipeline_num):
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
-                block._insert_op(
-                    idx + 1,
-                    type='scale',
-                    inputs={'X': loss_grad_var},
-                    outputs={'Out': loss_grad_var},
-                    attrs={
-                        'scale': 1.0 / pipeline_num,
-                        OP_ROLE_KEY: OpRole.Backward
-                    })
+                block._insert_op(idx + 1,
+                                 type='scale',
+                                 inputs={'X': loss_grad_var},
+                                 outputs={'Out': loss_grad_var},
+                                 attrs={
+                                     'scale': 1.0 / pipeline_num,
+                                     OP_ROLE_KEY: OpRole.Backward
+                                 })
 
     def _insert_allreduce_ops(self, ring_id):
         block = self.main_program._pipeline_opt['section_program'].global_block(
@@ -256,13 +263,12 @@ def _insert_allreduce_ops(self, ring_id):
                     if origin_param.is_distributed:
                         continue
 
-                    block._insert_op(
-                        first_optimize_op_idx + offset,
-                        type='c_allreduce_sum',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            'use_calc_stream': True,
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
+                    block._insert_op(first_optimize_op_idx + offset,
+                                     type='c_allreduce_sum',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         'use_calc_stream': True,
+                                         OP_ROLE_KEY: OpRole.Optimize
+                                     })
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
index d9062484bb550..cd6bc03a5d52a 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
@@ -26,6 +26,7 @@
 
 
 class ParameterServerOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(ParameterServerOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -34,8 +35,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(ParameterServerOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(ParameterServerOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
     def _set_origin_programs(self, losses):
         self.origin_main_programs = []
@@ -69,12 +71,13 @@ def _init_ps_pass_context(self, loss, startup_program):
         attrs['is_worker'] = self.role_maker._is_worker()
         attrs['is_server'] = self.role_maker._is_server()
         attrs['is_heter_worker'] = self.role_maker._is_heter_worker()
-        logger.info("this process is heter? {}".format(attrs[
-            'is_heter_worker']))
+        logger.info("this process is heter? {}".format(
+            attrs['is_heter_worker']))
         attrs['use_ps_gpu'] = self.user_defined_strategy.a_sync_configs[
             "use_ps_gpu"]
         attrs['lr_decay_steps'] = self.user_defined_strategy.a_sync_configs[
             "lr_decay_steps"]
+        attrs['is_fl_ps_mode'] = self.user_defined_strategy.is_fl_ps_mode
         attrs['k_steps'] = self.user_defined_strategy.a_sync_configs["k_steps"]
         attrs['launch_barrier'] = self.user_defined_strategy.a_sync_configs[
             "launch_barrier"]
@@ -84,7 +87,7 @@ def _init_ps_pass_context(self, loss, startup_program):
 
         build_var_distributed(attrs)
 
-        # server 
+        # server
         attrs['_main_server'] = fluid.Program()
         attrs['_startup_server'] = fluid.Program()
         attrs['tensor_table'] = {}
@@ -111,6 +114,7 @@ def minimize_impl(self,
         if startup_program == None:
             startup_program = paddle.static.default_startup_program()
 
+
 #        print("program after inner optimizer minimize:",
 #              str(loss.block.program))
         self._set_origin_programs([loss])
@@ -143,11 +147,12 @@ def minimize_losses_impl(self,
         return None, None
 
     def _can_apply_geo(self, program):
+
         def get_sys_free_mem():
             plat = platform.system()
             if platform.system() == "Darwin":
-                vm = subprocess.Popen(
-                    ['vm_stat'], stdout=subprocess.PIPE).communicate()[0]
+                vm = subprocess.Popen(['vm_stat'],
+                                      stdout=subprocess.PIPE).communicate()[0]
                 # Process vm_stat
                 vmLines = vm.split('\n')
                 sep = re.compile(r':[\s]+')
@@ -155,8 +160,8 @@ def get_sys_free_mem():
                 for row in range(1, len(vmLines) - 2):
                     rowText = vmLines[row].strip()
                     rowElements = sep.split(rowText)
-                    vmStats[(rowElements[0]
-                             )] = int(rowElements[1].strip(r'\.')) * 4096
+                    vmStats[(rowElements[0])] = int(
+                        rowElements[1].strip(r'\.')) * 4096
                 return vmStats["Pages free"]
             elif platform.system() == "Linux":
                 mems = {}
diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py
index d056d4e106597..2c7b1e45ebd1b 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py
@@ -26,6 +26,7 @@
 
 
 class RawProgramOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(RawProgramOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -38,13 +39,16 @@ def __init__(self, optimizer):
             "DGCOptimizer",
             "LocalSGDOptimizer",
         ]
-        self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
+        self.meta_optimizers_black_list = [
+            "GraphExecutionOptimizer",
+        ]
         self.global_ring_id = 0
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(RawProgramOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(RawProgramOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
         self.without_graph_optimization = user_defined_strategy.without_graph_optimization
         self.fuse_all_reduce_ops = user_defined_strategy.fuse_all_reduce_ops
         if self.fuse_all_reduce_ops:
@@ -72,23 +76,23 @@ def _broadcast_params(self, ring_id):
             if param.is_distributed:
                 continue
 
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            block.append_op(type='c_broadcast',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                'root': 0,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
 
         if not param: return  # no parameter on this device
-        block.append_op(
-            type='c_sync_comm_stream',
-            inputs={'X': param},
-            outputs={'Out': param},
-            attrs={'ring_id': ring_id,
-                   OP_ROLE_KEY: OpRole.Forward})
+        block.append_op(type='c_sync_comm_stream',
+                        inputs={'X': param},
+                        outputs={'Out': param},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Forward
+                        })
 
     def _get_process_group_info(self):
         # global ring info
@@ -100,10 +104,12 @@ def _init_process_group(self):
         self._get_process_group_info()
         collective_helper = CollectiveHelper(self.role_maker, wait_port=False)
         # Create global ring for all gpus (ring_id = 0)
-        collective_helper._init_communicator(
-            self.startup_program, self.current_endpoint, self.global_endpoints,
-            self.global_rank, self.global_ring_id, True, self.global_ring_id,
-            True)
+        collective_helper._init_communicator(self.startup_program,
+                                             self.current_endpoint,
+                                             self.global_endpoints,
+                                             self.global_rank,
+                                             self.global_ring_id, True,
+                                             self.global_ring_id, True)
         self._broadcast_params(self.global_ring_id)
 
     def minimize_impl(self,
@@ -190,38 +196,35 @@ def _insert_allreduce_ops_for_gm(self, gm_block):
         if not grad_vars:
             return
 
-        gm_block._insert_op(
-            first_optimize_op_idx,
-            type="c_sync_calc_stream",
-            inputs={'X': grad_vars[0]},
-            outputs={'Out': grad_vars[0]},
-            attrs={OP_ROLE_KEY: OpRole.Backward})
+        gm_block._insert_op(first_optimize_op_idx,
+                            type="c_sync_calc_stream",
+                            inputs={'X': grad_vars[0]},
+                            outputs={'Out': grad_vars[0]},
+                            attrs={OP_ROLE_KEY: OpRole.Backward})
 
         insert_op_num = 1
         ring_id = self.global_ring_id
 
         # NOTE: can perform fuse allreduce inside the loop in the future
         for i, (p, g) in enumerate(zip(param_vars, grad_vars)):
-            gm_block._insert_op(
-                first_optimize_op_idx + insert_op_num,
-                type="c_allreduce_sum",
-                inputs={'X': g},
-                outputs={'Out': g},
-                attrs={
-                    'ring_id': ring_id,
-                    OP_ROLE_KEY: OpRole.Backward,
-                })
+            gm_block._insert_op(first_optimize_op_idx + insert_op_num,
+                                type="c_allreduce_sum",
+                                inputs={'X': g},
+                                outputs={'Out': g},
+                                attrs={
+                                    'ring_id': ring_id,
+                                    OP_ROLE_KEY: OpRole.Backward,
+                                })
             insert_op_num += 1
 
-        gm_block._insert_op(
-            first_optimize_op_idx + insert_op_num,
-            type="c_sync_comm_stream",
-            inputs={'X': grad_vars},
-            outputs={'Out': grad_vars},
-            attrs={
-                'ring_id': ring_id,
-                OP_ROLE_KEY: OpRole.Backward,
-            })
+        gm_block._insert_op(first_optimize_op_idx + insert_op_num,
+                            type="c_sync_comm_stream",
+                            inputs={'X': grad_vars},
+                            outputs={'Out': grad_vars},
+                            attrs={
+                                'ring_id': ring_id,
+                                OP_ROLE_KEY: OpRole.Backward,
+                            })
 
     def _transpile_main_program(self, loss):
         self._insert_loss_grad_ops(loss)
@@ -245,15 +248,14 @@ def _insert_loss_grad_ops(self, loss):
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
-                block._insert_op(
-                    idx + 1,
-                    type='scale',
-                    inputs={'X': loss_grad_var},
-                    outputs={'Out': loss_grad_var},
-                    attrs={
-                        'scale': 1.0 / self.nranks,
-                        OP_ROLE_KEY: OpRole.Backward
-                    })
+                block._insert_op(idx + 1,
+                                 type='scale',
+                                 inputs={'X': loss_grad_var},
+                                 outputs={'Out': loss_grad_var},
+                                 attrs={
+                                     'scale': 1.0 / self.nranks,
+                                     OP_ROLE_KEY: OpRole.Backward
+                                 })
 
     def _insert_allreduce_ops(self):
         block = self.main_program.global_block()
@@ -277,35 +279,36 @@ def _insert_allreduce_ops(self):
                         continue
 
                     grad_vars.append(grad)
-                    block._insert_op(
-                        idx + offset,
-                        type='c_sync_calc_stream',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={OP_ROLE_KEY: OpRole.Backward, })
+                    block._insert_op(idx + offset,
+                                     type='c_sync_calc_stream',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         OP_ROLE_KEY: OpRole.Backward,
+                                     })
                     offset += 1
-                    block._insert_op(
-                        idx + offset,
-                        type='c_allreduce_sum',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Backward
-                        })
+                    block._insert_op(idx + offset,
+                                     type='c_allreduce_sum',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         OP_ROLE_KEY: OpRole.Backward
+                                     })
 
         if grad is None:
             return
 
         for idx, op in enumerate(block.ops):
             if is_optimizer_op(op):
-                block._insert_op(
-                    idx,
-                    type='c_sync_comm_stream',
-                    inputs={'X': grad_vars},
-                    outputs={'Out': grad_vars},
-                    attrs={'ring_id': ring_id,
-                           OP_ROLE_KEY: OpRole.Backward})
+                block._insert_op(idx,
+                                 type='c_sync_comm_stream',
+                                 inputs={'X': grad_vars},
+                                 outputs={'Out': grad_vars},
+                                 attrs={
+                                     'ring_id': ring_id,
+                                     OP_ROLE_KEY: OpRole.Backward
+                                 })
                 break
 
     # This function helps reduce the number of allreduce by integrating op, which can save communication time.
@@ -342,8 +345,8 @@ def _allreduce_fusion_program(self):
                         continue
                     param_grads.append((param, grad))
 
-        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
-                                                            block)
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(
+            first_backward_idx, block)
 
         # structure of grad_param_segments is
         # [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])]
@@ -371,24 +374,23 @@ def _allreduce_fusion_program(self):
             # not to use reversed since needs the absolute index value
             grad_segment, param_segment = grad_param_segments[i]
             # insert coalesce tensor
-            fused_var = block.create_var(
-                name=unique_name.generate('FusedOutput_{}'.format(grad_segment[
-                    0].name)),
-                dtype=grad_segment[0].dtype,
-                persistable=False,
-                stop_gradient=True)
+            fused_var = block.create_var(name=unique_name.generate(
+                'FusedOutput_{}'.format(grad_segment[0].name)),
+                                         dtype=grad_segment[0].dtype,
+                                         persistable=False,
+                                         stop_gradient=True)
             fused_vars[i] = fused_var
             after_idx = outputs_name_to_idx[grad_segment[-1]][1]
-            block._insert_op_without_sync(
-                after_idx + 1,
-                type='c_allreduce_sum',
-                inputs={'X': fused_var},
-                outputs={'Out': fused_var},
-                attrs={
-                    'ring_id': ring_id,
-                    'use_calc_stream': self.calc_comm_same_stream,
-                    OP_ROLE_KEY: OpRole.Backward
-                })
+            block._insert_op_without_sync(after_idx + 1,
+                                          type='c_allreduce_sum',
+                                          inputs={'X': fused_var},
+                                          outputs={'Out': fused_var},
+                                          attrs={
+                                              'ring_id': ring_id,
+                                              'use_calc_stream':
+                                              self.calc_comm_same_stream,
+                                              OP_ROLE_KEY: OpRole.Backward
+                                          })
             if not self.calc_comm_same_stream:
                 block._insert_op_without_sync(
                     after_idx + 1,
@@ -398,8 +400,8 @@ def _allreduce_fusion_program(self):
                     attrs={OP_ROLE_KEY: OpRole.Backward})
 
         # update the outputs_name_to_idx after insertion of sync/allreduce ops
-        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
-                                                            block)
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(
+            first_backward_idx, block)
         # the before_idx is not guaranteed sorted, therefore we have to find the
         # topology to insert the coalesce ops
         pos_for_coalesce = {}
@@ -413,25 +415,25 @@ def _allreduce_fusion_program(self):
             pos_for_coalesce[i] = before_idx
 
         # insert the coalesce op based on the sorted before_idx
-        pos_for_coalesce = sorted(
-            pos_for_coalesce.items(),
-            key=lambda kv: (kv[1], kv[0]),
-            reverse=True)
+        pos_for_coalesce = sorted(pos_for_coalesce.items(),
+                                  key=lambda kv: (kv[1], kv[0]),
+                                  reverse=True)
         for i, before_idx in pos_for_coalesce:
             grad_segment, param_segment = grad_param_segments[i]
             fused_var = fused_vars[i]
-            block._insert_op_without_sync(
-                before_idx,
-                type="coalesce_tensor",
-                inputs={"Input": param_segment},
-                outputs={"Output": grad_segment,
-                         "FusedOutput": fused_var},
-                attrs={
-                    "copy_data": False,
-                    "use_align": True,
-                    "dtype": grad_segment[0].dtype,
-                    OP_ROLE_KEY: OpRole.Backward
-                })
+            block._insert_op_without_sync(before_idx,
+                                          type="coalesce_tensor",
+                                          inputs={"Input": param_segment},
+                                          outputs={
+                                              "Output": grad_segment,
+                                              "FusedOutput": fused_var
+                                          },
+                                          attrs={
+                                              "copy_data": False,
+                                              "use_align": True,
+                                              "dtype": grad_segment[0].dtype,
+                                              OP_ROLE_KEY: OpRole.Backward
+                                          })
 
         if self.calc_comm_same_stream:
             block._sync_with_cpp()
@@ -440,13 +442,14 @@ def _allreduce_fusion_program(self):
         # insert the sync comm op
         for idx, op in enumerate(block.ops):
             if is_optimizer_op(op):
-                block._insert_op_without_sync(
-                    idx,
-                    type='c_sync_comm_stream',
-                    inputs={'X': fused_vars},
-                    outputs={'Out': fused_vars},
-                    attrs={'ring_id': ring_id,
-                           OP_ROLE_KEY: OpRole.Backward})
+                block._insert_op_without_sync(idx,
+                                              type='c_sync_comm_stream',
+                                              inputs={'X': fused_vars},
+                                              outputs={'Out': fused_vars},
+                                              attrs={
+                                                  'ring_id': ring_id,
+                                                  OP_ROLE_KEY: OpRole.Backward
+                                              })
                 break
         block._sync_with_cpp()
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index d79675448c042..c9054c793f491 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -18,6 +18,7 @@
 
 
 class RecomputeOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(RecomputeOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -33,8 +34,9 @@ def __init__(self, optimizer):
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(RecomputeOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(RecomputeOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
 
     def _init_wrapped_opt(self):
         if self.wrapped_opt is not None:
@@ -54,8 +56,8 @@ def _can_apply(self):
             return False
 
         if self.user_defined_strategy.recompute == True:
-            if len(self.user_defined_strategy.recompute_configs[
-                    "checkpoints"]) == 0:
+            if len(self.user_defined_strategy.recompute_configs["checkpoints"]
+                   ) == 0:
                 return False
             else:
                 return True
@@ -83,8 +85,9 @@ def apply_gradients(self, params_grads):
         return self.wrapped_opt.apply_gradients(params_grads=params_grads)
 
     def apply_optimize(self, loss, startup_program, params_grads):
-        return self.wrapped_opt.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        return self.wrapped_opt.apply_optimize(loss,
+                                               startup_program=startup_program,
+                                               params_grads=params_grads)
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py
index 5d358dbd35fa8..abf198b97e6e8 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py
index c5b2d9227bc16..9e3537a3ced2d 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py
@@ -21,6 +21,7 @@
 
 
 class FP16Utils(object):
+
     def __init__(self):
         pass
 
@@ -32,8 +33,8 @@ def is_fp16_cast_op(block, op, params):
             return False
         assert (len(op.desc.input_arg_names()) == 1)
         assert (len(op.desc.output_arg_names()) == 1)
-        input_name, output_name = op.desc.input_arg_names()[
-            0], op.desc.output_arg_names()[0]
+        input_name, output_name = op.desc.input_arg_names(
+        )[0], op.desc.output_arg_names()[0]
         if input_name not in params:
             return False
         input_var = block.var(input_name)
@@ -51,8 +52,8 @@ def is_fp32_cast_op(block, op):
             return False
         assert (len(op.desc.input_arg_names()) == 1)
         assert (len(op.desc.output_arg_names()) == 1)
-        input_name, output_name = op.desc.input_arg_names()[
-            0], op.desc.output_arg_names()[0]
+        input_name, output_name = op.desc.input_arg_names(
+        )[0], op.desc.output_arg_names()[0]
         input_var = block.var(input_name)
         output_var = block.var(output_name)
         if input_var.dtype != core.VarDesc.VarType.FP16 or \
@@ -88,9 +89,9 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_ids):
                 "@GRAD@MERGED"
             ) if "@MERGED" in output_name else output_name.strip("@GRAD")
             if param_name not in shard.global_params:
-                raise ValueError("Output 'X' of cast_op must be a grad of"
-                                 "model param, but {} is not a grad".format(
-                                     output_name))
+                raise ValueError(
+                    "Output 'X' of cast_op must be a grad of"
+                    "model param, but {} is not a grad".format(output_name))
             if output_name in reduced_grads_to_param:
                 continue
             if shard.has_param(param_name):
@@ -137,49 +138,45 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_ids):
         if update_loss_scaling_op_idx == -1:
             return
         inf_var = block.var(inf_var_name)
-        inf_var_int32 = block.create_var(
-            name=inf_var_name + "@cast_int32",
-            shape=inf_var.shape,
-            dtype=core.VarDesc.VarType.INT32)
-
-        block._insert_op_without_sync(
-            update_loss_scaling_op_idx,
-            type='cast',
-            inputs={'X': inf_var},
-            outputs={'Out': inf_var_int32},
-            attrs={
-                "in_dtype": inf_var.dtype,
-                "out_dtype": inf_var_int32.dtype,
-                OP_ROLE_KEY: OpRole.Optimize
-            })
+        inf_var_int32 = block.create_var(name=inf_var_name + "@cast_int32",
+                                         shape=inf_var.shape,
+                                         dtype=core.VarDesc.VarType.INT32)
+
+        block._insert_op_without_sync(update_loss_scaling_op_idx,
+                                      type='cast',
+                                      inputs={'X': inf_var},
+                                      outputs={'Out': inf_var_int32},
+                                      attrs={
+                                          "in_dtype": inf_var.dtype,
+                                          "out_dtype": inf_var_int32.dtype,
+                                          OP_ROLE_KEY: OpRole.Optimize
+                                      })
         update_loss_scaling_op_idx += 1
 
         # allreduce(mp)->allreduce(sharding)->allreduce(pp)
         for ring_id in ring_ids:
             if ring_id == -1: continue
             # this allreduce communication should not overlap with calc
-            block._insert_op_without_sync(
-                update_loss_scaling_op_idx,
-                type='c_allreduce_max',
-                inputs={'X': inf_var_int32},
-                outputs={'Out': inf_var_int32},
-                attrs={
-                    'ring_id': ring_id,
-                    'use_calc_stream': True,
-                    OP_ROLE_KEY: OpRole.Optimize
-                })
+            block._insert_op_without_sync(update_loss_scaling_op_idx,
+                                          type='c_allreduce_max',
+                                          inputs={'X': inf_var_int32},
+                                          outputs={'Out': inf_var_int32},
+                                          attrs={
+                                              'ring_id': ring_id,
+                                              'use_calc_stream': True,
+                                              OP_ROLE_KEY: OpRole.Optimize
+                                          })
             update_loss_scaling_op_idx += 1
 
-        block._insert_op_without_sync(
-            update_loss_scaling_op_idx,
-            type='cast',
-            inputs={'X': inf_var_int32},
-            outputs={'Out': inf_var},
-            attrs={
-                "in_dtype": inf_var_int32.dtype,
-                "out_dtype": inf_var.dtype,
-                OP_ROLE_KEY: OpRole.Optimize
-            })
+        block._insert_op_without_sync(update_loss_scaling_op_idx,
+                                      type='cast',
+                                      inputs={'X': inf_var_int32},
+                                      outputs={'Out': inf_var},
+                                      attrs={
+                                          "in_dtype": inf_var_int32.dtype,
+                                          "out_dtype": inf_var.dtype,
+                                          OP_ROLE_KEY: OpRole.Optimize
+                                      })
         update_loss_scaling_op_idx += 1
         block._sync_with_cpp()
 
@@ -201,46 +198,42 @@ def sync_amp_check_nan_inf(block, ring_ids):
         # 1. inf_var_int32 = allreduce_max(inf_var_int32)
         # 3. inf_var = cast(inf_var_int32)
         inf_var = block.var(inf_var_name)
-        inf_var_int32 = block.create_var(
-            name=inf_var_name + "@cast_int32",
-            shape=inf_var.shape,
-            dtype=core.VarDesc.VarType.INT32)
-        block._insert_op_without_sync(
-            update_loss_scaling_op_idx,
-            type='cast',
-            inputs={'X': inf_var},
-            outputs={'Out': inf_var_int32},
-            attrs={
-                "in_dtype": inf_var.dtype,
-                "out_dtype": inf_var_int32.dtype,
-                OP_ROLE_KEY: OpRole.Optimize
-            })
+        inf_var_int32 = block.create_var(name=inf_var_name + "@cast_int32",
+                                         shape=inf_var.shape,
+                                         dtype=core.VarDesc.VarType.INT32)
+        block._insert_op_without_sync(update_loss_scaling_op_idx,
+                                      type='cast',
+                                      inputs={'X': inf_var},
+                                      outputs={'Out': inf_var_int32},
+                                      attrs={
+                                          "in_dtype": inf_var.dtype,
+                                          "out_dtype": inf_var_int32.dtype,
+                                          OP_ROLE_KEY: OpRole.Optimize
+                                      })
         update_loss_scaling_op_idx += 1
 
         # allreduce(mp)->allreduce(pp)
         for ring_id in ring_ids:
             if ring_id == -1: continue
-            block._insert_op_without_sync(
-                update_loss_scaling_op_idx,
-                type='c_allreduce_max',
-                inputs={'X': inf_var_int32},
-                outputs={'Out': inf_var_int32},
-                attrs={
-                    'ring_id': ring_id,
-                    'use_calc_stream': True,
-                    OP_ROLE_KEY: OpRole.Optimize
-                })
+            block._insert_op_without_sync(update_loss_scaling_op_idx,
+                                          type='c_allreduce_max',
+                                          inputs={'X': inf_var_int32},
+                                          outputs={'Out': inf_var_int32},
+                                          attrs={
+                                              'ring_id': ring_id,
+                                              'use_calc_stream': True,
+                                              OP_ROLE_KEY: OpRole.Optimize
+                                          })
             update_loss_scaling_op_idx += 1
 
-        block._insert_op_without_sync(
-            update_loss_scaling_op_idx,
-            type='cast',
-            inputs={'X': inf_var_int32},
-            outputs={'Out': inf_var},
-            attrs={
-                "in_dtype": inf_var_int32.dtype,
-                "out_dtype": inf_var.dtype,
-                OP_ROLE_KEY: OpRole.Optimize
-            })
+        block._insert_op_without_sync(update_loss_scaling_op_idx,
+                                      type='cast',
+                                      inputs={'X': inf_var_int32},
+                                      outputs={'Out': inf_var},
+                                      attrs={
+                                          "in_dtype": inf_var_int32.dtype,
+                                          "out_dtype": inf_var.dtype,
+                                          OP_ROLE_KEY: OpRole.Optimize
+                                      })
         update_loss_scaling_op_idx += 1
         block._sync_with_cpp()
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
index 5d28c2d5cebd9..03d955842f5fc 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
@@ -18,6 +18,7 @@
 
 
 class GradientClipHelper(object):
+
     def __init__(self, mp_ring_id):
         self.mp_ring_id = mp_ring_id
 
@@ -95,17 +96,20 @@ def prune_gradient_clip(self, block, shard, ring_ids):
                     namescope = op.attr("op_namescope")
 
                     block._remove_op(idx, sync=False)
-                    op = block._insert_op_without_sync(
-                        idx,
-                        type='fill_constant',
-                        inputs={},
-                        outputs={'Out': sum_res},
-                        attrs={
-                            'shape': sum_var.shape,
-                            'dtype': sum_var.dtype,
-                            'value': 0.0,
-                            OP_ROLE_KEY: OpRole.Optimize
-                        })
+                    op = block._insert_op_without_sync(idx,
+                                                       type='fill_constant',
+                                                       inputs={},
+                                                       outputs={'Out': sum_res},
+                                                       attrs={
+                                                           'shape':
+                                                           sum_var.shape,
+                                                           'dtype':
+                                                           sum_var.dtype,
+                                                           'value':
+                                                           0.0,
+                                                           OP_ROLE_KEY:
+                                                           OpRole.Optimize
+                                                       })
                     op._set_attr('op_namescope', namescope)
 
                 # allreduce(mp)->allreduce(sharding)->allreduce(pp)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py
index 7b47cb6d2637c..9479dc5fceee2 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py
@@ -58,23 +58,21 @@ def __init__(self, mp_ring_id=None, dp_ring_id=None):
     def _insert_cast_op(self, block, idx, src_name, dst_name):
         src_var = block.var(src_name)
         if not block.has_var(dst_name):
-            block.create_var(
-                name=dst_name,
-                shape=src_var.shape,
-                dtype=core.VarDesc.VarType.FP16,
-                persistable=True)
+            block.create_var(name=dst_name,
+                             shape=src_var.shape,
+                             dtype=core.VarDesc.VarType.FP16,
+                             persistable=True)
         dst_var = block.var(dst_name)
         assert dst_var.dtype == core.VarDesc.VarType.FP16
-        block._insert_op_without_sync(
-            idx,
-            type='cast',
-            inputs={'X': src_var},
-            outputs={'Out': dst_var},
-            attrs={
-                'in_dtype': src_var.dtype,
-                'out_dtype': dst_var.dtype,
-                OP_ROLE_KEY: OpRole.Optimize
-            })
+        block._insert_op_without_sync(idx,
+                                      type='cast',
+                                      inputs={'X': src_var},
+                                      outputs={'Out': dst_var},
+                                      attrs={
+                                          'in_dtype': src_var.dtype,
+                                          'out_dtype': dst_var.dtype,
+                                          OP_ROLE_KEY: OpRole.Optimize
+                                      })
 
     def _insert_broadcast_op(self, block, idx, param_name):
         rings = []
@@ -90,30 +88,28 @@ def _insert_broadcast_op(self, block, idx, param_name):
 
         # the insert op order is: mp, dp
         for ring in rings:
-            block._insert_op_without_sync(
-                idx,
-                type="c_broadcast",
-                inputs={'X': param_name},
-                outputs={'Out': param_name},
-                attrs={
-                    'ring_id': ring,
-                    'root': 0,
-                    'use_calc_stream': True,
-                    OP_ROLE_KEY: OpRole.Forward,
-                })
+            block._insert_op_without_sync(idx,
+                                          type="c_broadcast",
+                                          inputs={'X': param_name},
+                                          outputs={'Out': param_name},
+                                          attrs={
+                                              'ring_id': ring,
+                                              'root': 0,
+                                              'use_calc_stream': True,
+                                              OP_ROLE_KEY: OpRole.Forward,
+                                          })
 
     def _insert_memcpy_op(self, block, idx, src_name, dst_name, dst_place_type):
         src_var = block.var(src_name)
         dst_var = block.var(dst_name)
-        block._insert_op_without_sync(
-            idx,
-            type='memcpy',
-            inputs={'X': src_var},
-            outputs={'Out': dst_var},
-            attrs={
-                'dst_place_type': dst_place_type,
-                OP_ROLE_KEY: OpRole.Optimize,
-            })
+        block._insert_op_without_sync(idx,
+                                      type='memcpy',
+                                      inputs={'X': src_var},
+                                      outputs={'Out': dst_var},
+                                      attrs={
+                                          'dst_place_type': dst_place_type,
+                                          OP_ROLE_KEY: OpRole.Optimize,
+                                      })
 
     def _insert_fetch_op(self, block, idx, src_name, dst_name):
         self._insert_memcpy_op(block, idx, src_name, dst_name,
@@ -130,11 +126,10 @@ def _create_offload_var(self, var_name, offload_var_name, blocks):
         for block in blocks:
             var = block.var(var_name)
             var.persistable = False
-            offload_var = block.create_var(
-                name=offload_var_name,
-                shape=var.shape,
-                dtype=var.dtype,
-                persistable=True)
+            offload_var = block.create_var(name=offload_var_name,
+                                           shape=var.shape,
+                                           dtype=var.dtype,
+                                           persistable=True)
 
     def offload_fp32param(self, block, startup_block, offload=True):
         """
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py
index 9e577ca0c670a..adbc00f25deb6 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py
@@ -16,6 +16,7 @@
 
 
 class ProgramDeps(object):
+
     def __init__(self, block, start_vars, end_vars):
         self._block = block
         # vars where to start to build the deps
@@ -92,8 +93,8 @@ def crop_input_var_from_op(self, op_idx, var_name):
                     raise ValueError(
                         "op_idx: {} is not in self._var_to_use_op[{}], "
                         "self._var_to_use_op[{}] is {}".format(
-                            op_idx, var_name, var_name, self._var_to_use_op[
-                                var_name]))
+                            op_idx, var_name, var_name,
+                            self._var_to_use_op[var_name]))
                 self._var_to_use_op[var_name].remove(op_idx)
             # update _should_removed_var
             if var_name in self._start_vars:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py
index 52dfed83d33c4..7002dfa2be514 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py
@@ -20,6 +20,7 @@
 
 
 class Shard(object):
+
     def __init__(self, ):
         self.global_params = set([])
         self.worker_idx = -1
@@ -30,7 +31,7 @@ def __init__(self, ):
     def setup(self, params_grads, worker_idx, worker_num):
         # param names of all devices
         self.global_params = set([x[0].name for x in params_grads])
-        # _param(str) -> device_id(int) 
+        # _param(str) -> device_id(int)
         self.worker_idx = worker_idx
         self.worker_num = worker_num
         # global_param2device contains fp32 params and fp16 params
@@ -138,6 +139,7 @@ def filter_grads(self, grads):
 
 
 class ProgramSegment(object):
+
     def __init__(self, block):
         self._block = block
         self._allreduce_vars = []
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
index 1a3a8a4883d8b..39f71be0cde76 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
@@ -38,10 +38,11 @@ def check_broadcast(block):
                 var_name = op.desc.input_arg_names()[0]
                 if "@BroadCast" in var_name:
                     if var_name in broadcast_vars:
-                        raise ValueError("var_name areadly exist: {}"
-                                         "the old pos is {}, the new pos is {}".
-                                         format(var_name, broadcast_vars[
-                                             var_name]["broadcast_pos"], idx))
+                        raise ValueError(
+                            "var_name areadly exist: {}"
+                            "the old pos is {}, the new pos is {}".format(
+                                var_name,
+                                broadcast_vars[var_name]["broadcast_pos"], idx))
                     broadcast_vars[var_name] = {
                         "fill_constant_pos": -1,
                         "broadcast_pos": idx,
@@ -149,9 +150,9 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1):
                     else:
                         _status = dp_grads_status[var_name]
                     if _status == -1:
-                        raise ValueError("{} is not generated, but you are"
-                                         "trying to all-reduce it".format(
-                                             var_name))
+                        raise ValueError(
+                            "{} is not generated, but you are"
+                            "trying to all-reduce it".format(var_name))
                     if _status == 0:
                         raise ValueError("There should be a sync_calc op "
                                          "after generate Var: {} and before the"
@@ -190,18 +191,19 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1):
             for input_name in op.desc.input_arg_names():
                 if input_name in vars_status:
                     if vars_status[input_name] != 3:
-                        raise ValueError("There should be a sync_comm op "
-                                         "after allreduce the Var: {}".format(
-                                             input_name))
+                        raise ValueError(
+                            "There should be a sync_comm op "
+                            "after allreduce the Var: {}".format(input_name))
                     raise ValueError(
-                        "The reduce output grad [{}] should NOT be be used in Non-root rank.".
-                        format(input_name))
+                        "The reduce output grad [{}] should NOT be be used in Non-root rank."
+                        .format(input_name))
                 if input_name in dp_grads_status:
                     if dp_ring_id == -1:
                         if dp_grads_status[input_name] != 3:
-                            raise ValueError("There should be a sync_comm op "
-                                             "after allreduce the Var: {}".
-                                             format(input_name))
+                            raise ValueError(
+                                "There should be a sync_comm op "
+                                "after allreduce the Var: {}".format(
+                                    input_name))
                     else:
                         if dp_grads_status[input_name] != 5:
                             raise ValueError(
@@ -232,8 +234,9 @@ def get_valid_op_role(block, insert_idx):
     return OpRole.Forward or OpRole.Backward
     """
     op_role = block.ops[insert_idx].attr('op_role')
-    if (insert_idx >= len(block.ops)) or (
-            op_role in [int(OpRole.Backward), int(OpRole.Optimize)]):
+    if (insert_idx >= len(block.ops)) or (op_role in [
+            int(OpRole.Backward), int(OpRole.Optimize)
+    ]):
         return OpRole.Backward
     if op_role in [int(OpRole.Forward), int(OpRole.Loss)]:
         return OpRole.Forward
@@ -246,12 +249,11 @@ def insert_sync_calc_op(block, insert_idx, calc_dep_vars):
     _insert_sync_calc_op
     """
     op_role = get_valid_op_role(block, insert_idx)
-    block._insert_op_without_sync(
-        insert_idx,
-        type='c_sync_calc_stream',
-        inputs={'X': calc_dep_vars},
-        outputs={'Out': calc_dep_vars},
-        attrs={OP_ROLE_KEY: op_role})
+    block._insert_op_without_sync(insert_idx,
+                                  type='c_sync_calc_stream',
+                                  inputs={'X': calc_dep_vars},
+                                  outputs={'Out': calc_dep_vars},
+                                  attrs={OP_ROLE_KEY: op_role})
     return
 
 
@@ -260,13 +262,14 @@ def insert_sync_comm_op(block, insert_idx, ring_id, comm_dep_vars):
     insert sync_comm_op for single var
     """
     op_role = get_valid_op_role(block, insert_idx)
-    block._insert_op_without_sync(
-        insert_idx,
-        type='c_sync_comm_stream',
-        inputs={'X': comm_dep_vars},
-        outputs={'Out': comm_dep_vars},
-        attrs={'ring_id': ring_id,
-               OP_ROLE_KEY: op_role})
+    block._insert_op_without_sync(insert_idx,
+                                  type='c_sync_comm_stream',
+                                  inputs={'X': comm_dep_vars},
+                                  outputs={'Out': comm_dep_vars},
+                                  attrs={
+                                      'ring_id': ring_id,
+                                      OP_ROLE_KEY: op_role
+                                  })
     return 1
 
 
@@ -274,18 +277,19 @@ def insert_sync_comm_ops(block, insert_idx, ring_id, comm_dep_vars):
     """
     insert sync_comm_op for vars
     """
-    # NOTE (JZ-LIANG) to be check, may result undefined case 
+    # NOTE (JZ-LIANG) to be checked, may result in an undefined case
     if len(comm_dep_vars) == 0:
         return 0
 
     op_role = get_valid_op_role(block, insert_idx)
-    block._insert_op_without_sync(
-        insert_idx,
-        type='c_sync_comm_stream',
-        inputs={'X': comm_dep_vars},
-        outputs={'Out': comm_dep_vars},
-        attrs={'ring_id': int(ring_id),
-               OP_ROLE_KEY: op_role})
+    block._insert_op_without_sync(insert_idx,
+                                  type='c_sync_comm_stream',
+                                  inputs={'X': comm_dep_vars},
+                                  outputs={'Out': comm_dep_vars},
+                                  attrs={
+                                      'ring_id': int(ring_id),
+                                      OP_ROLE_KEY: op_role
+                                  })
     return 1
 
 
@@ -296,16 +300,15 @@ def insert_fill_constant_ops(block, insert_idx, fill_constant_vars):
     op_role = get_valid_op_role(block, insert_idx)
     for broadcast_name in fill_constant_vars:
         broadcast_var = block.var(broadcast_name)
-        block._insert_op_without_sync(
-            insert_idx,
-            type="fill_constant",
-            outputs={"Out": broadcast_var.name},
-            attrs={
-                "shape": broadcast_var.shape,
-                "dtype": broadcast_var.dtype,
-                "value": 0.0,
-                OP_ROLE_KEY: op_role
-            })
+        block._insert_op_without_sync(insert_idx,
+                                      type="fill_constant",
+                                      outputs={"Out": broadcast_var.name},
+                                      attrs={
+                                          "shape": broadcast_var.shape,
+                                          "dtype": broadcast_var.dtype,
+                                          "value": 0.0,
+                                          OP_ROLE_KEY: op_role
+                                      })
     return
 
 
@@ -315,16 +318,16 @@ def insert_cast_ops(block, insert_idx, cast_ops):
     """
     op_role = get_valid_op_role(block, insert_idx)
     for fp16_name, fp32_name in cast_ops.items():
-        block._insert_op_without_sync(
-            insert_idx,
-            type="cast",
-            inputs={"X": fp32_name},
-            outputs={"Out": fp16_name},
-            attrs={
-                "in_dtype": core.VarDesc.VarType.FP32,
-                "out_dtype": core.VarDesc.VarType.FP16,
-                OP_ROLE_KEY: op_role
-            })
+        block._insert_op_without_sync(insert_idx,
+                                      type="cast",
+                                      inputs={"X": fp32_name},
+                                      outputs={"Out": fp16_name},
+                                      attrs={
+                                          "in_dtype": core.VarDesc.VarType.FP32,
+                                          "out_dtype":
+                                          core.VarDesc.VarType.FP16,
+                                          OP_ROLE_KEY: op_role
+                                      })
     return
 
 
@@ -351,21 +354,22 @@ def insert_allreduce_ops(block,
                                    user_defined_strategy.fuse_grad_size_in_MB)
     else:
         for var in allreduce_vars:
-            block._insert_op_without_sync(
-                insert_idx,
-                type='c_allreduce_sum',
-                inputs={'X': var},
-                outputs={'Out': var},
-                attrs={
-                    'ring_id': ring_id,
-                    'use_calc_stream': use_calc_stream,
-                    OP_ROLE_KEY: op_role
-                })
+            block._insert_op_without_sync(insert_idx,
+                                          type='c_allreduce_sum',
+                                          inputs={'X': var},
+                                          outputs={'Out': var},
+                                          attrs={
+                                              'ring_id': ring_id,
+                                              'use_calc_stream':
+                                              use_calc_stream,
+                                              OP_ROLE_KEY: op_role
+                                          })
 
     return
 
 
 class FuseHelper(object):
+
     @staticmethod
     def sort_vars_by_dtype(block, vars_name):
         fp32_vars = []
@@ -419,25 +423,25 @@ def insert_coalesce_tensor(block,
                 fused_vars.append(group[0])
                 continue
 
-            fused_var = block.create_var(
-                name=unique_name.generate('Fused{}_{}'.format(prefix, group[0]
-                                                              .name)),
-                dtype=group[0].dtype,
-                persistable=False,
-                stop_gradient=True)
+            fused_var = block.create_var(name=unique_name.generate(
+                'Fused{}_{}'.format(prefix, group[0].name)),
+                                         dtype=group[0].dtype,
+                                         persistable=False,
+                                         stop_gradient=True)
             fused_vars.append(fused_var)
-            block._insert_op_without_sync(
-                index,
-                type="coalesce_tensor",
-                inputs={"Input": group},
-                outputs={"Output": group,
-                         "FusedOutput": fused_var},
-                attrs={
-                    "copy_data": True,
-                    "use_align": True,
-                    "dtype": group[0].dtype,
-                    OP_ROLE_KEY: op_role
-                })
+            block._insert_op_without_sync(index,
+                                          type="coalesce_tensor",
+                                          inputs={"Input": group},
+                                          outputs={
+                                              "Output": group,
+                                              "FusedOutput": fused_var
+                                          },
+                                          attrs={
+                                              "copy_data": True,
+                                              "use_align": True,
+                                              "dtype": group[0].dtype,
+                                              OP_ROLE_KEY: op_role
+                                          })
             insert_num += 1
         return fused_vars, insert_num
 
@@ -452,27 +456,28 @@ def insert_fused_allreduce_ops(block,
     groups = FuseHelper.get_fused_groups(block, allreduce_vars,
                                          fuse_grad_size_in_MB)
 
-    fused_vars, insert_num = FuseHelper.insert_coalesce_tensor(
-        block, insert_idx, groups, op_role, prefix="Grad")
+    fused_vars, insert_num = FuseHelper.insert_coalesce_tensor(block,
+                                                               insert_idx,
+                                                               groups,
+                                                               op_role,
+                                                               prefix="Grad")
 
     for fused_var in fused_vars:
-        block._insert_op_without_sync(
-            insert_idx + insert_num,
-            type='c_allreduce_sum',
-            inputs={'X': fused_var},
-            outputs={'Out': fused_var},
-            attrs={
-                'ring_id': ring_id,
-                'use_calc_stream': use_calc_stream,
-                OP_ROLE_KEY: op_role
-            })
+        block._insert_op_without_sync(insert_idx + insert_num,
+                                      type='c_allreduce_sum',
+                                      inputs={'X': fused_var},
+                                      outputs={'Out': fused_var},
+                                      attrs={
+                                          'ring_id': ring_id,
+                                          'use_calc_stream': use_calc_stream,
+                                          OP_ROLE_KEY: op_role
+                                      })
         if not use_calc_stream:
-            block._insert_op_without_sync(
-                insert_idx + insert_num,
-                type='c_sync_calc_stream',
-                inputs={'X': fused_var},
-                outputs={'Out': fused_var},
-                attrs={OP_ROLE_KEY: op_role})
+            block._insert_op_without_sync(insert_idx + insert_num,
+                                          type='c_sync_calc_stream',
+                                          inputs={'X': fused_var},
+                                          outputs={'Out': fused_var},
+                                          attrs={OP_ROLE_KEY: op_role})
 
 
 def insert_fused_reduce_ops(block,
@@ -501,24 +506,23 @@ def insert_fused_reduce_ops(block,
             block, insert_idx, groups, op_role, prefix="Grad")
 
         for fused_var in fused_vars:
-            block._insert_op_without_sync(
-                insert_idx + insert_num,
-                type='c_reduce_sum',
-                inputs={'X': fused_var},
-                outputs={'Out': fused_var},
-                attrs={
-                    'ring_id': ring_id,
-                    'root_id': root_id,
-                    'use_calc_stream': use_calc_stream,
-                    OP_ROLE_KEY: op_role
-                })
+            block._insert_op_without_sync(insert_idx + insert_num,
+                                          type='c_reduce_sum',
+                                          inputs={'X': fused_var},
+                                          outputs={'Out': fused_var},
+                                          attrs={
+                                              'ring_id': ring_id,
+                                              'root_id': root_id,
+                                              'use_calc_stream':
+                                              use_calc_stream,
+                                              OP_ROLE_KEY: op_role
+                                          })
             if not use_calc_stream:
-                block._insert_op_without_sync(
-                    insert_idx + insert_num,
-                    type='c_sync_calc_stream',
-                    inputs={'X': fused_var},
-                    outputs={'Out': fused_var},
-                    attrs={OP_ROLE_KEY: op_role})
+                block._insert_op_without_sync(insert_idx + insert_num,
+                                              type='c_sync_calc_stream',
+                                              inputs={'X': fused_var},
+                                              outputs={'Out': fused_var},
+                                              attrs={OP_ROLE_KEY: op_role})
 
     return [] if rank is None else device_to_vars[rank]
 
@@ -554,17 +558,16 @@ def insert_reduce_ops(block,
             root_id)
         if rank is not None and rank == root_id:
             grad_in_this_device.append(var)
-        block._insert_op_without_sync(
-            insert_idx,
-            type='c_reduce_sum',
-            inputs={'X': var},
-            outputs={'Out': var},
-            attrs={
-                'ring_id': ring_id,
-                'root_id': root_id,
-                'use_calc_stream': use_calc_stream,
-                OP_ROLE_KEY: op_role
-            })
+        block._insert_op_without_sync(insert_idx,
+                                      type='c_reduce_sum',
+                                      inputs={'X': var},
+                                      outputs={'Out': var},
+                                      attrs={
+                                          'ring_id': ring_id,
+                                          'root_id': root_id,
+                                          'use_calc_stream': use_calc_stream,
+                                          OP_ROLE_KEY: op_role
+                                      })
 
     return grad_in_this_device
 
@@ -595,24 +598,23 @@ def insert_fused_broadcast_param_ops(block,
             block, insert_idx, groups, op_role, prefix="Param")
 
         for fused_var in fused_vars:
-            block._insert_op_without_sync(
-                insert_idx + insert_num,
-                type='c_broadcast',
-                inputs={'X': fused_var},
-                outputs={'Out': fused_var},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': root_id,
-                    'use_calc_stream': use_calc_stream,
-                    OP_ROLE_KEY: op_role
-                })
+            block._insert_op_without_sync(insert_idx + insert_num,
+                                          type='c_broadcast',
+                                          inputs={'X': fused_var},
+                                          outputs={'Out': fused_var},
+                                          attrs={
+                                              'ring_id': ring_id,
+                                              'root': root_id,
+                                              'use_calc_stream':
+                                              use_calc_stream,
+                                              OP_ROLE_KEY: op_role
+                                          })
             if not use_calc_stream:
-                block._insert_op_without_sync(
-                    insert_idx + insert_num,
-                    type='c_sync_calc_stream',
-                    inputs={'X': fused_var},
-                    outputs={'Out': fused_var},
-                    attrs={OP_ROLE_KEY: op_role})
+                block._insert_op_without_sync(insert_idx + insert_num,
+                                              type='c_sync_calc_stream',
+                                              inputs={'X': fused_var},
+                                              outputs={'Out': fused_var},
+                                              attrs={OP_ROLE_KEY: op_role})
 
     return [] if rank is None else device_to_vars[rank]
 
@@ -631,9 +633,10 @@ def insert_broadcast_param_ops(block,
     """
     if strategy and strategy.fuse_all_reduce_ops:
         # TODO(wangxi): put fused var in startup_program, only need exec once
-        return insert_fused_broadcast_param_ops(
-            block, insert_idx, ring_id, params, shard, op_role, use_calc_stream,
-            rank, strategy.fuse_grad_size_in_MB)
+        return insert_fused_broadcast_param_ops(block, insert_idx, ring_id,
+                                                params, shard, op_role,
+                                                use_calc_stream, rank,
+                                                strategy.fuse_grad_size_in_MB)
 
     param_in_this_device = []
     for param in params:
@@ -642,17 +645,16 @@ def insert_broadcast_param_ops(block,
             root_id)
         if rank is not None and rank == root_id:
             param_in_this_device.append(param)
-        block._insert_op_without_sync(
-            insert_idx,
-            type='c_broadcast',
-            inputs={'X': param},
-            outputs={'Out': param},
-            attrs={
-                'ring_id': ring_id,
-                'root': root_id,
-                'use_calc_stream': use_calc_stream,
-                OP_ROLE_KEY: op_role
-            })
+        block._insert_op_without_sync(insert_idx,
+                                      type='c_broadcast',
+                                      inputs={'X': param},
+                                      outputs={'Out': param},
+                                      attrs={
+                                          'ring_id': ring_id,
+                                          'root': root_id,
+                                          'use_calc_stream': use_calc_stream,
+                                          OP_ROLE_KEY: op_role
+                                      })
 
     return param_in_this_device
 
@@ -690,17 +692,16 @@ def fuse_opt_broadcast_param_ops(block,
             block, insert_idx, groups, op_role, prefix="Param")
 
         for fused_var in fused_vars:
-            block._insert_op_without_sync(
-                insert_idx + insert_num,
-                type='c_broadcast',
-                inputs={'X': fused_var},
-                outputs={'Out': fused_var},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': root_id,
-                    'use_calc_stream': True,
-                    OP_ROLE_KEY: op_role
-                })
+            block._insert_op_without_sync(insert_idx + insert_num,
+                                          type='c_broadcast',
+                                          inputs={'X': fused_var},
+                                          outputs={'Out': fused_var},
+                                          attrs={
+                                              'ring_id': ring_id,
+                                              'root': root_id,
+                                              'use_calc_stream': True,
+                                              OP_ROLE_KEY: op_role
+                                          })
 
     block._sync_with_cpp()
 
@@ -759,16 +760,15 @@ def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root):
     """
     op_role = get_valid_op_role(block, insert_idx)
     for broadcast_name, root_device in broadcast2root:
-        block._insert_op_without_sync(
-            insert_idx,
-            type='c_broadcast',
-            inputs={'X': broadcast_name},
-            outputs={'Out': broadcast_name},
-            attrs={
-                'ring_id': ring_id,
-                'root': root_device,
-                OP_ROLE_KEY: op_role
-            })
+        block._insert_op_without_sync(insert_idx,
+                                      type='c_broadcast',
+                                      inputs={'X': broadcast_name},
+                                      outputs={'Out': broadcast_name},
+                                      attrs={
+                                          'ring_id': ring_id,
+                                          'root': root_device,
+                                          OP_ROLE_KEY: op_role
+                                      })
 
     return
 
@@ -825,8 +825,8 @@ def comm_analyse(main_program):
         if op.type == "c_broadcast":
             var_name = op.desc.input_arg_names()[0]
             # convert MB to KB
-            broadcast_vars[var_name] = get_var_size(block.var(
-                var_name)) * 1024.0
+            broadcast_vars[var_name] = get_var_size(
+                block.var(var_name)) * 1024.0
         elif op.type == "c_allreduce_sum":
             var_name = op.desc.input_arg_names()[0]
             reduce_vars[var_name] = get_var_size(block.var(var_name)) * 1024.0
@@ -877,14 +877,15 @@ def add_sync_comm(program, sharding_ring_id):
             for input_name in op.desc.input_arg_names():
                 not_sync_vars.remove(input_name)
     if not_sync_vars:
-        block.append_op(
-            type='c_sync_comm_stream',
-            inputs={'X': list(not_sync_vars)},
-            outputs={'Out': list(not_sync_vars)},
-            attrs={
-                'ring_id': sharding_ring_id,
-                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
-            })
+        block.append_op(type='c_sync_comm_stream',
+                        inputs={'X': list(not_sync_vars)},
+                        outputs={'Out': list(not_sync_vars)},
+                        attrs={
+                            'ring_id':
+                            sharding_ring_id,
+                            'op_role':
+                            core.op_proto_and_checker_maker.OpRole.Forward
+                        })
     return
 
 
@@ -926,41 +927,39 @@ def sharding_predicate(var):
             var)
 
     if int(os.environ.get('PADDLE_TRAINER_ID', 0)) == 0:
-        paddle.fluid.io.save_persistables(
-            exe, dirname, main_program=main_program, filename=None)
+        paddle.fluid.io.save_persistables(exe,
+                                          dirname,
+                                          main_program=main_program,
+                                          filename=None)
     else:
-        paddle.fluid.io.save_vars(
-            exe,
-            dirname,
-            main_program=main_program,
-            predicate=sharding_predicate,
-            filename=None)
+        paddle.fluid.io.save_vars(exe,
+                                  dirname,
+                                  main_program=main_program,
+                                  predicate=sharding_predicate,
+                                  filename=None)
 
     return
 
 
 def append_naive_sync(block, sync_var, ring_id):
     # NOTE (JZ-LIANG) update this to use barrier sync for more elegent logic
-    # sync within global 
-    block.append_op(
-        type="fill_constant",
-        outputs={"Out": sync_var},
-        attrs={
-            "shape": sync_var.shape,
-            "dtype": sync_var.dtype,
-            "value": int(1),
-        })
-    block.append_op(
-        type='c_allreduce_sum',
-        inputs={'X': sync_var},
-        outputs={'Out': sync_var},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': True,
-            OP_ROLE_KEY: OpRole.Forward
-        })
-    block.append_op(
-        type='c_sync_calc_stream',
-        inputs={'X': [sync_var]},
-        outputs={'Out': [sync_var]},
-        attrs={OP_ROLE_KEY: OpRole.Forward})
+    # sync within global
+    block.append_op(type="fill_constant",
+                    outputs={"Out": sync_var},
+                    attrs={
+                        "shape": sync_var.shape,
+                        "dtype": sync_var.dtype,
+                        "value": int(1),
+                    })
+    block.append_op(type='c_allreduce_sum',
+                    inputs={'X': sync_var},
+                    outputs={'Out': sync_var},
+                    attrs={
+                        'ring_id': ring_id,
+                        'use_calc_stream': True,
+                        OP_ROLE_KEY: OpRole.Forward
+                    })
+    block.append_op(type='c_sync_calc_stream',
+                    inputs={'X': [sync_var]},
+                    outputs={'Out': [sync_var]},
+                    attrs={OP_ROLE_KEY: OpRole.Forward})
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
index ab0c79bca554c..42c52af44311c 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
@@ -18,6 +18,7 @@
 
 
 class WeightDecayHelper(object):
+
     def __init__(self):
         pass
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 90440ff9d0ea9..fcecc3a9a671e 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -32,9 +32,10 @@
 from .sharding.utils import *
 
 import logging
+
 logger = logging.getLogger(__name__)
-formatter = logging.Formatter(
-    fmt='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
+                              datefmt='%Y-%m-%d %H:%M:%S')
 ch = logging.StreamHandler()
 ch.setFormatter(formatter)
 logger.addHandler(ch)
@@ -57,7 +58,9 @@ def __init__(self, optimizer):
             # "ModelParallelOptimizer",
             # "PipelineOptimizer",
         ]
-        self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
+        self.meta_optimizers_black_list = [
+            "GraphExecutionOptimizer",
+        ]
         self._main_program = None
         self._startup_program = None
         self._segments = []
@@ -418,8 +421,8 @@ def _insert_allreduce_for_pp(self, params_grads):
                 use_calc_stream=True,
                 rank=self.dp_rank,
                 strategy=strategy)
-            logger.info("Optimizer grad in this rank {}".format(
-                accumulated_grad_names))
+            logger.info(
+                "Optimizer grad in this rank {}".format(accumulated_grad_names))
             first_optimize_op_index += (len(main_block.ops) - len_of_ops)
             len_of_ops = len(main_block.ops)
 
@@ -434,8 +437,8 @@ def _insert_allreduce_for_pp(self, params_grads):
                 use_calc_stream=True,
                 rank=self.dp_rank,
                 strategy=None if optimize_cast else strategy)
-            logger.info("Optimizer param in this rank {}".format(
-                optimizer_param))
+            logger.info(
+                "Optimizer param in this rank {}".format(optimizer_param))
             if not strategy.fuse_grad_merge and not optimize_cast:
                 assert len(accumulated_grad_names) == len(optimizer_param)
         elif self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp":
@@ -519,8 +522,9 @@ def _adapt_amp_clip_without_sharding(self):
         FP16Utils.sync_amp_check_nan_inf(main_block, rings)
 
         gradientclip_helper = GradientClipHelper(None)
-        gradientclip_helper.sync_global_norm(
-            main_block, [self.mp_ring_id, self.pp_ring_id], self.mp_rank)
+        gradientclip_helper.sync_global_norm(main_block,
+                                             [self.mp_ring_id, self.pp_ring_id],
+                                             self.mp_rank)
 
     def _insert_loss_grad_scale_op(self):
         main_block = self._main_program.global_block()
@@ -541,8 +545,8 @@ def _apply_optimize_offload_pass(self, params_grads):
 
         mp_ring_id = self.mp_ring_id if self.mp_degree > 1 else None
         dp_ring_id = self.dp_ring_id if self.dp_degree > 1 else None
-        offload_helper = OffloadHelper(
-            mp_ring_id=mp_ring_id, dp_ring_id=dp_ring_id)
+        offload_helper = OffloadHelper(mp_ring_id=mp_ring_id,
+                                       dp_ring_id=dp_ring_id)
 
         # optimize offload should be enable while gradient merge is enable and
         # acc_step is quite large (e.g. >> 100). Since its memcpy could not be
@@ -561,11 +565,13 @@ def _apply_optimize_offload_pass(self, params_grads):
                     main_block, startup_block,
                     [x[0].name for x in params_grads])
                 # NOTE(wangxi): fused after optimize_cast
-                utils.fuse_opt_broadcast_param_ops(
-                    main_block, dp_ring_id, self._shard, strategy=strategy)
+                utils.fuse_opt_broadcast_param_ops(main_block,
+                                                   dp_ring_id,
+                                                   self._shard,
+                                                   strategy=strategy)
             else:
-                offload_helper.cast_fp32param_in_optimize(main_block,
-                                                          startup_block)
+                offload_helper.cast_fp32param_in_optimize(
+                    main_block, startup_block)
 
     def _dump_program_for_debug(self):
         main_block = self._main_program.global_block()
@@ -645,14 +651,13 @@ def _init_pair_comm(self, pair, ring_id):
         ]
         pp_rank = 0 if self.pp_rank == pair[0] else 1
         if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None:
-            self._collective_helper._init_communicator(
-                self._startup_program,
-                self.current_endpoint,
-                pp_group_endpoints,
-                pp_rank,
-                ring_id,
-                False,
-                sync=False)
+            self._collective_helper._init_communicator(self._startup_program,
+                                                       self.current_endpoint,
+                                                       pp_group_endpoints,
+                                                       pp_rank,
+                                                       ring_id,
+                                                       False,
+                                                       sync=False)
 
     def _init_npu_pipeline_comm(self, startup_block):
         # NOTE(wangxi): some bug with hccl, must set pp_degree be even number
@@ -670,12 +675,13 @@ def _init_npu_pipeline_comm(self, startup_block):
                 my_pair.append(pair)
 
         # for example: self.pp_rank=2, self.pp_degree=4
-        send_to_next_pair = (self.pp_rank,
-                             (self.pp_rank + 1) % self.pp_degree)  # 2->3
-        recv_from_next_pair = ((self.pp_rank + 1) % self.pp_degree,
-                               self.pp_rank)  # 3->2
-        recv_from_prev_pair = ((self.pp_rank - 1 + self.pp_degree) %
-                               self.pp_degree, self.pp_rank)  # 1->2
+        send_to_next_pair = (self.pp_rank, (self.pp_rank + 1) % self.pp_degree
+                             )  # 2->3
+        recv_from_next_pair = (
+            (self.pp_rank + 1) % self.pp_degree, self.pp_rank)  # 3->2
+        recv_from_prev_pair = (
+            (self.pp_rank - 1 + self.pp_degree) % self.pp_degree, self.pp_rank
+        )  # 1->2
         send_to_prev_pair = (self.pp_rank, (self.pp_rank - 1 + self.pp_degree) %
                              self.pp_degree)  # 2->1
 
@@ -686,24 +692,24 @@ def _init_npu_pipeline_comm(self, startup_block):
         ring_id = self.pp_ring_map[pair[0] * 1000 + pair[1]]
         self._init_pair_comm(pair, ring_id)
         my_pair.remove(pair)
-        logger.info("pair0(even->odd): pp pair:{}, ring_id: {}".format(pair,
-                                                                       ring_id))
+        logger.info("pair0(even->odd): pp pair:{}, ring_id: {}".format(
+            pair, ring_id))
 
         # 2. even recv from next, odd send to prev, 1->0, 3->2
         pair = recv_from_next_pair if even else send_to_prev_pair
         ring_id = self.pp_ring_map[pair[0] * 1000 + pair[1]]
         self._init_pair_comm(pair, ring_id)
         my_pair.remove(pair)
-        logger.info("pair1(even<-odd): pp pair:{}, ring_id: {}".format(pair,
-                                                                       ring_id))
+        logger.info("pair1(even<-odd): pp pair:{}, ring_id: {}".format(
+            pair, ring_id))
 
         # if pp_degree is 2, only need pair(0->1, 1->0)
         if self.pp_degree > 2:
             # 3. odd send to next, even recv from prev, 1->2, 3->0
             pair = send_to_next_pair if not even else recv_from_prev_pair
-            ring_id = self.pp_ring_map.get(
-                pair[0] * 1000 + pair[1],
-                max_ring_id + 1)  # 3->0 not in pp_ring_map
+            ring_id = self.pp_ring_map.get(pair[0] * 1000 + pair[1],
+                                           max_ring_id +
+                                           1)  # 3->0 not in pp_ring_map
             self._init_pair_comm(pair, ring_id)
             if self.pp_rank != 0 and self.pp_rank != self.pp_degree - 1:
                 my_pair.remove(pair)
@@ -712,9 +718,9 @@ def _init_npu_pipeline_comm(self, startup_block):
 
             # 4. odd recv from next, even send to prev, 2->1, 0->3
             pair = recv_from_next_pair if not even else send_to_prev_pair
-            ring_id = self.pp_ring_map.get(
-                pair[0] * 1000 + pair[1],
-                max_ring_id + 2)  # 0->3 not in pp_ring_map
+            ring_id = self.pp_ring_map.get(pair[0] * 1000 + pair[1],
+                                           max_ring_id +
+                                           2)  # 0->3 not in pp_ring_map
             self._init_pair_comm(pair, ring_id)
             if self.pp_rank != 0 and self.pp_rank != self.pp_degree - 1:
                 my_pair.remove(pair)
@@ -727,14 +733,13 @@ def _init_npu_pipeline_comm(self, startup_block):
     def _init_pipeline_comm(self, startup_block):
         # TODO (JZ-LIANG) to unify pp_rank_ and pp_rank
         if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None:
-            self._collective_helper._init_communicator(
-                self._startup_program,
-                self.current_endpoint,
-                self.pp_group_endpoints,
-                self.pp_rank,
-                self.pp_ring_id,
-                False,
-                sync=False)
+            self._collective_helper._init_communicator(self._startup_program,
+                                                       self.current_endpoint,
+                                                       self.pp_group_endpoints,
+                                                       self.pp_rank,
+                                                       self.pp_ring_id,
+                                                       False,
+                                                       sync=False)
 
         if core.is_compiled_with_npu():
             self._init_npu_pipeline_comm(startup_block)
@@ -754,14 +759,13 @@ def _init_comm(self):
 
         # mp ring
         if self.mp_degree > 1:
-            self._collective_helper._init_communicator(
-                self._startup_program,
-                self.current_endpoint,
-                self.mp_group_endpoints,
-                self.mp_rank,
-                self.mp_ring_id,
-                False,
-                sync=False)
+            self._collective_helper._init_communicator(self._startup_program,
+                                                       self.current_endpoint,
+                                                       self.mp_group_endpoints,
+                                                       self.mp_rank,
+                                                       self.mp_ring_id,
+                                                       False,
+                                                       sync=False)
 
         # sharding ring
         if self.sharding_degree > 1:
@@ -780,14 +784,13 @@ def _init_comm(self):
 
         # pure dp ring
         if self.dp_degree > 1:
-            self._collective_helper._init_communicator(
-                self._startup_program,
-                self.current_endpoint,
-                self.dp_group_endpoints,
-                self.dp_rank,
-                self.dp_ring_id,
-                False,
-                sync=False)
+            self._collective_helper._init_communicator(self._startup_program,
+                                                       self.current_endpoint,
+                                                       self.dp_group_endpoints,
+                                                       self.dp_rank,
+                                                       self.dp_ring_id,
+                                                       False,
+                                                       sync=False)
 
         startup_block._sync_with_cpp()
 
@@ -839,12 +842,12 @@ def _split_program(self, block):
                             if ".cast_fp16@GRAD" not in input_name:
                                 continue
                             else:
-                                input_name = input_name[:input_name.find(
-                                    ".cast_fp16@GRAD")]
+                                input_name = input_name[:input_name.
+                                                        find(".cast_fp16@GRAD")]
 
                         if input_name in self._backward_remain_anchors:
-                            segment = self.collect_segment(segment, op_idx,
-                                                           block)
+                            segment = self.collect_segment(
+                                segment, op_idx, block)
                             assert input_name not in self._forward_remain_anchors, "segment anchor [{}] met twice !".format(
                                 input_name)
                             self._backward_remain_anchors.remove(input_name)
@@ -852,8 +855,8 @@ def _split_program(self, block):
                 elif int(op.attr('op_role')) == int(OpRole.Forward):
                     for output_name in op.desc.output_arg_names():
                         if output_name in self._forward_remain_anchors:
-                            segment = self.collect_segment(segment, op_idx,
-                                                           block)
+                            segment = self.collect_segment(
+                                segment, op_idx, block)
                             self._forward_remain_anchors.remove(output_name)
 
             # find broadcast vars
@@ -878,8 +881,8 @@ def _split_program(self, block):
                 if "subprog" in broadcast_var_base_name:
                     # remove suffix
                     broadcast_var_base_name = broadcast_var_base_name[:
-                                                                      broadcast_var_base_name.
-                                                                      find(
+                                                                      broadcast_var_base_name
+                                                                      .find(
                                                                           ".subprog"
                                                                       )]
 
@@ -888,8 +891,8 @@ def _split_program(self, block):
                         broadcast_var_base_name, 0) + 1
 
                 segment._param2broadcast[input_name] = broadcast_var_name
-                segment._broadcast_vars.append((broadcast_var_name,
-                                                self._shard.device(input_name)))
+                segment._broadcast_vars.append(
+                    (broadcast_var_name, self._shard.device(input_name)))
                 segment._param_mem += get_var_size(
                     self._main_program.global_block().var(input_name))
 
@@ -904,11 +907,12 @@ def _split_program(self, block):
                     if len(op_role_var) != 0:
                         assert len(op_role_var) % 2 == 0
                         for i in range(0, len(op_role_var), 2):
-                            param, reduced_grad = op_role_var[i], op_role_var[
-                                i + 1]
+                            param, reduced_grad = op_role_var[i], op_role_var[i
+                                                                              +
+                                                                              1]
                             segment._allreduce_vars.append(reduced_grad)
-                            assert (reduced_grad not in
-                                    self._reduced_grads_to_param)
+                            assert (reduced_grad
+                                    not in self._reduced_grads_to_param)
                             self._reduced_grads_to_param[reduced_grad] = param
 
             # find cast op
@@ -931,19 +935,20 @@ def _split_program(self, block):
                     self._backward_remain_anchors)
 
         if self._verbose:
-            for varname in sorted(
-                    var2broadcast_time, key=var2broadcast_time.get,
-                    reverse=True):
+            for varname in sorted(var2broadcast_time,
+                                  key=var2broadcast_time.get,
+                                  reverse=True):
                 logger.info("Sharding broadcast: [{}] times [{}]".format(
                     var2broadcast_time[varname], varname))
             for idx_ in range(len(self._segments)):
                 logger.info("segment [{}] :".format(idx_))
-                logger.info("start op: [{}]  [{}]".format(block.ops[
-                    self._segments[idx_]._start_idx].desc.type(), block.ops[
-                        self._segments[idx_]._start_idx].desc.input_arg_names(
-                        )))
-                logger.info("end   op: [{}]  [{}]".format(block.ops[
-                    self._segments[idx_]._end_idx].desc.type(), block.ops[
+                logger.info("start op: [{}]  [{}]".format(
+                    block.ops[self._segments[idx_]._start_idx].desc.type(),
+                    block.ops[self._segments[idx_].
+                              _start_idx].desc.input_arg_names()))
+                logger.info("end   op: [{}]  [{}]".format(
+                    block.ops[self._segments[idx_]._end_idx].desc.type(),
+                    block.ops[
                         self._segments[idx_]._end_idx].desc.input_arg_names()))
         return
 
@@ -1044,7 +1049,7 @@ def _prune_main_program(self, block, shard, rings):
                     program_deps.remove_op(idx, reserved_vars)
 
         # NOTE (JZ-LIANG) revise and unify logic here
-        # sharding support fp16_allreduce logic            
+        # sharding support fp16_allreduce logic
         block._sync_with_cpp()
         for idx, op in reversed(list(enumerate(block.ops))):
             if op.type == 'concat' and is_optimizer_op(op):
@@ -1084,8 +1089,8 @@ def _add_broadcast_allreduce(self, block):
         self._segments[-1]._end_idx = new_end_idx
 
         if self._segments[-1]._allreduce_vars:
-            shard_allredue_vars = self._shard.filter_grads(self._segments[-1]
-                                                           ._allreduce_vars)
+            shard_allredue_vars = self._shard.filter_grads(
+                self._segments[-1]._allreduce_vars)
             if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1:
                 if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len(
                         shard_allredue_vars) >= 1:
@@ -1097,38 +1102,36 @@ def _add_broadcast_allreduce(self, block):
                         self.dp_ring_id,
                         shard_allredue_vars,
                         user_defined_strategy=self.user_defined_strategy)
-            # gradient merge 
+            # gradient merge
             elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1:
                 self.create_persistable_gradients_and_insert_merge_ops(
-                    block,
-                    self._startup_program.global_block(),
+                    block, self._startup_program.global_block(),
                     self._segments[-1]._end_idx, shard_allredue_vars,
                     self._shard)
 
             insert_sync_comm_ops(block, self._segments[-1]._end_idx,
                                  self.sharding_ring_id,
                                  self._segments[-1]._allreduce_vars)
-            # allreduce --> reduce 
-            insert_reduce_ops(
-                block,
-                self._segments[-1]._end_idx,
-                self.sharding_ring_id,
-                self._segments[-1]._allreduce_vars,
-                self._shard,
-                op_role=OpRole.Backward,
-                use_calc_stream=False)
+            # allreduce --> reduce
+            insert_reduce_ops(block,
+                              self._segments[-1]._end_idx,
+                              self.sharding_ring_id,
+                              self._segments[-1]._allreduce_vars,
+                              self._shard,
+                              op_role=OpRole.Backward,
+                              use_calc_stream=False)
 
         for idx, segment in reversed(list(enumerate(self._segments))):
             allreduce_vars = self._segments[
                 idx - 1]._allreduce_vars if idx > 0 else []
-            broadcast_vars = self._segments[idx +
-                                            1]._broadcast_vars if idx < len(
-                                                self._segments) - 1 else []
+            broadcast_vars = self._segments[
+                idx +
+                1]._broadcast_vars if idx < len(self._segments) - 1 else []
             fill_constant_vars = self._segments[
-                idx + 2]._fill_constant_vars if idx < len(
-                    self._segments) - 2 else []
-            cast_ops = self._segments[idx + 2]._cast_ops if idx < len(
-                self._segments) - 2 else {}
+                idx +
+                2]._fill_constant_vars if idx < len(self._segments) - 2 else []
+            cast_ops = self._segments[
+                idx + 2]._cast_ops if idx < len(self._segments) - 2 else {}
 
             for op_idx in reversed(range(segment._start_idx, segment._end_idx)):
                 op = block.ops[op_idx]
@@ -1144,14 +1147,14 @@ def _add_broadcast_allreduce(self, block):
                         name=broadcast_name,
                         shape=self._main_program.global_block().var(
                             param_name).shape,
-                        dtype=self._main_program.global_block().var(param_name)
-                        .dtype,
+                        dtype=self._main_program.global_block().var(
+                            param_name).dtype,
                         persistable=False)
 
             # step1: remove cast ops
             block._sync_with_cpp()
-            segment._end_idx += FP16Utils.remove_cast_op(block, self._params,
-                                                         segment, 0)
+            segment._end_idx += FP16Utils.remove_cast_op(
+                block, self._params, segment, 0)
 
             # step2: add Sync ops
             shard_allredue_vars = self._shard.filter_grads(allreduce_vars)
@@ -1190,20 +1193,19 @@ def _add_broadcast_allreduce(self, block):
                 insert_sync_calc_op(block, segment._end_idx,
                                     [calc_dep_vars[-1]])
 
-            # step3: insert `fill_constant` ops 
+            # step3: insert `fill_constant` ops
             insert_fill_constant_ops(block, segment._end_idx,
                                      fill_constant_vars)
 
-            # step4: add `cast` ops     
+            # step4: add `cast` ops
             insert_cast_ops(block, segment._end_idx, cast_ops)
 
             # step5: add broadcast ops
             # gradient merge
             if self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1:
                 self.create_persistable_gradients_and_insert_merge_ops(
-                    block,
-                    self._startup_program.global_block(), segment._start_idx,
-                    shard_allredue_vars, self._shard)
+                    block, self._startup_program.global_block(),
+                    segment._start_idx, shard_allredue_vars, self._shard)
 
             insert_broadcast_ops(block, segment._start_idx,
                                  self.sharding_ring_id, broadcast_vars)
@@ -1226,17 +1228,16 @@ def _add_broadcast_allreduce(self, block):
                 insert_sync_comm_ops(block, segment._start_idx,
                                      self.sharding_ring_id, allreduce_vars)
             # sharding
-            # allreduce --> reduce 
+            # allreduce --> reduce
             # TODO temp change
             if len(allreduce_vars) > 0:
-                insert_reduce_ops(
-                    block,
-                    segment._start_idx,
-                    self.sharding_ring_id,
-                    allreduce_vars,
-                    self._shard,
-                    op_role=OpRole.Backward,
-                    use_calc_stream=False)
+                insert_reduce_ops(block,
+                                  segment._start_idx,
+                                  self.sharding_ring_id,
+                                  allreduce_vars,
+                                  self._shard,
+                                  op_role=OpRole.Backward,
+                                  use_calc_stream=False)
 
             block._sync_with_cpp()
 
@@ -1308,8 +1309,8 @@ def _build_groups(self):
         self.global_rank = self.role_maker._worker_index()
         self.global_endpoints = self.role_maker._get_trainer_endpoints()
         self.current_endpoint = self.global_endpoints[self.global_rank]
-        self._collective_helper = CollectiveHelper(
-            self.role_maker, nrings=self._nrings_sharding)
+        self._collective_helper = CollectiveHelper(self.role_maker,
+                                                   nrings=self._nrings_sharding)
         assert self.global_word_size % self.mp_degree == 0, \
             "global_word_size: {} should be divisible to the mp_degree: {}".format(self.global_word_size, self.mp_degree)
         assert self.global_word_size % self.sharding_degree == 0, \
@@ -1340,7 +1341,7 @@ def _build_groups(self):
             self.mp_group_id = -1
             self.mp_group_endpoints = []
 
-        # sharding 
+        # sharding
         if self.sharding_degree > 1:
             self.sharding_ring_id = 1
             self.sharding_rank = (self.global_rank //
@@ -1354,7 +1355,7 @@ def _build_groups(self):
                     if (idx // (self.mp_degree * self.sharding_degree)) == self.
                     sharding_group_id and idx % self.mp_degree == self.mp_rank
                 ]
-            # sharding + ...    
+            # sharding + ...
             else:
                 self.sharding_group_endpoints = [
                     ep for idx, ep in enumerate(self.global_endpoints)
@@ -1385,8 +1386,9 @@ def _build_groups(self):
             pp_stage_offset = self.sharding_degree * self.mp_degree
             self.pp_group_endpoints = []
             for i in range(self.pp_degree):
-                self.pp_group_endpoints.append(self.global_endpoints[
-                    pp_first_stage_idx + pp_stage_offset * i])
+                self.pp_group_endpoints.append(
+                    self.global_endpoints[pp_first_stage_idx +
+                                          pp_stage_offset * i])
             assert self.current_endpoint in self.pp_group_endpoints
         else:
             self.pp_ring_id = -1
@@ -1399,7 +1401,7 @@ def _build_groups(self):
         # outter-pure-dp group
         # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism
         # e.g. mp-sharding-pp-dp
-        # sharding-hybrid-dp as one senario of outter-pure-dp 
+        # sharding-hybrid-dp as one scenario of outter-pure-dp
         local_pp_degree = self.pp_degree
         if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None):
             assert self.pp_degree == 2, ("For manually set pipeline, only "
@@ -1423,8 +1425,8 @@ def _build_groups(self):
                          local_pp_degree)
             self.dp_group_endpoints = []
             for i in range(self.dp_degree):
-                self.dp_group_endpoints.append(self.global_endpoints[
-                    dp_first_rank_idx + dp_offset * i])
+                self.dp_group_endpoints.append(
+                    self.global_endpoints[dp_first_rank_idx + dp_offset * i])
             assert self.current_endpoint in self.dp_group_endpoints
             logger.info("Hybrid DP mode turn on !")
         else:
@@ -1475,6 +1477,7 @@ def _build_groups(self):
         return
 
     def _recreate_not_persist_param_as_var(self):
+
         def recreate_not_persist_param_as_var(program):
             block = program.global_block()
             params = block.all_parameters()
@@ -1498,15 +1501,14 @@ def recreate_not_persist_param_as_var(program):
                     is_distributed = param.is_distributed
 
                 block._remove_var(name, sync=False)
-                var = block.create_var(
-                    name=name,
-                    shape=shape,
-                    dtype=dtype,
-                    type=type,
-                    lod_level=lod_level,
-                    stop_gradient=stop_gradient,
-                    trainable=trainable,
-                    persistable=False)
+                var = block.create_var(name=name,
+                                       shape=shape,
+                                       dtype=dtype,
+                                       type=type,
+                                       lod_level=lod_level,
+                                       stop_gradient=stop_gradient,
+                                       trainable=trainable,
+                                       persistable=False)
                 if have_dist_attr:
                     var.is_distributed = is_distributed
 
@@ -1552,16 +1554,15 @@ def _initialization_broadcast(self):
                 rings.append(self.dp_ring_id)
 
             for ring in rings:
-                startup_block.append_op(
-                    type='c_broadcast',
-                    inputs={'X': param},
-                    outputs={'Out': param},
-                    attrs={
-                        'ring_id': ring,
-                        'root': 0,
-                        'use_calc_stream': True,
-                        OP_ROLE_KEY: OpRole.Forward
-                    })
+                startup_block.append_op(type='c_broadcast',
+                                        inputs={'X': param},
+                                        outputs={'Out': param},
+                                        attrs={
+                                            'ring_id': ring,
+                                            'root': 0,
+                                            'use_calc_stream': True,
+                                            OP_ROLE_KEY: OpRole.Forward
+                                        })
 
         startup_block._sync_with_cpp()
 
@@ -1595,8 +1596,10 @@ def create_persistable_gradients_and_insert_merge_ops(
             main_block._insert_op_without_sync(
                 insert_idx,
                 type="elementwise_add",
-                inputs={'X': grad_name,
-                        'Y': gradient_merge_var},
+                inputs={
+                    'X': grad_name,
+                    'Y': gradient_merge_var
+                },
                 outputs={'Out': gradient_merge_var},
                 attrs={
                     'axis': -1,
@@ -1605,14 +1608,13 @@ def create_persistable_gradients_and_insert_merge_ops(
                 })
 
             # startup initialization
-            startup_block.append_op(
-                type="fill_constant",
-                outputs={"Out": startup_gradient_merge_var},
-                attrs={
-                    "shape": grad_var.shape,
-                    "dtype": grad_var.dtype,
-                    "value": float(0),
-                })
+            startup_block.append_op(type="fill_constant",
+                                    outputs={"Out": startup_gradient_merge_var},
+                                    attrs={
+                                        "shape": grad_var.shape,
+                                        "dtype": grad_var.dtype,
+                                        "value": float(0),
+                                    })
 
         main_block._sync_with_cpp()
         startup_block._sync_with_cpp()
@@ -1627,13 +1629,12 @@ def _create_gm_cond(self, main_block):
             persistable=True,
             force_cpu=True)
 
-        zero_var = layers.create_global_var(
-            name="gradient_merge_zero",
-            shape=[1],
-            value=int(0),
-            dtype='int32',
-            persistable=True,
-            force_cpu=True)
+        zero_var = layers.create_global_var(name="gradient_merge_zero",
+                                            shape=[1],
+                                            value=int(0),
+                                            dtype='int32',
+                                            persistable=True,
+                                            force_cpu=True)
 
         # Add step var & cond var
         current_step_var = layers.create_global_var(
@@ -1644,36 +1645,40 @@ def _create_gm_cond(self, main_block):
             persistable=True,
             force_cpu=True)
 
-        cond_var = main_block.create_var(
-            name="gradient_merge_cond", shape=[1], dtype='bool')
+        cond_var = main_block.create_var(name="gradient_merge_cond",
+                                         shape=[1],
+                                         dtype='bool')
 
         with device_guard("cpu"):
             # step_var = (step_var + 1) % k_step
-            main_block.append_op(
-                type='increment',
-                inputs={'X': [current_step_var]},
-                outputs={'Out': [current_step_var]},
-                attrs={'step': float(1),
-                       OP_ROLE_KEY: OpRole.Optimize})
-
-            main_block.append_op(
-                type='elementwise_mod',
-                inputs={'X': current_step_var,
-                        'Y': acc_step_var},
-                outputs={'Out': current_step_var},
-                attrs={
-                    'axis': -1,
-                    OP_ROLE_KEY: OpRole.Optimize,
-                    'use_mkldnn': False
-                })
+            main_block.append_op(type='increment',
+                                 inputs={'X': [current_step_var]},
+                                 outputs={'Out': [current_step_var]},
+                                 attrs={
+                                     'step': float(1),
+                                     OP_ROLE_KEY: OpRole.Optimize
+                                 })
+
+            main_block.append_op(type='elementwise_mod',
+                                 inputs={
+                                     'X': current_step_var,
+                                     'Y': acc_step_var
+                                 },
+                                 outputs={'Out': current_step_var},
+                                 attrs={
+                                     'axis': -1,
+                                     OP_ROLE_KEY: OpRole.Optimize,
+                                     'use_mkldnn': False
+                                 })
 
             # cond_var = (step_var == 0)
-            main_block.append_op(
-                type='equal',
-                inputs={'X': current_step_var,
-                        'Y': zero_var},
-                outputs={'Out': cond_var},
-                attrs={OP_ROLE_KEY: OpRole.Optimize})
+            main_block.append_op(type='equal',
+                                 inputs={
+                                     'X': current_step_var,
+                                     'Y': zero_var
+                                 },
+                                 outputs={'Out': cond_var},
+                                 attrs={OP_ROLE_KEY: OpRole.Optimize})
         # paddle.static.Print(current_step_var, message="in FWBW last conditional")
         return cond_var
 
@@ -1698,35 +1703,37 @@ def _true_apply_gradient(self):
         # cur_block's forward_block & backward_block is itself
         cur_block._set_forward_block_idx(cur_block_idx)
 
-        # allreduce grad@gradientmerge  
+        # allreduce grad@gradientmerge
         if self.hybrid_dp:
             assert self.dp_ring_id >= 0, "dp_ring_id should larger than 0 when in sharding&DP mode"
             for grad, merged_grad in self._grad2merged_grad.items():
                 merged_grad_var = main_block.var(merged_grad)
-                cur_block.append_op(
-                    type='c_allreduce_sum',
-                    inputs={'X': merged_grad_var},
-                    outputs={'Out': merged_grad_var},
-                    attrs={
-                        'ring_id': self.dp_ring_id,
-                        'use_calc_stream': True,
-                        OP_ROLE_KEY: OpRole.Optimize
-                    })
+                cur_block.append_op(type='c_allreduce_sum',
+                                    inputs={'X': merged_grad_var},
+                                    outputs={'Out': merged_grad_var},
+                                    attrs={
+                                        'ring_id': self.dp_ring_id,
+                                        'use_calc_stream': True,
+                                        OP_ROLE_KEY: OpRole.Optimize
+                                    })
 
         # grad@gradientmerge / acc_step
         for grad, merged_grad in self._grad2merged_grad.items():
             # grad /= k_steps
             merged_grad_var = main_block.var(merged_grad)
-            cur_block.append_op(
-                type='scale',
-                inputs={'X': merged_grad_var},
-                outputs={'Out': merged_grad_var},
-                attrs={
-                    'scale': 1.0 / float(self._gradient_merge_acc_step),
-                    'bias': 0.0,
-                    'bias_after_scale': False,
-                    OP_ROLE_KEY: OpRole.Optimize
-                })
+            cur_block.append_op(type='scale',
+                                inputs={'X': merged_grad_var},
+                                outputs={'Out': merged_grad_var},
+                                attrs={
+                                    'scale':
+                                    1.0 / float(self._gradient_merge_acc_step),
+                                    'bias':
+                                    0.0,
+                                    'bias_after_scale':
+                                    False,
+                                    OP_ROLE_KEY:
+                                    OpRole.Optimize
+                                })
 
         # re-create optimize ops
         already_moved_var_names = []
@@ -1755,11 +1762,10 @@ def _true_apply_gradient(self):
                         type_ = var_.dtype
                         self._main_program.global_block()._remove_var(
                             var_.name, sync=False)
-                        self.cond_block.create_var(
-                            name=name_,
-                            shape=shape_,
-                            dtype=type_,
-                            persistable=False)
+                        self.cond_block.create_var(name=name_,
+                                                   shape=shape_,
+                                                   dtype=type_,
+                                                   persistable=False)
                         already_moved_var_names.append(name_)
 
         self._main_program.global_block()._sync_with_cpp()
@@ -1768,15 +1774,14 @@ def _true_apply_gradient(self):
         # fill zero to grad@gradientmerge
         for grad, merged_grad in self._grad2merged_grad.items():
             merged_grad_var = main_block.var(merged_grad)
-            cur_block.append_op(
-                type='fill_constant',
-                outputs={'Out': merged_grad_var},
-                attrs={
-                    "shape": merged_grad_var.shape,
-                    "dtype": merged_grad_var.dtype,
-                    "value": float(0),
-                    OP_ROLE_KEY: OpRole.Optimize
-                })
+            cur_block.append_op(type='fill_constant',
+                                outputs={'Out': merged_grad_var},
+                                attrs={
+                                    "shape": merged_grad_var.shape,
+                                    "dtype": merged_grad_var.dtype,
+                                    "value": float(0),
+                                    OP_ROLE_KEY: OpRole.Optimize
+                                })
 
         # lr_var = main_block.var("gradient_merge_current_step")
         # paddle.static.Print(lr_var, message="in OPTIMIZE last conditional")
@@ -1831,8 +1836,10 @@ def _sharding_gradient_merge(self):
                 'Cond': cond,
                 'Input': [],
             },
-            outputs={'Out': [],
-                     'Scope': [step_scope]},
+            outputs={
+                'Out': [],
+                'Scope': [step_scope]
+            },
             attrs={
                 'sub_block': cond_block,
                 'is_scalar_condition': True,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py
index 9d099a2af24fa..c628964db35db 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py
@@ -23,6 +23,7 @@
 
 
 class TensorParallelOptimizer(MetaOptimizerBase):
+
     def __init__(self, optimizer):
         super(TensorParallelOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
@@ -32,15 +33,18 @@ def __init__(self, optimizer):
             "LarsOptimizer",
             "LambOptimizer",
         ]
-        self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
+        self.meta_optimizers_black_list = [
+            "GraphExecutionOptimizer",
+        ]
         self.mp_ring_id = 0
         self.global_ring_id = 1
         self.dp_ring_id = 2
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
-        super(TensorParallelOptimizer, self)._set_basic_info(
-            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+        super(TensorParallelOptimizer,
+              self)._set_basic_info(loss, role_maker, user_defined_optimizer,
+                                    user_defined_strategy)
         self.mp_degree = user_defined_strategy.tensor_parallel_configs[
             'tensor_parallel_degree']
 
@@ -58,7 +62,9 @@ def _disable_strategy(self, dist_strategy):
 
     def _enable_strategy(self, dist_strategy, context):
         dist_strategy.tensor_parallel = True
-        dist_strategy.tensor_parallel_configs = {"tensor_parallel_degree": 1, }
+        dist_strategy.tensor_parallel_configs = {
+            "tensor_parallel_degree": 1,
+        }
 
     def _broadcast_params(self, ring_id, mp_mode):
         block = self.startup_program.global_block()
@@ -67,23 +73,23 @@ def _broadcast_params(self, ring_id, mp_mode):
             if param.is_distributed and mp_mode:
                 continue
 
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
+            block.append_op(type='c_broadcast',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                'root': 0,
+                                OP_ROLE_KEY: OpRole.Forward
+                            })
 
         if not param: return  # no parameter on this device
-        block.append_op(
-            type='c_sync_comm_stream',
-            inputs={'X': param},
-            outputs={'Out': param},
-            attrs={'ring_id': ring_id,
-                   OP_ROLE_KEY: OpRole.Forward})
+        block.append_op(type='c_sync_comm_stream',
+                        inputs={'X': param},
+                        outputs={'Out': param},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Forward
+                        })
 
     def _get_process_group_info(self):
         # global ring info
@@ -115,15 +121,19 @@ def _init_process_group(self):
         collective_helper = CollectiveHelper(self.role_maker, wait_port=False)
 
         # Create global ring for all gpus
-        collective_helper._init_communicator(
-            self.startup_program, self.current_endpoint, self.global_endpoints,
-            self.global_rank, self.global_ring_id, True, self.global_ring_id,
-            True)
+        collective_helper._init_communicator(self.startup_program,
+                                             self.current_endpoint,
+                                             self.global_endpoints,
+                                             self.global_rank,
+                                             self.global_ring_id, True,
+                                             self.global_ring_id, True)
 
         # Create model parallel ring for all gpus
-        collective_helper._init_communicator(
-            self.startup_program, self.current_endpoint, self.mp_endpoints,
-            self.mp_rank, self.mp_ring_id, True, self.global_ring_id, True)
+        collective_helper._init_communicator(self.startup_program,
+                                             self.current_endpoint,
+                                             self.mp_endpoints, self.mp_rank,
+                                             self.mp_ring_id, True,
+                                             self.global_ring_id, True)
         self._broadcast_params(self.mp_ring_id, mp_mode=True)
 
         # Create dp rings
@@ -174,15 +184,14 @@ def _insert_loss_grad_ops(self, loss, dp_degree):
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
-                block._insert_op(
-                    idx + 1,
-                    type='scale',
-                    inputs={'X': loss_grad_var},
-                    outputs={'Out': loss_grad_var},
-                    attrs={
-                        'scale': 1.0 / dp_degree,
-                        OP_ROLE_KEY: OpRole.Backward
-                    })
+                block._insert_op(idx + 1,
+                                 type='scale',
+                                 inputs={'X': loss_grad_var},
+                                 outputs={'Out': loss_grad_var},
+                                 attrs={
+                                     'scale': 1.0 / dp_degree,
+                                     OP_ROLE_KEY: OpRole.Backward
+                                 })
                 break
 
     def _insert_allreduce_ops(self, loss, ring_id):
@@ -200,34 +209,33 @@ def _insert_allreduce_ops(self, loss, ring_id):
                     grad = block.vars[op_role_var[i + 1]]
                     if offset == idx:
                         offset += 1
-                        block._insert_op(
-                            offset,
-                            type='c_sync_calc_stream',
-                            inputs={'X': grad},
-                            outputs={'Out': grad},
-                            attrs={OP_ROLE_KEY: OpRole.Backward})
+                        block._insert_op(offset,
+                                         type='c_sync_calc_stream',
+                                         inputs={'X': grad},
+                                         outputs={'Out': grad},
+                                         attrs={OP_ROLE_KEY: OpRole.Backward})
                         offset += 1
 
-                    block._insert_op(
-                        offset,
-                        type='c_allreduce_sum',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Backward
-                        })
+                    block._insert_op(offset,
+                                     type='c_allreduce_sum',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         OP_ROLE_KEY: OpRole.Backward
+                                     })
 
         if grad is None:
             return
 
         for idx, op in list(enumerate(block.ops)):
             if is_optimizer_op(op):
-                block._insert_op(
-                    idx,
-                    type='c_sync_comm_stream',
-                    inputs={'X': grad},
-                    outputs={'Out': grad},
-                    attrs={'ring_id': ring_id,
-                           OP_ROLE_KEY: OpRole.Backward})
+                block._insert_op(idx,
+                                 type='c_sync_comm_stream',
+                                 inputs={'X': grad},
+                                 outputs={'Out': grad},
+                                 attrs={
+                                     'ring_id': ring_id,
+                                     OP_ROLE_KEY: OpRole.Backward
+                                 })
                 break
diff --git a/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py b/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py
index 69e41ab0edab2..f5b8660bd88d4 100644
--- a/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py
+++ b/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py
@@ -18,6 +18,7 @@
 
 
 class MetaParallelBase(Layer):
+
     def __init__(self, layers, hcg, strategy):
         super(MetaParallelBase,
               self).__init__(layers.full_name() + "_meta_parallel_base")
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
index 2ce8cf7bdeb74..14ca1322e789f 100644
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
@@ -23,11 +23,12 @@
 __all__ = []
 
 # Follow this paper to achieve the file:
-# Shoeybi M, Patwary M, Puri R, et al. Megatron-lm: Training multi-billion parameter 
+# Shoeybi M, Patwary M, Puri R, et al. Megatron-lm: Training multi-billion parameter
 # language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019. (https://arxiv.org/abs/1909.08053)
 
 
 class VocabParallelEmbedding(Layer):
+
     def __init__(self,
                  num_embeddings,
                  embedding_dim,
@@ -58,17 +59,15 @@ def __init__(self,
 
         if self.is_mp and paddle.in_dynamic_mode():
             with get_rng_state_tracker().rng_state():
-                self.weight = self.create_parameter(
-                    attr=self._weight_attr,
-                    shape=self._size,
-                    dtype=self._dtype,
-                    is_bias=False)
+                self.weight = self.create_parameter(attr=self._weight_attr,
+                                                    shape=self._size,
+                                                    dtype=self._dtype,
+                                                    is_bias=False)
         else:
-            self.weight = self.create_parameter(
-                attr=self._weight_attr,
-                shape=self._size,
-                dtype=self._dtype,
-                is_bias=False)
+            self.weight = self.create_parameter(attr=self._weight_attr,
+                                                shape=self._size,
+                                                dtype=self._dtype,
+                                                is_bias=False)
 
         self.weight.is_distributed = True if self.is_mp else False
 
@@ -85,16 +84,16 @@ def forward(self, x):
                 use_calc_stream=True,
                 use_model_parallel=True)
         else:
-            output = F.embedding(
-                x,
-                weight=self.weight,
-                padding_idx=None,
-                sparse=False,
-                name=self._name)
+            output = F.embedding(x,
+                                 weight=self.weight,
+                                 padding_idx=None,
+                                 sparse=False,
+                                 name=self._name)
         return output
 
 
 class ColumnParallelLinear(Layer):
+
     def __init__(self,
                  in_features,
                  out_features,
@@ -114,8 +113,8 @@ def __init__(self,
         self.gather_output = gather_output
         assert out_features % self.world_size == 0, (
             "Number of column of the weight for linear ({}) must be"
-            " divisible by model parallel size ({})".format(out_features,
-                                                            self.world_size))
+            " divisible by model parallel size ({})".format(
+                out_features, self.world_size))
         self.output_size_per_partition = out_features // self.world_size
 
         self._weight_attr = weight_attr
@@ -156,8 +155,10 @@ def forward(self, x):
         else:
             input_parallel = x
 
-        output_parallel = F.linear(
-            input_parallel, self.weight, self.bias, name=self._name)
+        output_parallel = F.linear(input_parallel,
+                                   self.weight,
+                                   self.bias,
+                                   name=self._name)
 
         if self.gather_output and self.is_mp:
             output = paddle.distributed.collective._c_concat(
@@ -168,6 +169,7 @@ def forward(self, x):
 
 
 class RowParallelLinear(Layer):
+
     def __init__(self,
                  in_features,
                  out_features,
@@ -193,8 +195,8 @@ def __init__(self,
         self.is_mp = (self.world_size > 1)
         assert in_features % self.world_size == 0, (
             "Number of row of the weight for linear ({}) must be"
-            " divisible by model parallel size ({})".format(in_features,
-                                                            self.world_size))
+            " divisible by model parallel size ({})".format(
+                in_features, self.world_size))
 
         self.input_size_per_partition = in_features // self.world_size
 
@@ -247,6 +249,7 @@ def forward(self, x):
 
 
 class ParallelCrossEntropy(Layer):
+
     def __init__(self, name=None):
         super(ParallelCrossEntropy, self).__init__()
         self.name = name
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
index a39b77303757a..58b0515e0bac8 100755
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
@@ -56,6 +56,7 @@
 
 
 class LayerDesc(object):
+
     def __init__(self, layer_func, *inputs, **kwargs):
         self.layer_func = layer_func
         self.inputs = inputs
@@ -74,6 +75,7 @@ def __repr__(self):
 
 
 class SharedLayerDesc(LayerDesc):
+
     def __init__(self,
                  key,
                  layer_func,
@@ -88,6 +90,7 @@ def __init__(self,
 
 
 class SegmentLayers(object):
+
     def __init__(self, layers_desc, num_parts, method="uniform"):
         self._layers_desc = layers_desc
         self.method = method
@@ -157,6 +160,7 @@ def uniform(self, num_items, num_parts):
 
 
 class PipelineLayer(Layer):
+
     def __init__(self,
                  layers,
                  num_stages=None,
@@ -184,8 +188,8 @@ def __init__(self,
 
         if recompute_interval > 0:
             logger.info(
-                "Start Recompute for PipeLineParallel. recompute_offload: {}, recompute_partition: {}".
-                format(recompute_offload, recompute_partition))
+                "Start Recompute for PipeLineParallel. recompute_offload: {}, recompute_partition: {}"
+                .format(recompute_offload, recompute_partition))
         _initialize_recompute_setting(recompute_offload, recompute_partition)
 
         world_size = dist.get_world_size()
@@ -200,9 +204,10 @@ def __init__(self,
         else:
             # construct default topology
             if world_size % num_stages != 0:
-                raise ValueError("should provide correct num_stages({}) "
-                                 "which can be divided by world_size({})".
-                                 format(num_stages, world_size))
+                raise ValueError(
+                    "should provide correct num_stages({}) "
+                    "which can be divided by world_size({})".format(
+                        num_stages, world_size))
             dp_num = world_size // num_stages
             self._topo = fleet.CommunicateTopology(["data", "pipe", "model"],
                                                    [dp_num, num_stages, 1])
@@ -238,8 +243,8 @@ def _construct_shared_comm(self):
             return
 
         layers_desc = self._layers_desc
-        shared_layer_names = set(
-            s.layer_name for s in layers_desc if isinstance(s, SharedLayerDesc))
+        shared_layer_names = set(s.layer_name for s in layers_desc
+                                 if isinstance(s, SharedLayerDesc))
         for key in shared_layer_names:
             shared_layers = []
             for idx, layer in enumerate(layers_desc):
@@ -283,10 +288,10 @@ def _construct_shared_comm(self):
     def _synchronize_shared_weights(self):
         for key, comm in self.shared_comm.items():
             with paddle.framework.no_grad():
-                paddle.distributed.broadcast(
-                    getattr(comm['layer'], comm['weight_attr']),
-                    src=min(comm['ranks']),
-                    group=comm['group'])
+                paddle.distributed.broadcast(getattr(comm['layer'],
+                                                     comm['weight_attr']),
+                                             src=min(comm['ranks']),
+                                             group=comm['group'])
 
             for param in comm['layer'].parameters():
                 if self.global_rank != min(comm['ranks']):
@@ -298,8 +303,8 @@ def allreduce_shared_weight_gradients(self):
             # need use trace_op to allreduce weight
             if in_dygraph_mode():
                 with paddle.framework.no_grad():
-                    paddle.distributed.all_reduce(
-                        param.grad, group=comm['group'])
+                    paddle.distributed.all_reduce(param.grad,
+                                                  group=comm['group'])
             else:
                 with paddle.framework.no_grad():
                     paddle.fluid.framework._dygraph_tracer().trace_op(
@@ -313,12 +318,13 @@ def allreduce_shared_weight_gradients(self):
 
     def _segment_network(self, seg_method):
         logger.info("start segment network..")
-        seg = SegmentLayers(
-            self._layers_desc, num_parts=self._num_stages, method=seg_method)
+        seg = SegmentLayers(self._layers_desc,
+                            num_parts=self._num_stages,
+                            method=seg_method)
         self.segment_parts = seg.do_segment()
 
-        logger.info("segment result:" + ", ".join(
-            str(arg) for arg in self.segment_parts))
+        logger.info("segment result:" +
+                    ", ".join(str(arg) for arg in self.segment_parts))
 
         self._start_pos = self.segment_parts[self._stage_id]
         self._end_pos = self.segment_parts[self._stage_id + 1]
@@ -357,13 +363,13 @@ def _build_layer(self):
                         setattr(param, "is_firstly_shared", True)
 
                 if layer.forward_func is None:
-                    self.run_function.append(self.shared_layers[
-                        layer.layer_name])
+                    self.run_function.append(
+                        self.shared_layers[layer.layer_name])
 
                 else:
                     self.run_function.append(
-                        partial(layer.forward_func, self.shared_layers[
-                            layer.layer_name]))
+                        partial(layer.forward_func,
+                                self.shared_layers[layer.layer_name]))
 
             elif isinstance(layer, LayerDesc):
                 model = layer.build_layer()
@@ -373,6 +379,7 @@ def _build_layer(self):
                 self.run_function.append(layer)
 
     def forward_function(self, start, end):
+
         def execute_func(*x):
             if len(x) == 1:
                 x = x[0]
@@ -403,8 +410,8 @@ def forward(self, input):
         return input
 
     def _need_recompute(self, funcs, inputs):
-        if not any(input_.stop_gradient == False for input_ in inputs
-                   if isinstance(input_, paddle.Tensor)):
+        if not any(input_.stop_gradient == False
+                   for input_ in inputs if isinstance(input_, paddle.Tensor)):
             return False
 
         params = [f.parameters() for f in funcs if isinstance(f, Layer)]
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py
index a59d86f129197..fdbf0312db664 100644
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py
@@ -105,12 +105,13 @@ def determinate_seed(rng_name):
     helper = LayerHelper('seed', **locals())
     out = helper.create_variable_for_type_inference(dtype=paddle.int32)
     # set force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang
-    helper.append_op(
-        type='seed',
-        outputs={'Out': out},
-        attrs={'deterministic': True,
-               'rng_name': rng_name,
-               'force_cpu': True})
+    helper.append_op(type='seed',
+                     outputs={'Out': out},
+                     attrs={
+                         'deterministic': True,
+                         'rng_name': rng_name,
+                         'force_cpu': True
+                     })
     return out
 
 
@@ -218,15 +219,18 @@ def dropout(x,
     mask = helper.create_variable_for_type_inference(
         dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
 
-    helper.append_op(
-        type='dropout',
-        inputs={'X': [x],
-                'Seed': seed},
-        outputs={'Out': [out],
-                 'Mask': [mask]},
-        attrs={
-            'dropout_prob': p,
-            'is_test': not training,
-            'dropout_implementation': mode,
-        })
+    helper.append_op(type='dropout',
+                     inputs={
+                         'X': [x],
+                         'Seed': seed
+                     },
+                     outputs={
+                         'Out': [out],
+                         'Mask': [mask]
+                     },
+                     attrs={
+                         'dropout_prob': p,
+                         'is_test': not training,
+                         'dropout_implementation': mode,
+                     })
     return out
diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
index d2171920f2bb6..3135c5379e880 100755
--- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -29,6 +29,7 @@
 
 
 class PipelineParallel(MetaParallelBase):
+
     def __init__(self, layers, hcg, strategy):
         if not isinstance(layers, PipelineLayer):
             raise TypeError(
@@ -239,9 +240,10 @@ def _forward_step(self, input_tensor):
                 assert self._layers._loss_fn is not None, "loss function should exist to compute loss"
                 labels = self._load_micro_batch(self.micro_batch_id)
                 output_tensor = self._layers._loss_fn(output_tensor, labels)
-                assert isinstance(output_tensor, (
-                    paddle.Tensor, core.eager.Tensor
-                )), "Currently, loss_fn should obtain Paddle.Tensor dtype"
+                assert isinstance(
+                    output_tensor,
+                    (paddle.Tensor, core.eager.Tensor
+                     )), "Currently, loss_fn should obtain Paddle.Tensor dtype"
 
                 with paddle.amp.auto_cast(enable=False):
                     if self.accumulate_steps > 1:
@@ -270,9 +272,8 @@ def _backward_step(self, input_tensor, output_tensor, output_tensor_grad):
                         tensors=outputs,
                         grad_tensors=[t for t in output_tensor_grad])
                 else:
-                    paddle.autograd.backward(
-                        tensors=[output_tensor],
-                        grad_tensors=[output_tensor_grad])
+                    paddle.autograd.backward(tensors=[output_tensor],
+                                             grad_tensors=[output_tensor_grad])
 
             input_tensor_grad = None
             if input_tensor is not None:
@@ -327,16 +328,14 @@ def _broadcast_final_loss(self):
             loss = self.total_loss.detach()
             is_fp32 = paddle.to_tensor(
                 1) if loss.dtype == paddle.float32 else paddle.to_tensor(0)
-            paddle.distributed.broadcast(
-                is_fp32,
-                src=self.global_rank,
-                use_calc_stream=True,
-                group=self.pp_group)
-            paddle.distributed.broadcast(
-                loss,
-                src=self.global_rank,
-                use_calc_stream=True,
-                group=self.pp_group)
+            paddle.distributed.broadcast(is_fp32,
+                                         src=self.global_rank,
+                                         use_calc_stream=True,
+                                         group=self.pp_group)
+            paddle.distributed.broadcast(loss,
+                                         src=self.global_rank,
+                                         use_calc_stream=True,
+                                         group=self.pp_group)
         else:
             is_fp32 = paddle.to_tensor(1)
             paddle.distributed.broadcast(
@@ -344,10 +343,10 @@ def _broadcast_final_loss(self):
                 src=self._hcg.get_rank_from_stage(self.num_stages - 1),
                 use_calc_stream=True,
                 group=self.pp_group)
-            loss = paddle.zeros(
-                shape=[1],
-                dtype="float32") if is_fp32.numpy()[0] else paddle.zeros(
-                    shape=[1], dtype="float16")
+            loss = paddle.zeros(shape=[
+                1
+            ], dtype="float32") if is_fp32.numpy()[0] else paddle.zeros(
+                shape=[1], dtype="float16")
             paddle.distributed.broadcast(
                 loss,
                 src=self._hcg.get_rank_from_stage(self.num_stages - 1),
diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
index de36f8503a651..17c7f5a9bbc4a 100644
--- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -155,7 +155,7 @@ def _is_valid_send_recv_partial(tensor, mp_degree):
         assert tensor_numel != 0, "can't send/recv zero element"
         return mp_degree > 1 and tensor_numel % mp_degree == 0
     elif in_dygraph_mode():
-        # TODO(shenliang03) support mp+pp optimizer in future. 
+        # TODO(shenliang03) support mp+pp optimizer in future.
         # (partial_send/partial_recv/partial_allgather_)
         return False
 
@@ -175,11 +175,10 @@ def send_partial(tensor,
                                    use_calc_stream, 'ring_id', ring_id, 'peer',
                                    dst, 'num', nranks, 'id', rank_id)
     else:
-        return paddle.distributed.send(
-            tensor.detach(),
-            dst=dst,
-            group=group,
-            use_calc_stream=use_calc_stream)
+        return paddle.distributed.send(tensor.detach(),
+                                       dst=dst,
+                                       group=group,
+                                       use_calc_stream=use_calc_stream)
 
 
 def recv_partial(tensor,
@@ -198,11 +197,10 @@ def recv_partial(tensor,
                             'id', rank_id, 'dtype', tensor.dtype, 'out_shape',
                             tensor.shape)
     else:
-        paddle.distributed.recv(
-            tensor.detach(),
-            src=src,
-            group=group,
-            use_calc_stream=use_calc_stream)
+        paddle.distributed.recv(tensor.detach(),
+                                src=src,
+                                group=group,
+                                use_calc_stream=use_calc_stream)
 
 
 def allgather_partial(tensor,
@@ -244,8 +242,8 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
         if isinstance(recv_shape_msg, tuple):
             tensor_recv_prev = []
             for idx, shape in enumerate(recv_shape_msg):
-                tmp = paddle.empty(
-                    shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx]))
+                tmp = paddle.empty(shape=shape,
+                                   dtype=number_2_dtype(recv_dtype_msg[idx]))
                 tmp.stop_gradient = recv_stop_gradient[idx]
                 tensor_recv_prev.append(tmp)
             tensor_recv_prev = tuple(tensor_recv_prev)
@@ -260,8 +258,8 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
             tensor_recv_next = []
             for idx, shape in enumerate(send_shape_msg):
                 tensor_recv_next.append(
-                    paddle.empty(
-                        shape=shape, dtype=number_2_dtype(send_dtype_msg[idx])))
+                    paddle.empty(shape=shape,
+                                 dtype=number_2_dtype(send_dtype_msg[idx])))
             tensor_recv_next = tuple(tensor_recv_next)
         else:
             tensor_recv_next = paddle.empty(
@@ -272,107 +270,95 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
         if isinstance(tensor_send_prev, tuple):
             for d in tensor_send_prev:
                 paddle.distributed.wait(d, use_calc_stream=True)
-                send_partial(
-                    d,
-                    dst=0,
-                    nranks=mp_degree,
-                    rank_id=mp_rank,
-                    group=_hcg.send_prev_group,
-                    use_calc_stream=False)
+                send_partial(d,
+                             dst=0,
+                             nranks=mp_degree,
+                             rank_id=mp_rank,
+                             group=_hcg.send_prev_group,
+                             use_calc_stream=False)
         else:
             paddle.distributed.wait(tensor_send_prev, use_calc_stream=True)
-            send_partial(
-                tensor_send_prev,
-                dst=0,
-                nranks=mp_degree,
-                rank_id=mp_rank,
-                group=_hcg.send_prev_group,
-                use_calc_stream=False)
+            send_partial(tensor_send_prev,
+                         dst=0,
+                         nranks=mp_degree,
+                         rank_id=mp_rank,
+                         group=_hcg.send_prev_group,
+                         use_calc_stream=False)
 
     if tensor_recv_prev is not None:
         if isinstance(tensor_recv_prev, tuple):
             for d in tensor_recv_prev:
-                recv_partial(
-                    d,
-                    src=0,
-                    nranks=mp_degree,
-                    rank_id=mp_rank,
-                    group=_hcg.recv_prev_group,
-                    use_calc_stream=True)
-                allgather_partial(
-                    d,
-                    nranks=mp_degree,
-                    rank_id=mp_rank,
-                    group=mp_group,
-                    use_calc_stream=True)
+                recv_partial(d,
+                             src=0,
+                             nranks=mp_degree,
+                             rank_id=mp_rank,
+                             group=_hcg.recv_prev_group,
+                             use_calc_stream=True)
+                allgather_partial(d,
+                                  nranks=mp_degree,
+                                  rank_id=mp_rank,
+                                  group=mp_group,
+                                  use_calc_stream=True)
         else:
-            recv_partial(
-                tensor_recv_prev,
-                src=0,
-                nranks=mp_degree,
-                rank_id=mp_rank,
-                group=_hcg.recv_prev_group,
-                use_calc_stream=True)
-            allgather_partial(
-                tensor_recv_prev,
-                nranks=mp_degree,
-                rank_id=mp_rank,
-                group=mp_group,
-                use_calc_stream=True)
+            recv_partial(tensor_recv_prev,
+                         src=0,
+                         nranks=mp_degree,
+                         rank_id=mp_rank,
+                         group=_hcg.recv_prev_group,
+                         use_calc_stream=True)
+            allgather_partial(tensor_recv_prev,
+                              nranks=mp_degree,
+                              rank_id=mp_rank,
+                              group=mp_group,
+                              use_calc_stream=True)
 
     if tensor_send_next is not None:
         if isinstance(tensor_send_next, tuple):
             for d in tensor_send_next:
                 paddle.distributed.wait(d, use_calc_stream=True)
-                send_partial(
-                    d,
-                    dst=1,
-                    nranks=mp_degree,
-                    rank_id=mp_rank,
-                    group=_hcg.send_next_group,
-                    use_calc_stream=False)
+                send_partial(d,
+                             dst=1,
+                             nranks=mp_degree,
+                             rank_id=mp_rank,
+                             group=_hcg.send_next_group,
+                             use_calc_stream=False)
         else:
             paddle.distributed.wait(tensor_send_next, use_calc_stream=True)
-            send_partial(
-                tensor_send_next,
-                dst=1,
-                nranks=mp_degree,
-                rank_id=mp_rank,
-                group=_hcg.send_next_group,
-                use_calc_stream=False)
+            send_partial(tensor_send_next,
+                         dst=1,
+                         nranks=mp_degree,
+                         rank_id=mp_rank,
+                         group=_hcg.send_next_group,
+                         use_calc_stream=False)
 
     if tensor_recv_next is not None:
         if isinstance(tensor_recv_next, tuple):
             for d in tensor_recv_next:
-                recv_partial(
-                    d,
-                    src=1,
-                    nranks=mp_degree,
-                    rank_id=mp_rank,
-                    group=_hcg.recv_next_group,
-                    use_calc_stream=True)
-                allgather_partial(
-                    d,
-                    nranks=mp_degree,
-                    rank_id=mp_rank,
-                    group=mp_group,
-                    use_calc_stream=True)
+                recv_partial(d,
+                             src=1,
+                             nranks=mp_degree,
+                             rank_id=mp_rank,
+                             group=_hcg.recv_next_group,
+                             use_calc_stream=True)
+                allgather_partial(d,
+                                  nranks=mp_degree,
+                                  rank_id=mp_rank,
+                                  group=mp_group,
+                                  use_calc_stream=True)
 
         else:
-            recv_partial(
-                tensor_recv_next,
-                src=1,
-                nranks=mp_degree,
-                rank_id=mp_rank,
-                group=_hcg.recv_next_group,
-                use_calc_stream=True)
-
-            allgather_partial(
-                tensor_recv_next,
-                nranks=mp_degree,
-                rank_id=mp_rank,
-                group=mp_group,
-                use_calc_stream=True)
+            recv_partial(tensor_recv_next,
+                         src=1,
+                         nranks=mp_degree,
+                         rank_id=mp_rank,
+                         group=_hcg.recv_next_group,
+                         use_calc_stream=True)
+
+            allgather_partial(tensor_recv_next,
+                              nranks=mp_degree,
+                              rank_id=mp_rank,
+                              group=mp_group,
+                              use_calc_stream=True)
     return tensor_recv_prev, tensor_recv_next
 
 
@@ -384,11 +370,10 @@ def recv_forward():
             _send_recv_meta.recv_meta(_hcg.recv_prev_group)
             _send_recv_meta.has_recv_meta = _use_cache
 
-        input_tensor, _ = _p2p_helper(
-            tensor_send_next=None,
-            tensor_send_prev=None,
-            recv_prev=True,
-            recv_next=False)
+        input_tensor, _ = _p2p_helper(tensor_send_next=None,
+                                      tensor_send_prev=None,
+                                      recv_prev=True,
+                                      recv_next=False)
     return input_tensor
 
 
@@ -396,11 +381,10 @@ def recv_backward():
     if _hcg.is_last_stage:
         output_tensor_grad = None
     else:
-        _, output_tensor_grad = _p2p_helper(
-            tensor_send_next=None,
-            tensor_send_prev=None,
-            recv_prev=False,
-            recv_next=True)
+        _, output_tensor_grad = _p2p_helper(tensor_send_next=None,
+                                            tensor_send_prev=None,
+                                            recv_prev=False,
+                                            recv_next=True)
     return output_tensor_grad
 
 
@@ -411,31 +395,28 @@ def send_forward(output_tensor):
             _send_recv_meta.send_meta(output_tensor, _hcg.send_next_group)
             _send_recv_meta.has_send_meta = _use_cache
 
-        _p2p_helper(
-            tensor_send_next=output_tensor,
-            tensor_send_prev=None,
-            recv_prev=False,
-            recv_next=False)
+        _p2p_helper(tensor_send_next=output_tensor,
+                    tensor_send_prev=None,
+                    recv_prev=False,
+                    recv_next=False)
 
 
 def send_backward(input_tensor_grad):
     if not _hcg.is_first_stage:
-        _p2p_helper(
-            tensor_send_next=None,
-            tensor_send_prev=input_tensor_grad,
-            recv_prev=False,
-            recv_next=False)
+        _p2p_helper(tensor_send_next=None,
+                    tensor_send_prev=input_tensor_grad,
+                    recv_prev=False,
+                    recv_next=False)
 
 
 def send_forward_recv_backward(output_tensor):
     if _hcg.is_last_stage:
         output_tensor_grad = None
     else:
-        _, output_tensor_grad = _p2p_helper(
-            tensor_send_next=output_tensor,
-            tensor_send_prev=None,
-            recv_prev=False,
-            recv_next=True)
+        _, output_tensor_grad = _p2p_helper(tensor_send_next=output_tensor,
+                                            tensor_send_prev=None,
+                                            recv_prev=False,
+                                            recv_next=True)
     return output_tensor_grad
 
 
@@ -443,9 +424,8 @@ def send_backward_recv_forward(input_tensor_grad):
     if _hcg.is_first_stage:
         input_tensor = None
     else:
-        input_tensor, _ = _p2p_helper(
-            tensor_send_next=None,
-            tensor_send_prev=input_tensor_grad,
-            recv_prev=True,
-            recv_next=False)
+        input_tensor, _ = _p2p_helper(tensor_send_next=None,
+                                      tensor_send_prev=input_tensor_grad,
+                                      recv_prev=True,
+                                      recv_next=False)
     return input_tensor
diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py
index 6c8badd64e161..4fed58fe133dd 100644
--- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py
@@ -164,7 +164,7 @@ class _HPEagerRecomputeFunction(EagerPyLayer):
     def forward(ctx, run_function, all_outputs, *args):
         check_recompute_necessary(args)
 
-        # store for recomputing 
+        # store for recomputing
         ctx.run_function = run_function
 
         # store the rng states
@@ -237,8 +237,8 @@ def backward(ctx, *args):
             for i, idx in enumerate(tensor_indices):
                 if _recompute_partition:
                     state = tensors[i].stop_gradient
-                    tensors[i] = _merge_activation(tensors[i]).detach(
-                    ).reshape_(tensor_shapes[i])
+                    tensors[i] = _merge_activation(
+                        tensors[i]).detach().reshape_(tensor_shapes[i])
                     tensors[i].stop_gradient = state
                 inputs[idx] = tensors[i].cuda(
                     device_id) if _recompute_offload else tensors[i]
@@ -249,11 +249,10 @@ def backward(ctx, *args):
             # need restore auto_cast state as well as w/b list
             with swith_rng_state_tracker(ctx.fwd_cuda_rng_state,
                                          ctx.fwd_cuda_rng_state_tracker):
-                with paddle.amp.auto_cast(
-                        enable=ctx.is_fw_autocast,
-                        custom_white_list=ctx.amp_white_list,
-                        custom_black_list=ctx.amp_black_list,
-                        level=ctx.amp_level):
+                with paddle.amp.auto_cast(enable=ctx.is_fw_autocast,
+                                          custom_white_list=ctx.amp_white_list,
+                                          custom_black_list=ctx.amp_black_list,
+                                          level=ctx.amp_level):
                     detached_inputs = detach_variable(tuple(inputs))
                     outputs = ctx.run_function(*detached_inputs)
 
@@ -276,7 +275,7 @@ def backward(ctx, *args):
                     "none of output has stop_gradient=False, this recompute() is not necessary"
                 )
 
-            # actually backward            
+            # actually backward
             paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
             grads = tuple(inp._grad_ivar() for inp in detached_inputs
                           if isinstance(inp, core.eager.Tensor))
@@ -296,7 +295,7 @@ class _HPRecomputeFunction(PyLayer):
     def forward(ctx, run_function, all_outputs, *args):
         check_recompute_necessary(args)
 
-        # store for recomputing 
+        # store for recomputing
         ctx.run_function = run_function
 
         # store the rng states
@@ -369,8 +368,8 @@ def backward(ctx, *args):
             for i, idx in enumerate(tensor_indices):
                 if _recompute_partition:
                     state = tensors[i].stop_gradient
-                    tensors[i] = _merge_activation(tensors[i]).detach(
-                    ).reshape_(tensor_shapes[i])
+                    tensors[i] = _merge_activation(
+                        tensors[i]).detach().reshape_(tensor_shapes[i])
                     tensors[i].stop_gradient = state
                 inputs[idx] = tensors[i].cuda(
                     device_id) if _recompute_offload else tensors[i]
@@ -381,11 +380,10 @@ def backward(ctx, *args):
             # need restore auto_cast state as well as w/b list
             with swith_rng_state_tracker(ctx.fwd_cuda_rng_state,
                                          ctx.fwd_cuda_rng_state_tracker):
-                with paddle.amp.auto_cast(
-                        enable=ctx.is_fw_autocast,
-                        custom_white_list=ctx.amp_white_list,
-                        custom_black_list=ctx.amp_black_list,
-                        level=ctx.amp_level):
+                with paddle.amp.auto_cast(enable=ctx.is_fw_autocast,
+                                          custom_white_list=ctx.amp_white_list,
+                                          custom_black_list=ctx.amp_black_list,
+                                          level=ctx.amp_level):
                     detached_inputs = detach_variable(tuple(inputs))
                     outputs = ctx.run_function(*detached_inputs)
 
@@ -407,7 +405,7 @@ def backward(ctx, *args):
                     "none of output has stop_gradient=False, this recompute() is not necessary"
                 )
 
-            # actually backward            
+            # actually backward
             paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
             grads = tuple(inp._grad_ivar() for inp in detached_inputs
                           if isinstance(inp, core.VarBase))
@@ -415,7 +413,7 @@ def backward(ctx, *args):
 
 
 def _hp_recompute(function, *args):
-    # NODTE(shenliang03)The current hybrid parallel recompute has limitations. 
+    # NOTE(shenliang03): The current hybrid parallel recompute has limitations.
     # It cannot handle the following situations:
     # 1. The calculation output of recompute, there are tensors that do not require gradients.
     # 2. The forward output tensor has no gradient. This problem can be solved temporarily by detach().
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
index 70d2d2a1930c9..7bdbe2ce32e47 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
@@ -55,7 +55,7 @@ class GroupShardedOptimizerStage2(Optimizer):
 
     """
 
-    # TODO (Baibaifan) 
+    # TODO (Baibaifan)
     # Feature Notes:
     # 1. Unified memory for parameters and parameters.grad to InternalStorage.
     # 2. Support the segmentation of optimizer parameters and partial updating of parameters.
@@ -103,8 +103,8 @@ def __init__(self,
                 filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                        self._local_params))) > 0
 
-        self._group = new_group(_get_global_group()
-                                .ranks) if group is None else group
+        self._group = new_group(
+            _get_global_group().ranks) if group is None else group
 
         self.world_size = self._group.nranks
         self._rank = self._group.rank
@@ -152,11 +152,10 @@ def _sync_params_and_buffers(self):
         """
 
         for p in self._local_params:
-            broadcast(
-                p,
-                src=self._global_root_rank,
-                group=self._group,
-                use_calc_stream=True)
+            broadcast(p,
+                      src=self._global_root_rank,
+                      group=self._group,
+                      use_calc_stream=True)
 
     def _generate_master_params(self, trainable_params):
         if self.offload:
@@ -225,8 +224,9 @@ def dtype_rank_params(self):
             # Assign the parameters of each rank according to the type
             for param in self._local_params:
                 if param.dtype not in self._dtype_rank_params.keys():
-                    self._dtype_rank_params[
-                        param.dtype] = [[] for _ in range(self.world_size)]
+                    self._dtype_rank_params[param.dtype] = [
+                        [] for _ in range(self.world_size)
+                    ]
                 self._dtype_rank_params[param.dtype][self.param2rank[
                     param.name]].append(param)
 
@@ -410,8 +410,7 @@ def _broadcast_params(self):
         # Exchange all the shards with the other ranks
         for dtype_per_rank in self.param_storages.values():
             for dst_rank, internal_storage in dtype_per_rank.items():
-                broadcast(
-                    tensor=internal_storage.buffer,
-                    src=self._group.ranks[dst_rank],
-                    group=self._group,
-                    use_calc_stream=True)
+                broadcast(tensor=internal_storage.buffer,
+                          src=self._group.ranks[dst_rank],
+                          group=self._group,
+                          use_calc_stream=True)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py
index 0c045c45fd599..39e92f8878028 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py
@@ -53,7 +53,7 @@ class GroupShardedStage2(nn.Layer):
     .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
     """
 
-    # TODO (Baibaifan) 
+    # TODO (Baibaifan)
     # Feature Notes::
     # 1. Unified memory for param and param.grad to InternalStorage.
     # 2. Divide param.grad according to rank to centrally apply for and release GPU memory.
@@ -74,8 +74,9 @@ def __init__(
 
         # training options
         self._layer = layer
-        self._sharding_optimizers = [sharding_optimizer] if not isinstance(
-            sharding_optimizer, list) else sharding_optimizer
+        self._sharding_optimizers = [
+            sharding_optimizer
+        ] if not isinstance(sharding_optimizer, list) else sharding_optimizer
         assert all(
             list(
                 map(lambda opt: isinstance(opt, GroupShardedOptimizerStage2),
@@ -85,8 +86,8 @@ def __init__(
         self._auto_refresh_trainable = auto_refresh_trainable
 
         # Communication related attributes
-        self._group = collective.new_group(collective._get_global_group()
-                                           .ranks) if group is None else group
+        self._group = collective.new_group(
+            collective._get_global_group().ranks) if group is None else group
         self._world_size_scaling = 1.0 / self._group.nranks
         assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
         self._rank = self._group.rank
@@ -166,8 +167,8 @@ def forward(self, *inputs, **kwargs):
         return fw
 
     def set_state_dict(self, state_dict, use_structured_name=True):
-        self._layer.set_state_dict(
-            state_dict, use_structured_name=use_structured_name)
+        self._layer.set_state_dict(state_dict,
+                                   use_structured_name=use_structured_name)
 
     def state_dict(self,
                    destination=None,
@@ -228,7 +229,7 @@ def _init_internal_storage(self, needs_fresh):
         else:
             self._build_grad_storages()
 
-        # Clear all flags state 
+        # Clear all flags state
         self._clear_counters()
 
     def to(self, device=None, dtype=None, blocking=True):
@@ -282,11 +283,10 @@ def __sync_buffers(self):
         """
 
         for buffer in self._layer.buffers(include_sublayers=True):
-            collective.broadcast(
-                buffer,
-                self._global_root_rank,
-                self._group,
-                use_calc_stream=True)
+            collective.broadcast(buffer,
+                                 self._global_root_rank,
+                                 self._group,
+                                 use_calc_stream=True)
 
     def __getattr__(self, name):
         """Forward missing attributes to wrapped layer."""
@@ -337,10 +337,9 @@ def cleanup():
                             param.clear_gradient(False)
 
                     # Synchronize the reduce parameter gradient
-                    collective.reduce(
-                        tensor=param.grad,
-                        dst=self._group.ranks[dst_rank],
-                        group=self._group)
+                    collective.reduce(tensor=param.grad,
+                                      dst=self._group.ranks[dst_rank],
+                                      group=self._group)
                     #  TODO (Baibaifan) Asynchronous the reduce parameter gradient
 
                     # Clear the task flow and trigger callback to clear the redundant gradient
@@ -452,10 +451,10 @@ def _setup_use_grad_storage(self):
             else:
                 self._param_grads.append(param.name)
                 print(
-                    "Can not add param: {}, param's shape: {}, param align: {}, grad_storages fill: {}, ".
-                    format(param.name, param.shape, self._trainable_param2align[
-                        param.name], self._grad_storages[param.dtype][dst_rank]
-                           ._fill))
+                    "Can not add param: {}, param's shape: {}, param align: {}, grad_storages fill: {}, "
+                    .format(param.name, param.shape,
+                            self._trainable_param2align[param.name],
+                            self._grad_storages[param.dtype][dst_rank]._fill))
 
         for dtype in self._grad_storages.keys():
             self._grad_storage_list.extend(
@@ -511,15 +510,15 @@ def _rank_buffer_size(self, buffer_max_size, model_size):
         if Type.fp16.value in rank_buffer_size.keys():
             # FP16 GradStorage and model size
             logger_.info(
-                "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======".
-                format(rank_buffer_size[Type.fp16.value] / 2**19, model_size / 2
-                       **19))
+                "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======"
+                .format(rank_buffer_size[Type.fp16.value] / 2**19,
+                        model_size / 2**19))
         if Type.fp32.value in rank_buffer_size.keys():
             # FP32 GradStorage and model size
             logger_.info(
-                "====== FP32 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======".
-                format(rank_buffer_size[Type.fp32.value] / 2**18, model_size / 2
-                       **18))
+                "====== FP32 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======"
+                .format(rank_buffer_size[Type.fp32.value] / 2**18,
+                        model_size / 2**18))
         return rank_buffer_size
 
     def _redefine_opt_step(self):
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
index e44b5d2515d83..0d6bfcf922431 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
@@ -45,7 +45,9 @@ def _all_gather(tensor, buffer_size, group):
 
 
 # CUDA alignment 256 bytes
-alignment = {"gpu": 256, }
+alignment = {
+    "gpu": 256,
+}
 align = {
     Type.fp16.value: 2,
     Type.fp32.value: 4,
@@ -64,7 +66,7 @@ class GroupShardedStage3(nn.Layer):
     .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
     """
 
-    # TODO (Baibaifan) 
+    # TODO (Baibaifan)
     # Feature Notes::
     # 1. The model supports the segmentation of parameters by global ranks in layers.
     # 2. Support communication flow and computing flow.
@@ -98,14 +100,14 @@ def __init__(self,
         DEV = "cpu" if paddle.get_device() == "cpu" else paddle.get_device(
         ).split(":")[0]
         global DEV_ID
-        DEV_ID = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
-                                                            .split(":")[1])
+        DEV_ID = 0 if paddle.get_device() == "cpu" else int(
+            paddle.get_device().split(":")[1])
         global param2dtype
         param2dtype = dict()
 
         # Communication group establishment
-        self._group = collective.new_group(collective._get_global_group()
-                                           .ranks) if group is None else group
+        self._group = collective.new_group(
+            collective._get_global_group().ranks) if group is None else group
         self._world_size_scaling = 1.0 / self._group.nranks
         assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1."
         self._rank = self._group.rank
@@ -176,11 +178,10 @@ def _sync_params_and_buffers(self):
         """
 
         for p in self._layer.parameters():
-            collective.broadcast(
-                p,
-                src=self._global_root_rank,
-                group=self._group,
-                use_calc_stream=True)
+            collective.broadcast(p,
+                                 src=self._global_root_rank,
+                                 group=self._group,
+                                 use_calc_stream=True)
 
     def _clear_gradients(self):
         assert len(self._trainable_params.keys()) > 0
@@ -190,9 +191,9 @@ def _clear_gradients(self):
             filter(lambda p: p.trainable and p not in self._unslice_params,
                    current_layer_params))
         for param in trainable_params:
-            assert hasattr(
-                param, "fw_storage"
-            ), "Find {} don't have fw_storage attribute.".format(param.name)
+            assert hasattr(param, "fw_storage"
+                           ), "Find {} don't have fw_storage attribute.".format(
+                               param.name)
 
             param.fw_storage.clear_gradient(False)
             param.bw_storage._clear()
@@ -250,8 +251,8 @@ def forward(self, *inputs, **kwargs):
         return fw
 
     def set_state_dict(self, state_dict, use_structured_name=True):
-        self._layer.set_state_dict(
-            state_dict, use_structured_name=use_structured_name)
+        self._layer.set_state_dict(state_dict,
+                                   use_structured_name=use_structured_name)
 
     def state_dict(self,
                    destination=None,
@@ -376,16 +377,16 @@ def _param_storage(self, param, buffer_size):
         if self._offload:
             with device_guard():
                 tmp_tensor = buffer._slice(start, end)
-            param.fw_storage = core.eager.Tensor(
-                value=tmp_tensor,
-                place=core.CPUPlace(),
-                name="slice@" + param.name)
+            param.fw_storage = core.eager.Tensor(value=tmp_tensor,
+                                                 place=core.CPUPlace(),
+                                                 name="slice@" + param.name)
             with device_guard():
                 param.master_weight = paddle.cast(param.fw_storage,
                                                   Type.fp32.value)
         else:
-            param.fw_storage = core.eager.Tensor(
-                value=buffer._slice(start, end), name="slice@" + param.name)
+            param.fw_storage = core.eager.Tensor(value=buffer._slice(
+                start, end),
+                                                 name="slice@" + param.name)
         param.status = "part"
 
         # Updata optimizer master weights
@@ -414,6 +415,7 @@ def _register_forward_hooks(self, layer):
             self._register_forward_hooks(sub_layer)
 
     def _register_forward_all_hooks(self, sub_layer, task_flow):
+
         def _forward_pre_hook(layer, inputs):
             return ForwardPreHooks(layer, self._order_tracer,
                                    self._trainable_params,
@@ -421,10 +423,12 @@ def _forward_pre_hook(layer, inputs):
                                    self._sync_comm, self._offload, task_flow)
 
         def _forward_post_hook(layer, inputs, outputs):
-            return ForwardPostHooks.apply(
-                outputs, layer, self._order_tracer, self._trainable_params,
-                self._param2buffer, self._param2buffer_size, self._rank,
-                self._group, self._sync_comm, self._offload, task_flow)
+            return ForwardPostHooks.apply(outputs, layer, self._order_tracer,
+                                          self._trainable_params,
+                                          self._param2buffer,
+                                          self._param2buffer_size, self._rank,
+                                          self._group, self._sync_comm,
+                                          self._offload, task_flow)
 
         # register previous forward hooks
         sub_layer.register_forward_pre_hook(_forward_pre_hook)
@@ -439,11 +443,10 @@ def _sync_buffers(self):
         """
 
         for buffer in self._layer.buffers(include_sublayers=True):
-            collective.broadcast(
-                buffer,
-                self._global_root_rank,
-                self._group,
-                use_calc_stream=True)
+            collective.broadcast(buffer,
+                                 self._global_root_rank,
+                                 self._group,
+                                 use_calc_stream=True)
 
     def __getattr__(self, name):
         """Forward missing attributes to wrapped layer."""
@@ -507,15 +510,14 @@ def get_all_parameters(self, convert2cpu=False):
         trainable_params = list(
             filter(lambda p: p.trainable and p not in self._unslice_params,
                    current_layer_params))
-        t_flow = _allgather_buffer(
-            trainable_params,
-            self._group,
-            param2buffer_size=self._param2buffer_size,
-            use_calc_stream=True,
-            task_flow=TaskFlow(),
-            sync_wait=True,
-            offload=self._offload,
-            convert2cpu=convert2cpu)
+        t_flow = _allgather_buffer(trainable_params,
+                                   self._group,
+                                   param2buffer_size=self._param2buffer_size,
+                                   use_calc_stream=True,
+                                   task_flow=TaskFlow(),
+                                   sync_wait=True,
+                                   offload=self._offload,
+                                   convert2cpu=convert2cpu)
         if convert2cpu:
             for param in trainable_params:
                 t_flow.full_param[param.name][0]._share_buffer_to(param)
@@ -534,6 +536,7 @@ def _register_backward_hooks(self):
             param._register_backward_hook(allreduce_function)
 
     def _get_allreduce_fn(self, param):
+
         @paddle.autograd.no_grad()
         def allreduce_(*_):
             if param.name in self._task_flow.full_grad.keys():
@@ -552,8 +555,8 @@ def allreduce_(*_):
                         cpu_grad = _device2cpu(
                             full_grad._slice(start, end).detach().clone(), True)
                         with device_guard():
-                            param.bw_storage = paddle.add(param.bw_storage,
-                                                          cpu_grad)
+                            param.bw_storage = paddle.add(
+                                param.bw_storage, cpu_grad)
                     else:
                         param.bw_storage = paddle.add(
                             param.bw_storage,
@@ -566,8 +569,8 @@ def allreduce_(*_):
                     param.use_count = 0
                     param._clear_data()
                     start, end = self._param2buffer[param.name][self._rank]
-                    param.fw_storage = self._task_flow.full_param[param.name][
-                        0]._slice(start, end).detach().clone()
+                    param.fw_storage = self._task_flow.full_param[
+                        param.name][0]._slice(start, end).detach().clone()
                     param.status = "part"
                     del self._task_flow.full_param[param.name]
 
@@ -639,19 +642,19 @@ def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer_size,
         order_ = order_tracer[layer_id]
         layer_id = order_tracer["layer"][order_ + 1]
 
-    _allgather_buffer(
-        trainable_params[layer_id],
-        group,
-        param2buffer_size=param2buffer_size,
-        use_calc_stream=use_calc,
-        task_flow=task_flow,
-        sync_wait=sync_wait,
-        offload=offload)
+    _allgather_buffer(trainable_params[layer_id],
+                      group,
+                      param2buffer_size=param2buffer_size,
+                      use_calc_stream=use_calc,
+                      task_flow=task_flow,
+                      sync_wait=sync_wait,
+                      offload=offload)
 
     return
 
 
 class ForwardPostHooks(EagerPyLayer):
+
     @staticmethod
     def forward(ctx, inputs, layer, order_tracer, trainable_params,
                 param2buffer, param2buffer_size, rank, group, sync_comm,
@@ -668,7 +671,7 @@ def forward(ctx, inputs, layer, order_tracer, trainable_params,
             order_tracer["order"] += 1
             order_tracer["layer"].append(layer_id)
 
-        #Record fw info 
+        #Record fw info
         ctx.order_tracer = order_tracer
         ctx.task_flow = task_flow
         ctx.group = group
@@ -696,14 +699,13 @@ def backward(ctx, *args):
         # Allgather params synchronization
         if sync_comm:
             use_calc, sync_wait = True, True
-            _allgather_buffer(
-                trainable_params[layer_id],
-                group,
-                param2buffer_size=param2buffer_size,
-                use_calc_stream=use_calc,
-                task_flow=task_flow,
-                sync_wait=sync_wait,
-                offload=offload)
+            _allgather_buffer(trainable_params[layer_id],
+                              group,
+                              param2buffer_size=param2buffer_size,
+                              use_calc_stream=use_calc,
+                              task_flow=task_flow,
+                              sync_wait=sync_wait,
+                              offload=offload)
         else:
             _wait_layer(trainable_params[layer_id], task_flow, group,
                         param2buffer_size, use_calc, offload)
@@ -716,14 +718,13 @@ def backward(ctx, *args):
         task_flow.use_calc[layer_id] = use_calc
         if layer_id != order_tracer["layer"][0] and not sync_comm:
             layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1]
-            _allgather_buffer(
-                trainable_params[layer_next_id],
-                group,
-                param2buffer_size=param2buffer_size,
-                use_calc_stream=use_calc,
-                task_flow=task_flow,
-                sync_wait=sync_wait,
-                offload=offload)
+            _allgather_buffer(trainable_params[layer_next_id],
+                              group,
+                              param2buffer_size=param2buffer_size,
+                              use_calc_stream=use_calc,
+                              task_flow=task_flow,
+                              sync_wait=sync_wait,
+                              offload=offload)
 
         return args
 
@@ -757,8 +758,8 @@ def _release_param(trainable_params,
             if param.name in task_flow.full_param.keys():
                 start, end = param2buffer[param.name][rank]
                 with paddle.amp.auto_cast(enable=False):
-                    param.fw_storage = task_flow.full_param[param.name][
-                        0]._slice(start, end).detach().clone()
+                    param.fw_storage = task_flow.full_param[
+                        param.name][0]._slice(start, end).detach().clone()
                 param.status = "part"
                 del task_flow.full_param[param.name]
 
@@ -787,14 +788,13 @@ def _wait_layer(trainable_params,
             param.status = "all"
             param.use_count += 1
         else:
-            _allgather_buffer(
-                trainable_params,
-                group,
-                param2buffer_size=param2buffer_size,
-                use_calc_stream=True,
-                task_flow=task_flow,
-                sync_wait=True,
-                offload=offload)
+            _allgather_buffer(trainable_params,
+                              group,
+                              param2buffer_size=param2buffer_size,
+                              use_calc_stream=True,
+                              task_flow=task_flow,
+                              sync_wait=True,
+                              offload=offload)
             break
     return task_flow
 
@@ -831,7 +831,7 @@ def _allgather_buffer(trainable_params,
             param.use_count += 1
         task_flow.full_param[param.name] = (full_param, task)
 
-        # parameter converts to cpu 
+        # parameter converts to cpu
         if convert2cpu:
             p_name = param.name
             param = _device2cpu(param)
@@ -847,8 +847,8 @@ def _create_params_grad(trainable_params, param2buffer_size, task_flow):
         if param.name in task_flow.full_grad.keys():
             continue
         assert isinstance(param2buffer_size[param.name], int)
-        temp_grad = paddle.zeros(
-            [param2buffer_size[param.name]], dtype=param.dtype)
+        temp_grad = paddle.zeros([param2buffer_size[param.name]],
+                                 dtype=param.dtype)
         temp_tensor = temp_grad._slice(0, param._numel())
         temp_tensor.get_tensor()._set_dims(param.shape)
         param._copy_gradient_from(temp_tensor)
@@ -876,8 +876,9 @@ def _UnsliceParam(param):
 
 def _VarBaseWrapper(param):
     varbase = param.fw_storage
-    tmp_param = EagerParamBase(
-        shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name)
+    tmp_param = EagerParamBase(shape=varbase.shape,
+                               dtype=varbase.dtype,
+                               name="slice@" + param.name)
     varbase._share_buffer_to(tmp_param)
     tmp_param.regularizer = param.regularizer
     tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
index 4d706870a91e9..c44872491093e 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
@@ -64,8 +64,8 @@ def to(self, device, dtype=None, keep_alignment=True):
         Move the underlying buffer
         """
         assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it"
-        assert (dtype == Type.fp32.value or
-                Type.fp16.value), "Conversion type is not supported now"
+        assert (dtype == Type.fp32.value
+                or Type.fp16.value), "Conversion type is not supported now"
 
         if self._device != device:
             tmp_buffer = self.buffer.cuda(
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index eae8f87b01420..b1e0f6cc13068 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -45,6 +45,7 @@ class Type(Enum):
 
 
 class GroupShardedClipGrad:
+
     def __init__(self, clip, device, group):
         self._clip = clip
         self._device = device
@@ -82,8 +83,8 @@ def _dygraph_clip(self, params_grads):
         else:
             global_norm_fp16 = layers.concat(sum_square_fp16)
             global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
-            global_norm_fp16 = paddle.cast(
-                global_norm_fp16, dtype=paddle.float32)
+            global_norm_fp16 = paddle.cast(global_norm_fp16,
+                                           dtype=paddle.float32)
 
         # global norm of non-distributed FP16 params_and_grads for unslice parameters
         if len(unslice_params_fp16) == 0:
@@ -91,12 +92,12 @@ def _dygraph_clip(self, params_grads):
         else:
             global_unslice_fp16 = layers.concat(unslice_params_fp16)
             global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16)
-            global_unslice_fp16 = paddle.cast(
-                global_unslice_fp16, dtype=paddle.float32)
+            global_unslice_fp16 = paddle.cast(global_unslice_fp16,
+                                              dtype=paddle.float32)
 
         # global norm of non-distributed FP32 params_and_grads
-        global_norm_fp32 = layers.concat(sum_square_fp32) if len(
-            sum_square_fp32) != 0 else paddle.to_tensor(
+        global_norm_fp32 = layers.concat(
+            sum_square_fp32) if len(sum_square_fp32) != 0 else paddle.to_tensor(
                 [0.], dtype=paddle.float32)
         global_norm_fp32 = layers.reduce_sum(global_norm_fp32)
 
@@ -118,13 +119,14 @@ def _dygraph_clip(self, params_grads):
             paddle.distributed.all_reduce(global_norm_var, group=self._group)
 
         global_norm_var = layers.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
-
-        clip_var = layers.elementwise_div(
-            x=max_global_norm,
-            y=layers.elementwise_max(
-                x=global_norm_var, y=max_global_norm))
+        max_global_norm = layers.fill_constant(shape=[1],
+                                               dtype=global_norm_var.dtype,
+                                               value=self.clip_norm)
+
+        clip_var = layers.elementwise_div(x=max_global_norm,
+                                          y=layers.elementwise_max(
+                                              x=global_norm_var,
+                                              y=max_global_norm))
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
 
         for p, g in params_grads:
@@ -163,6 +165,7 @@ def device_guard(dev_id=0, device="cpu"):
 
 @dygraph_only
 def GroupShardedScaler(scaler):
+
     def unscale_method(self, optimizer):
         if not self._enable:
             return
@@ -201,8 +204,8 @@ def unscale_method(self, optimizer):
         temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
 
         device = "cpu" if optimizer.offload else "gpu"
-        dev_id = 0 if device == "cpu" else int(paddle.get_device().split(":")[
-            1])
+        dev_id = 0 if device == "cpu" else int(
+            paddle.get_device().split(":")[1])
 
         with device_guard(dev_id, device):
             if len(param_grads_fp16):
@@ -217,10 +220,9 @@ def unscale_method(self, optimizer):
         self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
         is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
 
-        paddle.distributed.all_reduce(
-            is_found_inf,
-            op=paddle.distributed.ReduceOp.MAX,
-            group=optimizer._group)
+        paddle.distributed.all_reduce(is_found_inf,
+                                      op=paddle.distributed.ReduceOp.MAX,
+                                      group=optimizer._group)
         self._found_inf = is_found_inf.numpy()[0]
 
     scaler._unscale = MethodType(unscale_method, scaler)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
index b09d256d9bb60..7834e6d93984e 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
@@ -54,7 +54,7 @@ class ShardingStage2(nn.Layer):
     .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
     """
 
-    # TODO (Baibaifan) 
+    # TODO (Baibaifan)
     # Feature Notes::
     # 1. Unified memory for param and param.grad to InternalStorage.
     # 2. Divide param.grad according to rank to centrally apply for and release GPU memory.
@@ -75,8 +75,9 @@ def __init__(
 
         # training options
         self._layer = layer
-        self._sharding_optimizers = [sharding_optimizer] if not isinstance(
-            sharding_optimizer, list) else sharding_optimizer
+        self._sharding_optimizers = [
+            sharding_optimizer
+        ] if not isinstance(sharding_optimizer, list) else sharding_optimizer
         assert all(
             list(
                 map(lambda opt: isinstance(opt, ShardingOptimizerStage2),
@@ -86,8 +87,8 @@ def __init__(
         self._auto_refresh_trainable = auto_refresh_trainable
 
         # Communication related attributes
-        self._group = dist.new_group(_get_global_group()
-                                     .ranks) if group is None else group
+        self._group = dist.new_group(
+            _get_global_group().ranks) if group is None else group
         self._world_size_scaling = 1.0 / self._group.nranks
         assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
         self._rank = self._group.rank
@@ -106,8 +107,8 @@ def __init__(
         self._param_grads = []
 
         # Set grad storage size & Display param sizes and model sizes
-        model_size = sum(
-            [np.prod(p.shape) for p in self._layer.parameters()]).item()
+        model_size = sum([np.prod(p.shape)
+                          for p in self._layer.parameters()]).item()
         assert buffer_max_size >= 0, "buffer_max_size must be GE than 0."
         self._buffer_max_size = self._rank_buffer_size(buffer_max_size,
                                                        model_size)
@@ -166,15 +167,16 @@ def forward(self, *inputs, **kwargs):
         return fw
 
     def set_state_dict(self, state_dict, use_structured_name=True):
-        self._layer.set_state_dict(
-            state_dict, use_structured_name=use_structured_name)
+        self._layer.set_state_dict(state_dict,
+                                   use_structured_name=use_structured_name)
 
     def state_dict(self,
                    destination=None,
                    include_sublayers=True,
                    structured_name_prefix=""):
-        return self._layer.state_dict(
-            destination=None, include_sublayers=True, structured_name_prefix="")
+        return self._layer.state_dict(destination=None,
+                                      include_sublayers=True,
+                                      structured_name_prefix="")
 
     def _clear_gradients(self):
         """
@@ -226,7 +228,7 @@ def _init_internal_storage(self, needs_fresh):
         else:
             self._build_grad_storages()
 
-        # Clear all flags state 
+        # Clear all flags state
         self._clear_counters()
 
     def to(self, device=None, dtype=None, blocking=True):
@@ -280,11 +282,10 @@ def __sync_buffers(self):
         """
 
         for buffer in self._layer.buffers(include_sublayers=True):
-            dist.broadcast(
-                buffer,
-                self._global_root_rank,
-                self._group,
-                use_calc_stream=True)
+            dist.broadcast(buffer,
+                           self._global_root_rank,
+                           self._group,
+                           use_calc_stream=True)
         # Multi stream operation will be supported later
         dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)
 
@@ -335,19 +336,17 @@ def cleanup():
 
                     # Synchronize the reduce parameter gradient
                     self._tasks_flow.append(
-                        Taskflow(
-                            task=dist.reduce(
-                                tensor=param.grad,
-                                dst=self._group.ranks[dst_rank],
-                                group=self._group,
-                                use_calc_stream=True),
-                            callback=cleanup))
+                        Taskflow(task=dist.reduce(
+                            tensor=param.grad,
+                            dst=self._group.ranks[dst_rank],
+                            group=self._group,
+                            use_calc_stream=True),
+                                 callback=cleanup))
 
                     # Multi stream operation will be supported later
-                    dist.wait(
-                        tensor=param.grad,
-                        group=self._group,
-                        use_calc_stream=True)
+                    dist.wait(tensor=param.grad,
+                              group=self._group,
+                              use_calc_stream=True)
 
                     # Clear the task flow and trigger callback to clear the redundant gradient
                     self._clear_task_flow()
@@ -393,20 +392,17 @@ def cleanup():
                         # Reduce the bucket
                         grad_storage.sent = True
                         self._tasks_flow.append(
-                            Taskflow(
-                                task=dist.reduce(
-                                    tensor=grad_storage.buffer,
-                                    dst=self._group.ranks[
-                                        grad_storage.destination],
-                                    group=self._group,
-                                    use_calc_stream=True),
-                                callback=cleanup))
+                            Taskflow(task=dist.reduce(
+                                tensor=grad_storage.buffer,
+                                dst=self._group.ranks[grad_storage.destination],
+                                group=self._group,
+                                use_calc_stream=True),
+                                     callback=cleanup))
 
                         # Multi stream operation will be supported later
-                        dist.wait(
-                            tensor=grad_storage.buffer,
-                            group=self._group,
-                            use_calc_stream=True)
+                        dist.wait(tensor=grad_storage.buffer,
+                                  group=self._group,
+                                  use_calc_stream=True)
 
                     # Clear the task flow and trigger callback to clear the redundant gradient
                     self._clear_task_flow()
@@ -466,10 +462,10 @@ def _setup_use_grad_storage(self):
             else:
                 self._param_grads.append(param.name)
                 print(
-                    "Can not add param: {}, param's shape: {}, param align: {}, grad_storages fill: {}, ".
-                    format(param.name, param.shape, self._trainable_param2align[
-                        param.name], self._grad_storages[param.dtype][dst_rank]
-                           ._fill))
+                    "Can not add param: {}, param's shape: {}, param align: {}, grad_storages fill: {}, "
+                    .format(param.name, param.shape,
+                            self._trainable_param2align[param.name],
+                            self._grad_storages[param.dtype][dst_rank]._fill))
 
         self._grad_storage_list = list(
             chain(*[
@@ -526,15 +522,15 @@ def _rank_buffer_size(self, buffer_max_size, model_size):
         if Type.fp16.value in rank_buffer_size.keys():
             # FP16 GradStorage and model size
             print(
-                "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======".
-                format(rank_buffer_size[Type.fp16.value] / 2**19, model_size / 2
-                       **19))
+                "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======"
+                .format(rank_buffer_size[Type.fp16.value] / 2**19,
+                        model_size / 2**19))
         if Type.fp32.value in rank_buffer_size.keys():
             # FP32 GradStorage and model size
             print(
-                "====== FP32 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======".
-                format(rank_buffer_size[Type.fp32.value] / 2**18, model_size / 2
-                       **18))
+                "====== FP32 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======"
+                .format(rank_buffer_size[Type.fp32.value] / 2**18,
+                        model_size / 2**18))
         return rank_buffer_size
 
     def _redefine_opt_step(self):
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
index 7bb1517f12169..67d48c8abba1b 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
@@ -37,7 +37,9 @@
 from ...utils.internal_storage import GradStorage
 
 # CUDA alignment 256 bytes
-alignment = {"gpu": 256, }
+alignment = {
+    "gpu": 256,
+}
 align = {
     Type.fp16.value: 2,
     Type.fp32.value: 4,
@@ -56,7 +58,7 @@ class ShardingStage3(nn.Layer):
     .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
     """
 
-    # TODO (Baibaifan) 
+    # TODO (Baibaifan)
     # Feature Notes::
     # 1. The model supports the segmentation of parameters by global ranks in layers.
     # 2. Support communication flow and computing flow.
@@ -90,14 +92,14 @@ def __init__(self,
         DEV = "cpu" if paddle.get_device() == "cpu" else paddle.get_device(
         ).split(":")[0]
         global DEV_ID
-        DEV_ID = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
-                                                            .split(":")[1])
+        DEV_ID = 0 if paddle.get_device() == "cpu" else int(
+            paddle.get_device().split(":")[1])
         global param2dtype
         param2dtype = dict()
 
         # Communication group establishment
-        self._group = dist.new_group(_get_global_group()
-                                     .ranks) if group is None else group
+        self._group = dist.new_group(
+            _get_global_group().ranks) if group is None else group
         self._world_size_scaling = 1.0 / self._group.nranks
         assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1."
         self._rank = self._group.rank
@@ -165,11 +167,10 @@ def _sync_params_and_buffers(self):
         """
 
         for p in self._layer.parameters():
-            dist.broadcast(
-                p,
-                src=self._global_root_rank,
-                group=self._group,
-                use_calc_stream=True)
+            dist.broadcast(p,
+                           src=self._global_root_rank,
+                           group=self._group,
+                           use_calc_stream=True)
 
         # Multi stream operation will be supported later
         dist.wait(tensor=p, group=self._group, use_calc_stream=True)
@@ -182,9 +183,9 @@ def _clear_gradients(self):
             filter(lambda p: p.trainable and p not in self._unslice_params,
                    current_layer_params))
         for param in trainable_params:
-            assert hasattr(
-                param, "fw_storage"
-            ), "Find {} don't have fw_storage attribute.".format(param.name)
+            assert hasattr(param, "fw_storage"
+                           ), "Find {} don't have fw_storage attribute.".format(
+                               param.name)
 
             param.fw_storage.clear_gradient(False)
             param.fw_storage._gradient_set_empty(False)
@@ -244,15 +245,16 @@ def forward(self, *inputs, **kwargs):
         return fw
 
     def set_state_dict(self, state_dict, use_structured_name=True):
-        self._layer.set_state_dict(
-            state_dict, use_structured_name=use_structured_name)
+        self._layer.set_state_dict(state_dict,
+                                   use_structured_name=use_structured_name)
 
     def state_dict(self,
                    destination=None,
                    include_sublayers=True,
                    structured_name_prefix=""):
-        return self._layer.state_dict(
-            destination=None, include_sublayers=True, structured_name_prefix="")
+        return self._layer.state_dict(destination=None,
+                                      include_sublayers=True,
+                                      structured_name_prefix="")
 
     def _handle_unslice_params(self):
         buffer_size = dict()
@@ -357,8 +359,8 @@ def _param_storage(self, param, buffer_size):
         start, end = self._param2buffer[param.name][self._rank]
 
         # Copy the current param value
-        tmp_var = core.VarBase(
-            tensor=buffer._slice(0, param._numel()), place=core.CPUPlace())
+        tmp_var = core.VarBase(tensor=buffer._slice(0, param._numel()),
+                               place=core.CPUPlace())
         param_cpu = param.cpu()
         tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(),
                                          core.CPUPlace())
@@ -366,15 +368,15 @@ def _param_storage(self, param, buffer_size):
 
         # Current rank param_storage
         if self._offload:
-            param.fw_storage = core.VarBase(
-                buffer._slice(start, end),
-                core.CPUPlace(), "slice@" + param.name)
+            param.fw_storage = core.VarBase(buffer._slice(start, end),
+                                            core.CPUPlace(),
+                                            "slice@" + param.name)
             with device_guard(device="cpu"):
                 param.master_weight = paddle.cast(param.fw_storage,
                                                   Type.fp32.value)
         else:
-            param.fw_storage = core.VarBase(
-                buffer._slice(start, end), "slice@" + param.name)
+            param.fw_storage = core.VarBase(buffer._slice(start, end),
+                                            "slice@" + param.name)
         param.status = "part"
 
         # Updata optimizer master weights
@@ -402,6 +404,7 @@ def _register_forward_hooks(self, layer):
             self._register_forward_hooks(sub_layer)
 
     def _register_forward_all_hooks(self, sub_layer, task_flow):
+
         def _forward_pre_hook(layer, inputs):
             return ForwardPreHooks(layer, self._order_tracer,
                                    self._trainable_params, self._param2buffer,
@@ -409,10 +412,12 @@ def _forward_pre_hook(layer, inputs):
                                    self._offload, task_flow)
 
         def _forward_post_hook(layer, inputs, outputs):
-            return ForwardPostHooks.apply(
-                outputs, layer, self._order_tracer, self._trainable_params,
-                self._param2buffer, self._param2buffer_size, self._rank,
-                self._group, self._sync_comm, self._offload, task_flow)
+            return ForwardPostHooks.apply(outputs, layer, self._order_tracer,
+                                          self._trainable_params,
+                                          self._param2buffer,
+                                          self._param2buffer_size, self._rank,
+                                          self._group, self._sync_comm,
+                                          self._offload, task_flow)
 
         # register previous forward hooks
         sub_layer.register_forward_pre_hook(_forward_pre_hook)
@@ -427,11 +432,10 @@ def _sync_buffers(self):
         """
 
         for buffer in self._layer.buffers(include_sublayers=True):
-            dist.broadcast(
-                buffer,
-                self._global_root_rank,
-                self._group,
-                use_calc_stream=True)
+            dist.broadcast(buffer,
+                           self._global_root_rank,
+                           self._group,
+                           use_calc_stream=True)
         # Multi stream operation will be supported later
         dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)
 
@@ -472,14 +476,12 @@ def _update_params(self):
         # 2.Handle unslice param
         for grad_storage in self._grad_storages.values():
             grad_storage.buffer.scale_(scale=self._world_size_scaling)
-            dist.all_reduce(
-                tensor=grad_storage.buffer,
-                group=self._group,
-                use_calc_stream=True)
-            dist.wait(
-                tensor=grad_storage.buffer,
-                group=self._group,
-                use_calc_stream=True)
+            dist.all_reduce(tensor=grad_storage.buffer,
+                            group=self._group,
+                            use_calc_stream=True)
+            dist.wait(tensor=grad_storage.buffer,
+                      group=self._group,
+                      use_calc_stream=True)
 
         if self._offload:
             for param in list(self._unslice_params):
@@ -506,14 +508,13 @@ def get_all_parameters(self, convert2cpu=False):
         trainable_params = list(
             filter(lambda p: p.trainable and p not in self._unslice_params,
                    current_layer_params))
-        t_flow = _allgather_buffer(
-            trainable_params,
-            self._group,
-            use_calc_stream=True,
-            task_flow=TaskFlow(),
-            sync_wait=True,
-            offload=self._offload,
-            convert2cpu=convert2cpu)
+        t_flow = _allgather_buffer(trainable_params,
+                                   self._group,
+                                   use_calc_stream=True,
+                                   task_flow=TaskFlow(),
+                                   sync_wait=True,
+                                   offload=self._offload,
+                                   convert2cpu=convert2cpu)
         if convert2cpu:
             for param in trainable_params:
                 t_flow.full_param[param.name]._share_buffer_to(param)
@@ -532,38 +533,41 @@ def _register_backward_hooks(self):
             param._register_backward_hook(allreduce_function)
 
     def _get_allreduce_fn(self, param):
+
         @paddle.autograd.no_grad()
         def allreduce_(*_):
             if param.name in self._task_flow.full_grad.keys():
                 full_grad = self._task_flow.full_grad[param.name]
                 # Only support sync allreduce current rank's layer now
-                dist.all_reduce(
-                    tensor=full_grad, group=self._group, use_calc_stream=True)
-                dist.wait(
-                    tensor=full_grad, group=self._group, use_calc_stream=True)
+                dist.all_reduce(tensor=full_grad,
+                                group=self._group,
+                                use_calc_stream=True)
+                dist.wait(tensor=full_grad,
+                          group=self._group,
+                          use_calc_stream=True)
 
                 start, end = self._param2buffer[param.name][self._rank]
                 if param.bw_storage is None:
-                    param.bw_storage = core.VarBase(
-                        full_grad._slice(start, end)).detach().clone()
+                    param.bw_storage = core.VarBase(full_grad._slice(
+                        start, end)).detach().clone()
                     if self._offload:
                         param.bw_storage = _device2cpu(param.bw_storage, True)
                 else:
                     if self._offload:
                         cpu_grad = _device2cpu(
-                            core.VarBase(full_grad._slice(start, end))
-                            .detach().clone(), True)
+                            core.VarBase(full_grad._slice(
+                                start, end)).detach().clone(), True)
                         with device_guard(device="cpu"):
-                            param.bw_storage = paddle.add(param.bw_storage,
-                                                          cpu_grad)
+                            param.bw_storage = paddle.add(
+                                param.bw_storage, cpu_grad)
                     else:
                         # param.bw_storage.add_(
                         #     core.VarBase(full_grad._slice(start, end))
                         #     .detach().clone())
                         param.bw_storage = paddle.add(
                             param.bw_storage,
-                            core.VarBase(full_grad._slice(start, end)).detach(
-                            ).clone())
+                            core.VarBase(full_grad._slice(
+                                start, end)).detach().clone())
                 param.clear_gradient(False)
                 param._gradient_set_empty(False)
                 tmp_var = self._task_flow.full_grad.pop(param.name)
@@ -576,7 +580,8 @@ def allreduce_(*_):
                     start, end = self._param2buffer[param.name][self._rank]
                     param.fw_storage = core.VarBase(
                         self._task_flow.full_param[param.name]._slice(
-                            start, end), param.name + "@slice").detach().clone()
+                            start, end),
+                        param.name + "@slice").detach().clone()
                     param.status = "part"
                     tmp_var = self._task_flow.full_param.pop(param.name)
                     tmp_var._clear()
@@ -649,18 +654,18 @@ def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer, rank,
         order_ = order_tracer[layer_id]
         layer_id = order_tracer["layer"][order_ + 1]
 
-    _allgather_buffer(
-        trainable_params[layer_id],
-        group,
-        use_calc_stream=use_calc,
-        task_flow=task_flow,
-        sync_wait=sync_wait,
-        offload=offload)
+    _allgather_buffer(trainable_params[layer_id],
+                      group,
+                      use_calc_stream=use_calc,
+                      task_flow=task_flow,
+                      sync_wait=sync_wait,
+                      offload=offload)
 
     return
 
 
 class ForwardPostHooks(PyLayer):
+
     @staticmethod
     def forward(ctx, inputs, layer, order_tracer, trainable_params,
                 param2buffer, param2buffer_size, rank, group, sync_comm,
@@ -677,7 +682,7 @@ def forward(ctx, inputs, layer, order_tracer, trainable_params,
             order_tracer["order"] += 1
             order_tracer["layer"].append(layer_id)
 
-        #Record bw info 
+        #Record bw info
         ctx.order_tracer = order_tracer
         ctx.task_flow = task_flow
         ctx.group = group
@@ -706,13 +711,12 @@ def backward(ctx, *args):
         # Allgather params synchronization
         if sync_comm:
             use_calc, sync_wait = True, True
-            _allgather_buffer(
-                trainable_params[layer_id],
-                group,
-                use_calc_stream=use_calc,
-                task_flow=task_flow,
-                sync_wait=sync_wait,
-                offload=offload)
+            _allgather_buffer(trainable_params[layer_id],
+                              group,
+                              use_calc_stream=use_calc,
+                              task_flow=task_flow,
+                              sync_wait=sync_wait,
+                              offload=offload)
         else:
             _wait_layer(trainable_params[layer_id], task_flow, group, use_calc,
                         offload)
@@ -725,13 +729,12 @@ def backward(ctx, *args):
         task_flow.use_calc[layer_id] = use_calc
         if layer_id != order_tracer["layer"][0] and not sync_comm:
             layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1]
-            _allgather_buffer(
-                trainable_params[layer_next_id],
-                group,
-                use_calc_stream=use_calc,
-                task_flow=task_flow,
-                sync_wait=sync_wait,
-                offload=offload)
+            _allgather_buffer(trainable_params[layer_next_id],
+                              group,
+                              use_calc_stream=use_calc,
+                              task_flow=task_flow,
+                              sync_wait=sync_wait,
+                              offload=offload)
 
         return args
 
@@ -789,20 +792,19 @@ def _wait_layer(trainable_params,
             continue
         if param.name in task_flow.full_param.keys():
             full_param = task_flow.full_param[param.name]
-            core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to(
-                param)
+            core.VarBase(full_param._slice(
+                0, param._numel()))._share_buffer_to(param)
             param.fw_storage._clear()
             param.fw_storage = None
             param.status = "all"
             param.use_count += 1
         else:
-            _allgather_buffer(
-                trainable_params,
-                group,
-                use_calc_stream=True,
-                task_flow=task_flow,
-                sync_wait=True,
-                offload=offload)
+            _allgather_buffer(trainable_params,
+                              group,
+                              use_calc_stream=True,
+                              task_flow=task_flow,
+                              sync_wait=True,
+                              offload=offload)
             break
     return task_flow
 
@@ -824,25 +826,25 @@ def _allgather_buffer(trainable_params,
             param.fw_storage = _cpu2device(param)
 
         with paddle.amp.auto_cast(enable=False):
-            full_param = _all_gather(
-                param.fw_storage, group, use_calc_stream=use_calc_stream)
+            full_param = _all_gather(param.fw_storage,
+                                     group,
+                                     use_calc_stream=use_calc_stream)
 
         # Allgather current layer in the 1st step synchronously
         if sync_wait:
             with paddle.amp.auto_cast(enable=False):
-                dist.wait(
-                    tensor=full_param,
-                    group=group,
-                    use_calc_stream=use_calc_stream)
-            core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to(
-                param)
+                dist.wait(tensor=full_param,
+                          group=group,
+                          use_calc_stream=use_calc_stream)
+            core.VarBase(full_param._slice(
+                0, param._numel()))._share_buffer_to(param)
             param.fw_storage._clear()
             param.fw_storage = None
             param.status = "all"
             param.use_count += 1
         task_flow.full_param[param.name] = full_param
 
-        # parameter converts to cpu 
+        # parameter converts to cpu
         if convert2cpu:
             p_name = param.name
             param = _device2cpu(param)
@@ -859,8 +861,8 @@ def _create_params_grad(trainable_params, param2buffer_size, task_flow):
         if param.name in task_flow.full_grad.keys():
             continue
         assert isinstance(param2buffer_size[param.name], int)
-        temp_grad = paddle.zeros(
-            [param2buffer_size[param.name]], dtype=param.dtype)
+        temp_grad = paddle.zeros([param2buffer_size[param.name]],
+                                 dtype=param.dtype)
         param._copy_gradient_from(
             core.VarBase(temp_grad._slice(0, param._numel())))
         task_flow.full_grad[param.name] = temp_grad
@@ -886,8 +888,9 @@ def _UnsliceParam(param):
 
 def _VarBaseWrapper(param):
     varbase = param.fw_storage
-    tmp_param = ParamBase(
-        shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name)
+    tmp_param = ParamBase(shape=varbase.shape,
+                          dtype=varbase.dtype,
+                          name="slice@" + param.name)
     varbase._share_buffer_to(tmp_param)
     tmp_param.regularizer = param.regularizer
     tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
index 6a30276e02ba2..ae98d4bdf7b1e 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -49,6 +49,7 @@ class Type(Enum):
 
 
 class ShardingClipGrad:
+
     def __init__(self, clip, device, group):
         self._clip = clip
         self._device = device
@@ -86,8 +87,8 @@ def _dygraph_clip(self, params_grads):
         else:
             global_norm_fp16 = layers.concat(sum_square_fp16)
             global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
-            global_norm_fp16 = paddle.cast(
-                global_norm_fp16, dtype=paddle.float32)
+            global_norm_fp16 = paddle.cast(global_norm_fp16,
+                                           dtype=paddle.float32)
 
         # global norm of non-distributed FP16 params_and_grads for unslice parameter
         if len(unslice_params_fp16) == 0:
@@ -95,12 +96,12 @@ def _dygraph_clip(self, params_grads):
         else:
             global_unslice_fp16 = layers.concat(unslice_params_fp16)
             global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16)
-            global_unslice_fp16 = paddle.cast(
-                global_unslice_fp16, dtype=paddle.float32)
+            global_unslice_fp16 = paddle.cast(global_unslice_fp16,
+                                              dtype=paddle.float32)
 
         # global norm of non-distributed FP32 params_and_grads
-        global_norm_fp32 = layers.concat(sum_square_fp32) if len(
-            sum_square_fp32) != 0 else paddle.to_tensor(
+        global_norm_fp32 = layers.concat(
+            sum_square_fp32) if len(sum_square_fp32) != 0 else paddle.to_tensor(
                 [0.], dtype=paddle.float32)
         global_norm_fp32 = layers.reduce_sum(global_norm_fp32)
 
@@ -119,13 +120,14 @@ def _dygraph_clip(self, params_grads):
             paddle.distributed.all_reduce(global_norm_var, group=self._group)
 
         global_norm_var = layers.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
-
-        clip_var = layers.elementwise_div(
-            x=max_global_norm,
-            y=layers.elementwise_max(
-                x=global_norm_var, y=max_global_norm))
+        max_global_norm = layers.fill_constant(shape=[1],
+                                               dtype=global_norm_var.dtype,
+                                               value=self.clip_norm)
+
+        clip_var = layers.elementwise_div(x=max_global_norm,
+                                          y=layers.elementwise_max(
+                                              x=global_norm_var,
+                                              y=max_global_norm))
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
 
         for p, g in params_grads:
@@ -164,6 +166,7 @@ def device_guard(dev_id=0, device="cpu"):
 
 @dygraph_only
 def ShardingScaler(scaler):
+
     def unscale_method(self, optimizer):
         if not self._enable:
             return
@@ -181,8 +184,9 @@ def unscale_method(self, optimizer):
                 for param in group['params']:
                     if param._grad_ivar() is not None:
                         param_grads.append(param._grad_ivar())
-                        if param._grad_ivar(
-                        ).dtype in [core.VarDesc.VarType.FP16, paddle.float16]:
+                        if param._grad_ivar().dtype in [
+                                core.VarDesc.VarType.FP16, paddle.float16
+                        ]:
                             param_grads_fp16.append(param._grad_ivar())
                         else:
                             param_grads_fp32.append(param._grad_ivar())
@@ -201,8 +205,8 @@ def unscale_method(self, optimizer):
         temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
 
         device = "cpu" if optimizer.offload else "gpu"
-        dev_id = 0 if device == "cpu" else int(paddle.get_device().split(":")[
-            1])
+        dev_id = 0 if device == "cpu" else int(
+            paddle.get_device().split(":")[1])
 
         with device_guard(dev_id, device):
             if len(param_grads_fp16):
@@ -217,10 +221,9 @@ def unscale_method(self, optimizer):
         self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
         is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
 
-        paddle.distributed.all_reduce(
-            is_found_inf,
-            op=paddle.distributed.ReduceOp.MAX,
-            group=optimizer.group)
+        paddle.distributed.all_reduce(is_found_inf,
+                                      op=paddle.distributed.ReduceOp.MAX,
+                                      group=optimizer.group)
         self._found_inf = is_found_inf.numpy()[0]
 
     scaler._unscale = MethodType(unscale_method, scaler)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding_parallel.py b/python/paddle/distributed/fleet/meta_parallel/sharding_parallel.py
index 953a76d874e55..1bc76570f17a3 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding_parallel.py
@@ -21,6 +21,7 @@
 
 
 class ShardingParallel(MetaParallelBase):
+
     def __init__(self, layers, hcg, **kwargs):
         super(ShardingParallel, self).__init__(layers, hcg, **kwargs)
 
diff --git a/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py
index 171df7cf033be..5814ed898fafb 100755
--- a/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/tensor_parallel.py
@@ -23,6 +23,7 @@
 
 
 class TensorParallel(MetaParallelBase):
+
     def __init__(self, layers, hcg, **kwargs):
         super(TensorParallel, self).__init__(layers, hcg, **kwargs)
 
diff --git a/python/paddle/distributed/fleet/runtime/collective_runtime.py b/python/paddle/distributed/fleet/runtime/collective_runtime.py
index a23b15f1fca1b..183fa9e7c156e 100644
--- a/python/paddle/distributed/fleet/runtime/collective_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/collective_runtime.py
@@ -19,6 +19,7 @@
 
 
 class CollectiveRuntime(RuntimeBase):
+
     def __init__(self):
         super(CollectiveRuntime, self).__init__()
 
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index 0767158d23f00..6e30ff7969e1d 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -30,6 +30,7 @@
 
 
 class ParameterServerRuntime(RuntimeBase):
+
     def __init__(self):
         super(ParameterServerRuntime, self).__init__()
         self._communicator = None
@@ -67,9 +68,10 @@ def _get_distributed_strategy(self):
     def build_compiled_startegy(self):
         from paddle.fluid.incubate.fleet.parameter_server.ir.public import CompileTimeStrategy
 
-        compiled_config = CompileTimeStrategy(
-            self.origin_main_program, self.origin_main_program,
-            self.async_strategy, self.role_maker)
+        compiled_config = CompileTimeStrategy(self.origin_main_program,
+                                              self.origin_main_program,
+                                              self.async_strategy,
+                                              self.role_maker)
         return compiled_config
 
     def _load_sparse_params(self,
@@ -86,7 +88,8 @@ def _in_varnames(var):
             return var.name in varnames
 
         load_vars = list(
-            filter(_in_varnames, fluid.default_main_program().list_vars()))
+            filter(_in_varnames,
+                   fluid.default_main_program().list_vars()))
         if main_program is None:
             main_program = self.origin_main_program
 
@@ -99,20 +102,24 @@ def _in_varnames(var):
             new_var = fluid.io._clone_var_in_block_(load_block, each_var)
             var_path = os.path.join(dirname, origin_varname)
             if not os.path.exists(var_path):
-                raise ValueError("SelectedRows var {} can not find at {}".
-                                 format(new_var.name, var_path))
+                raise ValueError(
+                    "SelectedRows var {} can not find at {}".format(
+                        new_var.name, var_path))
 
             if os.path.isfile(var_path):
-                load_block.append_op(
-                    type='sparse_tensor_load',
-                    inputs={},
-                    outputs={'Out': [new_var]},
-                    attrs={
-                        'file_path': os.path.join(dirname, origin_varname),
-                        'node_index': self.role_maker._server_index(),
-                        'node_num': self.role_maker._server_num(),
-                        'shape': each_var.shape
-                    })
+                load_block.append_op(type='sparse_tensor_load',
+                                     inputs={},
+                                     outputs={'Out': [new_var]},
+                                     attrs={
+                                         'file_path':
+                                         os.path.join(dirname, origin_varname),
+                                         'node_index':
+                                         self.role_maker._server_index(),
+                                         'node_num':
+                                         self.role_maker._server_num(),
+                                         'shape':
+                                         each_var.shape
+                                     })
             check_vars.append(each_var)
 
         executor.run(load_prog)
@@ -129,6 +136,7 @@ def _load_distributed_params(self, dirname, varnames):
 
     @staticmethod
     def __exclude_vars(exclude_var_names=[]):
+
         def is_valid(var):
             if var.name in exclude_var_names:
                 return False
@@ -151,6 +159,7 @@ def is_valid(var):
         return is_valid
 
     def _init_worker(self):
+
         def sync_strategy_envs():
             kwargs = {}
             kwargs[
@@ -166,8 +175,9 @@ def get_sparse_attrs():
                 opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                 opt_init_map["fill_constant"] = ["value"]
                 opt_init_map["uniform_random"] = ["seed", "min", "max"]
-                opt_init_map[
-                    "truncated_gaussian_random"] = ["seed", "mean", "std"]
+                opt_init_map["truncated_gaussian_random"] = [
+                    "seed", "mean", "std"
+                ]
 
                 dist_varnames = get_sparse_tablenames(self.origin_main_program,
                                                       True)
@@ -181,8 +191,8 @@ def get_sparse_attrs():
 
                 init_attrs = []
                 for value_name in sparse_varnames:
-                    value_var = self.origin_main_program.global_block().vars[
-                        value_name]
+                    value_var = self.origin_main_program.global_block(
+                    ).vars[value_name]
                     value_attr = [
                         value_name,
                         ",".join([str(dim) for dim in value_var.shape])
@@ -287,8 +297,8 @@ def _init_server(self, *args, **kwargs):
             model_dirname = None
 
         executor = self._get_executor()
-        if self.role_maker._is_heter_worker() and self.context[
-                "valid_strategy"].a_sync_configs["launch_barrier"]:
+        if self.role_maker._is_heter_worker(
+        ) and self.context["valid_strategy"].a_sync_configs["launch_barrier"]:
             # for heter trainer wait server ready
             wait_server_ready(self.role_maker._get_pserver_endpoints())
         executor.run(fluid.default_startup_program())
@@ -328,23 +338,21 @@ def _init_server(self, *args, **kwargs):
             raise ValueError("There is no directory named '%s'", model_dirname)
 
         # load dense
-        fluid.io.load_vars(
-            executor,
-            main_program=fluid.default_main_program(),
-            dirname=model_dirname,
-            vars=remaining_vars)
+        fluid.io.load_vars(executor,
+                           main_program=fluid.default_main_program(),
+                           dirname=model_dirname,
+                           vars=remaining_vars)
 
         # load sparse
-        self._load_sparse_params(
-            executor=executor,
-            dirname=model_dirname,
-            varnames=sparse_varnames + sparse_related_optimize_varnames)
+        self._load_sparse_params(executor=executor,
+                                 dirname=model_dirname,
+                                 varnames=sparse_varnames +
+                                 sparse_related_optimize_varnames)
 
         # load large scale
-        self._load_distributed_params(
-            dirname=model_dirname,
-            varnames=distribtued_varnames +
-            distributed_related_optimize_varnames)
+        self._load_distributed_params(dirname=model_dirname,
+                                      varnames=distribtued_varnames +
+                                      distributed_related_optimize_varnames)
 
     def _run_server(self):
         executor = self._get_executor()
@@ -368,8 +376,9 @@ def _get_optimizer_status(self, op, param_name):
         reshaped_val_map["adamax"] = ["moment_0", "inf_norm_0"]
         reshaped_val_map["momentum"] = ["velocity_0"]
         reshaped_val_map["lars_momentum"] = ["velocity_0"]
-        reshaped_val_map[
-            "rmsprop"] = ["momentum_0", "mean_square_0", "mean_grad_0"]
+        reshaped_val_map["rmsprop"] = [
+            "momentum_0", "mean_square_0", "mean_grad_0"
+        ]
         reshaped_val_map["decayed_adagrad"] = ["moment_0"]
         reshaped_val_map["ftrl"] = ["squared_0", "linear_0"]
 
@@ -379,8 +388,8 @@ def _get_optimizer_status(self, op, param_name):
 
         if op not in supported_opts:
             raise ValueError(
-                "fleet can not support optimizer: {}, only this can be supported: {}".
-                format(op, supported_opts))
+                "fleet can not support optimizer: {}, only this can be supported: {}"
+                .format(op, supported_opts))
 
         reshaped_names = [
             param_name + "_" + val for val in reshaped_val_map[op]
@@ -423,19 +432,23 @@ def _save_dense_params(self, executor, dirname, context, main_program):
 
             for var_name in [varname] + reshaped_varnames + origin_varnames:
                 var = self.origin_main_program.global_block().vars[var_name]
-                block.append_op(
-                    type='recv_save',
-                    attrs={
-                        "trainer_id": self.role_maker._worker_index(),
-                        "shape": var.shape,
-                        "slice_shapes":
-                        [",".join([str(i) for i in var.shape])],
-                        "slice_varnames": [var.name],
-                        "remote_varnames": [var.name],
-                        "is_sparse": False,
-                        "endpoints": var_ctx.split_endpoints(),
-                        "file_path": os.path.join(dirname, var.name)
-                    })
+                block.append_op(type='recv_save',
+                                attrs={
+                                    "trainer_id":
+                                    self.role_maker._worker_index(),
+                                    "shape":
+                                    var.shape,
+                                    "slice_shapes":
+                                    [",".join([str(i) for i in var.shape])],
+                                    "slice_varnames": [var.name],
+                                    "remote_varnames": [var.name],
+                                    "is_sparse":
+                                    False,
+                                    "endpoints":
+                                    var_ctx.split_endpoints(),
+                                    "file_path":
+                                    os.path.join(dirname, var.name)
+                                })
 
         executor.run(prog)
         return local_vars
@@ -463,30 +476,37 @@ def _save_sparse_params(self, executor, dirname, context, main_program):
             for section in var_ctx.sections():
                 slice_shapes.append(str(section) + dims1)
 
-            block.append_op(
-                type='recv_save',
-                attrs={
-                    "trainer_id": self.role_maker._worker_index(),
-                    "shape": var.shape,
-                    "slice_shapes": slice_shapes,
-                    "slice_varnames": var_ctx.split_varnames(),
-                    "remote_varnames": var_ctx.split_varnames(),
-                    "is_sparse": True,
-                    "endpoints": var_ctx.split_endpoints(),
-                    "pserver_num":
-                    len(self.role_maker._get_pserver_endpoints()),
-                    "file_path": os.path.join(dirname, var.name)
-                })
+            block.append_op(type='recv_save',
+                            attrs={
+                                "trainer_id":
+                                self.role_maker._worker_index(),
+                                "shape":
+                                var.shape,
+                                "slice_shapes":
+                                slice_shapes,
+                                "slice_varnames":
+                                var_ctx.split_varnames(),
+                                "remote_varnames":
+                                var_ctx.split_varnames(),
+                                "is_sparse":
+                                True,
+                                "endpoints":
+                                var_ctx.split_endpoints(),
+                                "pserver_num":
+                                len(self.role_maker._get_pserver_endpoints()),
+                                "file_path":
+                                os.path.join(dirname, var.name)
+                            })
 
             for reshaped_varname in reshaped_varnames:
-                var = self.origin_main_program.global_block().vars[
-                    reshaped_varname]
+                var = self.origin_main_program.global_block(
+                ).vars[reshaped_varname]
 
                 slice_varnames = []
                 remote_varnames = []
                 for i in range(len(var_ctx.split_varnames())):
-                    slice_varnames.append("{}.block{}".format(reshaped_varname,
-                                                              i))
+                    slice_varnames.append("{}.block{}".format(
+                        reshaped_varname, i))
                     remote_varnames.append(reshaped_varname)
 
                 block.append_op(
@@ -505,22 +525,26 @@ def _save_sparse_params(self, executor, dirname, context, main_program):
                     })
 
             for origin_varname in origin_varnames:
-                var = self.origin_main_program.global_block().vars[
-                    origin_varname]
-
-                block.append_op(
-                    type='recv_save',
-                    attrs={
-                        "trainer_id": self.role_maker._worker_index(),
-                        "shape": var.shape,
-                        "slice_shapes":
-                        [",".join([str(i) for i in var.shape])],
-                        "slice_varnames": [origin_varname],
-                        "remote_varnames": [origin_varname],
-                        "is_sparse": False,
-                        "endpoints": var_ctx.split_endpoints()[:1],
-                        "file_path": os.path.join(dirname, var.name)
-                    })
+                var = self.origin_main_program.global_block(
+                ).vars[origin_varname]
+
+                block.append_op(type='recv_save',
+                                attrs={
+                                    "trainer_id":
+                                    self.role_maker._worker_index(),
+                                    "shape":
+                                    var.shape,
+                                    "slice_shapes":
+                                    [",".join([str(i) for i in var.shape])],
+                                    "slice_varnames": [origin_varname],
+                                    "remote_varnames": [origin_varname],
+                                    "is_sparse":
+                                    False,
+                                    "endpoints":
+                                    var_ctx.split_endpoints()[:1],
+                                    "file_path":
+                                    os.path.join(dirname, var.name)
+                                })
         executor.run(prog)
         return context.keys()
 
@@ -529,16 +553,15 @@ def _save_distributed_params(self, executor, dirname, context, mode):
         block = prog.global_block()
 
         for name, var_ctx in context.items():
-            block.append_op(
-                type='checkpoint_notify',
-                attrs={
-                    "varname": name,
-                    "mode": mode,
-                    "slice_varnames": var_ctx.split_varnames(),
-                    "remote_varnames": var_ctx.split_varnames(),
-                    "endpoints": var_ctx.split_endpoints(),
-                    "dirname": dirname
-                })
+            block.append_op(type='checkpoint_notify',
+                            attrs={
+                                "varname": name,
+                                "mode": mode,
+                                "slice_varnames": var_ctx.split_varnames(),
+                                "remote_varnames": var_ctx.split_varnames(),
+                                "endpoints": var_ctx.split_endpoints(),
+                                "dirname": dirname
+                            })
 
         executor.run(prog)
         return context.keys()
@@ -557,8 +580,9 @@ def _save_distributed_persistables(self, executor, dirname, main_program,
         recv_dense_varnames = self._save_dense_params(executor, dirname,
                                                       dense_ctx, main_program)
 
-        recv_sparse_varnames = self._save_sparse_params(
-            executor, dirname, sparse_ctx, main_program)
+        recv_sparse_varnames = self._save_sparse_params(executor, dirname,
+                                                        sparse_ctx,
+                                                        main_program)
 
         recv_distributed_varnames = self._save_distributed_params(
             executor, dirname, distributed_ctx, mode)
@@ -567,15 +591,13 @@ def _save_distributed_persistables(self, executor, dirname, main_program,
             recv_sparse_varnames) + list(recv_distributed_varnames)
 
         remaining_vars = list(
-            filter(
-                ParameterServerRuntime.__exclude_vars(saved_varnames),
-                main_program.list_vars()))
+            filter(ParameterServerRuntime.__exclude_vars(saved_varnames),
+                   main_program.list_vars()))
 
-        fluid.io.save_vars(
-            executor,
-            main_program=main_program,
-            dirname=dirname,
-            vars=remaining_vars)
+        fluid.io.save_vars(executor,
+                           main_program=main_program,
+                           dirname=dirname,
+                           vars=remaining_vars)
 
     def _ps_inference_save_persistables(self,
                                         executor,
@@ -659,8 +681,10 @@ def _ps_inference_save_inference_model(self,
 
             program = Program.parse_from_string(program_desc_str)
             program._copy_dist_param_info_from(fluid.default_main_program())
-            self._ps_inference_save_persistables(
-                executor, dirname, program, mode=0)
+            self._ps_inference_save_persistables(executor,
+                                                 dirname,
+                                                 program,
+                                                 mode=0)
 
     def _save_inference_model(self, *args, **kwargs):
         self._ps_inference_save_inference_model(*args, **kwargs)
diff --git a/python/paddle/distributed/fleet/runtime/runtime_base.py b/python/paddle/distributed/fleet/runtime/runtime_base.py
index 2e8bacfbc3b1d..38bb31ce3fc1d 100644
--- a/python/paddle/distributed/fleet/runtime/runtime_base.py
+++ b/python/paddle/distributed/fleet/runtime/runtime_base.py
@@ -16,6 +16,7 @@
 
 
 class RuntimeBase(object):
+
     def __init__(self):
         pass
 
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index c90fab6af5c15..82cef558b1f44 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -131,16 +131,17 @@ def check_embedding_dim(accessor, varname, o_main_program):
     fea_dim = accessor.fea_dim
     if fea_dim != embedding_dim:
         raise ValueError(
-            "The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
-            format(embedding_dim, fea_dim))
+            "The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}"
+            .format(embedding_dim, fea_dim))
     embedx_dim = accessor.embedx_dim
     if embedx_dim != embedding_dim - 3:
         raise ValueError(
-            "The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
-            format(embedding_dim - 3, embedx_dim))
+            "The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}"
+            .format(embedding_dim - 3, embedx_dim))
 
 
 class Accessor:
+
     def __init__(self):
         self.accessor_class = ""
         self.optimizer = None
@@ -157,11 +158,12 @@ def to_string(self, indent):
         attrs += "\n"
         if self.optimizer is not None:
             attrs += self.optimizer.to_string(indent)
-        return accessor_str.format(
-            conv_indent(indent), attrs, conv_indent(indent))
+        return accessor_str.format(conv_indent(indent), attrs,
+                                   conv_indent(indent))
 
 
 class CommonAccessor:
+
     def __init__(self):
         self.accessor_class = ""
         self.table_name = None
@@ -185,11 +187,11 @@ def define_optimize_map(self):
         opt_input_map["adam"] = [("Param", None), ("Moment1", None),
                                  ("Moment2", None), ("Beta1Pow", 1),
                                  ("Beta2Pow", 1), ("LearningRate", 1)]
-        opt_input_map["adam_d2sum"] = [
-            ("Param", None), ("D2Sum", None), ("G2Sum", None), ("Moment", None),
-            ("MomentDecayRate", 1), ("AdaDecayRate", 1), ("AdaEpsilon", 1),
-            ("LearningRate", 1)
-        ]
+        opt_input_map["adam_d2sum"] = [("Param", None), ("D2Sum", None),
+                                       ("G2Sum", None), ("Moment", None),
+                                       ("MomentDecayRate", 1),
+                                       ("AdaDecayRate", 1), ("AdaEpsilon", 1),
+                                       ("LearningRate", 1)]
         opt_input_map["sum"] = [("Param", None)]
         opt_input_map["naive_adagrad"] = [("Param", None), ("G2Sum", 1),
                                           ("LearningRate", 1)]
@@ -269,8 +271,8 @@ def parse_by_optimizer(self, grad_name, is_sparse, size, single_dim,
         oop = None
 
         for op in optimizer_ops:
-            if ("Param" in op.input_names) and (
-                    op.input("Param")[0] == param_name):
+            if ("Param" in op.input_names) and (op.input("Param")[0]
+                                                == param_name):
                 oop = op
                 break
 
@@ -325,11 +327,11 @@ def parse_by_optimizer(self, grad_name, is_sparse, size, single_dim,
                     #TODO: for dense learning_rate, can be different from sparse lr
                     if formal_name == "LearningRate" and param.name != "learning_rate_0":
                         warnings.warn("will support decay soon")
-                        param = main_program.global_block().vars[
-                            "learning_rate_0"]
+                        param = main_program.global_block(
+                        ).vars["learning_rate_0"]
 
-                    initializer = self.get_initializer_attr(param.name,
-                                                            startup_program)
+                    initializer = self.get_initializer_attr(
+                        param.name, startup_program)
                 elif formal_name == "MomentDecayRate":
                     initializer = "fill_constant&0.99"
                 elif formal_name == "AdaDecayRate":
@@ -349,8 +351,8 @@ def parse_by_optimizer(self, grad_name, is_sparse, size, single_dim,
                         formal_name)[0]]
                     if formal_name == "LearningRate" and param.name != "learning_rate_0":
                         warnings.warn("will support decay soon")
-                        param = main_program.global_block().vars[
-                            "learning_rate_0"]
+                        param = main_program.global_block(
+                        ).vars["learning_rate_0"]
 
                     if shape is None:
                         if is_sparse:
@@ -360,8 +362,8 @@ def parse_by_optimizer(self, grad_name, is_sparse, size, single_dim,
                                                    pserver_id)
                     dims.append(shape)
 
-                    initializer = self.get_initializer_attr(param.name,
-                                                            startup_program)
+                    initializer = self.get_initializer_attr(
+                        param.name, startup_program)
                     initializers.append(initializer)
 
         for (attr_varname, type_) in attr_varnames:
@@ -400,11 +402,12 @@ def to_string(self, indent):
             attrs += "initializers: \"{}\" ".format(initializer)
 
         attrs += "\n"
-        return accessor_str.format(
-            conv_indent(indent), attrs, conv_indent(indent))
+        return accessor_str.format(conv_indent(indent), attrs,
+                                   conv_indent(indent))
 
 
 class Tensor:
+
     def __init__(self):
         self.main_program_id = None
         self.startup_program_id = None
@@ -422,11 +425,12 @@ def to_string(self, indent):
         attrs += "tensor_table_class: \"{}\" ".format(
             str(self.tensor_table_class))
         attrs += "\n"
-        return program_str.format(
-            conv_indent(indent), attrs, conv_indent(indent))
+        return program_str.format(conv_indent(indent), attrs,
+                                  conv_indent(indent))
 
 
 class Table:
+
     def __init__(self):
         self.id = -1
         self.table_class = None
@@ -455,8 +459,9 @@ def to_string(self, indent):
 
         if self.accessor_proto is not None:
             accessor_str = "{}accessor {{{}\n{}}}"
-            accessor_str = accessor_str.format(
-                conv_indent(indent), self.accessor_proto, conv_indent(indent))
+            accessor_str = accessor_str.format(conv_indent(indent),
+                                               self.accessor_proto,
+                                               conv_indent(indent))
             attrs += accessor_str + "\n"
         elif self.accessor is not None:
             attrs += self.accessor.to_string(indent)
@@ -474,6 +479,7 @@ def to_string(self, indent):
 
 
 class Service:
+
     def __init__(self):
         self.server_class = "BrpcPsServer"
         self.client_class = "BrpcPsClient"
@@ -491,11 +497,12 @@ def to_string(self, indent):
         attrs += "start_server_port: {} ".format(self.start_server_port)
         attrs += "server_thread_num: {} ".format(self.server_thread_num)
 
-        return service_str.format(
-            conv_indent(indent), attrs, conv_indent(indent))
+        return service_str.format(conv_indent(indent), attrs,
+                                  conv_indent(indent))
 
 
 class DownpourServer:
+
     def __init__(self):
         self.service = None
         self.tables = []
@@ -520,11 +527,12 @@ def to_string(self, indent):
         for table in self.tables:
             table_strs += "\n"
             table_strs += table.to_string(indent)
-        return server_str.format(
-            conv_indent(indent), table_strs, conv_indent(indent))
+        return server_str.format(conv_indent(indent), table_strs,
+                                 conv_indent(indent))
 
 
 class Server:
+
     def __init__(self):
         self.servers = []
 
@@ -545,6 +553,7 @@ def __str__(self):
 
 
 class DownpourWorker:
+
     def __init__(self):
         self.tables = []
 
@@ -561,11 +570,12 @@ def to_string(self, indent):
             table_strs += "\n"
             table_strs += table.to_string(indent)
 
-        return worker_str.format(
-            conv_indent(indent), table_strs, conv_indent(indent))
+        return worker_str.format(conv_indent(indent), table_strs,
+                                 conv_indent(indent))
 
 
 class Worker:
+
     def __init__(self):
         self.workers = []
 
@@ -586,6 +596,7 @@ def __str__(self):
 
 
 class fsClient:
+
     def __init__(self, proto):
         self.proto = proto
         self.uri = proto.uri
@@ -604,6 +615,7 @@ def to_string(self):
 
 
 class TheOnePSRuntime(RuntimeBase):
+
     def __init__(self):
         super(TheOnePSRuntime, self).__init__()
         self._communicator = None
@@ -648,9 +660,10 @@ def _get_distributed_strategy(self):
     def build_compiled_startegy(self):
         from paddle.fluid.incubate.fleet.parameter_server.ir.public import CompileTimeStrategy
 
-        compiled_config = CompileTimeStrategy(
-            self.origin_main_program, self.origin_main_program,
-            self.async_strategy, self.role_maker)
+        compiled_config = CompileTimeStrategy(self.origin_main_program,
+                                              self.origin_main_program,
+                                              self.async_strategy,
+                                              self.role_maker)
         if self.async_strategy.use_ps_gpu:
             compiled_config.use_ps_gpu = True
         return compiled_config
@@ -671,8 +684,9 @@ def _init_worker(self):
                 main_program._fleet_opt = {}
             main_program._fleet_opt["use_ps_gpu"] = True
             gpus_env = os.getenv("FLAGS_selected_gpus")
-            main_program._fleet_opt[
-                "worker_places"] = [int(s) for s in gpus_env.split(",")]
+            main_program._fleet_opt["worker_places"] = [
+                int(s) for s in gpus_env.split(",")
+            ]
 
         def sync_strategy_envs():
             kwargs = {}
@@ -748,7 +762,7 @@ def sync_strategy_envs():
                 warnings.warn("gloo may not initialize correctly")
                 all_info = [all_info]
             self._communicator.set_clients(all_info)
-            # create_c2c_connection default param: 
+            # create_c2c_connection default param:
             #  pserver_timeout_ms=500000
             #  pserver_connect_timeout_ms=10000
             #  max_retry=3
@@ -811,8 +825,9 @@ def _get_executor(self):
             if self.role_maker._is_heter_worker():
                 heter_device_type = self.role_maker._heter_device_type().upper()
                 if heter_device_type not in ["GPU", "XPU", "CPU"]:
-                    raise ValueError("Heter Worker Not Support Device {}".
-                                     format(device_type))
+                    raise ValueError(
+                        "Heter Worker Not Support Device {}".format(
+                            device_type))
                 if heter_device_type == "GPU":
                     executor = Executor(
                         fluid.CUDAPlace(
@@ -824,6 +839,7 @@ def _get_executor(self):
         return executor
 
     def _get_fleet_proto(self, is_server, is_sync, **kwargs):
+
         def _build_merge_accessor(ctx):
             accessor = Accessor()
             accessor.accessor_class = "CommMergeAccessor"
@@ -856,8 +872,8 @@ def _build_barrier_table(idx):
             common.table_name = "barrier_table"
             trainer_num = self.compiled_strategy.get_trainers()
             if self.role_maker._is_heter_parameter_server_mode:
-                trainer_num += len(self.role_maker._get_heter_worker_endpoints(
-                ))
+                trainer_num += len(
+                    self.role_maker._get_heter_worker_endpoints())
             common.trainer_num = trainer_num
             common.attrs = ""
             common.dims = []
@@ -904,18 +920,18 @@ def _add_tensor_table(tables):
                 if tensor_table_dict[table_name]["startup_program"] != None:
                     tensor_table_dict[table_name][
                         "startup_program_id"] = program_idx
-                    self._server_sub_program.append(tensor_table_dict[
-                        table_name]["startup_program"].desc)
+                    self._server_sub_program.append(
+                        tensor_table_dict[table_name]["startup_program"].desc)
                     program_idx += 1
                 if tensor_table_dict[table_name]["main_program"] != None:
                     tensor_table_dict[table_name][
                         "main_program_id"] = program_idx
-                    self._server_sub_program.append(tensor_table_dict[
-                        table_name]["main_program"].desc)
+                    self._server_sub_program.append(
+                        tensor_table_dict[table_name]["main_program"].desc)
                     program_idx += 1
                 # Todo: Hard code for lr_decay table apply table id
-                new_table = _build_tensor_table(
-                    len(tables), tensor_table_dict[table_name])
+                new_table = _build_tensor_table(len(tables),
+                                                tensor_table_dict[table_name])
                 tables.append(new_table)
             return tables
 
@@ -989,12 +1005,11 @@ def _get_tables():
                     common.table_name = "MergedDense"
 
                 adam_d2sum = self.context["user_defined_strategy"].adam_d2sum
-                common.parse_by_optimizer(ctx.origin_varnames()[0],
-                                          ctx.is_sparse(),
-                                          ctx.sections()[0],
-                                          ctx.sections()[1]
-                                          if ctx.is_sparse() else 1,
-                                          self.compiled_strategy, adam_d2sum)
+                common.parse_by_optimizer(
+                    ctx.origin_varnames()[0], ctx.is_sparse(),
+                    ctx.sections()[0],
+                    ctx.sections()[1] if ctx.is_sparse() else 1,
+                    self.compiled_strategy, adam_d2sum)
 
                 if ctx.is_sparse():
                     common.parse_entry(common.table_name,
@@ -1056,8 +1071,8 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
             trainers += len(self.role_maker._get_heter_worker_endpoints())
         server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
         proto_txt = str(server)
-        fs_client = fsClient(self.context["user_defined_strategy"]
-                             .fs_client_param)
+        fs_client = fsClient(
+            self.context["user_defined_strategy"].fs_client_param)
         proto_txt = proto_txt + "\n" + fs_client.to_string()
 
         debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
@@ -1087,8 +1102,8 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
             for var_name in var_names:
                 if var_name not in distributed_varnames:
                     raise ValueError(
-                        "fleet.init server can only load sparse variables in {}".
-                        format(distributed_varnames))
+                        "fleet.init server can only load sparse variables in {}"
+                        .format(distributed_varnames))
             load_varnames = var_names
 
         if dirname is None or not load_varnames:
@@ -1125,6 +1140,7 @@ def _stop_worker(self):
 
     @staticmethod
     def __exclude_vars(exclude_var_names=[]):
+
         def is_valid(var):
             if var.name in exclude_var_names:
                 return False
@@ -1199,17 +1215,17 @@ def _save_distributed_persistables(self,
         saved_varnames = sparse_varnames
 
         remaining_vars = list(
-            filter(
-                TheOnePSRuntime.__exclude_vars(saved_varnames),
-                main_program.list_vars()))
+            filter(TheOnePSRuntime.__exclude_vars(saved_varnames),
+                   main_program.list_vars()))
 
         import paddle
         for var in remaining_vars:
             # if var.name not in recv_dense_varnames:
             #     continue
             tensor = var.get_value()
-            paddle.save(
-                tensor, os.path.join(dirname, var.name), use_binary_format=True)
+            paddle.save(tensor,
+                        os.path.join(dirname, var.name),
+                        use_binary_format=True)
 
     def _ps_inference_save_persistables(self,
                                         executor,
@@ -1312,16 +1328,14 @@ def _ps_inference_save_inference_model(self,
             "user_defined_strategy"].trainer_desc_configs["stat_var_names"]
         generate_vars = [var for var in generate_vars]
         remaining_vars = list(
-            filter(
-                TheOnePSRuntime.__exclude_vars(sparse_names),
-                infer_program.list_vars()))
+            filter(TheOnePSRuntime.__exclude_vars(sparse_names),
+                   infer_program.list_vars()))
 
         for var in remaining_vars:
             tensor = var.get_value()
-            paddle.save(
-                tensor,
-                os.path.join(model_path, var.name),
-                use_binary_format=True)
+            paddle.save(tensor,
+                        os.path.join(model_path, var.name),
+                        use_binary_format=True)
 
     def _save_inference_model(self, *args, **kwargs):
         self._ps_inference_save_inference_model(*args, **kwargs)
@@ -1374,9 +1388,8 @@ def _ps_inference_load_inference_model(self,
         loaded_varnames = sparse_varnames
 
         remaining_vars = list(
-            filter(
-                TheOnePSRuntime.__exclude_vars(loaded_varnames),
-                main_program.list_vars()))
+            filter(TheOnePSRuntime.__exclude_vars(loaded_varnames),
+                   main_program.list_vars()))
 
         if dirname.startswith("afs:") or dirname.startswith("hdfs:"):
             model_path = "./dnn_plugin"
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index fab7b4ff4ce3d..7e0456f279373 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -55,6 +55,7 @@ class FSShellCmdAborted(ExecuteError):
 
 
 class FS(object):
+
     @abc.abstractmethod
     def ls_dir(self, fs_path):
         raise NotImplementedError
@@ -386,7 +387,9 @@ def list_dirs(self, fs_path):
 
 
 def _handle_errors(max_time_out=None):
+
     def decorator(f):
+
         @functools.wraps(f)
         def handler(*args, **kwargs):
             o = args[0]
@@ -406,13 +409,15 @@ def handler(*args, **kwargs):
                 except ExecuteError as e:
                     if time.time() - start >= time_out:
                         raise FSTimeOut("args:{} timeout:{}".format(
-                            args, time.time() - start))
+                            args,
+                            time.time() - start))
 
                     time.sleep(inter)
 
                 if time.time() - last_print_time > 30:
                     print("hadoop operator timeout:args:{} timeout:{}".format(
-                        args, time.time() - start))
+                        args,
+                        time.time() - start))
                     last_print_time = time.time()
 
         return handler
@@ -778,8 +783,8 @@ def get_local_files(path):
         procs = []
         for i in range(multi_processes):
             process_datas = self._split_files(all_files, i, multi_processes)
-            p = multiprocessing.Process(
-                target=__subprocess_upload, args=(fs_path, process_datas))
+            p = multiprocessing.Process(target=__subprocess_upload,
+                                        args=(fs_path, process_datas))
             procs.append(p)
             p.start()
 
@@ -847,8 +852,8 @@ def __subprocess_download(local_path, datas):
         procs = []
         for i in range(multi_processes):
             process_datas = self._split_files(all_files, i, multi_processes)
-            p = multiprocessing.Process(
-                target=__subprocess_download, args=(local_path, process_datas))
+            p = multiprocessing.Process(target=__subprocess_download,
+                                        args=(local_path, process_datas))
             procs.append(p)
             p.start()
 
@@ -943,8 +948,8 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
 
         if test_exists:
             if not self.is_exist(fs_src_path):
-                raise FSFileNotExistsError("{} is not exists".format(
-                    fs_src_path))
+                raise FSFileNotExistsError(
+                    "{} is not exists".format(fs_src_path))
 
             if self.is_exist(fs_dst_path):
                 raise FSFileExistsError("{} exists already".format(fs_dst_path))
@@ -1398,8 +1403,8 @@ def __subprocess_download(local_path, datas):
         procs = []
         for i in range(multi_processes):
             process_datas = self._split_files(all_files, i, multi_processes)
-            p = multiprocessing.Process(
-                target=__subprocess_download, args=(local_path, process_datas))
+            p = multiprocessing.Process(target=__subprocess_download,
+                                        args=(local_path, process_datas))
             procs.append(p)
             p.start()
 
@@ -1453,8 +1458,8 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
 
         if test_exists:
             if not self.is_exist(fs_src_path):
-                raise FSFileNotExistsError("{} is not exists".format(
-                    fs_src_path))
+                raise FSFileNotExistsError(
+                    "{} is not exists".format(fs_src_path))
 
             if self.is_exist(fs_dst_path):
                 raise FSFileExistsError("{} exists already".format(fs_dst_path))
diff --git a/python/paddle/distributed/fleet/utils/http_server.py b/python/paddle/distributed/fleet/utils/http_server.py
index 7d30fc5e0dff0..4653b22f96e07 100644
--- a/python/paddle/distributed/fleet/utils/http_server.py
+++ b/python/paddle/distributed/fleet/utils/http_server.py
@@ -38,8 +38,9 @@ def get_logger(name, level, fmt):
     return logger
 
 
-_http_server_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_http_server_logger = get_logger(__name__,
+                                 logging.INFO,
+                                 fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
index 0ac2df76d6aec..e6b581464fa4d 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -275,10 +275,12 @@ def _init_communication_group(self):
             self.role_maker, wait_port=False)
 
         # Create global rings
-        collective_helper._init_communicator(
-            self._startup_program, self.current_endpoint, self.global_endpoints,
-            self.global_rank, self.global_ring_id, True, self.global_ring_id,
-            True)
+        collective_helper._init_communicator(self._startup_program,
+                                             self.current_endpoint,
+                                             self.global_endpoints,
+                                             self.global_rank,
+                                             self.global_ring_id, True,
+                                             self.global_ring_id, True)
 
         # Create mp rings
         if self.num_mp > 1:
@@ -287,9 +289,11 @@ def _init_communication_group(self):
                 idx for idx, mp_idx in enumerate(self.mp_group)
                 if mp_idx == self.rank
             ][0]
-            collective_helper._init_communicator(
-                self._startup_program, self.current_endpoint, mp_endpoints,
-                mp_rank, self.mp_ring_id, True, self.global_ring_id, True)
+            collective_helper._init_communicator(self._startup_program,
+                                                 self.current_endpoint,
+                                                 mp_endpoints, mp_rank,
+                                                 self.mp_ring_id, True,
+                                                 self.global_ring_id, True)
 
         # Create pipeline rings
         if self.num_pp > 1:
@@ -309,10 +313,12 @@ def _init_communication_group(self):
                     self.endpoints[first_node], self.endpoints[second_node]
                 ]
                 pipeline_rank = 0 if self.rank == first_node else 1
-                collective_helper._init_communicator(
-                    self._startup_program, self.current_endpoint,
-                    pipeline_endpoints, pipeline_rank, ring_id, False,
-                    self.global_ring_id, True)
+                collective_helper._init_communicator(self._startup_program,
+                                                     self.current_endpoint,
+                                                     pipeline_endpoints,
+                                                     pipeline_rank, ring_id,
+                                                     False, self.global_ring_id,
+                                                     True)
 
     def _get_input_output_info(self, block):
         '''
@@ -367,8 +373,8 @@ def _split_program(self, program, stage, block_idx):
                 op_idx += 1
                 if op.type == "while":
                     sub_block_id = int(op.attr('sub_block').id)
-                    sub_used_var_names = self._split_program(program, stage,
-                                                             sub_block_id)
+                    sub_used_var_names = self._split_program(
+                        program, stage, sub_block_id)
 
                     used_var_names.update(sub_used_var_names)
 
@@ -403,6 +409,7 @@ def _split_program(self, program, stage, block_idx):
 
         return used_var_names
 
+
 #     def _find_post_op(self, index, var_name):
 #         """
 #         Find the post op that has variable named var_name as input.
@@ -474,23 +481,25 @@ def _check_validation(self, block):
 
         pre_stage_id = None
         for op in block.ops:
-            assert op.has_attr(self._op_role_key), (
-                "{} has no {} set .".format(op.type, self._op_role_key))
+            assert op.has_attr(self._op_role_key), ("{} has no {} set .".format(
+                op.type, self._op_role_key))
             op_role = op.attr(self._op_role_key)
             assert op_role == int(self._op_role.Forward), (
                 "Only forward is supported for inference.")
             if not op._has_kernel(op.type):
-                assert op.type in ["while", "conditional_block"], (
-                    "The only supported op without kernel is while.")
+                assert op.type in [
+                    "while", "conditional_block"
+                ], ("The only supported op without kernel is while.")
                 sub_block_id = op.attr('sub_block').id
                 sub_block = block.program.block(sub_block_id)
                 self._check_validation(sub_block)
-            assert op.has_attr(self._op_device_key), (
-                "{} has no {} set.".format(op.type, self._op_device_key))
+            assert op.has_attr(
+                self._op_device_key), ("{} has no {} set.".format(
+                    op.type, self._op_device_key))
 
             device = op.attr(self._op_device_key)
-            assert device, (
-                "{} has no {} set.".format(op.type, self._op_device_key))
+            assert device, ("{} has no {} set.".format(op.type,
+                                                       self._op_device_key))
             if device.split(':')[1] == "all": continue
 
             dev_type = device.split(':')[0]
@@ -507,7 +516,9 @@ def _insert_sendrecv_ops_for_boundaries(self, block, is_while_block):
         # avoiding multiple send and recv ops.
         input_var_to_device = dict()
 
-        extra_index_info = {'index': 0, }
+        extra_index_info = {
+            'index': 0,
+        }
 
         for index, op in enumerate(list(block.ops)):
             cur_device = op.attr(self._op_device_key)
@@ -542,8 +553,8 @@ def _insert_sendrecv_ops_for_boundaries(self, block, is_while_block):
                 if (cur_device, prev_device) in input_var_to_device[var_name]:
                     continue
 
-                assert self._device == cur_device.split(':')[
-                    0], "More than one device type found."
+                assert self._device == cur_device.split(
+                    ':')[0], "More than one device type found."
                 device_type = cur_device.split(':')[0] + ':'
 
                 def _insert_send_recv(cur_id, prev_id):
@@ -614,9 +625,8 @@ def _insert_send_recv(cur_id, prev_id):
                         })
                     extra_index_info['index'] += 1
 
-                _insert_send_recv(
-                    int(cur_device.split(':')[1]),
-                    int(prev_device.split(':')[1]))
+                _insert_send_recv(int(cur_device.split(':')[1]),
+                                  int(prev_device.split(':')[1]))
         block._sync_with_cpp()
 
     def _insert_sendrecv_ops_in_while_block(
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
index 5e2ad43c16431..e2f7af769d39e 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -51,8 +51,10 @@ def _apply_collective_grads(parameters, comm_group):
         paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
         paddle.fluid.framework._dygraph_tracer().trace_op(
             type="elementwise_div",
-            inputs={'X': coalesced_grad,
-                    'Y': div_factor},
+            inputs={
+                'X': coalesced_grad,
+                'Y': div_factor
+            },
             outputs={'Out': coalesced_grad},
             attrs={'axis': -1})
 
@@ -76,7 +78,7 @@ def _apply_collective_grads_eager(parameters, comm_group):
 
     div_factor = 1.0 / comm_group.nranks
     for coalesced_grad, _, _ in coalesced_grads_and_vars:
-        # need to div nranks 
+        # need to div nranks
         coalesced_grad.scale_(div_factor)
         paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
 
@@ -89,22 +91,20 @@ def _broadcast_data_help(data, shape, dtype, hcg):
     mp_rank = hcg.get_model_parallel_rank()
 
     shape_gpu = paddle.to_tensor(shape, dtype="int32")
-    paddle.distributed.broadcast(
-        shape_gpu,
-        src=src_rank,
-        group=model_parallel_group,
-        use_calc_stream=True)
+    paddle.distributed.broadcast(shape_gpu,
+                                 src=src_rank,
+                                 group=model_parallel_group,
+                                 use_calc_stream=True)
 
     if mp_rank != 0:
         input_data = paddle.zeros(shape_gpu, dtype=dtype)
     else:
         input_data = data
 
-    paddle.distributed.broadcast(
-        input_data,
-        src=src_rank,
-        group=model_parallel_group,
-        use_calc_stream=True)
+    paddle.distributed.broadcast(input_data,
+                                 src=src_rank,
+                                 group=model_parallel_group,
+                                 use_calc_stream=True)
 
 
 def broadcast_input_data(hcg, *inputs, **kwargs):
@@ -128,15 +128,19 @@ def broadcast_input_data(hcg, *inputs, **kwargs):
 def broadcast_mp_parameters(model, hcg):
     model_parallel_group = hcg.get_model_parallel_group()
     src_rank = hcg.get_model_parallel_group_src_rank()
-    sync_params_buffers(
-        model, model_parallel_group, src_rank, is_model_parallel=True)
+    sync_params_buffers(model,
+                        model_parallel_group,
+                        src_rank,
+                        is_model_parallel=True)
 
 
 def broadcast_dp_parameters(model, hcg):
     data_parallel_group = hcg.get_data_parallel_group()
     src_rank = hcg.get_data_parallel_group_src_rank()
-    sync_params_buffers(
-        model, data_parallel_group, src_rank, is_model_parallel=False)
+    sync_params_buffers(model,
+                        data_parallel_group,
+                        src_rank,
+                        is_model_parallel=False)
 
 
 def fused_allreduce_gradients(parameter_list, hcg):
@@ -150,7 +154,7 @@ def fused_allreduce_gradients(parameter_list, hcg):
 
 def sharding_reduce_gradients(parameter_list, hcg):
     # TODO allreduce --> reduce
-    # TODO merge grad / nrank with dp 
+    # TODO merge grad / nrank with dp
     logger.debug("sharding start gradients sync")
     with framework.no_grad():
 
@@ -166,7 +170,7 @@ def sharding_reduce_gradients(parameter_list, hcg):
 
                 elif _in_legacy_dygraph():
                     g_var = param._grad_ivar()
-                    # need use trace_op to allreduce 
+                    # need use trace_op to allreduce
                     # paddle.distributed.all_reduce(
                     #     g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True)
                     paddle.fluid.framework._dygraph_tracer().trace_op(
@@ -179,12 +183,14 @@ def sharding_reduce_gradients(parameter_list, hcg):
                         })
 
                     # grad / sharding_rank
-                    div_factor = paddle.to_tensor(
-                        sharding_nrank, dtype=g_var.dtype)
+                    div_factor = paddle.to_tensor(sharding_nrank,
+                                                  dtype=g_var.dtype)
                     paddle.fluid.framework._dygraph_tracer().trace_op(
                         type="elementwise_div",
-                        inputs={'X': g_var,
-                                'Y': div_factor},
+                        inputs={
+                            'X': g_var,
+                            'Y': div_factor
+                        },
                         outputs={'Out': g_var},
                         attrs={'axis': -1})
 
@@ -194,5 +200,7 @@ def broadcast_sharding_parameters(model, hcg):
     logger.debug("sharding start init parameters sync")
     sharding_parallel_group = hcg.get_sharding_parallel_group()
     src_rank = hcg.get_sharding_parallel_group_src_rank()
-    sync_params_buffers(
-        model, sharding_parallel_group, src_rank, is_model_parallel=False)
+    sync_params_buffers(model,
+                        sharding_parallel_group,
+                        src_rank,
+                        is_model_parallel=False)
diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py b/python/paddle/distributed/fleet/utils/internal_storage.py
index 80d8d8562d48f..421111d5b8894 100644
--- a/python/paddle/distributed/fleet/utils/internal_storage.py
+++ b/python/paddle/distributed/fleet/utils/internal_storage.py
@@ -62,11 +62,11 @@ def to(self, device, dtype=None, keep_alignment=True):
         Move the underlying buffer
         """
         assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it"
-        assert (dtype == Type.fp32.value or
-                Type.fp16.value), "Conversion type is not supported now"
+        assert (dtype == Type.fp32.value
+                or Type.fp16.value), "Conversion type is not supported now"
 
-        dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
-                                                            .split(":")[1])
+        dev_id = 0 if paddle.get_device() == "cpu" else int(
+            paddle.get_device().split(":")[1])
 
         if self._device != device:
             tmp_buffer = self.buffer.cuda(
@@ -154,11 +154,11 @@ def _add_param_as_view(self, param, align, convert_gpu=True):
         param.stop_gradient = origin_state
 
         # Copy the current param value
-        dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
-                                                            .split(":")[1])
+        dev_id = 0 if paddle.get_device() == "cpu" else int(
+            paddle.get_device().split(":")[1])
         with device_guard(dev_id, "cpu"):
-            tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill,
-                                                             var_end))
+            tmp_var = core.VarBase(
+                tensor=self.buffer._slice(self._fill, var_end))
             if convert_gpu:
                 param_cpu = param.cpu()
                 param.value().get_tensor()._clear()
@@ -316,8 +316,8 @@ def _add_grad_as_view(self, param, align):
         assert offset <= np.prod(self.buffer.shape)
 
         # Copy the current grad value to InternalStorage
-        dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
-                                                            .split(":")[1])
+        dev_id = 0 if paddle.get_device() == "cpu" else int(
+            paddle.get_device().split(":")[1])
         if self._device == "cpu":
             with device_guard(dev_id, self._device):
                 tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
diff --git a/python/paddle/distributed/fleet/utils/log_util.py b/python/paddle/distributed/fleet/utils/log_util.py
index 77eb641e0c6fe..cf90527c07fe4 100644
--- a/python/paddle/distributed/fleet/utils/log_util.py
+++ b/python/paddle/distributed/fleet/utils/log_util.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class LoggerFactory:
+
     @staticmethod
     def build_logger(name=None, level=logging.INFO):
         assert name is not None, "name for logger should not be None"
diff --git a/python/paddle/distributed/fleet/utils/ps_util.py b/python/paddle/distributed/fleet/utils/ps_util.py
index e4dcd59b3f1ba..0e141d66c1a17 100644
--- a/python/paddle/distributed/fleet/utils/ps_util.py
+++ b/python/paddle/distributed/fleet/utils/ps_util.py
@@ -53,10 +53,10 @@ def init_distributed_infer_env(self,
             fake_optimizer = paddle.optimizer.SGD()
             strategy = fleet.DistributedStrategy()
             strategy.a_sync = True
-            optimizer = fleet.distributed_optimizer(
-                fake_optimizer, strategy=strategy)
-            optimizer.minimize(
-                loss, startup_program=self.origin_startup_program)
+            optimizer = fleet.distributed_optimizer(fake_optimizer,
+                                                    strategy=strategy)
+            optimizer.minimize(loss,
+                               startup_program=self.origin_startup_program)
 
             if fleet.is_server():
                 fleet.init_server(dirname=dirname)
@@ -100,11 +100,10 @@ def _init_dense_params(self, exe=None, dirname=None):
                 v[1] for v in dense_persist_vars
                 if os.path.isfile(os.path.join(dirname, v[0]))
             ]
-            paddle.static.load_vars(
-                exe,
-                dirname,
-                main_program=self.origin_main_program,
-                vars=need_load_vars)
+            paddle.static.load_vars(exe,
+                                    dirname,
+                                    main_program=self.origin_main_program,
+                                    vars=need_load_vars)
 
     def get_dist_infer_program(self):
         varname2tables = self._get_sparse_table_map()
@@ -113,6 +112,7 @@ def get_dist_infer_program(self):
         return convert_program
 
     def _convert_program(self, main_program, varname2tables):
+
         def distributed_ops_pass(program):
             SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"}
 
@@ -128,6 +128,7 @@ def _get_pull_sparse_ops(_program):
                 return pull_sparse_ops
 
             def _pull_sparse_fuse(_program, pull_sparse_ops):
+
                 def dag_check_up_and_reorder(program, inputs, outputs):
                     global_block = program.global_block()
                     min_output_index = len(global_block.ops)
@@ -152,8 +153,8 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                             for out_id, out_var in enumerate(outputs):
                                 if out_var.name in ins:
                                     output_indexes[idx] = 1
-                                    min_output_index = min(min_output_index,
-                                                           idx)
+                                    min_output_index = min(
+                                        min_output_index, idx)
 
                     for i in range(len(global_block.ops)):
                         if input_indexes[i] == 1 and output_indexes[i] == 1:
@@ -213,8 +214,8 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                                 desc = global_block.desc._insert_op(
                                     min_output_index)
                                 desc.copy_from(global_block.ops[index].desc)
-                                global_block.desc._remove_op(index + 1,
-                                                             index + 2)
+                                global_block.desc._remove_op(
+                                    index + 1, index + 2)
                                 global_block.ops[index].desc = desc
                                 insert_op = global_block.ops.pop(index)
                                 input_state = input_indexes.pop(index)
@@ -230,8 +231,8 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                         assert global_block.desc.op_size() == len(
                             global_block.ops)
                         for i in range(len(global_block.ops)):
-                            assert global_block.desc.op(i) == global_block.ops[
-                                i].desc
+                            assert global_block.desc.op(
+                                i) == global_block.ops[i].desc
 
                 for param, ops in pull_sparse_ops.items():
                     all_ops = program.global_block().ops
@@ -245,8 +246,8 @@ def dag_check_up_and_reorder(program, inputs, outputs):
 
                     if w.name not in varname2tables.keys():
                         raise ValueError(
-                            "can not find variable {}, please check your configuration".
-                            format(w.name))
+                            "can not find variable {}, please check your configuration"
+                            .format(w.name))
 
                     table_id = varname2tables[w.name]
 
@@ -266,16 +267,16 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                         program.global_block()._remove_op(idx)
 
                     inputs_idxs = [-1] * len(inputs)
-                    outputs_idxs = [len(program.global_block().ops) + 1] * len(
-                        outputs)
+                    outputs_idxs = [len(program.global_block().ops) + 1
+                                    ] * len(outputs)
 
                     for idx, op in enumerate(program.global_block().ops):
                         for i in range(0, len(op.output_names)):
                             outs = op.output(op.output_names[i])
                             for in_id, in_var in enumerate(inputs):
                                 if in_var.name in outs:
-                                    inputs_idxs[in_id] = max(idx,
-                                                             inputs_idxs[in_id])
+                                    inputs_idxs[in_id] = max(
+                                        idx, inputs_idxs[in_id])
                         for i in range(0, len(op.input_names)):
                             ins = op.input(op.input_names[i])
                             for out_id, out_var in enumerate(outputs):
@@ -289,8 +290,10 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                         program.global_block()._insert_op(
                             index=distributed_idx,
                             type="distributed_lookup_table",
-                            inputs={"Ids": inputs,
-                                    'W': w},
+                            inputs={
+                                "Ids": inputs,
+                                'W': w
+                            },
                             outputs={"Outputs": outputs},
                             attrs={
                                 "is_distributed": is_distributed,
diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py
index b8d1c881a08f9..423536b095a40 100755
--- a/python/paddle/distributed/fleet/utils/recompute.py
+++ b/python/paddle/distributed/fleet/utils/recompute.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,9 +21,10 @@
 from paddle.fluid.framework import in_dygraph_mode
 
 import logging
+
 logger = logging.getLogger(__name__)
-formatter = logging.Formatter(
-    fmt='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
+                              datefmt='%Y-%m-%d %H:%M:%S')
 ch = logging.StreamHandler()
 ch.setFormatter(formatter)
 logger.addHandler(ch)
@@ -68,13 +69,14 @@ def swith_rng_state_tracker(rng_state, tracker):
 
 
 class EagerRecomputeFunction(EagerPyLayer):
+
     @staticmethod
     def forward(ctx, run_function, preserve_rng_state, *args):
         from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker
         if framework._dygraph_tracer()._has_grad:
             check_recompute_necessary(args)
 
-        # store for recomputing 
+        # store for recomputing
         ctx.run_function = run_function
         ctx.preserve_rng_state = preserve_rng_state
 
@@ -101,8 +103,8 @@ def forward(ctx, run_function, preserve_rng_state, *args):
             cur_device = paddle.get_device()
             if 'gpu:' not in cur_device:
                 raise RuntimeError(
-                    "Recompute with RNG perserve is not support current device: {}.".
-                    format(cur_device))
+                    "Recompute with RNG perserve is not support current device: {}."
+                    .format(cur_device))
             ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state()
             ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker(
             ).get_states_tracker()
@@ -163,12 +165,11 @@ def backward(ctx, *args):
                         detached_inputs = detach_variable(tuple(inputs))
                         outputs = ctx.run_function(*detached_inputs)
             else:
-                with paddle.amp.auto_cast(
-                        enable=ctx.is_fw_autocast,
-                        custom_white_list=ctx.amp_white_list,
-                        custom_black_list=ctx.amp_black_list,
-                        level=ctx.amp_level,
-                        dtype=ctx.amp_dtype):
+                with paddle.amp.auto_cast(enable=ctx.is_fw_autocast,
+                                          custom_white_list=ctx.amp_white_list,
+                                          custom_black_list=ctx.amp_black_list,
+                                          level=ctx.amp_level,
+                                          dtype=ctx.amp_dtype):
                     detached_inputs = detach_variable(tuple(inputs))
                     outputs = ctx.run_function(*detached_inputs)
 
@@ -179,7 +180,7 @@ def backward(ctx, *args):
             # run backward() with only tensor that requires grad
             forward_outputs_with_grad = []
             # NOTE In Transformer-like network, if user put the attention mask into the recompute segment output,
-            # pylayer will force the stop_gradient of attention mask to be False, which will make the number of 
+            # pylayer will force the stop_gradient of attention mask to be False, which will make the number of
             # tensor that need grad does not match.
             # the following backward_inputs_with_grad is used to avoid this case.
             backward_inputs_with_grad = []
@@ -200,20 +201,20 @@ def backward(ctx, *args):
                 paddle.autograd.backward(forward_outputs_with_grad,
                                          backward_inputs_with_grad)
 
-            grads = tuple(
-                inp.grad for inp in detached_inputs
-                if isinstance(inp, core.eager.Tensor))
+            grads = tuple(inp.grad for inp in detached_inputs
+                          if isinstance(inp, core.eager.Tensor))
             return grads
 
 
 class RecomputeFunction(PyLayer):
+
     @staticmethod
     def forward(ctx, run_function, preserve_rng_state, *args):
         from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker
         if framework._dygraph_tracer()._has_grad:
             check_recompute_necessary(args)
 
-        # store for recomputing 
+        # store for recomputing
         ctx.run_function = run_function
         ctx.preserve_rng_state = preserve_rng_state
 
@@ -240,8 +241,8 @@ def forward(ctx, run_function, preserve_rng_state, *args):
             cur_device = paddle.get_device()
             if 'gpu:' not in cur_device:
                 raise RuntimeError(
-                    "Recompute with RNG perserve is not support current device: {}.".
-                    format(cur_device))
+                    "Recompute with RNG perserve is not support current device: {}."
+                    .format(cur_device))
             ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state()
             ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker(
             ).get_states_tracker()
@@ -302,12 +303,11 @@ def backward(ctx, *args):
                         detached_inputs = detach_variable(tuple(inputs))
                         outputs = ctx.run_function(*detached_inputs)
             else:
-                with paddle.amp.auto_cast(
-                        enable=ctx.is_fw_autocast,
-                        custom_white_list=ctx.amp_white_list,
-                        custom_black_list=ctx.amp_black_list,
-                        level=ctx.amp_level,
-                        dtype=ctx.amp_dtype):
+                with paddle.amp.auto_cast(enable=ctx.is_fw_autocast,
+                                          custom_white_list=ctx.amp_white_list,
+                                          custom_black_list=ctx.amp_black_list,
+                                          level=ctx.amp_level,
+                                          dtype=ctx.amp_dtype):
                     detached_inputs = detach_variable(tuple(inputs))
                     outputs = ctx.run_function(*detached_inputs)
 
@@ -318,7 +318,7 @@ def backward(ctx, *args):
             # run backward() with only tensor that requires grad
             forward_outputs_with_grad = []
             # NOTE In Transformer-like network, if user put the attention mask into the recompute segment output,
-            # pylayer will force the stop_gradient of attention mask to be False, which will make the number of 
+            # pylayer will force the stop_gradient of attention mask to be False, which will make the number of
             # tensor that need grad does not match.
             # the following backward_inputs_with_grad is used to avoid this case.
             backward_inputs_with_grad = []
@@ -463,8 +463,8 @@ def run_model(cuda_state, recompute_block=[], recompute_kwargs={}):
     # Hack to mix *args with **kwargs in a python 2.7-compliant way
     preserve = kwargs.pop('preserve_rng_state', True)
     if kwargs:
-        raise ValueError("Unexpected keyword arguments: " + ",".join(
-            arg for arg in kwargs))
+        raise ValueError("Unexpected keyword arguments: " +
+                         ",".join(arg for arg in kwargs))
 
     if in_dygraph_mode():
         return EagerRecomputeFunction.apply(function, preserve, *args)
diff --git a/python/paddle/distributed/launch/__main__.py b/python/paddle/distributed/launch/__main__.py
index 42f844ca71774..52b0ed3a012cc 100644
--- a/python/paddle/distributed/launch/__main__.py
+++ b/python/paddle/distributed/launch/__main__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py
index fbea5d0db869e..902c8189b1720 100644
--- a/python/paddle/distributed/launch/context/__init__.py
+++ b/python/paddle/distributed/launch/context/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class Context(object):
+
     def __init__(self, enable_plugin=True):
         self.args, self.unknown_args = parse_args()
         self.envs = fetch_envs()
diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py
index b70dd7d3f759f..f6624e88e276d 100644
--- a/python/paddle/distributed/launch/context/args_envs.py
+++ b/python/paddle/distributed/launch/context/args_envs.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,7 +35,6 @@
     'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
     'PADDLE_GLOO_PORT': 'gloo_port',
     'PADDLE_WITH_GLOO': 'with_gloo',
-    'PADDLE_DEVICE_NUM': 'device_num'
 }
 
 
@@ -51,102 +50,107 @@ def parse_args():
 
     base_group = parser.add_argument_group("Base Parameters")
 
-    base_group.add_argument(
-        "--master",
-        type=str,
-        default=None,
-        help="the master/rendezvous server, ip:port")
+    base_group.add_argument("--master",
+                            type=str,
+                            default=None,
+                            help="the master/rendezvous server, ip:port")
 
-    base_group.add_argument(
-        "--legacy", type=bool, default=False, help="use legacy launch")
+    base_group.add_argument("--legacy",
+                            type=bool,
+                            default=False,
+                            help="use legacy launch")
 
-    base_group.add_argument(
-        "--rank", type=int, default=-1, help="the node rank")
+    base_group.add_argument("--rank",
+                            type=int,
+                            default=-1,
+                            help="the node rank")
 
-    base_group.add_argument(
-        "--log_level", type=str, default="INFO", help="log level. Default INFO")
+    base_group.add_argument("--log_level",
+                            type=str,
+                            default="INFO",
+                            help="log level. Default INFO")
 
-    base_group.add_argument(
-        "--nnodes",
-        type=str,
-        default="1",
-        help="the number of nodes, i.e. pod/node number")
+    base_group.add_argument("--nnodes",
+                            type=str,
+                            default="1",
+                            help="the number of nodes, i.e. pod/node number")
 
-    base_group.add_argument(
-        "--nproc_per_node",
-        type=int,
-        default=None,
-        help="the number of processes in a pod")
+    base_group.add_argument("--nproc_per_node",
+                            type=int,
+                            default=None,
+                            help="the number of processes in a pod")
 
     base_group.add_argument(
         "--log_dir",
         type=str,
         default="log",
         help="the path for each process's log. Default ./log")
-    base_group.add_argument(
-        "--run_mode",
-        type=str,
-        default=None,
-        help="run mode of the job, collective/ps/ps-heter")
+    base_group.add_argument("--run_mode",
+                            type=str,
+                            default=None,
+                            help="run mode of the job, collective/ps/ps-heter")
 
-    base_group.add_argument(
-        "--job_id",
-        type=str,
-        default="default",
-        help="unique id of the job. Default default")
+    base_group.add_argument("--job_id",
+                            type=str,
+                            default="default",
+                            help="unique id of the job. Default default")
 
-    base_group.add_argument(
-        "--devices",
-        type=str,
-        default=None,
-        help="accelerate devices. as --gpus,npus,xps")
-
-    base_group.add_argument(
-        "--device_num",
-        type=int,
-        default=None,
-        help="the number of accelerate devices.")
+    base_group.add_argument("--devices",
+                            type=str,
+                            default=None,
+                            help="accelerate devices. as --gpus,npus,xps")
 
     base_group.add_argument("--host", type=str, default=None, help="host ip")
 
-    base_group.add_argument(
-        "training_script",
-        type=str,
-        help="the full path of py script,"
-        "followed by arguments for the "
-        "training script")
+    base_group.add_argument("training_script",
+                            type=str,
+                            help="the full path of py script,"
+                            "followed by arguments for the "
+                            "training script")
 
     base_group.add_argument('training_script_args', nargs=REMAINDER)
 
     ps_group = parser.add_argument_group("Parameter-Server Parameters")
     # for parameter server
-    ps_group.add_argument(
-        "--servers", type=str, default='', help="servers endpoints full list")
-    ps_group.add_argument(
-        "--trainers", type=str, default='', help="trainers endpoints full list")
-
-    ps_group.add_argument(
-        "--trainer_num", type=int, default=None, help="number of trainers")
-    ps_group.add_argument(
-        "--server_num", type=int, default=None, help="number of servers")
-    ps_group.add_argument(
-        "--gloo_port", type=int, default=6767, help="gloo http port")
-    ps_group.add_argument(
-        "--with_gloo", type=str, default="1", help="use gloo or not")
+    ps_group.add_argument("--servers",
+                          type=str,
+                          default='',
+                          help="servers endpoints full list")
+    ps_group.add_argument("--trainers",
+                          type=str,
+                          default='',
+                          help="trainers endpoints full list")
+
+    ps_group.add_argument("--trainer_num",
+                          type=int,
+                          default=None,
+                          help="number of trainers")
+    ps_group.add_argument("--server_num",
+                          type=int,
+                          default=None,
+                          help="number of servers")
+    ps_group.add_argument("--gloo_port",
+                          type=int,
+                          default=6767,
+                          help="gloo http port")
+    ps_group.add_argument("--with_gloo",
+                          type=str,
+                          default="1",
+                          help="use gloo or not")
 
     # parameter elastic mode
     elastic_group = parser.add_argument_group("Elastic Parameters")
-    elastic_group.add_argument(
-        "--max_restart",
-        type=int,
-        default=3,
-        help="the times can restart. Default 3")
+    elastic_group.add_argument("--max_restart",
+                               type=int,
+                               default=3,
+                               help="the times can restart. Default 3")
 
     elastic_group.add_argument(
         "--elastic_level",
         type=int,
         default=-1,
-        help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
+        help=
+        "elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
     )
 
     elastic_group.add_argument(
diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py
index 61ffe8e809564..7df7db28f7877 100644
--- a/python/paddle/distributed/launch/context/device.py
+++ b/python/paddle/distributed/launch/context/device.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,10 +21,10 @@ class DeviceType:
     XPU = 'xpu'
     NPU = 'npu'
     MLU = 'mlu'
-    IPU = 'ipu'
 
 
 class Device(object):
+
     def __init__(self, dtype=None, memory="", labels=""):
         self._dtype = dtype
         self._memory = memory
@@ -69,18 +69,12 @@ def get_selected_device_key(self):
             return 'FLAGS_selected_xpus'
         if self._dtype == DeviceType.MLU:
             return 'FLAGS_selected_mlus'
-        if self._dtype == DeviceType.IPU:
-            return 'FLAGS_selected_ipus'
         return 'FLAGS_selected_devices'
 
-    def get_selected_devices(self, devices='', device_num=None):
+    def get_selected_devices(self, devices=''):
         '''
         return the device label/id relative to the visible devices
         '''
-        if self._dtype == DeviceType.IPU:
-            if not device_num:
-                raise RuntimeError("The \'device_num\' is required by IPUs.")
-            return [str(device_num)]
         if not devices:
             return [str(x) for x in range(0, len(self._labels))]
         else:
@@ -136,9 +130,6 @@ def detect_device(self):
             dev._dtype = DeviceType.MLU
             num = fluid.core.get_mlu_device_count()
             visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_ipu():
-            dev._dtype = DeviceType.IPU
-            num = fluid.core.get_ipu_device_count()
 
         if num == 0:
             dev._dtype = DeviceType.CPU
diff --git a/python/paddle/distributed/launch/context/event.py b/python/paddle/distributed/launch/context/event.py
index 23e8e7a501400..cb39e1529fc82 100644
--- a/python/paddle/distributed/launch/context/event.py
+++ b/python/paddle/distributed/launch/context/event.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,6 +14,7 @@
 
 
 class Event(object):
+
     def __init__(self, kind="status", message="", fatal=False):
         self.kind = kind
         self.message = message
diff --git a/python/paddle/distributed/launch/context/node.py b/python/paddle/distributed/launch/context/node.py
index 8082541ffe06c..39f42d02107a2 100644
--- a/python/paddle/distributed/launch/context/node.py
+++ b/python/paddle/distributed/launch/context/node.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class Node(object):
+
     def __init__(self):
         # self.device = Device.detect_device()
         self.device = Device.parse_device()
diff --git a/python/paddle/distributed/launch/context/resource.py b/python/paddle/distributed/launch/context/resource.py
index faffed704c1f0..d523c3c5cdfe8 100644
--- a/python/paddle/distributed/launch/context/resource.py
+++ b/python/paddle/distributed/launch/context/resource.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,5 +14,6 @@
 
 
 class Resource(object):
+
     def __init__(self):
         self.devices = []
diff --git a/python/paddle/distributed/launch/context/status.py b/python/paddle/distributed/launch/context/status.py
index cfbf3623ec22e..b87b7b3fb82d8 100644
--- a/python/paddle/distributed/launch/context/status.py
+++ b/python/paddle/distributed/launch/context/status.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py
index 166eb3a4f9dfd..5d2bc8cb07e3b 100644
--- a/python/paddle/distributed/launch/controllers/collective.py
+++ b/python/paddle/distributed/launch/controllers/collective.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class CollectiveController(Controller):
+
     @classmethod
     def enable(cls, ctx):
         # collective is the default mode
@@ -54,9 +55,10 @@ def build_pod(self):
             'endpoints': ",".join(endpoints),
         })
 
-        peer_list, rank = self.master.sync_peers(
-            '/{}/info'.format(self.job.id), self.pod.name, data,
-            self.job.replicas, self.pod.rank)
+        peer_list, rank = self.master.sync_peers('/{}/info'.format(self.job.id),
+                                                 self.pod.name, data,
+                                                 self.job.replicas,
+                                                 self.pod.rank)
         self.pod.rank = rank
 
         if len(peer_list) < 1:
@@ -79,8 +81,7 @@ def build_pod(self):
         self.pod.reset()
         selected_dev_key = self.ctx.node.device.get_selected_device_key()
         selected_dev_list = self.ctx.node.device.get_selected_devices(
-            self.ctx.args.devices, self.ctx.args.device_num)
-
+            self.ctx.args.devices)
         for i in range(self.pod.replicas):
             e = {
                 "PADDLE_MASTER": collective_master,
@@ -96,8 +97,7 @@ def build_pod(self):
                 "PADDLE_TRAINERS_NUM": "{}".format(global_size),
                 "PADDLE_RANK_IN_NODE": str(i),
             }
-
-            if self.pod.replicas == 1 or self.ctx.node.device.dtype == "ipu":
+            if self.pod.replicas == 1:
                 e.update({selected_dev_key: ",".join(selected_dev_list)})
             else:
                 e.update({selected_dev_key: selected_dev_list[i]})
@@ -107,6 +107,7 @@ def build_pod(self):
 
 
 class CollectiveElasticController(CollectiveController):
+
     @classmethod
     def enable(cls, ctx):
         if ctx.args.master and ctx.args.master.startswith("etcd://"):
@@ -135,8 +136,9 @@ def run(self):
 
             self.ctx.logger.info("Waiting peer ready...")
 
-            ok, replicas = self.master.wait_peer_ready(
-                self.job.replicas_min, self.job.replicas_max, timeout)
+            ok, replicas = self.master.wait_peer_ready(self.job.replicas_min,
+                                                       self.job.replicas_max,
+                                                       timeout)
             if ok:
                 self.job.replicas = replicas
             else:
diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py
index f069bfbcd3501..a8ae155562ae9 100644
--- a/python/paddle/distributed/launch/controllers/controller.py
+++ b/python/paddle/distributed/launch/controllers/controller.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -32,6 +32,7 @@ class ControleMode:
 
 
 class ControllerBase(object):
+
     def __init__(self, ctx):
         signal.signal(signal.SIGTERM, self.signal_handler)
         signal.signal(signal.SIGABRT, self.signal_handler)
@@ -110,8 +111,8 @@ def watch(self) -> bool:
                     return False
 
             # peer failure
-            if self.ctx.status.is_restarting() and self.master.get_status(
-            ) != self.ctx.status.COMPLETED:
+            if self.ctx.status.is_restarting(
+            ) and self.master.get_status() != self.ctx.status.COMPLETED:
                 self.pod.stop()
                 return False
 
@@ -185,7 +186,8 @@ def new_container(self,
                       err=None):
         c = Container(
             entrypoint=(entrypoint or self._get_entrypoint()),
-            env=(self.ctx.get_envs() if use_ctx_env else {}), )
+            env=(self.ctx.get_envs() if use_ctx_env else {}),
+        )
         c.outfile, c.errfile = self._get_out_err_file(out, err)
         c.update_env(envs)
         return c
@@ -203,8 +205,10 @@ def add_container(self,
             log_file = None
 
         if not container:
-            container = self.new_container(
-                entrypoint=entrypoint, envs=envs, out=log_file, err=log_file)
+            container = self.new_container(entrypoint=entrypoint,
+                                           envs=envs,
+                                           out=log_file,
+                                           err=log_file)
 
         if is_init:
             self.pod.add_init_container(container)
diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py
index 742fea9e16de7..8e8d31f86dd9f 100644
--- a/python/paddle/distributed/launch/controllers/master.py
+++ b/python/paddle/distributed/launch/controllers/master.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -64,6 +64,7 @@ def factory(cls, ctx):
 
 
 class HTTPMaster(Master):
+
     def lazy_init(self):
         if self.initialized:
             return
@@ -81,8 +82,8 @@ def lazy_init(self):
                         self.role = Master.MAIN
                         break
                     except Exception as e:
-                        self.ctx.logger.warning("start master failed {}".format(
-                            e))
+                        self.ctx.logger.warning(
+                            "start master failed {}".format(e))
                         time.sleep(0.1)
                         continue
         else:
@@ -172,6 +173,7 @@ def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
 
 
 class ETCDMaster(Master):
+
     def __init__(self, ctx):
         super().__init__(ctx)
 
@@ -263,8 +265,9 @@ def _heartbeat():
             self.ctx.logger.debug("Heartbeat done")
             self.client.cancel_watch(beat_watch)
 
-        self.beat_thread = threading.Thread(
-            name='heartbeat', target=_heartbeat, daemon=True)
+        self.beat_thread = threading.Thread(name='heartbeat',
+                                            target=_heartbeat,
+                                            daemon=True)
         self.beat_thread.start()
 
     def fetch_peer_alive(self):
diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py
index 037bd313bbc03..19429ce19614e 100644
--- a/python/paddle/distributed/launch/controllers/ps.py
+++ b/python/paddle/distributed/launch/controllers/ps.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class PSController(Controller):
+
     @classmethod
     def enable(cls, ctx):
         if ctx.args.run_mode == ControleMode.PS or ctx.args.server_num or len(
@@ -132,9 +133,10 @@ def _build_pod_with_master(self):
             'gloo_port': self.ctx.node.get_free_port(),
         })
 
-        peer_list, rank = self.master.sync_peers(
-            '/{}/info'.format(self.job.id), self.pod.name, data,
-            self.job.replicas, self.pod.rank)
+        peer_list, rank = self.master.sync_peers('/{}/info'.format(self.job.id),
+                                                 self.pod.name, data,
+                                                 self.job.replicas,
+                                                 self.pod.rank)
 
         self.ctx.logger.debug("sync peers done {}".format(peer_list))
 
@@ -171,15 +173,22 @@ def _build_pod_with_master(self):
 
         for i in range(server_num):
             e = {
-                "PADDLE_NNODES": "{}".format(self.job.replicas),
-                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
-                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_NNODES":
+                "{}".format(self.job.replicas),
+                "PADDLE_PSERVERS_IP_PORT_LIST":
+                ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS":
+                ",".join(trainer_endpoints),
                 "PADDLE_PORT":
                 server_endpoints[i + server_rank_offset].split(":")[1],
-                "PADDLE_ROLE": "PSERVER",
-                "TRAINING_ROLE": "PSERVER",
-                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
-                "POD_IP": self.ctx.node.ip,
+                "PADDLE_ROLE":
+                "PSERVER",
+                "TRAINING_ROLE":
+                "PSERVER",
+                "PADDLE_TRAINERS_NUM":
+                "{}".format(len(trainer_endpoints)),
+                "POD_IP":
+                self.ctx.node.ip,
             }
             e.update(_gloo_envs)
             log_tag = "ps.{}".format(i)
@@ -187,16 +196,24 @@ def _build_pod_with_master(self):
 
         for i in range(trainer_num):
             e = {
-                "PADDLE_NNODES": "{}".format(self.job.replicas),
-                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
-                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_NNODES":
+                "{}".format(self.job.replicas),
+                "PADDLE_PSERVERS_IP_PORT_LIST":
+                ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS":
+                ",".join(trainer_endpoints),
                 "PADDLE_PORT":
                 trainer_endpoints[i + trainer_rank_offset].split(":")[1],
-                "PADDLE_ROLE": "TRAINER",
-                "TRAINING_ROLE": "TRAINER",
-                "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset),
-                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
-                "POD_IP": self.ctx.node.ip,
+                "PADDLE_ROLE":
+                "TRAINER",
+                "TRAINING_ROLE":
+                "TRAINER",
+                "PADDLE_TRAINER_ID":
+                "{}".format(i + trainer_rank_offset),
+                "PADDLE_TRAINERS_NUM":
+                "{}".format(len(trainer_endpoints)),
+                "POD_IP":
+                self.ctx.node.ip,
             }
             e.update(_gloo_envs)
             log_tag = "trainer.{}".format(i)
diff --git a/python/paddle/distributed/launch/controllers/watcher.py b/python/paddle/distributed/launch/controllers/watcher.py
index 4d49b924f1e81..131d915292e17 100644
--- a/python/paddle/distributed/launch/controllers/watcher.py
+++ b/python/paddle/distributed/launch/controllers/watcher.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class Watcher(object):
+
     def __init__(self, ctx):
         self.ctx = ctx
 
diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py
index a1ad6dbe24e8e..9f7b1733d1af2 100644
--- a/python/paddle/distributed/launch/job/container.py
+++ b/python/paddle/distributed/launch/job/container.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -106,8 +106,10 @@ def start(self):
         elif self._err:
             self._stderr = self._get_fd(self._err) or sys.stderr
 
-        self._proc = ProcessContext(
-            self._entrypoint, env=self._env, out=self._stdout, err=self._stderr)
+        self._proc = ProcessContext(self._entrypoint,
+                                    env=self._env,
+                                    out=self._stdout,
+                                    err=self._stderr)
         self._proc.start()
 
     def terminate(self, force=False):
@@ -143,7 +145,8 @@ def __str__(self):
             self._entrypoint,
             self.exit_code,
             self.errfile,
-            self._env, )
+            self._env,
+        )
 
     def logs(self, fn=None, offset=0, whence=1, limit=1000):
         if not self._log_handler:
diff --git a/python/paddle/distributed/launch/job/job.py b/python/paddle/distributed/launch/job/job.py
index 31827968ddce6..4bad1209c1859 100644
--- a/python/paddle/distributed/launch/job/job.py
+++ b/python/paddle/distributed/launch/job/job.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@ class JobMode:
 
 
 class Job(object):
+
     def __init__(self, jid='default', mode=JobMode.COLLECTIVE, nnodes="1"):
         self._mode = mode
         self._id = jid
diff --git a/python/paddle/distributed/launch/job/pod.py b/python/paddle/distributed/launch/job/pod.py
index 701adf45f94e8..cda400f0a324a 100644
--- a/python/paddle/distributed/launch/job/pod.py
+++ b/python/paddle/distributed/launch/job/pod.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class PodSepc(object):
+
     def __init__(self):
         self._name = ''.join(
             random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6))
@@ -41,12 +42,14 @@ def __init__(self):
 
 
 class Pod(PodSepc):
+
     def __init__(self):
         super().__init__()
 
     def __str__(self):
-        return "Pod: {}, replicas {}, status {}".format(
-            self.name, self.replicas, self.status)
+        return "Pod: {}, replicas {}, status {}".format(self.name,
+                                                        self.replicas,
+                                                        self.status)
 
     def failed_container(self):
         cs = []
diff --git a/python/paddle/distributed/launch/job/status.py b/python/paddle/distributed/launch/job/status.py
index ae10c5adb6cbf..88fd09bbf2267 100644
--- a/python/paddle/distributed/launch/job/status.py
+++ b/python/paddle/distributed/launch/job/status.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py
index 92585c9e7657a..f90fa7401e9a0 100644
--- a/python/paddle/distributed/launch/main.py
+++ b/python/paddle/distributed/launch/main.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -52,9 +52,7 @@ def launch():
 
         - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``.
 
-        - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu/ipu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device.
-
-        - ``--device_num``: The number of selected accelerate devices on nodes, can be gpu/xpu/npu/mlu/ipu etc.. e.g., ``--device_num=4`` will require four devices per node.
+        - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device.
 
         - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``
 
diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py
index faa8f2823733c..fe8116207e6d8 100644
--- a/python/paddle/distributed/launch/plugins/__init__.py
+++ b/python/paddle/distributed/launch/plugins/__init__.py
@@ -25,20 +25,6 @@ def log(ctx):
     ctx.logger.info("--------------------------------------------------")
 
 
-def rewrite_ipu_script(ctx):
-    import paddle.fluid as fluid
-    if fluid.core.is_compiled_with_ipu():
-        import os
-        if ctx.args.training_script != "ipu":
-            raise RuntimeError(
-                "Only support to run the script \'ipu\' for IPU distributed computing."
-            )
-        ctx.args.training_script = os.path.abspath(
-            os.path.join(
-                os.path.dirname(os.path.dirname(__file__)),
-                "utils/ipu_launch.py"))
-
-
 def process_args(ctx):
     # reset device by args
     #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
@@ -55,8 +41,8 @@ def collective_compatible(ctx):
         hosts = set([h.split(':')[0] for h in eps])
         ctx.args.master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
         ctx.args.nnodes = len(hosts)
-        ctx.logger.info('args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(
-            eps))
+        ctx.logger.info(
+            'args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(eps))
     '''
     if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs:
         eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')
@@ -74,6 +60,4 @@ def rewrite_host_ip(ctx):
         ctx.node.ip = ctx.args.host
 
 
-enabled_plugins = [
-    collective_compatible, rewrite_host_ip, process_args, rewrite_ipu_script
-]
+enabled_plugins = [collective_compatible, rewrite_host_ip, process_args]
diff --git a/python/paddle/distributed/launch/utils/ipu_launch.py b/python/paddle/distributed/launch/utils/ipu_launch.py
deleted file mode 100644
index 595243cdf9d9c..0000000000000
--- a/python/paddle/distributed/launch/utils/ipu_launch.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-
-import subprocess
-import argparse
-import os
-import logging
-import sys
-
-
-class IPULaunch(object):
-    def __init__(self, hosts, ipus_per_replica, nproc_per_host, ipu_partition,
-                 vipu_server, training_script, training_script_args):
-        if not fluid.core.is_compiled_with_ipu():
-            raise RuntimeError(
-                "Can not call ipu_launch.py in non IPU compiled environment, please re-compile with WITH_IPU=ON."
-            )
-        self._hosts = hosts
-        self._ipus_per_replica = ipus_per_replica
-        self._nproc_per_host = nproc_per_host
-        self._ipu_partition = ipu_partition
-        self._vipu_server = vipu_server
-        self._training_script = training_script
-        self._training_script_args = training_script_args
-
-        self._num_ipus = int(os.getenv("FLAGS_selected_ipus"))
-        self.logger = self.get_logger()
-
-    @classmethod
-    def parse_ipu_args(self):
-        parser = argparse.ArgumentParser()
-        parser.add_argument(
-            "--hosts",
-            type=str,
-            help="The hosts for IPU PopRun distributd computing.")
-        parser.add_argument(
-            "--ipus_per_replica",
-            type=int,
-            help="The number of IPUs per replica.")
-        parser.add_argument(
-            "--nproc_per_host",
-            type=int,
-            help="The number of processes per host.")
-        parser.add_argument(
-            "--ipu_partition", type=str, help="The partition name of IPU.")
-        parser.add_argument(
-            "--vipu_server",
-            type=str,
-            help="The vipu server host to enable vipu.")
-        parser.add_argument(
-            "training_script",
-            type=str,
-            help="The full path to the single IPU replica training program/script to be launched in parallel."
-        )
-        parser.add_argument('training_script_args', nargs=argparse.REMAINDER)
-        args = parser.parse_args()
-
-        ipu_launch = IPULaunch(
-            hosts=args.hosts,
-            ipus_per_replica=args.ipus_per_replica,
-            nproc_per_host=args.nproc_per_host,
-            ipu_partition=args.ipu_partition,
-            vipu_server=args.vipu_server,
-            training_script=args.training_script,
-            training_script_args=args.training_script_args, )
-
-        return ipu_launch
-
-    def get_logger(self, level=logging.INFO):
-        logger = logging.getLogger("LAUNCH")
-        logger.setLevel(level)
-        formatter = logging.Formatter(
-            fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
-        ch = logging.StreamHandler()
-        ch.setFormatter(formatter)
-        logger.addHandler(ch)
-        return logger
-
-    def launch(self):
-        # The number of replicas for data parallel
-        assert (self._num_ipus % self._ipus_per_replica) == 0, \
-                    "The number of IPUs:{} mod the number of IPUs per replica:{} must == 0".format(self._num_ipus, self._ipus_per_replica)
-        num_replicas = self._num_ipus // self._ipus_per_replica
-        self.logger.info("The number of total replicas is {}.".format(
-            num_replicas))
-
-        # The number of processes
-        num_nodes = len(self._hosts.split(','))
-        num_procs = num_nodes * self._nproc_per_host
-        self.logger.info("The number of total processes is {}.".format(
-            num_procs))
-        assert (num_replicas % num_procs) == 0, \
-                    "The number of replicas:{} mod the number of processes:{} must == 0".format(num_replicas, num_procs)
-
-        # hosts and endpoints
-        hosts = self._hosts.replace(' ', '').split(',')
-        endpoints = [x + ":8090" for x in hosts]
-
-        # args for poprun
-        poprun_command = ['poprun']
-
-        poprun_command.append('--num-instances={}'.format(num_procs))
-        poprun_command.append('--num-replicas={}'.format(num_replicas))
-        poprun_command.append('--ipus-per-replica={}'.format(
-            self._ipus_per_replica))
-        poprun_command.append('--host={}'.format(','.join(hosts)))
-        poprun_command.append('--vipu-partition={}'.format(self._ipu_partition))
-        poprun_command.append('--vipu-server-host={}'.format(self._vipu_server))
-
-        poprun_command.extend([
-            '--update-partition=no', '--vipu-server-timeout=120',
-            '--print-topology=yes', '--numa-aware=yes'
-        ])
-
-        # global envs
-        global_envs = '--mpi-local-args=\''
-        log_level = os.getenv('POPART_LOG_LEVEL', None)
-        if log_level:
-            global_envs += '-x POPART_LOG_LEVEL={} '.format(log_level)
-        global_envs += '-x PADDLE_TRAINERS_NUM={} -x PADDLE_TRAINER_ENDPOINTS={}'.format(
-            num_procs, ','.join(endpoints))
-        global_envs += '\''
-        poprun_command.append(global_envs)
-
-        # local envs
-        for idx in range(num_procs):
-            cur_endpoint = endpoints[idx // self._nproc_per_host]
-            rank_in_node = idx % self._nproc_per_host
-            poprun_command.append(
-                '--instance-mpi-local-args={}:\"-x PADDLE_TRAINER_ID={} -x PADDLE_CURRENT_ENDPOINT={} -x PADDLE_RANK_IN_NODE={}\"'.
-                format(idx, idx, cur_endpoint, rank_in_node))
-
-        # executor
-        poprun_command.append(sys.executable)
-
-        # script and script args
-        poprun_command.append(self._training_script)
-        for arg in self._training_script_args:
-            poprun_command.append(arg)
-
-        # for debug
-        print("-----------  PopRun Command -----------")
-        for i in range(len(poprun_command) - 1):
-            print("%s \\" % (poprun_command[i]))
-        print("%s" % (poprun_command[len(poprun_command) - 1]))
-        print("---------------------------------------")
-
-        # Launch
-        subprocess.run(" ".join(poprun_command), shell=True)
-
-
-if __name__ == '__main__':
-    ipu_launch = IPULaunch.parse_ipu_args()
-    ipu_launch.launch()
diff --git a/python/paddle/distributed/launch/utils/kv_client.py b/python/paddle/distributed/launch/utils/kv_client.py
index e19195412268a..a66ca800c58c2 100644
--- a/python/paddle/distributed/launch/utils/kv_client.py
+++ b/python/paddle/distributed/launch/utils/kv_client.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 
 
 class KVClient(object):
+
     def __init__(self, endpoint='localhost:2379'):
         self.endpoint = endpoint if endpoint.startswith(
             "http://") else "http://{}".format(endpoint)
diff --git a/python/paddle/distributed/launch/utils/kv_server.py b/python/paddle/distributed/launch/utils/kv_server.py
index 2d7ae15f13d63..ddf5685c988b7 100644
--- a/python/paddle/distributed/launch/utils/kv_server.py
+++ b/python/paddle/distributed/launch/utils/kv_server.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+
     def do_GET(self):
         with self.server.kv_lock:
             ret = {}
@@ -68,6 +69,7 @@ def log_message(self, format, *args):
 
 
 class KVServer(HTTPServer, object):
+
     def __init__(self, port):
         super(KVServer, self).__init__(('', port), KVHandler)
         self.kv_lock = threading.Lock()
@@ -89,6 +91,7 @@ def stop(self):
 
 
 class PKVServer():
+
     def __init__(self, port):
         self._server = KVServer(port)
 
diff --git a/python/paddle/distributed/launch/utils/nvsmi.py b/python/paddle/distributed/launch/utils/nvsmi.py
index 82a23189ac6af..dc07fbc1d21cb 100644
--- a/python/paddle/distributed/launch/utils/nvsmi.py
+++ b/python/paddle/distributed/launch/utils/nvsmi.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class Info(object):
+
     def __repr__(self):
         return str(self.__dict__)
 
diff --git a/python/paddle/distributed/launch/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py
index 4d6fa8de794ff..075536c8a8cb5 100644
--- a/python/paddle/distributed/launch/utils/process_context.py
+++ b/python/paddle/distributed/launch/utils/process_context.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 
 
 class ProcessContext(object):
+
     def __init__(self,
                  cmd,
                  env=os.environ,
@@ -35,12 +36,11 @@ def __init__(self,
 
     def _start(self):
         pre_fn = os.setsid if self._group else None
-        self._proc = subprocess.Popen(
-            self._cmd,
-            env=self._env,
-            stdout=self._stdout,
-            stderr=self._stderr,
-            preexec_fn=self._preexec_fn or pre_fn)
+        self._proc = subprocess.Popen(self._cmd,
+                                      env=self._env,
+                                      stdout=self._stdout,
+                                      stderr=self._stderr,
+                                      preexec_fn=self._preexec_fn or pre_fn)
 
     def _close_std(self):
         try:
diff --git a/python/paddle/distributed/metric/__init__.py b/python/paddle/distributed/metric/__init__.py
index a5b0f4cb49d5f..f87fe885824b8 100644
--- a/python/paddle/distributed/metric/__init__.py
+++ b/python/paddle/distributed/metric/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 
 from .metrics import init_metric  # noqa: F401
-from .metrics import print_auc  # noqa: F401 
+from .metrics import print_auc  # noqa: F401
diff --git a/python/paddle/distributed/metric/metrics.py b/python/paddle/distributed/metric/metrics.py
index 5685b6f053eba..08d185efd971a 100644
--- a/python/paddle/distributed/metric/metrics.py
+++ b/python/paddle/distributed/metric/metrics.py
@@ -50,11 +50,12 @@ def init_metric(metric_ptr,
         phase = 1 if is_join else 0
 
         if metric_runner['method'] == 'AucCalculator':
-            metric_ptr.init_metric(
-                metric_runner['method'], metric_runner['name'],
-                metric_runner['label'], metric_runner['target'],
-                cmatch_rank_var, mask_var, uid_var, phase, cmatch_rank_group,
-                ignore_rank, bucket_size)
+            metric_ptr.init_metric(metric_runner['method'],
+                                   metric_runner['name'],
+                                   metric_runner['label'],
+                                   metric_runner['target'], cmatch_rank_var,
+                                   mask_var, uid_var, phase, cmatch_rank_group,
+                                   ignore_rank, bucket_size)
         elif metric_runner['method'] == 'MultiTaskAucCalculator':
             metric_ptr.init_metric(
                 metric_runner['method'], metric_runner['name'],
@@ -69,11 +70,12 @@ def init_metric(metric_ptr,
                 metric_runner['cmatch_group'], metric_runner['ignore_rank'],
                 bucket_size)
         elif metric_runner['method'] == 'MaskAucCalculator':
-            metric_ptr.init_metric(
-                metric_runner['method'], metric_runner['name'],
-                metric_runner['label'], metric_runner['target'],
-                cmatch_rank_var, metric_runner['mask'], uid_var, phase,
-                cmatch_rank_group, ignore_rank, bucket_size)
+            metric_ptr.init_metric(metric_runner['method'],
+                                   metric_runner['name'],
+                                   metric_runner['label'],
+                                   metric_runner['target'], cmatch_rank_var,
+                                   metric_runner['mask'], uid_var, phase,
+                                   cmatch_rank_group, ignore_rank, bucket_size)
         elif metric_runner['method'] == 'CmatchRankMaskAucCalculator':
             metric_ptr.init_metric(
                 metric_runner['method'], metric_runner['name'],
@@ -82,17 +84,19 @@ def init_metric(metric_ptr,
                 phase, metric_runner['cmatch_group'],
                 metric_runner['ignore_rank'], bucket_size)
         elif metric_runner['method'] == 'WuAucCalculator':
-            metric_ptr.init_metric(
-                metric_runner['method'], metric_runner['name'],
-                metric_runner['label'], metric_runner['target'],
-                cmatch_rank_var, mask_var, metric_runner['uid'], phase,
-                cmatch_rank_group, ignore_rank, bucket_size)
+            metric_ptr.init_metric(metric_runner['method'],
+                                   metric_runner['name'],
+                                   metric_runner['label'],
+                                   metric_runner['target'], cmatch_rank_var,
+                                   mask_var, metric_runner['uid'], phase,
+                                   cmatch_rank_group, ignore_rank, bucket_size)
         else:
-            metric_ptr.init_metric(
-                metric_runner['method'], metric_runner['name'],
-                metric_runner['label'], metric_runner['target'],
-                cmatch_rank_var, mask_var, phase, cmatch_rank_group,
-                ignore_rank, bucket_size)
+            metric_ptr.init_metric(metric_runner['method'],
+                                   metric_runner['name'],
+                                   metric_runner['label'],
+                                   metric_runner['target'], cmatch_rank_var,
+                                   mask_var, phase, cmatch_rank_group,
+                                   ignore_rank, bucket_size)
 
 
 def print_metric(metric_ptr, name):
diff --git a/python/paddle/distributed/models/__init__.py b/python/paddle/distributed/models/__init__.py
index e1663029ef1f8..97043fd7ba688 100644
--- a/python/paddle/distributed/models/__init__.py
+++ b/python/paddle/distributed/models/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/models/moe/__init__.py b/python/paddle/distributed/models/moe/__init__.py
index e1663029ef1f8..97043fd7ba688 100644
--- a/python/paddle/distributed/models/moe/__init__.py
+++ b/python/paddle/distributed/models/moe/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py
index ea3dc43d0c712..3b955dd2a8d2f 100644
--- a/python/paddle/distributed/models/moe/utils.py
+++ b/python/paddle/distributed/models/moe/utils.py
@@ -51,11 +51,10 @@ def _number_count(numbers, upper_range):
         helper = LayerHelper(op_type, **locals())
         out = helper.create_variable_for_type_inference(dtype=numbers.dtype)
 
-        helper.append_op(
-            type=op_type,
-            inputs={'numbers': numbers},
-            outputs={'Out': out},
-            attrs={'upper_range': upper_range})
+        helper.append_op(type=op_type,
+                         inputs={'numbers': numbers},
+                         outputs={'Out': out},
+                         attrs={'upper_range': upper_range})
         return out
 
 
@@ -99,14 +98,13 @@ def _assign_pos(x, cum_count):
         helper = LayerHelper(op_type, **locals())
         out = helper.create_variable_for_type_inference(dtype=cum_count.dtype)
 
-        helper.append_op(
-            type=op_type,
-            inputs={
-                'X': [x],
-                'cum_count': [cum_count],
-                "eff_num_len": [cum_count[-1]]
-            },
-            outputs={'Out': [out]})
+        helper.append_op(type=op_type,
+                         inputs={
+                             'X': [x],
+                             'cum_count': [cum_count],
+                             "eff_num_len": [cum_count[-1]]
+                         },
+                         outputs={'Out': [out]})
         return out
 
 
@@ -169,12 +167,13 @@ def _limit_by_capacity(expert_count, capacity, n_worker):
         out = helper.create_variable_for_type_inference(
             dtype=expert_count.dtype)
 
-        helper.append_op(
-            type=op_type,
-            inputs={'expert_count': expert_count,
-                    'capacity': capacity},
-            outputs={'Out': out},
-            attrs={'n_worker': n_worker})
+        helper.append_op(type=op_type,
+                         inputs={
+                             'expert_count': expert_count,
+                             'capacity': capacity
+                         },
+                         outputs={'Out': out},
+                         attrs={'n_worker': n_worker})
         return out
 
 
@@ -206,8 +205,9 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
         return _C_ops.prune_gate_by_capacity(gate_idx, expert_count, "n_expert",
                                              n_expert, "n_worker", n_worker)
     elif _in_legacy_dygraph():
-        return core.ops.prune_gate_by_capacity(
-            gate_idx, expert_count, "n_expert", n_expert, "n_worker", n_worker)
+        return core.ops.prune_gate_by_capacity(gate_idx, expert_count,
+                                               "n_expert", n_expert, "n_worker",
+                                               n_worker)
     check_variable_and_dtype(gate_idx, 'GateIdx', ['int32', 'int64'],
                              'paddle.distributed.utils.prune_gate_by_capacity')
     check_variable_and_dtype(expert_count, 'ExpertCount', ['int32', 'int64'],
@@ -216,12 +216,15 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
     helper = LayerHelper('prune_gate_by_capacity', **locals())
     new_gate_idx = helper.create_variable_for_type_inference(
         dtype=gate_idx.dtype)
-    helper.append_op(
-        type='prune_gate_by_capacity',
-        inputs={'GateIdx': gate_idx,
-                "ExpertCount": expert_count},
-        outputs={'NewGateIdx': new_gate_idx},
-        attrs={"n_expert": n_expert,
-               "n_worker": n_worker})
+    helper.append_op(type='prune_gate_by_capacity',
+                     inputs={
+                         'GateIdx': gate_idx,
+                         "ExpertCount": expert_count
+                     },
+                     outputs={'NewGateIdx': new_gate_idx},
+                     attrs={
+                         "n_expert": n_expert,
+                         "n_worker": n_worker
+                     })
 
     return new_gate_idx
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index 8cd6c4647dce4..79b680ef2d187 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -19,6 +19,7 @@
 from multiprocessing import Manager  # noqa: F401
 import time
 import sys
+import paddle
 
 from paddle import compat as cpt
 
@@ -46,7 +47,7 @@
 
 ParallelStrategy = core.ParallelStrategy
 
-# NOTE(chenweihang): Maintain a global parallel env to avoid 
+# NOTE(chenweihang): Maintain a global parallel env to avoid
 # initializing ParallelEnv every time and improve performance
 _global_parallel_env = None
 
@@ -70,9 +71,10 @@ def _start_kv_server(port, http_server_d, size):
 
 def _is_cpuonly(backend):
     check_backend(backend)
-    if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] and (
-            core.is_compiled_with_cuda() or core.is_compiled_with_xpu() or
-            core.is_compiled_with_npu() or core.is_compiled_with_mlu()):
+    if backend in [
+            'auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'
+    ] and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()
+           or core.is_compiled_with_npu() or core.is_compiled_with_mlu()):
 
         # passes 'auto' and can use cuda or xpu, use the default logics. so return False
         return False
@@ -158,14 +160,14 @@ def train():
             "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything."
         )
         return
-    # NOTE(xiongkun): support cpu gloo only, add this environment variable to 
+    # NOTE(xiongkun): support cpu gloo only, add this environment variable to
     #                 enable cpu only gloo prarllel training)
     backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
     is_cpu_only = _is_cpuonly(backend)
-    # 1. gpu xpu check, must be gpu or xpu, 
-    if not (is_cpu_only or core.is_compiled_with_cuda() or
-            core.is_compiled_with_xpu() or core.is_compiled_with_npu() or
-            core.is_compiled_with_mlu()):
+    # 1. gpu xpu check, must be gpu or xpu,
+    if not (is_cpu_only or core.is_compiled_with_cuda()
+            or core.is_compiled_with_xpu() or core.is_compiled_with_npu()
+            or core.is_compiled_with_mlu()):
         raise NotImplementedError(
             "If you want to use CPU-only version, please use 'gloo' as backend")
 
@@ -219,8 +221,8 @@ def train():
             "required to create a process group.")
         master_addr = os.getenv("MASTER_ADDR", None)
         master_port = os.getenv("MASTER_PORT", None)
-        endpoints = ":".join(
-            [master_addr, master_port]) if master_addr and master_port else None
+        endpoints = ":".join([master_addr, master_port
+                              ]) if master_addr and master_port else None
         if endpoints is None:
             endpoints = os.getenv("PADDLE_MASTER", None)
         if endpoints is None:
@@ -234,31 +236,30 @@ def train():
         master_port = int(master_port)
         is_master = rank == 0
         stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
-        default_store = core.TCPStore(
-            master_addr,
-            master_port,
-            is_master,
-            world_size,
-            stop_check_timeout=stop_check_timeout)
+        default_store = core.TCPStore(master_addr,
+                                      master_port,
+                                      is_master,
+                                      world_size,
+                                      stop_check_timeout=stop_check_timeout)
         _set_default_store(default_store)
-        pg = _new_process_group_impl(
-            backend,
-            default_store,
-            rank,
-            world_size,
-            _default_group_name,
-            pg_options=None)
+        pg = _new_process_group_impl(backend,
+                                     default_store,
+                                     rank,
+                                     world_size,
+                                     _default_group_name,
+                                     pg_options=None)
         ranks = list(range(world_size))
-        group = Group(
-            rank,
-            world_size,
-            id=0,
-            ranks=ranks,
-            pg=pg,
-            name=_default_group_name)
+        group = Group(rank,
+                      world_size,
+                      id=0,
+                      ranks=ranks,
+                      pg=pg,
+                      name=_default_group_name)
         _set_group_map_by_name(_default_group_name, group)
         _set_group_map(0, group)
         parallel_helper._set_parallel_ctx(True)
+
+        paddle.distributed.barrier(group=group)
         return group
 
     node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
@@ -275,9 +276,8 @@ def train():
             size = {'_worker': parallel_env.world_size}
             if backend == "heter":
                 size = {'_worker': len(node_num)}
-            http_server = Process(
-                target=_start_kv_server,
-                args=(int(ep_rank_0[1]), http_server_d, size))
+            http_server = Process(target=_start_kv_server,
+                                  args=(int(ep_rank_0[1]), http_server_d, size))
             http_server.daemon = True
             http_server_d["running"] = True
             http_server.start()
@@ -325,7 +325,7 @@ def train():
     # are separately looking for free ports which sometimes
     # leads to port-conflict.
     if (is_cpu_only or backend == "heter") and parallel_env.rank == 0:
-        # compare to init_gloo, we don't need to 
+        # compare to init_gloo, we don't need to
         # init gloo, because we do this in _init_parallel_ctx;
         http_server_d["running"] = False
         http_server.join()
diff --git a/python/paddle/distributed/parallel_with_gloo.py b/python/paddle/distributed/parallel_with_gloo.py
index 5a6f58e05bad5..363de6a5505bd 100755
--- a/python/paddle/distributed/parallel_with_gloo.py
+++ b/python/paddle/distributed/parallel_with_gloo.py
@@ -103,9 +103,9 @@ def test_gloo_init_with_multiprocess(num_of_ranks):
     if rank_id == 0:
         # The scope for worker used by http server is '_worker'
         size = {'_worker': rank_num}
-        http_server_proc = Process(
-            target=_start_kv_server,
-            args=(int(server_endpoint.split(":")[1]), http_server_status, size))
+        http_server_proc = Process(target=_start_kv_server,
+                                   args=(int(server_endpoint.split(":")[1]),
+                                         http_server_status, size))
         http_server_proc.daemon = True
         http_server_status["running"] = True
         http_server_proc.start()
diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py
index bfa760698fe8b..3649d571aa403 100644
--- a/python/paddle/distributed/passes/__init__.py
+++ b/python/paddle/distributed/passes/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py
index 3cd04affa29c2..3a552d76a2d46 100644
--- a/python/paddle/distributed/passes/auto_parallel_amp.py
+++ b/python/paddle/distributed/passes/auto_parallel_amp.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,10 +26,12 @@
 from paddle.fluid.contrib.mixed_precision.fp16_utils import _valid_types, find_true_post_op, find_true_prev_op
 from paddle.fluid.contrib.mixed_precision.fp16_utils import _is_in_black_varnames, _dtype_to_str, _rename_arg
 from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute
+
 world_process_group = get_world_process_group()
 
 
 class AMPState(object):
+
     def __init__(self, block):
         self._block = block
         self._op_fp16_dict = {
@@ -88,8 +90,8 @@ def _mark_black_white_ops(self, amp_lists):
                             if in_var.op is None:
                                 continue
                             elif in_var.op is op:
-                                prev_op = find_true_prev_op(ops, op,
-                                                            in_var_name)
+                                prev_op = find_true_prev_op(
+                                    ops, op, in_var_name)
                                 if prev_op is None:
                                     continue
                             else:
@@ -166,8 +168,8 @@ def _insert_cast_op_forward(self, op, idx, src_dtype, dst_dtype,
                         assert in_var_dist_attr is not None
                         ref_mesh = in_var_dist_attr.process_mesh
                         ref_mapping = in_var_dist_attr.dims_mapping
-                        consume_op_attr.set_input_dist_attr(cast_name,
-                                                            in_var_dist_attr)
+                        consume_op_attr.set_input_dist_attr(
+                            cast_name, in_var_dist_attr)
 
                         out_var = self._block.create_var(
                             name=cast_name,
@@ -192,8 +194,8 @@ def _insert_cast_op_forward(self, op, idx, src_dtype, dst_dtype,
                     else:
                         in_var_dist_attr = consume_op_attr.get_input_dist_attr(
                             in_var.name)
-                        consume_op_attr.set_input_dist_attr(cast_name,
-                                                            in_var_dist_attr)
+                        consume_op_attr.set_input_dist_attr(
+                            cast_name, in_var_dist_attr)
                     _rename_arg(op, in_var.name, cast_name)
                 else:
                     if op.has_attr('in_dtype'):
@@ -297,8 +299,8 @@ def _keep_fp32_output(op, out_name):
                         grad_op.desc._rename_input(in_var_name, cast_name)
                         in_var_dist_attr = consume_op_attr.get_input_dist_attr(
                             in_var_name)
-                        consume_op_attr.set_input_dist_attr(cast_name,
-                                                            in_var_dist_attr)
+                        consume_op_attr.set_input_dist_attr(
+                            cast_name, in_var_dist_attr)
                     else:
                         assert in_var.dtype == dst_dtype
 
@@ -382,8 +384,8 @@ def _update_backward_cast_ops(params_grads, dist_context):
     for p, g in params_grads:
         op = g.op
         if g.dtype == core.VarDesc.VarType.FP32 and op.type == 'cast':
-            if int(op.attr('op_role')) == int(OpRole.Backward) and op.has_attr(
-                    'op_role_var'):
+            if int(op.attr('op_role')) == int(
+                    OpRole.Backward) and op.has_attr('op_role_var'):
                 op._remove_attr("op_role_var")
 
             post_ops = find_true_post_op(main_block.ops, op, g.name)
@@ -398,13 +400,12 @@ def _update_backward_cast_ops(params_grads, dist_context):
             # add new op in the python and cpp at the same time
             new_op_desc = main_block.desc.append_op()
             new_op_desc.copy_from(op.desc)
-            new_op = paddle.fluid.framework.Operator(
-                block=main_block,
-                desc=new_op_desc,
-                type=None,
-                inputs=None,
-                outputs=None,
-                attrs=None)
+            new_op = paddle.fluid.framework.Operator(block=main_block,
+                                                     desc=new_op_desc,
+                                                     type=None,
+                                                     inputs=None,
+                                                     outputs=None,
+                                                     attrs=None)
             main_block.ops.append(new_op)
 
             # dist attr
@@ -452,11 +453,10 @@ def _check_and_update_gradient(params_grads, loss_scaling, dist_context):
     inputs = {'X': grads, 'Scale': loss_scaling}
     outputs = {'Out': grads, 'FoundInfinite': found_inf}
     attrs = {'op_role': OpRole.Backward}
-    new_op = main_block.append_op(
-        type='check_finite_and_unscale',
-        inputs=inputs,
-        outputs=outputs,
-        attrs=attrs)
+    new_op = main_block.append_op(type='check_finite_and_unscale',
+                                  inputs=inputs,
+                                  outputs=outputs,
+                                  attrs=attrs)
 
     new_op_dist_attr = OperatorDistributedAttribute()
     new_op_dist_attr.process_mesh = world_process_group.ranks
@@ -476,6 +476,7 @@ def _check_and_update_gradient(params_grads, loss_scaling, dist_context):
 
 @register_pass("auto_parallel_amp")
 class AMPPass(PassBase):
+
     def __init__(self):
         super(AMPPass, self).__init__()
         self.set_attr("loss", None)
@@ -514,8 +515,8 @@ def _check_conflict(self, other_pass):
 
         return True
 
-    # NOTE: why AMPBackwardPass can override apply_single_impl instead of 
-    # apply_impl? AMP is an optimization pass for serial program, 
+    # NOTE: why AMPBackwardPass can override apply_single_impl instead of
+    # apply_impl? AMP is an optimization pass for serial program,
     # in distributed scenario, all ranks should have the same modification.
     def _apply_single_impl(self, main_program, startup_program, context):
         self.dist_context = self.get_attr("dist_context")
@@ -532,12 +533,12 @@ def _apply_single_impl(self, main_program, startup_program, context):
         with paddle.static.program_guard(main_program, startup_program):
             amp_state.cast_forward_program(self.dist_context)
             amp_state.cast_backward_program(params_grads, self.dist_context)
-            # TODO (JZ-LIANG)support cast forward program only when inference 
+            # TODO (JZ-LIANG)support cast forward program only when inference
             self._init_amp_var()
             self._scale_loss()
 
-            if self.get_attr("use_dynamic_loss_scaling") or self.get_attr(
-                    "init_loss_scaling") != 1.0:
+            if self.get_attr("use_dynamic_loss_scaling"
+                             ) or self.get_attr("init_loss_scaling") != 1.0:
                 grads, found_inf = _check_and_update_gradient(
                     params_grads, self._loss_scaling, self.dist_context)
 
@@ -587,8 +588,8 @@ def _scale_loss(self):
 
         if loss.dtype != core.VarDesc.VarType.FP32:
             # cast loss here will change the effective loss tensor for the computation graph
-            # and therefore will effect all following passes whose logic is based on the loss tensor(Recompute & Gradient Merge), 
-            # so we it is not allowed by now. fixed it in future.   
+            # and therefore will effect all following passes whose logic is based on the loss tensor(Recompute & Gradient Merge),
+            # so we it is not allowed by now. fixed it in future.
             raise NotImplementedError(
                 "Loss's generator op is not support in FP16 in Auto Parallel by now, please put that op into your black-list."
             )
@@ -598,8 +599,8 @@ def _scale_loss(self):
             loss_dist_attr = self.dist_context.get_tensor_dist_attr_for_program(
                 loss)
             ref_mesh = loss_op_dist_attr.process_mesh
-            self.dist_context.set_tensor_dist_attr_for_program(cast_loss,
-                                                               loss_dist_attr)
+            self.dist_context.set_tensor_dist_attr_for_program(
+                cast_loss, loss_dist_attr)
 
             loss_op_idx = find_op_index(main_block.desc, loss_op.desc)
             cast_op = main_block._insert_op(
@@ -619,8 +620,8 @@ def _scale_loss(self):
                 cast_op, ref_mesh, [-1], self.dist_context)
             loss = loss.astype('float32')
 
-        if self.get_attr("use_dynamic_loss_scaling") or self.get_attr(
-                "init_loss_scaling") != 1.0:
+        if self.get_attr("use_dynamic_loss_scaling"
+                         ) or self.get_attr("init_loss_scaling") != 1.0:
 
             loss_op_idx = find_op_index(main_block.desc, loss_op.desc)
 
@@ -637,10 +638,14 @@ def _scale_loss(self):
             elementwise_mul_op = main_block._insert_op(
                 loss_op_idx + 1,
                 type='elementwise_mul',
-                inputs={'X': [loss],
-                        'Y': [self._loss_scaling]},
+                inputs={
+                    'X': [loss],
+                    'Y': [self._loss_scaling]
+                },
                 outputs={'Out': [self._scaled_loss]},
-                attrs={'op_role': loss_op.all_attrs()[OP_ROLE_KEY], })
+                attrs={
+                    'op_role': loss_op.all_attrs()[OP_ROLE_KEY],
+                })
             loss_op._set_attr(OP_ROLE_KEY,
                               core.op_proto_and_checker_maker.OpRole.Forward)
             naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
@@ -730,11 +735,10 @@ def _update_loss_scaling(self, grads, found_inf):
             'op_role': OpRole.Backward
         }
 
-        new_op = main_block.append_op(
-            type='update_loss_scaling',
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs)
+        new_op = main_block.append_op(type='update_loss_scaling',
+                                      inputs=inputs,
+                                      outputs=outputs,
+                                      attrs=attrs)
 
         new_op_dist_attr = OperatorDistributedAttribute()
         new_op_dist_attr.process_mesh = world_process_group.ranks
diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py
index b01f3975aefdd..8bfde1cba1cab 100644
--- a/python/paddle/distributed/passes/auto_parallel_fp16.py
+++ b/python/paddle/distributed/passes/auto_parallel_fp16.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -95,6 +95,7 @@ def _keep_fp32_output(op, out_name):
 
 
 class FP16State(object):
+
     def __init__(self,
                  program,
                  amp_list,
@@ -169,8 +170,8 @@ def _mark_op(self, op):
             if op.desc.original_id() in self.grad_op_to_op_map:
                 fwd_op_id = self.grad_op_to_op_map[op.desc.original_id()]
                 assert fwd_op_id in self._op_fp16_dict, "{}".format(str(op))
-                self._op_fp16_dict[op.desc.original_id()] = self._op_fp16_dict[
-                    fwd_op_id]
+                self._op_fp16_dict[
+                    op.desc.original_id()] = self._op_fp16_dict[fwd_op_id]
 
         if int(op.attr('op_role')) == 257:
             self.is_train = True
@@ -182,7 +183,7 @@ def set_var_to_fp16(self, var_name, block):
         except ValueError as e:
             var = self.program.global_block().var(var_name)
 
-        # NOTE(JZ-LIANG) "array_" is a hack to adopt for ernie3.0 inference, since there is  
+        # NOTE(JZ-LIANG) "array_" is a hack to adopt for ernie3.0 inference, since there is
         # a trick which make the LOD_TENSOR_ARRAY to the float32 in while block to reset the LOD_TENSOR_ARRAY
         if var is None or var.type not in _valid_types or "array_" in var_name:
             return
@@ -299,8 +300,9 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype,
                     cast_name = in_var.name + '.cast_' + _dtype_to_str(
                         dst_dtype)
                     cast_var = block.vars.get(cast_name)
-                    self.forward_input_cast_ops[op.desc.original_id()] += [(
-                        cast_name, in_var.name, dst_dtype, src_dtype, in_name)]
+                    self.forward_input_cast_ops[op.desc.original_id()] += [
+                        (cast_name, in_var.name, dst_dtype, src_dtype, in_name)
+                    ]
 
                     in_var_dist_attr = consume_op_attr.get_input_dist_attr(
                         in_var.name)
@@ -367,9 +369,8 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype,
 
             # rename input
             assert src_name in op.input(
-                slot_name), "var: {} not in op's {}. {}".format(src_name,
-                                                                slot_name,
-                                                                str(op))
+                slot_name), "var: {} not in op's {}. {}".format(
+                    src_name, slot_name, str(op))
             src_var_dist_attr = grad_op_attr.get_input_dist_attr(src_name)
             assert src_var_dist_attr is not None
             op._rename_input(src_name, cast_name)
@@ -394,8 +395,8 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype,
                 type=grad.type,
                 persistable=grad.persistable,
                 stop_gradient=grad.stop_gradient)
-            dist_context.set_tensor_dist_attr_for_program(cast_grad,
-                                                          grad_dist_attr)
+            dist_context.set_tensor_dist_attr_for_program(
+                cast_grad, grad_dist_attr)
             op._rename_output(grad_name, cast_grad.name)
             grad_op_attr.set_output_dist_attr(cast_grad.name, grad_dist_attr)
 
@@ -441,11 +442,10 @@ def _check_and_update_gradient(grads, loss_scaling, name, dist_context):
     inputs = {'X': grads, 'Scale': loss_scaling}
     outputs = {'Out': grads, 'FoundInfinite': found_inf}
     attrs = {'op_role': OpRole.Backward}
-    new_op = main_block.append_op(
-        type='check_finite_and_unscale',
-        inputs=inputs,
-        outputs=outputs,
-        attrs=attrs)
+    new_op = main_block.append_op(type='check_finite_and_unscale',
+                                  inputs=inputs,
+                                  outputs=outputs,
+                                  attrs=attrs)
 
     new_op_dist_attr = OperatorDistributedAttribute()
     new_op_dist_attr.process_mesh = world_process_group.ranks
@@ -493,11 +493,12 @@ def _set_op_dist_attr_with_ranks(new_op, ranks, block, dist_context):
 
 @register_pass("auto_parallel_fp16")
 class FP16Pass(AMPPass):
+
     def __init__(self):
         super(FP16Pass, self).__init__()
 
-    # NOTE: why FP16Pass can override apply_single_impl instead of 
-    # apply_impl? AMP is an optimization pass for serial program, 
+    # NOTE: why FP16Pass can override apply_single_impl instead of
+    # apply_impl? AMP is an optimization pass for serial program,
     # in distributed scenario, all ranks should have the same modification.
     def _apply_single_impl(self, main_program, startup_program, context):
         self.dist_context = self.get_attr("dist_context")
@@ -507,7 +508,7 @@ def _apply_single_impl(self, main_program, startup_program, context):
             set(self.get_attr("custom_white_list")),
             set(self.get_attr("custom_black_list")), None)
 
-        # NOTE don't not change input data dtype, since it is controled by dataloader 
+        # NOTE don't not change input data dtype, since it is controled by dataloader
         # and which is out of control of FP16 Pass
         input_data_var_names = [var.name for var in self.get_attr("input_data")]
 
@@ -519,14 +520,14 @@ def _apply_single_impl(self, main_program, startup_program, context):
 
         if is_train:
             with paddle.static.program_guard(main_program, startup_program):
-                # TODO (JZ-LIANG)support cast forward program only when inference 
+                # TODO (JZ-LIANG)support cast forward program only when inference
                 self._init_amp_var()
                 self._scale_loss()
 
                 grads, fp32_grads, fp16_grads = _split_grads(params_grads)
 
-                if self.get_attr("use_dynamic_loss_scaling") or self.get_attr(
-                        "init_loss_scaling") != 1.0:
+                if self.get_attr("use_dynamic_loss_scaling"
+                                 ) or self.get_attr("init_loss_scaling") != 1.0:
                     found_infs = []
                     if fp32_grads:
                         with main_program._backward_role_guard():
@@ -573,8 +574,9 @@ def _apply_single_impl(self, main_program, startup_program, context):
             base_opt._multi_precision = True
             if self.get_attr("use_optimizer_fp16"):
                 base_opt._multi_precision = False
-            if isinstance(base_opt, (paddle.fluid.optimizer.Adam,
-                                     paddle.optimizer.AdamW)):
+            if isinstance(
+                    base_opt,
+                (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW)):
                 # with main_program._optimized_guard([]):
                 #     found_inf = paddle.tensor.creation._memcpy(
                 #         found_inf, paddle.CPUPlace())
diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
index accac81133825..bc40dad8ac0d9 100644
--- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
+++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 from paddle.distributed.auto_parallel.utils import set_var_dist_attr
 from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping
 from paddle.distributed.auto_parallel.process_group import get_world_process_group
+
 world_process_group = get_world_process_group()
 
 
@@ -79,57 +80,59 @@ def _remove_op_role_var(param, grad):
 def _get_gm_cond_var(main_program, k_steps, dist_context):
     main_block = main_program.global_block()
     # Add const var
-    k_step_var = layers.create_global_var(
-        name="gradient_merge_k",
-        shape=[1],
-        value=int(k_steps),
-        dtype='int32',
-        persistable=True,
-        force_cpu=True)
+    k_step_var = layers.create_global_var(name="gradient_merge_k",
+                                          shape=[1],
+                                          value=int(k_steps),
+                                          dtype='int32',
+                                          persistable=True,
+                                          force_cpu=True)
     set_var_dist_attr(dist_context, k_step_var, [-1], world_process_group.ranks)
 
-    zero_var = layers.create_global_var(
-        name="gradient_merge_zero",
-        shape=[1],
-        value=int(0),
-        dtype='int32',
-        persistable=True,
-        force_cpu=True)
+    zero_var = layers.create_global_var(name="gradient_merge_zero",
+                                        shape=[1],
+                                        value=int(0),
+                                        dtype='int32',
+                                        persistable=True,
+                                        force_cpu=True)
     set_var_dist_attr(dist_context, zero_var, [-1], world_process_group.ranks)
 
     # Add step var & cond var
-    step_var = layers.create_global_var(
-        name="gradient_merge_step",
-        shape=[1],
-        value=int(0),
-        dtype='int32',
-        persistable=True,
-        force_cpu=True)
+    step_var = layers.create_global_var(name="gradient_merge_step",
+                                        shape=[1],
+                                        value=int(0),
+                                        dtype='int32',
+                                        persistable=True,
+                                        force_cpu=True)
     set_var_dist_attr(dist_context, step_var, [-1], world_process_group.ranks)
 
-    cond_var = main_block.create_var(
-        name="gradient_merge_cond", shape=[1], dtype='bool')
+    cond_var = main_block.create_var(name="gradient_merge_cond",
+                                     shape=[1],
+                                     dtype='bool')
     set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks)
 
     with device_guard("cpu"):
         # step_var = (step_var + 1) % k_step
         layers.increment(x=step_var, value=1.0, in_place=True)
-        elementwise_mod_op = main_block.append_op(
-            type='elementwise_mod',
-            inputs={'X': step_var,
-                    'Y': k_step_var},
-            outputs={'Out': step_var},
-            attrs={'axis': -1,
-                   'use_mkldnn': False})
+        elementwise_mod_op = main_block.append_op(type='elementwise_mod',
+                                                  inputs={
+                                                      'X': step_var,
+                                                      'Y': k_step_var
+                                                  },
+                                                  outputs={'Out': step_var},
+                                                  attrs={
+                                                      'axis': -1,
+                                                      'use_mkldnn': False
+                                                  })
         naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
             elementwise_mod_op, world_process_group.ranks, [-1], dist_context)
 
         # cond_var = (step_var == 0)
-        equal_op = main_block.append_op(
-            type='equal',
-            inputs={'X': step_var,
-                    'Y': zero_var},
-            outputs={'Out': cond_var})
+        equal_op = main_block.append_op(type='equal',
+                                        inputs={
+                                            'X': step_var,
+                                            'Y': zero_var
+                                        },
+                                        outputs={'Out': cond_var})
         naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
             equal_op, world_process_group.ranks, [-1], dist_context)
 
@@ -137,9 +140,7 @@ def _get_gm_cond_var(main_program, k_steps, dist_context):
 
 
 def _append_gradient_merge_backward_op(
-        main_program,
-        startup_program,
-        params_grads: List[Tuple[Any, Any]],
+        main_program, startup_program, params_grads: List[Tuple[Any, Any]],
         cond_var_name: str,
         dist_context) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]:
     main_block = main_program.global_block()
@@ -162,11 +163,11 @@ def _append_gradient_merge_backward_op(
         assert (param_var is not None)
         ref_dist_attr = dist_context.get_tensor_dist_attr_for_program(param_var)
         assert ref_dist_attr is not None
-        gradient_merge_var = main_block.create_var(
-            name=param_name + "@GRAD@GradientMerge",
-            shape=param_var.shape,
-            dtype=param_var.dtype,
-            persistable=True)
+        gradient_merge_var = main_block.create_var(name=param_name +
+                                                   "@GRAD@GradientMerge",
+                                                   shape=param_var.shape,
+                                                   dtype=param_var.dtype,
+                                                   persistable=True)
         param_to_gradient_merge[param_name] = gradient_merge_var
         ref_process_mesh = ref_dist_attr.process_mesh
         ref_dims_mapping = ref_dist_attr.dims_mapping
@@ -179,23 +180,25 @@ def _append_gradient_merge_backward_op(
             shape=param_var.shape,
             dtype=param_var.dtype,
             persistable=True)
-        startup_block.append_op(
-            type="fill_constant",
-            outputs={"Out": startup_gradient_merge_var},
-            attrs={
-                "shape": param_var.shape,
-                "dtype": param_var.dtype,
-                "value": float(0),
-            })
+        startup_block.append_op(type="fill_constant",
+                                outputs={"Out": startup_gradient_merge_var},
+                                attrs={
+                                    "shape": param_var.shape,
+                                    "dtype": param_var.dtype,
+                                    "value": float(0),
+                                })
 
         # grad_merge += grad
-        new_grad_op = main_block.append_op(
-            type="elementwise_add",
-            inputs={'X': grad,
-                    'Y': gradient_merge_var},
-            outputs={'Out': gradient_merge_var},
-            attrs={'axis': -1,
-                   'use_mkldnn': False})
+        new_grad_op = main_block.append_op(type="elementwise_add",
+                                           inputs={
+                                               'X': grad,
+                                               'Y': gradient_merge_var
+                                           },
+                                           outputs={'Out': gradient_merge_var},
+                                           attrs={
+                                               'axis': -1,
+                                               'use_mkldnn': False
+                                           })
         new_params_to_grads.append([param, gradient_merge_var])
         naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
             new_grad_op, ref_process_mesh, ref_dims_mapping, dist_context)
@@ -203,13 +206,10 @@ def _append_gradient_merge_backward_op(
 
 
 def _create_cond_block_and_update_optimizer(
-        main_program,
-        cond_var,
-        new_params_to_grads: List[Tuple[Any, Any]],
-        param_to_gradient_merge: Dict[str, Any],
-        optimize_ops_desc: List[Any],
-        k_steps,
-        avg):
+        main_program, cond_var, new_params_to_grads: List[Tuple[Any, Any]],
+        param_to_gradient_merge: Dict[str, Any], optimize_ops_desc: List[Any],
+        k_steps, avg):
+
     def true_apply_gradient():
         cur_block_idx = main_program.current_block_idx
         cur_block = main_program.current_block()
@@ -220,15 +220,14 @@ def true_apply_gradient():
         if avg:
             for param, new_grad in new_params_to_grads:
                 # grad /= k_steps
-                cur_block.append_op(
-                    type='scale',
-                    inputs={'X': new_grad},
-                    outputs={'Out': new_grad},
-                    attrs={
-                        'scale': 1.0 / k_steps,
-                        'bias': 0.0,
-                        'bias_after_scale': False
-                    })
+                cur_block.append_op(type='scale',
+                                    inputs={'X': new_grad},
+                                    outputs={'Out': new_grad},
+                                    attrs={
+                                        'scale': 1.0 / k_steps,
+                                        'bias': 0.0,
+                                        'bias_after_scale': False
+                                    })
                 new_grad.op._set_attr(op_maker.kOpRoleAttrName(),
                                       op_maker.OpRole.Optimize)
 
@@ -264,11 +263,10 @@ def true_apply_gradient():
 
         # clear gradient_merge_vars
         for param, new_grad in new_params_to_grads:
-            layers.fill_constant(
-                shape=new_grad.shape,
-                dtype=new_grad.dtype,
-                value=0.0,
-                out=new_grad)
+            layers.fill_constant(shape=new_grad.shape,
+                                 dtype=new_grad.dtype,
+                                 value=0.0,
+                                 out=new_grad)
             new_grad.op._set_attr(op_maker.kOpRoleAttrName(),
                                   op_maker.OpRole.Optimize)
 
@@ -292,13 +290,15 @@ def parse_program(main_program, startup_program, params_grads, k_steps, avg,
         dist_context)
 
     # 4 create ConditionalBlock and append gradient merge optimizer ops
-    _create_cond_block_and_update_optimizer(
-        main_program, cond_var, new_params_to_grads, param_to_gradient_merge,
-        optimize_ops_desc, k_steps, avg)
+    _create_cond_block_and_update_optimizer(main_program, cond_var,
+                                            new_params_to_grads,
+                                            param_to_gradient_merge,
+                                            optimize_ops_desc, k_steps, avg)
 
 
 @register_pass("auto_parallel_gradient_merge_pass")
 class GradientMergePass(PassBase):
+
     def __init__(self):
         super(GradientMergePass, self).__init__()
         self.set_attr("k_steps", -1)
diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py
index c6d1685446277..fcd7d24377117 100644
--- a/python/paddle/distributed/passes/auto_parallel_recompute.py
+++ b/python/paddle/distributed/passes/auto_parallel_recompute.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -28,6 +28,7 @@
 
 
 class RecomputeState(ProgramStats):
+
     def __init__(self, block, ops):
         super(RecomputeState, self).__init__(block=block, ops=ops)
         self._block = block
@@ -70,22 +71,25 @@ def get_recompute_segments(self, checkpoints):
                 flag, min_idx, max_idx = self.is_subgraph(
                     [checkpoints[start_idx]], [checkpoints[start_idx + 1]])
                 if flag:
-                    min_idx = self._update_segment_start(min_idx,
-                                                         pre_segment_end_idx)
+                    min_idx = self._update_segment_start(
+                        min_idx, pre_segment_end_idx)
                     segments.append([min_idx, max_idx + 1])
                 else:
-                    logging.info("Could not recompute op range [{}] - [{}] ".
-                                 format(min_idx, max_idx + 1))
+                    logging.info(
+                        "Could not recompute op range [{}] - [{}] ".format(
+                            min_idx, max_idx + 1))
             start_idx += 1
 
         for i, (idx1, idx2) in enumerate(segments):
             logging.info("recompute segment[{}]".format(i))
-            logging.info("segment start op: [{}]: [{}] [{}]".format(self._ops[
-                idx1].desc.type(), self._ops[idx1].desc.input_arg_names(
-                ), self._ops[idx1].desc.output_arg_names()))
-            logging.info("segment end op: [{}]: [{}] [{}]".format(self._ops[
-                idx2 - 1].desc.type(), self._ops[idx2 - 1].desc.input_arg_names(
-                ), self._ops[idx2 - 1].desc.output_arg_names()))
+            logging.info("segment start op: [{}]: [{}] [{}]".format(
+                self._ops[idx1].desc.type(),
+                self._ops[idx1].desc.input_arg_names(),
+                self._ops[idx1].desc.output_arg_names()))
+            logging.info("segment end op: [{}]: [{}] [{}]".format(
+                self._ops[idx2 - 1].desc.type(),
+                self._ops[idx2 - 1].desc.input_arg_names(),
+                self._ops[idx2 - 1].desc.output_arg_names()))
 
         return segments
 
@@ -125,8 +129,9 @@ def modify_forward_desc_for_recompute(self, dist_context):
             # set new seed_var's dist_attr
             ref_dims_mapping = [-1]
             ref_process_mesh = cur_op_dist_attr.process_mesh
-            seed_var_dist_attr = set_var_dist_attr(
-                dist_context, seed_var, ref_dims_mapping, ref_process_mesh)
+            seed_var_dist_attr = set_var_dist_attr(dist_context, seed_var,
+                                                   ref_dims_mapping,
+                                                   ref_process_mesh)
 
             seed = 0 if cur_op.attr("fix_seed") is False else int(
                 cur_op.attr("seed"))
@@ -135,8 +140,10 @@ def modify_forward_desc_for_recompute(self, dist_context):
                 type="seed",
                 inputs={},
                 outputs={"Out": seed_var},
-                attrs={"seed": seed,
-                       "force_cpu": True})
+                attrs={
+                    "seed": seed,
+                    "force_cpu": True
+                })
             # set new seed op's dist_attr
             naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
                 seed_op, ref_process_mesh, ref_dims_mapping, dist_context)
@@ -209,6 +216,7 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars,
 
 @register_pass("auto_parallel_recompute")
 class RecomputePass(PassBase):
+
     def __init__(self):
         super(RecomputePass, self).__init__()
         self.set_attr("checkpoints", None)
@@ -254,9 +262,10 @@ def _apply_single_impl(self, main_programs, startup_programs, context):
             vars_should_be_hold.extend(
                 rc_state.get_out_of_subgraph_vars(segment[0], segment[1]))
         cross_vars = set(vars_should_be_hold) - set(checkpoints)
-        logging.info("found [{}] vars which cross recompute segment: [{}],"
-                     "better checkpoints might be set to reduce those vars".
-                     format(len(cross_vars), cross_vars))
+        logging.info(
+            "found [{}] vars which cross recompute segment: [{}],"
+            "better checkpoints might be set to reduce those vars".format(
+                len(cross_vars), cross_vars))
         vars_should_be_hold.extend(rc_state.get_reserved_vars())
         vars_should_be_hold.extend(rc_state.get_input_nodes())
         vars_should_be_hold = list(set(vars_should_be_hold))
@@ -304,15 +313,16 @@ def _apply_single_impl(self, main_programs, startup_programs, context):
                         set_var_dist_attr(self._dist_context, rc_var,
                                           ref_dims_mapping, ref_process_mesh)
             # get recomputed segment's descs
-            segment_descs = _add_needed_descs_to_block(
-                fwd_ops, buffer_block, main_block, vars_in_memory,
-                self._dist_context)
+            segment_descs = _add_needed_descs_to_block(fwd_ops, buffer_block,
+                                                       main_block,
+                                                       vars_in_memory,
+                                                       self._dist_context)
             # rename recomputed ops' input and output var name
             for key in var_name_dict:
                 _rename_arg_(segment_descs, key, var_name_dict[key])
 
             # NOTE: one forward op could be correspond to multiple xxx_grad op.
-            # When traversing all grad_ops in reverse, need to set a flag to indicate 
+            # When traversing all grad_ops in reverse, need to set a flag to indicate
             # whether the ckpt and its segment_descs can be used.
             ckpt_op = op_path[segment[1] - 1]
             ckpt_ops_dict[ckpt_op.desc.original_id()] = [True, segment_descs]
diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py
index 7729d1c2bd0d1..3c1f0443e03dc 100644
--- a/python/paddle/distributed/passes/auto_parallel_sharding.py
+++ b/python/paddle/distributed/passes/auto_parallel_sharding.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -38,9 +38,10 @@
 # NOTE we add the "auto_parallel" prefix to the pass in order to
 # indicate that this pass should obey some constrains by auto_parallel
 # for example all ops and vars should has dist attr before and after pass
-# should use dist op instead of custom comm op 
+# should use dist op instead of custom comm op
 @register_pass("auto_parallel_sharding")
 class ShardingPass(PassBase):
+
     def __init__(self):
         super(ShardingPass, self).__init__()
         self.set_attr("dist_context", None)
@@ -101,12 +102,12 @@ def _collective_data_parallel_groups(self, main_block):
             if group is not None:
                 self.dp_groups.add(group)
 
-        # TODO(JZ-LIANG) allow more than one dp groups in network, support more general distribution 
+        # TODO(JZ-LIANG) allow more than one dp groups in network, support more general distribution
         # genetated by auto search
         if len(self.dp_groups) != 1:
             raise NotImplementedError(
-                "So far Only and Exactly one data parallel group in network are supported, but got [{}] different data parallel groups".
-                format(len(self.dp_groups)))
+                "So far Only and Exactly one data parallel group in network are supported, but got [{}] different data parallel groups"
+                .format(len(self.dp_groups)))
 
     def _build_sharding_infos(self, params_grads):
 
@@ -123,7 +124,7 @@ def _build_sharding_infos(self, params_grads):
             ) >= self.sharding_world_size, "number of parameters [{}] is not enough to be shard among [{}] ranks".format(
                 len(params_grads), self.sharding_world_size)
 
-            # sharding hybrid data parallel: partial sharding param within 
+            # sharding hybrid data parallel: partial sharding param within
             if dp_group.nranks > self.sharding_world_size:
                 self.partial_sharding = True
                 assert len(
@@ -138,8 +139,8 @@ def _build_sharding_infos(self, params_grads):
 
             # TODO(JZ-LIANG) when support multiple dp groups in future, should group param and bind them to corresponding dp group
             params_in_group = [p for p, g in params_grads]
-            assert len(params_in_group) == len(set(
-                params_in_group)), "found duplicated param in params_grads"
+            assert len(params_in_group) == len(
+                set(params_in_group)), "found duplicated param in params_grads"
             sharding_info = ShardingInfo(sharding_group, self.global_rank,
                                          params_in_group)
             self.sharding_infos.append(sharding_info)
@@ -307,16 +308,20 @@ def _insert_optimizer_broadcasts(self, main_block, startup_block):
                 assert main_block.has_var(param.name)
                 assert startup_block.has_var(param.name)
 
-                new_op = main_block.append_op(
-                    type='c_broadcast',
-                    inputs={'X': param},
-                    outputs={'Out': param},
-                    attrs={
-                        'ring_id': sharding_info.group.id,
-                        'root': sharding_info.get_var_rank(param.name),
-                        'use_calc_stream': True,
-                        OP_ROLE_KEY: OpRole.Optimize
-                    })
+                new_op = main_block.append_op(type='c_broadcast',
+                                              inputs={'X': param},
+                                              outputs={'Out': param},
+                                              attrs={
+                                                  'ring_id':
+                                                  sharding_info.group.id,
+                                                  'root':
+                                                  sharding_info.get_var_rank(
+                                                      param.name),
+                                                  'use_calc_stream':
+                                                  True,
+                                                  OP_ROLE_KEY:
+                                                  OpRole.Optimize
+                                              })
                 param_dist_attr = self._dist_context.get_tensor_dist_attr_for_program(
                     param)
                 assert param_dist_attr is not None
@@ -341,9 +346,10 @@ def _shard_gradient_synchronization(self, main_block):
                 input_name = op.input_arg_names[0]
                 base_name = _get_base_name_from_grad_name(input_name)
                 sharding_info = self.varname_to_sharding_info[base_name]
-                _insert_reduce_op(
-                    main_block, idx, input_name, sharding_info.group.id,
-                    sharding_info.get_var_rank(base_name), self._dist_context)
+                _insert_reduce_op(main_block, idx, input_name,
+                                  sharding_info.group.id,
+                                  sharding_info.get_var_rank(base_name),
+                                  self._dist_context)
                 if not self.partial_sharding:
                     main_block._remove_op(idx + 1, sync=False)
                 else:
@@ -382,11 +388,10 @@ def _shard_parameter(self, main_block, startup_block):
                         broadcast_varname = unique_name.generate(input_name +
                                                                  "@BroadCast")
                         input_var = main_block.var(input_name)
-                        new_var = main_block.create_var(
-                            name=broadcast_varname,
-                            shape=input_var.shape,
-                            dtype=input_var.dtype,
-                            persistable=False)
+                        new_var = main_block.create_var(name=broadcast_varname,
+                                                        shape=input_var.shape,
+                                                        dtype=input_var.dtype,
+                                                        persistable=False)
                         ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program(
                             input_var)
                         out_var_dist_attr = set_var_dist_attr(
@@ -395,11 +400,13 @@ def _shard_parameter(self, main_block, startup_block):
                             ref_dist_attr.process_mesh)
                         op._rename_input(input_name, broadcast_varname)
 
-                    _insert_init_and_broadcast_op(
-                        main_block, idx, broadcast_varname,
-                        sharding_info.local_rank, root_rank,
-                        sharding_info.group.id,
-                        op.attr('op_role'), self._dist_context)
+                    _insert_init_and_broadcast_op(main_block, idx,
+                                                  broadcast_varname,
+                                                  sharding_info.local_rank,
+                                                  root_rank,
+                                                  sharding_info.group.id,
+                                                  op.attr('op_role'),
+                                                  self._dist_context)
 
             for idx, op in reversed(list(enumerate(main_block.ops))):
                 if op.type != "cast":
@@ -446,17 +453,16 @@ def _insert_init_and_broadcast_op(block, insert_idx, varname, local_rank,
     broadcast_var_dist_attr = dist_context.get_tensor_dist_attr_for_program(
         broadcast_var)
 
-    new_op = block._insert_op_without_sync(
-        insert_idx,
-        type='c_broadcast',
-        inputs={'X': varname},
-        outputs={'Out': varname},
-        attrs={
-            'ring_id': ring_id,
-            'root': root_rank,
-            'use_calc_stream': True,
-            OP_ROLE_KEY: op_role
-        })
+    new_op = block._insert_op_without_sync(insert_idx,
+                                           type='c_broadcast',
+                                           inputs={'X': varname},
+                                           outputs={'Out': varname},
+                                           attrs={
+                                               'ring_id': ring_id,
+                                               'root': root_rank,
+                                               'use_calc_stream': True,
+                                               OP_ROLE_KEY: op_role
+                                           })
     naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
         new_op, broadcast_var_dist_attr.process_mesh,
         broadcast_var_dist_attr.dims_mapping, dist_context)
@@ -487,17 +493,17 @@ def _insert_reduce_op(block,
                       use_calc_stream=True):
     assert root_id >= 0, "root id should be a positive int, but now root id is {}".format(
         root_id)
-    new_op = block._insert_op_without_sync(
-        insert_idx,
-        type='c_reduce_sum',
-        inputs={'X': [reduce_var]},
-        outputs={'Out': [reduce_var]},
-        attrs={
-            'ring_id': ring_id,
-            'root_id': root_id,
-            'use_calc_stream': use_calc_stream,
-            OP_ROLE_KEY: op_role
-        })
+    new_op = block._insert_op_without_sync(insert_idx,
+                                           type='c_reduce_sum',
+                                           inputs={'X': [reduce_var]},
+                                           outputs={'Out': [reduce_var]},
+                                           attrs={
+                                               'ring_id': ring_id,
+                                               'root_id': root_id,
+                                               'use_calc_stream':
+                                               use_calc_stream,
+                                               OP_ROLE_KEY: op_role
+                                           })
 
     dist_attr = dist_context.get_tensor_dist_attr_for_program(
         block.var(reduce_var))
@@ -641,6 +647,7 @@ def shard_parameters(params, group_size):
 
 
 class ShardingInfo(object):
+
     def __init__(self, group, rank, params):
         self.group = group
         self.params = params
diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py
index 72525255b7eaa..1d99a93624fc3 100644
--- a/python/paddle/distributed/passes/cpp_pass.py
+++ b/python/paddle/distributed/passes/cpp_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,6 +18,7 @@
 
 @register_pass("fuse_elewise_add_act")
 class FuseElementwiseAddActPass(CPPPassWrapper):
+
     def __init__(self):
         super(FuseElementwiseAddActPass, self).__init__()
 
@@ -31,6 +32,7 @@ def _type(self):
 
 @register_pass("fuse_bn_act")
 class FuseBatchNormActPass(CPPPassWrapper):
+
     def __init__(self):
         super(FuseBatchNormActPass, self).__init__()
 
@@ -44,6 +46,7 @@ def _type(self):
 
 @register_pass("fuse_bn_add_act")
 class FuseBatchNormAddActPass(CPPPassWrapper):
+
     def __init__(self):
         super(FuseBatchNormAddActPass, self).__init__()
 
@@ -57,6 +60,7 @@ def _type(self):
 
 @register_pass("fuse_relu_depthwise_conv")
 class FuseReluDepthwiseConvPass(CPPPassWrapper):
+
     def __init__(self):
         super(FuseReluDepthwiseConvPass, self).__init__()
 
@@ -70,6 +74,7 @@ def _type(self):
 
 @register_pass("fuse_optimizer")
 class FuseOptimizerPass(CPPPassWrapper):
+
     def __init__(self):
         super(FuseOptimizerPass, self).__init__()
 
@@ -85,6 +90,7 @@ def _type(self):
 
 @register_pass("inplace_addto_op")
 class InplaceAddtoOpPass(CPPPassWrapper):
+
     def __init__(self):
         super(InplaceAddtoOpPass, self).__init__()
 
@@ -98,6 +104,7 @@ def _type(self):
 
 @register_pass("build_cinn")
 class BuildCINNPass(CPPPassWrapper):
+
     def __init__(self):
         super(BuildCINNPass, self).__init__()
         self.set_attr("allow_ops", [])
diff --git a/python/paddle/distributed/passes/fuse_all_reduce.py b/python/paddle/distributed/passes/fuse_all_reduce.py
index 317a66c008a81..33a58a67c9d16 100644
--- a/python/paddle/distributed/passes/fuse_all_reduce.py
+++ b/python/paddle/distributed/passes/fuse_all_reduce.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -44,12 +44,12 @@ def find_adjacent_match_sequences(iterable,
 
 def insert_fuse_all_reduce_ops(block, reversed_op_indices, input_var_names,
                                output_var_names, dtype, attrs):
-    fused_var = block.create_var(
-        name=unique_name.generate("FusedOutput_{}".format(input_var_names[0])),
-        dtype=dtype)
+    fused_var = block.create_var(name=unique_name.generate(
+        "FusedOutput_{}".format(input_var_names[0])),
+                                 dtype=dtype)
 
-    # FIXME(zengjinle): here we assume that we use 
-    # c_sync_calc_stream/c_sync_comm_stream to do sync. 
+    # FIXME(zengjinle): here we assume that we use
+    # c_sync_calc_stream/c_sync_comm_stream to do sync.
     # But someone may use c_wait_compute/c_wait_comm instead.
     if not attrs["use_calc_stream"]:
         ring_id = attrs["ring_id"]
@@ -103,21 +103,21 @@ def insert_fuse_all_reduce_ops(block, reversed_op_indices, input_var_names,
     }
 
     if not attrs["use_calc_stream"]:
-        block._insert_op_without_sync(
-            insert_idx,
-            type="c_sync_calc_stream",
-            inputs={"X": fused_var},
-            outputs={"Out": fused_var,
-                     op_role_key: attrs[op_role_key]})
+        block._insert_op_without_sync(insert_idx,
+                                      type="c_sync_calc_stream",
+                                      inputs={"X": fused_var},
+                                      outputs={
+                                          "Out": fused_var,
+                                          op_role_key: attrs[op_role_key]
+                                      })
         insert_idx += 1
 
-    # c_allreduce_sum should insert  
-    block._insert_op_without_sync(
-        insert_idx,
-        type="c_allreduce_sum",
-        inputs={"X": fused_var},
-        outputs={"Out": fused_var},
-        attrs=attrs)
+    # c_allreduce_sum should insert
+    block._insert_op_without_sync(insert_idx,
+                                  type="c_allreduce_sum",
+                                  inputs={"X": fused_var},
+                                  outputs={"Out": fused_var},
+                                  attrs=attrs)
 
     for op_idx in reversed_op_indices:
         block._remove_op(op_idx)
@@ -186,8 +186,9 @@ def is_same_adjacent_op(ref_op, new_op):
             return False
         return True
 
-    match_seqs = find_adjacent_match_sequences(
-        collective_ops, is_valid_allreduce_op, is_same_adjacent_op)
+    match_seqs = find_adjacent_match_sequences(collective_ops,
+                                               is_valid_allreduce_op,
+                                               is_same_adjacent_op)
     new_match_seqs = []
     for i, j in match_seqs:
         new_match_seqs.append([collective_op_indices[k] for k in range(i, j)])
@@ -330,6 +331,7 @@ def insert_fuse_all_reduce_by_memory_size(block, groups, max_memory_size):
 
 @register_pass("fuse_all_reduce")
 class FuseAllReducePass(PassBase):
+
     def __init__(self):
         super(FuseAllReducePass, self).__init__()
         self.set_attr("max_memory_size", -1)
@@ -344,11 +346,11 @@ def _check_conflict(self, other_pass):
     def _type(self):
         return PassType.COMM_OPT
 
-    # NOTE: why FuseAllReducePass can override apply_single_impl instead of 
-    # apply_impl? AllReduce is a collective operation, so the program of each 
-    # rank inside the same communication group should have the same 
-    # c_allreduce_sum operations. Therefore, FuseAllReducePass can override 
-    # apply_single_impl directly.  
+    # NOTE: why FuseAllReducePass can override apply_single_impl instead of
+    # apply_impl? AllReduce is a collective operation, so the program of each
+    # rank inside the same communication group should have the same
+    # c_allreduce_sum operations. Therefore, FuseAllReducePass can override
+    # apply_single_impl directly.
     def _apply_single_impl(self, main_program, startup_program, context):
         max_memory_size = self.get_attr("max_memory_size")
         op_deps = main_program.desc.get_op_deps()
@@ -356,8 +358,8 @@ def _apply_single_impl(self, main_program, startup_program, context):
         for i in range(num_blocks):
             block = main_program.block(i)
             groups = find_all_fuse_all_reduce_groups(block)
-            groups = split_fuse_all_reduce_groups_by_deps(block, groups,
-                                                          op_deps[i])
+            groups = split_fuse_all_reduce_groups_by_deps(
+                block, groups, op_deps[i])
             insert_fuse_all_reduce_by_memory_size(block, groups,
                                                   max_memory_size)
         main_program._sync_with_cpp()
diff --git a/python/paddle/distributed/passes/pass_base.py b/python/paddle/distributed/passes/pass_base.py
index 3afca5d63556f..b733f88669375 100644
--- a/python/paddle/distributed/passes/pass_base.py
+++ b/python/paddle/distributed/passes/pass_base.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class PassContext:
+
     def __init__(self):
         self._applied_passes = []
         self._attrs = {}
@@ -118,6 +119,7 @@ def _apply_single_impl(self, main_program, startup_program, context):
 
 
 def register_pass(name):
+
     def impl(cls):
         PassBase._register(name, cls)
         cls.name = name
@@ -136,6 +138,7 @@ def new_pass(name, pass_attrs={}):
 
 
 class CPPPassWrapper(PassBase):
+
     def __init__(self):
         super(CPPPassWrapper, self).__init__()
 
@@ -159,8 +162,8 @@ def _apply_single_impl(self, main_program, startup_program, context):
 
 
 def _fusion_opt_last_rule(pass_before, pass_after):
-    if pass_before._type() == PassType.FUSION_OPT and pass_after._type(
-    ) != PassType.FUSION_OPT:
+    if pass_before._type(
+    ) == PassType.FUSION_OPT and pass_after._type() != PassType.FUSION_OPT:
         return False
     else:
         return True
@@ -168,6 +171,7 @@ def _fusion_opt_last_rule(pass_before, pass_after):
 
 def _make_rule_from_white_lists_dict(before_white_lists_dict,
                                      after_white_lists_dict):
+
     def collect_pass_names(white_lists_dict, result):
         for k, v in white_lists_dict.items():
             result.add(k)
@@ -202,8 +206,8 @@ def rule(pass_before, pass_after):
     return rule
 
 
-# The key-value pair (k, [v1, v2, ..., vn]) means the pass k can be 
-# applied before any of pass [v1, v2, ..., vn] is applied 
+# The key-value pair (k, [v1, v2, ..., vn]) means the pass k can be
+# applied before any of pass [v1, v2, ..., vn] is applied
 PassBase._BEFORE_WHITE_LISTS_DICT = {
     "fuse_gradient_merge": ["fuse_all_reduce"],
     # Add more white lists here
@@ -212,7 +216,7 @@ def rule(pass_before, pass_after):
 # The key-value pair (k, [v1, v2, ..., vn]) means the pass k can be
 # applied after any of pass [v1, v2, ..., vn] is applied
 PassBase._AFTER_WHITE_LISTS_DICT = {
-    # Add more white lists here 
+    # Add more white lists here
 }
 
 PassBase._COMMON_RULES = [
@@ -292,6 +296,7 @@ def _solve_pass_conflict(passes, context):
 
 
 class PassManager:
+
     def __init__(self, passes, context=None, auto_solve_conflict=True):
         if context is None:
             context = PassContext()
diff --git a/python/paddle/distributed/passes/pass_utils.py b/python/paddle/distributed/passes/pass_utils.py
index bd1eddce3bbd3..6e43930d2e176 100644
--- a/python/paddle/distributed/passes/pass_utils.py
+++ b/python/paddle/distributed/passes/pass_utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,7 +27,7 @@ def list_to_ordered_dict(list_obj, ordered_dict=None):
 
 
 # The inputs of a program are the variables
-# that first occur as the input of the op. 
+# that first occur as the input of the op.
 def get_inputs_of_program(program):
     visited_vars = set()
     input_vars = []
diff --git a/python/paddle/distributed/passes/ps_server_pass.py b/python/paddle/distributed/passes/ps_server_pass.py
index 30f6542fa2574..0b77468338784 100755
--- a/python/paddle/distributed/passes/ps_server_pass.py
+++ b/python/paddle/distributed/passes/ps_server_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 @register_pass("add_lr_decay_table_pass")
 class AddLrDecayTablePass(PassBase):
+
     def __init__(self):
         super(AddLrDecayTablePass, self).__init__()
 
@@ -101,8 +102,8 @@ def _get_lr_sheduler_program(self, lr_sheduler, lr_decay_steps):
                     % lr_decay_steps)
         else:
             raise ValueError(
-                "Not supported current LearningRate strategy, please use follow decay strategy: {}".
-                format(schedler_decay))
+                "Not supported current LearningRate strategy, please use follow decay strategy: {}"
+                .format(schedler_decay))
 
         return decay_main_program, decay_startup_program, lr_name
 
@@ -125,6 +126,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("add_listen_and_serv_pass")
 class AddListenAndServPass(PassBase):
+
     def __init__(self):
         super(AddListenAndServPass, self).__init__()
 
@@ -152,12 +154,15 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
             "rpc_send_thread_num": -1,
             "rpc_prefetch_thread_num": -1
         }
-        main_program.global_block().append_op(
-            type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=opt)
+        main_program.global_block().append_op(type="listen_and_serv",
+                                              inputs={'X': []},
+                                              outputs={},
+                                              attrs=opt)
 
 
 @register_pass("add_rpc_global_flags_pass")
 class AddRpcGlobalFlagsPass(PassBase):
+
     def __init__(self):
         super(AddRpcGlobalFlagsPass, self).__init__()
 
@@ -173,6 +178,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("add_optimizer_pass")
 class AddOptimizerPass(PassBase):
+
     def __init__(self):
         super(AddOptimizerPass, self).__init__()
 
@@ -188,6 +194,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("add_geo_optimizer_pass")
 class AddGeoOptimizerPass(PassBase):
+
     def __init__(self):
         super(AddGeoOptimizerPass, self).__init__()
 
@@ -203,6 +210,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("build_pserver_startup_program_pass")
 class BuildPserverStartupProgramPass(PassBase):
+
     def __init__(self):
         super(BuildPserverStartupProgramPass, self).__init__()
 
@@ -218,6 +226,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("delete_unused_in_startup_pass")
 class DeleteUnusedInStartupPass(PassBase):
+
     def __init__(self):
         super(DeleteUnusedInStartupPass, self).__init__()
 
diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py
index 6112a9a1f45b6..4a015fea30a1d 100755
--- a/python/paddle/distributed/passes/ps_trainer_pass.py
+++ b/python/paddle/distributed/passes/ps_trainer_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,13 +17,16 @@
 import paddle.compat as cpt
 from ..ps.utils.public import *
 from paddle.framework import core
-from .pass_base import PassBase, register_pass
+from paddle.distributed.passes.pass_base import PassBase, register_pass
 from paddle.fluid.transpiler.details.program_utils import delete_ops
 from paddle.fluid.transpiler.collective import SingleProcessMultiThread
+from _collections import deque, defaultdict
+from paddle.fluid.framework import Program, Parameter
 
 
 @register_pass("append_send_ops_pass")
 class AppendSendOpsPass(PassBase):  # 该 pass 被多种模式复用
+
     def __init__(self):
         super(AppendSendOpsPass, self).__init__()
 
@@ -47,64 +50,64 @@ def _append_send_op(self, program, union_vars, queue, is_sparse, table_id,
         if ps_mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]:
             dummy_output = program.global_block().create_var(
                 name=framework.generate_control_dev_var_name())
-        logger.info("dummy_output: {}".format(dummy_output))
-        program.global_block().append_op(
-            type="send",
-            inputs={"X": send_input_vars},
-            outputs={"Out": dummy_output},
-            attrs={
-                "send_varnames": [queue],
-                "is_sparse": is_sparse,
-                "table_id": table_id,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        program.global_block().append_op(type="send",
+                                         inputs={"X": send_input_vars},
+                                         outputs={"Out": dummy_output},
+                                         attrs={
+                                             "send_varnames": [queue],
+                                             "is_sparse":
+                                             is_sparse,
+                                             "table_id":
+                                             table_id,
+                                             RPC_OP_ROLE_ATTR_NAME:
+                                             RPC_OP_ROLE_ATTR_VALUE
+                                         })
 
         return dummy_output
 
     def _append_barrier_op(self, program, dummys, trainer_id):
-        program.global_block().append_op(
-            type="send_barrier",
-            inputs={"X": dummys},
-            outputs={"Out": []},
-            attrs={
-                "trainer_id": trainer_id,
-                "half_async": True,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        program.global_block().append_op(type="send_barrier",
+                                         inputs={"X": dummys},
+                                         outputs={"Out": []},
+                                         attrs={
+                                             "trainer_id":
+                                             trainer_id,
+                                             "half_async":
+                                             True,
+                                             RPC_OP_ROLE_ATTR_NAME:
+                                             RPC_OP_ROLE_ATTR_VALUE
+                                         })
 
     def _apply_single_impl(self, main_program, startup_program, pass_ctx):
         attrs = pass_ctx._attrs
-        print("pass loss program id:", id(attrs['loss'].block.program))
-        print("pass main program id:", id(main_program))
         ps_mode = attrs['ps_mode']
         if ps_mode == DistributedMode.GEO:
             send_ctx = get_geo_trainer_send_context(attrs)  # geo 模式
+        elif attrs['is_heter_ps_mode'] == True:
+            print("is_heter_ps_mode in append_send_ops_pass!!")
+            send_ctx = get_the_one_send_context(attrs, split_dense_table=True)
         else:
             send_ctx = get_the_one_send_context(attrs)  # async、sync 等各种模式
-        logger.info("send_ctx: {}".format(send_ctx))
         dummys = []
         for merged_name, send in send_ctx.items():
             if send.is_sparse() and ps_mode != DistributedMode.GEO:
                 continue
             if send.program_id() != id(attrs['loss'].block.program):
                 continue
-            logger.info('merged_name, send: {}, {}'.format(merged_name, send))
             is_sparse = 1 if send.is_sparse() else 0
             is_sparse = 2 if send.is_distributed() else is_sparse
             dummys.append(
-                self._append_send_op(main_program,
-                                     send.origin_varnames(), merged_name,
-                                     is_sparse, send.table_id(), ps_mode))
-        logger.info('ps trainer pass - ps mode: {}'.format(ps_mode))
-        logger.info('dummys: {}'.format(dummys))
+                self._append_send_op(main_program, send.origin_varnames(),
+                                     merged_name, is_sparse, send.table_id(),
+                                     ps_mode))
         if ps_mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]:
-            logger.info('insert send_barrier_op')
             trainer_id = get_role_id(attrs['role_maker'])
             self._append_barrier_op(main_program, dummys, trainer_id)
 
 
 @register_pass("distributed_ops_pass")
 class DistributedOpsPass(PassBase):
+
     def __init__(self):
         super(DistributedOpsPass, self).__init__()
         self.w_2_table_id = {}
@@ -150,32 +153,30 @@ def _push_sparse_fuse(self, _program, push_sparse_ops, attrs, use_cvm_op):
                 dtype=core.VarDesc.VarType.INT64,
                 persistable=False,
                 stop_gradient=True)
-            _program.global_block()._insert_op(
-                index=0,
-                type='fill_constant',
-                inputs={},
-                outputs={'Out': show},
-                attrs={
-                    'shape': [1],
-                    'dtype': show.dtype,
-                    'value': 1,
-                })
+            _program.global_block()._insert_op(index=0,
+                                               type='fill_constant',
+                                               inputs={},
+                                               outputs={'Out': show},
+                                               attrs={
+                                                   'shape': [1],
+                                                   'dtype': show.dtype,
+                                                   'value': 1,
+                                               })
 
             clk = _program.global_block().create_var(
                 name="clk",
                 dtype=core.VarDesc.VarType.INT64,
                 persistable=False,
                 stop_gradient=True)
-            _program.global_block()._insert_op(
-                index=0,
-                type='fill_constant',
-                inputs={},
-                outputs={'Out': clk},
-                attrs={
-                    'shape': [1],
-                    'dtype': clk.dtype,
-                    'value': 0,
-                })
+            _program.global_block()._insert_op(index=0,
+                                               type='fill_constant',
+                                               inputs={},
+                                               outputs={'Out': clk},
+                                               attrs={
+                                                   'shape': [1],
+                                                   'dtype': clk.dtype,
+                                                   'value': 0,
+                                               })
 
         for param, ops in push_sparse_ops.items():
             all_ops = _program.global_block().ops
@@ -197,25 +198,26 @@ def _push_sparse_fuse(self, _program, push_sparse_ops, attrs, use_cvm_op):
             for idx in op_idxs[::-1]:
                 _program.global_block()._remove_op(idx)
 
-            _program.global_block().append_op(
-                type="distributed_push_sparse",
-                inputs={
-                    "Ids": inputs,
-                    'W': w,
-                    "Outputs": outputs,
-                    "Shows": show,
-                    "Clicks": clk
-                },
-                outputs={"Outputs": outputs},
-                attrs={
-                    "is_distributed": is_distributed,
-                    "padding_idx": padding_idx,
-                    "table_id": table_id,
-                    "size": self.emb_size[param],
-                    "use_cvm_op": use_cvm_op
-                })
+            _program.global_block().append_op(type="distributed_push_sparse",
+                                              inputs={
+                                                  "Ids": inputs,
+                                                  'W': w,
+                                                  "Outputs": outputs,
+                                                  "Shows": show,
+                                                  "Clicks": clk
+                                              },
+                                              outputs={"Outputs": outputs},
+                                              attrs={
+                                                  "is_distributed":
+                                                  is_distributed,
+                                                  "padding_idx": padding_idx,
+                                                  "table_id": table_id,
+                                                  "size": self.emb_size[param],
+                                                  "use_cvm_op": use_cvm_op
+                                              })
 
     def _pull_sparse_fuse(self, _program, pull_sparse_ops, attrs, send_ctx):
+
         def dag_check_up_and_reorder(program, inputs, outputs):
             global_block = program.global_block()
             min_output_index = len(global_block.ops)
@@ -376,8 +378,10 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                     _program.global_block()._insert_op(
                         index=distributed_idx,
                         type="pull_gpups_sparse",
-                        inputs={"Ids": inputs,
-                                'W': w},
+                        inputs={
+                            "Ids": inputs,
+                            'W': w
+                        },
                         outputs={"Out": outputs},
                         attrs={
                             "size": [w.shape[1] for i in inputs],
@@ -388,8 +392,10 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                     _program.global_block()._insert_op(
                         index=distributed_idx,
                         type="distributed_lookup_table",
-                        inputs={"Ids": inputs,
-                                'W': w},
+                        inputs={
+                            "Ids": inputs,
+                            'W': w
+                        },
                         outputs={"Outputs": outputs},
                         attrs={
                             "is_distributed": is_distributed,
@@ -405,8 +411,10 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                     _program.global_block()._insert_op(
                         index=distributed_idx,
                         type="distributed_lookup_table",
-                        inputs={"Ids": [inputs[i]],
-                                'W': w},
+                        inputs={
+                            "Ids": [inputs[i]],
+                            'W': w
+                        },
                         outputs={"Outputs": [outputs[i]]},
                         attrs={
                             "is_distributed": is_distributed,
@@ -441,8 +449,8 @@ def _get_pull_sparse_ops(self, _program, attrs):
         for op in _program.global_block().ops:
             if op.type in SPARSE_GRAD_OP_TYPE_DICT.keys():
                 param_name = op.input(SPARSE_GRAD_OP_TYPE_DICT[op.type])[0]
-                if param_name in pull_sparse_ids and op.input("Ids")[
-                        0] in pull_sparse_ids[param_name]:
+                if param_name in pull_sparse_ids and op.input(
+                        "Ids")[0] in pull_sparse_ids[param_name]:
                     ops = push_sparse_ops.get(param_name, [])
                     ops.append(op)
                     push_sparse_ops[param_name] = ops
@@ -453,6 +461,8 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
         attrs = pass_ctx._attrs
         pull_sparse_ops, push_sparse_ops, use_cvm_op = self._get_pull_sparse_ops(
             main_program, attrs)
+        print("is_heter_ps_mode in distributed_ops_pass {}?".format(
+            attrs['is_heter_ps_mode']))
         send_ctx = get_the_one_send_context(
             attrs, split_dense_table=attrs['is_heter_ps_mode'])
         self._pull_sparse_fuse(main_program, pull_sparse_ops, attrs, send_ctx)
@@ -461,6 +471,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("delete_optimizer_pass")
 class DeleteOptimizesPass(PassBase):
+
     def __init__(self):
         super(DeleteOptimizesPass, self).__init__()
 
@@ -494,18 +505,16 @@ def _delete_optimizer_op_and_vars(self, _program, optimize_ops):
 
     def _add_lr_var(self, main_program, attrs):
         # Todo: hard code for pe
-        lr_var = attrs['origin_main_program'].global_block().vars[
-            "learning_rate_0"]
-        main_program.global_block().create_var(
-            name=lr_var.name,
-            shape=lr_var.shape,
-            dtype=lr_var.dtype,
-            type=lr_var.type,
-            lod_level=lr_var.lod_level,
-            persistable=True)
+        lr_var = attrs['origin_main_program'].global_block(
+        ).vars["learning_rate_0"]
+        main_program.global_block().create_var(name=lr_var.name,
+                                               shape=lr_var.shape,
+                                               dtype=lr_var.dtype,
+                                               type=lr_var.type,
+                                               lod_level=lr_var.lod_level,
+                                               persistable=True)
 
     def _apply_single_impl(self, main_program, startup_program, pass_ctx):
-        print("delete_optimizer_pass")
         attrs = pass_ctx._attrs
         optimizer_ops = get_optimize_ops(main_program)
         lr_ops = get_lr_ops(main_program)
@@ -518,6 +527,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("delete_extra_optimizer_pass")
 class DeleteExtraOptimizerPass(PassBase):
+
     def __init__(self):
         super(DeleteExtraOptimizerPass, self).__init__()
 
@@ -560,6 +570,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("fake_init_ops_pass")
 class FakeInitOpsPass(PassBase):
+
     def __init__(self):
         super(FakeInitOpsPass, self).__init__()
 
@@ -586,8 +597,8 @@ def _fake_init_sparsetable(self, program, sparse_table_names):
                     table_param_init_op.append(op)
             init_op_num = len(table_param_init_op)
             if init_op_num != 1:
-                raise ValueError("table init op num should be 1, now is " + str(
-                    init_op_num))
+                raise ValueError("table init op num should be 1, now is " +
+                                 str(init_op_num))
             table_init_op = table_param_init_op[0]
             program.global_block().append_op(
                 type="fake_init",
@@ -604,6 +615,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("ps_gpu_pass")
 class PsGpuPass(PassBase):
+
     def __init__(self):
         super(PsGpuPass, self).__init__()
 
@@ -628,8 +640,8 @@ def _add_push_box_sparse_op(self, program):
                     insert_index + 1)
                 new_op_desc.copy_from(op_desc)
                 new_op_desc._set_attr(op_role_attr_name, backward)
-                new_op = paddle.fluid.framework.Operator(program.global_block(),
-                                                         new_op_desc)
+                new_op = paddle.fluid.framework.Operator(
+                    program.global_block(), new_op_desc)
                 program.global_block().ops.insert(insert_index + 1, new_op)
                 program.global_block()._sync_with_cpp()
 
@@ -703,6 +715,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("ps_transpile_pass")
 class PsTranspilePass(PassBase):
+
     def __init__(self):
         super(PsTranspilePass, self).__init__()
 
@@ -716,17 +729,17 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
         attrs = pass_ctx._attrs
         t = SingleProcessMultiThread()
         env = get_dist_env()
-        t.transpile(
-            startup_program=startup_program,
-            main_program=main_program,
-            rank=env["trainer_id"],
-            endpoints=env["trainer_endpoints"],
-            current_endpoint=env['current_endpoint'],
-            wait_port=False)
+        t.transpile(startup_program=startup_program,
+                    main_program=main_program,
+                    rank=env["trainer_id"],
+                    endpoints=env["trainer_endpoints"],
+                    current_endpoint=env['current_endpoint'],
+                    wait_port=False)
 
 
 @register_pass("split_heter_worker_ops_pass")
 class SplitHeterWorkerOpsPass(PassBase):
+
     def __init__(self):
         super(SplitHeterWorkerOpsPass, self).__init__()
 
@@ -764,10 +777,10 @@ def _create_heter_program(self, program, attrs, heter_program,
         current_device = role_maker._heter_device_type().lower()
         stage_id = int(role_maker._get_stage_id())
 
-        heter_block_ops_forward = program_block_ops_list[stage_id - 1][
-            "forward"]
-        heter_block_ops_backward = program_block_ops_list[stage_id - 1][
-            "backward"]
+        heter_block_ops_forward = program_block_ops_list[stage_id -
+                                                         1]["forward"]
+        heter_block_ops_backward = program_block_ops_list[stage_id -
+                                                          1]["backward"]
 
         heter_block = heter_program._create_block(pre_block_idx)
         optimizer_block.append(heter_block)
@@ -789,15 +802,17 @@ def _create_heter_program(self, program, attrs, heter_program,
             for _, op in enumerate(heter_block_ops_backward):
                 block_append_op(heter_program, program, heter_block_bp, op)
 
-            bp_entrance_vars = block_var_detail[stage_id - 1]["backward"][
-                "entrance"]
+            bp_entrance_vars = block_var_detail[stage_id -
+                                                1]["backward"]["entrance"]
             add_vars_by_var_list(bp_entrance_vars, program, heter_program,
                                  heter_block_bp)
             bp_exit_vars = block_var_detail[stage_id - 1]["backward"]["exit"]
             add_vars_by_var_list(bp_exit_vars, program, heter_program,
                                  heter_block_bp)
-            backward_comm_info = get_communicate_var_info(
-                program, stage_id, bp_entrance_vars, type="backward")
+            backward_comm_info = get_communicate_var_info(program,
+                                                          stage_id,
+                                                          bp_entrance_vars,
+                                                          type="backward")
 
             grad_to_block_id.append(backward_comm_info["block_input_var_name"] +
                                     ":" + str(heter_block_bp.idx))
@@ -806,8 +821,8 @@ def _create_heter_program(self, program, attrs, heter_program,
             for _, op in enumerate(heter_block_ops_backward):
                 block_append_op(heter_program, program, heter_block, op)
 
-            bp_entrance_vars = block_var_detail[stage_id - 1]["backward"][
-                "entrance"]
+            bp_entrance_vars = block_var_detail[stage_id -
+                                                1]["backward"]["entrance"]
             add_vars_by_var_list(bp_entrance_vars, program, heter_program,
                                  heter_block)
             bp_exit_vars = block_var_detail[stage_id - 1]["backward"]["exit"]
@@ -816,11 +831,13 @@ def _create_heter_program(self, program, attrs, heter_program,
 
             heter_block_bp = heter_block
 
-        forward_comm_info = get_communicate_var_info(
-            program, stage_id, entrance_vars, type="forward")
+        forward_comm_info = get_communicate_var_info(program,
+                                                     stage_id,
+                                                     entrance_vars,
+                                                     type="forward")
 
-        grad_to_block_id.append(forward_comm_info["block_input_var_name"] + ":"
-                                + str(heter_block.idx))
+        grad_to_block_id.append(forward_comm_info["block_input_var_name"] +
+                                ":" + str(heter_block.idx))
 
         first_op_index_bp = len(heter_block_bp.ops)
 
@@ -828,14 +845,16 @@ def _create_heter_program(self, program, attrs, heter_program,
             static_var = insert_communicate_op(program, role_maker, heter_block,
                                                stage_id, first_op_index_fp,
                                                block_var_detail, current_device)
-        static_var_bp = insert_communicate_op(
-            program, role_maker, heter_block_bp, stage_id, first_op_index_bp,
-            block_var_detail, current_device, False)
+        static_var_bp = insert_communicate_op(program, role_maker,
+                                              heter_block_bp, stage_id,
+                                              first_op_index_bp,
+                                              block_var_detail, current_device,
+                                              False)
 
         # add send op
-        send_grad_var_list = add_heter_send_op(program, heter_program,
-                                               heter_block_bp,
-                                               block_var_detail[stage_id - 1])
+        send_grad_var_list = add_send_op(
+            program, heter_block_bp,
+            block_var_detail[stage_id - 1]["backward"]["persistables"])
 
         # add step conter
         send_input_vars = []
@@ -853,11 +872,10 @@ def _create_heter_program(self, program, attrs, heter_program,
             RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
         }
         # append the listen_and_serv op
-        heter_program.global_block().append_op(
-            type="heter_listen_and_serv",
-            inputs={'X': []},
-            outputs={},
-            attrs=attrs)
+        heter_program.global_block().append_op(type="heter_listen_and_serv",
+                                               inputs={'X': []},
+                                               outputs={},
+                                               attrs=attrs)
         # TODO check heter program
 
     def _apply_single_impl(self, main_program, startup_program, pass_ctx):
@@ -890,6 +908,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("split_trainer_ops_pass")
 class SplitTrainerOpsPass(PassBase):
+
     def __init__(self):
         super(SplitTrainerOpsPass, self).__init__()
 
@@ -909,7 +928,7 @@ def _replace_ops_by_communicate_op(self, program, attrs, heter_block_index,
                 first_op_idx = all_op.index(op)
                 break
         assert first_op_idx != -1
-        self._delete_same_ops(program.global_block(), ops_list)
+        delete_same_ops(program.global_block(), ops_list)
 
         entrance_var = []
         role_maker = attrs['role_maker']
@@ -939,17 +958,6 @@ def _replace_ops_by_communicate_op(self, program, attrs, heter_block_index,
 
         return entrance_var
 
-    def _delete_same_ops(self, block, ops):
-        for op in ops:
-            try:
-                for origin_op in block.ops:
-                    if str(origin_op) == str(op):
-                        idx = list(block.ops).index(origin_op)
-                        block._remove_op(idx)
-                        break
-            except Exception as e:
-                print(e)
-
     def _remove_var_pair_by_grad(self, var_name, attrs):
         for index, pair in enumerate(attrs['merged_variables_pairs']):
             var = pair[0]
@@ -1019,17 +1027,19 @@ def _create_trainer_program(self, program, origin_program, attrs,
         grad_to_block_id = []
 
         bp_ops_list = program_block_ops_list[0]["backward"]
-        self._delete_same_ops(program.global_block(), bp_ops_list)
+        delete_same_ops(program.global_block(), bp_ops_list)
         delete_trainer_useless_var(program, static_var)
         backward_block = create_backward_block(program, origin_program,
                                                bp_ops_list, block_var_detail)
 
         bp_entrance_vars = block_var_detail[0]["backward"]["entrance"]
-        backward_comm_info = get_communicate_var_info(
-            origin_program, 1, bp_entrance_vars, type="backward")
+        backward_comm_info = get_communicate_var_info(origin_program,
+                                                      1,
+                                                      bp_entrance_vars,
+                                                      type="backward")
 
-        grad_to_block_id.append(backward_comm_info["block_input_var_name"] + ":"
-                                + str(backward_block.idx))
+        grad_to_block_id.append(backward_comm_info["block_input_var_name"] +
+                                ":" + str(backward_block.idx))
         optimizer_block.append(backward_block)
         role_maker = attrs['role_maker']
         attrs = {
@@ -1045,12 +1055,11 @@ def _create_trainer_program(self, program, origin_program, attrs,
             RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
         }
         # append the listen_and_serv op
-        program.global_block()._insert_op(
-            index=0,
-            type="heter_listen_and_serv",
-            inputs={'X': []},
-            outputs={},
-            attrs=attrs)
+        program.global_block()._insert_op(index=0,
+                                          type="heter_listen_and_serv",
+                                          inputs={'X': []},
+                                          outputs={},
+                                          attrs=attrs)
 
         ## TODO add check for bp block
         #check_op_device(program.global_block(), DEFAULT_DEVICE)
@@ -1078,6 +1087,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
 
 @register_pass("set_heter_pipeline_opt_pass")
 class SetHeterPipelineOptPass(PassBase):
+
     def __init__(self):
         super(SetHeterPipelineOptPass, self).__init__()
 
@@ -1093,12 +1103,13 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
         num_microbatches = attrs['user_defined_strategy'].pipeline_configs[
             'accumulate_steps']
 
-        attrs['origin_startup_program']._heter_pipeline_opt = {
+        startup_program._heter_pipeline_opt = {
             "startup_program": startup_program,
             "pipeline_stage": int(role_maker._get_stage_id()) - 1,
             "heter_place": role_maker._heter_device(),
+            "is_fl_mode": 1
         }
-        attrs['origin_main_program']._heter_pipeline_opt = {
+        main_program._heter_pipeline_opt = {
             "trainer": "HeterPipelineTrainer",
             "device_worker": "HeterSection",
             "trainers":
@@ -1109,4 +1120,312 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx):
             "section_program": main_program,
             "num_microbatches": num_microbatches,
             "heter_place": role_maker._heter_device(),
+            "is_fl_mode": 1
         }
+
+
+@register_pass("split_fl_ops_pass")
+class SplitFlOpsPass(PassBase):
+
+    def __init__(self):
+        super(SplitFlOpsPass, self).__init__()
+        self.PART_A_DEVICE_FlAG = 'gpu:0'
+        self.PART_A_JOINT_OP_DEVICE_FlAG = 'gpu:2'
+        self.PART_B_DEVICE_FlAG = 'gpu:1'
+        self.PART_B_JOINT_OP_DEVICE_FlAG = 'gpu:3'
+
+    def _check_self(self):
+        return True
+
+    def _check_conflict(self, other_pass):
+        return True
+
+    def _insert_encrypt_op(self):
+        pass
+
+    def _insert_decrypt_op(self):
+        pass
+
+    def _clear_op_device_flag(self, program):
+        # Reset OP_DEVICE_KEY to '' on every op of `program` where it is non-empty.
+        for block in program.blocks:
+            for op in (o for o in block.ops if o.attr(OP_DEVICE_KEY) != ''):
+                op._set_attr(OP_DEVICE_KEY, '')
+
+    def _split_fl_program(self):
+        # Copy block 0's ops into Programs 'a'/'b' chosen by their device flag.
+        # NOTE(review): a flag matching neither party reuses the previous `program`.
+        part_a_flags = (self.PART_A_DEVICE_FlAG, '',
+                        self.PART_A_JOINT_OP_DEVICE_FlAG)
+        part_b_flags = (self.PART_B_DEVICE_FlAG,
+                        self.PART_B_JOINT_OP_DEVICE_FlAG)
+        self.partA_ops, self.partB_ops = [], []
+        party_program_map = defaultdict(Program)
+        for op in self.ori_main_program.block(0).ops:
+            device = op.attr(OP_DEVICE_KEY)
+            if device in part_a_flags:
+                program = party_program_map['a']
+                self.partA_ops.append(op)
+            elif device in part_b_flags:
+                program = party_program_map['b']
+                self.partB_ops.append(op)
+            ap_op = program.global_block().desc.append_op()
+            ap_op.copy_from(op.desc)
+            ap_op._set_attr(OP_DEVICE_KEY, device)
+        for key in ('a', 'b'):
+            party_program_map[key]._sync_with_cpp()
+        return party_program_map
+
+    def _insert_partA_communicate_op(self, block, idx):
+        # Insert a 'send_and_recv' op at index `idx` of `block`, taking
+        # self.partA_to_partB_tensor as input 'X' with message name
+        # "forward_joint_1_2@fl_ps". Returns None.
+        send_inputs = {'X': self.partA_to_partB_tensor}
+        send_attrs = {
+            # 'mode' directly selects the forward or the backward channel
+            'mode': 'forward',
+            'send_var_name':
+            self.partA_to_partB_tensor_name + ["microbatch_id"],
+            'recv_var_name': [],
+            'message_name': "forward_joint_{}_{}@fl_ps".format(1, 2),
+            'next_endpoints':
+            get_next_stage_trainers(self.role_maker),  # partB_endpoints
+            'previous_endpoints':
+            get_previous_stage_trainers(self.role_maker),
+            'trainer_id': get_role_id(self.role_maker),  # global id
+            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+        }
+        block._insert_op(idx, type='send_and_recv', inputs=send_inputs,
+                         outputs={'Out': []}, attrs=send_attrs)
+
+    def _insert_partB_communicate_op(self, block, idx):
+        # Insert a 'send_and_recv' op at index `idx` of `block`, taking
+        # self.partB_to_partA_grad as input 'X' with message name
+        # "backward_joint_2_1@fl_ps". The op is appended with an empty
+        # 'Out' output list. Returns None.
+        send_inputs = {'X': self.partB_to_partA_grad}
+        send_attrs = {
+            'mode': 'backward',
+            'send_var_name':
+            self.partB_to_partA_grad_name + ["microbatch_id"],
+            'recv_var_name': [],
+            'message_name': "backward_joint_{}_{}@fl_ps".format(2, 1),
+            'next_endpoints':
+            get_next_stage_trainers(self.role_maker),  # partA_endpoints
+            'previous_endpoints':
+            get_previous_stage_trainers(self.role_maker),
+            'trainer_id': get_role_id(self.role_maker),  # global id
+            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+        }
+        block._insert_op(idx, type='send_and_recv', inputs=send_inputs,
+                         outputs={'Out': []}, attrs=send_attrs)
+
+    def _create_var_for_block(self, vars, block):
+        for var in vars:
+            if block._find_var_recursive(str(var)):
+                continue
+            source_var = self.ori_main_block._var_recursive(str(var))
+            if isinstance(var, Parameter):
+                dest_var = block.create_parameter(
+                    name=source_var.name,
+                    shape=source_var.shape,
+                    dtype=source_var.dtype,
+                    type=source_var.type,
+                    lod_level=source_var.lod_level,
+                    stop_gradient=source_var.stop_gradient,
+                    trainable=source_var.trainable,
+                    optimize_attr=source_var.optimize_attr,
+                    regularizer=source_var.regularizer,
+                    error_clip=source_var.error_clip)
+            else:
+                dest_var = block._clone_variable(source_var, False)
+            dest_var.stop_gradient = source_var.stop_gradient
+            if hasattr(source_var, 'is_distributed'):
+                dest_var.is_distributed = source_var.is_distributed
+
+    def _get_block_by_idx(self, op_list, program, block_idx):
+        # Append copies of `op_list` (plus the vars they name) to block
+        # `block_idx` of `program`, or to a newly created block if out of range.
+        new_block = (program.block(block_idx) if block_idx < len(program.blocks)
+                     else program._create_block())
+        for op in op_list:
+            ap_op = new_block.desc.append_op()
+            ap_op.copy_from(op.desc)
+            ap_op._set_attr(OP_DEVICE_KEY, op.attr(OP_DEVICE_KEY))
+            var_names = op.desc.input_arg_names() + op.desc.output_arg_names()
+            self._create_var_for_block(var_names, new_block)
+        new_block._sync_with_cpp()
+        return new_block
+
+    def _find_joint_forward_op(self, block, flag):
+        # Return the index of the first op in `block` that is a forward op
+        # and whose OP_DEVICE_KEY attribute equals `flag`. When no op
+        # matches, the result is len(block.ops), i.e. one past the last op.
+        for op_idx, op in enumerate(block.ops):
+            if is_forward_op(op) and op.attr(OP_DEVICE_KEY) == flag:
+                return op_idx
+        return len(block.ops)
+
+    def _find_joint_backward_op(self, block, flag):
+        # Return the index of the first op in `block` that is a backward op
+        # and whose OP_DEVICE_KEY attribute equals `flag`. When no op
+        # matches, the result is len(block.ops), i.e. one past the last op.
+        for op_idx, op in enumerate(block.ops):
+            if is_backward_op(op) and op.attr(OP_DEVICE_KEY) == flag:
+                return op_idx
+        return len(block.ops)
+
+    def _get_partB_to_partA_grad(self, block, flag):
+        # Collect the inputs of the joint backward op that the joint forward
+        # op does not output, in first-seen order instead of set order.
+        bwd_op = block.ops[self._find_joint_backward_op(block, flag)]
+        fwd_op = block.ops[self._find_joint_forward_op(block, flag)]
+        fwd_outs = set(fwd_op.desc.output_arg_names())
+        names = dict.fromkeys(bwd_op.desc.input_arg_names())
+        self.partB_to_partA_grad_name = [n for n in names if n not in fwd_outs]
+        self.partB_to_partA_grad = [
+            self.ori_main_block.var(n) for n in self.partB_to_partA_grad_name
+        ]
+
+    def _find_dense_grad_vars(self, bp_op_list):
+        program = self.ori_main_program
+        bp_op_input, bp_op_output = find_ops_list_input_output(
+            program, bp_op_list)
+        return (screen_persistables(program, bp_op_input) +
+                screen_persistables(program, bp_op_output))
+
+    def _get_partA_program(self, block):
+        # 1. create block 0
+        # 1.1 insert send op
+        op_idx = self._find_joint_forward_op(block,
+                                             self.PART_A_JOINT_OP_DEVICE_FlAG)
+        op_list = []
+        for i in range(len(block.ops)):
+            op = block.ops[i]
+            op_list.append(op)
+            if i == op_idx:
+                out_name = op.desc.output_arg_names()[0]
+                self.partA_to_partB_tensor_name = op.desc.output_arg_names()
+                self.partA_to_partB_tensor = self.ori_main_block.var(out_name)
+                break
+        first_block = self._get_block_by_idx(op_list, self.partA_program, 0)
+        self._insert_partA_communicate_op(first_block, op_idx + 1)
+        # logger.info('partA-first_block:{}'.format(first_block))
+
+        # 2. create block 1
+        bp_op_list = get_bp_op_list(block)
+        push_sparse_op_list = get_distributed_push_sparse_op_list(block)
+        # logger.info('bp_op_list: {}'.format(bp_op_list))
+        second_block = self._get_block_by_idx(bp_op_list + push_sparse_op_list,
+                                              self.partA_program, 1)
+        # 2.1. insert partA recv op
+        block_input_flag = "backward_joint_{}_{}@fl_ps".format(2, 1)
+        grad_to_block_id = block_input_flag + ":" + str(second_block.idx)
+        attrs = {
+            "message_to_block_id": [grad_to_block_id],
+            "optimize_blocks": [second_block],
+            "endpoint": get_trainer_endpoint(self.role_maker),  ##
+            "fanin": 0,
+            "pserver_id": get_role_id(self.role_maker),
+            "distributed_mode": self.ps_mode,
+            "rpc_exec_thread_num": int(os.getenv("CPU_NUM", 32)),
+            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+        }
+        second_block._insert_op(index=0,
+                                type='heter_listen_and_serv',
+                                inputs={'X': []},
+                                outputs={},
+                                attrs=attrs)
+        # 2.2 insert push dense grad op
+        send_ops = find_send_op(self.ori_main_program)  # push dense
+        delete_same_ops(block, send_ops)
+        dense_grad_vars = self._find_dense_grad_vars(bp_op_list)
+        add_send_op(self.ori_main_program, second_block, dense_grad_vars)
+        # logger.info('partA-second_block:{}'.format(second_block))
+
+    def _get_partB_program(self, block):
+        op_idx1 = self._find_joint_forward_op(
+            block, self.PART_B_JOINT_OP_DEVICE_FlAG)  # elementwise_add op
+        op_idx2 = self._find_joint_backward_op(block,
+                                               self.PART_B_JOINT_OP_DEVICE_FlAG)
+        op_cnt = 0
+        op_list1 = []
+        op_list2 = []
+        op_list3 = []
+        for op in block.ops:
+            if op_cnt < op_idx1:
+                op_list1.append(op)
+            elif op_cnt <= op_idx2:
+                op_list2.append(op)
+            else:
+                op_list3.append(op)
+            op_cnt += 1
+
+        # 1. create block 0
+        first_block = self._get_block_by_idx(op_list1, self.partB_program, 0)
+
+        # 2. create block 1
+        second_block = self._get_block_by_idx(op_list2, self.partB_program, 1)
+        # 2.1 insert send op
+        self._insert_partB_communicate_op(second_block, len(op_list2))
+        # 2.2 insert remain ops
+        second_block = self._get_block_by_idx(op_list3, self.partB_program, 1)
+        # 2.3 insert push dense grad op
+        bp_op_list = get_bp_op_list(second_block)
+        dense_grad_vars = self._find_dense_grad_vars(bp_op_list)
+        add_send_op(self.ori_main_program, second_block, dense_grad_vars)
+
+        # 3. insert partB recv op
+        block_input_flag = "forward_joint_{}_{}@fl_ps".format(1, 2)
+        grad_to_block_id = block_input_flag + ":" + str(second_block.idx)
+        attrs = {
+            "message_to_block_id": [grad_to_block_id],
+            "optimize_blocks": [second_block],  ## what to do?
+            "endpoint": get_heter_worker_endpoint(self.role_maker),
+            "fanin": len(get_previous_stage_trainers(self.role_maker)),
+            "pserver_id": 1,  # TODO
+            "distributed_mode": self.ps_mode,
+            "rpc_exec_thread_num": int(os.getenv("CPU_NUM", 32)),
+            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+        }
+        first_block._insert_op(index=len(op_list1),
+                               type="heter_listen_and_serv",
+                               inputs={'X': []},
+                               outputs={},
+                               attrs=attrs)
+
+        #logger.info('partB-first_block:{}'.format(first_block))
+        #logger.info('partB-second_block:{}'.format(second_block))
+
+    def _apply_single_impl(self, main_program, startup_program, pass_ctx):
+        attrs = pass_ctx._attrs
+        self.role_maker = attrs['role_maker']
+        self.ps_mode = attrs['ps_mode']
+        self.is_part_b = attrs['is_heter_worker']  # TODO
+        self.ori_main_program = main_program
+        self.ori_main_block = main_program.block(0)
+
+        party_program_map = self._split_fl_program()
+
+        prog_a = party_program_map['a']
+        _main_file = ps_log_root_dir + '6_fl_A_main_program.prototxt'
+        debug_program(_main_file, prog_a)
+        self._get_partB_to_partA_grad(prog_a.global_block(),
+                                      self.PART_A_JOINT_OP_DEVICE_FlAG)
+
+        prog_b = party_program_map['b']
+        _main_file = ps_log_root_dir + '6_fl_B_main_program.prototxt'
+        debug_program(_main_file, prog_b)
+
+        if not self.is_part_b:
+            self.partA_program = framework.Program()
+            self._get_partA_program(prog_a.global_block())
+            pass_ctx._attrs['part_a_main_program'] = self.partA_program
+            self._clear_op_device_flag(self.partA_program)
+            check_program(self.partA_program)
+        else:
+            self.partB_program = framework.Program()
+            self._get_partB_program(prog_b.global_block())
+            pass_ctx._attrs['part_b_main_program'] = self.partB_program
+            self._clear_op_device_flag(self.partB_program)
+            check_program(self.partB_program)
diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py
index 888d517116a15..e10794085dc8d 100755
--- a/python/paddle/distributed/ps/the_one_ps.py
+++ b/python/paddle/distributed/ps/the_one_ps.py
@@ -73,28 +73,29 @@ def check_embedding_dim(accessor_proto, varname, program_id, context):
     if accessor_proto.accessor_class == "SparseAccessor":
         if fea_dim != embedding_dim + 2:
             raise ValueError(
-                "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}".
-                format(embedding_dim + 2, fea_dim))
+                "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}"
+                .format(embedding_dim + 2, fea_dim))
     else:
         if fea_dim != embedding_dim:
             raise ValueError(
-                "The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
-                format(embedding_dim, fea_dim))
+                "The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}"
+                .format(embedding_dim, fea_dim))
 
     embedx_dim = accessor_proto.embedx_dim
     if accessor_proto.accessor_class == "SparseAccessor":
         if embedx_dim != embedding_dim - 1:
             raise ValueError(
-                "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}".
-                format(embedding_dim - 1, embedx_dim))
+                "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}"
+                .format(embedding_dim - 1, embedx_dim))
     else:
         if embedx_dim != embedding_dim - 3:
             raise ValueError(
-                "The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
-                format(embedding_dim - 3, embedx_dim))
+                "The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}"
+                .format(embedding_dim - 3, embedx_dim))
 
 
 class Service:
+
     def __init__(self):
         pass
 
@@ -107,6 +108,7 @@ def _set(self, service_proto):
 
 
 class GpuService(Service):
+
     def __init__(self):
         super(GpuService, self).__init__()
 
@@ -116,6 +118,7 @@ def _set(self, service_proto):
 
 
 class Accessor:
+
     def __init__(self):
         self.accessor_class = ""
         self.optimizer = None
@@ -124,8 +127,8 @@ def __init__(self):
 
     # TableAccessorParameter accessor
     def _set(self, accessor_proto, varname, program_id, context):
-        main_program, startup_program, idx = get_program_by_id(context,
-                                                               program_id)
+        main_program, startup_program, idx = get_program_by_id(
+            context, program_id)
         embedding_dim = 0
         for var in main_program.list_vars():
             if var.name == varname:
@@ -208,6 +211,7 @@ def _set(self, accessor_proto, varname, program_id, context):
 
 
 class CommonAccessor(Accessor):
+
     def __init__(self):
         super(CommonAccessor, self).__init__()
         self.table_name = ''
@@ -229,11 +233,11 @@ def define_optimize_map(self):
         opt_input_map["adam"] = [("Param", None), ("Moment1", None),
                                  ("Moment2", None), ("Beta1Pow", 1),
                                  ("Beta2Pow", 1), ("LearningRate", 1)]
-        opt_input_map["adam_d2sum"] = [
-            ("Param", None), ("D2Sum", None), ("G2Sum", None), ("Moment", None),
-            ("MomentDecayRate", 1), ("AdaDecayRate", 1), ("AdaEpsilon", 1),
-            ("LearningRate", 1)
-        ]
+        opt_input_map["adam_d2sum"] = [("Param", None), ("D2Sum", None),
+                                       ("G2Sum", None), ("Moment", None),
+                                       ("MomentDecayRate", 1),
+                                       ("AdaDecayRate", 1), ("AdaEpsilon", 1),
+                                       ("LearningRate", 1)]
         opt_input_map["sum"] = [("Param", None)]
         opt_input_map["naive_adagrad"] = [("Param", None), ("G2Sum", 1),
                                           ("LearningRate", 1)]
@@ -260,8 +264,8 @@ def define_optimize_map(self):
         self.opt_init_map = opt_init_map
 
     def parse_entry(self, varname, program_id, context):
-        main_program, startup_program, idx = get_program_by_id(context,
-                                                               program_id)
+        main_program, startup_program, idx = get_program_by_id(
+            context, program_id)
         for op in main_program.global_block().ops:
             if not is_distributed_sparse_op(op) and not is_sparse_op(op):
                 continue
@@ -315,8 +319,8 @@ def parse_by_optimizer(self, ctx, context):
         # print("parse_by_optimizer table_id:{} is_datanorm:{}".format(
         #     ctx.table_id(), ctx.is_datanorm_table()))
 
-        main_program, startup_program, idx = get_program_by_id(context,
-                                                               ctx.program_id())
+        main_program, startup_program, idx = get_program_by_id(
+            context, ctx.program_id())
         pserver_id = get_role_id(context['role_maker'])
         pserver_num = len(get_ps_endpoints(context['role_maker']))
         optimizer_ops = get_optimize_ops(main_program)
@@ -326,8 +330,8 @@ def parse_by_optimizer(self, ctx, context):
 
         for op in optimizer_ops:
             if ("Param" in op.input_names) and (
-                    op.input("Param")[0] ==
-                    context['grad_name_to_param_name'][grad_name]):
+                    op.input("Param")[0]
+                    == context['grad_name_to_param_name'][grad_name]):
                 oop = op
                 break
 
@@ -390,8 +394,8 @@ def parse_by_optimizer(self, ctx, context):
                         param = main_program.global_block().vars[
                             "learning_rate_" + str(idx)]
 
-                    initializer = self.get_initializer_attr(param.name,
-                                                            startup_program)
+                    initializer = self.get_initializer_attr(
+                        param.name, startup_program)
                 elif formal_name == "MomentDecayRate":
                     initializer = "fill_constant&0.99"
                 elif formal_name == "AdaDecayRate":
@@ -415,8 +419,8 @@ def parse_by_optimizer(self, ctx, context):
                     param = main_program.global_block().vars[oop.input(
                         formal_name)[0]]
 
-                    initializer = self.get_initializer_attr(param.name,
-                                                            startup_program)
+                    initializer = self.get_initializer_attr(
+                        param.name, startup_program)
                 elif formal_name == "SummaryDecayRate":
                     initializer = "fill_constant&0.999999"
                 else:
@@ -444,8 +448,8 @@ def parse_by_optimizer(self, ctx, context):
                                                    pserver_id)
                     dims.append(shape)
 
-                    initializer = self.get_initializer_attr(param.name,
-                                                            startup_program)
+                    initializer = self.get_initializer_attr(
+                        param.name, startup_program)
                     initializers.append(initializer)
 
         for (attr_varname, type_) in attr_varnames:
@@ -472,12 +476,13 @@ def _set(self, proto):
 
 
 class Tensor:
+
     def __init__(self, tesnor_dcit):
         self.tensor_dict = tesnor_dcit
 
     def _set(self, tensor_proto):
-        tensor_proto.main_program_id = self.tensor_dict.get("main_program_id",
-                                                            0)
+        tensor_proto.main_program_id = self.tensor_dict.get(
+            "main_program_id", 0)
         tensor_proto.startup_program_id = self.tensor_dict.get(
             "startup_program_id", 0)
         tensor_proto.feed_var_name = self.tensor_dict.get("feed_var_name", '')
@@ -487,6 +492,7 @@ def _set(self, tensor_proto):
 
 
 class Table:
+
     def __init__(self):
         self.table_class = None
         self.shard_num = -1
@@ -501,6 +507,7 @@ def _set(self, table_proto):
 
 
 class BarrierTable(Table):
+
     def __init__(self, context, idx):
         super(BarrierTable, self).__init__()
         self.type = None
@@ -536,6 +543,7 @@ def _set(self, table_proto):
 
 
 class TensorTable(Table):
+
     def __init__(self, idx, tensor_dict, role_maker):
         super(TensorTable, self).__init__()
         self.idx = idx
@@ -549,8 +557,8 @@ def _set(self, table_proto):
 
         table_proto.accessor.accessor_class = "CommMergeAccessor"
 
-        table_proto.common.table_name = self.tensor_dict.get("feed_var_name",
-                                                             '')
+        table_proto.common.table_name = self.tensor_dict.get(
+            "feed_var_name", '')
         table_proto.common.trainer_num = get_trainers(self.role_maker)
 
         tensor = Tensor(self.tensor_dict)
@@ -558,6 +566,7 @@ def _set(self, table_proto):
 
 
 class SparseTable(Table):
+
     def __init__(self, context, send_ctx):
         super(SparseTable, self).__init__()
         self.context = context
@@ -568,8 +577,8 @@ def __init__(self, context, send_ctx):
 
     def _set(self, table_proto):
         ctx = self.ctx
-        if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or (
-                ctx.is_sparse() == False):
+        if ctx.is_tensor_table() or len(
+                ctx.origin_varnames()) < 1 or (ctx.is_sparse() == False):
             return
         table_proto.table_id = ctx.table_id()
         table_proto.table_class = self.table_class
@@ -610,14 +619,15 @@ def _set(self, table_proto):
                             ctx.program_id(), self.context)
 
         self.common.parse_by_optimizer(ctx, self.context)
-        self.common.parse_entry(self.common.table_name,
-                                ctx.program_id(), self.context)
+        self.common.parse_entry(self.common.table_name, ctx.program_id(),
+                                self.context)
         self.common.sync = True if self.context['is_sync'] else False
 
         self.common._set(table_proto.common)
 
 
 class GeoSparseTable(SparseTable):
+
     def __init__(self, context, send_ctx):
         super(GeoSparseTable, self).__init__(context, send_ctx)
         self.table_class = "MemorySparseGeoTable"
@@ -626,8 +636,8 @@ def __init__(self, context, send_ctx):
 
     def _set(self, table_proto):
         ctx = self.ctx
-        if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or (
-                ctx.is_sparse() == False):
+        if ctx.is_tensor_table() or len(
+                ctx.origin_varnames()) < 1 or (ctx.is_sparse() == False):
             return
         table_proto.table_id = ctx.table_id()
         table_proto.table_class = self.table_class
@@ -641,13 +651,14 @@ def _set(self, table_proto):
         self.common.table_name = self.context['grad_name_to_param_name'][
             ctx.origin_varnames()[0]]
         self.common.parse_by_optimizer(ctx, self.context)
-        self.common.parse_entry(self.common.table_name,
-                                ctx.program_id(), self.context)
+        self.common.parse_entry(self.common.table_name, ctx.program_id(),
+                                self.context)
         self.common.sync = False
         self.common._set(table_proto.common)
 
 
 class DenseTable(Table):
+
     def __init__(self, context, send_ctx):
         super(DenseTable, self).__init__()
         self.context = context
@@ -656,8 +667,8 @@ def __init__(self, context, send_ctx):
 
     def _set(self, table_proto):
         ctx = self.ctx
-        if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or (
-                ctx.is_sparse() == True):
+        if ctx.is_tensor_table() or len(
+                ctx.origin_varnames()) < 1 or (ctx.is_sparse() == True):
             return
 
         table_proto.table_id = ctx.table_id()
@@ -672,14 +683,15 @@ def _set(self, table_proto):
 
         self.common.table_name = "MergedDense"
         self.common.parse_by_optimizer(ctx, self.context)
-        self.common.parse_entry(self.common.table_name,
-                                ctx.program_id(), self.context)
+        self.common.parse_entry(self.common.table_name, ctx.program_id(),
+                                self.context)
         self.common.sync = True if self.context['is_sync'] else False
 
         self.common._set(table_proto.common)
 
 
 class Server:
+
     def __init__(self):
         pass
 
@@ -688,6 +700,7 @@ def _set(self):
 
 
 class DownpourServer(Server):
+
     def __init__(self):
         super(DownpourServer, self).__init__()
 
@@ -696,6 +709,7 @@ def _set(self):
 
 
 class Worker:
+
     def __init__(self):
         pass
 
@@ -704,6 +718,7 @@ def _set(self):
 
 
 class DownpourWorker(Worker):
+
     def __init__(self):
         super(DownpourWorker, self).__init__()
 
@@ -712,6 +727,7 @@ def _set(self):
 
 
 class fsClient:
+
     def __init__(self, fs_client_param):
         self.fs_client_param = fs_client_param
 
@@ -725,6 +741,7 @@ def _set(self, proto):
 
 
 class PsDescBuilder(object):
+
     def __init__(self, context):
         self.context = context
         self.is_sync = context['is_sync']
@@ -732,6 +749,8 @@ def __init__(self, context):
         self.is_heter_ps_mode = context['is_heter_ps_mode']
         self.use_ps_gpu = context['use_ps_gpu']
         self.barrier_table_id = None
+        print("is_heter_ps_mode in the_one_ps.py? {}".format(
+            self.is_heter_ps_mode))
         self.send_ctx = get_the_one_send_context(
             self.context,
             use_origin_program=True,
@@ -772,6 +791,7 @@ def _get_tables(self):
         self.tensor_tables = self._get_tensor_tables()
         tables.extend(self.tensor_tables)
         tables.append(globals()['BarrierTable'](self.context, len(tables)))
+        print("test_fl_ps: tables len: {}".format(len(tables)))
         return tables
 
     def _get_service(self):
@@ -814,6 +834,7 @@ def build_server_desc(self):
 
 
 class TheOnePSRuntime(RuntimeBase):
+
     def __init__(self):
         super(TheOnePSRuntime, self).__init__()
         self._communicator = None
@@ -836,8 +857,8 @@ def _set_basic_info(self, context):
         self.context[
             'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode
         self.is_heter_ps_mode = self.context['is_heter_ps_mode']
-        self.context['trainer'] = TrainerRuntimeConfig(context[
-            'valid_strategy'])
+        self.context['trainer'] = TrainerRuntimeConfig(
+            context['valid_strategy'])
         self.context['ps_mode'] = self.context['trainer'].mode
         self.context['use_ps_gpu'] = context['valid_strategy'].a_sync_configs[
             'use_ps_gpu']
@@ -864,7 +885,7 @@ def _init_all_params(self, scopes, send_ctx, recv_map):
             scope = scopes[idx]
             table_id = ctx.table_id()
             var_names = recv_map[table_id]
-            # print("init params:", idx, table_id, var_names)
+            #print("init params:", idx, table_id, var_names)
             self._worker.push_dense_params(scope, table_id, var_names)
 
     def _pull_all_dense(self, scopes, send_ctx, recv_map):
@@ -875,7 +896,7 @@ def _pull_all_dense(self, scopes, send_ctx, recv_map):
             scope = scopes[idx]
             table_id = ctx.table_id()
             var_names = recv_map[table_id]
-            # print("pull all dense:", idx, table_id, var_names)
+            #print("pull all dense:", idx, table_id, var_names)
             self._worker.pull_dense_params(scope, table_id, var_names)
 
     def _init_params(self, program, scope, send_ctx, recv_map):
@@ -902,15 +923,17 @@ def _pull_dense(self, program, scope, send_ctx, recv_map):
 
     def _init_worker(self, scopes=None):
         worker_desc = self.ps_desc_builder.build_worker_desc()
-
+        #with open("test_fl_ps_worker_desc", "w") as f:
+        #    f.write(worker_desc)
         if self.context['use_ps_gpu']:
             main_program = self.context['loss'].block.program
             if not main_program._fleet_opt:
                 main_program._fleet_opt = {}
             main_program._fleet_opt["use_ps_gpu"] = True
             gpus_env = os.getenv("FLAGS_selected_gpus")
-            main_program._fleet_opt[
-                "worker_places"] = [int(s) for s in gpus_env.split(",")]
+            main_program._fleet_opt["worker_places"] = [
+                int(s) for s in gpus_env.split(",")
+            ]
 
         def sync_strategy_envs():
             kwargs = {}
@@ -955,7 +978,8 @@ def sync_strategy_envs():
         role_id = get_role_id(self.role_maker)
         self._worker.init_worker(proto_txt, self.string_hosts, role_id)
 
-        if self.context['ps_mode'] == DistributedMode.GEO:
+        if self.context[
+                'ps_mode'] == DistributedMode.GEO or self.is_heter_ps_mode:
             self._communicator = Communicator(
                 trainer_config.mode, kwargs,
                 trainer_config.get_communicator_flags())
@@ -988,8 +1012,9 @@ def sync_strategy_envs():
         # for GEO
         if self.role_maker._is_first_worker() and self.is_heter_ps_mode:
             # for ps-heter mode load all parameters on first_worker
-            init_params = get_the_one_recv_context(
-                self.context, split_dense_table=True, use_origin_program=True)
+            init_params = get_the_one_recv_context(self.context,
+                                                   split_dense_table=True,
+                                                   use_origin_program=True)
         else:
             init_params = dense_map
 
@@ -1010,19 +1035,27 @@ def sync_strategy_envs():
 
         self.scopes = scopes
         if not is_test:
-            if self.context['ps_mode'] == DistributedMode.GEO:
+            if (self.context['ps_mode'] == DistributedMode.GEO
+                    or self.is_heter_ps_mode):
                 self._communicator.init_params(init_params)
             else:
                 if not self.context['use_ps_gpu']:
                     if role_id == 0:
+                        print("entering self._init_all_params()")
                         self._init_all_params(scopes, send_ctx, dense_map)
 
-            fleet.util.barrier()
+            fleet.util.barrier()  # wait until worker 0 has pushed dense params
+
         if not self.context['use_ps_gpu']:
-            self._pull_all_dense(scopes, send_ctx, dense_map)
+            if (self.is_heter_ps_mode
+                    and not self.role_maker._is_first_worker()):
+                self._communicator.pull_dense(init_params)
+            else:
+                self._pull_all_dense(scopes, send_ctx, dense_map)
         fleet.util.barrier()
 
-        if self.context['ps_mode'] == DistributedMode.GEO:
+        if (self.context['ps_mode'] == DistributedMode.GEO
+                or self.is_heter_ps_mode):
             if not self._communicator.is_running():
                 self._communicator.start()
             else:
@@ -1031,7 +1064,6 @@ def sync_strategy_envs():
         launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
         launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
         if launch_barrier and launch_barrier_flag:
-            # for trainer wait server ready
             wait_server_ready(self.role_maker._get_pserver_endpoints())
             if self.is_heter_ps_mode and self.role_maker._get_next_trainers(
             ) != []:
@@ -1043,12 +1075,14 @@ def sync_strategy_envs():
                 next_trainers = []
                 if self.role_maker._get_next_trainers() != []:
                     next_trainers = self.role_maker._get_next_trainers()
-                self._heter_client = HeterClient(next_trainers,
-                                                 previous_trainers,
-                                                 self.role_maker._role_id())
+                self._heter_client = HeterClient(
+                    next_trainers, previous_trainers,
+                    self.role_maker._role_id())  # --> HeterClient::GetInstance
 
     def _init_server(self, dirname=None, var_names=None, **kwargs):
         server_desc = self.ps_desc_builder.build_server_desc()
+        #with open("test_fl_ps_server_desc", "w") as f:
+        #    f.write(server_desc)
         role_id = get_role_id(self.role_maker)
         trainers = get_trainers(self.role_maker)
         if self.is_heter_ps_mode:
@@ -1074,8 +1108,8 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
             for var_name in var_names:
                 if var_name not in distributed_varnames:
                     raise ValueError(
-                        "fleet.init server can only load sparse variables in {}".
-                        format(distributed_varnames))
+                        "fleet.init server can only load sparse variables in {}"
+                        .format(distributed_varnames))
             load_varnames = var_names
 
         if dirname is None or not load_varnames:
@@ -1105,6 +1139,7 @@ def _stop_worker(self):
 
     @staticmethod
     def __exclude_vars(exclude_var_names=[]):
+
         def is_valid(var):
             if var.name in exclude_var_names:
                 return False
@@ -1179,17 +1214,17 @@ def _save_distributed_persistables(self,
         saved_varnames = sparse_varnames
 
         remaining_vars = list(
-            filter(
-                TheOnePSRuntime.__exclude_vars(saved_varnames),
-                main_program.list_vars()))
+            filter(TheOnePSRuntime.__exclude_vars(saved_varnames),
+                   main_program.list_vars()))
 
         import paddle
         for var in remaining_vars:
             # if var.name not in recv_dense_varnames:
             #     continue
             tensor = var.get_value()
-            paddle.save(
-                tensor, os.path.join(dirname, var.name), use_binary_format=True)
+            paddle.save(tensor,
+                        os.path.join(dirname, var.name),
+                        use_binary_format=True)
 
     def _ps_inference_save_persistables(self,
                                         executor,
@@ -1299,16 +1334,14 @@ def _ps_inference_save_inference_model(self,
             "user_defined_strategy"].trainer_desc_configs["stat_var_names"]
         generate_vars = [var for var in generate_vars]
         remaining_vars = list(
-            filter(
-                TheOnePSRuntime.__exclude_vars(sparse_names),
-                infer_program.list_vars()))
+            filter(TheOnePSRuntime.__exclude_vars(sparse_names),
+                   infer_program.list_vars()))
 
         for var in remaining_vars:
             tensor = var.get_value(scope)
-            paddle.save(
-                tensor,
-                os.path.join(model_path, var.name),
-                use_binary_format=True)
+            paddle.save(tensor,
+                        os.path.join(model_path, var.name),
+                        use_binary_format=True)
 
     def _save_inference_model(self, *args, **kwargs):
         self._ps_inference_save_inference_model(*args, **kwargs)
@@ -1392,9 +1425,8 @@ def _ps_inference_load_inference_model(self,
         loaded_varnames = sparse_varnames
 
         remaining_vars = list(
-            filter(
-                TheOnePSRuntime.__exclude_vars(loaded_varnames),
-                main_program.list_vars()))
+            filter(TheOnePSRuntime.__exclude_vars(loaded_varnames),
+                   main_program.list_vars()))
 
         if dirname.startswith("afs:") or dirname.startswith("hdfs:"):
             model_path = "./dnn_plugin"
diff --git a/python/paddle/distributed/ps/utils/ps_factory.py b/python/paddle/distributed/ps/utils/ps_factory.py
index 701ae8be6cb9c..d2914b0ac44a4 100755
--- a/python/paddle/distributed/ps/utils/ps_factory.py
+++ b/python/paddle/distributed/ps/utils/ps_factory.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class PsProgramBuilderFactory(object):
+
     def __init__(self):
         pass
 
@@ -33,10 +34,9 @@ def _create_ps_program_builder(self, pass_ctx):
             return globals()['GeoPsProgramBuilder'](pass_ctx)
         elif attrs['use_ps_gpu']:
             return globals()['GpuPsProgramBuilder'](pass_ctx)
-        elif attrs['is_heter_ps_mode']:
+        elif attrs['is_heter_ps_mode'] and not attrs['is_fl_ps_mode']:
             return globals()['HeterAsyncPsProgramBuilder'](pass_ctx)
-        elif 'is_fl_ps_mode' in attrs and attrs[
-                'is_fl_ps_mode'] == DistributedMode.FL:
+        elif 'is_fl_ps_mode' in attrs and attrs['is_fl_ps_mode']:
             return globals()['FlPsProgramBuilder'](pass_ctx)
         elif attrs['ps_mode'] == DistributedMode.SYNC:
             return globals()['CpuSyncPsProgramBuilder'](pass_ctx)
diff --git a/python/paddle/distributed/ps/utils/ps_infer_utils.py b/python/paddle/distributed/ps/utils/ps_infer_utils.py
index e1663029ef1f8..97043fd7ba688 100755
--- a/python/paddle/distributed/ps/utils/ps_infer_utils.py
+++ b/python/paddle/distributed/ps/utils/ps_infer_utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py
index f1d6a1f04a331..2d7246d1db9d3 100755
--- a/python/paddle/distributed/ps/utils/ps_program_builder.py
+++ b/python/paddle/distributed/ps/utils/ps_program_builder.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,10 +19,14 @@
 
 
 class PsProgramBuilder(object):
+
     def __init__(self, pass_ctx):
         self.pass_ctx = pass_ctx
         self.attrs = self.pass_ctx._attrs
         self.loss = self.attrs['loss']
+        self.origin_startup_program = self.attrs['origin_startup_program']
+        self.main_program = self.attrs['origin_main_programs']
+
         self.cloned_main = self.attrs['cloned_main']
         self.cloned_startup = self.attrs['cloned_startup']
 
@@ -30,6 +34,7 @@ def __init__(self, pass_ctx):
         self.use_heter_ps = self.attrs['is_heter_ps_mode']
         self.is_worker = self.attrs['is_worker']
         self.is_heter_worker = self.attrs['is_heter_worker']
+        self.is_server = self.attrs['is_server']
         self.ps_mode = self.attrs['ps_mode']
 
         self.launch_barrier = self.attrs['launch_barrier']
@@ -67,9 +72,10 @@ def _build_pserver_programs(self):
 
     def _build_programs(self):
         if self.attrs['is_worker']:
-            logger.info("start building trainer program")
             self._build_trainer_programs()
             fluid.framework.switch_startup_program(self.cloned_startup)
+            print("fluid.default_startup_program: {}".format(
+                fluid.default_startup_program()))
             # print("ps_program_build before =", id(self.loss.block.program))
             self._build_trainer_desc()
             self.loss.block.program = self.cloned_main
@@ -81,16 +87,15 @@ def _build_programs(self):
             #       self.loss.block.program._fleet_opt)
 
         elif self.attrs['is_server']:
-            logger.info("start building pserver program")
             self._build_pserver_programs()
             self.loss.block.program = self.attrs['_main_server']
-            fluid.framework.switch_startup_program(self.attrs[
-                '_startup_server'])
+            fluid.framework.switch_startup_program(
+                self.attrs['_startup_server'])
 
 
 class GeoPsProgramBuilder(PsProgramBuilder):  # 仅 CPU 模式
+
     def __init__(self, pass_ctx):
-        logger.info("start building geo-ps program")
         super(GeoPsProgramBuilder, self).__init__(pass_ctx)
         if self.ps_mode != DistributedMode.GEO:
             raise ValueError("ps mode: {} not matched {}",
@@ -105,8 +110,6 @@ def _build_trainer_programs(self):
         if self.launch_barrier and self.launch_barrier_flag:
             wait_server_ready(self.server_endpoints)
 
-        return
-
     def _build_pserver_programs(self):
         add_listen_and_serv_pass = new_pass('add_listen_and_serv_pass',
                                             self.attrs)
@@ -116,10 +119,9 @@ def _build_pserver_programs(self):
 
 
 class CpuSyncPsProgramBuilder(PsProgramBuilder):
+
     def __init__(self, pass_ctx):
         super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx)
-        if self.ps_mode == DistributedMode.SYNC:
-            logger.info("start building cpu-sync-ps program")
         if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC:
-            raise ValueError("ps mode: {} not matched {}",
-                             format(self.ps_mode, "PsProgramBuilder"))
+            raise ValueError("ps mode: {} not matched {}".format(
+                self.ps_mode, "PsProgramBuilder"))
@@ -160,8 +162,8 @@ def _build_trainer_programs(self):
 
 
 class CpuAsyncPsProgramBuilder(CpuSyncPsProgramBuilder):
+
     def __init__(self, pass_ctx):
-        logger.info("start building cpu-async-ps program")
         super(CpuAsyncPsProgramBuilder, self).__init__(pass_ctx)
 
     def _build_trainer_desc(self):
@@ -197,8 +199,8 @@ def _build_trainer_desc(self):
 
 
 class GpuPsProgramBuilder(PsProgramBuilder):
+
     def __init__(self, pass_ctx):
-        logger.info("start building gpu-ps program")
         super(GpuPsProgramBuilder, self).__init__(pass_ctx)
 
     def _build_trainer_programs(self):
@@ -230,13 +232,9 @@ def _build_trainer_programs(self):
 
 
 class HeterAsyncPsProgramBuilder(PsProgramBuilder):
+
     def __init__(self, pass_ctx):
-        logger.info("start building heter-async-ps program")
         super(HeterAsyncPsProgramBuilder, self).__init__(pass_ctx)
-        if self.use_ps_gpu or self.ps_mode == DistributedMode.GEO or self.attrs[
-                'is_heter_ps_mode'] == False:
-            raise ValueError("ps mode: {} not matched {}",
-                             format(self.ps_mode, "HeterAsyncPsProgramBuilder"))
 
     def _build_trainer_programs(self):
         add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
@@ -286,25 +284,89 @@ def _build_programs(self):
             self._build_trainer_programs()
             ps_set_heter_pipeline_opt_pass = new_pass(
                 "set_heter_pipeline_opt_pass", self.attrs)
-            ps_set_heter_pipeline_opt_pass.apply(
-                [self.cloned_main], [self.cloned_startup], self.pass_ctx)
+            ps_set_heter_pipeline_opt_pass.apply([self.cloned_main],
+                                                 [self.cloned_startup],
+                                                 self.pass_ctx)
 
         elif self.attrs['is_server']:
             self._build_pserver_programs()
             self.loss.block.program = self.attrs['_main_server']
-            fluid.framework.switch_startup_program(self.attrs[
-                '_startup_server'])
+            fluid.framework.switch_startup_program(
+                self.attrs['_startup_server'])
+
 
+class FlPsProgramBuilder(HeterAsyncPsProgramBuilder):
+    """Program builder selected when ``is_fl_ps_mode`` is set."""
 
-class FlPsProgramBuilder(PsProgramBuilder):
     def __init__(self, pass_ctx):
         super(FlPsProgramBuilder, self).__init__(pass_ctx)
 
     def _build_trainer_programs(self):
-        pass
+        """Run the FL worker passes and keep this role's half of the program.
+
+        Non-heter workers continue with ``part_a_main_program`` and heter
+        workers with ``part_b_main_program``; both are read from the pass
+        context after ``split_fl_ops_pass`` has been applied.
+        """
+        distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
+        distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
+
+        delete_optimizer_pass = new_pass("delete_optimizer_pass", self.attrs)
+        delete_optimizer_pass.apply([self.cloned_main], [None], self.pass_ctx)
+
+        append_send_ops_pass = new_pass("append_send_ops_pass", self.attrs)
+        append_send_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
+
+        delete_extra_optimizer_pass = new_pass("delete_extra_optimizer_pass",
+                                               self.attrs)
+        delete_extra_optimizer_pass.apply([self.attrs['origin_main_program']],
+                                          [self.cloned_startup], self.pass_ctx)
+
+        fake_init_ops_pass = new_pass("fake_init_ops_pass", self.attrs)
+        fake_init_ops_pass.apply([None], [self.cloned_startup], self.pass_ctx)
+
+        split_trainer_ops_pass = new_pass("split_fl_ops_pass", self.attrs)
+        split_trainer_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
+
+        if not self.is_heter_worker:
+            part = 'A'
+            self.part_a_program = self.pass_ctx._attrs['part_a_main_program']
+            self.cloned_main = self.part_a_program
+        else:
+            part = 'B'
+            self.part_b_program = self.pass_ctx._attrs['part_b_main_program']
+            self.cloned_main = self.part_b_program
+        debug_program(
+            ps_log_root_dir + '8_fl_{}_main_program.prototxt'.format(part),
+            self.cloned_main)
+
+        set_heter_pipeline_opt_pass = new_pass('set_heter_pipeline_opt_pass',
+                                               self.attrs)
+        set_heter_pipeline_opt_pass.apply([self.cloned_main],
+                                          [self.cloned_startup], self.pass_ctx)
+
+        self.attrs['origin_startup_program'] = self.cloned_startup
+        self.attrs['origin_main_program'] = self.cloned_main
+
+        pipeline_opt = self.attrs['origin_main_program']._heter_pipeline_opt
+        debug_program(
+            ps_log_root_dir + 'final_fl_{}_main_program.prototxt'.format(part),
+            pipeline_opt['section_program'])
 
     def _build_pserver_programs(self):
-        pass
+        """Use ``attrs['_main_server']`` as the program of the loss block."""
+        self.loss.block.program = self.attrs['_main_server']
 
     def _build_programs(self):
-        pass
+        """Build this role's programs and switch the fluid defaults to them."""
+        if not self.is_server:
+            self._build_trainer_programs()
+            fluid.framework.switch_startup_program(self.cloned_startup)
+            fluid.framework.switch_main_program(self.cloned_main)
+            print("fluid.default_startup_program: {}".format(
+                fluid.default_startup_program()._heter_pipeline_opt))
+        else:
+            self._build_pserver_programs()
+            fluid.framework.switch_startup_program(
+                self.attrs['_startup_server'])
+            fluid.framework.switch_main_program(self.attrs['_main_server'])
diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py
index 7acfd6cfe19f5..a57b30a8c1921 100755
--- a/python/paddle/distributed/ps/utils/public.py
+++ b/python/paddle/distributed/ps/utils/public.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -37,10 +37,12 @@
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
 RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName()
 RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
+op_role = core.op_proto_and_checker_maker.OpRole
 op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
 OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
 backward = core.op_proto_and_checker_maker.OpRole.Backward
+OP_DEVICE_KEY = core.op_proto_and_checker_maker.kOpDeviceAttrName()
 
 DEVICE_LIST = ["cpu", "gpu", "xpu"]
 COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"]
@@ -59,8 +61,10 @@
 def logger_config(log_path, logging_name):
     logger = logging.getLogger(logging_name)
     logger.setLevel(level=logging.WARNING)
-    handler = logging.FileHandler(
-        log_path, mode='a', encoding='UTF-8', delay=True)
+    handler = logging.FileHandler(log_path,
+                                  mode='a',
+                                  encoding='UTF-8',
+                                  delay=True)
     handler.setLevel(logging.INFO)
     formatter = logging.Formatter(
         '%(levelname)s - %(asctime)s - %(pathname)s: %(lineno)s - %(message)s')
@@ -73,8 +77,8 @@ def logger_config(log_path, logging_name):
 
 
 ps_log_root_dir = './ps_log/'
-logger = logger_config(
-    log_path='./ps_usr_print_log', logging_name='ps_usr_print_log')
+logger = logger_config(log_path='./ps_usr_print_log',
+                       logging_name='ps_usr_print_log')
 
 
 class DistributedMode:
@@ -86,13 +90,13 @@ class DistributedMode:
 
 
 class TrainerRuntimeConfig(object):
+
     def __init__(self, valid_strategy):
         self.mode = None
         num_threads = os.getenv("CPU_NUM", "1")
         send_queue_size = num_threads
         k_steps = valid_strategy.a_sync_configs["k_steps"]
-        logger.info("ps mode in strategy: {}, {}".format(
-            valid_strategy.a_sync, valid_strategy.a_sync_configs["k_steps"]))
+
         if not valid_strategy.a_sync and k_steps == 0:
             self.mode = DistributedMode.SYNC
 
@@ -150,21 +154,22 @@ def get_communicator_flags(self):
             send_queue_size = self.runtime_configs[
                 'communicator_send_queue_size']
             if max_merge_var_num != num_threads:
-                print('WARNING: In {} mode, communicator_max_merge_var_num '
-                      'must be equal to CPU_NUM. But received, '
-                      'communicator_max_merge_var_num = {}, CPU_NUM = '
-                      '{}. communicator_max_merge_var_num will be forced to {}.'
-                      .format(mode_str, max_merge_var_num, num_threads,
-                              num_threads))
+                print(
+                    'WARNING: In {} mode, communicator_max_merge_var_num '
+                    'must be equal to CPU_NUM. But received, '
+                    'communicator_max_merge_var_num = {}, CPU_NUM = '
+                    '{}. communicator_max_merge_var_num will be forced to {}.'.
+                    format(mode_str, max_merge_var_num, num_threads,
+                           num_threads))
                 self.runtime_configs[
                     'communicator_max_merge_var_num'] = num_threads
             if send_queue_size != num_threads:
                 print('WARNING: In {} mode, communicator_send_queue_size '
                       'must be equal to CPU_NUM. But received, '
                       'communicator_send_queue_size = {}, CPU_NUM = '
-                      '{}. communicator_send_queue_size will be forced to {}.'
-                      .format(mode_str, send_queue_size, num_threads,
-                              num_threads))
+                      '{}. communicator_send_queue_size will be forced to {}.'.
+                      format(mode_str, send_queue_size, num_threads,
+                             num_threads))
                 self.runtime_configs[
                     'communicator_send_queue_size'] = num_threads
 
@@ -238,17 +243,11 @@ def get_ps_endpoints(role_maker):
 
 
 def get_heter_worker_endpoint(role_maker):
-    try:
-        return role_maker._get_heter_worker_endpoint()
-    except Exception:
-        return role_maker.get_heter_worker_endpoint()
+    return role_maker._get_heter_worker_endpoint()
 
 
 def get_trainer_endpoint(role_maker):
-    try:
-        return role_maker._get_trainer_endpoint()
-    except Exception:
-        return role_maker.get_trainer_endpoint()
+    return role_maker._get_trainer_endpoint()
 
 
 def get_previous_stage_trainers(role_maker):
@@ -339,8 +338,8 @@ def get_dense_send_context(program,
             var_numel += reduce(lambda x, y: x * y, var.shape)
         grad_name = "Dense@GRAD_" + str(idx)
         aggregate = True
-        print("public get_dense_send_context dense_table:", grad_name,
-              var_numel, origin_varnames)
+        # print("public get_dense_send_context dense_table:", grad_name,
+        #      var_numel, origin_varnames)
         from paddle.fluid.core import CommContext
         dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
                                 [var_numel], origin_varnames, trainer_id,
@@ -362,8 +361,8 @@ def get_dense_send_context(program,
             var_numel += reduce(lambda x, y: x * y, var.shape)
         grad_name = "DataNorm@GRAD_" + str(idx)
         aggregate = True
-        print("public get_dense_send_context data_norm table:", grad_name,
-              var_numel, origin_varnames)
+        # print("public get_dense_send_context data_norm table:", grad_name,
+        #      var_numel, origin_varnames)
         from paddle.fluid.core import CommContext
         data_norm_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
                                     [var_numel], origin_varnames, trainer_id,
@@ -410,10 +409,9 @@ def get_geo_trainer_send_context(context):
             var = program.global_block().vars[grad.merged_var.name]
             var_numel = reduce(lambda x, y: x * y, var.shape[1:])
             from paddle.fluid.core import CommContext
-            sparse_ctx = CommContext(grad_name, [grad_name],
-                                     ["127.0.0.1:6071"], [var_numel],
-                                     [grad_name], trainer_id, True, True,
-                                     is_distributed, idx, False, False,
+            sparse_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
+                                     [var_numel], [grad_name], trainer_id, True,
+                                     True, is_distributed, idx, False, False,
                                      id(program))
             idx += 1
             send_ctx[sparse_ctx.var_name()] = sparse_ctx
@@ -441,14 +439,15 @@ def _step_ctx(idx, role_maker):
 
 
 def get_the_one_send_context(context,
-                             split_dense_table=False,
                              use_origin_program=False,
+                             split_dense_table=False,
                              ep_list=None):
     if ep_list is None:
         ep_list = ["127.0.0.1:6071"]
     send_ctx = {}
     trainer_id = get_role_id(context['role_maker'])
     origin_programs = context['origin_main_programs']
+    print("is_heter_ps_mode? {}".format(split_dense_table))
 
     idx = 0
     distibuted_varnames = get_sparse_tablenames(origin_programs, True)
@@ -471,8 +470,8 @@ def get_the_one_send_context(context,
             shape = list(var.shape)
             shape[0] = 0 if is_distributed else shape[0]
 
-            # print("public get_the_one_send_context sparse:", grad_name,
-            #       splited_varname, shape)
+            #print("public get_the_one_send_context sparse:", grad_name,
+            #      splited_varname, shape)
             if grad_name in send_ctx:
                 continue
             from paddle.fluid.core import CommContext
@@ -554,10 +553,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                     op_list = list(block.ops)
                     sum_op = op_list[var2idx[param_name]]
                     sum_op_inputs = {
-                        sum_op.input_names[0]: [
-                            block.vars[input]
-                            for input in sum_op.input_arg_names
-                        ]
+                        sum_op.input_names[0]:
+                        [block.vars[input] for input in sum_op.input_arg_names]
                     }
                     sum_op_outputs = {
                         sum_op.output_names[0]: [
@@ -565,12 +562,11 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                             for output in sum_op.output_arg_names
                         ]
                     }
-                    block._insert_op(
-                        index=i + 1,
-                        type=sum_op.type,
-                        inputs=sum_op_inputs,
-                        outputs=sum_op_outputs,
-                        attrs=sum_op.all_attrs())
+                    block._insert_op(index=i + 1,
+                                     type=sum_op.type,
+                                     inputs=sum_op_inputs,
+                                     outputs=sum_op_outputs,
+                                     attrs=sum_op.all_attrs())
                     block._remove_op(var2idx[param_name] + 1)
                     var2idx.pop(param_name)
                     for var_ in var2idx:
@@ -608,12 +604,11 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                                 for output in sum_op.output_arg_names
                             ]
                         }
-                        block._insert_op(
-                            index=i + 1,
-                            type=sum_op.type,
-                            inputs=sum_op_inputs,
-                            outputs=sum_op_outputs,
-                            attrs=sum_op.all_attrs())
+                        block._insert_op(index=i + 1,
+                                         type=sum_op.type,
+                                         inputs=sum_op_inputs,
+                                         outputs=sum_op_outputs,
+                                         attrs=sum_op.all_attrs())
                         block._remove_op(var2idx[no_grad_var] + 1)
                         var2idx.pop(no_grad_var)
                         for var_ in var2idx:
@@ -628,8 +623,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                         forward_op_type = pre_op.type.split("_grad")[0]
                         if forward_op_type in SPARSE_OP_TYPE_DICT.keys() \
                             and pre_op.attr('remote_prefetch') is True:
-                            param_name = pre_op.input(SPARSE_OP_TYPE_DICT[
-                                forward_op_type])[0]
+                            param_name = pre_op.input(
+                                SPARSE_OP_TYPE_DICT[forward_op_type])[0]
                             if param_name == origin_var and op.attr(
                                     "op_device") == pre_op.attr("op_device"):
                                 continue
@@ -729,7 +724,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
     if len(heter_ops) == 0:
         warnings.warn(
             "No heterogeneous OP was found in your program , "
-            " please using fluid.device_guard() to run OPs on different device.")
+            " please using fluid.device_guard() to run OPs on different device."
+        )
 
     total_heter_ops = 0
     heter_blocks = 0
@@ -739,8 +735,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
         for _, heter_block in heter_block_dict.items():
             total_heter_ops += len(heter_block)
     print(
-        "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks.".
-        format(len(block.ops), total_heter_ops, heter_blocks))
+        "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks."
+        .format(len(block.ops), total_heter_ops, heter_blocks))
 
     return origin_porgram, heter_ops, default_ops, program_block_ops
 
@@ -760,9 +756,8 @@ def union_forward_gradient_op(program_block_ops_list):
     assert block_length % 2 != 0, "the length of program_block_ops_list should be odd"
     for i in range(0, block_length // 2):
         block_op_list = {"forward": program_block_ops_list[i]}
-        block_op_list.update({
-            "backward": program_block_ops_list[block_length - 1 - i]
-        })
+        block_op_list.update(
+            {"backward": program_block_ops_list[block_length - 1 - i]})
         union_program_block_ops_list.append(block_op_list)
 
     block_op_list = {"forward": [], "backward": []}
@@ -780,8 +775,9 @@ def find_block_joints(program, program_block_ops_list, heter_ops):
                                                   program_block_ops_list)
     block_var_detail = entrance_exit_check(program, program_block_ops_list,
                                            block_var_detail, heter_ops)
-    block_var_detail = delete_block_useless_exit(
-        program, program_block_ops_list, block_var_detail)
+    block_var_detail = delete_block_useless_exit(program,
+                                                 program_block_ops_list,
+                                                 block_var_detail)
 
     return block_var_detail
 
@@ -826,8 +822,8 @@ def find_entrance_exit_private(program, program_block_ops_list):
         bp_block_input, bp_block_output = find_ops_list_input_output(
             program, block_op_list["backward"])
         bp_persistables = screen_persistables(
-            program, bp_block_input) + screen_persistables(program,
-                                                           bp_block_output)
+            program, bp_block_input) + screen_persistables(
+                program, bp_block_output)
         # find entrance & exit
         bp_block_private_vars = list(set(bp_block_input) & set(bp_block_output))
         bp_block_entrance = list(
@@ -875,10 +871,10 @@ def entrance_exit_check(program, program_block_ops_list, block_var_detail,
         #need_add_vars = find_need_var_from_previous_block(
         #    need_add_vars, block_var_detail, index, heter_ops)
 
-        previous_block_private = block_var_detail[index - 1]["forward"][
-            "private"]
-        previous_block_entrance = block_var_detail[index - 1]["forward"][
-            "entrance"]
+        previous_block_private = block_var_detail[index -
+                                                  1]["forward"]["private"]
+        previous_block_entrance = block_var_detail[index -
+                                                   1]["forward"]["entrance"]
         for var in need_add_vars:
             if var not in previous_block_private and var not in previous_block_entrance:
                 previous_block_entrance.append(var)
@@ -904,10 +900,10 @@ def entrance_exit_check(program, program_block_ops_list, block_var_detail,
                 need_ignore_vars.append(var)
         need_add_vars = list(
             set(need_add_vars).difference(set(need_ignore_vars)))
-        previous_block_private = block_var_detail[index + 1]["backward"][
-            "private"]
-        previous_block_entrance = block_var_detail[index + 1]["backward"][
-            "entrance"]
+        previous_block_private = block_var_detail[index +
+                                                  1]["backward"]["private"]
+        previous_block_entrance = block_var_detail[index +
+                                                   1]["backward"]["entrance"]
         for var in need_add_vars:
             if var not in previous_block_private and var not in previous_block_entrance:
                 previous_block_entrance.append(var)
@@ -935,8 +931,8 @@ def delete_block_useless_exit(program, program_block_ops_list,
         if index - 1 < 0:
             break
         current_block_exit = block_var_detail[index]["backward"]["exit"]
-        next_block_entrance = block_var_detail[index - 1]["backward"][
-            "entrance"]
+        next_block_entrance = block_var_detail[index -
+                                               1]["backward"]["entrance"]
         need_delete_var = []
         for var in current_block_exit:
             if var not in next_block_entrance:
@@ -986,8 +982,8 @@ def add_vars_by_var_list(var_name_list, origin_program, program, block):
         ).vars and var_name not in block.vars:
             var = origin_program.global_block().vars[var_name]
             if var.persistable:
-                program.global_block()._clone_variable(
-                    var, force_persistable=False)
+                program.global_block()._clone_variable(var,
+                                                       force_persistable=False)
             else:
                 block._clone_variable(var, force_persistable=False)
 
@@ -1089,19 +1085,20 @@ def block_append_op(program, origin_program, block, op):
 
     if "_grad" not in op.type:
         # for forward op
-        return block.append_op(
-            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
+        return block.append_op(type=op.type,
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=op.all_attrs())
     else:
         # for grad op
         op_desc = op.desc
-        op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
         backward = core.op_proto_and_checker_maker.OpRole.Backward
         device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
 
         # append grad op
         new_op_desc = block.desc.append_op()
         new_op_desc.copy_from(op_desc)
-        new_op_desc._set_attr(op_role_attr_name, backward)
+        new_op_desc._set_attr(RPC_OP_ROLE_ATTR_NAME, backward)
 
         # set device gard
         if op.desc.has_attr(device_attr_name):
@@ -1142,22 +1139,24 @@ def insert_communicate_op(orign_program,
         comm_info = get_communicate_var_info(orign_program, stage_id - 1,
                                              entrance_var, "backward")
 
-    heter_block._insert_op(
-        index=first_op_index,
-        type="send_and_recv",
-        inputs={"X": heter_block.vars[entrance_var[0]]},
-        outputs={"Out": []},
-        attrs={
-            "mode": "forward" if is_forward else "backward",
-            "send_var_name": entrance_var + ["microbatch_id"],
-            "recv_var_name": [],
-            "message_name": comm_info["block_input_var_name"],
-            "next_endpoints": next_heter_worker_endpoints,
-            "previous_endpoints": previous_heter_worker_endpoints,
-            "trainer_id": get_role_id(role_maker),
-            "op_device": device,
-            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-        })
+    heter_block._insert_op(index=first_op_index,
+                           type="send_and_recv",
+                           inputs={"X": heter_block.vars[entrance_var[0]]},
+                           outputs={"Out": []},
+                           attrs={
+                               "mode": "forward" if is_forward else "backward",
+                               "send_var_name":
+                               entrance_var + ["microbatch_id"],
+                               "recv_var_name": [],
+                               "message_name":
+                               comm_info["block_input_var_name"],
+                               "next_endpoints": next_heter_worker_endpoints,
+                               "previous_endpoints":
+                               previous_heter_worker_endpoints,
+                               "trainer_id": get_role_id(role_maker),
+                               "op_device": device,
+                               RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                           })
 
     return entrance_var
 
@@ -1187,11 +1186,10 @@ def get_the_one_recv_context(context,
                 param_names.append(param_name)
             recv_id_maps[ctx.table_id()] = param_names
     else:
-        send_ctx = get_the_one_send_context(
-            context,
-            split_dense_table=False,
-            use_origin_program=False,
-            ep_list=None)
+        send_ctx = get_the_one_send_context(context,
+                                            split_dense_table=False,
+                                            use_origin_program=False,
+                                            ep_list=None)
         for idx, (name, ctx) in enumerate(send_ctx.items()):
             if not ctx.is_sparse():
                 continue
@@ -1244,6 +1242,7 @@ def get_var_mem_size(var):
 
 
 class MergedVariable:
+
     def __init__(self, merged, ordered, offsets):
         self.merged_var = merged
         self.ordered_vars = ordered
@@ -1347,6 +1346,7 @@ def _is_opt_role_op(op):
 
 
 def get_param_grads(origin_program):
+
     def _get_params_grads(sparse_varnames):
         block = origin_program.global_block()
 
@@ -1422,7 +1422,8 @@ def find_op_input_output(program, block, op):
     return input_var_list, output_var_list
 
 
-def add_heter_send_op(program, heter_program, block, block_var_detail):
+def add_send_op(program, block, _vars):
+
     def _get_send_op_dict():
         send_op_dict = {}
         send_op_list = find_send_op(program)
@@ -1436,7 +1437,7 @@ def _get_send_op_dict():
     send_grad_var_list = []
     send_op_dict = _get_send_op_dict()
     table_dict = {}
-    for persistable_var in block_var_detail["backward"]["persistables"]:
+    for persistable_var in _vars:
         if "@GRAD" not in persistable_var:
             continue
         if "GRAD" != persistable_var.split("@")[-1]:
@@ -1462,16 +1463,16 @@ def _get_send_op_dict():
             block.vars[union_var]
             for union_var in table_dict[table_id]['var_list']
         ]
-        block.append_op(
-            type="send",
-            inputs={"X": send_input_vars},
-            outputs={"Out": dummy_output},
-            attrs={
-                "send_varnames": table_dict[table_id]['send_varnames'],
-                "is_sparse": is_sparse,
-                "table_id": table_id,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        block.append_op(type="send",
+                        inputs={"X": send_input_vars},
+                        outputs={"Out": dummy_output},
+                        attrs={
+                            "send_varnames":
+                            table_dict[table_id]['send_varnames'],
+                            "is_sparse": is_sparse,
+                            "table_id": table_id,
+                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                        })
 
     return send_grad_var_list
 
@@ -1482,6 +1483,7 @@ def get_vars_name_in_block(block):
     return vars_name_list
 
 
+# reserve static_var
 def delete_trainer_useless_var(program, static_var):
     static_var = list(set(static_var))
     program_useful_var_list = []
@@ -1525,6 +1527,67 @@ def create_backward_block(program, origin_program, bp_ops_list,
     return heter_block
 
 
+def is_backward_op(op):
+    return op_role_attr_name in op.attr_names and (
+        int(op.attr(op_role_attr_name)) & int(op_role.Backward))
+
+
+def is_forward_op(op):
+    return op_role_attr_name in op.attr_names and (int(
+        op.attr(op_role_attr_name)) == int(op_role.Forward))
+
+
+def is_push_sparse_op(op):
+    return op.type == 'distributed_push_sparse'
+
+
+def get_distributed_push_sparse_op_list(block):
+    push_sparse_op_list = []
+    for op_idx in range(block.desc.op_size()):
+        op = block.ops[op_idx]
+        if is_push_sparse_op(op):
+            push_sparse_op_list.append(op)
+    return push_sparse_op_list
+
+
+def get_bp_op_list(block):
+    bp_op_list = []
+    for op_idx in range(block.desc.op_size()):
+        op = block.ops[op_idx]
+        if is_backward_op(op):
+            bp_op_list.append(op)
+    return bp_op_list
+
+
+def delete_same_ops(block, ops):
+    for op in ops:
+        try:
+            for origin_op in block.ops:
+                if str(origin_op) == str(op):
+                    idx = list(block.ops).index(origin_op)
+                    block._remove_op(idx)
+                    break
+        except Exception as e:
+            print(e)
+
+
+def check_program(program):
+    block_idx = 0
+    for block in program.blocks:
+        for op in block.ops:
+            input_var_names = op.desc.input_arg_names()
+            output_var_names = op.desc.output_arg_names()
+            for var_name in (input_var_names + output_var_names):
+                if not block._find_var_recursive(str(var_name)):
+                    raise ValueError(
+                        'var: {} needed by op is not found in block: {}'.format(
+                            str(var_name), block_idx))
+        block_idx += 1
+    print('program checked valid')
+
+
 def debug_program(file, program):
+    # py >= 3.2
+    os.makedirs(os.path.dirname(file), exist_ok=True)
     with open(file, 'w+') as f:
         f.write(str(program))
diff --git a/python/paddle/distributed/sharding/__init__.py b/python/paddle/distributed/sharding/__init__.py
index d14e3dd099ffe..e938c12d5af0e 100644
--- a/python/paddle/distributed/sharding/__init__.py
+++ b/python/paddle/distributed/sharding/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py
index 4c22028b2304c..ad270c1a51733 100644
--- a/python/paddle/distributed/sharding/group_sharded.py
+++ b/python/paddle/distributed/sharding/group_sharded.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -105,8 +105,8 @@ def group_sharded_parallel(model,
     assert isinstance(
         optimizer, Optimizer
     ), "The optimizer must be the instance of paddle.optimizer.Optimizer."
-    assert level in ['os', 'os_g', 'p_g_os'
-                     ], "The level must be os, os_g or p_g_os."
+    assert level in ['os', 'os_g',
+                     'p_g_os'], "The level must be os, os_g or p_g_os."
 
     def check_dtype(param):
         return param.dtype == paddle.float16
@@ -125,43 +125,38 @@ def check_dtype(param):
                 optim=optimizer,
                 group=group,
                 offload=offload)
-            model = GroupShardedStage2(
-                model,
-                optimizer,
-                group=group,
-                sync_buffers=sync_buffers,
-                buffer_max_size=buffer_max_size)
+            model = GroupShardedStage2(model,
+                                       optimizer,
+                                       group=group,
+                                       sync_buffers=sync_buffers,
+                                       buffer_max_size=buffer_max_size)
         else:
-            optimizer = ShardingOptimizerStage2(
-                params=model.parameters(),
-                optim=optimizer,
-                group=group,
-                offload=offload)
-            model = ShardingStage2(
-                model,
-                optimizer,
-                group=group,
-                sync_buffers=sync_buffers,
-                buffer_max_size=buffer_max_size)
+            optimizer = ShardingOptimizerStage2(params=model.parameters(),
+                                                optim=optimizer,
+                                                group=group,
+                                                offload=offload)
+            model = ShardingStage2(model,
+                                   optimizer,
+                                   group=group,
+                                   sync_buffers=sync_buffers,
+                                   buffer_max_size=buffer_max_size)
     elif level == 'p_g_os':
         if in_dygraph_mode():
-            model = GroupShardedStage3(
-                model,
-                optimizer=optimizer,
-                group=group,
-                sync_buffers=sync_buffers,
-                segment_size=segment_size,
-                offload=offload,
-                sync_comm=sync_comm)
+            model = GroupShardedStage3(model,
+                                       optimizer=optimizer,
+                                       group=group,
+                                       sync_buffers=sync_buffers,
+                                       segment_size=segment_size,
+                                       offload=offload,
+                                       sync_comm=sync_comm)
         else:
-            model = ShardingStage3(
-                model,
-                optimizer=optimizer,
-                group=group,
-                sync_buffers=sync_buffers,
-                segment_size=segment_size,
-                offload=offload,
-                sync_comm=sync_comm)
+            model = ShardingStage3(model,
+                                   optimizer=optimizer,
+                                   group=group,
+                                   sync_buffers=sync_buffers,
+                                   segment_size=segment_size,
+                                   offload=offload,
+                                   sync_comm=sync_comm)
     else:
         raise ValueError("Please enter the correct level.")
     if params_fp16 and isinstance(scaler, paddle.amp.GradScaler):
@@ -238,7 +233,8 @@ def save_group_sharded_model(model, output, optimizer=None):
         paddle.save(model._layer.state_dict(), output_model)
     else:
         raise ValueError(
-            "Please use the layer which is wrapped with group_sharded_parallel.")
+            "Please use the layer which is wrapped with group_sharded_parallel."
+        )
 
     if optimizer is not None:
         assert hasattr(
diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py
index 66545a8a249ba..c0ff2bc273dc5 100644
--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -38,6 +38,7 @@
 
 
 class ParallelEnvArgs(object):
+
     def __init__(self):
         # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..
         self.cluster_node_ips = None
@@ -55,9 +56,9 @@ def __init__(self):
         # Print the config or not
         self.print_config = True
 
-        # It's for gpu training and the training process will run 
-        # on the selected_devices, each process is bound to a single GPU. 
-        # And if it's not set, this module will use all the gpu cards 
+        # It's for gpu training and the training process will run
+        # on the selected_devices, each process is bound to a single GPU.
+        # And if it's not set, this module will use all the gpu cards
         # for training.
         self.selected_devices = None
 
@@ -105,8 +106,8 @@ def _get_default_nprocs():
         return multiprocessing.cpu_count()
     else:
         raise RuntimeError(
-            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".
-            format(device))
+            "`paddle.distributed.spawn` does not support parallel training on device `{}` now."
+            .format(device))
 
 
 def _get_default_backend():
@@ -121,8 +122,8 @@ def _get_default_backend():
         return 'gloo'
     else:
         raise RuntimeError(
-            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".
-            format(device))
+            "`paddle.distributed.spawn` does not support parallel training on device `{}` now."
+            .format(device))
 
 
 def _get_node_ip(ips):
@@ -136,9 +137,9 @@ def _get_node_ip(ips):
 
 
 def _get_subprocess_env_list(nprocs, options):
-    # NOTE (xiongkun03) Why put backend deduction  here ? 
-    # Becase _get_subprocess_env_list is used by many testcases. 
-    # So for campability, we put backend deduction here 
+    # NOTE (xiongkun03) Why put backend deduction  here ?
+    # Becase _get_subprocess_env_list is used by many testcases.
+    # So for campability, we put backend deduction here
 
     # logic for handle backend option
     if 'backend' not in options or options['backend'] == 'auto':
@@ -329,8 +330,8 @@ def _remove_risky_env():
 
 def _set_trainer_env(env_dict, backend):
     # NOTE(chenweihang): [ Why need set FLAGS_selected_gpus or FLAGS_selected_xpus here? ]
-    # When the child process starts, it will inherit the configuration of the 
-    # main process and set the FLAGS once, but the environment variable has 
+    # When the child process starts, it will inherit the configuration of the
+    # main process and set the FLAGS once, but the environment variable has
     # not been set at this time, which leads to the FLAGS_selected_gpus or FLAGS_selected_xpus
     # is keep same with mainprocess(usually empty), so manually update the flags here
 
@@ -344,8 +345,8 @@ def _set_trainer_env(env_dict, backend):
     elif backend == 'cncl':
         set_flags({'FLAGS_selected_mlus': env_dict['FLAGS_selected_mlus']})
     else:
-        #NOTE(xiongkun) why not raise Error ? 
-        # So far, we added support for CPU parallel, and will be applied when paddle is not 
+        #NOTE(xiongkun) why not raise Error ?
+        # So far, we added support for CPU parallel, and will be applied when paddle is not
         # compiled with cuda or xp. just do nothing.
         pass
 
@@ -371,13 +372,14 @@ def _func_wrapper(func, args, error_queue, return_queue, env_dict, backend):
 
 
 class MultiprocessContext(object):
+
     def __init__(self, processes, error_queues, return_queues):
         _py_supported_check()
         self.error_queues = error_queues
-        # NOTE(chenweihang): The `spawn` method is mainly used 
-        # to wrap the outermost execution function of the program for 
-        # parallel execution. Generally, the return value is not concerned, 
-        # but if the user needs to obtain the return value, users can get  
+        # NOTE(chenweihang): The `spawn` method is mainly used
+        # to wrap the outermost execution function of the program for
+        # parallel execution. Generally, the return value is not concerned,
+        # but if the user needs to obtain the return value, users can get
         # the return result of each process from context.return_queues
         self.return_queues = return_queues
         self.processes = processes
@@ -390,8 +392,8 @@ def join(self, timeout=None):
         if len(self.sentinels) == 0:
             return True
 
-        ready = multiprocessing.connection.wait(
-            self.sentinels.keys(), timeout=timeout)
+        ready = multiprocessing.connection.wait(self.sentinels.keys(),
+                                                timeout=timeout)
 
         error_index = None
         for sentinel in ready:
@@ -554,12 +556,12 @@ def train(print_result=False):
     """
     # NOTE(chenweihang): [ why only supports python3.4+ ? ]
     # Python supported setting the child process startup method
-    # since 3.4. The previous version can only use the default startup 
-    # method, while the default startup method of Unix is fork, which 
+    # since 3.4. The previous version can only use the default startup
+    # method, while the default startup method of Unix is fork, which
     # cannot support CUDA runtime multi-process
     _py_supported_check()
 
-    # Give an error hint when the users enter a configuration option 
+    # Give an error hint when the users enter a configuration option
     # that does not exist
     _options_valid_check(options)
 
@@ -568,15 +570,15 @@ def train(print_result=False):
         nprocs = _get_default_nprocs()
 
     # NOTE(chenweihang): [ why need get cluster info before run? ]
-    # when using `paddle.distributed.spawn` start parallel training, 
-    # we should get cluster info before starting subprocess, and pass 
+    # when using `paddle.distributed.spawn` start parallel training,
+    # we should get cluster info before starting subprocess, and pass
     # correct info to each subprocess
     procs_env_list = _get_subprocess_env_list(nprocs, options)
 
     # start processes
     # NOTE(chenweihang): [ why default start method is spawn? ]
-    # The CUDA runtime does not support the fork start method, 
-    # either the spawn or forkserver start method are required 
+    # The CUDA runtime does not support the fork start method,
+    # either the spawn or forkserver start method are required
     # to use CUDA in subprocesses.
     start_method = options.get('start_method', None)
     if start_method is None:
@@ -589,10 +591,9 @@ def train(print_result=False):
     for i in range(nprocs):
         error_queue = mp.SimpleQueue()
         return_queue = mp.SimpleQueue()
-        process = mp.Process(
-            target=_func_wrapper,
-            args=(func, args, error_queue, return_queue, procs_env_list[i],
-                  options['backend']))
+        process = mp.Process(target=_func_wrapper,
+                             args=(func, args, error_queue, return_queue,
+                                   procs_env_list[i], options['backend']))
         process.daemon = daemon
         process.start()
         error_queues.append(error_queue)
diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py
index 30cd63ed80ea7..ec8ef80d5da60 100644
--- a/python/paddle/distributed/utils.py
+++ b/python/paddle/distributed/utils.py
@@ -33,24 +33,24 @@
 from paddle.fluid.data_feeder import check_variable_and_dtype
 from paddle import _C_ops
 
-__all__ = [     #noqa
-           'get_host_name_ip',
-           'Trainer',
-           'get_cluster',
-           'start_local_trainers',
-           'watch_local_trainers',
-           'find_free_ports',
-           'JobServer',
-           'Cluster',
-           'Pod',
-           'Hdfs',
-           'add_arguments',
-           'terminate_local_procs',
-           'TrainerProc',
-           'get_logger',
-           'pull_worker_log',
-           'global_scatter',
-           'global_gather',
+__all__ = [  #noqa
+    'get_host_name_ip',
+    'Trainer',
+    'get_cluster',
+    'start_local_trainers',
+    'watch_local_trainers',
+    'find_free_ports',
+    'JobServer',
+    'Cluster',
+    'Pod',
+    'Hdfs',
+    'add_arguments',
+    'terminate_local_procs',
+    'TrainerProc',
+    'get_logger',
+    'pull_worker_log',
+    'global_scatter',
+    'global_gather',
 ]
 
 
@@ -163,16 +163,17 @@ def global_scatter(x,
         helper = LayerHelper(op_type, **locals())
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-        helper.append_op(
-            type=op_type,
-            inputs={
-                'X': [x],
-                'local_count': [local_count],
-                'global_count': [global_count],
-            },
-            outputs={'Out': [out]},
-            attrs={'ring_id': ring_id,
-                   'use_calc_stream': use_calc_stream})
+        helper.append_op(type=op_type,
+                         inputs={
+                             'X': [x],
+                             'local_count': [local_count],
+                             'global_count': [global_count],
+                         },
+                         outputs={'Out': [out]},
+                         attrs={
+                             'ring_id': ring_id,
+                             'use_calc_stream': use_calc_stream
+                         })
         return out
 
 
@@ -276,18 +277,17 @@ def global_gather(x,
         helper = LayerHelper(op_type, **locals())
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-        helper.append_op(
-            type=op_type,
-            inputs={
-                'X': [x],
-                'local_count': [local_count],
-                'global_count': [global_count]
-            },
-            outputs={'Out': [out]},
-            attrs={
-                'ring_id': group,
-                'use_calc_stream': use_calc_stream,
-            })
+        helper.append_op(type=op_type,
+                         inputs={
+                             'X': [x],
+                             'local_count': [local_count],
+                             'global_count': [global_count]
+                         },
+                         outputs={'Out': [out]},
+                         attrs={
+                             'ring_id': group,
+                             'use_calc_stream': use_calc_stream,
+                         })
         return out
 
 
@@ -362,6 +362,7 @@ def _print_arguments(args):
 
 
 class Hdfs(object):
+
     def __init__(self):
         self.hdfs_ugi = None
         self.hdfs_name = None
@@ -386,6 +387,7 @@ def __ne__(self, n):
 
 
 class Cluster(object):
+
     def __init__(self, hdfs):
         self.job_server = None
         self.pods = []
@@ -448,6 +450,7 @@ def get_pod_by_id(self, pod_id):
 
 
 class JobServer(object):
+
     def __init__(self):
         self.endpoint = None
 
@@ -462,6 +465,7 @@ def __ne__(self, j):
 
 
 class Trainer(object):
+
     def __init__(self):
         self.gpus = []
         self.endpoint = None
@@ -493,6 +497,7 @@ def get_rank(self):
 
 
 class Pod(object):
+
     def __init__(self):
         self.rank = None
         self.id = None
@@ -631,15 +636,15 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
         args = parser.parse_args()
     """
     type = strtobool if type == bool else type
-    argparser.add_argument(
-        "--" + argname,
-        default=default,
-        type=type,
-        help=help + ' Default: %(default)s.',
-        **kwargs)
+    argparser.add_argument("--" + argname,
+                           default=default,
+                           type=type,
+                           help=help + ' Default: %(default)s.',
+                           **kwargs)
 
 
 def find_free_ports(num):
+
     def __free_port():
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
             s.bind(('', 0))
@@ -712,6 +717,7 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
 
 
 class TrainerProc(object):
+
     def __init__(self):
         self.proc = None
         self.log_fn = None
@@ -808,14 +814,14 @@ def watch_local_trainers(procs, nranks):
         raise
     except SystemExit:
         logger.error(
-            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
-            format(nranks, error_rank))
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log."
+            .format(nranks, error_rank))
         terminate_local_procs(procs)
         raise
     except:
         logger.error(
-            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
-            format(nranks, error_rank))
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log."
+            .format(nranks, error_rank))
         terminate_local_procs(procs)
         raise
 
diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py
index 3a9af812add6e..64d59b04864ba 100644
--- a/python/paddle/distribution/__init__.py
+++ b/python/paddle/distribution/__init__.py
@@ -28,18 +28,9 @@
 from paddle.distribution.uniform import Uniform
 
 __all__ = [  # noqa
-    'Beta',
-    'Categorical',
-    'Dirichlet',
-    'Distribution',
-    'ExponentialFamily',
-    'Multinomial',
-    'Normal',
-    'Uniform',
-    'kl_divergence',
-    'register_kl',
-    'Independent',
-    'TransformedDistribution'
+    'Beta', 'Categorical', 'Dirichlet', 'Distribution', 'ExponentialFamily',
+    'Multinomial', 'Normal', 'Uniform', 'kl_divergence', 'register_kl',
+    'Independent', 'TransformedDistribution'
 ]
 
 __all__.extend(transform.__all__)
diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py
index 97a3df490b1d0..fffcd94ad680e 100644
--- a/python/paddle/distribution/categorical.py
+++ b/python/paddle/distribution/categorical.py
@@ -162,8 +162,8 @@ def sample(self, shape):
             sample_shape = shape
             logits = self.logits
 
-        sample_index = multinomial(
-            self._logits_to_probs(logits), num_samples, True)
+        sample_index = multinomial(self._logits_to_probs(logits), num_samples,
+                                   True)
 
         # multinomial sample shape is (logits.shape[:-1], num_samples), need to
         # tanspose to (num_samples, logits.shape[:-1])
@@ -220,11 +220,12 @@ def kl_divergence(self, other):
         z = paddle.sum(e_logits, axis=-1, keepdim=True)
         other_z = paddle.sum(other_e_logits, axis=-1, keepdim=True)
         prob = e_logits / z
-        kl = paddle.sum(prob * (
-            logits - paddle.log(z) - other_logits + paddle.log(other_z)),
-                        axis=-1,
-                        keepdim=True,
-                        name=name)
+        kl = paddle.sum(
+            prob *
+            (logits - paddle.log(z) - other_logits + paddle.log(other_z)),
+            axis=-1,
+            keepdim=True,
+            name=name)
 
         return kl
 
@@ -300,17 +301,16 @@ def probs(self, value):
         """
         name = self.name + '_probs'
         if len(self._prob.shape) == 1:  # batch_shape is empty
-            return paddle.gather(
-                self._prob, value.reshape(
-                    [-1], name=name), name=name).reshape(
-                        value.shape, name=name)
+            return paddle.gather(self._prob,
+                                 value.reshape([-1], name=name),
+                                 name=name).reshape(value.shape, name=name)
         else:
             if len(value.shape) == 1:
                 return paddle.take_along_axis(
                     self._prob,
-                    paddle.reshape(
-                        value, (len(self._prob.shape) - 1) * [1] + [-1],
-                        name=name),
+                    paddle.reshape(value,
+                                   (len(self._prob.shape) - 1) * [1] + [-1],
+                                   name=name),
                     axis=-1)
             else:
                 return paddle.take_along_axis(self._prob, value, axis=-1)
diff --git a/python/paddle/distribution/constraint.py b/python/paddle/distribution/constraint.py
index d094a7607da96..4cde3d30a565c 100644
--- a/python/paddle/distribution/constraint.py
+++ b/python/paddle/distribution/constraint.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,11 +23,13 @@ def __call__(self, value):
 
 
 class Real(Constraint):
+
     def __call__(self, value):
         return value == value
 
 
 class Range(Constraint):
+
     def __init__(self, lower, upper):
         self._lower = lower
         self._upper = upper
@@ -38,14 +40,16 @@ def __call__(self, value):
 
 
 class Positive(Constraint):
+
     def __call__(self, value):
         return value >= 0.
 
 
 class Simplex(Constraint):
+
     def __call__(self, value):
-        return paddle.all(value >= 0, axis=-1) and (
-            (value.sum(-1) - 1).abs() < 1e-6)
+        return paddle.all(value >= 0,
+                          axis=-1) and ((value.sum(-1) - 1).abs() < 1e-6)
 
 
 real = Real()
diff --git a/python/paddle/distribution/dirichlet.py b/python/paddle/distribution/dirichlet.py
index 740f850b7c1da..63466bda7c0af 100644
--- a/python/paddle/distribution/dirichlet.py
+++ b/python/paddle/distribution/dirichlet.py
@@ -125,8 +125,8 @@ def log_prob(self, value):
         Args:
             value (Tensor): Value to be evaluated.
         """
-        return ((paddle.log(value) * (self.concentration - 1.0)
-                 ).sum(-1) + paddle.lgamma(self.concentration.sum(-1)) -
+        return ((paddle.log(value) * (self.concentration - 1.0)).sum(-1) +
+                paddle.lgamma(self.concentration.sum(-1)) -
                 paddle.lgamma(self.concentration).sum(-1))
 
     def entropy(self):
@@ -139,9 +139,9 @@ def entropy(self):
         k = self.concentration.shape[-1]
         return (paddle.lgamma(self.concentration).sum(-1) -
                 paddle.lgamma(concentration0) -
-                (k - concentration0) * paddle.digamma(concentration0) - (
-                    (self.concentration - 1.0
-                     ) * paddle.digamma(self.concentration)).sum(-1))
+                (k - concentration0) * paddle.digamma(concentration0) -
+                ((self.concentration - 1.0) *
+                 paddle.digamma(self.concentration)).sum(-1))
 
     @property
     def _natural_parameters(self):
@@ -164,9 +164,8 @@ def _dirichlet(concentration, name=None):
         helper = LayerHelper(op_type, **locals())
         out = helper.create_variable_for_type_inference(
             dtype=concentration.dtype)
-        helper.append_op(
-            type=op_type,
-            inputs={"Alpha": concentration},
-            outputs={'Out': out},
-            attrs={})
+        helper.append_op(type=op_type,
+                         inputs={"Alpha": concentration},
+                         outputs={'Out': out},
+                         attrs={})
         return out
diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py
index 1c8edfa138d2e..901f5e88e0c2f 100644
--- a/python/paddle/distribution/distribution.py
+++ b/python/paddle/distribution/distribution.py
@@ -177,8 +177,8 @@ def _to_tensor(self, *args):
                 arg = [arg]
             if not isinstance(arg, (list, tuple, np.ndarray, tensor.Variable)):
                 raise TypeError(
-                    "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}".
-                    format(type(arg)))
+                    "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}"
+                    .format(type(arg)))
 
             arg_np = np.array(arg)
             arg_dtype = arg_np.dtype
diff --git a/python/paddle/distribution/exponential_family.py b/python/paddle/distribution/exponential_family.py
index e0236f9e6e2be..b78e77497043b 100644
--- a/python/paddle/distribution/exponential_family.py
+++ b/python/paddle/distribution/exponential_family.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -62,8 +62,9 @@ def entropy(self):
         log_norm = self._log_normalizer(*natural_parameters)
 
         if _non_static_mode():
-            grads = paddle.grad(
-                log_norm.sum(), natural_parameters, create_graph=True)
+            grads = paddle.grad(log_norm.sum(),
+                                natural_parameters,
+                                create_graph=True)
         else:
             grads = paddle.static.gradients(log_norm.sum(), natural_parameters)
 
diff --git a/python/paddle/distribution/independent.py b/python/paddle/distribution/independent.py
index 3534a31591b27..884c34b4b6adb 100644
--- a/python/paddle/distribution/independent.py
+++ b/python/paddle/distribution/independent.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -51,7 +51,8 @@ class Independent(distribution.Distribution):
     def __init__(self, base, reinterpreted_batch_rank):
         if not isinstance(base, distribution.Distribution):
             raise TypeError(
-                f"Expected type of 'base' is Distribution, but got {type(base)}")
+                f"Expected type of 'base' is Distribution, but got {type(base)}"
+            )
         if not (0 < reinterpreted_batch_rank <= len(base.batch_shape)):
             raise ValueError(
                 f"Expected 0 < reinterpreted_batch_rank <= {len(base.batch_shape)}, but got {reinterpreted_batch_rank}"
@@ -60,11 +61,11 @@ def __init__(self, base, reinterpreted_batch_rank):
         self._reinterpreted_batch_rank = reinterpreted_batch_rank
 
         shape = base.batch_shape + base.event_shape
-        super(Independent, self).__init__(
-            batch_shape=shape[:len(base.batch_shape) -
-                              reinterpreted_batch_rank],
-            event_shape=shape[len(base.batch_shape) -
-                              reinterpreted_batch_rank:])
+        super(Independent,
+              self).__init__(batch_shape=shape[:len(base.batch_shape) -
+                                               reinterpreted_batch_rank],
+                             event_shape=shape[len(base.batch_shape) -
+                                               reinterpreted_batch_rank:])
 
     @property
     def mean(self):
@@ -78,8 +79,8 @@ def sample(self, shape=()):
         return self._base.sample(shape)
 
     def log_prob(self, value):
-        return self._sum_rightmost(
-            self._base.log_prob(value), self._reinterpreted_batch_rank)
+        return self._sum_rightmost(self._base.log_prob(value),
+                                   self._reinterpreted_batch_rank)
 
     def prob(self, value):
         return self.log_prob(value).exp()
diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py
index 6310214117e9d..c5ad3f04358dc 100644
--- a/python/paddle/distribution/kl.py
+++ b/python/paddle/distribution/kl.py
@@ -83,8 +83,8 @@ def register_kl(cls_p, cls_q):
             def kl_beta_beta():
                 pass # insert implementation here
     """
-    if (not issubclass(cls_p, Distribution) or
-            not issubclass(cls_q, Distribution)):
+    if (not issubclass(cls_p, Distribution)
+            or not issubclass(cls_q, Distribution)):
         raise TypeError('cls_p and cls_q must be subclass of Distribution')
 
     def decorator(f):
@@ -117,6 +117,7 @@ def _dispatch(cls_p, cls_q):
 
 @functools.total_ordering
 class _Compare(object):
+
     def __init__(self, *classes):
         self.classes = classes
 
@@ -136,20 +137,20 @@ def __le__(self, other):
 def _kl_beta_beta(p, q):
     return ((q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma()) -
             (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma()) +
-            ((p.alpha - q.alpha) * p.alpha.digamma()) + (
-                (p.beta - q.beta) * p.beta.digamma()) + (
-                    ((q.alpha + q.beta) -
-                     (p.alpha + p.beta)) * (p.alpha + p.beta).digamma()))
+            ((p.alpha - q.alpha) * p.alpha.digamma()) +
+            ((p.beta - q.beta) * p.beta.digamma()) +
+            (((q.alpha + q.beta) - (p.alpha + p.beta)) *
+             (p.alpha + p.beta).digamma()))
 
 
 @register_kl(Dirichlet, Dirichlet)
 def _kl_dirichlet_dirichlet(p, q):
     return (
         (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma()) -
-        ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1)) + (
-            ((p.concentration - q.concentration) *
-             (p.concentration.digamma() -
-              p.concentration.sum(-1).digamma().unsqueeze(-1))).sum(-1)))
+        ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1)) +
+        (((p.concentration - q.concentration) *
+          (p.concentration.digamma() -
+           p.concentration.sum(-1).digamma().unsqueeze(-1))).sum(-1)))
 
 
 @register_kl(Categorical, Categorical)
@@ -186,15 +187,15 @@ def _kl_expfamily_expfamily(p, q):
 
     try:
         if _non_static_mode():
-            p_grads = paddle.grad(
-                p_log_norm, p_natural_params, create_graph=True)
+            p_grads = paddle.grad(p_log_norm,
+                                  p_natural_params,
+                                  create_graph=True)
         else:
             p_grads = paddle.static.gradients(p_log_norm, p_natural_params)
     except RuntimeError as e:
         raise TypeError(
-            "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q}).".
-            format(
-                cls_p=type(p).__name__, cls_q=type(q).__name__)) from e
+            "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q})."
+            .format(cls_p=type(p).__name__, cls_q=type(q).__name__)) from e
 
     kl = q._log_normalizer(*q_natural_params) - p_log_norm
     for p_param, q_param, p_grad in zip(p_natural_params, q_natural_params,
diff --git a/python/paddle/distribution/multinomial.py b/python/paddle/distribution/multinomial.py
index 837eb53eab1ea..424ec4b120d1b 100644
--- a/python/paddle/distribution/multinomial.py
+++ b/python/paddle/distribution/multinomial.py
@@ -145,9 +145,11 @@ def sample(self, shape=()):
         if not isinstance(shape, Iterable):
             raise TypeError('sample shape must be Iterable object.')
 
-        samples = self._categorical.sample([self.total_count, ] + list(shape))
-        return paddle.nn.functional.one_hot(
-            samples, self.probs.shape[-1]).cast(self.probs.dtype).sum(0)
+        samples = self._categorical.sample([
+            self.total_count,
+        ] + list(shape))
+        return paddle.nn.functional.one_hot(samples, self.probs.shape[-1]).cast(
+            self.probs.dtype).sum(0)
 
     def entropy(self):
         """entropy of multinomial distribution
@@ -155,16 +157,18 @@ def entropy(self):
         Returns:
             Tensor: entropy value
         """
-        n = paddle.full(
-            shape=[1], fill_value=self.total_count, dtype=self.probs.dtype)
+        n = paddle.full(shape=[1],
+                        fill_value=self.total_count,
+                        dtype=self.probs.dtype)
         support = paddle.arange(
-            self.total_count + 1, dtype=self.probs.dtype).reshape((-1, ) + (
-                1, ) * len(self.probs.shape))[1:]
+            self.total_count + 1,
+            dtype=self.probs.dtype).reshape((-1, ) +
+                                            (1, ) * len(self.probs.shape))[1:]
 
         binomial_pmf = paddle.exp(self._binomial_logpmf(n, support))
 
-        return ((n * self._categorical.entropy() - paddle.lgamma(n + 1)) + (
-            (binomial_pmf * paddle.lgamma(support + 1)).sum([0, -1])))
+        return ((n * self._categorical.entropy() - paddle.lgamma(n + 1)) +
+                ((binomial_pmf * paddle.lgamma(support + 1)).sum([0, -1])))
 
     def _binomial_logpmf(self, count, value):
         logits = self._probs_to_logits(self.probs, is_binary=True)
@@ -173,8 +177,9 @@ def _binomial_logpmf(self, count, value):
         factor_k = paddle.lgamma(value + 1)
         factor_nmk = paddle.lgamma(count - value + 1)
 
-        norm = (count * _clip_by_zero(logits) + count *
-                paddle.log1p(paddle.exp(-paddle.abs(logits))) - factor_n)
+        norm = (count * _clip_by_zero(logits) +
+                count * paddle.log1p(paddle.exp(-paddle.abs(logits))) -
+                factor_n)
 
         return value * logits - factor_k - factor_nmk - norm
 
diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py
index 51a180271c63b..71bc98a72de4b 100644
--- a/python/paddle/distribution/normal.py
+++ b/python/paddle/distribution/normal.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -115,13 +115,11 @@ def __init__(self, loc, scale, name=None):
         else:
             if isinstance(loc, float) and isinstance(scale, float):
                 self.all_arg_is_float = True
-            if isinstance(
-                    loc,
-                    np.ndarray) and str(loc.dtype) in ['float32', 'float64']:
+            if isinstance(loc, np.ndarray) and str(
+                    loc.dtype) in ['float32', 'float64']:
                 self.dtype = loc.dtype
-            elif isinstance(
-                    scale,
-                    np.ndarray) and str(scale.dtype) in ['float32', 'float64']:
+            elif isinstance(scale, np.ndarray) and str(
+                    scale.dtype) in ['float32', 'float64']:
                 self.dtype = scale.dtype
             # pylint: disable=unbalanced-tuple-unpacking
             self.loc, self.scale = self._to_tensor(loc, scale)
@@ -154,8 +152,11 @@ def sample(self, shape, seed=0):
                 self.loc + self.scale, batch_shape + shape, self.dtype, 0.)
             zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
             zero_tmp_shape = nn.shape(zero_tmp_reshape)
-            normal_random_tmp = nn.gaussian_random(
-                zero_tmp_shape, mean=0., std=1., seed=seed, dtype=self.dtype)
+            normal_random_tmp = nn.gaussian_random(zero_tmp_shape,
+                                                   mean=0.,
+                                                   std=1.,
+                                                   seed=seed,
+                                                   dtype=self.dtype)
             output = normal_random_tmp * (zero_tmp_reshape + self.scale)
             output = elementwise_add(output, self.loc, name=name)
             return output
@@ -188,12 +189,13 @@ def entropy(self):
         """
         name = self.name + '_entropy'
         batch_shape = list((self.loc + self.scale).shape)
-        zero_tmp = tensor.fill_constant_batch_size_like(
-            self.loc + self.scale, batch_shape, self.dtype, 0.)
-        return elementwise_add(
-            0.5 + zero_tmp,
-            0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
-            name=name)
+        zero_tmp = tensor.fill_constant_batch_size_like(self.loc + self.scale,
+                                                        batch_shape, self.dtype,
+                                                        0.)
+        return elementwise_add(0.5 + zero_tmp,
+                               0.5 * math.log(2 * math.pi) + nn.log(
+                                   (self.scale + zero_tmp)),
+                               name=name)
 
     def log_prob(self, value):
         """Log probability density/mass function.
@@ -210,10 +212,10 @@ def log_prob(self, value):
 
         var = self.scale * self.scale
         log_scale = nn.log(self.scale)
-        return elementwise_sub(
-            -1. * ((value - self.loc) * (value - self.loc)) / (2. * var),
-            log_scale + math.log(math.sqrt(2. * math.pi)),
-            name=name)
+        return elementwise_sub(-1. * ((value - self.loc) * (value - self.loc)) /
+                               (2. * var),
+                               log_scale + math.log(math.sqrt(2. * math.pi)),
+                               name=name)
 
     def probs(self, value):
         """Probability density/mass function.
@@ -229,10 +231,10 @@ def probs(self, value):
         value = self._check_values_dtype_in_probs(self.loc, value)
 
         var = self.scale * self.scale
-        return elementwise_div(
-            ops.exp(-1. * ((value - self.loc) * (value - self.loc)) /
-                    (2. * var)), (math.sqrt(2 * math.pi) * self.scale),
-            name=name)
+        return elementwise_div(ops.exp(-1. * ((value - self.loc) *
+                                              (value - self.loc)) / (2. * var)),
+                               (math.sqrt(2 * math.pi) * self.scale),
+                               name=name)
 
     def kl_divergence(self, other):
         r"""The KL-divergence between two normal distributions.
@@ -275,5 +277,6 @@ def kl_divergence(self, other):
         var_ratio = (var_ratio * var_ratio)
         t1 = (self.loc - other.loc) / other.scale
         t1 = (t1 * t1)
-        return elementwise_add(
-            0.5 * var_ratio, 0.5 * (t1 - 1. - nn.log(var_ratio)), name=name)
+        return elementwise_add(0.5 * var_ratio,
+                               0.5 * (t1 - 1. - nn.log(var_ratio)),
+                               name=name)
diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py
index 31b1dedbc5fb3..3fabd27ec3401 100644
--- a/python/paddle/distribution/transform.py
+++ b/python/paddle/distribution/transform.py
@@ -25,19 +25,10 @@
                                  transformed_distribution, variable)
 
 __all__ = [  # noqa
-    'Transform',
-    'AbsTransform',
-    'AffineTransform',
-    'ChainTransform',
-    'ExpTransform',
-    'IndependentTransform',
-    'PowerTransform',
-    'ReshapeTransform',
-    'SigmoidTransform',
-    'SoftmaxTransform',
-    'StackTransform',
-    'StickBreakingTransform',
-    'TanhTransform'
+    'Transform', 'AbsTransform', 'AffineTransform', 'ChainTransform',
+    'ExpTransform', 'IndependentTransform', 'PowerTransform',
+    'ReshapeTransform', 'SigmoidTransform', 'SoftmaxTransform',
+    'StackTransform', 'StickBreakingTransform', 'TanhTransform'
 ]
 
 
@@ -147,8 +138,8 @@ def __call__(self, input):
             [Tensor|TransformedDistribution|ChainTransform]: The return value.
         """
         if isinstance(input, distribution.Distribution):
-            return transformed_distribution.TransformedDistribution(input,
-                                                                    [self])
+            return transformed_distribution.TransformedDistribution(
+                input, [self])
         if isinstance(input, Transform):
             return ChainTransform([self, input])
         return self.forward(x)
@@ -207,8 +198,8 @@ def forward_log_det_jacobian(self, x):
         if not isinstance(x, paddle.fluid.framework.Variable):
             raise TypeError(
                 f"Expected 'y' is a Tensor or Real, but got {type(x)}.")
-        if isinstance(x, paddle.fluid.framework.Variable) and x.dim(
-        ) < self._domain.event_rank:
+        if isinstance(x, paddle.fluid.framework.Variable
+                      ) and x.dim() < self._domain.event_rank:
             raise ValueError(
                 f'The dimensions of x({x.dim()}) should be '
                 f'grater than or equal to {self._domain.event_rank}')
@@ -536,9 +527,8 @@ def _forward_log_det_jacobian(self, x):
         value = 0.
         event_rank = self._domain.event_rank
         for t in self.transforms:
-            value += self._sum_rightmost(
-                t.forward_log_det_jacobian(x),
-                event_rank - t._domain.event_rank)
+            value += self._sum_rightmost(t.forward_log_det_jacobian(x),
+                                         event_rank - t._domain.event_rank)
             x = t.forward(x)
             event_rank += t._codomain.event_rank - t._domain.event_rank
         return value
diff --git a/python/paddle/distribution/transformed_distribution.py b/python/paddle/distribution/transformed_distribution.py
index 2d7aa5886ae24..ce386971e5fcc 100644
--- a/python/paddle/distribution/transformed_distribution.py
+++ b/python/paddle/distribution/transformed_distribution.py
@@ -112,8 +112,8 @@ def log_prob(self, value):
                 _sum_rightmost(t.forward_log_det_jacobian(
                     x), event_rank-t._domain.event_rank)
             y = x
-        log_prob += _sum_rightmost(
-            self._base.log_prob(y), event_rank - len(self._base.event_shape))
+        log_prob += _sum_rightmost(self._base.log_prob(y),
+                                   event_rank - len(self._base.event_shape))
         return log_prob
 
 
diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py
index 5957dab14ef38..cbc83eba625cd 100644
--- a/python/paddle/distribution/uniform.py
+++ b/python/paddle/distribution/uniform.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -116,13 +116,11 @@ def __init__(self, low, high, name=None):
         else:
             if isinstance(low, float) and isinstance(high, float):
                 self.all_arg_is_float = True
-            if isinstance(
-                    low,
-                    np.ndarray) and str(low.dtype) in ['float32', 'float64']:
+            if isinstance(low, np.ndarray) and str(
+                    low.dtype) in ['float32', 'float64']:
                 self.dtype = low.dtype
-            elif isinstance(
-                    high,
-                    np.ndarray) and str(high.dtype) in ['float32', 'float64']:
+            elif isinstance(high, np.ndarray) and str(
+                    high.dtype) in ['float32', 'float64']:
                 self.dtype = high.dtype
             # pylint: disable=unbalanced-tuple-unpacking
             self.low, self.high = self._to_tensor(low, high)
@@ -161,16 +159,16 @@ def sample(self, shape, seed=0):
             zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
             uniform_random_tmp_reshape = nn.reshape(uniform_random_tmp,
                                                     output_shape)
-            output = uniform_random_tmp_reshape * (
-                zero_tmp_reshape + self.high - self.low)
+            output = uniform_random_tmp_reshape * (zero_tmp_reshape +
+                                                   self.high - self.low)
             output = elementwise_add(output, self.low, name=name)
             return output
         else:
             output_shape = shape + batch_shape
             output = nn.uniform_random(
                 output_shape, dtype=self.dtype, min=0., max=1.,
-                seed=seed) * (tensor.zeros(
-                    output_shape, dtype=self.dtype) + (self.high - self.low))
+                seed=seed) * (tensor.zeros(output_shape, dtype=self.dtype) +
+                              (self.high - self.low))
             output = elementwise_add(output, self.low, name=name)
             if self.all_arg_is_float:
                 return nn.reshape(output, shape, name=name)
@@ -204,8 +202,9 @@ def log_prob(self, value):
         ub_bool = value < self.high
         lb = tensor.cast(lb_bool, dtype=value.dtype)
         ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_sub(
-            nn.log(lb * ub), nn.log(self.high - self.low), name=name)
+        return elementwise_sub(nn.log(lb * ub),
+                               nn.log(self.high - self.low),
+                               name=name)
 
     def probs(self, value):
         """Probability density/mass function.
diff --git a/python/paddle/distribution/variable.py b/python/paddle/distribution/variable.py
index 6ece1c3a1d83e..b5c3d71d3faa8 100644
--- a/python/paddle/distribution/variable.py
+++ b/python/paddle/distribution/variable.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -43,11 +43,13 @@ def constraint(self, value):
 
 
 class Real(Variable):
+
     def __init__(self, event_rank=0):
         super(Real, self).__init__(False, event_rank, constraint.real)
 
 
 class Positive(Variable):
+
     def __init__(self, event_rank=0):
         super(Positive, self).__init__(False, event_rank, constraint.positive)
 
@@ -64,8 +66,9 @@ class Independent(Variable):
     def __init__(self, base, reinterpreted_batch_rank):
         self._base = base
         self._reinterpreted_batch_rank = reinterpreted_batch_rank
-        super(Independent, self).__init__(
-            base.is_discrete, base.event_rank + reinterpreted_batch_rank)
+        super(Independent,
+              self).__init__(base.is_discrete,
+                             base.event_rank + reinterpreted_batch_rank)
 
     def constraint(self, value):
         ret = self._base.constraint(value)
@@ -73,11 +76,13 @@ def constraint(self, value):
             raise ValueError(
                 "Input dimensions must be equal or grater than  {}".format(
                     self._reinterpreted_batch_rank))
-        return ret.reshape(ret.shape[:ret.dim() - self.reinterpreted_batch_rank]
-                           + (-1, )).all(-1)
+        return ret.reshape(ret.shape[:ret.dim() -
+                                     self.reinterpreted_batch_rank] +
+                           (-1, )).all(-1)
 
 
 class Stack(Variable):
+
     def __init__(self, vars, axis=0):
         self._vars = vars
         self._axis = axis
diff --git a/python/paddle/fft.py b/python/paddle/fft.py
index 10d637ff8b9ba..f44111cb76618 100644
--- a/python/paddle/fft.py
+++ b/python/paddle/fft.py
@@ -104,8 +104,8 @@ def _check_fft_axes(x, axes):
     for axis in axes:
         if not isinstance(axis, int) or axis < -ndim or axis >= ndim:
             raise ValueError(
-                "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})".
-                format(axes, axis, ndim, ndim))
+                "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})"
+                .format(axes, axis, ndim, ndim))
 
 
 def _resize_fft_input(x, s, axes):
@@ -127,11 +127,10 @@ def _resize_fft_input(x, s, axes):
             slices.append((0, s[i]))
 
     if axes_to_slice:
-        x = paddle.slice(
-            x,
-            axes_to_slice,
-            starts=[item[0] for item in slices],
-            ends=[item[1] for item in slices])
+        x = paddle.slice(x,
+                         axes_to_slice,
+                         starts=[item[0] for item in slices],
+                         ends=[item[1] for item in slices])
     if axes_to_pad:
         padding_widths = [0] * (2 * ndim)
         for axis, pad in zip(axes_to_pad, paddings):
@@ -198,8 +197,13 @@ def fft(x, n=None, axis=-1, norm="backward", name=None):
 
     """
     if is_integer(x) or is_floating_point(x):
-        return fft_r2c(
-            x, n, axis, norm, forward=True, onesided=False, name=name)
+        return fft_r2c(x,
+                       n,
+                       axis,
+                       norm,
+                       forward=True,
+                       onesided=False,
+                       name=name)
     else:
         return fft_c2c(x, n, axis, norm, forward=True, name=name)
 
@@ -262,8 +266,13 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None):
 
     """
     if is_integer(x) or is_floating_point(x):
-        return fft_r2c(
-            x, n, axis, norm, forward=False, onesided=False, name=name)
+        return fft_r2c(x,
+                       n,
+                       axis,
+                       norm,
+                       forward=False,
+                       onesided=False,
+                       name=name)
     else:
         return fft_c2c(x, n, axis, norm, forward=False, name=name)
 
@@ -523,8 +532,13 @@ def fftn(x, s=None, axes=None, norm="backward", name=None):
             #   [-8.-8.j  0.+0.j  0.+0.j  0.-0.j]]]
     """
     if is_integer(x) or is_floating_point(x):
-        return fftn_r2c(
-            x, s, axes, norm, forward=True, onesided=False, name=name)
+        return fftn_r2c(x,
+                        s,
+                        axes,
+                        norm,
+                        forward=True,
+                        onesided=False,
+                        name=name)
     else:
         return fftn_c2c(x, s, axes, norm, forward=True, name=name)
 
@@ -587,8 +601,13 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None):
 
     """
     if is_integer(x) or is_floating_point(x):
-        return fftn_r2c(
-            x, s, axes, norm, forward=False, onesided=False, name=name)
+        return fftn_r2c(x,
+                        s,
+                        axes,
+                        norm,
+                        forward=False,
+                        onesided=False,
+                        name=name)
     else:
         return fftn_c2c(x, s, axes, norm, forward=False, name=name)
 
@@ -878,13 +897,13 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
-                format(s))
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers."
+                .format(s))
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
             raise ValueError(
-                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
-                format(axes))
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
+                .format(axes))
     return fftn(x, s, axes, norm, name)
 
 
@@ -948,13 +967,13 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
-                format(s))
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers."
+                .format(s))
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
             raise ValueError(
-                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
-                format(axes))
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
+                .format(axes))
     return ifftn(x, s, axes, norm, name)
 
 
@@ -1002,13 +1021,13 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
-                format(s))
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers."
+                .format(s))
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
             raise ValueError(
-                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
-                format(axes))
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
+                .format(axes))
     return rfftn(x, s, axes, norm, name)
 
 
@@ -1054,13 +1073,13 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
-                format(s))
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers."
+                .format(s))
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
             raise ValueError(
-                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
-                format(axes))
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
+                .format(axes))
     return irfftn(x, s, axes, norm, name)
 
 
@@ -1107,13 +1126,13 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
-                format(s))
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers."
+                .format(s))
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
             raise ValueError(
-                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
-                format(axes))
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
+                .format(axes))
     return hfftn(x, s, axes, norm, name)
 
 
@@ -1159,13 +1178,13 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".
-                format(s))
+                "Invalid FFT argument s ({}), it should be a sequence of 2 integers."
+                .format(s))
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
             raise ValueError(
-                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".
-                format(axes))
+                "Invalid FFT argument axes ({}), it should be a sequence of 2 integers."
+                .format(axes))
     return ihfftn(x, s, axes, norm, name)
 
 
@@ -1377,14 +1396,18 @@ def fft_c2c(x, n, axis, norm, forward, name):
         attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
         out = getattr(_C_ops, op_type)(x, *attrs)
     else:
-        inputs = {'X': [x], }
+        inputs = {
+            'X': [x],
+        }
         attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
         outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
     return out
 
 
@@ -1408,7 +1431,9 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
                  'onesided', onesided)
         out = getattr(_C_ops, op_type)(x, *attrs)
     else:
-        inputs = {'X': [x], }
+        inputs = {
+            'X': [x],
+        }
         attrs = {
             'axes': axes,
             'normalization': norm,
@@ -1420,8 +1445,10 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
         out = helper.create_variable_for_type_inference(
             _real_to_complex_dtype(dtype))
         outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
     return out
 
 
@@ -1450,7 +1477,9 @@ def fft_c2r(x, n, axis, norm, forward, name):
             attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
         out = getattr(_C_ops, op_type)(x, *attrs)
     else:
-        inputs = {'X': [x], }
+        inputs = {
+            'X': [x],
+        }
         attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
         if n is not None:
             attrs['last_dim_size'] = n
@@ -1459,8 +1488,10 @@ def fft_c2r(x, n, axis, norm, forward, name):
         out = helper.create_variable_for_type_inference(
             _complex_to_real_dtype(dtype))
         outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
     return out
 
 
@@ -1501,14 +1532,18 @@ def fftn_c2c(x, s, axes, norm, forward, name):
         attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
         out = getattr(_C_ops, op_type)(x, *attrs)
     else:
-        inputs = {'X': [x], }
+        inputs = {
+            'X': [x],
+        }
         attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
         outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
     return out
 
 
@@ -1549,7 +1584,9 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
                  'onesided', onesided)
         out = getattr(_C_ops, op_type)(x, *attrs)
     else:
-        inputs = {'X': [x], }
+        inputs = {
+            'X': [x],
+        }
         attrs = {
             'axes': axes,
             'normalization': norm,
@@ -1561,8 +1598,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name):
         out = helper.create_variable_for_type_inference(
             _real_to_complex_dtype(dtype))
         outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
 
     return out
 
@@ -1611,7 +1650,9 @@ def fftn_c2r(x, s, axes, norm, forward, name):
             attrs = ('axes', axes, 'normalization', norm, 'forward', forward)
         out = getattr(_C_ops, op_type)(x, *attrs)
     else:
-        inputs = {'X': [x], }
+        inputs = {
+            'X': [x],
+        }
         attrs = {'axes': axes, 'normalization': norm, 'forward': forward}
         if s:
             attrs["last_dim_size"] = s[-1]
@@ -1620,6 +1661,8 @@ def fftn_c2r(x, s, axes, norm, forward, name):
         out = helper.create_variable_for_type_inference(
             _complex_to_real_dtype(dtype))
         outputs = {"Out": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
     return out
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 8dbeb3eeb27c3..aa07124ad4981 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -23,8 +23,8 @@
 if os.name == 'nt':
     core_suffix = 'pyd'
 
-legacy_core = os.path.abspath(os.path.dirname(
-    __file__)) + os.sep + 'core.' + core_suffix
+legacy_core = os.path.abspath(
+    os.path.dirname(__file__)) + os.sep + 'core.' + core_suffix
 if os.path.exists(legacy_core):
     sys.stderr.write('Deleting legacy file ' + legacy_core + '\n')
     try:
@@ -159,8 +159,8 @@ def __bootstrap__():
     import platform
     from . import core
 
-    # NOTE(zhiqiu): When (1)numpy < 1.19; (2) python < 3.7, 
-    # unittest is always imported in numpy (maybe some versions not). 
+    # NOTE(zhiqiu): When (1)numpy < 1.19; (2) python < 3.7,
+    # unittest is always imported in numpy (maybe some versions not).
     # so is_test is True and p2p is not inited.
     in_test = 'unittest' in sys.modules
 
@@ -170,12 +170,11 @@ def __bootstrap__():
         num_threads = 1
 
     if num_threads > 1:
-        print(
-            'WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation '
-            'speed will not be optimized if you use data parallel. It will '
-            'fail if this PaddlePaddle binary is compiled with OpenBlas since'
-            ' OpenBlas does not support multi-threads.'.format(num_threads),
-            file=sys.stderr)
+        print('WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation '
+              'speed will not be optimized if you use data parallel. It will '
+              'fail if this PaddlePaddle binary is compiled with OpenBlas since'
+              ' OpenBlas does not support multi-threads.'.format(num_threads),
+              file=sys.stderr)
         print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)
 
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
@@ -203,7 +202,7 @@ def remove_flag_if_exists(name):
         read_env_flags += []
 
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
-    # Note(zhouwei25): sys may not have argv in some cases, 
+    # Note(zhouwei25): sys may not have argv in some cases,
     # Such as: use Python/C API to call Python from C++
     try:
         core.init_glog(sys.argv[0])
diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py
index a7d64d37bc7a0..bb5f4cb84f665 100644
--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -29,8 +29,8 @@
 
 
 def _is_number_(var):
-    return isinstance(var, int) or isinstance(var, float) or (isinstance(
-        var, np.ndarray) and var.shape == (1, ))
+    return isinstance(var, int) or isinstance(
+        var, float) or (isinstance(var, np.ndarray) and var.shape == (1, ))
 
 
 def _is_number_or_matrix_(var):
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index ed3e0bc98ed6d..0ca69b5f94de4 100755
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -38,11 +38,13 @@
     'gradients',
 ]
 
-_logger = log_helper.get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = log_helper.get_logger(__name__,
+                                logging.INFO,
+                                fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class ProgramStats(object):
+
     def __init__(self, block, ops):
         self.block = block
         self.ops = ops
@@ -122,9 +124,9 @@ def is_amp_cast(op):
         updated_min_idx = min_idx
         while idx_ > pre_segment_end_idx:
             if is_amp_cast(self.ops[idx_]):
-                _logger.info("found amp-cast op: {}, : {}".format(self.ops[
-                    idx_].desc.type(), self.ops[idx_].desc.input_arg_names()[
-                        0]))
+                _logger.info("found amp-cast op: {}, : {}".format(
+                    self.ops[idx_].desc.type(),
+                    self.ops[idx_].desc.input_arg_names()[0]))
                 updated_min_idx = idx_
                 idx_ -= 1
             else:
@@ -137,8 +139,8 @@ def build_stats(self):
             self.op_deps[i] = {"in_ops": [], "out_ops": []}
             for j, name in enumerate(op.desc.input_arg_names()):
                 if name in self.var_op_deps:
-                    self.op_deps[i]["in_ops"].extend(self.var_op_deps[name][
-                        "var_as_output_ops"])
+                    self.op_deps[i]["in_ops"].extend(
+                        self.var_op_deps[name]["var_as_output_ops"])
             for j, name in enumerate(op.desc.input_arg_names()):
                 if name in self.var_op_deps:
                     self.var_op_deps[name]["var_as_input_ops"].extend([i])
@@ -209,16 +211,15 @@ def modify_forward_desc_for_recompute(self):
 
             # Setting the force_cpu of seed to true will make the output of seed in cpu memory,
             # reduce the synchronous copy from GPU to CPU in dropout, and reduce the communication hang
-            added_op = self.block._insert_op(
-                index=op.idx,
-                type='seed',
-                inputs={},
-                outputs={'Out': [added_var]},
-                attrs={
-                    'seed': seed,
-                    'op_device': op_device,
-                    'force_cpu': True
-                })
+            added_op = self.block._insert_op(index=op.idx,
+                                             type='seed',
+                                             inputs={},
+                                             outputs={'Out': [added_var]},
+                                             attrs={
+                                                 'seed': seed,
+                                                 'op_device': op_device,
+                                                 'force_cpu': True
+                                             })
             self.ops.insert(op_idx, added_op)
             # modify dropout op desc so that it accept a seed var as input
             op.desc.set_input("Seed", [var_unique_name])
@@ -287,8 +288,8 @@ def _add_descs_to_block(descs, block):
 def _find_loss_op_(loss):
     for op in reversed(loss.block.ops):
         assert isinstance(op, framework.Operator)
-        if len(op.output_arg_names) == 1 and op.output_arg_names[
-                0] == loss.name:
+        if len(op.output_arg_names
+               ) == 1 and op.output_arg_names[0] == loss.name:
             loss.op = op
             break
     if loss.op is None:
@@ -329,14 +330,16 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
         op_desc.set_input(
             para,
             list(
-                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
-                    args)))
+                map(
+                    lambda arg: arg.decode()
+                    if isinstance(arg, six.binary_type) else arg, args)))
     for para, args in six.iteritems(outputs):
         op_desc.set_output(
             para,
             list(
-                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
-                    args)))
+                map(
+                    lambda arg: arg.decode()
+                    if isinstance(arg, six.binary_type) else arg, args)))
 
     op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
     op_device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
@@ -358,12 +361,15 @@ def _create_loss_op_desc_(loss):
     op_desc = _create_op_desc_(
         "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
             "shape": [1],
-            "value": 1.0,
-            "dtype": loss.dtype,
-            "force_cpu": False,
+            "value":
+            1.0,
+            "dtype":
+            loss.dtype,
+            "force_cpu":
+            False,
             core.op_proto_and_checker_maker.kOpRoleAttrName():
-            int(core.op_proto_and_checker_maker.OpRole.Backward) |
-            int(core.op_proto_and_checker_maker.OpRole.Loss),
+            int(core.op_proto_and_checker_maker.OpRole.Backward)
+            | int(core.op_proto_and_checker_maker.OpRole.Loss),
             core.op_proto_and_checker_maker.kOpDeviceAttrName():
             loss.op.attr(core.op_proto_and_checker_maker.kOpDeviceAttrName())
         })
@@ -383,8 +389,8 @@ def _infer_var_data_type_shape_(grad_var_name, block):
     else:
         # TODO(jiabin): Maybe we should not to this to cause some unexpected error on dtype
         warnings.warn(
-            "Set grad var: {} dtype to default FP32, since we can't find its related forward var".
-            format(grad_var_name))
+            "Set grad var: {} dtype to default FP32, since we can't find its related forward var"
+            .format(grad_var_name))
         grad_var.set_dtype(core.VarDesc.VarType.FP32)
 
 
@@ -446,10 +452,11 @@ def _accumulate_gradients_by_sum_op_(var_name,
     if op_idx not in pending_sum_ops.keys():
         pending_sum_ops[op_idx] = []
     pending_sum_ops[op_idx].append(
-        _create_op_desc_("sum", {"X": renamed_vars[var_name]}, {
-            "Out": [var_name]
-        }, {"use_mkldnn": False,
-            "op_device": op_device}))
+        _create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                         {"Out": [var_name]}, {
+                             "use_mkldnn": False,
+                             "op_device": op_device
+                         }))
     renamed_vars[var_name] = [var_name]
 
 
@@ -472,10 +479,13 @@ def _accumulate_gradients_by_add_ops_(var_name,
         else:
             out_name = var_name
         pending_sum_ops[op_idx].append(
-            _create_op_desc_("grad_add", {"X": [x_name],
-                                          "Y": [y_name]}, {"Out": [out_name]},
-                             {"use_mkldnn": False,
-                              "op_device": op_device}))
+            _create_op_desc_("grad_add", {
+                "X": [x_name],
+                "Y": [y_name]
+            }, {"Out": [out_name]}, {
+                "use_mkldnn": False,
+                "op_device": op_device
+            }))
     renamed_vars[var_name] = [var_name]
 
 
@@ -585,13 +595,14 @@ def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None):
     for var_name, inputs in six.iteritems(renamed_vars):
         if len(renamed_vars[var_name]) > 1:
             if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
-                _accumulate_gradients_by_sum_op_(
-                    var_name, renamed_vars, pending_sum_ops,
-                    len(op_descs), var_device[var_name])
+                _accumulate_gradients_by_sum_op_(var_name, renamed_vars,
+                                                 pending_sum_ops, len(op_descs),
+                                                 var_device[var_name])
             else:
-                _accumulate_gradients_by_add_ops_(
-                    var_name, renamed_vars, pending_sum_ops,
-                    len(op_descs), var_device[var_name])
+                _accumulate_gradients_by_add_ops_(var_name,
+                                                  renamed_vars, pending_sum_ops,
+                                                  len(op_descs),
+                                                  var_device[var_name])
 
     # sum_op descs are sorted according to their insert position
     for key, value in collections.OrderedDict(
@@ -642,8 +653,9 @@ def _op_can_be_removed_(op_desc, no_grad_set):
                 x_in = _strip_grad_suffix_(arg)
                 # the reason should be: arg can be input of another grad op
                 # and the op is a not-to-remove op
-                to_insert.append((_create_op_desc_(
-                    "fill_zeros_like", {"X": [x_in]}, {"Out": [arg]}, {}), idx))
+                to_insert.append(
+                    (_create_op_desc_("fill_zeros_like", {"X": [x_in]},
+                                      {"Out": [arg]}, {}), idx))
 
     list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])
 
@@ -669,6 +681,7 @@ def _find_not_need_ops(grad_op_descs, forward_ops, input_grad_names_set):
     """
 
     class Var(object):
+
         def __init__(self, var_name):
             self.var_name = var_name
             self.gen_op = None
@@ -684,6 +697,7 @@ def add_pending_op(self, op):
             self.pendding_ops.append(op)
 
     class Op(object):
+
         def __init__(self, op_desc):
             self.op_desc = op_desc
             self.inputs = []
@@ -780,8 +794,9 @@ def serialize_op_decs(op_desc):
     return proto.__str__()
 
 
-def _append_backward_ops_with_checkpoints_(
-        block, ops, target_block, no_grad_dict, grad_to_var, checkpoints):
+def _append_backward_ops_with_checkpoints_(block, ops, target_block,
+                                           no_grad_dict, grad_to_var,
+                                           checkpoints):
     """
     Create grad ops with forward ops, and insert them into given block
 
@@ -867,15 +882,15 @@ def _append_backward_ops_with_checkpoints_(
 
     for i, (idx1, idx2) in enumerate(recompute_segments):
         _logger.info("recompute segment[{}]".format(i))
-        _logger.info("segment start op: [{}]: [{}]".format(ops[idx1].desc.type(
-        ), ops[idx1].desc.input_arg_names()))
-        _logger.info("segment end op: [{}]: [{}]".format(ops[
-            idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names()))
+        _logger.info("segment start op: [{}]: [{}]".format(
+            ops[idx1].desc.type(), ops[idx1].desc.input_arg_names()))
+        _logger.info("segment end op: [{}]: [{}]".format(
+            ops[idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names()))
         _logger.info("recompute segment[{}]".format(i))
-        _logger.info("segment start op: [{}]: [{}]".format(ops[idx1].desc.type(
-        ), ops[idx1].desc.input_arg_names()))
-        _logger.info("segment end op: [{}]: [{}]".format(ops[
-            idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names()))
+        _logger.info("segment start op: [{}]: [{}]".format(
+            ops[idx1].desc.type(), ops[idx1].desc.input_arg_names()))
+        _logger.info("segment end op: [{}]: [{}]".format(
+            ops[idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names()))
 
     # 2) go through all forward ops and induct all variables that will be hold in memory
     vars_should_be_hold = []
@@ -960,13 +975,12 @@ def _append_backward_ops_with_checkpoints_(
 
                     # we should create the rename var in subprog, otherwise its VarType will be BOOL
                     ref_var = block.program.global_block().var(name)
-                    block.create_var(
-                        name=var_name_dict[name],
-                        shape=ref_var.shape,
-                        dtype=ref_var.dtype,
-                        type=ref_var.type,
-                        persistable=ref_var.persistable,
-                        stop_gradient=ref_var.stop_gradient)
+                    block.create_var(name=var_name_dict[name],
+                                     shape=ref_var.shape,
+                                     dtype=ref_var.dtype,
+                                     type=ref_var.type,
+                                     persistable=ref_var.persistable,
+                                     stop_gradient=ref_var.stop_gradient)
 
         # 3.a. add ops in current recompute_segment as forward recomputation ops
         buffer_descs = _add_needed_descs_to_block(ff_ops, buffer_block, block,
@@ -1109,8 +1123,8 @@ def update_distop_context(distop_context, op_grad_to_var,
         for op_desc in grad_op_desc:
             assert op_desc.original_id(
             ) not in distop_context.grad_op_id_to_op_id
-            distop_context.grad_op_id_to_op_id[op_desc.original_id(
-            )] = op.desc.original_id()
+            distop_context.grad_op_id_to_op_id[
+                op_desc.original_id()] = op.desc.original_id()
 
     if callbacks is not None:
         assert (isinstance(callbacks, (list, tuple)))
@@ -1206,7 +1220,8 @@ def update_distop_context(distop_context, op_grad_to_var,
         # But this strategy is not suited for while op for some control flow,
         # for example, for while op, the grads maybe generated in next loop.
         if input_grad_names_set is not None:
-            is_grad_name = lambda name: name.find(core.grad_var_suffix()) != -1 or name in input_grad_names_set
+            is_grad_name = lambda name: name.find(core.grad_var_suffix(
+            )) != -1 or name in input_grad_names_set
             is_append_grad = False
             for op_desc in grad_op_desc:
                 input_grad_names = [
@@ -1356,8 +1371,8 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
             if grad_var_ins:
                 existing_grad_var_ins = [
                     var for var in grad_var_ins
-                    if block.desc.has_var_recursive(cpt.to_bytes(var)) or var in
-                    parent_op_vars
+                    if block.desc.has_var_recursive(cpt.to_bytes(var))
+                    or var in parent_op_vars
                 ]
                 if not existing_grad_var_ins:
                     '''
@@ -1458,8 +1473,8 @@ def _get_no_grad_set_name(no_grad_set):
                         % (type(no_grad_var)))
         else:
             raise TypeError(
-                "The type of no_grad_set should be set or list or tuple, but received {}".
-                format(type(no_grad_set)))
+                "The type of no_grad_set should be set or list or tuple, but received {}"
+                .format(type(no_grad_set)))
     return no_grad_set_name
 
 
@@ -1577,9 +1592,10 @@ def append_backward(loss,
         # the loss is from a cloned program. Find loss op manually.
         _find_loss_op_(loss)
 
-    loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
-                      int(core.op_proto_and_checker_maker.OpRole.Forward) |
-                      int(core.op_proto_and_checker_maker.OpRole.Loss))
+    loss.op._set_attr(
+        core.op_proto_and_checker_maker.kOpRoleAttrName(),
+        int(core.op_proto_and_checker_maker.OpRole.Forward)
+        | int(core.op_proto_and_checker_maker.OpRole.Loss))
 
     if callbacks is not None:
         check_type(callbacks, 'callbacks', (list, tuple),
@@ -1685,7 +1701,8 @@ def append_backward(loss,
                 callbacks,
                 input_grad_names_set=input_grad_names_set,
                 op_path_dict=op_path_dict,
-                distop_context=distop_context, )
+                distop_context=distop_context,
+            )
 
     grad_info_map = dict()
 
@@ -1710,8 +1727,8 @@ def append_backward(loss,
                    'fluid.backward.append_backward')
         parameters = []
         for i, param in enumerate(parameter_list):
-            check_type(param, 'parameter_list[%s]' % i, (framework.Variable,
-                                                         six.string_types),
+            check_type(param, 'parameter_list[%s]' % i,
+                       (framework.Variable, six.string_types),
                        'fluid.backward.append_backward')
             if isinstance(param, framework.Variable):
                 parameters.append(param.name)
@@ -1875,9 +1892,9 @@ def _find_op_path_(block,
     # All the inputs of the block are used if inputs is empty,
     if inputs:
         for i, op in enumerate(block.ops):
-            if _some_in_set_(
-                    op.desc.input_arg_names(),
-                    input_names) and core.has_non_empty_grad_op_maker(op.type):
+            if _some_in_set_(op.desc.input_arg_names(),
+                             input_names) and core.has_non_empty_grad_op_maker(
+                                 op.type):
                 for name in op.desc.output_arg_names():
                     if name not in no_grad_set:
                         input_names.add(name)
@@ -1889,14 +1906,14 @@ def _find_op_path_(block,
             sub_block_id = op._block_attr_id("sub_block")
             sub_block = block.program.block(sub_block_id)
             sub_block_target_names = output_names & set(op.output_arg_names)
-            sub_block_path = _get_sub_block_path(sub_block, op,
-                                                 set(), op_path_dict,
+            sub_block_path = _get_sub_block_path(sub_block, op, set(),
+                                                 op_path_dict,
                                                  sub_block_target_names)
             op_path_dict[sub_block_id] = sub_block_path
 
-        if _some_in_set_(
-                op.desc.output_arg_names(),
-                output_names) and core.has_non_empty_grad_op_maker(op.type):
+        if _some_in_set_(op.desc.output_arg_names(),
+                         output_names) and core.has_non_empty_grad_op_maker(
+                             op.type):
             for name in op.desc.input_arg_names():
                 if name not in no_grad_set:
                     output_names.add(name)
@@ -2000,8 +2017,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
                 raise ValueError("all targets must be in the same block")
             if target.shape != grad.shape:
                 raise ValueError(
-                    "The shapes of target and grad are different: %s %s" % (
-                        target.name, grad.name))
+                    "The shapes of target and grad are different: %s %s" %
+                    (target.name, grad.name))
             target_grad_map[_append_grad_suffix_(target.name)] = grad.name
             input_grad_names_set.add(grad.name)
             rename_var_map[grad_name] = grad.name
@@ -2031,15 +2048,14 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
     grad_to_var = dict()
     grad_info_map = dict()
-    _append_backward_ops_(
-        block,
-        op_path,
-        block,
-        no_grad_dict,
-        grad_to_var,
-        input_grad_names_set=input_grad_names_set,
-        op_path_dict=op_path_dict,
-        rename_var_map=rename_var_map)
+    _append_backward_ops_(block,
+                          op_path,
+                          block,
+                          no_grad_dict,
+                          grad_to_var,
+                          input_grad_names_set=input_grad_names_set,
+                          op_path_dict=op_path_dict,
+                          rename_var_map=rename_var_map)
 
     # Because calc_gradient may be called multiple times,
     # we need rename the internal gradient variables so that they have
@@ -2107,8 +2123,9 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
                'paddle.static.gradients')
     check_type(inputs, 'inputs', (framework.Variable, list, tuple),
                'paddle.static.gradients')
-    check_type(target_gradients, 'target_gradients', (
-        framework.Variable, list, tuple, type(None)), 'paddle.static.gradients')
+    check_type(target_gradients, 'target_gradients',
+               (framework.Variable, list, tuple, type(None)),
+               'paddle.static.gradients')
 
     from ..incubate.autograd.primx import _gradients
     from ..incubate.autograd.utils import prim_enabled
@@ -2180,8 +2197,8 @@ def gradients_with_optimizer(program, optimizer, inputs=None, outputs=None):
 
     with program_guard(program, None):
         pram_grads = [(pram, grad) for pram, grad in zip(inputs, grads)
-                      if isinstance(pram, paddle.fluid.framework.Parameter) and
-                      grad is not None]
+                      if isinstance(pram, paddle.fluid.framework.Parameter)
+                      and grad is not None]
 
         optimize_ops = optimizer.apply_gradients(pram_grads)
 
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 172929608dbde..df48de8ea29b1 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -91,6 +91,7 @@ def _squared_l2_norm(x):
 
 
 class BaseErrorClipAttr(object):
+
     def __str__(self):
         raise NotImplementedError()
 
@@ -165,8 +166,8 @@ def error_clip_callback(block, context):
     for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
         fwd_var = block._var_recursive(grad_to_var[grad_n])
         error_clip = getattr(fwd_var, "error_clip", None)
-        if not (error_clip is None or isinstance(error_clip,
-                                                 BaseErrorClipAttr)):
+        if not (error_clip is None
+                or isinstance(error_clip, BaseErrorClipAttr)):
             raise TypeError(
                 "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
             )
@@ -175,6 +176,7 @@ def error_clip_callback(block, context):
 
 
 class ClipGradBase(object):
+
     def __init__(self):
         super(ClipGradBase, self).__init__()
 
@@ -526,21 +528,22 @@ def _dygraph_clip(self, params_grads):
             global_norm_var.append(global_norm_var_fp64)
         global_norm_var = paddle.add_n(global_norm_var)
         global_norm_var = layers.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
+        max_global_norm = layers.fill_constant(shape=[1],
+                                               dtype=global_norm_var.dtype,
+                                               value=self.clip_norm)
 
         need_clip = False
         if not self.auto_skip_clip:  # always apply clip
             need_clip = True
-            clip_var = layers.elementwise_div(
-                x=max_global_norm,
-                y=layers.elementwise_max(
-                    x=global_norm_var, y=max_global_norm))
+            clip_var = layers.elementwise_div(x=max_global_norm,
+                                              y=layers.elementwise_max(
+                                                  x=global_norm_var,
+                                                  y=max_global_norm))
         elif global_norm_var > max_global_norm:
             # only when global_norm_var > max_global_norm, grad need clip
             need_clip = True
-            clip_var = layers.elementwise_div(
-                x=max_global_norm, y=global_norm_var)
+            clip_var = layers.elementwise_div(x=max_global_norm,
+                                              y=global_norm_var)
 
         for p, g in params_grads:
             if g is None:
@@ -550,9 +553,8 @@ def _dygraph_clip(self, params_grads):
                 continue
             # TODO(wangxi): use inplace elementwise_mul
             if need_clip:
-                clip_input = (clip_var.astype('float16')
-                              if g.dtype == core.VarDesc.VarType.FP16 else
-                              clip_var)
+                clip_input = (clip_var.astype('float16') if g.dtype
+                              == core.VarDesc.VarType.FP16 else clip_var)
                 new_grad = _C_ops.elementwise_mul(g, clip_input)
                 params_and_grads.append((p, new_grad))
             else:
@@ -621,10 +623,10 @@ def _static_clip(self, params_grads):
                     shape=[1],
                     dtype=global_norm_var.dtype,
                     value=self.clip_norm)
-                scale_var = layers.elementwise_div(
-                    x=max_global_norm,
-                    y=layers.elementwise_max(
-                        x=max_global_norm, y=global_norm_var))
+                scale_var = layers.elementwise_div(x=max_global_norm,
+                                                   y=layers.elementwise_max(
+                                                       x=max_global_norm,
+                                                       y=global_norm_var))
             param_new_grad_name_dict = dict()
             for p, g in params_grads:
                 if g is None:
@@ -645,20 +647,20 @@ def _static_clip(self, params_grads):
                     # We need to handle the correct block, otherwise will encounter
                     # a 'NotFoundError' during compile time.
                     block = default_main_program().current_block()
-                    block.append_op(
-                        type='elementwise_mul',
-                        inputs={'X': new_g,
-                                'Y': scale_input},
-                        outputs={'Out': new_g})
+                    block.append_op(type='elementwise_mul',
+                                    inputs={
+                                        'X': new_g,
+                                        'Y': scale_input
+                                    },
+                                    outputs={'Out': new_g})
                     if new_g is not g:
-                        block.append_op(
-                            type='cast',
-                            inputs={'X': new_g},
-                            outputs={'Out': g},
-                            attrs={
-                                'in_dtype': new_g.dtype,
-                                'out_dtype': g.dtype
-                            })
+                        block.append_op(type='cast',
+                                        inputs={'X': new_g},
+                                        outputs={'Out': g},
+                                        attrs={
+                                            'in_dtype': new_g.dtype,
+                                            'out_dtype': g.dtype
+                                        })
 
                 param_new_grad_name_dict[p.name] = g.name
                 params_and_grads.append((p, g))
@@ -694,19 +696,20 @@ def _create_operators(self, param, grad):
             group_norm_var = layers.sums(input=self.context[self.group_name])
             group_norm_var = layers.sqrt(x=group_norm_var)
             clip_var = self.context[self.group_name + "_clip"]
-            group_scale_var = layers.elementwise_div(
-                x=clip_var,
-                y=layers.elementwise_max(
-                    x=clip_var, y=group_norm_var))
+            group_scale_var = layers.elementwise_div(x=clip_var,
+                                                     y=layers.elementwise_max(
+                                                         x=clip_var,
+                                                         y=group_norm_var))
             assert group_scale_var.shape == (1, )
             self.context[group_scale_name] = group_scale_var
 
         # inplace
-        param.block.append_op(
-            type='elementwise_mul',
-            inputs={'X': grad,
-                    'Y': self.context[group_scale_name]},
-            outputs={'Out': grad})
+        param.block.append_op(type='elementwise_mul',
+                              inputs={
+                                  'X': grad,
+                                  'Y': self.context[group_scale_name]
+                              },
+                              outputs={'Out': grad})
 
         return param, grad
 
@@ -868,7 +871,7 @@ def append_gradient_clip_ops(param_grads):
 
 
 # change wrong mapping relation between param & grad in clip op
-# Note: This function is sensitive to the time cost of the network with gradient clipping 
+# Note: This function is sensitive to the time cost of the network with gradient clipping
 # and should not be changed easily. If you must change, please test the time cost.
 def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
     block_id_list = []
diff --git a/python/paddle/fluid/communicator.py b/python/paddle/fluid/communicator.py
index d12af8ee72389..291a6b583778c 100644
--- a/python/paddle/fluid/communicator.py
+++ b/python/paddle/fluid/communicator.py
@@ -38,6 +38,7 @@
 
 
 class Communicator(object):
+
     def __init__(self, mode, kwargs=None, envs=None):
         """
         Communicator is used for async distribute training in distribute_transpiler mode.
@@ -67,8 +68,8 @@ def __init__(self, mode, kwargs=None, envs=None):
                 envs = {}
         else:
             if mode == DistributedMode.SYNC:
-                envs["pserver_endpoints"] = ','.join(kwargs[
-                    "pserver_endpoints"])
+                envs["pserver_endpoints"] = ','.join(
+                    kwargs["pserver_endpoints"])
 
             envs["trainers"] = str(kwargs["trainers"])
             envs["trainer_id"] = str(kwargs["trainer_id"])
@@ -208,6 +209,7 @@ def push_sparse_param(self, var_name, table_id=-1, scope=None):
 
 
 class LargeScaleKV(object):
+
     def __init__(self):
         self.scale_kv = core.LargeScaleKV()
 
@@ -222,6 +224,7 @@ def size(self, varname):
 
 
 class HeterClient(object):
+
     def __init__(self, endpoint, previous_endpoint, trainer_id):
         self.heter_client_ = core.HeterClient(endpoint, previous_endpoint,
                                               trainer_id)
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 47c64ff8bd605..06f206c36d111 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -426,9 +426,9 @@ def _compile_data_parallel(self, places, use_device, scope=None):
 
         return core.ParallelExecutor(
             places, self._persistable_vars,
-            cpt.to_text(self._loss_name)
-            if self._loss_name else six.u(''), self._scope, self._local_scopes,
-            self._exec_strategy, self._build_strategy, self._graph)
+            cpt.to_text(self._loss_name) if self._loss_name else six.u(''),
+            self._scope, self._local_scopes, self._exec_strategy,
+            self._build_strategy, self._graph)
 
     def _compile_inference(self):
         return core.create_paddle_predictor(self._infer_config)
@@ -477,8 +477,9 @@ def _compile(self, scope, place):
                 use_device = DeviceType.XPU
             else:
                 use_device = DeviceType.CPU
-            self._executor = self._compile_data_parallel(
-                use_device=use_device, scope=self._scope, places=self._places)
+            self._executor = self._compile_data_parallel(use_device=use_device,
+                                                         scope=self._scope,
+                                                         places=self._places)
         return self
 
     def _get_places(self, place, place_list):
@@ -659,7 +660,9 @@ def set_precision_config(self, enable_fp16=False):
                 ipu_strategy = static.IpuStrategy()
                 ipu_strategy.set_precision_config(enable_fp16=False)
         """
-        options = {'enable_fp16': enable_fp16, }
+        options = {
+            'enable_fp16': enable_fp16,
+        }
         self.set_options(options)
 
     def add_custom_op(self,
diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
index 7d22e9796ccb9..6032238910ec6 100644
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -78,10 +78,13 @@ def __init__(self,
             self._init = init
         elif init_boot is None:
             raise ValueError(
-                'init_boot must be provided to infer the shape of InitState .\n')
+                'init_boot must be provided to infer the shape of InitState .\n'
+            )
         else:
-            self._init = layers.fill_constant_batch_size_like(
-                input=init_boot, value=value, shape=shape, dtype=dtype)
+            self._init = layers.fill_constant_batch_size_like(input=init_boot,
+                                                              value=value,
+                                                              shape=shape,
+                                                              dtype=dtype)
 
         self._shape = shape
         self._value = value
@@ -98,6 +101,7 @@ def need_reorder(self):
 
 
 class _MemoryState(object):
+
     def __init__(self, state_name, rnn_obj, init_state):
         self._state_name = state_name  # each is a rnn.memory
         self._rnn_obj = rnn_obj
@@ -112,6 +116,7 @@ def update_state(self, state):
 
 
 class _ArrayState(object):
+
     def __init__(self, state_name, block, init_state):
         self._state_name = state_name
         self._block = block
@@ -127,25 +132,25 @@ def __init__(self, state_name, block, init_state):
             dtype='int64')
 
         # initialize counter
-        self._block.append_op(
-            type='fill_constant',
-            inputs={},
-            outputs={'Out': [self._counter]},
-            attrs={
-                'shape': [1],
-                'dtype': self._counter.dtype,
-                'value': float(0.0),
-                'force_cpu': True
-            })
+        self._block.append_op(type='fill_constant',
+                              inputs={},
+                              outputs={'Out': [self._counter]},
+                              attrs={
+                                  'shape': [1],
+                                  'dtype': self._counter.dtype,
+                                  'value': float(0.0),
+                                  'force_cpu': True
+                              })
 
         self._counter.stop_gradient = True
 
         # write initial state
-        block.append_op(
-            type='write_to_array',
-            inputs={'X': init_state.value,
-                    'I': self._counter},
-            outputs={'Out': self._state_array})
+        block.append_op(type='write_to_array',
+                        inputs={
+                            'X': init_state.value,
+                            'I': self._counter
+                        },
+                        outputs={'Out': self._state_array})
 
     def get_state(self):
         state = layers.array_read(array=self._state_array, i=self._counter)
@@ -588,18 +593,21 @@ def __init__(self,
         self._counter = layers.zeros(shape=[1], dtype='int64')
         self._counter.stop_gradient = True
         self._type = _DecoderType.BEAM_SEARCH
-        self._max_len = layers.fill_constant(
-            shape=[1], dtype='int64', value=max_len)
-        self._cond = layers.less_than(
-            x=self._counter,
-            y=layers.fill_constant(
-                shape=[1], dtype='int64', value=max_len))
+        self._max_len = layers.fill_constant(shape=[1],
+                                             dtype='int64',
+                                             value=max_len)
+        self._cond = layers.less_than(x=self._counter,
+                                      y=layers.fill_constant(shape=[1],
+                                                             dtype='int64',
+                                                             value=max_len))
         self._while_op = layers.While(self._cond)
         self._state_cell = state_cell
         self._state_cell._enter_decoder(self)
         self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER
-        self._zero_idx = layers.fill_constant(
-            shape=[1], value=0, dtype='int64', force_cpu=True)
+        self._zero_idx = layers.fill_constant(shape=[1],
+                                              value=0,
+                                              dtype='int64',
+                                              force_cpu=True)
         self._array_dict = {}
         self._array_link = []
         self._ids_array = None
@@ -632,11 +640,13 @@ def block(self):
                     layers.increment(x=self._counter, value=1.0, in_place=True)
 
                     for value, array in self._array_link:
-                        layers.array_write(
-                            x=value, i=self._counter, array=array)
+                        layers.array_write(x=value,
+                                           i=self._counter,
+                                           array=array)
 
-                    layers.less_than(
-                        x=self._counter, y=self._max_len, cond=self._cond)
+                    layers.less_than(x=self._counter,
+                                     y=self._max_len,
+                                     cond=self._cond)
 
         self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER
         self._state_cell._leave_decoder(self)
@@ -649,8 +659,11 @@ def early_stop(self):
         """
         Stop the generation process in advance. Could be used as "break".
         """
-        layers.fill_constant(
-            shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond)
+        layers.fill_constant(shape=[1],
+                             value=0,
+                             dtype='bool',
+                             force_cpu=True,
+                             out=self._cond)
 
     def decode(self):
         """
@@ -665,8 +678,8 @@ def decode(self):
         """
         with self.block():
             prev_ids = self.read_array(init=self._init_ids, is_ids=True)
-            prev_scores = self.read_array(
-                init=self._init_scores, is_scores=True)
+            prev_scores = self.read_array(init=self._init_scores,
+                                          is_scores=True)
             prev_ids_embedding = layers.embedding(
                 input=prev_ids,
                 size=[self._target_dict_dim, self._word_dim],
@@ -683,14 +696,14 @@ def decode(self):
 
                 read_var = self.read_array(init=init_var)
                 update_dict[init_var_name] = read_var
-                feed_var_expanded = layers.sequence_expand(read_var,
-                                                           prev_scores)
+                feed_var_expanded = layers.sequence_expand(
+                    read_var, prev_scores)
                 feed_dict[init_var_name] = feed_var_expanded
 
             for state_str in self._state_cell._state_names:
                 prev_state = self.state_cell.get_state(state_str)
-                prev_state_expanded = layers.sequence_expand(prev_state,
-                                                             prev_scores)
+                prev_state_expanded = layers.sequence_expand(
+                    prev_state, prev_scores)
                 self.state_cell.set_state(state_str, prev_state_expanded)
 
             for i, input_name in enumerate(self._state_cell._inputs):
@@ -699,25 +712,23 @@ def decode(self):
 
             self.state_cell.compute_state(inputs=feed_dict)
             current_state = self.state_cell.out_state()
-            current_state_with_lod = layers.lod_reset(
-                x=current_state, y=prev_scores)
+            current_state_with_lod = layers.lod_reset(x=current_state,
+                                                      y=prev_scores)
             scores = layers.fc(input=current_state_with_lod,
                                size=self._target_dict_dim,
                                act='softmax')
             topk_scores, topk_indices = layers.topk(scores, k=self._topk_size)
-            accu_scores = layers.elementwise_add(
-                x=layers.log(x=topk_scores),
-                y=layers.reshape(
-                    prev_scores, shape=[-1]),
-                axis=0)
-            selected_ids, selected_scores = layers.beam_search(
-                prev_ids,
-                prev_scores,
-                topk_indices,
-                accu_scores,
-                self._beam_size,
-                end_id=1,
-                level=0)
+            accu_scores = layers.elementwise_add(x=layers.log(x=topk_scores),
+                                                 y=layers.reshape(prev_scores,
+                                                                  shape=[-1]),
+                                                 axis=0)
+            selected_ids, selected_scores = layers.beam_search(prev_ids,
+                                                               prev_scores,
+                                                               topk_indices,
+                                                               accu_scores,
+                                                               self._beam_size,
+                                                               end_id=1,
+                                                               level=0)
 
             with layers.Switch() as switch:
                 with switch.case(layers.is_empty(selected_ids)):
@@ -764,11 +775,12 @@ def read_array(self, init, is_ids=False, is_scores=False):
             name=unique_name.generate('beam_search_decoder_array'),
             type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
             dtype=init.dtype)
-        parent_block.append_op(
-            type='write_to_array',
-            inputs={'X': init,
-                    'I': self._zero_idx},
-            outputs={'Out': array})
+        parent_block.append_op(type='write_to_array',
+                               inputs={
+                                   'X': init,
+                                   'I': self._zero_idx
+                               },
+                               outputs={'Out': array})
 
         if is_ids:
             self._ids_array = array
@@ -813,11 +825,10 @@ def __call__(self):
         if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER:
             raise ValueError('Output of BeamSearchDecoder object can '
                              'only be visited outside the block.')
-        return layers.beam_search_decode(
-            ids=self._ids_array,
-            scores=self._scores_array,
-            beam_size=self._beam_size,
-            end_id=self._end_id)
+        return layers.beam_search_decode(ids=self._ids_array,
+                                         scores=self._scores_array,
+                                         beam_size=self._beam_size,
+                                         end_id=self._end_id)
 
     @property
     def state_cell(self):
diff --git a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
index fcc99c07346ea..6a87bb54d3f8b 100644
--- a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
+++ b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
@@ -18,6 +18,7 @@
 
 
 class DecoupledWeightDecay(object):
+
     def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
         if not isinstance(coeff, float) and \
                 not isinstance(coeff, framework.Variable):
@@ -75,11 +76,10 @@ def minimize(self,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
-        params_grads = self.backward(
-            loss=loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        params_grads = self.backward(loss=loss,
+                                     startup_program=startup_program,
+                                     parameter_list=parameter_list,
+                                     no_grad_set=no_grad_set)
         scaled_params = self._scale_parameters(params_grads)
         for p_grad_sgrad in scaled_params:
             param, grad, scaled_param = p_grad_sgrad
@@ -89,10 +89,9 @@ def minimize(self,
                     x=param, y=scaled_param)
                 paddle.fluid.layers.assign(input=updated_param, output=param)
 
-        optimize_ops = self.apply_optimize(
-            loss=loss,
-            params_grads=params_grads,
-            startup_program=startup_program)
+        optimize_ops = self.apply_optimize(loss=loss,
+                                           params_grads=params_grads,
+                                           startup_program=startup_program)
         return optimize_ops, params_grads
 
     def __str__(self):
@@ -146,7 +145,7 @@ class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecay,
         """
 
         def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
-            super(OptimizerWithDecoupledWeightDecay, self).__init__(
-                weight_decay, apply_decay_param_fun, **kwargs)
+            super(OptimizerWithDecoupledWeightDecay,
+                  self).__init__(weight_decay, apply_decay_param_fun, **kwargs)
 
     return OptimizerWithDecoupledWeightDecay
diff --git a/python/paddle/fluid/contrib/layers/metric_op.py b/python/paddle/fluid/contrib/layers/metric_op.py
index f76a3283f2f81..812f616ef9912 100644
--- a/python/paddle/fluid/contrib/layers/metric_op.py
+++ b/python/paddle/fluid/contrib/layers/metric_op.py
@@ -67,122 +67,136 @@ def ctr_metric_bundle(input, label):
     assert input.shape == label.shape
     helper = LayerHelper("ctr_metric_bundle", **locals())
 
-    local_abserr = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_sqrerr = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_prob = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_q = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_pos_num = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-    local_ins_num = helper.create_global_variable(
-        persistable=True, dtype='float32', shape=[1])
-
-    tmp_res_elesub = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-    tmp_res_sigmoid = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-    tmp_ones = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[-1])
-
-    batch_prob = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_abserr = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_sqrerr = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_q = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_pos_num = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
-    batch_ins_num = helper.create_global_variable(
-        persistable=False, dtype='float32', shape=[1])
+    local_abserr = helper.create_global_variable(persistable=True,
+                                                 dtype='float32',
+                                                 shape=[1])
+    local_sqrerr = helper.create_global_variable(persistable=True,
+                                                 dtype='float32',
+                                                 shape=[1])
+    local_prob = helper.create_global_variable(persistable=True,
+                                               dtype='float32',
+                                               shape=[1])
+    local_q = helper.create_global_variable(persistable=True,
+                                            dtype='float32',
+                                            shape=[1])
+    local_pos_num = helper.create_global_variable(persistable=True,
+                                                  dtype='float32',
+                                                  shape=[1])
+    local_ins_num = helper.create_global_variable(persistable=True,
+                                                  dtype='float32',
+                                                  shape=[1])
+
+    tmp_res_elesub = helper.create_global_variable(persistable=False,
+                                                   dtype='float32',
+                                                   shape=[-1])
+    tmp_res_sigmoid = helper.create_global_variable(persistable=False,
+                                                    dtype='float32',
+                                                    shape=[-1])
+    tmp_ones = helper.create_global_variable(persistable=False,
+                                             dtype='float32',
+                                             shape=[-1])
+
+    batch_prob = helper.create_global_variable(persistable=False,
+                                               dtype='float32',
+                                               shape=[1])
+    batch_abserr = helper.create_global_variable(persistable=False,
+                                                 dtype='float32',
+                                                 shape=[1])
+    batch_sqrerr = helper.create_global_variable(persistable=False,
+                                                 dtype='float32',
+                                                 shape=[1])
+    batch_q = helper.create_global_variable(persistable=False,
+                                            dtype='float32',
+                                            shape=[1])
+    batch_pos_num = helper.create_global_variable(persistable=False,
+                                                  dtype='float32',
+                                                  shape=[1])
+    batch_ins_num = helper.create_global_variable(persistable=False,
+                                                  dtype='float32',
+                                                  shape=[1])
     for var in [
             local_abserr, batch_abserr, local_sqrerr, batch_sqrerr, local_prob,
             batch_prob, local_q, batch_q, batch_pos_num, batch_ins_num,
             local_pos_num, local_ins_num
     ]:
-        helper.set_variable_initializer(
-            var, Constant(
-                value=0.0, force_cpu=True))
-
-    helper.append_op(
-        type="elementwise_sub",
-        inputs={"X": [input],
-                "Y": [label]},
-        outputs={"Out": [tmp_res_elesub]})
-
-    helper.append_op(
-        type="squared_l2_norm",
-        inputs={"X": [tmp_res_elesub]},
-        outputs={"Out": [batch_sqrerr]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_sqrerr],
-                "Y": [local_sqrerr]},
-        outputs={"Out": [local_sqrerr]})
-
-    helper.append_op(
-        type="l1_norm",
-        inputs={"X": [tmp_res_elesub]},
-        outputs={"Out": [batch_abserr]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_abserr],
-                "Y": [local_abserr]},
-        outputs={"Out": [local_abserr]})
-
-    helper.append_op(
-        type="reduce_sum", inputs={"X": [input]},
-        outputs={"Out": [batch_prob]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_prob],
-                "Y": [local_prob]},
-        outputs={"Out": [local_prob]})
-    helper.append_op(
-        type="sigmoid",
-        inputs={"X": [input]},
-        outputs={"Out": [tmp_res_sigmoid]})
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [tmp_res_sigmoid]},
-        outputs={"Out": [batch_q]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_q],
-                "Y": [local_q]},
-        outputs={"Out": [local_q]})
-
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [label]},
-        outputs={"Out": [batch_pos_num]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_pos_num],
-                "Y": [local_pos_num]},
-        outputs={"Out": [local_pos_num]})
-
-    helper.append_op(
-        type='fill_constant_batch_size_like',
-        inputs={"Input": label},
-        outputs={'Out': [tmp_ones]},
-        attrs={
-            'shape': [-1, 1],
-            'dtype': tmp_ones.dtype,
-            'value': float(1.0),
-        })
-    helper.append_op(
-        type="reduce_sum",
-        inputs={"X": [tmp_ones]},
-        outputs={"Out": [batch_ins_num]})
-    helper.append_op(
-        type="elementwise_add",
-        inputs={"X": [batch_ins_num],
-                "Y": [local_ins_num]},
-        outputs={"Out": [local_ins_num]})
+        helper.set_variable_initializer(var, Constant(value=0.0,
+                                                      force_cpu=True))
+
+    helper.append_op(type="elementwise_sub",
+                     inputs={
+                         "X": [input],
+                         "Y": [label]
+                     },
+                     outputs={"Out": [tmp_res_elesub]})
+
+    helper.append_op(type="squared_l2_norm",
+                     inputs={"X": [tmp_res_elesub]},
+                     outputs={"Out": [batch_sqrerr]})
+    helper.append_op(type="elementwise_add",
+                     inputs={
+                         "X": [batch_sqrerr],
+                         "Y": [local_sqrerr]
+                     },
+                     outputs={"Out": [local_sqrerr]})
+
+    helper.append_op(type="l1_norm",
+                     inputs={"X": [tmp_res_elesub]},
+                     outputs={"Out": [batch_abserr]})
+    helper.append_op(type="elementwise_add",
+                     inputs={
+                         "X": [batch_abserr],
+                         "Y": [local_abserr]
+                     },
+                     outputs={"Out": [local_abserr]})
+
+    helper.append_op(type="reduce_sum",
+                     inputs={"X": [input]},
+                     outputs={"Out": [batch_prob]})
+    helper.append_op(type="elementwise_add",
+                     inputs={
+                         "X": [batch_prob],
+                         "Y": [local_prob]
+                     },
+                     outputs={"Out": [local_prob]})
+    helper.append_op(type="sigmoid",
+                     inputs={"X": [input]},
+                     outputs={"Out": [tmp_res_sigmoid]})
+    helper.append_op(type="reduce_sum",
+                     inputs={"X": [tmp_res_sigmoid]},
+                     outputs={"Out": [batch_q]})
+    helper.append_op(type="elementwise_add",
+                     inputs={
+                         "X": [batch_q],
+                         "Y": [local_q]
+                     },
+                     outputs={"Out": [local_q]})
+
+    helper.append_op(type="reduce_sum",
+                     inputs={"X": [label]},
+                     outputs={"Out": [batch_pos_num]})
+    helper.append_op(type="elementwise_add",
+                     inputs={
+                         "X": [batch_pos_num],
+                         "Y": [local_pos_num]
+                     },
+                     outputs={"Out": [local_pos_num]})
+
+    helper.append_op(type='fill_constant_batch_size_like',
+                     inputs={"Input": label},
+                     outputs={'Out': [tmp_ones]},
+                     attrs={
+                         'shape': [-1, 1],
+                         'dtype': tmp_ones.dtype,
+                         'value': float(1.0),
+                     })
+    helper.append_op(type="reduce_sum",
+                     inputs={"X": [tmp_ones]},
+                     outputs={"Out": [batch_ins_num]})
+    helper.append_op(type="elementwise_add",
+                     inputs={
+                         "X": [batch_ins_num],
+                         "Y": [local_ins_num]
+                     },
+                     outputs={"Out": [local_ins_num]})
 
     return local_sqrerr, local_abserr, local_prob, local_q, local_pos_num, local_ins_num
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index c73ea8b5b0e1a..e71c73b3914b1 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -97,18 +97,21 @@ def fused_elemwise_activation(x,
     helper = LayerHelper('fused_elemwise_activation', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     intermediate_out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='fused_elemwise_activation',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out,
-                 'IntermediateOut': intermediate_out},
-        attrs={
-            'axis': axis,
-            'scale': scale,
-            'save_intermediate_out': save_intermediate_out,
-            'functor_list': functor_list
-        })
+    helper.append_op(type='fused_elemwise_activation',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={
+                         'Out': out,
+                         'IntermediateOut': intermediate_out
+                     },
+                     attrs={
+                         'axis': axis,
+                         'scale': scale,
+                         'save_intermediate_out': save_intermediate_out,
+                         'functor_list': functor_list
+                     })
     return out
 
 
@@ -202,30 +205,32 @@ def var_conv_2d(input,
     filter_param = helper.create_parameter(
         attr=helper.param_attr,
         shape=filter_shape,
-        dtype=dtype, )
+        dtype=dtype,
+    )
 
     conv_res = helper.create_variable_for_type_inference(dtype)
-    tmp_res = helper.create_variable_for_type_inference(
-        dtype, stop_gradient=True)
-
-    helper.append_op(
-        type='var_conv_2d',
-        inputs={
-            'X': input,
-            'ROW': row,
-            'COLUMN': col,
-            'W': filter_param,
-        },
-        outputs={"Out": conv_res,
-                 "Col": tmp_res},
-        attrs={
-            'InputChannel': input_channel,
-            'OutputChannel': output_channel,
-            'StrideH': stride[0],
-            'StrideW': stride[1],
-            'KernelH': filter_size[0],
-            'KernelW': filter_size[1],
-        })
+    tmp_res = helper.create_variable_for_type_inference(dtype,
+                                                        stop_gradient=True)
+
+    helper.append_op(type='var_conv_2d',
+                     inputs={
+                         'X': input,
+                         'ROW': row,
+                         'COLUMN': col,
+                         'W': filter_param,
+                     },
+                     outputs={
+                         "Out": conv_res,
+                         "Col": tmp_res
+                     },
+                     attrs={
+                         'InputChannel': input_channel,
+                         'OutputChannel': output_channel,
+                         'StrideH': stride[0],
+                         'StrideW': stride[1],
+                         'KernelH': filter_size[0],
+                         'KernelW': filter_size[1],
+                     })
 
     return helper.append_activation(conv_res)
 
@@ -294,25 +299,28 @@ def match_matrix_tensor(x,
 
     x_shape = list(x.shape)
     y_shape = list(y.shape)
-    assert len(x_shape) == 2 and len(y_shape) == 2 and x_shape[-1] == y_shape[
-        -1]
+    assert len(x_shape) == 2 and len(
+        y_shape) == 2 and x_shape[-1] == y_shape[-1]
 
     weight_shape = [x_shape[-1], channel_num, y_shape[-1]]
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=weight_shape, dtype=dtype, is_bias=False)
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=weight_shape,
+                                dtype=dtype,
+                                is_bias=False)
     mm_res = helper.create_variable_for_type_inference(dtype)
-    tmp_res = helper.create_variable_for_type_inference(
-        dtype, stop_gradient=True)
-    helper.append_op(
-        type='match_matrix_tensor',
-        inputs={
-            'X': x,
-            'Y': y,
-            'W': w,
-        },
-        outputs={"Out": mm_res,
-                 "Tmp": tmp_res},
-        attrs={'dim_t': channel_num})
+    tmp_res = helper.create_variable_for_type_inference(dtype,
+                                                        stop_gradient=True)
+    helper.append_op(type='match_matrix_tensor',
+                     inputs={
+                         'X': x,
+                         'Y': y,
+                         'W': w,
+                     },
+                     outputs={
+                         "Out": mm_res,
+                         "Tmp": tmp_res
+                     },
+                     attrs={'dim_t': channel_num})
 
     return helper.append_activation(mm_res), tmp_res
 
@@ -370,17 +378,22 @@ def sequence_topk_avg_pooling(input, row, col, topks, channel_num):
     """
     helper = LayerHelper('sequence_topk_avg_pooling', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    pos = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype(), stop_gradient=True)
-    helper.append_op(
-        type='sequence_topk_avg_pooling',
-        inputs={'X': input,
-                'ROW': row,
-                'COLUMN': col},
-        outputs={'Out': out,
-                 'pos': pos},
-        attrs={'topks': topks,
-               'channel_num': channel_num})
+    pos = helper.create_variable_for_type_inference(dtype=helper.input_dtype(),
+                                                    stop_gradient=True)
+    helper.append_op(type='sequence_topk_avg_pooling',
+                     inputs={
+                         'X': input,
+                         'ROW': row,
+                         'COLUMN': col
+                     },
+                     outputs={
+                         'Out': out,
+                         'pos': pos
+                     },
+                     attrs={
+                         'topks': topks,
+                         'channel_num': channel_num
+                     })
 
     return out
 
@@ -439,16 +452,21 @@ def tree_conv(nodes_vector,
     dtype = helper.input_dtype('nodes_vector')
     feature_size = nodes_vector.shape[2]
     W_shape = [feature_size, 3, output_size, num_filters]
-    W = helper.create_parameter(
-        attr=param_attr, shape=W_shape, dtype=dtype, is_bias=False)
+    W = helper.create_parameter(attr=param_attr,
+                                shape=W_shape,
+                                dtype=dtype,
+                                is_bias=False)
     out = helper.create_variable_for_type_inference(dtype=dtype)
-    helper.append_op(
-        type='tree_conv',
-        inputs={'NodesVector': nodes_vector,
-                'EdgeSet': edge_set,
-                'Filter': W},
-        outputs={'Out': out, },
-        attrs={'max_depth': max_depth})
+    helper.append_op(type='tree_conv',
+                     inputs={
+                         'NodesVector': nodes_vector,
+                         'EdgeSet': edge_set,
+                         'Filter': W
+                     },
+                     outputs={
+                         'Out': out,
+                     },
+                     attrs={'max_depth': max_depth})
     if helper.bias_attr:
         pre_activation = helper.append_bias_op(out)
     else:
@@ -505,21 +523,24 @@ def fused_embedding_seq_pool(input,
                 is_sparse=False)
     """
     helper = LayerHelper('fused_embedding_seq_pool', **locals())
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=size,
+                                dtype=dtype,
+                                is_bias=False)
     out = helper.create_variable_for_type_inference(dtype)
     padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
         size[0] + padding_idx)
-    helper.append_op(
-        type='fused_embedding_seq_pool',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': out},
-        attrs={
-            'is_sparse': is_sparse,
-            'combiner': combiner,
-            'padding_idx': padding_idx
-        })
+    helper.append_op(type='fused_embedding_seq_pool',
+                     inputs={
+                         'Ids': input,
+                         'W': w
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         'is_sparse': is_sparse,
+                         'combiner': combiner,
+                         'padding_idx': padding_idx
+                     })
     return out
 
 
@@ -589,17 +610,18 @@ def fused_seqpool_cvm(input,
         for i in range(len(inputs))
     ]
 
-    helper.append_op(
-        type="fused_seqpool_cvm",
-        inputs={"X": inputs,
-                "CVM": cvm},
-        outputs={"Out": outs},
-        attrs={
-            "pooltype": pool_type.upper(),
-            "pad_value": pad_value,
-            "use_cvm": use_cvm,
-            "cvm_offset": cvm_offset,
-        })
+    helper.append_op(type="fused_seqpool_cvm",
+                     inputs={
+                         "X": inputs,
+                         "CVM": cvm
+                     },
+                     outputs={"Out": outs},
+                     attrs={
+                         "pooltype": pool_type.upper(),
+                         "pad_value": pad_value,
+                         "use_cvm": use_cvm,
+                         "cvm_offset": cvm_offset,
+                     })
 
     return outs
 
@@ -710,21 +732,24 @@ class number
 
     output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
     index = helper.create_variable_for_type_inference(dtype='int')
-    helper.append_op(
-        type="multiclass_nms2",
-        inputs={'BBoxes': bboxes,
-                'Scores': scores},
-        attrs={
-            'background_label': background_label,
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'keep_top_k': keep_top_k,
-            'nms_eta': nms_eta,
-            'normalized': normalized
-        },
-        outputs={'Out': output,
-                 'Index': index})
+    helper.append_op(type="multiclass_nms2",
+                     inputs={
+                         'BBoxes': bboxes,
+                         'Scores': scores
+                     },
+                     attrs={
+                         'background_label': background_label,
+                         'score_threshold': score_threshold,
+                         'nms_top_k': nms_top_k,
+                         'nms_threshold': nms_threshold,
+                         'keep_top_k': keep_top_k,
+                         'nms_eta': nms_eta,
+                         'normalized': normalized
+                     },
+                     outputs={
+                         'Out': output,
+                         'Index': index
+                     })
     output.stop_gradient = True
     index.stop_gradient = True
 
@@ -787,22 +812,28 @@ def search_pyramid_hash(input,
     helper = LayerHelper('search_pyramid_hash', **locals())
 
     w_shape = [space_len + rand_len, 1]
-    w = helper.create_parameter(
-        attr=param_attr, shape=w_shape, dtype=dtype, is_bias=False)
+    w = helper.create_parameter(attr=param_attr,
+                                shape=w_shape,
+                                dtype=dtype,
+                                is_bias=False)
     w.stop_gradient = True
 
     input_vars = {'X': input, 'W': w}
     if white_list_len > 0:
         wl_shape = [white_list_len, 1]
-        white_list = helper.create_parameter(
-            attr=param_attr_wl, shape=wl_shape, dtype=dtype, is_bias=False)
+        white_list = helper.create_parameter(attr=param_attr_wl,
+                                             shape=wl_shape,
+                                             dtype=dtype,
+                                             is_bias=False)
         white_list.stop_gradient = True
         input_vars['WhiteList'] = white_list
 
     if black_list_len >= 0:
         bl_shape = [black_list_len, 1]
-        black_list = helper.create_parameter(
-            attr=param_attr_bl, shape=bl_shape, dtype=dtype, is_bias=False)
+        black_list = helper.create_parameter(attr=param_attr_bl,
+                                             shape=bl_shape,
+                                             dtype=dtype,
+                                             is_bias=False)
         black_list.stop_gradient = True
         input_vars['BlackList'] = black_list
 
@@ -825,26 +856,27 @@ def search_pyramid_hash(input,
     res = helper.create_variable_for_type_inference(dtype)
     drop_pos = helper.create_variable_for_type_inference(dtype)
     x_temp_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='pyramid_hash',
-        inputs=input_vars,
-        outputs={"Out": res,
-                 "X_Temp_Out": x_temp_out,
-                 'DropPos': drop_pos},
-        attrs={
-            'num_emb': num_emb,
-            'space_len': space_len,
-            'pyramid_layer': pyramid_layer,
-            'rand_len': rand_len,
-            'drop_out_percent': drop_out_percent,
-            'is_training': is_training,
-            'use_filter': use_filter,
-            'white_list_len': white_list_len,
-            'black_list_len': black_list_len,
-            'seed': seed,
-            'lr': lr,
-            'distribute_update_vars': distribute_update_vars_str
-        })
+    helper.append_op(type='pyramid_hash',
+                     inputs=input_vars,
+                     outputs={
+                         "Out": res,
+                         "X_Temp_Out": x_temp_out,
+                         'DropPos': drop_pos
+                     },
+                     attrs={
+                         'num_emb': num_emb,
+                         'space_len': space_len,
+                         'pyramid_layer': pyramid_layer,
+                         'rand_len': rand_len,
+                         'drop_out_percent': drop_out_percent,
+                         'is_training': is_training,
+                         'use_filter': use_filter,
+                         'white_list_len': white_list_len,
+                         'black_list_len': black_list_len,
+                         'seed': seed,
+                         'lr': lr,
+                         'distribute_update_vars': distribute_update_vars_str
+                     })
 
     return res
 
@@ -902,14 +934,17 @@ def shuffle_batch(x, seed=None):
             name=unique_name.generate("shuffle_batch_seed"),
             dtype="int64",
             persistable=True)
-    helper.append_op(
-        type='shuffle_batch',
-        inputs={'X': x,
-                'Seed': seed},
-        outputs={'Out': out,
-                 'ShuffleIdx': shuffle_idx,
-                 'SeedOut': seed},
-        attrs=op_attrs)
+    helper.append_op(type='shuffle_batch',
+                     inputs={
+                         'X': x,
+                         'Seed': seed
+                     },
+                     outputs={
+                         'Out': out,
+                         'ShuffleIdx': shuffle_idx,
+                         'SeedOut': seed
+                     },
+                     attrs=op_attrs)
     return out
 
 
@@ -968,11 +1003,10 @@ def partial_concat(input, start_index=0, length=-1):
     attrs = {'start_index': start_index, 'length': length}
     helper = LayerHelper('partial_concat', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='partial_concat',
-        inputs=inputs,
-        outputs={'Out': [out]},
-        attrs=attrs)
+    helper.append_op(type='partial_concat',
+                     inputs=inputs,
+                     outputs={'Out': [out]},
+                     attrs=attrs)
     return out
 
 
@@ -1025,8 +1059,10 @@ def partial_sum(input, start_index=0, length=-1):
     attrs['length'] = length
     helper = LayerHelper('partial_sum', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='partial_sum', inputs=inputs, outputs={'Out': [out]}, attrs=attrs)
+    helper.append_op(type='partial_sum',
+                     inputs=inputs,
+                     outputs={'Out': [out]},
+                     attrs=attrs)
     return out
 
 
@@ -1155,12 +1191,11 @@ def sparse_embedding(input,
     check_dtype(dtype, 'dtype', ['float32', 'float64'],
                 'paddle.static.nn.sparse_embedding')
 
-    w = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=size,
-        type=core.VarDesc.VarType.SELECTED_ROWS,
-        dtype=dtype,
-        is_bias=False)
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=size,
+                                type=core.VarDesc.VarType.SELECTED_ROWS,
+                                dtype=dtype,
+                                is_bias=False)
 
     tmp = helper.create_variable_for_type_inference(dtype)
 
@@ -1185,20 +1220,21 @@ def sparse_embedding(input,
             )
         entry_str = entry._to_attr()
 
-    helper.append_op(
-        type='lookup_table',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={
-            'padding_idx': padding_idx,
-            'is_sparse': True,
-            'is_distributed': True,
-            'remote_prefetch': True,
-            'is_test': is_test,
-            'entry': entry_str,
-            'table_class': table_class
-        })
+    helper.append_op(type='lookup_table',
+                     inputs={
+                         'Ids': input,
+                         'W': w
+                     },
+                     outputs={'Out': tmp},
+                     attrs={
+                         'padding_idx': padding_idx,
+                         'is_sparse': True,
+                         'is_distributed': True,
+                         'remote_prefetch': True,
+                         'is_test': is_test,
+                         'entry': entry_str,
+                         'table_class': table_class
+                     })
 
     return tmp
 
@@ -1266,25 +1302,29 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
     check_dtype(dtype, 'dtype', ['int32', 'int64'],
                 'fluid.contrib.layers.tdm_child')
     c_dtype = convert_np_dtype_to_dtype_(dtype)
-    tree_info = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[node_nums, 3 + child_nums],
-        dtype=dtype,
-        default_initializer=Constant(0))
+    tree_info = helper.create_parameter(attr=helper.param_attr,
+                                        shape=[node_nums, 3 + child_nums],
+                                        dtype=dtype,
+                                        default_initializer=Constant(0))
     tree_info.stop_gradient = True
 
     child = helper.create_variable_for_type_inference(dtype=dtype)
     leaf_mask = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(
-        type='tdm_child',
-        inputs={'X': x,
-                'TreeInfo': tree_info},
-        outputs={'Child': child,
-                 'LeafMask': leaf_mask},
-        attrs={'child_nums': child_nums,
-               'dtype': c_dtype},
-        stop_gradient=True)
+    helper.append_op(type='tdm_child',
+                     inputs={
+                         'X': x,
+                         'TreeInfo': tree_info
+                     },
+                     outputs={
+                         'Child': child,
+                         'LeafMask': leaf_mask
+                     },
+                     attrs={
+                         'child_nums': child_nums,
+                         'dtype': c_dtype
+                     },
+                     stop_gradient=True)
     return (child, leaf_mask)
 
 
@@ -1411,23 +1451,21 @@ def tdm_sampler(x,
                 "The number of negative samples must be less than the number of nodes "
                 "in the layer {}, But received negative nums {}, and num of node at layer {} "
                 "is {}, please check your input.".format(
-                    layer_idx, neg_samples_num_list[
-                        layer_idx], layer_idx, layer_node_num_list[layer_idx]))
+                    layer_idx, neg_samples_num_list[layer_idx], layer_idx,
+                    layer_node_num_list[layer_idx]))
     assert leaf_node_num < node_nums, "leaf_node_num must be less than total node nums."
 
     travel_shape = [leaf_node_num, layer_nums]
-    travel = helper.create_parameter(
-        attr=tree_travel_attr,
-        shape=travel_shape,
-        dtype=tree_dtype,
-        default_initializer=Constant(0))
+    travel = helper.create_parameter(attr=tree_travel_attr,
+                                     shape=travel_shape,
+                                     dtype=tree_dtype,
+                                     default_initializer=Constant(0))
 
     layer_shape = [node_nums, 1]
-    layer = helper.create_parameter(
-        attr=tree_layer_attr,
-        shape=layer_shape,
-        dtype=tree_dtype,
-        default_initializer=Constant(0))
+    layer = helper.create_parameter(attr=tree_layer_attr,
+                                    shape=layer_shape,
+                                    dtype=tree_dtype,
+                                    default_initializer=Constant(0))
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
     out.stop_gradient = True
@@ -1438,21 +1476,24 @@ def tdm_sampler(x,
     mask = helper.create_variable_for_type_inference(dtype=dtype)
     mask.stop_gradient = True
 
-    helper.append_op(
-        type='tdm_sampler',
-        inputs={"X": x,
-                "Travel": travel,
-                "Layer": layer},
-        outputs={'Out': out,
-                 'Labels': labels,
-                 'Mask': mask},
-        attrs={
-            'neg_samples_num_list': neg_samples_num_list,
-            'output_positive': output_positive,
-            'layer_offset_lod': tree_layer_offset_lod,
-            'seed': seed,
-            'dtype': c_dtype
-        })
+    helper.append_op(type='tdm_sampler',
+                     inputs={
+                         "X": x,
+                         "Travel": travel,
+                         "Layer": layer
+                     },
+                     outputs={
+                         'Out': out,
+                         'Labels': labels,
+                         'Mask': mask
+                     },
+                     attrs={
+                         'neg_samples_num_list': neg_samples_num_list,
+                         'output_positive': output_positive,
+                         'layer_offset_lod': tree_layer_offset_lod,
+                         'seed': seed,
+                         'dtype': c_dtype
+                     })
 
     if output_list:
         output_list = []
@@ -1466,12 +1507,18 @@ def tdm_sampler(x,
         for layer_sample_num in neg_samples_num_list:
             end_offset = start_offset + \
                 layer_sample_num + positive_flag
-            layer_samples = slice(
-                out, axes=[1], starts=[start_offset], ends=[end_offset])
-            layer_labels = slice(
-                labels, axes=[1], starts=[start_offset], ends=[end_offset])
-            layer_mask = slice(
-                mask, axes=[1], starts=[start_offset], ends=[end_offset])
+            layer_samples = slice(out,
+                                  axes=[1],
+                                  starts=[start_offset],
+                                  ends=[end_offset])
+            layer_labels = slice(labels,
+                                 axes=[1],
+                                 starts=[start_offset],
+                                 ends=[end_offset])
+            layer_mask = slice(mask,
+                               axes=[1],
+                               starts=[start_offset],
+                               ends=[end_offset])
 
             layer_samples = reshape(layer_samples,
                                     [-1, layer_sample_num + positive_flag, 1])
@@ -1540,28 +1587,32 @@ def rank_attention(input,
     input_shape = input.shape
     assert input_shape[1] * max_rank * max_rank == rank_param_shape[0]
 
-    rank_param = helper.create_parameter(
-        attr=rank_param_attr, shape=rank_param_shape, dtype=dtype)
+    rank_param = helper.create_parameter(attr=rank_param_attr,
+                                         shape=rank_param_shape,
+                                         dtype=dtype)
     rank_param.stop_gradient = False
 
     output = helper.create_variable_for_type_inference(dtype)
-    input_help = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    ins_rank = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-
-    helper.append_op(
-        type="rank_attention",
-        inputs={
-            "X": input,
-            "RankOffset": rank_offset,
-            "RankParam": rank_param
-        },
-        outputs={"Out": output,
-                 "InputHelp": input_help,
-                 "InsRank": ins_rank},
-        attrs={"MaxRank": max_rank,
-               "MaxSize": max_size})
+    input_help = helper.create_variable_for_type_inference(dtype=dtype,
+                                                           stop_gradient=True)
+    ins_rank = helper.create_variable_for_type_inference(dtype=dtype,
+                                                         stop_gradient=True)
+
+    helper.append_op(type="rank_attention",
+                     inputs={
+                         "X": input,
+                         "RankOffset": rank_offset,
+                         "RankParam": rank_param
+                     },
+                     outputs={
+                         "Out": output,
+                         "InputHelp": input_help,
+                         "InsRank": ins_rank
+                     },
+                     attrs={
+                         "MaxRank": max_rank,
+                         "MaxSize": max_size
+                     })
     return output
 
 
@@ -1614,17 +1665,22 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None):
     dtype = helper.input_dtype()
     check_dtype(dtype, 'input', ['float32', 'float64'], 'batch_fc')
 
-    w = helper.create_parameter(
-        attr=param_attr, shape=param_size, dtype=dtype, is_bias=False)
-    b = helper.create_parameter(
-        attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=False)
+    w = helper.create_parameter(attr=param_attr,
+                                shape=param_size,
+                                dtype=dtype,
+                                is_bias=False)
+    b = helper.create_parameter(attr=bias_attr,
+                                shape=bias_size,
+                                dtype=dtype,
+                                is_bias=False)
     pre_act = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="batch_fc",
-        inputs={"Input": input,
-                "W": w,
-                "Bias": b},
-        outputs={"Out": pre_act})
+    helper.append_op(type="batch_fc",
+                     inputs={
+                         "Input": input,
+                         "W": w,
+                         "Bias": b
+                     },
+                     outputs={"Out": pre_act})
     return helper.append_activation(pre_act)
 
 
@@ -1663,13 +1719,16 @@ def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'):
         helper.create_variable_for_type_inference(dtype)
         for i in range(len(inputs))
     ]
-    helper.append_op(
-        type='pull_box_extended_sparse',
-        inputs={'Ids': inputs},
-        outputs={'Out': outs,
-                 'OutExtend': outs_extend},
-        attrs={'emb_size': size,
-               'emb_extended_size': extend_size})
+    helper.append_op(type='pull_box_extended_sparse',
+                     inputs={'Ids': inputs},
+                     outputs={
+                         'Out': outs,
+                         'OutExtend': outs_extend
+                     },
+                     attrs={
+                         'emb_size': size,
+                         'emb_extended_size': extend_size
+                     })
     if len(outs) == 1:
         return outs[0], outs_extend[0]
     return outs, outs_extend
@@ -1730,11 +1789,10 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
     helper = LayerHelper("bilateral_slice", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     inputs = {'X': x, 'Guide': guide, 'Grid': grid}
-    helper.append_op(
-        type='bilateral_slice',
-        inputs=inputs,
-        attrs={'has_offset': has_offset},
-        outputs={'Out': out})
+    helper.append_op(type='bilateral_slice',
+                     inputs=inputs,
+                     attrs={'has_offset': has_offset},
+                     outputs={'Out': out})
     return out
 
 
@@ -1800,19 +1858,20 @@ def correlation(x,
     else:
         helper = LayerHelper("correlation", **locals())
         output = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(
-            type="correlation",
-            inputs={"Input1": x,
-                    "Input2": y},
-            attrs={
-                "pad_size": pad_size,
-                "kernel_size": kernel_size,
-                "max_displacement": max_displacement,
-                "stride1": stride1,
-                "stride2": stride2,
-                "corr_type_multiply": corr_type_multiply
-            },
-            outputs={"Output": output})
+        helper.append_op(type="correlation",
+                         inputs={
+                             "Input1": x,
+                             "Input2": y
+                         },
+                         attrs={
+                             "pad_size": pad_size,
+                             "kernel_size": kernel_size,
+                             "max_displacement": max_displacement,
+                             "stride1": stride1,
+                             "stride2": stride2,
+                             "corr_type_multiply": corr_type_multiply
+                         },
+                         outputs={"Output": output})
     return output
 
 
@@ -1939,29 +1998,25 @@ def build_program(main_program, startup_program):
     param_shape = [channel_num]
 
     # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=bn_param_dtype,
-        default_initializer=Constant(1.0))
-    bias = helper.create_parameter(
-        attr=helper.bias_attr,
-        shape=param_shape,
-        dtype=bn_param_dtype,
-        is_bias=True)
-    mean = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
-        shape=param_shape,
-        dtype=bn_param_dtype)
+    scale = helper.create_parameter(attr=helper.param_attr,
+                                    shape=param_shape,
+                                    dtype=bn_param_dtype,
+                                    default_initializer=Constant(1.0))
+    bias = helper.create_parameter(attr=helper.bias_attr,
+                                   shape=param_shape,
+                                   dtype=bn_param_dtype,
+                                   is_bias=True)
+    mean = helper.create_parameter(attr=ParamAttr(name=moving_mean_name,
+                                                  initializer=Constant(0.0),
+                                                  trainable=False),
+                                   shape=param_shape,
+                                   dtype=bn_param_dtype)
     mean.stop_gradient = True
-    variance = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_variance_name,
-            initializer=Constant(1.0),
-            trainable=False),
-        shape=param_shape,
-        dtype=bn_param_dtype)
+    variance = helper.create_parameter(attr=ParamAttr(name=moving_variance_name,
+                                                      initializer=Constant(1.0),
+                                                      trainable=False),
+                                       shape=param_shape,
+                                       dtype=bn_param_dtype)
     variance.stop_gradient = True
 
     # create output
@@ -1969,8 +2024,8 @@ def build_program(main_program, startup_program):
     mean_out = mean
     # variance and variance out share the same memory
     variance_out = variance
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=bn_param_dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(dtype=bn_param_dtype,
+                                                           stop_gradient=True)
     saved_variance = helper.create_variable_for_type_inference(
         dtype=bn_param_dtype, stop_gradient=True)
     reserve_space = helper.create_variable_for_type_inference(
@@ -1995,11 +2050,10 @@ def build_program(main_program, startup_program):
         "ReserveSpace": reserve_space
     }
 
-    helper.append_op(
-        type="fused_bn_add_activation",
-        inputs=inputs,
-        outputs=outputs,
-        attrs=attrs)
+    helper.append_op(type="fused_bn_add_activation",
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
 
     return batch_norm_out
 
@@ -2019,21 +2073,25 @@ def pow2_decay_with_linear_warmup(warmup_steps,
     helper.set_variable_initializer(
         lr, Constant(value=float(base_lr) / warmup_steps))
 
-    step = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[1])
+    step = helper.create_global_variable(persistable=True,
+                                         dtype='int64',
+                                         shape=[1])
     helper.set_variable_initializer(step, Constant(value=0))
     assert warmup_steps <= total_steps, "warmup_steps cannot be larger than total_steps"
 
-    helper.append_op(
-        type="pow2_decay_with_linear_warmup",
-        inputs={"LearningRate": lr,
-                "Step": step},
-        outputs={"LearningRateOut": lr,
-                 "StepOut": step},
-        attrs={
-            "warmup_steps": warmup_steps,
-            "total_steps": total_steps,
-            "base_lr": base_lr,
-            "end_lr": end_lr,
-        })
+    helper.append_op(type="pow2_decay_with_linear_warmup",
+                     inputs={
+                         "LearningRate": lr,
+                         "Step": step
+                     },
+                     outputs={
+                         "LearningRateOut": lr,
+                         "StepOut": step
+                     },
+                     attrs={
+                         "warmup_steps": warmup_steps,
+                         "total_steps": total_steps,
+                         "base_lr": base_lr,
+                         "end_lr": end_lr,
+                     })
     return lr
diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py
index a2dd0835b6064..0b14948bff984 100644
--- a/python/paddle/fluid/contrib/layers/rnn_impl.py
+++ b/python/paddle/fluid/contrib/layers/rnn_impl.py
@@ -128,16 +128,14 @@ def _build_once(self, input, pre_hidden):
             gate_bias_attr = self._bias_attr
             candidate_bias_attr = self._bias_attr
 
-        self._gate_bias = self.create_parameter(
-            attr=gate_bias_attr,
-            shape=[2 * self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
-        self._candidate_bias = self.create_parameter(
-            attr=candidate_bias_attr,
-            shape=[self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
+        self._gate_bias = self.create_parameter(attr=gate_bias_attr,
+                                                shape=[2 * self._hiden_size],
+                                                dtype=self._dtype,
+                                                is_bias=True)
+        self._candidate_bias = self.create_parameter(attr=candidate_bias_attr,
+                                                     shape=[self._hiden_size],
+                                                     dtype=self._dtype,
+                                                     is_bias=True)
 
     def forward(self, input, pre_hidden):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
@@ -151,8 +149,8 @@ def forward(self, input, pre_hidden):
 
         r_hidden = r * pre_hidden
 
-        candidate = layers.matmul(
-            layers.concat([input, r_hidden], 1), self._candidate_weight)
+        candidate = layers.matmul(layers.concat([input, r_hidden], 1),
+                                  self._candidate_weight)
         candidate = layers.elementwise_add(candidate, self._candidate_bias)
 
         c = self._activation(candidate)
@@ -304,8 +302,9 @@ def basic_gru(input,
     mask = None
     if sequence_length:
         max_seq_len = layers.shape(input)[0]
-        mask = layers.sequence_mask(
-            sequence_length, maxlen=max_seq_len, dtype='float32')
+        mask = layers.sequence_mask(sequence_length,
+                                    maxlen=max_seq_len,
+                                    dtype='float32')
         mask = layers.transpose(mask, [1, 0])
 
     direc_num = 1
@@ -330,10 +329,9 @@ def get_single_direction_output(rnn_input,
                 if init_hidden:
                     pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                 else:
-                    pre_hidden = rnn.memory(
-                        batch_ref=rnn_input,
-                        shape=[-1, hidden_size],
-                        ref_batch_dim_idx=1)
+                    pre_hidden = rnn.memory(batch_ref=rnn_input,
+                                            shape=[-1, hidden_size],
+                                            ref_batch_dim_idx=1)
 
                 new_hidden = unit_list[i](step_input, pre_hidden)
 
@@ -349,7 +347,8 @@ def get_single_direction_output(rnn_input,
                 if dropout_prob != None and dropout_prob > 0.0:
                     step_input = layers.dropout(
                         step_input,
-                        dropout_prob=dropout_prob, )
+                        dropout_prob=dropout_prob,
+                    )
 
             rnn.step_output(step_input)
 
@@ -363,22 +362,26 @@ def get_single_direction_output(rnn_input,
             last_hidden_array.append(last_hidden)
 
         last_hidden_output = layers.concat(last_hidden_array, axis=0)
-        last_hidden_output = layers.reshape(
-            last_hidden_output, shape=[num_layers, -1, hidden_size])
+        last_hidden_output = layers.reshape(last_hidden_output,
+                                            shape=[num_layers, -1, hidden_size])
 
         return rnn_output, last_hidden_output
         # seq_len, batch_size, hidden_size
 
-    fw_rnn_out, fw_last_hidden = get_single_direction_output(
-        input, fw_unit_list, mask, direc_index=0)
+    fw_rnn_out, fw_last_hidden = get_single_direction_output(input,
+                                                             fw_unit_list,
+                                                             mask,
+                                                             direc_index=0)
 
     if bidirectional:
         bw_input = layers.reverse(input, axis=[0])
         bw_mask = None
         if mask:
             bw_mask = layers.reverse(mask, axis=[0])
-        bw_rnn_out, bw_last_hidden = get_single_direction_output(
-            bw_input, bw_unit_list, bw_mask, direc_index=1)
+        bw_rnn_out, bw_last_hidden = get_single_direction_output(bw_input,
+                                                                 bw_unit_list,
+                                                                 bw_mask,
+                                                                 direc_index=1)
 
         bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])
 
@@ -532,15 +535,14 @@ def basic_lstm(input,
         else:
             layer_bias_attr = bias_attr
         fw_unit_list.append(
-            BasicLSTMUnit(
-                new_name,
-                hidden_size,
-                param_attr=layer_param_attr,
-                bias_attr=layer_bias_attr,
-                gate_activation=gate_activation,
-                activation=activation,
-                forget_bias=forget_bias,
-                dtype=dtype))
+            BasicLSTMUnit(new_name,
+                          hidden_size,
+                          param_attr=layer_param_attr,
+                          bias_attr=layer_bias_attr,
+                          gate_activation=gate_activation,
+                          activation=activation,
+                          forget_bias=forget_bias,
+                          dtype=dtype))
     if bidirectional:
         bw_unit_list = []
 
@@ -557,15 +559,14 @@ def basic_lstm(input,
             else:
                 layer_bias_attr = param_attr
             bw_unit_list.append(
-                BasicLSTMUnit(
-                    new_name,
-                    hidden_size,
-                    param_attr=layer_param_attr,
-                    bias_attr=layer_bias_attr,
-                    gate_activation=gate_activation,
-                    activation=activation,
-                    forget_bias=forget_bias,
-                    dtype=dtype))
+                BasicLSTMUnit(new_name,
+                              hidden_size,
+                              param_attr=layer_param_attr,
+                              bias_attr=layer_bias_attr,
+                              gate_activation=gate_activation,
+                              activation=activation,
+                              forget_bias=forget_bias,
+                              dtype=dtype))
 
     if batch_first:
         input = layers.transpose(input, [1, 0, 2])
@@ -573,8 +574,9 @@ def basic_lstm(input,
     mask = None
     if sequence_length:
         max_seq_len = layers.shape(input)[0]
-        mask = layers.sequence_mask(
-            sequence_length, maxlen=max_seq_len, dtype='float32')
+        mask = layers.sequence_mask(sequence_length,
+                                    maxlen=max_seq_len,
+                                    dtype='float32')
 
         mask = layers.transpose(mask, [1, 0])
 
@@ -605,10 +607,10 @@ def get_single_direction_output(rnn_input,
                     pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                     pre_cell = rnn.memory(init=init_cell[i, direc_index])
                 else:
-                    pre_hidden = rnn.memory(
-                        batch_ref=rnn_input, shape=[-1, hidden_size])
-                    pre_cell = rnn.memory(
-                        batch_ref=rnn_input, shape=[-1, hidden_size])
+                    pre_hidden = rnn.memory(batch_ref=rnn_input,
+                                            shape=[-1, hidden_size])
+                    pre_cell = rnn.memory(batch_ref=rnn_input,
+                                          shape=[-1, hidden_size])
 
                 new_hidden, new_cell = unit_list[i](step_input, pre_hidden,
                                                     pre_cell)
@@ -650,11 +652,11 @@ def get_single_direction_output(rnn_input,
             last_cell_array.append(last_cell)
 
         last_hidden_output = layers.concat(last_hidden_array, axis=0)
-        last_hidden_output = layers.reshape(
-            last_hidden_output, shape=[num_layers, -1, hidden_size])
+        last_hidden_output = layers.reshape(last_hidden_output,
+                                            shape=[num_layers, -1, hidden_size])
         last_cell_output = layers.concat(last_cell_array, axis=0)
-        last_cell_output = layers.reshape(
-            last_cell_output, shape=[num_layers, -1, hidden_size])
+        last_cell_output = layers.reshape(last_cell_output,
+                                          shape=[num_layers, -1, hidden_size])
 
         return rnn_output, last_hidden_output, last_cell_output
         # seq_len, batch_size, hidden_size
@@ -788,8 +790,9 @@ def __init__(self,
         self._bias_attr = bias_attr
         self._gate_activation = gate_activation or layers.sigmoid
         self._activation = activation or layers.tanh
-        self._forget_bias = layers.fill_constant(
-            [1], dtype=dtype, value=forget_bias)
+        self._forget_bias = layers.fill_constant([1],
+                                                 dtype=dtype,
+                                                 value=forget_bias)
         self._forget_bias.stop_gradient = False
         self._dtype = dtype
 
@@ -802,11 +805,10 @@ def _build_once(self, input, pre_hidden, pre_cell):
             shape=[self._input_size + self._hiden_size, 4 * self._hiden_size],
             dtype=self._dtype)
 
-        self._bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[4 * self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
+        self._bias = self.create_parameter(attr=self._bias_attr,
+                                           shape=[4 * self._hiden_size],
+                                           dtype=self._dtype,
+                                           is_bias=True)
 
     def forward(self, input, pre_hidden, pre_cell):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py
index c5b9b9e71f6be..62b98e75ea1d2 100644
--- a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py
+++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py
@@ -51,8 +51,9 @@ def check_finite_and_unscale(x, scale, name=None, float_status=None):
                                  'check_finite_and_unscale')
         inputs['FloatStatus'] = float_status
     outputs = {'Out': x, 'FoundInfinite': found_inf}
-    helper.append_op(
-        type='check_finite_and_unscale', inputs=inputs, outputs=outputs)
+    helper.append_op(type='check_finite_and_unscale',
+                     inputs=inputs,
+                     outputs=outputs)
 
     return x, found_inf
 
@@ -136,7 +137,9 @@ def update_loss_scaling(x,
     else:
         attrs['stop_update'] = stop_update
 
-    helper.append_op(
-        type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type='update_loss_scaling',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
 
     return x
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py
index 0fb86593b2d62..d2528c0e11eda 100644
--- a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py
@@ -34,8 +34,9 @@
     "cast_parameters_to_bf16", "convert_float_to_uint16"
 ]
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 _valid_types = [
     core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.SELECTED_ROWS,
@@ -102,15 +103,14 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                         persistable=False,
                         stop_gradient=in_var.stop_gradient)
 
-                    block._insert_op(
-                        idx,
-                        type="cast",
-                        inputs={"X": in_var},
-                        outputs={"Out": out_var},
-                        attrs={
-                            "in_dtype": in_var.dtype,
-                            "out_dtype": out_var.dtype
-                        })
+                    block._insert_op(idx,
+                                     type="cast",
+                                     inputs={"X": in_var},
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "in_dtype": in_var.dtype,
+                                         "out_dtype": out_var.dtype
+                                     })
                     num_cast_ops += 1
                 _rename_arg(op, in_var.name, out_var.name)
             else:
@@ -146,18 +146,18 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name,
     cast_name = target_var.name + '.cast_' + _dtype_to_str(dest_dtype)
     cast_var = block.vars.get(cast_name)
     if cast_var is None or cast_var.dtype != dest_dtype:
-        cast_var = block.create_var(
-            name=cast_name,
-            dtype=dest_dtype,
-            persistable=False,
-            stop_gradient=target_var.stop_gradient)
-        block._insert_op(
-            idx,
-            type="cast",
-            inputs={"X": target_var},
-            outputs={"Out": cast_var},
-            attrs={"in_dtype": target_var.dtype,
-                   "out_dtype": cast_var.dtype})
+        cast_var = block.create_var(name=cast_name,
+                                    dtype=dest_dtype,
+                                    persistable=False,
+                                    stop_gradient=target_var.stop_gradient)
+        block._insert_op(idx,
+                         type="cast",
+                         inputs={"X": target_var},
+                         outputs={"Out": cast_var},
+                         attrs={
+                             "in_dtype": target_var.dtype,
+                             "out_dtype": cast_var.dtype
+                         })
         num_cast_ops += 1
         op_var_rename_map[block.idx][target_var.name] = cast_var.name
 
@@ -363,8 +363,8 @@ def cast_model_to_bf16(program,
                         out_var.desc.set_dtype(core.VarDesc.VarType.BF16)
 
                     _logger.debug(
-                        "-- op type: {}, out var name: {}, out var dtype: {} --".
-                        format(op.type, out_var_name, out_var.dtype))
+                        "-- op type: {}, out var name: {}, out var dtype: {} --"
+                        .format(op.type, out_var_name, out_var.dtype))
             for attr_name in ['in_dtype', 'out_dtype', 'dtype']:
                 if op.has_attr(attr_name) and op.attr(
                         attr_name) == core.VarDesc.VarType.FP32:
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
index 4189abda0588f..41fce89a9e9da 100644
--- a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
@@ -104,8 +104,9 @@ def backward(self,
             if loss.dtype != core.VarDesc.VarType.FP32:
                 loss = loss.astype('float32')
 
-            params_grads = self._optimizer.backward(
-                loss, startup_program, parameter_list, no_grad_set, callbacks)
+            params_grads = self._optimizer.backward(loss, startup_program,
+                                                    parameter_list, no_grad_set,
+                                                    callbacks)
         return params_grads
 
     def amp_init(self,
@@ -171,10 +172,9 @@ def run_example_code():
                                     self._to_bf16_var_names)
         if test_program is not None:
             if self._use_pure_bf16:
-                cast_model_to_bf16(
-                    test_program,
-                    amp_lists=self._amp_lists,
-                    use_bf16_guard=self._use_bf16_guard)
+                cast_model_to_bf16(test_program,
+                                   amp_lists=self._amp_lists,
+                                   use_bf16_guard=self._use_bf16_guard)
             elif use_bf16_test:
                 rewrite_program_bf16(test_program, amp_lists=self._amp_lists)
 
@@ -223,11 +223,10 @@ def minimize(self,
                 "The decorated optimizer has its own `minimize` method, but it will not be executed."
             )
 
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        params_grads = self.backward(loss,
+                                     startup_program=startup_program,
+                                     parameter_list=parameter_list,
+                                     no_grad_set=no_grad_set)
 
         optimize_ops = self.apply_optimize(loss, startup_program, params_grads)
 
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index c3720396e1d77..787a4e90a0f43 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -171,15 +171,18 @@ def backward(self,
 
         # NOTE(zhiqiu): _float_status is only used for NPU.
         if core.is_compiled_with_npu():
-            float_status = paddle.static.data(
-                name="float_status", shape=[8], dtype='float32')
+            float_status = paddle.static.data(name="float_status",
+                                              shape=[8],
+                                              dtype='float32')
             self._train_program.global_block().append_op(
                 type="alloc_float_status",
-                outputs={"FloatStatus": float_status}, )
+                outputs={"FloatStatus": float_status},
+            )
             self._train_program.global_block().append_op(
                 type="clear_float_status",
                 inputs={"FloatStatus": float_status},
-                outputs={"FloatStatusOut": float_status}, )
+                outputs={"FloatStatusOut": float_status},
+            )
             self._float_status = float_status
         else:
             self._float_status = None
@@ -202,9 +205,10 @@ def backward(self,
             else:
                 self._scaled_loss = loss
 
-            params_grads = self._optimizer.backward(
-                self._scaled_loss, startup_program, parameter_list, no_grad_set,
-                callbacks)
+            params_grads = self._optimizer.backward(self._scaled_loss,
+                                                    startup_program,
+                                                    parameter_list, no_grad_set,
+                                                    callbacks)
             if self._supports_check_nan_inf():
                 self._add_cast_ops_to_startup_program(startup_program)
         return params_grads
@@ -221,16 +225,16 @@ def _add_cast_ops_to_startup_program(self, startup_program):
                 continue
 
             tmp = block.create_var(dtype=core.VarDesc.VarType.FP32)
-            block.append_op(
-                type='assign', inputs={'X': [name]}, outputs={'Out': [tmp]})
-            block.append_op(
-                type='cast',
-                inputs={'X': [tmp]},
-                outputs={'Out': [name]},
-                attrs={
-                    'in_dtype': core.VarDesc.VarType.FP32,
-                    'out_dtype': core.VarDesc.VarType.FP16,
-                })
+            block.append_op(type='assign',
+                            inputs={'X': [name]},
+                            outputs={'Out': [tmp]})
+            block.append_op(type='cast',
+                            inputs={'X': [tmp]},
+                            outputs={'Out': [name]},
+                            attrs={
+                                'in_dtype': core.VarDesc.VarType.FP32,
+                                'out_dtype': core.VarDesc.VarType.FP16,
+                            })
         self._to_fp16_var_names = None
 
     def amp_init(self,
@@ -342,13 +346,13 @@ def apply_gradients(self, params_grads):
         real_optimizer = self._optimizer
         while hasattr(real_optimizer, "inner_opt"):
             real_optimizer = real_optimizer.inner_opt
-        if isinstance(real_optimizer, (paddle.fluid.optimizer.Adam,
-                                       paddle.optimizer.AdamW)):
+        if isinstance(real_optimizer,
+                      (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW)):
             # NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we
             # copy it in advance to avoid multiple time copies.
             with self._train_program._optimized_guard([]):
-                found_inf = paddle.tensor.creation._memcpy(found_inf,
-                                                           paddle.CPUPlace())
+                found_inf = paddle.tensor.creation._memcpy(
+                    found_inf, paddle.CPUPlace())
             real_optimizer._set_auxiliary_var('found_inf', found_inf)
         elif hasattr(real_optimizer, "_set_auxiliary_var"):
             real_optimizer._set_auxiliary_var('found_inf', found_inf)
@@ -382,7 +386,9 @@ def _check_finite_and_unscale(self, params_grads):
                 for p, g in params_grads:
                     with self._train_program._optimized_guard([p, g]):
                         _, found_inf = check_finite_and_unscale(
-                            [g, ],
+                            [
+                                g,
+                            ],
                             self._loss_scaling,
                             name="find_infinite_scale",
                             float_status=self._float_status)
@@ -441,45 +447,42 @@ def _add_dynamic_loss_scaling(self, params_grads, found_inf):
             stop_update = False
             with self._train_program._optimized_guard([]):
                 if fp32_grads:
-                    update_loss_scaling(
-                        fp32_grads,
-                        found_inf,
-                        self._loss_scaling,
-                        self._num_good_steps,
-                        self._num_bad_steps,
-                        self._incr_every_n_steps,
-                        self._decr_every_n_nan_or_inf,
-                        self._incr_ratio,
-                        self._decr_ratio,
-                        stop_update=stop_update,
-                        name="update_loss_scaling_fp32")
+                    update_loss_scaling(fp32_grads,
+                                        found_inf,
+                                        self._loss_scaling,
+                                        self._num_good_steps,
+                                        self._num_bad_steps,
+                                        self._incr_every_n_steps,
+                                        self._decr_every_n_nan_or_inf,
+                                        self._incr_ratio,
+                                        self._decr_ratio,
+                                        stop_update=stop_update,
+                                        name="update_loss_scaling_fp32")
                     stop_update = True
                 if fp16_grads:
-                    update_loss_scaling(
-                        fp16_grads,
-                        found_inf,
-                        self._loss_scaling,
-                        self._num_good_steps,
-                        self._num_bad_steps,
-                        self._incr_every_n_steps,
-                        self._decr_every_n_nan_or_inf,
-                        self._incr_ratio,
-                        self._decr_ratio,
-                        stop_update=stop_update,
-                        name="update_loss_scaling_fp16")
+                    update_loss_scaling(fp16_grads,
+                                        found_inf,
+                                        self._loss_scaling,
+                                        self._num_good_steps,
+                                        self._num_bad_steps,
+                                        self._incr_every_n_steps,
+                                        self._decr_every_n_nan_or_inf,
+                                        self._incr_ratio,
+                                        self._decr_ratio,
+                                        stop_update=stop_update,
+                                        name="update_loss_scaling_fp16")
         else:
             with self._train_program._optimized_guard([]):
-                update_loss_scaling(
-                    grads,
-                    found_inf,
-                    self._loss_scaling,
-                    self._num_good_steps,
-                    self._num_bad_steps,
-                    self._incr_every_n_steps,
-                    self._decr_every_n_nan_or_inf,
-                    self._incr_ratio,
-                    self._decr_ratio,
-                    name="update_loss_scaling")
+                update_loss_scaling(grads,
+                                    found_inf,
+                                    self._loss_scaling,
+                                    self._num_good_steps,
+                                    self._num_bad_steps,
+                                    self._incr_every_n_steps,
+                                    self._decr_every_n_nan_or_inf,
+                                    self._incr_ratio,
+                                    self._decr_ratio,
+                                    name="update_loss_scaling")
 
     def apply_optimize(self, loss, startup_program, params_grads):
         program = loss.block.program
@@ -514,11 +517,10 @@ def minimize(self,
                 "The decorated optimizer has its own `minimize` method, but it will not be executed."
             )
 
-        scaled_params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        scaled_params_grads = self.backward(loss,
+                                            startup_program=startup_program,
+                                            parameter_list=parameter_list,
+                                            no_grad_set=no_grad_set)
 
         optimize_ops = self.apply_optimize(loss, startup_program,
                                            scaled_params_grads)
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 0100866806cdc..b23c94c7e4994 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -27,8 +27,9 @@
 
 __all__ = ["fp16_guard", "cast_model_to_fp16", "cast_parameters_to_fp16"]
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 _valid_types = [
     core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.SELECTED_ROWS,
@@ -147,8 +148,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
     num_cast_ops = 0
 
     for in_name in op.input_names:
-        if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input(op,
-                                                                       in_name):
+        if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input(
+                op, in_name):
             continue
         for in_var_name in op.input(in_name):
             in_var = block._find_var_recursive(in_var_name)
@@ -185,17 +186,18 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                         persistable=False,
                         stop_gradient=in_var.stop_gradient)
 
-                    block._insert_op_without_sync(
-                        idx,
-                        type="cast",
-                        inputs={"X": in_var},
-                        outputs={"Out": out_var},
-                        attrs={
-                            "in_dtype": in_var.dtype,
-                            "out_dtype": out_var.dtype,
-                            "op_device": op_device,
-                            "op_role": op.attr("op_role"),
-                        })
+                    block._insert_op_without_sync(idx,
+                                                  type="cast",
+                                                  inputs={"X": in_var},
+                                                  outputs={"Out": out_var},
+                                                  attrs={
+                                                      "in_dtype": in_var.dtype,
+                                                      "out_dtype":
+                                                      out_var.dtype,
+                                                      "op_device": op_device,
+                                                      "op_role":
+                                                      op.attr("op_role"),
+                                                  })
                     num_cast_ops += 1
                 _rename_arg(op, in_var.name, out_var.name)
             else:
@@ -231,22 +233,20 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name,
     cast_name = target_var.name + '.cast_' + _dtype_to_str(dest_dtype)
     cast_var = block.vars.get(cast_name)
     if cast_var is None or cast_var.dtype != dest_dtype:
-        cast_var = block.create_var(
-            name=cast_name,
-            dtype=dest_dtype,
-            persistable=False,
-            stop_gradient=target_var.stop_gradient)
-        block._insert_op(
-            idx,
-            type="cast",
-            inputs={"X": target_var},
-            outputs={"Out": cast_var},
-            attrs={
-                "in_dtype": target_var.dtype,
-                "out_dtype": cast_var.dtype,
-                "op_device": op.attr("op_device"),
-                "op_role": op.attr("op_role"),
-            })
+        cast_var = block.create_var(name=cast_name,
+                                    dtype=dest_dtype,
+                                    persistable=False,
+                                    stop_gradient=target_var.stop_gradient)
+        block._insert_op(idx,
+                         type="cast",
+                         inputs={"X": target_var},
+                         outputs={"Out": cast_var},
+                         attrs={
+                             "in_dtype": target_var.dtype,
+                             "out_dtype": cast_var.dtype,
+                             "op_device": op.attr("op_device"),
+                             "op_role": op.attr("op_role"),
+                         })
         num_cast_ops += 1
         op_var_rename_map[block.idx][target_var.name] = cast_var.name
 
@@ -474,8 +474,8 @@ def cast_model_to_fp16(program, amp_lists=None, use_fp16_guard=True):
                         out_var.desc.set_dtype(core.VarDesc.VarType.FP16)
 
                     _logger.debug(
-                        "-- op type: {}, out var name: {}, out var dtype: {} --".
-                        format(op.type, out_var_name, out_var.dtype))
+                        "-- op type: {}, out var name: {}, out var dtype: {} --"
+                        .format(op.type, out_var_name, out_var.dtype))
             if op.has_attr('in_dtype') and op.attr(
                     'in_dtype') == core.VarDesc.VarType.FP32:
                 op._set_attr('in_dtype', core.VarDesc.VarType.FP16)
@@ -696,13 +696,12 @@ def update_role_var_grad(main_prog, params_grads):
             # add new op in the python and cpp at the same time
             new_op_desc = block.desc.append_op()
             new_op_desc.copy_from(op.desc)
-            new_op = framework.Operator(
-                block=block,
-                desc=new_op_desc,
-                type=None,
-                inputs=None,
-                outputs=None,
-                attrs=None)
+            new_op = framework.Operator(block=block,
+                                        desc=new_op_desc,
+                                        type=None,
+                                        inputs=None,
+                                        outputs=None,
+                                        attrs=None)
             block.ops.append(new_op)
             op_idx = find_op_index(block.desc, op.desc)
             if op_idx == -1:
diff --git a/python/paddle/fluid/contrib/model_stat.py b/python/paddle/fluid/contrib/model_stat.py
index 11ab8800f287f..ed6d82671f24f 100644
--- a/python/paddle/fluid/contrib/model_stat.py
+++ b/python/paddle/fluid/contrib/model_stat.py
@@ -200,8 +200,8 @@ def _print_summary(summary_table, total):
     parmas = total['params']
     flops = total['flops']
     print(summary_table)
-    print('Total PARAMs: {}({:.4f}M)'.format(
-        sum(parmas), sum(parmas) / (10**6)))
+    print('Total PARAMs: {}({:.4f}M)'.format(sum(parmas),
+                                             sum(parmas) / (10**6)))
     print('Total FLOPs: {}({:.2f}G)'.format(sum(flops), sum(flops) / 10**9))
     print(
         "Notice: \n now supported ops include [Conv, DepthwiseConv, FC(mul), BatchNorm, Pool, Activation(sigmoid, tanh, relu, leaky_relu, prelu)]"
diff --git a/python/paddle/fluid/contrib/op_frequence.py b/python/paddle/fluid/contrib/op_frequence.py
index 68dd0a946b4b6..ec9b7b1073d9e 100644
--- a/python/paddle/fluid/contrib/op_frequence.py
+++ b/python/paddle/fluid/contrib/op_frequence.py
@@ -96,9 +96,11 @@ def op_freq_statistic(program):
             else:
                 adj_2_op_freq[op_op] = 1
 
-    uni_op_freq = sorted(
-        uni_op_freq.items(), key=lambda item: item[1], reverse=True)
-    adj_2_op_freq = sorted(
-        adj_2_op_freq.items(), key=lambda item: item[1], reverse=True)
+    uni_op_freq = sorted(uni_op_freq.items(),
+                         key=lambda item: item[1],
+                         reverse=True)
+    adj_2_op_freq = sorted(adj_2_op_freq.items(),
+                           key=lambda item: item[1],
+                           reverse=True)
 
     return uni_op_freq, adj_2_op_freq
diff --git a/python/paddle/fluid/contrib/optimizer.py b/python/paddle/fluid/contrib/optimizer.py
index 1b3ec21bf3c82..9265198485c78 100644
--- a/python/paddle/fluid/contrib/optimizer.py
+++ b/python/paddle/fluid/contrib/optimizer.py
@@ -118,12 +118,11 @@ def __init__(self,
         assert momentum is not None
         predicate = lambda regular: isinstance(regular, L2DecayRegularizer)
         py_regular = None if predicate(regularization) else regularization
-        super(Momentum, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=py_regular,
-            grad_clip=grad_clip,
-            name=name)
+        super(Momentum, self).__init__(learning_rate=learning_rate,
+                                       parameter_list=parameter_list,
+                                       regularization=py_regular,
+                                       grad_clip=grad_clip,
+                                       name=name)
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
@@ -141,21 +140,19 @@ def _create_master_weight(self, param):
 
         var_name = param.name + "_fp32_master"
         var_name = unique_name.generate(var_name)
-        var = layers.create_global_var(
-            name=var_name,
-            shape=param.shape,
-            value=0,
-            dtype='float32',
-            persistable=True)
+        var = layers.create_global_var(name=var_name,
+                                       shape=param.shape,
+                                       value=0,
+                                       dtype='float32',
+                                       persistable=True)
         block = self.helper.startup_program.global_block()
-        block.append_op(
-            type="cast",
-            inputs={"X": [param]},
-            outputs={"Out": [var]},
-            attrs={
-                "in_dtype": param.dtype,
-                "out_dtype": core.VarDesc.VarType.FP32
-            })
+        block.append_op(type="cast",
+                        inputs={"X": [param]},
+                        outputs={"Out": [var]},
+                        attrs={
+                            "in_dtype": param.dtype,
+                            "out_dtype": core.VarDesc.VarType.FP32
+                        })
         self._master_weights[param.name] = var
         return var
 
@@ -175,10 +172,11 @@ def _get_accumulator(self, name, param):
         target_param = self._master_weights[
             param.name] if find_master else param
         target_name = target_param.name
-        if (name not in self._accumulators or
-                target_name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, target_name))
+        if (name not in self._accumulators
+                or target_name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, target_name))
         return self._accumulators[name][target_name]
 
     def _create_accumulators(self, block, parameters):
@@ -242,11 +240,10 @@ def _append_optimize_op(self, block, param_and_grad):
             outputs["MasterParamOut"] = master_weight
 
         # create the momentum optimize op
-        momentum_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        momentum_op = block.append_op(type=self.type,
+                                      inputs=inputs,
+                                      outputs=outputs,
+                                      attrs=attrs,
+                                      stop_gradient=True)
 
         return momentum_op
diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
index 807d3c6a43078..de4c10040862a 100644
--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
@@ -78,6 +78,7 @@ def quant(x, scale, num_bits):
 
 
 class QuantizeTranspiler(object):
+
     def __init__(self,
                  weight_bits=8,
                  activation_bits=8,
@@ -280,19 +281,20 @@ def _insert_post_dequant_op(block, op):
                 raise ValueError("Only support one output, but op %s has"
                                  " more than one output." % (op.type))
             out_var = block.var(op.output_arg_names[0])
-            dequant_var = block.create_var(
-                name=_dequantized_var_name(out_var.name),
-                type=out_var.type,
-                shape=out_var.shape,
-                dtype=out_var.dtype)
+            dequant_var = block.create_var(name=_dequantized_var_name(
+                out_var.name),
+                                           type=out_var.type,
+                                           shape=out_var.shape,
+                                           dtype=out_var.dtype)
             # insert fake_dequantize_op
-            dequant_op = block._insert_op(
-                idx + 1,
-                type="fake_dequantize_max_abs",
-                attrs={'max_range': float(max_range)},
-                inputs={"X": out_var,
-                        'Scale': scale_var},
-                outputs={"Out": dequant_var})
+            dequant_op = block._insert_op(idx + 1,
+                                          type="fake_dequantize_max_abs",
+                                          attrs={'max_range': float(max_range)},
+                                          inputs={
+                                              "X": out_var,
+                                              'Scale': scale_var
+                                          },
+                                          outputs={"Out": dequant_var})
             op_out_rename_map[block_id][out_var.name] = dequant_var.name
             return dequant_var
 
@@ -406,40 +408,37 @@ def _remove_unused_var(self, program):
     def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
         """Insert fake_quantize_abs_max op.
         """
-        quant_var = block.create_var(
-            name=_quantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        scale = block.create_var(
-            name=_quantized_scale_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        quant_op = block._insert_op(
-            idx,
-            type='fake_quantize_abs_max',
-            attrs={'bit_length': quant_bits},
-            inputs={'X': var},
-            outputs={'Out': quant_var,
-                     'OutScale': scale})
+        quant_var = block.create_var(name=_quantized_var_name(var.name),
+                                     type=var.type,
+                                     shape=var.shape,
+                                     dtype=var.dtype)
+        scale = block.create_var(name=_quantized_scale_name(var.name),
+                                 type=var.type,
+                                 shape=var.shape,
+                                 dtype=var.dtype)
+        quant_op = block._insert_op(idx,
+                                    type='fake_quantize_abs_max',
+                                    attrs={'bit_length': quant_bits},
+                                    inputs={'X': var},
+                                    outputs={
+                                        'Out': quant_var,
+                                        'OutScale': scale
+                                    })
         return quant_var, scale
 
     def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
         """Insert fake_quantize_range_abs_max
         """
-        quant_var = block.create_var(
-            name=_quantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
-        scale = self.helper.create_parameter(
-            attr=ParamAttr(
-                name=_quantized_scale_name(var.name),
-                initializer=Constant(0.001),
-                trainable=False),
-            shape=[1],
-            dtype=var.dtype)
+        quant_var = block.create_var(name=_quantized_var_name(var.name),
+                                     type=var.type,
+                                     shape=var.shape,
+                                     dtype=var.dtype)
+        scale = self.helper.create_parameter(attr=ParamAttr(
+            name=_quantized_scale_name(var.name),
+            initializer=Constant(0.001),
+            trainable=False),
+                                             shape=[1],
+                                             dtype=var.dtype)
         scale.stop_gradient = True
 
         ins = {'X': var, 'InScale': scale}
@@ -451,8 +450,8 @@ def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
                 persistable=True,
                 dtype=var.dtype,
                 shape=[self.window_size])
-            self.helper.set_variable_initializer(
-                scales, initializer=Constant(value=0))
+            self.helper.set_variable_initializer(scales,
+                                                 initializer=Constant(value=0))
 
             ins['Iter'] = self.global_step
             outs['OutScales'] = scales
@@ -463,12 +462,11 @@ def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
             'is_test': self.is_test
         }
 
-        quant_op = block._insert_op(
-            idx,
-            type='fake_quantize_range_abs_max',
-            attrs=attrs,
-            inputs=ins,
-            outputs=outs)
+        quant_op = block._insert_op(idx,
+                                    type='fake_quantize_range_abs_max',
+                                    attrs=attrs,
+                                    inputs=ins,
+                                    outputs=outs)
 
         return quant_var, scale
 
@@ -476,32 +474,30 @@ def _insert_quant_moving_average_abs_max_op(self, block, idx, var,
                                                 quant_bits):
         """Insert fake_quantize_moving_average_abs_max
         """
-        quant_var = block.create_var(
-            name=_quantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
+        quant_var = block.create_var(name=_quantized_var_name(var.name),
+                                     type=var.type,
+                                     shape=var.shape,
+                                     dtype=var.dtype)
         state = self.helper.create_global_variable(
             name=unique_name.generate('state'),
             persistable=True,
             dtype=var.dtype,
             shape=[1])
-        self.helper.set_variable_initializer(
-            state, initializer=Constant(value=1))
+        self.helper.set_variable_initializer(state,
+                                             initializer=Constant(value=1))
         accum = self.helper.create_global_variable(
             name=unique_name.generate('accum'),
             persistable=True,
             dtype=var.dtype,
             shape=[1])
-        self.helper.set_variable_initializer(
-            accum, initializer=Constant(value=1))
-        scale = self.helper.create_parameter(
-            attr=ParamAttr(
-                name=_quantized_scale_name(var.name),
-                initializer=Constant(0.001),
-                trainable=False),
-            shape=[1],
-            dtype=var.dtype)
+        self.helper.set_variable_initializer(accum,
+                                             initializer=Constant(value=1))
+        scale = self.helper.create_parameter(attr=ParamAttr(
+            name=_quantized_scale_name(var.name),
+            initializer=Constant(0.001),
+            trainable=False),
+                                             shape=[1],
+                                             dtype=var.dtype)
         scale.stop_gradient = True
 
         ins = {'X': var, 'InScale': scale}
@@ -518,12 +514,11 @@ def _insert_quant_moving_average_abs_max_op(self, block, idx, var,
             'is_test': self.is_test
         }
 
-        quant_op = block._insert_op(
-            idx,
-            type='fake_quantize_moving_average_abs_max',
-            attrs=attrs,
-            inputs=ins,
-            outputs=outs)
+        quant_op = block._insert_op(idx,
+                                    type='fake_quantize_moving_average_abs_max',
+                                    attrs=attrs,
+                                    inputs=ins,
+                                    outputs=outs)
 
         return quant_var, scale
 
@@ -537,25 +532,25 @@ def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
             return self._insert_quant_range_abs_max_op(block, idx, var,
                                                        quant_bits)
         elif quant_type == 'moving_average_abs_max':
-            return self._insert_quant_moving_average_abs_max_op(block, idx, var,
-                                                                quant_bits)
+            return self._insert_quant_moving_average_abs_max_op(
+                block, idx, var, quant_bits)
 
     def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
         """
         Insert fake_quantize_op
         """
-        dequant_var = block.create_var(
-            name=_dequantized_var_name(var.name),
-            type=var.type,
-            shape=var.shape,
-            dtype=var.dtype)
+        dequant_var = block.create_var(name=_dequantized_var_name(var.name),
+                                       type=var.type,
+                                       shape=var.shape,
+                                       dtype=var.dtype)
         # insert fake_dequantize_op
         max_range = (1 << (quant_bits - 1)) - 1
-        dequant_op = block._insert_op(
-            idx,
-            type="fake_dequantize_max_abs",
-            attrs={'max_range': float(max_range)},
-            inputs={"X": var,
-                    'Scale': scale},
-            outputs={"Out": dequant_var})
+        dequant_op = block._insert_op(idx,
+                                      type="fake_dequantize_max_abs",
+                                      attrs={'max_range': float(max_range)},
+                                      inputs={
+                                          "X": var,
+                                          'Scale': scale
+                                      },
+                                      outputs={"Out": dequant_var})
         return dequant_var
diff --git a/python/paddle/fluid/contrib/slim/quantization/adaround.py b/python/paddle/fluid/contrib/slim/quantization/adaround.py
index f6908d7e836a7..be3201044f6c3 100644
--- a/python/paddle/fluid/contrib/slim/quantization/adaround.py
+++ b/python/paddle/fluid/contrib/slim/quantization/adaround.py
@@ -22,24 +22,29 @@
 from ....log_helper import get_logger
 from .utils import load_variable_data, set_variable_data, stable_sigmoid, quant_tensor, dequant_tensor, _channelwise_quant_axis1_ops, calculate_quant_cos_error
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 GAMMA = -0.1
 ZETA = 1.1
 
 
 def compute_soft_rounding(alpha_v):
-    return fluid.layers.clip(
-        fluid.layers.sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, min=0, max=1)
+    return fluid.layers.clip(fluid.layers.sigmoid(alpha_v) * (ZETA - GAMMA) +
+                             GAMMA,
+                             min=0,
+                             max=1)
 
 
 def compute_soft_rounding_np(alpha_v):
-    return np.clip(
-        stable_sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, a_min=0, a_max=1)
+    return np.clip(stable_sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA,
+                   a_min=0,
+                   a_max=1)
 
 
 class AdaRoundLoss(object):
+
     def __init__(self, reg_param=0.01, default_beta_range=(20, 2)):
         self.default_reg_param = reg_param
         self.default_beta_range = default_beta_range
@@ -48,26 +53,29 @@ def compute_recon_loss(self, ada_quantized_output, orig_output):
         square_cost = fluid.layers.square_error_cost(ada_quantized_output,
                                                      orig_output)
         recon_loss = fluid.layers.reduce_mean(
-            fluid.layers.reduce_sum(
-                square_cost, dim=-1))
+            fluid.layers.reduce_sum(square_cost, dim=-1))
         return recon_loss
 
     def compute_round_loss(self, alpha_v, warm_start, beta):
+
         def round_loss_fn():
             # compute rectified sigmoid of parameter 'alpha' which maps it between zero and one
             h_v = compute_soft_rounding(alpha_v)
 
             # calculate regularization term - which ensures parameter to converge to exactly zeros and ones
             # at the end of optimization
-            reg_term = fluid.layers.reduce_sum(-fluid.layers.pow(
-                fluid.layers.abs(2 * h_v - 1), factor=beta) + 1)
+            reg_term = fluid.layers.reduce_sum(
+                -fluid.layers.pow(fluid.layers.abs(2 * h_v - 1), factor=beta) +
+                1)
 
             # calculate the rounding loss
             round_loss = self.default_reg_param * reg_term
 
             return round_loss
 
-        round_loss = fluid.layers.cond(warm_start, lambda: fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.0), round_loss_fn)
+        round_loss = fluid.layers.cond(
+            warm_start, lambda: fluid.layers.fill_constant(
+                shape=[1], dtype='float32', value=0.0), round_loss_fn)
 
         return round_loss
 
@@ -80,15 +88,16 @@ def compute_beta(self, max_iter, cur_iter, warm_start):
         warm_start_end_iter = warm_start * max_iter
 
         # compute relative iteration of current iteration
-        rel_iter = (cur_iter - warm_start_end_iter) / (
-            max_iter - warm_start_end_iter)
-        beta = end_beta + 0.5 * (start_beta - end_beta) * (1 + np.cos(rel_iter *
-                                                                      np.pi))
+        rel_iter = (cur_iter - warm_start_end_iter) / (max_iter -
+                                                       warm_start_end_iter)
+        beta = end_beta + 0.5 * (start_beta -
+                                 end_beta) * (1 + np.cos(rel_iter * np.pi))
 
         return beta
 
 
 class AdaRound(object):
+
     def __init__(self,
                  scale,
                  weight_tensor,
@@ -145,10 +154,9 @@ def _calculate_quant_weight(self):
         h_alpha = compute_soft_rounding_np(np_alpha)
 
         # Scale the tensor
-        tensor_scale = quant_tensor(
-            self.ori_weight_tensor.copy(),
-            self.scale,
-            quant_axis=self.quant_axis)
+        tensor_scale = quant_tensor(self.ori_weight_tensor.copy(),
+                                    self.scale,
+                                    quant_axis=self.quant_axis)
 
         weight_tensor = np.floor(tensor_scale)
 
@@ -160,10 +168,10 @@ def _calculate_adarounded_weights(self):
         weight_tensor_quant = self._calculate_quant_weight()
 
         # Dequantize the tensor
-        weight_tensor_dequant = dequant_tensor(
-            weight_tensor_quant + self.offset,
-            self.scale,
-            quant_axis=self.quant_axis)
+        weight_tensor_dequant = dequant_tensor(weight_tensor_quant +
+                                               self.offset,
+                                               self.scale,
+                                               quant_axis=self.quant_axis)
         return weight_tensor_dequant
 
     def update_final_weights(self):
@@ -171,10 +179,10 @@ def update_final_weights(self):
         return weight_tensor_quant
 
     def get_loss(self, beta, warm_start, adaround_out_tensor, orig_out_tensor):
-        round_loss = self.adaround_loss.compute_round_loss(self.alpha_v,
-                                                           warm_start, beta)
-        recon_loss = self.adaround_loss.compute_recon_loss(adaround_out_tensor,
-                                                           orig_out_tensor)
+        round_loss = self.adaround_loss.compute_round_loss(
+            self.alpha_v, warm_start, beta)
+        recon_loss = self.adaround_loss.compute_recon_loss(
+            adaround_out_tensor, orig_out_tensor)
         loss = round_loss + recon_loss
         losses = {
             'loss': loss,
@@ -226,29 +234,29 @@ def run_adaround(data_loader,
         with fluid.program_guard(train_program, startup_program):
             with fluid.unique_name.guard():
                 # initialize adaround
-                adaround = AdaRound(
-                    scale,
-                    weight_var_tensor,
-                    scope=scope,
-                    weight_var_name=weight_var_name,
-                    weight_op_type=weight_op_type,
-                    num_iterations=num_iterations)
-                orig_out_tensor = fluid.data(
-                    name='orig_out_tensor',
-                    shape=fp32_fetch_list.shape,
-                    dtype='float32')
-                adaround_out_tensor = fluid.data(
-                    name='adaround_out_tensor',
-                    shape=fp32_fetch_list.shape,
-                    dtype='float32')
-                beta_tensor = fluid.data(
-                    name='beta', shape=[1], dtype='float32')
-                warm_start_tensor = fluid.data(
-                    name='warm_start', shape=[1], dtype='bool')
-
-                train_fetches_loss = adaround.get_loss(
-                    beta_tensor, warm_start_tensor, adaround_out_tensor,
-                    orig_out_tensor)
+                adaround = AdaRound(scale,
+                                    weight_var_tensor,
+                                    scope=scope,
+                                    weight_var_name=weight_var_name,
+                                    weight_op_type=weight_op_type,
+                                    num_iterations=num_iterations)
+                orig_out_tensor = fluid.data(name='orig_out_tensor',
+                                             shape=fp32_fetch_list.shape,
+                                             dtype='float32')
+                adaround_out_tensor = fluid.data(name='adaround_out_tensor',
+                                                 shape=fp32_fetch_list.shape,
+                                                 dtype='float32')
+                beta_tensor = fluid.data(name='beta',
+                                         shape=[1],
+                                         dtype='float32')
+                warm_start_tensor = fluid.data(name='warm_start',
+                                               shape=[1],
+                                               dtype='bool')
+
+                train_fetches_loss = adaround.get_loss(beta_tensor,
+                                                       warm_start_tensor,
+                                                       adaround_out_tensor,
+                                                       orig_out_tensor)
                 optimizer = fluid.optimizer.Adam(learning_rate=lr)
                 loss = train_fetches_loss['loss']
                 optimizer.minimize(loss)
@@ -291,11 +299,9 @@ def run_adaround(data_loader,
                 fetch_list=[v.name for v in train_fetches_loss.values()],
                 return_numpy=True)
             _logger.info(
-                "Iter {:d}, lr {:.5f}, loss {:.5f}, loss_round {:.5f}, loss_recon {:.5f}, time {:.5f}s".
-                format(i, lr,
-                       np.mean(out[0]),
-                       np.mean(out[1]),
-                       np.mean(out[2]), start_time - prev_start_time))
+                "Iter {:d}, lr {:.5f}, loss {:.5f}, loss_round {:.5f}, loss_recon {:.5f}, time {:.5f}s"
+                .format(i, lr, np.mean(out[0]), np.mean(out[1]),
+                        np.mean(out[2]), start_time - prev_start_time))
             sys.stdout.flush()
             if i == num_iterations:
                 break
diff --git a/python/paddle/fluid/contrib/slim/quantization/cal_kl_threshold.py b/python/paddle/fluid/contrib/slim/quantization/cal_kl_threshold.py
index 390859236d91c..69cd3f6406162 100644
--- a/python/paddle/fluid/contrib/slim/quantization/cal_kl_threshold.py
+++ b/python/paddle/fluid/contrib/slim/quantization/cal_kl_threshold.py
@@ -17,8 +17,9 @@
 import numpy as np
 from ....log_helper import get_logger
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 __all__ = ['cal_kl_threshold']
 
@@ -37,8 +38,8 @@ def expand_quantized_bins(quantized_bins, reference_bins):
         if zero_count == num_merged_bins:
             avg_bin_ele = 0
         else:
-            avg_bin_ele = quantized_bins[idx] / (
-                num_merged_bins - zero_count + 0.0)
+            avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count +
+                                                 0.0)
         for idx1 in range(j_start, j_end):
             expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0 else
                                              avg_bin_ele)
@@ -103,8 +104,8 @@ def cal_kl_threshold(hist, bin_width, bits):
         j_start = 0
         j_end = num_merged_bins
         for idx in range(quant_range):
-            candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[j_start:
-                                                                     j_end])
+            candidate_distr_Q_quantized[idx] = sum(
+                candidate_distr_Q[j_start:j_end])
             j_start += num_merged_bins
             j_end += num_merged_bins
             if (idx + 1) == quant_range - 1:
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py
index 1f7a01f17b066..4ae949bf0fe37 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py
@@ -82,8 +82,8 @@ def _fuse_layers(model, layers_list):
         layer_list.append(getattr(parent_layer, sub_name))
     new_layers = _fuse_func(layer_list)
     for i, item in enumerate(layers_list):
-        parent_layer, sub_name = utils.find_parent_layer_and_sub_name(model,
-                                                                      item)
+        parent_layer, sub_name = utils.find_parent_layer_and_sub_name(
+            model, item)
         setattr(parent_layer, sub_name, new_layers[i])
 
 
@@ -123,9 +123,10 @@ def _fuse_conv_bn_eval(conv, bn):
     assert (not (conv.training or bn.training)), "Fusion only for eval!"
     fused_conv = copy.deepcopy(conv)
 
-    fused_weight, fused_bias = _fuse_conv_bn_weights(
-        fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon,
-        bn.weight, bn.bias)
+    fused_weight, fused_bias = _fuse_conv_bn_weights(fused_conv.weight,
+                                                     fused_conv.bias, bn._mean,
+                                                     bn._variance, bn._epsilon,
+                                                     bn.weight, bn.bias)
     fused_conv.weight.set_value(fused_weight)
     if fused_conv.bias is None:
         fused_conv.bias = paddle.create_parameter(
@@ -166,9 +167,11 @@ def _fuse_linear_bn_eval(linear, bn):
     assert (not (linear.training or bn.training)), "Fusion only for eval!"
     fused_linear = copy.deepcopy(linear)
 
-    fused_weight, fused_bias = _fuse_linear_bn_weights(
-        fused_linear.weight, fused_linear.bias, bn._mean, bn._variance,
-        bn._epsilon, bn.weight, bn.bias)
+    fused_weight, fused_bias = _fuse_linear_bn_weights(fused_linear.weight,
+                                                       fused_linear.bias,
+                                                       bn._mean, bn._variance,
+                                                       bn._epsilon, bn.weight,
+                                                       bn.bias)
     fused_linear.weight.set_value(fused_weight)
     if fused_linear.bias is None:
         fused_linear.bias = paddle.create_parameter(
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py
index 5c595a8d38c92..cccc5d90fbab3 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py
@@ -31,8 +31,9 @@
 
 __all__ = ['ImperativePTQ']
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class ImperativePTQ(object):
@@ -155,12 +156,12 @@ def save_quantized_model(self, model, path, input_spec=None, **config):
         model_filename = basename + INFER_MODEL_SUFFIX
         params_filename = basename + INFER_PARAMS_SUFFIX
 
-        [infer_program, feed_target_names, fetch_targets] = (
-            paddle.fluid.io.load_inference_model(
-                dirname=dirname,
-                executor=exe,
-                model_filename=model_filename,
-                params_filename=params_filename))
+        [infer_program, feed_target_names,
+         fetch_targets] = (paddle.fluid.io.load_inference_model(
+             dirname=dirname,
+             executor=exe,
+             model_filename=model_filename,
+             params_filename=params_filename))
 
         # Process inference program
         self._clean_up(infer_program)
@@ -168,14 +169,13 @@ def save_quantized_model(self, model, path, input_spec=None, **config):
         self._remove_scale_op(infer_program)
 
         # Save final program
-        paddle.fluid.io.save_inference_model(
-            dirname=dirname,
-            feeded_var_names=feed_target_names,
-            target_vars=fetch_targets,
-            executor=exe,
-            main_program=infer_program.clone(),
-            model_filename=model_filename,
-            params_filename=params_filename)
+        paddle.fluid.io.save_inference_model(dirname=dirname,
+                                             feeded_var_names=feed_target_names,
+                                             target_vars=fetch_targets,
+                                             executor=exe,
+                                             main_program=infer_program.clone(),
+                                             model_filename=model_filename,
+                                             params_filename=params_filename)
 
         if is_dynamic_mode:
             paddle.disable_static()
@@ -310,8 +310,8 @@ def _wrap_simulated_layers(self, model):
                 assert hasattr(quant_layer, "_fake_quant_input")
                 assert hasattr(quant_layer._fake_quant_input, "_scale")
                 assert len(in_act_quantizer.thresholds) == 1
-                input_threshold = np.array(
-                    [in_act_quantizer.thresholds[0]], dtype=np.float32)
+                input_threshold = np.array([in_act_quantizer.thresholds[0]],
+                                           dtype=np.float32)
                 quant_layer._fake_quant_input._scale.set_value(input_threshold)
 
                 assert hasattr(quant_layer, "_fake_quant_weight")
@@ -319,11 +319,11 @@ def _wrap_simulated_layers(self, model):
                 assert len(wt_quantizer.thresholds) == 1
                 weight_threshold = wt_quantizer.thresholds[0]
                 if isinstance(weight_threshold, list):
-                    weight_threshold = np.array(
-                        weight_threshold, dtype=np.float32)
+                    weight_threshold = np.array(weight_threshold,
+                                                dtype=np.float32)
                 else:
-                    weight_threshold = np.array(
-                        [weight_threshold], dtype=np.float32)
+                    weight_threshold = np.array([weight_threshold],
+                                                dtype=np.float32)
                 quant_layer._fake_quant_weight._scale.set_value(
                     weight_threshold)
 
@@ -356,8 +356,8 @@ def _gather_input_thresholds(self, program, scope):
                     attr_name = previous_op.output('OutScale')[0]
                     in_threshold = utils.load_variable_data(scope, attr_name)
                     in_threshold = utils.fp_numpy_to_naive(in_threshold)
-                    argname, index = utils._get_input_name_index(op,
-                                                                 in_var_name)
+                    argname, index = utils._get_input_name_index(
+                        op, in_var_name)
                     op._set_attr(argname + str(index) + "_threshold",
                                  in_threshold)
                     op._set_attr("with_quant_attr", True)
@@ -417,7 +417,8 @@ def _helper(op, next_op, old_attr_name, new_attr_name):
                 old_attr_name = argname + str(index) + "_threshold"
 
                 argname, index = utils._get_output_name_index(
-                    next_op, next_op.output("Out")[0])
+                    next_op,
+                    next_op.output("Out")[0])
                 new_attr_name = argname + str(index) + "_threshold"
 
                 _helper(op, next_op, old_attr_name, new_attr_name)
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py
index 63b3578871710..0988f24a1837f 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py
@@ -60,18 +60,20 @@ def combine_abs_max_and_hist(tensor, origin_max, origin_hist, bins,
     if new_max == 0.0:
         return origin_max, origin_hist
     elif origin_max == 0.0:
-        new_hist, _ = np.histogram(
-            paddle.abs(tensor).numpy(), range=(0, new_max), bins=bins)
+        new_hist, _ = np.histogram(paddle.abs(tensor).numpy(),
+                                   range=(0, new_max),
+                                   bins=bins)
         new_hist = new_hist.astype(np.float32)
         return new_max, new_hist
     elif new_max <= origin_max:
-        new_hist, _ = np.histogram(
-            paddle.abs(tensor).numpy(), range=(0, origin_max), bins=bins)
+        new_hist, _ = np.histogram(paddle.abs(tensor).numpy(),
+                                   range=(0, origin_max),
+                                   bins=bins)
         new_hist = new_hist.astype(np.float32)
         new_hist += origin_hist
         return origin_max, new_hist
     else:
-        # bin_width = origin_max / (bins * upsample_bins) 
+        # bin_width = origin_max / (bins * upsample_bins)
         #           = new_max / (bins * downsample_bins)
         bin_width = origin_max / (bins * upsample_bins)
         downsampe_bins = int(math.ceil(new_max / (bins * bin_width)))
@@ -87,8 +89,9 @@ def combine_abs_max_and_hist(tensor, origin_max, origin_hist, bins,
         sampled_hist = (cumsumed_hist - shift_cumsumed_hist) / upsample_bins
         sampled_hist = sampled_hist.astype(np.float32)
 
-        new_hist, _ = np.histogram(
-            paddle.abs(tensor).numpy(), range=(0, new_max), bins=bins)
+        new_hist, _ = np.histogram(paddle.abs(tensor).numpy(),
+                                   range=(0, new_max),
+                                   bins=bins)
         new_hist = new_hist.astype(np.float32)
         new_hist += sampled_hist
 
@@ -193,10 +196,9 @@ def sample_data(self, layer, tensors):
                 if abs_max_vals[idx] == 0.0:
                     self.hists.append(None)
                 else:
-                    hist, _ = np.histogram(
-                        paddle.abs(tensor).numpy(),
-                        range=(0., abs_max_vals[idx]),
-                        bins=self.bins)
+                    hist, _ = np.histogram(paddle.abs(tensor).numpy(),
+                                           range=(0., abs_max_vals[idx]),
+                                           bins=self.bins)
                     hist = hist.astype(np.float32)
                     self.hists.append(hist)
         else:
@@ -228,6 +230,7 @@ def __init__(self,
         self.hist_percent = hist_percent
 
     def cal_thresholds(self):
+
         def _helper(abs_max, hist, percent):
             assert hist.ndim == 1 and percent < 1.0
             hist = hist / np.sum(hist, dtype=np.float64)
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index d5c3d9ab82d74..29f4707124c6c 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -37,8 +37,9 @@
 
 __all__ = ['ImperativeQuantAware']
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class ImperativeQuantAware(object):
@@ -46,19 +47,18 @@ class ImperativeQuantAware(object):
     Applying quantization aware training (QAT) to the dgraph model.
     """
 
-    def __init__(
-            self,
-            quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
-            weight_quantize_type='abs_max',
-            activation_quantize_type='moving_average_abs_max',
-            weight_bits=8,
-            activation_bits=8,
-            moving_rate=0.9,
-            fuse_conv_bn=False,
-            weight_preprocess_layer=None,
-            act_preprocess_layer=None,
-            weight_quantize_layer=None,
-            act_quantize_layer=None):
+    def __init__(self,
+                 quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='moving_average_abs_max',
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 fuse_conv_bn=False,
+                 weight_preprocess_layer=None,
+                 act_preprocess_layer=None,
+                 weight_quantize_layer=None,
+                 act_quantize_layer=None):
         """
         The constructor for ImperativeQuantAware.
 
@@ -280,18 +280,17 @@ class ImperativeQuantizeInputs(object):
     logic both for activation inputs and weight inputs.
     """
 
-    def __init__(
-            self,
-            quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
-            weight_quantize_type='abs_max',
-            activation_quantize_type='moving_average_abs_max',
-            weight_bits=8,
-            activation_bits=8,
-            moving_rate=0.9,
-            weight_preprocess_layer=None,
-            act_preprocess_layer=None,
-            weight_quantize_layer=None,
-            act_quantize_layer=None):
+    def __init__(self,
+                 quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'],
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='moving_average_abs_max',
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 weight_preprocess_layer=None,
+                 act_preprocess_layer=None,
+                 weight_quantize_layer=None,
+                 act_quantize_layer=None):
         """
         The constructor for ImperativeQuantizeInputs. 
 
@@ -300,9 +299,8 @@ def __init__(
         super(ImperativeQuantizeInputs, self).__init__()
 
         self._quantizable_layer_type = tuple(
-            utils.layer_name_map[layer]
-            if layer in utils.layer_name_map else layer
-            for layer in quantizable_layer_type)
+            utils.layer_name_map[layer] if layer in
+            utils.layer_name_map else layer for layer in quantizable_layer_type)
         for layer in self._quantizable_layer_type:
             assert not isinstance(layer, str) \
                 and layer in utils.fake_quant_input_layers, \
@@ -496,12 +494,11 @@ def save_quantized_model(self,
         model_filename = basename + INFER_MODEL_SUFFIX
         params_filename = basename + INFER_PARAMS_SUFFIX
 
-        [infer_program, feed_target_names, fetch_targets] = (
-            load_inference_model(
-                dirname=dirname,
-                executor=exe,
-                model_filename=model_filename,
-                params_filename=params_filename))
+        [infer_program, feed_target_names, fetch_targets
+         ] = (load_inference_model(dirname=dirname,
+                                   executor=exe,
+                                   model_filename=model_filename,
+                                   params_filename=params_filename))
 
         self._gather_scales(infer_program, scope, fetch_targets)
 
@@ -528,15 +525,14 @@ def save_quantized_model(self,
 
             clip_extra = True
 
-        save_inference_model(
-            dirname=dirname,
-            feeded_var_names=feed_target_names,
-            target_vars=fetch_targets,
-            executor=exe,
-            main_program=infer_program.clone(),
-            model_filename=model_filename,
-            params_filename=params_filename,
-            clip_extra=clip_extra)
+        save_inference_model(dirname=dirname,
+                             feeded_var_names=feed_target_names,
+                             target_vars=fetch_targets,
+                             executor=exe,
+                             main_program=infer_program.clone(),
+                             model_filename=model_filename,
+                             params_filename=params_filename,
+                             clip_extra=clip_extra)
 
         if is_dynamic_mode:
             paddle.disable_static()
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 758928f8dafe8..1ac6eec80d94f 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -68,7 +68,7 @@
     quant_layers.QuantizedConv2DTranspose
 ]
 
-# The weight format of these layers is Cin * Cout * H * W 
+# The weight format of these layers is Cin * Cout * H * W
 spec_channel_axis_layers = [paddle.nn.Conv2DTranspose, paddle.nn.Linear]
 
 weight_op_types = [
diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index d4c34efb7b900..5c16e0fe273c4 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -32,8 +32,9 @@
 
 __all__ = ['PostTrainingQuantization', 'WeightQuantization']
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 def _all_persistable_var_names(program):
@@ -84,7 +85,8 @@ def _apply_pass(scope,
         cpp_graph.set_not_owned('__param_scope__', scope)
     if attrs:
         assert attr_values and len(attrs) == len(
-            attr_values), "Different number of pass attributes and their values."
+            attr_values
+        ), "Different number of pass attributes and their values."
         for attr, value in zip(attrs, attr_values):
             ir_pass.set(attr, value)
     ir_pass.apply(cpp_graph)
@@ -440,18 +442,17 @@ def _adaround_apply(self):
             scale_dict = self._quantized_var_threshold
         else:
             scale_dict = self._quantized_threshold
-        run_adaround(
-            self._data_loader,
-            self._program,
-            self._fetch_list,
-            self._executor,
-            self._scope,
-            self._place,
-            self._quantized_op_pairs,
-            self._weight_op_pairs,
-            scale_dict,
-            num_iterations=self._batch_nums,
-            lr=self._learning_rate)
+        run_adaround(self._data_loader,
+                     self._program,
+                     self._fetch_list,
+                     self._executor,
+                     self._scope,
+                     self._place,
+                     self._quantized_op_pairs,
+                     self._weight_op_pairs,
+                     scale_dict,
+                     num_iterations=self._batch_nums,
+                     lr=self._learning_rate)
 
     def save_quantized_model(self,
                              save_model_path,
@@ -472,15 +473,14 @@ def save_quantized_model(self,
             None
         '''
         clip_extra = True if self._onnx_format else False
-        io.save_inference_model(
-            dirname=save_model_path,
-            model_filename=model_filename,
-            params_filename=params_filename,
-            feeded_var_names=self._feed_list,
-            target_vars=self._fetch_list,
-            executor=self._executor,
-            main_program=self._program,
-            clip_extra=clip_extra)
+        io.save_inference_model(dirname=save_model_path,
+                                model_filename=model_filename,
+                                params_filename=params_filename,
+                                feeded_var_names=self._feed_list,
+                                target_vars=self._fetch_list,
+                                executor=self._executor,
+                                main_program=self._program,
+                                clip_extra=clip_extra)
         _logger.info("The quantized model is saved in " + save_model_path)
 
     def _load_model_data(self):
@@ -502,17 +502,18 @@ def _load_model_data(self):
 
         if self._data_loader is not None:
             return
-        self._data_loader = io.DataLoader.from_generator(
-            feed_list=feed_vars, capacity=3 * self._batch_size, iterable=True)
+        self._data_loader = io.DataLoader.from_generator(feed_list=feed_vars,
+                                                         capacity=3 *
+                                                         self._batch_size,
+                                                         iterable=True)
         if self._sample_generator is not None:
-            self._data_loader.set_sample_generator(
-                self._sample_generator,
-                batch_size=self._batch_size,
-                drop_last=True,
-                places=self._place)
+            self._data_loader.set_sample_generator(self._sample_generator,
+                                                   batch_size=self._batch_size,
+                                                   drop_last=True,
+                                                   places=self._place)
         elif self._batch_generator is not None:
-            self._data_loader.set_batch_generator(
-                self._batch_generator, places=self._place)
+            self._data_loader.set_batch_generator(self._batch_generator,
+                                                  places=self._place)
 
     def _optimize_fp32_model(self):
         '''
@@ -563,12 +564,10 @@ def collect_var_name(var_name_list, persistable_var_names, op_type):
                                     " is not supported for quantization.")
                 # For quantized ops, sample inputs and outputs
                 if op_type in self._quantizable_op_type:
-                    collect_var_name(
-                        utils._get_op_input_var_names(op),
-                        persistable_var_names, op_type)
-                    collect_var_name(
-                        utils._get_op_output_var_names(op),
-                        persistable_var_names, op_type)
+                    collect_var_name(utils._get_op_input_var_names(op),
+                                     persistable_var_names, op_type)
+                    collect_var_name(utils._get_op_output_var_names(op),
+                                     persistable_var_names, op_type)
                     # collect quanted op output var name
                     for out_var_name in utils._get_op_output_var_names(op):
                         for in_var_name in utils._get_op_input_var_names(op):
@@ -577,9 +576,8 @@ def collect_var_name(var_name_list, persistable_var_names, op_type):
                                     in_var_name] = out_var_name
                 # For other op, only sample output scale
                 elif op_type in self._out_scale_op_list:
-                    collect_var_name(
-                        utils._get_op_output_var_names(op),
-                        persistable_var_names, op_type)
+                    collect_var_name(utils._get_op_output_var_names(op),
+                                     persistable_var_names, op_type)
 
     def _set_activation_persistable(self):
         '''
@@ -823,8 +821,9 @@ def _collect_activation_abs_min_max(self):
             min_value = float(np.min(var_tensor))
             max_value = float(np.max(var_tensor))
             if var_name not in self._sampling_act_abs_min_max:
-                self._sampling_act_abs_min_max[
-                    var_name] = [min_value, max_value]
+                self._sampling_act_abs_min_max[var_name] = [
+                    min_value, max_value
+                ]
             else:
                 if min_value < self._sampling_act_abs_min_max[var_name][0]:
                     self._sampling_act_abs_min_max[var_name][0] = min_value
@@ -839,8 +838,9 @@ def _init_sampling_act_histogram(self):
             if var_name not in self._sampling_act_histogram:
                 min_val = self._sampling_act_abs_min_max[var_name][0]
                 max_val = self._sampling_act_abs_min_max[var_name][1]
-                hist, hist_edeges = np.histogram(
-                    [], bins=self._histogram_bins, range=(min_val, max_val))
+                hist, hist_edeges = np.histogram([],
+                                                 bins=self._histogram_bins,
+                                                 range=(min_val, max_val))
                 self._sampling_act_histogram[var_name] = [hist, hist_edeges]
 
     def _calculate_kl_hist_threshold(self):
@@ -944,18 +944,11 @@ def _update_program(self):
         else:
             scale_dict = self._quantized_threshold
         for key, val in scale_dict.items():
-            utils.set_variable_data(
-                self._scope,
-                self._place,
-                key + ".scale",
-                np.array(
-                    [val], dtype=np.float32))
-            utils.set_variable_data(
-                self._scope,
-                self._place,
-                key + ".quant_dequant.scale",
-                np.array(
-                    [val], dtype=np.float32))
+            utils.set_variable_data(self._scope, self._place, key + ".scale",
+                                    np.array([val], dtype=np.float32))
+            utils.set_variable_data(self._scope, self._place,
+                                    key + ".quant_dequant.scale",
+                                    np.array([val], dtype=np.float32))
 
         if not self._onnx_format:
             # apply QuantizationFreezePass, and obtain the final quant model
@@ -1031,8 +1024,8 @@ def analysis_and_save_info(op_node, out_var_name):
 
         for block_id in range(len(self._program.blocks)):
             for op in self._program.blocks[block_id].ops:
-                if op.type in (
-                        self._quantizable_op_type + self._out_scale_op_list):
+                if op.type in (self._quantizable_op_type +
+                               self._out_scale_op_list):
                     out_var_names = utils._get_op_output_var_names(op)
                     for var_name in out_var_names:
                         analysis_and_save_info(op, var_name)
@@ -1168,10 +1161,11 @@ def quantize_weight_to_int(self,
 
         if generate_test_model:
             test_model_dir = os.path.join(save_model_dir, "test_model")
-            self._quantize_weight_to_int(
-                test_model_dir, save_model_filename, save_params_filename,
-                quantizable_op_type, weight_bits, weight_quantize_type, True,
-                threshold_rate)
+            self._quantize_weight_to_int(test_model_dir, save_model_filename,
+                                         save_params_filename,
+                                         quantizable_op_type, weight_bits,
+                                         weight_quantize_type, True,
+                                         threshold_rate)
 
     def convert_weight_to_fp16(self, save_model_dir):
         """
@@ -1209,16 +1203,17 @@ def convert_weight_to_fp16(self, save_model_dir):
             if self._params_filename is not None:
                 save_var_map[new_var.name] = new_var
             else:
-                save_file_path = os.path.join(
-                    os.path.normpath(save_model_dir), new_var.name)
-                save_block.append_op(
-                    type='save',
-                    inputs={'X': [new_var]},
-                    outputs={},
-                    attrs={
-                        'file_path': os.path.normpath(save_file_path),
-                        'save_as_fp16': True
-                    })
+                save_file_path = os.path.join(os.path.normpath(save_model_dir),
+                                              new_var.name)
+                save_block.append_op(type='save',
+                                     inputs={'X': [new_var]},
+                                     outputs={},
+                                     attrs={
+                                         'file_path':
+                                         os.path.normpath(save_file_path),
+                                         'save_as_fp16':
+                                         True
+                                     })
 
         if self._params_filename is not None:
             save_var_list = []
@@ -1230,14 +1225,15 @@ def convert_weight_to_fp16(self, save_model_dir):
                 name=unique_name.generate("saved_params"))
             saved_params_var.desc.set_persistable(True)
 
-            save_path = os.path.join(
-                os.path.normpath(save_model_dir), self._params_filename)
-            save_block.append_op(
-                type='save_combine',
-                inputs={'X': save_var_list},
-                outputs={'Y': saved_params_var},
-                attrs={'file_path': save_path,
-                       'save_as_fp16': True})
+            save_path = os.path.join(os.path.normpath(save_model_dir),
+                                     self._params_filename)
+            save_block.append_op(type='save_combine',
+                                 inputs={'X': save_var_list},
+                                 outputs={'Y': saved_params_var},
+                                 attrs={
+                                     'file_path': save_path,
+                                     'save_as_fp16': True
+                                 })
 
         save_program._sync_with_cpp()
         exe.run(save_program)
@@ -1286,14 +1282,13 @@ def _quantize_weight_to_int(self, save_model_dir, save_model_filename,
                         self._weight_channel_wise_abs_max_quantization(
                             scope, place, weight_bits, op, var_name, for_test)
 
-        io.save_inference_model(
-            dirname=save_model_dir,
-            feeded_var_names=feed_list,
-            target_vars=fetch_list,
-            executor=exe,
-            main_program=program,
-            model_filename=save_model_filename,
-            params_filename=save_params_filename)
+        io.save_inference_model(dirname=save_model_dir,
+                                feeded_var_names=feed_list,
+                                target_vars=fetch_list,
+                                executor=exe,
+                                main_program=program,
+                                model_filename=save_model_filename,
+                                params_filename=save_params_filename)
 
     def _weight_abs_max_quantization(self, scope, place, weight_bits,
                                      threshold_rate, op, var_name, for_test):
@@ -1332,8 +1327,9 @@ def _weight_abs_max_quantization(self, scope, place, weight_bits,
         op._set_attr(var_name + "_quant_scale", [scale])  # Save as list
         op._set_attr("with_quant_attr", True)
 
-    def _weight_channel_wise_abs_max_quantization(
-            self, scope, place, weight_bits, op, var_name, for_test):
+    def _weight_channel_wise_abs_max_quantization(self, scope, place,
+                                                  weight_bits, op, var_name,
+                                                  for_test):
         ''' 
         Use channel_wise_abs_max method to quantize weight.
         '''
@@ -1383,8 +1379,8 @@ def _conv_channel_wise_quantization(self, weight_data, quantize_range,
         and quantize the weights.
         '''
         scales = []
-        quantized_weight_data = np.zeros_like(
-            weight_data, dtype=save_weight_dtype)
+        quantized_weight_data = np.zeros_like(weight_data,
+                                              dtype=save_weight_dtype)
         channel_num = weight_data.shape[0]
         for i in range(channel_num):
             scale = np.max(np.abs(weight_data[i])) / quantize_range
@@ -1397,8 +1393,8 @@ def _conv_channel_wise_dequantization(self, quantized_weight_data, scales):
         '''
         For conv2d and depthwise_conv2d, dequantize the weights to fp32.
         '''
-        dequantized_weight_data = np.zeros_like(
-            quantized_weight_data, dtype=np.float32)
+        dequantized_weight_data = np.zeros_like(quantized_weight_data,
+                                                dtype=np.float32)
         for i in range(len(scales)):
             dequantized_weight_data[i] = \
                 (quantized_weight_data[i] * scales[i]).astype(np.float32)
@@ -1411,8 +1407,8 @@ def _mul_channel_wise_quantization(self, weight_data, quantize_range,
         and quantize the weights.
         '''
         scales = []
-        quantized_weight_data = np.zeros_like(
-            weight_data, dtype=save_weight_dtype)
+        quantized_weight_data = np.zeros_like(weight_data,
+                                              dtype=save_weight_dtype)
         channel_num = weight_data.shape[-1]
         for i in range(channel_num):
             scale = np.max(np.abs(weight_data[:, i])) / quantize_range
@@ -1425,8 +1421,8 @@ def _mul_channel_wise_dequantization(self, quantized_weight_data, scales):
         '''
         For mul, dequantize the weights to fp32.
         '''
-        dequantized_weight_data = np.zeros_like(
-            quantized_weight_data, dtype=np.float32)
+        dequantized_weight_data = np.zeros_like(quantized_weight_data,
+                                                dtype=np.float32)
         for i in range(len(scales)):
             dequantized_weight_data[:, i] = \
                 (quantized_weight_data[:, i] * scales[i]).astype(np.float32)
@@ -1434,8 +1430,9 @@ def _mul_channel_wise_dequantization(self, quantized_weight_data, scales):
 
     def _calculate_threshold(self, input, threshold_rate, histogram_bins=5000):
         input_abs = np.abs(input)
-        hist, hist_edeges = np.histogram(
-            input_abs, bins=histogram_bins, range=(0, np.max(input_abs)))
+        hist, hist_edeges = np.histogram(input_abs,
+                                         bins=histogram_bins,
+                                         range=(0, np.max(input_abs)))
         hist = hist / float(sum(hist))
         hist_sum = 0
         hist_index = 0
diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
index 348d914943521..220016bd653bc 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -131,9 +131,9 @@ def _is_any_of_op_types_in_graph(self, op_types, graph):
 
     def _is_any_of_op_types_quantized(self, op_types, graph):
         return self._is_any_of_op_types_in_graph(
-            op_types, graph) and (self._is_quantizing_all_ops() or
-                                  any(op_type in self._ops_to_quantize
-                                      for op_type in op_types))
+            op_types, graph) and (self._is_quantizing_all_ops()
+                                  or any(op_type in self._ops_to_quantize
+                                         for op_type in op_types))
 
     def _is_conv_quantized(self, graph):
         return self._is_any_of_op_types_quantized(self._conv_ops, graph)
@@ -188,8 +188,9 @@ def _gather_input_scales_from_fake(self, graph):
                 scale_name = op.input("InScale")[0]
                 output_name = op.output("Out")[0]
                 # Gather new weight scales after folding batchnorm in convolution
-                scale = np.array(1.0 / self._load_param(
-                    self._scope, scale_name)[0]).astype(np.float64)
+                scale = np.array(
+                    1.0 / self._load_param(self._scope, scale_name)[0]).astype(
+                        np.float64)
                 scale[scale == np.Inf] = 0.0
                 lod_tensor = self._convert_scale2tensor(scale)
                 use_unsigned_int = False
@@ -206,13 +207,13 @@ def _gather_weight_thresholds_from_fake(self, graph):
                     _max_range = np.array(op.op().attr("max_range")).astype(
                         np.float64)
                     self._weight_thresholds[input_name] = np.array(
-                        self._s8_max * self._s8_max /
-                        _max_range).astype(np.float64)
+                        self._s8_max * self._s8_max / _max_range).astype(
+                            np.float64)
                 else:
                     scale_name = op.input("Scales")[0]
                     self._weight_thresholds[input_name] = np.array(
-                        self._load_param(self._scope, scale_name)).astype(
-                            np.float64)
+                        self._load_param(self._scope,
+                                         scale_name)).astype(np.float64)
 
         return graph
 
@@ -228,12 +229,14 @@ def _gather_output_scales_from_attr(self, graph):
                 use_unsigned_int = False
                 for output_name in op.op().outputs():
                     for out_var_name in op.op().output(output_name):
-                        self._add_scale_for_vars(
-                            [out_var_name], use_unsigned_int, scale_lod_tensor)
+                        self._add_scale_for_vars([out_var_name],
+                                                 use_unsigned_int,
+                                                 scale_lod_tensor)
 
         return graph
 
     def _propagate_scales(self, graph):
+
         def _update_scale_op_in_scale(op, input, output):
             unsigned, tensor = self._var_quant_scales[output]
             scale = np.array(tensor) * op.op().attr("scale")
@@ -299,7 +302,8 @@ def _remove_fake_quantize(self, graph, op):
         fake_quant_out = graph._find_node_by_name(op.outputs,
                                                   op.output("Out")[0])
         fake_quant_out_scale = graph._find_node_by_name(
-            op.outputs, op.output("OutScale")[0])
+            op.outputs,
+            op.output("OutScale")[0])
 
         next_ops = fake_quant_out.outputs
         for next_op in next_ops:
@@ -332,6 +336,7 @@ def _swap_inputs(self, op, old_input, new_input):
                 ])
 
     def _dequantize_weights(self, graph):
+
         def _is_int8_weights(op_node, weight_name):
             weight_var_name = op_node.input(weight_name)[0]
             if self._scope.find_var(weight_var_name) is None:
@@ -371,8 +376,8 @@ def _restore_var(self, name, array):
 
     def _update_activations(self, graph):
         for op in graph.all_op_nodes():
-            if op.name() in self._conv_ops and not op.op().has_attr(
-                    "fuse_activation"):
+            if op.name(
+            ) in self._conv_ops and not op.op().has_attr("fuse_activation"):
                 activation = ""
                 if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"):
                     activation = "relu"
@@ -463,8 +468,9 @@ def _apply_pass(self, graph, pass_name, attrs=None, attr_values=None):
                 ir_pass.set(attr, value)
         ir_pass.apply(cpp_graph)
         if self._debug:
-            graph.draw('.', '{}_{}_{}'.format(self._pass_group, self._pass_idx,
-                                              pass_name), graph.all_op_nodes())
+            graph.draw(
+                '.', '{}_{}_{}'.format(self._pass_group, self._pass_idx,
+                                       pass_name), graph.all_op_nodes())
         self._remove_unused_var_nodes(graph)
         self._pass_idx += 1
         return graph
@@ -506,16 +512,17 @@ def _set_op_role_forward(self, graph):
         return graph
 
     def _compute_weight_scales(self, graph):
+
         def _compute_var_scales(ops, w_name, axis):
             for op in graph.all_op_nodes():
                 if op.op().type() in ops:
                     weight_var_name = op.input(w_name)[0]
                     weights = np.array(
                         self._load_param(self._scope, weight_var_name))
-                    scales = 1.0 / np.amax(
-                        np.abs(weights.reshape(weights.shape[0], -1)).astype(
+                    scales = 1.0 / np.amax(np.abs(
+                        weights.reshape(weights.shape[0], -1)).astype(
                             np.float64),
-                        axis=axis)
+                                           axis=axis)
                     scales[scales == np.Inf] = 0.0
 
                     lod_tensor = self._convert_scale2tensor(scales)
@@ -528,20 +535,18 @@ def _compute_single_gru_weight_scales(wx_var_name, wh_var_name):
             wh = np.array(self._load_param(self._scope, wh_var_name))
             OC = wh.shape[0]
             scale_ur = 1.0 / np.max(np.abs(
-                np.concatenate(
-                    [
-                        wx[:, :2 * OC], wh.flatten()[:2 * OC * OC].reshape(OC, 2
-                                                                           * OC)
-                    ],
-                    axis=0)),
+                np.concatenate([
+                    wx[:, :2 * OC],
+                    wh.flatten()[:2 * OC * OC].reshape(OC, 2 * OC)
+                ],
+                               axis=0)),
                                     axis=0)
             scale_o = 1.0 / np.max(np.abs(
-                np.concatenate(
-                    [
-                        wx[:, 2 * OC:], wh.flatten()[2 * OC * OC:].reshape(OC,
-                                                                           OC)
-                    ],
-                    axis=0)),
+                np.concatenate([
+                    wx[:, 2 * OC:],
+                    wh.flatten()[2 * OC * OC:].reshape(OC, OC)
+                ],
+                               axis=0)),
                                    axis=0)
 
             gru_weights_scale = np.concatenate([scale_ur,
@@ -569,8 +574,7 @@ def _compute_single_lstm_weight_scales(wx_var_name, wh_var_name):
             wh = np.array(self._load_param(self._scope, wh_var_name))
 
             lstm_weights_scale = 1.0 / np.max(
-                np.abs(np.concatenate(
-                    [wx[:, :], wh[:, :]], axis=0)), axis=0)
+                np.abs(np.concatenate([wx[:, :], wh[:, :]], axis=0)), axis=0)
             lstm_weights_scale = lstm_weights_scale.astype('float')
 
             return self._convert_scale2tensor(lstm_weights_scale)
@@ -606,6 +610,7 @@ def _find_avg_pooling_ids(self, graph):
         return self._op_ids_to_skip
 
     def _update_relu_output_scales(self, graph):
+
         def _set_unsigned_scale(graph, ops, op_out_name, predicate):
             '''
             Sets the type of an output scale of a passed op type(s) to 'unsigned int8' if the
@@ -615,8 +620,8 @@ def _set_unsigned_scale(graph, ops, op_out_name, predicate):
             for op in graph.all_op_nodes():
                 if op.name() in ops:
                     out_name = op.output(op_out_name)[0]
-                    if out_name in self._var_quant_scales and predicate(op.op(
-                    )):
+                    if out_name in self._var_quant_scales and predicate(
+                            op.op()):
                         is_unsigned, tensor = self._var_quant_scales[out_name]
                         if is_unsigned is False:
                             # If the variable is signed, it means that the scales for this var
@@ -651,15 +656,17 @@ def _quantize_fp32_graph(self, graph):
         graph = self._apply_pass(graph, 'scale_matmul_fuse_pass')
         graph = self._apply_pass(graph,
                                  'reshape_transpose_matmul_mkldnn_fuse_pass')
-        graph = self._apply_pass(graph,
-                                 'reshape_transpose_matmul_v2_mkldnn_fuse_pass')
+        graph = self._apply_pass(
+            graph, 'reshape_transpose_matmul_v2_mkldnn_fuse_pass')
         graph = self._apply_pass(
             graph, 'cpu_quantize_placement_pass',
             ['quantize_enabled_op_types', 'quantize_excluded_op_ids'],
-            [self._ops_to_quantize, self._find_avg_pooling_ids(graph)])
+            [self._ops_to_quantize,
+             self._find_avg_pooling_ids(graph)])
         graph = self._apply_pass(
             graph, 'cpu_quantize_pass', ['quant_var_scales', 'data_layout'],
-            [self._var_quant_scales, self._get_data_layout(graph)])
+            [self._var_quant_scales,
+             self._get_data_layout(graph)])
         graph = self._apply_pass(graph, 'cpu_quantize_squash_pass')
         graph = self._apply_pass(graph, 'int8_scale_calculation_mkldnn_pass')
         return graph
diff --git a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py
index 2ed06a48c29f7..d56aeb79f3f7c 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py
@@ -103,8 +103,8 @@ def apply(self, graph):
             if op_node.name() in self._dequantize_type:
                 input_name = op_node.input("X")[0]
                 scale_name = op_node.input("Scale")[0]
-                self._in_scale[input_name] = self._load_param(self._scope,
-                                                              scale_name)[0]
+                self._in_scale[input_name] = self._load_param(
+                    self._scope, scale_name)[0]
                 self._max_range[input_name] = op_node.op().attr("max_range")
                 self._new_output[input_name] = op_node.output("Out")[0]
 
@@ -113,8 +113,8 @@ def apply(self, graph):
                 attrs = op_node.op().attr_names()
                 input_name = op_node.input("X")[0]
                 scale_name = op_node.input("InScale")[0]
-                self._in_scale[input_name] = self._load_param(self._scope,
-                                                              scale_name)[0]
+                self._in_scale[input_name] = self._load_param(
+                    self._scope, scale_name)[0]
                 #  self._max_range[input_name] = op_node.op().attr("max_range")
                 self._new_output[input_name] = op_node.output("Out")[0]
 
@@ -142,8 +142,8 @@ def _transform_to_conv_mkldnn(self, graph, op_node):
         output_name = op_node.output("Output")[0]
         # Convert int8 range weights to fp32 range weights
         weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(
-            np.multiply(weight, self._s8_max), self._max_range[output_name])
+        w_fp32 = np.divide(np.multiply(weight, self._s8_max),
+                           self._max_range[output_name])
         w_fp32 = w_fp32.reshape(weight.shape)
         self._restore_var(weight_name, w_fp32)
         input_var_node = graph._find_node_by_name(op_node.inputs,
@@ -158,12 +158,13 @@ def _transform_to_conv_mkldnn(self, graph, op_node):
             for name in op_node.op().attr_names()
         }
 
-        conv_op_node = graph.create_op_node(
-            op_type='conv2d',
-            attrs=attrs,
-            inputs={'Input': input_var_node,
-                    'Filter': weight_var_node},
-            outputs={'Output': output_var_node})
+        conv_op_node = graph.create_op_node(op_type='conv2d',
+                                            attrs=attrs,
+                                            inputs={
+                                                'Input': input_var_node,
+                                                'Filter': weight_var_node
+                                            },
+                                            outputs={'Output': output_var_node})
 
         # Based on the Quant's scales to calculate the scales of MKL-DNN INT8 conv2d
         scale_in = self._s8_max / self._in_scale[output_name]
@@ -186,8 +187,8 @@ def _transform_to_mul_mkldnn(self, graph, op_node):
         output_name = op_node.output("Out")[0]
         # Convert int8 range weights to fp32 range weights
         weight = self._load_param(self._scope, weight_name)
-        w_fp32 = np.divide(
-            np.multiply(weight, self._s8_max), self._max_range[output_name])
+        w_fp32 = np.divide(np.multiply(weight, self._s8_max),
+                           self._max_range[output_name])
         w_fp32 = w_fp32.reshape(weight.shape)
         self._restore_var(weight_name, w_fp32)
         input_var_node = graph._find_node_by_name(op_node.inputs,
@@ -202,12 +203,13 @@ def _transform_to_mul_mkldnn(self, graph, op_node):
             for name in op_node.op().attr_names()
         }
 
-        mul_op_node = graph.create_op_node(
-            op_type='mul',
-            attrs=attrs,
-            inputs={'X': input_var_node,
-                    'Y': weight_var_node},
-            outputs={'Out': output_var_node})
+        mul_op_node = graph.create_op_node(op_type='mul',
+                                           attrs=attrs,
+                                           inputs={
+                                               'X': input_var_node,
+                                               'Y': weight_var_node
+                                           },
+                                           outputs={'Out': output_var_node})
 
         # Based on the Quant's scales to calculate MKL-DNN INT8 mul's scales
         scale_in = self._s8_max / self._in_scale[output_name]
@@ -233,7 +235,8 @@ def _transform_to_quantize_mkldnn(self, graph, op_node):
         output_var_node = graph._find_node_by_name(op_node.outputs,
                                                    op_node.output("Out")[0])
         scale_in = self._s8_max / self._load_param(
-            self._scope, op_node.input("InScale")[0])[0]
+            self._scope,
+            op_node.input("InScale")[0])[0]
         quant_op_node = graph.create_op_node(
             op_type='quantize',
             attrs={
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 17ddedd9d300a..eaf9bed3d6fe9 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -294,17 +294,18 @@ def _transform_forward(graph, op):
                         else False
 
                     # if var node is weight and weight_preprocess_func is not None,
-                    # will insert weight preprocess func 
+                    # will insert weight preprocess func
                     # to preorocess weight before quantization
-                    # if var node is activation and act_preprocess_func is not None, 
-                    # will insert activation preprocess func 
+                    # if var node is activation and act_preprocess_func is not None,
+                    # will insert activation preprocess func
                     # to preorocess activation before quantization
                     if is_weight and self._weight_preprocess_func is not None:
                         var_node = self._insert_func(
                             graph, self._weight_preprocess_func, var_node, op)
                     elif not is_weight and self._act_preprocess_func is not None:
-                        var_node = self._insert_func(
-                            graph, self._act_preprocess_func, var_node, op)
+                        var_node = self._insert_func(graph,
+                                                     self._act_preprocess_func,
+                                                     var_node, op)
 
                     # if var node is weight and weight_quantize_func is not None,
                     # will insert weight quantize func to quantize and dequantize weight
@@ -396,12 +397,8 @@ def _create_global_step(self, graph):
                     var_type=core.VarDesc.VarType.LOD_TENSOR,
                     shape=[1],
                     var_dtype=core.VarDesc.VarType.INT64)
-                _init_var_node(
-                    global_step_in,
-                    np.zeros(
-                        [1], dtype='int64'),
-                    self._scope,
-                    self._place)
+                _init_var_node(global_step_in, np.zeros([1], dtype='int64'),
+                               self._scope, self._place)
                 global_step_out = graph.create_var_node_from_desc(
                     global_step_in.var())
                 # The attribute of `op_role` is needed by ParallelExecutor.
@@ -450,12 +447,9 @@ def _insert_quant_abs_max_op(self, graph, var_node, name, quant_bits):
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_var_node,
-            np.zeros(
-                scale_var_node.shape(), dtype=data_type),
-            self._scope,
-            self._place)
+        _init_var_node(scale_var_node,
+                       np.zeros(scale_var_node.shape(), dtype=data_type),
+                       self._scope, self._place)
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
             attrs={
@@ -463,8 +457,10 @@ def _insert_quant_abs_max_op(self, graph, var_node, name, quant_bits):
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
             inputs={'X': var_node},
-            outputs={'Out': quant_var_node,
-                     'OutScale': scale_var_node})
+            outputs={
+                'Out': quant_var_node,
+                'OutScale': scale_var_node
+            })
         graph.link_to(var_node, quant_op_node)
         graph.link_to(quant_op_node, quant_var_node)
         graph.link_to(quant_op_node, scale_var_node)
@@ -489,12 +485,9 @@ def _insert_quant_range_abs_max_op(self, graph, var_node, name, quant_bits):
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_in_node,
-            np.array(
-                [_SCALE_DEFAULT_VALUE], dtype=data_type),
-            self._scope,
-            self._place)
+        _init_var_node(scale_in_node,
+                       np.array([_SCALE_DEFAULT_VALUE], dtype=data_type),
+                       self._scope, self._place)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         inputs = {'X': var_node, 'InScale': scale_in_node}
@@ -509,12 +502,9 @@ def _insert_quant_range_abs_max_op(self, graph, var_node, name, quant_bits):
                 var_dtype=var_node.dtype())
             data_type = 'float64' if var_node.dtype(
             ) == core.VarDesc.VarType.FP64 else 'float32'
-            _init_var_node(
-                scales_node,
-                np.zeros(
-                    [self._window_size], dtype=data_type),
-                self._scope,
-                self._place)
+            _init_var_node(scales_node,
+                           np.zeros([self._window_size], dtype=data_type),
+                           self._scope, self._place)
 
             inputs['Iter'] = self._global_step
             outputs['OutScales'] = scales_node
@@ -557,12 +547,9 @@ def _insert_quant_moving_average_abs_max_op(self, graph, var_node, name,
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_in_node,
-            np.array(
-                [_SCALE_DEFAULT_VALUE], dtype=data_type),
-            self._scope,
-            self._place)
+        _init_var_node(scale_in_node,
+                       np.array([_SCALE_DEFAULT_VALUE], dtype=data_type),
+                       self._scope, self._place)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         ins = {'X': var_node, 'InScale': scale_in_node}
@@ -575,27 +562,19 @@ def _insert_quant_moving_average_abs_max_op(self, graph, var_node, name,
                 shape=[1])
             data_type = 'float64' if var_node.dtype(
             ) == core.VarDesc.VarType.FP64 else 'float32'
-            _init_var_node(
-                state_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
+            _init_var_node(state_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
             accum_in_node = graph.create_persistable_node(
                 name=unique_name.generate('accum'),
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 var_dtype=var_node.dtype(),
                 shape=[1])
-            _init_var_node(
-                accum_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
-            state_out_node = graph.create_var_node_from_desc(state_in_node.var(
-            ))
-            accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
-            ))
+            _init_var_node(accum_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            state_out_node = graph.create_var_node_from_desc(
+                state_in_node.var())
+            accum_out_node = graph.create_var_node_from_desc(
+                accum_in_node.var())
 
             ins['InState'] = state_in_node
             ins['InAccum'] = accum_in_node
@@ -647,12 +626,9 @@ def _insert_channel_quant_op(self, graph, var_node, name, quant_bits,
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_var_node,
-            np.zeros(
-                scale_var_node.shape(), dtype=data_type),
-            self._scope,
-            self._place)
+        _init_var_node(scale_var_node,
+                       np.zeros(scale_var_node.shape(), dtype=data_type),
+                       self._scope, self._place)
         quant_op_node = graph.create_op_node(
             op_type='fake_channel_wise_quantize_abs_max',
             attrs={
@@ -662,8 +638,10 @@ def _insert_channel_quant_op(self, graph, var_node, name, quant_bits,
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
             inputs={'X': var_node},
-            outputs={'Out': quant_var_node,
-                     'OutScale': scale_var_node})
+            outputs={
+                'Out': quant_var_node,
+                'OutScale': scale_var_node
+            })
         graph.link_to(var_node, quant_op_node)
         graph.link_to(quant_op_node, quant_var_node)
         graph.link_to(quant_op_node, scale_var_node)
@@ -687,8 +665,10 @@ def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
                 'max_range': float(max_range),
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
-            inputs={'X': var_node,
-                    'Scale': scale_var_node},
+            inputs={
+                'X': var_node,
+                'Scale': scale_var_node
+            },
             outputs={'Out': dequant_var_node})
         graph.link_to(var_node, dequant_op_node)
         graph.link_to(scale_var_node, dequant_op_node)
@@ -714,8 +694,10 @@ def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
                 'quant_axis': quant_axis,
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
-            inputs={'X': var_node,
-                    'Scales': scale_var_nodes},
+            inputs={
+                'X': var_node,
+                'Scales': scale_var_nodes
+            },
             outputs={'Out': dequant_var_node})
         graph.link_to(var_node, dequant_op_node)
         for scale_n in scale_var_nodes:
@@ -803,10 +785,9 @@ def _insert_func(self, graph, func, var_node, op):
         startup_program = Program()
         with program_guard(tmp_program, startup_program):
             with unique_name.guard(var_node.name() + "_"):
-                in_node = data(
-                    var_node.name() + '_tmp_input',
-                    shape=var_node.shape(),
-                    dtype='float32')
+                in_node = data(var_node.name() + '_tmp_input',
+                               shape=var_node.shape(),
+                               dtype='float32')
                 out_node = func(in_node)
                 graph.out_node_mapping_table[out_node.name] = var_node.name()
                 # loss shape must be 1 when minimize
@@ -819,8 +800,8 @@ def _insert_func(self, graph, func, var_node, op):
         with scope_guard(self._scope):
             self._exe.run(startup_program)
 
-        tmp_graph = IrGraph(
-            core.Graph(tmp_program.desc), for_test=graph._for_test)
+        tmp_graph = IrGraph(core.Graph(tmp_program.desc),
+                            for_test=graph._for_test)
         in_node = tmp_graph._find_node_by_name(tmp_graph.all_var_nodes(),
                                                in_node.name)
         out_node = tmp_graph._find_node_by_name(tmp_graph.all_var_nodes(),
@@ -861,9 +842,11 @@ def _insert_func(self, graph, func, var_node, op):
             # find op's gradient op, such as conv2d_grad
             op_grad = op_out_grad.outputs[0]
             target_out_grad_node = graph._find_node_by_name(
-                graph.all_var_nodes(), target_out_node.name() + "@GRAD")
+                graph.all_var_nodes(),
+                target_out_node.name() + "@GRAD")
             in_node_grad = graph._find_node_by_name(
-                graph.all_var_nodes(), target_in_node.name() + "@GRAD")
+                graph.all_var_nodes(),
+                target_in_node.name() + "@GRAD")
             in_node_grad_op = in_node_grad.inputs
             # update op_grad's input
             graph.update_input_link(var_node, target_out_node, op_grad)
@@ -936,6 +919,7 @@ def _is_skip_quant(self, graph, op_node):
 
 
 class QuantizationFreezePass(object):
+
     def __init__(self,
                  scope,
                  place,
@@ -1008,7 +992,8 @@ def apply(self, graph):
                             input_arg_name]
                 if input_arg_name not in persistable_vars:
                     scale_v = graph._find_node_by_name(
-                        op_node.outputs, op_node.output('OutScale')[0])
+                        op_node.outputs,
+                        op_node.output('OutScale')[0])
                     self._quant_var_scale_map[input_arg_name] = scale_v
                 else:
                     # Obtain scale from OutScale var node
@@ -1063,8 +1048,8 @@ def apply(self, graph):
                 if self._weight_quantize_type == 'channel_wise_abs_max':
                     quant_axis = 1 if op_node.name() in \
                         utils._channelwise_quant_axis1_ops else 0
-                    self._insert_post_channel_dequant_op(graph, op_node,
-                                                         quant_axis)
+                    self._insert_post_channel_dequant_op(
+                        graph, op_node, quant_axis)
                 else:
                     self._insert_post_dequant_op(graph, op_node)
 
@@ -1119,7 +1104,8 @@ def _insert_post_channel_dequant_op(self, graph, op_node, quant_axis):
                              " more than one output." % (op_node.name()))
 
         output_var_node = graph._find_node_by_name(
-            op_node.outputs, op_node.output_arg_names()[0])
+            op_node.outputs,
+            op_node.output_arg_names()[0])
         weight_scale_node = graph.create_persistable_node(
             name=unique_name.generate('channel_scale'),
             var_type=core.VarDesc.VarType.LOD_TENSOR,
@@ -1127,9 +1113,8 @@ def _insert_post_channel_dequant_op(self, graph, op_node, quant_axis):
             var_dtype=output_var_node.dtype())
         data_type = 'float64' if output_var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(weight_scale_node,
-                       channel_scale.astype(data_type), self._scope,
-                       self._place)
+        _init_var_node(weight_scale_node, channel_scale.astype(data_type),
+                       self._scope, self._place)
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
             var_type=output_var_node.type(),
@@ -1192,7 +1177,8 @@ def _insert_post_dequant_op(self, graph, op_node):
                              " more than one output." % (op_node.name()))
 
         output_var_node = graph._find_node_by_name(
-            op_node.outputs, op_node.output_arg_names()[0])
+            op_node.outputs,
+            op_node.output_arg_names()[0])
         dequant_var_node = graph.create_var_node(
             name=self._dequantized_var_name(output_var_node.name()),
             var_type=output_var_node.type(),
@@ -1204,8 +1190,10 @@ def _insert_post_dequant_op(self, graph, op_node):
                 'max_range': float(max_range),
                 'op_role': core.op_proto_and_checker_maker.OpRole.Forward
             },
-            inputs={'X': output_var_node,
-                    'Scale': scale_var_node},
+            inputs={
+                'X': output_var_node,
+                'Scale': scale_var_node
+            },
             outputs={'Out': dequant_var_node})
         graph.link_to(output_var_node, dequant_op_node)
         graph.link_to(scale_var_node, dequant_op_node)
@@ -1264,6 +1252,7 @@ def _is_float(self, v):
 
 
 class ConvertToInt8Pass(object):
+
     def __init__(self, scope, place, quantizable_op_type=None):
         """
         Convert the weights into int8_t type.
@@ -1303,8 +1292,8 @@ def apply(self, graph):
                     name = var_node.name()
                     if name in persistable_vars:
                         if name not in input_map:
-                            int8_var_node = self._convert_to_int8(graph,
-                                                                  var_node)
+                            int8_var_node = self._convert_to_int8(
+                                graph, var_node)
                             input_map[name] = int8_var_node
                         graph.update_input_link(var_node, input_map[name],
                                                 op_node)
@@ -1352,6 +1341,7 @@ def _remove_unused_var_nodes(self, graph):
 
 
 class TransformForMobilePass(object):
+
     def __init__(self):
         """
         This pass is used to convert the frozen graph for paddle-mobile execution.
@@ -1394,6 +1384,7 @@ def apply(self, graph):
 
 
 class OutScaleForTrainingPass(object):
+
     def __init__(self, scope=None, place=None, moving_rate=0.9):
         """
         This pass is used for calculating output scales of some operators.
@@ -1441,12 +1432,8 @@ def apply(self, graph):
                     var_dtype=in_node.dtype())
                 data_type = 'float64' if in_node.dtype() \
                     == core.VarDesc.VarType.FP64 else 'float32'
-                _init_var_node(
-                    scale_node,
-                    np.ones(
-                        [1], dtype=data_type),
-                    self._scope,
-                    self._place)
+                _init_var_node(scale_node, np.ones([1], dtype=data_type),
+                               self._scope, self._place)
                 ins = {'X': in_node}
                 outs = {'OutScale': scale_node}
                 if not self._is_test:
@@ -1455,23 +1442,15 @@ def apply(self, graph):
                         var_type=core.VarDesc.VarType.LOD_TENSOR,
                         var_dtype=in_node.dtype(),
                         shape=[1])
-                    _init_var_node(
-                        state_in_node,
-                        np.ones(
-                            [1], dtype=data_type),
-                        self._scope,
-                        self._place)
+                    _init_var_node(state_in_node, np.ones([1], dtype=data_type),
+                                   self._scope, self._place)
                     accum_in_node = graph.create_persistable_node(
                         name=unique_name.generate('scale_accum@'),
                         var_type=core.VarDesc.VarType.LOD_TENSOR,
                         var_dtype=in_node.dtype(),
                         shape=[1])
-                    _init_var_node(
-                        accum_in_node,
-                        np.ones(
-                            [1], dtype=data_type),
-                        self._scope,
-                        self._place)
+                    _init_var_node(accum_in_node, np.ones([1], dtype=data_type),
+                                   self._scope, self._place)
                     state_out_node = graph.create_var_node_from_desc(
                         state_in_node.var())
                     accum_out_node = graph.create_var_node_from_desc(
@@ -1509,6 +1488,7 @@ def _scale_name(self, var_name):
 
 
 class OutScaleForInferencePass(object):
+
     def __init__(self, scope=None):
         """
         This pass is used for setting output scales of some operators.
@@ -1550,8 +1530,8 @@ def apply(self, graph):
                     # For compatibility, we save output threshold by two methods.
                     op_node.op()._set_attr("out_threshold", float(scale_value))
 
-                    argname_index = utils._get_output_name_index(op_node,
-                                                                 var_name)
+                    argname_index = utils._get_output_name_index(
+                        op_node, var_name)
                     assert argname_index is not None, \
                         var_name + " is not the output of the op"
                     op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \
@@ -1680,8 +1660,8 @@ def apply(self, graph):
             if op_node.name() in self._quantizable_grad_op_type:
                 for input_name in op_node.input_arg_names():
                     if input_name in dequantized_vars_map:
-                        in_node = graph._find_node_by_name(op_node.inputs,
-                                                           input_name)
+                        in_node = graph._find_node_by_name(
+                            op_node.inputs, input_name)
                         dequant_var_node = dequantized_vars_map[input_name]
                         graph.update_input_link(in_node, dequant_var_node,
                                                 op_node)
@@ -1693,11 +1673,11 @@ def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node,
                                                        quant_bits):
         """Insert fake_quantize_dequantize_moving_average_abs_max op.
         """
-        quant_var_node = graph.create_var_node(
-            name="{}.quant_dequant".format(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
+        quant_var_node = graph.create_var_node(name="{}.quant_dequant".format(
+            var_node.name()),
+                                               var_type=var_node.type(),
+                                               shape=var_node.shape(),
+                                               var_dtype=var_node.dtype())
         scale_in_node = graph.create_persistable_node(
             name="{}.quant_dequant.scale".format(var_node.name()),
             var_type=core.VarDesc.VarType.LOD_TENSOR,
@@ -1705,12 +1685,9 @@ def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node,
             var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
-        _init_var_node(
-            scale_in_node,
-            np.array(
-                [_SCALE_DEFAULT_VALUE], dtype=data_type),
-            self._scope,
-            self._place)
+        _init_var_node(scale_in_node,
+                       np.array([_SCALE_DEFAULT_VALUE], dtype=data_type),
+                       self._scope, self._place)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         ins = {'X': var_node, 'InScale': scale_in_node}
@@ -1723,27 +1700,19 @@ def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node,
                 shape=[1])
             data_type = 'float64' if var_node.dtype(
             ) == core.VarDesc.VarType.FP64 else 'float32'
-            _init_var_node(
-                state_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
+            _init_var_node(state_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
             accum_in_node = graph.create_persistable_node(
                 name=unique_name.generate('quant_dequant.accum'),
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 var_dtype=var_node.dtype(),
                 shape=[1])
-            _init_var_node(
-                accum_in_node,
-                np.ones(
-                    [1], dtype=data_type),
-                self._scope,
-                self._place)
-            state_out_node = graph.create_var_node_from_desc(state_in_node.var(
-            ))
-            accum_out_node = graph.create_var_node_from_desc(accum_in_node.var(
-            ))
+            _init_var_node(accum_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            state_out_node = graph.create_var_node_from_desc(
+                state_in_node.var())
+            accum_out_node = graph.create_var_node_from_desc(
+                accum_in_node.var())
 
             ins['InState'] = state_in_node
             ins['InAccum'] = accum_in_node
@@ -1810,11 +1779,11 @@ def __init__(self,
     def insert_quant_op(self, graph, var_node):
         assert var_node.is_var(), '{} is not a var'.format(var_node.name())
 
-        quant_var_node = graph.create_var_node(
-            name=self._quantized_var_name(var_node.name()),
-            var_type=var_node.type(),
-            shape=var_node.shape(),
-            var_dtype=var_node.dtype())
+        quant_var_node = graph.create_var_node(name=self._quantized_var_name(
+            var_node.name()),
+                                               var_type=var_node.type(),
+                                               shape=var_node.shape(),
+                                               var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
         if self.channel_wise:
@@ -1840,12 +1809,9 @@ def insert_quant_op(self, graph, var_node):
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 shape=scale_var_node.shape(),
                 var_dtype=core.VarDesc.VarType.INT32)
-            _init_var_node(
-                zero_point_node,
-                np.zeros(
-                    scale_var_node.shape(), dtype="int32"),
-                self._scope,
-                self._place)
+            _init_var_node(zero_point_node,
+                           np.zeros(scale_var_node.shape(), dtype="int32"),
+                           self._scope, self._place)
 
         inputs = {"X": var_node, "Scale": scale_var_node}
         if zero_point_node is not None:
@@ -1856,15 +1822,14 @@ def insert_quant_op(self, graph, var_node):
         if not self._is_test:
             attrs["is_test"] = self._is_test
             attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
-            scale_out_node = graph.create_var_node_from_desc(scale_var_node.var(
-            ))
+            scale_out_node = graph.create_var_node_from_desc(
+                scale_var_node.var())
             outputs["OutScale"] = scale_out_node
 
-        quant_op_node = graph.create_op_node(
-            op_type="quantize_linear",
-            attrs=attrs,
-            inputs=inputs,
-            outputs=outputs)
+        quant_op_node = graph.create_op_node(op_type="quantize_linear",
+                                             attrs=attrs,
+                                             inputs=inputs,
+                                             outputs=outputs)
 
         graph.link_to(var_node, quant_op_node)
         graph.link_to(scale_var_node, quant_op_node)
@@ -1891,12 +1856,9 @@ def insert_dequant_op(self, graph, var_node, scale_var_node):
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 shape=scale_var_node.shape(),
                 var_dtype=core.VarDesc.VarType.INT32)
-            _init_var_node(
-                zero_point_node,
-                np.zeros(
-                    scale_var_node.shape(), dtype="int32"),
-                self._scope,
-                self._place)
+            _init_var_node(zero_point_node,
+                           np.zeros(scale_var_node.shape(), dtype="int32"),
+                           self._scope, self._place)
 
         inputs = {"X": var_node, "Scale": scale_var_node}
         if zero_point_node is not None:
@@ -1906,11 +1868,10 @@ def insert_dequant_op(self, graph, var_node, scale_var_node):
         if not self._is_test:
             attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
 
-        quant_op_node = graph.create_op_node(
-            op_type="dequantize_linear",
-            attrs=attrs,
-            inputs=inputs,
-            outputs={"Y": dequant_var_node})
+        quant_op_node = graph.create_op_node(op_type="dequantize_linear",
+                                             attrs=attrs,
+                                             inputs=inputs,
+                                             outputs={"Y": dequant_var_node})
 
         graph.link_to(var_node, quant_op_node)
         graph.link_to(scale_var_node, quant_op_node)
@@ -2122,17 +2083,19 @@ def _transform_forward(self, graph, op):
                     else False
 
                 # if var node is weight and weight_preprocess_func is not None,
-                # will insert weight preprocess func 
+                # will insert weight preprocess func
                 # to preorocess weight before quantization
-                # if var node is activation and act_preprocess_func is not None, 
-                # will insert activation preprocess func 
+                # if var node is activation and act_preprocess_func is not None,
+                # will insert activation preprocess func
                 # to preorocess activation before quantization
                 if is_weight and self._weight_preprocess_func is not None:
-                    var_node = self._insert_func(
-                        graph, self._weight_preprocess_func, var_node, op)
+                    var_node = self._insert_func(graph,
+                                                 self._weight_preprocess_func,
+                                                 var_node, op)
                 elif not is_weight and self._act_preprocess_func is not None:
-                    var_node = self._insert_func(
-                        graph, self._act_preprocess_func, var_node, op)
+                    var_node = self._insert_func(graph,
+                                                 self._act_preprocess_func,
+                                                 var_node, op)
 
                 # if var node is weight and weight_quantize_func is not None,
                 # will insert weight quantize func to quantize and dequantize weight
@@ -2144,8 +2107,9 @@ def _transform_forward(self, graph, op):
                     processed_vars.append(name)
                     continue
                 elif not is_weight and self._act_quantize_func is not None:
-                    target_out_node = self._insert_func(
-                        graph, self._act_quantize_func, var_node, op)
+                    target_out_node = self._insert_func(graph,
+                                                        self._act_quantize_func,
+                                                        var_node, op)
                     processed_vars.append(name)
                     continue
 
@@ -2389,8 +2353,8 @@ def apply(self, graph):
             if op_node.name() in self._quantizable_grad_op_type:
                 for input_name in op_node.input_arg_names():
                     if input_name in dequantized_vars_map:
-                        in_node = graph._find_node_by_name(op_node.inputs,
-                                                           input_name)
+                        in_node = graph._find_node_by_name(
+                            op_node.inputs, input_name)
                         dequant_var_node = dequantized_vars_map[input_name]
                         graph.update_input_link(in_node, dequant_var_node,
                                                 op_node)
@@ -2466,43 +2430,42 @@ def _replace_op(self, graph, op):
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 shape=scale_node.shape(),
                 var_dtype=core.VarDesc.VarType.INT32)
-            _init_var_node(
-                zero_point_node,
-                np.zeros(
-                    scale_node.shape(), dtype="int32"),
-                self._scope,
-                self._place)
-
-        quant_var_node = graph.create_var_node(
-            name=self._quantized_var_name(x_node.name()),
-            var_type=x_node.type(),
-            shape=x_node.shape(),
-            var_dtype=x_node.dtype())
-        quant_op_node = graph.create_op_node(
-            op_type="quantize_linear",
-            attrs={"quant_axis": quant_axis,
-                   "bit_length": bit_length},
-            inputs={
-                "X": x_node,
-                "Scale": scale_node,
-                "ZeroPoint": zero_point_node
-            },
-            outputs={"Y": quant_var_node})
+            _init_var_node(zero_point_node,
+                           np.zeros(scale_node.shape(), dtype="int32"),
+                           self._scope, self._place)
+
+        quant_var_node = graph.create_var_node(name=self._quantized_var_name(
+            x_node.name()),
+                                               var_type=x_node.type(),
+                                               shape=x_node.shape(),
+                                               var_dtype=x_node.dtype())
+        quant_op_node = graph.create_op_node(op_type="quantize_linear",
+                                             attrs={
+                                                 "quant_axis": quant_axis,
+                                                 "bit_length": bit_length
+                                             },
+                                             inputs={
+                                                 "X": x_node,
+                                                 "Scale": scale_node,
+                                                 "ZeroPoint": zero_point_node
+                                             },
+                                             outputs={"Y": quant_var_node})
         graph.link_to(x_node, quant_op_node)
         graph.link_to(scale_node, quant_op_node)
         if zero_point_node is not None:
             graph.link_to(zero_point_node, quant_op_node)
         graph.link_to(quant_op_node, quant_var_node)
-        dequant_op_node = graph.create_op_node(
-            op_type="dequantize_linear",
-            attrs={"quant_axis": quant_axis,
-                   "bit_length": bit_length},
-            inputs={
-                "X": quant_var_node,
-                "Scale": scale_node,
-                "ZeroPoint": zero_point_node
-            },
-            outputs={"Y": out_node})
+        dequant_op_node = graph.create_op_node(op_type="dequantize_linear",
+                                               attrs={
+                                                   "quant_axis": quant_axis,
+                                                   "bit_length": bit_length
+                                               },
+                                               inputs={
+                                                   "X": quant_var_node,
+                                                   "Scale": scale_node,
+                                                   "ZeroPoint": zero_point_node
+                                               },
+                                               outputs={"Y": out_node})
         graph.link_to(quant_var_node, dequant_op_node)
         graph.link_to(scale_node, dequant_op_node)
         if zero_point_node is not None:
@@ -2581,7 +2544,8 @@ def apply(self, graph):
                 scale_node = graph._find_node_by_name(_op.inputs,
                                                       _op.input("Scale")[0])
                 zero_point_node = graph._find_node_by_name(
-                    _op.inputs, _op.input("ZeroPoint")[0])
+                    _op.inputs,
+                    _op.input("ZeroPoint")[0])
                 out_node = graph._find_node_by_name(_op.outputs,
                                                     _op.output("Y")[0])
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
index 32768fff089a3..892b027de531e 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py
@@ -24,8 +24,9 @@
 from ....initializer import Constant
 from ....log_helper import get_logger
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 def find_next_ops(block, var_name):
@@ -50,6 +51,7 @@ def load_variable_data(scope, var_name):
 
 
 class QuantizeTranspilerV2(object):
+
     def __init__(self,
                  weight_bits=8,
                  activation_bits=8,
@@ -197,15 +199,15 @@ def _transform_forward(self, block, op, var_rename_map, is_test):
                         else self._activation_quantize_type
 
                 if quant_type == "abs_max":
-                    new_var = self._insert_abs_max_fq_op(block, idx, in_var,
-                                                         quant_bits)
+                    new_var = self._insert_abs_max_fq_op(
+                        block, idx, in_var, quant_bits)
                 elif quant_type == "moving_average_abs_max":
-                    new_var = self._insert_ma_abs_max_fq_op(block, idx, in_var,
-                                                            quant_bits, is_test)
+                    new_var = self._insert_ma_abs_max_fq_op(
+                        block, idx, in_var, quant_bits, is_test)
                 elif quant_type == "channel_wise_abs_max":
                     ch_axis = 1 if op.type in self._out_ch_axis1_ops else 0
-                    new_var = self._insert_pc_abs_max_fq_op(block, idx, in_var,
-                                                            quant_bits, ch_axis)
+                    new_var = self._insert_pc_abs_max_fq_op(
+                        block, idx, in_var, quant_bits, ch_axis)
                 else:
                     _logger.error("Don't support the quant_type: %s" %
                                   quant_type)
@@ -264,67 +266,62 @@ def _insert_abs_max_fq_op(self, block, idx, in_var, quant_bits):
         """
         Inset abs max fake quant op.
         """
-        quant_dequant_var = block.create_var(
-            type=in_var.type,
-            name="{}.quant_dequant".format(in_var.name),
-            shape=in_var.shape,
-            dtype=in_var.dtype)
-        scale_var = self._helper.create_parameter(
-            attr=ParamAttr(
-                name="{}.quant_dequant.scale".format(in_var.name),
-                initializer=Constant(0.),
-                trainable=False),
-            shape=[1],
-            dtype=in_var.dtype)
+        quant_dequant_var = block.create_var(type=in_var.type,
+                                             name="{}.quant_dequant".format(
+                                                 in_var.name),
+                                             shape=in_var.shape,
+                                             dtype=in_var.dtype)
+        scale_var = self._helper.create_parameter(attr=ParamAttr(
+            name="{}.quant_dequant.scale".format(in_var.name),
+            initializer=Constant(0.),
+            trainable=False),
+                                                  shape=[1],
+                                                  dtype=in_var.dtype)
         scale_var.stop_gradient = True
 
         inputs = {'X': in_var}
         outputs = {'Out': quant_dequant_var, 'OutScale': scale_var}
         attrs = {'bit_length': quant_bits}
-        block._insert_op(
-            idx,
-            type='fake_quantize_dequantize_abs_max',
-            attrs=attrs,
-            inputs=inputs,
-            outputs=outputs)
+        block._insert_op(idx,
+                         type='fake_quantize_dequantize_abs_max',
+                         attrs=attrs,
+                         inputs=inputs,
+                         outputs=outputs)
         return quant_dequant_var
 
     def _insert_ma_abs_max_fq_op(self, block, idx, in_var, quant_bits, is_test):
         """
         Insert moving average abs max fake quant op.
         """
-        quant_dequant_var = block.create_var(
-            type=in_var.type,
-            name="{}.quant_dequant".format(in_var.name),
-            shape=in_var.shape,
-            dtype=in_var.dtype)
-
-        scale_var = self._helper.create_parameter(
-            attr=ParamAttr(
-                name="{}.quant_dequant.scale".format(in_var.name),
-                initializer=Constant(0.),
-                trainable=False),
-            shape=[1],
-            dtype=in_var.dtype)
+        quant_dequant_var = block.create_var(type=in_var.type,
+                                             name="{}.quant_dequant".format(
+                                                 in_var.name),
+                                             shape=in_var.shape,
+                                             dtype=in_var.dtype)
+
+        scale_var = self._helper.create_parameter(attr=ParamAttr(
+            name="{}.quant_dequant.scale".format(in_var.name),
+            initializer=Constant(0.),
+            trainable=False),
+                                                  shape=[1],
+                                                  dtype=in_var.dtype)
         scale_var.stop_gradient = True
 
         if not is_test:
-            state_var = self._helper.create_parameter(
-                attr=ParamAttr(
-                    name="{}.quant_dequant.state".format(in_var.name),
-                    initializer=Constant(0),
-                    trainable=False),
-                shape=[1],
-                dtype=in_var.dtype)
+            state_var = self._helper.create_parameter(attr=ParamAttr(
+                name="{}.quant_dequant.state".format(in_var.name),
+                initializer=Constant(0),
+                trainable=False),
+                                                      shape=[1],
+                                                      dtype=in_var.dtype)
             state_var.stop_gradient = True
 
-            accum_var = self._helper.create_parameter(
-                attr=ParamAttr(
-                    name="{}.quant_dequant.accum".format(in_var.name),
-                    initializer=Constant(0),
-                    trainable=False),
-                shape=[1],
-                dtype=in_var.dtype)
+            accum_var = self._helper.create_parameter(attr=ParamAttr(
+                name="{}.quant_dequant.accum".format(in_var.name),
+                initializer=Constant(0),
+                trainable=False),
+                                                      shape=[1],
+                                                      dtype=in_var.dtype)
             accum_var.stop_gradient = True
 
         attrs = {
@@ -340,42 +337,39 @@ def _insert_ma_abs_max_fq_op(self, block, idx, in_var, quant_bits, is_test):
             outputs['OutState'] = state_var
             outputs['OutAccum'] = accum_var
 
-        block._insert_op(
-            idx,
-            type='fake_quantize_dequantize_moving_average_abs_max',
-            attrs=attrs,
-            inputs=inputs,
-            outputs=outputs)
+        block._insert_op(idx,
+                         type='fake_quantize_dequantize_moving_average_abs_max',
+                         attrs=attrs,
+                         inputs=inputs,
+                         outputs=outputs)
         return quant_dequant_var
 
     def _insert_pc_abs_max_fq_op(self, block, idx, in_var, quant_bits, ch_axis):
         """
         Insert per channel abs max fake quant op.
         """
-        quant_dequant_var = block.create_var(
-            type=in_var.type,
-            name="{}.quant_dequant".format(in_var.name),
-            shape=in_var.shape,
-            dtype=in_var.dtype)
-
-        scale_var = self._helper.create_parameter(
-            attr=ParamAttr(
-                name="{}.quant_dequant.scale".format(in_var.name),
-                initializer=Constant(0.),
-                trainable=False),
-            shape=[in_var.shape[ch_axis]],
-            dtype=in_var.dtype)
+        quant_dequant_var = block.create_var(type=in_var.type,
+                                             name="{}.quant_dequant".format(
+                                                 in_var.name),
+                                             shape=in_var.shape,
+                                             dtype=in_var.dtype)
+
+        scale_var = self._helper.create_parameter(attr=ParamAttr(
+            name="{}.quant_dequant.scale".format(in_var.name),
+            initializer=Constant(0.),
+            trainable=False),
+                                                  shape=[in_var.shape[ch_axis]],
+                                                  dtype=in_var.dtype)
         scale_var.stop_gradient = True
 
         inputs = {'X': in_var}
         outputs = {'Out': quant_dequant_var, 'OutScale': scale_var}
         attrs = {'bit_length': quant_bits, 'quant_axis': ch_axis}
-        block._insert_op(
-            idx,
-            type='fake_channel_wise_quantize_dequantize_abs_max',
-            attrs=attrs,
-            inputs=inputs,
-            outputs=outputs)
+        block._insert_op(idx,
+                         type='fake_channel_wise_quantize_dequantize_abs_max',
+                         attrs=attrs,
+                         inputs=inputs,
+                         outputs=outputs)
         return quant_dequant_var
 
     def _insert_ma_abs_max_scale_op(self,
@@ -387,13 +381,12 @@ def _insert_ma_abs_max_scale_op(self,
         """
         Insert moving average abs max scale op.
         """
-        scale_var = self._helper.create_parameter(
-            attr=ParamAttr(
-                name="{}.outscale.scale".format(in_var.name),
-                initializer=Constant(0.),
-                trainable=False),
-            shape=[1],
-            dtype=in_var.dtype)
+        scale_var = self._helper.create_parameter(attr=ParamAttr(
+            name="{}.outscale.scale".format(in_var.name),
+            initializer=Constant(0.),
+            trainable=False),
+                                                  shape=[1],
+                                                  dtype=in_var.dtype)
         scale_var.stop_gradient = True
 
         attrs = {'moving_rate': self._moving_rate, 'is_test': is_test}
@@ -401,22 +394,20 @@ def _insert_ma_abs_max_scale_op(self,
         outputs = {'OutScale': scale_var}
 
         if not is_test:
-            state_var = self._helper.create_parameter(
-                attr=ParamAttr(
-                    name="{}.outscale.state".format(in_var.name),
-                    initializer=Constant(0),
-                    trainable=False),
-                shape=[1],
-                dtype=in_var.dtype)
+            state_var = self._helper.create_parameter(attr=ParamAttr(
+                name="{}.outscale.state".format(in_var.name),
+                initializer=Constant(0),
+                trainable=False),
+                                                      shape=[1],
+                                                      dtype=in_var.dtype)
             state_var.stop_gradient = True
 
-            accum_var = self._helper.create_parameter(
-                attr=ParamAttr(
-                    name="{}.outscale.accum".format(in_var.name),
-                    initializer=Constant(0),
-                    trainable=False),
-                shape=[1],
-                dtype=in_var.dtype)
+            accum_var = self._helper.create_parameter(attr=ParamAttr(
+                name="{}.outscale.accum".format(in_var.name),
+                initializer=Constant(0),
+                trainable=False),
+                                                      shape=[1],
+                                                      dtype=in_var.dtype)
             accum_var.stop_gradient = True
 
             inputs['InState'] = state_var
@@ -425,20 +416,18 @@ def _insert_ma_abs_max_scale_op(self,
             outputs['OutAccum'] = accum_var
 
         if has_out_var:
-            out_var = block.create_var(
-                type=in_var.type,
-                name="{}.tmp".format(in_var.name),
-                shape=in_var.shape,
-                dtype=in_var.dtype)
+            out_var = block.create_var(type=in_var.type,
+                                       name="{}.tmp".format(in_var.name),
+                                       shape=in_var.shape,
+                                       dtype=in_var.dtype)
 
             outputs['Out'] = out_var
 
-        block._insert_op(
-            idx,
-            type='moving_average_abs_max_scale',
-            attrs=attrs,
-            inputs=inputs,
-            outputs=outputs)
+        block._insert_op(idx,
+                         type='moving_average_abs_max_scale',
+                         attrs=attrs,
+                         inputs=inputs,
+                         outputs=outputs)
 
         if has_out_var:
             return out_var
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index 0140283b915ff..88dc33f581ad2 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -1,352 +1,523 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-function(_inference_analysis_python_api_int8_test target model_dir data_path filename use_mkldnn)
-    py_test(${target} SRCS ${filename}
-        ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-             FLAGS_use_mkldnn=${use_mkldnn}
-        ARGS --infer_model ${model_dir}/model
-             --infer_data ${data_path}
-             --int8_model_save_path int8_models/${target}
-             --warmup_batch_size ${WARMUP_BATCH_SIZE}
-             --batch_size 50)
+function(_inference_analysis_python_api_int8_test target model_dir data_path
+         filename use_mkldnn)
+  py_test(
+    ${target}
+    SRCS ${filename}
+         ENVS
+         CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+         FLAGS_use_mkldnn=${use_mkldnn}
+         ARGS
+         --infer_model
+         ${model_dir}/model
+         --infer_data
+         ${data_path}
+         --int8_model_save_path
+         int8_models/${target}
+         --warmup_batch_size
+         ${WARMUP_BATCH_SIZE}
+         --batch_size
+         50)
 endfunction()
 
-function(inference_analysis_python_api_int8_test target model_dir data_path filename)
-    _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path} ${filename} False)
+function(inference_analysis_python_api_int8_test target model_dir data_path
+         filename)
+  _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path}
+                                           ${filename} False)
 endfunction()
 
-function(inference_analysis_python_api_int8_test_custom_warmup_batch_size target model_dir data_dir filename warmup_batch_size)
-    set(WARMUP_BATCH_SIZE ${warmup_batch_size})
-    inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_dir} ${filename})
+function(inference_analysis_python_api_int8_test_custom_warmup_batch_size
+         target model_dir data_dir filename warmup_batch_size)
+  set(WARMUP_BATCH_SIZE ${warmup_batch_size})
+  inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_dir}
+                                          ${filename})
 endfunction()
 
-function(inference_analysis_python_api_int8_test_mkldnn target model_dir data_path filename)
-    _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path} ${filename} True)
+function(inference_analysis_python_api_int8_test_mkldnn target model_dir
+         data_path filename)
+  _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path}
+                                           ${filename} True)
 endfunction()
 
 function(download_data install_dir url data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-	    inference_download_and_uncompress(${install_dir} ${url} ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(${install_dir} ${url} ${data_file}
+                                      ${check_sum})
+  endif()
 endfunction()
 
 function(download_quant_data install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8
+                                      ${data_file} ${check_sum})
+  endif()
 endfunction()
 
 function(download_quant_model install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(
+      ${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
+  endif()
 endfunction()
 
 function(download_quant_fp32_model install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models/fp32 ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(
+      ${install_dir} ${INFERENCE_URL}/int8/QAT_models/fp32 ${data_file}
+      ${check_sum})
+  endif()
 endfunction()
 
 function(download_lstm_model install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/lstm ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/lstm
+                                      ${data_file} ${check_sum})
+  endif()
 endfunction()
 
-function(inference_quant_int8_image_classification_test target quant_model_dir dataset_path)
-    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant_int8_image_classification_comparison.py"
-            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 FLAGS_use_mkldnn=true
-            ARGS --quant_model ${quant_model_dir}
-                 --infer_data ${dataset_path}
-                 --batch_size 25
-                 --batch_num 2
-                 --acc_diff_threshold 0.1)
+function(inference_quant_int8_image_classification_test target quant_model_dir
+         dataset_path)
+  py_test(
+    ${target}
+    SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant_int8_image_classification_comparison.py"
+         ENVS
+         FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+         OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+         FLAGS_use_mkldnn=true
+         ARGS
+         --quant_model
+         ${quant_model_dir}
+         --infer_data
+         ${dataset_path}
+         --batch_size
+         25
+         --batch_num
+         2
+         --acc_diff_threshold
+         0.1)
 endfunction()
 
-
-# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 25 
-function(inference_quant2_int8_image_classification_test target quant_model_dir fp32_model_dir dataset_path)
-    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_image_classification_comparison.py"
-            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 FLAGS_use_mkldnn=true
-            ARGS --quant_model ${quant_model_dir}
-                 --fp32_model ${fp32_model_dir}
-                 --infer_data ${dataset_path}
-                 --batch_size 50
-                 --batch_num 2
-                 --acc_diff_threshold 0.1)
+# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 25
+function(inference_quant2_int8_image_classification_test target quant_model_dir
+         fp32_model_dir dataset_path)
+  py_test(
+    ${target}
+    SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_image_classification_comparison.py"
+         ENVS
+         FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+         OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+         FLAGS_use_mkldnn=true
+         ARGS
+         --quant_model
+         ${quant_model_dir}
+         --fp32_model
+         ${fp32_model_dir}
+         --infer_data
+         ${dataset_path}
+         --batch_size
+         50
+         --batch_num
+         2
+         --acc_diff_threshold
+         0.1)
 endfunction()
 
-# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 20 
-function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir dataset_path labels_path ops_to_quantize)
-    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_nlp_comparison.py"
-            ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
-                 FLAGS_use_mkldnn=true
-            ARGS --quant_model ${quant_model_dir}
-		 --fp32_model ${fp32_model_dir}
-                 --infer_data ${dataset_path}
-		 --labels ${labels_path}
-                 --batch_size 10
-                 --batch_num 2
-                 --acc_diff_threshold 0.1
-		 --ops_to_quantize ${ops_to_quantize})
+# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 20
+function(
+  inference_quant2_int8_nlp_test
+  target
+  quant_model_dir
+  fp32_model_dir
+  dataset_path
+  labels_path
+  ops_to_quantize)
+  py_test(
+    ${target}
+    SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_nlp_comparison.py"
+         ENVS
+         FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+         OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
+         FLAGS_use_mkldnn=true
+         ARGS
+         --quant_model
+         ${quant_model_dir}
+         --fp32_model
+         ${fp32_model_dir}
+         --infer_data
+         ${dataset_path}
+         --labels
+         ${labels_path}
+         --batch_size
+         10
+         --batch_num
+         2
+         --acc_diff_threshold
+         0.1
+         --ops_to_quantize
+         ${ops_to_quantize})
 endfunction()
 
-function(inference_quant2_int8_lstm_model_test target fp32_model quant_model dataset_path)
-    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_lstm_model.py"
-            ARGS --fp32_model ${fp32_model}
-                 --quant_model ${quant_model}
-                 --infer_data ${dataset_path}
-                 --num_threads 1
-                 --mkldnn_cache_capacity 100
-                 --warmup_iter 100
-                 --acc_diff_threshold 0.11)
+function(inference_quant2_int8_lstm_model_test target fp32_model quant_model
+         dataset_path)
+  py_test(
+    ${target}
+    SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_lstm_model.py"
+         ARGS
+         --fp32_model
+         ${fp32_model}
+         --quant_model
+         ${quant_model}
+         --infer_data
+         ${dataset_path}
+         --num_threads
+         1
+         --mkldnn_cache_capacity
+         100
+         --warmup_iter
+         100
+         --acc_diff_threshold
+         0.11)
 endfunction()
 
 function(download_quant_data install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-           inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8
+                                      ${data_file} ${check_sum})
+  endif()
 endfunction()
 
 function(download_quant_model install_dir data_file check_sum)
-    if (NOT EXISTS ${install_dir}/${data_file})
-           inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
-    endif()
+  if(NOT EXISTS ${install_dir}/${data_file})
+    inference_download_and_uncompress(
+      ${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
+  endif()
 endfunction()
 
 function(save_quant_ic_model_test target quant_model_dir int8_model_save_path)
-    py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
-            ARGS --quant_model_path ${quant_model_dir}
-	         --int8_model_save_path ${int8_model_save_path}
-		 --debug)
+  py_test(
+    ${target}
+    SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
+         ARGS
+         --quant_model_path
+         ${quant_model_dir}
+         --int8_model_save_path
+         ${int8_model_save_path}
+         --debug)
 endfunction()
 
-function(save_quant_nlp_model_test target quant_model_dir int8_model_save_path ops_to_quantize)
-    py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
-            ARGS --quant_model_path ${quant_model_dir}
-	         --int8_model_save_path ${int8_model_save_path}
-		 --ops_to_quantize ${ops_to_quantize})
+function(save_quant_nlp_model_test target quant_model_dir int8_model_save_path
+         ops_to_quantize)
+  py_test(
+    ${target}
+    SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
+         ARGS
+         --quant_model_path
+         ${quant_model_dir}
+         --int8_model_save_path
+         ${int8_model_save_path}
+         --ops_to_quantize
+         ${ops_to_quantize})
 endfunction()
 
-function(convert_model2dot_test target model_path save_graph_dir save_graph_name)
-    py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/convert_model2dot.py
-            ARGS --model_path ${model_path}
-	         --save_graph_dir ${save_graph_dir}
-	         --save_graph_name ${save_graph_name})
+function(convert_model2dot_test target model_path save_graph_dir
+         save_graph_name)
+  py_test(
+    ${target}
+    SRCS ${CMAKE_CURRENT_SOURCE_DIR}/convert_model2dot.py
+         ARGS
+         --model_path
+         ${model_path}
+         --save_graph_dir
+         ${save_graph_dir}
+         --save_graph_name
+         ${save_graph_name})
 endfunction()
 
 if(WIN32)
-	list(REMOVE_ITEM TEST_OPS test_light_nas)
-	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
-	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while)
-	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
-	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
-	list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model)
-	list(REMOVE_ITEM TEST_OPS test_imperative_ptq)
-	list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1)
-	list(REMOVE_ITEM TEST_OPS test_quantize_transpiler_v2)
-	list(REMOVE_ITEM TEST_OPS test_imperative_qat_amp)
+  list(REMOVE_ITEM TEST_OPS test_light_nas)
+  list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
+  list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while)
+  list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
+  list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
+  list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model)
+  list(REMOVE_ITEM TEST_OPS test_imperative_ptq)
+  list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1)
+  list(REMOVE_ITEM TEST_OPS test_quantize_transpiler_v2)
+  list(REMOVE_ITEM TEST_OPS test_imperative_qat_amp)
 endif()
 
 if(LINUX AND WITH_MKLDNN)
 
-	#### Image classification dataset: ImageNet (small)
-	# The dataset should already be downloaded for INT8v2 unit tests
-	set(IMAGENET_DATA_PATH "${INFERENCE_DEMO_INSTALL_DIR}/imagenet/data.bin")
-
-	#### INT8 image classification python api test
-	# Models should be already downloaded for INT8v2 unit tests
-
-	set(INT8_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
-
-	#### QUANT & INT8 comparison python api tests
-
-	set(QUANT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
-
-	### Quant1 for image classification
-
-	# Quant ResNet50
-	set(QUANT_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant")
-	set(QUANT_RESNET50_MODEL_ARCHIVE "ResNet50_qat_model.tar.gz")
-	download_quant_model(${QUANT_RESNET50_MODEL_DIR} ${QUANT_RESNET50_MODEL_ARCHIVE} ff89b934ab961c3a4a844193ece2e8a7)
-	inference_quant_int8_image_classification_test(test_quant_int8_resnet50_mkldnn ${QUANT_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant ResNet101
-	set(QUANT_RESNET101_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet101_quant")
-	set(QUANT_RESNET101_MODEL_ARCHIVE "ResNet101_qat_model.tar.gz")
-	download_quant_model(${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE} 95c6d01e3aeba31c13efb2ba8057d558)
-	# inference_quant_int8_image_classification_test(test_quant_int8_resnet101_mkldnn ${QUANT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant GoogleNet
-	set(QUANT_GOOGLENET_MODEL_DIR "${QUANT_INSTALL_DIR}/GoogleNet_quant")
-	set(QUANT_GOOGLENET_MODEL_ARCHIVE "GoogleNet_qat_model.tar.gz")
-	download_quant_model(${QUANT_GOOGLENET_MODEL_DIR} ${QUANT_GOOGLENET_MODEL_ARCHIVE} 1d4a7383baa63e7d1c423e8db2b791d5)
-	inference_quant_int8_image_classification_test(test_quant_int8_googlenet_mkldnn ${QUANT_GOOGLENET_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant MobileNetV1
-	set(QUANT_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant")
-	set(QUANT_MOBILENETV1_MODEL_ARCHIVE "MobileNetV1_qat_model.tar.gz")
-	download_quant_model(${QUANT_MOBILENETV1_MODEL_DIR} ${QUANT_MOBILENETV1_MODEL_ARCHIVE} 3b774d94a9fcbb604d09bdb731fc1162)
-	inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv1_mkldnn ${QUANT_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant MobileNetV2
-	set(QUANT_MOBILENETV2_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV2_quant")
-	set(QUANT_MOBILENETV2_MODEL_ARCHIVE "MobileNetV2_qat_model.tar.gz")
-	download_quant_model(${QUANT_MOBILENETV2_MODEL_DIR} ${QUANT_MOBILENETV2_MODEL_ARCHIVE} 758a99d9225d8b73e1a8765883f96cdd)
-	inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv2_mkldnn ${QUANT_MOBILENETV2_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant VGG16
-	set(QUANT_VGG16_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG16_quant")
-	set(QUANT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz")
-	download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE} c37e63ca82a102f47be266f8068b0b55)
-	# inference_quant_int8_image_classification_test(test_quant_int8_vgg16_mkldnn ${QUANT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant VGG19
-	set(QUANT_VGG19_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG19_quant")
-	set(QUANT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz")
-	download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE} 62bcd4b6c3ca2af67e8251d1c96ea18f)
-	# inference_quant_int8_image_classification_test(test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	### Quant2 for image classification
-
-	# Quant2 ResNet50 with input/output scales in `fake_quantize_moving_average_abs_max` operators,
-	# with weight scales in `fake_dequantize_max_abs` operators
-        set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2")
-	set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz")
-	download_quant_model(${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE} e87309457e8c462a579340607f064d66)
-	set(FP32_RESNET50_MODEL_DIR "${INT8_INSTALL_DIR}/resnet50")
-	inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_mkldnn ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
-	# with weight scales in `fake_dequantize_max_abs` operators
-	set(QUANT2_RESNET50_RANGE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_range")
-	set(QUANT2_RESNET50_RANGE_MODEL_ARCHIVE "ResNet50_qat_range.tar.gz")
-	download_quant_model(${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE} 2fdc8a139f041c0d270abec826b2d304)
-	inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_range_mkldnn ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
-	# with weight scales in `fake_channel_wise_dequantize_max_abs` operators
-	set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise")
-	set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz")
-	download_quant_model(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE} 887a1b1b0e9a4efd10f263a43764db26)
-	inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_channelwise_mkldnn ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# Quant2 MobileNetV1
-        set(QUANT2_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant2")
-	set(QUANT2_MOBILENETV1_MODEL_ARCHIVE "MobileNet_qat_perf.tar.gz")
-	download_quant_model(${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE} 7f626e453db2d56fed6c2538621ffacf)
-	set(FP32_MOBILENETV1_MODEL_DIR "${INT8_INSTALL_DIR}/mobilenetv1")
-	inference_quant2_int8_image_classification_test(test_quant2_int8_mobilenetv1_mkldnn ${QUANT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-	
-	### Quant2 for NLP
-
-	set(NLP_DATA_ARCHIVE "Ernie_dataset.tar.gz")
-	set(NLP_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_dataset")
-	set(NLP_DATA_PATH "${NLP_DATA_DIR}/Ernie_dataset/1.8w.bs1")
-	set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev")
-	download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE} e650ce0cbc1fadbed5cc2c01d4e734dc)
-
-	# Quant2 Ernie
-	set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz")
-	set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2")
-	download_quant_model(${QUANT2_ERNIE_MODEL_DIR} ${QUANT2_ERNIE_MODEL_ARCHIVE} f7cdf4720755ecf66efbc8044e9922d9)
-	set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz")
-	set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float")
-	download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE} 114f38804a3ef8c45e7259e68bbd838b)
-	set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add,slice")
-	inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE})
-
-	# Quant2 GRU
-	set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz")
-	set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2")
-	download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE} cf207f8076dcfb8b74d8b6bdddf9090c)
-	set(QUANT2_GRU_OPS_TO_QUANTIZE "multi_gru")
-
-	# Quant2 LSTM
-	set(QUANT2_LSTM_MODEL_ARCHIVE "lstm_quant.tar.gz")
-	set(QUANT2_LSTM_MODEL_DIR "${QUANT_INSTALL_DIR}/lstm_quant_test")
-	download_quant_model(${QUANT2_LSTM_MODEL_DIR} ${QUANT2_LSTM_MODEL_ARCHIVE} 40a693803b12ee9e251258f32559abcb)
-	set(QUANT2_LSTM_OPS_TO_QUANTIZE "fusion_lstm")
-
-	### Save FP32 model or INT8 model from Quant model
-        
-	set(QUANT2_INT8_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_int8")
-	save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_INT8_RESNET50_SAVE_PATH})
-
-	set(QUANT2_INT8_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8")
-	save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE})
-
-	set(QUANT2_INT8_GRU_SAVE_PATH "${QUANT_INSTALL_DIR}/GRU_quant2_int8")
-	save_quant_nlp_model_test(save_quant2_model_gru ${QUANT2_GRU_MODEL_DIR}/GRU_quant_acc ${QUANT2_INT8_GRU_SAVE_PATH} ${QUANT2_GRU_OPS_TO_QUANTIZE})
-
-	set(QUANT2_INT8_LSTM_SAVE_PATH "${QUANT_INSTALL_DIR}/lstm_quant2_int8")
-	save_quant_nlp_model_test(save_quant2_model_lstm ${QUANT2_LSTM_MODEL_DIR}/lstm_quant ${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_LSTM_OPS_TO_QUANTIZE})
-
-	# Convert Quant2 model to dot and pdf files 
-	set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file")
-	convert_model2dot_test(convert_model2dot_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_INT8_ERNIE_DOT_SAVE_PATH} "Ernie_quant2_int8")
-
-	### PTQ INT8
-
-	# PTQ int8 lstm model
-	set(LSTM_DATA_FILE "quant_lstm_input_data.tar.gz")
-	set(LSTM_URL "${INFERENCE_URL}/int8/unittest_model_data")
-	download_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_URL} ${LSTM_DATA_FILE} add84c754e9b792fea1fbd728d134ab7)
-	set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz")
-	download_lstm_model(${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE} eecd9f44d69a84acc1cf2235c4b8b743)
-	inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_LSTM_MODEL_DIR}/lstm_quant ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data)
+  #### Image classification dataset: ImageNet (small)
+  # The dataset should already be downloaded for INT8v2 unit tests
+  set(IMAGENET_DATA_PATH "${INFERENCE_DEMO_INSTALL_DIR}/imagenet/data.bin")
+
+  #### INT8 image classification python api test
+  # Models should be already downloaded for INT8v2 unit tests
+
+  set(INT8_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
+
+  #### QUANT & INT8 comparison python api tests
+
+  set(QUANT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
+
+  ### Quant1 for image classification
+
+  # Quant ResNet50
+  set(QUANT_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant")
+  set(QUANT_RESNET50_MODEL_ARCHIVE "ResNet50_qat_model.tar.gz")
+  download_quant_model(
+    ${QUANT_RESNET50_MODEL_DIR} ${QUANT_RESNET50_MODEL_ARCHIVE}
+    ff89b934ab961c3a4a844193ece2e8a7)
+  inference_quant_int8_image_classification_test(
+    test_quant_int8_resnet50_mkldnn ${QUANT_RESNET50_MODEL_DIR}/model
+    ${IMAGENET_DATA_PATH})
+
+  # Quant ResNet101
+  set(QUANT_RESNET101_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet101_quant")
+  set(QUANT_RESNET101_MODEL_ARCHIVE "ResNet101_qat_model.tar.gz")
+  download_quant_model(
+    ${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE}
+    95c6d01e3aeba31c13efb2ba8057d558)
+  # inference_quant_int8_image_classification_test(test_quant_int8_resnet101_mkldnn ${QUANT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+  # Quant GoogleNet
+  set(QUANT_GOOGLENET_MODEL_DIR "${QUANT_INSTALL_DIR}/GoogleNet_quant")
+  set(QUANT_GOOGLENET_MODEL_ARCHIVE "GoogleNet_qat_model.tar.gz")
+  download_quant_model(
+    ${QUANT_GOOGLENET_MODEL_DIR} ${QUANT_GOOGLENET_MODEL_ARCHIVE}
+    1d4a7383baa63e7d1c423e8db2b791d5)
+  inference_quant_int8_image_classification_test(
+    test_quant_int8_googlenet_mkldnn ${QUANT_GOOGLENET_MODEL_DIR}/model
+    ${IMAGENET_DATA_PATH})
+
+  # Quant MobileNetV1
+  set(QUANT_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant")
+  set(QUANT_MOBILENETV1_MODEL_ARCHIVE "MobileNetV1_qat_model.tar.gz")
+  download_quant_model(
+    ${QUANT_MOBILENETV1_MODEL_DIR} ${QUANT_MOBILENETV1_MODEL_ARCHIVE}
+    3b774d94a9fcbb604d09bdb731fc1162)
+  inference_quant_int8_image_classification_test(
+    test_quant_int8_mobilenetv1_mkldnn ${QUANT_MOBILENETV1_MODEL_DIR}/model
+    ${IMAGENET_DATA_PATH})
+
+  # Quant MobileNetV2
+  set(QUANT_MOBILENETV2_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV2_quant")
+  set(QUANT_MOBILENETV2_MODEL_ARCHIVE "MobileNetV2_qat_model.tar.gz")
+  download_quant_model(
+    ${QUANT_MOBILENETV2_MODEL_DIR} ${QUANT_MOBILENETV2_MODEL_ARCHIVE}
+    758a99d9225d8b73e1a8765883f96cdd)
+  inference_quant_int8_image_classification_test(
+    test_quant_int8_mobilenetv2_mkldnn ${QUANT_MOBILENETV2_MODEL_DIR}/model
+    ${IMAGENET_DATA_PATH})
+
+  # Quant VGG16
+  set(QUANT_VGG16_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG16_quant")
+  set(QUANT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz")
+  download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE}
+                       c37e63ca82a102f47be266f8068b0b55)
+  # inference_quant_int8_image_classification_test(test_quant_int8_vgg16_mkldnn ${QUANT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+  # Quant VGG19
+  set(QUANT_VGG19_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG19_quant")
+  set(QUANT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz")
+  download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE}
+                       62bcd4b6c3ca2af67e8251d1c96ea18f)
+  # inference_quant_int8_image_classification_test(test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+  ### Quant2 for image classification
+
+  # Quant2 ResNet50 with input/output scales in `fake_quantize_moving_average_abs_max` operators,
+  # with weight scales in `fake_dequantize_max_abs` operators
+  set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2")
+  set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz")
+  download_quant_model(
+    ${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE}
+    e87309457e8c462a579340607f064d66)
+  set(FP32_RESNET50_MODEL_DIR "${INT8_INSTALL_DIR}/resnet50")
+  inference_quant2_int8_image_classification_test(
+    test_quant2_int8_resnet50_mkldnn
+    ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float
+    ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+  # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
+  # with weight scales in `fake_dequantize_max_abs` operators
+  set(QUANT2_RESNET50_RANGE_MODEL_DIR
+      "${QUANT_INSTALL_DIR}/ResNet50_quant2_range")
+  set(QUANT2_RESNET50_RANGE_MODEL_ARCHIVE "ResNet50_qat_range.tar.gz")
+  download_quant_model(
+    ${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE}
+    2fdc8a139f041c0d270abec826b2d304)
+  inference_quant2_int8_image_classification_test(
+    test_quant2_int8_resnet50_range_mkldnn
+    ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range
+    ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+  # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
+  # with weight scales in `fake_channel_wise_dequantize_max_abs` operators
+  set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR
+      "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise")
+  set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE
+      "ResNet50_qat_channelwise.tar.gz")
+  download_quant_model(
+    ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}
+    ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}
+    887a1b1b0e9a4efd10f263a43764db26)
+  inference_quant2_int8_image_classification_test(
+    test_quant2_int8_resnet50_channelwise_mkldnn
+    ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise
+    ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+  # Quant2 MobileNetV1
+  set(QUANT2_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant2")
+  set(QUANT2_MOBILENETV1_MODEL_ARCHIVE "MobileNet_qat_perf.tar.gz")
+  download_quant_model(
+    ${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE}
+    7f626e453db2d56fed6c2538621ffacf)
+  set(FP32_MOBILENETV1_MODEL_DIR "${INT8_INSTALL_DIR}/mobilenetv1")
+  inference_quant2_int8_image_classification_test(
+    test_quant2_int8_mobilenetv1_mkldnn
+    ${QUANT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float
+    ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+  ### Quant2 for NLP
+
+  set(NLP_DATA_ARCHIVE "Ernie_dataset.tar.gz")
+  set(NLP_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_dataset")
+  set(NLP_DATA_PATH "${NLP_DATA_DIR}/Ernie_dataset/1.8w.bs1")
+  set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev")
+  download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE}
+                      e650ce0cbc1fadbed5cc2c01d4e734dc)
+
+  # Quant2 Ernie
+  set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz")
+  set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2")
+  download_quant_model(${QUANT2_ERNIE_MODEL_DIR} ${QUANT2_ERNIE_MODEL_ARCHIVE}
+                       f7cdf4720755ecf66efbc8044e9922d9)
+  set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz")
+  set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float")
+  download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE}
+                            114f38804a3ef8c45e7259e68bbd838b)
+  set(QUANT2_ERNIE_OPS_TO_QUANTIZE
+      "fc,reshape2,transpose2,matmul,elementwise_add,slice")
+  inference_quant2_int8_nlp_test(
+    test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float
+    ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH}
+    ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE})
+
+  # Quant2 GRU
+  set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz")
+  set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2")
+  download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE}
+                       cf207f8076dcfb8b74d8b6bdddf9090c)
+  set(QUANT2_GRU_OPS_TO_QUANTIZE "multi_gru")
+
+  # Quant2 LSTM
+  set(QUANT2_LSTM_MODEL_ARCHIVE "lstm_quant.tar.gz")
+  set(QUANT2_LSTM_MODEL_DIR "${QUANT_INSTALL_DIR}/lstm_quant_test")
+  download_quant_model(${QUANT2_LSTM_MODEL_DIR} ${QUANT2_LSTM_MODEL_ARCHIVE}
+                       40a693803b12ee9e251258f32559abcb)
+  set(QUANT2_LSTM_OPS_TO_QUANTIZE "fusion_lstm")
+
+  ### Save FP32 model or INT8 model from Quant model
+
+  set(QUANT2_INT8_RESNET50_SAVE_PATH
+      "${QUANT_INSTALL_DIR}/ResNet50_quant2_int8")
+  save_quant_ic_model_test(
+    save_quant2_model_resnet50
+    ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float
+    ${QUANT2_INT8_RESNET50_SAVE_PATH})
+
+  set(QUANT2_INT8_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8")
+  save_quant_nlp_model_test(
+    save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float
+    ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE})
+
+  set(QUANT2_INT8_GRU_SAVE_PATH "${QUANT_INSTALL_DIR}/GRU_quant2_int8")
+  save_quant_nlp_model_test(
+    save_quant2_model_gru ${QUANT2_GRU_MODEL_DIR}/GRU_quant_acc
+    ${QUANT2_INT8_GRU_SAVE_PATH} ${QUANT2_GRU_OPS_TO_QUANTIZE})
+
+  set(QUANT2_INT8_LSTM_SAVE_PATH "${QUANT_INSTALL_DIR}/lstm_quant2_int8")
+  save_quant_nlp_model_test(
+    save_quant2_model_lstm ${QUANT2_LSTM_MODEL_DIR}/lstm_quant
+    ${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_LSTM_OPS_TO_QUANTIZE})
+
+  # Convert Quant2 model to dot and pdf files
+  set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH
+      "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file")
+  convert_model2dot_test(
+    convert_model2dot_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float
+    ${QUANT2_INT8_ERNIE_DOT_SAVE_PATH} "Ernie_quant2_int8")
+
+  ### PTQ INT8
+
+  # PTQ int8 lstm model
+  set(LSTM_DATA_FILE "quant_lstm_input_data.tar.gz")
+  set(LSTM_URL "${INFERENCE_URL}/int8/unittest_model_data")
+  download_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_URL} ${LSTM_DATA_FILE}
+                add84c754e9b792fea1fbd728d134ab7)
+  set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz")
+  download_lstm_model(
+    ${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE}
+    eecd9f44d69a84acc1cf2235c4b8b743)
+  inference_quant2_int8_lstm_model_test(
+    test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model
+    ${QUANT2_LSTM_MODEL_DIR}/lstm_quant
+    ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data)
 
 endif()
 
-# Since the tests for Quant & INT8 comparison support only testing on Linux 
+# Since the tests for Quant & INT8 comparison support only testing on Linux
 # with MKL-DNN, we remove it here to not test it on other systems.
-list(REMOVE_ITEM TEST_OPS
-	test_mkldnn_int8_quantization_strategy
-	quant_int8_image_classification_comparison
-	quant_int8_nlp_comparison)
+list(REMOVE_ITEM TEST_OPS test_mkldnn_int8_quantization_strategy
+     quant_int8_image_classification_comparison quant_int8_nlp_comparison)
 
 #TODO(wanghaoshuang): Fix this unitest failed on GCC8.
-LIST(REMOVE_ITEM TEST_OPS test_auto_pruning)
-LIST(REMOVE_ITEM TEST_OPS test_filter_pruning)
-	
+list(REMOVE_ITEM TEST_OPS test_auto_pruning)
+list(REMOVE_ITEM TEST_OPS test_filter_pruning)
+
 # fix
 if(WIN32)
-    SET(SINGLE_CARD_TEST_OPS
-        test_user_defined_quantization
-        test_quantization_scale_pass
-        test_quantization_pass
-        test_moving_average_abs_max_scale_op
-        test_imperative_qat_channelwise
-        test_imperative_qat
-        test_imperative_out_scale
-        test_graph)
-    LIST(REMOVE_ITEM TEST_OPS ${SINGLE_CARD_TEST_OPS})
-    foreach(src ${SINGLE_CARD_TEST_OPS})
-        py_test(${src} SRCS ${src}.py ENVS CUDA_VISIBLE_DEVICES=0)
-    endforeach()
+  set(SINGLE_CARD_TEST_OPS
+      test_user_defined_quantization
+      test_quantization_scale_pass
+      test_quantization_pass
+      test_moving_average_abs_max_scale_op
+      test_imperative_qat_channelwise
+      test_imperative_qat
+      test_imperative_out_scale
+      test_graph)
+  list(REMOVE_ITEM TEST_OPS ${SINGLE_CARD_TEST_OPS})
+  foreach(src ${SINGLE_CARD_TEST_OPS})
+    py_test(${src} SRCS ${src}.py ENVS CUDA_VISIBLE_DEVICES=0)
+  endforeach()
 endif()
 
-
 foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
+  py_test(${src} SRCS ${src}.py)
 endforeach()
 
 # setting timeout value for old unittests
 if(NOT WIN32)
-    set_tests_properties(test_post_training_quantization_lstm_model PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
-    set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
-    set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_post_training_quantization_lstm_model
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_post_training_quantization_mobilenetv1
+                       PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
+  set_tests_properties(test_post_training_quantization_resnet50
+                       PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
+  set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT
+                                                                        120)
+  set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT
+                                                                        120)
+  set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT
+                                                                       120)
 endif()
 
 set_tests_properties(test_graph PROPERTIES TIMEOUT 120)
@@ -359,23 +530,30 @@ set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200)
 set_tests_properties(test_imperative_qat_user_defined PROPERTIES TIMEOUT 200)
 
 if(LINUX AND WITH_MKLDNN)
-    set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant2_int8_resnet50_channelwise_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant_int8_mobilenetv2_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant2_int8_resnet50_range_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(save_quant2_model_resnet50 PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant_int8_resnet50_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant2_int8_ernie_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant_int8_googlenet_mkldnn PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_quant2_int8_resnet50_mkldnn PROPERTIES TIMEOUT 120)
-	set_tests_properties(test_quant2_int8_lstm_mkldnn PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT
+                                                                      120)
+  set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant2_int8_resnet50_channelwise_mkldnn
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant_int8_mobilenetv2_mkldnn PROPERTIES TIMEOUT
+                                                                     120)
+  set_tests_properties(test_quant2_int8_resnet50_range_mkldnn PROPERTIES TIMEOUT
+                                                                         120)
+  set_tests_properties(save_quant2_model_resnet50 PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant_int8_resnet50_mkldnn PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT
+                                                                     120)
+  set_tests_properties(test_quant2_int8_ernie_mkldnn PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant_int8_googlenet_mkldnn PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant2_int8_resnet50_mkldnn PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_quant2_int8_lstm_mkldnn PROPERTIES TIMEOUT 120)
 endif()
 
 if(APPLE)
-    set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 300)
-	set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_imperative_skip_op PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT
+                                                                        300)
+  set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT
+                                                                        300)
+  set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_imperative_skip_op PROPERTIES TIMEOUT 300)
 endif()
diff --git a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py b/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py
index 0018d81dbf248..3573f53e22db7 100644
--- a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py
+++ b/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py
@@ -26,19 +26,20 @@
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--model_path', type=str, default='', help='A path to a model.')
-    parser.add_argument(
-        '--save_graph_dir',
-        type=str,
-        default='',
-        help='A path to save the graph.')
+    parser.add_argument('--model_path',
+                        type=str,
+                        default='',
+                        help='A path to a model.')
+    parser.add_argument('--save_graph_dir',
+                        type=str,
+                        default='',
+                        help='A path to save the graph.')
     parser.add_argument(
         '--save_graph_name',
         type=str,
         default='',
-        help='A name to save the graph. Default - name from model path will be used'
-    )
+        help=
+        'A name to save the graph. Default - name from model path will be used')
 
     test_args, args = parser.parse_known_args(namespace=unittest)
     return test_args, sys.argv[:1] + args
@@ -53,9 +54,9 @@ def generate_dot_for_model(model_path, save_graph_dir, save_graph_name):
             [inference_program, feed_target_names,
              fetch_targets] = fluid.io.load_inference_model(model_path, exe)
         else:
-            [inference_program, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(model_path, exe,
-                                                            'model', 'params')
+            [inference_program, feed_target_names, fetch_targets
+             ] = fluid.io.load_inference_model(model_path, exe, 'model',
+                                               'params')
         graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
         if not os.path.exists(save_graph_dir):
             os.makedirs(save_graph_dir)
@@ -64,8 +65,8 @@ def generate_dot_for_model(model_path, save_graph_dir, save_graph_name):
             save_graph_name = model_name
         graph.draw(save_graph_dir, save_graph_name, graph.all_op_nodes())
         print(
-            "Success! Generated dot and pdf files for {0} model, that can be found at {1} named {2}.\n".
-            format(model_name, save_graph_dir, save_graph_name))
+            "Success! Generated dot and pdf files for {0} model, that can be found at {1} named {2}.\n"
+            .format(model_name, save_graph_dir, save_graph_name))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py
index 466cc14eae098..36302aea187af 100644
--- a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py
+++ b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py
@@ -24,8 +24,9 @@
 
 from paddle.fluid.log_helper import get_logger
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 def fix_model_dict(model):
@@ -81,6 +82,7 @@ def train_lenet(lenet, reader, optimizer):
 
 
 class ImperativeLenet(fluid.dygraph.Layer):
+
     def __init__(self, num_classes=10):
         super(ImperativeLenet, self).__init__()
         conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
@@ -93,50 +95,36 @@ def __init__(self, num_classes=10):
         fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
         fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
         self.features = Sequential(
-            Conv2D(
-                in_channels=1,
-                out_channels=6,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                weight_attr=conv2d_w1_attr,
-                bias_attr=False),
-            BatchNorm2D(6),
-            ReLU(),
-            MaxPool2D(
-                kernel_size=2, stride=2),
-            Conv2D(
-                in_channels=6,
-                out_channels=16,
-                kernel_size=5,
-                stride=1,
-                padding=0,
-                weight_attr=conv2d_w2_attr,
-                bias_attr=conv2d_b2_attr),
-            BatchNorm2D(16),
-            PReLU(),
-            MaxPool2D(
-                kernel_size=2, stride=2))
+            Conv2D(in_channels=1,
+                   out_channels=6,
+                   kernel_size=3,
+                   stride=1,
+                   padding=1,
+                   weight_attr=conv2d_w1_attr,
+                   bias_attr=False), BatchNorm2D(6), ReLU(),
+            MaxPool2D(kernel_size=2, stride=2),
+            Conv2D(in_channels=6,
+                   out_channels=16,
+                   kernel_size=5,
+                   stride=1,
+                   padding=0,
+                   weight_attr=conv2d_w2_attr,
+                   bias_attr=conv2d_b2_attr), BatchNorm2D(16), PReLU(),
+            MaxPool2D(kernel_size=2, stride=2))
 
         self.fc = Sequential(
-            Linear(
-                in_features=400,
-                out_features=120,
-                weight_attr=fc_w1_attr,
-                bias_attr=fc_b1_attr),
-            LeakyReLU(),
-            Linear(
-                in_features=120,
-                out_features=84,
-                weight_attr=fc_w2_attr,
-                bias_attr=fc_b2_attr),
-            Sigmoid(),
-            Linear(
-                in_features=84,
-                out_features=num_classes,
-                weight_attr=fc_w3_attr,
-                bias_attr=fc_b3_attr),
-            Softmax())
+            Linear(in_features=400,
+                   out_features=120,
+                   weight_attr=fc_w1_attr,
+                   bias_attr=fc_b1_attr), LeakyReLU(),
+            Linear(in_features=120,
+                   out_features=84,
+                   weight_attr=fc_w2_attr,
+                   bias_attr=fc_b2_attr), Sigmoid(),
+            Linear(in_features=84,
+                   out_features=num_classes,
+                   weight_attr=fc_w3_attr,
+                   bias_attr=fc_b3_attr), Softmax())
         self.add = paddle.nn.quant.add()
         self.quant_stub = paddle.nn.quant.QuantStub()
 
@@ -151,6 +139,7 @@ def forward(self, inputs):
 
 
 class ImperativeLenetWithSkipQuant(fluid.dygraph.Layer):
+
     def __init__(self, num_classes=10):
         super(ImperativeLenetWithSkipQuant, self).__init__()
 
@@ -164,53 +153,48 @@ def __init__(self, num_classes=10):
         fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
         fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
         fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
-        self.conv2d_0 = Conv2D(
-            in_channels=1,
-            out_channels=6,
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            weight_attr=conv2d_w1_attr,
-            bias_attr=conv2d_b1_attr)
+        self.conv2d_0 = Conv2D(in_channels=1,
+                               out_channels=6,
+                               kernel_size=3,
+                               stride=1,
+                               padding=1,
+                               weight_attr=conv2d_w1_attr,
+                               bias_attr=conv2d_b1_attr)
         self.conv2d_0.skip_quant = True
 
         self.batch_norm_0 = BatchNorm2D(6)
         self.relu_0 = ReLU()
         self.pool2d_0 = MaxPool2D(kernel_size=2, stride=2)
-        self.conv2d_1 = Conv2D(
-            in_channels=6,
-            out_channels=16,
-            kernel_size=5,
-            stride=1,
-            padding=0,
-            weight_attr=conv2d_w2_attr,
-            bias_attr=conv2d_b2_attr)
+        self.conv2d_1 = Conv2D(in_channels=6,
+                               out_channels=16,
+                               kernel_size=5,
+                               stride=1,
+                               padding=0,
+                               weight_attr=conv2d_w2_attr,
+                               bias_attr=conv2d_b2_attr)
         self.conv2d_1.skip_quant = False
 
         self.batch_norm_1 = BatchNorm2D(16)
         self.relu6_0 = ReLU6()
         self.pool2d_1 = MaxPool2D(kernel_size=2, stride=2)
-        self.linear_0 = Linear(
-            in_features=400,
-            out_features=120,
-            weight_attr=fc_w1_attr,
-            bias_attr=fc_b1_attr)
+        self.linear_0 = Linear(in_features=400,
+                               out_features=120,
+                               weight_attr=fc_w1_attr,
+                               bias_attr=fc_b1_attr)
         self.linear_0.skip_quant = True
 
         self.leaky_relu_0 = LeakyReLU()
-        self.linear_1 = Linear(
-            in_features=120,
-            out_features=84,
-            weight_attr=fc_w2_attr,
-            bias_attr=fc_b2_attr)
+        self.linear_1 = Linear(in_features=120,
+                               out_features=84,
+                               weight_attr=fc_w2_attr,
+                               bias_attr=fc_b2_attr)
         self.linear_1.skip_quant = False
 
         self.sigmoid_0 = Sigmoid()
-        self.linear_2 = Linear(
-            in_features=84,
-            out_features=num_classes,
-            weight_attr=fc_w3_attr,
-            bias_attr=fc_b3_attr)
+        self.linear_2 = Linear(in_features=84,
+                               out_features=num_classes,
+                               weight_attr=fc_w3_attr,
+                               bias_attr=fc_b3_attr)
         self.linear_2.skip_quant = False
         self.softmax_0 = Softmax()
 
@@ -237,6 +221,7 @@ def forward(self, inputs):
 
 
 class ImperativeLinearBn(fluid.dygraph.Layer):
+
     def __init__(self):
         super(ImperativeLinearBn, self).__init__()
 
@@ -250,11 +235,10 @@ def __init__(self):
             name="bn_weight",
             initializer=paddle.nn.initializer.Constant(value=0.5))
 
-        self.linear = Linear(
-            in_features=10,
-            out_features=10,
-            weight_attr=fc_w_attr,
-            bias_attr=fc_b_attr)
+        self.linear = Linear(in_features=10,
+                             out_features=10,
+                             weight_attr=fc_w_attr,
+                             bias_attr=fc_b_attr)
         self.bn = BatchNorm1D(10, weight_attr=bn_w_attr)
 
     def forward(self, inputs):
@@ -265,6 +249,7 @@ def forward(self, inputs):
 
 
 class ImperativeLinearBn_hook(fluid.dygraph.Layer):
+
     def __init__(self):
         super(ImperativeLinearBn_hook, self).__init__()
 
@@ -272,8 +257,9 @@ def __init__(self):
             name="linear_weight",
             initializer=paddle.nn.initializer.Constant(value=0.5))
 
-        self.linear = Linear(
-            in_features=10, out_features=10, weight_attr=fc_w_attr)
+        self.linear = Linear(in_features=10,
+                             out_features=10,
+                             weight_attr=fc_w_attr)
         self.bn = BatchNorm1D(10)
 
         forward_pre = self.linear.register_forward_pre_hook(pre_hook)
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
index 188f14f0a6973..52ebf463cdd2d 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
@@ -43,27 +43,32 @@ def parse_args():
         default=0,
         help='Number of the first minibatches to skip in performance statistics.'
     )
-    parser.add_argument(
-        '--quant_model', type=str, default='', help='A path to a Quant model.')
-    parser.add_argument(
-        '--fp32_model', type=str, default='', help='A path to an FP32 model.')
+    parser.add_argument('--quant_model',
+                        type=str,
+                        default='',
+                        help='A path to a Quant model.')
+    parser.add_argument('--fp32_model',
+                        type=str,
+                        default='',
+                        help='A path to an FP32 model.')
     parser.add_argument('--infer_data', type=str, default='', help='Data file.')
     parser.add_argument(
         '--batch_num',
         type=int,
         default=0,
-        help='Number of batches to process. 0 or less means whole dataset. Default: 0.'
+        help=
+        'Number of batches to process. 0 or less means whole dataset. Default: 0.'
     )
-    parser.add_argument(
-        '--acc_diff_threshold',
-        type=float,
-        default=0.01,
-        help='Accepted accuracy difference threshold.')
+    parser.add_argument('--acc_diff_threshold',
+                        type=float,
+                        default=0.01,
+                        help='Accepted accuracy difference threshold.')
     parser.add_argument(
         '--ops_to_quantize',
         type=str,
         default='',
-        help='A comma separated list of operators to quantize. Only quantizable operators are taken into account. If the option is not used, an attempt to quantize all quantizable operators will be made.'
+        help=
+        'A comma separated list of operators to quantize. Only quantizable operators are taken into account. If the option is not used, an attempt to quantize all quantizable operators will be made.'
     )
     parser.add_argument(
         '--op_ids_to_skip',
@@ -74,12 +79,12 @@ def parse_args():
         '--targets',
         type=str,
         default='quant,int8,fp32',
-        help='A comma separated list of inference types to run ("int8", "fp32", "quant"). Default: "quant,int8,fp32"'
+        help=
+        'A comma separated list of inference types to run ("int8", "fp32", "quant"). Default: "quant,int8,fp32"'
     )
-    parser.add_argument(
-        '--debug',
-        action='store_true',
-        help='If used, the graph of Quant model is drawn.')
+    parser.add_argument('--debug',
+                        action='store_true',
+                        help='If used, the graph of Quant model is drawn.')
 
     test_args, args = parser.parse_known_args(namespace=unittest)
     return test_args, sys.argv[:1] + args
@@ -91,6 +96,7 @@ class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
     """
 
     def _reader_creator(self, data_file='data.bin'):
+
         def reader():
             with open(data_file, 'rb') as fp:
                 num = fp.read(8)
@@ -143,11 +149,14 @@ def _prepare_for_fp32_mkldnn(self, graph):
             name = op_node.name()
             if name in ['depthwise_conv2d']:
                 input_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Input")[0])
+                    op_node.inputs,
+                    op_node.input("Input")[0])
                 weight_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Filter")[0])
+                    op_node.inputs,
+                    op_node.input("Filter")[0])
                 output_var_node = graph._find_node_by_name(
-                    graph.all_var_nodes(), op_node.output("Output")[0])
+                    graph.all_var_nodes(),
+                    op_node.output("Output")[0])
                 attrs = {
                     name: op_node.op().attr(name)
                     for name in op_node.op().attr_names()
@@ -182,12 +191,12 @@ def _predict(self,
         inference_scope = fluid.executor.global_scope()
         with fluid.scope_guard(inference_scope):
             if os.path.exists(os.path.join(model_path, '__model__')):
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(model_path, exe)
             else:
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(
-                     model_path, exe, 'model', 'params')
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(model_path, exe, 'model',
+                                                   'params')
 
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
@@ -252,8 +261,8 @@ def _predict(self,
                     batch_time = (time.time() - start) * 1000  # in miliseconds
                     outputs.append(out[0])
                     # Calculate accuracy result
-                    batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
-                                                                      labels)
+                    batch_acc1, batch_acc5 = self._get_batch_accuracy(
+                        out[0], labels)
 
                 infer_accs1.append(batch_acc1)
                 infer_accs5.append(batch_acc5)
@@ -266,8 +275,8 @@ def _predict(self,
                 appx = ' (warm-up)' if iters <= skip_batch_num else ''
                 _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
                              'latency: {3:.4f} ms, fps: {4:.2f}'.format(
-                                 iters, batch_acc1, batch_acc5, batch_time /
-                                 batch_size, fps, appx))
+                                 iters, batch_acc1, batch_acc5,
+                                 batch_time / batch_size, fps, appx))
 
             # Postprocess benchmark data
             batch_latencies = batch_times[skip_batch_num:]
@@ -278,8 +287,8 @@ def _predict(self,
             infer_total_time = time.time() - infer_start_time
             acc1_avg = np.mean(infer_accs1)
             acc5_avg = np.mean(infer_accs5)
-            _logger.info('Total inference run time: {:.2f} s'.format(
-                infer_total_time))
+            _logger.info(
+                'Total inference run time: {:.2f} s'.format(infer_total_time))
 
             return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
 
@@ -359,17 +368,18 @@ def test_graph_transformation(self):
         _logger.info('Batch size: {}'.format(batch_size))
         _logger.info('Batch number: {}'.format(batch_num))
         _logger.info('Accuracy drop threshold: {}.'.format(acc_diff_threshold))
-        _logger.info('Quantized ops: {}.'.format(','.join(
-            self._quantized_ops) if self._quantized_ops else 'all quantizable'))
-        _logger.info('Op ids to skip quantization: {}.'.format(','.join(
-            map(str, self._op_ids_to_skip)) if test_case_args.op_ids_to_skip
-                                                               else 'none'))
+        _logger.info(
+            'Quantized ops: {}.'.format(','.join(self._quantized_ops) if self.
+                                        _quantized_ops else 'all quantizable'))
+        _logger.info('Op ids to skip quantization: {}.'.format(
+            ','.join(map(str, self._op_ids_to_skip)
+                     ) if test_case_args.op_ids_to_skip else 'none'))
         _logger.info('Targets: {}.'.format(','.join(self._targets)))
 
         if 'quant' in self._targets:
             _logger.info('--- Quant prediction start ---')
-            val_reader = paddle.batch(
-                self._reader_creator(data_path), batch_size=batch_size)
+            val_reader = paddle.batch(self._reader_creator(data_path),
+                                      batch_size=batch_size)
             quant_output, quant_acc1, quant_acc5, quant_fps, quant_lat = self._predict(
                 val_reader,
                 quant_model_path,
@@ -382,8 +392,8 @@ def test_graph_transformation(self):
 
         if 'int8' in self._targets:
             _logger.info('--- INT8 prediction start ---')
-            val_reader = paddle.batch(
-                self._reader_creator(data_path), batch_size=batch_size)
+            val_reader = paddle.batch(self._reader_creator(data_path),
+                                      batch_size=batch_size)
             int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
                 val_reader,
                 quant_model_path,
@@ -397,8 +407,8 @@ def test_graph_transformation(self):
         fp32_acc1 = fp32_acc5 = fp32_fps = fp32_lat = -1
         if 'fp32' in self._targets and fp32_model_path:
             _logger.info('--- FP32 prediction start ---')
-            val_reader = paddle.batch(
-                self._reader_creator(data_path), batch_size=batch_size)
+            val_reader = paddle.batch(self._reader_creator(data_path),
+                                      batch_size=batch_size)
             fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
                 val_reader,
                 fp32_model_path,
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
index 4f4a2ddd4ab41..0a9abe61e0e4b 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py
@@ -25,28 +25,35 @@
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--fp32_model', type=str, default='', help='A path to a FP32 model.')
-    parser.add_argument(
-        '--quant_model', type=str, default='', help='A path to a quant model.')
+    parser.add_argument('--fp32_model',
+                        type=str,
+                        default='',
+                        help='A path to a FP32 model.')
+    parser.add_argument('--quant_model',
+                        type=str,
+                        default='',
+                        help='A path to a quant model.')
     parser.add_argument('--infer_data', type=str, default='', help='Data file.')
     parser.add_argument(
         '--warmup_iter',
         type=int,
         default=1,
-        help='Number of the first iterations to skip in performance statistics.')
-    parser.add_argument(
-        '--acc_diff_threshold',
-        type=float,
-        default=0.01,
-        help='Accepted accuracy difference threshold.')
-    parser.add_argument(
-        '--num_threads', type=int, default=1, help='Number of threads.')
+        help='Number of the first iterations to skip in performance statistics.'
+    )
+    parser.add_argument('--acc_diff_threshold',
+                        type=float,
+                        default=0.01,
+                        help='Accepted accuracy difference threshold.')
+    parser.add_argument('--num_threads',
+                        type=int,
+                        default=1,
+                        help='Number of threads.')
     parser.add_argument(
         '--mkldnn_cache_capacity',
         type=int,
         default=0,
-        help='Mkldnn cache capacity. The default value in Python API is 15, which can slow down int8 models. Default 0 means unlimited cache.'
+        help=
+        'Mkldnn cache capacity. The default value in Python API is 15, which can slow down int8 models. Default 0 means unlimited cache.'
     )
 
     test_args, args = parser.parse_known_args(namespace=unittest)
@@ -54,6 +61,7 @@ def parse_args():
 
 
 class TestLstmModelPTQ(unittest.TestCase):
+
     def get_warmup_tensor(self, data_path, place):
         data = []
         with open(data_path, 'rb') as in_f:
@@ -67,11 +75,11 @@ def get_warmup_tensor(self, data_path, place):
                 seq_len = (alllen >> 16) & 0xFFFF
 
                 label = in_f.read(4 * label_len)
-                label = np.frombuffer(
-                    label, dtype=np.int32).reshape([len(label) // 4])
+                label = np.frombuffer(label,
+                                      dtype=np.int32).reshape([len(label) // 4])
                 feat = in_f.read(4 * seq_len * 8)
-                feat = np.frombuffer(
-                    feat, dtype=np.float32).reshape([len(feat) // 4 // 8, 8])
+                feat = np.frombuffer(feat, dtype=np.float32).reshape(
+                    [len(feat) // 4 // 8, 8])
                 lod_feat = [feat.shape[0]]
                 minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
 
@@ -189,22 +197,25 @@ def test_lstm_model(self):
         warmup_iter = test_case_args.warmup_iter
         acc_diff_threshold = test_case_args.acc_diff_threshold
 
-        (fp32_hx_acc, fp32_ctc_acc, fp32_fps) = self.run_program(
-            fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
-            warmup_iter, False, False)
+        (fp32_hx_acc, fp32_ctc_acc,
+         fp32_fps) = self.run_program(fp32_model, infer_data, num_threads,
+                                      mkldnn_cache_capacity, warmup_iter, False,
+                                      False)
 
-        (int8_hx_acc, int8_ctc_acc, int8_fps) = self.run_program(
-            fp32_model, infer_data, num_threads, mkldnn_cache_capacity,
-            warmup_iter, True, True)
+        (int8_hx_acc, int8_ctc_acc,
+         int8_fps) = self.run_program(fp32_model, infer_data, num_threads,
+                                      mkldnn_cache_capacity, warmup_iter, True,
+                                      True)
 
         quant_model_save_path = quant_model + "_int8"
         # transform model to quant2
         transform_and_save_int8_model(quant_model, quant_model_save_path,
                                       "fusion_lstm,concat")
 
-        (quant_hx_acc, quant_ctc_acc, quant_fps) = self.run_program(
-            quant_model_save_path, infer_data, num_threads,
-            mkldnn_cache_capacity, warmup_iter, True, False)
+        (quant_hx_acc, quant_ctc_acc,
+         quant_fps) = self.run_program(quant_model_save_path, infer_data,
+                                       num_threads, mkldnn_cache_capacity,
+                                       warmup_iter, True, False)
 
         print("FP32: fps {0}, hx_acc {1}, ctc_acc {2}".format(
             fp32_fps, fp32_hx_acc, fp32_ctc_acc))
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
index 12d1cfcc41d53..fecead6d6de03 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
@@ -41,33 +41,39 @@ def parse_args():
         default=0,
         help='Number of the first minibatches to skip in performance statistics.'
     )
-    parser.add_argument(
-        '--quant_model', type=str, default='', help='A path to a Quant model.')
+    parser.add_argument('--quant_model',
+                        type=str,
+                        default='',
+                        help='A path to a Quant model.')
     parser.add_argument(
         '--fp32_model',
         type=str,
         default='',
-        help='A path to an FP32 model. If empty, the Quant model will be used for FP32 inference.'
+        help=
+        'A path to an FP32 model. If empty, the Quant model will be used for FP32 inference.'
     )
     parser.add_argument('--infer_data', type=str, default='', help='Data file.')
-    parser.add_argument(
-        '--labels', type=str, default='', help='File with labels.')
+    parser.add_argument('--labels',
+                        type=str,
+                        default='',
+                        help='File with labels.')
     parser.add_argument(
         '--batch_num',
         type=int,
         default=0,
-        help='Number of batches to process. 0 or less means whole dataset. Default: 0.'
+        help=
+        'Number of batches to process. 0 or less means whole dataset. Default: 0.'
     )
-    parser.add_argument(
-        '--acc_diff_threshold',
-        type=float,
-        default=0.01,
-        help='Accepted accuracy difference threshold.')
+    parser.add_argument('--acc_diff_threshold',
+                        type=float,
+                        default=0.01,
+                        help='Accepted accuracy difference threshold.')
     parser.add_argument(
         '--ops_to_quantize',
         type=str,
         default='',
-        help='A comma separated list of operators to quantize. Only quantizable operators are taken into account. If the option is not used, an attempt to quantize all quantizable operators will be made.'
+        help=
+        'A comma separated list of operators to quantize. Only quantizable operators are taken into account. If the option is not used, an attempt to quantize all quantizable operators will be made.'
     )
     parser.add_argument(
         '--op_ids_to_skip',
@@ -78,12 +84,12 @@ def parse_args():
         '--targets',
         type=str,
         default='quant,int8,fp32',
-        help='A comma separated list of inference types to run ("int8", "fp32", "quant"). Default: "quant,int8,fp32"'
+        help=
+        'A comma separated list of inference types to run ("int8", "fp32", "quant"). Default: "quant,int8,fp32"'
     )
-    parser.add_argument(
-        '--debug',
-        action='store_true',
-        help='If used, the graph of Quant model is drawn.')
+    parser.add_argument('--debug',
+                        action='store_true',
+                        help='If used, the graph of Quant model is drawn.')
 
     test_args, args = parser.parse_known_args(namespace=unittest)
 
@@ -156,12 +162,12 @@ def _predict(self,
         inference_scope = fluid.executor.global_scope()
         with fluid.scope_guard(inference_scope):
             if os.path.exists(os.path.join(model_path, '__model__')):
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(model_path, exe)
             else:
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(
-                     model_path, exe, 'model', 'params')
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(model_path, exe, 'model',
+                                                   'params')
 
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
@@ -229,8 +235,8 @@ def _predict(self,
             ppses = ppses[skip_batch_num:]
             pps_avg = np.average(ppses)
             acc_avg = float(np.sum(total_correct)) / float(total_samples)
-            _logger.info('Total inference run time: {:.2f} s'.format(
-                infer_total_time))
+            _logger.info(
+                'Total inference run time: {:.2f} s'.format(infer_total_time))
 
             return acc_avg, pps_avg, latency_avg
 
@@ -310,56 +316,54 @@ def test_graph_transformation(self):
         _logger.info('Batch size: {}'.format(batch_size))
         _logger.info('Batch number: {}'.format(batch_num))
         _logger.info('Accuracy drop threshold: {}.'.format(acc_diff_threshold))
-        _logger.info('Quantized ops: {}.'.format(','.join(
-            self._quantized_ops) if self._quantized_ops else 'all quantizable'))
-        _logger.info('Op ids to skip quantization: {}.'.format(','.join(
-            map(str, self._op_ids_to_skip)) if test_case_args.op_ids_to_skip
-                                                               else 'none'))
+        _logger.info(
+            'Quantized ops: {}.'.format(','.join(self._quantized_ops) if self.
+                                        _quantized_ops else 'all quantizable'))
+        _logger.info('Op ids to skip quantization: {}.'.format(
+            ','.join(map(str, self._op_ids_to_skip)
+                     ) if test_case_args.op_ids_to_skip else 'none'))
         _logger.info('Targets: {}.'.format(','.join(self._targets)))
 
         if 'quant' in self._targets:
             _logger.info('--- Quant prediction start ---')
-            val_reader = paddle.batch(
-                self._reader_creator(data_path, labels_path),
-                batch_size=batch_size)
-            quant_acc, quant_pps, quant_lat = self._predict(
-                val_reader,
-                quant_model_path,
-                batch_size,
-                batch_num,
-                skip_batch_num,
-                target='quant')
+            val_reader = paddle.batch(self._reader_creator(
+                data_path, labels_path),
+                                      batch_size=batch_size)
+            quant_acc, quant_pps, quant_lat = self._predict(val_reader,
+                                                            quant_model_path,
+                                                            batch_size,
+                                                            batch_num,
+                                                            skip_batch_num,
+                                                            target='quant')
             self._print_performance('Quant', quant_pps, quant_lat)
             self._print_accuracy('Quant', quant_acc)
 
         if 'int8' in self._targets:
             _logger.info('--- INT8 prediction start ---')
-            val_reader = paddle.batch(
-                self._reader_creator(data_path, labels_path),
-                batch_size=batch_size)
-            int8_acc, int8_pps, int8_lat = self._predict(
-                val_reader,
-                quant_model_path,
-                batch_size,
-                batch_num,
-                skip_batch_num,
-                target='int8')
+            val_reader = paddle.batch(self._reader_creator(
+                data_path, labels_path),
+                                      batch_size=batch_size)
+            int8_acc, int8_pps, int8_lat = self._predict(val_reader,
+                                                         quant_model_path,
+                                                         batch_size,
+                                                         batch_num,
+                                                         skip_batch_num,
+                                                         target='int8')
             self._print_performance('INT8', int8_pps, int8_lat)
             self._print_accuracy('INT8', int8_acc)
 
         fp32_acc = fp32_pps = fp32_lat = -1
         if 'fp32' in self._targets and fp32_model_path:
             _logger.info('--- FP32 prediction start ---')
-            val_reader = paddle.batch(
-                self._reader_creator(data_path, labels_path),
-                batch_size=batch_size)
-            fp32_acc, fp32_pps, fp32_lat = self._predict(
-                val_reader,
-                fp32_model_path,
-                batch_size,
-                batch_num,
-                skip_batch_num,
-                target='fp32')
+            val_reader = paddle.batch(self._reader_creator(
+                data_path, labels_path),
+                                      batch_size=batch_size)
+            fp32_acc, fp32_pps, fp32_lat = self._predict(val_reader,
+                                                         fp32_model_path,
+                                                         batch_size,
+                                                         batch_num,
+                                                         skip_batch_num,
+                                                         target='fp32')
             self._print_performance('FP32', fp32_pps, fp32_lat)
             self._print_accuracy('FP32', fp32_acc)
 
diff --git a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
index fac41ce8a22df..41ddfa513a0cc 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
@@ -43,24 +43,25 @@ def parse_args():
         default=0,
         help='Number of the first minibatches to skip in performance statistics.'
     )
-    parser.add_argument(
-        '--debug',
-        action='store_true',
-        help='If used, the graph of Quant model is drawn.')
-    parser.add_argument(
-        '--quant_model', type=str, default='', help='A path to a Quant model.')
+    parser.add_argument('--debug',
+                        action='store_true',
+                        help='If used, the graph of Quant model is drawn.')
+    parser.add_argument('--quant_model',
+                        type=str,
+                        default='',
+                        help='A path to a Quant model.')
     parser.add_argument('--infer_data', type=str, default='', help='Data file.')
     parser.add_argument(
         '--batch_num',
         type=int,
         default=0,
-        help='Number of batches to process. 0 or less means whole dataset. Default: 0.'
+        help=
+        'Number of batches to process. 0 or less means whole dataset. Default: 0.'
     )
-    parser.add_argument(
-        '--acc_diff_threshold',
-        type=float,
-        default=0.01,
-        help='Accepted accuracy difference threshold.')
+    parser.add_argument('--acc_diff_threshold',
+                        type=float,
+                        default=0.01,
+                        help='Accepted accuracy difference threshold.')
 
     test_args, args = parser.parse_known_args(namespace=unittest)
     return test_args, sys.argv[:1] + args
@@ -72,6 +73,7 @@ class QuantInt8ImageClassificationComparisonTest(unittest.TestCase):
     """
 
     def _reader_creator(self, data_file='data.bin'):
+
         def reader():
             with open(data_file, 'rb') as fp:
                 num = fp.read(8)
@@ -124,11 +126,14 @@ def _prepare_for_fp32_mkldnn(self, graph):
             name = op_node.name()
             if name in ['depthwise_conv2d']:
                 input_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Input")[0])
+                    op_node.inputs,
+                    op_node.input("Input")[0])
                 weight_var_node = graph._find_node_by_name(
-                    op_node.inputs, op_node.input("Filter")[0])
+                    op_node.inputs,
+                    op_node.input("Filter")[0])
                 output_var_node = graph._find_node_by_name(
-                    graph.all_var_nodes(), op_node.output("Output")[0])
+                    graph.all_var_nodes(),
+                    op_node.output("Output")[0])
                 attrs = {
                     name: op_node.op().attr(name)
                     for name in op_node.op().attr_names()
@@ -162,19 +167,19 @@ def _predict(self,
         inference_scope = fluid.executor.global_scope()
         with fluid.scope_guard(inference_scope):
             if os.path.exists(os.path.join(model_path, '__model__')):
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(model_path, exe)
             else:
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(
-                     model_path, exe, 'model', 'params')
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(model_path, exe, 'model',
+                                                   'params')
 
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
                 graph.draw('.', 'quant_orig', graph.all_op_nodes())
             if (transform_to_int8):
-                mkldnn_int8_pass = QuantInt8MkldnnPass(
-                    _scope=inference_scope, _place=place)
+                mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=inference_scope,
+                                                       _place=place)
                 graph = mkldnn_int8_pass.apply(graph)
             else:
                 graph = self._prepare_for_fp32_mkldnn(graph)
@@ -206,8 +211,8 @@ def _predict(self,
                               fetch_list=fetch_targets)
                 batch_time = (time.time() - start) * 1000  # in miliseconds
                 outputs.append(out[0])
-                batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
-                                                                  labels)
+                batch_acc1, batch_acc5 = self._get_batch_accuracy(
+                    out[0], labels)
                 infer_accs1.append(batch_acc1)
                 infer_accs5.append(batch_acc5)
                 samples = len(data)
@@ -219,8 +224,8 @@ def _predict(self,
                 appx = ' (warm-up)' if iters <= skip_batch_num else ''
                 _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
                              'latency: {3:.4f} ms, fps: {4:.2f}'.format(
-                                 iters, batch_acc1, batch_acc5, batch_time /
-                                 batch_size, fps, appx))
+                                 iters, batch_acc1, batch_acc5,
+                                 batch_time / batch_size, fps, appx))
 
             # Postprocess benchmark data
             batch_latencies = batch_times[skip_batch_num:]
@@ -231,8 +236,8 @@ def _predict(self,
             infer_total_time = time.time() - infer_start_time
             acc1_avg = np.mean(infer_accs1)
             acc5_avg = np.mean(infer_accs5)
-            _logger.info('Total inference run time: {:.2f} s'.format(
-                infer_total_time))
+            _logger.info(
+                'Total inference run time: {:.2f} s'.format(infer_total_time))
 
             return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
 
@@ -281,8 +286,8 @@ def test_graph_transformation(self):
         _logger.info('Accuracy drop threshold: {0}.'.format(acc_diff_threshold))
 
         _logger.info('--- Quant FP32 prediction start ---')
-        val_reader = paddle.batch(
-            self._reader_creator(data_path), batch_size=batch_size)
+        val_reader = paddle.batch(self._reader_creator(data_path),
+                                  batch_size=batch_size)
         fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
             val_reader,
             quant_model_path,
@@ -291,8 +296,8 @@ def test_graph_transformation(self):
             skip_batch_num,
             transform_to_int8=False)
         _logger.info('--- Quant INT8 prediction start ---')
-        val_reader = paddle.batch(
-            self._reader_creator(data_path), batch_size=batch_size)
+        val_reader = paddle.batch(self._reader_creator(data_path),
+                                  batch_size=batch_size)
         int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
             val_reader,
             quant_model_path,
diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
index 73ec8cf3e023d..cb15b3da4735c 100644
--- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -27,54 +27,56 @@
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--quant_model_path',
-        type=str,
-        default='',
-        help='A path to a Quant model.')
-    parser.add_argument(
-        '--int8_model_save_path',
-        type=str,
-        default='',
-        help='Saved optimized and quantized INT8 model')
+    parser.add_argument('--quant_model_path',
+                        type=str,
+                        default='',
+                        help='A path to a Quant model.')
+    parser.add_argument('--int8_model_save_path',
+                        type=str,
+                        default='',
+                        help='Saved optimized and quantized INT8 model')
     parser.add_argument(
         '--ops_to_quantize',
         type=str,
         default='',
-        help='A comma separated list of operators to quantize. Only quantizable operators are taken into account. If the option is not used, an attempt to quantize all quantizable operators will be made.'
+        help=
+        'A comma separated list of operators to quantize. Only quantizable operators are taken into account. If the option is not used, an attempt to quantize all quantizable operators will be made.'
     )
     parser.add_argument(
         '--op_ids_to_skip',
         type=str,
         default='',
         help='A comma separated list of operator ids to skip in quantization.')
-    parser.add_argument(
-        '--debug',
-        action='store_true',
-        help='If used, the graph of Quant model is drawn.')
+    parser.add_argument('--debug',
+                        action='store_true',
+                        help='If used, the graph of Quant model is drawn.')
     parser.add_argument(
         '--quant_model_filename',
         type=str,
         default="",
-        help='The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.'
+        help=
+        'The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.'
     )
     parser.add_argument(
         '--quant_params_filename',
         type=str,
         default="",
-        help='If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.'
+        help=
+        'If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.'
     )
     parser.add_argument(
         '--save_model_filename',
         type=str,
         default="__model__",
-        help='The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.'
+        help=
+        'The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.'
     )
     parser.add_argument(
         '--save_params_filename',
         type=str,
         default=None,
-        help='The name of file to save all related parameters. If it is set None, parameters will be saved in separate files'
+        help=
+        'The name of file to save all related parameters. If it is set None, parameters will be saved in separate files'
     )
 
     test_args, args = parser.parse_known_args(namespace=unittest)
@@ -96,18 +98,17 @@ def transform_and_save_int8_model(original_path,
     with fluid.scope_guard(inference_scope):
         if not quant_model_filename:
             if os.path.exists(os.path.join(original_path, '__model__')):
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(original_path,
-                                                                exe)
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(original_path, exe)
             else:
-                [inference_program, feed_target_names,
-                 fetch_targets] = fluid.io.load_inference_model(
-                     original_path, exe, 'model', 'params')
+                [inference_program, feed_target_names, fetch_targets
+                 ] = fluid.io.load_inference_model(original_path, exe, 'model',
+                                                   'params')
         else:
-            [inference_program, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(
-                 original_path, exe, quant_model_filename,
-                 quant_params_filename)
+            [inference_program, feed_target_names, fetch_targets
+             ] = fluid.io.load_inference_model(original_path, exe,
+                                               quant_model_filename,
+                                               quant_params_filename)
 
         ops_to_quantize_set = set()
         print(ops_to_quantize)
@@ -132,14 +133,13 @@ def transform_and_save_int8_model(original_path,
         graph = transform_to_mkldnn_int8_pass.apply(graph)
         inference_program = graph.to_program()
         with fluid.scope_guard(inference_scope):
-            fluid.io.save_inference_model(
-                save_path,
-                feed_target_names,
-                fetch_targets,
-                exe,
-                inference_program,
-                model_filename=save_model_filename,
-                params_filename=save_params_filename)
+            fluid.io.save_inference_model(save_path,
+                                          feed_target_names,
+                                          fetch_targets,
+                                          exe,
+                                          inference_program,
+                                          model_filename=save_model_filename,
+                                          params_filename=save_params_filename)
         print(
             "Success! INT8 model obtained from the Quant model can be found at {}\n"
             .format(save_path))
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
index 435cefd73e733..d8887e1964128 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_graph.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py
@@ -31,21 +31,19 @@
 def conv_block():
     img = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_loss = fluid.layers.mean(loss)
@@ -53,6 +51,7 @@ def conv_block():
 
 
 class TestGraph(unittest.TestCase):
+
     def graph_apis(self, use_cuda=False, for_ci=True):
         main = fluid.Program()
         startup = fluid.Program()
@@ -77,8 +76,8 @@ def graph_apis(self, use_cuda=False, for_ci=True):
         exe.run(startup)
         iters = 5
         batch_size = 8
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
 
         def _train(binary):
@@ -101,18 +100,18 @@ def _set_zero(var_name, scope, place):
             var.set(var_array, place)
 
         sum_before = np.sum(
-            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
-            )))
+            np.array(
+                fluid.global_scope().find_var('conv2d_1.w_0').get_tensor()))
         fluid.io._save_persistable_nodes(exe, checkponit_dir, graph)
         _set_zero('conv2d_1.w_0', fluid.global_scope(), place)
         set_after = np.sum(
-            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
-            )))
+            np.array(
+                fluid.global_scope().find_var('conv2d_1.w_0').get_tensor()))
         self.assertEqual(set_after, 0)
         fluid.io._load_persistable_nodes(exe, checkponit_dir, graph)
         sum_after = np.sum(
-            np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor(
-            )))
+            np.array(
+                fluid.global_scope().find_var('conv2d_1.w_0').get_tensor()))
         self.assertEqual(sum_before, sum_after)
 
         marked_nodes = set()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
index 7b9cd7958b2d3..5e0269a271790 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
@@ -43,8 +43,9 @@
 if core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 def get_vaild_warning_num(warning, w):
@@ -56,6 +57,7 @@ def get_vaild_warning_num(warning, w):
 
 
 class ImperativeLenet(fluid.dygraph.Layer):
+
     def __init__(self, num_classes=10):
         super(ImperativeLenet, self).__init__()
         conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
@@ -68,50 +70,36 @@ def __init__(self, num_classes=10):
         fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
         fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
         self.features = Sequential(
-            Conv2D(
-                in_channels=1,
-                out_channels=6,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                weight_attr=conv2d_w1_attr,
-                bias_attr=False),
-            BatchNorm2D(6),
-            ReLU(),
-            MaxPool2D(
-                kernel_size=2, stride=2),
-            Conv2D(
-                in_channels=6,
-                out_channels=16,
-                kernel_size=5,
-                stride=1,
-                padding=0,
-                weight_attr=conv2d_w2_attr,
-                bias_attr=conv2d_b2_attr),
-            BatchNorm2D(16),
-            PReLU(),
-            MaxPool2D(
-                kernel_size=2, stride=2))
+            Conv2D(in_channels=1,
+                   out_channels=6,
+                   kernel_size=3,
+                   stride=1,
+                   padding=1,
+                   weight_attr=conv2d_w1_attr,
+                   bias_attr=False), BatchNorm2D(6), ReLU(),
+            MaxPool2D(kernel_size=2, stride=2),
+            Conv2D(in_channels=6,
+                   out_channels=16,
+                   kernel_size=5,
+                   stride=1,
+                   padding=0,
+                   weight_attr=conv2d_w2_attr,
+                   bias_attr=conv2d_b2_attr), BatchNorm2D(16), PReLU(),
+            MaxPool2D(kernel_size=2, stride=2))
 
         self.fc = Sequential(
-            Linear(
-                in_features=400,
-                out_features=120,
-                weight_attr=fc_w1_attr,
-                bias_attr=fc_b1_attr),
-            LeakyReLU(),
-            Linear(
-                in_features=120,
-                out_features=84,
-                weight_attr=fc_w2_attr,
-                bias_attr=fc_b2_attr),
-            Sigmoid(),
-            Linear(
-                in_features=84,
-                out_features=num_classes,
-                weight_attr=fc_w3_attr,
-                bias_attr=fc_b3_attr),
-            Softmax())
+            Linear(in_features=400,
+                   out_features=120,
+                   weight_attr=fc_w1_attr,
+                   bias_attr=fc_b1_attr), LeakyReLU(),
+            Linear(in_features=120,
+                   out_features=84,
+                   weight_attr=fc_w2_attr,
+                   bias_attr=fc_b2_attr), Sigmoid(),
+            Linear(in_features=84,
+                   out_features=num_classes,
+                   weight_attr=fc_w3_attr,
+                   bias_attr=fc_b3_attr), Softmax())
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -122,6 +110,7 @@ def forward(self, inputs):
 
 
 class TestImperativeOutSclae(unittest.TestCase):
+
     def func_out_scale_acc(self):
         seed = 1000
         lr = 0.001
@@ -141,10 +130,11 @@ def func_out_scale_acc(self):
             lenet = fix_model_dict(lenet)
             imperative_out_scale.quantize(lenet)
 
-            reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
-            adam = AdamOptimizer(
-                learning_rate=lr, parameter_list=lenet.parameters())
+            reader = paddle.batch(paddle.dataset.mnist.test(),
+                                  batch_size=32,
+                                  drop_last=True)
+            adam = AdamOptimizer(learning_rate=lr,
+                                 parameter_list=lenet.parameters())
             loss_list = train_lenet(lenet, reader, adam)
             lenet.eval()
 
@@ -157,14 +147,13 @@ def func_out_scale_acc(self):
             layer=lenet,
             path=save_path,
             input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, 1, 28, 28], dtype='float32')
+                paddle.static.InputSpec(shape=[None, 1, 28, 28],
+                                        dtype='float32')
             ])
 
         for i in range(len(loss_list) - 1):
-            self.assertTrue(
-                loss_list[i] > loss_list[i + 1],
-                msg='Failed to do the imperative qat.')
+            self.assertTrue(loss_list[i] > loss_list[i + 1],
+                            msg='Failed to do the imperative qat.')
 
     def test_out_scale_acc(self):
         with _test_eager_guard():
@@ -173,6 +162,7 @@ def test_out_scale_acc(self):
 
 
 class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
+
     def func_save_quantized_model(self):
         lr = 0.001
 
@@ -191,10 +181,11 @@ def func_save_quantized_model(self):
             imperative_out_scale.quantize(lenet)
             lenet.set_dict(load_dict)
 
-            reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
-            adam = AdamOptimizer(
-                learning_rate=lr, parameter_list=lenet.parameters())
+            reader = paddle.batch(paddle.dataset.mnist.test(),
+                                  batch_size=32,
+                                  drop_last=True)
+            adam = AdamOptimizer(learning_rate=lr,
+                                 parameter_list=lenet.parameters())
             loss_list = train_lenet(lenet, reader, adam)
             lenet.eval()
 
@@ -202,14 +193,13 @@ def func_save_quantized_model(self):
             layer=lenet,
             path=save_path,
             input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, 1, 28, 28], dtype='float32')
+                paddle.static.InputSpec(shape=[None, 1, 28, 28],
+                                        dtype='float32')
             ])
 
         for i in range(len(loss_list) - 1):
-            self.assertTrue(
-                loss_list[i] > loss_list[i + 1],
-                msg='Failed to do the imperative qat.')
+            self.assertTrue(loss_list[i] > loss_list[i + 1],
+                            msg='Failed to do the imperative qat.')
 
     def test_save_quantized_model(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py
index fad4c8f9d580b..402113e5f8d78 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py
@@ -34,8 +34,9 @@
 from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn
 from imperative_test_utils import ImperativeLinearBn_hook
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class TestFuseLinearBn(unittest.TestCase):
@@ -54,15 +55,15 @@ def test_fuse(self):
         quant_h = ptq.quantize(model_h, fuse=True, fuse_list=f_l)
         for name, layer in quant_model.named_sublayers():
             if name in f_l:
-                assert not (isinstance(layer, nn.BatchNorm1D) or
-                            isinstance(layer, nn.BatchNorm2D))
+                assert not (isinstance(layer, nn.BatchNorm1D)
+                            or isinstance(layer, nn.BatchNorm2D))
         out = model(inputs)
         out_h = model_h(inputs)
         out_quant = quant_model(inputs)
         out_quant_h = quant_h(inputs)
         cos_sim_func = nn.CosineSimilarity(axis=0)
-        print('fuse linear+bn',
-              cos_sim_func(out.flatten(), out_quant.flatten()))
+        print('fuse linear+bn', cos_sim_func(out.flatten(),
+                                             out_quant.flatten()))
         print(cos_sim_func(out_h.flatten(), out_quant_h.flatten()))
 
 
@@ -98,8 +99,8 @@ def tearDownClass(cls):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
-                                                          zip_path)
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
+                target_folder, zip_path)
             os.system(cmd)
 
     def download_model(self, data_url, data_md5, folder_name):
@@ -126,23 +127,23 @@ def set_vars(self):
             'batch_norm2d_0': [[0.37673383951187134], [0.44249194860458374]],
             're_lu_0': [[0.44249194860458374], [0.25804123282432556]],
             'max_pool2d_0': [[0.25804123282432556], [0.25804123282432556]],
-            'linear_0':
-            [[1.7058950662612915], [14.405526161193848], [0.4373355209827423]],
+            'linear_0': [[1.7058950662612915], [14.405526161193848],
+                         [0.4373355209827423]],
             'add_0': [[1.7058950662612915, 0.0], [1.7058950662612915]],
         }
 
     def model_test(self, model, batch_num=-1, batch_size=8):
         model.eval()
 
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         eval_acc_top1_list = []
         for batch_id, data in enumerate(test_reader()):
             x_data = np.array([x[0].reshape(1, 28, 28)
                                for x in data]).astype('float32')
-            y_data = np.array(
-                [x[1] for x in data]).astype('int64').reshape(-1, 1)
+            y_data = np.array([x[1]
+                               for x in data]).astype('int64').reshape(-1, 1)
 
             img = paddle.to_tensor(x_data)
             label = paddle.to_tensor(y_data)
@@ -165,11 +166,11 @@ def model_test(self, model, batch_num=-1, batch_size=8):
 
     def program_test(self, program_path, batch_num=-1, batch_size=8):
         exe = paddle.static.Executor(paddle.CPUPlace())
-        [inference_program, feed_target_names, fetch_targets] = (
-            paddle.static.load_inference_model(program_path, exe))
+        [inference_program, feed_target_names, fetch_targets
+         ] = (paddle.static.load_inference_model(program_path, exe))
 
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         top1_correct_num = 0.
         total_num = 0.
@@ -214,11 +215,11 @@ def func_ptq(self):
                                           self.batch_size)
 
         input_spec = [
-            paddle.static.InputSpec(
-                shape=[None, 1, 28, 28], dtype='float32')
+            paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32')
         ]
-        self.ptq.save_quantized_model(
-            model=quant_model, path=self.save_path, input_spec=input_spec)
+        self.ptq.save_quantized_model(model=quant_model,
+                                      path=self.save_path,
+                                      input_spec=input_spec)
         print('Quantized model saved in {%s}' % self.save_path)
 
         after_acc_top1 = self.model_test(quant_model, self.batch_num,
@@ -234,13 +235,11 @@ def func_ptq(self):
         print('After converted acc_top1: %s' % after_acc_top1)
         print('Infer acc_top1: %s' % infer_acc_top1)
 
-        self.assertTrue(
-            after_acc_top1 >= self.eval_acc_top1,
-            msg="The test acc {%f} is less than {%f}." %
-            (after_acc_top1, self.eval_acc_top1))
-        self.assertTrue(
-            infer_acc_top1 >= after_acc_top1,
-            msg='The acc is lower after converting model.')
+        self.assertTrue(after_acc_top1 >= self.eval_acc_top1,
+                        msg="The test acc {%f} is less than {%f}." %
+                        (after_acc_top1, self.eval_acc_top1))
+        self.assertTrue(infer_acc_top1 >= after_acc_top1,
+                        msg='The acc is lower after converting model.')
 
         end_time = time.time()
         print("total time: %ss \n" % (end_time - start_time))
@@ -252,6 +251,7 @@ def test_ptq(self):
 
 
 class TestImperativePTQfuse(TestImperativePTQ):
+
     def func_ptq(self):
         start_time = time.time()
 
@@ -270,17 +270,17 @@ def func_ptq(self):
         quant_model = self.ptq.quantize(model, fuse=True, fuse_list=f_l)
         for name, layer in quant_model.named_sublayers():
             if name in f_l:
-                assert not (isinstance(layer, nn.BatchNorm1D) or
-                            isinstance(layer, nn.BatchNorm2D))
+                assert not (isinstance(layer, nn.BatchNorm1D)
+                            or isinstance(layer, nn.BatchNorm2D))
         before_acc_top1 = self.model_test(quant_model, self.batch_num,
                                           self.batch_size)
 
         input_spec = [
-            paddle.static.InputSpec(
-                shape=[None, 1, 28, 28], dtype='float32')
+            paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32')
         ]
-        self.ptq.save_quantized_model(
-            model=quant_model, path=self.save_path, input_spec=input_spec)
+        self.ptq.save_quantized_model(model=quant_model,
+                                      path=self.save_path,
+                                      input_spec=input_spec)
         print('Quantized model saved in {%s}' % self.save_path)
 
         after_acc_top1 = self.model_test(quant_model, self.batch_num,
@@ -298,15 +298,13 @@ def func_ptq(self):
 
         #Check whether the quant_model is correct after converting.
         #The acc of quantized model should be higher than 0.95.
-        self.assertTrue(
-            after_acc_top1 >= self.eval_acc_top1,
-            msg="The test acc {%f} is less than {%f}." %
-            (after_acc_top1, self.eval_acc_top1))
-        #Check the saved infer_model.The acc of infer model 
+        self.assertTrue(after_acc_top1 >= self.eval_acc_top1,
+                        msg="The test acc {%f} is less than {%f}." %
+                        (after_acc_top1, self.eval_acc_top1))
+        #Check the saved infer_model. The acc of infer model
         #should not be lower than the one of dygraph model.
-        self.assertTrue(
-            infer_acc_top1 >= after_acc_top1,
-            msg='The acc is lower after converting model.')
+        self.assertTrue(infer_acc_top1 >= after_acc_top1,
+                        msg='The acc is lower after converting model.')
 
         end_time = time.time()
         print("total time: %ss \n" % (end_time - start_time))
@@ -318,6 +316,7 @@ def test_ptq(self):
 
 
 class TestImperativePTQHist(TestImperativePTQ):
+
     def set_vars(self):
         config = PTQConfig(HistQuantizer(), AbsmaxQuantizer())
         self.ptq = ImperativePTQ(config)
@@ -327,18 +326,19 @@ def set_vars(self):
         self.eval_acc_top1 = 0.98
 
         self.gt_thresholds = {
-            'conv2d_0':
-            [[0.99853515625], [0.35732391771364225], [0.10933732241392136]],
+            'conv2d_0': [[0.99853515625], [0.35732391771364225],
+                         [0.10933732241392136]],
             'batch_norm2d_0': [[0.35732391771364225], [0.4291427868761275]],
             're_lu_0': [[0.4291427868761275], [0.2359918110742001]],
             'max_pool2d_0': [[0.2359918110742001], [0.25665526917146053]],
-            'linear_0':
-            [[1.7037603475152991], [14.395224522473026], [0.4373355209827423]],
+            'linear_0': [[1.7037603475152991], [14.395224522473026],
+                         [0.4373355209827423]],
             'add_0': [[1.7037603475152991, 0.0], [1.7037603475152991]],
         }
 
 
 class TestImperativePTQKL(TestImperativePTQ):
+
     def set_vars(self):
         config = PTQConfig(KLQuantizer(), PerChannelAbsmaxQuantizer())
         self.ptq = ImperativePTQ(config)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 0d035390e2c00..0bb246f9ac923 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -41,8 +41,9 @@
 if core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class TestImperativeQat(unittest.TestCase):
@@ -68,21 +69,22 @@ def func_qat(self):
 
         with fluid.dygraph.guard():
             # For CI coverage
-            conv1 = Conv2D(
-                in_channels=3,
-                out_channels=2,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                padding_mode='replicate')
+            conv1 = Conv2D(in_channels=3,
+                           out_channels=2,
+                           kernel_size=3,
+                           stride=1,
+                           padding=1,
+                           padding_mode='replicate')
             quant_conv1 = QuantizedConv2D(conv1)
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             quant_conv1(fluid.dygraph.to_variable(data))
 
             conv_transpose = Conv2DTranspose(4, 6, (3, 3))
             quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose)
-            x_var = paddle.uniform(
-                (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0)
+            x_var = paddle.uniform((2, 4, 8, 8),
+                                   dtype='float32',
+                                   min=-1.0,
+                                   max=1.0)
             quant_conv_transpose(x_var)
 
             seed = 1
@@ -93,13 +95,14 @@ def func_qat(self):
             lenet = ImperativeLenet()
             lenet = fix_model_dict(lenet)
             imperative_qat.quantize(lenet)
-            adam = AdamOptimizer(
-                learning_rate=0.001, parameter_list=lenet.parameters())
+            adam = AdamOptimizer(learning_rate=0.001,
+                                 parameter_list=lenet.parameters())
 
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=32, drop_last=True)
-            test_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=32)
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=32,
+                                        drop_last=True)
+            test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                       batch_size=32)
 
             epoch_num = 1
             for epoch in range(epoch_num):
@@ -107,8 +110,8 @@ def func_qat(self):
                 for batch_id, data in enumerate(train_reader()):
                     x_data = np.array([x[0].reshape(1, 28, 28)
                                        for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                    y_data = np.array([x[1] for x in data
+                                       ]).astype('int64').reshape(-1, 1)
 
                     img = fluid.dygraph.to_variable(x_data)
                     label = fluid.dygraph.to_variable(y_data)
@@ -122,8 +125,8 @@ def func_qat(self):
                     if batch_id % 100 == 0:
                         _logger.info(
                             "Train | At epoch {} step {}: loss = {:}, acc= {:}".
-                            format(epoch, batch_id,
-                                   avg_loss.numpy(), acc.numpy()))
+                            format(epoch, batch_id, avg_loss.numpy(),
+                                   acc.numpy()))
                     if batch_id == 500:  # For shortening CI time
                         break
 
@@ -132,39 +135,41 @@ def func_qat(self):
                 for batch_id, data in enumerate(test_reader()):
                     x_data = np.array([x[0].reshape(1, 28, 28)
                                        for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                    y_data = np.array([x[1] for x in data
+                                       ]).astype('int64').reshape(-1, 1)
 
                     img = fluid.dygraph.to_variable(x_data)
                     label = fluid.dygraph.to_variable(y_data)
 
                     out = lenet(img)
-                    acc_top1 = fluid.layers.accuracy(
-                        input=out, label=label, k=1)
-                    acc_top5 = fluid.layers.accuracy(
-                        input=out, label=label, k=5)
+                    acc_top1 = fluid.layers.accuracy(input=out,
+                                                     label=label,
+                                                     k=1)
+                    acc_top5 = fluid.layers.accuracy(input=out,
+                                                     label=label,
+                                                     k=5)
 
                     if batch_id % 100 == 0:
                         eval_acc_top1_list.append(float(acc_top1.numpy()))
                         _logger.info(
-                            "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}".
-                            format(epoch, batch_id,
-                                   acc_top1.numpy(), acc_top5.numpy()))
+                            "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}"
+                            .format(epoch, batch_id, acc_top1.numpy(),
+                                    acc_top5.numpy()))
 
                 # check eval acc
                 eval_acc_top1 = sum(eval_acc_top1_list) / len(
                     eval_acc_top1_list)
                 print('eval_acc_top1', eval_acc_top1)
-                self.assertTrue(
-                    eval_acc_top1 > 0.9,
-                    msg="The test acc {%f} is less than 0.9." % eval_acc_top1)
+                self.assertTrue(eval_acc_top1 > 0.9,
+                                msg="The test acc {%f} is less than 0.9." %
+                                eval_acc_top1)
 
             # test the correctness of `paddle.jit.save`
             data = next(test_reader())
             test_data = np.array([x[0].reshape(1, 28, 28)
                                   for x in data]).astype('float32')
-            y_data = np.array(
-                [x[1] for x in data]).astype('int64').reshape(-1, 1)
+            y_data = np.array([x[1]
+                               for x in data]).astype('int64').reshape(-1, 1)
             test_img = fluid.dygraph.to_variable(test_data)
             label = fluid.dygraph.to_variable(y_data)
             lenet.eval()
@@ -177,8 +182,8 @@ def func_qat(self):
                 layer=lenet,
                 path=os.path.join(tmpdir, "lenet"),
                 input_spec=[
-                    paddle.static.InputSpec(
-                        shape=[None, 1, 28, 28], dtype='float32')
+                    paddle.static.InputSpec(shape=[None, 1, 28, 28],
+                                            dtype='float32')
                 ],
                 onnx_format=self.onnx_format)
             print('Quantized model saved in %s' % tmpdir)
@@ -211,6 +216,7 @@ def test_qat(self):
 
 
 class TestImperativeQatONNXFormat(unittest.TestCase):
+
     def set_vars(self):
         self.weight_quantize_type = 'abs_max'
         self.activation_quantize_type = 'moving_average_abs_max'
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py
index 76a6e11d98dff..804c56cfd873b 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py
@@ -34,8 +34,9 @@
 if paddle.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class TestImperativeQatAmp(unittest.TestCase):
@@ -71,8 +72,8 @@ def tearDownClass(cls):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
-                                                          zip_path)
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
+                target_folder, zip_path)
             os.system(cmd)
 
     def download_model(self, data_url, data_md5, folder_name):
@@ -97,17 +98,17 @@ def set_vars(self):
     def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False):
         model.train()
 
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
-        adam = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=model.parameters())
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=batch_size)
+        adam = paddle.optimizer.Adam(learning_rate=0.001,
+                                     parameters=model.parameters())
         scaler = paddle.amp.GradScaler(init_loss_scaling=500)
 
         for batch_id, data in enumerate(train_reader()):
             x_data = np.array([x[0].reshape(1, 28, 28)
                                for x in data]).astype('float32')
-            y_data = np.array(
-                [x[1] for x in data]).astype('int64').reshape(-1, 1)
+            y_data = np.array([x[1]
+                               for x in data]).astype('int64').reshape(-1, 1)
 
             img = paddle.to_tensor(x_data)
             label = paddle.to_tensor(y_data)
@@ -143,15 +144,15 @@ def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False):
     def model_test(self, model, batch_num=-1, batch_size=32, use_amp=False):
         model.eval()
 
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         acc_top1_list = []
         for batch_id, data in enumerate(test_reader()):
             x_data = np.array([x[0].reshape(1, 28, 28)
                                for x in data]).astype('float32')
-            y_data = np.array(
-                [x[1] for x in data]).astype('int64').reshape(-1, 1)
+            y_data = np.array([x[1]
+                               for x in data]).astype('int64').reshape(-1, 1)
 
             img = paddle.to_tensor(x_data)
             label = paddle.to_tensor(y_data)
@@ -202,14 +203,12 @@ def ptq(self):
 
             _logger.info('fp32_acc_top1: %f, int8_acc_top1: %f' %
                          (fp32_acc_top1, int8_acc_top1))
-            self.assertTrue(
-                int8_acc_top1 > fp32_acc_top1 - 0.01,
-                msg='fp32_acc_top1: %f, int8_acc_top1: %f' %
-                (fp32_acc_top1, int8_acc_top1))
+            self.assertTrue(int8_acc_top1 > fp32_acc_top1 - 0.01,
+                            msg='fp32_acc_top1: %f, int8_acc_top1: %f' %
+                            (fp32_acc_top1, int8_acc_top1))
 
         input_spec = [
-            paddle.static.InputSpec(
-                shape=[None, 1, 28, 28], dtype='float32')
+            paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32')
         ]
         paddle.jit.save(layer=model, path=self.save_path, input_spec=input_spec)
         print('Quantized model saved in {%s}' % self.save_path)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
index 94e0681d1f57e..3770ee486499d 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
@@ -33,11 +33,13 @@
 if core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class TestImperativeQatChannelWise(TestImperativeQat):
+
     def set_vars(self):
         self.weight_quantize_type = 'channel_wise_abs_max'
         self.activation_quantize_type = 'moving_average_abs_max'
@@ -48,6 +50,7 @@ def set_vars(self):
 
 
 class TestImperativeQatChannelWiseONNXFormat(TestImperativeQat):
+
     def set_vars(self):
         self.weight_quantize_type = 'channel_wise_abs_max'
         self.activation_quantize_type = 'moving_average_abs_max'
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py
index d580eb7ae7aef..db7f15c4cef17 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py
@@ -33,11 +33,13 @@
 if core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class TestImperativeQatfuseBN(TestImperativeQat):
+
     def set_vars(self):
         self.weight_quantize_type = 'abs_max'
         self.activation_quantize_type = 'moving_average_abs_max'
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
index 0bc80694a12cb..4d2a990d81daa 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py
@@ -31,20 +31,24 @@
 from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.framework import _test_eager_guard
+
 os.environ["CPU_NUM"] = "1"
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class PACT(nn.Layer):
+
     def __init__(self, init_value=20):
         super(PACT, self).__init__()
         alpha_attr = paddle.ParamAttr(
             name=self.full_name() + ".pact",
             initializer=paddle.nn.initializer.Constant(value=init_value))
-        self.alpha = self.create_parameter(
-            shape=[1], attr=alpha_attr, dtype='float32')
+        self.alpha = self.create_parameter(shape=[1],
+                                           attr=alpha_attr,
+                                           dtype='float32')
 
     def forward(self, x):
         out_left = paddle.nn.functional.relu(x - self.alpha)
@@ -54,24 +58,31 @@ def forward(self, x):
 
 
 class CustomQAT(nn.Layer):
+
     def __init__(self):
         super(CustomQAT, self).__init__()
-        attr = paddle.ParamAttr(
-            initializer=paddle.nn.initializer.Constant(value=1.0))
-        self.u_param = self.create_parameter(
-            shape=[1], attr=attr, dtype='float32')
-        self.l_param = self.create_parameter(
-            shape=[1], attr=attr, dtype='float32')
-        self.alpha_param = self.create_parameter(
-            shape=[1], attr=attr, dtype='float32')
-        self.upper = self.create_parameter(
-            shape=[1], attr=attr, dtype='float32')
+        attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(
+            value=1.0))
+        self.u_param = self.create_parameter(shape=[1],
+                                             attr=attr,
+                                             dtype='float32')
+        self.l_param = self.create_parameter(shape=[1],
+                                             attr=attr,
+                                             dtype='float32')
+        self.alpha_param = self.create_parameter(shape=[1],
+                                                 attr=attr,
+                                                 dtype='float32')
+        self.upper = self.create_parameter(shape=[1],
+                                           attr=attr,
+                                           dtype='float32')
         self.upper.stop_gradient = True
-        self.lower = self.create_parameter(
-            shape=[1], attr=attr, dtype='float32')
+        self.lower = self.create_parameter(shape=[1],
+                                           attr=attr,
+                                           dtype='float32')
         self.lower.stop_gradient = True
 
     def forward(self, x):
+
         def clip(x, upper, lower):
             x = x + paddle.nn.functional.relu(lower - x)
             x = x - paddle.nn.functional.relu(x - upper)
@@ -102,6 +113,7 @@ def dequantize(x, lower_bound, delta, interval):
 
 
 class ModelForConv2dT(nn.Layer):
+
     def __init__(self, num_classes=10):
         super(ModelForConv2dT, self).__init__()
         self.features = nn.Conv2DTranspose(4, 6, (3, 3))
@@ -115,34 +127,29 @@ def forward(self, inputs):
 
 
 class ImperativeLenet(paddle.nn.Layer):
+
     def __init__(self, num_classes=10, classifier_activation='softmax'):
         super(ImperativeLenet, self).__init__()
         self.features = Sequential(
-            Conv2D(
-                num_channels=1,
-                num_filters=6,
-                filter_size=3,
-                stride=1,
-                padding=1),
-            Pool2D(
-                pool_size=2, pool_type='max', pool_stride=2),
-            Conv2D(
-                num_channels=6,
-                num_filters=16,
-                filter_size=5,
-                stride=1,
-                padding=0),
-            Pool2D(
-                pool_size=2, pool_type='max', pool_stride=2))
+            Conv2D(num_channels=1,
+                   num_filters=6,
+                   filter_size=3,
+                   stride=1,
+                   padding=1),
+            Pool2D(pool_size=2, pool_type='max', pool_stride=2),
+            Conv2D(num_channels=6,
+                   num_filters=16,
+                   filter_size=5,
+                   stride=1,
+                   padding=0),
+            Pool2D(pool_size=2, pool_type='max', pool_stride=2))
 
         self.fc = Sequential(
-            Linear(
-                input_dim=400, output_dim=120),
-            Linear(
-                input_dim=120, output_dim=84),
-            Linear(
-                input_dim=84, output_dim=num_classes,
-                act=classifier_activation))
+            Linear(input_dim=400, output_dim=120),
+            Linear(input_dim=120, output_dim=84),
+            Linear(input_dim=84,
+                   output_dim=num_classes,
+                   act=classifier_activation))
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -153,6 +160,7 @@ def forward(self, inputs):
 
 
 class TestUserDefinedActPreprocess(unittest.TestCase):
+
     def setUp(self):
         _logger.info("test act_preprocess")
         self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT)
@@ -196,8 +204,8 @@ def train(model):
                 for batch_id, data in enumerate(train_reader()):
                     x_data = np.array([x[0].reshape(1, 28, 28)
                                        for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                    y_data = np.array([x[1] for x in data
+                                       ]).astype('int64').reshape(-1, 1)
 
                     img = paddle.to_tensor(x_data)
                     label = paddle.to_tensor(y_data)
@@ -211,8 +219,8 @@ def train(model):
                     if batch_id % 50 == 0:
                         _logger.info(
                             "Train | At epoch {} step {}: loss = {:}, acc= {:}".
-                            format(epoch, batch_id,
-                                   avg_loss.numpy(), acc.numpy()))
+                            format(epoch, batch_id, avg_loss.numpy(),
+                                   acc.numpy()))
                         break
 
         def test(model):
@@ -221,8 +229,8 @@ def test(model):
             for batch_id, data in enumerate(test_reader()):
                 x_data = np.array([x[0].reshape(1, 28, 28)
                                    for x in data]).astype('float32')
-                y_data = np.array(
-                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(-1, 1)
 
                 img = paddle.to_tensor(x_data)
                 label = paddle.to_tensor(y_data)
@@ -237,8 +245,9 @@ def test(model):
                         "Test | step {}: acc1 = {:}, acc5 = {:}".format(
                             batch_id, acc_top1.numpy(), acc_top5.numpy()))
 
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=512, drop_last=True)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=512,
+                                    drop_last=True)
         test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=512)
         train(lenet)
         test(lenet)
@@ -250,18 +259,21 @@ def test_quant_aware_training(self):
 
 
 class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess):
+
     def setUp(self):
         _logger.info("test weight_preprocess")
         self.imperative_qat = ImperativeQuantAware(weight_preprocess_layer=PACT)
 
 
 class TestUserDefinedActQuantize(TestUserDefinedActPreprocess):
+
     def setUp(self):
         _logger.info("test act_quantize")
         self.imperative_qat = ImperativeQuantAware(act_quantize_layer=CustomQAT)
 
 
 class TestUserDefinedWeightQuantize(TestUserDefinedActPreprocess):
+
     def setUp(self):
         _logger.info("test weight_quantize")
         self.imperative_qat = ImperativeQuantAware(
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
index d77134d72a959..e562cc2452aa4 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py
@@ -38,11 +38,13 @@
 if core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class TestImperativeOutSclae(unittest.TestCase):
+
     def func_out_scale_acc(self):
         paddle.disable_static()
         seed = 1000
@@ -51,15 +53,16 @@ def func_out_scale_acc(self):
         qat = ImperativeQuantAware()
 
         np.random.seed(seed)
-        reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=512, drop_last=True)
+        reader = paddle.batch(paddle.dataset.mnist.test(),
+                              batch_size=512,
+                              drop_last=True)
 
         lenet = ImperativeLenetWithSkipQuant()
         lenet = fix_model_dict(lenet)
         qat.quantize(lenet)
 
-        adam = AdamOptimizer(
-            learning_rate=lr, parameter_list=lenet.parameters())
+        adam = AdamOptimizer(learning_rate=lr,
+                             parameter_list=lenet.parameters())
         dynamic_loss_rec = []
         lenet.train()
         loss_list = train_lenet(lenet, reader, adam)
@@ -69,13 +72,13 @@ def func_out_scale_acc(self):
         path = "./save_dynamic_quant_infer_model/lenet"
         save_dir = "./save_dynamic_quant_infer_model"
 
-        qat.save_quantized_model(
-            layer=lenet,
-            path=path,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, 1, 28, 28], dtype='float32')
-            ])
+        qat.save_quantized_model(layer=lenet,
+                                 path=path,
+                                 input_spec=[
+                                     paddle.static.InputSpec(
+                                         shape=[None, 1, 28, 28],
+                                         dtype='float32')
+                                 ])
 
         paddle.enable_static()
 
@@ -85,12 +88,12 @@ def func_out_scale_acc(self):
             place = core.CPUPlace()
         exe = fluid.Executor(place)
 
-        [inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=save_dir,
-                executor=exe,
-                model_filename="lenet" + INFER_MODEL_SUFFIX,
-                params_filename="lenet" + INFER_PARAMS_SUFFIX))
+        [inference_program, feed_target_names,
+         fetch_targets] = (fluid.io.load_inference_model(
+             dirname=save_dir,
+             executor=exe,
+             model_filename="lenet" + INFER_MODEL_SUFFIX,
+             params_filename="lenet" + INFER_PARAMS_SUFFIX))
         model_ops = inference_program.global_block().ops
 
         conv2d_count, matmul_count = 0, 0
diff --git a/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py b/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py
index 656fb1dda3bd1..fcf82c2fc890b 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py
@@ -37,19 +37,21 @@ def init_data(batch_size=32, img_shape=[784], label_range=9):
 
 
 class TestMovingAverageAbsMaxScaleOp(unittest.TestCase):
+
     def check_backward(self, use_cuda):
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
-            image = fluid.layers.data(
-                name='image', shape=[784], dtype='float32')
+            image = fluid.layers.data(name='image',
+                                      shape=[784],
+                                      dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             fc_tmp = fluid.layers.fc(image, size=10, act='softmax')
             out_scale = quant_layers.MovingAverageAbsMaxScale(
                 name=fc_tmp.name, dtype=fc_tmp.dtype)
             fc_tmp_1 = out_scale(fc_tmp)
-            cross_entropy = fluid.layers.softmax_with_cross_entropy(fc_tmp,
-                                                                    label)
+            cross_entropy = fluid.layers.softmax_with_cross_entropy(
+                fc_tmp, label)
             loss = fluid.layers.reduce_mean(cross_entropy)
             sgd = fluid.optimizer.SGD(learning_rate=1e-3)
             sgd.minimize(loss)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py
index 89e0e099f44c2..1beb0f916d48e 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py
@@ -33,6 +33,7 @@
 
 
 class TestPostTrainingQuantization(unittest.TestCase):
+
     def setUp(self):
         self.download_path = 'int8/download'
         self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
@@ -43,21 +44,21 @@ def setUp(self):
         try:
             os.system("mkdir -p " + self.int8_model_path)
         except Exception as e:
-            print("Failed to create {} due to {}".format(self.int8_model_path,
-                                                         str(e)))
+            print("Failed to create {} due to {}".format(
+                self.int8_model_path, str(e)))
             sys.exit(-1)
 
     def tearDown(self):
         try:
             os.system("rm -rf {}".format(self.int8_model_path))
         except Exception as e:
-            print("Failed to delete {} due to {}".format(self.int8_model_path,
-                                                         str(e)))
+            print("Failed to delete {} due to {}".format(
+                self.int8_model_path, str(e)))
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
-                                                          zip_path)
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
+                target_folder, zip_path)
             os.system(cmd)
 
     def download_model(self, data_url, data_md5, folder_name):
@@ -71,6 +72,7 @@ def download_model(self, data_url, data_md5, folder_name):
         return data_cache_folder
 
     def get_batch_reader(self, data_path, place):
+
         def reader():
             with open(data_path, 'rb') as in_file:
                 while True:
@@ -83,15 +85,14 @@ def reader():
                     seq_len = (alllen >> 16) & 0xFFFF
 
                     label = in_file.read(4 * label_len)
-                    label = np.frombuffer(
-                        label, dtype=np.int32).reshape([len(label) // 4])
+                    label = np.frombuffer(label, dtype=np.int32).reshape(
+                        [len(label) // 4])
                     if label.shape[0] != 1 or label[0] > 6350:
                         continue
 
                     feat = in_file.read(4 * seq_len * 8)
-                    feat = np.frombuffer(
-                        feat,
-                        dtype=np.float32).reshape([len(feat) // 4 // 8, 8])
+                    feat = np.frombuffer(feat, dtype=np.float32).reshape(
+                        [len(feat) // 4 // 8, 8])
                     lod_feat = [feat.shape[0]]
 
                     minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
@@ -100,6 +101,7 @@ def reader():
         return reader
 
     def get_simple_reader(self, data_path, place):
+
         def reader():
             with open(data_path, 'rb') as in_file:
                 while True:
@@ -112,15 +114,14 @@ def reader():
                     seq_len = (alllen >> 16) & 0xFFFF
 
                     label = in_file.read(4 * label_len)
-                    label = np.frombuffer(
-                        label, dtype=np.int32).reshape([len(label) // 4])
+                    label = np.frombuffer(label, dtype=np.int32).reshape(
+                        [len(label) // 4])
                     if label.shape[0] != 1 or label[0] > 6350:
                         continue
 
                     feat = in_file.read(4 * seq_len * 8)
-                    feat = np.frombuffer(
-                        feat,
-                        dtype=np.float32).reshape([len(feat) // 4 // 8, 8])
+                    feat = np.frombuffer(feat, dtype=np.float32).reshape(
+                        [len(feat) // 4 // 8, 8])
                     lod_feat = [feat.shape[0]]
 
                     minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
@@ -181,18 +182,17 @@ def generate_quantized_model(self,
         scope = fluid.global_scope()
         batch_generator = self.get_batch_reader(data_path, place)
 
-        ptq = PostTrainingQuantization(
-            executor=exe,
-            model_dir=model_path,
-            batch_generator=batch_generator,
-            batch_nums=batch_nums,
-            algo=algo,
-            quantizable_op_type=quantizable_op_type,
-            round_type=round_type,
-            is_full_quantize=is_full_quantize,
-            optimize_model=is_optimize_model,
-            onnx_format=onnx_format,
-            is_use_cache_file=is_use_cache_file)
+        ptq = PostTrainingQuantization(executor=exe,
+                                       model_dir=model_path,
+                                       batch_generator=batch_generator,
+                                       batch_nums=batch_nums,
+                                       algo=algo,
+                                       quantizable_op_type=quantizable_op_type,
+                                       round_type=round_type,
+                                       is_full_quantize=is_full_quantize,
+                                       optimize_model=is_optimize_model,
+                                       onnx_format=onnx_format,
+                                       is_use_cache_file=is_use_cache_file)
         ptq.quantize()
         ptq.save_quantized_model(self.int8_model_path)
 
@@ -226,10 +226,11 @@ def run_test(self,
 
         print("Start post training quantization for {0} on {1} samples ...".
               format(model_name, quant_iterations))
-        self.generate_quantized_model(
-            fp32_model_path, data_path, algo, round_type, quantizable_op_type,
-            is_full_quantize, is_use_cache_file, is_optimize_model,
-            quant_iterations, onnx_format)
+        self.generate_quantized_model(fp32_model_path, data_path, algo,
+                                      round_type, quantizable_op_type,
+                                      is_full_quantize, is_use_cache_file,
+                                      is_optimize_model, quant_iterations,
+                                      onnx_format)
 
         print("Start INT8 inference for {0} on {1} samples ...".format(
             model_name, infer_iterations))
@@ -248,6 +249,7 @@ def run_test(self,
 
 
 class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization):
+
     def test_post_training_avg(self):
         model_name = "nlp_lstm_fp32_model"
         model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz"
@@ -271,6 +273,7 @@ def test_post_training_avg(self):
 
 
 class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization):
+
     def test_post_training_avg_onnx_format(self):
         model_name = "nlp_lstm_fp32_model"
         model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz"
@@ -288,23 +291,22 @@ def test_post_training_avg_onnx_format(self):
         infer_iterations = 100
         quant_iterations = 10
         onnx_format = True
-        self.run_test(
-            model_name,
-            model_url,
-            model_md5,
-            data_name,
-            data_url,
-            data_md5,
-            algo,
-            round_type,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            diff_threshold,
-            infer_iterations,
-            quant_iterations,
-            onnx_format=onnx_format)
+        self.run_test(model_name,
+                      model_url,
+                      model_md5,
+                      data_name,
+                      data_url,
+                      data_md5,
+                      algo,
+                      round_type,
+                      quantizable_op_type,
+                      is_full_quantize,
+                      is_use_cache_file,
+                      is_optimize_model,
+                      diff_threshold,
+                      infer_iterations,
+                      quant_iterations,
+                      onnx_format=onnx_format)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
index 4c3a758f0e36d..cb76f4bbac084 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
@@ -32,6 +32,7 @@
 
 
 class TestPostTrainingQuantization(unittest.TestCase):
+
     def setUp(self):
         self.download_path = 'int8/download'
         self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
@@ -42,21 +43,21 @@ def setUp(self):
         try:
             os.system("mkdir -p " + self.int8_model_path)
         except Exception as e:
-            print("Failed to create {} due to {}".format(self.int8_model_path,
-                                                         str(e)))
+            print("Failed to create {} due to {}".format(
+                self.int8_model_path, str(e)))
             sys.exit(-1)
 
     def tearDown(self):
         try:
             os.system("rm -rf {}".format(self.int8_model_path))
         except Exception as e:
-            print("Failed to delete {} due to {}".format(self.int8_model_path,
-                                                         str(e)))
+            print("Failed to delete {} due to {}".format(
+                self.int8_model_path, str(e)))
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
-                                                          zip_path)
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
+                target_folder, zip_path)
             os.system(cmd)
 
     def download_model(self, data_url, data_md5, folder_name):
@@ -82,8 +83,8 @@ def run_program(self, model_path, batch_size, infer_iterations):
         cnt = 0
         periods = []
         for batch_id, data in enumerate(val_reader()):
-            image = np.array(
-                [x[0].reshape(img_shape) for x in data]).astype("float32")
+            image = np.array([x[0].reshape(img_shape)
+                              for x in data]).astype("float32")
             input_label = np.array([x[1] for x in data]).astype("int64")
 
             t1 = time.time()
@@ -125,20 +126,19 @@ def generate_quantized_model(self,
         scope = fluid.global_scope()
         val_reader = paddle.dataset.mnist.train()
 
-        ptq = PostTrainingQuantization(
-            executor=exe,
-            model_dir=model_path,
-            sample_generator=val_reader,
-            batch_size=batch_size,
-            batch_nums=batch_nums,
-            algo=algo,
-            quantizable_op_type=quantizable_op_type,
-            round_type=round_type,
-            is_full_quantize=is_full_quantize,
-            optimize_model=is_optimize_model,
-            onnx_format=onnx_format,
-            skip_tensor_list=skip_tensor_list,
-            is_use_cache_file=is_use_cache_file)
+        ptq = PostTrainingQuantization(executor=exe,
+                                       model_dir=model_path,
+                                       sample_generator=val_reader,
+                                       batch_size=batch_size,
+                                       batch_nums=batch_nums,
+                                       algo=algo,
+                                       quantizable_op_type=quantizable_op_type,
+                                       round_type=round_type,
+                                       is_full_quantize=is_full_quantize,
+                                       optimize_model=is_optimize_model,
+                                       onnx_format=onnx_format,
+                                       skip_tensor_list=skip_tensor_list,
+                                       is_use_cache_file=is_use_cache_file)
         ptq.quantize()
         ptq.save_quantized_model(self.int8_model_path)
 
@@ -164,30 +164,33 @@ def run_test(self,
 
         print("Start FP32 inference for {0} on {1} images ...".format(
             model_name, infer_iterations * batch_size))
-        (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
-            origin_model_path, batch_size, infer_iterations)
+        (fp32_throughput, fp32_latency,
+         fp32_acc1) = self.run_program(origin_model_path, batch_size,
+                                       infer_iterations)
 
         print("Start INT8 post training quantization for {0} on {1} images ...".
               format(model_name, quant_iterations * batch_size))
-        self.generate_quantized_model(
-            origin_model_path, algo, round_type, quantizable_op_type,
-            is_full_quantize, is_use_cache_file, is_optimize_model, batch_size,
-            quant_iterations, onnx_format, skip_tensor_list)
+        self.generate_quantized_model(origin_model_path, algo, round_type,
+                                      quantizable_op_type, is_full_quantize,
+                                      is_use_cache_file, is_optimize_model,
+                                      batch_size, quant_iterations, onnx_format,
+                                      skip_tensor_list)
 
         print("Start INT8 inference for {0} on {1} images ...".format(
             model_name, infer_iterations * batch_size))
-        (int8_throughput, int8_latency, int8_acc1) = self.run_program(
-            self.int8_model_path, batch_size, infer_iterations)
+        (int8_throughput, int8_latency,
+         int8_acc1) = self.run_program(self.int8_model_path, batch_size,
+                                       infer_iterations)
 
         print("---Post training quantization of {} method---".format(algo))
         print(
-            "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.".
-            format(model_name, batch_size, fp32_throughput, fp32_latency,
-                   fp32_acc1))
+            "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}."
+            .format(model_name, batch_size, fp32_throughput, fp32_latency,
+                    fp32_acc1))
         print(
-            "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n".
-            format(model_name, batch_size, int8_throughput, int8_latency,
-                   int8_acc1))
+            "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n"
+            .format(model_name, batch_size, int8_throughput, int8_latency,
+                    int8_acc1))
         sys.stdout.flush()
 
         delta_value = fp32_acc1 - int8_acc1
@@ -195,6 +198,7 @@ def run_test(self,
 
 
 class TestPostTrainingKLForMnist(TestPostTrainingQuantization):
+
     def test_post_training_kl(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -216,6 +220,7 @@ def test_post_training_kl(self):
 
 
 class TestPostTraininghistForMnist(TestPostTrainingQuantization):
+
     def test_post_training_hist(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -237,6 +242,7 @@ def test_post_training_hist(self):
 
 
 class TestPostTrainingmseForMnist(TestPostTrainingQuantization):
+
     def test_post_training_mse(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -258,6 +264,7 @@ def test_post_training_mse(self):
 
 
 class TestPostTrainingemdForMnist(TestPostTrainingQuantization):
+
     def test_post_training_mse(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -279,6 +286,7 @@ def test_post_training_mse(self):
 
 
 class TestPostTrainingavgForMnist(TestPostTrainingQuantization):
+
     def test_post_training_avg(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -300,6 +308,7 @@ def test_post_training_avg(self):
 
 
 class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization):
+
     def test_post_training_abs_max(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -321,6 +330,7 @@ def test_post_training_abs_max(self):
 
 
 class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization):
+
     def test_post_training_mse(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -342,6 +352,7 @@ def test_post_training_mse(self):
 
 
 class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization):
+
     def test_post_training_kl(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -363,6 +374,7 @@ def test_post_training_kl(self):
 
 
 class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization):
+
     def test_post_training_mse_onnx_format(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -378,25 +390,25 @@ def test_post_training_mse_onnx_format(self):
         batch_size = 10
         infer_iterations = 50
         quant_iterations = 5
-        self.run_test(
-            model_name,
-            data_url,
-            data_md5,
-            algo,
-            round_type,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            diff_threshold,
-            batch_size,
-            infer_iterations,
-            quant_iterations,
-            onnx_format=onnx_format)
+        self.run_test(model_name,
+                      data_url,
+                      data_md5,
+                      algo,
+                      round_type,
+                      quantizable_op_type,
+                      is_full_quantize,
+                      is_use_cache_file,
+                      is_optimize_model,
+                      diff_threshold,
+                      batch_size,
+                      infer_iterations,
+                      quant_iterations,
+                      onnx_format=onnx_format)
 
 
 class TestPostTrainingmseForMnistONNXFormatFullQuant(
         TestPostTrainingQuantization):
+
     def test_post_training_mse_onnx_format_full_quant(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -412,24 +424,24 @@ def test_post_training_mse_onnx_format_full_quant(self):
         batch_size = 10
         infer_iterations = 50
         quant_iterations = 5
-        self.run_test(
-            model_name,
-            data_url,
-            data_md5,
-            algo,
-            round_type,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            diff_threshold,
-            batch_size,
-            infer_iterations,
-            quant_iterations,
-            onnx_format=onnx_format)
+        self.run_test(model_name,
+                      data_url,
+                      data_md5,
+                      algo,
+                      round_type,
+                      quantizable_op_type,
+                      is_full_quantize,
+                      is_use_cache_file,
+                      is_optimize_model,
+                      diff_threshold,
+                      batch_size,
+                      infer_iterations,
+                      quant_iterations,
+                      onnx_format=onnx_format)
 
 
 class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization):
+
     def test_post_training_avg_skip_op(self):
         model_name = "mnist_model"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
@@ -445,21 +457,20 @@ def test_post_training_avg_skip_op(self):
         infer_iterations = 50
         quant_iterations = 5
         skip_tensor_list = ["fc_0.w_0"]
-        self.run_test(
-            model_name,
-            data_url,
-            data_md5,
-            algo,
-            round_type,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            diff_threshold,
-            batch_size,
-            infer_iterations,
-            quant_iterations,
-            skip_tensor_list=skip_tensor_list)
+        self.run_test(model_name,
+                      data_url,
+                      data_md5,
+                      algo,
+                      round_type,
+                      quantizable_op_type,
+                      is_full_quantize,
+                      is_use_cache_file,
+                      is_optimize_model,
+                      diff_threshold,
+                      batch_size,
+                      infer_iterations,
+                      quant_iterations,
+                      skip_tensor_list=skip_tensor_list)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
index 56d77f77b5083..b36f036d41590 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
@@ -82,6 +82,7 @@ def _reader_creator(file_list,
                     color_jitter=False,
                     rotate=False,
                     data_dir=DATA_DIR):
+
     def reader():
         with open(file_list) as flist:
             full_lines = [line.strip() for line in flist]
@@ -96,8 +97,10 @@ def reader():
                     continue
                 yield img_path, int(label)
 
-    mapper = functools.partial(
-        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+    mapper = functools.partial(process_image,
+                               mode=mode,
+                               color_jitter=color_jitter,
+                               rotate=rotate)
 
     return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
 
@@ -108,6 +111,7 @@ def val(data_dir=DATA_DIR):
 
 
 class TestPostTrainingQuantization(unittest.TestCase):
+
     def setUp(self):
         self.int8_download = 'int8/download'
         self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
@@ -154,13 +158,13 @@ def tearDown(self):
         try:
             os.system("rm -rf {}".format(self.int8_model))
         except Exception as e:
-            print("Failed to delete {} due to {}".format(self.int8_model,
-                                                         str(e)))
+            print("Failed to delete {} due to {}".format(
+                self.int8_model, str(e)))
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
-                                                          zip_path)
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
+                target_folder, zip_path)
             os.system(cmd)
 
     def download_data(self, data_urls, data_md5s, folder_name, is_model=True):
@@ -207,17 +211,18 @@ def run_program(self, model_path, batch_size, infer_iterations):
         cnt = 0
         periods = []
         for batch_id, data in enumerate(val_reader()):
-            image = np.array(
-                [x[0].reshape(image_shape) for x in data]).astype("float32")
+            image = np.array([x[0].reshape(image_shape)
+                              for x in data]).astype("float32")
             label = np.array([x[1] for x in data]).astype("int64")
             label = label.reshape([-1, 1])
 
             t1 = time.time()
-            _, acc1, _ = exe.run(
-                infer_program,
-                feed={feed_dict[0]: image,
-                      feed_dict[1]: label},
-                fetch_list=fetch_targets)
+            _, acc1, _ = exe.run(infer_program,
+                                 feed={
+                                     feed_dict[0]: image,
+                                     feed_dict[1]: label
+                                 },
+                                 fetch_list=fetch_targets)
             t2 = time.time()
             period = t2 - t1
             periods.append(period)
@@ -248,8 +253,8 @@ def generate_quantized_model(self,
         try:
             os.system("mkdir " + self.int8_model)
         except Exception as e:
-            print("Failed to create {} due to {}".format(self.int8_model,
-                                                         str(e)))
+            print("Failed to create {} due to {}".format(
+                self.int8_model, str(e)))
             sys.exit(-1)
 
         place = fluid.CPUPlace()
@@ -257,17 +262,16 @@ def generate_quantized_model(self,
         scope = fluid.global_scope()
         val_reader = val()
 
-        ptq = PostTrainingQuantization(
-            executor=exe,
-            sample_generator=val_reader,
-            model_dir=model_path,
-            algo=algo,
-            quantizable_op_type=quantizable_op_type,
-            round_type=round_type,
-            is_full_quantize=is_full_quantize,
-            optimize_model=is_optimize_model,
-            onnx_format=onnx_format,
-            is_use_cache_file=is_use_cache_file)
+        ptq = PostTrainingQuantization(executor=exe,
+                                       sample_generator=val_reader,
+                                       model_dir=model_path,
+                                       algo=algo,
+                                       quantizable_op_type=quantizable_op_type,
+                                       round_type=round_type,
+                                       is_full_quantize=is_full_quantize,
+                                       optimize_model=is_optimize_model,
+                                       onnx_format=onnx_format,
+                                       is_use_cache_file=is_use_cache_file)
         ptq.quantize()
         ptq.save_quantized_model(self.int8_model)
 
@@ -291,8 +295,9 @@ def run_test(self,
 
         print("Start FP32 inference for {0} on {1} images ...".format(
             model, infer_iterations * batch_size))
-        (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
-            model_cache_folder + "/model", batch_size, infer_iterations)
+        (fp32_throughput, fp32_latency,
+         fp32_acc1) = self.run_program(model_cache_folder + "/model",
+                                       batch_size, infer_iterations)
 
         print("Start INT8 post training quantization for {0} on {1} images ...".
               format(model, sample_iterations * batch_size))
@@ -303,16 +308,19 @@ def run_test(self,
 
         print("Start INT8 inference for {0} on {1} images ...".format(
             model, infer_iterations * batch_size))
-        (int8_throughput, int8_latency, int8_acc1) = self.run_program(
-            self.int8_model, batch_size, infer_iterations)
+        (int8_throughput, int8_latency,
+         int8_acc1) = self.run_program(self.int8_model, batch_size,
+                                       infer_iterations)
 
         print("---Post training quantization of {} method---".format(algo))
         print(
-            "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.".
-            format(model, batch_size, fp32_throughput, fp32_latency, fp32_acc1))
+            "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}."
+            .format(model, batch_size, fp32_throughput, fp32_latency,
+                    fp32_acc1))
         print(
-            "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.\n".
-            format(model, batch_size, int8_throughput, int8_latency, int8_acc1))
+            "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.\n"
+            .format(model, batch_size, int8_throughput, int8_latency,
+                    int8_acc1))
         sys.stdout.flush()
 
         delta_value = fp32_acc1 - int8_acc1
@@ -320,6 +328,7 @@ def run_test(self,
 
 
 class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization):
+
     def test_post_training_kl_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "KL"
@@ -344,6 +353,7 @@ def test_post_training_kl_mobilenetv1(self):
 
 
 class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization):
+
     def test_post_training_avg_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "avg"
@@ -367,6 +377,7 @@ def test_post_training_avg_mobilenetv1(self):
 
 
 class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization):
+
     def test_post_training_hist_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "hist"
@@ -390,6 +401,7 @@ def test_post_training_hist_mobilenetv1(self):
 
 
 class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
+
     def test_post_training_abs_max_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "abs_max"
@@ -413,6 +425,7 @@ def test_post_training_abs_max_mobilenetv1(self):
 
 
 class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
+
     def test_post_training_onnx_format_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "avg"
@@ -431,18 +444,17 @@ def test_post_training_onnx_format_mobilenetv1(self):
         is_optimize_model = True
         onnx_format = True
         diff_threshold = 0.05
-        self.run_test(
-            model,
-            algo,
-            round_type,
-            data_urls,
-            data_md5s,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            diff_threshold,
-            onnx_format=onnx_format)
+        self.run_test(model,
+                      algo,
+                      round_type,
+                      data_urls,
+                      data_md5s,
+                      quantizable_op_type,
+                      is_full_quantize,
+                      is_use_cache_file,
+                      is_optimize_model,
+                      diff_threshold,
+                      onnx_format=onnx_format)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
index dc12026a21ab1..c79499100cee3 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
@@ -21,6 +21,7 @@
 
 
 class TestPostTrainingForResnet50(TestPostTrainingQuantization):
+
     def test_post_training_resnet50(self):
         model = "ResNet-50"
         algo = "min_max"
@@ -40,6 +41,7 @@ def test_post_training_resnet50(self):
 
 
 class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
+
     def test_post_training_resnet50(self):
         model = "ResNet-50"
         algo = "min_max"
@@ -54,18 +56,17 @@ def test_post_training_resnet50(self):
         is_optimize_model = False
         diff_threshold = 0.025
         onnx_format = True
-        self.run_test(
-            model,
-            algo,
-            round_type,
-            data_urls,
-            data_md5s,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            diff_threshold,
-            onnx_format=onnx_format)
+        self.run_test(model,
+                      algo,
+                      round_type,
+                      data_urls,
+                      data_md5s,
+                      quantizable_op_type,
+                      is_full_quantize,
+                      is_use_cache_file,
+                      is_optimize_model,
+                      diff_threshold,
+                      onnx_format=onnx_format)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py
index 642bcf2a47679..f4eaf5d9bc777 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py
@@ -32,6 +32,7 @@
 
 
 class TestPostTrainingQuantization(unittest.TestCase):
+
     def setUp(self):
         self.download_path = 'int8/download'
         self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
@@ -42,16 +43,16 @@ def setUp(self):
         try:
             os.system("mkdir -p " + self.int8_model_path)
         except Exception as e:
-            print("Failed to create {} due to {}".format(self.int8_model_path,
-                                                         str(e)))
+            print("Failed to create {} due to {}".format(
+                self.int8_model_path, str(e)))
             sys.exit(-1)
 
     def tearDown(self):
         try:
             os.system("rm -rf {}".format(self.int8_model_path))
         except Exception as e:
-            print("Failed to delete {} due to {}".format(self.int8_model_path,
-                                                         str(e)))
+            print("Failed to delete {} due to {}".format(
+                self.int8_model_path, str(e)))
 
     def cache_unzipping(self, target_folder, zip_path):
         cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder)
@@ -82,8 +83,8 @@ def run_program(self, model_path, batch_size, infer_iterations):
         cnt = 0
         periods = []
         for batch_id, data in enumerate(val_reader()):
-            image = np.array(
-                [x[0].reshape(img_shape) for x in data]).astype("float32")
+            image = np.array([x[0].reshape(img_shape)
+                              for x in data]).astype("float32")
             input_label = np.array([x[1] for x in data]).astype("int64")
 
             t1 = time.time()
@@ -147,10 +148,9 @@ def val_data_generator():
             optimize_model=is_optimize_model,
             is_use_cache_file=is_use_cache_file)
         ptq.quantize()
-        ptq.save_quantized_model(
-            self.int8_model_path,
-            model_filename='model.pdmodel',
-            params_filename='model.pdiparams')
+        ptq.save_quantized_model(self.int8_model_path,
+                                 model_filename='model.pdmodel',
+                                 params_filename='model.pdiparams')
 
     def run_test(self,
                  model_name,
@@ -172,36 +172,37 @@ def run_test(self,
 
         print("Start FP32 inference for {0} on {1} images ...".format(
             model_name, infer_iterations * batch_size))
-        (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
-            origin_model_path, batch_size, infer_iterations)
+        (fp32_throughput, fp32_latency,
+         fp32_acc1) = self.run_program(origin_model_path, batch_size,
+                                       infer_iterations)
 
         print("Start INT8 post training quantization for {0} on {1} images ...".
               format(model_name, quant_iterations * batch_size))
-        self.generate_quantized_model(
-            origin_model_path,
-            algo,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            batch_size,
-            quant_iterations,
-            is_data_loader=is_data_loader)
+        self.generate_quantized_model(origin_model_path,
+                                      algo,
+                                      quantizable_op_type,
+                                      is_full_quantize,
+                                      is_use_cache_file,
+                                      is_optimize_model,
+                                      batch_size,
+                                      quant_iterations,
+                                      is_data_loader=is_data_loader)
 
         print("Start INT8 inference for {0} on {1} images ...".format(
             model_name, infer_iterations * batch_size))
-        (int8_throughput, int8_latency, int8_acc1) = self.run_program(
-            self.int8_model_path, batch_size, infer_iterations)
+        (int8_throughput, int8_latency,
+         int8_acc1) = self.run_program(self.int8_model_path, batch_size,
+                                       infer_iterations)
 
         print("---Post training quantization of {} method---".format(algo))
         print(
-            "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.".
-            format(model_name, batch_size, fp32_throughput, fp32_latency,
-                   fp32_acc1))
+            "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}."
+            .format(model_name, batch_size, fp32_throughput, fp32_latency,
+                    fp32_acc1))
         print(
-            "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n".
-            format(model_name, batch_size, int8_throughput, int8_latency,
-                   int8_acc1))
+            "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n"
+            .format(model_name, batch_size, int8_throughput, int8_latency,
+                    int8_acc1))
         sys.stdout.flush()
 
         delta_value = fp32_acc1 - int8_acc1
@@ -209,6 +210,7 @@ def run_test(self,
 
 
 class TestPostTrainingKLForWhile(TestPostTrainingQuantization):
+
     def test_post_training_kl(self):
         model_name = "mnist_while"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
@@ -229,6 +231,7 @@ def test_post_training_kl(self):
 
 
 class TestPostTraininghistForWhile(TestPostTrainingQuantization):
+
     def test_post_training_hist(self):
         model_name = "mnist_while"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
@@ -249,6 +252,7 @@ def test_post_training_hist(self):
 
 
 class TestPostTrainingmseForWhile(TestPostTrainingQuantization):
+
     def test_post_training_mse(self):
         model_name = "mnist_while"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
@@ -269,6 +273,7 @@ def test_post_training_mse(self):
 
 
 class TestPostTrainingavgForWhile(TestPostTrainingQuantization):
+
     def test_post_training_avg(self):
         model_name = "mnist_while"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
@@ -289,6 +294,7 @@ def test_post_training_avg(self):
 
 
 class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization):
+
     def test_post_training_min_max(self):
         model_name = "mnist_while"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
@@ -309,6 +315,7 @@ def test_post_training_min_max(self):
 
 
 class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization):
+
     def test_post_training_abs_max(self):
         model_name = "mnist_while"
         data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
@@ -326,20 +333,19 @@ def test_post_training_abs_max(self):
                       is_full_quantize, is_use_cache_file, is_optimize_model,
                       diff_threshold, batch_size, infer_iterations,
                       quant_iterations)
-        self.run_test(
-            model_name,
-            data_url,
-            data_md5,
-            algo,
-            quantizable_op_type,
-            is_full_quantize,
-            is_use_cache_file,
-            is_optimize_model,
-            diff_threshold,
-            batch_size,
-            infer_iterations,
-            quant_iterations,
-            is_data_loader=True)
+        self.run_test(model_name,
+                      data_url,
+                      data_md5,
+                      algo,
+                      quantizable_op_type,
+                      is_full_quantize,
+                      is_use_cache_file,
+                      is_optimize_model,
+                      diff_threshold,
+                      batch_size,
+                      infer_iterations,
+                      quant_iterations,
+                      is_data_loader=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
index 04e1decd4af68..dea0fcd489768 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
@@ -24,6 +24,7 @@
 
 
 class TestQuant2Int8MkldnnPassMul(unittest.TestCase):
+
     def op_name(self):
         return "mul"
 
@@ -54,19 +55,17 @@ def setUp(self):
     def prepare_program_mul(self, program):
         block = program.global_block()
         for name in self.variables_mul:
-            block.create_var(
-                name=name,
-                dtype="float32",
-                shape=self.variables_mul[name].shape)
-
-        mul_op1 = block.append_op(
-            type=self.op_name(),
-            inputs={
-                "X": block.var('mul_input'),
-                "Y": block.var('mul_weights')
-            },
-            outputs={"Out": block.var('mul_output')},
-            attrs={'use_mkldnn': self.use_mkldnn})
+            block.create_var(name=name,
+                             dtype="float32",
+                             shape=self.variables_mul[name].shape)
+
+        mul_op1 = block.append_op(type=self.op_name(),
+                                  inputs={
+                                      "X": block.var('mul_input'),
+                                      "Y": block.var('mul_weights')
+                                  },
+                                  outputs={"Out": block.var('mul_output')},
+                                  attrs={'use_mkldnn': self.use_mkldnn})
 
     def test_dequantize_op_weights(self):
         program = fluid.Program()
@@ -81,12 +80,11 @@ def test_dequantize_op_weights(self):
                     break
             assert op_node != "", "op of type %s not found" % self.op_name()
 
-            qpass = Quant2Int8MkldnnPass(
-                self.quantized_ops,
-                _scope=self.scope,
-                _place=self.place,
-                _core=core,
-                _debug=False)
+            qpass = Quant2Int8MkldnnPass(self.quantized_ops,
+                                         _scope=self.scope,
+                                         _place=self.place,
+                                         _core=core,
+                                         _debug=False)
             qpass._weight_thresholds["mul_output"] = self.mul_output_scale
             param = self.scope.var("mul_weights").get_tensor()
             param.set(self.variables_mul["mul_weights"], self.place)
@@ -105,11 +103,13 @@ def test_dequantize_op_weights(self):
 
 
 class TestQuant2Int8MkldnnPassMatmulV2(TestQuant2Int8MkldnnPassMul):
+
     def op_name(self):
         return "matmul_v2"
 
 
 class TestQuant2Int8MkldnnPassConv2D(unittest.TestCase):
+
     def setUp(self):
         self.scope = fluid.Scope()
         self.place = fluid.CPUPlace()
@@ -144,8 +144,9 @@ def setUp(self):
     def prepare_program_conv2d(self, program):
         block = program.global_block()
         for name in self.variables:
-            block.create_var(
-                name=name, dtype="float32", shape=self.variables[name].shape)
+            block.create_var(name=name,
+                             dtype="float32",
+                             shape=self.variables[name].shape)
         conv2d_op1 = block.append_op(
             type="conv2d",
             inputs={
@@ -203,16 +204,16 @@ def test_quant_update_activation(self):
             graph = IrGraph(core.Graph(program.desc), for_test=True)
             graph = self.remove_fuse_activation_attribute(graph)
             self.check_graph_before_pass(graph)
-            quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass(
-                self.quantized_ops,
-                _scope=self.scope,
-                _place=self.place,
-                _core=core,
-                _debug=False)
+            quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass(self.quantized_ops,
+                                                           _scope=self.scope,
+                                                           _place=self.place,
+                                                           _core=core,
+                                                           _debug=False)
             graph = quant2_int8_mkldnn_pass._update_activations(graph)
             self.check_graph_after_pass(graph)
 
     class TestQuant2Int8MkldnnPassNearestInterp(unittest.TestCase):
+
         def op_name(self):
             return "nearest_interp"
 
@@ -268,47 +269,49 @@ def setUp(self):
         def prepare_program(self, program):
             block = program.global_block()
             for name in self.variables:
-                block.create_var(
-                    name=name,
-                    dtype="float32",
-                    shape=self.variables[name].shape)
-            block.append_op(
-                type="conv2d",
-                inputs={
-                    "Input": block.var('input'),
-                    'Filter': block.var('filter')
-                },
-                outputs={"Output": block.var('conv_output')},
-                attrs={
-                    'strides': self.stride,
-                    'paddings': self.pad,
-                    'groups': self.groups,
-                    'dilations': self.dilations,
-                    'use_cudnn': self.use_cudnn,
-                    'use_mkldnn': self.use_mkldnn,
-                    'data_format': self.data_format,
-                    'fuse_relu': True
-                })
-            block.append_op(
-                type=self.op_name(),
-                inputs={"X": block.var('conv_output'), },
-                outputs={"Out": block.var('nearest_interp_output')},
-                attrs={
-                    'interp_method': self.interp_method,
-                    'out_h': self.out_h,
-                    'out_w': self.out_w,
-                    'scale': self.scale,
-                    'data_layout': self.data_layout,
-                    'use_mkldnn': self.use_mkldnn
-                })
-            block.append_op(
-                type='dropout',
-                inputs={"X": block.var('nearest_interp_output'), },
-                outputs={
-                    'Out': block.var('dropout_out'),
-                    'Mask': block.var('dropout_mask')
-                },
-                attrs={'dropout_prob': self.dropout_prob, })
+                block.create_var(name=name,
+                                 dtype="float32",
+                                 shape=self.variables[name].shape)
+            block.append_op(type="conv2d",
+                            inputs={
+                                "Input": block.var('input'),
+                                'Filter': block.var('filter')
+                            },
+                            outputs={"Output": block.var('conv_output')},
+                            attrs={
+                                'strides': self.stride,
+                                'paddings': self.pad,
+                                'groups': self.groups,
+                                'dilations': self.dilations,
+                                'use_cudnn': self.use_cudnn,
+                                'use_mkldnn': self.use_mkldnn,
+                                'data_format': self.data_format,
+                                'fuse_relu': True
+                            })
+            block.append_op(type=self.op_name(),
+                            inputs={
+                                "X": block.var('conv_output'),
+                            },
+                            outputs={"Out": block.var('nearest_interp_output')},
+                            attrs={
+                                'interp_method': self.interp_method,
+                                'out_h': self.out_h,
+                                'out_w': self.out_w,
+                                'scale': self.scale,
+                                'data_layout': self.data_layout,
+                                'use_mkldnn': self.use_mkldnn
+                            })
+            block.append_op(type='dropout',
+                            inputs={
+                                "X": block.var('nearest_interp_output'),
+                            },
+                            outputs={
+                                'Out': block.var('dropout_out'),
+                                'Mask': block.var('dropout_mask')
+                            },
+                            attrs={
+                                'dropout_prob': self.dropout_prob,
+                            })
 
         def check_graph_after_pass(self, graph):
             for op in graph.all_op_nodes():
@@ -344,6 +347,7 @@ def test_quant_update_activation(self):
                     self.check_graph_after_pass(graph)
 
     class TestQuant2Int8MkldnnPassNearestInterpV2(unittest.TestCase):
+
         def op_name(self):
             return "nearest_interp_v2"
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
index 7ee0fd1d3e28f..28706d34c63fd 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
@@ -30,21 +30,19 @@
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_loss = fluid.layers.mean(loss)
@@ -52,6 +50,7 @@ def conv_net(img, label):
 
 
 class TestMKLDNNTransformBasedFreezePass(unittest.TestCase):
+
     def setUp(self):
         self.quantizable_op_and_inputs = {
             'conv2d': ['Input', 'Filter'],
@@ -76,10 +75,12 @@ def build_program(self, main, startup, is_test, seed):
         startup.random_seed = seed
         with fluid.unique_name.guard():
             with fluid.program_guard(main, startup):
-                img = fluid.layers.data(
-                    name='image', shape=[1, 28, 28], dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
+                img = fluid.layers.data(name='image',
+                                        shape=[1, 28, 28],
+                                        dtype='float32')
+                label = fluid.layers.data(name='label',
+                                          shape=[1],
+                                          dtype='int64')
                 loss = conv_net(img, label)
                 if not is_test:
                     opt = fluid.optimizer.Adam(learning_rate=0.001)
@@ -128,12 +129,11 @@ def mkldnn_based_freeze_graph(self,
         iters = 5
         batch_size = 8
 
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
 
         # Training the model to get the weights value
@@ -158,9 +158,9 @@ def mkldnn_based_freeze_graph(self,
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_mkldnn' + dev_name +
-                            activation_quant_type + '_' + weight_quant_type,
-                            marked_nodes)
+            test_graph.draw(
+                '.', 'test_mkldnn' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, marked_nodes)
         mkldnn_program = test_graph.to_program()
 
         # Check the transformation weights of conv2d and mul
@@ -174,8 +174,9 @@ def mkldnn_based_freeze_graph(self,
         # output
         self.check_program(mkldnn_program)
         if not for_ci:
-            print('{}: {}'.format('w_mkldnn' + dev_name + activation_quant_type
-                                  + '_' + weight_quant_type, np.sum(w_mkldnn)))
+            print('{}: {}'.format(
+                'w_mkldnn' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, np.sum(w_mkldnn)))
 
     def test_mkldnn_graph_cpu_static(self):
         with fluid.unique_name.guard():
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index fe261237f1227..c42777d673a7d 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -46,6 +46,7 @@ def linear_fc(num):
 
 
 def residual_block(num, quant_skip_pattern=None):
+
     def conv_bn_layer(input,
                       ch_out,
                       filter_size,
@@ -53,38 +54,42 @@ def conv_bn_layer(input,
                       padding,
                       act='relu',
                       bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
+        tmp = fluid.layers.conv2d(input=input,
+                                  filter_size=filter_size,
+                                  num_filters=ch_out,
+                                  stride=stride,
+                                  padding=padding,
+                                  act=None,
+                                  bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
-    data = fluid.layers.data(
-        name='image',
-        shape=[1, 1, 32, 32],
-        dtype='float32',
-        append_batch_size=False)
-    label = fluid.layers.data(
-        name='label', shape=[1, 1], dtype='int64', append_batch_size=False)
+    data = fluid.layers.data(name='image',
+                             shape=[1, 1, 32, 32],
+                             dtype='float32',
+                             append_batch_size=False)
+    label = fluid.layers.data(name='label',
+                              shape=[1, 1],
+                              dtype='int64',
+                              append_batch_size=False)
     hidden = data
     for _ in six.moves.xrange(num):
         conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
         short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
         hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    matmul_weight = fluid.layers.create_parameter(
-        shape=[1, 16, 32, 32], dtype='float32')
+    matmul_weight = fluid.layers.create_parameter(shape=[1, 16, 32, 32],
+                                                  dtype='float32')
     hidden = fluid.layers.matmul(hidden, matmul_weight, True, True)
     if quant_skip_pattern:
         with fluid.name_scope(quant_skip_pattern):
-            pool = fluid.layers.pool2d(
-                input=hidden, pool_size=2, pool_type='avg', pool_stride=2)
+            pool = fluid.layers.pool2d(input=hidden,
+                                       pool_size=2,
+                                       pool_type='avg',
+                                       pool_stride=2)
     else:
-        pool = fluid.layers.pool2d(
-            input=hidden, pool_size=2, pool_type='avg', pool_stride=2)
+        pool = fluid.layers.pool2d(input=hidden,
+                                   pool_size=2,
+                                   pool_type='avg',
+                                   pool_stride=2)
     fc = fluid.layers.fc(input=pool, size=10)
     loss = fluid.layers.cross_entropy(input=fc, label=label)
     loss = fluid.layers.mean(loss)
@@ -92,23 +97,21 @@ def conv_bn_layer(input,
 
 
 def conv_net(img, label, quant_skip_pattern):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='max',
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='max',
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='avg',
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='avg',
+                                                  act="relu")
     hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
     with fluid.name_scope(quant_skip_pattern):
         prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
@@ -118,6 +121,7 @@ def conv_net(img, label, quant_skip_pattern):
 
 
 class TestQuantizationTransformPass(unittest.TestCase):
+
     def setUp(self):
         self.quantizable_op_and_inputs = {
             'conv2d': ['Input', 'Filter'],
@@ -193,8 +197,9 @@ def test_linear_fc_quant_range_abs_max(self):
         self.linear_fc_quant('range_abs_max', 'abs_max', for_ci=True)
 
     def test_linear_fc_quant_moving_average_abs_max(self):
-        self.linear_fc_quant(
-            'moving_average_abs_max', 'channel_wise_abs_max', for_ci=True)
+        self.linear_fc_quant('moving_average_abs_max',
+                             'channel_wise_abs_max',
+                             for_ci=True)
 
     def residual_block_quant(self,
                              activation_quant_type,
@@ -236,24 +241,28 @@ def residual_block_quant(self,
 
     def test_residual_block_abs_max(self):
         quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            'abs_max', 'abs_max', quantizable_op_type, for_ci=True)
+        self.residual_block_quant('abs_max',
+                                  'abs_max',
+                                  quantizable_op_type,
+                                  for_ci=True)
 
     def test_residual_block_range_abs_max(self):
         quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            'range_abs_max', 'abs_max', quantizable_op_type, for_ci=True)
+        self.residual_block_quant('range_abs_max',
+                                  'abs_max',
+                                  quantizable_op_type,
+                                  for_ci=True)
 
     def test_residual_block_moving_average_abs_max(self):
         quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            'moving_average_abs_max',
-            'channel_wise_abs_max',
-            quantizable_op_type,
-            for_ci=True)
+        self.residual_block_quant('moving_average_abs_max',
+                                  'channel_wise_abs_max',
+                                  quantizable_op_type,
+                                  for_ci=True)
 
 
 class TestQuantizationFreezePass(unittest.TestCase):
+
     def freeze_graph(self,
                      use_cuda,
                      seed,
@@ -262,15 +271,18 @@ def freeze_graph(self,
                      weight_quant_type='abs_max',
                      for_ci=True,
                      quant_skip_pattern='skip_quant'):
+
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
             with fluid.unique_name.guard():
                 with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
+                    img = fluid.layers.data(name='image',
+                                            shape=[1, 28, 28],
+                                            dtype='float32')
+                    label = fluid.layers.data(name='label',
+                                              shape=[1],
+                                              dtype='int64')
                     loss = conv_net(img, label, quant_skip_pattern)
                     if not is_test:
                         opt = fluid.optimizer.Adam(learning_rate=0.001)
@@ -308,14 +320,16 @@ def build_program(main, startup, is_test):
             for op in main_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_'
-                            + weight_quant_type, marked_nodes)
+            main_graph.draw(
+                '.', 'main' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, marked_nodes)
             marked_nodes = set()
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_'
-                            + weight_quant_type, marked_nodes)
+            test_graph.draw(
+                '.', 'test' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, marked_nodes)
 
         build_strategy = fluid.BuildStrategy()
         build_strategy.memory_optimize = False
@@ -327,12 +341,11 @@ def build_program(main, startup, is_test):
         iters = 5
         batch_size = 8
 
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
         with fluid.scope_guard(scope):
             for _ in range(iters):
@@ -341,9 +354,9 @@ def build_program(main, startup, is_test):
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
                 if not for_ci:
-                    print('{}: {}'.format('loss' + dev_name +
-                                          activation_quant_type + '_' +
-                                          weight_quant_type, loss_v))
+                    print('{}: {}'.format(
+                        'loss' + dev_name + activation_quant_type + '_' +
+                        weight_quant_type, loss_v))
 
         test_data = next(test_reader())
         with fluid.program_guard(quantized_test_program):
@@ -365,9 +378,9 @@ def build_program(main, startup, is_test):
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_freeze' + dev_name +
-                            activation_quant_type + '_' + weight_quant_type,
-                            marked_nodes)
+            test_graph.draw(
+                '.', 'test_freeze' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, marked_nodes)
 
         server_program = test_graph.to_program()
         with fluid.scope_guard(scope):
@@ -376,20 +389,22 @@ def build_program(main, startup, is_test):
                                   fetch_list=[loss])
         self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
         if not for_ci:
-            print(
-                '{}: {}'.format('test_loss1' + dev_name + activation_quant_type
-                                + '_' + weight_quant_type, test_loss1))
-            print(
-                '{}: {}'.format('test_loss2' + dev_name + activation_quant_type
-                                + '_' + weight_quant_type, test_loss2))
+            print('{}: {}'.format(
+                'test_loss1' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, test_loss1))
+            print('{}: {}'.format(
+                'test_loss2' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, test_loss2))
         w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
         # Maybe failed, this is due to the calculation precision
         # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
         if not for_ci:
-            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
-                                  + '_' + weight_quant_type, np.sum(w_freeze)))
-            print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
-                                  '_' + weight_quant_type, np.sum(w_quant)))
+            print('{}: {}'.format(
+                'w_freeze' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, np.sum(w_freeze)))
+            print('{}: {}'.format(
+                'w_quant' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, np.sum(w_quant)))
 
         # Convert parameter to 8-bit.
         convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
@@ -399,8 +414,9 @@ def build_program(main, startup, is_test):
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type
-                            + '_' + weight_quant_type, marked_nodes)
+            test_graph.draw(
+                '.', 'test_int8' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, marked_nodes)
         server_program_int8 = test_graph.to_program()
         # Save the 8-bit parameter and model file.
         with fluid.scope_guard(scope):
@@ -417,10 +433,12 @@ def build_program(main, startup, is_test):
         self.assertEqual(w_8bit.dtype, np.int8)
         self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
         if not for_ci:
-            print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
-                                  '_' + weight_quant_type, np.sum(w_8bit)))
-            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
-                                  + '_' + weight_quant_type, np.sum(w_freeze)))
+            print('{}: {}'.format(
+                'w_8bit' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, np.sum(w_8bit)))
+            print('{}: {}'.format(
+                'w_freeze' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, np.sum(w_freeze)))
 
         mobile_pass = TransformForMobilePass()
         mobile_pass.apply(test_graph)
@@ -429,9 +447,9 @@ def build_program(main, startup, is_test):
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_mobile' + dev_name +
-                            activation_quant_type + '_' + weight_quant_type,
-                            marked_nodes)
+            test_graph.draw(
+                '.', 'test_mobile' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, marked_nodes)
 
         mobile_program = test_graph.to_program()
         with fluid.scope_guard(scope):
@@ -443,63 +461,56 @@ def build_program(main, startup, is_test):
     def test_freeze_graph_cuda_dynamic(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='abs_max',
-                    weight_quant_type='abs_max',
-                    for_ci=True)
+                self.freeze_graph(True,
+                                  seed=1,
+                                  activation_quant_type='abs_max',
+                                  weight_quant_type='abs_max',
+                                  for_ci=True)
             with fluid.unique_name.guard():
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='abs_max',
-                    weight_quant_type='channel_wise_abs_max',
-                    for_ci=True)
+                self.freeze_graph(True,
+                                  seed=1,
+                                  activation_quant_type='abs_max',
+                                  weight_quant_type='channel_wise_abs_max',
+                                  for_ci=True)
 
     def test_freeze_graph_cpu_dynamic(self):
         with fluid.unique_name.guard():
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='abs_max',
-                weight_quant_type='channel_wise_abs_max',
-                for_ci=True)
+            self.freeze_graph(False,
+                              seed=2,
+                              activation_quant_type='abs_max',
+                              weight_quant_type='abs_max',
+                              for_ci=True)
+            self.freeze_graph(False,
+                              seed=2,
+                              activation_quant_type='abs_max',
+                              weight_quant_type='channel_wise_abs_max',
+                              for_ci=True)
 
     def test_freeze_graph_cuda_static(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='range_abs_max',
-                    bias_correction=True,
-                    weight_quant_type='abs_max',
-                    for_ci=True)
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='range_abs_max',
-                    weight_quant_type='abs_max',
-                    for_ci=True)
+                self.freeze_graph(True,
+                                  seed=1,
+                                  activation_quant_type='range_abs_max',
+                                  bias_correction=True,
+                                  weight_quant_type='abs_max',
+                                  for_ci=True)
+                self.freeze_graph(True,
+                                  seed=1,
+                                  activation_quant_type='range_abs_max',
+                                  weight_quant_type='abs_max',
+                                  for_ci=True)
                 self.freeze_graph(
                     True,
                     seed=1,
                     activation_quant_type='moving_average_abs_max',
                     weight_quant_type='abs_max',
                     for_ci=True)
-                self.freeze_graph(
-                    True,
-                    seed=1,
-                    activation_quant_type='range_abs_max',
-                    weight_quant_type='channel_wise_abs_max',
-                    for_ci=True)
+                self.freeze_graph(True,
+                                  seed=1,
+                                  activation_quant_type='range_abs_max',
+                                  weight_quant_type='channel_wise_abs_max',
+                                  for_ci=True)
                 self.freeze_graph(
                     True,
                     seed=1,
@@ -516,33 +527,30 @@ def test_freeze_graph_cuda_static(self):
 
     def test_freeze_graph_cpu_static(self):
         with fluid.unique_name.guard():
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='range_abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='moving_average_abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='range_abs_max',
-                weight_quant_type='channel_wise_abs_max',
-                for_ci=True)
-            self.freeze_graph(
-                False,
-                seed=2,
-                activation_quant_type='moving_average_abs_max',
-                weight_quant_type='channel_wise_abs_max',
-                for_ci=True)
+            self.freeze_graph(False,
+                              seed=2,
+                              activation_quant_type='range_abs_max',
+                              weight_quant_type='abs_max',
+                              for_ci=True)
+            self.freeze_graph(False,
+                              seed=2,
+                              activation_quant_type='moving_average_abs_max',
+                              weight_quant_type='abs_max',
+                              for_ci=True)
+            self.freeze_graph(False,
+                              seed=2,
+                              activation_quant_type='range_abs_max',
+                              weight_quant_type='channel_wise_abs_max',
+                              for_ci=True)
+            self.freeze_graph(False,
+                              seed=2,
+                              activation_quant_type='moving_average_abs_max',
+                              weight_quant_type='channel_wise_abs_max',
+                              for_ci=True)
 
 
 def quant_dequant_residual_block(num, quant_skip_pattern=None):
+
     def conv_bn_layer(input,
                       ch_out,
                       filter_size,
@@ -550,19 +558,19 @@ def conv_bn_layer(input,
                       padding,
                       act='relu',
                       bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
+        tmp = fluid.layers.conv2d(input=input,
+                                  filter_size=filter_size,
+                                  num_filters=ch_out,
+                                  stride=stride,
+                                  padding=padding,
+                                  act=None,
+                                  bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
     data1 = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    data2 = fluid.layers.data(
-        name='matmul_input', shape=[16, 32, 32], dtype='float32')
+    data2 = fluid.layers.data(name='matmul_input',
+                              shape=[16, 32, 32],
+                              dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     hidden = data1
     for _ in six.moves.xrange(num):
@@ -572,29 +580,43 @@ def conv_bn_layer(input,
     hidden = fluid.layers.matmul(hidden, data2, True, True)
     if isinstance(quant_skip_pattern, str):
         with fluid.name_scope(quant_skip_pattern):
-            pool1 = fluid.layers.pool2d(
-                input=hidden, pool_size=2, pool_type='avg', pool_stride=2)
-            pool2 = fluid.layers.pool2d(
-                input=hidden, pool_size=2, pool_type='max', pool_stride=2)
-            pool_add = fluid.layers.elementwise_add(
-                x=pool1, y=pool2, act='relu')
+            pool1 = fluid.layers.pool2d(input=hidden,
+                                        pool_size=2,
+                                        pool_type='avg',
+                                        pool_stride=2)
+            pool2 = fluid.layers.pool2d(input=hidden,
+                                        pool_size=2,
+                                        pool_type='max',
+                                        pool_stride=2)
+            pool_add = fluid.layers.elementwise_add(x=pool1,
+                                                    y=pool2,
+                                                    act='relu')
     elif isinstance(quant_skip_pattern, list):
         assert len(
             quant_skip_pattern
         ) > 1, 'test config error: the len of quant_skip_pattern list should be greater than 1.'
         with fluid.name_scope(quant_skip_pattern[0]):
-            pool1 = fluid.layers.pool2d(
-                input=hidden, pool_size=2, pool_type='avg', pool_stride=2)
-            pool2 = fluid.layers.pool2d(
-                input=hidden, pool_size=2, pool_type='max', pool_stride=2)
+            pool1 = fluid.layers.pool2d(input=hidden,
+                                        pool_size=2,
+                                        pool_type='avg',
+                                        pool_stride=2)
+            pool2 = fluid.layers.pool2d(input=hidden,
+                                        pool_size=2,
+                                        pool_type='max',
+                                        pool_stride=2)
         with fluid.name_scope(quant_skip_pattern[1]):
-            pool_add = fluid.layers.elementwise_add(
-                x=pool1, y=pool2, act='relu')
+            pool_add = fluid.layers.elementwise_add(x=pool1,
+                                                    y=pool2,
+                                                    act='relu')
     else:
-        pool1 = fluid.layers.pool2d(
-            input=hidden, pool_size=2, pool_type='avg', pool_stride=2)
-        pool2 = fluid.layers.pool2d(
-            input=hidden, pool_size=2, pool_type='max', pool_stride=2)
+        pool1 = fluid.layers.pool2d(input=hidden,
+                                    pool_size=2,
+                                    pool_type='avg',
+                                    pool_stride=2)
+        pool2 = fluid.layers.pool2d(input=hidden,
+                                    pool_size=2,
+                                    pool_type='max',
+                                    pool_stride=2)
         pool_add = fluid.layers.elementwise_add(x=pool1, y=pool2, act='relu')
     fc = fluid.layers.fc(input=pool_add, size=10)
     loss = fluid.layers.cross_entropy(input=fc, label=label)
@@ -603,6 +625,7 @@ def conv_bn_layer(input,
 
 
 class TestAddQuantDequantPass(unittest.TestCase):
+
     def setUp(self):
         self._target_ops = {'elementwise_add', 'pool2d'}
         self._target_grad_ops = {'elementwise_add_grad', 'pool2d_grad'}
@@ -626,9 +649,9 @@ def check_graph(self, graph, skip_pattern=None):
                 for input_name in op_node.input_arg_names():
                     in_node = graph._find_node_by_name(op_node.inputs,
                                                        input_name)
-                    in_nodes_all_not_persistable = (
-                        in_nodes_all_not_persistable and
-                        not in_node.persistable())
+                    in_nodes_all_not_persistable = (in_nodes_all_not_persistable
+                                                    and
+                                                    not in_node.persistable())
                 if not in_nodes_all_not_persistable:
                     continue
                 input_names = op_node.input_arg_names()
@@ -671,23 +694,25 @@ def residual_block_quant(self,
 
     def test_residual_block(self):
         quantizable_op_type = ['elementwise_add', 'pool2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            quantizable_op_type, skip_pattern=None, for_ci=True)
+        self.residual_block_quant(quantizable_op_type,
+                                  skip_pattern=None,
+                                  for_ci=True)
 
     def test_residual_block_skip_pattern(self):
         quantizable_op_type = ['elementwise_add', 'pool2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            quantizable_op_type, skip_pattern='skip_quant', for_ci=True)
+        self.residual_block_quant(quantizable_op_type,
+                                  skip_pattern='skip_quant',
+                                  for_ci=True)
 
     def test_residual_block_skip_pattern_1(self):
         quantizable_op_type = ['elementwise_add', 'pool2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            quantizable_op_type,
-            skip_pattern=['skip_quant1', 'skip_quant2'],
-            for_ci=True)
+        self.residual_block_quant(quantizable_op_type,
+                                  skip_pattern=['skip_quant1', 'skip_quant2'],
+                                  for_ci=True)
 
 
 class TestQuantizationTransformPassV2(unittest.TestCase):
+
     def setUp(self):
         self.quantizable_op_and_inputs = {
             'conv2d': ['Input', 'Filter'],
@@ -802,13 +827,17 @@ def residual_block_quant(self,
 
     def test_residual_block_abs_max(self):
         quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            'abs_max', 'abs_max', quantizable_op_type, for_ci=True)
+        self.residual_block_quant('abs_max',
+                                  'abs_max',
+                                  quantizable_op_type,
+                                  for_ci=True)
 
     def test_residual_block_channel_wise_abs_max(self):
         quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
-        self.residual_block_quant(
-            'abs_max', 'channel_wise_abs_max', quantizable_op_type, for_ci=True)
+        self.residual_block_quant('abs_max',
+                                  'channel_wise_abs_max',
+                                  quantizable_op_type,
+                                  for_ci=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
index ec2c7a91f96ab..acf3c68600ce6 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
@@ -34,23 +34,21 @@
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='max',
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='max',
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='avg',
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='avg',
+                                                  act="relu")
     hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
     prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
@@ -59,21 +57,25 @@ def conv_net(img, label):
 
 
 class TestQuantizationScalePass(unittest.TestCase):
+
     def quantization_scale(self,
                            use_cuda,
                            seed,
                            activation_quant_type,
                            weight_quant_type='abs_max',
                            for_ci=False):
+
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
             with fluid.unique_name.guard():
                 with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
+                    img = fluid.layers.data(name='image',
+                                            shape=[1, 28, 28],
+                                            dtype='float32')
+                    label = fluid.layers.data(name='label',
+                                              shape=[1],
+                                              dtype='int64')
                     loss = conv_net(img, label)
                     if not is_test:
                         opt = fluid.optimizer.Adam(learning_rate=0.0001)
@@ -135,10 +137,9 @@ def build_program(main, startup, is_test):
         iters = 5
         batch_size = 8
 
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+                                    batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
         with fluid.scope_guard(scope):
             for _ in range(iters):
@@ -169,11 +170,11 @@ def build_program(main, startup, is_test):
             f.write(str(server_program))
 
         with fluid.scope_guard(scope):
-            fluid.io.save_inference_model(
-                'quant_scale_model' + dev_name, ['image', 'label'], [loss],
-                exe,
-                server_program,
-                clip_extra=True)
+            fluid.io.save_inference_model('quant_scale_model' + dev_name,
+                                          ['image', 'label'], [loss],
+                                          exe,
+                                          server_program,
+                                          clip_extra=True)
 
     def test_quant_scale_cuda(self):
         if fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py
index f5eb7d347ca09..80fe720504efd 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py
@@ -30,22 +30,20 @@
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='max',
-        act="relu")
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='avg',
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='max',
+                                                  act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='avg',
+                                                  act="relu")
     with fluid.name_scope("skip_quant"):
         hidden = fluid.layers.fc(input=conv_pool_1, size=100, act='relu')
     prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
@@ -55,21 +53,25 @@ def conv_net(img, label):
 
 
 class TestQuantizeProgramPass(unittest.TestCase):
+
     def quantize_program(self,
                          use_cuda,
                          seed,
                          activation_quant_type='abs_max',
                          weight_quant_type='abs_max',
                          for_ci=False):
+
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
             with fluid.unique_name.guard():
                 with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
+                    img = fluid.layers.data(name='image',
+                                            shape=[1, 28, 28],
+                                            dtype='float32')
+                    label = fluid.layers.data(name='label',
+                                              shape=[1],
+                                              dtype='int64')
                     loss = conv_net(img, label)
                     if not is_test:
                         opt = fluid.optimizer.Adam(learning_rate=0.0001)
@@ -88,8 +90,8 @@ def build_program(main, startup, is_test):
         test_program = test_program.clone(for_test=True)
 
         if not for_ci:
-            train_graph = IrGraph(
-                core.Graph(train_program.desc), for_test=False)
+            train_graph = IrGraph(core.Graph(train_program.desc),
+                                  for_test=False)
             train_graph.draw('.', 'train_program_1')
             test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
             test_graph.draw('.', 'test_program_1')
@@ -108,8 +110,8 @@ def build_program(main, startup, is_test):
         with fluid.scope_guard(scope):
             exe.run(startup_program)
         if not for_ci:
-            train_graph = IrGraph(
-                core.Graph(train_program.desc), for_test=False)
+            train_graph = IrGraph(core.Graph(train_program.desc),
+                                  for_test=False)
             train_graph.draw('.', 'train_program_2')
             test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)
             test_graph.draw('.', 'test_program_2')
@@ -123,8 +125,8 @@ def build_program(main, startup, is_test):
         iters = 5
         batch_size = 8
 
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
         with fluid.scope_guard(scope):
             for idx in range(iters):
@@ -141,20 +143,19 @@ def build_program(main, startup, is_test):
         qt.convert(test_program, scope)
         if not for_ci:
             with fluid.scope_guard(scope):
-                fluid.io.save_inference_model(
-                    './infer_model', ['image', 'label'], [loss],
-                    exe,
-                    test_program,
-                    clip_extra=True)
+                fluid.io.save_inference_model('./infer_model',
+                                              ['image', 'label'], [loss],
+                                              exe,
+                                              test_program,
+                                              clip_extra=True)
 
     def test_gpu_1(self):
         if fluid.core.is_compiled_with_cuda():
-            self.quantize_program(
-                use_cuda=True,
-                seed=1,
-                activation_quant_type='abs_max',
-                weight_quant_type='abs_max',
-                for_ci=True)
+            self.quantize_program(use_cuda=True,
+                                  seed=1,
+                                  activation_quant_type='abs_max',
+                                  weight_quant_type='abs_max',
+                                  for_ci=True)
 
     def test_gpu_2(self):
         if fluid.core.is_compiled_with_cuda():
@@ -166,20 +167,18 @@ def test_gpu_2(self):
                 for_ci=True)
 
     def test_cpu_1(self):
-        self.quantize_program(
-            use_cuda=False,
-            seed=2,
-            activation_quant_type='abs_max',
-            weight_quant_type='abs_max',
-            for_ci=True)
+        self.quantize_program(use_cuda=False,
+                              seed=2,
+                              activation_quant_type='abs_max',
+                              weight_quant_type='abs_max',
+                              for_ci=True)
 
     def test_cpu_2(self):
-        self.quantize_program(
-            use_cuda=False,
-            seed=2,
-            activation_quant_type='moving_average_abs_max',
-            weight_quant_type='channel_wise_abs_max',
-            for_ci=True)
+        self.quantize_program(use_cuda=False,
+                              seed=2,
+                              activation_quant_type='moving_average_abs_max',
+                              weight_quant_type='channel_wise_abs_max',
+                              for_ci=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
index f03d0faa3981b..96c56529cf14b 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
@@ -36,23 +36,21 @@
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='max',
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='max',
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        pool_type='avg',
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  pool_type='avg',
+                                                  act="relu")
     hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
     prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
@@ -79,6 +77,7 @@ def pact(x, name=None):
 
 
 class TestUserDefinedQuantization(unittest.TestCase):
+
     def quantization_scale(self,
                            use_cuda,
                            seed,
@@ -89,16 +88,19 @@ def quantization_scale(self,
                            weight_preprocess_func=None,
                            act_quantize_func=None,
                            weight_quantize_func=None):
+
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
             with fluid.unique_name.guard():
                 with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
+                    img = fluid.layers.data(name='image',
+                                            shape=[1, 28, 28],
+                                            dtype='float32')
                     img.stop_gradient = False
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
+                    label = fluid.layers.data(name='label',
+                                              shape=[1],
+                                              dtype='int64')
                     loss = conv_net(img, label)
                     if not is_test:
                         opt = fluid.optimizer.SGD(learning_rate=0.0001)
@@ -180,10 +182,9 @@ def save_dict(Dict):
         iters = 5
         batch_size = 8
 
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+                                    batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
         with fluid.scope_guard(scope):
             for _ in range(iters):
diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
index 744c97c514b36..cbe0326c46a41 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
@@ -46,6 +46,7 @@ def _set_variable_data(scope, place, var_name, np_value):
 
 
 class TestWeightQuantization(unittest.TestCase):
+
     def setUp(self):
         self.weight_quantization_dir = 'weight_quantization'
         self.cache_folder = os.path.join(DATA_HOME,
@@ -64,8 +65,8 @@ def download_model(self, model_name, data_url, data_md5):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
-                                                          zip_path)
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
+                target_folder, zip_path)
             os.system(cmd)
 
     def quantize_to_int(self, model_name, model_data_url, model_data_md5,
@@ -94,8 +95,8 @@ def quantize_to_int(self, model_name, model_data_url, model_data_md5,
         try:
             os.system("rm -rf {}".format(save_model_dir))
         except Exception as e:
-            print("Failed to delete {} due to {}".format(save_model_dir, str(
-                e)))
+            print("Failed to delete {} due to {}".format(
+                save_model_dir, str(e)))
 
     def convert_to_fp16(self, model_name, model_data_url, model_data_md5,
                         model_filename, params_filename):
@@ -123,15 +124,18 @@ def convert_to_fp16(self, model_name, model_data_url, model_data_md5,
                                    params_filename, input_data, True)
 
         self.assertTrue(
-            np.allclose(
-                res_fp32, res_fp16, rtol=1e-5, atol=1e-08, equal_nan=True),
+            np.allclose(res_fp32,
+                        res_fp16,
+                        rtol=1e-5,
+                        atol=1e-08,
+                        equal_nan=True),
             msg='Failed to test the accuracy of the fp32 and fp16 model.')
 
         try:
             os.system("rm -rf {}".format(save_model_dir))
         except Exception as e:
-            print("Failed to delete {} due to {}".format(save_model_dir, str(
-                e)))
+            print("Failed to delete {} due to {}".format(
+                save_model_dir, str(e)))
 
     def run_models(self, model_dir, model_filename, params_filename, input_data,
                    is_fp16_model):
diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py
index ec288a1287119..b08778a707d23 100644
--- a/python/paddle/fluid/contrib/sparsity/__init__.py
+++ b/python/paddle/fluid/contrib/sparsity/__init__.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py
index c366af7237d1b..0710ee9c722ea 100644
--- a/python/paddle/fluid/contrib/sparsity/asp.py
+++ b/python/paddle/fluid/contrib/sparsity/asp.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -118,8 +118,8 @@ def forward(self, img):
     """
     if main_program is None:
         main_program = paddle.static.default_main_program()
-    ASPHelper.set_excluded_layers(
-        param_names=param_names, main_program=main_program)
+    ASPHelper.set_excluded_layers(param_names=param_names,
+                                  main_program=main_program)
 
 
 def reset_excluded_layers(main_program=None):
@@ -454,16 +454,15 @@ def forward(self, img):
             place = paddle.CUDAPlace(gpu_id)
     else:
         raise TypeError(
-            "model should be paddle.nn.Layer or paddle.static.Program, but got {}".
-            format(type(model)))
+            "model should be paddle.nn.Layer or paddle.static.Program, but got {}"
+            .format(type(model)))
 
-    return prune_func(
-        place,
-        model,
-        n=n,
-        m=m,
-        mask_algo=MaskAlgo_mapping[mask_algo],
-        with_mask=with_mask)
+    return prune_func(place,
+                      model,
+                      n=n,
+                      m=m,
+                      mask_algo=MaskAlgo_mapping[mask_algo],
+                      with_mask=with_mask)
 
 
 class ProgramASPInfo(object):
@@ -624,8 +623,8 @@ def prune_model_by_layer(cls,
                     param.set_value(weight_pruned_nparray)
 
                     if with_mask:
-                        weight_mask_param = asp_info.mask_vars.get(param.name,
-                                                                   None)
+                        weight_mask_param = asp_info.mask_vars.get(
+                            param.name, None)
                         assert weight_mask_param is not None, \
                             'Cannot find {} variable, please call sparsity.decorate() to' \
                             ' decorate your optimizer first!'.format(ASPHelper._get_mask_name(param.name))
@@ -642,13 +641,12 @@ def prune_model_by_layer(cls,
                 target_program = param.block.program
             assert target_program is not None, \
                     'Cannot get paddle.static.Program from Paddle.nn.Layer.'
-            return ASPHelper.prune_model_by_program(
-                place,
-                target_program,
-                n=n,
-                m=m,
-                mask_algo=mask_algo,
-                with_mask=with_mask)
+            return ASPHelper.prune_model_by_program(place,
+                                                    target_program,
+                                                    n=n,
+                                                    m=m,
+                                                    mask_algo=mask_algo,
+                                                    with_mask=with_mask)
 
     @staticmethod
     def _get_mask_name(param_name):
@@ -746,10 +744,10 @@ def _get_prune_func_by_name(cls, param_name):
                 param_name_no_weight_suffix, None)
         if func is None:
             layer_name = param_name_no_weight_suffix[:
-                                                     param_name_no_weight_suffix.
-                                                     rfind('_')]
-            func = supported_layers_and_prune_func_map.get(layer_name,
-                                                           _default_pruning)
+                                                     param_name_no_weight_suffix
+                                                     .rfind('_')]
+            func = supported_layers_and_prune_func_map.get(
+                layer_name, _default_pruning)
         return func
 
     @classmethod
@@ -859,16 +857,17 @@ def _insert_sparse_mask_ops(cls, main_program, params):
         asp_info = cls._get_program_asp_info(main_program)
         for param in params:
             if param.name in asp_info.mask_vars:
-                block.append_op(
-                    type='elementwise_mul',
-                    inputs={"X": param,
-                            'Y': asp_info.mask_vars[param.name]},
-                    outputs={'Out': param},
-                    attrs={
-                        'axis': -1,
-                        'use_mkldnn': False,
-                        OP_ROLE_KEY: int(OpRole.Optimize)
-                    })
+                block.append_op(type='elementwise_mul',
+                                inputs={
+                                    "X": param,
+                                    'Y': asp_info.mask_vars[param.name]
+                                },
+                                outputs={'Out': param},
+                                attrs={
+                                    'axis': -1,
+                                    'use_mkldnn': False,
+                                    OP_ROLE_KEY: int(OpRole.Optimize)
+                                })
 
 
 class OptimizerWithSparsityGuarantee(object):
@@ -903,12 +902,11 @@ def minimize(self,
             list: operators from :attr:`optimizer`.minimize(:attr:`loss`).
             list: pairs of parameters and their gradients.
         """
-        return ASPHelper._minimize(
-            self._optimizer,
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        return ASPHelper._minimize(self._optimizer,
+                                   loss,
+                                   startup_program=startup_program,
+                                   parameter_list=parameter_list,
+                                   no_grad_set=no_grad_set)
 
     @dygraph_only
     def step(self):
diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py
index 105c2ded9eee7..d9d8c262ada1b 100644
--- a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py
+++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -30,21 +30,23 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name):
     # cuSparseLt would prune matrix A along k dimension.
     # In sparse training, layer weight matrices is viewed sparse matrix A, so
     # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle
-    #  is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed 
-    # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension 
-    # of W^T, which is m dimension of W. Moreove, all mask generating functions in 
-    # sparsity/utils is row-major pruning. That is the reason we have to transpose weight 
-    # matrices beforce invoking create_mask. Then we transpose the result mask to make 
+    #  is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed
+    # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune along k dimension
+    # of W^T, which is m dimension of W. Moreover, all mask generating functions in
+    # sparsity/utils use row-major pruning. That is the reason we have to transpose weight
+    # matrices before invoking create_mask. Then we transpose the result mask to make
     # sure its shape to be the same as the input weight.
-    weight_sparse_mask = sparsity.create_mask(
-        weight_nparray.T, func_name=func_name, n=n, m=m).T
+    weight_sparse_mask = sparsity.create_mask(weight_nparray.T,
+                                              func_name=func_name,
+                                              n=n,
+                                              m=m).T
     weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
     assert sparsity.check_sparsity(weight_pruned_nparray.T,  n=n, m=m, func_name=checked_func_name), \
                     'Pruning {} weight matrix failure!!!'.format(param_name)
     return weight_pruned_nparray, weight_sparse_mask
 
 
-# When value of given key in this DICT is None, 
+# When value of given key in this DICT is None,
 # ASP will call default pruning function in pruning stage.
 _supported_layers_and_prune_func_map_lock = threading.Lock()
 supported_layers_and_prune_func_map = {}
diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py
index a28f7fc2b4ed6..1d0694c4dde3c 100644
--- a/python/paddle/fluid/contrib/sparsity/utils.py
+++ b/python/paddle/fluid/contrib/sparsity/utils.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -406,8 +406,8 @@ def _compute_valid_2d_patterns(n, m):
         patterns = patterns + patterns
         patterns = np.asarray(list(set(permutations(patterns, m))))
 
-        valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m
-                 ).nonzero()[0].reshape(-1)
+        valid = ((patterns.sum(axis=1) <= n).sum(
+            axis=1) == m).nonzero()[0].reshape(-1)
         valid_patterns = np.empty((valid.shape[0], m, m))
         valid_patterns[:] = patterns[valid[:]]
 
@@ -454,9 +454,9 @@ def get_mask_2d_best(mat, n, m):
 
     mat_flattern, shape = _reshape_2d(mat, m)
     mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
-    pmax = np.argmax(
-        np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
-        axis=1)
+    pmax = np.argmax(np.matmul(mat_flattern,
+                               patterns.reshape(patterns.shape[0], m * m).T),
+                     axis=1)
 
     mask_flattern[:] = patterns[pmax[:]]
     mask = np.empty(shape)
@@ -578,8 +578,8 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
         t = t.reshape(shape[0] * shape[1], shape[2])
     # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op
     elif len(shape) == 4:
-        t = t.transpose([0, 1, 3, 2]).reshape(
-            [shape[0] * shape[1] * shape[3], shape[2]])
+        t = t.transpose([0, 1, 3,
+                         2]).reshape([shape[0] * shape[1] * shape[3], shape[2]])
     else:
         raise ValueError("The dimension of input tensor is not supported in create_mask, " \
                          "Only dimension < 4 is supported but got {}".format(len(shape)))
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
index b4c5ad057f986..48e107c4b4d7a 100644
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -1,19 +1,29 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 list(REMOVE_ITEM TEST_OPS test_multi_precision_fp16_train)
 
 foreach(src ${TEST_OPS})
-        py_test(${src} SRCS ${src}.py)
+  py_test(${src} SRCS ${src}.py)
 endforeach()
 
-py_test_modules(test_multi_precision_fp16_train MODULES test_multi_precision_fp16_train ENVS FLAGS_cudnn_deterministic=true FLAGS_cudnn_batchnorm_spatial_persistent=true FLAGS_conv_workspace_size_limit=1000)
+py_test_modules(
+  test_multi_precision_fp16_train
+  MODULES
+  test_multi_precision_fp16_train
+  ENVS
+  FLAGS_cudnn_deterministic=true
+  FLAGS_cudnn_batchnorm_spatial_persistent=true
+  FLAGS_conv_workspace_size_limit=1000)
 
 set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120)
 set_tests_properties(test_weight_decay_extend PROPERTIES TIMEOUT 120)
 set_tests_properties(test_multi_precision_fp16_train PROPERTIES TIMEOUT 120)
 
-if (APPLE)
-        set_tests_properties(test_model_cast_to_bf16 PROPERTIES TIMEOUT 300)
-        set_tests_properties(test_quantize_transpiler PROPERTIES TIMEOUT 300)
+if(APPLE)
+  set_tests_properties(test_model_cast_to_bf16 PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_quantize_transpiler PROPERTIES TIMEOUT 300)
 endif()
diff --git a/python/paddle/fluid/contrib/tests/test_amp_list.py b/python/paddle/fluid/contrib/tests/test_amp_list.py
index 9133a404fa0e2..fb46df1377627 100644
--- a/python/paddle/fluid/contrib/tests/test_amp_list.py
+++ b/python/paddle/fluid/contrib/tests/test_amp_list.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,6 +18,7 @@
 
 
 class TestAMPList(unittest.TestCase):
+
     def test_main(self):
         custom_white_list = [
             'lookup_table',
diff --git a/python/paddle/fluid/contrib/tests/test_bf16_utils.py b/python/paddle/fluid/contrib/tests/test_bf16_utils.py
index a1439c487b6aa..c456b1263ce23 100644
--- a/python/paddle/fluid/contrib/tests/test_bf16_utils.py
+++ b/python/paddle/fluid/contrib/tests/test_bf16_utils.py
@@ -22,6 +22,7 @@
 
 
 class AMPTest(unittest.TestCase):
+
     def setUp(self):
         self.bf16_list = copy.copy(amp.bf16.amp_lists.bf16_list)
         self.fp32_list = copy.copy(amp.bf16.amp_lists.fp32_list)
@@ -95,6 +96,7 @@ def test_amp_list_8(self):
 
 
 class AMPTest2(unittest.TestCase):
+
     def test_amp_lists_(self):
         # 7. w={'lstm'} b={'lstm'}
         # raise ValueError
@@ -113,10 +115,12 @@ def test_is_in_fp32_varnames(self):
         var1 = block.create_var(name="X", shape=[3], dtype='float32')
         var2 = block.create_var(name="Y", shape=[3], dtype='float32')
         var3 = block.create_var(name="Z", shape=[3], dtype='float32')
-        op1 = block.append_op(
-            type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]})
-        op2 = block.append_op(
-            type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]})
+        op1 = block.append_op(type="abs",
+                              inputs={"X": [var1]},
+                              outputs={"Out": [var2]})
+        op2 = block.append_op(type="abs",
+                              inputs={"X": [var2]},
+                              outputs={"Out": [var3]})
         amp_lists_1 = amp.bf16.AutoMixedPrecisionListsBF16(
             custom_fp32_varnames={'X'})
         assert amp.bf16.amp_utils._is_in_fp32_varnames(op1, amp_lists_1)
@@ -132,10 +136,12 @@ def test_find_true_post_op(self):
         var1 = block.create_var(name="X", shape=[3], dtype='float32')
         var2 = block.create_var(name="Y", shape=[3], dtype='float32')
         var3 = block.create_var(name="Z", shape=[3], dtype='float32')
-        op1 = block.append_op(
-            type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]})
-        op2 = block.append_op(
-            type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]})
+        op1 = block.append_op(type="abs",
+                              inputs={"X": [var1]},
+                              outputs={"Out": [var2]})
+        op2 = block.append_op(type="abs",
+                              inputs={"X": [var2]},
+                              outputs={"Out": [var3]})
         res = amp.bf16.amp_utils.find_true_post_op(block.ops, op1, "Y")
         assert (res == [op2])
 
@@ -146,20 +152,26 @@ def test_find_true_post_op_with_search_all(self):
 
         var1 = block.create_var(name="X", shape=[3], dtype='float32')
         var2 = block.create_var(name="Y", shape=[3], dtype='float32')
-        inititializer_op = startup_block._prepend_op(
-            type="fill_constant",
-            outputs={"Out": var1},
-            attrs={"shape": var1.shape,
-                   "dtype": var1.dtype,
-                   "value": 1.0})
-
-        op1 = block.append_op(
-            type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]})
-        result = amp.bf16.amp_utils.find_true_post_op(
-            block.ops, inititializer_op, "X", search_all=False)
+        inititializer_op = startup_block._prepend_op(type="fill_constant",
+                                                     outputs={"Out": var1},
+                                                     attrs={
+                                                         "shape": var1.shape,
+                                                         "dtype": var1.dtype,
+                                                         "value": 1.0
+                                                     })
+
+        op1 = block.append_op(type="abs",
+                              inputs={"X": [var1]},
+                              outputs={"Out": [var2]})
+        result = amp.bf16.amp_utils.find_true_post_op(block.ops,
+                                                      inititializer_op,
+                                                      "X",
+                                                      search_all=False)
         assert (len(result) == 0)
-        result = amp.bf16.amp_utils.find_true_post_op(
-            block.ops, inititializer_op, "X", search_all=True)
+        result = amp.bf16.amp_utils.find_true_post_op(block.ops,
+                                                      inititializer_op,
+                                                      "X",
+                                                      search_all=True)
         assert (result == [op1])
 
 
diff --git a/python/paddle/fluid/contrib/tests/test_correlation.py b/python/paddle/fluid/contrib/tests/test_correlation.py
index 50b091415a52a..c98cbd1dd9310 100644
--- a/python/paddle/fluid/contrib/tests/test_correlation.py
+++ b/python/paddle/fluid/contrib/tests/test_correlation.py
@@ -55,15 +55,17 @@ def corr(x_1,
                         y1_index = j + pad_size
                         x2_index = x1_index + k
                         y2_index = y1_index + l
-                        output[b, l + d + D * (k + d), i, j] = np.mean(
-                            rinput1[b, x1_index:x1_index + K, y1_index:y1_index
-                                    + K] * rinput2[b, x2_index:x2_index + K,
-                                                   y2_index:y2_index + K])
+                        output[b, l + d + D * (k + d), i,
+                               j] = np.mean(rinput1[b, x1_index:x1_index + K,
+                                                    y1_index:y1_index + K] *
+                                            rinput2[b, x2_index:x2_index + K,
+                                                    y2_index:y2_index + K])
 
     return output
 
 
 class TestCorrelationOp(unittest.TestCase):
+
     def test_check_output(self):
         if not fluid.core.is_compiled_with_cuda():
             return
@@ -71,38 +73,34 @@ def test_check_output(self):
         np.set_printoptions(threshold=np.inf)
         x_shape = (2, 10, 3, 3)
         x_type = 'float32'
-        x1 = fluid.layers.data(
-            name='x1',
-            shape=x_shape,
-            dtype=x_type,
-            append_batch_size=False,
-            stop_gradient=False)
-        x2 = fluid.layers.data(
-            name='x2',
-            shape=x_shape,
-            dtype=x_type,
-            append_batch_size=False,
-            stop_gradient=False)
+        x1 = fluid.layers.data(name='x1',
+                               shape=x_shape,
+                               dtype=x_type,
+                               append_batch_size=False,
+                               stop_gradient=False)
+        x2 = fluid.layers.data(name='x2',
+                               shape=x_shape,
+                               dtype=x_type,
+                               append_batch_size=False,
+                               stop_gradient=False)
 
         x1_np = np.random.randn(2, 3, 4, 5).astype(x_type)
         x2_np = np.random.randn(2, 3, 4, 5).astype(x_type)
-        out_np = corr(
-            x1_np,
-            x2_np,
-            pad_size=4,
-            kernel_size=1,
-            max_displacement=4,
-            stride1=1,
-            stride2=1)
-
-        out = fluid.contrib.correlation(
-            x1,
-            x2,
-            pad_size=4,
-            kernel_size=1,
-            max_displacement=4,
-            stride1=1,
-            stride2=1)
+        out_np = corr(x1_np,
+                      x2_np,
+                      pad_size=4,
+                      kernel_size=1,
+                      max_displacement=4,
+                      stride1=1,
+                      stride2=1)
+
+        out = fluid.contrib.correlation(x1,
+                                        x2,
+                                        pad_size=4,
+                                        kernel_size=1,
+                                        max_displacement=4,
+                                        stride1=1,
+                                        stride2=1)
 
         loss = fluid.layers.reduce_mean(out)
         optimizer = fluid.optimizer.Momentum(0.0001, 0.9)
@@ -110,30 +108,33 @@ def test_check_output(self):
 
         place = fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
-        res = exe.run(feed={'x1': x1_np,
-                            'x2': x2_np},
+        res = exe.run(feed={
+            'x1': x1_np,
+            'x2': x2_np
+        },
                       fetch_list=[out.name, loss.name])
 
         self.assertTrue(np.allclose(res[0], out_np))
 
 
 class Net(fluid.dygraph.Layer):
+
     def __init__(self, name_scope):
         super(Net, self).__init__(name_scope)
 
     def forward(self, x1, x2):
-        y = fluid.contrib.correlation(
-            x1,
-            x2,
-            pad_size=4,
-            kernel_size=1,
-            max_displacement=4,
-            stride1=1,
-            stride2=1)
+        y = fluid.contrib.correlation(x1,
+                                      x2,
+                                      pad_size=4,
+                                      kernel_size=1,
+                                      max_displacement=4,
+                                      stride1=1,
+                                      stride2=1)
         return y
 
 
 class TestCorrelationOpDyGraph(unittest.TestCase):
+
     def test_check_output(self):
         if not fluid.core.is_compiled_with_cuda():
             return
@@ -145,14 +146,13 @@ def test_check_output(self):
         with fluid.dygraph.guard(place):
             x1_np = np.random.randn(2, 3, 4, 5).astype(x_type)
             x2_np = np.random.randn(2, 3, 4, 5).astype(x_type)
-            out_np = corr(
-                x1_np,
-                x2_np,
-                pad_size=4,
-                kernel_size=1,
-                max_displacement=4,
-                stride1=1,
-                stride2=1)
+            out_np = corr(x1_np,
+                          x2_np,
+                          pad_size=4,
+                          kernel_size=1,
+                          max_displacement=4,
+                          stride1=1,
+                          stride2=1)
 
             x1 = to_variable(x1_np)
             x2 = to_variable(x2_np)
diff --git a/python/paddle/fluid/contrib/tests/test_fp16_utils.py b/python/paddle/fluid/contrib/tests/test_fp16_utils.py
index 0b51f2dcc869e..54753ce4479a3 100644
--- a/python/paddle/fluid/contrib/tests/test_fp16_utils.py
+++ b/python/paddle/fluid/contrib/tests/test_fp16_utils.py
@@ -22,6 +22,7 @@
 
 
 class AMPTest(unittest.TestCase):
+
     def test_find_op_index(self):
         block = fluid.default_main_program().global_block()
         op_desc = core.OpDesc()
@@ -34,10 +35,12 @@ def test_find_true_post_op(self):
         var1 = block.create_var(name="X", shape=[3], dtype='float32')
         var2 = block.create_var(name="Y", shape=[3], dtype='float32')
         var3 = block.create_var(name="Z", shape=[3], dtype='float32')
-        op1 = block.append_op(
-            type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]})
-        op2 = block.append_op(
-            type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]})
+        op1 = block.append_op(type="abs",
+                              inputs={"X": [var1]},
+                              outputs={"Out": [var2]})
+        op2 = block.append_op(type="abs",
+                              inputs={"X": [var2]},
+                              outputs={"Out": [var3]})
         res = fp16_utils.find_true_post_op(block.ops, op1, "Y")
         assert (res == [op2])
 
diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
index 66af517c3e1f2..028fd57229e56 100644
--- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
+++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
@@ -30,6 +30,7 @@
 
 
 def resnet_cifar10(input, depth=32):
+
     def conv_bn_layer(input,
                       ch_out,
                       filter_size,
@@ -37,14 +38,13 @@ def conv_bn_layer(input,
                       padding,
                       act='relu',
                       bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
+        tmp = fluid.layers.conv2d(input=input,
+                                  filter_size=filter_size,
+                                  num_filters=ch_out,
+                                  stride=stride,
+                                  padding=padding,
+                                  act=None,
+                                  bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
     def shortcut(input, ch_in, ch_out, stride):
@@ -67,28 +67,33 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride):
 
     assert (depth - 2) % 6 == 0
     n = (depth - 2) // 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    conv1 = conv_bn_layer(input=input,
+                          ch_out=16,
+                          filter_size=3,
+                          stride=1,
+                          padding=1)
     res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
     res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
     res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    pool = fluid.layers.pool2d(input=res3,
+                               pool_size=8,
+                               pool_type='avg',
+                               pool_stride=1)
     return pool
 
 
 def vgg16_bn_drop(input):
+
     def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
+        return fluid.nets.img_conv_group(input=input,
+                                         pool_size=2,
+                                         pool_stride=2,
+                                         conv_num_filter=[num_filter] * groups,
+                                         conv_filter_size=3,
+                                         conv_act='relu',
+                                         conv_with_batchnorm=True,
+                                         conv_batchnorm_drop_rate=dropouts,
+                                         pool_type='max')
 
     conv1 = conv_block(input, 64, 2, [0.3, 0])
     conv2 = conv_block(conv1, 128, 2, [0.4, 0])
@@ -113,8 +118,9 @@ def train(net_type, use_cuda, save_dirname, is_local):
     train_program.random_seed = 123
     startup_prog.random_seed = 456
     with fluid.program_guard(train_program, startup_prog):
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
+        images = fluid.layers.data(name='pixel',
+                                   shape=data_shape,
+                                   dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
         if net_type == "vgg":
@@ -139,11 +145,10 @@ def train(net_type, use_cuda, save_dirname, is_local):
 
         amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
             custom_black_varnames={"loss", "conv2d_0.w_0"})
-        mp_optimizer = decorate(
-            optimizer=optimizer,
-            amp_lists=amp_lists,
-            init_loss_scaling=8.0,
-            use_dynamic_loss_scaling=True)
+        mp_optimizer = decorate(optimizer=optimizer,
+                                amp_lists=amp_lists,
+                                init_loss_scaling=8.0,
+                                use_dynamic_loss_scaling=True)
 
         mp_optimizer.minimize(avg_cost)
         loss_scaling = mp_optimizer.get_loss_scaling()
@@ -153,11 +158,11 @@ def train(net_type, use_cuda, save_dirname, is_local):
     PASS_NUM = 1
 
     # no shuffle for unit test
-    train_reader = paddle.batch(
-        paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
+    train_reader = paddle.batch(paddle.dataset.cifar.train10(),
+                                batch_size=BATCH_SIZE)
 
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(paddle.dataset.cifar.test10(),
+                               batch_size=BATCH_SIZE)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
@@ -173,9 +178,9 @@ def train_loop(main_program):
                     feed=feeder.feed(data),
                     fetch_list=[scaled_loss, avg_cost])
                 print(
-                    'PassID {0:1}, BatchID {1:04}, train loss {2:2.4}, scaled train closs {3:2.4}'.
-                    format(pass_id, batch_id + 1,
-                           float(loss), float(np_scaled_loss)))
+                    'PassID {0:1}, BatchID {1:04}, train loss {2:2.4}, scaled train closs {3:2.4}'
+                    .format(pass_id, batch_id + 1, float(loss),
+                            float(np_scaled_loss)))
                 if (batch_id % 10) == 0:
                     acc_list = []
                     avg_loss_list = []
@@ -193,9 +198,9 @@ def train_loop(main_program):
                     avg_loss_value = numpy.array(avg_loss_list).mean()
 
                     print(
-                        'PassID {0:1}, BatchID {1:04}, test loss {2:2.2}, acc {3:2.2}'.
-                        format(pass_id, batch_id + 1,
-                               float(avg_loss_value), float(acc_value)))
+                        'PassID {0:1}, BatchID {1:04}, test loss {2:2.2}, acc {3:2.2}'
+                        .format(pass_id, batch_id + 1, float(avg_loss_value),
+                                float(acc_value)))
 
                     if acc_value > 0.08:  # Low threshold for speeding up CI
                         fluid.io.save_inference_model(
@@ -259,13 +264,12 @@ def infer(use_cuda, save_dirname=None):
 
         print("infer results: ", results[0])
 
-        fluid.io.save_inference_model(
-            save_dirname,
-            feed_target_names,
-            fetch_targets,
-            exe,
-            inference_program,
-            clip_extra=True)
+        fluid.io.save_inference_model(save_dirname,
+                                      feed_target_names,
+                                      fetch_targets,
+                                      exe,
+                                      inference_program,
+                                      clip_extra=True)
 
 
 def main(net_type, use_cuda, is_local=True):
@@ -280,6 +284,7 @@ def main(net_type, use_cuda, is_local=True):
 
 
 class TestImageClassification(unittest.TestCase):
+
     def test_amp_lists(self):
         white_list = copy.copy(
             fluid.contrib.mixed_precision.fp16_lists.white_list)
@@ -425,15 +430,18 @@ def scope_prog_guard(self):
 
 
 class TestAmpWithNonIterableDataLoader(unittest.TestCase):
+
     def decorate_with_data_loader(self):
         main_prog = paddle.static.Program()
         start_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, start_prog):
             with paddle.fluid.unique_name.guard():
-                image = fluid.layers.data(
-                    name='image', shape=[3, 224, 224], dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
+                image = fluid.layers.data(name='image',
+                                          shape=[3, 224, 224],
+                                          dtype='float32')
+                label = fluid.layers.data(name='label',
+                                          shape=[1],
+                                          dtype='int64')
                 py_reader = fluid.io.DataLoader.from_generator(
                     feed_list=[image, label],
                     capacity=4,
@@ -449,11 +457,10 @@ def decorate_with_data_loader(self):
                 optimizer = fluid.optimizer.Lamb(learning_rate=0.001)
                 amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
                     custom_black_varnames={"loss", "conv2d_0.w_0"})
-                mp_optimizer = decorate(
-                    optimizer=optimizer,
-                    amp_lists=amp_lists,
-                    init_loss_scaling=8.0,
-                    use_dynamic_loss_scaling=True)
+                mp_optimizer = decorate(optimizer=optimizer,
+                                        amp_lists=amp_lists,
+                                        init_loss_scaling=8.0,
+                                        use_dynamic_loss_scaling=True)
 
                 mp_optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
index 5362a6ecd16d8..4682be8114a8b 100644
--- a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
+++ b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py
@@ -44,6 +44,7 @@ def convert_uint16_to_float(in_list):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestModelCastBF16(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         cls.seed = 111
@@ -75,8 +76,8 @@ def get_static_graph_result(self,
                                 with_lod=False,
                                 startup_prog=None):
         exe = fluid.Executor(core.CPUPlace())
-        exe.run(fluid.default_startup_program()
-                if startup_prog is None else startup_prog)
+        exe.run(fluid.default_startup_program(
+        ) if startup_prog is None else startup_prog)
         prog = fluid.default_main_program()
         if amp_fun is not None:
             if startup_prog is not None:
@@ -97,10 +98,12 @@ def _graph_common(self, _amp_fun, startup_prog=None):
         nn_bf16 = amp.bf16.convert_float_to_uint16(nn)
 
         with self.static_graph():
-            t_bf16 = layers.data(
-                name='t_bf16', shape=[size, size], dtype=np.uint16)
-            tt_bf16 = layers.data(
-                name='tt_bf16', shape=[size, size], dtype=np.uint16)
+            t_bf16 = layers.data(name='t_bf16',
+                                 shape=[size, size],
+                                 dtype=np.uint16)
+            tt_bf16 = layers.data(name='tt_bf16',
+                                  shape=[size, size],
+                                  dtype=np.uint16)
             t = layers.data(name='t', shape=[size, size], dtype='float32')
             tt = layers.data(name='tt', shape=[size, size], dtype='float32')
 
@@ -151,27 +154,26 @@ def _graph_common(self, _amp_fun, startup_prog=None):
                     amp_fun=_amp_fun,
                     startup_prog=startup_prog
                 )
-        self.assertTrue(
-            static_ret_bf16, np.ones(
-                [size, size], dtype='float32') * -1.1)
+        self.assertTrue(static_ret_bf16,
+                        np.ones([size, size], dtype='float32') * -1.1)
 
     def test_graph_rewrite(self):
         self._graph_common(lambda prog: amp.bf16.rewrite_program_bf16(
             prog,
             amp.bf16.AutoMixedPrecisionListsBF16(
                 custom_bf16_list={'elementwise_add'},
-                custom_fp32_varnames={'elementwise_add_0.tmp_0'})
-        ))
+                custom_fp32_varnames={'elementwise_add_0.tmp_0'})))
 
     def test_graph_cast(self):
-        self._graph_common(lambda prog, startup_prog: amp.bf16.cast_model_to_bf16(
-            prog,
-            startup_prog,
-            amp.bf16.AutoMixedPrecisionListsBF16(
-                custom_bf16_list={'elementwise_add'},
-                custom_fp32_list={'elementwise_mul'}),
-            use_bf16_guard=True
-        ), startup_prog=fluid.default_startup_program())
+        self._graph_common(
+            lambda prog, startup_prog: amp.bf16.cast_model_to_bf16(
+                prog,
+                startup_prog,
+                amp.bf16.AutoMixedPrecisionListsBF16(
+                    custom_bf16_list={'elementwise_add'},
+                    custom_fp32_list={'elementwise_mul'}),
+                use_bf16_guard=True),
+            startup_prog=fluid.default_startup_program())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
index 92786f2835277..c062a039f28d0 100644
--- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
+++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
@@ -26,6 +26,7 @@
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, num_samples, seed=123):
         super(RandomDataset, self).__init__()
         np.random.seed(seed)
@@ -41,6 +42,7 @@ def __len__(self):
 
 
 def reader_decorator(reader):
+
     def __reader__():
         for i in range(len(reader)):
             yield reader[i]
@@ -49,6 +51,7 @@ def __reader__():
 
 
 def resnet_cifar10(input, depth=32):
+
     def conv_bn_layer(input,
                       ch_out,
                       filter_size,
@@ -56,14 +59,13 @@ def conv_bn_layer(input,
                       padding,
                       act='relu',
                       bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
+        tmp = fluid.layers.conv2d(input=input,
+                                  filter_size=filter_size,
+                                  num_filters=ch_out,
+                                  stride=stride,
+                                  padding=padding,
+                                  act=None,
+                                  bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
     def shortcut(input, ch_in, ch_out, stride):
@@ -86,14 +88,19 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride):
 
     assert (depth - 2) % 6 == 0
     n = (depth - 2) // 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    conv1 = conv_bn_layer(input=input,
+                          ch_out=16,
+                          filter_size=3,
+                          stride=1,
+                          padding=1)
     with paddle.static.amp.fp16_guard():
         res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
         res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
         res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    pool = fluid.layers.pool2d(input=res3,
+                               pool_size=8,
+                               pool_type='avg',
+                               pool_stride=1)
     return pool
 
 
@@ -107,24 +114,25 @@ def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
     train_program.random_seed = 123
     startup_prog.random_seed = 456
     with fluid.program_guard(train_program, startup_prog):
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
+        images = fluid.layers.data(name='pixel',
+                                   shape=data_shape,
+                                   dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
         net = resnet_cifar10(images)
         logits = fluid.layers.fc(input=net, size=classdim, act="softmax")
-        cost = fluid.layers.softmax_with_cross_entropy(
-            logits, label, return_softmax=False)
+        cost = fluid.layers.softmax_with_cross_entropy(logits,
+                                                       label,
+                                                       return_softmax=False)
         sum_cost = fluid.layers.reduce_sum(cost)
 
         # Test program
         test_program = train_program.clone(for_test=True)
 
         if optimizer == "Adam":
-            optimizer = paddle.optimizer.AdamW(
-                learning_rate=0.001,
-                epsilon=1e-8,
-                weight_decay=0.0,
-                multi_precision=True)
+            optimizer = paddle.optimizer.AdamW(learning_rate=0.001,
+                                               epsilon=1e-8,
+                                               weight_decay=0.0,
+                                               multi_precision=True)
         elif optimizer == "Lars":
             optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
                 learning_rate=0.001,
@@ -147,17 +155,14 @@ def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
 
         optimizer.minimize(sum_cost)
 
-    train_reader = paddle.batch(
-        reader_decorator(RandomDataset(
-            16 * 5, seed=123)),
-        batch_size=16,
-        drop_last=True)
+    train_reader = paddle.batch(reader_decorator(RandomDataset(16 * 5,
+                                                               seed=123)),
+                                batch_size=16,
+                                drop_last=True)
 
-    test_reader = paddle.batch(
-        reader_decorator(RandomDataset(
-            4 * 5, seed=456)),
-        batch_size=4,
-        drop_last=True)
+    test_reader = paddle.batch(reader_decorator(RandomDataset(4 * 5, seed=456)),
+                               batch_size=4,
+                               drop_last=True)
 
     place = fluid.CUDAPlace(0)
     exe = fluid.Executor(place)
@@ -166,8 +171,9 @@ def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
     def train_loop():
         exe.run(startup_prog)
         if use_pure_fp16:
-            optimizer.amp_init(
-                place, test_program=test_program, use_fp16_test=True)
+            optimizer.amp_init(place,
+                               test_program=test_program,
+                               use_fp16_test=True)
 
         train_loss_list = []
         test_loss_list = []
@@ -195,6 +201,7 @@ def train_loop():
 
 
 class TestImageMultiPrecision(unittest.TestCase):
+
     def test_resnet_pure_fp16(self):
         if not fluid.core.is_compiled_with_cuda():
             return
@@ -221,22 +228,18 @@ def do_test(use_nesterov=False, optimizer=""):
                     use_nesterov=use_nesterov,
                     optimizer=optimizer)
 
-            self.assertTrue(
-                np.allclose(
-                    np.array(train_loss_fp16),
-                    np.array(train_loss_fp32),
-                    rtol=1e-02,
-                    atol=1e-05,
-                    equal_nan=True),
-                msg='Failed to train in pure FP16.')
-            self.assertTrue(
-                np.allclose(
-                    np.array(test_loss_fp16),
-                    np.array(test_loss_fp32),
-                    rtol=1e-02,
-                    atol=1e-05,
-                    equal_nan=True),
-                msg='Failed to test in pure FP16.')
+            self.assertTrue(np.allclose(np.array(train_loss_fp16),
+                                        np.array(train_loss_fp32),
+                                        rtol=1e-02,
+                                        atol=1e-05,
+                                        equal_nan=True),
+                            msg='Failed to train in pure FP16.')
+            self.assertTrue(np.allclose(np.array(test_loss_fp16),
+                                        np.array(test_loss_fp32),
+                                        rtol=1e-02,
+                                        atol=1e-05,
+                                        equal_nan=True),
+                            msg='Failed to test in pure FP16.')
 
         do_test(use_nesterov=False)
         do_test(use_nesterov=True)
@@ -254,24 +257,29 @@ def scope_prog_guard(self):
 
 
 class TestAmpWithNonIterableDataLoader(unittest.TestCase):
+
     def decorate_with_data_loader(self):
         main_prog = paddle.static.Program()
         start_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, start_prog):
             with paddle.fluid.unique_name.guard():
-                image = fluid.layers.data(
-                    name='image', shape=[3, 224, 224], dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
+                image = fluid.layers.data(name='image',
+                                          shape=[3, 224, 224],
+                                          dtype='float32')
+                label = fluid.layers.data(name='label',
+                                          shape=[1],
+                                          dtype='int64')
                 py_reader = fluid.io.DataLoader.from_generator(
                     feed_list=[image, label],
                     capacity=4,
                     iterable=False,
                     use_double_buffer=False)
-                zero_var = fluid.layers.fill_constant(
-                    shape=[1], dtype='int64', value=0)
-                one_var = fluid.layers.fill_constant(
-                    shape=[1], dtype='int64', value=1)
+                zero_var = fluid.layers.fill_constant(shape=[1],
+                                                      dtype='int64',
+                                                      value=0)
+                one_var = fluid.layers.fill_constant(shape=[1],
+                                                     dtype='int64',
+                                                     value=1)
                 with fluid.layers.control_flow.Switch() as switch:
                     with switch.case(label != zero_var):
                         fluid.layers.assign(input=zero_var, output=label)
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index c3099ec88f282..dd900ff428135 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -37,6 +37,7 @@ def linear_fc(num):
 
 
 def residual_block(num):
+
     def conv_bn_layer(input,
                       ch_out,
                       filter_size,
@@ -44,14 +45,13 @@ def conv_bn_layer(input,
                       padding,
                       act='relu',
                       bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
+        tmp = fluid.layers.conv2d(input=input,
+                                  filter_size=filter_size,
+                                  num_filters=ch_out,
+                                  stride=stride,
+                                  padding=padding,
+                                  act=None,
+                                  bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
     data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
@@ -68,21 +68,19 @@ def conv_bn_layer(input,
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_loss = fluid.layers.mean(loss)
@@ -90,6 +88,7 @@ def conv_net(img, label):
 
 
 class TestQuantizeTranspiler(unittest.TestCase):
+
     def setUp(self):
         # since quant_op and dequant_op is not ready, use cos and sin for test
         self.weight_quant_op_type = 'fake_quantize_abs_max'
@@ -180,15 +179,18 @@ def test_residual_block_range_abs_max(self):
         self.residual_block_quant('range_abs_max')
 
     def freeze_program(self, use_cuda, seed):
+
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
             with fluid.unique_name.guard():
                 with fluid.program_guard(main, startup):
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
+                    img = fluid.layers.data(name='image',
+                                            shape=[1, 28, 28],
+                                            dtype='float32')
+                    label = fluid.layers.data(name='label',
+                                              shape=[1],
+                                              dtype='int64')
                     loss = conv_net(img, label)
                     if not is_test:
                         opt = fluid.optimizer.Adam(learning_rate=0.001)
@@ -220,12 +222,11 @@ def build_program(main, startup, is_test):
         class_num = 10
         exe.run(startup)
 
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
 
         with fluid.program_guard(main):
@@ -250,25 +251,25 @@ def build_program(main, startup, is_test):
                                   feed=feeder.feed(test_data),
                                   fetch_list=[loss])
             self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
-            w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
-                                .get_tensor())
+            w_freeze = np.array(
+                fluid.global_scope().find_var('conv2d_1.w_0').get_tensor())
             # fail: -432.0 != -433.0, this is due to the calculation precision
             #self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
 
             # Convert parameter to 8-bit.
             quant_transpiler.convert_to_int8(test_program, place)
             # Save the 8-bit parameter and model file.
-            fluid.io.save_inference_model(
-                'model_8bit', ['image', 'label'], [loss],
-                exe,
-                test_program,
-                clip_extra=True)
+            fluid.io.save_inference_model('model_8bit', ['image', 'label'],
+                                          [loss],
+                                          exe,
+                                          test_program,
+                                          clip_extra=True)
             # Test whether the 8-bit parameter and model file can be loaded successfully.
-            [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
-                                                                 exe)
+            [infer, feed,
+             fetch] = fluid.io.load_inference_model('model_8bit', exe)
             # Check the loaded 8-bit weight.
-            w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
-                              .get_tensor())
+            w_8bit = np.array(
+                fluid.global_scope().find_var('conv2d_1.w_0.int8').get_tensor())
 
             self.assertEqual(w_8bit.dtype, np.int8)
             self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
index 9eb2fe6cbd1a1..bbc61d34613da 100644
--- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
+++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
@@ -32,14 +32,18 @@ def fake_imdb_reader(word_dict_size,
                      lower_seq_len=100,
                      upper_seq_len=200,
                      class_dim=2):
+
     def __reader__():
         for _ in six.moves.range(sample_num):
-            length = np.random.random_integers(
-                low=lower_seq_len, high=upper_seq_len, size=[1])[0]
-            ids = np.random.random_integers(
-                low=0, high=word_dict_size - 1, size=[length]).astype('int64')
-            label = np.random.random_integers(
-                low=0, high=class_dim - 1, size=[1]).astype('int64')[0]
+            length = np.random.random_integers(low=lower_seq_len,
+                                               high=upper_seq_len,
+                                               size=[1])[0]
+            ids = np.random.random_integers(low=0,
+                                            high=word_dict_size - 1,
+                                            size=[length]).astype('int64')
+            label = np.random.random_integers(low=0,
+                                              high=class_dim - 1,
+                                              size=[1]).astype('int64')[0]
             yield ids, label
 
     return __reader__
@@ -74,8 +78,9 @@ def bow_net(data,
     This model is from https://github.com/PaddlePaddle/models:
     fluid/PaddleNLP/text_classification/nets.py
     """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    emb = fluid.layers.embedding(input=data,
+                                 is_sparse=is_sparse,
+                                 size=[dict_dim, emb_dim])
     bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
     bow_tanh = fluid.layers.tanh(bow)
     fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
@@ -88,6 +93,7 @@ def bow_net(data,
 
 
 class TestWeightDecay(unittest.TestCase):
+
     def setUp(self):
         # set seed
         np.random.seed(SEED)
@@ -125,16 +131,17 @@ def check_weight_decay(self, place, model):
         startup_prog = fluid.framework.Program()
 
         with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
             avg_cost = model(data, label, self.word_dict_len)
             AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
                 fluid.optimizer.Adam)
 
-            optimizer = AdamW(
-                learning_rate=self.learning_rate,
-                weight_decay=self.learning_rate)
+            optimizer = AdamW(learning_rate=self.learning_rate,
+                              weight_decay=self.learning_rate)
 
             optimizer.minimize(avg_cost)
             param_sum = self.run_program(place, [data, label])
@@ -146,8 +153,10 @@ def check_weight_decay2(self, place, model):
         startup_prog = fluid.framework.Program()
 
         with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
             avg_cost = model(data, label, self.word_dict_len)
@@ -160,8 +169,8 @@ def check_weight_decay2(self, place, model):
                           for var in main_prog.block(0).all_parameters()]
 
             for params in param_list:
-                updated_p = fluid.layers.elementwise_sub(
-                    x=params[0], y=params[1])
+                updated_p = fluid.layers.elementwise_sub(x=params[0],
+                                                         y=params[1])
                 fluid.layers.assign(input=updated_p, output=params[0])
 
             optimizer.apply_optimize(avg_cost, startup_prog, params_grads)
@@ -179,9 +188,10 @@ def test_weight_decay(self):
                 self.assertTrue(
                     np.allclose(param_sum1[i], param_sum2[i]),
                     "Current place: {}, i: {}, sum1: {}, sum2: {}".format(
-                        place, i, param_sum1[i][~np.isclose(param_sum1[
-                            i], param_sum2[i])], param_sum2[i][~np.isclose(
-                                param_sum1[i], param_sum2[i])]))
+                        place, i, param_sum1[i]
+                        [~np.isclose(param_sum1[i], param_sum2[i])],
+                        param_sum2[i]
+                        [~np.isclose(param_sum1[i], param_sum2[i])]))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 625728c0fcef2..1fa3c769d77fb 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -127,9 +127,9 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
             # Enable execute permissions
             PAGE_EXECUTE = ctypes.c_ulong(0x10)
             pfnVirtualProtect = ctypes.windll.kernel32.VirtualProtect
-            res = pfnVirtualProtect(
-                ctypes.c_void_p(address), ONE_PAGE, PAGE_EXECUTE,
-                ctypes.byref(ctypes.c_ulong(0)))
+            res = pfnVirtualProtect(ctypes.c_void_p(address),
+                                    ONE_PAGE, PAGE_EXECUTE,
+                                    ctypes.byref(ctypes.c_ulong(0)))
             if not res:
                 raise Exception("Failed VirtualProtect")
 
@@ -156,8 +156,8 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
             # Convert the code_str into a function that returns uint
             func, address = asm_func(code_str)
             retval = func()
-            ctypes.windll.kernel32.VirtualFree(
-                ctypes.c_void_p(address), ctypes.c_size_t(0), ONE_PAGE)
+            ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(address),
+                                               ctypes.c_size_t(0), ONE_PAGE)
         except Exception as e:
             sys.stderr.write('Failed getting the AVX flag on Windows.\n'
                              'The original error is: %s\n' %
@@ -170,9 +170,10 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
 
 def run_shell_command(cmd):
     import subprocess
-    out, err = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-        shell=True).communicate()
+    out, err = subprocess.Popen(cmd,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                shell=True).communicate()
     if err:
         return None
     else:
@@ -232,7 +233,7 @@ def to_list(s):
     return operator.lt(to_list(a), to_list(b))
 
 
-# NOTE(zhiqiu): An error may occurs when import paddle in linux platform with glibc < 2.22, 
+# NOTE(zhiqiu): An error may occur when importing paddle on a Linux platform with glibc < 2.22,
 # the error message of which is "dlopen: cannot load any more object with static TLS".
 # This happens when:
 # (1) the number of dynamic shared librarys (DSO) loaded > 14,
diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py
index 31906c465a074..4a15b6a8ea272 100644
--- a/python/paddle/fluid/data.py
+++ b/python/paddle/fluid/data.py
@@ -115,12 +115,11 @@ def data(name, shape, dtype='float32', lod_level=0):
         if shape[i] is None:
             shape[i] = -1
 
-    return helper.create_global_variable(
-        name=name,
-        shape=shape,
-        dtype=dtype,
-        type=core.VarDesc.VarType.LOD_TENSOR,
-        stop_gradient=True,
-        lod_level=lod_level,
-        is_data=True,
-        need_check_feed=True)
+    return helper.create_global_variable(name=name,
+                                         shape=shape,
+                                         dtype=dtype,
+                                         type=core.VarDesc.VarType.LOD_TENSOR,
+                                         stop_gradient=True,
+                                         lod_level=lod_level,
+                                         is_data=True,
+                                         need_check_feed=True)
diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py
index eaa8985092d0d..fb4ce735fca81 100644
--- a/python/paddle/fluid/data_feed_desc.py
+++ b/python/paddle/fluid/data_feed_desc.py
@@ -174,8 +174,8 @@ def set_dense_slots(self, dense_slots_name):
                 "Only MultiSlotDataFeed needs set_dense_slots, please check your datafeed.proto"
             )
         for name in dense_slots_name:
-            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
-                name]].is_dense = True
+            self.proto_desc.multi_slot_desc.slots[
+                self.__name_to_index[name]].is_dense = True
 
     def set_use_slots(self, use_slots_name):
         """
@@ -219,8 +219,8 @@ def set_use_slots(self, use_slots_name):
                 "Only MultiSlotDataFeed needs set_use_slots, please check your datafeed.proto"
             )
         for name in use_slots_name:
-            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
-                name]].is_used = True
+            self.proto_desc.multi_slot_desc.slots[
+                self.__name_to_index[name]].is_used = True
 
     def desc(self):
         """
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index c7a68c6027be4..30cfb9f4b8591 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -24,6 +24,7 @@
 
 from .framework import Variable, default_main_program, _current_expected_place, _non_static_mode, _in_eager_without_dygraph_check
 from .framework import _cpu_num, _cuda_ids
+
 __all__ = ['DataFeeder']
 
 _PADDLE_DTYPE_2_NUMPY_DTYPE = {
@@ -172,6 +173,7 @@ def check_shape(shape,
 
 
 class DataToLoDTensorConverter(object):
+
     def __init__(self, place, lod_level, shape, dtype):
         self.place = place
         self.lod_level = lod_level
@@ -205,8 +207,8 @@ def _check_shape(self, shape):
         for s1, s2 in zip(self.shape, shape):
             if s1 != s2 and s1 >= 0 and s2 >= 0:
                 raise ValueError(
-                    "Shape not match. What is defined in data layer is {}, but receive {}".
-                    format(self.shape, shape))
+                    "Shape not match. What is defined in data layer is {}, but receive {}"
+                    .format(self.shape, shape))
 
     def done(self):
         arr = np.array(self.data, dtype=self.dtype)
@@ -227,6 +229,7 @@ def done(self):
 
 
 class BatchedTensorProvider(object):
+
     def __init__(self, feed_list, place, batch_size, generator, drop_last):
         self.place = place
         self.batch_size = batch_size
@@ -237,11 +240,10 @@ def __init__(self, feed_list, place, batch_size, generator, drop_last):
         for var in feed_list:
             assert var.lod_level == 0, "lod_level must be 0"
             self.converters.append(
-                DataToLoDTensorConverter(
-                    place=self.place,
-                    lod_level=0,
-                    shape=var.shape,
-                    dtype=var.dtype))
+                DataToLoDTensorConverter(place=self.place,
+                                         lod_level=0,
+                                         shape=var.shape,
+                                         dtype=var.dtype))
 
     def _done(self):
         return [c.done() for c in self.converters]
@@ -249,8 +251,8 @@ def _done(self):
     def __call__(self):
         idx = 0
         for each_sample in self.generator():
-            for each_slot, each_converter in six.moves.zip(each_sample,
-                                                           self.converters):
+            for each_slot, each_converter in six.moves.zip(
+                    each_sample, self.converters):
                 each_converter.data.append(each_slot)
 
             idx += 1
@@ -383,21 +385,21 @@ def reader(limit=5):
 
         """
         converter = []
-        for lod_level, shape, dtype in six.moves.zip(
-                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
+        for lod_level, shape, dtype in six.moves.zip(self.feed_lod_level,
+                                                     self.feed_shapes,
+                                                     self.feed_dtypes):
             converter.append(
-                DataToLoDTensorConverter(
-                    place=self.place,
-                    lod_level=lod_level,
-                    shape=shape,
-                    dtype=dtype))
+                DataToLoDTensorConverter(place=self.place,
+                                         lod_level=lod_level,
+                                         shape=shape,
+                                         dtype=dtype))
 
         for each_sample in iterable:
             assert len(each_sample) == len(converter), (
                 "The number of fields in data (%d) does not match " +
                 "len(feed_list) (%d)") % (len(each_sample), len(converter))
-            for each_converter, each_slot in six.moves.zip(converter,
-                                                           each_sample):
+            for each_converter, each_slot in six.moves.zip(
+                    converter, each_sample):
                 each_converter.feed(each_slot)
         ret_dict = {}
         for each_name, each_converter in six.moves.zip(self.feed_names,
@@ -461,14 +463,12 @@ def _reader():
         """
         if isinstance(self.place, core.CUDAPlace):
             places = [
-                core.CUDAPlace(i)
-                for i in six.moves.xrange(
+                core.CUDAPlace(i) for i in six.moves.xrange(
                     self._get_number_of_places_(num_places))
             ]
         else:
             places = [
-                core.CPUPlace()
-                for _ in six.moves.xrange(
+                core.CPUPlace() for _ in six.moves.xrange(
                     self._get_number_of_places_(num_places))
             ]
 
diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py
index 3a23c852563da..8187faef0086b 100644
--- a/python/paddle/fluid/dataloader/batch_sampler.py
+++ b/python/paddle/fluid/dataloader/batch_sampler.py
@@ -148,6 +148,7 @@ def __len__(self):
 
 
 class _InfiniteIterableSampler(object):
+
     def __init__(self, dataset, batch_size=1):
         assert isinstance(
             dataset, IterableDataset
@@ -277,9 +278,10 @@ def _get_indices_by_batch_size(indices):
                 subsampled_indices.extend(indices[i:i + self.batch_size])
 
             indices = indices[len(indices) - last_batch_size:]
-            subsampled_indices.extend(indices[
-                self.local_rank * last_local_batch_size:(
-                    self.local_rank + 1) * last_local_batch_size])
+            subsampled_indices.extend(
+                indices[self.local_rank *
+                        last_local_batch_size:(self.local_rank + 1) *
+                        last_local_batch_size])
             return subsampled_indices
 
         if self.nranks > 1:
diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index 430578db51022..0d7fc17da172c 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -96,6 +96,7 @@ def __init__(self, loader):
         self._auto_collate_batch = loader.auto_collate_batch
         self._num_workers = loader.num_workers
         self._use_buffer_reader = loader.use_buffer_reader
+        self._prefetch_factor = loader.prefetch_factor
         self._use_shared_memory = loader.use_shared_memory
         self._timeout = loader.timeout if loader.timeout > 0 else MP_STATUS_CHECK_INTERVAL
         self._worker_init_fn = loader.worker_init_fn
@@ -166,9 +167,10 @@ def __init__(self, loader):
         self._structure_infos = []
 
         # NOTE: len(self._places) batch data compose as an output
-        # iteration, set blocking_queue can cache 2 iteration datas
+        # iteration, so blocking_queue can cache "self._prefetch_factor" iterations of data
         # at most here
-        self._blocking_queue_capacity = 1 * len(self._places)
+        self._blocking_queue_capacity = self._prefetch_factor * len(
+            self._places)
 
         self._init_thread()
         self._shutdown = False
@@ -192,8 +194,8 @@ def _init_thread(self):
             self._need_check_feed, self._places, self._use_buffer_reader, True,
             self._pin_memory)
 
-        self._thread = threading.Thread(
-            target=self._thread_loop, args=(_current_expected_place(), ))
+        self._thread = threading.Thread(target=self._thread_loop,
+                                        args=(_current_expected_place(), ))
         self._thread.daemon = True
         self._thread.start()
 
@@ -201,7 +203,7 @@ def _thread_loop(self, legacy_expected_place):
         #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
         # and it will call platform::SetDeviceId() in c++ internally.
         # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
-        # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda 
+        # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda
         # APIs in this thread.
         _set_expected_place(legacy_expected_place)
 
@@ -275,12 +277,11 @@ def __next__(self):
                         for i in range(len(data)):
                             data[i] = data[i]._move_to_list()
                         data = [
-                            _restore_batch(d, s)
-                            for d, s in zip(data, self._structure_infos[:len(
-                                self._places)])
+                            _restore_batch(d, s) for d, s in zip(
+                                data, self._structure_infos[:len(self._places)])
                         ]
-                        self._structure_infos = self._structure_infos[len(
-                            self._places):]
+                        self._structure_infos = self._structure_infos[
+                            len(self._places):]
                         # static graph organized data on multi-device with list, if
                         # place number is 1, there is only 1 device, extra the data
                         # from list for devices to be compatible with dygraph mode
@@ -339,6 +340,7 @@ def __del__(self):
 
 
 class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
+
     def __init__(self, loader):
         super(_DataLoaderIterMultiProcess, self).__init__(loader)
 
@@ -352,7 +354,7 @@ def __init__(self, loader):
         self._data_queue = None
 
         # data get from _data_queue will be reordered by _rcvd_idx
-        # for data order keeping, data index not equal _rcvd_idx 
+        # to keep data in order; data whose index is not equal to _rcvd_idx
         # will be cached in _task_infos
         self._send_idx = 0
         self._rcvd_idx = 0
@@ -363,11 +365,11 @@ def __init__(self, loader):
         # indices outstand as _outstanding_capacity at first, and
         # blocking_queue capacity is also _outstanding_capacity.
         # _outstanding_capacity here to make sure each indices_queue
-        # has at least 2 indices, and outstanding batch cached
-        # output data for at least 2 iterations(Note that len(_places)
+        # has at least "_prefetch_factor" indices, and outstanding batch cached
+        # output data for at least "_prefetch_factor" iterations (Note that len(_places)
         # batches will be composed as an iteration output)
-        self._outstanding_capacity = 2 * max(self._num_workers,
-                                             len(self._places))
+        self._outstanding_capacity = self._prefetch_factor * max(
+            self._num_workers, len(self._places))
 
         # see _try_put_indices
         self._thread_lock = threading.Lock()
@@ -390,7 +392,7 @@ def _init_workers(self):
         # create data_queue for workers
         self._data_queue = multiprocessing.Queue()
 
-        # event for workers and thread, thread event is only need 
+        # events for workers and thread; the thread event is only needed
         # in multi-processing mode
         self._workers_done_event = multiprocessing.Event()
         self._thread_done_event = threading.Event()
@@ -432,7 +434,8 @@ def _init_thread(self):
         ]
         # if only 1 place, do not need to keep order
         self._blocking_queue = core.init_lod_tensor_blocking_queue(
-            core.Variable(), self._outstanding_capacity, len(self._places) > 1)
+            core.Variable(), self._outstanding_capacity,
+            len(self._places) > 1)
         self._reader = core.create_py_reader(
             self._blocking_queue, self._var_names, self._shapes, self._dtypes,
             self._need_check_feed, self._places, self._use_buffer_reader, True,
@@ -440,8 +443,8 @@ def _init_thread(self):
 
         self._thread_done_event = threading.Event()
         # thread event is only need in multi-processing mode
-        self._thread = threading.Thread(
-            target=self._thread_loop, args=(_current_expected_place(), ))
+        self._thread = threading.Thread(target=self._thread_loop,
+                                        args=(_current_expected_place(), ))
         self._thread.daemon = True
         self._thread.start()
 
@@ -490,8 +493,8 @@ def _reset(self):
             self._try_put_indices()
 
     def _shutdown_worker(self, worker_id, shutdown=False):
-        if self._worker_status[worker_id] or (self._persistent_workers and
-                                              shutdown):
+        if self._worker_status[worker_id] or (self._persistent_workers
+                                              and shutdown):
             self._indices_queues[worker_id].put(None)
             self._worker_status[worker_id] = False
 
@@ -522,7 +525,7 @@ def _thread_loop(self, legacy_expected_place):
         #NOTE(zhiqiu): Set the expected place for new thread as the same as father thread,
         # and it will call platform::SetDeviceId() in c++ internally.
         # If we do not set cudaDeviceId in new thread, the default cudaDeviceId will be 0,
-        # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda 
+        # Which may cost hundreds of MB of GPU memory on CUDAPlace(0) if calling some cuda
         # APIs in this thread.
         _set_expected_place(legacy_expected_place)
 
@@ -546,8 +549,9 @@ def _thread_loop(self, legacy_expected_place):
                             # LoDTensor not in shared memory is not
                             # serializable, cannot be create in workers
                             for slot in batch:
-                                if isinstance(slot, (paddle.Tensor,
-                                                     core.eager.Tensor)):
+                                if isinstance(
+                                        slot,
+                                    (paddle.Tensor, core.eager.Tensor)):
                                     slot = slot.value().get_tensor()
                                 elif not isinstance(slot, core.LoDTensor):
                                     tmp = core.LoDTensor()
@@ -568,7 +572,7 @@ def _get_data(self):
             # For IterableDataset, batch indices is generated infinitely
             # for each worker to raise StopIteration, but a StopIteration
             # raising process will discard a batch indices which is count
-            # in _send_idx but will not increase _rcvd_idx, so we check 
+            # in _send_idx but will not increase _rcvd_idx, so we check
             # whether the worker is still alive here to skip the discarded
             # batch indices and increase _rcvd_idx
             if self._dataset_kind == _DatasetKind.ITER:
@@ -747,12 +751,11 @@ def __next__(self):
                         for i in range(len(data)):
                             data[i] = data[i]._move_to_list()
                         data = [
-                            _restore_batch(d, s)
-                            for d, s in zip(data, self._structure_infos[:len(
-                                self._places)])
+                            _restore_batch(d, s) for d, s in zip(
+                                data, self._structure_infos[:len(self._places)])
                         ]
-                        self._structure_infos = self._structure_infos[len(
-                            self._places):]
+                        self._structure_infos = self._structure_infos[
+                            len(self._places):]
                         # static graph organized data on multi-device with list, if
                         # place number is 1, there is only 1 device, extra the data
                         # from list for devices to be compatible with dygraph mode
diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py
index ec3240a326b8e..387032cdfbbd3 100644
--- a/python/paddle/fluid/dataloader/fetcher.py
+++ b/python/paddle/fluid/dataloader/fetcher.py
@@ -20,6 +20,7 @@
 
 
 class _DatasetFetcher(object):
+
     def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
         self.dataset = dataset
         self.auto_collate_batch = auto_collate_batch
@@ -67,15 +68,17 @@ def _log_warning(self):
                     "dtype=float32)]', and in Paddle >= 2.1, data is in format" \
                     " 'Tensor(shape=(1, 2, 3), dtype=float32)'\n"
 
-        logger = get_logger(
-            "DataLoader", logging.INFO, fmt='%(levelname)s: %(message)s')
+        logger = get_logger("DataLoader",
+                            logging.INFO,
+                            fmt='%(levelname)s: %(message)s')
         logger.warning(warn_str)
 
 
 class _IterableDatasetFetcher(_DatasetFetcher):
+
     def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
-        super(_IterableDatasetFetcher, self).__init__(
-            dataset, auto_collate_batch, collate_fn, drop_last)
+        super(_IterableDatasetFetcher,
+              self).__init__(dataset, auto_collate_batch, collate_fn, drop_last)
         self.dataset_iter = iter(dataset)
 
     def fetch(self, batch_indices, done_event=None):
@@ -91,8 +94,8 @@ def fetch(self, batch_indices, done_event=None):
                 else:
                     return None
 
-            if len(data) == 0 or (self.drop_last and
-                                  len(data) < len(batch_indices)):
+            if len(data) == 0 or (self.drop_last
+                                  and len(data) < len(batch_indices)):
                 raise StopIteration
 
             global _WARNING_TO_LOG
@@ -109,6 +112,7 @@ def fetch(self, batch_indices, done_event=None):
 
 
 class _MapDatasetFetcher(_DatasetFetcher):
+
     def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
         super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch,
                                                  collate_fn, drop_last)
diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py
index 7207ebcbacfdb..25a46f3b5df2d 100644
--- a/python/paddle/fluid/dataloader/sampler.py
+++ b/python/paddle/fluid/dataloader/sampler.py
@@ -204,7 +204,8 @@ def __init__(self,
 
         if self._num_samples is not None and not replacement:
             raise ValueError(
-                "num_samples should not be specified while replacement is False")
+                "num_samples should not be specified while replacement is False"
+            )
 
         if not isinstance(self.num_samples, int) or self.num_samples <= 0:
             raise ValueError("num_samples should be a positive integer, "
@@ -227,12 +228,13 @@ def __iter__(self):
                 yield index
         else:
             if self.replacement:
-                for index in np.random.choice(
-                        np.arange(n), self.num_samples, replace=True).tolist():
+                for index in np.random.choice(np.arange(n),
+                                              self.num_samples,
+                                              replace=True).tolist():
                     yield index
             else:
-                for index in np.random.choice(
-                        np.arange(n), n, replace=False).tolist():
+                for index in np.random.choice(np.arange(n), n,
+                                              replace=False).tolist():
                     yield index
 
     def __len__(self):
diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py
index 6dc3813fa6d0c..0c3ec898aadfd 100644
--- a/python/paddle/fluid/dataloader/worker.py
+++ b/python/paddle/fluid/dataloader/worker.py
@@ -32,6 +32,7 @@
 
 
 class _IterableDatasetStopIteration(object):
+
     def __init__(self, worker_id):
         self.worker_id = worker_id
 
@@ -58,6 +59,7 @@ def create_fetcher(kind, dataset, auto_collate_batch, collate_fn,
 
 
 class ParentWatchDog(object):
+
     def __init__(self):
         self._parent_pid = os.getppid()
         self._parent_alive = True
@@ -155,6 +157,7 @@ def __setattr__(self, key, val):
 
 
 class _WorkerException(object):
+
     def __init__(self, worker_id, exc_info=None):
         self.worker_id = worker_id
         exc_info = exc_info or sys.exc_info()
@@ -275,8 +278,9 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
             np.random.seed(_generate_states(int(time.time()), worker_id))
 
         global _worker_info
-        _worker_info = WorkerInfo(
-            id=worker_id, num_workers=num_workers, dataset=dataset)
+        _worker_info = WorkerInfo(id=worker_id,
+                                  num_workers=num_workers,
+                                  dataset=dataset)
 
         init_exception = None
         try:
@@ -300,8 +304,9 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
             if isinstance(data, _ResumeIteration):
                 out_queue.put((data, None, None))
                 iterator_drained = False
-                fetcher = _DatasetKind.create_fetcher(
-                    dataset_kind, dataset, auto_collate_batch, collate_fn, True)
+                fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
+                                                      auto_collate_batch,
+                                                      collate_fn, True)
                 continue
 
             # None as poison piil, so worker event should be set
diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py
index 84064669c0dc6..8ea3e15ca4d3c 100644
--- a/python/paddle/fluid/dataset.py
+++ b/python/paddle/fluid/dataset.py
@@ -17,6 +17,7 @@
 from google.protobuf import text_format
 from . import core
 from ..utils import deprecated
+
 __all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset']
 
 
@@ -388,9 +389,8 @@ def __init__(self):
         self.fleet_send_sleep_seconds = None
         self.trainer_num = -1
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._set_feed_type")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset._set_feed_type")
     def set_feed_type(self, data_feed_type):
         """
         Set data_feed_desc
@@ -399,9 +399,8 @@ def set_feed_type(self, data_feed_type):
         if (self.proto_desc.name == "SlotRecordInMemoryDataFeed"):
             self.dataset = core.Dataset("SlotRecordDataset")
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._prepare_to_run")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset._prepare_to_run")
     def _prepare_to_run(self):
         """
         Set data_feed_desc before load or shuffle,
@@ -424,8 +423,8 @@ def _prepare_to_run(self):
 
     @deprecated(
         since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._dynamic_adjust_before_train"
-    )
+        update_to=
+        "paddle.distributed.InMemoryDataset._dynamic_adjust_before_train")
     def _dynamic_adjust_before_train(self, thread_num):
         if not self.is_user_set_queue_num:
             if self.use_ps_gpu:
@@ -446,9 +445,8 @@ def _dynamic_adjust_after_train(self):
                 self.dataset.dynamic_adjust_channel_num(self.thread_num, False)
         self.dataset.dynamic_adjust_readers_num(self.thread_num)
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._set_queue_num")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset._set_queue_num")
     def set_queue_num(self, queue_num):
         """
         Set Dataset output queue num, training threads get data from queues
@@ -467,9 +465,9 @@ def set_queue_num(self, queue_num):
         self.is_user_set_queue_num = True
         self.queue_num = queue_num
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._set_parse_ins_id")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset._set_parse_ins_id"
+                )
     def set_parse_ins_id(self, parse_ins_id):
         """
         Set id Dataset need to parse insid
@@ -541,9 +539,9 @@ def _set_trainer_num(self, trainer_num):
         """
         self.trainer_num = trainer_num
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._set_merge_by_sid")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset._set_merge_by_sid"
+                )
     def set_merge_by_sid(self, merge_by_sid):
         """
         Set if Dataset need to merge sid. If not, one ins means one Pv.
@@ -656,8 +654,8 @@ def set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
 
     @deprecated(
         since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._set_fleet_send_sleep_seconds"
-    )
+        update_to=
+        "paddle.distributed.InMemoryDataset._set_fleet_send_sleep_seconds")
     def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
         """
         Set fleet send sleep time, default is 0
@@ -700,8 +698,8 @@ def set_merge_by_lineid(self, merge_size=2):
 
     @deprecated(
         since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._set_generate_unique_feasigns"
-    )
+        update_to=
+        "paddle.distributed.InMemoryDataset._set_generate_unique_feasigns")
     def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
         self.dataset.set_generate_unique_feasigns(generate_uni_feasigns)
         self.gen_uni_feasigns = generate_uni_feasigns
@@ -709,12 +707,13 @@ def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
 
     @deprecated(
         since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset._generate_local_tables_unlock"
-    )
+        update_to=
+        "paddle.distributed.InMemoryDataset._generate_local_tables_unlock")
     def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
                                      consume_thread_num, shard_num):
-        self.dataset.generate_local_tables_unlock(
-            table_id, fea_dim, read_thread_num, consume_thread_num, shard_num)
+        self.dataset.generate_local_tables_unlock(table_id, fea_dim,
+                                                  read_thread_num,
+                                                  consume_thread_num, shard_num)
 
     def set_date(self, date):
         """
@@ -739,9 +738,8 @@ def set_date(self, date):
         if self.use_ps_gpu and core._is_compiled_with_heterps():
             self.psgpu.set_date(year, month, day)
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset.load_into_memory")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset.load_into_memory")
     def load_into_memory(self, is_shuffle=False):
         """
         Load data into memory
@@ -794,9 +792,9 @@ def preload_into_memory(self, thread_num=None):
         self.dataset.create_preload_readers()
         self.dataset.preload_into_memory()
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset.wait_preload_done")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset.wait_preload_done"
+                )
     def wait_preload_done(self):
         """
         Wait preload_into_memory done
@@ -815,9 +813,8 @@ def wait_preload_done(self):
         self.dataset.wait_preload_done()
         self.dataset.destroy_preload_readers()
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset.local_shuffle")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset.local_shuffle")
     def local_shuffle(self):
         """
         Local shuffle
@@ -835,9 +832,8 @@ def local_shuffle(self):
         """
         self.dataset.local_shuffle()
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset.global_shuffle")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset.global_shuffle")
     def global_shuffle(self, fleet=None, thread_num=12):
         """
         Global shuffle.
@@ -897,9 +893,8 @@ def global_shuffle(self, fleet=None, thread_num=12):
             else:
                 fleet._role_maker.barrier_worker()
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.InMemoryDataset.release_memory")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.InMemoryDataset.release_memory")
     def release_memory(self):
         """
         :api_attr: Static Graph
@@ -1063,9 +1058,8 @@ def __init__(self):
         super(QueueDataset, self).__init__()
         self.proto_desc.name = "MultiSlotDataFeed"
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.distributed.QueueDataset._prepare_to_run")
+    @deprecated(since="2.0.0",
+                update_to="paddle.distributed.QueueDataset._prepare_to_run")
     def _prepare_to_run(self):
         """
         Set data_feed_desc/thread num/filelist before run,
diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py
index 75dc14a1d754c..76b7c3d7dc120 100644
--- a/python/paddle/fluid/debugger.py
+++ b/python/paddle/fluid/debugger.py
@@ -68,31 +68,30 @@ def repr_lodtensor(proto):
 
     level = proto.type.lod_tensor.lod_level
     reprs = repr_tensor(proto.type.lod_tensor.tensor)
-    return reprtpl.format(
-        ttype="LoDTensor" if level > 0 else "Tensor",
-        name=proto.name,
-        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
+    return reprtpl.format(ttype="LoDTensor" if level > 0 else "Tensor",
+                          name=proto.name,
+                          reprs="level=%d, %s" %
+                          (level, reprs) if level > 0 else reprs)
 
 
 def repr_selected_rows(proto):
     if proto.type.type != framework_pb2.VarType.SELECTED_ROWS:
         return
 
-    return reprtpl.format(
-        ttype="SelectedRows",
-        name=proto.name,
-        reprs=repr_tensor(proto.type.selected_rows))
+    return reprtpl.format(ttype="SelectedRows",
+                          name=proto.name,
+                          reprs=repr_tensor(proto.type.selected_rows))
 
 
 def repr_tensor_array(proto):
     if proto.type.type != framework_pb2.VarType.LOD_TENSOR_ARRAY:
         return
 
-    return reprtpl.format(
-        ttype="TensorArray",
-        name=proto.name,
-        reprs="level=%d, %s" % (proto.type.tensor_array.lod_level,
-                                repr_tensor(proto.type.lod_tensor.tensor)))
+    return reprtpl.format(ttype="TensorArray",
+                          name=proto.name,
+                          reprs="level=%d, %s" %
+                          (proto.type.tensor_array.lod_level,
+                           repr_tensor(proto.type.lod_tensor.tensor)))
 
 
 type_handlers = [
@@ -119,6 +118,7 @@ def pprint_program_codes(program_desc):
 
 
 def pprint_block_codes(block_desc, show_backward=False):
+
     def is_op_backward(op_desc):
         if op_desc.type.endswith('_grad'): return True
 
@@ -155,7 +155,8 @@ def is_var_backward(var_desc):
         idx=block_desc.idx,
         pidx=block_desc.parent_idx,
         vars='\n'.join(var_reprs),
-        ops='\n'.join(op_reprs), )
+        ops='\n'.join(op_reprs),
+    )
 
 
 def repr_attr(desc):
@@ -187,7 +188,9 @@ def _repr_op_fill_constant(optype, inputs, outputs, attrs):
             shape=str(attrs['shape']))
 
 
-op_repr_handlers = [_repr_op_fill_constant, ]
+op_repr_handlers = [
+    _repr_op_fill_constant,
+]
 
 
 def repr_op(opdesc):
@@ -218,12 +221,11 @@ def repr_op(opdesc):
         res = handler(opdesc.type, inputs, outputs, attr_dict)
         if res: return res
 
-    return tpl.format(
-        outputs=', '.join(outputs),
-        optype=opdesc.type,
-        inputs=', '.join(inputs),
-        attrs="{%s}" % ','.join(attrs),
-        is_target=", is_target" if is_target else "")
+    return tpl.format(outputs=', '.join(outputs),
+                      optype=opdesc.type,
+                      inputs=', '.join(inputs),
+                      attrs="{%s}" % ','.join(attrs),
+                      is_target=", is_target" if is_target else "")
 
 
 def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
@@ -251,10 +253,9 @@ def need_highlight(name):
         # TODO(gongwb): format the var.type
         # create var
         if var.persistable:
-            varn = graph.add_param(
-                var.name,
-                str(var.type).replace("\n", "<br />", 1),
-                highlight=need_highlight(var.name))
+            varn = graph.add_param(var.name,
+                                   str(var.type).replace("\n", "<br />", 1),
+                                   highlight=need_highlight(var.name))
         else:
             varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
         vars[var.name] = varn
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index 8a5e3584ed866..f0c094a84f758 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -164,14 +164,14 @@ def _gen_worker_desc(self, trainer_desc):
             sparse_len = len(worker.get_desc().sparse_table)
             for i in range(sparse_len):
                 sparse_table = downpour.sparse_table.add()
-                sparse_table.table_id = worker.get_desc().sparse_table[
-                    i].table_id
-                sparse_table.sparse_key_name.extend(worker.get_desc()
-                                                    .sparse_table[i].slot_key)
-                sparse_table.sparse_value_name.extend(worker.get_desc(
-                ).sparse_table[i].slot_value)
-                sparse_table.sparse_grad_name.extend(worker.get_desc(
-                ).sparse_table[i].slot_gradient)
+                sparse_table.table_id = worker.get_desc(
+                ).sparse_table[i].table_id
+                sparse_table.sparse_key_name.extend(
+                    worker.get_desc().sparse_table[i].slot_key)
+                sparse_table.sparse_value_name.extend(
+                    worker.get_desc().sparse_table[i].slot_value)
+                sparse_table.sparse_grad_name.extend(
+                    worker.get_desc().sparse_table[i].slot_gradient)
                 sparse_table.fea_dim = \
                     self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
                         i].accessor.fea_dim
@@ -291,14 +291,14 @@ def _gen_worker_desc(self, trainer_desc):
             sparse_len = len(worker.get_desc().sparse_table)
             for i in range(sparse_len):
                 sparse_table = downpour.sparse_table.add()
-                sparse_table.table_id = worker.get_desc().sparse_table[
-                    i].table_id
-                sparse_table.sparse_key_name.extend(worker.get_desc()
-                                                    .sparse_table[i].slot_key)
-                sparse_table.sparse_value_name.extend(worker.get_desc(
-                ).sparse_table[i].slot_value)
-                sparse_table.sparse_grad_name.extend(worker.get_desc(
-                ).sparse_table[i].slot_gradient)
+                sparse_table.table_id = worker.get_desc(
+                ).sparse_table[i].table_id
+                sparse_table.sparse_key_name.extend(
+                    worker.get_desc().sparse_table[i].slot_key)
+                sparse_table.sparse_value_name.extend(
+                    worker.get_desc().sparse_table[i].slot_value)
+                sparse_table.sparse_grad_name.extend(
+                    worker.get_desc().sparse_table[i].slot_gradient)
                 sparse_table.fea_dim = \
                     self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
                         i].accessor.fea_dim
@@ -400,12 +400,12 @@ def _gen_worker_desc(self, trainer_desc):
         for i in range(sparse_len):
             sparse_table = downpour.sparse_table.add()
             sparse_table.table_id = worker.get_desc().sparse_table[i].table_id
-            sparse_table.sparse_key_name.extend(worker.get_desc().sparse_table[
-                i].slot_key)
-            sparse_table.sparse_value_name.extend(worker.get_desc()
-                                                  .sparse_table[i].slot_value)
-            sparse_table.sparse_grad_name.extend(worker.get_desc().sparse_table[
-                i].slot_gradient)
+            sparse_table.sparse_key_name.extend(
+                worker.get_desc().sparse_table[i].slot_key)
+            sparse_table.sparse_value_name.extend(
+                worker.get_desc().sparse_table[i].slot_value)
+            sparse_table.sparse_grad_name.extend(
+                worker.get_desc().sparse_table[i].slot_gradient)
             if opt_info["use_cvm"] or "no_cvm" in opt_info and opt_info[
                     "no_cvm"] == True:
                 sparse_table.emb_dim = \
@@ -500,12 +500,12 @@ def _gen_worker_desc(self, trainer_desc):
         for i in range(sparse_len):
             sparse_table = downpour.sparse_table.add()
             sparse_table.table_id = worker.get_desc().sparse_table[i].table_id
-            sparse_table.sparse_key_name.extend(worker.get_desc().sparse_table[
-                i].slot_key)
-            sparse_table.sparse_value_name.extend(worker.get_desc()
-                                                  .sparse_table[i].slot_value)
-            sparse_table.sparse_grad_name.extend(worker.get_desc().sparse_table[
-                i].slot_gradient)
+            sparse_table.sparse_key_name.extend(
+                worker.get_desc().sparse_table[i].slot_key)
+            sparse_table.sparse_value_name.extend(
+                worker.get_desc().sparse_table[i].slot_value)
+            sparse_table.sparse_grad_name.extend(
+                worker.get_desc().sparse_table[i].slot_gradient)
             if opt_info["use_cvm"] or "no_cvm" in opt_info and opt_info[
                     "no_cvm"] == True:
                 sparse_table.emb_dim = \
@@ -569,15 +569,16 @@ def _gen_worker_desc(self, trainer_desc):
         # then runs Backward phase for all microbatches.
         # 1F1B scheduler, which runs forward phase and backward phase altertively
         # after startup phase.
-        assert schedule_mode_str in ["F-then-B", "1F1B"], (
-            "The schedule mode "
+        assert schedule_mode_str in [
+            "F-then-B", "1F1B"
+        ], ("The schedule mode "
             "for pipeline must be one of F-then-B or 1F1B")
         schedule_mode = 0 if schedule_mode_str == "F-then-B" else 1
         section_param.schedule_mode = schedule_mode
         cfg = section_param.section_config
         program = pipeline_opt["section_program"]
-        cfg.program_desc.ParseFromString(program._get_desc()
-                                         .serialize_to_string())
+        cfg.program_desc.ParseFromString(
+            program._get_desc().serialize_to_string())
         # TODO: why does not work
         # cfg.program_desc.CopyFrom(program.program._get_desc())
         place = pipeline_opt["place"]
@@ -616,11 +617,12 @@ def _gen_worker_desc(self, trainer_desc):
             "num_pipeline_stages"]
         cfg = heter_section_param.section_config
         program = heter_pipeline_opt["section_program"]
-        cfg.program_desc.ParseFromString(program._get_desc()
-                                         .serialize_to_string())
+        cfg.program_desc.ParseFromString(
+            program._get_desc().serialize_to_string())
 
 
 class DeviceWorkerFactory(object):
+
     def _create_device_worker(self, worker_type):
         classname = worker_type.capitalize()
         return globals()[classname]()
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py
index 89e9a6a907632..4d6cc88ea7e66 100644
--- a/python/paddle/fluid/distributed/downpour.py
+++ b/python/paddle/fluid/distributed/downpour.py
@@ -93,14 +93,13 @@ def minimize(self,
         param_grads_list = []
         for loss_index in range(len(losses)):
             program_config = ps_param.trainer_param.program_config.add()
-            program_config.program_id = str(
-                id(losses[loss_index].block.program))
+            program_config.program_id = str(id(
+                losses[loss_index].block.program))
             program_config.pull_sparse_table_id.extend([sparse_table_index])
             program_config.push_sparse_table_id.extend([sparse_table_index])
-            params_grads = sorted(
-                append_backward(losses[loss_index], parameter_list,
-                                no_grad_set),
-                key=lambda x: x[0].name)
+            params_grads = sorted(append_backward(losses[loss_index],
+                                                  parameter_list, no_grad_set),
+                                  key=lambda x: x[0].name)
             param_grads_list.append(params_grads)
             params = []
             grads = []
diff --git a/python/paddle/fluid/distributed/fleet.py b/python/paddle/fluid/distributed/fleet.py
index cd2611956850f..6c2bcdc213b4b 100644
--- a/python/paddle/fluid/distributed/fleet.py
+++ b/python/paddle/fluid/distributed/fleet.py
@@ -37,8 +37,8 @@ def stop(self):
 
     def init_pserver(self, opt_info):
         if "fleet_desc" in opt_info:
-            self.dist_desc_str_ = text_format.MessageToString(opt_info[
-                "fleet_desc"])
+            self.dist_desc_str_ = text_format.MessageToString(
+                opt_info["fleet_desc"])
             self.dist_desc_ = opt_info["fleet_desc"]
         else:
             print(
@@ -54,8 +54,8 @@ def init_pserver(self, opt_info):
 
     def init_worker(self, opt_info):
         if "fleet_desc" in opt_info:
-            self.dist_desc_str_ = text_format.MessageToString(opt_info[
-                "fleet_desc"])
+            self.dist_desc_str_ = text_format.MessageToString(
+                opt_info["fleet_desc"])
             self.dist_desc_ = opt_info["fleet_desc"]
         else:
             print(
diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py
index 5c9b2def0761a..f1262ebae12ff 100644
--- a/python/paddle/fluid/distributed/ps_pb2.py
+++ b/python/paddle/fluid/distributed/ps_pb2.py
@@ -16,6 +16,7 @@
 # source: ps.proto
 
 import sys
+
 _b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1'))
 from google.protobuf.internal import enum_type_wrapper
 from google.protobuf import descriptor as _descriptor
@@ -42,15 +43,22 @@
     filename=None,
     file=DESCRIPTOR,
     values=[
-        _descriptor.EnumValueDescriptor(
-            name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SPARSE_TABLE',
+                                        index=0,
+                                        number=0,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_DENSE_TABLE',
+                                        index=1,
+                                        number=1,
+                                        options=None,
+                                        type=None),
     ],
     containing_type=None,
     options=None,
     serialized_start=3489,
-    serialized_end=3541, )
+    serialized_end=3541,
+)
 _sym_db.RegisterEnumDescriptor(_TABLETYPE)
 
 TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
@@ -60,82 +68,77 @@
     filename=None,
     file=DESCRIPTOR,
     values=[
-        _descriptor.EnumValueDescriptor(
-            name='PS_PULL_DENSE_TABLE',
-            index=0,
-            number=0,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PUSH_DENSE_TABLE',
-            index=1,
-            number=1,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PULL_SPARSE_TABLE',
-            index=2,
-            number=2,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PUSH_SPARSE_TABLE',
-            index=3,
-            number=3,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SAVE_ONE_TABLE',
-            index=5,
-            number=5,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SAVE_ALL_TABLE',
-            index=6,
-            number=6,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_LOAD_ONE_TABLE',
-            index=7,
-            number=7,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_LOAD_ALL_TABLE',
-            index=8,
-            number=8,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_CLEAR_ONE_TABLE',
-            index=9,
-            number=9,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_CLEAR_ALL_TABLE',
-            index=10,
-            number=10,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PUSH_DENSE_PARAM',
-            index=11,
-            number=11,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_STOP_SERVER', index=12, number=12, options=None,
-            type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PULL_DENSE_TABLE',
+                                        index=0,
+                                        number=0,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PUSH_DENSE_TABLE',
+                                        index=1,
+                                        number=1,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PULL_SPARSE_TABLE',
+                                        index=2,
+                                        number=2,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PUSH_SPARSE_TABLE',
+                                        index=3,
+                                        number=3,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SHRINK_TABLE',
+                                        index=4,
+                                        number=4,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SAVE_ONE_TABLE',
+                                        index=5,
+                                        number=5,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SAVE_ALL_TABLE',
+                                        index=6,
+                                        number=6,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_LOAD_ONE_TABLE',
+                                        index=7,
+                                        number=7,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_LOAD_ALL_TABLE',
+                                        index=8,
+                                        number=8,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_CLEAR_ONE_TABLE',
+                                        index=9,
+                                        number=9,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_CLEAR_ALL_TABLE',
+                                        index=10,
+                                        number=10,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PUSH_DENSE_PARAM',
+                                        index=11,
+                                        number=11,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_STOP_SERVER',
+                                        index=12,
+                                        number=12,
+                                        options=None,
+                                        type=None),
     ],
     containing_type=None,
     options=None,
     serialized_start=3544,
-    serialized_end=3861, )
+    serialized_end=3861,
+)
 _sym_db.RegisterEnumDescriptor(_PSCMDID)
 
 PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
@@ -161,15 +164,22 @@
     filename=None,
     file=DESCRIPTOR,
     values=[
-        _descriptor.EnumValueDescriptor(
-            name='HDFS', index=0, number=0, options=None, type=None),
-        _descriptor.EnumValueDescriptor(
-            name='AFS', index=1, number=1, options=None, type=None),
+        _descriptor.EnumValueDescriptor(name='HDFS',
+                                        index=0,
+                                        number=0,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='AFS',
+                                        index=1,
+                                        number=1,
+                                        options=None,
+                                        type=None),
     ],
     containing_type=None,
     options=None,
     serialized_start=3457,
-    serialized_end=3487, )
+    serialized_end=3487,
+)
 _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
 
 _PSPARAMETER = _descriptor.Descriptor(
@@ -179,38 +189,36 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='worker_class',
-            full_name='paddle.PSParameter.worker_class',
-            index=0,
-            number=1,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='server_class',
-            full_name='paddle.PSParameter.server_class',
-            index=1,
-            number=2,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='worker_class',
+                                    full_name='paddle.PSParameter.worker_class',
+                                    index=0,
+                                    number=1,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='server_class',
+                                    full_name='paddle.PSParameter.server_class',
+                                    index=1,
+                                    number=2,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='instance_class',
             full_name='paddle.PSParameter.instance_class',
@@ -227,38 +235,36 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='worker_param',
-            full_name='paddle.PSParameter.worker_param',
-            index=3,
-            number=101,
-            type=11,
-            cpp_type=10,
-            label=1,
-            has_default_value=False,
-            default_value=None,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='server_param',
-            full_name='paddle.PSParameter.server_param',
-            index=4,
-            number=102,
-            type=11,
-            cpp_type=10,
-            label=1,
-            has_default_value=False,
-            default_value=None,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='worker_param',
+                                    full_name='paddle.PSParameter.worker_param',
+                                    index=3,
+                                    number=101,
+                                    type=11,
+                                    cpp_type=10,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=None,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='server_param',
+                                    full_name='paddle.PSParameter.server_param',
+                                    index=4,
+                                    number=102,
+                                    type=11,
+                                    cpp_type=10,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=None,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='trainer_param',
             full_name='paddle.PSParameter.trainer_param',
@@ -301,7 +307,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=21,
-    serialized_end=307, )
+    serialized_end=307,
+)
 
 _WORKERPARAMETER = _descriptor.Descriptor(
     name='WorkerParameter',
@@ -336,7 +343,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=309,
-    serialized_end=390, )
+    serialized_end=390,
+)
 
 _SERVERPARAMETER = _descriptor.Descriptor(
     name='ServerParameter',
@@ -371,7 +379,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=392,
-    serialized_end=473, )
+    serialized_end=473,
+)
 
 _DOWNPOURWORKERPARAMETER = _descriptor.Descriptor(
     name='DownpourWorkerParameter',
@@ -406,7 +415,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=475,
-    serialized_end=554, )
+    serialized_end=554,
+)
 
 _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor(
     name='DownpourTrainerParameter',
@@ -521,7 +531,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=557,
-    serialized_end=810, )
+    serialized_end=810,
+)
 
 _PROGRAMCONFIG = _descriptor.Descriptor(
     name='ProgramConfig',
@@ -530,22 +541,21 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='program_id',
-            full_name='paddle.ProgramConfig.program_id',
-            index=0,
-            number=1,
-            type=9,
-            cpp_type=9,
-            label=2,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='program_id',
+                                    full_name='paddle.ProgramConfig.program_id',
+                                    index=0,
+                                    number=1,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=2,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='push_sparse_table_id',
             full_name='paddle.ProgramConfig.push_sparse_table_id',
@@ -620,7 +630,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=813,
-    serialized_end=966, )
+    serialized_end=966,
+)
 
 _DENSETABLEPARAMETER = _descriptor.Descriptor(
     name='DenseTableParameter',
@@ -703,7 +714,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=968,
-    serialized_end=1091, )
+    serialized_end=1091,
+)
 
 _SPARSETABLEPARAMETER = _descriptor.Descriptor(
     name='SparseTableParameter',
@@ -802,7 +814,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1093,
-    serialized_end=1215, )
+    serialized_end=1215,
+)
 
 _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor(
     name='DownpourServerParameter',
@@ -853,7 +866,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1218,
-    serialized_end=1352, )
+    serialized_end=1352,
+)
 
 _SERVERSERVICEPARAMETER = _descriptor.Descriptor(
     name='ServerServiceParameter',
@@ -952,7 +966,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1355,
-    serialized_end=1570, )
+    serialized_end=1570,
+)
 
 _TABLEPARAMETER = _descriptor.Descriptor(
     name='TableParameter',
@@ -961,22 +976,21 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='table_id',
-            full_name='paddle.TableParameter.table_id',
-            index=0,
-            number=1,
-            type=4,
-            cpp_type=4,
-            label=1,
-            has_default_value=False,
-            default_value=0,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='table_id',
+                                    full_name='paddle.TableParameter.table_id',
+                                    index=0,
+                                    number=1,
+                                    type=4,
+                                    cpp_type=4,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=0,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='table_class',
             full_name='paddle.TableParameter.table_class',
@@ -1009,38 +1023,36 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='accessor',
-            full_name='paddle.TableParameter.accessor',
-            index=3,
-            number=4,
-            type=11,
-            cpp_type=10,
-            label=1,
-            has_default_value=False,
-            default_value=None,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='type',
-            full_name='paddle.TableParameter.type',
-            index=4,
-            number=5,
-            type=14,
-            cpp_type=8,
-            label=1,
-            has_default_value=False,
-            default_value=0,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='accessor',
+                                    full_name='paddle.TableParameter.accessor',
+                                    index=3,
+                                    number=4,
+                                    type=11,
+                                    cpp_type=10,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=None,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='type',
+                                    full_name='paddle.TableParameter.type',
+                                    index=4,
+                                    number=5,
+                                    type=14,
+                                    cpp_type=8,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=0,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='compress_in_save',
             full_name='paddle.TableParameter.compress_in_save',
@@ -1067,7 +1079,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1573,
-    serialized_end=1764, )
+    serialized_end=1764,
+)
 
 _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
     name='TableAccessorParameter',
@@ -1214,7 +1227,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1767,
-    serialized_end=2136, )
+    serialized_end=2136,
+)
 
 _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
     name='DownpourTableAccessorParameter',
@@ -1305,7 +1319,8 @@
             options=None),
         _descriptor.FieldDescriptor(
             name='show_click_decay_rate',
-            full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate',
+            full_name=
+            'paddle.DownpourTableAccessorParameter.show_click_decay_rate',
             index=5,
             number=6,
             type=2,
@@ -1345,7 +1360,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2139,
-    serialized_end=2345, )
+    serialized_end=2345,
+)
 
 _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
     name='TableAccessorSaveParameter',
@@ -1412,7 +1428,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2347,
-    serialized_end=2430, )
+    serialized_end=2430,
+)
 
 _PSREQUESTMESSAGE = _descriptor.Descriptor(
     name='PsRequestMessage',
@@ -1421,22 +1438,21 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='cmd_id',
-            full_name='paddle.PsRequestMessage.cmd_id',
-            index=0,
-            number=1,
-            type=13,
-            cpp_type=3,
-            label=2,
-            has_default_value=False,
-            default_value=0,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='cmd_id',
+                                    full_name='paddle.PsRequestMessage.cmd_id',
+                                    index=0,
+                                    number=1,
+                                    type=13,
+                                    cpp_type=3,
+                                    label=2,
+                                    has_default_value=False,
+                                    default_value=0,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='table_id',
             full_name='paddle.PsRequestMessage.table_id',
@@ -1453,22 +1469,21 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='params',
-            full_name='paddle.PsRequestMessage.params',
-            index=2,
-            number=3,
-            type=12,
-            cpp_type=9,
-            label=3,
-            has_default_value=False,
-            default_value=[],
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='params',
+                                    full_name='paddle.PsRequestMessage.params',
+                                    index=2,
+                                    number=3,
+                                    type=12,
+                                    cpp_type=9,
+                                    label=3,
+                                    has_default_value=False,
+                                    default_value=[],
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='client_id',
             full_name='paddle.PsRequestMessage.client_id',
@@ -1485,22 +1500,21 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='data',
-            full_name='paddle.PsRequestMessage.data',
-            index=4,
-            number=5,
-            type=12,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b(""),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='data',
+                                    full_name='paddle.PsRequestMessage.data',
+                                    index=4,
+                                    number=5,
+                                    type=12,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b(""),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
     ],
     extensions=[],
     nested_types=[],
@@ -1511,7 +1525,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2432,
-    serialized_end=2533, )
+    serialized_end=2533,
+)
 
 _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
     name='SparseSGDRuleParameter',
@@ -1594,7 +1609,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2535,
-    serialized_end=2654, )
+    serialized_end=2654,
+)
 
 _DENSESGDRULEPARAMETER = _descriptor.Descriptor(
     name='DenseSGDRuleParameter',
@@ -1693,7 +1709,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2657,
-    serialized_end=2882, )
+    serialized_end=2882,
+)
 
 _ADAMSGDPARAMETER = _descriptor.Descriptor(
     name='AdamSGDParameter',
@@ -1792,7 +1809,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2885,
-    serialized_end=3019, )
+    serialized_end=3019,
+)
 
 _NAIVESGDPARAMETER = _descriptor.Descriptor(
     name='NaiveSGDParameter',
@@ -1843,7 +1861,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3021,
-    serialized_end=3087, )
+    serialized_end=3087,
+)
 
 _SUMMARYSGDPARAMETER = _descriptor.Descriptor(
     name='SummarySGDParameter',
@@ -1878,7 +1897,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3089,
-    serialized_end=3148, )
+    serialized_end=3148,
+)
 
 _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
     name='MovingAverageRuleParameter',
@@ -1913,7 +1933,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3150,
-    serialized_end=3196, )
+    serialized_end=3196,
+)
 
 _PSRESPONSEMESSAGE = _descriptor.Descriptor(
     name='PsResponseMessage',
@@ -1954,22 +1975,21 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='data',
-            full_name='paddle.PsResponseMessage.data',
-            index=2,
-            number=3,
-            type=12,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b(""),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='data',
+                                    full_name='paddle.PsResponseMessage.data',
+                                    index=2,
+                                    number=3,
+                                    type=12,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b(""),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
     ],
     extensions=[],
     nested_types=[],
@@ -1980,7 +2000,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3198,
-    serialized_end=3271, )
+    serialized_end=3271,
+)
 
 _FSCLIENTPARAMETER = _descriptor.Descriptor(
     name='FsClientParameter',
@@ -2005,54 +2026,51 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='uri',
-            full_name='paddle.FsClientParameter.uri',
-            index=1,
-            number=2,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='user',
-            full_name='paddle.FsClientParameter.user',
-            index=2,
-            number=3,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='passwd',
-            full_name='paddle.FsClientParameter.passwd',
-            index=3,
-            number=4,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='uri',
+                                    full_name='paddle.FsClientParameter.uri',
+                                    index=1,
+                                    number=2,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='user',
+                                    full_name='paddle.FsClientParameter.user',
+                                    index=2,
+                                    number=3,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='passwd',
+                                    full_name='paddle.FsClientParameter.passwd',
+                                    index=3,
+                                    number=4,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='buffer_size',
             full_name='paddle.FsClientParameter.buffer_size',
@@ -2104,14 +2122,17 @@
     ],
     extensions=[],
     nested_types=[],
-    enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ],
+    enum_types=[
+        _FSCLIENTPARAMETER_FSAPITYPE,
+    ],
     options=None,
     is_extendable=False,
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
     serialized_start=3274,
-    serialized_end=3487, )
+    serialized_end=3487,
+)
 
 _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
 _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
@@ -2193,121 +2214,109 @@
 PSParameter = _reflection.GeneratedProtocolMessageType(
     'PSParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PSPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.PSParameter)
-    ))
+    dict(DESCRIPTOR=_PSPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.PSParameter)
+         ))
 _sym_db.RegisterMessage(PSParameter)
 
 WorkerParameter = _reflection.GeneratedProtocolMessageType(
     'WorkerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_WORKERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
-    ))
+    dict(DESCRIPTOR=_WORKERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
+         ))
 _sym_db.RegisterMessage(WorkerParameter)
 
 ServerParameter = _reflection.GeneratedProtocolMessageType(
     'ServerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SERVERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
-    ))
+    dict(DESCRIPTOR=_SERVERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
+         ))
 _sym_db.RegisterMessage(ServerParameter)
 
 DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType(
     'DownpourWorkerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DOWNPOURWORKERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
-    ))
+    dict(DESCRIPTOR=_DOWNPOURWORKERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
+         ))
 _sym_db.RegisterMessage(DownpourWorkerParameter)
 
 DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType(
     'DownpourTrainerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DOWNPOURTRAINERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
-    ))
+    dict(DESCRIPTOR=_DOWNPOURTRAINERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
+         ))
 _sym_db.RegisterMessage(DownpourTrainerParameter)
 
 ProgramConfig = _reflection.GeneratedProtocolMessageType(
     'ProgramConfig',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PROGRAMCONFIG,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.ProgramConfig)
-    ))
+    dict(DESCRIPTOR=_PROGRAMCONFIG,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.ProgramConfig)
+         ))
 _sym_db.RegisterMessage(ProgramConfig)
 
 DenseTableParameter = _reflection.GeneratedProtocolMessageType(
     'DenseTableParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DENSETABLEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
-    ))
+    dict(DESCRIPTOR=_DENSETABLEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
+         ))
 _sym_db.RegisterMessage(DenseTableParameter)
 
 SparseTableParameter = _reflection.GeneratedProtocolMessageType(
     'SparseTableParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SPARSETABLEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
-    ))
+    dict(DESCRIPTOR=_SPARSETABLEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
+         ))
 _sym_db.RegisterMessage(SparseTableParameter)
 
 DownpourServerParameter = _reflection.GeneratedProtocolMessageType(
     'DownpourServerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DOWNPOURSERVERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
-    ))
+    dict(DESCRIPTOR=_DOWNPOURSERVERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
+         ))
 _sym_db.RegisterMessage(DownpourServerParameter)
 
 ServerServiceParameter = _reflection.GeneratedProtocolMessageType(
     'ServerServiceParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SERVERSERVICEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
-    ))
+    dict(DESCRIPTOR=_SERVERSERVICEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
+         ))
 _sym_db.RegisterMessage(ServerServiceParameter)
 
 TableParameter = _reflection.GeneratedProtocolMessageType(
     'TableParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_TABLEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.TableParameter)
-    ))
+    dict(DESCRIPTOR=_TABLEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.TableParameter)
+         ))
 _sym_db.RegisterMessage(TableParameter)
 
 TableAccessorParameter = _reflection.GeneratedProtocolMessageType(
     'TableAccessorParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_TABLEACCESSORPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
-    ))
+    dict(DESCRIPTOR=_TABLEACCESSORPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
+         ))
 _sym_db.RegisterMessage(TableAccessorParameter)
 
 DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType(
@@ -2333,61 +2342,55 @@
 PsRequestMessage = _reflection.GeneratedProtocolMessageType(
     'PsRequestMessage',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PSREQUESTMESSAGE,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
-    ))
+    dict(DESCRIPTOR=_PSREQUESTMESSAGE,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
+         ))
 _sym_db.RegisterMessage(PsRequestMessage)
 
 SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
     'SparseSGDRuleParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SPARSESGDRULEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
-    ))
+    dict(DESCRIPTOR=_SPARSESGDRULEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
+         ))
 _sym_db.RegisterMessage(SparseSGDRuleParameter)
 
 DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
     'DenseSGDRuleParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DENSESGDRULEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
-    ))
+    dict(DESCRIPTOR=_DENSESGDRULEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
+         ))
 _sym_db.RegisterMessage(DenseSGDRuleParameter)
 
 AdamSGDParameter = _reflection.GeneratedProtocolMessageType(
     'AdamSGDParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_ADAMSGDPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
-    ))
+    dict(DESCRIPTOR=_ADAMSGDPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
+         ))
 _sym_db.RegisterMessage(AdamSGDParameter)
 
 NaiveSGDParameter = _reflection.GeneratedProtocolMessageType(
     'NaiveSGDParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_NAIVESGDPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
-    ))
+    dict(DESCRIPTOR=_NAIVESGDPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
+         ))
 _sym_db.RegisterMessage(NaiveSGDParameter)
 
 SummarySGDParameter = _reflection.GeneratedProtocolMessageType(
     'SummarySGDParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SUMMARYSGDPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
-    ))
+    dict(DESCRIPTOR=_SUMMARYSGDPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
+         ))
 _sym_db.RegisterMessage(SummarySGDParameter)
 
 MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType(
@@ -2403,21 +2406,19 @@
 PsResponseMessage = _reflection.GeneratedProtocolMessageType(
     'PsResponseMessage',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PSRESPONSEMESSAGE,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
-    ))
+    dict(DESCRIPTOR=_PSRESPONSEMESSAGE,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
+         ))
 _sym_db.RegisterMessage(PsResponseMessage)
 
 FsClientParameter = _reflection.GeneratedProtocolMessageType(
     'FsClientParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_FSCLIENTPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
-    ))
+    dict(DESCRIPTOR=_FSCLIENTPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
+         ))
 _sym_db.RegisterMessage(FsClientParameter)
 
 DESCRIPTOR.has_options = True
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index 5da5dbbd7bdfc..f441a35ca0ff8 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -181,8 +181,8 @@ def check_models(models):
     for model in models:
         if not isinstance(model, paddle.nn.Layer):
             raise RuntimeError(
-                "Current train mode is pure fp16, models should be paddle.nn.Layer, but receive {}.".
-                format(type(model)))
+                "Current train mode is pure fp16, models should be paddle.nn.Layer, but receive {}."
+                .format(type(model)))
         if isinstance(model, paddle.DataParallel):
             raise RuntimeError(
                 "For distributed AMP training, you should first use paddle.amp.decorate() to decotate origin model, and then call paddle.DataParallel get distributed model."
@@ -191,11 +191,12 @@ def check_models(models):
 
 def check_optimizers(optimizers):
     for optimizer in optimizers:
-        if not isinstance(optimizer, (paddle.optimizer.Optimizer,
-                                      paddle.fluid.optimizer.Optimizer)):
+        if not isinstance(
+                optimizer,
+            (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer)):
             raise RuntimeError(
-                "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".
-                format(type(optimizer)))
+                "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}."
+                .format(type(optimizer)))
 
 
 @signature_safe_contextmanager
@@ -273,11 +274,11 @@ def amp_guard(enable=True,
     # check device_type:
     # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16.
     # Maybe we will support cpu for bfloat16.
-    if enable and not (tracer._expected_place.is_gpu_place() or
-                       tracer._expected_place.is_xpu_place() or
-                       tracer._expected_place.is_mlu_place() or
-                       tracer._expected_place.is_npu_place() or
-                       tracer._expected_place.is_custom_place()):
+    if enable and not (tracer._expected_place.is_gpu_place()
+                       or tracer._expected_place.is_xpu_place()
+                       or tracer._expected_place.is_mlu_place()
+                       or tracer._expected_place.is_npu_place()
+                       or tracer._expected_place.is_custom_place()):
         warnings.warn(
             'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.'
             % tracer._expected_place)
@@ -384,6 +385,7 @@ def amp_guard(enable=True,
 
 
 class StateDictHook(object):
+
     def __init__(self, save_dtype):
         self._save_dtype = save_dtype
 
@@ -492,8 +494,9 @@ def amp_decorate(models,
     if optimizers is not None:
         # check optimizers
         optimizers_is_list = False
-        if isinstance(optimizers, (paddle.optimizer.Optimizer,
-                                   paddle.fluid.optimizer.Optimizer)):
+        if isinstance(
+                optimizers,
+            (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer)):
             optimizers_is_list = False
             optimizers = [optimizers]
             check_optimizers(optimizers)
@@ -504,7 +507,7 @@ def amp_decorate(models,
             raise TypeError(
                 "optimizers must be either a single optimizer or a list of optimizers."
             )
-        # supprot master_weight    
+        # support master_weight
         for idx_opt in range(len(optimizers)):
             if hasattr(optimizers[idx_opt], '_multi_precision'):
                 if master_weight is False:
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
index df79b5ab5e482..9da69b1e45e0b 100644
--- a/python/paddle/fluid/dygraph/amp/loss_scaler.py
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -104,11 +104,11 @@ def __init__(self,
             raise ValueError(
                 "current_tracer is None, maybe it is not in imperative mode.")
 
-        if enable and not (tracer._expected_place.is_gpu_place() or
-                           tracer._expected_place.is_xpu_place() or
-                           tracer._expected_place.is_mlu_place() or
-                           tracer._expected_place.is_npu_place() or
-                           tracer._expected_place.is_custom_place()):
+        if enable and not (tracer._expected_place.is_gpu_place()
+                           or tracer._expected_place.is_xpu_place()
+                           or tracer._expected_place.is_mlu_place()
+                           or tracer._expected_place.is_npu_place()
+                           or tracer._expected_place.is_custom_place()):
             warnings.warn(
                 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.'
                 % tracer._expected_place)
@@ -279,15 +279,13 @@ def _unscale(self, optimizer):
             ]
             param_grads_fp16 = [
                 param._grad_ivar() for param in optimizer._parameter_list
-                if (param._grad_ivar() is not None
-                    ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16
-                           )
+                if (param._grad_ivar() is not None) and (
+                    param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
             ]
             param_grads_fp32 = [
                 param._grad_ivar() for param in optimizer._parameter_list
-                if (param._grad_ivar() is not None
-                    ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
-                           )
+                if (param._grad_ivar() is not None) and (
+                    param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
             ]
         if core.is_compiled_with_npu():
             float_status = _C_ops.alloc_float_status()
@@ -332,10 +330,9 @@ def _update(self):
             self._decr_count = self._decr_count + 1
             if self._decr_count == self._decr_every_n_nan_or_inf:
                 print(
-                    'Found inf or nan, current scale is: {}, decrease to: {}*{}'.
-                    format(
-                        float(self._scale),
-                        float(self._scale), float(self._decr_ratio)))
+                    'Found inf or nan, current scale is: {}, decrease to: {}*{}'
+                    .format(float(self._scale), float(self._scale),
+                            float(self._decr_ratio)))
                 self._scale = self._scale * self._decr_ratio
                 self._decr_count = 0
         else:
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 6e61f998b28dc..4e22af9cfdb64 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -46,6 +46,7 @@ def in_declarative_mode():
 
 
 def _switch_to_static_graph_(func):
+
     def __impl__(*args, **kwargs):
         with framework._dygraph_guard(None):
             return func(*args, **kwargs)
@@ -85,8 +86,8 @@ def program_desc_tracing_guard(enable):
 @signature_safe_contextmanager
 def param_guard(parameters):
     # Note: parameters is a reference of self._parameters or self._buffers
-    if in_declarative_mode() and not framework._non_static_mode(
-    ) and parameters:
+    if in_declarative_mode(
+    ) and not framework._non_static_mode() and parameters:
         origin_parameters = parameters.copy()
         for name, var_base in parameters.items():
             if isinstance(var_base, list):
@@ -124,8 +125,8 @@ def _convert_into_variable(tensor):
             # non-persistable. See case of `drop_state` in lstm api.
             is_persistable = len(tensor.shape) > 0
 
-            new_var = tensor._to_static_var(
-                to_parameter=False, persistable=is_persistable)
+            new_var = tensor._to_static_var(to_parameter=False,
+                                            persistable=is_persistable)
         return new_var
     else:
         return tensor
@@ -348,6 +349,7 @@ def test_layer():
     """
 
     def __call__(self, func):
+
         @decorator.decorator
         def _decorate_function(func, *args, **kwargs):
             with self:
@@ -569,8 +571,8 @@ def check_in_out(in_out_list, name):
             for each_var in in_out_list:
                 if _in_eager_without_dygraph_check():
                     assert isinstance(
-                        each_var, core.eager.
-                        Tensor), "Elements of {} must be Tensor".format(name)
+                        each_var, core.eager.Tensor
+                    ), "Elements of {} must be Tensor".format(name)
                 else:
                     assert isinstance(
                         each_var,
@@ -580,8 +582,8 @@ def check_in_out(in_out_list, name):
         else:
             if _in_eager_without_dygraph_check():
                 assert isinstance(
-                    in_out_list, core.eager.
-                    Tensor), "{} must be Tensor or list of Tensor".format(name)
+                    in_out_list, core.eager.Tensor
+                ), "{} must be Tensor or list of Tensor".format(name)
             else:
                 assert isinstance(
                     in_out_list, core.VarBase
@@ -632,7 +634,8 @@ def check_in_out(in_out_list, name):
     else:
         if _in_eager_without_dygraph_check():
             raise AssertionError(
-                "no_grad_vars must be None, Tensor or list/tuple/set of Tensors")
+                "no_grad_vars must be None, Tensor or list/tuple/set of Tensors"
+            )
         else:
             raise AssertionError(
                 "no_grad_vars must be None, Variable or list/tuple/set of Variables"
@@ -652,15 +655,17 @@ def check_in_out(in_out_list, name):
     assert only_inputs, "only_inputs=False is not supported yet"
 
     if _in_eager_without_dygraph_check():
-        return core.eager.run_partial_grad(
-            outputs, inputs, grad_outputs, retain_graph, create_graph,
-            only_inputs, allow_unused, no_grad_vars)
+        return core.eager.run_partial_grad(outputs, inputs, grad_outputs,
+                                           retain_graph, create_graph,
+                                           only_inputs, allow_unused,
+                                           no_grad_vars)
     else:
         place = core.Place()
         place.set_place(framework._current_expected_place())
-        return core.dygraph_partial_grad(
-            inputs, outputs, grad_outputs, no_grad_vars, place, create_graph,
-            retain_graph, allow_unused, only_inputs)
+        return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
+                                         no_grad_vars, place, create_graph,
+                                         retain_graph, allow_unused,
+                                         only_inputs)
 
 
 @framework.dygraph_only
@@ -756,14 +761,13 @@ def to_variable(value, name=None, zero_copy=None, dtype=None):
                 value = value.astype(dtype)
 
         if _in_eager_without_dygraph_check():
-            return core.eager.Tensor(value,
-                                     framework._current_expected_place(), False,
-                                     zero_copy, name if name else None, True)
+            return core.eager.Tensor(value, framework._current_expected_place(),
+                                     False, zero_copy, name if name else None,
+                                     True)
         else:
-            py_var = core.VarBase(
-                value=value,
-                place=framework._current_expected_place(),
-                persistable=False,
-                zero_copy=zero_copy,
-                name=name if name else '')
+            py_var = core.VarBase(value=value,
+                                  place=framework._current_expected_place(),
+                                  persistable=False,
+                                  zero_copy=zero_copy,
+                                  name=name if name else '')
             return py_var
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index ba5c709b1d877..0fe5d236a58e5 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -119,7 +119,7 @@ def save_dygraph(state_dict, model_path):
         pickle.dump(model_dict, f, protocol=2)
 
 
-# NOTE(chenweihang): load_dygraph will deprecated in future, we don't 
+# NOTE(chenweihang): load_dygraph will be deprecated in the future, we don't
 # support new loading features for it
 # TODO(qingqing01): remove dygraph_only to support loading static model.
 # maybe need to unify the loading interface after 2.0 API is ready.
@@ -217,11 +217,11 @@ def load_dygraph(model_path, **configs):
         if os.path.exists(model_file_path):
             # Load state dict by `jit.save/io.save_inference_model` save format
             # NOTE(chenweihang): [ Compatibility of save_inference_model save format ]
-            # The model saved by `save_inference_model` does not completely correspond to 
-            # the information required by the `state_dict` under the dygraph. 
-            # `save_inference_model` not save structured name, we need to remind 
+            # The model saved by `save_inference_model` does not completely correspond to
+            # the information required by the `state_dict` under the dygraph.
+            # `save_inference_model` does not save structured names, we need to remind
             # the user to configure the `use_structured_name` argument when `set_state_dict`
-            # NOTE(chenweihang): `jit.save` doesn't save optimizer state 
+            # NOTE(chenweihang): `jit.save` doesn't save optimizer state
 
             # 1. load program desc & construct _ProgramHolder
             programs = _construct_program_holders(model_path,
@@ -259,11 +259,11 @@ def load_dygraph(model_path, **configs):
             # load state dict by `io.save_params/persistables` save format
             # TODO(chenweihang): [ Now only supports loading parameters separately ]
             # If users save all parameters as one file, the [ variable.name -> variable ]
-            # mapping info will lost, so users need to give variable list, but users build 
+            # mapping info will be lost, so users need to give variable list, but users build
             # variable list in dygraph mode is difficult, we recommend users to use
             # paddle.static.load_program_state in this case
 
-            # Try to load all the files in the directory in VarBase format, 
+            # Try to load all the files in the directory in VarBase format,
             # the file name is used as the name of VarBase
             load_var_list = []
 
diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py
index ca40781a5f9c4..854df39355748 100644
--- a/python/paddle/fluid/dygraph/container.py
+++ b/python/paddle/fluid/dygraph/container.py
@@ -222,8 +222,8 @@ def _get_abs_idx(self, idx):
         if isinstance(idx, int):
             if not (-len(self) <= idx < len(self)):
                 raise IndexError(
-                    'index {} is out of range, should be an integer in range [{}, {})'.
-                    format(idx, -len(self), len(self)))
+                    'index {} is out of range, should be an integer in range [{}, {})'
+                    .format(idx, -len(self), len(self)))
             if idx < 0:
                 idx += len(self)
         return idx
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
index 401ad1c8e84e4..b85a2137dad81 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
@@ -60,7 +60,8 @@ def transform(self):
                 i += len(new_stmts)
                 return new_stmts
         raise ValueError(
-            "parent_node doesn't contain the loop_node in ForToWhileTransformer")
+            "parent_node doesn't contain the loop_node in ForToWhileTransformer"
+        )
 
     def get_for_stmt_nodes(self, node):
         assert isinstance(
@@ -74,12 +75,13 @@ def get_for_stmt_nodes(self, node):
         init_stmts, cond_stmt, body_stmts = stmts_tuple
 
         # 2. append break statement
-        new_cond_stmt = gast.BoolOp(
-            op=gast.And(), values=[cond_stmt, self.condition_node])
+        new_cond_stmt = gast.BoolOp(op=gast.And(),
+                                    values=[cond_stmt, self.condition_node])
 
         # 3. construct gast.While node
-        while_node = gast.While(
-            test=new_cond_stmt, body=body_stmts, orelse=node.orelse)
+        while_node = gast.While(test=new_cond_stmt,
+                                body=body_stmts,
+                                orelse=node.orelse)
         init_stmts.append(while_node)
         return init_stmts
 
@@ -141,17 +143,15 @@ def visit_Break(self, node):
         assign_false_node = create_fill_constant_node(variable_name, False)
         self._add_stmt_before_cur_node(loop_node_index, assign_false_node)
 
-        cond_var_node = gast.UnaryOp(
-            op=gast.Not(),
-            operand=gast.Name(
-                id=variable_name,
-                ctx=gast.Load(),
-                annotation=None,
-                type_comment=None))
+        cond_var_node = gast.UnaryOp(op=gast.Not(),
+                                     operand=gast.Name(id=variable_name,
+                                                       ctx=gast.Load(),
+                                                       annotation=None,
+                                                       type_comment=None))
 
         if isinstance(loop_node, gast.While):
-            loop_node.test = gast.BoolOp(
-                op=gast.And(), values=[loop_node.test, cond_var_node])
+            loop_node.test = gast.BoolOp(op=gast.And(),
+                                         values=[loop_node.test, cond_var_node])
         elif isinstance(loop_node, gast.For):
             parent_node = self.ancestor_nodes[loop_node_index - 1]
             for_to_while = ForToWhileTransformer(parent_node, loop_node,
@@ -180,8 +180,9 @@ def visit_Continue(self, node):
         assign_false_node = create_fill_constant_node(variable_name, False)
         loop_node.body.insert(0, assign_false_node)
 
-    def _remove_stmts_after_break_continue(
-            self, break_continue_node, break_continue_name, loop_node_index):
+    def _remove_stmts_after_break_continue(self, break_continue_node,
+                                           break_continue_name,
+                                           loop_node_index):
         for first_block_index in range(
                 len(self.ancestor_nodes) - 1, loop_node_index - 1, -1):
             first_block = self.ancestor_nodes[first_block_index]
@@ -214,8 +215,9 @@ def _replace_if_stmt(self, loop_node_index, first_block_index,
                         cur_node.orelse, son_node, break_continue_name):
                 continue
 
-    def _replace_break_continue_in_stmt_list(
-            self, stmt_list, break_continue_node, break_continue_name):
+    def _replace_break_continue_in_stmt_list(self, stmt_list,
+                                             break_continue_node,
+                                             break_continue_name):
         i = index_in_list(stmt_list, break_continue_node)
         if i == -1:
             return False
@@ -233,13 +235,12 @@ def _replace_after_node_to_if_in_stmt_list(self, stmt_list, node,
             # No need to add, we consider this as added successfully
             return True
 
-        if_stmt = gast.If(test=gast.UnaryOp(
-            op=gast.Not(),
-            operand=gast.Name(
-                id=break_continue_name,
-                ctx=gast.Store(),
-                annotation=None,
-                type_comment=None)),
+        if_stmt = gast.If(test=gast.UnaryOp(op=gast.Not(),
+                                            operand=gast.Name(
+                                                id=break_continue_name,
+                                                ctx=gast.Store(),
+                                                annotation=None,
+                                                type_comment=None)),
                           body=stmt_list[i + 1:],
                           orelse=[])
         stmt_list[i + 1:] = []
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
index 50733e4d896e4..bf7791c788ccf 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
@@ -39,8 +39,8 @@ def visit_Call(self, node):
         func_str = ast_to_source_code(node.func).strip()
         if func_str in self._castable_type and len(node.args) > 0:
             args_str = ast_to_source_code(node.args[0]).strip()
-            new_func_str = "_jst.convert_var_dtype({}, '{}')".format(args_str,
-                                                                     func_str)
+            new_func_str = "_jst.convert_var_dtype({}, '{}')".format(
+                args_str, func_str)
             new_node = gast.parse(new_func_str).body[0].value
             return new_node
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
index 0b009c0049dcb..cf3383f5d0638 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
@@ -96,8 +96,8 @@ def is_unsupported(func):
             if func_in_dict:
                 translator_logger.log(
                     2,
-                    "Whitelist: {} is part of built-in module and does not have to be transformed.".
-                    format(func))
+                    "Whitelist: {} is part of built-in module and does not have to be transformed."
+                    .format(func))
                 return True
 
     # NOTE: should be placed before `is_paddle_func`
@@ -107,8 +107,8 @@ def is_unsupported(func):
     if is_paddle_func(func):
         translator_logger.log(
             2,
-            "Whitelist: {} is part of Paddle module and does not have to be transformed.".
-            format(func))
+            "Whitelist: {} is part of Paddle module and does not have to be transformed."
+            .format(func))
         return True
 
 
@@ -161,8 +161,8 @@ def dyfunc(x):
     if options is not None and options.not_convert:
         translator_logger.log(
             2,
-            "{} is not converted when it is decorated by 'paddle.jit.not_to_static'.".
-            format(func))
+            "{} is not converted when it is decorated by 'paddle.jit.not_to_static'."
+            .format(func))
         return func
 
     if is_builtin_len(func):
@@ -175,7 +175,7 @@ def dyfunc(x):
         return func
 
     if inspect.isgeneratorfunction(func):
-        # NOTE(xiongkun03): inspect.isfunction() will return True even though func is a generator function. 
+        # NOTE(xiongkun03): inspect.isfunction() will return True even though func is a generator function.
         # If we don't deal generatorfunction here, we will regard it as normal function and get errors in some
         # occasion.
         number_of_stars = 30
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
index 576baf6cc299a..7933ddfe590c9 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
@@ -219,8 +219,8 @@ def _remove_no_value_return_var(out):
         align_ret = out[0]
         if isinstance(align_ret, tuple):
             for index, item in enumerate(align_ret):
-                if isinstance(item, Variable) and (
-                        RETURN_NO_VALUE_VAR_NAME in item.name):
+                if isinstance(item, Variable) and (RETURN_NO_VALUE_VAR_NAME
+                                                   in item.name):
                     # return None
                     if index == 0:
                         processed_out = (None, ) + out[1:]
@@ -231,8 +231,8 @@ def _remove_no_value_return_var(out):
                     break
 
         for index, item in enumerate(processed_out):
-            if isinstance(item, Variable) and (
-                    RETURN_NO_VALUE_VAR_NAME in item.name):
+            if isinstance(item, Variable) and (RETURN_NO_VALUE_VAR_NAME
+                                               in item.name):
                 processed_out = processed_out[:index]
 
         if not processed_out:
@@ -316,8 +316,8 @@ def has_negative(list_shape, idx=None):
     #      # Assume x.shape=[3, -1] in static mode
     #      y = paddle.reshape(x, shape=[1, x.shape[1]])
     #      ```
-    if isinstance(x, Variable) and (in_control_flow or has_negative(x.shape,
-                                                                    idx)):
+    if isinstance(x, Variable) and (in_control_flow
+                                    or has_negative(x.shape, idx)):
         return nn.shape(x) if idx is None else nn.shape(x)[idx]
     else:
         return x.shape if idx is None else x.shape[idx]
@@ -549,6 +549,7 @@ def body(i, new_array):
 # TODO(liym27): A better way to slice tensor array.
 #  Maybe support start == end for slice op.
 def _slice_tensor_array(array, start, end):
+
     def true_fn():
         null_array = create_array("float32")
         return null_array
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
index 69ec89a5af644..c422c5269e75d 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
@@ -114,9 +114,9 @@ def __init__(self, location, function_name):
         for i in range(len(self.source_code)):
             # if source_code[i] is empty line between two code line, dont add blank
             if self.source_code[i]:
-                self.source_code[i] = ' ' * (blank_count[i] - min_black_count +
-                                             BLANK_COUNT_BEFORE_FILE_STR * 2
-                                             ) + self.source_code[i]
+                self.source_code[i] = ' ' * (
+                    blank_count[i] - min_black_count +
+                    BLANK_COUNT_BEFORE_FILE_STR * 2) + self.source_code[i]
 
     def formated_message(self):
         msg = ' ' * BLANK_COUNT_BEFORE_FILE_STR + 'File "{}", line {}, in {}\n'.format(
@@ -126,6 +126,7 @@ def formated_message(self):
 
 
 class SuggestionDict(object):
+
     def __init__(self):
         # {(keywords): (suggestions)}
         self.suggestion_dict = {
@@ -212,16 +213,16 @@ def create_message(self):
             -1] + 1 if user_code_traceback_index else 0
         for filepath, lineno, funcname, code in self.origin_traceback[
                 paddle_traceback_start_index:]:
-            traceback_frame = TraceBackFrame(
-                Location(filepath, lineno), funcname, code)
+            traceback_frame = TraceBackFrame(Location(filepath, lineno),
+                                             funcname, code)
             message_lines.append(traceback_frame.formated_message())
         message_lines.append("")
 
         # Step3: Adds error message like "TypeError: dtype must be int32, but received float32".
         # NOTE: `format_exception` is a list, its length is 1 in most cases, but sometimes its length
         # is gather than 1, for example, the error_type is IndentationError.
-        format_exception = traceback.format_exception_only(self.error_type,
-                                                           self.error_value)
+        format_exception = traceback.format_exception_only(
+            self.error_type, self.error_value)
         error_message = [
             " " * BLANK_COUNT_BEFORE_FILE_STR + line
             for line in format_exception
@@ -281,8 +282,8 @@ def _simplify_error_value(self):
             if error_value_lines_strip[i].startswith("File "):
                 re_result = re.search(pattern, error_value_lines_strip[i])
                 tmp_filepath, lineno_str, function_name = re_result.groups()
-                code = error_value_lines_strip[i + 1] if i + 1 < len(
-                    error_value_lines_strip) else ''
+                code = error_value_lines_strip[
+                    i + 1] if i + 1 < len(error_value_lines_strip) else ''
                 if i == 0:
                     user_filepath = tmp_filepath
                 if tmp_filepath == user_filepath:
@@ -299,8 +300,8 @@ def _simplify_error_value(self):
                 traceback_frame = TraceBackFrameRange(
                     Location(filepath, lineno), funcname)
             else:
-                traceback_frame = TraceBackFrame(
-                    Location(filepath, lineno), funcname, code)
+                traceback_frame = TraceBackFrame(Location(filepath, lineno),
+                                                 funcname, code)
             error_frame.append(traceback_frame.formated_message())
         error_frame.append("")
 
@@ -309,8 +310,8 @@ def _simplify_error_value(self):
             -1] + 1 if user_code_traceback_index else 0
         for filepath, lineno, funcname, code in error_traceback[
                 paddle_traceback_start_index:]:
-            traceback_frame = TraceBackFrame(
-                Location(filepath, lineno), funcname, code)
+            traceback_frame = TraceBackFrame(Location(filepath, lineno),
+                                             funcname, code)
             error_frame.append(traceback_frame.formated_message())
         error_frame.append("")
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
index 900541459f6fc..e8afef0946898 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -73,8 +73,8 @@ def unified_args_and_kwargs(self, args, kwargs):
         """
         if len(self._arg_names) < len(args):
             error_msg = "The decorated function `{}` requires {} arguments: {}, but received {} with {}.".format(
-                self._dygraph_function.__name__,
-                len(self._arg_names), self._arg_names, len(args), args)
+                self._dygraph_function.__name__, len(self._arg_names),
+                self._arg_names, len(args), args)
             if args and inspect.isclass(args[0]):
                 error_msg += "\n\tMaybe the function has more than one decorator, we don't support this for now."
                 raise NotImplementedError(error_msg)
@@ -91,9 +91,9 @@ def unified_args_and_kwargs(self, args, kwargs):
             else:
                 if arg_name not in self._default_kwargs:
                     raise ValueError(
-                        "`{}()` requires `{}` arguments, but not found in input `args`: {} and `kwargs`: {}.".
-                        format(self._dygraph_function.__name__, arg_name, args,
-                               kwargs))
+                        "`{}()` requires `{}` arguments, but not found in input `args`: {} and `kwargs`: {}."
+                        .format(self._dygraph_function.__name__, arg_name, args,
+                                kwargs))
                 args.append(self._default_kwargs[arg_name])
 
         return tuple(args), kwargs
@@ -136,16 +136,16 @@ def args_to_input_spec(self, args, kwargs):
             # So we don't support to deal this case while specificing `input_spec` currently.
             if kwargs:
                 raise ValueError(
-                    "{} got unexpected keyword arguments: {}. Cannot trace the function when `input_spec` is specificed.".
-                    format(self._dygraph_function.__name__, kwargs))
+                    "{} got unexpected keyword arguments: {}. Cannot trace the function when `input_spec` is specificed."
+                    .format(self._dygraph_function.__name__, kwargs))
 
             # Note: The length of `input_spec` can be greater than `args`,
             # because `args` may contains non-tensor value merged form `kwargs`
             # after `unified_args_and_kwargs`.
             if len(args) < len(self._input_spec):
                 raise ValueError(
-                    "Requires len(arguments) >= len(input_spec), but received len(args):{} < len(InputSpec): {}".
-                    format(len(args), len(self._input_spec)))
+                    "Requires len(arguments) >= len(input_spec), but received len(args):{} < len(InputSpec): {}"
+                    .format(len(args), len(self._input_spec)))
 
             # replace argument with corresponding InputSpec.
             args_with_spec = convert_to_input_spec(args, self._input_spec)
@@ -196,8 +196,8 @@ def _verify_input_spec(self, input_spec):
         """
         if not isinstance(input_spec, (tuple, list)):
             raise TypeError(
-                "The type(input_spec) should be one of (tuple, list), but received {}.".
-                format(type_name(input_spec)))
+                "The type(input_spec) should be one of (tuple, list), but received {}."
+                .format(type_name(input_spec)))
 
         return tuple(input_spec)
 
@@ -289,8 +289,8 @@ def check_type_and_len(input, spec, check_length=False):
                 type(spec), type(input)))
         if check_length and len(input) < len(spec):
             raise ValueError(
-                'Requires len(inputs) >= len(input_spec), but received len(inputs):{} < len(input_spec):{}'.
-                format(len(inputs), len(input_spec)))
+                'Requires len(inputs) >= len(input_spec), but received len(inputs):{} < len(input_spec):{}'
+                .format(len(inputs), len(input_spec)))
 
     if isinstance(input_spec, (tuple, list)):
         input_with_spec = []
@@ -307,8 +307,8 @@ def check_type_and_len(input, spec, check_length=False):
                 if isinstance(rest_input, (core.VarBase, np.ndarray)):
                     logging_utils.warn(
                         "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. "
-                        "Please specific InputSpec information in `@to_static` if you expect them as mutable inputs.".
-                        format(type_name(rest_input)))
+                        "Please specific InputSpec information in `@to_static` if you expect them as mutable inputs."
+                        .format(type_name(rest_input)))
         input_with_spec.extend(inputs[len(input_spec):])
 
         return input_with_spec
@@ -317,8 +317,8 @@ def check_type_and_len(input, spec, check_length=False):
         check_type_and_len(inputs, input_spec, True)
         for name, input in six.iteritems(inputs):
             if name in input_spec:
-                input_with_spec[name] = convert_to_input_spec(input,
-                                                              input_spec[name])
+                input_with_spec[name] = convert_to_input_spec(
+                    input, input_spec[name])
             else:
                 input_with_spec[name] = input
         return input_with_spec
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py
index 98045b3aae432..d8d8d0bc043dd 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py
@@ -69,8 +69,8 @@ def visit_Call(self, node):
                 warnings.warn("paddle.grad has unsupported parameter in jit: " +
                               kw.arg + ", jit will discard it")
                 continue
-            kw = gast.keyword(
-                arg=to_static_grad_param[arg_name], value=node.args[i])
+            kw = gast.keyword(arg=to_static_grad_param[arg_name],
+                              value=node.args[i])
             static_keywords.append(kw)
 
         node.func = gast.parse('paddle.static.gradients').body[0].value
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py
index 157822430d234..4c003dd599906 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py
@@ -91,6 +91,7 @@ def visit_IfExp(self, node):
 
 
 class NameVisitor(gast.NodeVisitor):
+
     def __init__(self, after_node=None, end_node=None):
         # The start node (exclusive) of the visitor
         self.after_node = after_node
@@ -159,8 +160,8 @@ def visit_If(self, node):
                 else:
                     # Blocks the vars in `if.body` and only inserts the vars both created in 'if/else' branch
                     # into name_ids.
-                    new_name_ids = self._find_new_name_ids(body_name_ids,
-                                                           else_name_ids)
+                    new_name_ids = self._find_new_name_ids(
+                        body_name_ids, else_name_ids)
                     for new_name_id in new_name_ids:
                         before_if_name_ids[new_name_id].append(gast.Store())
 
@@ -219,14 +220,15 @@ def _visit_child(self, node):
         return copy.deepcopy(self.name_ids)
 
     def _find_new_name_ids(self, body_name_ids, else_name_ids):
+
         def is_required_ctx(ctxs, required_ctx):
             for ctx in ctxs:
                 if isinstance(ctx, required_ctx):
                     return True
             return False
 
-        candidate_name_ids = set(body_name_ids.keys()) & set(else_name_ids.keys(
-        ))
+        candidate_name_ids = set(body_name_ids.keys()) & set(
+            else_name_ids.keys())
         store_ctx = gast.Store
         new_name_ids = set()
         for name_id in candidate_name_ids:
@@ -309,18 +311,18 @@ def parse_cond_args(parent_ids_dict,
 
     arg_name_ids.sort()
     args = [
-        gast.Name(
-            id=name_id, ctx=gast.Load(), annotation=None, type_comment=None)
-        for name_id in arg_name_ids
+        gast.Name(id=name_id,
+                  ctx=gast.Load(),
+                  annotation=None,
+                  type_comment=None) for name_id in arg_name_ids
     ]
-    arguments = gast.arguments(
-        args=args,
-        posonlyargs=[],
-        vararg=None,
-        kwonlyargs=[],
-        kw_defaults=None,
-        kwarg=None,
-        defaults=[])
+    arguments = gast.arguments(args=args,
+                               posonlyargs=[],
+                               vararg=None,
+                               kwonlyargs=[],
+                               kw_defaults=None,
+                               kwarg=None,
+                               defaults=[])
 
     return arguments
 
@@ -398,9 +400,8 @@ def _vars_with_store(ids_dict):
         return vars
 
     def _modified_vars(child_dict, parent_dict):
-        return set([
-            var for var in _vars_with_store(child_dict) if var in parent_dict
-        ])
+        return set(
+            [var for var in _vars_with_store(child_dict) if var in parent_dict])
 
     def _vars_loaded(ids_dict):
         """
@@ -446,8 +447,8 @@ def _vars_loaded(ids_dict):
     new_vars_to_create = new_vars_in_one_of_body_or_orelse & used_vars_after_ifelse | new_vars_in_body_and_orelse
 
     # 4. generate return_ids of if/else node.
-    return_ids = list(modified_vars_from_parent | new_vars_in_body_and_orelse |
-                      new_vars_to_create)
+    return_ids = list(modified_vars_from_parent | new_vars_in_body_and_orelse
+                      | new_vars_to_create)
     return_ids.sort()
 
     return return_ids, modified_vars_from_parent, new_vars_to_create
@@ -515,9 +516,10 @@ def create_name_nodes(name_ids):
             return gast.Tuple(elts=[], ctx=gast.Load())
 
         gast_names = [
-            gast.Name(
-                id=name_id, ctx=gast.Load(), annotation=None, type_comment=None)
-            for name_id in name_ids
+            gast.Name(id=name_id,
+                      ctx=gast.Load(),
+                      annotation=None,
+                      type_comment=None) for name_id in name_ids
         ]
         name_node = gast.Tuple(elts=gast_names, ctx=gast.Load())
         return name_node
@@ -537,14 +539,13 @@ def create_name_nodes(name_ids):
 
     convert_ifelse_layer = gast.parse(
         '_jst.convert_ifelse('
-        '{pred}, {true_fn}, {false_fn}, {true_args}, {false_args}, {return_vars})'.
-        format(
-            pred=ast_to_source_code(pred),
-            true_fn=true_func_source,
-            false_fn=false_func_source,
-            true_args=ast_to_source_code(true_args),
-            false_args=ast_to_source_code(false_args),
-            return_vars=ast_to_source_code(return_vars))).body[0].value
+        '{pred}, {true_fn}, {false_fn}, {true_args}, {false_args}, {return_vars})'
+        .format(pred=ast_to_source_code(pred),
+                true_fn=true_func_source,
+                false_fn=false_func_source,
+                true_args=ast_to_source_code(true_args),
+                false_args=ast_to_source_code(false_args),
+                return_vars=ast_to_source_code(return_vars))).body[0].value
 
     if return_name_ids:
         _, cond_node = create_assign_node(return_name_ids, convert_ifelse_layer)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
index 4a6d855a893f6..3ae10997c8e7f 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
@@ -30,6 +30,7 @@
 
 
 def synchronized(func):
+
     def wrapper(*args, **kwargs):
         with threading.Lock():
             return func(*args, **kwargs)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
index 8014a00bff983..045878ed54e1d 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
@@ -240,7 +240,9 @@ def visit_Name(self, node):
 
         self.current_seen_vars.add(node)
         write_context = {
-            type(gast.Store()), type(gast.AugStore()), type(gast.Del())
+            type(gast.Store()),
+            type(gast.AugStore()),
+            type(gast.Del())
         }
         for loop_node in self.current_loop:
             self.in_loop_vars[loop_node].append(node)
@@ -581,20 +583,18 @@ def get_for_stmt_nodes(self, node):
         # 5. create & append condition function node
         condition_func_node = gast.FunctionDef(
             name=unique_name.generate(FOR_CONDITION_PREFIX),
-            args=gast.arguments(
-                args=[
-                    gast.Name(
-                        id=name,
-                        ctx=gast.Param(),
-                        annotation=None,
-                        type_comment=None) for name in loop_var_names
-                ],
-                posonlyargs=[],
-                vararg=None,
-                kwonlyargs=[],
-                kw_defaults=None,
-                kwarg=None,
-                defaults=[]),
+            args=gast.arguments(args=[
+                gast.Name(id=name,
+                          ctx=gast.Param(),
+                          annotation=None,
+                          type_comment=None) for name in loop_var_names
+            ],
+                                posonlyargs=[],
+                                vararg=None,
+                                kwonlyargs=[],
+                                kw_defaults=None,
+                                kwarg=None,
+                                defaults=[]),
             body=[gast.Return(value=cond_stmt)],
             decorator_list=[],
             returns=None,
@@ -613,20 +613,18 @@ def get_for_stmt_nodes(self, node):
                 loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True)))
         body_func_node = gast.FunctionDef(
             name=unique_name.generate(FOR_BODY_PREFIX),
-            args=gast.arguments(
-                args=[
-                    gast.Name(
-                        id=name,
-                        ctx=gast.Param(),
-                        annotation=None,
-                        type_comment=None) for name in loop_var_names
-                ],
-                posonlyargs=[],
-                vararg=None,
-                kwonlyargs=[],
-                kw_defaults=None,
-                kwarg=None,
-                defaults=[]),
+            args=gast.arguments(args=[
+                gast.Name(id=name,
+                          ctx=gast.Param(),
+                          annotation=None,
+                          type_comment=None) for name in loop_var_names
+            ],
+                                posonlyargs=[],
+                                vararg=None,
+                                kwonlyargs=[],
+                                kw_defaults=None,
+                                kwarg=None,
+                                defaults=[]),
             body=body_stmts,
             decorator_list=[],
             returns=None,
@@ -639,8 +637,9 @@ def get_for_stmt_nodes(self, node):
         new_stmts.append(body_func_node)
 
         # 7. create & append while loop node
-        while_loop_nodes = create_while_nodes(
-            condition_func_node.name, body_func_node.name, loop_var_names)
+        while_loop_nodes = create_while_nodes(condition_func_node.name,
+                                              body_func_node.name,
+                                              loop_var_names)
         new_stmts.extend(while_loop_nodes)
 
         return new_stmts
@@ -664,20 +663,18 @@ def get_while_stmt_nodes(self, node):
 
         condition_func_node = gast.FunctionDef(
             name=unique_name.generate(WHILE_CONDITION_PREFIX),
-            args=gast.arguments(
-                args=[
-                    gast.Name(
-                        id=name,
-                        ctx=gast.Param(),
-                        annotation=None,
-                        type_comment=None) for name in loop_var_names
-                ],
-                posonlyargs=[],
-                vararg=None,
-                kwonlyargs=[],
-                kw_defaults=None,
-                kwarg=None,
-                defaults=[]),
+            args=gast.arguments(args=[
+                gast.Name(id=name,
+                          ctx=gast.Param(),
+                          annotation=None,
+                          type_comment=None) for name in loop_var_names
+            ],
+                                posonlyargs=[],
+                                vararg=None,
+                                kwonlyargs=[],
+                                kw_defaults=None,
+                                kwarg=None,
+                                defaults=[]),
             body=[gast.Return(value=node.test)],
             decorator_list=[],
             returns=None,
@@ -696,20 +693,18 @@ def get_while_stmt_nodes(self, node):
                 loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True)))
         body_func_node = gast.FunctionDef(
             name=unique_name.generate(WHILE_BODY_PREFIX),
-            args=gast.arguments(
-                args=[
-                    gast.Name(
-                        id=name,
-                        ctx=gast.Param(),
-                        annotation=None,
-                        type_comment=None) for name in loop_var_names
-                ],
-                posonlyargs=[],
-                vararg=None,
-                kwonlyargs=[],
-                kw_defaults=None,
-                kwarg=None,
-                defaults=[]),
+            args=gast.arguments(args=[
+                gast.Name(id=name,
+                          ctx=gast.Param(),
+                          annotation=None,
+                          type_comment=None) for name in loop_var_names
+            ],
+                                posonlyargs=[],
+                                vararg=None,
+                                kwonlyargs=[],
+                                kw_defaults=None,
+                                kwarg=None,
+                                defaults=[]),
             body=new_body,
             decorator_list=[],
             returns=None,
@@ -721,7 +716,8 @@ def get_while_stmt_nodes(self, node):
                     name, unique_name.generate(GENERATE_VARIABLE_PREFIX))
         new_stmts.append(body_func_node)
 
-        while_loop_nodes = create_while_nodes(
-            condition_func_node.name, body_func_node.name, loop_var_names)
+        while_loop_nodes = create_while_nodes(condition_func_node.name,
+                                              body_func_node.name,
+                                              loop_var_names)
         new_stmts.extend(while_loop_nodes)
         return new_stmts
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
index 60043c42121bd..de12677768332 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
@@ -38,7 +38,8 @@ class Location(object):
     __slots__ = (
         "filepath",
         "lineno",
-        "col_offset", )
+        "col_offset",
+    )
 
     def __init__(self, filepath, lineno, col_offset=None):
         self.filepath = filepath
@@ -61,7 +62,8 @@ class OriginInfo(object):
     __slots__ = (
         "location",
         "function_name",
-        "source_code", )
+        "source_code",
+    )
 
     def __init__(self, location, function_name, source_code):
         self.location = location
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 90f960798ef2c..43a05cbb2f9fd 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -61,8 +61,8 @@ def restore(self, value_list):
     def _get_var_ids(self):
         var_ids = []
         for idx, var in enumerate(self.__input_list):
-            if isinstance(var, (framework.Variable, core.VarBase,
-                                core.eager.Tensor)):
+            if isinstance(
+                    var, (framework.Variable, core.VarBase, core.eager.Tensor)):
                 var_ids.append(idx)
 
         return var_ids
@@ -74,8 +74,9 @@ def _check_non_variable(self, need_check):
         if need_check:
             warning_types = set()
             for var in self.__input_list:
-                if not isinstance(var, (framework.Variable, core.VarBase,
-                                        core.eager.Tensor)):
+                if not isinstance(
+                        var,
+                    (framework.Variable, core.VarBase, core.eager.Tensor)):
                     warning_types.add(type(var))
             if warning_types:
                 logging_utils.warn(
@@ -136,7 +137,11 @@ class PartialProgramLayer:
         Layer: A Layer object that run all ops internally in static mode.
     """
 
-    def __init__(self, main_program, inputs, outputs, parameters=None,
+    def __init__(self,
+                 main_program,
+                 inputs,
+                 outputs,
+                 parameters=None,
                  **kwargs):
         super(PartialProgramLayer, self).__init__()
         self._inputs = NestSequence(inputs)
@@ -148,6 +153,9 @@ def __init__(self, main_program, inputs, outputs, parameters=None,
 
         self._origin_main_program = self._verify_program(main_program)
         self._tmp_scope_vec = self._create_scope_vec()
+        self._cuda_graph_vec = self._create_cuda_graph_vec()
+        self._cuda_graph_capture_mode = ""
+        self._cuda_graph_pool_id = 0
         # Set default mode to train
         self.training = True
 
@@ -216,8 +224,9 @@ def _infer_pure_fp16_program(self):
         """
         infer_pure_fp16_program = self._origin_main_program.clone()
         with program_guard(infer_pure_fp16_program):
-            cast_model_to_fp16(
-                infer_pure_fp16_program, self._amp_list, use_fp16_guard=False)
+            cast_model_to_fp16(infer_pure_fp16_program,
+                               self._amp_list,
+                               use_fp16_guard=False)
 
         return infer_pure_fp16_program
 
@@ -339,17 +348,23 @@ def _get_end_op_index(self):
     def __call__(self, inputs):
         in_vars, out_vars = self._prepare(inputs)
 
-        attrs = ('global_block', self.program.desc.block(0), 'start_op_index',
-                 0, 'end_op_index', self._get_end_op_index(), 'is_test',
-                 not self.training, 'program_id', self.program_id)
+        attrs = [
+            'global_block',
+            self.program.desc.block(0), 'start_op_index', 0, 'end_op_index',
+            self._get_end_op_index(), 'is_test', not self.training,
+            'program_id', self.program_id
+        ]
+        if self._cuda_graph_capture_mode:
+            attrs.extend(
+                ('cuda_graph_capture_mode', self._cuda_graph_capture_mode,
+                 'cuda_graph_pool_id', self._cuda_graph_pool_id))
 
         self._cast_fp16_if_pure_fp16(in_vars)
 
-        _C_ops.run_program(
-            self._valid_vars(in_vars),
-            self._valid_vars(self._params),
-            self._valid_vars(out_vars), self._tmp_scope_vec, self._double_grads,
-            *attrs)
+        _C_ops.run_program(self._valid_vars(in_vars),
+                           self._valid_vars(self._params),
+                           self._valid_vars(out_vars), self._tmp_scope_vec,
+                           self._double_grads, self._cuda_graph_vec, *attrs)
         self.drop_scope_if_no_grad()
         restored_nest_out = self._restore_out(out_vars)
         return self._remove_no_value(restored_nest_out)
@@ -358,9 +373,9 @@ def _cast_fp16_if_pure_fp16(self, in_vars):
         if _in_pure_fp16_guard():
             for i, var in enumerate(in_vars):
                 name = var.name
-                if (self.program.global_block().has_var(name) and
-                        self.program.global_block().var(name).dtype ==
-                        paddle.float16):
+                if (self.program.global_block().has_var(name)
+                        and self.program.global_block().var(name).dtype
+                        == paddle.float16):
                     in_vars[i] = var.astype('float16')
                     in_vars[i].name = name
 
@@ -409,19 +424,17 @@ def _prepare(self, inputs):
             if isinstance(value, np.ndarray):
                 var = None
                 if not framework._in_eager_mode_:
-                    var = core.VarBase(
-                        value=value,
-                        name=self._inputs[i].desc.name(),
-                        persistable=False,
-                        place=expected_place,
-                        zero_copy=True)
+                    var = core.VarBase(value=value,
+                                       name=self._inputs[i].desc.name(),
+                                       persistable=False,
+                                       place=expected_place,
+                                       zero_copy=True)
                 else:
-                    var = core.eager.Tensor(
-                        value=value,
-                        name=self._inputs[i].desc.name(),
-                        persistable=False,
-                        place=expected_place,
-                        zero_copy=True)
+                    var = core.eager.Tensor(value=value,
+                                            name=self._inputs[i].desc.name(),
+                                            persistable=False,
+                                            place=expected_place,
+                                            zero_copy=True)
             elif isinstance(value, (core.VarBase, core.eager.Tensor)):
                 # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
                 # into CUDAPlace when it's as input of multi Ops. so we move it in advance
@@ -443,14 +456,12 @@ def create_out(var_id):
             var_desc = var.desc
             varbase = None
             if not framework._in_eager_mode_:
-                var_base = core.VarBase(var_desc.dtype(),
-                                        var_desc.shape(),
+                var_base = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                         var_desc.name(), var_desc.type(), False)
             else:
-                var_base = core.eager.Tensor(var_desc.dtype(),
-                                             var_desc.shape(),
-                                             var_desc.name(),
-                                             var_desc.type(), False)
+                var_base = core.eager.Tensor(var_desc.dtype(), var_desc.shape(),
+                                             var_desc.name(), var_desc.type(),
+                                             False)
             return var_base
 
         # Create VarBase to receive output data.
@@ -471,6 +482,12 @@ def _create_scope_vec(self):
             tmp_scope_vec = [inner_scope]
         return tmp_scope_vec
 
+    def _create_cuda_graph_vec(self):
+        var = core.VarBase(core.VarDesc.VarType.FP32, [], "cuda_graph",
+                           core.VarDesc.VarType.RAW, True)
+        var.stop_gradient = True
+        return var
+
     def _restore_out(self, out_vars):
         """
         Restores same nested outputs by only replacing the Variable with VarBase.
@@ -507,8 +524,8 @@ def _remove_no_value(self, out_vars):
             return out_vars
         elif isinstance(out_vars, (tuple, list)):
             if isinstance(out_vars, tuple):
-                res = tuple(
-                    var for var in out_vars if not self._is_no_value(var))
+                res = tuple(var for var in out_vars
+                            if not self._is_no_value(var))
             else:
                 # isinstance(out_vars, list)
                 res = [var for var in out_vars if not self._is_no_value(var)]
@@ -570,8 +587,8 @@ def _check_params_all_inited(self, main_program):
             # self._params constains parameters and buffers with persistable=True.
             if not isinstance(var, (core.VarBase, core.eager.Tensor)):
                 raise TypeError(
-                    'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.
-                    format(i, type(var)))
+                    'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'
+                    .format(i, type(var)))
             param_and_buffer_names_set.add(var.name)
 
         for block in main_program.blocks:
@@ -617,6 +634,7 @@ def partial_program_from(concrete_program):
     if inputs and isinstance(inputs[0], layers.Layer):
         inputs = inputs[1:]
 
-    return PartialProgramLayer(
-        concrete_program.main_program, inputs, concrete_program.outputs,
-        concrete_program.parameters, **concrete_program.kwargs)
+    return PartialProgramLayer(concrete_program.main_program, inputs,
+                               concrete_program.outputs,
+                               concrete_program.parameters,
+                               **concrete_program.kwargs)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index 2efb6965085de..54c2b2216cd1c 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -61,7 +61,7 @@ class FunctionCache(object):
 
     def __init__(self):
         # Caches the converted static functions. {dygraph_func: static_func}
-        self._converted_static_func_caches = dict()
+        self._converted_static_func_caches = weakref.WeakKeyDictionary()
         # Caches the converted ast node for same source code. {source_code: ast_root}
         self._code_to_ast_caches = dict()
         self._dygraph_to_static = DygraphToStaticAst()
@@ -198,11 +198,11 @@ def __hash__(self):
         error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)."
         with_hook = self.kwargs.get("with_hook", False)
         is_train = self.kwargs.get("is_train", False)
-        return hash(
-            (id(self.function_spec),
-             make_hashable(self.input_args_with_spec, error_msg),
-             make_hashable(self.input_kwargs_with_spec, error_msg),
-             self._spec_names_id, self.class_instance, with_hook, is_train))
+        return hash((id(self.function_spec),
+                     make_hashable(self.input_args_with_spec, error_msg),
+                     make_hashable(self.input_kwargs_with_spec,
+                                   error_msg), self._spec_names_id,
+                     self.class_instance, with_hook, is_train))
 
     def __eq__(self, other):
         return (type(self) is type(other)) and hash(self) == hash(other)
@@ -267,6 +267,8 @@ def __init__(self, function, input_spec=None, **kwargs):
         self._program_trans = ProgramTranslator()
         self._kwargs = kwargs
         self._training = True
+        self._cuda_graph_capture_mode = ""
+        self._cuda_graph_pool_id = 0
 
     def train(self):
         if isinstance(self._class_instance,
@@ -367,6 +369,9 @@ def __call__(self, *args, **kwargs):
             else:
                 partial_program_layer.training = self._training
 
+            partial_program_layer._cuda_graph_capture_mode = self._cuda_graph_capture_mode
+            partial_program_layer._cuda_graph_pool_id = self._cuda_graph_pool_id
+
             # 4. return outputs.
             try:
                 return partial_program_layer(args)
@@ -428,20 +433,19 @@ def get_concrete_program(self, *args, **kwargs):
         if "with_hook" in kwargs: kwargs.pop("with_hook")
         # 1. unify args/kwargs and replace Tensor with InputSpec
         if len(args) != len(self._function_spec.args_name):
-            args, kwargs = self._function_spec.unified_args_and_kwargs(args,
-                                                                       kwargs)
+            args, kwargs = self._function_spec.unified_args_and_kwargs(
+                args, kwargs)
         input_args_with_spec, input_kwargs_with_spec = self._function_spec.args_to_input_spec(
             args, kwargs)
 
         # 2. generate cache key
-        cache_key = CacheKey(
-            self._function_spec,
-            input_args_with_spec,
-            input_kwargs_with_spec,
-            self._class_instance,
-            **self._kwargs,
-            with_hook=with_hook,
-            is_train=is_train)
+        cache_key = CacheKey(self._function_spec,
+                             input_args_with_spec,
+                             input_kwargs_with_spec,
+                             self._class_instance,
+                             **self._kwargs,
+                             with_hook=with_hook,
+                             is_train=is_train)
 
         # 3. check whether hit the cache or build a new program for the input arguments
         concrete_program, partial_program_layer = self._program_cache[cache_key]
@@ -523,15 +527,15 @@ def concrete_program_specify_input_spec(self,
                         flatten(input_spec),
                         flatten(self._function_spec.input_spec)):
                     raise ValueError(
-                        "The `input_spec`: {} used to construct concrete_program is conflict with the `input_spec`: {} in `@paddle.jit.to_static`".
-                        format(input_spec, self._function_spec.input_spec))
+                        "The `input_spec`: {} used to construct concrete_program is conflict with the `input_spec`: {} in `@paddle.jit.to_static`"
+                        .format(input_spec, self._function_spec.input_spec))
                 # NOTE(chenweihang): we should always translated program based on the `input_spec`
                 # decorated on forward if it is valid
                 desired_input_spec = self._function_spec.input_spec
                 if input_spec is not None:
                     logging_utils.warn(
-                        "\n\nYou have specified `input_spec` both in function definition (higher priority) and `paddle.jit.save` (will be ignored.)\n\n\t Using: {}\n\n\t Ignore: {}\n".
-                        format(desired_input_spec, input_spec))
+                        "\n\nYou have specified `input_spec` both in function definition (higher priority) and `paddle.jit.save` (will be ignored.)\n\n\t Using: {}\n\n\t Ignore: {}\n"
+                        .format(desired_input_spec, input_spec))
 
             has_input_spec = (desired_input_spec is not None)
             if has_input_spec:
@@ -542,8 +546,8 @@ def concrete_program_specify_input_spec(self,
                 return concrete_program
             else:
                 raise ValueError(
-                    "No valid transformed program for {}.\n\t    Please specific `input_spec` in `@paddle.jit.to_static` or feed input tensor to call the decorated function at once.\n".
-                    format(self._function_spec))
+                    "No valid transformed program for {}.\n\t    Please specific `input_spec` in `@paddle.jit.to_static` or feed input tensor to call the decorated function at once.\n"
+                    .format(self._function_spec))
         elif with_hook:
             cache_key = self._program_cache._recent_cache_key
             cache_key.kwargs["with_hook"] = True
@@ -553,8 +557,8 @@ def concrete_program_specify_input_spec(self,
         # If more than one programs have been cached, return the recent converted program by default.
         elif cached_program_len > 1:
             logging_utils.warn(
-                "Current {} has more than one cached programs: {}, the last traced progam will be return by default.".
-                format(self._function_spec, cached_program_len))
+                "Current {} has more than one cached programs: {}, the last traced progam will be return by default."
+                .format(self._function_spec, cached_program_len))
 
         cache_key, (concrete_program,
                     partial_layer) = self._program_cache.last()
@@ -627,8 +631,8 @@ def __init__(self, func, class_instance, with_hook=False):
         self.class_instance = class_instance
         self.with_hook = with_hook
         self.need_apply_hook = with_hook and isinstance(
-            self.class_instance,
-            layers.Layer) and getattr(func, "__name__") == "forward"
+            self.class_instance, layers.Layer) and getattr(
+                func, "__name__") == "forward"
 
     def apply_pre_hooks(self, inputs):
         """
@@ -726,8 +730,8 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance,
                 _kwargs = func_spec.to_static_inputs_with_spec(
                     input_kwargs_spec, main_program)
                 if class_instance:
-                    static_inputs = tuple([class_instance] + list(
-                        static_inputs))
+                    static_inputs = tuple([class_instance] +
+                                          list(static_inputs))
 
                 # 2. Gets all ParamBases and buffered VarBases in the function
                 all_parameters_and_buffers = _extract_indeed_params_buffers(
@@ -735,8 +739,9 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance,
 
                 # 3. Builds program only once and returns the output Variables.
                 with param_guard(get_parameters(
-                        class_instance, False)), param_guard(
-                            get_buffers(class_instance, False)):
+                        class_instance,
+                        False)), param_guard(get_buffers(class_instance,
+                                                         False)):
                     try:
                         # only for jit.save, do nothing while train and eval process
                         inputs = hook_helper.apply_pre_hooks(static_inputs)
@@ -754,21 +759,20 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance,
                         raise
 
                 if outputs is not None:
-                    need_wrap_into_list = not isinstance(outputs, (
-                        tuple, list)) or len(outputs) == 1
+                    need_wrap_into_list = not isinstance(
+                        outputs, (tuple, list)) or len(outputs) == 1
                     if need_wrap_into_list:
                         outputs = [outputs]
 
         main_program = update_op_callstack_with_origin_info(main_program)
 
-        return ConcreteProgram(
-            inputs=static_inputs,
-            outputs=outputs,
-            parameters=all_parameters_and_buffers,
-            function=dygraph_function,
-            main_program=main_program,
-            startup_program=startup_program,
-            **kwargs)
+        return ConcreteProgram(inputs=static_inputs,
+                               outputs=outputs,
+                               parameters=all_parameters_and_buffers,
+                               function=dygraph_function,
+                               main_program=main_program,
+                               startup_program=startup_program,
+                               **kwargs)
 
 
 def _extract_indeed_params_buffers(class_instance):
@@ -790,7 +794,7 @@ class ProgramCache(object):
     def __init__(self):
         # {hash_id : (concrete_program, partial_layer)}
         self._caches = collections.OrderedDict()
-        # trace mostly recent used program 
+        # trace mostly recent used program
         self._recent_key = None
         self._recent_cache_key = None
 
@@ -817,8 +821,8 @@ def __getitem__(self, item):
             if current_tracing_count > MAX_TRACED_PROGRAM_COUNT:
                 logging_utils.warn(
                     "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. "
-                    "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.".
-                    format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT))
+                    "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors."
+                    .format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT))
 
         return self._caches[item_id]
 
@@ -998,9 +1002,9 @@ def func(x):
             return dygraph_func(*args, **kwargs)
         try:
             function_spec = FunctionSpec(dygraph_func)
-            cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
-                                                    getattr(dygraph_func,
-                                                            '__self__', None))
+            cache_key = CacheKey.from_func_and_args(
+                function_spec, args, kwargs,
+                getattr(dygraph_func, '__self__', None))
             _, partial_program_layer = self._program_cache[cache_key]
 
             if args and isinstance(args[0], layers.Layer):
@@ -1128,9 +1132,9 @@ def func(x):
             return dygraph_func(*args, **kwargs)
 
         function_spec = FunctionSpec(dygraph_func)
-        cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
-                                                getattr(dygraph_func,
-                                                        '__self__', None))
+        cache_key = CacheKey.from_func_and_args(
+            function_spec, args, kwargs, getattr(dygraph_func, '__self__',
+                                                 None))
         concrete_program, partial_program_layer = self._program_cache[cache_key]
 
         # Note: concrete_program hold all input/output infos include non-Variable
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py
index 8ac659dbead99..7e387b45c4020 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py
@@ -211,11 +211,10 @@ def visit_FunctionDef(self, node):
         value_name = self.return_value_name[node]
         if value_name is not None:
             node.body.append(
-                gast.Return(value=gast.Name(
-                    id=value_name,
-                    ctx=gast.Load(),
-                    annotation=None,
-                    type_comment=None)))
+                gast.Return(value=gast.Name(id=value_name,
+                                            ctx=gast.Load(),
+                                            annotation=None,
+                                            type_comment=None)))
             init_names = [
                 unique_name.generate(RETURN_VALUE_INIT_NAME)
                 for i in range(max_return_length)
@@ -224,32 +223,27 @@ def visit_FunctionDef(self, node):
                 create_fill_constant_node(iname, 0.0) for iname in init_names
             ]
             if len(init_names) == 1:
-                return_value_nodes = gast.Name(
-                    id=init_names[0],
-                    ctx=gast.Load(),
-                    annotation=None,
-                    type_comment=None)
+                return_value_nodes = gast.Name(id=init_names[0],
+                                               ctx=gast.Load(),
+                                               annotation=None,
+                                               type_comment=None)
             else:
                 # We need to initialize return value as a tuple because control
                 # flow requires some inputs or outputs have same structure
-                return_value_nodes = gast.Tuple(
-                    elts=[
-                        gast.Name(
-                            id=iname,
-                            ctx=gast.Load(),
-                            annotation=None,
-                            type_comment=None) for iname in init_names
-                    ],
-                    ctx=gast.Load())
-            assign_return_value_node = gast.Assign(
-                targets=[
-                    gast.Name(
-                        id=value_name,
-                        ctx=gast.Store(),
-                        annotation=None,
-                        type_comment=None)
+                return_value_nodes = gast.Tuple(elts=[
+                    gast.Name(id=iname,
+                              ctx=gast.Load(),
+                              annotation=None,
+                              type_comment=None) for iname in init_names
                 ],
-                value=return_value_nodes)
+                                                ctx=gast.Load())
+            assign_return_value_node = gast.Assign(targets=[
+                gast.Name(id=value_name,
+                          ctx=gast.Store(),
+                          annotation=None,
+                          type_comment=None)
+            ],
+                                                   value=return_value_nodes)
             node.body.insert(0, assign_return_value_node)
             node.body[:0] = assign_zero_nodes
 
@@ -276,43 +270,43 @@ def visit_Return(self, node):
             if hasattr(ancestor,
                        "body") and index_in_list(ancestor.body, cur_node) != -1:
                 if cur_node == node:
-                    self._replace_return_in_stmt_list(
-                        ancestor.body, cur_node, return_name, max_return_length,
-                        parent_node_of_return)
+                    self._replace_return_in_stmt_list(ancestor.body, cur_node,
+                                                      return_name,
+                                                      max_return_length,
+                                                      parent_node_of_return)
                 self._replace_after_node_to_if_in_stmt_list(
                     ancestor.body, cur_node, return_name, parent_node_of_return)
-            elif hasattr(ancestor, "orelse") and index_in_list(ancestor.orelse,
-                                                               cur_node) != -1:
+            elif hasattr(ancestor, "orelse") and index_in_list(
+                    ancestor.orelse, cur_node) != -1:
                 if cur_node == node:
-                    self._replace_return_in_stmt_list(
-                        ancestor.orelse, cur_node, return_name,
-                        max_return_length, parent_node_of_return)
+                    self._replace_return_in_stmt_list(ancestor.orelse, cur_node,
+                                                      return_name,
+                                                      max_return_length,
+                                                      parent_node_of_return)
                 self._replace_after_node_to_if_in_stmt_list(
                     ancestor.orelse, cur_node, return_name,
                     parent_node_of_return)
 
             # If return node in while loop, add `not return_name` in gast.While.test
             if isinstance(ancestor, gast.While):
-                cond_var_node = gast.UnaryOp(
-                    op=gast.Not(),
-                    operand=gast.Name(
-                        id=return_name,
-                        ctx=gast.Load(),
-                        annotation=None,
-                        type_comment=None))
+                cond_var_node = gast.UnaryOp(op=gast.Not(),
+                                             operand=gast.Name(
+                                                 id=return_name,
+                                                 ctx=gast.Load(),
+                                                 annotation=None,
+                                                 type_comment=None))
                 ancestor.test = gast.BoolOp(
                     op=gast.And(), values=[ancestor.test, cond_var_node])
                 continue
 
             # If return node in for loop, add `not return_name` in gast.While.test
             if isinstance(ancestor, gast.For):
-                cond_var_node = gast.UnaryOp(
-                    op=gast.Not(),
-                    operand=gast.Name(
-                        id=return_name,
-                        ctx=gast.Load(),
-                        annotation=None,
-                        type_comment=None))
+                cond_var_node = gast.UnaryOp(op=gast.Not(),
+                                             operand=gast.Name(
+                                                 id=return_name,
+                                                 ctx=gast.Load(),
+                                                 annotation=None,
+                                                 type_comment=None))
                 parent_node = self.ancestor_nodes[ancestor_index - 1]
                 for_to_while = ForToWhileTransformer(parent_node, ancestor,
                                                      cond_var_node)
@@ -363,27 +357,23 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name,
             # Handle tuple/non-tuple case
             if max_return_length == 1:
                 assign_nodes.append(
-                    gast.Assign(
-                        targets=[
-                            gast.Name(
-                                id=self.return_value_name[cur_func_node],
-                                ctx=gast.Store(),
-                                annotation=None,
-                                type_comment=None)
-                        ],
-                        value=gast.Name(
-                            id=no_value_names[0],
-                            ctx=gast.Load(),
-                            annotation=None,
-                            type_comment=None)))
+                    gast.Assign(targets=[
+                        gast.Name(id=self.return_value_name[cur_func_node],
+                                  ctx=gast.Store(),
+                                  annotation=None,
+                                  type_comment=None)
+                    ],
+                                value=gast.Name(id=no_value_names[0],
+                                                ctx=gast.Load(),
+                                                annotation=None,
+                                                type_comment=None)))
             else:
                 # max_return_length > 1 which means we should assign tuple
                 fill_tuple = [
-                    gast.Name(
-                        id=n,
-                        ctx=gast.Load(),
-                        annotation=None,
-                        type_comment=None) for n in no_value_names
+                    gast.Name(id=n,
+                              ctx=gast.Load(),
+                              annotation=None,
+                              type_comment=None) for n in no_value_names
                 ]
                 if return_node.value is not None:
                     if isinstance(return_node.value, gast.Tuple):
@@ -392,16 +382,14 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name,
                         fill_tuple.insert(0, return_node.value)
 
                 assign_nodes.append(
-                    gast.Assign(
-                        targets=[
-                            gast.Name(
-                                id=self.return_value_name[cur_func_node],
-                                ctx=gast.Store(),
-                                annotation=None,
-                                type_comment=None)
-                        ],
-                        value=gast.Tuple(
-                            elts=fill_tuple, ctx=gast.Load())))
+                    gast.Assign(targets=[
+                        gast.Name(id=self.return_value_name[cur_func_node],
+                                  ctx=gast.Store(),
+                                  annotation=None,
+                                  type_comment=None)
+                    ],
+                                value=gast.Tuple(elts=fill_tuple,
+                                                 ctx=gast.Load())))
         else:
             # In this case we should NOT append RETURN_NO_VALUE placeholder
             if return_node.value is not None:
@@ -412,21 +400,20 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name,
                             RETURN_VALUE_PREFIX)
 
                 assign_nodes.append(
-                    gast.Assign(
-                        targets=[
-                            gast.Name(
-                                id=self.return_value_name[cur_func_node],
-                                ctx=gast.Store(),
-                                annotation=None,
-                                type_comment=None)
-                        ],
-                        value=return_node.value))
+                    gast.Assign(targets=[
+                        gast.Name(id=self.return_value_name[cur_func_node],
+                                  ctx=gast.Store(),
+                                  annotation=None,
+                                  type_comment=None)
+                    ],
+                                value=return_node.value))
 
         stmt_list[i:] = assign_nodes
         return True
 
-    def _replace_after_node_to_if_in_stmt_list(
-            self, stmt_list, node, return_name, parent_node_of_return):
+    def _replace_after_node_to_if_in_stmt_list(self, stmt_list, node,
+                                               return_name,
+                                               parent_node_of_return):
         i = index_in_list(stmt_list, node)
         if i < 0 or i >= len(stmt_list):
             return False
@@ -434,13 +421,12 @@ def _replace_after_node_to_if_in_stmt_list(
             # No need to add, we consider this as added successfully
             return True
 
-        if_stmt = gast.If(test=gast.UnaryOp(
-            op=gast.Not(),
-            operand=gast.Name(
-                id=return_name,
-                ctx=gast.Store(),
-                annotation=None,
-                type_comment=None)),
+        if_stmt = gast.If(test=gast.UnaryOp(op=gast.Not(),
+                                            operand=gast.Name(
+                                                id=return_name,
+                                                ctx=gast.Store(),
+                                                annotation=None,
+                                                type_comment=None)),
                           body=stmt_list[i + 1:],
                           orelse=[])
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py
index 98e76c0f46ffc..82177b343aaf4 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py
@@ -181,8 +181,9 @@ def __init__(self):
         self.cur_scope = AstVarScope()
 
     def enter_scope(self, scope_name, scope_type):
-        self.cur_scope = AstVarScope(
-            scope_name, scope_type, parent_scope=self.cur_scope)
+        self.cur_scope = AstVarScope(scope_name,
+                                     scope_type,
+                                     parent_scope=self.cur_scope)
         return self.cur_scope
 
     def exit_scope(self):
@@ -351,8 +352,8 @@ def _get_node_var_type(self, cur_wrapper):
             if node.value:
                 node_value_type = self.node_to_wrapper_map[
                     node.value].node_var_type
-                if not (node_value_type &
-                        {NodeVarType.UNKNOWN, NodeVarType.STATEMENT}):
+                if not (node_value_type
+                        & {NodeVarType.UNKNOWN, NodeVarType.STATEMENT}):
                     ret_type = node_value_type
             if isinstance(node.target, gast.Name):
                 self.node_to_wrapper_map[node.target].node_var_type = ret_type
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py
index d5b23d2f53b1c..a04171dfc3031 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py
@@ -47,8 +47,9 @@ def create_convert_shape_node(var_shape_node,
         api_shape_node = gast.parse(convert_var_shape_func).body[0].value
 
         if slice_node is not None and not slice_is_num(slice_node):
-            return gast.Subscript(
-                value=api_shape_node, slice=slice_node.slice, ctx=gast.Load())
+            return gast.Subscript(value=api_shape_node,
+                                  slice=slice_node.slice,
+                                  ctx=gast.Load())
         return api_shape_node
 
     if isinstance(var_shape_node, gast.Subscript):
@@ -65,12 +66,13 @@ def create_choose_shape_node(attr_shape_name, api_shape_name, slice_node=None):
 
     if slice_node is not None and slice_is_num(slice_node):
         args.append(ast_to_source_code(slice_node.slice).strip())
-    choose_shape_func = "_jst.choose_shape_attr_or_api({})".format(",".join(
-        args))
+    choose_shape_func = "_jst.choose_shape_attr_or_api({})".format(
+        ",".join(args))
     choose_shape_node = gast.parse(choose_shape_func).body[0].value
     if slice_node is not None and not slice_is_num(slice_node):
-        return gast.Subscript(
-            value=choose_shape_node, slice=slice_node.slice, ctx=gast.Load())
+        return gast.Subscript(value=choose_shape_node,
+                              slice=slice_node.slice,
+                              ctx=gast.Load())
     return choose_shape_node
 
 
@@ -226,9 +228,10 @@ def _transform_var_shape_if_necessary(self, cond):
                 for field, value in gast.iter_fields(parent_node):
                     if child_node is value:
                         if var_shape_node is child_node:
-                            setattr(parent_node, field,
-                                    create_convert_shape_node(var_shape_node,
-                                                              None, True))
+                            setattr(
+                                parent_node, field,
+                                create_convert_shape_node(
+                                    var_shape_node, None, True))
                         else:
                             setattr(parent_node, field, var_shape_node)
                         break
@@ -283,8 +286,8 @@ def _is_var_shape(self, node):
 
         if isinstance(node, gast.Attribute):
             # If node is `paddle.shape`, return False
-            if (node.attr == 'shape' and isinstance(node.value, gast.Name) and
-                    node.value.id == 'paddle'):
+            if (node.attr == 'shape' and isinstance(node.value, gast.Name)
+                    and node.value.id == 'paddle'):
                 return False
             if node.attr != 'shape':
                 return False
@@ -323,9 +326,8 @@ def _update_name_to_var_shape(self, node):
                         sub_node = gast.parse(sub_node_str).body[0].value
 
                         update_static_shape_var_node.append(
-                            gast.Assign(
-                                targets=[static_shape_var_node],
-                                value=sub_node))
+                            gast.Assign(targets=[static_shape_var_node],
+                                        value=sub_node))
 
                         self.name_to_var_shape[
                             target_id] = static_shape_var_name
@@ -346,16 +348,15 @@ def _update_name_to_var_shape(self, node):
                             idx)
                         sub_node = gast.parse(sub_node_str).body[0].value
                         # Note(Aurelius84): Becuase static_shape_var_name is used in
-                        # eval_if_exist_else_none() as plain string, so it will not 
+                        # eval_if_exist_else_none() as plain string, so it will not
                         # be pasred as argument in convert_loop/ifelse. We delcare it
                         # as global var because it has unique name.
                         update_static_shape_var_node.append(
                             gast.Global(names=[static_shape_var_name]))
 
                         update_static_shape_var_node.append(
-                            gast.Assign(
-                                targets=[static_shape_var_node],
-                                value=sub_node))
+                            gast.Assign(targets=[static_shape_var_node],
+                                        value=sub_node))
                         self.name_to_var_shape[
                             target_id] = static_shape_var_name
             return update_static_shape_var_node
@@ -373,16 +374,15 @@ def _update_name_to_var_shape(self, node):
                         static_shape_value_name).body[0].value
 
                     update_static_shape_var_node = [
-                        gast.Assign(
-                            targets=[static_shape_var_node],
-                            value=static_shape_value_node)
+                        gast.Assign(targets=[static_shape_var_node],
+                                    value=static_shape_value_node)
                     ]
                     self.name_to_var_shape[target_id] = static_shape_var_name
             elif self._is_var_shape(value_node):  # eg: x.shape or x.shape[0]
                 static_shape_var_name = unique_name.generate(
                     STATIC_CONVERT_VAR_SHAPE_SUFFIX)
-                static_shape_var_node = gast.parse(static_shape_var_name).body[
-                    0].value
+                static_shape_var_node = gast.parse(
+                    static_shape_var_name).body[0].value
                 static_shape_value_node = copy.deepcopy(value_node)
                 # x.shape becomes convert_var_shape_simple(x)
                 static_shape_value_node = ShapeAttributeTransformer().visit(
@@ -392,8 +392,7 @@ def _update_name_to_var_shape(self, node):
                     gast.Global(names=[static_shape_var_name])
                 ]
                 update_static_shape_var_node.append(
-                    gast.Assign(
-                        targets=[static_shape_var_node],
-                        value=static_shape_value_node))
+                    gast.Assign(targets=[static_shape_var_node],
+                                value=static_shape_value_node))
                 self.name_to_var_shape[target_id] = static_shape_var_name
         return update_static_shape_var_node
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 91c2c5dc65aab..4a477fb7d7cb6 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -92,14 +92,13 @@ def getfullargspec(target):
         return inspect.getfullargspec(target)
     else:
         argspec = inspect.getargspec(target)
-        return FullArgSpec(
-            args=argspec.args,
-            varargs=argspec.varargs,
-            varkw=argspec.keywords,
-            defaults=argspec.defaults,
-            kwonlyargs=[],
-            kwonlydefaults=None,
-            annotations={})
+        return FullArgSpec(args=argspec.args,
+                           varargs=argspec.varargs,
+                           varkw=argspec.keywords,
+                           defaults=argspec.defaults,
+                           kwonlyargs=[],
+                           kwonlydefaults=None,
+                           annotations={})
 
 
 def parse_arg_and_kwargs(function):
@@ -190,8 +189,8 @@ def is_api_in_module(node, module_prefix):
         from paddle.fluid.dygraph import to_variable
         from paddle import to_tensor
 
-        return eval("_is_api_in_module_helper({}, '{}')".format(func_str,
-                                                                module_prefix))
+        return eval("_is_api_in_module_helper({}, '{}')".format(
+            func_str, module_prefix))
     except Exception:
         return False
 
@@ -241,8 +240,9 @@ def is_control_flow_to_transform(node,
     """
     assert isinstance(node, gast.AST), \
         "The type of input node must be gast.AST, but received %s." % type(node)
-    visitor = IsControlFlowVisitor(
-        node, static_analysis_visitor, node_var_type_map=var_name_to_type)
+    visitor = IsControlFlowVisitor(node,
+                                   static_analysis_visitor,
+                                   node_var_type_map=var_name_to_type)
     need_to_transform = visitor.transform()
     return need_to_transform
 
@@ -262,9 +262,9 @@ def to_static_api(dygraph_class):
     if dygraph_class in dygraph_class_to_static_api:
         return dygraph_class_to_static_api[dygraph_class]
     else:
-        raise NotImplementedError("Paddle dygraph API {} cannot be converted "
-                                  "to static graph at present.".format(
-                                      dygraph_class))
+        raise NotImplementedError(
+            "Paddle dygraph API {} cannot be converted "
+            "to static graph at present.".format(dygraph_class))
 
 
 def _add_keywords_to(node, dygraph_api_name):
@@ -275,10 +275,8 @@ def _add_keywords_to(node, dygraph_api_name):
                 ast_keyword.arg = "size"
 
         node.keywords.append(
-            gast.keyword(
-                arg="num_flatten_dims",
-                value=gast.Constant(
-                    value=-1, kind=None)))
+            gast.keyword(arg="num_flatten_dims",
+                         value=gast.Constant(value=-1, kind=None)))
 
     if dygraph_api_name == "BilinearTensorProduct":
         for ast_keyword in node.keywords:
@@ -297,15 +295,15 @@ def to_static_ast(node, class_node):
     assert isinstance(class_node, gast.Call)
     static_api = to_static_api(class_node.func.attr)
 
-    node.func = gast.Attribute(
-        attr=static_api,
-        ctx=gast.Load(),
-        value=gast.Attribute(
-            attr='layers',
-            ctx=gast.Load(),
-            value=gast.Name(
-                ctx=gast.Load(), id='fluid', annotation=None,
-                type_comment=None)))
+    node.func = gast.Attribute(attr=static_api,
+                               ctx=gast.Load(),
+                               value=gast.Attribute(attr='layers',
+                                                    ctx=gast.Load(),
+                                                    value=gast.Name(
+                                                        ctx=gast.Load(),
+                                                        id='fluid',
+                                                        annotation=None,
+                                                        type_comment=None)))
 
     update_args_of_func(node, class_node, 'forward')
 
@@ -330,8 +328,8 @@ def update_args_of_func(node, dygraph_node, method_name):
     import paddle.fluid as fluid
     if method_name == "__init__" or eval(
             "issubclass({}, fluid.dygraph.Layer)".format(class_src)):
-        full_args = eval("inspect.getargspec({}.{})".format(class_src,
-                                                            method_name))
+        full_args = eval("inspect.getargspec({}.{})".format(
+            class_src, method_name))
         full_args_name = [
             arg_name for arg_name in full_args[0] if arg_name != "self"
         ]
@@ -394,11 +392,11 @@ def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False):
     if isinstance(name_ids, six.string_types):
         name_ids = [name_ids]
     if not isinstance(name_ids, (list, tuple, set)):
-        raise TypeError('name_ids must be list or tuple or set, but received %s'
-                        % type(type(name_ids)))
+        raise TypeError(
+            'name_ids must be list or tuple or set, but received %s' %
+            type(type(name_ids)))
     gast_names = [
-        gast.Name(
-            id=name_id, ctx=ctx, annotation=None, type_comment=None)
+        gast.Name(id=name_id, ctx=ctx, annotation=None, type_comment=None)
         for name_id in name_ids
     ]
     if len(gast_names) == 1 and not gen_tuple_if_single:
@@ -419,13 +417,12 @@ def create_funcDef_node(nodes, name, input_args, return_name_ids):
         nodes.append(gast.Return(value=generate_name_node(return_name_ids)))
     else:
         nodes.append(gast.Return(value=None))
-    func_def_node = gast.FunctionDef(
-        name=name,
-        args=input_args,
-        body=nodes,
-        decorator_list=[],
-        returns=None,
-        type_comment=None)
+    func_def_node = gast.FunctionDef(name=name,
+                                     args=input_args,
+                                     body=nodes,
+                                     decorator_list=[],
+                                     returns=None,
+                                     type_comment=None)
     return func_def_node
 
 
@@ -447,6 +444,7 @@ def create_assign_node(name, node):
 
 
 class RenameTransformer(gast.NodeTransformer):
+
     def __init__(self, node):
         assert isinstance(
             node, gast.AST), "RenameTransformer only accepts gast.AST as input"
@@ -488,8 +486,10 @@ def remove_if_exit(filepath):
     source = ast_to_source_code(ast_root)
     source = _inject_import_statements() + source
 
-    f = tempfile.NamedTemporaryFile(
-        mode='w', suffix='.py', delete=False, encoding='utf-8')
+    f = tempfile.NamedTemporaryFile(mode='w',
+                                    suffix='.py',
+                                    delete=False,
+                                    encoding='utf-8')
     with f:
         module_name = os.path.basename(f.name[:-3])
         f.write(source)
@@ -546,8 +546,8 @@ def func_to_source_code(function, dedent=True):
     """
     if not (inspect.isfunction(function) or inspect.ismethod(function)):
         raise TypeError(
-            "The type of 'function' should be a function or method, but received {}.".
-            format(type(function).__name__))
+            "The type of 'function' should be a function or method, but received {}."
+            .format(type(function).__name__))
     source_code_list, _ = inspect.getsourcelines(function)
     # Replace comments with blank lines so that error messages are not misplaced
     source_code_list = [
@@ -596,8 +596,9 @@ def compare_with_none(node):
             # node.comparators is a list.
             if isinstance(child, list):
                 child = child[0]
-            if (isinstance(child, gast.Constant) and child.value is None) or (
-                    isinstance(child, gast.Name) and child.id == 'None'):
+            if (isinstance(child, gast.Constant)
+                    and child.value is None) or (isinstance(child, gast.Name)
+                                                 and child.id == 'None'):
                 return True
     return False
 
@@ -869,54 +870,46 @@ def visit_For(self, node):
                 tuple_iter_name = unique_name.generate(
                     FOR_ITER_TUPLE_INDEX_PREFIX)
                 tuple_var_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX)
-                node.target = gast.Tuple(
-                    elts=[
-                        gast.Name(
-                            id=tuple_iter_name,
-                            ctx=gast.Store(),
-                            annotation=None,
-                            type_comment=None), gast.Name(
-                                id=tuple_var_name,
-                                ctx=gast.Store(),
-                                annotation=None,
-                                type_comment=None)
-                    ],
-                    ctx=gast.Store())
+                node.target = gast.Tuple(elts=[
+                    gast.Name(id=tuple_iter_name,
+                              ctx=gast.Store(),
+                              annotation=None,
+                              type_comment=None),
+                    gast.Name(id=tuple_var_name,
+                              ctx=gast.Store(),
+                              annotation=None,
+                              type_comment=None)
+                ],
+                                         ctx=gast.Store())
                 node.body.insert(
                     0,
-                    gast.Assign(
-                        targets=[
-                            gast.Name(
-                                id=out_tuple_name,
-                                ctx=gast.Store(),
-                                annotation=None,
-                                type_comment=None)
-                        ],
-                        value=gast.Tuple(
-                            elts=[
-                                gast.Name(
-                                    id=tuple_iter_name,
-                                    ctx=gast.Load(),
-                                    annotation=None,
-                                    type_comment=None), gast.Name(
-                                        id=tuple_var_name,
-                                        ctx=gast.Load(),
-                                        annotation=None,
-                                        type_comment=None)
-                            ],
-                            ctx=gast.Load())))
-            elif isinstance(node.target, (
-                    gast.List,
-                    gast.Tuple)) and len(node.target.elts) >= 2 and isinstance(
+                    gast.Assign(targets=[
+                        gast.Name(id=out_tuple_name,
+                                  ctx=gast.Store(),
+                                  annotation=None,
+                                  type_comment=None)
+                    ],
+                                value=gast.Tuple(elts=[
+                                    gast.Name(id=tuple_iter_name,
+                                              ctx=gast.Load(),
+                                              annotation=None,
+                                              type_comment=None),
+                                    gast.Name(id=tuple_var_name,
+                                              ctx=gast.Load(),
+                                              annotation=None,
+                                              type_comment=None)
+                                ],
+                                                 ctx=gast.Load())))
+            elif isinstance(node.target, (gast.List, gast.Tuple)) and len(
+                    node.target.elts) >= 2 and isinstance(
                         node.target.elts[1], (gast.List, gast.Tuple)):
                 # Inner tuple case
                 inner_tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX)
                 origin_inner_tuple_node = node.target.elts[1]
-                node.target.elts[1] = gast.Name(
-                    id=inner_tuple_name,
-                    ctx=gast.Store(),
-                    annotation=None,
-                    type_comment=None)
+                node.target.elts[1] = gast.Name(id=inner_tuple_name,
+                                                ctx=gast.Store(),
+                                                annotation=None,
+                                                type_comment=None)
                 node.body[0:0] = self.tuple_to_stmts(origin_inner_tuple_node,
                                                      inner_tuple_name)
         elif self.is_for_iter(node) and isinstance(node.target,
@@ -924,11 +917,10 @@ def visit_For(self, node):
             # Non-enumrate case:
             tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX)
             origin_tuple_node = node.target
-            node.target = gast.Name(
-                id=tuple_name,
-                ctx=gast.Store(),
-                annotation=None,
-                type_comment=None)
+            node.target = gast.Name(id=tuple_name,
+                                    ctx=gast.Store(),
+                                    annotation=None,
+                                    type_comment=None)
             node.body[0:0] = self.tuple_to_stmts(origin_tuple_node, tuple_name)
         return node
 
@@ -1144,8 +1136,8 @@ def _build_index_init_node(self):
             if self.args_length == 1:
                 index_init_value_str = '0'
             else:
-                index_init_value_str = ast_to_source_code(self.iter_args[
-                    0]).strip()
+                index_init_value_str = ast_to_source_code(
+                    self.iter_args[0]).strip()
 
             index_init_var_name = self.iter_var_name
         else:
@@ -1164,8 +1156,8 @@ def _build_var_len_assign_node(self):
         if isinstance(self.iter_node, gast.Call) and isinstance(
                 self.iter_node.func,
                 gast.Attribute) and self.iter_node.func.attr == 'numpy':
-            iter_var_name = ast_to_source_code(self.iter_node.func.value).strip(
-            )
+            iter_var_name = ast_to_source_code(
+                self.iter_node.func.value).strip()
         else:
             iter_var_name = ast_to_source_code(self.iter_node).strip()
 
@@ -1196,11 +1188,10 @@ def _build_iter_node(self):
                 zip_to_list_node = gast.parse(zip_to_list_str).body[0]
                 new_nodes.append(zip_to_list_node)
 
-                self.iter_node = gast.Name(
-                    id=self.iter_zip_to_list_name,
-                    ctx=gast.Load(),
-                    annotation=None,
-                    type_comment=None)
+                self.iter_node = gast.Name(id=self.iter_zip_to_list_name,
+                                           ctx=gast.Load(),
+                                           annotation=None,
+                                           type_comment=None)
 
         return new_nodes
 
@@ -1220,18 +1211,17 @@ def _build_compare_node(self):
             compare_node = self.iter_args[
                 0] if self.args_length == 1 else self.iter_args[1]
         else:
-            compare_node = gast.Name(
-                id=self.iter_var_len_name,
-                ctx=gast.Load(),
-                annotation=None,
-                type_comment=None)
+            compare_node = gast.Name(id=self.iter_var_len_name,
+                                     ctx=gast.Load(),
+                                     annotation=None,
+                                     type_comment=None)
         return compare_node
 
     def _build_step_node(self):
         if self.is_for_range_iter():
             step_node = self.iter_args[
-                2] if self.args_length == 3 else gast.Constant(
-                    value=1, kind=None)
+                2] if self.args_length == 3 else gast.Constant(value=1,
+                                                               kind=None)
         else:
             step_node = gast.Constant(value=1, kind=None)
         return step_node
@@ -1248,40 +1238,37 @@ def _build_cond_stmt(self, step_node, compare_node):
             # range(max, min, -2)
             # ->
             # i > min
-            return gast.Compare(
-                left=gast.Name(
-                    id=self.iter_var_name
-                    if self.is_for_range_iter() else self.iter_idx_name,
-                    ctx=gast.Load(),
-                    annotation=None,
-                    type_comment=None),
-                ops=[gast.Gt()],
-                comparators=[compare_node])
+            return gast.Compare(left=gast.Name(
+                id=self.iter_var_name
+                if self.is_for_range_iter() else self.iter_idx_name,
+                ctx=gast.Load(),
+                annotation=None,
+                type_comment=None),
+                                ops=[gast.Gt()],
+                                comparators=[compare_node])
         else:
             # eg:
             # range(min, max, 2)
             # ->
             # i < max
-            return gast.Compare(
-                left=gast.Name(
-                    id=self.iter_var_name
-                    if self.is_for_range_iter() else self.iter_idx_name,
-                    ctx=gast.Load(),
-                    annotation=None,
-                    type_comment=None),
-                ops=[gast.Lt()],
-                comparators=[compare_node])
-
-    def _build_index_increase_node(self, step_node):
-        return gast.AugAssign(
-            target=gast.Name(
+            return gast.Compare(left=gast.Name(
                 id=self.iter_var_name
                 if self.is_for_range_iter() else self.iter_idx_name,
-                ctx=gast.Store(),
+                ctx=gast.Load(),
                 annotation=None,
                 type_comment=None),
-            op=gast.Add(),
-            value=step_node)
+                                ops=[gast.Lt()],
+                                comparators=[compare_node])
+
+    def _build_index_increase_node(self, step_node):
+        return gast.AugAssign(target=gast.Name(
+            id=self.iter_var_name
+            if self.is_for_range_iter() else self.iter_idx_name,
+            ctx=gast.Store(),
+            annotation=None,
+            type_comment=None),
+                              op=gast.Add(),
+                              value=step_node)
 
     def _build_assign_var_slice_node(self):
         var_slice_str = "{}[{}]".format(
@@ -1293,15 +1280,12 @@ def _build_assign_var_slice_node(self):
         return target_node, assign_node
 
     def _build_enum_increase_node(self):
-        return gast.AugAssign(
-            target=gast.Name(
-                id=self.enum_idx_name,
-                ctx=gast.Store(),
-                annotation=None,
-                type_comment=None),
-            op=gast.Add(),
-            value=gast.Constant(
-                value=1, kind=None))
+        return gast.AugAssign(target=gast.Name(id=self.enum_idx_name,
+                                               ctx=gast.Store(),
+                                               annotation=None,
+                                               type_comment=None),
+                              op=gast.Add(),
+                              value=gast.Constant(value=1, kind=None))
 
     def _get_iter_var_name(self):
         if self.is_for_range_iter():
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py
index 7ce5aede4995d..66885536ae46f 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py
@@ -65,15 +65,14 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0):
         if shape[i] is None:
             shape[i] = -1
 
-    return helper.create_global_variable(
-        name=name,
-        shape=shape,
-        dtype=dtype,
-        type=core.VarDesc.VarType.LOD_TENSOR,
-        stop_gradient=True,
-        lod_level=lod_level,
-        is_data=True,
-        need_check_feed=False)
+    return helper.create_global_variable(name=name,
+                                         shape=shape,
+                                         dtype=dtype,
+                                         type=core.VarDesc.VarType.LOD_TENSOR,
+                                         stop_gradient=True,
+                                         lod_level=lod_level,
+                                         is_data=True,
+                                         need_check_feed=False)
 
 
 def to_static_variable_gast_node(name):
diff --git a/python/paddle/fluid/dygraph/inplace_utils.py b/python/paddle/fluid/dygraph/inplace_utils.py
index 5fa38c9d5f0fb..14e875b8b06c4 100644
--- a/python/paddle/fluid/dygraph/inplace_utils.py
+++ b/python/paddle/fluid/dygraph/inplace_utils.py
@@ -23,12 +23,13 @@
 # in dygraph mode. If static mode is used, the inplace mechanism will not be used, and the static method
 # of the original API will be called.
 def _inplace_apis_in_dygraph_only_(func):
+
     def __impl__(*args, **kwargs):
         if not _non_static_mode():
             origin_api_name = func.__name__[:-1]
             warnings.warn(
-                "In static mode, {}() is the same as {}() and does not perform inplace operation.".
-                format(func.__name__, origin_api_name))
+                "In static mode, {}() is the same as {}() and does not perform inplace operation."
+                .format(func.__name__, origin_api_name))
             origin_func = "{}.{}".format(func.__module__, origin_api_name)
             return eval(origin_func)(*args, **kwargs)
         return func(*args, **kwargs)
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index f10b652220214..a778cc3a1c688 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -206,8 +206,8 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None):
             is_double_grad_var = "@GRAD" in name_old
             has_double_grad = has_double_grad or is_double_grad_var
             should_rename = (include is None or name_old in include) and (
-                exclude is None or
-                name_old not in exclude) and not is_double_grad_var
+                exclude is None
+                or name_old not in exclude) and not is_double_grad_var
             if should_rename:
                 temp_name = name_old.split('_')
                 if len(temp_name) > 1 and temp_name[-1].isnumeric():
@@ -223,8 +223,8 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None):
             else:
                 name_new = name_old
             if name_old != name_new:
-                cur_block._rename_var(
-                    cpt.to_bytes(name_old), cpt.to_bytes(name_new))
+                cur_block._rename_var(cpt.to_bytes(name_old),
+                                      cpt.to_bytes(name_new))
             if not is_double_grad_var:
                 dict_rename_var_old_new[name_old] = name_new
                 dict_rename_var_new_old[name_new] = name_old
@@ -244,8 +244,8 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None):
         for var_name in double_grad_rename_dict:
             dict_rename_var_old_new[var_name] = double_grad_rename_dict[
                 var_name]
-            dict_rename_var_new_old[double_grad_rename_dict[
-                var_name]] = var_name
+            dict_rename_var_new_old[
+                double_grad_rename_dict[var_name]] = var_name
 
     # Rename on program desc
     for b_idx in six.moves.range(program_desc.num_blocks()):
@@ -254,16 +254,15 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None):
             op = cur_block.op(op_idx)
             for input_arg_name in op.input_arg_names():
                 if input_arg_name in dict_rename_var_old_new:
-                    if input_arg_name != dict_rename_var_old_new[
-                            input_arg_name]:
+                    if input_arg_name != dict_rename_var_old_new[input_arg_name]:
                         op._rename_input(
                             input_arg_name,
                             dict_rename_var_old_new[input_arg_name])
                         if cur_block.has_var(cpt.to_bytes(input_arg_name)):
                             cur_block._rename_var(
                                 cpt.to_bytes(input_arg_name),
-                                cpt.to_bytes(dict_rename_var_old_new[
-                                    input_arg_name]))
+                                cpt.to_bytes(
+                                    dict_rename_var_old_new[input_arg_name]))
             for output_arg_name in op.output_arg_names():
                 if output_arg_name in dict_rename_var_old_new:
                     if output_arg_name != dict_rename_var_old_new[
@@ -274,8 +273,8 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None):
                         if cur_block.has_var(cpt.to_bytes(output_arg_name)):
                             cur_block._rename_var(
                                 cpt.to_bytes(output_arg_name),
-                                cpt.to_bytes(dict_rename_var_old_new[
-                                    output_arg_name]))
+                                cpt.to_bytes(
+                                    dict_rename_var_old_new[output_arg_name]))
     program_desc.flush()
     return dict_rename_var_new_old, dict_rename_var_old_new
 
@@ -364,8 +363,8 @@ def scope(self):
     def _preprocess(self, program_desc):
         # rename persistable variables of 'program_desc'
         list_persistable_var = _get_persistable_var_names(program_desc)
-        rename_new_old_dict, _ = _rename_var_program_desc(program_desc,
-                                                          list_persistable_var)
+        rename_new_old_dict, _ = _rename_var_program_desc(
+            program_desc, list_persistable_var)
         # 1. Prune original program
         # remove feed, fetch and scale-1 op, remove op_callstack attr
         ops_to_remove = []
@@ -412,16 +411,16 @@ def _preprocess(self, program_desc):
         # 3. Output processing, add scale for outputs
         tmp_program = _build_program_by_desc(program_desc)
         # NOTE: [why need append scale for outputs]
-        # When dealing with some more complex pre-training models, there 
-        # will be situations where the pre-training model has multiple 
-        # fetch outputs. In the scenario of multiple fetch outputs, 
-        # there is a special case where multiple outputs of the model 
-        # may be on the same branch. According to the user's subsequent 
+        # When dealing with some more complex pre-training models, there
+        # will be situations where the pre-training model has multiple
+        # fetch outputs. In the scenario of multiple fetch outputs,
+        # there is a special case where multiple outputs of the model
+        # may be on the same branch. According to the user's subsequent
         # use, multiple outputs may be associated with multiple branches.
-        # These subsequent operations are added in TranslatedLayer is 
-        # agnostic during initialization, which results in subsequent 
-        # gradient accumulation operations that are required on the 
-        # output node in the middle of the branch will not be performed, 
+        # These subsequent operations are added in TranslatedLayer is
+        # agnostic during initialization, which results in subsequent
+        # gradient accumulation operations that are required on the
+        # output node in the middle of the branch will not be performed,
         # resulting in error, details see pull request:
         # [https://github.com/PaddlePaddle/Paddle/pull/24627]
         self._append_scale_to_output(tmp_program)
@@ -429,15 +428,15 @@ def _preprocess(self, program_desc):
         # 4. Persistable vars processing
         # - append loaded suffix to persistable vars
         # NOTE: [why need to append suffix to persistable vars]
-        # Dygraph and static graph mode use the same naming mechanism. 
-        # If users want to load the model fine-tune, it is possible 
-        # to add the existing Layer in the loaded model to enhance 
-        # the network. For example, the original saved model has linear, 
-        # and later after loading, a new linear is added. At this time, 
-        # there will be a problem of duplicate names, so here is unified 
+        # Dygraph and static graph mode use the same naming mechanism.
+        # If users want to load the model fine-tune, it is possible
+        # to add the existing Layer in the loaded model to enhance
+        # the network. For example, the original saved model has linear,
+        # and later after loading, a new linear is added. At this time,
+        # there will be a problem of duplicate names, so here is unified
         # to add the LOADED suffix to the parameters of the model loaded
-        self._suffix_varname_dict = _get_loaded_var_new_old(program_desc,
-                                                            rename_new_old_dict)
+        self._suffix_varname_dict = _get_loaded_var_new_old(
+            program_desc, rename_new_old_dict)
 
         # - get persistable var
         self._persistable_names = _get_persistable_var_names(program_desc)
@@ -451,8 +450,9 @@ def _append_scale_to_output(self, program):
         with framework.program_guard(program):
             for i, out in enumerate(self._output_descs):
                 var = program.global_block().var(out.name())
-                var = nn.scale(
-                    var, 1., name="translated_layer/scale_{}".format(i))
+                var = nn.scale(var,
+                               1.,
+                               name="translated_layer/scale_{}".format(i))
                 scale_output_vars.append(var)
         # 2. update output names & descs
         for i, var in enumerate(scale_output_vars):
@@ -468,7 +468,7 @@ def _append_backward_desc(self, infer_program_desc):
         # 2. prepare program and related var
         # NOTE: To reuse backward interfaces, build Program firstly.
         # Originally, there is no need to build a program, but need to almost
-        # rewrite a series of methods for append_backward for program_desc. 
+        # rewrite a series of methods for append_backward for program_desc.
         # Therefore, in order to reuse the method of backward.py, build the program here.
         program = _build_program_by_desc(program_desc_copy)
         # 3. Add the outputs which is only used for training and not saved in
@@ -498,7 +498,7 @@ def _append_backward_desc(self, infer_program_desc):
 
 
 # [ TranslatedLayer : Run program in imperative mode ]
-# 
+#
 # DESIGN IDEA: using an special operator `RunProgram`, execute program inside operator.
 #
 # Op's Inputs:
@@ -506,21 +506,21 @@ def _append_backward_desc(self, infer_program_desc):
 #   - the necessary parameters of the network
 # Op's Outputs:
 #   - the output variable of fetch
-# 
+#
 # This op receives a complete program desc, internally creates scope
 # and executor, executes this program. Key points:
 #
-# 1. Data Sharing: 
+# 1. Data Sharing:
 #   The varBase of the dynamic graph is not in the scope, so before the op
 #   executes the program internally, create persistent variables with the
 #   same name as feed, parameters, and fetch in the scope, and share the
 #   LoDTensor of the op input.
-# 
+#
 # 2. Forward and Backward Separation:
 #   Because the dynamic graph op performs the forward and backward separately,
 #   in the forward op RunProgram, we only execute the forward part of whole program,
 #   and in the backward op RunProgramGrad, we execute the backward part of program.
-#   We can not separate the program into forward and backward part, which will 
+#   We cannot separate the program into forward and backward parts, which will
 #   make some control flow execution logic wrong.
 
 
@@ -537,26 +537,23 @@ def _load_persistable_vars_by_program(model_path,
         if _is_parameter(each_var, program_holder.infer_program):
             # create output varbase
             if framework._in_eager_without_dygraph_check():
-                new_var = framework.EagerParamBase(
-                    shape=each_var.shape(),
-                    dtype=each_var.dtype(),
-                    name=each_var.name(),
-                    type=each_var.type(),
-                    persistable=True)
+                new_var = framework.EagerParamBase(shape=each_var.shape(),
+                                                   dtype=each_var.dtype(),
+                                                   name=each_var.name(),
+                                                   type=each_var.type(),
+                                                   persistable=True)
             else:
-                new_var = framework.ParamBase(
-                    shape=each_var.shape(),
-                    dtype=each_var.dtype(),
-                    name=each_var.name(),
-                    type=each_var.type(),
-                    persistable=True)
+                new_var = framework.ParamBase(shape=each_var.shape(),
+                                              dtype=each_var.dtype(),
+                                              name=each_var.name(),
+                                              type=each_var.type(),
+                                              persistable=True)
         else:
-            new_var = framework._varbase_creator(
-                type=each_var.type(),
-                name=each_var.name(),
-                shape=each_var.shape(),
-                dtype=each_var.dtype(),
-                persistable=True)
+            new_var = framework._varbase_creator(type=each_var.type(),
+                                                 name=each_var.name(),
+                                                 shape=each_var.shape(),
+                                                 dtype=each_var.dtype(),
+                                                 persistable=True)
         if params_filename is None:
             framework._dygraph_tracer().trace_op(
                 type='load',
@@ -588,7 +585,7 @@ def _load_persistable_vars_by_program(model_path,
             param.stop_gradient = False
 
     # NOTE: [Recovery stop gradient information based on the program]
-    # After loading the model, the stop_gradient information 
+    # After loading the model, the stop_gradient information
     # of the original variable is lost, but if a parameter does not
     # have a corresponding @GRAD variable in the backward program,
     # it can be said that it is also stop_gradient
@@ -617,7 +614,7 @@ def _load_persistable_vars(model_path, var_info_path, program_holder,
 
     # NOTE(chenweihang): we need load persistable vars based the program,
     # because the program may be pruned when `save_inference_model`, some
-    # var in `extra_var_info` may have been pruned 
+    # var in `extra_var_info` may have been pruned
     for name in sorted(inv_suffix_varname_dict):
         if name not in extra_var_info:
             raise RuntimeError(
@@ -646,8 +643,8 @@ def _load_persistable_vars(model_path, var_info_path, program_holder,
                     name=new_name,
                     persistable=True)
         else:
-            new_var = framework._varbase_creator(
-                name=new_name, persistable=True)
+            new_var = framework._varbase_creator(name=new_name,
+                                                 persistable=True)
 
         new_var.stop_gradient = extra_var_info[name]['stop_gradient']
         load_var_dict[new_name] = new_var
@@ -660,11 +657,10 @@ def _load_persistable_vars(model_path, var_info_path, program_holder,
         if len(extra_var_info) != 0:
             raise ValueError("The model to be loaded is incomplete.")
     else:
-        framework._dygraph_tracer().trace_op(
-            type='load_combine',
-            inputs={},
-            outputs={'Out': load_var_list},
-            attrs={'file_path': var_file_path})
+        framework._dygraph_tracer().trace_op(type='load_combine',
+                                             inputs={},
+                                             outputs={'Out': load_var_list},
+                                             attrs={'file_path': var_file_path})
 
     return load_var_dict
 
@@ -694,8 +690,9 @@ def _construct_program_holders(model_path, model_filename=None):
                 model_file_path = os.path.join(model_path, model_filename)
             elif filename.endswith(INFER_MODEL_SUFFIX) and filename.startswith(
                     model_name):
-                parsing_names = filename[len(model_name):-len(
-                    INFER_MODEL_SUFFIX) + 1].split('.')
+                parsing_names = filename[len(model_name
+                                             ):-len(INFER_MODEL_SUFFIX) +
+                                         1].split('.')
                 if len(parsing_names) == 3 and len(parsing_names[1]) > 0:
                     func_name = parsing_names[1]
                     model_file_path = os.path.join(model_path, filename)
@@ -737,8 +734,9 @@ def _construct_params_and_buffers(model_path,
         for file_name in os.listdir(model_path):
             if file_name.startswith(model_name) and file_name.endswith(
                     INFER_PARAMS_SUFFIX):
-                parsing_names = file_name[len(model_name):-len(
-                    INFER_PARAMS_SUFFIX) + 1].split('.')
+                parsing_names = file_name[len(model_name
+                                              ):-len(INFER_PARAMS_SUFFIX) +
+                                          1].split('.')
                 if len(parsing_names) == 3 and len(parsing_names[1]) > 0:
                     func_name = parsing_names[1]
                 else:
@@ -747,14 +745,15 @@ def _construct_params_and_buffers(model_path,
                 continue
             var_info_path = os.path.join(model_path, var_info_filename)
             var_dict.update(
-                _load_persistable_vars(model_path, var_info_path, programs[
-                    func_name], file_name))
+                _load_persistable_vars(model_path, var_info_path,
+                                       programs[func_name], file_name))
     elif params_filename is not None and not os.path.exists(params_path):
         # When saving XX, there is only '*.pdmodel'
         return dict()
     else:
-        var_dict = _load_persistable_vars_by_program(
-            model_path, programs['forward'], params_filename)
+        var_dict = _load_persistable_vars_by_program(model_path,
+                                                     programs['forward'],
+                                                     params_filename)
 
     if not append_suffix:
         var_dict = _remove_varname_suffix(var_dict, programs['forward'])
@@ -796,15 +795,14 @@ def _run_dygraph(instance, input, program_holder):
                     place=framework._current_expected_place(),
                     zero_copy=True)
             else:
-                var = core.VarBase(
-                    value=value,
-                    name=program_holder.input_descs[i].name(),
-                    persistable=False,
-                    place=framework._current_expected_place(),
-                    zero_copy=True)
+                var = core.VarBase(value=value,
+                                   name=program_holder.input_descs[i].name(),
+                                   persistable=False,
+                                   place=framework._current_expected_place(),
+                                   zero_copy=True)
         else:
             var = value
-            # NOTE: we changed var name here, 
+            # NOTE: we changed var name here,
             # but it may be an important name set by user
             var.name = program_holder.input_descs[i].name()
         input_vars.append(var)
@@ -828,15 +826,13 @@ def _run_dygraph(instance, input, program_holder):
     output_vars = []
     for var_desc in program_holder.output_descs:
         if framework._in_eager_without_dygraph_check():
-            var = core.eager.Tensor(
-                dtype=var_desc.dtype(),
-                dims=var_desc.shape(),
-                name=var_desc.name(),
-                type=var_desc.type(),
-                persistable=False)
+            var = core.eager.Tensor(dtype=var_desc.dtype(),
+                                    dims=var_desc.shape(),
+                                    name=var_desc.name(),
+                                    type=var_desc.type(),
+                                    persistable=False)
         else:
-            var = core.VarBase(var_desc.dtype(),
-                               var_desc.shape(),
+            var = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                var_desc.name(), var_desc.type(), False)
         output_vars.append(var)
 
@@ -852,15 +848,13 @@ def _run_dygraph(instance, input, program_holder):
     double_grad_vars = []
     for var_desc in program_holder.double_grad_descs:
         if framework._in_eager_without_dygraph_check():
-            var = core.eager.Tensor(
-                dtype=var_desc.dtype(),
-                dims=var_desc.shape(),
-                name=var_desc.name(),
-                type=var_desc.type(),
-                persistable=False)
+            var = core.eager.Tensor(dtype=var_desc.dtype(),
+                                    dims=var_desc.shape(),
+                                    name=var_desc.name(),
+                                    type=var_desc.type(),
+                                    persistable=False)
         else:
-            var = core.VarBase(var_desc.dtype(),
-                               var_desc.shape(),
+            var = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                var_desc.name(), var_desc.type(), False)
         double_grad_vars.append(var)
 
@@ -870,11 +864,9 @@ def _run_dygraph(instance, input, program_holder):
     attrs = ('global_block', trace_program.block(0), 'start_op_index', 0,
              'end_op_index', end_op_index, 'is_test', instance._is_test,
              'program_id', _hash_with_id(trace_program, instance))
-    _C_ops.run_program(
-        _valid_vars(input_vars),
-        _valid_vars(persistable_vars),
-        _valid_vars(output_vars), tmp_scope_vec,
-        _valid_vars(double_grad_vars), *attrs)
+    _C_ops.run_program(_valid_vars(input_vars), _valid_vars(persistable_vars),
+                       _valid_vars(output_vars), tmp_scope_vec,
+                       _valid_vars(double_grad_vars), None, *attrs)
     # NOTE: [ why need set param's gradient type here ]
     # if user set sparse gradient mode, the param's gradient
     # will be SelectedRows, not LoDTensor. But tracer will just
@@ -885,7 +877,7 @@ def _run_dygraph(instance, input, program_holder):
     for persistable_var in persistable_vars:
         grad_var_name = persistable_var.name + core.grad_var_suffix()
         grad_var = trace_program.block(0).find_var(cpt.to_bytes(grad_var_name))
-        # NOTE: cannot find var desc maybe not problem, 
+        # NOTE: failing to find the var desc may not be a problem,
         # such as in batch_norm
         if grad_var is None:
             continue
@@ -902,8 +894,8 @@ def _run_dygraph(instance, input, program_holder):
 
 def drop_scope_if_no_grad(instance, scope_vec):
     tracer = framework._dygraph_tracer()
-    scope = scope_vec.value().get_scope() if isinstance(scope_vec, (
-        core.VarBase)) else scope_vec[0]
+    scope = scope_vec.value().get_scope() if isinstance(
+        scope_vec, (core.VarBase)) else scope_vec[0]
     if (not instance._is_test) and (not tracer._has_grad):
         scope.drop_kids()
 
@@ -968,10 +960,9 @@ def _append_block(dest_program,
     origin_block_idx = dest_program.current_block_idx
     param_var_names = _collect_current_and_parent_var(dest_program,
                                                       origin_block_idx)
-    append_var_from_block_desc_static(
-        dest_program.block(origin_block_idx),
-        src_program_desc.block(0),
-        exclude=param_var_names)
+    append_var_from_block_desc_static(dest_program.block(origin_block_idx),
+                                      src_program_desc.block(0),
+                                      exclude=param_var_names)
 
     name_inp_desc = [inp.name() for inp in program_holder.input_descs]
     input_names = [inp.name for inp in input_variables]
@@ -1002,10 +993,11 @@ def _append_block(dest_program,
             else:
                 parent_idx = origin_block_idx
             dest_block = dest_program._create_block(parent_idx=parent_idx)
-            append_var_from_block_desc_static(
-                dest_block, src_block, exclude=param_var_names)
-            append_ops += append_op_from_block_desc_static(dest_block,
-                                                           src_block)
+            append_var_from_block_desc_static(dest_block,
+                                              src_block,
+                                              exclude=param_var_names)
+            append_ops += append_op_from_block_desc_static(
+                dest_block, src_block)
 
     dest_program._sync_with_cpp()
     for op in append_ops:
@@ -1072,13 +1064,12 @@ def append_op_from_desc_static(block, op_desc):
     op_type = op_desc.type()
     op_append = block.desc.append_op()
     op_append.copy_from(op_desc)
-    op = framework.Operator(
-        block=block,
-        desc=op_append,
-        type=op_type,
-        inputs=None,
-        outputs=None,
-        attrs=None)
+    op = framework.Operator(block=block,
+                            desc=op_append,
+                            type=op_type,
+                            inputs=None,
+                            outputs=None,
+                            attrs=None)
     block.ops.append(op)
     return op
 
@@ -1298,8 +1289,8 @@ def _construct(model_path, configs=None):
         programs = _construct_program_holders(model_path, model_filename)
 
         # 2. load layer parameters & buffers
-        persistable_vars = _construct_params_and_buffers(model_path, programs,
-                                                         params_filename)
+        persistable_vars = _construct_params_and_buffers(
+            model_path, programs, params_filename)
 
         # 3. construct TranslatedLayer object
         translated_layer = TranslatedLayer(programs, persistable_vars)
@@ -1310,9 +1301,10 @@ def _construct(model_path, configs=None):
                 translated_layer._input_args_names = [
                     ins.name() for ins in program_holder.input_descs
                 ]
-            setattr(TranslatedLayer, method_name,
-                    TranslatedLayer._execution_method_creator(method_name,
-                                                              program_holder))
+            setattr(
+                TranslatedLayer, method_name,
+                TranslatedLayer._execution_method_creator(
+                    method_name, program_holder))
 
         # 5. set TranslatedLayer's default mode to eval
         translated_layer.eval()
@@ -1321,6 +1313,7 @@ def _construct(model_path, configs=None):
 
     @staticmethod
     def _execution_method_creator(method_name, program_holder):
+
         def __i_m_p_l__(self, *input):
             program_holder = self._program_holder_dict[__i_m_p_l__.__name__]
             # When using jit.save, it runs in static graph mode.
@@ -1457,10 +1450,9 @@ def _input_spec(self, method_name='forward'):
         # 2. build input spec by input desc
         input_spec = []
         for var_desc in program_holder.input_descs:
-            spec = paddle.static.InputSpec(
-                shape=var_desc.shape(),
-                dtype=var_desc.dtype(),
-                name=var_desc.name())
+            spec = paddle.static.InputSpec(shape=var_desc.shape(),
+                                           dtype=var_desc.dtype(),
+                                           name=var_desc.name())
             input_spec.append(spec)
 
         return input_spec
@@ -1472,13 +1464,12 @@ def _output_spec(self, method_name='forward'):
         # 2. build output spec by output desc
         output_spec = []
         for var_desc in program_holder.output_descs:
-            # NOTE(chenweihang): InputSpec describes a tensor, not just input. 
-            # Maybe the name is not good enough. Here we use InputSpec to 
+            # NOTE(chenweihang): InputSpec describes a tensor, not just input.
+            # Maybe the name is not good enough. Here we use InputSpec to
             # construct the description of Output tensor
-            spec = paddle.static.InputSpec(
-                shape=var_desc.shape(),
-                dtype=var_desc.dtype(),
-                name=var_desc.name())
+            spec = paddle.static.InputSpec(shape=var_desc.shape(),
+                                           dtype=var_desc.dtype(),
+                                           name=var_desc.name())
             output_spec.append(spec)
 
         return output_spec
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index e0e259215c509..b6847efab1d68 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -64,8 +64,8 @@ def _extract_vars(inputs, result_list, err_tag='inputs'):
             _extract_vars(var, result_list, err_tag)
     else:
         raise TypeError(
-            "The type of 'each element of {}' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received {}.".
-            format(err_tag, type(inputs)))
+            "The type of 'each element of {}' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received {}."
+            .format(err_tag, type(inputs)))
 
 
 def extract_vars(inputs, err_tag='inputs'):
@@ -211,20 +211,19 @@ def decorated(python_func):
         _, python_func = unwrap_decorators(python_func)
 
         # Step 2. copy some attributes from original python function.
-        static_layer = copy_decorator_attrs(
-            original_func=python_func,
-            decorated_obj=StaticFunction(
-                function=python_func,
-                input_spec=input_spec,
-                build_strategy=build_strategy))
+        static_layer = copy_decorator_attrs(original_func=python_func,
+                                            decorated_obj=StaticFunction(
+                                                function=python_func,
+                                                input_spec=input_spec,
+                                                build_strategy=build_strategy))
 
         return static_layer
 
     build_strategy = build_strategy or BuildStrategy()
     if not isinstance(build_strategy, BuildStrategy):
         raise TypeError(
-            "Required type(build_strategy) shall be `paddle.static.BuildStrategy`, but received {}".
-            format(type(build_strategy).__name__))
+            "Required type(build_strategy) shall be `paddle.static.BuildStrategy`, but received {}"
+            .format(type(build_strategy).__name__))
 
     # for usage: `declarative(foo, ...)`
     if function is not None:
@@ -232,8 +231,8 @@ def decorated(python_func):
             if isinstance(function.forward, StaticFunction):
                 class_name = function.__class__.__name__
                 logging_utils.warn(
-                    "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.".
-                    format(class_name))
+                    "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one."
+                    .format(class_name))
             function.forward = decorated(function.forward)
             return function
         else:
@@ -284,6 +283,7 @@ def func(x):
 
 
 class _SaveLoadConfig(object):
+
     def __init__(self):
         self._output_spec = None
         self._model_filename = None
@@ -622,6 +622,7 @@ def _remove_save_pre_hook(hook):
 
 
 def _run_save_pre_hooks(func):
+
     def wrapper(layer, path, input_spec=None, **configs):
         global _save_pre_hooks
         for hook in _save_pre_hooks:
@@ -775,8 +776,8 @@ def fun(inputs):
             "The paddle.jit.save doesn't work when setting ProgramTranslator.enable to False."
         )
 
-    if not (isinstance(layer, Layer) or inspect.isfunction(layer) or isinstance(
-            layer, StaticFunction)):
+    if not (isinstance(layer, Layer) or inspect.isfunction(layer)
+            or isinstance(layer, StaticFunction)):
         raise TypeError(
             "The input of paddle.jit.save should be 'Layer' or 'Function', but received input type is %s."
             % type(layer))
@@ -837,7 +838,7 @@ def fun(inputs):
     # parse configs
     configs = _parse_save_configs(configs)
     # whether outermost layer has pre/post hook, if does, we need also save
-    # these operators in program. 
+    # these operators in program.
     with_hook = configs.with_hook
 
     scope = core.Scope()
@@ -848,7 +849,9 @@ def fun(inputs):
             with_hook = True
     else:
         # layer is function
-        functions = [layer, ]
+        functions = [
+            layer,
+        ]
     for attr_func in functions:
         if isinstance(layer, Layer):
             static_func = getattr(inner_layer, attr_func, None)
@@ -862,8 +865,8 @@ def fun(inputs):
                 if inner_input_spec:
                     inner_input_spec = pack_sequence_as(input_spec,
                                                         inner_input_spec)
-                static_forward = declarative(
-                    inner_layer.forward, input_spec=inner_input_spec)
+                static_forward = declarative(inner_layer.forward,
+                                             input_spec=inner_input_spec)
                 concrete_program = static_forward.concrete_program_specify_input_spec(
                     with_hook=with_hook)
                 # the input_spec has been used in declarative, which is equal to
@@ -882,14 +885,14 @@ def fun(inputs):
                 if inner_input_spec:
                     inner_input_spec = pack_sequence_as(input_spec,
                                                         inner_input_spec)
-                static_function = declarative(
-                    attr_func, input_spec=inner_input_spec)
+                static_function = declarative(attr_func,
+                                              input_spec=inner_input_spec)
                 concrete_program = static_function.concrete_program
 
                 if static_function._class_instance is None:
                     warnings.warn(
-                        '`jit.save` will only save the `Program`, not the parameters. If you have to save the parameters, please make sure that {} is a member function of `paddle.nn.Layer` and the saved parameters are in `state_dict`'.
-                        format(layer))
+                        '`jit.save` will only save the `Program`, not the parameters. If you have to save the parameters, please make sure that {} is a member function of `paddle.nn.Layer` and the saved parameters are in `state_dict`'
+                        .format(layer))
 
         dygraph_state_dict = None
         if isinstance(inner_layer, Layer):
@@ -922,8 +925,8 @@ def fun(inputs):
                         param_or_buffer_tensor = scope.var(
                             param_or_buffer.name).get_tensor()
                         #src_tensor = param_or_buffer.value().get_tensor()
-                        src_tensor = state_var_dict[param_or_buffer.name].value(
-                        ).get_tensor()
+                        src_tensor = state_var_dict[
+                            param_or_buffer.name].value().get_tensor()
                         param_or_buffer_tensor._share_data_with(src_tensor)
                     # record var info
                     if param_or_buffer.name not in extra_var_info:
@@ -1534,14 +1537,16 @@ def forward(self, input):
                    "fluid.dygraph.jit.TracedLayer.save_inference_model")
         if isinstance(feed, list):
             for f in feed:
-                check_type(f, "each element of feed", int,
-                           "fluid.dygraph.jit.TracedLayer.save_inference_model")
+                check_type(
+                    f, "each element of feed", int,
+                    "fluid.dygraph.jit.TracedLayer.save_inference_model")
         check_type(fetch, "fetch", (type(None), list),
                    "fluid.dygraph.jit.TracedLayer.save_inference_model")
         if isinstance(fetch, list):
             for f in fetch:
-                check_type(f, "each element of fetch", int,
-                           "fluid.dygraph.jit.TracedLayer.save_inference_model")
+                check_type(
+                    f, "each element of fetch", int,
+                    "fluid.dygraph.jit.TracedLayer.save_inference_model")
         clip_extra = kwargs.get('clip_extra', False)
         # path check
         file_prefix = os.path.basename(path)
@@ -1575,12 +1580,11 @@ def get_feed_fetch(all_vars, partial_vars):
             model_filename = file_prefix + INFER_MODEL_SUFFIX
             params_filename = file_prefix + INFER_PARAMS_SUFFIX
 
-            save_inference_model(
-                dirname=dirname,
-                feeded_var_names=feeded_var_names,
-                target_vars=target_vars,
-                executor=self._exe,
-                main_program=self._program.clone(),
-                model_filename=model_filename,
-                params_filename=params_filename,
-                clip_extra=clip_extra)
+            save_inference_model(dirname=dirname,
+                                 feeded_var_names=feeded_var_names,
+                                 target_vars=target_vars,
+                                 executor=self._exe,
+                                 main_program=self._program.clone(),
+                                 model_filename=model_filename,
+                                 params_filename=params_filename,
+                                 clip_extra=clip_extra)
diff --git a/python/paddle/fluid/dygraph/layer_hooks.py b/python/paddle/fluid/dygraph/layer_hooks.py
index f93ba569807a8..68c3d463e5deb 100644
--- a/python/paddle/fluid/dygraph/layer_hooks.py
+++ b/python/paddle/fluid/dygraph/layer_hooks.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -37,14 +37,14 @@ def record_program_ops_pre_hook(layer, inputs):
     """
     if not _non_static_mode():
         if layer._op_recorder.start < 0:
-            layer._op_recorder.start = len(default_main_program().current_block(
-            ).ops)
+            layer._op_recorder.start = len(
+                default_main_program().current_block().ops)
             layer._op_recorder.is_valid = True
         else:
             layer._op_recorder.is_valid = False
             warnings.warn(
-                "{} has recorded the op information before. Please check whether you call this layer twice.".
-                format(layer._full_name))
+                "{} has recorded the op information before. Please check whether you call this layer twice."
+                .format(layer._full_name))
 
     return None
 
diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py
index 5da9013fb7e14..394df321811d8 100644
--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ b/python/paddle/fluid/dygraph/layer_object_helper.py
@@ -25,6 +25,7 @@
 
 
 class LayerObjectHelper(LayerHelperBase):
+
     def __init__(self, name):
         super(LayerObjectHelper, self).__init__(name, layer_type=name)
 
@@ -169,11 +170,10 @@ def append_activation(self, input_var, act=None, use_cudnn=None):
             return res
         else:
             tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
-            self.append_op(
-                type=act_type,
-                inputs={"X": [input_var]},
-                outputs={"Out": [tmp]},
-                attrs=act)
+            self.append_op(type=act_type,
+                           inputs={"X": [input_var]},
+                           outputs={"Out": [tmp]},
+                           attrs=act)
             return tmp
 
     def is_instance(self, param, cls):
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 088fed03c3595..b67f7d0a91fee 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -423,10 +423,9 @@ def forward(self, input):
         return self._helper.create_parameter(temp_attr, shape, dtype, is_bias,
                                              default_initializer)
 
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.nn.Layer.create_tensor",
-        reason="New api in create_tensor, easier to use.")
+    @deprecated(since="2.0.0",
+                update_to="paddle.nn.Layer.create_tensor",
+                reason="New api in create_tensor, easier to use.")
     def create_variable(self, name=None, persistable=None, dtype=None):
         """
 
@@ -541,8 +540,7 @@ def parameters(self, include_sublayers=True):
 
         """
         ret = [
-            param
-            for _, param in self.named_parameters(
+            param for _, param in self.named_parameters(
                 include_sublayers=include_sublayers)
         ]
         return ret
@@ -658,8 +656,8 @@ def named_parameters(self, prefix='', include_sublayers=True):
         """
         params_set = set()
         named_sublayers = self.named_sublayers(
-            prefix=prefix,
-            include_self=True) if include_sublayers else zip([prefix], [self])
+            prefix=prefix, include_self=True) if include_sublayers else zip(
+                [prefix], [self])
         for layer_prefix, sublayer in named_sublayers:
             params = sublayer._parameters.items()
             for key, param in params:
@@ -703,9 +701,9 @@ def named_sublayers(self, prefix='', include_self=False, layers_set=None):
             if layer is None:
                 continue
             layer_prefix = prefix + ('.' if prefix else '') + key
-            for p, l in layer.named_sublayers(
-                    prefix=layer_prefix, include_self=True,
-                    layers_set=layers_set):
+            for p, l in layer.named_sublayers(prefix=layer_prefix,
+                                              include_self=True,
+                                              layers_set=layers_set):
                 yield p, l
 
     def register_buffer(self, name, tensor, persistable=True):
@@ -762,11 +760,11 @@ def register_buffer(self, name, tensor, persistable=True):
             raise KeyError("The name of buffer can not be empty.")
         elif hasattr(self, name) and name not in self._buffers:
             raise KeyError("attribute '{}' already exists.".format(name))
-        elif tensor is not None and not (type(tensor) == core.VarBase or
-                                         type(tensor) == core.eager.Tensor):
+        elif tensor is not None and not (type(tensor) == core.VarBase
+                                         or type(tensor) == core.eager.Tensor):
             raise TypeError(
-                "The registered buffer should be a Paddle.Tensor, but received {}.".
-                format(type(tensor).__name__))
+                "The registered buffer should be a Paddle.Tensor, but received {}."
+                .format(type(tensor).__name__))
         else:
             self._buffers[name] = tensor
             if persistable:
@@ -799,8 +797,7 @@ def buffers(self, include_sublayers=True):
 
         """
         ret = [
-            buffer
-            for _, buffer in self.named_buffers(
+            buffer for _, buffer in self.named_buffers(
                 include_sublayers=include_sublayers)
         ]
         return ret
@@ -843,8 +840,8 @@ def named_buffers(self, prefix='', include_sublayers=True):
         """
         buffers_set = set()
         named_sublayers = self.named_sublayers(
-            prefix=prefix,
-            include_self=True) if include_sublayers else zip([prefix], [self])
+            prefix=prefix, include_self=True) if include_sublayers else zip(
+                [prefix], [self])
         for layer_prefix, sublayer in named_sublayers:
             buffers = sublayer._buffers.items()
             for key, buffer in buffers:
@@ -1034,8 +1031,8 @@ def forward(self, input):
         elif parameter is not None and not isinstance(parameter,
                                                       framework.Parameter):
             raise TypeError(
-                "The parameter to be added should be a Parameter, but received {}.".
-                format(type(parameter).__name__))
+                "The parameter to be added should be a Parameter, but received {}."
+                .format(type(parameter).__name__))
         else:
             if parameter is None:
                 self._parameters[name] = None
@@ -1072,8 +1069,9 @@ def is_already_registered(is_pre_hook):
             return already_registed
 
         if not isinstance(attrs, dict):
-            raise TypeError("attrs should be type(dict), but received {}".
-                            format(type(attrs).__name__))
+            raise TypeError(
+                "attrs should be type(dict), but received {}".format(
+                    type(attrs).__name__))
 
         # NOTE: Overwrite behavior for same key.
         self._customized_attrs.update(attrs)
@@ -1089,8 +1087,8 @@ def is_already_registered(is_pre_hook):
             post_hook_helper = self.register_forward_post_hook(
                 set_op_customized_attrs_post_hook)
             if len(self._forward_post_hooks) > 1:
-                self._forward_post_hooks.move_to_end(
-                    post_hook_helper._hook_id, last=False)
+                self._forward_post_hooks.move_to_end(post_hook_helper._hook_id,
+                                                     last=False)
 
             assert len(self._op_recorder.hooks) == 1
 
@@ -1123,6 +1121,7 @@ def __getattr__(self, name):
         return object.__getattribute__(self, name)
 
     def __setattr__(self, name, value):
+
         def _remove_if_exist(*dicts):
             for d in dicts:
                 if name in d:
@@ -1147,7 +1146,8 @@ def _remove_if_exist(*dicts):
             if value is not None:
                 raise TypeError(
                     "assignment to parameter '{}' should be of type Parameter or None, but got '{}'"
-                    .format(name, type(value).__name__))
+                    .format(name,
+                            type(value).__name__))
             params[name] = None
         else:
             layers = self.__dict__.get('_sub_layers', None)
@@ -1163,7 +1163,8 @@ def _remove_if_exist(*dicts):
                 if value is not None:
                     raise TypeError(
                         "assignment to sublayer '{}' should be of type Layer or None, but got '{}'"
-                        .format(name, type(value).__name__))
+                        .format(name,
+                                type(value).__name__))
                 layers[name] = None
             else:
                 _buffers = self.__dict__.get('_buffers', None)
@@ -1194,17 +1195,18 @@ def _remove_if_exist(*dicts):
                         if in_declarative_mode() and _buffers[name] is None:
                             raise RuntimeError(
                                 'In Dy2stat, self.{0} is a buffer and self.{0} is '
-                                'not allowed to be set to Variable when self.{0} is None.'.
-                                format(name))
-                        elif _buffers[name] is None or type(
-                                getattr(self, name)) == core.VarBase:
+                                'not allowed to be set to Variable when self.{0} is None.'
+                                .format(name))
+                        elif _buffers[name] is None or type(getattr(
+                                self, name)) == core.VarBase:
                             _buffers[name] = assign(value)
                         else:
                             assign(value, getattr(self, name))
                     elif value is not None:
                         raise TypeError(
                             "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'"
-                            .format(name, type(value).__name__))
+                            .format(name,
+                                    type(value).__name__))
                     else:
                         # Assigning None will remove the buffer, but if re-assign a new varBase to it,
                         # it will be remarked as a buffer with same `persistable` attribute.
@@ -1454,8 +1456,8 @@ def set_state_dict(self, state_dict, use_structured_name=True):
         def _check_match(key, param):
             state = state_dict.get(key, None)
             if state is None:
-                raise ValueError("{} is not found in the provided dict.".format(
-                    key))
+                raise ValueError(
+                    "{} is not found in the provided dict.".format(key))
             if (isinstance(state, dict) or isinstance(state, list)):
                 if (len(state) != len(param)):
                     raise ValueError("{} receieves the length of {}, "
@@ -1507,8 +1509,8 @@ def _set_var(var, ndarray):
             executor = Executor(_get_device())._default_executor
             # restore parameter states
             core._create_loaded_parameter(
-                [param for param, state in matched_param_state],
-                global_scope(), executor)
+                [param for param, state in matched_param_state], global_scope(),
+                executor)
             for param, state in matched_param_state:
                 _set_var(param, state)
 
@@ -1560,11 +1562,10 @@ def to(self, device=None, dtype=None, blocking=None):
                 #        [ 0.33960250,  0.96878713]])
 
         '''
-        return self._to_impl(
-            device=device,
-            dtype=dtype,
-            blocking=blocking,
-            include_sublayers=True)
+        return self._to_impl(device=device,
+                             dtype=dtype,
+                             blocking=blocking,
+                             include_sublayers=True)
 
     def _apply(self, func, device, dtype, blocking, include_sublayers=True):
         if include_sublayers:
diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index a6c1993dbbf03..4b9c50127f046 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -67,7 +67,7 @@ def create_lr_var(self, lr):
             persistable=False)
         return lr
 
-    # Note: If you want to change what optimizer.state_dict stores, just overwrite this functions, 
+    # Note: If you want to change what optimizer.state_dict stores, just overwrite this function,
     # "self.step_num" will be stored by default.
     def state_dict(self):
         """
@@ -107,8 +107,8 @@ def set_state_dict(self, state_dict):
                 self.__dict__[key] = state_dict[key]
             else:
                 raise RuntimeError(
-                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
-                    format(key))
+                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict"
+                    .format(key))
         if len(state_dict) > len(self.keys):
             warnings.warn(
                 "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
@@ -259,8 +259,8 @@ def step(self):
         div_res = self.create_lr_var(self.step_num / self.decay_steps)
         if self.staircase:
             div_res = layers.floor(div_res)
-        decayed_lr = self.learning_rate * layers.exp(-1 * self.decay_rate *
-                                                     div_res)
+        decayed_lr = self.learning_rate * layers.exp(
+            -1 * self.decay_rate * div_res)
 
         return decayed_lr
 
@@ -510,9 +510,9 @@ def step(self):
                 div_res = self.create_lr_var(1.0)
             tmp_decay_steps = self.decay_steps * div_res
         else:
-            tmp_step_num = self.create_lr_var(tmp_step_num
-                                              if tmp_step_num < self.decay_steps
-                                              else self.decay_steps)
+            tmp_step_num = self.create_lr_var(
+                tmp_step_num if tmp_step_num < self.decay_steps else self.
+                decay_steps)
 
         decayed_lr = (self.learning_rate - self.end_learning_rate) * \
             ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
@@ -639,8 +639,8 @@ def step(self):
         from .. import layers
         a = self.create_lr_var(self.step_num**-0.5)
         b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
-        lr_value = self.learning_rate * (self.d_model
-                                         **-0.5) * layers.elementwise_min(a, b)
+        lr_value = self.learning_rate * (self.d_model**
+                                         -0.5) * layers.elementwise_min(a, b)
         return lr_value
 
 
@@ -713,15 +713,15 @@ def __init__(self,
             learning_rate, int) or isinstance(learning_rate, LearningRateDecay)
         if not type_check:
             raise TypeError(
-                "the type of learning_rate should be [int, float or LearningRateDecay], the current type is {}".
-                format(learning_rate))
+                "the type of learning_rate should be [int, float or LearningRateDecay], the current type is {}"
+                .format(learning_rate))
         self.learning_rate = learning_rate
         self.warmup_steps = warmup_steps
         self.start_lr = start_lr
         assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
             end_lr, start_lr)
-        self.lr_ratio_before_warmup = (
-            float(end_lr) - float(start_lr)) / float(warmup_steps)
+        self.lr_ratio_before_warmup = (float(end_lr) -
+                                       float(start_lr)) / float(warmup_steps)
 
     def step(self):
         base_lr = self.learning_rate
@@ -913,15 +913,16 @@ def step(self, loss):
                 from .. import layers
                 self.cooldown_counter = self.cooldown
                 self.num_bad_epochs = 0
-                new_lr = layers.elementwise_max(self.learning_rate *
-                                                self.decay_rate, self.min_lr)
+                new_lr = layers.elementwise_max(
+                    self.learning_rate * self.decay_rate, self.min_lr)
                 if self.learning_rate - new_lr > self.eps:
                     if self.verbose:
                         old_lr = self.learning_rate.numpy()[0] if isinstance(
                             self.learning_rate,
                             Variable) else self.learning_rate
                         print('Epoch {}: reducing learning rate from {} to {}.'.
-                              format(self.epoch_num, old_lr, new_lr.numpy()[0]))
+                              format(self.epoch_num, old_lr,
+                                     new_lr.numpy()[0]))
                     self.learning_rate = new_lr
 
     def _is_better(self, current, best):
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index 8a19be640a7ff..13f11ea161bbe 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -33,10 +33,10 @@
     core.VarDesc.VarType.BOOL,
 ]
 
-# NOTE(chenweihang): We currently do not fully support the type promotion 
-# between tensors. Parting support here is because the interoperation of 
-# real and complex numbers in paddle quantum is very frequent, such as the 
-# binary operation between `float` and `complex64`, so we must support the 
+# NOTE(chenweihang): We currently do not fully support the type promotion
+# between tensors. Partial support here is because the interoperation of
+# real and complex numbers in paddle quantum is very frequent, such as the
+# binary operation between `float` and `complex64`, so we must support the
 # correct type promotion on the APIs paddle quantum used.
 # Now only check in dygraph (paddle quantum based dygraph)
 # Full type promotion support will need to be fully verified later.
@@ -200,6 +200,7 @@ def _binary_creator_(method_name,
                          reverse=False,
                          scalar_method=None,
                          call_final_api=False):
+
         def __impl__(self, other_var):
             # 1. scalar exists cases
             # we need combine the tensor.dtype and scalar.dtype, cast correct object
@@ -217,13 +218,13 @@ def __impl__(self, other_var):
                 other_var = float(other_var)
                 # division is a special case
                 # NOTE(chenweihang): because we cast tensor to float32 instead float64,
-                # the division result can only guarantee the numerical accuracy of 6 digits 
-                # after the decimal point. The result of numpy calculation is of float64 type, 
-                # so the calculation result here and the calculation result of numpy are 
+                # the division result can only guarantee the numerical accuracy of 6 digits
+                # after the decimal point. The result of numpy calculation is of float64 type,
+                # so the calculation result here and the calculation result of numpy are
                 # different after 6 decimal point. If necessary, we can also use float64 here.
                 # torch's behavior here is consistent with ours
-                if (op_type == "final_state_divide" or
-                        op_type == "elementwise_div"
+                if (op_type == "final_state_divide"
+                        or op_type == "elementwise_div"
                     ) and self.dtype in _supported_int_dtype_:
                     self = astype(self, 'float32')
                 # here use `scale` replace `elementwise` to get better performance
@@ -246,19 +247,20 @@ def __impl__(self, other_var):
                     other_var = paddle.to_tensor(other_var, dtype='complex64')
                 else:
                     if reverse:
-                        other_var = create_tensor(
-                            other_var, dtype=lhs_dtype, shape=self.shape)
+                        other_var = create_tensor(other_var,
+                                                  dtype=lhs_dtype,
+                                                  shape=self.shape)
                     else:
                         # add fill_op
-                        other_var = create_scalar(
-                            value=other_var, dtype=lhs_dtype)
+                        other_var = create_scalar(value=other_var,
+                                                  dtype=lhs_dtype)
 
             # 3. promote types or unify right var type to left var
             rhs_dtype = other_var.dtype
             if lhs_dtype != rhs_dtype:
                 if method_name in _supported_promote_complex_types_ and (
-                        lhs_dtype in _complex_dtypes or
-                        rhs_dtype in _complex_dtypes):
+                        lhs_dtype in _complex_dtypes
+                        or rhs_dtype in _complex_dtypes):
                     # only when lhs_dtype or rhs_dtype is complex type,
                     # the dtype will promote, in other cases, directly
                     # use lhs_dtype, this is consistent will original rule
@@ -270,8 +272,8 @@ def __impl__(self, other_var):
                         other_var, promote_dtype)
                 else:
                     warnings.warn(
-                        'The dtype of left and right variables are not the same, left dtype is {}, but right dtype is {}, the right dtype will convert to {}'.
-                        format(lhs_dtype, rhs_dtype, lhs_dtype))
+                        'The dtype of left and right variables are not the same, left dtype is {}, but right dtype is {}, the right dtype will convert to {}'
+                        .format(lhs_dtype, rhs_dtype, lhs_dtype))
                     other_var = astype(other_var, lhs_dtype)
 
             if reverse:
@@ -320,54 +322,66 @@ def __impl__(self, other_var):
         ('ndim', _ndim_),
         ('size', _size_),
         ('T', _T_),
-        ('__add__', _binary_creator_('__add__', 'final_state_add', False,
-                                     _scalar_add_, True))
-        if framework._in_eager_mode_ else ('__add__', _binary_creator_(
-            '__add__', 'elementwise_add', False, _scalar_add_)),
+        ('__add__',
+         _binary_creator_('__add__', 'final_state_add', False, _scalar_add_,
+                          True)) if framework._in_eager_mode_ else
+        ('__add__',
+         _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
         ##  a+b == b+a. Do not need to reverse explicitly
-        ('__radd__', _binary_creator_('__radd__', 'final_state_add', False,
-                                      _scalar_add_, True))
-        if framework._in_eager_mode_ else ('__radd__', _binary_creator_(
-            '__radd__', 'elementwise_add', False, _scalar_add_)),
-        ('__sub__', _binary_creator_('__sub__', 'final_state_subtract', False,
-                                     _scalar_sub_, True))
-        if framework._in_eager_mode_ else ('__sub__', _binary_creator_(
-            '__sub__', 'elementwise_sub', False, _scalar_sub_)),
-        ('__rsub__', _binary_creator_('__rsub__', 'final_state_subtract', True,
-                                      _scalar_rsub_, True))
-        if framework._in_eager_mode_ else ('__rsub__', _binary_creator_(
-            '__rsub__', 'elementwise_sub', True, _scalar_rsub_)),
-        ('__mul__', _binary_creator_('__mul__', 'final_state_multiply', False,
-                                     _scalar_mul_, True))
-        if framework._in_eager_mode_ else ('__mul__', _binary_creator_(
-            '__mul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__radd__',
+         _binary_creator_('__radd__', 'final_state_add', False, _scalar_add_,
+                          True)) if framework._in_eager_mode_ else
+        ('__radd__',
+         _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
+        ('__sub__',
+         _binary_creator_('__sub__', 'final_state_subtract', False,
+                          _scalar_sub_, True)) if framework._in_eager_mode_ else
+        ('__sub__',
+         _binary_creator_('__sub__', 'elementwise_sub', False, _scalar_sub_)),
+        ('__rsub__',
+         _binary_creator_('__rsub__', 'final_state_subtract', True,
+                          _scalar_rsub_, True))
+        if framework._in_eager_mode_ else
+        ('__rsub__',
+         _binary_creator_('__rsub__', 'elementwise_sub', True, _scalar_rsub_)),
+        ('__mul__',
+         _binary_creator_('__mul__', 'final_state_multiply', False,
+                          _scalar_mul_, True)) if framework._in_eager_mode_ else
+        ('__mul__',
+         _binary_creator_('__mul__', 'elementwise_mul', False, _scalar_mul_)),
         ## a*b == b*a. Do not need to reverse explicitly
-        ('__rmul__', _binary_creator_('__rmul__', 'final_state_multiply', False,
-                                      _scalar_mul_, True))
-        if framework._in_eager_mode_ else ('__rmul__', _binary_creator_(
-            '__rmul__', 'elementwise_mul', False, _scalar_mul_)),
-        ('__div__', _binary_creator_('__div__', 'final_state_divide', False,
-                                     _scalar_div_, True))
-        if framework._in_eager_mode_ else ('__div__', _binary_creator_(
-            '__div__', 'elementwise_div', False, _scalar_div_)),
-        ('__truediv__', _binary_creator_('__truediv__', 'final_state_divide',
-                                         False, _scalar_div_, True))
-        if framework._in_eager_mode_ else ('__truediv__', _binary_creator_(
-            '__truediv__', 'elementwise_div', False, _scalar_div_)),
-        ('__rdiv__', _binary_creator_('__rdiv__', 'final_state_divide', True,
-                                      None, True)) if framework._in_eager_mode_
-        else ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
-                                           None)),
-        ('__rtruediv__', _binary_creator_('rtruediv__', 'final_state_divide',
-                                          True, None, True))
-        if framework._in_eager_mode_ else ('__rtruediv__', _binary_creator_(
-            'rtruediv__', 'elementwise_div', True, None)),
+        ('__rmul__',
+         _binary_creator_('__rmul__', 'final_state_multiply', False,
+                          _scalar_mul_, True)) if framework._in_eager_mode_ else
+        ('__rmul__',
+         _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__div__',
+         _binary_creator_('__div__', 'final_state_divide', False, _scalar_div_,
+                          True)) if framework._in_eager_mode_ else
+        ('__div__',
+         _binary_creator_('__div__', 'elementwise_div', False, _scalar_div_)),
+        ('__truediv__',
+         _binary_creator_('__truediv__', 'final_state_divide', False,
+                          _scalar_div_, True)) if framework._in_eager_mode_ else
+        ('__truediv__',
+         _binary_creator_('__truediv__', 'elementwise_div', False,
+                          _scalar_div_)),
+        ('__rdiv__',
+         _binary_creator_('__rdiv__', 'final_state_divide', True, None, True))
+        if framework._in_eager_mode_ else
+        ('__rdiv__',
+         _binary_creator_('__rdiv__', 'elementwise_div', True, None)),
+        ('__rtruediv__',
+         _binary_creator_('rtruediv__', 'final_state_divide', True, None, True))
+        if framework._in_eager_mode_ else
+        ('__rtruediv__',
+         _binary_creator_('rtruediv__', 'elementwise_div', True, None)),
         ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
                                      None)),
         ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
                                       None)),
-        ('__floordiv__', _binary_creator_('__floordiv__',
-                                          'elementwise_floordiv', False, None)),
+        ('__floordiv__',
+         _binary_creator_('__floordiv__', 'elementwise_floordiv', False, None)),
         ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False,
                                      None)),
         ('__matmul__', _binary_creator_('__matmul__', "matmul_v2", False,
@@ -377,22 +391,25 @@ def __impl__(self, other_var):
          _binary_creator_('__eq__', 'final_state_equal', False, None, True))
         if framework._in_eager_mode_ else
         ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
-        ('__ne__', _binary_creator_('__ne__', 'final_state_not_equal', False,
-                                    None, True)) if framework._in_eager_mode_
-        else ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
-        ('__lt__', _binary_creator_('__lt__', 'final_state_less_than', False,
-                                    None, True)) if framework._in_eager_mode_
-        else ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
-        ('__le__', _binary_creator_('__le__', 'final_state_less_equal', False,
-                                    None, True)) if framework._in_eager_mode_
-        else ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
-        ('__gt__', _binary_creator_('__gt__', 'final_state_greater_than', False,
-                                    None, True))
+        ('__ne__',
+         _binary_creator_('__ne__', 'final_state_not_equal', False, None, True))
         if framework._in_eager_mode_ else
-        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
-        ('__ge__', _binary_creator_('__ge__', 'final_state_greater_equal',
-                                    False, None, True))
+        ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
+        ('__lt__',
+         _binary_creator_('__lt__', 'final_state_less_than', False, None, True))
         if framework._in_eager_mode_ else
+        ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)),
+        ('__le__',
+         _binary_creator_('__le__', 'final_state_less_equal', False, None,
+                          True)) if framework._in_eager_mode_ else
+        ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
+        ('__gt__',
+         _binary_creator_('__gt__', 'final_state_greater_than', False, None,
+                          True)) if framework._in_eager_mode_ else
+        ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
+        ('__ge__',
+         _binary_creator_('__ge__', 'final_state_greater_equal', False, None,
+                          True)) if framework._in_eager_mode_ else
         ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
         ('__array_ufunc__', None)
     ]
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 4d985097088f8..26bda1a34ef63 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -196,17 +196,17 @@ def __init__(self,
         self._bias_attr = bias_attr
         self._dtype = dtype
 
-        if (self._num_channels == self._groups and
-                num_filters % self._num_channels == 0 and
-                not self._use_cudnn and not self._use_mkldnn):
+        if (self._num_channels == self._groups
+                and num_filters % self._num_channels == 0
+                and not self._use_cudnn and not self._use_mkldnn):
             self._l_type = 'depthwise_conv2d'
         else:
             self._l_type = 'conv2d'
 
         # NPU only supports depthwise_conv2d when  "input_channel = output_channel = groups"
         if core.is_compiled_with_npu():
-            if (self._num_channels == self._groups and
-                    self._num_channels == self._num_filters):
+            if (self._num_channels == self._groups
+                    and self._num_channels == self._num_filters):
                 self._l_type = 'depthwise_conv2d'
             else:
                 self._l_type = 'conv2d'
@@ -233,11 +233,10 @@ def _get_default_param_initializer():
             dtype=self._dtype,
             default_initializer=_get_default_param_initializer())
 
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=[self._num_filters],
+                                          dtype=self._dtype,
+                                          is_bias=True)
 
     def forward(self, input):
         if in_dygraph_mode() and self._l_type == "conv2d":
@@ -252,12 +251,12 @@ def forward(self, input):
             return dygraph_utils._append_activation_in_dygraph(
                 pre_act, self._act, use_mkldnn=self._use_mkldnn)
 
-        if _non_static_mode() and (self._l_type == 'conv2d' or
-                                   self._l_type == 'depthwise_conv2d'):
+        if _non_static_mode() and (self._l_type == 'conv2d'
+                                   or self._l_type == 'depthwise_conv2d'):
             attrs = ('strides', self._stride, 'paddings', self._padding,
-                     'dilations', self._dilation, 'groups', self._groups
-                     if self._groups else 1, 'use_cudnn', self._use_cudnn,
-                     'use_mkldnn', self._use_mkldnn)
+                     'dilations', self._dilation, 'groups',
+                     self._groups if self._groups else 1, 'use_cudnn',
+                     self._use_cudnn, 'use_mkldnn', self._use_mkldnn)
             out = _C_ops.conv2d(input, self.weight, *attrs)
             pre_bias = out
 
@@ -283,25 +282,27 @@ def forward(self, input):
         pre_bias = self._helper.create_variable_for_type_inference(
             dtype=self._dtype)
 
-        self._helper.append_op(
-            type=self._l_type,
-            inputs={
-                'Input': input,
-                'Filter': self.weight,
-            },
-            outputs={"Output": pre_bias},
-            attrs=attrs)
+        self._helper.append_op(type=self._l_type,
+                               inputs={
+                                   'Input': input,
+                                   'Filter': self.weight,
+                               },
+                               outputs={"Output": pre_bias},
+                               attrs=attrs)
 
         if self.bias is not None:
             pre_act = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self.bias]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1,
-                       'use_mkldnn': self._use_mkldnn})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [pre_bias],
+                                       'Y': [self.bias]
+                                   },
+                                   outputs={'Out': [pre_act]},
+                                   attrs={
+                                       'axis': 1,
+                                       'use_mkldnn': self._use_mkldnn
+                                   })
         else:
             pre_act = pre_bias
 
@@ -470,41 +471,41 @@ def _get_default_param_initializer():
             dtype=self._dtype,
             default_initializer=_get_default_param_initializer())
 
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=[self._num_filters],
+                                          dtype=self._dtype,
+                                          is_bias=True)
 
     def forward(self, input):
         pre_bias = self._helper.create_variable_for_type_inference(
             dtype=self._dtype)
 
-        self._helper.append_op(
-            type='conv3d',
-            inputs={
-                'Input': input,
-                'Filter': self.weight,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups if self._groups else 1,
-                'use_cudnn': self._use_cudnn,
-                'use_mkldnn': False
-            })
+        self._helper.append_op(type='conv3d',
+                               inputs={
+                                   'Input': input,
+                                   'Filter': self.weight,
+                               },
+                               outputs={"Output": pre_bias},
+                               attrs={
+                                   'strides': self._stride,
+                                   'paddings': self._padding,
+                                   'dilations': self._dilation,
+                                   'groups':
+                                   self._groups if self._groups else 1,
+                                   'use_cudnn': self._use_cudnn,
+                                   'use_mkldnn': False
+                               })
 
         if self.bias is not None:
             pre_act = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self.bias]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [pre_bias],
+                                       'Y': [self.bias]
+                                   },
+                                   outputs={'Out': [pre_act]},
+                                   attrs={'axis': 1})
         else:
             pre_act = pre_bias
 
@@ -688,39 +689,42 @@ def __init__(self,
 
         filter_shape = [self._num_channels, self._num_filters // self._groups
                         ] + self._filter_size
-        self.weight = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+        self.weight = self.create_parameter(dtype=self._dtype,
+                                            shape=filter_shape,
+                                            attr=self._param_attr)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=[self._num_filters],
+                                          dtype=self._dtype,
+                                          is_bias=True)
 
     def forward(self, input):
         pre_bias = self._helper.create_variable_for_type_inference(
             dtype=self._dtype)
-        self._helper.append_op(
-            type="conv3d_transpose",
-            inputs={'Input': [input],
-                    'Filter': [self.weight]},
-            outputs={'Output': pre_bias},
-            attrs={
-                'strides': self._stride,
-                'paddings': self._padding,
-                'dilations': self._dilation,
-                'groups': self._groups if self._groups else 1,
-                'use_cudnn': self._use_cudnn
-            })
+        self._helper.append_op(type="conv3d_transpose",
+                               inputs={
+                                   'Input': [input],
+                                   'Filter': [self.weight]
+                               },
+                               outputs={'Output': pre_bias},
+                               attrs={
+                                   'strides': self._stride,
+                                   'paddings': self._padding,
+                                   'dilations': self._dilation,
+                                   'groups':
+                                   self._groups if self._groups else 1,
+                                   'use_cudnn': self._use_cudnn
+                               })
 
         if self._bias_attr:
             pre_act = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self.bias]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [pre_bias],
+                                       'Y': [self.bias]
+                                   },
+                                   outputs={'Out': [pre_act]},
+                                   attrs={'axis': 1})
         else:
             pre_act = pre_bias
 
@@ -910,11 +914,10 @@ def forward(self, input):
 
         pool_out = self._helper.create_variable_for_type_inference(self._dtype)
 
-        self._helper.append_op(
-            type=self._l_type,
-            inputs={"X": input},
-            outputs={"Out": pool_out},
-            attrs=attrs)
+        self._helper.append_op(type=self._l_type,
+                               inputs={"X": input},
+                               outputs={"Out": pool_out},
+                               attrs=attrs)
         return pool_out
 
 
@@ -980,13 +983,14 @@ def __init__(self,
         super(Linear, self).__init__()
         self._act = act
         self._dtype = dtype
-        self.weight = self.create_parameter(
-            shape=[input_dim, output_dim],
-            attr=param_attr,
-            dtype=dtype,
-            is_bias=False)
-        self.bias = self.create_parameter(
-            shape=[output_dim], attr=bias_attr, dtype=dtype, is_bias=True)
+        self.weight = self.create_parameter(shape=[input_dim, output_dim],
+                                            attr=param_attr,
+                                            dtype=dtype,
+                                            is_bias=False)
+        self.bias = self.create_parameter(shape=[output_dim],
+                                          attr=bias_attr,
+                                          dtype=dtype,
+                                          is_bias=True)
 
         self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]
 
@@ -1017,20 +1021,23 @@ def forward(self, input):
         inputs = {"X": [input], "Y": [self.weight]}
 
         tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="matmul", inputs=inputs, outputs={"Out": tmp}, attrs=attrs)
+        self._helper.append_op(type="matmul",
+                               inputs=inputs,
+                               outputs={"Out": tmp},
+                               attrs=attrs)
         if self.bias is not None:
             pre_activation = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [tmp],
-                        'Y': [self.bias]},
-                outputs={'Out': [pre_activation]},
-                attrs={
-                    'axis': len(input.shape) - 1,
-                    'use_mkldnn': self._use_mkldnn
-                })
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [tmp],
+                                       'Y': [self.bias]
+                                   },
+                                   outputs={'Out': [pre_activation]},
+                                   attrs={
+                                       'axis': len(input.shape) - 1,
+                                       'use_mkldnn': self._use_mkldnn
+                                   })
         else:
             pre_activation = tmp
         return self._helper.append_activation(pre_activation, act=self._act)
@@ -1126,18 +1133,21 @@ def __init__(self,
                 dtype=self._dtype,
                 default_initializer=Constant(1.0),
                 is_bias=False)
-            self.bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[num_channels],
-                dtype=self._dtype,
-                default_initializer=Constant(0.0),
-                is_bias=True)
+            self.bias = self.create_parameter(attr=self._bias_attr,
+                                              shape=[num_channels],
+                                              dtype=self._dtype,
+                                              default_initializer=Constant(0.0),
+                                              is_bias=True)
         else:
             self.scale = None
             self.bias = None
 
     def forward(self, input):
-        if _non_static_mode():
+        if in_dygraph_mode():
+            out, _, _, = _C_ops.final_state_instance_norm(
+                input, self.scale, self.bias, self._epsilon)
+            return out
+        if _in_legacy_dygraph():
             out, _, _ = _C_ops.instance_norm(input, self.scale, self.bias,
                                              'epsilon', self._epsilon)
             return out
@@ -1165,8 +1175,10 @@ def forward(self, input):
             "SavedVariance": [saved_variance]
         }
 
-        self._helper.append_op(
-            type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+        self._helper.append_op(type="instance_norm",
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=attrs)
         return instance_norm_out
 
 
@@ -1300,38 +1312,34 @@ def __init__(self,
         param_shape = [num_channels]
 
         # create parameter
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            default_initializer=Constant(1.0))
+        self.weight = self.create_parameter(attr=self._param_attr,
+                                            shape=param_shape,
+                                            dtype=self._dtype,
+                                            default_initializer=Constant(1.0))
         self.weight.stop_gradient = use_global_stats and self._param_attr.learning_rate == 0.
 
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=param_shape,
+                                          dtype=self._dtype,
+                                          is_bias=True)
         self.bias.stop_gradient = use_global_stats and self._param_attr.learning_rate == 0.
 
-        self._mean = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_mean_name,
-                initializer=Constant(0.0),
-                trainable=False,
-                do_model_average=do_model_average_for_mean_and_var),
-            shape=param_shape,
-            dtype=self._dtype)
+        self._mean = self.create_parameter(attr=ParamAttr(
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
+                                           shape=param_shape,
+                                           dtype=self._dtype)
         self._mean.stop_gradient = True
 
-        self._variance = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_variance_name,
-                initializer=Constant(1.0),
-                trainable=False,
-                do_model_average=do_model_average_for_mean_and_var),
-            shape=param_shape,
-            dtype=self._dtype)
+        self._variance = self.create_parameter(attr=ParamAttr(
+            name=moving_variance_name,
+            initializer=Constant(1.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
+                                               shape=param_shape,
+                                               dtype=self._dtype)
         self._variance.stop_gradient = True
 
         self._in_place = in_place
@@ -1416,8 +1424,10 @@ def forward(self, input):
         if reserve_space is not None:
             outputs["ReserveSpace"] = [reserve_space]
 
-        self._helper.append_op(
-            type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+        self._helper.append_op(type="batch_norm",
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=attrs)
 
         # Currently, we don't support inplace in dygraph mode
         return self._helper.append_activation(batch_norm_out, self._act)
@@ -1510,8 +1520,8 @@ def forward(self, input):
             self._seed = prog.random_seed
         attrs = {
             'dropout_prob': self._dropout_prob,
-            'is_test': not self.training
-            if _non_static_mode() else self._is_test,
+            'is_test':
+            not self.training if _non_static_mode() else self._is_test,
             'fix_seed': self._seed is not None,
             'seed': self._seed if self._seed is not None else 0,
             'dropout_implementation': self._dropout_implementation,
@@ -1526,12 +1536,13 @@ def forward(self, input):
         mask = self._helper.create_variable_for_type_inference(
             dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
 
-        self._helper.append_op(
-            type='dropout',
-            inputs={'X': [input]},
-            outputs={'Out': [out],
-                     'Mask': [mask]},
-            attrs=attrs)
+        self._helper.append_op(type='dropout',
+                               inputs={'X': [input]},
+                               outputs={
+                                   'Out': [out],
+                                   'Mask': [mask]
+                               },
+                               attrs=attrs)
         return out
 
 
@@ -1663,18 +1674,19 @@ def __init__(self,
         if self._remote_prefetch:
             assert self._is_sparse is True and self._is_distributed is False
 
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=self._size,
-            dtype=self._dtype,
-            is_bias=False)
+        self.weight = self.create_parameter(attr=self._param_attr,
+                                            shape=self._size,
+                                            dtype=self._dtype,
+                                            is_bias=False)
 
     def forward(self, input):
         if _non_static_mode():
-            return _C_ops.lookup_table_v2(
-                self.weight, input, 'is_sparse', self._is_sparse,
-                'is_distributed', self._is_distributed, 'remote_prefetch',
-                self._remote_prefetch, 'padding_idx', self._padding_idx)
+            return _C_ops.lookup_table_v2(self.weight, input, 'is_sparse',
+                                          self._is_sparse, 'is_distributed',
+                                          self._is_distributed,
+                                          'remote_prefetch',
+                                          self._remote_prefetch, 'padding_idx',
+                                          self._padding_idx)
 
         check_variable_and_dtype(input, 'input',
                                  ['uint8', 'int8', 'int16', 'int32', 'int64'],
@@ -1687,12 +1699,13 @@ def forward(self, input):
         }
 
         out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type='lookup_table_v2',
-            inputs={'Ids': input,
-                    'W': self.weight},
-            outputs={'Out': out},
-            attrs=attrs)
+        self._helper.append_op(type='lookup_table_v2',
+                               inputs={
+                                   'Ids': input,
+                                   'W': self.weight
+                               },
+                               outputs={'Out': out},
+                               attrs=attrs)
 
         return out
 
@@ -1803,11 +1816,10 @@ def __init__(self,
 
         if self._shift:
             assert self._bias_attr is not False
-            self.bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=param_shape,
-                dtype=self._dtype,
-                is_bias=True)
+            self.bias = self.create_parameter(attr=self._bias_attr,
+                                              shape=param_shape,
+                                              dtype=self._dtype,
+                                              is_bias=True)
         else:
             if self._bias_attr:
                 logging.warn("bias_attr are only available with shift is True")
@@ -1821,10 +1833,11 @@ def forward(self, input):
         if input_ndim < normalized_ndim or input_shape[
                 self._begin_norm_axis:] != self._normalized_shape:
             str_normalized_shape = str(self._normalized_shape)
-            raise ValueError(
-                'Given normalized_shape is ' + str_normalized_shape +
-                ', expected input with shape [*, ' + str_normalized_shape[
-                    1:] + ', but got input shape ' + str(input_shape))
+            raise ValueError('Given normalized_shape is ' +
+                             str_normalized_shape +
+                             ', expected input with shape [*, ' +
+                             str_normalized_shape[1:] +
+                             ', but got input shape ' + str(input_shape))
 
         if _non_static_mode():
             if in_dygraph_mode():
@@ -1834,9 +1847,10 @@ def forward(self, input):
                 return dygraph_utils._append_activation_in_dygraph(
                     pre_act, act=self._act)
             else:
-                pre_act, _, _ = _C_ops.layer_norm(
-                    input, self.weight, self.bias, 'epsilon', self._epsilon,
-                    'begin_norm_axis', self._begin_norm_axis)
+                pre_act, _, _ = _C_ops.layer_norm(input, self.weight, self.bias,
+                                                  'epsilon', self._epsilon,
+                                                  'begin_norm_axis',
+                                                  self._begin_norm_axis)
                 return dygraph_utils._append_activation_in_dygraph(
                     pre_act, act=self._act)
 
@@ -1862,18 +1876,17 @@ def forward(self, input):
         layer_norm_out = self._helper.create_variable_for_type_inference(
             self._dtype)
 
-        self._helper.append_op(
-            type="layer_norm",
-            inputs=inputs,
-            outputs={
-                "Y": layer_norm_out,
-                "Mean": mean_out,
-                "Variance": variance_out,
-            },
-            attrs={
-                "epsilon": self._epsilon,
-                "begin_norm_axis": self._begin_norm_axis
-            })
+        self._helper.append_op(type="layer_norm",
+                               inputs=inputs,
+                               outputs={
+                                   "Y": layer_norm_out,
+                                   "Mean": mean_out,
+                                   "Variance": variance_out,
+                               },
+                               attrs={
+                                   "epsilon": self._epsilon,
+                                   "begin_norm_axis": self._begin_norm_axis
+                               })
 
         return self._helper.append_activation(layer_norm_out, act=self._act)
 
@@ -2001,21 +2014,25 @@ def __init__(self,
             identity=0,
             sigmoid=1,
             tanh=2,
-            relu=3, )
+            relu=3,
+        )
         self.activation = activation_dict[activation]
         self.gate_activation = activation_dict[gate_activation]
 
         self._dtype = dtype
         size = size // 3
         # create weight
-        self.weight = self.create_parameter(
-            attr=param_attr, shape=[size, 3 * size], dtype=dtype)
+        self.weight = self.create_parameter(attr=param_attr,
+                                            shape=[size, 3 * size],
+                                            dtype=dtype)
 
         # create bias
         bias_size = [1, 3 * size]
         self._bias_size = bias_size
-        self.bias = self.create_parameter(
-            attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+        self.bias = self.create_parameter(attr=bias_attr,
+                                          shape=bias_size,
+                                          dtype=dtype,
+                                          is_bias=True)
 
     def forward(self, input, hidden):
         if _non_static_mode():
@@ -2040,18 +2057,17 @@ def forward(self, input, hidden):
             self._dtype)
         updated_hidden = self._helper.create_variable_for_type_inference(
             self._dtype)
-        self._helper.append_op(
-            type='gru_unit',
-            inputs=inputs,
-            outputs={
-                'Gate': gate,
-                'ResetHiddenPrev': reset_hidden_pre,
-                'Hidden': updated_hidden,
-            },
-            attrs={
-                'activation': self.activation,
-                'gate_activation': self.gate_activation,
-            })
+        self._helper.append_op(type='gru_unit',
+                               inputs=inputs,
+                               outputs={
+                                   'Gate': gate,
+                                   'ResetHiddenPrev': reset_hidden_pre,
+                                   'Hidden': updated_hidden,
+                               },
+                               attrs={
+                                   'activation': self.activation,
+                                   'gate_activation': self.gate_activation,
+                               })
 
         return updated_hidden, reset_hidden_pre, gate
 
@@ -2270,11 +2286,12 @@ def forward(self, input, label, sample_weight=None):
                      self._attrs['seed'], 'sampler', self._attrs['sampler'],
                      'is_sparse', self._attrs['is_sparse'], 'remote_prefetch',
                      self._attrs['remote_prefetch'])
-            cost, _, _ = _C_ops.nce(
-                input, label, self.weight, self.bias,
-                self._inputs['SampleWeight'], self._inputs['CustomDistProbs'],
-                self._inputs['CustomDistAlias'],
-                self._inputs['CustomDistAliasProbs'], *attrs)
+            cost, _, _ = _C_ops.nce(input, label, self.weight, self.bias,
+                                    self._inputs['SampleWeight'],
+                                    self._inputs['CustomDistProbs'],
+                                    self._inputs['CustomDistAlias'],
+                                    self._inputs['CustomDistAliasProbs'],
+                                    *attrs)
             return cost / (self._num_neg_samples + 1)
 
         check_variable_and_dtype(input, "input", ['float32', 'float64'], "NCE")
@@ -2295,15 +2312,14 @@ def forward(self, input, label, sample_weight=None):
         sample_labels = self._helper.create_variable_for_type_inference(
             dtype=label.dtype)
 
-        self._helper.append_op(
-            type='nce',
-            inputs=self._inputs,
-            outputs={
-                'Cost': cost,
-                'SampleLogits': sample_logits,
-                'SampleLabels': sample_labels
-            },
-            attrs=self._attrs)
+        self._helper.append_op(type='nce',
+                               inputs=self._inputs,
+                               outputs={
+                                   'Cost': cost,
+                                   'SampleLogits': sample_logits,
+                                   'SampleLabels': sample_labels
+                               },
+                               attrs=self._attrs)
         return cost / (self._num_neg_samples + 1)
 
 
@@ -2385,33 +2401,34 @@ def __init__(self,
                 channel,
                 int), "channel argument is required when mode is 'channel'."
             #NOTE(zhiqiu): The _alpha_shape should be [1, channel] + [1] * len(input_shape[2:]), not [1, channel, 1, 1].
-            # However, the suffix 1 in the list is useless, since the tensor is viewed as one demension array during kernel calculation. 
+            # However, the suffix 1 in the list is useless, since the tensor is viewed as a one-dimensional array during kernel calculation.
             # And, input_shape is not required when mode is 'channel', so it is simplified.
             #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
             self._alpha_shape = [1, channel, 1, 1]
         elif mode == 'element':
-            assert isinstance(input_shape, (
-                list, tuple
-            )), "input_shape argument is required when mode is 'element'."
+            assert isinstance(
+                input_shape,
+                (list, tuple
+                 )), "input_shape argument is required when mode is 'element'."
             self._alpha_shape = [1] + list(input_shape)[1:]
         else:
             raise ValueError('mode should be one of all, channel, element.')
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=self._alpha_shape,
-            dtype='float32',
-            is_bias=False,
-            default_initializer=Constant(1.0))
+        self.weight = self.create_parameter(attr=self._param_attr,
+                                            shape=self._alpha_shape,
+                                            dtype='float32',
+                                            is_bias=False,
+                                            default_initializer=Constant(1.0))
 
     def forward(self, input):
         check_variable_and_dtype(input, 'input', ['float32'], 'PRelu')
         out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="prelu",
-            inputs={"X": input,
-                    'Alpha': self.weight},
-            attrs={"mode": self._mode},
-            outputs={"Out": out})
+        self._helper.append_op(type="prelu",
+                               inputs={
+                                   "X": input,
+                                   'Alpha': self.weight
+                               },
+                               attrs={"mode": self._mode},
+                               outputs={"Out": out})
         return out
 
 
@@ -2491,22 +2508,19 @@ def __init__(self,
         self._dtype = dtype
 
         param_shape = [self._output_dim, self._input1_dim, self._input2_dim]
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=False)
+        self.weight = self.create_parameter(attr=self._param_attr,
+                                            shape=param_shape,
+                                            dtype=self._dtype,
+                                            is_bias=False)
         bias_size = [1, self._output_dim]
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=bias_size,
-            dtype=self._dtype,
-            is_bias=True)
-
-    @deprecated(
-        since="2.0.0",
-        update_to="paddle.nn.Bilinear",
-        reason="New name and new args in Bilinear, easier to use.")
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=bias_size,
+                                          dtype=self._dtype,
+                                          is_bias=True)
+
+    @deprecated(since="2.0.0",
+                update_to="paddle.nn.Bilinear",
+                reason="New name and new args in Bilinear, easier to use.")
     def forward(self, x, y):
         check_variable_and_dtype(x, 'x', ['float32', 'float64'],
                                  'BilinearTensorProduct')
@@ -2516,17 +2530,16 @@ def forward(self, x, y):
         if self.bias is not None:
             self._inputs["Bias"] = self.bias
         if self._name is not None:
-            out = self._helper.create_variable(
-                name=".".join([self.full_name(), self._name]),
-                dtype=self._dtype,
-                persistable=False)
+            out = self._helper.create_variable(name=".".join(
+                [self.full_name(), self._name]),
+                                               dtype=self._dtype,
+                                               persistable=False)
         else:
-            out = self._helper.create_variable(
-                dtype=self._dtype, persistable=False)
-        self._helper.append_op(
-            type="bilinear_tensor_product",
-            inputs=self._inputs,
-            outputs={"Out": out})
+            out = self._helper.create_variable(dtype=self._dtype,
+                                               persistable=False)
+        self._helper.append_op(type="bilinear_tensor_product",
+                               inputs=self._inputs,
+                               outputs={"Out": out})
 
         # add activation
         return self._helper.append_activation(out, act=self._act)
@@ -2680,9 +2693,9 @@ def __init__(self,
         self._output_size = output_size
         self._dtype = dtype
 
-        if (self._num_channels == self._groups and
-                self._num_filters == self._num_channels and
-                not self._use_cudnn):
+        if (self._num_channels == self._groups
+                and self._num_filters == self._num_channels
+                and not self._use_cudnn):
             self._op_type = 'depthwise_conv2d_transpose'
         else:
             self._op_type = 'conv2d_transpose'
@@ -2707,14 +2720,14 @@ def __init__(self,
         filter_shape = [self._num_channels, self._num_filters // self._groups
                         ] + self._filter_size
 
-        self.weight = self.create_parameter(
-            dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
+        self.weight = self.create_parameter(dtype=self._dtype,
+                                            shape=filter_shape,
+                                            attr=self._param_attr)
 
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=[self._num_filters],
+                                          dtype=self._dtype,
+                                          is_bias=True)
 
     def forward(self, input):
         if _non_static_mode():
@@ -2724,10 +2737,10 @@ def forward(self, input):
                      'dilations', self._dilation, 'groups', self._groups,
                      'use_cudnn', self._use_cudnn)
             pre_bias = out
-            pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, self.bias,
-                                                            1)
-            return dygraph_utils._append_activation_in_dygraph(
-                pre_act, act=self._act)
+            pre_act = dygraph_utils._append_bias_in_dygraph(
+                pre_bias, self.bias, 1)
+            return dygraph_utils._append_activation_in_dygraph(pre_act,
+                                                               act=self._act)
 
         check_variable_and_dtype(input, 'input',
                                  ['float16', 'float32', 'float64'],
@@ -2745,21 +2758,21 @@ def forward(self, input):
 
         pre_bias = self._helper.create_variable_for_type_inference(
             dtype=input.dtype)
-        self._helper.append_op(
-            type=self._op_type,
-            inputs=inputs,
-            outputs={'Output': pre_bias},
-            attrs=attrs)
+        self._helper.append_op(type=self._op_type,
+                               inputs=inputs,
+                               outputs={'Output': pre_bias},
+                               attrs=attrs)
 
         if self.bias is not None:
             pre_act = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self.bias]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [pre_bias],
+                                       'Y': [self.bias]
+                                   },
+                                   outputs={'Out': [pre_act]},
+                                   attrs={'axis': 1})
         else:
             pre_act = pre_bias
 
@@ -2822,39 +2835,39 @@ def __init__(self,
     def _build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
         filter_shape = [self._filter_size * input.shape[1], self._num_filters]
-        self.weight = self.create_parameter(
-            attr=self._param_attr, shape=filter_shape, dtype=self._dtype)
+        self.weight = self.create_parameter(attr=self._param_attr,
+                                            shape=filter_shape,
+                                            dtype=self._dtype)
 
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=[self._num_filters],
+                                          dtype=self._dtype,
+                                          is_bias=True)
 
     def forward(self, input):
         pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type='sequence_conv',
-            inputs={
-                'X': [input],
-                'Filter': [self.weight],
-            },
-            outputs={"Out": pre_bias},
-            attrs={
-                'contextStride': self._filter_stride,
-                'contextStart': -int(self._filter_size // 2),
-                'contextLength': self._filter_size
-            })
+        self._helper.append_op(type='sequence_conv',
+                               inputs={
+                                   'X': [input],
+                                   'Filter': [self.weight],
+                               },
+                               outputs={"Out": pre_bias},
+                               attrs={
+                                   'contextStride': self._filter_stride,
+                                   'contextStart': -int(self._filter_size // 2),
+                                   'contextLength': self._filter_size
+                               })
 
         if self.bias is not None:
             pre_act = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self.bias]},
-                outputs={'Out': [pre_act]},
-                attrs={'axis': 1})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [pre_bias],
+                                       'Y': [self.bias]
+                                   },
+                                   outputs={'Out': [pre_act]},
+                                   attrs={'axis': 1})
         else:
             pre_act = pre_bias
 
@@ -2924,19 +2937,19 @@ def __init__(self,
     def _build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
         filter_shape = [self._future_context_size + 1, input.shape[1]]
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            is_bias=False)
+        self.weight = self.create_parameter(attr=self._param_attr,
+                                            shape=filter_shape,
+                                            dtype=self._dtype,
+                                            is_bias=False)
 
     def forward(self, input):
         out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type='row_conv',
-            inputs={'X': [input],
-                    'Filter': [self.weight]},
-            outputs={'Out': [out]})
+        self._helper.append_op(type='row_conv',
+                               inputs={
+                                   'X': [input],
+                                   'Filter': [self.weight]
+                               },
+                               outputs={'Out': [out]})
         return self._helper.append_activation(out, act=self._act)
 
 
@@ -3003,17 +3016,15 @@ def __init__(self,
 
         param_shape = [self._channels]
 
-        self.weight = self.create_parameter(
-            attr=self._param_attr or False,
-            shape=param_shape,
-            dtype=self._dtype,
-            default_initializer=Constant(1.0))
+        self.weight = self.create_parameter(attr=self._param_attr or False,
+                                            shape=param_shape,
+                                            dtype=self._dtype,
+                                            default_initializer=Constant(1.0))
 
-        self.bias = self.create_parameter(
-            attr=self._bias_attr or False,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr or False,
+                                          shape=param_shape,
+                                          dtype=self._dtype,
+                                          is_bias=True)
 
     def forward(self, input):
         mean_out = self._helper.create_variable_for_type_inference(
@@ -3038,16 +3049,17 @@ def forward(self, input):
             group_norm_out = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
 
-            self._helper.append_op(
-                type="group_norm",
-                inputs=inputs,
-                outputs={
-                    "Y": group_norm_out,
-                    "Mean": mean_out,
-                    "Variance": variance_out,
-                },
-                attrs={"epsilon": self._epsilon,
-                       "groups": self._groups})
+            self._helper.append_op(type="group_norm",
+                                   inputs=inputs,
+                                   outputs={
+                                       "Y": group_norm_out,
+                                       "Mean": mean_out,
+                                       "Variance": variance_out,
+                                   },
+                                   attrs={
+                                       "epsilon": self._epsilon,
+                                       "groups": self._groups
+                                   })
 
             return self._helper.append_activation(group_norm_out, self._act)
 
@@ -3133,18 +3145,18 @@ def __init__(self,
         h = self._weight_shape[self._dim]
         w = np.prod(self._weight_shape) // h
 
-        self.weight_u = self.create_parameter(
-            attr=ParamAttr(),
-            shape=[h],
-            dtype=self._dtype,
-            default_initializer=Normal(0., 1.))
+        self.weight_u = self.create_parameter(attr=ParamAttr(),
+                                              shape=[h],
+                                              dtype=self._dtype,
+                                              default_initializer=Normal(
+                                                  0., 1.))
         self.weight_u.stop_gradient = True
 
-        self.weight_v = self.create_parameter(
-            attr=ParamAttr(),
-            shape=[w],
-            dtype=self._dtype,
-            default_initializer=Normal(0., 1.))
+        self.weight_v = self.create_parameter(attr=ParamAttr(),
+                                              shape=[w],
+                                              dtype=self._dtype,
+                                              default_initializer=Normal(
+                                                  0., 1.))
         self.weight_v.stop_gradient = True
 
     def forward(self, weight):
@@ -3152,15 +3164,16 @@ def forward(self, weight):
                                  'SpectralNorm')
         inputs = {'Weight': weight, 'U': self.weight_u, 'V': self.weight_v}
         out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="spectral_norm",
-            inputs=inputs,
-            outputs={"Out": out, },
-            attrs={
-                "dim": self._dim,
-                "power_iters": self._power_iters,
-                "eps": self._eps,
-            })
+        self._helper.append_op(type="spectral_norm",
+                               inputs=inputs,
+                               outputs={
+                                   "Out": out,
+                               },
+                               attrs={
+                                   "dim": self._dim,
+                                   "power_iters": self._power_iters,
+                                   "eps": self._eps,
+                               })
 
         return out
 
@@ -3232,44 +3245,45 @@ def __init__(self,
         self._dtype = dtype
         w_shape = [self._feature_size, 3, self._output_size, self._num_filters]
         if self._bias_attr:
-            self.bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[self._num_filters],
-                dtype=self._dtype,
-                is_bias=True)
-        self.weight = self.create_parameter(
-            attr=self._param_attr,
-            shape=w_shape,
-            dtype=self._dtype,
-            is_bias=False)
+            self.bias = self.create_parameter(attr=self._bias_attr,
+                                              shape=[self._num_filters],
+                                              dtype=self._dtype,
+                                              is_bias=True)
+        self.weight = self.create_parameter(attr=self._param_attr,
+                                            shape=w_shape,
+                                            dtype=self._dtype,
+                                            is_bias=False)
 
     def forward(self, nodes_vector, edge_set):
         check_type(nodes_vector, 'nodes_vector', (Variable), 'TreeConv')
         check_type(edge_set, 'edge_set', (Variable), 'TreeConv')
         if self._name:
-            out = self.create_variable(
-                name=self._name, dtype=self._dtype, persistable=False)
+            out = self.create_variable(name=self._name,
+                                       dtype=self._dtype,
+                                       persistable=False)
         else:
             out = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-        self._helper.append_op(
-            type='tree_conv',
-            inputs={
-                'NodesVector': nodes_vector,
-                'EdgeSet': edge_set,
-                'Filter': self.weight
-            },
-            outputs={'Out': out, },
-            attrs={'max_depth': self._max_depth})
+        self._helper.append_op(type='tree_conv',
+                               inputs={
+                                   'NodesVector': nodes_vector,
+                                   'EdgeSet': edge_set,
+                                   'Filter': self.weight
+                               },
+                               outputs={
+                                   'Out': out,
+                               },
+                               attrs={'max_depth': self._max_depth})
         if self._bias_attr:
             pre_activation = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [out],
-                        'Y': [self.bias]},
-                outputs={'Out': [pre_activation]},
-                attrs={'axis': 1})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [out],
+                                       'Y': [self.bias]
+                                   },
+                                   outputs={'Out': [pre_activation]},
+                                   attrs={'axis': 1})
         else:
             pre_activation = out
         return self._helper.append_activation(pre_activation, act=self._act)
@@ -3308,6 +3322,7 @@ def __init__(self, start_axis=1, stop_axis=-1):
         self.stop_axis = stop_axis
 
     def forward(self, input):
-        out = paddle.tensor.manipulation.flatten(
-            input, start_axis=self.start_axis, stop_axis=self.stop_axis)
+        out = paddle.tensor.manipulation.flatten(input,
+                                                 start_axis=self.start_axis,
+                                                 stop_axis=self.stop_axis)
         return out
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index fe1b56931f89d..09036ed942d10 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -287,8 +287,7 @@ def _coalesce_tensors(var_groups):
         for g_var in grad_vars:
             g_var_shapes.append(g_var.shape)
             flattened_vars.append(
-                nn.reshape(
-                    x=g_var, shape=[np.prod(g_var.shape)]))
+                nn.reshape(x=g_var, shape=[np.prod(g_var.shape)]))
         coalesced_grad = nn.concat(flattened_vars)
         coalesced_grads_and_grad_vars.append(
             [coalesced_grad, grad_vars, g_var_shapes])
@@ -298,12 +297,13 @@ def _coalesce_tensors(var_groups):
 @framework.dygraph_only
 def _reshape_inplace(x, shape):
     x_shape = framework._varbase_creator(dtype=x.dtype)
-    framework._dygraph_tracer().trace_op(
-        type="reshape2",
-        inputs={'X': x},
-        outputs={'Out': x,
-                 'XShape': x_shape},
-        attrs={'shape': shape})
+    framework._dygraph_tracer().trace_op(type="reshape2",
+                                         inputs={'X': x},
+                                         outputs={
+                                             'Out': x,
+                                             'XShape': x_shape
+                                         },
+                                         attrs={'shape': shape})
 
 
 @framework.dygraph_only
@@ -315,8 +315,10 @@ def _split_tensors(coalesced_grads_and_grad_vars):
                 type='split',
                 inputs={'X': coalesced_grad},
                 outputs={'Out': origin_grad_vars},
-                attrs={'sections': grad_var_len,
-                       'axis': 0})
+                attrs={
+                    'sections': grad_var_len,
+                    'axis': 0
+                })
             for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
                 _reshape_inplace(x=g_var, shape=g_shape)
                 assert g_var.shape == g_shape
@@ -382,7 +384,7 @@ def sync_params_buffers(model,
             if is_model_parallel and param.is_distributed:
                 continue
 
-            # NOTE(shenliang03): Support situations that do not require synchronization parameters, 
+            # NOTE(shenliang03): Support situations that do not require synchronization parameters,
             # such as moe's expert parameters
             if getattr(param, "no_sync", False):
                 continue
@@ -397,8 +399,10 @@ def sync_params_buffers(model,
     coalesced_vars = build_groups(model_vars, 128 * 1024 * 1024)
 
     for coalesced_var, _, _ in coalesced_vars:
-        paddle.distributed.broadcast(
-            coalesced_var, src=src_rank, group=comm_group, use_calc_stream=True)
+        paddle.distributed.broadcast(coalesced_var,
+                                     src=src_rank,
+                                     group=comm_group,
+                                     use_calc_stream=True)
 
     for coalesced_var, origin_vars, var_shapes in coalesced_vars:
         var_len = [np.prod(v_shape) for v_shape in var_shapes]
@@ -406,8 +410,10 @@ def sync_params_buffers(model,
             type='split',
             inputs={'X': coalesced_var},
             outputs={'Out': origin_vars},
-            attrs={'sections': var_len,
-                   'axis': 0})
+            attrs={
+                'sections': var_len,
+                'axis': 0
+            })
 
 
 class DataParallel(layers.Layer):
@@ -591,8 +597,8 @@ def __init__(self,
         self.var_dtype = core.eager.Tensor if in_dygraph_mode(
         ) else core.VarBase
 
-        # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. 
-        # It just stores some environment variables, which can be constructed by 
+        # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy.
+        # It just stores some environment variables, which can be constructed by
         # ParallelEnv. Here it is set as an optional argument.
         # This parameter is not removed because of compatibility with 1.x writing.
         if strategy is not None:
@@ -614,15 +620,15 @@ def __init__(self,
                     "ProcessGroup must be an instance of Group in DataParallel."
 
             # sync buffer and params
-            # TODO(liuyuhui) Currently not support xpu. xpu is 
+            # TODO(liuyuhui) Currently not support xpu. xpu is
             # still broadcasting parameters when calling layer
             if not paddle.is_compiled_with_xpu():
                 sync_params_buffers(self._layers)
 
             self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024)
-            # NOTE(shenliang03): We can set environment variables to control 
-            # the size of the group, Default: 1MB. The role of this small group is: 
-            # when the last group allreduce, the overlap cannot work. Making the 
+            # NOTE(shenliang03): We can set environment variables to control
+            # the size of the group, Default: 1MB. The role of this small group is:
+            # when the last group allreduce, the overlap cannot work. Making the
             # the last group small is useful to improve performance.
             self.last_comm_buffer_size = int(last_comm_buffer_size * 1024 *
                                              1024)
@@ -660,7 +666,7 @@ def init_reducer(self):
         def check_layer_sparse(sublayer):
             if isinstance(sublayer, paddle.nn.layer.common.Embedding):
                 return sublayer._sparse
-            # NOTE(shenliang03):This is for compatibility. If paddle.fluid.dygraph.Embedding 
+            # NOTE(shenliang03):This is for compatibility. If paddle.fluid.dygraph.Embedding
             # is removed in the future, the check will also be removed here.
             if isinstance(sublayer, paddle.fluid.dygraph.Embedding):
                 return sublayer._is_sparse
@@ -676,9 +682,8 @@ def check_layer_sparse(sublayer):
                 [self.last_comm_buffer_size, self.comm_buffer_size])
 
             self._reducer = core.EagerReducer(
-                trainable_parameters,
-                list(reversed(self.group_indices)), is_sparse_gradient,
-                self.group.process_group,
+                trainable_parameters, list(reversed(self.group_indices)),
+                is_sparse_gradient, self.group.process_group,
                 [self.last_comm_buffer_size, self.comm_buffer_size],
                 self.find_unused_parameters)
         elif _in_legacy_dygraph():
@@ -687,9 +692,8 @@ def check_layer_sparse(sublayer):
                 [self.last_comm_buffer_size, self.comm_buffer_size])
 
             self._reducer = core.Reducer(
-                trainable_parameters,
-                list(reversed(self.group_indices)), is_sparse_gradient,
-                parallel_helper.__parallel_ctx__clz__,
+                trainable_parameters, list(reversed(self.group_indices)),
+                is_sparse_gradient, parallel_helper.__parallel_ctx__clz__,
                 [self.last_comm_buffer_size, self.comm_buffer_size],
                 self.find_unused_parameters)
 
@@ -752,12 +756,12 @@ def forward(self, *inputs, **kwargs):
         outputs = self._layers(*inputs, **kwargs)
         if self._strategy.nranks > 1 and framework._dygraph_tracer(
         )._has_grad and self.grad_need_sync:
-            self._reducer.prepare_for_backward(
-                list(self._find_varbase(outputs)))
+            self._reducer.prepare_for_backward(list(
+                self._find_varbase(outputs)))
         return outputs
 
-    @deprecated(
-        since="2.0.0", reason="This method does not need to be called anymore.")
+    @deprecated(since="2.0.0",
+                reason="This method does not need to be called anymore.")
     def scale_loss(self, loss):
         """
         Deprecated method, now ``scale_loss`` is an empty method,  
@@ -765,8 +769,8 @@ def scale_loss(self, loss):
         """
         return loss
 
-    @deprecated(
-        since="2.0.0", reason="This method does not need to be called anymore.")
+    @deprecated(since="2.0.0",
+                reason="This method does not need to be called anymore.")
     def apply_collective_grads(self):
         """
         Deprecated method, now ``apply_collective_grads`` is an empty method, 
@@ -840,8 +844,8 @@ def set_state_dict(self, state_dict, use_structured_name=True):
 
         '''
 
-        self._layers.set_state_dict(
-            state_dict, use_structured_name=use_structured_name)
+        self._layers.set_state_dict(state_dict,
+                                    use_structured_name=use_structured_name)
 
     # [aliases] Compatible with old method names
     set_dict = set_state_dict
diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py
index 5fe4d4162e6e3..bc0bb4603525e 100644
--- a/python/paddle/fluid/dygraph/parallel_helper.py
+++ b/python/paddle/fluid/dygraph/parallel_helper.py
@@ -14,6 +14,7 @@
 import os
 from ..layers import collective
 from ..framework import Parameter
+
 __parallel_ctx__clz__ = None
 
 
diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py
index 05a76a8d12586..837287faa0f48 100644
--- a/python/paddle/fluid/dygraph/rnn.py
+++ b/python/paddle/fluid/dygraph/rnn.py
@@ -162,21 +162,20 @@ def __init__(self,
                 shape=[4 * self._hidden_size, self._hidden_size],
                 dtype=self._dtype)
 
-            self._bias_ih = self.create_parameter(
-                attr=bias_ih_param_attr,
-                shape=[4 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True)
-            self._bias_hh = self.create_parameter(
-                attr=bias_hh_param_attr,
-                shape=[4 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True)
+            self._bias_ih = self.create_parameter(attr=bias_ih_param_attr,
+                                                  shape=[4 * self._hidden_size],
+                                                  dtype=self._dtype,
+                                                  is_bias=True)
+            self._bias_hh = self.create_parameter(attr=bias_hh_param_attr,
+                                                  shape=[4 * self._hidden_size],
+                                                  dtype=self._dtype,
+                                                  is_bias=True)
 
         else:
 
-            self._forget_bias = fill_constant(
-                [1], dtype=dtype, value=forget_bias)
+            self._forget_bias = fill_constant([1],
+                                              dtype=dtype,
+                                              value=forget_bias)
             self._forget_bias.stop_gradient = False
 
             self._weight = self.create_parameter(
@@ -186,11 +185,10 @@ def __init__(self,
                 ],
                 dtype=dtype)
 
-            self._bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[4 * self._hidden_size],
-                dtype=dtype,
-                is_bias=True)
+            self._bias = self.create_parameter(attr=self._bias_attr,
+                                               shape=[4 * self._hidden_size],
+                                               dtype=dtype,
+                                               is_bias=True)
 
     def forward(self, input, pre_hidden, pre_cell):
 
@@ -226,9 +224,10 @@ def forward(self, input, pre_hidden, pre_cell):
             gate_input = elementwise_add(gate_input, self._bias)
             i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
             new_cell = elementwise_add(
-                elementwise_mul(pre_cell,
-                                self._gate_activation(
-                                    elementwise_add(f, self._forget_bias))),
+                elementwise_mul(
+                    pre_cell,
+                    self._gate_activation(elementwise_add(f,
+                                                          self._forget_bias))),
                 elementwise_mul(sigmoid(i), tanh(j)))
             new_hidden = self._activation(new_cell) * self._gate_activation(o)
 
@@ -363,16 +362,14 @@ def __init__(self,
                 shape=[3 * self._hidden_size, self._hidden_size],
                 dtype=self._dtype)
 
-            self._bias_ih = self.create_parameter(
-                attr=bias_ih_param_attr,
-                shape=[3 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True)
-            self._bias_hh = self.create_parameter(
-                attr=bias_hh_param_attr,
-                shape=[3 * self._hidden_size],
-                dtype=self._dtype,
-                is_bias=True)
+            self._bias_ih = self.create_parameter(attr=bias_ih_param_attr,
+                                                  shape=[3 * self._hidden_size],
+                                                  dtype=self._dtype,
+                                                  is_bias=True)
+            self._bias_hh = self.create_parameter(attr=bias_hh_param_attr,
+                                                  shape=[3 * self._hidden_size],
+                                                  dtype=self._dtype,
+                                                  is_bias=True)
 
         else:
 
@@ -403,9 +400,7 @@ def __init__(self,
 
             self._candidate_weight = self.create_parameter(
                 attr=candidate_weight_param_attr,
-                shape=[
-                    self._input_size + self._hidden_size, self._hidden_size
-                ],
+                shape=[self._input_size + self._hidden_size, self._hidden_size],
                 dtype=dtype)
 
             self._gate_bias = self.create_parameter(
@@ -455,8 +450,8 @@ def forward(self, input, pre_hidden):
 
             r_hidden = r * pre_hidden
 
-            candidate = matmul(
-                concat([input, r_hidden], 1), self._candidate_weight)
+            candidate = matmul(concat([input, r_hidden], 1),
+                               self._candidate_weight)
             candidate = elementwise_add(candidate, self._candidate_bias)
 
             c = self._activation(candidate)
diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py
index 44a49148ca044..046a98293e832 100644
--- a/python/paddle/fluid/dygraph/tracer.py
+++ b/python/paddle/fluid/dygraph/tracer.py
@@ -189,15 +189,15 @@ def eager_trace_op(self,
                     # Replaced outputs by function returns
                     if isinstance(returns[i], list):
                         for j in range(len(returns[i])):
-                            outputs[retname][j].reconstruct_from_(returns[i][j],
-                                                                  False)
+                            outputs[retname][j].reconstruct_from_(
+                                returns[i][j], False)
                     else:
                         if isinstance(outputs[retname], list):
-                            outputs[retname][0].reconstruct_from_(returns[i],
-                                                                  False)
+                            outputs[retname][0].reconstruct_from_(
+                                returns[i], False)
                         else:
-                            outputs[retname].reconstruct_from_(returns[i],
-                                                               False)
+                            outputs[retname].reconstruct_from_(
+                                returns[i], False)
         elif isinstance(returns, list):
             assert len(outputs.keys()) == 1
             key = list(outputs.keys())[0]
@@ -277,8 +277,8 @@ def eager_final_state_trace_op(self,
                     # Replaced outputs by function returns
                     if isinstance(returns[i], list):
                         for j in range(len(returns[i])):
-                            outputs[retname][j].reconstruct_from_(returns[i][j],
-                                                                  False)
+                            outputs[retname][j].reconstruct_from_(
+                                returns[i][j], False)
                     else:
                         outputs[retname][0].reconstruct_from_(returns[i], False)
         elif isinstance(returns, list):
@@ -316,8 +316,9 @@ def trace_op(self,
                                     inplace_map)
         else:
             self.trace(type, inputs, outputs, attrs,
-                       framework._current_expected_place(), self._has_grad and
-                       not stop_gradient, inplace_map if inplace_map else {})
+                       framework._current_expected_place(), self._has_grad
+                       and not stop_gradient,
+                       inplace_map if inplace_map else {})
 
     def train_mode(self):
         self._train_mode = True
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index add3d73efc7e1..2422c68622a00 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -70,6 +70,7 @@ def remove(self):
 
 
 def monkey_patch_varbase():
+
     @switch_to_static_graph
     def _to_static_var(self, to_parameter=False, **kwargs):
         """
@@ -258,12 +259,12 @@ def backward(self, grad_tensor=None, retain_graph=False):
             if grad_tensor is not None:
                 if framework._in_eager_mode_:
                     assert isinstance(
-                        grad_tensor, core.eager.
-                        Tensor), "The type of grad_tensor must be paddle.Tensor"
+                        grad_tensor, core.eager.Tensor
+                    ), "The type of grad_tensor must be paddle.Tensor"
                 else:
                     assert isinstance(
-                        grad_tensor, paddle.
-                        Tensor), "The type of grad_tensor must be paddle.Tensor"
+                        grad_tensor, paddle.Tensor
+                    ), "The type of grad_tensor must be paddle.Tensor"
                 assert grad_tensor.shape == self.shape, \
                     "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format(
                     grad_tensor.name, grad_tensor.shape, self.name, self.shape)
@@ -304,7 +305,8 @@ def backward(self, grad_tensor=None, retain_graph=False):
     @deprecated(
         since="2.1.0",
         level=1,
-        reason="Please use tensor.grad, which returns the tensor value of the gradient."
+        reason=
+        "Please use tensor.grad, which returns the tensor value of the gradient."
     )
     def gradient(self):
         """
@@ -341,9 +343,9 @@ def gradient(self):
 
             new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True)
             if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
-                return (
-                    np.array(new_ivar.value().get_selected_rows().get_tensor()),
-                    np.array(new_ivar.value().get_selected_rows().rows()))
+                return (np.array(
+                    new_ivar.value().get_selected_rows().get_tensor()),
+                        np.array(new_ivar.value().get_selected_rows().rows()))
             else:
                 return np.array(new_ivar.value().get_tensor())
 
@@ -715,7 +717,9 @@ def contain_tensor(item):
         return False
 
     def __getitem__(self, item):
+
         def is_list_tuple(index, contain_type):
+
             def _is_list_tuple(item):
                 if isinstance(item, (tuple, list)):
                     for s in item:
@@ -743,6 +747,7 @@ def _is_list_tuple(item):
             return self._getitem_index_not_tensor(item)
 
     def __setitem__(self, item, value):
+
         def contain_tensor_or_list(item):
             if not isinstance(item, tuple):
                 item = [item]
@@ -902,7 +907,7 @@ def values(self):
                     indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
                     values = [1, 2, 3, 4, 5]
                     dense_shape = [3, 4]
-                    sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape)
+                    sparse_x = paddle.incubate.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape)
                     print(sparse_x.values())
                     #[1, 2, 3, 4, 5]
         """
@@ -932,7 +937,7 @@ def to_dense(self):
                     indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
                     values = [1, 2, 3, 4, 5]
                     dense_shape = [3, 4]
-                    sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape)
+                    sparse_x = paddle.incubate.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape)
                     dense_x = sparse_x.to_dense()
                     #[[0., 1., 0., 2.],
                     # [0., 0., 3., 0.],
@@ -984,17 +989,25 @@ def to_sparse_coo(self, sparse_dim):
     if framework._in_eager_mode_ and not hasattr(core, "eager"):
         return
 
-    for method_name, method in (
-        ("__bool__", __bool__), ("__nonzero__", __nonzero__),
-        ("_to_static_var", _to_static_var), ("set_value", set_value),
-        ("block", block), ("backward", backward), ("clear_grad", clear_grad),
-        ("inplace_version", inplace_version), ("gradient", gradient),
-        ("register_hook", register_hook), ("__str__", __str__),
-        ("__repr__", __str__), ("__deepcopy__", __deepcopy__),
-        ("__module__", "paddle"), ("__array__", __array__),
-        ("__getitem__", __getitem__), ("item", item),
-        ("__setitem__", __setitem__), ("_to", _to), ("values", values),
-        ("to_dense", to_dense), ("to_sparse_coo", to_sparse_coo)):
+    for method_name, method in (("__bool__", __bool__), ("__nonzero__",
+                                                         __nonzero__),
+                                ("_to_static_var",
+                                 _to_static_var), ("set_value", set_value),
+                                ("block", block), ("backward", backward),
+                                ("clear_grad", clear_grad), ("inplace_version",
+                                                             inplace_version),
+                                ("gradient", gradient), ("register_hook",
+                                                         register_hook),
+                                ("__str__", __str__), ("__repr__", __str__),
+                                ("__deepcopy__", __deepcopy__), ("__module__",
+                                                                 "paddle"),
+                                ("__array__",
+                                 __array__), ("__getitem__",
+                                              __getitem__), ("item", item),
+                                ("__setitem__",
+                                 __setitem__), ("_to", _to), ("values", values),
+                                ("to_dense", to_dense), ("to_sparse_coo",
+                                                         to_sparse_coo)):
         if framework._in_eager_mode_:
             setattr(core.eager.Tensor, method_name, method)
         else:
diff --git a/python/paddle/fluid/entry_attr.py b/python/paddle/fluid/entry_attr.py
index 0fbbf7c36e8f5..375079570429d 100644
--- a/python/paddle/fluid/entry_attr.py
+++ b/python/paddle/fluid/entry_attr.py
@@ -39,6 +39,7 @@ def _to_attr(self):
 
 
 class ProbabilityEntry(EntryAttr):
+
     def __init__(self, probability):
         super(ProbabilityEntry, self).__init__()
 
@@ -56,6 +57,7 @@ def _to_attr(self):
 
 
 class CountFilterEntry(EntryAttr):
+
     def __init__(self, count_filter):
         super(CountFilterEntry, self).__init__()
 
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 5c8386af3a7ce..510733d4c1c7c 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -33,13 +33,12 @@
 
 def _clone_var_(block, var):
     assert isinstance(var, Variable)
-    return block.create_var(
-        name=var.name,
-        shape=var.shape,
-        dtype=var.dtype,
-        type=var.type,
-        lod_level=var.lod_level,
-        persistable=True)
+    return block.create_var(name=var.name,
+                            shape=var.shape,
+                            dtype=var.dtype,
+                            type=var.type,
+                            lod_level=var.lod_level,
+                            persistable=True)
 
 
 class Evaluator(object):
@@ -89,8 +88,10 @@ def reset(self, executor, reset_program=None):
             for var in self.states:
                 assert isinstance(var, Variable)
                 g_var = _clone_var_(reset_program.current_block(), var)
-                layers.fill_constant(
-                    shape=g_var.shape, value=0.0, dtype=g_var.dtype, out=g_var)
+                layers.fill_constant(shape=g_var.shape,
+                                     value=0.0,
+                                     dtype=g_var.dtype,
+                                     out=g_var)
 
         executor.run(reset_program)
 
@@ -115,11 +116,11 @@ def _create_state(self, suffix, dtype, shape):
         Returns: State variable
 
         """
-        state = self.helper.create_variable(
-            name="_".join([unique_name.generate(self.helper.name), suffix]),
-            persistable=True,
-            dtype=dtype,
-            shape=shape)
+        state = self.helper.create_variable(name="_".join(
+            [unique_name.generate(self.helper.name), suffix]),
+                                            persistable=True,
+                                            dtype=dtype,
+                                            shape=shape)
         self.states.append(state)
         return state
 
@@ -158,21 +159,24 @@ class ChunkEvaluator(Evaluator):
     """
 
     def __init__(
-            self,
-            input,
-            label,
-            chunk_scheme,
-            num_chunk_types,
-            excluded_chunk_types=None, ):
+        self,
+        input,
+        label,
+        chunk_scheme,
+        num_chunk_types,
+        excluded_chunk_types=None,
+    ):
         super(ChunkEvaluator, self).__init__("chunk_eval")
         main_program = self.helper.main_program
         if main_program.current_block().idx != 0:
             raise ValueError("You can only invoke Evaluator in root block")
 
-        self.num_infer_chunks = self._create_state(
-            dtype='int64', shape=[1], suffix='num_infer_chunks')
-        self.num_label_chunks = self._create_state(
-            dtype='int64', shape=[1], suffix='num_label_chunks')
+        self.num_infer_chunks = self._create_state(dtype='int64',
+                                                   shape=[1],
+                                                   suffix='num_infer_chunks')
+        self.num_label_chunks = self._create_state(dtype='int64',
+                                                   shape=[1],
+                                                   suffix='num_label_chunks')
         self.num_correct_chunks = self._create_state(
             dtype='int64', shape=[1], suffix='num_correct_chunks')
         precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
@@ -180,16 +184,14 @@ def __init__(
             label=label,
             chunk_scheme=chunk_scheme,
             num_chunk_types=num_chunk_types,
-            excluded_chunk_types=excluded_chunk_types, )
-        layers.sums(
-            input=[self.num_infer_chunks, num_infer_chunks],
-            out=self.num_infer_chunks)
-        layers.sums(
-            input=[self.num_label_chunks, num_label_chunks],
-            out=self.num_label_chunks)
-        layers.sums(
-            input=[self.num_correct_chunks, num_correct_chunks],
-            out=self.num_correct_chunks)
+            excluded_chunk_types=excluded_chunk_types,
+        )
+        layers.sums(input=[self.num_infer_chunks, num_infer_chunks],
+                    out=self.num_infer_chunks)
+        layers.sums(input=[self.num_label_chunks, num_label_chunks],
+                    out=self.num_label_chunks)
+        layers.sums(input=[self.num_correct_chunks, num_correct_chunks],
+                    out=self.num_correct_chunks)
 
         self.metrics.extend([precision, recall, f1_score])
 
@@ -209,10 +211,8 @@ def eval(self, executor, eval_program=None):
             num_correct_chunks) / num_label_chunks if num_label_chunks else 0
         f1_score = float(2 * precision * recall) / (
             precision + recall) if num_correct_chunks else 0
-        return np.array(
-            [precision], dtype='float32'), np.array(
-                [recall], dtype='float32'), np.array(
-                    [f1_score], dtype='float32')
+        return np.array([precision], dtype='float32'), np.array(
+            [recall], dtype='float32'), np.array([f1_score], dtype='float32')
 
 
 class EditDistance(Evaluator):
@@ -252,29 +252,31 @@ def __init__(self, input, label, ignored_tokens=None, **kwargs):
         if main_program.current_block().idx != 0:
             raise ValueError("You can only invoke Evaluator in root block")
 
-        self.total_distance = self._create_state(
-            dtype='float32', shape=[1], suffix='total_distance')
-        self.seq_num = self._create_state(
-            dtype='int64', shape=[1], suffix='seq_num')
-        self.instance_error = self._create_state(
-            dtype='int64', shape=[1], suffix='instance_error')
-        distances, seq_num = layers.edit_distance(
-            input=input, label=label, ignored_tokens=ignored_tokens)
+        self.total_distance = self._create_state(dtype='float32',
+                                                 shape=[1],
+                                                 suffix='total_distance')
+        self.seq_num = self._create_state(dtype='int64',
+                                          shape=[1],
+                                          suffix='seq_num')
+        self.instance_error = self._create_state(dtype='int64',
+                                                 shape=[1],
+                                                 suffix='instance_error')
+        distances, seq_num = layers.edit_distance(input=input,
+                                                  label=label,
+                                                  ignored_tokens=ignored_tokens)
 
         zero = layers.fill_constant(shape=[1], value=0.0, dtype='float32')
         compare_result = layers.equal(distances, zero)
         compare_result_int = layers.cast(x=compare_result, dtype='int64')
         seq_right_count = layers.reduce_sum(compare_result_int)
-        instance_error_count = layers.elementwise_sub(
-            x=seq_num, y=seq_right_count)
+        instance_error_count = layers.elementwise_sub(x=seq_num,
+                                                      y=seq_right_count)
         total_distance = layers.reduce_sum(distances)
-        layers.sums(
-            input=[self.total_distance, total_distance],
-            out=self.total_distance)
+        layers.sums(input=[self.total_distance, total_distance],
+                    out=self.total_distance)
         layers.sums(input=[self.seq_num, seq_num], out=self.seq_num)
-        layers.sums(
-            input=[self.instance_error, instance_error_count],
-            out=self.instance_error)
+        layers.sums(input=[self.instance_error, instance_error_count],
+                    out=self.instance_error)
         self.metrics.append(total_distance)
         self.metrics.append(instance_error_count)
 
@@ -289,10 +291,10 @@ def eval(self, executor, eval_program=None):
             seq_num = layers.cast(x=seq_num, dtype='float32')
             instance_error = layers.cast(x=instance_error, dtype='float32')
             avg_distance = layers.elementwise_div(x=total_distance, y=seq_num)
-            avg_instance_error = layers.elementwise_div(
-                x=instance_error, y=seq_num)
-            result = executor.run(
-                eval_program, fetch_list=[avg_distance, avg_instance_error])
+            avg_instance_error = layers.elementwise_div(x=instance_error,
+                                                        y=seq_num)
+            result = executor.run(eval_program,
+                                  fetch_list=[avg_distance, avg_instance_error])
         return np.array(result[0]), np.array(result[1])
 
 
@@ -375,25 +377,26 @@ def __init__(self,
             label = layers.concat([gt_label, gt_box], axis=1)
 
         # calculate mean average precision (mAP) of current mini-batch
-        map = detection.detection_map(
-            input,
-            label,
-            class_num,
-            background_label,
-            overlap_threshold=overlap_threshold,
-            evaluate_difficult=evaluate_difficult,
-            ap_version=ap_version)
+        map = detection.detection_map(input,
+                                      label,
+                                      class_num,
+                                      background_label,
+                                      overlap_threshold=overlap_threshold,
+                                      evaluate_difficult=evaluate_difficult,
+                                      ap_version=ap_version)
 
         self._create_state(dtype='int32', shape=None, suffix='accum_pos_count')
         self._create_state(dtype='float32', shape=None, suffix='accum_true_pos')
-        self._create_state(
-            dtype='float32', shape=None, suffix='accum_false_pos')
+        self._create_state(dtype='float32',
+                           shape=None,
+                           suffix='accum_false_pos')
 
         self.has_state = None
-        var = self.helper.create_variable(
-            persistable=True, dtype='int32', shape=[1])
-        self.helper.set_variable_initializer(
-            var, initializer=Constant(value=int(0)))
+        var = self.helper.create_variable(persistable=True,
+                                          dtype='int32',
+                                          shape=[1])
+        self.helper.set_variable_initializer(var,
+                                             initializer=Constant(value=int(0)))
         self.has_state = var
 
         # calculate accumulative mAP
@@ -409,11 +412,10 @@ def __init__(self,
             out_states=self.states,
             ap_version=ap_version)
 
-        layers.fill_constant(
-            shape=self.has_state.shape,
-            value=1,
-            dtype=self.has_state.dtype,
-            out=self.has_state)
+        layers.fill_constant(shape=self.has_state.shape,
+                             value=1,
+                             dtype=self.has_state.dtype,
+                             out=self.has_state)
 
         self.cur_map = map
         self.accum_map = accum_map
@@ -426,6 +428,8 @@ def reset(self, executor, reset_program=None):
             reset_program = Program()
         with program_guard(main_program=reset_program):
             var = _clone_var_(reset_program.current_block(), self.has_state)
-            layers.fill_constant(
-                shape=var.shape, value=0, dtype=var.dtype, out=var)
+            layers.fill_constant(shape=var.shape,
+                                 value=0,
+                                 dtype=var.dtype,
+                                 out=var)
         executor.run(reset_program)
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
old mode 100644
new mode 100755
index 164545d0a0595..0d4acf5fe6d86
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -278,8 +278,9 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
             assert op.desc.input('X')[0] == feed_holder_name
             feed_target_name = op.desc.output('Out')[0]
             if feed_target_name not in feed_targets:
-                raise Exception("'feed_targets' does not have {} variable".
-                                format(feed_target_name))
+                raise Exception(
+                    "'feed_targets' does not have {} variable".format(
+                        feed_target_name))
         else:
             break
     if feed_count > 0 and feed_count != len(feed_targets):
@@ -322,8 +323,9 @@ def has_fetch_operators(block,
             if fetch_target_name not in [
                     var.desc.name() for var in fetch_targets
             ]:
-                raise Exception("'fetch_targets' does not have {} variable".
-                                format(fetch_target_name))
+                raise Exception(
+                    "'fetch_targets' does not have {} variable".format(
+                        fetch_target_name))
             idx = op.desc.attr('col')
             assert fetch_target_name == fetch_targets[idx].desc.name()
     if fetch_count > 0 and fetch_count != len(fetch_targets):
@@ -366,6 +368,7 @@ def _fetch_var(name, scope=None, return_numpy=True):
 
 
 def _to_name_str(var):
+
     def _to_str(var):
         if isinstance(var, Variable):
             return var.desc.name()
@@ -426,7 +429,7 @@ def _prepare_fleet_executor():
 
 
 def _get_strong_program_cache_key(program, feed, fetch_list):
-    # NOTE(xiongkun) id(proram) may be duplicate. So add addition var_name as cache key. 
+    # NOTE(xiongkun) id(program) may be duplicated, so additionally use the block's var names as part of the cache key.
     def _get_varname_from_block(block):
         block_str = []
         for var_name in list(block.vars.keys()):
@@ -435,8 +438,8 @@ def _get_varname_from_block(block):
 
     inner_program = program._program if isinstance(
         program, compiler.CompiledProgram) else program
-    return _get_varname_from_block(inner_program.blocks[0]) + str(id(
-        program)) + _get_program_cache_key(feed, fetch_list)
+    return _get_varname_from_block(inner_program.blocks[0]) + str(
+        id(program)) + _get_program_cache_key(feed, fetch_list)
 
 
 def _get_program_cache_key(feed, fetch_list):
@@ -499,6 +502,7 @@ def _as_lodtensor(data, place, dtype=None):
 
 
 class FetchHandler(object):
+
     def __init__(self, var_dict=None, period_secs=60):
         assert var_dict != None
         self.var_dict = var_dict
@@ -525,6 +529,7 @@ def handler(self, res_dict):
 
 
 class _StandaloneExecutor(object):
+
     def __init__(self, place, main_program, scope):
         self._place = core.Place()
         self._place.set_place(place)
@@ -610,6 +615,7 @@ def _check_fetch(self, fetch_list):
 
 
 class _ExecutorCache(object):
+
     def __init__(self, place):
         # {Program : _StandaloneExecutor}
         self._place = place
@@ -789,11 +795,10 @@ def _add_feed_fetch_ops(self,
             for i, name in enumerate(feed):
                 if global_block.has_var(name):
                     out = global_block.var(name)
-                    global_block._prepend_op(
-                        type='feed',
-                        inputs={'X': [feed_var]},
-                        outputs={'Out': [out]},
-                        attrs={'col': i})
+                    global_block._prepend_op(type='feed',
+                                             inputs={'X': [feed_var]},
+                                             outputs={'Out': [out]},
+                                             attrs={'col': i})
                 else:
                     warnings.warn(
                         "The variable %s is not found in program. It is not declared or is pruned."
@@ -809,13 +814,13 @@ def _add_feed_fetch_ops(self,
                                    fetch_op):
             for i, var in enumerate(fetch_list):
                 assert isinstance(var, Variable) or isinstance(
-                    var, six.string_types), (
-                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
-                global_block.append_op(
-                    type=fetch_op,
-                    inputs={'X': [var]},
-                    outputs={'Out': [fetch_var]},
-                    attrs={'col': i})
+                    var,
+                    six.string_types), ("Wrong type for fetch_list[%s]: %s" %
+                                        (i, type(var)))
+                global_block.append_op(type=fetch_op,
+                                       inputs={'X': [var]},
+                                       outputs={'Out': [fetch_var]},
+                                       attrs={'col': i})
 
         return tmp_program
 
@@ -886,8 +891,9 @@ def _get_targets(_optimize_ops, _fetch_list, item):
             elif isinstance(item, tuple):
                 if not isinstance(item[0], (list, tuple)):
                     raise TypeError(
-                        "Requires fetch_list[{}][0] shall be one of (list, tuple) when type(fetch_list[{}]) is `tuple`, but received fetch_list[{}][0]'s type is `{}`.".
-                        format(index, index, index, type(item[0]).__name__))
+                        "Requires fetch_list[{}][0] shall be one of (list, tuple) when type(fetch_list[{}]) is `tuple`, but received fetch_list[{}][0]'s type is `{}`."
+                        .format(index, index, index,
+                                type(item[0]).__name__))
                 for i in item[0]:
                     _get_targets(_optimize_ops, _fetch_list, i)
             else:
@@ -1051,8 +1057,8 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
                     # always set to CPU place, since the tensor need to be split
                     # it is fast in CPU
                     feed_tensor = _as_lodtensor(feed[feed_name],
-                                                core.CPUPlace(), var.dtype
-                                                if var else None)
+                                                core.CPUPlace(),
+                                                var.dtype if var else None)
                 if need_check_feed:
                     check_feed_shape_type(var, feed_tensor, exe.device_count())
                 feed_tensor_dict[feed_name] = feed_tensor
@@ -1071,8 +1077,8 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
                         feed_name) if need_check_feed else None
                     if not isinstance(tensor, core.LoDTensor):
                         tensor = _as_lodtensor(each[feed_name],
-                                               program._places[i], var.dtype
-                                               if var else None)
+                                               program._places[i],
+                                               var.dtype if var else None)
                     if need_check_feed:
                         check_feed_shape_type(var, tensor)
                     res_dict[feed_name] = tensor
@@ -1092,9 +1098,8 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
                     "take any effect! Please set the learning rate manually before each batch!"
                 )
             else:
-                exe.feed_and_split_tensor_into_local_scopes({
-                    lr_sheduler._var_name: lr_tensor
-                })
+                exe.feed_and_split_tensor_into_local_scopes(
+                    {lr_sheduler._var_name: lr_tensor})
 
         fetch_var_names = list(map(_to_name_str, fetch_list))
         tensors = exe.run(fetch_var_names, return_merged)._move_to_list()
@@ -1282,17 +1287,16 @@ def run(self,
 
         """
         try:
-            res = self._run_impl(
-                program=program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name,
-                scope=scope,
-                return_numpy=return_numpy,
-                use_program_cache=use_program_cache,
-                use_prune=use_prune,
-                return_merged=return_merged)
+            res = self._run_impl(program=program,
+                                 feed=feed,
+                                 fetch_list=fetch_list,
+                                 feed_var_name=feed_var_name,
+                                 fetch_var_name=fetch_var_name,
+                                 scope=scope,
+                                 return_numpy=return_numpy,
+                                 use_program_cache=use_program_cache,
+                                 use_prune=use_prune,
+                                 return_merged=return_merged)
             core.update_autotune_status()
             return res
         except Exception as e:
@@ -1315,18 +1319,20 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name,
                 # Move prepare here for port conflict with nccl in startup program
                 if self._fleet_executor is None:
                     self._fleet_executor = _prepare_fleet_executor()
-                return self._run_using_fleet_executor(
-                    program=program, feed=feed, fetch_list=fetch_list)
+                return self._run_using_fleet_executor(program=program,
+                                                      feed=feed,
+                                                      fetch_list=fetch_list)
             if "startup_program" in program._pipeline_opt:
                 program = program._pipeline_opt["startup_program"]
             else:
-                return self._run_pipeline(
-                    program,
-                    fetch_list=fetch_list,
-                    use_program_cache=use_program_cache)
+                return self._run_pipeline(program,
+                                          fetch_list=fetch_list,
+                                          use_program_cache=use_program_cache)
 
         if isinstance(program, Program) and program._heter_pipeline_opt:
-            ## change default executor 
+            #print("program._heter_pipeline_opt: {}".format(
+            #    program._heter_pipeline_opt))
+            ## change default executor
             heter_place = program._heter_pipeline_opt["heter_place"]
             heter_place = framework._get_paddle_place(heter_place)
             p = core.Place()
@@ -1334,6 +1340,7 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name,
             self._default_executor = core.Executor(p)
             # TODO(zhangminxu): support heterps pipeline training using exe.run
             if "startup_program" in program._heter_pipeline_opt:
+                #print("get startup_program from _pipeline_opt")
                 program = program._heter_pipeline_opt["startup_program"]
 
         if isinstance(program, Program) and \
@@ -1391,6 +1398,7 @@ def _can_use_interpreter_core(program, place):
                 return False
 
             compiled = isinstance(program, compiler.CompiledProgram)
+            # print("compiled is : {}".format(compiled))
             # NOTE(zhiqiu): do not support compiled program now
             if compiled:
                 return False
@@ -1453,11 +1461,11 @@ def _can_use_interpreter_core(program, place):
                     lr_sheduler = program.lr_sheduler
                     lr_value = lr_sheduler()
                     lr_var = program.global_block().vars[lr_sheduler._var_name]
-                    data = np.array(
-                        [lr_value]).astype(convert_dtype(lr_var.dtype))
+                    data = np.array([lr_value
+                                     ]).astype(convert_dtype(lr_var.dtype))
                     tensor = core.get_variable_tensor(scope,
                                                       lr_sheduler._var_name)
-                    # NOTE(dev): `set` always call TensorCopySync that is a 
+                    # NOTE(dev): `set` always calls TensorCopySync that is a
                     # blocking behavior. So we use `_copy_from` to replace it.
                     cpu_tensor = _as_lodtensor(data, core.CPUPlace())
                     tensor._copy_from(cpu_tensor, self.place)
@@ -1499,37 +1507,34 @@ def _can_use_interpreter_core(program, place):
                 # _graph in program does not support inference since the _graph is optimized
                 # through optimizer.minimize function and should not be used as inference graph
                 # assert not program._graph._is_inference
-                return self._run_parallel(
-                    program._graph,
-                    scope=scope,
-                    feed=feed,
-                    fetch_list=fetch_list,
-                    fetch_var_name=fetch_var_name,
-                    return_numpy=return_numpy,
-                    return_merged=return_merged)
-
-            return self._run_program(
-                program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name,
-                scope=scope,
-                return_numpy=return_numpy,
-                use_program_cache=use_program_cache)
+                return self._run_parallel(program._graph,
+                                          scope=scope,
+                                          feed=feed,
+                                          fetch_list=fetch_list,
+                                          fetch_var_name=fetch_var_name,
+                                          return_numpy=return_numpy,
+                                          return_merged=return_merged)
+
+            return self._run_program(program,
+                                     feed=feed,
+                                     fetch_list=fetch_list,
+                                     feed_var_name=feed_var_name,
+                                     fetch_var_name=fetch_var_name,
+                                     scope=scope,
+                                     return_numpy=return_numpy,
+                                     use_program_cache=use_program_cache)
 
         program._compile(scope, self.place)
         if program._is_inference:
             return self._run_inference(program._executor, feed)
         else:
-            return self._run_parallel(
-                program,
-                scope=scope,
-                feed=feed,
-                fetch_list=fetch_list,
-                fetch_var_name=fetch_var_name,
-                return_numpy=return_numpy,
-                return_merged=return_merged)
+            return self._run_parallel(program,
+                                      scope=scope,
+                                      feed=feed,
+                                      fetch_list=fetch_list,
+                                      fetch_var_name=fetch_var_name,
+                                      return_numpy=return_numpy,
+                                      return_merged=return_merged)
 
     def _run_program(self, program, feed, fetch_list, feed_var_name,
                      fetch_var_name, scope, return_numpy, use_program_cache):
@@ -1586,12 +1591,11 @@ def _run_program(self, program, feed, fetch_list, feed_var_name,
             ctx = cached_ctx
             scope = cached_scope
         else:
-            program = self._add_feed_fetch_ops(
-                program=program,
-                feed=feed,
-                fetch_list=fetch_list,
-                feed_var_name=feed_var_name,
-                fetch_var_name=fetch_var_name)
+            program = self._add_feed_fetch_ops(program=program,
+                                               feed=feed,
+                                               fetch_list=fetch_list,
+                                               feed_var_name=feed_var_name,
+                                               fetch_var_name=fetch_var_name)
 
         self._feed_data(program, feed, feed_var_name, scope)
         if hasattr(program, 'lr_sheduler'):
@@ -1621,7 +1625,8 @@ def _run_inference(self, exe, feed):
         return exe.run(feed)
 
     def _check_fetch_list(self, fetch_list):
-        is_fetch_var = lambda var: isinstance(var, (Variable, str, six.string_types))
+        is_fetch_var = lambda var: isinstance(var,
+                                              (Variable, str, six.string_types))
         is_tuple_list = lambda var: isinstance(var, (tuple, list))
 
         if fetch_list is None: return []
@@ -1644,8 +1649,9 @@ def _check_fetch_list(self, fetch_list):
                     res.append(var)
             else:
                 raise TypeError(
-                    "Require fetch_list[{}] 's type shall be one of (Variable, str), but received {}.".
-                    format(i, type(var).__name__))
+                    "Require fetch_list[{}] 's type shall be one of (Variable, str), but received {}."
+                    .format(i,
+                            type(var).__name__))
 
         return res
 
@@ -1760,7 +1766,7 @@ def _run_from_dataset(self,
             import paddle
             if dataset is not None:
                 raise RuntimeError("dataset should be None for pipeline mode")
-            # The following fake dataset is created to call 
+            # The following fake dataset is created to call
             # the _prepare_trainer api, and it is meaningless.
             data_vars = []
             for var in program.global_block().vars.values():
@@ -1778,24 +1784,26 @@ def _run_from_dataset(self,
             dataset.set_use_var(data_vars)
         elif program._heter_pipeline_opt is not None:
             stage_id = program._heter_pipeline_opt["pipeline_stage"]
+            #print("test_fl_stage_id: {}".format(stage_id))
             heter_place = program._heter_pipeline_opt["heter_place"]
             if stage_id != 0:
-                import paddle
-                if dataset is not None:
-                    raise RuntimeError(
-                        "dataset should be None for heter pipeline mode")
-                # The following fake dataset is created to call 
-                # the _prepare_trainer api, and it is meaningless.
-                data_vars = []
-                for var in program.global_block().vars.values():
-                    if var.is_data:
-                        data_vars.append(var)
-                dataset = paddle.fluid.DatasetFactory().create_dataset(
-                    'InMemoryDataset')
-                dataset.set_batch_size(1)
-                dataset.set_thread(1)
-                dataset.set_filelist(['None'])
-                dataset.set_use_var(data_vars)
+                if "is_fl_mode" not in program._heter_pipeline_opt:
+                    import paddle
+                    if dataset is not None:
+                        raise RuntimeError(
+                            "dataset should be None for heter pipeline mode")
+                    # The following fake dataset is created to call
+                    # the _prepare_trainer api, and it is meaningless.
+                    data_vars = []
+                    for var in program.global_block().vars.values():
+                        if var.is_data:
+                            data_vars.append(var)
+                    dataset = paddle.fluid.DatasetFactory().create_dataset(
+                        'InMemoryDataset')
+                    dataset.set_batch_size(1)
+                    dataset.set_thread(1)
+                    dataset.set_filelist(['None'])
+                    dataset.set_use_var(data_vars)
             else:
                 if dataset is None:
                     raise RuntimeError(
@@ -1836,15 +1844,14 @@ def _run_from_dataset(self,
                         'op_role',
                         core.op_proto_and_checker_maker.OpRole.Optimize)
             fetch_list = None
-        scope, trainer = self._prepare_trainer(
-            program=program,
-            dataset=dataset,
-            scope=scope,
-            thread=thread,
-            debug=debug,
-            fetch_list=fetch_list,
-            fetch_info=fetch_info,
-            print_period=print_period)
+        scope, trainer = self._prepare_trainer(program=program,
+                                               dataset=dataset,
+                                               scope=scope,
+                                               thread=thread,
+                                               debug=debug,
+                                               fetch_list=fetch_list,
+                                               fetch_info=fetch_info,
+                                               print_period=print_period)
 
         trainer._set_infer(is_infer)
         trainer._gen_trainer_desc()
@@ -1855,10 +1862,11 @@ def _run_from_dataset(self,
         # warning if dataset not set psgpu in psgpu mode
         if dataset.use_ps_gpu is False and trainer.proto_desc.use_ps_gpu:
             logging.warning("dataset should call set_use_ps_gpu in PsGpu mode")
+
         dataset._dynamic_adjust_before_train(trainer.proto_desc.thread_num)
 
         if program._heter_pipeline_opt is None:
-            trainer_instance = self._default_executor.init_for_dataset(
+            trainer_instance = self._default_executor.init_for_dataset(  # -->InitForDataset
                 program.desc, trainer._desc(), scope, dataset.dataset)
         else:
             # cache trainer instance for heterps pipeline training
@@ -1869,6 +1877,7 @@ def _run_from_dataset(self,
             if trainer_instance is None:
                 trainer_instance = self._default_executor.init_for_dataset(
                     program.desc, trainer._desc(), scope, dataset.dataset)
+                #print("test_fl_ps - trainer_desc: {}\n".format(trainer))
                 self._add_trainer_cache(cache_key, trainer_instance)
             else:
                 trainer_instance.ResetDataset(dataset.dataset)
@@ -1950,12 +1959,11 @@ def _get_real_program_fetch_list():
                 if fetch_var_name in real_program.global_block().vars:
                     real_fetch_list.append(fetch_var)
 
-            real_program = self._add_feed_fetch_ops(
-                program=real_program,
-                feed=[],
-                fetch_list=real_fetch_list,
-                feed_var_name='feed',
-                fetch_var_name='fetch')
+            real_program = self._add_feed_fetch_ops(program=real_program,
+                                                    feed=[],
+                                                    fetch_list=real_fetch_list,
+                                                    feed_var_name='feed',
+                                                    fetch_var_name='fetch')
             main_block = real_program.block(0)
             for op in main_block.ops:
                 # set the op_role of fetch op to Optimize to avoid
@@ -1971,15 +1979,14 @@ def _get_real_program_fetch_list():
         program._pipeline_opt["section_program"] = real_program
         fetch_list = None
 
-        scope, trainer = self._prepare_trainer(
-            program=program,
-            dataset=dataset,
-            scope=scope,
-            thread=thread,
-            debug=debug,
-            fetch_list=fetch_list,
-            fetch_info=fetch_info,
-            print_period=print_period)
+        scope, trainer = self._prepare_trainer(program=program,
+                                               dataset=dataset,
+                                               scope=scope,
+                                               thread=thread,
+                                               debug=debug,
+                                               fetch_list=fetch_list,
+                                               fetch_info=fetch_info,
+                                               print_period=print_period)
 
         trainer._set_infer(is_infer)
         trainer._gen_trainer_desc()
@@ -2030,8 +2037,7 @@ def _prepare_fleet_executor_carrier(self,
                    fleet_opt["dist_strategy"]["pp_degree"] == 1:
                     warnings.warn("Using 1F1B scheduler with pp_degree == 1.")
                 tasks, task_id_to_rank = run1f1b(
-                    program, cur_rank,
-                    fleet_opt.get('num_micro_batches', 1),
+                    program, cur_rank, fleet_opt.get('num_micro_batches', 1),
                     fleet_opt.get('dist_strategy', {}), nrank)
             elif scheduler == 'Origin':
                 from paddle.distributed.fleet.fleet_executor_utils import origin
@@ -2054,8 +2060,7 @@ def _prepare_fleet_executor_carrier(self,
         # NOTE: the last argument is used to force create some vars in root scope,
         # won't be used during train.
         self._fleet_executor.init(carrier_id, program.desc, scope, place,
-                                  num_micro_batches, tasks, task_id_to_rank,
-                                  [])
+                                  num_micro_batches, tasks, task_id_to_rank, [])
 
     def _run_using_fleet_executor(self,
                                   program=None,
@@ -2102,10 +2107,9 @@ def _run_using_fleet_executor(self,
                 feed_task = fleet_opt['tasks'][0]
                 print("Inserting feed ops for task", feed_task.task_id())
                 feed_program = feed_task.get_program()
-                feed_program = self._add_feed_ops(
-                    program=feed_program,
-                    feed=real_feed,
-                    feed_var_name=feed_var_name)
+                feed_program = self._add_feed_ops(program=feed_program,
+                                                  feed=real_feed,
+                                                  feed_var_name=feed_var_name)
                 feed_task.set_program(feed_program)
 
                 # Insert fetch ops
@@ -2126,11 +2130,10 @@ def _run_using_fleet_executor(self,
                             core.op_proto_and_checker_maker.OpRole.Optimize)
                 fetch_task.set_program(fetch_program)
 
-            self._prepare_fleet_executor_carrier(
-                cache_key,
-                program=cached_program,
-                scope=cached_scope,
-                fleet_opt=fleet_opt)
+            self._prepare_fleet_executor_carrier(cache_key,
+                                                 program=cached_program,
+                                                 scope=cached_scope,
+                                                 fleet_opt=fleet_opt)
 
         if feed:
             # NOTE: don't have to traverse programs in task nodes,
@@ -2175,11 +2178,10 @@ def _add_feed_ops(self, program, feed, feed_var_name):
             for i, name in enumerate(feed):
                 if global_block.has_var(name):
                     out = global_block.var(name)
-                    global_block._prepend_op(
-                        type='feed',
-                        inputs={'X': [feed_var]},
-                        outputs={'Out': [out]},
-                        attrs={'col': i})
+                    global_block._prepend_op(type='feed',
+                                             inputs={'X': [feed_var]},
+                                             outputs={'Out': [out]},
+                                             attrs={'col': i})
                 else:
                     warnings.warn(
                         "The variable %s is not found in program. It is not declared or is pruned."
@@ -2214,13 +2216,13 @@ def _add_fetch_ops(self,
                                    fetch_op):
             for i, var in enumerate(fetch_list):
                 assert isinstance(var, Variable) or isinstance(
-                    var, six.string_types), (
-                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
-                global_block.append_op(
-                    type=fetch_op,
-                    inputs={'X': [var]},
-                    outputs={'Out': [fetch_var]},
-                    attrs={'col': i})
+                    var,
+                    six.string_types), ("Wrong type for fetch_list[%s]: %s" %
+                                        (i, type(var)))
+                global_block.append_op(type=fetch_op,
+                                       inputs={'X': [var]},
+                                       outputs={'Out': [fetch_var]},
+                                       attrs={'col': i})
 
         return tmp_program
 
@@ -2341,31 +2343,16 @@ def start_heter_trainer(self,
                             fetch_info=None,
                             print_period=100,
                             fetch_handler=None):
-        return self._start_heter_trainer(program, scope, False, debug,
-                                         fetch_list, fetch_info, print_period,
-                                         fetch_handler)
-
-    def _start_heter_trainer(self,
-                             program=None,
-                             scope=None,
-                             is_infer=False,
-                             debug=False,
-                             fetch_list=None,
-                             fetch_info=None,
-                             print_period=100,
-                             fetch_handler=None):
-
-        scope, trainer = self._prepare_trainer(
-            program=program,
-            dataset=None,
-            scope=scope,
-            thread=1,
-            debug=debug,
-            fetch_list=fetch_list,
-            fetch_info=fetch_info,
-            print_period=print_period)
-
-        trainer._set_infer(is_infer)
+        scope, trainer = self._prepare_trainer(program=program,
+                                               dataset=None,
+                                               scope=scope,
+                                               thread=1,
+                                               debug=debug,
+                                               fetch_list=fetch_list,
+                                               fetch_info=fetch_info,
+                                               print_period=print_period)
+
+        trainer._set_infer(False)
         trainer._gen_trainer_desc()
 
         self._dump_debug_info(program=program, trainer=trainer)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index bd453b3ddaa00..e0b4f8d19e861 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -84,22 +84,22 @@
 _global_flags_ = core.globals()
 
 # Some explanation of our execution system 2022.03
-# For now we have 3 kinds of execution system, since we refactored dygraph mode to 
+# For now we have 3 kinds of execution system, since we refactored dygraph mode to
 # build a fast execution system for dynamic mode. But we can't just remove all legacy
-# code once we present the new system for some historical reason. That's why we have 
+# code once we present the new system for some historical reason. That's why we have
 # these flags.
-# 
+#
 # 1. _non_static_mode():
-# _non_static_mode means  we are now running in legacy dygraph mode or dygraph mode. 
+# _non_static_mode means we are now running in legacy dygraph mode or dygraph mode.
 # 2. dygraph_mode():
 # This flags inidicates we are now running in dygraph mode which called eager mode before.
 # 3. _in_legacy_dygraph():
 # This flags inidicates we are now running in legacy dygraph mode
-# 
+#
 # They have a relation ship as below:
-# Both dygraph_mode and _in_legacy_dygraph are _non_static_mode, but if you are running in 
+# Both dygraph_mode and _in_legacy_dygraph are _non_static_mode, but if you are running in
 # dygraph mode means you are not in _in_legacy_dygraph.
-# 
+#
 # Why we have to make different of _in_legacy_dygraph and dygraph_mode?
 # In some performance issue, we find that python if statement cause server performance problem
 # and we need our new dygraph mode becomes as fast as it could be. That's why we make these flags
@@ -396,13 +396,13 @@ def version_cmp(ver_a, ver_b):
         return
 
     min_version_split = min_version.split('.')
-    min_version_to_check = min_version_split + zero_version[len(
-        min_version_split):]
+    min_version_to_check = min_version_split + zero_version[
+        len(min_version_split):]
 
     if max_version is not None:
         max_version_split = max_version.split('.')
-        max_version_to_check = max_version_split + zero_version[len(
-            max_version_split):]
+        max_version_to_check = max_version_split + zero_version[
+            len(max_version_split):]
 
         if version_cmp(version_installed,
                        max_version_to_check) > 0 or version_cmp(
@@ -419,6 +419,7 @@ def version_cmp(ver_a, ver_b):
 
 
 def _dygraph_not_support_(func):
+
     def __impl__(*args, **kwargs):
         assert not _non_static_mode(
         ), "We don't support %s in dynamic graph mode" % func.__name__
@@ -428,6 +429,7 @@ def __impl__(*args, **kwargs):
 
 
 def _dygraph_only_(func):
+
     def __impl__(*args, **kwargs):
         assert _non_static_mode(
         ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__
@@ -437,6 +439,7 @@ def __impl__(*args, **kwargs):
 
 
 def _static_only_(func):
+
     def __impl__(*args, **kwargs):
         assert not _non_static_mode(
         ), "In PaddlePaddle 2.x, we turn on dynamic graph mode by default, and '%s()' is only supported in static graph mode. So if you want to use this api, please call 'paddle.enable_static()' before this api to enter static graph mode." % func.__name__
@@ -458,6 +461,7 @@ def _set_pipeline_stage(stage):
 # TODO(zhiqiu): We should make VarBase consistent with Variable in future, for example, by inheritting
 # same base class.
 def _fake_interface_only_(func):
+
     def __impl__(*args, **kwargs):
         raise AssertionError(
             "'%s' only can be called by `paddle.Tensor` in dynamic graph mode. Suggestions:\n"
@@ -475,6 +479,7 @@ def __impl__(*args, **kwargs):
 # NOTE(chenweihang): not using `wrap_decorator` here is because `wrap_decorator` will
 # move kwargs to args, which doesn't work in this decorate case
 def deprecate_stat_dict(func):
+
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
         if 'stat_dict' in kwargs:
@@ -967,6 +972,7 @@ def mlu_places(device_ids=None):
 
 
 class NameScope(object):
+
     def __init__(self, name="", parent=None):
         self._children = dict()
         self._name = name
@@ -1154,8 +1160,9 @@ def _debug_string_(proto, throw_on_error=True):
     """
     error_fields = list()
     if not proto.IsInitialized(error_fields) and throw_on_error:
-        raise ValueError("{0} are not initialized.\nThe message is {1}:\n".
-                         format(error_fields, proto))
+        raise ValueError(
+            "{0} are not initialized.\nThe message is {1}:\n".format(
+                error_fields, proto))
     return proto.__str__()
 
 
@@ -1172,19 +1179,20 @@ def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR,
     if _in_eager_mode_:
         eager_tensor = core.eager.Tensor(
             dtype if dtype else core.VarDesc.VarType.FP32,
-            list(shape) if shape else [], name, type
-            if type else core.VarDesc.VarType.LOD_TENSOR, True
-            if persistable else False)
+            list(shape) if shape else [], name,
+            type if type else core.VarDesc.VarType.LOD_TENSOR,
+            True if persistable else False)
         eager_tensor.retain_grads()
         return eager_tensor
     else:
         return core.VarBase(dtype if dtype else core.VarDesc.VarType.FP32,
-                            list(shape) if shape else [], name, type
-                            if type else core.VarDesc.VarType.LOD_TENSOR, True
-                            if persistable else False)
+                            list(shape) if shape else [], name,
+                            type if type else core.VarDesc.VarType.LOD_TENSOR,
+                            True if persistable else False)
 
 
 class VariableMetaClass(type):
+
     @classmethod
     def __instancecheck__(cls, instance):
         t = type(instance)
@@ -1197,6 +1205,7 @@ def __instancecheck__(cls, instance):
 
 
 class ParameterMetaClass(VariableMetaClass):
+
     @classmethod
     def __instancecheck__(cls, instance):
         t = type(instance)
@@ -1394,8 +1403,9 @@ def detach(self):
             persistable=self.persistable,
             stop_gradient=True)
 
-        self.block.append_op(
-            type='share_data', inputs={'X': [self]}, outputs={'Out': [output]})
+        self.block.append_op(type='share_data',
+                             inputs={'X': [self]},
+                             outputs={'Out': [output]})
         return output
 
     @fake_interface_only
@@ -1609,8 +1619,8 @@ def _to_readable_code(self):
         dist_context = get_default_distributed_context()
         dist_tensor = dist_context.get_dist_tensor_for_program(self)
         if dist_tensor is not None:
-            var_str += ", {name} = {value}".format(
-                name="dist_attr", value=dist_tensor)
+            var_str += ", {name} = {value}".format(name="dist_attr",
+                                                   value=dist_tensor)
 
         return var_str
 
@@ -1643,8 +1653,8 @@ def to_string(self, throw_on_error, with_details=False):
                 print("=============with detail===============")
                 print(new_variable.to_string(True, True))
         """
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
+        assert isinstance(throw_on_error, bool) and isinstance(
+            with_details, bool)
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr))
         res_str = _debug_string_(proto, throw_on_error)
@@ -1955,12 +1965,13 @@ def T(self):
             persistable=False,
             stop_gradient=False)
 
-        self.block.append_op(
-            type='transpose2',
-            inputs={'X': [self]},
-            outputs={'Out': [out],
-                     'XShape': [input_shape]},
-            attrs={'axis': perm})
+        self.block.append_op(type='transpose2',
+                             inputs={'X': [self]},
+                             outputs={
+                                 'Out': [out],
+                                 'XShape': [input_shape]
+                             },
+                             attrs={'axis': perm})
         return out
 
     def clone(self):
@@ -1993,8 +2004,9 @@ def clone(self):
             persistable=self.persistable,
             stop_gradient=self.stop_gradient)
 
-        self.block.append_op(
-            type='assign', inputs={'X': [self]}, outputs={'Out': [output]})
+        self.block.append_op(type='assign',
+                             inputs={'X': [self]},
+                             outputs={'Out': [output]})
         return output
 
     def _set_error_clip(self, error_clip):
@@ -2060,8 +2072,8 @@ def _slice_indices(self, slice, length):
             start = upper if step < 0 else lower
         else:
             start = slice.start
-            start = max(start + length, lower) if start < 0 else min(start,
-                                                                     upper)
+            start = max(start +
+                        length, lower) if start < 0 else min(start, upper)
 
         # Compute stop.
         if slice.stop is None:
@@ -2135,22 +2147,24 @@ def _cloneVar(self, copy=False):
 
     def _sliceVar(self, axes, starts, ends):
         new_var = self._cloneVar()
-        self.block.append_op(
-            type="slice",
-            inputs={'Input': [self]},
-            outputs={'Out': [new_var]},
-            attrs={'axes': axes,
-                   'starts': starts,
-                   'ends': ends})
+        self.block.append_op(type="slice",
+                             inputs={'Input': [self]},
+                             outputs={'Out': [new_var]},
+                             attrs={
+                                 'axes': axes,
+                                 'starts': starts,
+                                 'ends': ends
+                             })
         return new_var
 
     def _concatVar(self, inputs, axis):
         new_var = self._cloneVar()
-        self.block.append_op(
-            type="concat",
-            inputs={'X': inputs},
-            outputs={'Out': [new_var]},
-            attrs={'axis': axis, })
+        self.block.append_op(type="concat",
+                             inputs={'X': inputs},
+                             outputs={'Out': [new_var]},
+                             attrs={
+                                 'axis': axis,
+                             })
         return new_var
 
     def _sliceAndConcatVar(self, item, axis):
@@ -2164,13 +2178,13 @@ def _sliceAndConcatVar(self, item, axis):
                 vars = []
                 if step > 0:
                     while start < stop:
-                        vars.append(
-                            self._sliceVar([axis], [start], [start + 1]))
+                        vars.append(self._sliceVar([axis], [start],
+                                                   [start + 1]))
                         start += step
                 else:
                     while start > stop:
-                        vars.append(
-                            self._sliceVar([axis], [start], [start + 1]))
+                        vars.append(self._sliceVar([axis], [start],
+                                                   [start + 1]))
                         start += step
                 return self._concatVar(vars, axis)
         elif isinstance(item, int):
@@ -2231,14 +2245,14 @@ def get_value(self, scope=None):
                         t_load = paddle.load(path+var.name+'.pdtensor')
                         var.set_value(t_load)
         """
-        # The 'framework' is a low-level module, and 'executor' 
-        # can not be imported at the begainning of this file. 
+        # The 'framework' is a low-level module, and 'executor'
+        # can not be imported at the beginning of this file.
         # Therefore, the above two modules are dynamically imported.
         from .executor import global_scope
         if scope is not None and not isinstance(scope, core._Scope):
             raise TypeError(
-                "`scope` should be None or `paddle.static.Scope` type, but received {}.".
-                format(type(scope)))
+                "`scope` should be None or `paddle.static.Scope` type, but received {}."
+                .format(type(scope)))
 
         if scope is None:
             scope = global_scope()
@@ -2293,19 +2307,19 @@ def set_value(self, value, scope=None):
         '''
 
         # The 'framework' is a low-level module, and 'executor'
-        # can not be imported at the begainning of this file. 
+        # can not be imported at the beginning of this file.
         # Therefore, the above two modules are dynamically imported.
         from .executor import global_scope
 
         if not (isinstance(value, np.ndarray) or hasattr(value, '__array__')):
             raise TypeError(
-                "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}.".
-                format(type(value)))
+                "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}."
+                .format(type(value)))
 
         if scope is not None and not isinstance(scope, core._Scope):
             raise TypeError(
-                "`scope` should be None or `paddle.static.Scope` type, but received {}.".
-                format(type(scope)))
+                "`scope` should be None or `paddle.static.Scope` type, but received {}."
+                .format(type(scope)))
 
         if scope is None:
             scope = global_scope()
@@ -2376,8 +2390,9 @@ def size(self):
             name=unique_name.generate_with_ignorable_key(self.name + "_size"),
             dtype=core.VarDesc.VarType.INT64)
 
-        self.block.append_op(
-            type='size', inputs={'Input': [self]}, outputs={'Out': [output]})
+        self.block.append_op(type='size',
+                             inputs={'Input': [self]},
+                             outputs={'Out': [output]})
         return output
 
     def _set_attr(self, name, val):
@@ -2610,12 +2625,12 @@ def __init__(self,
             op_maker = core.op_proto_and_checker_maker
 
             if op_maker.kOpRoleAttrName() not in op_attrs:
-                op_attrs[op_maker.kOpRoleAttrName(
-                )] = self.block.program._op_role
+                op_attrs[
+                    op_maker.kOpRoleAttrName()] = self.block.program._op_role
 
             role_var_name = op_maker.kOpRoleVarAttrName()
-            if len(self.block.program.
-                   _op_role_var) != 0 and role_var_name not in op_attrs:
+            if len(self.block.program._op_role_var
+                   ) != 0 and role_var_name not in op_attrs:
                 op_attrs[role_var_name] = self.block.program._op_role_var
 
             if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
@@ -2636,10 +2651,10 @@ def __init__(self,
                 op_attrs[callstack_var_name] = []
                 for frame in traceback.extract_stack():
                     op_attrs[callstack_var_name].append(
-                        '  File "{}", line {}, in {}'.format(frame[0], frame[1],
-                                                             frame[2]))
-                    op_attrs[callstack_var_name].append('    {}'.format(frame[
-                        3]))
+                        '  File "{}", line {}, in {}'.format(
+                            frame[0], frame[1], frame[2]))
+                    op_attrs[callstack_var_name].append('    {}'.format(
+                        frame[3]))
 
             self.desc.set_type(type)
             proto = OpProtoHolder.instance().get_op_proto(type)
@@ -2713,9 +2728,10 @@ def find_name(var_list, name):
                     if (m.name not in outputs) and m.dispensable:
                         continue
                     if not ((m.name in outputs) or m.dispensable):
-                        raise ValueError(("Incorrect setting for output(s) of "
-                                          "operator \"%s\", should set: [%s].")
-                                         % (type, m.name))
+                        raise ValueError(
+                            ("Incorrect setting for output(s) of "
+                             "operator \"%s\", should set: [%s].") %
+                            (type, m.name))
                 for out_proto in proto.outputs:
                     if out_proto.name not in outputs:
                         continue
@@ -2745,8 +2761,8 @@ def find_name(var_list, name):
                     raise TypeError("'attrs' should be a dict.")
                 for attr in proto.attrs:
                     attr_name = attr.name
-                    if (attr_name not in op_attrs) or (
-                            op_attrs[attr_name] is None):
+                    if (attr_name
+                            not in op_attrs) or (op_attrs[attr_name] is None):
                         continue
                     attr_val = op_attrs[attr_name]
                     self._update_desc_attr(attr_name, attr_val)
@@ -2863,7 +2879,7 @@ def _to_readable_code(self, skip_op_callstack=True):
                     attrs_str += ", "
                 continue
 
-            # it is bytes of serialized protobuf 
+            # it is bytes of serialized protobuf
             if is_compiled_with_cinn(
             ) and self.type == 'cinn_launch' and name == 'compilation_key':
                 key = self.desc.attr(name)
@@ -2877,8 +2893,9 @@ def _to_readable_code(self, skip_op_callstack=True):
             else:
                 value = self.desc.attr(name)
 
-            a = "{name} = {value}".format(
-                name=name, type=attr_type, value=value)
+            a = "{name} = {value}".format(name=name,
+                                          type=attr_type,
+                                          value=value)
 
             attrs_str += a
             if i != len(attr_names) - 1:
@@ -2888,8 +2905,8 @@ def _to_readable_code(self, skip_op_callstack=True):
         dist_context = get_default_distributed_context()
         dist_op = dist_context.get_dist_op_for_program(self)
         if dist_op is not None:
-            attrs_str += ", {name} = {value}".format(
-                name="dist_attr", value=dist_op)
+            attrs_str += ", {name} = {value}".format(name="dist_attr",
+                                                     value=dist_op)
 
         if outputs_str != "{}":
             op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
@@ -3298,8 +3315,8 @@ def to_string(self, throw_on_error, with_details=False):
         Returns:
             str: The debug string.
         """
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
+        assert isinstance(throw_on_error, bool) and isinstance(
+            with_details, bool)
         if with_details:
             re_add_indent = re.compile(r"\n(.)")
             res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
@@ -3491,47 +3508,43 @@ def _rename_var(self, name, new_name):
         d = self.desc.find_var(cpt.to_bytes(new_name))
         if var_type == "Parameter":
             if in_dygraph_mode():
-                var = EagerParamBase(
-                    d.shape(),
-                    d.dtype(),
-                    type=orig_var_type,
-                    name=new_name,
-                    stop_gradient=stop_gradient,
-                    trainable=trainable,
-                    optimize_attr=optimize_attr,
-                    regularizer=regularizer,
-                    error_clip=error_clip)
+                var = EagerParamBase(d.shape(),
+                                     d.dtype(),
+                                     type=orig_var_type,
+                                     name=new_name,
+                                     stop_gradient=stop_gradient,
+                                     trainable=trainable,
+                                     optimize_attr=optimize_attr,
+                                     regularizer=regularizer,
+                                     error_clip=error_clip)
             else:
                 if _in_legacy_dygraph():
-                    var = ParamBase(
-                        d.shape(),
-                        d.dtype(),
-                        type=orig_var_type,
-                        name=new_name,
-                        stop_gradient=stop_gradient,
-                        trainable=trainable,
-                        optimize_attr=optimize_attr,
-                        regularizer=regularizer,
-                        error_clip=error_clip)
+                    var = ParamBase(d.shape(),
+                                    d.dtype(),
+                                    type=orig_var_type,
+                                    name=new_name,
+                                    stop_gradient=stop_gradient,
+                                    trainable=trainable,
+                                    optimize_attr=optimize_attr,
+                                    regularizer=regularizer,
+                                    error_clip=error_clip)
                 else:
-                    var = Parameter(
-                        self,
-                        d.shape(),
-                        d.dtype(),
-                        type=orig_var_type,
-                        name=new_name,
-                        stop_gradient=stop_gradient,
-                        trainable=trainable,
-                        optimize_attr=optimize_attr,
-                        regularizer=regularizer,
-                        error_clip=error_clip)
+                    var = Parameter(self,
+                                    d.shape(),
+                                    d.dtype(),
+                                    type=orig_var_type,
+                                    name=new_name,
+                                    stop_gradient=stop_gradient,
+                                    trainable=trainable,
+                                    optimize_attr=optimize_attr,
+                                    regularizer=regularizer,
+                                    error_clip=error_clip)
         elif var_type == "Variable":
-            var = Variable(
-                self,
-                type=orig_var_type,
-                name=new_name,
-                error_clip=error_clip,
-                stop_gradient=stop_gradient)
+            var = Variable(self,
+                           type=orig_var_type,
+                           name=new_name,
+                           error_clip=error_clip,
+                           stop_gradient=stop_gradient)
 
         # rename the python side, _sync_with_cpp will only add
         # new vars/ops to python side.
@@ -3580,8 +3593,8 @@ def _is_inited_by(block, var):
             init_ops_len = len(init_ops)
             if init_ops_len > 1:
                 raise RuntimeError("param " + param.name +
-                                   " is inited by multiple init ops " + str(
-                                       init_ops))
+                                   " is inited by multiple init ops " +
+                                   str(init_ops))
             elif init_ops_len == 1:
                 # TODO already inited, do nothing, should log a warning
                 pass
@@ -3604,23 +3617,21 @@ def append_op(self, *args, **kwargs):
                 "Op `%s` is executed through `append_op` under the dynamic mode, "
                 "the corresponding API implementation needs to be upgraded to "
                 "using `_C_ops` method." % type, DeprecationWarning)
-            op = Operator(
-                block=self,
-                desc=None,
-                type=type,
-                inputs=None,
-                outputs=None,
-                attrs=attrs)
+            op = Operator(block=self,
+                          desc=None,
+                          type=type,
+                          inputs=None,
+                          outputs=None,
+                          attrs=attrs)
 
             # record ops in tracer rather than blocks
             #
             # TODO(minqiyang): add op stop_gradient support in static mode too.
             # currently, we only support stop_gradient in dygraph mode.
 
-            _dygraph_tracer().trace_op(type,
-                                       kwargs.get("inputs", {}),
-                                       kwargs.get("outputs", {}), attrs
-                                       if attrs else {},
+            _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}),
+                                       kwargs.get("outputs",
+                                                  {}), attrs if attrs else {},
                                        kwargs.get("stop_gradient", False),
                                        inplace_map)
         else:
@@ -3633,13 +3644,12 @@ def append_op(self, *args, **kwargs):
             inputs = kwargs.get("inputs", None)
             outputs = kwargs.get("outputs", None)
             with param_guard(inputs), param_guard(outputs):
-                op = Operator(
-                    block=self,
-                    desc=op_desc,
-                    type=kwargs.get("type", None),
-                    inputs=inputs,
-                    outputs=outputs,
-                    attrs=kwargs.get("attrs", None))
+                op = Operator(block=self,
+                              desc=op_desc,
+                              type=kwargs.get("type", None),
+                              inputs=inputs,
+                              outputs=outputs,
+                              attrs=kwargs.get("attrs", None))
 
             self.ops.append(op)
 
@@ -3706,23 +3716,25 @@ def _prepend_op(self, *args, **kwargs):
         if _non_static_mode():
             type = kwargs.get("type", None)
             attrs = kwargs.get("attrs", {})
-            op = Operator(
-                self, None, type=type, inputs=None, outputs=None, attrs=attrs)
-
-            _dygraph_tracer().trace_op(type,
-                                       kwargs.get("inputs", {}),
-                                       kwargs.get("outputs", {}), attrs
-                                       if attrs else {},
+            op = Operator(self,
+                          None,
+                          type=type,
+                          inputs=None,
+                          outputs=None,
+                          attrs=attrs)
+
+            _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}),
+                                       kwargs.get("outputs", {}),
+                                       attrs if attrs else {},
                                        kwargs.get("stop_gradient", False))
         else:
             op_desc = self.desc._prepend_op()
-            op = Operator(
-                self,
-                op_desc,
-                type=kwargs.get("type", None),
-                inputs=kwargs.get("inputs", None),
-                outputs=kwargs.get("outputs", None),
-                attrs=kwargs.get("attrs", None))
+            op = Operator(self,
+                          op_desc,
+                          type=kwargs.get("type", None),
+                          inputs=kwargs.get("inputs", None),
+                          outputs=kwargs.get("outputs", None),
+                          attrs=kwargs.get("attrs", None))
             self.ops.insert(0, op)
 
         return op
@@ -3739,19 +3751,17 @@ def _sync_with_cpp(self):
                 if var.has_stop_gradient():
                     is_stop_gradient = var.stop_gradient()
                 if var.has_is_parameter() and var.is_parameter():
-                    self.create_parameter(
-                        name=var.name(),
-                        desc=var,
-                        type=var.type(),
-                        shape=var.shape(),
-                        dtype=var.dtype(),
-                        stop_gradient=is_stop_gradient)
+                    self.create_parameter(name=var.name(),
+                                          desc=var,
+                                          type=var.type(),
+                                          shape=var.shape(),
+                                          dtype=var.dtype(),
+                                          stop_gradient=is_stop_gradient)
                 else:
-                    self.create_var(
-                        name=var.name(),
-                        desc=var,
-                        type=var.type(),
-                        stop_gradient=is_stop_gradient)
+                    self.create_var(name=var.name(),
+                                    desc=var,
+                                    type=var.type(),
+                                    stop_gradient=is_stop_gradient)
 
         # sync variables removed from c++ end
         for var in list(self.vars.keys()):
@@ -3835,30 +3845,28 @@ def _copy_param_info_from(self, other):
             assert isinstance(v, Variable)
             new_p = None
             if in_dygraph_mode():
-                new_p = EagerParamBase(
-                    shape=v.shape,
-                    dtype=v.dtype,
-                    type=v.type,
-                    lod_level=v.lod_level,
-                    stop_gradient=p.stop_gradient,
-                    trainable=p.trainable,
-                    optimize_attr=p.optimize_attr,
-                    regularizer=p.regularizer,
-                    error_clip=p.error_clip,
-                    name=v.name)
+                new_p = EagerParamBase(shape=v.shape,
+                                       dtype=v.dtype,
+                                       type=v.type,
+                                       lod_level=v.lod_level,
+                                       stop_gradient=p.stop_gradient,
+                                       trainable=p.trainable,
+                                       optimize_attr=p.optimize_attr,
+                                       regularizer=p.regularizer,
+                                       error_clip=p.error_clip,
+                                       name=v.name)
             else:
                 if _in_legacy_dygraph():
-                    new_p = ParamBase(
-                        shape=v.shape,
-                        dtype=v.dtype,
-                        type=v.type,
-                        lod_level=v.lod_level,
-                        stop_gradient=p.stop_gradient,
-                        trainable=p.trainable,
-                        optimize_attr=p.optimize_attr,
-                        regularizer=p.regularizer,
-                        error_clip=p.error_clip,
-                        name=v.name)
+                    new_p = ParamBase(shape=v.shape,
+                                      dtype=v.dtype,
+                                      type=v.type,
+                                      lod_level=v.lod_level,
+                                      stop_gradient=p.stop_gradient,
+                                      trainable=p.trainable,
+                                      optimize_attr=p.optimize_attr,
+                                      regularizer=p.regularizer,
+                                      error_clip=p.error_clip,
+                                      name=v.name)
                 else:
                     new_p = Parameter(
                         block=self,
@@ -3892,11 +3900,13 @@ def _clone_variable(self, var, force_persistable=True):
         ret_var = None
         # make STEP_SCOPES var can be safely cloned.
         if var.type == core.VarDesc.VarType.STEP_SCOPES:
-            ret_var = self.create_var(
-                name=var.name, persistable=var.persistable, type=var.type)
+            ret_var = self.create_var(name=var.name,
+                                      persistable=var.persistable,
+                                      type=var.type)
         elif var.type == core.VarDesc.VarType.RAW:
-            ret_var = self.create_var(
-                name=var.name, persistable=var.persistable, type=var.type)
+            ret_var = self.create_var(name=var.name,
+                                      persistable=var.persistable,
+                                      type=var.type)
         elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
             ret_var = self.create_var(
                 name=var.name,
@@ -3923,7 +3933,7 @@ def _clone_variable(self, var, force_persistable=True):
 # some Python Variable and all Python Operators should not be used
 # again. Because all Python Variables and all Python Operators are
 # re-constructed inside this method. The underlying VarDesc(OpDesc)
-# of some old Python Variables(all old Python Operators) may have 
+# of some old Python Variables(all old Python Operators) may have
 # been destructed.
 def _apply_pass(main_program,
                 startup_program,
@@ -4437,8 +4447,7 @@ def all_sub_graphs(self, for_test=False):
         """
 
         return [
-            IrGraph(
-                self.graph.get_sub_graph(i), for_test=for_test)
+            IrGraph(self.graph.get_sub_graph(i), for_test=for_test)
             for i in range(self.graph.sub_graph_size())
         ]
 
@@ -4697,9 +4706,9 @@ def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
 
         def _convert_to_pdf(dot_file_path):
             pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
-            exited_code = subprocess.call(
-                'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path,
-                shell=True)
+            exited_code = subprocess.call('dot -Tpdf ' + dot_file_path +
+                                          ' -o ' + pdf_save_path,
+                                          shell=True)
             if exited_code != 0:
                 print('The dot command is needed for creating pdf files.')
                 print('The {} is saved as the dot filetype.'.format(
@@ -4905,14 +4914,18 @@ def get_var_desc_attr_or_none(var_desc, attr_name, allowed_types):
                     old_var = None
 
                 kwargs = {
-                    'type': new_var_desc.type(),
-                    'name': new_var_desc.name(),
-                    'shape': get_var_desc_attr_or_none(new_var_desc, "shape", [
+                    'type':
+                    new_var_desc.type(),
+                    'name':
+                    new_var_desc.name(),
+                    'shape':
+                    get_var_desc_attr_or_none(new_var_desc, "shape", [
                         core.VarDesc.VarType.LOD_TENSOR,
                         core.VarDesc.VarType.SELECTED_ROWS,
                         core.VarDesc.VarType.LOD_TENSOR_ARRAY,
                     ]),
-                    'dtype': get_var_desc_attr_or_none(new_var_desc, "dtype", [
+                    'dtype':
+                    get_var_desc_attr_or_none(new_var_desc, "dtype", [
                         core.VarDesc.VarType.LOD_TENSOR,
                         core.VarDesc.VarType.SELECTED_ROWS,
                         core.VarDesc.VarType.LOD_TENSOR_ARRAY,
@@ -4922,14 +4935,16 @@ def get_var_desc_attr_or_none(var_desc, attr_name, allowed_types):
                         core.VarDesc.VarType.LOD_TENSOR,
                         core.VarDesc.VarType.LOD_TENSOR_ARRAY,
                     ]),
-                    'error_clip': old_var.error_clip
-                    if old_var is not None else None,
-                    'stop_gradient': old_var.stop_gradient
-                    if old_var is not None else False,
-                    'is_data': old_var.is_data
-                    if old_var is not None else False,
-                    'need_check_feed': new_var_desc.need_check_feed(),
-                    'belong_to_optimizer': old_var.belong_to_optimizer
+                    'error_clip':
+                    old_var.error_clip if old_var is not None else None,
+                    'stop_gradient':
+                    old_var.stop_gradient if old_var is not None else False,
+                    'is_data':
+                    old_var.is_data if old_var is not None else False,
+                    'need_check_feed':
+                    new_var_desc.need_check_feed(),
+                    'belong_to_optimizer':
+                    old_var.belong_to_optimizer
                     if old_var is not None else False,
                 }
 
@@ -5578,9 +5593,8 @@ def _prune_with_input(self, feeded_var_names, targets):
                 targets_idx.append([t.block.idx, t.idx])
 
         res = Program()
-        res.desc, pruned_origin_block_id_map = core.prune(self.desc,
-                                                          set(feeded_var_names),
-                                                          targets_idx)
+        res.desc, pruned_origin_block_id_map = core.prune(
+            self.desc, set(feeded_var_names), targets_idx)
         res.blocks = [
             Block(res, i) for i in six.moves.range(res.desc.num_blocks())
         ]
@@ -6194,20 +6208,21 @@ def state_dict(self, mode='all', scope=None):
                 paddle.save(prog.state_dict(), path)
         """
         # The 'framework' is a low-level module, and 'executor'
-        # can not be imported at the begainning of this file. 
+        # can not be imported at the beginning of this file.
         # Therefore, the above two modules are dynamically imported.
         from .executor import global_scope
         if scope is not None and not isinstance(scope, core._Scope):
             raise TypeError(
-                "`scope` should be None or `paddle.static.Scope'` type, but received {}.".
-                format(type(scope)))
+                "`scope` should be None or `paddle.static.Scope'` type, but received {}."
+                .format(type(scope)))
 
         if scope is None:
             scope = global_scope()
 
         if not isinstance(mode, str):
-            raise TypeError("Type of `mode` should be string, but received {}.".
-                            format(type(mode)))
+            raise TypeError(
+                "Type of `mode` should be string, but received {}.".format(
+                    type(mode)))
 
         def is_parameter(var):
             return isinstance(var, Parameter)
@@ -6234,8 +6249,8 @@ def condition(var):
                 return is_parameter(var) or is_belong_to_optimizer(var)
             else:
                 raise ValueError(
-                    "`mode` string should be 'param', 'opt' or 'all', but received {}.".
-                    format(mode))
+                    "`mode` string should be 'param', 'opt' or 'all', but received {}."
+                    .format(mode))
 
         var_list = filter(condition, self.list_vars())
 
@@ -6244,8 +6259,8 @@ def condition(var):
             var_temp = scope.find_var(var.name)
             if var_temp is None:
                 raise ValueError(
-                    "Can not find Variable '{}' in the scope. Make sure it is initialized".
-                    format(var.name))
+                    "Can not find Variable '{}' in the scope. Make sure it is initialized"
+                    .format(var.name))
             state_dict[var.name] = var_temp.get_tensor()
 
         return state_dict
@@ -6315,9 +6330,9 @@ def set_state_dict(self, state_dict, scope=None):
                     warnings.warn(
                         ("Skip loading for '{}'. ".format(name) + str(err)))
             else:
-                warnings.warn((
-                    "Skip loading for '{0}'. Because '{0}' not in the program.".
-                    format(name)))
+                warnings.warn(
+                    ("Skip loading for '{0}'. Because '{0}' not in the program."
+                     .format(name)))
 
 
 @six.add_metaclass(ParameterMetaClass)
@@ -6366,14 +6381,13 @@ def __init__(self,
                     "Each dimension of shape for Parameter must be greater than 0, but received %s"
                     % list(shape))
 
-        Variable.__init__(
-            self,
-            block,
-            persistable=True,
-            shape=shape,
-            dtype=dtype,
-            type=type,
-            **kwargs)
+        Variable.__init__(self,
+                          block,
+                          persistable=True,
+                          shape=shape,
+                          dtype=dtype,
+                          type=type,
+                          **kwargs)
         self.trainable = kwargs.get('trainable', True)
 
         self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})
@@ -6413,8 +6427,8 @@ def to_string(self, throw_on_error, with_details=False):
                 debug_str = prog.to_string(throw_on_error=True, with_details=False)
                 print(debug_str)
         """
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
+        assert isinstance(throw_on_error, bool) and isinstance(
+            with_details, bool)
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
@@ -6477,10 +6491,10 @@ def __init__(self, shape, dtype, **kwargs):
 
         name = kwargs.get('name', unique_name.generate('_param_base'))
 
-        super(ParamBase, self).__init__(dtype
-                                        if dtype else core.VarDesc.VarType.FP32,
-                                        list(shape) if shape else [], name,
-                                        core.VarDesc.VarType.LOD_TENSOR, True)
+        super(ParamBase,
+              self).__init__(dtype if dtype else core.VarDesc.VarType.FP32,
+                             list(shape) if shape else [], name,
+                             core.VarDesc.VarType.LOD_TENSOR, True)
 
         trainable = kwargs.get('trainable', True)
         self.stop_gradient = not trainable
@@ -6626,10 +6640,10 @@ def __init__(self, shape, dtype, **kwargs):
         if isinstance(shape, core.eager.Tensor):
             shape = shape.numpy()
 
-        super(EagerParamBase, self).__init__(
-            dtype if dtype else core.VarDesc.VarType.FP32,
-            list(shape)
-            if shape else [], name, core.VarDesc.VarType.LOD_TENSOR, True)
+        super(EagerParamBase,
+              self).__init__(dtype if dtype else core.VarDesc.VarType.FP32,
+                             list(shape) if shape else [], name,
+                             core.VarDesc.VarType.LOD_TENSOR, True)
         self.retain_grads()
 
         trainable = kwargs.get('trainable', True)
@@ -7158,8 +7172,8 @@ def _get_paddle_place(place):
         return core.MLUPlace(device_id)
 
     raise ValueError(
-        "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}.".
-        format(place))
+        "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}."
+        .format(place))
 
 
 def _get_paddle_place_list(places):
diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py
index 2b18d854d18bc..798c9914b79ca 100644
--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
@@ -29,6 +29,7 @@ def crepr(v):
 
 
 class Rank(object):
+
     def __init__(self, kind, name, priority):
         '''
         kind: str
@@ -86,31 +87,28 @@ def edge(self, source, target, **attrs):
     def compile(self, dot_path):
         file = open(dot_path, 'w')
         file.write(self.__str__())
-        image_path = os.path.join(
-            os.path.dirname(dot_path), dot_path[:-3] + "pdf")
+        image_path = os.path.join(os.path.dirname(dot_path),
+                                  dot_path[:-3] + "pdf")
         cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
-        subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        subprocess.Popen(cmd,
+                         stdin=subprocess.PIPE,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
         logging.warning("write block debug graph to {}".format(image_path))
         return image_path
 
     def show(self, dot_path):
         image = self.compile(dot_path)
         cmd = ["open", image]
-        subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        subprocess.Popen(cmd,
+                         stdin=subprocess.PIPE,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
 
     def _rank_repr(self):
-        ranks = sorted(
-            six.iteritems(self.rank_groups),
-            key=functools.cmp_to_key(
-                lambda a, b: a[1].priority > b[1].priority))
+        ranks = sorted(six.iteritems(self.rank_groups),
+                       key=functools.cmp_to_key(
+                           lambda a, b: a[1].priority > b[1].priority))
         repr = []
         for x in ranks:
             repr.append(str(x[1]))
@@ -123,8 +121,8 @@ def __str__(self):
         ]
 
         for attr in self.attrs:
-            reprs.append("{key}={value};".format(
-                key=attr, value=crepr(self.attrs[attr])))
+            reprs.append("{key}={value};".format(key=attr,
+                                                 value=crepr(self.attrs[attr])))
 
         reprs.append(self._rank_repr())
 
@@ -159,6 +157,7 @@ def __str__(self):
 
 
 class Edge(object):
+
     def __init__(self, source, target, **attrs):
         '''
         Link source to target.
@@ -175,9 +174,9 @@ def __str__(self):
         repr = "{source} -> {target} {extra}".format(
             source=self.source.name,
             target=self.target.name,
-            extra="" if not self.attrs else
-            "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
-                           for attr in six.iteritems(self.attrs)) + "]")
+            extra="" if not self.attrs else "[" +
+            ','.join("{}={}".format(attr[0], crepr(attr[1]))
+                     for attr in six.iteritems(self.attrs)) + "]")
         return repr
 
 
@@ -192,7 +191,8 @@ def __init__(self, title):
             title,
             layout="dot",
             concentrate="true",
-            rankdir="TB", )
+            rankdir="TB",
+        )
 
         self.op_rank = self.graph.rank_group('same', 2)
         self.param_rank = self.graph.rank_group('same', 1)
@@ -221,16 +221,15 @@ def add_param(self, name, data_type, highlight=False):
             '  </tr>',
             '</table>>',
         ])
-        return self.graph.node(
-            label,
-            prefix="param",
-            description=name,
-            shape="none",
-            style="rounded,filled,bold",
-            width="1.3",
-            color="#148b97" if not highlight else "orange",
-            fontcolor="#ffffff",
-            fontname="Arial")
+        return self.graph.node(label,
+                               prefix="param",
+                               description=name,
+                               shape="none",
+                               style="rounded,filled,bold",
+                               width="1.3",
+                               color="#148b97" if not highlight else "orange",
+                               fontcolor="#ffffff",
+                               fontname="Arial")
 
     def add_op(self, opType, **kwargs):
         highlight = False
@@ -247,26 +246,25 @@ def add_op(self, opType, **kwargs):
             fontname="Arial",
             fontcolor="#ffffff",
             width="1.3",
-            height="0.84", )
+            height="0.84",
+        )
 
     def add_arg(self, name, highlight=False):
-        return self.graph.node(
-            crepr(name),
-            prefix="arg",
-            description=name,
-            shape="box",
-            style="rounded,filled,bold",
-            fontname="Arial",
-            fontcolor="#999999",
-            color="#dddddd" if not highlight else "orange")
+        return self.graph.node(crepr(name),
+                               prefix="arg",
+                               description=name,
+                               shape="box",
+                               style="rounded,filled,bold",
+                               fontname="Arial",
+                               fontcolor="#999999",
+                               color="#dddddd" if not highlight else "orange")
 
     def add_edge(self, source, target, **kwargs):
         highlight = False
         if 'highlight' in kwargs:
             highlight = kwargs['highlight']
             del kwargs['highlight']
-        return self.graph.edge(
-            source,
-            target,
-            color="#00000" if not highlight else "orange",
-            **kwargs)
+        return self.graph.edge(source,
+                               target,
+                               color="#00000" if not highlight else "orange",
+                               **kwargs)
diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
index 6446642b153bc..b5dd3222b8ff5 100644
--- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
@@ -69,6 +69,7 @@ def _thread_checker():
 
 
 class AutoCheckpointChecker(object):
+
     def __init__(self):
         self._run_env = None
         self._platform = None
@@ -191,6 +192,7 @@ def generate_range_name():
 
 
 class ExeTrainStatus(SerializableBase):
+
     def __init__(self):
         self._epoch_no = -1  # start epoch_no
         self._hash_key = None
@@ -263,6 +265,7 @@ def __str__(self):
 
 
 class TrainEpochRange(SerializableBase):
+
     def __init__(self,
                  max_epoch_num,
                  name,
@@ -319,11 +322,10 @@ def _look_for_valid(self, cp_nos):
         epoch_no = -1
         for i in cp_nos[::-1]:
             t = TrainEpochRange(self._max_epoch_num, self.name, restored=False)
-            self._cper.load_checkpoint(
-                self._checkpoint_path, [t],
-                self._checker.trainer_id,
-                checkpoint_no=i,
-                local_cache_path=self._checker._fs_cache)
+            self._cper.load_checkpoint(self._checkpoint_path, [t],
+                                       self._checker.trainer_id,
+                                       checkpoint_no=i,
+                                       local_cache_path=self._checker._fs_cache)
             cps.append(t)
             logger.debug("look for valid:{} t:{}".format(i, t._serialize()))
             if epoch_no < 0:
@@ -343,10 +345,9 @@ def _get_last_valid_checkpoint(self):
 
         if g_acp_type == CONST_ACP_TYPE:
             # get the last one
-            self._cper.load_checkpoint(
-                self._checkpoint_path, [self],
-                self._checker.trainer_id,
-                local_cache_path=self._checker._fs_cache)
+            self._cper.load_checkpoint(self._checkpoint_path, [self],
+                                       self._checker.trainer_id,
+                                       local_cache_path=self._checker._fs_cache)
             self._restored_from = CONST_CHECKPOINT
             self._checkpoint_epoch_no = self._epoch_no
 
@@ -359,11 +360,10 @@ def _get_last_valid_checkpoint(self):
                 self._restored_from = CONST_MEMORYINIT
                 return
 
-            self._cper.load_checkpoint(
-                self._checkpoint_path, [self],
-                self._checker.trainer_id,
-                checkpoint_no=i,
-                local_cache_path=self._checker._fs_cache)
+            self._cper.load_checkpoint(self._checkpoint_path, [self],
+                                       self._checker.trainer_id,
+                                       checkpoint_no=i,
+                                       local_cache_path=self._checker._fs_cache)
 
             self._restored_from = CONST_CHECKPOINT
             self._checkpoint_epoch_no = self._epoch_no
@@ -497,9 +497,8 @@ def _save_checkpoint(self):
             logger.debug("save executor checkpoint:{}".format(t._serialize()))
 
         if len(self._exe_status) > 0:
-            self._cper.save_checkpoint(
-                self._checkpoint_path, [self],
-                local_cache_path=self._checker._fs_cache)
+            self._cper.save_checkpoint(self._checkpoint_path, [self],
+                                       local_cache_path=self._checker._fs_cache)
             logger.info("save train_epoch_range checkpoint:{}".format(
                 self._serialize()))
 
@@ -658,11 +657,10 @@ def _auto_checkpoint(exe, prog):
         if t._restored_from is None:
             a = CheckpointSaver(g_train_epoch_range._hdfs)
             m = PaddleModel(exe, program)
-            a.load_checkpoint(
-                g_checker.get_exe_checkpoint_path(key), [m],
-                trainer_id=g_checker.trainer_id,
-                checkpoint_no=t._checkpoint_no,
-                local_cache_path=g_checker._fs_cache)
+            a.load_checkpoint(g_checker.get_exe_checkpoint_path(key), [m],
+                              trainer_id=g_checker.trainer_id,
+                              checkpoint_no=t._checkpoint_no,
+                              local_cache_path=g_checker._fs_cache)
             t._restored_from = CONST_CHECKPOINT
             logger.info("load executor checkpoint {}".format(t))
         t._exe = exe
diff --git a/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py b/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
index 08400ab13a25d..c8aeb50f157c0 100644
--- a/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
+++ b/python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
@@ -16,6 +16,7 @@
 
 
 class SerializableBase(object):
+
     def serialize(self, path):
         raise NotImplementedError
 
@@ -24,6 +25,7 @@ def deserialize(self, path):
 
 
 class PaddleModel(SerializableBase):
+
     def __init__(self, exe, program):
         self._exe = exe
         self._origin_program = program
@@ -35,22 +37,21 @@ def __init__(self, exe, program):
 
     def serialize(self, path):
         from ...io import save_persistables
-        save_persistables(
-            executor=self._exe,
-            dirname=path,
-            main_program=self._program,
-            filename=self._file_name)
+        save_persistables(executor=self._exe,
+                          dirname=path,
+                          main_program=self._program,
+                          filename=self._file_name)
 
     def deserialize(self, path):
         from ...io import load_persistables
-        load_persistables(
-            executor=self._exe,
-            dirname=path,
-            main_program=self._program,
-            filename=self._file_name)
+        load_persistables(executor=self._exe,
+                          dirname=path,
+                          main_program=self._program,
+                          filename=self._file_name)
 
 
 class CheckpointSaver(object):
+
     def __init__(self, fs):
         self._fs = fs
         self._checkpoint_prefix = "__paddle_checkpoint__"
@@ -84,8 +85,9 @@ def save_checkpoint(self,
 
         cache_path = None
         if self._fs.need_upload_download():
-            cache_path = "{}/{}.{}.saved_cache".format(
-                local_cache_path, self._checkpoint_prefix, max_no)
+            cache_path = "{}/{}.{}.saved_cache".format(local_cache_path,
+                                                       self._checkpoint_prefix,
+                                                       max_no)
 
             if trainer_id is not None:
                 cache_path = "{}.{}".format(cache_path, trainer_id)
@@ -137,8 +139,9 @@ def load_checkpoint(self,
         from paddle.distributed.fleet.utils.fs import LocalFS
         local_fs = LocalFS()
         if self._fs.need_upload_download():
-            cache_path = "{}/{}.{}.load_cache".format(
-                local_cache_path, self._checkpoint_prefix, checkpoint_no)
+            cache_path = "{}/{}.{}.load_cache".format(local_cache_path,
+                                                      self._checkpoint_prefix,
+                                                      checkpoint_no)
 
             if trainer_id is not None:
                 cache_path = "{}.{}".format(cache_path, trainer_id)
diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py
index 7ff80039ae2e4..0ef851f52e7b8 100644
--- a/python/paddle/fluid/incubate/data_generator/__init__.py
+++ b/python/paddle/fluid/incubate/data_generator/__init__.py
@@ -214,6 +214,7 @@ def local_iter():
 # add more generalized DataGenerator that can adapt user-defined slot
 # for example, [(name, float_list), (name, str_list), (name, int_list)]
 class MultiSlotStringDataGenerator(DataGenerator):
+
     def _gen_str(self, line):
         '''
         Further processing the output of the process() function rewritten by
@@ -251,6 +252,7 @@ def _gen_str(self, line):
 
 
 class MultiSlotDataGenerator(DataGenerator):
+
     def _gen_str(self, line):
         '''
         Further processing the output of the process() function rewritten by
@@ -302,8 +304,8 @@ def _gen_str(self, line):
                 for elem in elements:
                     if isinstance(elem, float):
                         self._proto_info[-1] = (name, "float")
-                    elif not isinstance(elem, int) and not isinstance(elem,
-                                                                      long):
+                    elif not isinstance(elem, int) and not isinstance(
+                            elem, long):
                         raise ValueError(
                             "the type of element%s must be in int or float" %
                             type(elem))
@@ -311,7 +313,8 @@ def _gen_str(self, line):
         else:
             if len(line) != len(self._proto_info):
                 raise ValueError(
-                    "the complete field set of two given line are inconsistent.")
+                    "the complete field set of two given line are inconsistent."
+                )
             for index, item in enumerate(line):
                 name, elements = item
                 if not isinstance(name, str):
@@ -334,8 +337,8 @@ def _gen_str(self, line):
                     if self._proto_info[index][1] != "float":
                         if isinstance(elem, float):
                             self._proto_info[index] = (name, "float")
-                        elif not isinstance(elem, int) and not isinstance(elem,
-                                                                          long):
+                        elif not isinstance(elem, int) and not isinstance(
+                                elem, long):
                             raise ValueError(
                                 "the type of element%s must be in int or float"
                                 % type(elem))
diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py
index 90387337faa2a..f97f46a7c49b3 100644
--- a/python/paddle/fluid/incubate/fleet/base/role_maker.py
+++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py
@@ -630,8 +630,8 @@ def generate_role(self):
                 raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
             self._is_barrier_all = 1
             if "PADDLE_IS_BARRIER_ALL_ROLE" in os.environ:
-                self._is_barrier_all = int(os.environ[
-                    "PADDLE_IS_BARRIER_ALL_ROLE"])
+                self._is_barrier_all = int(
+                    os.environ["PADDLE_IS_BARRIER_ALL_ROLE"])
             if training_role == "TRAINER":
                 role = Role.WORKER
                 current_id = int(os.environ["PADDLE_TRAINER_ID"])
@@ -642,9 +642,9 @@ def generate_role(self):
                         "all": len(worker_endpoints) + len(eplist)
                     }
                     # child process for http server
-                    self._http_server = Process(
-                        target=self.__start_kv_server,
-                        args=(self._http_server_d, size_d))
+                    self._http_server = Process(target=self.__start_kv_server,
+                                                args=(self._http_server_d,
+                                                      size_d))
                     self._http_server.daemon = True
                     # set running status to True
                     self._http_server_d["running"] = True
diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py
index 6466ce4b42e6e..da4fe609ca3e6 100644
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -39,16 +39,19 @@
 
 
 class LambConfig(object):
+
     def __init__(self):
         pass
 
 
 class DistFCConfig(object):
+
     def __init__(self):
         pass
 
 
 class Collective(Fleet):
+
     def __init__(self):
         super(Collective, self).__init__(Mode.COLLECTIVE)
         self._local_ip = 0
@@ -186,11 +189,10 @@ def load_checkpoint(self,
 
         m = PaddleModel(executor, main_program)
         c = CheckpointSaver(fs)
-        return c.load_checkpoint(
-            path, [m, train_status],
-            trainer_id=trainer_id,
-            ignore_empty=ignore_empty,
-            local_cache_path=local_cache_path)
+        return c.load_checkpoint(path, [m, train_status],
+                                 trainer_id=trainer_id,
+                                 ignore_empty=ignore_empty,
+                                 local_cache_path=local_cache_path)
 
 
 fleet = Collective()
@@ -294,27 +296,24 @@ def _check_collective_mode(self, main_program, optimizer, strategy):
         if strategy.use_local_sgd:
             strategy.mode = "collective"
             strategy.collective_mode = "local_sgd"
-            self._check_condition(
-                "use_local_sgd",
-                use_dgc=main_program._enable_dgc,
-                use_dist_fc=strategy.use_dist_fc,
-                use_lamb=main_program._use_lamb)
+            self._check_condition("use_local_sgd",
+                                  use_dgc=main_program._enable_dgc,
+                                  use_dist_fc=strategy.use_dist_fc,
+                                  use_lamb=main_program._use_lamb)
 
         if strategy.use_dist_fc:
-            self._check_condition(
-                "use_dist_fc",
-                use_dgc=main_program._enable_dgc,
-                use_local_sgd=strategy.use_local_sgd,
-                use_lamb=main_program._use_lamb)
+            self._check_condition("use_dist_fc",
+                                  use_dgc=main_program._enable_dgc,
+                                  use_local_sgd=strategy.use_local_sgd,
+                                  use_lamb=main_program._use_lamb)
             assert strategy.dist_fc_config is not None, "DistributedStrategy.dist_fc_config should be set"
 
         if strategy._ut4grad_allreduce:
             strategy.mode = "collective"
             strategy.collective_mode = "grad_allreduce"
-            self._check_condition(
-                "_ut4grad_allreduce",
-                use_dgc=main_program._enable_dgc,
-                use_lamb=main_program._use_lamb)
+            self._check_condition("_ut4grad_allreduce",
+                                  use_dgc=main_program._enable_dgc,
+                                  use_lamb=main_program._use_lamb)
 
         if self._strategy.collective_mode=="local_sgd" \
                 or self._strategy.collective_mode == "grad_allreduce":
@@ -346,12 +345,11 @@ def _transpile(self, startup_program, main_program):
         config.hierarchical_allreduce_inter_nranks = self._strategy.hierarchical_allreduce_inter_nranks
 
         t = dist_transpiler.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id=trainer_id,
-            trainers=worker_endpoints_env,
-            startup_program=startup_program,
-            program=main_program,
-            current_endpoint=current_endpoint)
+        t.transpile(trainer_id=trainer_id,
+                    trainers=worker_endpoints_env,
+                    startup_program=startup_program,
+                    program=main_program,
+                    current_endpoint=current_endpoint)
 
     def _get_node_ips_from_endpoints(self, endpoints):
         ss = set()
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
index e556a98ed7504..1354c317b0a85 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -109,13 +109,15 @@ def sync_strategy_envs():
             return kwargs
 
         def geo_strategy_envs():
+
             def get_sparse_attrs():
                 opt_init_map = {}
                 opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                 opt_init_map["fill_constant"] = ["value"]
                 opt_init_map["uniform_random"] = ["seed", "min", "max"]
-                opt_init_map[
-                    "truncated_gaussian_random"] = ["seed", "mean", "std"]
+                opt_init_map["truncated_gaussian_random"] = [
+                    "seed", "mean", "std"
+                ]
 
                 dist_varnames = get_sparse_tablenames(self._origin_main_program,
                                                       True)
@@ -129,8 +131,8 @@ def get_sparse_attrs():
 
                 init_attrs = []
                 for value_name in sparse_varnames:
-                    value_var = self._origin_main_program.global_block().vars[
-                        value_name]
+                    value_var = self._origin_main_program.global_block(
+                    ).vars[value_name]
                     value_attr = [
                         value_name,
                         ",".join([str(dim) for dim in value_var.shape])
@@ -238,14 +240,13 @@ def _init_transpiler_server(self, model_dir=None):
                                                    distribtued_varnames),
                     self.main_program.list_vars()))
 
-            fluid.io.load_vars(
-                self._executor,
-                main_program=self.main_program,
-                dirname=model_dir,
-                vars=remaining_vars)
+            fluid.io.load_vars(self._executor,
+                               main_program=self.main_program,
+                               dirname=model_dir,
+                               vars=remaining_vars)
 
-            self._load_sparse_params(
-                dirname=model_dir, varnames=sparse_varnames)
+            self._load_sparse_params(dirname=model_dir,
+                                     varnames=sparse_varnames)
 
             # todo(tangwei12) load distributed vars
             # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames)
@@ -324,7 +325,8 @@ def distributed_optimizer(self, optimizer, strategy=None):
             raise ValueError("optimizer must be an instance of Optimizer")
         if not self._is_initialized:
             raise ValueError(
-                "fleet.init(role) to initialize before optimizer.minimize(loss)")
+                "fleet.init(role) to initialize before optimizer.minimize(loss)"
+            )
 
         if not strategy:
             _strategy = StrategyFactory.create_async_strategy()
@@ -439,8 +441,9 @@ def _get_optimizer_status(self, op, param_name):
         reshaped_val_map["adamax"] = ["moment_0", "inf_norm_0"]
         reshaped_val_map["momentum"] = ["velocity_0"]
         reshaped_val_map["lars_momentum"] = ["velocity_0"]
-        reshaped_val_map[
-            "rmsprop"] = ["momentum_0", "mean_square_0", "mean_grad_0"]
+        reshaped_val_map["rmsprop"] = [
+            "momentum_0", "mean_square_0", "mean_grad_0"
+        ]
         reshaped_val_map["decayed_adagrad"] = ["moment_0"]
         reshaped_val_map["ftrl"] = ["squared_0", "linear_0"]
 
@@ -450,8 +453,8 @@ def _get_optimizer_status(self, op, param_name):
 
         if op not in supported_opts:
             raise ValueError(
-                "fleet can not support optimizer: {}, only this can be supported: {}".
-                format(op, supported_opts))
+                "fleet can not support optimizer: {}, only this can be supported: {}"
+                .format(op, supported_opts))
 
         reshaped_names = [
             param_name + "_" + val for val in reshaped_val_map[op]
@@ -492,19 +495,23 @@ def _save_dense_params(self, executor, dirname, context, main_program):
 
             for var_name in [varname] + reshaped_varnames + origin_varnames:
                 var = self._origin_main_program.global_block().vars[var_name]
-                block.append_op(
-                    type='recv_save',
-                    attrs={
-                        "trainer_id": self._role_maker.worker_index(),
-                        "shape": var.shape,
-                        "slice_shapes":
-                        [",".join([str(i) for i in var.shape])],
-                        "slice_varnames": [var.name],
-                        "remote_varnames": [var.name],
-                        "is_sparse": False,
-                        "endpoints": var_ctx.split_endpoints(),
-                        "file_path": os.path.join(dirname, var.name)
-                    })
+                block.append_op(type='recv_save',
+                                attrs={
+                                    "trainer_id":
+                                    self._role_maker.worker_index(),
+                                    "shape":
+                                    var.shape,
+                                    "slice_shapes":
+                                    [",".join([str(i) for i in var.shape])],
+                                    "slice_varnames": [var.name],
+                                    "remote_varnames": [var.name],
+                                    "is_sparse":
+                                    False,
+                                    "endpoints":
+                                    var_ctx.split_endpoints(),
+                                    "file_path":
+                                    os.path.join(dirname, var.name)
+                                })
 
         executor.run(prog)
         return local_vars
@@ -532,30 +539,37 @@ def _save_sparse_params(self, executor, dirname, context, main_program):
             for section in var_ctx.sections():
                 slice_shapes.append(str(section) + dims1)
 
-            block.append_op(
-                type='recv_save',
-                attrs={
-                    "trainer_id": self._role_maker.worker_index(),
-                    "shape": var.shape,
-                    "slice_shapes": slice_shapes,
-                    "slice_varnames": var_ctx.split_varnames(),
-                    "remote_varnames": var_ctx.split_varnames(),
-                    "is_sparse": True,
-                    "endpoints": var_ctx.split_endpoints(),
-                    "pserver_num":
-                    len(self._role_maker.get_pserver_endpoints()),
-                    "file_path": os.path.join(dirname, var.name)
-                })
+            block.append_op(type='recv_save',
+                            attrs={
+                                "trainer_id":
+                                self._role_maker.worker_index(),
+                                "shape":
+                                var.shape,
+                                "slice_shapes":
+                                slice_shapes,
+                                "slice_varnames":
+                                var_ctx.split_varnames(),
+                                "remote_varnames":
+                                var_ctx.split_varnames(),
+                                "is_sparse":
+                                True,
+                                "endpoints":
+                                var_ctx.split_endpoints(),
+                                "pserver_num":
+                                len(self._role_maker.get_pserver_endpoints()),
+                                "file_path":
+                                os.path.join(dirname, var.name)
+                            })
 
             for reshaped_varname in reshaped_varnames:
-                var = self._origin_main_program.global_block().vars[
-                    reshaped_varname]
+                var = self._origin_main_program.global_block(
+                ).vars[reshaped_varname]
 
                 slice_varnames = []
                 remote_varnames = []
                 for i in range(len(var_ctx.split_varnames())):
-                    slice_varnames.append("{}.block{}".format(reshaped_varname,
-                                                              i))
+                    slice_varnames.append("{}.block{}".format(
+                        reshaped_varname, i))
                     remote_varnames.append(reshaped_varname)
 
                 block.append_op(
@@ -574,22 +588,26 @@ def _save_sparse_params(self, executor, dirname, context, main_program):
                     })
 
             for origin_varname in origin_varnames:
-                var = self._origin_main_program.global_block().vars[
-                    origin_varname]
-
-                block.append_op(
-                    type='recv_save',
-                    attrs={
-                        "trainer_id": self._role_maker.worker_index(),
-                        "shape": var.shape,
-                        "slice_shapes":
-                        [",".join([str(i) for i in var.shape])],
-                        "slice_varnames": [origin_varname],
-                        "remote_varnames": [origin_varname],
-                        "is_sparse": False,
-                        "endpoints": var_ctx.split_endpoints()[:1],
-                        "file_path": os.path.join(dirname, var.name)
-                    })
+                var = self._origin_main_program.global_block(
+                ).vars[origin_varname]
+
+                block.append_op(type='recv_save',
+                                attrs={
+                                    "trainer_id":
+                                    self._role_maker.worker_index(),
+                                    "shape":
+                                    var.shape,
+                                    "slice_shapes":
+                                    [",".join([str(i) for i in var.shape])],
+                                    "slice_varnames": [origin_varname],
+                                    "remote_varnames": [origin_varname],
+                                    "is_sparse":
+                                    False,
+                                    "endpoints":
+                                    var_ctx.split_endpoints()[:1],
+                                    "file_path":
+                                    os.path.join(dirname, var.name)
+                                })
         executor.run(prog)
         return context.keys()
 
@@ -599,16 +617,15 @@ def _save_distributed_params(self, executor, dirname, context,
         block = prog.global_block()
 
         for name, var_ctx in context.items():
-            block.append_op(
-                type='checkpoint_notify',
-                attrs={
-                    "varname": name,
-                    "is_slice": True,
-                    "slice_varnames": var_ctx.split_varnames(),
-                    "remote_varnames": var_ctx.split_varnames(),
-                    "endpoints": var_ctx.split_endpoints(),
-                    "dirname": dirname
-                })
+            block.append_op(type='checkpoint_notify',
+                            attrs={
+                                "varname": name,
+                                "is_slice": True,
+                                "slice_varnames": var_ctx.split_varnames(),
+                                "remote_varnames": var_ctx.split_varnames(),
+                                "endpoints": var_ctx.split_endpoints(),
+                                "dirname": dirname
+                            })
 
         executor.run(prog)
         return context.keys()
@@ -626,8 +643,9 @@ def _save_distributed_persistables(self, executor, dirname, main_program):
         recv_dense_varnames = self._save_dense_params(executor, dirname,
                                                       dense_ctx, main_program)
 
-        recv_sparse_varnames = self._save_sparse_params(
-            executor, dirname, sparse_ctx, main_program)
+        recv_sparse_varnames = self._save_sparse_params(executor, dirname,
+                                                        sparse_ctx,
+                                                        main_program)
 
         recv_distributed_varnames = self._save_distributed_params(
             executor, dirname, distributed_ctx, main_program)
@@ -636,15 +654,13 @@ def _save_distributed_persistables(self, executor, dirname, main_program):
             recv_sparse_varnames) + list(recv_distributed_varnames)
 
         remaining_vars = list(
-            filter(
-                FleetTranspiler.__exclude_vars(saved_varnames),
-                main_program.list_vars()))
+            filter(FleetTranspiler.__exclude_vars(saved_varnames),
+                   main_program.list_vars()))
 
-        fluid.io.save_vars(
-            executor,
-            main_program=main_program,
-            dirname=dirname,
-            vars=remaining_vars)
+        fluid.io.save_vars(executor,
+                           main_program=main_program,
+                           dirname=dirname,
+                           vars=remaining_vars)
 
     def save_persistables(self, executor, dirname, main_program=None, **kwargs):
         """
@@ -690,6 +706,7 @@ def save_persistables(self, executor, dirname, main_program=None, **kwargs):
 
     @staticmethod
     def __exclude_vars(exclude_var_names=[]):
+
         def is_valid(var):
             if var.name in exclude_var_names:
                 return False
@@ -738,10 +755,11 @@ def __init__(self, optimizer, strategy, mode=PSMode.TRANSPILER):
         if self._mode == PSMode.PSLIB:
             self._optimizer_name = "Distributed%s" % optimizer.type.capitalize()
             if optimizer.type != "adam":
-                print("Currently, distributed optimizer only support Adam"
-                      "Will config built-in adam for you."
-                      "We will support more functions in DistributedOptimizer",
-                      sys.stderr)
+                print(
+                    "Currently, distributed optimizer only support Adam. "
+                    "Will config built-in adam for you. "
+                    "We will support more functions in DistributedOptimizer",
+                    file=sys.stderr)  # notice goes to stderr, not stdout
                 self._optimizer_name = "DistributedAdam"
 
             self._optimizer = globals()[self._optimizer_name](optimizer)
@@ -779,8 +797,8 @@ def _build_trainer_programs(self, compiled_config):
             # for startup program
             _startup = worker.fake_init_ops_pass(_startup, compiled_config)
             _startup = worker.init_from_server_pass(_startup, compiled_config)
-            _startup = worker.delet_extra_optimizes_pass(_startup,
-                                                         compiled_config)
+            _startup = worker.delet_extra_optimizes_pass(
+                _startup, compiled_config)
         else:
             _main = worker.append_send_ops_pass(_main, compiled_config)
             _startup = _startup
@@ -803,11 +821,11 @@ def _build_pserver_programs(self, compiled_config):
                                                       compiled_config, True)
 
             if not compiled_config.is_sync_mode():
-                _main = server.delete_unused_in_main_pass(_main,
-                                                          compiled_config)
+                _main = server.delete_unused_in_main_pass(
+                    _main, compiled_config)
 
-            _startup = server.delete_unused_in_startup_pass(_startup, _main,
-                                                            compiled_config)
+            _startup = server.delete_unused_in_startup_pass(
+                _startup, _main, compiled_config)
         else:
             _main = server.add_listen_and_serv_pass(_main, compiled_config)
             _main = server.add_rpc_global_flags_pass(_main, compiled_config)
@@ -818,8 +836,8 @@ def _build_pserver_programs(self, compiled_config):
                 _startup, _main, compiled_config)
             _startup = server.large_scale_sparse_pass(_startup, _main,
                                                       compiled_config, True)
-            _startup = server.delete_unused_in_startup_pass(_startup, _main,
-                                                            compiled_config)
+            _startup = server.delete_unused_in_startup_pass(
+                _startup, _main, compiled_config)
 
         return _main, _startup
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
index 2a9d26daaed90..8e40fa81ebbc4 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
@@ -24,6 +24,7 @@
 
 
 class TrainerRuntimeConfig(object):
+
     def __init__(self):
         self.mode = None
         num_threads = os.getenv("CPU_NUM", "1")
@@ -46,9 +47,9 @@ def __init__(self):
         self.runtime_configs['communicator_is_sgd_optimizer'] = os.getenv(
             "FLAGS_communicator_is_sgd_optimizer", "1")
 
-        # not used 
-        self.runtime_configs['rpc_deadline'] = os.getenv("FLAGS_rpc_deadline",
-                                                         "180000")
+        # not used
+        self.runtime_configs['rpc_deadline'] = os.getenv(
+            "FLAGS_rpc_deadline", "180000")
         self.runtime_configs['rpc_retry_times'] = os.getenv(
             "FLAGS_rpc_retry_times", "3")
 
@@ -84,18 +85,18 @@ def get_communicator_flags(self):
                 print('WARNING: In {} mode, communicator_max_merge_var_num '
                       'must be equal to CPU_NUM. But received, '
                       'communicator_max_merge_var_num = {}, CPU_NUM = '
-                      '{}. communicator_max_merge_var_num will be fored to {}.'
-                      .format(mode_str, max_merge_var_num, num_threads,
-                              num_threads))
+                      '{}. communicator_max_merge_var_num will be fored to {}.'.
+                      format(mode_str, max_merge_var_num, num_threads,
+                             num_threads))
                 self.runtime_configs[
                     'communicator_max_merge_var_num'] = num_threads
             if send_queue_size != num_threads:
                 print('WARNING: In {} mode, communicator_send_queue_size '
                       'must be equal to CPU_NUM. But received, '
                       'communicator_send_queue_size = {}, CPU_NUM = '
-                      '{}. communicator_send_queue_size will be fored to {}.'
-                      .format(mode_str, send_queue_size, num_threads,
-                              num_threads))
+                      '{}. communicator_send_queue_size will be fored to {}.'.
+                      format(mode_str, send_queue_size, num_threads,
+                             num_threads))
                 self.runtime_configs[
                     'communicator_send_queue_size'] = num_threads
 
@@ -127,6 +128,7 @@ def __repr__(self):
 
 
 class PSLibRuntimeConfig(object):
+
     def __init__(self):
         self.runtime_configs = {}
 
@@ -135,6 +137,7 @@ def get_runtime_configs(self):
 
 
 class DistributedStrategy(object):
+
     def __init__(self):
         self._program_config = DistributeTranspilerConfig()
         self._trainer_runtime_config = TrainerRuntimeConfig()
@@ -295,6 +298,7 @@ def check_build_strategy(self):
 
 
 class SyncStrategy(DistributedStrategy):
+
     def __init__(self):
         super(SyncStrategy, self).__init__()
         self.check_program_config()
@@ -323,6 +327,7 @@ def check_build_strategy(self):
 
 
 class AsyncStrategy(DistributedStrategy):
+
     def __init__(self):
         super(AsyncStrategy, self).__init__()
         self.check_program_config()
@@ -349,6 +354,7 @@ def check_build_strategy(self):
 
 
 class HalfAsyncStrategy(DistributedStrategy):
+
     def __init__(self):
         super(HalfAsyncStrategy, self).__init__()
         self.check_program_config()
@@ -376,6 +382,7 @@ def check_build_strategy(self):
 
 
 class GeoStrategy(DistributedStrategy):
+
     def __init__(self, update_frequency=100):
         super(GeoStrategy, self).__init__()
         self._program_config.geo_sgd_need_push_nums = update_frequency
@@ -410,6 +417,7 @@ def check_build_strategy(self):
 
 
 class StrategyFactory(object):
+
     def __init_(self):
         pass
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py
index ebf9395361ce1..0018b73e26479 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py
@@ -36,8 +36,8 @@ def split_heter_worker_ops_pass(program, config, stage_id, device):
     3. create heter worker program, add listen&serv op
     """
     default_deveice = "cpu"
-    program, heter_ops, _, program_block_ops = find_heter_ops(program,
-                                                              default_deveice)
+    program, heter_ops, _, program_block_ops = find_heter_ops(
+        program, default_deveice)
     if len(heter_ops) == 0:
         warnings.warn(
             "Currently running in Heter Parameter Server mode, but no OP running on heterogeneous devices, Please check your code."
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
index 295f02e73cf2d..38a4a14b02f38 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
@@ -83,6 +83,7 @@ def _get_optimizer_input_shape(op_type, varkey, orig_shape, param_shape):
 
 
 def _append_pserver_non_opt_ops(optimize_block, opt_op, origin_program, config):
+
     def _get_pserver_grad_param_var(var, var_dict):
         """
         Return pserver side grad/param variable, return None
@@ -122,7 +123,8 @@ def _get_pserver_grad_param_var(var, var_dict):
             # for ops like clipping and weight decay, get the split var(xxx.block0)
             # for inputs / outputs
             grad_block = _get_pserver_grad_param_var(
-                var, program.global_block().vars)
+                var,
+                program.global_block().vars)
             if grad_block:
                 varlist[i] = grad_block
             elif var.name not in program.global_block().vars:
@@ -140,7 +142,8 @@ def _get_pserver_grad_param_var(var, var_dict):
         for i in range(len(varlist)):
             var = varlist[i]
             grad_block = _get_pserver_grad_param_var(
-                var, program.global_block().vars)
+                var,
+                program.global_block().vars)
             if grad_block:
                 varlist[i] = grad_block
             elif var.name not in program.global_block().vars:
@@ -150,11 +153,10 @@ def _get_pserver_grad_param_var(var, var_dict):
                 varlist[i] = program.global_block().vars[var.name]
         outputs[key] = varlist
 
-    return optimize_block.append_op(
-        type=opt_op.type,
-        inputs=inputs,
-        outputs=outputs,
-        attrs=opt_op.all_attrs())
+    return optimize_block.append_op(type=opt_op.type,
+                                    inputs=inputs,
+                                    outputs=outputs,
+                                    attrs=opt_op.all_attrs())
 
 
 def _append_pserver_ops(optimize_block, opt_op, endpoint, grad_to_block_id,
@@ -221,11 +223,10 @@ def _get_param_block(opt_op):
 
             if not param_block:
                 return
-            tmpvar = pserver_block.create_var(
-                name=param_block.name,
-                persistable=True,
-                dtype=param_block.dtype,
-                shape=param_block.shape)
+            tmpvar = pserver_block.create_var(name=param_block.name,
+                                              persistable=True,
+                                              dtype=param_block.dtype,
+                                              shape=param_block.shape)
             new_inputs[key] = tmpvar
 
         elif key == "LearningRate":
@@ -255,22 +256,20 @@ def _get_param_block(opt_op):
         # update accumulator variable shape
         new_shape = _get_optimizer_input_shape(opt_op.type, key, var.shape,
                                                param_var.shape)
-        tmpvar = pserver_block.create_var(
-            name=var.name,
-            persistable=var.persistable,
-            dtype=var.dtype,
-            shape=new_shape)
+        tmpvar = pserver_block.create_var(name=var.name,
+                                          persistable=var.persistable,
+                                          dtype=var.dtype,
+                                          shape=new_shape)
         new_inputs[key] = tmpvar
 
     # change output's ParamOut variable
     outputs = _get_output_map_from_op(origin_program.global_block().vars,
                                       opt_op)
     outputs["ParamOut"] = new_inputs["Param"]
-    optimize_block.append_op(
-        type=opt_op.type,
-        inputs=new_inputs,
-        outputs=outputs,
-        attrs=opt_op.all_attrs())
+    optimize_block.append_op(type=opt_op.type,
+                             inputs=new_inputs,
+                             outputs=outputs,
+                             attrs=opt_op.all_attrs())
 
     # record sparse grad to param name
     if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS:
@@ -332,8 +331,10 @@ def add_listen_and_serv_pass(program, config):
     }
 
     # step5 append the listen_and_serv op
-    program.global_block().append_op(
-        type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs)
+    program.global_block().append_op(type="listen_and_serv",
+                                     inputs={'X': []},
+                                     outputs={},
+                                     attrs=attrs)
 
     return program
 
@@ -358,16 +359,16 @@ def add_rpc_global_flags_pass(program, config):
 
 
 def _clone_var(block, var, persistable=True):
-    return block.create_var(
-        name=var.name,
-        shape=var.shape,
-        dtype=var.dtype,
-        type=var.type,
-        lod_level=var.lod_level,
-        persistable=persistable)
+    return block.create_var(name=var.name,
+                            shape=var.shape,
+                            dtype=var.dtype,
+                            type=var.type,
+                            lod_level=var.lod_level,
+                            persistable=persistable)
 
 
 def add_optimizer_pass(program, config):
+
     def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
                                        endpoint, grad_to_block_id):
         trainers = config.get_trainers()
@@ -395,12 +396,11 @@ def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
         else:
             merged_var_name = orig_varname
 
-        merged_var = pserver_block.create_var(
-            name=grad_block.name,
-            persistable=True,
-            type=grad_block.type,
-            dtype=grad_block.dtype,
-            shape=grad_block.shape)
+        merged_var = pserver_block.create_var(name=grad_block.name,
+                                              persistable=True,
+                                              type=grad_block.type,
+                                              dtype=grad_block.dtype,
+                                              shape=grad_block.shape)
 
         grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
         if config.is_sync_mode() and trainers > 1:
@@ -416,16 +416,14 @@ def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
                     shape=grad_block.shape)
                 vars2merge.append(per_trainer_var)
 
-            optimize_block.append_op(
-                type="sum",
-                inputs={"X": vars2merge},
-                outputs={"Out": merged_var},
-                attrs={"use_mkldnn": False})
-            optimize_block.append_op(
-                type="scale",
-                inputs={"X": merged_var},
-                outputs={"Out": merged_var},
-                attrs={"scale": 1.0 / float(trainers)})
+            optimize_block.append_op(type="sum",
+                                     inputs={"X": vars2merge},
+                                     outputs={"Out": merged_var},
+                                     attrs={"use_mkldnn": False})
+            optimize_block.append_op(type="scale",
+                                     inputs={"X": merged_var},
+                                     outputs={"Out": merged_var},
+                                     attrs={"scale": 1.0 / float(trainers)})
         return merged_var
 
     origin_program = config.get_origin_main_program()
@@ -660,24 +658,25 @@ def add_fuse_large_scale_op(block, global_block, table_name, value_names,
             grad = main_program.global_block().vars[op.input("Grad")[0]]
             lr = main_program.global_block().vars[op.input("LearningRate")[0]]
 
-            block._insert_op(
-                opt_idx,
-                type="lookup_sparse_table_fuse_sgd",
-                inputs={"Grad": grad,
-                        "LearningRate": lr},
-                attrs={
-                    "is_entry": is_entry,
-                    "tablename": table_name,
-                    "value_names": value_names
-                })
+            block._insert_op(opt_idx,
+                             type="lookup_sparse_table_fuse_sgd",
+                             inputs={
+                                 "Grad": grad,
+                                 "LearningRate": lr
+                             },
+                             attrs={
+                                 "is_entry": is_entry,
+                                 "tablename": table_name,
+                                 "value_names": value_names
+                             })
 
         elif op.type == "adam":
             grad = main_program.global_block().vars[op.input("Grad")[0]]
             lr = main_program.global_block().vars[op.input("LearningRate")[0]]
-            beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")[
-                0]]
-            beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")[
-                0]]
+            beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")
+                                                         [0]]
+            beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")
+                                                         [0]]
             beta1_pow_o = main_program.global_block().vars[op.output(
                 "Beta1PowOut")[0]]
             beta2_pow_o = main_program.global_block().vars[op.output(
@@ -687,68 +686,71 @@ def add_fuse_large_scale_op(block, global_block, table_name, value_names,
             beta2 = op.attr('beta2')
             epsilon = op.attr('epsilon')
 
-            block._insert_op(
-                opt_idx,
-                type="lookup_sparse_table_fuse_adam",
-                inputs={
-                    "Grad": grad,
-                    "LearningRate": lr,
-                    "Beta1Pow": beta1_pow,
-                    "Beta2Pow": beta2_pow
-                },
-                outputs={
-                    "Beta1PowOut": beta1_pow_o,
-                    "Beta2PowOut": beta2_pow_o
-                },
-                attrs={
-                    "beta1": beta1,
-                    "beta2": beta2,
-                    "epsilon": epsilon,
-                    "is_entry": is_entry,
-                    "tablename": table_name,
-                    "value_names": value_names
-                })
+            block._insert_op(opt_idx,
+                             type="lookup_sparse_table_fuse_adam",
+                             inputs={
+                                 "Grad": grad,
+                                 "LearningRate": lr,
+                                 "Beta1Pow": beta1_pow,
+                                 "Beta2Pow": beta2_pow
+                             },
+                             outputs={
+                                 "Beta1PowOut": beta1_pow_o,
+                                 "Beta2PowOut": beta2_pow_o
+                             },
+                             attrs={
+                                 "beta1": beta1,
+                                 "beta2": beta2,
+                                 "epsilon": epsilon,
+                                 "is_entry": is_entry,
+                                 "tablename": table_name,
+                                 "value_names": value_names
+                             })
         else:
             raise ValueError("only support sgd/adam optimizer now")
 
     def add_large_scale_op(block, global_block, table_name, value_names,
                            acture_names, grad, is_entry, opt_idx):
-        ids = global_block.create_var(
-            name="kSparseIDs@{}".format(table_name),
-            persistable=False,
-            dtype="int64",
-            shape=[1, 1],
-            lod_level=0)
+        ids = global_block.create_var(name="kSparseIDs@{}".format(table_name),
+                                      persistable=False,
+                                      dtype="int64",
+                                      shape=[1, 1],
+                                      lod_level=0)
 
         # insert grad split to ids and tensor op
-        block._insert_op(
-            opt_idx,
-            type="lookup_sparse_table_grad_split",
-            inputs={"Grad": grad},
-            outputs={"Row": ids,
-                     "Value": grad},
-            attrs={"tablename": table_name,
-                   "is_entry": is_entry})
+        block._insert_op(opt_idx,
+                         type="lookup_sparse_table_grad_split",
+                         inputs={"Grad": grad},
+                         outputs={
+                             "Row": ids,
+                             "Value": grad
+                         },
+                         attrs={
+                             "tablename": table_name,
+                             "is_entry": is_entry
+                         })
 
         # insert read at first
         vars = [global_block.vars[acture_name] for acture_name in acture_names]
-        block._insert_op(
-            opt_idx + 1,
-            type="lookup_sparse_table_read",
-            inputs={"Ids": ids},
-            outputs={"Out": vars},
-            attrs={"tablename": table_name,
-                   "value_names": value_names})
+        block._insert_op(opt_idx + 1,
+                         type="lookup_sparse_table_read",
+                         inputs={"Ids": ids},
+                         outputs={"Out": vars},
+                         attrs={
+                             "tablename": table_name,
+                             "value_names": value_names
+                         })
 
         # append write at last
         inputs = {"Ids": ids, "In": vars}
 
-        block.append_op(
-            type="lookup_sparse_table_write",
-            inputs=inputs,
-            outputs={},
-            attrs={"tablename": table_name,
-                   "value_names": value_names})
+        block.append_op(type="lookup_sparse_table_write",
+                        inputs=inputs,
+                        outputs={},
+                        attrs={
+                            "tablename": table_name,
+                            "value_names": value_names
+                        })
 
     op = get_op_by_type(main_program.global_block(), "listen_and_serv")
 
@@ -783,14 +785,13 @@ def add_large_scale_op(block, global_block, table_name, value_names,
             is_entry = False if entry_attr == "none" else True
 
             if fuse:
-                add_fuse_large_scale_op(opt_block,
-                                        program.global_block(), param,
-                                        value_names, acture_names, grad,
+                add_fuse_large_scale_op(opt_block, program.global_block(),
+                                        param, value_names, acture_names, grad,
                                         is_entry, opt_idx)
             else:
-                add_large_scale_op(opt_block,
-                                   program.global_block(), param, value_names,
-                                   acture_names, grad, is_entry, opt_idx)
+                add_large_scale_op(opt_block, program.global_block(), param,
+                                   value_names, acture_names, grad, is_entry,
+                                   opt_idx)
     else:
         large_scale_kv_metas = []
         for param, blockid in param_blockid_map.items():
@@ -844,8 +845,8 @@ def get_distributed_from_listen_and_serv(program, origin_program):
 
 def delete_unused_in_main_pass(program, config):
     origin_program = config.get_origin_main_program()
-    sparse_params = get_distributed_from_listen_and_serv(program,
-                                                         origin_program)
+    sparse_params = get_distributed_from_listen_and_serv(
+        program, origin_program)
 
     for var in sparse_params:
         if program.global_block().has_var(var):
@@ -855,8 +856,8 @@ def delete_unused_in_main_pass(program, config):
 
 def delete_unused_in_startup_pass(program, main_program, config):
     origin_program = config.get_origin_main_program()
-    sparse_params = get_distributed_from_listen_and_serv(main_program,
-                                                         origin_program)
+    sparse_params = get_distributed_from_listen_and_serv(
+        main_program, origin_program)
     remove_ops = []
 
     for op in program.global_block().ops:
@@ -944,11 +945,10 @@ def _get_splited_name_and_shape(varname):
             ]:
                 op._set_attr("shape", list(new_outputs["Out"].shape))
 
-            program.global_block().append_op(
-                type=op.type,
-                inputs=new_inputs,
-                outputs=new_outputs,
-                attrs=op.all_attrs())
+            program.global_block().append_op(type=op.type,
+                                             inputs=new_inputs,
+                                             outputs=new_outputs,
+                                             attrs=op.all_attrs())
 
     return program
 
@@ -981,17 +981,15 @@ def add_geo_optimizer_pass(program, config):
         if origin_varname in sparse_tablenames:
             sparse_grad_to_param.append(":".join([delta_var_name, param.name]))
 
-        delta_var = pserver_block.create_var(
-            name=delta_var_name,
-            persistable=False,
-            type=param.type,
-            dtype=param.dtype,
-            shape=param.shape)
-
-        per_opt_block.append_op(
-            type="sum",
-            inputs={"X": [param, delta_var]},
-            outputs={"Out": param})
+        delta_var = pserver_block.create_var(name=delta_var_name,
+                                             persistable=False,
+                                             type=param.type,
+                                             dtype=param.dtype,
+                                             shape=param.shape)
+
+        per_opt_block.append_op(type="sum",
+                                inputs={"X": [param, delta_var]},
+                                outputs={"Out": param})
 
         param_to_block_id.append(delta_var_name + ":" + str(per_opt_block.idx))
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index b6ec09bab7254..6fb0c85d05c51 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -109,6 +109,7 @@ def get_sparse_tablenames(program, is_distributed):
 
 
 class MergedVariable:
+
     def __init__(self, merged, ordered, offsets):
         self.merged_var = merged
         self.ordered_vars = ordered
@@ -128,6 +129,7 @@ def _singleton(*args, **kargs):
 
 @Singleton
 class CompileTimeStrategy(object):
+
     def __init__(self, main_program, startup_program, strategy, role_maker):
         self.min_block_size = 81920
 
@@ -356,6 +358,7 @@ def build_ctx(self,
                   is_sparse,
                   is_send,
                   is_distributed=False):
+
         def get_grad_var_ep(slices):
             names = []
             eps = []
@@ -367,8 +370,8 @@ def get_grad_var_ep(slices):
                         names.append("{}.delta".format(slice.name))
                     else:
                         names.append(slice.name)
-                elif is_grad and self.is_sync_mode() and self.get_trainers(
-                ) > 1:
+                elif is_grad and self.is_sync_mode(
+                ) and self.get_trainers() > 1:
                     names.append("{}.trainer_{}".format(slice.name,
                                                         self.get_role_id()))
                 else:
@@ -447,8 +450,7 @@ def get_trainer_send_context(self):
                                   param_ctx.split_endpoints(),
                                   param_ctx.sections(),
                                   grad_ctx.origin_varnames(),
-                                  param_ctx.trainer_id(),
-                                  param_ctx.aggregate(),
+                                  param_ctx.trainer_id(), param_ctx.aggregate(),
                                   param_ctx.is_sparse(),
                                   param_ctx.is_distributed())
 
@@ -623,8 +625,8 @@ def get_dense_send_context(self,
             for merged in merged_dense_pairs:
                 grad = merged[1]
                 origin_varname = grad.merged_var.name
-                var = self.origin_main_program.global_block().vars[
-                    origin_varname]
+                var = self.origin_main_program.global_block(
+                ).vars[origin_varname]
                 var_numel = reduce(lambda x, y: x * y, var.shape)
                 grad_name = origin_varname
                 aggregate = True
@@ -782,13 +784,12 @@ def _create_vars_from_blocklist(self, block_list):
 
             if len(split) == 1:
                 var_mapping[varname] = [orig_var]
-                self.var_distributed.add_distributed_var(
-                    origin_var=orig_var,
-                    slice_var=orig_var,
-                    block_id=0,
-                    offset=0,
-                    is_slice=False,
-                    vtype="Param")
+                self.var_distributed.add_distributed_var(origin_var=orig_var,
+                                                         slice_var=orig_var,
+                                                         block_id=0,
+                                                         offset=0,
+                                                         is_slice=False,
+                                                         vtype="Param")
             else:
                 var_mapping[varname] = []
                 orig_shape = orig_var.shape
@@ -921,8 +922,8 @@ def _slice_variable(self,
                         # update split_count after aligning
                 split_count = int(math.ceil(var_numel / float(block_size)))
                 for block_id in range(split_count):
-                    curr_block_size = min(block_size, var_numel - (
-                        (block_id) * block_size))
+                    curr_block_size = min(block_size,
+                                          var_numel - ((block_id) * block_size))
                     block = vars_metatools.VarBlock(var.name, block_id,
                                                     curr_block_size)
                     blocks.append(str(block))
@@ -1010,12 +1011,10 @@ def _var_slice_and_distribute(self):
         # create mapping of endpoint->split var to create pserver side program
         self.param_grad_ep_mapping = collections.OrderedDict()
         [
-            self.param_grad_ep_mapping.update({
-                ep: {
-                    "params": [],
-                    "grads": []
-                }
-            }) for ep in self.get_ps_endpoints()
+            self.param_grad_ep_mapping.update({ep: {
+                "params": [],
+                "grads": []
+            }}) for ep in self.get_ps_endpoints()
         ]
 
     def _build_var_distributed(self):
@@ -1193,9 +1192,10 @@ def _add_lr_decay_table_pass(main_program, compiled_config, lr_decay_steps):
         lr_decay_main_program, lr_decay_startup_program, lr_name = _get_lr_sheduler_program(
             compiled_config.origin_main_program.lr_sheduler, lr_param_dict,
             lr_decay_steps)
-        compiled_config.add_tensor_table(
-            "@LR_DECAY_COUNTER@", lr_name, lr_decay_startup_program,
-            lr_decay_main_program, "GlobalStepTable")
+        compiled_config.add_tensor_table("@LR_DECAY_COUNTER@", lr_name,
+                                         lr_decay_startup_program,
+                                         lr_decay_main_program,
+                                         "GlobalStepTable")
 
 
 def _get_lr_param_dict(opt_ops):
@@ -1260,8 +1260,8 @@ def _get_lr_sheduler_program(lr_sheduler, lr_param_dict, lr_decay_steps):
                 % lr_decay_steps)
     else:
         raise ValueError(
-            "Not supported current LearningRate strategy, please use follow decay strategy: {}".
-            format(schedler_decay))
+            "Not supported current LearningRate strategy, please use follow decay strategy: {}"
+            .format(schedler_decay))
 
     return decay_main_program, decay_startup_program, lr_name
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
index 51e89cc301cf3..18755212cc16b 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
@@ -53,6 +53,7 @@
 
 
 def delete_optimizer_pass(program, config):
+
     def _delete_optimizer_op_and_vars(_program, optimize_ops):
         optimize_vars = []
         optimize_op_role_vars = []
@@ -77,15 +78,14 @@ def _delete_optimizer_op_and_vars(_program, optimize_ops):
 
     def _add_lr_var(main_program, compiled_config):
         # Todo: hard code for pe
-        lr_var = compiled_config.origin_main_program.global_block().vars[
-            "learning_rate_0"]
-        main_program.global_block().create_var(
-            name=lr_var.name,
-            shape=lr_var.shape,
-            dtype=lr_var.dtype,
-            type=lr_var.type,
-            lod_level=lr_var.lod_level,
-            persistable=True)
+        lr_var = compiled_config.origin_main_program.global_block(
+        ).vars["learning_rate_0"]
+        main_program.global_block().create_var(name=lr_var.name,
+                                               shape=lr_var.shape,
+                                               dtype=lr_var.dtype,
+                                               type=lr_var.type,
+                                               lod_level=lr_var.lod_level,
+                                               persistable=True)
 
     optimizer_ops = _get_optimize_ops(program)
     lr_ops = _get_lr_ops(program)
@@ -126,14 +126,15 @@ def _get_pull_sparse_ops(_program):
         for op in _program.global_block().ops:
             if op.type in SPARSE_GRAD_OP_TYPE_DICT.keys():
                 param_name = op.input(SPARSE_GRAD_OP_TYPE_DICT[op.type])[0]
-                if param_name in pull_sparse_ids and op.input("Ids")[
-                        0] in pull_sparse_ids[param_name]:
+                if param_name in pull_sparse_ids and op.input(
+                        "Ids")[0] in pull_sparse_ids[param_name]:
                     ops = push_sparse_ops.get(param_name, [])
                     ops.append(op)
                     push_sparse_ops[param_name] = ops
         return pull_sparse_ops, push_sparse_ops
 
     def _pull_sparse_fuse(_program, pull_sparse_ops, use_ps_gpu):
+
         def dag_check_up_and_reorder(program, inputs, outputs):
             global_block = program.global_block()
             min_output_index = len(global_block.ops)
@@ -294,8 +295,10 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                     program.global_block()._insert_op(
                         index=distributed_idx,
                         type="pull_gpups_sparse",
-                        inputs={"Ids": inputs,
-                                'W': w},
+                        inputs={
+                            "Ids": inputs,
+                            'W': w
+                        },
                         outputs={"Out": outputs},
                         attrs={
                             "size": [w.shape[1] for i in inputs],
@@ -306,8 +309,10 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                     program.global_block()._insert_op(
                         index=distributed_idx,
                         type="distributed_lookup_table",
-                        inputs={"Ids": inputs,
-                                'W': w},
+                        inputs={
+                            "Ids": inputs,
+                            'W': w
+                        },
                         outputs={"Outputs": outputs},
                         attrs={
                             "is_distributed": is_distributed,
@@ -323,8 +328,10 @@ def dag_check_up_and_reorder(program, inputs, outputs):
                     program.global_block()._insert_op(
                         index=distributed_idx,
                         type="distributed_lookup_table",
-                        inputs={"Ids": [inputs[i]],
-                                'W': w},
+                        inputs={
+                            "Ids": [inputs[i]],
+                            'W': w
+                        },
                         outputs={"Outputs": [outputs[i]]},
                         attrs={
                             "is_distributed": is_distributed,
@@ -419,6 +426,7 @@ def _push_sparse_fuse(_program, push_sparse_ops, use_ps_gpu):
             for idx in op_idxs[::-1]:
                 program.global_block()._remove_op(idx)
 
+
 #            if use_ps_gpu:
 #                program.global_block().append_op(
 #                    type="push_box_sparse",
@@ -431,22 +439,22 @@ def _push_sparse_fuse(_program, push_sparse_ops, use_ps_gpu):
 #                        "is_sparse": True
 #                    })
 #            else:
-            program.global_block().append_op(
-                type="distributed_push_sparse",
-                inputs={
-                    "Ids": inputs,
-                    'W': w,
-                    "Outputs": outputs,
-                    "Shows": show,
-                    "Clicks": clk
-                },
-                outputs={"Outputs": outputs},
-                attrs={
-                    "is_distributed": is_distributed,
-                    "padding_idx": padding_idx,
-                    "table_id": table_id,
-                    "size": emb_size[param]
-                })
+            program.global_block().append_op(type="distributed_push_sparse",
+                                             inputs={
+                                                 "Ids": inputs,
+                                                 'W': w,
+                                                 "Outputs": outputs,
+                                                 "Shows": show,
+                                                 "Clicks": clk
+                                             },
+                                             outputs={"Outputs": outputs},
+                                             attrs={
+                                                 "is_distributed":
+                                                 is_distributed,
+                                                 "padding_idx": padding_idx,
+                                                 "table_id": table_id,
+                                                 "size": emb_size[param]
+                                             })
 
     pull_sparse_ops, push_sparse_ops = _get_pull_sparse_ops(program)
     _pull_sparse_fuse(program, pull_sparse_ops, use_ps_gpu)
@@ -473,29 +481,33 @@ def _append_send_op(union_vars, queue, is_sparse, table_id):
             dummy_output = program.global_block().create_var(
                 name=framework.generate_control_dev_var_name())
 
-        program.global_block().append_op(
-            type="send",
-            inputs={"X": send_input_vars},
-            outputs={"Out": dummy_output},
-            attrs={
-                "send_varnames": [queue],
-                "is_sparse": is_sparse,
-                "table_id": table_id,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        program.global_block().append_op(type="send",
+                                         inputs={"X": send_input_vars},
+                                         outputs={"Out": dummy_output},
+                                         attrs={
+                                             "send_varnames": [queue],
+                                             "is_sparse":
+                                             is_sparse,
+                                             "table_id":
+                                             table_id,
+                                             RPC_OP_ROLE_ATTR_NAME:
+                                             RPC_OP_ROLE_ATTR_VALUE
+                                         })
 
         return dummy_output
 
     def _append_barrier_op(dummys):
-        program.global_block().append_op(
-            type="send_barrier",
-            inputs={"X": dummys},
-            outputs={"Out": []},
-            attrs={
-                "trainer_id": trainer_id,
-                "half_async": True,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        program.global_block().append_op(type="send_barrier",
+                                         inputs={"X": dummys},
+                                         outputs={"Out": []},
+                                         attrs={
+                                             "trainer_id":
+                                             trainer_id,
+                                             "half_async":
+                                             True,
+                                             RPC_OP_ROLE_ATTR_NAME:
+                                             RPC_OP_ROLE_ATTR_VALUE
+                                         })
 
     dummys = []
 
@@ -525,15 +537,17 @@ def init_from_server_pass(program, config):
     fetch_barrier_out = program.global_block().create_var(
         name=framework.generate_control_dev_var_name())
 
-    program.global_block().append_op(
-        type="fetch_barrier",
-        inputs={},
-        outputs={"Out": fetch_barrier_out},
-        attrs={
-            "endpoints": config.get_ps_endpoints(),
-            "trainer_id": config.get_role_id(),
-            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-        })
+    program.global_block().append_op(type="fetch_barrier",
+                                     inputs={},
+                                     outputs={"Out": fetch_barrier_out},
+                                     attrs={
+                                         "endpoints":
+                                         config.get_ps_endpoints(),
+                                         "trainer_id":
+                                         config.get_role_id(),
+                                         RPC_OP_ROLE_ATTR_NAME:
+                                         RPC_OP_ROLE_ATTR_VALUE
+                                     })
     return program
 
 
@@ -555,8 +569,8 @@ def _fake_init_sparsetable(sparse_table_names):
                     table_param_init_op.append(op)
             init_op_num = len(table_param_init_op)
             if init_op_num != 1:
-                raise ValueError("table init op num should be 1, now is " + str(
-                    init_op_num))
+                raise ValueError("table init op num should be 1, now is " +
+                                 str(init_op_num))
             table_init_op = table_param_init_op[0]
             program.global_block().append_op(
                 type="fake_init",
@@ -572,6 +586,7 @@ def _fake_init_sparsetable(sparse_table_names):
 
 
 def ps_gpu_pass(program):
+
     def _add_push_box_sparse_op(program):
         op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
         backward = core.op_proto_and_checker_maker.OpRole.Backward
@@ -744,10 +759,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                     op_list = list(block.ops)
                     sum_op = op_list[var2idx[param_name]]
                     sum_op_inputs = {
-                        sum_op.input_names[0]: [
-                            block.vars[input]
-                            for input in sum_op.input_arg_names
-                        ]
+                        sum_op.input_names[0]:
+                        [block.vars[input] for input in sum_op.input_arg_names]
                     }
                     sum_op_outputs = {
                         sum_op.output_names[0]: [
@@ -755,12 +768,11 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                             for output in sum_op.output_arg_names
                         ]
                     }
-                    block._insert_op(
-                        index=i + 1,
-                        type=sum_op.type,
-                        inputs=sum_op_inputs,
-                        outputs=sum_op_outputs,
-                        attrs=sum_op.all_attrs())
+                    block._insert_op(index=i + 1,
+                                     type=sum_op.type,
+                                     inputs=sum_op_inputs,
+                                     outputs=sum_op_outputs,
+                                     attrs=sum_op.all_attrs())
                     block._remove_op(var2idx[param_name] + 1)
                     var2idx.pop(param_name)
                     for var_ in var2idx:
@@ -798,12 +810,11 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                                 for output in sum_op.output_arg_names
                             ]
                         }
-                        block._insert_op(
-                            index=i + 1,
-                            type=sum_op.type,
-                            inputs=sum_op_inputs,
-                            outputs=sum_op_outputs,
-                            attrs=sum_op.all_attrs())
+                        block._insert_op(index=i + 1,
+                                         type=sum_op.type,
+                                         inputs=sum_op_inputs,
+                                         outputs=sum_op_outputs,
+                                         attrs=sum_op.all_attrs())
                         block._remove_op(var2idx[no_grad_var] + 1)
                         var2idx.pop(no_grad_var)
                         for var_ in var2idx:
@@ -818,8 +829,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
                         forward_op_type = pre_op.type.split("_grad")[0]
                         if forward_op_type in SPARSE_OP_TYPE_DICT.keys() \
                             and pre_op.attr('remote_prefetch') is True:
-                            param_name = pre_op.input(SPARSE_OP_TYPE_DICT[
-                                forward_op_type])[0]
+                            param_name = pre_op.input(
+                                SPARSE_OP_TYPE_DICT[forward_op_type])[0]
                             if param_name == origin_var and op.attr(
                                     "op_device") == pre_op.attr("op_device"):
                                 continue
@@ -919,7 +930,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
     if len(heter_ops) == 0:
         warnings.warn(
             "No heterogeneous OP was found in your program , "
-            " please using fluid.device_guard() to run OPs on different device.")
+            " please using fluid.device_guard() to run OPs on different device."
+        )
 
     total_heter_ops = 0
     heter_blocks = 0
@@ -929,8 +941,8 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops):
         for _, heter_block in heter_block_dict.items():
             total_heter_ops += len(heter_block)
     print(
-        "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks.".
-        format(len(block.ops), total_heter_ops, heter_blocks))
+        "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks."
+        .format(len(block.ops), total_heter_ops, heter_blocks))
 
     return origin_porgram, heter_ops, default_ops, program_block_ops
 
@@ -984,25 +996,27 @@ def create_heter_program(program, config, heter_program, program_block_ops_list,
         for _, op in enumerate(heter_block_ops_backward):
             block_append_op(heter_program, program, heter_block_bp, op)
 
-        bp_entrance_vars = block_var_detail[stage_id - 1]["backward"][
-            "entrance"]
+        bp_entrance_vars = block_var_detail[stage_id -
+                                            1]["backward"]["entrance"]
         add_vars_by_var_list(bp_entrance_vars, program, heter_program,
                              heter_block_bp)
         bp_exit_vars = block_var_detail[stage_id - 1]["backward"]["exit"]
         add_vars_by_var_list(bp_exit_vars, program, heter_program,
                              heter_block_bp)
-        backward_comm_info = get_communicate_var_info(
-            program, stage_id, bp_entrance_vars, type="backward")
+        backward_comm_info = get_communicate_var_info(program,
+                                                      stage_id,
+                                                      bp_entrance_vars,
+                                                      type="backward")
 
-        grad_to_block_id.append(backward_comm_info["block_input_var_name"] + ":"
-                                + str(heter_block_bp.idx))
+        grad_to_block_id.append(backward_comm_info["block_input_var_name"] +
+                                ":" + str(heter_block_bp.idx))
 
     else:
         for _, op in enumerate(heter_block_ops_backward):
             block_append_op(heter_program, program, heter_block, op)
 
-        bp_entrance_vars = block_var_detail[stage_id - 1]["backward"][
-            "entrance"]
+        bp_entrance_vars = block_var_detail[stage_id -
+                                            1]["backward"]["entrance"]
         add_vars_by_var_list(bp_entrance_vars, program, heter_program,
                              heter_block)
         bp_exit_vars = block_var_detail[stage_id - 1]["backward"]["exit"]
@@ -1010,8 +1024,10 @@ def create_heter_program(program, config, heter_program, program_block_ops_list,
 
         heter_block_bp = heter_block
 
-    forward_comm_info = get_communicate_var_info(
-        program, stage_id, entrance_vars, type="forward")
+    forward_comm_info = get_communicate_var_info(program,
+                                                 stage_id,
+                                                 entrance_vars,
+                                                 type="forward")
 
     grad_to_block_id.append(forward_comm_info["block_input_var_name"] + ":" +
                             str(heter_block.idx))
@@ -1022,13 +1038,15 @@ def create_heter_program(program, config, heter_program, program_block_ops_list,
         static_var = insert_communicate_op(program, config, heter_block,
                                            stage_id, first_op_index_fp,
                                            block_var_detail, current_device)
-    static_var_bp = insert_communicate_op(
-        program, config, heter_block_bp, stage_id, first_op_index_bp,
-        block_var_detail, current_device, False)
+    static_var_bp = insert_communicate_op(program, config, heter_block_bp,
+                                          stage_id, first_op_index_bp,
+                                          block_var_detail, current_device,
+                                          False)
 
     # add send op
-    send_grad_var_list = add_heter_send_op(
-        program, heter_program, heter_block_bp, block_var_detail[stage_id - 1])
+    send_grad_var_list = add_heter_send_op(program, heter_program,
+                                           heter_block_bp,
+                                           block_var_detail[stage_id - 1])
 
     # ---------------
     # add step conter
@@ -1063,8 +1081,10 @@ def create_heter_program(program, config, heter_program, program_block_ops_list,
         RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
     }
     # append the listen_and_serv op
-    heter_program.global_block().append_op(
-        type="heter_listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs)
+    heter_program.global_block().append_op(type="heter_listen_and_serv",
+                                           inputs={'X': []},
+                                           outputs={},
+                                           attrs=attrs)
     check_heter_compile_time_strategy(program, config, send_grad_var_list)
 
 
@@ -1098,8 +1118,9 @@ def create_trainer_program(program, origin_program, config,
     for heter_block_index in range(1, len(program_block_ops_list)):
         ops_list = program_block_ops_list[heter_block_index][
             "forward"] + program_block_ops_list[heter_block_index]["backward"]
-        static_var += replace_ops_by_communicate_op(
-            program, config, heter_block_index, ops_list, block_var_detail)
+        static_var += replace_ops_by_communicate_op(program, config,
+                                                    heter_block_index, ops_list,
+                                                    block_var_detail)
         remove_trainer_send_op(program, config, heter_block_index,
                                block_var_detail)
 
@@ -1113,8 +1134,10 @@ def create_trainer_program(program, origin_program, config,
                                            bp_ops_list, block_var_detail)
 
     bp_entrance_vars = block_var_detail[0]["backward"]["entrance"]
-    backward_comm_info = get_communicate_var_info(
-        origin_program, 1, bp_entrance_vars, type="backward")
+    backward_comm_info = get_communicate_var_info(origin_program,
+                                                  1,
+                                                  bp_entrance_vars,
+                                                  type="backward")
 
     grad_to_block_id.append(backward_comm_info["block_input_var_name"] + ":" +
                             str(backward_block.idx))
@@ -1135,12 +1158,11 @@ def create_trainer_program(program, origin_program, config,
         RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
     }
     # append the listen_and_serv op
-    program.global_block()._insert_op(
-        index=0,
-        type="heter_listen_and_serv",
-        inputs={'X': []},
-        outputs={},
-        attrs=attrs)
+    program.global_block()._insert_op(index=0,
+                                      type="heter_listen_and_serv",
+                                      inputs={'X': []},
+                                      outputs={},
+                                      attrs=attrs)
 
     ## TODO add check for bp block
     check_op_device(program.global_block(), DEFAULT_DEVICE)
@@ -1171,22 +1193,24 @@ def insert_communicate_op(orign_program,
         comm_info = get_communicate_var_info(orign_program, stage_id - 1,
                                              entrance_var, "backward")
 
-    heter_block._insert_op(
-        index=first_op_index,
-        type="send_and_recv",
-        inputs={"X": heter_block.vars[entrance_var[0]]},
-        outputs={"Out": []},
-        attrs={
-            "mode": "forward" if is_forward else "backward",
-            "send_var_name": entrance_var + ["microbatch_id"],
-            "recv_var_name": [],
-            "message_name": comm_info["block_input_var_name"],
-            "next_endpoints": next_heter_worker_endpoints,
-            "previous_endpoints": previous_heter_worker_endpoints,
-            "trainer_id": config.get_role_id(),
-            "op_device": device,
-            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-        })
+    heter_block._insert_op(index=first_op_index,
+                           type="send_and_recv",
+                           inputs={"X": heter_block.vars[entrance_var[0]]},
+                           outputs={"Out": []},
+                           attrs={
+                               "mode": "forward" if is_forward else "backward",
+                               "send_var_name":
+                               entrance_var + ["microbatch_id"],
+                               "recv_var_name": [],
+                               "message_name":
+                               comm_info["block_input_var_name"],
+                               "next_endpoints": next_heter_worker_endpoints,
+                               "previous_endpoints":
+                               previous_heter_worker_endpoints,
+                               "trainer_id": config.get_role_id(),
+                               "op_device": device,
+                               RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                           })
 
     return entrance_var
 
@@ -1269,8 +1293,8 @@ def remove_trainer_send_op(program, config, heter_block_index,
     need_remove_send_op = []
     need_remove_grad_var = []
     for op in find_send_op(program):
-        input_list, _ = find_op_input_output(program,
-                                             program.global_block(), op)
+        input_list, _ = find_op_input_output(program, program.global_block(),
+                                             op)
         for var_name in input_list:
             origin_var_name = var_name.split("@GRAD")[0]
             if origin_var_name in persistables:
@@ -1283,6 +1307,7 @@ def remove_trainer_send_op(program, config, heter_block_index,
 
 
 def add_heter_send_op(program, heter_program, block, block_var_detail):
+
     def _get_send_op_dict():
         send_op_dict = {}
         send_op_list = find_send_op(program)
@@ -1328,16 +1353,16 @@ def _get_send_op_dict():
             block.vars[union_var]
             for union_var in table_dict[table_id]['var_list']
         ]
-        block.append_op(
-            type="send",
-            inputs={"X": send_input_vars},
-            outputs={"Out": dummy_output},
-            attrs={
-                "send_varnames": table_dict[table_id]['send_varnames'],
-                "is_sparse": is_sparse,
-                "table_id": table_id,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        block.append_op(type="send",
+                        inputs={"X": send_input_vars},
+                        outputs={"Out": dummy_output},
+                        attrs={
+                            "send_varnames":
+                            table_dict[table_id]['send_varnames'],
+                            "is_sparse": is_sparse,
+                            "table_id": table_id,
+                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                        })
 
     return send_grad_var_list
 
@@ -1454,9 +1479,8 @@ def union_forward_gradient_op(program_block_ops_list):
     assert block_length % 2 != 0, "the length of program_block_ops_list should be odd"
     for i in range(0, block_length // 2):
         block_op_list = {"forward": program_block_ops_list[i]}
-        block_op_list.update({
-            "backward": program_block_ops_list[block_length - 1 - i]
-        })
+        block_op_list.update(
+            {"backward": program_block_ops_list[block_length - 1 - i]})
         union_program_block_ops_list.append(block_op_list)
 
     block_op_list = {"forward": [], "backward": []}
@@ -1474,8 +1498,9 @@ def find_block_joints(program, program_block_ops_list, heter_ops):
                                                   program_block_ops_list)
     block_var_detail = entrance_exit_check(program, program_block_ops_list,
                                            block_var_detail, heter_ops)
-    block_var_detail = delete_block_useless_exit(
-        program, program_block_ops_list, block_var_detail)
+    block_var_detail = delete_block_useless_exit(program,
+                                                 program_block_ops_list,
+                                                 block_var_detail)
 
     return block_var_detail
 
@@ -1506,8 +1531,8 @@ def find_entrance_exit_private(program, program_block_ops_list):
         bp_block_input, bp_block_output = find_ops_list_input_output(
             program, block_op_list["backward"])
         bp_persistables = screen_persistables(
-            program, bp_block_input) + screen_persistables(program,
-                                                           bp_block_output)
+            program, bp_block_input) + screen_persistables(
+                program, bp_block_output)
         # find entrance & exit
         bp_block_private_vars = list(set(bp_block_input) & set(bp_block_output))
         bp_block_entrance = list(
@@ -1555,10 +1580,10 @@ def entrance_exit_check(program, program_block_ops_list, block_var_detail,
         #need_add_vars = find_need_var_from_previous_block(
         #    need_add_vars, block_var_detail, index, heter_ops)
 
-        previous_block_private = block_var_detail[index - 1]["forward"][
-            "private"]
-        previous_block_entrance = block_var_detail[index - 1]["forward"][
-            "entrance"]
+        previous_block_private = block_var_detail[index -
+                                                  1]["forward"]["private"]
+        previous_block_entrance = block_var_detail[index -
+                                                   1]["forward"]["entrance"]
         for var in need_add_vars:
             if var not in previous_block_private and var not in previous_block_entrance:
                 previous_block_entrance.append(var)
@@ -1584,10 +1609,10 @@ def entrance_exit_check(program, program_block_ops_list, block_var_detail,
                 need_ignore_vars.append(var)
         need_add_vars = list(
             set(need_add_vars).difference(set(need_ignore_vars)))
-        previous_block_private = block_var_detail[index + 1]["backward"][
-            "private"]
-        previous_block_entrance = block_var_detail[index + 1]["backward"][
-            "entrance"]
+        previous_block_private = block_var_detail[index +
+                                                  1]["backward"]["private"]
+        previous_block_entrance = block_var_detail[index +
+                                                   1]["backward"]["entrance"]
         for var in need_add_vars:
             if var not in previous_block_private and var not in previous_block_entrance:
                 previous_block_entrance.append(var)
@@ -1648,8 +1673,8 @@ def delete_block_useless_exit(program, program_block_ops_list,
         if index - 1 < 0:
             break
         current_block_exit = block_var_detail[index]["backward"]["exit"]
-        next_block_entrance = block_var_detail[index - 1]["backward"][
-            "entrance"]
+        next_block_entrance = block_var_detail[index -
+                                               1]["backward"]["entrance"]
         need_delete_var = []
         for var in current_block_exit:
             if var not in next_block_entrance:
@@ -1693,61 +1718,62 @@ def insert_reshape_op(program,
     input_var = block.vars[var_name]
 
     if new_var_name not in block.vars:
-        out = block.create_var(
-            name=new_var_name,
-            shape=new_var_shape,
-            dtype=input_var.dtype,
-            type=input_var.type)
+        out = block.create_var(name=new_var_name,
+                               shape=new_var_shape,
+                               dtype=input_var.dtype,
+                               type=input_var.type)
     else:
         out = block.vars[new_var_name]
         new_var_shape = out.shape
 
-    x_shape = block.create_var(
-        name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype)
-    block._insert_op(
-        index=index,
-        type="reshape2",
-        inputs={"X": input_var},
-        attrs={'shape': new_var_shape},
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    x_shape = block.create_var(name="{}.xshape@Heter".format(var_name),
+                               dtype=input_var.dtype)
+    block._insert_op(index=index,
+                     type="reshape2",
+                     inputs={"X": input_var},
+                     attrs={'shape': new_var_shape},
+                     outputs={
+                         "Out": out,
+                         "XShape": x_shape
+                     })
 
 
 def insert_send_concat_op(program, block, index, var_name_list, new_var_name,
                           new_var_shape):
     input_var_list = [block.vars[var_name] for var_name in var_name_list]
 
-    out = program.global_block().create_var(
-        name=new_var_name,
-        shape=new_var_shape,
-        dtype=input_var_list[0].dtype,
-        type=input_var_list[0].type)
+    out = program.global_block().create_var(name=new_var_name,
+                                            shape=new_var_shape,
+                                            dtype=input_var_list[0].dtype,
+                                            type=input_var_list[0].type)
 
-    block._insert_op(
-        index=index,
-        type='concat',
-        inputs={"X": input_var_list},
-        outputs={'Out': [out]},
-        attrs={'axis': -1,
-               'use_stack': False})
+    block._insert_op(index=index,
+                     type='concat',
+                     inputs={"X": input_var_list},
+                     outputs={'Out': [out]},
+                     attrs={
+                         'axis': -1,
+                         'use_stack': False
+                     })
 
 
 def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype,
                          type, new_var_name_list, new_var_shape_list):
     if var_name not in program.global_block().vars:
-        input_var = program.global_block().create_var(
-            name=var_name, shape=var_shape, dtype=dtype, type=type)
+        input_var = program.global_block().create_var(name=var_name,
+                                                      shape=var_shape,
+                                                      dtype=dtype,
+                                                      type=type)
     else:
         input_var = program.global_block().vars[var_name]
 
     out_list = []
     for i in range(len(new_var_name_list)):
         if new_var_name_list[i] not in block.vars:
-            out = block.create_var(
-                name=new_var_name_list[i],
-                shape=new_var_shape_list[i],
-                dtype=input_var.dtype,
-                type=input_var.type)
+            out = block.create_var(name=new_var_name_list[i],
+                                   shape=new_var_shape_list[i],
+                                   dtype=input_var.dtype,
+                                   type=input_var.type)
         else:
             out = block.vars[new_var_name_list[i]]
         out_list.append(out)
@@ -1764,12 +1790,11 @@ def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype,
         attrs['starts'] = starts
         attrs['ends'] = ends
 
-        block._insert_op(
-            index=index,
-            type='slice',
-            inputs={'Input': input_var},
-            attrs=attrs,
-            outputs={'Out': out_list[i]})
+        block._insert_op(index=index,
+                         type='slice',
+                         inputs={'Input': input_var},
+                         attrs=attrs,
+                         outputs={'Out': out_list[i]})
         start_index = end_index
         index += 1
 
@@ -1838,8 +1863,10 @@ def block_append_op(program, origin_program, block, op):
 
     if "_grad" not in op.type:
         # for forward op
-        return block.append_op(
-            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
+        return block.append_op(type=op.type,
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=op.all_attrs())
     else:
         # for grad op
         op_desc = op.desc
@@ -1865,8 +1892,8 @@ def add_vars_by_var_list(var_name_list, origin_program, program, block):
         ).vars and var_name not in block.vars:
             var = origin_program.global_block().vars[var_name]
             if var.persistable:
-                program.global_block()._clone_variable(
-                    var, force_persistable=False)
+                program.global_block()._clone_variable(var,
+                                                       force_persistable=False)
             else:
                 block._clone_variable(var, force_persistable=False)
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py
index c80b4a800bd14..f852c1a0311d4 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py
@@ -30,6 +30,7 @@
 
 
 class VarBlock:
+
     def __init__(self, varname, offset, size):
         self.varname = varname
         # NOTE: real offset is offset * size
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
index dd9d7e760a8e5..3d625d47f3090 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
@@ -85,8 +85,8 @@ def init_worker(self):
                 if self._role_maker.is_xpu():
                     local_endpoint = self._role_maker.get_local_endpoint()
                     local_endpoint = local_endpoint.split(":")
-                    self._heter_ptr.start_xpu_service(
-                        str(local_endpoint[0]), int(local_endpoint[1]))
+                    self._heter_ptr.start_xpu_service(str(local_endpoint[0]),
+                                                      int(local_endpoint[1]))
             self._role_maker._barrier_all()
             self.all_ips_ = self._role_maker._all_gather(self._local_ip)
             # worker_index * 2 is for compatible with older versions of pslib
@@ -136,8 +136,9 @@ def init_worker(self):
                             var_name = table.dense_variable_name[i]
                             if scope.find_var(var_name) is None:
                                 raise ValueError(
-                                    "var " + var_name + " not found in scope, "
-                                    + "you should run startup program first")
+                                    "var " + var_name +
+                                    " not found in scope, " +
+                                    "you should run startup program first")
                             var_name_list.append(var_name)
                         if not self._opt_info["use_ps_gpu"]:
                             self._fleet_ptr.init_model(scope,
@@ -249,9 +250,10 @@ def start_heter_trainer(self,
 
         """
 
-        trainer_instance = executor.start_heter_trainer(
-            program, scope, debug, fetch_list, fetch_info, print_period,
-            fetch_handler)
+        trainer_instance = executor.start_heter_trainer(program, scope, debug,
+                                                        fetch_list, fetch_info,
+                                                        print_period,
+                                                        fetch_handler)
         if self._role_maker.is_xpu():
             print("barrier heter")
             self._role_maker._barrier_heter()
@@ -1006,10 +1008,11 @@ def __init__(self, optimizer, strategy=None):
         self._optimizer = optimizer
         self._optimizer_name = "Distributed%s" % optimizer.type.capitalize()
         if optimizer.type != "adam":
-            print("Currently, distributed optimizer only support Adam"
-                  "Will config built-in adam for you."
-                  "We will support more functions in DistributedOptimizer",
-                  sys.stderr)
+            print(
+                "Currently, distributed optimizer only support Adam"
+                "Will config built-in adam for you."
+                "We will support more functions in DistributedOptimizer",
+                sys.stderr)
             self._optimizer_name = "DistributedAdam"
 
         self._distributed_optimizer = globals()[self._optimizer_name](optimizer)
@@ -1152,13 +1155,12 @@ def minimize(self,
                 t = MultiThread(trans_mode=program_mode)
                 start_program = startup_programs[i]
                 main_program = programs[i]
-                t.transpile(
-                    startup_program=start_program,
-                    main_program=main_program,
-                    rank=env["trainer_id"],
-                    endpoints=env["trainer_endpoints"],
-                    current_endpoint=env['current_endpoint'],
-                    wait_port=False)
+                t.transpile(startup_program=start_program,
+                            main_program=main_program,
+                            rank=env["trainer_id"],
+                            endpoints=env["trainer_endpoints"],
+                            current_endpoint=env['current_endpoint'],
+                            wait_port=False)
                 if i > 0:
                     self._remove_collective_ops(start_program,
                                                 "c_comm_init_all")
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
index 5f0af296441ff..308261cea0676 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py
@@ -109,8 +109,8 @@ def add_sparse_table(self, table_id, strategy):
         if table_class == 'DownpourSparseTable' or table_class == 'DownpourSparseSSDTable':
             table.enable_sparse_table_cache = strategy.get(
                 'sparse_enable_cache', True)
-            table.sparse_table_cache_rate = strategy.get('sparse_cache_rate',
-                                                         0.00055)
+            table.sparse_table_cache_rate = strategy.get(
+                'sparse_cache_rate', 0.00055)
             table.sparse_table_cache_file_num = strategy.get(
                 'sparse_cache_file_num', 16)
             table.compress_in_save = strategy.get('sparse_compress_in_save',
@@ -313,8 +313,8 @@ def add_dense_table(self, table_id, param_var, grad_var, strategy,
         table.compress_in_save = strategy.get('dense_compress_in_save', True)
         table.accessor.accessor_class = strategy.get(
             'dense_accessor_class', "DownpourDenseValueAccessor")
-        table.accessor.dense_sgd_param.name = strategy.get('dense_optimizer',
-                                                           "adam")
+        table.accessor.dense_sgd_param.name = strategy.get(
+            'dense_optimizer', "adam")
         table.accessor.dense_sgd_param.adam.learning_rate = strategy.get(
             'dense_learning_rate', 5e-06)
         table.accessor.dense_sgd_param.adam.avg_decay_rate = strategy.get(
@@ -377,8 +377,8 @@ def add_data_norm_table(self, table_id, learning_rate, param_var, grad_var,
         table.compress_in_save = strategy.get('datanorm_compress_in_save', True)
         table.accessor.accessor_class = strategy.get(
             'datanorm_accessor_class', 'DownpourDenseValueAccessor')
-        table.accessor.dense_sgd_param.name = strategy.get('datanorm_operation',
-                                                           'summary')
+        table.accessor.dense_sgd_param.name = strategy.get(
+            'datanorm_operation', 'summary')
         table.accessor.dense_sgd_param.summary.summary_decay_rate = strategy.get(
             'datanorm_decay_rate', 0.999999)
         table.accessor.fea_dim = fea_dim
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 9483556d46f59..35cda4c34b009 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -203,7 +203,7 @@ def _find_multi_distributed_lookup_table(self, losses):
         return ret_list
 
     def _if_last_block(self, op, _equal_dict):
-        # for conditional_block op 
+        # for conditional_block op
         cond_str = op.input('Cond')[0]
         bool_test = False
         if cond_str.startswith('equal'):
@@ -345,25 +345,25 @@ def _check_config_fleet_with_program_op(self, strategy, table_name,
             if st.get("sparse_embedx_dim") is not None \
                     and strategy.get("use_cvm") == True \
                     and st["sparse_embedx_dim"] != emb_to_size[table_name] - 3:
-                raise ValueError("fleet config sparse_embedx_dim=%s not"
-                                 " equal to embedding dim - 3 = %s" %
-                                 (st["sparse_embedx_dim"],
-                                  emb_to_size[table_name] - 3))
+                raise ValueError(
+                    "fleet config sparse_embedx_dim=%s not"
+                    " equal to embedding dim - 3 = %s" %
+                    (st["sparse_embedx_dim"], emb_to_size[table_name] - 3))
             if st.get("sparse_embedx_dim") is not None \
                     and strategy.get("use_cvm") == False \
                     and st["sparse_embedx_dim"] != emb_to_size[table_name] - 1:
-                raise ValueError("fleet config sparse_embedx_dim=%s not"
-                                 " equal to embedding dim - 1 = %s" %
-                                 (st["sparse_embedx_dim"],
-                                  emb_to_size[table_name] - 1))
+                raise ValueError(
+                    "fleet config sparse_embedx_dim=%s not"
+                    " equal to embedding dim - 1 = %s" %
+                    (st["sparse_embedx_dim"], emb_to_size[table_name] - 1))
             if st.get("sparse_embedx_dim") is None \
                     and strategy.get("use_cvm") == True:
                 logger.warning(
                     "sparse embedding dim for table name '{}' is: {}, while sparse_embedx_dim "
                     "with same sparse table name is not set in config_fleet.py. "
                     "Hence automatically set sparse_embedx_dim = {} - 3.".
-                    format(table_name, emb_to_size[table_name], emb_to_size[
-                        table_name]))
+                    format(table_name, emb_to_size[table_name],
+                           emb_to_size[table_name]))
                 st["sparse_embedx_dim"] = emb_to_size[table_name] - 3
             if st.get("sparse_embedx_dim") is None \
                     and strategy.get("use_cvm") == False:
@@ -371,23 +371,23 @@ def _check_config_fleet_with_program_op(self, strategy, table_name,
                     "sparse embedding dim for table name '{}' is: {}, while sparse_embedx_dim "
                     "with same sparse table name is not set in config_fleet.py. "
                     "Hence automatically set sparse_embedx_dim = {} - 1.".
-                    format(table_name, emb_to_size[table_name], emb_to_size[
-                        table_name]))
+                    format(table_name, emb_to_size[table_name],
+                           emb_to_size[table_name]))
                 st["sparse_embedx_dim"] = emb_to_size[table_name] - 1
         elif accessor == "DownpourSparseValueAccessor":
             if st.get("sparse_embedx_dim") is not None \
                     and st["sparse_embedx_dim"] != emb_to_size[table_name]:
-                raise ValueError("fleet config sparse_embedx_dim=%s not"
-                                 " equal to embedding dim = %s" %
-                                 (st["sparse_embedx_dim"],
-                                  emb_to_size[table_name]))
+                raise ValueError(
+                    "fleet config sparse_embedx_dim=%s not"
+                    " equal to embedding dim = %s" %
+                    (st["sparse_embedx_dim"], emb_to_size[table_name]))
             if st.get("sparse_embedx_dim") is None:
                 logger.warning(
                     "sparse embedding dim for table name '{}' is: {}, while sparse_embedx_dim "
                     "with same sparse table name is not set in config_fleet.py. "
                     "Hence automatically set sparse_embedx_dim = {}.".format(
-                        table_name, emb_to_size[table_name], emb_to_size[
-                            table_name]))
+                        table_name, emb_to_size[table_name],
+                        emb_to_size[table_name]))
                 st["sparse_embedx_dim"] = emb_to_size[table_name]
 
         return strategy
@@ -439,9 +439,9 @@ def _minimize(self,
                 parameters = parameter_list[num]
             prog_id = str(id(loss.block.program))
             # param_grads of program
-            params_grads = sorted(
-                fluid.backward.append_backward(loss, parameters, no_grad_set),
-                key=lambda x: x[0].name)
+            params_grads = sorted(fluid.backward.append_backward(
+                loss, parameters, no_grad_set),
+                                  key=lambda x: x[0].name)
 
             flag_use_ps_gpu = strategy.get("use_ps_gpu", False)
             if flag_use_ps_gpu:
@@ -455,7 +455,7 @@ def _minimize(self,
                 embedding_table = self._find_multi_distributed_lookup_table(
                     [loss])
                 self._remove_optimize_op_for_embedding(loss, embedding_table)
-            # has condition_block op means multi-task 
+            # has condition_block op means multi-task
             flag_multi_task = self._has_conditional_block(loss)
             if flag_multi_task:
                 self._cond_params = dict()
@@ -593,25 +593,25 @@ def _minimize(self,
                         or accessor == "DownpourUnitAccessor":
                     if st.get("sparse_embedx_dim") is not None \
                             and st["sparse_embedx_dim"] != emb_to_size[key] - 3:
-                        raise ValueError("fleet config sparse_embedx_dim=%s not"
-                                         " equal to embedding size - 3 = %s" %
-                                         (st["sparse_embedx_dim"],
-                                          emb_to_size[key] - 3))
+                        raise ValueError(
+                            "fleet config sparse_embedx_dim=%s not"
+                            " equal to embedding size - 3 = %s" %
+                            (st["sparse_embedx_dim"], emb_to_size[key] - 3))
                     st["sparse_embedx_dim"] = emb_to_size[key] - 3
                 elif accessor == "DownpourSparseValueAccessor":
                     if st.get("sparse_embedx_dim") is not None \
                             and st["sparse_embedx_dim"] != emb_to_size[key]:
-                        raise ValueError("fleet config sparse_embedx_dim=%s not"
-                                         " equal to embedding size = %s" %
-                                         (st["sparse_embedx_dim"],
-                                          emb_to_size[key]))
+                        raise ValueError(
+                            "fleet config sparse_embedx_dim=%s not"
+                            " equal to embedding size = %s" %
+                            (st["sparse_embedx_dim"], emb_to_size[key]))
                     st["sparse_embedx_dim"] = emb_to_size[key]
 
         # ServerParameter add all sparse tables
         for tn in sparse_table_to_index:
             sparse_table_index = sparse_table_to_index[tn]
-            st = self._check_config_fleet_with_program_op(strategy, tn,
-                                                          emb_to_size)
+            st = self._check_config_fleet_with_program_op(
+                strategy, tn, emb_to_size)
             if st.get(tn) is not None:
                 server.add_sparse_table(sparse_table_index, st[tn])
             else:
@@ -692,22 +692,25 @@ def _minimize(self,
                         if flag_multi_task:
                             server_dense_table_index = dense_table_index
                             if len(root_params_list) > 0:
-                                server.add_dense_table(
-                                    server_dense_table_index, root_params_list,
-                                    root_grads_list, strategy['dense_table'],
-                                    sparse_table_names)
+                                server.add_dense_table(server_dense_table_index,
+                                                       root_params_list,
+                                                       root_grads_list,
+                                                       strategy['dense_table'],
+                                                       sparse_table_names)
                                 server_dense_table_index += 1
 
                             for i in range(len(lists_params)):
-                                server.add_dense_table(
-                                    server_dense_table_index, lists_params[i],
-                                    lists_grads[i], strategy['dense_table'],
-                                    sparse_table_names)
+                                server.add_dense_table(server_dense_table_index,
+                                                       lists_params[i],
+                                                       lists_grads[i],
+                                                       strategy['dense_table'],
+                                                       sparse_table_names)
                                 server_dense_table_index += 1
                         else:
-                            server.add_dense_table(
-                                dense_table_index, params, grads,
-                                strategy['dense_table'], sparse_table_names)
+                            server.add_dense_table(dense_table_index, params,
+                                                   grads,
+                                                   strategy['dense_table'],
+                                                   sparse_table_names)
 
                     else:
                         server.add_dense_table(dense_table_index, params, grads,
@@ -716,24 +719,29 @@ def _minimize(self,
                     if flag_multi_task:
 
                         if len(root_params_list) > 0:
-                            worker.add_dense_table(
-                                dense_table_index, self._learning_rate,
-                                root_params_list, root_grads_list,
-                                dense_start_table_id, sparse_table_names)
+                            worker.add_dense_table(dense_table_index,
+                                                   self._learning_rate,
+                                                   root_params_list,
+                                                   root_grads_list,
+                                                   dense_start_table_id,
+                                                   sparse_table_names)
                             dense_table_index += 1
 
                         for i in range(len(lists_params)):
-                            worker.add_dense_table(
-                                dense_table_index, self._learning_rate,
-                                lists_params[i], lists_grads[i],
-                                dense_start_table_id, sparse_table_names)
+                            worker.add_dense_table(dense_table_index,
+                                                   self._learning_rate,
+                                                   lists_params[i],
+                                                   lists_grads[i],
+                                                   dense_start_table_id,
+                                                   sparse_table_names)
                             dense_table_index += 1
 
                         dense_table_index -= 1
                     else:
-                        worker.add_dense_table(
-                            dense_table_index, self._learning_rate, params,
-                            grads, dense_start_table_id, sparse_table_names)
+                        worker.add_dense_table(dense_table_index,
+                                               self._learning_rate, params,
+                                               grads, dense_start_table_id,
+                                               sparse_table_names)
 
                     if FLEET_GLOBAL_DICT["enable"]:
                         cur_prog = losses[loss_index].block.program
@@ -749,8 +757,8 @@ def _minimize(self,
 
                     if "pull_dense" in program_configs[
                             program_id] and "push_dense" in program_configs[
-                                program_id] and len(program_configs[program_id][
-                                    "pull_dense"]) > 0:
+                                program_id] and len(program_configs[program_id]
+                                                    ["pull_dense"]) > 0:
                         if flag_multi_task:
                             program_configs[program_id]["pull_dense"].extend(
                                 multi_task_dense_tables_pull)
@@ -768,10 +776,12 @@ def _minimize(self,
                             program_configs[program_id][
                                 "push_dense"] = multi_task_dense_tables_push
                         else:
-                            program_configs[program_id][
-                                "pull_dense"] = [dense_table_index]
-                            program_configs[program_id][
-                                "push_dense"] = [dense_table_index]
+                            program_configs[program_id]["pull_dense"] = [
+                                dense_table_index
+                            ]
+                            program_configs[program_id]["push_dense"] = [
+                                dense_table_index
+                            ]
 
                     if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                         dense_table_index += 1
@@ -781,15 +791,18 @@ def _minimize(self,
                                 data_norm_params, data_norm_grads,
                                 strategy['datanorm_table'], sparse_table_names)
                         else:
-                            server.add_data_norm_table(
-                                dense_table_index, self._learning_rate,
-                                data_norm_params, data_norm_grads, None,
-                                sparse_table_names)
-
-                        worker.add_dense_table(
-                            dense_table_index, self._learning_rate,
-                            data_norm_params, data_norm_grads,
-                            dense_start_table_id, sparse_table_names)
+                            server.add_data_norm_table(dense_table_index,
+                                                       self._learning_rate,
+                                                       data_norm_params,
+                                                       data_norm_grads, None,
+                                                       sparse_table_names)
+
+                        worker.add_dense_table(dense_table_index,
+                                               self._learning_rate,
+                                               data_norm_params,
+                                               data_norm_grads,
+                                               dense_start_table_id,
+                                               sparse_table_names)
 
                         if FLEET_GLOBAL_DICT["enable"]:
                             cur_prog = losses[loss_index].block.program
@@ -799,7 +812,8 @@ def _minimize(self,
                                 attrs={
                                     "InputNames":
                                     [i.name for i in data_norm_grads],
-                                    "TableId": dense_table_index,
+                                    "TableId":
+                                    dense_table_index,
                                     "ScaleDataNorm":
                                     strategy.get("scale_datanorm", -1)
                                 })
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
index 363475b3013b9..eec51ef827c57 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
@@ -16,6 +16,7 @@
 # source: ps.proto
 
 import sys
+
 _b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1'))
 from google.protobuf.internal import enum_type_wrapper
 from google.protobuf import descriptor as _descriptor
@@ -42,15 +43,22 @@
     filename=None,
     file=DESCRIPTOR,
     values=[
-        _descriptor.EnumValueDescriptor(
-            name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SPARSE_TABLE',
+                                        index=0,
+                                        number=0,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_DENSE_TABLE',
+                                        index=1,
+                                        number=1,
+                                        options=None,
+                                        type=None),
     ],
     containing_type=None,
     options=None,
     serialized_start=4679,
-    serialized_end=4731, )
+    serialized_end=4731,
+)
 _sym_db.RegisterEnumDescriptor(_TABLETYPE)
 
 TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE)
@@ -60,103 +68,96 @@
     filename=None,
     file=DESCRIPTOR,
     values=[
-        _descriptor.EnumValueDescriptor(
-            name='PS_PULL_DENSE_TABLE',
-            index=0,
-            number=0,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PUSH_DENSE_TABLE',
-            index=1,
-            number=1,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PULL_SPARSE_TABLE',
-            index=2,
-            number=2,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PUSH_SPARSE_TABLE',
-            index=3,
-            number=3,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SAVE_ONE_TABLE',
-            index=5,
-            number=5,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SAVE_ALL_TABLE',
-            index=6,
-            number=6,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_LOAD_ONE_TABLE',
-            index=7,
-            number=7,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_LOAD_ALL_TABLE',
-            index=8,
-            number=8,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_CLEAR_ONE_TABLE',
-            index=9,
-            number=9,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_CLEAR_ALL_TABLE',
-            index=10,
-            number=10,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PUSH_DENSE_PARAM',
-            index=11,
-            number=11,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_STOP_SERVER', index=12, number=12, options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_SAVE_ONE_CACHE_TABLE',
-            index=13,
-            number=13,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_GET_CACHE_THRESHOLD',
-            index=14,
-            number=14,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_CACHE_SHUFFLE',
-            index=15,
-            number=15,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_COPY_TABLE', index=16, number=16, options=None, type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_COPY_TABLE_BY_FEASIGN',
-            index=17,
-            number=17,
-            options=None,
-            type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PULL_DENSE_TABLE',
+                                        index=0,
+                                        number=0,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PUSH_DENSE_TABLE',
+                                        index=1,
+                                        number=1,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PULL_SPARSE_TABLE',
+                                        index=2,
+                                        number=2,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PUSH_SPARSE_TABLE',
+                                        index=3,
+                                        number=3,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SHRINK_TABLE',
+                                        index=4,
+                                        number=4,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SAVE_ONE_TABLE',
+                                        index=5,
+                                        number=5,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SAVE_ALL_TABLE',
+                                        index=6,
+                                        number=6,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_LOAD_ONE_TABLE',
+                                        index=7,
+                                        number=7,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_LOAD_ALL_TABLE',
+                                        index=8,
+                                        number=8,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_CLEAR_ONE_TABLE',
+                                        index=9,
+                                        number=9,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_CLEAR_ALL_TABLE',
+                                        index=10,
+                                        number=10,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PUSH_DENSE_PARAM',
+                                        index=11,
+                                        number=11,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_STOP_SERVER',
+                                        index=12,
+                                        number=12,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_SAVE_ONE_CACHE_TABLE',
+                                        index=13,
+                                        number=13,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_GET_CACHE_THRESHOLD',
+                                        index=14,
+                                        number=14,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_CACHE_SHUFFLE',
+                                        index=15,
+                                        number=15,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_COPY_TABLE',
+                                        index=16,
+                                        number=16,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_COPY_TABLE_BY_FEASIGN',
+                                        index=17,
+                                        number=17,
+                                        options=None,
+                                        type=None),
         _descriptor.EnumValueDescriptor(
             name='PS_PULL_SPARSE_TABLE_WITH_DEPENDENCY',
             index=18,
@@ -169,19 +170,22 @@
             number=19,
             options=None,
             type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_PRINT_TABLE_STAT',
-            index=20,
-            number=20,
-            options=None,
-            type=None),
-        _descriptor.EnumValueDescriptor(
-            name='PS_S2S_MSG', index=21, number=101, options=None, type=None),
+        _descriptor.EnumValueDescriptor(name='PS_PRINT_TABLE_STAT',
+                                        index=20,
+                                        number=20,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='PS_S2S_MSG',
+                                        index=21,
+                                        number=101,
+                                        options=None,
+                                        type=None),
     ],
     containing_type=None,
     options=None,
     serialized_start=4734,
-    serialized_end=5304, )
+    serialized_end=5304,
+)
 _sym_db.RegisterEnumDescriptor(_PSCMDID)
 
 PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID)
@@ -216,15 +220,22 @@
     filename=None,
     file=DESCRIPTOR,
     values=[
-        _descriptor.EnumValueDescriptor(
-            name='HDFS', index=0, number=0, options=None, type=None),
-        _descriptor.EnumValueDescriptor(
-            name='AFS', index=1, number=1, options=None, type=None),
+        _descriptor.EnumValueDescriptor(name='HDFS',
+                                        index=0,
+                                        number=0,
+                                        options=None,
+                                        type=None),
+        _descriptor.EnumValueDescriptor(name='AFS',
+                                        index=1,
+                                        number=1,
+                                        options=None,
+                                        type=None),
     ],
     containing_type=None,
     options=None,
     serialized_start=4647,
-    serialized_end=4677, )
+    serialized_end=4677,
+)
 _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE)
 
 _PSPARAMETER = _descriptor.Descriptor(
@@ -234,38 +245,36 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='worker_class',
-            full_name='paddle.PSParameter.worker_class',
-            index=0,
-            number=1,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='server_class',
-            full_name='paddle.PSParameter.server_class',
-            index=1,
-            number=2,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='worker_class',
+                                    full_name='paddle.PSParameter.worker_class',
+                                    index=0,
+                                    number=1,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='server_class',
+                                    full_name='paddle.PSParameter.server_class',
+                                    index=1,
+                                    number=2,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='instance_class',
             full_name='paddle.PSParameter.instance_class',
@@ -282,54 +291,51 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='init_gflags',
-            full_name='paddle.PSParameter.init_gflags',
-            index=3,
-            number=4,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=True,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='worker_param',
-            full_name='paddle.PSParameter.worker_param',
-            index=4,
-            number=101,
-            type=11,
-            cpp_type=10,
-            label=1,
-            has_default_value=False,
-            default_value=None,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='server_param',
-            full_name='paddle.PSParameter.server_param',
-            index=5,
-            number=102,
-            type=11,
-            cpp_type=10,
-            label=1,
-            has_default_value=False,
-            default_value=None,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='init_gflags',
+                                    full_name='paddle.PSParameter.init_gflags',
+                                    index=3,
+                                    number=4,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=True,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='worker_param',
+                                    full_name='paddle.PSParameter.worker_param',
+                                    index=4,
+                                    number=101,
+                                    type=11,
+                                    cpp_type=10,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=None,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='server_param',
+                                    full_name='paddle.PSParameter.server_param',
+                                    index=5,
+                                    number=102,
+                                    type=11,
+                                    cpp_type=10,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=None,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='trainer_param',
             full_name='paddle.PSParameter.trainer_param',
@@ -372,7 +378,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=21,
-    serialized_end=330, )
+    serialized_end=330,
+)
 
 _WORKERPARAMETER = _descriptor.Descriptor(
     name='WorkerParameter',
@@ -407,7 +414,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=332,
-    serialized_end=413, )
+    serialized_end=413,
+)
 
 _SERVERPARAMETER = _descriptor.Descriptor(
     name='ServerParameter',
@@ -442,7 +450,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=415,
-    serialized_end=496, )
+    serialized_end=496,
+)
 
 _DOWNPOURWORKERPARAMETER = _descriptor.Descriptor(
     name='DownpourWorkerParameter',
@@ -477,7 +486,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=498,
-    serialized_end=577, )
+    serialized_end=577,
+)
 
 _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor(
     name='DownpourTrainerParameter',
@@ -592,7 +602,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=580,
-    serialized_end=833, )
+    serialized_end=833,
+)
 
 _PROGRAMCONFIG = _descriptor.Descriptor(
     name='ProgramConfig',
@@ -601,22 +612,21 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='program_id',
-            full_name='paddle.ProgramConfig.program_id',
-            index=0,
-            number=1,
-            type=9,
-            cpp_type=9,
-            label=2,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='program_id',
+                                    full_name='paddle.ProgramConfig.program_id',
+                                    index=0,
+                                    number=1,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=2,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='push_sparse_table_id',
             full_name='paddle.ProgramConfig.push_sparse_table_id',
@@ -691,7 +701,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=836,
-    serialized_end=989, )
+    serialized_end=989,
+)
 
 _DENSETABLEPARAMETER = _descriptor.Descriptor(
     name='DenseTableParameter',
@@ -774,7 +785,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=991,
-    serialized_end=1114, )
+    serialized_end=1114,
+)
 
 _SPARSETABLEPARAMETER = _descriptor.Descriptor(
     name='SparseTableParameter',
@@ -873,7 +885,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1116,
-    serialized_end=1238, )
+    serialized_end=1238,
+)
 
 _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor(
     name='DownpourServerParameter',
@@ -924,7 +937,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1241,
-    serialized_end=1375, )
+    serialized_end=1375,
+)
 
 _SERVERSERVICEPARAMETER = _descriptor.Descriptor(
     name='ServerServiceParameter',
@@ -1023,7 +1037,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1378,
-    serialized_end=1593, )
+    serialized_end=1593,
+)
 
 _TABLEPARAMETER = _descriptor.Descriptor(
     name='TableParameter',
@@ -1032,22 +1047,21 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='table_id',
-            full_name='paddle.TableParameter.table_id',
-            index=0,
-            number=1,
-            type=4,
-            cpp_type=4,
-            label=1,
-            has_default_value=False,
-            default_value=0,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='table_id',
+                                    full_name='paddle.TableParameter.table_id',
+                                    index=0,
+                                    number=1,
+                                    type=4,
+                                    cpp_type=4,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=0,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='table_class',
             full_name='paddle.TableParameter.table_class',
@@ -1064,54 +1078,51 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='shard_num',
-            full_name='paddle.TableParameter.shard_num',
-            index=2,
-            number=3,
-            type=4,
-            cpp_type=4,
-            label=1,
-            has_default_value=True,
-            default_value=1000,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='accessor',
-            full_name='paddle.TableParameter.accessor',
-            index=3,
-            number=4,
-            type=11,
-            cpp_type=10,
-            label=1,
-            has_default_value=False,
-            default_value=None,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='type',
-            full_name='paddle.TableParameter.type',
-            index=4,
-            number=5,
-            type=14,
-            cpp_type=8,
-            label=1,
-            has_default_value=False,
-            default_value=0,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='shard_num',
+                                    full_name='paddle.TableParameter.shard_num',
+                                    index=2,
+                                    number=3,
+                                    type=4,
+                                    cpp_type=4,
+                                    label=1,
+                                    has_default_value=True,
+                                    default_value=1000,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='accessor',
+                                    full_name='paddle.TableParameter.accessor',
+                                    index=3,
+                                    number=4,
+                                    type=11,
+                                    cpp_type=10,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=None,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='type',
+                                    full_name='paddle.TableParameter.type',
+                                    index=4,
+                                    number=5,
+                                    type=14,
+                                    cpp_type=8,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=0,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='compress_in_save',
             full_name='paddle.TableParameter.compress_in_save',
@@ -1186,7 +1197,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1596,
-    serialized_end=1916, )
+    serialized_end=1916,
+)
 
 _TABLEACCESSORPARAMETER = _descriptor.Descriptor(
     name='TableAccessorParameter',
@@ -1381,7 +1393,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=1919,
-    serialized_end=2496, )
+    serialized_end=2496,
+)
 
 _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor(
     name='DownpourTableAccessorParameter',
@@ -1472,7 +1485,8 @@
             options=None),
         _descriptor.FieldDescriptor(
             name='show_click_decay_rate',
-            full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate',
+            full_name=
+            'paddle.DownpourTableAccessorParameter.show_click_decay_rate',
             index=5,
             number=6,
             type=2,
@@ -1504,7 +1518,8 @@
             options=None),
         _descriptor.FieldDescriptor(
             name='delete_after_unseen_days',
-            full_name='paddle.DownpourTableAccessorParameter.delete_after_unseen_days',
+            full_name=
+            'paddle.DownpourTableAccessorParameter.delete_after_unseen_days',
             index=7,
             number=8,
             type=2,
@@ -1520,7 +1535,8 @@
             options=None),
         _descriptor.FieldDescriptor(
             name='ssd_unseenday_threshold',
-            full_name='paddle.DownpourTableAccessorParameter.ssd_unseenday_threshold',
+            full_name=
+            'paddle.DownpourTableAccessorParameter.ssd_unseenday_threshold',
             index=8,
             number=9,
             type=5,
@@ -1544,7 +1560,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2499,
-    serialized_end=2813, )
+    serialized_end=2813,
+)
 
 _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor(
     name='TableAccessorSaveParameter',
@@ -1611,7 +1628,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2815,
-    serialized_end=2898, )
+    serialized_end=2898,
+)
 
 _PSREQUESTMESSAGE = _descriptor.Descriptor(
     name='PsRequestMessage',
@@ -1620,22 +1638,21 @@
     file=DESCRIPTOR,
     containing_type=None,
     fields=[
-        _descriptor.FieldDescriptor(
-            name='cmd_id',
-            full_name='paddle.PsRequestMessage.cmd_id',
-            index=0,
-            number=1,
-            type=13,
-            cpp_type=3,
-            label=2,
-            has_default_value=False,
-            default_value=0,
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='cmd_id',
+                                    full_name='paddle.PsRequestMessage.cmd_id',
+                                    index=0,
+                                    number=1,
+                                    type=13,
+                                    cpp_type=3,
+                                    label=2,
+                                    has_default_value=False,
+                                    default_value=0,
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='table_id',
             full_name='paddle.PsRequestMessage.table_id',
@@ -1652,22 +1669,21 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='params',
-            full_name='paddle.PsRequestMessage.params',
-            index=2,
-            number=3,
-            type=12,
-            cpp_type=9,
-            label=3,
-            has_default_value=False,
-            default_value=[],
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='params',
+                                    full_name='paddle.PsRequestMessage.params',
+                                    index=2,
+                                    number=3,
+                                    type=12,
+                                    cpp_type=9,
+                                    label=3,
+                                    has_default_value=False,
+                                    default_value=[],
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='client_id',
             full_name='paddle.PsRequestMessage.client_id',
@@ -1684,22 +1700,21 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='data',
-            full_name='paddle.PsRequestMessage.data',
-            index=4,
-            number=5,
-            type=12,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b(""),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='data',
+                                    full_name='paddle.PsRequestMessage.data',
+                                    index=4,
+                                    number=5,
+                                    type=12,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b(""),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
     ],
     extensions=[],
     nested_types=[],
@@ -1710,7 +1725,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=2900,
-    serialized_end=3001, )
+    serialized_end=3001,
+)
 
 _SPARSESGDRULEPARAMETER = _descriptor.Descriptor(
     name='SparseSGDRuleParameter',
@@ -1793,7 +1809,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3004,
-    serialized_end=3137, )
+    serialized_end=3137,
+)
 
 _SPARSECOMMONSGDRULEPARAMETER = _descriptor.Descriptor(
     name='SparseCommonSGDRuleParameter',
@@ -1876,7 +1893,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3140,
-    serialized_end=3338, )
+    serialized_end=3338,
+)
 
 _SPARSENAIVESGDRULEPARAMETER = _descriptor.Descriptor(
     name='SparseNaiveSGDRuleParameter',
@@ -1943,7 +1961,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3340,
-    serialized_end=3452, )
+    serialized_end=3452,
+)
 
 _SPARSEADAGRADSGDRULEPARAMETER = _descriptor.Descriptor(
     name='SparseAdagradSGDRuleParameter',
@@ -2026,7 +2045,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3455,
-    serialized_end=3595, )
+    serialized_end=3595,
+)
 
 _SPARSEADAMSGDPARAMETER = _descriptor.Descriptor(
     name='SparseAdamSGDParameter',
@@ -2141,7 +2161,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3598,
-    serialized_end=3798, )
+    serialized_end=3798,
+)
 
 _DENSESGDRULEPARAMETER = _descriptor.Descriptor(
     name='DenseSGDRuleParameter',
@@ -2240,7 +2261,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=3801,
-    serialized_end=4026, )
+    serialized_end=4026,
+)
 
 _ADAMSGDPARAMETER = _descriptor.Descriptor(
     name='AdamSGDParameter',
@@ -2339,7 +2361,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=4029,
-    serialized_end=4201, )
+    serialized_end=4201,
+)
 
 _NAIVESGDPARAMETER = _descriptor.Descriptor(
     name='NaiveSGDParameter',
@@ -2390,7 +2413,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=4203,
-    serialized_end=4277, )
+    serialized_end=4277,
+)
 
 _SUMMARYSGDPARAMETER = _descriptor.Descriptor(
     name='SummarySGDParameter',
@@ -2425,7 +2449,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=4279,
-    serialized_end=4338, )
+    serialized_end=4338,
+)
 
 _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor(
     name='MovingAverageRuleParameter',
@@ -2460,7 +2485,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=4340,
-    serialized_end=4386, )
+    serialized_end=4386,
+)
 
 _PSRESPONSEMESSAGE = _descriptor.Descriptor(
     name='PsResponseMessage',
@@ -2501,22 +2527,21 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='data',
-            full_name='paddle.PsResponseMessage.data',
-            index=2,
-            number=3,
-            type=12,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b(""),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='data',
+                                    full_name='paddle.PsResponseMessage.data',
+                                    index=2,
+                                    number=3,
+                                    type=12,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b(""),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
     ],
     extensions=[],
     nested_types=[],
@@ -2527,7 +2552,8 @@
     extension_ranges=[],
     oneofs=[],
     serialized_start=4388,
-    serialized_end=4461, )
+    serialized_end=4461,
+)
 
 _FSCLIENTPARAMETER = _descriptor.Descriptor(
     name='FsClientParameter',
@@ -2552,54 +2578,51 @@
             is_extension=False,
             extension_scope=None,
             options=None),
-        _descriptor.FieldDescriptor(
-            name='uri',
-            full_name='paddle.FsClientParameter.uri',
-            index=1,
-            number=2,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='user',
-            full_name='paddle.FsClientParameter.user',
-            index=2,
-            number=3,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
-        _descriptor.FieldDescriptor(
-            name='passwd',
-            full_name='paddle.FsClientParameter.passwd',
-            index=3,
-            number=4,
-            type=9,
-            cpp_type=9,
-            label=1,
-            has_default_value=False,
-            default_value=_b("").decode('utf-8'),
-            message_type=None,
-            enum_type=None,
-            containing_type=None,
-            is_extension=False,
-            extension_scope=None,
-            options=None),
+        _descriptor.FieldDescriptor(name='uri',
+                                    full_name='paddle.FsClientParameter.uri',
+                                    index=1,
+                                    number=2,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='user',
+                                    full_name='paddle.FsClientParameter.user',
+                                    index=2,
+                                    number=3,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
+        _descriptor.FieldDescriptor(name='passwd',
+                                    full_name='paddle.FsClientParameter.passwd',
+                                    index=3,
+                                    number=4,
+                                    type=9,
+                                    cpp_type=9,
+                                    label=1,
+                                    has_default_value=False,
+                                    default_value=_b("").decode('utf-8'),
+                                    message_type=None,
+                                    enum_type=None,
+                                    containing_type=None,
+                                    is_extension=False,
+                                    extension_scope=None,
+                                    options=None),
         _descriptor.FieldDescriptor(
             name='buffer_size',
             full_name='paddle.FsClientParameter.buffer_size',
@@ -2651,14 +2674,17 @@
     ],
     extensions=[],
     nested_types=[],
-    enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ],
+    enum_types=[
+        _FSCLIENTPARAMETER_FSAPITYPE,
+    ],
     options=None,
     is_extendable=False,
     syntax='proto2',
     extension_ranges=[],
     oneofs=[],
     serialized_start=4464,
-    serialized_end=4677, )
+    serialized_end=4677,
+)
 
 _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER
 _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER
@@ -2760,121 +2786,109 @@
 PSParameter = _reflection.GeneratedProtocolMessageType(
     'PSParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PSPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.PSParameter)
-    ))
+    dict(DESCRIPTOR=_PSPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.PSParameter)
+         ))
 _sym_db.RegisterMessage(PSParameter)
 
 WorkerParameter = _reflection.GeneratedProtocolMessageType(
     'WorkerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_WORKERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
-    ))
+    dict(DESCRIPTOR=_WORKERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.WorkerParameter)
+         ))
 _sym_db.RegisterMessage(WorkerParameter)
 
 ServerParameter = _reflection.GeneratedProtocolMessageType(
     'ServerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SERVERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
-    ))
+    dict(DESCRIPTOR=_SERVERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.ServerParameter)
+         ))
 _sym_db.RegisterMessage(ServerParameter)
 
 DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType(
     'DownpourWorkerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DOWNPOURWORKERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
-    ))
+    dict(DESCRIPTOR=_DOWNPOURWORKERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter)
+         ))
 _sym_db.RegisterMessage(DownpourWorkerParameter)
 
 DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType(
     'DownpourTrainerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DOWNPOURTRAINERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
-    ))
+    dict(DESCRIPTOR=_DOWNPOURTRAINERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter)
+         ))
 _sym_db.RegisterMessage(DownpourTrainerParameter)
 
 ProgramConfig = _reflection.GeneratedProtocolMessageType(
     'ProgramConfig',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PROGRAMCONFIG,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.ProgramConfig)
-    ))
+    dict(DESCRIPTOR=_PROGRAMCONFIG,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.ProgramConfig)
+         ))
 _sym_db.RegisterMessage(ProgramConfig)
 
 DenseTableParameter = _reflection.GeneratedProtocolMessageType(
     'DenseTableParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DENSETABLEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
-    ))
+    dict(DESCRIPTOR=_DENSETABLEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter)
+         ))
 _sym_db.RegisterMessage(DenseTableParameter)
 
 SparseTableParameter = _reflection.GeneratedProtocolMessageType(
     'SparseTableParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SPARSETABLEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
-    ))
+    dict(DESCRIPTOR=_SPARSETABLEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter)
+         ))
 _sym_db.RegisterMessage(SparseTableParameter)
 
 DownpourServerParameter = _reflection.GeneratedProtocolMessageType(
     'DownpourServerParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DOWNPOURSERVERPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
-    ))
+    dict(DESCRIPTOR=_DOWNPOURSERVERPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter)
+         ))
 _sym_db.RegisterMessage(DownpourServerParameter)
 
 ServerServiceParameter = _reflection.GeneratedProtocolMessageType(
     'ServerServiceParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SERVERSERVICEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
-    ))
+    dict(DESCRIPTOR=_SERVERSERVICEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter)
+         ))
 _sym_db.RegisterMessage(ServerServiceParameter)
 
 TableParameter = _reflection.GeneratedProtocolMessageType(
     'TableParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_TABLEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.TableParameter)
-    ))
+    dict(DESCRIPTOR=_TABLEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.TableParameter)
+         ))
 _sym_db.RegisterMessage(TableParameter)
 
 TableAccessorParameter = _reflection.GeneratedProtocolMessageType(
     'TableAccessorParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_TABLEACCESSORPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
-    ))
+    dict(DESCRIPTOR=_TABLEACCESSORPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter)
+         ))
 _sym_db.RegisterMessage(TableAccessorParameter)
 
 DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType(
@@ -2900,21 +2914,19 @@
 PsRequestMessage = _reflection.GeneratedProtocolMessageType(
     'PsRequestMessage',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PSREQUESTMESSAGE,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
-    ))
+    dict(DESCRIPTOR=_PSREQUESTMESSAGE,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage)
+         ))
 _sym_db.RegisterMessage(PsRequestMessage)
 
 SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
     'SparseSGDRuleParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SPARSESGDRULEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
-    ))
+    dict(DESCRIPTOR=_SPARSESGDRULEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter)
+         ))
 _sym_db.RegisterMessage(SparseSGDRuleParameter)
 
 SparseCommonSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
@@ -2950,51 +2962,46 @@
 SparseAdamSGDParameter = _reflection.GeneratedProtocolMessageType(
     'SparseAdamSGDParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SPARSEADAMSGDPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.SparseAdamSGDParameter)
-    ))
+    dict(DESCRIPTOR=_SPARSEADAMSGDPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.SparseAdamSGDParameter)
+         ))
 _sym_db.RegisterMessage(SparseAdamSGDParameter)
 
 DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType(
     'DenseSGDRuleParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_DENSESGDRULEPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
-    ))
+    dict(DESCRIPTOR=_DENSESGDRULEPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter)
+         ))
 _sym_db.RegisterMessage(DenseSGDRuleParameter)
 
 AdamSGDParameter = _reflection.GeneratedProtocolMessageType(
     'AdamSGDParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_ADAMSGDPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
-    ))
+    dict(DESCRIPTOR=_ADAMSGDPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter)
+         ))
 _sym_db.RegisterMessage(AdamSGDParameter)
 
 NaiveSGDParameter = _reflection.GeneratedProtocolMessageType(
     'NaiveSGDParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_NAIVESGDPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
-    ))
+    dict(DESCRIPTOR=_NAIVESGDPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter)
+         ))
 _sym_db.RegisterMessage(NaiveSGDParameter)
 
 SummarySGDParameter = _reflection.GeneratedProtocolMessageType(
     'SummarySGDParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_SUMMARYSGDPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
-    ))
+    dict(DESCRIPTOR=_SUMMARYSGDPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter)
+         ))
 _sym_db.RegisterMessage(SummarySGDParameter)
 
 MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType(
@@ -3010,21 +3017,19 @@
 PsResponseMessage = _reflection.GeneratedProtocolMessageType(
     'PsResponseMessage',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_PSRESPONSEMESSAGE,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
-    ))
+    dict(DESCRIPTOR=_PSRESPONSEMESSAGE,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage)
+         ))
 _sym_db.RegisterMessage(PsResponseMessage)
 
 FsClientParameter = _reflection.GeneratedProtocolMessageType(
     'FsClientParameter',
     (_message.Message, ),
-    dict(
-        DESCRIPTOR=_FSCLIENTPARAMETER,
-        __module__='ps_pb2'
-        # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
-    ))
+    dict(DESCRIPTOR=_FSCLIENTPARAMETER,
+         __module__='ps_pb2'
+         # @@protoc_insertion_point(class_scope:paddle.FsClientParameter)
+         ))
 _sym_db.RegisterMessage(FsClientParameter)
 
 DESCRIPTOR.has_options = True
diff --git a/python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py b/python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py
index 83343933074c0..95d47ee9baac6 100644
--- a/python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py
+++ b/python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py
@@ -22,8 +22,9 @@
 import paddle.distributed.fleet as fleet
 from paddle.fluid.log_helper import get_logger
 
-logger = get_logger(
-    "paddle", logging.INFO, fmt='%(asctime)s - %(levelname)s - %(message)s')
+logger = get_logger("paddle",
+                    logging.INFO,
+                    fmt='%(asctime)s - %(levelname)s - %(message)s')
 
 DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz"
 DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
@@ -60,7 +61,9 @@ def load_lr_input_record(sent):
 
 
 class DatasetCtrReader(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def iter():
             fs = line.strip().split('\t')
             dnn_input = load_dnn_input_record(fs[0])
@@ -84,8 +87,7 @@ def prepare_data():
         lines = f.readlines()
     err_info = "wrong meta format"
     assert len(lines) == 2, err_info
-    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
-        1], err_info
+    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[1], err_info
     res = map(int, [_.split(':')[1] for _ in lines])
     res = list(res)
     dnn_input_dim = res[0]
diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
index 06a90b78fd2e5..806de1e6da900 100644
--- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
+++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
@@ -25,8 +25,9 @@
 
 import ctr_dataset_reader
 
-logger = get_logger(
-    "fluid", logging.INFO, fmt='%(asctime)s - %(levelname)s - %(message)s')
+logger = get_logger("fluid",
+                    logging.INFO,
+                    fmt='%(asctime)s - %(levelname)s - %(message)s')
 
 
 def parse_args():
@@ -48,16 +49,14 @@ def parse_args():
         type=str,
         default='127.0.0.1:6000',
         help='The path for model to store (default: 127.0.0.1:6000)')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trainers, (default: 1)')
+    parser.add_argument('--trainer_id',
+                        type=int,
+                        default=0,
+                        help='The path for model to store (default: models)')
+    parser.add_argument('--trainers',
+                        type=int,
+                        default=1,
+                        help='The num of trainers, (default: 1)')
 
     return parser.parse_args()
 
@@ -66,24 +65,21 @@ def model():
     dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
     )
     """ network definition """
-    dnn_data = fluid.layers.data(
-        name="dnn_data",
-        shape=[-1, 1],
-        dtype="int64",
-        lod_level=1,
-        append_batch_size=False)
-    lr_data = fluid.layers.data(
-        name="lr_data",
-        shape=[-1, 1],
-        dtype="int64",
-        lod_level=1,
-        append_batch_size=False)
-    label = fluid.layers.data(
-        name="click",
-        shape=[-1, 1],
-        dtype="int64",
-        lod_level=0,
-        append_batch_size=False)
+    dnn_data = fluid.layers.data(name="dnn_data",
+                                 shape=[-1, 1],
+                                 dtype="int64",
+                                 lod_level=1,
+                                 append_batch_size=False)
+    lr_data = fluid.layers.data(name="lr_data",
+                                shape=[-1, 1],
+                                dtype="int64",
+                                lod_level=1,
+                                append_batch_size=False)
+    label = fluid.layers.data(name="click",
+                              shape=[-1, 1],
+                              dtype="int64",
+                              lod_level=0,
+                              append_batch_size=False)
 
     datas = [dnn_data, lr_data, label]
 
@@ -104,8 +100,8 @@ def model():
             input=dnn_out,
             size=dim,
             act="relu",
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.01)),
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.01)),
             name='dnn-fc-%d' % i)
         dnn_out = fc
 
@@ -186,16 +182,15 @@ def train(args):
             logger.info("epoch {} start".format(epoch_id))
             pass_start = time.time()
             dataset.set_filelist(filelist)
-            exe.train_from_dataset(
-                program=fleet.main_program,
-                dataset=dataset,
-                fetch_list=[avg_cost],
-                fetch_info=["cost"],
-                print_period=100,
-                debug=False)
+            exe.train_from_dataset(program=fleet.main_program,
+                                   dataset=dataset,
+                                   fetch_list=[avg_cost],
+                                   fetch_info=["cost"],
+                                   print_period=100,
+                                   debug=False)
             pass_time = time.time() - pass_start
-            logger.info("epoch {} finished, pass_time {}".format(epoch_id,
-                                                                 pass_time))
+            logger.info("epoch {} finished, pass_time {}".format(
+                epoch_id, pass_time))
         fleet.stop_worker()
 
 
diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
index 5fc8fbd011629..48ce51b372489 100644
--- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
+++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
@@ -27,12 +27,14 @@
 from paddle.fluid.log_helper import get_logger
 from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, AFSClient
 from . import utils
+
 OpRole = core.op_proto_and_checker_maker.OpRole
 
 __all__ = ["FleetUtil", "GPUPSUtil"]
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s %(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s %(levelname)s: %(message)s')
 
 fleet = None
 
@@ -777,8 +779,10 @@ def save_cache_model(self, output_path, day, pass_id, mode=1, **kwargs):
         suffix_name = "/%s/delta-%s" % (day, pass_id)
         model_path = output_path.rstrip("/") + suffix_name
         self.rank0_print("going to save_cache_model %s" % model_path)
-        key_num = fleet.save_cache_model(
-            None, model_path, mode=mode, table_id=table_id)
+        key_num = fleet.save_cache_model(None,
+                                         model_path,
+                                         mode=mode,
+                                         table_id=table_id)
         self.rank0_print("save_cache_model done")
         return key_num
 
@@ -809,8 +813,10 @@ def save_cache_base_model(self, output_path, day, **kwargs):
         suffix_name = "/%s/base" % day
         model_path = output_path.rstrip("/") + suffix_name
         self.rank0_print("going to save_cache_base_model %s" % model_path)
-        key_num = fleet.save_cache_model(
-            None, model_path, mode=2, table_id=table_id)
+        key_num = fleet.save_cache_model(None,
+                                         model_path,
+                                         mode=2,
+                                         table_id=table_id)
         self.rank0_print("save_cache_base_model done")
         return key_num
 
@@ -853,8 +859,8 @@ def pull_all_dense_params(self, scope, program):
                                          " not found in scope " +
                                          "when pull dense")
                     var_name_list.append(var_name)
-                fleet._fleet_ptr.pull_dense(scope,
-                                            int(table.table_id), var_name_list)
+                fleet._fleet_ptr.pull_dense(scope, int(table.table_id),
+                                            var_name_list)
         fleet._role_maker._barrier_worker()
 
     def save_paddle_inference_model(self,
@@ -1022,8 +1028,11 @@ def save_paddle_params(self,
             vars = [program.global_block().var(i) for i in var_names]
             with fluid.scope_guard(scope):
                 if save_combine:
-                    fluid.io.save_vars(
-                        executor, "./", program, vars=vars, filename=model_name)
+                    fluid.io.save_vars(executor,
+                                       "./",
+                                       program,
+                                       vars=vars,
+                                       filename=model_name)
                 else:
                     fluid.io.save_vars(executor, model_name, program, vars=vars)
 
@@ -1431,7 +1440,8 @@ def get_metric(name):
 
         return [
             auc, bucket_error, mae, rmse, return_actual_ctr, predicted_ctr,
-            copc, mean_predict_qvalue, int(total_ins_num)
+            copc, mean_predict_qvalue,
+            int(total_ins_num)
         ]
 
     def print_global_metrics(self,
@@ -1523,12 +1533,12 @@ def print_global_metrics(self,
             mean_predict_qvalue, total_ins_num = self.get_global_metrics(\
             scope, stat_pos_name, stat_neg_name, sqrerr_name, abserr_name,\
             prob_name, q_name, pos_ins_num_name, total_ins_num_name)
-        self.rank0_print("%s global AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f "
-                         "RMSE=%.6f Actural_CTR=%.6f Predicted_CTR=%.6f "
-                         "COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s" %
-                         (print_prefix, auc, bucket_error, mae, rmse,
-                          actual_ctr, predicted_ctr, copc, mean_predict_qvalue,
-                          total_ins_num))
+        self.rank0_print(
+            "%s global AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f "
+            "RMSE=%.6f Actural_CTR=%.6f Predicted_CTR=%.6f "
+            "COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s" %
+            (print_prefix, auc, bucket_error, mae, rmse, actual_ctr,
+             predicted_ctr, copc, mean_predict_qvalue, total_ins_num))
 
     def program_type_trans(self, prog_dir, prog_fn, is_text):
         return utils.program_type_trans(prog_dir, prog_fn, is_text)
@@ -1609,8 +1619,8 @@ def split_program_by_device(self, program):
             if self._is_optimizer_op(op):
                 break
             if op.has_attr("op_device"):
-                cur_attr = op.attr("op_device") if op.attr(
-                    "op_device") != "" else type_cpu
+                cur_attr = op.attr(
+                    "op_device") if op.attr("op_device") != "" else type_cpu
                 if pre is None or pre != cur_attr:
                     ops_list.append([])
                     type_list.append(cur_attr)
@@ -1700,8 +1710,8 @@ def split_program_by_device(self, program):
                 send_list[i].extend(list(in_from_pre[i + 1]))
             prog = program.clone()
             if merged_type_list[i] != type_cpu:
-                prog = prog._prune_with_input(
-                    list(in_from_pre[i]), list(send_list[i]))
+                prog = prog._prune_with_input(list(in_from_pre[i]),
+                                              list(send_list[i]))
                 program_list.append(prog)
             else:
                 program_list.append(prog)
diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
index e5b2129e857f4..fb1b36e33c504 100644
--- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
@@ -36,7 +36,9 @@
 
 
 def _handle_errors(max_time_out=None):
+
     def decorator(f):
+
         @functools.wraps(f)
         def handler(*args, **kwargs):
             o = args[0]
@@ -56,13 +58,15 @@ def handler(*args, **kwargs):
                 except ExecuteError as e:
                     if time.time() - start >= time_out:
                         raise FSTimeOut("args:{} timeout:{}".format(
-                            args, time.time() - start))
+                            args,
+                            time.time() - start))
 
                     time.sleep(inter)
 
                 if time.time() - last_print_time > 30:
                     print("hadoop operator timeout:args:{} timeout:{}".format(
-                        args, time.time() - start))
+                        args,
+                        time.time() - start))
                     last_print_time = time.time()
 
         return handler
@@ -71,6 +75,7 @@ def handler(*args, **kwargs):
 
 
 class HDFSClient(FS):
+
     def __init__(
             self,
             hadoop_home,
@@ -264,8 +269,8 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
 
         if test_exists:
             if not self.is_exist(fs_src_path):
-                raise FSFileNotExistsError("{} is not exists".format(
-                    fs_src_path))
+                raise FSFileNotExistsError(
+                    "{} is not exists".format(fs_src_path))
 
             if self.is_exist(fs_dst_path):
                 raise FSFileExistsError("{} exists already".format(fs_dst_path))
diff --git a/python/paddle/fluid/incubate/fleet/utils/http_server.py b/python/paddle/fluid/incubate/fleet/utils/http_server.py
index b4ee29a065a7c..685228f07490e 100644
--- a/python/paddle/fluid/incubate/fleet/utils/http_server.py
+++ b/python/paddle/fluid/incubate/fleet/utils/http_server.py
@@ -32,8 +32,9 @@ def get_logger(name, level, fmt):
     return logger
 
 
-_http_server_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_http_server_logger = get_logger(__name__,
+                                 logging.INFO,
+                                 fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
diff --git a/python/paddle/fluid/incubate/fleet/utils/utils.py b/python/paddle/fluid/incubate/fleet/utils/utils.py
index 5cb4948a859d6..3890da7ecec4c 100644
--- a/python/paddle/fluid/incubate/fleet/utils/utils.py
+++ b/python/paddle/fluid/incubate/fleet/utils/utils.py
@@ -90,8 +90,8 @@ def check_pruned_program_vars(train_prog, pruned_prog):
                    if fluid.io.is_persistable(v)]
     pruned_vars = OrderedDict(pruned_vars)
     pruned_vars_name = [name for name in pruned_vars]
-    logger.info("persistable vars in pruned program: {}".format(
-        pruned_vars_name))
+    logger.info(
+        "persistable vars in pruned program: {}".format(pruned_vars_name))
 
     for var_name in pruned_vars:
         var = pruned_vars[var_name]
@@ -108,9 +108,9 @@ def check_pruned_program_vars(train_prog, pruned_prog):
             continue
         if var.shape != train_prog_var.shape or var.dtype != train_prog_var.dtype:
             logger.error(
-                "variable: {} not match. in pruned program shape: {} dtype:{}, in train program shape: {} dtype: {}".
-                format(var_name, var.shape, var.dtype, train_prog_var.shape,
-                       train_prog_var.dtype))
+                "variable: {} not match. in pruned program shape: {} dtype:{}, in train program shape: {} dtype: {}"
+                .format(var_name, var.shape, var.dtype, train_prog_var.shape,
+                        train_prog_var.dtype))
             is_match = False
     return is_match
 
@@ -120,11 +120,10 @@ def graphviz(block, output_dir="", filename='debug'):
     pdf_path = os.path.join(output_dir, filename + '.pdf')
     debugger.draw_block_graphviz(block, path=dot_path)
     cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path]
-    p = subprocess.Popen(
-        cmd,
-        stdin=subprocess.PIPE,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE)
+    p = subprocess.Popen(cmd,
+                         stdin=subprocess.PIPE,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE)
     p.wait()
 
 
@@ -136,17 +135,17 @@ def program_type_trans(prog_dir, prog_fn, is_text):
 
 
 def append_save_op(block, var, path):
-    block.append_op(
-        type='save', inputs={'X': [var]}, outputs={},
-        attrs={'file_path': path})
+    block.append_op(type='save',
+                    inputs={'X': [var]},
+                    outputs={},
+                    attrs={'file_path': path})
 
 
 def append_load_op(block, var, path):
-    block.append_op(
-        type='load',
-        inputs={},
-        outputs={'Out': [var]},
-        attrs={'file_path': path})
+    block.append_op(type='load',
+                    inputs={},
+                    outputs={'Out': [var]},
+                    attrs={'file_path': path})
 
 
 def save_var(np_array, var_name, shape_list, dtype, save_path):
@@ -229,8 +228,8 @@ def try_load_model_vars(dump_dir, dump_prog_fn, is_text_dump_program,
             if new_shape != orig_shape:
                 raise RuntimeError(
                     "Shape not matching: the Program requires a parameter with a shape of ({}), "
-                    "while the loaded parameter (namely [ {} ]) has a shape of  ({}).".
-                    format(orig_shape, each_var.name, new_shape))
+                    "while the loaded parameter (namely [ {} ]) has a shape of  ({})."
+                    .format(orig_shape, each_var.name, new_shape))
 
         # check feed/fetch vars in program and config
         fetch_targets_names = [v.name for v in fetch_targets]
@@ -242,8 +241,8 @@ def try_load_model_vars(dump_dir, dump_prog_fn, is_text_dump_program,
         feed_name_list = feed_target_names
         if feed_config.feeded_vars_names is not None and feed_target_names != feed_config.feeded_vars_names:
             logger.warning(
-                "feed vars in program and config are diff: feed in program: {}. feed in config {}.".
-                format(feed_target_names, feed_config.feeded_vars_names))
+                "feed vars in program and config are diff: feed in program: {}. feed in config {}."
+                .format(feed_target_names, feed_config.feeded_vars_names))
             feed_name_list = feed_config.feeded_vars_names
             # remove feed op in inference_program. new feed op will be added in exe.run
             global_block = inference_program.global_block()
@@ -256,8 +255,8 @@ def try_load_model_vars(dump_dir, dump_prog_fn, is_text_dump_program,
                 global_block._remove_op(index)
         if fetch_config.fetch_vars_names is not None and fetch_targets_names != fetch_config.fetch_vars_names:
             logger.warning(
-                "fetch vars in program and config are diff: fetch in program: {}. fetch in config {}.".
-                format(fetch_targets_names, fetch_config.fetch_vars_names))
+                "fetch vars in program and config are diff: fetch in program: {}. fetch in config {}."
+                .format(fetch_targets_names, fetch_config.fetch_vars_names))
             fetch_list = [
                 inference_program.global_block().var(i)
                 for i in fetch_config.fetch_vars_names
@@ -291,9 +290,9 @@ def try_load_model_vars(dump_dir, dump_prog_fn, is_text_dump_program,
             var_shape = var.shape[1:]
             if tensor_shape != var_shape:
                 raise RuntimeError(
-                    "feed variable '{}' shape not match. infer program  shape: {}. feed tensor shape: {}".
-                    format(feed_config.feeded_vars_names[i], var_shape,
-                           tensor_shape))
+                    "feed variable '{}' shape not match. infer program  shape: {}. feed tensor shape: {}"
+                    .format(feed_config.feeded_vars_names[i], var_shape,
+                            tensor_shape))
 
         if not feed_config.feeded_vars_filelist:
             logger.info("generate random feed vars.")
@@ -303,17 +302,15 @@ def try_load_model_vars(dump_dir, dump_prog_fn, is_text_dump_program,
                 # create fake feed tensor. if lod_level > 1, should create_lod_tensor()
                 if var.lod_level == 0:
                     feed_tensors.append(
-                        np.array(
-                            np.random.random(
-                                tuple([batch_size] + list(
-                                    feed_config.feeded_vars_dims[i]))),
-                            dtype=feed_config.feeded_vars_types[i]))
+                        np.array(np.random.random(
+                            tuple([batch_size] +
+                                  list(feed_config.feeded_vars_dims[i]))),
+                                 dtype=feed_config.feeded_vars_types[i]))
                 elif var.lod_level == 1:
-                    t = np.array(
-                        np.random.random(
-                            tuple([batch_size] + list(
-                                feed_config.feeded_vars_dims[i]))),
-                        dtype=feed_config.feeded_vars_types[i])
+                    t = np.array(np.random.random(
+                        tuple([batch_size] +
+                              list(feed_config.feeded_vars_dims[i]))),
+                                 dtype=feed_config.feeded_vars_types[i])
                     feed_tensors.append(
                         fluid.create_lod_tensor(t, [[1] * batch_size], place))
                 else:
@@ -354,8 +351,8 @@ def check_not_expected_ops(prog):
     for op in prog.global_block().ops:
         if op.type in not_expected_op_types and op.type not in op_types_set:
             logger.warning(
-                "find op type '{}' in program, please check if your program is pruned correctly !".
-                format(op.type))
+                "find op type '{}' in program, please check if your program is pruned correctly !"
+                .format(op.type))
             op_types_set.add(op.type)
 
 
@@ -366,8 +363,8 @@ def check_saved_vars_try_dump(dump_dir,
                               fetch_config,
                               batch_size=1,
                               save_filename=None):
-    dump_prog = load_program(
-        os.path.join(dump_dir, dump_prog_fn), is_text_dump_program)
+    dump_prog = load_program(os.path.join(dump_dir, dump_prog_fn),
+                             is_text_dump_program)
     saved_params = [
         v for v in dump_prog.list_vars() if fluid.io.is_persistable(v)
     ]
diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py
index 950a89d08bcb9..c81ad03df73e4 100644
--- a/python/paddle/fluid/inference/wrapper.py
+++ b/python/paddle/fluid/inference/wrapper.py
@@ -30,9 +30,8 @@ def tensor_copy_from_cpu(self, data):
     '''
     Support input type check based on tensor.copy_from_cpu.
     '''
-    if isinstance(data, np.ndarray) or (isinstance(data, list) and
-                                        len(data) > 0 and
-                                        isinstance(data[0], str)):
+    if isinstance(data, np.ndarray) or (isinstance(data, list) and len(data) > 0
+                                        and isinstance(data[0], str)):
         self.copy_from_cpu_bind(data)
     else:
         raise TypeError(
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 1c8e399436625..47199fcd1adbe 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -133,30 +133,28 @@ def __call__(self, var, block=None):
         """
         block = self._check_block(block)
 
-        assert (isinstance(var, framework.Variable) or
-                isinstance(var, framework.EagerParamBase))
+        assert (isinstance(var, framework.Variable)
+                or isinstance(var, framework.EagerParamBase))
         assert isinstance(block, framework.Block)
 
         if framework._non_static_mode():
-            _C_ops.fill_constant(var, 'value',
-                                 float(self._value), 'force_cpu',
-                                 self._force_cpu, 'dtype',
+            _C_ops.fill_constant(var, 'value', float(self._value),
+                                 'force_cpu', self._force_cpu, 'dtype',
                                  int(var.dtype), 'str_value',
                                  str(float(self._value)), 'shape', var.shape)
             return None
         else:
             # fill constant should set the "str_value" to preserve precision
-            op = block.append_op(
-                type="fill_constant",
-                outputs={"Out": var},
-                attrs={
-                    "shape": var.shape,
-                    "dtype": int(var.dtype),
-                    "value": float(self._value),
-                    'str_value': str(float(self._value)),
-                    'force_cpu': self._force_cpu
-                },
-                stop_gradient=True)
+            op = block.append_op(type="fill_constant",
+                                 outputs={"Out": var},
+                                 attrs={
+                                     "shape": var.shape,
+                                     "dtype": int(var.dtype),
+                                     "value": float(self._value),
+                                     'str_value': str(float(self._value)),
+                                     'force_cpu': self._force_cpu
+                                 },
+                                 stop_gradient=True)
 
             var.op = op
             return op
@@ -233,13 +231,12 @@ def __call__(self, var, block=None):
         # to be compatible of fp16 initializers
         if var.dtype == VarDesc.VarType.FP16:
             out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['uniform_random', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['uniform_random', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=out_dtype,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_dtype = var.dtype
             out_var = var
@@ -257,29 +254,29 @@ def __call__(self, var, block=None):
                 out_var._share_underline_tensor_to(var)
             return None
         else:
-            op = block.append_op(
-                type="uniform_random",
-                inputs={},
-                outputs={"Out": out_var},
-                attrs={
-                    "shape": var.shape,
-                    "dtype": out_dtype,
-                    "min": self._low,
-                    "max": self._high,
-                    "seed": self._seed,
-                    "diag_num": self._diag_num,
-                    "diag_step": self._diag_step,
-                    "diag_val": self._diag_val
-                },
-                stop_gradient=True)
+            op = block.append_op(type="uniform_random",
+                                 inputs={},
+                                 outputs={"Out": out_var},
+                                 attrs={
+                                     "shape": var.shape,
+                                     "dtype": out_dtype,
+                                     "min": self._low,
+                                     "max": self._high,
+                                     "seed": self._seed,
+                                     "diag_num": self._diag_num,
+                                     "diag_step": self._diag_step,
+                                     "diag_val": self._diag_val
+                                 },
+                                 stop_gradient=True)
 
             if var.dtype == VarDesc.VarType.FP16:
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype})
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                })
 
             var.op = op
             return op
@@ -334,13 +331,12 @@ def __call__(self, var, block=None):
         # to be compatible of fp16 initalizers
         if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
             out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['normal_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['normal_init', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=out_dtype,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_dtype = var.dtype
             out_var = var
@@ -362,9 +358,10 @@ def __call__(self, var, block=None):
             return None
 
         if _in_legacy_dygraph():
-            out_var = _C_ops.gaussian_random(
-                'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean,
-                'std', self._std_dev, 'seed', self._seed, 'use_mkldnn', False)
+            out_var = _C_ops.gaussian_random('shape', var.shape, 'dtype',
+                                             out_dtype, 'mean', self._mean,
+                                             'std', self._std_dev, 'seed',
+                                             self._seed, 'use_mkldnn', False)
 
             if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
                 var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype,
@@ -374,26 +371,26 @@ def __call__(self, var, block=None):
                 out_var._share_underline_tensor_to(var)
             return None
         else:
-            op = block.append_op(
-                type="gaussian_random",
-                outputs={"Out": out_var},
-                attrs={
-                    "shape": var.shape,
-                    "dtype": out_dtype,
-                    "mean": self._mean,
-                    "std": self._std_dev,
-                    "seed": self._seed,
-                    "use_mkldnn": False
-                },
-                stop_gradient=True)
+            op = block.append_op(type="gaussian_random",
+                                 outputs={"Out": out_var},
+                                 attrs={
+                                     "shape": var.shape,
+                                     "dtype": out_dtype,
+                                     "mean": self._mean,
+                                     "std": self._std_dev,
+                                     "seed": self._seed,
+                                     "use_mkldnn": False
+                                 },
+                                 stop_gradient=True)
 
             if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype})
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                })
             var.op = op
             return op
 
@@ -446,13 +443,12 @@ def __call__(self, var, block=None):
         # to be compatible of fp16 initalizers
         if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
             out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['truncated_gaussian_random', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['truncated_gaussian_random', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=out_dtype,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_dtype = var.dtype
             out_var = var
@@ -469,9 +465,11 @@ def __call__(self, var, block=None):
             return None
 
         if _in_legacy_dygraph():
-            out_var = _C_ops.truncated_gaussian_random(
-                'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean,
-                'std', self._std_dev, 'seed', self._seed)
+            out_var = _C_ops.truncated_gaussian_random('shape', var.shape,
+                                                       'dtype', out_dtype,
+                                                       'mean', self._mean,
+                                                       'std', self._std_dev,
+                                                       'seed', self._seed)
             if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
                 var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype,
                                       'out_dtype', var.dtype)
@@ -480,25 +478,25 @@ def __call__(self, var, block=None):
                 out_var._share_underline_tensor_to(var)
             return None
         else:
-            op = block.append_op(
-                type="truncated_gaussian_random",
-                outputs={"Out": out_var},
-                attrs={
-                    "shape": var.shape,
-                    "dtype": out_dtype,
-                    "mean": self._mean,
-                    "std": self._std_dev,
-                    "seed": self._seed
-                },
-                stop_gradient=True)
+            op = block.append_op(type="truncated_gaussian_random",
+                                 outputs={"Out": out_var},
+                                 attrs={
+                                     "shape": var.shape,
+                                     "dtype": out_dtype,
+                                     "mean": self._mean,
+                                     "std": self._std_dev,
+                                     "seed": self._seed
+                                 },
+                                 stop_gradient=True)
 
             if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype})
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                })
             var.op = op
             return op
 
@@ -588,13 +586,12 @@ def __call__(self, var, block=None):
         if var.dtype == VarDesc.VarType.FP16 or (
                 var.dtype == VarDesc.VarType.BF16 and not self._uniform):
             out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['xavier_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['xavier_init', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=out_dtype,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_dtype = var.dtype
             out_var = var
@@ -613,9 +610,10 @@ def __call__(self, var, block=None):
                     out_var = _C_ops.final_state_gaussian_random(
                         out_var.shape, 0.0, std, self._seed, out_dtype, place)
                 else:
-                    out_var = _C_ops.gaussian_random(
-                        'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0,
-                        'std', std, 'seed', self._seed)
+                    out_var = _C_ops.gaussian_random('shape', out_var.shape,
+                                                     'dtype', out_dtype, 'mean',
+                                                     0.0, 'std', std, 'seed',
+                                                     self._seed)
 
             if var.dtype == VarDesc.VarType.FP16 or (
                     var.dtype == VarDesc.VarType.BF16 and not self._uniform):
@@ -628,40 +626,39 @@ def __call__(self, var, block=None):
         else:
             if self._uniform:
                 limit = math.sqrt(6.0 / float(fan_in + fan_out))
-                op = block.append_op(
-                    type="uniform_random",
-                    inputs={},
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": out_dtype,
-                        "min": -limit,
-                        "max": limit,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                op = block.append_op(type="uniform_random",
+                                     inputs={},
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": out_dtype,
+                                         "min": -limit,
+                                         "max": limit,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
             else:
                 std = math.sqrt(2.0 / float(fan_in + fan_out))
-                op = block.append_op(
-                    type="gaussian_random",
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": out_dtype,
-                        "mean": 0.0,
-                        "std": std,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                op = block.append_op(type="gaussian_random",
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": out_dtype,
+                                         "mean": 0.0,
+                                         "std": std,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
 
             if var.dtype == VarDesc.VarType.FP16 or (
                     var.dtype == VarDesc.VarType.BF16 and not self._uniform):
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype})
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                })
 
             var.op = op
             return op
@@ -746,13 +743,12 @@ def __call__(self, var, block=None):
         if var.dtype == VarDesc.VarType.FP16 or (
                 var.dtype == VarDesc.VarType.BF16 and not self._uniform):
             out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['masra_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['masra_init', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=out_dtype,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_dtype = var.dtype
             out_var = var
@@ -771,10 +767,11 @@ def __call__(self, var, block=None):
                     out_var = _C_ops.final_state_gaussian_random(
                         out_var.shape, 0.0, std, self._seed, out_dtype, place)
                 else:
-                    out_var = _C_ops.gaussian_random(
-                        'shape', out_var.shape, 'dtype',
-                        int(out_dtype), 'mean', 0.0, 'std', std, 'seed',
-                        self._seed)
+                    out_var = _C_ops.gaussian_random('shape',
+                                                     out_var.shape, 'dtype',
+                                                     int(out_dtype), 'mean',
+                                                     0.0, 'std', std, 'seed',
+                                                     self._seed)
 
             if var.dtype == VarDesc.VarType.FP16 or (
                     var.dtype == VarDesc.VarType.BF16 and not self._uniform):
@@ -787,41 +784,40 @@ def __call__(self, var, block=None):
         else:
             if self._uniform:
                 limit = math.sqrt(6.0 / float(fan_in))
-                op = block.append_op(
-                    type="uniform_random",
-                    inputs={},
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": int(out_dtype),
-                        "min": -limit,
-                        "max": limit,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                op = block.append_op(type="uniform_random",
+                                     inputs={},
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": int(out_dtype),
+                                         "min": -limit,
+                                         "max": limit,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
 
             else:
                 std = math.sqrt(2.0 / float(fan_in))
-                op = block.append_op(
-                    type="gaussian_random",
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": int(out_dtype),
-                        "mean": 0.0,
-                        "std": std,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                op = block.append_op(type="gaussian_random",
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": int(out_dtype),
+                                         "mean": 0.0,
+                                         "std": std,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
 
             if var.dtype == VarDesc.VarType.FP16 or (
                     var.dtype == VarDesc.VarType.BF16 and not self._uniform):
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype})
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                })
 
             var.op = op
             return op
@@ -918,13 +914,12 @@ def __call__(self, var, block=None):
                 VarDesc.VarType.FP16, VarDesc.VarType.BF16, VarDesc.VarType.FP64
         ]:
             out_dtype = VarDesc.VarType.FP32
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['bilinear_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['bilinear_init', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=out_dtype,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_dtype = var.dtype
             out_var = var
@@ -939,9 +934,8 @@ def __call__(self, var, block=None):
             raise ValueError("The size of input is too big. ")
 
         if framework._non_static_mode():
-            _C_ops.assign_value(out_var, 'shape',
-                                list(shape), 'dtype', out_dtype, value_name,
-                                values)
+            _C_ops.assign_value(out_var, 'shape', list(shape), 'dtype',
+                                out_dtype, value_name, values)
             if var.dtype in [
                     VarDesc.VarType.FP16, VarDesc.VarType.BF16,
                     VarDesc.VarType.FP64
@@ -953,25 +947,25 @@ def __call__(self, var, block=None):
                 out_var._share_underline_tensor_to(var)
             return None
         else:
-            op = block.append_op(
-                type='assign_value',
-                outputs={'Out': [out_var]},
-                attrs={
-                    'dtype': out_dtype,
-                    'shape': list(shape),
-                    value_name: values
-                })
+            op = block.append_op(type='assign_value',
+                                 outputs={'Out': [out_var]},
+                                 attrs={
+                                     'dtype': out_dtype,
+                                     'shape': list(shape),
+                                     value_name: values
+                                 })
 
             if var.dtype in [
                     VarDesc.VarType.FP16, VarDesc.VarType.BF16,
                     VarDesc.VarType.FP64
             ]:
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype})
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                })
 
             var.op = op
             return op
@@ -1023,13 +1017,12 @@ def __call__(self, var, block=None):
         if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
             out_dtype = VarDesc.VarType.FP32
             np_value = self._value.astype("float32")
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(
-                    ['numpy_array_init', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=out_dtype,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['numpy_array_init', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=out_dtype,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_var = var
             out_dtype = var.dtype
@@ -1048,9 +1041,8 @@ def __call__(self, var, block=None):
                              "saving it to file and 'load_op' to load it")
 
         if framework._non_static_mode():
-            _C_ops.assign_value(out_var, 'shape',
-                                list(self._value.shape), 'dtype', out_dtype,
-                                value_name, values)
+            _C_ops.assign_value(out_var, 'shape', list(self._value.shape),
+                                'dtype', out_dtype, value_name, values)
             if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
                 var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype,
                                       'out_dtype', var.dtype)
@@ -1059,23 +1051,23 @@ def __call__(self, var, block=None):
                 out_var._share_underline_tensor_to(var)
             return None
         else:
-            op = block.append_op(
-                type='assign_value',
-                outputs={'Out': out_var},
-                attrs={
-                    'dtype': out_dtype,
-                    'shape': list(self._value.shape),
-                    value_name: values
-                },
-                stop_gradient=True)
+            op = block.append_op(type='assign_value',
+                                 outputs={'Out': out_var},
+                                 attrs={
+                                     'dtype': out_dtype,
+                                     'shape': list(self._value.shape),
+                                     value_name: values
+                                 },
+                                 stop_gradient=True)
 
             if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]:
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype})
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                })
 
             var.op = op
             return op
@@ -1200,8 +1192,9 @@ def calculate_gain(nonlinearity, param=None):
     if nonlinearity in recommended_gain.keys():
         return recommended_gain[nonlinearity]
     else:
-        raise ValueError("nonlinearity function {} is not suppported now.".
-                         format(nonlinearity))
+        raise ValueError(
+            "nonlinearity function {} is not suppported now.".format(
+                nonlinearity))
 
 
 # We short the class name, since users will use the initializer with the package
diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py
index 3e46ac520903b..502a89ec36d36 100644
--- a/python/paddle/fluid/input.py
+++ b/python/paddle/fluid/input.py
@@ -116,19 +116,18 @@ def one_hot(input, depth, allow_out_of_range=False):
         attrs = {'depth': depth, 'allow_out_of_range': allow_out_of_range}
     else:
         if not isinstance(depth, Variable):
-            # user attribute 
+            # user attribute
             inputs = {'X': input}
             attrs = {'depth': depth, 'allow_out_of_range': allow_out_of_range}
         else:
             depth.stop_gradient = True
             inputs = {'X': input, 'depth_tensor': depth}
             attrs = {'allow_out_of_range': allow_out_of_range}
-    helper.append_op(
-        type="one_hot_v2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={'Out': one_hot_out},
-        stop_gradient=True)
+    helper.append_op(type="one_hot_v2",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={'Out': one_hot_out},
+                     stop_gradient=True)
     return one_hot_out
 
 
@@ -317,20 +316,23 @@ def embedding(input,
     remote_prefetch = is_sparse and (not is_distributed)
     if remote_prefetch:
         assert is_sparse is True and is_distributed is False
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=size,
+                                dtype=dtype,
+                                is_bias=False)
     tmp = helper.create_variable_for_type_inference(dtype)
     padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
         size[0] + padding_idx)
-    helper.append_op(
-        type='lookup_table_v2',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={
-            'is_sparse': is_sparse,
-            'is_distributed': is_distributed,
-            'remote_prefetch': remote_prefetch,
-            'padding_idx': padding_idx
-        })
+    helper.append_op(type='lookup_table_v2',
+                     inputs={
+                         'Ids': input,
+                         'W': w
+                     },
+                     outputs={'Out': tmp},
+                     attrs={
+                         'is_sparse': is_sparse,
+                         'is_distributed': is_distributed,
+                         'remote_prefetch': remote_prefetch,
+                         'padding_idx': padding_idx
+                     })
     return tmp
diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py
index 111f33e613a16..0c621766b3794 100644
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -31,6 +31,7 @@
 
 
 class SimpleLayer(Layer):
+
     def __init__(self, input_size):
         super(SimpleLayer, self).__init__()
         self._linear1 = nn.Linear(
@@ -123,8 +124,9 @@ def test_simple_exe():
         with executor.scope_guard(scope):
             with program_guard(train_prog, startup_prog):
                 with unique_name.guard():
-                    inp0 = layers.data(
-                        name="inp", shape=[2, 2], append_batch_size=False)
+                    inp0 = layers.data(name="inp",
+                                       shape=[2, 2],
+                                       append_batch_size=False)
                     simple_layer0 = SimpleLayer(input_size=2)
                     out0 = simple_layer0(inp0)
                     param_grads = backward.append_backward(
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 8b25c93d7ce08..3d071fce6c77e 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -68,11 +68,13 @@
     'get_program_persistable_vars',
 ] + reader.__all__
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class _open_buffer(object):
+
     def __init__(self, buffer):
         self.buffer = buffer
 
@@ -81,6 +83,7 @@ def __enter__(self):
 
 
 class _buffer_reader(_open_buffer):
+
     def __init__(self, buffer):
         super(_buffer_reader, self).__init__(buffer)
         self.initial_tell = self.buffer.tell()
@@ -92,6 +95,7 @@ def __exit__(self, *args):
 
 
 class _buffer_writer(_open_buffer):
+
     def __exit__(self, *args):
         self.buffer.flush()
 
@@ -110,8 +114,8 @@ def _open_file_buffer(path_or_buffer, mode):
         elif 'r' in mode:
             return _buffer_reader(path_or_buffer)
         else:
-            raise ValueError("Expected 'r' or 'w' in mode but got {}".format(
-                mode))
+            raise ValueError(
+                "Expected 'r' or 'w' in mode but got {}".format(mode))
 
 
 def _is_memory_buffer(buffer):
@@ -236,20 +240,18 @@ def get_program_persistable_vars(program):
 def _clone_var_in_block_(block, var):
     assert isinstance(var, Variable)
     if var.desc.type() == core.VarDesc.VarType.LOD_TENSOR:
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=True)
+        return block.create_var(name=var.name,
+                                shape=var.shape,
+                                dtype=var.dtype,
+                                type=var.type,
+                                lod_level=var.lod_level,
+                                persistable=True)
     else:
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            persistable=True)
+        return block.create_var(name=var.name,
+                                shape=var.shape,
+                                dtype=var.dtype,
+                                type=var.type,
+                                persistable=True)
 
 
 @signature_safe_contextmanager
@@ -368,12 +370,11 @@ def name_has_fc(var):
     main_program = _get_valid_program(main_program)
 
     if vars is None:
-        return save_vars(
-            executor,
-            main_program=main_program,
-            dirname=dirname,
-            vars=list(filter(predicate, main_program.list_vars())),
-            filename=filename)
+        return save_vars(executor,
+                         main_program=main_program,
+                         dirname=dirname,
+                         vars=list(filter(predicate, main_program.list_vars())),
+                         filename=filename)
     else:
         params_var_name = unique_name.generate("saved_params")
         # give warning when there is no var in model
@@ -393,8 +394,8 @@ def name_has_fc(var):
                 continue
             new_var = _clone_var_in_block_(save_block, each_var)
             if filename is None and save_to_memory is False:
-                save_file_path = os.path.join(
-                    os.path.normpath(dirname), new_var.name)
+                save_file_path = os.path.join(os.path.normpath(dirname),
+                                              new_var.name)
                 save_block.append_op(
                     type='save',
                     inputs={'X': [new_var]},
@@ -412,17 +413,16 @@ def name_has_fc(var):
             if save_to_memory is False:
                 save_path = os.path.join(os.path.normpath(dirname), filename)
 
-            saved_params = save_block.create_var(
-                type=core.VarDesc.VarType.RAW, name=params_var_name)
+            saved_params = save_block.create_var(type=core.VarDesc.VarType.RAW,
+                                                 name=params_var_name)
             saved_params.desc.set_persistable(True)
-            save_block.append_op(
-                type='save_combine',
-                inputs={'X': save_var_list},
-                outputs={'Y': saved_params},
-                attrs={
-                    'file_path': save_path,
-                    'save_to_memory': save_to_memory
-                })
+            save_block.append_op(type='save_combine',
+                                 inputs={'X': save_var_list},
+                                 outputs={'Y': saved_params},
+                                 attrs={
+                                     'file_path': save_path,
+                                     'save_to_memory': save_to_memory
+                                 })
 
         # NOTE(zhiqiu): save op will add variable kLookupTablePath in save_program.desc,
         # which leads to diff on save_program and its desc. Call _sync_with_cpp
@@ -501,13 +501,12 @@ def save_params(executor, dirname, main_program=None, filename=None):
             # The parameters weights and bias of the fc layer in the network are going to
             # be saved in different files in the path "./my_paddle_model"
     """
-    return save_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=is_parameter,
-        filename=filename)
+    return save_vars(executor,
+                     dirname=dirname,
+                     main_program=main_program,
+                     vars=None,
+                     predicate=is_parameter,
+                     filename=filename)
 
 
 def _save_distributed_persistables(executor, dirname, main_program):
@@ -581,17 +580,16 @@ def __save_remote_params(executor, dirname, remote_params_map):
                 tmp = [str(dim) for dim in slice.shape]
                 slice_shapes.append(",".join(tmp))
 
-            block.append_op(
-                type='recv_save',
-                attrs={
-                    "trainer_id": 0,
-                    "shape": origin.shape,
-                    "slice_shapes": slice_shapes,
-                    "slice_varnames": slice_varnames,
-                    "remote_varnames": remote_varnames,
-                    "endpoints": endpoints,
-                    "file_path": os.path.join(dirname, origin.name)
-                })
+            block.append_op(type='recv_save',
+                            attrs={
+                                "trainer_id": 0,
+                                "shape": origin.shape,
+                                "slice_shapes": slice_shapes,
+                                "slice_varnames": slice_varnames,
+                                "remote_varnames": remote_varnames,
+                                "endpoints": endpoints,
+                                "file_path": os.path.join(dirname, origin.name)
+                            })
 
         executor.run(prog)
 
@@ -613,11 +611,14 @@ def __save_distributed_lookup_tables(executor, dirname,
         attrs['epmap'] = endpoints
         attrs['dir'] = lookup_table_filename
         attrs['lookup_table'] = distributed_lookup_table
-        block.append_op(
-            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+        block.append_op(type='checkpoint_notify',
+                        inputs={},
+                        outputs={},
+                        attrs=attrs)
         executor.run(prog)
 
     def __exclude_vars(exclude_var_names=[]):
+
         def is_valid(var):
             if var.name in exclude_var_names:
                 return False
@@ -652,8 +653,10 @@ def is_valid(var):
 
     local_vars = list(
         filter(__exclude_vars(exclude_var_names), main_program.list_vars()))
-    save_vars(
-        executor, main_program=main_program, dirname=dirname, vars=local_vars)
+    save_vars(executor,
+              main_program=main_program,
+              dirname=dirname,
+              vars=local_vars)
 
     if main_program._is_chief:
         if remote_params_map:
@@ -725,16 +728,16 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
             # "./my_paddle_model"
     """
     if main_program and main_program._is_distributed:
-        return _save_distributed_persistables(
-            executor, dirname=dirname, main_program=main_program)
+        return _save_distributed_persistables(executor,
+                                              dirname=dirname,
+                                              main_program=main_program)
     else:
-        return save_vars(
-            executor,
-            dirname=dirname,
-            main_program=main_program,
-            vars=None,
-            predicate=is_persistable,
-            filename=filename)
+        return save_vars(executor,
+                         dirname=dirname,
+                         main_program=main_program,
+                         vars=None,
+                         predicate=is_persistable,
+                         filename=filename)
 
 
 def load_vars(executor,
@@ -836,12 +839,11 @@ def name_has_fc(var):
                 "The type of input main_program is invalid, expected type is fluid.Program, but received %s"
                 % type(main_program))
 
-        load_vars(
-            executor,
-            dirname=dirname,
-            main_program=main_program,
-            vars=list(filter(predicate, main_program.list_vars())),
-            filename=filename)
+        load_vars(executor,
+                  dirname=dirname,
+                  main_program=main_program,
+                  vars=list(filter(predicate, main_program.list_vars())),
+                  filename=filename)
     else:
         load_prog = Program()
         load_block = load_prog.global_block()
@@ -868,8 +870,8 @@ def name_has_fc(var):
                 continue
 
             if isinstance(each_var, Parameter):
-                orig_para_shape[each_var.name] = tuple(each_var.desc.get_shape(
-                ))
+                orig_para_shape[each_var.name] = tuple(
+                    each_var.desc.get_shape())
 
             if each_var.type == core.VarDesc.VarType.SELECTED_ROWS:
                 sparse_vars.append(each_var)
@@ -902,8 +904,9 @@ def name_has_fc(var):
 
             var_path = os.path.join(dirname, new_var.name)
             if not os.path.exists(var_path):
-                raise ValueError("SelectedRows var {} can not find at {}".
-                                 format(new_var.name, var_path))
+                raise ValueError(
+                    "SelectedRows var {} can not find at {}".format(
+                        new_var.name, var_path))
 
             if os.path.isfile(var_path):
                 load_block.append_op(
@@ -921,26 +924,23 @@ def name_has_fc(var):
 
                 slices = []
                 for block in blocks:
-                    slice = load_block.create_var(
-                        name=block,
-                        type=new_var.type,
-                        shape=new_var.shape,
-                        dtype=new_var.dtype,
-                        persistable=False)
+                    slice = load_block.create_var(name=block,
+                                                  type=new_var.type,
+                                                  shape=new_var.shape,
+                                                  dtype=new_var.dtype,
+                                                  persistable=False)
                     slices.append(slice)
 
                     file_path = os.path.join(var_path, block, "Param")
-                    load_block.append_op(
-                        type='load',
-                        inputs={},
-                        outputs={'Out': [slice]},
-                        attrs={'file_path': file_path})
+                    load_block.append_op(type='load',
+                                         inputs={},
+                                         outputs={'Out': [slice]},
+                                         attrs={'file_path': file_path})
 
-                load_block.append_op(
-                    type='lookup_sparse_table_merge',
-                    inputs={'X': slices},
-                    outputs={'Out': new_var},
-                    attrs={})
+                load_block.append_op(type='lookup_sparse_table_merge',
+                                     inputs={'X': slices},
+                                     outputs={'Out': new_var},
+                                     attrs={})
 
         if filename is not None:
             load_var_list = []
@@ -950,14 +950,13 @@ def name_has_fc(var):
             if vars_from_memory is False:
                 filename = os.path.join(dirname, filename)
 
-            load_block.append_op(
-                type='load_combine',
-                inputs={},
-                outputs={"Out": load_var_list},
-                attrs={
-                    'file_path': filename,
-                    'model_from_memory': vars_from_memory
-                })
+            load_block.append_op(type='load_combine',
+                                 inputs={},
+                                 outputs={"Out": load_var_list},
+                                 attrs={
+                                     'file_path': filename,
+                                     'model_from_memory': vars_from_memory
+                                 })
         executor.run(load_prog)
 
         # check var shape
@@ -972,8 +971,8 @@ def name_has_fc(var):
             if new_shape != orig_shape:
                 raise RuntimeError(
                     "Variable's shape does not match, the Program requires a parameter with the shape of ({}), "
-                    "while the loaded parameter (namely [ {} ]) has a shape of  ({}).".
-                    format(orig_shape, each_var.name, new_shape))
+                    "while the loaded parameter (namely [ {} ]) has a shape of  ({})."
+                    .format(orig_shape, each_var.name, new_shape))
 
 
 @dygraph_not_support
@@ -1030,12 +1029,11 @@ def load_params(executor, dirname, main_program=None, filename=None):
             fluid.io.load_params(executor=exe, dirname=param_path,
                                 main_program=None)
     """
-    load_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        predicate=is_parameter,
-        filename=filename)
+    load_vars(executor,
+              dirname=dirname,
+              main_program=main_program,
+              predicate=is_parameter,
+              filename=filename)
 
 
 @dygraph_not_support
@@ -1083,15 +1081,15 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
     """
 
     if main_program and main_program._is_distributed:
-        _load_distributed_persistables(
-            executor, dirname=dirname, main_program=main_program)
+        _load_distributed_persistables(executor,
+                                       dirname=dirname,
+                                       main_program=main_program)
     else:
-        load_vars(
-            executor,
-            dirname=dirname,
-            main_program=main_program,
-            predicate=is_persistable,
-            filename=filename)
+        load_vars(executor,
+                  dirname=dirname,
+                  main_program=main_program,
+                  predicate=is_persistable,
+                  filename=filename)
 
 
 def _load_distributed_persistables(executor, dirname, main_program=None):
@@ -1141,40 +1139,40 @@ def __load_persistable_vars(executor, dirname, need_load_vars):
             offset = param.offset
 
             if is_slice:
-                slice = load_block.create_var(
-                    name=slice_var.name,
-                    type=slice_var.type,
-                    shape=slice_var.shape,
-                    dtype=slice_var.dtype,
-                    persistable=True)
-
-                load_block.append_op(
-                    type='load',
-                    inputs={},
-                    outputs={'Out': [slice]},
-                    attrs={
-                        'file_path': os.path.join(dirname, origin_var.name),
-                        'seek': offset,
-                        'shape': slice.shape
-                    })
+                slice = load_block.create_var(name=slice_var.name,
+                                              type=slice_var.type,
+                                              shape=slice_var.shape,
+                                              dtype=slice_var.dtype,
+                                              persistable=True)
+
+                load_block.append_op(type='load',
+                                     inputs={},
+                                     outputs={'Out': [slice]},
+                                     attrs={
+                                         'file_path':
+                                         os.path.join(dirname, origin_var.name),
+                                         'seek':
+                                         offset,
+                                         'shape':
+                                         slice.shape
+                                     })
             else:
-                origin = load_block.create_var(
-                    name="{}".format(origin_var.name),
-                    type=origin_var.type,
-                    shape=origin_var.shape,
-                    dtype=origin_var.dtype,
-                    persistable=True)
+                origin = load_block.create_var(name="{}".format(
+                    origin_var.name),
+                                               type=origin_var.type,
+                                               shape=origin_var.shape,
+                                               dtype=origin_var.dtype,
+                                               persistable=True)
                 load_block.append_op(
                     type='load',
                     inputs={},
                     outputs={'Out': [origin]},
-                    attrs={
-                        'file_path': os.path.join(dirname, origin_var.name)
-                    })
+                    attrs={'file_path': os.path.join(dirname, origin_var.name)})
 
         load_block.append_op(
             type='delete_var',
-            inputs={'X': need_delete_vars}, )
+            inputs={'X': need_delete_vars},
+        )
 
         executor.run(load_prog)
 
@@ -1203,10 +1201,9 @@ def prepend_feed_ops(inference_program,
         return
 
     global_block = inference_program.global_block()
-    feed_var = global_block.create_var(
-        name=feed_holder_name,
-        type=core.VarDesc.VarType.FEED_MINIBATCH,
-        persistable=True)
+    feed_var = global_block.create_var(name=feed_holder_name,
+                                       type=core.VarDesc.VarType.FEED_MINIBATCH,
+                                       persistable=True)
 
     for i, name in enumerate(feed_target_names):
         if not global_block.has_var(name):
@@ -1214,31 +1211,27 @@ def prepend_feed_ops(inference_program,
                 "The feeded_var_names[{i}]: '{name}' doesn't exist in pruned inference program. "
                 "Please check whether '{name}' is a valid feed_var name, or remove it from feeded_var_names "
                 "if '{name}' is not involved in the target_vars calculation.".
-                format(
-                    i=i, name=name))
+                format(i=i, name=name))
         out = global_block.var(name)
-        global_block._prepend_op(
-            type='feed',
-            inputs={'X': [feed_var]},
-            outputs={'Out': [out]},
-            attrs={'col': i})
+        global_block._prepend_op(type='feed',
+                                 inputs={'X': [feed_var]},
+                                 outputs={'Out': [out]},
+                                 attrs={'col': i})
 
 
 def append_fetch_ops(inference_program,
                      fetch_target_names,
                      fetch_holder_name='fetch'):
     global_block = inference_program.global_block()
-    fetch_var = global_block.create_var(
-        name=fetch_holder_name,
-        type=core.VarDesc.VarType.FETCH_LIST,
-        persistable=True)
+    fetch_var = global_block.create_var(name=fetch_holder_name,
+                                        type=core.VarDesc.VarType.FETCH_LIST,
+                                        persistable=True)
 
     for i, name in enumerate(fetch_target_names):
-        global_block.append_op(
-            type='fetch',
-            inputs={'X': [name]},
-            outputs={'Out': [fetch_var]},
-            attrs={'col': i})
+        global_block.append_op(type='fetch',
+                               inputs={'X': [name]},
+                               outputs={'Out': [fetch_var]},
+                               attrs={'col': i})
 
 
 @static_only
@@ -1355,8 +1348,8 @@ def save_inference_model(dirname,
     if isinstance(target_vars, Variable):
         target_vars = [target_vars]
     elif export_for_deployment:
-        if not (bool(target_vars) and
-                all(isinstance(var, Variable) for var in target_vars)):
+        if not (bool(target_vars)
+                and all(isinstance(var, Variable) for var in target_vars)):
             raise ValueError("'target_vars' should be a list of Variable.")
 
     main_program = _get_valid_program(main_program)
@@ -1436,15 +1429,15 @@ def save_inference_model(dirname,
         paddle.fluid.core.save_op_version_info(main_program.desc)
         with open(model_basename, "wb") as f:
             f.write(
-                main_program._remove_training_info(clip_extra=clip_extra)
-                .desc.serialize_to_string())
+                main_program._remove_training_info(
+                    clip_extra=clip_extra).desc.serialize_to_string())
     else:
         # TODO(panyx0718): Save more information so that it can also be used
         # for training and more flexible post-processing.
         with open(model_basename + ".main_program", "wb") as f:
             f.write(
-                main_program._remove_training_info(clip_extra=clip_extra)
-                .desc.serialize_to_string())
+                main_program._remove_training_info(
+                    clip_extra=clip_extra).desc.serialize_to_string())
 
     if program_only:
         warnings.warn(
@@ -1788,8 +1781,9 @@ def _unpack_saved_dict(saved_obj, protocol):
                             part_name = key + "@@." + str(i)
                             unpack_infor[key]["slices"].append(part_name)
                             temp_saved_obj[part_name] = value[
-                                i * MAX_NUMBER_OF_ELEMENT:MAX_NUMBER_OF_ELEMENT
-                                * (i + 1)]
+                                i *
+                                MAX_NUMBER_OF_ELEMENT:MAX_NUMBER_OF_ELEMENT *
+                                (i + 1)]
 
     if unpack_infor:
         for key, value in unpack_infor.items():
@@ -1808,8 +1802,8 @@ def _pack_loaded_dict(load_obj):
             removes = []
             for key, value in load_obj[unpack_info].items():
                 slices = [load_obj[part] for part in value["slices"]]
-                load_obj[key] = np.concatenate(slices).reshape(value[
-                    "OriginShape"])
+                load_obj[key] = np.concatenate(slices).reshape(
+                    value["OriginShape"])
                 removes += value["slices"]
             for key in removes:
                 load_obj.pop(key)
@@ -1820,6 +1814,7 @@ def _pack_loaded_dict(load_obj):
 
 @static_only
 def _legacy_save(param_dict, model_path, protocol=2):
+
     def get_tensor(var):
         if isinstance(var, (core.VarBase, core.eager.Tensor)):
             return var.numpy()
@@ -1897,8 +1892,9 @@ def save(program, model_path, protocol=4, **configs):
             type(protocol)))
 
     if protocol < 2 or protocol > 4:
-        raise ValueError("Expected 1<'protocol'<5, but received protocol={}".
-                         format(protocol))
+        raise ValueError(
+            "Expected 1<'protocol'<5, but received protocol={}".format(
+                protocol))
 
     dir_name = os.path.dirname(model_path)
     if dir_name and not os.path.exists(dir_name):
@@ -2011,8 +2007,8 @@ def load(program, model_path, executor=None, var_list=None):
         # model file save by fluid.save not found, try to load model file saved with
         # [save_vars, save_params, save_persistables]
         _logger.debug(
-            "{} not found, try to load model file saved with [ save_params, save_persistables, save_vars ]".
-            format(parameter_file_name))
+            "{} not found, try to load model file saved with [ save_params, save_persistables, save_vars ]"
+            .format(parameter_file_name))
         if executor is None:
             raise ValueError(
                 "executor is required when loading model file saved with [ save_params, save_persistables, save_vars ]"
@@ -2042,8 +2038,9 @@ def load(program, model_path, executor=None, var_list=None):
                 _logger.warning("variable file [ %s ] not used" %
                                 (" ".join(list(binary_file_set))))
             try:
-                load_vars(
-                    executor=executor, dirname=model_path, vars=loaded_var_list)
+                load_vars(executor=executor,
+                          dirname=model_path,
+                          vars=loaded_var_list)
             except RuntimeError as e:
                 _logger.error(e)
                 raise e
@@ -2069,11 +2066,10 @@ def load(program, model_path, executor=None, var_list=None):
 
             dir_name, file_name = os.path.split(model_path)
             try:
-                load_vars(
-                    executor=executor,
-                    dirname=dir_name,
-                    vars=var_list,
-                    filename=file_name)
+                load_vars(executor=executor,
+                          dirname=dir_name,
+                          vars=var_list,
+                          filename=file_name)
             except RuntimeError as e:
                 _logger.error(e)
                 raise e
@@ -2200,8 +2196,8 @@ def load_program_state(model_path, var_list=None):
         # model file saved with fluid.save is not found, try to load model file saved with
         # [save_vars, save_params, save_persistables]
         _logger.debug(
-            "{} not found, try to load model file saved with [ save_params, save_persistables, save_vars ]".
-            format(parameter_file_name))
+            "{} not found, try to load model file saved with [ save_params, save_persistables, save_vars ]"
+            .format(parameter_file_name))
 
         var_name_list = []
         if var_list is None and os.path.isfile(model_path):
@@ -2227,9 +2223,8 @@ def clone_var_to_block(block, var):
                     shape=var.shape,
                     dtype=var.dtype,
                     type=var.type,
-                    lod_level=var.lod_level
-                    if var.desc.type() == core.VarDesc.VarType.LOD_TENSOR else
-                    None,
+                    lod_level=var.lod_level if var.desc.type()
+                    == core.VarDesc.VarType.LOD_TENSOR else None,
                     persistable=True)
 
             def _load_vars_with_try_catch(exe,
@@ -2238,11 +2233,10 @@ def _load_vars_with_try_catch(exe,
                                           filename,
                                           raise_error=True):
                 try:
-                    load_vars(
-                        executor=exe,
-                        dirname=dirname,
-                        vars=vars,
-                        filename=filename)
+                    load_vars(executor=exe,
+                              dirname=dirname,
+                              vars=vars,
+                              filename=filename)
                     return True
                 except:
                     error_str = "Failed to load model/variables `%s`, please make sure " \
@@ -2278,21 +2272,21 @@ def _load_vars_with_try_catch(exe,
                                               None)
                 else:
                     for var_name in var_name_list:
-                        # NOTE(chenweihang): If identify which files the user wants 
-                        # to load from the disk, we load these variables one by one. 
-                        # If a file does not exist, we only warn the user that the 
-                        # file may be an irrelevant file, but does not throw an error 
+                        # NOTE(chenweihang): If identify which files the user wants
+                        # to load from the disk, we load these variables one by one.
+                        # If a file does not exist, we only warn the user that the
+                        # file may be an irrelevant file, but does not throw an error
                         # to ensure that other legal variables can be loaded.
-                        temp_var = load_block.create_var(
-                            name=var_name, persistable=True)
+                        temp_var = load_block.create_var(name=var_name,
+                                                         persistable=True)
                         if _load_vars_with_try_catch(exe, model_path,
                                                      [temp_var], None, False):
                             loaded_var_list.append(temp_var)
 
             res_dict = {}
             for var in loaded_var_list:
-                res_dict[var.name] = np.asarray(paddle.fluid.global_scope(
-                ).find_var(var.name).get_tensor())
+                res_dict[var.name] = np.asarray(
+                    paddle.fluid.global_scope().find_var(var.name).get_tensor())
 
             return res_dict
 
@@ -2412,5 +2406,5 @@ def set_program_state(program, state_dict):
             unused_para_list.append(k)
     if len(unused_para_list) > 0:
         warnings.warn(
-            "This list is not set, Because of Paramerter not found in program. There are: {}".
-            format(" ".join(unused_para_list)))
+            "This list is not set, Because of Paramerter not found in program. There are: {}"
+            .format(" ".join(unused_para_list)))
diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py
index 55297ed516ffb..aca134a1df55a 100644
--- a/python/paddle/fluid/ir.py
+++ b/python/paddle/fluid/ir.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -67,6 +67,7 @@ def _update_grad_persistable(main_program):
 
 def apply_build_strategy(main_program, startup_program, build_strategy,
                          pass_attrs):
+
     def update_attr(attrs, attr_types, name, value, typ=None):
         if name not in attrs:
             attrs[name] = value
@@ -101,6 +102,9 @@ def apply_pass(name):
     if build_strategy.enable_auto_fusion and use_cuda:
         apply_pass("fusion_group_pass")
         build_strategy.enable_auto_fusion = False
+    if build_strategy.fuse_gemm_epilogue:
+        apply_pass("fuse_gemm_epilogue_pass")
+        build_strategy.fuse_gemm_epilogue = False
     if build_strategy.fuse_elewise_add_act_ops:
         apply_pass("fuse_elewise_add_act_pass")
         build_strategy.fuse_elewise_add_act_ops = False
@@ -117,7 +121,7 @@ def apply_pass(name):
         apply_pass("runtime_context_cache_pass")
         build_strategy.cache_runtime_context = False
     if build_strategy.enable_addto and use_cuda:
-        # NOTE: how to get fetch vars to skip memory optimization?  
+        # NOTE: how to get fetch vars to skip memory optimization?
         apply_pass("inplace_addto_op_pass")
         build_strategy.enable_addto = False
     if build_strategy.enable_inplace:
@@ -188,8 +192,8 @@ def _func_to_program_desc(self, func, ops):
                     op_outs = out.Outputs()
                     if len(op_outs) != 1:
                         raise ValueError(
-                            "Operator '{}' has multiple outputs, please specify one output variable.".
-                            format(out._type))
+                            "Operator '{}' has multiple outputs, please specify one output variable."
+                            .format(out._type))
                     for op_out in op_outs.values():
                         vars.extend(op_out)
                 else:
@@ -201,6 +205,7 @@ def _func_to_program_desc(self, func, ops):
         return vars, program.current_block().ops
 
     def _convert_vars_to_pass_desc(self, patterns, replaces, desc):
+
         def _add_element_conditions(conditions, elements):
             for element in elements:
                 if element._condition:
@@ -247,7 +252,7 @@ def SerializeMultiPassDesc(self):
         multi_pass_desc = pass_desc_pb2.MultiPassDesc()
         multi_pass_desc.pass_type = self._pass_type
         # Traverse all pass pairs and convert them to PassDesc data.
-        # Here need to add cache in the future. 
+        # Here need to add cache in the future.
         for (pattern, replace) in self._pass_pairs:
             pass_desc = multi_pass_desc.pass_descs.add()
             # Convert ProgramDescs of pattern and replace subgraphs.
@@ -264,7 +269,9 @@ def SerializeMultiPassDesc(self):
 
 
 class PassDesc(object):
+
     class AttrHelper(object):
+
         def __init__(self, obj, name, element_index=None):
             self._obj = obj
             self._name = name
@@ -276,8 +283,9 @@ def __init__(self, obj, name, element_index=None):
             self._mapped = None
 
         def __getitem__(self, index):
-            element = PassDesc.AttrHelper(
-                self._obj, self._name, element_index=index)
+            element = PassDesc.AttrHelper(self._obj,
+                                          self._name,
+                                          element_index=index)
             self._elements.append(element)
             return element
 
@@ -370,12 +378,14 @@ def mapped_op(pattern_ops):
                     raise ValueError(
                         "Index '{}' of operator '{}' is incorrect.".format(
                             index, op))
-                return PassDesc.AttrHelper(
-                    ops[index], name, element_index=element_index)
+                return PassDesc.AttrHelper(ops[index],
+                                           name,
+                                           element_index=element_index)
 
             self._mapped = mapped_op if var is None else mapped_var
 
     class VarHelper(paddle.static.Variable):
+
         def __init__(self, *args, **kwargs):
             block = paddle.static.default_main_program().current_block()
             self._var = paddle.static.data(*args, **kwargs)
@@ -392,6 +402,7 @@ def Attr(self, name):
             return attr
 
     class OpHelper(object):
+
         def __init__(self, type=None):
             self._type = type
 
@@ -422,8 +433,8 @@ def __call__(self, *args, **kwargs):
                         op_outs = in_arg.Outputs()
                         if len(op_outs) != 1:
                             raise ValueError(
-                                "The size of outputs of operator '{}' is not equal 1, please specify one output variable.".
-                                format(in_arg._type))
+                                "The size of outputs of operator '{}' is not equal 1, please specify one output variable."
+                                .format(in_arg._type))
                         for op_out in op_outs.values():
                             op_input.extend(op_out)
                     else:
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index f60d1a9059452..42b67a5a0dfa8 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -28,6 +28,7 @@
 
 
 class LayerHelper(LayerHelperBase):
+
     def __init__(self, layer_type, **kwargs):
         self.kwargs = kwargs
         name = self.kwargs.get('name', None)
@@ -37,8 +38,8 @@ def __init__(self, layer_type, **kwargs):
         if name is None:
             self.kwargs['name'] = unique_name.generate(layer_type)
 
-        super(LayerHelper, self).__init__(
-            self.kwargs['name'], layer_type=layer_type)
+        super(LayerHelper, self).__init__(self.kwargs['name'],
+                                          layer_type=layer_type)
 
     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)
@@ -125,15 +126,18 @@ def append_bias_op(self, input_var, dim_start=1, dim_end=None):
         if not bias_attr:
             return input_var
 
-        b = self.create_parameter(
-            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
+        b = self.create_parameter(attr=bias_attr,
+                                  shape=size,
+                                  dtype=input_var.dtype,
+                                  is_bias=True)
         tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
-        self.append_op(
-            type='elementwise_add',
-            inputs={'X': [input_var],
-                    'Y': [b]},
-            outputs={'Out': [tmp]},
-            attrs={'axis': dim_start})
+        self.append_op(type='elementwise_add',
+                       inputs={
+                           'X': [input_var],
+                           'Y': [b]
+                       },
+                       outputs={'Out': [tmp]},
+                       attrs={'axis': dim_start})
         return tmp
 
     #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of act
@@ -151,7 +155,8 @@ def append_activation(self, input_var):
             use_cudnn = self.kwargs.get('use_cudnn')
             act['use_cudnn'] = use_cudnn
         use_mkldnn = self.kwargs.get(
-            'use_mkldnn', _global_flags().get("FLAGS_use_mkldnn", False))
+            'use_mkldnn',
+            _global_flags().get("FLAGS_use_mkldnn", False))
         if use_mkldnn:
             act['use_mkldnn'] = use_mkldnn
         act_type = act.pop('type')
@@ -161,11 +166,10 @@ def append_activation(self, input_var):
             return res
         else:
             tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
-            self.append_op(
-                type=act_type,
-                inputs={"X": [input_var]},
-                outputs={"Out": [tmp]},
-                attrs=act)
+            self.append_op(type=act_type,
+                           inputs={"X": [input_var]},
+                           outputs={"Out": [tmp]},
+                           attrs=act)
             return tmp
 
     #TODO (jiabin): should we remove this since it has never be used
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 47f0c02d28725..cb604b1ce89a8 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -83,16 +83,15 @@ def to_variable(self, value, name=None):
         """
         if isinstance(value, np.ndarray):
             if _in_eager_without_dygraph_check():
-                return core.eager.Tensor(value,
-                                         _current_expected_place(), False,
-                                         False, name if name else None, True)
+                return core.eager.Tensor(value, _current_expected_place(),
+                                         False, False, name if name else None,
+                                         True)
             else:
-                py_var = core.VarBase(
-                    value=value,
-                    name=name if name else '',
-                    persistable=False,
-                    place=_current_expected_place(),
-                    zero_copy=False)
+                py_var = core.VarBase(value=value,
+                                      name=name if name else '',
+                                      persistable=False,
+                                      place=_current_expected_place(),
+                                      zero_copy=False)
                 return py_var
         elif isinstance(value, (core.VarBase, Variable, core.eager.Tensor)):
             return value
@@ -123,37 +122,35 @@ def __norm_op(x,
                     [self.name, 'weight_norm_abs'])),
                 dtype=dtype,
                 persistable=False)
-            block.append_op(
-                type='abs', inputs={'X': x}, outputs={'Out': abs_out})
+            block.append_op(type='abs',
+                            inputs={'X': x},
+                            outputs={'Out': abs_out})
             pow_out = block.create_var(
                 name=unique_name.generate_with_ignorable_key(".".join(
                     [self.name, 'weight_norm_pow'])),
                 dtype=dtype,
                 persistable=False)
-            block.append_op(
-                type='pow',
-                inputs={'X': abs_out},
-                outputs={'Out': pow_out},
-                attrs={'factor': float(p)})
+            block.append_op(type='pow',
+                            inputs={'X': abs_out},
+                            outputs={'Out': pow_out},
+                            attrs={'factor': float(p)})
             sum_out = block.create_var(
                 name=unique_name.generate_with_ignorable_key(".".join(
                     [self.name, 'weight_norm_sum'])),
                 dtype=dtype,
                 persistable=False)
-            block.append_op(
-                type='reduce_sum',
-                inputs={'X': pow_out},
-                outputs={'Out': sum_out},
-                attrs={
-                    'dim': dim,
-                    'keep_dim': keep_dim,
-                    'reduce_all': True if dim is None else False
-                })
-            block.append_op(
-                type='pow',
-                inputs={'X': sum_out},
-                outputs={'Out': out},
-                attrs={'factor': 1. / p})
+            block.append_op(type='reduce_sum',
+                            inputs={'X': pow_out},
+                            outputs={'Out': sum_out},
+                            attrs={
+                                'dim': dim,
+                                'keep_dim': keep_dim,
+                                'reduce_all': True if dim is None else False
+                            })
+            block.append_op(type='pow',
+                            inputs={'X': sum_out},
+                            outputs={'Out': out},
+                            attrs={'factor': 1. / p})
             return out
 
         def __reshape_op(x,
@@ -166,11 +163,10 @@ def __reshape_op(x,
                         [self.name, 'weight_norm_reshape'])),
                     dtype=dtype,
                     persistable=False)
-            block.append_op(
-                type='reshape',
-                inputs={'X': x},
-                outputs={'Out': out},
-                attrs={'shape': shape})
+            block.append_op(type='reshape',
+                            inputs={'X': x},
+                            outputs={'Out': out},
+                            attrs={'shape': shape})
             return out
 
         def __transpose_op(x,
@@ -183,11 +179,10 @@ def __transpose_op(x,
                         [self.name, 'weight_norm_transpose'])),
                     dtype=dtype,
                     persistable=False)
-            block.append_op(
-                type='transpose',
-                inputs={'X': x},
-                outputs={'Out': out},
-                attrs={'axis': axis})
+            block.append_op(type='transpose',
+                            inputs={'X': x},
+                            outputs={'Out': out},
+                            attrs={'axis': axis})
             return out
 
         def __norm_except_dim(x,
@@ -217,10 +212,11 @@ def __norm_except_dim(x,
                 perm = list(range(len(x.shape)))
                 perm[0], perm[dim] = dim, 0
                 transpose = __transpose_op(x, perm, block=block)
-                out_shape = [transpose.shape[0]] + [1] * (len(transpose.shape) -
-                                                          1)
-                reshape = __reshape_op(
-                    transpose, shape=[transpose.shape[0], -1], block=block)
+                out_shape = [transpose.shape[0]
+                             ] + [1] * (len(transpose.shape) - 1)
+                reshape = __reshape_op(transpose,
+                                       shape=[transpose.shape[0], -1],
+                                       block=block)
                 norm = __norm_op(reshape, dim=[1], block=block)
                 reshape2 = __reshape_op(norm, shape=out_shape, block=block)
                 __transpose_op(reshape2, perm, out=out, block=block)
@@ -228,18 +224,18 @@ def __norm_except_dim(x,
 
         def __weight_normalize(g, v, dim):
             """Calculations for weight normalization"""
-            norm = __norm_except_dim(
-                v, dim=dim, block=self.main_program.current_block())
+            norm = __norm_except_dim(v,
+                                     dim=dim,
+                                     block=self.main_program.current_block())
             scale = elementwise_div(
                 x=g, y=norm)  # The shapes of g and norm are the same.
             # Currently, elementwise_mul only support broadcast when the shape
             # of y is a subset of the shape of x. Thus, we reshape y to squeeze
             # to achieve the subset.
-            w = elementwise_mul(
-                x=v,
-                y=scale if dim is None else reshape(
-                    x=scale, shape=[v.shape[dim]]),
-                axis=-1 if dim is None else dim)
+            w = elementwise_mul(x=v,
+                                y=scale if dim is None else reshape(
+                                    x=scale, shape=[v.shape[dim]]),
+                                axis=-1 if dim is None else dim)
             # To serialize the original parameter for inference, maybe a
             # parameter rather than a variable should be returned.
             return w
@@ -268,18 +264,16 @@ def __weight_normalize(g, v, dim):
             dtype=dtype,
             shape=v_param_shape,
             **v_param_attr._to_kwargs(with_initializer=True))
-        __norm_except_dim(
-            x=v_param,
-            out=g_param,
-            dim=attr.dim,
-            block=self.startup_program.global_block())
+        __norm_except_dim(x=v_param,
+                          out=g_param,
+                          dim=attr.dim,
+                          block=self.startup_program.global_block())
 
         # keep g_param shape to be consistent with that in main_program
-        __reshape_op(
-            g_param,
-            g_param_shape,
-            out=g_param,
-            block=self.startup_program.global_block())
+        __reshape_op(g_param,
+                     g_param_shape,
+                     out=g_param,
+                     block=self.startup_program.global_block())
 
         # Add weight normalization to main_program
         g_param = self.main_program.global_block().create_parameter(
@@ -316,9 +310,9 @@ def create_parameter(self,
             return None
         assert isinstance(attr, ParamAttr)
         for i, size in enumerate(shape):
-            assert size > 0, (
-                "Expected every dim's size to be larger than 0, "
-                "but the size of the {}-th dim is {}".format(i, size))
+            assert size > 0, ("Expected every dim's size to be larger than 0, "
+                              "but the size of the {}-th dim is {}".format(
+                                  i, size))
         # set global dtype
         if not dtype:
             dtype = self.__dtype
@@ -344,8 +338,8 @@ def create_parameter(self,
                         "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                     )
             else:
-                if not (dtype.startswith("float") or
-                        dtype in ["double", "uint16"]):
+                if not (dtype.startswith("float")
+                        or dtype in ["double", "uint16"]):
                     raise TypeError(
                         "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                     )
diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py
index 0b4211cbb63dc..b0e285e036ebc 100644
--- a/python/paddle/fluid/layers/collective.py
+++ b/python/paddle/fluid/layers/collective.py
@@ -43,23 +43,25 @@ def _allreduce(x, out=None, reduce_type="sum", sync_mode=False):
             type=x.type,
             persistable=x.persistable,
             stop_gradient=True)
-    helper.append_op(
-        type='allreduce',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={"reduce_type": red_typ_int,
-               "sync_mode": sync_mode})
+    helper.append_op(type='allreduce',
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]},
+                     attrs={
+                         "reduce_type": red_typ_int,
+                         "sync_mode": sync_mode
+                     })
     return out
 
 
 def _broadcast(x, root, sync_mode=False):
     helper = LayerHelper("broadcast", **locals())
-    helper.append_op(
-        type='broadcast',
-        inputs={'X': [x]},
-        outputs={'Out': [x]},
-        attrs={"sync_mode": sync_mode,
-               "root": root})
+    helper.append_op(type='broadcast',
+                     inputs={'X': [x]},
+                     outputs={'Out': [x]},
+                     attrs={
+                         "sync_mode": sync_mode,
+                         "root": root
+                     })
     return x
 
 
@@ -83,27 +85,27 @@ def _c_allreduce(x,
             type=x.type,
             persistable=x.persistable)
 
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'ring_id': ring_id,
-               'use_calc_stream': use_calc_stream})
+    helper.append_op(type=op_type,
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]},
+                     attrs={
+                         'ring_id': ring_id,
+                         'use_calc_stream': use_calc_stream
+                     })
     return out
 
 
 def _c_broadcast(x, root=0, ring_id=0, use_calc_stream=False):
     op_type = 'c_broadcast'
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [x]},
-        attrs={
-            'root': root,
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream
-        })
+    helper.append_op(type=op_type,
+                     inputs={'X': [x]},
+                     outputs={'Out': [x]},
+                     attrs={
+                         'root': root,
+                         'ring_id': ring_id,
+                         'use_calc_stream': use_calc_stream
+                     })
     return x
 
 
@@ -128,22 +130,20 @@ def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False):
     out_shape = list(x.shape[:])
     if out_shape[0] > 0:
         out_shape[0] *= nranks
-    out = helper.create_variable(
-        name=unique_name.generate_with_ignorable_key('.'.join(
-            [x.name, op_type])),
-        shape=out_shape,
-        dtype=x.dtype,
-        type=x.type,
-        persistable=x.persistable)
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={
-            'nranks': nranks,
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream
-        })
+    out = helper.create_variable(name=unique_name.generate_with_ignorable_key(
+        '.'.join([x.name, op_type])),
+                                 shape=out_shape,
+                                 dtype=x.dtype,
+                                 type=x.type,
+                                 persistable=x.persistable)
+    helper.append_op(type=op_type,
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]},
+                     attrs={
+                         'nranks': nranks,
+                         'ring_id': ring_id,
+                         'use_calc_stream': use_calc_stream
+                     })
     return out
 
 
@@ -152,30 +152,29 @@ def _c_reducescatter(x, nranks, ring_id=0, use_calc_stream=False):
         raise TypeError('x must be a Variable')
 
     if x.shape[0] > 0 and x.shape[0] % nranks != 0:
-        raise ValueError('x.shape[0](%d) cannot be evenly divided by nranks(%d)'
-                         % (x.shape[0], nranks))
+        raise ValueError(
+            'x.shape[0](%d) cannot be evenly divided by nranks(%d)' %
+            (x.shape[0], nranks))
 
     op_type = 'c_reducescatter'
     helper = LayerHelper(op_type, **locals())
     out_shape = list(x.shape[:])
     if out_shape[0] > 0:
         out_shape[0] //= nranks
-    out = helper.create_variable(
-        name=unique_name.generate_with_ignorable_key('.'.join(
-            [x.name, op_type])),
-        shape=out_shape,
-        dtype=x.dtype,
-        type=x.type,
-        persistable=x.persistable)
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={
-            'nranks': nranks,
-            'ring_id': ring_id,
-            'use_calc_stream': use_calc_stream
-        })
+    out = helper.create_variable(name=unique_name.generate_with_ignorable_key(
+        '.'.join([x.name, op_type])),
+                                 shape=out_shape,
+                                 dtype=x.dtype,
+                                 type=x.type,
+                                 persistable=x.persistable)
+    helper.append_op(type=op_type,
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]},
+                     attrs={
+                         'nranks': nranks,
+                         'ring_id': ring_id,
+                         'use_calc_stream': use_calc_stream
+                     })
     return out
 
 
@@ -189,9 +188,8 @@ def _c_sync_calc_stream(x):
 def _c_sync_comm_stream(x, ring_id):
     op_type = 'c_sync_comm_stream'
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [x]},
-        outputs={'Out': [x]},
-        attrs={'ring_id': ring_id})
+    helper.append_op(type=op_type,
+                     inputs={'X': [x]},
+                     outputs={'Out': [x]},
+                     attrs={'ring_id': ring_id})
     return x
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index d143a6637f821..4c3a4e5e8fcb1 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -61,11 +61,12 @@ def select_output(input, outputs, mask):
     check_variable_and_dtype(mask, 'mask', ['int32'], 'select_output')
     check_type(outputs, 'outputs', (list, tuple), 'select_output')
 
-    helper.append_op(
-        type='select_output',
-        inputs={'X': input,
-                'Mask': mask},
-        outputs={'Out': outputs})
+    helper.append_op(type='select_output',
+                     inputs={
+                         'X': input,
+                         'Mask': mask
+                     },
+                     outputs={'Out': outputs})
     return outputs
 
 
@@ -92,13 +93,15 @@ def select_input(inputs, mask):
     input_shape = inputs[0].shape
     input_type = inputs[0].type
 
-    out = helper.create_variable(
-        dtype=input_dtype, shape=input_shape, type=input_type)
-    helper.append_op(
-        type='select_input',
-        inputs={'X': inputs,
-                'Mask': mask},
-        outputs={'Out': out})
+    out = helper.create_variable(dtype=input_dtype,
+                                 shape=input_shape,
+                                 type=input_type)
+    helper.append_op(type='select_input',
+                     inputs={
+                         'X': inputs,
+                         'Mask': mask
+                     },
+                     outputs={'Out': out})
     return out
 
 
@@ -110,19 +113,20 @@ def select_input_with_buildin_type(inputs, mask):
     if isinstance(false_var, Variable) and isinstance(true_var, Variable):
         return select_input(inputs, mask)
 
-    elif (isinstance(false_var, (support_ret_buildin_type)) and
-          isinstance(false_var, type(true_var))):
+    elif (isinstance(false_var, (support_ret_buildin_type))
+          and isinstance(false_var, type(true_var))):
         if false_var == true_var:
             return false_var
         else:
             inputs = [
-                to_static_variable(false_var), to_static_variable(true_var)
+                to_static_variable(false_var),
+                to_static_variable(true_var)
             ]
     # Deal with the situations like this: false_var is int and true_var is Variable
-    elif ((isinstance(false_var, support_ret_buildin_type) and
-           isinstance(true_var, Variable)) or
-          (isinstance(true_var, support_ret_buildin_type) and
-           isinstance(false_var, Variable))):
+    elif ((isinstance(false_var, support_ret_buildin_type)
+           and isinstance(true_var, Variable))
+          or (isinstance(true_var, support_ret_buildin_type)
+              and isinstance(false_var, Variable))):
         inputs = [to_static_variable(false_var), to_static_variable(true_var)]
         warnings.warn(
             "Return results from different branches in cond are not same type: "
@@ -178,15 +182,16 @@ def split_lod_tensor(input, mask, level=0):
     helper = LayerHelper('split_lod_tensor', **locals())
     out_true = helper.create_variable_for_type_inference(dtype=input.dtype)
     out_false = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='split_lod_tensor',
-        inputs={
-            'X': input,
-            'Mask': mask,
-        },
-        outputs={'OutTrue': out_true,
-                 'OutFalse': out_false},
-        attrs={'level': level})
+    helper.append_op(type='split_lod_tensor',
+                     inputs={
+                         'X': input,
+                         'Mask': mask,
+                     },
+                     outputs={
+                         'OutTrue': out_true,
+                         'OutFalse': out_false
+                     },
+                     attrs={'level': level})
     return out_true, out_false
 
 
@@ -236,14 +241,15 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
     check_type(in_false, 'in_false', (Variable, list, tuple, type(None)),
                'fluid.layers.merge_lod_tensor')
     out = helper.create_variable_for_type_inference(dtype=in_true.dtype)
-    helper.append_op(
-        type='merge_lod_tensor',
-        inputs={'X': x,
-                'Mask': mask,
-                'InTrue': in_true,
-                'InFalse': in_false},
-        outputs={'Out': out},
-        attrs={'level': level})
+    helper.append_op(type='merge_lod_tensor',
+                     inputs={
+                         'X': x,
+                         'Mask': mask,
+                         'InTrue': in_true,
+                         'InFalse': in_false
+                     },
+                     outputs={'Out': out},
+                     attrs={'level': level})
     return out
 
 
@@ -321,21 +327,20 @@ def Print(input,
 
     helper = LayerHelper('print' + "_" + input.name, **locals())
     output = helper.create_variable_for_type_inference(input.dtype)
-    helper.append_op(
-        type='print',
-        inputs={'In': input},
-        outputs={'Out': output},
-        attrs={
-            'first_n': first_n,
-            'summarize': summarize,
-            'message': message or "",
-            'print_tensor_name': print_tensor_name,
-            'print_tensor_type': print_tensor_type,
-            'print_tensor_shape': print_tensor_shape,
-            'print_tensor_layout': print_tensor_layout,
-            'print_tensor_lod': print_tensor_lod,
-            'print_phase': print_phase.upper()
-        })
+    helper.append_op(type='print',
+                     inputs={'In': input},
+                     outputs={'Out': output},
+                     attrs={
+                         'first_n': first_n,
+                         'summarize': summarize,
+                         'message': message or "",
+                         'print_tensor_name': print_tensor_name,
+                         'print_tensor_type': print_tensor_type,
+                         'print_tensor_shape': print_tensor_shape,
+                         'print_tensor_layout': print_tensor_layout,
+                         'print_tensor_lod': print_tensor_lod,
+                         'print_phase': print_phase.upper()
+                     })
     return output
 
 
@@ -402,11 +407,12 @@ def Assert(cond, data=None, summarize=20, name=None):
     layer_name = name if name else ('assert_' + cond.name)
     helper = LayerHelper(layer_name, **locals())
 
-    op = helper.append_op(
-        type="assert",
-        inputs={"Cond": cond,
-                "Data": [] if data is None else list(data)},
-        attrs={"summarize": summarize})
+    op = helper.append_op(type="assert",
+                          inputs={
+                              "Cond": cond,
+                              "Data": [] if data is None else list(data)
+                          },
+                          attrs={"summarize": summarize})
 
     return op
 
@@ -456,8 +462,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
             return False
         self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
         self.rnn._complete_op()
-        return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val,
-                                                              exc_tb)
+        return super(BlockGuardWithCompletion,
+                     self).__exit__(exc_type, exc_val, exc_tb)
 
 
 class StaticRNNMemoryLink(object):
@@ -652,23 +658,21 @@ def memory(self,
             parent_block = self._parent_block()
             var_name = unique_name.generate_with_ignorable_key("@".join(
                 [self.helper.name, "memory_boot"]))
-            boot_var = parent_block.create_var(
-                name=var_name,
-                shape=shape,
-                dtype=batch_ref.dtype,
-                persistable=False)
-
-            parent_block.append_op(
-                type="fill_constant_batch_size_like",
-                inputs={'Input': [batch_ref]},
-                outputs={'Out': [boot_var]},
-                attrs={
-                    'value': init_value,
-                    'shape': boot_var.shape,
-                    'dtype': boot_var.dtype,
-                    'input_dim_idx': ref_batch_dim_idx,
-                    'output_dim_idx': init_batch_dim_idx
-                })
+            boot_var = parent_block.create_var(name=var_name,
+                                               shape=shape,
+                                               dtype=batch_ref.dtype,
+                                               persistable=False)
+
+            parent_block.append_op(type="fill_constant_batch_size_like",
+                                   inputs={'Input': [batch_ref]},
+                                   outputs={'Out': [boot_var]},
+                                   attrs={
+                                       'value': init_value,
+                                       'shape': boot_var.shape,
+                                       'dtype': boot_var.dtype,
+                                       'input_dim_idx': ref_batch_dim_idx,
+                                       'output_dim_idx': init_batch_dim_idx
+                                   })
 
             return self.memory(init=boot_var)
         else:
@@ -677,8 +681,8 @@ def memory(self,
                     [self.helper.name, "mem"])),
                 dtype=init.dtype,
                 shape=init.shape)
-            self.memories[pre_mem.name] = StaticRNNMemoryLink(
-                init=init, pre_mem=pre_mem)
+            self.memories[pre_mem.name] = StaticRNNMemoryLink(init=init,
+                                                              pre_mem=pre_mem)
             return pre_mem
 
     def step_input(self, x):
@@ -727,8 +731,10 @@ def step_input(self, x):
         elif x.shape[0] != -1 and self.seq_len != x.shape[0]:
             raise ValueError("Static RNN only take fix seq_len input")
 
-        ipt = self.helper.create_variable(
-            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
+        ipt = self.helper.create_variable(name=x.name,
+                                          dtype=x.dtype,
+                                          shape=list(x.shape[1:]),
+                                          type=x.type)
         self.inputs.append(ipt)
         return ipt
 
@@ -777,16 +783,15 @@ def step_output(self, o):
         check_type(o, "o", Variable, "fluid.layers.StaticRNN.step_output")
 
         tmp_o = self.helper.create_variable_for_type_inference(dtype=o.dtype)
-        self.helper.append_op(
-            type='rnn_memory_helper',
-            inputs={'X': [o]},
-            outputs={'Out': tmp_o},
-            attrs={'dtype': o.dtype})
+        self.helper.append_op(type='rnn_memory_helper',
+                              inputs={'X': [o]},
+                              outputs={'Out': tmp_o},
+                              attrs={'dtype': o.dtype})
 
-        out_var = self._parent_block().create_var(
-            name=tmp_o.name,
-            shape=[self.seq_len] + list(tmp_o.shape),
-            dtype=tmp_o.dtype)
+        out_var = self._parent_block().create_var(name=tmp_o.name,
+                                                  shape=[self.seq_len] +
+                                                  list(tmp_o.shape),
+                                                  dtype=tmp_o.dtype)
 
         self.outputs.append(out_var)
 
@@ -920,32 +925,33 @@ def _complete_op(self):
             assert isinstance(mem_var, Variable)
             new_mem = self.helper.create_variable_for_type_inference(
                 dtype=mem_var.dtype)
-            rnn_block.append_op(
-                type='rnn_memory_helper',
-                inputs={'X': [mem_var]},
-                outputs={'Out': [new_mem]},
-                attrs={'dtype': mem_var.dtype})
+            rnn_block.append_op(type='rnn_memory_helper',
+                                inputs={'X': [mem_var]},
+                                outputs={'Out': [new_mem]},
+                                attrs={'dtype': mem_var.dtype})
 
             memories.append(new_mem.name)
 
-        parent_block.append_op(
-            type='recurrent',
-            inputs={
-                'inputs': inlinks,
-                'initial_states': boot_memories,
-                'parameters': parameters
-            },
-            outputs={'outputs': outlinks,
-                     'step_scopes': [step_scope]},
-            attrs={
-                'has_states': len(pre_memories) > 0,
-                'ex_states': pre_memories,
-                'states': memories,
-                'sub_block': rnn_block
-            })
+        parent_block.append_op(type='recurrent',
+                               inputs={
+                                   'inputs': inlinks,
+                                   'initial_states': boot_memories,
+                                   'parameters': parameters
+                               },
+                               outputs={
+                                   'outputs': outlinks,
+                                   'step_scopes': [step_scope]
+                               },
+                               attrs={
+                                   'has_states': len(pre_memories) > 0,
+                                   'ex_states': pre_memories,
+                                   'states': memories,
+                                   'sub_block': rnn_block
+                               })
 
 
 class WhileGuard(BlockGuard):
+
     def __init__(self, while_op):
         if not isinstance(while_op, While):
             raise TypeError("WhileGuard takes a while op")
@@ -1114,8 +1120,8 @@ def block(self):
     def _complete(self):
         main_program = self.helper.main_program
         while_block = main_program.current_block()
-        parent_block = main_program.block(main_program.current_block()
-                                          .parent_idx)
+        parent_block = main_program.block(
+            main_program.current_block().parent_idx)
 
         inner_outputs = {self.cond_var.name}
         x_name_list = set()
@@ -1134,16 +1140,18 @@ def _complete(self):
         parent_block.append_op(
             type='while',
             inputs={
-                'X': [
-                    parent_block._var_recursive(x_name)
-                    for x_name in x_name_list
-                ],
+                'X':
+                [parent_block._var_recursive(x_name) for x_name in x_name_list],
                 'Condition': [self.cond_var]
             },
-            outputs={'Out': out_vars,
-                     'StepScopes': [step_scope]},
-            attrs={'sub_block': while_block,
-                   "is_test": self.is_test})
+            outputs={
+                'Out': out_vars,
+                'StepScopes': [step_scope]
+            },
+            attrs={
+                'sub_block': while_block,
+                "is_test": self.is_test
+            })
 
 
 def assign_skip_lod_tensor_array(input, output):
@@ -1156,8 +1164,8 @@ def assign_skip_lod_tensor_array(input, output):
 
     if input.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         main_program = input.block.program
-        parent_block = main_program.block(main_program.current_block()
-                                          .parent_idx)
+        parent_block = main_program.block(
+            main_program.current_block().parent_idx)
         if parent_block and not parent_block._find_var_recursive(input.name):
             assign(input, output)
     else:
@@ -1260,9 +1268,9 @@ def body(i, ten):
         try:
             assert_same_structure(output_vars, loop_vars, check_types=False)
         except ValueError as e:
-            raise ValueError("body in while_loop should return the same arity "
-                             "(length and structure) as loop_vars: {0}".format(
-                                 e))
+            raise ValueError(
+                "body in while_loop should return the same arity "
+                "(length and structure) as loop_vars: {0}".format(e))
         now_cond = cond(*output_vars)
         map_structure(assign_skip_lod_tensor_array, output_vars, loop_vars)
         assign(now_cond, pre_cond)
@@ -1324,14 +1332,12 @@ def lod_rank_table(x, level=0):
                        'lod_rank_table')
 
     helper = LayerHelper("lod_rank_table", **locals())
-    table = helper.create_variable(
-        type=core.VarDesc.VarType.LOD_RANK_TABLE,
-        name=unique_name.generate("lod_rank_table"))
-    helper.append_op(
-        type='lod_rank_table',
-        inputs={'X': x},
-        outputs={'Out': table},
-        attrs={'level': level})
+    table = helper.create_variable(type=core.VarDesc.VarType.LOD_RANK_TABLE,
+                                   name=unique_name.generate("lod_rank_table"))
+    helper.append_op(type='lod_rank_table',
+                     inputs={'X': x},
+                     outputs={'Out': table},
+                     attrs={'level': level})
     return table
 
 
@@ -1354,10 +1360,9 @@ def max_sequence_len(rank_table):
     """
     helper = LayerHelper("max_seqence_len", **locals())
     res = helper.create_variable_for_type_inference(dtype="int64")
-    helper.append_op(
-        type="max_sequence_len",
-        inputs={"RankTable": rank_table},
-        outputs={"Out": res})
+    helper.append_op(type="max_sequence_len",
+                     inputs={"RankTable": rank_table},
+                     outputs={"Out": res})
     return res
 
 
@@ -1405,11 +1410,12 @@ def lod_tensor_to_array(x, table):
         name=unique_name.generate("lod_tensor_to_array"),
         type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
         dtype=x.dtype)
-    helper.append_op(
-        type='lod_tensor_to_array',
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': array})
+    helper.append_op(type='lod_tensor_to_array',
+                     inputs={
+                         'X': x,
+                         'RankTable': table
+                     },
+                     outputs={'Out': array})
     return array
 
 
@@ -1448,11 +1454,12 @@ def array_to_lod_tensor(x, table):
 
     helper = LayerHelper("array_to_lod_tensor", **locals())
     tmp = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="array_to_lod_tensor",
-        inputs={'X': x,
-                'RankTable': table},
-        outputs={'Out': tmp})
+    helper.append_op(type="array_to_lod_tensor",
+                     inputs={
+                         'X': x,
+                         'RankTable': table
+                     },
+                     outputs={'Out': tmp})
     return tmp
 
 
@@ -1484,11 +1491,10 @@ def increment(x, value=1.0, in_place=True):
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = x
-    helper.append_op(
-        type='increment',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'step': float(value)})
+    helper.append_op(type='increment',
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]},
+                     attrs={'step': float(value)})
     return out
 
 
@@ -1572,8 +1578,8 @@ def array_write(x, i, array=None):
     helper = LayerHelper('array_write', **locals())
     if array is not None:
         if not isinstance(
-                array,
-                Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+                array, Variable
+        ) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             raise TypeError(
                 "array should be tensor array vairable in array_write Op")
     if array is None:
@@ -1581,11 +1587,12 @@ def array_write(x, i, array=None):
             name="{0}.out".format(helper.name),
             type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
             dtype=x.dtype)
-    helper.append_op(
-        type='write_to_array',
-        inputs={'X': [x],
-                'I': [i]},
-        outputs={'Out': [array]})
+    helper.append_op(type='write_to_array',
+                     inputs={
+                         'X': [x],
+                         'I': [i]
+                     },
+                     outputs={'Out': [array]})
     return array
 
 
@@ -1616,16 +1623,16 @@ def create_array(dtype, initialized_list=None):
     if initialized_list is not None:
         if not isinstance(initialized_list, (list, tuple)):
             raise TypeError(
-                "Require type(initialized_list) should be list/tuple, but received {}".
-                format(type(initialized_list)))
+                "Require type(initialized_list) should be list/tuple, but received {}"
+                .format(type(initialized_list)))
         array = list(initialized_list)
 
     # NOTE: Only support plain list like [x, y,...], not support nested list in static mode.
     for val in array:
         if not isinstance(val, Variable):
             raise TypeError(
-                "All values in `initialized_list` should be Variable, but recevied {}.".
-                format(type(val)))
+                "All values in `initialized_list` should be Variable, but recevied {}."
+                .format(type(val)))
 
     if _non_static_mode():
         return array
@@ -1689,12 +1696,13 @@ def less_than(x, y, force_cpu=None, cond=None, name=None):
     if force_cpu is not None:
         attrs['force_cpu'] = force_cpu
 
-    helper.append_op(
-        type='less_than',
-        inputs={'X': [x],
-                'Y': [y]},
-        outputs={'Out': [cond]},
-        attrs=attrs)
+    helper.append_op(type='less_than',
+                     inputs={
+                         'X': [x],
+                         'Y': [y]
+                     },
+                     outputs={'Out': [cond]},
+                     attrs=attrs)
     return cond
 
 
@@ -1743,12 +1751,13 @@ def less_equal(x, y, cond=None, name=None):
 
     attrs = dict()
 
-    helper.append_op(
-        type='less_equal',
-        inputs={'X': [x],
-                'Y': [y]},
-        outputs={'Out': [cond]},
-        attrs=attrs)
+    helper.append_op(type='less_equal',
+                     inputs={
+                         'X': [x],
+                         'Y': [y]
+                     },
+                     outputs={'Out': [cond]},
+                     attrs=attrs)
     return cond
 
 
@@ -1799,12 +1808,13 @@ def greater_than(x, y, cond=None, name=None):
     if in_dygraph_mode():
         return _C_ops.final_state_greater_than(x, y, -1)
     else:
-        helper.append_op(
-            type='greater_than',
-            inputs={'X': [x],
-                    'Y': [y]},
-            outputs={'Out': [cond]},
-            attrs=attrs)
+        helper.append_op(type='greater_than',
+                         inputs={
+                             'X': [x],
+                             'Y': [y]
+                         },
+                         outputs={'Out': [cond]},
+                         attrs=attrs)
         return cond
 
 
@@ -1854,12 +1864,13 @@ def greater_equal(x, y, cond=None, name=None):
 
     attrs = dict()
 
-    helper.append_op(
-        type='greater_equal',
-        inputs={'X': [x],
-                'Y': [y]},
-        outputs={'Out': [cond]},
-        attrs=attrs)
+    helper.append_op(type='greater_equal',
+                     inputs={
+                         'X': [x],
+                         'Y': [y]
+                     },
+                     outputs={'Out': [cond]},
+                     attrs=attrs)
     return cond
 
 
@@ -1904,9 +1915,12 @@ def equal(x, y, cond=None, name=None):
         cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
 
-    helper.append_op(
-        type='equal', inputs={'X': [x],
-                              'Y': [y]}, outputs={'Out': [cond]})
+    helper.append_op(type='equal',
+                     inputs={
+                         'X': [x],
+                         'Y': [y]
+                     },
+                     outputs={'Out': [cond]})
     return cond
 
 
@@ -1950,9 +1964,12 @@ def not_equal(x, y, cond=None, name=None):
         cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
 
-    helper.append_op(
-        type='not_equal', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [cond]})
+    helper.append_op(type='not_equal',
+                     inputs={
+                         'X': [x],
+                         'Y': [y]
+                     },
+                     outputs={'Out': [cond]})
     return cond
 
 
@@ -2037,11 +2054,12 @@ def array_read(array, i):
             Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         raise TypeError("array should be tensor array vairable")
     out = helper.create_variable_for_type_inference(dtype=array.dtype)
-    helper.append_op(
-        type='read_from_array',
-        inputs={'X': [array],
-                'I': [i]},
-        outputs={'Out': [out]})
+    helper.append_op(type='read_from_array',
+                     inputs={
+                         'X': [array],
+                         'I': [i]
+                     },
+                     outputs={'Out': [out]})
     return out
 
 
@@ -2075,13 +2093,14 @@ def shrink_memory(x, i, table):
     check_type(i, 'i', Variable, 'shrink_memory')
     check_type(table, 'table', Variable, 'shrink_memory')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='shrink_rnn_memory',
-        inputs={'X': [x],
-                'I': [i],
-                'RankTable': [table]},
-        outputs={'Out': [out]},
-        attrs={})
+    helper.append_op(type='shrink_rnn_memory',
+                     inputs={
+                         'X': [x],
+                         'I': [i],
+                         'RankTable': [table]
+                     },
+                     outputs={'Out': [out]},
+                     attrs={})
     return out
 
 
@@ -2146,8 +2165,9 @@ def array_length(array):
     helper = LayerHelper('array_length', **locals())
     tmp = helper.create_variable_for_type_inference(dtype='int64')
     tmp.stop_gradient = True
-    helper.append_op(
-        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
+    helper.append_op(type='lod_array_length',
+                     inputs={'X': [array]},
+                     outputs={'Out': [tmp]})
     return tmp
 
 
@@ -2169,8 +2189,8 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.block.complete()
-        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
-                                                           exc_tb)
+        return super(ConditionalBlockGuard,
+                     self).__exit__(exc_type, exc_val, exc_tb)
 
 
 class ConditionalBlock(object):
@@ -2216,8 +2236,10 @@ def complete(self):
 
         intermediate = set()
         params = set()
-        params, intermediate = get_inputs_outputs_in_block(
-            inside_block, params, intermediate, helper=self.helper)
+        params, intermediate = get_inputs_outputs_in_block(inside_block,
+                                                           params,
+                                                           intermediate,
+                                                           helper=self.helper)
 
         # Todo(liym27) Here assume that all params are in recursive parent block
         # but when minimize() called in control flow, some params may be in
@@ -2240,8 +2262,10 @@ def complete(self):
                 'Cond': self.inputs,
                 'Input': param_list,
             },
-            outputs={'Out': out_list,
-                     'Scope': [step_scope]},
+            outputs={
+                'Out': out_list,
+                'Scope': [step_scope]
+            },
             attrs={
                 'sub_block': inside_block,
                 'is_scalar_condition': self.is_scalar_condition
@@ -2299,8 +2323,8 @@ def append_conditional_block_grad(self, parent_block, inside_block,
                 param_list.append(cpt.to_text(inner_var.name))
 
         grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
-            conditional_block_op.desc,
-            cpt.to_text(set()), [grad_sub_block.desc])
+            conditional_block_op.desc, cpt.to_text(set()),
+            [grad_sub_block.desc])
 
         # append op_desc in grad_op_descs to target_block
         op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
@@ -2315,9 +2339,8 @@ def append_conditional_block_grad(self, parent_block, inside_block,
 
         new_vars = set()
         for grad_var_name in new_op_desc.output_arg_names():
-            if grad_sub_block.desc.has_var_recursive(
-                    cpt.to_bytes(grad_var_name)
-            ) or grad_var_name == core.empty_var_name():
+            if grad_sub_block.desc.has_var_recursive(cpt.to_bytes(
+                    grad_var_name)) or grad_var_name == core.empty_var_name():
                 continue
             grad_sub_block.desc.var(cpt.to_bytes(grad_var_name))
             new_vars.add(grad_var_name)
@@ -2347,8 +2370,9 @@ def copy_var_to_parent_block(var, layer_helper):
             and parent_block._find_var_recursive(var.name):
         parent_block_var = var
     else:
-        parent_block_var = parent_block.create_var(
-            dtype=var.dtype, shape=var.shape, type=var.type)
+        parent_block_var = parent_block.create_var(dtype=var.dtype,
+                                                   shape=var.shape,
+                                                   type=var.type)
         assign(var, parent_block_var)
     return parent_block_var
 
@@ -2464,8 +2488,8 @@ def false_func():
             if false_fn is not None:
                 if not callable(false_fn):
                     raise TypeError(
-                        "The false_fn in cond must be callable, but received {}".
-                        format(type(false_fn).__name__))
+                        "The false_fn in cond must be callable, but received {}"
+                        .format(type(false_fn).__name__))
                 return false_fn()
         return None
 
@@ -2491,8 +2515,8 @@ def false_func():
             raise TypeError(
                 "The false_fn in cond must be callable, but received {}".format(
                     type(false_fn).__name__))
-        false_cond_block = ConditionalBlock(
-            [logical_not(pred)], is_scalar_condition=True)
+        false_cond_block = ConditionalBlock([logical_not(pred)],
+                                            is_scalar_condition=True)
         with false_cond_block.block():
             origin_false_output = false_fn()
             if origin_false_output is not None:
@@ -2520,7 +2544,8 @@ def false_func():
             format(e))
 
     mask = cast(pred, dtype='int32')
-    merge_func = lambda false_var, true_var : select_input_with_buildin_type([false_var, true_var], mask)
+    merge_func = lambda false_var, true_var: select_input_with_buildin_type(
+        [false_var, true_var], mask)
     merged_output = map_structure(merge_func, false_output, true_output)
     return merged_output
 
@@ -2618,7 +2643,8 @@ def _case_check_args(pred_fn_pairs, default):
             if len(pred_fn) != 2:
                 raise TypeError(
                     _error_message("The tuple's size", "pred_fn_pairs", "case",
-                                   "2", str(len(pred_fn)) + "-tuple"))
+                                   "2",
+                                   str(len(pred_fn)) + "-tuple"))
             pred, fn = pred_fn
 
             if not isinstance(pred, Variable):
@@ -2741,12 +2767,11 @@ def case(self, condition):
         else:
             pre_cond_num = len(self.pre_not_conditions)
             pre_not_cond = self.pre_not_conditions[pre_cond_num - 1]
-            new_not_cond = logical_and(
-                x=pre_not_cond, y=logical_not(x=condition))
+            new_not_cond = logical_and(x=pre_not_cond,
+                                       y=logical_not(x=condition))
             self.pre_not_conditions.append(new_not_cond)
             cond_block = ConditionalBlock(
-                [logical_and(
-                    x=pre_not_cond, y=condition)],
+                [logical_and(x=pre_not_cond, y=condition)],
                 is_scalar_condition=True)
 
         return ConditionalBlockGuard(cond_block)
@@ -2777,6 +2802,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 
 class IfElseBlockGuard(object):
+
     def __init__(self, is_true, ifelse):
         if not isinstance(ifelse, IfElse):
             raise TypeError("ifelse must be an instance of IfElse class")
@@ -2913,15 +2939,16 @@ def input(self, x):
                 name=unique_name.generate_with_ignorable_key('ifelse_input' +
                                                              self.helper.name),
                 dtype=x.dtype)
-            parent_block.append_op(
-                type='split_lod_tensor',
-                inputs={
-                    'X': x,
-                    'Mask': self.cond,
-                },
-                outputs={'OutTrue': out_true,
-                         'OutFalse': out_false},
-                attrs={'level': 0})
+            parent_block.append_op(type='split_lod_tensor',
+                                   inputs={
+                                       'X': x,
+                                       'Mask': self.cond,
+                                   },
+                                   outputs={
+                                       'OutTrue': out_true,
+                                       'OutFalse': out_false
+                                   },
+                                   attrs={'level': 0})
             self.input_table[id(x)] = (out_true, out_false)
         else:
             out_true, out_false = self.input_table[id(x)]
@@ -2978,12 +3005,11 @@ def __call__(self):
         rlist = []
         for false_var, true_var in zip(*self.output_table):
             rlist.append(
-                merge_lod_tensor(
-                    in_true=true_var,
-                    in_false=false_var,
-                    mask=self.cond,
-                    x=self.cond,
-                    level=0))
+                merge_lod_tensor(in_true=true_var,
+                                 in_false=false_var,
+                                 mask=self.cond,
+                                 x=self.cond,
+                                 level=0))
         return rlist
 
 
@@ -3173,37 +3199,37 @@ def step_input(self, x, level=0):
                 name=unique_name.generate('lod_rank_table'),
                 type=core.VarDesc.VarType.LOD_RANK_TABLE)
             self.lod_rank_table.stop_gradient = True
-            parent_block.append_op(
-                type='lod_rank_table',
-                inputs={"X": x},
-                outputs={"Out": self.lod_rank_table},
-                attrs={"level": level})
+            parent_block.append_op(type='lod_rank_table',
+                                   inputs={"X": x},
+                                   outputs={"Out": self.lod_rank_table},
+                                   attrs={"level": level})
             self.max_seq_len = parent_block.create_var(
                 name=unique_name.generate('dynamic_rnn_max_seq_len'),
                 dtype='int64')
             self.max_seq_len.stop_gradient = False
-            parent_block.append_op(
-                type='max_sequence_len',
-                inputs={'RankTable': self.lod_rank_table},
-                outputs={"Out": self.max_seq_len})
+            parent_block.append_op(type='max_sequence_len',
+                                   inputs={'RankTable': self.lod_rank_table},
+                                   outputs={"Out": self.max_seq_len})
             self.cond.stop_gradient = True
-            parent_block.append_op(
-                type='less_than',
-                inputs={'X': self.step_idx,
-                        'Y': self.max_seq_len},
-                outputs={'Out': self.cond},
-                attrs={'force_cpu': True})
+            parent_block.append_op(type='less_than',
+                                   inputs={
+                                       'X': self.step_idx,
+                                       'Y': self.max_seq_len
+                                   },
+                                   outputs={'Out': self.cond},
+                                   attrs={'force_cpu': True})
 
         input_array = parent_block.create_var(
             name=unique_name.generate('dynamic_rnn_input_array'),
             type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
             dtype=x.dtype)
         self.input_array.append((input_array, x.dtype))
-        parent_block.append_op(
-            type='lod_tensor_to_array',
-            inputs={'X': x,
-                    'RankTable': self.lod_rank_table},
-            outputs={'Out': input_array})
+        parent_block.append_op(type='lod_tensor_to_array',
+                               inputs={
+                                   'X': x,
+                                   'RankTable': self.lod_rank_table
+                               },
+                               outputs={'Out': input_array})
         return array_read(array=input_array, i=self.step_idx)
 
     def static_input(self, x):
@@ -3342,11 +3368,12 @@ def static_input(self, x):
             name=unique_name.generate("dynamic_rnn_static_input_reordered"),
             type=core.VarDesc.VarType.LOD_TENSOR,
             dtype=x.dtype)
-        parent_block.append_op(
-            type='reorder_lod_tensor_by_rank',
-            inputs={'X': [x],
-                    'RankTable': [self.lod_rank_table]},
-            outputs={'Out': [x_reordered]})
+        parent_block.append_op(type='reorder_lod_tensor_by_rank',
+                               inputs={
+                                   'X': [x],
+                                   'RankTable': [self.lod_rank_table]
+                               },
+                               outputs={'Out': [x_reordered]})
         return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)
 
     @signature_safe_contextmanager
@@ -3361,8 +3388,10 @@ def block(self):
         """
         if self.status != DynamicRNN.BEFORE_RNN:
             raise ValueError("rnn.block() can only be invoke once")
-        self.step_idx = fill_constant(
-            shape=[1], dtype='int64', value=0, force_cpu=True)
+        self.step_idx = fill_constant(shape=[1],
+                                      dtype='int64',
+                                      value=0,
+                                      force_cpu=True)
         self.step_idx.stop_gradient = False
         self.status = DynamicRNN.IN_RNN
         with self.while_op.block():
@@ -3372,17 +3401,15 @@ def block(self):
             for new_mem, mem_array in self.mem_link:
                 array_write(x=new_mem, i=self.step_idx, array=mem_array)
 
-            less_than(
-                x=self.step_idx,
-                y=self.max_seq_len,
-                force_cpu=True,
-                cond=self.cond)
+            less_than(x=self.step_idx,
+                      y=self.max_seq_len,
+                      force_cpu=True,
+                      cond=self.cond)
 
         self.status = DynamicRNN.AFTER_RNN
         for each_array in self.output_array:
             self.outputs.append(
-                array_to_lod_tensor(
-                    x=each_array, table=self.lod_rank_table))
+                array_to_lod_tensor(x=each_array, table=self.lod_rank_table))
 
     def __call__(self, *args, **kwargs):
         """
@@ -3516,26 +3543,27 @@ def memory(self,
                     name=unique_name.generate('dynamic_rnn_mem_init_reordered'),
                     type=core.VarDesc.VarType.LOD_TENSOR,
                     dtype=init.dtype)
-                parent_block.append_op(
-                    type='reorder_lod_tensor_by_rank',
-                    inputs={
-                        'X': [init_tensor],
-                        'RankTable': [self.lod_rank_table]
-                    },
-                    outputs={'Out': [init_reordered]})
+                parent_block.append_op(type='reorder_lod_tensor_by_rank',
+                                       inputs={
+                                           'X': [init_tensor],
+                                           'RankTable': [self.lod_rank_table]
+                                       },
+                                       outputs={'Out': [init_reordered]})
                 init_tensor = init_reordered
             mem_array = parent_block.create_var(
                 name=unique_name.generate('dynamic_rnn_mem_array'),
                 type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
                 dtype=init.dtype)
-            parent_block.append_op(
-                type='write_to_array',
-                inputs={'X': init_tensor,
-                        'I': self.zero_idx},
-                outputs={'Out': mem_array})
+            parent_block.append_op(type='write_to_array',
+                                   inputs={
+                                       'X': init_tensor,
+                                       'I': self.zero_idx
+                                   },
+                                   outputs={'Out': mem_array})
             retv = array_read(array=mem_array, i=self.step_idx)
-            retv = shrink_memory(
-                x=retv, i=self.step_idx, table=self.lod_rank_table)
+            retv = shrink_memory(x=retv,
+                                 i=self.step_idx,
+                                 table=self.lod_rank_table)
             self.mem_dict[retv.name] = mem_array
             return retv
         else:
@@ -3547,22 +3575,22 @@ def memory(self,
             init = parent_block.create_var(
                 name=unique_name.generate('mem_init'), dtype=dtype)
             arr, dtype = self.input_array[0]
-            in0 = parent_block.create_var(
-                name=unique_name.generate('in0'), dtype=dtype)
-            parent_block.append_op(
-                type='read_from_array',
-                inputs={'X': [arr],
-                        'I': [self.zero_idx]},
-                outputs={'Out': [in0]})
-            parent_block.append_op(
-                type='fill_constant_batch_size_like',
-                inputs={'Input': [in0]},
-                outputs={'Out': [init]},
-                attrs={
-                    'shape': [-1] + shape,
-                    'value': float(value),
-                    'dtype': init.dtype
-                })
+            in0 = parent_block.create_var(name=unique_name.generate('in0'),
+                                          dtype=dtype)
+            parent_block.append_op(type='read_from_array',
+                                   inputs={
+                                       'X': [arr],
+                                       'I': [self.zero_idx]
+                                   },
+                                   outputs={'Out': [in0]})
+            parent_block.append_op(type='fill_constant_batch_size_like',
+                                   inputs={'Input': [in0]},
+                                   outputs={'Out': [init]},
+                                   attrs={
+                                       'shape': [-1] + shape,
+                                       'value': float(value),
+                                       'dtype': init.dtype
+                                   })
             return self.memory(init=init)
 
     def update_memory(self, ex_mem, new_mem):
@@ -3629,16 +3657,15 @@ def _init_zero_idx_(self):
             parent_block = self._parent_block_()
             self.zero_idx = parent_block.create_var(
                 name=unique_name.generate('zero_idx'), dtype='int64')
-            parent_block.append_op(
-                type='fill_constant',
-                inputs={},
-                outputs={'Out': [self.zero_idx]},
-                attrs={
-                    'shape': [1],
-                    'dtype': self.zero_idx.dtype,
-                    'value': float(0),
-                    'force_cpu': True
-                })
+            parent_block.append_op(type='fill_constant',
+                                   inputs={},
+                                   outputs={'Out': [self.zero_idx]},
+                                   attrs={
+                                       'shape': [1],
+                                       'dtype': self.zero_idx.dtype,
+                                       'value': float(0),
+                                       'force_cpu': True
+                                   })
 
     def _parent_block_(self):
         prog = self.helper.main_program
@@ -3650,8 +3677,8 @@ def _parent_block_(self):
 
     def _assert_in_rnn_block_(self, method):
         if self.status != DynamicRNN.IN_RNN:
-            raise ValueError("{0} can only be invoked inside rnn block.".format(
-                method))
+            raise ValueError(
+                "{0} can only be invoked inside rnn block.".format(method))
 
 
 def switch_case(branch_index, branch_fns, default=None, name=None):
@@ -3764,16 +3791,16 @@ def _check_args(branch_index, branch_fns, default):
 
             if key in keys_of_fns:
                 raise ValueError(
-                    "The key in 'branch_fns' must be unique, but '{}' appears more than once.".
-                    format(key))
+                    "The key in 'branch_fns' must be unique, but '{}' appears more than once."
+                    .format(key))
             else:
                 keys_of_fns.append(key)
 
             if not callable(fn):
                 raise TypeError(
-                    _error_message("The type of function for key {}".format(
-                        key), "branch_fns", "switch_case", "callable", type(
-                            fn)))
+                    _error_message(
+                        "The type of function for key {}".format(key),
+                        "branch_fns", "switch_case", "callable", type(fn)))
 
         if default is None:
             default = sorted(branch_fns)[-1][1]
@@ -3832,11 +3859,12 @@ def reorder_lod_tensor_by_rank(x, rank_table):
     helper = LayerHelper('reorder_lod_tensor_by_rank', **locals())
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='reorder_lod_tensor_by_rank',
-        inputs={'X': [x],
-                'RankTable': [rank_table]},
-        outputs={'Out': [out]})
+    helper.append_op(type='reorder_lod_tensor_by_rank',
+                     inputs={
+                         'X': [x],
+                         'RankTable': [rank_table]
+                     },
+                     outputs={'Out': [out]})
     return out
 
 
@@ -3882,6 +3910,7 @@ def is_empty(x, name=None):
     helper = LayerHelper("is_empty", **locals())
     cond = helper.create_variable_for_type_inference(dtype='bool')
     cond.stop_gradient = True
-    helper.append_op(
-        type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]})
+    helper.append_op(type='is_empty',
+                     inputs={'X': [x]},
+                     outputs={'Out': [cond]})
     return cond
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 75b2b26fb9dfc..f89c95b93a1d3 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -272,27 +272,26 @@ def retinanet_target_assign(bbox_pred,
     bbox_inside_weight = helper.create_variable_for_type_inference(
         dtype=anchor_box.dtype)
     fg_num = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type="retinanet_target_assign",
-        inputs={
-            'Anchor': anchor_box,
-            'GtBoxes': gt_boxes,
-            'GtLabels': gt_labels,
-            'IsCrowd': is_crowd,
-            'ImInfo': im_info
-        },
-        outputs={
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': target_label,
-            'TargetBBox': target_bbox,
-            'BBoxInsideWeight': bbox_inside_weight,
-            'ForegroundNumber': fg_num
-        },
-        attrs={
-            'positive_overlap': positive_overlap,
-            'negative_overlap': negative_overlap
-        })
+    helper.append_op(type="retinanet_target_assign",
+                     inputs={
+                         'Anchor': anchor_box,
+                         'GtBoxes': gt_boxes,
+                         'GtLabels': gt_labels,
+                         'IsCrowd': is_crowd,
+                         'ImInfo': im_info
+                     },
+                     outputs={
+                         'LocationIndex': loc_index,
+                         'ScoreIndex': score_index,
+                         'TargetLabel': target_label,
+                         'TargetBBox': target_bbox,
+                         'BBoxInsideWeight': bbox_inside_weight,
+                         'ForegroundNumber': fg_num
+                     },
+                     attrs={
+                         'positive_overlap': positive_overlap,
+                         'negative_overlap': negative_overlap
+                     })
 
     loc_index.stop_gradient = True
     score_index.stop_gradient = True
@@ -434,29 +433,28 @@ def rpn_target_assign(bbox_pred,
         dtype=anchor_box.dtype)
     bbox_inside_weight = helper.create_variable_for_type_inference(
         dtype=anchor_box.dtype)
-    helper.append_op(
-        type="rpn_target_assign",
-        inputs={
-            'Anchor': anchor_box,
-            'GtBoxes': gt_boxes,
-            'IsCrowd': is_crowd,
-            'ImInfo': im_info
-        },
-        outputs={
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': target_label,
-            'TargetBBox': target_bbox,
-            'BBoxInsideWeight': bbox_inside_weight
-        },
-        attrs={
-            'rpn_batch_size_per_im': rpn_batch_size_per_im,
-            'rpn_straddle_thresh': rpn_straddle_thresh,
-            'rpn_positive_overlap': rpn_positive_overlap,
-            'rpn_negative_overlap': rpn_negative_overlap,
-            'rpn_fg_fraction': rpn_fg_fraction,
-            'use_random': use_random
-        })
+    helper.append_op(type="rpn_target_assign",
+                     inputs={
+                         'Anchor': anchor_box,
+                         'GtBoxes': gt_boxes,
+                         'IsCrowd': is_crowd,
+                         'ImInfo': im_info
+                     },
+                     outputs={
+                         'LocationIndex': loc_index,
+                         'ScoreIndex': score_index,
+                         'TargetLabel': target_label,
+                         'TargetBBox': target_bbox,
+                         'BBoxInsideWeight': bbox_inside_weight
+                     },
+                     attrs={
+                         'rpn_batch_size_per_im': rpn_batch_size_per_im,
+                         'rpn_straddle_thresh': rpn_straddle_thresh,
+                         'rpn_positive_overlap': rpn_positive_overlap,
+                         'rpn_negative_overlap': rpn_negative_overlap,
+                         'rpn_fg_fraction': rpn_fg_fraction,
+                         'use_random': use_random
+                     })
 
     loc_index.stop_gradient = True
     score_index.stop_gradient = True
@@ -608,14 +606,17 @@ def build_model(mode='train'):
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="sigmoid_focal_loss",
-        inputs={"X": x,
-                "Label": label,
-                "FgNum": fg_num},
-        attrs={"gamma": gamma,
-               'alpha': alpha},
-        outputs={"Out": out})
+    helper.append_op(type="sigmoid_focal_loss",
+                     inputs={
+                         "X": x,
+                         "Label": label,
+                         "FgNum": fg_num
+                     },
+                     attrs={
+                         "gamma": gamma,
+                         'alpha': alpha
+                     },
+                     outputs={"Out": out})
     return out
 
 
@@ -714,11 +715,10 @@ class number, M is number of bounding boxes.
                                        return_index=True)
     """
     helper = LayerHelper("detection_output", **locals())
-    decoded_box = box_coder(
-        prior_box=prior_box,
-        prior_box_var=prior_box_var,
-        target_box=loc,
-        code_type='decode_center_size')
+    decoded_box = box_coder(prior_box=prior_box,
+                            prior_box_var=prior_box_var,
+                            target_box=loc,
+                            code_type='decode_center_size')
     scores = nn.softmax(input=scores)
     scores = nn.transpose(scores, perm=[0, 2, 1])
     scores.stop_gradient = True
@@ -726,35 +726,39 @@ class number, M is number of bounding boxes.
         dtype=decoded_box.dtype)
     if return_index:
         index = helper.create_variable_for_type_inference(dtype='int')
-        helper.append_op(
-            type="multiclass_nms2",
-            inputs={'Scores': scores,
-                    'BBoxes': decoded_box},
-            outputs={'Out': nmsed_outs,
-                     'Index': index},
-            attrs={
-                'background_label': 0,
-                'nms_threshold': nms_threshold,
-                'nms_top_k': nms_top_k,
-                'keep_top_k': keep_top_k,
-                'score_threshold': score_threshold,
-                'nms_eta': 1.0,
-            })
+        helper.append_op(type="multiclass_nms2",
+                         inputs={
+                             'Scores': scores,
+                             'BBoxes': decoded_box
+                         },
+                         outputs={
+                             'Out': nmsed_outs,
+                             'Index': index
+                         },
+                         attrs={
+                             'background_label': 0,
+                             'nms_threshold': nms_threshold,
+                             'nms_top_k': nms_top_k,
+                             'keep_top_k': keep_top_k,
+                             'score_threshold': score_threshold,
+                             'nms_eta': 1.0,
+                         })
         index.stop_gradient = True
     else:
-        helper.append_op(
-            type="multiclass_nms",
-            inputs={'Scores': scores,
-                    'BBoxes': decoded_box},
-            outputs={'Out': nmsed_outs},
-            attrs={
-                'background_label': 0,
-                'nms_threshold': nms_threshold,
-                'nms_top_k': nms_top_k,
-                'keep_top_k': keep_top_k,
-                'score_threshold': score_threshold,
-                'nms_eta': 1.0,
-            })
+        helper.append_op(type="multiclass_nms",
+                         inputs={
+                             'Scores': scores,
+                             'BBoxes': decoded_box
+                         },
+                         outputs={'Out': nmsed_outs},
+                         attrs={
+                             'background_label': 0,
+                             'nms_threshold': nms_threshold,
+                             'nms_top_k': nms_top_k,
+                             'keep_top_k': keep_top_k,
+                             'score_threshold': score_threshold,
+                             'nms_eta': 1.0,
+                         })
     nmsed_outs.stop_gradient = True
     if return_index:
         return nmsed_outs, index
@@ -806,12 +810,13 @@ def iou_similarity(x, y, box_normalized=True, name=None):
     helper = LayerHelper("iou_similarity", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="iou_similarity",
-        inputs={"X": x,
-                "Y": y},
-        attrs={"box_normalized": box_normalized},
-        outputs={"Out": out})
+    helper.append_op(type="iou_similarity",
+                     inputs={
+                         "X": x,
+                         "Y": y
+                     },
+                     attrs={"box_normalized": box_normalized},
+                     outputs={"Out": out})
     return out
 
 
@@ -958,11 +963,10 @@ def box_coder(prior_box,
         attrs['variance'] = prior_box_var
     else:
         raise TypeError("Input variance of box_coder must be Variable or lisz")
-    helper.append_op(
-        type="box_coder",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"OutputBox": output_box})
+    helper.append_op(type="box_coder",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={"OutputBox": output_box})
     return output_box
 
 
@@ -992,11 +996,10 @@ def polygon_box_transform(input, name=None):
     helper = LayerHelper("polygon_box_transform", **locals())
     output = helper.create_variable_for_type_inference(dtype=input.dtype)
 
-    helper.append_op(
-        type="polygon_box_transform",
-        inputs={"Input": input},
-        attrs={},
-        outputs={"Output": output})
+    helper.append_op(type="polygon_box_transform",
+                     inputs={"Input": input},
+                     attrs={},
+                     outputs={"Output": output})
     return output
 
 
@@ -1125,15 +1128,14 @@ def yolov3_loss(x,
         "scale_x_y": scale_x_y,
     }
 
-    helper.append_op(
-        type='yolov3_loss',
-        inputs=inputs,
-        outputs={
-            'Loss': loss,
-            'ObjectnessMask': objectness_mask,
-            'GTMatchMask': gt_match_mask
-        },
-        attrs=attrs)
+    helper.append_op(type='yolov3_loss',
+                     inputs=inputs,
+                     outputs={
+                         'Loss': loss,
+                         'ObjectnessMask': objectness_mask,
+                         'GTMatchMask': gt_match_mask
+                     },
+                     attrs=attrs)
     return loss
 
 
@@ -1220,17 +1222,16 @@ def yolo_box(x,
         "iou_aware_factor": iou_aware_factor
     }
 
-    helper.append_op(
-        type='yolo_box',
-        inputs={
-            "X": x,
-            "ImgSize": img_size,
-        },
-        outputs={
-            'Boxes': boxes,
-            'Scores': scores,
-        },
-        attrs=attrs)
+    helper.append_op(type='yolo_box',
+                     inputs={
+                         "X": x,
+                         "ImgSize": img_size,
+                     },
+                     outputs={
+                         'Boxes': boxes,
+                         'Scores': scores,
+                     },
+                     attrs=attrs)
     return boxes, scores
 
 
@@ -1303,28 +1304,27 @@ def __create_var(type):
     true_pos = input_states[1] if input_states is not None else None
     false_pos = input_states[2] if input_states is not None else None
 
-    helper.append_op(
-        type="detection_map",
-        inputs={
-            'Label': label,
-            'DetectRes': detect_res,
-            'HasState': has_state,
-            'PosCount': pos_count,
-            'TruePos': true_pos,
-            'FalsePos': false_pos
-        },
-        outputs={
-            'MAP': map_out,
-            'AccumPosCount': accum_pos_count_out,
-            'AccumTruePos': accum_true_pos_out,
-            'AccumFalsePos': accum_false_pos_out
-        },
-        attrs={
-            'overlap_threshold': overlap_threshold,
-            'evaluate_difficult': evaluate_difficult,
-            'ap_type': ap_version,
-            'class_num': class_num,
-        })
+    helper.append_op(type="detection_map",
+                     inputs={
+                         'Label': label,
+                         'DetectRes': detect_res,
+                         'HasState': has_state,
+                         'PosCount': pos_count,
+                         'TruePos': true_pos,
+                         'FalsePos': false_pos
+                     },
+                     outputs={
+                         'MAP': map_out,
+                         'AccumPosCount': accum_pos_count_out,
+                         'AccumTruePos': accum_true_pos_out,
+                         'AccumFalsePos': accum_false_pos_out
+                     },
+                     attrs={
+                         'overlap_threshold': overlap_threshold,
+                         'evaluate_difficult': evaluate_difficult,
+                         'ap_type': ap_version,
+                         'class_num': class_num,
+                     })
     return map_out
 
 
@@ -1404,17 +1404,16 @@ def bipartite_match(dist_matrix,
     match_indices = helper.create_variable_for_type_inference(dtype='int32')
     match_distance = helper.create_variable_for_type_inference(
         dtype=dist_matrix.dtype)
-    helper.append_op(
-        type='bipartite_match',
-        inputs={'DistMat': dist_matrix},
-        attrs={
-            'match_type': match_type,
-            'dist_threshold': dist_threshold,
-        },
-        outputs={
-            'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDist': match_distance
-        })
+    helper.append_op(type='bipartite_match',
+                     inputs={'DistMat': dist_matrix},
+                     attrs={
+                         'match_type': match_type,
+                         'dist_threshold': dist_threshold,
+                     },
+                     outputs={
+                         'ColToRowMatchIndices': match_indices,
+                         'ColToRowMatchDist': match_distance
+                     })
     return match_indices, match_distance
 
 
@@ -1511,16 +1510,17 @@ def target_assign(input,
     helper = LayerHelper('target_assign', **locals())
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     out_weight = helper.create_variable_for_type_inference(dtype='float32')
-    helper.append_op(
-        type='target_assign',
-        inputs={
-            'X': input,
-            'MatchIndices': matched_indices,
-            'NegIndices': negative_indices
-        },
-        outputs={'Out': out,
-                 'OutWeight': out_weight},
-        attrs={'mismatch_value': mismatch_value})
+    helper.append_op(type='target_assign',
+                     inputs={
+                         'X': input,
+                         'MatchIndices': matched_indices,
+                         'NegIndices': negative_indices
+                     },
+                     outputs={
+                         'Out': out,
+                         'OutWeight': out_weight
+                     },
+                     attrs={'mismatch_value': mismatch_value})
     return out, out_weight
 
 
@@ -1676,11 +1676,12 @@ def __reshape_to_2d(var):
 
     # 2. Compute confidence for mining hard examples
     # 2.1. Get the target label based on matched indices
-    gt_label = nn.reshape(
-        x=gt_label, shape=(len(gt_label.shape) - 1) * (0, ) + (-1, 1))
+    gt_label = nn.reshape(x=gt_label,
+                          shape=(len(gt_label.shape) - 1) * (0, ) + (-1, 1))
     gt_label.stop_gradient = True
-    target_label, _ = target_assign(
-        gt_label, matched_indices, mismatch_value=background_label)
+    target_label, _ = target_assign(gt_label,
+                                    matched_indices,
+                                    mismatch_value=background_label)
     # 2.2. Compute confidence loss.
     # Reshape confidence to 2D tensor.
     confidence = __reshape_to_2d(confidence)
@@ -1693,39 +1694,38 @@ def __reshape_to_2d(var):
     actual_shape.stop_gradient = True
     # shape=(-1, 0) is set for compile-time, the correct shape is set by
     # actual_shape in runtime.
-    conf_loss = nn.reshape(
-        x=conf_loss, shape=(-1, 0), actual_shape=actual_shape)
+    conf_loss = nn.reshape(x=conf_loss,
+                           shape=(-1, 0),
+                           actual_shape=actual_shape)
     conf_loss.stop_gradient = True
     neg_indices = helper.create_variable_for_type_inference(dtype='int32')
     dtype = matched_indices.dtype
     updated_matched_indices = helper.create_variable_for_type_inference(
         dtype=dtype)
-    helper.append_op(
-        type='mine_hard_examples',
-        inputs={
-            'ClsLoss': conf_loss,
-            'LocLoss': None,
-            'MatchIndices': matched_indices,
-            'MatchDist': matched_dist,
-        },
-        outputs={
-            'NegIndices': neg_indices,
-            'UpdatedMatchIndices': updated_matched_indices
-        },
-        attrs={
-            'neg_pos_ratio': neg_pos_ratio,
-            'neg_dist_threshold': neg_overlap,
-            'mining_type': mining_type,
-            'sample_size': sample_size,
-        })
+    helper.append_op(type='mine_hard_examples',
+                     inputs={
+                         'ClsLoss': conf_loss,
+                         'LocLoss': None,
+                         'MatchIndices': matched_indices,
+                         'MatchDist': matched_dist,
+                     },
+                     outputs={
+                         'NegIndices': neg_indices,
+                         'UpdatedMatchIndices': updated_matched_indices
+                     },
+                     attrs={
+                         'neg_pos_ratio': neg_pos_ratio,
+                         'neg_dist_threshold': neg_overlap,
+                         'mining_type': mining_type,
+                         'sample_size': sample_size,
+                     })
 
     # 4. Assign classification and regression targets
     # 4.1. Encoded bbox according to the prior boxes.
-    encoded_bbox = box_coder(
-        prior_box=prior_box,
-        prior_box_var=prior_box_var,
-        target_box=gt_box,
-        code_type='encode_center_size')
+    encoded_bbox = box_coder(prior_box=prior_box,
+                             prior_box_var=prior_box_var,
+                             target_box=gt_box,
+                             code_type='encode_center_size')
     # 4.2. Assign regression targets
     target_bbox, target_loc_weight = target_assign(
         encoded_bbox, updated_matched_indices, mismatch_value=background_label)
@@ -1888,8 +1888,9 @@ def prior_box(input,
     """
     helper = LayerHelper("prior_box", **locals())
     dtype = helper.input_dtype()
-    check_variable_and_dtype(
-        input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box')
+    check_variable_and_dtype(input, 'input',
+                             ['uint8', 'int8', 'float32', 'float64'],
+                             'prior_box')
 
     def _is_list_or_tuple_(data):
         return (isinstance(data, list) or isinstance(data, tuple))
@@ -1926,11 +1927,16 @@ def _is_list_or_tuple_(data):
     var = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="prior_box",
-        inputs={"Input": input,
-                "Image": image},
-        outputs={"Boxes": box,
-                 "Variances": var},
-        attrs=attrs, )
+        inputs={
+            "Input": input,
+            "Image": image
+        },
+        outputs={
+            "Boxes": box,
+            "Variances": var
+        },
+        attrs=attrs,
+    )
     box.stop_gradient = True
     var.stop_gradient = True
     return box, var
@@ -2106,11 +2112,16 @@ def _is_list_or_tuple_(data):
     var = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="density_prior_box",
-        inputs={"Input": input,
-                "Image": image},
-        outputs={"Boxes": box,
-                 "Variances": var},
-        attrs=attrs, )
+        inputs={
+            "Input": input,
+            "Image": image
+        },
+        outputs={
+            "Boxes": box,
+            "Variances": var
+        },
+        attrs=attrs,
+    )
     box.stop_gradient = True
     var.stop_gradient = True
     return box, var
@@ -2362,12 +2373,11 @@ def _is_list_or_tuple_and_equal(data, length, err_info):
 
         # get loc
         num_loc_output = num_boxes * 4
-        mbox_loc = nn.conv2d(
-            input=input,
-            num_filters=num_loc_output,
-            filter_size=kernel_size,
-            padding=pad,
-            stride=stride)
+        mbox_loc = nn.conv2d(input=input,
+                             num_filters=num_loc_output,
+                             filter_size=kernel_size,
+                             padding=pad,
+                             stride=stride)
 
         mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
         mbox_loc_flatten = nn.flatten(mbox_loc, axis=1)
@@ -2375,12 +2385,11 @@ def _is_list_or_tuple_and_equal(data, length, err_info):
 
         # get conf
         num_conf_output = num_boxes * num_classes
-        conf_loc = nn.conv2d(
-            input=input,
-            num_filters=num_conf_output,
-            filter_size=kernel_size,
-            padding=pad,
-            stride=stride)
+        conf_loc = nn.conv2d(input=input,
+                             num_filters=num_conf_output,
+                             filter_size=kernel_size,
+                             padding=pad,
+                             stride=stride)
         conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
         conf_loc_flatten = nn.flatten(conf_loc, axis=1)
         mbox_confs.append(conf_loc_flatten)
@@ -2402,8 +2411,8 @@ def _is_list_or_tuple_and_equal(data, length, err_info):
         mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
         mbox_locs_concat = nn.reshape(mbox_locs_concat, shape=[0, -1, 4])
         mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
-        mbox_confs_concat = nn.reshape(
-            mbox_confs_concat, shape=[0, -1, num_classes])
+        mbox_confs_concat = nn.reshape(mbox_confs_concat,
+                                       shape=[0, -1, num_classes])
 
     box.stop_gradient = True
     var.stop_gradient = True
@@ -2507,9 +2516,12 @@ def _is_list_or_tuple_(data):
     helper.append_op(
         type="anchor_generator",
         inputs={"Input": input},
-        outputs={"Anchors": anchor,
-                 "Variances": var},
-        attrs=attrs, )
+        outputs={
+            "Anchors": anchor,
+            "Variances": var
+        },
+        attrs=attrs,
+    )
     anchor.stop_gradient = True
     var.stop_gradient = True
     return anchor, var
@@ -2588,22 +2600,23 @@ def roi_perspective_transform(input,
     transform_matrix = helper.create_variable_for_type_inference(dtype)
     out2in_idx = helper.create_variable_for_type_inference(dtype="int32")
     out2in_w = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="roi_perspective_transform",
-        inputs={"X": input,
-                "ROIs": rois},
-        outputs={
-            "Out": out,
-            "Out2InIdx": out2in_idx,
-            "Out2InWeights": out2in_w,
-            "Mask": mask,
-            "TransformMatrix": transform_matrix
-        },
-        attrs={
-            "transformed_height": transformed_height,
-            "transformed_width": transformed_width,
-            "spatial_scale": spatial_scale
-        })
+    helper.append_op(type="roi_perspective_transform",
+                     inputs={
+                         "X": input,
+                         "ROIs": rois
+                     },
+                     outputs={
+                         "Out": out,
+                         "Out2InIdx": out2in_idx,
+                         "Out2InWeights": out2in_w,
+                         "Mask": mask,
+                         "TransformMatrix": transform_matrix
+                     },
+                     attrs={
+                         "transformed_height": transformed_height,
+                         "transformed_width": transformed_width,
+                         "spatial_scale": spatial_scale
+                     })
     return out, mask, transform_matrix
 
 
@@ -2723,29 +2736,28 @@ def generate_proposal_labels(rpn_rois,
     }
     if max_overlap is not None:
         inputs['MaxOverlap'] = max_overlap
-    helper.append_op(
-        type="generate_proposal_labels",
-        inputs=inputs,
-        outputs={
-            'Rois': rois,
-            'LabelsInt32': labels_int32,
-            'BboxTargets': bbox_targets,
-            'BboxInsideWeights': bbox_inside_weights,
-            'BboxOutsideWeights': bbox_outside_weights,
-            'MaxOverlapWithGT': max_overlap_with_gt
-        },
-        attrs={
-            'batch_size_per_im': batch_size_per_im,
-            'fg_fraction': fg_fraction,
-            'fg_thresh': fg_thresh,
-            'bg_thresh_hi': bg_thresh_hi,
-            'bg_thresh_lo': bg_thresh_lo,
-            'bbox_reg_weights': bbox_reg_weights,
-            'class_nums': class_nums,
-            'use_random': use_random,
-            'is_cls_agnostic': is_cls_agnostic,
-            'is_cascade_rcnn': is_cascade_rcnn
-        })
+    helper.append_op(type="generate_proposal_labels",
+                     inputs=inputs,
+                     outputs={
+                         'Rois': rois,
+                         'LabelsInt32': labels_int32,
+                         'BboxTargets': bbox_targets,
+                         'BboxInsideWeights': bbox_inside_weights,
+                         'BboxOutsideWeights': bbox_outside_weights,
+                         'MaxOverlapWithGT': max_overlap_with_gt
+                     },
+                     attrs={
+                         'batch_size_per_im': batch_size_per_im,
+                         'fg_fraction': fg_fraction,
+                         'fg_thresh': fg_thresh,
+                         'bg_thresh_hi': bg_thresh_hi,
+                         'bg_thresh_lo': bg_thresh_lo,
+                         'bbox_reg_weights': bbox_reg_weights,
+                         'class_nums': class_nums,
+                         'use_random': use_random,
+                         'is_cls_agnostic': is_cls_agnostic,
+                         'is_cascade_rcnn': is_cascade_rcnn
+                     })
 
     rois.stop_gradient = True
     labels_int32.stop_gradient = True
@@ -2880,23 +2892,24 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
     mask_int32 = helper.create_variable_for_type_inference(
         dtype=gt_classes.dtype)
 
-    helper.append_op(
-        type="generate_mask_labels",
-        inputs={
-            'ImInfo': im_info,
-            'GtClasses': gt_classes,
-            'IsCrowd': is_crowd,
-            'GtSegms': gt_segms,
-            'Rois': rois,
-            'LabelsInt32': labels_int32
-        },
-        outputs={
-            'MaskRois': mask_rois,
-            'RoiHasMaskInt32': roi_has_mask_int32,
-            'MaskInt32': mask_int32
-        },
-        attrs={'num_classes': num_classes,
-               'resolution': resolution})
+    helper.append_op(type="generate_mask_labels",
+                     inputs={
+                         'ImInfo': im_info,
+                         'GtClasses': gt_classes,
+                         'IsCrowd': is_crowd,
+                         'GtSegms': gt_segms,
+                         'Rois': rois,
+                         'LabelsInt32': labels_int32
+                     },
+                     outputs={
+                         'MaskRois': mask_rois,
+                         'RoiHasMaskInt32': roi_has_mask_int32,
+                         'MaskInt32': mask_int32
+                     },
+                     attrs={
+                         'num_classes': num_classes,
+                         'resolution': resolution
+                     })
 
     mask_rois.stop_gradient = True
     roi_has_mask_int32.stop_gradient = True
@@ -3028,23 +3041,22 @@ def generate_proposals(scores,
         rpn_rois_num.stop_gradient = True
         outputs['RpnRoisNum'] = rpn_rois_num
 
-    helper.append_op(
-        type="generate_proposals",
-        inputs={
-            'Scores': scores,
-            'BboxDeltas': bbox_deltas,
-            'ImInfo': im_info,
-            'Anchors': anchors,
-            'Variances': variances
-        },
-        attrs={
-            'pre_nms_topN': pre_nms_top_n,
-            'post_nms_topN': post_nms_top_n,
-            'nms_thresh': nms_thresh,
-            'min_size': min_size,
-            'eta': eta
-        },
-        outputs=outputs)
+    helper.append_op(type="generate_proposals",
+                     inputs={
+                         'Scores': scores,
+                         'BboxDeltas': bbox_deltas,
+                         'ImInfo': im_info,
+                         'Anchors': anchors,
+                         'Variances': variances
+                     },
+                     attrs={
+                         'pre_nms_topN': pre_nms_top_n,
+                         'post_nms_topN': post_nms_top_n,
+                         'nms_thresh': nms_thresh,
+                         'min_size': min_size,
+                         'eta': eta
+                     },
+                     outputs=outputs)
     rpn_rois.stop_gradient = True
     rpn_roi_probs.stop_gradient = True
 
@@ -3253,22 +3265,21 @@ def retinanet_detection_output(bboxes,
     helper = LayerHelper('retinanet_detection_output', **locals())
     output = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype('scores'))
-    helper.append_op(
-        type="retinanet_detection_output",
-        inputs={
-            'BBoxes': bboxes,
-            'Scores': scores,
-            'Anchors': anchors,
-            'ImInfo': im_info
-        },
-        attrs={
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'keep_top_k': keep_top_k,
-            'nms_eta': 1.,
-        },
-        outputs={'Out': output})
+    helper.append_op(type="retinanet_detection_output",
+                     inputs={
+                         'BBoxes': bboxes,
+                         'Scores': scores,
+                         'Anchors': anchors,
+                         'ImInfo': im_info
+                     },
+                     attrs={
+                         'score_threshold': score_threshold,
+                         'nms_top_k': nms_top_k,
+                         'nms_threshold': nms_threshold,
+                         'keep_top_k': keep_top_k,
+                         'nms_eta': 1.,
+                     },
+                     outputs={'Out': output})
     output.stop_gradient = True
     return output
 
@@ -3408,20 +3419,21 @@ class number. The data type is float32 or float64.
 
     helper = LayerHelper('multiclass_nms', **locals())
     output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
-    helper.append_op(
-        type="multiclass_nms",
-        inputs={'BBoxes': bboxes,
-                'Scores': scores},
-        attrs={
-            'background_label': background_label,
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'nms_eta': nms_eta,
-            'keep_top_k': keep_top_k,
-            'normalized': normalized
-        },
-        outputs={'Out': output})
+    helper.append_op(type="multiclass_nms",
+                     inputs={
+                         'BBoxes': bboxes,
+                         'Scores': scores
+                     },
+                     attrs={
+                         'background_label': background_label,
+                         'score_threshold': score_threshold,
+                         'nms_top_k': nms_top_k,
+                         'nms_threshold': nms_threshold,
+                         'nms_eta': nms_eta,
+                         'keep_top_k': keep_top_k,
+                         'normalized': normalized
+                     },
+                     outputs={'Out': output})
     output.stop_gradient = True
 
     return output
@@ -3537,21 +3549,22 @@ def locality_aware_nms(bboxes,
     output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
     out = {'Out': output}
 
-    helper.append_op(
-        type="locality_aware_nms",
-        inputs={'BBoxes': bboxes,
-                'Scores': scores},
-        attrs={
-            'background_label': background_label,
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'nms_eta': nms_eta,
-            'keep_top_k': keep_top_k,
-            'nms_eta': nms_eta,
-            'normalized': normalized
-        },
-        outputs={'Out': output})
+    helper.append_op(type="locality_aware_nms",
+                     inputs={
+                         'BBoxes': bboxes,
+                         'Scores': scores
+                     },
+                     attrs={
+                         'background_label': background_label,
+                         'score_threshold': score_threshold,
+                         'nms_top_k': nms_top_k,
+                         'nms_threshold': nms_threshold,
+                         'nms_eta': nms_eta,
+                         'keep_top_k': keep_top_k,
+                         'nms_eta': nms_eta,
+                         'normalized': normalized
+                     },
+                     outputs={'Out': output})
     output.stop_gradient = True
 
     return output
@@ -3660,22 +3673,25 @@ def matrix_nms(bboxes,
     helper = LayerHelper('matrix_nms', **locals())
     output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
     index = helper.create_variable_for_type_inference(dtype='int')
-    helper.append_op(
-        type="matrix_nms",
-        inputs={'BBoxes': bboxes,
-                'Scores': scores},
-        attrs={
-            'background_label': background_label,
-            'score_threshold': score_threshold,
-            'post_threshold': post_threshold,
-            'nms_top_k': nms_top_k,
-            'gaussian_sigma': gaussian_sigma,
-            'use_gaussian': use_gaussian,
-            'keep_top_k': keep_top_k,
-            'normalized': normalized
-        },
-        outputs={'Out': output,
-                 'Index': index})
+    helper.append_op(type="matrix_nms",
+                     inputs={
+                         'BBoxes': bboxes,
+                         'Scores': scores
+                     },
+                     attrs={
+                         'background_label': background_label,
+                         'score_threshold': score_threshold,
+                         'post_threshold': post_threshold,
+                         'nms_top_k': nms_top_k,
+                         'gaussian_sigma': gaussian_sigma,
+                         'use_gaussian': use_gaussian,
+                         'keep_top_k': keep_top_k,
+                         'normalized': normalized
+                     },
+                     outputs={
+                         'Out': output,
+                         'Index': index
+                     })
     output.stop_gradient = True
 
     if return_index:
@@ -3792,16 +3808,15 @@ def distribute_fpn_proposals(fpn_rois,
         ]
         outputs['MultiLevelRoIsNum'] = rois_num_per_level
 
-    helper.append_op(
-        type='distribute_fpn_proposals',
-        inputs=inputs,
-        outputs=outputs,
-        attrs={
-            'min_level': min_level,
-            'max_level': max_level,
-            'refer_level': refer_level,
-            'refer_scale': refer_scale
-        })
+    helper.append_op(type='distribute_fpn_proposals',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs={
+                         'min_level': min_level,
+                         'max_level': max_level,
+                         'refer_level': refer_level,
+                         'refer_scale': refer_scale
+                     })
     if rois_num is not None:
         return multi_rois, restore_ind, rois_num_per_level
     return multi_rois, restore_ind
@@ -3866,19 +3881,18 @@ def box_decoder_and_assign(prior_box,
     output_assign_box = helper.create_variable_for_type_inference(
         dtype=prior_box.dtype)
 
-    helper.append_op(
-        type="box_decoder_and_assign",
-        inputs={
-            "PriorBox": prior_box,
-            "PriorBoxVar": prior_box_var,
-            "TargetBox": target_box,
-            "BoxScore": box_score
-        },
-        attrs={"box_clip": box_clip},
-        outputs={
-            "DecodeBox": decoded_box,
-            "OutputAssignBox": output_assign_box
-        })
+    helper.append_op(type="box_decoder_and_assign",
+                     inputs={
+                         "PriorBox": prior_box,
+                         "PriorBoxVar": prior_box_var,
+                         "TargetBox": target_box,
+                         "BoxScore": box_score
+                     },
+                     attrs={"box_clip": box_clip},
+                     outputs={
+                         "DecodeBox": decoded_box,
+                         "OutputAssignBox": output_assign_box
+                     })
     return decoded_box, output_assign_box
 
 
@@ -3982,11 +3996,10 @@ def collect_fpn_proposals(multi_rois,
         rois_num = helper.create_variable_for_type_inference(dtype='int32')
         rois_num.stop_gradient = True
         outputs['RoisNum'] = rois_num
-    helper.append_op(
-        type='collect_fpn_proposals',
-        inputs=inputs,
-        outputs=outputs,
-        attrs={'post_nms_topN': post_nms_top_n})
+    helper.append_op(type='collect_fpn_proposals',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs={'post_nms_topN': post_nms_top_n})
     if rois_num_per_level is not None:
         return output_rois, rois_num
     return output_rois
diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py
index 42ccdbb8d267a..a4b967a8509e4 100644
--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -37,7 +37,8 @@ def get_places(device_count=None, device_type=None):
     if device_type is not None:
         attrs['device_type'] = str(device_type)
 
-    helper.append_op(
-        type='get_places', outputs={"Out": [out_places]}, attrs=attrs)
+    helper.append_op(type='get_places',
+                     outputs={"Out": [out_places]},
+                     attrs=attrs)
 
     return out_places
diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py
index 4e4c8dfd2a010..757ba0dc8855f 100644
--- a/python/paddle/fluid/layers/distributions.py
+++ b/python/paddle/fluid/layers/distributions.py
@@ -214,15 +214,14 @@ def sample(self, shape, seed=0):
                 self.low + self.high, batch_shape + shape, self.low.dtype, 0.)
             uniform_random_tmp = nn.uniform_random_batch_size_like(
                 zero_tmp, zero_tmp.shape, min=0., max=1., seed=seed)
-            output = uniform_random_tmp * (zero_tmp + self.high - self.low
-                                           ) + self.low
+            output = uniform_random_tmp * (zero_tmp + self.high -
+                                           self.low) + self.low
             return nn.reshape(output, output_shape)
         else:
             output_shape = shape + batch_shape
-            output = nn.uniform_random(
-                output_shape, seed=seed) * (tensor.zeros(
-                    output_shape, dtype=self.low.dtype) +
-                                            (self.high - self.low)) + self.low
+            output = nn.uniform_random(output_shape, seed=seed) * (
+                tensor.zeros(output_shape, dtype=self.low.dtype) +
+                (self.high - self.low)) + self.low
             if self.all_arg_is_float:
                 return nn.reshape(output, shape)
             else:
@@ -358,8 +357,10 @@ def sample(self, shape, seed=0):
             zero_tmp = tensor.fill_constant_batch_size_like(
                 self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.)
             zero_tmp_shape = nn.shape(zero_tmp)
-            normal_random_tmp = nn.gaussian_random(
-                zero_tmp_shape, mean=0., std=1., seed=seed)
+            normal_random_tmp = nn.gaussian_random(zero_tmp_shape,
+                                                   mean=0.,
+                                                   std=1.,
+                                                   seed=seed)
             output = normal_random_tmp * (zero_tmp + self.scale) + self.loc
             return nn.reshape(output, output_shape)
         else:
@@ -379,8 +380,9 @@ def entropy(self):
 
         """
         batch_shape = list((self.loc + self.scale).shape)
-        zero_tmp = tensor.fill_constant_batch_size_like(
-            self.loc + self.scale, batch_shape, self.loc.dtype, 0.)
+        zero_tmp = tensor.fill_constant_batch_size_like(self.loc + self.scale,
+                                                        batch_shape,
+                                                        self.loc.dtype, 0.)
         return 0.5 + 0.5 * math.log(2 * math.pi) + nn.log(
             (self.scale + zero_tmp))
 
@@ -399,8 +401,9 @@ def log_prob(self, value):
 
         var = self.scale * self.scale
         log_scale = nn.log(self.scale)
-        return -1. * ((value - self.loc) * (value - self.loc)) / (
-            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
+        return -1. * ((value - self.loc) *
+                      (value - self.loc)) / (2. * var) - log_scale - math.log(
+                          math.sqrt(2. * math.pi))
 
     def kl_divergence(self, other):
         """The KL-divergence between two normal distributions.
@@ -613,8 +616,7 @@ def _det(self, value):
         batch_shape = list(value.shape)
         one_all = tensor.ones(shape=batch_shape, dtype=self.loc.dtype)
         one_diag = tensor.diag(
-            tensor.ones(
-                shape=[batch_shape[0]], dtype=self.loc.dtype))
+            tensor.ones(shape=[batch_shape[0]], dtype=self.loc.dtype))
         det_diag = nn.reduce_prod(value + one_all - one_diag)
 
         return det_diag
@@ -624,8 +626,7 @@ def _inv(self, value):
         batch_shape = list(value.shape)
         one_all = tensor.ones(shape=batch_shape, dtype=self.loc.dtype)
         one_diag = tensor.diag(
-            tensor.ones(
-                shape=[batch_shape[0]], dtype=self.loc.dtype))
+            tensor.ones(shape=[batch_shape[0]], dtype=self.loc.dtype))
         inv_diag = nn.elementwise_pow(value, (one_all - 2 * one_diag))
 
         return inv_diag
@@ -637,9 +638,8 @@ def entropy(self):
           Variable: Shannon entropy of Multivariate Normal distribution. The data type is float32.
 
         """
-        entropy = 0.5 * (
-            self.scale.shape[0] *
-            (1.0 + math.log(2 * math.pi)) + nn.log(self._det(self.scale)))
+        entropy = 0.5 * (self.scale.shape[0] * (1.0 + math.log(2 * math.pi)) +
+                         nn.log(self._det(self.scale)))
 
         return entropy
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index c8a5235a586a5..c24a0477ffc0f 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -125,14 +125,13 @@ def data(name,
     if append_batch_size:
         shape = [-1] + shape  # append batch size as -1
 
-    data_var = helper.create_global_variable(
-        name=name,
-        shape=shape,
-        dtype=dtype,
-        type=type,
-        stop_gradient=stop_gradient,
-        lod_level=lod_level,
-        is_data=True)
+    data_var = helper.create_global_variable(name=name,
+                                             shape=shape,
+                                             dtype=dtype,
+                                             type=type,
+                                             stop_gradient=stop_gradient,
+                                             lod_level=lod_level,
+                                             is_data=True)
     return data_var
 
 
@@ -247,9 +246,9 @@ def complete_op(self):
             attrs={
                 'endpoint': self.endpoint,
                 'Fanin': self.fan_in,
-                'optimize_blocks': [
-                    current_block
-                ],  # did not support multiple optimize blocks in layers
+                'optimize_blocks':
+                [current_block
+                 ],  # did not support multiple optimize blocks in layers
                 'distributed_mode':
                 DistributedMode.SYNC,  # did not support async now in layers
                 'grad_to_block_id': [""]
@@ -283,21 +282,22 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True):
     helper = LayerHelper("Send", **locals())
     rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 
-    helper.append_op(
-        type="send",
-        inputs={"X": send_vars},
-        outputs={"Out": dummy_output},
-        attrs={
-            "endpoints": endpoints,
-            "epmap": epmap,
-            rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
-        })
+    helper.append_op(type="send",
+                     inputs={"X": send_vars},
+                     outputs={"Out": dummy_output},
+                     attrs={
+                         "endpoints":
+                         endpoints,
+                         "epmap":
+                         epmap,
+                         rpc_op_role_name:
+                         core.op_proto_and_checker_maker.OpRole.RPC
+                     })
     if sync:
-        helper.append_op(
-            type="send_barrier",
-            inputs={"X": dummy_output},
-            outputs={"Out": []},
-            attrs={"endpoints": endpoints})
+        helper.append_op(type="send_barrier",
+                         inputs={"X": dummy_output},
+                         outputs={"Out": []},
+                         attrs={"endpoints": endpoints})
 
 
 def Recv(endpoints, get_vars, dummy_input=None, sync=True):
@@ -326,21 +326,22 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True):
     endpoints = list(set(epmap))
 
     helper = LayerHelper("Recv", **locals())
-    helper.append_op(
-        type="recv",
-        inputs={"X": dummy_input},
-        outputs={"Out": get_vars},
-        attrs={"endpoints": endpoints,
-               "epmap": epmap})
+    helper.append_op(type="recv",
+                     inputs={"X": dummy_input},
+                     outputs={"Out": get_vars},
+                     attrs={
+                         "endpoints": endpoints,
+                         "epmap": epmap
+                     })
     if sync:
-        helper.append_op(
-            type="fetch_barrier",
-            outputs={"Out": get_vars},
-            attrs={"endpoints": endpoints})
+        helper.append_op(type="fetch_barrier",
+                         outputs={"Out": get_vars},
+                         attrs={"endpoints": endpoints})
     return get_vars
 
 
 def monkey_patch_reader_methods(reader):
+
     def __get_reader__():
         scope = global_scope()
         var = scope.find_var(reader.name)
@@ -381,11 +382,10 @@ def _copy_reader_create_op_(block, op):
         for arg_name in arg_names:
             new_output_map[param_name].append(block.var(arg_name))
 
-    new_op = block.append_op(
-        type=op.type,
-        inputs=new_input_map,
-        outputs=new_output_map,
-        attrs=op.all_attrs())
+    new_op = block.append_op(type=op.type,
+                             inputs=new_input_map,
+                             outputs=new_output_map,
+                             attrs=op.all_attrs())
     return new_op
 
 
@@ -441,17 +441,16 @@ def _py_reader(capacity,
 
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=reader_name)
-    startup_blk.append_op(
-        type='create_py_reader',
-        inputs={'blocking_queue': [queue_name]},
-        outputs={'Out': [startup_var]},
-        attrs={
-            'shape_concat': shape_concat,
-            'lod_levels': lod_levels,
-            'dtypes': dtype_int,
-            'need_check_feed': need_check_feed,
-            'ranks': ranks
-        })
+    startup_blk.append_op(type='create_py_reader',
+                          inputs={'blocking_queue': [queue_name]},
+                          outputs={'Out': [startup_var]},
+                          attrs={
+                              'shape_concat': shape_concat,
+                              'lod_levels': lod_levels,
+                              'dtypes': dtype_int,
+                              'need_check_feed': need_check_feed,
+                              'ranks': ranks
+                          })
 
     startup_var.desc.set_dtypes(dtypes)
     startup_var.persistable = True
@@ -475,6 +474,7 @@ def _py_reader(capacity,
     reader.exited = False
 
     def start_provide_thread(func):
+
         def __provider_thread__(legacy_expected_place):
             try:
                 # See _DataLoaderIterSingleProcess._thread_loop() for why set expected place here.
@@ -501,8 +501,8 @@ def __provider_thread__(legacy_expected_place):
                 logging.warn('Your decorated reader has raised an exception!')
                 six.reraise(*sys.exc_info())
 
-        reader.thread = threading.Thread(
-            target=__provider_thread__, args=(_current_expected_place(), ))
+        reader.thread = threading.Thread(target=__provider_thread__,
+                                         args=(_current_expected_place(), ))
         reader.thread.daemon = True
         reader.thread.start()
 
@@ -518,18 +518,17 @@ def __set_paddle_reader__(paddle_reader):
                 for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
                     name = str(counter)
                     actual_feed_list.append(
-                        data(
-                            name=name,
-                            dtype=dtype,
-                            shape=shape,
-                            lod_level=lod_level))
+                        data(name=name,
+                             dtype=dtype,
+                             shape=shape,
+                             lod_level=lod_level))
                     counter += 1
 
             data_names = [feed_data.name for feed_data in actual_feed_list]
-            feeder = DataFeeder(
-                feed_list=actual_feed_list, place=core.CPUPlace())
-            paddle_reader = feeder.decorate_reader(
-                paddle_reader, multi_devices=False)
+            feeder = DataFeeder(feed_list=actual_feed_list,
+                                place=core.CPUPlace())
+            paddle_reader = feeder.decorate_reader(paddle_reader,
+                                                   multi_devices=False)
 
         def __tensor_provider__():
             for slots in paddle_reader():
@@ -720,13 +719,12 @@ def network(reader):
     logging.warn(
         'paddle.fluid.layers.py_reader() may be deprecated in the near future. '
         'Please use paddle.fluid.io.DataLoader.from_generator() instead.')
-    return _py_reader(
-        capacity=capacity,
-        shapes=shapes,
-        dtypes=dtypes,
-        lod_levels=lod_levels,
-        name=name,
-        use_double_buffer=use_double_buffer)
+    return _py_reader(capacity=capacity,
+                      shapes=shapes,
+                      dtypes=dtypes,
+                      lod_levels=lod_levels,
+                      name=name,
+                      use_double_buffer=use_double_buffer)
 
 
 def create_py_reader_by_data(capacity,
@@ -802,25 +800,23 @@ def network(img, label):
     logging.warn(
         'paddle.fluid.layers.create_py_reader_by_data() may be deprecated in the near future. '
         'Please use paddle.fluid.io.DataLoader.from_generator() instead.')
-    return _py_reader(
-        capacity=capacity,
-        shapes=None,
-        dtypes=None,
-        lod_levels=None,
-        name=name,
-        use_double_buffer=use_double_buffer,
-        feed_list=feed_list)
+    return _py_reader(capacity=capacity,
+                      shapes=None,
+                      dtypes=None,
+                      lod_levels=None,
+                      name=name,
+                      use_double_buffer=use_double_buffer,
+                      feed_list=feed_list)
 
 
 def __create_shared_decorated_reader__(op_type, reader, attrs):
     var_name = unique_name(op_type)
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=var_name)
-    startop_op = startup_blk.append_op(
-        type=op_type,
-        inputs={'UnderlyingReader': reader},
-        outputs={'Out': [startup_var]},
-        attrs=attrs)
+    startop_op = startup_blk.append_op(type=op_type,
+                                       inputs={'UnderlyingReader': reader},
+                                       outputs={'Out': [startup_var]},
+                                       attrs=attrs)
     startup_var.persistable = True
     main_prog_block = default_main_program().current_block()
     main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
@@ -832,11 +828,10 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
     new_reader_name = name if name is not None else unique_name(op_type)
     main_blk = default_main_program().current_block()
     new_reader = main_blk.create_var(name=new_reader_name)
-    main_blk.append_op(
-        type=op_type,
-        inputs={'UnderlyingReader': reader},
-        outputs={'Out': [new_reader]},
-        attrs=attrs)
+    main_blk.append_op(type=op_type,
+                       inputs={'UnderlyingReader': reader},
+                       outputs={'Out': [new_reader]},
+                       attrs=attrs)
     return monkey_patch_reader_methods(new_reader)
 
 
@@ -869,8 +864,10 @@ def double_buffer(reader, place=None, name=None):
     if place is not None:
         attrs['place'] = str(_get_paddle_place(place)).upper()
 
-    return __create_unshared_decorated_reader__(
-        'create_double_buffer_reader', reader, attrs, name=name)
+    return __create_unshared_decorated_reader__('create_double_buffer_reader',
+                                                reader,
+                                                attrs,
+                                                name=name)
 
 
 def read_file(reader):
@@ -901,12 +898,13 @@ def read_file(reader):
     """
     helper = LayerHelper('read_file')
     out = [
-        helper.create_variable_for_type_inference(
-            stop_gradient=True, dtype='float32')
+        helper.create_variable_for_type_inference(stop_gradient=True,
+                                                  dtype='float32')
         for _ in range(len(reader.desc.shapes()))
     ]
-    helper.append_op(
-        type='read', inputs={'Reader': [reader]}, outputs={'Out': out})
+    helper.append_op(type='read',
+                     inputs={'Reader': [reader]},
+                     outputs={'Out': out})
     if len(out) == 1:
         return out[0]
     else:
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index ec99f7c64f36f..4fe9cbb087412 100755
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -187,8 +187,8 @@ def infer_and_check_dtype(op_proto, *args, **kwargs):
 
             for each in val:
                 if not isinstance(each, Variable):
-                    raise ValueError("input of {0} must be variable".format(
-                        op_type))
+                    raise ValueError(
+                        "input of {0} must be variable".format(op_type))
 
                 if dtype is None:
                     dtype = each.dtype
@@ -227,8 +227,8 @@ def func(*args, **kwargs):
         outputs = dict()
         out = kwargs.pop(_convert_(o_name), [])
         if out:
-            out_var = out[0] if (isinstance(out, list) or
-                                 isinstance(out, tuple)) else out
+            out_var = out[0] if (isinstance(out, list)
+                                 or isinstance(out, tuple)) else out
         else:
             out_var = helper.create_variable_for_type_inference(dtype=dtype)
         outputs[o_name] = [out_var]
@@ -236,8 +236,10 @@ def func(*args, **kwargs):
             outputs[name] = [
                 helper.create_variable_for_type_inference(dtype=dtype)
             ]
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=kwargs)
         return helper.append_activation(out_var)
 
     func.__name__ = op_type
@@ -309,8 +311,8 @@ def func(x, name=None):
             op = getattr(_C_ops, inplace_op_type)
             return op(x)
         warnings.warn(
-            "In static mode, {}() is the same as {}() and does not perform inplace operation.".
-            format(inplace_op_type, origin_op_type))
+            "In static mode, {}() is the same as {}() and does not perform inplace operation."
+            .format(inplace_op_type, origin_op_type))
         return generate_activation_fn(origin_op_type)(x, name)
 
     func.__name__ = inplace_op_type
@@ -323,9 +325,10 @@ def func(x, name=None):
 
 
 def autodoc(comment=""):
+
     def __impl__(func):
-        func.__doc__ = _generate_doc_string_(OpProtoHolder.instance(
-        ).get_op_proto(func.__name__)) + comment
+        func.__doc__ = _generate_doc_string_(
+            OpProtoHolder.instance().get_op_proto(func.__name__)) + comment
         return func
 
     return __impl__
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 924cc35ea9f62..e1a65633e60e2 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -96,16 +96,17 @@ def noam_decay(d_model, warmup_steps, learning_rate=1.0):
     """
     with default_main_program()._lr_schedule_guard():
         if _non_static_mode():
-            decay = imperate_lr.NoamDecay(
-                d_model, warmup_steps, learning_rate=learning_rate)
+            decay = imperate_lr.NoamDecay(d_model,
+                                          warmup_steps,
+                                          learning_rate=learning_rate)
             return decay
         else:
             global_step = _decay_step_counter(1)
 
             a = global_step**-0.5
             b = (warmup_steps**-1.5) * global_step
-            lr_value = learning_rate * (d_model**-0.5) * nn.elementwise_min(a,
-                                                                            b)
+            lr_value = learning_rate * (d_model**-0.5) * nn.elementwise_min(
+                a, b)
 
             return lr_value
 
@@ -341,20 +342,23 @@ def polynomial_decay(learning_rate,
 
             if cycle:
                 div_res = ops.ceil(global_step / decay_steps)
-                zero_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=0.0)
-                one_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=1.0)
+                zero_var = tensor.fill_constant(shape=[1],
+                                                dtype='float32',
+                                                value=0.0)
+                one_var = tensor.fill_constant(shape=[1],
+                                               dtype='float32',
+                                               value=1.0)
 
                 with control_flow.Switch() as switch:
                     with switch.case(global_step == zero_var):
                         tensor.assign(input=one_var, output=div_res)
                 decay_steps = decay_steps * div_res
             else:
-                decay_steps_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=float(decay_steps))
-                global_step = nn.elementwise_min(
-                    x=global_step, y=decay_steps_var)
+                decay_steps_var = tensor.fill_constant(shape=[1],
+                                                       dtype='float32',
+                                                       value=float(decay_steps))
+                global_step = nn.elementwise_min(x=global_step,
+                                                 y=decay_steps_var)
 
             decayed_lr = (learning_rate - end_learning_rate) * \
                 ((1 - global_step / decay_steps) ** power) + end_learning_rate
@@ -411,32 +415,29 @@ def piecewise_decay(boundaries, values):
         else:
             global_step = _decay_step_counter()
 
-            lr = tensor.create_global_var(
-                shape=[1],
-                value=0.0,
-                dtype='float32',
-                persistable=True,
-                name="learning_rate")
+            lr = tensor.create_global_var(shape=[1],
+                                          value=0.0,
+                                          dtype='float32',
+                                          persistable=True,
+                                          name="learning_rate")
 
             with control_flow.Switch() as switch:
                 for i in range(len(boundaries)):
-                    boundary_val = tensor.fill_constant(
-                        shape=[1],
-                        dtype='float32',
-                        value=float(boundaries[i]),
-                        force_cpu=True)
+                    boundary_val = tensor.fill_constant(shape=[1],
+                                                        dtype='float32',
+                                                        value=float(
+                                                            boundaries[i]),
+                                                        force_cpu=True)
                     with switch.case(global_step < boundary_val):
-                        tensor.fill_constant(
-                            shape=[1],
-                            dtype="float32",
-                            value=float(values[i]),
-                            out=lr)
+                        tensor.fill_constant(shape=[1],
+                                             dtype="float32",
+                                             value=float(values[i]),
+                                             out=lr)
                 with switch.default():
-                    tensor.fill_constant(
-                        shape=[1],
-                        dtype="float32",
-                        value=float(values[len(values) - 1]),
-                        out=lr)
+                    tensor.fill_constant(shape=[1],
+                                         dtype="float32",
+                                         value=float(values[len(values) - 1]),
+                                         out=lr)
 
             return lr
 
@@ -556,12 +557,11 @@ def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
                                             start_lr, end_lr)
             return lr
         else:
-            lr = tensor.create_global_var(
-                shape=[1],
-                value=0.0,
-                dtype=dtype,
-                persistable=True,
-                name="learning_rate_warmup")
+            lr = tensor.create_global_var(shape=[1],
+                                          value=0.0,
+                                          dtype=dtype,
+                                          persistable=True,
+                                          name="learning_rate_warmup")
 
             global_step = _decay_step_counter()
 
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index 99c0a2e70b771..1ad4e3c4298c2 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -114,8 +114,9 @@ def center_loss(input,
     check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'center_loss')
 
     centers_shape = [num_classes, input.shape[1]]
-    centers_param = helper.create_parameter(
-        attr=param_attr, shape=centers_shape, dtype=dtype)
+    centers_param = helper.create_parameter(attr=param_attr,
+                                            shape=centers_shape,
+                                            dtype=dtype)
     centers_param.stop_gradient = True
 
     if isinstance(alpha, Variable):
@@ -135,21 +136,22 @@ def center_loss(input,
 
     centersdiff = helper.create_variable_for_type_inference(dtype=input.dtype)
     loss = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='center_loss',
-        inputs={
-            'X': [input],
-            'Label': [label],
-            'Centers': [centers_param],
-            'CenterUpdateRate': [alpha_param]
-        },
-        outputs={
-            'SampleCenterDiff': [centersdiff],
-            'Loss': [loss],
-            'CentersOut': [centers_param]
-        },
-        attrs={'cluster_num': num_classes,
-               'need_update': update_center})
+    helper.append_op(type='center_loss',
+                     inputs={
+                         'X': [input],
+                         'Label': [label],
+                         'Centers': [centers_param],
+                         'CenterUpdateRate': [alpha_param]
+                     },
+                     outputs={
+                         'SampleCenterDiff': [centersdiff],
+                         'Loss': [loss],
+                         'CentersOut': [centers_param]
+                     },
+                     attrs={
+                         'cluster_num': num_classes,
+                         'need_update': update_center
+                     })
     return loss
 
 
@@ -197,11 +199,12 @@ def bpr_loss(input, label, name=None):
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
                              'bpr_loss')
-    helper.append_op(
-        type='bpr_loss',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out]})
+    helper.append_op(type='bpr_loss',
+                     inputs={
+                         'X': [input],
+                         'Label': [label]
+                     },
+                     outputs={'Y': [out]})
     return out
 
 
@@ -273,8 +276,10 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
                              'cross_entropy')
     helper = LayerHelper('cross_entropy', **locals())
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='cross_entropy', inputs=inputs, outputs={'Y': [out]}, attrs=attrs)
+    helper.append_op(type='cross_entropy',
+                     inputs=inputs,
+                     outputs={'Y': [out]},
+                     attrs=attrs)
     return out
 
 
@@ -292,13 +297,14 @@ def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     xshape = helper.create_variable_for_type_inference(dtype=input.dtype)
     match_x = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='cross_entropy2',
-        inputs=inputs,
-        outputs={'Y': [out],
-                 'MatchX': [match_x],
-                 'XShape': [xshape]},
-        attrs=attrs)
+    helper.append_op(type='cross_entropy2',
+                     inputs=inputs,
+                     outputs={
+                         'Y': [out],
+                         'MatchX': [match_x],
+                         'XShape': [xshape]
+                     },
+                     attrs=attrs)
     return out
 
 
@@ -412,8 +418,9 @@ def edit_distance(input,
             # [4]
 
     """
-    return paddle.nn.functional.loss.edit_distance(
-        input, label, normalized, ignored_tokens, input_length, label_length)
+    return paddle.nn.functional.loss.edit_distance(input, label, normalized,
+                                                   ignored_tokens, input_length,
+                                                   label_length)
 
 
 def warpctc(input,
@@ -552,7 +559,8 @@ def warpctc(input,
             'blank',
             blank,
             'norm_by_times',
-            norm_by_times, )
+            norm_by_times,
+        )
         return loss_out
     helper = LayerHelper('warpctc', **locals())
     check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
@@ -569,15 +577,16 @@ def warpctc(input,
     loss_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     grad_out = helper.create_variable_for_type_inference(dtype=input.dtype)
 
-    helper.append_op(
-        type='warpctc',
-        inputs=this_inputs,
-        outputs={'WarpCTCGrad': [grad_out],
-                 'Loss': [loss_out]},
-        attrs={
-            'blank': blank,
-            'norm_by_times': norm_by_times,
-        })
+    helper.append_op(type='warpctc',
+                     inputs=this_inputs,
+                     outputs={
+                         'WarpCTCGrad': [grad_out],
+                         'Loss': [loss_out]
+                     },
+                     attrs={
+                         'blank': blank,
+                         'norm_by_times': norm_by_times,
+                     })
     return loss_out
 
 
@@ -682,18 +691,16 @@ def nce(input,
 
     dim = input.shape[1]
     num_true_class = label.shape[1]
-    w = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[num_total_classes, dim],
-        is_bias=False,
-        dtype=input.dtype)
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=[num_total_classes, dim],
+                                is_bias=False,
+                                dtype=input.dtype)
     inputs = {}
     if helper.bias_attr:
-        b = helper.create_parameter(
-            attr=helper.bias_attr,
-            shape=[num_total_classes, 1],
-            is_bias=True,
-            dtype=input.dtype)
+        b = helper.create_parameter(attr=helper.bias_attr,
+                                    shape=[num_total_classes, 1],
+                                    is_bias=True,
+                                    dtype=input.dtype)
         inputs['Bias'] = b
     cost = helper.create_variable_for_type_inference(dtype=input.dtype)
     sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype)
@@ -791,15 +798,14 @@ def _init_by_numpy_array(numpy_array):
         'remote_prefetch': remote_prefetch
     }
 
-    helper.append_op(
-        type='nce',
-        inputs=inputs,
-        outputs={
-            'Cost': cost,
-            'SampleLogits': sample_logits,
-            'SampleLabels': sample_labels
-        },
-        attrs=attrs)
+    helper.append_op(type='nce',
+                     inputs=inputs,
+                     outputs={
+                         'Cost': cost,
+                         'SampleLogits': sample_logits,
+                         'SampleLabels': sample_labels
+                     },
+                     attrs=attrs)
     return cost / (num_neg_samples + 1)
 
 
@@ -921,17 +927,15 @@ def hsigmoid(input,
         "With sparse mode, if your models has only small parameter prefetch may cause speed down"
     )
     if not is_custom:
-        weights = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=[num_classes - 1, dim],
-            is_bias=False,
-            dtype=input.dtype)
+        weights = helper.create_parameter(attr=helper.param_attr,
+                                          shape=[num_classes - 1, dim],
+                                          is_bias=False,
+                                          dtype=input.dtype)
     else:
-        weights = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=[num_classes, dim],
-            is_bias=False,
-            dtype=input.dtype)
+        weights = helper.create_parameter(attr=helper.param_attr,
+                                          shape=[num_classes, dim],
+                                          is_bias=False,
+                                          dtype=input.dtype)
     inputs = {
         "X": input,
         "W": weights,
@@ -941,30 +945,29 @@ def hsigmoid(input,
     }
     if helper.bias_attr:
         if not is_custom:
-            bias = helper.create_parameter(
-                attr=helper.bias_attr,
-                shape=[num_classes - 1, 1],
-                is_bias=True,
-                dtype=input.dtype)
+            bias = helper.create_parameter(attr=helper.bias_attr,
+                                           shape=[num_classes - 1, 1],
+                                           is_bias=True,
+                                           dtype=input.dtype)
             inputs['Bias'] = bias
         else:
-            bias = helper.create_parameter(
-                attr=helper.bias_attr,
-                shape=[num_classes, 1],
-                is_bias=True,
-                dtype=input.dtype)
+            bias = helper.create_parameter(attr=helper.bias_attr,
+                                           shape=[num_classes, 1],
+                                           is_bias=True,
+                                           dtype=input.dtype)
             inputs['Bias'] = bias
-    helper.append_op(
-        type="hierarchical_sigmoid",
-        inputs=inputs,
-        outputs={"Out": out,
-                 "PreOut": pre_out,
-                 "W_Out": weights},
-        attrs={
-            "num_classes": num_classes,
-            "is_sparse": is_sparse,
-            "remote_prefetch": remote_prefetch
-        })
+    helper.append_op(type="hierarchical_sigmoid",
+                     inputs=inputs,
+                     outputs={
+                         "Out": out,
+                         "PreOut": pre_out,
+                         "W_Out": weights
+                     },
+                     attrs={
+                         "num_classes": num_classes,
+                         "is_sparse": is_sparse,
+                         "remote_prefetch": remote_prefetch
+                     })
     return out
 
 
@@ -1075,48 +1078,49 @@ def sampled_softmax_with_cross_entropy(logits,
     logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype)
     labels_dim = helper.create_variable_for_type_inference(dtype=label.type)
 
-    helper.append_op(
-        type='sample_logits',
-        inputs={
-            'Logits': logits,
-            'Labels': label,
-            'CustomizedSamples': customized_samples,
-            'CustomizedProbabilities': customized_probabilities
-        },
-        outputs={
-            'Samples': samples,
-            'Probabilities': probabilities,
-            'SampledLabels': sampled_label,
-            'SampledLogits': sampled_logits,
-            'LogitsDim': logits_dim,
-            'LabelsDim': labels_dim
-        },
-        attrs={
-            'use_customized_samples': use_customized_samples,
-            'uniq': True,
-            'remove_accidental_hits': remove_accidental_hits,
-            'num_samples': num_samples,
-            'seed': seed
-        })
+    helper.append_op(type='sample_logits',
+                     inputs={
+                         'Logits': logits,
+                         'Labels': label,
+                         'CustomizedSamples': customized_samples,
+                         'CustomizedProbabilities': customized_probabilities
+                     },
+                     outputs={
+                         'Samples': samples,
+                         'Probabilities': probabilities,
+                         'SampledLabels': sampled_label,
+                         'SampledLogits': sampled_logits,
+                         'LogitsDim': logits_dim,
+                         'LabelsDim': labels_dim
+                     },
+                     attrs={
+                         'use_customized_samples': use_customized_samples,
+                         'uniq': True,
+                         'remove_accidental_hits': remove_accidental_hits,
+                         'num_samples': num_samples,
+                         'seed': seed
+                     })
     loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
     softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
-    helper.append_op(
-        type='one_hot',
-        inputs={'X': sampled_label},
-        attrs={'depth': num_samples + 1},
-        outputs={'Out': sampled_softlabel})
-
-    helper.append_op(
-        type='softmax_with_cross_entropy',
-        inputs={'Logits': sampled_logits,
-                'Label': sampled_softlabel},
-        outputs={'Softmax': softmax,
-                 'Loss': loss},
-        attrs={
-            'soft_label': True,
-            'ignore_index': False,
-            'numeric_stable_mode': False
-        })
+    helper.append_op(type='one_hot',
+                     inputs={'X': sampled_label},
+                     attrs={'depth': num_samples + 1},
+                     outputs={'Out': sampled_softlabel})
+
+    helper.append_op(type='softmax_with_cross_entropy',
+                     inputs={
+                         'Logits': sampled_logits,
+                         'Label': sampled_softlabel
+                     },
+                     outputs={
+                         'Softmax': softmax,
+                         'Loss': loss
+                     },
+                     attrs={
+                         'soft_label': True,
+                         'ignore_index': False,
+                         'numeric_stable_mode': False
+                     })
     return loss / num_true
 
 
@@ -1280,12 +1284,13 @@ def rank_loss(label, left, right, name=None):
 
     out = helper.create_variable_for_type_inference("float32")
 
-    helper.append_op(
-        type='rank_loss',
-        inputs={"Label": label,
-                "Left": left,
-                "Right": right},
-        outputs={'Out': out})
+    helper.append_op(type='rank_loss',
+                     inputs={
+                         "Label": label,
+                         "Left": left,
+                         "Right": right
+                     },
+                     outputs={'Out': out})
     return out
 
 
@@ -1330,14 +1335,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
     check_variable_and_dtype(label, 'right', ['float32'], 'margin_rank_loss')
     out = helper.create_variable_for_type_inference(left.dtype)
     act = helper.create_variable_for_type_inference(left.dtype)
-    helper.append_op(
-        type='margin_rank_loss',
-        inputs={"Label": label,
-                "X1": left,
-                "X2": right},
-        outputs={'Out': out,
-                 'Activated': act},
-        attrs={'margin': margin})
+    helper.append_op(type='margin_rank_loss',
+                     inputs={
+                         "Label": label,
+                         "X1": left,
+                         "X2": right
+                     },
+                     outputs={
+                         'Out': out,
+                         'Activated': act
+                     },
+                     attrs={'margin': margin})
     return out
 
 
@@ -1392,13 +1400,16 @@ def sigmoid_cross_entropy_with_logits(x,
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="sigmoid_cross_entropy_with_logits",
-        inputs={"X": x,
-                "Label": label},
-        attrs={"ignore_index": ignore_index,
-               'normalize': normalize},
-        outputs={"Out": out})
+    helper.append_op(type="sigmoid_cross_entropy_with_logits",
+                     inputs={
+                         "X": x,
+                         "Label": label
+                     },
+                     attrs={
+                         "ignore_index": ignore_index,
+                         'normalize': normalize
+                     },
+                     outputs={"Out": out})
     return out
 
 
@@ -1521,13 +1532,16 @@ def huber_loss(input, label, delta):
     residual = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='huber_loss',
-        inputs={'X': input,
-                'Y': label},
-        outputs={'Out': out,
-                 'Residual': residual},
-        attrs={'delta': delta})
+    helper.append_op(type='huber_loss',
+                     inputs={
+                         'X': input,
+                         'Y': label
+                     },
+                     outputs={
+                         'Out': out,
+                         'Residual': residual
+                     },
+                     attrs={'delta': delta})
     return out
 
 
@@ -1581,12 +1595,13 @@ def kldiv_loss(x, target, reduction='mean', name=None):
                              'kldiv_loss')
     check_type(reduction, 'reduction', str, 'kldiv_loss')
     loss = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='kldiv_loss',
-        inputs={'X': x,
-                'Target': target},
-        outputs={'Loss': loss},
-        attrs={'reduction': reduction})
+    helper.append_op(type='kldiv_loss',
+                     inputs={
+                         'X': x,
+                         'Target': target
+                     },
+                     outputs={'Loss': loss},
+                     attrs={'reduction': reduction})
     return loss
 
 
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 47b42f65e4854..f6810fc063080 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -61,6 +61,7 @@
 
 
 def monkey_patch_variable():
+
     def unique_tmp_name():
         return unique_name.generate("tmp")
 
@@ -81,16 +82,15 @@ def create_new_tmp_var(block, dtype):
     def create_tensor(block, value, dtype, shape):
         value = float(value)
         var = create_new_tmp_var(block, dtype)
-        block.append_op(
-            type="fill_constant",
-            outputs={'Out': [var]},
-            attrs={
-                'dtype': var.dtype,
-                'shape': shape,
-                'value': value,
-                'force_cpu': False
-            },
-            stop_gradient=True)
+        block.append_op(type="fill_constant",
+                        outputs={'Out': [var]},
+                        attrs={
+                            'dtype': var.dtype,
+                            'shape': shape,
+                            'value': value,
+                            'force_cpu': False
+                        },
+                        stop_gradient=True)
         var.stop_gradient = True
         return var
 
@@ -114,17 +114,16 @@ def create_tensor_with_batchsize(ref_var, value, dtype):
             else:
                 out_shape.append(d)
         assert batch_dim != -1
-        block.append_op(
-            type='fill_constant_batch_size_like',
-            outputs={'Out': [var]},
-            inputs={'Input': [ref_var]},
-            attrs={
-                'shape': out_shape,
-                'value': value,
-                'input_dim_idx': batch_dim,
-                'output_dim_idx': batch_dim
-            },
-            stop_gradient=True)
+        block.append_op(type='fill_constant_batch_size_like',
+                        outputs={'Out': [var]},
+                        inputs={'Input': [ref_var]},
+                        attrs={
+                            'shape': out_shape,
+                            'value': value,
+                            'input_dim_idx': batch_dim,
+                            'output_dim_idx': batch_dim
+                        },
+                        stop_gradient=True)
 
         var.stop_gradient = True
         return var
@@ -176,12 +175,13 @@ def astype(self, dtype):
         """
         block = current_block(self)
         out = create_new_tmp_var(block, dtype)
-        block.append_op(
-            type="cast",
-            inputs={"X": [self]},
-            outputs={"Out": [out]},
-            attrs={"in_dtype": self.dtype,
-                   "out_dtype": out.dtype})
+        block.append_op(type="cast",
+                        inputs={"X": [self]},
+                        outputs={"Out": [out]},
+                        attrs={
+                            "in_dtype": self.dtype,
+                            "out_dtype": out.dtype
+                        })
         out.stop_gradient = self.stop_gradient
         return out
 
@@ -198,20 +198,21 @@ def append(self, var):
                     type(var)))
         if self.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             raise TypeError(
-                "Only Variable with VarType.LOD_TENSOR_ARRAY support `append` method, but received type: {}".
-                format(self.type))
+                "Only Variable with VarType.LOD_TENSOR_ARRAY support `append` method, but received type: {}"
+                .format(self.type))
 
         array_write(x=var, i=array_length(self), array=self)
 
     def _scalar_op_(var, scale, bias):
         block = current_block(var)
         out = create_new_tmp_var(block, var.dtype)
-        block.append_op(
-            type="scale",
-            inputs={"X": [var]},
-            outputs={"Out": [out]},
-            attrs={"scale": scale,
-                   "bias": bias})
+        block.append_op(type="scale",
+                        inputs={"X": [var]},
+                        outputs={"Out": [out]},
+                        attrs={
+                            "scale": scale,
+                            "bias": bias
+                        })
         return out
 
     def _neg_(var):
@@ -258,6 +259,7 @@ def _binary_creator_(method_name,
                          op_type,
                          reverse=False,
                          scalar_method=None):
+
         def __impl__(self, other_var):
             # 1. scalar exists cases
             # we need combine the tensor.dtype and scalar.dtype, cast correct object
@@ -300,18 +302,18 @@ def __impl__(self, other_var):
                             has_batch_size = True
                             break
                     if not has_batch_size:
-                        other_var = create_tensor(
-                            current_block(self),
-                            other_var,
-                            dtype=lhs_dtype,
-                            shape=self.shape)
+                        other_var = create_tensor(current_block(self),
+                                                  other_var,
+                                                  dtype=lhs_dtype,
+                                                  shape=self.shape)
                     else:
                         other_var = create_tensor_with_batchsize(
                             self, other_var, lhs_dtype)
                 else:
                     # add fill_op to current_block
-                    other_var = create_scalar(
-                        current_block(self), value=other_var, dtype=lhs_dtype)
+                    other_var = create_scalar(current_block(self),
+                                              value=other_var,
+                                              dtype=lhs_dtype)
 
             # 3. unify right var type to left var
             rhs_dtype = safe_get_dtype(other_var)
@@ -339,12 +341,13 @@ def __impl__(self, other_var):
                     "%s(X, Y, axis=0) instead of %s. This transitional warning will be dropped in the future."
                     % (file_name, line_num, EXPRESSION_MAP[method_name],
                        op_type, op_type, EXPRESSION_MAP[method_name]))
-            current_block(self).append_op(
-                type=op_type,
-                inputs={'X': [self],
-                        'Y': [other_var]},
-                outputs={'Out': out},
-                attrs={'axis': axis})
+            current_block(self).append_op(type=op_type,
+                                          inputs={
+                                              'X': [self],
+                                              'Y': [other_var]
+                                          },
+                                          outputs={'Out': out},
+                                          attrs={'axis': axis})
             return out
 
         comment = OpProtoHolder.instance().get_op_proto(op_type).comment
@@ -369,34 +372,35 @@ def __impl__(self, other_var):
         ('dim', lambda x: len(x.shape)),
         ('ndimension', lambda x: len(x.shape)),
         ('ndim', _ndim_),
-        ('__add__', _binary_creator_('__add__', 'elementwise_add', False,
-                                     _scalar_add_)),
+        ('__add__',
+         _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
         #  a+b == b+a. Do not need to reverse explicitly
         ('__radd__',
          _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)),
-        ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False,
-                                     _scalar_sub_)),
-        ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True,
-                                      _scalar_rsub_)),
-        ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False,
-                                     _scalar_mul_)),
+        ('__sub__',
+         _binary_creator_('__sub__', 'elementwise_sub', False, _scalar_sub_)),
+        ('__rsub__',
+         _binary_creator_('__rsub__', 'elementwise_sub', True, _scalar_rsub_)),
+        ('__mul__',
+         _binary_creator_('__mul__', 'elementwise_mul', False, _scalar_mul_)),
         #  a*b == b*a. Do not need to reverse explicitly
         ('__rmul__',
          _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
-        ('__div__', _binary_creator_('__div__', 'elementwise_div', False,
-                                     _scalar_div_)),
-        ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div',
-                                         False, _scalar_div_)),
+        ('__div__',
+         _binary_creator_('__div__', 'elementwise_div', False, _scalar_div_)),
+        ('__truediv__',
+         _binary_creator_('__truediv__', 'elementwise_div', False,
+                          _scalar_div_)),
         ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
                                       None)),
-        ('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div',
-                                          True, None)),
+        ('__rtruediv__',
+         _binary_creator_('__rtruediv__', 'elementwise_div', True, None)),
         ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
                                      None)),
         ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
                                       None)),
-        ('__floordiv__', _binary_creator_('__floordiv__',
-                                          'elementwise_floordiv', False, None)),
+        ('__floordiv__',
+         _binary_creator_('__floordiv__', 'elementwise_floordiv', False, None)),
         ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False,
                                      None)),
         ('__matmul__', _binary_creator_('__matmul__', "matmul_v2", False,
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index 7616e49c48ffc..57b8411a54fd6 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -102,29 +102,29 @@ def accuracy(input, label, k=1, correct=None, total=None):
     else:
         attrs = {'k': k}
     attrs['sorted'] = False
-    helper.append_op(
-        type="top_k_v2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]})
+    helper.append_op(type="top_k_v2",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={
+                         "Out": [topk_out],
+                         "Indices": [topk_indices]
+                     })
     acc_out = helper.create_variable_for_type_inference(dtype="float32")
     if correct is None:
         correct = helper.create_variable_for_type_inference(dtype="int32")
     if total is None:
         total = helper.create_variable_for_type_inference(dtype="int32")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
+    helper.append_op(type="accuracy",
+                     inputs={
+                         "Out": [topk_out],
+                         "Indices": [topk_indices],
+                         "Label": [label]
+                     },
+                     outputs={
+                         "Accuracy": [acc_out],
+                         "Correct": [correct],
+                         "Total": [total],
+                     })
     return acc_out
 
 
@@ -206,8 +206,8 @@ def auc(input,
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
 
     # for batch auc
-    # we create slide_step+1 buckets, the first slide_steps buckets store 
-    # historical batch-level values, and the last bucket stores the sum values of 
+    # we create slide_step+1 buckets, the first slide_steps buckets store
+    # historical batch-level values, and the last bucket stores the sum values of
     # previous slide_step buckets.
     # The index of bucket that the newest batch will use is determined by batch_id mod slide_steps,
     # and batch_id is store in the last posision of following variable
@@ -222,54 +222,53 @@ def auc(input,
 
     # for global auc
     # Needn't maintain the batch id
-    stat_pos = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
-    stat_neg = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[1, num_thresholds + 1])
+    stat_pos = helper.create_global_variable(persistable=True,
+                                             dtype='int64',
+                                             shape=[1, num_thresholds + 1])
+    stat_neg = helper.create_global_variable(persistable=True,
+                                             dtype='int64',
+                                             shape=[1, num_thresholds + 1])
 
     for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]:
-        helper.set_variable_initializer(
-            var, Constant(
-                value=0.0, force_cpu=False))
+        helper.set_variable_initializer(var, Constant(value=0.0,
+                                                      force_cpu=False))
 
     # Batch AUC
-    helper.append_op(
-        type="auc",
-        inputs={
-            "Predict": [input],
-            "Label": [label],
-            "StatPos": [batch_stat_pos],
-            "StatNeg": [batch_stat_neg]
-        },
-        attrs={
-            "curve": curve,
-            "num_thresholds": num_thresholds,
-            "slide_steps": slide_steps
-        },
-        outputs={
-            "AUC": [batch_auc_out],
-            "StatPosOut": [batch_stat_pos],
-            "StatNegOut": [batch_stat_neg]
-        })
+    helper.append_op(type="auc",
+                     inputs={
+                         "Predict": [input],
+                         "Label": [label],
+                         "StatPos": [batch_stat_pos],
+                         "StatNeg": [batch_stat_neg]
+                     },
+                     attrs={
+                         "curve": curve,
+                         "num_thresholds": num_thresholds,
+                         "slide_steps": slide_steps
+                     },
+                     outputs={
+                         "AUC": [batch_auc_out],
+                         "StatPosOut": [batch_stat_pos],
+                         "StatNegOut": [batch_stat_neg]
+                     })
     # Global AUC
-    helper.append_op(
-        type="auc",
-        inputs={
-            "Predict": [input],
-            "Label": [label],
-            "StatPos": [stat_pos],
-            "StatNeg": [stat_neg]
-        },
-        attrs={
-            "curve": curve,
-            "num_thresholds": num_thresholds,
-            "slide_steps": 0
-        },
-        outputs={
-            "AUC": [auc_out],
-            "StatPosOut": [stat_pos],
-            "StatNegOut": [stat_neg]
-        })
+    helper.append_op(type="auc",
+                     inputs={
+                         "Predict": [input],
+                         "Label": [label],
+                         "StatPos": [stat_pos],
+                         "StatNeg": [stat_neg]
+                     },
+                     attrs={
+                         "curve": curve,
+                         "num_thresholds": num_thresholds,
+                         "slide_steps": 0
+                     },
+                     outputs={
+                         "AUC": [auc_out],
+                         "StatPosOut": [stat_pos],
+                         "StatNegOut": [stat_neg]
+                     })
     return auc_out, batch_auc_out, [
         batch_stat_pos, batch_stat_neg, stat_pos, stat_neg
     ]
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 7fb9f6057b55a..2c3cb903d83ca 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -215,6 +215,7 @@ def _elementwise_op_in_dygraph(x,
                                act=None,
                                use_mkldnn=False,
                                op_name=None):
+
     def is_inplace(op_name):
         return op_name[-1] == "_"
 
@@ -223,15 +224,17 @@ def is_inplace(op_name):
         out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn)
     else:
         if in_dygraph_mode():
-            op = getattr(_C_ops, OP_NAMEMAPPING[op_name]
-                         if not is_inplace(op_name) else op_name)
+            op = getattr(
+                _C_ops,
+                OP_NAMEMAPPING[op_name] if not is_inplace(op_name) else op_name)
             out = op(x, y)
 
         if _in_legacy_dygraph():
             op = getattr(_C_ops, op_name)
             out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn)
-    return dygraph_utils._append_activation_in_dygraph(
-        out, act, use_mkldnn=use_mkldnn)
+    return dygraph_utils._append_activation_in_dygraph(out,
+                                                       act,
+                                                       use_mkldnn=use_mkldnn)
 
 
 def fc(input,
@@ -369,27 +372,31 @@ def fc(input,
             reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
         ] + [size]
 
-        w = helper.create_parameter(
-            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
+        w = helper.create_parameter(attr=param_attr,
+                                    shape=param_shape,
+                                    dtype=dtype,
+                                    is_bias=False)
         tmp = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type="mul",
-            inputs={"X": input_var,
-                    "Y": w},
-            outputs={"Out": tmp},
-            attrs={"x_num_col_dims": num_flatten_dims,
-                   "y_num_col_dims": 1})
+        helper.append_op(type="mul",
+                         inputs={
+                             "X": input_var,
+                             "Y": w
+                         },
+                         outputs={"Out": tmp},
+                         attrs={
+                             "x_num_col_dims": num_flatten_dims,
+                             "y_num_col_dims": 1
+                         })
         mul_results.append(tmp)
 
     if len(mul_results) == 1:
         pre_bias = mul_results[0]
     else:
         pre_bias = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type="sum",
-            inputs={"X": mul_results},
-            outputs={"Out": pre_bias},
-            attrs={"use_mkldnn": False})
+        helper.append_op(type="sum",
+                         inputs={"X": mul_results},
+                         outputs={"Out": pre_bias},
+                         attrs={"use_mkldnn": False})
     # add bias
     pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
     # add activation
@@ -529,22 +536,25 @@ def embedding(input,
 
     remote_prefetch = True if is_sparse else False
 
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=size,
+                                dtype=dtype,
+                                is_bias=False)
     tmp = helper.create_variable_for_type_inference(dtype)
     padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
         size[0] + padding_idx)
-    helper.append_op(
-        type='lookup_table',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={
-            'is_sparse': is_sparse,
-            'is_distributed': is_distributed,
-            'remote_prefetch': remote_prefetch,
-            'padding_idx': padding_idx
-        })
+    helper.append_op(type='lookup_table',
+                     inputs={
+                         'Ids': input,
+                         'W': w
+                     },
+                     outputs={'Out': tmp},
+                     attrs={
+                         'is_sparse': is_sparse,
+                         'is_distributed': is_distributed,
+                         'remote_prefetch': remote_prefetch,
+                         'padding_idx': padding_idx
+                     })
     return tmp
 
 
@@ -606,14 +616,18 @@ def _pull_sparse(input,
         'is_distributed': True
     }
     # this is only for compatible with embedding op
-    w, _ = helper.create_or_get_global_variable(
-        name=name, shape=[size], dtype=dtype, is_bias=False, persistable=True)
-    helper.append_op(
-        type='pull_sparse',
-        inputs={'Ids': inputs,
-                'W': w},
-        outputs={'Out': outs},
-        attrs=attrs)
+    w, _ = helper.create_or_get_global_variable(name=name,
+                                                shape=[size],
+                                                dtype=dtype,
+                                                is_bias=False,
+                                                persistable=True)
+    helper.append_op(type='pull_sparse',
+                     inputs={
+                         'Ids': inputs,
+                         'W': w
+                     },
+                     outputs={'Out': outs},
+                     attrs=attrs)
     if len(outs) == 1:
         return outs[0]
     return outs
@@ -677,14 +691,18 @@ def _pull_sparse_v2(input,
         'is_distributed': True
     }
     # this is only for compatible with embedding op
-    w, _ = helper.create_or_get_global_variable(
-        name=name, shape=[size], dtype=dtype, is_bias=False, persistable=True)
-    helper.append_op(
-        type='pull_sparse_v2',
-        inputs={'Ids': inputs,
-                'W': w},
-        outputs={'Out': outs},
-        attrs=attrs)
+    w, _ = helper.create_or_get_global_variable(name=name,
+                                                shape=[size],
+                                                dtype=dtype,
+                                                is_bias=False,
+                                                persistable=True)
+    helper.append_op(type='pull_sparse_v2',
+                     inputs={
+                         'Ids': inputs,
+                         'W': w
+                     },
+                     outputs={'Out': outs},
+                     attrs=attrs)
     if len(outs) == 1:
         return outs[0]
     return outs
@@ -736,18 +754,21 @@ def _pull_gpups_sparse(input,
         helper.create_variable_for_type_inference(dtype)
         for i in range(len(inputs))
     ]
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=[size[0]], dtype=dtype, is_bias=False)
-    helper.append_op(
-        type='pull_gpups_sparse',
-        inputs={'Ids': inputs,
-                'W': w},
-        outputs={'Out': outs},
-        attrs={
-            'size': size,
-            'is_distributed': is_distributed,
-            'is_sparse': is_sparse
-        })
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=[size[0]],
+                                dtype=dtype,
+                                is_bias=False)
+    helper.append_op(type='pull_gpups_sparse',
+                     inputs={
+                         'Ids': inputs,
+                         'W': w
+                     },
+                     outputs={'Out': outs},
+                     attrs={
+                         'size': size,
+                         'is_distributed': is_distributed,
+                         'is_sparse': is_sparse
+                     })
     if len(outs) == 1:
         return outs[0]
     return outs
@@ -795,18 +816,21 @@ def _pull_box_sparse(input,
         helper.create_variable_for_type_inference(dtype)
         for i in range(len(inputs))
     ]
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=[size], dtype=dtype, is_bias=False)
-    helper.append_op(
-        type='pull_box_sparse',
-        inputs={'Ids': inputs,
-                'W': w},
-        outputs={'Out': outs},
-        attrs={
-            'size': size,
-            'is_distributed': is_distributed,
-            'is_sparse': is_sparse
-        })
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=[size],
+                                dtype=dtype,
+                                is_bias=False)
+    helper.append_op(type='pull_box_sparse',
+                     inputs={
+                         'Ids': inputs,
+                         'W': w
+                     },
+                     outputs={'Out': outs},
+                     attrs={
+                         'size': size,
+                         'is_distributed': is_distributed,
+                         'is_sparse': is_sparse
+                     })
     if len(outs) == 1:
         return outs[0]
     return outs
@@ -907,10 +931,9 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
     check_variable_and_dtype(label, 'label', ['int64'], 'linear_chain_crf')
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[2] if length else input.shape[1]
-    transition = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[size + 2, size],
-        dtype=helper.input_dtype())
+    transition = helper.create_parameter(attr=helper.param_attr,
+                                         shape=[size + 2, size],
+                                         dtype=helper.input_dtype())
     alpha = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype())
     emission_exps = helper.create_variable_for_type_inference(
@@ -926,15 +949,14 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
     }
     if length:
         this_inputs['Length'] = [length]
-    helper.append_op(
-        type='linear_chain_crf',
-        inputs=this_inputs,
-        outputs={
-            "Alpha": [alpha],
-            "EmissionExps": [emission_exps],
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood
-        })
+    helper.append_op(type='linear_chain_crf',
+                     inputs=this_inputs,
+                     outputs={
+                         "Alpha": [alpha],
+                         "EmissionExps": [emission_exps],
+                         "TransitionExps": transition_exps,
+                         "LogLikelihood": log_likelihood
+                     })
 
     return log_likelihood
 
@@ -999,10 +1021,9 @@ def crf_decoding(input, param_attr, label=None, length=None):
     inputs = {"Emission": [input], "Transition": transition, "Label": label}
     if length:
         inputs['Length'] = length
-    helper.append_op(
-        type='crf_decoding',
-        inputs=inputs,
-        outputs={"ViterbiPath": [viterbi_path]})
+    helper.append_op(type='crf_decoding',
+                     inputs=inputs,
+                     outputs={"ViterbiPath": [viterbi_path]})
 
     return viterbi_path
 
@@ -1036,13 +1057,16 @@ def cos_sim(X, Y):
     out = helper.create_variable_for_type_inference(dtype=X.dtype)
     xnorm = helper.create_variable_for_type_inference(dtype=X.dtype)
     ynorm = helper.create_variable_for_type_inference(dtype=X.dtype)
-    helper.append_op(
-        type='cos_sim',
-        inputs={'X': [X],
-                'Y': [Y]},
-        outputs={'Out': [out],
-                 'XNorm': [xnorm],
-                 'YNorm': [ynorm]})
+    helper.append_op(type='cos_sim',
+                     inputs={
+                         'X': [X],
+                         'Y': [Y]
+                     },
+                     outputs={
+                         'Out': [out],
+                         'XNorm': [xnorm],
+                         'YNorm': [ynorm]
+                     })
     return out
 
 
@@ -1113,15 +1137,16 @@ def dropout(x,
         return x
 
     if _non_static_mode():
-        if (seed is None or
-                seed == 0) and default_main_program().random_seed != 0:
+        if (seed is None
+                or seed == 0) and default_main_program().random_seed != 0:
             seed = default_main_program().random_seed
         if is_test is None:
             is_test = not _dygraph_tracer()._train_mode
-        out, mask = _C_ops.dropout(
-            x, 'dropout_prob', dropout_prob, 'is_test', is_test, 'fix_seed',
-            seed is not None, 'seed', seed if seed is not None else 0,
-            'dropout_implementation', dropout_implementation)
+        out, mask = _C_ops.dropout(x, 'dropout_prob', dropout_prob, 'is_test',
+                                   is_test, 'fix_seed', seed is not None,
+                                   'seed', seed if seed is not None else 0,
+                                   'dropout_implementation',
+                                   dropout_implementation)
         return out
 
     def get_attrs(prog, dropout_prob, is_test, seed):
@@ -1146,12 +1171,13 @@ def get_attrs(prog, dropout_prob, is_test, seed):
 
     attrs = get_attrs(helper.main_program, dropout_prob, is_test, seed)
 
-    helper.append_op(
-        type='dropout',
-        inputs={'X': [x]},
-        outputs={'Out': [out],
-                 'Mask': [mask]},
-        attrs=attrs)
+    helper.append_op(type='dropout',
+                     inputs={'X': [x]},
+                     outputs={
+                         'Out': [out],
+                         'Mask': [mask]
+                     },
+                     attrs=attrs)
     return out
 
 
@@ -1287,22 +1313,21 @@ def chunk_eval(input,
     if seq_length is not None:
         this_input["SeqLength"] = [seq_length]
 
-    helper.append_op(
-        type="chunk_eval",
-        inputs=this_input,
-        outputs={
-            "Precision": [precision],
-            "Recall": [recall],
-            "F1-Score": [f1_score],
-            "NumInferChunks": [num_infer_chunks],
-            "NumLabelChunks": [num_label_chunks],
-            "NumCorrectChunks": [num_correct_chunks]
-        },
-        attrs={
-            "num_chunk_types": num_chunk_types,
-            "chunk_scheme": chunk_scheme,
-            "excluded_chunk_types": excluded_chunk_types or []
-        })
+    helper.append_op(type="chunk_eval",
+                     inputs=this_input,
+                     outputs={
+                         "Precision": [precision],
+                         "Recall": [recall],
+                         "F1-Score": [f1_score],
+                         "NumInferChunks": [num_infer_chunks],
+                         "NumLabelChunks": [num_label_chunks],
+                         "NumCorrectChunks": [num_correct_chunks]
+                     },
+                     attrs={
+                         "num_chunk_types": num_chunk_types,
+                         "chunk_scheme": chunk_scheme,
+                         "excluded_chunk_types": excluded_chunk_types or []
+                     })
     return (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
             num_correct_chunks)
 
@@ -1433,11 +1458,10 @@ def softmax(input, use_cudnn=True, name=None, axis=-1):
 
     dtype = helper.input_dtype()
     softmax_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="softmax",
-        inputs={"X": input},
-        outputs={"Out": softmax_out},
-        attrs=attrs)
+    helper.append_op(type="softmax",
+                     inputs={"X": input},
+                     outputs={"Out": softmax_out},
+                     attrs=attrs)
     return softmax_out
 
 
@@ -1615,9 +1639,9 @@ def conv2d(input,
     if groups is None:
         num_filter_channels = num_channels
     elif groups <= 0:
-        raise ValueError("the groups of input must be greater than 0, "
-                         "but received the groups of input is {}".format(
-                             groups))
+        raise ValueError(
+            "the groups of input must be greater than 0, "
+            "but received the groups of input is {}".format(groups))
     else:
         if num_channels % groups != 0:
             raise ValueError(
@@ -1627,12 +1651,12 @@ def conv2d(input,
         num_filter_channels = num_channels // groups
 
     l_type = 'conv2d'
-    if (num_channels == groups and num_filters % num_channels == 0 and
-            not use_cudnn):
+    if (num_channels == groups and num_filters % num_channels == 0
+            and not use_cudnn):
         l_type = 'depthwise_conv2d'
 
-    if (num_channels == groups and num_filters % num_channels == 0 and
-            core.is_compiled_with_rocm()):
+    if (num_channels == groups and num_filters % num_channels == 0
+            and core.is_compiled_with_rocm()):
         l_type = 'depthwise_conv2d'
 
     # NPU only supports depthwise_conv2d when  "input_channel = output_channel = groups"
@@ -1651,6 +1675,7 @@ def conv2d(input,
 
     # padding
     def _update_padding(padding, data_format):
+
         def is_list_or_tuple(ele):
             if isinstance(ele, list) or isinstance(ele, tuple):
                 return True
@@ -1720,24 +1745,23 @@ def _get_default_param_initializer():
             "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
         use_cudnn = False
 
-    helper.append_op(
-        type=l_type,
-        inputs={
-            'Input': input,
-            'Filter': filter_param,
-        },
-        outputs={"Output": pre_bias},
-        attrs={
-            'strides': stride,
-            'paddings': padding,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn,
-            'use_mkldnn': False,
-            'fuse_relu_before_depthwise_conv': False,
-            "padding_algorithm": padding_algorithm,
-            "data_format": data_format,
-        })
+    helper.append_op(type=l_type,
+                     inputs={
+                         'Input': input,
+                         'Filter': filter_param,
+                     },
+                     outputs={"Output": pre_bias},
+                     attrs={
+                         'strides': stride,
+                         'paddings': padding,
+                         'dilations': dilation,
+                         'groups': groups,
+                         'use_cudnn': use_cudnn,
+                         'use_mkldnn': False,
+                         'fuse_relu_before_depthwise_conv': False,
+                         "padding_algorithm": padding_algorithm,
+                         "data_format": data_format,
+                     })
 
     if data_format == 'NCHW':
         pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -1913,8 +1937,8 @@ def conv3d(input,
     channel_last = (data_format == "NDHWC")
     if len(input.shape) != 5:
         raise ValueError(
-            "Input should be 5D tensor, but received input with the shape of {}".
-            format(input.shape))
+            "Input should be 5D tensor, but received input with the shape of {}"
+            .format(input.shape))
     num_channels = input.shape[4] if channel_last else input.shape[1]
     if num_channels < 0:
         raise ValueError(
@@ -1925,8 +1949,8 @@ def conv3d(input,
         num_filter_channels = num_channels
     elif groups <= 0:
         raise ValueError(
-            "the groups of conv3d should be greater than 0. Received groups: {}".
-            format(groups))
+            "the groups of conv3d should be greater than 0. Received groups: {}"
+            .format(groups))
     else:
         if num_channels % groups != 0:
             raise ValueError(
@@ -1940,6 +1964,7 @@ def conv3d(input,
     dilation = utils.convert_to_list(dilation, 3, 'dilation')
 
     def _update_padding(padding, data_format):
+
         def is_list_or_tuple(ele):
             if isinstance(ele, list) or isinstance(ele, tuple):
                 return True
@@ -2011,23 +2036,22 @@ def _get_default_param_initializer():
 
     pre_bias = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=l_type,
-        inputs={
-            'Input': input,
-            'Filter': filter_param,
-        },
-        outputs={"Output": pre_bias},
-        attrs={
-            'strides': stride,
-            'paddings': padding,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn,
-            'use_mkldnn': False,
-            "padding_algorithm": padding_algorithm,
-            "data_format": data_format,
-        })
+    helper.append_op(type=l_type,
+                     inputs={
+                         'Input': input,
+                         'Filter': filter_param,
+                     },
+                     outputs={"Output": pre_bias},
+                     attrs={
+                         'strides': stride,
+                         'paddings': padding,
+                         'dilations': dilation,
+                         'groups': groups,
+                         'use_cudnn': use_cudnn,
+                         'use_mkldnn': False,
+                         "padding_algorithm": padding_algorithm,
+                         "data_format": data_format,
+                     })
 
     if data_format == 'NCDHW':
         pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -2179,6 +2203,7 @@ def pool2d(input,
     pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
 
     def update_padding(padding, data_format):
+
         def is_list_or_tuple(ele):
             if isinstance(ele, list) or isinstance(ele, tuple):
                 return True
@@ -2233,23 +2258,22 @@ def is_list_or_tuple(ele):
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": input},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "global_pooling": global_pooling,
-            "strides": pool_stride,
-            "paddings": pool_padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": use_cudnn,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": exclusive,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": input},
+                     outputs={"Out": pool_out},
+                     attrs={
+                         "pooling_type": pool_type,
+                         "ksize": pool_size,
+                         "global_pooling": global_pooling,
+                         "strides": pool_stride,
+                         "paddings": pool_padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": use_cudnn,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": exclusive,
+                         "data_format": data_format,
+                     })
 
     return pool_out
 
@@ -2403,6 +2427,7 @@ def pool3d(input,
     pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride')
 
     def update_padding(padding, data_format):
+
         def is_list_or_tuple(ele):
             if isinstance(ele, (list, tuple)):
                 return True
@@ -2461,23 +2486,22 @@ def is_list_or_tuple(ele):
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": input},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "global_pooling": global_pooling,
-            "strides": pool_stride,
-            "paddings": pool_padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": use_cudnn,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": exclusive,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": input},
+                     outputs={"Out": pool_out},
+                     attrs={
+                         "pooling_type": pool_type,
+                         "ksize": pool_size,
+                         "global_pooling": global_pooling,
+                         "strides": pool_stride,
+                         "paddings": pool_padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": use_cudnn,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": exclusive,
+                         "data_format": data_format,
+                     })
 
     return pool_out
 
@@ -2615,15 +2639,14 @@ def adaptive_pool2d(input,
         mask = helper.create_variable_for_type_inference(dtype)
         outputs["Mask"] = mask
 
-    helper.append_op(
-        type=l_type,
-        inputs={"X": input},
-        outputs=outputs,
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": input},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": pool_type,
+                         "ksize": pool_size,
+                         "adaptive": True,
+                     })
 
     return (pool_out, mask) if require_index else pool_out
 
@@ -2775,15 +2798,14 @@ def adaptive_pool3d(input,
         mask = helper.create_variable_for_type_inference(dtype)
         outputs["Mask"] = mask
 
-    helper.append_op(
-        type=l_type,
-        inputs={"X": input},
-        outputs=outputs,
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": input},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": pool_type,
+                         "ksize": pool_size,
+                         "adaptive": True,
+                     })
 
     return (pool_out, mask) if require_index else pool_out
 
@@ -2939,32 +2961,31 @@ def batch_norm(input,
     param_shape = [channel_num]
 
     # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        default_initializer=Constant(1.0))
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
-
-    mean = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_mean_name,
-            initializer=Constant(0.0),
-            trainable=False,
-            do_model_average=do_model_average_for_mean_and_var),
-        shape=param_shape,
-        dtype=dtype)
+    scale = helper.create_parameter(attr=helper.param_attr,
+                                    shape=param_shape,
+                                    dtype=dtype,
+                                    default_initializer=Constant(1.0))
+    bias = helper.create_parameter(attr=helper.bias_attr,
+                                   shape=param_shape,
+                                   dtype=dtype,
+                                   is_bias=True)
+
+    mean = helper.create_parameter(attr=ParamAttr(
+        name=moving_mean_name,
+        initializer=Constant(0.0),
+        trainable=False,
+        do_model_average=do_model_average_for_mean_and_var),
+                                   shape=param_shape,
+                                   dtype=dtype)
     mean.stop_gradient = True
 
-    variance = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_variance_name,
-            initializer=Constant(1.0),
-            trainable=False,
-            do_model_average=do_model_average_for_mean_and_var),
-        shape=param_shape,
-        dtype=dtype)
+    variance = helper.create_parameter(attr=ParamAttr(
+        name=moving_variance_name,
+        initializer=Constant(1.0),
+        trainable=False,
+        do_model_average=do_model_average_for_mean_and_var),
+                                       shape=param_shape,
+                                       dtype=dtype)
     variance.stop_gradient = True
 
     # create output
@@ -3001,11 +3022,12 @@ def batch_norm(input,
                 input, scale, bias, mean, variance, None, mean_out,
                 variance_out, *attrs_)
 
-        return dygraph_utils._append_activation_in_dygraph(
-            batch_norm_out, act=act, use_mkldnn=False)
+        return dygraph_utils._append_activation_in_dygraph(batch_norm_out,
+                                                           act=act,
+                                                           use_mkldnn=False)
 
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(dtype=dtype,
+                                                           stop_gradient=True)
     saved_variance = helper.create_variable_for_type_inference(
         dtype=dtype, stop_gradient=True)
     reserve_space = None
@@ -3048,8 +3070,10 @@ def batch_norm(input,
     if reserve_space is not None:
         outputs["ReserveSpace"] = reserve_space
 
-    helper.append_op(
-        type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type="batch_norm",
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
 
     return helper.append_activation(batch_norm_out)
 
@@ -3168,32 +3192,31 @@ def inplace_abn(input,
     param_shape = [channel_num]
 
     # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        default_initializer=Constant(1.0))
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
-
-    mean = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_mean_name,
-            initializer=Constant(0.0),
-            trainable=False,
-            do_model_average=do_model_average_for_mean_and_var),
-        shape=param_shape,
-        dtype=dtype)
+    scale = helper.create_parameter(attr=helper.param_attr,
+                                    shape=param_shape,
+                                    dtype=dtype,
+                                    default_initializer=Constant(1.0))
+    bias = helper.create_parameter(attr=helper.bias_attr,
+                                   shape=param_shape,
+                                   dtype=dtype,
+                                   is_bias=True)
+
+    mean = helper.create_parameter(attr=ParamAttr(
+        name=moving_mean_name,
+        initializer=Constant(0.0),
+        trainable=False,
+        do_model_average=do_model_average_for_mean_and_var),
+                                   shape=param_shape,
+                                   dtype=dtype)
     mean.stop_gradient = True
 
-    variance = helper.create_parameter(
-        attr=ParamAttr(
-            name=moving_variance_name,
-            initializer=Constant(1.0),
-            trainable=False,
-            do_model_average=do_model_average_for_mean_and_var),
-        shape=param_shape,
-        dtype=dtype)
+    variance = helper.create_parameter(attr=ParamAttr(
+        name=moving_variance_name,
+        initializer=Constant(1.0),
+        trainable=False,
+        do_model_average=do_model_average_for_mean_and_var),
+                                       shape=param_shape,
+                                       dtype=dtype)
     variance.stop_gradient = True
 
     # create output
@@ -3235,8 +3258,8 @@ def inplace_abn(input,
                 variance_out, *attrs__)
             return batch_norm_out
 
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(dtype=dtype,
+                                                           stop_gradient=True)
     saved_variance = helper.create_variable_for_type_inference(
         dtype=dtype, stop_gradient=True)
     reserve_space = helper.create_variable_for_type_inference(
@@ -3273,8 +3296,10 @@ def inplace_abn(input,
     if reserve_space is not None:
         outputs["ReserveSpace"] = reserve_space
 
-    helper.append_op(
-        type="inplace_abn", inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type="inplace_abn",
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
 
     return batch_norm_out
 
@@ -3362,29 +3387,27 @@ def instance_norm(input,
     input_shape = input.shape
     if len(input.shape) < 2 or len(input.shape) > 5:
         raise ValueError(
-            'expected 2D or 3D or 4D or 5D input (got {}D input, input shape is: {})'.
-            format(len(input.shape), input_shape))
+            'expected 2D or 3D or 4D or 5D input (got {}D input, input shape is: {})'
+            .format(len(input.shape), input_shape))
     channel_num = input_shape[1]
 
     param_shape = [channel_num]
 
     if param_attr != False and bias_attr != False:
         # create parameter
-        scale = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=param_shape,
-            dtype=dtype,
-            default_initializer=Constant(1.0))
-        bias = helper.create_parameter(
-            attr=helper.bias_attr,
-            shape=param_shape,
-            dtype=dtype,
-            is_bias=True,
-            default_initializer=Constant(0.0))
+        scale = helper.create_parameter(attr=helper.param_attr,
+                                        shape=param_shape,
+                                        dtype=dtype,
+                                        default_initializer=Constant(1.0))
+        bias = helper.create_parameter(attr=helper.bias_attr,
+                                       shape=param_shape,
+                                       dtype=dtype,
+                                       is_bias=True,
+                                       default_initializer=Constant(0.0))
 
     # create output
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(dtype=dtype,
+                                                           stop_gradient=True)
     saved_variance = helper.create_variable_for_type_inference(
         dtype=dtype, stop_gradient=True)
 
@@ -3395,15 +3418,16 @@ def instance_norm(input,
         inputs["Scale"] = scale
         inputs["Bias"] = bias
 
-    helper.append_op(
-        type="instance_norm",
-        inputs=inputs,
-        outputs={
-            "Y": instance_norm_out,
-            "SavedMean": saved_mean,
-            "SavedVariance": saved_variance
-        },
-        attrs={"epsilon": epsilon, })
+    helper.append_op(type="instance_norm",
+                     inputs=inputs,
+                     outputs={
+                         "Y": instance_norm_out,
+                         "SavedMean": saved_mean,
+                         "SavedVariance": saved_variance
+                     },
+                     attrs={
+                         "epsilon": epsilon,
+                     })
 
     return instance_norm_out
 
@@ -3520,44 +3544,39 @@ def data_norm(input,
     if name == None:
         name = "dn"
     if enable_scale_and_shift:
-        scale_w = helper.create_parameter(
-            attr=ParamAttr(
-                name=name + '.scale_w',
-                initializer=Constant(value=float(scale_w_default)),
-                trainable=True),
-            shape=param_shape,
-            dtype=input.dtype)
-        bias = helper.create_parameter(
-            attr=ParamAttr(
-                name=name + '.bias',
-                initializer=Constant(value=float(bias_default)),
-                trainable=True),
-            shape=param_shape,
-            dtype=input.dtype)
-    # create parameter
-    batch_size = helper.create_parameter(
-        attr=ParamAttr(
-            name=name + '.batch_size',
-            initializer=Constant(value=float(batch_size_default)),
+        scale_w = helper.create_parameter(attr=ParamAttr(
+            name=name + '.scale_w',
+            initializer=Constant(value=float(scale_w_default)),
             trainable=True),
-        shape=param_shape,
-        dtype=input.dtype)
-
-    batch_sum = helper.create_parameter(
-        attr=ParamAttr(
-            name=name + '.batch_sum',
-            initializer=Constant(value=float(batch_sum_default)),
-            trainable=True),
-        shape=param_shape,
-        dtype=input.dtype)
-
-    batch_square_sum = helper.create_parameter(
-        attr=ParamAttr(
-            name=name + '.batch_square_sum',
-            initializer=Constant(value=float(batch_square_sum_default)),
+                                          shape=param_shape,
+                                          dtype=input.dtype)
+        bias = helper.create_parameter(attr=ParamAttr(
+            name=name + '.bias',
+            initializer=Constant(value=float(bias_default)),
             trainable=True),
-        shape=param_shape,
-        dtype=input.dtype)
+                                       shape=param_shape,
+                                       dtype=input.dtype)
+    # create parameter
+    batch_size = helper.create_parameter(attr=ParamAttr(
+        name=name + '.batch_size',
+        initializer=Constant(value=float(batch_size_default)),
+        trainable=True),
+                                         shape=param_shape,
+                                         dtype=input.dtype)
+
+    batch_sum = helper.create_parameter(attr=ParamAttr(
+        name=name + '.batch_sum',
+        initializer=Constant(value=float(batch_sum_default)),
+        trainable=True),
+                                        shape=param_shape,
+                                        dtype=input.dtype)
+
+    batch_square_sum = helper.create_parameter(attr=ParamAttr(
+        name=name + '.batch_square_sum',
+        initializer=Constant(value=float(batch_square_sum_default)),
+        trainable=True),
+                                               shape=param_shape,
+                                               dtype=input.dtype)
 
     means = helper.create_variable(dtype=dtype, stop_gradient=True)
     scales = helper.create_variable(dtype=dtype, stop_gradient=True)
@@ -3583,18 +3602,17 @@ def data_norm(input,
     if enable_scale_and_shift:
         inputs["scale_w"] = scale_w
         inputs["bias"] = bias
-    helper.append_op(
-        type="data_norm",
-        inputs=inputs,
-        outputs={
-            "Y": data_norm_out,
-            "Means": means,
-            "Scales": scales,
-            "BatchSize": batch_size,
-            "BatchSum": batch_sum,
-            "BatchSquareSum": batch_square_sum
-        },
-        attrs=attrs)
+    helper.append_op(type="data_norm",
+                     inputs=inputs,
+                     outputs={
+                         "Y": data_norm_out,
+                         "Means": means,
+                         "Scales": scales,
+                         "BatchSize": batch_size,
+                         "BatchSum": batch_sum,
+                         "BatchSquareSum": batch_square_sum
+                     },
+                     attrs=attrs)
 
     return helper.append_activation(data_norm_out)
 
@@ -3684,41 +3702,43 @@ def layer_norm(input,
     param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])]
     if scale:
         assert param_attr is not False, "param_attr should not be False when using scale."
-        scale = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=param_shape,
-            dtype=dtype,
-            default_initializer=Constant(1.0))
+        scale = helper.create_parameter(attr=helper.param_attr,
+                                        shape=param_shape,
+                                        dtype=dtype,
+                                        default_initializer=Constant(1.0))
         inputs['Scale'] = scale
     else:
         if param_attr:
             warnings.warn("param_attr is only available with scale is True.")
     if shift:
         assert bias_attr is not False, "bias_attr should not be False when using shift."
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        bias = helper.create_parameter(attr=helper.bias_attr,
+                                       shape=param_shape,
+                                       dtype=dtype,
+                                       is_bias=True)
         inputs['Bias'] = bias
     else:
         if bias_attr:
             warnings.warn("bias_attr is only available with shift is True.")
 
     # create output
-    mean_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    variance_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
+    mean_out = helper.create_variable_for_type_inference(dtype=dtype,
+                                                         stop_gradient=True)
+    variance_out = helper.create_variable_for_type_inference(dtype=dtype,
+                                                             stop_gradient=True)
     layer_norm_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type="layer_norm",
-        inputs=inputs,
-        outputs={
-            "Y": layer_norm_out,
-            "Mean": mean_out,
-            "Variance": variance_out,
-        },
-        attrs={"epsilon": epsilon,
-               "begin_norm_axis": begin_norm_axis})
+    helper.append_op(type="layer_norm",
+                     inputs=inputs,
+                     outputs={
+                         "Y": layer_norm_out,
+                         "Mean": mean_out,
+                         "Variance": variance_out,
+                     },
+                     attrs={
+                         "epsilon": epsilon,
+                         "begin_norm_axis": begin_norm_axis
+                     })
 
     return helper.append_activation(layer_norm_out)
 
@@ -3792,15 +3812,16 @@ def group_norm(input,
     channel_num = input_shape[1] if data_layout == 'NCHW' else input_shape[-1]
     param_shape = [channel_num]
     if param_attr:
-        scale = helper.create_parameter(
-            attr=helper.param_attr,
-            shape=param_shape,
-            dtype=dtype,
-            default_initializer=Constant(1.0))
+        scale = helper.create_parameter(attr=helper.param_attr,
+                                        shape=param_shape,
+                                        dtype=dtype,
+                                        default_initializer=Constant(1.0))
         inputs['Scale'] = scale
     if bias_attr:
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        bias = helper.create_parameter(attr=helper.bias_attr,
+                                       shape=param_shape,
+                                       dtype=dtype,
+                                       is_bias=True)
         inputs['Bias'] = bias
 
     # create output
@@ -3808,19 +3829,18 @@ def group_norm(input,
     variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
     group_norm_out = helper.create_variable(dtype=dtype)
 
-    helper.append_op(
-        type="group_norm",
-        inputs=inputs,
-        outputs={
-            "Y": group_norm_out,
-            "Mean": mean_out,
-            "Variance": variance_out,
-        },
-        attrs={
-            "epsilon": epsilon,
-            "groups": groups,
-            "data_layout": data_layout
-        })
+    helper.append_op(type="group_norm",
+                     inputs=inputs,
+                     outputs={
+                         "Y": group_norm_out,
+                         "Mean": mean_out,
+                         "Variance": variance_out,
+                     },
+                     attrs={
+                         "epsilon": epsilon,
+                         "groups": groups,
+                         "data_layout": data_layout
+                     })
 
     return helper.append_activation(group_norm_out)
 
@@ -3906,33 +3926,32 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None):
     h = input_shape[dim]
     w = np.prod(input_shape) // h
 
-    u = helper.create_parameter(
-        attr=ParamAttr(),
-        shape=[h],
-        dtype=dtype,
-        default_initializer=Normal(0., 1.))
+    u = helper.create_parameter(attr=ParamAttr(),
+                                shape=[h],
+                                dtype=dtype,
+                                default_initializer=Normal(0., 1.))
     u.stop_gradient = True
     inputs['U'] = u
-    v = helper.create_parameter(
-        attr=ParamAttr(),
-        shape=[w],
-        dtype=dtype,
-        default_initializer=Normal(0., 1.))
+    v = helper.create_parameter(attr=ParamAttr(),
+                                shape=[w],
+                                dtype=dtype,
+                                default_initializer=Normal(0., 1.))
     inputs['V'] = v
     v.stop_gradient = True
 
     # create output
     out = helper.create_variable(dtype=dtype)
 
-    helper.append_op(
-        type="spectral_norm",
-        inputs=inputs,
-        outputs={"Out": out, },
-        attrs={
-            "dim": dim,
-            "power_iters": power_iters,
-            "eps": eps,
-        })
+    helper.append_op(type="spectral_norm",
+                     inputs=inputs,
+                     outputs={
+                         "Out": out,
+                     },
+                     attrs={
+                         "dim": dim,
+                         "power_iters": power_iters,
+                         "eps": eps,
+                     })
 
     return out
 
@@ -4118,8 +4137,8 @@ def conv2d_transpose(input,
 
     input_channel = input.shape[1] if data_format == 'NCHW' else input.shape[-1]
     op_type = 'conv2d_transpose'
-    if (input_channel == groups and num_filters == input_channel and
-            not use_cudnn):
+    if (input_channel == groups and num_filters == input_channel
+            and not use_cudnn):
         op_type = 'depthwise_conv2d_transpose'
 
     helper = LayerHelper(op_type, **locals())
@@ -4133,6 +4152,7 @@ def conv2d_transpose(input,
         raise ValueError("use_cudnn should be True or False")
 
     def _update_padding(padding, data_format):
+
         def is_list_or_tuple(ele):
             if isinstance(ele, list) or isinstance(ele, tuple):
                 return True
@@ -4206,31 +4226,33 @@ def is_list_or_tuple(ele):
     if groups is None:
         groups = 1
     elif groups <= 0:
-        raise ValueError("the groups of input must be greater than 0, "
-                         "but received the groups of input is {}".format(
-                             groups))
+        raise ValueError(
+            "the groups of input must be greater than 0, "
+            "but received the groups of input is {}".format(groups))
 
     filter_shape = [input_channel, num_filters // groups] + filter_size
 
-    img_filter = helper.create_parameter(
-        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+    img_filter = helper.create_parameter(dtype=input.dtype,
+                                         shape=filter_shape,
+                                         attr=helper.param_attr)
 
     pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type=op_type,
-        inputs={'Input': [input],
-                'Filter': [img_filter]},
-        outputs={'Output': pre_bias},
-        attrs={
-            'output_size': output_size,
-            'strides': stride,
-            'paddings': padding,
-            'padding_algorithm': padding_algorithm,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn,
-            'data_format': data_format
-        })
+    helper.append_op(type=op_type,
+                     inputs={
+                         'Input': [input],
+                         'Filter': [img_filter]
+                     },
+                     outputs={'Output': pre_bias},
+                     attrs={
+                         'output_size': output_size,
+                         'strides': stride,
+                         'paddings': padding,
+                         'padding_algorithm': padding_algorithm,
+                         'dilations': dilation,
+                         'groups': groups,
+                         'use_cudnn': use_cudnn,
+                         'data_format': data_format
+                     })
 
     if data_format == 'NCHW':
         pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -4429,10 +4451,9 @@ def conv3d_transpose(input,
         raise TypeError("Input of conv3d_transpose must be Variable")
     if len(input.shape) != 5:
         raise ValueError(
-            "Input should be 5D tensor, but received input with the shape of {}".
-            format(input.shape))
-    input_channel = input.shape[1] if data_format == 'NCDHW' else input.shape[
-        -1]
+            "Input should be 5D tensor, but received input with the shape of {}"
+            .format(input.shape))
+    input_channel = input.shape[1] if data_format == 'NCDHW' else input.shape[-1]
 
     stride = utils.convert_to_list(stride, 3, 'stride')
     dilation = utils.convert_to_list(dilation, 3, 'dilation')
@@ -4441,6 +4462,7 @@ def conv3d_transpose(input,
         raise ValueError("use_cudnn should be True or False")
 
     def _update_padding(padding, data_format):
+
         def is_list_or_tuple(ele):
             if isinstance(ele, list) or isinstance(ele, tuple):
                 return True
@@ -4524,16 +4546,18 @@ def is_list_or_tuple(ele):
     groups = 1 if groups is None else groups
     if groups <= 0:
         raise ValueError(
-            "the groups of conv3d_transpose should be greater than 0. Received groups: {}".
-            format(groups))
+            "the groups of conv3d_transpose should be greater than 0. Received groups: {}"
+            .format(groups))
     if num_filters % groups != 0:
-        raise ValueError("Attr(num_filters) must be divisible by groups,"
-                         "Received: Attr(num_filters) is {}, the groups is {}".
-                         format(num_filters, groups))
+        raise ValueError(
+            "Attr(num_filters) must be divisible by groups,"
+            "Received: Attr(num_filters) is {}, the groups is {}".format(
+                num_filters, groups))
 
     filter_shape = [input_channel, num_filters // groups] + filter_size
-    img_filter = helper.create_parameter(
-        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+    img_filter = helper.create_parameter(dtype=input.dtype,
+                                         shape=filter_shape,
+                                         attr=helper.param_attr)
 
     if data_format == 'NCDHW':
         data_format = 'NCHW'
@@ -4541,21 +4565,22 @@ def is_list_or_tuple(ele):
         data_format = 'NHWC'
 
     pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type=l_type,
-        inputs={'Input': [input],
-                'Filter': [img_filter]},
-        outputs={'Output': pre_bias},
-        attrs={
-            'output_size': output_size,
-            'strides': stride,
-            'paddings': padding,
-            'padding_algorithm': padding_algorithm,
-            'dilations': dilation,
-            'groups': groups,
-            'use_cudnn': use_cudnn,
-            'data_format': data_format
-        })
+    helper.append_op(type=l_type,
+                     inputs={
+                         'Input': [input],
+                         'Filter': [img_filter]
+                     },
+                     outputs={'Output': pre_bias},
+                     attrs={
+                         'output_size': output_size,
+                         'strides': stride,
+                         'paddings': padding,
+                         'padding_algorithm': padding_algorithm,
+                         'dilations': dilation,
+                         'groups': groups,
+                         'use_cudnn': use_cudnn,
+                         'data_format': data_format
+                     })
 
     if data_format == 'NCHW':
         pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -4627,9 +4652,12 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
         return _C_ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim,
                                  'reduce_all', reduce_all)
     attrs = {
-        'dim': dim if dim != None and dim != [] else [0],
-        'keep_dim': keep_dim,
-        'reduce_all': True
+        'dim':
+        dim if dim != None and dim != [] else [0],
+        'keep_dim':
+        keep_dim,
+        'reduce_all':
+        True
         if dim == None or dim == [] or len(dim) == len(input.shape) else False
     }
     check_variable_and_dtype(
@@ -4637,11 +4665,10 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
         'reduce_sum')
     helper = LayerHelper('reduce_sum', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='reduce_sum',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs=attrs)
+    helper.append_op(type='reduce_sum',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -4754,16 +4781,18 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
-    helper.append_op(
-        type='reduce_max',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None and dim != [] else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None or dim == [] or
-            len(dim) == len(input.shape) else False
-        })
+    helper.append_op(type='reduce_max',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'dim':
+                         dim if dim != None and dim != [] else [0],
+                         'keep_dim':
+                         keep_dim,
+                         'reduce_all':
+                         True if dim == None or dim == []
+                         or len(dim) == len(input.shape) else False
+                     })
     return out
 
 
@@ -4820,16 +4849,18 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
-    helper.append_op(
-        type='reduce_min',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None and dim != [] else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None or dim == [] or
-            len(dim) == len(input.shape) else False
-        })
+    helper.append_op(type='reduce_min',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'dim':
+                         dim if dim != None and dim != [] else [0],
+                         'keep_dim':
+                         keep_dim,
+                         'reduce_all':
+                         True if dim == None or dim == []
+                         or len(dim) == len(input.shape) else False
+                     })
     return out
 
 
@@ -4898,19 +4929,22 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             dim == None or dim == [] or len(dim) == len(input.shape) else False)
 
     helper = LayerHelper('reduce_prod', **locals())
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod')
+    check_variable_and_dtype(input, 'input',
+                             ['float32', 'float64', 'int32', 'int64'],
+                             'reduce_prod')
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='reduce_prod',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None and dim != [] else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None or dim == [] or
-            len(dim) == len(input.shape) else False
-        })
+    helper.append_op(type='reduce_prod',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'dim':
+                         dim if dim != None and dim != [] else [0],
+                         'keep_dim':
+                         keep_dim,
+                         'reduce_all':
+                         True if dim == None or dim == []
+                         or len(dim) == len(input.shape) else False
+                     })
     return out
 
 
@@ -4963,16 +4997,18 @@ def reduce_all(input, dim=None, keep_dim=False, name=None):
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
-    helper.append_op(
-        type='reduce_all',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None and dim != [] else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None or dim == [] or
-            len(dim) == len(input.shape) else False
-        })
+    helper.append_op(type='reduce_all',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'dim':
+                         dim if dim != None and dim != [] else [0],
+                         'keep_dim':
+                         keep_dim,
+                         'reduce_all':
+                         True if dim == None or dim == []
+                         or len(dim) == len(input.shape) else False
+                     })
     return out
 
 
@@ -5024,16 +5060,18 @@ def reduce_any(input, dim=None, keep_dim=False, name=None):
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
-    helper.append_op(
-        type='reduce_any',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'dim': dim if dim != None and dim != [] else [0],
-            'keep_dim': keep_dim,
-            'reduce_all': True if dim == None or dim == [] or
-            len(dim) == len(input.shape) else False
-        })
+    helper.append_op(type='reduce_any',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'dim':
+                         dim if dim != None and dim != [] else [0],
+                         'keep_dim':
+                         keep_dim,
+                         'reduce_all':
+                         True if dim == None or dim == []
+                         or len(dim) == len(input.shape) else False
+                     })
     return out
 
 
@@ -5108,8 +5146,8 @@ def split(input, num_or_sections, dim=-1, name=None):
             if utils._contain_var(num_or_sections):
                 for index, item in enumerate(num_or_sections):
                     if isinstance(item, Variable):
-                        num_or_sections[index] = num_or_sections[index].numpy()[
-                            0]
+                        num_or_sections[index] = num_or_sections[index].numpy(
+                        )[0]
                 attrs += ('sections', list(num_or_sections))
             else:
                 attrs += ('sections', list(num_or_sections))
@@ -5154,8 +5192,11 @@ def _get_SectionsTensorList(one_list):
                         idx)
                     unk_dim_idx = idx
                 temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant(
-                    [1], 'int32', dim_size, force_cpu=True, out=temp_out)
+                fill_constant([1],
+                              'int32',
+                              dim_size,
+                              force_cpu=True,
+                              out=temp_out)
                 tensor_list.append(temp_out)
         return tensor_list
 
@@ -5181,8 +5222,8 @@ def _get_SectionsTensorList(one_list):
                 dim], 'len(num_or_sections) must not be more than input.shape[dim].'
         num = len(num_or_sections)
         attrs['sections'] = list(
-            map(lambda ele: -1 if isinstance(ele, Variable) else ele,
-                num_or_sections))
+            map(lambda ele: -1
+                if isinstance(ele, Variable) else ele, num_or_sections))
         if utils._contain_var(num_or_sections):
             inputs['SectionsTensorList'] = _get_SectionsTensorList(
                 num_or_sections)
@@ -5191,8 +5232,10 @@ def _get_SectionsTensorList(one_list):
         helper.create_variable_for_type_inference(dtype=helper.input_dtype())
         for i in range(num)
     ]
-    helper.append_op(
-        type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs)
+    helper.append_op(type='split',
+                     inputs=inputs,
+                     outputs={'Out': outs},
+                     attrs=attrs)
     return outs
 
 
@@ -5240,8 +5283,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     if len(x.shape) == 1:
         axis = 0
     if _non_static_mode():
-        _, out = _C_ops.norm(x, 'axis', 1
-                             if axis is None else axis, 'epsilon', epsilon)
+        _, out = _C_ops.norm(x, 'axis', 1 if axis is None else axis, 'epsilon',
+                             epsilon)
         return out
 
     check_variable_and_dtype(x, "X", ("float16", "float32", "float64"), "norm")
@@ -5249,15 +5292,16 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     helper = LayerHelper("l2_normalize", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     norm = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="norm",
-        inputs={"X": x},
-        outputs={"Out": out,
-                 "Norm": norm},
-        attrs={
-            "axis": 1 if axis is None else axis,
-            "epsilon": epsilon,
-        })
+    helper.append_op(type="norm",
+                     inputs={"X": x},
+                     outputs={
+                         "Out": out,
+                         "Norm": norm
+                     },
+                     attrs={
+                         "axis": 1 if axis is None else axis,
+                         "epsilon": epsilon,
+                     })
     return out
 
 
@@ -5345,8 +5389,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     def __check_input(x, y):
         var_names = {'x': x, 'y': y}
         for name, val in var_names.items():
-            check_variable_and_dtype(
-                val, name, ['float16', 'float32', 'float64'], 'matmul')
+            check_variable_and_dtype(val, name,
+                                     ['float16', 'float32', 'float64'],
+                                     'matmul')
         x_shape = list(x.shape)
         y_shape = list(y.shape)
         if len(x_shape) == 1:
@@ -5388,12 +5433,13 @@ def __check_input(x, y):
 
     helper = LayerHelper('matmul', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='matmul',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs=attrs)
+    helper.append_op(type='matmul',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -5486,12 +5532,13 @@ def topk(input, k, name=None):
     values = helper.create_variable_for_type_inference(dtype=input.dtype)
     indices = helper.create_variable_for_type_inference(dtype="int64")
 
-    helper.append_op(
-        type="top_k",
-        inputs=inputs,
-        outputs={"Out": [values],
-                 "Indices": [indices]},
-        attrs=attrs)
+    helper.append_op(type="top_k",
+                     inputs=inputs,
+                     outputs={
+                         "Out": [values],
+                         "Indices": [indices]
+                     },
+                     attrs=attrs)
     values.stop_gradient = True
     indices.stop_gradient = True
     return values, indices
@@ -5637,28 +5684,32 @@ def ctc_greedy_decoder(input,
     ctc_out = helper.create_variable_for_type_inference(dtype="int64")
 
     if input_length is None:
-        helper.append_op(
-            type="ctc_align",
-            inputs={"Input": [topk_indices]},
-            outputs={"Output": [ctc_out]},
-            attrs={"merge_repeated": True,
-                   "blank": blank})
+        helper.append_op(type="ctc_align",
+                         inputs={"Input": [topk_indices]},
+                         outputs={"Output": [ctc_out]},
+                         attrs={
+                             "merge_repeated": True,
+                             "blank": blank
+                         })
         return ctc_out
     else:
         ctc_out_len = helper.create_variable_for_type_inference(dtype="int64")
         ctc_input = squeeze(topk_indices, [2])
 
-        helper.append_op(
-            type="ctc_align",
-            inputs={"Input": [ctc_input],
-                    "InputLength": [input_length]},
-            outputs={"Output": [ctc_out],
-                     "OutputLength": [ctc_out_len]},
-            attrs={
-                "merge_repeated": True,
-                "blank": blank,
-                "padding_value": padding_value
-            })
+        helper.append_op(type="ctc_align",
+                         inputs={
+                             "Input": [ctc_input],
+                             "InputLength": [input_length]
+                         },
+                         outputs={
+                             "Output": [ctc_out],
+                             "OutputLength": [ctc_out_len]
+                         },
+                         attrs={
+                             "merge_repeated": True,
+                             "blank": blank,
+                             "padding_value": padding_value
+                         })
         return ctc_out, ctc_out_len
 
 
@@ -5742,12 +5793,13 @@ def transpose(x, perm, name=None):
     helper = LayerHelper('transpose', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='transpose2',
-        inputs={'X': [x]},
-        outputs={'Out': [out],
-                 'XShape': [x_shape]},
-        attrs={'axis': perm})
+    helper.append_op(type='transpose2',
+                     inputs={'X': [x]},
+                     outputs={
+                         'Out': [out],
+                         'XShape': [x_shape]
+                     },
+                     attrs={'axis': perm})
     return out
 
 
@@ -5894,8 +5946,10 @@ def im2sequence(input,
         attrs["out_stride"] = out_stride
     helper = LayerHelper('im2sequence', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='im2sequence',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -5935,14 +5989,16 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     check_variable_and_dtype(input, 'input', ['float32'], 'row_conv')
     dtype = helper.input_dtype()
     filter_shape = [future_context_size + 1, input.shape[-1]]
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    filter_param = helper.create_parameter(attr=helper.param_attr,
+                                           shape=filter_shape,
+                                           dtype=dtype)
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='row_conv',
-        inputs={'X': [input],
-                'Filter': [filter_param]},
-        outputs={'Out': [out]})
+    helper.append_op(type='row_conv',
+                     inputs={
+                         'X': [input],
+                         'Filter': [filter_param]
+                     },
+                     outputs={'Out': [out]})
     return helper.append_activation(out)
 
 
@@ -6016,11 +6072,12 @@ def multiplex(inputs, index, name=None):
     check_variable_and_dtype(index, "index", ['int32', 'int64'], 'multiplex')
 
     out = helper.create_variable_for_type_inference(inputs[0].dtype)
-    helper.append_op(
-        type='multiplex',
-        inputs={'X': inputs,
-                'Ids': index},
-        outputs={'Out': [out]})
+    helper.append_op(type='multiplex',
+                     inputs={
+                         'X': inputs,
+                         'Ids': index
+                     },
+                     outputs={'Out': [out]})
     return out
 
 
@@ -6087,17 +6144,18 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
 
     diff = helper.create_variable_for_type_inference(dtype=x.dtype)
     loss = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='smooth_l1_loss',
-        inputs={
-            'X': x,
-            'Y': y,
-            'InsideWeight': inside_weight,
-            'OutsideWeight': outside_weight
-        },
-        outputs={'Diff': diff,
-                 'Out': loss},
-        attrs={'sigma': sigma if sigma is not None else 1.0})
+    helper.append_op(type='smooth_l1_loss',
+                     inputs={
+                         'X': x,
+                         'Y': y,
+                         'InsideWeight': inside_weight,
+                         'OutsideWeight': outside_weight
+                     },
+                     outputs={
+                         'Diff': diff,
+                         'Out': loss
+                     },
+                     attrs={'sigma': sigma if sigma is not None else 1.0})
     return loss
 
 
@@ -6209,11 +6267,10 @@ def one_hot(input, depth, allow_out_of_range=False):
         depth.stop_gradient = True
         inputs = {'X': input, 'depth_tensor': depth}
         attrs = {'allow_out_of_range': allow_out_of_range}
-    helper.append_op(
-        type="one_hot",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={'Out': one_hot_out})
+    helper.append_op(type="one_hot",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={'Out': one_hot_out})
     one_hot_out.stop_gradient = True
     return one_hot_out
 
@@ -6253,9 +6310,9 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         persistable=True,
         belong_to_optimizer=True)
     if is_new_var:
-        helper.set_variable_initializer(
-            counter, initializer=Constant(
-                value=begin - 1, force_cpu=True))
+        helper.set_variable_initializer(counter,
+                                        initializer=Constant(value=begin - 1,
+                                                             force_cpu=True))
         helper.main_program.global_block()._prepend_op(
             type='increment',
             inputs={'X': [counter]},
@@ -6469,12 +6526,13 @@ def get_attr_shape(list_shape):
     out = x if inplace else helper.create_variable_for_type_inference(
         dtype=x.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="reshape2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    helper.append_op(type="reshape2",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={
+                         "Out": out,
+                         "XShape": x_shape
+                     })
 
     return helper.append_activation(out)
 
@@ -6546,12 +6604,13 @@ def squeeze(input, axes, name=None):
     check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="squeeze2",
-        inputs={"X": input},
-        attrs={"axes": axes},
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    helper.append_op(type="squeeze2",
+                     inputs={"X": input},
+                     attrs={"axes": axes},
+                     outputs={
+                         "Out": out,
+                         "XShape": x_shape
+                     })
 
     return out
 
@@ -6630,12 +6689,13 @@ def unsqueeze(input, axes, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="unsqueeze2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    helper.append_op(type="unsqueeze2",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={
+                         "Out": out,
+                         "XShape": x_shape
+                     })
 
     return out
 
@@ -6728,15 +6788,17 @@ def lod_reset(x, y=None, target_lod=None):
     if y is not None:
         check_type(y, 'y', (Variable), 'lod_reset')
         #TODO: check y.lod_level = 0 dtype
-        helper.append_op(
-            type="lod_reset", inputs={'X': x,
-                                      'Y': y}, outputs={'Out': out})
+        helper.append_op(type="lod_reset",
+                         inputs={
+                             'X': x,
+                             'Y': y
+                         },
+                         outputs={'Out': out})
     elif target_lod is not None:
-        helper.append_op(
-            type="lod_reset",
-            inputs={'X': x},
-            attrs={'target_lod': target_lod},
-            outputs={'Out': out})
+        helper.append_op(type="lod_reset",
+                         inputs={'X': x},
+                         attrs={'target_lod': target_lod},
+                         outputs={'Out': out})
     else:
         raise ValueError("y and target_lod should not be both none.")
     return out
@@ -6804,12 +6866,19 @@ def lod_append(x, level):
         #TODO: check y.lod_level = 0 dtype
     else:
         attrs['target_lod'] = level
-    helper.append_op(
-        type="lod_reset", inputs=inputs, attrs=attrs, outputs={'Out': out})
+    helper.append_op(type="lod_reset",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={'Out': out})
     return out
 
 
-def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None,
+def lrn(input,
+        n=5,
+        k=1.0,
+        alpha=1e-4,
+        beta=0.75,
+        name=None,
         data_format='NCHW'):
     r"""
     :alias_main: paddle.nn.functional.lrn
@@ -6879,23 +6948,22 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None,
             "Attr(data_format) of Op(lrn) got wrong value: received " +
             data_format + " but only NCHW or NHWC supported.")
 
-    mid_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
+    mid_out = helper.create_variable_for_type_inference(dtype=dtype,
+                                                        stop_gradient=True)
     lrn_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="lrn",
-        inputs={"X": input},
-        outputs={
-            "Out": lrn_out,
-            "MidOut": mid_out,
-        },
-        attrs={
-            "n": n,
-            "k": k,
-            "alpha": alpha,
-            "beta": beta,
-            "data_format": data_format
-        })
+    helper.append_op(type="lrn",
+                     inputs={"X": input},
+                     outputs={
+                         "Out": lrn_out,
+                         "MidOut": mid_out,
+                     },
+                     attrs={
+                         "n": n,
+                         "k": k,
+                         "alpha": alpha,
+                         "beta": beta,
+                         "data_format": data_format
+                     })
 
     return lrn_out
 
@@ -6963,12 +7031,13 @@ def pad(x, paddings, pad_value=0., name=None):
     helper = LayerHelper('pad', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='pad',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'paddings': paddings,
-               'pad_value': float(pad_value)})
+    helper.append_op(type='pad',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'paddings': paddings,
+                         'pad_value': float(pad_value)
+                     })
     return out
 
 
@@ -7058,12 +7127,13 @@ def pad_constant_like(x, y, pad_value=0., name=None):
     helper = LayerHelper('pad_constant_like', **locals())
     dtype = helper.input_dtype(input_param_name='y')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='pad_constant_like',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs={'pad_value': float(pad_value)})
+    helper.append_op(type='pad_constant_like',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': out},
+                     attrs={'pad_value': float(pad_value)})
     return out
 
 
@@ -7145,12 +7215,13 @@ def label_smooth(label,
     helper = LayerHelper("label_smooth", **locals())
     label.stop_gradient = True
     smooth_label = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="label_smooth",
-        inputs={"X": label,
-                "PriorDist": prior_dist} if prior_dist else {"X": label},
-        outputs={"Out": smooth_label},
-        attrs={"epsilon": float(epsilon)})
+    helper.append_op(type="label_smooth",
+                     inputs={
+                         "X": label,
+                         "PriorDist": prior_dist
+                     } if prior_dist else {"X": label},
+                     outputs={"Out": smooth_label},
+                     attrs={"epsilon": float(epsilon)})
     return smooth_label
 
 
@@ -7228,9 +7299,10 @@ def roi_pool(input,
     """
     if _non_static_mode():
         assert rois_num is not None, "rois_num should not be None in dygraph mode."
-        pool_out, argmaxes = _C_ops.roi_pool(
-            input, rois, rois_num, "pooled_height", pooled_height,
-            "pooled_width", pooled_width, "spatial_scale", spatial_scale)
+        pool_out, argmaxes = _C_ops.roi_pool(input, rois, rois_num,
+                                             "pooled_height", pooled_height,
+                                             "pooled_width", pooled_width,
+                                             "spatial_scale", spatial_scale)
         return pool_out, argmaxes
 
     check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool')
@@ -7246,16 +7318,17 @@ def roi_pool(input,
     }
     if rois_num is not None:
         inputs['RoisNum'] = rois_num
-    helper.append_op(
-        type="roi_pool",
-        inputs=inputs,
-        outputs={"Out": pool_out,
-                 "Argmax": argmaxes},
-        attrs={
-            "pooled_height": pooled_height,
-            "pooled_width": pooled_width,
-            "spatial_scale": spatial_scale
-        })
+    helper.append_op(type="roi_pool",
+                     inputs=inputs,
+                     outputs={
+                         "Out": pool_out,
+                         "Argmax": argmaxes
+                     },
+                     attrs={
+                         "pooled_height": pooled_height,
+                         "pooled_width": pooled_width,
+                         "spatial_scale": spatial_scale
+                     })
     return pool_out
 
 
@@ -7316,15 +7389,17 @@ def roi_align(input,
     """
     if in_dygraph_mode():
         assert rois_num is not None, "rois_num should not be None in dygraph mode."
-        return _C_ops.final_state_roi_align(
-            input, rois, rois_num, pooled_height, pooled_width, spatial_scale,
-            sampling_ratio, False)
+        return _C_ops.final_state_roi_align(input, rois, rois_num,
+                                            pooled_height, pooled_width,
+                                            spatial_scale, sampling_ratio,
+                                            False)
     if _in_legacy_dygraph():
         assert rois_num is not None, "rois_num should not be None in dygraph mode."
-        align_out = _C_ops.roi_align(
-            input, rois, rois_num, "pooled_height", pooled_height,
-            "pooled_width", pooled_width, "spatial_scale", spatial_scale,
-            "sampling_ratio", sampling_ratio)
+        align_out = _C_ops.roi_align(input, rois, rois_num, "pooled_height",
+                                     pooled_height, "pooled_width",
+                                     pooled_width, "spatial_scale",
+                                     spatial_scale, "sampling_ratio",
+                                     sampling_ratio)
         return align_out
 
     check_variable_and_dtype(input, 'input', ['float32', 'float64'],
@@ -7339,16 +7414,15 @@ def roi_align(input,
     }
     if rois_num is not None:
         inputs['RoisNum'] = rois_num
-    helper.append_op(
-        type="roi_align",
-        inputs=inputs,
-        outputs={"Out": align_out},
-        attrs={
-            "pooled_height": pooled_height,
-            "pooled_width": pooled_width,
-            "spatial_scale": spatial_scale,
-            "sampling_ratio": sampling_ratio
-        })
+    helper.append_op(type="roi_align",
+                     inputs=inputs,
+                     outputs={"Out": align_out},
+                     attrs={
+                         "pooled_height": pooled_height,
+                         "pooled_width": pooled_width,
+                         "spatial_scale": spatial_scale,
+                         "sampling_ratio": sampling_ratio
+                     })
     return align_out
 
 
@@ -7394,8 +7468,10 @@ def dice_loss(input, label, epsilon=0.00001, name=None):
             predictions = F.softmax(x)
             loss = F.dice_loss(input=predictions, label=label)
     """
-    return paddle.nn.functional.dice_loss(
-        input, label, epsilon=epsilon, name=name)
+    return paddle.nn.functional.dice_loss(input,
+                                          label,
+                                          epsilon=epsilon,
+                                          name=name)
 
 
 def image_resize(input,
@@ -7808,8 +7884,11 @@ def _is_list_or_turple_(data):
                         assert (isinstance(dim, int))
                         temp_out = helper.create_variable_for_type_inference(
                             'int32')
-                        fill_constant(
-                            [1], 'int32', dim, force_cpu=True, out=temp_out)
+                        fill_constant([1],
+                                      'int32',
+                                      dim,
+                                      force_cpu=True,
+                                      out=temp_out)
                         new_size_tensor.append(temp_out)
                         size_list.append(dim)
                 inputs['SizeTensor'] = new_size_tensor
@@ -7892,11 +7971,10 @@ def _is_list_or_turple_(data):
         return out
 
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='{}_interp'.format(resample_type),
-        inputs=inputs,
-        outputs={"Out": out},
-        attrs=attrs)
+    helper.append_op(type='{}_interp'.format(resample_type),
+                     inputs=inputs,
+                     outputs={"Out": out},
+                     attrs=attrs)
     return out
 
 
@@ -8514,16 +8592,15 @@ def resize_nearest(input,
 
     """
 
-    return image_resize(
-        input,
-        out_shape,
-        scale,
-        name,
-        'NEAREST',
-        actual_shape,
-        align_corners,
-        align_mode=1,
-        data_format=data_format)
+    return image_resize(input,
+                        out_shape,
+                        scale,
+                        name,
+                        'NEAREST',
+                        actual_shape,
+                        align_corners,
+                        align_mode=1,
+                        data_format=data_format)
 
 
 def image_resize_short(input, out_short_len, resample='BILINEAR'):
@@ -8558,8 +8635,8 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
     out_shape = list(hw)
     out_shape[short_idx] = out_short_len
     out_shape[long_idx] = int(
-        float(out_shape[long_idx]) * (float(out_short_len) / float(hw[
-            short_idx])) + 0.5)
+        float(out_shape[long_idx]) *
+        (float(out_short_len) / float(hw[short_idx])) + 0.5)
     return image_resize(input=input, out_shape=out_shape, resample=resample)
 
 
@@ -8626,12 +8703,13 @@ def gather(input, index, overwrite=True):
     helper = LayerHelper('gather', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="gather",
-        inputs={"X": input,
-                "Index": index},
-        outputs={"Out": out},
-        attrs={'overwrite': overwrite})
+    helper.append_op(type="gather",
+                     inputs={
+                         "X": input,
+                         "Index": index
+                     },
+                     outputs={"Out": out},
+                     attrs={'overwrite': overwrite})
     return out
 
 
@@ -8722,11 +8800,12 @@ def gather_nd(input, index, name=None):
     helper = LayerHelper('gather_nd', **locals())
     dtype = helper.input_dtype()
     output = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="gather_nd",
-        inputs={"X": input,
-                "Index": index},
-        outputs={"Out": output})
+    helper.append_op(type="gather_nd",
+                     inputs={
+                         "X": input,
+                         "Index": index
+                     },
+                     outputs={"Out": output})
     return output
 
 
@@ -8811,13 +8890,14 @@ def scatter(input, index, updates, name=None, overwrite=True):
     helper = LayerHelper('scatter', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="scatter",
-        inputs={"X": input,
-                "Ids": index,
-                "Updates": updates},
-        attrs={'overwrite': overwrite},
-        outputs={"Out": out})
+    helper.append_op(type="scatter",
+                     inputs={
+                         "X": input,
+                         "Ids": index,
+                         "Updates": updates
+                     },
+                     attrs={'overwrite': overwrite},
+                     outputs={"Out": out})
     return out
 
 
@@ -8902,12 +8982,13 @@ def scatter_nd_add(ref, index, updates, name=None):
             helper = LayerHelper('scatter_nd_add', **locals())
             dtype = helper.input_dtype(input_param_name='ref')
             output = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type="scatter_nd_add",
-                inputs={"X": ref,
-                        "Index": index,
-                        "Updates": updates},
-                outputs={"Out": output})
+            helper.append_op(type="scatter_nd_add",
+                             inputs={
+                                 "X": ref,
+                                 "Index": index,
+                                 "Updates": updates
+                             },
+                             outputs={"Out": output})
             return output
 
 
@@ -9001,13 +9082,16 @@ def random_crop(x, shape, seed=None):
             persistable=True)
     elif not isinstance(seed, Variable):
         raise ValueError("'seed' must be a Variable or an int.")
-    helper.append_op(
-        type="random_crop",
-        inputs={"X": x,
-                "Seed": seed},
-        outputs={"Out": out,
-                 "SeedOut": seed},
-        attrs=op_attrs)
+    helper.append_op(type="random_crop",
+                     inputs={
+                         "X": x,
+                         "Seed": seed
+                     },
+                     outputs={
+                         "Out": out,
+                         "SeedOut": seed
+                     },
+                     attrs=op_attrs)
     return out
 
 
@@ -9092,8 +9176,9 @@ def relu(x, name=None):
     helper = LayerHelper('relu', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
+    helper.append_op(type="relu",
+                     inputs={"X": helper.input('x')},
+                     outputs={"Out": out})
     return out
 
 
@@ -9163,8 +9248,10 @@ def selu(x, scale=None, alpha=None, name=None):
     if alpha is not None:
         attrs["alpha"] = alpha
 
-    helper.append_op(
-        type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs)
+    helper.append_op(type="selu",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs=attrs)
     return out
 
 
@@ -9221,16 +9308,17 @@ def mean_iou(input, label, num_classes):
     out_mean_iou = helper.create_variable_for_type_inference(dtype='float32')
     out_wrong = helper.create_variable_for_type_inference(dtype='int32')
     out_correct = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type="mean_iou",
-        inputs={"Predictions": input,
-                "Labels": label},
-        outputs={
-            "OutMeanIou": out_mean_iou,
-            "OutWrong": out_wrong,
-            "OutCorrect": out_correct
-        },
-        attrs={"num_classes": num_classes})
+    helper.append_op(type="mean_iou",
+                     inputs={
+                         "Predictions": input,
+                         "Labels": label
+                     },
+                     outputs={
+                         "OutMeanIou": out_mean_iou,
+                         "OutWrong": out_wrong,
+                         "OutCorrect": out_correct
+                     },
+                     attrs={"num_classes": num_classes})
     return out_mean_iou, out_wrong, out_correct
 
 
@@ -9333,11 +9421,10 @@ def crop(x, shape=None, offsets=None, name=None):
     else:
         attrs['offsets'] = offsets
 
-    helper.append_op(
-        type='crop',
-        inputs=ipts,
-        outputs={'Out': out},
-        attrs=None if len(attrs) == 0 else attrs)
+    helper.append_op(type='crop',
+                     inputs=ipts,
+                     outputs={'Out': out},
+                     attrs=None if len(attrs) == 0 else attrs)
     return out
 
 
@@ -9506,8 +9593,11 @@ def _attr_offsets_check(offset_val):
             else:
                 _attr_shape_check(dim_size)
                 temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant(
-                    [1], 'int32', dim_size, force_cpu=True, out=temp_out)
+                fill_constant([1],
+                              'int32',
+                              dim_size,
+                              force_cpu=True,
+                              out=temp_out)
                 new_shape_tensor.append(temp_out)
                 shape_attr.append(dim_size)
         ipts['ShapeTensor'] = new_shape_tensor
@@ -9517,11 +9607,10 @@ def _attr_offsets_check(offset_val):
             _attr_shape_check(dim_size)
         attrs['shape'] = shape
 
-    helper.append_op(
-        type='crop_tensor',
-        inputs=ipts,
-        outputs={'Out': out},
-        attrs=None if len(attrs) == 0 else attrs)
+    helper.append_op(type='crop_tensor',
+                     inputs=ipts,
+                     outputs={'Out': out},
+                     attrs=None if len(attrs) == 0 else attrs)
     return out
 
 
@@ -9595,11 +9684,10 @@ def affine_grid(theta, out_shape, name=None):
         # ROCM platform do not have MIOPEN kernel for affine_grid
         attrs['use_cudnn'] = False
 
-    helper.append_op(
-        type='affine_grid',
-        inputs=ipts,
-        outputs={'Output': out},
-        attrs=None if len(attrs) == 0 else attrs)
+    helper.append_op(type='affine_grid',
+                     inputs=ipts,
+                     outputs={'Output': out},
+                     attrs=None if len(attrs) == 0 else attrs)
     return out
 
 
@@ -9720,8 +9808,10 @@ def pad2d(input,
     dtype = helper.input_dtype(input_param_name='input')
     out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
+    helper.append_op(type='pad2d',
+                     inputs=inputs,
+                     outputs={"Out": out},
+                     attrs=attrs)
 
     return out
 
@@ -9760,11 +9850,10 @@ def elu(x, alpha=1.0, name=None):
     helper = LayerHelper('elu', **locals())
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='elu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'alpha': alpha})
+    helper.append_op(type='elu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'alpha': alpha})
     return out
 
 
@@ -9802,14 +9891,13 @@ def relu6(x, threshold=6.0, name=None):
 
     helper = LayerHelper('relu6', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='relu6',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={
-            'threshold': threshold,
-            'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"]
-        })
+    helper.append_op(type='relu6',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'threshold': threshold,
+                         'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"]
+                     })
     return out
 
 
@@ -9859,8 +9947,10 @@ def pow(x, factor=1.0, name=None):
         attrs['factor'] = factor
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='pow',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -9900,12 +9990,13 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
 
     helper = LayerHelper('stanh', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='stanh',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'scale_a': scale_a,
-               'scale_b': scale_b})
+    helper.append_op(type='stanh',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'scale_a': scale_a,
+                         'scale_b': scale_b
+                     })
     return out
 
 
@@ -9943,12 +10034,13 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
 
     helper = LayerHelper('hard_sigmoid', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='hard_sigmoid',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'slope': slope,
-               'offset': offset})
+    helper.append_op(type='hard_sigmoid',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'slope': slope,
+                         'offset': offset
+                     })
     return out
 
 
@@ -10029,11 +10121,10 @@ def swish(x, beta=1.0, name=None):
 
     helper = LayerHelper('swish', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='swish',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'slope': beta})
+    helper.append_op(type='swish',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'slope': beta})
     return out
 
 
@@ -10121,20 +10212,22 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None):
         ) >= 1, "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'"
         alpha_shape = [1] + list(x.shape)[1:]
     dtype = helper.input_dtype(input_param_name='x')
-    alpha = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=alpha_shape,
-        dtype=dtype,
-        is_bias=False,
-        default_initializer=Constant(0.25))
+    alpha = helper.create_parameter(attr=helper.param_attr,
+                                    shape=alpha_shape,
+                                    dtype=dtype,
+                                    is_bias=False,
+                                    default_initializer=Constant(0.25))
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="prelu",
-        inputs={"X": x,
-                'Alpha': alpha},
-        attrs={"mode": mode,
-               "data_format": data_format},
-        outputs={"Out": out})
+    helper.append_op(type="prelu",
+                     inputs={
+                         "X": x,
+                         'Alpha': alpha
+                     },
+                     attrs={
+                         "mode": mode,
+                         "data_format": data_format
+                     },
+                     outputs={"Out": out})
     return out
 
 
@@ -10175,12 +10268,13 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
 
     helper = LayerHelper('brelu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='brelu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'t_min': t_min,
-               't_max': t_max})
+    helper.append_op(type='brelu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         't_min': t_min,
+                         't_max': t_max
+                     })
     return out
 
 
@@ -10252,11 +10346,10 @@ def soft_relu(x, threshold=40.0, name=None):
 
     helper = LayerHelper('soft_relu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='soft_relu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold})
+    helper.append_op(type='soft_relu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'threshold': threshold})
     return out
 
 
@@ -10340,12 +10433,13 @@ def flatten(x, axis=1, name=None):
 
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='flatten2',
-        inputs={"X": x},
-        outputs={'Out': out,
-                 'XShape': x_shape},
-        attrs={"axis": axis})
+    helper.append_op(type='flatten2',
+                     inputs={"X": x},
+                     outputs={
+                         'Out': out,
+                         'XShape': x_shape
+                     },
+                     attrs={"axis": axis})
     return out
 
 
@@ -10443,10 +10537,10 @@ def stack(x, axis=0, name=None):
         ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             x = [x]
         else:
-            raise TypeError("The type of '%s' in %s must be %s, but received %s"
-                            % ('x', 'stack',
-                               'list[Tensor], tuple[Tensor] or TensorArray',
-                               type(x)))
+            raise TypeError(
+                "The type of '%s' in %s must be %s, but received %s" %
+                ('x', 'stack', 'list[Tensor], tuple[Tensor] or TensorArray',
+                 type(x)))
 
     helper = LayerHelper('stack', **locals())
 
@@ -10460,19 +10554,21 @@ def stack(x, axis=0, name=None):
             check_variable_and_dtype(i, 'x', \
                 ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack')
 
-        helper.append_op(
-            type='tensor_array_to_tensor',
-            inputs={'X': x[0]},
-            outputs={'Out': [out],
-                     'OutIndex': [out_index]},
-            attrs={'axis': axis,
-                   'use_stack': True})
+        helper.append_op(type='tensor_array_to_tensor',
+                         inputs={'X': x[0]},
+                         outputs={
+                             'Out': [out],
+                             'OutIndex': [out_index]
+                         },
+                         attrs={
+                             'axis': axis,
+                             'use_stack': True
+                         })
     else:
-        helper.append_op(
-            type='stack',
-            inputs={'X': x},
-            outputs={'Y': out},
-            attrs={'axis': axis})
+        helper.append_op(type='stack',
+                         inputs={'X': x},
+                         outputs={'Y': out},
+                         attrs={'axis': axis})
 
     return out
 
@@ -10536,16 +10632,21 @@ def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0):
     out = helper.create_variable_for_type_inference(dtype=ins.dtype)
     loss_weight = helper.create_variable_for_type_inference(dtype=np.float64)
     mmap = helper.create_variable_for_type_inference(dtype=ins_tag.dtype)
-    helper.append_op(
-        type='filter_by_instag',
-        inputs={'Ins': ins,
-                'Ins_tag': ins_tag,
-                'Filter_tag': filter_tag},
-        outputs={'Out': out,
-                 'LossWeight': loss_weight,
-                 'IndexMap': mmap},
-        attrs={'is_lod': is_lod,
-               'out_val_if_empty': out_val_if_empty})
+    helper.append_op(type='filter_by_instag',
+                     inputs={
+                         'Ins': ins,
+                         'Ins_tag': ins_tag,
+                         'Filter_tag': filter_tag
+                     },
+                     outputs={
+                         'Out': out,
+                         'LossWeight': loss_weight,
+                         'IndexMap': mmap
+                     },
+                     attrs={
+                         'is_lod': is_lod,
+                         'out_val_if_empty': out_val_if_empty
+                     })
 
     return [out, loss_weight]
 
@@ -10602,12 +10703,13 @@ def unstack(x, axis=0, num=None):
     for _ in range(num):
         outs.append(helper.create_variable_for_type_inference(x.dtype))
 
-    helper.append_op(
-        type='unstack',
-        inputs={'X': [x]},
-        outputs={'Y': outs},
-        attrs={'axis': axis,
-               'num': num})
+    helper.append_op(type='unstack',
+                     inputs={'X': [x]},
+                     outputs={'Y': outs},
+                     attrs={
+                         'axis': axis,
+                         'num': num
+                     })
     return outs
 
 
@@ -10721,8 +10823,10 @@ def get_attr_expand_times(list_expand_times):
 
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='expand', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='expand',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -10797,8 +10901,9 @@ def expand_as(x, target_tensor, name=None):
     if _non_static_mode():
         return _C_ops.expand_as(x, target_tensor)
 
-    check_variable_and_dtype(
-        x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as')
+    check_variable_and_dtype(x, 'x',
+                             ['float32', 'float64', 'int32', 'int64', 'bool'],
+                             'expand_as')
     check_variable_and_dtype(target_tensor, 'target_tensor',
                              ['float32', 'float64', 'int32', 'int64', 'bool'],
                              'expand_as')
@@ -10894,19 +10999,18 @@ def uniform_random_batch_size_like(input,
     helper = LayerHelper('uniform_random_batch_size_like', **locals())
     out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='uniform_random_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': out},
-        attrs={
-            'shape': shape,
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx,
-            'min': min,
-            'max': max,
-            'seed': seed,
-            'dtype': c_dtype
-        })
+    helper.append_op(type='uniform_random_batch_size_like',
+                     inputs={'Input': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'shape': shape,
+                         'input_dim_idx': input_dim_idx,
+                         'output_dim_idx': output_dim_idx,
+                         'min': min,
+                         'max': max,
+                         'seed': seed,
+                         'dtype': c_dtype
+                     })
 
     return out
 
@@ -11017,15 +11121,14 @@ def gaussian_random(shape,
     if in_dygraph_mode():
         shape = utils.convert_shape_to_list(shape)
         place = _current_expected_place()
-        return _C_ops.final_state_gaussian_random(shape,
-                                                  float(mean),
+        return _C_ops.final_state_gaussian_random(shape, float(mean),
                                                   float(std), seed, dtype,
                                                   place)
 
     if _in_legacy_dygraph():
         shape = utils.convert_shape_to_list(shape)
-        return _C_ops.gaussian_random('shape', shape, 'mean',
-                                      float(mean), 'std',
+        return _C_ops.gaussian_random('shape',
+                                      shape, 'mean', float(mean), 'std',
                                       float(std), 'seed', seed, 'dtype', dtype)
 
     check_type(shape, 'shape', (list, tuple, Variable), 'gaussian_random/randn')
@@ -11039,19 +11142,17 @@ def gaussian_random(shape,
         'dtype': dtype,
         'use_mkldnn': False
     }
-    utils.get_shape_tensor_inputs(
-        inputs=inputs,
-        attrs=attrs,
-        shape=shape,
-        op_type='gaussian_random/randn')
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='gaussian_random/randn')
 
     helper = LayerHelper('gaussian_random', **locals())
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='gaussian_random',
-        inputs=inputs,
-        outputs={'Out': out},
-        attrs=attrs)
+    helper.append_op(type='gaussian_random',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
 
     return out
 
@@ -11085,13 +11186,14 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
 
     helper = LayerHelper('sampling_id', **locals())
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='sampling_id',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'min': min,
-               'max': max,
-               'seed': seed})
+    helper.append_op(type='sampling_id',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'min': min,
+                         'max': max,
+                         'seed': seed
+                     })
 
     return out
 
@@ -11144,19 +11246,18 @@ def gaussian_random_batch_size_like(input,
                 'fluid.layers.gaussian_random_batch_size_like')
     out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='gaussian_random_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': out},
-        attrs={
-            'shape': shape,
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx,
-            'mean': mean,
-            'std': std,
-            'seed': seed,
-            'dtype': c_dtype
-        })
+    helper.append_op(type='gaussian_random_batch_size_like',
+                     inputs={'Input': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'shape': shape,
+                         'input_dim_idx': input_dim_idx,
+                         'output_dim_idx': output_dim_idx,
+                         'mean': mean,
+                         'std': std,
+                         'seed': seed,
+                         'dtype': c_dtype
+                     })
 
     return out
 
@@ -11371,8 +11472,8 @@ def slice(input, axes, starts, ends):
 
             else:
                 raise ValueError(
-                    "Input axes must be a python list or tuple, but reveived {}".
-                    format(type(axes)))
+                    "Input axes must be a python list or tuple, but reveived {}"
+                    .format(type(axes)))
 
             infer_flags = list(1 for i in range(len(axes)))
 
@@ -11459,8 +11560,10 @@ def slice(input, axes, starts, ends):
     attrs['infer_flags'] = infer_flags
     out = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype('input'))
-    helper.append_op(
-        type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out})
+    helper.append_op(type='slice',
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={'Out': out})
 
     return out
 
@@ -11675,8 +11778,10 @@ def get_new_list_tensor(old_list):
         attrs['infer_flags'] = infer_flags
     out = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype('input'))
-    helper.append_op(
-        type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out})
+    helper.append_op(type='strided_slice',
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={'Out': out})
 
     return out
 
@@ -11749,11 +11854,10 @@ def shape(input):
     ], 'shape')
     helper = LayerHelper('shape', **locals())
     out = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type='shape',
-        inputs={'Input': input},
-        outputs={'Out': out},
-        stop_gradient=True)
+    helper.append_op(type='shape',
+                     inputs={'Input': input},
+                     outputs={'Out': out},
+                     stop_gradient=True)
 
     return out
 
@@ -11849,13 +11953,16 @@ def _elementwise_op(helper):
     name = helper.kwargs.get('name', None)
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs={'axis': axis,
-               'use_mkldnn': use_mkldnn})
+    helper.append_op(type=op_type,
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         'axis': axis,
+                         'use_mkldnn': use_mkldnn
+                     })
     return helper.append_activation(out)
 
 
@@ -11911,9 +12018,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
         return dygraph_utils._append_activation_in_dygraph(out)
     if _non_static_mode():
         _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale
-        out = _C_ops.scale(x, 'scale',
-                           float(_scale), 'bias',
-                           float(bias), 'bias_after_scale', bias_after_scale)
+        out = _C_ops.scale(x, 'scale', float(_scale), 'bias', float(bias),
+                           'bias_after_scale', bias_after_scale)
         return dygraph_utils._append_activation_in_dygraph(out)
 
     check_variable_and_dtype(x, "x", [
@@ -11932,8 +12038,10 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
     helper = LayerHelper('scale', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='scale',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return helper.append_activation(out)
 
 
@@ -12112,8 +12220,11 @@ def gen_data():
 
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_div')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_div')
 
     return _elementwise_op(LayerHelper('elementwise_div', **locals()))
 
@@ -12200,8 +12311,11 @@ def gen_data():
 
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_sub')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_sub')
 
     return _elementwise_op(LayerHelper('elementwise_sub', **locals()))
 
@@ -12289,8 +12403,11 @@ def gen_data():
 
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_mul')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_mul')
 
     return _elementwise_op(LayerHelper('elementwise_mul', **locals()))
 
@@ -12353,8 +12470,11 @@ def gen_data():
 
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_max')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_max')
 
     return _elementwise_op(LayerHelper('elementwise_max', **locals()))
 
@@ -12415,8 +12535,11 @@ def gen_data():
         print(z_value)#[[[[0., 0., 0., 0., 0.] .... [0., 0., 0., 0., 0.]]]]
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_min')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_min')
 
     return _elementwise_op(LayerHelper('elementwise_min', **locals()))
 
@@ -12450,8 +12573,11 @@ def gen_data():
         print(z_value) #[2, 243, 16]
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_pow')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_pow')
     return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
 
 
@@ -12485,8 +12611,11 @@ def gen_data():
         print(z_value) #[1, 3, 3]
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_mod')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_mod')
 
     return _elementwise_op(LayerHelper('elementwise_mod', **locals()))
 
@@ -12521,8 +12650,11 @@ def gen_data():
         print(z_value) #[3, 2, 1]
     """
     if _non_static_mode():
-        return _elementwise_op_in_dygraph(
-            x, y, axis=axis, act=act, op_name='elementwise_floordiv')
+        return _elementwise_op_in_dygraph(x,
+                                          y,
+                                          axis=axis,
+                                          act=act,
+                                          op_name='elementwise_floordiv')
 
     return _elementwise_op(LayerHelper('elementwise_floordiv', **locals()))
 
@@ -12622,13 +12754,15 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
             return op(x, y)
         else:
             return op(x)
-    check_variable_and_dtype(x, "x", [
-        "bool", "int8", "int16", "int32", "int64", "float32", "float64"
-    ], op_name)
+    check_variable_and_dtype(
+        x, "x",
+        ["bool", "int8", "int16", "int32", "int64", "float32", "float64"],
+        op_name)
     if y is not None:
-        check_variable_and_dtype(y, "y", [
-            "bool", "int8", "int16", "int32", "int64", "float32", "float64"
-        ], op_name)
+        check_variable_and_dtype(
+            y, "y",
+            ["bool", "int8", "int16", "int32", "int64", "float32", "float64"],
+            op_name)
     if out is not None:
         check_type(out, "out", Variable, op_name)
 
@@ -12643,9 +12777,12 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     if binary_op:
-        helper.append_op(
-            type=op_name, inputs={"X": x,
-                                  "Y": y}, outputs={"Out": out})
+        helper.append_op(type=op_name,
+                         inputs={
+                             "X": x,
+                             "Y": y
+                         },
+                         outputs={"Out": out})
     else:
         helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
 
@@ -12687,8 +12824,12 @@ def logical_and(x, y, out=None, name=None):
     if in_dygraph_mode():
         return _C_ops.final_state_logical_and(x, y)
 
-    return _logical_op(
-        op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
+    return _logical_op(op_name="logical_and",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 def logical_or(x, y, out=None, name=None):
@@ -12728,8 +12869,12 @@ def logical_or(x, y, out=None, name=None):
     """
     if in_dygraph_mode():
         return _C_ops.final_state_logical_or(x, y)
-    return _logical_op(
-        op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
+    return _logical_op(op_name="logical_or",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 def logical_xor(x, y, out=None, name=None):
@@ -12770,8 +12915,12 @@ def logical_xor(x, y, out=None, name=None):
     if in_dygraph_mode():
         return _C_ops.final_state_logical_xor(x, y)
 
-    return _logical_op(
-        op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
+    return _logical_op(op_name="logical_xor",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 @templatedoc()
@@ -12804,8 +12953,12 @@ def logical_not(x, out=None, name=None):
     """
     if in_dygraph_mode():
         return _C_ops.final_state_logical_not(x)
-    return _logical_op(
-        op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False)
+    return _logical_op(op_name="logical_not",
+                       x=x,
+                       y=None,
+                       name=name,
+                       out=out,
+                       binary_op=False)
 
 
 @templatedoc()
@@ -12845,15 +12998,18 @@ def clip(x, min, max, name=None):
         name = unique_name.generate_with_ignorable_key(".".join(
             [helper.name, 'tmp']))
 
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False)
+    out = helper.create_variable(type=x.type,
+                                 name=name,
+                                 dtype=x.dtype,
+                                 persistable=False)
 
-    helper.append_op(
-        type="clip",
-        inputs={"X": x},
-        attrs={"min": min,
-               "max": max},
-        outputs={"Out": out})
+    helper.append_op(type="clip",
+                     inputs={"X": x},
+                     attrs={
+                         "min": min,
+                         "max": max
+                     },
+                     outputs={"Out": out})
 
     return out
 
@@ -12898,14 +13054,15 @@ def clip_by_norm(x, max_norm, name=None):
         name = unique_name.generate_with_ignorable_key(".".join(
             [helper.name, 'tmp']))
 
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False)
+    out = helper.create_variable(type=x.type,
+                                 name=name,
+                                 dtype=x.dtype,
+                                 persistable=False)
 
-    helper.append_op(
-        type="clip_by_norm",
-        inputs={"X": x},
-        attrs={"max_norm": max_norm},
-        outputs={"Out": out})
+    helper.append_op(type="clip_by_norm",
+                     inputs={"X": x},
+                     attrs={"max_norm": max_norm},
+                     outputs={"Out": out})
 
     return out
 
@@ -12944,8 +13101,10 @@ def mean(x, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mean')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out})
+    helper.append_op(type="mean",
+                     inputs={"X": x},
+                     attrs={},
+                     outputs={"Out": out})
 
     return out
 
@@ -12975,11 +13134,10 @@ def merge_selected_rows(x, name=None):
 
     helper = LayerHelper("merge_selected_rows", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="merge_selected_rows",
-        inputs={"X": x},
-        attrs={},
-        outputs={"Out": out})
+    helper.append_op(type="merge_selected_rows",
+                     inputs={"X": x},
+                     attrs={},
+                     outputs={"Out": out})
     return out
 
 
@@ -13029,9 +13187,13 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
     check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64'], 'mul')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="mul", inputs={"X": x,
-                            "Y": y}, attrs=attrs, outputs={"Out": out})
+    helper.append_op(type="mul",
+                     inputs={
+                         "X": x,
+                         "Y": y
+                     },
+                     attrs=attrs,
+                     outputs={"Out": out})
     return out
 
 
@@ -13170,11 +13332,10 @@ def space_to_depth(x, blocksize, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="space_to_depth",
-        inputs={"X": x},
-        attrs={"blocksize": blocksize},
-        outputs={"Out": out})
+    helper.append_op(type="space_to_depth",
+                     inputs={"X": x},
+                     attrs={"blocksize": blocksize},
+                     outputs={"Out": out})
     return out
 
 
@@ -13250,13 +13411,14 @@ def affine_channel(x,
     check_type(bias, 'bias', (Variable, type(None)), 'affine_channel')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="affine_channel",
-        inputs={"X": x,
-                'Scale': scale,
-                'Bias': bias},
-        attrs={"data_layout": data_layout},
-        outputs={"Out": out})
+    helper.append_op(type="affine_channel",
+                     inputs={
+                         "X": x,
+                         'Scale': scale,
+                         'Bias': bias
+                     },
+                     attrs={"data_layout": data_layout},
+                     outputs={"Out": out})
     return helper.append_activation(out)
 
 
@@ -13365,12 +13527,13 @@ def similarity_focus(input, axis, indexes, name=None):
         raise ValueError("indexes can not be empty.")
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='similarity_focus',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={"axis": axis,
-               "indexes": indexes})
+    helper.append_op(type='similarity_focus',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         "axis": axis,
+                         "indexes": indexes
+                     })
     return out
 
 
@@ -13425,14 +13588,15 @@ def hash(input, hash_size, num_hash=1, name=None):
     check_type(hash_size, 'hash_size', int, 'hash')
     check_type(num_hash, 'num_hash', int, 'hash')
     helper = LayerHelper('hash', **locals())
-    out = helper.create_variable_for_type_inference(
-        helper.input_dtype(), stop_gradient=True)
-    helper.append_op(
-        type='hash',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'num_hash': num_hash,
-               'mod_by': hash_size})
+    out = helper.create_variable_for_type_inference(helper.input_dtype(),
+                                                    stop_gradient=True)
+    helper.append_op(type='hash',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'num_hash': num_hash,
+                         'mod_by': hash_size
+                     })
     return out
 
 
@@ -13539,8 +13703,10 @@ def grid_sampler(x, grid, name=None):
 
     attrs = {'use_cudnn': False} if core.is_compiled_with_rocm() else {}
 
-    helper.append_op(
-        type='grid_sampler', inputs=ipts, outputs={'Output': out}, attrs=attrs)
+    helper.append_op(type='grid_sampler',
+                     inputs=ipts,
+                     outputs={'Output': out},
+                     attrs=attrs)
     return out
 
 
@@ -13642,12 +13808,13 @@ def add_position_encoding(input, alpha, beta, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(
-        type="add_position_encoding",
-        inputs={"X": input},
-        outputs={"Out": out},
-        attrs={"alpha": alpha,
-               "beta": beta})
+    helper.append_op(type="add_position_encoding",
+                     inputs={"X": input},
+                     outputs={"Out": out},
+                     attrs={
+                         "alpha": alpha,
+                         "beta": beta
+                     })
     return out
 
 
@@ -13708,18 +13875,23 @@ def bilinear_tensor_product(x,
 
     param_shape = [size, x.shape[1], y.shape[1]]
 
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False)
+    w = helper.create_parameter(attr=helper.param_attr,
+                                shape=param_shape,
+                                dtype=dtype,
+                                is_bias=False)
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
     inputs = {"X": x, "Y": y, "Weight": w}
     if helper.bias_attr:
         bias_size = [1, size]
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+        bias = helper.create_parameter(attr=helper.bias_attr,
+                                       shape=bias_size,
+                                       dtype=dtype,
+                                       is_bias=True)
         inputs["Bias"] = bias
-    helper.append_op(
-        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out})
+    helper.append_op(type="bilinear_tensor_product",
+                     inputs=inputs,
+                     outputs={"Out": out})
 
     # add activation
     return helper.append_activation(out)
@@ -13769,11 +13941,10 @@ def get_tensor_from_selected_rows(x, name=None):
         )
     helper = LayerHelper('get_tensor_from_selected_rows', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='get_tensor_from_selected_rows',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={})
+    helper.append_op(type='get_tensor_from_selected_rows',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={})
     return out
 
 
@@ -13843,11 +14014,10 @@ def shuffle_channel(x, group, name=None):
     if not isinstance(group, int):
         raise TypeError("group must be int type")
 
-    helper.append_op(
-        type="shuffle_channel",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"group": group})
+    helper.append_op(type="shuffle_channel",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs={"group": group})
     return out
 
 
@@ -14185,19 +14355,18 @@ def py_func_demo():
         for v in skip_vars_in_backward_input:
             if not v.name in fwd_in_out:
                 raise ValueError(
-                    'Variable {} is not found in forward inputs and outputs'
-                    .format(v.name))
+                    'Variable {} is not found in forward inputs and outputs'.
+                    format(v.name))
             backward_skip_vars.add(v.name)
 
-    helper.append_op(
-        type='py_func',
-        inputs={'X': x},
-        outputs={'Out': out_list},
-        attrs={
-            'forward_callable_id': fwd_func_id,
-            'backward_callable_id': bwd_func_id,
-            'backward_skip_vars': list(backward_skip_vars)
-        })
+    helper.append_op(type='py_func',
+                     inputs={'X': x},
+                     outputs={'Out': out_list},
+                     attrs={
+                         'forward_callable_id': fwd_func_id,
+                         'backward_callable_id': bwd_func_id,
+                         'backward_skip_vars': list(backward_skip_vars)
+                     })
     return out
 
 
@@ -14261,17 +14430,18 @@ def psroi_pool(input,
         raise TypeError("pooled_width must be int type")
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='psroi_pool',
-        inputs={'X': input,
-                'ROIs': rois},
-        outputs={'Out': out},
-        attrs={
-            'output_channels': output_channels,
-            'spatial_scale': spatial_scale,
-            'pooled_height': pooled_height,
-            'pooled_width': pooled_width
-        })
+    helper.append_op(type='psroi_pool',
+                     inputs={
+                         'X': input,
+                         'ROIs': rois
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         'output_channels': output_channels,
+                         'spatial_scale': spatial_scale,
+                         'pooled_height': pooled_height,
+                         'pooled_width': pooled_width
+                     })
     return out
 
 
@@ -14345,15 +14515,14 @@ def prroi_pool(input,
     inputs_op = {'X': input, 'ROIs': rois}
     if batch_roi_nums is not None:
         inputs_op['BatchRoINums'] = batch_roi_nums
-    helper.append_op(
-        type='prroi_pool',
-        inputs=inputs_op,
-        outputs={'Out': out},
-        attrs={
-            'spatial_scale': spatial_scale,
-            'pooled_height': pooled_height,
-            'pooled_width': pooled_width
-        })
+    helper.append_op(type='prroi_pool',
+                     inputs=inputs_op,
+                     outputs={'Out': out},
+                     attrs={
+                         'spatial_scale': spatial_scale,
+                         'pooled_height': pooled_height,
+                         'pooled_width': pooled_width
+                     })
     return out
 
 
@@ -14410,11 +14579,10 @@ def pixel_shuffle(x, upscale_factor):
     if not isinstance(upscale_factor, int):
         raise TypeError("upscale factor must be int type")
 
-    helper.append_op(
-        type="pixel_shuffle",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"upscale_factor": upscale_factor})
+    helper.append_op(type="pixel_shuffle",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs={"upscale_factor": upscale_factor})
     return out
 
 
@@ -14517,12 +14685,13 @@ def continuous_value_model(input, cvm, use_cvm=True):
     out = helper.create_variable(dtype=input.dtype)
     check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
                              'cvm')
-    helper.append_op(
-        type='cvm',
-        inputs={'X': [input],
-                'CVM': [cvm]},
-        outputs={'Y': [out]},
-        attrs={"use_cvm": use_cvm})
+    helper.append_op(type='cvm',
+                     inputs={
+                         'X': [input],
+                         'CVM': [cvm]
+                     },
+                     outputs={'Y': [out]},
+                     attrs={"use_cvm": use_cvm})
     return out
 
 
@@ -14570,10 +14739,9 @@ def where(condition):
     out = helper.create_variable_for_type_inference(
         dtype=core.VarDesc.VarType.INT64)
 
-    helper.append_op(
-        type='where_index',
-        inputs={'Condition': condition},
-        outputs={'Out': [out]})
+    helper.append_op(type='where_index',
+                     inputs={'Condition': condition},
+                     outputs={'Out': [out]})
     return out
 
 
@@ -14640,12 +14808,13 @@ def unique(x, dtype='int32'):
 
     index = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type='unique',
-        inputs={'X': x},
-        attrs={'dtype': convert_np_dtype_to_dtype_(dtype)},
-        outputs={'Out': [out],
-                 'Index': [index]})
+    helper.append_op(type='unique',
+                     inputs={'X': x},
+                     attrs={'dtype': convert_np_dtype_to_dtype_(dtype)},
+                     outputs={
+                         'Out': [out],
+                         'Index': [index]
+                     })
 
     return out, index
 
@@ -14697,13 +14866,14 @@ def unique_with_counts(x, dtype='int32'):
 
     count = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type='unique_with_counts',
-        inputs={'X': x},
-        attrs={'dtype': convert_np_dtype_to_dtype_(dtype)},
-        outputs={'Out': [out],
-                 'Index': [index],
-                 'Count': [count]})
+    helper.append_op(type='unique_with_counts',
+                     inputs={'X': x},
+                     attrs={'dtype': convert_np_dtype_to_dtype_(dtype)},
+                     outputs={
+                         'Out': [out],
+                         'Index': [index],
+                         'Count': [count]
+                     })
 
     return out, index, count
 
@@ -14904,41 +15074,39 @@ def _get_default_param_initializer():
     pre_bias = helper.create_variable_for_type_inference(dtype)
 
     if modulated:
-        helper.append_op(
-            type='deformable_conv',
-            inputs={
-                'Input': input,
-                'Filter': filter_param,
-                'Offset': offset,
-                'Mask': mask,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': stride,
-                'paddings': padding,
-                'dilations': dilation,
-                'groups': groups,
-                'deformable_groups': deformable_groups,
-                'im2col_step': im2col_step,
-            })
+        helper.append_op(type='deformable_conv',
+                         inputs={
+                             'Input': input,
+                             'Filter': filter_param,
+                             'Offset': offset,
+                             'Mask': mask,
+                         },
+                         outputs={"Output": pre_bias},
+                         attrs={
+                             'strides': stride,
+                             'paddings': padding,
+                             'dilations': dilation,
+                             'groups': groups,
+                             'deformable_groups': deformable_groups,
+                             'im2col_step': im2col_step,
+                         })
 
     else:
-        helper.append_op(
-            type='deformable_conv_v1',
-            inputs={
-                'Input': input,
-                'Filter': filter_param,
-                'Offset': offset,
-            },
-            outputs={"Output": pre_bias},
-            attrs={
-                'strides': stride,
-                'paddings': padding,
-                'dilations': dilation,
-                'groups': groups,
-                'deformable_groups': deformable_groups,
-                'im2col_step': im2col_step,
-            })
+        helper.append_op(type='deformable_conv_v1',
+                         inputs={
+                             'Input': input,
+                             'Filter': filter_param,
+                             'Offset': offset,
+                         },
+                         outputs={"Output": pre_bias},
+                         attrs={
+                             'strides': stride,
+                             'paddings': padding,
+                             'dilations': dilation,
+                             'groups': groups,
+                             'deformable_groups': deformable_groups,
+                             'im2col_step': im2col_step,
+                         })
 
     output = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
     return output
@@ -15163,24 +15331,27 @@ def deformable_roi_pooling(input,
     dtype = helper.input_dtype()
     output = helper.create_variable_for_type_inference(dtype)
     top_count = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type="deformable_psroi_pooling",
-        inputs={"Input": input,
-                "ROIs": rois,
-                "Trans": trans},
-        outputs={"Output": output,
-                 "TopCount": top_count},
-        attrs={
-            "no_trans": no_trans,
-            "spatial_scale": spatial_scale,
-            "output_dim": output_channels,
-            "group_size": group_size,
-            "pooled_height": pooled_height,
-            "pooled_width": pooled_width,
-            "part_size": part_size,
-            "sample_per_part": sample_per_part,
-            "trans_std": trans_std
-        })
+    helper.append_op(type="deformable_psroi_pooling",
+                     inputs={
+                         "Input": input,
+                         "ROIs": rois,
+                         "Trans": trans
+                     },
+                     outputs={
+                         "Output": output,
+                         "TopCount": top_count
+                     },
+                     attrs={
+                         "no_trans": no_trans,
+                         "spatial_scale": spatial_scale,
+                         "output_dim": output_channels,
+                         "group_size": group_size,
+                         "pooled_height": pooled_height,
+                         "pooled_width": pooled_width,
+                         "part_size": part_size,
+                         "sample_per_part": sample_per_part,
+                         "trans_std": trans_std
+                     })
     return output
 
 
@@ -15243,17 +15414,16 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
                          (shard_id, nshards))
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [input]},
-        outputs={'Out': out},
-        attrs={
-            'index_num': index_num,
-            'nshards': nshards,
-            'shard_id': shard_id,
-            'ignore_value': ignore_value
-        },
-        stop_gradient=True)
+    helper.append_op(type=op_type,
+                     inputs={'X': [input]},
+                     outputs={'Out': out},
+                     attrs={
+                         'index_num': index_num,
+                         'nshards': nshards,
+                         'shard_id': shard_id,
+                         'ignore_value': ignore_value
+                     },
+                     stop_gradient=True)
     return out
 
 
@@ -15316,13 +15486,14 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
 
     helper = LayerHelper('hard_swish', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='hard_swish',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold,
-               'scale': scale,
-               'offset': offset})
+    helper.append_op(type='hard_swish',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'threshold': threshold,
+                         'scale': scale,
+                         'offset': offset
+                     })
     return out
 
 
@@ -15398,11 +15569,10 @@ def mish(x, threshold=20, name=None):
 
     helper = LayerHelper('mish', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='mish',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold})
+    helper.append_op(type='mish',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'threshold': threshold})
     return out
 
 
@@ -15472,7 +15642,11 @@ def gather_tree(ids, parents):
 
 @deprecated(since="2.0.0", update_to="paddle.uniform")
 @templatedoc()
-def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0,
+def uniform_random(shape,
+                   dtype='float32',
+                   min=-1.0,
+                   max=1.0,
+                   seed=0,
                    name=None):
     """
     This OP returns a Tensor filled with random values sampled from a uniform
@@ -15553,8 +15727,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0,
 
     if _non_static_mode():
         shape = utils.convert_shape_to_list(shape)
-        return _C_ops.uniform_random('shape', shape, 'min',
-                                     float(min), 'max',
+        return _C_ops.uniform_random('shape', shape, 'min', float(min), 'max',
                                      float(max), 'seed', seed, 'dtype', dtype)
 
     check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand')
@@ -15563,14 +15736,17 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0,
 
     inputs = dict()
     attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype}
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand')
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='uniform_random/rand')
 
     helper = LayerHelper("uniform_random", **locals())
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="uniform_random", inputs=inputs, attrs=attrs,
-        outputs={"Out": out})
+    helper.append_op(type="uniform_random",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={"Out": out})
     utils.try_set_static_shape_tensor(out, shape)
     return out
 
@@ -15621,9 +15797,8 @@ def unbind(input, axis=0):
         for i in range(num)
     ]
 
-    helper.append_op(
-        type="unbind",
-        inputs={"X": input},
-        outputs={"Out": outs},
-        attrs={"axis": axis})
+    helper.append_op(type="unbind",
+                     inputs={"X": input},
+                     outputs={"Out": outs},
+                     attrs={"axis": axis})
     return outs
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index d8cd7f6abf6df..84d147bc97c92 100755
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -92,8 +92,8 @@
     if _OP in __deprecated_func_name__:
         _new_OP = __deprecated_func_name__[_OP]
     _func = generate_activation_fn(_OP)
-    _func = deprecated(
-        since="2.0.0", update_to="paddle.nn.functional.%s" % (_new_OP))(_func)
+    _func = deprecated(since="2.0.0",
+                       update_to="paddle.nn.functional.%s" % (_new_OP))(_func)
     globals()[_OP] = _func
 
 for _OP in set(__unary_func__):
@@ -112,7 +112,8 @@
     _func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(_func)
     globals()[_OP] = _func
 
-add_sample_code(globals()["sigmoid"], r"""
+add_sample_code(
+    globals()["sigmoid"], r"""
 Examples:
     .. code-block:: python
 
@@ -126,7 +127,8 @@
 
 """)
 
-add_sample_code(globals()["silu"], r"""
+add_sample_code(
+    globals()["silu"], r"""
 Examples:
     .. code-block:: python
 
@@ -140,7 +142,8 @@
 
 """)
 
-add_sample_code(globals()["logsigmoid"], r"""
+add_sample_code(
+    globals()["logsigmoid"], r"""
 Examples:
     .. code-block:: python
 
@@ -154,7 +157,8 @@
 
 """)
 
-add_sample_code(globals()["exp"], r"""
+add_sample_code(
+    globals()["exp"], r"""
 Examples:
     .. code-block:: python
 
@@ -167,7 +171,8 @@
 
 """)
 
-add_sample_code(globals()["expm1"], r"""
+add_sample_code(
+    globals()["expm1"], r"""
 Examples:
     .. code-block:: python
 
@@ -180,7 +185,8 @@
 
 """)
 
-add_sample_code(globals()["tanh"], r"""
+add_sample_code(
+    globals()["tanh"], r"""
 Examples:
     .. code-block:: python
 
@@ -193,7 +199,8 @@
 
 """)
 
-add_sample_code(globals()["atan"], r"""
+add_sample_code(
+    globals()["atan"], r"""
 Examples:
     .. code-block:: python
 
@@ -206,7 +213,8 @@
 
 """)
 
-add_sample_code(globals()["tanh_shrink"], r"""
+add_sample_code(
+    globals()["tanh_shrink"], r"""
 Examples:
     .. code-block:: python
 
@@ -220,7 +228,8 @@
 
 """)
 
-add_sample_code(globals()["sqrt"], r"""
+add_sample_code(
+    globals()["sqrt"], r"""
 Examples:
     .. code-block:: python
 
@@ -233,7 +242,8 @@
 
 """)
 
-add_sample_code(globals()["rsqrt"], r"""
+add_sample_code(
+    globals()["rsqrt"], r"""
 Examples:
     .. code-block:: python
 
@@ -246,7 +256,8 @@
 
 """)
 
-add_sample_code(globals()["abs"], r"""
+add_sample_code(
+    globals()["abs"], r"""
 Examples:
     .. code-block:: python
 
@@ -259,7 +270,8 @@
 
 """)
 
-add_sample_code(globals()["ceil"], r"""
+add_sample_code(
+    globals()["ceil"], r"""
 Examples:
     .. code-block:: python
 
@@ -272,7 +284,8 @@
 
 """)
 
-add_sample_code(globals()["floor"], r"""
+add_sample_code(
+    globals()["floor"], r"""
 Examples:
     .. code-block:: python
 
@@ -285,7 +298,8 @@
 
 """)
 
-add_sample_code(globals()["cos"], r"""
+add_sample_code(
+    globals()["cos"], r"""
 Examples:
     .. code-block:: python
 
@@ -298,7 +312,8 @@
 
 """)
 
-add_sample_code(globals()["tan"], r"""
+add_sample_code(
+    globals()["tan"], r"""
 Examples:
     .. code-block:: python
 
@@ -311,7 +326,8 @@
 
 """)
 
-add_sample_code(globals()["acos"], r"""
+add_sample_code(
+    globals()["acos"], r"""
 Examples:
     .. code-block:: python
 
@@ -324,7 +340,8 @@
 
 """)
 
-add_sample_code(globals()["sin"], r"""
+add_sample_code(
+    globals()["sin"], r"""
 Examples:
     .. code-block:: python
 
@@ -337,7 +354,8 @@
 
 """)
 
-add_sample_code(globals()["asin"], r"""
+add_sample_code(
+    globals()["asin"], r"""
 Examples:
     .. code-block:: python
 
@@ -350,7 +368,8 @@
 
 """)
 
-add_sample_code(globals()["cosh"], r"""
+add_sample_code(
+    globals()["cosh"], r"""
 Examples:
     .. code-block:: python
 
@@ -363,7 +382,8 @@
 
 """)
 
-add_sample_code(globals()["sinh"], r"""
+add_sample_code(
+    globals()["sinh"], r"""
 Examples:
     .. code-block:: python
 
@@ -376,7 +396,8 @@
 
 """)
 
-add_sample_code(globals()["asinh"], r"""
+add_sample_code(
+    globals()["asinh"], r"""
 Examples:
     .. code-block:: python
 
@@ -389,7 +410,8 @@
 
 """)
 
-add_sample_code(globals()["acosh"], r"""
+add_sample_code(
+    globals()["acosh"], r"""
 Examples:
     .. code-block:: python
 
@@ -402,7 +424,8 @@
 
 """)
 
-add_sample_code(globals()["atanh"], r"""
+add_sample_code(
+    globals()["atanh"], r"""
 Examples:
     .. code-block:: python
 
@@ -415,7 +438,8 @@
 
 """)
 
-add_sample_code(globals()["round"], r"""
+add_sample_code(
+    globals()["round"], r"""
 Examples:
     .. code-block:: python
 
@@ -428,7 +452,8 @@
 
 """)
 
-add_sample_code(globals()["reciprocal"], r"""
+add_sample_code(
+    globals()["reciprocal"], r"""
 Examples:
     .. code-block:: python
 
@@ -441,7 +466,8 @@
 
 """)
 
-add_sample_code(globals()["square"], r"""
+add_sample_code(
+    globals()["square"], r"""
 Examples:
     .. code-block:: python
 
@@ -454,7 +480,8 @@
 
 """)
 
-add_sample_code(globals()["lgamma"], r"""
+add_sample_code(
+    globals()["lgamma"], r"""
 Examples:
     .. code-block:: python
 
@@ -467,7 +494,8 @@
 
 """)
 
-add_sample_code(globals()["softplus"], r"""
+add_sample_code(
+    globals()["softplus"], r"""
 Examples:
     .. code-block:: python
 
@@ -481,7 +509,8 @@
 
 """)
 
-add_sample_code(globals()["softsign"], r"""
+add_sample_code(
+    globals()["softsign"], r"""
 Examples:
     .. code-block:: python
 
@@ -576,10 +605,9 @@ def hard_shrink(x, threshold=None):
 _cum_sum_ = generate_layer_fn('cumsum')
 
 
-@deprecated(
-    since="2.0.0",
-    update_to="paddle.cumsum",
-    reason="New APIs for Paddle 2.0 are coming.")
+@deprecated(since="2.0.0",
+            update_to="paddle.cumsum",
+            reason="New APIs for Paddle 2.0 are coming.")
 def cumsum(x, axis=None, exclusive=None, reverse=None):
     check_type(x, 'x', (Variable), 'cumsum')
     locals_var = locals().copy()
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index b04cf90e1d8f9..6b51721aafc41 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -131,7 +131,8 @@ def get_initial_states(self,
         if sys.version_info < (3, ):
             integer_types = (
                 int,
-                long, )
+                long,
+            )
         else:
             integer_types = (int, )
         check_variable_and_dtype(batch_ref, 'batch_ref',
@@ -156,7 +157,8 @@ def _is_shape_sequence(seq):
             if sys.version_info < (3, ):
                 integer_types = (
                     int,
-                    long, )
+                    long,
+                )
             else:
                 integer_types = (int, )
             """For shape, list/tuple of integer is the finest-grained objection"""
@@ -167,10 +169,11 @@ def _is_shape_sequence(seq):
             # TODO: Add check for the illegal
             if isinstance(seq, dict):
                 return True
-            return (isinstance(seq, Sequence) and
-                    not isinstance(seq, six.string_types))
+            return (isinstance(seq, Sequence)
+                    and not isinstance(seq, six.string_types))
 
         class Shape(object):
+
             def __init__(self, shape):
                 self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
 
@@ -507,6 +510,7 @@ def rnn(cell,
 
 
 class ArrayWrapper(object):
+
     def __init__(self, x):
         self.array = [x]
 
@@ -549,8 +553,9 @@ def _rnn_dynamic_graph(cell,
         inputs = map_structure(_transpose_batch_time, inputs)
 
     if sequence_length is not None:
-        mask = sequence_lod.sequence_mask(
-            sequence_length, maxlen=time_steps, dtype=inputs.dtype)
+        mask = sequence_lod.sequence_mask(sequence_length,
+                                          maxlen=time_steps,
+                                          dtype=inputs.dtype)
         mask = nn.transpose(mask, [1, 0])
 
     if is_reverse:
@@ -564,9 +569,8 @@ def _rnn_dynamic_graph(cell,
         step_inputs = map_structure(lambda x: x[i], inputs)
         step_outputs, new_states = cell(step_inputs, states, **kwargs)
         if sequence_length is not None:
-            new_states = map_structure(
-                partial(
-                    _maybe_copy, step_mask=mask[i]), states, new_states)
+            new_states = map_structure(partial(_maybe_copy, step_mask=mask[i]),
+                                       states, new_states)
         states = new_states
         outputs = map_structure(lambda x: ArrayWrapper(x),
                                 step_outputs) if i == 0 else map_structure(
@@ -574,13 +578,11 @@ def _rnn_dynamic_graph(cell,
                                     step_outputs, outputs)
 
     final_outputs = map_structure(
-        lambda x: nn.stack(x.array, axis=time_step_index),
-        outputs)
+        lambda x: nn.stack(x.array, axis=time_step_index), outputs)
 
     if is_reverse:
         final_outputs = map_structure(
-            lambda x: tensor.reverse(x, axis=time_step_index),
-            final_outputs)
+            lambda x: tensor.reverse(x, axis=time_step_index), final_outputs)
 
     final_states = new_states
     return final_outputs, final_states
@@ -638,8 +640,7 @@ def _switch_grad(x, stop=False):
         if sequence_length:
             step_mask = rnn.step_input(mask)
             new_states = map_structure(
-                partial(
-                    _maybe_copy, step_mask=step_mask), states, new_states)
+                partial(_maybe_copy, step_mask=step_mask), states, new_states)
 
         map_structure(rnn.update_memory, states, new_states)
         flat_outputs = flatten(outputs)
@@ -963,15 +964,16 @@ def tile_beam_merge_with_batch(x, beam_size):
         expand_times = [1] * len(x.shape)
         expand_times[1] = beam_size
         x = paddle.tile(x, expand_times)  # [batch_size, beam_size, ...]
-        x = nn.transpose(x, list(range(2, len(x.shape))) +
+        x = nn.transpose(x,
+                         list(range(2, len(x.shape))) +
                          [0, 1])  # [..., batch_size, beam_size]
         # use 0 to copy to avoid wrong shape
-        x = nn.reshape(
-            x, shape=[0] *
-            (len(x.shape) - 2) + [-1])  # [..., batch_size * beam_size]
+        x = nn.reshape(x, shape=[0] * (len(x.shape) - 2) +
+                       [-1])  # [..., batch_size * beam_size]
         x = nn.transpose(
             x, [len(x.shape) - 1] +
-            list(range(0, len(x.shape) - 1)))  # [batch_size * beam_size, ...]
+            list(range(0,
+                       len(x.shape) - 1)))  # [batch_size * beam_size, ...]
         return x
 
     def _split_batch_beams(self, x):
@@ -1056,8 +1058,7 @@ def _mask_probs(self, probs, finished):
         probs = nn.elementwise_mul(
             paddle.tile(nn.unsqueeze(finished, [2]), [1, 1, self.vocab_size]),
             self.noend_mask_tensor,
-            axis=-1) - nn.elementwise_mul(
-                probs, (finished - 1), axis=0)
+            axis=-1) - nn.elementwise_mul(probs, (finished - 1), axis=0)
         return probs
 
     def _gather(self, x, indices, batch_size):
@@ -1085,10 +1086,8 @@ def _gather(self, x, indices, batch_size):
             indices.dtype) if batch_size.dtype != indices.dtype else batch_size
         batch_size.stop_gradient = True  # TODO: remove this
         batch_pos = paddle.tile(
-            nn.unsqueeze(
-                tensor.range(
-                    0, batch_size, 1, dtype=indices.dtype), [1]),
-            [1, self.beam_size])
+            nn.unsqueeze(tensor.range(0, batch_size, 1, dtype=indices.dtype),
+                         [1]), [1, self.beam_size])
         topk_coordinates = nn.stack([batch_pos, indices], axis=2)
         topk_coordinates.stop_gradient = True
         return nn.gather_nd(x, topk_coordinates)
@@ -1137,22 +1136,22 @@ def initialize(self, initial_cell_states):
         state = flatten(initial_cell_states)[0]
         self.batch_size = nn.shape(state)[0]
 
-        self.start_token_tensor = tensor.fill_constant(
-            shape=[1], dtype="int64", value=self.start_token)
-        self.end_token_tensor = tensor.fill_constant(
-            shape=[1], dtype="int64", value=self.end_token)
+        self.start_token_tensor = tensor.fill_constant(shape=[1],
+                                                       dtype="int64",
+                                                       value=self.start_token)
+        self.end_token_tensor = tensor.fill_constant(shape=[1],
+                                                     dtype="int64",
+                                                     value=self.end_token)
 
         init_cell_states = map_structure(self._expand_to_beam_size,
                                          initial_cell_states)
-        init_inputs = paddle.full(
-            shape=[self.batch_size, self.beam_size],
-            fill_value=self.start_token_tensor,
-            dtype=self.start_token_tensor.dtype)
+        init_inputs = paddle.full(shape=[self.batch_size, self.beam_size],
+                                  fill_value=self.start_token_tensor,
+                                  dtype=self.start_token_tensor.dtype)
         log_probs = paddle.tile(
             tensor.assign(
-                np.array(
-                    [[0.] + [-self.kinf] * (self.beam_size - 1)],
-                    dtype="float32")), [self.batch_size, 1])
+                np.array([[0.] + [-self.kinf] * (self.beam_size - 1)],
+                         dtype="float32")), [self.batch_size, 1])
         if paddle.get_default_dtype() == "float64":
             log_probs = tensor.cast(log_probs, "float64")
         # TODO: remove the restriction of force_cpu
@@ -1198,8 +1197,9 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state):
 
         """
         self.vocab_size = logits.shape[-1]
-        self.vocab_size_tensor = tensor.fill_constant(
-            shape=[1], dtype="int64", value=self.vocab_size)
+        self.vocab_size_tensor = tensor.fill_constant(shape=[1],
+                                                      dtype="int64",
+                                                      value=self.vocab_size)
         noend_array = [-self.kinf] * self.vocab_size
         noend_array[self.end_token] = 0
 
@@ -1210,8 +1210,9 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state):
 
         step_log_probs = nn.log(nn.softmax(logits))
         step_log_probs = self._mask_probs(step_log_probs, beam_state.finished)
-        log_probs = nn.elementwise_add(
-            x=step_log_probs, y=beam_state.log_probs, axis=0)
+        log_probs = nn.elementwise_add(x=step_log_probs,
+                                       y=beam_state.log_probs,
+                                       axis=0)
         # TODO: length penalty
         scores = log_probs
         scores = nn.reshape(scores, [-1, self.beam_size * self.vocab_size])
@@ -1230,8 +1231,8 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state):
                                      self.batch_size)
         next_lengths = self._gather(beam_state.lengths, beam_indices,
                                     self.batch_size)
-        next_lengths = next_lengths + tensor.cast(
-            nn.logical_not(next_finished), beam_state.lengths.dtype)
+        next_lengths = next_lengths + tensor.cast(nn.logical_not(next_finished),
+                                                  beam_state.lengths.dtype)
         next_finished = control_flow.logical_or(
             next_finished,
             control_flow.equal(token_indices, self.end_token_tensor))
@@ -1345,6 +1346,7 @@ def _dynamic_decode_imperative(decoder,
                                is_test=False,
                                return_length=False,
                                **kwargs):
+
     def _maybe_copy(state, new_state, step_mask):
         # TODO: use where_op
         state_dtype = state.dtype
@@ -1357,8 +1359,9 @@ def _maybe_copy(state, new_state, step_mask):
             # to sum(bool) error.
             step_mask.stop_gradient = True
         new_state = nn.elementwise_mul(
-            state, step_mask, axis=0) - nn.elementwise_mul(
-                new_state, (step_mask - 1), axis=0)
+            state, step_mask, axis=0) - nn.elementwise_mul(new_state,
+                                                           (step_mask - 1),
+                                                           axis=0)
         if convert_dtype(state_dtype) in ["bool"]:
             new_state = tensor.cast(new_state, dtype=state_dtype)
         return new_state
@@ -1371,11 +1374,13 @@ def _maybe_copy(state, new_state, step_mask):
     outputs = None
 
     step_idx = 0
-    step_idx_tensor = tensor.fill_constant(
-        shape=[1], dtype="int64", value=step_idx)
+    step_idx_tensor = tensor.fill_constant(shape=[1],
+                                           dtype="int64",
+                                           value=step_idx)
     while cond.numpy():
-        (step_outputs, next_states, next_inputs, next_finished) = decoder.step(
-            step_idx_tensor, inputs, states, **kwargs)
+        (step_outputs, next_states, next_inputs,
+         next_finished) = decoder.step(step_idx_tensor, inputs, states,
+                                       **kwargs)
         if not decoder.tracks_own_finished:
             # BeamSearchDecoder would track it own finished, since
             # beams would be reordered and the finished status of each
@@ -1387,8 +1392,8 @@ def _maybe_copy(state, new_state, step_mask):
             tensor.assign(next_finished, finished)
             next_sequence_lengths = nn.elementwise_add(
                 sequence_lengths,
-                tensor.cast(
-                    control_flow.logical_not(finished), sequence_lengths.dtype))
+                tensor.cast(control_flow.logical_not(finished),
+                            sequence_lengths.dtype))
             if impute_finished:  # rectify the states for the finished.
                 next_states = map_structure(
                     lambda x, y: _maybe_copy(x, y, finished), states,
@@ -1404,8 +1409,9 @@ def _maybe_copy(state, new_state, step_mask):
             lambda x: ArrayWrapper(x),
             step_outputs) if step_idx == 0 else map_structure(
                 lambda x, x_array: x_array.append(x), step_outputs, outputs)
-        inputs, states, finished, sequence_lengths = (
-            next_inputs, next_states, next_finished, next_sequence_lengths)
+        inputs, states, finished, sequence_lengths = (next_inputs, next_states,
+                                                      next_finished,
+                                                      next_sequence_lengths)
 
         control_flow.increment(x=step_idx_tensor, value=1.0, in_place=True)
         step_idx += 1
@@ -1418,8 +1424,9 @@ def _maybe_copy(state, new_state, step_mask):
     final_states = states
 
     try:
-        final_outputs, final_states = decoder.finalize(
-            final_outputs, final_states, sequence_lengths)
+        final_outputs, final_states = decoder.finalize(final_outputs,
+                                                       final_states,
+                                                       sequence_lengths)
     except NotImplementedError:
         pass
 
@@ -1442,15 +1449,17 @@ def _dynamic_decode_declarative(decoder,
                                 return_length=False,
                                 **kwargs):
     initial_inputs, initial_states, initial_finished = decoder.initialize(inits)
-    global_inputs, global_states, global_finished = (
-        initial_inputs, initial_states, initial_finished)
+    global_inputs, global_states, global_finished = (initial_inputs,
+                                                     initial_states,
+                                                     initial_finished)
     global_finished.stop_gradient = True
     step_idx = tensor.fill_constant(shape=[1], dtype="int64", value=0)
 
     cond = control_flow.logical_not((nn.reduce_all(initial_finished)))
     if max_step_num is not None:
-        max_step_num = tensor.fill_constant(
-            shape=[1], dtype="int64", value=max_step_num)
+        max_step_num = tensor.fill_constant(shape=[1],
+                                            dtype="int64",
+                                            value=max_step_num)
     while_op = control_flow.While(cond, is_test=is_test)
 
     sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64")
@@ -1479,8 +1488,9 @@ def _maybe_copy(state, new_state, step_mask):
             # to sum(bool) error.
             step_mask.stop_gradient = True
         new_state = nn.elementwise_mul(
-            state, step_mask, axis=0) - nn.elementwise_mul(
-                new_state, (step_mask - 1), axis=0)
+            state, step_mask, axis=0) - nn.elementwise_mul(new_state,
+                                                           (step_mask - 1),
+                                                           axis=0)
         if convert_dtype(state_dtype) in ["bool"]:
             new_state = tensor.cast(new_state, dtype=state_dtype)
         return new_state
@@ -1516,14 +1526,14 @@ def _create_array_out_of_while(dtype):
                                                     global_finished)
             next_sequence_lengths = nn.elementwise_add(
                 sequence_lengths,
-                tensor.cast(
-                    control_flow.logical_not(global_finished),
-                    sequence_lengths.dtype))
+                tensor.cast(control_flow.logical_not(global_finished),
+                            sequence_lengths.dtype))
             if impute_finished:  # rectify the states for the finished.
                 next_states = map_structure(
                     lambda x, y: _maybe_copy(x, y, global_finished),
                     states,
-                    next_states, )
+                    next_states,
+                )
         else:
             warnings.warn(
                 "`next_states` has no `lengths` attribute, the returned `sequence_lengths` would be all zeros."
@@ -1571,8 +1581,9 @@ def _create_array_out_of_while(dtype):
             states_arrays)
 
     try:
-        final_outputs, final_states = decoder.finalize(
-            final_outputs, final_states, sequence_lengths)
+        final_outputs, final_states = decoder.finalize(final_outputs,
+                                                       final_states,
+                                                       sequence_lengths)
     except NotImplementedError:
         pass
 
@@ -1821,8 +1832,9 @@ def initialize(self):
         """
         init_finished = control_flow.equal(
             self.sequence_length,
-            tensor.fill_constant(
-                shape=[1], dtype=self.sequence_length.dtype, value=0))
+            tensor.fill_constant(shape=[1],
+                                 dtype=self.sequence_length.dtype,
+                                 value=0))
         # TODO: support zero length
         init_inputs = map_structure(
             lambda x: x[0] if self.time_major else x[:, 0], self.inputs)
@@ -1879,9 +1891,8 @@ def next_inputs(self, time, outputs, states, sample_ids):
                 shape `[batch_size]`.
         """
         # TODO: compatibility of int32 and int64
-        time = tensor.cast(
-            time,
-            "int32") if convert_dtype(time.dtype) not in ["int32"] else time
+        time = tensor.cast(time, "int32") if convert_dtype(
+            time.dtype) not in ["int32"] else time
         if self.sequence_length.dtype != time.dtype:
             self.sequence_length = tensor.cast(self.sequence_length, time.dtype)
         next_time = time + 1
@@ -1889,10 +1900,11 @@ def next_inputs(self, time, outputs, states, sample_ids):
 
         def _slice(x):  # TODO: use Variable.__getitem__
             axes = [0 if self.time_major else 1]
-            return nn.squeeze(
-                nn.slice(
-                    x, axes=axes, starts=[next_time], ends=[next_time + 1]),
-                axes=axes)
+            return nn.squeeze(nn.slice(x,
+                                       axes=axes,
+                                       starts=[next_time],
+                                       ends=[next_time + 1]),
+                              axes=axes)
 
         next_inputs = map_structure(_slice, self.inputs_)
         return finished, next_inputs, states
@@ -1950,8 +1962,9 @@ def __init__(self, embedding_fn, start_tokens, end_token):
         """
         self.embedding_fn = embedding_fn
         self.start_tokens = start_tokens
-        self.end_token = tensor.fill_constant(
-            shape=[1], dtype="int64", value=end_token)
+        self.end_token = tensor.fill_constant(shape=[1],
+                                              dtype="int64",
+                                              value=end_token)
 
     def initialize(self):
         r"""
@@ -2125,8 +2138,9 @@ def sample(self, time, outputs, states):
         # not pass to probs, since sampling_id op does not have corresponding
         # grad op and thus can not pass.
         probs.stop_gradient = True
-        sample_ids = nn.sampling_id(
-            probs, seed=self.seed, dtype=self.start_tokens.dtype)
+        sample_ids = nn.sampling_id(probs,
+                                    seed=self.seed,
+                                    dtype=self.start_tokens.dtype)
         return sample_ids
 
 
@@ -2253,14 +2267,15 @@ def step(self, time, inputs, states, **kwargs):
         cell_outputs, cell_states = self.cell(inputs, states, **kwargs)
         if self.output_fn is not None:
             cell_outputs = self.output_fn(cell_outputs)
-        sample_ids = self.helper.sample(
-            time=time, outputs=cell_outputs, states=cell_states)
+        sample_ids = self.helper.sample(time=time,
+                                        outputs=cell_outputs,
+                                        states=cell_states)
         sample_ids.stop_gradient = True
-        (finished, next_inputs, next_states) = self.helper.next_inputs(
-            time=time,
-            outputs=cell_outputs,
-            states=cell_states,
-            sample_ids=sample_ids)
+        (finished, next_inputs,
+         next_states) = self.helper.next_inputs(time=time,
+                                                outputs=cell_outputs,
+                                                states=cell_states,
+                                                sample_ids=sample_ids)
         outputs = self.OutputWrapper(cell_outputs, sample_ids)
         return (outputs, next_states, next_inputs, finished)
 
@@ -2396,13 +2411,16 @@ def dynamic_lstm(input,
 
     helper = LayerHelper('lstm', **locals())
     size = size // 4
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+    weight = helper.create_parameter(attr=helper.param_attr,
+                                     shape=[size, 4 * size],
+                                     dtype=dtype)
     bias_size = [1, 7 * size]
     if not use_peepholes:
         bias_size[1] = 4 * size
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+    bias = helper.create_parameter(attr=helper.bias_attr,
+                                   shape=bias_size,
+                                   dtype=dtype,
+                                   is_bias=True)
 
     hidden = helper.create_variable_for_type_inference(dtype)
     cell = helper.create_variable_for_type_inference(dtype)
@@ -2419,29 +2437,27 @@ def dynamic_lstm(input,
             'The shape of c0 should be (batch_size, %d)' % size
         inputs['C0'] = c_0
 
-    helper.append_op(
-        type='lstm',
-        inputs=inputs,
-        outputs={
-            'Hidden': hidden,
-            'Cell': cell,
-            'BatchGate': batch_gate,
-            'BatchCellPreAct': batch_cell_pre_act
-        },
-        attrs={
-            'use_peepholes': use_peepholes,
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'cell_activation': cell_activation,
-            'candidate_activation': candidate_activation
-        })
+    helper.append_op(type='lstm',
+                     inputs=inputs,
+                     outputs={
+                         'Hidden': hidden,
+                         'Cell': cell,
+                         'BatchGate': batch_gate,
+                         'BatchCellPreAct': batch_cell_pre_act
+                     },
+                     attrs={
+                         'use_peepholes': use_peepholes,
+                         'is_reverse': is_reverse,
+                         'gate_activation': gate_activation,
+                         'cell_activation': cell_activation,
+                         'candidate_activation': candidate_activation
+                     })
     return hidden, cell
 
 
-@deprecated(
-    since='2.0.0',
-    update_to='paddle.nn.LSTM',
-    reason="This API may occur CUDNN errors.")
+@deprecated(since='2.0.0',
+            update_to='paddle.nn.LSTM',
+            reason="This API may occur CUDNN errors.")
 def lstm(input,
          init_h,
          init_c,
@@ -2580,11 +2596,10 @@ def lstm(input,
         weight_size += input_weight_size + hidden_weight_size
         weight_size += hidden_size * 8 * num_dirrection
 
-    weight = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[weight_size],
-        dtype=dtype,
-        default_initializer=default_initializer)
+    weight = helper.create_parameter(attr=helper.param_attr,
+                                     shape=[weight_size],
+                                     dtype=dtype,
+                                     default_initializer=default_initializer)
 
     out = helper.create_variable_for_type_inference(dtype)
     last_h = helper.create_variable_for_type_inference(dtype)
@@ -2595,30 +2610,29 @@ def lstm(input,
         dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
     state_out.persistable = True
 
-    helper.append_op(
-        type='cudnn_lstm',
-        inputs={
-            'Input': input,
-            'InitH': init_h,
-            'InitC': init_c,
-            'W': weight,
-        },
-        outputs={
-            'Out': out,
-            'LastH': last_h,
-            'LastC': last_c,
-            'Reserve': reserve,
-            'StateOut': state_out,
-        },
-        attrs={
-            'is_bidirec': is_bidirec,
-            'input_size': input_size,
-            'hidden_size': hidden_size,
-            'num_layers': num_layers,
-            'is_test': is_test,
-            'dropout_prob': dropout_prob,
-            'seed': seed,
-        })
+    helper.append_op(type='cudnn_lstm',
+                     inputs={
+                         'Input': input,
+                         'InitH': init_h,
+                         'InitC': init_c,
+                         'W': weight,
+                     },
+                     outputs={
+                         'Out': out,
+                         'LastH': last_h,
+                         'LastC': last_c,
+                         'Reserve': reserve,
+                         'StateOut': state_out,
+                     },
+                     attrs={
+                         'is_bidirec': is_bidirec,
+                         'input_size': input_size,
+                         'hidden_size': hidden_size,
+                         'num_layers': num_layers,
+                         'is_test': is_test,
+                         'dropout_prob': dropout_prob,
+                         'seed': seed,
+                     })
     return out, last_h, last_c
 
 
@@ -2781,15 +2795,19 @@ def dynamic_lstmp(input,
 
     helper = LayerHelper('lstmp', **locals())
     size = size // 4
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
-    proj_weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
+    weight = helper.create_parameter(attr=helper.param_attr,
+                                     shape=[proj_size, 4 * size],
+                                     dtype=dtype)
+    proj_weight = helper.create_parameter(attr=helper.param_attr,
+                                          shape=[size, proj_size],
+                                          dtype=dtype)
     bias_size = [1, 7 * size]
     if not use_peepholes:
         bias_size[1] = 4 * size
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+    bias = helper.create_parameter(attr=helper.bias_attr,
+                                   shape=bias_size,
+                                   dtype=dtype,
+                                   is_bias=True)
 
     projection = helper.create_variable_for_type_inference(dtype)
     cell = helper.create_variable_for_type_inference(dtype)
@@ -2818,26 +2836,25 @@ def dynamic_lstmp(input,
     if proj_clip:
         assert proj_clip >= 0, "proj_clip should not be negative."
 
-    helper.append_op(
-        type='lstmp',
-        inputs=inputs,
-        outputs={
-            'Projection': projection,
-            'Cell': cell,
-            'BatchHidden': batch_hidden,
-            'BatchGate': batch_gate,
-            'BatchCellPreAct': batch_cell_pre_act
-        },
-        attrs={
-            'use_peepholes': use_peepholes,
-            'cell_clip': cell_clip,
-            'proj_clip': proj_clip,
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'cell_activation': cell_activation,
-            'candidate_activation': candidate_activation,
-            'proj_activation': proj_activation
-        })
+    helper.append_op(type='lstmp',
+                     inputs=inputs,
+                     outputs={
+                         'Projection': projection,
+                         'Cell': cell,
+                         'BatchHidden': batch_hidden,
+                         'BatchGate': batch_gate,
+                         'BatchCellPreAct': batch_cell_pre_act
+                     },
+                     attrs={
+                         'use_peepholes': use_peepholes,
+                         'cell_clip': cell_clip,
+                         'proj_clip': proj_clip,
+                         'is_reverse': is_reverse,
+                         'gate_activation': gate_activation,
+                         'cell_activation': cell_activation,
+                         'candidate_activation': candidate_activation,
+                         'proj_activation': proj_activation
+                     })
     return projection, cell
 
 
@@ -2969,16 +2986,19 @@ def dynamic_gru(input,
     helper = LayerHelper('gru', **locals())
     dtype = helper.input_dtype()
 
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
-    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    weight = helper.create_parameter(attr=helper.param_attr,
+                                     shape=[size, 3 * size],
+                                     dtype=dtype)
+    bias = helper.create_parameter(attr=helper.bias_attr,
+                                   shape=[1, 3 * size],
+                                   dtype=dtype,
+                                   is_bias=True)
     batch_size = input.shape[0]
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
     if h_0:
         assert h_0.shape == (
-            batch_size, size
-        ), 'The shape of h0 should be(batch_size, %d)' % size
+            batch_size,
+            size), 'The shape of h0 should be(batch_size, %d)' % size
         inputs['H0'] = h_0
 
     hidden = helper.create_variable_for_type_inference(dtype)
@@ -2986,21 +3006,20 @@ def dynamic_gru(input,
     batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
     batch_hidden = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type='gru',
-        inputs=inputs,
-        outputs={
-            'Hidden': hidden,
-            'BatchGate': batch_gate,
-            'BatchResetHiddenPrev': batch_reset_hidden_prev,
-            'BatchHidden': batch_hidden
-        },
-        attrs={
-            'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
-            'activation': candidate_activation,
-            'origin_mode': origin_mode
-        })
+    helper.append_op(type='gru',
+                     inputs=inputs,
+                     outputs={
+                         'Hidden': hidden,
+                         'BatchGate': batch_gate,
+                         'BatchResetHiddenPrev': batch_reset_hidden_prev,
+                         'BatchHidden': batch_hidden
+                     },
+                     attrs={
+                         'is_reverse': is_reverse,
+                         'gate_activation': gate_activation,
+                         'activation': candidate_activation,
+                         'origin_mode': origin_mode
+                     })
     return hidden
 
 
@@ -3120,7 +3139,8 @@ def gru_unit(input,
         identity=0,
         sigmoid=1,
         tanh=2,
-        relu=3, )
+        relu=3,
+    )
     activation = activation_dict[activation]
     gate_activation = activation_dict[gate_activation]
 
@@ -3129,8 +3149,9 @@ def gru_unit(input,
     size = size // 3
 
     # create weight
-    weight = helper.create_parameter(
-        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    weight = helper.create_parameter(attr=helper.param_attr,
+                                     shape=[size, 3 * size],
+                                     dtype=dtype)
 
     gate = helper.create_variable_for_type_inference(dtype)
     reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
@@ -3139,8 +3160,10 @@ def gru_unit(input,
     # create bias
     if helper.bias_attr:
         bias_size = [1, 3 * size]
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+        bias = helper.create_parameter(attr=helper.bias_attr,
+                                       shape=bias_size,
+                                       dtype=dtype,
+                                       is_bias=True)
         inputs['Bias'] = bias
 
     helper.append_op(
@@ -3384,16 +3407,19 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
     sentence_scores = helper.create_variable_for_type_inference(
         dtype=scores.dtype)
 
-    helper.append_op(
-        type="beam_search_decode",
-        inputs={"Ids": ids,
-                "Scores": scores},
-        outputs={
-            "SentenceIds": sentence_ids,
-            "SentenceScores": sentence_scores
-        },
-        attrs={"beam_size": beam_size,
-               "end_id": end_id})
+    helper.append_op(type="beam_search_decode",
+                     inputs={
+                         "Ids": ids,
+                         "Scores": scores
+                     },
+                     outputs={
+                         "SentenceIds": sentence_ids,
+                         "SentenceScores": sentence_scores
+                     },
+                     attrs={
+                         "beam_size": beam_size,
+                         "end_id": end_id
+                     })
 
     return sentence_ids, sentence_scores
 
@@ -3491,8 +3517,8 @@ def lstm_unit(x_t,
     check_variable_and_dtype(x_t, 'x_t', ['float32', 'float64'], 'lstm_unit')
     check_variable_and_dtype(hidden_t_prev, 'hidden_t_prev',
                              ['float32', 'float64'], 'lstm_unit')
-    check_variable_and_dtype(cell_t_prev, 'cell_t_prev',
-                             ['float32', 'float64'], 'lstm_unit')
+    check_variable_and_dtype(cell_t_prev, 'cell_t_prev', ['float32', 'float64'],
+                             'lstm_unit')
     if len(x_t.shape) != 2:
         raise ValueError("Rank of x_t must be 2.")
 
@@ -3524,12 +3550,15 @@ def lstm_unit(x_t,
     c = helper.create_variable_for_type_inference(dtype)
     h = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type='lstm_unit',
-        inputs={"X": fc_out,
-                "C_prev": cell_t_prev},
-        outputs={"C": c,
-                 "H": h},
-        attrs={"forget_bias": forget_bias})
+    helper.append_op(type='lstm_unit',
+                     inputs={
+                         "X": fc_out,
+                         "C_prev": cell_t_prev
+                     },
+                     outputs={
+                         "C": c,
+                         "H": h
+                     },
+                     attrs={"forget_bias": forget_bias})
 
     return h, c
diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py
index 702e38f3d2368..4a213a7a146c8 100644
--- a/python/paddle/fluid/layers/sequence_lod.py
+++ b/python/paddle/fluid/layers/sequence_lod.py
@@ -155,24 +155,24 @@ def sequence_conv(input,
     helper = LayerHelper('sequence_conv', **locals())
     dtype = helper.input_dtype()
     filter_shape = [filter_size * input.shape[1], num_filters]
-    filter_param = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    filter_param = helper.create_parameter(attr=helper.param_attr,
+                                           shape=filter_shape,
+                                           dtype=dtype)
     pre_bias = helper.create_variable_for_type_inference(dtype)
     if padding_start is None:
         padding_start = -int(filter_size // 2)
 
-    helper.append_op(
-        type='sequence_conv',
-        inputs={
-            'X': [input],
-            'Filter': [filter_param],
-        },
-        outputs={"Out": pre_bias},
-        attrs={
-            'contextStride': filter_stride,
-            'contextStart': padding_start,
-            'contextLength': filter_size,
-        })
+    helper.append_op(type='sequence_conv',
+                     inputs={
+                         'X': [input],
+                         'Filter': [filter_param],
+                     },
+                     outputs={"Out": pre_bias},
+                     attrs={
+                         'contextStride': filter_stride,
+                         'contextStart': padding_start,
+                         'contextLength': filter_size,
+                     })
     pre_act = helper.append_bias_op(pre_bias)
     return helper.append_activation(pre_act)
 
@@ -255,11 +255,10 @@ def sequence_softmax(input, use_cudnn=False, name=None):
                              'sequence_softmax')
     dtype = helper.input_dtype()
     softmax_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="sequence_softmax",
-        inputs={"X": input},
-        outputs={"Out": softmax_out},
-        attrs={"use_cudnn": use_cudnn})
+    helper.append_op(type="sequence_softmax",
+                     inputs={"X": input},
+                     outputs={"Out": softmax_out},
+                     attrs={"use_cudnn": use_cudnn})
     return softmax_out
 
 
@@ -359,16 +358,17 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0):
     pool_out = helper.create_variable_for_type_inference(dtype)
     max_index = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type="sequence_pool",
-        inputs={"X": input},
-        outputs={"Out": pool_out,
-                 "MaxIndex": max_index},
-        attrs={
-            "pooltype": pool_type.upper(),
-            "is_test": is_test,
-            "pad_value": pad_value
-        })
+    helper.append_op(type="sequence_pool",
+                     inputs={"X": input},
+                     outputs={
+                         "Out": pool_out,
+                         "MaxIndex": max_index
+                     },
+                     attrs={
+                         "pooltype": pool_type.upper(),
+                         "is_test": is_test,
+                         "pad_value": pad_value
+                     })
 
     # when pool_type is max, variable max_index is initialized,
     # so we stop the gradient explicitly here
@@ -437,8 +437,9 @@ def sequence_concat(input, name=None):
                                  'fluid.layers.sequence_concat')
 
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
-    helper.append_op(
-        type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]})
+    helper.append_op(type='sequence_concat',
+                     inputs={'X': input},
+                     outputs={'Out': [out]})
     return out
 
 
@@ -639,12 +640,13 @@ def sequence_slice(input, offset, length, name=None):
     offset.stop_gradient = True
     length.stop_gradient = True
 
-    helper.append_op(
-        type="sequence_slice",
-        inputs={"X": input,
-                "Offset": offset,
-                "Length": length},
-        outputs={"Out": out})
+    helper.append_op(type="sequence_slice",
+                     inputs={
+                         "X": input,
+                         "Offset": offset,
+                         "Length": length
+                     },
+                     outputs={"Out": out})
 
     return out
 
@@ -777,12 +779,13 @@ def sequence_expand(x, y, ref_level=-1, name=None):
     helper = LayerHelper('sequence_expand', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     tmp = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='sequence_expand',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': tmp},
-        attrs={'ref_level': ref_level})
+    helper.append_op(type='sequence_expand',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': tmp},
+                     attrs={'ref_level': ref_level})
     return tmp
 
 
@@ -899,11 +902,12 @@ def sequence_expand_as(x, y, name=None):
     helper = LayerHelper('sequence_expand_as', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     tmp = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='sequence_expand_as',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': tmp})
+    helper.append_op(type='sequence_expand_as',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': tmp})
     return tmp
 
 
@@ -1012,13 +1016,16 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
 
     if maxlen is None:
         maxlen = -1
-    helper.append_op(
-        type='sequence_pad',
-        inputs={'X': x,
-                'PadValue': pad_value},
-        outputs={'Out': out,
-                 'Length': length},
-        attrs={'padded_length': maxlen})
+    helper.append_op(type='sequence_pad',
+                     inputs={
+                         'X': x,
+                         'PadValue': pad_value
+                     },
+                     outputs={
+                         'Out': out,
+                         'Length': length
+                     },
+                     attrs={'padded_length': maxlen})
     return out, length
 
 
@@ -1091,11 +1098,12 @@ def sequence_unpad(x, length, name=None):
 
     length.stop_gradient = True
 
-    helper.append_op(
-        type='sequence_unpad',
-        inputs={'X': x,
-                'Length': length},
-        outputs={'Out': out})
+    helper.append_op(type='sequence_unpad',
+                     inputs={
+                         'X': x,
+                         'Length': length
+                     },
+                     outputs={'Out': out})
     return out
 
 
@@ -1155,11 +1163,10 @@ def sequence_reshape(input, new_dim):
                              ['float32', 'float64', 'int32', 'int64'],
                              'fluid.layers.sequence_reshape')
     out = helper.create_variable_for_type_inference(helper.input_dtype())
-    helper.append_op(
-        type='sequence_reshape',
-        inputs={'X': [input]},
-        outputs={'Out': [out]},
-        attrs={'new_dim': new_dim})
+    helper.append_op(type='sequence_reshape',
+                     inputs={'X': [input]},
+                     outputs={'Out': [out]},
+                     attrs={'new_dim': new_dim})
     return out
 
 
@@ -1245,12 +1252,13 @@ def sequence_scatter(input, index, updates, name=None):
 
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="sequence_scatter",
-        inputs={"X": input,
-                "Ids": index,
-                "Updates": updates},
-        outputs={"Out": out})
+    helper.append_op(type="sequence_scatter",
+                     inputs={
+                         "X": input,
+                         "Ids": index,
+                         "Updates": updates
+                     },
+                     outputs={"Out": out})
     return out
 
 
@@ -1312,14 +1320,15 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
     check_variable_and_dtype(input, 'input', ['int32', 'int64'],
                              'sequence_enumerate')
     helper = LayerHelper('sequence_enumerate', **locals())
-    out = helper.create_variable_for_type_inference(
-        helper.input_dtype(), stop_gradient=True)
-    helper.append_op(
-        type='sequence_enumerate',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'win_size': win_size,
-               'pad_value': pad_value})
+    out = helper.create_variable_for_type_inference(helper.input_dtype(),
+                                                    stop_gradient=True)
+    helper.append_op(type='sequence_enumerate',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'win_size': win_size,
+                         'pad_value': pad_value
+                     })
     return out
 
 
@@ -1441,9 +1450,8 @@ def sequence_reverse(x, name=None):
                              'fluid.layers.sequence_reverse')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="sequence_reverse",
-        inputs={"X": x},
-        outputs={"Y": out},
-        attrs=dict())
+    helper.append_op(type="sequence_reverse",
+                     inputs={"X": x},
+                     outputs={"Y": out},
+                     attrs=dict())
     return out
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 3b1fcc15ab95f..3c2b044277625 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -90,8 +90,9 @@ def create_tensor(dtype, name=None, persistable=False):
         'int64'
     ], 'create_tensor')
     helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(
-        name=helper.name, dtype=dtype, persistable=persistable)
+    return helper.create_variable(name=helper.name,
+                                  dtype=dtype,
+                                  persistable=persistable)
 
 
 def create_parameter(shape,
@@ -148,8 +149,7 @@ def create_parameter(shape,
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:
         attr = ParamAttr(name=name)
-    return helper.create_parameter(attr, shape,
-                                   convert_dtype(dtype), is_bias,
+    return helper.create_parameter(attr, shape, convert_dtype(dtype), is_bias,
                                    default_initializer)
 
 
@@ -206,15 +206,14 @@ def create_global_var(shape,
     ], 'create_global_var')
 
     helper = LayerHelper("global_var", **locals())
-    var = helper.create_global_variable(
-        dtype=dtype,
-        shape=shape,
-        persistable=persistable,
-        name=name,
-        stop_gradient=True)
-    helper.set_variable_initializer(
-        var, initializer=Constant(
-            value=float(value), force_cpu=force_cpu))
+    var = helper.create_global_variable(dtype=dtype,
+                                        shape=shape,
+                                        persistable=persistable,
+                                        name=name,
+                                        stop_gradient=True)
+    helper.set_variable_initializer(var,
+                                    initializer=Constant(value=float(value),
+                                                         force_cpu=force_cpu))
 
     return var
 
@@ -266,12 +265,13 @@ def cast(x, dtype):
     helper = LayerHelper('cast', **locals())
     out = helper.create_variable_for_type_inference(
         dtype=dtype, stop_gradient=x.stop_gradient)
-    helper.append_op(
-        type='cast',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'in_dtype': x.dtype,
-               'out_dtype': out.dtype})
+    helper.append_op(type='cast',
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]},
+                     attrs={
+                         'in_dtype': x.dtype,
+                         'out_dtype': out.dtype
+                     })
     return out
 
 
@@ -352,7 +352,8 @@ def concat(input, axis=0, name=None):
                 'concat')
             if x.dtype != input[0].dtype:
                 raise TypeError(
-                    "All the Tensors in the input must have the same data type.")
+                    "All the Tensors in the input must have the same data type."
+                )
     else:
         input = [input]
     check_type(axis, 'axis', (int, Variable), 'concat')
@@ -360,7 +361,8 @@ def concat(input, axis=0, name=None):
     if isinstance(axis, Variable):
         check_dtype(
             axis.dtype, 'axis', ['int32', 'int64'], 'concat',
-            "The data type of axis must be int32 or int64 when axis is a Tensor")
+            "The data type of axis must be int32 or int64 when axis is a Tensor"
+        )
 
     helper = LayerHelper('concat', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
@@ -373,13 +375,16 @@ def concat(input, axis=0, name=None):
         assert len(input) == 1, "If the elements of 'input' in concat are Variable(LoDTensorArray), " \
                 "number of the elements must be 1, but received %s." % len(input)
         out_index = helper.create_variable_for_type_inference(dtype="int32")
-        helper.append_op(
-            type='tensor_array_to_tensor',
-            inputs={'X': input[0]},
-            outputs={'Out': [out],
-                     'OutIndex': [out_index]},
-            attrs={'axis': axis,
-                   'use_stack': False})
+        helper.append_op(type='tensor_array_to_tensor',
+                         inputs={'X': input[0]},
+                         outputs={
+                             'Out': [out],
+                             'OutIndex': [out_index]
+                         },
+                         attrs={
+                             'axis': axis,
+                             'use_stack': False
+                         })
     else:
         inputs = {'X': input}
         attrs = {}
@@ -389,8 +394,10 @@ def concat(input, axis=0, name=None):
         else:
             attrs['axis'] = axis
 
-        helper.append_op(
-            type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs)
+        helper.append_op(type='concat',
+                         inputs=inputs,
+                         outputs={'Out': [out]},
+                         attrs=attrs)
     return out
 
 
@@ -493,13 +500,16 @@ def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False):
     helper = LayerHelper('tensor_array_to_tensor', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     out_index = helper.create_variable_for_type_inference(dtype="int32")
-    helper.append_op(
-        type='tensor_array_to_tensor',
-        inputs={'X': input},
-        outputs={'Out': [out],
-                 'OutIndex': [out_index]},
-        attrs={'axis': axis,
-               'use_stack': use_stack})
+    helper.append_op(type='tensor_array_to_tensor',
+                     inputs={'X': input},
+                     outputs={
+                         'Out': [out],
+                         'OutIndex': [out_index]
+                     },
+                     attrs={
+                         'axis': axis,
+                         'use_stack': use_stack
+                     })
     return out, out_index
 
 
@@ -567,14 +577,14 @@ def sums(input, out=None):
         out = helper.create_variable_for_type_inference(
             dtype=helper.input_dtype())
     else:
-        check_variable_and_dtype(
-            out, "out", ['float32', 'float64', 'int32', 'int64'], 'sums')
-
-    helper.append_op(
-        type='sum',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'use_mkldnn': False})
+        check_variable_and_dtype(out, "out",
+                                 ['float32', 'float64', 'int32', 'int64'],
+                                 'sums')
+
+    helper.append_op(type='sum',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={'use_mkldnn': False})
     return out
 
 
@@ -609,8 +619,9 @@ def assign(input, output=None):
           result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]]
     """
     helper = LayerHelper('assign', **locals())
-    check_type(input, 'input', (Variable, numpy.ndarray, list, tuple, float,
-                                int, bool), 'assign')
+    check_type(input, 'input',
+               (Variable, numpy.ndarray, list, tuple, float, int, bool),
+               'assign')
     is_inplace = True if output is not None else False
 
     if numpy.isscalar(input) and not isinstance(input, str):
@@ -641,9 +652,9 @@ def assign(input, output=None):
             if output is None:
                 output = helper.create_variable_for_type_inference(
                     dtype=input.dtype)
-            helper.append_op(
-                type='assign', inputs={'X': [input]},
-                outputs={'Out': [output]})
+            helper.append_op(type='assign',
+                             inputs={'X': [input]},
+                             outputs={'Out': [output]})
     elif isinstance(input, numpy.ndarray):
         # Not support [var, var, ...] currently.
         if len(input.shape) > 0 and any(isinstance(x, Variable) for x in input):
@@ -682,18 +693,16 @@ def assign(input, output=None):
         if output is None:
             output = helper.create_variable_for_type_inference(dtype=dtype)
         if _non_static_mode():
-            _C_ops.assign_value(output, 'shape',
-                                list(input.shape), 'dtype', dtype, value_name,
-                                values)
+            _C_ops.assign_value(output, 'shape', list(input.shape), 'dtype',
+                                dtype, value_name, values)
         else:
-            helper.append_op(
-                type='assign_value',
-                outputs={'Out': [output]},
-                attrs={
-                    'dtype': dtype,
-                    'shape': list(input.shape),
-                    value_name: values
-                })
+            helper.append_op(type='assign_value',
+                             outputs={'Out': [output]},
+                             attrs={
+                                 'dtype': dtype,
+                                 'shape': list(input.shape),
+                                 value_name: values
+                             })
 
     if is_inplace and _non_static_mode():
         output._bump_inplace_version()
@@ -769,8 +778,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
                 for item in shape:
                     if not isinstance(item, Variable):
                         shape = list(
-                            map(lambda x: x.numpy().flat[0] if isinstance(x, Variable) else x,
-                                shape))
+                            map(
+                                lambda x: x.numpy().flat[0]
+                                if isinstance(x, Variable) else x, shape))
                         break
 
             if not isinstance(dtype, core.VarDesc.VarType):
@@ -790,10 +800,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
                 else:
                     attrs['str_value'] = str(float(value.numpy().item(0)))
 
-            _C_ops.fill_constant(out, 'value',
-                                 float(value), 'force_cpu', force_cpu, 'dtype',
-                                 out.dtype, 'str_value', attrs['str_value'],
-                                 'shape', shape)
+            _C_ops.fill_constant(out, 'value', float(value), 'force_cpu',
+                                 force_cpu, 'dtype', out.dtype, 'str_value',
+                                 attrs['str_value'], 'shape', shape)
             out.stop_gradient = True
             return out
 
@@ -816,18 +825,19 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
                                  'fill_constant')
 
     helper = LayerHelper("fill_constant", **locals())
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type='fill_constant')
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='fill_constant')
 
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=dtype)
     attrs['dtype'] = out.dtype
-    helper.append_op(
-        type='fill_constant',
-        inputs=inputs,
-        outputs={'Out': [out]},
-        attrs=attrs,
-        stop_gradient=True)
+    helper.append_op(type='fill_constant',
+                     inputs=inputs,
+                     outputs={'Out': [out]},
+                     attrs=attrs,
+                     stop_gradient=True)
     out.stop_gradient = True
     return out
 
@@ -882,8 +892,9 @@ def fill_constant_batch_size_like(input,
         place = _current_expected_place()
         if force_cpu:
             place = core.CPUPlace()
-        out = _C_ops.final_state_full_batch_size_like(
-            input, shape, dtype, value, input_dim_idx, output_dim_idx, place)
+        out = _C_ops.final_state_full_batch_size_like(input, shape, dtype,
+                                                      value, input_dim_idx,
+                                                      output_dim_idx, place)
         out.stop_gradient = True
         return out
 
@@ -901,11 +912,10 @@ def fill_constant_batch_size_like(input,
         attrs['str_value'] = str(int(value))
     else:
         attrs['str_value'] = str(float(value))
-    helper.append_op(
-        type='fill_constant_batch_size_like',
-        inputs={'Input': input},
-        outputs={'Out': [out]},
-        attrs=attrs)
+    helper.append_op(type='fill_constant_batch_size_like',
+                     inputs={'Input': input},
+                     outputs={'Out': [out]},
+                     attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -968,11 +978,10 @@ def argmin(x, axis=0):
         'argmin')
     helper = LayerHelper("arg_min", **locals())
     out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
-    helper.append_op(
-        type='arg_min',
-        inputs={'X': x},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
+    helper.append_op(type='arg_min',
+                     inputs={'X': x},
+                     outputs={'Out': [out]},
+                     attrs={'axis': axis})
     out.stop_gradient = True
     return out
 
@@ -1031,11 +1040,10 @@ def argmax(x, axis=0):
         'argmax')
     helper = LayerHelper("arg_max", **locals())
     out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
-    helper.append_op(
-        type='arg_max',
-        inputs={'X': x},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
+    helper.append_op(type='arg_max',
+                     inputs={'X': x},
+                     outputs={'Out': [out]},
+                     attrs={'axis': axis})
     out.stop_gradient = True
     return out
 
@@ -1118,17 +1126,20 @@ def argsort(input, axis=-1, descending=False, name=None):
         input, 'input',
         ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], 'argsort')
     helper = LayerHelper("argsort", **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=input.dtype, stop_gradient=True)
-    ids = helper.create_variable_for_type_inference(
-        VarDesc.VarType.INT64, stop_gradient=True)
-    helper.append_op(
-        type='argsort',
-        inputs={'X': input},
-        outputs={'Out': out,
-                 'Indices': ids},
-        attrs={'axis': axis,
-               'descending': descending})
+    out = helper.create_variable_for_type_inference(dtype=input.dtype,
+                                                    stop_gradient=True)
+    ids = helper.create_variable_for_type_inference(VarDesc.VarType.INT64,
+                                                    stop_gradient=True)
+    helper.append_op(type='argsort',
+                     inputs={'X': input},
+                     outputs={
+                         'Out': out,
+                         'Indices': ids
+                     },
+                     attrs={
+                         'axis': axis,
+                         'descending': descending
+                     })
     return out, ids
 
 
@@ -1254,18 +1265,18 @@ def reverse(x, axis):
 
           reversed_tensor_array = fluid.layers.reverse(tensor_array, 0) # {[[3, 4, 5]], [[0, 1, 2]]}
     """
-    check_variable_and_dtype(
-        x, 'x', ('float32', 'float64', 'int32', 'int64', 'uint8'), 'reverse')
+    check_variable_and_dtype(x, 'x',
+                             ('float32', 'float64', 'int32', 'int64', 'uint8'),
+                             'reverse')
     check_type(axis, 'axis', (int, tuple, list), 'reverse')
     if isinstance(axis, int):
         axis = [axis]
     helper = LayerHelper("reverse", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='reverse',
-        inputs={'X': x},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
+    helper.append_op(type='reverse',
+                     inputs={'X': x},
+                     outputs={'Out': [out]},
+                     attrs={'axis': axis})
     return out
 
 
@@ -1281,12 +1292,13 @@ def save(x, file_path, overwrite=True):
             error will be thrown.
     """
     helper = LayerHelper("save", **locals())
-    helper.append_op(
-        type="save",
-        inputs={"input": x},
-        outputs={},
-        args={"file_path": file_path,
-              "overwrite": overwrite})
+    helper.append_op(type="save",
+                     inputs={"input": x},
+                     outputs={},
+                     args={
+                         "file_path": file_path,
+                         "overwrite": overwrite
+                     })
 
 
 def save_combine(x, file_path, overwrite=True):
@@ -1318,12 +1330,13 @@ def save_combine(x, file_path, overwrite=True):
             normed = fluid.layers.save_combine([v1, v2], file_path="output")
     """
     helper = LayerHelper("save_combine", **locals())
-    helper.append_op(
-        type="save_combine",
-        inputs={"input": x},
-        outputs={},
-        args={"file_path": file_path,
-              "overwrite": overwrite})
+    helper.append_op(type="save_combine",
+                     inputs={"input": x},
+                     outputs={},
+                     args={
+                         "file_path": file_path,
+                         "overwrite": overwrite
+                     })
 
 
 def load_combine(out, file_path):
@@ -1335,11 +1348,10 @@ def load_combine(out, file_path):
         file_path(str): The path of the disk file.
     """
     helper = LayerHelper("load_combine", **locals())
-    helper.append_op(
-        type="load_combine",
-        inputs={},
-        output={"Out": out},
-        args={"file_path": file_path})
+    helper.append_op(type="load_combine",
+                     inputs={},
+                     output={"Out": out},
+                     args={"file_path": file_path})
 
 
 def has_inf(x):
@@ -1520,12 +1532,13 @@ def range(start, end, step, dtype, name=None):
                 'range/arange')
     helper = LayerHelper('range', **locals())
     out = helper.create_variable_for_type_inference(dtype, shape=out_shape)
-    helper.append_op(
-        type='range',
-        inputs={'Start': start,
-                'End': end,
-                'Step': step},
-        outputs={'Out': out})
+    helper.append_op(type='range',
+                     inputs={
+                         'Start': start,
+                         'End': end,
+                         'Step': step
+                     },
+                     outputs={'Out': out})
     out.stop_gradient = True
     if out_shape is not None:
         out.desc.set_shape(out_shape)
@@ -1605,10 +1618,10 @@ def linspace(start, stop, num, dtype=None, name=None):
         check_dtype(num.dtype, 'num', ['int32'], 'linspace')
     check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'],
                 'linspace')
-    if ((stop_dtype == "float64" or start_dtype == "float64") and
-            out_dtype in ["float32", "int32"]) or ((stop_dtype == "int64" or
-                                                    start_dtype == "int64") and
-                                                   out_dtype == "int32"):
+    if ((stop_dtype == "float64" or start_dtype == "float64")
+            and out_dtype in ["float32", "int32"]) or (
+                (stop_dtype == "int64" or start_dtype == "int64")
+                and out_dtype == "int32"):
         raise ValueError(
             "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, "
             "which may cause data type overflows. Please reset attr(dtype) of linspace."
@@ -1616,13 +1629,14 @@ def linspace(start, stop, num, dtype=None, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(
-        type='linspace',
-        inputs={'Start': tensor_start,
-                'Stop': tensor_stop,
-                'Num': tensor_num},
-        attrs={'dtype': dtype},
-        outputs={'Out': [out]})
+    helper.append_op(type='linspace',
+                     inputs={
+                         'Start': tensor_start,
+                         'Stop': tensor_stop,
+                         'Num': tensor_num
+                     },
+                     attrs={'dtype': dtype},
+                     outputs={'Out': [out]})
     if isinstance(num, int):
         out.desc.set_shape((num, ))
     return out
@@ -1655,8 +1669,9 @@ def zeros_like(x, out=None):
 
     """
 
-    check_variable_and_dtype(
-        x, "x", ['bool', 'float32', 'float64', 'int32', 'int64'], 'ones_like')
+    check_variable_and_dtype(x, "x",
+                             ['bool', 'float32', 'float64', 'int32', 'int64'],
+                             'ones_like')
     helper = LayerHelper("zeros_like", **locals())
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -1665,8 +1680,9 @@ def zeros_like(x, out=None):
             out, "out", ['bool', 'float32', 'float64', 'int32', 'int64'],
             'zeros_like')
 
-    helper.append_op(
-        type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
+    helper.append_op(type='fill_zeros_like',
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]})
     out.stop_gradient = True
     return out
 
@@ -1712,8 +1728,9 @@ def diag(diagonal):
 
     out = helper.create_variable_for_type_inference(dtype=diagonal.dtype)
 
-    helper.append_op(
-        type='diag', inputs={'Diagonal': [diagonal]}, outputs={'Out': [out]})
+    helper.append_op(type='diag',
+                     inputs={'Diagonal': [diagonal]},
+                     outputs={'Out': [out]})
 
     out.stop_gradient = True
     return out
@@ -1782,16 +1799,15 @@ def eye(num_rows,
         if not isinstance(num_rows, int) or num_rows < 0:
             raise TypeError("num_rows should be a non-negative int")
         out = helper.create_variable_for_type_inference(dtype=dtype)
-        helper.append_op(
-            type='eye',
-            inputs={},
-            outputs={'Out': [out]},
-            attrs={
-                'num_rows': num_rows,
-                'num_columns': num_columns,
-                'dtype': dtype
-            },
-            stop_gradient=True)
+        helper.append_op(type='eye',
+                         inputs={},
+                         outputs={'Out': [out]},
+                         attrs={
+                             'num_rows': num_rows,
+                             'num_columns': num_columns,
+                             'dtype': dtype
+                         },
+                         stop_gradient=True)
 
     if batch_shape is not None:
         re_shape = [1] * len(batch_shape)
@@ -1838,8 +1854,9 @@ def ones_like(x, out=None):
           data = fluid.layers.ones_like(x) # [1.0, 1.0, 1.0]
 
     """
-    check_variable_and_dtype(
-        x, "x", ['bool', 'float32', 'float64', 'int32', 'int64'], 'ones_like')
+    check_variable_and_dtype(x, "x",
+                             ['bool', 'float32', 'float64', 'int32', 'int64'],
+                             'ones_like')
 
     helper = LayerHelper("ones_like", **locals())
     if out is None:
@@ -1848,11 +1865,10 @@ def ones_like(x, out=None):
         check_variable_and_dtype(
             out, "out", ['bool', 'float32', 'float64', 'int32', 'int64'],
             'ones_like')
-    helper.append_op(
-        type='fill_any_like',
-        inputs={'X': [x]},
-        attrs={'value': 1.0},
-        outputs={'Out': [out]})
+    helper.append_op(type='fill_any_like',
+                     inputs={'X': [x]},
+                     attrs={'value': 1.0},
+                     outputs={'Out': [out]})
     return out
 
 
diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py
index 5d781a437fe8f..ca11727221f23 100644
--- a/python/paddle/fluid/layers/utils.py
+++ b/python/paddle/fluid/layers/utils.py
@@ -48,14 +48,16 @@ def convert_to_list(value, n, name, dtype=int):
         passed.
     """
     if isinstance(value, dtype):
-        return [value, ] * n
+        return [
+            value,
+        ] * n
     else:
         try:
             value_list = list(value)
         except TypeError:
             raise ValueError("The " + name +
-                             "'s type must be list or tuple. Received: " + str(
-                                 value))
+                             "'s type must be list or tuple. Received: " +
+                             str(value))
         if len(value_list) != n:
             raise ValueError("The " + name + "'s length must be " + str(n) +
                              ". Received: " + str(value))
@@ -63,12 +65,12 @@ def convert_to_list(value, n, name, dtype=int):
             try:
                 dtype(single_value)
             except (ValueError, TypeError):
-                raise ValueError(
-                    "The " + name + "'s type must be a list or tuple of " + str(
-                        n) + " " + str(dtype) + " . Received: " + str(
-                            value) + " "
-                    "including element " + str(single_value) + " of type" + " "
-                    + str(type(single_value)))
+                raise ValueError("The " + name +
+                                 "'s type must be a list or tuple of " +
+                                 str(n) + " " + str(dtype) + " . Received: " +
+                                 str(value) + " "
+                                 "including element " + str(single_value) +
+                                 " of type" + " " + str(type(single_value)))
         return value_list
 
 
@@ -148,11 +150,11 @@ def _sequence_like(instance, args):
         # ordered and plain dicts (e.g., flattening a dict but using a
         # corresponding `OrderedDict` to pack it back).
         result = dict(zip(_sorted(instance), args))
-        return type(instance)((key, result[key])
-                              for key in six.iterkeys(instance))
-    elif (isinstance(instance, tuple) and hasattr(instance, "_fields") and
-          isinstance(instance._fields, Sequence) and
-          all(isinstance(f, six.string_types) for f in instance._fields)):
+        return type(instance)(
+            (key, result[key]) for key in six.iterkeys(instance))
+    elif (isinstance(instance, tuple) and hasattr(instance, "_fields")
+          and isinstance(instance._fields, Sequence)
+          and all(isinstance(f, six.string_types) for f in instance._fields)):
         # This is a namedtuple
         return type(instance)(*args)
     else:
@@ -332,9 +334,9 @@ def _get_shape_tensor(list_shape):
             shape = cast(shape, 'int32')
         inputs["ShapeTensor"] = shape
     elif isinstance(shape, (list, tuple)):
-        assert len(shape) > 0, (
-            "The size of 'shape' in" + op_type + " can't be zero, "
-            "but received %s." % len(shape))
+        assert len(shape) > 0, ("The size of 'shape' in" + op_type +
+                                " can't be zero, "
+                                "but received %s." % len(shape))
         attrs["shape"] = _get_attr_shape(shape)
         if _contain_var(shape):
             inputs['ShapeTensorList'] = _get_shape_tensor(shape)
@@ -366,8 +368,8 @@ def convert_shape_to_list(shape):
     """
     if isinstance(shape, (list, tuple)):
         shape = list(
-            map(lambda x: x.numpy().flat[0] if isinstance(x, Variable) else x,
-                shape))
+            map(lambda x: x.numpy().flat[0]
+                if isinstance(x, Variable) else x, shape))
     else:
         shape = shape.numpy().astype(int).tolist()
     return shape
@@ -434,8 +436,8 @@ def try_get_constant_shape_from_tensor(shape_tensor):
             if shape_tensor.op is not None:
                 generate_op = shape_tensor.op
                 if generate_op.type == 'shape':
-                    var = shape_tensor.block.vars[generate_op.input_arg_names[
-                        0]]
+                    var = shape_tensor.block.vars[
+                        generate_op.input_arg_names[0]]
                     return var.shape
         except:
             return None
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index 1c9a1709d3e64..ffc9494129473 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -74,11 +74,10 @@ def create_lod_tensor(data, recursive_seq_lens, place):
     elif isinstance(data, list):
         # dtype and shape are not important here,
         # we only want to reuse code of DataToLoDTensorConverter
-        converter = DataToLoDTensorConverter(
-            place=place,
-            lod_level=len(recursive_seq_lens),
-            shape=[],
-            dtype=core.VarDesc.VarType.FP32)
+        converter = DataToLoDTensorConverter(place=place,
+                                             lod_level=len(recursive_seq_lens),
+                                             shape=[],
+                                             dtype=core.VarDesc.VarType.FP32)
 
         new_recursive_seq_lens = []
         for seq in data:
@@ -114,7 +113,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
 def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
                                 high):
     """
-	:api_attr: Static Graph
+	:api_attr: Static Graph
 
     Create a LoDTensor containing random integers.
 
diff --git a/python/paddle/fluid/memory_analysis.py b/python/paddle/fluid/memory_analysis.py
index 0bcfeed351615..de9a260ada89e 100644
--- a/python/paddle/fluid/memory_analysis.py
+++ b/python/paddle/fluid/memory_analysis.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -31,8 +31,8 @@ def get_var_and_memory_size(block, var_name, batch_size=None):
             assert not has_none
             shape[i] = batch_size
             has_none = True
-    assert all(
-        [s >= 0 for s in shape]), "shape {} is not deterministic".format(shape)
+    assert all([s >= 0
+                for s in shape]), "shape {} is not deterministic".format(shape)
     mem_size = int(np.prod(shape)) * core.size_of_dtype(var.dtype)
     return var, mem_size
 
@@ -44,7 +44,7 @@ def pre_allocate_memory(size, place):
     del t
 
 
-# NOTE: does not consider inplace yet. 
+# NOTE: does not consider inplace yet.
 def get_max_memory_info(program, batch_size=None):
     assert program.num_blocks == 1, "only support to analysis program with only one block"
     cur_tmp_mem = 0
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index a3b61f2e91122..9ee27e0c3cfe9 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -905,28 +905,30 @@ def __init__(self,
             label = layers.concat([gt_label, gt_box], axis=1)
 
         # calculate mean average precision (mAP) of current mini-batch
-        map = detection.detection_map(
-            input,
-            label,
-            class_num,
-            background_label,
-            overlap_threshold=overlap_threshold,
-            evaluate_difficult=evaluate_difficult,
-            ap_version=ap_version)
+        map = detection.detection_map(input,
+                                      label,
+                                      class_num,
+                                      background_label,
+                                      overlap_threshold=overlap_threshold,
+                                      evaluate_difficult=evaluate_difficult,
+                                      ap_version=ap_version)
 
         states = []
         states.append(
-            self._create_state(
-                dtype='int32', shape=None, suffix='accum_pos_count'))
+            self._create_state(dtype='int32',
+                               shape=None,
+                               suffix='accum_pos_count'))
         states.append(
-            self._create_state(
-                dtype='float32', shape=None, suffix='accum_true_pos'))
+            self._create_state(dtype='float32',
+                               shape=None,
+                               suffix='accum_true_pos'))
         states.append(
-            self._create_state(
-                dtype='float32', shape=None, suffix='accum_false_pos'))
+            self._create_state(dtype='float32',
+                               shape=None,
+                               suffix='accum_false_pos'))
         var = self._create_state(dtype='int32', shape=[1], suffix='has_state')
-        self.helper.set_variable_initializer(
-            var, initializer=Constant(value=int(0)))
+        self.helper.set_variable_initializer(var,
+                                             initializer=Constant(value=int(0)))
         self.has_state = var
 
         # calculate accumulative mAP
@@ -942,11 +944,10 @@ def __init__(self,
             out_states=states,
             ap_version=ap_version)
 
-        layers.fill_constant(
-            shape=self.has_state.shape,
-            value=1,
-            dtype=self.has_state.dtype,
-            out=self.has_state)
+        layers.fill_constant(shape=self.has_state.shape,
+                             value=1,
+                             dtype=self.has_state.dtype,
+                             out=self.has_state)
 
         self.cur_map = map
         self.accum_map = accum_map
@@ -960,11 +961,11 @@ def _create_state(self, suffix, dtype, shape):
             shape(tuple|list): the shape of state
         Returns: State variable
         """
-        state = self.helper.create_variable(
-            name="_".join([unique_name.generate(self.helper.name), suffix]),
-            persistable=True,
-            dtype=dtype,
-            shape=shape)
+        state = self.helper.create_variable(name="_".join(
+            [unique_name.generate(self.helper.name), suffix]),
+                                            persistable=True,
+                                            dtype=dtype,
+                                            shape=shape)
         return state
 
     def get_map_var(self):
@@ -986,18 +987,19 @@ def reset(self, executor, reset_program=None):
 
         def _clone_var_(block, var):
             assert isinstance(var, Variable)
-            return block.create_var(
-                name=var.name,
-                shape=var.shape,
-                dtype=var.dtype,
-                type=var.type,
-                lod_level=var.lod_level,
-                persistable=var.persistable)
+            return block.create_var(name=var.name,
+                                    shape=var.shape,
+                                    dtype=var.dtype,
+                                    type=var.type,
+                                    lod_level=var.lod_level,
+                                    persistable=var.persistable)
 
         if reset_program is None:
             reset_program = Program()
         with program_guard(main_program=reset_program):
             var = _clone_var_(reset_program.current_block(), self.has_state)
-            layers.fill_constant(
-                shape=var.shape, value=0, dtype=var.dtype, out=var)
+            layers.fill_constant(shape=var.shape,
+                                 value=0,
+                                 dtype=var.dtype,
+                                 out=var)
         executor.run(reset_program)
diff --git a/python/paddle/fluid/multiprocess_utils.py b/python/paddle/fluid/multiprocess_utils.py
index d622172dced92..73bba0069cdd2 100644
--- a/python/paddle/fluid/multiprocess_utils.py
+++ b/python/paddle/fluid/multiprocess_utils.py
@@ -64,6 +64,7 @@ class CleanupFuncRegistrar():
 
     @classmethod
     def register(cls, function, signals=[]):
+
         def _func_exectuor():
             if function not in cls._executed_func_set:
                 try:
@@ -92,8 +93,8 @@ def _signal_register(signals):
             for sig in signals:
                 orig_handler = signal.signal(sig, _signal_handler)
                 if orig_handler not in (signal.SIG_DFL, signal.SIG_IGN):
-                    if (sig == signal.SIGINT and
-                            orig_handler is signal.default_int_handler):
+                    if (sig == signal.SIGINT
+                            and orig_handler is signal.default_int_handler):
                         continue
                     if orig_handler not in cls._registered_func_set:
                         atexit.register(orig_handler)
diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py
index fd8f6eaf364c4..a7323d1ead2d9 100644
--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
@@ -46,12 +46,15 @@
 
 VAR_STYLE = {}
 
-GRAPH_STYLE = {"rankdir": "TB", }
+GRAPH_STYLE = {
+    "rankdir": "TB",
+}
 
 GRAPH_ID = 0
 
 
 def unique_id():
+
     def generator():
         GRAPH_ID += 1
         return GRAPH_ID
@@ -112,13 +115,12 @@ def draw_graph(startup_program, main_program, **kwargs):
     filename = kwargs.get("filename")
     if filename == None:
         filename = str(graph_id) + ".gv"
-    g = Graph(
-        name=str(graph_id),
-        filename=filename,
-        graph_attr=GRAPH_STYLE,
-        node_attr=OP_STYLE,
-        edge_attr=VAR_STYLE,
-        **kwargs)
+    g = Graph(name=str(graph_id),
+              filename=filename,
+              graph_attr=GRAPH_STYLE,
+              node_attr=OP_STYLE,
+              edge_attr=VAR_STYLE,
+              **kwargs)
 
     var_dict = {}
     parse_graph(startup_program, g, var_dict)
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index e8f8bdd3f9add..abafb48d866bb 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -117,27 +117,25 @@ def simple_img_conv_pool(input,
                                                         pool_stride=2,
                                                         act="relu")
     """
-    conv_out = layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=conv_stride,
-        padding=conv_padding,
-        dilation=conv_dilation,
-        groups=conv_groups,
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        act=act,
-        use_cudnn=use_cudnn)
-
-    pool_out = layers.pool2d(
-        input=conv_out,
-        pool_size=pool_size,
-        pool_type=pool_type,
-        pool_stride=pool_stride,
-        pool_padding=pool_padding,
-        global_pooling=global_pooling,
-        use_cudnn=use_cudnn)
+    conv_out = layers.conv2d(input=input,
+                             num_filters=num_filters,
+                             filter_size=filter_size,
+                             stride=conv_stride,
+                             padding=conv_padding,
+                             dilation=conv_dilation,
+                             groups=conv_groups,
+                             param_attr=param_attr,
+                             bias_attr=bias_attr,
+                             act=act,
+                             use_cudnn=use_cudnn)
+
+    pool_out = layers.pool2d(input=conv_out,
+                             pool_size=pool_size,
+                             pool_type=pool_type,
+                             pool_stride=pool_stride,
+                             pool_padding=pool_padding,
+                             global_pooling=global_pooling,
+                             use_cudnn=use_cudnn)
     return pool_out
 
 
@@ -235,14 +233,13 @@ def __extend_list__(obj):
         if conv_with_batchnorm[i]:
             local_conv_act = None
 
-        tmp = layers.conv2d(
-            input=tmp,
-            num_filters=conv_num_filter[i],
-            filter_size=conv_filter_size[i],
-            padding=conv_padding[i],
-            param_attr=param_attr[i],
-            act=local_conv_act,
-            use_cudnn=use_cudnn)
+        tmp = layers.conv2d(input=tmp,
+                            num_filters=conv_num_filter[i],
+                            filter_size=conv_filter_size[i],
+                            padding=conv_padding[i],
+                            param_attr=param_attr[i],
+                            act=local_conv_act,
+                            use_cudnn=use_cudnn)
 
         if conv_with_batchnorm[i]:
             tmp = layers.batch_norm(input=tmp, act=conv_act)
@@ -250,12 +247,11 @@ def __extend_list__(obj):
             if abs(drop_rate) > 1e-5:
                 tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
 
-    pool_out = layers.pool2d(
-        input=tmp,
-        pool_size=pool_size,
-        pool_type=pool_type,
-        pool_stride=pool_stride,
-        use_cudnn=use_cudnn)
+    pool_out = layers.pool2d(input=tmp,
+                             pool_size=pool_size,
+                             pool_type=pool_type,
+                             pool_stride=pool_stride,
+                             use_cudnn=use_cudnn)
     return pool_out
 
 
@@ -321,13 +317,12 @@ def sequence_conv_pool(input,
     """
 
     check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'input')
-    conv_out = layers.sequence_conv(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        act=act)
+    conv_out = layers.sequence_conv(input=input,
+                                    num_filters=num_filters,
+                                    filter_size=filter_size,
+                                    param_attr=param_attr,
+                                    bias_attr=bias_attr,
+                                    act=act)
 
     pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
     return pool_out
@@ -468,8 +463,8 @@ def scaled_dot_product_attention(queries,
             "The dtype of keys, values and queries should be the same."
             "But received queries.dtype = %s, "
             " keys.dtype = %s, values.dtype) = %s." %
-            (convert_dtype(queries.dtype), convert_dtype(keys.dtype),
-             convert_dtype(values.dtype)))
+            (convert_dtype(queries.dtype), convert_dtype(
+                keys.dtype), convert_dtype(values.dtype)))
 
     if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
         raise ValueError(
@@ -542,9 +537,9 @@ def __split_heads(x, num_heads):
         # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
         # into a 4-D output:
         # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
-        reshaped = layers.reshape(
-            x=x,
-            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])
+        reshaped = layers.reshape(x=x,
+                                  shape=list(x.shape[:-1]) +
+                                  [num_heads, hidden_size // num_heads])
 
         # permute the dimensions into:
         # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
@@ -569,13 +564,12 @@ def __combine_heads(x):
             raise ValueError("Input(x) should be a 4-D Tensor.")
 
         trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        return layers.reshape(
-            x=trans_x,
-            shape=list(
-                map(int, [
-                    trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] *
-                    trans_x.shape[3]
-                ])))
+        return layers.reshape(x=trans_x,
+                              shape=list(
+                                  map(int, [
+                                      trans_x.shape[0], trans_x.shape[1],
+                                      trans_x.shape[2] * trans_x.shape[3]
+                                  ])))
 
     q, k, v = __compute_qkv(queries, keys, values, num_heads)
 
@@ -587,12 +581,13 @@ def __combine_heads(x):
     scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
     product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
 
-    weights = layers.reshape(
-        x=layers.reshape(
-            x=product, shape=[-1, product.shape[-1]], act="softmax"),
-        shape=product.shape)
+    weights = layers.reshape(x=layers.reshape(x=product,
+                                              shape=[-1, product.shape[-1]],
+                                              act="softmax"),
+                             shape=product.shape)
     if dropout_rate:
-        weights = layers.dropout(
-            weights, dropout_prob=dropout_rate, is_test=False)
+        weights = layers.dropout(weights,
+                                 dropout_prob=dropout_rate,
+                                 is_test=False)
     ctx_multiheads = layers.matmul(weights, v)
     return __combine_heads(ctx_multiheads)
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
index ee61ec1c3da3f..d5be4423775b4 100644
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -126,8 +126,8 @@ def __call__(self, *args, **kwargs):
                     new_attr.longs.extend(user_defined_attr)
                 else:
                     raise NotImplementedError(
-                        "A not supported attribute type: %s." % (
-                            str(attr.type)))
+                        "A not supported attribute type: %s." %
+                        (str(attr.type)))
 
         return op_desc
 
@@ -144,6 +144,7 @@ def any_is_true(generator):
 
 
 class OpInfo(object):
+
     def __init__(self, name, method, inputs, outputs, attrs):
         self.name = name
         self.method = method
@@ -162,15 +163,17 @@ def __impl__(*args, **kwargs):
         opdesc = method(*args, **kwargs)
         return core.Operator.create(opdesc.SerializeToString())
 
-    return OpInfo(
-        method=__impl__,
-        name=op_proto.type,
-        inputs=[(var.name, var.duplicable) for var in op_proto.inputs],
-        outputs=[(var.name, var.duplicable) for var in op_proto.outputs],
-        attrs=[attr.name for attr in op_proto.attrs])
+    return OpInfo(method=__impl__,
+                  name=op_proto.type,
+                  inputs=[(var.name, var.duplicable)
+                          for var in op_proto.inputs],
+                  outputs=[(var.name, var.duplicable)
+                           for var in op_proto.outputs],
+                  attrs=[attr.name for attr in op_proto.attrs])
 
 
 class OperatorFactory(object):
+
     def __init__(self):
         self.op_methods = dict()
 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 49fb5399d8aec..20e39e89f305b 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -130,8 +130,8 @@ def __init__(self,
         # program -> Variable(learning_rate)
         self._learning_rate_map = dict()
         if isinstance(self._learning_rate, framework.Variable):
-            self._learning_rate_map[framework.default_main_program(
-            )] = self._learning_rate
+            self._learning_rate_map[
+                framework.default_main_program()] = self._learning_rate
         # Dictionary of accumulators. Some optimizer subclasses need to
         # allocate and manage extra variables associated with the parameters
         # to train. These variables are called accumulators.
@@ -145,7 +145,7 @@ def __init__(self,
         self._param_device_map = dict()
         # NOTE(zhiqiu): sometimes we want to add some variables(Tenosr) to the optimizer for a specific optimization,
         # for example, we want to pass 'found_inf' to adam optimizer so it can skip update when found_inf is True.
-        # And these variables should not be the parameters of Optimizer's construnctor (because not commonly used). 
+        # And these variables should not be the parameters of Optimizer's constructor (because not commonly used).
         # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that.
         self._auxiliary_vars = dict()
 
@@ -187,11 +187,14 @@ def state_dict(self):
 
             if not isinstance(self._learning_rate, _LearningRateEpochDecay):
                 var_tmp = None
-                var_temp = framework._varbase_creator(
-                    None, name='global_step', dtype='int32')
+                var_temp = framework._varbase_creator(None,
+                                                      name='global_step',
+                                                      dtype='int32')
 
-                tensor.fill_constant(
-                    [1], "int32", self._learning_rate.step_num, out=var_temp)
+                tensor.fill_constant([1],
+                                     "int32",
+                                     self._learning_rate.step_num,
+                                     out=var_temp)
 
                 state_dict['global_step'] = var_temp
         return state_dict
@@ -326,8 +329,8 @@ def _create_global_learning_rate(self):
                 main_prog = framework.default_main_program()
                 main_prog.lr_sheduler = self._learning_rate
                 main_prog.lr_var = lr_var
-                self._learning_rate_map[framework.default_main_program(
-                )] = lr_var
+                self._learning_rate_map[
+                    framework.default_main_program()] = lr_var
 
             lr_value = float(self._learning_rate())
             self.helper.set_variable_initializer(
@@ -351,8 +354,8 @@ def _create_global_learning_rate(self):
                         persistable=True)
             # get learning rate Variable from LearningRateDecay
             elif isinstance(self._learning_rate, LearningRateDecay):
-                self._learning_rate_map[framework.default_main_program(
-                )] = self._learning_rate()
+                self._learning_rate_map[
+                    framework.default_main_program()] = self._learning_rate()
             else:
                 raise TypeError(
                     "optimizer's learning rate must be float or LearningRateDecay"
@@ -370,13 +373,13 @@ def _create_global_learning_rate(self):
                     )
 
             # create learning rate in the current main program
-            self._learning_rate_map[framework.default_main_program(
-            )] = layers.create_global_var(
-                name=unique_name.generate("learning_rate"),
-                shape=[1],
-                value=float(self._learning_rate),
-                dtype='float32' if self._dtype is None else self._dtype,
-                persistable=True)
+            self._learning_rate_map[
+                framework.default_main_program()] = layers.create_global_var(
+                    name=unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=float(self._learning_rate),
+                    dtype='float32' if self._dtype is None else self._dtype,
+                    persistable=True)
 
     @framework.dygraph_only
     def set_lr(self, value):
@@ -441,22 +444,20 @@ def set_lr(self, value):
             current_lr = self._global_learning_rate()
             if current_lr is not None:
                 if framework._non_static_mode():
-                    _C_ops.fill_constant(current_lr, 'value',
-                                         float(value), 'dtype',
-                                         current_lr.dtype, 'shape',
+                    _C_ops.fill_constant(current_lr, 'value', float(value),
+                                         'dtype', current_lr.dtype, 'shape',
                                          list(current_lr.shape))
                 else:
                     global_block = framework.default_main_program(
                     ).global_block()
-                    global_block.append_op(
-                        type='fill_constant',
-                        outputs={'Out': [current_lr]},
-                        attrs={
-                            'dtype': current_lr.dtype,
-                            'shape': list(current_lr.shape),
-                            'value': float(value)
-                        },
-                        stop_gradient=True)
+                    global_block.append_op(type='fill_constant',
+                                           outputs={'Out': [current_lr]},
+                                           attrs={
+                                               'dtype': current_lr.dtype,
+                                               'shape': list(current_lr.shape),
+                                               'value': float(value)
+                                           },
+                                           stop_gradient=True)
         else:
             assert len(value.shape) == 1 and value.shape[
                 0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]"
@@ -596,12 +597,13 @@ def _add_accumulator(self,
         """
         if self._name is not None:
             name = self._name + "_" + name
-        if (name in self._accumulators and
-                param.name in self._accumulators[name]):
+        if (name in self._accumulators
+                and param.name in self._accumulators[name]):
             if framework._non_static_mode():
                 return self._accumulators[name][param.name]
-            raise Exception("Accumulator {} already exists for parameter {}".
-                            format(name, param.name))
+            raise Exception(
+                "Accumulator {} already exists for parameter {}".format(
+                    name, param.name))
         if shape == None:
             shape = param.shape
         assert isinstance(self.helper, LayerHelper)
@@ -615,8 +617,8 @@ def _add_accumulator(self,
             persistable=True,
             dtype=dtype or param.dtype,
             type=core.VarDesc.VarType.LOD_TENSOR
-            if framework._non_static_mode() else (param.type
-                                                  if type is None else type),
+            if framework._non_static_mode() else
+            (param.type if type is None else type),
             shape=shape,
             belong_to_optimizer=True)
         if device is None:
@@ -700,10 +702,11 @@ def _get_accumulator(self, name, param):
         """
         if self._name is not None:
             name = self._name + "_" + name
-        if (name not in self._accumulators or
-                param.name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, param.name))
+        if (name not in self._accumulators
+                or param.name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, param.name))
         return self._accumulators[name][param.name]
 
     def _get_global_accumulator(self, name):
@@ -795,8 +798,8 @@ def _create_optimization_pass(self, parameters_and_grads):
                 with param_and_grad[0].block.program._optimized_guard(
                         param_and_grad), name_scope("optimizer"):
                     if param_and_grad[0].trainable is True:
-                        device = self._get_device_for_param(param_and_grad[0]
-                                                            .name)
+                        device = self._get_device_for_param(
+                            param_and_grad[0].name)
                         with device_guard(device):
                             optimize_op = self._append_optimize_op(
                                 target_block, param_and_grad)
@@ -925,10 +928,10 @@ def _create_regularization_of_grad(self, param, grad, regularization=None):
         Function helper of append_regularization_ops.
         """
         # If no gradient or no regularization is specified,  then we don't need to do anything
-        if grad is None or ((not hasattr(param, 'regularizer') or
-                             (hasattr(param, 'regularizer') and
-                              param.regularizer is None)) and
-                            regularization is None):
+        if grad is None or (
+            (not hasattr(param, 'regularizer') or
+             (hasattr(param, 'regularizer') and param.regularizer is None))
+                and regularization is None):
             return grad
         regularization_term = None
         if hasattr(param, 'regularizer') and param.regularizer is not None:
@@ -987,8 +990,8 @@ def append_regularization_ops(self,
         params_and_grads = []
         if framework._non_static_mode():
             for param, grad in parameters_and_grads:
-                new_grad = self._create_regularization_of_grad(param, grad,
-                                                               regularization)
+                new_grad = self._create_regularization_of_grad(
+                    param, grad, regularization)
                 params_and_grads.append((param, new_grad))
         else:
             repeate_regularizer = False
@@ -1048,40 +1051,38 @@ def flatten_param_grads(self, params_grads):
             belong_to_optimizer=True)
 
         with program_guard(default_main_program()):
-            block.append_op(
-                type="coalesce_tensor",
-                inputs={"Input": need_flatten_params},
-                outputs={
-                    "Output": need_flatten_params,
-                    "FusedOutput": flatten_param
-                },
-                attrs={
-                    "copy_data": True,
-                    "use_align": True,
-                    "align_size": self._align_size,
-                    "dtype": need_flatten_params[0].dtype
-                })
+            block.append_op(type="coalesce_tensor",
+                            inputs={"Input": need_flatten_params},
+                            outputs={
+                                "Output": need_flatten_params,
+                                "FusedOutput": flatten_param
+                            },
+                            attrs={
+                                "copy_data": True,
+                                "use_align": True,
+                                "align_size": self._align_size,
+                                "dtype": need_flatten_params[0].dtype
+                            })
 
-            block.append_op(
-                type="coalesce_tensor",
-                inputs={"Input": need_flatten_grads},
-                outputs={
-                    "Output": need_flatten_grads,
-                    "FusedOutput": flatten_grad
-                },
-                attrs={
-                    "copy_data": True,
-                    "use_align": True,
-                    "align_size": self._align_size,
-                    "dtype": need_flatten_grads[0].dtype
-                })
+            block.append_op(type="coalesce_tensor",
+                            inputs={"Input": need_flatten_grads},
+                            outputs={
+                                "Output": need_flatten_grads,
+                                "FusedOutput": flatten_grad
+                            },
+                            attrs={
+                                "copy_data": True,
+                                "use_align": True,
+                                "align_size": self._align_size,
+                                "dtype": need_flatten_grads[0].dtype
+                            })
 
         #NOTE(zhiqiu): the initializer should be set after coalesce_tensor op,
         # so the shape of flatten_param and flatten_grad will be inferred.
-        self.helper.set_variable_initializer(
-            flatten_param, initializer=Constant(0.0))
-        self.helper.set_variable_initializer(
-            flatten_grad, initializer=Constant(0.0))
+        self.helper.set_variable_initializer(flatten_param,
+                                             initializer=Constant(0.0))
+        self.helper.set_variable_initializer(flatten_grad,
+                                             initializer=Constant(0.0))
 
         return [(flatten_param, flatten_grad)]
 
@@ -1233,14 +1234,14 @@ def minimize(self,
         parameter_list = parameter_list if parameter_list \
             else self._parameter_list
 
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        params_grads = self.backward(loss,
+                                     startup_program=startup_program,
+                                     parameter_list=parameter_list,
+                                     no_grad_set=no_grad_set)
 
-        optimize_ops = self.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        optimize_ops = self.apply_optimize(loss,
+                                           startup_program=startup_program,
+                                           params_grads=params_grads)
 
         return optimize_ops, params_grads
 
@@ -1309,12 +1310,11 @@ def __init__(self,
                  multi_precision=False,
                  name=None):
         assert learning_rate is not None
-        super(SGDOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(SGDOptimizer, self).__init__(learning_rate=learning_rate,
+                                           parameter_list=parameter_list,
+                                           regularization=regularization,
+                                           grad_clip=grad_clip,
+                                           name=name)
         self.type = "sgd"
         self._use_mkldnn = False
         self._multi_precision = multi_precision
@@ -1328,21 +1328,19 @@ def _create_master_weight(self, param):
 
             var_name = param.name + "_fp32_master"
             var_name = unique_name.generate(var_name)
-            var = layers.create_global_var(
-                name=var_name,
-                shape=param.shape,
-                value=0,
-                dtype='float32',
-                persistable=True)
+            var = layers.create_global_var(name=var_name,
+                                           shape=param.shape,
+                                           value=0,
+                                           dtype='float32',
+                                           persistable=True)
             block = self.helper.startup_program.global_block()
-            block.append_op(
-                type="cast",
-                inputs={"X": [param]},
-                outputs={"Out": [var]},
-                attrs={
-                    "in_dtype": param.dtype,
-                    "out_dtype": core.VarDesc.VarType.FP32
-                })
+            block.append_op(type="cast",
+                            inputs={"X": [param]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                "in_dtype": param.dtype,
+                                "out_dtype": core.VarDesc.VarType.FP32
+                            })
             self._master_weights[param.name] = var
         return var
 
@@ -1396,12 +1394,11 @@ def _append_optimize_op(self, block, param_and_grad):
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight
 
-        sgd_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        sgd_op = block.append_op(type=self.type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs,
+                                 stop_gradient=True)
 
         return sgd_op
 
@@ -1488,12 +1485,11 @@ def __init__(self,
                  name=None):
         assert learning_rate is not None
         assert momentum is not None
-        super(MomentumOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(MomentumOptimizer, self).__init__(learning_rate=learning_rate,
+                                                parameter_list=parameter_list,
+                                                regularization=regularization,
+                                                grad_clip=grad_clip,
+                                                name=name)
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
@@ -1512,10 +1508,11 @@ def _append_optimize_op(self, block, param_and_grad):
         lr = self._create_param_lr(param_and_grad)
         master_weight = None
         if framework._non_static_mode():
-            _, _, _ = _C_ops.momentum(
-                param_and_grad[0], param_and_grad[1], velocity_acc, lr,
-                master_weight, param_and_grad[0], velocity_acc, master_weight,
-                'mu', self._momentum, 'use_nesterov', self._use_nesterov)
+            _, _, _ = _C_ops.momentum(param_and_grad[0], param_and_grad[1],
+                                      velocity_acc, lr, master_weight,
+                                      param_and_grad[0], velocity_acc,
+                                      master_weight, 'mu', self._momentum,
+                                      'use_nesterov', self._use_nesterov)
             return None
 
         attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
@@ -1531,12 +1528,11 @@ def _append_optimize_op(self, block, param_and_grad):
             "VelocityOut": [velocity_acc]
         }
         # create the momentum optimize op
-        momentum_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        momentum_op = block.append_op(type=self.type,
+                                      inputs=inputs,
+                                      outputs=outputs,
+                                      attrs=attrs,
+                                      stop_gradient=True)
 
         return momentum_op
 
@@ -1629,12 +1625,12 @@ def __init__(self,
 
         assert learning_rate is not None
         assert momentum is not None
-        super(DGCMomentumOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(DGCMomentumOptimizer,
+              self).__init__(learning_rate=learning_rate,
+                             parameter_list=parameter_list,
+                             regularization=regularization,
+                             grad_clip=grad_clip,
+                             name=name)
         self.type = "dgc_momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
@@ -1719,12 +1715,11 @@ def _append_optimize_op(self, block, param_and_grad):
             attrs.update({"rampup_begin_step": float(self._rampup_begin_step)})
 
         # create the dgc momentum optimize op
-        dgc_momentum_op = block.append_op(
-            type=type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        dgc_momentum_op = block.append_op(type=type,
+                                          inputs=inputs,
+                                          outputs=outputs,
+                                          attrs=attrs,
+                                          stop_gradient=True)
         return dgc_momentum_op
 
     def _add_auto_increment_var(self, counter_name, begin, step=1):
@@ -1732,10 +1727,10 @@ def _add_auto_increment_var(self, counter_name, begin, step=1):
         counter, is_new_var = helper.create_or_get_global_variable(
             name=counter_name, dtype='float32', shape=[1], persistable=True)
         if is_new_var:
-            helper.set_variable_initializer(
-                counter,
-                initializer=Constant(
-                    value=float(begin - 1), force_cpu=True))
+            helper.set_variable_initializer(counter,
+                                            initializer=Constant(
+                                                value=float(begin - 1),
+                                                force_cpu=True))
             helper.main_program.global_block()._prepend_op(
                 type='increment',
                 inputs={'X': [counter]},
@@ -1751,10 +1746,10 @@ def _add_nranks_var(self, name, value=-1):
         counter, is_new_var = helper.create_or_get_global_variable(
             name=name, dtype='float32', shape=[1], persistable=True)
         if is_new_var:
-            helper.set_variable_initializer(
-                counter,
-                initializer=Constant(
-                    value=float(value), force_cpu=True))
+            helper.set_variable_initializer(counter,
+                                            initializer=Constant(
+                                                value=float(value),
+                                                force_cpu=True))
             counter.stop_gradient = True
 
         return counter
@@ -1767,8 +1762,8 @@ def _append_dgc_ops(self, param_and_grads):
         self._global_step_var = self._add_auto_increment_var(
             counter_name=core.dgc.kDGCCounterName(), begin=0)
 
-        self._nranks_var = self._add_nranks_var(
-            name=core.dgc.kDGCNRanksName(), value=-1)
+        self._nranks_var = self._add_nranks_var(name=core.dgc.kDGCNRanksName(),
+                                                value=-1)
 
         # rampup begin step var for all_reduce_op_handle
         self._rampup_begin_step_var = tensor.create_global_var(
@@ -1790,29 +1785,29 @@ def _append_dgc_ops(self, param_and_grads):
 
             v_var = self._add_accumulator(self._v_velocity_acc_str, param_var)
 
-            k_var = tensor.create_global_var(
-                shape=[1],
-                dtype=param_var.dtype,
-                persistable=True,
-                name=param_var.name + core.dgc.kDGCKName(),
-                value=0.0,
-                force_cpu=True)
-
-            encoded_var = tensor.create_global_var(
-                shape=[1],
-                dtype=param_var.dtype,
-                persistable=True,
-                name=param_var.name + core.dgc.kDGCEncodedName(),
-                value=0.0,
-                force_cpu=False)
-
-            gather_var = tensor.create_global_var(
-                shape=[1],
-                dtype=param_var.dtype,
-                persistable=True,
-                name=param_var.name + core.dgc.kDGCGatherName(),
-                value=0.0,
-                force_cpu=False)
+            k_var = tensor.create_global_var(shape=[1],
+                                             dtype=param_var.dtype,
+                                             persistable=True,
+                                             name=param_var.name +
+                                             core.dgc.kDGCKName(),
+                                             value=0.0,
+                                             force_cpu=True)
+
+            encoded_var = tensor.create_global_var(shape=[1],
+                                                   dtype=param_var.dtype,
+                                                   persistable=True,
+                                                   name=param_var.name +
+                                                   core.dgc.kDGCEncodedName(),
+                                                   value=0.0,
+                                                   force_cpu=False)
+
+            gather_var = tensor.create_global_var(shape=[1],
+                                                  dtype=param_var.dtype,
+                                                  persistable=True,
+                                                  name=param_var.name +
+                                                  core.dgc.kDGCGatherName(),
+                                                  value=0.0,
+                                                  force_cpu=False)
 
             # del back oprolevarname
             op_maker = core.op_proto_and_checker_maker
@@ -1855,24 +1850,28 @@ def _clip_by_norm(self, x, max_norm, name=None):
             name = unique_name.generate_with_ignorable_key(".".join(
                 [helper.name, 'tmp']))
 
-        out = helper.create_variable(
-            type=x.type, name=name, dtype=x.dtype, persistable=False)
-
-        helper.append_op(
-            type="dgc_clip_by_norm",
-            inputs={"X": x,
-                    "current_step": self._global_step_var},
-            attrs={
-                "max_norm": max_norm,
-                "rampup_begin_step": float(self._rampup_begin_step)
-            },
-            outputs={"Out": out})
+        out = helper.create_variable(type=x.type,
+                                     name=name,
+                                     dtype=x.dtype,
+                                     persistable=False)
+
+        helper.append_op(type="dgc_clip_by_norm",
+                         inputs={
+                             "X": x,
+                             "current_step": self._global_step_var
+                         },
+                         attrs={
+                             "max_norm": max_norm,
+                             "rampup_begin_step": float(self._rampup_begin_step)
+                         },
+                         outputs={"Out": out})
         return out
 
     def _append_clip_norm(self, grad_var, clip_norm):
         with grad_var.block.program._backward_role_guard():
-            return self._clip_by_norm(
-                x=grad_var, max_norm=clip_norm, name=grad_var.name)
+            return self._clip_by_norm(x=grad_var,
+                                      max_norm=clip_norm,
+                                      name=grad_var.name)
 
     def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
                 encoded_var, gather_var):
@@ -1886,34 +1885,40 @@ def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
             regular_type, regular_coeff = self._get_regularization_param(
                 param_var.regularizer)
 
-        dgc_op = block.append_op(
-            type="dgc",
-            inputs={
-                "U": u_var,
-                "V": v_var,
-                "Grad": clip_var,
-                "Param": param_var,
-                "current_step": self._global_step_var,
-                "nranks": self._nranks_var,
-            },
-            outputs={
-                "U_out": u_var,
-                "V_out": v_var,
-                "EncodeGrad": encoded_var,
-                "k": k_var,
-                "Grad_out": grad_var,
-                "GatherBuff": gather_var,
-            },
-            attrs={
-                "m": self._momentum,
-                "sparsity": self._sparsity,
-                "use_nesterov": self._use_nesterov,
-                "rampup_begin_step": float(self._rampup_begin_step),
-                "rampup_step": float(self._rampup_step),
-                "regular_coeff": float(regular_coeff),
-                "regular_type": int(regular_type),
-            },
-            stop_gradient=True)
+        dgc_op = block.append_op(type="dgc",
+                                 inputs={
+                                     "U": u_var,
+                                     "V": v_var,
+                                     "Grad": clip_var,
+                                     "Param": param_var,
+                                     "current_step": self._global_step_var,
+                                     "nranks": self._nranks_var,
+                                 },
+                                 outputs={
+                                     "U_out": u_var,
+                                     "V_out": v_var,
+                                     "EncodeGrad": encoded_var,
+                                     "k": k_var,
+                                     "Grad_out": grad_var,
+                                     "GatherBuff": gather_var,
+                                 },
+                                 attrs={
+                                     "m":
+                                     self._momentum,
+                                     "sparsity":
+                                     self._sparsity,
+                                     "use_nesterov":
+                                     self._use_nesterov,
+                                     "rampup_begin_step":
+                                     float(self._rampup_begin_step),
+                                     "rampup_step":
+                                     float(self._rampup_step),
+                                     "regular_coeff":
+                                     float(regular_coeff),
+                                     "regular_type":
+                                     int(regular_type),
+                                 },
+                                 stop_gradient=True)
 
         backward = op_maker.OpRole.Backward
         dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward)
@@ -2039,12 +2044,12 @@ def __init__(self,
                  rescale_grad=1.0):
         assert learning_rate is not None
         assert momentum is not None
-        super(LarsMomentumOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(LarsMomentumOptimizer,
+              self).__init__(learning_rate=learning_rate,
+                             parameter_list=parameter_list,
+                             regularization=regularization,
+                             grad_clip=grad_clip,
+                             name=name)
         self.type = "lars_momentum"
         self._momentum = momentum
         self._lars_coeff = float(lars_coeff)
@@ -2066,21 +2071,19 @@ def _create_master_weight(self, param):
 
             var_name = param.name + '_fp32_master'
             var_name = unique_name.generate(var_name)
-            var = layers.create_global_var(
-                name=var_name,
-                shape=param.shape,
-                value=0,
-                dtype='float32',
-                persistable=True)
+            var = layers.create_global_var(name=var_name,
+                                           shape=param.shape,
+                                           value=0,
+                                           dtype='float32',
+                                           persistable=True)
             block = self.helper.startup_program.global_block()
-            block.append_op(
-                type="cast",
-                inputs={"X": [param]},
-                outputs={"Out": [var]},
-                attrs={
-                    "in_dtype": param.dtype,
-                    "out_dtype": core.VarDesc.VarType.FP32
-                })
+            block.append_op(type="cast",
+                            inputs={"X": [param]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                "in_dtype": param.dtype,
+                                "out_dtype": core.VarDesc.VarType.FP32
+                            })
             self._master_weights[param.name] = var
         return var
 
@@ -2098,10 +2101,11 @@ def _get_accumulator(self, name, param):
         target_param = self._master_weights[
             param.name] if find_master else param
         target_name = target_param.name
-        if (name not in self._accumulators or
-                target_name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, target_name))
+        if (name not in self._accumulators
+                or target_name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, target_name))
         return self._accumulators[name][target_name]
 
     def _create_accumulators(self, block, parameters):
@@ -2169,12 +2173,11 @@ def _append_optimize_op(self, block, param_and_grad):
                 self._epsilon, "rescale_grad", self._rescale_grad)
         else:
             # create the momentum optimize op
-            momentum_op = block.append_op(
-                type=self.type,
-                inputs=inputs,
-                outputs=outputs,
-                attrs=attrs,
-                stop_gradient=True)
+            momentum_op = block.append_op(type=self.type,
+                                          inputs=inputs,
+                                          outputs=outputs,
+                                          attrs=attrs,
+                                          stop_gradient=True)
 
             return momentum_op
 
@@ -2254,12 +2257,11 @@ def __init__(self,
                  initial_accumulator_value=0.0):
         assert learning_rate is not None
         assert epsilon is not None
-        super(AdagradOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(AdagradOptimizer, self).__init__(learning_rate=learning_rate,
+                                               parameter_list=parameter_list,
+                                               regularization=regularization,
+                                               grad_clip=grad_clip,
+                                               name=name)
         self.type = "adagrad"
         self._epsilon = epsilon
         self.initial_accumulator_value = initial_accumulator_value
@@ -2268,10 +2270,9 @@ def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
         for p in parameters:
-            self._add_accumulator(
-                self._moment_acc_str,
-                p,
-                fill_value=self.initial_accumulator_value)
+            self._add_accumulator(self._moment_acc_str,
+                                  p,
+                                  fill_value=self.initial_accumulator_value)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -2483,14 +2484,14 @@ def __init__(self,
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            flatten_param_grads=flatten_param_grads,
-            align_size=align_size,
-            name=name)
+        super(AdamOptimizer,
+              self).__init__(learning_rate=learning_rate,
+                             parameter_list=parameter_list,
+                             regularization=regularization,
+                             grad_clip=grad_clip,
+                             flatten_param_grads=flatten_param_grads,
+                             align_size=align_size,
+                             name=name)
         self.type = "adam"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -2613,12 +2614,11 @@ def _append_optimize_op(self, block, param_and_grad):
         else:
             attrs['epsilon'] = self._epsilon
 
-        adam_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        adam_op = block.append_op(type=self.type,
+                                  inputs=inputs,
+                                  outputs=outputs,
+                                  attrs=attrs,
+                                  stop_gradient=True)
 
         return adam_op
 
@@ -2639,20 +2639,18 @@ def _finish_update(self, block, parameters_and_grads):
                 if isinstance(self._beta1, Variable):
                     inputs["Y"] = self._beta1
                     # use elementwise_mul for better performance
-                    block.append_op(
-                        type="elementwise_mul",
-                        inputs=inputs,
-                        outputs=outputs,
-                        attrs=attrs,
-                        stop_gradient=True)
+                    block.append_op(type="elementwise_mul",
+                                    inputs=inputs,
+                                    outputs=outputs,
+                                    attrs=attrs,
+                                    stop_gradient=True)
                 else:
                     attrs['scale'] = self._beta1
-                    block.append_op(
-                        type="scale",
-                        inputs=inputs,
-                        outputs=outputs,
-                        attrs=attrs,
-                        stop_gradient=True)
+                    block.append_op(type="scale",
+                                    inputs=inputs,
+                                    outputs=outputs,
+                                    attrs=attrs,
+                                    stop_gradient=True)
 
                 inputs = {"X": beta2_pow_acc}
                 outputs = {"Out": beta2_pow_acc}
@@ -2660,20 +2658,18 @@ def _finish_update(self, block, parameters_and_grads):
                 if isinstance(self._beta2, Variable):
                     inputs["Y"] = self._beta2
                     # use elementwise_mul for better performance
-                    block.append_op(
-                        type="elementwise_mul",
-                        inputs=inputs,
-                        outputs=outputs,
-                        attrs=attrs,
-                        stop_gradient=True)
+                    block.append_op(type="elementwise_mul",
+                                    inputs=inputs,
+                                    outputs=outputs,
+                                    attrs=attrs,
+                                    stop_gradient=True)
                 else:
                     attrs['scale'] = self._beta2
-                    block.append_op(
-                        type="scale",
-                        inputs=inputs,
-                        outputs=outputs,
-                        attrs=attrs,
-                        stop_gradient=True)
+                    block.append_op(type="scale",
+                                    inputs=inputs,
+                                    outputs=outputs,
+                                    attrs=attrs,
+                                    stop_gradient=True)
 
 
 class AdamaxOptimizer(Optimizer):
@@ -2774,12 +2770,11 @@ def __init__(self,
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamaxOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(AdamaxOptimizer, self).__init__(learning_rate=learning_rate,
+                                              parameter_list=parameter_list,
+                                              regularization=regularization,
+                                              grad_clip=grad_clip,
+                                              name=name)
         self.type = "adamax"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -2790,11 +2785,10 @@ def _create_accumulators(self, block, parameters):
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
-            self._add_accumulator(
-                name=self._beta1_pow_acc_str,
-                param=p,
-                fill_value=self._beta1,
-                shape=[1])
+            self._add_accumulator(name=self._beta1_pow_acc_str,
+                                  param=p,
+                                  fill_value=self._beta1,
+                                  shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -2843,8 +2837,8 @@ def _finish_update(self, block, parameters_and_grads):
         for param, grad in parameters_and_grads:
             if grad is None or param.trainable is False:
                 continue
-            with param.block.program._optimized_guard(
-                [param, grad]), name_scope('adamx'):
+            with param.block.program._optimized_guard([param, grad
+                                                       ]), name_scope('adamx'):
                 beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                       param)
                 if framework._non_static_mode():
@@ -2855,12 +2849,11 @@ def _finish_update(self, block, parameters_and_grads):
                         tmp = _C_ops.scale(beta1_pow_acc, "scale", self._beta1)
                     beta1_pow_acc.copy_(tmp, False)
                 else:
-                    block.append_op(
-                        type="scale",
-                        inputs={"X": beta1_pow_acc},
-                        outputs={"Out": beta1_pow_acc},
-                        attrs={"scale": self._beta1},
-                        stop_gradient=True)
+                    block.append_op(type="scale",
+                                    inputs={"X": beta1_pow_acc},
+                                    outputs={"Out": beta1_pow_acc},
+                                    attrs={"scale": self._beta1},
+                                    stop_gradient=True)
 
 
 class DpsgdOptimizer(Optimizer):
@@ -2918,8 +2911,8 @@ def __init__(self,
         assert clip is not None
         assert batch_size is not None
         assert sigma is not None
-        super(DpsgdOptimizer, self).__init__(
-            learning_rate=learning_rate, parameter_list=parameter_list)
+        super(DpsgdOptimizer, self).__init__(learning_rate=learning_rate,
+                                             parameter_list=parameter_list)
         self.type = "dpsgd"
         self._clip = clip
         self._batch_size = batch_size
@@ -2946,21 +2939,23 @@ def _append_optimize_op(self, block, param_and_grad):
                          self._batch_size, "sigma", self._sigma, "seed",
                          self._seed)
         else:
-            dpsgd_op = block.append_op(
-                type=self.type,
-                inputs={
-                    "Param": param_and_grad[0],
-                    "Grad": param_and_grad[1],
-                    "LearningRate": self._create_param_lr(param_and_grad)
-                },
-                outputs={"ParamOut": param_and_grad[0]},
-                attrs={
-                    "clip": self._clip,
-                    "batch_size": self._batch_size,
-                    "sigma": self._sigma,
-                    "seed": self._seed
-                },
-                stop_gradient=True)
+            dpsgd_op = block.append_op(type=self.type,
+                                       inputs={
+                                           "Param":
+                                           param_and_grad[0],
+                                           "Grad":
+                                           param_and_grad[1],
+                                           "LearningRate":
+                                           self._create_param_lr(param_and_grad)
+                                       },
+                                       outputs={"ParamOut": param_and_grad[0]},
+                                       attrs={
+                                           "clip": self._clip,
+                                           "batch_size": self._batch_size,
+                                           "sigma": self._sigma,
+                                           "seed": self._seed
+                                       },
+                                       stop_gradient=True)
 
             return dpsgd_op
 
@@ -3035,12 +3030,12 @@ def __init__(self,
         assert decay is not None
         assert epsilon is not None
 
-        super(DecayedAdagradOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(DecayedAdagradOptimizer,
+              self).__init__(learning_rate=learning_rate,
+                             parameter_list=parameter_list,
+                             regularization=regularization,
+                             grad_clip=grad_clip,
+                             name=name)
         self.type = "decayed_adagrad"
         self._decay = decay
         self._epsilon = epsilon
@@ -3058,10 +3053,11 @@ def _append_optimize_op(self, block, param_and_grad):
                                            param_and_grad[0])
 
         if framework._non_static_mode():
-            _C_ops.decayed_adagrad(
-                param_and_grad[0], param_and_grad[1], moment_acc,
-                self._create_param_lr(param_and_grad), param_and_grad[0],
-                moment_acc, "epsilon", self._epsilon, "decay", self._decay)
+            _C_ops.decayed_adagrad(param_and_grad[0], param_and_grad[1],
+                                   moment_acc,
+                                   self._create_param_lr(param_and_grad),
+                                   param_and_grad[0], moment_acc, "epsilon",
+                                   self._epsilon, "decay", self._decay)
         else:
             # Create the decayed adagrad optimizer op
             decayed_adagrad_op = block.append_op(
@@ -3076,8 +3072,10 @@ def _append_optimize_op(self, block, param_and_grad):
                     "ParamOut": param_and_grad[0],
                     "MomentOut": moment_acc
                 },
-                attrs={"epsilon": self._epsilon,
-                       "decay": self._decay},
+                attrs={
+                    "epsilon": self._epsilon,
+                    "decay": self._decay
+                },
                 stop_gradient=True)
 
             return decayed_adagrad_op
@@ -3154,12 +3152,11 @@ def __init__(self,
             raise ValueError("epsilon is not set.")
         if rho is None:
             raise ValueError("rho is not set.")
-        super(AdadeltaOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(AdadeltaOptimizer, self).__init__(learning_rate=learning_rate,
+                                                parameter_list=parameter_list,
+                                                regularization=regularization,
+                                                grad_clip=grad_clip,
+                                                name=name)
         self.type = "adadelta"
         self._epsilon = epsilon
         self._rho = rho
@@ -3189,22 +3186,30 @@ def _append_optimize_op(self, block, param_and_grad):
                             "rho", self._rho)
         else:
             # Create the adadelta optimizer op
-            adadelta_op = block.append_op(
-                type=self.type,
-                inputs={
-                    "Param": param_and_grad[0],
-                    "Grad": param_and_grad[1],
-                    "AvgSquaredGrad": avg_squared_grad_acc,
-                    "AvgSquaredUpdate": avg_squared_update_acc
-                },
-                outputs={
-                    "ParamOut": param_and_grad[0],
-                    "AvgSquaredGradOut": avg_squared_grad_acc,
-                    "AvgSquaredUpdateOut": avg_squared_update_acc
-                },
-                attrs={"epsilon": self._epsilon,
-                       "rho": self._rho},
-                stop_gradient=True)
+            adadelta_op = block.append_op(type=self.type,
+                                          inputs={
+                                              "Param":
+                                              param_and_grad[0],
+                                              "Grad":
+                                              param_and_grad[1],
+                                              "AvgSquaredGrad":
+                                              avg_squared_grad_acc,
+                                              "AvgSquaredUpdate":
+                                              avg_squared_update_acc
+                                          },
+                                          outputs={
+                                              "ParamOut":
+                                              param_and_grad[0],
+                                              "AvgSquaredGradOut":
+                                              avg_squared_grad_acc,
+                                              "AvgSquaredUpdateOut":
+                                              avg_squared_update_acc
+                                          },
+                                          attrs={
+                                              "epsilon": self._epsilon,
+                                              "rho": self._rho
+                                          },
+                                          stop_gradient=True)
 
             return adadelta_op
 
@@ -3330,12 +3335,11 @@ def __init__(self,
                  regularization=None,
                  grad_clip=None,
                  name=None):
-        super(RMSPropOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(RMSPropOptimizer, self).__init__(learning_rate=learning_rate,
+                                               parameter_list=parameter_list,
+                                               regularization=regularization,
+                                               grad_clip=grad_clip,
+                                               name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if rho is None:
@@ -3371,12 +3375,13 @@ def _append_optimize_op(self, block, param_and_grad):
         mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
                                               param_and_grad[0])
         if framework._non_static_mode():
-            _C_ops.rmsprop(
-                param_and_grad[0], mean_square_acc,
-                self._create_param_lr(param_and_grad), param_and_grad[1],
-                momentum_acc, param_and_grad[0], momentum_acc, mean_square_acc,
-                mean_grad_acc, "epsilon", self._epsilon, "decay", self._rho,
-                "momentum", self._momentum, "centered", self._centered)
+            _C_ops.rmsprop(param_and_grad[0], mean_square_acc,
+                           self._create_param_lr(param_and_grad),
+                           param_and_grad[1], momentum_acc, param_and_grad[0],
+                           momentum_acc, mean_square_acc, mean_grad_acc,
+                           "epsilon", self._epsilon, "decay", self._rho,
+                           "momentum", self._momentum, "centered",
+                           self._centered)
         else:
             rmsprop_op = block.append_op(
                 type=self.type,
@@ -3512,12 +3517,11 @@ def __init__(self,
                  regularization=None,
                  grad_clip=None,
                  name=None):
-        super(FtrlOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            name=name)
+        super(FtrlOptimizer, self).__init__(learning_rate=learning_rate,
+                                            parameter_list=parameter_list,
+                                            regularization=regularization,
+                                            grad_clip=grad_clip,
+                                            name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
 
@@ -3543,33 +3547,37 @@ def _append_optimize_op(self, block, param_and_grad):
         linear_acc = self._get_accumulator(self._linear_acc_str,
                                            param_and_grad[0])
         if framework._non_static_mode():
-            _C_ops.ftrl(param_and_grad[0], squared_acc, linear_acc,
-                        param_and_grad[1],
+            _C_ops.ftrl(param_and_grad[0], squared_acc,
+                        linear_acc, param_and_grad[1],
                         self._create_param_lr(param_and_grad),
                         param_and_grad[0], squared_acc, linear_acc, "l1",
                         self._l1, "l2", self._l2, "lr_power", self._lr_power)
 
         else:
-            ftrl_op = block.append_op(
-                type=self.type,
-                inputs={
-                    "Param": param_and_grad[0],
-                    "Grad": param_and_grad[1],
-                    "SquaredAccumulator": squared_acc,
-                    "LinearAccumulator": linear_acc,
-                    "LearningRate": self._create_param_lr(param_and_grad),
-                },
-                outputs={
-                    "ParamOut": param_and_grad[0],
-                    "SquaredAccumOut": squared_acc,
-                    "LinearAccumOut": linear_acc
-                },
-                attrs={
-                    "l1": self._l1,
-                    "l2": self._l2,
-                    "lr_power": self._lr_power
-                },
-                stop_gradient=True)
+            ftrl_op = block.append_op(type=self.type,
+                                      inputs={
+                                          "Param":
+                                          param_and_grad[0],
+                                          "Grad":
+                                          param_and_grad[1],
+                                          "SquaredAccumulator":
+                                          squared_acc,
+                                          "LinearAccumulator":
+                                          linear_acc,
+                                          "LearningRate":
+                                          self._create_param_lr(param_and_grad),
+                                      },
+                                      outputs={
+                                          "ParamOut": param_and_grad[0],
+                                          "SquaredAccumOut": squared_acc,
+                                          "LinearAccumOut": linear_acc
+                                      },
+                                      attrs={
+                                          "l1": self._l1,
+                                          "l2": self._l2,
+                                          "lr_power": self._lr_power
+                                      },
+                                      stop_gradient=True)
 
             return ftrl_op
 
@@ -3668,15 +3676,14 @@ def __init__(self,
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(LambOptimizer, self).__init__(
-            learning_rate=learning_rate,
-            parameter_list=parameter_list,
-            regularization=regularization,
-            grad_clip=grad_clip,
-            beta1=beta1,
-            beta2=beta2,
-            epsilon=epsilon,
-            name=name)
+        super(LambOptimizer, self).__init__(learning_rate=learning_rate,
+                                            parameter_list=parameter_list,
+                                            regularization=regularization,
+                                            grad_clip=grad_clip,
+                                            beta1=beta1,
+                                            beta2=beta2,
+                                            epsilon=epsilon,
+                                            name=name)
         self.type = "lamb"
         self._weight_decay = lamb_weight_decay
         self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn
@@ -3711,31 +3718,30 @@ def _append_optimize_op(self, block, param_and_grad):
             return None
 
         # create the lamb optimize op
-        lamb_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "LearningRate": lr,
-                "Moment1": moment1,
-                "Moment2": moment2,
-                "Beta1Pow": beta1_pow_acc,
-                "Beta2Pow": beta2_pow_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "Moment1Out": moment1,
-                "Moment2Out": moment2,
-                "Beta1PowOut": beta1_pow_acc,
-                "Beta2PowOut": beta2_pow_acc
-            },
-            attrs={
-                "beta1": self._beta1,
-                "beta2": self._beta2,
-                "epsilon": self._epsilon,
-                "weight_decay": weight_decay
-            },
-            stop_gradient=True)
+        lamb_op = block.append_op(type=self.type,
+                                  inputs={
+                                      "Param": param_and_grad[0],
+                                      "Grad": param_and_grad[1],
+                                      "LearningRate": lr,
+                                      "Moment1": moment1,
+                                      "Moment2": moment2,
+                                      "Beta1Pow": beta1_pow_acc,
+                                      "Beta2Pow": beta2_pow_acc
+                                  },
+                                  outputs={
+                                      "ParamOut": param_and_grad[0],
+                                      "Moment1Out": moment1,
+                                      "Moment2Out": moment2,
+                                      "Beta1PowOut": beta1_pow_acc,
+                                      "Beta2PowOut": beta2_pow_acc
+                                  },
+                                  attrs={
+                                      "beta1": self._beta1,
+                                      "beta2": self._beta2,
+                                      "epsilon": self._epsilon,
+                                      "weight_decay": weight_decay
+                                  },
+                                  stop_gradient=True)
 
         return lamb_op
 
@@ -3857,8 +3863,9 @@ def __init__(self,
                  name=None):
         if framework._non_static_mode():
             raise Exception("In dygraph, don't support ModelAverage.")
-        super(ModelAverage, self).__init__(
-            0.0, regularization=regularization, name=name)
+        super(ModelAverage, self).__init__(0.0,
+                                           regularization=regularization,
+                                           name=name)
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
@@ -3927,38 +3934,44 @@ def _append_average_accumulate_op(self, param):
         sum_1 = self._add_accumulator('sum_1', param)
         sum_2 = self._add_accumulator('sum_2', param)
         sum_3 = self._add_accumulator('sum_3', param)
-        num_accumulates = self._add_accumulator(
-            'num_accumulates', param, dtype='int64', shape=[1])
-        old_num_accumulates = self._add_accumulator(
-            'old_num_accumulates', param, dtype='int64', shape=[1])
-        num_updates = self._add_accumulator(
-            'num_updates', param, dtype='int64', shape=[1])
-
-        self.helper.append_op(
-            type='average_accumulates',
-            inputs={
-                "param": param,
-                "in_sum_1": sum_1,
-                "in_sum_2": sum_2,
-                "in_sum_3": sum_3,
-                "in_num_accumulates": num_accumulates,
-                "in_old_num_accumulates": old_num_accumulates,
-                "in_num_updates": num_updates
-            },
-            outputs={
-                "out_sum_1": sum_1,
-                "out_sum_2": sum_2,
-                "out_sum_3": sum_3,
-                "out_num_accumulates": num_accumulates,
-                "out_old_num_accumulates": old_num_accumulates,
-                "out_num_updates": num_updates,
-            },
-            attrs={
-                "average_window": self.average_window,
-                "min_average_window": self.min_average_window,
-                "max_average_window": self.max_average_window,
-            },
-            stop_gradient=True)
+        num_accumulates = self._add_accumulator('num_accumulates',
+                                                param,
+                                                dtype='int64',
+                                                shape=[1])
+        old_num_accumulates = self._add_accumulator('old_num_accumulates',
+                                                    param,
+                                                    dtype='int64',
+                                                    shape=[1])
+        num_updates = self._add_accumulator('num_updates',
+                                            param,
+                                            dtype='int64',
+                                            shape=[1])
+
+        self.helper.append_op(type='average_accumulates',
+                              inputs={
+                                  "param": param,
+                                  "in_sum_1": sum_1,
+                                  "in_sum_2": sum_2,
+                                  "in_sum_3": sum_3,
+                                  "in_num_accumulates": num_accumulates,
+                                  "in_old_num_accumulates": old_num_accumulates,
+                                  "in_num_updates": num_updates
+                              },
+                              outputs={
+                                  "out_sum_1": sum_1,
+                                  "out_sum_2": sum_2,
+                                  "out_sum_3": sum_3,
+                                  "out_num_accumulates": num_accumulates,
+                                  "out_old_num_accumulates":
+                                  old_num_accumulates,
+                                  "out_num_updates": num_updates,
+                              },
+                              attrs={
+                                  "average_window": self.average_window,
+                                  "min_average_window": self.min_average_window,
+                                  "max_average_window": self.max_average_window,
+                              },
+                              stop_gradient=True)
 
     @signature_safe_contextmanager
     def apply(self, executor, need_restore=True):
@@ -4181,12 +4194,11 @@ def __init__(self, decay=0.999, thres_steps=None, name=None):
         self._params_tmps = []
         for param in default_main_program().global_block().all_parameters():
             if param.do_model_average != False:
-                tmp = param.block.create_var(
-                    name=unique_name.generate(".".join(
-                        [self._name + param.name, 'ema_tmp'])),
-                    dtype=param.dtype,
-                    persistable=False,
-                    stop_gradient=True)
+                tmp = param.block.create_var(name=unique_name.generate(".".join(
+                    [self._name + param.name, 'ema_tmp'])),
+                                             dtype=param.dtype,
+                                             persistable=False,
+                                             stop_gradient=True)
                 self._params_tmps.append((param, tmp))
 
         self._ema_vars = {}
@@ -4207,8 +4219,8 @@ def __init__(self, decay=0.999, thres_steps=None, name=None):
                 # bias correction
                 with layers.control_flow.Switch() as switch:
                     with switch.case(global_step > 0):
-                        layers.assign(
-                            output=param, input=ema / (1.0 - decay_pow))
+                        layers.assign(output=param,
+                                      input=ema / (1.0 - decay_pow))
                     with switch.default():
                         layers.assign(output=param, input=ema)
 
@@ -4236,18 +4248,16 @@ def _get_ema_decay(self):
                         layers.tensor.assign(decay_t, decay_var)
                     with switch.default():
                         layers.tensor.assign(
-                            np.array(
-                                [self._decay], dtype=np.float32),
+                            np.array([self._decay], dtype=np.float32),
                             decay_var)
         return decay_var
 
     def _get_decay_pow(self, block):
-        global_step = layers.create_global_var(
-            name=self._step_counter_name,
-            shape=[1],
-            value=0,
-            dtype='int64',
-            persistable=True)
+        global_step = layers.create_global_var(name=self._step_counter_name,
+                                               shape=[1],
+                                               value=0,
+                                               dtype='int64',
+                                               persistable=True)
         global_step = layers.cast(global_step, "float32")
         decay_var = block._clone_variable(self._decay_var)
         decay_pow_acc = layers.elementwise_pow(decay_var, global_step)
@@ -4432,18 +4442,18 @@ def _insert_allreduce_op(self, op_idx, block):
         if op.type == "reduce_any":
             # cast the bool var to int32 to use allreduce_max op
             temp_var_name = unique_name.generate(out_name + "_cast_int32")
-            temp_var = block.create_var(
-                name=temp_var_name, shape=[1], dtype="int32")
-            block._insert_op(
-                op_idx + 1 + offset,
-                type='cast',
-                inputs={'X': out_var},
-                outputs={'Out': temp_var},
-                attrs={
-                    'in_dtype': out_var.dtype,
-                    'out_dtype': temp_var.dtype,
-                    self._op_role_key: self._op_role.Optimize
-                })
+            temp_var = block.create_var(name=temp_var_name,
+                                        shape=[1],
+                                        dtype="int32")
+            block._insert_op(op_idx + 1 + offset,
+                             type='cast',
+                             inputs={'X': out_var},
+                             outputs={'Out': temp_var},
+                             attrs={
+                                 'in_dtype': out_var.dtype,
+                                 'out_dtype': temp_var.dtype,
+                                 self._op_role_key: self._op_role.Optimize
+                             })
             offset += 1
         block._insert_op(
             op_idx + 1 + offset,
@@ -4458,16 +4468,15 @@ def _insert_allreduce_op(self, op_idx, block):
             })
         offset += 1
         if op.type == "reduce_any":
-            block._insert_op(
-                op_idx + 1 + offset,
-                type='cast',
-                inputs={'X': temp_var},
-                outputs={'Out': out_var},
-                attrs={
-                    'in_dtype': temp_var.dtype,
-                    'out_dtype': out_var.dtype,
-                    self._op_role_key: self._op_role.Optimize
-                })
+            block._insert_op(op_idx + 1 + offset,
+                             type='cast',
+                             inputs={'X': temp_var},
+                             outputs={'Out': out_var},
+                             attrs={
+                                 'in_dtype': temp_var.dtype,
+                                 'out_dtype': out_var.dtype,
+                                 self._op_role_key: self._op_role.Optimize
+                             })
             offset += 1
         return offset
 
@@ -4483,7 +4492,7 @@ def _create_vars(self, block, ori_block):
             # get the global information, so allreduce op is needed.
             should_insert = False
             op = block.ops[op_idx]
-            # For op process vars on all devices, remove its input 
+            # For op process vars on all devices, remove its input
             # vars not in this block
             reserved_x = []
             if op.type == 'reduce_any' and self._is_optimize_op(op):
@@ -4518,8 +4527,8 @@ def _create_vars(self, block, ori_block):
 
             vars = op.desc.input_arg_names() + op.desc.output_arg_names()
             for var in vars:
-                # a var whose name contains "blocking_queue" 
-                # only exists in startup program 
+                # a var whose name contains "blocking_queue"
+                # only exists in startup program
                 if var in used_var_set or "_blocking_queue" in var:
                     continue
                 used_var_set.add(var)
@@ -4561,8 +4570,8 @@ def _is_loss_grad_op(self, op):
             self._op_role.Loss)
 
     def _is_forward_op(self, op):
-        return self._op_role_key in op.attr_names and (
-            int(op.attr(self._op_role_key)) == int(self._op_role.Forward))
+        return self._op_role_key in op.attr_names and (int(
+            op.attr(self._op_role_key)) == int(self._op_role.Forward))
 
     def _is_backward_op(self, op):
         return self._op_role_key in op.attr_names and (
@@ -4769,8 +4778,8 @@ def _add_op_device_attr_for_op(self, op, idx, block):
             device = post_op.attr(self._op_device_key)
             assert device, "The post op must have op_device set."
             op._set_attr(self._op_device_key, device)
-        elif (op.type == "cast" or
-              op.type == "scale") and self._is_backward_op(op):
+        elif (op.type == "cast"
+              or op.type == "scale") and self._is_backward_op(op):
             prev_op = self._find_prev_op(idx, op.desc.input("X")[0])
             op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key))
         elif op.type == "memcpy" and not self._is_optimize_op(op):
@@ -4790,8 +4799,8 @@ def _add_op_device_attr_for_op(self, op, idx, block):
         elif self._is_loss_op(op):
             # For loss * loss_scaling op added by AMP
             offset = 1
-            while (not block.ops[idx + offset].has_attr(self._op_device_key) or
-                   not block.ops[idx + offset].attr(self._op_device_key)):
+            while (not block.ops[idx + offset].has_attr(self._op_device_key)
+                   or not block.ops[idx + offset].attr(self._op_device_key)):
                 offset += 1
             device = block.ops[idx + offset].attr(self._op_device_key)
             assert device, "Please put you program within device_guard scope."
@@ -4814,12 +4823,12 @@ def _add_op_device_attr_for_op(self, op, idx, block):
                 "regularization ops must have two elements."
             param_name = op_role_var[0]
             device = self._param_device_map[param_name]
-            # For sum op added by global gradient clip, it must be 
+            # For sum op added by global gradient clip, it must be
             # put on all devices
-            if (op.type == 'sum' or op.type == 'sqrt' or
-                    op.type == 'fill_constant' or
-                    op.type == 'elementwise_max' or
-                    op.type == 'elementwise_div'):
+            if (op.type == 'sum' or op.type == 'sqrt'
+                    or op.type == 'fill_constant'
+                    or op.type == 'elementwise_max'
+                    or op.type == 'elementwise_div'):
                 device = f"{self._device}:all"
             op._set_attr(self._op_device_key, device)
         elif op.type == "alloc_float_status" or op.type == "clear_float_status":
@@ -4851,9 +4860,9 @@ def _add_op_device_attr(self, block):
         not that attribute set.
         """
         for idx, op in enumerate(list(block.ops)):
-            if (op.type == "create_py_reader" or op.type == "read" or
-                    op.type == "create_double_buffer_reader"):
-                # Copy read related ops to all section to make them exit 
+            if (op.type == "create_py_reader" or op.type == "read"
+                    or op.type == "create_double_buffer_reader"):
+                # Copy read-related ops to all sections to make them exit
                 # after each epoch.
                 # We use "gpu:all" to represent the op should be put on all
                 # sub-programs, such as lr-related ops. Note that: "gpu:all"
@@ -4882,13 +4891,13 @@ def _check_validation(self, block):
         ]
         for op in block.ops:
             if not op._has_kernel(op.type):
-                assert op.type == "conditional_block" and (
-                    op.attr(self._op_role_key) == int(self._op_role.LRSched)), (
+                assert op.type == "conditional_block" and (op.attr(
+                    self._op_role_key) == int(self._op_role.LRSched)), (
                         "Now, the only supported op without kernel is "
                         "conditional_block, and its op role must be LRSched.")
-            assert op.has_attr(self._op_role_key), (
-                "op ({}) has no {} attribute.".format(op.type,
-                                                      self._op_role_key))
+            assert op.has_attr(
+                self._op_role_key), ("op ({}) has no {} attribute.".format(
+                    op.type, self._op_role_key))
             op_role = op.attr(self._op_role_key)
             assert int(op_role) in valid_op_role_value, \
                 "op_role {} for op {} must be one of {}".format(
@@ -4896,9 +4905,9 @@ def _check_validation(self, block):
                     op.type,
                     valid_op_role_value)
 
-            assert op.has_attr(self._op_device_key), (
-                "op ({}) has no {} attribute.".format(op.type,
-                                                      self._op_device_key))
+            assert op.has_attr(
+                self._op_device_key), ("op ({}) has no {} attribute.".format(
+                    op.type, self._op_device_key))
 
             device = op.attr(self._op_device_key)
             assert device, ("op_device attribute for op "
@@ -5055,8 +5064,8 @@ def _insert_send_recv(cur_id, prev_id):
                             0] < 0 else var_shape[0]
 
                         numel = np.prod(var_shape)
-                        use_mp = (self.mp_degree > 1) and (
-                            numel % self.mp_degree == 0)
+                        use_mp = (self.mp_degree > 1) and (numel %
+                                                           self.mp_degree == 0)
 
                         if 'subprog' in var.name:
                             # For recompute, if the checkpoints var is layer_norm_6.tmp_2
@@ -5085,15 +5094,17 @@ def _insert_send_recv(cur_id, prev_id):
 
                         _check_stage(cur_id, prev_id)
 
-                        block._insert_op_without_sync(
-                            index=index + extra_index_info['index'],
-                            type='c_sync_calc_stream',
-                            inputs={'X': [var]},
-                            outputs={'Out': [var]},
-                            attrs={
-                                self._op_device_key: prev_dev,
-                                self._op_role_key: op_role,
-                            })
+                        block._insert_op_without_sync(index=index +
+                                                      extra_index_info['index'],
+                                                      type='c_sync_calc_stream',
+                                                      inputs={'X': [var]},
+                                                      outputs={'Out': [var]},
+                                                      attrs={
+                                                          self._op_device_key:
+                                                          prev_dev,
+                                                          self._op_role_key:
+                                                          op_role,
+                                                      })
                         extra_index_info['index'] += 1
                         prefix_name = var.name.split('@')[0]
                         prefix_var = block.var(prefix_name)
@@ -5175,9 +5186,8 @@ def _insert_send_recv(cur_id, prev_id):
                             "Now only 'F-then-B' and '1F1B' are supported."
                             "The given value is {}.".format(self.schedule_mode))
 
-                _insert_send_recv(
-                    int(cur_device.split(':')[1]),
-                    int(prev_device.split(':')[1]))
+                _insert_send_recv(int(cur_device.split(':')[1]),
+                                  int(prev_device.split(':')[1]))
         block._sync_with_cpp()
 
     def _insert_loss_scale(self, block):
@@ -5247,8 +5257,8 @@ def _accumulate_gradients(self,
                 # maybe have no optimize
                 # if first_opt_op_idx == len(block.ops): return
 
-            if self._is_backward_op(op) and (
-                    self._op_role_var_key in op.attr_names):
+            if self._is_backward_op(op) and (self._op_role_var_key
+                                             in op.attr_names):
                 op_role_var = op.attr(self._op_role_var_key)
                 if len(op_role_var) == 0: continue
                 assert len(op_role_var) % 2 == 0
@@ -5274,11 +5284,15 @@ def _accumulate_gradients(self,
                         inputs={},
                         outputs={'Out': [merged_param_grad_var]},
                         attrs={
-                            'shape': merged_param_grad_var.shape,
-                            'dtype': merged_param_grad_var.dtype,
-                            'value': float(0),
+                            'shape':
+                            merged_param_grad_var.shape,
+                            'dtype':
+                            merged_param_grad_var.dtype,
+                            'value':
+                            float(0),
                             # a trick to run this op once per mini-batch
-                            self._op_role_key: self._op_role.Optimize.LRSched,
+                            self._op_role_key:
+                            self._op_role.Optimize.LRSched,
                         })
                     offset += 1
                     grad_name = op_role_var[i + 1]
@@ -5296,16 +5310,18 @@ def _accumulate_gradients(self,
                         cast_grad_var = self._create_var(
                             block, param_grad_var, cast_grad_var_name, dtype)
                         cast_grad_var.persistable = False
-                        block._insert_op(
-                            index=first_opt_op_idx + offset,
-                            type='cast',
-                            inputs={'X': grad_var},
-                            outputs={'Out': cast_grad_var},
-                            attrs={
-                                'in_dtype': grad_var.dtype,
-                                'out_dtype': cast_grad_var.dtype,
-                                self._op_role_key: self._op_role.Backward,
-                            })
+                        block._insert_op(index=first_opt_op_idx + offset,
+                                         type='cast',
+                                         inputs={'X': grad_var},
+                                         outputs={'Out': cast_grad_var},
+                                         attrs={
+                                             'in_dtype':
+                                             grad_var.dtype,
+                                             'out_dtype':
+                                             cast_grad_var.dtype,
+                                             self._op_role_key:
+                                             self._op_role.Backward,
+                                         })
                         offset += 1
                         grad_var = cast_grad_var
 
@@ -5314,7 +5330,9 @@ def _accumulate_gradients(self,
                         type='sum',
                         inputs={'X': [merged_param_grad_var, grad_var]},
                         outputs={'Out': merged_param_grad_var},
-                        attrs={self._op_role_key: self._op_role.Backward, })
+                        attrs={
+                            self._op_role_key: self._op_role.Backward,
+                        })
                     offset += 1
                     merged_gradient_names.append(merged_param_grad_name)
 
@@ -5342,24 +5360,23 @@ def _accumulate_gradients(self,
             grad_var = block.var(grad_name)
             grad_var.persistable = False
 
-            block._insert_op(
-                index=first_opt_op_idx,
-                type='cast',
-                inputs={'X': fp16_grad_var},
-                outputs={'Out': grad_var},
-                attrs={
-                    'in_dtype': fp16_grad_var.dtype,
-                    'out_dtype': grad_var.dtype,
-                    self._op_role_key: self._op_role.Optimize,
-                })
+            block._insert_op(index=first_opt_op_idx,
+                             type='cast',
+                             inputs={'X': fp16_grad_var},
+                             outputs={'Out': grad_var},
+                             attrs={
+                                 'in_dtype': fp16_grad_var.dtype,
+                                 'out_dtype': grad_var.dtype,
+                                 self._op_role_key: self._op_role.Optimize,
+                             })
 
         return merged_gradient_names
 
     def _insert_accumulate_gradients_with_fuse(self, main_block, fp16,
                                                fused_size, grad_param_pairs,
                                                first_opt_op_idx):
-        grad_param_pairs = self._sort_grad_param_by_dtype(main_block,
-                                                          grad_param_pairs)
+        grad_param_pairs = self._sort_grad_param_by_dtype(
+            main_block, grad_param_pairs)
 
         grad_param_segments = []
         merged_suffix = '@MERGED@FP16' if fp16 else '@MERGED'
@@ -5402,11 +5419,11 @@ def _insert_accumulate_gradients_with_fuse(self, main_block, fp16,
         for grad_param_segment in grad_param_segments:
             grad_segment = grad_param_segment[0]
             merged_grad_segment = grad_param_segment[2]
-            fused_grad = main_block.create_var(
-                name='FusedGrad_{}'.format(grad_segment[0].name),
-                dtype=grad_segment[0].dtype,
-                persistable=False,
-                stop_gradient=False)
+            fused_grad = main_block.create_var(name='FusedGrad_{}'.format(
+                grad_segment[0].name),
+                                               dtype=grad_segment[0].dtype,
+                                               persistable=False,
+                                               stop_gradient=False)
             # keep the '.cast_fp16' info in the fuse var name
             fused_merged_grad_name_prefix = 'FusedMergedGrad.cast_fp16.' if \
                 merged_grad_segment[0].dtype == paddle.float16 else 'FusedMergedGrad'
@@ -5442,8 +5459,10 @@ def _insert_accumulate_gradients_with_fuse(self, main_block, fp16,
                 first_back_op_idx + offset,
                 type="coalesce_tensor",
                 inputs={"Input": params},
-                outputs={"Output": grads,
-                         "FusedOutput": fused_grad},
+                outputs={
+                    "Output": grads,
+                    "FusedOutput": fused_grad
+                },
                 attrs={
                     # Explanation of user_defined_size_of_dtype:
                     # In coalesce op, the align size is 256 bytes
@@ -5503,21 +5522,20 @@ def _insert_accumulate_gradients_with_fuse(self, main_block, fp16,
                 # for fp16 allreduce, cast fp32 grad to fp16
                 # for fp32 allreduce, cast fp16 grad to fp32
                 cast_grad_var_name = fused_grad.name + '@TMP'
-                cast_grad_var = main_block.create_var(
-                    name=cast_grad_var_name,
-                    dtype=dtype,
-                    persistable=False,
-                    stop_gradient=False)
-                main_block._insert_op(
-                    index=first_opt_op_idx + offset,
-                    type='cast',
-                    inputs={'X': fused_grad},
-                    outputs={'Out': cast_grad_var},
-                    attrs={
-                        'in_dtype': fused_grad.dtype,
-                        'out_dtype': cast_grad_var.dtype,
-                        self._op_role_key: self._op_role.Backward,
-                    })
+                cast_grad_var = main_block.create_var(name=cast_grad_var_name,
+                                                      dtype=dtype,
+                                                      persistable=False,
+                                                      stop_gradient=False)
+                main_block._insert_op(index=first_opt_op_idx + offset,
+                                      type='cast',
+                                      inputs={'X': fused_grad},
+                                      outputs={'Out': cast_grad_var},
+                                      attrs={
+                                          'in_dtype': fused_grad.dtype,
+                                          'out_dtype': cast_grad_var.dtype,
+                                          self._op_role_key:
+                                          self._op_role.Backward,
+                                      })
                 offset += 1
                 fused_grad = cast_grad_var
             main_block._insert_op(
@@ -5536,22 +5554,21 @@ def _insert_accumulate_gradients_with_fuse(self, main_block, fp16,
                 assert main_block.has_var(fp16_grad_name)
                 fp16_grad = main_block.var(fp16_grad_name)
                 fp32_grad_name = param + core.grad_var_suffix() + '@MERGED'
-                fp32_grad = main_block.create_var(
-                    name=fp32_grad_name,
-                    dtype=paddle.float32,
-                    shape=real_grad.shape,
-                    persistable=False,
-                    stop_gradient=False)
-                main_block._insert_op(
-                    index=first_opt_op_idx + offset,
-                    type='cast',
-                    inputs={'X': fp16_grad},
-                    outputs={'Out': fp32_grad},
-                    attrs={
-                        'in_dtype': paddle.float16,
-                        'out_dtype': paddle.float32,
-                        self._op_role_key: self._op_role.Optimize,
-                    })
+                fp32_grad = main_block.create_var(name=fp32_grad_name,
+                                                  dtype=paddle.float32,
+                                                  shape=real_grad.shape,
+                                                  persistable=False,
+                                                  stop_gradient=False)
+                main_block._insert_op(index=first_opt_op_idx + offset,
+                                      type='cast',
+                                      inputs={'X': fp16_grad},
+                                      outputs={'Out': fp32_grad},
+                                      attrs={
+                                          'in_dtype': paddle.float16,
+                                          'out_dtype': paddle.float32,
+                                          self._op_role_key:
+                                          self._op_role.Optimize,
+                                      })
                 offset += 1
 
         # replace the var with it's name, which will be used for inserting allreduce
@@ -5584,8 +5601,8 @@ def _accumulate_gradients_with_fuse(self,
                 if first_opt_op_idx == len(main_block.ops):
                     return
 
-            if self._is_backward_op(op) and (
-                    self._op_role_var_key in op.attr_names):
+            if self._is_backward_op(op) and (self._op_role_var_key
+                                             in op.attr_names):
                 op_role_var = op.attr(self._op_role_var_key)
                 if len(op_role_var) == 0:
                     continue
@@ -5747,30 +5764,44 @@ def _process_persistable_vars_in_multi_sections(self, main_program,
                 write_block._insert_op(
                     index=0,
                     type='send_v2',
-                    inputs={'X': write_block.var(var_name), },
+                    inputs={
+                        'X': write_block.var(var_name),
+                    },
                     attrs={
-                        self._op_device_key: write_device,
-                        'use_calc_stream': False,
+                        self._op_device_key:
+                        write_device,
+                        'use_calc_stream':
+                        False,
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
-                        self._op_role_key: self._op_role.LRSched,
-                        'peer': read_dev_index,
-                        'ring_id': ring_id
+                        self._op_role_key:
+                        self._op_role.LRSched,
+                        'peer':
+                        read_dev_index,
+                        'ring_id':
+                        ring_id
                     })
                 read_block._insert_op(
                     index=0,
                     type='recv_v2',
                     outputs={'Out': [read_block.var(var_name)]},
                     attrs={
-                        'out_shape': read_block.var(var_name).shape,
-                        'dtype': read_block.var(var_name).dtype,
-                        self._op_device_key: read_device,
-                        'use_calc_stream': False,
+                        'out_shape':
+                        read_block.var(var_name).shape,
+                        'dtype':
+                        read_block.var(var_name).dtype,
+                        self._op_device_key:
+                        read_device,
+                        'use_calc_stream':
+                        False,
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
-                        self._op_role_key: self._op_role.LRSched,
-                        'peer': write_dev_index,
-                        'ring_id': ring_id
+                        self._op_role_key:
+                        self._op_role.LRSched,
+                        'peer':
+                        write_dev_index,
+                        'ring_id':
+                        ring_id
                     })
                 read_block._insert_op(
                     index=1,
@@ -5778,11 +5809,14 @@ def _process_persistable_vars_in_multi_sections(self, main_program,
                     inputs={'X': [read_block.var(var_name)]},
                     outputs={'Out': [read_block.var(var_name)]},
                     attrs={
-                        self._op_device_key: read_device,
+                        self._op_device_key:
+                        read_device,
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
-                        self._op_role_key: self._op_role.LRSched,
-                        'ring_id': ring_id
+                        self._op_role_key:
+                        self._op_role.LRSched,
+                        'ring_id':
+                        ring_id
                     })
 
     def _is_gradient_clip_op(self, op):
@@ -5891,12 +5925,11 @@ def _mv_head_recv(self, program):
             op_outputs = dict()
             for name in op.output_names:
                 op_outputs[name] = op.output(name)
-            block._insert_op_without_sync(
-                index=insert_index,
-                type=op.type,
-                inputs=op_inputs,
-                outputs=op_outputs,
-                attrs=op.all_attrs())
+            block._insert_op_without_sync(index=insert_index,
+                                          type=op.type,
+                                          inputs=op_inputs,
+                                          outputs=op_outputs,
+                                          attrs=op.all_attrs())
             block._remove_op(i + 1)
             if op_role == int(self._op_role.Forward):
                 forward_insert_index += 1
@@ -6018,7 +6051,7 @@ def device_cmp(device1, device2):
 
         # Step4: Special Case: process persistable vars that exist in
         # multiple sections
-        # FIXME 
+        # FIXME
         # self._process_persistable_vars_in_multi_sections(
         #     main_program, startup_program, program_list)
 
@@ -6034,8 +6067,8 @@ def device_cmp(device1, device2):
                 place_list.append(core.NPUPlace(dev_index % 1))
 
         # Step6: Split startup program
-        new_startup_program = self._split_startup_program(startup_program,
-                                                          self.local_rank)
+        new_startup_program = self._split_startup_program(
+            startup_program, self.local_rank)
 
         startup_program._pipeline_opt = {
             "startup_program": new_startup_program,
@@ -6044,7 +6077,7 @@ def device_cmp(device1, device2):
         if not self.scale_gradient:
             self._insert_loss_scale(real_block)
         if not self.use_sharding:
-            # Step7: clear gradients before each mini-batch and 
+            # Step7: clear gradients before each mini-batch and
             # accumulate gradients during backward
             self._rename_gradient_var_name(real_block)
             real_block._sync_with_cpp()
@@ -6162,11 +6195,12 @@ def _set_checkpoints(self, checkpoints):
         ), "_checkpoints should be a list of Variable or a list of String"
         for ckpt in checkpoints:
             assert (
-                isinstance(ckpt, six.string_types) or isinstance(ckpt, Variable)
+                isinstance(ckpt, six.string_types)
+                or isinstance(ckpt, Variable)
             ), "_checkpoints should be a list of Variable or a list of String"
         self._checkpoints = checkpoints
 
-    # should enable offload before calling backward 
+    # should enable offload before calling backward
     def _enable_offload(self):
         self.enable_offload = True
 
@@ -6300,16 +6334,15 @@ def _append_fill_constant_ops(self, startup_program):
                 dtype=self._main_program.global_block().var(var.name).dtype,
                 persistable=False,
                 stop_gradient=True)
-            block.append_op(
-                type='fill_constant',
-                outputs={'Out': varname},
-                attrs={
-                    "shape": var.shape,
-                    "dtype": var.dtype,
-                    "value": 0.0,
-                    "place_type": 2,
-                    OP_ROLE_KEY: op_role,
-                })
+            block.append_op(type='fill_constant',
+                            outputs={'Out': varname},
+                            attrs={
+                                "shape": var.shape,
+                                "dtype": var.dtype,
+                                "value": 0.0,
+                                "place_type": 2,
+                                OP_ROLE_KEY: op_role,
+                            })
 
         return
 
@@ -6343,7 +6376,7 @@ def _insert_offload_op(self, idx, varname):
         self._insert_async_memcpy_op(idx, varname, pinned_varname, 0, 2)
 
     def _insert_sync_op(self, op_idx, checkpoint_name):
-        # single stream offload no need sync 
+        # single-stream offload needs no sync
         pass
 
     def _record_fetch_op(self, idx):
@@ -6372,7 +6405,7 @@ def _record_sync_op(self, idx, checkpoint_name):
     def _parse_backward(self):
 
         self.idx2insertions = {}
-        # don't offload the last checkpoints, to favor throughput        
+        # don't offload the last checkpoint, to favor throughput
         self.un_fetch_checkpoint_names = self.sorted_checkpoint_names[:]
         self.un_fetch_checkpoint_names.pop(-1)
         need_fetch_checkpoint_names = self.un_fetch_checkpoint_names[:]
@@ -6405,12 +6438,12 @@ def _parse_backward(self):
                         if self.checkpoint_usage_count[input_var] == 0:
                             # TODO (JZ-LIANG) sync memcpy_stream if extra stream for memcpy
                             second_to_last_fetch_checkpoint = fetched_checkpoint_varname
-                            # there is NO fetch ahead the first checkpoint 
+                            # there is NO fetch ahead of the first checkpoint
                             if input_var != self.sorted_checkpoint_names[0]:
                                 fetched_checkpoint_varname = self._record_fetch_op(
                                     idx)
 
-                        # should check the current used checkpoint is ths last fetch one 
+                        # should check the current used checkpoint is the last fetch one
                         assert second_to_last_fetch_checkpoint == input_var, "Current recompute segment should use [{}] BUT got [{}]".format(
                             second_to_last_fetch_checkpoint, input_var)
                         # rename
@@ -6436,8 +6469,8 @@ def _update_backward(self):
                 operation, checkpoint_name = self.idx2insertions[op_idx]
                 if operation == "fetch":
                     self._insert_fetch_op(op_idx, checkpoint_name)
-                    logging.debug("Insert [{}] fetch op.".format(
-                        checkpoint_name))
+                    logging.debug(
+                        "Insert [{}] fetch op.".format(checkpoint_name))
                     del self.idx2insertions[op_idx]
                 elif operation == "sync":
                     self._insert_sync_op(op_idx, checkpoint_name)
@@ -6450,7 +6483,7 @@ def _update_backward(self):
     def _parse_forward(self):
 
         self.idx2insertions = {}
-        # don't offload the last checkpoints, faster, less memory saving       
+        # don't offload the last checkpoint: faster, but saves less memory
         self.un_offload_checkpoint_names = self.sorted_checkpoint_names[:]
         last_checkpoint = self.un_offload_checkpoint_names.pop(-1)
         need_offload_checkpoint_names = self.un_offload_checkpoint_names[:]
@@ -6471,8 +6504,8 @@ def _parse_forward(self):
             self.block.ops), "Could NOT found Forward op in prog"
         last_offload_checkpoint = None
 
-        for i, op in enumerate(self.block.ops[self.fw_strart_op_idx:
-                                              self.bw_strart_op_idx]):
+        for i, op in enumerate(
+                self.block.ops[self.fw_strart_op_idx:self.bw_strart_op_idx]):
 
             idx = self.fw_strart_op_idx + i
             output_vars = op.desc.output_arg_names()
@@ -6504,8 +6537,8 @@ def _parse_forward(self):
                         last_offload_checkpoint = output_var
                     else:
                         raise ValueError(
-                            "There should be just ONE op that output checkpoint [{}]".
-                            format(output_var))
+                            "There should be just ONE op that output checkpoint [{}]"
+                            .format(output_var))
                 # need to sync the last need to offload checkpoint before the last checkpoint as output op
                 if output_var == last_checkpoint:
                     assert len(
@@ -6527,7 +6560,7 @@ def _parse_forward(self):
                             last_offload_checkpoint)
                         self._record_sync_op(last_usage_idx + 1,
                                              last_offload_checkpoint)
-            # record checkpoint usage  
+            # record checkpoint usage
             for input_var in input_vars:
                 if input_var in need_offload_checkpoint_names:
                     assert input_var not in self.synced_checkpoints, "checkpoint [{}] used after sync".format(
@@ -6552,13 +6585,13 @@ def _update_forward(self):
                 operation, checkpoint_name = self.idx2insertions[op_idx]
                 if operation == "offload":
                     self._insert_offload_op(op_idx, checkpoint_name)
-                    logging.debug("Insert [{}] offload op.".format(
-                        checkpoint_name))
+                    logging.debug(
+                        "Insert [{}] offload op.".format(checkpoint_name))
                     del self.idx2insertions[op_idx]
                 elif operation == "sync":
                     self._insert_sync_op(op_idx, checkpoint_name)
-                    logging.debug("Insert [{}] offload_sync op.".format(
-                        checkpoint_name))
+                    logging.debug(
+                        "Insert [{}] offload_sync op.".format(checkpoint_name))
                     del self.idx2insertions[op_idx]
 
         self.block._sync_with_cpp()
@@ -6585,11 +6618,11 @@ def _offload(self, loss, startup_program=None):
 
         with program_guard(self._main_program, startup_program):
             assert len(self.checkpoint_shape) > 0, (
-                "checkpoints shape {} should be an non empty list like: [12, 512, 1024]".
-                format(self.checkpoint_shape))
+                "checkpoints shape {} should be an non empty list like: [12, 512, 1024]"
+                .format(self.checkpoint_shape))
             assert all([ele > 0 for ele in self.checkpoint_shape]), (
-                "all ele in checkpoints shape {} should be a determined integer larger than 0".
-                format(self.checkpoint_shape))
+                "all ele in checkpoints shape {} should be a determined integer larger than 0"
+                .format(self.checkpoint_shape))
             self.checkpoint_name2pinned_name = dict()
             self.checkpoint_name2fetch_name = dict()
             for checkpoint_varname in self.sorted_checkpoint_names:
@@ -6659,8 +6692,8 @@ def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                     no_grad_set=None)
                 print("Finished backward")
         """
-        assert (self._checkpoints is not None
-                ), "You should call _set_checkpoints first"
+        assert (self._checkpoints
+                is not None), "You should call _set_checkpoints first"
 
         if framework._non_static_mode():
             raise NotImplementedError(
@@ -6684,11 +6717,10 @@ def mlp(input_x, input_y, hid_dim=128, label_dim=2):
                     no_grad_set,
                     checkpoints=checkpoint_vars)
             else:
-                params_grads = append_backward(
-                    loss,
-                    parameter_list,
-                    no_grad_set,
-                    checkpoints=checkpoint_vars)
+                params_grads = append_backward(loss,
+                                               parameter_list,
+                                               no_grad_set,
+                                               checkpoints=checkpoint_vars)
 
         if self.enable_offload:
             self.sorted_checkpoint_names = sorted_checkpoint_names
@@ -6738,8 +6770,9 @@ def mlp(input_x, input_y, hid_dim=128, label_dim=2):
         func = self._optimizer.apply_optimize if hasattr(
             self._optimizer,
             'apply_optimize') else self._optimizer._apply_optimize
-        return func(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        return func(loss,
+                    startup_program=startup_program,
+                    params_grads=params_grads)
 
     def minimize(self,
                  loss,
@@ -6747,19 +6780,19 @@ def minimize(self,
                  parameter_list=None,
                  no_grad_set=None):
         assert isinstance(loss, Variable), "The loss should be an Variable."
-        assert (self._checkpoints is not None
-                ), "You should call _set_checkpoints first"
+        assert (self._checkpoints
+                is not None), "You should call _set_checkpoints first"
         if framework._non_static_mode():
             raise NotImplementedError(
                 "DyGraph current does not support recompute")
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        params_grads = self.backward(loss,
+                                     startup_program=startup_program,
+                                     parameter_list=parameter_list,
+                                     no_grad_set=no_grad_set)
 
-        optimize_ops = self.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        optimize_ops = self.apply_optimize(loss,
+                                           startup_program=startup_program,
+                                           params_grads=params_grads)
 
         return optimize_ops, params_grads
 
@@ -6857,11 +6890,10 @@ def minimize(self, loss, startup_program=None):
         for param in params:
             fast_var = main_block.var(param)
             assert (fast_var is not None)
-            slow_var = main_block.create_var(
-                name=param + "@SLOW",
-                shape=fast_var.shape,
-                dtype=fast_var.dtype,
-                persistable=True)
+            slow_var = main_block.create_var(name=param + "@SLOW",
+                                             shape=fast_var.shape,
+                                             dtype=fast_var.dtype,
+                                             persistable=True)
             param_to_slow[param] = slow_var
 
         # add some vars to the startup_program
@@ -6869,49 +6901,46 @@ def minimize(self, loss, startup_program=None):
         for param in params:
             fast_var = startup_block.var(param)
             assert (fast_var is not None)
-            slow_var = startup_block.create_var(
-                name=param + "@SLOW",
-                shape=fast_var.shape,
-                dtype=fast_var.dtype,
-                persistable=True)
+            slow_var = startup_block.create_var(name=param + "@SLOW",
+                                                shape=fast_var.shape,
+                                                dtype=fast_var.dtype,
+                                                persistable=True)
 
-            startup_block.append_op(
-                type="assign",
-                inputs={"X": fast_var},
-                outputs={"Out": slow_var})
+            startup_block.append_op(type="assign",
+                                    inputs={"X": fast_var},
+                                    outputs={"Out": slow_var})
 
         with framework.program_guard(main_block.program, startup_program):
             # Add Var k to main prog and startup prog
-            k = layers.create_global_var(
-                name="lookahead_k",
-                shape=[1],
-                value=int(self.k),
-                dtype='int32',
-                persistable=True)
+            k = layers.create_global_var(name="lookahead_k",
+                                         shape=[1],
+                                         value=int(self.k),
+                                         dtype='int32',
+                                         persistable=True)
 
             # Add Var alpha to main prog and startup prog
-            alpha = layers.create_global_var(
-                name="lookahead_alpha",
-                shape=[1],
-                value=float(self.alpha),
-                dtype='float32',
-                persistable=True)
+            alpha = layers.create_global_var(name="lookahead_alpha",
+                                             shape=[1],
+                                             value=float(self.alpha),
+                                             dtype='float32',
+                                             persistable=True)
 
             # Add Var step
-            step = layers.create_global_var(
-                name="lookahead_step",
-                shape=[1],
-                value=int(0),
-                dtype='int32',
-                persistable=True)
+            step = layers.create_global_var(name="lookahead_step",
+                                            shape=[1],
+                                            value=int(0),
+                                            dtype='int32',
+                                            persistable=True)
             layers.increment(x=step, value=1.0, in_place=True)
 
             # lookahead
-            zero_var = layers.fill_constant(
-                shape=[1], dtype='float32', value=0.0)
+            zero_var = layers.fill_constant(shape=[1],
+                                            dtype='float32',
+                                            value=0.0)
 
-            one_var = layers.fill_constant(
-                shape=[1], dtype='float32', value=1.0)
+            one_var = layers.fill_constant(shape=[1],
+                                           dtype='float32',
+                                           value=1.0)
 
             mod = layers.elementwise_mod(step, k)
             with layers.control_flow.Switch() as switch:
@@ -7001,8 +7030,8 @@ def __init__(self, inner_optimizer, k_steps=1, avg=True):
                 "and one-time optimizer.minimize()")
 
         assert (inner_optimizer is not None), "inner optimizer can not be None"
-        assert (isinstance(k_steps, int) and
-                k_steps > 0), "k_steps should be a positive integer"
+        assert (isinstance(k_steps, int)
+                and k_steps > 0), "k_steps should be a positive integer"
 
         self.inner_optimizer = inner_optimizer
         self.k_steps = k_steps
@@ -7089,51 +7118,53 @@ def _add_gm_op_role_var(self, op, param, grad, cond):
 
     def _get_gm_cond_var(self, main_block):
         # Add const var
-        k_step_var = layers.create_global_var(
-            name="gradient_merge_k",
-            shape=[1],
-            value=int(self.k_steps),
-            dtype='int32',
-            persistable=True,
-            force_cpu=True)
-
-        zero_var = layers.create_global_var(
-            name="gradient_merge_zero",
-            shape=[1],
-            value=int(0),
-            dtype='int32',
-            persistable=True,
-            force_cpu=True)
+        k_step_var = layers.create_global_var(name="gradient_merge_k",
+                                              shape=[1],
+                                              value=int(self.k_steps),
+                                              dtype='int32',
+                                              persistable=True,
+                                              force_cpu=True)
+
+        zero_var = layers.create_global_var(name="gradient_merge_zero",
+                                            shape=[1],
+                                            value=int(0),
+                                            dtype='int32',
+                                            persistable=True,
+                                            force_cpu=True)
 
         # Add step var & cond var
-        step_var = layers.create_global_var(
-            name="gradient_merge_step",
-            shape=[1],
-            value=int(0),
-            dtype='int32',
-            persistable=True,
-            force_cpu=True)
+        step_var = layers.create_global_var(name="gradient_merge_step",
+                                            shape=[1],
+                                            value=int(0),
+                                            dtype='int32',
+                                            persistable=True,
+                                            force_cpu=True)
 
-        cond_var = main_block.create_var(
-            name="gradient_merge_cond", shape=[1], dtype='bool')
+        cond_var = main_block.create_var(name="gradient_merge_cond",
+                                         shape=[1],
+                                         dtype='bool')
 
         with device_guard("cpu"):
             # step_var = (step_var + 1) % k_step
             layers.increment(x=step_var, value=1.0, in_place=True)
-            main_block.append_op(
-                type='elementwise_mod',
-                inputs={'X': step_var,
-                        'Y': k_step_var},
-                outputs={'Out': step_var},
-                attrs={'axis': -1,
-                       'use_mkldnn': False})
+            main_block.append_op(type='elementwise_mod',
+                                 inputs={
+                                     'X': step_var,
+                                     'Y': k_step_var
+                                 },
+                                 outputs={'Out': step_var},
+                                 attrs={
+                                     'axis': -1,
+                                     'use_mkldnn': False
+                                 })
 
             # cond_var = (step_var == 0)
-            main_block.append_op(
-                type='equal',
-                inputs={'X': step_var,
-                        'Y': zero_var},
-                outputs={'Out': cond_var})
+            main_block.append_op(type='equal',
+                                 inputs={
+                                     'X': step_var,
+                                     'Y': zero_var
+                                 },
+                                 outputs={'Out': cond_var})
 
         return cond_var
 
@@ -7165,11 +7196,11 @@ def apply_gradients(self, params_grads):
             param_name = param.name
             param_var = main_block.var(param_name)
             assert (param_var is not None)
-            gradient_merge_var = main_block.create_var(
-                name=param_name + "@GRAD@GradientMerge",
-                shape=param_var.shape,
-                dtype=param_var.dtype,
-                persistable=True)
+            gradient_merge_var = main_block.create_var(name=param_name +
+                                                       "@GRAD@GradientMerge",
+                                                       shape=param_var.shape,
+                                                       dtype=param_var.dtype,
+                                                       persistable=True)
             param_to_gradient_merge[param_name] = gradient_merge_var
 
             startup_gradient_merge_var = startup_block.create_var(
@@ -7177,23 +7208,26 @@ def apply_gradients(self, params_grads):
                 shape=param_var.shape,
                 dtype=param_var.dtype,
                 persistable=True)
-            startup_block.append_op(
-                type="fill_constant",
-                outputs={"Out": startup_gradient_merge_var},
-                attrs={
-                    "shape": param_var.shape,
-                    "dtype": param_var.dtype,
-                    "value": float(0),
-                })
+            startup_block.append_op(type="fill_constant",
+                                    outputs={"Out": startup_gradient_merge_var},
+                                    attrs={
+                                        "shape": param_var.shape,
+                                        "dtype": param_var.dtype,
+                                        "value": float(0),
+                                    })
 
             # grad_merge += grad
             new_grad_op = main_block.append_op(
                 type="elementwise_add",
-                inputs={'X': grad,
-                        'Y': gradient_merge_var},
+                inputs={
+                    'X': grad,
+                    'Y': gradient_merge_var
+                },
                 outputs={'Out': gradient_merge_var},
-                attrs={'axis': -1,
-                       'use_mkldnn': False})
+                attrs={
+                    'axis': -1,
+                    'use_mkldnn': False
+                })
             self._add_gm_op_role_var(new_grad_op, param, gradient_merge_var,
                                      cond)
             new_params_grads.append([param, gradient_merge_var])
@@ -7209,15 +7243,14 @@ def true_apply_gradient():
             if self.avg:
                 for param, new_grad in new_params_grads:
                     # grad /= k_steps
-                    cur_block.append_op(
-                        type='scale',
-                        inputs={'X': new_grad},
-                        outputs={'Out': new_grad},
-                        attrs={
-                            'scale': 1.0 / self.k_steps,
-                            'bias': 0.0,
-                            'bias_after_scale': False
-                        })
+                    cur_block.append_op(type='scale',
+                                        inputs={'X': new_grad},
+                                        outputs={'Out': new_grad},
+                                        attrs={
+                                            'scale': 1.0 / self.k_steps,
+                                            'bias': 0.0,
+                                            'bias_after_scale': False
+                                        })
                     new_grad.op._set_attr(op_maker.kOpRoleAttrName(),
                                           op_maker.OpRole.Backward)
 
@@ -7233,11 +7266,10 @@ def true_apply_gradient():
 
             # clear gradient_merge_vars
             for param, new_grad in new_params_grads:
-                layers.fill_constant(
-                    shape=new_grad.shape,
-                    dtype=new_grad.dtype,
-                    value=0.0,
-                    out=new_grad)
+                layers.fill_constant(shape=new_grad.shape,
+                                     dtype=new_grad.dtype,
+                                     value=0.0,
+                                     out=new_grad)
                 new_grad.op._set_attr(op_maker.kOpRoleAttrName(),
                                       op_maker.OpRole.Optimize)
 
@@ -7253,13 +7285,13 @@ def minimize(self,
                  no_grad_set=None):
         assert isinstance(loss, Variable), "The loss should be an Variable."
 
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameter_list=parameter_list,
-            no_grad_set=no_grad_set)
+        params_grads = self.backward(loss,
+                                     startup_program=startup_program,
+                                     parameter_list=parameter_list,
+                                     no_grad_set=no_grad_set)
 
-        optimize_ops = self.apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        optimize_ops = self.apply_optimize(loss,
+                                           startup_program=startup_program,
+                                           params_grads=params_grads)
 
         return optimize_ops, params_grads
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index a10ce1ce808f6..6580c82536a61 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -295,12 +295,12 @@ def __init__(self,
                  trainable=True,
                  do_model_average=False,
                  need_clip=True):
-        super(WeightNormParamAttr, self).__init__(
-            name=name,
-            initializer=initializer,
-            learning_rate=learning_rate,
-            regularizer=regularizer,
-            trainable=trainable,
-            do_model_average=do_model_average,
-            need_clip=need_clip)
+        super(WeightNormParamAttr,
+              self).__init__(name=name,
+                             initializer=initializer,
+                             learning_rate=learning_rate,
+                             regularizer=regularizer,
+                             trainable=trainable,
+                             do_model_average=do_model_average,
+                             need_clip=need_clip)
         self.dim = dim
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 4d39d38853063..5739cdb2f593c 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -42,8 +42,8 @@
     since="2.3.0",
     update_to="paddle.profiler.Profiler",
     level=1,
-    reason="Please use new profiler tool, this profiler tool is no longer maintained."
-)
+    reason=
+    "Please use new profiler tool, this profiler tool is no longer maintained.")
 @signature_safe_contextmanager
 def cuda_profiler(output_file, output_mode=None, config=None):
     """
@@ -121,8 +121,8 @@ def npu_profiler(output_file, config=None):
     since="2.3.0",
     update_to="paddle.profiler.Profiler",
     level=1,
-    reason="Please use new profiler tool, this profiler tool is no longer maintained."
-)
+    reason=
+    "Please use new profiler tool, this profiler tool is no longer maintained.")
 def reset_profiler():
     """
     Clear the previous time record. It works for
@@ -149,8 +149,8 @@ def reset_profiler():
     since="2.3.0",
     update_to="paddle.profiler.Profiler",
     level=1,
-    reason="Please use new profiler tool, this profiler tool is no longer maintained."
-)
+    reason=
+    "Please use new profiler tool, this profiler tool is no longer maintained.")
 def start_profiler(state, tracer_option='Default'):
     """
     Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
@@ -223,8 +223,8 @@ def start_profiler(state, tracer_option='Default'):
     since="2.3.0",
     update_to="paddle.profiler.Profiler",
     level=1,
-    reason="Please use new profiler tool, this profiler tool is no longer maintained."
-)
+    reason=
+    "Please use new profiler tool, this profiler tool is no longer maintained.")
 def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
     """
     Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
@@ -286,8 +286,8 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
     since="2.3.0",
     update_to="paddle.profiler.Profiler",
     level=1,
-    reason="Please use new profiler tool, this profiler tool is no longer maintained."
-)
+    reason=
+    "Please use new profiler tool, this profiler tool is no longer maintained.")
 @signature_safe_contextmanager
 def profiler(state,
              sorted_key=None,
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 3ea3af9ed1cb5..ff299bcca9ba3 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -120,6 +120,7 @@ def _reader_process_loop(batch_reader, data_queue):
 
 
 class DataLoaderBase(object):
+
     def __init__(self):
         self._places = None
 
@@ -155,6 +156,7 @@ def _check_input_array(cls, item):
 
 
 class AuToTune(object):
+
     def __init__(self, loader):
         self.loader = loader
         self.max_num_worker = multiprocessing.cpu_count() / 2
@@ -172,12 +174,12 @@ def __call__(self):
         # pick the best num_workers
         auto_tune_start = time.time()
         logging.debug("========= DataLoader Auto Tune =========")
-        logging.debug("User config for DataLoader: " + str(
-            self.loader.num_workers))
+        logging.debug("User config for DataLoader: " +
+                      str(self.loader.num_workers))
         best_num_workers = 0
         min_cost = float("inf")
-        logging.debug("Tuning Range for num_workers: 0 ~ " + str(
-            self.max_num_worker))
+        logging.debug("Tuning Range for num_workers: 0 ~ " +
+                      str(self.max_num_worker))
         num_workers = 0
         while num_workers < self.max_num_worker:
             auto_tune_loader.num_workers = num_workers
@@ -195,10 +197,10 @@ def __call__(self):
             logging.debug("num_workers: " + str(num_workers) + " avg_cost: " +
                           str(avg_cost))
             num_workers += 2
-        logging.info("auto_tune dataLoader best_num_workers: " + str(
-            best_num_workers))
-        logging.debug("AutoTuning Cost for DataLoader: " + str(time.time(
-        ) - auto_tune_start) + ' seconds')
+        logging.info("auto_tune dataLoader best_num_workers: " +
+                     str(best_num_workers))
+        logging.debug("AutoTuning Cost for DataLoader: " +
+                      str(time.time() - auto_tune_start) + ' seconds')
 
         # tune the default loader's num_workers
         return best_num_workers
@@ -314,56 +316,58 @@ class DataLoader(object):
         dataset(Dataset): the dataset to load data from, should be an
             instance of subclass of :code:`paddle.io.Dataset` or
             :code:`paddle.io.IterableDataset`.
-        feed_list (list(Tensor)|tuple(Tensor)): feed Tensor list.
+        feed_list (list(Tensor)|tuple(Tensor), optional): feed Tensor list.
             The Tensors should be created by :code:`paddle.static.data()`.
             :attr:`feed_list` must be set if :attr:`return_list` is
             False. Default None.
-        places(list(Place)|tuple(Place)|list(str)|optional): a list of Place,
+        places(list(Place)|tuple(Place)|list(str), optional): a list of Place,
             to put data onto, :attr:`places` can be None, if 
             :attr:`places` is None, default place(CPUPlace or CUDAPlace(0))
             will be used. Default None. If ``places`` is list of string,
             the string in the list can be ``cpu``, ``gpu:x`` and ``gpu_pinned``,
             where ``x`` is the index of the GPUs.
-        return_list (bool): whether the return value on each device is 
+        return_list (bool, optional): whether the return value on each device is 
             presented as a list. If :attr:`return_list=False`, the return
             value on each device would be a dict of str -> Tensor, where
             the key of the dict is the name of each fed Tensors. If 
             :attr:`return_list=True`, the return value on each device would
             be a list(Tensor). :attr:`return_list` can only be True
             in dynamic graph mode. Default True.
-        batch_sampler(BatchSampler): an instance of `paddle.io.BatchSampler`
+        batch_sampler(BatchSampler, optional): an instance of `paddle.io.BatchSampler`
             to generate batch indices to draw samples from :attr:`dataset`
             and combine a batch. Default None.
-        batch_size(int|None): sample number in a mini-batch, a substitution
+        batch_size(int|None, optional): sample number in a mini-batch, a substitution
             parameter for :attr:`batch_sampler`, if :attr:`batch_sampler`
             is not set, a default `paddle.io.BatchSampler` will be used
             and initialize by :attr:`batch_size`, :attr:`shuffle` and
             :attr:`drop_last`. Default 1.
-        shuffle(bool): whther to shuffle indices order before genrate
+        shuffle(bool, optional): whether to shuffle indices order before generating
             batch indices, a substitution parameter for :attr:`batch_sampler`
             see :attr:`batch_size`. Default False.
-        drop_last(bool): whether drop the last incomplete batch dataset size
+        drop_last(bool, optional): whether to drop the last incomplete batch when dataset size
             is not divisible by the batch size, a substitution parameter
             for :attr:`batch_sampler`, see :attr:`batch_size`. Default False
-        collate_fn(callable): function to generate mini-batch data by merging
+        collate_fn(callable, optional): function to generate mini-batch data by merging
             the sample list, None for only stack each fields of sample in axis
             0(same as :attr::`np.stack(..., axis=0)`). Default None
-        num_workers(int): the number of subprocess to load data, 0 for no
+        num_workers(int, optional): the number of subprocesses to load data, 0 for no
             subprocess used and loading data in main process. Default 0
-        use_buffer_reader (bool): whether to use bufferred reader. 
-            If use_buffer_reader=True, the DataLoader would prefetch next 
+        use_buffer_reader (bool, optional): whether to use buffered reader. 
+            If use_buffer_reader=True, the DataLoader would prefetch
             batch data asynchronously, so it would speed up data feeding 
             and occupies a little more CPU or GPU memory, i.e., the memory
             of one batch input data. Default True.
-        use_shared_memory (bool): whether to use shared memory to speed up
+        prefetch_factor (int, optional): Number of batch data the DataLoader would prefetch
+            if use_buffer_reader=True. Default 2.
+        use_shared_memory (bool, optional): whether to use shared memory to speed up
             putting data into inter-process queue, set :attr:`use_shared_memory`
             as True only when the shared memory space on your machine(e.g.
             space of '/dev/shm' on Linux operating sysytem) is large enough.
             Shared memory will only be enabled in multi-process mode(num_workers
             > 0). Default True.
-        timeout(int): the timeout value for getting data form output queue
+        timeout(int, optional): the timeout value for getting data from output queue
             of subprocesses. Default 0.
-        worker_init_fn(callable): init function which will be called with
+        worker_init_fn(callable, optional): init function which will be called with
             worker id on each subproces starting if not set as None. Default
             None.
 
@@ -450,6 +454,7 @@ def __init__(self,
                  collate_fn=None,
                  num_workers=0,
                  use_buffer_reader=True,
+                 prefetch_factor=2,
                  use_shared_memory=True,
                  timeout=0,
                  worker_init_fn=None,
@@ -457,6 +462,7 @@ def __init__(self,
         self.return_list = return_list
         self.collate_fn = collate_fn
         self.use_buffer_reader = use_buffer_reader
+        self.prefetch_factor = prefetch_factor
         self.worker_init_fn = worker_init_fn
 
         self.dataset = dataset
@@ -475,14 +481,16 @@ def __init__(self,
         self.places = _convert_places(places)
 
         assert num_workers >= 0, "num_workers should be a non-negative value"
-        if num_workers > 0 and (sys.platform == 'darwin' or
-                                sys.platform == 'win32'):
+        if num_workers > 0 and (sys.platform == 'darwin'
+                                or sys.platform == 'win32'):
             warnings.warn(
                 "DataLoader with multi-process mode is not supported on MacOs and Windows currently." \
                 " Please use signle-process mode with num_workers = 0 instead")
             num_workers = 0
         self.num_workers = num_workers
 
+        assert prefetch_factor > 0, "prefetch_factor should be a positive value"
+
         self.use_shared_memory = use_shared_memory
         if use_shared_memory and num_workers == 0:
             self.use_shared_memory = False
@@ -517,14 +525,13 @@ def __init__(self,
                 "batch_sampler is not given"
             self.batch_size = batch_size
             if isinstance(dataset, IterableDataset):
-                self.batch_sampler = _InfiniteIterableSampler(dataset,
-                                                              batch_size)
+                self.batch_sampler = _InfiniteIterableSampler(
+                    dataset, batch_size)
             else:
-                self.batch_sampler = BatchSampler(
-                    dataset=dataset,
-                    batch_size=batch_size,
-                    shuffle=shuffle,
-                    drop_last=drop_last)
+                self.batch_sampler = BatchSampler(dataset=dataset,
+                                                  batch_size=batch_size,
+                                                  shuffle=shuffle,
+                                                  drop_last=drop_last)
 
         self.drop_last = drop_last
         self.auto_collate_batch = self.batch_sampler is not None
@@ -972,8 +979,8 @@ def __init__(self,
 
         # NOTE: the multiprocessing in different platform is incompatible, we will solve it later
         self._use_multiprocess = use_multiprocess
-        if self._use_multiprocess and (sys.platform == 'darwin' or
-                                       sys.platform == 'win32'):
+        if self._use_multiprocess and (sys.platform == 'darwin'
+                                       or sys.platform == 'win32'):
             warnings.warn(
                 "NOTE: DygraphGeneratorLoader with multiprocess mode is not currently supported on MacOs and Windows."
             )
@@ -989,7 +996,7 @@ def __init__(self,
         self._blocking_queue = None
         # NOTE: 1. In multiprocess mode, this thread is used to get next batch data from
         # self._data_queue, then push it into self._blocking_queue; 2. In singleprocess
-        # mode, this thread is used to get next batch data from self._batch_reader, then 
+        # mode, this thread is used to get next batch data from self._batch_reader, then
         # push it into self._blocking_queue
         self._thread = None
         self._pin_memory = True if use_pinned_memory(
@@ -1037,10 +1044,12 @@ def _init_iterable(self):
         self._blocking_queue = core.init_lod_tensor_blocking_queue(
             core.Variable(), self._capacity, False)
         self._reader = None
-        self._reader = core.create_py_reader(
-            self.queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_double_buffer, True,
-            self._pin_memory)
+        self._reader = core.create_py_reader(self.queue, self._var_names,
+                                             self._shapes, self._dtypes,
+                                             self._need_check_feed,
+                                             self._places,
+                                             self._use_double_buffer, True,
+                                             self._pin_memory)
 
     def _start(self):
         if self._use_multiprocess:
@@ -1051,17 +1060,17 @@ def _start(self):
             # add _data_queue into global queue set
             global multiprocess_queue_set
             multiprocess_queue_set.add(self._data_queue)
-            self._process = multiprocessing.Process(
-                target=_reader_process_loop,
-                args=(self._batch_reader, self._data_queue))
+            self._process = multiprocessing.Process(target=_reader_process_loop,
+                                                    args=(self._batch_reader,
+                                                          self._data_queue))
             self._process.daemon = True
             self._process.start()
 
             # Set child process signal handler
             # NOTE: [ avoiding hang ] 1. if the child process dies due to bus error/segfault
-            # or just hang, the main process will hang waiting for data, so here need to deal 
+            # or just hang, the main process will hang waiting for data, so here need to deal
             # with SIGSEGV and SIGBUS of child process; 2. if the main process end before child
-            # process, it shuts the all its daemonic children down with a SIGTERM (instead of 
+            # process, it shuts all its daemonic children down with a SIGTERM (instead of
             # joining them without a timeout), so here nedd to deal with SIGTERM.
             core._set_process_pids(id(self), [self._process.pid])
             _set_SIGCHLD_handler()
@@ -1121,10 +1130,10 @@ def _reader_thread_loop_for_multiprocess(self, legacy_expected_place):
 
         while not self._thread_done_event.is_set():
             try:
-                # NOTE: [ avoid hanging ] Even with carefully designed data dependencies 
-                # (i.e., a put() always corresponding to a get()), hanging on get() can 
-                # still happen when data in queue is corrupted (e.g., due to 
-                # Queue.cancel_join_thread or unexpected exit). So we set a timeout whenever 
+                # NOTE: [ avoid hanging ] Even with carefully designed data dependencies
+                # (i.e., a put() always corresponding to a get()), hanging on get() can
+                # still happen when data in queue is corrupted (e.g., due to
+                # Queue.cancel_join_thread or unexpected exit). So we set a timeout whenever
                 # we try to get data from `data_queue`
                 # NOTE: [ avoid failed quickly ] Here, the time setting of QUEUE_GET_TIMEOUT
                 # is relatively long, currently it is 60 seconds, because in some models,
@@ -1195,10 +1204,10 @@ def set_sample_generator(self,
             places = _get_paddle_place_list(places)
         else:
             places = _get_paddle_place(places)
-        self.set_sample_list_generator(
-            paddle.batch(
-                reader, batch_size=batch_size, drop_last=drop_last),
-            places=places)
+        self.set_sample_list_generator(paddle.batch(reader,
+                                                    batch_size=batch_size,
+                                                    drop_last=drop_last),
+                                       places=places)
         return self
 
     def set_sample_list_generator(self, reader, places=None):
@@ -1236,6 +1245,7 @@ def set_batch_generator(self, reader, places=None):
 
 
 class GeneratorLoader(DataLoaderBase):
+
     def __init__(self,
                  feed_list=None,
                  capacity=None,
@@ -1281,10 +1291,12 @@ def _init_iterable(self):
         self._queue = core.init_lod_tensor_blocking_queue(
             core.Variable(), self._capacity, self._keep_order)
         self._reader = None
-        self._reader = core.create_py_reader(
-            self.queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_double_buffer,
-            self._drop_last, False)
+        self._reader = core.create_py_reader(self.queue, self._var_names,
+                                             self._shapes, self._dtypes,
+                                             self._need_check_feed,
+                                             self._places,
+                                             self._use_double_buffer,
+                                             self._drop_last, False)
 
     def _init_non_iterable(self):
         lod_levels = []
@@ -1308,8 +1320,8 @@ def _init_non_iterable(self):
         double_buffer_name = data_loader_unique_name_generator('double_buffer')
 
         var = global_scope().var(queue_name)
-        self._queue = core.init_lod_tensor_blocking_queue(var, self._capacity,
-                                                          self._keep_order)
+        self._queue = core.init_lod_tensor_blocking_queue(
+            var, self._capacity, self._keep_order)
 
         if self._keep_order:
             block = default_main_program().current_block()
@@ -1319,17 +1331,16 @@ def _init_non_iterable(self):
         reader_var = block.create_var(name=reader_name)
 
         dtype_int = [int(t) for t in dtypes]
-        block.append_op(
-            type='create_py_reader',
-            inputs={'blocking_queue': [queue_name]},
-            outputs={'Out': [reader_var]},
-            attrs={
-                'shape_concat': shape_concat,
-                'lod_levels': lod_levels,
-                'dtypes': dtype_int,
-                'need_check_feed': need_check_feed,
-                'ranks': ranks
-            })
+        block.append_op(type='create_py_reader',
+                        inputs={'blocking_queue': [queue_name]},
+                        outputs={'Out': [reader_var]},
+                        attrs={
+                            'shape_concat': shape_concat,
+                            'lod_levels': lod_levels,
+                            'dtypes': dtype_int,
+                            'need_check_feed': need_check_feed,
+                            'ranks': ranks
+                        })
 
         reader_var.desc.set_dtypes(dtypes)
         reader_var.persistable = True
@@ -1349,8 +1360,8 @@ def _init_non_iterable(self):
             reader = monkey_patch_reader_methods(main_prog_var)
 
         if self._use_double_buffer:
-            double_buffer_reader = double_buffer(
-                reader, name=double_buffer_name)
+            double_buffer_reader = double_buffer(reader,
+                                                 name=double_buffer_name)
             # we return a double buffer reader. However, the reset method comes from
             # py_reader.
             double_buffer_reader.reset = reader.reset
@@ -1404,6 +1415,7 @@ def reset(self):
         self._reset()
 
     def _start(self):
+
         def __thread_main__(legacy_expected_place):
             try:
                 # See _DataLoaderIterSingleProcess._thread_loop() for why set expected place here.
@@ -1435,8 +1447,8 @@ def __thread_main__(legacy_expected_place):
                 logging.warning('Your reader has raised an exception!')
                 six.reraise(*sys.exc_info())
 
-        self._thread = threading.Thread(
-            target=__thread_main__, args=(_current_expected_place(), ))
+        self._thread = threading.Thread(target=__thread_main__,
+                                        args=(_current_expected_place(), ))
         self._thread.daemon = True
         self._thread.start()
 
@@ -1467,17 +1479,16 @@ def set_sample_generator(self,
                 break
 
         if has_lod:
-            self.set_sample_list_generator(
-                paddle.batch(
-                    reader, batch_size=batch_size, drop_last=drop_last),
-                places=places)
+            self.set_sample_list_generator(paddle.batch(reader,
+                                                        batch_size=batch_size,
+                                                        drop_last=drop_last),
+                                           places=places)
         else:
-            reader = BatchedTensorProvider(
-                feed_list=self._feed_list,
-                place=core.CPUPlace(),
-                batch_size=batch_size,
-                generator=reader,
-                drop_last=drop_last)
+            reader = BatchedTensorProvider(feed_list=self._feed_list,
+                                           place=core.CPUPlace(),
+                                           batch_size=batch_size,
+                                           generator=reader,
+                                           drop_last=drop_last)
             self.set_batch_generator(reader, places=places)
         return self
 
@@ -1487,8 +1498,8 @@ def set_sample_list_generator(self, reader, places=None):
         else:
             places = _get_paddle_place(places)
         with program_guard(Program(), Program()):
-            feeder = DataFeeder(
-                feed_list=self._feed_list, place=core.CPUPlace())
+            feeder = DataFeeder(feed_list=self._feed_list,
+                                place=core.CPUPlace())
             paddle_reader = feeder.decorate_reader(reader, multi_devices=False)
 
         def __tensor_reader_impl__():
@@ -1688,8 +1699,9 @@ def __init__(self,
                  use_double_buffer=True,
                  iterable=True,
                  return_list=False):
-        self._loader = DataLoader.from_generator(
-            feed_list, capacity, use_double_buffer, iterable, return_list)
+        self._loader = DataLoader.from_generator(feed_list, capacity,
+                                                 use_double_buffer, iterable,
+                                                 return_list)
 
     @property
     def queue(self):
@@ -1972,9 +1984,10 @@ def generator():
 
 
 class DatasetLoader(DataLoaderBase):
+
     def __init__(self, dataset, places, drop_last):
-        assert isinstance(dataset, paddle.distributed.fleet.dataset.
-                          DatasetBase), "dataset must be type of DatasetBase"
+        assert isinstance(dataset, paddle.distributed.fleet.dataset.DatasetBase
+                          ), "dataset must be type of DatasetBase"
         assert not _non_static_mode(
         ), "DatasetLoader is not supported in dygraph mode yet"
         if isinstance(places, (list, tuple)):
@@ -1988,15 +2001,17 @@ def __init__(self, dataset, places, drop_last):
             "Filelist number of dataset {} must be not less than place number {}".format(len(dataset.filelist), thread_num)
 
         if dataset.thread_num != 0 and dataset.thread_num != thread_num:
-            logging.warn('thread_num {} which is set in Dataset is ignored'.
-                         format(dataset.thread_num))
+            logging.warn(
+                'thread_num {} which is set in Dataset is ignored'.format(
+                    dataset.thread_num))
 
         dataset._set_thread(thread_num)
 
-        if isinstance(dataset, paddle.distributed.fleet.dataset.
-                      InMemoryDataset) and dataset.queue_num > thread_num:
-            logging.warn("queue_num {} which is set in Dataset is ignored".
-                         format(dataset.queue_num))
+        if isinstance(dataset, paddle.distributed.fleet.dataset.InMemoryDataset
+                      ) and dataset.queue_num > thread_num:
+            logging.warn(
+                "queue_num {} which is set in Dataset is ignored".format(
+                    dataset.queue_num))
             dataset._set_queue_num(thread_num)
 
         self._dataset = dataset
@@ -2006,8 +2021,8 @@ def __init__(self, dataset, places, drop_last):
         ]
 
         self._iterable_dataset = core.IterableDatasetWrapper(
-            dataset.dataset, use_slots,
-            _convert_places(places), dataset.proto_desc.batch_size, drop_last)
+            dataset.dataset, use_slots, _convert_places(places),
+            dataset.proto_desc.batch_size, drop_last)
 
     def __iter__(self):
         self._dataset._finish_to_run()
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index ed28a2813e225..da0b91cc5c962 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -135,20 +135,21 @@ def __call__(self, param, grad, block):
 
         if framework._non_static_mode():
             if framework.in_dygraph_mode():
-                return _C_ops.final_state_scale(
-                    param, self._regularization_coeff, 0.0, True)
+                return _C_ops.final_state_scale(param,
+                                                self._regularization_coeff, 0.0,
+                                                True)
             else:
                 return _C_ops.scale(param, "scale", self._regularization_coeff)
         else:
-            decay = block.create_var(
-                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)
+            decay = block.create_var(dtype=param.dtype,
+                                     shape=param.shape,
+                                     lod_level=param.lod_level)
 
             # Append Op to calculate decay
-            block.append_op(
-                type='scale',
-                inputs={"X": param},
-                outputs={"Out": decay},
-                attrs={"scale": self._regularization_coeff})
+            block.append_op(type='scale',
+                            inputs={"X": param},
+                            outputs={"Out": decay},
+                            attrs={"scale": self._regularization_coeff})
 
             return decay
 
@@ -245,20 +246,21 @@ def __call__(self, param, grad, block):
             sign = block.create_var(dtype=param.dtype, shape=param.shape)
             decay = block.create_var(dtype=param.dtype, shape=param.shape)
         else:
-            sign = block.create_var(
-                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)
-            decay = block.create_var(
-                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level)
+            sign = block.create_var(dtype=param.dtype,
+                                    shape=param.shape,
+                                    lod_level=param.lod_level)
+            decay = block.create_var(dtype=param.dtype,
+                                     shape=param.shape,
+                                     lod_level=param.lod_level)
 
         # Append sign op
         block.append_op(type='sign', inputs={"X": param}, outputs={"Out": sign})
 
         # Append scale op to the output of sign op
-        block.append_op(
-            type='scale',
-            inputs={"X": sign},
-            outputs={"Out": decay},
-            attrs={"scale": self._regularization_coeff})
+        block.append_op(type='scale',
+                        inputs={"X": sign},
+                        outputs={"Out": decay},
+                        attrs={"scale": self._regularization_coeff})
 
         return decay
 
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index 587d4aee34ca2..6acee6dc11c89 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -1,7 +1,9 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-
 foreach(src ${TEST_OPS})
   py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt
index 09c650f16e2fb..9e807a79353bb 100644
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/CMakeLists.txt
@@ -1,10 +1,13 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 # default test
 foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-    set_tests_properties(${src} PROPERTIES FIXTURES_SETUP ${src}_infer_model)
+  py_test(${src} SRCS ${src}.py)
+  set_tests_properties(${src} PROPERTIES FIXTURES_SETUP ${src}_infer_model)
 endforeach()
 set_tests_properties(test_word2vec_book PROPERTIES TIMEOUT 120)
 set_tests_properties(test_recognize_digits PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index 9ce90a2bd71f8..d96e640f77a96 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -25,22 +25,25 @@
 import os
 
 
-def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+def convolution_net(data,
+                    label,
+                    input_dim,
+                    class_dim=2,
+                    emb_dim=32,
                     hid_dim=32):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
+    emb = fluid.layers.embedding(input=data,
+                                 size=[input_dim, emb_dim],
+                                 is_sparse=True)
+    conv_3 = fluid.nets.sequence_conv_pool(input=emb,
+                                           num_filters=hid_dim,
+                                           filter_size=3,
+                                           act="tanh",
+                                           pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(input=emb,
+                                           num_filters=hid_dim,
+                                           filter_size=4,
+                                           act="tanh",
+                                           pool_type="sqrt")
     prediction = fluid.layers.fc(input=[conv_3, conv_4],
                                  size=class_dim,
                                  act="softmax")
@@ -50,10 +53,15 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
     return avg_cost, accuracy, prediction
 
 
-def dyn_rnn_lstm(data, label, input_dim, class_dim=2, emb_dim=32,
+def dyn_rnn_lstm(data,
+                 label,
+                 input_dim,
+                 class_dim=2,
+                 emb_dim=32,
                  lstm_size=128):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    emb = fluid.layers.embedding(input=data,
+                                 size=[input_dim, emb_dim],
+                                 is_sparse=True)
     sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
 
     rnn = fluid.layers.DynamicRNN()
@@ -67,14 +75,14 @@ def gate_common(ipt, hidden, size):
             gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
             return gate0 + gate1
 
-        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                        lstm_size))
-        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                       lstm_size))
+        forget_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        input_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        output_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        cell_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
 
         cell = forget_gate * prev_cell + input_gate * cell_gate
         hidden = output_gate * fluid.layers.tanh(x=cell)
@@ -99,8 +107,9 @@ def stacked_lstm_net(data,
                      stacked_num=3):
     assert stacked_num % 2 == 1
 
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    emb = fluid.layers.embedding(input=data,
+                                 size=[input_dim, emb_dim],
+                                 is_sparse=True)
     # add bias attr
 
     # TODO(qijun) linear act
@@ -111,8 +120,9 @@ def stacked_lstm_net(data,
 
     for i in range(2, stacked_num + 1):
         fc = fluid.layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = fluid.layers.dynamic_lstm(
-            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        lstm, cell = fluid.layers.dynamic_lstm(input=fc,
+                                               size=hid_dim,
+                                               is_reverse=(i % 2) == 0)
         inputs = [fc, lstm]
 
     fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
@@ -138,23 +148,26 @@ def train(word_dict,
     dict_dim = len(word_dict)
     class_dim = 2
 
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
+    data = fluid.layers.data(name="words",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
     if not parallel:
-        cost, acc_out, prediction = net_method(
-            data, label, input_dim=dict_dim, class_dim=class_dim)
+        cost, acc_out, prediction = net_method(data,
+                                               label,
+                                               input_dim=dict_dim,
+                                               class_dim=class_dim)
     else:
         raise NotImplementedError()
 
     adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
     adagrad.minimize(cost)
 
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
+    train_data = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.imdb.train(word_dict), buf_size=1000),
+                              batch_size=BATCH_SIZE)
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
@@ -234,12 +247,11 @@ def infer(word_dict, use_cuda, save_dirname=None):
         recursive_seq_lens = [[3, 4, 2]]
         base_shape = [1]
         # The range of random integers is [low, high]
-        tensor_words = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
+        tensor_words = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                         base_shape,
+                                                         place,
+                                                         low=0,
+                                                         high=word_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
@@ -258,16 +270,16 @@ def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
-    train(
-        word_dict,
-        net_method,
-        use_cuda,
-        parallel=parallel,
-        save_dirname=save_dirname)
+    train(word_dict,
+          net_method,
+          use_cuda,
+          parallel=parallel,
+          save_dirname=save_dirname)
     infer(word_dict, use_cuda, save_dirname)
 
 
 class TestUnderstandSentiment(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         cls.word_dict = paddle.dataset.imdb.word_dict()
@@ -283,19 +295,17 @@ def new_program_scope(self):
 
     def test_conv_cpu(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=False,
-                save_dirname="understand_sentiment_conv.inference.model")
+            main(self.word_dict,
+                 net_method=convolution_net,
+                 use_cuda=False,
+                 save_dirname="understand_sentiment_conv.inference.model")
 
     def test_conv_cpu_parallel(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=False,
-                parallel=True)
+            main(self.word_dict,
+                 net_method=convolution_net,
+                 use_cuda=False,
+                 parallel=True)
 
     @unittest.skip(reason="make CI faster")
     def test_stacked_lstm_cpu(self):
@@ -304,31 +314,29 @@ def test_stacked_lstm_cpu(self):
                 self.word_dict,
                 net_method=stacked_lstm_net,
                 use_cuda=False,
-                save_dirname="understand_sentiment_stacked_lstm.inference.model")
+                save_dirname="understand_sentiment_stacked_lstm.inference.model"
+            )
 
     def test_stacked_lstm_cpu_parallel(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=False,
-                parallel=True)
+            main(self.word_dict,
+                 net_method=stacked_lstm_net,
+                 use_cuda=False,
+                 parallel=True)
 
     def test_conv_gpu(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=True,
-                save_dirname="understand_sentiment_conv.inference.model")
+            main(self.word_dict,
+                 net_method=convolution_net,
+                 use_cuda=True,
+                 save_dirname="understand_sentiment_conv.inference.model")
 
     def test_conv_gpu_parallel(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=True,
-                parallel=True)
+            main(self.word_dict,
+                 net_method=convolution_net,
+                 use_cuda=True,
+                 parallel=True)
 
     @unittest.skip(reason="make CI faster")
     def test_stacked_lstm_gpu(self):
@@ -337,32 +345,30 @@ def test_stacked_lstm_gpu(self):
                 self.word_dict,
                 net_method=stacked_lstm_net,
                 use_cuda=True,
-                save_dirname="understand_sentiment_stacked_lstm.inference.model")
+                save_dirname="understand_sentiment_stacked_lstm.inference.model"
+            )
 
     def test_stacked_lstm_gpu_parallel(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=True,
-                parallel=True)
+            main(self.word_dict,
+                 net_method=stacked_lstm_net,
+                 use_cuda=True,
+                 parallel=True)
 
     @unittest.skip(reason='make CI faster')
     def test_dynrnn_lstm_gpu(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=dyn_rnn_lstm,
-                use_cuda=True,
-                parallel=False)
+            main(self.word_dict,
+                 net_method=dyn_rnn_lstm,
+                 use_cuda=True,
+                 parallel=False)
 
     def test_dynrnn_lstm_gpu_parallel(self):
         with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=dyn_rnn_lstm,
-                use_cuda=True,
-                parallel=True)
+            main(self.word_dict,
+                 net_method=dyn_rnn_lstm,
+                 use_cuda=True,
+                 parallel=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 4324e582fc991..668373838c0b0 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -75,15 +75,14 @@ def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16):
             amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(),
             use_bf16_guard=False,
             use_pure_bf16=pure_bf16)
-    sgd_optimizer.minimize(
-        avg_cost, startup_program=fluid.default_startup_program())
+    sgd_optimizer.minimize(avg_cost,
+                           startup_program=fluid.default_startup_program())
 
     BATCH_SIZE = 20
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.uci_housing.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
+    train_reader = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+                                batch_size=BATCH_SIZE)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
@@ -93,8 +92,9 @@ def train_loop(main_program):
         exe.run(fluid.default_startup_program())
         test_prog = main_program.clone(for_test=True)
         if pure_bf16:
-            sgd_optimizer.amp_init(
-                exe.place, test_program=test_prog, use_bf16_test=True)
+            sgd_optimizer.amp_init(exe.place,
+                                   test_program=test_prog,
+                                   use_bf16_test=True)
 
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
@@ -106,10 +106,10 @@ def train_loop(main_program):
                     avg_loss_value = convert_uint16_to_float(avg_loss_value)
                 if avg_loss_value[0] < 10.0:
                     if save_dirname is not None:
-                        paddle.static.save_inference_model(
-                            save_dirname, [x], [y_predict],
-                            exe,
-                            clip_extra=False)
+                        paddle.static.save_inference_model(save_dirname, [x],
+                                                           [y_predict],
+                                                           exe,
+                                                           clip_extra=False)
                     return
                 if math.isnan(float(avg_loss_value)):
                     sys.exit("got NaN loss, training failed.")
@@ -161,18 +161,18 @@ def infer(use_cuda, save_dirname=None, use_bf16=False):
         # The input data should be >= 0
         batch_size = 10
 
-        test_reader = paddle.batch(
-            paddle.dataset.uci_housing.test(), batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.uci_housing.test(),
+                                   batch_size=batch_size)
 
         test_data = next(test_reader())
-        test_feat = numpy.array(
-            [data[0] for data in test_data]).astype("float32")
+        test_feat = numpy.array([data[0]
+                                 for data in test_data]).astype("float32")
 
         if use_bf16:
             test_feat = convert_float_to_uint16(test_feat)
 
-        test_label = numpy.array(
-            [data[1] for data in test_data]).astype("float32")
+        test_label = numpy.array([data[1]
+                                  for data in test_data]).astype("float32")
 
         assert feed_target_names[0] == 'x'
         results = exe.run(inference_program,
@@ -200,6 +200,7 @@ def main(use_cuda, is_local=True, use_bf16=False, pure_bf16=False):
 
 
 class TestFitALineBase(unittest.TestCase):
+
     @contextlib.contextmanager
     def program_scope_guard(self):
         prog = fluid.Program()
@@ -211,6 +212,7 @@ def program_scope_guard(self):
 
 
 class TestFitALine(TestFitALineBase):
+
     def test_cpu(self):
         with self.program_scope_guard():
             main(use_cuda=False)
@@ -223,6 +225,7 @@ def test_cuda(self):
 @unittest.skipIf(not fluid.core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestFitALineBF16(TestFitALineBase):
+
     def test_bf16(self):
         with self.program_scope_guard():
             main(use_cuda=False, use_bf16=True)
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 7c2d5c693a9fd..7096a16d89faf 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -28,6 +28,7 @@
 
 
 def resnet_cifar10(input, depth=32):
+
     def conv_bn_layer(input,
                       ch_out,
                       filter_size,
@@ -35,14 +36,13 @@ def conv_bn_layer(input,
                       padding,
                       act='relu',
                       bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
+        tmp = fluid.layers.conv2d(input=input,
+                                  filter_size=filter_size,
+                                  num_filters=ch_out,
+                                  stride=stride,
+                                  padding=padding,
+                                  act=None,
+                                  bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
     def shortcut(input, ch_in, ch_out, stride):
@@ -65,28 +65,33 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride):
 
     assert (depth - 2) % 6 == 0
     n = (depth - 2) // 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    conv1 = conv_bn_layer(input=input,
+                          ch_out=16,
+                          filter_size=3,
+                          stride=1,
+                          padding=1)
     res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
     res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
     res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    pool = fluid.layers.pool2d(input=res3,
+                               pool_size=8,
+                               pool_type='avg',
+                               pool_stride=1)
     return pool
 
 
 def vgg16_bn_drop(input):
+
     def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
+        return fluid.nets.img_conv_group(input=input,
+                                         pool_size=2,
+                                         pool_stride=2,
+                                         conv_num_filter=[num_filter] * groups,
+                                         conv_filter_size=3,
+                                         conv_act='relu',
+                                         conv_with_batchnorm=True,
+                                         conv_batchnorm_drop_rate=dropouts,
+                                         pool_type='max')
 
     conv1 = conv_block(input, 64, 2, [0.3, 0])
     conv2 = conv_block(conv1, 128, 2, [0.4, 0])
@@ -132,13 +137,12 @@ def train(net_type, use_cuda, save_dirname, is_local):
     BATCH_SIZE = 128
     PASS_NUM = 1
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10(), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+    train_reader = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.cifar.train10(), buf_size=128 * 10),
+                                batch_size=BATCH_SIZE)
 
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(paddle.dataset.cifar.test10(),
+                               batch_size=BATCH_SIZE)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
@@ -168,9 +172,9 @@ def train_loop(main_program):
                     avg_loss_value = numpy.array(avg_loss_list).mean()
 
                     print(
-                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
-                        format(pass_id, batch_id + 1,
-                               float(avg_loss_value), float(acc_value)))
+                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'
+                        .format(pass_id, batch_id + 1, float(avg_loss_value),
+                                float(acc_value)))
 
                     if acc_value > 0.01:  # Low threshold for speeding up CI
                         fluid.io.save_inference_model(save_dirname, ["pixel"],
@@ -247,6 +251,7 @@ def main(net_type, use_cuda, is_local=True):
 
 
 class TestImageClassification(unittest.TestCase):
+
     def test_vgg_cuda(self):
         with self.scope_prog_guard():
             main('vgg', use_cuda=True)
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index 568d7518a1e0b..eee1d7959eef7 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -55,26 +55,24 @@ def load_parameter(file_name, h, w):
 def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
             **ignored):
     # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        size=[pred_dict_len, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE,
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
+    predicate_embedding = fluid.layers.embedding(input=predicate,
+                                                 size=[pred_dict_len, word_dim],
+                                                 dtype='float32',
+                                                 is_sparse=IS_SPARSE,
+                                                 param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(input=mark,
+                                            size=[mark_dict_len, mark_dim],
+                                            dtype='float32',
+                                            is_sparse=IS_SPARSE)
 
     word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
     emb_layers = [
-        fluid.layers.embedding(
-            size=[word_dict_len, word_dim],
-            input=x,
-            param_attr=fluid.ParamAttr(
-                name=embedding_name, trainable=False)) for x in word_input
+        fluid.layers.embedding(size=[word_dict_len, word_dim],
+                               input=x,
+                               param_attr=fluid.ParamAttr(name=embedding_name,
+                                                          trainable=False))
+        for x in word_input
     ]
     emb_layers.append(predicate_embedding)
     emb_layers.append(mark_embedding)
@@ -85,12 +83,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
     hidden_0 = fluid.layers.sums(input=hidden_0_layers)
 
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=hidden_dim,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
+    lstm_0 = fluid.layers.dynamic_lstm(input=hidden_0,
+                                       size=hidden_dim,
+                                       candidate_activation='relu',
+                                       gate_activation='sigmoid',
+                                       cell_activation='sigmoid')
 
     # stack L-LSTM and R-LSTM with direct edges
     input_tmp = [hidden_0, lstm_0]
@@ -101,13 +98,12 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
             fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
         ])
 
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=hidden_dim,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
+        lstm = fluid.layers.dynamic_lstm(input=mix_hidden,
+                                         size=hidden_dim,
+                                         candidate_activation='relu',
+                                         gate_activation='sigmoid',
+                                         cell_activation='sigmoid',
+                                         is_reverse=((i % 2) == 1))
 
         input_tmp = [mix_hidden, lstm]
 
@@ -121,40 +117,57 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
 def train(use_cuda, save_dirname=None, is_local=True):
     # define network topology
-    word = fluid.layers.data(
-        name='word_data', shape=[1], dtype='int64', lod_level=1)
-    predicate = fluid.layers.data(
-        name='verb_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n2 = fluid.layers.data(
-        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_n1 = fluid.layers.data(
-        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_0 = fluid.layers.data(
-        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p1 = fluid.layers.data(
-        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-    ctx_p2 = fluid.layers.data(
-        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-    mark = fluid.layers.data(
-        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+    word = fluid.layers.data(name='word_data',
+                             shape=[1],
+                             dtype='int64',
+                             lod_level=1)
+    predicate = fluid.layers.data(name='verb_data',
+                                  shape=[1],
+                                  dtype='int64',
+                                  lod_level=1)
+    ctx_n2 = fluid.layers.data(name='ctx_n2_data',
+                               shape=[1],
+                               dtype='int64',
+                               lod_level=1)
+    ctx_n1 = fluid.layers.data(name='ctx_n1_data',
+                               shape=[1],
+                               dtype='int64',
+                               lod_level=1)
+    ctx_0 = fluid.layers.data(name='ctx_0_data',
+                              shape=[1],
+                              dtype='int64',
+                              lod_level=1)
+    ctx_p1 = fluid.layers.data(name='ctx_p1_data',
+                               shape=[1],
+                               dtype='int64',
+                               lod_level=1)
+    ctx_p2 = fluid.layers.data(name='ctx_p2_data',
+                               shape=[1],
+                               dtype='int64',
+                               lod_level=1)
+    mark = fluid.layers.data(name='mark_data',
+                             shape=[1],
+                             dtype='int64',
+                             lod_level=1)
     feature_out = db_lstm(**locals())
-    target = fluid.layers.data(
-        name='target', shape=[1], dtype='int64', lod_level=1)
-    crf_cost = fluid.layers.linear_chain_crf(
-        input=feature_out,
-        label=target,
-        param_attr=fluid.ParamAttr(
-            name='crfw', learning_rate=mix_hidden_lr))
+    target = fluid.layers.data(name='target',
+                               shape=[1],
+                               dtype='int64',
+                               lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(input=feature_out,
+                                             label=target,
+                                             param_attr=fluid.ParamAttr(
+                                                 name='crfw',
+                                                 learning_rate=mix_hidden_lr))
     avg_cost = fluid.layers.mean(crf_cost)
 
     # TODO(qiao)
     # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(
-        learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.01,
-            decay_steps=100000,
-            decay_rate=0.5,
-            staircase=True))
+        learning_rate=fluid.layers.exponential_decay(learning_rate=0.01,
+                                                     decay_steps=100000,
+                                                     decay_rate=0.5,
+                                                     staircase=True))
     sgd_optimizer.minimize(avg_cost)
 
     # TODO(qiao)
@@ -162,17 +175,15 @@ def train(use_cuda, save_dirname=None, is_local=True):
     crf_decode = fluid.layers.crf_decoding(
         input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
 
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.conll05.test(), buf_size=8192),
-        batch_size=BATCH_SIZE)
+    train_data = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.conll05.test(), buf_size=8192),
+                              batch_size=BATCH_SIZE)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    feeder = fluid.DataFeeder(
-        feed_list=[
-            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
-        ],
-        place=place)
+    feeder = fluid.DataFeeder(feed_list=[
+        word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+    ],
+                              place=place)
     exe = fluid.Executor(place)
 
     def train_loop(main_program):
@@ -195,17 +206,18 @@ def train_loop(main_program):
                 if batch_id % 10 == 0:
                     print("avg_cost:" + str(cost))
                     if batch_id != 0:
-                        print("second per batch: " + str((time.time(
-                        ) - start_time) / batch_id))
+                        print("second per batch: " +
+                              str((time.time() - start_time) / batch_id))
                     # Set the threshold low to speed up the CI test
                     if float(cost) < 80.0:
                         if save_dirname is not None:
                             # TODO(liuyiqun): Change the target to crf_decode
-                            fluid.io.save_inference_model(save_dirname, [
-                                'word_data', 'verb_data', 'ctx_n2_data',
-                                'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
-                                'ctx_p2_data', 'mark_data'
-                            ], [feature_out], exe)
+                            fluid.io.save_inference_model(
+                                save_dirname, [
+                                    'word_data', 'verb_data', 'ctx_n2_data',
+                                    'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
+                                    'ctx_p2_data', 'mark_data'
+                                ], [feature_out], exe)
                         return
 
                 batch_id = batch_id + 1
@@ -268,54 +280,46 @@ def infer(use_cuda, save_dirname=None):
         recursive_seq_lens = [[3, 4, 2]]
         base_shape = [1]
         # The range of random integers is [low, high]
-        word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        pred = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=pred_dict_len - 1)
-        ctx_n2 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_n1 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_0 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_p1 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        ctx_p2 = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=word_dict_len - 1)
-        mark = fluid.create_random_int_lodtensor(
-            recursive_seq_lens,
-            base_shape,
-            place,
-            low=0,
-            high=mark_dict_len - 1)
+        word = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                 base_shape,
+                                                 place,
+                                                 low=0,
+                                                 high=word_dict_len - 1)
+        pred = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                 base_shape,
+                                                 place,
+                                                 low=0,
+                                                 high=pred_dict_len - 1)
+        ctx_n2 = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                   base_shape,
+                                                   place,
+                                                   low=0,
+                                                   high=word_dict_len - 1)
+        ctx_n1 = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                   base_shape,
+                                                   place,
+                                                   low=0,
+                                                   high=word_dict_len - 1)
+        ctx_0 = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                  base_shape,
+                                                  place,
+                                                  low=0,
+                                                  high=word_dict_len - 1)
+        ctx_p1 = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                   base_shape,
+                                                   place,
+                                                   low=0,
+                                                   high=word_dict_len - 1)
+        ctx_p2 = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                   base_shape,
+                                                   place,
+                                                   low=0,
+                                                   high=word_dict_len - 1)
+        mark = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                 base_shape,
+                                                 place,
+                                                 low=0,
+                                                 high=mark_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
@@ -358,6 +362,7 @@ def main(use_cuda, is_local=True):
 
 
 class TestLabelSemanticRoles(unittest.TestCase):
+
     def test_cuda(self):
         with self.scope_prog_guard():
             main(use_cuda=True)
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index a0056ba3bab06..f0595d52f7f2e 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -41,14 +41,15 @@
 
 def encoder(is_sparse):
     # encoder
-    src_word_id = pd.data(
-        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = pd.embedding(
-        input=src_word_id,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr=fluid.ParamAttr(name='vemb'))
+    src_word_id = pd.data(name="src_word_id",
+                          shape=[1],
+                          dtype='int64',
+                          lod_level=1)
+    src_embedding = pd.embedding(input=src_word_id,
+                                 size=[dict_size, word_dim],
+                                 dtype='float32',
+                                 is_sparse=is_sparse,
+                                 param_attr=fluid.ParamAttr(name='vemb'))
 
     fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
     lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
@@ -58,14 +59,15 @@ def encoder(is_sparse):
 
 def decoder_train(context, is_sparse):
     # decoder
-    trg_language_word = pd.data(
-        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = pd.embedding(
-        input=trg_language_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=is_sparse,
-        param_attr=fluid.ParamAttr(name='vemb'))
+    trg_language_word = pd.data(name="target_language_word",
+                                shape=[1],
+                                dtype='int64',
+                                lod_level=1)
+    trg_embedding = pd.embedding(input=trg_language_word,
+                                 size=[dict_size, word_dim],
+                                 dtype='float32',
+                                 is_sparse=is_sparse,
+                                 param_attr=fluid.ParamAttr(name='vemb'))
 
     rnn = pd.DynamicRNN()
     with rnn.block():
@@ -98,8 +100,10 @@ def decoder_decode(context, is_sparse):
     scores_array = pd.create_array('float32')
 
     init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
-    init_scores = pd.data(
-        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+    init_scores = pd.data(name="init_scores",
+                          shape=[1],
+                          dtype="float32",
+                          lod_level=2)
 
     pd.array_write(init_ids, array=ids_array, i=counter)
     pd.array_write(init_scores, array=scores_array, i=counter)
@@ -115,11 +119,10 @@ def decoder_decode(context, is_sparse):
         # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
         pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
 
-        pre_ids_emb = pd.embedding(
-            input=pre_ids,
-            size=[dict_size, word_dim],
-            dtype='float32',
-            is_sparse=is_sparse)
+        pre_ids_emb = pd.embedding(input=pre_ids,
+                                   size=[dict_size, word_dim],
+                                   dtype='float32',
+                                   is_sparse=is_sparse)
 
         # use rnn unit to update rnn
         current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
@@ -132,17 +135,16 @@ def decoder_decode(context, is_sparse):
                               act='softmax')
         topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
         # calculate accumulated scores after topk to reduce computation cost
-        accu_scores = pd.elementwise_add(
-            x=pd.log(topk_scores), y=pd.reshape(
-                pre_score, shape=[-1]), axis=0)
-        selected_ids, selected_scores = pd.beam_search(
-            pre_ids,
-            pre_score,
-            topk_indices,
-            accu_scores,
-            beam_size,
-            end_id=10,
-            level=0)
+        accu_scores = pd.elementwise_add(x=pd.log(topk_scores),
+                                         y=pd.reshape(pre_score, shape=[-1]),
+                                         axis=0)
+        selected_ids, selected_scores = pd.beam_search(pre_ids,
+                                                       pre_score,
+                                                       topk_indices,
+                                                       accu_scores,
+                                                       beam_size,
+                                                       end_id=10,
+                                                       level=0)
 
         pd.increment(x=counter, value=1, in_place=True)
 
@@ -172,8 +174,10 @@ def train_main(use_cuda, is_sparse, is_local=True):
 
     context = encoder(is_sparse)
     rnn_out = decoder_train(context, is_sparse)
-    label = pd.data(
-        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    label = pd.data(name="target_language_next_word",
+                    shape=[1],
+                    dtype='int64',
+                    lod_level=1)
     cost = pd.cross_entropy(input=rnn_out, label=label)
     avg_cost = pd.mean(cost)
 
@@ -183,10 +187,9 @@ def train_main(use_cuda, is_sparse, is_local=True):
             regularization_coeff=0.1))
     optimizer.minimize(avg_cost)
 
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
+    train_data = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+                              batch_size=batch_size)
 
     feed_order = [
         'src_word_id', 'target_language_word', 'target_language_next_word'
@@ -252,8 +255,8 @@ def decode_main(use_cuda, is_sparse):
     exe.run(framework.default_startup_program())
 
     init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
-    init_scores_data = np.array(
-        [1. for _ in range(batch_size)], dtype='float32')
+    init_scores_data = np.array([1. for _ in range(batch_size)],
+                                dtype='float32')
     init_ids_data = init_ids_data.reshape((batch_size, 1))
     init_scores_data = init_scores_data.reshape((batch_size, 1))
     init_recursive_seq_lens = [1] * batch_size
@@ -264,10 +267,9 @@ def decode_main(use_cuda, is_sparse):
     init_scores = fluid.create_lod_tensor(init_scores_data,
                                           init_recursive_seq_lens, place)
 
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
+    train_data = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+                              batch_size=batch_size)
 
     feed_order = ['src_word_id']
     feed_list = [
@@ -305,8 +307,8 @@ def scope_prog_guard():
 
 
 def inject_test_train(use_cuda, is_sparse):
-    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
-                                         if is_sparse else 'dense')
+    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu',
+                                         'sparse' if is_sparse else 'dense')
 
     def f(*args):
         with scope_prog_guard():
@@ -316,9 +318,8 @@ def f(*args):
 
 
 def inject_test_decode(use_cuda, is_sparse, decorator=None):
-    f_name = 'test_{0}_{1}_decode'.format('cuda'
-                                          if use_cuda else 'cpu', 'sparse'
-                                          if is_sparse else 'dense')
+    f_name = 'test_{0}_{1}_decode'.format('cuda' if use_cuda else 'cpu',
+                                          'sparse' if is_sparse else 'dense')
 
     def f(*args):
         with scope_prog_guard():
@@ -342,8 +343,9 @@ def f(*args):
             _decorator_ = unittest.skip(
                 reason='Beam Search does not support CUDA!')
 
-        inject_test_decode(
-            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
+        inject_test_decode(is_sparse=_is_sparse_,
+                           use_cuda=_use_cuda_,
+                           decorator=_decorator_)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 71c57b851600d..5301f9aa7607c 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -46,21 +46,19 @@ def mlp(img, label):
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     return loss_net(conv_pool_2, label)
 
 
@@ -96,12 +94,11 @@ def train(nn_type,
 
     exe = fluid.Executor(place)
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    train_reader = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+                                batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                               batch_size=BATCH_SIZE)
     feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
 
     def train_loop(main_program):
@@ -143,9 +140,9 @@ def train_loop(main_program):
                         return
                     else:
                         print(
-                            'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
-                            format(pass_id, batch_id + 1,
-                                   float(avg_loss_val), float(acc_val)))
+                            'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'
+                            .format(pass_id, batch_id + 1, float(avg_loss_val),
+                                    float(acc_val)))
                         if math.isnan(float(avg_loss_val)):
                             sys.exit("got NaN loss, training failed.")
         raise AssertionError("Loss of recognize digits is too large")
@@ -192,8 +189,9 @@ def infer(use_cuda,
         # data using feed operators), and the fetch_targets (variables that
         # we want to obtain data from using fetch operators).
         [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             save_dirname, exe, model_filename, params_filename)
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe,
+                                                        model_filename,
+                                                        params_filename)
 
         # The input's dimension of conv should be 4-D or 5-D.
         # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
@@ -222,19 +220,17 @@ def main(use_cuda, parallel, nn_type, combine):
             params_filename = "__params_combined__"
 
     # call train() with is_local argument to run distributed train
-    train(
-        nn_type=nn_type,
-        use_cuda=use_cuda,
-        parallel=parallel,
-        save_dirname=save_dirname,
-        save_full_dirname=save_full_dirname,
-        model_filename=model_filename,
-        params_filename=params_filename)
-    infer(
-        use_cuda=use_cuda,
-        save_dirname=save_dirname,
-        model_filename=model_filename,
-        params_filename=params_filename)
+    train(nn_type=nn_type,
+          use_cuda=use_cuda,
+          parallel=parallel,
+          save_dirname=save_dirname,
+          save_full_dirname=save_full_dirname,
+          model_filename=model_filename,
+          params_filename=params_filename)
+    infer(use_cuda=use_cuda,
+          save_dirname=save_dirname,
+          model_filename=model_filename,
+          params_filename=params_filename)
 
 
 class TestRecognizeDigits(unittest.TestCase):
@@ -242,6 +238,7 @@ class TestRecognizeDigits(unittest.TestCase):
 
 
 def inject_test_method(use_cuda, parallel, nn_type, combine):
+
     def __impl__(self):
         prog = fluid.Program()
         startup_prog = fluid.Program()
@@ -250,10 +247,9 @@ def __impl__(self):
             with fluid.program_guard(prog, startup_prog):
                 main(use_cuda, parallel, nn_type, combine)
 
-    fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda'
-                                       if use_cuda else 'cpu', 'parallel'
-                                       if parallel else 'normal', 'combine'
-                                       if combine else 'separate')
+    fn = 'test_{0}_{1}_{2}_{3}'.format(nn_type, 'cuda' if use_cuda else 'cpu',
+                                       'parallel' if parallel else 'normal',
+                                       'combine' if combine else 'separate')
 
     setattr(TestRecognizeDigits, fn, __impl__)
 
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index c2ab249f5713d..8a4b4c2683747 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -41,12 +41,11 @@ def get_usr_combined_features():
 
     uid = layers.data(name='user_id', shape=[1], dtype='int64')
 
-    usr_emb = layers.embedding(
-        input=uid,
-        dtype='float32',
-        size=[USR_DICT_SIZE, 32],
-        param_attr='user_table',
-        is_sparse=IS_SPARSE)
+    usr_emb = layers.embedding(input=uid,
+                               dtype='float32',
+                               size=[USR_DICT_SIZE, 32],
+                               param_attr='user_table',
+                               is_sparse=IS_SPARSE)
 
     usr_fc = layers.fc(input=usr_emb, size=32)
 
@@ -54,33 +53,30 @@ def get_usr_combined_features():
 
     usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
 
-    usr_gender_emb = layers.embedding(
-        input=usr_gender_id,
-        size=[USR_GENDER_DICT_SIZE, 16],
-        param_attr='gender_table',
-        is_sparse=IS_SPARSE)
+    usr_gender_emb = layers.embedding(input=usr_gender_id,
+                                      size=[USR_GENDER_DICT_SIZE, 16],
+                                      param_attr='gender_table',
+                                      is_sparse=IS_SPARSE)
 
     usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
 
     USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
     usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
 
-    usr_age_emb = layers.embedding(
-        input=usr_age_id,
-        size=[USR_AGE_DICT_SIZE, 16],
-        is_sparse=IS_SPARSE,
-        param_attr='age_table')
+    usr_age_emb = layers.embedding(input=usr_age_id,
+                                   size=[USR_AGE_DICT_SIZE, 16],
+                                   is_sparse=IS_SPARSE,
+                                   param_attr='age_table')
 
     usr_age_fc = layers.fc(input=usr_age_emb, size=16)
 
     USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
     usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
 
-    usr_job_emb = layers.embedding(
-        input=usr_job_id,
-        size=[USR_JOB_DICT_SIZE, 16],
-        param_attr='job_table',
-        is_sparse=IS_SPARSE)
+    usr_job_emb = layers.embedding(input=usr_job_id,
+                                   size=[USR_JOB_DICT_SIZE, 16],
+                                   param_attr='job_table',
+                                   is_sparse=IS_SPARSE)
 
     usr_job_fc = layers.fc(input=usr_job_emb, size=16)
 
@@ -98,40 +94,44 @@ def get_mov_combined_features():
 
     mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
 
-    mov_emb = layers.embedding(
-        input=mov_id,
-        dtype='float32',
-        size=[MOV_DICT_SIZE, 32],
-        param_attr='movie_table',
-        is_sparse=IS_SPARSE)
+    mov_emb = layers.embedding(input=mov_id,
+                               dtype='float32',
+                               size=[MOV_DICT_SIZE, 32],
+                               param_attr='movie_table',
+                               is_sparse=IS_SPARSE)
 
     mov_fc = layers.fc(input=mov_emb, size=32)
 
     CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
 
-    category_id = layers.data(
-        name='category_id', shape=[1], dtype='int64', lod_level=1)
+    category_id = layers.data(name='category_id',
+                              shape=[1],
+                              dtype='int64',
+                              lod_level=1)
 
-    mov_categories_emb = layers.embedding(
-        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+    mov_categories_emb = layers.embedding(input=category_id,
+                                          size=[CATEGORY_DICT_SIZE, 32],
+                                          is_sparse=IS_SPARSE)
 
-    mov_categories_hidden = layers.sequence_pool(
-        input=mov_categories_emb, pool_type="sum")
+    mov_categories_hidden = layers.sequence_pool(input=mov_categories_emb,
+                                                 pool_type="sum")
 
     MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
 
-    mov_title_id = layers.data(
-        name='movie_title', shape=[1], dtype='int64', lod_level=1)
+    mov_title_id = layers.data(name='movie_title',
+                               shape=[1],
+                               dtype='int64',
+                               lod_level=1)
 
-    mov_title_emb = layers.embedding(
-        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+    mov_title_emb = layers.embedding(input=mov_title_id,
+                                     size=[MOV_TITLE_DICT_SIZE, 32],
+                                     is_sparse=IS_SPARSE)
 
-    mov_title_conv = nets.sequence_conv_pool(
-        input=mov_title_emb,
-        num_filters=32,
-        filter_size=3,
-        act="tanh",
-        pool_type="sum")
+    mov_title_conv = nets.sequence_conv_pool(input=mov_title_emb,
+                                             num_filters=32,
+                                             filter_size=3,
+                                             act="tanh",
+                                             pool_type="sum")
 
     concat_embed = layers.concat(
         input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
@@ -170,12 +170,11 @@ def train(use_cuda, save_dirname, is_local=True):
 
     exe = Executor(place)
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.movielens.train(), buf_size=8192),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
+    train_reader = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.movielens.train(), buf_size=8192),
+                                batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(paddle.dataset.movielens.test(),
+                               batch_size=BATCH_SIZE)
 
     feed_order = [
         'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
@@ -212,10 +211,11 @@ def train_loop(main_program):
                     if test_avg_cost < 6.0:
                         # if avg_cost less than 6.0, we think our code is good.
                         if save_dirname is not None:
-                            fluid.io.save_inference_model(save_dirname, [
-                                "user_id", "gender_id", "age_id", "job_id",
-                                "movie_id", "category_id", "movie_title"
-                            ], [scale_infer], exe)
+                            fluid.io.save_inference_model(
+                                save_dirname, [
+                                    "user_id", "gender_id", "age_id", "job_id",
+                                    "movie_id", "category_id", "movie_title"
+                                ], [scale_infer], exe)
                         return
 
                 if math.isnan(float(out[0])):
@@ -289,13 +289,11 @@ def infer(use_cuda, save_dirname=None):
 
         assert feed_target_names[5] == "category_id"
         category_id = fluid.create_lod_tensor(
-            [np.array(
-                [10, 8, 9], dtype='int64')], [[3]], place)
+            [np.array([10, 8, 9], dtype='int64')], [[3]], place)
 
         assert feed_target_names[6] == "movie_title"
         movie_title = fluid.create_lod_tensor(
-            [np.array(
-                [1069, 4140, 2923, 710, 988], dtype='int64')], [[5]],
+            [np.array([1069, 4140, 2923, 710, 988], dtype='int64')], [[5]],
             place)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index 3791e386ecfde..7a31035d2fb22 100644
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -46,18 +46,16 @@ def bi_lstm_encoder(input_seq, hidden_size):
     input_forward_proj = fluid.layers.fc(input=input_seq,
                                          size=hidden_size * 4,
                                          bias_attr=True)
-    forward, _ = fluid.layers.dynamic_lstm(
-        input=input_forward_proj,
-        size=hidden_size * 4,
-        use_peepholes=USE_PEEPHOLES)
+    forward, _ = fluid.layers.dynamic_lstm(input=input_forward_proj,
+                                           size=hidden_size * 4,
+                                           use_peepholes=USE_PEEPHOLES)
     input_backward_proj = fluid.layers.fc(input=input_seq,
                                           size=hidden_size * 4,
                                           bias_attr=True)
-    backward, _ = fluid.layers.dynamic_lstm(
-        input=input_backward_proj,
-        size=hidden_size * 4,
-        is_reverse=True,
-        use_peepholes=USE_PEEPHOLES)
+    backward, _ = fluid.layers.dynamic_lstm(input=input_backward_proj,
+                                            size=hidden_size * 4,
+                                            is_reverse=True,
+                                            use_peepholes=USE_PEEPHOLES)
 
     forward_last = fluid.layers.sequence_last_step(input=forward)
     backward_first = fluid.layers.sequence_first_step(input=backward)
@@ -67,6 +65,7 @@ def bi_lstm_encoder(input_seq, hidden_size):
 
 # FIXME(peterzhang2029): Replace this function with the lstm_unit_op.
 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+
     def linear(inputs):
         return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
 
@@ -76,13 +75,12 @@ def linear(inputs):
     cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
 
     cell_t = fluid.layers.sums(input=[
-        fluid.layers.elementwise_mul(
-            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
-                x=input_gate, y=cell_tilde)
+        fluid.layers.elementwise_mul(x=forget_gate, y=cell_t_prev),
+        fluid.layers.elementwise_mul(x=input_gate, y=cell_tilde)
     ])
 
-    hidden_t = fluid.layers.elementwise_mul(
-        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+    hidden_t = fluid.layers.elementwise_mul(x=output_gate,
+                                            y=fluid.layers.tanh(x=cell_t))
 
     return hidden_t, cell_t
 
@@ -104,8 +102,8 @@ def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
 
         hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
         cell_mem = rnn.memory(init=cell_init)
-        decoder_inputs = fluid.layers.concat(
-            input=[context, current_word], axis=1)
+        decoder_inputs = fluid.layers.concat(input=[context, current_word],
+                                             axis=1)
         h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
         rnn.update_memory(hidden_mem, h)
         rnn.update_memory(cell_mem, c)
@@ -120,8 +118,10 @@ def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
 def seq_to_seq_net():
     """Construct a seq2seq network."""
 
-    src_word_idx = fluid.layers.data(
-        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+    src_word_idx = fluid.layers.data(name='source_sequence',
+                                     shape=[1],
+                                     dtype='int64',
+                                     lod_level=1)
 
     src_embedding = fluid.layers.embedding(
         input=src_word_idx,
@@ -139,8 +139,10 @@ def seq_to_seq_net():
                                    bias_attr=False,
                                    act='tanh')
 
-    trg_word_idx = fluid.layers.data(
-        name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+    trg_word_idx = fluid.layers.data(name='target_sequence',
+                                     shape=[1],
+                                     dtype='int64',
+                                     lod_level=1)
 
     trg_embedding = fluid.layers.embedding(
         input=trg_word_idx,
@@ -149,8 +151,10 @@ def seq_to_seq_net():
 
     prediction = lstm_decoder_without_attention(trg_embedding, decoder_boot,
                                                 encoded_vector, decoder_size)
-    label = fluid.layers.data(
-        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+    label = fluid.layers.data(name='label_sequence',
+                              shape=[1],
+                              dtype='int64',
+                              lod_level=1)
     cost = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_cost = fluid.layers.mean(cost)
 
@@ -163,10 +167,9 @@ def train(use_cuda, save_dirname=None):
     optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
     optimizer.minimize(avg_cost)
 
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
+    train_data = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+                              batch_size=batch_size)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = Executor(place)
@@ -194,8 +197,8 @@ def train(use_cuda, save_dirname=None):
             if batch_id > 3:
                 if save_dirname is not None:
                     fluid.io.save_inference_model(
-                        save_dirname, ['source_sequence',
-                                       'target_sequence'], [prediction], exe)
+                        save_dirname, ['source_sequence', 'target_sequence'],
+                        [prediction], exe)
                 return
 
             batch_id += 1
@@ -230,10 +233,16 @@ def infer(use_cuda, save_dirname=None):
         recursive_seq_lens = [[4, 6]]
         base_shape = [1]
         # The range of random integers is [low, high]
-        word_data = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=1)
-        trg_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=1)
+        word_data = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                      base_shape,
+                                                      place,
+                                                      low=0,
+                                                      high=1)
+        trg_word = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                     base_shape,
+                                                     place,
+                                                     low=0,
+                                                     high=1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
@@ -264,6 +273,7 @@ def main(use_cuda):
 
 
 class TestRnnEncoderDecoder(unittest.TestCase):
+
     def test_cuda(self):
         with self.scope_prog_guard():
             main(use_cuda=True)
diff --git a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py
index 650ccc0776a50..37d5106e8502d 100644
--- a/python/paddle/fluid/tests/book/test_word2vec_book.py
+++ b/python/paddle/fluid/tests/book/test_word2vec_book.py
@@ -54,30 +54,26 @@ def train(target,
     IS_SPARSE = is_sparse
 
     def __network__(words):
-        embed_first = fluid.layers.embedding(
-            input=words[0],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
-        embed_second = fluid.layers.embedding(
-            input=words[1],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
-        embed_third = fluid.layers.embedding(
-            input=words[2],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
-        embed_forth = fluid.layers.embedding(
-            input=words[3],
-            size=[dict_size, EMBED_SIZE],
-            dtype='float32',
-            is_sparse=IS_SPARSE,
-            param_attr='shared_w')
+        embed_first = fluid.layers.embedding(input=words[0],
+                                             size=[dict_size, EMBED_SIZE],
+                                             dtype='float32',
+                                             is_sparse=IS_SPARSE,
+                                             param_attr='shared_w')
+        embed_second = fluid.layers.embedding(input=words[1],
+                                              size=[dict_size, EMBED_SIZE],
+                                              dtype='float32',
+                                              is_sparse=IS_SPARSE,
+                                              param_attr='shared_w')
+        embed_third = fluid.layers.embedding(input=words[2],
+                                             size=[dict_size, EMBED_SIZE],
+                                             dtype='float32',
+                                             is_sparse=IS_SPARSE,
+                                             param_attr='shared_w')
+        embed_forth = fluid.layers.embedding(input=words[3],
+                                             size=[dict_size, EMBED_SIZE],
+                                             dtype='float32',
+                                             is_sparse=IS_SPARSE,
+                                             param_attr='shared_w')
 
         concat_embed = fluid.layers.concat(
             input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
@@ -117,8 +113,8 @@ def __network__(words):
 
     sgd_optimizer.minimize(avg_cost, fluid.default_startup_program())
 
-    train_reader = paddle.batch(
-        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+    train_reader = paddle.batch(paddle.dataset.imikolov.train(word_dict, N),
+                                BATCH_SIZE)
 
     place = get_place(target)
     exe = fluid.Executor(place)
@@ -138,9 +134,10 @@ def train_loop(main_program):
                                       fetch_list=[avg_cost])
                 if avg_cost_np[0] < 5.0:
                     if save_dirname is not None and not pure_bf16:
-                        fluid.io.save_inference_model(save_dirname, [
-                            'firstw', 'secondw', 'thirdw', 'forthw'
-                        ], [predict_word], exe)
+                        fluid.io.save_inference_model(
+                            save_dirname,
+                            ['firstw', 'secondw', 'thirdw', 'forthw'],
+                            [predict_word], exe)
                     return
                 if math.isnan(float(avg_cost_np[0])):
                     sys.exit("got NaN loss, training failed.")
@@ -200,14 +197,26 @@ def infer(target, save_dirname=None):
         recursive_seq_lens = [[1]]
         base_shape = [1]
         # The range of random integers is [low, high]
-        first_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-        second_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-        third_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
-        fourth_word = fluid.create_random_int_lodtensor(
-            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+        first_word = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                       base_shape,
+                                                       place,
+                                                       low=0,
+                                                       high=dict_size - 1)
+        second_word = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                        base_shape,
+                                                        place,
+                                                        low=0,
+                                                        high=dict_size - 1)
+        third_word = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                       base_shape,
+                                                       place,
+                                                       low=0,
+                                                       high=dict_size - 1)
+        fourth_word = fluid.create_random_int_lodtensor(recursive_seq_lens,
+                                                        base_shape,
+                                                        place,
+                                                        low=0,
+                                                        high=dict_size - 1)
 
         assert feed_target_names[0] == 'firstw'
         assert feed_target_names[1] == 'secondw'
@@ -274,13 +283,12 @@ def main(target, is_sparse, is_parallel, use_bf16, pure_bf16):
         # so only inference is turned on.
         train("cpu", is_sparse, is_parallel, save_dirname)
     else:
-        train(
-            target,
-            is_sparse,
-            is_parallel,
-            save_dirname,
-            use_bf16=use_bf16,
-            pure_bf16=pure_bf16)
+        train(target,
+              is_sparse,
+              is_parallel,
+              save_dirname,
+              use_bf16=use_bf16,
+              pure_bf16=pure_bf16)
     infer(target, save_dirname)
 
 
@@ -298,11 +306,10 @@ def inject_test_method(target,
                        is_parallel,
                        use_bf16=False,
                        pure_bf16=False):
-    fn_name = "test_{0}_{1}_{2}{3}".format(target, "sparse"
-                                           if is_sparse else "dense", "parallel"
-                                           if is_parallel else "normal",
-                                           "_purebf16" if pure_bf16 else "_bf16"
-                                           if use_bf16 else "")
+    fn_name = "test_{0}_{1}_{2}{3}".format(
+        target, "sparse" if is_sparse else "dense",
+        "parallel" if is_parallel else "normal",
+        "_purebf16" if pure_bf16 else "_bf16" if use_bf16 else "")
 
     def __impl__(*args, **kwargs):
         prog = fluid.Program()
@@ -312,13 +319,13 @@ def __impl__(*args, **kwargs):
             with fluid.program_guard(prog, startup_prog):
                 main(target, is_sparse, is_parallel, use_bf16, pure_bf16)
 
-    if (not fluid.core.is_compiled_with_cuda() or
-            target == "cuda") and is_sparse:
+    if (not fluid.core.is_compiled_with_cuda()
+            or target == "cuda") and is_sparse:
         fn = __impl__
     else:
         # skip the other test when on CI server
-        fn = unittest.skipUnless(
-            condition=FULL_TEST, reason=SKIP_REASON)(__impl__)
+        fn = unittest.skipUnless(condition=FULL_TEST,
+                                 reason=SKIP_REASON)(__impl__)
 
     setattr(W2VTest, fn_name, fn)
 
diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
index d52882acfc9ac..94de1a39ccfbb 100644
--- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
+++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
@@ -25,6 +25,7 @@
 # cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid
 # for C/ObjC but not for C++
 class BuildExt(build_ext):
+
     def build_extensions(self):
         if '-Wstrict-prototypes' in self.compiler.compiler_so:
             self.compiler.compiler_so.remove('-Wstrict-prototypes')
@@ -74,9 +75,8 @@ def build_extensions(self):
     libraries=libs,
     extra_compile_args=paddle_extra_compile_args)
 
-setup(
-    name='custom_kernel_dot',
-    version='1.0',
-    description='custom kernel fot compiling',
-    cmdclass={'build_ext': BuildExt},
-    ext_modules=[custom_kernel_dot_module])
+setup(name='custom_kernel_dot',
+      version='1.0',
+      description='custom kernel fot compiling',
+      cmdclass={'build_ext': BuildExt},
+      ext_modules=[custom_kernel_dot_module])
diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
index 13d8a29e71b41..d1929fef5cc54 100644
--- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
+++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 # use dot <CPU, ANY, INT8> as test case.
 class TestCustomKernelDot(unittest.TestCase):
+
     def setUp(self):
         # compile so and set to current path
         cur_dir = os.path.dirname(os.path.abspath(__file__))
@@ -48,8 +49,8 @@ def test_custom_kernel_dot_run(self):
 
         self.assertTrue(
             np.array_equal(out.numpy(), result),
-            "custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
-                                                                    result))
+            "custom kernel dot out: {},\n numpy dot out: {}".format(
+                out.numpy(), result))
 
     def tearDown(self):
         del os.environ['CUSTOM_DEVICE_ROOT']
diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py
index 1d7b29e851192..a4def8df9e08c 100644
--- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py
+++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py
@@ -20,6 +20,7 @@
 
 
 class TestCustomKernelLoad(unittest.TestCase):
+
     def setUp(self):
         # compile so and set to current path
         cur_dir = os.path.dirname(os.path.abspath(__file__))
@@ -32,8 +33,9 @@ def setUp(self):
         # get paddle lib path and place so
         paddle_lib_path = ''
         site_dirs = site.getsitepackages() if hasattr(
-            site, 'getsitepackages'
-        ) else [x for x in sys.path if 'site-packages' in x]
+            site, 'getsitepackages') else [
+                x for x in sys.path if 'site-packages' in x
+            ]
         for site_dir in site_dirs:
             lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
             if os.path.exists(lib_dir):
@@ -65,8 +67,8 @@ def test_custom_kernel_dot_load(self):
 
         self.assertTrue(
             np.array_equal(out.numpy(), result),
-            "custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
-                                                                    result))
+            "custom kernel dot out: {},\n numpy dot out: {}".format(
+                out.numpy(), result))
 
     def tearDown(self):
         cmd = 'rm -rf {}'.format(self.default_path)
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index b4adeb9575af6..f21fc730fc8de 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -1,20 +1,22 @@
 # New custom OP can support Windows/Linux/Mac now
 if(WITH_GPU OR APPLE)
-    py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py)
-    py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py)
-    py_test(test_custom_relu_model SRCS test_custom_relu_model.py)
-    py_test(test_context_pool SRCS test_context_pool.py)
+  py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py)
+  py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py)
+  py_test(test_custom_relu_model SRCS test_custom_relu_model.py)
+  py_test(test_context_pool SRCS test_context_pool.py)
 
-    # Compiling shared library will cost some time, but running process is very fast.
-    set_tests_properties(test_custom_relu_op_setup PROPERTIES TIMEOUT 250)
-    set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180)
-    set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180)
-    set_tests_properties(test_context_pool PROPERTIES TIMEOUT 180)
-    if($ENV{USE_STANDALONE_EXECUTOR})
-        # these test will fail in some server due to PR#42149, temporarily set it use old executor.
-        set_tests_properties(test_custom_relu_op_setup PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-        set_tests_properties(test_custom_relu_model PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-    endif()
+  # Compiling the shared library takes some time, but the tests themselves run very fast.
+  set_tests_properties(test_custom_relu_op_setup PROPERTIES TIMEOUT 250)
+  set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180)
+  set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180)
+  set_tests_properties(test_context_pool PROPERTIES TIMEOUT 180)
+  if($ENV{USE_STANDALONE_EXECUTOR})
+    # These tests fail on some servers due to PR#42149; temporarily use the old executor.
+    set_tests_properties(test_custom_relu_op_setup
+                         PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+    set_tests_properties(test_custom_relu_model
+                         PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  endif()
 endif()
 
 py_test(test_custom_raw_op_kernel_op SRCS test_custom_raw_op_kernel_op.py)
@@ -35,5 +37,5 @@ py_test(test_sysconfig SRCS test_sysconfig.py)
 py_test(test_check_abi SRCS test_check_abi.py)
 
 if(APPLE)
-    set_tests_properties(test_custom_simple_slice PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_custom_simple_slice PROPERTIES TIMEOUT 300)
 endif()
diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op_setup.py b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op_setup.py
index 8889a56ad204d..e751a335d7231 100644
--- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op_setup.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -40,11 +40,9 @@
     macros.append(("PADDLE_WITH_NCCL", None))
 
 include_dirs = list(paddle_includes) + [cwd]
-setup(
-    name=os.getenv("MODULE_NAME", "custom_raw_op_kernel_op_setup"),
-    ext_modules=extension(
-        sources=sources,
-        include_dirs=include_dirs,
-        extra_compile_args=extra_compile_args,
-        _compile_dir=compile_dir,
-        define_macros=macros))
+setup(name=os.getenv("MODULE_NAME", "custom_raw_op_kernel_op_setup"),
+      ext_modules=extension(sources=sources,
+                            include_dirs=include_dirs,
+                            extra_compile_args=extra_compile_args,
+                            _compile_dir=compile_dir,
+                            define_macros=macros))
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
index 04399a9826cfe..f1860635ed5f4 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
@@ -17,8 +17,7 @@
 
 #include "paddle/extension.h"
 
-#define CHECK_CPU_INPUT(x) \
-  PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
+#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
 
 template <typename data_t>
 void relu_cpu_forward_kernel(const data_t* x_data,
@@ -26,7 +25,7 @@ void relu_cpu_forward_kernel(const data_t* x_data,
                              int64_t x_numel) {
   PD_CHECK(x_data != nullptr, "x_data is nullptr.");
   PD_CHECK(out_data != nullptr, "out_data is nullptr.");
-  for (int i = 0; i < x_numel; ++i) {
+  for (int64_t i = 0; i < x_numel; ++i) {
     out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
   }
 }
@@ -36,7 +35,7 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
                               const data_t* out_data,
                               data_t* grad_x_data,
                               int64_t out_numel) {
-  for (int i = 0; i < out_numel; ++i) {
+  for (int64_t i = 0; i < out_numel; ++i) {
     grad_x_data[i] =
         grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
   }
@@ -54,12 +53,12 @@ void relu_cpu_double_backward_kernel(const data_t* out_data,
 }
 
 std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
-  auto out = paddle::empty(x.shape(), x.dtype(), x.place());
+  auto out = paddle::empty_like(x);
 
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
         relu_cpu_forward_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+            x.data<data_t>(), out.data<data_t>(), x.numel());
       }));
 
   return {out};
@@ -68,13 +67,13 @@ std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
 std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
                                               const paddle::Tensor& out,
                                               const paddle::Tensor& grad_out) {
-  auto grad_x = paddle::empty(x.shape(), x.dtype(), x.place());
+  auto grad_x = paddle::empty_like(x);
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
                                relu_cpu_backward_kernel<data_t>(
                                    grad_out.data<data_t>(),
                                    out.data<data_t>(),
-                                   grad_x.mutable_data<data_t>(x.place()),
+                                   grad_x.data<data_t>(),
                                    out.size());
                              }));
 
@@ -108,9 +107,9 @@ std::vector<paddle::Tensor> relu_cuda_double_backward(
     const paddle::Tensor& out, const paddle::Tensor& ddx);
 
 std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
-  if (x.place() == paddle::PlaceType::kCPU) {
+  if (x.is_cpu()) {
     return relu_cpu_forward(x);
-  } else if (x.place() == paddle::PlaceType::kGPU) {
+  } else if (x.is_gpu()) {
     return relu_cuda_forward(x);
   } else {
     PD_THROW("Not implemented.");
@@ -120,10 +119,9 @@ std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
 std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
                                          const paddle::Tensor& out,
                                          const paddle::Tensor& grad_out) {
-  // TODO(chenweihang): Check Input
-  if (x.place() == paddle::PlaceType::kCPU) {
+  if (x.is_cpu()) {
     return relu_cpu_backward(x, out, grad_out);
-  } else if (x.place() == paddle::PlaceType::kGPU) {
+  } else if (x.is_gpu()) {
     return relu_cuda_backward(x, out, grad_out);
   } else {
     PD_THROW("Not implemented.");
@@ -214,7 +212,7 @@ void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) {
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
         relu_cpu_forward_kernel<data_t>(
-            x.data<data_t>(), out->mutable_data<data_t>(x.place()), x.size());
+            x.data<data_t>(), out->mutable_data<data_t>(x.place()), x.numel());
       }));
 }
 
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
index 18f1a2b95c2ee..f9314ea4b1066 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -14,15 +14,14 @@
 
 #include "paddle/extension.h"
 
-#define CHECK_GPU_INPUT(x) \
-  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
+#define CHECK_GPU_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
 
 template <typename data_t>
 __global__ void relu_cuda_forward_kernel(const data_t* x,
                                          data_t* y,
-                                         const int num) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+                                         int64_t num) {
+  int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) {
     y[i] = x[i] > static_cast<data_t>(0.) ? x[i] : static_cast<data_t>(0.);
   }
 }
@@ -31,9 +30,9 @@ template <typename data_t>
 __global__ void relu_cuda_backward_kernel(const data_t* dy,
                                           const data_t* y,
                                           data_t* dx,
-                                          const int num) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+                                          int64_t num) {
+  int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) {
     dx[i] = dy[i] * (y[i] > static_cast<data_t>(0.) ? static_cast<data_t>(1.)
                                                     : static_cast<data_t>(0.));
   }
@@ -54,15 +53,15 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data,
 
 std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
   CHECK_GPU_INPUT(x);
-  auto out = paddle::empty(x.shape(), x.dtype(), x.place());
+  auto out = paddle::empty_like(x);
 
-  int numel = x.size();
-  int block = 512;
-  int grid = (numel + block - 1) / block;
+  int64_t numel = x.numel();
+  int64_t block = 512;
+  int64_t grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "relu_cuda_forward_kernel", ([&] {
         relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+            x.data<data_t>(), out.data<data_t>(), numel);
       }));
 
   return {out};
@@ -74,11 +73,11 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
   CHECK_GPU_INPUT(x);
   CHECK_GPU_INPUT(out);
   CHECK_GPU_INPUT(grad_out);
-  auto grad_x = paddle::empty(x.shape(), x.dtype(), x.place());
+  auto grad_x = paddle::empty_like(x);
 
-  int numel = out.size();
-  int block = 512;
-  int grid = (numel + block - 1) / block;
+  int64_t numel = out.numel();
+  int64_t block = 512;
+  int64_t grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       out.type(), "relu_cuda_backward_kernel", ([&] {
         relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
@@ -97,7 +96,7 @@ std::vector<paddle::Tensor> relu_cuda_double_backward(
   CHECK_GPU_INPUT(ddx);
   auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
 
-  int64_t numel = out.size();
+  int64_t numel = out.numel();
   int64_t block = 512;
   int64_t grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
@@ -119,7 +118,7 @@ std::vector<paddle::Tensor> relu_cuda_backward_without_x(
     const paddle::Tensor& out, const paddle::Tensor& grad_out) {
   auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place());
 
-  int numel = out.size();
+  int numel = out.numel();
   int block = 512;
   int grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
@@ -135,7 +134,7 @@ std::vector<paddle::Tensor> relu_cuda_backward_without_x(
 }
 
 void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) {
-  int numel = x.size();
+  int numel = x.numel();
   int block = 512;
   int grid = (numel + block - 1) / block;
   out->reshape(x.shape());
@@ -150,7 +149,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x,
                             const paddle::Tensor& out,
                             const paddle::Tensor& grad_out,
                             paddle::Tensor* grad_x) {
-  int numel = out.size();
+  int numel = out.numel();
   int block = 512;
   int grid = (numel + block - 1) / block;
   grad_x->reshape(x.shape());
diff --git a/python/paddle/fluid/tests/custom_op/ps_usr_print_log b/python/paddle/fluid/tests/custom_op/ps_usr_print_log
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/python/paddle/fluid/tests/custom_op/test_check_abi.py b/python/paddle/fluid/tests/custom_op/test_check_abi.py
index baef25d2d1162..727e02f62cd37 100644
--- a/python/paddle/fluid/tests/custom_op/test_check_abi.py
+++ b/python/paddle/fluid/tests/custom_op/test_check_abi.py
@@ -21,6 +21,7 @@
 
 
 class TestABIBase(unittest.TestCase):
+
     def test_environ(self):
         compiler_list = ['gcc', 'cl']
         for compiler in compiler_list:
@@ -35,6 +36,7 @@ def del_environ(self):
 
 
 class TestCheckCompiler(TestABIBase):
+
     def test_expected_compiler(self):
         if utils.OS_NAME.startswith('linux'):
             gt = ['gcc', 'g++', 'gnu-c++', 'gnu-cc']
@@ -85,8 +87,8 @@ def test_exception_windows(self):
                 self.assertFalse(flag)
                 # check ABI Compatibility WARNING
                 self.assertTrue(len(error) == 1)
-                self.assertTrue("Failed to check compiler version for" in
-                                str(error[0].message))
+                self.assertTrue("Failed to check compiler version for" in str(
+                    error[0].message))
 
     def test_exception_linux(self):
         # clear environ
@@ -106,8 +108,8 @@ def fake():
                 self.assertFalse(flag)
                 # check ABI Compatibility WARNING
                 self.assertTrue(len(error) == 1)
-                self.assertTrue("Failed to check compiler version for" in
-                                str(error[0].message))
+                self.assertTrue("Failed to check compiler version for" in str(
+                    error[0].message))
 
             # restore
             utils._expected_compiler_current_platform = raw_func
@@ -136,6 +138,7 @@ def fake():
 
 
 class TestRunCMDException(unittest.TestCase):
+
     def test_exception(self):
         for verbose in [True, False]:
             with self.assertRaisesRegexp(RuntimeError, "Failed to run command"):
diff --git a/python/paddle/fluid/tests/custom_op/test_context_pool.py b/python/paddle/fluid/tests/custom_op/test_context_pool.py
index d532b29688b39..d4a079ee4fe10 100644
--- a/python/paddle/fluid/tests/custom_op/test_context_pool.py
+++ b/python/paddle/fluid/tests/custom_op/test_context_pool.py
@@ -24,8 +24,8 @@
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\context_pool_jit\\context_pool_jit.pyd'.format(get_build_directory(
-))
+file = '{}\\context_pool_jit\\context_pool_jit.pyd'.format(
+    get_build_directory())
 if os.name == 'nt' and os.path.isfile(file):
     cmd = 'del {}'.format(file)
     run_cmd(cmd, True)
@@ -41,6 +41,7 @@
 
 
 class TestContextPool(unittest.TestCase):
+
     def setUp(self):
         self.devices = ['cpu']
         if paddle.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py
index f69451252434c..953ca5519060f 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py
@@ -22,10 +22,10 @@
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from paddle.fluid.framework import _test_eager_guard
 
-# Because Windows don't use docker, the shared lib already exists in the 
+# Because Windows doesn't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_attrs_jit\\custom_attrs_jit.pyd'.format(get_build_directory(
-))
+file = '{}\\custom_attrs_jit\\custom_attrs_jit.pyd'.format(
+    get_build_directory())
 if os.name == 'nt' and os.path.isfile(file):
     cmd = 'del {}'.format(file)
     run_cmd(cmd, True)
@@ -41,6 +41,7 @@
 
 
 class TestJitCustomAttrs(unittest.TestCase):
+
     def setUp(self):
         paddle.set_device('cpu')
         # prepare test value
@@ -57,10 +58,11 @@ def setUp(self):
     def func_attr_value(self):
         x = paddle.ones([2, 2], dtype='float32')
         x.stop_gradient = False
-        out = custom_attrs.attr_test(
-            x, self.bool_attr, self.int_attr, self.float_attr, self.int64_attr,
-            self.str_attr, self.int_vec_attr, self.float_vec_attr,
-            self.int64_vec_attr, self.str_vec_attr)
+        out = custom_attrs.attr_test(x, self.bool_attr, self.int_attr,
+                                     self.float_attr, self.int64_attr,
+                                     self.str_attr, self.int_vec_attr,
+                                     self.float_vec_attr, self.int64_vec_attr,
+                                     self.str_vec_attr)
         out.stop_gradient = False
         out.backward()
 
@@ -74,10 +76,12 @@ def test_attr_value(self):
     def func_const_attr_value(self):
         x = paddle.ones([2, 2], dtype='float32')
         x.stop_gradient = False
-        out = custom_attrs.const_attr_test(
-            x, self.bool_attr, self.int_attr, self.float_attr, self.int64_attr,
-            self.str_attr, self.int_vec_attr, self.float_vec_attr,
-            self.int64_vec_attr, self.str_vec_attr)
+        out = custom_attrs.const_attr_test(x, self.bool_attr, self.int_attr,
+                                           self.float_attr, self.int64_attr,
+                                           self.str_attr, self.int_vec_attr,
+                                           self.float_vec_attr,
+                                           self.int64_vec_attr,
+                                           self.str_vec_attr)
         out.stop_gradient = False
         out.backward()
 
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
index 2a5d037bdad49..83be96a95a85d 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -48,8 +48,7 @@
 def concat_dynamic(func, dtype, np_inputs, axis_v, with_attr=False):
     paddle.set_device("cpu")
     inputs = [
-        paddle.to_tensor(
-            x, dtype=dtype, stop_gradient=False) for x in np_inputs
+        paddle.to_tensor(x, dtype=dtype, stop_gradient=False) for x in np_inputs
     ]
     if with_attr:
         axis = axis_v
@@ -103,6 +102,7 @@ def concat_static(func, dtype, np_inputs, axis_v, with_attr=False):
 
 
 class TestCustomConcatDynamicAxisJit(unittest.TestCase):
+
     def setUp(self):
         self.dtypes = ['float32', 'float64', 'int32', 'int64']
         self.np_inputs = [
@@ -114,8 +114,8 @@ def setUp(self):
     def check_output(self, out, pd_out, name):
         self.assertTrue(
             np.array_equal(out, pd_out),
-            "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
-                                                           pd_out))
+            "custom op {}: {},\n paddle api {}: {}".format(
+                name, out, name, pd_out))
 
     def func_dynamic(self):
         for dtype in self.dtypes:
@@ -137,8 +137,9 @@ def test_dynamic(self):
     def test_static(self):
         for dtype in self.dtypes:
             for axis in self.axises:
-                out, x1_grad, x2_grad = concat_static(
-                    custom_ops.custom_concat, dtype, self.np_inputs, axis)
+                out, x1_grad, x2_grad = concat_static(custom_ops.custom_concat,
+                                                      dtype, self.np_inputs,
+                                                      axis)
                 pd_out, pd_x1_grad, pd_x2_grad = concat_static(
                     paddle.concat, dtype, self.np_inputs, axis)
 
@@ -152,8 +153,9 @@ def func_dynamic_with_attr(self):
                 out, grad_inputs = concat_dynamic(
                     custom_ops.custom_concat_with_attr, dtype, self.np_inputs,
                     axis, True)
-                pd_out, pd_grad_inputs = concat_dynamic(
-                    paddle.concat, dtype, self.np_inputs, axis, True)
+                pd_out, pd_grad_inputs = concat_dynamic(paddle.concat, dtype,
+                                                        self.np_inputs, axis,
+                                                        True)
 
                 self.check_output(out, pd_out, "out")
                 for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs):
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
index 5f3c107a9b22a..ea916ff55ecab 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_conj.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -91,6 +91,7 @@ def conj_static(func, shape, dtype, np_input):
 
 
 class TestCustomConjJit(unittest.TestCase):
+
     def setUp(self):
         self.dtypes = ['float32', 'float64']
         self.shape = [2, 20, 2, 3]
@@ -98,8 +99,8 @@ def setUp(self):
     def check_output(self, out, pd_out, name):
         self.assertTrue(
             np.array_equal(out, pd_out),
-            "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
-                                                           pd_out))
+            "custom op {}: {},\n paddle api {}: {}".format(
+                name, out, name, pd_out))
 
     def run_dynamic(self, dtype, np_input):
         out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input)
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py
index fba512d511c36..2309751659afe 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_linear.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py
@@ -56,8 +56,9 @@ def linear_static(func, device, dtype, np_x, np_weight, np_bias):
     with static.scope_guard(static.Scope()):
         with static.program_guard(static.Program()):
             x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype)
-            weight = static.data(
-                name="weight", shape=np_weight.shape, dtype=dtype)
+            weight = static.data(name="weight",
+                                 shape=np_weight.shape,
+                                 dtype=dtype)
             bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype)
             x.stop_gradient = False
             weight.stop_gradient = False
@@ -85,6 +86,7 @@ def linear_static(func, device, dtype, np_x, np_weight, np_bias):
 
 
 class TestCustomLinearJit(unittest.TestCase):
+
     def setUp(self):
         self.dtypes = ['float32', 'float64']
         self.devices = ['cpu']
@@ -97,8 +99,8 @@ def setUp(self):
     def check_output(self, out, pd_out, name):
         self.assertTrue(
             np.array_equal(out, pd_out),
-            "custom op {}: {},\n paddle api {}: {}".format(name, out, name,
-                                                           pd_out))
+            "custom op {}: {},\n paddle api {}: {}".format(
+                name, out, name, pd_out))
 
     def test_static(self):
         for device in self.devices:
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py
index 4da99b1ea1041..f95f57b4b7a99 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -36,8 +36,8 @@ def prepare_module_path():
     else:
         site_dir = site.getsitepackages()[0]
     custom_egg_path = [x for x in os.listdir(site_dir) if MODULE_NAME in x]
-    assert len(custom_egg_path) == 1, "Matched egg number is %d." % len(
-        custom_egg_path)
+    assert len(custom_egg_path
+               ) == 1, "Matched egg number is %d." % len(custom_egg_path)
     sys.path.append(os.path.join(site_dir, custom_egg_path[0]))
 
 
@@ -46,6 +46,7 @@ def prepare_module_path():
 # temporarily.
 @unittest.skipIf(os.name == "nt", "Windows does not support yet.")
 class TestCustomRawReluOp(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         path = os.path.dirname(os.path.abspath(__file__))
@@ -77,8 +78,8 @@ def test_static(self):
 
         exe = paddle.static.Executor()
         exe.run(paddle.static.default_startup_program())
-        x_np = np.random.uniform(
-            low=-1.0, high=1.0, size=[2, 3]).astype('float32')
+        x_np = np.random.uniform(low=-1.0, high=1.0, size=[2,
+                                                           3]).astype('float32')
         y1_value, y2_value = exe.run(paddle.static.default_main_program(),
                                      feed={x.name: x_np},
                                      fetch_list=[y1, y2])
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
index 4980a15922502..78078963a7dea 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
@@ -72,6 +72,7 @@ def forward(self, x):
 
 
 class TestDygraphModel(unittest.TestCase):
+
     def setUp(self):
 
         self.seed = 2021
@@ -96,8 +97,9 @@ def setUp(self):
         self.model_dy2stat_path = "infer_model/custom_relu_model_dy2sta"
 
         # for dy2stat
-        self.x_spec = paddle.static.InputSpec(
-            shape=[None, self.in_dim], dtype='float32', name='x')
+        self.x_spec = paddle.static.InputSpec(shape=[None, self.in_dim],
+                                              dtype='float32',
+                                              name='x')
 
     def func_train_eval(self):
         for device in self.devices:
@@ -107,7 +109,7 @@ def func_train_eval(self):
             # for train
             origin_relu_train_out = self.train_model(use_custom_op=False)
             custom_relu_train_out = self.train_model(use_custom_op=True)
-            # open this when dy2stat is ready for eager 
+            # open this when dy2stat is ready for eager
             if _in_legacy_dygraph():
                 custom_relu_dy2stat_train_out = self.train_model(
                     use_custom_op=True, dy2stat=True)  # for to_static
@@ -188,6 +190,7 @@ def eval_model(self, use_custom_op=False, dy2stat=False):
 
 
 class TestStaticModel(unittest.TestCase):
+
     def setUp(self):
         self.seed = 2021
         self.in_dim = 10
@@ -217,14 +220,16 @@ def tearDown(self):
     def test_train_eval(self):
         for device in self.devices:
             # for train
-            original_relu_train_out = self.train_model(
-                device, use_custom_op=False)
+            original_relu_train_out = self.train_model(device,
+                                                       use_custom_op=False)
             custom_relu_train_out = self.train_model(device, use_custom_op=True)
             # using PE
-            original_relu_train_pe_out = self.train_model(
-                device, use_custom_op=False, use_pe=True)
-            custom_relu_train_pe_out = self.train_model(
-                device, use_custom_op=True, use_pe=True)
+            original_relu_train_pe_out = self.train_model(device,
+                                                          use_custom_op=False,
+                                                          use_pe=True)
+            custom_relu_train_pe_out = self.train_model(device,
+                                                        use_custom_op=True,
+                                                        use_pe=True)
 
             self.assertTrue(
                 np.array_equal(original_relu_train_out, custom_relu_train_out))
@@ -233,14 +238,16 @@ def test_train_eval(self):
                                custom_relu_train_pe_out))
 
             # for eval
-            original_relu_eval_out = self.eval_model(
-                device, use_custom_op=False)
+            original_relu_eval_out = self.eval_model(device,
+                                                     use_custom_op=False)
             custom_relu_eval_out = self.eval_model(device, use_custom_op=True)
             # using PE
-            original_relu_eval_pe_out = self.eval_model(
-                device, use_custom_op=False, use_pe=True)
-            custom_relu_eval_pe_out = self.eval_model(
-                device, use_custom_op=True, use_pe=True)
+            original_relu_eval_pe_out = self.eval_model(device,
+                                                        use_custom_op=False,
+                                                        use_pe=True)
+            custom_relu_eval_pe_out = self.eval_model(device,
+                                                      use_custom_op=True,
+                                                      use_pe=True)
 
             self.assertTrue(
                 np.array_equal(original_relu_eval_out, custom_relu_eval_out))
@@ -258,10 +265,12 @@ def train_model(self, device, use_custom_op=False, use_pe=False):
         with paddle.static.scope_guard(paddle.static.Scope()):
             with paddle.static.program_guard(paddle.static.Program(),
                                              paddle.static.Program()):
-                x = paddle.static.data(
-                    shape=[None, self.in_dim], name='x', dtype='float32')
-                y = paddle.static.data(
-                    shape=[None, 1], name='y', dtype='float32')
+                x = paddle.static.data(shape=[None, self.in_dim],
+                                       name='x',
+                                       dtype='float32')
+                y = paddle.static.data(shape=[None, 1],
+                                       name='y',
+                                       dtype='float32')
 
                 net = Net(self.in_dim, self.out_dim, use_custom_op)
                 out = net(x)
@@ -279,8 +288,8 @@ def train_model(self, device, use_custom_op=False, use_pe=False):
                     ) if device is 'cpu' else paddle.static.cuda_places()
                     main_program = paddle.static.CompiledProgram(
                         paddle.static.default_main_program(
-                        )).with_data_parallel(
-                            loss_name=loss.name, places=places)
+                        )).with_data_parallel(loss_name=loss.name,
+                                              places=places)
                 else:
                     main_program = paddle.static.default_main_program()
 
@@ -289,14 +298,16 @@ def train_model(self, device, use_custom_op=False, use_pe=False):
                     y_data = self.labels[batch_id]
 
                     res = exe.run(main_program,
-                                  feed={'x': x_data,
-                                        'y': y_data},
+                                  feed={
+                                      'x': x_data,
+                                      'y': y_data
+                                  },
                                   fetch_list=[out])
 
                 # save model
                 paddle.static.save_inference_model(
-                    self.model_path_template.format(use_custom_op, use_pe),
-                    [x], [out], exe)
+                    self.model_path_template.format(use_custom_op, use_pe), [x],
+                    [out], exe)
 
                 return res[0]
 
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
index 072b3c6484e72..5052a0989bb89 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py
@@ -47,6 +47,7 @@
 
 
 class TestJITLoad(unittest.TestCase):
+
     def setUp(self):
         self.custom_ops = [
             custom_module.custom_relu, custom_module.custom_relu_dup,
@@ -84,8 +85,8 @@ def func_dynamic(self):
                 for custom_op in self.custom_ops:
                     out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
                                                       x)
-                    pd_out, pd_x_grad = custom_relu_dynamic(custom_op, device,
-                                                            dtype, x, False)
+                    pd_out, pd_x_grad = custom_relu_dynamic(
+                        custom_op, device, dtype, x, False)
                     self.assertTrue(
                         np.array_equal(out, pd_out),
                         "custom op out: {},\n paddle api out: {}".format(
@@ -132,8 +133,8 @@ def func_exception(self):
                 "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32`"
                 in str(e))
             self.assertTrue(
-                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu" in
-                str(e))
+                "python/paddle/fluid/tests/custom_op/custom_relu_op.cu" in str(
+                    e))
         self.assertTrue(caught_exception)
 
     def test_exception(self):
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
index 55c9571d44f11..29433b17153f5 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
@@ -85,9 +85,9 @@ def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
             exe.run(static.default_startup_program())
 
             # in static mode, x data has been covered by out
-            compiled_prog = static.CompiledProgram(static.default_main_program(
-            )).with_data_parallel(
-                loss_name=out.name, places=places)
+            compiled_prog = static.CompiledProgram(
+                static.default_main_program()).with_data_parallel(
+                    loss_name=out.name, places=places)
             out_v = exe.run(compiled_prog,
                             feed={'X': np_x},
                             fetch_list=[out.name])
@@ -102,8 +102,9 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
     with static.scope_guard(static.Scope()):
         with static.program_guard(static.Program()):
             # simple module
-            data = static.data(
-                name='data', shape=[None, 1, 28, 28], dtype='float32')
+            data = static.data(name='data',
+                               shape=[None, 1, 28, 28],
+                               dtype='float32')
             label = static.data(name='label', shape=[None, 1], dtype='int64')
 
             hidden = static.nn.fc(data, size=128)
@@ -123,8 +124,10 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
             # train
             for i in range(4):
                 avg_loss_v = exe.run(static.default_main_program(),
-                                     feed={'data': np_data,
-                                           'label': np_label},
+                                     feed={
+                                         'data': np_data,
+                                         'label': np_label
+                                     },
                                      fetch_list=[avg_loss])
 
             # save inference model
@@ -132,8 +135,10 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
 
             # get train predict value
             predict_v = exe.run(static.default_main_program(),
-                                feed={'data': np_data,
-                                      'label': np_label},
+                                feed={
+                                    'data': np_data,
+                                    'label': np_label
+                                },
                                 fetch_list=[predict])
 
     return predict_v
@@ -147,8 +152,10 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
     out = func(t) if use_func else paddle.nn.functional.relu(t)
     out.stop_gradient = False
 
-    dx = paddle.grad(
-        outputs=[out], inputs=[t], create_graph=True, retain_graph=True)
+    dx = paddle.grad(outputs=[out],
+                     inputs=[t],
+                     create_graph=True,
+                     retain_graph=True)
 
     dx[0].backward()
 
@@ -157,6 +164,7 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
 
 
 class TestNewCustomOpSetUpInstall(unittest.TestCase):
+
     def setUp(self):
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         # compile, install the custom op egg into site-packages under background
@@ -181,8 +189,8 @@ def setUp(self):
         custom_egg_path = [
             x for x in os.listdir(site_dir) if 'custom_relu_module_setup' in x
         ]
-        assert len(custom_egg_path) == 1, "Matched egg number is %d." % len(
-            custom_egg_path)
+        assert len(custom_egg_path
+                   ) == 1, "Matched egg number is %d." % len(custom_egg_path)
         sys.path.append(os.path.join(site_dir, custom_egg_path[0]))
 
         # usage: import the package directly
@@ -244,8 +252,8 @@ def func_dynamic(self):
                 for custom_op in self.custom_ops:
                     out, x_grad = custom_relu_dynamic(custom_op, device, dtype,
                                                       x)
-                    pd_out, pd_x_grad = custom_relu_dynamic(custom_op, device,
-                                                            dtype, x, False)
+                    pd_out, pd_x_grad = custom_relu_dynamic(
+                        custom_op, device, dtype, x, False)
                     self.assertTrue(
                         np.array_equal(out, pd_out),
                         "custom op out: {},\n paddle api out: {}".format(
@@ -266,8 +274,9 @@ def test_static_save_and_load_inference_model(self):
         np_label = np.random.random((1, 1)).astype("int64")
         path_prefix = "custom_op_inference/custom_relu"
         for device in self.devices:
-            predict = custom_relu_static_inference(
-                self.custom_ops[0], device, np_data, np_label, path_prefix)
+            predict = custom_relu_static_inference(self.custom_ops[0], device,
+                                                   np_data, np_label,
+                                                   path_prefix)
             # load inference model
             with static.scope_guard(static.Scope()):
                 exe = static.Executor()
@@ -290,14 +299,15 @@ def test_static_save_and_run_inference_predictor(self):
         from paddle.inference import Config
         from paddle.inference import create_predictor
         for device in self.devices:
-            predict = custom_relu_static_inference(
-                self.custom_ops[0], device, np_data, np_label, path_prefix)
+            predict = custom_relu_static_inference(self.custom_ops[0], device,
+                                                   np_data, np_label,
+                                                   path_prefix)
             # load inference model
             config = Config(path_prefix + ".pdmodel",
                             path_prefix + ".pdiparams")
             predictor = create_predictor(config)
-            input_tensor = predictor.get_input_handle(predictor.get_input_names(
-            )[0])
+            input_tensor = predictor.get_input_handle(
+                predictor.get_input_names()[0])
             input_tensor.reshape(np_data.shape)
             input_tensor.copy_from_cpu(np_data.copy())
             predictor.run()
@@ -305,8 +315,7 @@ def test_static_save_and_run_inference_predictor(self):
                 predictor.get_output_names()[0])
             predict_infer = output_tensor.copy_to_cpu()
             self.assertTrue(
-                np.isclose(
-                    predict, predict_infer, rtol=5e-5).any(),
+                np.isclose(predict, predict_infer, rtol=5e-5).any(),
                 "custom op predict: {},\n custom op infer predict: {}".format(
                     predict, predict_infer))
         paddle.disable_static()
@@ -323,8 +332,8 @@ def test_func_double_grad_dynamic(self):
                     self.custom_ops[0], device, dtype, x, False)
                 self.assertTrue(
                     np.array_equal(out, pd_out),
-                    "custom op out: {},\n paddle api out: {}".format(out,
-                                                                     pd_out))
+                    "custom op out: {},\n paddle api out: {}".format(
+                        out, pd_out))
                 self.assertTrue(
                     np.array_equal(dx_grad, pd_dx_grad),
                     "custom op dx grad: {},\n paddle api dx grad: {}".format(
@@ -335,24 +344,22 @@ def test_with_dataloader(self):
             paddle.set_device(device)
             # data loader
             transform = Compose(
-                [Normalize(
-                    mean=[127.5], std=[127.5], data_format='CHW')])
-            train_dataset = paddle.vision.datasets.MNIST(
-                mode='train', transform=transform)
-            train_loader = paddle.io.DataLoader(
-                train_dataset,
-                batch_size=64,
-                shuffle=True,
-                drop_last=True,
-                num_workers=0)
+                [Normalize(mean=[127.5], std=[127.5], data_format='CHW')])
+            train_dataset = paddle.vision.datasets.MNIST(mode='train',
+                                                         transform=transform)
+            train_loader = paddle.io.DataLoader(train_dataset,
+                                                batch_size=64,
+                                                shuffle=True,
+                                                drop_last=True,
+                                                num_workers=0)
 
             for batch_id, (image, _) in enumerate(train_loader()):
                 out = self.custom_ops[0](image)
                 pd_out = paddle.nn.functional.relu(image)
                 self.assertTrue(
                     np.array_equal(out, pd_out),
-                    "custom op out: {},\n paddle api out: {}".format(out,
-                                                                     pd_out))
+                    "custom op out: {},\n paddle api out: {}".format(
+                        out, pd_out))
 
                 if batch_id == 5:
                     break
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py
index 6b1fb7c71ae7e..4202545759cfd 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtaina copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -40,6 +40,7 @@
 
 
 class TestCustomSimpleSliceJit(unittest.TestCase):
+
     def func_slice_output(self):
         np_x = np.random.random((5, 2)).astype("float32")
         x = paddle.to_tensor(np_x)
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py
index 3b3a0e2edec98..5609376799371 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 import numpy as np
 
 import paddle
+import paddle.fluid as fluid
 import paddle.static as static
 from paddle.utils.cpp_extension import load, get_build_directory
 from paddle.utils.cpp_extension.extension_utils import run_cmd
@@ -40,6 +41,7 @@
 
 
 def custom_tanh_double_grad_dynamic(func, device, dtype, np_x):
+    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
     paddle.set_device(device)
 
     t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
@@ -47,17 +49,21 @@ def custom_tanh_double_grad_dynamic(func, device, dtype, np_x):
     out = func(t)
     out.stop_gradient = False
 
-    dx = paddle.grad(
-        outputs=[out], inputs=[t], create_graph=True, retain_graph=True)
+    dx = paddle.grad(outputs=[out],
+                     inputs=[t],
+                     create_graph=True,
+                     retain_graph=True)
 
     dx[0].backward()
 
     assert out.grad is not None
     assert dx[0].grad is not None
     return dx[0].numpy(), dx[0].grad.numpy(), out.grad.numpy()
+    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 class TestCustomTanhDoubleGradJit(unittest.TestCase):
+
     def setUp(self):
         paddle.set_device('cpu')
         self.dtypes = ['float32', 'float64']
@@ -73,8 +79,8 @@ def func_double_grad_dynamic(self):
                     paddle.tanh, device, dtype, x)
                 self.assertTrue(
                     np.allclose(out, pd_out),
-                    "custom op out: {},\n paddle api out: {}".format(out,
-                                                                     pd_out))
+                    "custom op out: {},\n paddle api out: {}".format(
+                        out, pd_out))
                 self.assertTrue(
                     np.allclose(dx_grad, pd_dx_grad),
                     "custom op dx grad: {},\n paddle api dx grad: {}".format(
@@ -85,9 +91,14 @@ def func_double_grad_dynamic(self):
                         dout, pd_dout))
 
     def test_func_double_grad_dynamic(self):
-        with _test_eager_guard():
-            self.func_double_grad_dynamic()
-        self.func_double_grad_dynamic()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+        try:
+            with _test_eager_guard():
+                self.func_double_grad_dynamic()
+            self.func_double_grad_dynamic()
+        finally:
+            # Always restore the global flag, even when an assertion fails.
+            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
index 0d2cb941eafaa..d48d25ea3b1c1 100644
--- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py
@@ -20,7 +20,7 @@
 from utils import paddle_includes, extra_cc_args
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from paddle.fluid.framework import _test_eager_guard
-# Because Windows don't use docker, the shared lib already exists in the 
+# Because Windows doesn't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
 file = '{}\\dispatch_op\\dispatch_op.pyd'.format(get_build_directory())
 if os.name == 'nt' and os.path.isfile(file):
@@ -36,6 +36,7 @@
 
 
 class TestJitDispatch(unittest.TestCase):
+
     def setUp(self):
         paddle.set_device('cpu')
 
diff --git a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py
index 4fc9270b0f44c..83731de32a4f0 100644
--- a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py
+++ b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py
@@ -23,7 +23,7 @@
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from utils import paddle_includes, extra_cc_args
 from paddle.fluid.framework import _test_eager_guard
-# Because Windows don't use docker, the shared lib already exists in the 
+# Because Windows doesn't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
 file = '{}\\multi_out_jit\\multi_out_jit.pyd'.format(get_build_directory())
 if os.name == 'nt' and os.path.isfile(file):
@@ -40,6 +40,7 @@
 
 
 class TestMultiOutputDtypes(unittest.TestCase):
+
     def setUp(self):
         self.custom_op = multi_out_module.multi_out
         self.dtypes = ['float32', 'float64']
@@ -70,11 +71,13 @@ def check_multi_outputs(self, outs, is_dynamic=False):
         # Fake_float64
         self.assertTrue('float64' in str(zero_float64.dtype))
         self.assertTrue(
-            np.array_equal(zero_float64, np.zeros([4, 8]).astype('float64')))
+            np.array_equal(zero_float64,
+                           np.zeros([4, 8]).astype('float64')))
         # ZFake_int32
         self.assertTrue('int32' in str(one_int32.dtype))
         self.assertTrue(
-            np.array_equal(one_int32, np.ones([4, 8]).astype('int32')))
+            np.array_equal(one_int32,
+                           np.ones([4, 8]).astype('int32')))
 
     def test_static(self):
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/custom_op/test_sysconfig.py b/python/paddle/fluid/tests/custom_op/test_sysconfig.py
index 78c0cdf0316ea..89ef36931f3ff 100644
--- a/python/paddle/fluid/tests/custom_op/test_sysconfig.py
+++ b/python/paddle/fluid/tests/custom_op/test_sysconfig.py
@@ -18,6 +18,7 @@
 
 
 class SysConfigTest(unittest.TestCase):
+
     def test_include(self):
         inc_dir = paddle.sysconfig.get_include()
         inc_dirs = inc_dir.split(os.sep)
diff --git a/python/paddle/fluid/tests/custom_op/utils.py b/python/paddle/fluid/tests/custom_op/utils.py
index 2d492da3d9725..82361f2a30465 100644
--- a/python/paddle/fluid/tests/custom_op/utils.py
+++ b/python/paddle/fluid/tests/custom_op/utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py
index 301bd0ff0039e..f37090f67e257 100644
--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
@@ -47,13 +47,14 @@
 
 def encoder():
     # encoder
-    src_word = layers.data(
-        name="src_word", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = layers.embedding(
-        input=src_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
+    src_word = layers.data(name="src_word",
+                           shape=[1],
+                           dtype='int64',
+                           lod_level=1)
+    src_embedding = layers.embedding(input=src_word,
+                                     size=[dict_size, word_dim],
+                                     dtype='float32',
+                                     is_sparse=IS_SPARSE)
 
     fc1 = layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
     lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
@@ -80,13 +81,14 @@ def updater(state_cell):
 
 def decoder_train(state_cell):
     # decoder
-    trg_language_word = layers.data(
-        name="target_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = layers.embedding(
-        input=trg_language_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
+    trg_language_word = layers.data(name="target_word",
+                                    shape=[1],
+                                    dtype='int64',
+                                    lod_level=1)
+    trg_embedding = layers.embedding(input=trg_language_word,
+                                     size=[dict_size, word_dim],
+                                     dtype='float32',
+                                     is_sparse=IS_SPARSE)
 
     decoder = TrainingDecoder(state_cell)
 
@@ -103,24 +105,27 @@ def decoder_train(state_cell):
 
 
 def decoder_decode(state_cell):
-    init_ids = layers.data(
-        name="init_ids", shape=[1], dtype="int64", lod_level=2)
-    init_scores = layers.data(
-        name="init_scores", shape=[1], dtype="float32", lod_level=2)
-
-    decoder = BeamSearchDecoder(
-        state_cell=state_cell,
-        init_ids=init_ids,
-        init_scores=init_scores,
-        target_dict_dim=target_dict_dim,
-        word_dim=word_dim,
-        input_var_dict={},
-        topk_size=topk_size,
-        sparse_emb=IS_SPARSE,
-        max_len=max_length,
-        beam_size=beam_size,
-        end_id=1,
-        name=None)
+    init_ids = layers.data(name="init_ids",
+                           shape=[1],
+                           dtype="int64",
+                           lod_level=2)
+    init_scores = layers.data(name="init_scores",
+                              shape=[1],
+                              dtype="float32",
+                              lod_level=2)
+
+    decoder = BeamSearchDecoder(state_cell=state_cell,
+                                init_ids=init_ids,
+                                init_scores=init_scores,
+                                target_dict_dim=target_dict_dim,
+                                word_dim=word_dim,
+                                input_var_dict={},
+                                topk_size=topk_size,
+                                sparse_emb=IS_SPARSE,
+                                max_len=max_length,
+                                beam_size=beam_size,
+                                end_id=1,
+                                name=None)
     decoder.decode()
     translation_ids, translation_scores = decoder()
 
@@ -135,18 +140,19 @@ def train_main(use_cuda):
     context = encoder()
     state_cell = decoder_state_cell(context)
     rnn_out = decoder_train(state_cell)
-    label = layers.data(
-        name="target_next_word", shape=[1], dtype='int64', lod_level=1)
+    label = layers.data(name="target_next_word",
+                        shape=[1],
+                        dtype='int64',
+                        lod_level=1)
     cost = layers.cross_entropy(input=rnn_out, label=label)
     avg_cost = layers.mean(x=cost)
 
     optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3)
     optimizer.minimize(avg_cost)
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
+    train_reader = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+                                batch_size=batch_size)
     feed_order = ['src_word', 'target_word', 'target_next_word']
 
     exe = Executor(place)
@@ -186,8 +192,8 @@ def decode_main(use_cuda):
     exe.run(framework.default_startup_program())
 
     init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
-    init_scores_data = np.array(
-        [1. for _ in range(batch_size)], dtype='float32')
+    init_scores_data = np.array([1. for _ in range(batch_size)],
+                                dtype='float32')
     init_ids_data = init_ids_data.reshape((batch_size, 1))
     init_scores_data = init_scores_data.reshape((batch_size, 1))
     init_lod = [1] * batch_size
@@ -196,10 +202,9 @@ def decode_main(use_cuda):
     init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
     init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
+    train_reader = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+                                batch_size=batch_size)
 
     feed_order = ['src_word']
     feed_list = [
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index d50c57e670b07..54ff53e370634 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -22,6 +22,7 @@
 
 
 class TestDataFeeder(unittest.TestCase):
+
     def test_lod_level_0_converter(self):
         img = fluid.layers.data(name='image', shape=[1, 28, 28])
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
@@ -42,16 +43,18 @@ def test_lod_level_0_converter(self):
     def test_lod_level_1_converter(self):
         # lod_level = 1
         # each sentence has a different number of words
-        sentences = fluid.layers.data(
-            name='sentences', shape=[1], dtype='int64', lod_level=1)
+        sentences = fluid.layers.data(name='sentences',
+                                      shape=[1],
+                                      dtype='int64',
+                                      lod_level=1)
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
         feeder = fluid.DataFeeder([sentences, label], fluid.CPUPlace())
 
         # lod = [[0, 3, 5, 9]]
         # data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
         # label = [1] * len(data)
-        result = feeder.feed(
-            [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
+        result = feeder.feed([([1, 2, 3], [1]), ([4, 5], [1]),
+                              ([6, 7, 8, 9], [1])])
 
         self.assertEqual(result['sentences'].shape(), [9, 1])
         self.assertEqual(result['label'].shape(), [3, 1])
@@ -62,16 +65,18 @@ def test_lod_level_1_converter(self):
     def test_lod_level_2_converter(self):
         # lod_level = 2
         # paragraphs -> sentences -> words
-        paragraphs = fluid.layers.data(
-            name='paragraphs', shape=[1], dtype='int64', lod_level=2)
+        paragraphs = fluid.layers.data(name='paragraphs',
+                                       shape=[1],
+                                       dtype='int64',
+                                       lod_level=2)
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
         feeder = fluid.DataFeeder([paragraphs, label], fluid.CPUPlace())
 
         # lod = [[0, 2, 3], [0, 3, 5, 9]]
         # data = [[[1, 2, 3], [4, 5]], [[6, 7, 8, 9]]]
         # label = [1] * len(data)
-        result = feeder.feed(
-            [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
+        result = feeder.feed([([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8,
+                                                             9]], [1])])
 
         self.assertEqual(result['paragraphs'].shape(), [9, 1])
         self.assertEqual(result['label'].shape(), [2, 1])
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index c45045509201d..046aa4c1f1726 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -30,6 +30,7 @@
 
 
 class LayerTest(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         cls.seed = 111
@@ -76,37 +77,35 @@ def dynamic_graph(self, force_to_use_cpu=False):
 
 
 class TestDetection(unittest.TestCase):
+
     def test_detection_output(self):
         program = Program()
         with program_guard(program):
-            pb = layers.data(
-                name='prior_box',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            pbv = layers.data(
-                name='prior_box_var',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            loc = layers.data(
-                name='target_box',
-                shape=[2, 10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            scores = layers.data(
-                name='scores',
-                shape=[2, 10, 20],
-                append_batch_size=False,
-                dtype='float32')
-            out = layers.detection_output(
-                scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv)
-            out2, index = layers.detection_output(
-                scores=scores,
-                loc=loc,
-                prior_box=pb,
-                prior_box_var=pbv,
-                return_index=True)
+            pb = layers.data(name='prior_box',
+                             shape=[10, 4],
+                             append_batch_size=False,
+                             dtype='float32')
+            pbv = layers.data(name='prior_box_var',
+                              shape=[10, 4],
+                              append_batch_size=False,
+                              dtype='float32')
+            loc = layers.data(name='target_box',
+                              shape=[2, 10, 4],
+                              append_batch_size=False,
+                              dtype='float32')
+            scores = layers.data(name='scores',
+                                 shape=[2, 10, 20],
+                                 append_batch_size=False,
+                                 dtype='float32')
+            out = layers.detection_output(scores=scores,
+                                          loc=loc,
+                                          prior_box=pb,
+                                          prior_box_var=pbv)
+            out2, index = layers.detection_output(scores=scores,
+                                                  loc=loc,
+                                                  prior_box=pb,
+                                                  prior_box_var=pbv,
+                                                  return_index=True)
             self.assertIsNotNone(out)
             self.assertIsNotNone(out2)
             self.assertIsNotNone(index)
@@ -118,11 +117,10 @@ def test_box_coder_api(self):
         with program_guard(program):
             x = layers.data(name='x', shape=[4], dtype='float32')
             y = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
-            bcoder = layers.box_coder(
-                prior_box=x,
-                prior_box_var=[0.1, 0.2, 0.1, 0.2],
-                target_box=y,
-                code_type='encode_center_size')
+            bcoder = layers.box_coder(prior_box=x,
+                                      prior_box_var=[0.1, 0.2, 0.1, 0.2],
+                                      target_box=y,
+                                      code_type='encode_center_size')
             self.assertIsNotNone(bcoder)
         print(str(program))
 
@@ -130,26 +128,28 @@ def test_box_coder_error(self):
         program = Program()
         with program_guard(program):
             x1 = fluid.data(name='x1', shape=[10, 4], dtype='int32')
-            y1 = fluid.data(
-                name='y1', shape=[10, 4], dtype='float32', lod_level=1)
+            y1 = fluid.data(name='y1',
+                            shape=[10, 4],
+                            dtype='float32',
+                            lod_level=1)
             x2 = fluid.data(name='x2', shape=[10, 4], dtype='float32')
-            y2 = fluid.data(
-                name='y2', shape=[10, 4], dtype='int32', lod_level=1)
-
-            self.assertRaises(
-                TypeError,
-                layers.box_coder,
-                prior_box=x1,
-                prior_box_var=[0.1, 0.2, 0.1, 0.2],
-                target_box=y1,
-                code_type='encode_center_size')
-            self.assertRaises(
-                TypeError,
-                layers.box_coder,
-                prior_box=x2,
-                prior_box_var=[0.1, 0.2, 0.1, 0.2],
-                target_box=y2,
-                code_type='encode_center_size')
+            y2 = fluid.data(name='y2',
+                            shape=[10, 4],
+                            dtype='int32',
+                            lod_level=1)
+
+            self.assertRaises(TypeError,
+                              layers.box_coder,
+                              prior_box=x1,
+                              prior_box_var=[0.1, 0.2, 0.1, 0.2],
+                              target_box=y1,
+                              code_type='encode_center_size')
+            self.assertRaises(TypeError,
+                              layers.box_coder,
+                              prior_box=x2,
+                              prior_box_var=[0.1, 0.2, 0.1, 0.2],
+                              target_box=y2,
+                              code_type='encode_center_size')
 
     def test_detection_api(self):
         program = Program()
@@ -158,11 +158,10 @@ def test_detection_api(self):
             y = layers.data(name='y', shape=[4], dtype='float32')
             z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
             iou = layers.iou_similarity(x=x, y=y)
-            bcoder = layers.box_coder(
-                prior_box=x,
-                prior_box_var=y,
-                target_box=z,
-                code_type='encode_center_size')
+            bcoder = layers.box_coder(prior_box=x,
+                                      prior_box_var=y,
+                                      target_box=z,
+                                      code_type='encode_center_size')
             self.assertIsNotNone(iou)
             self.assertIsNotNone(bcoder)
 
@@ -170,17 +169,23 @@ def test_detection_api(self):
             self.assertIsNotNone(matched_indices)
             self.assertIsNotNone(matched_dist)
 
-            gt = layers.data(
-                name='gt', shape=[1, 1], dtype='int32', lod_level=1)
-            trg, trg_weight = layers.target_assign(
-                gt, matched_indices, mismatch_value=0)
+            gt = layers.data(name='gt',
+                             shape=[1, 1],
+                             dtype='int32',
+                             lod_level=1)
+            trg, trg_weight = layers.target_assign(gt,
+                                                   matched_indices,
+                                                   mismatch_value=0)
             self.assertIsNotNone(trg)
             self.assertIsNotNone(trg_weight)
 
-            gt2 = layers.data(
-                name='gt2', shape=[10, 4], dtype='float32', lod_level=1)
-            trg, trg_weight = layers.target_assign(
-                gt2, matched_indices, mismatch_value=0)
+            gt2 = layers.data(name='gt2',
+                              shape=[10, 4],
+                              dtype='float32',
+                              lod_level=1)
+            trg, trg_weight = layers.target_assign(gt2,
+                                                   matched_indices,
+                                                   mismatch_value=0)
             self.assertIsNotNone(trg)
             self.assertIsNotNone(trg_weight)
 
@@ -189,22 +194,24 @@ def test_detection_api(self):
     def test_ssd_loss(self):
         program = Program()
         with program_guard(program):
-            pb = layers.data(
-                name='prior_box',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            pbv = layers.data(
-                name='prior_box_var',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
+            pb = layers.data(name='prior_box',
+                             shape=[10, 4],
+                             append_batch_size=False,
+                             dtype='float32')
+            pbv = layers.data(name='prior_box_var',
+                              shape=[10, 4],
+                              append_batch_size=False,
+                              dtype='float32')
             loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
             scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
-            gt_box = layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
-            gt_label = layers.data(
-                name='gt_label', shape=[1], lod_level=1, dtype='int32')
+            gt_box = layers.data(name='gt_box',
+                                 shape=[4],
+                                 lod_level=1,
+                                 dtype='float32')
+            gt_label = layers.data(name='gt_label',
+                                   shape=[1],
+                                   lod_level=1,
+                                   dtype='int32')
             loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
             self.assertIsNotNone(loss)
             self.assertEqual(loss.shape[-1], 1)
@@ -212,69 +219,73 @@ def test_ssd_loss(self):
 
 
 class TestPriorBox(unittest.TestCase):
+
     def test_prior_box(self):
         program = Program()
         with program_guard(program):
             data_shape = [3, 224, 224]
-            images = fluid.layers.data(
-                name='pixel', shape=data_shape, dtype='float32')
+            images = fluid.layers.data(name='pixel',
+                                       shape=data_shape,
+                                       dtype='float32')
             conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-            box, var = layers.prior_box(
-                input=conv1,
-                image=images,
-                min_sizes=[100.0],
-                aspect_ratios=[1.],
-                flip=True,
-                clip=True)
+            box, var = layers.prior_box(input=conv1,
+                                        image=images,
+                                        min_sizes=[100.0],
+                                        aspect_ratios=[1.],
+                                        flip=True,
+                                        clip=True)
             assert len(box.shape) == 4
             assert box.shape == var.shape
             assert box.shape[3] == 4
 
 
 class TestPriorBox2(unittest.TestCase):
+
     def test_prior_box(self):
         program = Program()
         with program_guard(program):
             data_shape = [None, 3, None, None]
             images = fluid.data(name='pixel', shape=data_shape, dtype='float32')
             conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-            box, var = layers.prior_box(
-                input=conv1,
-                image=images,
-                min_sizes=[100.0],
-                aspect_ratios=[1.],
-                flip=True,
-                clip=True)
+            box, var = layers.prior_box(input=conv1,
+                                        image=images,
+                                        min_sizes=[100.0],
+                                        aspect_ratios=[1.],
+                                        flip=True,
+                                        clip=True)
             assert len(box.shape) == 4
             assert box.shape == var.shape
             assert box.shape[3] == 4
 
 
 class TestDensityPriorBox(unittest.TestCase):
+
     def test_density_prior_box(self):
         program = Program()
         with program_guard(program):
             data_shape = [3, 224, 224]
-            images = fluid.layers.data(
-                name='pixel', shape=data_shape, dtype='float32')
+            images = fluid.layers.data(name='pixel',
+                                       shape=data_shape,
+                                       dtype='float32')
             conv1 = fluid.layers.conv2d(images, 3, 3, 2)
-            box, var = layers.density_prior_box(
-                input=conv1,
-                image=images,
-                densities=[3, 4],
-                fixed_sizes=[50., 60.],
-                fixed_ratios=[1.0],
-                clip=True)
+            box, var = layers.density_prior_box(input=conv1,
+                                                image=images,
+                                                densities=[3, 4],
+                                                fixed_sizes=[50., 60.],
+                                                fixed_ratios=[1.0],
+                                                clip=True)
             assert len(box.shape) == 4
             assert box.shape == var.shape
             assert box.shape[-1] == 4
 
 
 class TestAnchorGenerator(unittest.TestCase):
+
     def test_anchor_generator(self):
         data_shape = [3, 224, 224]
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
+        images = fluid.layers.data(name='pixel',
+                                   shape=data_shape,
+                                   dtype='float32')
         conv1 = fluid.layers.conv2d(images, 3, 3, 2)
         anchor, var = fluid.layers.anchor_generator(
             input=conv1,
@@ -289,6 +300,7 @@ def test_anchor_generator(self):
 
 
 class TestGenerateProposalLabels(unittest.TestCase):
+
     def check_out(self, outs):
         rois = outs[0]
         labels_int32 = outs[1]
@@ -310,17 +322,27 @@ def check_out(self, outs):
     def test_generate_proposal_labels(self):
         program = Program()
         with program_guard(program):
-            rpn_rois = fluid.data(
-                name='rpn_rois', shape=[4, 4], dtype='float32', lod_level=1)
-            gt_classes = fluid.data(
-                name='gt_classes', shape=[6], dtype='int32', lod_level=1)
-            is_crowd = fluid.data(
-                name='is_crowd', shape=[6], dtype='int32', lod_level=1)
-            gt_boxes = fluid.data(
-                name='gt_boxes', shape=[6, 4], dtype='float32', lod_level=1)
+            rpn_rois = fluid.data(name='rpn_rois',
+                                  shape=[4, 4],
+                                  dtype='float32',
+                                  lod_level=1)
+            gt_classes = fluid.data(name='gt_classes',
+                                    shape=[6],
+                                    dtype='int32',
+                                    lod_level=1)
+            is_crowd = fluid.data(name='is_crowd',
+                                  shape=[6],
+                                  dtype='int32',
+                                  lod_level=1)
+            gt_boxes = fluid.data(name='gt_boxes',
+                                  shape=[6, 4],
+                                  dtype='float32',
+                                  lod_level=1)
             im_info = fluid.data(name='im_info', shape=[1, 3], dtype='float32')
-            max_overlap = fluid.data(
-                name='max_overlap', shape=[4], dtype='float32', lod_level=1)
+            max_overlap = fluid.data(name='max_overlap',
+                                     shape=[4],
+                                     dtype='float32',
+                                     lod_level=1)
             self.class_nums = 5
             outs = fluid.layers.generate_proposal_labels(
                 rpn_rois=rpn_rois,
@@ -358,62 +380,57 @@ def test_generate_proposal_labels(self):
 
 
 class TestGenerateMaskLabels(unittest.TestCase):
+
     def test_generate_mask_labels(self):
         program = Program()
         with program_guard(program):
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            gt_classes = layers.data(
-                name='gt_classes',
-                shape=[2, 1],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[2, 1],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            gt_segms = layers.data(
-                name='gt_segms',
-                shape=[20, 2],
-                dtype='float32',
-                lod_level=3,
-                append_batch_size=False)
-            rois = layers.data(
-                name='rois',
-                shape=[4, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            labels_int32 = layers.data(
-                name='labels_int32',
-                shape=[4, 1],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
+            im_info = layers.data(name='im_info',
+                                  shape=[1, 3],
+                                  dtype='float32',
+                                  lod_level=1,
+                                  append_batch_size=False)
+            gt_classes = layers.data(name='gt_classes',
+                                     shape=[2, 1],
+                                     dtype='int32',
+                                     lod_level=1,
+                                     append_batch_size=False)
+            is_crowd = layers.data(name='is_crowd',
+                                   shape=[2, 1],
+                                   dtype='int32',
+                                   lod_level=1,
+                                   append_batch_size=False)
+            gt_segms = layers.data(name='gt_segms',
+                                   shape=[20, 2],
+                                   dtype='float32',
+                                   lod_level=3,
+                                   append_batch_size=False)
+            rois = layers.data(name='rois',
+                               shape=[4, 4],
+                               dtype='float32',
+                               lod_level=1,
+                               append_batch_size=False)
+            labels_int32 = layers.data(name='labels_int32',
+                                       shape=[4, 1],
+                                       dtype='int32',
+                                       lod_level=1,
+                                       append_batch_size=False)
             num_classes = 5
             resolution = 14
-            outs = fluid.layers.generate_mask_labels(
-                im_info=im_info,
-                gt_classes=gt_classes,
-                is_crowd=is_crowd,
-                gt_segms=gt_segms,
-                rois=rois,
-                labels_int32=labels_int32,
-                num_classes=num_classes,
-                resolution=resolution)
+            outs = fluid.layers.generate_mask_labels(im_info=im_info,
+                                                     gt_classes=gt_classes,
+                                                     is_crowd=is_crowd,
+                                                     gt_segms=gt_segms,
+                                                     rois=rois,
+                                                     labels_int32=labels_int32,
+                                                     num_classes=num_classes,
+                                                     resolution=resolution)
             mask_rois, roi_has_mask_int32, mask_int32 = outs
             assert mask_rois.shape[1] == 4
             assert mask_int32.shape[1] == num_classes * resolution * resolution
 
 
 class TestMultiBoxHead(unittest.TestCase):
+
     def test_multi_box_head(self):
         data_shape = [3, 224, 224]
         mbox_locs, mbox_confs, box, var = self.multi_box_head_output(data_shape)
@@ -424,8 +441,9 @@ def test_multi_box_head(self):
         assert mbox_locs.shape[1] == mbox_confs.shape[1]
 
     def multi_box_head_output(self, data_shape):
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
+        images = fluid.layers.data(name='pixel',
+                                   shape=data_shape,
+                                   dtype='float32')
         conv1 = fluid.layers.conv2d(images, 3, 3, 2)
         conv2 = fluid.layers.conv2d(conv1, 3, 3, 2)
         conv3 = fluid.layers.conv2d(conv2, 3, 3, 2)
@@ -448,19 +466,18 @@ def multi_box_head_output(self, data_shape):
 
 
 class TestDetectionMAP(unittest.TestCase):
+
     def test_detection_map(self):
         program = Program()
         with program_guard(program):
-            detect_res = layers.data(
-                name='detect_res',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-            label = layers.data(
-                name='label',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
+            detect_res = layers.data(name='detect_res',
+                                     shape=[10, 6],
+                                     append_batch_size=False,
+                                     dtype='float32')
+            label = layers.data(name='label',
+                                shape=[10, 6],
+                                append_batch_size=False,
+                                dtype='float32')
 
             map_out = detection.detection_map(detect_res, label, 21)
             self.assertIsNotNone(map_out)
@@ -469,6 +486,7 @@ def test_detection_map(self):
 
 
 class TestRpnTargetAssign(unittest.TestCase):
+
     def test_rpn_target_assign(self):
         program = Program()
         with program_guard(program):
@@ -476,54 +494,49 @@ def test_rpn_target_assign(self):
             cls_logits_shape = [10, 50, 2]
             anchor_shape = [50, 4]
 
-            bbox_pred = layers.data(
-                name='bbox_pred',
-                shape=bbox_pred_shape,
-                append_batch_size=False,
-                dtype='float32')
-            cls_logits = layers.data(
-                name='cls_logits',
-                shape=cls_logits_shape,
-                append_batch_size=False,
-                dtype='float32')
-            anchor_box = layers.data(
-                name='anchor_box',
-                shape=anchor_shape,
-                append_batch_size=False,
-                dtype='float32')
-            anchor_var = layers.data(
-                name='anchor_var',
-                shape=anchor_shape,
-                append_batch_size=False,
-                dtype='float32')
-            gt_boxes = layers.data(
-                name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[1, 10],
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            outs = layers.rpn_target_assign(
-                bbox_pred=bbox_pred,
-                cls_logits=cls_logits,
-                anchor_box=anchor_box,
-                anchor_var=anchor_var,
-                gt_boxes=gt_boxes,
-                is_crowd=is_crowd,
-                im_info=im_info,
-                rpn_batch_size_per_im=256,
-                rpn_straddle_thresh=0.0,
-                rpn_fg_fraction=0.5,
-                rpn_positive_overlap=0.7,
-                rpn_negative_overlap=0.3,
-                use_random=False)
+            bbox_pred = layers.data(name='bbox_pred',
+                                    shape=bbox_pred_shape,
+                                    append_batch_size=False,
+                                    dtype='float32')
+            cls_logits = layers.data(name='cls_logits',
+                                     shape=cls_logits_shape,
+                                     append_batch_size=False,
+                                     dtype='float32')
+            anchor_box = layers.data(name='anchor_box',
+                                     shape=anchor_shape,
+                                     append_batch_size=False,
+                                     dtype='float32')
+            anchor_var = layers.data(name='anchor_var',
+                                     shape=anchor_shape,
+                                     append_batch_size=False,
+                                     dtype='float32')
+            gt_boxes = layers.data(name='gt_boxes',
+                                   shape=[4],
+                                   lod_level=1,
+                                   dtype='float32')
+            is_crowd = layers.data(name='is_crowd',
+                                   shape=[1, 10],
+                                   dtype='int32',
+                                   lod_level=1,
+                                   append_batch_size=False)
+            im_info = layers.data(name='im_info',
+                                  shape=[1, 3],
+                                  dtype='float32',
+                                  lod_level=1,
+                                  append_batch_size=False)
+            outs = layers.rpn_target_assign(bbox_pred=bbox_pred,
+                                            cls_logits=cls_logits,
+                                            anchor_box=anchor_box,
+                                            anchor_var=anchor_var,
+                                            gt_boxes=gt_boxes,
+                                            is_crowd=is_crowd,
+                                            im_info=im_info,
+                                            rpn_batch_size_per_im=256,
+                                            rpn_straddle_thresh=0.0,
+                                            rpn_fg_fraction=0.5,
+                                            rpn_positive_overlap=0.7,
+                                            rpn_negative_overlap=0.3,
+                                            use_random=False)
             pred_scores = outs[0]
             pred_loc = outs[1]
             tgt_lbl = outs[2]
@@ -542,6 +555,7 @@ def test_rpn_target_assign(self):
 
 
 class TestGenerateProposals(LayerTest):
+
     def test_generate_proposals(self):
         scores_np = np.random.rand(2, 3, 4, 4).astype('float32')
         bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32')
@@ -551,15 +565,19 @@ def test_generate_proposals(self):
         variances_np = np.ones((4, 4, 3, 4)).astype('float32')
 
         with self.static_graph():
-            scores = fluid.data(
-                name='scores', shape=[2, 3, 4, 4], dtype='float32')
-            bbox_deltas = fluid.data(
-                name='bbox_deltas', shape=[2, 12, 4, 4], dtype='float32')
+            scores = fluid.data(name='scores',
+                                shape=[2, 3, 4, 4],
+                                dtype='float32')
+            bbox_deltas = fluid.data(name='bbox_deltas',
+                                     shape=[2, 12, 4, 4],
+                                     dtype='float32')
             im_info = fluid.data(name='im_info', shape=[2, 3], dtype='float32')
-            anchors = fluid.data(
-                name='anchors', shape=[4, 4, 3, 4], dtype='float32')
-            variances = fluid.data(
-                name='var', shape=[4, 4, 3, 4], dtype='float32')
+            anchors = fluid.data(name='anchors',
+                                 shape=[4, 4, 3, 4],
+                                 dtype='float32')
+            variances = fluid.data(name='var',
+                                   shape=[4, 4, 3, 4],
+                                   dtype='float32')
             rois, roi_probs, rois_num = fluid.layers.generate_proposals(
                 scores,
                 bbox_deltas,
@@ -605,6 +623,7 @@ def test_generate_proposals(self):
 
 
 class TestYoloDetection(unittest.TestCase):
+
     def test_yolov3_loss(self):
         program = Program()
         with program_guard(program):
@@ -612,15 +631,14 @@ def test_yolov3_loss(self):
             gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32')
             gt_label = layers.data(name='gt_label', shape=[10], dtype='int32')
             gt_score = layers.data(name='gt_score', shape=[10], dtype='float32')
-            loss = layers.yolov3_loss(
-                x,
-                gt_box,
-                gt_label, [10, 13, 30, 13], [0, 1],
-                10,
-                0.7,
-                32,
-                gt_score=gt_score,
-                use_label_smooth=False)
+            loss = layers.yolov3_loss(x,
+                                      gt_box,
+                                      gt_label, [10, 13, 30, 13], [0, 1],
+                                      10,
+                                      0.7,
+                                      32,
+                                      gt_score=gt_score,
+                                      use_label_smooth=False)
 
             self.assertIsNotNone(loss)
 
@@ -641,16 +659,15 @@ def test_yolov3_loss_with_scale(self):
             gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32')
             gt_label = layers.data(name='gt_label', shape=[10], dtype='int32')
             gt_score = layers.data(name='gt_score', shape=[10], dtype='float32')
-            loss = layers.yolov3_loss(
-                x,
-                gt_box,
-                gt_label, [10, 13, 30, 13], [0, 1],
-                10,
-                0.7,
-                32,
-                gt_score=gt_score,
-                use_label_smooth=False,
-                scale_x_y=1.2)
+            loss = layers.yolov3_loss(x,
+                                      gt_box,
+                                      gt_label, [10, 13, 30, 13], [0, 1],
+                                      10,
+                                      0.7,
+                                      32,
+                                      gt_score=gt_score,
+                                      use_label_smooth=False,
+                                      scale_x_y=1.2)
 
             self.assertIsNotNone(loss)
 
@@ -659,29 +676,38 @@ def test_yolo_box_with_scale(self):
         with program_guard(program):
             x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
             img_size = layers.data(name='img_size', shape=[2], dtype='int32')
-            boxes, scores = layers.yolo_box(
-                x, img_size, [10, 13, 30, 13], 10, 0.01, 32, scale_x_y=1.2)
+            boxes, scores = layers.yolo_box(x,
+                                            img_size, [10, 13, 30, 13],
+                                            10,
+                                            0.01,
+                                            32,
+                                            scale_x_y=1.2)
             self.assertIsNotNone(boxes)
             self.assertIsNotNone(scores)
 
 
 class TestBoxClip(unittest.TestCase):
+
     def test_box_clip(self):
         program = Program()
         with program_guard(program):
-            input_box = layers.data(
-                name='input_box', shape=[7, 4], dtype='float32', lod_level=1)
+            input_box = layers.data(name='input_box',
+                                    shape=[7, 4],
+                                    dtype='float32',
+                                    lod_level=1)
             im_info = layers.data(name='im_info', shape=[3], dtype='float32')
             out = layers.box_clip(input_box, im_info)
             self.assertIsNotNone(out)
 
 
 class TestMulticlassNMS(unittest.TestCase):
+
     def test_multiclass_nms(self):
         program = Program()
         with program_guard(program):
-            bboxes = layers.data(
-                name='bboxes', shape=[-1, 10, 4], dtype='float32')
+            bboxes = layers.data(name='bboxes',
+                                 shape=[-1, 10, 4],
+                                 dtype='float32')
             scores = layers.data(name='scores', shape=[-1, 10], dtype='float32')
             output = layers.multiclass_nms(bboxes, scores, 0.3, 400, 200, 0.7)
             self.assertIsNotNone(output)
@@ -689,48 +715,57 @@ def test_multiclass_nms(self):
     def test_multiclass_nms_error(self):
         program = Program()
         with program_guard(program):
-            bboxes1 = fluid.data(
-                name='bboxes1', shape=[10, 10, 4], dtype='int32')
-            scores1 = fluid.data(
-                name='scores1', shape=[10, 10], dtype='float32')
-            bboxes2 = fluid.data(
-                name='bboxes2', shape=[10, 10, 4], dtype='float32')
+            bboxes1 = fluid.data(name='bboxes1',
+                                 shape=[10, 10, 4],
+                                 dtype='int32')
+            scores1 = fluid.data(name='scores1',
+                                 shape=[10, 10],
+                                 dtype='float32')
+            bboxes2 = fluid.data(name='bboxes2',
+                                 shape=[10, 10, 4],
+                                 dtype='float32')
             scores2 = fluid.data(name='scores2', shape=[10, 10], dtype='int32')
-            self.assertRaises(
-                TypeError,
-                layers.multiclass_nms,
-                bboxes=bboxes1,
-                scores=scores1,
-                score_threshold=0.5,
-                nms_top_k=400,
-                keep_top_k=200)
-            self.assertRaises(
-                TypeError,
-                layers.multiclass_nms,
-                bboxes=bboxes2,
-                scores=scores2,
-                score_threshold=0.5,
-                nms_top_k=400,
-                keep_top_k=200)
+            self.assertRaises(TypeError,
+                              layers.multiclass_nms,
+                              bboxes=bboxes1,
+                              scores=scores1,
+                              score_threshold=0.5,
+                              nms_top_k=400,
+                              keep_top_k=200)
+            self.assertRaises(TypeError,
+                              layers.multiclass_nms,
+                              bboxes=bboxes2,
+                              scores=scores2,
+                              score_threshold=0.5,
+                              nms_top_k=400,
+                              keep_top_k=200)
 
 
 class TestMulticlassNMS2(unittest.TestCase):
+
     def test_multiclass_nms2(self):
         program = Program()
         with program_guard(program):
-            bboxes = layers.data(
-                name='bboxes', shape=[-1, 10, 4], dtype='float32')
+            bboxes = layers.data(name='bboxes',
+                                 shape=[-1, 10, 4],
+                                 dtype='float32')
             scores = layers.data(name='scores', shape=[-1, 10], dtype='float32')
             output = fluid.contrib.multiclass_nms2(bboxes, scores, 0.3, 400,
                                                    200, 0.7)
-            output2, index = fluid.contrib.multiclass_nms2(
-                bboxes, scores, 0.3, 400, 200, 0.7, return_index=True)
+            output2, index = fluid.contrib.multiclass_nms2(bboxes,
+                                                           scores,
+                                                           0.3,
+                                                           400,
+                                                           200,
+                                                           0.7,
+                                                           return_index=True)
             self.assertIsNotNone(output)
             self.assertIsNotNone(output2)
             self.assertIsNotNone(index)
 
 
 class TestCollectFpnPropsals(LayerTest):
+
     def test_collect_fpn_proposals(self):
         multi_bboxes_np = []
         multi_scores_np = []
@@ -748,18 +783,17 @@ def test_collect_fpn_proposals(self):
             multi_scores = []
             rois_num_per_level = []
             for i in range(4):
-                bboxes = fluid.data(
-                    name='rois' + str(i),
-                    shape=[5, 4],
-                    dtype='float32',
-                    lod_level=1)
-                scores = fluid.data(
-                    name='scores' + str(i),
-                    shape=[5, 1],
-                    dtype='float32',
-                    lod_level=1)
-                rois_num = fluid.data(
-                    name='rois_num' + str(i), shape=[None], dtype='int32')
+                bboxes = fluid.data(name='rois' + str(i),
+                                    shape=[5, 4],
+                                    dtype='float32',
+                                    lod_level=1)
+                scores = fluid.data(name='scores' + str(i),
+                                    shape=[5, 1],
+                                    dtype='float32',
+                                    lod_level=1)
+                rois_num = fluid.data(name='rois_num' + str(i),
+                                      shape=[None],
+                                      dtype='int32')
 
                 multi_bboxes.append(bboxes)
                 multi_scores.append(scores)
@@ -807,50 +841,52 @@ def test_collect_fpn_proposals(self):
         self.assertTrue(np.array_equal(rois_num_stat, rois_num_dy))
 
     def test_collect_fpn_proposals_error(self):
+
         def generate_input(bbox_type, score_type, name):
             multi_bboxes = []
             multi_scores = []
             for i in range(4):
-                bboxes = fluid.data(
-                    name='rois' + name + str(i),
-                    shape=[10, 4],
-                    dtype=bbox_type,
-                    lod_level=1)
-                scores = fluid.data(
-                    name='scores' + name + str(i),
-                    shape=[10, 1],
-                    dtype=score_type,
-                    lod_level=1)
+                bboxes = fluid.data(name='rois' + name + str(i),
+                                    shape=[10, 4],
+                                    dtype=bbox_type,
+                                    lod_level=1)
+                scores = fluid.data(name='scores' + name + str(i),
+                                    shape=[10, 1],
+                                    dtype=score_type,
+                                    lod_level=1)
                 multi_bboxes.append(bboxes)
                 multi_scores.append(scores)
             return multi_bboxes, multi_scores
 
         program = Program()
         with program_guard(program):
-            bbox1 = fluid.data(
-                name='rois', shape=[5, 10, 4], dtype='float32', lod_level=1)
-            score1 = fluid.data(
-                name='scores', shape=[5, 10, 1], dtype='float32', lod_level=1)
+            bbox1 = fluid.data(name='rois',
+                               shape=[5, 10, 4],
+                               dtype='float32',
+                               lod_level=1)
+            score1 = fluid.data(name='scores',
+                                shape=[5, 10, 1],
+                                dtype='float32',
+                                lod_level=1)
             bbox2, score2 = generate_input('int32', 'float32', '2')
-            self.assertRaises(
-                TypeError,
-                layers.collect_fpn_proposals,
-                multi_rois=bbox1,
-                multi_scores=score1,
-                min_level=2,
-                max_level=5,
-                post_nms_top_n=2000)
-            self.assertRaises(
-                TypeError,
-                layers.collect_fpn_proposals,
-                multi_rois=bbox2,
-                multi_scores=score2,
-                min_level=2,
-                max_level=5,
-                post_nms_top_n=2000)
+            self.assertRaises(TypeError,
+                              layers.collect_fpn_proposals,
+                              multi_rois=bbox1,
+                              multi_scores=score1,
+                              min_level=2,
+                              max_level=5,
+                              post_nms_top_n=2000)
+            self.assertRaises(TypeError,
+                              layers.collect_fpn_proposals,
+                              multi_rois=bbox2,
+                              multi_scores=score2,
+                              min_level=2,
+                              max_level=5,
+                              post_nms_top_n=2000)
 
 
 class TestDistributeFpnProposals(LayerTest):
+
     def test_distribute_fpn_proposals(self):
         rois_np = np.random.rand(10, 4).astype('float32')
         rois_num_np = np.array([4, 6]).astype('int32')
@@ -865,11 +901,12 @@ def test_distribute_fpn_proposals(self):
                 refer_scale=224,
                 rois_num=rois_num)
             fetch_list = multi_rois + [restore_ind] + rois_num_per_level
-            output_stat = self.get_static_graph_result(
-                feed={'rois': rois_np,
-                      'rois_num': rois_num_np},
-                fetch_list=fetch_list,
-                with_lod=True)
+            output_stat = self.get_static_graph_result(feed={
+                'rois': rois_np,
+                'rois_num': rois_num_np
+            },
+                                                       fetch_list=fetch_list,
+                                                       with_lod=True)
             output_stat_np = []
             for output in output_stat:
                 output_np = np.array(output)
@@ -900,43 +937,52 @@ def test_distribute_fpn_proposals(self):
     def test_distribute_fpn_proposals_error(self):
         program = Program()
         with program_guard(program):
-            fpn_rois = fluid.data(
-                name='data_error', shape=[10, 4], dtype='int32', lod_level=1)
-            self.assertRaises(
-                TypeError,
-                layers.distribute_fpn_proposals,
-                fpn_rois=fpn_rois,
-                min_level=2,
-                max_level=5,
-                refer_level=4,
-                refer_scale=224)
+            fpn_rois = fluid.data(name='data_error',
+                                  shape=[10, 4],
+                                  dtype='int32',
+                                  lod_level=1)
+            self.assertRaises(TypeError,
+                              layers.distribute_fpn_proposals,
+                              fpn_rois=fpn_rois,
+                              min_level=2,
+                              max_level=5,
+                              refer_level=4,
+                              refer_scale=224)
 
 
 class TestBoxDecoderAndAssign(unittest.TestCase):
+
     def test_box_decoder_and_assign(self):
         program = Program()
         with program_guard(program):
             pb = fluid.data(name='prior_box', shape=[None, 4], dtype='float32')
             pbv = fluid.data(name='prior_box_var', shape=[4], dtype='float32')
-            loc = fluid.data(
-                name='target_box', shape=[None, 4 * 81], dtype='float32')
-            scores = fluid.data(
-                name='scores', shape=[None, 81], dtype='float32')
+            loc = fluid.data(name='target_box',
+                             shape=[None, 4 * 81],
+                             dtype='float32')
+            scores = fluid.data(name='scores',
+                                shape=[None, 81],
+                                dtype='float32')
             decoded_box, output_assign_box = fluid.layers.box_decoder_and_assign(
                 pb, pbv, loc, scores, 4.135)
             self.assertIsNotNone(decoded_box)
             self.assertIsNotNone(output_assign_box)
 
     def test_box_decoder_and_assign_error(self):
+
         def generate_input(pb_type, pbv_type, loc_type, score_type, name):
-            pb = fluid.data(
-                name='prior_box' + name, shape=[None, 4], dtype=pb_type)
-            pbv = fluid.data(
-                name='prior_box_var' + name, shape=[4], dtype=pbv_type)
-            loc = fluid.data(
-                name='target_box' + name, shape=[None, 4 * 81], dtype=loc_type)
-            scores = fluid.data(
-                name='scores' + name, shape=[None, 81], dtype=score_type)
+            pb = fluid.data(name='prior_box' + name,
+                            shape=[None, 4],
+                            dtype=pb_type)
+            pbv = fluid.data(name='prior_box_var' + name,
+                             shape=[4],
+                             dtype=pbv_type)
+            loc = fluid.data(name='target_box' + name,
+                             shape=[None, 4 * 81],
+                             dtype=loc_type)
+            scores = fluid.data(name='scores' + name,
+                                shape=[None, 81],
+                                dtype=score_type)
             return pb, pbv, loc, scores
 
         program = Program()
@@ -947,30 +993,27 @@ def generate_input(pb_type, pbv_type, loc_type, score_type, name):
                                                       'int32', 'float32', '2')
             pb3, pbv3, loc3, scores3 = generate_input('float32', 'float32',
                                                       'float32', 'int32', '3')
-            self.assertRaises(
-                TypeError,
-                layers.box_decoder_and_assign,
-                prior_box=pb1,
-                prior_box_var=pbv1,
-                target_box=loc1,
-                box_score=scores1,
-                box_clip=4.0)
-            self.assertRaises(
-                TypeError,
-                layers.box_decoder_and_assign,
-                prior_box=pb2,
-                prior_box_var=pbv2,
-                target_box=loc2,
-                box_score=scores2,
-                box_clip=4.0)
-            self.assertRaises(
-                TypeError,
-                layers.box_decoder_and_assign,
-                prior_box=pb3,
-                prior_box_var=pbv3,
-                target_box=loc3,
-                box_score=scores3,
-                box_clip=4.0)
+            self.assertRaises(TypeError,
+                              layers.box_decoder_and_assign,
+                              prior_box=pb1,
+                              prior_box_var=pbv1,
+                              target_box=loc1,
+                              box_score=scores1,
+                              box_clip=4.0)
+            self.assertRaises(TypeError,
+                              layers.box_decoder_and_assign,
+                              prior_box=pb2,
+                              prior_box_var=pbv2,
+                              target_box=loc2,
+                              box_score=scores2,
+                              box_clip=4.0)
+            self.assertRaises(TypeError,
+                              layers.box_decoder_and_assign,
+                              prior_box=pb3,
+                              prior_box_var=pbv3,
+                              target_box=loc3,
+                              box_score=scores3,
+                              box_clip=4.0)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index 7859fca15f643..e3b20c323929a 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -39,13 +39,12 @@
 
 prog_clip = prog.clone()
 prog_clip.block(0).var(hidden1.name)._set_error_clip(
-    fluid.clip.ErrorClipByValue(
-        max=CLIP_MAX, min=CLIP_MIN))
+    fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN))
 
 avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
 fluid.backward.append_backward(loss=avg_cost)
-fluid.backward.append_backward(
-    loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback])
+fluid.backward.append_backward(loss=avg_cost_clip,
+                               callbacks=[fluid.clip.error_clip_callback])
 
 hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD")
 hidden1_grad_clip = prog_clip.block(0).var(hidden1.name + "@GRAD")
@@ -53,10 +52,9 @@
 hidden2_grad = prog.block(0).var(hidden2.name + "@GRAD")
 hidden2_grad_clip = prog_clip.block(0).var(hidden2.name + "@GRAD")
 
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=8192),
-    batch_size=BATCH_SIZE)
+train_reader = paddle.batch(paddle.reader.shuffle(paddle.dataset.mnist.train(),
+                                                  buf_size=8192),
+                            batch_size=BATCH_SIZE)
 
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
@@ -75,8 +73,7 @@
         prog_clip,
         feed=feeder.feed(data),
         fetch_list=[hidden1_grad_clip, hidden2_grad_clip])
-    if not ((out1.clip(
-            min=CLIP_MIN, max=CLIP_MAX) == out1_clip).all() and
+    if not ((out1.clip(min=CLIP_MIN, max=CLIP_MAX) == out1_clip).all() and
             (out2 == out2_clip).all()):
         exit(1)
 
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
index b7792e5ce27a5..12d33d1c724df 100644
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -61,18 +61,19 @@ def not_test_raw_api(self):
                 prob = layers.fc(input=hidden, size=10, act='softmax')
                 layers.assign(input=prob, output=false_out)
 
-            prob = merge_lod_tensor(
-                in_true=true_out, in_false=false_out, mask=cond, x=image)
+            prob = merge_lod_tensor(in_true=true_out,
+                                    in_false=false_out,
+                                    mask=cond,
+                                    x=image)
             loss = layers.cross_entropy(input=prob, label=label)
             avg_loss = layers.mean(loss)
 
             optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
             optimizer.minimize(avg_loss, startup_prog)
 
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=10)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=8192),
+                                    batch_size=10)
 
         place = core.CPUPlace()
         exe = Executor(place)
@@ -86,8 +87,10 @@ def not_test_raw_api(self):
                 y_data = np.expand_dims(y_data, axis=1)
 
                 outs = exe.run(prog,
-                               feed={'x': x_data,
-                                     'y': y_data},
+                               feed={
+                                   'x': x_data,
+                                   'y': y_data
+                               },
                                fetch_list=[avg_loss])
                 print(outs[0])
                 if outs[0] < 1.0:
@@ -125,10 +128,9 @@ def not_test_ifelse(self):
 
             optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
             optimizer.minimize(avg_loss, startup_prog)
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=200)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=8192),
+                                    batch_size=200)
 
         place = core.CPUPlace()
         exe = Executor(place)
@@ -142,8 +144,10 @@ def not_test_ifelse(self):
                 y_data = y_data.reshape((y_data.shape[0], 1))
 
                 outs = exe.run(prog,
-                               feed={'x': x_data,
-                                     'y': y_data},
+                               feed={
+                                   'x': x_data,
+                                   'y': y_data
+                               },
                                fetch_list=[avg_loss])
                 print(outs[0])
                 if outs[0] < 1.0:
@@ -152,6 +156,7 @@ def not_test_ifelse(self):
 
 
 class TestIfElse(unittest.TestCase):
+
     def set_test_case(self):
         # condiction is: self.data < self.cond_value
         self.cond_value = 0.5
@@ -171,8 +176,9 @@ def compare_ifelse_op_and_numpy(self, place):
         startup_prog = Program()
         with program_guard(prog, startup_prog):
             src = layers.data(name='data', shape=[1], dtype='float32')
-            cond = layers.fill_constant(
-                [1], dtype='float32', value=self.cond_value)
+            cond = layers.fill_constant([1],
+                                        dtype='float32',
+                                        value=self.cond_value)
             ifcond = layers.less_than(x=src, y=cond)
             ie = layers.IfElse(ifcond)
             with ie.true_block():
@@ -196,8 +202,7 @@ def compare_ifelse_op_and_numpy(self, place):
             o2 = self.numpy_cal()
 
             self.assertTrue(
-                np.allclose(
-                    o1, o2, atol=1e-8),
+                np.allclose(o1, o2, atol=1e-8),
                 "IfElse result : " + str(o1) + "\n Numpy result :" + str(o2))
 
     def test_cpu(self):
@@ -210,6 +215,7 @@ def test_cuda(self):
 
 
 class TestIfElseTrueBranch(TestIfElse):
+
     def set_test_case(self):
         # condiction is: self.data < self.cond_value
         self.cond_value = 10.
@@ -217,6 +223,7 @@ def set_test_case(self):
 
 
 class TestIfElseFalseBranch(TestIfElse):
+
     def set_test_case(self):
         # condiction is: self.data < self.cond_value
         self.cond_value = -10.
@@ -224,13 +231,15 @@ def set_test_case(self):
 
 
 class TestIfElseError(unittest.TestCase):
+
     def test_input_type_error(self):
         main_program = Program()
         startup_program = Program()
         with program_guard(main_program, startup_program):
             src = layers.data(name='data', shape=[1], dtype='float32')
-            const_value = layers.fill_constant(
-                [1], dtype='float32', value=123.0)
+            const_value = layers.fill_constant([1],
+                                               dtype='float32',
+                                               value=123.0)
             ifcond = layers.less_than(x=src, y=const_value)
             with self.assertRaises(TypeError):
                 ie = layers.IfElse(set())
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
index e21224c909f58..cc97b0eb5aea4 100644
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -22,6 +22,7 @@
 
 
 class TestLoDTensor(unittest.TestCase):
+
     def test_pybind_recursive_seq_lens(self):
         tensor = fluid.LoDTensor()
         recursive_seq_lens = []
@@ -58,8 +59,8 @@ def test_pybind_recursive_seq_lens(self):
 
     def test_create_lod_tensor(self):
         # Create LoDTensor from a list
-        data = [[np.int64(1), np.int64(2), np.int64(3)],
-                [np.int64(3), np.int64(4)]]
+        data = [[np.int64(1), np.int64(2),
+                 np.int64(3)], [np.int64(3), np.int64(4)]]
         wrong_recursive_seq_lens = [[2, 2]]
         correct_recursive_seq_lens = [[3, 2]]
         self.assertRaises(AssertionError, create_lod_tensor, data,
@@ -73,8 +74,8 @@ def test_create_lod_tensor(self):
         self.assertTrue(
             np.array_equal(
                 np.array(tensor),
-                np.array([1, 2, 3, 3, 4]).reshape(tensor.shape()).astype(
-                    'int64')))
+                np.array([1, 2, 3, 3,
+                          4]).reshape(tensor.shape()).astype('int64')))
 
         # Create LoDTensor from numpy array
         data = np.random.random([10, 1]).astype('float64')
@@ -133,9 +134,8 @@ def test_dlpack_support(self):
         tensor_from_dlpack = fluid.core.from_dlpack(dltensor)
         self.assertTrue(isinstance(tensor_from_dlpack, fluid.core.Tensor))
         self.assertTrue(
-            np.array_equal(
-                np.array(tensor_from_dlpack),
-                np.array([[1], [2], [3], [4]]).astype('int')))
+            np.array_equal(np.array(tensor_from_dlpack),
+                           np.array([[1], [2], [3], [4]]).astype('int')))
         # when build with cuda
         if core.is_compiled_with_cuda():
             gtensor = fluid.create_lod_tensor(
@@ -145,9 +145,8 @@ def test_dlpack_support(self):
             gtensor_from_dlpack = fluid.core.from_dlpack(gdltensor)
             self.assertTrue(isinstance(gtensor_from_dlpack, fluid.core.Tensor))
             self.assertTrue(
-                np.array_equal(
-                    np.array(gtensor_from_dlpack),
-                    np.array([[1], [2], [3], [4]]).astype('int')))
+                np.array_equal(np.array(gtensor_from_dlpack),
+                               np.array([[1], [2], [3], [4]]).astype('int')))
 
     def test_as_type(self):
         tensor = fluid.create_lod_tensor(
diff --git a/python/paddle/fluid/tests/test_python_operator_overriding.py b/python/paddle/fluid/tests/test_python_operator_overriding.py
index fd9dc961988df..50bfdd287b6f5 100644
--- a/python/paddle/fluid/tests/test_python_operator_overriding.py
+++ b/python/paddle/fluid/tests/test_python_operator_overriding.py
@@ -27,6 +27,7 @@
 
 
 class TestPythonOperatorOverride(unittest.TestCase):
+
     def check_result(self, fn, place, dtype):
         shape = [9, 10]
 
@@ -34,18 +35,26 @@ def check_result(self, fn, place, dtype):
         y_data = np.random.random(size=shape).astype(dtype)
         python_out = fn(x_data, y_data)
 
-        x_var = layers.create_global_var(
-            name='x', shape=shape, value=0.0, dtype=dtype, persistable=True)
-        y_var = layers.create_global_var(
-            name='y', shape=shape, value=0.0, dtype=dtype, persistable=True)
+        x_var = layers.create_global_var(name='x',
+                                         shape=shape,
+                                         value=0.0,
+                                         dtype=dtype,
+                                         persistable=True)
+        y_var = layers.create_global_var(name='y',
+                                         shape=shape,
+                                         value=0.0,
+                                         dtype=dtype,
+                                         persistable=True)
         out = fn(x_var, y_var)
 
         exe = fluid.Executor(place)
 
         exe.run(fluid.default_startup_program())
         fluid_out = exe.run(fluid.default_main_program(),
-                            feed={'x': x_data,
-                                  'y': y_data},
+                            feed={
+                                'x': x_data,
+                                'y': y_data
+                            },
                             fetch_list=[out])
 
         np.testing.assert_array_equal(python_out, fluid_out[0])
diff --git a/python/paddle/fluid/tests/test_sequential.py b/python/paddle/fluid/tests/test_sequential.py
index 7446bb83841aa..09cfbcdd7e378 100644
--- a/python/paddle/fluid/tests/test_sequential.py
+++ b/python/paddle/fluid/tests/test_sequential.py
@@ -17,6 +17,7 @@
 
 
 class TestDataFeeder(unittest.TestCase):
+
     def test_lod_level_1_converter(self):
         sequential = paddle.nn.Sequential()
 
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 34237d47a5659..214c68c250ea9 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1,22 +1,30 @@
-file(GLOB TEST_OPS RELATIVE
-"${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0)
+set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1
+            FLAGS_memory_fraction_of_eager_deletion=1.0)
 set(dist_ENVS http_proxy="" https_proxy="")
 
-file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+file(
+  GLOB DIST_TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_dist_*.py")
 list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op")
-if ((NOT WITH_NCCL) AND (NOT WITH_RCCL))
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
+if((NOT WITH_NCCL) AND (NOT WITH_RCCL))
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
 endif()
 
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 
-if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_grad_clip")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_heter_ctr")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ps_gpu_ctr")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_batch_merge")
+if((NOT WITH_GPU)
+   AND (NOT WITH_XPU)
+   AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_grad_clip")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_heter_ctr")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ps_gpu_ctr")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_batch_merge")
 endif()
 
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
@@ -30,13 +38,16 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
-if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
-    list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
-    list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
-    list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute)
-    list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer)
-    list(APPEND DIST_TEST_OPS test_gen_nccl_id_op)
-    list(APPEND DIST_TEST_OPS test_rnn_dp)
+if(WITH_GPU
+   OR WITH_XPU
+   OR WITH_ASCEND
+   OR WITH_ASCEND_CL)
+  list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
+  list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
+  list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute)
+  list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer)
+  list(APPEND DIST_TEST_OPS test_gen_nccl_id_op)
+  list(APPEND DIST_TEST_OPS test_rnn_dp)
 endif()
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow)
@@ -89,7 +100,8 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
-list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute)
+list(APPEND MIXED_DIST_TEST_OPS
+     test_fleet_pipeline_meta_optimizer_with_recompute)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_raw_program_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_rnn_dp)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
@@ -122,237 +134,252 @@ foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
 endforeach()
 
 if(NOT WITH_PYTHON AND ON_INFER)
-    LIST(REMOVE_ITEM TEST_OPS test_eager_trace_op)
+  list(REMOVE_ITEM TEST_OPS test_eager_trace_op)
 endif()
 
 if(NOT WITH_GPU)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_feedforward_op)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_op)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op_api)
+  list(REMOVE_ITEM TEST_OPS test_fused_feedforward_op)
+  list(REMOVE_ITEM TEST_OPS test_fused_attention_op)
+  list(REMOVE_ITEM TEST_OPS test_fused_attention_op_api)
+  list(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_op)
+  list(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer)
+  list(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op)
+  list(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op_api)
 endif()
 
-LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op)
-LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op)
-LIST(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass)
+list(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op)
+list(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op)
+list(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass)
 
 if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
-    LIST(REMOVE_ITEM TEST_OPS test_c_concat)
-    LIST(REMOVE_ITEM TEST_OPS test_c_split)
-    LIST(REMOVE_ITEM TEST_OPS test_allgather)
-    LIST(REMOVE_ITEM TEST_OPS test_c_identity)
-    LIST(REMOVE_ITEM TEST_OPS test_c_embedding_op)
-    LIST(REMOVE_ITEM TEST_OPS test_allreduce)
-    LIST(REMOVE_ITEM TEST_OPS test_broadcast)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_reduce)
-    LIST(REMOVE_ITEM TEST_OPS test_pipeline_parallel)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_scatter)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_sendrecv)
-    LIST(REMOVE_ITEM TEST_OPS test_reducescatter)
-    LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_split_embedding)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_split_embedding_none_divisible)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_split_row_linear)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_split_col_linear)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_reduce_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api)
-    LIST(REMOVE_ITEM TEST_OPS test_new_group_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_alltoall_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_global_gather)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_global_scatter)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_sendrecv_api)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_wait)
-    LIST(REMOVE_ITEM TEST_OPS test_memcpy_op)
-    LIST(REMOVE_ITEM TEST_OPS test_raw_program_optimizer)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale)
-    LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_executor)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_with_task_nodes)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_multi_devices)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_origin_scheduler)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_run)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_tensor)
+  list(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op)
+  list(REMOVE_ITEM TEST_OPS test_c_concat)
+  list(REMOVE_ITEM TEST_OPS test_c_split)
+  list(REMOVE_ITEM TEST_OPS test_allgather)
+  list(REMOVE_ITEM TEST_OPS test_c_identity)
+  list(REMOVE_ITEM TEST_OPS test_c_embedding_op)
+  list(REMOVE_ITEM TEST_OPS test_allreduce)
+  list(REMOVE_ITEM TEST_OPS test_broadcast)
+  list(REMOVE_ITEM TEST_OPS test_collective_reduce)
+  list(REMOVE_ITEM TEST_OPS test_pipeline_parallel)
+  list(REMOVE_ITEM TEST_OPS test_collective_scatter)
+  list(REMOVE_ITEM TEST_OPS test_collective_sendrecv)
+  list(REMOVE_ITEM TEST_OPS test_reducescatter)
+  list(REMOVE_ITEM TEST_OPS test_reducescatter_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_split_embedding)
+  list(REMOVE_ITEM TEST_OPS test_collective_split_embedding_none_divisible)
+  list(REMOVE_ITEM TEST_OPS test_collective_split_row_linear)
+  list(REMOVE_ITEM TEST_OPS test_collective_split_col_linear)
+  list(REMOVE_ITEM TEST_OPS test_collective_reduce_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_scatter_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_barrier_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_allreduce_api)
+  list(REMOVE_ITEM TEST_OPS test_new_group_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_broadcast_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_allgather_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_alltoall_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_global_gather)
+  list(REMOVE_ITEM TEST_OPS test_collective_global_scatter)
+  list(REMOVE_ITEM TEST_OPS test_collective_sendrecv_api)
+  list(REMOVE_ITEM TEST_OPS test_collective_wait)
+  list(REMOVE_ITEM TEST_OPS test_memcpy_op)
+  list(REMOVE_ITEM TEST_OPS test_raw_program_optimizer)
+  list(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale)
+  list(REMOVE_ITEM TEST_OPS test_disable_signal_handler)
+  list(REMOVE_ITEM TEST_OPS test_fleet_executor)
+  list(REMOVE_ITEM TEST_OPS test_fleet_executor_with_task_nodes)
+  list(REMOVE_ITEM TEST_OPS test_fleet_executor_multi_devices)
+  list(REMOVE_ITEM TEST_OPS test_fleet_executor_origin_scheduler)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper)
+  list(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node)
+  list(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_run)
+  list(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_tensor)
 endif()
 
 # Temporally disable test_deprecated_decorator
-LIST(REMOVE_ITEM TEST_OPS test_deprecated_decorator)
+list(REMOVE_ITEM TEST_OPS test_deprecated_decorator)
 
-LIST(REMOVE_ITEM TEST_OPS test_tensordot)
+list(REMOVE_ITEM TEST_OPS test_tensordot)
 
 if(WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception)
-    LIST(REMOVE_ITEM TEST_OPS test_trainer_desc)
-    LIST(REMOVE_ITEM TEST_OPS test_checkpoint_notify_op)
-    LIST(REMOVE_ITEM TEST_OPS test_downpoursgd)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_nocvm_1)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_3)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_unitaccessor)
-    LIST(REMOVE_ITEM TEST_OPS test_ps_dispatcher)
-    LIST(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_nlp)
-    LIST(REMOVE_ITEM TEST_OPS test_nvprof)
-
-    # TODO: Fix these unittests failed on Windows
-    LIST(REMOVE_ITEM TEST_OPS test_debugger)
-    if (WITH_GPU)
-        LIST(REMOVE_ITEM TEST_OPS test_update_loss_scaling_op)
-    endif()
+  list(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception)
+  list(REMOVE_ITEM TEST_OPS test_trainer_desc)
+  list(REMOVE_ITEM TEST_OPS test_checkpoint_notify_op)
+  list(REMOVE_ITEM TEST_OPS test_downpoursgd)
+  list(REMOVE_ITEM TEST_OPS test_fleet)
+  list(REMOVE_ITEM TEST_OPS test_fleet_nocvm_1)
+  list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker)
+  list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_3)
+  list(REMOVE_ITEM TEST_OPS test_fleet_unitaccessor)
+  list(REMOVE_ITEM TEST_OPS test_ps_dispatcher)
+  list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_nlp)
+  list(REMOVE_ITEM TEST_OPS test_nvprof)
+
+  # TODO: Fix these unittests that fail on Windows
+  list(REMOVE_ITEM TEST_OPS test_debugger)
+  if(WITH_GPU)
+    list(REMOVE_ITEM TEST_OPS test_update_loss_scaling_op)
+  endif()
 endif()
 
 if(NOT WITH_DISTRIBUTE OR WIN32)
-    # DISTRIBUTE related
-    LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
-    LIST(REMOVE_ITEM TEST_OPS test_distributed_strategy)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_metric)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_ps)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_2)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_utils)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_cpu_barrier_with_gloo)
-
-    # TODO: Fix these unittests failed on Windows
-    list(REMOVE_ITEM TEST_OPS test_fake_init_op)
+  # DISTRIBUTE related
+  list(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
+  list(REMOVE_ITEM TEST_OPS test_distributed_strategy)
+  list(REMOVE_ITEM TEST_OPS test_fleet_metric)
+  list(REMOVE_ITEM TEST_OPS test_fleet_ps)
+  list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_2)
+  list(REMOVE_ITEM TEST_OPS test_fleet_utils)
+  list(REMOVE_ITEM TEST_OPS test_collective_cpu_barrier_with_gloo)
+
+  # TODO: Fix these unittests that fail on Windows
+  list(REMOVE_ITEM TEST_OPS test_fake_init_op)
 endif()
 
 if(NOT WITH_DISTRIBUTE)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new)
-    LIST(REMOVE_ITEM TEST_OPS test_desc_clone_dist)
+  list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new)
+  list(REMOVE_ITEM TEST_OPS test_desc_clone_dist)
 endif()
 
 if(WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_complex_matmul)
-    LIST(REMOVE_ITEM TEST_OPS test_ops_nms)
+  list(REMOVE_ITEM TEST_OPS test_complex_matmul)
+  list(REMOVE_ITEM TEST_OPS test_ops_nms)
 endif()
 
-LIST(REMOVE_ITEM TEST_OPS test_fleet_checkpoint)
-LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
-LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
-LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
-LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint3)
-LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_multiple)
-LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint_dist_basic)
-LIST(REMOVE_ITEM TEST_OPS test_hdfs1)
-LIST(REMOVE_ITEM TEST_OPS test_hdfs2)
-LIST(REMOVE_ITEM TEST_OPS test_hdfs3)
-LIST(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
+list(REMOVE_ITEM TEST_OPS test_fleet_checkpoint)
+list(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
+list(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
+list(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
+list(REMOVE_ITEM TEST_OPS test_auto_checkpoint3)
+list(REMOVE_ITEM TEST_OPS test_auto_checkpoint_multiple)
+list(REMOVE_ITEM TEST_OPS test_auto_checkpoint_dist_basic)
+list(REMOVE_ITEM TEST_OPS test_hdfs1)
+list(REMOVE_ITEM TEST_OPS test_hdfs2)
+list(REMOVE_ITEM TEST_OPS test_hdfs3)
+list(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
 
 if(APPLE OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_fs_interface)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_metric)
+  list(REMOVE_ITEM TEST_OPS test_fs_interface)
+  list(REMOVE_ITEM TEST_OPS test_fleet_metric)
 endif()
 
 list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_hybrid_parallel)
 
-LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer_gloo) # NOTE: @xiongkun03, cpu is too slow, fix it in next PR
+list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer_gloo
+)# NOTE: @xiongkun03, cpu is too slow, fix it in next PR
 
-if (NOT WITH_GLOO)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel_cpuonly)
+if(NOT WITH_GLOO)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel_cpuonly)
 
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables_gloo)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height_gloo)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_gloo)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_diff_length_gloo)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables_gloo)
+  list(REMOVE_ITEM TEST_OPS
+       test_parallel_dygraph_sparse_embedding_over_height_gloo)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_gloo)
+  list(REMOVE_ITEM TEST_OPS
+       test_parallel_dygraph_sparse_embedding_diff_length_gloo)
 endif()
 
-if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
-    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
-    LIST(REMOVE_ITEM TEST_OPS test_rank_attention_op) # TODO(shenliang03): rank_attention_op support CPU device in future
-    LIST(REMOVE_ITEM TEST_OPS test_batch_fc_op) # TODO(shenliang03): batch_fc_op support CPU device in future
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync_gradient_check)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel)
-    list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2)
-    list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2)
-    list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3)
-    list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api)
-    list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer)
-    list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers)
-    LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
-    LIST(REMOVE_ITEM TEST_OPS test_mixed_precision)
-    LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single)
-    LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute)
-    list(REMOVE_ITEM TEST_OPS test_hybrid_parallel_inference_helper)
-    list(REMOVE_ITEM TEST_OPS test_parallel_class_center_sample)
-    LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_searcher)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_dist_tensor)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_cost_model)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load)
-    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert)
-    LIST(REMOVE_ITEM TEST_OPS test_collective_process_group)
-    LIST(REMOVE_ITEM TEST_OPS test_eager_dist_api)
+if((NOT WITH_GPU) AND (NOT WITH_ROCM))
+  list(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+  list(REMOVE_ITEM TEST_OPS test_rank_attention_op
+  )# TODO(shenliang03): rank_attention_op support CPU device in future
+  list(REMOVE_ITEM TEST_OPS test_batch_fc_op
+  )# TODO(shenliang03): batch_fc_op support CPU device in future
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist
+  )# TODO(Yancey1989): parallel dygraph support CPU device in future
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync_gradient_check)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer)
+  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers)
+  list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
+  list(REMOVE_ITEM TEST_OPS test_mixed_precision)
+  list(REMOVE_ITEM TEST_OPS test_fleet_base_single)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_recompute)
+  list(REMOVE_ITEM TEST_OPS test_hybrid_parallel_inference_helper)
+  list(REMOVE_ITEM TEST_OPS test_parallel_class_center_sample)
+  list(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_searcher)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_dist_tensor)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_cost_model)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert)
+  list(REMOVE_ITEM TEST_OPS test_collective_process_group)
+  list(REMOVE_ITEM TEST_OPS test_eager_dist_api)
 elseif(WITH_GPU)
-    if (${CUDNN_VERSION} VERSION_LESS 7100)
-        LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
-    endif()
+  if(${CUDNN_VERSION} VERSION_LESS 7100)
+    list(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+  endif()
 endif()
 
-if (WITH_NCCL)
-    if (${NCCL_VERSION} VERSION_LESS 2212)
-        LIST(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
-        LIST(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
-        LIST(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_transformer)
-    endif()
+if(WITH_NCCL)
+  if(${NCCL_VERSION} VERSION_LESS 2212)
+    list(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
+    list(REMOVE_ITEM DIST_TEST_OPS
+         test_parallel_dygraph_sparse_embedding_over_height)
+    list(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_transformer)
+  endif()
 endif()
 
-if ((NOT WITH_NCCL) AND (NOT WITH_RCCL))
-    list(REMOVE_ITEM TEST_OPS test_imperative_group)
-    LIST(REMOVE_ITEM TEST_OPS test_new_group_api)
+if((NOT WITH_NCCL) AND (NOT WITH_RCCL))
+  list(REMOVE_ITEM TEST_OPS test_imperative_group)
+  list(REMOVE_ITEM TEST_OPS test_new_group_api)
 endif()
 
 if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_fused_gate_attention_op)
-    LIST(REMOVE_ITEM TEST_OPS test_boxps)
+  list(REMOVE_ITEM TEST_OPS test_fused_gate_attention_op)
+  list(REMOVE_ITEM TEST_OPS test_boxps)
 endif()
-list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
-list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
+list(REMOVE_ITEM TEST_OPS test_seq_concat_op
+)# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
+list(REMOVE_ITEM TEST_OPS test_lstm_unit_op
+)# FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5185
+list(REMOVE_ITEM TEST_OPS test_cond_op)
+
+# FIXME(qijun): test_cond_op is disabled above, see https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
 
 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
-list(REMOVE_ITEM TEST_OPS decorator_helper) # decorator_helper is a helper python file, not a test
+list(REMOVE_ITEM TEST_OPS decorator_helper
+)# decorator_helper is a helper python file, not a test
 
 if(APPLE)
-    if(NOT WITH_DISTRIBUTE)
-        list(REMOVE_ITEM TEST_OPS test_desc_clone)
-        list(REMOVE_ITEM TEST_OPS test_program_code)
-    endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*")
-    # this op is not support on mac
-    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
-    list(REMOVE_ITEM TEST_OPS test_detection_map_op)
-    list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
+  if(NOT WITH_DISTRIBUTE)
+    list(REMOVE_ITEM TEST_OPS test_desc_clone)
+    list(REMOVE_ITEM TEST_OPS test_program_code)
+  endif(NOT WITH_DISTRIBUTE)
+  message(
+    WARNING
+      "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*"
+  )
+  # this op is not supported on macOS
+  list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
+  list(REMOVE_ITEM TEST_OPS test_detection_map_op)
+  list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
 endif()
 if(NOT WITH_MKLML)
-    # this op is not support on openblas
-    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
+  # this op is not supported with OpenBLAS
+  list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
 endif()
 
 if(NOT WITH_MKL OR NOT WITH_AVX)
@@ -360,19 +387,21 @@ if(NOT WITH_MKL OR NOT WITH_AVX)
   list(REMOVE_ITEM TEST_OPS test_var_conv_2d)
 endif()
 
-if(WITH_COVERAGE OR WIN32 OR WITH_NV_JETSON)
+if(WITH_COVERAGE
+   OR WIN32
+   OR WITH_NV_JETSON)
   list(REMOVE_ITEM TEST_OPS test_pyramid_hash_op)
 endif()
 
 list(REMOVE_ITEM TEST_OPS test_fleet_pyramid_hash)
 
 if((WITH_ROCM OR WITH_GPU) OR NOT WITH_MKLML)
-    # matmul with multiple heads need MKL support
-    LIST(REMOVE_ITEM TEST_OPS test_matmul_op_with_head)
+  # matmul with multiple heads needs MKL support
+  list(REMOVE_ITEM TEST_OPS test_matmul_op_with_head)
 endif()
 
 if(NOT WITH_CRYPTO)
-    LIST(REMOVE_ITEM TEST_OPS test_crypto)
+  list(REMOVE_ITEM TEST_OPS test_crypto)
 endif()
 
 function(py_test_modules TARGET_NAME)
@@ -380,126 +409,158 @@ function(py_test_modules TARGET_NAME)
     set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs MODULES DEPS ENVS)
-    cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    if(WITH_COVERAGE AND NOT (WITH_INCREMENTAL_COVERAGE AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
-        if(WITH_ASCEND_CL)
-            add_test(NAME ${TARGET_NAME}
-                    COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH} ${py_test_modules_ENVS}
-                    COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-                    ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-        else()
-            add_test(NAME ${TARGET_NAME}
-                    COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-                    COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-                    ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-        endif()
+    cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
+
+    if(WITH_COVERAGE AND NOT (WITH_INCREMENTAL_COVERAGE
+                              AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
+      if(WITH_ASCEND_CL)
+        add_test(
+          NAME ${TARGET_NAME}
+          COMMAND
+            ${CMAKE_COMMAND} -E env
+            PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
+            ${py_test_modules_ENVS}
+            COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+            ${PYTHON_EXECUTABLE} -m coverage run --branch -p
+            ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+      else()
+        add_test(
+          NAME ${TARGET_NAME}
+          COMMAND
+            ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+            ${py_test_modules_ENVS}
+            COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+            ${PYTHON_EXECUTABLE} -m coverage run --branch -p
+            ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+      endif()
     else()
-        if(WITH_ASCEND_CL)
-            add_test(NAME ${TARGET_NAME}
-                    COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH} ${py_test_modules_ENVS}
-                    ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-        else()
-            add_test(NAME ${TARGET_NAME}
-                    COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-                    ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-        endif()
+      if(WITH_ASCEND_CL)
+        add_test(
+          NAME ${TARGET_NAME}
+          COMMAND
+            ${CMAKE_COMMAND} -E env
+            PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
+            ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
+            ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+      else()
+        add_test(
+          NAME ${TARGET_NAME}
+          COMMAND
+            ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+            ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
+            ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+      endif()
     endif()
 
-    if (py_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+    if(py_test_modules_SERIAL)
+      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
     if(WIN32)
-        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     endif()
   endif()
 endfunction()
 
-
 function(bash_test_modules TARGET_NAME)
-    if(NOT WITH_TESTING)
-        return()
-    endif()
-
-    set(options SERIAL)
-    set(oneValueArgs TIMEOUT START_BASH)
-    set(multiValueArgs DEPS ENVS LABELS)
-    cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if(NOT WITH_TESTING)
+    return()
+  endif()
 
+  set(options SERIAL)
+  set(oneValueArgs TIMEOUT START_BASH)
+  set(multiValueArgs DEPS ENVS LABELS)
+  cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
 
-    set(timeout 350)
-    if(${bash_test_modules_TIMEOUT})
-        set(timeout ${bash_test_modules_TIMEOUT})
-    endif()
+  set(timeout 350)
+  if(${bash_test_modules_TIMEOUT})
+    set(timeout ${bash_test_modules_TIMEOUT})
+  endif()
 
-    if(WITH_COVERAGE)
-        add_test(NAME ${TARGET_NAME}
-            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
-            WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-        add_test(NAME ${TARGET_NAME}
-            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+  if(WITH_COVERAGE)
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout}
+        ${bash_test_modules_ENVS} WITH_COVERAGE=ON
+        COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data bash
+        ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  else()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout}
+        ${bash_test_modules_ENVS} bash
+        ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
 
-    if (bash_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-    endif()
+  if(bash_test_modules_SERIAL)
+    set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+  endif()
 
-    if(bash_test_modules_LABELS)
-        set_tests_properties(${TARGET_NAME} PROPERTIES LABELS ${bash_test_modules_LABELS})
-    endif()
+  if(bash_test_modules_LABELS)
+    set_tests_properties(${TARGET_NAME} PROPERTIES LABELS
+                                                   ${bash_test_modules_LABELS})
+  endif()
 endfunction()
 
 function(parallel_bash_test_modules TARGET_NAME)
-    if(NOT WITH_TESTING)
-        return()
-    endif()
-
-    set(options SERIAL)
-    set(oneValueArgs TIMEOUT START_BASH)
-    set(multiValueArgs DEPS ENVS LABELS UnitTests)
-    cmake_parse_arguments(parallel_bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if(NOT WITH_TESTING)
+    return()
+  endif()
 
+  set(options SERIAL)
+  set(oneValueArgs TIMEOUT START_BASH)
+  set(multiValueArgs DEPS ENVS LABELS UnitTests)
+  cmake_parse_arguments(parallel_bash_test_modules "${options}"
+                        "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-    set(timeout 120)
-    if(${parallel_bash_test_modules_TIMEOUT})
-        set(timeout ${parallel_bash_test_modules_TIMEOUT})
-    endif()
-
-    list(JOIN  parallel_bash_test_modules_UnitTests " " uts_string)
+  set(timeout 120)
+  if(${parallel_bash_test_modules_TIMEOUT})
+    set(timeout ${parallel_bash_test_modules_TIMEOUT})
+  endif()
 
-    if(WITH_COVERAGE)
-        add_test(NAME ${TARGET_NAME}
-            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string}
-            WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-        add_test(NAME ${TARGET_NAME}
-            COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-            TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string}
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+  list(JOIN parallel_bash_test_modules_UnitTests " " uts_string)
+
+  if(WITH_COVERAGE)
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout}
+        ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string}
+        WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+        bash
+        ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  else()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout}
+        ${parallel_bash_test_modules_ENVS} UnitTests=${uts_string} bash
+        ${CMAKE_CURRENT_BINARY_DIR}/${parallel_bash_test_modules_START_BASH}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
 
-    if (parallel_bash_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-    endif()
+  if(parallel_bash_test_modules_SERIAL)
+    set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+  endif()
 
-    if(parallel_bash_test_modules_LABELS)
-        set_tests_properties(${TARGET_NAME} PROPERTIES LABELS ${parallel_bash_test_modules_LABELS})
-    endif()
+  if(parallel_bash_test_modules_LABELS)
+    set_tests_properties(${TARGET_NAME}
+                         PROPERTIES LABELS ${parallel_bash_test_modules_LABELS})
+  endif()
 endfunction()
 
 list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type)
@@ -522,7 +583,8 @@ list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 list(REMOVE_ITEM TEST_OPS test_layers)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_base_cpu)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_reduce_cpu)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
+list(REMOVE_ITEM TEST_OPS
+     test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
 list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
 list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
 list(REMOVE_ITEM TEST_OPS test_install_check)
@@ -542,11 +604,14 @@ list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while)
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
 
 # disable sparse_attention which not in suitable env
-if ( (NOT WITH_GPU) OR (WIN32) OR (PADDLE_WITH_ARM) OR (WITH_ROCM) )
-    list(REMOVE_ITEM TEST_OPS test_sparse_attention_op)
+if((NOT WITH_GPU)
+   OR (WIN32)
+   OR (PADDLE_WITH_ARM)
+   OR (WITH_ROCM))
+  list(REMOVE_ITEM TEST_OPS test_sparse_attention_op)
 endif()
 
-if (APPLE OR WIN32)
+if(APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_dataset)
   list(REMOVE_ITEM TEST_OPS test_dataset_dataloader)
   list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_base)
@@ -563,33 +628,35 @@ if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_paddle_multiprocessing)
 endif()
 
-if (NOT WITH_GLOO)
-    LIST(REMOVE_ITEM TEST_OPS test_cpuonly_spawn)
+if(NOT WITH_GLOO)
+  list(REMOVE_ITEM TEST_OPS test_cpuonly_spawn)
 endif()
 
-if(NOT WITH_GPU OR WIN32 OR APPLE)
+if(NOT WITH_GPU
+   OR WIN32
+   OR APPLE)
   list(REMOVE_ITEM TEST_OPS test_build_strategy_fusion_group_pass)
 endif()
 
 # Some ops need to check results when gc is enabled
 # Currently, only ops that register NoNeedBufferVarsInference need to do this test
 set(TEST_OPS_WITH_GC
-  test_affine_channel_op
-  test_concat_op
-  test_elementwise_add_op
-  test_elementwise_sub_op
-  test_fill_zeros_like2_op
-  test_gather_op
-  test_gather_nd_op
-  test_linear_chain_crf_op
-  test_lod_reset_op
-  test_lookup_table_op
-  test_mean_op
-  test_pad2d_op
-  test_scatter_op
-  test_slice_op
-  test_space_to_depth_op
-  test_squared_l2_distance_op)
+    test_affine_channel_op
+    test_concat_op
+    test_elementwise_add_op
+    test_elementwise_sub_op
+    test_fill_zeros_like2_op
+    test_gather_op
+    test_gather_nd_op
+    test_linear_chain_crf_op
+    test_lod_reset_op
+    test_lookup_table_op
+    test_mean_op
+    test_pad2d_op
+    test_scatter_op
+    test_slice_op
+    test_space_to_depth_op
+    test_squared_l2_distance_op)
 
 foreach(TEST_OP ${TEST_OPS_WITH_GC})
   list(REMOVE_ITEM TEST_OPS ${TEST_OP})
@@ -603,184 +670,310 @@ foreach(TEST_OP ${TEST_EAGER_OPS})
   py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_enable_eager_mode=1)
 endforeach()
 
-if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
-    list(REMOVE_ITEM TEST_OPS "test_fleet_graph_execution_meta_optimizer")
-    list(REMOVE_ITEM TEST_OPS "test_gen_nccl_id_op")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fleet_grad_clip")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fleet_heter_ctr")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fleet_ps_gpu_ctr")
-    list(REMOVE_ITEM TEST_OPS "test_dist_mnist_batch_merge")
+if((NOT WITH_GPU)
+   AND (NOT WITH_XPU)
+   AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+  list(REMOVE_ITEM TEST_OPS "test_fleet_graph_execution_meta_optimizer")
+  list(REMOVE_ITEM TEST_OPS "test_gen_nccl_id_op")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fleet_grad_clip")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fleet_heter_ctr")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fleet_ps_gpu_ctr")
+  list(REMOVE_ITEM TEST_OPS "test_dist_mnist_batch_merge")
 endif()
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
-py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
-if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL OR APPLE)
-    py_test_modules(test_warpctc_op MODULES test_warpctc_op)
-    set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120)
+py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS
+                FLAGS_inner_op_parallelism=4)
+if(WITH_GPU
+   OR WITH_XPU
+   OR WITH_ASCEND
+   OR WITH_ASCEND_CL
+   OR APPLE)
+  py_test_modules(test_warpctc_op MODULES test_warpctc_op)
+  set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120)
 endif()
-py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
-py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
+py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS
+                ${GC_ENVS})
+py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS
+                ${GC_ENVS})
 py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
-    FLAGS_cudnn_deterministic=1)
-set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS
-        FLAGS_cudnn_deterministic=1)
-set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+                FLAGS_cudnn_deterministic=1)
+set_tests_properties(test_imperative_resnet
+                     PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+py_test_modules(
+  test_imperative_resnet_sorted_gradient MODULES
+  test_imperative_resnet_sorted_gradient ENVS FLAGS_cudnn_deterministic=1)
+set_tests_properties(test_imperative_resnet_sorted_gradient
+                     PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
 py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
-    FLAGS_cudnn_deterministic=1)
-py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS
-        FLAGS_cudnn_deterministic=1)
-py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
-    FLAGS_cudnn_deterministic=1)
-set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS
-        FLAGS_cudnn_deterministic=1)
+                FLAGS_cudnn_deterministic=1)
+py_test_modules(
+  test_imperative_mnist_sorted_gradient MODULES
+  test_imperative_mnist_sorted_gradient ENVS FLAGS_cudnn_deterministic=1)
+py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext
+                ENVS FLAGS_cudnn_deterministic=1)
+set_tests_properties(test_imperative_se_resnext
+                     PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+py_test_modules(
+  test_imperative_ocr_attention_model MODULES
+  test_imperative_ocr_attention_model ENVS FLAGS_cudnn_deterministic=1)
 py_test_modules(test_install_check MODULES test_install_check ENVS
-        FLAGS_cudnn_deterministic=1)
+                FLAGS_cudnn_deterministic=1)
 set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
-py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_static_runner_mnist ENVS
-    FLAGS_cudnn_deterministic=1)
-py_test_modules(test_imperative_static_runner_while MODULES test_imperative_static_runner_while ENVS
-    FLAGS_cudnn_deterministic=1)
-
-if ((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6))
-    py_test_modules(test_fused_gemm_epilogue_op MODULES test_fused_gemm_epilogue_op)
-    py_test_modules(test_fused_gemm_epilogue_grad_op MODULES test_fused_gemm_epilogue_grad_op)
-    py_test_modules(test_fused_gemm_epilogue_op_with_es MODULES test_fused_gemm_epilogue_op ENVS FLAGS_cublaslt_exhaustive_search_times=30)
-    py_test_modules(test_fused_gemm_epilogue_grad_op_with_es MODULES test_fused_gemm_epilogue_grad_op ENVS FLAGS_cublaslt_exhaustive_search_times=30)
-    py_test_modules(test_fuse_gemm_epilogue_pass MODULES test_fuse_gemm_epilogue_pass)
+py_test_modules(
+  test_imperative_static_runner_mnist MODULES
+  test_imperative_static_runner_mnist ENVS FLAGS_cudnn_deterministic=1)
+py_test_modules(
+  test_imperative_static_runner_while MODULES
+  test_imperative_static_runner_while ENVS FLAGS_cudnn_deterministic=1)
+
+if((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6))
+  py_test_modules(test_fused_gemm_epilogue_op MODULES
+                  test_fused_gemm_epilogue_op)
+  py_test_modules(test_fused_gemm_epilogue_grad_op MODULES
+                  test_fused_gemm_epilogue_grad_op)
+  py_test_modules(
+    test_fused_gemm_epilogue_op_with_es MODULES test_fused_gemm_epilogue_op
+    ENVS FLAGS_cublaslt_exhaustive_search_times=30)
+  py_test_modules(
+    test_fused_gemm_epilogue_grad_op_with_es MODULES
+    test_fused_gemm_epilogue_grad_op ENVS
+    FLAGS_cublaslt_exhaustive_search_times=30)
+  py_test_modules(test_fuse_gemm_epilogue_pass MODULES
+                  test_fuse_gemm_epilogue_pass)
 endif()
 
 set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-set_tests_properties(test_faster_tokenizer_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_faster_tokenizer_op PROPERTIES LABELS
+                                                         "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_conv2d_op_depthwise_conv
+                     PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_conv2d_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_conv_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 if(WITH_DISTRIBUTE)
-    add_subdirectory(distributed_passes)
-    add_subdirectory(ps)
-    add_subdirectory(auto_parallel)
-
-    # FIXME(typhoonzero): add these tests back
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
-
-    # TODO(sandyhouse): fix and add the ut back
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_hallreduce")
-
-    #not need
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base")
-
-
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_ctr")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_lars")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_train")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_save_load")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_text_classification")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_train")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_word2vec")
-
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
-
-    py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
-    py_test_modules(test_communicator_async MODULES test_communicator_async ENVS ${dist_ENVS})
-    py_test_modules(test_communicator_ps_gpu MODULES test_communicator_ps_gpu ENVS ${dist_ENVS})
-    py_test_modules(test_communicator_geo MODULES test_communicator_geo ENVS ${dist_ENVS})
-    py_test_modules(test_communicator_half_async MODULES test_communicator_half_async ENVS ${dist_ENVS} FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1)
-    py_test_modules(test_communicator_sync MODULES test_communicator_sync ENVS ${dist_ENVS} FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1)
-    py_test_modules(test_collective_optimizer MODULES test_collective_optimizer)
-    if(NOT APPLE)
-        py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy)
-        py_test_modules(test_fleet_static_mp_layers MODULES test_fleet_static_mp_layers)
-        #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
-        if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
-            py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
-            py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
-            py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
-            py_test_modules(test_fleet_hybrid_meta_optimizer MODULES test_fleet_hybrid_meta_optimizer ENVS ${dist_ENVS})
-            py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS})
-            py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
-        endif()
-        if(NOT WIN32)
-            py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_searcher MODULES test_auto_parallel_searcher ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_reshard MODULES test_auto_parallel_reshard ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_dist_tensor MODULES test_auto_parallel_dist_tensor ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_reshard_dpmppp MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS})
-            py_test_modules(test_auto_parallel_cost_model MODULES test_auto_parallel_cost_model ENVS ${dist_ENVS})
-            if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
-                py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
-                py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
-                py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
-
-
-
-            endif()
-        endif(NOT WIN32)
-    endif(NOT APPLE)
-    if(WITH_DGC)
-        # if with dgc, test all dgc tests.
-        # NOTE. dist dgc tests is already in DIST_TEST_OPS
-        py_test_modules(test_dgc_op MODULES test_dgc_op)
-        py_test_modules(test_dgc_momentum_op MODULES test_dgc_momentum_op)
-        py_test_modules(test_dgc_optimizer MODULES test_dgc_optimizer)
-        py_test_modules(test_fleet_dgc_meta_optimizer MODULES test_fleet_dgc_meta_optimizer)
-    else()
-        # if not with dgc, must close all dgc tests
-        list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
-        list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc")
+  add_subdirectory(distributed_passes)
+  add_subdirectory(ps)
+  add_subdirectory(auto_parallel)
+
+  # FIXME(typhoonzero): add these tests back
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
+
+  # TODO(sandyhouse): fix and add the ut back
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_hallreduce")
+
+  #not need
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base")
+
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_ctr")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_lars")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_train")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_save_load")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_text_classification")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_train")
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_word2vec")
+
+  list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
+
+  py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
+  py_test_modules(test_communicator_async MODULES test_communicator_async ENVS
+                  ${dist_ENVS})
+  py_test_modules(test_communicator_ps_gpu MODULES test_communicator_ps_gpu
+                  ENVS ${dist_ENVS})
+  py_test_modules(test_communicator_geo MODULES test_communicator_geo ENVS
+                  ${dist_ENVS})
+  py_test_modules(
+    test_communicator_half_async
+    MODULES
+    test_communicator_half_async
+    ENVS
+    ${dist_ENVS}
+    FLAGS_communicator_send_queue_size=1
+    FLAGS_communicator_max_merge_var_num=1)
+  py_test_modules(
+    test_communicator_sync
+    MODULES
+    test_communicator_sync
+    ENVS
+    ${dist_ENVS}
+    FLAGS_communicator_send_queue_size=1
+    FLAGS_communicator_max_merge_var_num=1)
+  py_test_modules(test_collective_optimizer MODULES test_collective_optimizer)
+  if(NOT APPLE)
+    py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS
+                    ${dist_ENVS})
+    py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS
+                    ${dist_ENVS})
+    py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS
+                    ${dist_ENVS})
+    py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES
+                    test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_private_function MODULES
+                    test_fleet_private_function ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_meta_optimizer_base MODULES
+                    test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_distributed_strategy MODULES
+                    test_fleet_distributed_strategy)
+    py_test_modules(test_fleet_static_mp_layers MODULES
+                    test_fleet_static_mp_layers)
+    #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
+    if(WITH_GPU
+       OR WITH_XPU
+       OR WITH_ASCEND
+       OR WITH_ASCEND_CL)
+      py_test_modules(test_fleet_amp_meta_optimizer MODULES
+                      test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
+      py_test_modules(
+        test_fleet_gradient_merge_meta_optimizer MODULES
+        test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
+      py_test_modules(test_fleet_graph_executor MODULES
+                      test_fleet_graph_executor ENVS ${dist_ENVS})
+      py_test_modules(test_fleet_hybrid_meta_optimizer MODULES
+                      test_fleet_hybrid_meta_optimizer ENVS ${dist_ENVS})
+      py_test_modules(test_fleet_recompute_meta_optimizer MODULES
+                      test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS})
+      py_test_modules(test_fleet_sharding_meta_optimizer MODULES
+                      test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
+    endif()
+    if(NOT WIN32)
+      py_test_modules(test_auto_parallel_partitioner MODULES
+                      test_auto_parallel_partitioner ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_partitioner_gpt MODULES
+                      test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_searcher MODULES
+                      test_auto_parallel_searcher ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_reshard MODULES
+                      test_auto_parallel_reshard ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_dist_tensor MODULES
+                      test_auto_parallel_dist_tensor ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_reshard_serial MODULES
+                      test_auto_parallel_reshard_serial ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_reshard_mppp MODULES
+                      test_auto_parallel_reshard_mppp ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_reshard_dpmppp MODULES
+                      test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS})
+      py_test_modules(test_auto_parallel_cost_model MODULES
+                      test_auto_parallel_cost_model ENVS ${dist_ENVS})
+      if(WITH_GPU
+         OR WITH_XPU
+         OR WITH_ASCEND
+         OR WITH_ASCEND_CL)
+        py_test_modules(test_fleet_lamb_meta_optimizer MODULES
+                        test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
+        py_test_modules(test_fleet_lars_meta_optimizer MODULES
+                        test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+        py_test_modules(test_fleet_localsgd_meta_optimizer MODULES
+                        test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
+
+      endif()
+    endif(NOT WIN32)
+  endif(NOT APPLE)
+  if(WITH_DGC)
+    # if with dgc, test all dgc tests.
+    # NOTE. dist dgc tests is already in DIST_TEST_OPS
+    py_test_modules(test_dgc_op MODULES test_dgc_op)
+    py_test_modules(test_dgc_momentum_op MODULES test_dgc_momentum_op)
+    py_test_modules(test_dgc_optimizer MODULES test_dgc_optimizer)
+    py_test_modules(test_fleet_dgc_meta_optimizer MODULES
+                    test_fleet_dgc_meta_optimizer)
+  else()
+    # if not with dgc, must close all dgc tests
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc")
+  endif()
+  if(NOT APPLE)
+    if(WITH_GPU OR WITH_ROCM)
+      bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh
+                        ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+      py_test_modules(test_launch_coverage MODULES test_launch_coverage)
+    endif()
+
+    bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS
+                      PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    if(WITH_GPU
+       OR WITH_XPU
+       OR WITH_ASCEND
+       OR WITH_ASCEND_CL)
+      bash_test_modules(
+        test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS
+        PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+      bash_test_modules(
+        test_fleet_run_random_port START_BASH test_fleet_run_random_port.sh
+        ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+      bash_test_modules(
+        test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS
+        PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+      bash_test_modules(
+        test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS
+        PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    endif()
+    if(WITH_ASCEND OR WITH_ASCEND_CL)
+      bash_test_modules(
+        test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS
+        PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+      bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS
+                        PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
     endif()
-    if(NOT APPLE)
-        if(WITH_GPU OR WITH_ROCM)
-            bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh  ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-            py_test_modules(test_launch_coverage MODULES test_launch_coverage)
-        endif()
-
-        bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
-            bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-            bash_test_modules(test_fleet_run_random_port START_BASH test_fleet_run_random_port.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-            bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-            bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        endif()
-        if(WITH_ASCEND OR WITH_ASCEND_CL)
-            bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-            bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        endif()
-
-        # port range (20000, 23000) is reserved for dist-ops
-        set(dist_ut_port 20001)
-        foreach(TEST_OP ${DIST_TEST_OPS})
-            bash_test_modules(${TEST_OP} START_BASH dist_test.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
-            MATH(EXPR dist_ut_port "${dist_ut_port}+20")
-            if(dist_ut_port GREATER_EQUAL 22998)
-                message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}")
-            endif()
-        endforeach(TEST_OP)
-        # solve it later.
-        bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} )
-        if (WITH_GLOO)
-            bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} )
-        endif()
-        if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
-            bash_test_modules(test_new_group START_BASH test_new_group.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} )
-        endif()
-    endif(NOT APPLE)
+
+    # port range (20000, 23000) is reserved for dist-ops
+    set(dist_ut_port 20001)
+    foreach(TEST_OP ${DIST_TEST_OPS})
+      bash_test_modules(
+        ${TEST_OP}
+        START_BASH
+        dist_test.sh
+        LABELS
+        "RUN_TYPE=EXCLUSIVE"
+        ENVS
+        "PADDLE_DIST_UT_PORT=${dist_ut_port}")
+      math(EXPR dist_ut_port "${dist_ut_port}+20")
+      if(dist_ut_port GREATER_EQUAL 22998)
+        message(
+          FATAL_ERROR "available ports have been exhausted:${dist_ut_port}")
+      endif()
+    endforeach(TEST_OP)
+    # solve it later.
+    bash_test_modules(
+      test_fleet_launch_ps
+      START_BASH
+      test_fleet_launch_ps.sh
+      LABELS
+      "RUN_TYPE=EXCLUSIVE"
+      ENVS
+      "PADDLE_DIST_UT_PORT=${dist_ut_port}"
+      PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    if(WITH_GLOO)
+      bash_test_modules(
+        test_cpuonly_launch
+        START_BASH
+        test_cpuonly_launch.sh
+        LABELS
+        "RUN_TYPE=EXCLUSIVE"
+        ENVS
+        "PADDLE_DIST_UT_PORT=${dist_ut_port}"
+        PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    endif()
+    if(WITH_GPU
+       OR WITH_XPU
+       OR WITH_ASCEND
+       OR WITH_ASCEND_CL)
+      bash_test_modules(
+        test_new_group
+        START_BASH
+        test_new_group.sh
+        LABELS
+        "RUN_TYPE=EXCLUSIVE"
+        ENVS
+        "PADDLE_DIST_UT_PORT=${dist_ut_port}+20"
+        PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    endif()
+  endif(NOT APPLE)
 endif()
 
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
@@ -789,65 +982,172 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
 # We guess there are some bugs in cuda 10.1 or 10.2,
 # since this unittest is stable in cuda 11 (py3 pipeline) now.
 if(NOT WITH_COVERAGE)
-  py_test_modules(test_parallel_executor_profiler MODULES test_parallel_executor_profiler)
-  set_tests_properties(test_parallel_executor_profiler PROPERTIES LABELS "RUN_TYPE=DIST")
+  py_test_modules(test_parallel_executor_profiler MODULES
+                  test_parallel_executor_profiler)
+  set_tests_properties(test_parallel_executor_profiler
+                       PROPERTIES LABELS "RUN_TYPE=DIST")
   set_tests_properties(test_parallel_executor_profiler PROPERTIES TIMEOUT 120)
 endif()
-py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer)
+py_test_modules(test_parallel_executor_transformer MODULES
+                test_parallel_executor_transformer)
 if(WIN32)
-    py_test_modules(test_parallel_executor_transformer_auto_growth MODULES test_parallel_executor_transformer_auto_growth ENVS FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0)
-    py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass ENVS CUDA_VISIBLE_DEVICES=0)
-    py_test_modules(test_feed_data_check_shape_type MODULES test_feed_data_check_shape_type ENVS CUDA_VISIBLE_DEVICES=0)
-    py_test_modules(test_fetch_lod_tensor_array MODULES test_fetch_lod_tensor_array ENVS CUDA_VISIBLE_DEVICES=0)
+  py_test_modules(
+    test_parallel_executor_transformer_auto_growth MODULES
+    test_parallel_executor_transformer_auto_growth ENVS
+    FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0)
+  py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass
+                  ENVS CUDA_VISIBLE_DEVICES=0)
+  py_test_modules(test_feed_data_check_shape_type MODULES
+                  test_feed_data_check_shape_type ENVS CUDA_VISIBLE_DEVICES=0)
+  py_test_modules(test_fetch_lod_tensor_array MODULES
+                  test_fetch_lod_tensor_array ENVS CUDA_VISIBLE_DEVICES=0)
 else()
-    py_test_modules(test_parallel_executor_transformer_auto_growth MODULES test_parallel_executor_transformer_auto_growth ENVS FLAGS_allocator_strategy=auto_growth)
-    py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass)
-    py_test_modules(test_feed_data_check_shape_type MODULES test_feed_data_check_shape_type)
-    py_test_modules(test_fetch_lod_tensor_array MODULES test_fetch_lod_tensor_array)
+  py_test_modules(
+    test_parallel_executor_transformer_auto_growth MODULES
+    test_parallel_executor_transformer_auto_growth ENVS
+    FLAGS_allocator_strategy=auto_growth)
+  py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass)
+  py_test_modules(test_feed_data_check_shape_type MODULES
+                  test_feed_data_check_shape_type)
+  py_test_modules(test_fetch_lod_tensor_array MODULES
+                  test_fetch_lod_tensor_array)
 endif()
 
 py_test_modules(test_data_norm_op MODULES test_data_norm_op)
-py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)
-py_test_modules(test_fuse_bn_add_act_pass MODULES test_fuse_bn_add_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)
+py_test_modules(
+  test_fuse_bn_act_pass
+  MODULES
+  test_fuse_bn_act_pass
+  ENVS
+  FLAGS_cudnn_deterministic=1
+  FLAGS_cudnn_batchnorm_spatial_persistent=1
+  FLAGS_conv_workspace_size_limit=1000)
+py_test_modules(
+  test_fuse_bn_add_act_pass
+  MODULES
+  test_fuse_bn_add_act_pass
+  ENVS
+  FLAGS_cudnn_deterministic=1
+  FLAGS_cudnn_batchnorm_spatial_persistent=1
+  FLAGS_conv_workspace_size_limit=1000)
 
 # NOTE: These unittests will appear NaN steadily in windows CI. After analysis,
 # it is found that windows CI will run all the training unittests with the ON_INFER option turned on,
 # which will not appear in other CIs. The calculation behavior of some ops in inference mode is
 # inconsistent with that in non-inference mode.
 if(NOT ON_INFER)
-    py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_executor_seresnext_base_cpu)
-    py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu)
-    py_test_modules(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
-    set_tests_properties(test_parallel_executor_seresnext_base_cpu PROPERTIES TIMEOUT 900)
-    set_tests_properties(test_parallel_executor_seresnext_base_cpu PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
-    set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu PROPERTIES TIMEOUT 750)
-    set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
-    set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu PROPERTIES TIMEOUT 750)
-    set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
+  py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES
+                  test_parallel_executor_seresnext_base_cpu)
+  py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES
+                  test_parallel_executor_seresnext_with_reduce_cpu)
+  py_test_modules(
+    test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES
+    test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
+  set_tests_properties(test_parallel_executor_seresnext_base_cpu
+                       PROPERTIES TIMEOUT 900)
+  set_tests_properties(test_parallel_executor_seresnext_base_cpu
+                       PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
+  set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu
+                       PROPERTIES TIMEOUT 750)
+  set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu
+                       PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
+  set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu
+                       PROPERTIES TIMEOUT 750)
+  set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu
+                       PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
 endif()
 
 if(NOT WIN32)
-    # TODO: fix these unittests failure on Windows
-    py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
-    py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer)
-    # FIXME(zcd): temporally disable test_parallel_executor_fetch_feed in Windows CI because of the random failure.
-    py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed)
-    set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
+  # TODO: fix these unittest failures on Windows
+  py_test_modules(test_layers MODULES test_layers ENVS
+                  FLAGS_cudnn_deterministic=1)
+  py_test_modules(test_ir_memory_optimize_transformer MODULES
+                  test_ir_memory_optimize_transformer)
+  # FIXME(zcd): temporarily disable test_parallel_executor_fetch_feed in Windows CI because of the random failure.
+  py_test_modules(test_parallel_executor_fetch_feed MODULES
+                  test_parallel_executor_fetch_feed)
+  set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 endif()
 
-if(WITH_DISTRIBUTE AND NOT APPLE AND NOT WIN32)
-    py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
-    set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_fleet_checkpoint PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 200 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 200  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 200  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 200  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 200  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 200  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 200  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 200   LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
-    bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 200  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+if(WITH_DISTRIBUTE
+   AND NOT APPLE
+   AND NOT WIN32)
+  py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
+  set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_fleet_checkpoint
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_auto_checkpoint
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_auto_checkpoint1
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_auto_checkpoint2
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_auto_checkpoint3
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_auto_checkpoint_multiple
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_auto_checkpoint_dist_basic
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_hdfs1
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_hdfs2
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+  bash_test_modules(
+    test_hdfs3
+    START_BASH
+    dist_test.sh
+    TIMEOUT
+    200
+    LABELS
+    "RUN_TYPE=EXCLUSIVE:NIGHTLY")
 endif()
 
 add_subdirectory(sequence)
@@ -856,38 +1156,42 @@ add_subdirectory(rnn)
 add_subdirectory(autograd)
 add_subdirectory(distribution)
 
-if (NOT WIN32 OR NOT WITH_GPU)
-    add_subdirectory(fft)
+if(NOT WIN32 OR NOT WITH_GPU)
+  add_subdirectory(fft)
 endif()
 
-if (WITH_XPU)
-    add_subdirectory(xpu)
+if(WITH_XPU)
+  add_subdirectory(xpu)
 endif()
 
 # dist xpu tests:
-if (WITH_XPU_BKCL)
-    #py_test(test_collective_reduce_api_xpu SRCS "test_collective_reduce_api.py")
-    py_test(test_collective_allreduce_api_xpu SRCS "test_collective_allreduce_api.py")
+if(WITH_XPU_BKCL)
+  #py_test(test_collective_reduce_api_xpu SRCS "test_collective_reduce_api.py")
+  py_test(test_collective_allreduce_api_xpu
+          SRCS "test_collective_allreduce_api.py")
 endif()
 
 if(WIN32)
-    cc_test(cc_imp_py_test SRCS cc_imp_py_test.cc DEPS python)
+  cc_test(
+    cc_imp_py_test
+    SRCS cc_imp_py_test.cc
+    DEPS python)
 endif()
 
-if (WITH_ASCEND_CL)
-    add_subdirectory(npu)
+if(WITH_ASCEND_CL)
+  add_subdirectory(npu)
 endif()
 
-if (WITH_MKLDNN)
-    add_subdirectory(mkldnn)
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
 endif()
 
-if (WITH_IPU)
-    add_subdirectory(ipu)
+if(WITH_IPU)
+  add_subdirectory(ipu)
 endif()
 
-if (WITH_MLU)
-    add_subdirectory(mlu)
+if(WITH_MLU)
+  add_subdirectory(mlu)
 endif()
 
 add_subdirectory(asp)
@@ -896,92 +1200,120 @@ add_subdirectory(ir)
 
 add_subdirectory(interpreter)
 
-if (WITH_TESTING)
-    set_property(TEST test_parallel_executor_mnist PROPERTY ENVIRONMENT GLOG_vmodule=all_reduce_deps_pass=10)
-    set_property(TEST test_parallel_executor_fix_op_run_order PROPERTY ENVIRONMENT GLOG_vmodule=fix_op_run_order_pass=10)
+if(WITH_TESTING)
+  set_property(TEST test_parallel_executor_mnist
+               PROPERTY ENVIRONMENT GLOG_vmodule=all_reduce_deps_pass=10)
+  set_property(TEST test_parallel_executor_fix_op_run_order
+               PROPERTY ENVIRONMENT GLOG_vmodule=fix_op_run_order_pass=10)
 endif()
 
-set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist
-        test_parallel_executor_feed_persistable_var
-        test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-        test_data_norm_op
-        test_dataloader_keep_order
-        test_dataloader_unkeep_order
-        test_parallel_executor_inference_feed_partial_data
-        test_parallel_ssa_graph_inference_feed_partial_data
-        test_fetch_unmerged
-        test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")
+set_tests_properties(
+  test_parallel_executor_test_while_train
+  test_parallel_executor_mnist
+  test_parallel_executor_feed_persistable_var
+  test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
+  test_data_norm_op
+  test_dataloader_keep_order
+  test_dataloader_unkeep_order
+  test_parallel_executor_inference_feed_partial_data
+  test_parallel_ssa_graph_inference_feed_partial_data
+  test_fetch_unmerged
+  test_buffer_shared_memory_reuse_pass
+  PROPERTIES LABELS "RUN_TYPE=DIST")
 # disable test_parallel_executor_fetch_isolated_var
 # set_tests_properties(test_parallel_executor_fetch_isolated_var PROPERTIES LABELS "RUN_TYPE=DIST")
-set_tests_properties(test_parallel_executor_crf test_sync_batch_norm_op test_inplace_abn_op
-        test_parallel_executor_seresnext_base_gpu
-        test_parallel_executor_seresnext_with_reduce_gpu
-        test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
-        test_distributed_fused_lamb_op_with_clip
-        test_distributed_fused_lamb_op_without_clip
-        test_distributed_fused_lamb_op_with_gradient_merge
-        test_parallel_executor_fetch_isolated_var
-        PROPERTIES LABELS "RUN_TYPE=DIST")
+set_tests_properties(
+  test_parallel_executor_crf
+  test_sync_batch_norm_op
+  test_inplace_abn_op
+  test_parallel_executor_seresnext_base_gpu
+  test_parallel_executor_seresnext_with_reduce_gpu
+  test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
+  test_distributed_fused_lamb_op_with_clip
+  test_distributed_fused_lamb_op_without_clip
+  test_distributed_fused_lamb_op_with_gradient_merge
+  test_parallel_executor_fetch_isolated_var
+  PROPERTIES LABELS "RUN_TYPE=DIST")
 
 if(NOT WIN32 AND NOT APPLE)
-    set_tests_properties(test_imperative_signal_handler PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_imperative_data_loader_base PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_imperative_data_loader_fds_clear PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    # set_tests_properties(test_imperative_data_loader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_multiprocess_dataloader_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_multiprocess_dataloader_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_multiprocess_dataloader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_multiprocess_dataloader_iterable_dataset_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_multiprocess_dataloader_dataset PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_multiprocess_dataloader_static PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_imperative_signal_handler
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_imperative_data_loader_base
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_imperative_data_loader_fds_clear
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  # set_tests_properties(test_imperative_data_loader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_multiprocess_dataloader_static
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_multiprocess_dataloader_dynamic
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_multiprocess_dataloader_exception
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_multiprocess_dataloader_iterable_dataset_dynamic
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_multiprocess_dataloader_dataset
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_multiprocess_dataloader_static PROPERTIES TIMEOUT
+                                                                      120)
 endif()
 
-if (NOT WIN32)
-    set_tests_properties(test_multiprocess_reader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_layers PROPERTIES TIMEOUT 120)
-    if (WITH_NV_JETSON)
-    set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT 1200)
-    else ()
-    set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT 120)
-    endif ()
+if(NOT WIN32)
+  set_tests_properties(test_multiprocess_reader_exception
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_layers PROPERTIES TIMEOUT 120)
+  if(WITH_NV_JETSON)
+    set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT
+                                                                        1200)
+  else()
+    set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT
+                                                                        120)
+  endif()
 endif()
 
-if (WITH_DISTRIBUTE AND NOT WIN32)
-    set_tests_properties(test_fleet_utils PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_cpu_barrier_with_gloo PROPERTIES TIMEOUT 40)
+if(WITH_DISTRIBUTE AND NOT WIN32)
+  set_tests_properties(test_fleet_utils PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_cpu_barrier_with_gloo PROPERTIES TIMEOUT
+                                                                        40)
 endif()
 
-if (WITH_DISTRIBUTE)
-    set_tests_properties(test_communicator_half_async PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_dist_fleet_ctr2 PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_dist_fleet_infer PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_dist_fleet_raw_program_optimizer PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_dist_fleet_raw_program_optimizer_fuse_allreduce PROPERTIES TIMEOUT 60)
-    set_tests_properties(test_dist_dygraph_apis PROPERTIES TIMEOUT 120)
+if(WITH_DISTRIBUTE)
+  set_tests_properties(test_communicator_half_async PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_dist_fleet_ctr2 PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT
+                                                                       200)
+  set_tests_properties(test_dist_fleet_infer PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_dist_fleet_raw_program_optimizer PROPERTIES TIMEOUT
+                                                                        120)
+  set_tests_properties(test_dist_fleet_raw_program_optimizer_fuse_allreduce
+                       PROPERTIES TIMEOUT 60)
+  set_tests_properties(test_dist_dygraph_apis PROPERTIES TIMEOUT 120)
 endif()
 
-if (WITH_DISTRIBUTE AND NOT APPLE)
-    if(WITH_GPU OR WITH_ROCM)
-        set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_dist_mnist_gradient_merge PROPERTIES TIMEOUT 360)
-    endif()
+if(WITH_DISTRIBUTE AND NOT APPLE)
+  if(WITH_GPU OR WITH_ROCM)
+    set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_dist_mnist_gradient_merge PROPERTIES TIMEOUT 360)
+  endif()
 endif()
 
 # setting timeout value as 15S
 set_tests_properties(test_run PROPERTIES TIMEOUT 120)
 set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200)
+set_tests_properties(test_imperative_lod_tensor_to_selected_rows
+                     PROPERTIES TIMEOUT 200)
 set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES TIMEOUT 120)
+set_tests_properties(test_imperative_star_gan_with_gradient_penalty
+                     PROPERTIES TIMEOUT 120)
 
 set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 200)
 set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_profiler PROPERTIES TIMEOUT 120)
-set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120)
+set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT
+                                                                        120)
 set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cross_entropy_loss PROPERTIES TIMEOUT 180)
 set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120)
@@ -993,18 +1325,20 @@ set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150)
 set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 120)
 if(NOT WIN32)
-    if (WITH_NV_JETSON)
+  if(WITH_NV_JETSON)
     set_tests_properties(test_ir_memory_optimize_nlp PROPERTIES TIMEOUT 1200)
-    else ()
+  else()
     set_tests_properties(test_ir_memory_optimize_nlp PROPERTIES TIMEOUT 120)
-    endif ()
+  endif()
 endif()
 set_tests_properties(test_add_reader_dependency PROPERTIES TIMEOUT 120)
 set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES TIMEOUT 120)
+set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES TIMEOUT
+                                                                     120)
 set_tests_properties(test_fuse_relu_depthwise_conv_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_fleet_util PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 120)
+set_tests_properties(test_imperative_transformer_sorted_gradient
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120)
@@ -1013,56 +1347,66 @@ set_tests_properties(test_gather_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_static_save_load PROPERTIES TIMEOUT 250)
 set_tests_properties(test_pylayer_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120)
-if (WIN32)
-    set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900)
-    set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250)
+if(WIN32)
+  set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900)
+  set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250)
 else()
-    set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600)
-    set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250)
+  set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600)
+  set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250)
 endif()
-if (WITH_NV_JETSON)
-    set_tests_properties(test_concat_op PROPERTIES TIMEOUT 1200)
-    set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 1200)
-    set_tests_properties(test_conv3d_transpose_op PROPERTIES TIMEOUT 1200)
-    set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 1200)
-    set_tests_properties(test_norm_op PROPERTIES TIMEOUT 1200)
-    set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 1500)
-    set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500)
+if(WITH_NV_JETSON)
+  set_tests_properties(test_concat_op PROPERTIES TIMEOUT 1200)
+  set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 1200)
+  set_tests_properties(test_conv3d_transpose_op PROPERTIES TIMEOUT 1200)
+  set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 1200)
+  set_tests_properties(test_norm_op PROPERTIES TIMEOUT 1200)
+  set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 1500)
+  set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500)
 else()
-    set_tests_properties(test_concat_op PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_conv3d_transpose_op PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150)
-    set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
+  set_tests_properties(test_concat_op PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_conv3d_transpose_op PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150)
+  set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
 endif()
-set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 200)
+set_tests_properties(test_imperative_selected_rows_to_lod_tensor
+                     PROPERTIES TIMEOUT 200)
 set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120)
+set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120)
 #set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200)
 set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120)
-set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120)
-set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120)
+set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT
+                                                                        120)
+set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dropout_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_nn_grad PROPERTIES TIMEOUT 180)
 set_tests_properties(test_elementwise_sub_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120)
-set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT 120)
-set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120)
-set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge PROPERTIES TIMEOUT 120)
+set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
+                     PROPERTIES TIMEOUT 120)
+set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT
+                                                                         120)
+set_tests_properties(test_distributed_fused_lamb_op_without_clip
+                     PROPERTIES TIMEOUT 120)
+set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120)
 set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300)
-set_tests_properties(test_parallel_executor_transformer_auto_growth PROPERTIES TIMEOUT 120)
+set_tests_properties(test_parallel_executor_transformer_auto_growth
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_py_reader_using_executor PROPERTIES TIMEOUT 120)
 set_tests_properties(test_elementwise_add_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_weight_decay PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_ptb_rnn_sorted_gradient PROPERTIES TIMEOUT 120)
+set_tests_properties(test_imperative_ptb_rnn_sorted_gradient PROPERTIES TIMEOUT
+                                                                        120)
 set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_eager_deletion_lstm_net PROPERTIES TIMEOUT 120)
 set_tests_properties(test_parallel_executor_mnist PROPERTIES TIMEOUT 120)
@@ -1070,7 +1414,8 @@ set_tests_properties(test_imperative_ptb_rnn PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_save_load_v2 PROPERTIES TIMEOUT 120)
 set_tests_properties(test_conv2d_transpose_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_prroi_pool_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES TIMEOUT 120)
+set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_lstm_cudnn_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_stack_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_bilinear_interp_v2_op PROPERTIES TIMEOUT 120)
@@ -1081,14 +1426,16 @@ set_tests_properties(test_deformable_psroi_pooling PROPERTIES TIMEOUT 120)
 set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_static_runner_mnist PROPERTIES TIMEOUT 120)
 set_tests_properties(test_masked_select_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_sigmoid_cross_entropy_with_logits_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_sigmoid_cross_entropy_with_logits_op
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150)
 set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cond PROPERTIES TIMEOUT 120)
 set_tests_properties(test_space_to_depth_op PROPERTIES TIMEOUT 200)
 set_tests_properties(test_dyn_rnn PROPERTIES TIMEOUT 120)
 set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250)
-set_tests_properties(test_parallel_executor_seresnext_base_gpu PROPERTIES TIMEOUT 120)
+set_tests_properties(test_parallel_executor_seresnext_base_gpu
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_norm_nn_grad PROPERTIES TIMEOUT 180)
 set_tests_properties(test_matrix_nms_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_generator_dataloader PROPERTIES TIMEOUT 120)
@@ -1098,7 +1445,9 @@ set_tests_properties(test_softmax_with_cross_entropy_op PROPERTIES TIMEOUT 220)
 set_tests_properties(test_reduce_op PROPERTIES TIMEOUT 500)
 set_tests_properties(test_adam_optimizer_fp32_fp64 PROPERTIES TIMEOUT 120)
 set_tests_properties(test_elementwise_nn_grad PROPERTIES TIMEOUT 120)
-set_tests_properties(test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass PROPERTIES TIMEOUT 120)
+set_tests_properties(
+  test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
+  PROPERTIES TIMEOUT 120)
 set_tests_properties(test_conv_nn_grad PROPERTIES TIMEOUT 120)
 set_tests_properties(test_program_prune_backward PROPERTIES TIMEOUT 120)
 set_tests_properties(test_group_norm_op PROPERTIES TIMEOUT 120)
@@ -1123,17 +1472,20 @@ set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120)
 set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 270)
-set_tests_properties(test_fused_elemwise_activation_op PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
+set_tests_properties(test_fused_elemwise_activation_op
+                     PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
 set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200)
 set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150)
 set_tests_properties(test_imperative_resnet PROPERTIES TIMEOUT 200)
-set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES TIMEOUT 200)
+set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES TIMEOUT
+                                                                       200)
 set_tests_properties(test_imperative_se_resnext PROPERTIES TIMEOUT 200)
 set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_slice_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_strided_slice_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_translated_layer PROPERTIES TIMEOUT 120)
-set_tests_properties(test_parallel_executor_inference_feed_partial_data PROPERTIES TIMEOUT 120)
+set_tests_properties(test_parallel_executor_inference_feed_partial_data
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dataloader_keep_order PROPERTIES TIMEOUT 120)
 set_tests_properties(test_mean_op PROPERTIES TIMEOUT 120)
@@ -1142,118 +1494,155 @@ set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120)
 set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_split_program PROPERTIES TIMEOUT 120)
-if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
-    set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_dygraph_mnist PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_parallel_dygraph_se_resnext PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 350)
-    set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 350)
-    set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30)
-    set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 500)
-    set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350)
-    set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100)
-    
-    if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
-        set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 200)
-        set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 200)
-        set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height PROPERTIES TIMEOUT 150)
-    endif()
+if(WITH_DISTRIBUTE
+   AND WITH_GPU
+   AND WITH_NCCL)
+  set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT
+                                                                     120)
+  set_tests_properties(test_parallel_dygraph_mnist PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_parallel_dygraph_se_resnext PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT
+                                                                         350)
+  set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT
+                                                                     350)
+  set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_parallel_dygraph_no_sync_gradient_check
+                       PROPERTIES TIMEOUT 30)
+  set_tests_properties(test_parallel_dygraph_pipeline_parallel
+                       PROPERTIES TIMEOUT 500)
+  set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT
+                                                                        200)
+  set_tests_properties(test_parallel_dygraph_sharding_parallel
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT
+                                                                         120)
+  set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350)
+  set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT
+                                                                        120)
+  set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT
+                                                                     120)
+  set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100)
+
+  if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
+    set_tests_properties(test_parallel_dygraph_sparse_embedding
+                         PROPERTIES TIMEOUT 200)
+    set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT
+                                                                      200)
+    set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height
+                         PROPERTIES TIMEOUT 150)
+  endif()
 endif()
 
 if(APPLE)
-    set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_weight_decay PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_imperative_static_runner_mnist PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_imperative_transformer_sorted_gradient
+                       PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_weight_decay PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_imperative_static_runner_mnist PROPERTIES TIMEOUT
+                                                                      300)
 endif()
 
 if((WITH_ROCM OR WITH_GPU) AND NOT WIN32)
-    set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_alltoall_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_collective_sendrecv_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120)
-    if(WITH_DISTRIBUTE)
-        set_tests_properties(test_new_group_api PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240)
-        set_tests_properties(test_static_model_parallel_fused_feedforward PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_static_model_parallel_fused_attention PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_static_model_parallel_fused_multi_transformer PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_split_embedding
-            test_collective_split_embedding_none_divisible
-            test_collective_split_row_linear
-            test_collective_split_col_linear
-            test_collective_scatter_api
-            test_collective_barrier_api
-            test_collective_reduce_api
-            test_pipeline_parallel
-            test_collective_allreduce_api
-            test_new_group_api
-            test_collective_broadcast_api
-            test_collective_allgather_api
-            test_collective_alltoall_api
-            test_collective_global_gather
-            test_collective_global_scatter
-            PROPERTIES LABELS "RUN_TYPE=DIST")
-    endif()
-    set_tests_properties(test_paddle_multiprocessing PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_pipeline_parallel PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_allreduce PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_c_concat PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_c_split PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_allgather PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_c_identity PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_scatter PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_collective_sendrecv PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_alltoall_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_collective_sendrecv_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120)
+  if(WITH_DISTRIBUTE)
+    set_tests_properties(test_new_group_api PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240)
+    set_tests_properties(test_static_model_parallel_fused_feedforward
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_static_model_parallel_fused_attention
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_static_model_parallel_fused_multi_transformer
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(
+      test_collective_split_embedding
+      test_collective_split_embedding_none_divisible
+      test_collective_split_row_linear
+      test_collective_split_col_linear
+      test_collective_scatter_api
+      test_collective_barrier_api
+      test_collective_reduce_api
+      test_pipeline_parallel
+      test_collective_allreduce_api
+      test_new_group_api
+      test_collective_broadcast_api
+      test_collective_allgather_api
+      test_collective_alltoall_api
+      test_collective_global_gather
+      test_collective_global_scatter
+      PROPERTIES LABELS "RUN_TYPE=DIST")
+  endif()
+  set_tests_properties(test_paddle_multiprocessing PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_pipeline_parallel PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_allreduce PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_c_concat PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_c_split PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_allgather PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_c_identity PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_scatter PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_collective_sendrecv PROPERTIES TIMEOUT 120)
 endif()
 if(WITH_GPU OR WITH_ROCM)
-    set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT
+                                                                       300)
+  set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT
+                                                                        120)
+  set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120)
 endif()
 set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120)
 set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400)
-set_tests_properties(test_cuda_memory_reserved PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth")
-if (WITH_GLOO)
-    set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly PROPERTIES TIMEOUT 30)
-    set_tests_properties(test_parallel_dygraph_unused_variables_gloo PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_dygraph_sparse_embedding_gloo PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height_gloo PROPERTIES TIMEOUT 120)
+set_tests_properties(
+  test_cuda_memory_reserved PROPERTIES ENVIRONMENT
+                                       "FLAGS_allocator_strategy=auto_growth")
+if(WITH_GLOO)
+  set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly
+                       PROPERTIES TIMEOUT 30)
+  set_tests_properties(test_parallel_dygraph_unused_variables_gloo
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_parallel_dygraph_sparse_embedding_gloo
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height_gloo
+                       PROPERTIES TIMEOUT 120)
 endif()
 
 if($ENV{USE_STANDALONE_EXECUTOR})
-    # these test will fail in some server due to PR#42149, temporarily set it use old executor.
-    set_tests_properties(test_apply_pass_to_program PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-    set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-    set_tests_properties(test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-    set_tests_properties(test_imperative_optimizer PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-    set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-    set_tests_properties(test_switch_autotune PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
-    set_tests_properties(test_imperative_mnist_sorted_gradient PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  # These tests fail on some servers due to PR#42149; temporarily make them use the old executor.
+  set_tests_properties(test_apply_pass_to_program
+                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  set_tests_properties(test_buffer_shared_memory_reuse_pass
+                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  set_tests_properties(
+    test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
+    PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  set_tests_properties(test_imperative_optimizer
+                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  set_tests_properties(test_imperative_star_gan_with_gradient_penalty
+                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  set_tests_properties(test_switch_autotune
+                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
+  set_tests_properties(test_imperative_mnist_sorted_gradient
+                       PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/__init__.py b/python/paddle/fluid/tests/unittests/__init__.py
index 193b91cdaa132..e427eb512474f 100644
--- a/python/paddle/fluid/tests/unittests/__init__.py
+++ b/python/paddle/fluid/tests/unittests/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.p
 
-# Note: On Windows, import form subdirectories such as dirA()->dirB(), current directory 
+# Note: On Windows, import from subdirectories such as dirA()->dirB(), current directory
 # will still be dirA(), But is should be dirB(). So it will ModulNotFoundError
 # please refer to https://stackoverflow.com/questions/8953844/import-module-from-subfolder
 
diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py
index 851544e165980..1d3f308a61117 100644
--- a/python/paddle/fluid/tests/unittests/ascend_group.py
+++ b/python/paddle/fluid/tests/unittests/ascend_group.py
@@ -46,59 +46,55 @@ def init_communicator(startup_program, main_program, current_endpoint,
     assert group_rank >= 0
 
     block = startup_program.global_block()
-    nccl_id_var = block.create_var(
-        name=unique_name.generate('nccl_id'),
-        persistable=True,
-        type=core.VarDesc.VarType.RAW)
-    block.append_op(
-        type='c_gen_nccl_id',
-        inputs={},
-        outputs={'Out': nccl_id_var},
-        attrs={
-            'rank': group_rank,
-            'endpoint': current_endpoint,
-            'other_endpoints': other_endpoints,
-            OP_ROLE_KEY: OpRole.Forward,
-        })
-    block.append_op(
-        type='c_comm_init',
-        inputs={'X': nccl_id_var},
-        outputs={},
-        attrs={
-            'nranks': nranks,
-            'rank': group_rank,
-            'ring_id': ring_id,
-            OP_ROLE_KEY: OpRole.Forward,
-        })
+    nccl_id_var = block.create_var(name=unique_name.generate('nccl_id'),
+                                   persistable=True,
+                                   type=core.VarDesc.VarType.RAW)
+    block.append_op(type='c_gen_nccl_id',
+                    inputs={},
+                    outputs={'Out': nccl_id_var},
+                    attrs={
+                        'rank': group_rank,
+                        'endpoint': current_endpoint,
+                        'other_endpoints': other_endpoints,
+                        OP_ROLE_KEY: OpRole.Forward,
+                    })
+    block.append_op(type='c_comm_init',
+                    inputs={'X': nccl_id_var},
+                    outputs={},
+                    attrs={
+                        'nranks': nranks,
+                        'rank': group_rank,
+                        'ring_id': ring_id,
+                        OP_ROLE_KEY: OpRole.Forward,
+                    })
 
     # add input op for test
     fill_var_name = "tensor@Filled"
-    fill_var = block.create_var(
-        name=fill_var_name,
-        shape=[10, 10],
-        dtype='float32',
-        persistable=False,
-        stop_gradient=True)
-    block.append_op(
-        type="fill_constant",
-        outputs={"Out": fill_var_name},
-        attrs={
-            "shape": [10, 10],
-            "dtype": fill_var.dtype,
-            "value": 1.0,
-            "place_type": 1
-        })
+    fill_var = block.create_var(name=fill_var_name,
+                                shape=[10, 10],
+                                dtype='float32',
+                                persistable=False,
+                                stop_gradient=True)
+    block.append_op(type="fill_constant",
+                    outputs={"Out": fill_var_name},
+                    attrs={
+                        "shape": [10, 10],
+                        "dtype": fill_var.dtype,
+                        "value": 1.0,
+                        "place_type": 1
+                    })
 
     with fluid.program_guard(main_program):
         op_type = "c_allreduce_sum"
         data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5)
         helper = LayerHelper(op_type, **locals())
-        helper.append_op(
-            type=op_type,
-            inputs={'X': [data]},
-            outputs={'Out': [data]},
-            attrs={'ring_id': ring_id,
-                   'use_calc_stream': True})
+        helper.append_op(type=op_type,
+                         inputs={'X': [data]},
+                         outputs={'Out': [data]},
+                         attrs={
+                             'ring_id': ring_id,
+                             'use_calc_stream': True
+                         })
 
     print("startup program:", startup_program)
     print("main program:", main_program)
@@ -138,11 +134,10 @@ def train(world_endpoints, world_device_ids, local_device_ids, local_rank):
     main_program = main_programs[local_rank]
     loss = Loss(Block(main_program))
     optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[])
-    optimizer.minimize(
-        loss,
-        startup_program,
-        auto_dp=True,
-        rank_table_file=os.getenv("RANK_TABLE_FILE", None))
+    optimizer.minimize(loss,
+                       startup_program,
+                       auto_dp=True,
+                       rank_table_file=os.getenv("RANK_TABLE_FILE", None))
 
     exe = paddle.static.Executor(paddle.CPUPlace())
     exe.run(startup_program)
diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt
index 76856d88e1789..4fd16354e6c1a 100644
--- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt
@@ -1,4 +1,7 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_static")
@@ -6,20 +9,31 @@ list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_dynamic")
 list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_sharding")
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 
 if(WITH_DISTRIBUTE)
-    if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
-        py_test_modules(test_fleet_with_asp_dynamic MODULES test_fleet_with_asp_dynamic ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_with_asp_static MODULES test_fleet_with_asp_static ENVS ${dist_ENVS})
-    endif()
+  if(WITH_GPU
+     OR WITH_XPU
+     OR WITH_ASCEND
+     OR WITH_ASCEND_CL)
+    py_test_modules(test_fleet_with_asp_dynamic MODULES
+                    test_fleet_with_asp_dynamic ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_with_asp_static MODULES
+                    test_fleet_with_asp_static ENVS ${dist_ENVS})
+  endif()
 endif()
 
-if((WITH_DISTRIBUTE) AND (NOT WIN32) AND (NOT APPLE))
-    if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL)
-        py_test_modules(test_fleet_with_asp_sharding MODULES test_fleet_with_asp_sharding ENVS ${dist_ENVS})
-    endif()
+if((WITH_DISTRIBUTE)
+   AND (NOT WIN32)
+   AND (NOT APPLE))
+  if(WITH_GPU
+     OR WITH_XPU
+     OR WITH_ASCEND
+     OR WITH_ASCEND_CL)
+    py_test_modules(test_fleet_with_asp_sharding MODULES
+                    test_fleet_with_asp_sharding ENVS ${dist_ENVS})
+  endif()
 endif()
 
 set_tests_properties(test_asp_pruning_dynamic PROPERTIES TIMEOUT 30)
diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py
index e594bc5c34eb3..1b387c081208d 100644
--- a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py
+++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,16 +27,21 @@
 
 
 class TestASPHelperPruningBase(unittest.TestCase):
+
     def setUp(self):
         self.main_program = fluid.Program()
         self.startup_program = fluid.Program()
 
         def build_model():
-            img = fluid.data(
-                name='img', shape=[None, 3, 32, 32], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 3, 32, 32],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            hidden = fluid.layers.conv2d(
-                input=img, num_filters=4, filter_size=3, padding=2, act="relu")
+            hidden = fluid.layers.conv2d(input=img,
+                                         num_filters=4,
+                                         filter_size=3,
+                                         padding=2,
+                                         act="relu")
             hidden = fluid.layers.fc(input=hidden, size=32, act='relu')
             prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
             return img, label, prediction
@@ -57,8 +62,8 @@ def run_inference_pruning_test(self, get_mask_gen_func,
     def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func):
         with fluid.program_guard(self.main_program, self.startup_program):
             loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=self.predict, label=self.label))
+                fluid.layers.cross_entropy(input=self.predict,
+                                           label=self.label))
             optimizer = paddle.incubate.asp.decorate(
                 fluid.optimizer.SGD(learning_rate=0.01))
             optimizer.minimize(loss, self.startup_program)
@@ -74,12 +79,13 @@ def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func):
     def __pruning_and_checking(self, exe, place, mask_func_name,
                                check_func_name, with_mask):
         exe.run(self.startup_program)
-        paddle.incubate.asp.prune_model(
-            self.main_program, mask_algo=mask_func_name, with_mask=with_mask)
+        paddle.incubate.asp.prune_model(self.main_program,
+                                        mask_algo=mask_func_name,
+                                        with_mask=with_mask)
         for param in self.main_program.global_block().all_parameters():
             if ASPHelper._is_supported_layer(self.main_program, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
                     paddle.fluid.contrib.sparsity.check_sparsity(
                         mat.T, func_name=check_func_name, n=2, m=4))
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py
index dca56076dbceb..4ee7c2a99fb2f 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,6 +26,7 @@
 
 
 class MyOwnLayer(Layer):
+
     def __init__(self):
         super(MyOwnLayer, self).__init__()
 
@@ -48,6 +49,7 @@ def my_own_pruning(tensor, m, n, mask_algo, param_name):
 
 
 class TestASPAddSupportedLayer(unittest.TestCase):
+
     def test_add_supported_layer_via_name(self):
         sparsity.add_supported_layer("test_supported_1")
         sparsity.add_supported_layer("test_supported_2", my_own_pruning)
@@ -67,21 +69,25 @@ def test_add_supported_layer_via_name(self):
 
 
 class TestASPDynamicCustomerizedPruneFunc(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
         class CustomerLayer(paddle.nn.Layer):
+
             def __init__(self):
                 super(CustomerLayer, self).__init__()
 
-                self.weight = self.create_parameter(
-                    shape=[32, 32], attr=None, dtype='float32', is_bias=False)
+                self.weight = self.create_parameter(shape=[32, 32],
+                                                    attr=None,
+                                                    dtype='float32',
+                                                    is_bias=False)
                 self.linear1 = paddle.nn.Linear(32, 32)
                 self.linear2 = paddle.nn.Linear(32, 10)
 
             def forward(self, input_):
-                hidden = paddle.nn.functional.linear(
-                    x=input_, weight=self.weight)
+                hidden = paddle.nn.functional.linear(x=input_,
+                                                     weight=self.weight)
                 hidden = self.linear1(hidden)
                 out = self.linear2(hidden)
                 return out
@@ -139,8 +145,8 @@ def test_training_pruning(self):
                     self.assertLessEqual(
                         np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
                     self.assertLessEqual(
-                        np.sum(mat_mask.flatten() - static_tensor_mask.flatten(
-                        )), 1e-4)
+                        np.sum(mat_mask.flatten() -
+                               static_tensor_mask.flatten()), 1e-4)
                 else:
                     self.assertTrue(
                         sparsity.check_sparsity(
@@ -158,6 +164,7 @@ def test_training_pruning(self):
 
 
 class TestASPStaticCustomerizedPruneFunc(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -167,11 +174,15 @@ def setUp(self):
         self.customer_prefix = "customer_layer"
 
         def build_model():
-            img = fluid.data(
-                name='img', shape=[None, 3, 32, 32], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 3, 32, 32],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            hidden = fluid.layers.conv2d(
-                input=img, num_filters=4, filter_size=3, padding=2, act="relu")
+            hidden = fluid.layers.conv2d(input=img,
+                                         num_filters=4,
+                                         filter_size=3,
+                                         padding=2,
+                                         act="relu")
             hidden = fluid.layers.fc(input=hidden,
                                      size=32,
                                      act='relu',
@@ -198,15 +209,16 @@ def build_model():
     def test_inference_pruning(self):
         self.exe.run(self.startup_program)
 
-        sparsity.prune_model(
-            self.main_program, mask_algo="mask_1d", with_mask=False)
+        sparsity.prune_model(self.main_program,
+                             mask_algo="mask_1d",
+                             with_mask=False)
 
         supported_layer_count = 0
         for param in self.main_program.global_block().all_parameters():
-            mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
-            ))
-            if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
-                                                          param.name):
+            mat = np.array(fluid.global_scope().find_var(
+                param.name).get_tensor())
+            if sparsity.asp.ASPHelper._is_supported_layer(
+                    self.main_program, param.name):
                 supported_layer_count += 1
                 if (self.customer_prefix in param.name):
                     self.assertLessEqual(
@@ -223,33 +235,34 @@ def test_inference_pruning(self):
     def test_training_pruning(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=self.predict, label=self.label))
+                fluid.layers.cross_entropy(input=self.predict,
+                                           label=self.label))
             optimizer = sparsity.decorate(
                 fluid.optimizer.SGD(learning_rate=0.01))
             optimizer.minimize(loss, self.startup_program)
 
         self.exe.run(self.startup_program)
 
-        sparsity.prune_model(
-            self.main_program, mask_algo="mask_1d", with_mask=True)
+        sparsity.prune_model(self.main_program,
+                             mask_algo="mask_1d",
+                             with_mask=True)
 
         supported_layer_count = 0
         for param in self.main_program.global_block().all_parameters():
-            mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
-            ))
-            if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
-                                                          param.name):
+            mat = np.array(fluid.global_scope().find_var(
+                param.name).get_tensor())
+            if sparsity.asp.ASPHelper._is_supported_layer(
+                    self.main_program, param.name):
                 mat_mask = np.array(fluid.global_scope().find_var(
-                    sparsity.asp.ASPHelper._get_mask_name(param.name))
-                                    .get_tensor())
+                    sparsity.asp.ASPHelper._get_mask_name(
+                        param.name)).get_tensor())
                 supported_layer_count += 1
                 if (self.customer_prefix in param.name):
                     self.assertLessEqual(
                         np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
                     self.assertLessEqual(
-                        np.sum(mat_mask.flatten() - static_tensor_mask.flatten(
-                        )), 1e-4)
+                        np.sum(mat_mask.flatten() -
+                               static_tensor_mask.flatten()), 1e-4)
                 else:
                     self.assertTrue(
                         sparsity.check_sparsity(
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py
index e127dca225116..b58fea9b779b3 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,10 +24,13 @@
 
 
 class MyLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(MyLayer, self).__init__()
-        self.conv1 = paddle.nn.Conv2D(
-            in_channels=3, out_channels=2, kernel_size=3, padding=2)
+        self.conv1 = paddle.nn.Conv2D(in_channels=3,
+                                      out_channels=2,
+                                      kernel_size=3,
+                                      padding=2)
         self.linear1 = paddle.nn.Linear(1352, 32)
         self.linear2 = paddle.nn.Linear(32, 32)
         self.linear3 = paddle.nn.Linear(32, 10)
@@ -42,6 +45,7 @@ def forward(self, img):
 
 
 class TestASPDynamicOptimize(unittest.TestCase):
+
     def setUp(self):
 
         self.layer = MyLayer()
@@ -106,17 +110,14 @@ def test_asp_training(self):
 
         paddle.incubate.asp.prune_model(self.layer)
 
-        imgs = paddle.to_tensor(
-            np.random.randn(32, 3, 24, 24),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
-        labels = paddle.to_tensor(
-            np.random.randint(
-                10, size=(32, 1)),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
+        imgs = paddle.to_tensor(np.random.randn(32, 3, 24, 24),
+                                dtype='float32',
+                                place=self.place,
+                                stop_gradient=False)
+        labels = paddle.to_tensor(np.random.randint(10, size=(32, 1)),
+                                  dtype='float32',
+                                  place=self.place,
+                                  stop_gradient=False)
 
         loss_fn = paddle.nn.MSELoss(reduction='mean')
 
@@ -131,25 +132,23 @@ def test_asp_training(self):
                     paddle.static.default_main_program(), param.name):
                 mat = param.numpy()
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
     def test_asp_training_with_amp(self):
         self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
 
         paddle.incubate.asp.prune_model(self.layer)
 
-        imgs = paddle.to_tensor(
-            np.random.randn(32, 3, 24, 24),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
-        labels = paddle.to_tensor(
-            np.random.randint(
-                10, size=(32, 1)),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
+        imgs = paddle.to_tensor(np.random.randn(32, 3, 24, 24),
+                                dtype='float32',
+                                place=self.place,
+                                stop_gradient=False)
+        labels = paddle.to_tensor(np.random.randint(10, size=(32, 1)),
+                                  dtype='float32',
+                                  place=self.place,
+                                  stop_gradient=False)
 
         loss_fn = paddle.nn.MSELoss(reduction='mean')
         scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
@@ -167,8 +166,9 @@ def test_asp_training_with_amp(self):
                     paddle.static.default_main_program(), param.name):
                 mat = param.numpy()
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py
index b51e28cdcb9fc..4fdfe21de0185 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,16 +27,21 @@
 
 
 class TestASPStaticOptimize(unittest.TestCase):
+
     def setUp(self):
         self.main_program = fluid.Program()
         self.startup_program = fluid.Program()
 
         def build_model():
-            img = fluid.data(
-                name='img', shape=[None, 3, 24, 24], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 3, 24, 24],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            hidden = fluid.layers.conv2d(
-                input=img, num_filters=4, filter_size=3, padding=2, act="relu")
+            hidden = fluid.layers.conv2d(input=img,
+                                         num_filters=4,
+                                         filter_size=3,
+                                         padding=2,
+                                         act="relu")
             hidden = fluid.layers.fc(input=hidden, size=32, act='relu')
             prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
             return img, label, prediction
@@ -44,11 +49,11 @@ def build_model():
         with fluid.program_guard(self.main_program, self.startup_program):
             self.img, self.label, predict = build_model()
             self.loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict, label=self.label))
+                fluid.layers.cross_entropy(input=predict, label=self.label))
             self.optimizer = fluid.optimizer.SGD(learning_rate=0.01)
 
     def test_get_not_ASP_relevant_vars(self):
+
         def check_params(params, params_from_asp):
             if len(params_from_asp) != len(params):
                 return False
@@ -105,8 +110,8 @@ def test_is_supported_layers(self):
                 ref[i] == ASPHelper._is_supported_layer(program, name))
 
     def test_decorate(self):
-        param_names = self.__get_param_names(self.main_program.global_block()
-                                             .all_parameters())
+        param_names = self.__get_param_names(
+            self.main_program.global_block().all_parameters())
         with fluid.program_guard(self.main_program, self.startup_program):
             self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
             self.optimizer.minimize(self.loss, self.startup_program)
@@ -130,17 +135,18 @@ def test_asp_training(self):
         exe.run(self.startup_program)
         paddle.incubate.asp.prune_model(self.main_program)
 
-        data = (np.random.randn(32, 3, 24, 24), np.random.randint(
-            10, size=(32, 1)))
+        data = (np.random.randn(32, 3, 24,
+                                24), np.random.randint(10, size=(32, 1)))
         exe.run(self.main_program, feed=feeder.feed([data]))
 
         for param in self.main_program.global_block().all_parameters():
             if ASPHelper._is_supported_layer(self.main_program, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
     def test_asp_training_with_amp(self):
         if core.is_compiled_with_cuda():
@@ -152,23 +158,24 @@ def test_asp_training_with_amp(self):
                 self.optimizer.minimize(self.loss, self.startup_program)
 
             exe = fluid.Executor(place)
-            feeder = fluid.DataFeeder(
-                feed_list=[self.img, self.label], place=place)
+            feeder = fluid.DataFeeder(feed_list=[self.img, self.label],
+                                      place=place)
 
             exe.run(self.startup_program)
             paddle.incubate.asp.prune_model(self.main_program)
 
-            data = (np.random.randn(32, 3, 24, 24), np.random.randint(
-                10, size=(32, 1)))
+            data = (np.random.randn(32, 3, 24,
+                                    24), np.random.randint(10, size=(32, 1)))
             exe.run(self.main_program, feed=feeder.feed([data]))
 
             for param in self.main_program.global_block().all_parameters():
                 if ASPHelper._is_supported_layer(self.main_program, param.name):
-                    mat = np.array(fluid.global_scope().find_var(param.name)
-                                   .get_tensor())
+                    mat = np.array(fluid.global_scope().find_var(
+                        param.name).get_tensor())
                     self.assertTrue(
-                        paddle.fluid.contrib.sparsity.check_sparsity(
-                            mat.T, n=2, m=4))
+                        paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                     n=2,
+                                                                     m=4))
 
     def __get_param_names(self, params):
         param_names = []
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py
index b0fad0b64002a..fd592785a2826 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,10 +24,13 @@
 
 
 class MyLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(MyLayer, self).__init__()
-        self.conv1 = paddle.nn.Conv2D(
-            in_channels=3, out_channels=2, kernel_size=3, padding=2)
+        self.conv1 = paddle.nn.Conv2D(in_channels=3,
+                                      out_channels=2,
+                                      kernel_size=3,
+                                      padding=2)
         self.linear1 = paddle.nn.Linear(1352, 32)
         self.linear2 = paddle.nn.Linear(32, 10)
 
@@ -40,6 +43,7 @@ def forward(self, img):
 
 
 class TestASPDynamicPruningBase(unittest.TestCase):
+
     def setUp(self):
         self.layer = MyLayer()
 
@@ -47,12 +51,12 @@ def setUp(self):
         if core.is_compiled_with_cuda():
             place = paddle.CUDAPlace(0)
 
-        self.img = paddle.to_tensor(
-            np.random.uniform(
-                low=-0.5, high=0.5, size=(32, 3, 24, 24)),
-            dtype=np.float32,
-            place=place,
-            stop_gradient=False)
+        self.img = paddle.to_tensor(np.random.uniform(low=-0.5,
+                                                      high=0.5,
+                                                      size=(32, 3, 24, 24)),
+                                    dtype=np.float32,
+                                    place=place,
+                                    stop_gradient=False)
 
         self.set_config()
 
@@ -73,8 +77,9 @@ def test_training_pruning(self):
 
     def __pruning_and_checking(self, with_mask):
 
-        paddle.incubate.asp.prune_model(
-            self.layer, mask_algo=self.mask_gen_func, with_mask=with_mask)
+        paddle.incubate.asp.prune_model(self.layer,
+                                        mask_algo=self.mask_gen_func,
+                                        with_mask=with_mask)
 
         for param in self.layer.parameters():
             if ASPHelper._is_supported_layer(
@@ -86,18 +91,21 @@ def __pruning_and_checking(self, with_mask):
 
 
 class TestASPDynamicPruning1D(TestASPDynamicPruningBase):
+
     def set_config(self):
         self.mask_gen_func = 'mask_1d'
         self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D
 
 
 class TestASPDynamicPruning2DBest(TestASPDynamicPruningBase):
+
     def set_config(self):
         self.mask_gen_func = 'mask_2d_best'
         self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D
 
 
 class TestASPDynamicPruning2DGreedy(TestASPDynamicPruningBase):
+
     def set_config(self):
         self.mask_gen_func = 'mask_2d_greedy'
         self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py
index a9986f24b0265..6f137e086eb5c 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,16 +27,21 @@
 
 
 class TestASPStaticPruningBase(unittest.TestCase):
+
     def setUp(self):
         self.main_program = fluid.Program()
         self.startup_program = fluid.Program()
 
         def build_model():
-            img = fluid.data(
-                name='img', shape=[None, 3, 24, 24], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 3, 24, 24],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            hidden = fluid.layers.conv2d(
-                input=img, num_filters=2, filter_size=3, padding=2, act="relu")
+            hidden = fluid.layers.conv2d(input=img,
+                                         num_filters=2,
+                                         filter_size=3,
+                                         padding=2,
+                                         act="relu")
             hidden = fluid.layers.fc(input=hidden, size=32, act='softmax')
             prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
             return img, label, prediction
@@ -61,8 +66,8 @@ def test_inference_pruning(self):
     def test_training_pruning(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=self.predict, label=self.label))
+                fluid.layers.cross_entropy(input=self.predict,
+                                           label=self.label))
             optimizer = paddle.incubate.asp.decorate(
                 fluid.optimizer.SGD(learning_rate=0.01))
             optimizer.minimize(loss, self.startup_program)
@@ -76,32 +81,34 @@ def test_training_pruning(self):
 
     def __pruning_and_checking(self, exe, place, with_mask):
         exe.run(self.startup_program)
-        paddle.incubate.asp.prune_model(
-            self.main_program,
-            mask_algo=self.mask_gen_func,
-            with_mask=with_mask)
+        paddle.incubate.asp.prune_model(self.main_program,
+                                        mask_algo=self.mask_gen_func,
+                                        with_mask=with_mask)
         for param in self.main_program.global_block().all_parameters():
             if ASPHelper._is_supported_layer(self.main_program, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
                     paddle.fluid.contrib.sparsity.check_sparsity(
                         mat.T, func_name=self.mask_check_func, n=2, m=4))
 
 
 class TestASPStaticPruning1D(TestASPStaticPruningBase):
+
     def set_config(self):
         self.mask_gen_func = 'mask_1d'
         self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D
 
 
 class TestASPStaticPruning2DBest(TestASPStaticPruningBase):
+
     def set_config(self):
         self.mask_gen_func = 'mask_2d_best'
         self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D
 
 
 class TestASPStaticPruning2DGreedy(TestASPStaticPruningBase):
+
     def set_config(self):
         self.mask_gen_func = 'mask_2d_greedy'
         self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py
index 653cbbf84091b..710bbcc658269 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,10 +24,13 @@
 
 
 class MyLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(MyLayer, self).__init__()
-        self.conv1 = paddle.nn.Conv2D(
-            in_channels=3, out_channels=4, kernel_size=3, padding=2)
+        self.conv1 = paddle.nn.Conv2D(in_channels=3,
+                                      out_channels=4,
+                                      kernel_size=3,
+                                      padding=2)
         self.linear1 = paddle.nn.Linear(4624, 32)
         self.linear2 = paddle.nn.Linear(32, 32)
         self.linear3 = paddle.nn.Linear(32, 10)
@@ -42,6 +45,7 @@ def forward(self, img):
 
 
 class TestASPDynamicOptimize(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
@@ -69,8 +73,7 @@ def test_save_and_load(self):
         for param_name in asp_info.mask_vars:
             mask = asp_info.mask_vars[param_name]
             asp_info.update_mask_vars(
-                param_name, paddle.ones(
-                    shape=mask.shape, dtype=mask.dtype))
+                param_name, paddle.ones(shape=mask.shape, dtype=mask.dtype))
             asp_info.update_masks(param_name, np.ones(shape=mask.shape))
 
         net_state_dict = paddle.load(net_path)
@@ -79,17 +82,14 @@ def test_save_and_load(self):
         self.layer.set_state_dict(net_state_dict)
         self.optimizer.set_state_dict(opt_state_dict)
 
-        imgs = paddle.to_tensor(
-            np.random.randn(64, 3, 32, 32),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
-        labels = paddle.to_tensor(
-            np.random.randint(
-                10, size=(64, 1)),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
+        imgs = paddle.to_tensor(np.random.randn(64, 3, 32, 32),
+                                dtype='float32',
+                                place=self.place,
+                                stop_gradient=False)
+        labels = paddle.to_tensor(np.random.randint(10, size=(64, 1)),
+                                  dtype='float32',
+                                  place=self.place,
+                                  stop_gradient=False)
 
         loss_fn = paddle.nn.MSELoss(reduction='mean')
 
@@ -104,11 +104,13 @@ def test_save_and_load(self):
                     paddle.static.default_main_program(), param.name):
                 mat = param.numpy()
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 class TestASPStaticOptimize(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -116,11 +118,15 @@ def setUp(self):
         self.startup_program = fluid.Program()
 
         def build_model():
-            img = fluid.data(
-                name='img', shape=[None, 3, 32, 32], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 3, 32, 32],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-            hidden = fluid.layers.conv2d(
-                input=img, num_filters=4, filter_size=3, padding=2, act="relu")
+            hidden = fluid.layers.conv2d(input=img,
+                                         num_filters=4,
+                                         filter_size=3,
+                                         padding=2,
+                                         act="relu")
             hidden = fluid.layers.fc(input=hidden, size=32, act='relu')
             prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
             return img, label, prediction
@@ -128,8 +134,7 @@ def build_model():
         with fluid.program_guard(self.main_program, self.startup_program):
             self.img, self.label, predict = build_model()
             self.loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict, label=self.label))
+                fluid.layers.cross_entropy(input=predict, label=self.label))
             self.optimizer = fluid.optimizer.SGD(learning_rate=0.01)
             self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
             self.optimizer.minimize(self.loss, self.startup_program)
@@ -155,20 +160,21 @@ def test_save_and_load(self):
         state_dict = paddle.load(param_path)
         prog.set_state_dict(state_dict)
 
-        feeder = fluid.DataFeeder(
-            feed_list=[self.img, self.label], place=self.place)
+        feeder = fluid.DataFeeder(feed_list=[self.img, self.label],
+                                  place=self.place)
 
-        data = (np.random.randn(64, 3, 32, 32), np.random.randint(
-            10, size=(64, 1)))
+        data = (np.random.randn(64, 3, 32,
+                                32), np.random.randint(10, size=(64, 1)))
         self.exe.run(prog, feed=feeder.feed([data]))
 
         for param in prog.global_block().all_parameters():
             if ASPHelper._is_supported_layer(prog, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py
index 67ec54367d382..a65721aa0bef7 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestASPUtils(unittest.TestCase):
+
     def test_get_check_method(self):
         self.assertEqual(
             paddle.fluid.contrib.sparsity.CheckMethod.get_checking_method(
@@ -59,13 +60,13 @@ def test_get_mask_1d(self):
         for _ in range(10):
             x = np.random.randint(10, size=(5, 5))
             x = paddle.fluid.contrib.sparsity.get_mask_1d(x, 2, 4)
-            self.assertTrue(
-                paddle.fluid.contrib.sparsity.check_mask_1d(x, 2, 4))
+            self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_1d(
+                x, 2, 4))
 
             x = np.random.randn(5, 4)
             x = paddle.fluid.contrib.sparsity.get_mask_1d(x, 2, 4)
-            self.assertTrue(
-                paddle.fluid.contrib.sparsity.check_mask_1d(x, 2, 4))
+            self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_1d(
+                x, 2, 4))
 
     def test_check_mask_2d(self):
         x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 0.0, 0.0, 0.0],
@@ -82,27 +83,28 @@ def test_get_mask_2d_greedy(self):
         for _ in range(10):
             x = np.random.randint(10, size=(5, 5))
             x = paddle.fluid.contrib.sparsity.get_mask_2d_greedy(x, 2, 4)
-            self.assertTrue(
-                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
+            self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_2d(
+                x, 2, 4))
 
             x = np.random.randn(5, 4)
             x = paddle.fluid.contrib.sparsity.get_mask_2d_greedy(x, 2, 4)
-            self.assertTrue(
-                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
+            self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_2d(
+                x, 2, 4))
 
     def test_get_mask_2d_best(self):
         for _ in range(10):
             x = np.random.randint(10, size=(5, 5))
             x = paddle.fluid.contrib.sparsity.get_mask_2d_best(x, 2, 4)
-            self.assertTrue(
-                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
+            self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_2d(
+                x, 2, 4))
 
             x = np.random.randn(5, 4)
             x = paddle.fluid.contrib.sparsity.get_mask_2d_best(x, 2, 4)
-            self.assertTrue(
-                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
+            self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_2d(
+                x, 2, 4))
 
     def test_threadsafe_valid_2d_patterns(self):
+
         def get_reference(m=4, n=2):
             from itertools import permutations
 
@@ -112,8 +114,8 @@ def get_reference(m=4, n=2):
             patterns = patterns + patterns
             patterns = np.asarray(list(set(permutations(patterns, m))))
 
-            valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m
-                     ).nonzero()[0].reshape(-1)
+            valid = ((patterns.sum(axis=1) <= n).sum(
+                axis=1) == m).nonzero()[0].reshape(-1)
             valid_patterns = np.empty((valid.shape[0], m, m))
             valid_patterns[:] = patterns[valid[:]]
             return valid_patterns
@@ -131,8 +133,8 @@ def get_reference(m=4, n=2):
 
         self.assertTrue(reference_key in patterns_map)
         self.assertTrue(len(patterns_map) == 1)
-        self.assertTrue((reference_patterns == patterns_map[reference_key]).all(
-        ))
+        self.assertTrue(
+            (reference_patterns == patterns_map[reference_key]).all())
 
     def test_check_sparsity(self):
         for _ in range(10):
@@ -173,16 +175,14 @@ def __test_1D_2D_sparsity_checking_methods(self, x_2d):
                 mask,
                 func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D,
                 n=2,
-                m=4),
-            paddle.fluid.contrib.sparsity.check_mask_1d(mask, 2, 4))
+                m=4), paddle.fluid.contrib.sparsity.check_mask_1d(mask, 2, 4))
         mask = paddle.fluid.contrib.sparsity.get_mask_2d_best(x_2d, 2, 4)
         self.assertEqual(
             paddle.fluid.contrib.sparsity.check_sparsity(
                 mask,
                 func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D,
                 n=2,
-                m=4),
-            paddle.fluid.contrib.sparsity.check_mask_2d(mask, 2, 4))
+                m=4), paddle.fluid.contrib.sparsity.check_mask_2d(mask, 2, 4))
 
     def __test_1D_2D_sparse_mask_generation_methods(self, x):
         mask = paddle.fluid.contrib.sparsity.create_mask(
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py
index 3ced15bf15881..7aaf1fd33a98a 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py
@@ -22,6 +22,7 @@
 import os
 from paddle.fluid.contrib.sparsity.asp import ASPHelper
 import numpy as np
+
 cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
 if cuda_visible_devices is None or cuda_visible_devices == "":
     os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@@ -30,6 +31,7 @@
 
 
 class MyLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(MyLayer, self).__init__()
         self.linear1 = paddle.nn.Linear(32, 32)
@@ -42,6 +44,7 @@ def forward(self, x):
 
 
 class TestFleetWithASPDynamic(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
         os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
@@ -66,17 +69,14 @@ def test_with_asp(self):
         self.optimizer = fleet.distributed_optimizer(self.optimizer)
         self.layer = fleet.distributed_model(self.layer)
 
-        imgs = paddle.to_tensor(
-            np.random.randn(64, 32),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
-        labels = paddle.to_tensor(
-            np.random.randint(
-                10, size=(64, 1)),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
+        imgs = paddle.to_tensor(np.random.randn(64, 32),
+                                dtype='float32',
+                                place=self.place,
+                                stop_gradient=False)
+        labels = paddle.to_tensor(np.random.randint(10, size=(64, 1)),
+                                  dtype='float32',
+                                  place=self.place,
+                                  stop_gradient=False)
 
         loss_fn = paddle.nn.MSELoss(reduction='mean')
 
@@ -91,11 +91,13 @@ def test_with_asp(self):
                     paddle.static.default_main_program(), param.name):
                 mat = param.numpy()
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 class TestFleetWithASPAMPDynamic(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
         os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
@@ -120,17 +122,14 @@ def test_with_asp(self):
         self.optimizer = fleet.distributed_optimizer(self.optimizer)
         self.layer = fleet.distributed_model(self.layer)
 
-        imgs = paddle.to_tensor(
-            np.random.randn(64, 32),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
-        labels = paddle.to_tensor(
-            np.random.randint(
-                10, size=(64, 1)),
-            dtype='float32',
-            place=self.place,
-            stop_gradient=False)
+        imgs = paddle.to_tensor(np.random.randn(64, 32),
+                                dtype='float32',
+                                place=self.place,
+                                stop_gradient=False)
+        labels = paddle.to_tensor(np.random.randint(10, size=(64, 1)),
+                                  dtype='float32',
+                                  place=self.place,
+                                  stop_gradient=False)
 
         loss_fn = paddle.nn.MSELoss(reduction='mean')
         scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
@@ -148,8 +147,9 @@ def test_with_asp(self):
                     paddle.static.default_main_program(), param.name):
                 mat = param.numpy()
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py
index d9ddd6c88d727..1feb3e28c1370 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py
@@ -23,6 +23,7 @@
 from paddle.static import sparsity
 from paddle.fluid.contrib.sparsity.asp import ASPHelper
 import numpy as np
+
 cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
 if cuda_visible_devices is None or cuda_visible_devices == "":
     os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@@ -33,6 +34,7 @@
 
 
 class TestFleetWithASPSharding(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
         os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
@@ -49,8 +51,9 @@ def setUp(self):
 
     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
-            input_x = paddle.static.data(
-                name="x", shape=[-1, 32], dtype='float32')
+            input_x = paddle.static.data(name="x",
+                                         shape=[-1, 32],
+                                         dtype='float32')
             input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
 
             fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
@@ -84,8 +87,8 @@ def test_with_asp_sharding(self):
 
         with fluid.program_guard(train_prog, startup_prog):
             optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
         if paddle.fluid.is_compiled_with_cuda():
@@ -105,11 +108,12 @@ def test_with_asp_sharding(self):
 
         for param in train_prog.global_block().all_parameters():
             if ASPHelper._is_supported_layer(train_prog, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py
index 2023c0051401f..23110bb7ff744 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py
@@ -23,6 +23,7 @@
 from paddle.static import sparsity
 from paddle.fluid.contrib.sparsity.asp import ASPHelper
 import numpy as np
+
 cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
 if cuda_visible_devices is None or cuda_visible_devices == "":
     os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@@ -33,6 +34,7 @@
 
 
 class TestFleetWithASPStatic(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
         os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
@@ -41,8 +43,9 @@ def setUp(self):
 
     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
-            input_x = paddle.static.data(
-                name="x", shape=[-1, 32], dtype='float32')
+            input_x = paddle.static.data(name="x",
+                                         shape=[-1, 32],
+                                         dtype='float32')
             input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
 
             fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
@@ -62,12 +65,12 @@ def test_with_asp(self):
 
         with fluid.program_guard(train_prog, startup_prog):
             optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
-        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if paddle.fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
         exe = fluid.Executor(place)
         feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
@@ -80,14 +83,16 @@ def test_with_asp(self):
 
         for param in train_prog.global_block().all_parameters():
             if ASPHelper._is_supported_layer(train_prog, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 class TestFleetWithASPAMPStatic(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
         os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
@@ -96,8 +101,9 @@ def setUp(self):
 
     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
-            input_x = paddle.static.data(
-                name="x", shape=[-1, 32], dtype='float32')
+            input_x = paddle.static.data(name="x",
+                                         shape=[-1, 32],
+                                         dtype='float32')
             input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
 
             fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
@@ -118,12 +124,12 @@ def test_with_asp_and_amp(self):
 
         with fluid.program_guard(train_prog, startup_prog):
             optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
-        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if paddle.fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
         exe = fluid.Executor(place)
         feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
@@ -138,11 +144,12 @@ def test_with_asp_and_amp(self):
 
         for param in train_prog.global_block().all_parameters():
             if ASPHelper._is_supported_layer(train_prog, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
     def test_with_asp_and_pure_fp16(self):
         fleet.init(is_collective=True)
@@ -158,12 +165,12 @@ def test_with_asp_and_pure_fp16(self):
             with paddle.static.amp.fp16_guard():
                 optimizer = optimizer = paddle.optimizer.Momentum(
                     learning_rate=0.01, multi_precision=True)
-                optimizer = fleet.distributed_optimizer(
-                    optimizer, strategy=strategy)
+                optimizer = fleet.distributed_optimizer(optimizer,
+                                                        strategy=strategy)
                 optimizer.minimize(avg_cost)
 
-        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if paddle.fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
         exe = fluid.Executor(place)
         feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
@@ -178,11 +185,12 @@ def test_with_asp_and_pure_fp16(self):
 
         for param in train_prog.global_block().all_parameters():
             if ASPHelper._is_supported_layer(train_prog, param.name):
-                mat = np.array(fluid.global_scope().find_var(param.name)
-                               .get_tensor())
+                mat = np.array(fluid.global_scope().find_var(
+                    param.name).get_tensor())
                 self.assertTrue(
-                    paddle.fluid.contrib.sparsity.check_sparsity(
-                        mat.T, n=2, m=4))
+                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
+                                                                 n=2,
+                                                                 m=4))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
index 2464882d617ef..47db2793dc08f 100644
--- a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
+++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
@@ -54,6 +54,7 @@ def get_random_images_and_labels(image_shape, label_shape):
 
 
 def sample_list_generator_creator():
+
     def __reader__():
         for _ in range(BATCH_NUM):
             sample_list = []
@@ -67,19 +68,21 @@ def __reader__():
 
 
 class AutoCheckpointBase(unittest.TestCase):
+
     def _init_env(self,
                   exe,
                   main_prog,
                   startup_prog,
                   minimize=True,
                   iterable=True):
+
         def simple_net():
             image = fluid.data(name='image', shape=[-1, 4, 4], dtype='float32')
             label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
 
             fc_tmp = fluid.layers.fc(image, size=CLASS_NUM)
-            cross_entropy = fluid.layers.softmax_with_cross_entropy(fc_tmp,
-                                                                    label)
+            cross_entropy = fluid.layers.softmax_with_cross_entropy(
+                fc_tmp, label)
             loss = fluid.layers.reduce_mean(cross_entropy)
             sgd = fluid.optimizer.SGD(learning_rate=1e-3)
             if minimize:
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
index 381461130ed5c..10498bf48e9d0 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
@@ -1,35 +1,51 @@
 # file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 # string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 if(WITH_DISTRIBUTE AND WITH_GPU)
-    py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch ENVS ${dist_ENVS})
-    set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
+  py_test_modules(test_auto_parallel_relaunch MODULES
+                  test_auto_parallel_relaunch ENVS ${dist_ENVS})
+  set_tests_properties(test_auto_parallel_relaunch
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
 
-    py_test_modules(test_relaunch_with_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS})
-    set_tests_properties(test_relaunch_with_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
+  py_test_modules(test_relaunch_with_planner MODULES test_relaunch_with_planner
+                  ENVS ${dist_ENVS})
+  set_tests_properties(test_relaunch_with_planner
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
 
-    py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_gpt_planner ENVS ${dist_ENVS})
-    set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240)
+  py_test_modules(test_relaunch_with_gpt_planner MODULES
+                  test_relaunch_with_gpt_planner ENVS ${dist_ENVS})
+  set_tests_properties(test_relaunch_with_gpt_planner
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240)
 
-    py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS})
-    set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80)
+  py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS})
+  set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
+                                                  TIMEOUT 80)
 
-    py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS})
-    set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
-    py_test_modules(test_high_order_grad MODULES test_high_order_grad ENVS ${dist_ENVS})
-    set_tests_properties(test_high_order_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
+  py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS})
+  set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
+                                                 TIMEOUT 50)
+  py_test_modules(test_high_order_grad MODULES test_high_order_grad ENVS
+                  ${dist_ENVS})
+  set_tests_properties(test_high_order_grad
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
 
-    py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS})
-    py_test_modules(test_while_op_partition MODULES test_while_op_partition ENVS ${dist_ENVS})
-    py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS})
-    py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS})
-    py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS})
-    py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS})
-    py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS})
-    py_test_modules(test_dist_reshape MODULES test_dist_reshape ENVS ${dist_ENVS})
-    py_test_modules(test_dist_pnorm MODULES test_dist_pnorm ENVS ${dist_ENVS})
-    py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS})
-    py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS})
-    py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS})
-    py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS})
-    py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS})
+  py_test_modules(test_while_op_completion MODULES test_while_op_completion
+                  ENVS ${dist_ENVS})
+  py_test_modules(test_while_op_partition MODULES test_while_op_partition ENVS
+                  ${dist_ENVS})
+  py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS
+                  ${dist_ENVS})
+  py_test_modules(test_tunable_space MODULES test_tunable_space ENVS
+                  ${dist_ENVS})
+  py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS})
+  py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS})
+  py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS
+                  ${dist_ENVS})
+  py_test_modules(test_dist_reshape MODULES test_dist_reshape ENVS ${dist_ENVS})
+  py_test_modules(test_dist_pnorm MODULES test_dist_pnorm ENVS ${dist_ENVS})
+  py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS})
+  py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS})
+  py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS})
+  py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS})
+  py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS})
+  py_test_modules(test_prim_dist_op MODULES test_prim_dist_op ENVS ${dist_ENVS})
 endif()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py
index 8e5221ed5ffa6..d459ffd6d680d 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py
@@ -45,6 +45,7 @@ def get_random_inputs_and_labels(input_shape, label_shape):
 
 
 def batch_generator_creator():
+
     def __reader__():
         for _ in range(batch_size):
             batch_input, batch_label = get_random_inputs_and_labels(
@@ -56,6 +57,7 @@ def __reader__():
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -64,14 +66,18 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
@@ -90,32 +96,31 @@ def forward(self, input):
 def mlp_pretrain_forward(train_program, start_program):
     with static.program_guard(train_program,
                               start_program), utils.unique_name.guard():
-        input = static.data(
-            name="input",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
-
-        auto.shard_tensor(
-            input,
-            dist_attr={
-                "process_mesh": _global_process_mesh,
-                "dims_mappig": [-1, -1, -1]
-            })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        input = static.data(name="input",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, sequence_len, 1],
+                            dtype='float32')
+
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": _global_process_mesh,
+                              "dims_mappig": [-1, -1, -1]
+                          })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       dropout_ratio=0.1,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
         loss = paddle.mean(error_cost)
 
-        loader = paddle.io.DataLoader.from_generator(
-            feed_list=[input, label], capacity=4 * batch_size, iterable=True)
+        loader = paddle.io.DataLoader.from_generator(feed_list=[input, label],
+                                                     capacity=4 * batch_size,
+                                                     iterable=True)
 
     return loss, train_program, start_program, loader
 
@@ -138,12 +143,11 @@ def train():
     loss, train_program, start_program, loader = mlp_pretrain_forward(
         train_program, start_program)
 
-    optimizer = paddle.fluid.optimizer.AdamOptimizer(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None)
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                     beta1=0.9,
+                                                     beta2=0.999,
+                                                     epsilon=1e-08,
+                                                     grad_clip=None)
 
     optimizer = fleet.distributed_optimizer(optimizer)
     _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py
index 014a8048364fe..6bd48fb1963ed 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py
@@ -22,6 +22,7 @@
 import paddle.distributed.auto_parallel as auto
 from auto_parallel_relaunch_model import mlp_pretrain_forward
 from auto_parallel_relaunch_model import batch_generator_creator
+
 sys.path.append("..")
 import auto_parallel_gpt_model as modeling
 from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion
@@ -31,41 +32,44 @@ def get_gpt_model(train_program, start_program, place, batch_size, sequence_len,
                   vocab_size):
     modeling.init_global()
     with static.program_guard(train_program, start_program):
-        tokens = paddle.static.data(
-            name="tokens", shape=[batch_size, sequence_len], dtype='int64')
-        position_ids = paddle.static.data(
-            name="position_ids",
-            shape=[batch_size, sequence_len],
-            dtype='int64')
+        tokens = paddle.static.data(name="tokens",
+                                    shape=[batch_size, sequence_len],
+                                    dtype='int64')
+        position_ids = paddle.static.data(name="position_ids",
+                                          shape=[batch_size, sequence_len],
+                                          dtype='int64')
         attention_mask = paddle.static.data(
             name="attention_mask",
             shape=[batch_size, 1, sequence_len, sequence_len],
             dtype='float32')
-        labels = paddle.static.data(
-            name="labels", shape=[batch_size, sequence_len], dtype='int64')
-        loss_mask = paddle.static.data(
-            name="loss_mask", shape=[batch_size, sequence_len], dtype='float32')
+        labels = paddle.static.data(name="labels",
+                                    shape=[batch_size, sequence_len],
+                                    dtype='int64')
+        loss_mask = paddle.static.data(name="loss_mask",
+                                       shape=[batch_size, sequence_len],
+                                       dtype='float32')
         data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]
 
-        gpt = GPTModel(
-            vocab_size=1000,
-            hidden_size=64,
-            num_hidden_layers=2,
-            num_attention_heads=8,
-            intermediate_size=256,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.0,
-            attention_probs_dropout_prob=0.0,
-            max_position_embeddings=1024,
-            type_vocab_size=1,
-            initializer_range=0.02,
-            pad_token_id=0,
-            eos_token_id=7,
-            bos_token_id=0,
-            eol_token_id=3)
-
-        model = GPTForPretraining(
-            gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02)
+        gpt = GPTModel(vocab_size=1000,
+                       hidden_size=64,
+                       num_hidden_layers=2,
+                       num_attention_heads=8,
+                       intermediate_size=256,
+                       hidden_act="gelu",
+                       hidden_dropout_prob=0.0,
+                       attention_probs_dropout_prob=0.0,
+                       max_position_embeddings=1024,
+                       type_vocab_size=1,
+                       initializer_range=0.02,
+                       pad_token_id=0,
+                       eos_token_id=7,
+                       bos_token_id=0,
+                       eol_token_id=3)
+
+        model = GPTForPretraining(gpt,
+                                  vocab_size=1000,
+                                  hidden_size=64,
+                                  initializer_range=0.02)
         preds = model(tokens, position_ids, attention_mask)
         criterion = GPTPretrainingCriterion()
         loss = criterion(preds, labels, loss_mask)
@@ -105,12 +109,11 @@ def train():
         train_program, start_program, place, batch_size, sequence_len,
         vocab_size)
 
-    optimizer = paddle.fluid.optimizer.AdamOptimizer(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None)
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                     beta1=0.9,
+                                                     beta2=0.999,
+                                                     epsilon=1e-08,
+                                                     grad_clip=None)
     optimizer = fleet.distributed_optimizer(optimizer)
     _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
         loss, start_program)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_planner.py
index a93663cb95ed0..20d45e32b7a02 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_planner.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_planner.py
@@ -29,12 +29,11 @@ def train():
     loss, train_program, start_program, loader = mlp_pretrain_forward(
         train_program, start_program)
 
-    optimizer = paddle.fluid.optimizer.AdamOptimizer(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None)
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                     beta1=0.9,
+                                                     beta2=0.999,
+                                                     epsilon=1e-08,
+                                                     grad_clip=None)
 
     optimizer = fleet.distributed_optimizer(optimizer)
     _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
index 66addd1be085b..e6a730f0a64d6 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
@@ -47,6 +47,7 @@
 
 
 class MyDataset(Dataset):
+
     def __init__(self, num_samples):
         super(MyDataset, self).__init__()
         self.num_samples = num_samples
@@ -61,6 +62,7 @@ def __len__(self):
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -69,45 +71,46 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
 
     def forward(self, input):
-        out = auto.shard_op(
-            self.norm, dist_attr={"process_mesh": PP_MESH_0})(input)[0]
+        out = auto.shard_op(self.norm, dist_attr={"process_mesh":
+                                                  PP_MESH_0})(input)[0]
         out = self.linear0(input)
         out = F.gelu(out, approximate=True)
-        out = auto.shard_op(
-            self.linear1, dist_attr={"process_mesh": PP_MESH_1})(out)[0]
+        out = auto.shard_op(self.linear1, dist_attr={"process_mesh":
+                                                     PP_MESH_1})(out)[0]
         out = self.dropout(out)
         out = self.linear2(out)
         return out
 
 
 def train():
-    mlp = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        dropout_ratio=0.1,
-        initializer_range=0.02)
+    mlp = MLPLayer(hidden_size=hidden_size,
+                   intermediate_size=4 * hidden_size,
+                   dropout_ratio=0.1,
+                   initializer_range=0.02)
     loss = paddle.nn.CrossEntropyLoss()
-    optimizer = paddle.fluid.optimizer.AdamOptimizer(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None)
-
-    dataset = MyDataset(batch_num * batch_size)
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                     beta1=0.9,
+                                                     beta2=0.999,
+                                                     epsilon=1e-08,
+                                                     grad_clip=None)
+
     inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x')
     labels_spec = InputSpec([batch_size], 'int64', 'label')
 
@@ -119,23 +122,28 @@ def train():
     dist_strategy.semi_auto = True
     fleet.init(is_collective=True, strategy=dist_strategy)
 
-    engine = Engine(
-        mlp,
-        inputs_spec=inputs_spec,
-        labels_spec=labels_spec,
-        strategy=dist_strategy)
-    engine.prepare(optimizer, loss)
-    engine.fit(dataset,
+    # init engine
+    engine = Engine(mlp,
+                    inputs_spec=inputs_spec,
+                    labels_spec=labels_spec,
+                    strategy=dist_strategy)
+    engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy())
+
+    # train
+    train_dataset = MyDataset(batch_num * batch_size)
+    engine.fit(train_dataset,
                batch_size=batch_size,
                steps_per_epoch=batch_num * batch_size)
 
+    # eval
     eval_dataset = MyDataset(batch_size)
-    engine.prepare(optimizer, loss, mode='eval')
     engine.evaluate(eval_dataset, batch_size)
 
+    # predict
     test_dataset = MyDataset(batch_size)
-    engine.prepare(mode='predict')
     engine.predict(test_dataset, batch_size)
+
+    # save
     engine.save('./mlp_inf', training=False, mode='predict')
 
 
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py
index cc0acae2fb1c1..1de44e91a78df 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py
@@ -28,6 +28,7 @@
 
 
 class FCNet:
+
     def __init__(self, num_ins, num_outs, num_layers, hidden_size):
         self.num_ins = num_ins
         self.num_outs = num_outs
@@ -48,10 +49,12 @@ def __init__(self, num_ins, num_outs, num_layers, hidden_size):
                 lsize = self.hidden_size
                 rsize = self.hidden_size
 
-            w = paddle.static.create_parameter(
-                shape=[lsize, rsize], dtype="float32", is_bias=False)
-            b = paddle.static.create_parameter(
-                shape=[rsize], dtype="float32", is_bias=True)
+            w = paddle.static.create_parameter(shape=[lsize, rsize],
+                                               dtype="float32",
+                                               is_bias=False)
+            b = paddle.static.create_parameter(shape=[rsize],
+                                               dtype="float32",
+                                               is_bias=True)
             self.weights.append(w)
             self.biases.append(b)
 
@@ -65,13 +68,13 @@ def nn_func(self, ins):
 
 
 class LaplaceModel(paddle.nn.Layer):
+
     def __init__(self, num_ins=2, num_outs=1, num_layers=5, hidden_size=20):
         super(LaplaceModel, self).__init__()
-        self.net = FCNet(
-            num_ins=num_ins,
-            num_outs=num_outs,
-            num_layers=num_layers,
-            hidden_size=hidden_size)
+        self.net = FCNet(num_ins=num_ins,
+                         num_outs=num_outs,
+                         num_layers=num_layers,
+                         hidden_size=hidden_size)
 
     def forward(self, inputs, bc_index):
         inputs.stop_gradient = False
@@ -85,6 +88,7 @@ def forward(self, inputs, bc_index):
 
 
 class LaplaceDataset:
+
     def __init__(self, num_sample):
         self.num_sample = num_sample
 
@@ -127,7 +131,8 @@ def main():
 
     # spec
     inputs_spec = [
-        InputSpec([100, 2], 'float32', 'x'), InputSpec([36], 'int64', 'bc_idx')
+        InputSpec([100, 2], 'float32', 'x'),
+        InputSpec([36], 'int64', 'bc_idx')
     ]
     labels_spec = InputSpec([36, 1], 'float32', 'bc_v')
 
@@ -135,11 +140,10 @@ def main():
     dist_strategy.semi_auto = True
     fleet.init(is_collective=True, strategy=dist_strategy)
 
-    engine = Engine(
-        laplace,
-        inputs_spec=inputs_spec,
-        labels_spec=labels_spec,
-        strategy=dist_strategy)
+    engine = Engine(laplace,
+                    inputs_spec=inputs_spec,
+                    labels_spec=labels_spec,
+                    strategy=dist_strategy)
     engine.prepare(optimizer=optimizer, loss=loss_func)
     res = engine.fit(train_dataset, batch_size=None)
 
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/launch.py b/python/paddle/fluid/tests/unittests/auto_parallel/launch.py
index c225fe85cd844..ee9ff484523d3 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/launch.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/launch.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py
index 321b262286218..4ff72173382da 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py
@@ -79,6 +79,7 @@
 
 
 class TestAutoParallelReLaunch(unittest.TestCase):
+
     def test_relaunch(self):
         file_dir = os.path.dirname(os.path.abspath(__file__))
         cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json")
@@ -105,8 +106,8 @@ def test_relaunch(self):
         # Remove unnecessary files
         if os.path.exists(cluster_json_path):
             os.remove(cluster_json_path)
-        rank_mapping_json_path = os.path.join(file_dir,
-                                              "auto_parallel_rank_mapping.json")
+        rank_mapping_json_path = os.path.join(
+            file_dir, "auto_parallel_rank_mapping.json")
         if os.path.exists(rank_mapping_json_path):
             os.remove(rank_mapping_json_path)
         log_path = os.path.join(file_dir, "log")
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py
index dc22263b52040..5b6f898d5b7d1 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py
@@ -1967,6 +1967,7 @@
 
 
 class TestCluster(unittest.TestCase):
+
     def test_single_machine(self):
         # Build cluster
         file_dir = os.path.dirname(os.path.abspath(__file__))
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py
index 898408becacdf..0d3f193e8bce8 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py
@@ -31,6 +31,7 @@
 
 
 class TestCommOpCost(unittest.TestCase):
+
     def test_comm_cost(self):
         # Build cluster
         file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -47,9 +48,10 @@ def test_comm_cost(self):
         comm_context = CommContext(cluster)
 
         # Check AllreduceSumCost 128MB ring cost
-        allreduce_sum_op_desc = build_comm_desc(
-            "c_allreduce_sum", [0, 1, 2, 3, 4, 5, 6, 7], paddle.float32,
-            [1, 32 * (10**6)])
+        allreduce_sum_op_desc = build_comm_desc("c_allreduce_sum",
+                                                [0, 1, 2, 3, 4, 5, 6, 7],
+                                                paddle.float32,
+                                                [1, 32 * (10**6)])
         allreduce_sum_op_cost = AllreduceSumOpCost(
             op_desc=allreduce_sum_op_desc, comm_context=comm_context)
 
@@ -57,37 +59,37 @@ def test_comm_cost(self):
         allgather_op_desc = build_comm_desc("c_allgather",
                                             [0, 1, 2, 3, 4, 5, 6, 7],
                                             paddle.float32, [1, 32 * (10**6)])
-        allgather_op_cost = AllgatherOpCost(
-            op_desc=allgather_op_desc, comm_context=comm_context)
+        allgather_op_cost = AllgatherOpCost(op_desc=allgather_op_desc,
+                                            comm_context=comm_context)
         self.assertTrue(allgather_op_cost.time > 0)
 
         # Check BroadcastOpCost cost
         broadcast_op_desc = build_comm_desc("c_broadcast",
                                             [0, 1, 2, 3, 4, 5, 6, 7],
                                             paddle.float32, [1, 32 * (10**6)])
-        broadcast_op_cost = BroadcastOpCost(
-            op_desc=broadcast_op_desc, comm_context=comm_context)
+        broadcast_op_cost = BroadcastOpCost(op_desc=broadcast_op_desc,
+                                            comm_context=comm_context)
         self.assertTrue(broadcast_op_cost.time > 0)
 
         # Check SendOpCost cost
         send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32,
                                        [1, 32 * (10**6)])
-        send_op_cost = SendOpCost(
-            op_desc=send_op_desc, comm_context=comm_context)
+        send_op_cost = SendOpCost(op_desc=send_op_desc,
+                                  comm_context=comm_context)
         self.assertTrue(send_op_cost.time > 0)
 
         # Check RecvOpCost cost
         recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32,
                                        [1, 32 * (10**6)])
-        recv_op_cost = RecvOpCost(
-            op_desc=recv_op_desc, comm_context=comm_context)
+        recv_op_cost = RecvOpCost(op_desc=recv_op_desc,
+                                  comm_context=comm_context)
         self.assertTrue(recv_op_cost.time > 0)
 
         # Check IdentityOpCost cost
         identity_op_desc = build_comm_desc("c_identity", [0, 1], paddle.float32,
                                            [1, 32 * (10**6)])
-        identity_op_cost = IdentityOpCost(
-            op_desc=identity_op_desc, comm_context=comm_context)
+        identity_op_cost = IdentityOpCost(op_desc=identity_op_desc,
+                                          comm_context=comm_context)
         self.assertTrue(identity_op_cost.time >= 0)
 
         # Remove unnecessary files
@@ -122,8 +124,8 @@ def test_cross_machine_comm_cost(self):
             "c_allgather",
             [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
             paddle.float32, [1, 32 * (10**6)])
-        allgather_op_cost = AllgatherOpCost(
-            op_desc=allgather_op_desc, comm_context=comm_context)
+        allgather_op_cost = AllgatherOpCost(op_desc=allgather_op_desc,
+                                            comm_context=comm_context)
         self.assertTrue(allgather_op_cost.time > 0)
 
         # Check BroadcastOpCost cost
@@ -131,22 +133,22 @@ def test_cross_machine_comm_cost(self):
             "c_broadcast",
             [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
             paddle.float32, [1, 32 * (10**6)])
-        broadcast_op_cost = BroadcastOpCost(
-            op_desc=broadcast_op_desc, comm_context=comm_context)
+        broadcast_op_cost = BroadcastOpCost(op_desc=broadcast_op_desc,
+                                            comm_context=comm_context)
         self.assertTrue(broadcast_op_cost.time > 0)
 
         # Check SendOpCost cost
         send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32,
                                        [1, 32 * (10**6)])
-        send_op_cost = SendOpCost(
-            op_desc=send_op_desc, comm_context=comm_context)
+        send_op_cost = SendOpCost(op_desc=send_op_desc,
+                                  comm_context=comm_context)
         self.assertTrue(send_op_cost.time > 0)
 
         # Check RecvOpCost cost
         recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32,
                                        [1, 32 * (10**6)])
-        recv_op_cost = RecvOpCost(
-            op_desc=recv_op_desc, comm_context=comm_context)
+        recv_op_cost = RecvOpCost(op_desc=recv_op_desc,
+                                  comm_context=comm_context)
         self.assertTrue(recv_op_cost.time > 0)
 
         # Remove unnecessary files
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py
index af7a44b5aaa23..8472354826d2f 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py
@@ -88,6 +88,7 @@
 
 
 class TestCompOpCost(unittest.TestCase):
+
     def test_comp_cost(self):
         # Build cluster
         file_dir = os.path.dirname(os.path.abspath(__file__))
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
index fbadbb7d8c1cf..22abd6d799554 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py
@@ -22,6 +22,7 @@
 
 
 class TestConverter(unittest.TestCase):
+
     def test_converter(self):
         file_dir = os.path.dirname(os.path.abspath(__file__))
         launch_model_path = os.path.join(file_dir, "converter.py")
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py
index f7718e584f5e1..24b056e9f5b64 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py
@@ -43,6 +43,7 @@ def get_random_inputs_and_labels(input_shape, label_shape):
 
 
 def batch_generator_creator():
+
     def __reader__():
         for _ in range(batch_size):
             batch_input, batch_label = get_random_inputs_and_labels(
@@ -54,6 +55,7 @@ def __reader__():
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -62,8 +64,8 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        param_initializer = nn.initializer.Normal(
-            mean=0.0, std=initializer_range)
+        param_initializer = nn.initializer.Normal(mean=0.0,
+                                                  std=initializer_range)
 
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.linear0 = nn.Linear(
@@ -79,20 +81,18 @@ def __init__(self,
 
     def forward(self, input):
         out = self.norm(input)
-        auto.shard_tensor(
-            self.linear0.weight,
-            dist_attr={
-                "process_mesh": _g_process_mesh[0],
-                "dims_mapping": [-1, 0]
-            })
+        auto.shard_tensor(self.linear0.weight,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[0],
+                              "dims_mapping": [-1, 0]
+                          })
         out = self.linear0(out)
         out = F.gelu(out, approximate=True)
-        auto.shard_tensor(
-            self.linear1.weight,
-            dist_attr={
-                "process_mesh": _g_process_mesh[1],
-                "dims_mapping": [0, -1]
-            })
+        auto.shard_tensor(self.linear1.weight,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[1],
+                              "dims_mapping": [0, -1]
+                          })
         out = self.linear1(out)
 
         return out
@@ -107,62 +107,58 @@ def get_program():
     start_program = static.Program()
     with static.program_guard(train_program, start_program):
         # input
-        input = static.data(
-            name="input",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, sequence_len, 1],
+                            dtype='float32')
         data_holder = [input, label]
         # dataloader
-        dataloader = paddle.io.DataLoader.from_generator(
-            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
-        dataloader.set_batch_generator(
-            batch_generator_creator(), places=paddle.static.cuda_places())
+        dataloader = paddle.io.DataLoader.from_generator(feed_list=data_holder,
+                                                         capacity=4 *
+                                                         batch_size,
+                                                         iterable=False)
+        dataloader.set_batch_generator(batch_generator_creator(),
+                                       places=paddle.static.cuda_places())
         # data dist_attr
-        auto.shard_tensor(
-            input,
-            dist_attr={
-                "process_mesh": _g_process_mesh[0],
-                "dims_mapping": [0, -1, -1]
-            })
-        auto.shard_tensor(
-            label,
-            dist_attr={
-                "process_mesh": _g_process_mesh[0],
-                "dims_mapping": [0, -1, -1]
-            })
-
-        mlp_start = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[0],
+                              "dims_mapping": [0, -1, -1]
+                          })
+        auto.shard_tensor(label,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[0],
+                              "dims_mapping": [0, -1, -1]
+                          })
+
+        mlp_start = MLPLayer(hidden_size=hidden_size,
+                             intermediate_size=4 * hidden_size,
+                             dropout_ratio=0.1,
+                             initializer_range=0.02)
         pred = mlp_start(input)
 
-        mlp_mid = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        mlp_mid = MLPLayer(hidden_size=hidden_size,
+                           intermediate_size=4 * hidden_size,
+                           dropout_ratio=0.1,
+                           initializer_range=0.02)
         pred = mlp_mid(pred)
 
-        mlp_end = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        mlp_end = MLPLayer(hidden_size=hidden_size,
+                           intermediate_size=4 * hidden_size,
+                           dropout_ratio=0.1,
+                           initializer_range=0.02)
         pred = mlp_end(pred)
 
         error_cost = paddle.nn.functional.square_error_cost(pred, label)
         loss = paddle.mean(error_cost)
 
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=None)
+        optimizer = paddle.optimizer.Adam(learning_rate=0.00001,
+                                          beta1=0.9,
+                                          beta2=0.999,
+                                          epsilon=1e-08,
+                                          grad_clip=None)
 
         feed_vars = {"inputs": [input], "labels": [label]}
         fetch_vars = {"loss": [loss]}
@@ -171,6 +167,7 @@ def get_program():
 
 
 class TestDistributedContext(unittest.TestCase):
+
     def test_backup_restore(self):
         train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program(
         )
@@ -180,18 +177,16 @@ def test_backup_restore(self):
         dist_context.initialize()
 
         dist_context._backup(serial=True, dist=True)
-        dist_context._restore(
-            serial=True,
-            serial_mode="to_backup",
-            dist=True,
-            dist_mode="to_backup")
+        dist_context._restore(serial=True,
+                              serial_mode="to_backup",
+                              dist=True,
+                              dist_mode="to_backup")
 
         dist_context._backup(serial=True, dist=True)
-        dist_context._restore(
-            serial=True,
-            serial_mode="to_original",
-            dist=True,
-            dist_mode="to_original")
+        dist_context._restore(serial=True,
+                              serial_mode="to_original",
+                              dist=True,
+                              dist_mode="to_original")
 
         dist_context._backup(serial=True, dist=True)
         dist_context._restore(serial=True, dist=True, dist_mode="to_default")
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py
index 946f33b7e4f31..74664062303f3 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py
@@ -29,12 +29,11 @@ def make_program_dp2():
     with paddle.static.program_guard(main_program, start_program):
         x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
         x.stop_gradient = False
-        auto.shard_tensor(
-            x,
-            dist_attr={
-                "process_mesh": auto.ProcessMesh([0, 1]),
-                "dims_mapping": [0, -1, -1]
-            })
+        auto.shard_tensor(x,
+                          dist_attr={
+                              "process_mesh": auto.ProcessMesh([0, 1]),
+                              "dims_mapping": [0, -1, -1]
+                          })
         tmp_0 = paddle.norm(x, p=2)
     return main_program, start_program, tmp_0
 
@@ -45,12 +44,11 @@ def make_program_serial():
     with paddle.static.program_guard(main_program, start_program):
         x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
         x.stop_gradient = False
-        auto.shard_tensor(
-            x,
-            dist_attr={
-                "process_mesh": auto.ProcessMesh([0]),
-                "dims_mapping": [-1, -1, -1]
-            })
+        auto.shard_tensor(x,
+                          dist_attr={
+                              "process_mesh": auto.ProcessMesh([0]),
+                              "dims_mapping": [-1, -1, -1]
+                          })
         tmp_0 = paddle.norm(x, p=2)
     return main_program, start_program, tmp_0
 
@@ -81,6 +79,7 @@ def parallelizer(program_func, rank):
 
 
 class TestDistPNorm(unittest.TestCase):
+
     def test_dist_pnorm_dp2(self):
 
         for rank in range(2):
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py
index 8777bf3ff1f2e..60b43ef9fe3bc 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py
@@ -29,12 +29,11 @@ def make_program_dp2():
     with paddle.static.program_guard(main_program, start_program):
         x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32')
         x.stop_gradient = False
-        auto.shard_tensor(
-            x,
-            dist_attr={
-                "process_mesh": auto.ProcessMesh([0, 1]),
-                "dims_mapping": [0, -1, -1]
-            })
+        auto.shard_tensor(x,
+                          dist_attr={
+                              "process_mesh": auto.ProcessMesh([0, 1]),
+                              "dims_mapping": [0, -1, -1]
+                          })
         tmp_0 = paddle.reshape(x, shape=[0, 0, 4, 2])
         tmp_1 = paddle.reshape(tmp_0, shape=[0, 0, 8])
         tmp_2 = tmp_1.reshape((tmp_1.shape[0], tmp_1.shape[1], -1))
@@ -61,6 +60,7 @@ def parallelizer(program_func, rank):
 
 
 class TestDistReshape(unittest.TestCase):
+
     def test_dist_reshape_mp2(self):
 
         for rank in range(2):
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py
index 8af055a09a343..e12fd0f922a5e 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py
@@ -25,12 +25,11 @@ def make_program_dp2():
     start_program = paddle.fluid.Program()
     with paddle.static.program_guard(main_program, start_program):
         x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
-        auto.shard_tensor(
-            x,
-            dist_attr={
-                "process_mesh": auto.ProcessMesh([0, 1]),
-                "dims_mapping": [0, -1, -1]
-            })
+        auto.shard_tensor(x,
+                          dist_attr={
+                              "process_mesh": auto.ProcessMesh([0, 1]),
+                              "dims_mapping": [0, -1, -1]
+                          })
         tmp_0 = x[0]
         tmp_1 = x[:, 0, :]
         tmp_2 = x[:, :, 1]
@@ -43,12 +42,11 @@ def make_program_serial():
     start_program = paddle.fluid.Program()
     with paddle.static.program_guard(main_program, start_program):
         x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
-        auto.shard_tensor(
-            x,
-            dist_attr={
-                "process_mesh": auto.ProcessMesh([0]),
-                "dims_mapping": [-1, -1, -1]
-            })
+        auto.shard_tensor(x,
+                          dist_attr={
+                              "process_mesh": auto.ProcessMesh([0]),
+                              "dims_mapping": [-1, -1, -1]
+                          })
         tmp_0 = x[0]
         tmp_1 = x[:, 0, :]
         tmp_2 = x[:, :, 1]
@@ -78,6 +76,7 @@ def parallelizer(program_func, rank):
 
 
 class TestDistSlice(unittest.TestCase):
+
     def test_dist_slice_dp2(self):
         for rank in range(2):
             dist_main_prog, dist_context = parallelizer(make_program_dp2, rank)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
index efcad7eb11268..b8ad54cbb79e1 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
@@ -21,6 +21,7 @@
 
 
 class TestEngineAPI(unittest.TestCase):
+
     def test_engine_api(self):
         file_dir = os.path.dirname(os.path.abspath(__file__))
         launch_model_path = os.path.join(file_dir, "engine_api.py")
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py
index ab4a34cf99cbf..9fb1c22d76cbf 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py
@@ -21,6 +21,7 @@
 
 
 class TestHighOrderGrad(unittest.TestCase):
+
     def test_dp2(self):
         file_dir = os.path.dirname(os.path.abspath(__file__))
         launch_model_path = os.path.join(file_dir, "high_order_grad.py")
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py
index c0df01ada58f9..911f20f114912 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py
@@ -35,6 +35,7 @@ def check_cost(cost):
 
 
 class TestCost(unittest.TestCase):
+
     def test_base_cost(self):
         cost = cost_model.Cost(memory=100, flops=200, time=0.5)
         self.assertTrue(check_cost(cost))
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py
index f9ab6f37f3ce7..67894f6dd93df 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py
@@ -33,6 +33,7 @@
 
 
 class TestPrimDistOp(unittest.TestCase):
+
     def setUp(self):
         self.main_program = paddle.static.Program()
         self.startup_program = paddle.static.Program()
@@ -45,41 +46,42 @@ def setUp(self):
     def init_prog(self):
         # block = self.main_program.global_block()
         # block = self.main_program.global_block()
-        self.w = self.layer_help.create_parameter(
-            dtype="float", shape=[20], attr=None)
-        self.w_grad = paddle.static.data(
-            name='w_grad', shape=[20], dtype='float')
+        self.w = self.layer_help.create_parameter(dtype="float",
+                                                  shape=[20],
+                                                  attr=None)
+        self.w_grad = paddle.static.data(name='w_grad',
+                                         shape=[20],
+                                         dtype='float')
         self.tmp1 = paddle.static.data(name='tmp1', shape=[20], dtype='float')
         self.tmp2 = paddle.static.data(name='tmp2', shape=[20], dtype='float')
-        self.batch_reduced = paddle.static.data(
-            name='batch_reduced', shape=[1], dtype='float')
+        self.batch_reduced = paddle.static.data(name='batch_reduced',
+                                                shape=[1],
+                                                dtype='float')
         self.attrs = {}
 
         default_dist_context = get_default_distributed_context()
         _global_process_mesh = auto.ProcessMesh(list(range(nranks)))
-        tensor_dist_attr = set_var_dist_attr(
-            default_dist_context,
-            self.tmp1, [-1],
-            _global_process_mesh,
-            mark_annotated=True)
-        tensor_dist_attr = set_var_dist_attr(
-            default_dist_context,
-            self.tmp1, [-1],
-            _global_process_mesh,
-            mark_annotated=True)
-
-        op = self.layer_help.append_op(
-            type="add_p",
-            inputs={'X': self.tmp1,
-                    'Y': self.w},
-            outputs={'Z': self.w_grad},
-            attrs=self.attrs)
-
-        op = self.layer_help.append_op(
-            type="reduce_p",
-            inputs={'X': self.tmp2},
-            outputs={'Y': self.batch_reduced},
-            attrs={"axis": [0]})
+        tensor_dist_attr = set_var_dist_attr(default_dist_context,
+                                             self.tmp1, [-1],
+                                             _global_process_mesh,
+                                             mark_annotated=True)
+        tensor_dist_attr = set_var_dist_attr(default_dist_context,
+                                             self.tmp1, [-1],
+                                             _global_process_mesh,
+                                             mark_annotated=True)
+
+        op = self.layer_help.append_op(type="add_p",
+                                       inputs={
+                                           'X': self.tmp1,
+                                           'Y': self.w
+                                       },
+                                       outputs={'Z': self.w_grad},
+                                       attrs=self.attrs)
+
+        op = self.layer_help.append_op(type="reduce_p",
+                                       inputs={'X': self.tmp2},
+                                       outputs={'Y': self.batch_reduced},
+                                       attrs={"axis": [0]})
 
     def test_loss_and_grad_allreduce(self):
 
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py
index ab704a6a25714..d9594b951983f 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py
@@ -19,6 +19,7 @@
 
 
 class TestRecorder(unittest.TestCase):
+
     def test_register(self):
         recorder = rd.MetricsRecorder()
         recorder.register("metric")
@@ -34,8 +35,8 @@ def test_update(self):
         recorder = rd.MetricsRecorder()
         recorder.update("metric", 4, 1000)
         self.assertEqual(recorder.records["metric"].direction, "min")
-        self.assertEqual(
-            recorder.get_records("metric"), [rd.MetricRecord(4, 1000)])
+        self.assertEqual(recorder.get_records("metric"),
+                         [rd.MetricRecord(4, 1000)])
 
     def test_get_records(self):
         recorder = rd.MetricsRecorder()
@@ -43,13 +44,12 @@ def test_get_records(self):
         recorder.update("metric", 2, step=1)
         recorder.update("metric", 3, step=2)
         recorder.update("metric", 4, step=3)
-        self.assertEqual(
-            recorder.get_records("metric"), [
-                rd.MetricRecord(1, 0),
-                rd.MetricRecord(2, 1),
-                rd.MetricRecord(3, 2),
-                rd.MetricRecord(4, 3),
-            ])
+        self.assertEqual(recorder.get_records("metric"), [
+            rd.MetricRecord(1, 0),
+            rd.MetricRecord(2, 1),
+            rd.MetricRecord(3, 2),
+            rd.MetricRecord(4, 3),
+        ])
 
     def test_set_records(self):
         recorder = rd.MetricsRecorder()
@@ -60,14 +60,14 @@ def test_set_records(self):
                 rd.MetricRecord(2, 1),
                 rd.MetricRecord(3, 2),
                 rd.MetricRecord(4, 3),
-            ], )
-        self.assertEqual(
-            recorder.get_records("metric"), [
-                rd.MetricRecord(1, 0),
-                rd.MetricRecord(2, 1),
-                rd.MetricRecord(3, 2),
-                rd.MetricRecord(4, 3),
-            ])
+            ],
+        )
+        self.assertEqual(recorder.get_records("metric"), [
+            rd.MetricRecord(1, 0),
+            rd.MetricRecord(2, 1),
+            rd.MetricRecord(3, 2),
+            rd.MetricRecord(4, 3),
+        ])
 
     def test_get_best_value(self):
         recorder = rd.MetricsRecorder()
@@ -81,7 +81,8 @@ def test_get_best_value(self):
                 rd.MetricRecord(2, 1),
                 rd.MetricRecord(3, 2),
                 rd.MetricRecord(4, 3),
-            ], )
+            ],
+        )
         self.assertEqual(recorder.get_best_value("metric_min"), 1)
 
         recorder.set_records(
@@ -91,7 +92,8 @@ def test_get_best_value(self):
                 rd.MetricRecord(2, 1),
                 rd.MetricRecord(3, 2),
                 rd.MetricRecord(4, 3),
-            ], )
+            ],
+        )
         self.assertEqual(recorder.get_best_value("metric_max"), 4)
 
     def test_get_best_step(self):
@@ -105,7 +107,8 @@ def test_get_best_step(self):
                 rd.MetricRecord(2, 1),
                 rd.MetricRecord(3, 2),
                 rd.MetricRecord(4, 3),
-            ], )
+            ],
+        )
         self.assertEqual(recorder.get_best_step("metric_min"), 0)
 
         recorder.register("metric_max", "max")
@@ -116,7 +119,8 @@ def test_get_best_step(self):
                 rd.MetricRecord(2, 1),
                 rd.MetricRecord(3, 2),
                 rd.MetricRecord(4, 3),
-            ], )
+            ],
+        )
         self.assertEqual(recorder.get_best_step("metric_max"), 3)
 
     def test_get_statistics(self):
@@ -142,7 +146,8 @@ def test_serialization(self):
                 rd.MetricRecord(2, 1),
                 rd.MetricRecord(3, 2),
                 rd.MetricRecord(4, 3),
-            ], )
+            ],
+        )
         print(recorder.get_state())
         new_recorder = rd.MetricsRecorder.from_state(recorder.get_state())
         self.assertEqual(new_recorder.records.keys(), recorder.records.keys())
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py
index bc1d0a70182b4..88ad5f98bf7d2 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py
@@ -22,6 +22,7 @@
 
 
 class TestPlannerReLaunch(unittest.TestCase):
+
     def test_relaunch_with_planner(self):
         from test_auto_parallel_relaunch import cluster_json
         file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -49,8 +50,8 @@ def test_relaunch_with_planner(self):
         # Remove unnecessary files
         if os.path.exists(cluster_json_path):
             os.remove(cluster_json_path)
-        rank_mapping_json_path = os.path.join(file_dir,
-                                              "auto_parallel_rank_mapping.json")
+        rank_mapping_json_path = os.path.join(
+            file_dir, "auto_parallel_rank_mapping.json")
         if os.path.exists(rank_mapping_json_path):
             os.remove(rank_mapping_json_path)
         files_path = [path for path in os.listdir('.') if '.pkl' in path]
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py
index 5a7ae87e646ad..b6fc0d7a1fa41 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py
@@ -22,6 +22,7 @@
 
 
 class TestPlannerReLaunch(unittest.TestCase):
+
     def test_relaunch_with_planner(self):
         from test_auto_parallel_relaunch import cluster_json
         file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -49,8 +50,8 @@ def test_relaunch_with_planner(self):
         # Remove unnecessary files
         if os.path.exists(cluster_json_path):
             os.remove(cluster_json_path)
-        rank_mapping_json_path = os.path.join(file_dir,
-                                              "auto_parallel_rank_mapping.json")
+        rank_mapping_json_path = os.path.join(
+            file_dir, "auto_parallel_rank_mapping.json")
         if os.path.exists(rank_mapping_json_path):
             os.remove(rank_mapping_json_path)
         log_path = os.path.join(file_dir, "log")
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py
index fc52d1c394eff..e39991fcaa514 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py
@@ -19,6 +19,7 @@
 
 
 class TestTiral(unittest.TestCase):
+
     def test_trial(self):
         space = ts.TunableSpace()
         space.choice("choice", [0, 1, 2, 3], default=2)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py
index cb7104f9ef641..f0c6a0b7cdf79 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py
@@ -18,6 +18,7 @@
 
 
 class TestTunableSpace(unittest.TestCase):
+
     def test_fixed(self):
         space = ts.TunableSpace()
         fixed = space.fixed("fixed", default=4)
@@ -72,8 +73,10 @@ def test_int_range(self):
 
     def test_float_range(self):
         space = ts.TunableSpace()
-        float_range = space.float_range(
-            "float_range", start=0.4, stop=4.4, default=2.0)
+        float_range = space.float_range("float_range",
+                                        start=0.4,
+                                        stop=4.4,
+                                        default=2.0)
         self.assertEqual(space.values["float_range"], 2.0)
         self.assertEqual(len(space.variables), 1)
         self.assertEqual(space.variables["float_range"].name, "float_range")
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py
index ade228f6c494b..ce0a076c83e7e 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py
@@ -18,6 +18,7 @@
 
 
 class TestTunableVariable(unittest.TestCase):
+
     def test_fixed(self):
         fixed = tv.Fixed("fixed", True)
         fixed = tv.Fixed.from_state(fixed.get_state())
@@ -63,8 +64,12 @@ def test_int_range(self):
         self.assertIn(int_range.random(1234), [1, 2, 3, 4])
         self.assertNotEqual(int_range.default, 4)
 
-        int_range = tv.IntRange(
-            "int_range", start=1, stop=8, step=2, default=3, endpoint=True)
+        int_range = tv.IntRange("int_range",
+                                start=1,
+                                stop=8,
+                                step=2,
+                                default=3,
+                                endpoint=True)
         int_range = tv.IntRange.from_state(int_range.get_state())
         self.assertEqual(int_range.default, 3)
         self.assertIn(int_range.random(), [1, 3, 5, 7])
@@ -72,8 +77,10 @@ def test_int_range(self):
         self.assertNotEqual(int_range.default, 2)
 
     def test_float_range(self):
-        float_range = tv.FloatRange(
-            "float_range", start=0.4, stop=4.4, default=2.0)
+        float_range = tv.FloatRange("float_range",
+                                    start=0.4,
+                                    stop=4.4,
+                                    default=2.0)
         float_range = tv.FloatRange.from_state(float_range.get_state())
         self.assertEqual(float_range.default, 2.0)
         self.assertGreaterEqual(float_range.random(), 0.4)
@@ -81,13 +88,12 @@ def test_float_range(self):
         self.assertNotAlmostEqual(float_range.random(), 1)
         self.assertNotAlmostEqual(float_range.random(), 4.4)
 
-        float_range = tv.FloatRange(
-            "float_range",
-            start=0.4,
-            stop=8.4,
-            step=2.0,
-            default=3.0,
-            endpoint=True)
+        float_range = tv.FloatRange("float_range",
+                                    start=0.4,
+                                    stop=8.4,
+                                    step=2.0,
+                                    default=3.0,
+                                    endpoint=True)
         float_range = tv.FloatRange.from_state(float_range.get_state())
         self.assertEqual(float_range.default, 3.0)
         self.assertGreaterEqual(float_range.random(), 0.4)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py
index 9989f5bbdc605..3dabe38ff6e1d 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py
@@ -46,6 +46,7 @@ def get_random_inputs_and_labels(input_shape, label_shape):
 
 
 def batch_generator_creator():
+
     def __reader__():
         for _ in range(batch_size):
             batch_input, batch_label = get_random_inputs_and_labels(
@@ -57,6 +58,7 @@ def __reader__():
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -65,8 +67,8 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        param_initializer = nn.initializer.Normal(
-            mean=0.0, std=initializer_range)
+        param_initializer = nn.initializer.Normal(mean=0.0,
+                                                  std=initializer_range)
 
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.linear0 = nn.Linear(
@@ -82,20 +84,18 @@ def __init__(self,
 
     def forward(self, input):
         out = self.norm(input)
-        auto.shard_tensor(
-            self.linear0.weight,
-            dist_attr={
-                "process_mesh": _g_process_mesh[0],
-                "dims_mapping": [-1, 0]
-            })
+        auto.shard_tensor(self.linear0.weight,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[0],
+                              "dims_mapping": [-1, 0]
+                          })
         out = self.linear0(out)
         out = F.gelu(out, approximate=True)
-        auto.shard_tensor(
-            self.linear1.weight,
-            dist_attr={
-                "process_mesh": _g_process_mesh[1],
-                "dims_mapping": [0, -1]
-            })
+        auto.shard_tensor(self.linear1.weight,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[1],
+                              "dims_mapping": [0, -1]
+                          })
         out = self.linear1(out)
 
         return out
@@ -107,17 +107,15 @@ def loop_cond(i, loop_len, input_array):
 
 def loop_body(i, loop_len, input_array):
     pre_input = paddle.tensor.array_read(array=input_array, i=i)
-    mlp_while0 = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        dropout_ratio=0.1,
-        initializer_range=0.02)
-
-    mlp_while1 = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        dropout_ratio=0.1,
-        initializer_range=0.02)
+    mlp_while0 = MLPLayer(hidden_size=hidden_size,
+                          intermediate_size=4 * hidden_size,
+                          dropout_ratio=0.1,
+                          initializer_range=0.02)
+
+    mlp_while1 = MLPLayer(hidden_size=hidden_size,
+                          intermediate_size=4 * hidden_size,
+                          dropout_ratio=0.1,
+                          initializer_range=0.02)
 
     output = mlp_while0(pre_input)
     cur_pred = mlp_while1(output)
@@ -142,37 +140,36 @@ def get_program():
         loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64')
 
         # input
-        input = static.data(
-            name="input",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, sequence_len, 1],
+                            dtype='float32')
         data_holder = [input, label]
         # dataloader
-        dataloader = paddle.io.DataLoader.from_generator(
-            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
-        dataloader.set_batch_generator(
-            batch_generator_creator(), places=paddle.static.cuda_places())
+        dataloader = paddle.io.DataLoader.from_generator(feed_list=data_holder,
+                                                         capacity=4 *
+                                                         batch_size,
+                                                         iterable=False)
+        dataloader.set_batch_generator(batch_generator_creator(),
+                                       places=paddle.static.cuda_places())
         # data dist_attr
-        auto.shard_tensor(
-            input,
-            dist_attr={
-                "process_mesh": _g_process_mesh[0],
-                "dims_mapping": [-1, -1, -1]
-            })
-        auto.shard_tensor(
-            label,
-            dist_attr={
-                "process_mesh": _g_process_mesh[0],
-                "dims_mapping": [-1, -1, -1]
-            })
-
-        mlp_start = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[0],
+                              "dims_mapping": [-1, -1, -1]
+                          })
+        auto.shard_tensor(label,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh[0],
+                              "dims_mapping": [-1, -1, -1]
+                          })
+
+        mlp_start = MLPLayer(hidden_size=hidden_size,
+                             intermediate_size=4 * hidden_size,
+                             dropout_ratio=0.1,
+                             initializer_range=0.02)
         pred = mlp_start(input)
 
         input_array = paddle.tensor.array_write(pred, i)
@@ -182,11 +179,10 @@ def get_program():
             loop_vars=[i, loop_len, input_array])
         end_pred = paddle.tensor.array_read(array=input_array, i=i)
 
-        mlp_end = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        mlp_end = MLPLayer(hidden_size=hidden_size,
+                           intermediate_size=4 * hidden_size,
+                           dropout_ratio=0.1,
+                           initializer_range=0.02)
         pred = mlp_end(end_pred)
 
         error_cost = paddle.nn.functional.square_error_cost(pred, label)
@@ -196,6 +192,7 @@ def get_program():
 
 
 class TestMLP(unittest.TestCase):
+
     def test_completer(self):
         train_program, start_program, dataloader, i, loss = get_program()
         dist_context = DistributedContext()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py
index d296d9433302d..3c6e086ae7fac 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -47,6 +47,7 @@ def get_random_inputs_and_labels(input_shape, label_shape):
 
 
 def batch_generator_creator():
+
     def __reader__():
         for _ in range(batch_size):
             batch_input, batch_label = get_random_inputs_and_labels(
@@ -58,6 +59,7 @@ def __reader__():
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -66,8 +68,8 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        param_initializer = nn.initializer.Normal(
-            mean=0.0, std=initializer_range)
+        param_initializer = nn.initializer.Normal(mean=0.0,
+                                                  std=initializer_range)
 
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.linear0 = nn.Linear(
@@ -83,63 +85,61 @@ def __init__(self,
 
     def forward(self, input):
 
-        auto.shard_tensor(
-            self.norm.weight,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [-1]})
-        auto.shard_tensor(
-            self.norm.bias,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [-1]})
-        auto.shard_tensor(
-            self.linear0.weight,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, 0]
-            })
-        auto.shard_tensor(
-            self.linear0.bias,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [0]})
-        auto.shard_tensor(
-            self.linear1.weight,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [0, -1]
-            })
-        auto.shard_tensor(
-            self.linear1.bias,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [-1]})
+        auto.shard_tensor(self.norm.weight,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1]
+                          })
+        auto.shard_tensor(self.norm.bias,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1]
+                          })
+        auto.shard_tensor(self.linear0.weight,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, 0]
+                          })
+        auto.shard_tensor(self.linear0.bias,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [0]
+                          })
+        auto.shard_tensor(self.linear1.weight,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [0, -1]
+                          })
+        auto.shard_tensor(self.linear1.bias,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1]
+                          })
 
         out = self.norm(input)
-        auto.shard_tensor(
-            out,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, -1]
-            })
+        auto.shard_tensor(out,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, -1]
+                          })
         out = self.linear0(out)
-        auto.shard_tensor(
-            out,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, 0]
-            })
+        auto.shard_tensor(out,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, 0]
+                          })
         out = F.gelu(out, approximate=True)
-        auto.shard_tensor(
-            out,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, 0]
-            })
+        auto.shard_tensor(out,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, 0]
+                          })
         out = self.linear1(out)
-        auto.shard_tensor(
-            out,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, -1]
-            })
+        auto.shard_tensor(out,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, -1]
+                          })
 
         return out
 
@@ -155,95 +155,94 @@ def get_program():
 
         # 循环计数器
         i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
-        auto.shard_tensor(
-            i,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [-1]})
+        auto.shard_tensor(i,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1]
+                          })
 
         # 循环次数
-        loop_len = fluid.layers.fill_constant(
-            shape=[1], dtype='int64', value=epoch_num)
-        auto.shard_tensor(
-            loop_len,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [-1]})
+        loop_len = fluid.layers.fill_constant(shape=[1],
+                                              dtype='int64',
+                                              value=epoch_num)
+        auto.shard_tensor(loop_len,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1]
+                          })
 
         # input
-        input = static.data(
-            name="input",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, sequence_len, 1],
+                            dtype='float32')
 
         data_holder = [input, label]
         # dataloader
-        dataloader = paddle.io.DataLoader.from_generator(
-            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
-        dataloader.set_batch_generator(
-            batch_generator_creator(), places=paddle.static.cuda_places())
+        dataloader = paddle.io.DataLoader.from_generator(feed_list=data_holder,
+                                                         capacity=4 *
+                                                         batch_size,
+                                                         iterable=False)
+        dataloader.set_batch_generator(batch_generator_creator(),
+                                       places=paddle.static.cuda_places())
         # data dist_attr
-        auto.shard_tensor(
-            input,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, -1]
-            })
-        auto.shard_tensor(
-            label,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, -1]
-            })
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, -1]
+                          })
+        auto.shard_tensor(label,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, -1]
+                          })
 
         # fill constant bsz like
         tmp = paddle.fluid.layers.fill_constant_batch_size_like(
             input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0)
-        auto.shard_tensor(
-            tmp,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, 0, -1, -1]
-            })
+        auto.shard_tensor(tmp,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, 0, -1, -1]
+                          })
 
         # model
-        mlp_start = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        mlp_start = MLPLayer(hidden_size=hidden_size,
+                             intermediate_size=4 * hidden_size,
+                             dropout_ratio=0.1,
+                             initializer_range=0.02)
         pred = mlp_start(input)
 
         input_array = fluid.layers.array_write(pred, i)
-        auto.shard_tensor(
-            input_array,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, -1]
-            })
+        auto.shard_tensor(input_array,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, -1]
+                          })
 
         cond = fluid.layers.less_than(x=i, y=loop_len)
-        auto.shard_tensor(
-            cond,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [-1]})
+        auto.shard_tensor(cond,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1]
+                          })
 
         while_op = fluid.layers.While(cond=cond)
         with while_op.block():
 
             pre_input = fluid.layers.array_read(array=input_array, i=i)
-            auto.shard_tensor(
-                pre_input,
-                dist_attr={
-                    "process_mesh": _g_process_mesh,
-                    "dims_mapping": [-1, -1, -1]
-                })
-
-            mlp_while = MLPLayer(
-                hidden_size=hidden_size,
-                intermediate_size=4 * hidden_size,
-                dropout_ratio=0.1,
-                initializer_range=0.02)
+            auto.shard_tensor(pre_input,
+                              dist_attr={
+                                  "process_mesh": _g_process_mesh,
+                                  "dims_mapping": [-1, -1, -1]
+                              })
+
+            mlp_while = MLPLayer(hidden_size=hidden_size,
+                                 intermediate_size=4 * hidden_size,
+                                 dropout_ratio=0.1,
+                                 initializer_range=0.02)
             cur_pred = mlp_while(pre_input)
 
             # 更新循环条件
@@ -252,33 +251,31 @@ def get_program():
             fluid.layers.less_than(x=i, y=loop_len, cond=cond)
 
         end_pred = fluid.layers.array_read(array=input_array, i=i)
-        auto.shard_tensor(
-            end_pred,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, -1]
-            })
-
-        mlp_end = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        auto.shard_tensor(end_pred,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, -1]
+                          })
+
+        mlp_end = MLPLayer(hidden_size=hidden_size,
+                           intermediate_size=4 * hidden_size,
+                           dropout_ratio=0.1,
+                           initializer_range=0.02)
         pred = mlp_end(end_pred)
 
         error_cost = paddle.nn.functional.square_error_cost(pred, label)
-        auto.shard_tensor(
-            error_cost,
-            dist_attr={
-                "process_mesh": _g_process_mesh,
-                "dims_mapping": [-1, -1, -1]
-            })
+        auto.shard_tensor(error_cost,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1, -1, -1]
+                          })
 
         loss = paddle.mean(error_cost)
-        auto.shard_tensor(
-            loss,
-            dist_attr={"process_mesh": _g_process_mesh,
-                       "dims_mapping": [-1]})
+        auto.shard_tensor(loss,
+                          dist_attr={
+                              "process_mesh": _g_process_mesh,
+                              "dims_mapping": [-1]
+                          })
 
     return train_program, start_program, dataloader, i, loss
 
@@ -437,6 +434,7 @@ def partition(train_program, start_program, dist_context):
 
 
 class TestMLP(unittest.TestCase):
+
     def test_partitioner(self):
 
         train_program, start_program, dataloader, i, loss = get_program()
@@ -445,8 +443,9 @@ def test_partitioner(self):
                                                   dist_context)
         dist_context.block_state.parse_forward_blocks(train_program)
 
-        dist_main_prog, dist_startup_prog = partition(
-            train_program, start_program, dist_context)
+        dist_main_prog, dist_startup_prog = partition(train_program,
+                                                      start_program,
+                                                      dist_context)
         global_block_ops = dist_main_prog.blocks[0].ops
 
         fill_op = None
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py
index ffc222d349294..c3f64e30fc596 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py
@@ -41,6 +41,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=64,
                  intermediate_size=4 * 64,
@@ -54,52 +55,50 @@ def __init__(self,
         weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0))
         weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1))
         bias_attr = None
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr0,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr1,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         elif _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -113,43 +112,40 @@ def mlp_forward(train_program, start_program):
         utils.unique_name.guard():
         batch_size = 4
         hidden_size = 64
-        input = static.data(
-            name="input", shape=[batch_size, hidden_size], dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
 
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                label,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(label,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
         loss = paddle.mean(error_cost)
@@ -173,6 +169,7 @@ def get_distributed_program():
 
 
 class TestMLPAutoConvert(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2021)
         random.seed(2021)
@@ -201,8 +198,9 @@ def test_mlp_mp2pp(self):
 
         for step in range(20):
             if step == 10:
-                save_distributed_checkpoint(
-                    dist_main_prog, ".", dist_attr_path=".")
+                save_distributed_checkpoint(dist_main_prog,
+                                            ".",
+                                            dist_attr_path=".")
 
             res = exe.run(dist_main_prog,
                           feed={
@@ -253,6 +251,7 @@ def test_mlp_mp2pp(self):
 
 
 class TestMLPAutoConvert2(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2021)
         random.seed(2021)
@@ -340,6 +339,7 @@ def test_mlp_pp2mp(self):
 
 
 class TestMLPAutoConvertInvalid(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2021)
         random.seed(2021)
@@ -353,14 +353,14 @@ def test_input_invalid(self):
         _global_process_mesh = auto.ProcessMesh([0, 1])
         dist_main_prog, _, _ = get_distributed_program()
         with self.assertRaises(TypeError):
-            save_distributed_checkpoint(
-                dist_main_prog, [""], [""], addition_info=[0])
+            save_distributed_checkpoint(dist_main_prog, [""], [""],
+                                        addition_info=[0])
         with self.assertRaises(ValueError):
-            save_distributed_checkpoint(
-                dist_main_prog, [""], [""], addition_info={"step": 0})
+            save_distributed_checkpoint(dist_main_prog, [""], [""],
+                                        addition_info={"step": 0})
         with self.assertRaises(ValueError):
-            save_distributed_checkpoint(
-                dist_main_prog, [""], [""], addition_info={"batch": 0.0})
+            save_distributed_checkpoint(dist_main_prog, [""], [""],
+                                        addition_info={"batch": 0.0})
         with self.assertRaises(ValueError):
             load_checkpoint_into_program(["./model_state_rank.pdmodel"],
                                          ["./dist_attr_rank.pdattr"],
@@ -369,9 +369,8 @@ def test_input_invalid(self):
             load_distributed_checkpoint(["./model_state_rank.pdmodel"],
                                         ["./dist_attr_rank.pdattr"])
         with self.assertRaises(TypeError):
-            load_distributed_checkpoint({
-                "0": "./model_state_rank.pdmodel"
-            }, {"1": "./dist_attr_rank.pdattr"})
+            load_distributed_checkpoint({"0": "./model_state_rank.pdmodel"},
+                                        {"1": "./dist_attr_rank.pdattr"})
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py
index ed8cb8a23c372..d3a4a4898bf9a 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py
@@ -32,7 +32,9 @@
 
 
 class TestDataUnshard(unittest.TestCase):
+
     def test_dp2pp1mp1(self):
+
         def create_model(train_program, start_program):
             with paddle.static.program_guard(train_program, start_program):
 
@@ -41,41 +43,36 @@ def create_model(train_program, start_program):
                 label = paddle.static.data(name='label', shape=[2, 8])
 
                 weight_attr = paddle.ParamAttr(
-                    initializer=nn.initializer.Normal(
-                        mean=0.0, std=0.02))
+                    initializer=nn.initializer.Normal(mean=0.0, std=0.02))
                 linear0 = nn.Linear(8, 8, weight_attr)
                 linear1 = nn.Linear(8, 8, weight_attr)
 
-                auto.shard_tensor(
-                    input,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [0, -1]
-                    })
-                auto.shard_tensor(
-                    label,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [0, -1]
-                    })
-                auto.shard_tensor(
-                    linear0.weight,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [-1, -1]
-                    })
-                auto.shard_tensor(
-                    linear1.weight,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [-1, -1]
-                    })
+                auto.shard_tensor(input,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [0, -1]
+                                  })
+                auto.shard_tensor(label,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [0, -1]
+                                  })
+                auto.shard_tensor(linear0.weight,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [-1, -1]
+                                  })
+                auto.shard_tensor(linear1.weight,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [-1, -1]
+                                  })
 
                 linear0_out = linear0(input)
                 gelu_out = F.gelu(linear0_out)
                 linear1_out = linear1(gelu_out)
-                error_cost = paddle.nn.functional.square_error_cost(linear1_out,
-                                                                    label)
+                error_cost = paddle.nn.functional.square_error_cost(
+                    linear1_out, label)
                 loss = paddle.mean(error_cost)
                 return train_program, start_program, loss, input, label
 
@@ -88,12 +85,11 @@ def create_model(train_program, start_program):
         dist_strategy = fleet.DistributedStrategy()
         dist_strategy.semi_auto = True
         fleet.init(is_collective=True, strategy=dist_strategy)
-        optimizer = paddle.fluid.optimizer.AdamOptimizer(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=None)
+        optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                         beta1=0.9,
+                                                         beta2=0.999,
+                                                         epsilon=1e-08,
+                                                         grad_clip=None)
 
         optimizer = fleet.distributed_optimizer(optimizer)
         _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
@@ -112,15 +108,17 @@ def create_model(train_program, start_program):
         label_data = np.random.randint(0, 10, [2, 8]).astype("float32")
 
         fetchs = [loss.name, 'input@RESHARD_0']
-        loss_np, shard_data_np = exe.run(
-            distributed_main_program,
-            feed={"input": input_data,
-                  "label": label_data},
-            fetch_list=fetchs)
+        loss_np, shard_data_np = exe.run(distributed_main_program,
+                                         feed={
+                                             "input": input_data,
+                                             "label": label_data
+                                         },
+                                         fetch_list=fetchs)
         desired = input_data[worker_index].reshape(shard_data_np.shape)
         np.testing.assert_allclose(shard_data_np, desired)
 
     def dp1pp1mp2(self):
+
         def create_model(train_program, start_program):
             with paddle.static.program_guard(train_program, start_program):
 
@@ -129,44 +127,39 @@ def create_model(train_program, start_program):
                 label = paddle.static.data(name='label', shape=[8, 8])
 
                 weight_attr = paddle.ParamAttr(
-                    initializer=nn.initializer.Normal(
-                        mean=0.0, std=0.02))
+                    initializer=nn.initializer.Normal(mean=0.0, std=0.02))
                 linear0 = nn.Linear(8, 8, weight_attr)
                 linear1 = nn.Linear(8, 8, weight_attr)
 
-                auto.shard_tensor(
-                    input,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [-1, -1]
-                    })
-                auto.shard_tensor(
-                    label,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [-1, -1]
-                    })
-
-                auto.shard_tensor(
-                    linear0.weight,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [-1, 0]
-                    })
-                auto.shard_tensor(
-                    linear1.weight,
-                    dist_attr={
-                        "process_mesh": MESH_0,
-                        "dims_mapping": [0, -1]
-                    })
+                auto.shard_tensor(input,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [-1, -1]
+                                  })
+                auto.shard_tensor(label,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [-1, -1]
+                                  })
+
+                auto.shard_tensor(linear0.weight,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [-1, 0]
+                                  })
+                auto.shard_tensor(linear1.weight,
+                                  dist_attr={
+                                      "process_mesh": MESH_0,
+                                      "dims_mapping": [0, -1]
+                                  })
 
                 linear0_out = linear0(input)
                 gelu_out = F.gelu(linear0_out)
 
                 linear1_out = linear1(gelu_out)
 
-                error_cost = paddle.nn.functional.square_error_cost(linear1_out,
-                                                                    label)
+                error_cost = paddle.nn.functional.square_error_cost(
+                    linear1_out, label)
                 loss = paddle.mean(error_cost)
                 return train_program, start_program, loss, input, label
 
@@ -179,12 +172,11 @@ def create_model(train_program, start_program):
         dist_strategy = fleet.DistributedStrategy()
         dist_strategy.semi_auto = True
         fleet.init(is_collective=True, strategy=dist_strategy)
-        optimizer = paddle.fluid.optimizer.AdamOptimizer(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=None)
+        optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                         beta1=0.9,
+                                                         beta2=0.999,
+                                                         epsilon=1e-08,
+                                                         grad_clip=None)
 
         optimizer = fleet.distributed_optimizer(optimizer)
         _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
@@ -203,11 +195,12 @@ def create_model(train_program, start_program):
         label_data = np.random.randint(0, 10, [8, 8]).astype("float32")
 
         fetchs = [loss.name, 'input']
-        loss_np, shard_data_np = exe.run(
-            distributed_main_program,
-            feed={"input": input_data,
-                  "label": label_data},
-            fetch_list=fetchs)
+        loss_np, shard_data_np = exe.run(distributed_main_program,
+                                         feed={
+                                             "input": input_data,
+                                             "label": label_data
+                                         },
+                                         fetch_list=fetchs)
 
         desired = input_data.reshape(shard_data_np.shape)
         np.testing.assert_allclose(shard_data_np, desired)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py
index b1c15c5ce6265..4695f6a4a9425 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py
@@ -76,26 +76,27 @@ def __init__(self,
         if self.fuse:
             assert self.kdim == embed_dim
             assert self.vdim == embed_dim
-            self.qkv_proj = nn.Linear(
-                embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr)
+            self.qkv_proj = nn.Linear(embed_dim,
+                                      3 * embed_dim,
+                                      weight_attr,
+                                      bias_attr=bias_attr)
         else:
-            self.q_proj = nn.Linear(
-                embed_dim,
-                embed_dim,
-                weight_attr=weight_attr,
-                bias_attr=bias_attr)
-            self.k_proj = nn.Linear(
-                self.kdim,
-                embed_dim,
-                weight_attr=weight_attr,
-                bias_attr=bias_attr)
-            self.v_proj = nn.Linear(
-                self.vdim,
-                embed_dim,
-                weight_attr=weight_attr,
-                bias_attr=bias_attr)
-        self.out_proj = nn.Linear(
-            embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias_attr)
+            self.q_proj = nn.Linear(embed_dim,
+                                    embed_dim,
+                                    weight_attr=weight_attr,
+                                    bias_attr=bias_attr)
+            self.k_proj = nn.Linear(self.kdim,
+                                    embed_dim,
+                                    weight_attr=weight_attr,
+                                    bias_attr=bias_attr)
+            self.v_proj = nn.Linear(self.vdim,
+                                    embed_dim,
+                                    weight_attr=weight_attr,
+                                    bias_attr=bias_attr)
+        self.out_proj = nn.Linear(embed_dim,
+                                  embed_dim,
+                                  weight_attr=weight_attr,
+                                  bias_attr=bias_attr)
 
     def _fuse_prepare_qkv(self, query):
         mix_layer = self.qkv_proj(query)
@@ -113,33 +114,30 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         """
         q = self.q_proj(query)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
         elif _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 1]
+                              })
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
         q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
         if isinstance(cache, self.StaticCache):
@@ -167,62 +165,56 @@ def compute_kv(self, key, value):
         """
         k = self.k_proj(key)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
         elif _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 1]
+                              })
         v = self.v_proj(value)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
         elif _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 1]
+                              })
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
         v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
@@ -276,17 +268,18 @@ def forward(self,
         else:
             q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,
                                                cache)
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.head_dim**-0.5)
         if attn_mask is not None:
             product = product + attn_mask
         weights = F.softmax(product)
         if self.dropout:
-            weights = F.dropout(
-                weights,
-                self.dropout,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout,
+                                training=self.training,
+                                mode="upscale_in_train")
         out = tensor.matmul(weights, v)
         # combine heads
         out = tensor.transpose(out, perm=[0, 2, 1, 3])
@@ -294,33 +287,30 @@ def forward(self,
         # project to output
         out = self.out_proj(out)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
         elif _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [1, -1]
+                              })
         outs = [out]
         if self.need_weights:
             outs.append(weights)
@@ -362,36 +352,37 @@ def forward(self,
         new_caches = []
         self.checkpoints = []
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                output,
-                dist_attr={
-                    "process_mesh": PP_MESH_LIST[0],
-                    "dims_mapping": [-1 for i in range(len(output.shape))]
-                })
+            auto.shard_tensor(output,
+                              dist_attr={
+                                  "process_mesh":
+                                  PP_MESH_LIST[0],
+                                  "dims_mapping":
+                                  [-1 for i in range(len(output.shape))]
+                              })
         if _global_parallel_strategy == "dp_pp":
-            auto.shard_tensor(
-                output,
-                dist_attr={
-                    "process_mesh": DPPP_MESH_LIST[0],
-                    "dims_mapping":
-                    [0] + [-1 for i in range(len(output.shape) - 1)]
-                })
+            auto.shard_tensor(output,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPPP_MESH_LIST[0],
+                                  "dims_mapping": [0] +
+                                  [-1 for i in range(len(output.shape) - 1)]
+                              })
         if _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                output,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[0],
-                    "dims_mapping":
-                    [-1] + [-1 for i in range(len(output.shape) - 1)]
-                })
+            auto.shard_tensor(output,
+                              dist_attr={
+                                  "process_mesh":
+                                  MPPP_MESH_LIST[0],
+                                  "dims_mapping": [-1] +
+                                  [-1 for i in range(len(output.shape) - 1)]
+                              })
         if _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                output,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[0],
-                    "dims_mapping":
-                    [0] + [-1 for i in range(len(output.shape) - 1)]
-                })
+            auto.shard_tensor(output,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[0],
+                                  "dims_mapping": [0] +
+                                  [-1 for i in range(len(output.shape) - 1)]
+                              })
         for i, mod in enumerate(self.layers):
             if cache is None:
                 if use_cache:
@@ -404,7 +395,8 @@ def forward(self,
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": PP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                PP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping":
                                 [-1 for i in range(len(output.shape))]
                             })
@@ -417,7 +409,8 @@ def forward(self,
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": DPPP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                DPPP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping": [0] +
                                 [-1 for i in range(len(output.shape) - 1)]
                             })
@@ -430,7 +423,8 @@ def forward(self,
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": MPPP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                MPPP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping": [-1] +
                                 [-1 for i in range(len(output.shape) - 1)]
                             })
@@ -443,7 +437,8 @@ def forward(self,
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                DPMPPP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping": [0] +
                                 [-1 for i in range(len(output.shape) - 1)]
                             })
@@ -456,41 +451,47 @@ def forward(self,
                     new_caches.append(new_cache)
                 else:
                     if _global_parallel_strategy == "pp":
-                        output = auto.shard_op(
-                            mod,
-                            dist_attr={
-                                "process_mesh": PP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                        output = auto.shard_op(mod,
+                                               dist_attr={
+                                                   "process_mesh":
+                                                   PP_MESH_LIST[mod.mesh_idx]
+                                               })(output, memory, tgt_mask,
+                                                  use_cache, cache)[0]
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": PP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                PP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping":
                                 [-1 for i in range(len(output.shape))]
                             })
                     elif _global_parallel_strategy == "dp_pp":
-                        output = auto.shard_op(
-                            mod,
-                            dist_attr={
-                                "process_mesh": DPPP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                        output = auto.shard_op(mod,
+                                               dist_attr={
+                                                   "process_mesh":
+                                                   DPPP_MESH_LIST[mod.mesh_idx]
+                                               })(output, memory, tgt_mask,
+                                                  use_cache, cache)[0]
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": DPPP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                DPPP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping": [0] +
                                 [-1 for i in range(len(output.shape) - 1)]
                             })
                     elif _global_parallel_strategy == "mp_pp":
-                        output = auto.shard_op(
-                            mod,
-                            dist_attr={
-                                "process_mesh": MPPP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                        output = auto.shard_op(mod,
+                                               dist_attr={
+                                                   "process_mesh":
+                                                   MPPP_MESH_LIST[mod.mesh_idx]
+                                               })(output, memory, tgt_mask,
+                                                  use_cache, cache)[0]
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": MPPP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                MPPP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping": [-1] +
                                 [-1 for i in range(len(output.shape) - 1)]
                             })
@@ -503,7 +504,8 @@ def forward(self,
                         auto.shard_tensor(
                             output,
                             dist_attr={
-                                "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx],
+                                "process_mesh":
+                                DPMPPP_MESH_LIST[mod.mesh_idx],
                                 "dims_mapping": [0] +
                                 [-1 for i in range(len(output.shape) - 1)]
                             })
@@ -517,8 +519,9 @@ def forward(self,
                 if _global_parallel_strategy == "pp":
                     output, new_cache = auto.shard_op(
                         mod,
-                        dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx]})(
-                            output, memory, tgt_mask, use_cache, cache)
+                        dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx]
+                                   })(output, memory, tgt_mask, use_cache,
+                                      cache)
                     auto.shard_tensor(
                         output,
                         dist_attr={
@@ -535,7 +538,8 @@ def forward(self,
                     auto.shard_tensor(
                         output,
                         dist_attr={
-                            "process_mesh": DPPP_MESH_LIST[mod.mesh_idx],
+                            "process_mesh":
+                            DPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping":
                             [0] + [-1 for i in range(len(output.shape) - 1)]
                         })
@@ -548,7 +552,8 @@ def forward(self,
                     auto.shard_tensor(
                         output,
                         dist_attr={
-                            "process_mesh": MPPP_MESH_LIST[mod.mesh_idx],
+                            "process_mesh":
+                            MPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping":
                             [-1] + [-1 for i in range(len(output.shape) - 1)]
                         })
@@ -561,7 +566,8 @@ def forward(self,
                     auto.shard_tensor(
                         output,
                         dist_attr={
-                            "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx],
+                            "process_mesh":
+                            DPMPPP_MESH_LIST[mod.mesh_idx],
                             "dims_mapping":
                             [0] + [-1 for i in range(len(output.shape) - 1)]
                         })
@@ -619,17 +625,20 @@ def __init__(self,
         self.normalize_before = normalize_before
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
-        self.self_attn = MultiHeadAttention(
-            d_model,
-            nhead,
-            dropout=attn_dropout,
-            weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0],
-            mesh_idx=self.mesh_idx)
-        self.linear1 = nn.Linear(
-            d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
-        self.linear2 = nn.Linear(
-            dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
+        self.self_attn = MultiHeadAttention(d_model,
+                                            nhead,
+                                            dropout=attn_dropout,
+                                            weight_attr=weight_attrs[0],
+                                            bias_attr=bias_attrs[0],
+                                            mesh_idx=self.mesh_idx)
+        self.linear1 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attrs[2],
+                                 bias_attr=bias_attrs[2])
+        self.linear2 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attrs[2],
+                                 bias_attr=bias_attrs[2])
         self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
@@ -652,72 +661,65 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
         if self.normalize_before:
             tgt = self.norm2(tgt)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
         elif _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 0]
+                              })
         if _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [-1, 1]
+                              })
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
         elif _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": MPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[self.mesh_idx],
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[self.mesh_idx],
+                                  "dims_mapping": [1, -1]
+                              })
         tgt = self.dropout2(
-            self.linear2(F.gelu(
-                self.linear1(tgt), approximate=True)))
+            self.linear2(F.gelu(self.linear1(tgt), approximate=True)))
         tgt = residual + tgt
         if not self.normalize_before:
             tgt = self.norm2(tgt)
         return tgt if use_cache is False else (tgt, incremental_cache)
 
     def gen_cache(self, memory):
-        incremental_cache = self.self_attn.gen_cache(
-            memory, type=self.self_attn.Cache)
+        incremental_cache = self.self_attn.gen_cache(memory,
+                                                     type=self.self_attn.Cache)
         return incremental_cache
 
 
@@ -737,17 +739,15 @@ def __init__(self,
         self.word_embeddings = nn.Embedding(
             vocab_size,
             hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="word_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=initializer_range)))
+            weight_attr=paddle.ParamAttr(name="word_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0, std=initializer_range)))
         self.position_embeddings = nn.Embedding(
             max_position_embeddings,
             hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="pos_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=initializer_range)))
+            weight_attr=paddle.ParamAttr(name="pos_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0, std=initializer_range)))
         self.dropout = nn.Dropout(hidden_dropout_prob)
 
     def forward(self, input_ids, position_ids=None):
@@ -757,33 +757,29 @@ def forward(self, input_ids, position_ids=None):
             position_ids = seq_length - ones
         input_embedings = self.word_embeddings(input_ids)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
         elif _global_parallel_strategy == "mp_pp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": MPPP_MESH_LIST[0],
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": MPPP_MESH_LIST[0],
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[0],
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": DPMPPP_MESH_LIST[0],
+                                  "dims_mapping": [1, -1]
+                              })
         position_embeddings = self.position_embeddings(position_ids)
         embeddings = input_embedings + position_embeddings
         embeddings = self.dropout(embeddings)
@@ -821,9 +817,10 @@ def __init__(self,
         self.pipline_mode = (pp_degree is not None and pp_degree > 1)
         if self.pipline_mode:
             self.layer_per_stage = num_hidden_layers // pp_degree
-        self.embeddings = GPTEmbeddings(
-            vocab_size, hidden_size, hidden_dropout_prob,
-            max_position_embeddings, type_vocab_size, self.initializer_range)
+        self.embeddings = GPTEmbeddings(vocab_size, hidden_size,
+                                        hidden_dropout_prob,
+                                        max_position_embeddings,
+                                        type_vocab_size, self.initializer_range)
         decoder_layers = nn.LayerList()
         for i in range(num_hidden_layers):
             mesh_index = None
@@ -831,25 +828,23 @@ def __init__(self,
             if self.layer_per_stage is not None:
                 mesh_index = i // self.layer_per_stage
             decoder_layers.append(
-                DecoderLayer(
-                    d_model=hidden_size,
-                    nhead=num_attention_heads,
-                    dim_feedforward=intermediate_size,
-                    dropout=hidden_dropout_prob,
-                    activation=hidden_act,
-                    attn_dropout=attention_probs_dropout_prob,
-                    act_dropout=hidden_dropout_prob,
-                    weight_attr=paddle.ParamAttr(
-                        initializer=nn.initializer.Normal(
-                            mean=0.0, std=self.initializer_range)),
-                    bias_attr=None,
-                    mesh_idx=mesh_index))
+                DecoderLayer(d_model=hidden_size,
+                             nhead=num_attention_heads,
+                             dim_feedforward=intermediate_size,
+                             dropout=hidden_dropout_prob,
+                             activation=hidden_act,
+                             attn_dropout=attention_probs_dropout_prob,
+                             act_dropout=hidden_dropout_prob,
+                             weight_attr=paddle.ParamAttr(
+                                 initializer=nn.initializer.Normal(
+                                     mean=0.0, std=self.initializer_range)),
+                             bias_attr=None,
+                             mesh_idx=mesh_index))
         Decoder = TransformerDecoder
-        self.decoder = Decoder(
-            decoder_layers,
-            num_hidden_layers,
-            norm="LayerNorm",
-            hidden_size=hidden_size)
+        self.decoder = Decoder(decoder_layers,
+                               num_hidden_layers,
+                               norm="LayerNorm",
+                               hidden_size=hidden_size)
         self.checkpoints = []
 
     def forward(self,
@@ -863,44 +858,44 @@ def forward(self,
             past_length = 0
             if cache is not None:
                 past_length = paddle.shape(cache[0].k)[-2]
-            position_ids = paddle.arange(
-                past_length,
-                paddle.shape(input_ids)[-1] + past_length,
-                dtype='int64')
+            position_ids = paddle.arange(past_length,
+                                         paddle.shape(input_ids)[-1] +
+                                         past_length,
+                                         dtype='int64')
             position_ids = position_ids.unsqueeze(0)
-            position_ids = paddle.fluid.layers.expand_as(position_ids,
-                                                         input_ids)
-        embedding_output = self.embeddings(
-            input_ids=input_ids, position_ids=position_ids)
+            position_ids = paddle.fluid.layers.expand_as(
+                position_ids, input_ids)
+        embedding_output = self.embeddings(input_ids=input_ids,
+                                           position_ids=position_ids)
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": PP_MESH_LIST[0],
-                    "dims_mapping": [-1 for i in range(len(input_ids.shape))]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh":
+                                  PP_MESH_LIST[0],
+                                  "dims_mapping":
+                                  [-1 for i in range(len(input_ids.shape))]
+                              })
         if _global_parallel_strategy == "dp_pp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": DPPP_MESH_LIST[0],
-                    "dims_mapping":
-                    [0] + [-1 for i in range(len(input_ids.shape) - 1)]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPPP_MESH_LIST[0],
+                                  "dims_mapping": [0] +
+                                  [-1 for i in range(len(input_ids.shape) - 1)]
+                              })
         if _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": DPMPPP_MESH_LIST[0],
-                    "dims_mapping":
-                    [0] + [-1 for i in range(len(input_ids.shape) - 1)]
-                })
-        encoder_outputs = self.decoder(
-            embedding_output,
-            memory=None,
-            tgt_mask=attention_mask,
-            use_cache=use_cache,
-            cache=cache)
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh":
+                                  DPMPPP_MESH_LIST[0],
+                                  "dims_mapping": [0] +
+                                  [-1 for i in range(len(input_ids.shape) - 1)]
+                              })
+        encoder_outputs = self.decoder(embedding_output,
+                                       memory=None,
+                                       tgt_mask=attention_mask,
+                                       use_cache=use_cache,
+                                       cache=cache)
         self.checkpoints.extend(self.decoder.checkpoints)
         return encoder_outputs
 
@@ -912,19 +907,19 @@ class GPTForPretraining(nn.Layer):
     """
 
     def __init__(
-            self,
-            gpt,
-            vocab_size=50304,
-            hidden_size=768,
-            initializer_range=0.02, ):
+        self,
+        gpt,
+        vocab_size=50304,
+        hidden_size=768,
+        initializer_range=0.02,
+    ):
         super(GPTForPretraining, self).__init__()
         self.output_embeddings = nn.Embedding(
             vocab_size,
             hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="output_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=initializer_range)))
+            weight_attr=paddle.ParamAttr(name="output_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0, std=initializer_range)))
         self.gpt = gpt
 
     def forward(self,
@@ -943,8 +938,9 @@ def forward(self,
             encoder_outputs, cached_kvs = outputs[:2]
         else:
             encoder_outputs = outputs
-        logits = paddle.matmul(
-            encoder_outputs, self.output_embeddings.weight, transpose_y=True)
+        logits = paddle.matmul(encoder_outputs,
+                               self.output_embeddings.weight,
+                               transpose_y=True)
         if use_cache:
             return logits, cached_kvs
         else:
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py
index 3ddd41158a69e..7d738d3678926 100755
--- a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py
@@ -33,6 +33,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -41,14 +42,18 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
@@ -70,25 +75,23 @@ def mlp_pretrain_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="input",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
-
-        auto.shard_tensor(
-            input,
-            dist_attr={
-                "process_mesh": _global_process_mesh,
-                "dims_mappig": [-1, -1, -1]
-            })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        input = static.data(name="input",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, sequence_len, 1],
+                            dtype='float32')
+
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": _global_process_mesh,
+                              "dims_mappig": [-1, -1, -1]
+                          })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       dropout_ratio=0.1,
+                       initializer_range=0.02)
 
         predict = mlp(input)
 
@@ -99,6 +102,7 @@ def mlp_pretrain_forward(train_program, start_program):
 
 
 class TestMLPAutoParallelizer(unittest.TestCase):
+
     def test_mlp_serial(self):
 
         global _global_process_mesh
@@ -116,15 +120,14 @@ def test_mlp_serial(self):
 
         train_program = static.Program()
         start_program = static.Program()
-        loss, train_program, start_program = mlp_pretrain_forward(train_program,
-                                                                  start_program)
-
-        optimizer = paddle.fluid.optimizer.AdamOptimizer(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=None)
+        loss, train_program, start_program = mlp_pretrain_forward(
+            train_program, start_program)
+
+        optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                         beta1=0.9,
+                                                         beta2=0.999,
+                                                         epsilon=1e-08,
+                                                         grad_clip=None)
 
         optimizer = fleet.distributed_optimizer(optimizer)
         _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py
index 35ee4f30da00c..12f4cc08b0874 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py
@@ -39,6 +39,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=64,
                  intermediate_size=4 * 64,
@@ -51,52 +52,50 @@ def __init__(self,
         weight_attr = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         elif _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -112,43 +111,40 @@ def mlp_forward(train_program, start_program):
 
         batch_size = 4
         hidden_size = 64
-        input = static.data(
-            name="input", shape=[batch_size, hidden_size], dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
 
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                label,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(label,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -177,6 +173,7 @@ def get_distributed_program():
 
 
 class TestMLPSaveLoad(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2021)
         random.seed(2021)
diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt
index 37216241b8f08..b5ebeb659a649 100644
--- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt
@@ -1,9 +1,12 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0)
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
 endforeach(TEST_OP)
 
 set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 160)
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py
index 40aead9076569..6e097b6335bcc 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py
@@ -18,6 +18,7 @@
 
 import numpy as np
 import paddle
+import paddle.fluid as fluid
 import paddle.compat as cpt
 import paddle.nn.functional as F
 from paddle.autograd.functional import _as_tensors
@@ -37,6 +38,7 @@ def make_v(f, inputs):
 
 
 class TestAutogradFunctional(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         cls.RAW_INPUTS = {
@@ -54,8 +56,8 @@ def setUp(self):
     def gen_input(self, inp, stop_gradient=False):
         if isinstance(inp, paddle.Tensor):
             return inp
-        return paddle.to_tensor(
-            self.RAW_INPUTS[inp], stop_gradient=stop_gradient)
+        return paddle.to_tensor(self.RAW_INPUTS[inp],
+                                stop_gradient=stop_gradient)
 
     def gen_inputs(self, inputs):
         if isinstance(inputs, list):
@@ -70,6 +72,7 @@ def gen_test_pairs(self,
                        v=None,
                        create_graph=False,
                        allow_unused=False):
+
         def vjp_test():
             nonlocal v
             xs = self.gen_inputs(inputs)
@@ -87,18 +90,16 @@ def grad_test():
                 v = self.gen_inputs(v)
             outputs = func(*xs)
             if v is not None:
-                inputs_grad = paddle.grad(
-                    outputs,
-                    xs,
-                    v,
-                    create_graph=create_graph,
-                    allow_unused=allow_unused)
+                inputs_grad = paddle.grad(outputs,
+                                          xs,
+                                          v,
+                                          create_graph=create_graph,
+                                          allow_unused=allow_unused)
             else:
-                inputs_grad = paddle.grad(
-                    outputs,
-                    xs,
-                    create_graph=create_graph,
-                    allow_unused=allow_unused)
+                inputs_grad = paddle.grad(outputs,
+                                          xs,
+                                          create_graph=create_graph,
+                                          allow_unused=allow_unused)
             return outputs, inputs_grad
 
         return vjp_test, grad_test
@@ -109,6 +110,7 @@ def gen_jvp_tests(self,
                       v=None,
                       create_graph=False,
                       allow_unused=False):
+
         def jvp_test():
             nonlocal v
             xs = self.gen_inputs(inputs)
@@ -146,6 +148,7 @@ def check_results(self, ref, res):
 
 
 class TestVJP(TestAutogradFunctional):
+
     def func_vjp_i1o1(self):
         test_cases = [
             [reduce, 'A'],  # noqa
@@ -223,14 +226,14 @@ def test_all_cases(self):
 
 @utils.place(config.DEVICES)
 @utils.parameterize(
-    (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'expected_exception'), (
-        ('v_shape_not_equal_ys', utils.square, np.random.rand(3),
-         np.random.rand(1), RuntimeError), ))
+    (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'expected_exception'),
+    (('v_shape_not_equal_ys', utils.square, np.random.rand(3),
+      np.random.rand(1), RuntimeError), ))
 class TestVJPException(unittest.TestCase):
+
     def func_vjp(self):
         with self.assertRaises(self.expected_exception):
-            paddle.autograd.vjp(self.fun,
-                                paddle.to_tensor(self.xs),
+            paddle.autograd.vjp(self.fun, paddle.to_tensor(self.xs),
                                 paddle.to_tensor(self.v))
 
     def test_all_cases(self):
@@ -268,6 +271,7 @@ def jac(grad_fn, f, inputs):
 
 
 class TestJVP(TestAutogradFunctional):
+
     def func_jvp_i1o1(self):
         test_cases = [
             [reduce, 'A'],  # noqa
@@ -328,17 +332,19 @@ def test_all_cases(self):
     ('3d_in_3d_out', utils.square, np.random.rand(2, 3, 4)),
     ('single_in_single_out', utils.square, np.random.rand(2, 3)),
     ('multi_in_single_out', paddle.matmul,
-     (np.random.rand(2, 2), np.random.rand(2, 2))), ))
+     (np.random.rand(2, 2), np.random.rand(2, 2))),
+))
 class TestJacobianClassNoBatch(unittest.TestCase):
+
     def setUp(self):
         self._dtype = self.xs[0].dtype if isinstance(
             self.xs, typing.Sequence) else self.xs.dtype
-        self._eps = config.TOLERANCE.get(str(self._dtype)).get(
-            "first_order_grad").get("eps")
-        self._rtol = config.TOLERANCE.get(str(self._dtype)).get(
-            "first_order_grad").get("rtol")
-        self._atol = config.TOLERANCE.get(str(self._dtype)).get(
-            "first_order_grad").get("atol")
+        self._eps = config.TOLERANCE.get(str(
+            self._dtype)).get("first_order_grad").get("eps")
+        self._rtol = config.TOLERANCE.get(str(
+            self._dtype)).get("first_order_grad").get("rtol")
+        self._atol = config.TOLERANCE.get(str(
+            self._dtype)).get("first_order_grad").get("atol")
 
     def func_jacobian(self):
         xs = [paddle.to_tensor(x) for x in self.xs] if isinstance(
@@ -358,7 +364,8 @@ def func_jacobian(self):
                 self._expected.__getitem__(index.value),
                 rtol=self._rtol,
                 atol=self._atol,
-                err_msg=f'Testcase {index.type} index not passed, value is {index.value}'
+                err_msg=
+                f'Testcase {index.type} index not passed, value is {index.value}'
             )
 
     def _get_expected(self):
@@ -378,17 +385,19 @@ def test_all_cases(self):
 @utils.parameterize((utils.TEST_CASE_NAME, 'func', 'xs'), (
     ('1d_in_1d_out', utils.square, np.array([[1., 2., 3.], [3., 4., 3.]])),
     ('3d_in_3d_out', utils.square, np.random.rand(2, 3, 4)),
-    ('multi_in_single_out', utils.square, np.random.rand(2, 3)), ))
+    ('multi_in_single_out', utils.square, np.random.rand(2, 3)),
+))
 class TestJacobianClassBatchFirst(unittest.TestCase):
+
     def setUp(self):
         self._dtype = self.xs[0].dtype if isinstance(
             self.xs, typing.Sequence) else self.xs.dtype
-        self._eps = config.TOLERANCE.get(str(self._dtype)).get(
-            "first_order_grad").get("eps")
-        self._rtol = config.TOLERANCE.get(str(self._dtype)).get(
-            "first_order_grad").get("rtol")
-        self._atol = config.TOLERANCE.get(str(self._dtype)).get(
-            "first_order_grad").get("atol")
+        self._eps = config.TOLERANCE.get(str(
+            self._dtype)).get("first_order_grad").get("eps")
+        self._rtol = config.TOLERANCE.get(str(
+            self._dtype)).get("first_order_grad").get("rtol")
+        self._atol = config.TOLERANCE.get(str(
+            self._dtype)).get("first_order_grad").get("atol")
 
     def func_jacobian(self):
         xs = [paddle.to_tensor(x) for x in self.xs] if isinstance(
@@ -397,16 +406,18 @@ def func_jacobian(self):
         self._expected = self._get_expected()
 
         Index = collections.namedtuple('Index', ('type', 'value'))
-        indexes = (
-            Index('all', (slice(0, None, None), slice(0, None, None),
-                          slice(0, None, None))),
-            Index('row', (slice(0, None, None), 0, slice(0, None, None))),
-            Index('col',
-                  (slice(0, None, None), slice(0, None, None), 0)), Index(
-                      'batch', (slice(0, 2, None), slice(0, None, None),
-                                slice(0, None, None))),
-            Index('multi_row',
-                  (slice(0, 1, None), slice(0, 2, 1), slice(0, None, None))))
+        indexes = (Index(
+            'all',
+            (slice(0, None, None), slice(0, None, None), slice(0, None, None))),
+                   Index('row',
+                         (slice(0, None, None), 0, slice(0, None, None))),
+                   Index('col',
+                         (slice(0, None, None), slice(0, None, None), 0)),
+                   Index('batch', (slice(0, 2, None), slice(
+                       0, None, None), slice(0, None, None))),
+                   Index('multi_row',
+                         (slice(0, 1, None), slice(0, 2, 1), slice(
+                             0, None, None))))
         self.assertEqual(self._actual[:].numpy().dtype, self._expected.dtype)
         for index in indexes:
             np.testing.assert_allclose(
@@ -414,7 +425,8 @@ def func_jacobian(self):
                 self._expected.__getitem__(index.value),
                 rtol=self._rtol,
                 atol=self._atol,
-                err_msg=f'Testcase {index.type} index not passed, value is {index.value}'
+                err_msg=
+                f'Testcase {index.type} index not passed, value is {index.value}'
             )
 
     def _get_expected(self):
@@ -433,21 +445,23 @@ def test_all_cases(self):
 
 
 class TestHessianClassNoBatch(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.shape = (2, 2)
         self.dtype = 'float32'
         self.np_dtype = np.float32
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("eps")
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("rtol")
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("atol")
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("eps")
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("rtol")
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("atol")
         self.x = paddle.rand(shape=self.shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.shape, dtype=self.dtype)
 
     def func_single_input(self):
+
         def func(x):
             return paddle.sum(paddle.matmul(x, x))
 
@@ -461,6 +475,7 @@ def func(x):
                                    self.rtol, self.atol)
 
     def func_multi_input(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, y))
 
@@ -470,13 +485,13 @@ def func(x, y):
         self.x.stop_gradient = False
         self.y.stop_gradient = False
         hessian = paddle.autograd.Hessian(func, [self.x, self.y])
-        np.testing.assert_allclose(
-            hessian[:].numpy(),
-            numerical_hessian,
-            rtol=self.rtol,
-            atol=self.atol)
+        np.testing.assert_allclose(hessian[:].numpy(),
+                                   numerical_hessian,
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
     def func_allow_unused_true(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, x))
 
@@ -490,6 +505,8 @@ def func(x, y):
                                    self.rtol, self.atol)
 
     def func_create_graph_true(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
         def func(x):
             return paddle.sum(F.sigmoid(x))
 
@@ -501,8 +518,10 @@ def func(x):
         assert hessian[:].stop_gradient == False
         np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian,
                                    self.rtol, self.atol)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_out_not_single(self):
+
         def func(x):
             return x * x
 
@@ -526,6 +545,7 @@ def test_all_cases(self):
 
 
 class TestHessianClassBatchFirst(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.x_shape = (5, 2)
@@ -534,17 +554,18 @@ def setUpClass(self):
         self.nbatch, self.nrow = 5, 2
         self.dtype = 'float32'
         self.np_dtype = np.float32
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('eps')
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('rtol')
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('atol')
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('eps')
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('rtol')
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('atol')
         self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype)
         self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype)
 
     def func_single_input(self):
+
         def func(x):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -552,13 +573,15 @@ def func(x):
             func, self.x, self.numerical_delta, self.np_dtype)
 
         H = paddle.autograd.Hessian(func, self.x, is_batched=True)
-        actual = utils._np_transpose_matrix_format(
-            H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM)
+        actual = utils._np_transpose_matrix_format(H[:].numpy(),
+                                                   utils.MatrixFormat.BNM,
+                                                   utils.MatrixFormat.NBM)
         actual = actual.reshape((H.shape[1], -1))
 
         np.testing.assert_allclose(actual, expected, self.rtol, self.atol)
 
     def func_multi_input(self):
+
         def func(x, y):
             return paddle.matmul(x * x * y * y, self.weight)[:, 0:1]
 
@@ -574,12 +597,14 @@ def func(x, y):
         self.x.stop_gradient = False
         self.y.stop_gradient = False
         H = paddle.autograd.Hessian(func, [self.x, self.y], is_batched=True)
-        actual = utils._np_transpose_matrix_format(
-            H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM)
+        actual = utils._np_transpose_matrix_format(H[:].numpy(),
+                                                   utils.MatrixFormat.BNM,
+                                                   utils.MatrixFormat.NBM)
 
         np.testing.assert_allclose(actual, expected, self.rtol, self.atol)
 
     def func_allow_unused(self):
+
         def func(x, y):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -591,16 +616,20 @@ def func(x, y):
             (xs_len, xs_len, self.nrow, self.nbatch, self.nrow))
         expected = [[n for n in row] for row in expected]
         expected = utils._np_concat_matrix_sequence(expected)
-        expected = utils._np_transpose_matrix_format(
-            expected, utils.MatrixFormat.NBM, utils.MatrixFormat.BNM)
+        expected = utils._np_transpose_matrix_format(expected,
+                                                     utils.MatrixFormat.NBM,
+                                                     utils.MatrixFormat.BNM)
 
-        actual = paddle.autograd.Hessian(
-            func, [self.x, self.y], is_batched=True)[:]
+        actual = paddle.autograd.Hessian(func, [self.x, self.y],
+                                         is_batched=True)[:]
 
-        np.testing.assert_allclose(
-            actual, expected, rtol=self.rtol, atol=self.atol)
+        np.testing.assert_allclose(actual,
+                                   expected,
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
     def func_stop_gradient(self):
+
         def func(x):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -610,13 +639,15 @@ def func(x):
         x = self.x.clone()
         x.stop_gradient = True
         H = paddle.autograd.Hessian(func, self.x, is_batched=True)[:]
-        actual = utils._np_transpose_matrix_format(
-            H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM)
+        actual = utils._np_transpose_matrix_format(H[:].numpy(),
+                                                   utils.MatrixFormat.BNM,
+                                                   utils.MatrixFormat.NBM)
         actual = actual.reshape((H.shape[1], -1))
 
         np.testing.assert_allclose(actual, expected, self.rtol, self.atol)
 
     def func_out_not_single(self):
+
         def func(x):
             return (x * x)
 
@@ -640,27 +671,30 @@ def test_all_cases(self):
 
 
 class TestHessian(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.shape = (2, 2)
         self.dtype = 'float32'
         self.np_dtype = np.float32
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("eps")
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("rtol")
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("atol")
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("eps")
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("rtol")
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("atol")
 
         self.x = paddle.rand(shape=self.shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.shape, dtype=self.dtype)
 
     def func_single_input(self):
+
         def func(x):
             return paddle.sum(paddle.matmul(x, x))
 
-        numerical_hessian = _compute_numerical_hessian(
-            func, self.x, self.numerical_delta, self.np_dtype)
+        numerical_hessian = _compute_numerical_hessian(func, self.x,
+                                                       self.numerical_delta,
+                                                       self.np_dtype)
 
         self.x.stop_gradient = False
         hessian = paddle.autograd.hessian(func, self.x)
@@ -668,11 +702,13 @@ def func(x):
                                    self.rtol, self.atol)
 
     def func_multi_input(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, y))
 
-        numerical_hessian = _compute_numerical_hessian(
-            func, [self.x, self.y], self.numerical_delta, self.np_dtype)
+        numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y],
+                                                       self.numerical_delta,
+                                                       self.np_dtype)
 
         self.x.stop_gradient = False
         self.y.stop_gradient = False
@@ -684,6 +720,7 @@ def func(x, y):
                                            self.atol)
 
     def func_allow_unused_false(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, x))
 
@@ -696,15 +733,17 @@ def func(x, y):
             assert error_msg.find("allow_unused") > 0
 
     def func_allow_unused_true(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, x))
 
-        numerical_hessian = _compute_numerical_hessian(
-            func, [self.x, self.y], self.numerical_delta, self.np_dtype)
+        numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y],
+                                                       self.numerical_delta,
+                                                       self.np_dtype)
         self.x.stop_gradient = False
         self.y.stop_gradient = False
-        hessian = paddle.autograd.hessian(
-            func, [self.x, self.y], allow_unused=True)
+        hessian = paddle.autograd.hessian(func, [self.x, self.y],
+                                          allow_unused=True)
         for i in range(len(hessian)):
             for j in range(len(hessian[0])):
                 if i == j == 0:
@@ -715,11 +754,13 @@ def func(x, y):
                     assert hessian[i][j] is None
 
     def func_create_graph_false(self):
+
         def func(x):
             return paddle.sum(paddle.matmul(x, x))
 
-        numerical_hessian = _compute_numerical_hessian(
-            func, self.x, self.numerical_delta, self.np_dtype)
+        numerical_hessian = _compute_numerical_hessian(func, self.x,
+                                                       self.numerical_delta,
+                                                       self.np_dtype)
         self.x.stop_gradient = False
         hessian = paddle.autograd.hessian(func, self.x)
         assert hessian.stop_gradient == True
@@ -733,11 +774,14 @@ def func(x):
                 "does not appear") > 0
 
     def func_create_graph_true(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
         def func(x):
             return paddle.sum(F.sigmoid(x))
 
-        numerical_hessian = _compute_numerical_hessian(
-            func, self.x, self.numerical_delta, self.np_dtype)
+        numerical_hessian = _compute_numerical_hessian(func, self.x,
+                                                       self.numerical_delta,
+                                                       self.np_dtype)
         self.x.stop_gradient = False
         hessian = paddle.autograd.hessian(func, self.x, create_graph=True)
         assert hessian.stop_gradient == False
@@ -745,6 +789,7 @@ def func(x):
                                    self.rtol, self.atol)
         triple_grad = paddle.grad(hessian, self.x)
         assert triple_grad is not None
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_all_cases(self):
         with _test_eager_guard():
@@ -765,22 +810,24 @@ def test_all_cases(self):
 
 
 class TestHessianFloat64(TestHessian):
+
     @classmethod
     def setUpClass(self):
         self.shape = (2, 2)
         self.dtype = 'float64'
         self.np_dtype = np.float64
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("eps")
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("rtol")
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("atol")
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("eps")
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("rtol")
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("atol")
         self.x = paddle.rand(shape=self.shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.shape, dtype=self.dtype)
 
 
 class TestBatchHessian(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.x_shape = (5, 2)
@@ -788,17 +835,18 @@ def setUpClass(self):
         self.y_shape = (5, 2)
         self.dtype = 'float32'
         self.np_dtype = np.float32
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("eps")
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("rtol")
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("atol")
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("eps")
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("rtol")
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("atol")
         self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype)
         self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype)
 
     def func_single_input(self):
+
         def func(x):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -810,6 +858,7 @@ def func(x):
                                    self.atol)
 
     def func_multi_input(self):
+
         def func(x, y):
             return paddle.matmul(x * x * y * y, self.weight)[:, 0:1]
 
@@ -826,6 +875,7 @@ def func(x, y):
                                    self.rtol, self.atol)
 
     def func_allow_unused_false(self):
+
         def func(x, y):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -838,6 +888,7 @@ def func(x, y):
             assert error_msg.find("allow_unused") > 0
 
     def func_allow_unused_true(self):
+
         def func(x, y):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -845,8 +896,8 @@ def func(x, y):
             func, [self.x, self.y], self.numerical_delta, self.np_dtype)
         self.x.stop_gradient = False
         self.y.stop_gradient = False
-        hessian = paddle.autograd.batch_hessian(
-            func, [self.x, self.y], allow_unused=True)
+        hessian = paddle.autograd.batch_hessian(func, [self.x, self.y],
+                                                allow_unused=True)
 
         for i in range(len(hessian)):
             for j in range(len(hessian[0])):
@@ -860,6 +911,7 @@ def func(x, y):
                     assert hessian[i][j] is None
 
     def func_create_graph_false(self):
+
         def func(x):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -878,6 +930,7 @@ def func(x):
                 "does not appear") > 0
 
     def func_create_graph_true(self):
+
         def func(x):
             return paddle.matmul(x * x, self.weight)[:, 0:1]
 
@@ -910,6 +963,7 @@ def test_all_cases(self):
 
 
 class TestBatchHessianFloat64(TestBatchHessian):
+
     @classmethod
     def setUpClass(self):
         self.x_shape = (5, 2)
@@ -917,41 +971,44 @@ def setUpClass(self):
         self.y_shape = (5, 2)
         self.dtype = 'float64'
         self.np_dtype = np.float64
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("eps")
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("rtol")
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("atol")
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("eps")
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("rtol")
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("atol")
         self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype)
         self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype)
 
 
 class TestVHP(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.shape = (2, 2)
         self.dtype = 'float32'
         self.np_dtype = np.float32
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("eps")
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("rtol")
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            "second_order_grad").get("atol")
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("eps")
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("rtol")
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get("second_order_grad").get("atol")
         self.x = paddle.rand(shape=self.shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.shape, dtype=self.dtype)
         self.vx = paddle.rand(shape=self.shape, dtype=self.dtype)
         self.vy = paddle.rand(shape=self.shape, dtype=self.dtype)
 
     def func_single_input(self):
+
         def func(x):
             return paddle.sum(paddle.matmul(x, x))
 
         numerical_func_output = func(self.x).numpy()
-        numerical_vhp = _compute_numerical_vhp(
-            func, self.x, self.vx, self.numerical_delta, self.np_dtype)
+        numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx,
+                                               self.numerical_delta,
+                                               self.np_dtype)
 
         self.x.stop_gradient = False
         func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx)
@@ -961,13 +1018,15 @@ def func(x):
                                    self.atol)
 
     def func_multi_input(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, y))
 
         numerical_func_output = func(self.x, self.y).numpy()
-        numerical_vhp = _compute_numerical_vhp(
-            func, [self.x, self.y], [self.vx, self.vy], self.numerical_delta,
-            self.np_dtype)
+        numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y],
+                                               [self.vx, self.vy],
+                                               self.numerical_delta,
+                                               self.np_dtype)
 
         self.x.stop_gradient = False
         self.y.stop_gradient = False
@@ -980,14 +1039,15 @@ def func(x, y):
                                        self.rtol, self.atol)
 
     def func_v_default(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, y))
 
         numerical_func_output = func(self.x, self.y).numpy()
         vx = paddle.ones(self.vx.shape, dtype=self.vx.dtype)
         vy = paddle.ones(self.vy.shape, dtype=self.vy.dtype)
-        numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y],
-                                               [vx, vy], self.numerical_delta,
+        numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], [vx, vy],
+                                               self.numerical_delta,
                                                self.np_dtype)
 
         self.x.stop_gradient = False
@@ -1000,13 +1060,15 @@ def func(x, y):
                                        self.rtol, self.atol)
 
     def func_allow_unused_true(self):
+
         def func(x, y):
             return paddle.sum(paddle.matmul(x, x))
 
         numerical_func_output = func(self.x, self.y).numpy()
-        numerical_vhp = _compute_numerical_vhp(
-            func, [self.x, self.y], [self.vx, self.vy], self.numerical_delta,
-            self.np_dtype)
+        numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y],
+                                               [self.vx, self.vy],
+                                               self.numerical_delta,
+                                               self.np_dtype)
 
         self.x.stop_gradient = False
         self.y.stop_gradient = False
@@ -1018,12 +1080,15 @@ def func(x, y):
                                    self.atol)
 
     def func_create_graph_true(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
         def func(x):
             return paddle.sum(F.sigmoid(x))
 
         numerical_func_output = func(self.x).numpy()
-        numerical_vhp = _compute_numerical_vhp(
-            func, self.x, self.vx, self.numerical_delta, self.np_dtype)
+        numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx,
+                                               self.numerical_delta,
+                                               self.np_dtype)
 
         self.x.stop_gradient = False
         func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx)
@@ -1034,6 +1099,7 @@ def func(x):
                                    self.atol)
         triple_grad = paddle.grad(vhp, self.x)
         assert triple_grad is not None
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_all_cases(self):
         with _test_eager_guard():
@@ -1052,6 +1118,7 @@ def test_all_cases(self):
 
 
 class TestJacobian(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.shape = (4, 4)
@@ -1064,6 +1131,7 @@ def setUpClass(self):
         self.y = paddle.rand(shape=self.shape, dtype=self.dtype)
 
     def func_single_input_and_single_output(self):
+
         def func(x):
             return paddle.matmul(x, x)
 
@@ -1075,6 +1143,7 @@ def func(x):
                                    self.rtol, self.atol)
 
     def func_single_input_and_multi_output(self):
+
         def func(x):
             return paddle.matmul(x, x), x * x
 
@@ -1088,6 +1157,7 @@ def func(x):
                                        self.atol)
 
     def func_multi_input_and_single_output(self):
+
         def func(x, y):
             return paddle.matmul(x, y)
 
@@ -1102,6 +1172,8 @@ def func(x, y):
                                        self.atol)
 
     def func_multi_input_and_multi_output(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
         def func(x, y):
             return paddle.matmul(x, y), x * y
 
@@ -1115,8 +1187,10 @@ def func(x, y):
                 np.testing.assert_allclose(jacobian[i][j].numpy(),
                                            numerical_jacobian[i][j], self.rtol,
                                            self.atol)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_allow_unused_false(self):
+
         def func(x, y):
             return paddle.matmul(x, x)
 
@@ -1129,6 +1203,7 @@ def func(x, y):
             assert error_msg.find("allow_unused") > 0
 
     def func_allow_unused_true(self):
+
         def func(x, y):
             return paddle.matmul(x, x)
 
@@ -1136,13 +1211,15 @@ def func(x, y):
             func, [self.x, self.y], self.numerical_delta, self.np_dtype)
         self.x.stop_gradient = False
         self.y.stop_gradient = False
-        jacobian = paddle.autograd.jacobian(
-            func, [self.x, self.y], allow_unused=True)
-        np.testing.assert_allclose(
-            jacobian[0].numpy(), numerical_jacobian[0][0], self.rtol, self.atol)
+        jacobian = paddle.autograd.jacobian(func, [self.x, self.y],
+                                            allow_unused=True)
+        np.testing.assert_allclose(jacobian[0].numpy(),
+                                   numerical_jacobian[0][0], self.rtol,
+                                   self.atol)
         assert jacobian[1] is None
 
     def func_create_graph_false(self):
+
         def func(x, y):
             return paddle.matmul(x, y)
 
@@ -1164,6 +1241,7 @@ def func(x, y):
                 "does not appear") > 0
 
     def func_create_graph_true(self):
+
         def func(x, y):
             return paddle.matmul(x, y)
 
@@ -1171,8 +1249,8 @@ def func(x, y):
             func, [self.x, self.y], self.numerical_delta, self.np_dtype)
         self.x.stop_gradient = False
         self.y.stop_gradient = False
-        jacobian = paddle.autograd.jacobian(
-            func, [self.x, self.y], create_graph=True)
+        jacobian = paddle.autograd.jacobian(func, [self.x, self.y],
+                                            create_graph=True)
         for j in range(len(jacobian)):
             assert jacobian[j].stop_gradient == False
             np.testing.assert_allclose(jacobian[j].numpy(),
@@ -1204,6 +1282,7 @@ def test_all_cases(self):
 
 
 class TestJacobianFloat64(TestJacobian):
+
     @classmethod
     def setUpClass(self):
         self.shape = (4, 4)
@@ -1217,6 +1296,7 @@ def setUpClass(self):
 
 
 class TestJacobianBatch(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.x_shape = (4, 2)
@@ -1232,6 +1312,7 @@ def setUpClass(self):
         self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype)
 
     def func_batch_single_input_and_batch_single_output(self):
+
         def func(x):
             return paddle.matmul(paddle.matmul(x, self.weight), self.y)
 
@@ -1241,13 +1322,15 @@ def func(x):
         self.x.stop_gradient = False
         batch_jacobian = paddle.autograd.batch_jacobian(
             func,
-            self.x, )
+            self.x,
+        )
 
         self.assertTrue(
-            np.allclose(batch_jacobian.numpy().all(), numerical_jacobian[0][0]
-                        .all()))
+            np.allclose(batch_jacobian.numpy().all(),
+                        numerical_jacobian[0][0].all()))
 
     def func_batch_single_input_and_batch_multi_output(self):
+
         def func(x):
             return paddle.matmul(paddle.matmul(x, self.weight), self.y), x * x
 
@@ -1257,7 +1340,8 @@ def func(x):
         self.x.stop_gradient = False
         batch_jacobian = paddle.autograd.batch_jacobian(
             func,
-            self.x, )
+            self.x,
+        )
 
         for i in range(len(batch_jacobian)):
             np.testing.assert_allclose(batch_jacobian[i].numpy(),
@@ -1265,6 +1349,7 @@ def func(x):
                                        self.atol)
 
     def func_batch_multi_input_and_batch_single_output(self):
+
         def func(x, y):
             return x * y
 
@@ -1281,6 +1366,7 @@ def func(x, y):
                                        self.atol)
 
     def func_batch_multi_input_and_batch_multi_output(self):
+
         def func(x, y):
             return x * y, x * y
 
@@ -1296,6 +1382,7 @@ def func(x, y):
                                        self.rtol, self.atol)
 
     def func_allow_unused_false(self):
+
         def func(x, y):
             return x * x
 
@@ -1308,6 +1395,7 @@ def func(x, y):
             assert error_msg.find("allow_unused") > 0
 
     def func_allow_unused_true(self):
+
         def func(x, y):
             return x * x
 
@@ -1315,14 +1403,16 @@ def func(x, y):
             func, [self.x, self.y], self.numerical_delta, self.np_dtype)
         self.x.stop_gradient = False
         self.y.stop_gradient = False
-        jacobian = paddle.autograd.batch_jacobian(
-            func, [self.x, self.y], allow_unused=True)
+        jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y],
+                                                  allow_unused=True)
 
-        np.testing.assert_allclose(
-            jacobian[0].numpy(), numerical_jacobian[0][0], self.rtol, self.atol)
+        np.testing.assert_allclose(jacobian[0].numpy(),
+                                   numerical_jacobian[0][0], self.rtol,
+                                   self.atol)
         assert jacobian[1] is None
 
     def func_create_graph_false(self):
+
         def func(x, y):
             return x * y
 
@@ -1344,6 +1434,7 @@ def func(x, y):
                 "does not appear") > 0
 
     def func_create_graph_true(self):
+
         def func(x, y):
             return x * y
 
@@ -1351,8 +1442,8 @@ def func(x, y):
             func, [self.x, self.y], self.numerical_delta, self.np_dtype)
         self.x.stop_gradient = False
         self.y.stop_gradient = False
-        jacobian = paddle.autograd.batch_jacobian(
-            func, [self.x, self.y], create_graph=True)
+        jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y],
+                                                  create_graph=True)
         for j in range(len(jacobian)):
             assert jacobian[j].stop_gradient == False
             np.testing.assert_allclose(jacobian[j].numpy(),
@@ -1384,6 +1475,7 @@ def test_all_cases(self):
 
 
 class TestJacobianBatchFloat64(TestJacobianBatch):
+
     @classmethod
     def setUpClass(self):
         self.x_shape = (12, 2)
@@ -1391,12 +1483,12 @@ def setUpClass(self):
         self.y_shape = (12, 2)
         self.dtype = 'float64'
         self.np_dtype = np.float64
-        self.numerical_delta = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('eps')
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('rtol')
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('atol')
+        self.numerical_delta = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('eps')
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('rtol')
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('atol')
         self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype)
         self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype)
         self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py
index 8801664fdca9a..06d3bb5eb2495 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -32,24 +32,26 @@
 @utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'stop_gradient'), (
     ('tensor_input', utils.reduce, np.random.rand(2, 3), None, False),
     ('tensor_sequence_input', utils.reduce, np.random.rand(2, 3), None, False),
-    ('v_not_none', utils.reduce, np.random.rand(2, 3), np.random.rand(1),
-     False),
-    ('xs_stop_gradient', utils.reduce, np.random.rand(2, 3), np.random.rand(1),
-     True),
-    ('func_mutmul', utils.matmul, (np.random.rand(3, 2), np.random.rand(2, 3)),
-     None, False),
-    ('func_mul', utils.mul, (np.random.rand(3, 3), np.random.rand(3, 3)), None,
-     False),
-    ('func_out_two', utils.o2, (np.random.rand(10), np.random.rand(10)), None,
-     False), ))
+    ('v_not_none', utils.reduce, np.random.rand(2,
+                                                3), np.random.rand(1), False),
+    ('xs_stop_gradient', utils.reduce, np.random.rand(
+        2, 3), np.random.rand(1), True),
+    ('func_mutmul', utils.matmul,
+     (np.random.rand(3, 2), np.random.rand(2, 3)), None, False),
+    ('func_mul', utils.mul,
+     (np.random.rand(3, 3), np.random.rand(3, 3)), None, False),
+    ('func_out_two', utils.o2,
+     (np.random.rand(10), np.random.rand(10)), None, False),
+))
 class TestVJP(unittest.TestCase):
+
     def setUp(self):
         self.dtype = str(self.xs[0].dtype) if isinstance(
             self.xs, typing.Sequence) else str(self.xs.dtype)
-        self._rtol = config.TOLERANCE.get(str(self.dtype)).get(
-            "first_order_grad").get("rtol")
-        self._atol = config.TOLERANCE.get(str(self.dtype)).get(
-            "first_order_grad").get("atol")
+        self._rtol = config.TOLERANCE.get(str(
+            self.dtype)).get("first_order_grad").get("rtol")
+        self._atol = config.TOLERANCE.get(str(
+            self.dtype)).get("first_order_grad").get("atol")
 
     def _vjp(self):
         exe = paddle.static.Executor()
@@ -67,8 +69,8 @@ def _expected_vjp(self):
         sp = paddle.static.Program()
         mp = paddle.static.Program()
         with paddle.static.program_guard(mp, sp):
-            feed, static_xs, static_v = gen_static_data_and_feed(self.xs,
-                                                                 self.v, False)
+            feed, static_xs, static_v = gen_static_data_and_feed(
+                self.xs, self.v, False)
             ys = self.fun(*static_xs) if isinstance(
                 static_xs, typing.Sequence) else self.fun(static_xs)
             xs_grads = paddle.static.gradients(ys, static_xs, static_v)
@@ -80,16 +82,19 @@ def test_vjp(self):
         expected = self._expected_vjp()
         self.assertEqual(len(actual), len(expected))
         for i in range(len(actual)):
-            np.testing.assert_allclose(
-                actual[i], expected[i], rtol=self._rtol, atol=self._atol)
+            np.testing.assert_allclose(actual[i],
+                                       expected[i],
+                                       rtol=self._rtol,
+                                       atol=self._atol)
 
 
 @utils.place(config.DEVICES)
 @utils.parameterize(
-    (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'expected_exception'), (
-        ('v_shape_not_equal_ys', utils.square, np.random.rand(3),
-         np.random.rand(1), RuntimeError), ))
+    (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'expected_exception'),
+    (('v_shape_not_equal_ys', utils.square, np.random.rand(3),
+      np.random.rand(1), RuntimeError), ))
 class TestVJPException(unittest.TestCase):
+
     def setUp(self):
         self.exe = paddle.static.Executor()
 
@@ -97,8 +102,8 @@ def _vjp(self):
         sp = paddle.static.Program()
         mp = paddle.static.Program()
         with paddle.static.program_guard(mp, sp):
-            feed, static_xs, static_v = gen_static_data_and_feed(self.xs,
-                                                                 self.v)
+            feed, static_xs, static_v = gen_static_data_and_feed(
+                self.xs, self.v)
             ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v)
         self.exe.run(sp)
         return self.exe.run(mp, feed, fetch_list=[ys, xs_grads])
@@ -194,8 +199,7 @@ def _f(x):
 def make_tensors(inps):
     if isinstance(inps, list):
         xs = [
-            paddle.static.data(
-                f'x{i}', inp.shape, dtype=inp.dtype)
+            paddle.static.data(f'x{i}', inp.shape, dtype=inp.dtype)
             for i, inp in enumerate(inps)
         ]
     else:
@@ -218,6 +222,7 @@ def prepare_data(test, input_shapes, dtype):
 
 
 class TestJacobianFloat32(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         paddle.enable_static()
@@ -228,8 +233,8 @@ def setUpClass(self):
         self.dtype = 'float32'
         self.np_dtype = np.float32
         prepare_data(self, all_data_shapes, self.dtype)
-        self.eps = config.TOLERANCE.get(self.dtype).get('first_order_grad').get(
-            'eps')
+        self.eps = config.TOLERANCE.get(
+            self.dtype).get('first_order_grad').get('eps')
         # self.rtol = config.TOLERANCE.get(self.dtype).get('first_order_grad').get('rtol')
         # self.atol = config.TOLERANCE.get(self.dtype).get('first_order_grad').get('atol')
         # Do't use tolerance in config, which will cause this test case failed.
@@ -254,8 +259,11 @@ def run_test_by_fullmatrix(self, pd_f, np_f, inps, batch=False):
         else:
             feeds = {'x': inps}
         pd_jacobians = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0]
-        np_jacobians = approx_jacobian(
-            np_f, inps, self.dtype, self.eps, batch=batch)
+        np_jacobians = approx_jacobian(np_f,
+                                       inps,
+                                       self.dtype,
+                                       self.eps,
+                                       batch=batch)
         if batch:
             np_jacobians = utils._np_transpose_matrix_format(
                 np_jacobians, utils.MatrixFormat.NBM, utils.MatrixFormat.BNM)
@@ -317,6 +325,7 @@ def run_test_by_entries(self, pd_f, np_f, inps, batch=False):
             np.testing.assert_allclose(pd_entry, np_entry, self.rtol, self.atol)
 
     def test_square(self):
+
         def pd_f(x):
             return paddle.multiply(x, x)
 
@@ -328,6 +337,7 @@ def np_f(x):
         self.run_test_by_entries(pd_f, np_f, self.A)
 
     def test_mul(self):
+
         def pd_f(x, y):
             return paddle.multiply(x, y)
 
@@ -338,11 +348,13 @@ def np_f(xs):
         self.run_test_by_fullmatrix(
             pd_f,
             np_f,
-            [self.B, self.C], )
+            [self.B, self.C],
+        )
         self.run_test_by_rows(pd_f, np_f, [self.B, self.C])
         self.run_test_by_entries(pd_f, np_f, [self.B, self.C])
 
     def test_matmul(self):
+
         def pd_f(x, y):
             return paddle.matmul(x, y)
 
@@ -355,6 +367,7 @@ def np_f(xs):
         self.run_test_by_entries(pd_f, np_f, [self.B, self.C])
 
     def test_batch_matmul(self):
+
         def pd_f(x, y):
             return paddle.matmul(x, y)
 
@@ -368,6 +381,7 @@ def np_f(xs):
 
 
 class TestJacobianFloat64(TestJacobianFloat32):
+
     @classmethod
     def setUpClass(self):
         paddle.enable_static()
@@ -377,15 +391,16 @@ def setUpClass(self):
             self.place = fluid.CPUPlace()
         self.dtype = 'float64'
         prepare_data(self, all_data_shapes, self.dtype)
-        self.eps = config.TOLERANCE.get(self.dtype).get('first_order_grad').get(
-            'eps')
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            'first_order_grad').get('rtol')
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            'first_order_grad').get('atol')
+        self.eps = config.TOLERANCE.get(
+            self.dtype).get('first_order_grad').get('eps')
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get('first_order_grad').get('rtol')
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get('first_order_grad').get('atol')
 
 
 class TestHessianFloat32(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         paddle.enable_static()
@@ -395,12 +410,12 @@ def setUpClass(self):
             self.place = fluid.CPUPlace()
         self.dtype = 'float32'
         prepare_data(self, all_data_shapes, self.dtype)
-        self.eps = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('eps')
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('rtol')
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('atol')
+        self.eps = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('eps')
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('rtol')
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('atol')
 
     def run_test_by_fullmatrix(self, pd_f, inps, np_hess, batch=False):
         main = fluid.Program()
@@ -420,6 +435,7 @@ def run_test_by_fullmatrix(self, pd_f, inps, np_hess, batch=False):
         np.testing.assert_allclose(pd_hess, np_hess, self.rtol, self.atol)
 
     def test_square(self):
+
         def pd_f(x):
             """Input is a square matrix."""
             return paddle.matmul(x, x.T).flatten().sum()
@@ -434,6 +450,7 @@ def np_hess(x):
 
 
 class TestHessianFloat64(TestHessianFloat32):
+
     @classmethod
     def setUpClass(self):
         paddle.enable_static()
@@ -443,12 +460,12 @@ def setUpClass(self):
             self.place = fluid.CPUPlace()
         self.dtype = 'float64'
         prepare_data(self, all_data_shapes, self.dtype)
-        self.eps = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('eps')
-        self.rtol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('rtol')
-        self.atol = config.TOLERANCE.get(self.dtype).get(
-            'second_order_grad').get('atol')
+        self.eps = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('eps')
+        self.rtol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('rtol')
+        self.atol = config.TOLERANCE.get(
+            self.dtype).get('second_order_grad').get('atol')
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py b/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py
index 092ddb4094d03..67ebe01d9f027 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py
@@ -23,6 +23,7 @@
 
 
 class TestGradients(unittest.TestCase):
+
     def test_third_order(self):
         enable_prim()
         main = paddle.static.Program()
@@ -71,7 +72,9 @@ def test_fourth_order(self):
 
             prim2orig(main.block(0))
 
-        feed = {x.name: np.array([2.]).astype('float32'), }
+        feed = {
+            x.name: np.array([2.]).astype('float32'),
+        }
         fetch_list = [grad4.name]
         # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5)
         result = [np.array([-0.27263762711])]
@@ -87,6 +90,7 @@ def test_fourth_order(self):
 
 
 class TestMinimize(unittest.TestCase):
+
     def model(self, x, w, bias, opt):
         paddle.seed(0)
         place = paddle.CPUPlace()
@@ -98,10 +102,12 @@ def model(self, x, w, bias, opt):
         with paddle.static.program_guard(main, startup):
             input_x = paddle.static.data('x', x.shape, dtype=x.dtype)
             input_x.stop_gradient = False
-            params_w = paddle.static.create_parameter(
-                shape=w.shape, dtype=w.dtype, is_bias=False)
-            params_bias = paddle.static.create_parameter(
-                shape=bias.shape, dtype=bias.dtype, is_bias=True)
+            params_w = paddle.static.create_parameter(shape=w.shape,
+                                                      dtype=w.dtype,
+                                                      is_bias=False)
+            params_bias = paddle.static.create_parameter(shape=bias.shape,
+                                                         dtype=bias.dtype,
+                                                         is_bias=True)
             y = paddle.tanh(paddle.matmul(input_x, params_w) + params_bias)
             loss = paddle.norm(y, p=2)
             opt = opt
@@ -110,9 +116,11 @@ def model(self, x, w, bias, opt):
                 prim2orig(main.block(0))
         exe.run(startup)
         grads = exe.run(main,
-                        feed={'x': x,
-                              'w': w,
-                              'bias': bias},
+                        feed={
+                            'x': x,
+                            'w': w,
+                            'bias': bias
+                        },
                         fetch_list=grads)
         return grads
 
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py
index d6ff931a936a2..f99bb9074c921 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 ############################ Test linearize rules ############################
 class TestAddPJVPAndTranspose(unittest.TestCase):
+
     def setUp(self):
         self.main_program = paddle.static.Program()
         self.startup_program = paddle.static.Program()
@@ -68,11 +69,10 @@ def init_data(self):
     def test_op(self):
         with paddle.static.program_guard(self.main_program,
                                          self.startup_program):
-            op = self.layer_help.append_op(
-                type=self.op_type,
-                inputs=self.prim_input,
-                outputs=self.prim_output,
-                attrs=self.prim_attrs)
+            op = self.layer_help.append_op(type=self.op_type,
+                                           inputs=self.prim_input,
+                                           outputs=self.prim_output,
+                                           attrs=self.prim_attrs)
 
             jvp_out = _jvp(op, *self.jvp_args)
             jvp_out = flatten(jvp_out)
@@ -91,6 +91,7 @@ def test_op(self):
 
 
 class TestSubPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'sub_p'
@@ -127,6 +128,7 @@ def init_data(self):
 
 
 class TestMulPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'mul_p'
@@ -149,7 +151,9 @@ def init_data(self):
         check_dot = lambda v: v is X
         Z_BAR = paddle.static.data(name='Z_BAR', shape=[5, 6], dtype='int64')
         self.transpose_args = (check_dot, Z_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -164,6 +168,7 @@ def init_data(self):
 
 
 class TestDivPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'div_p'
@@ -186,7 +191,9 @@ def init_data(self):
         check_dot = lambda v: v is X
         Z_BAR = paddle.static.data(name='Z_BAR', shape=[5, 6], dtype='int64')
         self.transpose_args = (check_dot, Z_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -203,11 +210,14 @@ def init_data(self):
 
 
 class TestSqrtPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'sqrt_p'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
-        self.prim_input = {'X': X, }
+        self.prim_input = {
+            'X': X,
+        }
         self.prim_output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -232,11 +242,14 @@ def init_data(self):
 
 
 class TestTanhPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'tanh_p'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
-        self.prim_input = {'X': X, }
+        self.prim_input = {
+            'X': X,
+        }
         self.prim_output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -261,11 +274,14 @@ def init_data(self):
 
 
 class TestReshapePJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'reshape_p'
         X = paddle.static.data(name='X', shape=[8, 8], dtype='int64')
-        self.prim_input = {'X': X, }
+        self.prim_input = {
+            'X': X,
+        }
         self.prim_output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -281,7 +297,9 @@ def init_data(self):
         check_dot = lambda v: v is X
         Y_BAR = paddle.static.data(name='Y_BAR', shape=[2, 32], dtype='int64')
         self.transpose_args = (check_dot, Y_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -294,11 +312,14 @@ def init_data(self):
 
 
 class TestBroadcastPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'broadcast_p'
         X = paddle.static.data(name='X', shape=[10, 1], dtype='int64')
-        self.prim_input = {'X': X, }
+        self.prim_input = {
+            'X': X,
+        }
         self.prim_output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -312,10 +333,13 @@ def init_data(self):
 
         # Set transpose
         check_dot = lambda v: v is X
-        Y_BAR = paddle.static.data(
-            name='Y_BAR', shape=[2, 10, 7], dtype='int64')
+        Y_BAR = paddle.static.data(name='Y_BAR',
+                                   shape=[2, 10, 7],
+                                   dtype='int64')
         self.transpose_args = (check_dot, Y_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -329,11 +353,14 @@ def init_data(self):
 
 
 class TestTransposePJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'transpose_p'
         X = paddle.static.data(name='X', shape=[2, 3, 4, 5], dtype='int64')
-        self.prim_input = {'X': X, }
+        self.prim_input = {
+            'X': X,
+        }
         self.prim_output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -341,17 +368,21 @@ def init_data(self):
         self.prim_attrs = {'axis': [0, 2, 3, 1]}
 
         # Set JVP
-        X_DOT = paddle.static.data(
-            name='X_DOT', shape=[2, 3, 4, 5], dtype='int64')
+        X_DOT = paddle.static.data(name='X_DOT',
+                                   shape=[2, 3, 4, 5],
+                                   dtype='int64')
         self.jvp_args = (X_DOT, )
         self.jvp_out_shape_map = {0: self.prim_output['Y']}
 
         # Set transpose
         check_dot = lambda v: v is X
-        Y_BAR = paddle.static.data(
-            name='Y_BAR', shape=[2, 4, 5, 3], dtype='int64')
+        Y_BAR = paddle.static.data(name='Y_BAR',
+                                   shape=[2, 4, 5, 3],
+                                   dtype='int64')
         self.transpose_args = (check_dot, Y_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -364,11 +395,14 @@ def init_data(self):
 
 
 class TestSplitPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'split_p'
         X = paddle.static.data(name='X', shape=[2, 7, 10], dtype='int64')
-        self.prim_input = {'X': X, }
+        self.prim_input = {
+            'X': X,
+        }
         self.prim_output = {
             'YS': [
                 self.layer_help.create_variable_for_type_inference(
@@ -378,8 +412,9 @@ def init_data(self):
         self.prim_attrs = {'num_or_sections': [2, 3, 4, 1], 'axis': 2}
 
         # Set JVP
-        X_DOT = paddle.static.data(
-            name='X_DOT', shape=[2, 7, 10], dtype='int64')
+        X_DOT = paddle.static.data(name='X_DOT',
+                                   shape=[2, 7, 10],
+                                   dtype='int64')
         self.jvp_args = (X_DOT, )
         self.jvp_out_shape_map = {
             0: self.prim_output['YS'][0],
@@ -391,17 +426,15 @@ def init_data(self):
         # Set transpose
         check_dot = lambda v: v is X
         YS_BAR = [
-            paddle.static.data(
-                name='Y_BAR1', shape=[2, 7, 2], dtype='int64'),
-            paddle.static.data(
-                name='Y_BAR2', shape=[2, 7, 3], dtype='int64'),
-            paddle.static.data(
-                name='Y_BAR3', shape=[2, 7, 4], dtype='int64'),
-            paddle.static.data(
-                name='Y_BAR4', shape=[2, 7, 1], dtype='int64'),
+            paddle.static.data(name='Y_BAR1', shape=[2, 7, 2], dtype='int64'),
+            paddle.static.data(name='Y_BAR2', shape=[2, 7, 3], dtype='int64'),
+            paddle.static.data(name='Y_BAR3', shape=[2, 7, 4], dtype='int64'),
+            paddle.static.data(name='Y_BAR4', shape=[2, 7, 1], dtype='int64'),
         ]
         self.transpose_args = (check_dot, YS_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -414,13 +447,16 @@ def init_data(self):
 
 
 class TestConcatPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'concat_p'
         X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64')
         Y = paddle.static.data(name='Y', shape=[3, 2, 5], dtype='float64')
         Z = paddle.static.data(name='Z', shape=[3, 3, 5], dtype='float64')
-        self.prim_input = {'XS': [X, Y, Z], }
+        self.prim_input = {
+            'XS': [X, Y, Z],
+        }
         self.prim_output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -429,20 +465,18 @@ def init_data(self):
 
         # Set JVP
         XS_DOT = [
-            paddle.static.data(
-                name='X_DOT1', shape=[3, 9, 5], dtype='float64'),
-            paddle.static.data(
-                name='X_DOT2', shape=[3, 2, 5], dtype='float64'),
-            paddle.static.data(
-                name='X_DOT3', shape=[3, 3, 5], dtype='float64'),
+            paddle.static.data(name='X_DOT1', shape=[3, 9, 5], dtype='float64'),
+            paddle.static.data(name='X_DOT2', shape=[3, 2, 5], dtype='float64'),
+            paddle.static.data(name='X_DOT3', shape=[3, 3, 5], dtype='float64'),
         ]
         self.jvp_args = (XS_DOT, )
         self.jvp_out_shape_map = {0: self.prim_output['Y']}
 
         # Set transpose
         check_dot = lambda v: v is X or v is Y or v is Z
-        Y_BAR = paddle.static.data(
-            name='Y_BAR', shape=[3, 14, 5], dtype='float64')
+        Y_BAR = paddle.static.data(name='Y_BAR',
+                                   shape=[3, 14, 5],
+                                   dtype='float64')
         self.transpose_args = (check_dot, Y_BAR)
         self.transpose_out_shape_map = {
             0: X,
@@ -461,6 +495,7 @@ def init_data(self):
 
 
 class TestReducePJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'reduce_p'
@@ -473,17 +508,21 @@ def init_data(self):
         self.prim_attrs = {'axis': [2], 'keepdim': False}
 
         # Set JVP
-        X_DOT = paddle.static.data(
-            name='X_DOT1', shape=[2, 3, 4, 5], dtype='float64')
+        X_DOT = paddle.static.data(name='X_DOT1',
+                                   shape=[2, 3, 4, 5],
+                                   dtype='float64')
         self.jvp_args = (X_DOT, )
         self.jvp_out_shape_map = {0: self.prim_output['Y']}
 
         # Set transpose
         check_dot = lambda v: v is X
-        Y_BAR = paddle.static.data(
-            name='Y_BAR', shape=[2, 3, 5], dtype='float64')
+        Y_BAR = paddle.static.data(name='Y_BAR',
+                                   shape=[2, 3, 5],
+                                   dtype='float64')
         self.transpose_args = (check_dot, Y_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -497,6 +536,7 @@ def init_data(self):
 
 
 class TestMatmulPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'matmul_p'
@@ -519,7 +559,9 @@ def init_data(self):
         check_dot = lambda v: v is X
         Z_BAR = paddle.static.data(name='Z_BAR', shape=[2, 4], dtype='float64')
         self.transpose_args = (check_dot, Z_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -535,11 +577,14 @@ def init_data(self):
 
 
 class TestSliceSelectPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'slice_select_p'
         X = paddle.static.data(name='X', shape=[3, 20], dtype='float64')
-        self.prim_input = {'X': X, }
+        self.prim_input = {
+            'X': X,
+        }
         self.prim_output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -560,7 +605,9 @@ def init_data(self):
         check_dot = lambda v: v is X
         Y_BAR = paddle.static.data(name='Y_BAR', shape=[3, 10], dtype='float64')
         self.transpose_args = (check_dot, Y_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -574,6 +621,7 @@ def init_data(self):
 
 
 class TestSliceAssignPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'slice_assign_p'
@@ -616,12 +664,14 @@ def init_data(self):
 
 
 class TestGatherPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'gather_p'
         X = paddle.static.data(name='X', shape=[9, 5], dtype='float64')
-        IndexTensor = paddle.static.data(
-            name='IndexTensor', shape=[3], dtype='int32')
+        IndexTensor = paddle.static.data(name='IndexTensor',
+                                         shape=[3],
+                                         dtype='int32')
         self.prim_input = {'X': X, 'IndexTensor': IndexTensor}
         self.prim_output = {
             'Y':
@@ -633,14 +683,17 @@ def init_data(self):
         X_DOT = paddle.static.data(name='X_DOT', shape=[9, 5], dtype='float64')
         self.jvp_args = (
             X_DOT,
-            IndexTensor, )
+            IndexTensor,
+        )
         self.jvp_out_shape_map = {0: self.prim_output['Y']}
 
         # Set transpose
         check_dot = lambda v: v is X
         Y_BAR = paddle.static.data(name='Y_BAR', shape=[9, 3], dtype='float64')
         self.transpose_args = (check_dot, Y_BAR)
-        self.transpose_out_shape_map = {0: X, }
+        self.transpose_out_shape_map = {
+            0: X,
+        }
 
         self.all_ops = [
             # prim op:
@@ -654,13 +707,15 @@ def init_data(self):
 
 
 class TestScatterAddPJVPAndTranspose(TestAddPJVPAndTranspose):
+
     def init_data(self):
         # Set prim op
         self.op_type = 'scatter_add_p'
         X = paddle.static.data(name='X', shape=[9, 5], dtype='float64')
         Y = paddle.static.data(name='Y', shape=[9, 3], dtype='float64')
-        IndexTensor = paddle.static.data(
-            name='IndexTensor', shape=[3], dtype='int32')
+        IndexTensor = paddle.static.data(name='IndexTensor',
+                                         shape=[3],
+                                         dtype='int32')
         self.prim_input = {'X': X, 'Y': Y, 'IndexTensor': IndexTensor}
         self.prim_output = {
             'Z':
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py
index 24c8febccf5c0..924292c4a4aed 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 ############################ Test orig2prim rules ############################
 class TestElementWiseAddOrig2Prim(unittest.TestCase):
+
     def setUp(self):
         self.main_program = paddle.static.Program()
         self.startup_program = paddle.static.Program()
@@ -53,11 +54,10 @@ def init_data(self):
     def test_op(self):
         with paddle.static.program_guard(self.main_program,
                                          self.startup_program):
-            op = self.layer_help.append_op(
-                type=self.op_type,
-                inputs=self.input,
-                outputs=self.output,
-                attrs=self.attrs)
+            op = self.layer_help.append_op(type=self.op_type,
+                                           inputs=self.input,
+                                           outputs=self.output,
+                                           attrs=self.attrs)
 
             prim_out = _orig2prim(op, *self.orig2prim_args)
             all_ops = [op.type for op in self.main_program.block(0).ops]
@@ -69,11 +69,14 @@ def test_op(self):
 
 
 class TestSqrtOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'sqrt'
         X = paddle.static.data(name='X', shape=[7, 8], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -87,6 +90,7 @@ def init_data(self):
 
 
 class TestElementWiseMulOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'elementwise_mul'
         X = paddle.static.data(name='X', shape=[8, 8], dtype='float')
@@ -106,6 +110,7 @@ def init_data(self):
 
 
 class TestMatmulV2Orig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'matmul_v2'
         X = paddle.static.data(name='X', shape=[3, 4], dtype='float')
@@ -124,11 +129,14 @@ def init_data(self):
 
 
 class TestTanhOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'tanh'
         X = paddle.static.data(name='X', shape=[3, 4], dtype='float')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -141,13 +149,17 @@ def init_data(self):
 
 
 class TestReshape2Orig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'reshape2'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
-            'Out': X,
+            'Out':
+            X,
             'XShape':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
         }
@@ -156,19 +168,23 @@ def init_data(self):
         self.orig2prim_args = (
             None,
             None,
-            X, )
+            X,
+        )
         self.all_ops = ['reshape2', 'reshape_p', 'fill_constant_p']
         # Do not checke XShape
         self.out_map = {0: self.output['Out']}
 
 
 class TestConcatOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'concat'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
         Y = paddle.static.data(name='Y', shape=[3, 6], dtype='int64')
 
-        self.input = {'X': [X, Y], }
+        self.input = {
+            'X': [X, Y],
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -177,17 +193,21 @@ def init_data(self):
 
         self.orig2prim_args = (
             None,
-            (X, Y), )
+            (X, Y),
+        )
         self.all_ops = ['concat', 'concat_p']
         self.out_map = {0: self.output['Out']}
 
 
 class TestSliceOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'slice'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
 
-        self.input = {'Input': X, }
+        self.input = {
+            'Input': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -204,11 +224,14 @@ def init_data(self):
 
 
 class TestFillZerosLikeOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'fill_zeros_like'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -221,6 +244,7 @@ def init_data(self):
 
 
 class TestSumOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'sum'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
@@ -239,11 +263,14 @@ def init_data(self):
 
 
 class TestPNormOrig2Prim1(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'p_norm'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -259,11 +286,14 @@ def init_data(self):
 
 
 class TestPNormOrig2Prim2(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'p_norm'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -279,6 +309,7 @@ def init_data(self):
 
 
 class TestIndexSelectOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'index_select'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int64')
@@ -289,16 +320,20 @@ def init_data(self):
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
         }
-        self.attrs = {'dim': 0, }
+        self.attrs = {
+            'dim': 0,
+        }
 
         self.orig2prim_args = (
             Index,
-            X, )
+            X,
+        )
         self.all_ops = ['index_select', 'gather_p']
         self.out_map = {0: self.output['Out']}
 
 
 class TestElementwiseSubOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'elementwise_sub'
         X = paddle.static.data(name='X', shape=[5, 6], dtype='int32')
@@ -309,21 +344,27 @@ def init_data(self):
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
         }
-        self.attrs = {'dim': 0, }
+        self.attrs = {
+            'dim': 0,
+        }
 
         self.orig2prim_args = (
             X,
-            Y, )
+            Y,
+        )
         self.all_ops = ['elementwise_sub', 'broadcast_p', 'sub_p']
         self.out_map = {0: self.output['Out']}
 
 
 class TestScaleOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'scale'
         X = paddle.static.data(name='X', shape=[10, 7], dtype='int32')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -332,7 +373,8 @@ def init_data(self):
 
         self.orig2prim_args = (
             None,
-            X, )
+            X,
+        )
         self.all_ops = [
             'scale', 'fill_constant_p', 'fill_constant_p', 'mul_p', 'add_p'
         ]
@@ -340,11 +382,14 @@ def init_data(self):
 
 
 class TestAssignOrig2Prim(TestElementWiseAddOrig2Prim):
+
     def init_data(self):
         self.op_type = 'assign'
         X = paddle.static.data(name='X', shape=[10, 7], dtype='int32')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Out':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py
index 15ab016fc543d..56a28f38712eb 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 ############################ Test prim2orig rules ############################
 class TestAddPPrim2Orig(unittest.TestCase):
+
     def setUp(self):
         self.main_program = paddle.static.Program()
         self.startup_program = paddle.static.Program()
@@ -53,11 +54,10 @@ def init_data(self):
     def test_op(self):
         with paddle.static.program_guard(self.main_program,
                                          self.startup_program):
-            op = self.layer_help.append_op(
-                type=self.op_type,
-                inputs=self.input,
-                outputs=self.output,
-                attrs=self.attrs)
+            op = self.layer_help.append_op(type=self.op_type,
+                                           inputs=self.input,
+                                           outputs=self.output,
+                                           attrs=self.attrs)
 
             orig_out = _prim2orig(op, *self.prim2orig_args)
             all_ops = [op.type for op in self.main_program.block(0).ops]
@@ -68,6 +68,7 @@ def test_op(self):
 
 
 class TestSubPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'sub_p'
         X = paddle.static.data(name='X', shape=[7, 8], dtype='float64')
@@ -86,6 +87,7 @@ def init_data(self):
 
 
 class TestMulPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'mul_p'
         X = paddle.static.data(name='X', shape=[7, 8], dtype='float64')
@@ -104,6 +106,7 @@ def init_data(self):
 
 
 class TestDivPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'div_p'
         X = paddle.static.data(name='X', shape=[7, 8], dtype='float64')
@@ -122,11 +125,14 @@ def init_data(self):
 
 
 class TestSqrtPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'sqrt_p'
         X = paddle.static.data(name='X', shape=[7, 8], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -139,11 +145,14 @@ def init_data(self):
 
 
 class TestTanhPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'tanh_p'
         X = paddle.static.data(name='X', shape=[7, 8], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -156,11 +165,14 @@ def init_data(self):
 
 
 class TestReshapePPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'reshape_p'
         X = paddle.static.data(name='X', shape=[2, 8], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -173,11 +185,14 @@ def init_data(self):
 
 
 class TestBroadcastPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'broadcast_p'
         X = paddle.static.data(name='X', shape=[2, 8], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -190,11 +205,14 @@ def init_data(self):
 
 
 class TestTransposePPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'transpose_p'
         X = paddle.static.data(name='X', shape=[7, 8, 9, 10], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -207,11 +225,14 @@ def init_data(self):
 
 
 class TestSplitPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'split_p'
         X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'YS': [
                 self.layer_help.create_variable_for_type_inference(
@@ -230,13 +251,16 @@ def init_data(self):
 
 
 class TestConcatPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'concat_p'
         X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64')
         Y = paddle.static.data(name='Y', shape=[2, 9, 5], dtype='float64')
         Z = paddle.static.data(name='Z', shape=[1, 9, 5], dtype='float64')
 
-        self.input = {'XS': [X, Y, Z], }
+        self.input = {
+            'XS': [X, Y, Z],
+        }
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -249,6 +273,7 @@ def init_data(self):
 
 
 class TestReducePPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'reduce_p'
         X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64')
@@ -266,6 +291,7 @@ def init_data(self):
 
 
 class TestMatmulPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'matmul_p'
         X = paddle.static.data(name='X', shape=[9, 5], dtype='float64')
@@ -284,11 +310,14 @@ def init_data(self):
 
 
 class TestSliceSelectPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'slice_select_p'
         X = paddle.static.data(name='X', shape=[9, 5], dtype='float64')
 
-        self.input = {'X': X, }
+        self.input = {
+            'X': X,
+        }
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
@@ -301,6 +330,7 @@ def init_data(self):
 
 
 class TestSliceAssignPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'slice_assign_p'
         X = paddle.static.data(name='X', shape=[9, 5], dtype='float64')
@@ -319,40 +349,49 @@ def init_data(self):
 
 
 class TestGatherPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'gather_p'
         X = paddle.static.data(name='X', shape=[9, 5], dtype='float64')
-        IndexTensor = paddle.static.data(
-            name='IndexTensor', shape=[3], dtype='int32')
+        IndexTensor = paddle.static.data(name='IndexTensor',
+                                         shape=[3],
+                                         dtype='int32')
 
         self.input = {'X': X, 'IndexTensor': IndexTensor}
         self.output = {
             'Y':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
         }
-        self.attrs = {'axis': 0, }
+        self.attrs = {
+            'axis': 0,
+        }
 
         self.prim2orig_args = (
             IndexTensor,
-            X, )
+            X,
+        )
         self.all_ops = ['gather_p', 'gather']
         self.out_map = {self.output['Y']: 0}
 
 
 class TestScatterAddPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'scatter_add_p'
         X = paddle.static.data(name='X', shape=[9, 5], dtype='float64')
         Y = paddle.static.data(name='Y', shape=[3, 5], dtype='float64')
-        IndexTensor = paddle.static.data(
-            name='IndexTensor', shape=[3], dtype='int32')
+        IndexTensor = paddle.static.data(name='IndexTensor',
+                                         shape=[3],
+                                         dtype='int32')
 
         self.input = {'X': X, 'Y': Y, 'IndexTensor': IndexTensor}
         self.output = {
             'Z':
             self.layer_help.create_variable_for_type_inference(dtype=X.dtype)
         }
-        self.attrs = {'axis': 0, }
+        self.attrs = {
+            'axis': 0,
+        }
 
         self.prim2orig_args = (IndexTensor, X, Y)
         self.all_ops = [
@@ -362,6 +401,7 @@ def init_data(self):
 
 
 class TestFillConstantPPrim2Orig(TestAddPPrim2Orig):
+
     def init_data(self):
         self.op_type = 'fill_constant_p'
 
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py
index e6a8c4ec3fe4c..ccbd630bfd084 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,10 +15,12 @@
 import unittest
 import numpy as np
 import paddle
-from paddle.incubate.autograd.primops import (
-    neg, set_value, add, sub, mul, div, sqrt, tanh, reshape, broadcast,
-    transpose, split, concat, reduce, matmul, slice_select, slice_assign,
-    gather, scatter_add, fill_const)
+from paddle.incubate.autograd.primops import (neg, set_value, add, sub, mul,
+                                              div, sqrt, tanh, reshape,
+                                              broadcast, transpose, split,
+                                              concat, reduce, matmul,
+                                              slice_select, slice_assign,
+                                              gather, scatter_add, fill_const)
 from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig, _gradients
 from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled
 
@@ -104,19 +106,29 @@ def test_ops(self):
         self.assertEqual(matmul_1.dtype, d.dtype)
         self.assertEqual(matmul_1.shape, (2, 2))
 
-        slice_select_1 = slice_select(
-            e, axis=[0], starts=[0], ends=[2], strides=[1])
+        slice_select_1 = slice_select(e,
+                                      axis=[0],
+                                      starts=[0],
+                                      ends=[2],
+                                      strides=[1])
         self.assertEqual(slice_select_1.dtype, e.dtype)
         self.assertEqual(slice_select_1.shape, (2, 2))
 
-        slice_select_2 = slice_select(
-            d, axis=[0, 1], starts=[0, 1], ends=[2, 3], strides=[1, 2])
+        slice_select_2 = slice_select(d,
+                                      axis=[0, 1],
+                                      starts=[0, 1],
+                                      ends=[2, 3],
+                                      strides=[1, 2])
         self.assertEqual(slice_select_2.dtype, d.dtype)
         self.assertEqual(slice_select_2.shape, (2, 1))
 
         y = broadcast(b, [2, 2])
-        slice_assign_1 = slice_assign(
-            d, y, axis=[1], starts=[1], ends=[3], strides=[1])
+        slice_assign_1 = slice_assign(d,
+                                      y,
+                                      axis=[1],
+                                      starts=[1],
+                                      ends=[3],
+                                      strides=[1])
         self.assertEqual(slice_assign_1.dtype, d.dtype)
         self.assertEqual(slice_assign_1.shape, d.shape)
 
@@ -138,8 +150,13 @@ def test_ops(self):
         self.assertEqual(neg_1.shape, b.shape)
         self.assertEqual(neg_1.dtype, b.dtype)
 
-        set_value_1 = set_value(
-            d, a, axis=[1], starts=[1], ends=[3], strides=[1], out=d)
+        set_value_1 = set_value(d,
+                                a,
+                                axis=[1],
+                                starts=[1],
+                                ends=[3],
+                                strides=[1],
+                                out=d)
         self.assertEqual(set_value_1.shape, d.shape)
         self.assertEqual(set_value_1.dtype, d.dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/autograd/test_transform.py b/python/paddle/fluid/tests/unittests/autograd/test_transform.py
index a2b75f5d7bb1a..08626593e2904 100644
--- a/python/paddle/fluid/tests/unittests/autograd/test_transform.py
+++ b/python/paddle/fluid/tests/unittests/autograd/test_transform.py
@@ -23,6 +23,7 @@
 
 
 class TestAutoGradTransformForAdd(unittest.TestCase):
+
     def setUp(self):
         self.main_program = paddle.static.Program()
         self.startup_program = paddle.static.Program()
@@ -36,11 +37,13 @@ def init_data(self):
         self.xs_shape_map = {0: (20, 40), 1: (20, 40)}
         # { output_index: output_shape }
         self.ys_shape_map = {0: (20, 40)}
-        X0 = paddle.static.data(
-            name='X0', shape=self.xs_shape_map[0], dtype='float32')
+        X0 = paddle.static.data(name='X0',
+                                shape=self.xs_shape_map[0],
+                                dtype='float32')
         X0.stop_gradient = False
-        X1 = paddle.static.data(
-            name='X1', shape=self.xs_shape_map[1], dtype='float32')
+        X1 = paddle.static.data(name='X1',
+                                shape=self.xs_shape_map[1],
+                                dtype='float32')
         X1.stop_gradient = False
 
         A = paddle.tanh(X0)
@@ -48,7 +51,9 @@ def init_data(self):
         Y = paddle.add(A, B)
 
         self.orig_xs = [X0, X1]
-        self.orig_ys = [Y, ]
+        self.orig_ys = [
+            Y,
+        ]
 
         self.orig_ops = ['tanh', 'tanh', 'elementwise_add']
         self.orig2prim_ops = ['tanh_p', 'tanh_p', 'add_p']
@@ -134,16 +139,19 @@ def test_run(self):
 
 
 class TestAutoGradTransformForMatmul(TestAutoGradTransformForAdd):
+
     def init_data(self):
         # { input_index: input_shape }
         self.xs_shape_map = {0: (100, 2), 1: (5, 2)}
         # { output_index: output_shape }
         self.ys_shape_map = {0: (100, 5)}
-        X0 = paddle.static.data(
-            'X0', shape=self.xs_shape_map[0], dtype='float32')
+        X0 = paddle.static.data('X0',
+                                shape=self.xs_shape_map[0],
+                                dtype='float32')
         X0.stop_gradient = False
-        X1 = paddle.static.data(
-            'X1', shape=self.xs_shape_map[1], dtype='float32')
+        X1 = paddle.static.data('X1',
+                                shape=self.xs_shape_map[1],
+                                dtype='float32')
         X1.stop_gradient = False
 
         A = paddle.reshape(X1, [2, 5])
@@ -151,7 +159,9 @@ def init_data(self):
         Y = paddle.matmul(X0, B)
 
         self.orig_xs = [X0, X1]
-        self.orig_ys = [Y, ]
+        self.orig_ys = [
+            Y,
+        ]
 
         self.orig_ops = ['reshape2', 'scale', 'matmul_v2']
         self.orig2prim_ops = [
@@ -210,20 +220,24 @@ def init_data(self):
 
 
 class TestAutoGradTransformForIndexSelect(TestAutoGradTransformForAdd):
+
     def init_data(self):
         # { input_index: input_shape }
         self.xs_shape_map = {0: (7, 8, 9), 1: (8, 1), 2: (7, 8, 9), 3: (3, )}
         # { output_index: output_shape }
         self.ys_shape_map = {0: (3, 16, 9)}
 
-        X0 = paddle.static.data(
-            'X0', shape=self.xs_shape_map[0], dtype='float32')
+        X0 = paddle.static.data('X0',
+                                shape=self.xs_shape_map[0],
+                                dtype='float32')
         X0.stop_gradient = False
-        X1 = paddle.static.data(
-            'X1', shape=self.xs_shape_map[1], dtype='float32')
+        X1 = paddle.static.data('X1',
+                                shape=self.xs_shape_map[1],
+                                dtype='float32')
         X1.stop_gradient = False
-        X2 = paddle.static.data(
-            'X2', shape=self.xs_shape_map[2], dtype='float32')
+        X2 = paddle.static.data('X2',
+                                shape=self.xs_shape_map[2],
+                                dtype='float32')
         X2.stop_gradient = False
         X3 = paddle.static.data('X3', shape=self.xs_shape_map[3], dtype='int32')
         X3.stop_gradient = False
@@ -235,7 +249,9 @@ def init_data(self):
         Y = paddle.index_select(D, X3, axis=0)  # (3, 16, 9)
 
         self.orig_xs = [X0, X1, X2, X3]
-        self.orig_ys = [Y, ]
+        self.orig_ys = [
+            Y,
+        ]
         self.orig_ops = [
             'elementwise_add', 'p_norm', 'elementwise_sub', 'concat',
             'index_select'
diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py
index 0816b57fbf70b..4105ea2672be0 100644
--- a/python/paddle/fluid/tests/unittests/autograd/utils.py
+++ b/python/paddle/fluid/tests/unittests/autograd/utils.py
@@ -65,8 +65,8 @@ def _compute_numerical_jacobian(func, xs, delta, np_dtype):
     for i in range(fout_size):
         jac_i = list([] for _ in range(fin_size))
         for j in range(fin_size):
-            jac_i[j] = np.zeros(
-                (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype)
+            jac_i[j] = np.zeros((_product(ys[i].shape), _product(xs[j].shape)),
+                                dtype=np_dtype)
         jacobian[i] = jac_i
 
     for j in range(fin_size):
@@ -109,16 +109,16 @@ def _compute_numerical_hessian(func, xs, delta, np_dtype):
                     orig = _get_item(xs[j], q)
                     x_pos = orig + delta
                     xs[j] = _set_item(xs[j], q, x_pos)
-                    jacobian_pos = _compute_numerical_jacobian(func, xs, delta,
-                                                               np_dtype)
+                    jacobian_pos = _compute_numerical_jacobian(
+                        func, xs, delta, np_dtype)
                     x_neg = orig - delta
                     xs[j] = _set_item(xs[j], q, x_neg)
-                    jacobian_neg = _compute_numerical_jacobian(func, xs, delta,
-                                                               np_dtype)
+                    jacobian_neg = _compute_numerical_jacobian(
+                        func, xs, delta, np_dtype)
                     xs[j] = _set_item(xs[j], q, orig)
                     hessian[i][j][p][q] = (
-                        jacobian_pos[0][i][0][p] - jacobian_neg[0][i][0][p]
-                    ) / delta / 2.
+                        jacobian_pos[0][i][0][p] -
+                        jacobian_neg[0][i][0][p]) / delta / 2.
     return hessian
 
 
@@ -197,8 +197,7 @@ def _compute_numerical_batch_hessian(func, xs, delta, np_dtype):
     mid = len(hessian_res) // 2
     for i in range(mid):
         hessian_result.append(
-            np.stack(
-                (hessian_res[i], hessian_res[mid + i]), axis=0))
+            np.stack((hessian_res[i], hessian_res[mid + i]), axis=0))
     return hessian_result
 
 
@@ -262,6 +261,7 @@ def unuse(x, y):
 
 
 def nested(x):
+
     def inner(y):
         return x * y
 
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
index 9ea95f3e87002..14479e7a2710c 100644
--- a/python/paddle/fluid/tests/unittests/benchmark.py
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -27,6 +27,7 @@
 
 
 class BenchmarkSuite(OpTest):
+
     def timeit_function(self, callback, iters, *args, **kwargs):
         assert iters != 0, "Iters should >= 1"
         start = time.time()
@@ -46,12 +47,9 @@ def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
             var_name = variable if isinstance(
                 variable, six.string_types) else variable.name
             self.assertTrue(
-                np.allclose(
-                    actual_t, expect_t, atol=atol),
-                "Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
-                str(expect_t))
-            self.assertListEqual(actual.lod(),
-                                 expect.lod(),
+                np.allclose(actual_t, expect_t, atol=atol), "Output (" +
+                var_name + ") has diff" + str(actual_t) + "\n" + str(expect_t))
+            self.assertListEqual(actual.lod(), expect.lod(),
                                  "Output (" + var_name + ") has different lod")
 
     def _get_input_names(self):
@@ -98,13 +96,12 @@ def timeit_output(self, iters=100):
     def timeit_grad_with_place(self, place, iters=100):
         inputs_to_check = self._get_input_names()
         output_names = self._get_output_names()
-        return self.timeit_function(
-            self._get_gradient,
-            iters,
-            inputs_to_check,
-            place,
-            output_names,
-            no_grad_set=None)
+        return self.timeit_function(self._get_gradient,
+                                    iters,
+                                    inputs_to_check,
+                                    place,
+                                    output_names,
+                                    no_grad_set=None)
 
     def timeit_grad(self, iters=100):
         places = self._get_places()
diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
index 0e7338b839e2a..37cc77836a804 100644
--- a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
@@ -25,6 +25,7 @@
 
 
 class TestSumOp(BenchmarkSuite):
+
     def setUp(self):
         self.op_type = "sum"
         self.customize_testcase()
diff --git a/python/paddle/fluid/tests/unittests/c_comm_init_op.py b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
index ed6a75230c60d..52409ccf8c88e 100644
--- a/python/paddle/fluid/tests/unittests/c_comm_init_op.py
+++ b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
@@ -25,6 +25,7 @@
 
 
 class TestCCommInitOp(unittest.TestCase):
+
     def setUp(self):
         self.endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')
         self.current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
@@ -45,25 +46,23 @@ def test_specifying_devices(self):
             name=fluid.unique_name.generate('nccl_id'),
             persistable=True,
             type=fluid.core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': self.rank,
-                'endpoint': self.current_endpoint,
-                'other_endpoints': self.other_endpoints
-            })
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': self.nranks,
-                'rank': self.rank,
-                'ring_id': 0,
-                'device_id': self.gpu_id
-            })
+        block.append_op(type='c_gen_nccl_id',
+                        inputs={},
+                        outputs={'Out': nccl_id_var},
+                        attrs={
+                            'rank': self.rank,
+                            'endpoint': self.current_endpoint,
+                            'other_endpoints': self.other_endpoints
+                        })
+        block.append_op(type='c_comm_init',
+                        inputs={'X': nccl_id_var},
+                        outputs={},
+                        attrs={
+                            'nranks': self.nranks,
+                            'rank': self.rank,
+                            'ring_id': 0,
+                            'device_id': self.gpu_id
+                        })
         self.exe.run(program)
 
 
diff --git a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
index 8bdeecae4569e..8b5f18407906a 100644
--- a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
+++ b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
@@ -36,6 +36,7 @@ def get_c_embedding(start, end, table, ids):
 
 
 class TestCEmbeddingCPU(OpTest):
+
     def setUp(self):
         self.init_dtype()
         self.initcase()
@@ -47,8 +48,8 @@ def setUp(self):
     def initcase(self):
         self.op_type = "c_embedding"
         table = np.random.random((17, 64)).astype(self.dtype)
-        ids = np.random.randint(
-            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        ids = np.random.randint(low=0, high=17 * 2,
+                                size=(2, 4)).astype(self.ids_dtype)
         self.start_index = 10
         self.end_index = self.start_index + 17
 
@@ -71,6 +72,7 @@ def init_dtype(self):
 
 
 class TestCEmbeddingOpBase(TestCEmbeddingCPU):
+
     def setUp(self):
         self.init_dtype()
         self.initcase()
@@ -97,6 +99,7 @@ def init_dtype(self):
 
 
 class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):
+
     def setUp(self):
         self.init_dtype()
         self.initcase()
@@ -104,8 +107,8 @@ def setUp(self):
     def initcase(self):
         self.op_type = "c_embedding"
         table = np.random.random((17, 64)).astype(self.dtype)
-        ids = np.random.randint(
-            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        ids = np.random.randint(low=0, high=17 * 2,
+                                size=(2, 4)).astype(self.ids_dtype)
         self.start_index = 10
         ids[0][1] = 12
         ids[0][2] = 12
diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py
index 13a7ff6860e4d..d188ae6654509 100644
--- a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py
+++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py
@@ -36,8 +36,8 @@
 def generator():
     batch_size = 5
     for i in range(5):
-        curr_train_x = np.random.randint(
-            batch_size, size=(batch_size, 3)).astype("float32")
+        curr_train_x = np.random.randint(batch_size,
+                                         size=(batch_size, 3)).astype("float32")
         if i >= 2:
             curr_train_x[0, :] = np.nan
             curr_train_x[-1, :] = np.inf
@@ -94,12 +94,14 @@ def check(use_cuda):
             for train_data, y_label in generator():
                 outs = exe.run(
                     main,
-                    feed={'x': train_data,
-                          'y': y_label},
+                    feed={
+                        'x': train_data,
+                        'y': y_label
+                    },
                     fetch_list=[y_predict.name, avg_cost.name, acc_top1.name])
                 step += 1
-                print('iter={:.0f},cost={},acc1={}'.format(step, outs[1][0],
-                                                           outs[2][0]))
+                print('iter={:.0f},cost={},acc1={}'.format(
+                    step, outs[1][0], outs[2][0]))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py
index dee74fdcb1ff3..93ccc8b54f7b9 100644
--- a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py
@@ -33,8 +33,8 @@
 def generator():
     batch_size = 5
     for i in range(5):
-        curr_train_x = np.random.randint(
-            batch_size, size=(batch_size, 3)).astype("float32")
+        curr_train_x = np.random.randint(batch_size,
+                                         size=(batch_size, 3)).astype("float32")
         if i >= 2:
             curr_train_x[0, :] = np.nan
             curr_train_x[-1, :] = np.inf
@@ -47,6 +47,7 @@ def generator():
 
 
 class TestLayer(nn.Layer):
+
     def __init__(self):
         super(TestLayer, self).__init__()
         self.linear1 = nn.Linear(3, 400)
@@ -86,8 +87,8 @@ def check(use_cuda):
 
         acc_top1 = paddle.metric.accuracy(input=y_pred, label=y, k=1)
 
-        print('iter={:.0f}, cost={}, acc1={}'.format(
-            step, avg_cost.numpy(), acc_top1.numpy()))
+        print('iter={:.0f}, cost={}, acc1={}'.format(step, avg_cost.numpy(),
+                                                     acc_top1.numpy()))
 
         sgd.step()
         sgd.clear_grad()
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
index 63d7f52c11a8a..d2a639d0294db 100644
--- a/python/paddle/fluid/tests/unittests/collective_allgather_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
@@ -39,14 +39,16 @@
 
 
 class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
             tensor_list = []
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             paddle.distributed.all_gather(tensor_list, tindata)
             return tensor_list
 
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_op.py b/python/paddle/fluid/tests/unittests/collective_allgather_op.py
index f77a97aa915f6..bbfc35e6c9d99 100644
--- a/python/paddle/fluid/tests/unittests/collective_allgather_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_allgather_op.py
@@ -38,6 +38,7 @@
 
 
 class TestCollectiveAllGather(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,25 +46,26 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofgather",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_allgather",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_allgather",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'nranks': nranks
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
index 67242b274fcb1..c72fd144ed861 100644
--- a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
@@ -39,13 +39,15 @@
 
 
 class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             paddle.distributed.all_reduce(tindata)
             return [tindata]
 
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py
index 597765cfb9811..859161af456e2 100644
--- a/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py
@@ -39,16 +39,19 @@
 
 
 class TestCollectiveAllreduceNewGroupAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             gp = paddle.distributed.new_group([0, 1])
-            paddle.distributed.all_reduce(
-                tindata, group=gp, use_calc_stream=False)
+            paddle.distributed.all_reduce(tindata,
+                                          group=gp,
+                                          use_calc_stream=False)
             return [tindata]
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py b/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
index eef59ee3dde92..800131a6a6f48 100644
--- a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
@@ -39,30 +39,30 @@
 
 
 class TestCollectiveAllreduce(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program):
         ring_id = 0
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofallreduce",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_allreduce_sum",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_allreduce_sum",
+                                               inputs={'X': tindata},
+                                               attrs={'ring_id': ring_id},
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_op_wait.py b/python/paddle/fluid/tests/unittests/collective_allreduce_op_wait.py
index 61a0ad3bd7636..a254ddd606863 100644
--- a/python/paddle/fluid/tests/unittests/collective_allreduce_op_wait.py
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_op_wait.py
@@ -39,14 +39,16 @@
 
 
 class TestCollectiveAllreduce(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program):
         ring_id = 0
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofallreduce",
                 dtype='float32',
@@ -62,33 +64,32 @@ def get_model(self, main_prog, startup_program):
                         'X': tindata,
                         'Y': tindata,
                     },
-                    outputs={'Out': toutdata}, )
+                    outputs={'Out': toutdata},
+                )
                 main_prog.global_block().append_op(
                     type="elementwise_sub",
                     inputs={
                         'X': toutdata,
                         'Y': tindata,
                     },
-                    outputs={'Out': toutdata}, )
+                    outputs={'Out': toutdata},
+                )
 
-            main_prog.global_block().append_op(
-                type='c_wait_compute',
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type='c_wait_compute',
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
 
-            main_prog.global_block().append_op(
-                type="c_allreduce_sum",
-                inputs={'X': toutdata},
-                attrs={'ring_id': ring_id},
-                outputs={'Out': toutdata},
-                attr={'use_calc_stream': False})
+            main_prog.global_block().append_op(type="c_allreduce_sum",
+                                               inputs={'X': toutdata},
+                                               attrs={'ring_id': ring_id},
+                                               outputs={'Out': toutdata},
+                                               attr={'use_calc_stream': False})
 
-            main_prog.global_block().append_op(
-                type="c_wait_comm",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_wait_comm",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
 
             # tout = tin + tout - tin = tout
             if True:
@@ -98,14 +99,16 @@ def get_model(self, main_prog, startup_program):
                         'X': tindata,
                         'Y': toutdata,
                     },
-                    outputs={'Out': toutdata}, )
+                    outputs={'Out': toutdata},
+                )
                 main_prog.global_block().append_op(
                     type="elementwise_sub",
                     inputs={
                         'X': toutdata,
                         'Y': tindata,
                     },
-                    outputs={'Out': toutdata}, )
+                    outputs={'Out': toutdata},
+                )
 
             return toutdata
 
diff --git a/python/paddle/fluid/tests/unittests/collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective_alltoall_api.py
index be18b68a1da33..343ba13c4e81f 100644
--- a/python/paddle/fluid/tests/unittests/collective_alltoall_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_alltoall_api.py
@@ -39,13 +39,15 @@
 
 
 class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             tindata = paddle.split(tindata, 2, axis=0)
             tout_data = []
             paddle.distributed.alltoall(tindata, tout_data)
diff --git a/python/paddle/fluid/tests/unittests/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective_alltoall_api_dygraph.py
index 02a59aef071f8..b5994db5cb6c5 100644
--- a/python/paddle/fluid/tests/unittests/collective_alltoall_api_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/collective_alltoall_api_dygraph.py
@@ -37,6 +37,7 @@
 
 
 class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
diff --git a/python/paddle/fluid/tests/unittests/collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
index dbcc70d540bd6..1e08c73f8cbc3 100644
--- a/python/paddle/fluid/tests/unittests/collective_barrier_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveBarrierAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
index 08a3d948906a8..b928e409f0e06 100644
--- a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
@@ -39,13 +39,15 @@
 
 
 class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             paddle.distributed.broadcast(tindata, src=1)
             return [tindata]
 
diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py b/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
index 127f48be61851..140df2b91d9c8 100644
--- a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveBroadcast(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -46,25 +47,26 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         rootid = 1
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofbroadcast",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_broadcast",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'root': rootid},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_broadcast",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'root': rootid
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_concat_op.py b/python/paddle/fluid/tests/unittests/collective_concat_op.py
index c9de1713e7282..2f2e4d699f70a 100644
--- a/python/paddle/fluid/tests/unittests/collective_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_concat_op.py
@@ -38,6 +38,7 @@
 
 
 class TestCollectiveConcat(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,23 +46,23 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofconcat",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_concat",
-                inputs={'X': tindata},
-                attrs={
-                    'ring_id': ring_id,
-                    'rank': self.rank,
-                    'nranks': nranks
-                },
-                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_concat",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'rank': self.rank,
+                                                   'nranks': nranks
+                                               },
+                                               outputs={'Out': toutdata})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_global_gather.py b/python/paddle/fluid/tests/unittests/collective_global_gather.py
index 164abe0593491..60909f63211de 100644
--- a/python/paddle/fluid/tests/unittests/collective_global_gather.py
+++ b/python/paddle/fluid/tests/unittests/collective_global_gather.py
@@ -29,6 +29,7 @@
 
 
 class TestCollectiveGlobalGatherAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -40,12 +41,15 @@ def get_model(self, main_prog, startup_program, rank, indata=None):
             n_expert = 2
             world_size = 2
             tot_expert = n_expert * world_size
-            local_input_buf = paddle.static.data(
-                name="local_input_buf", shape=[-1, in_feat], dtype="float32")
-            local_expert_count = paddle.static.data(
-                name="local_expert_count", shape=[tot_expert], dtype="int64")
-            global_expert_count = paddle.static.data(
-                name="global_expert_count", shape=[tot_expert], dtype="int64")
+            local_input_buf = paddle.static.data(name="local_input_buf",
+                                                 shape=[-1, in_feat],
+                                                 dtype="float32")
+            local_expert_count = paddle.static.data(name="local_expert_count",
+                                                    shape=[tot_expert],
+                                                    dtype="int64")
+            global_expert_count = paddle.static.data(name="global_expert_count",
+                                                     shape=[tot_expert],
+                                                     dtype="int64")
 
             output = paddle.distributed.utils.global_gather(
                 local_input_buf, local_expert_count, global_expert_count)
@@ -79,13 +83,12 @@ def run_trainer(self, args):
         # Call paddle.distributed.alltoall() under legacy dygraph
         _enable_legacy_dygraph()
         np.random.seed(os.getpid())
-        local_expert_count = np.random.randint(
-            1, 4, size=tot_expert).astype("int64")
+        local_expert_count = np.random.randint(1, 4,
+                                               size=tot_expert).astype("int64")
         local_expert_count = paddle.to_tensor(local_expert_count)
         global_expert_count = []
-        paddle.distributed.alltoall(
-            paddle.split(
-                local_expert_count, 2, axis=0), global_expert_count)
+        paddle.distributed.alltoall(paddle.split(local_expert_count, 2, axis=0),
+                                    global_expert_count)
         global_expert_count = paddle.concat(global_expert_count, axis=0)
         global_expert_count = global_expert_count.numpy()
         local_expert_count = local_expert_count.numpy()
diff --git a/python/paddle/fluid/tests/unittests/collective_global_gather_dygraph.py b/python/paddle/fluid/tests/unittests/collective_global_gather_dygraph.py
index 20df5f3555596..0b264f5ba8966 100644
--- a/python/paddle/fluid/tests/unittests/collective_global_gather_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/collective_global_gather_dygraph.py
@@ -25,6 +25,7 @@
 
 
 class TestCollectiveGlobalGatherAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -41,8 +42,7 @@ def get_model(self, main_prog, startup_program, rank, indata=None):
             local_expert_count = paddle.to_tensor(local_expert_count)
             global_expert_count = []
             paddle.distributed.alltoall(
-                paddle.split(
-                    local_expert_count, 2, axis=0),
+                paddle.split(local_expert_count, 2, axis=0),
                 global_expert_count)
             global_expert_count = paddle.concat(global_expert_count, axis=0)
             fwd_expert_count = sum(global_expert_count)
diff --git a/python/paddle/fluid/tests/unittests/collective_global_scatter.py b/python/paddle/fluid/tests/unittests/collective_global_scatter.py
index 74d12b61aca41..c4950025877df 100644
--- a/python/paddle/fluid/tests/unittests/collective_global_scatter.py
+++ b/python/paddle/fluid/tests/unittests/collective_global_scatter.py
@@ -28,6 +28,7 @@
 
 
 class TestCollectiveGlobalScatterAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -39,14 +40,15 @@ def get_model(self, main_prog, startup_program, rank, indata=None):
             n_expert = 2
             world_size = 2
             tot_expert = n_expert * world_size
-            local_input_buf = paddle.static.data(
-                name="local_input_buf", shape=[-1, in_feat], dtype="float32")
-            local_expert_count = paddle.static.data(
-                name="local_expert_count", shape=[tot_expert], dtype="int64")
+            local_input_buf = paddle.static.data(name="local_input_buf",
+                                                 shape=[-1, in_feat],
+                                                 dtype="float32")
+            local_expert_count = paddle.static.data(name="local_expert_count",
+                                                    shape=[tot_expert],
+                                                    dtype="int64")
             global_expert_count = []
             paddle.distributed.alltoall(
-                paddle.split(
-                    local_expert_count, 2, axis=0),
+                paddle.split(local_expert_count, 2, axis=0),
                 global_expert_count)
             global_expert_count = paddle.concat(global_expert_count, axis=0)
             output = paddle.distributed.utils.global_scatter(
@@ -75,8 +77,8 @@ def run_trainer(self, args):
         n_expert = 2
         world_size = 2
         tot_expert = n_expert * world_size
-        local_expert_count = np.random.randint(
-            1, 4, size=tot_expert).astype("int64")
+        local_expert_count = np.random.randint(1, 4,
+                                               size=tot_expert).astype("int64")
         fwd_expert_count = sum(local_expert_count)
         local_input_buf = np.random.rand(fwd_expert_count,
                                          in_feat).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/collective_global_scatter_dygraph.py b/python/paddle/fluid/tests/unittests/collective_global_scatter_dygraph.py
index f7e13a8762274..82816c899e2cb 100644
--- a/python/paddle/fluid/tests/unittests/collective_global_scatter_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/collective_global_scatter_dygraph.py
@@ -25,6 +25,7 @@
 
 
 class TestCollectiveGlobalScatterAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,8 +46,7 @@ def get_model(self, main_prog, startup_program, rank, indata=None):
             local_input_buf = paddle.to_tensor(local_input_buf)
             global_expert_count = []
             paddle.distributed.alltoall(
-                paddle.split(
-                    local_expert_count, 2, axis=0),
+                paddle.split(local_expert_count, 2, axis=0),
                 global_expert_count)
             global_expert_count = paddle.concat(global_expert_count, axis=0)
             local_input_buf.stop_gradient = False
diff --git a/python/paddle/fluid/tests/unittests/collective_identity_op.py b/python/paddle/fluid/tests/unittests/collective_identity_op.py
index e024b64e82509..a757b0605a51e 100644
--- a/python/paddle/fluid/tests/unittests/collective_identity_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_identity_op.py
@@ -38,6 +38,7 @@
 
 
 class TestCollectiveIdentity(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,20 +46,22 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofgather",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_identity",
-                inputs={'X': tindata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks})
+            main_prog.global_block().append_op(type="c_identity",
+                                               inputs={'X': tindata},
+                                               outputs={'Out': toutdata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'nranks': nranks
+                                               })
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
index 41e31146a2229..d474dd683dddc 100644
--- a/python/paddle/fluid/tests/unittests/collective_reduce_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
@@ -39,13 +39,15 @@
 
 
 class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             paddle.distributed.reduce(tindata, dst=0)
             return [tindata]
 
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
index 0448c66d13234..c39a8a38f48a7 100644
--- a/python/paddle/fluid/tests/unittests/collective_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveReduce(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -46,25 +47,26 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         rootid = 1
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofreduce",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_reduce_sum",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'root_id': rootid},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_reduce_sum",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'root_id': rootid
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
index 7a9e0b148d556..0a1fc2b79a935 100644
--- a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveReduce(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -46,28 +47,27 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         rootid = 1
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofreduce",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_reduce_sum",
-                inputs={'X': tindata},
-                attrs={
-                    'ring_id': ring_id,
-                    'use_calc_stream': True,
-                    'root_id': rootid
-                },
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_reduce_sum",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'use_calc_stream': True,
+                                                   'root_id': rootid
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_reducescatter.py b/python/paddle/fluid/tests/unittests/collective_reducescatter.py
index 00d4a1c4cf6bd..27f7fd506b5b4 100644
--- a/python/paddle/fluid/tests/unittests/collective_reducescatter.py
+++ b/python/paddle/fluid/tests/unittests/collective_reducescatter.py
@@ -38,6 +38,7 @@
 
 
 class TestCollectiveReduceScatter(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,8 +46,9 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = fluid.layers.collective._c_reducescatter(tindata, nranks)
             toutdata = fluid.layers.collective._c_sync_comm_stream(toutdata, 0)
             return toutdata
diff --git a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py b/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
index 91712e2b50f23..a1843394e84e6 100644
--- a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveReduceScatter(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -46,25 +47,26 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofrs",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_reducescatter",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_reducescatter",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'nranks': nranks
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
index 643106ff53a95..0a0d1e1593ef3 100644
--- a/python/paddle/fluid/tests/unittests/collective_scatter_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
@@ -39,18 +39,19 @@
 
 
 class TestCollectiveScatterAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata",
-                shape=[10, 1000],
-                dtype='float32',
-                append_batch_size=False)
-            toutdata = layers.fill_constant(
-                shape=[5, 1000], dtype='float32', value=1.0)
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32',
+                                  append_batch_size=False)
+            toutdata = layers.fill_constant(shape=[5, 1000],
+                                            dtype='float32',
+                                            value=1.0)
             tensor_list = None
             if rank == 1:
                 tensor_list = paddle.split(tindata, 2, axis=0)
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_op.py b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
index 7afa4aec63990..1434bd3be6a8f 100644
--- a/python/paddle/fluid/tests/unittests/collective_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveScatter(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -46,26 +47,27 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         rootid = 1
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofreduce",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_scatter",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'root': rootid,
-                       'nranks': 2},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_scatter",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'root': rootid,
+                                                   'nranks': 2
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective_sendrecv_api.py
index 551537a0ea4ea..a4e699b64a9c0 100644
--- a/python/paddle/fluid/tests/unittests/collective_sendrecv_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_sendrecv_api.py
@@ -39,16 +39,16 @@
 
 
 class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata",
-                shape=[10, 1000],
-                dtype='float32',
-                append_batch_size=False)
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32',
+                                  append_batch_size=False)
             if rank == 0:
                 paddle.distributed.send(tindata, dst=1)
             else:
diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective_sendrecv_api_dygraph.py
index 10028488e85a2..8508c3d043c93 100644
--- a/python/paddle/fluid/tests/unittests/collective_sendrecv_api_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/collective_sendrecv_api_dygraph.py
@@ -37,6 +37,7 @@
 
 
 class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py b/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py
index 18a7aeccf4c15..e19bdab2bb305 100644
--- a/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py
@@ -39,37 +39,36 @@
 
 
 class TestCollectiveSendRecv(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program):
         ring_id = self.global_ring_id
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata",
-                shape=[10, 1000],
-                dtype='float64',
-                append_batch_size=False)
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float64',
+                                  append_batch_size=False)
             if self.rank == 0:
-                main_prog.global_block().append_op(
-                    type="send_v2",
-                    inputs={'X': tindata},
-                    attrs={
-                        'ring_id': ring_id,
-                        'peer': 1,
-                        'use_calc_stream': True
-                    })
+                main_prog.global_block().append_op(type="send_v2",
+                                                   inputs={'X': tindata},
+                                                   attrs={
+                                                       'ring_id': ring_id,
+                                                       'peer': 1,
+                                                       'use_calc_stream': True
+                                                   })
             else:
-                main_prog.global_block().append_op(
-                    type="recv_v2",
-                    outputs={'Out': tindata},
-                    attrs={
-                        'peer': 0,
-                        'ring_id': ring_id,
-                        'dtype': tindata.dtype,
-                        'out_shape': tindata.shape,
-                        'use_calc_stream': True,
-                    })
+                main_prog.global_block().append_op(type="recv_v2",
+                                                   outputs={'Out': tindata},
+                                                   attrs={
+                                                       'peer': 0,
+                                                       'ring_id': ring_id,
+                                                       'dtype': tindata.dtype,
+                                                       'out_shape':
+                                                       tindata.shape,
+                                                       'use_calc_stream': True,
+                                                   })
             return tindata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op_array.py b/python/paddle/fluid/tests/unittests/collective_sendrecv_op_array.py
index 6876a70ce91bc..ee8c4cce738e9 100644
--- a/python/paddle/fluid/tests/unittests/collective_sendrecv_op_array.py
+++ b/python/paddle/fluid/tests/unittests/collective_sendrecv_op_array.py
@@ -39,44 +39,39 @@
 
 
 class TestCollectiveSendRecv(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program):
         ring_id = self.global_ring_id
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata",
-                shape=[10, 1000],
-                dtype='float64',
-                append_batch_size=False)
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float64',
+                                  append_batch_size=False)
             if self.rank == 0:
                 data1 = fluid.layers.assign(
-                    np.array(
-                        [[0, 1, 2]], dtype='float32'))
+                    np.array([[0, 1, 2]], dtype='float32'))
                 data2 = fluid.layers.assign(
-                    np.array(
-                        [[3, 4, 5]], dtype='float32'))
+                    np.array([[3, 4, 5]], dtype='float32'))
             elif self.rank == 1:
                 data1 = fluid.layers.assign(
-                    np.array(
-                        [[3, 4, 5]], dtype='float32'))
+                    np.array([[3, 4, 5]], dtype='float32'))
                 data2 = fluid.layers.assign(
-                    np.array(
-                        [[0, 1, 2]], dtype='float32'))
+                    np.array([[0, 1, 2]], dtype='float32'))
             tensor_array = fluid.layers.create_array(dtype='float32')
             i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
             fluid.layers.array_write(data1, i, tensor_array)
             fluid.layers.array_write(data2, i + 1, tensor_array)
             if self.rank == 0:
-                main_prog.global_block().append_op(
-                    type="send_v2",
-                    inputs={'X': tensor_array},
-                    attrs={
-                        'ring_id': ring_id,
-                        'peer': 1,
-                        'use_calc_stream': True
-                    })
+                main_prog.global_block().append_op(type="send_v2",
+                                                   inputs={'X': tensor_array},
+                                                   attrs={
+                                                       'ring_id': ring_id,
+                                                       'peer': 1,
+                                                       'use_calc_stream': True
+                                                   })
             else:
                 main_prog.global_block().append_op(
                     type="recv_v2",
diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py b/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py
index 093af635f44f6..45f349ed28545 100644
--- a/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py
+++ b/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py
@@ -39,39 +39,38 @@
 
 
 class TestCollectiveSendRecvDynamicShape(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program):
         ring_id = self.global_ring_id
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata",
-                shape=[10, 1000],
-                dtype='float64',
-                append_batch_size=False)
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float64',
+                                  append_batch_size=False)
             if self.rank == 0:
-                main_prog.global_block().append_op(
-                    type="send_v2",
-                    inputs={'X': tindata},
-                    attrs={
-                        'ring_id': ring_id,
-                        'peer': 1,
-                        'use_calc_stream': True,
-                        'dynamic_shape': True
-                    })
+                main_prog.global_block().append_op(type="send_v2",
+                                                   inputs={'X': tindata},
+                                                   attrs={
+                                                       'ring_id': ring_id,
+                                                       'peer': 1,
+                                                       'use_calc_stream': True,
+                                                       'dynamic_shape': True
+                                                   })
             else:
-                main_prog.global_block().append_op(
-                    type="recv_v2",
-                    outputs={'Out': tindata},
-                    attrs={
-                        'peer': 0,
-                        'ring_id': ring_id,
-                        'dtype': tindata.dtype,
-                        'out_shape': tindata.shape,
-                        'use_calc_stream': True,
-                        'dynamic_shape': True
-                    })
+                main_prog.global_block().append_op(type="recv_v2",
+                                                   outputs={'Out': tindata},
+                                                   attrs={
+                                                       'peer': 0,
+                                                       'ring_id': ring_id,
+                                                       'dtype': tindata.dtype,
+                                                       'out_shape':
+                                                       tindata.shape,
+                                                       'use_calc_stream': True,
+                                                       'dynamic_shape': True
+                                                   })
             return tindata
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_split_op.py b/python/paddle/fluid/tests/unittests/collective_split_op.py
index 553955354fe02..f899d82d89775 100644
--- a/python/paddle/fluid/tests/unittests/collective_split_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_split_op.py
@@ -38,6 +38,7 @@
 
 
 class TestCollectiveAllGather(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,23 +46,23 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofsplit",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_split",
-                inputs={'X': tindata},
-                attrs={
-                    'ring_id': ring_id,
-                    'rank': self.rank,
-                    'nranks': nranks
-                },
-                outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_split",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'rank': self.rank,
+                                                   'nranks': nranks
+                                               },
+                                               outputs={'Out': toutdata})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py
index 815018dc4b2f4..b9ebbdc3807a6 100644
--- a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py
+++ b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py
@@ -41,6 +41,7 @@
 
 
 class TestColumnParallelLinearAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -50,8 +51,9 @@ def get_model(self, main_prog, startup_program, rank):
             np.random.seed(2020)
             np_array = np.random.rand(1000, 16)
 
-            data = paddle.static.data(
-                name='tindata', shape=[10, 1000], dtype="float32")
+            data = paddle.static.data(name='tindata',
+                                      shape=[10, 1000],
+                                      dtype="float32")
             paddle.distributed.broadcast(data, src=0)
             if rank == 0:
                 param_attr = paddle.fluid.ParamAttr(
@@ -69,7 +71,8 @@ def get_model(self, main_prog, startup_program, rank):
                 axis=1,
                 num_partitions=2,
                 weight_attr=param_attr,
-                bias_attr=True, )
+                bias_attr=True,
+            )
 
             return [linear_out]
 
diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
index 815e77896ed6d..f96c4589b26cc 100644
--- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
+++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
@@ -63,10 +63,12 @@ def load_lr_input_record(sent):
 
 
 class CtrReader(object):
+
     def __init__(self):
         pass
 
     def _reader_creator(self, filelist):
+
         def get_rand(low=0.0, high=1.0):
             return random.random()
 
@@ -85,7 +87,9 @@ def reader():
 
 
 class DatasetCtrReader(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def get_rand(low=0.0, high=1.0):
             return random.random()
 
@@ -113,8 +117,7 @@ def prepare_data():
         lines = f.readlines()
     err_info = "wrong meta format"
     assert len(lines) == 2, err_info
-    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
-        1], err_info
+    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[1], err_info
     res = map(int, [_.split(':')[1] for _ in lines])
     res = list(res)
     dnn_input_dim = res[0]
@@ -195,8 +198,8 @@ def prepare_fake_data(file_nums=4, file_lines=500):
             for line_index in range(file_lines - 1):
                 file_str += gen_fake_line()
             fin.write(file_str)
-            warnings.warn("Write done ctr_train_data_part_{}".format(
-                file_index))
+            warnings.warn(
+                "Write done ctr_train_data_part_{}".format(file_index))
 
     file_list = [os.path.join(file_dir, x) for x in os.listdir(file_dir)]
     assert len(file_list) == file_nums
diff --git a/python/paddle/fluid/tests/unittests/decorator_helper.py b/python/paddle/fluid/tests/unittests/decorator_helper.py
index 1a5f4540cf033..20e1e49b5e85b 100644
--- a/python/paddle/fluid/tests/unittests/decorator_helper.py
+++ b/python/paddle/fluid/tests/unittests/decorator_helper.py
@@ -20,7 +20,9 @@
 
 
 def many_times(times):
+
     def __impl__(fn):
+
         def __fn__(*args, **kwargs):
             for _ in range(times):
                 fn(*args, **kwargs)
@@ -31,7 +33,9 @@ def __fn__(*args, **kwargs):
 
 
 def prog_scope():
+
     def __impl__(fn):
+
         def __fn__(*args, **kwargs):
             prog = fluid.Program()
             startup_prog = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/detected_gpu.py b/python/paddle/fluid/tests/unittests/detected_gpu.py
index 8abd44aff71e2..28e0cc7876030 100644
--- a/python/paddle/fluid/tests/unittests/detected_gpu.py
+++ b/python/paddle/fluid/tests/unittests/detected_gpu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,8 +19,8 @@
 print("compile with cuda:", fluid.core.is_compiled_with_cuda())
 print("get_cuda_device_count:", fluid.core.get_cuda_device_count())
 
-if fluid.core.is_compiled_with_cuda() and fluid.core.get_cuda_device_count(
-) > 0:
+if fluid.core.is_compiled_with_cuda(
+) and fluid.core.get_cuda_device_count() > 0:
     sys.exit(0)
 else:
     sys.exit(1)
diff --git a/python/paddle/fluid/tests/unittests/detected_xpu.py b/python/paddle/fluid/tests/unittests/detected_xpu.py
index d7b6f58c94144..a1b4b2ec1ecff 100644
--- a/python/paddle/fluid/tests/unittests/detected_xpu.py
+++ b/python/paddle/fluid/tests/unittests/detected_xpu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
index de52072d4a838..1360d975603b2 100644
--- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
@@ -75,6 +75,7 @@ def cnn_model(data):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, single_device=False):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -87,16 +88,17 @@ def get_model(self, batch_size=2, single_device=False):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         # Optimization
         # TODO(typhoonzero): fix distributed adam optimizer
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py
index c5aae1eef180e..6cd452ed1952a 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
@@ -30,28 +30,26 @@
 
 
 class TestDistCTR2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2):
 
         dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
         """ network definition """
-        dnn_data = fluid.layers.data(
-            name="dnn_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        lr_data = fluid.layers.data(
-            name="lr_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        label = fluid.layers.data(
-            name="click",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=0,
-            append_batch_size=False)
+        dnn_data = fluid.layers.data(name="dnn_data",
+                                     shape=[-1, 1],
+                                     dtype="int64",
+                                     lod_level=1,
+                                     append_batch_size=False)
+        lr_data = fluid.layers.data(name="lr_data",
+                                    shape=[-1, 1],
+                                    dtype="int64",
+                                    lod_level=1,
+                                    append_batch_size=False)
+        label = fluid.layers.data(name="click",
+                                  shape=[-1, 1],
+                                  dtype="int64",
+                                  lod_level=0,
+                                  append_batch_size=False)
 
         # build dnn model
         dnn_layer_dims = [128, 64, 32, 1]
@@ -63,8 +61,8 @@ def get_model(self, batch_size=2):
                 name="deep_embedding",
                 initializer=fluid.initializer.Constant(value=0.01)),
             is_sparse=IS_SPARSE)
-        dnn_pool = fluid.layers.sequence_pool(
-            input=dnn_embedding, pool_type="sum")
+        dnn_pool = fluid.layers.sequence_pool(input=dnn_embedding,
+                                              pool_type="sum")
         dnn_out = dnn_pool
         for i, dim in enumerate(dnn_layer_dims[1:]):
             fc = fluid.layers.fc(
@@ -106,11 +104,10 @@ def get_model(self, batch_size=2):
         use_lr_decay = bool(os.getenv('LR_DECAY', 0))
         lr = 0.0001
         if use_lr_decay:
-            lr = fluid.layers.exponential_decay(
-                learning_rate=0.0001,
-                decay_steps=10000,
-                decay_rate=0.999,
-                staircase=True)
+            lr = fluid.layers.exponential_decay(learning_rate=0.0001,
+                                                decay_steps=10000,
+                                                decay_rate=0.999,
+                                                staircase=True)
 
         sgd_optimizer = fluid.optimizer.SGD(learning_rate=lr,
                                             regularization=regularization)
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
index c030afdd4ff9b..4bc231e4eafe6 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
@@ -109,6 +109,7 @@ def load_lr_input_record(sent):
 
 
 class Dataset(object):
+
     def train(self):
         '''
         Load trainset.
@@ -163,8 +164,7 @@ def load_data_meta():
     lines = read_data('data.meta.txt')
     err_info = "wrong meta format"
     assert len(lines) == 2, err_info
-    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[
-        1], err_info
+    assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[1], err_info
     res = map(int, [_.split(':')[1] for _ in lines])
     res = list(res)
     logger.info('dnn input dim: %d' % res[0])
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index be5118f0acc18..9508dc6c26292 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -39,6 +39,7 @@
 
 
 def fake_ctr_reader():
+
     def reader():
         for _ in range(1000):
             deep = np.random.random_integers(0, 1e5 - 1, size=16).tolist()
@@ -66,40 +67,36 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01):
         """
         dnn_input_dim, lr_input_dim = int(1e5), int(1e5)
 
-        dnn_data = fluid.layers.data(
-            name="dnn_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        lr_data = fluid.layers.data(
-            name="lr_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        label = fluid.layers.data(
-            name="click",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=0,
-            append_batch_size=False)
+        dnn_data = fluid.layers.data(name="dnn_data",
+                                     shape=[-1, 1],
+                                     dtype="int64",
+                                     lod_level=1,
+                                     append_batch_size=False)
+        lr_data = fluid.layers.data(name="lr_data",
+                                    shape=[-1, 1],
+                                    dtype="int64",
+                                    lod_level=1,
+                                    append_batch_size=False)
+        label = fluid.layers.data(name="click",
+                                  shape=[-1, 1],
+                                  dtype="int64",
+                                  lod_level=0,
+                                  append_batch_size=False)
 
         datas = [dnn_data, lr_data, label]
 
         if args.reader == "pyreader":
             if is_train:
-                self.reader = fluid.io.PyReader(
-                    feed_list=datas,
-                    capacity=64,
-                    iterable=False,
-                    use_double_buffer=False)
+                self.reader = fluid.io.PyReader(feed_list=datas,
+                                                capacity=64,
+                                                iterable=False,
+                                                use_double_buffer=False)
             else:
-                self.test_reader = fluid.io.PyReader(
-                    feed_list=datas,
-                    capacity=64,
-                    iterable=False,
-                    use_double_buffer=False)
+                self.test_reader = fluid.io.PyReader(feed_list=datas,
+                                                     capacity=64,
+                                                     iterable=False,
+                                                     use_double_buffer=False)
+
 
 # build dnn model
         dnn_layer_dims = [128, 128, 64, 32, 1]
@@ -112,8 +109,8 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01):
                 initializer=fluid.initializer.Constant(value=0.01)),
             is_sparse=True,
             padding_idx=0)
-        dnn_pool = fluid.layers.sequence_pool(
-            input=dnn_embedding, pool_type="sum")
+        dnn_pool = fluid.layers.sequence_pool(input=dnn_embedding,
+                                              pool_type="sum")
         dnn_out = dnn_pool
         for i, dim in enumerate(dnn_layer_dims[1:]):
             fc = fluid.layers.fc(
@@ -186,8 +183,8 @@ def do_distributed_testing(self, fleet):
                 loss_val = exe.run(program=paddle.static.default_main_program(),
                                    fetch_list=[self.avg_cost.name])
                 loss_val = np.mean(loss_val)
-                message = "TEST ---> batch_idx: {} loss: {}\n".format(batch_idx,
-                                                                      loss_val)
+                message = "TEST ---> batch_idx: {} loss: {}\n".format(
+                    batch_idx, loss_val)
                 fleet.util.print_on_rank(message, 0)
         except fluid.core.EOFException:
             self.test_reader.reset()
@@ -223,8 +220,8 @@ def do_pyreader_training(self, fleet):
                     #       np.array(loss_val), mode="sum")
                     #   loss_all_trainer = fleet.util.all_gather(float(loss_val))
                     #   loss_val = float(reduce_output) / len(loss_all_trainer)
-                    message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
-                                                                      loss_val)
+                    message = "TRAIN ---> pass: {} loss: {}\n".format(
+                        epoch_id, loss_val)
                     fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
@@ -236,8 +233,9 @@ def do_pyreader_training(self, fleet):
             fleet.save_persistables(exe, dirname=dirname)
 
         model_dir = tempfile.mkdtemp()
-        fleet.save_inference_model(
-            exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
+        fleet.save_inference_model(exe, model_dir,
+                                   [feed.name for feed in self.feeds],
+                                   self.avg_cost)
         self.check_model_right(model_dir)
         shutil.rmtree(model_dir)
 
@@ -256,24 +254,22 @@ def do_dataset_training_queuedataset(self, fleet):
         dataset = paddle.distributed.QueueDataset()
         pipe_command = 'python ctr_dataset_reader.py'
 
-        dataset.init(
-            batch_size=batch_size,
-            use_var=self.feeds,
-            pipe_command=pipe_command,
-            thread_num=thread_num)
+        dataset.init(batch_size=batch_size,
+                     use_var=self.feeds,
+                     pipe_command=pipe_command,
+                     thread_num=thread_num)
 
         dataset.set_filelist(filelist)
 
         for epoch_id in range(1):
             pass_start = time.time()
             dataset.set_filelist(filelist)
-            exe.train_from_dataset(
-                program=fluid.default_main_program(),
-                dataset=dataset,
-                fetch_list=[self.avg_cost],
-                fetch_info=["cost"],
-                print_period=2,
-                debug=int(os.getenv("Debug", "0")))
+            exe.train_from_dataset(program=fluid.default_main_program(),
+                                   dataset=dataset,
+                                   fetch_list=[self.avg_cost],
+                                   fetch_info=["cost"],
+                                   print_period=2,
+                                   debug=int(os.getenv("Debug", "0")))
             pass_time = time.time() - pass_start
 
         if os.getenv("SAVE_MODEL") == "1":
@@ -317,13 +313,12 @@ def do_dataset_training(self, fleet):
 
         for epoch_id in range(1):
             pass_start = time.time()
-            exe.train_from_dataset(
-                program=fluid.default_main_program(),
-                dataset=dataset,
-                fetch_list=[self.avg_cost],
-                fetch_info=["cost"],
-                print_period=2,
-                debug=int(os.getenv("Debug", "0")))
+            exe.train_from_dataset(program=fluid.default_main_program(),
+                                   dataset=dataset,
+                                   fetch_list=[self.avg_cost],
+                                   fetch_info=["cost"],
+                                   print_period=2,
+                                   debug=int(os.getenv("Debug", "0")))
             pass_time = time.time() - pass_start
         dataset.release_memory()
 
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
index 8b3d49a741a95..4ecad3e97c676 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -75,12 +75,12 @@ def do_pyreader_training(self, fleet):
                     loss_val = exe.run(program=fleet.main_program,
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
-                    reduce_output = fleet.util.all_reduce(
-                        np.array(loss_val), mode="sum")
+                    reduce_output = fleet.util.all_reduce(np.array(loss_val),
+                                                          mode="sum")
                     loss_all_trainer = fleet.util.all_gather(float(loss_val))
                     loss_val = float(reduce_output) / len(loss_all_trainer)
-                    message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
-                                                                      loss_val)
+                    message = "TRAIN ---> pass: {} loss: {}\n".format(
+                        epoch_id, loss_val)
                     fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
@@ -88,8 +88,9 @@ def do_pyreader_training(self, fleet):
                 self.reader.reset()
 
         model_dir = tempfile.mkdtemp()
-        fleet.save_inference_model(
-            exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
+        fleet.save_inference_model(exe, model_dir,
+                                   [feed.name for feed in self.feeds],
+                                   self.avg_cost)
         self.check_model_right(model_dir)
         if fleet.is_first_worker():
             fleet.save_persistables(executor=exe, dirname=model_dir)
@@ -125,13 +126,12 @@ def do_dataset_training(self, fleet):
         for epoch_id in range(1):
             pass_start = time.time()
             dataset.set_filelist(filelist)
-            exe.train_from_dataset(
-                program=fleet.main_program,
-                dataset=dataset,
-                fetch_list=[self.avg_cost],
-                fetch_info=["cost"],
-                print_period=2,
-                debug=int(os.getenv("Debug", "0")))
+            exe.train_from_dataset(program=fleet.main_program,
+                                   dataset=dataset,
+                                   fetch_list=[self.avg_cost],
+                                   fetch_info=["cost"],
+                                   print_period=2,
+                                   debug=int(os.getenv("Debug", "0")))
             pass_time = time.time() - pass_start
 
         if os.getenv("SAVE_MODEL") == "1":
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py
index 7e811408291a0..d3cf735808d77 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py
@@ -23,6 +23,7 @@
 import paddle.distributed.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+
 logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py
index c6c2537b42c18..f714526286c92 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py
@@ -55,24 +55,21 @@ def net(self, args, batch_size=4, lr=0.01):
         dnn_input_dim, lr_input_dim = int(1e5), int(1e5)
 
         with fluid.device_guard("cpu"):
-            dnn_data = fluid.layers.data(
-                name="dnn_data",
-                shape=[-1, 1],
-                dtype="int64",
-                lod_level=1,
-                append_batch_size=False)
-            lr_data = fluid.layers.data(
-                name="lr_data",
-                shape=[-1, 1],
-                dtype="int64",
-                lod_level=1,
-                append_batch_size=False)
-            label = fluid.layers.data(
-                name="click",
-                shape=[-1, 1],
-                dtype="float32",
-                lod_level=0,
-                append_batch_size=False)
+            dnn_data = fluid.layers.data(name="dnn_data",
+                                         shape=[-1, 1],
+                                         dtype="int64",
+                                         lod_level=1,
+                                         append_batch_size=False)
+            lr_data = fluid.layers.data(name="lr_data",
+                                        shape=[-1, 1],
+                                        dtype="int64",
+                                        lod_level=1,
+                                        append_batch_size=False)
+            label = fluid.layers.data(name="click",
+                                      shape=[-1, 1],
+                                      dtype="float32",
+                                      lod_level=0,
+                                      append_batch_size=False)
 
             datas = [dnn_data, lr_data, label]
 
@@ -86,8 +83,8 @@ def net(self, args, batch_size=4, lr=0.01):
                     name="deep_embedding",
                     initializer=fluid.initializer.Constant(value=0.01)),
                 is_sparse=True)
-            dnn_pool = fluid.layers.sequence_pool(
-                input=dnn_embedding, pool_type="sum")
+            dnn_pool = fluid.layers.sequence_pool(input=dnn_embedding,
+                                                  pool_type="sum")
             dnn_out = dnn_pool
 
             # build lr model
@@ -99,8 +96,8 @@ def net(self, args, batch_size=4, lr=0.01):
                     name="wide_embedding",
                     initializer=fluid.initializer.Constant(value=0.01)),
                 is_sparse=True)
-            lr_pool = fluid.layers.sequence_pool(
-                input=lr_embbding, pool_type="sum")
+            lr_pool = fluid.layers.sequence_pool(input=lr_embbding,
+                                                 pool_type="sum")
 
         with fluid.device_guard("gpu"):
             for i, dim in enumerate(dnn_layer_dims[1:]):
@@ -144,8 +141,8 @@ def do_dataset_training(self, fleet):
         train_file_list = ctr_dataset_reader.prepare_fake_data()
 
         exe = fluid.Executor(fluid.CPUPlace())
-        real_program = fluid.default_main_program()._heter_pipeline_opt[
-            "section_program"]
+        real_program = fluid.default_main_program(
+        )._heter_pipeline_opt["section_program"]
         print(real_program)
 
         exe.run(fluid.default_startup_program())
@@ -170,13 +167,12 @@ def do_dataset_training(self, fleet):
         for epoch_id in range(1):
             pass_start = time.time()
             dataset.set_filelist(filelist)
-            exe.train_from_dataset(
-                program=fluid.default_main_program(),
-                dataset=dataset,
-                fetch_list=[self.avg_cost],
-                fetch_info=["cost"],
-                print_period=2,
-                debug=int(os.getenv("Debug", "0")))
+            exe.train_from_dataset(program=fluid.default_main_program(),
+                                   dataset=dataset,
+                                   fetch_list=[self.avg_cost],
+                                   fetch_info=["cost"],
+                                   print_period=2,
+                                   debug=int(os.getenv("Debug", "0")))
             pass_time = time.time() - pass_start
             print("do_dataset_training done. using time {}".format(pass_time))
         exe.close()
@@ -186,20 +182,19 @@ def do_dataset_heter_training(self, fleet):
         exe = fluid.Executor()
         exe.run(fluid.default_startup_program())
         fleet.init_worker()
-        real_program = fluid.default_main_program()._heter_pipeline_opt[
-            "section_program"]
+        real_program = fluid.default_main_program(
+        )._heter_pipeline_opt["section_program"]
         print(real_program)
 
         thread_num = int(os.getenv("CPU_NUM", 2))
         batch_size = 128
 
         pass_start = time.time()
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            fetch_list=[self.avg_cost],
-            fetch_info=["cost"],
-            print_period=2,
-            debug=int(os.getenv("Debug", "0")))
+        exe.train_from_dataset(program=fluid.default_main_program(),
+                               fetch_list=[self.avg_cost],
+                               fetch_info=["cost"],
+                               print_period=2,
+                               debug=int(os.getenv("Debug", "0")))
         exe.close()
         pass_time = time.time() - pass_start
         print("do_dataset_heter_training done. using time {}".format(pass_time))
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py
index 575c07390a35b..19e278b4f4620 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py
@@ -67,6 +67,7 @@ def cnn_model(data):
 
 
 class TestFleetMetaOptimizerPrecision(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, single_device=False):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -79,16 +80,17 @@ def get_model(self, batch_size=2, single_device=False):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         test_program = fluid.default_main_program().clone(for_test=True)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         optimizer = paddle.fluid.optimizer.Adam(0.01)
         if single_device:
@@ -98,8 +100,8 @@ def get_model(self, batch_size=2, single_device=False):
             fleet.init(role)
             strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.without_graph_optimization = True
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
         return test_program, avg_cost, train_reader, test_reader, batch_acc, predict
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py
index aaf33d04e6b33..cab4484d3e49c 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py
@@ -67,6 +67,7 @@ def cnn_model(data):
 
 
 class TestFleetMetaOptimizerFuseAllReducePrecision(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, single_device=False):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -79,16 +80,17 @@ def get_model(self, batch_size=2, single_device=False):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         test_program = fluid.default_main_program().clone(for_test=True)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         optimizer = paddle.fluid.optimizer.Adam(0.01)
         if single_device:
@@ -101,8 +103,8 @@ def get_model(self, batch_size=2, single_device=False):
             strategy.fuse_all_reduce_ops = True
             strategy._calc_comm_same_stream = False
             strategy.fuse_grad_size_in_num = 8
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
         return test_program, avg_cost, train_reader, test_reader, batch_acc, predict
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
index cfd9887f3323e..4a43fb44f46f7 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -54,6 +54,7 @@
 
 
 def fake_simnet_reader():
+
     def reader():
         for _ in range(1000):
             q = np.random.random_integers(0, 1500 - 1, size=1).tolist()
@@ -69,24 +70,27 @@ def get_acc(cos_q_nt, cos_q_pt, batch_size):
     cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
     cond = fluid.layers.cast(cond, dtype='float64')
     cond_3 = fluid.layers.reduce_sum(cond)
-    acc = fluid.layers.elementwise_div(
-        cond_3,
-        fluid.layers.fill_constant(
-            shape=[1], value=batch_size * 1.0, dtype='float64'),
-        name="simnet_acc")
+    acc = fluid.layers.elementwise_div(cond_3,
+                                       fluid.layers.fill_constant(
+                                           shape=[1],
+                                           value=batch_size * 1.0,
+                                           dtype='float64'),
+                                       name="simnet_acc")
     return acc
 
 
 def get_loss(cos_q_pt, cos_q_nt):
     loss_op1 = fluid.layers.elementwise_sub(
-        fluid.layers.fill_constant_batch_size_like(
-            input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'),
-        cos_q_pt)
+        fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                   shape=[-1, 1],
+                                                   value=margin,
+                                                   dtype='float32'), cos_q_pt)
     loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
     loss_op3 = fluid.layers.elementwise_max(
-        fluid.layers.fill_constant_batch_size_like(
-            input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
-        loss_op2)
+        fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                   shape=[-1, 1],
+                                                   value=0.0,
+                                                   dtype='float32'), loss_op2)
     avg_cost = fluid.layers.mean(loss_op3)
     return avg_cost
 
@@ -97,26 +101,31 @@ def train_network(batch_size,
                   is_self_contained_lr=False,
                   is_pyreader=False):
     # query
-    q = fluid.layers.data(
-        name="query_ids", shape=[1], dtype="int64", lod_level=1)
+    q = fluid.layers.data(name="query_ids",
+                          shape=[1],
+                          dtype="int64",
+                          lod_level=1)
     # label data
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
     # pt
-    pt = fluid.layers.data(
-        name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+    pt = fluid.layers.data(name="pos_title_ids",
+                           shape=[1],
+                           dtype="int64",
+                           lod_level=1)
     # nt
-    nt = fluid.layers.data(
-        name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+    nt = fluid.layers.data(name="neg_title_ids",
+                           shape=[1],
+                           dtype="int64",
+                           lod_level=1)
 
     datas = [q, label, pt, nt]
 
     reader = None
     if is_pyreader:
-        reader = fluid.io.PyReader(
-            feed_list=datas,
-            capacity=64,
-            iterable=False,
-            use_double_buffer=False)
+        reader = fluid.io.PyReader(feed_list=datas,
+                                   capacity=64,
+                                   iterable=False,
+                                   use_double_buffer=False)
 
     # embedding
     q_emb = fluid.embedding(
@@ -137,7 +146,8 @@ def train_network(batch_size,
         param_attr=fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.01),
             name="__q_fc__",
-            learning_rate=base_lr), )
+            learning_rate=base_lr),
+    )
 
     # embedding
     pt_emb = fluid.embedding(
@@ -235,8 +245,8 @@ def do_pyreader_training(self, fleet):
                     loss_val = exe.run(program=fluid.default_main_program(),
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
-                    message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
-                                                                      loss_val)
+                    message = "TRAIN ---> pass: {} loss: {}\n".format(
+                        epoch_id, loss_val)
                     fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
index 4e21d11561233..60b8a7bb6fdff 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
@@ -30,6 +30,7 @@
 
 
 def fake_ctr_reader():
+
     def reader():
         for _ in range(1000):
             deep = np.random.random_integers(0, 1e10, size=16).tolist()
@@ -57,33 +58,29 @@ def net(self, args, batch_size=4, lr=0.01):
         """
         dnn_input_dim, lr_input_dim = 10, 10
 
-        dnn_data = fluid.layers.data(
-            name="dnn_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        lr_data = fluid.layers.data(
-            name="lr_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        label = fluid.layers.data(
-            name="click",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=0,
-            append_batch_size=False)
+        dnn_data = fluid.layers.data(name="dnn_data",
+                                     shape=[-1, 1],
+                                     dtype="int64",
+                                     lod_level=1,
+                                     append_batch_size=False)
+        lr_data = fluid.layers.data(name="lr_data",
+                                    shape=[-1, 1],
+                                    dtype="int64",
+                                    lod_level=1,
+                                    append_batch_size=False)
+        label = fluid.layers.data(name="click",
+                                  shape=[-1, 1],
+                                  dtype="int64",
+                                  lod_level=0,
+                                  append_batch_size=False)
 
         datas = [dnn_data, lr_data, label]
 
         if args.reader == "pyreader":
-            self.reader = fluid.io.PyReader(
-                feed_list=datas,
-                capacity=64,
-                iterable=False,
-                use_double_buffer=False)
+            self.reader = fluid.io.PyReader(feed_list=datas,
+                                            capacity=64,
+                                            iterable=False,
+                                            use_double_buffer=False)
 
         # build dnn model
         initializer = int(os.getenv("INITIALIZER", "0"))
@@ -105,10 +102,9 @@ def net(self, args, batch_size=4, lr=0.01):
             size=[dnn_input_dim, dnn_layer_dims[0]],
             is_test=inference,
             entry=entry,
-            param_attr=fluid.ParamAttr(
-                name="deep_embedding", initializer=init))
-        dnn_pool = fluid.layers.sequence_pool(
-            input=dnn_embedding, pool_type="sum")
+            param_attr=fluid.ParamAttr(name="deep_embedding", initializer=init))
+        dnn_pool = fluid.layers.sequence_pool(input=dnn_embedding,
+                                              pool_type="sum")
         dnn_out = dnn_pool
         for i, dim in enumerate(dnn_layer_dims[1:]):
             fc = fluid.layers.fc(
@@ -170,8 +166,8 @@ def do_pyreader_training(self, fleet):
                     loss_val = exe.run(program=fluid.default_main_program(),
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
-                    print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
-                                                                  loss_val))
+                    print("TRAIN ---> pass: {} loss: {}\n".format(
+                        epoch_id, loss_val))
             except fluid.core.EOFException:
                 self.reader.reset()
 
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index f63139464e755..cdfec08f9fe7a 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -76,6 +76,7 @@ def cnn_model(data):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -88,8 +89,9 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         # Optimization
@@ -99,18 +101,19 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         if not use_dgc:
             opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
         else:
-            opt = fluid.optimizer.DGCMomentumOptimizer(
-                learning_rate=self.lr, momentum=0.9, rampup_begin_step=2)
+            opt = fluid.optimizer.DGCMomentumOptimizer(learning_rate=self.lr,
+                                                       momentum=0.9,
+                                                       rampup_begin_step=2)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=dist_strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=dist_strategy)
             _, param_grads = dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
index d386e75fd887a..ca59e33ec9e12 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
@@ -49,6 +49,7 @@ def test_merge_reader(repeat_batch_size=8):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -61,8 +62,9 @@ def get_model(self, batch_size=2):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         # Optimization
@@ -70,8 +72,8 @@ def get_model(self, batch_size=2):
 
         # Reader
         train_reader = paddle.batch(test_merge_reader, batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         opt.minimize(avg_cost)
         return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
 
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
index 3198c6cac86c2..b78dd744a9ae1 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
@@ -29,6 +29,7 @@
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -41,20 +42,21 @@ def get_model(self, batch_size=2):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         # Optimization
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = FP16AllReduce(opt)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         opt.minimize(avg_cost)
         return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
 
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py
index 66ea24e0bde2d..50a053f57b801 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py
@@ -28,6 +28,7 @@
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -40,20 +41,21 @@ def get_model(self, batch_size=2):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         # Optimization
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = fluid.optimizer.GradientMergeOptimizer(opt, 2)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         opt.minimize(avg_cost)
         return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
 
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge_raw_optimizer.py b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge_raw_optimizer.py
index 733c4267db613..ff31a7016a673 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge_raw_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge_raw_optimizer.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class TestDistMnistGradientMergeRawOptimizer(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, single_device=False):
         paddle.enable_static()
         paddle.seed(1)
@@ -53,8 +54,9 @@ def get_model(self, batch_size=2, single_device=False):
         strategy.without_graph_optimization = True
 
         fleet.init(is_collective=True, strategy=strategy)
-        image = paddle.static.data(
-            name='image', shape=[None, 1, 28, 28], dtype="float32")
+        image = paddle.static.data(name='image',
+                                   shape=[None, 1, 28, 28],
+                                   dtype="float32")
         label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
         predict = cnn_model(image)
         acc = paddle.metric.accuracy(predict, label)
@@ -86,10 +88,10 @@ def get_model(self, batch_size=2, single_device=False):
             else:
                 assert start_allreduce_idx == 1
 
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         return test_program, cost, train_reader, test_reader, acc, predict
 
 
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
index 977e17c37f767..31362565c8981 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
@@ -40,6 +40,7 @@
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2):
         # Input data
         images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
@@ -52,19 +53,20 @@ def get_model(self, batch_size=2):
 
         # Evaluator
         batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-        batch_acc = fluid.layers.accuracy(
-            input=predict, label=label, total=batch_size_tensor)
+        batch_acc = fluid.layers.accuracy(input=predict,
+                                          label=label,
+                                          total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         # Optimization
-        opt = fluid.optimizer.LarsMomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.LarsMomentumOptimizer(learning_rate=0.001,
+                                                    momentum=0.9)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
         opt.minimize(avg_cost)
         return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
 
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
index dd010e962e2a9..175b100990bd0 100644
--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -40,7 +40,9 @@
 
 
 class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
+
     def _load_persistable_vars(self, executor, dirname, program):
+
         def _is_checkpoint_var(var):
             """
             the checkpoint will not save or load all the variables.
@@ -68,20 +70,18 @@ def _is_checkpoint_var(var):
 
             return var.persistable
 
-        io.load_vars(
-            executor,
-            dirname=dirname,
-            main_program=program,
-            predicate=_is_checkpoint_var,
-            filename=None)
+        io.load_vars(executor,
+                     dirname=dirname,
+                     main_program=program,
+                     predicate=_is_checkpoint_var,
+                     filename=None)
 
     def run_pserver(self, args):
         self.get_model(batch_size=2)
         # NOTE: pserver should not call memory optimize
-        t = self.get_transpiler(args.trainer_id,
-                                fluid.default_main_program(), args.endpoints,
-                                args.trainers, args.sync_mode, False,
-                                args.current_endpoint)
+        t = self.get_transpiler(args.trainer_id, fluid.default_main_program(),
+                                args.endpoints, args.trainers, args.sync_mode,
+                                False, args.current_endpoint)
         pserver_prog = t.get_pserver_program(args.current_endpoint)
         startup_prog = t.get_startup_program(args.current_endpoint,
                                              pserver_prog)
@@ -130,11 +130,10 @@ def run_trainer(self, args):
         else:
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
 
-        exe = fluid.ParallelExecutor(
-            args.use_cuda,
-            loss_name=avg_cost.name,
-            exec_strategy=strategy,
-            build_strategy=build_stra)
+        exe = fluid.ParallelExecutor(args.use_cuda,
+                                     loss_name=avg_cost.name,
+                                     exec_strategy=strategy,
+                                     build_strategy=build_stra)
 
         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -167,8 +166,8 @@ def get_data():
                 if need_save and model_dir:
                     io.save_persistables(startup_exe, model_dir, trainer_prog)
 
-            var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor(
-            ))
+            var = np.array(
+                fluid.global_scope().find_var('__fc_b__').get_tensor())
             sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))
 
         elif save_mode == "DIST":
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
index 5ba40c7c8388c..ad5d632637ebb 100644
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -49,6 +49,7 @@
 
 
 class SE_ResNeXt():
+
     def __init__(self, layers=50):
         self.params = train_parameters
         self.layers = layers
@@ -64,56 +65,53 @@ def net(self, input, class_dim=1000):
             depth = [3, 4, 6, 3]
             num_filters = [128, 256, 512, 1024]
 
-            conv = self.conv_bn_layer(
-                input=input,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            conv = fluid.layers.pool2d(
-                input=conv,
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
+            conv = self.conv_bn_layer(input=input,
+                                      num_filters=64,
+                                      filter_size=7,
+                                      stride=2,
+                                      act='relu')
+            conv = fluid.layers.pool2d(input=conv,
+                                       pool_size=3,
+                                       pool_stride=2,
+                                       pool_padding=1,
+                                       pool_type='max')
         elif layers == 101:
             cardinality = 32
             reduction_ratio = 16
             depth = [3, 4, 23, 3]
             num_filters = [128, 256, 512, 1024]
 
-            conv = self.conv_bn_layer(
-                input=input,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            conv = fluid.layers.pool2d(
-                input=conv,
-                pool_size=3,
-                pool_stride=2,
-                pool_padding=1,
-                pool_type='max')
+            conv = self.conv_bn_layer(input=input,
+                                      num_filters=64,
+                                      filter_size=7,
+                                      stride=2,
+                                      act='relu')
+            conv = fluid.layers.pool2d(input=conv,
+                                       pool_size=3,
+                                       pool_stride=2,
+                                       pool_padding=1,
+                                       pool_type='max')
         elif layers == 152:
             cardinality = 64
             reduction_ratio = 16
             depth = [3, 8, 36, 3]
             num_filters = [128, 256, 512, 1024]
 
-            conv = self.conv_bn_layer(
-                input=input,
-                num_filters=64,
-                filter_size=3,
-                stride=2,
-                act='relu')
-            conv = self.conv_bn_layer(
-                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
-            conv = self.conv_bn_layer(
-                input=conv,
-                num_filters=128,
-                filter_size=3,
-                stride=1,
-                act='relu')
+            conv = self.conv_bn_layer(input=input,
+                                      num_filters=64,
+                                      filter_size=3,
+                                      stride=2,
+                                      act='relu')
+            conv = self.conv_bn_layer(input=conv,
+                                      num_filters=64,
+                                      filter_size=3,
+                                      stride=1,
+                                      act='relu')
+            conv = self.conv_bn_layer(input=conv,
+                                      num_filters=128,
+                                      filter_size=3,
+                                      stride=1,
+                                      act='relu')
             conv = fluid.layers.pool2d(
                 input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
                 pool_type='max')
@@ -127,16 +125,18 @@ def net(self, input, class_dim=1000):
                     cardinality=cardinality,
                     reduction_ratio=reduction_ratio)
 
-        pool = fluid.layers.pool2d(
-            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        pool = fluid.layers.pool2d(input=conv,
+                                   pool_size=7,
+                                   pool_type='avg',
+                                   global_pooling=True)
         drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
         stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
         out = fluid.layers.fc(
             input=drop,
             size=class_dim,
             act='softmax',
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.05)))
         return out
 
     def shortcut(self, input, ch_out, stride):
@@ -149,21 +149,23 @@ def shortcut(self, input, ch_out, stride):
 
     def bottleneck_block(self, input, num_filters, stride, cardinality,
                          reduction_ratio):
-        conv0 = self.conv_bn_layer(
-            input=input, num_filters=num_filters, filter_size=1, act='relu')
-        conv1 = self.conv_bn_layer(
-            input=conv0,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            groups=cardinality,
-            act='relu')
-        conv2 = self.conv_bn_layer(
-            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-        scale = self.squeeze_excitation(
-            input=conv2,
-            num_channels=num_filters * 2,
-            reduction_ratio=reduction_ratio)
+        conv0 = self.conv_bn_layer(input=input,
+                                   num_filters=num_filters,
+                                   filter_size=1,
+                                   act='relu')
+        conv1 = self.conv_bn_layer(input=conv0,
+                                   num_filters=num_filters,
+                                   filter_size=3,
+                                   stride=stride,
+                                   groups=cardinality,
+                                   act='relu')
+        conv2 = self.conv_bn_layer(input=conv1,
+                                   num_filters=num_filters * 2,
+                                   filter_size=1,
+                                   act=None)
+        scale = self.squeeze_excitation(input=conv2,
+                                        num_channels=num_filters * 2,
+                                        reduction_ratio=reduction_ratio)
 
         short = self.shortcut(input, num_filters * 2, stride)
 
@@ -185,37 +187,41 @@ def conv_bn_layer(self,
             groups=groups,
             act=None,
             # avoid pserver CPU init differs from GPU
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.05)),
             bias_attr=False)
         return fluid.layers.batch_norm(input=conv, act=act)
 
     def squeeze_excitation(self, input, num_channels, reduction_ratio):
-        pool = fluid.layers.pool2d(
-            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        pool = fluid.layers.pool2d(input=input,
+                                   pool_size=0,
+                                   pool_type='avg',
+                                   global_pooling=True)
         stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
         squeeze = fluid.layers.fc(
             input=pool,
             size=num_channels // reduction_ratio,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.05)),
             act='relu')
         stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
         excitation = fluid.layers.fc(
             input=squeeze,
             size=num_channels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.05)),
             act='sigmoid')
         scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
         return scale
 
 
 class DistSeResneXt2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False):
         # Input data
-        image = fluid.layers.data(
-            name="data", shape=[3, 224, 224], dtype='float32')
+        image = fluid.layers.data(name="data",
+                                  shape=[3, 224, 224],
+                                  dtype='float32')
         label = fluid.layers.data(name="int64", shape=[1], dtype='int64')
 
         # Train program
@@ -241,24 +247,24 @@ def get_model(self, batch_size=2, use_dgc=False):
 
         if not use_dgc:
             optimizer = fluid.optimizer.Momentum(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr),
+                learning_rate=fluid.layers.piecewise_decay(boundaries=bd,
+                                                           values=lr),
                 momentum=0.9,
                 regularization=fluid.regularizer.L2Decay(1e-4))
         else:
             optimizer = fluid.optimizer.DGCMomentumOptimizer(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr),
+                learning_rate=fluid.layers.piecewise_decay(boundaries=bd,
+                                                           values=lr),
                 momentum=0.9,
                 rampup_begin_step=0,
                 regularization=fluid.regularizer.L2Decay(1e-4))
         optimizer.minimize(avg_cost)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.flowers.test(use_xmap=False),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.flowers.test(use_xmap=False),
+                                   batch_size=batch_size)
 
         return test_program, avg_cost, train_reader, test_reader, acc_top1, out
 
diff --git a/python/paddle/fluid/tests/unittests/dist_sharding_save.py b/python/paddle/fluid/tests/unittests/dist_sharding_save.py
index 7d3d934cb458f..e31901c8c85b9 100755
--- a/python/paddle/fluid/tests/unittests/dist_sharding_save.py
+++ b/python/paddle/fluid/tests/unittests/dist_sharding_save.py
@@ -42,18 +42,20 @@ def runtime_main():
     fleet.init(role)
     with fluid.program_guard(train_prog, startup_prog):
         with fluid.unique_name.guard():
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
 
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
             strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -64,10 +66,10 @@ def runtime_main():
                 "sharding_degree": 2,
             }
 
-            optimizer = paddle.fluid.optimizer.Momentum(
-                learning_rate=0.01, momentum=0.9)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01,
+                                                        momentum=0.9)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
     # execution
@@ -76,8 +78,10 @@ def runtime_main():
     exe = fluid.Executor(place)
     exe.run(startup_prog)
     dirname = "./ut_sharding_save_model"
-    sharding.utils.save_persistables(
-        exe, dirname, main_program=train_prog, filename=None)
+    sharding.utils.save_persistables(exe,
+                                     dirname,
+                                     main_program=train_prog,
+                                     filename=None)
 
     out_losses = []
     sys.stdout.buffer.write(pickle.dumps(out_losses))
@@ -85,8 +89,8 @@ def runtime_main():
 
 if __name__ == "__main__":
     #NOTE(liangjianzhong): dist unittest should be imlpement using runtime_main in test_dist_base.py
-    # but the runtime_main in test_dist_base.py use the fleet, DistributedStrategy from 
-    # paddle.fluid.incubate.fleet.collective which is not support by sharding (paddle.distributed.fleet). 
+    # but the runtime_main in test_dist_base.py uses the fleet and DistributedStrategy from
+    # paddle.fluid.incubate.fleet.collective, which is not supported by sharding (paddle.distributed.fleet).
     # this should be update in future.
     # runtime_main(TestDistMnist2x2)
     runtime_main()
diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py
index b96032b92eb98..ede62e643d2e6 100644
--- a/python/paddle/fluid/tests/unittests/dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py
@@ -77,28 +77,30 @@ def conv_net(input,
         filter_size=window_size,
         act="tanh",
         pool_type="max",
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
 
     fc_0 = fluid.layers.fc(
         input=[conv_3],
         size=fc0_dim,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
 
     prediction = fluid.layers.fc(
         input=[fc_0],
         size=class_dim,
         act="softmax",
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.01)))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
 
     return prediction
 
 
 def inference_network(dict_dim):
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
+    data = fluid.layers.data(name="words",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
     out = conv_net(data, dict_dim)
     return out
 
@@ -119,14 +121,17 @@ def get_optimizer(learning_rate):
 
 
 class TestDistTextClassification2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2):
         vocab = os.path.join(paddle.dataset.common.DATA_HOME,
                              "text_classification", "imdb.vocab")
         word_dict, dict_dim = get_worddict(vocab)
 
         # Input data
-        data = fluid.layers.data(
-            name="words", shape=[1], dtype="int64", lod_level=1)
+        data = fluid.layers.data(name="words",
+                                 shape=[1],
+                                 dtype="int64",
+                                 lod_level=1)
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
         # Train program
@@ -162,9 +167,9 @@ def tokenize(pattern):
         while tf != None:
             if bool(pattern.match(tf.name)):
                 # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip(six.b(
-                    "\n\r")).translate(
-                        None, six.b(string.punctuation)).lower().split()
+                yield tarf.extractfile(tf).read().rstrip(
+                    six.b("\n\r")).translate(None, six.b(
+                        string.punctuation)).lower().split()
             tf = tarf.next()
 
 
@@ -198,9 +203,8 @@ def train(word_idx):
     :return: Training reader creator
     :rtype: callable
     """
-    return reader_creator(
-        re.compile(r"train/pos/.*\.txt$"),
-        re.compile(r"train/neg/.*\.txt$"), word_idx)
+    return reader_creator(re.compile(r"train/pos/.*\.txt$"),
+                          re.compile(r"train/neg/.*\.txt$"), word_idx)
 
 
 def test(word_idx):
@@ -215,9 +219,8 @@ def test(word_idx):
     :return: Test reader creator
     :rtype: callable
     """
-    return reader_creator(
-        re.compile(r"test/pos/.*\.txt$"),
-        re.compile(r"test/neg/.*\.txt$"), word_idx)
+    return reader_creator(re.compile(r"test/pos/.*\.txt$"),
+                          re.compile(r"test/neg/.*\.txt$"), word_idx)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index db321f9417880..b91e43c53b963 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -193,8 +193,8 @@ def merge_cfg_from_list(cfg_list, g_cfgs):
     # encoder.
     # The actual data shape of src_slf_attn_bias is:
     # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
+    "src_slf_attn_bias":
+    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
     # The actual data shape of trg_word is:
     # [batch_size * max_trg_len_in_batch, 1]
     "trg_word": [(batch_size, seq_len, long_type(1)), "int64",
@@ -206,14 +206,14 @@ def merge_cfg_from_list(cfg_list, g_cfgs):
     # subsequent words in the decoder.
     # The actual data shape of trg_slf_attn_bias is:
     # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
+    "trg_slf_attn_bias":
+    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
     # This input is used to remove attention weights on paddings of the source
     # input in the encoder-decoder attention.
     # The actual data shape of trg_src_attn_bias is:
     # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
+    "trg_src_attn_bias":
+    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
     # This input is used in independent decoder program for inference.
     # The actual data shape of enc_output is:
     # [batch_size, max_src_len_in_batch, d_model]
@@ -234,31 +234,37 @@ def merge_cfg_from_list(cfg_list, g_cfgs):
 # Names of word embedding table which might be reused for weight sharing.
 word_emb_param_names = (
     "src_word_emb_table",
-    "trg_word_emb_table", )
+    "trg_word_emb_table",
+)
 # Names of position encoding table which will be initialized externally.
 pos_enc_param_names = (
     "src_pos_enc_table",
-    "trg_pos_enc_table", )
+    "trg_pos_enc_table",
+)
 # separated inputs for different usages.
 encoder_data_input_fields = (
     "src_word",
     "src_pos",
-    "src_slf_attn_bias", )
+    "src_slf_attn_bias",
+)
 decoder_data_input_fields = (
     "trg_word",
     "trg_pos",
     "trg_slf_attn_bias",
     "trg_src_attn_bias",
-    "enc_output", )
+    "enc_output",
+)
 label_data_input_fields = (
     "lbl_word",
-    "lbl_weight", )
+    "lbl_weight",
+)
 # In fast decoder, trg_pos (only containing the current time step) is generated
 # by ops and trg_slf_attn_bias is not needed.
 fast_decoder_data_input_fields = (
     "trg_word",
     "init_score",
-    "trg_src_attn_bias", )
+    "trg_src_attn_bias",
+)
 
 # fast_decoder_util_input_fields = (
 #     "trg_slf_attn_pre_softmax_shape_delta",
@@ -314,21 +320,22 @@ def pad_batch_data(insts,
     """
     return_list = []
     max_len = max(len(inst) for inst in insts)
-    num_token = six.moves.reduce(
-        lambda x, y: x + y,
-        [len(inst) for inst in insts]) if return_num_token else 0
+    num_token = six.moves.reduce(lambda x, y: x + y,
+                                 [len(inst)
+                                  for inst in insts]) if return_num_token else 0
     # Any token included in dict can be used to pad, since the paddings' loss
     # will be masked out by weights and make no effect on parameter gradients.
     inst_data = np.array(
         [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
     return_list += [inst_data.astype("int64").reshape([-1, 1])]
     if is_label:  # label weight
-        inst_weight = np.array(
-            [[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts])
+        inst_weight = np.array([[1.] * len(inst) + [0.] * (max_len - len(inst))
+                                for inst in insts])
         return_list += [inst_weight.astype("float32").reshape([-1, 1])]
     else:  # position data
         inst_pos = np.array([
-            list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
+            list(range(1,
+                       len(inst) + 1)) + [0] * (max_len - len(inst))
             for inst in insts
         ])
         return_list += [inst_pos.astype("int64").reshape([-1, 1])]
@@ -461,12 +468,11 @@ def test_context(test_program, avg_cost, train_exe, dev_count, data_input_names,
     strategy = fluid.ExecutionStrategy()
     strategy.num_threads = 1
 
-    test_exe = fluid.ParallelExecutor(
-        use_cuda=TrainTaskConfig.use_gpu,
-        main_program=test_program,
-        share_vars_from=train_exe,
-        build_strategy=build_strategy,
-        exec_strategy=strategy)
+    test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
+                                      main_program=test_program,
+                                      share_vars_from=train_exe,
+                                      build_strategy=build_strategy,
+                                      exec_strategy=strategy)
 
     def test(exe=test_exe):
         test_total_cost = 0
@@ -477,8 +483,7 @@ def test(exe=test_exe):
         for batch_id, data in enumerate(test_data()):
             feed_list = []
             for place_id, data_buffer in enumerate(
-                    split_data(
-                        data, num_part=dev_count)):
+                    split_data(data, num_part=dev_count)):
                 data_input_dict, _ = prepare_batch_input(
                     data_buffer, data_input_names, ModelHyperParams.eos_idx,
                     ModelHyperParams.eos_idx, ModelHyperParams.n_head,
@@ -536,12 +541,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
     strategy = fluid.ExecutionStrategy()
     strategy.num_threads = 1
 
-    train_exe = fluid.ParallelExecutor(
-        use_cuda=TrainTaskConfig.use_gpu,
-        loss_name=sum_cost.name,
-        main_program=train_progm,
-        build_strategy=build_strategy,
-        exec_strategy=strategy)
+    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
+                                       loss_name=sum_cost.name,
+                                       main_program=train_progm,
+                                       build_strategy=build_strategy,
+                                       exec_strategy=strategy)
 
     data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
                                                                              -1] + label_data_input_fields
@@ -552,10 +556,10 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
 
     # the best cross-entropy value with label smoothing
     loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
-        (1. - TrainTaskConfig.label_smooth_eps
-         )) + TrainTaskConfig.label_smooth_eps *
-                        np.log(TrainTaskConfig.label_smooth_eps / (
-                            ModelHyperParams.trg_vocab_size - 1) + 1e-20))
+        (1. - TrainTaskConfig.label_smooth_eps)) +
+                        TrainTaskConfig.label_smooth_eps *
+                        np.log(TrainTaskConfig.label_smooth_eps /
+                               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))
     init = False
     for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
         pass_start_time = time.time()
@@ -570,8 +574,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                 lr_rate = lr_scheduler.update_learning_rate()
 
             for place_id, data_buffer in enumerate(
-                    split_data(
-                        data, num_part=dev_count)):
+                    split_data(data, num_part=dev_count)):
                 data_input_dict, num_token = prepare_batch_input(
                     data_buffer, data_input_names, ModelHyperParams.eos_idx,
                     ModelHyperParams.eos_idx, ModelHyperParams.n_head,
@@ -579,9 +582,8 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                 total_num_token += num_token
                 feed_kv_pairs = list(data_input_dict.items())
                 if TrainTaskConfig.local:
-                    feed_kv_pairs += list({
-                        lr_scheduler.learning_rate.name: lr_rate
-                    }.items())
+                    feed_kv_pairs += list(
+                        {lr_scheduler.learning_rate.name: lr_rate}.items())
                 feed_list.append(dict(feed_kv_pairs))
 
                 if not init:
@@ -626,6 +628,7 @@ class SortType(object):
 
 
 class Converter(object):
+
     def __init__(self, vocab, beg, end, unk, delimiter):
         self._vocab = vocab
         self._beg = beg
@@ -641,6 +644,7 @@ def __call__(self, sentence):
 
 
 class ComposedConverter(object):
+
     def __init__(self, converters):
         self._converters = converters
 
@@ -652,6 +656,7 @@ def __call__(self, parallel_sentence):
 
 
 class SentenceBatchCreator(object):
+
     def __init__(self, batch_size):
         self.batch = []
         self._batch_size = batch_size
@@ -665,6 +670,7 @@ def append(self, info):
 
 
 class TokenBatchCreator(object):
+
     def __init__(self, batch_size):
         self.batch = []
         self.max_len = -1
@@ -684,6 +690,7 @@ def append(self, info):
 
 
 class SampleInfo(object):
+
     def __init__(self, i, max_len, min_len):
         self.i = i
         self.min_len = min_len
@@ -691,6 +698,7 @@ def __init__(self, i, max_len, min_len):
 
 
 class MinMaxFilter(object):
+
     def __init__(self, max_len, min_len, underlying_creator):
         self._min_len = min_len
         self._max_len = max_len
@@ -823,21 +831,19 @@ def __init__(self,
     def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname,
                          unk_mark):
         converters = [
-            Converter(
-                vocab=self._src_vocab,
-                beg=self._src_vocab[start_mark],
-                end=self._src_vocab[end_mark],
-                unk=self._src_vocab[unk_mark],
-                delimiter=self._token_delimiter)
+            Converter(vocab=self._src_vocab,
+                      beg=self._src_vocab[start_mark],
+                      end=self._src_vocab[end_mark],
+                      unk=self._src_vocab[unk_mark],
+                      delimiter=self._token_delimiter)
         ]
         if not self._only_src:
             converters.append(
-                Converter(
-                    vocab=self._trg_vocab,
-                    beg=self._trg_vocab[start_mark],
-                    end=self._trg_vocab[end_mark],
-                    unk=self._trg_vocab[unk_mark],
-                    delimiter=self._token_delimiter))
+                Converter(vocab=self._trg_vocab,
+                          beg=self._trg_vocab[start_mark],
+                          end=self._trg_vocab[end_mark],
+                          unk=self._trg_vocab[unk_mark],
+                          delimiter=self._token_delimiter))
 
         converters = ComposedConverter(converters)
 
@@ -865,8 +871,9 @@ def _load_lines(self, fpattern, tar_fname):
             for line in f.extractfile(tar_fname):
                 line = cpt.to_text(line)
                 fields = line.strip("\n").split(self._field_delimiter)
-                if (not self._only_src and len(fields) == 2) or (
-                        self._only_src and len(fields) == 1):
+                if (not self._only_src
+                        and len(fields) == 2) or (self._only_src
+                                                  and len(fields) == 1):
                     yield fields
         else:
             for fpath in fpaths:
@@ -877,8 +884,9 @@ def _load_lines(self, fpattern, tar_fname):
                     for line in f:
                         line = cpt.to_text(line)
                         fields = line.strip("\n").split(self._field_delimiter)
-                        if (not self._only_src and len(fields) == 2) or (
-                                self._only_src and len(fields) == 1):
+                        if (not self._only_src
+                                and len(fields) == 2) or (self._only_src
+                                                          and len(fields) == 1):
                             yield fields
 
     @staticmethod
@@ -896,8 +904,9 @@ def load_dict(dict_path, reverse=False):
     def batch_generator(self):
         # global sort or global shuffle
         if self._sort_type == SortType.GLOBAL:
-            infos = sorted(
-                self._sample_infos, key=lambda x: x.max_len, reverse=True)
+            infos = sorted(self._sample_infos,
+                           key=lambda x: x.max_len,
+                           reverse=True)
         else:
             if self._shuffle:
                 infos = self._sample_infos
@@ -1006,8 +1015,8 @@ def __split_heads(x, n_head):
         hidden_size = x.shape[-1]
         # The value 0 in shape attr means copying the corresponding dimension
         # size of the input as the output dimension size.
-        reshaped = layers.reshape(
-            x=x, shape=[0, 0, n_head, hidden_size // n_head])
+        reshaped = layers.reshape(x=x,
+                                  shape=[0, 0, n_head, hidden_size // n_head])
 
         # permute the dimensions into:
         # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
@@ -1039,11 +1048,10 @@ def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
             product += attn_bias
         weights = layers.softmax(product)
         if dropout_rate:
-            weights = layers.dropout(
-                weights,
-                dropout_prob=dropout_rate,
-                seed=ModelHyperParams.dropout_seed,
-                is_test=False)
+            weights = layers.dropout(weights,
+                                     dropout_prob=dropout_rate,
+                                     seed=ModelHyperParams.dropout_seed,
+                                     is_test=False)
         out = layers.matmul(weights, v)
         return out
 
@@ -1102,18 +1110,16 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
         if cmd == "a":  # add residual connection
             out = out + prev_out if prev_out else out
         elif cmd == "n":  # add layer normalization
-            out = layers.layer_norm(
-                out,
-                begin_norm_axis=len(out.shape) - 1,
-                param_attr=fluid.initializer.Constant(1.),
-                bias_attr=fluid.initializer.Constant(0.))
+            out = layers.layer_norm(out,
+                                    begin_norm_axis=len(out.shape) - 1,
+                                    param_attr=fluid.initializer.Constant(1.),
+                                    bias_attr=fluid.initializer.Constant(0.))
         elif cmd == "d":  # add dropout
             if dropout_rate:
-                out = layers.dropout(
-                    out,
-                    dropout_prob=dropout_rate,
-                    seed=ModelHyperParams.dropout_seed,
-                    is_test=False)
+                out = layers.dropout(out,
+                                     dropout_prob=dropout_rate,
+                                     seed=ModelHyperParams.dropout_seed,
+                                     is_test=False)
     return out
 
 
@@ -1145,9 +1151,9 @@ def prepare_encoder(src_word,
         src_word_emb = layers.embedding(
             src_word,
             size=[src_vocab_size, src_emb_dim],
-            param_attr=fluid.ParamAttr(
-                name=word_emb_param_name,
-                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+            param_attr=fluid.ParamAttr(name=word_emb_param_name,
+                                       initializer=fluid.initializer.Normal(
+                                           0., src_emb_dim**-0.5)))
 
     src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
     src_pos_enc = layers.embedding(
@@ -1159,17 +1165,16 @@ def prepare_encoder(src_word,
             initializer=fluid.initializer.ConstantInitializer(0.001)))
     src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
-    return layers.dropout(
-        enc_input,
-        dropout_prob=dropout_rate,
-        seed=ModelHyperParams.dropout_seed,
-        is_test=False) if dropout_rate else enc_input
+    return layers.dropout(enc_input,
+                          dropout_prob=dropout_rate,
+                          seed=ModelHyperParams.dropout_seed,
+                          is_test=False) if dropout_rate else enc_input
 
 
-prepare_encoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
-prepare_decoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
+prepare_encoder = partial(prepare_encoder,
+                          pos_enc_param_name=pos_enc_param_names[0])
+prepare_decoder = partial(prepare_encoder,
+                          pos_enc_param_name=pos_enc_param_names[1])
 
 
 def encoder_layer(enc_input,
@@ -1240,12 +1245,14 @@ def decoder_layer(dec_input,
         d_model,
         n_head,
         dropout_rate,
-        cache, )
+        cache,
+    )
     slf_attn_output = post_process_layer(
         dec_input,
         slf_attn_output,
         "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
+        dropout_rate,
+    )
     enc_attn_output = multi_head_attention(
         slf_attn_output,
         enc_output,
@@ -1255,21 +1262,25 @@ def decoder_layer(dec_input,
         d_value,
         d_model,
         n_head,
-        dropout_rate, )
+        dropout_rate,
+    )
     enc_attn_output = post_process_layer(
         slf_attn_output,
         enc_attn_output,
         "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
+        dropout_rate,
+    )
     ffd_output = positionwise_feed_forward(
         enc_attn_output,
         d_inner_hid,
-        d_model, )
+        d_model,
+    )
     dec_output = post_process_layer(
         enc_attn_output,
         ffd_output,
         "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
+        dropout_rate,
+    )
     return dec_output
 
 
@@ -1293,18 +1304,17 @@ def decoder(dec_input,
         if caches is not None:
             cache = caches[i]
 
-        dec_output = decoder_layer(
-            dec_input,
-            enc_output,
-            dec_slf_attn_bias,
-            dec_enc_attn_bias,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            dropout_rate,
-            cache=cache)
+        dec_output = decoder_layer(dec_input,
+                                   enc_output,
+                                   dec_slf_attn_bias,
+                                   dec_enc_attn_bias,
+                                   n_head,
+                                   d_key,
+                                   d_value,
+                                   d_model,
+                                   d_inner_hid,
+                                   dropout_rate,
+                                   cache=cache)
         dec_input = dec_output
     return dec_output
 
@@ -1315,30 +1325,30 @@ def make_all_inputs(input_fields):
     """
     inputs = []
     for input_field in input_fields:
-        input_var = layers.data(
-            name=input_field,
-            shape=input_descs[input_field][0],
-            dtype=input_descs[input_field][1],
-            lod_level=input_descs[input_field][2]
-            if len(input_descs[input_field]) == 3 else 0,
-            append_batch_size=False)
+        input_var = layers.data(name=input_field,
+                                shape=input_descs[input_field][0],
+                                dtype=input_descs[input_field][1],
+                                lod_level=input_descs[input_field][2]
+                                if len(input_descs[input_field]) == 3 else 0,
+                                append_batch_size=False)
         inputs.append(input_var)
     return inputs
 
 
 def transformer(
-        src_vocab_size,
-        trg_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        weight_sharing,
-        label_smooth_eps, ):
+    src_vocab_size,
+    trg_vocab_size,
+    max_length,
+    n_layer,
+    n_head,
+    d_key,
+    d_value,
+    d_model,
+    d_inner_hid,
+    dropout_rate,
+    weight_sharing,
+    label_smooth_eps,
+):
     if weight_sharing:
         assert src_vocab_size == src_vocab_size, (
             "Vocabularies in source and target should be same for weight sharing."
@@ -1356,7 +1366,8 @@ def transformer(
         d_inner_hid,
         dropout_rate,
         weight_sharing,
-        enc_inputs, )
+        enc_inputs,
+    )
 
     dec_inputs = make_all_inputs(decoder_data_input_fields[:-1])
 
@@ -1372,20 +1383,19 @@ def transformer(
         dropout_rate,
         weight_sharing,
         dec_inputs,
-        enc_output, )
+        enc_output,
+    )
 
     # Padding index do not contribute to the total loss. The weights is used to
     # cancel padding index in calculating the loss.
     label, weights = make_all_inputs(label_data_input_fields)
     if label_smooth_eps:
-        label = layers.label_smooth(
-            label=layers.one_hot(
-                input=label, depth=trg_vocab_size),
-            epsilon=label_smooth_eps)
+        label = layers.label_smooth(label=layers.one_hot(input=label,
+                                                         depth=trg_vocab_size),
+                                    epsilon=label_smooth_eps)
 
     cost = layers.softmax_with_cross_entropy(
-        logits=layers.reshape(
-            predict, shape=[-1, trg_vocab_size]),
+        logits=layers.reshape(predict, shape=[-1, trg_vocab_size]),
         label=label,
         soft_label=True if label_smooth_eps else False)
     weighted_cost = cost * weights
@@ -1417,14 +1427,13 @@ def wrap_encoder(src_vocab_size,
     else:
         src_word, src_pos, src_slf_attn_bias = \
             enc_inputs
-    enc_input = prepare_encoder(
-        src_word,
-        src_pos,
-        src_vocab_size,
-        d_model,
-        max_length,
-        dropout_rate,
-        word_emb_param_name=word_emb_param_names[0])
+    enc_input = prepare_encoder(src_word,
+                                src_pos,
+                                src_vocab_size,
+                                d_model,
+                                max_length,
+                                dropout_rate,
+                                word_emb_param_name=word_emb_param_names[0])
     enc_output = encoder(enc_input, src_slf_attn_bias, n_layer, n_head, d_key,
                          d_value, d_model, d_inner_hid, dropout_rate)
     return enc_output
@@ -1454,34 +1463,32 @@ def wrap_decoder(trg_vocab_size,
     else:
         trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
 
-    dec_input = prepare_decoder(
-        trg_word,
-        trg_pos,
-        trg_vocab_size,
-        d_model,
-        max_length,
-        dropout_rate,
-        word_emb_param_name=word_emb_param_names[0]
-        if weight_sharing else word_emb_param_names[1])
-    dec_output = decoder(
-        dec_input,
-        enc_output,
-        trg_slf_attn_bias,
-        trg_src_attn_bias,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        caches=caches)
+    dec_input = prepare_decoder(trg_word,
+                                trg_pos,
+                                trg_vocab_size,
+                                d_model,
+                                max_length,
+                                dropout_rate,
+                                word_emb_param_name=word_emb_param_names[0]
+                                if weight_sharing else word_emb_param_names[1])
+    dec_output = decoder(dec_input,
+                         enc_output,
+                         trg_slf_attn_bias,
+                         trg_src_attn_bias,
+                         n_layer,
+                         n_head,
+                         d_key,
+                         d_value,
+                         d_model,
+                         d_inner_hid,
+                         dropout_rate,
+                         caches=caches)
     # Return logits for training and probs for inference.
     if weight_sharing:
-        predict = layers.matmul(
-            x=dec_output,
-            y=fluid.framework._get_var(word_emb_param_names[0]),
-            transpose_y=True)
+        predict = layers.matmul(x=dec_output,
+                                y=fluid.framework._get_var(
+                                    word_emb_param_names[0]),
+                                transpose_y=True)
     else:
         predict = layers.fc(input=dec_output,
                             size=trg_vocab_size,
@@ -1494,20 +1501,21 @@ def wrap_decoder(trg_vocab_size,
 
 
 def fast_decode(
-        src_vocab_size,
-        trg_vocab_size,
-        max_in_len,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        weight_sharing,
-        beam_size,
-        max_out_len,
-        eos_idx, ):
+    src_vocab_size,
+    trg_vocab_size,
+    max_in_len,
+    n_layer,
+    n_head,
+    d_key,
+    d_value,
+    d_model,
+    d_inner_hid,
+    dropout_rate,
+    weight_sharing,
+    beam_size,
+    max_out_len,
+    eos_idx,
+):
     """
     Use beam search to decode. Caches will be used to store states of history
     steps which can make the decoding faster.
@@ -1519,30 +1527,32 @@ def fast_decode(
         make_all_inputs(fast_decoder_data_input_fields )
 
     def beam_search():
-        max_len = layers.fill_constant(
-            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
-        step_idx = layers.fill_constant(
-            shape=[1], dtype=start_tokens.dtype, value=0)
+        max_len = layers.fill_constant(shape=[1],
+                                       dtype=start_tokens.dtype,
+                                       value=max_out_len)
+        step_idx = layers.fill_constant(shape=[1],
+                                        dtype=start_tokens.dtype,
+                                        value=0)
         cond = layers.less_than(x=step_idx, y=max_len)
         while_op = layers.While(cond)
         # array states will be stored for each step.
-        ids = layers.array_write(
-            layers.reshape(start_tokens, (-1, 1)), step_idx)
+        ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
+                                 step_idx)
         scores = layers.array_write(init_scores, step_idx)
         # cell states will be overwrited at each step.
         # caches contains states of history steps to reduce redundant
         # computation in decoder.
         caches = [{
-            "k": layers.fill_constant_batch_size_like(
-                input=start_tokens,
-                shape=[-1, 0, d_model],
-                dtype=enc_output.dtype,
-                value=0),
-            "v": layers.fill_constant_batch_size_like(
-                input=start_tokens,
-                shape=[-1, 0, d_model],
-                dtype=enc_output.dtype,
-                value=0)
+            "k":
+            layers.fill_constant_batch_size_like(input=start_tokens,
+                                                 shape=[-1, 0, d_model],
+                                                 dtype=enc_output.dtype,
+                                                 value=0),
+            "v":
+            layers.fill_constant_batch_size_like(input=start_tokens,
+                                                 shape=[-1, 0, d_model],
+                                                 dtype=enc_output.dtype,
+                                                 value=0)
         } for i in range(n_layer)]
         with while_op.block():
             pre_ids = layers.array_read(array=ids, i=step_idx)
@@ -1550,47 +1560,46 @@ def beam_search():
             pre_scores = layers.array_read(array=scores, i=step_idx)
             # sequence_expand can gather sequences according to lod thus can be
             # used in beam search to sift states corresponding to selected ids.
-            pre_src_attn_bias = layers.sequence_expand(
-                x=trg_src_attn_bias, y=pre_scores)
+            pre_src_attn_bias = layers.sequence_expand(x=trg_src_attn_bias,
+                                                       y=pre_scores)
             pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
             pre_caches = [{
-                "k": layers.sequence_expand(
-                    x=cache["k"], y=pre_scores),
-                "v": layers.sequence_expand(
-                    x=cache["v"], y=pre_scores),
+                "k":
+                layers.sequence_expand(x=cache["k"], y=pre_scores),
+                "v":
+                layers.sequence_expand(x=cache["v"], y=pre_scores),
             } for cache in caches]
             pre_pos = layers.elementwise_mul(
                 x=layers.fill_constant_batch_size_like(
-                    input=pre_enc_output,  # can't use pre_ids here since it has lod
+                    input=
+                    pre_enc_output,  # can't use pre_ids here since it has lod
                     value=1,
                     shape=[-1, 1, 1],
                     dtype=pre_ids.dtype),
-                y=layers.increment(
-                    x=step_idx, value=1.0, in_place=False),
+                y=layers.increment(x=step_idx, value=1.0, in_place=False),
                 axis=0)
-            logits = wrap_decoder(
-                trg_vocab_size,
-                max_in_len,
-                n_layer,
-                n_head,
-                d_key,
-                d_value,
-                d_model,
-                d_inner_hid,
-                dropout_rate,
-                weight_sharing,
-                dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
-                enc_output=pre_enc_output,
-                caches=pre_caches)
+            logits = wrap_decoder(trg_vocab_size,
+                                  max_in_len,
+                                  n_layer,
+                                  n_head,
+                                  d_key,
+                                  d_value,
+                                  d_model,
+                                  d_inner_hid,
+                                  dropout_rate,
+                                  weight_sharing,
+                                  dec_inputs=(pre_ids, pre_pos, None,
+                                              pre_src_attn_bias),
+                                  enc_output=pre_enc_output,
+                                  caches=pre_caches)
             logits = layers.reshape(logits, (-1, trg_vocab_size))
 
             topk_scores, topk_indices = layers.topk(
                 input=layers.softmax(logits), k=beam_size)
-            accu_scores = layers.elementwise_add(
-                x=layers.log(topk_scores),
-                y=layers.reshape(
-                    pre_scores, shape=[-1]),
-                axis=0)
+            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
+                                                 y=layers.reshape(pre_scores,
+                                                                  shape=[-1]),
+                                                 axis=0)
             # beam_search op uses lod to distinguish branches.
             topk_indices = layers.lod_reset(topk_indices, pre_ids)
             selected_ids, selected_scores = layers.beam_search(
@@ -1653,11 +1662,10 @@ def get_model(is_dist, is_async):
          .noam_decay(ModelHyperParams.d_model,
             TrainTaskConfig.warmup_steps)
 
-        optimizer = fluid.optimizer.Adam(
-            learning_rate=lr_decay,
-            beta1=TrainTaskConfig.beta1,
-            beta2=TrainTaskConfig.beta2,
-            epsilon=TrainTaskConfig.eps)
+        optimizer = fluid.optimizer.Adam(learning_rate=lr_decay,
+                                         beta1=TrainTaskConfig.beta1,
+                                         beta2=TrainTaskConfig.beta2,
+                                         epsilon=TrainTaskConfig.eps)
         optimizer.minimize(sum_cost)
 
     return sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program
@@ -1667,7 +1675,8 @@ def update_args():
     src_dict = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath)
     trg_dict = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath)
     dict_args = [
-        "src_vocab_size", str(len(src_dict)), "trg_vocab_size",
+        "src_vocab_size",
+        str(len(src_dict)), "trg_vocab_size",
         str(len(trg_dict)), "bos_idx",
         str(src_dict[TrainTaskConfig.special_token[0]]), "eos_idx",
         str(src_dict[TrainTaskConfig.special_token[1]]), "unk_idx",
@@ -1677,11 +1686,11 @@ def update_args():
 
 
 class DistTransformer2x2(TestDistRunnerBase):
+
     def run_pserver(self, args):
         get_model(True, not args.sync_mode)
-        t = self.get_transpiler(args.trainer_id,
-                                fluid.default_main_program(), args.endpoints,
-                                args.trainers, args.sync_mode)
+        t = self.get_transpiler(args.trainer_id, fluid.default_main_program(),
+                                args.endpoints, args.trainers, args.sync_mode)
         pserver_prog = t.get_pserver_program(args.current_endpoint)
         startup_prog = t.get_startup_program(args.current_endpoint,
                                              pserver_prog)
diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py
index 835306edd0f17..744a6d6729a71 100644
--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
@@ -39,6 +39,7 @@
 
 
 class TestDistWord2vec2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2):
         BATCH_SIZE = batch_size
 
@@ -91,8 +92,8 @@ def __network__(words):
                 act='softmax',
                 param_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.Constant(value=0.1)))
-            cost = fluid.layers.cross_entropy(
-                input=predict_word, label=words[4])
+            cost = fluid.layers.cross_entropy(input=predict_word,
+                                              label=words[4])
             avg_cost = fluid.layers.mean(cost)
             return avg_cost, predict_word
 
@@ -100,8 +101,9 @@ def __network__(words):
         dict_size = len(word_dict)
 
         first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-        second_word = fluid.layers.data(
-            name='secondw', shape=[1], dtype='int64')
+        second_word = fluid.layers.data(name='secondw',
+                                        shape=[1],
+                                        dtype='int64')
         third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
         forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
         next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
@@ -113,10 +115,10 @@ def __network__(words):
         sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
         sgd_optimizer.minimize(avg_cost)
 
-        train_reader = paddle.batch(
-            paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-        test_reader = paddle.batch(
-            paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
+        train_reader = paddle.batch(paddle.dataset.imikolov.train(word_dict, N),
+                                    BATCH_SIZE)
+        test_reader = paddle.batch(paddle.dataset.imikolov.test(word_dict, N),
+                                   BATCH_SIZE)
 
         return inference_program, avg_cost, train_reader, test_reader, None, predict_word
 
diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
index 0af7d40a2f02e..ee2b180586dd2 100644
--- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -76,6 +76,7 @@ def prune_fwd_bwd_ops(program, start_idx):
 
 
 class GradClipDecorator(ClipGradBase):
+
     def __init__(self, clip, clip_after_allreduce):
         self.clip = clip
         self.clip_after_allreduce = clip_after_allreduce
@@ -91,17 +92,17 @@ def _insert_allreduce_ops(self, params_grads):
         scale = 1.0 / world_size
         # scale = 1.0
         for p, g in params_grads:
-            block.append_op(
-                type='c_allreduce_sum',
-                inputs={'X': [g]},
-                outputs={'Out': [g]},
-                attrs={'ring_id': 0,
-                       'use_calc_stream': True})
-            block.append_op(
-                type='scale',
-                inputs={'X': [g]},
-                outputs={'Out': [g]},
-                attrs={'scale': scale})
+            block.append_op(type='c_allreduce_sum',
+                            inputs={'X': [g]},
+                            outputs={'Out': [g]},
+                            attrs={
+                                'ring_id': 0,
+                                'use_calc_stream': True
+                            })
+            block.append_op(type='scale',
+                            inputs={'X': [g]},
+                            outputs={'Out': [g]},
+                            attrs={'scale': scale})
 
     def _static_clip(self, params_grads):
         if self.clip_after_allreduce:
@@ -114,6 +115,7 @@ def _static_clip(self, params_grads):
 
 
 class IdentityGradClip(ClipGradBase):
+
     def _dygraph_clip(self, params_grads):
         return params_grads
 
@@ -130,12 +132,12 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs):
     with paddle.static.program_guard(main, startup):
         with paddle.fluid.unique_name.guard():
             with paddle.static.amp.fp16_guard():
-                image = paddle.static.data(
-                    name='image',
-                    shape=[None, 3, 224, 224],
-                    dtype=paddle.float32)
-                label = paddle.static.data(
-                    name='label', shape=[None, 1], dtype=paddle.int64)
+                image = paddle.static.data(name='image',
+                                           shape=[None, 3, 224, 224],
+                                           dtype=paddle.float32)
+                label = paddle.static.data(name='label',
+                                           shape=[None, 1],
+                                           dtype=paddle.int64)
                 model = resnet()
                 pred = model(image)
                 loss_fn = paddle.nn.loss.CrossEntropyLoss()
@@ -222,8 +224,8 @@ def gen_random_grad_tensor(grad):
 
     def reader():
         for _ in range(6):
-            yield dict(
-                [(grad.name, gen_random_grad_tensor(grad)) for grad in grads])
+            yield dict([(grad.name, gen_random_grad_tensor(grad))
+                        for grad in grads])
 
     scope = paddle.static.Scope()
     fetch_list = params
@@ -253,6 +255,7 @@ def reader():
 
 
 class TestDistributedFusedLamb(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         if not paddle.is_compiled_with_cuda():
@@ -265,16 +268,18 @@ def setUpClass(cls):
 
     def config(self):
         clip_after_allreduce = bool(
-            distutils.util.strtobool(
-                os.getenv('CLIP_AFTER_ALLREDUCE', 'True')))
+            distutils.util.strtobool(os.getenv('CLIP_AFTER_ALLREDUCE', 'True')))
         max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0))
         gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1))
         print('clip_after_allreduce = {}, max_global_norm = {}'.format(
             clip_after_allreduce, max_global_norm))
         return {
-            'clip_after_allreduce': clip_after_allreduce,
-            'gradient_accumulation_steps': gm_steps,
-            'grad_clip': paddle.nn.ClipGradByGlobalNorm(max_global_norm)
+            'clip_after_allreduce':
+            clip_after_allreduce,
+            'gradient_accumulation_steps':
+            gm_steps,
+            'grad_clip':
+            paddle.nn.ClipGradByGlobalNorm(max_global_norm)
             if max_global_norm > 0 else None,
         }
 
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt
index 764a862d30f55..c68cebaa25b22 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt
@@ -1,25 +1,30 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
-    list(REMOVE_ITEM TEST_OPS "test_dist_fuse_adam_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fuse_all_reduce_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_act_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_add_act_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fuse_momentum_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass")
-    list(REMOVE_ITEM TEST_OPS "test_dist_inplace_addto_pass")
-    list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass")
-    list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass")
-    list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass")
-    list(REMOVE_ITEM TEST_OPS "test_auto_parallel_fp16_pass")
+if((NOT WITH_GPU)
+   AND (NOT WITH_XPU)
+   AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_adam_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_all_reduce_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_act_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_add_act_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_momentum_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass")
+  list(REMOVE_ITEM TEST_OPS "test_dist_inplace_addto_pass")
+  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass")
+  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass")
+  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass")
+  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_fp16_pass")
 endif()
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-    list(APPEND DIST_TEST_OPS ${TEST_OP})
-    set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120)
-    set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  list(APPEND DIST_TEST_OPS ${TEST_OP})
+  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120)
+  set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
 endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py
index e024ef1d5d190..63abdeef5950f 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,12 +26,14 @@
 
 import paddle.distributed.fleet as fleet
 import paddle.distributed.auto_parallel as auto
+
 sys.path.append("..")
 import auto_parallel_gpt_model as modeling
 from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion
 
 
 class AutoPallelPassTestBase(DistPassTestBase):
+
     def setUp(self):
         paddle.enable_static()
         seed = int(os.environ.get('SEED', -1))
@@ -62,10 +64,14 @@ def apply_no_passes(self):
         fleet.init(is_collective=True, strategy=dist_strategy)
 
     def check_main(self, gpus=None, **kwargs):
-        no_pass_rets = self._distributed_launch(
-            model=None, apply_pass=False, gpus=gpus, **kwargs)
-        pass_rets = self._distributed_launch(
-            model=None, apply_pass=True, gpus=gpus, **kwargs)
+        no_pass_rets = self._distributed_launch(model=None,
+                                                apply_pass=False,
+                                                gpus=gpus,
+                                                **kwargs)
+        pass_rets = self._distributed_launch(model=None,
+                                             apply_pass=True,
+                                             gpus=gpus,
+                                             **kwargs)
         self.check_results(no_pass_rets, pass_rets)
 
     def _run_gpu_main(self, model, apply_pass, dump_file, **kwargs):
@@ -113,72 +119,71 @@ def get_gpt_model(self, strategy, place, batch_size, sequence_len,
         else:
             raise ValueError("'get_gpt_model' only support dp and mp.")
 
-        tokens = paddle.static.data(
-            name="tokens", shape=[batch_size, sequence_len], dtype='int64')
-        position_ids = paddle.static.data(
-            name="position_ids",
-            shape=[batch_size, sequence_len],
-            dtype='int64')
+        tokens = paddle.static.data(name="tokens",
+                                    shape=[batch_size, sequence_len],
+                                    dtype='int64')
+        position_ids = paddle.static.data(name="position_ids",
+                                          shape=[batch_size, sequence_len],
+                                          dtype='int64')
         attention_mask = paddle.static.data(
             name="attention_mask",
             shape=[batch_size, 1, sequence_len, sequence_len],
             dtype='float32')
-        labels = paddle.static.data(
-            name="labels", shape=[batch_size, sequence_len], dtype='int64')
-        loss_mask = paddle.static.data(
-            name="loss_mask", shape=[batch_size, sequence_len], dtype='float32')
+        labels = paddle.static.data(name="labels",
+                                    shape=[batch_size, sequence_len],
+                                    dtype='int64')
+        loss_mask = paddle.static.data(name="loss_mask",
+                                       shape=[batch_size, sequence_len],
+                                       dtype='float32')
         data_holder = [tokens, position_ids, attention_mask, labels, loss_mask]
 
         if modeling._global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                tokens,
-                dist_attr={
-                    "process_mesh": modeling._global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(tokens,
+                              dist_attr={
+                                  "process_mesh": modeling._global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif modeling._global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                tokens,
-                dist_attr={
-                    "process_mesh": modeling.PP_MESH_LIST[0],
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                attention_mask,
-                dist_attr={
-                    "process_mesh": modeling.PP_MESH_LIST[0],
-                    "dims_mapping": [-1, -1, -1, -1]
-                })
-
-        gpt = GPTModel(
-            vocab_size=1000,
-            hidden_size=64,
-            num_hidden_layers=2,
-            num_attention_heads=8,
-            intermediate_size=256,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.0,
-            attention_probs_dropout_prob=0.0,
-            max_position_embeddings=1024,
-            type_vocab_size=1,
-            initializer_range=0.02,
-            pad_token_id=0,
-            eos_token_id=7,
-            bos_token_id=0,
-            eol_token_id=3)
-
-        model = GPTForPretraining(
-            gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02)
+            auto.shard_tensor(tokens,
+                              dist_attr={
+                                  "process_mesh": modeling.PP_MESH_LIST[0],
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(attention_mask,
+                              dist_attr={
+                                  "process_mesh": modeling.PP_MESH_LIST[0],
+                                  "dims_mapping": [-1, -1, -1, -1]
+                              })
+
+        gpt = GPTModel(vocab_size=1000,
+                       hidden_size=64,
+                       num_hidden_layers=2,
+                       num_attention_heads=8,
+                       intermediate_size=256,
+                       hidden_act="gelu",
+                       hidden_dropout_prob=0.0,
+                       attention_probs_dropout_prob=0.0,
+                       max_position_embeddings=1024,
+                       type_vocab_size=1,
+                       initializer_range=0.02,
+                       pad_token_id=0,
+                       eos_token_id=7,
+                       bos_token_id=0,
+                       eol_token_id=3)
+
+        model = GPTForPretraining(gpt,
+                                  vocab_size=1000,
+                                  hidden_size=64,
+                                  initializer_range=0.02)
         preds = model(tokens, position_ids, attention_mask)
         criterion = GPTPretrainingCriterion()
         loss = criterion(preds, labels, loss_mask)
         clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
-        optimizer = paddle.fluid.optimizer.AdamOptimizer(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=clip)
+        optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                         beta1=0.9,
+                                                         beta2=0.999,
+                                                         epsilon=1e-08,
+                                                         grad_clip=clip)
         optimizer = fleet.distributed_optimizer(optimizer)
         startup_program = paddle.static.default_startup_program()
         _, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
@@ -194,13 +199,11 @@ def gen_data():
                 loss_mask = []
                 for _ in range(batch_size):
                     tokens.append(
-                        np.random.randint(
-                            vocab_size, size=sequence_len))
+                        np.random.randint(vocab_size, size=sequence_len))
                     position_ids.append(np.arange(sequence_len))
                     attention_mask.append([np.tril(np.ones(sequence_len))])
                     labels.append(
-                        np.random.randint(
-                            vocab_size, size=sequence_len))
+                        np.random.randint(vocab_size, size=sequence_len))
                     loss_mask.append(np.ones(sequence_len))
 
                 yield tokens, position_ids, attention_mask, labels, loss_mask
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/check_pass_conflict_example.py b/python/paddle/fluid/tests/unittests/distributed_passes/check_pass_conflict_example.py
index fc0582f7aacdd..ffb8ea8e381c3 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/check_pass_conflict_example.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/check_pass_conflict_example.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class CheckPassConflictTest1(PassConflictChecker):
+
     def pass_config(self):
         return [
             new_pass("fuse_all_reduce", {"max_memory_size": 1024 * 1024}),
@@ -30,6 +31,7 @@ def test_resnet(self):
 
 
 class CheckPassConflictTest2(PassConflictChecker):
+
     def pass_config(self):
         return [
             new_pass("fuse_elewise_add_act"),
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py
index 786ee06487fbc..f13439575c9b1 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -57,6 +57,7 @@ def remove_path_if_exists(path):
 
 # NOTE: only support GPU now
 class DistPassTestBase(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         if paddle.is_compiled_with_cuda():
@@ -85,10 +86,14 @@ def apply_passes(self, main_prog, startup_prog):
         raise NotImplementedError()
 
     def check_main(self, model=None, gpus=None, **kwargs):
-        pass_rets = self._distributed_launch(
-            model=model, apply_pass=True, gpus=gpus, **kwargs)
-        no_pass_rets = self._distributed_launch(
-            model=model, apply_pass=False, gpus=gpus, **kwargs)
+        pass_rets = self._distributed_launch(model=model,
+                                             apply_pass=True,
+                                             gpus=gpus,
+                                             **kwargs)
+        no_pass_rets = self._distributed_launch(model=model,
+                                                apply_pass=False,
+                                                gpus=gpus,
+                                                **kwargs)
         self.check_results(no_pass_rets, pass_rets)
 
     def check_results(self, no_pass_rets, pass_rets):
@@ -101,12 +106,11 @@ def check_results(self, no_pass_rets, pass_rets):
                     self.assertTrue(out_var_pass is None)
                 else:
                     self.assertTrue(
-                        np.allclose(
-                            out_var_no_pass,
-                            out_var_pass,
-                            rtol=self.rtol,
-                            atol=self.atol,
-                            equal_nan=self.equal_nan))
+                        np.allclose(out_var_no_pass,
+                                    out_var_pass,
+                                    rtol=self.rtol,
+                                    atol=self.atol,
+                                    equal_nan=self.equal_nan))
 
     @classmethod
     def _to_var_names(cls, names_or_vars):
@@ -238,8 +242,8 @@ def _distributed_launch(self, model, apply_pass, gpus=None, **kwargs):
                 dump_file = '{0}/{1}.bin'.format(output_dir, i)
                 self.assertTrue(
                     os.path.exists(dump_file),
-                    "Pass test failed with apply_pass = {}, please view log in {}".
-                    format(apply_pass, output_dir))
+                    "Pass test failed with apply_pass = {}, please view log in {}"
+                    .format(apply_pass, output_dir))
                 with open(dump_file, "rb") as f:
                     results.append(pickle.load(f))
             return results
@@ -249,6 +253,7 @@ def _distributed_launch(self, model, apply_pass, gpus=None, **kwargs):
 
 
 class PassConflictChecker(DistPassTestBase):
+
     def setUp(self):
         os.environ['DEBUG'] = '1'  # to save the debug directory
         super(PassConflictChecker, self).setUp()
@@ -266,16 +271,14 @@ def apply_passes(self, main_prog, startup_prog):
         auto_pass_manager = PassManager(passes, auto_solve_conflict=True)
         new_passes = auto_pass_manager.passes
         self.assertEqual(
-            len(passes),
-            len(new_passes),
+            len(passes), len(new_passes),
             "After solving conflicts, the left passes are: {}".format(
                 auto_pass_manager.names))
 
         for i, (p1, p2) in enumerate(zip(passes, new_passes)):
             self.assertEqual(
-                id(p1),
-                id(p2),
-                "After solving conflicts, the {}-th pass is different: {} vs {}".
-                format(i, p1.name, p2.name))
+                id(p1), id(p2),
+                "After solving conflicts, the {}-th pass is different: {} vs {}"
+                .format(i, p1.name, p2.name))
 
         auto_pass_manager.apply([main_prog], [startup_prog])
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/launch.py b/python/paddle/fluid/tests/unittests/distributed_passes/launch.py
index c225fe85cd844..ee9ff484523d3 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/launch.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/launch.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py
index 7eebee47e59a8..9a48d117bb128 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,19 +19,25 @@
 import numpy as np
 import paddle.nn as nn
 
-__all__ = ['resnet_model', ]
+__all__ = [
+    'resnet_model',
+]
 
 
 def get_seed_from_env():
     return int(os.environ.get("SEED", 0))
 
 
-def resnet_model(place, batch_size, image_shape=[3, 224, 224],
+def resnet_model(place,
+                 batch_size,
+                 image_shape=[3, 224, 224],
                  num_classes=1000):
-    image = paddle.static.data(
-        shape=[batch_size] + image_shape, dtype='float32', name='image')
-    label = paddle.static.data(
-        shape=[batch_size, 1], dtype='int64', name='label')
+    image = paddle.static.data(shape=[batch_size] + image_shape,
+                               dtype='float32',
+                               name='image')
+    label = paddle.static.data(shape=[batch_size, 1],
+                               dtype='int64',
+                               name='label')
     model = resnet(pretrained=False)
     loss_fn = nn.loss.CrossEntropyLoss()
     pred_out = model(image)
@@ -52,8 +58,9 @@ def reader():
         np.random.seed(seed + rank)
         for _ in range(10):
             image_np = np.random.random(size=image.shape).astype('float32')
-            label_np = np.random.randint(
-                low=0, high=num_classes, size=label.shape).astype('int64')
+            label_np = np.random.randint(low=0,
+                                         high=num_classes,
+                                         size=label.shape).astype('int64')
             yield image_np, label_np
 
     main_program = paddle.static.default_main_program()
@@ -62,10 +69,12 @@ def reader():
 
 
 def simple_net(place, batch_size, image_shape=[784], num_classes=10):
-    image = paddle.static.data(
-        shape=[batch_size] + image_shape, dtype='float32', name='image')
-    label = paddle.static.data(
-        shape=[batch_size, 1], dtype='int64', name='label')
+    image = paddle.static.data(shape=[batch_size] + image_shape,
+                               dtype='float32',
+                               name='image')
+    label = paddle.static.data(shape=[batch_size, 1],
+                               dtype='int64',
+                               name='label')
     linears = [nn.Linear(784, 784) for _ in range(3)]
     hidden = image
     for linear in linears:
@@ -89,8 +98,9 @@ def reader():
         np.random.seed(seed + rank)
         for _ in range(10):
             image_np = np.random.random(size=image.shape).astype('float32')
-            label_np = np.random.randint(
-                low=0, high=num_classes, size=label.shape).astype('int64')
+            label_np = np.random.randint(low=0,
+                                         high=num_classes,
+                                         size=label.shape).astype('int64')
             yield image_np, label_np
 
     main_program = paddle.static.default_main_program()
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/pass_run_main.py b/python/paddle/fluid/tests/unittests/distributed_passes/pass_run_main.py
index 1dad8796a61ba..95c670ce90969 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/pass_run_main.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/pass_run_main.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,13 +29,13 @@ def parse_args():
     parser.add_argument(
         '--class_name',
         type=str,
-        help='The test class name. It is the class name that inherits the DistPassTestBase class.'
+        help=
+        'The test class name. It is the class name that inherits the DistPassTestBase class.'
     )
-    parser.add_argument(
-        '--apply_pass',
-        default=False,
-        action="store_true",
-        help='Whether to apply distributed passes.')
+    parser.add_argument('--apply_pass',
+                        default=False,
+                        action="store_true",
+                        help='Whether to apply distributed passes.')
     parser.add_argument(
         '--input_file',
         type=str,
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py
index 93a0044a5e43c..beddb79fd9ea3 100755
--- a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -28,6 +28,7 @@
 
 
 class PsPassTestBase(unittest.TestCase):
+
     def init(self):
         self.config = {}
         self.config['ps_mode_config'] = ""
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py
index 0507909b132e1..4585fe997cd45 100755
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_amp_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class TestAMPPass(AutoPallelPassTestBase):
+
     def init(self):
         if paddle.is_compiled_with_cuda():
             paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
@@ -51,8 +52,10 @@ def apply_passes(self):
         fleet.init(is_collective=True, strategy=dist_strategy)
 
     def test_bs_8(self):
-        self.check_main(
-            gpus=[0, 1], batch_size=8, sequence_len=512, vocab_size=1000)
+        self.check_main(gpus=[0, 1],
+                        batch_size=8,
+                        sequence_len=512,
+                        vocab_size=1000)
 
     def get_model(self, place, batch_size, sequence_len, vocab_size):
         return self.get_gpt_model("mp", place, batch_size, sequence_len,
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py
index ccc60bc6782ea..5ac78cc5fec4d 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class TestPF16Pass(TestAMPPass):
+
     def apply_passes(self):
         dist_strategy = fleet.DistributedStrategy()
         dist_strategy.amp = True
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py
index 74a751881ddf2..7afa10d49dbf5 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,6 +26,7 @@
 
 
 class TestRecomputePass(AutoPallelPassTestBase):
+
     def init(self):
         if paddle.is_compiled_with_cuda():
             paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
@@ -45,8 +46,10 @@ def apply_passes(self):
         fleet.init(is_collective=True, strategy=dist_strategy)
 
     def test_bs_8(self):
-        self.check_main(
-            gpus=[0, 1], batch_size=8, sequence_len=512, vocab_size=1000)
+        self.check_main(gpus=[0, 1],
+                        batch_size=8,
+                        sequence_len=512,
+                        vocab_size=1000)
 
     def get_model(self, place, batch_size, sequence_len, vocab_size):
         return self.get_gpt_model("mp", place, batch_size, sequence_len,
@@ -54,6 +57,7 @@ def get_model(self, place, batch_size, sequence_len, vocab_size):
 
 
 class TestRecomputePassDP(TestRecomputePass):
+
     def get_model(self, place, batch_size, sequence_len, vocab_size):
         return self.get_gpt_model("dp", place, batch_size, sequence_len,
                                   vocab_size)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py
index 51e87260609df..16d63b0964360 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,12 +23,14 @@
 import paddle.distributed.auto_parallel as auto
 from paddle.distributed.passes import new_pass, PassManager
 from auto_parallel_pass_test_base import AutoPallelPassTestBase
+
 sys.path.append("..")
 import auto_parallel_gpt_model as modeling
 from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion
 
 
 class TestShardingPass(AutoPallelPassTestBase):
+
     def init(self):
         if paddle.is_compiled_with_cuda():
             paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
@@ -58,8 +60,10 @@ def apply_no_passes(self):
         fleet.init(is_collective=True, strategy=dist_strategy)
 
     def test_bs_8(self):
-        self.check_main(
-            gpus=[0, 1], batch_size=8, sequence_len=512, vocab_size=1000)
+        self.check_main(gpus=[0, 1],
+                        batch_size=8,
+                        sequence_len=512,
+                        vocab_size=1000)
 
     def get_model(self, place, batch_size, sequence_len, vocab_size):
         return self.get_gpt_model('dp', place, batch_size, sequence_len,
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py
index 8430eb615a20c..48679116ccf5b 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class TestBuildCINNPass(DistPassTestBase):
+
     def init(self):
         self.atol = 0.5
         self.rtol = 0.0
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py
index e030420d32420..31bc9bd66d032 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class TestBuildCINNPass(DistPassTestBase):
+
     def init(self):
         self.atol = 0.0
         self.rtol = 0.0
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py
index 3ca71fb83151e..85c3bf321a3b1 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class DemoNet(nn.Layer):
+
     def __init__(self):
         super(DemoNet, self).__init__()
 
@@ -39,13 +40,15 @@ def forward(self, x):
 
 
 class TestFuseAdamPass(DistPassTestBase):
+
     def init(self):
         self.atol = 1e-4
         self.rtol = 1e-4
 
     def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
-        image = paddle.static.data(
-            shape=[batch_size] + image_shape, dtype='float32', name='image')
+        image = paddle.static.data(shape=[batch_size] + image_shape,
+                                   dtype='float32',
+                                   name='image')
 
         model = DemoNet()
         pred_out = model(image)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_all_reduce_pass.py
index c011815b7d2a1..06cd2ac6da49e 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_all_reduce_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class TestFuseAllReducePass(DistPassTestBase):
+
     def init(self):
         self.atol = 0.0
         self.rtol = 0.0
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
index a7147724fbc5c..a0090f6d8c310 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class BatchNormActNet(nn.Layer):
+
     def __init__(self):
         super(BatchNormActNet, self).__init__()
 
@@ -39,13 +40,15 @@ def forward(self, x):
 
 
 class TestFuseBatchNormActPass(DistPassTestBase):
+
     def init(self):
         self.atol = 1e-4
         self.rtol = 1e-4
 
     def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
-        image = paddle.static.data(
-            shape=[batch_size] + image_shape, dtype='float32', name='image')
+        image = paddle.static.data(shape=[batch_size] + image_shape,
+                                   dtype='float32',
+                                   name='image')
 
         model = BatchNormActNet()
         pred_out = model(image)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py
index 1b01260eaf2fd..eb9a901a40a9f 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class BatchNormAddActNet(nn.Layer):
+
     def __init__(self):
         super(BatchNormAddActNet, self).__init__()
 
@@ -43,13 +44,15 @@ def forward(self, x):
 
 
 class TestFuseBatchNormAddActPass(DistPassTestBase):
+
     def init(self):
         self.atol = 1e-4
         self.rtol = 1e-4
 
     def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
-        image = paddle.static.data(
-            shape=[batch_size] + image_shape, dtype='float32', name='image')
+        image = paddle.static.data(shape=[batch_size] + image_shape,
+                                   dtype='float32',
+                                   name='image')
 
         model = BatchNormAddActNet()
         pred_out = model(image)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py
index a0dd634b3ad46..11bd4f5d2b13a 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class DemoNet(nn.Layer):
+
     def __init__(self):
         super(DemoNet, self).__init__()
 
@@ -39,13 +40,15 @@ def forward(self, x):
 
 
 class TestFuseAdamPass(DistPassTestBase):
+
     def init(self):
         self.atol = 1e-4
         self.rtol = 1e-4
 
     def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
-        image = paddle.static.data(
-            shape=[batch_size] + image_shape, dtype='float32', name='image')
+        image = paddle.static.data(shape=[batch_size] + image_shape,
+                                   dtype='float32',
+                                   name='image')
 
         model = DemoNet()
         pred_out = model(image)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py
index c07744c882e7e..0a7442a18d776 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class ReluDepthwiseConvNet(nn.Layer):
+
     def __init__(self):
         super(ReluDepthwiseConvNet, self).__init__()
 
@@ -39,13 +40,15 @@ def forward(self, x):
 
 
 class TestFuseReluDepthwiseConvPass(DistPassTestBase):
+
     def init(self):
         self.atol = 1e-4
         self.rtol = 1e-4
 
     def get_model(self, place, batch_size=32, image_shape=[3, 224, 224]):
-        image = paddle.static.data(
-            shape=[batch_size] + image_shape, dtype='float32', name='image')
+        image = paddle.static.data(shape=[batch_size] + image_shape,
+                                   dtype='float32',
+                                   name='image')
 
         model = ReluDepthwiseConvNet()
         pred_out = model(image)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py
index 3939bd53739c4..3e96e9d3440e0 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class DemoNet(nn.Layer):
+
     def __init__(self):
         super(DemoNet, self).__init__()
 
@@ -39,13 +40,15 @@ def forward(self, x):
 
 
 class TestFuseAdamPass(DistPassTestBase):
+
     def init(self):
         self.atol = 1e-4
         self.rtol = 1e-4
 
     def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
-        image = paddle.static.data(
-            shape=[batch_size] + image_shape, dtype='float32', name='image')
+        image = paddle.static.data(shape=[batch_size] + image_shape,
+                                   dtype='float32',
+                                   name='image')
 
         model = DemoNet()
         pred_out = model(image)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py
index 0c324ba8ee9aa..f856059402efb 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -42,6 +42,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=128,
                  intermediate_size=4 * 128,
@@ -55,18 +56,30 @@ def __init__(self,
         weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0))
         weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1))
         bias_attr = None
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr)
-        self.linear2 = nn.Linear(
-            d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr)
-        self.linear3 = nn.Linear(
-            dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr)
-        self.linear4 = nn.Linear(
-            d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr)
-        self.linear5 = nn.Linear(
-            dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr0,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr1,
+                                 bias_attr=bias_attr)
+        self.linear2 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr0,
+                                 bias_attr=bias_attr)
+        self.linear3 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr1,
+                                 bias_attr=bias_attr)
+        self.linear4 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr0,
+                                 bias_attr=bias_attr)
+        self.linear5 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr1,
+                                 bias_attr=bias_attr)
         self.norm0 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
@@ -91,17 +104,15 @@ def forward(self, input):
 
 def mlp_forward(input, label, hidden_size):
     if _global_parallel_strategy == "dp":
-        auto.shard_tensor(
-            input,
-            dist_attr={
-                "process_mesh": _global_process_mesh,
-                "dims_mapping": [0, -1]
-            })
-
-    mlp = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        initializer_range=0.02)
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": _global_process_mesh,
+                              "dims_mapping": [0, -1]
+                          })
+
+    mlp = MLPLayer(hidden_size=hidden_size,
+                   intermediate_size=4 * hidden_size,
+                   initializer_range=0.02)
     predict = mlp(input)
     error_cost = paddle.nn.functional.square_error_cost(predict, label)
     loss = paddle.mean(error_cost)
@@ -109,6 +120,7 @@ def mlp_forward(input, label, hidden_size):
 
 
 class TestGradientMergePass(DistPassTestBase):
+
     def init(self):
         self._params_grads = None
         self._config = {"k_steps": 4, "avg": True}
@@ -128,20 +140,18 @@ def apply_passes(self, main_prog, startup_prog):
         fleet.init(is_collective=True, strategy=dist_strategy)
 
     def test_result(self):
-        no_pass_rets = self._distributed_launch(
-            model=None,
-            apply_pass=False,
-            gpus=[0],
-            gradient_merge=False,
-            batch_size=32,
-            max_step=2)
-        pass_rets = self._distributed_launch(
-            model=None,
-            apply_pass=True,
-            gpus=[0],
-            gradient_merge=True,
-            batch_size=8,
-            max_step=8)
+        no_pass_rets = self._distributed_launch(model=None,
+                                                apply_pass=False,
+                                                gpus=[0],
+                                                gradient_merge=False,
+                                                batch_size=32,
+                                                max_step=2)
+        pass_rets = self._distributed_launch(model=None,
+                                             apply_pass=True,
+                                             gpus=[0],
+                                             gradient_merge=True,
+                                             batch_size=8,
+                                             max_step=8)
         """
         # avg loss for gradient_merge pass
         avg_loss = 0
@@ -193,10 +203,12 @@ def get_model(self, place, gradient_merge, batch_size, max_step):
 
         with static.program_guard(train_program, startup_program), \
             utils.unique_name.guard():
-            input = static.data(
-                name="input", shape=[batch_size, hidden_size], dtype='float32')
-            label = static.data(
-                name="label", shape=[batch_size, 1], dtype='float32')
+            input = static.data(name="input",
+                                shape=[batch_size, hidden_size],
+                                dtype='float32')
+            label = static.data(name="label",
+                                shape=[batch_size, 1],
+                                dtype='float32')
             input.stop_gradient = False
             loss = mlp_forward(input, label, hidden_size)
 
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py
index 32a6257a5f62e..32bb1ca83a9b3 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class DemoNet(nn.Layer):
+
     def __init__(self):
         super(DemoNet, self).__init__()
 
@@ -40,14 +41,16 @@ def forward(self, x):
 
 
 class TestInplaceAddtoPass(DistPassTestBase):
+
     def init(self):
         self.atol = 0.0
         self.rtol = 0.0
         paddle.fluid.set_flags({"FLAGS_max_inplace_grad_add": 8})
 
     def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
-        image = paddle.static.data(
-            shape=[batch_size] + image_shape, dtype='float32', name='image')
+        image = paddle.static.data(shape=[batch_size] + image_shape,
+                                   dtype='float32',
+                                   name='image')
 
         model = DemoNet()
         pred_out = model(image)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_server_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_server_pass.py
index e9beda446aaa3..5c46794d8a498 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_server_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_server_pass.py
@@ -24,6 +24,7 @@
 
 
 class TestPsServerPass(PsPassTestBase):
+
     def init(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py
index 054950df1ebf8..964e13d53715f 100755
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestPsTrainerPass(PsPassTestBase):
+
     def setUp(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_white_lists.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_white_lists.py
index 37abe1e121ff3..645fa38099df1 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_white_lists.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_white_lists.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,6 +18,7 @@
 
 
 class TestConcretePass(PassBase):
+
     def __init__(self):
         super(TestConcretePass, self).__init__()
 
@@ -33,35 +34,41 @@ def _apply_single_impl(self, main_program, startup_program, context):
 
 @register_pass("A")
 class A(TestConcretePass):
+
     def __init__(self):
         super(A, self).__init__()
 
 
 @register_pass("B")
 class B(TestConcretePass):
+
     def __init__(self):
         super(B, self).__init__()
 
 
 @register_pass("C")
 class C(TestConcretePass):
+
     def __init__(self):
         super(C, self).__init__()
 
 
 @register_pass("D")
 class D(TestConcretePass):
+
     def __init__(self):
         super(D, self).__init__()
 
 
 @register_pass("E")
 class E(TestConcretePass):
+
     def __init__(self):
         super(E, self).__init__()
 
 
 class TestMakeWhiteListsRule(unittest.TestCase):
+
     def test_main(self):
         before_white_lists = {"A": ["B", "C"]}
         after_white_lists = {"D": ["C"]}
diff --git a/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt
index f71e04c09aa38..e3bf89c48821a 100644
--- a/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt
@@ -1,6 +1,9 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/distribution/parameterize.py b/python/paddle/fluid/tests/unittests/distribution/parameterize.py
index 09aa241b15dfe..72c9ac03325a5 100644
--- a/python/paddle/fluid/tests/unittests/distribution/parameterize.py
+++ b/python/paddle/fluid/tests/unittests/distribution/parameterize.py
@@ -29,6 +29,7 @@ def xrand(shape=(10, 10, 10), dtype=config.DEFAULT_DTYPE, min=1.0, max=10.0):
 
 
 def place(devices, key='place'):
+
     def decorate(cls):
         module = sys.modules[cls.__module__].__dict__
         raw_classes = {
@@ -70,7 +71,9 @@ def decorate(cls):
     return decorate
 
 
-def parameterize_func(input, name_func=None, doc_func=None,
+def parameterize_func(input,
+                      name_func=None,
+                      doc_func=None,
                       skip_on_empty=False):
     doc_func = doc_func or default_doc_func
     name_func = name_func or default_name_func
@@ -90,9 +93,8 @@ def wrapper(f, instance=None):
 
         digits = len(str(len(parameters) - 1))
         for num, p in enumerate(parameters):
-            name = name_func(
-                f, "{num:0>{digits}}".format(
-                    digits=digits, num=num), p)
+            name = name_func(f, "{num:0>{digits}}".format(digits=digits,
+                                                          num=num), p)
             # If the original function has patches applied by 'mock.patch',
             # re-construct all patches on the just former decoration layer
             # of param_as_standalone_func so as not to share
@@ -111,7 +113,9 @@ def wrapper(f, instance=None):
 
 
 def reapply_patches_if_need(func):
+
     def dummy_wrapper(orgfunc):
+
         @wraps(orgfunc)
         def dummy_func(*args, **kwargs):
             return orgfunc(*args, **kwargs)
@@ -163,6 +167,7 @@ def default_doc_func(func, num, p):
 
 
 def param_as_standalone_func(p, func, name):
+
     @functools.wraps(func)
     def standalone_func(*a):
         return func(*(a + p.args), **p.kwargs)
@@ -204,6 +209,7 @@ def skip_on_empty_helper(*a, **kw):
 
 
 class param(_param):
+
     def __new__(cls, *args, **kwargs):
         return _param.__new__(cls, args, kwargs)
 
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_dirichlet_op.py b/python/paddle/fluid/tests/unittests/distribution/test_dirichlet_op.py
index 3e7662b573e0d..2e85b47a20e32 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_dirichlet_op.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_dirichlet_op.py
@@ -25,6 +25,7 @@
 import paddle.static as static
 import scipy.stats
 from numpy.random import random as rand
+
 sys.path.append("../")
 from op_test import OpTest
 from paddle.fluid import Program, program_guard
@@ -55,6 +56,5 @@ def _hypothesis_testing(self, outs):
             scipy.stats.kstest(
                 outs[0][:, 0],
                 # scipy dirichlet have not cdf, use beta to replace it.
-                scipy.stats.beta(
-                    a=self.alpha[0], b=self.alpha[1]).cdf)[0],
+                scipy.stats.beta(a=self.alpha[0], b=self.alpha[1]).cdf)[0],
             0.01)
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution.py
index 7a1cb25b96f46..028faac6e8408 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution.py
@@ -28,6 +28,7 @@
 
 
 class DistributionNumpy():
+
     def sample(self):
         raise NotImplementedError
 
@@ -45,6 +46,7 @@ def probs(self, value):
 
 
 class DistributionTestName(unittest.TestCase):
+
     def get_prefix(self, string):
         return (string.split('.')[0])
 
@@ -137,9 +139,10 @@ def test_categorical_name(self):
 @parameterize.parameterize_cls(
     (parameterize.TEST_CASE_NAME, 'batch_shape', 'event_shape'),
     [('test-tuple', (10, 20), (10, 20)),
-     ('test-list', [100, 100], [100, 200, 300]), ('test-null-eventshape',
-                                                  (100, 100), ())])
+     ('test-list', [100, 100], [100, 200, 300]),
+     ('test-null-eventshape', (100, 100), ())])
 class TestDistributionShape(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         self.dist = paddle.distribution.Distribution(
@@ -169,6 +172,7 @@ def test_extend_shape(self):
 
 
 class TestDistributionException(unittest.TestCase):
+
     def setUp(self):
         self._d = paddle.distribution.Distribution()
 
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta.py
index fb0c37e3d659d..1d23b0f79d26a 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta.py
@@ -28,6 +28,7 @@
                   [('test-scale', 1.0, 2.0), ('test-tensor', xrand(), xrand()),
                    ('test-broadcast', xrand((2, 1)), xrand((2, 5)))])
 class TestBeta(unittest.TestCase):
+
     def setUp(self):
         # scale no need convert to tensor for scale input unittest
         alpha, beta = self.alpha, self.beta
@@ -97,8 +98,8 @@ def test_sample_shape(self):
         ]
         for case in cases:
             self.assertTrue(
-                self._paddle_beta.sample(case.get('input')).shape ==
-                case.get('expect'))
+                self._paddle_beta.sample(case.get('input')).shape == case.get(
+                    'expect'))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py
index e8fe0f17600c4..83b66f5c2b217 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py
@@ -27,11 +27,13 @@
 
 
 @param.place(config.DEVICES)
-@param.parameterize_cls(
-    (param.TEST_CASE_NAME, 'alpha', 'beta'), [('test-tensor', xrand(
-        (10, 10)), xrand((10, 10))), ('test-broadcast', xrand((2, 1)), xrand(
-            (2, 5))), ('test-larger-data', xrand((10, 20)), xrand((10, 20)))])
+@param.parameterize_cls((param.TEST_CASE_NAME, 'alpha', 'beta'),
+                        [('test-tensor', xrand((10, 10)), xrand((10, 10))),
+                         ('test-broadcast', xrand((2, 1)), xrand((2, 5))),
+                         ('test-larger-data', xrand((10, 20)), xrand(
+                             (10, 20)))])
 class TestBeta(unittest.TestCase):
+
     def setUp(self):
         self.program = paddle.static.Program()
         self.executor = paddle.static.Executor(self.place)
@@ -48,23 +50,23 @@ def test_mean(self):
             [mean] = self.executor.run(self.program,
                                        feed=self.feeds,
                                        fetch_list=[self._paddle_beta.mean])
-            np.testing.assert_allclose(
-                mean,
-                scipy.stats.beta.mean(self.alpha, self.beta),
-                rtol=RTOL.get(str(self.alpha.dtype)),
-                atol=ATOL.get(str(self.alpha.dtype)))
+            np.testing.assert_allclose(mean,
+                                       scipy.stats.beta.mean(
+                                           self.alpha, self.beta),
+                                       rtol=RTOL.get(str(self.alpha.dtype)),
+                                       atol=ATOL.get(str(self.alpha.dtype)))
 
     def test_variance(self):
         with paddle.static.program_guard(self.program):
-            [variance] = self.executor.run(
-                self.program,
-                feed=self.feeds,
-                fetch_list=[self._paddle_beta.variance])
-            np.testing.assert_allclose(
-                variance,
-                scipy.stats.beta.var(self.alpha, self.beta),
-                rtol=RTOL.get(str(self.alpha.dtype)),
-                atol=ATOL.get(str(self.alpha.dtype)))
+            [variance
+             ] = self.executor.run(self.program,
+                                   feed=self.feeds,
+                                   fetch_list=[self._paddle_beta.variance])
+            np.testing.assert_allclose(variance,
+                                       scipy.stats.beta.var(
+                                           self.alpha, self.beta),
+                                       rtol=RTOL.get(str(self.alpha.dtype)),
+                                       atol=ATOL.get(str(self.alpha.dtype)))
 
     def test_prob(self):
 
@@ -79,11 +81,12 @@ def test_prob(self):
             [prob] = self.executor.run(self.program,
                                        feed=feeds,
                                        fetch_list=[prob])
-            np.testing.assert_allclose(
-                prob,
-                scipy.stats.beta.pdf(random_number, self.alpha, self.beta),
-                rtol=RTOL.get(str(self.alpha.dtype)),
-                atol=ATOL.get(str(self.alpha.dtype)))
+            np.testing.assert_allclose(prob,
+                                       scipy.stats.beta.pdf(
+                                           random_number, self.alpha,
+                                           self.beta),
+                                       rtol=RTOL.get(str(self.alpha.dtype)),
+                                       atol=ATOL.get(str(self.alpha.dtype)))
 
     def test_log_prob(self):
         with paddle.static.program_guard(self.program):
@@ -95,23 +98,24 @@ def test_log_prob(self):
             [prob] = self.executor.run(self.program,
                                        feed=feeds,
                                        fetch_list=[prob])
-            np.testing.assert_allclose(
-                prob,
-                scipy.stats.beta.logpdf(random_number, self.alpha, self.beta),
-                rtol=RTOL.get(str(self.alpha.dtype)),
-                atol=ATOL.get(str(self.alpha.dtype)))
+            np.testing.assert_allclose(prob,
+                                       scipy.stats.beta.logpdf(
+                                           random_number, self.alpha,
+                                           self.beta),
+                                       rtol=RTOL.get(str(self.alpha.dtype)),
+                                       atol=ATOL.get(str(self.alpha.dtype)))
 
     def test_entropy(self):
         with paddle.static.program_guard(self.program):
-            [entropy] = self.executor.run(
-                self.program,
-                feed=self.feeds,
-                fetch_list=[self._paddle_beta.entropy()])
-            np.testing.assert_allclose(
-                entropy,
-                scipy.stats.beta.entropy(self.alpha, self.beta),
-                rtol=RTOL.get(str(self.alpha.dtype)),
-                atol=ATOL.get(str(self.alpha.dtype)))
+            [entropy
+             ] = self.executor.run(self.program,
+                                   feed=self.feeds,
+                                   fetch_list=[self._paddle_beta.entropy()])
+            np.testing.assert_allclose(entropy,
+                                       scipy.stats.beta.entropy(
+                                           self.alpha, self.beta),
+                                       rtol=RTOL.get(str(self.alpha.dtype)),
+                                       atol=ATOL.get(str(self.alpha.dtype)))
 
     def test_sample(self):
         with paddle.static.program_guard(self.program):
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py
index f43ac7bea763f..24c21d1bd45da 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class CategoricalNumpy(DistributionNumpy):
+
     def __init__(self, logits):
         self.logits = np.array(logits).astype('float32')
 
@@ -51,6 +52,7 @@ def kl_divergence(self, other):
 
 
 class CategoricalTest(unittest.TestCase):
+
     def setUp(self, use_gpu=False, batch_size=3, dims=5):
         self.use_gpu = use_gpu
         if not use_gpu:
@@ -100,12 +102,15 @@ def init_dynamic_data(self, batch_size, dims):
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.logits_static = fluid.data(
-                name='logits', shape=self.logits_shape, dtype='float32')
-            self.other_logits_static = fluid.data(
-                name='other_logits', shape=self.logits_shape, dtype='float32')
-            self.value_static = fluid.data(
-                name='value', shape=self.value_shape, dtype='int64')
+            self.logits_static = fluid.data(name='logits',
+                                            shape=self.logits_shape,
+                                            dtype='float32')
+            self.other_logits_static = fluid.data(name='other_logits',
+                                                  shape=self.logits_shape,
+                                                  dtype='float32')
+            self.value_static = fluid.data(name='value',
+                                           shape=self.value_shape,
+                                           dtype='int64')
 
     def get_numpy_selected_probs(self, probability):
         np_probs = np.zeros(self.dist_shape + self.value_shape)
@@ -126,20 +131,28 @@ def compare_with_numpy(self, fetch_list, tolerance=1e-6):
         np_entropy = np_categorical.entropy()
         np_kl = np_categorical.kl_divergence(np_other_categorical)
 
-        np.testing.assert_allclose(
-            entropy, np_entropy, rtol=log_tolerance, atol=log_tolerance)
-        np.testing.assert_allclose(
-            kl, np_kl, rtol=log_tolerance, atol=log_tolerance)
+        np.testing.assert_allclose(entropy,
+                                   np_entropy,
+                                   rtol=log_tolerance,
+                                   atol=log_tolerance)
+        np.testing.assert_allclose(kl,
+                                   np_kl,
+                                   rtol=log_tolerance,
+                                   atol=log_tolerance)
 
         sum_dist = np.sum(self.logits_np, axis=-1, keepdims=True)
         probability = self.logits_np / sum_dist
         np_probs = self.get_numpy_selected_probs(probability)
         np_log_prob = np.log(np_probs)
 
-        np.testing.assert_allclose(
-            probs, np_probs, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            log_prob, np_log_prob, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(probs,
+                                   np_probs,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(log_prob,
+                                   np_log_prob,
+                                   rtol=tolerance,
+                                   atol=tolerance)
 
     def test_categorical_distribution_dygraph(self, tolerance=1e-6):
         paddle.disable_static(self.place)
@@ -184,6 +197,7 @@ def test_categorical_distribution_static(self, tolerance=1e-6):
 
 
 class CategoricalTest2(CategoricalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # input logtis is 2-D Tensor with dtype Float64
         # value used in probs and log_prob method is 1-D Tensor
@@ -199,15 +213,19 @@ def init_numpy_data(self, batch_size, dims):
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.logits_static = fluid.data(
-                name='logits', shape=self.logits_shape, dtype='float64')
-            self.other_logits_static = fluid.data(
-                name='other_logits', shape=self.logits_shape, dtype='float64')
-            self.value_static = fluid.data(
-                name='value', shape=self.value_shape, dtype='int64')
+            self.logits_static = fluid.data(name='logits',
+                                            shape=self.logits_shape,
+                                            dtype='float64')
+            self.other_logits_static = fluid.data(name='other_logits',
+                                                  shape=self.logits_shape,
+                                                  dtype='float64')
+            self.value_static = fluid.data(name='value',
+                                           shape=self.value_shape,
+                                           dtype='int64')
 
 
 class CategoricalTest3(CategoricalTest):
+
     def init_dynamic_data(self, batch_size, dims):
         # input logtis is 2-D numpy.ndarray with dtype Float32
         # value used in probs and log_prob method is 1-D Tensor
@@ -219,11 +237,13 @@ def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
             self.logits_static = self.logits_np
             self.other_logits_static = self.other_logits_np
-            self.value_static = fluid.data(
-                name='value', shape=self.value_shape, dtype='int64')
+            self.value_static = fluid.data(name='value',
+                                           shape=self.value_shape,
+                                           dtype='int64')
 
 
 class CategoricalTest4(CategoricalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # input logtis is 2-D numpy.ndarray with dtype Float64
         # value used in probs and log_prob method is 1-D Tensor
@@ -246,12 +266,14 @@ def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
             self.logits_static = self.logits_np
             self.other_logits_static = self.other_logits_np
-            self.value_static = fluid.data(
-                name='value', shape=self.value_shape, dtype='int64')
+            self.value_static = fluid.data(name='value',
+                                           shape=self.value_shape,
+                                           dtype='int64')
 
 
 # test shape of logits and value used in probs and log_prob method
 class CategoricalTest5(CategoricalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # input logtis is 1-D Tensor
         # value used in probs and log_prob method is 1-D Tensor
@@ -272,6 +294,7 @@ def get_numpy_selected_probs(self, probability):
 
 
 class CategoricalTest6(CategoricalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # input logtis is 2-D Tensor
         # value used in probs and log_prob method has the same number of batches with input
@@ -293,6 +316,7 @@ def get_numpy_selected_probs(self, probability):
 
 
 class CategoricalTest7(CategoricalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # input logtis is 3-D Tensor
         # value used in probs and log_prob method has the same number of distribuions with input
@@ -315,6 +339,7 @@ def get_numpy_selected_probs(self, probability):
 
 
 class CategoricalTest8(CategoricalTest):
+
     def init_dynamic_data(self, batch_size, dims):
         # input logtis is 2-D list
         # value used in probs and log_prob method is 1-D Tensor
@@ -326,11 +351,13 @@ def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
             self.logits_static = self.logits_np.tolist()
             self.other_logits_static = self.other_logits_np.tolist()
-            self.value_static = fluid.data(
-                name='value', shape=self.value_shape, dtype='int64')
+            self.value_static = fluid.data(name='value',
+                                           shape=self.value_shape,
+                                           dtype='int64')
 
 
 class CategoricalTest9(CategoricalTest):
+
     def init_dynamic_data(self, batch_size, dims):
         # input logtis is 2-D tuple
         # value used in probs and log_prob method is 1-D Tensor
@@ -342,11 +369,13 @@ def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
             self.logits_static = tuple(self.logits_np.tolist())
             self.other_logits_static = tuple(self.other_logits_np.tolist())
-            self.value_static = fluid.data(
-                name='value', shape=self.value_shape, dtype='int64')
+            self.value_static = fluid.data(name='value',
+                                           shape=self.value_shape,
+                                           dtype='int64')
 
 
 class DistributionTestError(unittest.TestCase):
+
     def test_distribution_error(self):
         distribution = Distribution()
 
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py
index c31d2124193ee..b927aef8e9b81 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 @param.param_cls((param.TEST_CASE_NAME, 'value'),
                  [('NotImplement', np.random.rand(2, 3))])
 class TestConstraint(unittest.TestCase):
+
     def setUp(self):
         self._constraint = constraint.Constraint()
 
@@ -36,6 +37,7 @@ def test_costraint(self):
 @param.param_cls((param.TEST_CASE_NAME, 'value', 'expect'),
                  [('real', 1., True)])
 class TestReal(unittest.TestCase):
+
     def setUp(self):
         self._constraint = constraint.Real()
 
@@ -46,6 +48,7 @@ def test_costraint(self):
 @param.param_cls((param.TEST_CASE_NAME, 'lower', 'upper', 'value', 'expect'),
                  [('in_range', 0, 1, 0.5, True), ('out_range', 0, 1, 2, False)])
 class TestRange(unittest.TestCase):
+
     def setUp(self):
         self._constraint = constraint.Range(self.lower, self.upper)
 
@@ -56,6 +59,7 @@ def test_costraint(self):
 @param.param_cls((param.TEST_CASE_NAME, 'value', 'expect'),
                  [('positive', 1, True), ('negative', -1, False)])
 class TestPositive(unittest.TestCase):
+
     def setUp(self):
         self._constraint = constraint.Positive()
 
@@ -67,6 +71,7 @@ def test_costraint(self):
                  [('simplex', paddle.to_tensor([0.5, 0.5]), True),
                   ('non_simplex', paddle.to_tensor([-0.5, 0.5]), False)])
 class TestSimplex(unittest.TestCase):
+
     def setUp(self):
         self._constraint = constraint.Simplex()
 
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py
index 9caec312b3382..8188b2231f294 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py
@@ -31,6 +31,7 @@
         # ('test-multi-dim', config.xrand((10, 20, 30)))
     ])
 class TestDirichlet(unittest.TestCase):
+
     def setUp(self):
         self._paddle_diric = paddle.distribution.Dirichlet(
             paddle.to_tensor(self.concentration))
@@ -91,17 +92,18 @@ def test_log_normalizer(self):
         self.assertTrue(
             np.all(
                 self._paddle_diric._log_normalizer(
-                    paddle.to_tensor(param.xrand((100, 100, 100)))).numpy() <
-                0.0))
+                    paddle.to_tensor(param.xrand((100, 100,
+                                                  100)))).numpy() < 0.0))
 
     @param.place(DEVICES)
     @param.param_cls((param.TEST_CASE_NAME, 'concentration'),
                      [('test-zero-dim', np.array(1.0))])
     class TestDirichletException(unittest.TestCase):
+
         def TestInit(self):
             with self.assertRaises(ValueError):
-                paddle.distribution.Dirichlet(
-                    paddle.squeeze(self.concentration))
+                paddle.distribution.Dirichlet(paddle.squeeze(
+                    self.concentration))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py
index f7096d295eeb5..c4630bbd84b57 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py
@@ -28,6 +28,7 @@
 @parameterize_cls((TEST_CASE_NAME, 'concentration'),
                   [('test-one-dim', np.random.rand(89) + 5.0)])
 class TestDirichlet(unittest.TestCase):
+
     def setUp(self):
         self.program = paddle.static.Program()
         self.executor = paddle.static.Executor()
@@ -95,10 +96,9 @@ def test_log_prob(self):
 
     def test_entropy(self):
         with paddle.static.program_guard(self.program):
-            [out] = self.executor.run(
-                self.program,
-                feed=self.feeds,
-                fetch_list=[self._paddle_diric.entropy()])
+            [out] = self.executor.run(self.program,
+                                      feed=self.feeds,
+                                      fetch_list=[self._paddle_diric.entropy()])
             np.testing.assert_allclose(
                 out,
                 scipy.stats.dirichlet.entropy(self.concentration),
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py
index b601ac285840a..cc4b843091379 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py
@@ -25,11 +25,12 @@
 
 @parameterize.place(config.DEVICES)
 @parameterize.parameterize_cls(
-    (parameterize.TEST_CASE_NAME, 'dist'), [('test-mock-exp',
-                                             mock.Exponential(rate=paddle.rand(
-                                                 [100, 200, 99],
-                                                 dtype=config.DEFAULT_DTYPE)))])
+    (parameterize.TEST_CASE_NAME, 'dist'),
+    [('test-mock-exp',
+      mock.Exponential(
+          rate=paddle.rand([100, 200, 99], dtype=config.DEFAULT_DTYPE)))])
 class TestExponentialFamily(unittest.TestCase):
+
     def test_entropy(self):
         np.testing.assert_allclose(
             self.dist.entropy(),
@@ -43,11 +44,12 @@ def test_entropy(self):
     (config.TEST_CASE_NAME, 'dist'),
     [('test-dummy', mock.DummyExpFamily(0.5, 0.5)),
      ('test-dirichlet',
-      paddle.distribution.Dirichlet(paddle.to_tensor(parameterize.xrand()))), (
-          'test-beta', paddle.distribution.Beta(
-              paddle.to_tensor(parameterize.xrand()),
-              paddle.to_tensor(parameterize.xrand())))])
+      paddle.distribution.Dirichlet(paddle.to_tensor(parameterize.xrand()))),
+     ('test-beta',
+      paddle.distribution.Beta(paddle.to_tensor(parameterize.xrand()),
+                               paddle.to_tensor(parameterize.xrand())))])
 class TestExponentialFamilyException(unittest.TestCase):
+
     def test_entropy_exception(self):
         with self.assertRaises(NotImplementedError):
             paddle.distribution.ExponentialFamily.entropy(self.dist)
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py
index 28c337b617b2e..63f1fa81bf187 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,6 +27,7 @@
 
 @parameterize.place(config.DEVICES)
 class TestExponentialFamily(unittest.TestCase):
+
     def setUp(self):
         self.program = paddle.static.Program()
         self.executor = paddle.static.Executor()
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py
index f67c260cbcc31..4f0639a0380a3 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py
@@ -23,10 +23,12 @@
 
 
 @param.place(config.DEVICES)
-@param.param_cls((param.TEST_CASE_NAME, 'base', 'reinterpreted_batch_rank'),
-                 [('base_beta', paddle.distribution.Beta(
-                     paddle.rand([1, 2]), paddle.rand([1, 2])), 1)])
+@param.param_cls(
+    (param.TEST_CASE_NAME, 'base', 'reinterpreted_batch_rank'),
+    [('base_beta',
+      paddle.distribution.Beta(paddle.rand([1, 2]), paddle.rand([1, 2])), 1)])
 class TestIndependent(unittest.TestCase):
+
     def setUp(self):
         self._t = paddle.distribution.Independent(self.base,
                                                   self.reinterpreted_batch_rank)
@@ -82,6 +84,7 @@ def test_sample(self):
     [('base_not_transform', '', 1, TypeError),
      ('rank_less_than_zero', paddle.distribution.Transform(), -1, ValueError)])
 class TestIndependentException(unittest.TestCase):
+
     def test_init(self):
         with self.assertRaises(self.expected_exception):
             paddle.distribution.IndependentTransform(
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py
index eb078160a03e0..e0196ecbf136e 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py
@@ -27,9 +27,10 @@
 @param.place(config.DEVICES)
 @param.param_cls(
     (param.TEST_CASE_NAME, 'base', 'reinterpreted_batch_rank', 'alpha', 'beta'),
-    [('base_beta', paddle.distribution.Beta, 1, np.random.rand(1, 2),
-      np.random.rand(1, 2))])
+    [('base_beta', paddle.distribution.Beta, 1, np.random.rand(
+        1, 2), np.random.rand(1, 2))])
 class TestIndependent(unittest.TestCase):
+
     def setUp(self):
         value = np.random.rand(1)
         self.dtype = value.dtype
@@ -63,45 +64,42 @@ def setUp(self):
             self.mean, self.variance, self.entropy, self.log_prob,
             self.base_mean, self.base_variance, self.base_entropy,
             self.base_log_prob
-        ] = exe.run(
-            mp,
-            feed={'value': value,
-                  'alpha': self.alpha,
-                  'beta': self.beta},
-            fetch_list=fetch_list)
+        ] = exe.run(mp,
+                    feed={
+                        'value': value,
+                        'alpha': self.alpha,
+                        'beta': self.beta
+                    },
+                    fetch_list=fetch_list)
 
     def test_mean(self):
-        np.testing.assert_allclose(
-            self.mean,
-            self.base_mean,
-            rtol=config.RTOL.get(str(self.dtype)),
-            atol=config.ATOL.get(str(self.dtype)))
+        np.testing.assert_allclose(self.mean,
+                                   self.base_mean,
+                                   rtol=config.RTOL.get(str(self.dtype)),
+                                   atol=config.ATOL.get(str(self.dtype)))
 
     def test_variance(self):
-        np.testing.assert_allclose(
-            self.variance,
-            self.base_variance,
-            rtol=config.RTOL.get(str(self.dtype)),
-            atol=config.ATOL.get(str(self.dtype)))
+        np.testing.assert_allclose(self.variance,
+                                   self.base_variance,
+                                   rtol=config.RTOL.get(str(self.dtype)),
+                                   atol=config.ATOL.get(str(self.dtype)))
 
     def test_entropy(self):
-        np.testing.assert_allclose(
-            self._np_sum_rightmost(self.base_entropy,
-                                   self.reinterpreted_batch_rank),
-            self.entropy,
-            rtol=config.RTOL.get(str(self.dtype)),
-            atol=config.ATOL.get(str(self.dtype)))
+        np.testing.assert_allclose(self._np_sum_rightmost(
+            self.base_entropy, self.reinterpreted_batch_rank),
+                                   self.entropy,
+                                   rtol=config.RTOL.get(str(self.dtype)),
+                                   atol=config.ATOL.get(str(self.dtype)))
 
     def _np_sum_rightmost(self, value, n):
         return np.sum(value, tuple(range(-n, 0))) if n > 0 else value
 
     def test_log_prob(self):
-        np.testing.assert_allclose(
-            self._np_sum_rightmost(self.base_log_prob,
-                                   self.reinterpreted_batch_rank),
-            self.log_prob,
-            rtol=config.RTOL.get(str(self.dtype)),
-            atol=config.ATOL.get(str(self.dtype)))
+        np.testing.assert_allclose(self._np_sum_rightmost(
+            self.base_log_prob, self.reinterpreted_batch_rank),
+                                   self.log_prob,
+                                   rtol=config.RTOL.get(str(self.dtype)),
+                                   atol=config.ATOL.get(str(self.dtype)))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py
index 851645a96d405..0bec1c5a58cd9 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial.py
@@ -31,6 +31,7 @@
         ('prob-sum-non-one', 10, np.array([2., 3., 5.])),
     ])
 class TestMultinomial(unittest.TestCase):
+
     def setUp(self):
         self._dist = paddle.distribution.Multinomial(
             total_count=self.total_count, probs=paddle.to_tensor(self.probs))
@@ -38,29 +39,26 @@ def setUp(self):
     def test_mean(self):
         mean = self._dist.mean
         self.assertEqual(mean.numpy().dtype, self.probs.dtype)
-        np.testing.assert_allclose(
-            mean,
-            self._np_mean(),
-            rtol=config.RTOL.get(str(self.probs.dtype)),
-            atol=config.ATOL.get(str(self.probs.dtype)))
+        np.testing.assert_allclose(mean,
+                                   self._np_mean(),
+                                   rtol=config.RTOL.get(str(self.probs.dtype)),
+                                   atol=config.ATOL.get(str(self.probs.dtype)))
 
     def test_variance(self):
         var = self._dist.variance
         self.assertEqual(var.numpy().dtype, self.probs.dtype)
-        np.testing.assert_allclose(
-            var,
-            self._np_variance(),
-            rtol=config.RTOL.get(str(self.probs.dtype)),
-            atol=config.ATOL.get(str(self.probs.dtype)))
+        np.testing.assert_allclose(var,
+                                   self._np_variance(),
+                                   rtol=config.RTOL.get(str(self.probs.dtype)),
+                                   atol=config.ATOL.get(str(self.probs.dtype)))
 
     def test_entropy(self):
         entropy = self._dist.entropy()
         self.assertEqual(entropy.numpy().dtype, self.probs.dtype)
-        np.testing.assert_allclose(
-            entropy,
-            self._np_entropy(),
-            rtol=config.RTOL.get(str(self.probs.dtype)),
-            atol=config.ATOL.get(str(self.probs.dtype)))
+        np.testing.assert_allclose(entropy,
+                                   self._np_entropy(),
+                                   rtol=config.RTOL.get(str(self.probs.dtype)),
+                                   atol=config.ATOL.get(str(self.probs.dtype)))
 
     def test_sample(self):
         sample_shape = ()
@@ -82,10 +80,12 @@ def test_sample(self):
         sample_shape = (5000, )
         samples = self._dist.sample(sample_shape)
         sample_mean = samples.mean(axis=0)
-        # Tolerance value 0.2 is empirical value which is consistent with 
+        # Tolerance value 0.2 is empirical value which is consistent with
         # TensorFlow
-        np.testing.assert_allclose(
-            sample_mean, self._dist.mean, atol=0, rtol=0.20)
+        np.testing.assert_allclose(sample_mean,
+                                   self._dist.mean,
+                                   atol=0,
+                                   rtol=0.20)
 
     def _np_variance(self):
         probs = self.probs / self.probs.sum(-1, keepdims=True)
@@ -106,11 +106,12 @@ def _np_entropy(self):
     [
         ('value-float', 10, np.array([0.2, 0.3, 0.5]), np.array([2., 3., 5.])),
         ('value-int', 10, np.array([0.2, 0.3, 0.5]), np.array([2, 3, 5])),
-        ('value-multi-dim', 10, np.array([[0.3, 0.7], [0.5, 0.5]]),
-         np.array([[4., 6], [8, 2]])),
+        ('value-multi-dim', 10, np.array([[0.3, 0.7], [0.5, 0.5]
+                                          ]), np.array([[4., 6], [8, 2]])),
         # ('value-sum-non-n', 10, np.array([0.5, 0.2, 0.3]), np.array([4,5,2])),
     ])
 class TestMultinomialPmf(unittest.TestCase):
+
     def setUp(self):
         self._dist = paddle.distribution.Multinomial(
             total_count=self.total_count, probs=paddle.to_tensor(self.probs))
@@ -132,6 +133,7 @@ def test_prob(self):
         ('probs_zero_dim', np.array(0)),
     ])
 class TestMultinomialException(unittest.TestCase):
+
     def TestInit(self):
         with self.assertRaises(ValueError):
             paddle.distribution.Multinomial(self.total_count,
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py
index ac86ad8d3e185..f9beb6b7702f8 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_multinomial_static.py
@@ -33,6 +33,7 @@
         ('prob-sum-non-one', 5, np.array([2., 3., 5.])),
     ])
 class TestMultinomial(unittest.TestCase):
+
     def setUp(self):
         startup_program = paddle.static.Program()
         main_program = paddle.static.Program()
@@ -57,28 +58,25 @@ def setUp(self):
 
     def test_mean(self):
         self.assertEqual(str(self.mean.dtype).split('.')[-1], self.probs.dtype)
-        np.testing.assert_allclose(
-            self.mean,
-            self._np_mean(),
-            rtol=config.RTOL.get(str(self.probs.dtype)),
-            atol=config.ATOL.get(str(self.probs.dtype)))
+        np.testing.assert_allclose(self.mean,
+                                   self._np_mean(),
+                                   rtol=config.RTOL.get(str(self.probs.dtype)),
+                                   atol=config.ATOL.get(str(self.probs.dtype)))
 
     def test_variance(self):
         self.assertEqual(str(self.var.dtype).split('.')[-1], self.probs.dtype)
-        np.testing.assert_allclose(
-            self.var,
-            self._np_variance(),
-            rtol=config.RTOL.get(str(self.probs.dtype)),
-            atol=config.ATOL.get(str(self.probs.dtype)))
+        np.testing.assert_allclose(self.var,
+                                   self._np_variance(),
+                                   rtol=config.RTOL.get(str(self.probs.dtype)),
+                                   atol=config.ATOL.get(str(self.probs.dtype)))
 
     def test_entropy(self):
         self.assertEqual(
             str(self.entropy.dtype).split('.')[-1], self.probs.dtype)
-        np.testing.assert_allclose(
-            self.entropy,
-            self._np_entropy(),
-            rtol=config.RTOL.get(str(self.probs.dtype)),
-            atol=config.ATOL.get(str(self.probs.dtype)))
+        np.testing.assert_allclose(self.entropy,
+                                   self._np_entropy(),
+                                   rtol=config.RTOL.get(str(self.probs.dtype)),
+                                   atol=config.ATOL.get(str(self.probs.dtype)))
 
     def test_sample(self):
         self.assertEqual(
@@ -107,11 +105,12 @@ def _np_entropy(self):
     [
         ('value-float', 5, np.array([0.2, 0.3, 0.5]), np.array([1., 1., 3.])),
         ('value-int', 5, np.array([0.2, 0.3, 0.5]), np.array([2, 2, 1])),
-        ('value-multi-dim', 5, np.array([[0.3, 0.7], [0.5, 0.5]]),
-         np.array([[1., 4.], [2., 3.]])),
+        ('value-multi-dim', 5, np.array([[0.3, 0.7], [0.5, 0.5]
+                                         ]), np.array([[1., 4.], [2., 3.]])),
         # ('value-sum-non-n', 10, np.array([0.5, 0.2, 0.3]), np.array([4,5,2])),
     ])
 class TestMultinomialPmf(unittest.TestCase):
+
     def setUp(self):
         startup_program = paddle.static.Program()
         main_program = paddle.static.Program()
@@ -133,12 +132,12 @@ def setUp(self):
                                   fetch_list=fetch_list)
 
     def test_prob(self):
-        np.testing.assert_allclose(
-            self.pmf,
-            scipy.stats.multinomial.pmf(self.value, self.total_count,
-                                        self.probs),
-            rtol=config.RTOL.get(str(self.probs.dtype)),
-            atol=config.ATOL.get(str(self.probs.dtype)))
+        np.testing.assert_allclose(self.pmf,
+                                   scipy.stats.multinomial.pmf(
+                                       self.value, self.total_count,
+                                       self.probs),
+                                   rtol=config.RTOL.get(str(self.probs.dtype)),
+                                   atol=config.ATOL.get(str(self.probs.dtype)))
 
 
 @parameterize.place(config.DEVICES)
@@ -149,6 +148,7 @@ def test_prob(self):
         ('probs_zero_dim', np.array(0)),
     ])
 class TestMultinomialException(unittest.TestCase):
+
     def setUp(self):
         startup_program = paddle.static.Program()
         self.main_program = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py
index 0c23e367f98f7..9e597c3d3635e 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class NormalNumpy(DistributionNumpy):
+
     def __init__(self, loc, scale):
         self.loc = np.array(loc)
         self.scale = np.array(scale)
@@ -39,8 +40,9 @@ def sample(self, shape):
     def log_prob(self, value):
         var = self.scale * self.scale
         log_scale = np.log(self.scale)
-        return -((value - self.loc) * (value - self.loc)) / (
-            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
+        return -((value - self.loc) *
+                 (value - self.loc)) / (2. * var) - log_scale - math.log(
+                     math.sqrt(2. * math.pi))
 
     def probs(self, value):
         var = self.scale * self.scale
@@ -60,6 +62,7 @@ def kl_divergence(self, other):
 
 
 class NormalTest(unittest.TestCase):
+
     def setUp(self, use_gpu=False, batch_size=2, dims=3):
         self.use_gpu = use_gpu
         if not use_gpu:
@@ -105,8 +108,9 @@ def init_static_data(self, batch_size, dims):
         self.static_other_loc = self.other_loc_np
         self.static_other_scale = self.other_scale_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[],
+                                             dtype='float32')
 
     def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6):
         sample, entropy, log_prob, probs, kl = fetch_list
@@ -127,14 +131,22 @@ def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6):
         log_tolerance = 1e-4
 
         np.testing.assert_equal(sample.shape, np_sample.shape)
-        np.testing.assert_allclose(
-            entropy, np_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            log_prob, np_lp, rtol=log_tolerance, atol=log_tolerance)
-        np.testing.assert_allclose(
-            probs, np_p, rtol=log_tolerance, atol=log_tolerance)
-        np.testing.assert_allclose(
-            kl, np_kl, rtol=log_tolerance, atol=log_tolerance)
+        np.testing.assert_allclose(entropy,
+                                   np_entropy,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(log_prob,
+                                   np_lp,
+                                   rtol=log_tolerance,
+                                   atol=log_tolerance)
+        np.testing.assert_allclose(probs,
+                                   np_p,
+                                   rtol=log_tolerance,
+                                   atol=log_tolerance)
+        np.testing.assert_allclose(kl,
+                                   np_kl,
+                                   rtol=log_tolerance,
+                                   atol=log_tolerance)
 
     def test_normal_distribution_dygraph(self, sample_shape=7, tolerance=1e-6):
         paddle.disable_static(self.place)
@@ -182,6 +194,7 @@ def test_normal_distribution_static(self, sample_shape=7, tolerance=1e-6):
 
 
 class NormalTest2(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc ans scale are 'int'
         self.loc_np = int((np.random.ranf() - 0.5) * 8)
@@ -197,6 +210,7 @@ def init_numpy_data(self, batch_size, dims):
 
 
 class NormalTest3(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # test broadcast: loc is float, scale is numpy.ndarray with dtype 'float32'.
         self.loc_np = (np.random.ranf() - 0.5) * 4
@@ -218,11 +232,13 @@ def init_static_data(self, batch_size, dims):
         self.static_other_loc = self.other_loc_np
         self.static_other_scale = self.other_scale_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class NormalTest4(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are numpy.ndarray with dtype 'float32'.
         self.loc_np = np.random.randn(batch_size, dims).astype('float32')
@@ -244,11 +260,13 @@ def init_static_data(self, batch_size, dims):
         self.static_other_loc = self.other_loc_np
         self.static_other_scale = self.other_scale_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class NormalTest5(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are numpy.ndarray with dtype 'float64'.
         self.loc_np = np.random.randn(batch_size, dims).astype('float64')
@@ -277,11 +295,13 @@ def init_static_data(self, batch_size, dims):
         self.static_other_loc = self.other_loc_np
         self.static_other_scale = self.other_scale_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float64')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float64')
 
 
 class NormalTest6(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are Tensor with dtype 'VarType.FP32'.
         self.loc_np = np.random.randn(batch_size, dims).astype('float32')
@@ -306,19 +326,25 @@ def init_dynamic_data(self, batch_size, dims):
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.static_loc = layers.data(
-                name='loc', shape=[dims], dtype='float32')
-            self.static_scale = layers.data(
-                name='scale', shape=[dims], dtype='float32')
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
-            self.static_other_loc = layers.data(
-                name='other_loc', shape=[dims], dtype='float32')
-            self.static_other_scale = layers.data(
-                name='other_scale', shape=[dims], dtype='float32')
+            self.static_loc = layers.data(name='loc',
+                                          shape=[dims],
+                                          dtype='float32')
+            self.static_scale = layers.data(name='scale',
+                                            shape=[dims],
+                                            dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
+            self.static_other_loc = layers.data(name='other_loc',
+                                                shape=[dims],
+                                                dtype='float32')
+            self.static_other_scale = layers.data(name='other_scale',
+                                                  shape=[dims],
+                                                  dtype='float32')
 
 
 class NormalTest7(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are Tensor with dtype 'VarType.FP64'.
         self.loc_np = np.random.randn(batch_size, dims).astype('float64')
@@ -338,26 +364,32 @@ def init_dynamic_data(self, batch_size, dims):
         self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64')
         self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64')
         self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64')
-        self.dynamic_other_loc = paddle.to_tensor(
-            self.other_loc_np, dtype='float64')
-        self.dynamic_other_scale = paddle.to_tensor(
-            self.other_scale_np, dtype='float64')
+        self.dynamic_other_loc = paddle.to_tensor(self.other_loc_np,
+                                                  dtype='float64')
+        self.dynamic_other_scale = paddle.to_tensor(self.other_scale_np,
+                                                    dtype='float64')
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.static_loc = layers.data(
-                name='loc', shape=[dims], dtype='float64')
-            self.static_scale = layers.data(
-                name='scale', shape=[dims], dtype='float64')
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float64')
-            self.static_other_loc = layers.data(
-                name='other_loc', shape=[dims], dtype='float64')
-            self.static_other_scale = layers.data(
-                name='other_scale', shape=[dims], dtype='float64')
+            self.static_loc = layers.data(name='loc',
+                                          shape=[dims],
+                                          dtype='float64')
+            self.static_scale = layers.data(name='scale',
+                                            shape=[dims],
+                                            dtype='float64')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float64')
+            self.static_other_loc = layers.data(name='other_loc',
+                                                shape=[dims],
+                                                dtype='float64')
+            self.static_other_scale = layers.data(name='other_scale',
+                                                  shape=[dims],
+                                                  dtype='float64')
 
 
 class NormalTest8(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'.
         self.loc_np = np.random.randn(batch_size, dims).astype('float64')
@@ -377,26 +409,32 @@ def init_dynamic_data(self, batch_size, dims):
         self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64')
         self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64')
         self.dynamic_values = paddle.to_tensor(self.values_np)
-        self.dynamic_other_loc = paddle.to_tensor(
-            self.other_loc_np, dtype='float64')
-        self.dynamic_other_scale = paddle.to_tensor(
-            self.other_scale_np, dtype='float64')
+        self.dynamic_other_loc = paddle.to_tensor(self.other_loc_np,
+                                                  dtype='float64')
+        self.dynamic_other_scale = paddle.to_tensor(self.other_scale_np,
+                                                    dtype='float64')
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.static_loc = layers.data(
-                name='loc', shape=[dims], dtype='float64')
-            self.static_scale = layers.data(
-                name='scale', shape=[dims], dtype='float64')
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
-            self.static_other_loc = layers.data(
-                name='other_loc', shape=[dims], dtype='float64')
-            self.static_other_scale = layers.data(
-                name='other_scale', shape=[dims], dtype='float64')
+            self.static_loc = layers.data(name='loc',
+                                          shape=[dims],
+                                          dtype='float64')
+            self.static_scale = layers.data(name='scale',
+                                            shape=[dims],
+                                            dtype='float64')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
+            self.static_other_loc = layers.data(name='other_loc',
+                                                shape=[dims],
+                                                dtype='float64')
+            self.static_other_scale = layers.data(name='other_scale',
+                                                  shape=[dims],
+                                                  dtype='float64')
 
 
 class NormalTest9(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are list.
         self.loc_np = np.random.randn(batch_size,
@@ -422,11 +460,13 @@ def init_static_data(self, batch_size, dims):
         self.static_other_loc = self.other_loc_np
         self.static_other_scale = self.other_scale_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class NormalTest10(NormalTest):
+
     def init_numpy_data(self, batch_size, dims):
         # loc and scale are tuple.
         self.loc_np = tuple(
@@ -452,8 +492,9 @@ def init_static_data(self, batch_size, dims):
         self.static_other_loc = self.other_loc_np
         self.static_other_scale = self.other_scale_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py
index b1304a52ef354..8311a10f4d5f2 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py
@@ -25,21 +25,22 @@
 
 @param.place(config.DEVICES)
 class TestTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.Transform()
 
-    @param.param_func([
-        (paddle.distribution.Distribution(),
-         paddle.distribution.TransformedDistribution),
-        (paddle.distribution.ExpTransform(), paddle.distribution.ChainTransform)
-    ])
+    @param.param_func([(paddle.distribution.Distribution(),
+                        paddle.distribution.TransformedDistribution),
+                       (paddle.distribution.ExpTransform(),
+                        paddle.distribution.ChainTransform)])
     def test_call(self, input, expected_type):
         t = transform.Transform()
         self.assertIsInstance(t(input), expected_type)
 
-    @param.param_func(
-        [(transform.Type.BIJECTION, True), (transform.Type.INJECTION, True),
-         (transform.Type.SURJECTION, False), (transform.Type.OTHER, False)])
+    @param.param_func([(transform.Type.BIJECTION, True),
+                       (transform.Type.INJECTION, True),
+                       (transform.Type.SURJECTION, False),
+                       (transform.Type.OTHER, False)])
     def test_is_injective(self, type, expected):
         transform.Transform._type = type
         self.assertEqual(self._t._is_injective(), expected)
@@ -50,26 +51,26 @@ def test_domain(self):
     def test_codomain(self):
         self.assertTrue(isinstance(self._t._codomain, variable.Real))
 
-    @param.param_func([(0, TypeError), (paddle.rand((2, 3)),
-                                        NotImplementedError)])
+    @param.param_func([(0, TypeError), (paddle.rand(
+        (2, 3)), NotImplementedError)])
     def test_forward(self, input, expected):
         with self.assertRaises(expected):
             self._t.forward(input)
 
-    @param.param_func([(0, TypeError), (paddle.rand((2, 3)),
-                                        NotImplementedError)])
+    @param.param_func([(0, TypeError), (paddle.rand(
+        (2, 3)), NotImplementedError)])
     def test_inverse(self, input, expected):
         with self.assertRaises(expected):
             self._t.inverse(input)
 
-    @param.param_func([(0, TypeError), (paddle.rand((2, 3)),
-                                        NotImplementedError)])
+    @param.param_func([(0, TypeError), (paddle.rand(
+        (2, 3)), NotImplementedError)])
     def test_forward_log_det_jacobian(self, input, expected):
         with self.assertRaises(expected):
             self._t.forward_log_det_jacobian(input)
 
-    @param.param_func([(0, TypeError), (paddle.rand((2, 3)),
-                                        NotImplementedError)])
+    @param.param_func([(0, TypeError), (paddle.rand(
+        (2, 3)), NotImplementedError)])
     def test_inverse_log_det_jacobian(self, input, expected):
         with self.assertRaises(expected):
             self._t.inverse_log_det_jacobian(input)
@@ -87,6 +88,7 @@ def test_inverse_shape(self, shape, expected):
 
 @param.place(config.DEVICES)
 class TestAbsTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.AbsTransform()
 
@@ -107,46 +109,44 @@ def test_codomain(self):
                        (np.array([[1., -1., -0.1], [-3., -0.1, 0]]),
                         np.array([[1., 1., 0.1], [3., 0.1, 0]]))])
     def test_forward(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.forward(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.forward(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array(1.), (-np.array(1.), np.array(1.)))])
     def test_inverse(self, input, expected):
         actual0, actual1 = self._t.inverse(paddle.to_tensor(input))
         expected0, expected1 = expected
-        np.testing.assert_allclose(
-            actual0.numpy(),
-            expected0,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-        np.testing.assert_allclose(
-            actual1.numpy(),
-            expected1,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual0.numpy(),
+                                   expected0,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual1.numpy(),
+                                   expected1,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def test_forward_log_det_jacobian(self):
         with self.assertRaises(NotImplementedError):
             self._t.forward_log_det_jacobian(paddle.rand((10, )))
 
-    @param.param_func([(np.array(1.), (np.array(0.), np.array(0.))), ])
+    @param.param_func([
+        (np.array(1.), (np.array(0.), np.array(0.))),
+    ])
     def test_inverse_log_det_jacobian(self, input, expected):
         actual0, actual1 = self._t.inverse_log_det_jacobian(
             paddle.to_tensor(input))
         expected0, expected1 = expected
-        np.testing.assert_allclose(
-            actual0.numpy(),
-            expected0,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-        np.testing.assert_allclose(
-            actual1.numpy(),
-            expected1,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual0.numpy(),
+                                   expected0,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual1.numpy(),
+                                   expected1,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -163,9 +163,10 @@ def test_inverse_shape(self, shape, expected_shape):
     ('broadcast', np.random.rand(2, 10), np.random.rand(10)),
 ])
 class TestAffineTransform(unittest.TestCase):
+
     def setUp(self):
-        self._t = transform.AffineTransform(
-            paddle.to_tensor(self.loc), paddle.to_tensor(self.scale))
+        self._t = transform.AffineTransform(paddle.to_tensor(self.loc),
+                                            paddle.to_tensor(self.scale))
 
     @param.param_func([
         (paddle.rand([1]), 0, TypeError),
@@ -253,6 +254,7 @@ def test_inverse_shape(self):
 
 @param.place(config.DEVICES)
 class TestExpTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.ExpTransform()
 
@@ -269,35 +271,36 @@ def test_codomain(self):
         self.assertEqual(self._t._codomain.event_rank, 0)
         self.assertEqual(self._t._codomain.is_discrete, False)
 
-    @param.param_func(
-        [(np.array([0., 1., 2., 3.]), np.exp(np.array([0., 1., 2., 3.]))),
-         (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
-          np.exp(np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]])))])
+    @param.param_func([(np.array([0., 1., 2.,
+                                  3.]), np.exp(np.array([0., 1., 2., 3.]))),
+                       (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
+                        np.exp(np.array([[0., 1., 2., 3.], [-5., 6., 7.,
+                                                            8.]])))])
     def test_forward(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.forward(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.forward(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([1., 2., 3.]), np.log(np.array([1., 2., 3.]))),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]),
                         np.log(np.array([[1., 2., 3.], [6., 7., 8.]])))])
     def test_inverse(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.inverse(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.inverse(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([1., 2., 3.]), ),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]), )])
     def test_forward_log_det_jacobian(self, input):
-        np.testing.assert_allclose(
-            self._t.forward_log_det_jacobian(paddle.to_tensor(input)).numpy(),
-            self._np_forward_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.forward_log_det_jacobian(
+            paddle.to_tensor(input)).numpy(),
+                                   self._np_forward_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_forward_jacobian(self, x):
         return x
@@ -305,11 +308,11 @@ def _np_forward_jacobian(self, x):
     @param.param_func([(np.array([1., 2., 3.]), ),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]), )])
     def test_inverse_log_det_jacobian(self, input):
-        np.testing.assert_allclose(
-            self._t.inverse_log_det_jacobian(paddle.to_tensor(input)).numpy(),
-            self._np_inverse_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.inverse_log_det_jacobian(
+            paddle.to_tensor(input)).numpy(),
+                                   self._np_inverse_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_inverse_jacobian(self, y):
         return -self._np_forward_jacobian(np.log(y))
@@ -325,19 +328,21 @@ def test_inverse_shape(self, shape, expected_shape):
 
 @param.place(config.DEVICES)
 class TestChainTransform(unittest.TestCase):
+
     @param.param_func([(paddle.distribution.Transform, TypeError),
                        ([0], TypeError)])
     def test_init_exception(self, transforms, exception):
         with self.assertRaises(exception):
             paddle.distribution.ChainTransform(transforms)
 
-    @param.param_func((
-        (transform.ChainTransform(
-            (transform.AbsTransform(),
-             transform.AffineTransform(paddle.rand([1]), paddle.rand([1])))),
-         False), (transform.ChainTransform((
-             transform.AffineTransform(paddle.rand([1]), paddle.rand([1])),
-             transform.ExpTransform(), )), True)))
+    @param.param_func(((transform.ChainTransform(
+        (transform.AbsTransform(),
+         transform.AffineTransform(paddle.rand([1]), paddle.rand([1])))),
+                        False), (transform.ChainTransform((
+                            transform.AffineTransform(paddle.rand([1]),
+                                                      paddle.rand([1])),
+                            transform.ExpTransform(),
+                        )), True)))
     def test_is_injective(self, chain, expected):
         self.assertEqual(chain._is_injective(), expected)
 
@@ -361,74 +366,83 @@ def test_codomain(self, input, expected):
         self.assertEqual(input._codomain.event_rank, expected.event_rank)
         self.assertEqual(input._codomain.is_discrete, expected.is_discrete)
 
-    @param.param_func(
-        [(transform.ChainTransform((transform.AffineTransform(
-            paddle.to_tensor(0.0), paddle.to_tensor(1.0)),
-                                    transform.ExpTransform())),
-          np.array([0., 1., 2., 3.]), np.exp(np.array([0., 1., 2., 3.]) * 1.0)),
-         (transform.ChainTransform((transform.ExpTransform(),
-                                    transform.TanhTransform())),
-          np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]),
-          np.tanh(np.exp(np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]))))])
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.AffineTransform(paddle.to_tensor(0.0),
+                                       paddle.to_tensor(1.0)),
+             transform.ExpTransform())), np.array([0., 1., 2., 3.]),
+         np.exp(np.array([0., 1., 2., 3.]) * 1.0)),
+        (transform.ChainTransform(
+            (transform.ExpTransform(), transform.TanhTransform())),
+         np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]),
+         np.tanh(np.exp(np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]))))
+    ])
     def test_forward(self, chain, input, expected):
-        np.testing.assert_allclose(
-            chain.forward(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(chain.forward(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
-    @param.param_func(
-        [(transform.ChainTransform(
-            (transform.AffineTransform(
-                paddle.to_tensor(0.0), paddle.to_tensor(-1.0)),
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.AffineTransform(paddle.to_tensor(0.0),
+                                       paddle.to_tensor(-1.0)),
              transform.ExpTransform())), np.array([0., 1., 2., 3.]),
-          np.log(np.array([0., 1., 2., 3.])) / (-1.0)),
-         (transform.ChainTransform((transform.ExpTransform(),
-                                    transform.TanhTransform())),
-          np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]),
-          np.log(np.arctanh(np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]))))])
+         np.log(np.array([0., 1., 2., 3.])) / (-1.0)),
+        (transform.ChainTransform(
+            (transform.ExpTransform(), transform.TanhTransform())),
+         np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]),
+         np.log(np.arctanh(np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]))))
+    ])
     def test_inverse(self, chain, input, expected):
-        np.testing.assert_allclose(
-            chain.inverse(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(chain.inverse(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([
         (transform.ChainTransform(
-            (transform.AffineTransform(
-                paddle.to_tensor(0.0), paddle.to_tensor(-1.0)),
+            (transform.AffineTransform(paddle.to_tensor(0.0),
+                                       paddle.to_tensor(-1.0)),
              transform.PowerTransform(paddle.to_tensor(2.0)))),
          np.array([1., 2., 3.]), np.log(2. * np.array([1., 2., 3.]))),
     ])
     def test_forward_log_det_jacobian(self, chain, input, expected):
-        np.testing.assert_allclose(
-            chain.forward_log_det_jacobian(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-
-    @param.param_func([(transform.ChainTransform((transform.AffineTransform(
-        paddle.to_tensor(0.0),
-        paddle.to_tensor(-1.0)), transform.ExpTransform())), (2, 3, 5),
-                        (2, 3, 5)), ])
+        np.testing.assert_allclose(chain.forward_log_det_jacobian(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.AffineTransform(paddle.to_tensor(0.0),
+                                       paddle.to_tensor(-1.0)),
+             transform.ExpTransform())), (2, 3, 5), (2, 3, 5)),
+    ])
     def test_forward_shape(self, chain, shape, expected_shape):
         self.assertEqual(chain.forward_shape(shape), expected_shape)
 
-    @param.param_func([(transform.ChainTransform((transform.AffineTransform(
-        paddle.to_tensor(0.0),
-        paddle.to_tensor(-1.0)), transform.ExpTransform())), (2, 3, 5),
-                        (2, 3, 5)), ])
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.AffineTransform(paddle.to_tensor(0.0),
+                                       paddle.to_tensor(-1.0)),
+             transform.ExpTransform())), (2, 3, 5), (2, 3, 5)),
+    ])
     def test_inverse_shape(self, chain, shape, expected_shape):
         self.assertEqual(chain.inverse_shape(shape), expected_shape)
 
 
 @param.place(config.DEVICES)
 @param.param_cls(
-    (param.TEST_CASE_NAME, 'base', 'reinterpreted_batch_rank', 'x'),
-    [('rank-over-zero', transform.ExpTransform(), 2, np.random.rand(2, 3, 3)),
-     ])
+    (param.TEST_CASE_NAME, 'base', 'reinterpreted_batch_rank', 'x'), [
+        ('rank-over-zero', transform.ExpTransform(), 2, np.random.rand(2, 3,
+                                                                       3)),
+    ])
 class TestIndependentTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.IndependentTransform(self.base,
                                                  self.reinterpreted_batch_rank)
@@ -474,16 +488,14 @@ def test_inverse(self):
 
     def test_forward_log_det_jacobian(self):
         actual = self._t.forward_log_det_jacobian(paddle.to_tensor(self.x))
-        self.assertEqual(
-            tuple(actual.shape), self.x.shape[:-self.reinterpreted_batch_rank])
-        expected = self.base.forward_log_det_jacobian(
-            paddle.to_tensor(self.x)).sum(
-                list(range(-self.reinterpreted_batch_rank, 0)))
-        np.testing.assert_allclose(
-            actual.numpy(),
-            expected.numpy(),
-            rtol=config.RTOL.get(str(self.x.dtype)),
-            atol=config.ATOL.get(str(self.x.dtype)))
+        self.assertEqual(tuple(actual.shape),
+                         self.x.shape[:-self.reinterpreted_batch_rank])
+        expected = self.base.forward_log_det_jacobian(paddle.to_tensor(
+            self.x)).sum(list(range(-self.reinterpreted_batch_rank, 0)))
+        np.testing.assert_allclose(actual.numpy(),
+                                   expected.numpy(),
+                                   rtol=config.RTOL.get(str(self.x.dtype)),
+                                   atol=config.ATOL.get(str(self.x.dtype)))
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -496,6 +508,7 @@ def test_inverse_shape(self, shape, expected_shape):
 
 @param.place(config.DEVICES)
 class TestPowerTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.PowerTransform(paddle.to_tensor(2.))
 
@@ -516,35 +529,34 @@ def test_codomain(self):
         self.assertEqual(self._t._codomain.event_rank, 0)
         self.assertEqual(self._t._codomain.is_discrete, False)
 
-    @param.param_func([(np.array([2.]), np.array([0., -1., 2.]), np.power(
-        np.array([0., -1., 2.]),
-        2.)), (np.array([[0.], [3.]]), np.array([[1., 0.], [5., 6.]]), np.power(
-            np.array([[1., 0.], [5., 6.]]), np.array([[0.], [3.]])))])
+    @param.param_func([(np.array([2.]), np.array([0., -1., 2.]),
+                        np.power(np.array([0., -1., 2.]), 2.)),
+                       (np.array([[0.], [3.]]), np.array([[1., 0.], [5., 6.]]),
+                        np.power(np.array([[1., 0.], [5., 6.]]),
+                                 np.array([[0.], [3.]])))])
     def test_forward(self, power, x, y):
         t = transform.PowerTransform(paddle.to_tensor(power))
-        np.testing.assert_allclose(
-            t.forward(paddle.to_tensor(x)).numpy(),
-            y,
-            rtol=config.RTOL.get(str(x.dtype)),
-            atol=config.ATOL.get(str(x.dtype)))
+        np.testing.assert_allclose(t.forward(paddle.to_tensor(x)).numpy(),
+                                   y,
+                                   rtol=config.RTOL.get(str(x.dtype)),
+                                   atol=config.ATOL.get(str(x.dtype)))
 
     @param.param_func([(np.array([2.]), np.array([4.]), np.array([2.]))])
     def test_inverse(self, power, y, x):
         t = transform.PowerTransform(paddle.to_tensor(power))
-        np.testing.assert_allclose(
-            t.inverse(paddle.to_tensor(y)).numpy(),
-            x,
-            rtol=config.RTOL.get(str(x.dtype)),
-            atol=config.ATOL.get(str(x.dtype)))
+        np.testing.assert_allclose(t.inverse(paddle.to_tensor(y)).numpy(),
+                                   x,
+                                   rtol=config.RTOL.get(str(x.dtype)),
+                                   atol=config.ATOL.get(str(x.dtype)))
 
     @param.param_func(((np.array([2.]), np.array([3., 1.4, 0.8])), ))
     def test_forward_log_det_jacobian(self, power, x):
         t = transform.PowerTransform(paddle.to_tensor(power))
-        np.testing.assert_allclose(
-            t.forward_log_det_jacobian(paddle.to_tensor(x)).numpy(),
-            self._np_forward_jacobian(power, x),
-            rtol=config.RTOL.get(str(x.dtype)),
-            atol=config.ATOL.get(str(x.dtype)))
+        np.testing.assert_allclose(t.forward_log_det_jacobian(
+            paddle.to_tensor(x)).numpy(),
+                                   self._np_forward_jacobian(power, x),
+                                   rtol=config.RTOL.get(str(x.dtype)),
+                                   atol=config.ATOL.get(str(x.dtype)))
 
     def _np_forward_jacobian(self, alpha, x):
         return np.abs(np.log(alpha * np.power(x, alpha - 1)))
@@ -560,6 +572,7 @@ def test_inverse_shape(self, shape, expected_shape):
 
 @param.place(config.DEVICES)
 class TestTanhTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.TanhTransform()
 
@@ -578,36 +591,37 @@ def test_codomain(self):
         self.assertEqual(self._t._codomain._constraint._lower, -1)
         self.assertEqual(self._t._codomain._constraint._upper, 1)
 
-    @param.param_func(
-        [(np.array([0., 1., 2., 3.]), np.tanh(np.array([0., 1., 2., 3.]))),
-         (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
-          np.tanh(np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]])))])
+    @param.param_func([(np.array([0., 1., 2.,
+                                  3.]), np.tanh(np.array([0., 1., 2., 3.]))),
+                       (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
+                        np.tanh(np.array([[0., 1., 2., 3.], [-5., 6., 7.,
+                                                             8.]])))])
     def test_forward(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.forward(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-
-    @param.param_func(
-        [(np.array([1., 2., 3.]), np.arctanh(np.array([1., 2., 3.]))),
-         (np.array([[1., 2., 3.], [6., 7., 8.]]),
-          np.arctanh(np.array([[1., 2., 3.], [6., 7., 8.]])))])
+        np.testing.assert_allclose(self._t.forward(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+
+    @param.param_func([(np.array([1., 2.,
+                                  3.]), np.arctanh(np.array([1., 2., 3.]))),
+                       (np.array([[1., 2., 3.], [6., 7., 8.]]),
+                        np.arctanh(np.array([[1., 2., 3.], [6., 7., 8.]])))])
     def test_inverse(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.inverse(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.inverse(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([1., 2., 3.]), ),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]), )])
     def test_forward_log_det_jacobian(self, input):
-        np.testing.assert_allclose(
-            self._t.forward_log_det_jacobian(paddle.to_tensor(input)).numpy(),
-            self._np_forward_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.forward_log_det_jacobian(
+            paddle.to_tensor(input)).numpy(),
+                                   self._np_forward_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_forward_jacobian(self, x):
         return 2. * (np.log(2.) - x - self._np_softplus(-2. * x))
@@ -623,11 +637,11 @@ def _np_inverse_jacobian(self, y):
     @param.param_func([(np.array([1., 2., 3.]), ),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]), )])
     def test_inverse_log_det_jacobian(self, input):
-        np.testing.assert_allclose(
-            self._t.inverse_log_det_jacobian(paddle.to_tensor(input)).numpy(),
-            self._np_inverse_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.inverse_log_det_jacobian(
+            paddle.to_tensor(input)).numpy(),
+                                   self._np_inverse_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -643,6 +657,7 @@ def test_inverse_shape(self, shape, expected_shape):
     ('regular_shape', (2, 3), (3, 2)),
 ])
 class TestReshapeTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.ReshapeTransform(self.in_event_shape,
                                              self.out_event_shape)
@@ -664,27 +679,24 @@ def test_codomain(self):
 
     def test_forward(self):
         x = paddle.ones(self.in_event_shape)
-        np.testing.assert_allclose(
-            self._t.forward(x),
-            paddle.ones(self.out_event_shape),
-            rtol=config.RTOL.get(str(x.numpy().dtype)),
-            atol=config.ATOL.get(str(x.numpy().dtype)))
+        np.testing.assert_allclose(self._t.forward(x),
+                                   paddle.ones(self.out_event_shape),
+                                   rtol=config.RTOL.get(str(x.numpy().dtype)),
+                                   atol=config.ATOL.get(str(x.numpy().dtype)))
 
     def test_inverse(self):
         x = paddle.ones(self.out_event_shape)
-        np.testing.assert_allclose(
-            self._t.inverse(x).numpy(),
-            paddle.ones(self.in_event_shape).numpy(),
-            rtol=config.RTOL.get(str(x.numpy().dtype)),
-            atol=config.ATOL.get(str(x.numpy().dtype)))
+        np.testing.assert_allclose(self._t.inverse(x).numpy(),
+                                   paddle.ones(self.in_event_shape).numpy(),
+                                   rtol=config.RTOL.get(str(x.numpy().dtype)),
+                                   atol=config.ATOL.get(str(x.numpy().dtype)))
 
     def test_forward_log_det_jacobian(self):
         x = paddle.ones(self.in_event_shape)
-        np.testing.assert_allclose(
-            self._t.forward_log_det_jacobian(x).numpy(),
-            paddle.zeros([1]).numpy(),
-            rtol=config.RTOL.get(str(x.numpy().dtype)),
-            atol=config.ATOL.get(str(x.numpy().dtype)))
+        np.testing.assert_allclose(self._t.forward_log_det_jacobian(x).numpy(),
+                                   paddle.zeros([1]).numpy(),
+                                   rtol=config.RTOL.get(str(x.numpy().dtype)),
+                                   atol=config.ATOL.get(str(x.numpy().dtype)))
 
     def test_in_event_shape(self):
         self.assertEqual(self._t.in_event_shape, self.in_event_shape)
@@ -710,6 +722,7 @@ def _np_softplus(x, beta=1., threshold=20.):
 
 
 class TestSigmoidTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.SigmoidTransform()
 
@@ -722,33 +735,32 @@ def test_domain(self):
     def test_codomain(self):
         self.assertTrue(isinstance(self._t._codomain, variable.Variable))
 
-    @param.param_func(((np.ones((5, 10)),
-                        1 / (1 + np.exp(-np.ones((5, 10))))), ))
+    @param.param_func(((np.ones(
+        (5, 10)), 1 / (1 + np.exp(-np.ones((5, 10))))), ))
     def test_forward(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.forward(paddle.to_tensor(input)),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.forward(paddle.to_tensor(input)),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
-    @param.param_func((
-        (np.ones(10), np.log(np.ones(10)) - np.log1p(-np.ones(10))), ))
+    @param.param_func(
+        ((np.ones(10), np.log(np.ones(10)) - np.log1p(-np.ones(10))), ))
     def test_inverse(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.inverse(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-
-    @param.param_func((
-        (np.ones(10),
-         -_np_softplus(-np.ones(10)) - _np_softplus(np.ones(10))), ))
+        np.testing.assert_allclose(self._t.inverse(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+
+    @param.param_func(
+        ((np.ones(10),
+          -_np_softplus(-np.ones(10)) - _np_softplus(np.ones(10))), ))
     def test_forward_log_det_jacobian(self, input, expected):
-        np.testing.assert_allclose(
-            self._t.forward_log_det_jacobian(paddle.to_tensor(input)).numpy(),
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.forward_log_det_jacobian(
+            paddle.to_tensor(input)).numpy(),
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -760,6 +772,7 @@ def test_inverse_shape(self, shape, expected_shape):
 
 
 class TestSoftmaxTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.SoftmaxTransform()
 
@@ -774,19 +787,17 @@ def test_codomain(self):
 
     @param.param_func(((np.random.random((5, 10)), ), ))
     def test_forward(self, input):
-        np.testing.assert_allclose(
-            self._t.forward(paddle.to_tensor(input)),
-            self._np_forward(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.forward(paddle.to_tensor(input)),
+                                   self._np_forward(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func(((np.random.random(10), ), ))
     def test_inverse(self, input):
-        np.testing.assert_allclose(
-            self._t.inverse(paddle.to_tensor(input)),
-            self._np_inverse(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.inverse(paddle.to_tensor(input)),
+                                   self._np_inverse(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_forward(self, x):
         x = np.exp(x - np.max(x, -1, keepdims=True)[0])
@@ -819,6 +830,7 @@ def test_inverse_shape(self, shape, expected_shape):
 
 
 class TestStickBreakingTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.StickBreakingTransform()
 
@@ -833,11 +845,11 @@ def test_codomain(self):
 
     @param.param_func(((np.random.random((10)), ), ))
     def test_forward(self, input):
-        np.testing.assert_allclose(
-            self._t.inverse(self._t.forward(paddle.to_tensor(input))),
-            input,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(self._t.inverse(
+            self._t.forward(paddle.to_tensor(input))),
+                                   input,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([((2, 3, 5), (2, 3, 6))])
     def test_forward_shape(self, shape, expected_shape):
@@ -859,6 +871,7 @@ def test_forward_log_det_jacobian(self, x):
     ('simple_one_transform', [transform.ExpTransform()], 0),
 ])
 class TestStackTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.StackTransform(self.transforms, self.axis)
 
@@ -874,22 +887,22 @@ def test_codomain(self):
     @param.param_func([(np.array([[0., 1., 2., 3.]]), ),
                        (np.array([[-5., 6., 7., 8.]]), )])
     def test_forward(self, input):
-        self.assertEqual(
-            tuple(self._t.forward(paddle.to_tensor(input)).shape), input.shape)
+        self.assertEqual(tuple(self._t.forward(paddle.to_tensor(input)).shape),
+                         input.shape)
 
     @param.param_func([(np.array([[1., 2., 3.]]), ),
                        (np.array([[6., 7., 8.]], ), )])
     def test_inverse(self, input):
-        self.assertEqual(
-            tuple(self._t.inverse(paddle.to_tensor(input)).shape), input.shape)
+        self.assertEqual(tuple(self._t.inverse(paddle.to_tensor(input)).shape),
+                         input.shape)
 
-    @param.param_func([(np.array([[1., 2., 3.]]), ),
-                       (np.array([[6., 7., 8.]]), )])
+    @param.param_func([(np.array([[1., 2., 3.]]), ), (np.array([[6., 7.,
+                                                                 8.]]), )])
     def test_forward_log_det_jacobian(self, input):
         self.assertEqual(
             tuple(
-                self._t.forward_log_det_jacobian(paddle.to_tensor(input))
-                .shape), input.shape)
+                self._t.forward_log_det_jacobian(
+                    paddle.to_tensor(input)).shape), input.shape)
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -902,9 +915,9 @@ def test_inverse_shape(self, shape, expected_shape):
     def test_axis(self):
         self.assertEqual(self._t.axis, self.axis)
 
-    @param.param_func(
-        [(0, 0, TypeError), ([0], 0, TypeError),
-         ([paddle.distribution.ExpTransform()], 'axis', TypeError)])
+    @param.param_func([(0, 0, TypeError), ([0], 0, TypeError),
+                       ([paddle.distribution.ExpTransform()], 'axis', TypeError)
+                       ])
     def test_init_exception(self, transforms, axis, exc):
         with self.assertRaises(exc):
             paddle.distribution.StackTransform(transforms, axis)
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py
index fa5742fb26103..00a1f409dad52 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py
@@ -26,12 +26,14 @@
 
 @param.place(config.DEVICES)
 class TestTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.Transform()
 
-    @param.param_func(
-        [(transform.Type.BIJECTION, True), (transform.Type.INJECTION, True),
-         (transform.Type.SURJECTION, False), (transform.Type.OTHER, False)])
+    @param.param_func([(transform.Type.BIJECTION, True),
+                       (transform.Type.INJECTION, True),
+                       (transform.Type.SURJECTION, False),
+                       (transform.Type.OTHER, False)])
     def test_is_injective(self, type, expected):
         transform.Transform._type = type
         self.assertEqual(self._t._is_injective(), expected)
@@ -42,8 +44,8 @@ def test_domain(self):
     def test_codomain(self):
         self.assertTrue(isinstance(self._t._codomain, variable.Real))
 
-    @param.param_func([(np.array(0), NotImplementedError), (np.random.random(
-        (2, 3)), NotImplementedError)])
+    @param.param_func([(np.array(0), NotImplementedError),
+                       (np.random.random((2, 3)), NotImplementedError)])
     def test_forward(self, input, expected):
         with self.assertRaises(expected):
             exe = paddle.static.Executor()
@@ -57,8 +59,8 @@ def test_forward(self, input, expected):
             exe.run(sp)
             exe.run(mp, feed={'input': input}, fetch_list=[output])
 
-    @param.param_func([(np.array(0), NotImplementedError), (np.random.random(
-        (2, 3)), NotImplementedError)])
+    @param.param_func([(np.array(0), NotImplementedError),
+                       (np.random.random((2, 3)), NotImplementedError)])
     def test_inverse(self, input, expected):
         with self.assertRaises(expected):
             exe = paddle.static.Executor()
@@ -72,8 +74,8 @@ def test_inverse(self, input, expected):
             exe.run(sp)
             exe.run(mp, feed={'input': input}, fetch_list=[output])
 
-    @param.param_func([(np.array(0), NotImplementedError), (paddle.rand(
-        (2, 3)), NotImplementedError)])
+    @param.param_func([(np.array(0), NotImplementedError),
+                       (paddle.rand((2, 3)), NotImplementedError)])
     def test_forward_log_det_jacobian(self, input, expected):
         with self.assertRaises(expected):
             exe = paddle.static.Executor()
@@ -87,8 +89,8 @@ def test_forward_log_det_jacobian(self, input, expected):
             exe.run(sp)
             exe.run(mp, feed={'input': input}, fetch_list=[output])
 
-    @param.param_func([(np.array(0), NotImplementedError), (paddle.rand(
-        (2, 3)), NotImplementedError)])
+    @param.param_func([(np.array(0), NotImplementedError),
+                       (paddle.rand((2, 3)), NotImplementedError)])
     def test_inverse_log_det_jacobian(self, input, expected):
         with self.assertRaises(expected):
             exe = paddle.static.Executor()
@@ -115,6 +117,7 @@ def test_inverse_shape(self, shape, expected):
 
 @param.place(config.DEVICES)
 class TestAbsTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.AbsTransform()
 
@@ -144,11 +147,10 @@ def test_forward(self, input, expected):
             output = t.forward(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([1.]), (-np.array([1.]), np.array([1.])))])
     def test_inverse(self, input, expected):
@@ -164,16 +166,14 @@ def test_inverse(self, input, expected):
                                      feed={'input': input},
                                      fetch_list=[actual0, actual1])
         expected0, expected1 = expected
-        np.testing.assert_allclose(
-            actual0,
-            expected0,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-        np.testing.assert_allclose(
-            actual1,
-            expected1,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual0,
+                                   expected0,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual1,
+                                   expected1,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def test_forward_log_det_jacobian(self):
         input = np.random.random((10, ))
@@ -189,7 +189,9 @@ def test_forward_log_det_jacobian(self):
             exe.run(sp)
             [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
 
-    @param.param_func([(np.array([1.]), (np.array([0.]), np.array([0.]))), ])
+    @param.param_func([
+        (np.array([1.]), (np.array([0.]), np.array([0.]))),
+    ])
     def test_inverse_log_det_jacobian(self, input, expected):
         exe = paddle.static.Executor()
         sp = paddle.static.Program()
@@ -203,16 +205,14 @@ def test_inverse_log_det_jacobian(self, input, expected):
                                      feed={'input': input},
                                      fetch_list=[actual0, actual1])
         expected0, expected1 = expected
-        np.testing.assert_allclose(
-            actual0,
-            expected0,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-        np.testing.assert_allclose(
-            actual1,
-            expected1,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual0,
+                                   expected0,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(actual1,
+                                   expected1,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -229,6 +229,7 @@ def test_inverse_shape(self, shape, expected_shape):
     ('broadcast', np.random.rand(2, 10), np.random.rand(10)),
 ])
 class TestAffineTransform(unittest.TestCase):
+
     def setUp(self):
         sp = paddle.static.Program()
         mp = paddle.static.Program()
@@ -265,17 +266,17 @@ def test_forward(self):
                                               self.loc.dtype)
             output = t.forward(static_input)
         exe.run(sp)
-        [output] = exe.run(
-            mp,
-            feed={'input': input,
-                  'loc': self.loc,
-                  'scale': self.scale},
-            fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_forward(input),
-            rtol=config.RTOL.get(str(self.loc.dtype)),
-            atol=config.ATOL.get(str(self.loc.dtype)))
+        [output] = exe.run(mp,
+                           feed={
+                               'input': input,
+                               'loc': self.loc,
+                               'scale': self.scale
+                           },
+                           fetch_list=[output])
+        np.testing.assert_allclose(output,
+                                   self._np_forward(input),
+                                   rtol=config.RTOL.get(str(self.loc.dtype)),
+                                   atol=config.ATOL.get(str(self.loc.dtype)))
 
     def test_inverse(self):
         input = np.random.random(self.loc.shape)
@@ -291,17 +292,17 @@ def test_inverse(self):
                                               self.loc.dtype)
             output = t.inverse(static_input)
         exe.run(sp)
-        [output] = exe.run(
-            mp,
-            feed={'input': input,
-                  'loc': self.loc,
-                  'scale': self.scale},
-            fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_inverse(input),
-            rtol=config.RTOL.get(str(self.loc.dtype)),
-            atol=config.ATOL.get(str(self.loc.dtype)))
+        [output] = exe.run(mp,
+                           feed={
+                               'input': input,
+                               'loc': self.loc,
+                               'scale': self.scale
+                           },
+                           fetch_list=[output])
+        np.testing.assert_allclose(output,
+                                   self._np_inverse(input),
+                                   rtol=config.RTOL.get(str(self.loc.dtype)),
+                                   atol=config.ATOL.get(str(self.loc.dtype)))
 
     def _np_forward(self, x):
         return self.loc + self.scale * x
@@ -328,17 +329,17 @@ def test_inverse_log_det_jacobian(self):
             static_input = paddle.static.data('input', input.shape, input.dtype)
             output = t.inverse_log_det_jacobian(static_input)
         exe.run(sp)
-        [output] = exe.run(
-            mp,
-            feed={'input': input,
-                  'loc': self.loc,
-                  'scale': self.scale},
-            fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_inverse_jacobian(input),
-            rtol=config.RTOL.get(str(self.loc.dtype)),
-            atol=config.ATOL.get(str(self.loc.dtype)))
+        [output] = exe.run(mp,
+                           feed={
+                               'input': input,
+                               'loc': self.loc,
+                               'scale': self.scale
+                           },
+                           fetch_list=[output])
+        np.testing.assert_allclose(output,
+                                   self._np_inverse_jacobian(input),
+                                   rtol=config.RTOL.get(str(self.loc.dtype)),
+                                   atol=config.ATOL.get(str(self.loc.dtype)))
 
     def test_forward_log_det_jacobian(self):
         input = np.random.random(self.scale.shape)
@@ -353,17 +354,17 @@ def test_forward_log_det_jacobian(self):
             static_input = paddle.static.data('input', input.shape, input.dtype)
             output = t.forward_log_det_jacobian(static_input)
         exe.run(sp)
-        [output] = exe.run(
-            mp,
-            feed={'input': input,
-                  'loc': self.loc,
-                  'scale': self.scale},
-            fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_forward_jacobian(input),
-            rtol=config.RTOL.get(str(self.loc.dtype)),
-            atol=config.ATOL.get(str(self.loc.dtype)))
+        [output] = exe.run(mp,
+                           feed={
+                               'input': input,
+                               'loc': self.loc,
+                               'scale': self.scale
+                           },
+                           fetch_list=[output])
+        np.testing.assert_allclose(output,
+                                   self._np_forward_jacobian(input),
+                                   rtol=config.RTOL.get(str(self.loc.dtype)),
+                                   atol=config.ATOL.get(str(self.loc.dtype)))
 
     def test_forward_shape(self):
         shape = self.loc.shape
@@ -380,6 +381,7 @@ def test_inverse_shape(self):
 
 @param.place(config.DEVICES)
 class TestExpTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.ExpTransform()
 
@@ -396,10 +398,11 @@ def test_codomain(self):
         self.assertEqual(self._t._codomain.event_rank, 0)
         self.assertEqual(self._t._codomain.is_discrete, False)
 
-    @param.param_func(
-        [(np.array([0., 1., 2., 3.]), np.exp(np.array([0., 1., 2., 3.]))),
-         (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
-          np.exp(np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]])))])
+    @param.param_func([(np.array([0., 1., 2.,
+                                  3.]), np.exp(np.array([0., 1., 2., 3.]))),
+                       (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
+                        np.exp(np.array([[0., 1., 2., 3.], [-5., 6., 7.,
+                                                            8.]])))])
     def test_forward(self, input, expected):
         exe = paddle.static.Executor()
         sp = paddle.static.Program()
@@ -410,11 +413,10 @@ def test_forward(self, input, expected):
             output = t.forward(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([1., 2., 3.]), np.log(np.array([1., 2., 3.]))),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]),
@@ -429,11 +431,10 @@ def test_inverse(self, input, expected):
             output = t.inverse(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([1., 2., 3.]), ),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]), )])
@@ -447,11 +448,10 @@ def test_forward_log_det_jacobian(self, input):
             output = t.forward_log_det_jacobian(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_forward_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   self._np_forward_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_forward_jacobian(self, x):
         return x
@@ -468,11 +468,10 @@ def test_inverse_log_det_jacobian(self, input):
             output = t.inverse_log_det_jacobian(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_inverse_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   self._np_inverse_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_inverse_jacobian(self, y):
         return -self._np_forward_jacobian(np.log(y))
@@ -488,13 +487,15 @@ def test_inverse_shape(self, shape, expected_shape):
 
 @param.place(config.DEVICES)
 class TestChainTransform(unittest.TestCase):
-    @param.param_func((
-        (transform.ChainTransform(
-            (transform.AbsTransform(),
-             transform.AffineTransform(paddle.rand([1]), paddle.rand([1])))),
-         False), (transform.ChainTransform((
-             transform.AffineTransform(paddle.rand([1]), paddle.rand([1])),
-             transform.ExpTransform(), )), True)))
+
+    @param.param_func(((transform.ChainTransform(
+        (transform.AbsTransform(),
+         transform.AffineTransform(paddle.rand([1]), paddle.rand([1])))),
+                        False), (transform.ChainTransform((
+                            transform.AffineTransform(paddle.rand([1]),
+                                                      paddle.rand([1])),
+                            transform.ExpTransform(),
+                        )), True)))
     def test_is_injective(self, chain, expected):
         self.assertEqual(chain._is_injective(), expected)
 
@@ -518,11 +519,12 @@ def test_codomain(self, input, expected):
         self.assertEqual(input._codomain.event_rank, expected.event_rank)
         self.assertEqual(input._codomain.is_discrete, expected.is_discrete)
 
-    @param.param_func(
-        [(transform.ChainTransform((transform.ExpTransform(),
-                                    transform.TanhTransform())),
-          np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]),
-          np.tanh(np.exp(np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]))))])
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.ExpTransform(), transform.TanhTransform())),
+         np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]),
+         np.tanh(np.exp(np.array([[0., -1., 2., -3.], [-5., 6., 7., -8.]]))))
+    ])
     def test_forward(self, chain, input, expected):
         exe = paddle.static.Executor()
         sp = paddle.static.Program()
@@ -533,17 +535,17 @@ def test_forward(self, chain, input, expected):
             output = t.forward(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-
-    @param.param_func(
-        [(transform.ChainTransform((transform.ExpTransform(),
-                                    transform.TanhTransform())),
-          np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]),
-          np.log(np.arctanh(np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]))))])
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.ExpTransform(), transform.TanhTransform())),
+         np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]),
+         np.log(np.arctanh(np.array([[0., 1., 2., 3.], [5., 6., 7., 8.]]))))
+    ])
     def test_inverse(self, chain, input, expected):
         exe = paddle.static.Executor()
         sp = paddle.static.Program()
@@ -554,33 +556,38 @@ def test_inverse(self, chain, input, expected):
             output = t.inverse(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-
-    @param.param_func([(transform.ChainTransform((transform.AffineTransform(
-        paddle.full([1], 0.0),
-        paddle.full([1], -1.0)), transform.ExpTransform())), (2, 3, 5),
-                        (2, 3, 5)), ])
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.AffineTransform(paddle.full([1], 0.0),
+                                       paddle.full([1], -1.0)),
+             transform.ExpTransform())), (2, 3, 5), (2, 3, 5)),
+    ])
     def test_forward_shape(self, chain, shape, expected_shape):
         self.assertEqual(chain.forward_shape(shape), expected_shape)
 
-    @param.param_func([(transform.ChainTransform((transform.AffineTransform(
-        paddle.full([1], 0.0),
-        paddle.full([1], -1.0)), transform.ExpTransform())), (2, 3, 5),
-                        (2, 3, 5)), ])
+    @param.param_func([
+        (transform.ChainTransform(
+            (transform.AffineTransform(paddle.full([1], 0.0),
+                                       paddle.full([1], -1.0)),
+             transform.ExpTransform())), (2, 3, 5), (2, 3, 5)),
+    ])
     def test_inverse_shape(self, chain, shape, expected_shape):
-        self.assertEqual(chain.forward_shape(shape), expected_shape)
+        self.assertEqual(chain.inverse_shape(shape), expected_shape)
 
 
 @param.place(config.DEVICES)
 @param.param_cls(
-    (param.TEST_CASE_NAME, 'base', 'reinterpreted_batch_rank', 'x'),
-    [('rank-over-zero', transform.ExpTransform(), 2, np.random.rand(2, 3, 3)),
-     ])
+    (param.TEST_CASE_NAME, 'base', 'reinterpreted_batch_rank', 'x'), [
+        ('rank-over-zero', transform.ExpTransform(), 2, np.random.rand(2, 3,
+                                                                       3)),
+    ])
 class TestIndependentTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.IndependentTransform(self.base,
                                                  self.reinterpreted_batch_rank)
@@ -619,11 +626,10 @@ def test_forward(self):
         [output, expected] = exe.run(mp,
                                      feed={'input': self.x},
                                      fetch_list=[output, expected])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(self.x.dtype)),
-            atol=config.ATOL.get(str(self.x.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(self.x.dtype)),
+                                   atol=config.ATOL.get(str(self.x.dtype)))
 
     def test_inverse(self):
         exe = paddle.static.Executor()
@@ -640,11 +646,10 @@ def test_inverse(self):
         [output, expected] = exe.run(mp,
                                      feed={'input': self.x},
                                      fetch_list=[output, expected])
-        np.testing.assert_allclose(
-            expected,
-            output,
-            rtol=config.RTOL.get(str(self.x.dtype)),
-            atol=config.ATOL.get(str(self.x.dtype)))
+        np.testing.assert_allclose(expected,
+                                   output,
+                                   rtol=config.RTOL.get(str(self.x.dtype)),
+                                   atol=config.ATOL.get(str(self.x.dtype)))
 
     def test_forward_log_det_jacobian(self):
         exe = paddle.static.Executor()
@@ -657,19 +662,18 @@ def test_forward_log_det_jacobian(self):
                                               self.x.dtype)
             output = t.forward_log_det_jacobian(static_input)
             expected = self.base.forward_log_det_jacobian(
-                static_input.sum(
-                    list(range(-self.reinterpreted_batch_rank, 0))))
+                static_input.sum(list(range(-self.reinterpreted_batch_rank,
+                                            0))))
         exe.run(sp)
         [actual, expected] = exe.run(mp,
                                      feed={'input': self.x},
                                      fetch_list=[output, expected])
-        self.assertEqual(
-            tuple(actual.shape), self.x.shape[:-self.reinterpreted_batch_rank])
-        np.testing.assert_allclose(
-            actual,
-            expected,
-            rtol=config.RTOL.get(str(self.x.dtype)),
-            atol=config.ATOL.get(str(self.x.dtype)))
+        self.assertEqual(tuple(actual.shape),
+                         self.x.shape[:-self.reinterpreted_batch_rank])
+        np.testing.assert_allclose(actual,
+                                   expected,
+                                   rtol=config.RTOL.get(str(self.x.dtype)),
+                                   atol=config.ATOL.get(str(self.x.dtype)))
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -682,6 +686,7 @@ def test_inverse_shape(self, shape, expected_shape):
 
 @param.place(config.DEVICES)
 class TestPowerTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.PowerTransform(paddle.full([1], 2.))
 
@@ -702,10 +707,11 @@ def test_codomain(self):
         self.assertEqual(self._t._codomain.event_rank, 0)
         self.assertEqual(self._t._codomain.is_discrete, False)
 
-    @param.param_func([(np.array([2.]), np.array([0., -1., 2.]), np.power(
-        np.array([0., -1., 2.]),
-        2.)), (np.array([[0.], [3.]]), np.array([[1., 0.], [5., 6.]]), np.power(
-            np.array([[1., 0.], [5., 6.]]), np.array([[0.], [3.]])))])
+    @param.param_func([(np.array([2.]), np.array([0., -1., 2.]),
+                        np.power(np.array([0., -1., 2.]), 2.)),
+                       (np.array([[0.], [3.]]), np.array([[1., 0.], [5., 6.]]),
+                        np.power(np.array([[1., 0.], [5., 6.]]),
+                                 np.array([[0.], [3.]])))])
     def test_forward(self, power, input, expected):
         exe = paddle.static.Executor()
         sp = paddle.static.Program()
@@ -717,14 +723,15 @@ def test_forward(self, power, input, expected):
             output = t.forward(static_input)
         exe.run(sp)
         [output] = exe.run(mp,
-                           feed={'input': input,
-                                 'power': power},
+                           feed={
+                               'input': input,
+                               'power': power
+                           },
                            fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([2.]), np.array([4.]), np.array([2.]))])
     def test_inverse(self, power, input, expected):
@@ -738,14 +745,15 @@ def test_inverse(self, power, input, expected):
             output = t.inverse(static_input)
         exe.run(sp)
         [output] = exe.run(mp,
-                           feed={'input': input,
-                                 'power': power},
+                           feed={
+                               'input': input,
+                               'power': power
+                           },
                            fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func(((np.array([2.]), np.array([3., 1.4, 0.8])), ))
     def test_forward_log_det_jacobian(self, power, input):
@@ -759,14 +767,15 @@ def test_forward_log_det_jacobian(self, power, input):
             output = t.forward_log_det_jacobian(static_input)
         exe.run(sp)
         [output] = exe.run(mp,
-                           feed={'input': input,
-                                 'power': power},
+                           feed={
+                               'input': input,
+                               'power': power
+                           },
                            fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_forward_jacobian(power, input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   self._np_forward_jacobian(power, input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_forward_jacobian(self, alpha, x):
         return np.abs(np.log(alpha * np.power(x, alpha - 1)))
@@ -782,6 +791,7 @@ def test_inverse_shape(self, shape, expected_shape):
 
 @param.place(config.DEVICES)
 class TestTanhTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.TanhTransform()
 
@@ -800,10 +810,11 @@ def test_codomain(self):
         self.assertEqual(self._t._codomain._constraint._lower, -1)
         self.assertEqual(self._t._codomain._constraint._upper, 1)
 
-    @param.param_func(
-        [(np.array([0., 1., 2., 3.]), np.tanh(np.array([0., 1., 2., 3.]))),
-         (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
-          np.tanh(np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]])))])
+    @param.param_func([(np.array([0., 1., 2.,
+                                  3.]), np.tanh(np.array([0., 1., 2., 3.]))),
+                       (np.array([[0., 1., 2., 3.], [-5., 6., 7., 8.]]),
+                        np.tanh(np.array([[0., 1., 2., 3.], [-5., 6., 7.,
+                                                             8.]])))])
     def test_forward(self, input, expected):
         exe = paddle.static.Executor()
         sp = paddle.static.Program()
@@ -814,16 +825,15 @@ def test_forward(self, input, expected):
             output = t.forward(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
-
-    @param.param_func(
-        [(np.array([1., 2., 3.]), np.arctanh(np.array([1., 2., 3.]))),
-         (np.array([[1., 2., 3.], [6., 7., 8.]]),
-          np.arctanh(np.array([[1., 2., 3.], [6., 7., 8.]])))])
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
+
+    @param.param_func([(np.array([1., 2.,
+                                  3.]), np.arctanh(np.array([1., 2., 3.]))),
+                       (np.array([[1., 2., 3.], [6., 7., 8.]]),
+                        np.arctanh(np.array([[1., 2., 3.], [6., 7., 8.]])))])
     def test_inverse(self, input, expected):
         exe = paddle.static.Executor()
         sp = paddle.static.Program()
@@ -834,11 +844,10 @@ def test_inverse(self, input, expected):
             output = t.inverse(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([(np.array([1., 2., 3.]), ),
                        (np.array([[1., 2., 3.], [6., 7., 8.]]), )])
@@ -852,11 +861,10 @@ def test_forward_log_det_jacobian(self, input):
             output = t.forward_log_det_jacobian(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_forward_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   self._np_forward_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     def _np_forward_jacobian(self, x):
         return 2. * (np.log(2.) - x - self._np_softplus(-2. * x))
@@ -881,11 +889,10 @@ def test_inverse_log_det_jacobian(self, input):
             output = t.inverse_log_det_jacobian(static_input)
         exe.run(sp)
         [output] = exe.run(mp, feed={'input': input}, fetch_list=[output])
-        np.testing.assert_allclose(
-            output,
-            self._np_inverse_jacobian(input),
-            rtol=config.RTOL.get(str(input.dtype)),
-            atol=config.ATOL.get(str(input.dtype)))
+        np.testing.assert_allclose(output,
+                                   self._np_inverse_jacobian(input),
+                                   rtol=config.RTOL.get(str(input.dtype)),
+                                   atol=config.ATOL.get(str(input.dtype)))
 
     @param.param_func([((), ()), ((2, 3, 5), (2, 3, 5))])
     def test_forward_shape(self, shape, expected_shape):
@@ -901,6 +908,7 @@ def test_inverse_shape(self, shape, expected_shape):
     ('regular_shape', (2, 3), (3, 2)),
 ])
 class TestReshapeTransform(unittest.TestCase):
+
     def setUp(self):
         self._t = transform.ReshapeTransform(self.in_event_shape,
                                              self.out_event_shape)
@@ -926,11 +934,10 @@ def test_forward(self):
         exe.run(sp)
         [output] = exe.run(mp, feed={}, fetch_list=[output])
         expected = np.ones(self.out_event_shape)
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(expected.dtype)),
-            atol=config.ATOL.get(str(expected.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(expected.dtype)),
+                                   atol=config.ATOL.get(str(expected.dtype)))
 
     def test_inverse(self):
         exe = paddle.static.Executor()
@@ -945,11 +952,10 @@ def test_inverse(self):
         [output] = exe.run(mp, feed={}, fetch_list=[output])
         expected = np.ones(self.in_event_shape)
 
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(expected.dtype)),
-            atol=config.ATOL.get(str(expected.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(expected.dtype)),
+                                   atol=config.ATOL.get(str(expected.dtype)))
 
     def test_forward_log_det_jacobian(self):
         exe = paddle.static.Executor()
@@ -963,11 +969,10 @@ def test_forward_log_det_jacobian(self):
         exe.run(sp)
         [output] = exe.run(mp, feed={}, fetch_list=[output])
         expected = np.zeros([1])
-        np.testing.assert_allclose(
-            output,
-            expected,
-            rtol=config.RTOL.get(str(expected.dtype)),
-            atol=config.ATOL.get(str(expected.dtype)))
+        np.testing.assert_allclose(output,
+                                   expected,
+                                   rtol=config.RTOL.get(str(expected.dtype)),
+                                   atol=config.ATOL.get(str(expected.dtype)))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution.py
index 2f7bb61e38d13..c47250195daab 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution.py
@@ -24,12 +24,13 @@
 
 @param.place(config.DEVICES)
 @param.param_cls((param.TEST_CASE_NAME, 'base', 'transforms'),
-                 [('base_normal', paddle.distribution.Normal(0., 1.),
-                   [paddle.distribution.ExpTransform()])])
+                 [('base_normal', paddle.distribution.Normal(
+                     0., 1.), [paddle.distribution.ExpTransform()])])
 class TestIndependent(unittest.TestCase):
+
     def setUp(self):
-        self._t = paddle.distribution.TransformedDistribution(self.base,
-                                                              self.transforms)
+        self._t = paddle.distribution.TransformedDistribution(
+            self.base, self.transforms)
 
     def _np_sum_rightmost(self, value, n):
         return np.sum(value, tuple(range(-n, 0))) if n > 0 else value
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution_static.py
index f07205a62680a..4e4bcc1f4d402 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transformed_distribution_static.py
@@ -29,6 +29,7 @@
                  [('base_normal', paddle.distribution.Normal,
                    [paddle.distribution.ExpTransform()])])
 class TestIndependent(unittest.TestCase):
+
     def setUp(self):
         value = np.array([0.5])
         loc = np.array([0.])
@@ -54,17 +55,18 @@ def setUp(self):
         [self.actual_log_prob, self.expected_log_prob,
          self.sample_data] = exe.run(
              mp,
-             feed={'value': value,
-                   'loc': loc,
-                   'scale': scale},
+             feed={
+                 'value': value,
+                 'loc': loc,
+                 'scale': scale
+             },
              fetch_list=[actual_log_prob, expected_log_prob, sample_data])
 
     def test_log_prob(self):
-        np.testing.assert_allclose(
-            self.actual_log_prob,
-            self.expected_log_prob,
-            rtol=config.RTOL.get(str(self.dtype)),
-            atol=config.ATOL.get(str(self.dtype)))
+        np.testing.assert_allclose(self.actual_log_prob,
+                                   self.expected_log_prob,
+                                   rtol=config.RTOL.get(str(self.dtype)),
+                                   atol=config.ATOL.get(str(self.dtype)))
 
     def transformed_log_prob(self, value, base, transforms):
         log_prob = 0.0
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py
index d8fe23b9c1bda..3fbb382a2403a 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class UniformNumpy(DistributionNumpy):
+
     def __init__(self, low, high):
         self.low = np.array(low)
         self.high = np.array(high)
@@ -52,6 +53,7 @@ def entropy(self):
 
 
 class UniformTest(unittest.TestCase):
+
     def setUp(self, use_gpu=False, batch_size=5, dims=6):
         self.use_gpu = use_gpu
         if not use_gpu:
@@ -86,8 +88,9 @@ def init_static_data(self, batch_size, dims):
         self.static_low = self.low_np
         self.static_high = self.high_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[],
+                                             dtype='float32')
 
     def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6):
         sample, entropy, log_prob, probs = fetch_list
@@ -99,10 +102,14 @@ def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6):
         np_p = np_uniform.probs(self.values_np)
 
         np.testing.assert_equal(sample.shape, np_sample.shape)
-        np.testing.assert_allclose(
-            entropy, np_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            log_prob, np_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(entropy,
+                                   np_entropy,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(log_prob,
+                                   np_lp,
+                                   rtol=tolerance,
+                                   atol=tolerance)
         np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance)
 
     def test_uniform_distribution_dygraph(self, sample_shape=7, tolerance=1e-6):
@@ -141,6 +148,7 @@ def test_uniform_distribution_static(self, sample_shape=7, tolerance=1e-6):
 
 
 class UniformTest2(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
-        # low ans high are 'int'
+        # low and high are 'int'
         self.low_np = int(np.random.uniform(-2, 1))
@@ -149,6 +157,7 @@ def init_numpy_data(self, batch_size, dims):
 
 
 class UniformTest3(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # test broadcast: low is float, high is numpy.ndarray with dtype 'float32'.
         self.low_np = np.random.uniform(-2, 1)
@@ -160,11 +169,13 @@ def init_static_data(self, batch_size, dims):
         self.static_low = self.low_np
         self.static_high = self.high_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class UniformTest4(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are numpy.ndarray with dtype 'float32'.
         self.low_np = np.random.randn(batch_size, dims).astype('float32')
@@ -176,11 +187,13 @@ def init_static_data(self, batch_size, dims):
         self.static_low = self.low_np
         self.static_high = self.high_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class UniformTest5(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are numpy.ndarray with dtype 'float64'.
         self.low_np = np.random.randn(batch_size, dims).astype('float64')
@@ -197,11 +210,13 @@ def init_static_data(self, batch_size, dims):
         self.static_low = self.low_np
         self.static_high = self.high_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float64')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float64')
 
 
 class UniformTest6(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are Tensor with dtype 'VarType.FP32'.
         self.low_np = np.random.randn(batch_size, dims).astype('float32')
@@ -216,15 +231,19 @@ def init_dynamic_data(self, batch_size, dims):
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.static_low = layers.data(
-                name='low', shape=[dims], dtype='float32')
-            self.static_high = layers.data(
-                name='high', shape=[dims], dtype='float32')
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_low = layers.data(name='low',
+                                          shape=[dims],
+                                          dtype='float32')
+            self.static_high = layers.data(name='high',
+                                           shape=[dims],
+                                           dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class UniformTest7(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are Tensor with dtype 'VarType.FP64'.
         self.low_np = np.random.randn(batch_size, dims).astype('float64')
@@ -239,15 +258,19 @@ def init_dynamic_data(self, batch_size, dims):
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.static_low = layers.data(
-                name='low', shape=[dims], dtype='float64')
-            self.static_high = layers.data(
-                name='high', shape=[dims], dtype='float64')
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float64')
+            self.static_low = layers.data(name='low',
+                                          shape=[dims],
+                                          dtype='float64')
+            self.static_high = layers.data(name='high',
+                                           shape=[dims],
+                                           dtype='float64')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float64')
 
 
 class UniformTest8(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'.
         self.low_np = np.random.randn(batch_size, dims).astype('float64')
@@ -262,15 +285,19 @@ def init_dynamic_data(self, batch_size, dims):
 
     def init_static_data(self, batch_size, dims):
         with fluid.program_guard(self.test_program):
-            self.static_low = layers.data(
-                name='low', shape=[dims], dtype='float64')
-            self.static_high = layers.data(
-                name='high', shape=[dims], dtype='float64')
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_low = layers.data(name='low',
+                                          shape=[dims],
+                                          dtype='float64')
+            self.static_high = layers.data(name='high',
+                                           shape=[dims],
+                                           dtype='float64')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class UniformTest9(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are numpy.ndarray with dtype 'float32'.
         # high < low.
@@ -283,11 +310,13 @@ def init_static_data(self, batch_size, dims):
         self.static_low = self.low_np
         self.static_high = self.high_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class UniformTest10(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are list.
         self.low_np = np.random.randn(batch_size,
@@ -300,29 +329,33 @@ def init_static_data(self, batch_size, dims):
         self.static_low = self.low_np
         self.static_high = self.high_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class UniformTest11(UniformTest):
+
     def init_numpy_data(self, batch_size, dims):
         # low and high are tuple.
         self.low_np = tuple(
             np.random.randn(batch_size, dims).astype('float32').tolist())
         self.high_np = tuple(
-            np.random.uniform(5.0, 15.0, (batch_size, dims)).astype('float32')
-            .tolist())
+            np.random.uniform(5.0, 15.0,
+                              (batch_size, dims)).astype('float32').tolist())
         self.values_np = np.random.randn(batch_size, dims).astype('float32')
 
     def init_static_data(self, batch_size, dims):
         self.static_low = self.low_np
         self.static_high = self.high_np
         with fluid.program_guard(self.test_program):
-            self.static_values = layers.data(
-                name='values', shape=[dims], dtype='float32')
+            self.static_values = layers.data(name='values',
+                                             shape=[dims],
+                                             dtype='float32')
 
 
 class UniformTestSample(unittest.TestCase):
+
     def setUp(self):
         self.init_param()
 
@@ -340,6 +373,7 @@ def test_uniform_sample(self):
 
 
 class UniformTestSample2(UniformTestSample):
+
     def init_param(self):
         self.low = -5.0
         self.high = 2.0
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py
index 6cd50157207fd..94558395e0035 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,6 +27,7 @@
     (param.TEST_CASE_NAME, 'is_discrete', 'event_rank', 'constraint'),
     [('NotImplement', False, 0, constraint.Constraint())])
 class TestVariable(unittest.TestCase):
+
     def setUp(self):
         self._var = variable.Variable(self.is_discrete, self.event_rank,
                                       self.constraint)
@@ -40,10 +41,13 @@ def test_costraint(self, value):
 @param.param_cls((param.TEST_CASE_NAME, 'base', 'rank'),
                  [('real_base', variable.real, 10)])
 class TestIndependent(unittest.TestCase):
+
     def setUp(self):
         self._var = variable.Independent(self.base, self.rank)
 
-    @param.param_func([(paddle.rand([2, 3, 4]), ValueError), ])
+    @param.param_func([
+        (paddle.rand([2, 3, 4]), ValueError),
+    ])
     def test_costraint(self, value, expect):
         with self.assertRaises(expect):
             self._var.constraint(value)
@@ -52,13 +56,16 @@ def test_costraint(self, value, expect):
 @param.param_cls((param.TEST_CASE_NAME, 'vars', 'axis'),
                  [('real_base', [variable.real], 10)])
 class TestStack(unittest.TestCase):
+
     def setUp(self):
         self._var = variable.Stack(self.vars, self.axis)
 
     def test_is_discrete(self):
         self.assertEqual(self._var.is_discrete, False)
 
-    @param.param_func([(paddle.rand([2, 3, 4]), ValueError), ])
+    @param.param_func([
+        (paddle.rand([2, 3, 4]), ValueError),
+    ])
     def test_costraint(self, value, expect):
         with self.assertRaises(expect):
             self._var.constraint(value)
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl.py b/python/paddle/fluid/tests/unittests/distribution/test_kl.py
index 635f5446c8ef2..0a957c540bed7 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_kl.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_kl.py
@@ -30,16 +30,18 @@
 
 @param.place(config.DEVICES)
 @param.parameterize_cls((param.TEST_CASE_NAME, 'a1', 'b1', 'a2', 'b2'), [
-    ('test_regular_input', 6.0 * np.random.random((4, 5)) + 1e-4,
-     6.0 * np.random.random((4, 5)) + 1e-4, 6.0 * np.random.random(
-         (4, 5)) + 1e-4, 6.0 * np.random.random((4, 5)) + 1e-4),
+    ('test_regular_input', 6.0 * np.random.random(
+        (4, 5)) + 1e-4, 6.0 * np.random.random(
+            (4, 5)) + 1e-4, 6.0 * np.random.random(
+                (4, 5)) + 1e-4, 6.0 * np.random.random((4, 5)) + 1e-4),
 ])
 class TestKLBetaBeta(unittest.TestCase):
+
     def setUp(self):
-        self.p = paddle.distribution.Beta(
-            paddle.to_tensor(self.a1), paddle.to_tensor(self.b1))
-        self.q = paddle.distribution.Beta(
-            paddle.to_tensor(self.a2), paddle.to_tensor(self.b2))
+        self.p = paddle.distribution.Beta(paddle.to_tensor(self.a1),
+                                          paddle.to_tensor(self.b1))
+        self.q = paddle.distribution.Beta(paddle.to_tensor(self.a2),
+                                          paddle.to_tensor(self.b2))
 
     def test_kl_divergence(self):
         with paddle.fluid.dygraph.guard(self.place):
@@ -58,10 +60,11 @@ def scipy_kl_beta_beta(self, a1, b1, a2, b2):
 
 @param.place(config.DEVICES)
 @param.param_cls((param.TEST_CASE_NAME, 'conc1', 'conc2'), [
-    ('test-regular-input', np.random.random((5, 7, 8, 10)), np.random.random(
-        (5, 7, 8, 10))),
+    ('test-regular-input', np.random.random(
+        (5, 7, 8, 10)), np.random.random((5, 7, 8, 10))),
 ])
 class TestKLDirichletDirichlet(unittest.TestCase):
+
     def setUp(self):
         self.p = paddle.distribution.Dirichlet(paddle.to_tensor(self.conc1))
         self.q = paddle.distribution.Dirichlet(paddle.to_tensor(self.conc2))
@@ -79,10 +82,10 @@ def scipy_kl_diric_diric(self, conc1, conc2):
             scipy.special.gammaln(np.sum(conc1, -1)) -
             scipy.special.gammaln(np.sum(conc2, -1)) - np.sum(
                 scipy.special.gammaln(conc1) - scipy.special.gammaln(conc2), -1)
-            + np.sum((conc1 - conc2) *
-                     (scipy.special.digamma(conc1) -
-                      scipy.special.digamma(np.sum(conc1, -1, keepdims=True))),
-                     -1))
+            + np.sum(
+                (conc1 - conc2) *
+                (scipy.special.digamma(conc1) -
+                 scipy.special.digamma(np.sum(conc1, -1, keepdims=True))), -1))
 
 
 class DummyDistribution(paddle.distribution.Distribution):
@@ -93,25 +96,27 @@ class DummyDistribution(paddle.distribution.Distribution):
 @param.param_cls((param.TEST_CASE_NAME, 'p', 'q'),
                  [('test-unregister', DummyDistribution(), DummyDistribution)])
 class TestDispatch(unittest.TestCase):
+
     def test_dispatch_with_unregister(self):
         with self.assertRaises(NotImplementedError):
             paddle.distribution.kl_divergence(self.p, self.q)
 
 
 @param.place(config.DEVICES)
-@param.param_cls((param.TEST_CASE_NAME, 'p', 'q'),
-                 [('test-diff-dist',
-                   mock.Exponential(paddle.rand((100, 200, 100)) + 1.0),
-                   mock.Exponential(paddle.rand((100, 200, 100)) + 2.0)),
-                  ('test-same-dist', mock.Exponential(paddle.to_tensor(1.0)),
-                   mock.Exponential(paddle.to_tensor(1.0)))])
+@param.param_cls(
+    (param.TEST_CASE_NAME, 'p', 'q'),
+    [('test-diff-dist', mock.Exponential(paddle.rand((100, 200, 100)) + 1.0),
+      mock.Exponential(paddle.rand((100, 200, 100)) + 2.0)),
+     ('test-same-dist', mock.Exponential(
+         paddle.to_tensor(1.0)), mock.Exponential(paddle.to_tensor(1.0)))])
 class TestKLExpfamilyExpFamily(unittest.TestCase):
+
     def test_kl_expfamily_expfamily(self):
-        np.testing.assert_allclose(
-            paddle.distribution.kl_divergence(self.p, self.q),
-            kl._kl_expfamily_expfamily(self.p, self.q),
-            rtol=config.RTOL.get(config.DEFAULT_DTYPE),
-            atol=config.ATOL.get(config.DEFAULT_DTYPE))
+        np.testing.assert_allclose(paddle.distribution.kl_divergence(
+            self.p, self.q),
+                                   kl._kl_expfamily_expfamily(self.p, self.q),
+                                   rtol=config.RTOL.get(config.DEFAULT_DTYPE),
+                                   atol=config.ATOL.get(config.DEFAULT_DTYPE))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py b/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py
index b061650a53b9e..3bd62e1334bc0 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py
@@ -30,11 +30,13 @@
 
 @param.place(config.DEVICES)
 @param.param_cls((param.TEST_CASE_NAME, 'a1', 'b1', 'a2', 'b2'), [
-    ('test_regular_input', 6.0 * np.random.random((4, 5)) + 1e-4,
-     6.0 * np.random.random((4, 5)) + 1e-4, 6.0 * np.random.random(
-         (4, 5)) + 1e-4, 6.0 * np.random.random((4, 5)) + 1e-4),
+    ('test_regular_input', 6.0 * np.random.random(
+        (4, 5)) + 1e-4, 6.0 * np.random.random(
+            (4, 5)) + 1e-4, 6.0 * np.random.random(
+                (4, 5)) + 1e-4, 6.0 * np.random.random((4, 5)) + 1e-4),
 ])
 class TestKLBetaBeta(unittest.TestCase):
+
     def setUp(self):
         self.mp = paddle.static.Program()
         self.sp = paddle.static.Program()
@@ -63,11 +65,11 @@ def test_kl_divergence(self):
                                       feed=self.feeds,
                                       fetch_list=[out])
 
-            np.testing.assert_allclose(
-                out,
-                self.scipy_kl_beta_beta(self.a1, self.b1, self.a2, self.b2),
-                rtol=config.RTOL.get(str(self.a1.dtype)),
-                atol=config.ATOL.get(str(self.a1.dtype)))
+            np.testing.assert_allclose(out,
+                                       self.scipy_kl_beta_beta(
+                                           self.a1, self.b1, self.a2, self.b2),
+                                       rtol=config.RTOL.get(str(self.a1.dtype)),
+                                       atol=config.ATOL.get(str(self.a1.dtype)))
 
     def scipy_kl_beta_beta(self, a1, b1, a2, b2):
         return (scipy.special.betaln(a2, b2) - scipy.special.betaln(a1, b1) +
@@ -78,10 +80,11 @@ def scipy_kl_beta_beta(self, a1, b1, a2, b2):
 
 @param.place(config.DEVICES)
 @param.param_cls((param.TEST_CASE_NAME, 'conc1', 'conc2'), [
-    ('test-regular-input', np.random.random((5, 7, 8, 10)), np.random.random(
-        (5, 7, 8, 10))),
+    ('test-regular-input', np.random.random(
+        (5, 7, 8, 10)), np.random.random((5, 7, 8, 10))),
 ])
 class TestKLDirichletDirichlet(unittest.TestCase):
+
     def setUp(self):
         self.mp = paddle.static.Program()
         self.sp = paddle.static.Program()
@@ -114,10 +117,10 @@ def scipy_kl_diric_diric(self, conc1, conc2):
             scipy.special.gammaln(np.sum(conc1, -1)) -
             scipy.special.gammaln(np.sum(conc2, -1)) - np.sum(
                 scipy.special.gammaln(conc1) - scipy.special.gammaln(conc2), -1)
-            + np.sum((conc1 - conc2) *
-                     (scipy.special.digamma(conc1) -
-                      scipy.special.digamma(np.sum(conc1, -1, keepdims=True))),
-                     -1))
+            + np.sum(
+                (conc1 - conc2) *
+                (scipy.special.digamma(conc1) -
+                 scipy.special.digamma(np.sum(conc1, -1, keepdims=True))), -1))
 
 
 class DummyDistribution(paddle.distribution.Distribution):
@@ -128,6 +131,7 @@ class DummyDistribution(paddle.distribution.Distribution):
 @param.param_cls((param.TEST_CASE_NAME, 'p', 'q'),
                  [('test-dispatch-exception')])
 class TestDispatch(unittest.TestCase):
+
     def setUp(self):
         self.mp = paddle.static.Program()
         self.sp = paddle.static.Program()
@@ -150,15 +154,18 @@ def test_dispatch_with_unregister(self):
                    np.random.rand(100, 200, 100) + 2.0),
                   ('test-same-dist', np.array([1.0]), np.array([1.0]))])
 class TestKLExpfamilyExpFamily(unittest.TestCase):
+
     def setUp(self):
         self.mp = paddle.static.Program()
         self.sp = paddle.static.Program()
         self.executor = paddle.static.Executor(self.place)
         with paddle.static.program_guard(self.mp, self.sp):
-            rate1 = paddle.static.data(
-                'rate1', shape=self.rate1.shape, dtype=self.rate1.dtype)
-            rate2 = paddle.static.data(
-                'rate2', shape=self.rate2.shape, dtype=self.rate2.dtype)
+            rate1 = paddle.static.data('rate1',
+                                       shape=self.rate1.shape,
+                                       dtype=self.rate1.dtype)
+            rate2 = paddle.static.data('rate2',
+                                       shape=self.rate2.shape,
+                                       dtype=self.rate2.dtype)
             self.p = mock.Exponential(rate1)
             self.q = mock.Exponential(rate2)
             self.feeds = {'rate1': self.rate1, 'rate2': self.rate2}
diff --git a/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py b/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py
index de4457a58fb0f..a649b3a19ebf6 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_fleet_api.py
@@ -31,6 +31,7 @@
 
 
 class TestDygraphFleetAPI(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2022)
         random.seed(2022)
@@ -48,8 +49,8 @@ def test_dygraph_fleet_api(self):
         strategy.amp = True
         strategy.recompute = True
         fleet.init(is_collective=True, strategy=strategy)
-        net = paddle.nn.Sequential(
-            paddle.nn.Linear(10, 1), paddle.nn.Linear(1, 2))
+        net = paddle.nn.Sequential(paddle.nn.Linear(10, 1),
+                                   paddle.nn.Linear(1, 2))
         net = dist.fleet.distributed_model(net)
         data = np.random.uniform(-1, 1, [30, 10]).astype('float32')
         data = paddle.to_tensor(data)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py
index a1a853f006c0d..34b485a8bd462 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +35,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -50,6 +51,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -80,18 +82,20 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir):
     model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
     scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
 
-    model, optimizer, scaler = group_sharded_parallel(
-        model=model, optimizer=optimizer, level=shard_level, scaler=scaler)
+    model, optimizer, scaler = group_sharded_parallel(model=model,
+                                                      optimizer=optimizer,
+                                                      level=shard_level,
+                                                      scaler=scaler)
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -102,8 +106,8 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir):
             img.stop_gradient = True
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
 
             if not use_pure_fp16:
@@ -129,17 +133,20 @@ def test_sharding_api():
     output_dir = tempfile.mkdtemp()
 
     # fp16
-    stage2_params = train_mlp(
-        mlp1, shard_level="os_g", use_pure_fp16=True, output_dir=output_dir)
-    stage3_params = train_mlp(
-        mlp2, shard_level="p_g_os", use_pure_fp16=True, output_dir=output_dir)
+    stage2_params = train_mlp(mlp1,
+                              shard_level="os_g",
+                              use_pure_fp16=True,
+                              output_dir=output_dir)
+    stage3_params = train_mlp(mlp2,
+                              shard_level="p_g_os",
+                              use_pure_fp16=True,
+                              output_dir=output_dir)
 
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage3_params[i].numpy(),
-            rtol=1e-4,
-            atol=1e-3)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage3_params[i].numpy(),
+                                   rtol=1e-4,
+                                   atol=1e-3)
     shutil.rmtree(output_dir)
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py
index 85a5446cb6447..8f6dadb5ce978 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +35,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -50,6 +51,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -78,18 +80,20 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir):
     model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
     scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
 
-    model, optimizer, scaler = group_sharded_parallel(
-        model=model, optimizer=optimizer, level=shard_level, scaler=scaler)
+    model, optimizer, scaler = group_sharded_parallel(model=model,
+                                                      optimizer=optimizer,
+                                                      level=shard_level,
+                                                      scaler=scaler)
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -100,8 +104,8 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir):
             img.stop_gradient = True
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
 
             if not use_pure_fp16:
@@ -128,17 +132,20 @@ def test_sharding_api():
     output_dir = tempfile.mkdtemp()
 
     # fp16
-    stage2_params = train_mlp(
-        mlp1, shard_level="os_g", use_pure_fp16=True, output_dir=output_dir)
-    stage3_params = train_mlp(
-        mlp2, shard_level="p_g_os", use_pure_fp16=True, output_dir=output_dir)
+    stage2_params = train_mlp(mlp1,
+                              shard_level="os_g",
+                              use_pure_fp16=True,
+                              output_dir=output_dir)
+    stage3_params = train_mlp(mlp2,
+                              shard_level="p_g_os",
+                              use_pure_fp16=True,
+                              output_dir=output_dir)
 
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage3_params[i].numpy(),
-            rtol=1e-4,
-            atol=1e-3)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage3_params[i].numpy(),
+                                   rtol=1e-4,
+                                   atol=1e-3)
     shutil.rmtree(output_dir)
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py
index 8c07734d513c4..f4cc451c40a78 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -40,6 +40,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -55,6 +56,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -66,14 +68,13 @@ def __reader__():
 
 def optimizer_setting(model, use_pure_fp16, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
-    optimizer = paddle.optimizer.AdamW(
-        parameters=[{
-            "params": model.parameters(),
-        }] if opt_group else model.parameters(),
-        learning_rate=0.001,
-        weight_decay=0.00001,
-        grad_clip=clip,
-        multi_precision=use_pure_fp16)
+    optimizer = paddle.optimizer.AdamW(parameters=[{
+        "params": model.parameters(),
+    }] if opt_group else model.parameters(),
+                                       learning_rate=0.001,
+                                       weight_decay=0.00001,
+                                       grad_clip=clip,
+                                       multi_precision=use_pure_fp16)
 
     return optimizer
 
@@ -89,8 +90,9 @@ def train_mlp(model,
     if sharding_stage != "dp":
         group = paddle.distributed.new_group([0, 1], backend="nccl")
     if opt_group:
-        optimizer = optimizer_setting(
-            model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group)
+        optimizer = optimizer_setting(model=model,
+                                      use_pure_fp16=use_pure_fp16,
+                                      opt_group=opt_group)
     else:
         optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
 
@@ -98,8 +100,10 @@ def train_mlp(model,
         optimizer = GroupShardedOptimizerStage2(
             params=optimizer._parameter_list, optim=optimizer, group=group)
 
-        model = GroupShardedStage2(
-            model, optimizer, group=group, buffer_max_size=2**21)
+        model = GroupShardedStage2(model,
+                                   optimizer,
+                                   group=group,
+                                   buffer_max_size=2**21)
     else:
         model = paddle.DataParallel(model)
 
@@ -112,15 +116,15 @@ def train_mlp(model,
                 "====== Find sharding_stage2_optimizer.minimize() error ======")
         return
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     if sharding_stage == 2:
@@ -175,42 +179,50 @@ def test_dp_stage2():
     mlp7.set_state_dict(state_dict)
 
     # DP VS stage2
-    dp_params = train_mlp(
-        mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False)
-    stage2_params = train_mlp(
-        mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=False)
+    dp_params = train_mlp(mlp1,
+                          sharding_stage="dp",
+                          use_pure_fp16=False,
+                          opt_group=False)
+    stage2_params = train_mlp(mlp2,
+                              sharding_stage=2,
+                              use_pure_fp16=False,
+                              opt_group=False)
     for i in range(len(dp_params)):
-        np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
+        np.testing.assert_allclose(dp_params[i].numpy(),
+                                   stage2_params[i].numpy(),
+                                   rtol=1e-6)
 
     # stage2 accumulate grad
     stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True)
-    stage2_accumulate_grad = train_mlp(
-        mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True)
+    stage2_accumulate_grad = train_mlp(mlp4,
+                                       sharding_stage=2,
+                                       batch_size=20,
+                                       accumulate_grad=True)
     for i in range(len(stage2_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage2_accumulate_grad[i].numpy(),
-            rtol=1e-5,
-            atol=1e-5)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage2_accumulate_grad[i].numpy(),
+                                   rtol=1e-5,
+                                   atol=1e-5)
 
     # stage2 param list VS param group
-    stage2_params = train_mlp(
-        mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+    stage2_params = train_mlp(mlp5,
+                              sharding_stage=2,
+                              use_pure_fp16=False,
+                              opt_group=True)
     for i in range(len(dp_params)):
-        np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
+        np.testing.assert_allclose(dp_params[i].numpy(),
+                                   stage2_params[i].numpy(),
+                                   rtol=1e-6)
 
     # save/load model
     output_dir = tempfile.mkdtemp()
     model_file = os.path.join(output_dir, "model.pdmodel")
     optimizer_file = os.path.join(output_dir, "model.pdopt")
-    model_stage2, optimizer_stage2 = train_mlp(
-        mlp6,
-        sharding_stage=2,
-        use_pure_fp16=False,
-        opt_group=False,
-        save_model=True)
+    model_stage2, optimizer_stage2 = train_mlp(mlp6,
+                                               sharding_stage=2,
+                                               use_pure_fp16=False,
+                                               opt_group=False,
+                                               save_model=True)
     paddle.save(model_stage2.state_dict(), model_file)
     paddle.save(optimizer_stage2.state_dict(), optimizer_file)
     m_state_dict = paddle.load(model_file)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py
index b09314ae9e31c..060b856505f63 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -47,19 +47,20 @@ def train_mlp(model, offload=False):
     scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
     scaler = GroupShardedScaler(scaler)
 
-    optimizer = GroupShardedOptimizerStage2(
-        params=optimizer._parameter_list, optim=optimizer, offload=offload)
+    optimizer = GroupShardedOptimizerStage2(params=optimizer._parameter_list,
+                                            optim=optimizer,
+                                            offload=offload)
     model = GroupShardedStage2(model, optimizer, buffer_max_size=2**21)
 
-    train_reader = paddle.batch(
-        reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(linear_size),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -72,8 +73,8 @@ def train_mlp(model, offload=False):
 
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
 
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
             scaler.scale(avg_loss).backward()
@@ -99,11 +100,10 @@ def test_sharding_stage2_offload():
     mlp_offload_params = train_mlp(mlp_offload, offload=True)
 
     for i in range(len(mlp_params)):
-        np.testing.assert_allclose(
-            mlp_params[i].numpy(),
-            mlp_offload_params[i].numpy(),
-            rtol=5e-3,
-            atol=5e-3)
+        np.testing.assert_allclose(mlp_params[i].numpy(),
+                                   mlp_offload_params[i].numpy(),
+                                   rtol=5e-3,
+                                   atol=5e-3)
     return
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py
index 6c350e63f444c..31b56cdfb8c7d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -42,6 +42,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -57,6 +58,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -91,28 +93,31 @@ def train_mlp(model,
               save_model=False):
     group = paddle.distributed.new_group([0, 1])
     if opt_group:
-        optimizer = optimizer_setting(
-            model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group)
+        optimizer = optimizer_setting(model=model,
+                                      use_pure_fp16=use_pure_fp16,
+                                      opt_group=opt_group)
     else:
         optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
 
     if use_pure_fp16:
-        model = paddle.amp.decorate(
-            models=model, level='O2', save_dtype='float32')
+        model = paddle.amp.decorate(models=model,
+                                    level='O2',
+                                    save_dtype='float32')
         scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
         scaler = GroupShardedScaler(scaler)
     if sharding_stage == 2:
         optimizer = GroupShardedOptimizerStage2(
             params=optimizer._parameter_list, optim=optimizer, group=group)
-        model = GroupShardedStage2(
-            model, optimizer, group=group, buffer_max_size=2**21)
+        model = GroupShardedStage2(model,
+                                   optimizer,
+                                   group=group,
+                                   buffer_max_size=2**21)
     elif sharding_stage == 3:
-        model = GroupShardedStage3(
-            model,
-            optimizer=optimizer,
-            group=group,
-            sync_comm=sync_comm,
-            segment_size=2**15)
+        model = GroupShardedStage3(model,
+                                   optimizer=optimizer,
+                                   group=group,
+                                   sync_comm=sync_comm,
+                                   segment_size=2**15)
 
     # check optimizer.minimize() error
     if test_minimize:
@@ -123,15 +128,15 @@ def train_mlp(model,
                 "====== Find sharding_stage3_optimizer.minimize() error ======")
         return
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -142,8 +147,8 @@ def train_mlp(model,
             img.stop_gradient = True
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
 
             if batch_size == 20:
@@ -192,75 +197,79 @@ def test_stage2_stage3():
     mlp9.set_state_dict(state_dict)
     mlp10.set_state_dict(state_dict)
 
-    # fp32 
-    stage2_params = train_mlp(
-        mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False)
-    stage3_params = train_mlp(
-        mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=False)
+    # fp32
+    stage2_params = train_mlp(mlp1,
+                              sharding_stage=2,
+                              use_pure_fp16=False,
+                              opt_group=False)
+    stage3_params = train_mlp(mlp2,
+                              sharding_stage=3,
+                              use_pure_fp16=False,
+                              opt_group=False)
 
     for i in range(len(stage2_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage3_params[i].numpy(),
-            rtol=1e-6,
-            atol=1e-6)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage3_params[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-6)
 
     # fp32 accumulate grad
-    stage3_params = train_mlp(
-        mlp3,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        accumulate_grad=True,
-        opt_group=True)
-    stage3_params_add = train_mlp(
-        mlp4,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        accumulate_grad=True,
-        batch_size=20,
-        opt_group=True)
+    stage3_params = train_mlp(mlp3,
+                              sharding_stage=3,
+                              use_pure_fp16=False,
+                              accumulate_grad=True,
+                              opt_group=True)
+    stage3_params_add = train_mlp(mlp4,
+                                  sharding_stage=3,
+                                  use_pure_fp16=False,
+                                  accumulate_grad=True,
+                                  batch_size=20,
+                                  opt_group=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_add[i].numpy(),
-            rtol=1e-6,
-            atol=1e-4)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_add[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-4)
 
     # fp16
-    stage2_params = train_mlp(
-        mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False)
-    stage3_params = train_mlp(
-        mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False)
+    stage2_params = train_mlp(mlp5,
+                              sharding_stage=2,
+                              use_pure_fp16=True,
+                              opt_group=False)
+    stage3_params = train_mlp(mlp6,
+                              sharding_stage=3,
+                              use_pure_fp16=True,
+                              opt_group=False)
     for i in range(len(stage2_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage3_params[i].numpy(),
-            rtol=1e-4,
-            atol=1e-3)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage3_params[i].numpy(),
+                                   rtol=1e-4,
+                                   atol=1e-3)
 
     # fp16 sync_comm
-    stage3_params = train_mlp(
-        mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False)
-    stage3_params_re = train_mlp(
-        mlp8,
-        sharding_stage=3,
-        use_pure_fp16=True,
-        opt_group=False,
-        sync_comm=True)
+    stage3_params = train_mlp(mlp7,
+                              sharding_stage=3,
+                              use_pure_fp16=True,
+                              opt_group=False)
+    stage3_params_re = train_mlp(mlp8,
+                                 sharding_stage=3,
+                                 use_pure_fp16=True,
+                                 opt_group=False,
+                                 sync_comm=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_re[i].numpy(),
+                                   rtol=1e-6)
 
     # save/load model
     output_dir = tempfile.mkdtemp()
     model_file = os.path.join(output_dir, "model.pdmodel")
     optimizer_file = os.path.join(output_dir, "model.pdopt")
-    model_stage3, optimizer_stage3 = train_mlp(
-        mlp9,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        opt_group=False,
-        save_model=True)
+    model_stage3, optimizer_stage3 = train_mlp(mlp9,
+                                               sharding_stage=3,
+                                               use_pure_fp16=False,
+                                               opt_group=False,
+                                               save_model=True)
     paddle.save(model_stage3.state_dict(), model_file)
     paddle.save(optimizer_stage3.state_dict(), optimizer_file)
     m_state_dict = paddle.load(model_file)
@@ -270,12 +279,11 @@ def test_stage2_stage3():
     shutil.rmtree(output_dir)
 
     # check optimizer.minimize() error
-    train_mlp(
-        mlp10,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        opt_group=False,
-        test_minimize=True)
+    train_mlp(mlp10,
+              sharding_stage=3,
+              use_pure_fp16=False,
+              opt_group=False,
+              test_minimize=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py
index 5f9ec5c6e708e..da84fb67ca9c6 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage3_offload.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -37,6 +37,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -52,6 +53,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -63,14 +65,13 @@ def __reader__():
 
 def optimizer_setting(model, use_pure_fp16, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
-    optimizer = paddle.optimizer.AdamW(
-        parameters=[{
-            "params": model.parameters()
-        }] if opt_group else model.parameters(),
-        learning_rate=0.001,
-        weight_decay=0.00001,
-        grad_clip=clip,
-        multi_precision=use_pure_fp16)
+    optimizer = paddle.optimizer.AdamW(parameters=[{
+        "params": model.parameters()
+    }] if opt_group else model.parameters(),
+                                       learning_rate=0.001,
+                                       weight_decay=0.00001,
+                                       grad_clip=clip,
+                                       multi_precision=use_pure_fp16)
 
     return optimizer
 
@@ -85,27 +86,27 @@ def train_mlp(model,
     optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
 
     if use_pure_fp16:
-        model = paddle.amp.decorate(
-            models=model, level='O2', save_dtype='float32')
+        model = paddle.amp.decorate(models=model,
+                                    level='O2',
+                                    save_dtype='float32')
         scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
         scaler = GroupShardedScaler(scaler)
 
-    model = GroupShardedStage3(
-        model,
-        optimizer=optimizer,
-        group=group,
-        offload=offload,
-        segment_size=2**15)
-
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
-
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    model = GroupShardedStage3(model,
+                               optimizer=optimizer,
+                               group=group,
+                               offload=offload,
+                               segment_size=2**15)
+
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
+
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -116,8 +117,8 @@ def train_mlp(model,
             img.stop_gradient = True
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
 
             if accumulate_grad:
@@ -165,38 +166,36 @@ def test_stage3_offload():
     stage3_params = train_mlp(mlp1, use_pure_fp16=False)
     stage3_params_offload = train_mlp(mlp2, use_pure_fp16=False, offload=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_offload[i].numpy(),
-            rtol=1e-6,
-            atol=1e-8)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_offload[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-8)
 
     # fp16 offload
     stage3_params = train_mlp(mlp3, use_pure_fp16=True)
     stage3_params_offload = train_mlp(mlp4, use_pure_fp16=True, offload=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_offload[i].numpy(),
-            rtol=1e-2,
-            atol=1e-2)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_offload[i].numpy(),
+                                   rtol=1e-2,
+                                   atol=1e-2)
 
     # fp32 accumulate grad offload
-    stage3_params = train_mlp(
-        mlp5, use_pure_fp16=False, batch_size=20, accumulate_grad=True)
-    stage3_params_offload = train_mlp(
-        mlp6,
-        use_pure_fp16=False,
-        accumulate_grad=True,
-        offload=True,
-        batch_size=20,
-        convert2cpu=True)
+    stage3_params = train_mlp(mlp5,
+                              use_pure_fp16=False,
+                              batch_size=20,
+                              accumulate_grad=True)
+    stage3_params_offload = train_mlp(mlp6,
+                                      use_pure_fp16=False,
+                                      accumulate_grad=True,
+                                      offload=True,
+                                      batch_size=20,
+                                      convert2cpu=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_offload[i].numpy(),
-            rtol=1e-6,
-            atol=1e-8)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_offload[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-8)
     return
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py
index 0ed9b681fdcf5..22d001c7c4633 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -38,6 +38,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -51,6 +52,7 @@ def forward(self, inputs):
 
 
 def reader_decorator():
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(10).astype('float32')
@@ -76,18 +78,18 @@ def train_mlp():
     mlp = MLP()
 
     optimizer = optimizer_setting(parameter_list=mlp.parameters())
-    oss_optimizer = ShardingOptimizerStage2(
-        params=mlp.parameters(), optim=optimizer, group=group)
+    oss_optimizer = ShardingOptimizerStage2(params=mlp.parameters(),
+                                            optim=optimizer,
+                                            group=group)
     # cover grad_storage code
     trainable_param2align = dict()
     for p in mlp.parameters():
         trainable_param2align[p.name] = 0
-    grad_storage = GradStorage(
-        10000,
-        dtype=paddle.float32,
-        device="gpu",
-        destination=0,
-        parm2align=trainable_param2align)
+    grad_storage = GradStorage(10000,
+                               dtype=paddle.float32,
+                               device="gpu",
+                               destination=0,
+                               parm2align=trainable_param2align)
     for p in mlp.parameters():
         grad_storage.can_add_grad_view(p, trainable_param2align[p.name])
         grad_storage.add_grad(p, trainable_param2align[p.name])
@@ -95,15 +97,15 @@ def train_mlp():
     grad_storage.rebuild()
     grad_storage.reset_checked_in()
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
index 58432540d1b82..756b1bfb6074d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -48,6 +48,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -63,6 +64,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -74,14 +76,13 @@ def __reader__():
 
 def optimizer_setting(model, use_pure_fp16, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
-    optimizer = paddle.optimizer.AdamW(
-        parameters=[{
-            "params": model.parameters()
-        }] if opt_group else model.parameters(),
-        learning_rate=0.001,
-        weight_decay=0.00001,
-        grad_clip=clip,
-        multi_precision=use_pure_fp16)
+    optimizer = paddle.optimizer.AdamW(parameters=[{
+        "params": model.parameters()
+    }] if opt_group else model.parameters(),
+                                       learning_rate=0.001,
+                                       weight_decay=0.00001,
+                                       grad_clip=clip,
+                                       multi_precision=use_pure_fp16)
 
     return optimizer
 
@@ -99,30 +100,34 @@ def train_mlp(model,
     else:
         group = paddle.distributed.new_group([0, 1])
     if opt_group:
-        optimizer = optimizer_setting(
-            model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group)
+        optimizer = optimizer_setting(model=model,
+                                      use_pure_fp16=use_pure_fp16,
+                                      opt_group=opt_group)
     else:
         optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
 
     if sharding_stage == 2:
-        optimizer = ShardingOptimizerStage2(
-            params=model.parameters(), optim=optimizer, group=group)
-
-        model = ShardingStage2(
-            model, optimizer, group=group, buffer_max_size=2**21)
+        optimizer = ShardingOptimizerStage2(params=model.parameters(),
+                                            optim=optimizer,
+                                            group=group)
+
+        model = ShardingStage2(model,
+                               optimizer,
+                               group=group,
+                               buffer_max_size=2**21)
     else:
         optimizer = fleet.distributed_optimizer(optimizer)
         model = fleet.distributed_model(model)
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     if sharding_stage == 2:
@@ -174,42 +179,50 @@ def test_dp_stage2():
     mlp6.set_state_dict(state_dict)
 
     # DP VS stage2
-    dp_params = train_mlp(
-        mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False)
-    stage2_params = train_mlp(
-        mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=False)
+    dp_params = train_mlp(mlp1,
+                          sharding_stage="dp",
+                          use_pure_fp16=False,
+                          opt_group=False)
+    stage2_params = train_mlp(mlp2,
+                              sharding_stage=2,
+                              use_pure_fp16=False,
+                              opt_group=False)
     for i in range(len(dp_params)):
-        np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
+        np.testing.assert_allclose(dp_params[i].numpy(),
+                                   stage2_params[i].numpy(),
+                                   rtol=1e-6)
 
     # stage2 accumulate grad
     stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True)
-    stage2_accumulate_grad = train_mlp(
-        mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True)
+    stage2_accumulate_grad = train_mlp(mlp4,
+                                       sharding_stage=2,
+                                       batch_size=20,
+                                       accumulate_grad=True)
     for i in range(len(stage2_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage2_accumulate_grad[i].numpy(),
-            rtol=1e-5,
-            atol=1e-5)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage2_accumulate_grad[i].numpy(),
+                                   rtol=1e-5,
+                                   atol=1e-5)
 
     # stage2 param list VS param group
-    stage2_params = train_mlp(
-        mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+    stage2_params = train_mlp(mlp5,
+                              sharding_stage=2,
+                              use_pure_fp16=False,
+                              opt_group=True)
     for i in range(len(dp_params)):
-        np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
+        np.testing.assert_allclose(dp_params[i].numpy(),
+                                   stage2_params[i].numpy(),
+                                   rtol=1e-6)
 
     # save/load model
     output_dir = tempfile.mkdtemp()
     model_file = os.path.join(output_dir, "model.pdmodel")
     optimizer_file = os.path.join(output_dir, "model.pdopt")
-    model_stage2, optimizer_stage2 = train_mlp(
-        mlp6,
-        sharding_stage=2,
-        use_pure_fp16=False,
-        opt_group=False,
-        save_model=True)
+    model_stage2, optimizer_stage2 = train_mlp(mlp6,
+                                               sharding_stage=2,
+                                               use_pure_fp16=False,
+                                               opt_group=False,
+                                               save_model=True)
     paddle.save(model_stage2.state_dict(), model_file)
     paddle.save(optimizer_stage2.state_dict(), optimizer_file)
     m_state_dict = paddle.load(model_file)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
index cd2d7b3f12765..1acdce548d580 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -55,19 +55,20 @@ def train_mlp(model, offload=False):
     scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
     scaler = ShardingScaler(scaler)
 
-    optimizer = ShardingOptimizerStage2(
-        params=model.parameters(), optim=optimizer, offload=offload)
+    optimizer = ShardingOptimizerStage2(params=model.parameters(),
+                                        optim=optimizer,
+                                        offload=offload)
     model = ShardingStage2(model, optimizer, buffer_max_size=2**21)
 
-    train_reader = paddle.batch(
-        reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(linear_size),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -80,8 +81,8 @@ def train_mlp(model, offload=False):
 
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
 
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
             scaler.scale(avg_loss).backward()
@@ -106,11 +107,10 @@ def test_sharding_stage2_offload():
     mlp_offload_params = train_mlp(mlp_offload, offload=True)
 
     for i in range(len(mlp_params)):
-        np.testing.assert_allclose(
-            mlp_params[i].numpy(),
-            mlp_offload_params[i].numpy(),
-            rtol=5e-3,
-            atol=5e-3)
+        np.testing.assert_allclose(mlp_params[i].numpy(),
+                                   mlp_offload_params[i].numpy(),
+                                   rtol=5e-3,
+                                   atol=5e-3)
     return
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
index fc4002ef405bd..c48e7a36424cc 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -42,6 +42,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -57,6 +58,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -91,24 +93,31 @@ def train_mlp(model,
               save_model=False):
     group = paddle.distributed.new_group([0, 1])
     if opt_group:
-        optimizer = optimizer_setting(
-            model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group)
+        optimizer = optimizer_setting(model=model,
+                                      use_pure_fp16=use_pure_fp16,
+                                      opt_group=opt_group)
     else:
         optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
 
     if use_pure_fp16:
-        model = paddle.amp.decorate(
-            models=model, level='O2', save_dtype='float32')
+        model = paddle.amp.decorate(models=model,
+                                    level='O2',
+                                    save_dtype='float32')
         scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
         scaler = ShardingScaler(scaler)
     if sharding_stage == 2:
-        optimizer = ShardingOptimizerStage2(
-            params=model.parameters(), optim=optimizer, group=group)
-        model = ShardingStage2(
-            model, optimizer, group=group, buffer_max_size=2**21)
+        optimizer = ShardingOptimizerStage2(params=model.parameters(),
+                                            optim=optimizer,
+                                            group=group)
+        model = ShardingStage2(model,
+                               optimizer,
+                               group=group,
+                               buffer_max_size=2**21)
     elif sharding_stage == 3:
-        model = ShardingStage3(
-            model, optimizer=optimizer, group=group, sync_comm=sync_comm)
+        model = ShardingStage3(model,
+                               optimizer=optimizer,
+                               group=group,
+                               sync_comm=sync_comm)
 
     # check optimizer.minimize() error
     if test_minimize:
@@ -119,15 +128,15 @@ def train_mlp(model,
                 "====== Find sharding_stage3_optimizer.minimize() error ======")
         return
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -138,8 +147,8 @@ def train_mlp(model,
             img.stop_gradient = True
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
 
             if batch_size == 20:
@@ -187,75 +196,79 @@ def test_stage2_stage3():
     mlp9.set_state_dict(state_dict)
     mlp10.set_state_dict(state_dict)
 
-    # fp32 
-    stage2_params = train_mlp(
-        mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False)
-    stage3_params = train_mlp(
-        mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=False)
+    # fp32
+    stage2_params = train_mlp(mlp1,
+                              sharding_stage=2,
+                              use_pure_fp16=False,
+                              opt_group=False)
+    stage3_params = train_mlp(mlp2,
+                              sharding_stage=3,
+                              use_pure_fp16=False,
+                              opt_group=False)
 
     for i in range(len(stage2_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage3_params[i].numpy(),
-            rtol=1e-6,
-            atol=1e-6)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage3_params[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-6)
 
     # fp32 accumulate grad
-    stage3_params = train_mlp(
-        mlp3,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        accumulate_grad=True,
-        opt_group=True)
-    stage3_params_add = train_mlp(
-        mlp4,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        accumulate_grad=True,
-        batch_size=20,
-        opt_group=True)
+    stage3_params = train_mlp(mlp3,
+                              sharding_stage=3,
+                              use_pure_fp16=False,
+                              accumulate_grad=True,
+                              opt_group=True)
+    stage3_params_add = train_mlp(mlp4,
+                                  sharding_stage=3,
+                                  use_pure_fp16=False,
+                                  accumulate_grad=True,
+                                  batch_size=20,
+                                  opt_group=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_add[i].numpy(),
-            rtol=1e-6,
-            atol=1e-4)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_add[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-4)
 
     # fp16
-    stage2_params = train_mlp(
-        mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False)
-    stage3_params = train_mlp(
-        mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False)
+    stage2_params = train_mlp(mlp5,
+                              sharding_stage=2,
+                              use_pure_fp16=True,
+                              opt_group=False)
+    stage3_params = train_mlp(mlp6,
+                              sharding_stage=3,
+                              use_pure_fp16=True,
+                              opt_group=False)
     for i in range(len(stage2_params)):
-        np.testing.assert_allclose(
-            stage2_params[i].numpy(),
-            stage3_params[i].numpy(),
-            rtol=1e-4,
-            atol=1e-3)
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage3_params[i].numpy(),
+                                   rtol=1e-4,
+                                   atol=1e-3)
 
     # fp16 sync_comm
-    stage3_params = train_mlp(
-        mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False)
-    stage3_params_re = train_mlp(
-        mlp8,
-        sharding_stage=3,
-        use_pure_fp16=True,
-        opt_group=False,
-        sync_comm=True)
+    stage3_params = train_mlp(mlp7,
+                              sharding_stage=3,
+                              use_pure_fp16=True,
+                              opt_group=False)
+    stage3_params_re = train_mlp(mlp8,
+                                 sharding_stage=3,
+                                 use_pure_fp16=True,
+                                 opt_group=False,
+                                 sync_comm=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_re[i].numpy(),
+                                   rtol=1e-6)
 
     # save/load model
     output_dir = tempfile.mkdtemp()
     model_file = os.path.join(output_dir, "model.pdmodel")
     optimizer_file = os.path.join(output_dir, "model.pdopt")
-    model_stage3, optimizer_stage3 = train_mlp(
-        mlp9,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        opt_group=False,
-        save_model=True)
+    model_stage3, optimizer_stage3 = train_mlp(mlp9,
+                                               sharding_stage=3,
+                                               use_pure_fp16=False,
+                                               opt_group=False,
+                                               save_model=True)
     paddle.save(model_stage3.state_dict(), model_file)
     paddle.save(optimizer_stage3.state_dict(), optimizer_file)
     m_state_dict = paddle.load(model_file)
@@ -265,12 +278,11 @@ def test_stage2_stage3():
     shutil.rmtree(output_dir)
 
     # check optimizer.minimize() error
-    train_mlp(
-        mlp10,
-        sharding_stage=3,
-        use_pure_fp16=False,
-        opt_group=False,
-        test_minimize=True)
+    train_mlp(mlp10,
+              sharding_stage=3,
+              use_pure_fp16=False,
+              opt_group=False,
+              test_minimize=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py
index 763a7a8b97fdd..19c0c91c20df8 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py
@@ -1,13 +1,13 @@
 # -*- coding: UTF-8 -*-
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -37,6 +37,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, linear_size=1000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -52,6 +53,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(linear_size=1000):
+
     def __reader__():
         for _ in range(100):
             img = np.random.rand(linear_size).astype('float32')
@@ -63,14 +65,13 @@ def __reader__():
 
 def optimizer_setting(model, use_pure_fp16, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
-    optimizer = paddle.optimizer.AdamW(
-        parameters=[{
-            "params": model.parameters()
-        }] if opt_group else model.parameters(),
-        learning_rate=0.001,
-        weight_decay=0.00001,
-        grad_clip=clip,
-        multi_precision=use_pure_fp16)
+    optimizer = paddle.optimizer.AdamW(parameters=[{
+        "params": model.parameters()
+    }] if opt_group else model.parameters(),
+                                       learning_rate=0.001,
+                                       weight_decay=0.00001,
+                                       grad_clip=clip,
+                                       multi_precision=use_pure_fp16)
 
     return optimizer
 
@@ -85,23 +86,26 @@ def train_mlp(model,
     optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
 
     if use_pure_fp16:
-        model = paddle.amp.decorate(
-            models=model, level='O2', save_dtype='float32')
+        model = paddle.amp.decorate(models=model,
+                                    level='O2',
+                                    save_dtype='float32')
         scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
         scaler = ShardingScaler(scaler)
 
-    model = ShardingStage3(
-        model, optimizer=optimizer, group=group, offload=offload)
+    model = ShardingStage3(model,
+                           optimizer=optimizer,
+                           group=group,
+                           offload=offload)
 
-    train_reader = paddle.batch(
-        reader_decorator(), batch_size=batch_size, drop_last=True)
+    train_reader = paddle.batch(reader_decorator(),
+                                batch_size=batch_size,
+                                drop_last=True)
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=32,
-        use_double_buffer=True,
-        iterable=True,
-        return_list=True,
-        use_multiprocess=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
+                                                       use_double_buffer=True,
+                                                       iterable=True,
+                                                       return_list=True,
+                                                       use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)
 
     for eop in range(epoch):
@@ -112,8 +116,8 @@ def train_mlp(model,
             img.stop_gradient = True
             with paddle.amp.auto_cast(True, level='O2'):
                 out = model(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=out, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=out,
+                                                          label=label)
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
 
             if accumulate_grad:
@@ -160,38 +164,36 @@ def test_stage3_offload():
     stage3_params = train_mlp(mlp1, use_pure_fp16=False)
     stage3_params_offload = train_mlp(mlp2, use_pure_fp16=False, offload=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_offload[i].numpy(),
-            rtol=1e-6,
-            atol=1e-8)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_offload[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-8)
 
     # fp16 offload
     stage3_params = train_mlp(mlp3, use_pure_fp16=True)
     stage3_params_offload = train_mlp(mlp4, use_pure_fp16=True, offload=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_offload[i].numpy(),
-            rtol=1e-2,
-            atol=1e-2)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_offload[i].numpy(),
+                                   rtol=1e-2,
+                                   atol=1e-2)
 
     # fp32 accumulate grad offload
-    stage3_params = train_mlp(
-        mlp5, use_pure_fp16=False, batch_size=20, accumulate_grad=True)
-    stage3_params_offload = train_mlp(
-        mlp6,
-        use_pure_fp16=False,
-        accumulate_grad=True,
-        offload=True,
-        batch_size=20,
-        convert2cpu=True)
+    stage3_params = train_mlp(mlp5,
+                              use_pure_fp16=False,
+                              batch_size=20,
+                              accumulate_grad=True)
+    stage3_params_offload = train_mlp(mlp6,
+                                      use_pure_fp16=False,
+                                      accumulate_grad=True,
+                                      offload=True,
+                                      batch_size=20,
+                                      convert2cpu=True)
     for i in range(len(stage3_params)):
-        np.testing.assert_allclose(
-            stage3_params[i].numpy(),
-            stage3_params_offload[i].numpy(),
-            rtol=1e-6,
-            atol=1e-8)
+        np.testing.assert_allclose(stage3_params[i].numpy(),
+                                   stage3_params_offload[i].numpy(),
+                                   rtol=1e-6,
+                                   atol=1e-8)
     return
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt
index ddc959a29a2ef..f9a1e83d381fd 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt
@@ -1,35 +1,61 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0)
 set(DY2ST_EAGER_TEST_ENVS ${GC_ENVS} FLAGS_enable_eager_mode=1)
 
-set(TEST_EAGER_OPS test_bmn test_break_continue test_ifelse test_loop test_mnist_amp 
-    test_mnist_pure_fp16 test_mobile_net test_program_translator test_ptb_lm test_reinforcement_learning 
-    test_resnet test_resnet_amp test_resnet_pure_fp16 test_se_resnet test_sentiment test_seq2seq 
-    test_tsm test_word2vec test_yolov3 test_bert test_cycle_gan test_lstm test_simnet test_transformer)
+set(TEST_EAGER_OPS
+    test_bmn
+    test_break_continue
+    test_ifelse
+    test_loop
+    test_mnist_amp
+    test_mnist_pure_fp16
+    test_mobile_net
+    test_program_translator
+    test_ptb_lm
+    test_reinforcement_learning
+    test_resnet
+    test_resnet_amp
+    test_resnet_pure_fp16
+    test_se_resnet
+    test_sentiment
+    test_seq2seq
+    test_tsm
+    test_word2vec
+    test_yolov3
+    test_bert
+    test_cycle_gan
+    test_lstm
+    test_simnet
+    test_transformer)
 list(REMOVE_ITEM TEST_OPS test_lac)
 # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope will
 # be removed and will cause some random failed in multi-thread.
 if(NOT ON_INFER)
-    py_test_modules(test_lac MODULES test_lac ENVS FLAGS_enable_eager_mode=1)
-    set_tests_properties(test_lac PROPERTIES TIMEOUT 120)
+  py_test_modules(test_lac MODULES test_lac ENVS FLAGS_enable_eager_mode=1)
+  set_tests_properties(test_lac PROPERTIES TIMEOUT 120)
 endif()
 
 if(WIN32 AND NOT WITH_GPU)
-    list(REMOVE_ITEM TEST_OPS test_resnet_amp) # disable on Windows CPU CI for timeout
+  list(REMOVE_ITEM TEST_OPS test_resnet_amp
+  )# disable on Windows CPU CI for timeout
 endif()
 
 foreach(TEST_OP ${TEST_OPS})
-    list(FIND TEST_EAGER_OPS ${TEST_OP} WAS_FOUND)
-    if (NOT WAS_FOUND EQUAL -1)
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${DY2ST_EAGER_TEST_ENVS})
-    else()
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
-    endif()
+  list(FIND TEST_EAGER_OPS ${TEST_OP} WAS_FOUND)
+  if(NOT WAS_FOUND EQUAL -1)
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${DY2ST_EAGER_TEST_ENVS})
+  else()
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
+  endif()
 endforeach(TEST_OP)
 
 set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900)
-set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS
+                                            "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 120)
 set_tests_properties(test_seq2seq PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 150)
@@ -42,14 +68,14 @@ set_tests_properties(test_bmn PROPERTIES TIMEOUT 120)
 set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 120)
 
 if(NOT WIN32)
-    set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_tsm PROPERTIES TIMEOUT 900)
-    #set_tests_properties(test_resnet PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_tsm PROPERTIES TIMEOUT 900)
+  #set_tests_properties(test_resnet PROPERTIES TIMEOUT 120)
 endif()
 
 if(APPLE)
-    set_tests_properties(test_bmn PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_bmn PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_build_strategy PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 300)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py
index b302dd37794fd..7ee6203fb9433 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py
@@ -22,6 +22,7 @@
 
 
 class PositionwiseFeedForwardLayer(Layer):
+
     def __init__(self,
                  hidden_act,
                  d_inner_hid,
@@ -31,33 +32,35 @@ def __init__(self,
                  name=""):
         super(PositionwiseFeedForwardLayer, self).__init__()
 
-        self._i2h = Linear(
-            input_dim=d_model,
-            output_dim=d_inner_hid,
-            param_attr=fluid.ParamAttr(
-                name=name + '_fc_0.w_0', initializer=param_initializer),
-            bias_attr=name + '_fc_0.b_0',
-            act=hidden_act)
-
-        self._h2o = Linear(
-            input_dim=d_inner_hid,
-            output_dim=d_model,
-            param_attr=fluid.ParamAttr(
-                name=name + '_fc_1.w_0', initializer=param_initializer),
-            bias_attr=name + '_fc_1.b_0')
+        self._i2h = Linear(input_dim=d_model,
+                           output_dim=d_inner_hid,
+                           param_attr=fluid.ParamAttr(
+                               name=name + '_fc_0.w_0',
+                               initializer=param_initializer),
+                           bias_attr=name + '_fc_0.b_0',
+                           act=hidden_act)
+
+        self._h2o = Linear(input_dim=d_inner_hid,
+                           output_dim=d_model,
+                           param_attr=fluid.ParamAttr(
+                               name=name + '_fc_1.w_0',
+                               initializer=param_initializer),
+                           bias_attr=name + '_fc_1.b_0')
 
         self._dropout_rate = dropout_rate
 
     def forward(self, x):
         hidden = self._i2h(x)
         if self._dropout_rate:
-            hidden = fluid.layers.dropout(
-                hidden, dropout_prob=self._dropout_rate, is_test=False)
+            hidden = fluid.layers.dropout(hidden,
+                                          dropout_prob=self._dropout_rate,
+                                          is_test=False)
         out = self._h2o(hidden)
         return out
 
 
 class EncoderSubLayer(Layer):
+
     def __init__(self,
                  hidden_act,
                  n_head,
@@ -78,8 +81,9 @@ def __init__(self,
         self._preprocess_cmd = preprocess_cmd
         self._postprocess_cmd = postprocess_cmd
         self._prepostprocess_dropout = prepostprocess_dropout
-        self._preprocess_layer = PrePostProcessLayer(
-            self._preprocess_cmd, d_model, prepostprocess_dropout)
+        self._preprocess_layer = PrePostProcessLayer(self._preprocess_cmd,
+                                                     d_model,
+                                                     prepostprocess_dropout)
         self._multihead_attention_layer = MultiHeadAttention(
             d_key, d_value, d_model, n_head, attention_dropout,
             param_initializer)
@@ -108,6 +112,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class EncoderLayer(Layer):
+
     def __init__(self,
                  hidden_act,
                  n_layer,
@@ -137,20 +142,19 @@ def __init__(self,
             self._encoder_sublayers.append(
                 self.add_sublayer(
                     'esl_%d' % i,
-                    EncoderSubLayer(
-                        hidden_act,
-                        n_head,
-                        d_key,
-                        d_value,
-                        d_model,
-                        d_inner_hid,
-                        prepostprocess_dropout,
-                        attention_dropout,
-                        relu_dropout,
-                        preprocess_cmd,
-                        postprocess_cmd,
-                        param_initializer,
-                        name=name + '_layer_' + str(i))))
+                    EncoderSubLayer(hidden_act,
+                                    n_head,
+                                    d_key,
+                                    d_value,
+                                    d_model,
+                                    d_inner_hid,
+                                    prepostprocess_dropout,
+                                    attention_dropout,
+                                    relu_dropout,
+                                    preprocess_cmd,
+                                    postprocess_cmd,
+                                    param_initializer,
+                                    name=name + '_layer_' + str(i))))
 
     def forward(self, enc_input, attn_bias):
         for i in range(self._n_layer):
@@ -161,6 +165,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class BertModelLayer(Layer):
+
     def __init__(self, config, return_pooled_out=True, use_fp16=False):
         super(BertModelLayer, self).__init__()
 
@@ -183,31 +188,31 @@ def __init__(self, config, return_pooled_out=True, use_fp16=False):
         self._param_initializer = fluid.initializer.TruncatedNormal(
             scale=config['initializer_range'])
 
-        self._src_emb = Embedding(
-            size=[self._voc_size, self._emb_size],
-            param_attr=fluid.ParamAttr(
-                name=self._word_emb_name, initializer=self._param_initializer),
-            dtype=self._dtype)
+        self._src_emb = Embedding(size=[self._voc_size, self._emb_size],
+                                  param_attr=fluid.ParamAttr(
+                                      name=self._word_emb_name,
+                                      initializer=self._param_initializer),
+                                  dtype=self._dtype)
 
         self._pos_emb = Embedding(
             size=[self._max_position_seq_len, self._emb_size],
-            param_attr=fluid.ParamAttr(
-                name=self._pos_emb_name, initializer=self._param_initializer),
+            param_attr=fluid.ParamAttr(name=self._pos_emb_name,
+                                       initializer=self._param_initializer),
             dtype=self._dtype)
 
-        self._sent_emb = Embedding(
-            size=[self._sent_types, self._emb_size],
-            param_attr=fluid.ParamAttr(
-                name=self._sent_emb_name, initializer=self._param_initializer),
-            dtype=self._dtype)
+        self._sent_emb = Embedding(size=[self._sent_types, self._emb_size],
+                                   param_attr=fluid.ParamAttr(
+                                       name=self._sent_emb_name,
+                                       initializer=self._param_initializer),
+                                   dtype=self._dtype)
 
-        self.pooled_fc = Linear(
-            input_dim=self._emb_size,
-            output_dim=self._emb_size,
-            param_attr=fluid.ParamAttr(
-                name="pooled_fc.w_0", initializer=self._param_initializer),
-            bias_attr="pooled_fc.b_0",
-            act="tanh")
+        self.pooled_fc = Linear(input_dim=self._emb_size,
+                                output_dim=self._emb_size,
+                                param_attr=fluid.ParamAttr(
+                                    name="pooled_fc.w_0",
+                                    initializer=self._param_initializer),
+                                bias_attr="pooled_fc.b_0",
+                                act="tanh")
 
         self.pre_process_layer = PrePostProcessLayer(
             "nd", self._emb_size, self._prepostprocess_dropout)
@@ -237,12 +242,16 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask):
 
         emb_out = self.pre_process_layer(emb_out)
 
-        self_attn_mask = fluid.layers.matmul(
-            x=input_mask, y=input_mask, transpose_y=True)
-        self_attn_mask = fluid.layers.scale(
-            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
-        n_head_self_attn_mask = fluid.layers.stack(
-            x=[self_attn_mask] * self._n_head, axis=1)
+        self_attn_mask = fluid.layers.matmul(x=input_mask,
+                                             y=input_mask,
+                                             transpose_y=True)
+        self_attn_mask = fluid.layers.scale(x=self_attn_mask,
+                                            scale=10000.0,
+                                            bias=-1.0,
+                                            bias_after_scale=False)
+        n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] *
+                                                   self._n_head,
+                                                   axis=1)
         n_head_self_attn_mask.stop_gradient = True
 
         enc_output = self._encoder(emb_out, n_head_self_attn_mask)
@@ -252,16 +261,19 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask):
         #
         #if not self.return_pooled_out:
         #    return enc_output
-        next_sent_feat = fluid.layers.slice(
-            input=enc_output, axes=[1], starts=[0], ends=[1])
+        next_sent_feat = fluid.layers.slice(input=enc_output,
+                                            axes=[1],
+                                            starts=[0],
+                                            ends=[1])
         next_sent_feat = self.pooled_fc(next_sent_feat)
-        next_sent_feat = fluid.layers.reshape(
-            next_sent_feat, shape=[-1, self._emb_size])
+        next_sent_feat = fluid.layers.reshape(next_sent_feat,
+                                              shape=[-1, self._emb_size])
 
         return enc_output, next_sent_feat
 
 
 class PretrainModelLayer(Layer):
+
     def __init__(self,
                  config,
                  return_pooled_out=True,
@@ -281,33 +293,32 @@ def __init__(self,
         self.use_fp16 = use_fp16
         self._dtype = "float16" if use_fp16 else "float32"
 
-        self.bert_layer = BertModelLayer(
-            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+        self.bert_layer = BertModelLayer(config=self.config,
+                                         return_pooled_out=True,
+                                         use_fp16=self.use_fp16)
 
         self.pre_process_layer = PrePostProcessLayer(
             "n", self._emb_size, self._prepostprocess_dropout)
 
-        self.pooled_fc = Linear(
-            input_dim=self._emb_size,
-            output_dim=self._emb_size,
-            param_attr=fluid.ParamAttr(
-                name="mask_lm_trans_fc.w_0",
-                initializer=self._param_initializer),
-            bias_attr="mask_lm_trans_fc.b_0",
-            act="tanh")
+        self.pooled_fc = Linear(input_dim=self._emb_size,
+                                output_dim=self._emb_size,
+                                param_attr=fluid.ParamAttr(
+                                    name="mask_lm_trans_fc.w_0",
+                                    initializer=self._param_initializer),
+                                bias_attr="mask_lm_trans_fc.b_0",
+                                act="tanh")
 
         self.mask_lm_out_bias_attr = fluid.ParamAttr(
             name="mask_lm_out_fc.b_0",
             initializer=fluid.initializer.Constant(value=0.0))
 
         if not self._weight_sharing:
-            self.out_fc = Linear(
-                input_dim=self._emb_size,
-                output_dim=self._voc_size,
-                param_attr=fluid.ParamAttr(
-                    name="mask_lm_out_fc.w_0",
-                    initializer=self._param_initializer),
-                bias_attr=self.mask_lm_out_bias_attr)
+            self.out_fc = Linear(input_dim=self._emb_size,
+                                 output_dim=self._voc_size,
+                                 param_attr=fluid.ParamAttr(
+                                     name="mask_lm_out_fc.w_0",
+                                     initializer=self._param_initializer),
+                                 bias_attr=self.mask_lm_out_bias_attr)
         else:
             self.fc_create_params = self.create_parameter(
                 shape=[self._voc_size],
@@ -315,12 +326,12 @@ def __init__(self,
                 attr=self.mask_lm_out_bias_attr,
                 is_bias=True)
 
-        self.next_sent_fc = Linear(
-            input_dim=self._emb_size,
-            output_dim=2,
-            param_attr=fluid.ParamAttr(
-                name="next_sent_fc.w_0", initializer=self._param_initializer),
-            bias_attr="next_sent_fc.b_0")
+        self.next_sent_fc = Linear(input_dim=self._emb_size,
+                                   output_dim=2,
+                                   param_attr=fluid.ParamAttr(
+                                       name="next_sent_fc.w_0",
+                                       initializer=self._param_initializer),
+                                   bias_attr="next_sent_fc.b_0")
 
     @declarative
     def forward(self, src_ids, position_ids, sentence_ids, input_mask,
@@ -329,24 +340,23 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask,
 
         enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
                                                      sentence_ids, input_mask)
-        reshaped_emb_out = fluid.layers.reshape(
-            x=enc_output, shape=[-1, self._emb_size])
+        reshaped_emb_out = fluid.layers.reshape(x=enc_output,
+                                                shape=[-1, self._emb_size])
 
         mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
         mask_trans_feat = self.pooled_fc(mask_feat)
         mask_trans_feat = self.pre_process_layer(mask_trans_feat)
 
         if self._weight_sharing:
-            fc_out = fluid.layers.matmul(
-                x=mask_trans_feat,
-                y=self.bert_layer._src_emb._w,
-                transpose_y=True)
+            fc_out = fluid.layers.matmul(x=mask_trans_feat,
+                                         y=self.bert_layer._src_emb._w,
+                                         transpose_y=True)
             fc_out += self.fc_create_params
         else:
             fc_out = self.out_fc(mask_trans_feat)
 
-        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
-            logits=fc_out, label=mask_label)
+        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out,
+                                                               label=mask_label)
         mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
 
         next_sent_fc_out = self.next_sent_fc(next_sent_feat)
@@ -354,8 +364,8 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask,
         next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
             logits=next_sent_fc_out, label=labels, return_softmax=True)
 
-        next_sent_acc = fluid.layers.accuracy(
-            input=next_sent_softmax, label=labels)
+        next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax,
+                                              label=labels)
 
         mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py
index a18bb34e18282..53996775a61f6 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 import random
+
 SEED = 2020
 
 
@@ -88,9 +89,8 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
 
         # ensure at least mask one word in a sentence
         while not mask_flag:
-            token_index = int(
-                self_random.randint(
-                    1, high=len(sent) - 1, size=1))
+            token_index = int(self_random.randint(1, high=len(sent) - 1,
+                                                  size=1))
             if sent[token_index] != SEP and sent[token_index] != CLS:
                 mask_label.append(sent[token_index])
                 sent[token_index] = MASK
@@ -132,8 +132,8 @@ def pad_batch_data(insts,
 
     if return_input_mask:
         # This is used to avoid attention on paddings.
-        input_mask_data = np.array([[1] * len(inst) + [0] *
-                                    (max_len - len(inst)) for inst in insts])
+        input_mask_data = np.array(
+            [[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
         input_mask_data = np.expand_dims(input_mask_data, axis=-1)
         return_list += [input_mask_data.astype("float32")]
 
@@ -177,28 +177,26 @@ def prepare_batch_data(insts,
 
     # First step: do mask without padding
     if mask_id >= 0:
-        out, mask_label, mask_pos = mask(
-            batch_src_ids,
-            total_token_num,
-            vocab_size=voc_size,
-            CLS=cls_id,
-            SEP=sep_id,
-            MASK=mask_id)
+        out, mask_label, mask_pos = mask(batch_src_ids,
+                                         total_token_num,
+                                         vocab_size=voc_size,
+                                         CLS=cls_id,
+                                         SEP=sep_id,
+                                         MASK=mask_id)
     else:
         out = batch_src_ids
     # Second step: padding
-    src_id, self_input_mask = pad_batch_data(
-        out, pad_idx=pad_id, return_input_mask=True)
-    pos_id = pad_batch_data(
-        batch_pos_ids,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
-    sent_id = pad_batch_data(
-        batch_sent_ids,
-        pad_idx=pad_id,
-        return_pos=False,
-        return_input_mask=False)
+    src_id, self_input_mask = pad_batch_data(out,
+                                             pad_idx=pad_id,
+                                             return_input_mask=True)
+    pos_id = pad_batch_data(batch_pos_ids,
+                            pad_idx=pad_id,
+                            return_pos=False,
+                            return_input_mask=False)
+    sent_id = pad_batch_data(batch_sent_ids,
+                             pad_idx=pad_id,
+                             return_pos=False,
+                             return_input_mask=False)
 
     if mask_id >= 0:
         return_list = [
@@ -212,6 +210,7 @@ def prepare_batch_data(insts,
 
 
 class DataReader(object):
+
     def __init__(self,
                  batch_size=4096,
                  in_tokens=True,
@@ -268,7 +267,9 @@ def build_fake_data(self):
             yield token_ids, sent_ids, pos_ids, label
 
     def data_generator(self):
+
         def wrapper():
+
             def reader():
                 for epoch in range(self.epoch):
                     self.current_epoch = epoch + 1
@@ -292,25 +293,25 @@ def batch_reader(reader, batch_size, in_tokens):
                         total_token_num += len(token_ids)
                     else:
                         yield batch, total_token_num
-                        batch, total_token_num, max_len = [parsed_line], len(
-                            token_ids), len(token_ids)
+                        batch, total_token_num, max_len = [
+                            parsed_line
+                        ], len(token_ids), len(token_ids)
 
                 if len(batch) > 0:
                     yield batch, total_token_num
 
             for batch_data, total_token_num in batch_reader(
                     reader, self.batch_size, self.in_tokens):
-                yield prepare_batch_data(
-                    batch_data,
-                    total_token_num,
-                    voc_size=self.voc_size,
-                    pad_id=self.pad_id,
-                    cls_id=self.cls_id,
-                    sep_id=self.sep_id,
-                    mask_id=self.mask_id,
-                    return_input_mask=True,
-                    return_max_len=False,
-                    return_num_token=False)
+                yield prepare_batch_data(batch_data,
+                                         total_token_num,
+                                         voc_size=self.voc_size,
+                                         pad_id=self.pad_id,
+                                         cls_id=self.cls_id,
+                                         sep_id=self.sep_id,
+                                         mask_id=self.mask_id,
+                                         return_input_mask=True,
+                                         return_max_len=False,
+                                         return_num_token=False)
 
         return wrapper
 
@@ -325,12 +326,11 @@ class ModelHyperParams(object):
 
 def get_feed_data_reader(bert_config):
     args = ModelHyperParams()
-    data_reader = DataReader(
-        batch_size=args.batch_size,
-        in_tokens=args.in_tokens,
-        voc_size=bert_config['vocab_size'],
-        epoch=args.epoch,
-        max_seq_len=args.max_seq_len,
-        generate_neg_sample=args.generate_neg_sample)
+    data_reader = DataReader(batch_size=args.batch_size,
+                             in_tokens=args.in_tokens,
+                             voc_size=bert_config['vocab_size'],
+                             epoch=args.epoch,
+                             max_seq_len=args.max_seq_len,
+                             generate_neg_sample=args.generate_neg_sample)
 
     return data_reader
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py
index cdf478f87107a..58482bb977136 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py
@@ -21,6 +21,7 @@
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  ch_in,
                  ch_out,
@@ -32,26 +33,23 @@ def __init__(self,
                  is_test=True):
         super(ConvBNLayer, self).__init__()
 
-        self.conv = Conv2D(
-            num_channels=ch_in,
-            num_filters=ch_out,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            groups=groups,
-            param_attr=ParamAttr(
-                initializer=fluid.initializer.Normal(0., 0.02)),
-            bias_attr=False,
-            act=None)
+        self.conv = Conv2D(num_channels=ch_in,
+                           num_filters=ch_out,
+                           filter_size=filter_size,
+                           stride=stride,
+                           padding=padding,
+                           groups=groups,
+                           param_attr=ParamAttr(
+                               initializer=fluid.initializer.Normal(0., 0.02)),
+                           bias_attr=False,
+                           act=None)
         self.batch_norm = BatchNorm(
             num_channels=ch_out,
             is_test=is_test,
-            param_attr=ParamAttr(
-                initializer=fluid.initializer.Normal(0., 0.02),
-                regularizer=L2Decay(0.)),
-            bias_attr=ParamAttr(
-                initializer=fluid.initializer.Constant(0.0),
-                regularizer=L2Decay(0.)))
+            param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02),
+                                 regularizer=L2Decay(0.)),
+            bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0),
+                                regularizer=L2Decay(0.)))
 
         self.act = act
 
@@ -64,6 +62,7 @@ def forward(self, inputs):
 
 
 class DownSample(fluid.dygraph.Layer):
+
     def __init__(self,
                  ch_in,
                  ch_out,
@@ -74,13 +73,12 @@ def __init__(self,
 
         super(DownSample, self).__init__()
 
-        self.conv_bn_layer = ConvBNLayer(
-            ch_in=ch_in,
-            ch_out=ch_out,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            is_test=is_test)
+        self.conv_bn_layer = ConvBNLayer(ch_in=ch_in,
+                                         ch_out=ch_out,
+                                         filter_size=filter_size,
+                                         stride=stride,
+                                         padding=padding,
+                                         is_test=is_test)
         self.ch_out = ch_out
 
     def forward(self, inputs):
@@ -89,23 +87,22 @@ def forward(self, inputs):
 
 
 class BasicBlock(fluid.dygraph.Layer):
+
     def __init__(self, ch_in, ch_out, is_test=True):
         super(BasicBlock, self).__init__()
 
-        self.conv1 = ConvBNLayer(
-            ch_in=ch_in,
-            ch_out=ch_out,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            is_test=is_test)
-        self.conv2 = ConvBNLayer(
-            ch_in=ch_out,
-            ch_out=ch_out * 2,
-            filter_size=3,
-            stride=1,
-            padding=1,
-            is_test=is_test)
+        self.conv1 = ConvBNLayer(ch_in=ch_in,
+                                 ch_out=ch_out,
+                                 filter_size=1,
+                                 stride=1,
+                                 padding=0,
+                                 is_test=is_test)
+        self.conv2 = ConvBNLayer(ch_in=ch_out,
+                                 ch_out=ch_out * 2,
+                                 filter_size=3,
+                                 stride=1,
+                                 padding=1,
+                                 is_test=is_test)
 
     def forward(self, inputs):
         conv1 = self.conv1(inputs)
@@ -115,6 +112,7 @@ def forward(self, inputs):
 
 
 class LayerWarp(fluid.dygraph.Layer):
+
     def __init__(self, ch_in, ch_out, count, is_test=True):
         super(LayerWarp, self).__init__()
 
@@ -123,8 +121,7 @@ def __init__(self, ch_in, ch_out, count, is_test=True):
         for i in range(1, count):
             res_out = self.add_sublayer(
                 "basic_block_%d" % (i),
-                BasicBlock(
-                    ch_out * 2, ch_out, is_test=is_test))
+                BasicBlock(ch_out * 2, ch_out, is_test=is_test))
             self.res_out_list.append(res_out)
         self.ch_out = ch_out
 
@@ -139,18 +136,18 @@ def forward(self, inputs):
 
 
 class DarkNet53_conv_body(fluid.dygraph.Layer):
+
     def __init__(self, ch_in=3, is_test=True):
         super(DarkNet53_conv_body, self).__init__()
         self.stages = DarkNet_cfg[53]
         self.stages = self.stages[0:5]
 
-        self.conv0 = ConvBNLayer(
-            ch_in=ch_in,
-            ch_out=32,
-            filter_size=3,
-            stride=1,
-            padding=1,
-            is_test=is_test)
+        self.conv0 = ConvBNLayer(ch_in=ch_in,
+                                 ch_out=32,
+                                 filter_size=3,
+                                 stride=1,
+                                 padding=1,
+                                 is_test=is_test)
 
         self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2, is_test=is_test)
         self.darknet53_conv_block_list = []
@@ -159,16 +156,14 @@ def __init__(self, ch_in=3, is_test=True):
         for i, stage in enumerate(self.stages):
             conv_block = self.add_sublayer(
                 "stage_%d" % (i),
-                LayerWarp(
-                    int(ch_in[i]), 32 * (2**i), stage, is_test=is_test))
+                LayerWarp(int(ch_in[i]), 32 * (2**i), stage, is_test=is_test))
             self.darknet53_conv_block_list.append(conv_block)
         for i in range(len(self.stages) - 1):
             downsample = self.add_sublayer(
                 "stage_%d_downsample" % i,
-                DownSample(
-                    ch_in=32 * (2**(i + 1)),
-                    ch_out=32 * (2**(i + 2)),
-                    is_test=is_test))
+                DownSample(ch_in=32 * (2**(i + 1)),
+                           ch_out=32 * (2**(i + 2)),
+                           is_test=is_test))
             self.downsample_list.append(downsample)
 
     def forward(self, inputs):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py
index ecb7d7f6bd19c..0c7d2903c3625 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py
@@ -103,8 +103,7 @@ def false_fn_0(q, x, y):
 def dyfunc_with_if_else_with_list_geneator(x):
     if 10 > 5:
         y = paddle.add_n(
-            [paddle.full(
-                shape=[2], fill_value=v) for v in range(5)])
+            [paddle.full(shape=[2], fill_value=v) for v in range(5)])
     else:
         y = x
     return y
@@ -131,8 +130,9 @@ def nested_if_else(x_v):
             if fluid.layers.mean(y).numpy()[0] < batch_size:
                 y = fluid.layers.abs(y)
             else:
-                tmp = fluid.layers.fill_constant(
-                    [feat_size], dtype='float32', value=-1)
+                tmp = fluid.layers.fill_constant([feat_size],
+                                                 dtype='float32',
+                                                 value=-1)
                 y = y - tmp
     else:
         y = x_v - bias
@@ -148,13 +148,15 @@ def nested_if_else_2(x):
     x_shape_0 = x.shape[0]
     if x_shape_0 < 1:
         if fluid.layers.shape(y).numpy()[0] < 1:
-            res = fluid.layers.fill_constant(
-                value=2, shape=x.shape, dtype="int32")
+            res = fluid.layers.fill_constant(value=2,
+                                             shape=x.shape,
+                                             dtype="int32")
             # `z` is a new var here.
             z = y + 1
         else:
-            res = fluid.layers.fill_constant(
-                value=3, shape=x.shape, dtype="int32")
+            res = fluid.layers.fill_constant(value=3,
+                                             shape=x.shape,
+                                             dtype="int32")
     else:
         res = x
     return res
@@ -179,29 +181,32 @@ def nested_if_else_3(x):
     else:
         y_shape = fluid.layers.shape(y)
         if y_shape.numpy()[0] < 1:
-            res = fluid.layers.fill_constant(
-                value=2, shape=x.shape, dtype="int32")
+            res = fluid.layers.fill_constant(value=2,
+                                             shape=x.shape,
+                                             dtype="int32")
             # `z` is created in above code block.
             z = y + 1
         else:
-            res = fluid.layers.fill_constant(
-                value=3, shape=x.shape, dtype="int32")
+            res = fluid.layers.fill_constant(value=3,
+                                             shape=x.shape,
+                                             dtype="int32")
             # `out` is a new var.
             out = x + 1
     return res
 
 
 class NetWithControlFlowIf(fluid.dygraph.Layer):
+
     def __init__(self, hidden_dim=16):
         super(NetWithControlFlowIf, self).__init__()
         self.hidden_dim = hidden_dim
         self.fc = fluid.dygraph.Linear(
             input_dim=hidden_dim,
             output_dim=5,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.99)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.5)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.99)),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.5)))
         self.alpha = 10.
         self.constant_vars = {}
 
@@ -210,11 +215,12 @@ def forward(self, input):
         hidden_dim = input.shape[-1]
         if hidden_dim != self.hidden_dim:
             raise ValueError(
-                "hidden_dim {} of input is not equal to FC.weight[0]: {}"
-                .format(hidden_dim, self.hidden_dim))
+                "hidden_dim {} of input is not equal to FC.weight[0]: {}".
+                format(hidden_dim, self.hidden_dim))
 
-        self.constant_vars['bias'] = fluid.layers.fill_constant(
-            [5], dtype='float32', value=1)
+        self.constant_vars['bias'] = fluid.layers.fill_constant([5],
+                                                                dtype='float32',
+                                                                value=1)
         # Control flow `if` statement
         fc_out = self.fc(input)
         if fluid.layers.mean(fc_out).numpy()[0] < 0:
@@ -233,8 +239,9 @@ def forward(self, input):
                         [hidden_dim], dtype='float32', value=9)
                     y = fluid.layers.abs(y)
                 else:
-                    tmp = fluid.layers.fill_constant(
-                        [5], dtype='float32', value=-1)
+                    tmp = fluid.layers.fill_constant([5],
+                                                     dtype='float32',
+                                                     value=-1)
                     y = y - tmp
         else:
             y = fc_out - self.constant_vars['bias']
@@ -245,8 +252,8 @@ def forward(self, input):
 
 def if_with_and_or(x_v, label=None):
     batch_size = fluid.layers.shape(x_v)
-    if x_v is not None and (fluid.layers.mean(x_v).numpy()[0] > 0 or
-                            label is not None) and batch_size[0] > 1 and True:
+    if x_v is not None and (fluid.layers.mean(x_v).numpy()[0] > 0 or label
+                            is not None) and batch_size[0] > 1 and True:
         x_v = x_v - 1
     else:
         x_v = x_v + 1
@@ -289,17 +296,19 @@ def if_with_and_or_3(x, y=None):
 def if_with_and_or_4(x, y=None):
     batch_size = fluid.layers.shape(x)
     mean_res = fluid.layers.mean(x)
-    if (x is not None and batch_size[0] > 1) or (y is not None and
-                                                 mean_res.numpy()[0] > 0):
+    if (x is not None and batch_size[0] > 1) or (y is not None
+                                                 and mean_res.numpy()[0] > 0):
         x = x + 1
-    if (x is not None or batch_size[0] > 1) and (y is not None or
-                                                 mean_res.numpy()[0] > 0):
+    if (x is not None or batch_size[0] > 1) and (y is not None
+                                                 or mean_res.numpy()[0] > 0):
         x = x - 1
     return x
 
 
 def if_with_class_var(x, y=None):
+
     class Foo(object):
+
         def __init__(self):
             self.a = 1
             self.b = 2
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
index 0a3be4478125c..8c7f301e9ed55 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
@@ -29,11 +29,13 @@
 
 INF = 1. * 1e5
 alpha = 0.6
-uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
+uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x,
+                                                                     high=x)
 zero_constant = fluid.initializer.Constant(0.0)
 
 
 class BasicLSTMUnit(Layer):
+
     def __init__(self,
                  hidden_size,
                  input_size,
@@ -59,11 +61,10 @@ def __init__(self,
             shape=[self._input_size + self._hiden_size, 4 * self._hiden_size],
             dtype=self._dtype)
 
-        self._bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[4 * self._hiden_size],
-            dtype=self._dtype,
-            is_bias=True)
+        self._bias = self.create_parameter(attr=self._bias_attr,
+                                           shape=[4 * self._hiden_size],
+                                           dtype=self._dtype,
+                                           is_bias=True)
 
     def forward(self, input, pre_hidden, pre_cell):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
@@ -82,6 +83,7 @@ def forward(self, input, pre_hidden, pre_cell):
 
 
 class BaseModel(fluid.dygraph.Layer):
+
     def __init__(self,
                  hidden_size,
                  src_vocab_size,
@@ -130,30 +132,27 @@ def __init__(self,
             self.enc_units.append(
                 self.add_sublayer(
                     "enc_units_%d" % i,
-                    BasicLSTMUnit(
-                        hidden_size=self.hidden_size,
-                        input_size=self.hidden_size,
-                        param_attr=param_attr,
-                        bias_attr=bias_attr,
-                        forget_bias=forget_bias)))
+                    BasicLSTMUnit(hidden_size=self.hidden_size,
+                                  input_size=self.hidden_size,
+                                  param_attr=param_attr,
+                                  bias_attr=bias_attr,
+                                  forget_bias=forget_bias)))
 
         self.dec_units = []
         for i in range(num_layers):
             self.dec_units.append(
                 self.add_sublayer(
                     "dec_units_%d" % i,
-                    BasicLSTMUnit(
-                        hidden_size=self.hidden_size,
-                        input_size=self.hidden_size,
-                        param_attr=param_attr,
-                        bias_attr=bias_attr,
-                        forget_bias=forget_bias)))
-
-        self.fc = fluid.dygraph.nn.Linear(
-            self.hidden_size,
-            self.tar_vocab_size,
-            param_attr=param_attr,
-            bias_attr=False)
+                    BasicLSTMUnit(hidden_size=self.hidden_size,
+                                  input_size=self.hidden_size,
+                                  param_attr=param_attr,
+                                  bias_attr=bias_attr,
+                                  forget_bias=forget_bias)))
+
+        self.fc = fluid.dygraph.nn.Linear(self.hidden_size,
+                                          self.tar_vocab_size,
+                                          param_attr=param_attr,
+                                          bias_attr=False)
 
     def _transpose_batch_time(self, x):
         return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
@@ -191,25 +190,26 @@ def forward(self, inputs):
         # NOTE: modify model code about `enc_hidden` and `enc_cell` to transforme dygraph code successfully.
         # Because nested list can't be transformed now.
         enc_hidden_0 = to_variable(
-            np.zeros(
-                (self.batch_size, self.hidden_size), dtype='float32'))
+            np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
         enc_cell_0 = to_variable(
-            np.zeros(
-                (self.batch_size, self.hidden_size), dtype='float32'))
+            np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
         zero = fluid.layers.zeros(shape=[1], dtype="int64")
         enc_hidden = fluid.layers.create_array(dtype="float32")
         enc_cell = fluid.layers.create_array(dtype="float32")
         for i in range(self.num_layers):
             index = zero + i
-            enc_hidden = fluid.layers.array_write(
-                enc_hidden_0, index, array=enc_hidden)
-            enc_cell = fluid.layers.array_write(
-                enc_cell_0, index, array=enc_cell)
+            enc_hidden = fluid.layers.array_write(enc_hidden_0,
+                                                  index,
+                                                  array=enc_hidden)
+            enc_cell = fluid.layers.array_write(enc_cell_0,
+                                                index,
+                                                array=enc_cell)
 
         max_seq_len = src_emb.shape[0]
 
-        enc_len_mask = fluid.layers.sequence_mask(
-            src_sequence_length, maxlen=max_seq_len, dtype="float32")
+        enc_len_mask = fluid.layers.sequence_mask(src_sequence_length,
+                                                  maxlen=max_seq_len,
+                                                  dtype="float32")
         enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
 
         # TODO: Because diff exits if call while_loop in static graph.
@@ -221,8 +221,9 @@ def forward(self, inputs):
             step_mask = enc_len_mask[k]
             new_enc_hidden, new_enc_cell = [], []
             for i in range(self.num_layers):
-                enc_new_hidden, enc_new_cell = self.enc_units[i](
-                    enc_step_input, enc_hidden[i], enc_cell[i])
+                enc_new_hidden, enc_new_cell = self.enc_units[i](enc_step_input,
+                                                                 enc_hidden[i],
+                                                                 enc_cell[i])
                 if self.dropout != None and self.dropout > 0.0:
                     enc_step_input = fluid.layers.dropout(
                         enc_new_hidden,
@@ -247,8 +248,9 @@ def forward(self, inputs):
             step_input = tar_emb[j]
             new_dec_hidden, new_dec_cell = [], []
             for i in range(self.num_layers):
-                new_hidden, new_cell = self.dec_units[i](
-                    step_input, dec_hidden[i], dec_cell[i])
+                new_hidden, new_cell = self.dec_units[i](step_input,
+                                                         dec_hidden[i],
+                                                         dec_cell[i])
                 new_dec_hidden.append(new_hidden)
                 new_dec_cell.append(new_cell)
                 if self.dropout != None and self.dropout > 0.0:
@@ -262,12 +264,14 @@ def forward(self, inputs):
 
         dec_output = fluid.layers.stack(dec_output)
         dec_output = self.fc(self._transpose_batch_time(dec_output))
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=dec_output, label=label, soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=dec_output,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.squeeze(loss, axes=[2])
         max_tar_seq_len = fluid.layers.shape(tar)[1]
-        tar_mask = fluid.layers.sequence_mask(
-            tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32')
+        tar_mask = fluid.layers.sequence_mask(tar_sequence_length,
+                                              maxlen=max_tar_seq_len,
+                                              dtype='float32')
         loss = loss * tar_mask
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -282,25 +286,26 @@ def beam_search(self, inputs):
 
         src_emb = self.src_embeder(self._transpose_batch_time(src))
         enc_hidden_0 = to_variable(
-            np.zeros(
-                (self.batch_size, self.hidden_size), dtype='float32'))
+            np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
         enc_cell_0 = to_variable(
-            np.zeros(
-                (self.batch_size, self.hidden_size), dtype='float32'))
+            np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
         zero = fluid.layers.zeros(shape=[1], dtype="int64")
         enc_hidden = fluid.layers.create_array(dtype="float32")
         enc_cell = fluid.layers.create_array(dtype="float32")
         for j in range(self.num_layers):
             index = zero + j
-            enc_hidden = fluid.layers.array_write(
-                enc_hidden_0, index, array=enc_hidden)
-            enc_cell = fluid.layers.array_write(
-                enc_cell_0, index, array=enc_cell)
+            enc_hidden = fluid.layers.array_write(enc_hidden_0,
+                                                  index,
+                                                  array=enc_hidden)
+            enc_cell = fluid.layers.array_write(enc_cell_0,
+                                                index,
+                                                array=enc_cell)
 
         max_seq_len = src_emb.shape[0]
 
-        enc_len_mask = fluid.layers.sequence_mask(
-            src_sequence_length, maxlen=max_seq_len, dtype="float32")
+        enc_len_mask = fluid.layers.sequence_mask(src_sequence_length,
+                                                  maxlen=max_seq_len,
+                                                  dtype="float32")
         enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
 
         for k in range(args.max_seq_len):
@@ -310,8 +315,9 @@ def beam_search(self, inputs):
             new_enc_hidden, new_enc_cell = [], []
 
             for i in range(self.num_layers):
-                enc_new_hidden, enc_new_cell = self.enc_units[i](
-                    enc_step_input, enc_hidden[i], enc_cell[i])
+                enc_new_hidden, enc_new_cell = self.enc_units[i](enc_step_input,
+                                                                 enc_hidden[i],
+                                                                 enc_cell[i])
                 if self.dropout != None and self.dropout > 0.0:
                     enc_step_input = fluid.layers.dropout(
                         enc_new_hidden,
@@ -329,21 +335,18 @@ def beam_search(self, inputs):
 
         # beam search
         batch_beam_shape = (self.batch_size, self.beam_size)
-        vocab_size_tensor = to_variable(np.full((
-            1), self.tar_vocab_size)).astype("int64")
+        vocab_size_tensor = to_variable(np.full(
+            (1), self.tar_vocab_size)).astype("int64")
         start_token_tensor = to_variable(
-            np.full(
-                batch_beam_shape, self.beam_start_token, dtype='int64'))
+            np.full(batch_beam_shape, self.beam_start_token, dtype='int64'))
         end_token_tensor = to_variable(
-            np.full(
-                batch_beam_shape, self.beam_end_token, dtype='int64'))
+            np.full(batch_beam_shape, self.beam_end_token, dtype='int64'))
         step_input = self.tar_embeder(start_token_tensor)
         beam_finished = to_variable(
-            np.full(
-                batch_beam_shape, 0, dtype='float32'))
+            np.full(batch_beam_shape, 0, dtype='float32'))
         beam_state_log_probs = to_variable(
-            np.array(
-                [[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32"))
+            np.array([[0.] + [-self.kinf] * (self.beam_size - 1)],
+                     dtype="float32"))
         beam_state_log_probs = fluid.layers.expand(beam_state_log_probs,
                                                    [self.batch_size, 1])
         dec_hidden, dec_cell = enc_hidden, enc_cell
@@ -352,9 +355,8 @@ def beam_search(self, inputs):
 
         batch_pos = fluid.layers.expand(
             fluid.layers.unsqueeze(
-                to_variable(np.arange(
-                    0, self.batch_size, 1, dtype="int64")), [1]),
-            [1, self.beam_size])
+                to_variable(np.arange(0, self.batch_size, 1, dtype="int64")),
+                [1]), [1, self.beam_size])
         predicted_ids = []
         parent_ids = []
 
@@ -370,8 +372,9 @@ def beam_search(self, inputs):
             dec_cell = [self._merge_batch_beams(state) for state in dec_cell]
 
             for i in range(self.num_layers):
-                new_hidden, new_cell = self.dec_units[i](
-                    step_input, dec_hidden[i], dec_cell[i])
+                new_hidden, new_cell = self.dec_units[i](step_input,
+                                                         dec_hidden[i],
+                                                         dec_cell[i])
                 new_dec_hidden.append(new_hidden)
                 new_dec_cell.append(new_cell)
                 if self.dropout != None and self.dropout > 0.0:
@@ -389,24 +392,24 @@ def beam_search(self, inputs):
             noend_array = [-self.kinf] * self.tar_vocab_size
             noend_array[self.beam_end_token] = 0
             noend_mask_tensor = to_variable(
-                np.array(
-                    noend_array, dtype='float32'))
+                np.array(noend_array, dtype='float32'))
 
             step_log_probs = fluid.layers.elementwise_mul(
                 fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]),
                 noend_mask_tensor, axis=-1) - \
                              fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0)
-            log_probs = fluid.layers.elementwise_add(
-                x=step_log_probs, y=beam_state_log_probs, axis=0)
+            log_probs = fluid.layers.elementwise_add(x=step_log_probs,
+                                                     y=beam_state_log_probs,
+                                                     axis=0)
             scores = fluid.layers.reshape(
                 log_probs, [-1, self.beam_size * self.tar_vocab_size])
-            topk_scores, topk_indices = fluid.layers.topk(
-                input=scores, k=self.beam_size)
+            topk_scores, topk_indices = fluid.layers.topk(input=scores,
+                                                          k=self.beam_size)
 
-            beam_indices = fluid.layers.elementwise_floordiv(topk_indices,
-                                                             vocab_size_tensor)
-            token_indices = fluid.layers.elementwise_mod(topk_indices,
-                                                         vocab_size_tensor)
+            beam_indices = fluid.layers.elementwise_floordiv(
+                topk_indices, vocab_size_tensor)
+            token_indices = fluid.layers.elementwise_mod(
+                topk_indices, vocab_size_tensor)
             next_log_probs = self._gather(scores, topk_indices, batch_pos)
 
             x = 0
@@ -451,6 +454,7 @@ def beam_search(self, inputs):
 
 
 class AttentionModel(fluid.dygraph.Layer):
+
     def __init__(self,
                  hidden_size,
                  src_vocab_size,
@@ -501,12 +505,11 @@ def __init__(self,
             self.enc_units.append(
                 self.add_sublayer(
                     "enc_units_%d" % i,
-                    BasicLSTMUnit(
-                        hidden_size=self.hidden_size,
-                        input_size=self.hidden_size,
-                        param_attr=param_attr,
-                        bias_attr=bias_attr,
-                        forget_bias=forget_bias)))
+                    BasicLSTMUnit(hidden_size=self.hidden_size,
+                                  input_size=self.hidden_size,
+                                  param_attr=param_attr,
+                                  bias_attr=bias_attr,
+                                  forget_bias=forget_bias)))
 
         self.dec_units = []
         for i in range(num_layers):
@@ -514,52 +517,50 @@ def __init__(self,
                 self.dec_units.append(
                     self.add_sublayer(
                         "dec_units_%d" % i,
-                        BasicLSTMUnit(
-                            hidden_size=self.hidden_size,
-                            input_size=self.hidden_size * 2,
-                            param_attr=ParamAttr(
-                                name="dec_units_%d" % i,
-                                initializer=uniform_initializer(
-                                    self.init_scale)),
-                            bias_attr=bias_attr,
-                            forget_bias=forget_bias)))
+                        BasicLSTMUnit(hidden_size=self.hidden_size,
+                                      input_size=self.hidden_size * 2,
+                                      param_attr=ParamAttr(
+                                          name="dec_units_%d" % i,
+                                          initializer=uniform_initializer(
+                                              self.init_scale)),
+                                      bias_attr=bias_attr,
+                                      forget_bias=forget_bias)))
             else:
                 self.dec_units.append(
                     self.add_sublayer(
                         "dec_units_%d" % i,
-                        BasicLSTMUnit(
-                            hidden_size=self.hidden_size,
-                            input_size=self.hidden_size,
-                            param_attr=ParamAttr(
-                                name="dec_units_%d" % i,
-                                initializer=uniform_initializer(
-                                    self.init_scale)),
-                            bias_attr=bias_attr,
-                            forget_bias=forget_bias)))
+                        BasicLSTMUnit(hidden_size=self.hidden_size,
+                                      input_size=self.hidden_size,
+                                      param_attr=ParamAttr(
+                                          name="dec_units_%d" % i,
+                                          initializer=uniform_initializer(
+                                              self.init_scale)),
+                                      bias_attr=bias_attr,
+                                      forget_bias=forget_bias)))
 
         self.attn_fc = fluid.dygraph.nn.Linear(
             self.hidden_size,
             self.hidden_size,
-            param_attr=ParamAttr(
-                name="self_attn_fc",
-                initializer=uniform_initializer(self.init_scale)),
+            param_attr=ParamAttr(name="self_attn_fc",
+                                 initializer=uniform_initializer(
+                                     self.init_scale)),
             bias_attr=False)
 
         self.concat_fc = fluid.dygraph.nn.Linear(
             2 * self.hidden_size,
             self.hidden_size,
-            param_attr=ParamAttr(
-                name="self_concat_fc",
-                initializer=uniform_initializer(self.init_scale)),
+            param_attr=ParamAttr(name="self_concat_fc",
+                                 initializer=uniform_initializer(
+                                     self.init_scale)),
             bias_attr=False)
 
-        self.fc = fluid.dygraph.nn.Linear(
-            self.hidden_size,
-            self.tar_vocab_size,
-            param_attr=ParamAttr(
-                name="self_fc",
-                initializer=uniform_initializer(self.init_scale)),
-            bias_attr=False)
+        self.fc = fluid.dygraph.nn.Linear(self.hidden_size,
+                                          self.tar_vocab_size,
+                                          param_attr=ParamAttr(
+                                              name="self_fc",
+                                              initializer=uniform_initializer(
+                                                  self.init_scale)),
+                                          bias_attr=False)
 
     def _transpose_batch_time(self, x):
         return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
@@ -572,15 +573,16 @@ def tile_beam_merge_with_batch(self, x):
         expand_times = [1] * len(x.shape)
         expand_times[1] = self.beam_size
         x = fluid.layers.expand(x, expand_times)  # [batch_size, beam_size, ...]
-        x = fluid.layers.transpose(x, list(range(2, len(x.shape))) +
+        x = fluid.layers.transpose(x,
+                                   list(range(2, len(x.shape))) +
                                    [0, 1])  # [..., batch_size, beam_size]
         # use 0 to copy to avoid wrong shape
-        x = fluid.layers.reshape(
-            x, shape=[0] *
-            (len(x.shape) - 2) + [-1])  # [..., batch_size * beam_size]
+        x = fluid.layers.reshape(x, shape=[0] * (len(x.shape) - 2) +
+                                 [-1])  # [..., batch_size * beam_size]
         x = fluid.layers.transpose(
             x, [len(x.shape) - 1] +
-            list(range(0, len(x.shape) - 1)))  # [batch_size * beam_size, ...]
+            list(range(0,
+                       len(x.shape) - 1)))  # [batch_size * beam_size, ...]
         return x
 
     def _split_batch_beams(self, x):
@@ -635,27 +637,28 @@ def forward(self, inputs):
         # NOTE: modify model code about `enc_hidden` and `enc_cell` to transforme dygraph code successfully.
         # Because nested list can't be transformed now.
         enc_hidden_0 = to_variable(
-            np.zeros(
-                (self.batch_size, self.hidden_size), dtype='float32'))
+            np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
         enc_hidden_0.stop_gradient = True
         enc_cell_0 = to_variable(
-            np.zeros(
-                (self.batch_size, self.hidden_size), dtype='float32'))
+            np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
         enc_hidden_0.stop_gradient = True
         zero = fluid.layers.zeros(shape=[1], dtype="int64")
         enc_hidden = fluid.layers.create_array(dtype="float32")
         enc_cell = fluid.layers.create_array(dtype="float32")
         for i in range(self.num_layers):
             index = zero + i
-            enc_hidden = fluid.layers.array_write(
-                enc_hidden_0, index, array=enc_hidden)
-            enc_cell = fluid.layers.array_write(
-                enc_cell_0, index, array=enc_cell)
+            enc_hidden = fluid.layers.array_write(enc_hidden_0,
+                                                  index,
+                                                  array=enc_hidden)
+            enc_cell = fluid.layers.array_write(enc_cell_0,
+                                                index,
+                                                array=enc_cell)
 
         max_seq_len = src_emb.shape[0]
 
-        enc_len_mask = fluid.layers.sequence_mask(
-            src_sequence_length, maxlen=max_seq_len, dtype="float32")
+        enc_len_mask = fluid.layers.sequence_mask(src_sequence_length,
+                                                  maxlen=max_seq_len,
+                                                  dtype="float32")
         enc_padding_mask = (enc_len_mask - 1.0)
         enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
 
@@ -669,8 +672,9 @@ def forward(self, inputs):
             step_mask = enc_len_mask[k]
             new_enc_hidden, new_enc_cell = [], []
             for i in range(self.num_layers):
-                enc_new_hidden, enc_new_cell = self.enc_units[i](
-                    enc_step_input, enc_hidden[i], enc_cell[i])
+                enc_new_hidden, enc_new_cell = self.enc_units[i](enc_step_input,
+                                                                 enc_hidden[i],
+                                                                 enc_cell[i])
                 if self.dropout != None and self.dropout > 0.0:
                     enc_step_input = fluid.layers.dropout(
                         enc_new_hidden,
@@ -691,8 +695,7 @@ def forward(self, inputs):
 
         # train
         input_feed = to_variable(
-            np.zeros(
-                (self.batch_size, self.hidden_size), dtype='float32'))
+            np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
         # NOTE: set stop_gradient here, otherwise grad var is null
         input_feed.stop_gradient = True
         dec_hidden, dec_cell = enc_hidden, enc_cell
@@ -706,8 +709,9 @@ def forward(self, inputs):
             step_input = fluid.layers.concat([step_input, input_feed], 1)
             new_dec_hidden, new_dec_cell = [], []
             for i in range(self.num_layers):
-                new_hidden, new_cell = self.dec_units[i](
-                    step_input, dec_hidden[i], dec_cell[i])
+                new_hidden, new_cell = self.dec_units[i](step_input,
+                                                         dec_hidden[i],
+                                                         dec_cell[i])
                 new_dec_hidden.append(new_hidden)
                 new_dec_cell.append(new_cell)
                 if self.dropout != None and self.dropout > 0.0:
@@ -727,12 +731,14 @@ def forward(self, inputs):
 
         dec_output = fluid.layers.stack(dec_output)
         dec_output = self.fc(self._transpose_batch_time(dec_output))
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=dec_output, label=label, soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=dec_output,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.squeeze(loss, axes=[2])
         max_tar_seq_len = fluid.layers.shape(tar)[1]
-        tar_mask = fluid.layers.sequence_mask(
-            tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32')
+        tar_mask = fluid.layers.sequence_mask(tar_sequence_length,
+                                              maxlen=max_tar_seq_len,
+                                              dtype='float32')
         loss = loss * tar_mask
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py
index 821fea3a67ddb..34e9aaffdcfb3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py
@@ -18,6 +18,7 @@
 from __future__ import print_function
 
 import numpy as np
+
 SEED = 2020
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
index affec2f7dfefc..a2ec446c7286e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
@@ -41,12 +41,12 @@ def ops(self):
         """
         # TODO(huihuangzheng): The original code set the is_sparse=True, but it
         # causes crush in dy2stat. Set it to True after fixing it.
-        emb = Embedding(
-            size=[self.dict_size, self.emb_dim],
-            is_sparse=True,
-            padding_idx=self.padding_idx,
-            param_attr=attr.ParamAttr(
-                name=self.name, initializer=fluid.initializer.Xavier()))
+        emb = Embedding(size=[self.dict_size, self.emb_dim],
+                        is_sparse=True,
+                        padding_idx=self.padding_idx,
+                        param_attr=attr.ParamAttr(
+                            name=self.name,
+                            initializer=fluid.initializer.Xavier()))
 
         return emb
 
@@ -327,8 +327,8 @@ def __init__(self,
 
     def _build_once(self, input):
         i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
+        for inp, param in self._helper.iter_inputs_and_params(
+                input, self._param_attr):
             input_shape = inp.shape
 
             param_shape = [
@@ -338,16 +338,17 @@ def _build_once(self, input):
             self.__w.append(
                 self.add_parameter(
                     '_w%d' % i,
-                    self.create_parameter(
-                        attr=param,
-                        shape=param_shape,
-                        dtype=self._dtype,
-                        is_bias=False)))
+                    self.create_parameter(attr=param,
+                                          shape=param_shape,
+                                          dtype=self._dtype,
+                                          is_bias=False)))
             i += 1
 
         size = list([self._size])
-        self._b = self.create_parameter(
-            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
+        self._b = self.create_parameter(attr=self._bias_attr,
+                                        shape=size,
+                                        dtype=self._dtype,
+                                        is_bias=True)
 
     # TODO(songyouwei): We should remove _w property
     @property
@@ -382,18 +383,19 @@ def bias(self, value):
     def forward(self, input):
         mul_results = list()
         i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
+        for inp, param in self._helper.iter_inputs_and_params(
+                input, self._param_attr):
             tmp = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="mul",
-                inputs={"X": inp,
-                        "Y": self.__w[i]},
-                outputs={"Out": tmp},
-                attrs={
-                    "x_num_col_dims": self._num_flatten_dims,
-                    "y_num_col_dims": 1
-                })
+            self._helper.append_op(type="mul",
+                                   inputs={
+                                       "X": inp,
+                                       "Y": self.__w[i]
+                                   },
+                                   outputs={"Out": tmp},
+                                   attrs={
+                                       "x_num_col_dims": self._num_flatten_dims,
+                                       "y_num_col_dims": 1
+                                   })
             i += 1
             mul_results.append(tmp)
 
@@ -402,21 +404,21 @@ def forward(self, input):
         else:
             pre_bias = self._helper.create_variable_for_type_inference(
                 self._dtype)
-            self._helper.append_op(
-                type="sum",
-                inputs={"X": mul_results},
-                outputs={"Out": pre_bias},
-                attrs={"use_mkldnn": False})
+            self._helper.append_op(type="sum",
+                                   inputs={"X": mul_results},
+                                   outputs={"Out": pre_bias},
+                                   attrs={"use_mkldnn": False})
 
         if self._b is not None:
             pre_activation = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._b]},
-                outputs={'Out': [pre_activation]},
-                attrs={'axis': self._num_flatten_dims})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [pre_bias],
+                                       'Y': [self._b]
+                                   },
+                                   outputs={'Out': [pre_activation]},
+                                   attrs={'axis': self._num_flatten_dims})
         else:
             pre_activation = pre_bias
         # Currently, we don't support inplace in dygraph mode
@@ -482,10 +484,10 @@ def forward(self, left, right):
         # embedding layer
         left_emb = self.emb_layer(left)
         right_emb = self.emb_layer(right)
-        left_emb = fluid.layers.reshape(
-            left_emb, shape=[-1, self.seq_len, self.bow_dim])
-        right_emb = fluid.layers.reshape(
-            right_emb, shape=[-1, self.seq_len, self.bow_dim])
+        left_emb = fluid.layers.reshape(left_emb,
+                                        shape=[-1, self.seq_len, self.bow_dim])
+        right_emb = fluid.layers.reshape(right_emb,
+                                         shape=[-1, self.seq_len, self.bow_dim])
 
         bow_left = fluid.layers.reduce_sum(left_emb, dim=1)
         bow_right = fluid.layers.reduce_sum(right_emb, dim=1)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py
index 5cbaeb0f4046e..4d9193830137d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py
@@ -313,8 +313,8 @@ def __init__(self,
 
     def _build_once(self, input):
         i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
+        for inp, param in self._helper.iter_inputs_and_params(
+                input, self._param_attr):
             input_shape = inp.shape
 
             param_shape = [
@@ -324,16 +324,17 @@ def _build_once(self, input):
             self.__w.append(
                 self.add_parameter(
                     '_w%d' % i,
-                    self.create_parameter(
-                        attr=param,
-                        shape=param_shape,
-                        dtype=self._dtype,
-                        is_bias=False)))
+                    self.create_parameter(attr=param,
+                                          shape=param_shape,
+                                          dtype=self._dtype,
+                                          is_bias=False)))
             i += 1
 
         size = list([self._size])
-        self._b = self.create_parameter(
-            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
+        self._b = self.create_parameter(attr=self._bias_attr,
+                                        shape=size,
+                                        dtype=self._dtype,
+                                        is_bias=True)
 
     # TODO(songyouwei): We should remove _w property
     @property
@@ -368,18 +369,19 @@ def bias(self, value):
     def forward(self, input):
         mul_results = list()
         i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
+        for inp, param in self._helper.iter_inputs_and_params(
+                input, self._param_attr):
             tmp = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="mul",
-                inputs={"X": inp,
-                        "Y": self.__w[i]},
-                outputs={"Out": tmp},
-                attrs={
-                    "x_num_col_dims": self._num_flatten_dims,
-                    "y_num_col_dims": 1
-                })
+            self._helper.append_op(type="mul",
+                                   inputs={
+                                       "X": inp,
+                                       "Y": self.__w[i]
+                                   },
+                                   outputs={"Out": tmp},
+                                   attrs={
+                                       "x_num_col_dims": self._num_flatten_dims,
+                                       "y_num_col_dims": 1
+                                   })
             i += 1
             mul_results.append(tmp)
 
@@ -388,21 +390,21 @@ def forward(self, input):
         else:
             pre_bias = self._helper.create_variable_for_type_inference(
                 self._dtype)
-            self._helper.append_op(
-                type="sum",
-                inputs={"X": mul_results},
-                outputs={"Out": pre_bias},
-                attrs={"use_mkldnn": False})
+            self._helper.append_op(type="sum",
+                                   inputs={"X": mul_results},
+                                   outputs={"Out": pre_bias},
+                                   attrs={"use_mkldnn": False})
 
         if self._b is not None:
             pre_activation = self._helper.create_variable_for_type_inference(
                 dtype=self._dtype)
-            self._helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [self._b]},
-                outputs={'Out': [pre_activation]},
-                attrs={'axis': self._num_flatten_dims})
+            self._helper.append_op(type='elementwise_add',
+                                   inputs={
+                                       'X': [pre_bias],
+                                       'Y': [self._b]
+                                   },
+                                   outputs={'Out': [pre_activation]},
+                                   attrs={'axis': self._num_flatten_dims})
         else:
             pre_activation = pre_bias
         # Currently, we don't support inplace in dygraph mode
@@ -455,8 +457,8 @@ def __init__(self, conf_dict):
         self.seq_len = conf_dict["seq_len"]
         self.emb_layer = EmbeddingLayer(self.dict_size, self.emb_dim,
                                         "emb").ops()
-        self.bow_layer = paddle.nn.Linear(
-            in_features=self.bow_dim, out_features=self.bow_dim)
+        self.bow_layer = paddle.nn.Linear(in_features=self.bow_dim,
+                                          out_features=self.bow_dim)
         self.bow_layer_po = FCLayer(self.bow_dim, None, "fc").ops()
         self.softmax_layer = FCLayer(2, "softmax", "cos_sim").ops()
 
@@ -469,10 +471,10 @@ def forward(self, left, right):
         # embedding layer
         left_emb = self.emb_layer(left)
         right_emb = self.emb_layer(right)
-        left_emb = paddle.reshape(
-            left_emb, shape=[-1, self.seq_len, self.bow_dim])
-        right_emb = paddle.reshape(
-            right_emb, shape=[-1, self.seq_len, self.bow_dim])
+        left_emb = paddle.reshape(left_emb,
+                                  shape=[-1, self.seq_len, self.bow_dim])
+        right_emb = paddle.reshape(right_emb,
+                                   shape=[-1, self.seq_len, self.bow_dim])
 
         bow_left = paddle.fluid.layers.reduce_sum(left_emb, dim=1)
         bow_right = paddle.fluid.layers.reduce_sum(right_emb, dim=1)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
index d4646833ea2bd..e2f8f69fa83eb 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_assert.py
@@ -35,6 +35,7 @@ def dyfunc_assert_non_variable(x=True):
 
 
 class TestAssertVariable(unittest.TestCase):
+
     def _run(self, func, x, with_exception, to_static):
         ProgramTranslator().enable(to_static)
         if with_exception:
@@ -50,22 +51,28 @@ def _run_dy_static(self, func, x, with_exception):
         self._run(func, x, with_exception, False)
 
     def test_non_variable(self):
-        self._run_dy_static(
-            dyfunc_assert_non_variable, x=False, with_exception=True)
-        self._run_dy_static(
-            dyfunc_assert_non_variable, x=True, with_exception=False)
+        self._run_dy_static(dyfunc_assert_non_variable,
+                            x=False,
+                            with_exception=True)
+        self._run_dy_static(dyfunc_assert_non_variable,
+                            x=True,
+                            with_exception=False)
 
     def test_bool_variable(self):
-        self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([False]), with_exception=True)
-        self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([True]), with_exception=False)
+        self._run_dy_static(dyfunc_assert_variable,
+                            x=numpy.array([False]),
+                            with_exception=True)
+        self._run_dy_static(dyfunc_assert_variable,
+                            x=numpy.array([True]),
+                            with_exception=False)
 
     def test_int_variable(self):
-        self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([0]), with_exception=True)
-        self._run_dy_static(
-            dyfunc_assert_variable, x=numpy.array([1]), with_exception=False)
+        self._run_dy_static(dyfunc_assert_variable,
+                            x=numpy.array([0]),
+                            with_exception=True)
+        self._run_dy_static(dyfunc_assert_variable,
+                            x=numpy.array([1]),
+                            with_exception=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py
index 31a50226f0b79..00eb25792b2d2 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py
@@ -38,6 +38,7 @@ def _ast2func(self, func):
         return transformed_func
 
     def test_ast2func(self):
+
         def func(x, y):
             return x + y
 
@@ -55,6 +56,7 @@ def test_ast2func_dygraph(self):
                 self.assertTrue((true_ret == test_ret).all())
 
     def test_ast2func_static(self):
+
         def func(x):
             y = fluid.layers.relu(x)
             loss = fluid.layers.mean(y)
@@ -73,8 +75,8 @@ def func(x):
     def test_ast2func_error(self):
         with self.assertRaises(Exception) as e:
             self.assertRaises(TypeError, ast_to_func("x = a + b", 'foo'))
-        self.assertTrue("Type of ast_root should be gast.AST or ast.AST" in
-                        str(e.exception))
+        self.assertTrue("Type of ast_root should be gast.AST or ast.AST" in str(
+            e.exception))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py
index b86b85bb90ff6..b818ed95a24a6 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py
@@ -72,6 +72,7 @@ def dyfunc_bool_to_tensor(x):
 
 
 class TestDygraphBasicApi_ToVariable(unittest.TestCase):
+
     def setUp(self):
         self.input = np.ones(5).astype("int32")
         self.test_funcs = [
@@ -79,8 +80,8 @@ def setUp(self):
             dyfunc_float_to_tensor, dyfunc_to_variable, dyfunc_to_variable_2,
             dyfunc_to_variable_3
         ]
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
     def get_dygraph_output(self):
         with fluid.dygraph.guard():
@@ -103,10 +104,9 @@ def test_transformed_static_result(self):
             self.dygraph_func = func
             dygraph_res = self.get_dygraph_output()
             static_res = self.get_static_output()
-            self.assertTrue(
-                np.allclose(dygraph_res, static_res),
-                msg='dygraph is {}\n static_res is {}'.format(dygraph_res,
-                                                              static_res))
+            self.assertTrue(np.allclose(dygraph_res, static_res),
+                            msg='dygraph is {}\n static_res is {}'.format(
+                                dygraph_res, static_res))
 
 
 # 1. test Apis that inherit from layers.Layer
@@ -115,14 +115,13 @@ def dyfunc_BilinearTensorProduct(layer1, layer2):
         input1_dim=5,
         input2_dim=4,
         output_dim=1000,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.99)),
-        bias_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.5)))
-
-    res = bilinearTensorProduct(
-        fluid.dygraph.base.to_variable(layer1),
-        fluid.dygraph.base.to_variable(layer2))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.99)),
+        bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5)))
+
+    res = bilinearTensorProduct(fluid.dygraph.base.to_variable(layer1),
+                                fluid.dygraph.base.to_variable(layer2))
     return res
 
 
@@ -131,10 +130,11 @@ def dyfunc_Conv2D(input):
         num_channels=3,
         num_filters=2,
         filter_size=3,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.99)),
-        bias_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.5)), )
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.99)),
+        bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5)),
+    )
     res = conv2d(input)
     return res
 
@@ -144,10 +144,11 @@ def dyfunc_Conv3D(input):
         num_channels=3,
         num_filters=2,
         filter_size=3,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.99)),
-        bias_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.5)), )
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.99)),
+        bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5)),
+    )
     res = conv3d(input)
     return res
 
@@ -158,10 +159,11 @@ def dyfunc_Conv2DTranspose(input):
         num_filters=12,
         filter_size=12,
         use_cudnn=False,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.99)),
-        bias_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.5)), )
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.99)),
+        bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5)),
+    )
     ret = conv2dTranspose(input)
     return ret
 
@@ -172,10 +174,11 @@ def dyfunc_Conv3DTranspose(input):
         num_filters=12,
         filter_size=12,
         use_cudnn=False,
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.99)),
-        bias_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.5)), )
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.99)),
+        bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5)),
+    )
     ret = conv3dTranspose(input)
     return ret
 
@@ -185,19 +188,24 @@ def dyfunc_Linear(input):
         input_dim=10,
         output_dim=5,
         act='relu',
-        param_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.99)),
-        bias_attr=fluid.ParamAttr(
-            initializer=fluid.initializer.Constant(value=0.5)), )
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.99)),
+        bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.5)),
+    )
     res = fc(input)
     return res
 
 
 def dyfunc_Pool2D(input):
-    fluid.dygraph.Pool2D(
-        pool_size=2, pool_type='avg', pool_stride=1, global_pooling=False)
-    pool2d = fluid.dygraph.Pool2D(
-        pool_size=2, pool_type='avg', pool_stride=1, global_pooling=False)
+    fluid.dygraph.Pool2D(pool_size=2,
+                         pool_type='avg',
+                         pool_stride=1,
+                         global_pooling=False)
+    pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                  pool_type='avg',
+                                  pool_stride=1,
+                                  global_pooling=False)
     res = pool2d(input)
     return res
 
@@ -244,13 +252,13 @@ def get_static_output(self):
     def test_transformed_static_result(self):
         dygraph_res = self.get_dygraph_output()
         static_res = self.get_static_output()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_res,
-                                                            static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph is {}\n static_res is \n{}'.format(
+                            dygraph_res, static_res))
 
 
 class TestDygraphBasicApi_BilinearTensorProduct(TestDygraphBasicApi):
+
     def setUp(self):
         self.input1 = np.random.random((5, 5)).astype('float32')
         self.input2 = np.random.random((5, 4)).astype('float32')
@@ -279,36 +287,42 @@ def get_static_output(self):
 
 
 class TestDygraphBasicApi_Conv2D(TestDygraphBasicApi):
+
     def setUp(self):
         self.input = np.random.random((1, 3, 3, 5)).astype('float32')
         self.dygraph_func = dyfunc_Conv2D
 
 
 class TestDygraphBasicApi_Conv3D(TestDygraphBasicApi):
+
     def setUp(self):
         self.input = np.random.random((1, 3, 3, 3, 5)).astype('float32')
         self.dygraph_func = dyfunc_Conv3D
 
 
 class TestDygraphBasicApi_Conv2DTranspose(TestDygraphBasicApi):
+
     def setUp(self):
         self.input = np.random.random((5, 3, 32, 32)).astype('float32')
         self.dygraph_func = dyfunc_Conv2DTranspose
 
 
 class TestDygraphBasicApi_Conv3DTranspose(TestDygraphBasicApi):
+
     def setUp(self):
         self.input = np.random.random((5, 3, 12, 32, 32)).astype('float32')
         self.dygraph_func = dyfunc_Conv3DTranspose
 
 
 class TestDygraphBasicApi_Linear(TestDygraphBasicApi):
+
     def setUp(self):
         self.input = np.random.random((4, 3, 10)).astype('float32')
         self.dygraph_func = dyfunc_Linear
 
 
 class TestDygraphBasicApi_Prelu(TestDygraphBasicApi):
+
     def setUp(self):
         self.input = np.ones([5, 20, 10, 10]).astype('float32')
         self.dygraph_func = dyfunc_Prelu
@@ -317,41 +331,39 @@ def setUp(self):
 # 2. test Apis that inherit from LearningRateDecay
 def dyfunc_CosineDecay():
     base_lr = 0.1
-    CosineDecay = fluid.dygraph.CosineDecay(
-        learning_rate=base_lr, step_each_epoch=10000, epochs=120)
+    CosineDecay = fluid.dygraph.CosineDecay(learning_rate=base_lr,
+                                            step_each_epoch=10000,
+                                            epochs=120)
     lr = CosineDecay()
     return lr
 
 
 def dyfunc_ExponentialDecay():
     base_lr = 0.1
-    exponential_decay = fluid.dygraph.ExponentialDecay(
-        learning_rate=base_lr,
-        decay_steps=10000,
-        decay_rate=0.5,
-        staircase=True)
+    exponential_decay = fluid.dygraph.ExponentialDecay(learning_rate=base_lr,
+                                                       decay_steps=10000,
+                                                       decay_rate=0.5,
+                                                       staircase=True)
     lr = exponential_decay()
     return lr
 
 
 def dyfunc_InverseTimeDecay():
     base_lr = 0.1
-    inverse_time_decay = fluid.dygraph.InverseTimeDecay(
-        learning_rate=base_lr,
-        decay_steps=10000,
-        decay_rate=0.5,
-        staircase=True)
+    inverse_time_decay = fluid.dygraph.InverseTimeDecay(learning_rate=base_lr,
+                                                        decay_steps=10000,
+                                                        decay_rate=0.5,
+                                                        staircase=True)
     lr = inverse_time_decay()
     return lr
 
 
 def dyfunc_NaturalExpDecay():
     base_lr = 0.1
-    natural_exp_decay = fluid.dygraph.NaturalExpDecay(
-        learning_rate=base_lr,
-        decay_steps=10000,
-        decay_rate=0.5,
-        staircase=True)
+    natural_exp_decay = fluid.dygraph.NaturalExpDecay(learning_rate=base_lr,
+                                                      decay_steps=10000,
+                                                      decay_rate=0.5,
+                                                      staircase=True)
     lr = natural_exp_decay()
     return lr
 
@@ -380,6 +392,7 @@ def dyfunc_PolynomialDecay():
 
 
 class TestDygraphBasicApi_CosineDecay(unittest.TestCase):
+
     def setUp(self):
         self.dygraph_func = dyfunc_CosineDecay
 
@@ -406,38 +419,43 @@ def get_static_output(self):
     def test_transformed_static_result(self):
         dygraph_res = self.get_dygraph_output()
         static_res = self.get_static_output()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_res,
-                                                            static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph is {}\n static_res is \n{}'.format(
+                            dygraph_res, static_res))
 
 
 class TestDygraphBasicApi_ExponentialDecay(TestDygraphBasicApi_CosineDecay):
+
     def setUp(self):
         self.dygraph_func = dyfunc_ExponentialDecay
 
 
 class TestDygraphBasicApi_InverseTimeDecay(TestDygraphBasicApi_CosineDecay):
+
     def setUp(self):
         self.dygraph_func = dyfunc_InverseTimeDecay
 
 
 class TestDygraphBasicApi_NaturalExpDecay(TestDygraphBasicApi_CosineDecay):
+
     def setUp(self):
         self.dygraph_func = dyfunc_NaturalExpDecay
 
 
 class TestDygraphBasicApi_NoamDecay(TestDygraphBasicApi_CosineDecay):
+
     def setUp(self):
         self.dygraph_func = dyfunc_NoamDecay
 
 
 class TestDygraphBasicApi_PiecewiseDecay(TestDygraphBasicApi_CosineDecay):
+
     def setUp(self):
         self.dygraph_func = dyfunc_PiecewiseDecay
 
 
 class TestDygraphBasicApi_PolynomialDecay(TestDygraphBasicApi_CosineDecay):
+
     def setUp(self):
         self.dygraph_func = dyfunc_PolynomialDecay
 
@@ -451,6 +469,7 @@ def _dygraph_fn():
 
 
 class TestDygraphApiRecognition(unittest.TestCase):
+
     def setUp(self):
         self.src = inspect.getsource(_dygraph_fn)
         self.root = gast.parse(self.src)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
index db533e6379add..f26ed2a682391 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
@@ -29,14 +29,15 @@
 from predictor_utils import PredictorTools
 
 program_translator = ProgramTranslator()
-place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
-)
+place = fluid.CUDAPlace(
+    0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 SEED = 2020
 STEP_NUM = 10
 PRINT_STEP = 2
 
 
 class TestBert(unittest.TestCase):
+
     def setUp(self):
         self.bert_config = get_bert_config()
         self.data_reader = get_feed_data_reader(self.bert_config)
@@ -56,13 +57,14 @@ def train(self, bert_config, data_reader, to_static):
             fluid.default_main_program().random_seed = SEED
             fluid.default_startup_program().random_seed = SEED
 
-            data_loader = fluid.io.DataLoader.from_generator(
-                capacity=50, iterable=True)
-            data_loader.set_batch_generator(
-                data_reader.data_generator(), places=place)
+            data_loader = fluid.io.DataLoader.from_generator(capacity=50,
+                                                             iterable=True)
+            data_loader.set_batch_generator(data_reader.data_generator(),
+                                            places=place)
 
-            bert = PretrainModelLayer(
-                config=bert_config, weight_sharing=False, use_fp16=False)
+            bert = PretrainModelLayer(config=bert_config,
+                                      weight_sharing=False,
+                                      use_fp16=False)
 
             optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters())
             step_idx = 0
@@ -120,12 +122,11 @@ def predict_static(self, data):
         paddle.enable_static()
         exe = fluid.Executor(place)
         # load inference model
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.model_save_dir,
-             executor=exe,
-             model_filename=self.model_filename,
-             params_filename=self.params_filename)
+        [inference_program, feed_target_names, fetch_targets
+         ] = fluid.io.load_inference_model(self.model_save_dir,
+                                           executor=exe,
+                                           model_filename=self.model_filename,
+                                           params_filename=self.params_filename)
         pred_res = exe.run(inference_program,
                            feed=dict(zip(feed_target_names, data)),
                            fetch_list=fetch_targets)
@@ -135,8 +136,9 @@ def predict_static(self, data):
     def predict_dygraph(self, bert_config, data):
         program_translator.enable(False)
         with fluid.dygraph.guard(place):
-            bert = PretrainModelLayer(
-                config=bert_config, weight_sharing=False, use_fp16=False)
+            bert = PretrainModelLayer(config=bert_config,
+                                      weight_sharing=False,
+                                      use_fp16=False)
             model_dict, _ = fluid.dygraph.load_dygraph(
                 self.dy_state_dict_save_path)
 
@@ -145,14 +147,13 @@ def predict_dygraph(self, bert_config, data):
 
             input_vars = [fluid.dygraph.to_variable(x) for x in data]
             src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_vars
-            pred_res = bert(
-                src_ids=src_ids,
-                position_ids=pos_ids,
-                sentence_ids=sent_ids,
-                input_mask=input_mask,
-                mask_label=mask_label,
-                mask_pos=mask_pos,
-                labels=labels)
+            pred_res = bert(src_ids=src_ids,
+                            position_ids=pos_ids,
+                            sentence_ids=sent_ids,
+                            input_mask=input_mask,
+                            mask_label=mask_label,
+                            mask_pos=mask_pos,
+                            labels=labels)
             pred_res = [var.numpy() for var in pred_res]
 
             return pred_res
@@ -180,14 +181,12 @@ def test_train(self):
                                                     self.data_reader)
         dygraph_loss, dygraph_ppl = self.train_dygraph(self.bert_config,
                                                        self.data_reader)
-        self.assertTrue(
-            np.allclose(static_loss, dygraph_loss),
-            msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
-                                                             dygraph_loss))
-        self.assertTrue(
-            np.allclose(static_ppl, dygraph_ppl),
-            msg="static_ppl: {} \n dygraph_ppl: {}".format(static_ppl,
-                                                           dygraph_ppl))
+        self.assertTrue(np.allclose(static_loss, dygraph_loss),
+                        msg="static_loss: {} \n dygraph_loss: {}".format(
+                            static_loss, dygraph_loss))
+        self.assertTrue(np.allclose(static_ppl, dygraph_ppl),
+                        msg="static_ppl: {} \n dygraph_ppl: {}".format(
+                            static_ppl, dygraph_ppl))
 
         self.verify_predict()
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index bec9b35a7febb..14683b33feb37 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -94,6 +94,7 @@ def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
 
 
 class Conv1D(fluid.dygraph.Layer):
+
     def __init__(self,
                  prefix,
                  num_channels=256,
@@ -105,25 +106,22 @@ def __init__(self,
         super(Conv1D, self).__init__()
         fan_in = num_channels * size_k * 1
         k = 1. / math.sqrt(fan_in)
-        param_attr = ParamAttr(
-            name=prefix + "_w",
-            initializer=fluid.initializer.Uniform(
-                low=-k, high=k))
-        bias_attr = ParamAttr(
-            name=prefix + "_b",
-            initializer=fluid.initializer.Uniform(
-                low=-k, high=k))
-
-        self._conv2d = fluid.dygraph.Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=(1, size_k),
-            stride=1,
-            padding=(0, padding),
-            groups=groups,
-            act=act,
-            param_attr=param_attr,
-            bias_attr=bias_attr)
+        param_attr = ParamAttr(name=prefix + "_w",
+                               initializer=fluid.initializer.Uniform(low=-k,
+                                                                     high=k))
+        bias_attr = ParamAttr(name=prefix + "_b",
+                              initializer=fluid.initializer.Uniform(low=-k,
+                                                                    high=k))
+
+        self._conv2d = fluid.dygraph.Conv2D(num_channels=num_channels,
+                                            num_filters=num_filters,
+                                            filter_size=(1, size_k),
+                                            stride=1,
+                                            padding=(0, padding),
+                                            groups=groups,
+                                            act=act,
+                                            param_attr=param_attr,
+                                            bias_attr=bias_attr)
 
     def forward(self, x):
         x = fluid.layers.unsqueeze(input=x, axes=[2])
@@ -133,6 +131,7 @@ def forward(self, x):
 
 
 class BMN(fluid.dygraph.Layer):
+
     def __init__(self, cfg):
         super(BMN, self).__init__()
 
@@ -147,49 +146,50 @@ def __init__(self, cfg):
         self.hidden_dim_3d = 512
 
         # Base Module
-        self.b_conv1 = Conv1D(
-            prefix="Base_1",
-            num_channels=cfg.feat_dim,
-            num_filters=self.hidden_dim_1d,
-            size_k=3,
-            padding=1,
-            groups=4,
-            act="relu")
-        self.b_conv2 = Conv1D(
-            prefix="Base_2",
-            num_filters=self.hidden_dim_1d,
-            size_k=3,
-            padding=1,
-            groups=4,
-            act="relu")
+        self.b_conv1 = Conv1D(prefix="Base_1",
+                              num_channels=cfg.feat_dim,
+                              num_filters=self.hidden_dim_1d,
+                              size_k=3,
+                              padding=1,
+                              groups=4,
+                              act="relu")
+        self.b_conv2 = Conv1D(prefix="Base_2",
+                              num_filters=self.hidden_dim_1d,
+                              size_k=3,
+                              padding=1,
+                              groups=4,
+                              act="relu")
 
         # Temporal Evaluation Module
-        self.ts_conv1 = Conv1D(
-            prefix="TEM_s1",
-            num_filters=self.hidden_dim_1d,
-            size_k=3,
-            padding=1,
-            groups=4,
-            act="relu")
-        self.ts_conv2 = Conv1D(
-            prefix="TEM_s2", num_filters=1, size_k=1, padding=0, act="sigmoid")
-        self.te_conv1 = Conv1D(
-            prefix="TEM_e1",
-            num_filters=self.hidden_dim_1d,
-            size_k=3,
-            padding=1,
-            groups=4,
-            act="relu")
-        self.te_conv2 = Conv1D(
-            prefix="TEM_e2", num_filters=1, size_k=1, padding=0, act="sigmoid")
+        self.ts_conv1 = Conv1D(prefix="TEM_s1",
+                               num_filters=self.hidden_dim_1d,
+                               size_k=3,
+                               padding=1,
+                               groups=4,
+                               act="relu")
+        self.ts_conv2 = Conv1D(prefix="TEM_s2",
+                               num_filters=1,
+                               size_k=1,
+                               padding=0,
+                               act="sigmoid")
+        self.te_conv1 = Conv1D(prefix="TEM_e1",
+                               num_filters=self.hidden_dim_1d,
+                               size_k=3,
+                               padding=1,
+                               groups=4,
+                               act="relu")
+        self.te_conv2 = Conv1D(prefix="TEM_e2",
+                               num_filters=1,
+                               size_k=1,
+                               padding=0,
+                               act="sigmoid")
 
         #Proposal Evaluation Module
-        self.p_conv1 = Conv1D(
-            prefix="PEM_1d",
-            num_filters=self.hidden_dim_2d,
-            size_k=3,
-            padding=1,
-            act="relu")
+        self.p_conv1 = Conv1D(prefix="PEM_1d",
+                              num_filters=self.hidden_dim_2d,
+                              size_k=3,
+                              padding=1,
+                              act="relu")
 
         # init to speed up
         sample_mask = get_interp1d_mask(self.tscale, self.dscale,
@@ -263,8 +263,8 @@ def forward(self, x):
         xp = self.p_conv1(x)
         # BM layer
         xp = fluid.layers.matmul(xp, self.sample_mask)
-        xp = fluid.layers.reshape(
-            xp, shape=[0, 0, -1, self.dscale, self.tscale])
+        xp = fluid.layers.reshape(xp,
+                                  shape=[0, 0, -1, self.dscale, self.tscale])
 
         xp = self.p_conv3d1(xp)
         xp = fluid.layers.squeeze(xp, axes=[2])
@@ -277,6 +277,7 @@ def forward(self, x):
 
 def bmn_loss_func(pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end,
                   cfg):
+
     def _get_mask(cfg):
         dscale = cfg.dscale
         tscale = cfg.tscale
@@ -286,24 +287,29 @@ def _get_mask(cfg):
                            ] + [0 for i in range(idx)]
             bm_mask.append(mask_vector)
         bm_mask = np.array(bm_mask, dtype=np.float32)
-        self_bm_mask = fluid.layers.create_global_var(
-            shape=[dscale, tscale], value=0, dtype=DATATYPE, persistable=True)
+        self_bm_mask = fluid.layers.create_global_var(shape=[dscale, tscale],
+                                                      value=0,
+                                                      dtype=DATATYPE,
+                                                      persistable=True)
         fluid.layers.assign(bm_mask, self_bm_mask)
         self_bm_mask.stop_gradient = True
         return self_bm_mask
 
     def tem_loss_func(pred_start, pred_end, gt_start, gt_end):
+
         def bi_loss(pred_score, gt_label):
-            pred_score = fluid.layers.reshape(
-                x=pred_score, shape=[-1], inplace=False)
-            gt_label = fluid.layers.reshape(
-                x=gt_label, shape=[-1], inplace=False)
+            pred_score = fluid.layers.reshape(x=pred_score,
+                                              shape=[-1],
+                                              inplace=False)
+            gt_label = fluid.layers.reshape(x=gt_label,
+                                            shape=[-1],
+                                            inplace=False)
             gt_label.stop_gradient = True
             pmask = fluid.layers.cast(x=(gt_label > 0.5), dtype=DATATYPE)
-            num_entries = fluid.layers.cast(
-                fluid.layers.shape(pmask), dtype=DATATYPE)
-            num_positive = fluid.layers.cast(
-                fluid.layers.reduce_sum(pmask), dtype=DATATYPE)
+            num_entries = fluid.layers.cast(fluid.layers.shape(pmask),
+                                            dtype=DATATYPE)
+            num_positive = fluid.layers.cast(fluid.layers.reduce_sum(pmask),
+                                             dtype=DATATYPE)
             ratio = num_entries / num_positive
             coef_0 = 0.5 * ratio / (ratio - 1)
             coef_1 = 0.5 * ratio
@@ -334,26 +340,26 @@ def pem_reg_loss_func(pred_score, gt_iou_map, mask):
         u_lmask = fluid.layers.cast(x=u_lmask, dtype=DATATYPE)
         u_lmask = fluid.layers.elementwise_mul(u_lmask, mask)
 
-        num_h = fluid.layers.cast(
-            fluid.layers.reduce_sum(u_hmask), dtype=DATATYPE)
-        num_m = fluid.layers.cast(
-            fluid.layers.reduce_sum(u_mmask), dtype=DATATYPE)
-        num_l = fluid.layers.cast(
-            fluid.layers.reduce_sum(u_lmask), dtype=DATATYPE)
+        num_h = fluid.layers.cast(fluid.layers.reduce_sum(u_hmask),
+                                  dtype=DATATYPE)
+        num_m = fluid.layers.cast(fluid.layers.reduce_sum(u_mmask),
+                                  dtype=DATATYPE)
+        num_l = fluid.layers.cast(fluid.layers.reduce_sum(u_lmask),
+                                  dtype=DATATYPE)
 
         r_m = num_h / num_m
         u_smmask = fluid.layers.assign(
-            local_random.uniform(0., 1., [
-                gt_iou_map.shape[1], gt_iou_map.shape[2]
-            ]).astype(DATATYPE))
+            local_random.uniform(
+                0., 1.,
+                [gt_iou_map.shape[1], gt_iou_map.shape[2]]).astype(DATATYPE))
         u_smmask = fluid.layers.elementwise_mul(u_mmask, u_smmask)
         u_smmask = fluid.layers.cast(x=(u_smmask > (1. - r_m)), dtype=DATATYPE)
 
         r_l = num_h / num_l
         u_slmask = fluid.layers.assign(
-            local_random.uniform(0., 1., [
-                gt_iou_map.shape[1], gt_iou_map.shape[2]
-            ]).astype(DATATYPE))
+            local_random.uniform(
+                0., 1.,
+                [gt_iou_map.shape[1], gt_iou_map.shape[2]]).astype(DATATYPE))
         u_slmask = fluid.layers.elementwise_mul(u_lmask, u_slmask)
         u_slmask = fluid.layers.cast(x=(u_slmask > (1. - r_l)), dtype=DATATYPE)
 
@@ -388,12 +394,16 @@ def pem_cls_loss_func(pred_score, gt_iou_map, mask):
         loss = -1 * (loss_pos + loss_neg) / num_entries
         return loss
 
-    pred_bm_reg = fluid.layers.squeeze(
-        fluid.layers.slice(
-            pred_bm, axes=[1], starts=[0], ends=[1]), axes=[1])
-    pred_bm_cls = fluid.layers.squeeze(
-        fluid.layers.slice(
-            pred_bm, axes=[1], starts=[1], ends=[2]), axes=[1])
+    pred_bm_reg = fluid.layers.squeeze(fluid.layers.slice(pred_bm,
+                                                          axes=[1],
+                                                          starts=[0],
+                                                          ends=[1]),
+                                       axes=[1])
+    pred_bm_cls = fluid.layers.squeeze(fluid.layers.slice(pred_bm,
+                                                          axes=[1],
+                                                          starts=[1],
+                                                          ends=[2]),
+                                       axes=[1])
 
     bm_mask = _get_mask(cfg)
 
@@ -433,8 +443,7 @@ def optimizer(cfg, parameter_list):
     l2_weight_decay = cfg.l2_weight_decay
     lr = [base_lr, base_lr * lr_decay]
     optimizer = fluid.optimizer.Adam(
-        fluid.layers.piecewise_decay(
-            boundaries=bd, values=lr),
+        fluid.layers.piecewise_decay(boundaries=bd, values=lr),
         parameter_list=parameter_list,
         regularization=fluid.regularizer.L2DecayRegularizer(
             regularization_coeff=l2_weight_decay))
@@ -442,6 +451,7 @@ def optimizer(cfg, parameter_list):
 
 
 def fake_data_reader(args, mode='train'):
+
     def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
         """Compute jaccard score between a box and the anchors.
         """
@@ -517,8 +527,9 @@ def get_video_label(match_map, anchor_xmin, anchor_xmax):
         for jdx in range(len(anchor_xmin)):
             match_score_start.append(
                 np.max(
-                    ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[
-                        jdx], gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
+                    ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
+                                     gt_start_bboxs[:, 0], gt_start_bboxs[:,
+                                                                          1])))
         match_score_end = []
         for jdx in range(len(anchor_xmin)):
             match_score_end.append(
@@ -547,8 +558,8 @@ def reader():
                 batch_out.append(
                     (video_feat, gt_iou_map, gt_start, gt_end, video_idx))
             else:
-                raise NotImplementedError('mode {} not implemented'.format(
-                    mode))
+                raise NotImplementedError(
+                    'mode {} not implemented'.format(mode))
             if len(batch_out) == args.batch_size:
                 yield batch_out
                 batch_out = []
@@ -582,7 +593,9 @@ def val_bmn(model, args):
         avg_loss = fluid.layers.mean(loss)
 
         loss_data += [
-            avg_loss.numpy()[0], tem_loss.numpy()[0], pem_reg_loss.numpy()[0],
+            avg_loss.numpy()[0],
+            tem_loss.numpy()[0],
+            pem_reg_loss.numpy()[0],
             pem_cls_loss.numpy()[0]
         ]
 
@@ -597,6 +610,7 @@ def val_bmn(model, args):
 
 
 class TestTrain(unittest.TestCase):
+
     def setUp(self):
         self.args = Args()
         self.place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda() \
@@ -629,14 +643,14 @@ def train_bmn(self, args, place, to_static):
 
             for epoch in range(args.epoch):
                 for batch_id, data in enumerate(train_reader()):
-                    video_feat = np.array(
-                        [item[0] for item in data]).astype(DATATYPE)
-                    gt_iou_map = np.array(
-                        [item[1] for item in data]).astype(DATATYPE)
-                    gt_start = np.array(
-                        [item[2] for item in data]).astype(DATATYPE)
-                    gt_end = np.array(
-                        [item[3] for item in data]).astype(DATATYPE)
+                    video_feat = np.array([item[0]
+                                           for item in data]).astype(DATATYPE)
+                    gt_iou_map = np.array([item[1]
+                                           for item in data]).astype(DATATYPE)
+                    gt_start = np.array([item[2]
+                                         for item in data]).astype(DATATYPE)
+                    gt_end = np.array([item[3]
+                                       for item in data]).astype(DATATYPE)
 
                     x_data = to_variable(video_feat)
                     gt_iou_map = to_variable(gt_iou_map)
@@ -658,12 +672,14 @@ def train_bmn(self, args, place, to_static):
                     bmn.clear_gradients()
                     # log loss data to verify correctness
                     loss_data += [
-                        avg_loss.numpy()[0], tem_loss.numpy()[0],
-                        pem_reg_loss.numpy()[0], pem_cls_loss.numpy()[0]
+                        avg_loss.numpy()[0],
+                        tem_loss.numpy()[0],
+                        pem_reg_loss.numpy()[0],
+                        pem_cls_loss.numpy()[0]
                     ]
 
-                    if args.log_interval > 0 and (
-                            batch_id % args.log_interval == 0):
+                    if args.log_interval > 0 and (batch_id % args.log_interval
+                                                  == 0):
                         print('[TRAIN] Epoch {}, iter {} '.format(epoch, batch_id)
                                     + '\tLoss = {}, \ttem_loss = {}, \tpem_reg_loss = {}, \tpem_cls_loss = {}'.format(
                             '%f' % avg_loss.numpy()[0], '%f' % tem_loss.numpy()[0], \
@@ -748,12 +764,11 @@ def predict_static(self, data):
         paddle.enable_static()
         exe = fluid.Executor(self.place)
         # load inference model
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.model_save_dir,
-             executor=exe,
-             model_filename=self.model_filename,
-             params_filename=self.params_filename)
+        [inference_program, feed_target_names, fetch_targets
+         ] = fluid.io.load_inference_model(self.model_save_dir,
+                                           executor=exe,
+                                           model_filename=self.model_filename,
+                                           params_filename=self.params_filename)
         pred_res = exe.run(inference_program,
                            feed={feed_target_names[0]: data},
                            fetch_list=fetch_targets)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py
index f67dda3fbd79a..79b6880b0d871 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py
@@ -139,7 +139,9 @@ def test_for_in_else(x):
 
 
 def while_loop_class_var(x):
+
     class Foo(object):
+
         def __init__(self):
             self.a = 3
             self.b = 4
@@ -183,10 +185,11 @@ def test_optim_break_in_while(x):
 
 
 class TestContinueInFor(unittest.TestCase):
+
     def setUp(self):
         self.input = np.zeros((1)).astype('int64')
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.init_dygraph_func()
 
     def init_dygraph_func(self):
@@ -205,58 +208,67 @@ def run_static_mode(self):
     def test_transformed_static_result(self):
         static_res = self.run_static_mode()
         dygraph_res = self.run_dygraph_mode()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph res is {}\nstatic_res is {}'.format(dygraph_res,
-                                                             static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph res is {}\nstatic_res is {}'.format(
+                            dygraph_res, static_res))
 
 
 class TestContinueInForAtEnd(TestContinueInFor):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_continue_in_for_at_end
 
 
 class TestBreakInFor(TestContinueInFor):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_break_in_for
 
 
 class TestBreakInForAtEnd(TestContinueInFor):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_break_in_for_at_end
 
 
 class TestBreakContinueInFor(TestContinueInFor):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_break_continue_in_for
 
 
 class TestForInElse(TestContinueInFor):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_for_in_else
 
 
 class TestContinueInWhile(TestContinueInFor):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_continue_in_while
 
 
 class TestBreakInWhile(TestContinueInWhile):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_break_in_while
 
 
 class TestWhileLoopClassVar(TestContinueInWhile):
+
     def init_dygraph_func(self):
         self.dygraph_func = while_loop_class_var
 
 
 class TestOptimBreakInFor(TestContinueInWhile):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_optim_break_in_for
 
 
 class TestOptimBreakInWhile(TestContinueInWhile):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_optim_break_in_while
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py
index 95ea5ad227eeb..27272985d55ec 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class TestResnetWithPass(unittest.TestCase):
+
     def setUp(self):
         self.build_strategy = paddle.static.BuildStrategy()
         self.build_strategy.fuse_elewise_add_act_ops = True
@@ -44,24 +45,22 @@ def verify_predict(self):
         st_pre = self.resnet_helper.predict_static(image)
         dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image)
         predictor_pre = self.resnet_helper.predict_analysis_inference(image)
-        self.assertTrue(
-            np.allclose(dy_pre, st_pre),
-            msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
-        self.assertTrue(
-            np.allclose(dy_jit_pre, st_pre),
-            msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
-        self.assertTrue(
-            np.allclose(predictor_pre, st_pre),
-            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
-                                                              st_pre))
+        self.assertTrue(np.allclose(dy_pre, st_pre),
+                        msg="dy_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_pre, st_pre))
+        self.assertTrue(np.allclose(dy_jit_pre, st_pre),
+                        msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_jit_pre, st_pre))
+        self.assertTrue(np.allclose(predictor_pre, st_pre),
+                        msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(
+                            predictor_pre, st_pre))
 
     def test_resnet(self):
         static_loss = self.train(to_static=True)
         dygraph_loss = self.train(to_static=False)
-        self.assertTrue(
-            np.allclose(static_loss, dygraph_loss),
-            msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
-                                                             dygraph_loss))
+        self.assertTrue(np.allclose(static_loss, dygraph_loss),
+                        msg="static_loss: {} \n dygraph_loss: {}".format(
+                            static_loss, dygraph_loss))
         self.verify_predict()
 
     def test_in_static_mode_mkldnn(self):
@@ -74,7 +73,9 @@ def test_in_static_mode_mkldnn(self):
 
 
 class TestError(unittest.TestCase):
+
     def test_type_error(self):
+
         def foo(x):
             out = x + 1
             return out
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
index b72149a29c73f..3d2339f58f387 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
@@ -28,6 +28,7 @@
 
 
 class TestCacheProgram(unittest.TestCase):
+
     def setUp(self):
         self.batch_num = 5
         self.dygraph_class = Pool2D
@@ -55,12 +56,14 @@ def test_cache(self):
                         cur_out, (tuple, list)) else cur_out.numpy()
                     self.assertTrue(
                         np.allclose(prev_out_numpy, cur_out_numpy),
-                        msg='Output in previous batch is {}\n Output in current batch is \n{}'
+                        msg=
+                        'Output in previous batch is {}\n Output in current batch is \n{}'
                         .format(prev_out_numpy, cur_out_numpy))
                     self.assertEqual(prev_ops, cur_ops)
 
 
 class TestCacheProgram2(TestCacheProgram):
+
     def setUp(self):
         self.batch_num = 5
         self.dygraph_class = Linear
@@ -68,6 +71,7 @@ def setUp(self):
 
 
 class TestCacheProgramWithOptimizer(unittest.TestCase):
+
     def setUp(self):
         self.dygraph_class = Linear
         self.data = np.random.random((4, 10)).astype('float32')
@@ -102,10 +106,9 @@ def train(self, to_static=False):
     def test_with_optimizer(self):
         dygraph_loss = self.train_dygraph()
         static_loss = self.train_static()
-        self.assertTrue(
-            np.allclose(dygraph_loss, static_loss),
-            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
-                                                            static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                        msg='dygraph is {}\n static_res is \n{}'.format(
+                            dygraph_loss, static_loss))
 
 
 def simple_func(x):
@@ -115,6 +118,7 @@ def simple_func(x):
 
 
 class TestConvertWithCache(unittest.TestCase):
+
     def test_cache(self):
         static_func = convert_to_static(simple_func)
         # Get transformed function from cache.
@@ -145,6 +149,7 @@ def sum_under_while(limit):
 
 
 class TestToOutputWithCache(unittest.TestCase):
+
     def test_output(self):
         with fluid.dygraph.guard():
             ret = sum_even_until_limit(80, 10)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cast.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cast.py
index b4cc38b3a601b..da67b08287c3d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cast.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cast.py
@@ -61,9 +61,10 @@ def test_mix_cast(x):
 
 
 class TestCastBase(unittest.TestCase):
+
     def setUp(self):
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.prepare()
         self.set_func()
 
@@ -97,6 +98,7 @@ def test_cast_result(self):
 
 
 class TestIntCast(TestCastBase):
+
     def prepare(self):
         self.input_shape = (1, )
         self.input_dtype = 'float32'
@@ -110,6 +112,7 @@ def set_func(self):
 
 
 class TestFloatCast(TestCastBase):
+
     def prepare(self):
         self.input_shape = (8, 16)
         self.input_dtype = 'bool'
@@ -123,6 +126,7 @@ def set_func(self):
 
 
 class TestMixCast(TestCastBase):
+
     def prepare(self):
         self.input_shape = (8, 32)
         self.input_dtype = 'float32'
@@ -152,6 +156,7 @@ def test_cast_result(self):
 
 
 class TestNotVarCast(TestCastBase):
+
     def prepare(self):
         self.input = 3.14
         self.cast_dtype = 'int'
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py
index 74f4a895d1583..6ed32e49775d7 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class BufferLayers(paddle.nn.Layer):
+
     def __init__(self, out_channel):
         super(BufferLayers, self).__init__()
         self.out_channel = out_channel
@@ -37,6 +38,7 @@ def _mask(self):
 
 
 class SequentialNet(paddle.nn.Layer):
+
     def __init__(self, sub_layer, in_channel, out_channel):
         super(SequentialNet, self).__init__()
         self.layer = paddle.nn.Sequential(
@@ -50,14 +52,17 @@ def forward(self, x):
 
 
 class NestSequentialNet(paddle.nn.Layer):
+
     def __init__(self):
         super().__init__()
         group1 = paddle.nn.Sequential(
             paddle.nn.Linear(10, 10),
-            paddle.nn.Sigmoid(), )
+            paddle.nn.Sigmoid(),
+        )
         group2 = paddle.nn.Sequential(
             paddle.nn.Linear(10, 3),
-            paddle.nn.ReLU(), )
+            paddle.nn.ReLU(),
+        )
         self.layers = paddle.nn.Sequential(group1, group2)
 
     def forward(self, x):
@@ -65,6 +70,7 @@ def forward(self, x):
 
 
 class TestSequential(unittest.TestCase):
+
     def setUp(self):
         paddle.set_device('cpu')
         self.seed = 2021
@@ -90,9 +96,9 @@ def _run(self, to_static):
         out = self.net(x)
         if to_static:
             load_out = self._test_load(self.net, x)
-            self.assertTrue(
-                np.allclose(load_out, out),
-                msg='load_out is {}\st_out is {}'.format(load_out, out))
+            self.assertTrue(np.allclose(load_out, out),
+                            msg='load_out is {}\st_out is {}'.format(
+                                load_out, out))
 
         return out
 
@@ -100,9 +106,9 @@ def test_train(self):
         paddle.jit.set_code_level(100)
         dy_out = self._run(to_static=False)
         st_out = self._run(to_static=True)
-        self.assertTrue(
-            np.allclose(dy_out, st_out),
-            msg='dygraph_res is {}\nstatic_res is {}'.format(dy_out, st_out))
+        self.assertTrue(np.allclose(dy_out, st_out),
+                        msg='dygraph_res is {}\nstatic_res is {}'.format(
+                            dy_out, st_out))
 
     def _test_load(self, net, x):
         paddle.jit.save(net, self.model_path)
@@ -112,6 +118,7 @@ def _test_load(self, net, x):
 
 
 class TestNestSequential(TestSequential):
+
     def _init_config(self):
         self.net = NestSequentialNet()
         self.model_path = os.path.join(self.temp_dir.name,
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py
index 2e2918facf896..38746337ce3cd 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py
@@ -66,10 +66,11 @@ def dyfunc_with_third_library_logging(x_v):
 
 
 class TestRecursiveCall1(unittest.TestCase):
+
     def setUp(self):
         self.input = np.random.random([10, 16]).astype('float32')
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.init_test_func()
 
     def init_test_func(self):
@@ -90,26 +91,26 @@ def get_static_output(self):
     def test_transformed_static_result(self):
         static_res = self.get_static_output()
         dygraph_res = self.get_dygraph_output()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph res is {}\nstatic_res is {}'.format(dygraph_res,
-                                                             static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph res is {}\nstatic_res is {}'.format(
+                            dygraph_res, static_res))
 
 
 lambda_fun = lambda x: x
 
 
 class MyConvLayer(fluid.dygraph.Layer):
+
     def __init__(self):
         super(MyConvLayer, self).__init__()
         self._conv = fluid.dygraph.Conv2D(
             num_channels=3,
             num_filters=2,
             filter_size=3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.99)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.5)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.99)),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.5)))
 
     @paddle.jit.to_static
     def forward(self, inputs):
@@ -125,6 +126,7 @@ def dymethod(self, x_v):
 
 
 class MyLayer(fluid.dygraph.Layer):
+
     def __init__(self):
         super(MyLayer, self).__init__()
 
@@ -133,10 +135,10 @@ def __init__(self):
             input_dim=5,
             output_dim=1,
             act='relu',
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.99)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.5)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.99)),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.5)))
 
     @paddle.jit.to_static
     def forward(self, inputs):
@@ -146,10 +148,11 @@ def forward(self, inputs):
 
 
 class TestRecursiveCall2(unittest.TestCase):
+
     def setUp(self):
         self.input = np.random.random((1, 3, 3, 5)).astype('float32')
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.set_func()
 
     def set_func(self):
@@ -173,13 +176,13 @@ def get_static_output(self):
     def test_transformed_static_result(self):
         dygraph_res = self.get_dygraph_output()
         static_res = self.get_static_output()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_res,
-                                                            static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph is {}\n static_res is \n{}'.format(
+                            dygraph_res, static_res))
 
 
 class TestThirdPartyLibrary(TestRecursiveCall2):
+
     def set_func(self):
         self.dygraph_func = dyfunc_with_third_library_logging
 
@@ -205,6 +208,7 @@ def func_convert_then_not_to_static(x):
 
 
 class TestClass(paddle.nn.Layer):
+
     @paddle.jit.not_to_static
     def called_member(self, x):
         return paddle.sum(x)
@@ -216,6 +220,7 @@ def forward(self, x):
 
 
 class TestNotToConvert(TestRecursiveCall2):
+
     def set_func(self):
         self.dygraph_func = func_not_to_static
 
@@ -226,16 +231,19 @@ def test_conversion_options(self):
 
 
 class TestNotToConvert2(TestRecursiveCall2):
+
     def set_func(self):
         self.dygraph_func = func_convert_then_not_to_static
 
 
 class TestNotToConvert3(TestRecursiveCall2):
+
     def set_func(self):
         self.dygraph_func = TestClass()
 
 
 class TestDynamicToStaticCode(unittest.TestCase):
+
     def setUp(self):
         self.set_func()
         self.set_answer_func()
@@ -244,7 +252,9 @@ def set_func(self):
         self.func = func_not_to_static
 
     def set_answer_func(self):
+
         class StaticCode():
+
             @paddle.jit.not_to_static
             def func_not_to_static(x):
                 res = func_sum(x)
@@ -270,11 +280,14 @@ def test_code(self):
 
 
 class TestDynamicToStaticCode2(TestDynamicToStaticCode):
+
     def set_func(self):
         self.func = func_convert_then_not_to_static
 
     def set_answer_func(self):
+
         class StaticCode():
+
             def func_convert_then_not_to_static(x):
                 y = _jst.convert_call(func_not_to_static)(x)
                 return y
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py
index cfe9e191ed486..19645d6fd63df 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py
@@ -40,6 +40,7 @@ def main_func():
 
 
 class TestConvertGenerator(unittest.TestCase):
+
     def test_raise_error(self):
         with self.assertRaises(Exception):
             to_static(main_func)()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py
index bb1942692fd9d..6188d6a786b2e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py
@@ -19,12 +19,14 @@
 
 
 class CallNotExist(paddle.nn.Layer):
+
     def __call__(self):
         # call a non-exist API to trigger exception
         return paddle.nn.not_exist_api
 
 
 class ForwardNotExist(paddle.nn.Layer):
+
     def forward(self):
         return 0
 
@@ -34,7 +36,9 @@ def forward(self):
 
 
 class TestConvertCall(unittest.TestCase):
+
     def test_class_exception(self):
+
         @paddle.jit.to_static
         def call_not_exist():
             net = CallNotExist()
@@ -52,9 +56,10 @@ def forward_not_exist():
 
 
 class TestConvertShapeCompare(unittest.TestCase):
+
     def test_non_variable(self):
-        self.assertEqual(
-            paddle.jit.dy2static.convert_shape_compare(1, "<", 2), True)
+        self.assertEqual(paddle.jit.dy2static.convert_shape_compare(1, "<", 2),
+                         True)
         self.assertEqual(
             paddle.jit.dy2static.convert_shape_compare(1, "<", 2, "<=", 3),
             True)
@@ -69,8 +74,9 @@ def error_func():
             raise ValueError("Used for test")
 
         self.assertEqual(
-            paddle.jit.dy2static.convert_shape_compare(
-                1, ">", 2, "<=", lambda: error_func()), False)
+            paddle.jit.dy2static.convert_shape_compare(1, ">", 2, "<=",
+                                                       lambda: error_func()),
+            False)
 
         self.assertEqual(
             paddle.jit.dy2static.convert_shape_compare(1, "<", 2, "in",
@@ -99,30 +105,30 @@ def test_variable(self):
             x = paddle.static.data(name='x', shape=[3, 2], dtype='float32')
             y = paddle.static.data(name='y', shape=[3, 2], dtype='float32')
             self.assertEqual(
-                paddle.jit.dy2static.convert_shape_compare(x, "is", x, "is not",
-                                                           y), True)
+                paddle.jit.dy2static.convert_shape_compare(
+                    x, "is", x, "is not", y), True)
             self.assertEqual(
-                paddle.jit.dy2static.convert_shape_compare(x, "is not", x,
-                                                           "is not", y), False)
+                paddle.jit.dy2static.convert_shape_compare(
+                    x, "is not", x, "is not", y), False)
             self.assertEqual(
                 paddle.jit.dy2static.convert_shape_compare(x, "is", x, "is", y),
                 False)
 
             eq_out = paddle.jit.dy2static.convert_shape_compare(x, "==", y)
             not_eq_out = paddle.jit.dy2static.convert_shape_compare(x, "!=", y)
-            long_eq_out = paddle.jit.dy2static.convert_shape_compare(x, "==", x,
-                                                                     "!=", y)
+            long_eq_out = paddle.jit.dy2static.convert_shape_compare(
+                x, "==", x, "!=", y)
 
-            place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             x_y_eq_out = exe.run(feed={
                 "x": np.ones([3, 2]).astype(np.float32),
                 "y": np.ones([3, 2]).astype(np.float32)
             },
                                  fetch_list=[eq_out, not_eq_out, long_eq_out])
-            np.testing.assert_array_equal(
-                np.array(x_y_eq_out), np.array([[True], [False], [False]]))
+            np.testing.assert_array_equal(np.array(x_y_eq_out),
+                                          np.array([[True], [False], [False]]))
 
             set_a_zero = np.ones([3, 2]).astype(np.float32)
             set_a_zero[0][0] = 0.0
@@ -132,16 +138,16 @@ def test_variable(self):
                     "y": set_a_zero
                 },
                 fetch_list=[eq_out, not_eq_out, long_eq_out])
-            np.testing.assert_array_equal(
-                np.array(x_y_not_eq_out), np.array([[False], [True], [True]]))
+            np.testing.assert_array_equal(np.array(x_y_not_eq_out),
+                                          np.array([[False], [True], [True]]))
         paddle.disable_static()
 
 
 class TestChooseShapeAttrOrApi(unittest.TestCase):
+
     def test_api_shape_is_none(self):
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api([1, 2], None),
-            [1, 2])
+            paddle.jit.dy2static.choose_shape_attr_or_api([1, 2], None), [1, 2])
         self.assertEqual(
             paddle.jit.dy2static.choose_shape_attr_or_api([1], None), [1])
         self.assertEqual(
@@ -151,38 +157,37 @@ def test_api_shape_is_none(self):
     def test_attr_shape_is_int(self):
         x = paddle.zeros([1, 3, 5, 7])
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api(x.shape[0],
-                                                          paddle.shape(x)[0]),
-            1)
+            paddle.jit.dy2static.choose_shape_attr_or_api(
+                x.shape[0],
+                paddle.shape(x)[0]), 1)
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api(x.shape[1],
-                                                          paddle.shape(x)[1]),
-            3)
+            paddle.jit.dy2static.choose_shape_attr_or_api(
+                x.shape[1],
+                paddle.shape(x)[1]), 3)
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api(-1,
-                                                          paddle.shape(x)[0]),
+            paddle.jit.dy2static.choose_shape_attr_or_api(
+                -1,
+                paddle.shape(x)[0]),
             paddle.shape(x)[0])
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api(-1,
-                                                          paddle.shape(x), 0),
+            paddle.jit.dy2static.choose_shape_attr_or_api(
+                -1, paddle.shape(x), 0),
             paddle.shape(x)[0])
 
     def test_positive_attr_shape(self):
         x = paddle.zeros([1, 3, 5, 7])
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api(x.shape,
-                                                          paddle.shape(x)),
-            x.shape)
+            paddle.jit.dy2static.choose_shape_attr_or_api(
+                x.shape, paddle.shape(x)), x.shape)
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api(x.shape,
-                                                          paddle.shape(x), 3),
-            x.shape[3])
+            paddle.jit.dy2static.choose_shape_attr_or_api(
+                x.shape, paddle.shape(x), 3), x.shape[3])
 
     def test_negative_attr_shape(self):
         x = paddle.zeros([7])
         self.assertEqual(
-            paddle.jit.dy2static.choose_shape_attr_or_api([-1],
-                                                          paddle.shape(x), 0),
+            paddle.jit.dy2static.choose_shape_attr_or_api([-1], paddle.shape(x),
+                                                          0),
             paddle.shape(x)[0])
         self.assertEqual(
             paddle.jit.dy2static.choose_shape_attr_or_api([-1],
@@ -191,6 +196,7 @@ def test_negative_attr_shape(self):
 
 
 class TestEvaIfExistElseNone(unittest.TestCase):
+
     def test_globals(self):
         global x_shape
         x_shape = [1, 2, 3]
@@ -205,10 +211,10 @@ def test_enclosing_scope(self):
 
         def foo():
             y_shape = [2, 3, 4]
-            self.assertEqual(
-                eval_if_exist_else_none('x_shape', globals()), [1, 2, 3])
-            self.assertEqual(
-                eval_if_exist_else_none('y_shape', locals()), [2, 3, 4])
+            self.assertEqual(eval_if_exist_else_none('x_shape', globals()),
+                             [1, 2, 3])
+            self.assertEqual(eval_if_exist_else_none('y_shape', locals()),
+                             [2, 3, 4])
 
         foo()
         del x_shape
@@ -220,17 +226,18 @@ def foo():
             global y_shape
             y_shape = [2, 3, 4]
 
-            self.assertEqual(
-                eval_if_exist_else_none('y_shape', globals()), [2, 3, 4])
+            self.assertEqual(eval_if_exist_else_none('y_shape', globals()),
+                             [2, 3, 4])
             self.assertEqual(eval_if_exist_else_none('x_shape', locals()), None)
-            self.assertEqual(
-                eval_if_exist_else_none('x_shape', globals()), None)
+            self.assertEqual(eval_if_exist_else_none('x_shape', globals()),
+                             None)
 
             del y_shape
 
         foo()
 
     def test_none(self):
+
         def foo():
             x_shape = [2, 3, 4]
             return x_shape
@@ -239,6 +246,7 @@ def foo():
 
 
 class ShapeLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(ShapeLayer, self).__init__()
 
@@ -253,6 +261,7 @@ def forward(self, x):
 
 
 class TestChooseShapeAttrOrApiWithLayer(unittest.TestCase):
+
     def test_tensor_shape(self):
         x = paddle.zeros(shape=[4, 1], dtype='float32')
         net = ShapeLayer()
@@ -262,6 +271,7 @@ def test_tensor_shape(self):
 
 
 class TestIfElseNoValue(unittest.TestCase):
+
     def test_else_ret_none(self):
         input_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py
index 8a9a1e19205fb..2fe985490ba0b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py
@@ -64,6 +64,7 @@
 
 
 class Cycle_Gan(fluid.dygraph.Layer):
+
     def __init__(self, input_channel, istrain=True):
         super(Cycle_Gan, self).__init__()
 
@@ -88,11 +89,9 @@ def forward(self, input_A, input_B):
         cyc_B = self.build_generator_resnet_9blocks_a(fake_A)
 
         diff_A = fluid.layers.abs(
-            fluid.layers.elementwise_sub(
-                x=input_A, y=cyc_A))
+            fluid.layers.elementwise_sub(x=input_A, y=cyc_A))
         diff_B = fluid.layers.abs(
-            fluid.layers.elementwise_sub(
-                x=input_B, y=cyc_B))
+            fluid.layers.elementwise_sub(x=input_B, y=cyc_B))
         cyc_A_loss = fluid.layers.reduce_mean(diff_A) * lambda_A
         cyc_B_loss = fluid.layers.reduce_mean(diff_B) * lambda_B
         cyc_loss = cyc_A_loss + cyc_B_loss
@@ -138,24 +137,23 @@ def discriminatorB(self, input_A, input_B):
 
 
 class build_resnet_block(fluid.dygraph.Layer):
+
     def __init__(self, dim, use_bias=False):
         super(build_resnet_block, self).__init__()
 
-        self.conv0 = conv2d(
-            num_channels=dim,
-            num_filters=dim,
-            filter_size=3,
-            stride=1,
-            stddev=0.02,
-            use_bias=False)
-        self.conv1 = conv2d(
-            num_channels=dim,
-            num_filters=dim,
-            filter_size=3,
-            stride=1,
-            stddev=0.02,
-            relu=False,
-            use_bias=False)
+        self.conv0 = conv2d(num_channels=dim,
+                            num_filters=dim,
+                            filter_size=3,
+                            stride=1,
+                            stddev=0.02,
+                            use_bias=False)
+        self.conv1 = conv2d(num_channels=dim,
+                            num_filters=dim,
+                            filter_size=3,
+                            stride=1,
+                            stddev=0.02,
+                            relu=False,
+                            use_bias=False)
         self.dim = dim
 
     def forward(self, inputs):
@@ -168,30 +166,28 @@ def forward(self, inputs):
 
 
 class build_generator_resnet_9blocks(fluid.dygraph.Layer):
+
     def __init__(self, input_channel):
         super(build_generator_resnet_9blocks, self).__init__()
 
-        self.conv0 = conv2d(
-            num_channels=input_channel,
-            num_filters=32,
-            filter_size=7,
-            stride=1,
-            padding=0,
-            stddev=0.02)
-        self.conv1 = conv2d(
-            num_channels=32,
-            num_filters=64,
-            filter_size=3,
-            stride=2,
-            padding=1,
-            stddev=0.02)
-        self.conv2 = conv2d(
-            num_channels=64,
-            num_filters=128,
-            filter_size=3,
-            stride=2,
-            padding=1,
-            stddev=0.02)
+        self.conv0 = conv2d(num_channels=input_channel,
+                            num_filters=32,
+                            filter_size=7,
+                            stride=1,
+                            padding=0,
+                            stddev=0.02)
+        self.conv1 = conv2d(num_channels=32,
+                            num_filters=64,
+                            filter_size=3,
+                            stride=2,
+                            padding=1,
+                            stddev=0.02)
+        self.conv2 = conv2d(num_channels=64,
+                            num_filters=128,
+                            filter_size=3,
+                            stride=2,
+                            padding=1,
+                            stddev=0.02)
         self.build_resnet_block_list = []
         dim = 128
         for i in range(9):
@@ -205,25 +201,24 @@ def __init__(self, input_channel):
             stride=2,
             stddev=0.02,
             padding=[1, 1],
-            outpadding=[0, 1, 0, 1], )
-        self.deconv1 = DeConv2D(
-            num_channels=32 * 2,
-            num_filters=32,
-            filter_size=3,
-            stride=2,
-            stddev=0.02,
-            padding=[1, 1],
-            outpadding=[0, 1, 0, 1])
-        self.conv3 = conv2d(
-            num_channels=32,
-            num_filters=input_channel,
-            filter_size=7,
-            stride=1,
-            stddev=0.02,
-            padding=0,
-            relu=False,
-            norm=False,
-            use_bias=True)
+            outpadding=[0, 1, 0, 1],
+        )
+        self.deconv1 = DeConv2D(num_channels=32 * 2,
+                                num_filters=32,
+                                filter_size=3,
+                                stride=2,
+                                stddev=0.02,
+                                padding=[1, 1],
+                                outpadding=[0, 1, 0, 1])
+        self.conv3 = conv2d(num_channels=32,
+                            num_filters=input_channel,
+                            filter_size=7,
+                            stride=1,
+                            stddev=0.02,
+                            padding=0,
+                            relu=False,
+                            norm=False,
+                            use_bias=True)
 
     def forward(self, inputs):
         pad_input = fluid.layers.pad2d(inputs, [3, 3, 3, 3], mode="reflect")
@@ -241,53 +236,49 @@ def forward(self, inputs):
 
 
 class build_gen_discriminator(fluid.dygraph.Layer):
+
     def __init__(self, input_channel):
         super(build_gen_discriminator, self).__init__()
 
-        self.conv0 = conv2d(
-            num_channels=input_channel,
-            num_filters=64,
-            filter_size=4,
-            stride=2,
-            stddev=0.02,
-            padding=1,
-            norm=False,
-            use_bias=True,
-            relufactor=0.2)
-        self.conv1 = conv2d(
-            num_channels=64,
-            num_filters=128,
-            filter_size=4,
-            stride=2,
-            stddev=0.02,
-            padding=1,
-            relufactor=0.2)
-        self.conv2 = conv2d(
-            num_channels=128,
-            num_filters=IMAGE_SIZE,
-            filter_size=4,
-            stride=2,
-            stddev=0.02,
-            padding=1,
-            relufactor=0.2)
-        self.conv3 = conv2d(
-            num_channels=IMAGE_SIZE,
-            num_filters=512,
-            filter_size=4,
-            stride=1,
-            stddev=0.02,
-            padding=1,
-            relufactor=0.2)
-        self.conv4 = conv2d(
-            num_channels=512,
-            num_filters=1,
-            filter_size=4,
-            stride=1,
-            stddev=0.02,
-            padding=1,
-            norm=False,
-            relu=False,
-            use_bias=True)
+        self.conv0 = conv2d(num_channels=input_channel,
+                            num_filters=64,
+                            filter_size=4,
+                            stride=2,
+                            stddev=0.02,
+                            padding=1,
+                            norm=False,
+                            use_bias=True,
+                            relufactor=0.2)
+        self.conv1 = conv2d(num_channels=64,
+                            num_filters=128,
+                            filter_size=4,
+                            stride=2,
+                            stddev=0.02,
+                            padding=1,
+                            relufactor=0.2)
+        self.conv2 = conv2d(num_channels=128,
+                            num_filters=IMAGE_SIZE,
+                            filter_size=4,
+                            stride=2,
+                            stddev=0.02,
+                            padding=1,
+                            relufactor=0.2)
+        self.conv3 = conv2d(num_channels=IMAGE_SIZE,
+                            num_filters=512,
+                            filter_size=4,
+                            stride=1,
+                            stddev=0.02,
+                            padding=1,
+                            relufactor=0.2)
+        self.conv4 = conv2d(num_channels=512,
+                            num_filters=1,
+                            filter_size=4,
+                            stride=1,
+                            stddev=0.02,
+                            padding=1,
+                            norm=False,
+                            relu=False,
+                            use_bias=True)
 
     def forward(self, inputs):
         y = self.conv0(inputs)
@@ -320,18 +311,17 @@ def __init__(self,
             con_bias_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(0.0))
 
-        self.conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            use_cudnn=use_cudnn,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NormalInitializer(
-                    loc=0.0, scale=stddev)),
-            bias_attr=con_bias_attr)
-        # Note(Aurelius84): The calculation of GPU kernel in BN is non-deterministic, 
+        self.conv = Conv2D(num_channels=num_channels,
+                           num_filters=num_filters,
+                           filter_size=filter_size,
+                           stride=stride,
+                           padding=padding,
+                           use_cudnn=use_cudnn,
+                           param_attr=fluid.ParamAttr(
+                               initializer=fluid.initializer.NormalInitializer(
+                                   loc=0.0, scale=stddev)),
+                           bias_attr=con_bias_attr)
+        # Note(Aurelius84): The calculation of GPU kernel in BN is non-deterministic,
         # failure rate is 1/100 in Dev but seems incremental in CE platform.
         # If on GPU, we disable BN temporarily.
         if fluid.is_compiled_with_cuda():
@@ -361,6 +351,7 @@ def forward(self, inputs):
 
 
 class DeConv2D(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters=64,
@@ -389,8 +380,8 @@ def __init__(self,
             padding=padding,
             use_cudnn=use_cudnn,
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NormalInitializer(
-                    loc=0.0, scale=stddev)),
+                initializer=fluid.initializer.NormalInitializer(loc=0.0,
+                                                                scale=stddev)),
             bias_attr=de_bias_attr)
         if fluid.is_compiled_with_cuda():
             norm = False
@@ -412,8 +403,10 @@ def __init__(self,
 
     def forward(self, inputs):
         conv = self._deconv(inputs)
-        conv = fluid.layers.pad2d(
-            conv, paddings=self.outpadding, mode='constant', pad_value=0.0)
+        conv = fluid.layers.pad2d(conv,
+                                  paddings=self.outpadding,
+                                  mode='constant',
+                                  pad_value=0.0)
 
         if self.norm:
             conv = self.bn(conv)
@@ -423,6 +416,7 @@ def forward(self, inputs):
 
 
 class ImagePool(object):
+
     def __init__(self, pool_size=50):
         self.pool = []
         self.count = 0
@@ -445,6 +439,7 @@ def pool_image(self, image):
 
 
 def reader_creater():
+
     def reader():
         while True:
             fake_image = np.uint8(
@@ -482,15 +477,14 @@ class Args(object):
 
 def optimizer_setting(parameters):
     lr = 0.0002
-    optimizer = fluid.optimizer.Adam(
-        learning_rate=fluid.layers.piecewise_decay(
-            boundaries=[
-                100 * step_per_epoch, 120 * step_per_epoch,
-                140 * step_per_epoch, 160 * step_per_epoch, 180 * step_per_epoch
-            ],
-            values=[lr, lr * 0.8, lr * 0.6, lr * 0.4, lr * 0.2, lr * 0.1]),
-        parameter_list=parameters,
-        beta1=0.5)
+    optimizer = fluid.optimizer.Adam(learning_rate=fluid.layers.piecewise_decay(
+        boundaries=[
+            100 * step_per_epoch, 120 * step_per_epoch, 140 * step_per_epoch,
+            160 * step_per_epoch, 180 * step_per_epoch
+        ],
+        values=[lr, lr * 0.8, lr * 0.6, lr * 0.4, lr * 0.2, lr * 0.1]),
+                                     parameter_list=parameters,
+                                     beta1=0.5)
     return optimizer
 
 
@@ -534,11 +528,11 @@ def train(args, to_static):
 
                 s_time = time.time()
                 data_A = np.array(
-                    [data_A[0].reshape(3, IMAGE_SIZE, IMAGE_SIZE)]).astype(
-                        "float32")
+                    [data_A[0].reshape(3, IMAGE_SIZE,
+                                       IMAGE_SIZE)]).astype("float32")
                 data_B = np.array(
-                    [data_B[0].reshape(3, IMAGE_SIZE, IMAGE_SIZE)]).astype(
-                        "float32")
+                    [data_B[0].reshape(3, IMAGE_SIZE,
+                                       IMAGE_SIZE)]).astype("float32")
                 data_A = to_variable(data_A)
                 data_B = to_variable(data_B)
 
@@ -552,19 +546,19 @@ def train(args, to_static):
 
                 fake_pool_B = B_pool.pool_image(fake_B).numpy()
                 fake_pool_B = np.array(
-                    [fake_pool_B[0].reshape(3, IMAGE_SIZE, IMAGE_SIZE)]).astype(
-                        "float32")
+                    [fake_pool_B[0].reshape(3, IMAGE_SIZE,
+                                            IMAGE_SIZE)]).astype("float32")
                 fake_pool_B = to_variable(fake_pool_B)
 
                 fake_pool_A = A_pool.pool_image(fake_A).numpy()
                 fake_pool_A = np.array(
-                    [fake_pool_A[0].reshape(3, IMAGE_SIZE, IMAGE_SIZE)]).astype(
-                        "float32")
+                    [fake_pool_A[0].reshape(3, IMAGE_SIZE,
+                                            IMAGE_SIZE)]).astype("float32")
                 fake_pool_A = to_variable(fake_pool_A)
 
                 # optimize the d_A network
-                rec_B, fake_pool_rec_B = cycle_gan.discriminatorA(data_B,
-                                                                  fake_pool_B)
+                rec_B, fake_pool_rec_B = cycle_gan.discriminatorA(
+                    data_B, fake_pool_B)
                 d_loss_A = (fluid.layers.square(fake_pool_rec_B) +
                             fluid.layers.square(rec_B - 1)) / 2.0
                 d_loss_A = fluid.layers.reduce_mean(d_loss_A)
@@ -574,8 +568,8 @@ def train(args, to_static):
                 cycle_gan.clear_gradients()
 
                 # optimize the d_B network
-                rec_A, fake_pool_rec_A = cycle_gan.discriminatorB(data_A,
-                                                                  fake_pool_A)
+                rec_A, fake_pool_rec_A = cycle_gan.discriminatorB(
+                    data_A, fake_pool_A)
                 d_loss_B = (fluid.layers.square(fake_pool_rec_A) +
                             fluid.layers.square(rec_A - 1)) / 2.0
                 d_loss_B = fluid.layers.reduce_mean(d_loss_B)
@@ -596,8 +590,8 @@ def train(args, to_static):
                 t_time += batch_time
                 if batch_id % args.log_step == 0:
                     print(
-                        "batch: {}\t Batch_time_cost: {}\n g_loss: {}\t d_A_loss: {}\t d_B_loss:{}\n g_A_loss: {}\t g_A_cyc_loss: {}\t g_A_idt_loss: {}\n g_B_loss: {}\t g_B_cyc_loss: {}\t g_B_idt_loss: {}".
-                        format(batch_id, batch_time, *cur_batch_loss))
+                        "batch: {}\t Batch_time_cost: {}\n g_loss: {}\t d_A_loss: {}\t d_B_loss:{}\n g_A_loss: {}\t g_A_cyc_loss: {}\t g_A_idt_loss: {}\n g_B_loss: {}\t g_B_cyc_loss: {}\t g_B_idt_loss: {}"
+                        .format(batch_id, batch_time, *cur_batch_loss))
 
                 if batch_id > args.train_step:
                     break
@@ -607,6 +601,7 @@ def train(args, to_static):
 
 
 class TestCycleGANModel(unittest.TestCase):
+
     def setUp(self):
         self.args = Args()
 
@@ -619,15 +614,14 @@ def test_train(self):
         dy_out = self.train(to_static=False)
 
         assert_func = np.allclose
-        # Note(Aurelius84): Because we disable BN on GPU, 
+        # Note(Aurelius84): Because we disable BN on GPU,
         # but here we enhance the check on CPU by `np.array_equal`
         # which means the dy_out and st_out shall be exactly same.
         if not fluid.is_compiled_with_cuda():
             assert_func = np.array_equal
 
-        self.assertTrue(
-            assert_func(dy_out, st_out),
-            msg="dy_out:\n {}\n st_out:\n{}".format(dy_out, st_out))
+        self.assertTrue(assert_func(dy_out, st_out),
+                        msg="dy_out:\n {}\n st_out:\n{}".format(dy_out, st_out))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
index 35dfe550552a9..ef9eff2651853 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -28,6 +28,7 @@
 
 
 class SimpleNet(Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.linear = fluid.dygraph.Linear(10, 3)
@@ -83,6 +84,7 @@ def func_with_list_dict(self, dl):
 
 
 class TestStaticFunctionInstance(unittest.TestCase):
+
     def test_instance_same_class(self):
         with fluid.dygraph.guard(fluid.CPUPlace()):
             net_1 = SimpleNet()
@@ -100,6 +102,7 @@ def test_instance_same_class(self):
 
 
 class TestInputSpec(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         self.model_path = os.path.join(self.temp_dir.name, 'simple_net')
@@ -157,12 +160,12 @@ def test_with_error(self):
 
             # 2. requires len(input_spec) <= len(args)
             with self.assertRaises(ValueError):
-                net.add_func = declarative(
-                    net.add_func,
-                    input_spec=[
-                        InputSpec([-1, 10]), InputSpec([-1, 10]),
-                        InputSpec([10])
-                    ])
+                net.add_func = declarative(net.add_func,
+                                           input_spec=[
+                                               InputSpec([-1, 10]),
+                                               InputSpec([-1, 10]),
+                                               InputSpec([10])
+                                           ])
                 net.add_func(x, y)
 
     def test_concrete_program(self):
@@ -175,10 +178,8 @@ def test_concrete_program(self):
             # We can get concrete_program by specificing InputSpec information. Faking input is no need.
             net.add_func = declarative(
                 net.add_func,
-                input_spec=[
-                    InputSpec([-1, 10]), InputSpec(
-                        [-1, 10], name='y')
-                ])
+                input_spec=[InputSpec([-1, 10]),
+                            InputSpec([-1, 10], name='y')])
             cp1 = net.add_func.concrete_program
             self.assertTrue(cp1.inputs[-1].shape == (-1, 10))
             self.assertTrue(cp1.inputs[-1].name == 'y')
@@ -186,8 +187,8 @@ def test_concrete_program(self):
             # generate another program
             net.add_func = declarative(
                 net.add_func,
-                input_spec=[InputSpec([10]), InputSpec(
-                    [10], name='label')])
+                input_spec=[InputSpec([10]),
+                            InputSpec([10], name='label')])
             cp2 = net.add_func.concrete_program
             self.assertTrue(cp2.inputs[-1].shape == (10, ))
             self.assertTrue(cp2.inputs[-1].name == 'label')
@@ -203,6 +204,7 @@ def foo_func(a, b, c=1, d=2):
 
 
 class TestDifferentInputSpecCacheProgram(unittest.TestCase):
+
     def setUp(self):
         program_trans.enable(True)
 
@@ -248,25 +250,26 @@ def test_get_concrete_program(self):
         foo = declarative(foo_func)
 
         # 1. specific InputSpec for `x`/`y`
-        concrete_program_1 = foo.get_concrete_program(
-            InputSpec([None, 10]), InputSpec([10]))
+        concrete_program_1 = foo.get_concrete_program(InputSpec([None, 10]),
+                                                      InputSpec([10]))
         self.assertTrue(len(foo.program_cache) == 1)
 
         # 2. specific `c`/`d` explicitly with same default value
-        concrete_program_2 = foo.get_concrete_program(
-            InputSpec([None, 10]), InputSpec([10]), 1, 2)
+        concrete_program_2 = foo.get_concrete_program(InputSpec([None, 10]),
+                                                      InputSpec([10]), 1, 2)
         self.assertTrue(concrete_program_2 == concrete_program_1)
         self.assertTrue(len(foo.program_cache) == 1)
 
         # 3. specific `c` = 2
-        concrete_program_3 = foo.get_concrete_program(
-            InputSpec([None, 10]), InputSpec([10]), c=2)
+        concrete_program_3 = foo.get_concrete_program(InputSpec([None, 10]),
+                                                      InputSpec([10]),
+                                                      c=2)
         self.assertTrue(concrete_program_3 != concrete_program_1)
         self.assertTrue(len(foo.program_cache) == 2)
 
         # 4. specific x.shape = [10]
-        concrete_program_4 = foo.get_concrete_program(
-            InputSpec([10]), InputSpec([10]))
+        concrete_program_4 = foo.get_concrete_program(InputSpec([10]),
+                                                      InputSpec([10]))
         self.assertTrue(concrete_program_4 != concrete_program_1)
         self.assertTrue(len(foo.program_cache) == 3)
 
@@ -276,20 +279,19 @@ def test_get_concrete_program(self):
 
         # 6. specific unknown kwargs `e`=4
         with self.assertRaises(TypeError):
-            concrete_program_5 = foo.get_concrete_program(
-                InputSpec([10]), InputSpec([10]), e=4)
+            concrete_program_5 = foo.get_concrete_program(InputSpec([10]),
+                                                          InputSpec([10]),
+                                                          e=4)
 
     def test_concrete_program(self):
         with fluid.dygraph.guard(fluid.CPUPlace()):
 
             # usage 1
-            foo_1 = paddle.jit.to_static(
-                foo_func,
-                input_spec=[
-                    InputSpec(
-                        [10], name='x'), InputSpec(
-                            [10], name='y')
-                ])
+            foo_1 = paddle.jit.to_static(foo_func,
+                                         input_spec=[
+                                             InputSpec([10], name='x'),
+                                             InputSpec([10], name='y')
+                                         ])
             self.assertTrue(isinstance(foo_1.concrete_program, ConcreteProgram))
 
             # usage 2
@@ -304,6 +306,7 @@ def test_concrete_program(self):
 
 
 class TestInputDefaultName(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         self.net = SimpleNet()
@@ -328,6 +331,7 @@ def test_nest_input(self):
 
 
 class TestDeclarativeAPI(unittest.TestCase):
+
     def test_error(self):
         func = declarative(dyfunc_to_variable)
 
@@ -346,6 +350,7 @@ def test_error(self):
 
 
 class TestDecorateModelDirectly(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         program_trans.enable(True)
@@ -372,6 +377,7 @@ def test_input_spec(self):
 
 
 class TestErrorWithInitFromStaticMode(unittest.TestCase):
+
     def test_raise_error(self):
         # disable imperative
         paddle.enable_static()
@@ -391,6 +397,7 @@ def test_raise_error(self):
 
 
 class CallNonForwardFuncNet(paddle.nn.Layer):
+
     def __init__(self):
         super(CallNonForwardFuncNet, self).__init__()
         self.sub = CallNonForwardFuncSubNet()
@@ -401,6 +408,7 @@ def forward(self):
 
 
 class CallNonForwardFuncSubNet(paddle.nn.Layer):
+
     def __init__(self):
         super(CallNonForwardFuncSubNet, self).__init__()
         self.a = paddle.to_tensor([1, 2])
@@ -411,6 +419,7 @@ def func(self):
 
 
 class TestCallNonForwardFunc(unittest.TestCase):
+
     def test_call_non_forward(self):
         paddle.disable_static()
         net = CallNonForwardFuncNet()
@@ -420,6 +429,7 @@ def test_call_non_forward(self):
 
 
 class SetBuffersNet1(paddle.nn.Layer):
+
     def __init__(self):
         super(SetBuffersNet1, self).__init__()
         self.a = paddle.to_tensor([1])
@@ -431,6 +441,7 @@ def forward(self):
 
 
 class SetBuffersNet2(paddle.nn.Layer):
+
     def __init__(self):
         super(SetBuffersNet2, self).__init__()
         self.b = paddle.to_tensor([2])
@@ -443,6 +454,7 @@ def forward(self):
 
 
 class TestSetBuffers(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         self.model_path = os.path.join(self.temp_dir.name, 'SetBuffersNet1')
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
index a7be885576496..e8999acce0e12 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py
@@ -23,31 +23,30 @@
 from paddle.jit import to_static
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
 
-PLACE = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
-)
+PLACE = fluid.CUDAPlace(
+    0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
 
 class SubNetWithDict(fluid.dygraph.Layer):
+
     def __init__(self, hidden_size=16, output_size=16):
         super(SubNetWithDict, self).__init__()
 
-        init_weight = lambda x: fluid.ParamAttr(initializer=fluid.initializer.Constant(x))
-
-        self.q_fc = fluid.dygraph.Linear(
-            input_dim=hidden_size,
-            output_dim=output_size,
-            bias_attr=False,
-            param_attr=init_weight(0.6))
-        self.k_fc = fluid.dygraph.Linear(
-            input_dim=hidden_size,
-            output_dim=output_size,
-            bias_attr=False,
-            param_attr=init_weight(0.5))
-        self.v_fc = fluid.dygraph.Linear(
-            input_dim=hidden_size,
-            output_dim=output_size,
-            bias_attr=False,
-            param_attr=init_weight(0.2))
+        init_weight = lambda x: fluid.ParamAttr(initializer=fluid.initializer.
+                                                Constant(x))
+
+        self.q_fc = fluid.dygraph.Linear(input_dim=hidden_size,
+                                         output_dim=output_size,
+                                         bias_attr=False,
+                                         param_attr=init_weight(0.6))
+        self.k_fc = fluid.dygraph.Linear(input_dim=hidden_size,
+                                         output_dim=output_size,
+                                         bias_attr=False,
+                                         param_attr=init_weight(0.5))
+        self.v_fc = fluid.dygraph.Linear(input_dim=hidden_size,
+                                         output_dim=output_size,
+                                         bias_attr=False,
+                                         param_attr=init_weight(0.2))
 
     def forward(self, input, cache=None):
         input = fluid.dygraph.to_variable(input)
@@ -70,6 +69,7 @@ def forward(self, input, cache=None):
 
 
 class MainNetWithDict(fluid.dygraph.Layer):
+
     def __init__(self, batch_size=64, hidden_size=16, output_size=16):
         super(MainNetWithDict, self).__init__()
         self.batch_size = batch_size
@@ -81,11 +81,13 @@ def __init__(self, batch_size=64, hidden_size=16, output_size=16):
     def forward(self, input, max_len=4):
         input = fluid.dygraph.to_variable(input)
         cache = {
-            "k": fluid.layers.fill_constant(
+            "k":
+            fluid.layers.fill_constant(
                 shape=[self.batch_size, self.output_size],
                 dtype='float32',
                 value=0),
-            "v": fluid.layers.fill_constant(
+            "v":
+            fluid.layers.fill_constant(
                 shape=[self.batch_size, self.output_size],
                 dtype='float32',
                 value=0),
@@ -166,10 +168,11 @@ def test_dic_pop_2(x):
 
 
 class TestDictPop(unittest.TestCase):
+
     def setUp(self):
         self.input = np.random.random((3)).astype('int32')
-        self.place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
         self._set_test_func()
 
     def _set_test_func(self):
@@ -192,18 +195,19 @@ def _run(self, to_static):
     def test_transformed_result(self):
         dygraph_res = self._run_dygraph()
         static_res = self._run_static()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph result is {}\nstatic result is {}'.format(dygraph_res,
-                                                                   static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph result is {}\nstatic result is {}'.format(
+                            dygraph_res, static_res))
 
 
 class TestDictPop2(TestDictPop):
+
     def _set_test_func(self):
         self.dygraph_func = test_dic_pop_2
 
 
 class NetWithDictPop(paddle.nn.Layer):
+
     def __init__(self):
         super(NetWithDictPop, self).__init__()
 
@@ -220,6 +224,7 @@ def forward(self, x, **kwargs):
 
 
 class TestDictPop3(TestNetWithDict):
+
     def setUp(self):
         self.x = np.array([2, 2]).astype('float32')
 
@@ -235,14 +240,15 @@ def test_ast_to_func(self):
         dygraph_result = self._run_dygraph()
         static_result = self._run_static()
 
-        self.assertTrue(
-            (dygraph_result == static_result).all(),
-            msg="dygraph result: {}\nstatic result: {}".format(dygraph_result,
-                                                               static_result))
+        self.assertTrue((dygraph_result == static_result).all(),
+                        msg="dygraph result: {}\nstatic result: {}".format(
+                            dygraph_result, static_result))
 
 
 class TestDictCmpInFor(unittest.TestCase):
+
     def test_with_for(self):
+
         def func():
             pos = [1, 3]
             neg = [-1, -3]
@@ -259,6 +265,7 @@ def func():
         self.assertEqual(paddle.jit.to_static(func)()['minus'], 8)
 
     def test_with_for_enumerate(self):
+
         def func():
             pos = [1, 3]
             neg = [-1, -3]
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py
index 7383c834ba9a4..d5c8323574732 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py
@@ -26,6 +26,7 @@ def drop_path(x, training=False):
 
 
 class DropPath(paddle.nn.Layer):
+
     def __init__(self):
         super(DropPath, self).__init__()
 
@@ -35,6 +36,7 @@ def forward(self, x):
 
 
 class TestTrainEval(unittest.TestCase):
+
     def setUp(self):
         self.model = DropPath()
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py
index 146608cb07a16..555e71ce9a0ca 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py
@@ -25,10 +25,13 @@
 
 
 class Pool2D(fluid.dygraph.Layer):
+
     def __init__(self):
         super(Pool2D, self).__init__()
-        self.pool2d = fluid.dygraph.Pool2D(
-            pool_size=2, pool_type='avg', pool_stride=1, global_pooling=False)
+        self.pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                           pool_type='avg',
+                                           pool_stride=1,
+                                           global_pooling=False)
 
     @declarative
     def forward(self, x):
@@ -41,6 +44,7 @@ def get_result(x):
 
 
 class Linear(fluid.dygraph.Layer):
+
     def __init__(self, input_dim=10, output_dim=5):
         super(Linear, self).__init__()
         self.fc = fluid.dygraph.Linear(
@@ -60,6 +64,7 @@ def forward(self, x):
 
 
 class TestPool2D(unittest.TestCase):
+
     def setUp(self):
         self.dygraph_class = Pool2D
         self.data = np.random.random((1, 2, 4, 4)).astype('float32')
@@ -87,13 +92,13 @@ def test_declarative(self):
         dygraph_res = self.train_dygraph()
         static_res = self.train_static()
 
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph_res is {}\n static_res is \n{}'.format(dygraph_res,
-                                                                static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph_res is {}\n static_res is \n{}'.format(
+                            dygraph_res, static_res))
 
 
 class TestLinear(TestPool2D):
+
     def setUp(self):
         self.dygraph_class = Linear
         self.data = np.random.random((4, 10)).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
index 337e9cd720229..4c69849ccbda5 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py
@@ -37,7 +37,7 @@ def for_in_range(x):
     return z
 
 
-# 1. for iter list 
+# 1. for iter list
 @paddle.jit.to_static
 def for_iter_list(x_array):
     z = fluid.layers.fill_constant([1], 'int32', 0)
@@ -292,6 +292,7 @@ def for_tuple_as_enumerate_value(x_array):
 
 # 20. test for function in a class
 class ForwardContainsForLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(ForwardContainsForLayer, self).__init__()
         self.high = 5
@@ -307,7 +308,7 @@ def forward(self, x):
         return z
 
 
-# 21. for original list 
+# 21. for original list
 @paddle.jit.to_static
 def for_original_list():
     z = fluid.layers.fill_constant([1], 'int32', 0)
@@ -327,7 +328,8 @@ def for_original_tuple():
 
 # 23. for zip error
 @paddle.jit.to_static(
-    input_spec=[InputSpec(shape=[None, 10]), InputSpec(shape=[None, 10])])
+    input_spec=[InputSpec(shape=[None, 10]),
+                InputSpec(shape=[None, 10])])
 def for_zip_error(x, y):
     for i, j in zip(x, y):
         a = i + j
@@ -336,7 +338,8 @@ def for_zip_error(x, y):
 
 # 24. for zip
 @paddle.jit.to_static(
-    input_spec=[InputSpec(shape=[2, 10]), InputSpec(shape=[2, 10])])
+    input_spec=[InputSpec(shape=[2, 10]),
+                InputSpec(shape=[2, 10])])
 def for_zip(x, y):
     for i, j in zip(x, y):
         a = i + j
@@ -344,9 +347,10 @@ def for_zip(x, y):
 
 
 class TestTransformBase(unittest.TestCase):
+
     def setUp(self):
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.set_input()
         self.set_test_func()
 
@@ -370,6 +374,7 @@ def get_static_output(self):
 
 
 class TestTransform(TestTransformBase):
+
     def transformed_result_compare(self):
         dy_outs = self.get_dygraph_output()
         if not isinstance(dy_outs, (tuple, list)):
@@ -384,6 +389,7 @@ def transformed_result_compare(self):
 
 
 class TestTransformForOriginalList(TestTransform):
+
     def _run(self, to_static):
         program_translator.enable(to_static)
         with fluid.dygraph.guard():
@@ -391,6 +397,7 @@ def _run(self, to_static):
 
 
 class TestTransformError(TestTransformBase):
+
     def transformed_error(self, etype):
         with self.assertRaises(etype):
             dy_out = self.get_dygraph_output()
@@ -398,6 +405,7 @@ def transformed_error(self, etype):
 
 
 class TestForInRange(TestTransform):
+
     def set_input(self):
         self.input = np.array([5])
 
@@ -409,6 +417,7 @@ def test_transformed_result_compare(self):
 
 
 class TestForIterList(TestTransform):
+
     def set_test_func(self):
         self.dygraph_func = for_iter_list
 
@@ -417,16 +426,19 @@ def test_transformed_result_compare(self):
 
 
 class TestForEnumerateSimple(TestForIterList):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_list
 
 
 class TestForInRangeWithBreak(TestForInRange):
+
     def set_test_func(self):
         self.dygraph_func = for_in_range_with_break
 
 
 class TestForIterVarNumpy(TestTransform):
+
     def set_input(self):
         self.input = np.array([1, 2, 3, 4, 5])
 
@@ -438,86 +450,103 @@ def test_transformed_result_compare(self):
 
 
 class TestForEnumerateVarNumpy(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_numpy
 
 
 class TestForEnumerateVarNumpyWithStart(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_numpy_with_start
 
 
 class TestForEnumerateVarNumpyWithBreak(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_numpy_with_break
 
 
 class TestForEnumerateVarNumpyWithContinue(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_numpy_with_continue
 
 
 class TestForEnumerateVarNumpyWithStartAndBreak(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_numpy_with_start_break
 
 
 class TestForEnumerateVarNumpyWithStartAndContinue(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_numpy_with_start_continue
 
 
 class TestForIterVar(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_iter_var
 
 
 class TestForIterVarIdx(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_iter_var_idx
 
 
 class TestForEnumerateVar(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var
 
 
 class TestForEnumerateVarWithNestedRange(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_with_nested_range
 
 
 class TestForIterVarList(TestForInRange):
+
     def set_test_func(self):
         self.dygraph_func = for_iter_var_list
 
 
 class TestForEnumerateVarList(TestForInRange):
+
     def set_test_func(self):
         self.dygraph_func = for_enumerate_var_list
 
 
 class TestForTupleAsIterVar(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_tuple_as_iter_var
 
 
 class TestForTupleAsEnumerateIter(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_tuple_as_enumerate_iter
 
 
 class TestForTupleAsEnumerateValue(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = for_tuple_as_enumerate_value
 
 
 class TestForwardContainsForLayer(TestForIterVarNumpy):
+
     def set_test_func(self):
         self.dygraph_func = ForwardContainsForLayer()
 
 
 class TestForOriginalList(TestTransformForOriginalList):
+
     def set_test_func(self):
         self.dygraph_func = for_original_list
 
@@ -526,6 +555,7 @@ def test_transformed_result_compare(self):
 
 
 class TestForOriginalTuple(TestTransformForOriginalList):
+
     def set_test_func(self):
         self.dygraph_func = for_original_tuple
 
@@ -534,6 +564,7 @@ def test_transformed_result_compare(self):
 
 
 class TestForZip(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py
index 4f7fa65ee9c90..33b50af7c6dcf 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py
@@ -46,6 +46,7 @@ def decorated_call_decorated(x):
 
 
 class DoubleDecorated(object):
+
     @classmethod
     @declarative
     def double_decorated_func1(self, x):
@@ -58,6 +59,7 @@ def double_decorated_func2(self, x):
 
 
 class TestFullNameDecorator(unittest.TestCase):
+
     def test_run_success(self):
         x = np.ones([1, 2]).astype("float32")
         answer = np.zeros([1, 2]).astype("float32")
@@ -74,6 +76,7 @@ def test_run_success(self):
 
 
 class TestImportProgramTranslator(unittest.TestCase):
+
     def test_diff_pkg_same_cls(self):
         dygraph_prog_trans = fluid.dygraph.ProgramTranslator()
         dy_to_stat_prog_trans = fluid.dygraph.dygraph_to_static.ProgramTranslator(
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py
index c242bb34626c1..9fdb6e7c6d36d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py
@@ -24,6 +24,7 @@
 
 
 class TestFunctionSpec(unittest.TestCase):
+
     def test_constructor(self):
         foo_spec = FunctionSpec(foo_func)
         args_name = foo_spec.args_name
@@ -50,10 +51,11 @@ def test_unified_args_and_kwargs(self):
         self.assertTrue(len(kwargs) == 0)
 
         # case 2: foo(a=10, b=20, d=4)
-        args, kwargs = foo_spec.unified_args_and_kwargs(
-            [], {'a': 10,
-                 'b': 20,
-                 'd': 4})
+        args, kwargs = foo_spec.unified_args_and_kwargs([], {
+            'a': 10,
+            'b': 20,
+            'd': 4
+        })
         self.assertTupleEqual(args, (10, 20, 1, 4))
         self.assertTrue(len(kwargs) == 0)
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py
index b5160e210c1b4..f7eccf1f9e7e5 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py
@@ -22,6 +22,7 @@
 
 
 class GradLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(GradLayer, self).__init__()
 
@@ -34,6 +35,7 @@ def forward(self, x):
 
 
 class GradLinearLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(GradLinearLayer, self).__init__()
         self.linear = paddle.nn.Linear(5, 5, bias_attr=False)
@@ -45,12 +47,15 @@ def forward(self, x):
         for i in range(10):
             tmp = self.linear(tmp)
         out = tmp
-        dx = paddle.grad(
-            [out], [x], None, create_graph=True, allow_unused=False)[0]
+        dx = paddle.grad([out], [x],
+                         None,
+                         create_graph=True,
+                         allow_unused=False)[0]
         return dx
 
 
 class NoGradLinearLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(NoGradLinearLayer, self).__init__()
         self.linear = paddle.nn.Linear(5, 5, bias_attr=False)
@@ -67,6 +72,7 @@ def forward(self, x):
 
 
 class TestGrad(unittest.TestCase):
+
     def setUp(self):
         self.func = GradLayer()
         self.x = paddle.ones(shape=[10, 2, 5], dtype='float32')
@@ -86,6 +92,7 @@ def test_forward(self):
 
 
 class TestGradLinear(TestGrad):
+
     def setUp(self):
         self.func = GradLinearLayer()
         self.x = paddle.ones(shape=[10, 2, 5], dtype='float32')
@@ -102,8 +109,7 @@ def tearDown(self):
 
     def test_save_infer_program(self):
         input_spec = [
-            paddle.static.InputSpec(
-                shape=[10, 2, 5], dtype='float32')
+            paddle.static.InputSpec(shape=[10, 2, 5], dtype='float32')
         ]
         paddle.jit.save(self.func, self.infer_model_path, input_spec=input_spec)
         load_func = paddle.jit.load(self.infer_model_path)
@@ -134,6 +140,7 @@ def test_save_train_program(self):
 
 
 class TestNoGradLinear(TestGradLinear):
+
     def setUp(self):
         self.func = NoGradLinearLayer()
         self.x = paddle.ones(shape=[10, 2, 5], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grid_generator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grid_generator.py
index ea2964d4c8b2a..574f65ffeaae0 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grid_generator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grid_generator.py
@@ -27,6 +27,7 @@
 
 
 class GridGenerator(nn.Layer):
+
     def __init__(self, in_channels, num_fiducial):
         super(GridGenerator, self).__init__()
         self.eps = 1e-6
@@ -35,13 +36,14 @@ def __init__(self, in_channels, num_fiducial):
         initializer = nn.initializer.Constant(value=0.0)
         param_attr = ParamAttr(learning_rate=0.0, initializer=initializer)
         bias_attr = ParamAttr(learning_rate=0.0, initializer=initializer)
-        self.fc = nn.Linear(
-            in_channels, 6, weight_attr=param_attr, bias_attr=bias_attr)
+        self.fc = nn.Linear(in_channels,
+                            6,
+                            weight_attr=param_attr,
+                            bias_attr=bias_attr)
 
     @paddle.jit.to_static(input_spec=[
-        paddle.static.InputSpec(
-            shape=[None, 3, 32, 100], dtype='float32'), paddle.static.InputSpec(
-                shape=[32, 100], dtype='float32')
+        paddle.static.InputSpec(shape=[None, 3, 32, 100], dtype='float32'),
+        paddle.static.InputSpec(shape=[32, 100], dtype='float32')
     ])
     def forward(self, batch_C_prime, I_r_size):
         """
@@ -91,17 +93,16 @@ def build_inv_delta_C_paddle(self, C):
                     hat_C[i, j] = r
                     hat_C[j, i] = r
         hat_C = (hat_C**2) * paddle.log(hat_C)
-        delta_C = paddle.concat(
-            [
-                paddle.concat(
-                    [paddle.ones((F, 1)), C, hat_C], axis=1),
-                paddle.concat(
-                    [paddle.zeros((2, 3)), paddle.transpose(
-                        C, perm=[1, 0])],
-                    axis=1), paddle.concat(
-                        [paddle.zeros((1, 3)), paddle.ones((1, F))], axis=1)
-            ],
-            axis=0)
+        delta_C = paddle.concat([
+            paddle.concat([paddle.ones((F, 1)), C, hat_C], axis=1),
+            paddle.concat(
+                [paddle.zeros((2, 3)),
+                 paddle.transpose(C, perm=[1, 0])],
+                axis=1),
+            paddle.concat([paddle.zeros(
+                (1, 3)), paddle.ones((1, F))], axis=1)
+        ],
+                                axis=0)
         inv_delta_C = paddle.inverse(delta_C)
         return inv_delta_C
 
@@ -114,8 +115,8 @@ def build_P_hat_paddle(self, C, P):
         P_diff = P_tile - C_tile
         rbf_norm = paddle.norm(P_diff, p=2, axis=2, keepdim=False)
 
-        rbf = paddle.multiply(
-            paddle.square(rbf_norm), paddle.log(rbf_norm + eps))
+        rbf = paddle.multiply(paddle.square(rbf_norm),
+                              paddle.log(rbf_norm + eps))
         P_hat = paddle.concat([paddle.ones((n, 1)), P, rbf], axis=1)
         return P_hat
 
@@ -128,6 +129,7 @@ def get_expand_tensor(self, batch_C_prime):
 
 
 class TestGridGenerator(unittest.TestCase):
+
     def setUp(self):
         self.x = paddle.uniform(shape=[1, 20, 2], dtype='float32')
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py
index 276aa68e895c6..5ce163c76855d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py
@@ -60,36 +60,42 @@ def test_ast_to_func(self):
 
 
 class TestDygraphIfElse2(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = dyfunc_with_if_else2
 
 
 class TestDygraphIfElse3(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = dyfunc_with_if_else3
 
 
 class TestDygraphIfElseWithListGenerator(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = dyfunc_with_if_else_with_list_geneator
 
 
 class TestDygraphNestedIfElse(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = nested_if_else
 
 
 class TestDygraphNestedIfElse2(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = nested_if_else_2
 
 
 class TestDygraphNestedIfElse3(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = nested_if_else_3
@@ -122,6 +128,7 @@ def body(i, ten, y):
 
 
 class TestDygraphIfElse6(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = dyfunc_ifExp_with_while
@@ -146,48 +153,56 @@ def map_func(func, tensor_list):
 
 
 class TestDygraphIfElse7(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = dyfunc_ifExp
 
 
 class TestDygraphIfElseWithAndOr(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = if_with_and_or
 
 
 class TestDygraphIfElseWithAndOr1(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = if_with_and_or_1
 
 
 class TestDygraphIfElseWithAndOr2(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = if_with_and_or_2
 
 
 class TestDygraphIfElseWithAndOr3(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = if_with_and_or_3
 
 
 class TestDygraphIfElseWithAndOr4(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = if_with_and_or_4
 
 
 class TestDygraphIfElseWithClassVar(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = if_with_class_var
 
 
 class TestDygraphIfTensor(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = if_tensor_case
@@ -242,12 +257,14 @@ def call_external_func(x, label=None):
 
 
 class TestAst2FuncWithExternalFunc(TestDygraphIfElse):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.dyfunc = call_external_func
 
 
 class NetWithExternalFunc(fluid.dygraph.Layer):
+
     @declarative
     def forward(self, x, label=None):
         if fluid.layers.mean(x) < 0:
@@ -268,12 +285,14 @@ def softmax(x):
 
 
 class TestNetWithExternalFunc(TestDygraphIfElseNet):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.Net = NetWithExternalFunc
 
 
 class DiffModeNet1(paddle.nn.Layer):
+
     def __init__(self, mode):
         super(DiffModeNet1, self).__init__()
         self.mode = mode
@@ -290,6 +309,7 @@ def forward(self, x, y):
 
 
 class DiffModeNet2(paddle.nn.Layer):
+
     def __init__(self, mode):
         super(DiffModeNet2, self).__init__()
         self.mode = mode
@@ -328,23 +348,28 @@ def _run(self, mode, to_static):
         return ret.numpy()
 
     def test_train_mode(self):
-        self.assertTrue((self._run(
-            mode='train', to_static=True) == self._run(
-                mode='train', to_static=False)).all())
+        self.assertTrue(
+            (self._run(mode='train',
+                       to_static=True) == self._run(mode='train',
+                                                    to_static=False)).all())
 
     def test_infer_mode(self):
-        self.assertTrue((self._run(
-            mode='infer', to_static=True) == self._run(
-                mode='infer', to_static=False)).all())
+        self.assertTrue(
+            (self._run(mode='infer',
+                       to_static=True) == self._run(mode='infer',
+                                                    to_static=False)).all())
 
 
 class TestDiffModeNet2(TestDiffModeNet):
+
     def init_net(self):
         self.Net = DiffModeNet2
 
 
 class TestNewVarCreateInOneBranch(unittest.TestCase):
+
     def test_var_used_in_another_for(self):
+
         def case_func(training):
             # targets and targets_list is dynamically defined by training
             if training:
@@ -367,6 +392,7 @@ def case_func(training):
 
 
 class TestDy2StIfElseRetInt1(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.random([5]).astype('float32')
         self.dyfunc = dyfunc_ifelse_ret_int1
@@ -385,6 +411,7 @@ def test_ast_to_func(self):
 
 
 class TestDy2StIfElseRetInt2(TestDy2StIfElseRetInt1):
+
     def setUp(self):
         self.x = np.random.random([5]).astype('float32')
         self.dyfunc = dyfunc_ifelse_ret_int2
@@ -396,6 +423,7 @@ def test_ast_to_func(self):
 
 
 class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1):
+
     def setUp(self):
         self.x = np.random.random([5]).astype('float32')
         self.dyfunc = dyfunc_ifelse_ret_int3
@@ -406,6 +434,7 @@ def test_ast_to_func(self):
 
 
 class TestDy2StIfElseRetInt4(TestDy2StIfElseRetInt1):
+
     def setUp(self):
         self.x = np.random.random([5]).astype('float32')
         self.dyfunc = dyfunc_ifelse_ret_int4
@@ -415,20 +444,22 @@ def test_ast_to_func(self):
         with self.assertRaises(TypeError):
             static_func = paddle.jit.to_static(self.dyfunc)
             out = static_func(self.x)
-        # Why need set `_in_declarative_mode_` here? 
-        # In Dy2St we use `with _switch_declarative_mode_guard_()` to indicate 
-        # that the code block is under @to_static, but in this UT 
-        # an exception is thrown during Dy2St, making the `_in_declarative_mode_` 
+        # Why need set `_in_declarative_mode_` here?
+        # In Dy2St we use `with _switch_declarative_mode_guard_()` to indicate
+        # that the code block is under @to_static, but in this UT
+        # an exception is thrown during Dy2St, making the `_in_declarative_mode_`
         # a wrong value. So We need set `_in_declarative_mode_` to False manually.
         paddle.fluid.dygraph.base._in_declarative_mode_ = False
         ProgramTranslator().enable(False)
 
 
 class IfElseNet(paddle.nn.Layer):
+
     def __init__(self):
         super(IfElseNet, self).__init__()
-        self.param = self.create_parameter(
-            shape=[3, 2], dtype='float32', is_bias=False)
+        self.param = self.create_parameter(shape=[3, 2],
+                                           dtype='float32',
+                                           is_bias=False)
 
     @paddle.jit.to_static
     def forward(self, a, b, c):
@@ -444,6 +475,7 @@ def forward(self, a, b, c):
 
 
 class TestDy2StIfElseBackward(unittest.TestCase):
+
     def test_run_backward(self):
         a = paddle.randn((4, 3), dtype='float32')
         a.stop_gradient = False
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py
index 975797a487be7..826063cf67392 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py
@@ -39,9 +39,8 @@ def test_get_name_ids(self):
         source = textwrap.dedent(self.source)
         root = gast.parse(source)
         all_name_ids = get_name_ids([root])
-        self.assertDictEqual(
-            self.transfer_dict(self.all_name_ids),
-            self.transfer_dict(all_name_ids))
+        self.assertDictEqual(self.transfer_dict(self.all_name_ids),
+                             self.transfer_dict(all_name_ids))
 
     def transfer_dict(self, name_ids_dict):
         new_dict = {}
@@ -51,6 +50,7 @@ def transfer_dict(self, name_ids_dict):
 
 
 class TestGetNameIds2(TestGetNameIds):
+
     def setUp(self):
         self.source = """
           def test_fn(x, y):
@@ -64,11 +64,14 @@ def test_fn(x, y):
             return z
         """
         self.all_name_ids = {
-            'x': [
-                gast.Param(), gast.Store(), gast.Load(), gast.Load(),
-                gast.Load()
-            ],
-            'a': [gast.Store(), gast.Load(), gast.Load()],
+            'x':
+            [gast.Param(),
+             gast.Store(),
+             gast.Load(),
+             gast.Load(),
+             gast.Load()],
+            'a': [gast.Store(), gast.Load(),
+                  gast.Load()],
             'y': [
                 gast.Param(),
                 gast.Load(),
@@ -87,6 +90,7 @@ def test_fn(x, y):
 
 
 class TestGetNameIds3(TestGetNameIds):
+
     def setUp(self):
         self.source = """
           def test_fn(x, y):
@@ -119,6 +123,7 @@ def test_fn(x, y):
 
 
 class TestIsControlFlowIf(unittest.TestCase):
+
     def check_false_case(self, code):
         code = textwrap.dedent(code)
         node = gast.parse(code)
@@ -248,14 +253,14 @@ def test_with_node_var_type_map(self):
         var_name_to_type = {"x": {NodeVarType.TENSOR}}
 
         self.assertTrue(
-            is_control_flow_to_transform(
-                node_test, var_name_to_type=var_name_to_type))
+            is_control_flow_to_transform(node_test,
+                                         var_name_to_type=var_name_to_type))
 
         # if x is not a Tensor
         var_name_to_type = {"x": {NodeVarType.NUMPY_NDARRAY}}
         self.assertFalse(
-            is_control_flow_to_transform(
-                node_test, var_name_to_type=var_name_to_type))
+            is_control_flow_to_transform(node_test,
+                                         var_name_to_type=var_name_to_type))
 
     def test_raise_error(self):
         node = "a + b"
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py
index a838ac6842aba..95432b58a33dd 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_isinstance.py
@@ -31,11 +31,13 @@
 
 
 class SimpleReturnLayer(nn.Layer):
+
     def forward(self, x):
         return x
 
 
 class AddAttrLayer(nn.Layer):
+
     def __init__(self):
         super(AddAttrLayer, self).__init__()
         self.attr = None
@@ -46,6 +48,7 @@ def forward(self, x):
 
 
 class IsInstanceLayer(nn.Layer):
+
     def __init__(self, layer):
         super(IsInstanceLayer, self).__init__()
         self.layer = layer
@@ -59,6 +62,7 @@ def forward(self, x):
 
 
 class SequentialLayer(nn.Layer):
+
     def __init__(self, layers):
         super(SequentialLayer, self).__init__()
         self.layers = nn.LayerList(layers)
@@ -84,6 +88,7 @@ def train(model, to_static):
 
 
 class TestIsinstance(unittest.TestCase):
+
     def test_isinstance_simple_return_layer(self):
         model = IsInstanceLayer(SimpleReturnLayer())
         self._test_model(model)
@@ -103,9 +108,8 @@ def test_sequential_layer(self):
     def _test_model(self, model):
         st_out = train(model, to_static=True)
         dy_out = train(model, to_static=False)
-        self.assertTrue(
-            np.allclose(dy_out, st_out),
-            msg="dy_out:\n {}\n st_out:\n{}".format(dy_out, st_out))
+        self.assertTrue(np.allclose(dy_out, st_out),
+                        msg="dy_out:\n {}\n st_out:\n{}".format(dy_out, st_out))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
index e0a9a3ad2af07..ddda462525f31 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -20,6 +20,7 @@
 
 import os
 import tempfile
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 
 import paddle
@@ -43,6 +44,7 @@
 
 
 class DynamicGRU(fluid.dygraph.Layer):
+
     def __init__(self,
                  size,
                  h_0=None,
@@ -55,13 +57,12 @@ def __init__(self,
                  init_size=None):
         super(DynamicGRU, self).__init__()
 
-        self.gru_unit = GRUUnit(
-            size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode)
+        self.gru_unit = GRUUnit(size * 3,
+                                param_attr=param_attr,
+                                bias_attr=bias_attr,
+                                activation=candidate_activation,
+                                gate_activation=gate_activation,
+                                origin_mode=origin_mode)
 
         self.size = size
         self.h_0 = h_0
@@ -81,13 +82,15 @@ def forward(self, inputs):
                 j = i
 
             # input_ = inputs[:, j:j+1, :]  # original code
-            input_ = fluid.layers.slice(
-                inputs, axes=[1], starts=[j], ends=[j + 1])
-            input_ = fluid.layers.reshape(
-                input_, [-1, input_.shape[2]], inplace=False)
+            input_ = fluid.layers.slice(inputs,
+                                        axes=[1],
+                                        starts=[j],
+                                        ends=[j + 1])
+            input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]],
+                                          inplace=False)
             hidden, reset, gate = self.gru_unit(input_, hidden)
-            hidden_ = fluid.layers.reshape(
-                hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            hidden_ = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]],
+                                           inplace=False)
             res.append(hidden_)
 
         if self.is_reverse:
@@ -97,6 +100,7 @@ def forward(self, inputs):
 
 
 class BiGRU(fluid.dygraph.Layer):
+
     def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
         super(BiGRU, self).__init__()
 
@@ -104,8 +108,8 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
             input_dim=input_dim,
             output_dim=grnn_hidden_dim * 3,
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
+                initializer=fluid.initializer.Uniform(low=-init_bound,
+                                                      high=init_bound),
                 regularizer=fluid.regularizer.L2DecayRegularizer(
                     regularization_coeff=1e-4)))
 
@@ -113,8 +117,8 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
             size=grnn_hidden_dim,
             h_0=h_0,
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
+                initializer=fluid.initializer.Uniform(low=-init_bound,
+                                                      high=init_bound),
                 regularizer=fluid.regularizer.L2DecayRegularizer(
                     regularization_coeff=1e-4)))
 
@@ -122,8 +126,8 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
             input_dim=input_dim,
             output_dim=grnn_hidden_dim * 3,
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
+                initializer=fluid.initializer.Uniform(low=-init_bound,
+                                                      high=init_bound),
                 regularizer=fluid.regularizer.L2DecayRegularizer(
                     regularization_coeff=1e-4)))
 
@@ -132,8 +136,8 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
             is_reverse=True,
             h_0=h_0,
             param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-init_bound, high=init_bound),
+                initializer=fluid.initializer.Uniform(low=-init_bound,
+                                                      high=init_bound),
                 regularizer=fluid.regularizer.L2DecayRegularizer(
                     regularization_coeff=1e-4)))
 
@@ -149,6 +153,7 @@ def forward(self, input_feature):
 
 
 class LinearChainCRF(fluid.dygraph.Layer):
+
     def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
         super(LinearChainCRF, self).__init__()
 
@@ -191,20 +196,22 @@ def forward(self, input, label, length=None):
         }
         if length is not None:
             this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='linear_chain_crf',
-            inputs=this_inputs,
-            outputs={
-                "Alpha": [alpha],
-                "EmissionExps": [emission_exps],
-                "TransitionExps": transition_exps,
-                "LogLikelihood": log_likelihood
-            },
-            attrs={"is_test": self._is_test, })
+        self._helper.append_op(type='linear_chain_crf',
+                               inputs=this_inputs,
+                               outputs={
+                                   "Alpha": [alpha],
+                                   "EmissionExps": [emission_exps],
+                                   "TransitionExps": transition_exps,
+                                   "LogLikelihood": log_likelihood
+                               },
+                               attrs={
+                                   "is_test": self._is_test,
+                               })
         return log_likelihood
 
 
 class CRFDecoding(fluid.dygraph.Layer):
+
     def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
         super(CRFDecoding, self).__init__()
 
@@ -239,16 +246,20 @@ def forward(self, input, label=None, length=None):
         }
         if length is not None:
             this_inputs['Length'] = [length]
-        self._helper.append_op(
-            type='crf_decoding',
-            inputs=this_inputs,
-            outputs={"ViterbiPath": [viterbi_path]},
-            attrs={"is_test": self._is_test, })
+        self._helper.append_op(type='crf_decoding',
+                               inputs=this_inputs,
+                               outputs={"ViterbiPath": [viterbi_path]},
+                               attrs={
+                                   "is_test": self._is_test,
+                               })
         return viterbi_path
 
 
 class ChunkEval(fluid.dygraph.Layer):
-    def __init__(self, num_chunk_types, chunk_scheme,
+
+    def __init__(self,
+                 num_chunk_types,
+                 chunk_scheme,
                  excluded_chunk_types=None):
         super(ChunkEval, self).__init__()
         self.num_chunk_types = num_chunk_types
@@ -257,10 +268,11 @@ def __init__(self, num_chunk_types, chunk_scheme,
 
     def forward(self, input, label, seq_length=None):
         if _non_static_mode():
-            return _C_ops.chunk_eval(
-                input, label, seq_length, "num_chunk_types",
-                self.num_chunk_types, "chunk_scheme", self.chunk_scheme,
-                "excluded_chunk_types", self.excluded_chunk_types or [])
+            return _C_ops.chunk_eval(input, label, seq_length,
+                                     "num_chunk_types", self.num_chunk_types,
+                                     "chunk_scheme", self.chunk_scheme,
+                                     "excluded_chunk_types",
+                                     self.excluded_chunk_types or [])
 
         precision = self._helper.create_variable_for_type_inference(
             dtype="float32")
@@ -279,27 +291,30 @@ def forward(self, input, label, seq_length=None):
         if seq_length is not None:
             this_input["SeqLength"] = [seq_length]
 
-        self._helper.append_op(
-            type='chunk_eval',
-            inputs=this_input,
-            outputs={
-                "Precision": [precision],
-                "Recall": [recall],
-                "F1-Score": [f1_score],
-                "NumInferChunks": [num_infer_chunks],
-                "NumLabelChunks": [num_label_chunks],
-                "NumCorrectChunks": [num_correct_chunks]
-            },
-            attrs={
-                "num_chunk_types": self.num_chunk_types,
-                "chunk_scheme": self.chunk_scheme,
-                "excluded_chunk_types": self.excluded_chunk_types or []
-            })
+        self._helper.append_op(type='chunk_eval',
+                               inputs=this_input,
+                               outputs={
+                                   "Precision": [precision],
+                                   "Recall": [recall],
+                                   "F1-Score": [f1_score],
+                                   "NumInferChunks": [num_infer_chunks],
+                                   "NumLabelChunks": [num_label_chunks],
+                                   "NumCorrectChunks": [num_correct_chunks]
+                               },
+                               attrs={
+                                   "num_chunk_types":
+                                   self.num_chunk_types,
+                                   "chunk_scheme":
+                                   self.chunk_scheme,
+                                   "excluded_chunk_types":
+                                   self.excluded_chunk_types or []
+                               })
         return (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
                 num_correct_chunks)
 
 
 class LexNet(fluid.dygraph.Layer):
+
     def __init__(self, args, length=None):
         super(LexNet, self).__init__()
         """
@@ -325,11 +340,11 @@ def __init__(self, args, length=None):
         self.word_embedding = Embedding(
             size=[self.vocab_size, self.word_emb_dim],
             dtype='float32',
-            param_attr=fluid.ParamAttr(
-                learning_rate=self.emb_lr,
-                name="word_emb",
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound)))
+            param_attr=fluid.ParamAttr(learning_rate=self.emb_lr,
+                                       name="word_emb",
+                                       initializer=fluid.initializer.Uniform(
+                                           low=-self.init_bound,
+                                           high=self.init_bound)))
 
         h_0 = np.zeros((args.batch_size, self.grnn_hidden_dim), dtype="float32")
         h_0 = to_variable(h_0)
@@ -340,39 +355,34 @@ def __init__(self, args, length=None):
                 self.bigru_units.append(
                     self.add_sublayer(
                         "bigru_units%d" % i,
-                        BiGRU(
-                            self.grnn_hidden_dim,
-                            self.grnn_hidden_dim,
-                            self.init_bound,
-                            h_0=h_0)))
+                        BiGRU(self.grnn_hidden_dim,
+                              self.grnn_hidden_dim,
+                              self.init_bound,
+                              h_0=h_0)))
             else:
                 self.bigru_units.append(
                     self.add_sublayer(
                         "bigru_units%d" % i,
-                        BiGRU(
-                            self.grnn_hidden_dim * 2,
-                            self.grnn_hidden_dim,
-                            self.init_bound,
-                            h_0=h_0)))
-
-        self.fc = Linear(
-            input_dim=self.grnn_hidden_dim * 2,
-            output_dim=self.num_labels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Uniform(
-                    low=-self.init_bound, high=self.init_bound),
-                regularizer=fluid.regularizer.L2DecayRegularizer(
-                    regularization_coeff=1e-4)))
-
-        self.linear_chain_crf = LinearChainCRF(
-            param_attr=fluid.ParamAttr(
-                name='linear_chain_crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
-
-        self.crf_decoding = CRFDecoding(
-            param_attr=fluid.ParamAttr(
-                name='crfw', learning_rate=self.crf_lr),
-            size=self.num_labels)
+                        BiGRU(self.grnn_hidden_dim * 2,
+                              self.grnn_hidden_dim,
+                              self.init_bound,
+                              h_0=h_0)))
+
+        self.fc = Linear(input_dim=self.grnn_hidden_dim * 2,
+                         output_dim=self.num_labels,
+                         param_attr=fluid.ParamAttr(
+                             initializer=fluid.initializer.Uniform(
+                                 low=-self.init_bound, high=self.init_bound),
+                             regularizer=fluid.regularizer.L2DecayRegularizer(
+                                 regularization_coeff=1e-4)))
+
+        self.linear_chain_crf = LinearChainCRF(param_attr=fluid.ParamAttr(
+            name='linear_chain_crfw', learning_rate=self.crf_lr),
+                                               size=self.num_labels)
+
+        self.crf_decoding = CRFDecoding(param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=self.crf_lr),
+                                        size=self.num_labels)
         # share weight
         self.crf_decoding.weight = self.linear_chain_crf.weight
 
@@ -390,8 +400,9 @@ def forward(self, word, target, length=None):
 
         emission = self.fc(bigru_output)
 
-        crf_cost = self.linear_chain_crf(
-            input=emission, label=target, length=length)
+        crf_cost = self.linear_chain_crf(input=emission,
+                                         label=target,
+                                         length=length)
         avg_cost = fluid.layers.mean(x=crf_cost)
         crf_decode = self.crf_decoding(input=emission, length=length)
         return avg_cost, crf_decode
@@ -420,8 +431,8 @@ def __reader__():
             cur_len = local_random.randint(3, max_seq_len)
             word_ids = local_random.randint(0, vocab_size,
                                             [cur_len]).astype('int64').tolist()
-            label_ids = local_random.randint(0, num_labels,
-                                             [cur_len]).astype('int64').tolist()
+            label_ids = local_random.randint(
+                0, num_labels, [cur_len]).astype('int64').tolist()
             batch.append((word_ids, label_ids))
             init_lens.append(cur_len)
             if len(batch) == batch_size:
@@ -446,8 +457,9 @@ def __reader__():
 
 
 def create_dataloader(reader, place):
-    data_loader = fluid.io.DataLoader.from_generator(
-        capacity=16, use_double_buffer=True, iterable=True)
+    data_loader = fluid.io.DataLoader.from_generator(capacity=16,
+                                                     use_double_buffer=True,
+                                                     iterable=True)
 
     data_loader.set_sample_list_generator(reader, places=place)
 
@@ -455,10 +467,11 @@ def create_dataloader(reader, place):
 
 
 class TestLACModel(unittest.TestCase):
+
     def setUp(self):
         self.args = Args()
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.temp_dir = tempfile.TemporaryDirectory()
         self.model_save_dir = os.path.join(self.temp_dir.name, 'inference')
         self.model_save_prefix = os.path.join(self.model_save_dir, 'lac')
@@ -468,8 +481,8 @@ def setUp(self):
 
     def train(self, args, to_static):
         program_translator.enable(to_static)
-        place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             paddle.seed(SEED)
             paddle.framework.random._manual_program_seed(SEED)
@@ -482,8 +495,8 @@ def train(self, args, to_static):
             optimizer = fluid.optimizer.AdamOptimizer(
                 learning_rate=args.base_learning_rate,
                 parameter_list=model.parameters())
-            chunk_eval = ChunkEval(
-                int(math.ceil((args.num_labels - 1) / 2.0)), "IOB")
+            chunk_eval = ChunkEval(int(math.ceil((args.num_labels - 1) / 2.0)),
+                                   "IOB")
 
             step = 0
             chunk_evaluator = fluid.metrics.ChunkEvaluator()
@@ -505,8 +518,10 @@ def train(self, args, to_static):
 
                     if step % args.print_steps == 0:
                         (precision, recall, f1_score, num_infer_chunks,
-                         num_label_chunks, num_correct_chunks) = chunk_eval(
-                             input=crf_decode, label=targets, seq_length=length)
+                         num_label_chunks,
+                         num_correct_chunks) = chunk_eval(input=crf_decode,
+                                                          label=targets,
+                                                          seq_length=length)
                         outputs = [avg_cost, precision, recall, f1_score]
                         avg_cost, precision, recall, f1_score = [
                             np.mean(x.numpy()) for x in outputs
@@ -534,28 +549,27 @@ def train(self, args, to_static):
     def test_train(self):
         st_out = self.train(self.args, to_static=True)
         dy_out = self.train(self.args, to_static=False)
-        self.assertTrue(
-            np.allclose(dy_out, st_out),
-            msg="dygraph output:\n{},\nstatic output:\n {}.".format(dy_out,
-                                                                    st_out))
+        self.assertTrue(np.allclose(dy_out, st_out),
+                        msg="dygraph output:\n{},\nstatic output:\n {}.".format(
+                            dy_out, st_out))
         # Prediction needs trained models, so put `test_predict` at last of `test_train`
         # self.verify_predict()
 
     def verify_predict(self):
-        reader = get_random_input_data(
-            self.args.batch_size, self.args.vocab_size, self.args.num_labels)
+        reader = get_random_input_data(self.args.batch_size,
+                                       self.args.vocab_size,
+                                       self.args.num_labels)
         for batch in reader():
             batch = [np.vstack(var) for var in zip(*batch)]
             dy_pre = self.predict_dygraph(batch)
             st_pre = self.predict_static(batch)
             dy_jit_pre = self.predict_dygraph_jit(batch)
-            self.assertTrue(
-                np.allclose(dy_pre, st_pre),
-                msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
-            self.assertTrue(
-                np.allclose(dy_jit_pre, st_pre),
-                msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre,
-                                                               st_pre))
+            self.assertTrue(np.allclose(dy_pre, st_pre),
+                            msg="dy_pre:\n {}\n, st_pre: \n{}.".format(
+                                dy_pre, st_pre))
+            self.assertTrue(np.allclose(dy_jit_pre, st_pre),
+                            msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(
+                                dy_jit_pre, st_pre))
 
     def predict_dygraph(self, batch):
         words, targets, length = batch
@@ -567,8 +581,8 @@ def predict_dygraph(self, batch):
             model.set_dict(model_dict)
             model.eval()
 
-            _, pred_res = model(
-                to_variable(words), to_variable(targets), to_variable(length))
+            _, pred_res = model(to_variable(words), to_variable(targets),
+                                to_variable(length))
 
             return pred_res.numpy()
 
@@ -580,19 +594,19 @@ def predict_static(self, batch):
         paddle.enable_static()
         exe = fluid.Executor(self.place)
         # load inference model
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.model_save_dir,
-             executor=exe,
-             model_filename=self.model_filename,
-             params_filename=self.params_filename)
+        [inference_program, feed_target_names, fetch_targets
+         ] = fluid.io.load_inference_model(self.model_save_dir,
+                                           executor=exe,
+                                           model_filename=self.model_filename,
+                                           params_filename=self.params_filename)
 
         words, targets, length = batch
-        pred_res = exe.run(
-            inference_program,
-            feed={feed_target_names[0]: words,
-                  feed_target_names[1]: length},
-            fetch_list=fetch_targets)
+        pred_res = exe.run(inference_program,
+                           feed={
+                               feed_target_names[0]: words,
+                               feed_target_names[1]: length
+                           },
+                           fetch_list=fetch_targets)
         return pred_res[0]
 
     def predict_dygraph_jit(self, batch):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py
index 1ab10461fd297..7eccbedf4d219 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py
@@ -80,11 +80,12 @@ def call_lambda_with_ifExpr2(x):
 
 
 class TestLambda(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.random([10, 16]).astype('float32')
         self.x = np.array([1, 3]).astype('float32')
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.init_func()
 
     def init_func(self):
@@ -108,8 +109,8 @@ def run_dygraph(self, func, to_static=False):
 
     def test_ast_to_func(self):
         for func in self.dyfuncs:
-            self.assertTrue((self.run_dygraph(func) == self.run_static(func)
-                             ).all())
+            self.assertTrue(
+                (self.run_dygraph(func) == self.run_static(func)).all())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py
index 357d9611053da..b06b01a46fee8 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,6 +29,7 @@ def forward_pre_hook1(layer, input):
 
 
 class SimpleNet(paddle.nn.Layer):
+
     def __init__(self, ):
         super(SimpleNet, self).__init__()
         self.fc1 = paddle.nn.Linear(10, 10)
@@ -52,6 +53,7 @@ def forward(self, x):
 
 
 class TestNestLayerHook(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2022)
         self.x = paddle.randn([4, 10])
@@ -83,12 +85,12 @@ def test_hook(self):
         st_out = self.train_net(to_static=True)
         load_out = self.load_train()
         print(st_out, dy_out, load_out)
-        self.assertTrue(
-            np.allclose(st_out, dy_out),
-            msg='dygraph_res is {}\nstatic_res is {}'.format(dy_out, st_out))
-        self.assertTrue(
-            np.allclose(st_out, load_out),
-            msg='load_out is {}\nstatic_res is {}'.format(load_out, st_out))
+        self.assertTrue(np.allclose(st_out, dy_out),
+                        msg='dygraph_res is {}\nstatic_res is {}'.format(
+                            dy_out, st_out))
+        self.assertTrue(np.allclose(st_out, load_out),
+                        msg='load_out is {}\nstatic_res is {}'.format(
+                            load_out, st_out))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py
index 00a1b018376c6..28f79b57b6ba3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py
@@ -42,9 +42,10 @@ def len_with_lod_tensor_array(x):
 
 
 class TestLen(unittest.TestCase):
+
     def setUp(self):
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.x_data = np.random.random([10, 16]).astype('float32')
         self.init_func()
 
@@ -69,6 +70,7 @@ def test_len(self):
 
 
 class TestLenWithTensorArray(TestLen):
+
     def init_func(self):
         self.func = len_with_lod_tensor_array
 
@@ -78,11 +80,10 @@ def init_func(self):
 def len_with_selected_rows(place):
     block = fluid.default_main_program().global_block()
     # create selected_rows variable
-    var = block.create_var(
-        name="X",
-        dtype="float32",
-        persistable=True,
-        type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
+    var = block.create_var(name="X",
+                           dtype="float32",
+                           persistable=True,
+                           type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
     # y is Variable(SelectedRows)
     y = fluid.layers.merge_selected_rows(var)
     y_len = convert_call(len)(y)
@@ -108,9 +109,10 @@ def len_with_selected_rows(place):
 
 
 class TestLenWithSelectedRows(unittest.TestCase):
+
     def setUp(self):
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
     def test_len(self):
         selected_rows_var_len, var_tensor_len = len_with_selected_rows(
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py
index ba1f5ed2b3ead..55dff1c92bb20 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py
@@ -44,8 +44,7 @@ def test_list_append_in_if(x):
         a.append(x)
     else:
         a.append(
-            fluid.layers.fill_constant(
-                shape=[1, 2], value=9, dtype="int64"))
+            fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int64"))
     # TODO(Aurelius84): Currently, run_program_op doesn't support output LoDTensorArray.
     return a[0]
 
@@ -101,8 +100,9 @@ def test_list_append_in_for_loop_with_concat(x, iter_num):
 
 def test_list_append_in_while_loop(x, iter_num):
     x = fluid.dygraph.to_variable(x)
-    iter_num = fluid.layers.fill_constant(
-        shape=[1], value=iter_num, dtype="int32")
+    iter_num = fluid.layers.fill_constant(shape=[1],
+                                          value=iter_num,
+                                          dtype="int32")
     a = []
     i = 0
     while i < iter_num:
@@ -113,8 +113,9 @@ def test_list_append_in_while_loop(x, iter_num):
 
 def test_list_append_in_while_loop_with_stack(x, iter_num):
     x = fluid.dygraph.to_variable(x)
-    iter_num = fluid.layers.fill_constant(
-        shape=[1], value=iter_num, dtype="int32")
+    iter_num = fluid.layers.fill_constant(shape=[1],
+                                          value=iter_num,
+                                          dtype="int32")
     a = []
     i = 0
     while i < iter_num.numpy()[0]:
@@ -182,8 +183,9 @@ def test_list_pop_in_for_loop(x, iter_num):
 
 def test_list_pop_in_while_loop(x, iter_num):
     x = fluid.dygraph.to_variable(x)
-    iter_num = fluid.layers.fill_constant(
-        shape=[1], value=iter_num, dtype="int32")
+    iter_num = fluid.layers.fill_constant(shape=[1],
+                                          value=iter_num,
+                                          dtype="int32")
     a = []
     b = [x]
     b.append(x)
@@ -200,9 +202,10 @@ def test_list_pop_in_while_loop(x, iter_num):
 
 
 class TestListWithoutControlFlow(unittest.TestCase):
+
     def setUp(self):
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
         self.init_data()
         self.init_dygraph_func()
@@ -249,16 +252,18 @@ def test_transformed_static_result(self):
             for stat_res, dy_res in zip(static_res_list, dygraph_res_list):
                 self.assertTrue(
                     np.allclose(stat_res, dy_res),
-                    msg='dygraph_res is {}\nstatic_res is {}'.format(stat_res,
-                                                                     dy_res))
+                    msg='dygraph_res is {}\nstatic_res is {}'.format(
+                        stat_res, dy_res))
 
 
 class TestListInIf(TestListWithoutControlFlow):
+
     def init_dygraph_func(self):
         self.all_dygraph_funcs = [test_list_append_in_if, test_list_pop_in_if]
 
 
 class TestListInWhileLoop(TestListWithoutControlFlow):
+
     def init_data(self):
         self.input = np.random.random((3)).astype('int32')
         self.iter_num = 3
@@ -279,11 +284,13 @@ def train(self, to_static=False):
 
 
 class TestListInWhileLoopWithStack(TestListInWhileLoop):
+
     def init_dygraph_func(self):
         self.all_dygraph_funcs = [test_list_append_in_while_loop_with_stack]
 
 
 class TestListInForLoop(TestListInWhileLoop):
+
     def init_dygraph_func(self):
         self.all_dygraph_funcs = [
             test_list_append_in_for_loop, test_list_pop_in_for_loop
@@ -291,11 +298,15 @@ def init_dygraph_func(self):
 
 
 class TestListInForLoopWithConcat(TestListInWhileLoopWithStack):
+
     def init_dygraph_func(self):
-        self.all_dygraph_funcs = [test_list_append_in_for_loop_with_concat, ]
+        self.all_dygraph_funcs = [
+            test_list_append_in_for_loop_with_concat,
+        ]
 
 
 class TestListInForLoopWithSubscript(TestListWithoutControlFlow):
+
     def init_dygraph_func(self):
         self.all_dygraph_funcs = [
             test_list_append_in_for_subscript,
@@ -307,6 +318,7 @@ def init_data(self):
 
 
 class ListWithCondNet(paddle.nn.Layer):
+
     def __init__(self):
         super(ListWithCondNet, self).__init__()
 
@@ -330,6 +342,7 @@ def forward(self, x, index):
 
 
 class TestListWithCondGradInferVarType(unittest.TestCase):
+
     def test_to_static(self):
         net = ListWithCondNet()
         x = paddle.to_tensor([2, 3, 4], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
index 385b7ce204a86..e1ea7d99d8252 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
@@ -28,6 +28,7 @@
 
 
 class TestLoggingUtils(unittest.TestCase):
+
     def setUp(self):
         self.verbosity_level = 1
         self.code_level = 3
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py
index b11e9441c8c0e..0a510eb81b1e0 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py
@@ -172,10 +172,11 @@ def test_shape_not_equal(x):
 
 
 class TestLogicalBase(unittest.TestCase):
+
     def setUp(self):
         self.input = np.array([3]).astype('int32')
-        self.place = paddle.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else paddle.CPUPlace()
         self._set_test_func()
 
     def _set_test_func(self):
@@ -196,69 +197,77 @@ def _run_static(self):
 
 
 class TestLogicalNot(TestLogicalBase):
+
     def _set_test_func(self):
         self.dygraph_func = test_logical_not
 
     def test_transformed_result(self):
         dygraph_res = self._run_dygraph()
         static_res = self._run_static()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph result is {}\nstatic_result is {}'.format(dygraph_res,
-                                                                   static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph result is {}\nstatic_result is {}'.format(
+                            dygraph_res, static_res))
 
 
 class TestLogicalNot2(TestLogicalBase):
+
     def _set_test_func(self):
         self.dygraph_func = test_logical_not_2
 
     def test_transformed_result(self):
         dygraph_res = self._run_dygraph()
         static_res = self._run_static()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph result is {}\nstatic_result is {}'.format(dygraph_res,
-                                                                   static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph result is {}\nstatic_result is {}'.format(
+                            dygraph_res, static_res))
 
 
 class TestLogicalAnd(TestLogicalNot):
+
     def _set_test_func(self):
         self.dygraph_func = test_logical_and
 
 
 class TestLogicalAnd2(TestLogicalNot):
+
     def _set_test_func(self):
         self.dygraph_func = test_logical_and_2
 
 
 class TestLogicalOr(TestLogicalNot):
+
     def _set_test_func(self):
         self.dygraph_func = test_logical_or
 
 
 class TestLogicalOr2(TestLogicalNot):
+
     def _set_test_func(self):
         self.dygraph_func = test_logical_or_2
 
 
 class TestLogicalNotAndOr(TestLogicalNot):
+
     def _set_test_func(self):
         self.dygraph_func = test_logical_not_and_or
 
 
 class TestShapeEqual(TestLogicalNot):
+
     def _set_test_func(self):
         self.input = np.ones([1, 2, 3]).astype('float32')
         self.dygraph_func = test_shape_equal
 
 
 class TestShapeNotEqual(TestLogicalNot):
+
     def _set_test_func(self):
         self.input = np.ones([1, 2, 3]).astype('float32')
         self.dygraph_func = test_shape_not_equal
 
 
 class TestCmpopNodeToStr(unittest.TestCase):
+
     def test_exception(self):
         with self.assertRaises(KeyError):
             cmpop_node_to_str(gast.Or())
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py
index 93eb1247888b5..56e9cabbef485 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py
@@ -150,7 +150,9 @@ def while_loop_bool_op2(x):
 
 
 def while_loop_class_var(x):
+
     class Foo(object):
+
         def __init__(self):
             self.a = 3
             self.b = 4
@@ -176,7 +178,9 @@ def loop_var_contains_property(x):
 
 
 def for_loop_class_var(max_len):
+
     class Foo(object):
+
         def __init__(self):
             self.a = 3
             self.b = 4
@@ -185,8 +189,9 @@ def __init__(self):
     foo = Foo()
 
     # Use `to_variable` so that static analysis can analyze the type of X is Tensor
-    max_len = fluid.layers.fill_constant(
-        shape=[1], value=max_len, dtype="int32")
+    max_len = fluid.layers.fill_constant(shape=[1],
+                                         value=max_len,
+                                         dtype="int32")
 
     for i in range(max_len):
         foo.b = fluid.layers.zeros(shape=[1], dtype='float32')
@@ -225,13 +230,16 @@ def for_loop_dufunc_with_listcomp(array):
 
 
 class TestNameVisitor(unittest.TestCase):
+
     def setUp(self):
         self.loop_funcs = [
             while_loop_dyfunc, for_loop_dyfunc, while_loop_dyfunc_with_none,
             for_loop_dufunc_with_listcomp
         ]
         self.loop_var_names = [
-            set(["i", "x"]), set(["i", "ret", "max_len"]), set(["i", "x"]),
+            set(["i", "x"]),
+            set(["i", "ret", "max_len"]),
+            set(["i", "x"]),
             set(["j", "array", "res", "x"])
         ]
         self.create_var_names = [set(), set(["ret"]), set(), set(["res", "x"])]
@@ -258,7 +266,9 @@ def test_nested_loop_vars(self):
         name_visitor = NameVisitor(gast_root)
 
         self.loop_var_names = [
-            set(["j", "two"]), set(["i", "three", "b"]), set(["i", "j"])
+            set(["j", "two"]),
+            set(["i", "three", "b"]),
+            set(["i", "j"])
         ]
         self.create_var_names = [set(), set(["b"]), set()]
 
@@ -275,15 +285,17 @@ def test_nested_loop_vars(self):
                 self.assertEqual(
                     create_var_names,
                     self.create_var_names[i],
-                    msg="i = {}\ncreate_var_names : {}, \nexpected create_var_names : {}".
-                    format(i, create_var_names, self.create_var_names[i]))
+                    msg=
+                    "i = {}\ncreate_var_names : {}, \nexpected create_var_names : {}"
+                    .format(i, create_var_names, self.create_var_names[i]))
                 i += 1
 
 
 class TestTransformWhileLoop(unittest.TestCase):
+
     def setUp(self):
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.x = np.zeros(shape=(1), dtype=np.int32)
         self._init_dyfunc()
 
@@ -316,49 +328,58 @@ def test_ast_to_func(self):
 
 
 class TestTransformWhileLoopWithoutTensor(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = while_loop_dyfunc_without_tensor
 
 
 class TestTransformWhileLoopWithConflicVar(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = while_loop_dyfun_with_conflict_var
 
 
 class TestTransformWhileLoopWithNone(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = while_loop_dyfunc_with_none
 
 
 class TestForBreakSingleReturn(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = for_break_single_return
 
 
 class TestWhileLoopBoolOp(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = while_loop_bool_op
 
 
 class TestWhileLoopBoolOp2(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = while_loop_bool_op2
 
 
 class TestWhileLoopClassVar(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = while_loop_class_var
 
 
 class TestLoopVarContainsProperty(TestTransformWhileLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = loop_var_contains_property
 
 
 class TestTransformForLoop(unittest.TestCase):
+
     def setUp(self):
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.len = 100
         self._init_dyfunc()
 
@@ -384,31 +405,37 @@ def test_ast_to_func(self):
 
 
 class TestTransformForLoop2(TestTransformForLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = for_loop_dyfunc2
 
 
 class TestTransformForLoop3(TestTransformForLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = for_loop_dyfunc3
 
 
 class TestTransformForLoop4(TestTransformForLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = for_loop_dyfunc4
 
 
 class TestClassVarInForLoop(TestTransformForLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = for_loop_class_var
 
 
 class TestVarCreateInForLoop(TestTransformForLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = var_create_in_for_loop
 
 
 class TestErrorInForLoop(TestTransformForLoop):
+
     def _init_dyfunc(self):
         self.dyfunc = for_loop_dyfunc_not_support
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py
index 8d54e199800cd..6017585156198 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py
@@ -21,10 +21,13 @@
 
 
 class LSTMLayer(nn.Layer):
+
     def __init__(self, in_channels, hidden_size):
         super(LSTMLayer, self).__init__()
-        self.cell = nn.LSTM(
-            in_channels, hidden_size, direction='bidirectional', num_layers=2)
+        self.cell = nn.LSTM(in_channels,
+                            hidden_size,
+                            direction='bidirectional',
+                            num_layers=2)
 
     def forward(self, x):
         x, _ = self.cell(x)
@@ -32,6 +35,7 @@ def forward(self, x):
 
 
 class Net(nn.Layer):
+
     def __init__(self, in_channels, hidden_size):
         super(Net, self).__init__()
         self.lstm = LSTMLayer(in_channels, hidden_size)
@@ -42,6 +46,7 @@ def forward(self, x):
 
 
 class TestLstm(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
 
@@ -64,10 +69,9 @@ def run_lstm(self, to_static):
     def test_lstm_to_static(self):
         dygraph_out = self.run_lstm(to_static=False)
         static_out = self.run_lstm(to_static=True)
-        self.assertTrue(
-            np.allclose(dygraph_out, static_out),
-            msg='dygraph_out is {}\n static_out is \n{}'.format(dygraph_out,
-                                                                static_out))
+        self.assertTrue(np.allclose(dygraph_out, static_out),
+                        msg='dygraph_out is {}\n static_out is \n{}'.format(
+                            dygraph_out, static_out))
 
     def test_save_in_eval(self, with_training=True):
         paddle.jit.ProgramTranslator().enable(True)
@@ -94,23 +98,22 @@ def test_save_in_eval(self, with_training=True):
         load_net = paddle.jit.load(model_path)
 
         static_out = load_net(x)
-        self.assertTrue(
-            np.allclose(dygraph_out.numpy(), static_out.numpy()),
-            msg='dygraph_out is {}\n static_out is \n{}'.format(dygraph_out,
-                                                                static_out))
+        self.assertTrue(np.allclose(dygraph_out.numpy(), static_out.numpy()),
+                        msg='dygraph_out is {}\n static_out is \n{}'.format(
+                            dygraph_out, static_out))
         # switch back into train mode.
         net.train()
         train_out = net(x)
-        self.assertTrue(
-            np.allclose(dygraph_out.numpy(), train_out.numpy()),
-            msg='dygraph_out is {}\n static_out is \n{}'.format(dygraph_out,
-                                                                train_out))
+        self.assertTrue(np.allclose(dygraph_out.numpy(), train_out.numpy()),
+                        msg='dygraph_out is {}\n static_out is \n{}'.format(
+                            dygraph_out, train_out))
 
     def test_save_without_training(self):
         self.test_save_in_eval(with_training=False)
 
 
 class LinearNet(nn.Layer):
+
     def __init__(self):
         super(LinearNet, self).__init__()
         self.fc = nn.Linear(10, 12)
@@ -124,6 +127,7 @@ def forward(self, x):
 
 
 class TestSaveInEvalMode(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
 
@@ -156,13 +160,13 @@ def test_save_in_eval(self):
         eval_out = net(x)
 
         infer_out = load_net(x)
-        self.assertTrue(
-            np.allclose(eval_out.numpy(), infer_out.numpy()),
-            msg='eval_out is {}\n infer_out is \n{}'.format(eval_out,
-                                                            infer_out))
+        self.assertTrue(np.allclose(eval_out.numpy(), infer_out.numpy()),
+                        msg='eval_out is {}\n infer_out is \n{}'.format(
+                            eval_out, infer_out))
 
 
 class TestEvalAfterSave(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
index 2bb3879efb753..35c8b4d952295 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
@@ -40,6 +40,7 @@
 
 
 class SimpleImgConvPool(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -59,26 +60,24 @@ def __init__(self,
                  bias_attr=None):
         super(SimpleImgConvPool, self).__init__()
 
-        self._conv2d = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            act=act,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
+        self._conv2d = Conv2D(num_channels=num_channels,
+                              num_filters=num_filters,
+                              filter_size=filter_size,
+                              stride=conv_stride,
+                              padding=conv_padding,
+                              dilation=conv_dilation,
+                              groups=conv_groups,
+                              param_attr=None,
+                              bias_attr=None,
+                              act=act,
+                              use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(pool_size=pool_size,
+                              pool_type=pool_type,
+                              pool_stride=pool_stride,
+                              pool_padding=pool_padding,
+                              global_pooling=global_pooling,
+                              use_cudnn=use_cudnn)
 
     def forward(self, inputs):
         x = self._conv2d(inputs)
@@ -87,25 +86,33 @@ def forward(self, inputs):
 
 
 class MNIST(fluid.dygraph.Layer):
+
     def __init__(self):
         super(MNIST, self).__init__()
 
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            1, 20, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(1,
+                                                         20,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            20, 50, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(20,
+                                                         50,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
         self.pool_2_shape = 50 * 4 * 4
         SIZE = 10
         scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
-        self._fc = Linear(
-            self.pool_2_shape,
-            10,
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.NormalInitializer(
-                    loc=0.0, scale=scale)),
-            act="softmax")
+        self._fc = Linear(self.pool_2_shape,
+                          10,
+                          param_attr=fluid.param_attr.ParamAttr(
+                              initializer=fluid.initializer.NormalInitializer(
+                                  loc=0.0, scale=scale)),
+                          act="softmax")
 
     def forward(self, inputs, label=None):
         x = self.inference(inputs)
@@ -127,15 +134,15 @@ def inference(self, inputs):
 
 
 class TestMNIST(unittest.TestCase):
+
     def setUp(self):
         self.epoch_num = 1
         self.batch_size = 64
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        self.train_reader = paddle.batch(
-            paddle.dataset.mnist.train(),
-            batch_size=self.batch_size,
-            drop_last=True)
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
+        self.train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                         batch_size=self.batch_size,
+                                         drop_last=True)
         self.temp_dir = tempfile.TemporaryDirectory()
 
     def tearDown(self):
@@ -158,17 +165,15 @@ def train_dygraph(self):
     def test_mnist_to_static(self):
         dygraph_loss = self.train_dygraph()
         static_loss = self.train_static()
-        self.assertTrue(
-            np.allclose(dygraph_loss, static_loss),
-            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
-                                                            static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                        msg='dygraph is {}\n static_res is \n{}'.format(
+                            dygraph_loss, static_loss))
         with _test_eager_guard():
             dygraph_loss = self.train_dygraph()
             static_loss = self.train_static()
-            self.assertTrue(
-                np.allclose(dygraph_loss, static_loss),
-                msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
-                                                                static_loss))
+            self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                            msg='dygraph is {}\n static_res is \n{}'.format(
+                                dygraph_loss, static_loss))
 
     def test_mnist_declarative_cpu_vs_mkldnn(self):
         dygraph_loss_cpu = self.train_dygraph()
@@ -177,10 +182,9 @@ def test_mnist_declarative_cpu_vs_mkldnn(self):
             dygraph_loss_mkldnn = self.train_dygraph()
         finally:
             fluid.set_flags({'FLAGS_use_mkldnn': False})
-        self.assertTrue(
-            np.allclose(dygraph_loss_cpu, dygraph_loss_mkldnn),
-            msg='cpu dygraph is {}\n mkldnn dygraph is \n{}'.format(
-                dygraph_loss_cpu, dygraph_loss_mkldnn))
+        self.assertTrue(np.allclose(dygraph_loss_cpu, dygraph_loss_mkldnn),
+                        msg='cpu dygraph is {}\n mkldnn dygraph is \n{}'.format(
+                            dygraph_loss_cpu, dygraph_loss_mkldnn))
 
     def train(self, to_static=False):
 
@@ -191,17 +195,17 @@ def train(self, to_static=False):
             mnist = MNIST()
             if to_static:
                 mnist = paddle.jit.to_static(mnist)
-            adam = AdamOptimizer(
-                learning_rate=0.001, parameter_list=mnist.parameters())
+            adam = AdamOptimizer(learning_rate=0.001,
+                                 parameter_list=mnist.parameters())
 
             for epoch in range(self.epoch_num):
                 start = time()
                 for batch_id, data in enumerate(self.train_reader()):
-                    dy_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                    dy_x_data = np.array([
+                        x[0].reshape(1, 28, 28) for x in data
+                    ]).astype('float32')
+                    y_data = np.array([x[1] for x in data
+                                       ]).astype('int64').reshape(-1, 1)
 
                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
@@ -217,9 +221,9 @@ def train(self, to_static=False):
                     if batch_id % 10 == 0:
                         print(
                             "Loss at epoch {} step {}: loss: {:}, acc: {}, cost: {}"
-                            .format(epoch, batch_id,
-                                    avg_loss.numpy(),
-                                    acc.numpy(), time() - start))
+                            .format(epoch, batch_id, avg_loss.numpy(),
+                                    acc.numpy(),
+                                    time() - start))
                         start = time()
                     if batch_id == 50:
                         mnist.eval()
@@ -239,11 +243,10 @@ def check_jit_save_load(self, model, inputs, input_spec, to_static, gt_out):
             model_save_prefix = os.path.join(model_save_dir, 'mnist')
             model_filename = "mnist" + INFER_MODEL_SUFFIX
             params_filename = "mnist" + INFER_PARAMS_SUFFIX
-            fluid.dygraph.jit.save(
-                layer=model,
-                path=model_save_prefix,
-                input_spec=input_spec,
-                output_spec=[gt_out])
+            fluid.dygraph.jit.save(layer=model,
+                                   path=model_save_prefix,
+                                   input_spec=input_spec,
+                                   output_spec=[gt_out])
             # load in static mode
             static_infer_out = self.jit_load_and_run_inference_static(
                 model_save_dir, model_filename, params_filename, inputs)
@@ -262,12 +265,11 @@ def jit_load_and_run_inference_static(self, model_path, model_filename,
                                           params_filename, inputs):
         paddle.enable_static()
         exe = fluid.Executor(self.place)
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             dirname=model_path,
-             executor=exe,
-             model_filename=model_filename,
-             params_filename=params_filename)
+        [inference_program, feed_target_names, fetch_targets
+         ] = fluid.io.load_inference_model(dirname=model_path,
+                                           executor=exe,
+                                           model_filename=model_filename,
+                                           params_filename=params_filename)
         assert len(inputs) == len(feed_target_names)
         results = exe.run(inference_program,
                           feed=dict(zip(feed_target_names, inputs)),
@@ -280,8 +282,9 @@ def jit_load_and_run_inference_dygraph(self, model_path, inputs):
         pred = infer_net(inputs[0])
         return pred.numpy()
 
-    def predictor_load_and_run_inference_analysis(
-            self, model_path, model_filename, params_filename, inputs):
+    def predictor_load_and_run_inference_analysis(self, model_path,
+                                                  model_filename,
+                                                  params_filename, inputs):
         output = PredictorTools(model_path, model_filename, params_filename,
                                 inputs)
         out = output()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py
index 573ce1678d514..ad4d64d4b9c41 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class TestAMP(TestMNIST):
+
     def train_static(self):
         return self.train(to_static=True)
 
@@ -37,11 +38,9 @@ def test_mnist_to_static(self):
         # NOTE(Aurelius84): In static AMP training, there is a grep_list but
         # dygraph AMP don't. It will bring the numbers of cast_op is different
         # and leads to loss has a bit diff.
-        self.assertTrue(
-            np.allclose(
-                dygraph_loss, static_loss, atol=1e-3),
-            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
-                                                            static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss, atol=1e-3),
+                        msg='dygraph is {}\n static_res is \n{}'.format(
+                            dygraph_loss, static_loss))
 
     def train(self, to_static=False):
         paddle.seed(SEED)
@@ -51,8 +50,8 @@ def train(self, to_static=False):
             print("Successfully to apply @to_static.")
             mnist = paddle.jit.to_static(mnist)
 
-        adam = AdamOptimizer(
-            learning_rate=0.001, parameter_list=mnist.parameters())
+        adam = AdamOptimizer(learning_rate=0.001,
+                             parameter_list=mnist.parameters())
 
         scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
 
@@ -62,8 +61,8 @@ def train(self, to_static=False):
             for batch_id, data in enumerate(self.train_reader()):
                 dy_x_data = np.array([x[0].reshape(1, 28, 28)
                                       for x in data]).astype('float32')
-                y_data = np.array(
-                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(-1, 1)
 
                 img = paddle.to_tensor(dy_x_data)
                 label = paddle.to_tensor(y_data)
@@ -82,8 +81,8 @@ def train(self, to_static=False):
                 if batch_id % 10 == 0:
                     print(
                         "Loss at epoch {} step {}: loss: {:}, acc: {}, cost: {}"
-                        .format(epoch, batch_id,
-                                avg_loss.numpy(), acc.numpy(), time() - start))
+                        .format(epoch, batch_id, avg_loss.numpy(), acc.numpy(),
+                                time() - start))
                     start = time()
                 if batch_id == 50:
                     break
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py
index 10ba073f63e19..d54231d2c4659 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,6 +26,7 @@
 
 
 class TestPureFP16(TestMNIST):
+
     def train_static(self):
         return self.train(to_static=True)
 
@@ -37,11 +38,9 @@ def test_mnist_to_static(self):
             dygraph_loss = self.train_dygraph()
             static_loss = self.train_static()
             # NOTE: In pure fp16 training, loss is not stable, so we enlarge atol here.
-            self.assertTrue(
-                np.allclose(
-                    dygraph_loss, static_loss, atol=1e-3),
-                msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
-                                                                static_loss))
+            self.assertTrue(np.allclose(dygraph_loss, static_loss, atol=1e-3),
+                            msg='dygraph is {}\n static_res is \n{}'.format(
+                                dygraph_loss, static_loss))
 
     def train(self, to_static=False):
         np.random.seed(SEED)
@@ -58,16 +57,15 @@ def train(self, to_static=False):
             build_strategy.enable_inplace = False
             mnist = paddle.jit.to_static(mnist, build_strategy=build_strategy)
 
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=mnist.parameters())
+        optimizer = paddle.optimizer.Adam(learning_rate=0.001,
+                                          parameters=mnist.parameters())
 
         scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
 
-        mnist, optimizer = paddle.amp.decorate(
-            models=mnist,
-            optimizers=optimizer,
-            level='O2',
-            save_dtype='float32')
+        mnist, optimizer = paddle.amp.decorate(models=mnist,
+                                               optimizers=optimizer,
+                                               level='O2',
+                                               save_dtype='float32')
 
         loss_data = []
         for epoch in range(self.epoch_num):
@@ -75,18 +73,17 @@ def train(self, to_static=False):
             for batch_id, data in enumerate(self.train_reader()):
                 dy_x_data = np.array([x[0].reshape(1, 28, 28)
                                       for x in data]).astype('float32')
-                y_data = np.array(
-                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(-1, 1)
 
                 img = paddle.to_tensor(dy_x_data)
                 label = paddle.to_tensor(y_data)
                 label.stop_gradient = True
 
-                with paddle.amp.auto_cast(
-                        enable=True,
-                        custom_white_list=None,
-                        custom_black_list=None,
-                        level='O2'):
+                with paddle.amp.auto_cast(enable=True,
+                                          custom_white_list=None,
+                                          custom_black_list=None,
+                                          level='O2'):
                     prediction, acc, avg_loss = mnist(img, label=label)
 
                 scaled = scaler.scale(avg_loss)
@@ -99,8 +96,8 @@ def train(self, to_static=False):
                 if batch_id % 2 == 0:
                     print(
                         "Loss at epoch {} step {}: loss: {:}, acc: {}, cost: {}"
-                        .format(epoch, batch_id,
-                                avg_loss.numpy(), acc.numpy(), time() - start))
+                        .format(epoch, batch_id, avg_loss.numpy(), acc.numpy(),
+                                time() - start))
                     start = time()
                 if batch_id == 10:
                     break
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index 7b98ced95e22c..18694f6cdec58 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -39,6 +39,7 @@
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  filter_size,
@@ -52,18 +53,18 @@ def __init__(self,
                  name=None):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            groups=num_groups,
-            act=None,
-            use_cudnn=use_cudnn,
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=self.full_name() + "_weights"),
-            bias_attr=False)
+        self._conv = Conv2D(num_channels=num_channels,
+                            num_filters=num_filters,
+                            filter_size=filter_size,
+                            stride=stride,
+                            padding=padding,
+                            groups=num_groups,
+                            act=None,
+                            use_cudnn=use_cudnn,
+                            param_attr=ParamAttr(initializer=MSRA(),
+                                                 name=self.full_name() +
+                                                 "_weights"),
+                            bias_attr=False)
 
         self._batch_norm = BatchNorm(
             num_filters,
@@ -82,6 +83,7 @@ def forward(self, inputs, if_act=False):
 
 
 class DepthwiseSeparable(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters1,
@@ -92,14 +94,14 @@ def __init__(self,
                  name=None):
         super(DepthwiseSeparable, self).__init__()
 
-        self._depthwise_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=int(num_filters1 * scale),
-            filter_size=3,
-            stride=stride,
-            padding=1,
-            num_groups=int(num_groups * scale),
-            use_cudnn=True)
+        self._depthwise_conv = ConvBNLayer(num_channels=num_channels,
+                                           num_filters=int(num_filters1 *
+                                                           scale),
+                                           filter_size=3,
+                                           stride=stride,
+                                           padding=1,
+                                           num_groups=int(num_groups * scale),
+                                           use_cudnn=True)
 
         self._pointwise_conv = ConvBNLayer(
             num_channels=int(num_filters1 * scale),
@@ -115,127 +117,118 @@ def forward(self, inputs):
 
 
 class MobileNetV1(fluid.dygraph.Layer):
+
     def __init__(self, scale=1.0, class_dim=1000):
         super(MobileNetV1, self).__init__()
         self.scale = scale
         self.dwsl = []
 
-        self.conv1 = ConvBNLayer(
-            num_channels=3,
-            filter_size=3,
-            channels=3,
-            num_filters=int(32 * scale),
-            stride=2,
-            padding=1)
-
-        dws21 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                num_channels=int(32 * scale),
-                num_filters1=32,
-                num_filters2=64,
-                num_groups=32,
-                stride=1,
-                scale=scale),
-            name="conv2_1")
+        self.conv1 = ConvBNLayer(num_channels=3,
+                                 filter_size=3,
+                                 channels=3,
+                                 num_filters=int(32 * scale),
+                                 stride=2,
+                                 padding=1)
+
+        dws21 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            32 * scale),
+                                                              num_filters1=32,
+                                                              num_filters2=64,
+                                                              num_groups=32,
+                                                              stride=1,
+                                                              scale=scale),
+                                  name="conv2_1")
         self.dwsl.append(dws21)
 
-        dws22 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                num_channels=int(64 * scale),
-                num_filters1=64,
-                num_filters2=128,
-                num_groups=64,
-                stride=2,
-                scale=scale),
-            name="conv2_2")
+        dws22 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            64 * scale),
+                                                              num_filters1=64,
+                                                              num_filters2=128,
+                                                              num_groups=64,
+                                                              stride=2,
+                                                              scale=scale),
+                                  name="conv2_2")
         self.dwsl.append(dws22)
 
-        dws31 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                num_channels=int(128 * scale),
-                num_filters1=128,
-                num_filters2=128,
-                num_groups=128,
-                stride=1,
-                scale=scale),
-            name="conv3_1")
+        dws31 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            128 * scale),
+                                                              num_filters1=128,
+                                                              num_filters2=128,
+                                                              num_groups=128,
+                                                              stride=1,
+                                                              scale=scale),
+                                  name="conv3_1")
         self.dwsl.append(dws31)
 
-        dws32 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                num_channels=int(128 * scale),
-                num_filters1=128,
-                num_filters2=256,
-                num_groups=128,
-                stride=2,
-                scale=scale),
-            name="conv3_2")
+        dws32 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            128 * scale),
+                                                              num_filters1=128,
+                                                              num_filters2=256,
+                                                              num_groups=128,
+                                                              stride=2,
+                                                              scale=scale),
+                                  name="conv3_2")
         self.dwsl.append(dws32)
 
-        dws41 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                num_channels=int(256 * scale),
-                num_filters1=256,
-                num_filters2=256,
-                num_groups=256,
-                stride=1,
-                scale=scale),
-            name="conv4_1")
+        dws41 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            256 * scale),
+                                                              num_filters1=256,
+                                                              num_filters2=256,
+                                                              num_groups=256,
+                                                              stride=1,
+                                                              scale=scale),
+                                  name="conv4_1")
         self.dwsl.append(dws41)
 
-        dws42 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                num_channels=int(256 * scale),
-                num_filters1=256,
-                num_filters2=512,
-                num_groups=256,
-                stride=2,
-                scale=scale),
-            name="conv4_2")
+        dws42 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            256 * scale),
+                                                              num_filters1=256,
+                                                              num_filters2=512,
+                                                              num_groups=256,
+                                                              stride=2,
+                                                              scale=scale),
+                                  name="conv4_2")
         self.dwsl.append(dws42)
 
         for i in range(5):
-            tmp = self.add_sublayer(
-                sublayer=DepthwiseSeparable(
-                    num_channels=int(512 * scale),
-                    num_filters1=512,
-                    num_filters2=512,
-                    num_groups=512,
-                    stride=1,
-                    scale=scale),
-                name="conv5_" + str(i + 1))
-            self.dwsl.append(tmp)
-
-        dws56 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
+            tmp = self.add_sublayer(sublayer=DepthwiseSeparable(
                 num_channels=int(512 * scale),
                 num_filters1=512,
-                num_filters2=1024,
+                num_filters2=512,
                 num_groups=512,
-                stride=2,
+                stride=1,
                 scale=scale),
-            name="conv5_6")
+                                    name="conv5_" + str(i + 1))
+            self.dwsl.append(tmp)
+
+        dws56 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            512 * scale),
+                                                              num_filters1=512,
+                                                              num_filters2=1024,
+                                                              num_groups=512,
+                                                              stride=2,
+                                                              scale=scale),
+                                  name="conv5_6")
         self.dwsl.append(dws56)
 
-        dws6 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                num_channels=int(1024 * scale),
-                num_filters1=1024,
-                num_filters2=1024,
-                num_groups=1024,
-                stride=1,
-                scale=scale),
-            name="conv6")
+        dws6 = self.add_sublayer(sublayer=DepthwiseSeparable(num_channels=int(
+            1024 * scale),
+                                                             num_filters1=1024,
+                                                             num_filters2=1024,
+                                                             num_groups=1024,
+                                                             stride=1,
+                                                             scale=scale),
+                                 name="conv6")
         self.dwsl.append(dws6)
 
         self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
 
-        self.out = Linear(
-            int(1024 * scale),
-            class_dim,
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=self.full_name() + "fc7_weights"),
-            bias_attr=ParamAttr(name="fc7_offset"))
+        self.out = Linear(int(1024 * scale),
+                          class_dim,
+                          param_attr=ParamAttr(initializer=MSRA(),
+                                               name=self.full_name() +
+                                               "fc7_weights"),
+                          bias_attr=ParamAttr(name="fc7_offset"))
 
     @declarative
     def forward(self, inputs):
@@ -249,44 +242,43 @@ def forward(self, inputs):
 
 
 class InvertedResidualUnit(fluid.dygraph.Layer):
+
     def __init__(
-            self,
-            num_channels,
-            num_in_filter,
-            num_filters,
-            stride,
-            filter_size,
-            padding,
-            expansion_factor, ):
+        self,
+        num_channels,
+        num_in_filter,
+        num_filters,
+        stride,
+        filter_size,
+        padding,
+        expansion_factor,
+    ):
         super(InvertedResidualUnit, self).__init__()
         num_expfilter = int(round(num_in_filter * expansion_factor))
-        self._expand_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_expfilter,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            act=None,
-            num_groups=1)
-
-        self._bottleneck_conv = ConvBNLayer(
-            num_channels=num_expfilter,
-            num_filters=num_expfilter,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            num_groups=num_expfilter,
-            act=None,
-            use_cudnn=True)
-
-        self._linear_conv = ConvBNLayer(
-            num_channels=num_expfilter,
-            num_filters=num_filters,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            act=None,
-            num_groups=1)
+        self._expand_conv = ConvBNLayer(num_channels=num_channels,
+                                        num_filters=num_expfilter,
+                                        filter_size=1,
+                                        stride=1,
+                                        padding=0,
+                                        act=None,
+                                        num_groups=1)
+
+        self._bottleneck_conv = ConvBNLayer(num_channels=num_expfilter,
+                                            num_filters=num_expfilter,
+                                            filter_size=filter_size,
+                                            stride=stride,
+                                            padding=padding,
+                                            num_groups=num_expfilter,
+                                            act=None,
+                                            use_cudnn=True)
+
+        self._linear_conv = ConvBNLayer(num_channels=num_expfilter,
+                                        num_filters=num_filters,
+                                        filter_size=1,
+                                        stride=1,
+                                        padding=0,
+                                        act=None,
+                                        num_groups=1)
 
     def forward(self, inputs, ifshortcut):
         y = self._expand_conv(inputs, if_act=True)
@@ -298,30 +290,29 @@ def forward(self, inputs, ifshortcut):
 
 
 class InvresiBlocks(fluid.dygraph.Layer):
+
     def __init__(self, in_c, t, c, n, s):
         super(InvresiBlocks, self).__init__()
 
-        self._first_block = InvertedResidualUnit(
-            num_channels=in_c,
-            num_in_filter=in_c,
-            num_filters=c,
-            stride=s,
-            filter_size=3,
-            padding=1,
-            expansion_factor=t)
+        self._first_block = InvertedResidualUnit(num_channels=in_c,
+                                                 num_in_filter=in_c,
+                                                 num_filters=c,
+                                                 stride=s,
+                                                 filter_size=3,
+                                                 padding=1,
+                                                 expansion_factor=t)
 
         self._inv_blocks = []
         for i in range(1, n):
-            tmp = self.add_sublayer(
-                sublayer=InvertedResidualUnit(
-                    num_channels=c,
-                    num_in_filter=c,
-                    num_filters=c,
-                    stride=1,
-                    filter_size=3,
-                    padding=1,
-                    expansion_factor=t),
-                name=self.full_name() + "_" + str(i + 1))
+            tmp = self.add_sublayer(sublayer=InvertedResidualUnit(
+                num_channels=c,
+                num_in_filter=c,
+                num_filters=c,
+                stride=1,
+                filter_size=3,
+                padding=1,
+                expansion_factor=t),
+                                    name=self.full_name() + "_" + str(i + 1))
             self._inv_blocks.append(tmp)
 
     def forward(self, inputs):
@@ -332,6 +323,7 @@ def forward(self, inputs):
 
 
 class MobileNetV2(fluid.dygraph.Layer):
+
     def __init__(self, class_dim=1000, scale=1.0):
         super(MobileNetV2, self).__init__()
         self.scale = scale
@@ -348,13 +340,12 @@ def __init__(self, class_dim=1000, scale=1.0):
         ]
 
         #1. conv1
-        self._conv1 = ConvBNLayer(
-            num_channels=3,
-            num_filters=int(32 * scale),
-            filter_size=3,
-            stride=2,
-            act=None,
-            padding=1)
+        self._conv1 = ConvBNLayer(num_channels=3,
+                                  num_filters=int(32 * scale),
+                                  filter_size=3,
+                                  stride=2,
+                                  act=None,
+                                  padding=1)
 
         #2. bottleneck sequences
         self._invl = []
@@ -363,33 +354,33 @@ def __init__(self, class_dim=1000, scale=1.0):
         for layer_setting in bottleneck_params_list:
             t, c, n, s = layer_setting
             i += 1
-            tmp = self.add_sublayer(
-                sublayer=InvresiBlocks(
-                    in_c=in_c, t=t, c=int(c * scale), n=n, s=s),
-                name='conv' + str(i))
+            tmp = self.add_sublayer(sublayer=InvresiBlocks(in_c=in_c,
+                                                           t=t,
+                                                           c=int(c * scale),
+                                                           n=n,
+                                                           s=s),
+                                    name='conv' + str(i))
             self._invl.append(tmp)
             in_c = int(c * scale)
 
         #3. last_conv
         self._out_c = int(1280 * scale) if scale > 1.0 else 1280
-        self._conv9 = ConvBNLayer(
-            num_channels=in_c,
-            num_filters=self._out_c,
-            filter_size=1,
-            stride=1,
-            act=None,
-            padding=0)
+        self._conv9 = ConvBNLayer(num_channels=in_c,
+                                  num_filters=self._out_c,
+                                  filter_size=1,
+                                  stride=1,
+                                  act=None,
+                                  padding=0)
 
         #4. pool
         self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
 
         #5. fc
         tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
-        self._fc = Linear(
-            self._out_c,
-            class_dim,
-            param_attr=tmp_param,
-            bias_attr=ParamAttr(name="fc10_offset"))
+        self._fc = Linear(self._out_c,
+                          class_dim,
+                          param_attr=tmp_param,
+                          bias_attr=ParamAttr(name="fc10_offset"))
 
     @declarative
     def forward(self, inputs):
@@ -439,8 +430,8 @@ class Args(object):
     class_dim = 50
     print_step = 1
     train_step = 10
-    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-    ) else fluid.CPUPlace()
+    place = fluid.CUDAPlace(
+        0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
     model_save_dir = None
     model_save_prefix = None
     model_filename = None
@@ -486,8 +477,8 @@ def train_mobilenet(args, to_static):
 
                 t_end = time.time()
                 softmax_out = fluid.layers.softmax(out, use_cudnn=False)
-                loss = fluid.layers.cross_entropy(
-                    input=softmax_out, label=label)
+                loss = fluid.layers.cross_entropy(input=softmax_out,
+                                                  label=label)
                 avg_loss = fluid.layers.mean(x=loss)
                 acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
@@ -523,12 +514,11 @@ def predict_static(args, data):
     exe = fluid.Executor(args.place)
     # load inference model
 
-    [inference_program, feed_target_names,
-     fetch_targets] = fluid.io.load_inference_model(
-         args.model_save_dir,
-         executor=exe,
-         model_filename=args.model_filename,
-         params_filename=args.params_filename)
+    [inference_program, feed_target_names, fetch_targets
+     ] = fluid.io.load_inference_model(args.model_save_dir,
+                                       executor=exe,
+                                       model_filename=args.model_filename,
+                                       params_filename=args.params_filename)
 
     pred_res = exe.run(inference_program,
                        feed={feed_target_names[0]: data},
@@ -571,6 +561,7 @@ def predict_analysis_inference(args, data):
 
 
 class TestMobileNet(unittest.TestCase):
+
     def setUp(self):
         self.args = Args()
         self.temp_dir = tempfile.TemporaryDirectory()
@@ -594,9 +585,8 @@ def train(self, model_name, to_static):
     def assert_same_loss(self, model_name):
         dy_out = self.train(model_name, to_static=False)
         st_out = self.train(model_name, to_static=True)
-        self.assertTrue(
-            np.allclose(dy_out, st_out),
-            msg="dy_out: {}, st_out: {}".format(dy_out, st_out))
+        self.assertTrue(np.allclose(dy_out, st_out),
+                        msg="dy_out: {}, st_out: {}".format(dy_out, st_out))
 
     def assert_same_predict(self, model_name):
         self.args.model = model_name
@@ -612,17 +602,15 @@ def assert_same_predict(self, model_name):
         st_pre = predict_static(self.args, image)
         dy_jit_pre = predict_dygraph_jit(self.args, image)
         predictor_pre = predict_analysis_inference(self.args, image)
-        self.assertTrue(
-            np.allclose(dy_pre, st_pre),
-            msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
-        self.assertTrue(
-            np.allclose(dy_jit_pre, st_pre),
-            msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
-        self.assertTrue(
-            np.allclose(
-                predictor_pre, st_pre, atol=1e-5),
-            msg="inference_pred_res:\n {}\n, st_pre: \n{}.".format(
-                predictor_pre, st_pre))
+        self.assertTrue(np.allclose(dy_pre, st_pre),
+                        msg="dy_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_pre, st_pre))
+        self.assertTrue(np.allclose(dy_jit_pre, st_pre),
+                        msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_jit_pre, st_pre))
+        self.assertTrue(np.allclose(predictor_pre, st_pre, atol=1e-5),
+                        msg="inference_pred_res:\n {}\n, st_pre: \n{}.".format(
+                            predictor_pre, st_pre))
 
     def test_mobile_net(self):
         # MobileNet-V1
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py
index a39b5d7cd1a44..bf8252b56eab1 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class MySub(paddle.nn.Layer):
+
     def __init__(self):
         super(MySub, self).__init__()
 
@@ -29,6 +30,7 @@ def forward(self, x, y, name=None):
 
 
 class NetWithOpAttr(paddle.nn.Layer):
+
     def __init__(self, in_num, out_num):
         super(NetWithOpAttr, self).__init__()
 
@@ -53,6 +55,7 @@ def with_cond(self, x):
 
 
 class CheckOpAttr(unittest.TestCase):
+
     def setUp(self):
         self.in_num = 16
         self.out_num = 16
@@ -88,8 +91,8 @@ def test_set_op_attrs(self):
         self.assertEqual(len(net.linear._forward_pre_hooks), 1)
         self.assertEqual(len(net.linear._forward_post_hooks), 1)
         # to_static
-        net = paddle.jit.to_static(
-            net, input_spec=[InputSpec.from_tensor(self.x)])
+        net = paddle.jit.to_static(net,
+                                   input_spec=[InputSpec.from_tensor(self.x)])
 
         # assert attrs have be set.
         self.check_op_attrs(net.forward.concrete_program.main_program)
@@ -103,8 +106,8 @@ def check_op_attrs(self, main_program):
             ops = cur_block.ops
             for op in ops:
                 if op.type not in self.infos: continue
-                for attr_name, expect_vals in six.iteritems(self.infos[
-                        op.type]):
+                for attr_name, expect_vals in six.iteritems(
+                        self.infos[op.type]):
                     op_vals = op.desc.attr(attr_name)
                     if not isinstance(expect_vals, list):
                         expect_vals = [expect_vals]
@@ -120,9 +123,8 @@ def check_op_attrs(self, main_program):
     def test_set_op_attrs_with_sub_block(self):
         net = NetWithOpAttr(self.in_num, self.out_num)
         # set attrs
-        net.linear._set_op_attrs({
-            "int_vals": [0, 0]
-        })  # test overwrite behavior
+        net.linear._set_op_attrs({"int_vals": [0,
+                                               0]})  # test overwrite behavior
         net.linear._set_op_attrs(self.fc_attrs)
         net.bn._set_op_attrs(self.bn_attrs)
         net.sub._set_op_attrs(self.sub_attrs)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py
index cd3c76412feac..1a4eca9592096 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_param_guard.py
@@ -20,6 +20,7 @@
 
 
 class NetWithParameterList(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(NetWithParameterList, self).__init__()
         weight = self.create_parameter([in_size, out_size])
@@ -35,6 +36,7 @@ def forward(self, x):
 
 
 class NetWithParameterListIter(NetWithParameterList):
+
     def __init__(self, in_size, out_size):
         super(NetWithParameterListIter, self).__init__(in_size, out_size)
 
@@ -49,6 +51,7 @@ def forward(self, x):
 
 
 class TestParameterList(unittest.TestCase):
+
     def setUp(self):
         self.seed = 2021
         self.iter_num = 5
@@ -77,28 +80,26 @@ def train(self, is_iter, to_static):
     def test_parameter_list(self):
         static_loss = self.train(False, to_static=True)
         dygraph_loss = self.train(False, to_static=False)
-        self.assertTrue(
-            np.allclose(dygraph_loss, static_loss),
-            msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss,
-                                                                   static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                        msg='dygraph result is {}\nstatic result is {}'.format(
+                            dygraph_loss, static_loss))
 
     def test_parameter_list_iter(self):
         static_loss = self.train(True, to_static=True)
         dygraph_loss = self.train(True, to_static=False)
-        self.assertTrue(
-            np.allclose(dygraph_loss, static_loss),
-            msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss,
-                                                                   static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                        msg='dygraph result is {}\nstatic result is {}'.format(
+                            dygraph_loss, static_loss))
 
 
 class NetWithRawParamList(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(NetWithRawParamList, self).__init__()
         weight = self.add_parameter('w',
                                     self.create_parameter([in_size, out_size]))
         bias = self.add_parameter(
-            'b', self.create_parameter(
-                [out_size], is_bias=True))
+            'b', self.create_parameter([out_size], is_bias=True))
         self.params = [weight]
         self.bias_dict = {'b': bias}
 
@@ -111,6 +112,7 @@ def forward(self, x):
 
 
 class TestRawParameterList(unittest.TestCase):
+
     def setUp(self):
         self.seed = 2021
         self.iter_num = 5
@@ -140,13 +142,13 @@ def train(self, to_static):
     def test_parameter_list(self):
         static_loss = self.train(to_static=True)
         dygraph_loss = self.train(to_static=False)
-        self.assertTrue(
-            np.allclose(dygraph_loss, static_loss),
-            msg='dygraph result is {}\nstatic result is {}'.format(dygraph_loss,
-                                                                   static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                        msg='dygraph result is {}\nstatic result is {}'.format(
+                            dygraph_loss, static_loss))
 
 
 class NetWithSubLayerParamList(paddle.nn.Layer):
+
     def __init__(self, sub_layer):
         super(NetWithSubLayerParamList, self).__init__()
         self.sub_layer = sub_layer
@@ -162,6 +164,7 @@ def forward(self, x):
 
 
 class TestSubLayerParameterList(TestRawParameterList):
+
     def init_net(self):
         fc = paddle.nn.Linear(10, 3)
         self.net = NetWithSubLayerParamList(fc)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
index 4f55dbd324c21..8549d03f7e27b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
@@ -55,6 +55,7 @@ def fake_data(shape):
 
 
 class TestWithNestedInput(unittest.TestCase):
+
     def setUp(self):
         self.x = None
         self.y = None
@@ -63,7 +64,8 @@ def fake_input(self):
         self.x = fake_data([10, 16])
         self.y = [
             fake_data([10, 16]), "preprocess_cmd", 64, {
-                'z': [fake_data([10, 12]), fake_data([10, 12])],
+                'z': [fake_data([10, 12]),
+                      fake_data([10, 12])],
                 'c': fake_data([10, 10]),
                 'd': {
                     'da': 12,
@@ -91,6 +93,7 @@ def test_nest(self):
 
 
 class TestWithNestedOutput(unittest.TestCase):
+
     def setUp(self):
         self.x = None
         self.y = None
@@ -126,6 +129,7 @@ def test_nest(self):
 
 
 class TestWithTrainAndEval(unittest.TestCase):
+
     def test_switch_eval_and_train(self):
         program_translator = ProgramTranslator()
 
@@ -155,6 +159,7 @@ def test_switch_eval_and_train(self):
 
 
 class TestWithNoGrad(unittest.TestCase):
+
     def test_with_no_grad(self):
         with fluid.dygraph.guard():
             linear_net = Linear()
@@ -170,6 +175,7 @@ def test_with_no_grad(self):
 
 
 class GPT2LMHeadModel(fluid.dygraph.Layer):
+
     def __init__(self):
         super(GPT2LMHeadModel, self).__init__()
         self.embedding0 = paddle.nn.Embedding(20, 16)
@@ -185,6 +191,7 @@ def forward(self, x):
 
 
 class TestPruneUnusedParamInProgram(unittest.TestCase):
+
     def test_prune(self):
         input_ids = np.array([[15, 11, 6, 3, 18, 13]]).astype("float32")
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_print.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_print.py
index aabfd3b2c48ff..ae773f36d6f30 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_print.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_print.py
@@ -154,10 +154,11 @@ def dyfunc_print_continue_vars(x):
 
 
 class TestPrintBase(unittest.TestCase):
+
     def setUp(self):
         self.input = numpy.ones(5).astype("int32")
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.set_test_func()
 
     def set_test_func(self):
@@ -177,6 +178,7 @@ def get_static_output(self):
 
 
 class TestPrintVariable(TestPrintBase):
+
     def set_test_func(self):
         self.dygraph_func = dyfunc_print_variable
 
@@ -186,31 +188,37 @@ def test_transformed_static_result(self):
 
 
 class TestPrintNdArray(TestPrintVariable):
+
     def set_test_func(self):
         self.dygraph_func = dyfunc_print_ndarray
 
 
 class TestPrintWithFormat(TestPrintVariable):
+
     def set_test_func(self):
         self.dygraph_func = dyfunc_print_with_format
 
 
 class TestPrintWithFormat2(TestPrintVariable):
+
     def set_test_func(self):
         self.dygraph_func = dyfunc_print_with_format2
 
 
 class TestPrintWithIfElse(TestPrintVariable):
+
     def set_test_func(self):
         self.dygraph_func = dyfunc_print_with_ifelse
 
 
 class TestPrintMultipleVar(TestPrintVariable):
+
     def set_test_func(self):
         self.dygraph_func = dyfunc_print_multi_vars
 
 
 class TestPrintContinueVar(TestPrintVariable):
+
     def set_test_func(self):
         self.dygraph_func = dyfunc_print_continue_vars
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
index 4e90c73baa944..b656a4dc5950e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
@@ -64,6 +64,7 @@ def get_source_code(func):
 
 
 class StaticCode1():
+
     def dyfunc_with_if_else(x_v, label=None):
         __return_value_init_0 = paddle.fluid.layers.fill_constant(
             shape=[1], dtype='float64', value=0.0, name='__return_value_init_0')
@@ -107,8 +108,8 @@ def false_fn_2(__return_value_0):
 
         __return_value_0 = _jst.convert_ifelse(
             _jst.convert_logical_not(__return_0), true_fn_2, false_fn_2,
-            (__return_0, __return_value_0,
-             x_v), (__return_value_0, ), (__return_value_0, ))
+            (__return_0, __return_value_0, x_v), (__return_value_0, ),
+            (__return_value_0, ))
         return __return_value_0
 
 
@@ -157,12 +158,13 @@ def false_fn_5(__return_value_1):
 
         __return_value_1 = _jst.convert_ifelse(
             _jst.convert_logical_not(__return_2), true_fn_5, false_fn_5,
-            (__return_2, __return_value_1,
-             x_v), (__return_value_1, ), (__return_value_1, ))
+            (__return_2, __return_value_1, x_v), (__return_value_1, ),
+            (__return_value_1, ))
         return __return_value_1
 
 
 class NetWithError(fluid.dygraph.layers.Layer):
+
     @declarative
     def forward(self, x):
         linear = fluid.dygraph.Linear(32, 64)
@@ -171,6 +173,7 @@ def forward(self, x):
 
 
 class TestDygraphToStaticCode(unittest.TestCase):
+
     def setUp(self):
         # set to print all string diff when assertEqual fails
         self.maxDiff = None
@@ -189,6 +192,7 @@ def test_program_translator(self):
 
 
 class TestEnableDeclarative(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randn(30, 10, 32).astype('float32')
         self.weight = np.random.randn(32, 64).astype('float32')
@@ -212,8 +216,9 @@ def test_enable_disable_get_output(self):
             dygraph_output = self.program_translator.get_output(
                 simple_func, self.x, self.weight)
             self.assertTrue(
-                np.allclose(
-                    static_output.numpy(), dygraph_output.numpy(), atol=1e-4))
+                np.allclose(static_output.numpy(),
+                            dygraph_output.numpy(),
+                            atol=1e-4))
 
     def test_enable_disable_get_func(self):
 
@@ -230,14 +235,14 @@ def test_enable_disable_get_func(self):
             self.assertTrue(callable(dygraph_func))
             dygraph_output = dygraph_func(self.x, self.weight)
             self.assertTrue(
-                isinstance(dygraph_output, (fluid.core.VarBase,
-                                            fluid.core.eager.Tensor)))
+                isinstance(dygraph_output,
+                           (fluid.core.VarBase, fluid.core.eager.Tensor)))
 
     def test_enable_disable_get_program(self):
 
         self.program_translator.enable(True)
-        static_output = self.program_translator.get_program(simple_func, self.x,
-                                                            self.weight)
+        static_output = self.program_translator.get_program(
+            simple_func, self.x, self.weight)
         self.assertTrue(isinstance(static_output, tuple))
         self.assertEqual(len(static_output), 4)
         self.assertTrue(isinstance(static_output[0], fluid.Program))
@@ -254,8 +259,8 @@ def test_enable_disable_get_program(self):
             dygraph_output = self.program_translator.get_program(
                 simple_func, self.x, self.weight)
             self.assertTrue(
-                isinstance(dygraph_output, (fluid.core.VarBase,
-                                            fluid.core.eager.Tensor)))
+                isinstance(dygraph_output,
+                           (fluid.core.VarBase, fluid.core.eager.Tensor)))
 
     def test_enable_disable_declarative(self):
 
@@ -267,11 +272,13 @@ def test_enable_disable_declarative(self):
         with fluid.dygraph.guard():
             dygraph_output = decorated_simple_func(self.x, self.weight)
             self.assertTrue(
-                np.allclose(
-                    static_output.numpy(), dygraph_output.numpy(), atol=1e-4))
+                np.allclose(static_output.numpy(),
+                            dygraph_output.numpy(),
+                            atol=1e-4))
 
 
 class Net(fluid.dygraph.layers.Layer):
+
     def __init__(self):
         super(Net, self).__init__()
 
@@ -280,6 +287,7 @@ def forward(self, x):
 
 
 class TestErrorWithInitFromStaticMode(unittest.TestCase):
+
     def setUp(self):
         self.program_translator = ProgramTranslator()
         self.x = np.random.randn(10, 32).astype('float32')
@@ -300,6 +308,7 @@ def test_raise_error(self):
 
 
 class SwitchModeNet(paddle.nn.Layer):
+
     def __init__(self):
         super(SwitchModeNet, self).__init__()
 
@@ -318,6 +327,7 @@ def switch_mode_funciton():
 
 
 class TestFunctionTrainEvalMode(unittest.TestCase):
+
     def test_switch_mode(self):
         paddle.disable_static()
         switch_mode_funciton.eval()
@@ -347,6 +357,7 @@ def test_raise_error(self):
 
 
 class TestRemoveCommentInDy2St(unittest.TestCase):
+
     def func_with_comment(self):
         # Comment1
         x = paddle.to_tensor([1, 2, 3])
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py
index 7b7ff66343a40..75f17e22e46d0 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py
@@ -34,6 +34,7 @@
 
 
 class SimpleLSTMRNN(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  num_steps,
@@ -94,8 +95,9 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                 gate_input = fluid.layers.matmul(x=nn, y=weight_1)
 
                 gate_input = fluid.layers.elementwise_add(gate_input, bias)
-                i, j, f, o = fluid.layers.split(
-                    gate_input, num_or_sections=4, dim=-1)
+                i, j, f, o = fluid.layers.split(gate_input,
+                                                num_or_sections=4,
+                                                dim=-1)
                 c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
                     i) * fluid.layers.tanh(j)
                 m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
@@ -124,6 +126,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
 
 
 class PtbModel(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -138,12 +141,11 @@ def __init__(self,
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        self.simple_lstm_rnn = SimpleLSTMRNN(
-            hidden_size,
-            num_steps,
-            num_layers=num_layers,
-            init_scale=init_scale,
-            dropout=dropout)
+        self.simple_lstm_rnn = SimpleLSTMRNN(hidden_size,
+                                             num_steps,
+                                             num_layers=num_layers,
+                                             init_scale=init_scale,
+                                             dropout=dropout)
         self.embedding = Embedding(
             size=[vocab_size, hidden_size],
             dtype='float32',
@@ -186,14 +188,15 @@ def forward(self, input, label, init_hidden, init_cell):
                 x_emb,
                 dropout_prob=self.dropout,
                 dropout_implementation='upscale_in_train')
-        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
-                                                               init_c)
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
+            x_emb, init_h, init_c)
 
         projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
         projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
 
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -220,16 +223,15 @@ def train(place):
     with fluid.dygraph.guard(place):
         paddle.seed(SEED)
         paddle.framework.random._manual_program_seed(SEED)
-        ptb_model = PtbModel(
-            hidden_size=hidden_size,
-            vocab_size=vocab_size,
-            num_layers=num_layers,
-            num_steps=num_steps,
-            init_scale=init_scale,
-            dropout=dropout)
+        ptb_model = PtbModel(hidden_size=hidden_size,
+                             vocab_size=vocab_size,
+                             num_layers=num_layers,
+                             num_steps=num_steps,
+                             init_scale=init_scale,
+                             dropout=dropout)
 
-        sgd = SGDOptimizer(
-            learning_rate=1e-3, parameter_list=ptb_model.parameters())
+        sgd = SGDOptimizer(learning_rate=1e-3,
+                           parameter_list=ptb_model.parameters())
 
         for epoch_id in range(max_epoch):
 
@@ -237,10 +239,10 @@ def train(place):
             iters = 0.0
             total_sample = 0
 
-            init_hidden_data = np.zeros(
-                (num_layers, batch_size, hidden_size), dtype='float32')
-            init_cell_data = np.zeros(
-                (num_layers, batch_size, hidden_size), dtype='float32')
+            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
+                                        dtype='float32')
+            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                      dtype='float32')
 
             init_hidden = to_variable(init_hidden_data)
             init_cell = to_variable(init_cell_data)
@@ -255,8 +257,8 @@ def train(place):
                 x = to_variable(x_data)
                 y = to_variable(y_data)
 
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 out_loss = dy_loss.numpy()
 
                 dy_loss.backward()
@@ -268,8 +270,9 @@ def train(place):
                 total_sample += 1
                 if step_id % PRINT_STEP == 0:
                     if step_id == 0:
-                        logging.info("epoch %d | step %d, loss %0.3f" % (
-                            epoch_id, step_id, total_loss / total_sample))
+                        logging.info(
+                            "epoch %d | step %d, loss %0.3f" %
+                            (epoch_id, step_id, total_loss / total_sample))
                         avg_batch_time = time.time()
                     else:
                         speed = PRINT_STEP / (time.time() - avg_batch_time)
@@ -293,6 +296,7 @@ def train_static(place):
 
 
 class TestPtb(unittest.TestCase):
+
     def setUp(self):
         self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
             else fluid.CPUPlace()
@@ -301,16 +305,15 @@ def test_check_result(self):
         loss_1, hidden_1, cell_1 = train_static(self.place)
         loss_2, hidden_2, cell_2 = train_dygraph(self.place)
 
-        self.assertTrue(
-            np.allclose(loss_1, loss_2),
-            msg="static loss: {} \ndygraph loss: {}".format(loss_1, loss_2))
-        self.assertTrue(
-            np.allclose(hidden_1, hidden_2),
-            msg="static hidden: {} \ndygraph acc1: {}".format(hidden_1,
-                                                              hidden_2))
-        self.assertTrue(
-            np.allclose(cell_1, cell_2),
-            msg="static cell: {} \ndygraph cell: {}".format(cell_1, cell_2))
+        self.assertTrue(np.allclose(loss_1, loss_2),
+                        msg="static loss: {} \ndygraph loss: {}".format(
+                            loss_1, loss_2))
+        self.assertTrue(np.allclose(hidden_1, hidden_2),
+                        msg="static hidden: {} \ndygraph acc1: {}".format(
+                            hidden_1, hidden_2))
+        self.assertTrue(np.allclose(cell_1, cell_2),
+                        msg="static cell: {} \ndygraph cell: {}".format(
+                            cell_1, cell_2))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py
index 0d45d7edb2742..5d0d488915d53 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py
@@ -28,6 +28,7 @@
 
 
 class SimpleLSTMRNN(paddle.nn.Layer):
+
     def __init__(self,
                  hidden_size,
                  num_steps,
@@ -86,8 +87,9 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                 gate_input = paddle.matmul(x=nn, y=weight_1)
 
                 gate_input = paddle.add(x=gate_input, y=bias)
-                i, j, f, o = paddle.split(
-                    x=gate_input, num_or_sections=4, axis=-1)
+                i, j, f, o = paddle.split(x=gate_input,
+                                          num_or_sections=4,
+                                          axis=-1)
                 c = pre_cell * paddle.nn.functional.sigmoid(
                     f) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
                 m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
@@ -116,6 +118,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
 
 
 class PtbModel(paddle.nn.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -130,20 +133,19 @@ def __init__(self,
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        self.simple_lstm_rnn = SimpleLSTMRNN(
-            hidden_size,
-            num_steps,
-            num_layers=num_layers,
-            init_scale=init_scale,
-            dropout=dropout)
+        self.simple_lstm_rnn = SimpleLSTMRNN(hidden_size,
+                                             num_steps,
+                                             num_layers=num_layers,
+                                             init_scale=init_scale,
+                                             dropout=dropout)
         self.embedding = paddle.fluid.dygraph.nn.Embedding(
             size=[vocab_size, hidden_size],
             dtype='float32',
             is_sparse=False,
             param_attr=paddle.ParamAttr(
                 name='embedding_para',
-                initializer=paddle.nn.initializer.Uniform(
-                    low=-init_scale, high=init_scale)))
+                initializer=paddle.nn.initializer.Uniform(low=-init_scale,
+                                                          high=init_scale)))
         self.softmax_weight = self.create_parameter(
             attr=paddle.ParamAttr(),
             shape=[self.hidden_size, self.vocab_size],
@@ -163,23 +165,23 @@ def build_once(self, input, label, init_hidden, init_cell):
     @paddle.jit.to_static
     def forward(self, input, label, init_hidden, init_cell):
 
-        init_h = paddle.reshape(
-            init_hidden, shape=[self.num_layers, -1, self.hidden_size])
+        init_h = paddle.reshape(init_hidden,
+                                shape=[self.num_layers, -1, self.hidden_size])
 
-        init_c = paddle.reshape(
-            init_cell, shape=[self.num_layers, -1, self.hidden_size])
+        init_c = paddle.reshape(init_cell,
+                                shape=[self.num_layers, -1, self.hidden_size])
 
         x_emb = self.embedding(input)
 
-        x_emb = paddle.reshape(
-            x_emb, shape=[-1, self.num_steps, self.hidden_size])
+        x_emb = paddle.reshape(x_emb,
+                               shape=[-1, self.num_steps, self.hidden_size])
         if self.dropout is not None and self.dropout > 0.0:
             x_emb = paddle.nn.functional.dropout(
                 x_emb,
                 dropout_prob=self.dropout,
                 dropout_implementation='upscale_in_train')
-        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
-                                                               init_c)
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
+            x_emb, init_h, init_c)
 
         projection = paddle.matmul(x=rnn_out, y=self.softmax_weight)
         projection = paddle.add(x=projection, y=self.softmax_bias)
@@ -212,13 +214,12 @@ def train(place):
     paddle.disable_static(place)
     paddle.seed(SEED)
     paddle.framework.random._manual_program_seed(SEED)
-    ptb_model = PtbModel(
-        hidden_size=hidden_size,
-        vocab_size=vocab_size,
-        num_layers=num_layers,
-        num_steps=num_steps,
-        init_scale=init_scale,
-        dropout=dropout)
+    ptb_model = PtbModel(hidden_size=hidden_size,
+                         vocab_size=vocab_size,
+                         num_layers=num_layers,
+                         num_steps=num_steps,
+                         init_scale=init_scale,
+                         dropout=dropout)
 
     sgd = paddle.optimizer.SGD(learning_rate=1e-3,
                                parameters=ptb_model.parameters())
@@ -229,15 +230,19 @@ def train(place):
         iters = 0.0
         total_sample = 0
 
-        init_hidden_data = np.zeros(
-            (num_layers, batch_size, hidden_size), dtype='float32')
-        init_cell_data = np.zeros(
-            (num_layers, batch_size, hidden_size), dtype='float32')
-
-        init_hidden = paddle.to_tensor(
-            data=init_hidden_data, dtype=None, place=None, stop_gradient=True)
-        init_cell = paddle.to_tensor(
-            data=init_cell_data, dtype=None, place=None, stop_gradient=True)
+        init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
+                                    dtype='float32')
+        init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                  dtype='float32')
+
+        init_hidden = paddle.to_tensor(data=init_hidden_data,
+                                       dtype=None,
+                                       place=None,
+                                       stop_gradient=True)
+        init_cell = paddle.to_tensor(data=init_cell_data,
+                                     dtype=None,
+                                     place=None,
+                                     stop_gradient=True)
         for step_id in range(batch_num):
             x_data = np.arange(12).reshape(4, 3).astype('int64')
             y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
@@ -246,10 +251,14 @@ def train(place):
             x_data = x_data.reshape((-1, num_steps, 1))
             y_data = y_data.reshape((-1, num_steps, 1))
 
-            x = paddle.to_tensor(
-                data=x_data, dtype=None, place=None, stop_gradient=True)
-            y = paddle.to_tensor(
-                data=y_data, dtype=None, place=None, stop_gradient=True)
+            x = paddle.to_tensor(data=x_data,
+                                 dtype=None,
+                                 place=None,
+                                 stop_gradient=True)
+            y = paddle.to_tensor(data=y_data,
+                                 dtype=None,
+                                 place=None,
+                                 stop_gradient=True)
 
             dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                         init_cell)
@@ -290,6 +299,7 @@ def train_static(place):
 
 
 class TestPtb(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
             else paddle.CPUPlace()
@@ -298,16 +308,15 @@ def test_check_result(self):
         loss_1, hidden_1, cell_1 = train_static(self.place)
         loss_2, hidden_2, cell_2 = train_dygraph(self.place)
 
-        self.assertTrue(
-            np.allclose(loss_1, loss_2),
-            msg="static loss: {} \ndygraph loss: {}".format(loss_1, loss_2))
-        self.assertTrue(
-            np.allclose(hidden_1, hidden_2),
-            msg="static hidden: {} \ndygraph acc1: {}".format(hidden_1,
-                                                              hidden_2))
-        self.assertTrue(
-            np.allclose(cell_1, cell_2),
-            msg="static cell: {} \ndygraph cell: {}".format(cell_1, cell_2))
+        self.assertTrue(np.allclose(loss_1, loss_2),
+                        msg="static loss: {} \ndygraph loss: {}".format(
+                            loss_1, loss_2))
+        self.assertTrue(np.allclose(hidden_1, hidden_2),
+                        msg="static hidden: {} \ndygraph acc1: {}".format(
+                            hidden_1, hidden_2))
+        self.assertTrue(np.allclose(cell_1, cell_2),
+                        msg="static cell: {} \ndygraph cell: {}".format(
+                            cell_1, cell_2))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
index 4ddca0c689e15..cc373f07e99ae 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
@@ -29,6 +29,7 @@
 
 
 class Policy(Layer):
+
     def __init__(self):
         super(Policy, self).__init__()
 
@@ -188,9 +189,9 @@ def finish_episode():
             running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
             if i_episode % args.log_interval == 0:
                 print(
-                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.
-                    format(i_episode, ep_reward, running_reward,
-                           loss.numpy()[0]))
+                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'
+                    .format(i_episode, ep_reward, running_reward,
+                            loss.numpy()[0]))
 
             if i_episode > args.train_step:
                 break
@@ -199,6 +200,7 @@ def finish_episode():
 
 
 class TestDeclarative(unittest.TestCase):
+
     def setUp(self):
         self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
             else fluid.CPUPlace()
@@ -207,9 +209,9 @@ def setUp(self):
     def test_train(self):
         st_out = train(self.args, self.place, to_static=True)
         dy_out = train(self.args, self.place, to_static=False)
-        self.assertTrue(
-            np.allclose(st_out, dy_out),
-            msg="dy_out:\n {}\n st_out:\n{}\n".format(dy_out, st_out))
+        self.assertTrue(np.allclose(st_out, dy_out),
+                        msg="dy_out:\n {}\n st_out:\n{}\n".format(
+                            dy_out, st_out))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
index 1a531c65bbf1e..553ad00a6d29b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
@@ -58,6 +58,7 @@ def optimizer_setting(parameter_list=None):
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -67,15 +68,14 @@ def __init__(self,
                  act=None):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=False)
+        self._conv = Conv2D(num_channels=num_channels,
+                            num_filters=num_filters,
+                            filter_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=groups,
+                            act=None,
+                            bias_attr=False)
 
         self._batch_norm = BatchNorm(num_filters, act=act)
 
@@ -87,32 +87,29 @@ def forward(self, inputs):
 
 
 class BottleneckBlock(fluid.dygraph.Layer):
+
     def __init__(self, num_channels, num_filters, stride, shortcut=True):
         super(BottleneckBlock, self).__init__()
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu')
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * 4,
-            filter_size=1,
-            act=None)
+        self.conv0 = ConvBNLayer(num_channels=num_channels,
+                                 num_filters=num_filters,
+                                 filter_size=1,
+                                 act='relu')
+        self.conv1 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters,
+                                 filter_size=3,
+                                 stride=stride,
+                                 act='relu')
+        self.conv2 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters * 4,
+                                 filter_size=1,
+                                 act=None)
 
         if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * 4,
-                filter_size=1,
-                stride=stride)
+            self.short = ConvBNLayer(num_channels=num_channels,
+                                     num_filters=num_filters * 4,
+                                     filter_size=1,
+                                     stride=stride)
 
         self.shortcut = shortcut
 
@@ -130,12 +127,13 @@ def forward(self, inputs):
 
         y = fluid.layers.elementwise_add(x=short, y=conv2)
 
-        layer_helper = fluid.layer_helper.LayerHelper(
-            self.full_name(), act='relu')
+        layer_helper = fluid.layer_helper.LayerHelper(self.full_name(),
+                                                      act='relu')
         return layer_helper.append_activation(y)
 
 
 class ResNet(fluid.dygraph.Layer):
+
     def __init__(self, layers=50, class_dim=102):
         super(ResNet, self).__init__()
 
@@ -153,10 +151,15 @@ def __init__(self, layers=50, class_dim=102):
         num_channels = [64, 256, 512, 1024]
         num_filters = [64, 128, 256, 512]
 
-        self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
-        self.pool2d_max = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+        self.conv = ConvBNLayer(num_channels=3,
+                                num_filters=64,
+                                filter_size=7,
+                                stride=2,
+                                act='relu')
+        self.pool2d_max = Pool2D(pool_size=3,
+                                 pool_stride=2,
+                                 pool_padding=1,
+                                 pool_type='max')
 
         self.bottleneck_block_list = []
         for block in range(len(depth)):
@@ -164,17 +167,17 @@ def __init__(self, layers=50, class_dim=102):
             for i in range(depth[block]):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        num_channels=num_channels[block]
-                        if i == 0 else num_filters[block] * 4,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        shortcut=shortcut))
+                    BottleneckBlock(num_channels=num_channels[block]
+                                    if i == 0 else num_filters[block] * 4,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    shortcut=shortcut))
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
-        self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg = Pool2D(pool_size=7,
+                                 pool_type='avg',
+                                 global_pooling=True)
 
         self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1
 
@@ -200,6 +203,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(reader):
+
     def __reader__():
         for item in reader():
             img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
@@ -210,6 +214,7 @@ def __reader__():
 
 
 class ResNetHelper:
+
     def __init__(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         self.model_save_dir = os.path.join(self.temp_dir.name, 'inference')
@@ -231,18 +236,18 @@ def train(self, to_static, build_strategy=None):
             paddle.seed(SEED)
             paddle.framework.random._manual_program_seed(SEED)
 
-            train_reader = paddle.batch(
-                reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
-                batch_size=batch_size,
-                drop_last=True)
-            data_loader = fluid.io.DataLoader.from_generator(
-                capacity=5, iterable=True)
+            train_reader = paddle.batch(reader_decorator(
+                paddle.dataset.flowers.train(use_xmap=False)),
+                                        batch_size=batch_size,
+                                        drop_last=True)
+            data_loader = fluid.io.DataLoader.from_generator(capacity=5,
+                                                             iterable=True)
             data_loader.set_sample_list_generator(train_reader)
 
             resnet = ResNet()
             if to_static:
-                resnet = paddle.jit.to_static(
-                    resnet, build_strategy=build_strategy)
+                resnet = paddle.jit.to_static(resnet,
+                                              build_strategy=build_strategy)
             optimizer = optimizer_setting(parameter_list=resnet.parameters())
 
             for epoch in range(epoch_num):
@@ -258,10 +263,12 @@ def train(self, to_static, build_strategy=None):
                     pred = resnet(img)
                     loss = fluid.layers.cross_entropy(input=pred, label=label)
                     avg_loss = fluid.layers.mean(x=loss)
-                    acc_top1 = fluid.layers.accuracy(
-                        input=pred, label=label, k=1)
-                    acc_top5 = fluid.layers.accuracy(
-                        input=pred, label=label, k=5)
+                    acc_top1 = fluid.layers.accuracy(input=pred,
+                                                     label=label,
+                                                     k=1)
+                    acc_top5 = fluid.layers.accuracy(input=pred,
+                                                     label=label,
+                                                     k=5)
 
                     avg_loss.backward()
                     optimizer.minimize(avg_loss)
@@ -308,12 +315,11 @@ def predict_dygraph(self, data):
     def predict_static(self, data):
         paddle.enable_static()
         exe = fluid.Executor(place)
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.model_save_dir,
-             executor=exe,
-             model_filename=self.model_filename,
-             params_filename=self.params_filename)
+        [inference_program, feed_target_names, fetch_targets
+         ] = fluid.io.load_inference_model(self.model_save_dir,
+                                           executor=exe,
+                                           model_filename=self.model_filename,
+                                           params_filename=self.params_filename)
 
         pred_res = exe.run(inference_program,
                            feed={feed_target_names[0]: data},
@@ -338,6 +344,7 @@ def predict_analysis_inference(self, data):
 
 
 class TestResnet(unittest.TestCase):
+
     def setUp(self):
         self.resnet_helper = ResNetHelper()
 
@@ -351,24 +358,22 @@ def verify_predict(self):
         st_pre = self.resnet_helper.predict_static(image)
         dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image)
         predictor_pre = self.resnet_helper.predict_analysis_inference(image)
-        self.assertTrue(
-            np.allclose(dy_pre, st_pre),
-            msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
-        self.assertTrue(
-            np.allclose(dy_jit_pre, st_pre),
-            msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
-        self.assertTrue(
-            np.allclose(predictor_pre, st_pre),
-            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
-                                                              st_pre))
+        self.assertTrue(np.allclose(dy_pre, st_pre),
+                        msg="dy_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_pre, st_pre))
+        self.assertTrue(np.allclose(dy_jit_pre, st_pre),
+                        msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_jit_pre, st_pre))
+        self.assertTrue(np.allclose(predictor_pre, st_pre),
+                        msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(
+                            predictor_pre, st_pre))
 
     def test_resnet(self):
         static_loss = self.train(to_static=True)
         dygraph_loss = self.train(to_static=False)
-        self.assertTrue(
-            np.allclose(static_loss, dygraph_loss),
-            msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
-                                                             dygraph_loss))
+        self.assertTrue(np.allclose(static_loss, dygraph_loss),
+                        msg="static_loss: {} \n dygraph_loss: {}".format(
+                            static_loss, dygraph_loss))
         self.verify_predict()
 
     def test_in_static_mode_mkldnn(self):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py
index 1d45e906cd378..cfdd7d9df51d0 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py
@@ -62,11 +62,10 @@ def train(to_static, build_strategy=None):
             for batch_id in range(100):
                 start_time = time.time()
                 img = paddle.to_tensor(
-                    np.random.random([batch_size, 3, 224, 224]).astype(
-                        'float32'))
+                    np.random.random([batch_size, 3, 224,
+                                      224]).astype('float32'))
                 label = paddle.to_tensor(
-                    np.random.randint(
-                        0, 100, [batch_size, 1], dtype='int64'))
+                    np.random.randint(0, 100, [batch_size, 1], dtype='int64'))
                 img.stop_gradient = True
                 label.stop_gradient = True
 
@@ -102,6 +101,7 @@ def train(to_static, build_strategy=None):
 
 
 class TestResnet(unittest.TestCase):
+
     def train(self, to_static):
         program_translator.enable(to_static)
         return train(to_static)
@@ -109,10 +109,9 @@ def train(self, to_static):
     def test_resnet(self):
         static_loss = self.train(to_static=True)
         dygraph_loss = self.train(to_static=False)
-        self.assertTrue(
-            np.allclose(static_loss, dygraph_loss),
-            msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
-                                                             dygraph_loss))
+        self.assertTrue(np.allclose(static_loss, dygraph_loss),
+                        msg="static_loss: {} \n dygraph_loss: {}".format(
+                            static_loss, dygraph_loss))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
index 49d114730e4ed..fa0460f5200b2 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
@@ -50,8 +50,10 @@ def train(to_static, build_strategy=None):
     optimizer = optimizer_setting(parameter_list=resnet.parameters())
     scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
 
-    resnet, optimizer = paddle.amp.decorate(
-        models=resnet, optimizers=optimizer, level='O2', save_dtype='float32')
+    resnet, optimizer = paddle.amp.decorate(models=resnet,
+                                            optimizers=optimizer,
+                                            level='O2',
+                                            save_dtype='float32')
 
     for epoch in range(epoch_num):
         loss_data = []
@@ -65,16 +67,14 @@ def train(to_static, build_strategy=None):
             img = paddle.to_tensor(
                 np.random.random([batch_size, 3, 224, 224]).astype('float32'))
             label = paddle.to_tensor(
-                np.random.randint(
-                    0, 100, [batch_size, 1], dtype='int64'))
+                np.random.randint(0, 100, [batch_size, 1], dtype='int64'))
             img.stop_gradient = True
             label.stop_gradient = True
 
-            with paddle.amp.auto_cast(
-                    enable=True,
-                    custom_white_list=None,
-                    custom_black_list=None,
-                    level='O2'):
+            with paddle.amp.auto_cast(enable=True,
+                                      custom_white_list=None,
+                                      custom_black_list=None,
+                                      level='O2'):
                 pred = resnet(img)
                 loss = fluid.layers.cross_entropy(input=pred, label=label)
             avg_loss = fluid.layers.mean(x=pred)
@@ -104,6 +104,7 @@ def train(to_static, build_strategy=None):
 
 
 class TestResnet(unittest.TestCase):
+
     def train(self, to_static):
         program_translator.enable(to_static)
         build_strategy = paddle.static.BuildStrategy()
@@ -117,11 +118,9 @@ def test_resnet(self):
             static_loss = self.train(to_static=True)
             dygraph_loss = self.train(to_static=False)
             # NOTE: In pure fp16 training, loss is not stable, so we enlarge atol here.
-            self.assertTrue(
-                np.allclose(
-                    static_loss, dygraph_loss, atol=1e-3),
-                msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
-                                                                 dygraph_loss))
+            self.assertTrue(np.allclose(static_loss, dygraph_loss, atol=1e-3),
+                            msg="static_loss: {} \n dygraph_loss: {}".format(
+                                static_loss, dygraph_loss))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py
index c79a86015eb4e..0832c5f523f68 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import os
+
 os.environ["FLAGS_enable_eager_mode"] = "0"
 import math
 import time
@@ -55,6 +56,7 @@ def optimizer_setting(parameter_list=None):
 
 
 class ConvBNLayer(paddle.nn.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -64,14 +66,13 @@ def __init__(self,
                  act=None):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = paddle.nn.Conv2D(
-            in_channels=num_channels,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            bias_attr=False)
+        self._conv = paddle.nn.Conv2D(in_channels=num_channels,
+                                      out_channels=num_filters,
+                                      kernel_size=filter_size,
+                                      stride=stride,
+                                      padding=(filter_size - 1) // 2,
+                                      groups=groups,
+                                      bias_attr=False)
 
         self._batch_norm = paddle.nn.BatchNorm(num_filters, act=act)
 
@@ -83,32 +84,29 @@ def forward(self, inputs):
 
 
 class BottleneckBlock(paddle.nn.Layer):
+
     def __init__(self, num_channels, num_filters, stride, shortcut=True):
         super(BottleneckBlock, self).__init__()
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu')
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * 4,
-            filter_size=1,
-            act=None)
+        self.conv0 = ConvBNLayer(num_channels=num_channels,
+                                 num_filters=num_filters,
+                                 filter_size=1,
+                                 act='relu')
+        self.conv1 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters,
+                                 filter_size=3,
+                                 stride=stride,
+                                 act='relu')
+        self.conv2 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters * 4,
+                                 filter_size=1,
+                                 act=None)
 
         if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * 4,
-                filter_size=1,
-                stride=stride)
+            self.short = ConvBNLayer(num_channels=num_channels,
+                                     num_filters=num_filters * 4,
+                                     filter_size=1,
+                                     stride=stride)
 
         self.shortcut = shortcut
 
@@ -126,12 +124,13 @@ def forward(self, inputs):
 
         y = paddle.add(x=short, y=conv2)
 
-        layer_helper = paddle.fluid.layer_helper.LayerHelper(
-            self.full_name(), act='relu')
+        layer_helper = paddle.fluid.layer_helper.LayerHelper(self.full_name(),
+                                                             act='relu')
         return layer_helper.append_activation(y)
 
 
 class ResNet(paddle.nn.Layer):
+
     def __init__(self, layers=50, class_dim=102):
         super(ResNet, self).__init__()
 
@@ -149,10 +148,15 @@ def __init__(self, layers=50, class_dim=102):
         num_channels = [64, 256, 512, 1024]
         num_filters = [64, 128, 256, 512]
 
-        self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
-        self.pool2d_max = paddle.fluid.dygraph.Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+        self.conv = ConvBNLayer(num_channels=3,
+                                num_filters=64,
+                                filter_size=7,
+                                stride=2,
+                                act='relu')
+        self.pool2d_max = paddle.fluid.dygraph.Pool2D(pool_size=3,
+                                                      pool_stride=2,
+                                                      pool_padding=1,
+                                                      pool_type='max')
 
         self.bottleneck_block_list = []
         for block in range(len(depth)):
@@ -160,17 +164,17 @@ def __init__(self, layers=50, class_dim=102):
             for i in range(depth[block]):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        num_channels=num_channels[block]
-                        if i == 0 else num_filters[block] * 4,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        shortcut=shortcut))
+                    BottleneckBlock(num_channels=num_channels[block]
+                                    if i == 0 else num_filters[block] * 4,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    shortcut=shortcut))
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
-        self.pool2d_avg = paddle.fluid.dygraph.Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg = paddle.fluid.dygraph.Pool2D(pool_size=7,
+                                                      pool_type='avg',
+                                                      global_pooling=True)
 
         self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1
 
@@ -197,6 +201,7 @@ def forward(self, inputs):
 
 
 def reader_decorator(reader):
+
     def __reader__():
         for item in reader():
             img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
@@ -207,6 +212,7 @@ def __reader__():
 
 
 class TestResnet(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
 
@@ -230,12 +236,12 @@ def do_train(self, to_static):
         paddle.seed(SEED)
         paddle.framework.random._manual_program_seed(SEED)
 
-        train_reader = paddle.batch(
-            reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
-            batch_size=batch_size,
-            drop_last=True)
-        data_loader = paddle.io.DataLoader.from_generator(
-            capacity=5, iterable=True)
+        train_reader = paddle.batch(reader_decorator(
+            paddle.dataset.flowers.train(use_xmap=False)),
+                                    batch_size=batch_size,
+                                    drop_last=True)
+        data_loader = paddle.io.DataLoader.from_generator(capacity=5,
+                                                          iterable=True)
         data_loader.set_sample_list_generator(train_reader)
 
         resnet = ResNet()
@@ -252,8 +258,8 @@ def do_train(self, to_static):
                 img, label = data
 
                 pred = resnet(img)
-                loss = paddle.nn.functional.cross_entropy(
-                    input=pred, label=label)
+                loss = paddle.nn.functional.cross_entropy(input=pred,
+                                                          label=label)
                 avg_loss = paddle.mean(x=loss)
                 acc_top1 = paddle.metric.accuracy(input=pred, label=label, k=1)
                 acc_top5 = paddle.metric.accuracy(input=pred, label=label, k=5)
@@ -296,8 +302,10 @@ def predict_dygraph(self, data):
         resnet.eval()
 
         pred_res = resnet(
-            paddle.to_tensor(
-                data=data, dtype=None, place=None, stop_gradient=True))
+            paddle.to_tensor(data=data,
+                             dtype=None,
+                             place=None,
+                             stop_gradient=True))
 
         ret = pred_res.numpy()
         paddle.enable_static()
@@ -345,24 +353,22 @@ def verify_predict(self):
         st_pre = self.predict_static(image)
         dy_jit_pre = self.predict_dygraph_jit(image)
         predictor_pre = self.predict_analysis_inference(image)
-        self.assertTrue(
-            np.allclose(dy_pre, st_pre),
-            msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
-        self.assertTrue(
-            np.allclose(dy_jit_pre, st_pre),
-            msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
-        self.assertTrue(
-            np.allclose(predictor_pre, st_pre),
-            msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre,
-                                                              st_pre))
+        self.assertTrue(np.allclose(dy_pre, st_pre),
+                        msg="dy_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_pre, st_pre))
+        self.assertTrue(np.allclose(dy_jit_pre, st_pre),
+                        msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_jit_pre, st_pre))
+        self.assertTrue(np.allclose(predictor_pre, st_pre),
+                        msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(
+                            predictor_pre, st_pre))
 
     def test_resnet(self):
         static_loss = self.train(to_static=True)
         dygraph_loss = self.train(to_static=False)
-        self.assertTrue(
-            np.allclose(static_loss, dygraph_loss),
-            msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
-                                                             dygraph_loss))
+        self.assertTrue(np.allclose(static_loss, dygraph_loss),
+                        msg="static_loss: {} \n dygraph_loss: {}".format(
+                            static_loss, dygraph_loss))
         self.verify_predict()
 
     def test_in_static_mode_mkldnn(self):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py
index 507133aba98e2..07e3fe518c2ce 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py
@@ -202,10 +202,11 @@ def test_return_without_paddle_cond(x):
 
 
 class TestReturnBase(unittest.TestCase):
+
     def setUp(self):
         self.input = np.ones((1)).astype('int32')
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.init_dygraph_func()
         self.program_translator = ProgramTranslator()
 
@@ -235,91 +236,106 @@ def test_transformed_static_result(self):
                         dygraph_res[i], static_res[i]))
 
         elif isinstance(dygraph_res, np.ndarray):
-            self.assertTrue(
-                np.allclose(dygraph_res, static_res),
-                msg='dygraph res is {}\nstatic_res is {}'.format(dygraph_res,
-                                                                 static_res))
+            self.assertTrue(np.allclose(dygraph_res, static_res),
+                            msg='dygraph res is {}\nstatic_res is {}'.format(
+                                dygraph_res, static_res))
         else:
             self.assertEqual(dygraph_res, static_res)
 
 
 class TestInsideFuncBase(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_inside_func_base
 
 
 class TestReturnIf(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_if
 
 
 class TestReturnIfElse(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_if_else
 
 
 class TestReturnInWhile(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_in_while
 
 
 class TestReturnInFor(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_in_for
 
 
 class TestRecursiveReturn(TestReturnBase):
+
     def init_dygraph_func(self):
         self.input = self.input.astype(np.float32)
         self.dygraph_func = test_recursive_return
 
 
 class TestReturnDifferentLengthIfBody(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_different_length_if_body
 
 
 class TestReturnDifferentLengthElse(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_different_length_else
 
 
 class TestNoReturn(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_no_return
 
 
 class TestReturnNone(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_none
 
 
 class TestReturnNoVariable(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_no_variable
 
 
 class TestReturnListOneValue(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_list_one_value
 
 
 class TestReturnListManyValue(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_list_many_values
 
 
 class TestReturnTupleOneValue(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_tuple_one_value
 
 
 class TestReturnTupleManyValue(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_tuple_many_values
 
 
 class TestReturnSpecial(TestReturnBase):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_return_without_paddle_cond
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
index 794aa17038cd6..6c8216dac55fa 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
@@ -30,12 +30,13 @@
 
 np.random.seed(SEED)
 
-place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
-)
+place = fluid.CUDAPlace(
+    0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 program_translator = ProgramTranslator()
 
 
 class SimpleFcLayer(fluid.dygraph.Layer):
+
     def __init__(self, fc_size):
         super(SimpleFcLayer, self).__init__()
         self._linear = fluid.dygraph.Linear(fc_size, fc_size)
@@ -49,6 +50,7 @@ def forward(self, x):
 
 
 class TestDyToStaticSaveInferenceModel(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
 
@@ -77,18 +79,19 @@ def test_save_inference_model(self):
                 self.temp_dir.name, "test_dy2stat_inference_in_guard/model")
             infer_model_dir = os.path.join(self.temp_dir.name,
                                            "test_dy2stat_inference_in_guard")
-            fluid.dygraph.jit.save(
-                layer=layer,
-                path=infer_model_prefix,
-                input_spec=[x],
-                output_spec=[pred])
+            fluid.dygraph.jit.save(layer=layer,
+                                   path=infer_model_prefix,
+                                   input_spec=[x],
+                                   output_spec=[pred])
             # Check the correctness of the inference
             dygraph_out, _ = layer(x)
         self.check_save_inference_model(layer, [x_data], dygraph_out.numpy())
-        self.check_save_inference_model(
-            layer, [x_data], dygraph_out.numpy(), fetch=[loss])
-        self.check_save_inference_model(
-            layer, [x_data], dygraph_out.numpy(), feed=[x])
+        self.check_save_inference_model(layer, [x_data],
+                                        dygraph_out.numpy(),
+                                        fetch=[loss])
+        self.check_save_inference_model(layer, [x_data],
+                                        dygraph_out.numpy(),
+                                        feed=[x])
 
     def check_save_inference_model(self,
                                    model,
@@ -105,11 +108,10 @@ def check_save_inference_model(self,
                                        "test_dy2stat_inference")
         model_filename = "model" + INFER_MODEL_SUFFIX
         params_filename = "model" + INFER_PARAMS_SUFFIX
-        fluid.dygraph.jit.save(
-            layer=model,
-            path=infer_model_prefix,
-            input_spec=feed if feed else None,
-            output_spec=fetch if fetch else None)
+        fluid.dygraph.jit.save(layer=model,
+                               path=infer_model_prefix,
+                               input_spec=feed if feed else None,
+                               output_spec=fetch if fetch else None)
         # Check the correctness of the inference
         infer_out = self.load_and_run_inference(infer_model_dir, model_filename,
                                                 params_filename, inputs)
@@ -119,12 +121,11 @@ def load_and_run_inference(self, model_path, model_filename,
                                params_filename, inputs):
         paddle.enable_static()
         exe = fluid.Executor(place)
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             dirname=model_path,
-             executor=exe,
-             model_filename=model_filename,
-             params_filename=params_filename)
+        [inference_program, feed_target_names, fetch_targets
+         ] = fluid.io.load_inference_model(dirname=model_path,
+                                           executor=exe,
+                                           model_filename=model_filename,
+                                           params_filename=params_filename)
         results = exe.run(inference_program,
                           feed=dict(zip(feed_target_names, inputs)),
                           fetch_list=fetch_targets)
@@ -133,6 +134,7 @@ def load_and_run_inference(self, model_path, model_filename,
 
 
 class TestPartialProgramRaiseError(unittest.TestCase):
+
     def test_param_type(self):
         program_translator = ProgramTranslator()
         program_translator.enable(True)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py
index c5677756f501d..cc75dcd949dad 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py
@@ -27,11 +27,12 @@
 
 np.random.seed(2020)
 
-place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
-)
+place = fluid.CUDAPlace(
+    0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
 
 class TestDyToStaticSaveLoad(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         self.model_path = os.path.join(self.temp_dir.name,
@@ -50,8 +51,8 @@ def test_save_load_same_result(self):
             program_translator.enable(True)
             x = fluid.dygraph.to_variable(x_data)
             net = Linear(32, 64)
-            adam = AdamOptimizer(
-                learning_rate=0.1, parameter_list=net.parameters())
+            adam = AdamOptimizer(learning_rate=0.1,
+                                 parameter_list=net.parameters())
 
             for i in range(batch_num):
                 static_out, static_loss = net(x)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
index 7ac1f40de99eb..965013adf5d8f 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
@@ -78,8 +78,9 @@ def optimizer_setting(params, parameter_list):
     lr = params["lr"]
     num_epochs = params["num_epochs"]
     optimizer = fluid.optimizer.Momentum(
-        learning_rate=fluid.layers.cosine_decay(
-            learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
+        learning_rate=fluid.layers.cosine_decay(learning_rate=lr,
+                                                step_each_epoch=step,
+                                                epochs=num_epochs),
         momentum=momentum_rate,
         regularization=fluid.regularizer.L2Decay(l2_decay),
         parameter_list=parameter_list)
@@ -88,6 +89,7 @@ def optimizer_setting(params, parameter_list):
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -97,15 +99,14 @@ def __init__(self,
                  act=None):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=False)
+        self._conv = Conv2D(num_channels=num_channels,
+                            num_filters=num_filters,
+                            filter_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=groups,
+                            act=None,
+                            bias_attr=False)
 
         self._batch_norm = BatchNorm(num_filters, act=act)
 
@@ -117,6 +118,7 @@ def forward(self, inputs):
 
 
 class SqueezeExcitation(fluid.dygraph.Layer):
+
     def __init__(self, num_channels, reduction_ratio):
 
         super(SqueezeExcitation, self).__init__()
@@ -147,6 +149,7 @@ def forward(self, input):
 
 
 class BottleneckBlock(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -156,33 +159,29 @@ def __init__(self,
                  shortcut=True):
         super(BottleneckBlock, self).__init__()
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act="relu")
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            groups=cardinality,
-            act="relu")
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * 2,
-            filter_size=1,
-            act=None)
-
-        self.scale = SqueezeExcitation(
-            num_channels=num_filters * 2, reduction_ratio=reduction_ratio)
+        self.conv0 = ConvBNLayer(num_channels=num_channels,
+                                 num_filters=num_filters,
+                                 filter_size=1,
+                                 act="relu")
+        self.conv1 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters,
+                                 filter_size=3,
+                                 stride=stride,
+                                 groups=cardinality,
+                                 act="relu")
+        self.conv2 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters * 2,
+                                 filter_size=1,
+                                 act=None)
+
+        self.scale = SqueezeExcitation(num_channels=num_filters * 2,
+                                       reduction_ratio=reduction_ratio)
 
         if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * 2,
-                filter_size=1,
-                stride=stride)
+            self.short = ConvBNLayer(num_channels=num_channels,
+                                     num_filters=num_filters * 2,
+                                     filter_size=1,
+                                     stride=stride)
 
         self.shortcut = shortcut
 
@@ -204,6 +203,7 @@ def forward(self, inputs):
 
 
 class SeResNeXt(fluid.dygraph.Layer):
+
     def __init__(self, layers=50, class_dim=102):
         super(SeResNeXt, self).__init__()
 
@@ -217,52 +217,53 @@ def __init__(self, layers=50, class_dim=102):
             reduction_ratio = 16
             depth = [3, 4, 6, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=7,
+                                     stride=2,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
         elif layers == 101:
             cardinality = 32
             reduction_ratio = 16
             depth = [3, 4, 23, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=7,
+                                     stride=2,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
         elif layers == 152:
             cardinality = 64
             reduction_ratio = 16
             depth = [3, 8, 36, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=3,
-                stride=2,
-                act='relu')
-            self.conv1 = ConvBNLayer(
-                num_channels=64,
-                num_filters=64,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            self.conv2 = ConvBNLayer(
-                num_channels=64,
-                num_filters=128,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=3,
+                                     stride=2,
+                                     act='relu')
+            self.conv1 = ConvBNLayer(num_channels=64,
+                                     num_filters=64,
+                                     filter_size=3,
+                                     stride=1,
+                                     act='relu')
+            self.conv2 = ConvBNLayer(num_channels=64,
+                                     num_filters=128,
+                                     filter_size=3,
+                                     stride=1,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
 
         self.bottleneck_block_list = []
         num_channels = 64
@@ -273,19 +274,19 @@ def __init__(self, layers=50, class_dim=102):
             for i in range(depth[block]):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        num_channels=num_channels,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        cardinality=cardinality,
-                        reduction_ratio=reduction_ratio,
-                        shortcut=shortcut))
+                    BottleneckBlock(num_channels=num_channels,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    cardinality=cardinality,
+                                    reduction_ratio=reduction_ratio,
+                                    shortcut=shortcut))
                 num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
-        self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg = Pool2D(pool_size=7,
+                                 pool_type='avg',
+                                 global_pooling=True)
         stdv = 1.0 / math.sqrt(2048 * 1.0)
 
         self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1
@@ -325,12 +326,12 @@ def forward(self, inputs, label):
 
 
 class TestSeResnet(unittest.TestCase):
+
     def setUp(self):
-        self.train_reader = paddle.batch(
-            paddle.dataset.flowers.train(
-                use_xmap=False, cycle=True),
-            batch_size=BATCH_SIZE,
-            drop_last=True)
+        self.train_reader = paddle.batch(paddle.dataset.flowers.train(
+            use_xmap=False, cycle=True),
+                                         batch_size=BATCH_SIZE,
+                                         drop_last=True)
         self.temp_dir = tempfile.TemporaryDirectory()
 
         self.model_save_dir = os.path.join(self.temp_dir.name, "inference")
@@ -365,12 +366,12 @@ def train(self, train_reader, to_static):
                 step_idx = 0
                 speed_list = []
                 for step_id, data in enumerate(train_reader()):
-                    dy_x_data = np.array(
-                        [x[0].reshape(3, 224, 224)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(
-                            BATCH_SIZE, 1)
+                    dy_x_data = np.array([
+                        x[0].reshape(3, 224, 224) for x in data
+                    ]).astype('float32')
+                    y_data = np.array([x[1]
+                                       for x in data]).astype('int64').reshape(
+                                           BATCH_SIZE, 1)
 
                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
@@ -406,10 +407,10 @@ def train(self, train_reader, to_static):
                     step_idx += 1
                     if step_idx == STEP_NUM:
                         if to_static:
-                            fluid.dygraph.jit.save(
-                                se_resnext,
-                                self.model_save_prefix, [img],
-                                output_spec=[pred])
+                            fluid.dygraph.jit.save(se_resnext,
+                                                   self.model_save_prefix,
+                                                   [img],
+                                                   output_spec=[pred])
                         else:
                             fluid.dygraph.save_dygraph(
                                 se_resnext.state_dict(),
@@ -439,12 +440,11 @@ def predict_dygraph(self, data):
     def predict_static(self, data):
         paddle.enable_static()
         exe = fluid.Executor(place)
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.model_save_dir,
-             executor=exe,
-             model_filename=self.model_filename,
-             params_filename=self.params_filename)
+        [inference_program, feed_target_names, fetch_targets
+         ] = fluid.io.load_inference_model(self.model_save_dir,
+                                           executor=exe,
+                                           model_filename=self.model_filename,
+                                           params_filename=self.params_filename)
 
         pred_res = exe.run(inference_program,
                            feed={feed_target_names[0]: data},
@@ -473,12 +473,12 @@ def verify_predict(self):
         st_pre = self.predict_static(image)
         dy_jit_pre = self.predict_dygraph_jit(image)
         predictor_pre = self.predict_analysis_inference(image)
-        self.assertTrue(
-            np.allclose(dy_pre, st_pre),
-            msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
-        self.assertTrue(
-            np.allclose(dy_jit_pre, st_pre),
-            msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre))
+        self.assertTrue(np.allclose(dy_pre, st_pre),
+                        msg="dy_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_pre, st_pre))
+        self.assertTrue(np.allclose(dy_jit_pre, st_pre),
+                        msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(
+                            dy_jit_pre, st_pre))
 
         flat_st_pre = st_pre.flatten()
         flat_predictor_pre = np.array(predictor_pre).flatten()
@@ -492,23 +492,23 @@ def verify_predict(self):
                     flat_predictor_pre[i], flat_st_pre[i]))
 
     def test_check_result(self):
-        pred_1, loss_1, acc1_1, acc5_1 = self.train(
-            self.train_reader, to_static=False)
-        pred_2, loss_2, acc1_2, acc5_2 = self.train(
-            self.train_reader, to_static=True)
-
-        self.assertTrue(
-            np.allclose(pred_1, pred_2),
-            msg="static pred: {} \ndygraph pred: {}".format(pred_1, pred_2))
-        self.assertTrue(
-            np.allclose(loss_1, loss_2),
-            msg="static loss: {} \ndygraph loss: {}".format(loss_1, loss_2))
-        self.assertTrue(
-            np.allclose(acc1_1, acc1_2),
-            msg="static acc1: {} \ndygraph acc1: {}".format(acc1_1, acc1_2))
-        self.assertTrue(
-            np.allclose(acc5_1, acc5_2),
-            msg="static acc5: {} \ndygraph acc5: {}".format(acc5_1, acc5_2))
+        pred_1, loss_1, acc1_1, acc5_1 = self.train(self.train_reader,
+                                                    to_static=False)
+        pred_2, loss_2, acc1_2, acc5_2 = self.train(self.train_reader,
+                                                    to_static=True)
+
+        self.assertTrue(np.allclose(pred_1, pred_2),
+                        msg="static pred: {} \ndygraph pred: {}".format(
+                            pred_1, pred_2))
+        self.assertTrue(np.allclose(loss_1, loss_2),
+                        msg="static loss: {} \ndygraph loss: {}".format(
+                            loss_1, loss_2))
+        self.assertTrue(np.allclose(acc1_1, acc1_2),
+                        msg="static acc1: {} \ndygraph acc1: {}".format(
+                            acc1_1, acc1_2))
+        self.assertTrue(np.allclose(acc5_1, acc5_2),
+                        msg="static acc5: {} \ndygraph acc5: {}".format(
+                            acc5_1, acc5_2))
 
         self.verify_predict()
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
index b72894fb14764..108c060fab868 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
@@ -33,6 +33,7 @@
 
 
 class SimpleConvPool(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -41,13 +42,12 @@ def __init__(self,
                  batch_size=None):
         super(SimpleConvPool, self).__init__()
         self.batch_size = batch_size
-        self._conv2d = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            padding=[1, 1],
-            use_cudnn=use_cudnn,
-            act='tanh')
+        self._conv2d = Conv2D(num_channels=num_channels,
+                              num_filters=num_filters,
+                              filter_size=filter_size,
+                              padding=[1, 1],
+                              use_cudnn=use_cudnn,
+                              act='tanh')
 
     def forward(self, inputs):
         x = self._conv2d(inputs)
@@ -57,6 +57,7 @@ def forward(self, inputs):
 
 
 class CNN(fluid.dygraph.Layer):
+
     def __init__(self, dict_dim, batch_size, seq_len):
         super(CNN, self).__init__()
         self.dict_dim = dict_dim
@@ -68,28 +69,25 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self.win_size = [3, self.hid_dim]
         self.batch_size = batch_size
         self.seq_len = seq_len
-        self.embedding = Embedding(
-            size=[self.dict_dim + 1, self.emb_dim],
-            dtype='float32',
-            is_sparse=False)
-        self._simple_conv_pool_1 = SimpleConvPool(
-            self.channels,
-            self.hid_dim,
-            self.win_size,
-            batch_size=self.batch_size)
-        self._fc1 = Linear(
-            input_dim=self.hid_dim * self.seq_len,
-            output_dim=self.fc_hid_dim,
-            act="softmax")
-        self._fc_prediction = Linear(
-            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
+        self.embedding = Embedding(size=[self.dict_dim + 1, self.emb_dim],
+                                   dtype='float32',
+                                   is_sparse=False)
+        self._simple_conv_pool_1 = SimpleConvPool(self.channels,
+                                                  self.hid_dim,
+                                                  self.win_size,
+                                                  batch_size=self.batch_size)
+        self._fc1 = Linear(input_dim=self.hid_dim * self.seq_len,
+                           output_dim=self.fc_hid_dim,
+                           act="softmax")
+        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
+                                     output_dim=self.class_dim,
+                                     act="softmax")
 
     @declarative
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
-        o_np_mask = (
-            fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
-                dtype='float32')
+        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) !=
+                     self.dict_dim).astype(dtype='float32')
         mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
         emb = emb * mask_emb
         emb = fluid.layers.reshape(
@@ -105,6 +103,7 @@ def forward(self, inputs, label=None):
 
 
 class BOW(fluid.dygraph.Layer):
+
     def __init__(self, dict_dim, batch_size, seq_len):
         super(BOW, self).__init__()
         self.dict_dim = dict_dim
@@ -114,23 +113,24 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self.class_dim = 2
         self.batch_size = batch_size
         self.seq_len = seq_len
-        self.embedding = Embedding(
-            size=[self.dict_dim + 1, self.emb_dim],
-            dtype='float32',
-            is_sparse=False)
-        self._fc1 = Linear(
-            input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
-        self._fc2 = Linear(
-            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(
-            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
+        self.embedding = Embedding(size=[self.dict_dim + 1, self.emb_dim],
+                                   dtype='float32',
+                                   is_sparse=False)
+        self._fc1 = Linear(input_dim=self.hid_dim,
+                           output_dim=self.hid_dim,
+                           act="tanh")
+        self._fc2 = Linear(input_dim=self.hid_dim,
+                           output_dim=self.fc_hid_dim,
+                           act="tanh")
+        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
+                                     output_dim=self.class_dim,
+                                     act="softmax")
 
     @declarative
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
-        o_np_mask = (
-            fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
-                dtype='float32')
+        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) !=
+                     self.dict_dim).astype(dtype='float32')
         mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
         emb = emb * mask_emb
         emb = fluid.layers.reshape(emb, shape=[-1, self.seq_len, self.hid_dim])
@@ -147,6 +147,7 @@ def forward(self, inputs, label=None):
 
 
 class GRU(fluid.dygraph.Layer):
+
     def __init__(self, dict_dim, batch_size, seq_len):
         super(GRU, self).__init__()
         self.dict_dim = dict_dim
@@ -156,29 +157,30 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self.class_dim = 2
         self.batch_size = batch_size
         self.seq_len = seq_len
-        self.embedding = Embedding(
-            size=[self.dict_dim + 1, self.emb_dim],
-            dtype='float32',
-            param_attr=fluid.ParamAttr(learning_rate=30),
-            is_sparse=False)
+        self.embedding = Embedding(size=[self.dict_dim + 1, self.emb_dim],
+                                   dtype='float32',
+                                   param_attr=fluid.ParamAttr(learning_rate=30),
+                                   is_sparse=False)
         h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
         h_0 = to_variable(h_0)
         self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
-        self._fc2 = Linear(
-            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(
-            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
+        self._fc2 = Linear(input_dim=self.hid_dim,
+                           output_dim=self.fc_hid_dim,
+                           act="tanh")
+        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
+                                     output_dim=self.class_dim,
+                                     act="softmax")
         self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0)
 
     @declarative
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
-        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim
-                     ).astype('float32')
+        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) !=
+                     self.dict_dim).astype('float32')
         mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
         emb = emb * mask_emb
-        emb = fluid.layers.reshape(
-            emb, shape=[self.batch_size, -1, self.hid_dim])
+        emb = fluid.layers.reshape(emb,
+                                   shape=[self.batch_size, -1, self.hid_dim])
         fc_1 = self._fc1(emb)
         gru_hidden = self._gru(fc_1)
         gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1)
@@ -193,6 +195,7 @@ def forward(self, inputs, label=None):
 
 
 class BiGRU(fluid.dygraph.Layer):
+
     def __init__(self, dict_dim, batch_size, seq_len):
         super(BiGRU, self).__init__()
         self.dict_dim = dict_dim
@@ -202,32 +205,35 @@ def __init__(self, dict_dim, batch_size, seq_len):
         self.class_dim = 2
         self.batch_size = batch_size
         self.seq_len = seq_len
-        self.embedding = Embedding(
-            size=[self.dict_dim + 1, self.emb_dim],
-            dtype='float32',
-            param_attr=fluid.ParamAttr(learning_rate=30),
-            is_sparse=False)
+        self.embedding = Embedding(size=[self.dict_dim + 1, self.emb_dim],
+                                   dtype='float32',
+                                   param_attr=fluid.ParamAttr(learning_rate=30),
+                                   is_sparse=False)
         h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
         h_0 = to_variable(h_0)
         self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
-        self._fc2 = Linear(
-            input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(
-            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
-        self._gru_forward = DynamicGRU(
-            size=self.hid_dim, h_0=h_0, is_reverse=False)
-        self._gru_backward = DynamicGRU(
-            size=self.hid_dim, h_0=h_0, is_reverse=True)
+        self._fc2 = Linear(input_dim=self.hid_dim * 2,
+                           output_dim=self.fc_hid_dim,
+                           act="tanh")
+        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
+                                     output_dim=self.class_dim,
+                                     act="softmax")
+        self._gru_forward = DynamicGRU(size=self.hid_dim,
+                                       h_0=h_0,
+                                       is_reverse=False)
+        self._gru_backward = DynamicGRU(size=self.hid_dim,
+                                        h_0=h_0,
+                                        is_reverse=True)
 
     @declarative
     def forward(self, inputs, label=None):
         emb = self.embedding(inputs)
-        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim
-                     ).astype('float32')
+        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) !=
+                     self.dict_dim).astype('float32')
         mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
         emb = emb * mask_emb
-        emb = fluid.layers.reshape(
-            emb, shape=[self.batch_size, -1, self.hid_dim])
+        emb = fluid.layers.reshape(emb,
+                                   shape=[self.batch_size, -1, self.hid_dim])
         fc_1 = self._fc1(emb)
         gru_forward = self._gru_forward(fc_1)
         gru_backward = self._gru_backward(fc_1)
@@ -258,8 +264,8 @@ def reader():
             seq_len = local_random.randint(padding_size // 2,
                                            int(padding_size * 1.2))
             word_ids = local_random.randint(0, vocab_size, [seq_len]).tolist()
-            word_ids = word_ids[:padding_size] + [vocab_size] * (padding_size -
-                                                                 seq_len)
+            word_ids = word_ids[:padding_size] + [vocab_size
+                                                  ] * (padding_size - seq_len)
             batch_data.append((word_ids, [label], seq_len))
             if len(batch_data) == batch_size:
                 yield batch_data
@@ -339,6 +345,7 @@ def train(args, to_static):
 
 
 class TestSentiment(unittest.TestCase):
+
     def setUp(self):
         self.args = Args()
 
@@ -346,9 +353,9 @@ def train_model(self, model_type='cnn_net'):
         self.args.model_type = model_type
         st_out = train(self.args, True)
         dy_out = train(self.args, False)
-        self.assertTrue(
-            np.allclose(dy_out, st_out),
-            msg="dy_out:\n {}\n st_out:\n {}".format(dy_out, st_out))
+        self.assertTrue(np.allclose(dy_out, st_out),
+                        msg="dy_out:\n {}\n st_out:\n {}".format(
+                            dy_out, st_out))
 
     def test_train(self):
         model_types = ['cnn_net', 'bow_net', 'gru_net', 'bigru_net']
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
index bc462ab8c95fa..7ed2d12f5a810 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
@@ -25,8 +25,9 @@
 from seq2seq_dygraph_model import BaseModel, AttentionModel
 from seq2seq_utils import Seq2SeqModelHyperParams
 from seq2seq_utils import get_data_iter
-place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
-)
+
+place = fluid.CUDAPlace(
+    0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 program_translator = ProgramTranslator()
 STEP_NUM = 10
 PRINT_STEP = 2
@@ -50,23 +51,21 @@ def train(args, attn_model=False):
         fluid.default_main_program().random_seed = 2020
 
         if attn_model:
-            model = AttentionModel(
-                args.hidden_size,
-                args.src_vocab_size,
-                args.tar_vocab_size,
-                args.batch_size,
-                num_layers=args.num_layers,
-                init_scale=args.init_scale,
-                dropout=args.dropout)
+            model = AttentionModel(args.hidden_size,
+                                   args.src_vocab_size,
+                                   args.tar_vocab_size,
+                                   args.batch_size,
+                                   num_layers=args.num_layers,
+                                   init_scale=args.init_scale,
+                                   dropout=args.dropout)
         else:
-            model = BaseModel(
-                args.hidden_size,
-                args.src_vocab_size,
-                args.tar_vocab_size,
-                args.batch_size,
-                num_layers=args.num_layers,
-                init_scale=args.init_scale,
-                dropout=args.dropout)
+            model = BaseModel(args.hidden_size,
+                              args.src_vocab_size,
+                              args.tar_vocab_size,
+                              args.batch_size,
+                              num_layers=args.num_layers,
+                              init_scale=args.init_scale,
+                              dropout=args.dropout)
 
         gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm)
         optimizer = fluid.optimizer.SGD(args.learning_rate,
@@ -122,27 +121,25 @@ def infer(args, attn_model=False):
     with fluid.dygraph.guard(place):
 
         if attn_model:
-            model = AttentionModel(
-                args.hidden_size,
-                args.src_vocab_size,
-                args.tar_vocab_size,
-                args.batch_size,
-                beam_size=args.beam_size,
-                num_layers=args.num_layers,
-                init_scale=args.init_scale,
-                dropout=0.0,
-                mode='beam_search')
+            model = AttentionModel(args.hidden_size,
+                                   args.src_vocab_size,
+                                   args.tar_vocab_size,
+                                   args.batch_size,
+                                   beam_size=args.beam_size,
+                                   num_layers=args.num_layers,
+                                   init_scale=args.init_scale,
+                                   dropout=0.0,
+                                   mode='beam_search')
         else:
-            model = BaseModel(
-                args.hidden_size,
-                args.src_vocab_size,
-                args.tar_vocab_size,
-                args.batch_size,
-                beam_size=args.beam_size,
-                num_layers=args.num_layers,
-                init_scale=args.init_scale,
-                dropout=0.0,
-                mode='beam_search')
+            model = BaseModel(args.hidden_size,
+                              args.src_vocab_size,
+                              args.tar_vocab_size,
+                              args.batch_size,
+                              beam_size=args.beam_size,
+                              num_layers=args.num_layers,
+                              init_scale=args.init_scale,
+                              dropout=0.0,
+                              mode='beam_search')
 
         model_path = args.attn_model_path if attn_model else args.base_model_path
         state_dict, _ = fluid.dygraph.load_dygraph(model_path)
@@ -161,6 +158,7 @@ def infer(args, attn_model=False):
 
 
 class TestSeq2seq(unittest.TestCase):
+
     def setUp(self):
         self.args = Seq2SeqModelHyperParams
         self.temp_dir = tempfile.TemporaryDirectory()
@@ -192,19 +190,17 @@ def _test_train(self, attn_model=False):
         dygraph_loss = self.run_dygraph(mode="train", attn_model=attn_model)
         static_loss = self.run_static(mode="train", attn_model=attn_model)
         result = np.allclose(dygraph_loss, static_loss)
-        self.assertTrue(
-            result,
-            msg="\ndygraph_loss = {} \nstatic_loss = {}".format(dygraph_loss,
-                                                                static_loss))
+        self.assertTrue(result,
+                        msg="\ndygraph_loss = {} \nstatic_loss = {}".format(
+                            dygraph_loss, static_loss))
 
     def _test_predict(self, attn_model=False):
         pred_dygraph = self.run_dygraph(mode="test", attn_model=attn_model)
         pred_static = self.run_static(mode="test", attn_model=attn_model)
         result = np.allclose(pred_static, pred_dygraph)
-        self.assertTrue(
-            result,
-            msg="\npred_dygraph = {} \npred_static = {}".format(pred_dygraph,
-                                                                pred_static))
+        self.assertTrue(result,
+                        msg="\npred_dygraph = {} \npred_static = {}".format(
+                            pred_dygraph, pred_static))
 
     def test_base_model(self):
         self._test_train(attn_model=False)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
index ce88ea74af23e..2bc344ae95a6b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
@@ -36,20 +36,22 @@ def create_conf_dict():
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=32,
-        help="Total examples' number in batch for training.")
-    parser.add_argument(
-        "--seq_len", type=int, default=32, help="The length of each sentence.")
-    parser.add_argument(
-        "--epoch", type=int, default=1, help="The number of training epoch.")
-    parser.add_argument(
-        "--fake_sample_size",
-        type=int,
-        default=128,
-        help="The number of samples of fake data.")
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=32,
+                        help="Total examples' number in batch for training.")
+    parser.add_argument("--seq_len",
+                        type=int,
+                        default=32,
+                        help="The length of each sentence.")
+    parser.add_argument("--epoch",
+                        type=int,
+                        default=1,
+                        help="The number of training epoch.")
+    parser.add_argument("--fake_sample_size",
+                        type=int,
+                        default=128,
+                        help="The number of samples of fake data.")
     args = parser.parse_args([])
     return args
 
@@ -70,6 +72,7 @@ def fake_vocabulary():
 
 
 class FakeReaderProcessor(object):
+
     def __init__(self, args, vocab):
         self.vocab = vocab
         self.seq_len = args.seq_len
@@ -83,6 +86,7 @@ def __init__(self, args, vocab):
                 np.array([query, pos_title, neg_title]).astype(np.int64))
 
     def get_reader(self, mode, epoch=0):
+
         def reader_with_pairwise():
             if mode == "train":
                 for i in range(self.sample_size):
@@ -133,11 +137,10 @@ def train(conf_dict, to_static):
             return_list=True,
             iterable=True,
             use_double_buffer=True)
-        get_train_examples = simnet_process.get_reader(
-            "train", epoch=args.epoch)
+        get_train_examples = simnet_process.get_reader("train",
+                                                       epoch=args.epoch)
         train_loader.set_sample_list_generator(
-            paddle.batch(
-                get_train_examples, batch_size=args.batch_size), place)
+            paddle.batch(get_train_examples, batch_size=args.batch_size), place)
 
         for left, pos_right, neg_right in train_loader():
             left = fluid.layers.reshape(left, shape=[-1, 1])
@@ -157,6 +160,7 @@ def train(conf_dict, to_static):
 
 
 class TestSimnet(unittest.TestCase):
+
     def test_dygraph_static_same_loss(self):
         if fluid.is_compiled_with_cuda():
             fluid.set_flags({"FLAGS_cudnn_deterministic": True})
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py
index 872d419ff8928..f2c72e9932ea7 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py
@@ -34,20 +34,22 @@ def create_conf_dict():
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=32,
-        help="Total examples' number in batch for training.")
-    parser.add_argument(
-        "--seq_len", type=int, default=32, help="The length of each sentence.")
-    parser.add_argument(
-        "--epoch", type=int, default=1, help="The number of training epoch.")
-    parser.add_argument(
-        "--fake_sample_size",
-        type=int,
-        default=128,
-        help="The number of samples of fake data.")
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=32,
+                        help="Total examples' number in batch for training.")
+    parser.add_argument("--seq_len",
+                        type=int,
+                        default=32,
+                        help="The length of each sentence.")
+    parser.add_argument("--epoch",
+                        type=int,
+                        default=1,
+                        help="The number of training epoch.")
+    parser.add_argument("--fake_sample_size",
+                        type=int,
+                        default=128,
+                        help="The number of samples of fake data.")
     args = parser.parse_args([])
     return args
 
@@ -68,6 +70,7 @@ def fake_vocabulary():
 
 
 class FakeReaderProcessor(object):
+
     def __init__(self, args, vocab):
         self.vocab = vocab
         self.seq_len = args.seq_len
@@ -81,6 +84,7 @@ def __init__(self, args, vocab):
                 np.array([query, pos_title, neg_title]).astype(np.int64))
 
     def get_reader(self, mode, epoch=0):
+
         def reader_with_pairwise():
             if mode == "train":
                 for i in range(self.sample_size):
@@ -114,24 +118,24 @@ def train(conf_dict, to_static):
 
     net = BOW(conf_dict)
     loss = HingeLoss(conf_dict)
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=0.001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        parameters=net.parameters())
+    optimizer = paddle.optimizer.Adam(learning_rate=0.001,
+                                      beta1=0.9,
+                                      beta2=0.999,
+                                      epsilon=1e-08,
+                                      parameters=net.parameters())
 
     metric = paddle.metric.Auc(name="auc")
 
     global_step = 0
     losses = []
 
-    train_loader = paddle.io.DataLoader.from_generator(
-        capacity=16, return_list=True, iterable=True, use_double_buffer=True)
+    train_loader = paddle.io.DataLoader.from_generator(capacity=16,
+                                                       return_list=True,
+                                                       iterable=True,
+                                                       use_double_buffer=True)
     get_train_examples = simnet_process.get_reader("train", epoch=args.epoch)
     train_loader.set_sample_list_generator(
-        paddle.batch(
-            get_train_examples, batch_size=args.batch_size), place)
+        paddle.batch(get_train_examples, batch_size=args.batch_size), place)
 
     for left, pos_right, neg_right in train_loader():
         left = paddle.reshape(left, shape=[-1, 1])
@@ -152,6 +156,7 @@ def train(conf_dict, to_static):
 
 
 class TestSimnet(unittest.TestCase):
+
     def test_dygraph_static_same_loss(self):
         if paddle.is_compiled_with_cuda():
             paddle.fluid.set_flags({"FLAGS_cudnn_deterministic": True})
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py
index eecb6d8b75842..48dc33cc6c786 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py
@@ -98,6 +98,7 @@ def test_set_value(x):
 
 
 class LayerWithSetValue(paddle.nn.Layer):
+
     def __init__(self, input_dim, hidden):
         super(LayerWithSetValue, self).__init__()
         self.linear = paddle.nn.Linear(input_dim, hidden)
@@ -110,10 +111,11 @@ def forward(self, x):
 
 
 class TestSliceWithoutControlFlow(unittest.TestCase):
+
     def setUp(self):
         self.init_input()
-        self.place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
         self.init_dygraph_func()
         paddle.disable_static()
 
@@ -137,28 +139,31 @@ def run_static_mode(self):
     def test_transformed_static_result(self):
         static_res = self.run_static_mode()
         dygraph_res = self.run_dygraph_mode()
-        self.assertTrue(
-            np.allclose(dygraph_res, static_res),
-            msg='dygraph_res is {}\nstatic_res is {}'.format(dygraph_res,
-                                                             static_res))
+        self.assertTrue(np.allclose(dygraph_res, static_res),
+                        msg='dygraph_res is {}\nstatic_res is {}'.format(
+                            dygraph_res, static_res))
 
 
 class TestSliceInIf(TestSliceWithoutControlFlow):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_slice_in_if
 
 
 class TestSliceInWhileLoop(TestSliceWithoutControlFlow):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_slice_in_while_loop
 
 
 class TestSliceInForLoop(TestSliceWithoutControlFlow):
+
     def init_dygraph_func(self):
         self.dygraph_func = test_slice_in_for_loop
 
 
 class TestSetValue(TestSliceWithoutControlFlow):
+
     def init_input(self):
         self.input = np.full([3, 4, 5], 5).astype('float32')
 
@@ -167,6 +172,7 @@ def init_dygraph_func(self):
 
 
 class TestSetValueWithLayerAndSave(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         self.model_path = os.path.join(self.temp_dir.name,
@@ -179,8 +185,10 @@ def test_set_value_with_save(self):
         prog_trans.enable(True)
         model = LayerWithSetValue(input_dim=10, hidden=1)
         x = paddle.full(shape=[5, 10], fill_value=5.0, dtype="float32")
-        paddle.jit.save(
-            layer=model, path=self.model_path, input_spec=[x], output_spec=None)
+        paddle.jit.save(layer=model,
+                        path=self.model_path,
+                        input_spec=[x],
+                        output_spec=None)
 
 
 class TestSliceSupplementSpecialCase(unittest.TestCase):
@@ -213,8 +221,8 @@ def func(inps):
             return inps[::2], inps[::-2]
 
         origin_result = func(inps)
-        sfunc = paddle.jit.to_static(
-            func, input_spec=[InputSpec(shape=[None, 4, 4])])
+        sfunc = paddle.jit.to_static(func,
+                                     input_spec=[InputSpec(shape=[None, 4, 4])])
         static_result = sfunc(inps)
 
         self.assertTrue(
@@ -224,6 +232,7 @@ def func(inps):
 
 
 class TestPaddleStridedSlice(unittest.TestCase):
+
     def test_compare_paddle_strided_slice_with_numpy(self):
         paddle.disable_static()
         array = np.arange(5)
@@ -232,8 +241,19 @@ def test_compare_paddle_strided_slice_with_numpy(self):
         s1 = 3
         e1 = 1
         stride1 = -2
-        sl = paddle.strided_slice(
-            pt, axes=[0, ], starts=[s1, ], ends=[e1, ], strides=[stride1, ])
+        sl = paddle.strided_slice(pt,
+                                  axes=[
+                                      0,
+                                  ],
+                                  starts=[
+                                      s1,
+                                  ],
+                                  ends=[
+                                      e1,
+                                  ],
+                                  strides=[
+                                      stride1,
+                                  ])
 
         self.assertTrue(array[s1:e1:stride1], sl)
 
@@ -242,20 +262,27 @@ def test_compare_paddle_strided_slice_with_numpy(self):
         s2 = [8, -1]
         e2 = [1, -5]
         stride2 = [-2, -3]
-        sl = paddle.strided_slice(
-            pt, axes=[0, 1], starts=s2, ends=e2, strides=stride2)
+        sl = paddle.strided_slice(pt,
+                                  axes=[0, 1],
+                                  starts=s2,
+                                  ends=e2,
+                                  strides=stride2)
 
         self.assertTrue(
-            np.array_equal(sl.numpy(), array[s2[0]:e2[0]:stride2[0], s2[1]:e2[
-                1]:stride2[1]]))
+            np.array_equal(
+                sl.numpy(), array[s2[0]:e2[0]:stride2[0],
+                                  s2[1]:e2[1]:stride2[1]]))
 
         array = np.arange(6 * 7 * 8).reshape((6, 7, 8))
         pt = paddle.to_tensor(array)
         s2 = [7, -1]
         e2 = [2, -5]
         stride2 = [-2, -3]
-        sl = paddle.strided_slice(
-            pt, axes=[0, 2], starts=s2, ends=e2, strides=stride2)
+        sl = paddle.strided_slice(pt,
+                                  axes=[0, 2],
+                                  starts=s2,
+                                  ends=e2,
+                                  strides=stride2)
 
         array_slice = array[s2[0]:e2[0]:stride2[0], ::, s2[1]:e2[1]:stride2[1]]
         self.assertTrue(
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py
index 361fcbf9c73f5..7311fd285abc7 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class Net(Layer):
+
     def __init__(self):
         super(Net, self).__init__()
         self.fc = paddle.nn.Linear(16, 3)
@@ -36,6 +37,7 @@ def forward(self, x, y, m, n):
 
 
 class TestArgsSpecName(unittest.TestCase):
+
     def read_from_dataset(self):
         self.x = paddle.randn([4, 2, 8])
         self.y = paddle.randn([4, 2, 8])
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
index 388291a51c22f..36b2425058f05 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
@@ -94,6 +94,7 @@ def func_to_test4():
 
 
 def func_to_test5():
+
     def inner_int_func():
         return 1
 
@@ -144,7 +145,7 @@ def add(x, y):
 }
 
 
-def func_to_test7(a: int, b: float, c: paddle.Tensor, d: float='diff'):
+def func_to_test7(a: int, b: float, c: paddle.Tensor, d: float = 'diff'):
     a = True
     e, f = paddle.shape(c)
     g: paddle.Tensor = len(c)
@@ -171,6 +172,7 @@ def func_to_test7(a: int, b: float, c: paddle.Tensor, d: float='diff'):
 
 
 class TestStaticAnalysis(unittest.TestCase):
+
     def _check_wrapper(self, wrapper, node_to_wrapper_map):
         self.assertEqual(node_to_wrapper_map[wrapper.node], wrapper)
         if wrapper.parent is not None:
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py
index f06d48c963d83..f535cf4c35da3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py
@@ -27,6 +27,7 @@ def tensor_clone(x):
 
 
 class TestTensorClone(unittest.TestCase):
+
     def _run(self, to_static):
         prog_trans = paddle.jit.ProgramTranslator()
         prog_trans.enable(to_static)
@@ -36,10 +37,9 @@ def _run(self, to_static):
     def test_tensor_clone(self):
         dygraph_res = self._run(to_static=False)
         static_res = self._run(to_static=True)
-        self.assertTrue(
-            numpy.allclose(dygraph_res, static_res),
-            msg='dygraph res is {}\nstatic_res is {}'.format(dygraph_res,
-                                                             static_res))
+        self.assertTrue(numpy.allclose(dygraph_res, static_res),
+                        msg='dygraph res is {}\nstatic_res is {}'.format(
+                            dygraph_res, static_res))
 
 
 @paddle.jit.to_static
@@ -50,6 +50,7 @@ def tensor_numpy(x):
 
 
 class TestTensorDygraphOnlyMethodError(unittest.TestCase):
+
     def _run(self, to_static):
         prog_trans = paddle.jit.ProgramTranslator()
         prog_trans.enable(to_static)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
index 5cf9d7749c358..3e30eb84ed671 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
@@ -111,11 +111,13 @@ def dyfunc_with_if_1(x):
         # `res.shape[0]` is transformed into
         #   `paddle.jit.dy2static.convert_var_shape(res)[0]`
         if res.shape[0] > 1:
-            res = fluid.layers.fill_constant(
-                value=2, shape=x.shape, dtype="int32")
+            res = fluid.layers.fill_constant(value=2,
+                                             shape=x.shape,
+                                             dtype="int32")
         else:
-            res = fluid.layers.fill_constant(
-                value=3, shape=x.shape, dtype="int32")
+            res = fluid.layers.fill_constant(value=3,
+                                             shape=x.shape,
+                                             dtype="int32")
     return res
 
 
@@ -231,10 +233,11 @@ def dyfunc_dict_assign_shape():
 
 # 1. Basic tests without control flow
 class TestTensorShapeBasic(unittest.TestCase):
+
     def setUp(self):
         self.input = numpy.ones(5).astype("int32")
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self._set_input_spec()
         self._set_expected_op_num()
         self.init_test_func()
@@ -262,10 +265,9 @@ def get_static_output(self):
     def test_transformed_static_result(self):
         static_res = self.get_static_output()
         dygraph_res = self.get_dygraph_output()
-        self.assertTrue(
-            numpy.allclose(dygraph_res, static_res),
-            msg='dygraph res is {}\nstatic_res is {}'.format(dygraph_res,
-                                                             static_res))
+        self.assertTrue(numpy.allclose(dygraph_res, static_res),
+                        msg='dygraph res is {}\nstatic_res is {}'.format(
+                            dygraph_res, static_res))
 
     def _set_expected_op_num(self):
         self.expected_op_num = 2
@@ -293,6 +295,7 @@ def test_op_num(self):
 
 
 class TestTensorShapeBasic2(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_tensor_shape_2
 
@@ -303,16 +306,19 @@ def _set_expected_op_num(self):
 
 
 class TestTensorShapeBasic3(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_tensor_shape_3
 
 
 class TestTensorShapeBasic4(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_tensor_shape_4
 
 
 class TestTensorShapeBasic5(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_tensor_shape_5
 
@@ -323,6 +329,7 @@ def _set_expected_op_num(self):
 
 
 class TestTensorShapeBasic6(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_tensor_shape_6
 
@@ -333,6 +340,7 @@ def _set_expected_op_num(self):
 
 
 class TestTupleShape1(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.input = numpy.ones((5, 7)).astype("int32")
         self.input_spec = [paddle.static.InputSpec(shape=[5, 7], dtype="int32")]
@@ -345,6 +353,7 @@ def _set_expected_op_num(self):
 
 
 class TestTupleShape2(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.input = numpy.ones((5, 7)).astype("int32")
         self.input_spec = [paddle.static.InputSpec(shape=[5, 7], dtype="int32")]
@@ -357,6 +366,7 @@ def _set_expected_op_num(self):
 
 
 class TestTupleShape3(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.input = numpy.ones((5, 7)).astype("int32")
         self.input_spec = [paddle.static.InputSpec(shape=[5, 7], dtype="int32")]
@@ -369,6 +379,7 @@ def _set_expected_op_num(self):
 
 
 class TestPaddleShapeApi(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.input = numpy.ones((5, 7)).astype("int32")
         self.input_spec = [paddle.static.InputSpec(shape=[5, 7], dtype="int32")]
@@ -382,6 +393,7 @@ def _set_expected_op_num(self):
 
 # 2. Tests with control flow if
 class TestTensorShapeInIf1(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_if_1
 
@@ -392,6 +404,7 @@ def _set_expected_op_num(self):
 
 
 class TestTensorShapeInIf2(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_if_2
 
@@ -403,6 +416,7 @@ def _set_expected_op_num(self):
 
 # 3. Tests with control flow for loop
 class TestTensorShapeInFor1(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_for_1
 
@@ -413,6 +427,7 @@ def _set_expected_op_num(self):
 
 
 class TestTensorShapeInFor2(TestTensorShapeInFor1):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_for_2
 
@@ -423,6 +438,7 @@ def _set_expected_op_num(self):
 
 
 class TestTensorShapeInFor3(TestTensorShapeInFor1):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_for_3
 
@@ -434,11 +450,13 @@ def _set_expected_op_num(self):
 
 # 4. Tests with control flow while loop
 class TestTensorShapeInWhile1(TestTensorShapeInFor1):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_while_1
 
 
 class TestTensorShapeInWhile2(TestTensorShapeInFor1):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_while_2
 
@@ -449,6 +467,7 @@ def _set_expected_op_num(self):
 
 
 class TestTensorShapeInWhile3(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_while_3
 
@@ -459,6 +478,7 @@ def _set_expected_op_num(self):
 
 
 class TestTensorShapeInWhile4(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.dygraph_func = dyfunc_with_while_4
 
@@ -470,6 +490,7 @@ def _set_expected_op_num(self):
 
 # 5. Test op num for negetive dim
 class TestOpNumBasicWithTensorShape(unittest.TestCase):
+
     def setUp(self):
         self._set_input_spec()
         self._set_test_func()
@@ -477,8 +498,7 @@ def setUp(self):
 
     def _set_input_spec(self):
         self.input_spec = [
-            paddle.static.InputSpec(
-                shape=[-1, 5], dtype="int32")
+            paddle.static.InputSpec(shape=[-1, 5], dtype="int32")
         ]
 
     def _set_test_func(self):
@@ -511,6 +531,7 @@ def test_op_num(self):
 
 
 class TestOpNumBasicWithTensorShape4(TestOpNumBasicWithTensorShape):
+
     def _set_test_func(self):
         self.dygraph_func = dyfunc_tensor_shape_4
 
@@ -521,6 +542,7 @@ def _set_expected_op_num(self):
 
 
 class TestOpNumWithTensorShapeTuple1(TestOpNumBasicWithTensorShape):
+
     def _set_test_func(self):
         self.dygraph_func = dyfunc_tuple_shape_1
 
@@ -531,6 +553,7 @@ def _set_expected_op_num(self):
 
 
 class TestOpNumWithTensorShapeInIf1(TestOpNumBasicWithTensorShape):
+
     def _set_test_func(self):
         self.dygraph_func = dyfunc_with_if_1
 
@@ -541,6 +564,7 @@ def _set_expected_op_num(self):
 
 
 class TestOpNumWithTensorShapeInFor1(TestOpNumBasicWithTensorShape):
+
     def _set_test_func(self):
         self.dygraph_func = dyfunc_with_for_1
 
@@ -551,6 +575,7 @@ def _set_expected_op_num(self):
 
 
 class TestOpNumWithTensorShapeInWhile1(TestOpNumBasicWithTensorShape):
+
     def _set_test_func(self):
         self.dygraph_func = dyfunc_with_while_1
 
@@ -561,6 +586,7 @@ def _set_expected_op_num(self):
 
 
 class TestChangeShapeAfterAssign(TestTensorShapeBasic):
+
     def init_test_func(self):
         self.input = numpy.ones((2, 3)).astype("int32")
         self.input_spec = [paddle.static.InputSpec(shape=[2, 3], dtype="int32")]
@@ -580,13 +606,15 @@ def dyfunc_with_static_convert_var_shape(x):
     else:
         # Test for correctly to find `batch_size__static_convert_var_shape_suffix_0` in
         # deeply nested scope.
-        res = fluid.layers.fill_constant(
-            value=8, shape=[batch_size], dtype="int32")
+        res = fluid.layers.fill_constant(value=8,
+                                         shape=[batch_size],
+                                         dtype="int32")
 
     return res
 
 
 class TestFindStatiConvertVarShapeSuffixVar(unittest.TestCase):
+
     def test(self):
         x_spec = paddle.static.InputSpec(shape=[None, 10])
         func = paddle.jit.to_static(dyfunc_with_if_2, input_spec=[x_spec])
@@ -595,6 +623,7 @@ def test(self):
 
 
 class TestPaddleShape(unittest.TestCase):
+
     def test_paddle_shape(self):
         func = paddle.jit.to_static(dyfunc_len_paddle_shape)
         func_code = func.code.replace("\n", "").replace(" ", "")
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
index c8fe3e3932914..32bd9bc5d5003 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
@@ -26,8 +26,8 @@
 from transformer_dygraph_model import CrossEntropyCriterion, Transformer, position_encoding_init
 
 trainer_count = 1
-place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
-)
+place = fluid.CUDAPlace(
+    0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 SEED = 10
 STEP_NUM = 10
 
@@ -72,16 +72,16 @@ def train_static(args, batch_generator):
             # define optimizer
             learning_rate = fluid.layers.learning_rate_scheduler.noam_decay(
                 args.d_model, args.warmup_steps, args.learning_rate)
-            optimizer = fluid.optimizer.Adam(
-                learning_rate=learning_rate,
-                beta1=args.beta1,
-                beta2=args.beta2,
-                epsilon=float(args.eps))
+            optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
+                                             beta1=args.beta1,
+                                             beta2=args.beta2,
+                                             epsilon=float(args.eps))
             optimizer.minimize(avg_cost)
             # the best cross-entropy value with label smoothing
             loss_normalizer = -((1. - args.label_smooth_eps) * np.log(
-                (1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(
-                    args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
+                (1. - args.label_smooth_eps)) + args.label_smooth_eps *
+                                np.log(args.label_smooth_eps /
+                                       (args.trg_vocab_size - 1) + 1e-20))
     step_idx = 0
     total_batch_num = 0
     avg_loss = []
@@ -94,8 +94,8 @@ def train_static(args, batch_generator):
                            feed=feed_dict,
                            fetch_list=[sum_cost.name, token_num.name])
             if step_idx % args.print_step == 0:
-                sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[
-                    1])
+                sum_cost_val, token_num_val = np.array(outs[0]), np.array(
+                    outs[1])
                 total_sum_cost = sum_cost_val.sum()
                 total_token_num = token_num_val.sum()
                 total_avg_cost = total_sum_cost / total_token_num
@@ -114,8 +114,8 @@ def train_static(args, batch_generator):
                         "normalized loss: %f, ppl: %f, speed: %.2f steps/s" %
                         (step_idx, pass_id, batch_id, total_avg_cost,
                          total_avg_cost - loss_normalizer,
-                         np.exp([min(total_avg_cost, 100)]),
-                         args.print_step / (time.time() - avg_batch_time)))
+                         np.exp([min(total_avg_cost, 100)]), args.print_step /
+                         (time.time() - avg_batch_time)))
                     avg_batch_time = time.time()
             batch_id += 1
             step_idx += 1
@@ -160,8 +160,9 @@ def train_dygraph(args, batch_generator):
         # the best cross-entropy value with label smoothing
         loss_normalizer = -(
             (1. - args.label_smooth_eps) * np.log(
-                (1. - args.label_smooth_eps)) + args.label_smooth_eps *
-            np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
+                (1. - args.label_smooth_eps)) +
+            args.label_smooth_eps * np.log(args.label_smooth_eps /
+                                           (args.trg_vocab_size - 1) + 1e-20))
         ce_time = []
         ce_ppl = []
         avg_loss = []
@@ -176,8 +177,8 @@ def train_dygraph(args, batch_generator):
                 logits = transformer(src_word, src_pos, src_slf_attn_bias,
                                      trg_word, trg_pos, trg_slf_attn_bias,
                                      trg_src_attn_bias)
-                sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
-                                                          lbl_weight)
+                sum_cost, avg_cost, token_num = criterion(
+                    logits, lbl_word, lbl_weight)
                 avg_cost.backward()
                 optimizer.minimize(avg_cost)
                 transformer.clear_gradients()
@@ -196,11 +197,11 @@ def train_dygraph(args, batch_generator):
                         logging.info(
                             "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                             "normalized loss: %f, ppl: %f, speed: %.2f steps/s"
-                            %
-                            (step_idx, pass_id, batch_id, total_avg_cost,
-                             total_avg_cost - loss_normalizer,
-                             np.exp([min(total_avg_cost, 100)]),
-                             args.print_step / (time.time() - avg_batch_time)))
+                            % (step_idx, pass_id, batch_id, total_avg_cost,
+                               total_avg_cost - loss_normalizer,
+                               np.exp([min(total_avg_cost, 100)
+                                       ]), args.print_step /
+                               (time.time() - avg_batch_time)))
                         ce_ppl.append(np.exp([min(total_avg_cost, 100)]))
                         avg_batch_time = time.time()
                 batch_id += 1
@@ -310,8 +311,8 @@ def predict_static(args, batch_generator):
 
         input_field = util.InputField(input_slots)
         feed_list = input_field.feed_list
-        loader = fluid.io.DataLoader.from_generator(
-            feed_list=feed_list, capacity=10)
+        loader = fluid.io.DataLoader.from_generator(feed_list=feed_list,
+                                                    capacity=10)
 
         # define model
         transformer = Transformer(
@@ -322,12 +323,11 @@ def predict_static(args, batch_generator):
             args.postprocess_cmd, args.weight_sharing, args.bos_idx,
             args.eos_idx)
 
-        out_ids, out_scores = transformer.beam_search(
-            *feed_list,
-            bos_id=args.bos_idx,
-            eos_id=args.eos_idx,
-            beam_size=args.beam_size,
-            max_len=args.max_out_len)
+        out_ids, out_scores = transformer.beam_search(*feed_list,
+                                                      bos_id=args.bos_idx,
+                                                      eos_id=args.eos_idx,
+                                                      beam_size=args.beam_size,
+                                                      max_len=args.max_out_len)
 
     # This is used here to set dropout to the test mode.
     test_prog = test_prog.clone(for_test=True)
@@ -335,8 +335,8 @@ def predict_static(args, batch_generator):
     # define the executor and program for training
     exe = fluid.Executor(place)
 
-    util.load(test_prog,
-              os.path.join(args.save_static_model_path, "transformer"), exe)
+    util.load(test_prog, os.path.join(args.save_static_model_path,
+                                      "transformer"), exe)
 
     loader.set_batch_generator(batch_generator, places=place)
 
@@ -372,6 +372,7 @@ def predict_static(args, batch_generator):
 
 
 class TestTransformer(unittest.TestCase):
+
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
 
@@ -401,14 +402,12 @@ def _test_predict(self):
         static_seq_ids, static_scores = predict_static(args, batch_generator)
         dygraph_seq_ids, dygraph_scores = predict_dygraph(args, batch_generator)
 
-        self.assertTrue(
-            np.allclose(static_seq_ids, static_seq_ids),
-            msg="static_seq_ids: {} \n dygraph_seq_ids: {}".format(
-                static_seq_ids, dygraph_seq_ids))
-        self.assertTrue(
-            np.allclose(static_scores, dygraph_scores),
-            msg="static_scores: {} \n dygraph_scores: {}".format(
-                static_scores, dygraph_scores))
+        self.assertTrue(np.allclose(static_seq_ids, static_seq_ids),
+                        msg="static_seq_ids: {} \n dygraph_seq_ids: {}".format(
+                            static_seq_ids, dygraph_seq_ids))
+        self.assertTrue(np.allclose(static_scores, dygraph_scores),
+                        msg="static_scores: {} \n dygraph_scores: {}".format(
+                            static_scores, dygraph_scores))
 
     def test_check_result(self):
         self._test_train()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
index 37fc78458dd78..481858be6f469 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
@@ -33,21 +33,20 @@
 
 def parse_args():
     parser = argparse.ArgumentParser("Paddle Video train script")
-    parser.add_argument(
-        '--config',
-        type=str,
-        default='tsm.yaml',
-        help='path to config file of model')
-    parser.add_argument(
-        '--use_gpu',
-        type=bool,
-        default=fluid.is_compiled_with_cuda(),
-        help='default use gpu.')
+    parser.add_argument('--config',
+                        type=str,
+                        default='tsm.yaml',
+                        help='path to config file of model')
+    parser.add_argument('--use_gpu',
+                        type=bool,
+                        default=fluid.is_compiled_with_cuda(),
+                        help='default use gpu.')
     args = parser.parse_args(['--config', 'tsm.yaml'])
     return args
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -57,22 +56,20 @@ def __init__(self,
                  act=None):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=None,
-            act=None,
-            param_attr=fluid.param_attr.ParamAttr(),
-            bias_attr=False)
-
-        self._batch_norm = BatchNorm(
-            num_filters,
-            act=act,
-            param_attr=fluid.param_attr.ParamAttr(),
-            bias_attr=fluid.param_attr.ParamAttr())
+        self._conv = Conv2D(num_channels=num_channels,
+                            num_filters=num_filters,
+                            filter_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=None,
+                            act=None,
+                            param_attr=fluid.param_attr.ParamAttr(),
+                            bias_attr=False)
+
+        self._batch_norm = BatchNorm(num_filters,
+                                     act=act,
+                                     param_attr=fluid.param_attr.ParamAttr(),
+                                     bias_attr=fluid.param_attr.ParamAttr())
 
     def forward(self, inputs):
         y = self._conv(inputs)
@@ -82,6 +79,7 @@ def forward(self, inputs):
 
 
 class BottleneckBlock(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -90,29 +88,25 @@ def __init__(self,
                  seg_num=8):
         super(BottleneckBlock, self).__init__()
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu')
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * 4,
-            filter_size=1,
-            act=None)
+        self.conv0 = ConvBNLayer(num_channels=num_channels,
+                                 num_filters=num_filters,
+                                 filter_size=1,
+                                 act='relu')
+        self.conv1 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters,
+                                 filter_size=3,
+                                 stride=stride,
+                                 act='relu')
+        self.conv2 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters * 4,
+                                 filter_size=1,
+                                 act=None)
 
         if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * 4,
-                filter_size=1,
-                stride=stride)
+            self.short = ConvBNLayer(num_channels=num_channels,
+                                     num_filters=num_filters * 4,
+                                     filter_size=1,
+                                     stride=stride)
         self.shortcut = shortcut
         self.seg_num = seg_num
         self._num_channels_out = int(num_filters * 4)
@@ -131,6 +125,7 @@ def forward(self, inputs):
 
 
 class TSM_ResNet(fluid.dygraph.Layer):
+
     def __init__(self, name_scope, config, mode):
         super(TSM_ResNet, self).__init__(name_scope)
 
@@ -148,10 +143,15 @@ def __init__(self, name_scope, config, mode):
             raise NotImplementedError
         num_filters = [64, 128, 256, 512]
 
-        self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
-        self.pool2d_max = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+        self.conv = ConvBNLayer(num_channels=3,
+                                num_filters=64,
+                                filter_size=7,
+                                stride=2,
+                                act='relu')
+        self.pool2d_max = Pool2D(pool_size=3,
+                                 pool_stride=2,
+                                 pool_padding=1,
+                                 pool_type='max')
 
         self.bottleneck_block_list = []
         num_channels = 64
@@ -161,17 +161,17 @@ def __init__(self, name_scope, config, mode):
             for i in range(depth[block]):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        num_channels=num_channels,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        shortcut=shortcut,
-                        seg_num=self.seg_num))
+                    BottleneckBlock(num_channels=num_channels,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    shortcut=shortcut,
+                                    seg_num=self.seg_num))
                 num_channels = int(bottleneck_block._num_channels_out)
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
-        self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg = Pool2D(pool_size=7,
+                                 pool_type='avg',
+                                 global_pooling=True)
 
         import math
         stdv = 1.0 / math.sqrt(2048 * 1.0)
@@ -202,6 +202,7 @@ def forward(self, inputs):
 
 
 class FakeDataReader(object):
+
     def __init__(self, mode, cfg):
         self.format = cfg.MODEL.format
         self.num_classes = cfg.MODEL.num_classes
@@ -211,8 +212,8 @@ def __init__(self, mode, cfg):
         self.target_size = cfg[mode.upper()]['target_size']
         self.img_mean = np.array(cfg.MODEL.image_mean).reshape(
             [3, 1, 1]).astype(np.float32)
-        self.img_std = np.array(cfg.MODEL.image_std).reshape(
-            [3, 1, 1]).astype(np.float32)
+        self.img_std = np.array(cfg.MODEL.image_std).reshape([3, 1, 1]).astype(
+            np.float32)
 
         self.batch_size = 1 if sys.platform == 'darwin' or os.name == 'nt' else cfg[
             mode.upper()]['batch_size']
@@ -232,6 +233,7 @@ def __init__(self, mode, cfg):
             self.generator_out.append(batch_out)
 
     def create_reader(self):
+
         def batch_reader():
             for i in range(self.total_iter):
                 yield self.generator_out[i]
@@ -251,8 +253,7 @@ def create_optimizer(cfg, params):
     momentum = cfg.momentum
 
     optimizer = fluid.optimizer.Momentum(
-        learning_rate=fluid.layers.piecewise_decay(
-            boundaries=bd, values=lr),
+        learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr),
         momentum=momentum,
         regularization=fluid.regularizer.L2Decay(l2_weight_decay),
         parameter_list=params)
@@ -299,13 +300,16 @@ def train(args, fake_data_reader, to_static):
                 labels = to_variable(y_data)
                 labels.stop_gradient = True
                 outputs = video_model(imgs)
-                loss = fluid.layers.cross_entropy(
-                    input=outputs, label=labels, ignore_index=-1)
+                loss = fluid.layers.cross_entropy(input=outputs,
+                                                  label=labels,
+                                                  ignore_index=-1)
                 avg_loss = fluid.layers.mean(loss)
-                acc_top1 = fluid.layers.accuracy(
-                    input=outputs, label=labels, k=1)
-                acc_top5 = fluid.layers.accuracy(
-                    input=outputs, label=labels, k=5)
+                acc_top1 = fluid.layers.accuracy(input=outputs,
+                                                 label=labels,
+                                                 k=1)
+                acc_top5 = fluid.layers.accuracy(input=outputs,
+                                                 label=labels,
+                                                 k=5)
 
                 avg_loss.backward()
                 optimizer.minimize(avg_loss)
@@ -319,20 +323,23 @@ def train(args, fake_data_reader, to_static):
                 print('TRAIN Epoch {}, iter {}, loss = {}, acc1 {}, acc5 {}'.
                       format(epoch, batch_id,
                              avg_loss.numpy()[0],
-                             acc_top1.numpy()[0], acc_top5.numpy()[0]))
+                             acc_top1.numpy()[0],
+                             acc_top5.numpy()[0]))
                 ret.extend([
-                    avg_loss.numpy()[0], acc_top1.numpy()[0],
+                    avg_loss.numpy()[0],
+                    acc_top1.numpy()[0],
                     acc_top5.numpy()[0]
                 ])
 
             print(
                 'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'.
-                format(epoch, total_loss / total_sample, total_acc1 /
-                       total_sample, total_acc5 / total_sample))
+                format(epoch, total_loss / total_sample,
+                       total_acc1 / total_sample, total_acc5 / total_sample))
         return ret
 
 
 class TestTsm(unittest.TestCase):
+
     def test_dygraph_static_same_loss(self):
         if fluid.is_compiled_with_cuda():
             fluid.set_flags({"FLAGS_cudnn_deterministic": True})
@@ -340,10 +347,9 @@ def test_dygraph_static_same_loss(self):
         fake_data_reader = FakeDataReader("train", parse_config(args.config))
         dygraph_loss = train(args, fake_data_reader, to_static=False)
         static_loss = train(args, fake_data_reader, to_static=True)
-        self.assertTrue(
-            np.allclose(dygraph_loss, static_loss),
-            msg="dygraph_loss: {} \nstatic_loss: {}".format(dygraph_loss,
-                                                            static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                        msg="dygraph_loss: {} \nstatic_loss: {}".format(
+                            dygraph_loss, static_loss))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py
index 7017cdda9cd23..66b154ee30a36 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class BaseLayer(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(BaseLayer, self).__init__()
         self._linear = paddle.nn.Linear(in_size, out_size)
@@ -31,6 +32,7 @@ def build(self, x):
 
 
 class LinearNetWithTuple(BaseLayer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithTuple, self).__init__(in_size, out_size)
 
@@ -40,6 +42,7 @@ def forward(self, x) -> Tuple[paddle.Tensor, str]:
 
 
 class LinearNetWithTuple2(BaseLayer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithTuple2, self).__init__(in_size, out_size)
 
@@ -49,6 +52,7 @@ def forward(self, x) -> Tuple[paddle.Tensor, np.array]:
 
 
 class LinearNetWithList(BaseLayer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithList, self).__init__(in_size, out_size)
 
@@ -58,6 +62,7 @@ def forward(self, x) -> List[paddle.Tensor]:
 
 
 class LinearNetWithDict(BaseLayer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithDict, self).__init__(in_size, out_size)
 
@@ -67,6 +72,7 @@ def forward(self, x) -> Dict[str, paddle.Tensor]:
 
 
 class TestTyping(unittest.TestCase):
+
     def setUp(self):
         self.in_num = 16
         self.out_num = 16
@@ -99,6 +105,7 @@ def test_type(self):
 
 
 class TestTypingTuple(TestTyping):
+
     def build_net(self):
         return LinearNetWithTuple2(self.in_num, self.out_num)
 
@@ -109,6 +116,7 @@ def run_dy(self):
 
 
 class TestTypingList(TestTyping):
+
     def build_net(self):
         return LinearNetWithList(self.in_num, self.out_num)
 
@@ -118,6 +126,7 @@ def run_dy(self):
 
 
 class TestTypingDict(TestTyping):
+
     def build_net(self):
         return LinearNetWithDict(self.in_num, self.out_num)
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_utils.py
index 747e9f1c0dbd9..6f4fe613db714 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_utils.py
@@ -25,6 +25,7 @@
 
 
 class TestIndexInList(unittest.TestCase):
+
     def test_index_in_list(self):
         list_to_test = [1, 2, 3, 4, 5]
         self.assertEqual(index_in_list(list_to_test, 4), 3)
@@ -42,6 +43,7 @@ def dyfunc_assign(input):
 
 
 class StaticCode():
+
     def dyfunc_assign(input):
         b = 1
         a = b
@@ -56,6 +58,7 @@ def dyfunc_assign(input):
 
 
 class TestSplitAssignTransformer(unittest.TestCase):
+
     def test_code(self):
         answer = get_source_code(StaticCode.dyfunc_assign)
         program_translator = ProgramTranslator()
@@ -64,6 +67,7 @@ def test_code(self):
 
 
 class TestIsPaddle(unittest.TestCase):
+
     def fake_module(self):
         return types.ModuleType('paddlenlp')
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
index 8500f46d974d8..377353c0ab65b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
@@ -26,6 +26,7 @@
 
 
 class TestDataLayerNotCheck(unittest.TestCase):
+
     def test_create_none_shape(self):
         main_program = fluid.Program()
         with fluid.program_guard(main_program):
@@ -38,8 +39,8 @@ def test_feed_mismatch_shape(self):
         with fluid.program_guard(main_program):
             d = data_layer_not_check(name="d", shape=(1, 2, 3))
         feed_in_data = np.random.uniform(size=[1, 2, 4]).astype(np.float32)
-        place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         ret = exe.run(main_program,
                       feed={d.name: feed_in_data},
@@ -48,6 +49,7 @@ def test_feed_mismatch_shape(self):
 
 
 class TestVariableTransFunc(unittest.TestCase):
+
     def test_create_fill_constant_node(self):
         node = create_fill_constant_node("a", 1.0)
         source = "a = paddle.fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.0, name='a')"
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py
index f270c5672afc3..f510e2dca6f0f 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py
@@ -61,8 +61,9 @@ def build_dict(corpus, min_freq=3):
                 word_freq_dict[word] = 0
             word_freq_dict[word] += 1
 
-    word_freq_dict = sorted(
-        word_freq_dict.items(), key=lambda x: x[1], reverse=True)
+    word_freq_dict = sorted(word_freq_dict.items(),
+                            key=lambda x: x[1],
+                            reverse=True)
 
     word2id_dict = dict()
     word2id_freq = dict()
@@ -109,9 +110,10 @@ def convert_corpus_to_id(corpus, word2id_dict):
 
 
 def subsampling(corpus, word2id_freq):
+
     def keep(word_id):
-        return random.uniform(0, 1) < math.sqrt(1e-4 / word2id_freq[word_id] *
-                                                len(corpus))
+        return random.uniform(0, 1) < math.sqrt(
+            1e-4 / word2id_freq[word_id] * len(corpus))
 
     new_corpus = []
     for line in corpus:
@@ -136,12 +138,13 @@ def build_data(corpus,
             window_size = random.randint(1, max_window_size)
             center_word = line[center_word_idx]
 
-            positive_word_range = (max(0, center_word_idx - window_size), min(
-                len(line) - 1, center_word_idx + window_size))
+            positive_word_range = (max(0, center_word_idx - window_size),
+                                   min(
+                                       len(line) - 1,
+                                       center_word_idx + window_size))
             positive_word_candidates = [
-                line[idx]
-                for idx in range(positive_word_range[0], positive_word_range[1]
-                                 + 1)
+                line[idx] for idx in range(positive_word_range[0],
+                                           positive_word_range[1] + 1)
                 if idx != center_word_idx and line[idx] != line[center_word_idx]
             ]
 
@@ -203,6 +206,7 @@ def build_batch(dataset, batch_size, epoch_num):
 
 
 class SkipGram(fluid.dygraph.Layer):
+
     def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1):
         super(SkipGram, self).__init__(name_scope)
         self.vocab_size = vocab_size
@@ -259,8 +263,8 @@ def train(to_static):
     random.seed(0)
     np.random.seed(0)
 
-    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-    ) else fluid.CPUPlace()
+    place = fluid.CUDAPlace(
+        0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
     with fluid.dygraph.guard(place):
         fluid.default_startup_program().random_seed = 1000
         fluid.default_main_program().random_seed = 1000
@@ -293,13 +297,13 @@ def train(to_static):
 
 
 class TestWord2Vec(unittest.TestCase):
+
     def test_dygraph_static_same_loss(self):
         dygraph_loss = train(to_static=False)
         static_loss = train(to_static=True)
-        self.assertTrue(
-            np.allclose(dygraph_loss, static_loss),
-            msg="dygraph_loss: {} \nstatic_loss: {}".format(dygraph_loss,
-                                                            static_loss))
+        self.assertTrue(np.allclose(dygraph_loss, static_loss),
+                        msg="dygraph_loss: {} \nstatic_loss: {}".format(
+                            dygraph_loss, static_loss))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py
index 9326af2952e84..ef074447893ce 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py
@@ -47,6 +47,7 @@ def get_mean_value(self):
 
 
 class FakeDataReader(object):
+
     def __init__(self):
         self.generator_out = []
         self.total_iter = cfg.max_iter
@@ -58,13 +59,15 @@ def __init__(self):
                 point1 = cfg.input_size / 4
                 point2 = cfg.input_size / 2
                 gt_boxes = np.array([[point1, point1, point2, point2]])
-                gt_labels = np.random.randint(
-                    low=0, high=cfg.class_num, size=[1])
+                gt_labels = np.random.randint(low=0,
+                                              high=cfg.class_num,
+                                              size=[1])
                 gt_scores = np.zeros([1])
                 batch_out.append([img, gt_boxes, gt_labels, gt_scores])
             self.generator_out.append(batch_out)
 
     def reader(self):
+
         def generator():
             for i in range(self.total_iter):
                 yield self.generator_out[i]
@@ -94,14 +97,16 @@ def train(to_static):
         learning_rate = cfg.learning_rate
         values = [learning_rate * (gamma**i) for i in range(step_num + 1)]
 
-        lr = fluid.dygraph.PiecewiseDecay(
-            boundaries=boundaries, values=values, begin=0)
+        lr = fluid.dygraph.PiecewiseDecay(boundaries=boundaries,
+                                          values=values,
+                                          begin=0)
 
         lr = fluid.layers.linear_lr_warmup(
             learning_rate=lr,
             warmup_steps=cfg.warm_up_iter,
             start_lr=0.0,
-            end_lr=cfg.learning_rate, )
+            end_lr=cfg.learning_rate,
+        )
 
         optimizer = fluid.optimizer.Momentum(
             learning_rate=lr,
@@ -146,8 +151,8 @@ def train(to_static):
             total_sample += 1
 
             print("Iter {:d}, loss {:.6f}, time {:.5f}".format(
-                iter_id,
-                smoothed_loss.get_mean_value(), start_time - prev_start_time))
+                iter_id, smoothed_loss.get_mean_value(),
+                start_time - prev_start_time))
             ret.append(smoothed_loss.get_mean_value())
 
             loss.backward()
@@ -159,14 +164,16 @@ def train(to_static):
 
 
 class TestYolov3(unittest.TestCase):
+
     def test_dygraph_static_same_loss(self):
         dygraph_loss = train(to_static=False)
         static_loss = train(to_static=True)
-        self.assertTrue(
-            np.allclose(
-                dygraph_loss, static_loss, atol=1e-5, rtol=1e-3),
-            msg="dygraph_loss: {} \nstatic_loss: {}".format(dygraph_loss,
-                                                            static_loss))
+        self.assertTrue(np.allclose(dygraph_loss,
+                                    static_loss,
+                                    atol=1e-5,
+                                    rtol=1e-3),
+                        msg="dygraph_loss: {} \nstatic_loss: {}".format(
+                            dygraph_loss, static_loss))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
index 07e9b1ac62e27..ab52d518fe7af 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
@@ -32,10 +32,10 @@ def position_encoding_init(n_position, d_pos_vec):
     num_timescales = channels // 2
     log_timescale_increment = (np.log(float(1e4) / float(1)) /
                                (num_timescales - 1))
-    inv_timescales = np.exp(np.arange(
-        num_timescales)) * -log_timescale_increment
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
+    inv_timescales = np.exp(
+        np.arange(num_timescales)) * -log_timescale_increment
+    scaled_time = np.expand_dims(position, 1) * np.expand_dims(
+        inv_timescales, 0)
     signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
     signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
     position_enc = signal
@@ -43,6 +43,7 @@ def position_encoding_init(n_position, d_pos_vec):
 
 
 class PrePostProcessLayer(Layer):
+
     def __init__(self, process_cmd, d_model, dropout_rate):
         super(PrePostProcessLayer, self).__init__()
         self.process_cmd = process_cmd
@@ -53,8 +54,8 @@ def __init__(self, process_cmd, d_model, dropout_rate):
             elif cmd == "n":  # add layer normalization
                 self.functors.append(
                     self.add_sublayer(
-                        "layer_norm_%d" % len(
-                            [layer for layer in self.children()]),
+                        "layer_norm_%d" %
+                        len([layer for layer in self.children()]),
                         LayerNorm(
                             normalized_shape=d_model,
                             param_attr=fluid.ParamAttr(
@@ -63,8 +64,8 @@ def __init__(self, process_cmd, d_model, dropout_rate):
                                 initializer=fluid.initializer.Constant(0.)))))
             elif cmd == "d":  # add dropout
                 if dropout_rate:
-                    self.functors.append(lambda x: layers.dropout(
-                        x, dropout_prob=dropout_rate))
+                    self.functors.append(
+                        lambda x: layers.dropout(x, dropout_prob=dropout_rate))
 
     def forward(self, x, residual=None):
         for i, cmd in enumerate(self.process_cmd):
@@ -76,6 +77,7 @@ def forward(self, x, residual=None):
 
 
 class MultiHeadAttention(Layer):
+
     def __init__(self,
                  d_key,
                  d_value,
@@ -131,8 +133,10 @@ def forward(self, queries, keys, values, attn_bias, cache=None):
             v = layers.concat([cache_v, v], axis=2)
             cache["k"], cache["v"] = k, v
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.d_model**-0.5)
         if attn_bias is not None:
             product += attn_bias
         weights = layers.softmax(product)
@@ -146,6 +150,7 @@ def forward(self, queries, keys, values, attn_bias, cache=None):
 
 
 class FFN(Layer):
+
     def __init__(self, d_inner_hid, d_model, dropout_rate):
         super(FFN, self).__init__()
         self.dropout_rate = dropout_rate
@@ -161,6 +166,7 @@ def forward(self, x):
 
 
 class EncoderLayer(Layer):
+
     def __init__(self,
                  n_head,
                  d_key,
@@ -189,8 +195,8 @@ def __init__(self,
                                                   prepostprocess_dropout)
 
     def forward(self, enc_input, attn_bias):
-        attn_output = self.self_attn(
-            self.preprocesser1(enc_input), None, None, attn_bias)
+        attn_output = self.self_attn(self.preprocesser1(enc_input), None, None,
+                                     attn_bias)
         attn_output = self.postprocesser1(attn_output, enc_input)
         ffn_output = self.ffn(self.preprocesser2(attn_output))
         ffn_output = self.postprocesser2(ffn_output, attn_output)
@@ -198,6 +204,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class Encoder(Layer):
+
     def __init__(self,
                  n_layer,
                  n_head,
@@ -234,6 +241,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class Embedder(Layer):
+
     def __init__(self, vocab_size, emb_dim, bos_idx=0):
         super(Embedder, self).__init__()
         self.word_embedder = Embedding(
@@ -248,6 +256,7 @@ def forward(self, word):
 
 
 class WrapEncoder(Layer):
+
     def __init__(self, src_vocab_size, max_length, n_layer, n_head, d_key,
                  d_value, d_model, d_inner_hid, prepostprocess_dropout,
                  attention_dropout, relu_dropout, preprocess_cmd,
@@ -275,12 +284,14 @@ def forward(self, src_word, src_pos, src_slf_attn_bias):
         emb = word_emb + pos_enc
         enc_input = layers.dropout(
             emb,
-            dropout_prob=self.emb_dropout, ) if self.emb_dropout else emb
+            dropout_prob=self.emb_dropout,
+        ) if self.emb_dropout else emb
         enc_output = self.encoder(enc_input, src_slf_attn_bias)
         return enc_output
 
 
 class DecoderLayer(Layer):
+
     def __init__(self,
                  n_head,
                  d_key,
@@ -318,8 +329,8 @@ def forward(self,
                 self_attn_bias,
                 cross_attn_bias,
                 cache=None):
-        self_attn_output = self.self_attn(
-            self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
+        self_attn_output = self.self_attn(self.preprocesser1(dec_input), None,
+                                          None, self_attn_bias, cache)
         self_attn_output = self.postprocesser1(self_attn_output, dec_input)
         cross_attn_output = self.cross_attn(
             self.preprocesser2(self_attn_output), enc_output, enc_output,
@@ -332,6 +343,7 @@ def forward(self,
 
 
 class Decoder(Layer):
+
     def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
                  prepostprocess_dropout, attention_dropout, relu_dropout,
                  preprocess_cmd, postprocess_cmd):
@@ -357,13 +369,14 @@ def forward(self,
                 caches=None):
         for i, decoder_layer in enumerate(self.decoder_layers):
             dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
-                                       cross_attn_bias, None
-                                       if caches is None else caches[i])
+                                       cross_attn_bias,
+                                       None if caches is None else caches[i])
             dec_input = dec_output
         return self.processer(dec_output)
 
 
 class WrapDecoder(Layer):
+
     def __init__(self, trg_vocab_size, max_length, n_layer, n_head, d_key,
                  d_value, d_model, d_inner_hid, prepostprocess_dropout,
                  attention_dropout, relu_dropout, preprocess_cmd,
@@ -389,8 +402,9 @@ def __init__(self, trg_vocab_size, max_length, n_layer, n_head, d_key,
                                                   word_embedder.weight,
                                                   transpose_y=True)
         else:
-            self.linear = Linear(
-                input_dim=d_model, output_dim=trg_vocab_size, bias_attr=False)
+            self.linear = Linear(input_dim=d_model,
+                                 output_dim=trg_vocab_size,
+                                 bias_attr=False)
 
     def forward(self,
                 trg_word,
@@ -406,26 +420,28 @@ def forward(self,
         emb = word_emb + pos_enc
         dec_input = layers.dropout(
             emb,
-            dropout_prob=self.emb_dropout, ) if self.emb_dropout else emb
+            dropout_prob=self.emb_dropout,
+        ) if self.emb_dropout else emb
         dec_output = self.decoder(dec_input, enc_output, trg_slf_attn_bias,
                                   trg_src_attn_bias, caches)
         dec_output = layers.reshape(
             dec_output,
-            shape=[-1, dec_output.shape[-1]], )
+            shape=[-1, dec_output.shape[-1]],
+        )
         logits = self.linear(dec_output)
         return logits
 
 
 class CrossEntropyCriterion(object):
+
     def __init__(self, label_smooth_eps):
         self.label_smooth_eps = label_smooth_eps
 
     def __call__(self, predict, label, weights):
         if self.label_smooth_eps:
-            label_out = layers.label_smooth(
-                label=layers.one_hot(
-                    input=label, depth=predict.shape[-1]),
-                epsilon=self.label_smooth_eps)
+            label_out = layers.label_smooth(label=layers.one_hot(
+                input=label, depth=predict.shape[-1]),
+                                            epsilon=self.label_smooth_eps)
 
         cost = layers.softmax_with_cross_entropy(
             logits=predict,
@@ -440,6 +456,7 @@ def __call__(self, predict, label, weights):
 
 
 class Transformer(Layer):
+
     def __init__(self,
                  src_vocab_size,
                  trg_vocab_size,
@@ -459,25 +476,29 @@ def __init__(self,
                  bos_id=0,
                  eos_id=1):
         super(Transformer, self).__init__()
-        src_word_embedder = Embedder(
-            vocab_size=src_vocab_size, emb_dim=d_model, bos_idx=bos_id)
-        self.encoder = WrapEncoder(
-            src_vocab_size, max_length, n_layer, n_head, d_key, d_value,
-            d_model, d_inner_hid, prepostprocess_dropout, attention_dropout,
-            relu_dropout, preprocess_cmd, postprocess_cmd, src_word_embedder)
+        src_word_embedder = Embedder(vocab_size=src_vocab_size,
+                                     emb_dim=d_model,
+                                     bos_idx=bos_id)
+        self.encoder = WrapEncoder(src_vocab_size, max_length, n_layer, n_head,
+                                   d_key, d_value, d_model, d_inner_hid,
+                                   prepostprocess_dropout, attention_dropout,
+                                   relu_dropout, preprocess_cmd,
+                                   postprocess_cmd, src_word_embedder)
         if weight_sharing:
             assert src_vocab_size == trg_vocab_size, (
                 "Vocabularies in source and target should be same for weight sharing."
             )
             trg_word_embedder = src_word_embedder
         else:
-            trg_word_embedder = Embedder(
-                vocab_size=trg_vocab_size, emb_dim=d_model, bos_idx=bos_id)
-        self.decoder = WrapDecoder(
-            trg_vocab_size, max_length, n_layer, n_head, d_key, d_value,
-            d_model, d_inner_hid, prepostprocess_dropout, attention_dropout,
-            relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing,
-            trg_word_embedder)
+            trg_word_embedder = Embedder(vocab_size=trg_vocab_size,
+                                         emb_dim=d_model,
+                                         bos_idx=bos_id)
+        self.decoder = WrapDecoder(trg_vocab_size, max_length, n_layer, n_head,
+                                   d_key, d_value, d_model, d_inner_hid,
+                                   prepostprocess_dropout, attention_dropout,
+                                   relu_dropout, preprocess_cmd,
+                                   postprocess_cmd, weight_sharing,
+                                   trg_word_embedder)
 
         self.trg_vocab_size = trg_vocab_size
         self.n_layer = n_layer
@@ -504,9 +525,10 @@ def beam_search(self,
                     eos_id=1,
                     beam_size=4,
                     max_len=256):
+
         def expand_to_beam_size(tensor, beam_size):
-            tensor = layers.reshape(
-                tensor, [tensor.shape[0], 1] + list(tensor.shape[1:]))
+            tensor = layers.reshape(tensor, [tensor.shape[0], 1] +
+                                    list(tensor.shape[1:]))
             tile_dims = [1] * len(tensor.shape)
             tile_dims[1] = beam_size
             return layers.expand(tensor, tile_dims)
@@ -518,9 +540,9 @@ def merge_batch_beams(tensor):
                 list(range(var_dim_in_state, len(tensor.shape))) +
                 list(range(0, var_dim_in_state)))
 
-            tensor = layers.reshape(tensor,
-                                    [0] * (len(tensor.shape) - var_dim_in_state
-                                           ) + [batch_size * beam_size])
+            tensor = layers.reshape(tensor, [0] *
+                                    (len(tensor.shape) - var_dim_in_state) +
+                                    [batch_size * beam_size])
             res = layers.transpose(
                 tensor,
                 list(
@@ -535,9 +557,9 @@ def split_batch_beams(tensor):
                 tensor,
                 list(range(var_dim_in_state, len(tensor.shape))) +
                 list(range(0, var_dim_in_state)))
-            tensor = layers.reshape(tensor,
-                                    [0] * (len(tensor.shape) - var_dim_in_state
-                                           ) + [batch_size, beam_size])
+            tensor = layers.reshape(tensor, [0] *
+                                    (len(tensor.shape) - var_dim_in_state) +
+                                    [batch_size, beam_size])
             res = layers.transpose(
                 tensor,
                 list(
@@ -548,13 +570,11 @@ def split_batch_beams(tensor):
 
         def mask_probs(probs, finished, noend_mask_tensor):
             finished = layers.cast(finished, dtype=probs.dtype)
-            probs = layers.elementwise_mul(
-                layers.expand(
-                    layers.unsqueeze(finished, [2]),
-                    [1, 1, self.trg_vocab_size]),
-                noend_mask_tensor,
-                axis=-1) - layers.elementwise_mul(
-                    probs, (finished - 1), axis=0)
+            probs = layers.elementwise_mul(layers.expand(
+                layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
+                                           noend_mask_tensor,
+                                           axis=-1) - layers.elementwise_mul(
+                                               probs, (finished - 1), axis=0)
             return probs
 
         def gather(input, indices, batch_pos):
@@ -568,32 +588,31 @@ def gather(input, indices, batch_pos):
         # constant number
         inf = float(1. * 1e7)
         max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
-        vocab_size_tensor = layers.fill_constant(
-            shape=[1], dtype="int64", value=self.trg_vocab_size)
+        vocab_size_tensor = layers.fill_constant(shape=[1],
+                                                 dtype="int64",
+                                                 value=self.trg_vocab_size)
         end_token_tensor = to_variable(
-            np.full(
-                [batch_size, beam_size], eos_id, dtype="int64"))
+            np.full([batch_size, beam_size], eos_id, dtype="int64"))
         noend_array = [-inf] * self.trg_vocab_size
         noend_array[eos_id] = 0
         noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
         batch_pos = layers.expand(
             layers.unsqueeze(
-                to_variable(np.arange(
-                    0, batch_size, 1, dtype="int64")), [1]), [1, beam_size])
+                to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
+            [1, beam_size])
         predict_ids = []
         parent_ids = []
         ### initialize states of beam search ###
         log_probs = to_variable(
-            np.array(
-                [[0.] + [-inf] * (beam_size - 1)] * batch_size,
-                dtype="float32"))
+            np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
+                     dtype="float32"))
 
-        finished = to_variable(
-            np.full(
-                [batch_size, beam_size], 0, dtype="bool"))
+        finished = to_variable(np.full([batch_size, beam_size], 0,
+                                       dtype="bool"))
 
-        trg_word = layers.fill_constant(
-            shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id)
+        trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
+                                        dtype="int64",
+                                        value=bos_id)
 
         trg_src_attn_bias = merge_batch_beams(
             expand_to_beam_size(trg_src_attn_bias, beam_size))
@@ -602,19 +621,22 @@ def gather(input, indices, batch_pos):
 
         # init states (caches) for transformer, need to be updated according to selected beam
         caches = [{
-            "k": layers.fill_constant(
+            "k":
+            layers.fill_constant(
                 shape=[batch_size, beam_size, self.n_head, 0, self.d_key],
                 dtype=enc_output.dtype,
                 value=0),
-            "v": layers.fill_constant(
+            "v":
+            layers.fill_constant(
                 shape=[batch_size, beam_size, self.n_head, 0, self.d_value],
                 dtype=enc_output.dtype,
                 value=0),
         } for i in range(self.n_layer)]
 
         for i in range(max_len):
-            trg_pos = layers.fill_constant(
-                shape=trg_word.shape, dtype="int64", value=i)
+            trg_pos = layers.fill_constant(shape=trg_word.shape,
+                                           dtype="int64",
+                                           value=i)
             caches = map_structure(merge_batch_beams,
                                    caches)  # TODO: modified for dygraph2static
             logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
@@ -625,17 +647,18 @@ def gather(input, indices, batch_pos):
 
             step_log_probs = mask_probs(step_log_probs, finished,
                                         noend_mask_tensor)
-            log_probs = layers.elementwise_add(
-                x=step_log_probs, y=log_probs, axis=0)
+            log_probs = layers.elementwise_add(x=step_log_probs,
+                                               y=log_probs,
+                                               axis=0)
             log_probs = layers.reshape(log_probs,
                                        [-1, beam_size * self.trg_vocab_size])
             scores = log_probs
-            topk_scores, topk_indices = fluid.layers.topk(
-                input=scores, k=beam_size)
-            beam_indices = fluid.layers.elementwise_floordiv(topk_indices,
-                                                             vocab_size_tensor)
-            token_indices = fluid.layers.elementwise_mod(topk_indices,
-                                                         vocab_size_tensor)
+            topk_scores, topk_indices = fluid.layers.topk(input=scores,
+                                                          k=beam_size)
+            beam_indices = fluid.layers.elementwise_floordiv(
+                topk_indices, vocab_size_tensor)
+            token_indices = fluid.layers.elementwise_mod(
+                topk_indices, vocab_size_tensor)
 
             # update states
             caches = map_structure(lambda x: gather(x, beam_indices, batch_pos),
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py
index e264a300d8c18..bf06fb12bdd10 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py
@@ -32,14 +32,15 @@ def get_input_descs(args, mode="train"):
     input_descs_train = {
         "src_word": [(batch_size, seq_len), "int64", 2],
         "src_pos": [(batch_size, seq_len), "int64"],
-        "src_slf_attn_bias":
-        [(batch_size, n_head, seq_len, seq_len), "float32"],
+        "src_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len),
+                              "float32"],
         "trg_word": [(batch_size, seq_len), "int64", 2],
         "trg_pos": [(batch_size, seq_len), "int64"],
-        "trg_slf_attn_bias":
-        [(batch_size, n_head, seq_len, seq_len), "float32"],
-        "trg_src_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"
-                              ],  # TODO: 1 for predict, seq_len for train
+        "trg_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len),
+                              "float32"],
+        "trg_src_attn_bias":
+        [(batch_size, n_head, seq_len, seq_len),
+         "float32"],  # TODO: 1 for predict, seq_len for train
         "enc_output": [(batch_size, seq_len, d_model), "float32"],
         "lbl_word": [(None, 1), "int64"],
         "lbl_weight": [(None, 1), "float32"],
@@ -49,12 +50,12 @@ def get_input_descs(args, mode="train"):
     input_descs_predict = {
         "src_word": [(batch_size, seq_len), "int64", 2],
         "src_pos": [(batch_size, seq_len), "int64"],
-        "src_slf_attn_bias":
-        [(batch_size, n_head, seq_len, seq_len), "float32"],
+        "src_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len),
+                              "float32"],
         "trg_word": [(batch_size, seq_len), "int64", 2],
         "trg_pos": [(batch_size, seq_len), "int64"],
-        "trg_slf_attn_bias":
-        [(batch_size, n_head, seq_len, seq_len), "float32"],
+        "trg_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len),
+                              "float32"],
         "trg_src_attn_bias": [(batch_size, n_head, 1, seq_len), "float32"],
         "enc_output": [(batch_size, seq_len, d_model), "float32"],
         "lbl_word": [(None, 1), "int64"],
@@ -69,19 +70,23 @@ def get_input_descs(args, mode="train"):
 encoder_data_input_fields = (
     "src_word",
     "src_pos",
-    "src_slf_attn_bias", )
+    "src_slf_attn_bias",
+)
 decoder_data_input_fields = (
     "trg_word",
     "trg_pos",
     "trg_slf_attn_bias",
     "trg_src_attn_bias",
-    "enc_output", )
+    "enc_output",
+)
 label_data_input_fields = (
     "lbl_word",
-    "lbl_weight", )
+    "lbl_weight",
+)
 fast_decoder_data_input_fields = (
     "trg_word",
-    "trg_src_attn_bias", )
+    "trg_src_attn_bias",
+)
 
 
 class ModelHyperParams(object):
@@ -220,19 +225,20 @@ def prepare_infer_input(insts, src_pad_idx, bos_idx, n_head):
 
 
 def get_feed_data_reader(args, mode='train'):
+
     def __for_train__():
-        train_reader = paddle.batch(
-            wmt16.train(args.src_vocab_size, args.trg_vocab_size),
-            batch_size=args.batch_size)
+        train_reader = paddle.batch(wmt16.train(args.src_vocab_size,
+                                                args.trg_vocab_size),
+                                    batch_size=args.batch_size)
         for batch in train_reader():
             tensors = prepare_train_input(batch, args.eos_idx, args.eos_idx,
                                           args.n_head)
             yield tensors
 
     def __for_test__():
-        test_reader = paddle.batch(
-            wmt16.test(args.src_vocab_size, args.trg_vocab_size),
-            batch_size=args.batch_size)
+        test_reader = paddle.batch(wmt16.test(args.src_vocab_size,
+                                              args.trg_vocab_size),
+                                   batch_size=args.batch_size)
         for batch in test_reader():
             tensors = prepare_infer_input(batch, args.eos_idx, args.eos_idx,
                                           args.n_head)
@@ -242,16 +248,16 @@ def __for_test__():
 
 
 class InputField(object):
+
     def __init__(self, input_slots):
         self.feed_list = []
         for slot in input_slots:
             self.feed_list.append(
-                fluid.layers.data(
-                    name=slot['name'],
-                    shape=slot['shape'],
-                    dtype=slot['dtype'],
-                    lod_level=slot.get('lod_level', 0),
-                    append_batch_size=False))
+                fluid.layers.data(name=slot['name'],
+                                  shape=slot['shape'],
+                                  dtype=slot['dtype'],
+                                  lod_level=slot.get('lod_level', 0),
+                                  append_batch_size=False))
 
 
 def load(program, model_path, executor=None, var_list=None):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/tsm_config_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/tsm_config_utils.py
index 4fedd1b246b27..0b37e94b3a2b7 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/tsm_config_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/tsm_config_utils.py
@@ -14,6 +14,7 @@
 
 import yaml
 import logging
+
 logger = logging.getLogger(__name__)
 
 CONFIG_SECS = [
@@ -25,6 +26,7 @@
 
 
 class AttrDict(dict):
+
     def __getattr__(self, key):
         return self[key]
 
@@ -76,8 +78,8 @@ def merge_configs(cfg, sec, args_dict):
 
 
 def print_configs(cfg, mode):
-    logger.info("---------------- {:>5} Arguments ----------------".format(
-        mode))
+    logger.info(
+        "---------------- {:>5} Arguments ----------------".format(mode))
     for sec, sec_items in cfg.items():
         logger.info("{}:".format(sec))
         for k, v in sec_items.items():
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
index bb95bdf9fc677..f1552869a2d90 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
@@ -32,6 +32,7 @@
 
 
 class AttrDict(dict):
+
     def __init__(self, *args, **kwargs):
         super(AttrDict, self).__init__(*args, **kwargs)
 
@@ -100,7 +101,7 @@ def __setattr__(self, name, value):
 cfg.max_iter = 20 if fluid.is_compiled_with_cuda() else 1
 # Disable mixup in last N iter
 cfg.no_mixup_iter = 10 if fluid.is_compiled_with_cuda() else 1
-# warm up to learning rate 
+# warm up to learning rate
 cfg.warm_up_iter = 10 if fluid.is_compiled_with_cuda() else 1
 cfg.warm_up_factor = 0.
 # lr steps_with_decay
@@ -120,54 +121,49 @@ def __setattr__(self, name, value):
 
 
 class YoloDetectionBlock(fluid.dygraph.Layer):
+
     def __init__(self, ch_in, channel, is_test=True):
         super(YoloDetectionBlock, self).__init__()
 
         assert channel % 2 == 0, \
             "channel {} cannot be divided by 2".format(channel)
 
-        self.conv0 = ConvBNLayer(
-            ch_in=ch_in,
-            ch_out=channel,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            is_test=is_test)
-        self.conv1 = ConvBNLayer(
-            ch_in=channel,
-            ch_out=channel * 2,
-            filter_size=3,
-            stride=1,
-            padding=1,
-            is_test=is_test)
-        self.conv2 = ConvBNLayer(
-            ch_in=channel * 2,
-            ch_out=channel,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            is_test=is_test)
-        self.conv3 = ConvBNLayer(
-            ch_in=channel,
-            ch_out=channel * 2,
-            filter_size=3,
-            stride=1,
-            padding=1,
-            is_test=is_test)
-        self.route = ConvBNLayer(
-            ch_in=channel * 2,
-            ch_out=channel,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            is_test=is_test)
-        self.tip = ConvBNLayer(
-            ch_in=channel,
-            ch_out=channel * 2,
-            filter_size=3,
-            stride=1,
-            padding=1,
-            is_test=is_test)
+        self.conv0 = ConvBNLayer(ch_in=ch_in,
+                                 ch_out=channel,
+                                 filter_size=1,
+                                 stride=1,
+                                 padding=0,
+                                 is_test=is_test)
+        self.conv1 = ConvBNLayer(ch_in=channel,
+                                 ch_out=channel * 2,
+                                 filter_size=3,
+                                 stride=1,
+                                 padding=1,
+                                 is_test=is_test)
+        self.conv2 = ConvBNLayer(ch_in=channel * 2,
+                                 ch_out=channel,
+                                 filter_size=1,
+                                 stride=1,
+                                 padding=0,
+                                 is_test=is_test)
+        self.conv3 = ConvBNLayer(ch_in=channel,
+                                 ch_out=channel * 2,
+                                 filter_size=3,
+                                 stride=1,
+                                 padding=1,
+                                 is_test=is_test)
+        self.route = ConvBNLayer(ch_in=channel * 2,
+                                 ch_out=channel,
+                                 filter_size=1,
+                                 stride=1,
+                                 padding=0,
+                                 is_test=is_test)
+        self.tip = ConvBNLayer(ch_in=channel,
+                               ch_out=channel * 2,
+                               filter_size=3,
+                               stride=1,
+                               padding=1,
+                               is_test=is_test)
 
     def forward(self, inputs):
         out = self.conv0(inputs)
@@ -180,6 +176,7 @@ def forward(self, inputs):
 
 
 class Upsample(fluid.dygraph.Layer):
+
     def __init__(self, scale=2):
         super(Upsample, self).__init__()
         self.scale = scale
@@ -187,20 +184,24 @@ def __init__(self, scale=2):
     def forward(self, inputs):
         # get dynamic upsample output shape
         shape_nchw = fluid.layers.shape(inputs)
-        shape_hw = fluid.layers.slice(
-            shape_nchw, axes=[0], starts=[2], ends=[4])
+        shape_hw = fluid.layers.slice(shape_nchw,
+                                      axes=[0],
+                                      starts=[2],
+                                      ends=[4])
         shape_hw.stop_gradient = True
         in_shape = fluid.layers.cast(shape_hw, dtype='int32')
         out_shape = in_shape * self.scale
         out_shape.stop_gradient = True
 
         # reisze by actual_shape
-        out = fluid.layers.resize_nearest(
-            input=inputs, scale=self.scale, actual_shape=out_shape)
+        out = fluid.layers.resize_nearest(input=inputs,
+                                          scale=self.scale,
+                                          actual_shape=out_shape)
         return out
 
 
 class YOLOv3(fluid.dygraph.Layer):
+
     def __init__(self, ch_in, is_train=True, use_random=False):
         super(YOLOv3, self).__init__()
 
@@ -215,39 +216,36 @@ def __init__(self, ch_in, is_train=True, use_random=False):
         for i in range(3):
             yolo_block = self.add_sublayer(
                 "yolo_detecton_block_%d" % (i),
-                YoloDetectionBlock(
-                    ch_in_list[i],
-                    channel=512 // (2**i),
-                    is_test=not self.is_train))
+                YoloDetectionBlock(ch_in_list[i],
+                                   channel=512 // (2**i),
+                                   is_test=not self.is_train))
             self.yolo_blocks.append(yolo_block)
 
             num_filters = len(cfg.anchor_masks[i]) * (cfg.class_num + 5)
 
             block_out = self.add_sublayer(
                 "block_out_%d" % (i),
-                Conv2D(
-                    num_channels=1024 // (2**i),
-                    num_filters=num_filters,
-                    filter_size=1,
-                    stride=1,
-                    padding=0,
-                    act=None,
-                    param_attr=ParamAttr(
-                        initializer=fluid.initializer.Normal(0., 0.02)),
-                    bias_attr=ParamAttr(
-                        initializer=fluid.initializer.Constant(0.0),
-                        regularizer=L2Decay(0.))))
+                Conv2D(num_channels=1024 // (2**i),
+                       num_filters=num_filters,
+                       filter_size=1,
+                       stride=1,
+                       padding=0,
+                       act=None,
+                       param_attr=ParamAttr(
+                           initializer=fluid.initializer.Normal(0., 0.02)),
+                       bias_attr=ParamAttr(
+                           initializer=fluid.initializer.Constant(0.0),
+                           regularizer=L2Decay(0.))))
             self.block_outputs.append(block_out)
             if i < 2:
                 route = self.add_sublayer(
                     "route2_%d" % i,
-                    ConvBNLayer(
-                        ch_in=512 // (2**i),
-                        ch_out=256 // (2**i),
-                        filter_size=1,
-                        stride=1,
-                        padding=0,
-                        is_test=(not self.is_train)))
+                    ConvBNLayer(ch_in=512 // (2**i),
+                                ch_out=256 // (2**i),
+                                filter_size=1,
+                                stride=1,
+                                padding=0,
+                                is_test=(not self.is_train)))
                 self.route_blocks_2.append(route)
             self.upsample = Upsample()
 
@@ -313,8 +311,7 @@ def forward(self,
                     name="yolo_box" + str(i))
                 self.boxes.append(boxes)
                 self.scores.append(
-                    fluid.layers.transpose(
-                        scores, perm=[0, 2, 1]))
+                    fluid.layers.transpose(scores, perm=[0, 2, 1]))
             self.downsample //= 2
 
         if not self.is_train:
@@ -322,14 +319,13 @@ def forward(self,
             yolo_boxes = fluid.layers.concat(self.boxes, axis=1)
             yolo_scores = fluid.layers.concat(self.scores, axis=2)
 
-            pred = fluid.layers.multiclass_nms(
-                bboxes=yolo_boxes,
-                scores=yolo_scores,
-                score_threshold=cfg.valid_thresh,
-                nms_top_k=cfg.nms_topk,
-                keep_top_k=cfg.nms_posk,
-                nms_threshold=cfg.nms_thresh,
-                background_label=-1)
+            pred = fluid.layers.multiclass_nms(bboxes=yolo_boxes,
+                                               scores=yolo_scores,
+                                               score_threshold=cfg.valid_thresh,
+                                               nms_top_k=cfg.nms_topk,
+                                               keep_top_k=cfg.nms_posk,
+                                               nms_threshold=cfg.nms_thresh,
+                                               background_label=-1)
             return pred
         else:
             return sum(self.losses)
diff --git a/python/paddle/fluid/tests/unittests/elastic_demo.py b/python/paddle/fluid/tests/unittests/elastic_demo.py
index c5177c0f52950..af26abd0d8878 100644
--- a/python/paddle/fluid/tests/unittests/elastic_demo.py
+++ b/python/paddle/fluid/tests/unittests/elastic_demo.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,9 +15,10 @@
 import os, sys
 import time
 
-sys.stderr.write("{}-DISTRIBUTED_TRAINER_ENDPOINTS={}\n".format(os.environ[
-    'PADDLE_TRAINER_ID'], os.environ['DISTRIBUTED_TRAINER_ENDPOINTS']))
-sys.stderr.write("{}-PADDLE_TRAINERS={}\n".format(os.environ[
-    'PADDLE_TRAINER_ID'], os.environ['PADDLE_TRAINERS']))
+sys.stderr.write("{}-DISTRIBUTED_TRAINER_ENDPOINTS={}\n".format(
+    os.environ['PADDLE_TRAINER_ID'],
+    os.environ['DISTRIBUTED_TRAINER_ENDPOINTS']))
+sys.stderr.write("{}-PADDLE_TRAINERS={}\n".format(
+    os.environ['PADDLE_TRAINER_ID'], os.environ['PADDLE_TRAINERS']))
 
 time.sleep(600)
diff --git a/python/paddle/fluid/tests/unittests/fake_reader.py b/python/paddle/fluid/tests/unittests/fake_reader.py
index 34a256e15dd2f..f97884218eb36 100644
--- a/python/paddle/fluid/tests/unittests/fake_reader.py
+++ b/python/paddle/fluid/tests/unittests/fake_reader.py
@@ -21,14 +21,18 @@ def fake_imdb_reader(word_dict_size,
                      lower_seq_len=100,
                      upper_seq_len=200,
                      class_dim=2):
+
     def __reader__():
         for _ in six.moves.range(sample_num):
-            length = np.random.random_integers(
-                low=lower_seq_len, high=upper_seq_len, size=[1])[0]
-            ids = np.random.random_integers(
-                low=0, high=word_dict_size - 1, size=[length]).astype('int64')
-            label = np.random.random_integers(
-                low=0, high=class_dim - 1, size=[1]).astype('int64')[0]
+            length = np.random.random_integers(low=lower_seq_len,
+                                               high=upper_seq_len,
+                                               size=[1])[0]
+            ids = np.random.random_integers(low=0,
+                                            high=word_dict_size - 1,
+                                            size=[length]).astype('int64')
+            label = np.random.random_integers(low=0,
+                                              high=class_dim - 1,
+                                              size=[1]).astype('int64')[0]
             yield ids, label
 
     return __reader__
diff --git a/python/paddle/fluid/tests/unittests/feed_data_reader.py b/python/paddle/fluid/tests/unittests/feed_data_reader.py
index 1e6016d57bd77..9ea7e88f66eb0 100644
--- a/python/paddle/fluid/tests/unittests/feed_data_reader.py
+++ b/python/paddle/fluid/tests/unittests/feed_data_reader.py
@@ -18,6 +18,7 @@
 
 
 def cyclic_reader(reader):
+
     def __reader__():
         while True:
             for data in reader():
@@ -27,6 +28,7 @@ def __reader__():
 
 
 class FeedDataReader(object):
+
     def __init__(self, feed_list, reader):
         self._feed_list = []
         for var in feed_list:
diff --git a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt
index f71e04c09aa38..e3bf89c48821a 100644
--- a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt
@@ -1,6 +1,9 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/fft/__init__.py b/python/paddle/fluid/tests/unittests/fft/__init__.py
index b9a7651e44909..185a92b8d94d3 100644
--- a/python/paddle/fluid/tests/unittests/fft/__init__.py
+++ b/python/paddle/fluid/tests/unittests/fft/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
index b00111f6821ae..3c48c99af34b5 100644
--- a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
+++ b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -52,8 +52,8 @@ def _fftc2r(a, n=None, axis=-1, norm=None, forward=None):
         inv_norm = _get_forward_norm(n, norm)
     else:
         inv_norm = _get_backward_norm(n, norm)
-    output = _raw_fft(a.conj()
-                      if forward else a, n, axis, True, False, inv_norm)
+    output = _raw_fft(a.conj() if forward else a, n, axis, True, False,
+                      inv_norm)
     return output
 
 
diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py
index 7ee5a04ece496..a3c62323c2c20 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_fft.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py
@@ -44,13 +44,14 @@ def rand_x(dims=1,
            complex=False):
     shape = [np.random.randint(min_dim_len, max_dim_len) for i in range(dims)]
     if complex:
-        return np.random.randn(*shape).astype(dtype) + 1.j * np.random.randn(
-            *shape).astype(dtype)
+        return np.random.randn(*shape).astype(
+            dtype) + 1.j * np.random.randn(*shape).astype(dtype)
     else:
         return np.random.randn(*shape).astype(dtype)
 
 
 def place(devices, key='place'):
+
     def decorate(cls):
         module = sys.modules[cls.__module__].__dict__
         raw_classes = {
@@ -97,65 +98,66 @@ def decorate(cls):
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
-     ('test_x_complex', rand_x(
-         5, complex=True), None, -1,
-      'backward'), ('test_n_grater_input_length', rand_x(
-          5, max_dim_len=5), 11, -1,
-                    'backward'), ('test_n_smaller_than_input_length', rand_x(
-                        5, min_dim_len=5, complex=True), 3, -1, 'backward'),
+     ('test_x_complex', rand_x(5, complex=True), None, -1, 'backward'),
+     ('test_n_grater_input_length', rand_x(5,
+                                           max_dim_len=5), 11, -1, 'backward'),
+     ('test_n_smaller_than_input_length', rand_x(
+         5, min_dim_len=5, complex=True), 3, -1, 'backward'),
      ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
      ('test_norm_forward', rand_x(5), None, 3, 'forward'),
      ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
 class TestFft(unittest.TestCase):
+
     def test_fft(self):
         """Test fft with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             self.assertTrue(
-                np.allclose(
-                    scipy.fft.fft(self.x, self.n, self.axis, self.norm),
-                    paddle.fft.fft(
-                        paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                    rtol=RTOL.get(str(self.x.dtype)),
-                    atol=ATOL.get(str(self.x.dtype))))
+                np.allclose(scipy.fft.fft(self.x, self.n, self.axis, self.norm),
+                            paddle.fft.fft(paddle.to_tensor(self.x), self.n,
+                                           self.axis, self.norm),
+                            rtol=RTOL.get(str(self.x.dtype)),
+                            atol=ATOL.get(str(self.x.dtype))))
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
-     ('test_x_complex', rand_x(
-         5, complex=True), None, -1,
-      'backward'), ('test_n_grater_input_length', rand_x(
-          5, max_dim_len=5), 11, -1,
-                    'backward'), ('test_n_smaller_than_input_length', rand_x(
-                        5, min_dim_len=5, complex=True), 3, -1, 'backward'),
+     ('test_x_complex', rand_x(5, complex=True), None, -1, 'backward'),
+     ('test_n_grater_input_length', rand_x(5,
+                                           max_dim_len=5), 11, -1, 'backward'),
+     ('test_n_smaller_than_input_length', rand_x(
+         5, min_dim_len=5, complex=True), 3, -1, 'backward'),
      ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
      ('test_norm_forward', rand_x(5), None, 3, 'forward'),
      ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
 class TestIfft(unittest.TestCase):
+
     def test_fft(self):
         """Test ifft with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             self.assertTrue(
-                np.allclose(
-                    scipy.fft.ifft(self.x, self.n, self.axis, self.norm),
-                    paddle.fft.ifft(
-                        paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                    rtol=RTOL.get(str(self.x.dtype)),
-                    atol=ATOL.get(str(self.x.dtype))))
+                np.allclose(scipy.fft.ifft(self.x, self.n, self.axis,
+                                           self.norm),
+                            paddle.fft.ifft(paddle.to_tensor(self.x), self.n,
+                                            self.axis, self.norm),
+                            rtol=RTOL.get(str(self.x.dtype)),
+                            atol=ATOL.get(str(self.x.dtype))))
 
 
 @place(DEVICES)
-@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
-    ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
-    ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
-    ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
-    ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
-    ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)
-])
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
+    [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
+     ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
+     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
+     ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+     ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)]
+)
 class TestFftException(unittest.TestCase):
+
     def test_fft(self):
         """Test fft with buoudary condition
         Test case include:
@@ -165,56 +167,55 @@ def test_fft(self):
         - norm out of range
         """
         with self.assertRaises(self.expect_exception):
-            paddle.fft.fft(
-                paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+            paddle.fft.fft(paddle.to_tensor(self.x), self.n, self.axis,
+                           self.norm)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
-        ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
-        ('test_x_complex128', rand_x(
-            5, complex=True), None, (0, 1), 'backward'),
-        ('test_n_grater_input_length', rand_x(
-            5, max_dim_len=5), (6, 6), (0, 1), 'backward'),
-        ('test_n_smaller_than_input_length', rand_x(
-            5, min_dim_len=5, complex=True), (4, 4), (0, 1), 'backward'),
-        ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
-        ('test_axis_none', rand_x(5), None, None, 'backward'),
-        ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
-        ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
+    ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
+    ('test_x_complex128', rand_x(5, complex=True), None, (0, 1), 'backward'),
+    ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+     (0, 1), 'backward'),
+    ('test_n_smaller_than_input_length', rand_x(5, min_dim_len=5, complex=True),
+     (4, 4), (0, 1), 'backward'),
+    ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
+    ('test_axis_none', rand_x(5), None, None, 'backward'),
+    ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
+    ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
+])
 class TestFft2(unittest.TestCase):
+
     def test_fft2(self):
         """Test fft2 with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             self.assertTrue(
-                np.allclose(
-                    scipy.fft.fft2(self.x, self.n, self.axis, self.norm),
-                    paddle.fft.fft2(
-                        paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                    rtol=RTOL.get(str(self.x.dtype)),
-                    atol=ATOL.get(str(self.x.dtype))))
+                np.allclose(scipy.fft.fft2(self.x, self.n, self.axis,
+                                           self.norm),
+                            paddle.fft.fft2(paddle.to_tensor(self.x), self.n,
+                                            self.axis, self.norm),
+                            rtol=RTOL.get(str(self.x.dtype)),
+                            atol=ATOL.get(str(self.x.dtype))))
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_x_complex_input', rand_x(
-        2, complex=True), None, (0, 1), None,
-      ValueError), ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None,
-                    ValueError), ('test_n_nagative', rand_x(2), -1, (0, 1),
-                                  'backward', ValueError),
-     ('test_n_len_not_equal_axis', rand_x(
-         5, max_dim_len=5), 11, (0, 1), 'backward',
-      ValueError), ('test_n_zero', rand_x(2), (0, 0), (0, 1), 'backward',
-                    ValueError), ('test_axis_out_of_range', rand_x(2), None,
-                                  (0, 1, 2), 'backward', ValueError),
+    [('test_x_complex_input', rand_x(2, complex=True), None,
+      (0, 1), None, ValueError),
+     ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None, ValueError),
+     ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError),
+     ('test_n_len_not_equal_axis', rand_x(5, max_dim_len=5), 11,
+      (0, 1), 'backward', ValueError),
+     ('test_n_zero', rand_x(2), (0, 0), (0, 1), 'backward', ValueError),
+     ('test_axis_out_of_range', rand_x(2), None,
+      (0, 1, 2), 'backward', ValueError),
      ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
      ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', ValueError),
      ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)])
 class TestFft2Exception(unittest.TestCase):
+
     def test_fft2(self):
         """Test fft2 with buoudary condition
         Test case include:
@@ -227,58 +228,59 @@ def test_fft2(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.fft2(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.fft2(paddle.to_tensor(self.x), self.n, self.axis,
+                                self.norm)
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
-     ('test_x_complex128', rand_x(
-         5, complex=True), None, None,
-      'backward'), ('test_n_grater_input_length', rand_x(
-          5, max_dim_len=5), (6, 6), (1, 2), 'backward'), (
-              'test_n_smaller_input_length', rand_x(
-                  5, min_dim_len=5, complex=True), (3, 3), (1, 2), 'backward'),
-     ('test_axis_not_default', rand_x(5), None, (1, 2),
-      'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'),
+     ('test_x_complex128', rand_x(5, complex=True), None, None, 'backward'),
+     ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+      (1, 2), 'backward'),
+     ('test_n_smaller_input_length', rand_x(5, min_dim_len=5, complex=True),
+      (3, 3), (1, 2), 'backward'),
+     ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
+     ('test_norm_forward', rand_x(5), None, None, 'forward'),
      ('test_norm_ortho', rand_x(5), None, None, 'ortho')])
 class TestFftn(unittest.TestCase):
+
     def test_fftn(self):
         """Test fftn with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
-            np.testing.assert_allclose(
-                scipy.fft.fftn(self.x, self.n, self.axis, self.norm),
-                paddle.fft.fftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.fftn(self.x, self.n, self.axis,
+                                                      self.norm),
+                                       paddle.fft.fftn(paddle.to_tensor(self.x),
+                                                       self.n, self.axis,
+                                                       self.norm),
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
-     ('test_x_complex128', rand_x(
-         5, complex=True), None, None,
-      'backward'), ('test_n_grater_input_length', rand_x(
-          5, max_dim_len=5), (6, 6), (1, 2), 'backward'), (
-              'test_n_smaller_input_length', rand_x(
-                  5, min_dim_len=5, complex=True), (3, 3), (1, 2), 'backward'),
-     ('test_axis_not_default', rand_x(5), None, (1, 2),
-      'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'),
+     ('test_x_complex128', rand_x(5, complex=True), None, None, 'backward'),
+     ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+      (1, 2), 'backward'),
+     ('test_n_smaller_input_length', rand_x(5, min_dim_len=5, complex=True),
+      (3, 3), (1, 2), 'backward'),
+     ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
+     ('test_norm_forward', rand_x(5), None, None, 'forward'),
      ('test_norm_ortho', rand_x(5), None, None, 'ortho')])
 class TestIFftn(unittest.TestCase):
+
     def test_ifftn(self):
         """Test ifftn with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.ifftn(self.x, self.n, self.axis, self.norm),
-                paddle.fft.ifftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
+                paddle.fft.ifftn(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm),
                 rtol=RTOL.get(str(self.x.dtype)),
                 atol=ATOL.get(str(self.x.dtype)))
 
@@ -286,67 +288,60 @@ def test_ifftn(self):
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, -1, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, -1, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 4, -1, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 2, -1, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, 1, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, 1, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "ortho"),
 ])
 class TestHfft(unittest.TestCase):
+
     def test_hfft(self):
         """Test hfft with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
-            np.testing.assert_allclose(
-                scipy.fft.hfft(self.x, self.n, self.axis, self.norm),
-                paddle.fft.hfft(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                rtol=1e-5,
-                atol=0)
+            np.testing.assert_allclose(scipy.fft.hfft(self.x, self.n, self.axis,
+                                                      self.norm),
+                                       paddle.fft.hfft(paddle.to_tensor(self.x),
+                                                       self.n, self.axis,
+                                                       self.norm),
+                                       rtol=1e-5,
+                                       atol=0)
 
 
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, -1, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, -1, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 4, -1, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 2, -1, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "ortho"),
 ])
 class TestIrfft(unittest.TestCase):
+
     def test_irfft(self):
         """Test irfft with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.irfft(self.x, self.n, self.axis, self.norm),
-                paddle.fft.irfft(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
+                paddle.fft.irfft(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm),
                 rtol=1e-5,
                 atol=0)
 
@@ -354,33 +349,29 @@ def test_irfft(self):
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, None, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, None, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [4], None, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [2], None, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "ortho"),
 ])
 class TestIrfftn(unittest.TestCase):
+
     def test_irfftn(self):
         """Test irfftn with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.irfftn(self.x, self.n, self.axis, self.norm),
-                paddle.fft.irfftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
+                paddle.fft.irfftn(paddle.to_tensor(self.x), self.n, self.axis,
+                                  self.norm),
                 rtol=1e-5,
                 atol=0)
 
@@ -388,33 +379,29 @@ def test_irfftn(self):
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, None, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, None, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [4], None, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [2], None, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "ortho"),
 ])
 class TestHfftn(unittest.TestCase):
+
     def test_hfftn(self):
         """Test hfftn with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.hfftn(self.x, self.n, self.axis, self.norm),
-                paddle.fft.hfftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
+                paddle.fft.hfftn(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm),
                 rtol=1e-5,
                 atol=0)
 
@@ -422,29 +409,30 @@ def test_hfftn(self):
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, (-2, -1), "backward"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, (-2, -1), "backward"),
     ('test_with_s', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
      [2, 2], (-2, -1), "backward", ValueError),
     ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "backward"),
     ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "forward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "forward"),
     ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "ortho"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "ortho"),
 ])
 class TestHfft2(unittest.TestCase):
+
     def test_hfft2(self):
         """Test hfft2 with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.hfft2(self.x, self.s, self.axis, self.norm),
-                paddle.fft.hfft2(
-                    paddle.to_tensor(self.x), self.s, self.axis, self.norm),
+                paddle.fft.hfft2(paddle.to_tensor(self.x), self.s, self.axis,
+                                 self.norm),
                 rtol=1e-5,
                 atol=0)
 
@@ -452,57 +440,55 @@ def test_hfft2(self):
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, (-2, -1), "backward"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, (-2, -1), "backward"),
     ('test_n_equal_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (4, 6), (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (4, 6),
+     (-2, -1), "backward"),
     ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "backward"),
     ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "forward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "forward"),
     ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "ortho"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "ortho"),
 ])
 class TestIrfft2(unittest.TestCase):
+
     def test_irfft2(self):
         """Test irfft2 with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.irfft2(self.x, self.s, self.axis, self.norm),
-                paddle.fft.irfft2(
-                    paddle.to_tensor(self.x), self.s, self.axis, self.norm),
+                paddle.fft.irfft2(paddle.to_tensor(self.x), self.s, self.axis,
+                                  self.norm),
                 rtol=1e-5,
                 atol=0)
 
 
 @place(DEVICES)
-@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [(
-    'test_bool_input',
-    (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(np.bool8),
-    None, -1, 'backward', NotImplementedError), (
-        'test_n_nagative',
-        np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1,
-        'backward', ValueError), (
-            'test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-            0, -1, 'backward', ValueError), (
-                'test_n_type',
-                np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-                (1, 2, 3), -1, 'backward', ValueError), (
-                    'test_axis_out_of_range',
-                    np.random.randn(4) + 1j * np.random.randn(4), None, 10,
-                    'backward', ValueError), (
-                        'test_axis_with_array',
-                        np.random.randn(4) + 1j * np.random.randn(4), None,
-                        (0, 1), 'backward', ValueError), (
-                            'test_norm_not_in_enum_value',
-                            np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-                            None, -1, 'random', ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_bool_input',
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.bool8), None, -1, 'backward', NotImplementedError),
+    ('test_n_nagative', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError),
+    ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
+     'backward', ValueError),
+    ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
+     (1, 2, 3), -1, 'backward', ValueError),
+    ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
+     None, 10, 'backward', ValueError),
+    ('test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4), None,
+     (0, 1), 'backward', ValueError),
+    ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+     1j * np.random.randn(4, 4), None, -1, 'random', ValueError)
+])
 class TestHfftException(unittest.TestCase):
+
     def test_hfft(self):
         """Test hfft with buoudary condition
         Test case include:
@@ -515,28 +501,27 @@ def test_hfft(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.hfft(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.hfft(paddle.to_tensor(self.x), self.n, self.axis,
+                                self.norm)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_n_nagative',
-      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1,
-      'backward', ValueError),
-     ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
-      'backward', ValueError),
-     ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      (1, 2), -1, 'backward', ValueError),
-     ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
-      None, 10, 'backward', ValueError),
-     ('test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4),
-      None, (0, 1), 'backward',
-      ValueError), ('test_norm_not_in_enum_value',
-                    np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None,
-                    None, 'random', ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_n_nagative', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError),
+    ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
+     'backward', ValueError),
+    ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
+     (1, 2), -1, 'backward', ValueError),
+    ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
+     None, 10, 'backward', ValueError),
+    ('test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4), None,
+     (0, 1), 'backward', ValueError),
+    ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+     1j * np.random.randn(4, 4), None, None, 'random', ValueError)
+])
 class TestIrfftException(unittest.TestCase):
+
     def test_irfft(self):
         """
         Test irfft with buoudary condition
@@ -549,16 +534,16 @@ def test_irfft(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.irfft(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.irfft(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm)
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_bool_input',
-      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-       ).astype(np.bool8), None, (-2, -1), 'backward', NotImplementedError),
+      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+          np.bool8), None, (-2, -1), 'backward', NotImplementedError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
@@ -567,16 +552,16 @@ def test_irfft(self):
      ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
       3, None, 'backward', ValueError),
      ('test_n_axis_dim',
-      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2), (-1),
-      'backward', ValueError), ('test_axis_out_of_range',
-                                np.random.randn(4) + 1j * np.random.randn(4),
-                                None, (1, 2), 'backward', ValueError),
+      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
+      (-1), 'backward', ValueError),
+     ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
+      None, (1, 2), 'backward', ValueError),
      ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, -1,
-      'backward',
-      ValueError), ('test_norm_not_in_enum_value',
-                    np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None,
-                    None, 'random', ValueError)])
+      'backward', ValueError),
+     ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+      1j * np.random.randn(4, 4), None, None, 'random', ValueError)])
 class TestHfft2Exception(unittest.TestCase):
+
     def test_hfft2(self):
         """
         Test hfft2 with buoudary condition
@@ -590,8 +575,8 @@ def test_hfft2(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.hfft2(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.hfft2(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm)
 
 
 @place(DEVICES)
@@ -601,23 +586,23 @@ def test_hfft2(self):
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
      ('test_zero_point',
-      np.random.randn(4, 4, 1) + 1j * np.random.randn(4, 4, 1), None, (-2, -1),
-      "backward", ValueError),
+      np.random.randn(4, 4, 1) + 1j * np.random.randn(4, 4, 1), None,
+      (-2, -1), "backward", ValueError),
      ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
       (0, 0), (-2, -1), 'backward', ValueError),
      ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      3, -1, 'backward',
-      ValueError), ('test_n_axis_dim',
-                    np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-                    (1, 2), (-3, -2, -1), 'backward', ValueError),
+      3, -1, 'backward', ValueError),
+     ('test_n_axis_dim',
+      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
+      (-3, -2, -1), 'backward', ValueError),
      ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
-      None, (1, 2), 'backward', ValueError), (
-          'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None,
-          1, 'backward',
-          ValueError), ('test_norm_not_in_enum_value',
-                        np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-                        None, None, 'random', ValueError)])
+      None, (1, 2), 'backward', ValueError),
+     ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, 1,
+      'backward', ValueError),
+     ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+      1j * np.random.randn(4, 4), None, None, 'random', ValueError)])
 class TestIrfft2Exception(unittest.TestCase):
+
     def test_irfft2(self):
         """
         Test irfft2 with buoudary condition
@@ -631,16 +616,16 @@ def test_irfft2(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.irfft2(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.irfft2(paddle.to_tensor(self.x), self.n, self.axis,
+                                  self.norm)
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_bool_input',
-      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-       ).astype(np.bool8), None, (-2, -1), 'backward', NotImplementedError),
+      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+          np.bool8), None, (-2, -1), 'backward', NotImplementedError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
@@ -649,17 +634,16 @@ def test_irfft2(self):
      ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
       3, -1, 'backward', ValueError),
      ('test_n_axis_dim',
-      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      (1, 2), (-3, -2, -1), 'backward',
-      ValueError), ('test_axis_out_of_range',
-                    np.random.randn(4) + 1j * np.random.randn(4), None,
-                    (10, 20), 'backward', ValueError),
+      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
+      (-3, -2, -1), 'backward', ValueError),
+     ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
+      None, (10, 20), 'backward', ValueError),
      ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, 1,
-      'backward',
-      ValueError), ('test_norm_not_in_enum_value',
-                    np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None,
-                    None, 'random', ValueError)])
+      'backward', ValueError),
+     ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+      1j * np.random.randn(4, 4), None, None, 'random', ValueError)])
 class TestHfftnException(unittest.TestCase):
+
     def test_hfftn(self):
         """Test hfftn with buoudary condition
         Test case include:
@@ -672,8 +656,8 @@ def test_hfftn(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.hfftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.hfftn(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm)
 
 
 @place(DEVICES)
@@ -685,18 +669,18 @@ def test_hfftn(self):
      ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
       (0, 0), (-2, -1), 'backward', ValueError),
      ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      3, -1, 'backward',
-      ValueError), ('test_n_axis_dim',
-                    np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-                    (1, 2), (-3, -2, -1), 'backward', ValueError),
+      3, -1, 'backward', ValueError),
+     ('test_n_axis_dim',
+      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
+      (-3, -2, -1), 'backward', ValueError),
      ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
       None, (10, 20), 'backward', ValueError),
      ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, 1,
-      'backward',
-      ValueError), ('test_norm_not_in_enum_value',
-                    np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None,
-                    None, 'random', ValueError)])
+      'backward', ValueError),
+     ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+      1j * np.random.randn(4, 4), None, None, 'random', ValueError)])
 class TestIrfftnException(unittest.TestCase):
+
     def test_irfftn(self):
         """Test irfftn with buoudary condition
         Test case include:
@@ -708,44 +692,46 @@ def test_irfftn(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.irfftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.irfftn(paddle.to_tensor(self.x), self.n, self.axis,
+                                  self.norm)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
-    [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), (
-        'test_n_grater_than_input_length', rand_x(
-            5, max_dim_len=5), 11, -1, 'backward'),
-     ('test_n_smaller_than_input_length', rand_x(
-         5, min_dim_len=5), 3, -1,
-      'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
-     ('test_norm_forward', rand_x(5), None, 3, 'forward'),
-     ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
+              [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
+               ('test_n_grater_than_input_length', rand_x(
+                   5, max_dim_len=5), 11, -1, 'backward'),
+               ('test_n_smaller_than_input_length', rand_x(
+                   5, min_dim_len=5), 3, -1, 'backward'),
+               ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
+               ('test_norm_forward', rand_x(5), None, 3, 'forward'),
+               ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
 class TestRfft(unittest.TestCase):
+
     def test_rfft(self):
         """Test rfft with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             self.assertTrue(
-                np.allclose(
-                    scipy.fft.rfft(self.x, self.n, self.axis, self.norm),
-                    paddle.fft.rfft(
-                        paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                    rtol=RTOL.get(str(self.x.dtype)),
-                    atol=ATOL.get(str(self.x.dtype))))
+                np.allclose(scipy.fft.rfft(self.x, self.n, self.axis,
+                                           self.norm),
+                            paddle.fft.rfft(paddle.to_tensor(self.x), self.n,
+                                            self.axis, self.norm),
+                            rtol=RTOL.get(str(self.x.dtype)),
+                            atol=ATOL.get(str(self.x.dtype))))
 
 
 @place(DEVICES)
-@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
-    ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
-    ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
-    ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
-    ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
-    ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)
-])
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
+    [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
+     ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
+     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
+     ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+     ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)]
+)
 class TestRfftException(unittest.TestCase):
+
     def test_rfft(self):
         """Test rfft with buoudary condition
         Test case include:
@@ -756,54 +742,52 @@ def test_rfft(self):
         - the dimensions of n and axis are different
         """
         with self.assertRaises(self.expect_exception):
-            paddle.fft.rfft(
-                paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+            paddle.fft.rfft(paddle.to_tensor(self.x), self.n, self.axis,
+                            self.norm)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
-        ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
-        ('test_n_grater_input_length', rand_x(
-            5, max_dim_len=5), (6, 6), (0, 1), 'backward'),
-        ('test_n_smaller_than_input_length', rand_x(
-            5, min_dim_len=5), (4, 4), (0, 1), 'backward'),
-        ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
-        ('test_axis_none', rand_x(5), None, None, 'backward'),
-        ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
-        ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
+    ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
+    ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+     (0, 1), 'backward'),
+    ('test_n_smaller_than_input_length', rand_x(5, min_dim_len=5), (4, 4),
+     (0, 1), 'backward'),
+    ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
+    ('test_axis_none', rand_x(5), None, None, 'backward'),
+    ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
+    ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
+])
 class TestRfft2(unittest.TestCase):
+
     def test_rfft2(self):
         """Test rfft2 with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             self.assertTrue(
-                np.allclose(
-                    scipy.fft.rfft2(self.x, self.n, self.axis, self.norm),
-                    paddle.fft.rfft2(
-                        paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                    rtol=RTOL.get(str(self.x.dtype)),
-                    atol=ATOL.get(str(self.x.dtype))))
+                np.allclose(scipy.fft.rfft2(self.x, self.n, self.axis,
+                                            self.norm),
+                            paddle.fft.rfft2(paddle.to_tensor(self.x), self.n,
+                                             self.axis, self.norm),
+                            rtol=RTOL.get(str(self.x.dtype)),
+                            atol=ATOL.get(str(self.x.dtype))))
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
-        ('test_x_complex_input', rand_x(
-            2, complex=True), None, (0, 1), 'backward', RuntimeError),
-        ('test_x_1dim_tensor', rand_x(1), None, (0, 1), 'backward', ValueError),
-        ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError),
-        ('test_n_zero', rand_x(2), 0, (0, 1), 'backward', ValueError),
-        ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward',
-         ValueError),
-        ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward',
-         ValueError),
-        ('test_axis_not_sequence', rand_x(5), None, -10, 'backward',
-         ValueError),
-        ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_x_complex_input', rand_x(2, complex=True), None,
+     (0, 1), 'backward', RuntimeError),
+    ('test_x_1dim_tensor', rand_x(1), None, (0, 1), 'backward', ValueError),
+    ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError),
+    ('test_n_zero', rand_x(2), 0, (0, 1), 'backward', ValueError),
+    ('test_axis_out_of_range', rand_x(2), None,
+     (0, 1, 2), 'backward', ValueError),
+    ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+    ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', ValueError),
+    ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError),
+])
 class TestRfft2Exception(unittest.TestCase):
+
     def test_rfft2(self):
         """Test rfft2 with buoudary condition
         Test case include:
@@ -816,49 +800,48 @@ def test_rfft2(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.rfft2(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.rfft2(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
-        ('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
-        ('test_n_grater_input_length', rand_x(
-            5, max_dim_len=5), (6, 6), (1, 2), 'backward'),
-        ('test_n_smaller_input_length', rand_x(
-            5, min_dim_len=5), (3, 3), (1, 2), 'backward'),
-        ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
-        ('test_norm_forward', rand_x(5), None, None, 'forward'),
-        ('test_norm_ortho', rand_x(5), None, None, 'ortho'),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
+    ('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
+    ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+     (1, 2), 'backward'),
+    ('test_n_smaller_input_length', rand_x(5, min_dim_len=5), (3, 3),
+     (1, 2), 'backward'),
+    ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
+    ('test_norm_forward', rand_x(5), None, None, 'forward'),
+    ('test_norm_ortho', rand_x(5), None, None, 'ortho'),
+])
 class TestRfftn(unittest.TestCase):
+
     def test_rfftn(self):
         """Test rfftn with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             self.assertTrue(
-                np.allclose(
-                    scipy.fft.rfftn(self.x, self.n, self.axis, self.norm),
-                    paddle.fft.rfftn(
-                        paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                    rtol=RTOL.get(str(self.x.dtype)),
-                    atol=ATOL.get(str(self.x.dtype))))
+                np.allclose(scipy.fft.rfftn(self.x, self.n, self.axis,
+                                            self.norm),
+                            paddle.fft.rfftn(paddle.to_tensor(self.x), self.n,
+                                             self.axis, self.norm),
+                            rtol=RTOL.get(str(self.x.dtype)),
+                            atol=ATOL.get(str(self.x.dtype))))
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_x_complex', rand_x(
-        4, complex=True), None, None, 'backward',
-      RuntimeError), ('test_n_nagative', rand_x(4), (-1, -1), (1, 2),
-                      'backward', ValueError),
-     ('test_n_not_sequence', rand_x(4), -1, None, 'backward', ValueError),
-     ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError), (
-         'test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward',
-         ValueError),
-     ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_x_complex', rand_x(
+        4, complex=True), None, None, 'backward', RuntimeError),
+    ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), 'backward', ValueError),
+    ('test_n_not_sequence', rand_x(4), -1, None, 'backward', ValueError),
+    ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
+    ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', ValueError),
+    ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)
+])
 class TestRfftnException(unittest.TestCase):
+
     def test_rfftn(self):
         """Test rfftn with buoudary condition
         Test case include:
@@ -869,43 +852,45 @@ def test_rfftn(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.rfftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.rfftn(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
-    [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), (
-        'test_n_grater_than_input_length', rand_x(
-            5, max_dim_len=5), 11, -1, 'backward'),
-     ('test_n_smaller_than_input_length', rand_x(
-         5, min_dim_len=5), 3, -1,
-      'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
-     ('test_norm_forward', rand_x(5), None, 3, 'forward'),
-     ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
+              [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
+               ('test_n_grater_than_input_length', rand_x(
+                   5, max_dim_len=5), 11, -1, 'backward'),
+               ('test_n_smaller_than_input_length', rand_x(
+                   5, min_dim_len=5), 3, -1, 'backward'),
+               ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
+               ('test_norm_forward', rand_x(5), None, 3, 'forward'),
+               ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
 class TestIhfft(unittest.TestCase):
+
     def test_ihfft(self):
         """Test ihfft with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.ihfft(self.x, self.n, self.axis, self.norm),
-                paddle.fft.ihfft(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
+                paddle.fft.ihfft(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm),
                 rtol=RTOL.get(str(self.x.dtype)),
                 atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
-@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
-    ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
-    ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
-    ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
-    ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
-    ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)
-])
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
+    [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
+     ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
+     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
+     ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+     ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)]
+)
 class TestIhfftException(unittest.TestCase):
+
     def test_ihfft(self):
         """Test ihfft with buoudary condition
         Test case include:
@@ -915,32 +900,32 @@ def test_ihfft(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.ihfft(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.ihfft(paddle.to_tensor(self.x), self.n, self.axis,
+                                 self.norm)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
-        ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
-        ('test_n_grater_input_length', rand_x(
-            5, max_dim_len=5), (11, 11), (0, 1), 'backward'),
-        ('test_n_smaller_than_input_length', rand_x(
-            5, min_dim_len=5), (1, 1), (0, 1), 'backward'),
-        ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
-        ('test_axis_none', rand_x(5), None, None, 'backward'),
-        ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
-        ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
+    ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
+    ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (11, 11),
+     (0, 1), 'backward'),
+    ('test_n_smaller_than_input_length', rand_x(5, min_dim_len=5), (1, 1),
+     (0, 1), 'backward'),
+    ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
+    ('test_axis_none', rand_x(5), None, None, 'backward'),
+    ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
+    ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
+])
 class TestIhfft2(unittest.TestCase):
+
     def test_ihfft2(self):
         """Test ihfft2 with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             np.testing.assert_allclose(
                 scipy.fft.ihfft2(self.x, self.n, self.axis, self.norm),
-                paddle.fft.ihfft2(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm),
+                paddle.fft.ihfft2(paddle.to_tensor(self.x), self.n, self.axis,
+                                  self.norm),
                 rtol=RTOL.get(str(self.x.dtype)),
                 atol=ATOL.get(str(self.x.dtype)))
 
@@ -948,19 +933,20 @@ def test_ihfft2(self):
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_x_complex_input', rand_x(
-        2, complex=True), None, (0, 1), None, ValueError),
-     ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None,
-      ValueError), ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward',
-                    ValueError), ('test_n_len_not_equal_axis', rand_x(
-                        5, max_dim_len=5), 11, (0, 1), 'backward', ValueError),
+    [('test_x_complex_input', rand_x(2, complex=True), None,
+      (0, 1), None, ValueError),
+     ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None, ValueError),
+     ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError),
+     ('test_n_len_not_equal_axis', rand_x(5, max_dim_len=5), 11,
+      (0, 1), 'backward', ValueError),
      ('test_n_zero', rand_x(2), (0, 0), (0, 1), 'backward', ValueError),
-     ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward',
-      ValueError), ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward',
-                    ValueError), ('test_axis_not_sequence', rand_x(5), None,
-                                  -10, 'backward', ValueError),
+     ('test_axis_out_of_range', rand_x(2), None,
+      (0, 1, 2), 'backward', ValueError),
+     ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+     ('test_axis_not_sequence', rand_x(5), None, -10, 'backward', ValueError),
      ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)])
 class TestIhfft2Exception(unittest.TestCase):
+
     def test_ihfft2(self):
         """Test ihfft2 with buoudary condition
         Test case include:
@@ -973,46 +959,47 @@ def test_ihfft2(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.ihfft2(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.ihfft2(paddle.to_tensor(self.x), self.n, self.axis,
+                                  self.norm)
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
-     ('test_n_grater_input_length', rand_x(
-         5, max_dim_len=5), (11, 11), (0, 1),
-      'backward'), ('test_n_smaller_input_length', rand_x(
-          5, min_dim_len=5), (1, 1), (0, 1), 'backward'),
-     ('test_axis_not_default', rand_x(5), None, (1, 2),
-      'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'),
+     ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (11, 11),
+      (0, 1), 'backward'),
+     ('test_n_smaller_input_length', rand_x(5, min_dim_len=5), (1, 1),
+      (0, 1), 'backward'),
+     ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
+     ('test_norm_forward', rand_x(5), None, None, 'forward'),
      ('test_norm_ortho', rand_x(5), None, None, 'ortho')])
 class TestIhfftn(unittest.TestCase):
+
     def test_ihfftn(self):
         """Test ihfftn with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
             self.assertTrue(
-                np.allclose(
-                    scipy.fft.ihfftn(self.x, self.n, self.axis, self.norm),
-                    paddle.fft.ihfftn(
-                        paddle.to_tensor(self.x), self.n, self.axis, self.norm),
-                    rtol=RTOL.get(str(self.x.dtype)),
-                    atol=ATOL.get(str(self.x.dtype))))
+                np.allclose(scipy.fft.ihfftn(self.x, self.n, self.axis,
+                                             self.norm),
+                            paddle.fft.ihfftn(paddle.to_tensor(self.x), self.n,
+                                              self.axis, self.norm),
+                            rtol=RTOL.get(str(self.x.dtype)),
+                            atol=ATOL.get(str(self.x.dtype))))
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_x_complex', rand_x(
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_x_complex', rand_x(
         4, complex=True), None, None, 'backward', RuntimeError),
-     ('test_n_nagative', rand_x(4), -1, None, 'backward', ValueError),
-     ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError), (
-         'test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward',
-         ValueError),
-     ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)])
+    ('test_n_nagative', rand_x(4), -1, None, 'backward', ValueError),
+    ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
+    ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', ValueError),
+    ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)
+])
 class TestIhfftnException(unittest.TestCase):
+
     def test_ihfftn(self):
         """Test ihfftn with buoudary condition
         Test case include:
@@ -1023,8 +1010,8 @@ def test_ihfftn(self):
         """
         with paddle.fluid.dygraph.guard(self.place):
             with self.assertRaises(self.expect_exception):
-                paddle.fft.ihfftn(
-                    paddle.to_tensor(self.x), self.n, self.axis, self.norm)
+                paddle.fft.ihfftn(paddle.to_tensor(self.x), self.n, self.axis,
+                                  self.norm)
 
 
 @place(DEVICES)
@@ -1033,6 +1020,7 @@ def test_ihfftn(self):
     ('test_with_d', 20, 0.5, 'float32'),
 ])
 class TestFftFreq(unittest.TestCase):
+
     def test_fftfreq(self):
         """Test fftfreq with norm condition
         """
@@ -1050,6 +1038,7 @@ def test_fftfreq(self):
     ('test_with_d', 20, 0.5, 'float32'),
 ])
 class TestRfftFreq(unittest.TestCase):
+
     def test_rfftfreq(self):
         """Test rfftfreq with norm condition
         """
@@ -1070,37 +1059,39 @@ def test_rfftfreq(self):
      np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128'),
 ])
 class TestFftShift(unittest.TestCase):
+
     def test_fftshift(self):
         """Test fftshift with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
-            np.testing.assert_allclose(
-                scipy.fft.fftshift(self.x, self.axes),
-                paddle.fft.fftshift(paddle.to_tensor(self.x),
-                                    self.axes).numpy(),
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.fftshift(self.x, self.axes),
+                                       paddle.fft.fftshift(
+                                           paddle.to_tensor(self.x),
+                                           self.axes).numpy(),
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes'),
-    [('test_1d', np.random.randn(10), (0, ),
-      'float64'), ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+    [('test_1d', np.random.randn(10), (0, ), 'float64'),
+     ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
      ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64'),
      ('test_2d_odd_with_all_axes',
       np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128')])
 class TestIfftShift(unittest.TestCase):
+
     def test_ifftshift(self):
         """Test ifftshift with norm condition
         """
         with paddle.fluid.dygraph.guard(self.place):
-            np.testing.assert_allclose(
-                scipy.fft.ifftshift(self.x, self.axes),
-                paddle.fft.ifftshift(paddle.to_tensor(self.x),
-                                     self.axes).numpy(),
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.ifftshift(self.x, self.axes),
+                                       paddle.fft.ifftshift(
+                                           paddle.to_tensor(self.x),
+                                           self.axes).numpy(),
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py
index 4f19cd06a493f..ce0a623aea076 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py
@@ -42,27 +42,27 @@ def stgraph(func, place, x, n, axes, norm):
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
-    [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
-     ('test_x_complex64', rand_x(
-         5, np.float64, complex=True), None, -1,
-      'backward'), ('test_n_grater_than_input_length', rand_x(
-          5, max_dim_len=5), 11, -1,
-                    'backward'), ('test_n_smaller_than_input_length', rand_x(
-                        5, min_dim_len=5), 3, -1, 'backward'),
-     ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
-     ('test_norm_forward', rand_x(5), None, 3, 'forward'),
-     ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
+              [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
+               ('test_x_complex64', rand_x(5, np.float64,
+                                           complex=True), None, -1, 'backward'),
+               ('test_n_grater_than_input_length', rand_x(
+                   5, max_dim_len=5), 11, -1, 'backward'),
+               ('test_n_smaller_than_input_length', rand_x(
+                   5, min_dim_len=5), 3, -1, 'backward'),
+               ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
+               ('test_norm_forward', rand_x(5), None, 3, 'forward'),
+               ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
 class TestFft(unittest.TestCase):
+
     def test_static_rfft(self):
         with stgraph(paddle.fft.fft, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.fft(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.fft(self.x, self.n, self.axis,
+                                                     self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
@@ -70,11 +70,12 @@ def test_static_rfft(self):
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
      ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
-     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward',
-      ValueError), ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward',
-                    ValueError), ('test_norm_not_in_enum_value', rand_x(2),
-                                  None, -1, 'random', ValueError)])
+     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
+     ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+     ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)]
+)
 class TestFftException(unittest.TestCase):
+
     def test_fft(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.fft, self.place, self.x, self.n, self.axis,
@@ -83,29 +84,28 @@ def test_fft(self):
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
-        ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
-        ('test_x_complex128', rand_x(
-            5, complex=True), None, (0, 1), 'backward'),
-        ('test_n_grater_input_length', rand_x(
-            5, max_dim_len=5), (6, 6), (0, 1), 'backward'),
-        ('test_n_smaller_than_input_length', rand_x(
-            5, min_dim_len=5), (4, 4), (0, 1), 'backward'),
-        ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
-        ('test_axis_none', rand_x(5), None, None, 'backward'),
-        ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
-        ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
+    ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
+    ('test_x_complex128', rand_x(5, complex=True), None, (0, 1), 'backward'),
+    ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+     (0, 1), 'backward'),
+    ('test_n_smaller_than_input_length', rand_x(5, min_dim_len=5), (4, 4),
+     (0, 1), 'backward'),
+    ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
+    ('test_axis_none', rand_x(5), None, None, 'backward'),
+    ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
+    ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
+])
 class TestFft2(unittest.TestCase):
+
     def test_static_fft2(self):
         with stgraph(paddle.fft.fft2, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.fft2(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.fft2(self.x, self.n, self.axis,
+                                                      self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
@@ -116,15 +116,16 @@ def test_static_fft2(self):
         ('test_x_1dim_tensor', rand_x(1), None, (0, 1), 'backward', ValueError),
         ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError),
         ('test_n_zero', rand_x(2), 0, (0, 1), 'backward', ValueError),
-        ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward',
-         ValueError),
-        ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward',
-         ValueError),
+        ('test_axis_out_of_range', rand_x(2), None,
+         (0, 1, 2), 'backward', ValueError),
+        ('test_axis_with_array', rand_x(1), None,
+         (0, 1), 'backward', ValueError),
         ('test_axis_not_sequence', rand_x(5), None, -10, 'backward',
          ValueError),
         ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)
     ])
 class TestFft2Exception(unittest.TestCase):
+
     def test_static_fft2(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.fft2, self.place, self.x, self.n, self.axis,
@@ -136,39 +137,39 @@ def test_static_fft2(self):
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
-     ('test_x_complex128', rand_x(
-         5, np.float64, complex=True), None, None,
-      'backward'), ('test_n_grater_input_length', rand_x(
-          5, max_dim_len=5), (6, 6), (1, 2),
-                    'backward'), ('test_n_smaller_input_length', rand_x(
-                        5, min_dim_len=5), (3, 3), (1, 2), 'backward'),
-     ('test_axis_not_default', rand_x(5), None, (1, 2),
-      'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'),
+     ('test_x_complex128', rand_x(5, np.float64,
+                                  complex=True), None, None, 'backward'),
+     ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+      (1, 2), 'backward'),
+     ('test_n_smaller_input_length', rand_x(5, min_dim_len=5), (3, 3),
+      (1, 2), 'backward'),
+     ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
+     ('test_norm_forward', rand_x(5), None, None, 'forward'),
      ('test_norm_ortho', rand_x(5), None, None, 'ortho')])
 class TestFftn(unittest.TestCase):
+
     def test_static_fftn(self):
         with stgraph(paddle.fft.fftn, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.fftn(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.fftn(self.x, self.n, self.axis,
+                                                      self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_x_complex', rand_x(
-        4, complex=True), None, None, 'backward',
-      TypeError), ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), 'backward',
-                   ValueError), ('test_n_not_sequence', rand_x(4), -1, None,
-                                 'backward', ValueError),
-     ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
-     ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward',
-      ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random',
-                    ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_x_complex', rand_x(4,
+                              complex=True), None, None, 'backward', TypeError),
+    ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), 'backward', ValueError),
+    ('test_n_not_sequence', rand_x(4), -1, None, 'backward', ValueError),
+    ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
+    ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', ValueError),
+    ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)
+])
 class TestRfftnException(unittest.TestCase):
+
     def test_static_rfftn(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.rfftn, self.place, self.x, self.n,
@@ -179,23 +180,18 @@ def test_static_rfftn(self):
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, -1, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, 1,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, -1, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 4, -1, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 2, -1, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, 1, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, 1, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "ortho"),
 ])
 class TestHfft(unittest.TestCase):
     """Test hfft with norm condition
@@ -204,33 +200,28 @@ class TestHfft(unittest.TestCase):
     def test_hfft(self):
         with stgraph(paddle.fft.hfft, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.hfft(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=1e-5,
-                atol=0)
+            np.testing.assert_allclose(scipy.fft.hfft(self.x, self.n, self.axis,
+                                                      self.norm),
+                                       y,
+                                       rtol=1e-5,
+                                       atol=0)
 
 
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, -1, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 4, -1,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 2, -1,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, -1,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, -1, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 4, -1, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), 2, -1, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, -1, "ortho"),
 ])
 class TestIrfft(unittest.TestCase):
     """Test irfft with norm condition
@@ -239,33 +230,28 @@ class TestIrfft(unittest.TestCase):
     def test_irfft(self):
         with stgraph(paddle.fft.irfft, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.irfft(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=1e-5,
-                atol=0)
+            np.testing.assert_allclose(scipy.fft.irfft(self.x, self.n,
+                                                       self.axis, self.norm),
+                                       y,
+                                       rtol=1e-5,
+                                       atol=0)
 
 
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, None, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, None, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [4], None, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [2], None, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "ortho"),
 ])
 class Testirfftn(unittest.TestCase):
     """Test irfftn with norm condition
@@ -274,33 +260,28 @@ class Testirfftn(unittest.TestCase):
     def test_static_irfftn(self):
         with stgraph(paddle.fft.irfftn, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.irfftn(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=1e-5,
-                atol=0)
+            np.testing.assert_allclose(scipy.fft.irfftn(self.x, self.n,
+                                                        self.axis, self.norm),
+                                       y,
+                                       rtol=1e-5,
+                                       atol=0)
 
 
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, None, "backward"),
-    ('test_n_grater_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4], None,
-     "backward"),
-    ('test_n_smaller_than_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2], None,
-     "backward"),
-    ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "backward"),
-    ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "forward"),
-    ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None,
-     "ortho"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, None, "backward"),
+    ('test_n_grater_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [4], None, "backward"),
+    ('test_n_smaller_than_input_length', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), [2], None, "backward"),
+    ('test_axis_not_last', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "backward"),
+    ('test_norm_forward', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "forward"),
+    ('test_norm_ortho', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), None, None, "ortho"),
 ])
 class Testhfftn(unittest.TestCase):
     """Test hfftn with norm condition
@@ -309,33 +290,33 @@ class Testhfftn(unittest.TestCase):
     def test_static_hfftn(self):
         with stgraph(paddle.fft.hfftn, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.hfftn(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=1e-5,
-                atol=0)
+            np.testing.assert_allclose(scipy.fft.hfftn(self.x, self.n,
+                                                       self.axis, self.norm),
+                                       y,
+                                       rtol=1e-5,
+                                       atol=0)
 
 
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, (-2, -1), "backward"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, (-2, -1), "backward"),
     ('test_n_grater_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4, 8], (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [4, 8],
+     (-2, -1), "backward"),
     ('test_n_smaller_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2, 4], (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), [2, 4],
+     (-2, -1), "backward"),
     ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "backward"),
     ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "forward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "forward"),
     ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "ortho"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "ortho"),
 ])
 class Testhfft2(unittest.TestCase):
     """Test hfft2 with norm condition
@@ -344,30 +325,30 @@ class Testhfft2(unittest.TestCase):
     def test_static_hfft2(self):
         with stgraph(paddle.fft.hfft2, self.place, self.x, self.s, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.hfft2(self.x, self.s, self.axis, self.norm),
-                y,
-                rtol=1e-5,
-                atol=0)
+            np.testing.assert_allclose(scipy.fft.hfft2(self.x, self.s,
+                                                       self.axis, self.norm),
+                                       y,
+                                       rtol=1e-5,
+                                       atol=0)
 
 
 @place(DEVICES)
 @parameterize((TEST_CASE_NAME, 'x', 's', 'axis', 'norm'), [
     ('test_x_complex128',
-     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-      ).astype(np.complex128), None, (-2, -1), "backward"),
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.complex128), None, (-2, -1), "backward"),
     ('test_n_equal_input_length',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (2, 4), (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (2, 4),
+     (-2, -1), "backward"),
     ('test_axis_not_last',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "backward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "backward"),
     ('test_norm_forward',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "forward"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "forward"),
     ('test_norm_ortho',
-     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1),
-     "ortho"),
+     np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None,
+     (-2, -1), "ortho"),
 ])
 class TestIrfft2(unittest.TestCase):
     """Test irfft2 with norm condition
@@ -376,34 +357,33 @@ class TestIrfft2(unittest.TestCase):
     def test_static_irfft2(self):
         with stgraph(paddle.fft.irfft2, self.place, self.x, self.s, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.irfft2(self.x, self.s, self.axis, self.norm),
-                y,
-                rtol=1e-5,
-                atol=0)
+            np.testing.assert_allclose(scipy.fft.irfft2(self.x, self.s,
+                                                        self.axis, self.norm),
+                                       y,
+                                       rtol=1e-5,
+                                       atol=0)
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_input_dtype', np.random.randn(4, 4, 4), None, -1, 'backward',
-      TypeError), ('test_bool_input',
-                   (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-                    ).astype(np.bool8), None, -1, 'backward', TypeError),
-     ('test_n_nagative',
-      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1,
-      'backward', ValueError),
-     ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
-      'backward', ValueError),
-     ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      (1, 2, 3), -1, 'backward', ValueError),
-     ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
-      None, 10, 'backward', ValueError), (
-          'test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4),
-          None, (0, 1), 'backward',
-          ValueError), ('test_norm_not_in_enum_value',
-                        np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-                        None, -1, 'random', ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_input_dtype', np.random.randn(4, 4,
+                                         4), None, -1, 'backward', TypeError),
+    ('test_bool_input',
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.bool8), None, -1, 'backward', TypeError),
+    ('test_n_nagative', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError),
+    ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
+     'backward', ValueError),
+    ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
+     (1, 2, 3), -1, 'backward', ValueError),
+    ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
+     None, 10, 'backward', ValueError),
+    ('test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4), None,
+     (0, 1), 'backward', ValueError),
+    ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+     1j * np.random.randn(4, 4), None, -1, 'random', ValueError)
+])
 class TestHfftException(unittest.TestCase):
     '''Test hfft with buoudary condition
     Test case include:
@@ -421,26 +401,25 @@ def test_static_hfft(self):
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_input_dtype', np.random.randn(4, 4, 4), None, -1, 'backward',
-      TypeError), ('test_bool_input',
-                   (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-                    ).astype(np.bool8), None, -1, 'backward', TypeError),
-     ('test_n_nagative',
-      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1,
-      'backward', ValueError),
-     ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
-      'backward', ValueError),
-     ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      (1, 2), -1, 'backward', ValueError),
-     ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
-      None, 10, 'backward', ValueError), (
-          'test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4),
-          None, (0, 1), 'backward',
-          ValueError), ('test_norm_not_in_enum_value',
-                        np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-                        None, None, 'random', ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_input_dtype', np.random.randn(4, 4,
+                                         4), None, -1, 'backward', TypeError),
+    ('test_bool_input',
+     (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+         np.bool8), None, -1, 'backward', TypeError),
+    ('test_n_nagative', np.random.randn(4, 4, 4) +
+     1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError),
+    ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
+     'backward', ValueError),
+    ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
+     (1, 2), -1, 'backward', ValueError),
+    ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
+     None, 10, 'backward', ValueError),
+    ('test_axis_with_array', np.random.randn(4) + 1j * np.random.randn(4), None,
+     (0, 1), 'backward', ValueError),
+    ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+     1j * np.random.randn(4, 4), None, None, 'random', ValueError)
+])
 class TestIrfftException(unittest.TestCase):
     '''Test Irfft with buoudary condition
     Test case include:
@@ -461,27 +440,27 @@ def test_static_irfft(self):
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward',
-      TypeError), ('test_bool_input',
-                   (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-                    ).astype(np.bool8), None, (-2, -1), 'backward', TypeError),
+    [('test_input_dtype', np.random.randn(
+        4, 4, 4), None, None, 'backward', TypeError),
+     ('test_bool_input',
+      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+          np.bool8), None, (-2, -1), 'backward', TypeError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
      ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
       (0, 0), (-2, -1), 'backward', ValueError),
      ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      3, None, 'backward',
-      ValueError), ('test_n_axis_dim',
-                    np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-                    (1, 2), (-1), 'backward', ValueError),
+      3, None, 'backward', ValueError),
+     ('test_n_axis_dim',
+      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
+      (-1), 'backward', ValueError),
      ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
-      None, (1, 2), 'backward', ValueError), (
-          'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None,
-          -1, 'backward',
-          ValueError), ('test_norm_not_in_enum_value',
-                        np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-                        None, None, 'random', ValueError)])
+      None, (1, 2), 'backward', ValueError),
+     ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, -1,
+      'backward', ValueError),
+     ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+      1j * np.random.randn(4, 4), None, None, 'random', ValueError)])
 class TestHfft2Exception(unittest.TestCase):
     '''Test hfft2 with buoudary condition
     Test case include:
@@ -502,27 +481,27 @@ def test_static_hfft2(self):
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward',
-      TypeError), ('test_bool_input',
-                   (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-                    ).astype(np.bool8), None, (-2, -1), 'backward', TypeError),
+    [('test_input_dtype', np.random.randn(
+        4, 4, 4), None, None, 'backward', TypeError),
+     ('test_bool_input',
+      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+          np.bool8), None, (-2, -1), 'backward', TypeError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
      ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
       (0, 0), (-2, -1), 'backward', ValueError),
      ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      3, -1, 'backward',
-      ValueError), ('test_n_axis_dim',
-                    np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-                    (1, 2), (-3, -2, -1), 'backward', ValueError),
+      3, -1, 'backward', ValueError),
+     ('test_n_axis_dim',
+      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
+      (-3, -2, -1), 'backward', ValueError),
      ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
-      None, (1, 2), 'backward', ValueError), (
-          'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None,
-          1, 'backward',
-          ValueError), ('test_norm_not_in_enum_value',
-                        np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-                        None, None, 'random', ValueError)])
+      None, (1, 2), 'backward', ValueError),
+     ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, 1,
+      'backward', ValueError),
+     ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+      1j * np.random.randn(4, 4), None, None, 'random', ValueError)])
 class TestIrfft2Exception(unittest.TestCase):
     '''Test irfft2 with buoudary condition
     Test case include:
@@ -543,27 +522,27 @@ def test_static_irfft2(self):
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward',
-      TypeError), ('test_bool_input',
-                   (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
-                    ).astype(np.bool8), None, (-2, -1), 'backward', TypeError),
+    [('test_input_dtype', np.random.randn(
+        4, 4, 4), None, None, 'backward', TypeError),
+     ('test_bool_input',
+      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
+          np.bool8), None, (-2, -1), 'backward', TypeError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
      ('test_n_zero', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
       (0, 0), (-2, -1), 'backward', ValueError),
      ('test_n_type', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-      3, -1, 'backward',
-      ValueError), ('test_n_axis_dim',
-                    np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4),
-                    (1, 2), (-3, -2, -1), 'backward', ValueError),
+      3, -1, 'backward', ValueError),
+     ('test_n_axis_dim',
+      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
+      (-3, -2, -1), 'backward', ValueError),
      ('test_axis_out_of_range', np.random.randn(4) + 1j * np.random.randn(4),
-      None, (10, 20), 'backward', ValueError), (
-          'test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None,
-          1, 'backward',
-          ValueError), ('test_norm_not_in_enum_value',
-                        np.random.randn(4, 4) + 1j * np.random.randn(4, 4),
-                        None, None, 'random', ValueError)])
+      None, (10, 20), 'backward', ValueError),
+     ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None, 1,
+      'backward', ValueError),
+     ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+      1j * np.random.randn(4, 4), None, None, 'random', ValueError)])
 class TestHfftnException(unittest.TestCase):
     '''Test hfftn with buoudary condition
     Test case include:
@@ -585,8 +564,8 @@ def test_static_hfftn(self):
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [
-        ('test_input_dtype', np.random.randn(4, 4, 4), None, None, 'backward',
-         TypeError),
+        ('test_input_dtype', np.random.randn(
+            4, 4, 4), None, None, 'backward', TypeError),
         #  ('test_bool_input',
         #                (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)
         #                 ).astype(np.bool8), None, (-2, -1), 'backward', ValueError),
@@ -596,9 +575,8 @@ def test_static_hfftn(self):
         ('test_n_zero',
          np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (0, 0),
          (-2, -1), 'backward', ValueError),
-        ('test_n_type',
-         np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), 3, -1,
-         'backward', ValueError),
+        ('test_n_type', np.random.randn(4, 4, 4) +
+         1j * np.random.randn(4, 4, 4), 3, -1, 'backward', ValueError),
         ('test_n_axis_dim',
          np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (1, 2),
          (-3, -2, -1), 'backward', ValueError),
@@ -606,9 +584,8 @@ def test_static_hfftn(self):
          None, (10, 20), 'backward', ValueError),
         ('test_axis_type', np.random.randn(4) + 1j * np.random.randn(4), None,
          1, 'backward', ValueError),
-        ('test_norm_not_in_enum_value',
-         np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None,
-         'random', ValueError)
+        ('test_norm_not_in_enum_value', np.random.randn(4, 4) +
+         1j * np.random.randn(4, 4), None, None, 'random', ValueError)
     ])
 class TestIrfftnException(unittest.TestCase):
     '''Test irfftn with buoudary condition
@@ -628,25 +605,25 @@ def test_static_irfftn(self):
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
-    [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), (
-        'test_n_grater_than_input_length', rand_x(
-            5, max_dim_len=5), 11, -1, 'backward'),
-     ('test_n_smaller_than_input_length', rand_x(
-         5, min_dim_len=5), 3, -1,
-      'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
-     ('test_norm_forward', rand_x(5), None, 3, 'forward'),
-     ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
+              [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
+               ('test_n_grater_than_input_length', rand_x(
+                   5, max_dim_len=5), 11, -1, 'backward'),
+               ('test_n_smaller_than_input_length', rand_x(
+                   5, min_dim_len=5), 3, -1, 'backward'),
+               ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
+               ('test_norm_forward', rand_x(5), None, 3, 'forward'),
+               ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
 class TestRfft(unittest.TestCase):
+
     def test_static_rfft(self):
         with stgraph(paddle.fft.rfft, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.rfft(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.rfft(self.x, self.n, self.axis,
+                                                      self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
@@ -654,11 +631,12 @@ def test_static_rfft(self):
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
      ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
-     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward',
-      ValueError), ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward',
-                    ValueError), ('test_norm_not_in_enum_value', rand_x(2),
-                                  None, -1, 'random', ValueError)])
+     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
+     ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+     ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)]
+)
 class TestRfftException(unittest.TestCase):
+
     def test_rfft(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.rfft, self.place, self.x, self.n, self.axis,
@@ -667,48 +645,49 @@ def test_rfft(self):
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
-        ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
-        ('test_n_grater_input_length', rand_x(
-            5, max_dim_len=5), (6, 6), (0, 1), 'backward'),
-        ('test_n_smaller_than_input_length', rand_x(
-            5, min_dim_len=5), (4, 4), (0, 1), 'backward'),
-        ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
-        ('test_axis_none', rand_x(5), None, None, 'backward'),
-        ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
-        ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
+    ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
+    ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+     (0, 1), 'backward'),
+    ('test_n_smaller_than_input_length', rand_x(5, min_dim_len=5), (4, 4),
+     (0, 1), 'backward'),
+    ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
+    ('test_axis_none', rand_x(5), None, None, 'backward'),
+    ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
+    ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
+])
 class TestRfft2(unittest.TestCase):
+
     def test_static_rfft2(self):
         with stgraph(paddle.fft.rfft2, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.rfft2(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.rfft2(self.x, self.n,
+                                                       self.axis, self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [
-        ('test_x_complex_input', rand_x(
-            2, complex=True), None, (0, 1), 'backward', TypeError),
+        ('test_x_complex_input', rand_x(2, complex=True), None,
+         (0, 1), 'backward', TypeError),
         # ('test_x_not_tensor', [0, 1], None, (0, 1), 'backward', ValueError),
         ('test_x_1dim_tensor', rand_x(1), None, (0, 1), 'backward', ValueError),
         ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError),
         ('test_n_zero', rand_x(2), 0, (0, 1), 'backward', ValueError),
-        ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward',
-         ValueError),
-        ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward',
-         ValueError),
+        ('test_axis_out_of_range', rand_x(2), None,
+         (0, 1, 2), 'backward', ValueError),
+        ('test_axis_with_array', rand_x(1), None,
+         (0, 1), 'backward', ValueError),
         ('test_axis_not_sequence', rand_x(5), None, -10, 'backward',
          ValueError),
         ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)
     ])
 class TestRfft2Exception(unittest.TestCase):
+
     def test_static_rfft(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.rfft2, self.place, self.x, self.n,
@@ -720,37 +699,37 @@ def test_static_rfft(self):
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
-     ('test_n_grater_input_length', rand_x(
-         5, max_dim_len=5), (6, 6), (1, 2),
-      'backward'), ('test_n_smaller_input_length', rand_x(
-          5, min_dim_len=5), (3, 3), (1, 2), 'backward'),
-     ('test_axis_not_default', rand_x(5), None, (1, 2),
-      'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'),
+     ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (6, 6),
+      (1, 2), 'backward'),
+     ('test_n_smaller_input_length', rand_x(5, min_dim_len=5), (3, 3),
+      (1, 2), 'backward'),
+     ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
+     ('test_norm_forward', rand_x(5), None, None, 'forward'),
      ('test_norm_ortho', rand_x(5), None, None, 'ortho')])
 class TestRfftn(unittest.TestCase):
+
     def test_static_rfft(self):
         with stgraph(paddle.fft.rfftn, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.rfftn(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.rfftn(self.x, self.n,
+                                                       self.axis, self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_x_complex', rand_x(
-        4, complex=True), None, None, 'backward',
-      TypeError), ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), 'backward',
-                   ValueError), ('test_n_not_sequence', rand_x(4), -1, None,
-                                 'backward', ValueError),
-     ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
-     ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward',
-      ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random',
-                    ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_x_complex', rand_x(4,
+                              complex=True), None, None, 'backward', TypeError),
+    ('test_n_nagative', rand_x(4), (-1, -1), (1, 2), 'backward', ValueError),
+    ('test_n_not_sequence', rand_x(4), -1, None, 'backward', ValueError),
+    ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
+    ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', ValueError),
+    ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)
+])
 class TestRfftnException(unittest.TestCase):
+
     def test_static_rfftn(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.rfftn, self.place, self.x, self.n,
@@ -759,36 +738,38 @@ def test_static_rfftn(self):
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
-    [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), (
-        'test_n_grater_than_input_length', rand_x(
-            5, max_dim_len=5), 11, -1, 'backward'),
-     ('test_n_smaller_than_input_length', rand_x(
-         5, min_dim_len=5), 3, -1,
-      'backward'), ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
-     ('test_norm_forward', rand_x(5), None, 3, 'forward'),
-     ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
+              [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'),
+               ('test_n_grater_than_input_length', rand_x(
+                   5, max_dim_len=5), 11, -1, 'backward'),
+               ('test_n_smaller_than_input_length', rand_x(
+                   5, min_dim_len=5), 3, -1, 'backward'),
+               ('test_axis_not_last', rand_x(5), None, 3, 'backward'),
+               ('test_norm_forward', rand_x(5), None, 3, 'forward'),
+               ('test_norm_ortho', rand_x(5), None, 3, 'ortho')])
 class TestIhfft(unittest.TestCase):
+
     def test_static_ihfft(self):
         with stgraph(paddle.fft.ihfft, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.ihfft(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.ihfft(self.x, self.n,
+                                                       self.axis, self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
-@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
-    ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
-    ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
-    ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
-    ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
-    ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)
-])
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
+    [('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError),
+     ('test_n_zero', rand_x(2), 0, -1, 'backward', ValueError),
+     ('test_axis_out_of_range', rand_x(1), None, 10, 'backward', ValueError),
+     ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward', ValueError),
+     ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError)]
+)
 class TestIhfftException(unittest.TestCase):
+
     def test_static_ihfft(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.ihfft, self.place, self.x, self.n,
@@ -797,50 +778,51 @@ def test_static_ihfft(self):
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
-        ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
-        ('test_n_grater_input_length', rand_x(
-            5, max_dim_len=5), (11, 11), (0, 1), 'backward'),
-        ('test_n_smaller_than_input_length', rand_x(
-            5, min_dim_len=5), (1, 1), (0, 1), 'backward'),
-        ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
-        ('test_axis_none', rand_x(5), None, None, 'backward'),
-        ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
-        ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
-    ])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [
+    ('test_x_float64', rand_x(5), None, (0, 1), 'backward'),
+    ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (11, 11),
+     (0, 1), 'backward'),
+    ('test_n_smaller_than_input_length', rand_x(5, min_dim_len=5), (1, 1),
+     (0, 1), 'backward'),
+    ('test_axis_random', rand_x(5), None, (1, 2), 'backward'),
+    ('test_axis_none', rand_x(5), None, None, 'backward'),
+    ('test_norm_forward', rand_x(5), None, (0, 1), 'forward'),
+    ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'),
+])
 class TestIhfft2(unittest.TestCase):
+
     def test_static_ihfft2(self):
         with stgraph(paddle.fft.ihfft2, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.ihfft2(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.ihfft2(self.x, self.n,
+                                                        self.axis, self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [
-        ('test_x_complex_input', rand_x(
-            2, complex=True), None, (0, 1), None, ValueError),
+        ('test_x_complex_input', rand_x(2, complex=True), None,
+         (0, 1), None, ValueError),
         # ('test_x_not_tensor', [0, 1], None, (0, 1), None, ValueError),
         ('test_x_1dim_tensor', rand_x(1), None, (0, 1), None, ValueError),
         ('test_n_nagative', rand_x(2), -1, (0, 1), 'backward', ValueError),
-        ('test_n_len_not_equal_axis', rand_x(
-            5, max_dim_len=5), 11, (0, 1), 'backward', ValueError),
+        ('test_n_len_not_equal_axis', rand_x(5, max_dim_len=5), 11,
+         (0, 1), 'backward', ValueError),
         ('test_n_zero', rand_x(2), (0, 0), (0, 1), 'backward', ValueError),
-        ('test_axis_out_of_range', rand_x(2), None, (0, 1, 2), 'backward',
-         ValueError),
-        ('test_axis_with_array', rand_x(1), None, (0, 1), 'backward',
-         ValueError),
+        ('test_axis_out_of_range', rand_x(2), None,
+         (0, 1, 2), 'backward', ValueError),
+        ('test_axis_with_array', rand_x(1), None,
+         (0, 1), 'backward', ValueError),
         ('test_axis_not_sequence', rand_x(5), None, -10, 'backward',
          ValueError),
         ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)
     ])
 class TestIhfft2Exception(unittest.TestCase):
+
     def test_static_ihfft2(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.ihfft2, self.place, self.x, self.n,
@@ -852,35 +834,36 @@ def test_static_ihfft2(self):
 @parameterize(
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'),
     [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'),
-     ('test_n_grater_input_length', rand_x(
-         5, max_dim_len=5), (11, 11), (0, 1),
-      'backward'), ('test_n_smaller_input_length', rand_x(
-          5, min_dim_len=5), (1, 1), (0, 1), 'backward'),
-     ('test_axis_not_default', rand_x(5), None, (1, 2),
-      'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'),
+     ('test_n_grater_input_length', rand_x(5, max_dim_len=5), (11, 11),
+      (0, 1), 'backward'),
+     ('test_n_smaller_input_length', rand_x(5, min_dim_len=5), (1, 1),
+      (0, 1), 'backward'),
+     ('test_axis_not_default', rand_x(5), None, (1, 2), 'backward'),
+     ('test_norm_forward', rand_x(5), None, None, 'forward'),
      ('test_norm_ortho', rand_x(5), None, None, 'ortho')])
 class TestIhfftn(unittest.TestCase):
+
     def test_static_ihfftn(self):
         with stgraph(paddle.fft.ihfftn, self.place, self.x, self.n, self.axis,
                      self.norm) as y:
-            np.testing.assert_allclose(
-                scipy.fft.ihfftn(self.x, self.n, self.axis, self.norm),
-                y,
-                rtol=RTOL.get(str(self.x.dtype)),
-                atol=ATOL.get(str(self.x.dtype)))
+            np.testing.assert_allclose(scipy.fft.ihfftn(self.x, self.n,
+                                                        self.axis, self.norm),
+                                       y,
+                                       rtol=RTOL.get(str(self.x.dtype)),
+                                       atol=ATOL.get(str(self.x.dtype)))
 
 
 @place(DEVICES)
-@parameterize(
-    (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
-    [('test_x_complex', rand_x(
-        4, complex=True), None, None, 'backward', TypeError),
-     ('test_n_nagative', rand_x(4), -1, None, 'backward',
-      ValueError), ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
-     ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward',
-      ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random',
-                    ValueError)])
+@parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
+    ('test_x_complex', rand_x(4,
+                              complex=True), None, None, 'backward', TypeError),
+    ('test_n_nagative', rand_x(4), -1, None, 'backward', ValueError),
+    ('test_n_zero', rand_x(4), 0, None, 'backward', ValueError),
+    ('test_axis_out_of_range', rand_x(1), None, [0, 1], 'backward', ValueError),
+    ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)
+])
 class TestIhfftnException(unittest.TestCase):
+
     def test_static_ihfftn(self):
         with self.assertRaises(self.expect_exception):
             with stgraph(paddle.fft.ihfftn, self.place, self.x, self.n,
@@ -897,6 +880,7 @@ def test_static_ihfftn(self):
      np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128'),
 ])
 class TestFftShift(unittest.TestCase):
+
     def test_fftshift(self):
         """Test fftshift with norm condition
         """
@@ -916,12 +900,13 @@ def test_fftshift(self):
 @place(DEVICES)
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes'),
-    [('test_1d', np.random.randn(10), (0, ),
-      'float64'), ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+    [('test_1d', np.random.randn(10), (0, ), 'float64'),
+     ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
      ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64'),
      ('test_2d_odd_with_all_axes',
       np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128')])
 class TestIfftShift(unittest.TestCase):
+
     def test_ifftshift(self):
         """Test ifftshift with norm condition
         """
diff --git a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
index a84092e36f6a8..ba4092965920b 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,6 +26,7 @@
 import paddle.static as static
 from numpy.random import random as rand
 from paddle.fluid import Program, program_guard
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -72,22 +73,22 @@ def class_name(cls, num, params_dict):
     return "{}_{}{}".format(cls.__name__, num, suffix and "_" + suffix)
 
 
-@parameterize((TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward'), [
-    ('test_axes_is_sqe_type', (np.random.random(
-        (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128),
-     [0, 1], 'forward', True), ('test_axis_not_last', (np.random.random(
-         (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128),
-                                (0, 1), "backward", False),
-    ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random(
-        (12, 14))).astype(np.complex128), (0, ), "forward",
-     False), ('test_norm_backward', (np.random.random(
-         (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128),
-              (0, ), "backward", True), ('test_norm_ortho', (np.random.random(
-                  (12, 14)) + 1j * np.random.random(
-                      (12, 14))).astype(np.complex128), (1, ), "ortho", True)
-])
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward'),
+    [('test_axes_is_sqe_type', (np.random.random(
+        (12, 14)) + 1j * np.random.random(
+            (12, 14))).astype(np.complex128), [0, 1], 'forward', True),
+     ('test_axis_not_last', (np.random.random(
+         (4, 4, 4)) + 1j * np.random.random(
+             (4, 4, 4))).astype(np.complex128), (0, 1), "backward", False),
+     ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random(
+         (12, 14))).astype(np.complex128), (0, ), "forward", False),
+     ('test_norm_backward', (np.random.random((12, 14)) + 1j * np.random.random(
+         (12, 14))).astype(np.complex128), (0, ), "backward", True),
+     ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random(
+         (12, 14))).astype(np.complex128), (1, ), "ortho", True)])
 class TestFFTC2COp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check. 
+    # Because framwork not support complex numerial gradient, we skip gradient check.
     no_need_check_grad = True
 
     def setUp(self):
@@ -110,19 +111,19 @@ def test_check_output(self):
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'last_dim_size'),
     [('test_axes_is_sqe_type', (np.random.random(
-        (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128),
-      [0, 1], 'forward', True, 26), ('test_axis_not_last', (np.random.random(
-          (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128),
-                                     (0, 1), "backward", False, None),
+        (12, 14)) + 1j * np.random.random(
+            (12, 14))).astype(np.complex128), [0, 1], 'forward', True, 26),
+     ('test_axis_not_last', (np.random.random(
+         (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128),
+      (0, 1), "backward", False, None),
      ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (0, ), "forward", False, 22),
      ('test_norm_backward', (np.random.random((12, 14)) + 1j * np.random.random(
-         (12, 14))).astype(np.complex128), (0, ), "backward", True,
-      22), ('test_norm_ortho', (np.random.random(
-          (12, 14)) + 1j * np.random.random((12, 14))).astype(np.complex128),
-            (1, ), "ortho", True, 26)])
+         (12, 14))).astype(np.complex128), (0, ), "backward", True, 22),
+     ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random(
+         (12, 14))).astype(np.complex128), (1, ), "ortho", True, 26)])
 class TestFFTC2ROp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check. 
+    # Because framwork not support complex numerial gradient, we skip gradient check.
     no_need_check_grad = True
 
     def setUp(self):
@@ -147,17 +148,17 @@ def test_check_output(self):
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'onesided'),
     [('test_axes_is_sqe_type', np.random.randn(12, 14).astype(np.float64),
-      (0, 1), 'forward', True,
-      True), ('test_axis_not_last', np.random.randn(4, 4, 4).astype(np.float64),
-              (0, 1), "backward", False, True),
-     ('test_norm_forward', np.random.randn(12, 14).astype(np.float64), (0, 1),
-      "forward", False, False),
-     ('test_norm_backward', np.random.randn(12, 14).astype(np.float64), (0, ),
-      "backward", True, False), ('test_norm_ortho',
-                                 np.random.randn(12, 14).astype(np.float64),
-                                 (1, ), "ortho", True, False)])
+      (0, 1), 'forward', True, True),
+     ('test_axis_not_last', np.random.randn(4, 4, 4).astype(np.float64),
+      (0, 1), "backward", False, True),
+     ('test_norm_forward', np.random.randn(12, 14).astype(np.float64),
+      (0, 1), "forward", False, False),
+     ('test_norm_backward', np.random.randn(12, 14).astype(np.float64),
+      (0, ), "backward", True, False),
+     ('test_norm_ortho', np.random.randn(12, 14).astype(np.float64),
+      (1, ), "ortho", True, False)])
 class TestFFTR2COp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check. 
+    # Because framwork not support complex numerial gradient, we skip gradient check.
     no_need_check_grad = True
 
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py
index 2b4ae3d60dd78..c6a39bd6d0418 100644
--- a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py
+++ b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py
@@ -42,24 +42,21 @@ def net(batch_size=4, lr=0.01):
     dnn_input_dim, lr_input_dim = int(2), int(2)
 
     with fluid.device_guard("cpu"):
-        dnn_data = fluid.layers.data(
-            name="dnn_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        lr_data = fluid.layers.data(
-            name="lr_data",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        label = fluid.layers.data(
-            name="click",
-            shape=[-1, 1],
-            dtype="float32",
-            lod_level=0,
-            append_batch_size=False)
+        dnn_data = fluid.layers.data(name="dnn_data",
+                                     shape=[-1, 1],
+                                     dtype="int64",
+                                     lod_level=1,
+                                     append_batch_size=False)
+        lr_data = fluid.layers.data(name="lr_data",
+                                    shape=[-1, 1],
+                                    dtype="int64",
+                                    lod_level=1,
+                                    append_batch_size=False)
+        label = fluid.layers.data(name="click",
+                                  shape=[-1, 1],
+                                  dtype="float32",
+                                  lod_level=0,
+                                  append_batch_size=False)
 
         datas = [dnn_data, lr_data, label]
 
@@ -73,8 +70,8 @@ def net(batch_size=4, lr=0.01):
                 name="deep_embedding",
                 initializer=fluid.initializer.Constant(value=0.01)),
             is_sparse=True)
-        dnn_pool = fluid.layers.sequence_pool(
-            input=dnn_embedding, pool_type="sum")
+        dnn_pool = fluid.layers.sequence_pool(input=dnn_embedding,
+                                              pool_type="sum")
         dnn_out = dnn_pool
 
         # build lr model
diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
index d3a396f6baf12..fe79bae75f530 100755
--- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
+++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
@@ -22,6 +22,7 @@
 
 
 class TestFleetMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
@@ -37,8 +38,8 @@ def debug_program(self, main_prog, startup_prog):
         main_prog_op_types = [op.type for op in main_prog_ops]
         startup_prog_op_types = [op.type for op in startup_prog_ops]
 
-        print("=== debug program and ops in func [{}] ==="
-              .format(inspect.stack()[1].function))
+        print("=== debug program and ops in func [{}] ===".format(
+            inspect.stack()[1].function))
         print(main_prog)
         print(main_prog_op_types)
         print(startup_prog)
@@ -49,10 +50,12 @@ def net(self, main_prog, startup_prog):
             with fluid.unique_name.guard():
                 role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                 fleet.init(role)
-                input_x = paddle.fluid.layers.data(
-                    name="x", shape=[32], dtype='float32')
-                input_y = paddle.fluid.layers.data(
-                    name="y", shape=[1], dtype='int64')
+                input_x = paddle.fluid.layers.data(name="x",
+                                                   shape=[32],
+                                                   dtype='float32')
+                input_y = paddle.fluid.layers.data(name="y",
+                                                   shape=[1],
+                                                   dtype='int64')
 
                 fc_1 = paddle.fluid.layers.fc(input=input_x,
                                               size=64,
@@ -61,14 +64,15 @@ def net(self, main_prog, startup_prog):
                 prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                     size=2,
                                                     act='softmax')
-                cost = paddle.fluid.layers.cross_entropy(
-                    input=prediction, label=input_y)
+                cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                         label=input_y)
                 avg_cost = paddle.fluid.layers.mean(x=cost)
 
                 strategy = paddle.distributed.fleet.DistributedStrategy()
         return avg_cost, strategy
 
     def pp_net(self, main_prog, startup_prog, pp_degree=2):
+
         def fc_block(input_x):
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
@@ -80,10 +84,12 @@ def fc_block(input_x):
                 role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                 fleet.init(role)
                 with fluid.device_guard("gpu:0"):
-                    input_x = paddle.fluid.layers.data(
-                        name="x", shape=[32], dtype='float32')
-                    input_y = paddle.fluid.layers.data(
-                        name="y", shape=[1], dtype='int64')
+                    input_x = paddle.fluid.layers.data(name="x",
+                                                       shape=[32],
+                                                       dtype='float32')
+                    input_y = paddle.fluid.layers.data(name="y",
+                                                       shape=[1],
+                                                       dtype='int64')
 
                 for stage_idx in range(pp_degree):
                     with fluid.device_guard("gpu:" + str(stage_idx)):
@@ -93,8 +99,8 @@ def fc_block(input_x):
                     prediction = paddle.fluid.layers.fc(input=[input_x],
                                                         size=2,
                                                         act='softmax')
-                    cost = paddle.fluid.layers.cross_entropy(
-                        input=prediction, label=input_y)
+                    cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                             label=input_y)
                     avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -136,12 +142,11 @@ def optimizer(self,
                         regularization=regularization,
                         grad_clip=grad_clip)
                 elif name == 'adamw':
-                    optimizer = paddle.optimizer.AdamW(
-                        learning_rate=0.01,
-                        weight_decay=0.01,
-                        grad_clip=grad_clip)
-                optimizer = fleet.distributed_optimizer(
-                    optimizer, strategy=strategy)
+                    optimizer = paddle.optimizer.AdamW(learning_rate=0.01,
+                                                       weight_decay=0.01,
+                                                       grad_clip=grad_clip)
+                optimizer = fleet.distributed_optimizer(optimizer,
+                                                        strategy=strategy)
                 optimizer.minimize(loss)
 
     def set_strategy(self, strategy, name):
diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py
index defbffe8f2020..be1fa92f0888e 100644
--- a/python/paddle/fluid/tests/unittests/gradient_checker.py
+++ b/python/paddle/fluid/tests/unittests/gradient_checker.py
@@ -97,8 +97,8 @@ def make_jacobian(x, y_size, np_dtype):
         return np.zeros((_product(x.shape), y_size), dtype=np_dtype)
     elif isinstance(x, Sequence):
         jacobians = list(
-            filter(lambda t: t is not None, (make_jacobian(
-                item, y_size, np_dtype) for item in x)))
+            filter(lambda t: t is not None,
+                   (make_jacobian(item, y_size, np_dtype) for item in x)))
         return jacobians
     else:
         None
@@ -186,8 +186,10 @@ def _compute_analytical_jacobian(program, x, y, place, scope):
 
     np_type = dtype_to_np_dtype(y.dtype)
     # create dy Variable in Program
-    dy = program.global_block().create_var(
-        name=dy_name, shape=y.shape, dtype=np_type, persistable=True)
+    dy = program.global_block().create_var(name=dy_name,
+                                           shape=y.shape,
+                                           dtype=np_type,
+                                           persistable=True)
     # append backward
     dx = fluid.gradients(y, x, dy)
 
@@ -217,8 +219,8 @@ def _compute_analytical_jacobian(program, x, y, place, scope):
             if dx_res[j] is not None:
                 jacobian[dx_idx][:, i] = dx_res[j].flatten()
             else:
-                jacobian[dx_idx][:, i] = np.zeros(
-                    dx[dx_idx].shape, dtype=np_type).flatten()
+                jacobian[dx_idx][:, i] = np.zeros(dx[dx_idx].shape,
+                                                  dtype=np_type).flatten()
 
         _set_item(dy_t, i, 0, np_type)
 
@@ -313,8 +315,8 @@ def fail_test(msg):
         analytical.append(
             _compute_analytical_jacobian(prog, clone_x, clone_y, place, scope))
 
-    for i, (x_idx,
-            y_idx) in enumerate(product(*[range(len(x)), range(len(y))])):
+    for i, (x_idx, y_idx) in enumerate(
+            product(*[range(len(x)), range(len(y))])):
         a = analytical[y_idx][x_idx]
         n = numerical[x_idx][y_idx]
         if not np.allclose(a, n, rtol, atol):
@@ -373,8 +375,10 @@ def double_grad_check(x,
         for yi in y:
             dyi_name = _append_grad_suffix_(yi.name)
             np_type = dtype_to_np_dtype(yi.dtype)
-            dy = program.global_block().create_var(
-                name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
+            dy = program.global_block().create_var(name=dyi_name,
+                                                   shape=yi.shape,
+                                                   dtype=np_type,
+                                                   persistable=True)
             dy.stop_gradient = False
             v = np.random.random(size=yi.shape).astype(np_type)
             set_var_in_scope(scope, place, dyi_name, v)
@@ -398,7 +402,7 @@ def double_grad_check(x,
     grad_check(x, target_grads, x_init, place, program, eps, atol, rtol)
 
 
-# TODO(jiabin): We currently support only triple grad check here, extend this to support 
+# TODO(jiabin): We currently support only triple grad check here, extend this to support
 # higher order differenciation later.
 
 
@@ -452,8 +456,10 @@ def triple_grad_check(x,
         for yi in y:
             dyi_name = _append_grad_suffix_(yi.name)
             np_type = dtype_to_np_dtype(yi.dtype)
-            dy = program.global_block().create_var(
-                name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
+            dy = program.global_block().create_var(name=dyi_name,
+                                                   shape=yi.shape,
+                                                   dtype=np_type,
+                                                   persistable=True)
             dy.stop_gradient = False
             v = np.random.random(size=yi.shape).astype(np_type)
             set_var_in_scope(scope, place, dyi_name, v)
@@ -475,11 +481,10 @@ def triple_grad_check(x,
         for dxi in target_grads:
             ddxi_name = _append_grad_suffix_(dxi.name)
             np_type = dtype_to_np_dtype(dxi.dtype)
-            ddx = program.global_block().create_var(
-                name=ddxi_name,
-                shape=dxi.shape,
-                dtype=np_type,
-                persistable=True)
+            ddx = program.global_block().create_var(name=ddxi_name,
+                                                    shape=dxi.shape,
+                                                    dtype=np_type,
+                                                    persistable=True)
             ddx.stop_gradient = False
             v = np.random.random(size=dxi.shape).astype(np_type)
             set_var_in_scope(scope, place, ddxi_name, v)
@@ -507,15 +512,14 @@ def triple_grad_check(x,
     x_init += x_grads_grads_init
 
     # x <=> [x, dout, ddx]
-    grad_check(
-        x=x,
-        y=filted_target_grads_grads,
-        x_init=x_init,
-        place=place,
-        program=program,
-        eps=eps,
-        atol=atol,
-        rtol=rtol)
+    grad_check(x=x,
+               y=filted_target_grads_grads,
+               x_init=x_init,
+               place=place,
+               program=program,
+               eps=eps,
+               atol=atol,
+               rtol=rtol)
 
 
 def get_static_double_grad(x,
@@ -547,8 +551,10 @@ def get_static_double_grad(x,
         yi = y[i]
         dyi_name = _append_grad_suffix_(yi.name)
         np_type = dtype_to_np_dtype(yi.dtype)
-        dy = program.global_block().create_var(
-            name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
+        dy = program.global_block().create_var(name=dyi_name,
+                                               shape=yi.shape,
+                                               dtype=np_type,
+                                               persistable=True)
         dy.stop_gradient = False
         set_var_in_scope(scope, place, dyi_name, dy_init[i])
         y_grads.append(dy)
@@ -599,8 +605,10 @@ def get_static_double_grad(x,
         np_type = dtype_to_np_dtype(yi.dtype)
         dy_name = _append_grad_suffix_(yi.name)
         # create dy Variable in Program
-        dy = program.global_block().create_var(
-            name=dy_name, shape=yi.shape, dtype=np_type, persistable=True)
+        dy = program.global_block().create_var(name=dy_name,
+                                               shape=yi.shape,
+                                               dtype=np_type,
+                                               persistable=True)
         # init dy tensor in scope
         value = np.ones(yi.shape, dtype=np_type)
         dy_t = set_var_in_scope(scope, place, dy_name, value)
@@ -656,12 +664,11 @@ def get_eager_double_grad(func,
         dys.append(dy_tensor)
     # calculate first derivative
     outputs = func(inputs)
-    d_inputs = paddle.grad(
-        outputs=outputs,
-        inputs=inputs,
-        grad_outputs=dys,
-        create_graph=True,
-        allow_unused=True)
+    d_inputs = paddle.grad(outputs=outputs,
+                           inputs=inputs,
+                           grad_outputs=dys,
+                           create_graph=True,
+                           allow_unused=True)
     d_inputs = [d_input for d_input in d_inputs if d_input is not None]
 
     # calcluate second derivative
@@ -678,12 +685,11 @@ def get_eager_double_grad(func,
         ddy.stop_gradient = False
         ddys.append(ddy)
 
-    dd_inputs = paddle.grad(
-        outputs=d_inputs,
-        inputs=inputs,
-        grad_outputs=ddys,
-        create_graph=create_graph,
-        allow_unused=True)
+    dd_inputs = paddle.grad(outputs=d_inputs,
+                            inputs=inputs,
+                            grad_outputs=ddys,
+                            create_graph=create_graph,
+                            allow_unused=True)
 
     if return_mid_result:
         return dd_inputs, inputs + ddys
@@ -790,8 +796,10 @@ def get_static_triple_grad(x,
         yi = y[i]
         dyi_name = _append_grad_suffix_(yi.name)
         np_type = dtype_to_np_dtype(yi.dtype)
-        dy = program.global_block().create_var(
-            name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
+        dy = program.global_block().create_var(name=dyi_name,
+                                               shape=yi.shape,
+                                               dtype=np_type,
+                                               persistable=True)
         dy.stop_gradient = False
         set_var_in_scope(scope, place, dyi_name, dy_init[i])
         y_grads.append(dy)
@@ -811,8 +819,12 @@ def get_static_triple_grad(x,
         value = np.ones(dxi.shape, dtype=np_type)
         x_grads_grads_init.append(value)
 
-    return get_static_double_grad(
-        x, y, x_init, dy_init=x_grads_grads_init, place=place, program=program)
+    return get_static_double_grad(x,
+                                  y,
+                                  x_init,
+                                  dy_init=x_grads_grads_init,
+                                  place=place,
+                                  program=program)
 
 
 def get_eager_triple_grad(func,
@@ -832,8 +844,11 @@ def get_eager_triple_grad(func,
     Returns:
         A list of numpy array that stores second derivative result calulated by dygraph
     """
-    dd_y, dd_x = get_eager_double_grad(
-        func, x_init, dy_init, place, return_mid_result=True)
+    dd_y, dd_x = get_eager_double_grad(func,
+                                       x_init,
+                                       dy_init,
+                                       place,
+                                       return_mid_result=True)
 
     # calcluate third derivative
     dddys = []
diff --git a/python/paddle/fluid/tests/unittests/hccl_tools.py b/python/paddle/fluid/tests/unittests/hccl_tools.py
index e3628ee5a4e9b..ab35b36161b79 100644
--- a/python/paddle/fluid/tests/unittests/hccl_tools.py
+++ b/python/paddle/fluid/tests/unittests/hccl_tools.py
@@ -1,13 +1,13 @@
 # -*- coding:UTF-8 -*-
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -56,17 +56,17 @@ def parse_args():
         "--device_num",
         type=str,
         default="[0,8)",
-        help="The number of the Ascend accelerators used. please note that the Ascend accelerators"
+        help=
+        "The number of the Ascend accelerators used. please note that the Ascend accelerators"
         "used must be continuous, such [0,4) means to use four chips "
         "0,1,2,3; [0,1) means to use chip 0; The first four chips are"
         "a group, and the last four chips are a group. In addition to"
         "the [0,8) chips are allowed, other cross-group such as [3,6)"
         "are prohibited.")
-    parser.add_argument(
-        "--visible_devices",
-        type=str,
-        default="0,1,2,3,4,5,6,7",
-        help="will use the visible devices sequentially")
+    parser.add_argument("--visible_devices",
+                        type=str,
+                        default="0,1,2,3,4,5,6,7",
+                        help="will use the visible devices sequentially")
     parser.add_argument("--server_ip", type=str, default="", help="server ip")
     args = parser.parse_args()
     return args
@@ -121,8 +121,8 @@ def main():
                 pass
             else:
                 raise ValueError(
-                    "device num {} must be in the same group of [0,4] or [4,8] !".
-                    format(args.device_num))
+                    "device num {} must be in the same group of [0,4] or [4,8] !"
+                    .format(args.device_num))
 
     device_num_list = list(range(first_num, last_num))
     print("device_num_list:", device_num_list)
@@ -162,8 +162,11 @@ def main():
 
     # save hccn_table to file
     table_path = os.getcwd()
-    table_fn = os.path.join(table_path, 'hccl_{}p_{}_{}.json'.format(
-        len(device_num_list), "".join(map(str, device_num_list)), server_id))
+    table_fn = os.path.join(
+        table_path,
+        'hccl_{}p_{}_{}.json'.format(len(device_num_list),
+                                     "".join(map(str,
+                                                 device_num_list)), server_id))
     with open(table_fn, 'w') as table_fp:
         json.dump(hccn_table, table_fp, indent=4)
     sys.stdout.flush()
diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
index 69ccc7088b834..2dfa86f976644 100644
--- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
+++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
@@ -25,6 +25,7 @@
 
 
 class FSTestBase(unittest.TestCase):
+
     def _test_dirs(self, fs):
         dir_path = os.path.abspath("./test_dir")
         fs.delete(dir_path)
@@ -220,11 +221,10 @@ def _test_rm(self, fs):
             pass
 
     def _test_list_dir(self, fs):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=15 * 1000,
-            sleep_inter=100)
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
+                        None,
+                        time_out=15 * 1000,
+                        sleep_inter=100)
         fs.ls_dir("test_not_exists")
 
     def _test_touch(self, fs):
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_communicate_group.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_communicate_group.py
index 53d0f95a23667..f290705c312e0 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_communicate_group.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_communicate_group.py
@@ -19,6 +19,7 @@
 
 
 class TestNewGroupAPI(object):
+
     def __init__(self):
         paddle.distributed.init_parallel_env()
         topo = fleet.CommunicateTopology(["data", "model", "sharding", "pipe"],
@@ -49,27 +50,30 @@ def test_all(self):
 
         tmp = np.array([0, 0, 0])
         result = paddle.to_tensor(tmp)
-        paddle.distributed.scatter(
-            result, [self.tensor2, self.tensor1],
-            src=dp_src_rank,
-            group=dp_gp,
-            use_calc_stream=True)
+        paddle.distributed.scatter(result, [self.tensor2, self.tensor1],
+                                   src=dp_src_rank,
+                                   group=dp_gp,
+                                   use_calc_stream=True)
         if dp_rank == 0:
             assert np.array_equal(result, self.tensor2)
         elif dp_rank == 1:
             assert np.array_equal(result, self.tensor1)
         print("test scatter api ok")
 
-        paddle.distributed.broadcast(
-            result, src=1, group=dp_gp, use_calc_stream=True)
+        paddle.distributed.broadcast(result,
+                                     src=1,
+                                     group=dp_gp,
+                                     use_calc_stream=True)
         assert np.array_equal(result, self.tensor1)
         print("test broadcast api ok")
 
-        paddle.distributed.reduce(
-            result, dst=dp_src_rank, group=dp_gp, use_calc_stream=True)
+        paddle.distributed.reduce(result,
+                                  dst=dp_src_rank,
+                                  group=dp_gp,
+                                  use_calc_stream=True)
         if dp_rank == 0:
-            assert np.array_equal(result,
-                                  paddle.add(self.tensor1, self.tensor1))
+            assert np.array_equal(result, paddle.add(self.tensor1,
+                                                     self.tensor1))
         elif dp_rank == 1:
             assert np.array_equal(result, self.tensor1)
         print("test reduce api ok")
@@ -85,8 +89,10 @@ def test_all(self):
         print("test wait api ok")
 
         result = []
-        paddle.distributed.all_gather(
-            result, self.tensor1, group=dp_gp, use_calc_stream=True)
+        paddle.distributed.all_gather(result,
+                                      self.tensor1,
+                                      group=dp_gp,
+                                      use_calc_stream=True)
         assert np.array_equal(result[0], self.tensor1)
         assert np.array_equal(result[1], self.tensor1)
         print("test all_gather api ok")
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_inference_helper.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_inference_helper.py
index 949d537586f65..830b8ccecbec7 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_inference_helper.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_inference_helper.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,6 +27,7 @@
 import paddle.distributed.fleet as fleet
 from paddle import framework
 from paddle.distributed.fleet.utils.hybrid_parallel_inference import HybridParallelInferenceHelper
+
 paddle.enable_static()
 
 
@@ -46,6 +47,7 @@ def numpy_while(x, w1=1.0, w2=2.0, max_len=2):
 
 
 class TestHybridParallelInferenceHelperClass(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         fleet.init(is_collective=True, strategy=strategy)
@@ -64,31 +66,29 @@ def test_hybrid_parallel_inference_helper_mp1pp2(self):
 
         with paddle.static.program_guard(main_program, startup_program):
             with paddle.fluid.device_guard(f'{device}:0'):
-                X = paddle.static.data(
-                    name='X', shape=[None, 2], dtype='float32')
+                X = paddle.static.data(name='X',
+                                       shape=[None, 2],
+                                       dtype='float32')
 
             with paddle.fluid.device_guard(f'{device}:all'):
-                max_len = layers.fill_constant(
-                    shape=[1],
-                    dtype="int64",
-                    value=2,
-                    force_cpu=False,
-                    name="n")
-                step_idx = layers.fill_constant(
-                    shape=[1],
-                    dtype="int64",
-                    value=0,
-                    force_cpu=False,
-                    name="i")
+                max_len = layers.fill_constant(shape=[1],
+                                               dtype="int64",
+                                               value=2,
+                                               force_cpu=False,
+                                               name="n")
+                step_idx = layers.fill_constant(shape=[1],
+                                                dtype="int64",
+                                                value=0,
+                                                force_cpu=False,
+                                                name="i")
 
                 data = layers.array_write(X, step_idx)
 
-                cond_int = layers.fill_constant(
-                    shape=[1],
-                    dtype="int64",
-                    value=0,
-                    force_cpu=False,
-                    name="cond_int")
+                cond_int = layers.fill_constant(shape=[1],
+                                                dtype="int64",
+                                                value=0,
+                                                force_cpu=False,
+                                                name="cond_int")
                 cond = layers.less_than(x=step_idx, y=max_len)
                 while_op = layers.While(cond, is_test=True)
 
@@ -101,21 +101,19 @@ def test_hybrid_parallel_inference_helper_mp1pp2(self):
                 with paddle.fluid.device_guard(f'{device}:0'):
                     param_attr = paddle.ParamAttr(
                         initializer=paddle.nn.initializer.Constant(1.0))
-                    weight1 = paddle.static.create_parameter(
-                        shape=[2, 5],
-                        dtype='float32',
-                        attr=param_attr,
-                        is_bias=False)
+                    weight1 = paddle.static.create_parameter(shape=[2, 5],
+                                                             dtype='float32',
+                                                             attr=param_attr,
+                                                             is_bias=False)
                     hidden1 = paddle.matmul(input, weight1)
 
                 with paddle.fluid.device_guard(f'{device}:1'):
                     param_attr = paddle.ParamAttr(
                         initializer=paddle.nn.initializer.Constant(2.0))
-                    weight2 = paddle.static.create_parameter(
-                        shape=[5, 2],
-                        dtype='float32',
-                        attr=param_attr,
-                        is_bias=False)
+                    weight2 = paddle.static.create_parameter(shape=[5, 2],
+                                                             dtype='float32',
+                                                             attr=param_attr,
+                                                             is_bias=False)
                     hidden2 = paddle.matmul(hidden1, weight2)
 
                     layers.array_write(hidden2, i=step_idx, array=data)
@@ -142,16 +140,17 @@ def test_hybrid_parallel_inference_helper_mp1pp2(self):
             micro_batch_size=2,
             num_mp=1,
             num_pp=2,
-            init_comm=nranks > 1, )
-        helper.gen_infer_program(
-            ['array_write_0.out'], ['cond_int.tmp_0'], debug=True)
+            init_comm=nranks > 1,
+        )
+        helper.gen_infer_program(['array_write_0.out'], ['cond_int.tmp_0'],
+                                 debug=True)
 
         exe = paddle.static.Executor(paddle.CUDAPlace(dev_id))
         exe.run(startup_program)
 
         for step in range(2):
-            init_data = np.random.uniform(
-                low=0.0, high=1.0, size=[2, 2]).astype('float32')
+            init_data = np.random.uniform(low=0.0, high=1.0,
+                                          size=[2, 2]).astype('float32')
             [res] = exe.run(main_program,
                             feed={"X": init_data},
                             fetch_list=[out])
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py
index 4c966585d5f1f..e36bc5a22116f 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py
@@ -23,16 +23,21 @@
 
 
 class TestMPClipGrad(TestDistMPTraning):
+
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
-        scheduler = paddle.optimizer.lr.ExponentialDecay(
-            learning_rate=0.001, gamma=0.999, verbose=True)
+        scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.001,
+                                                         gamma=0.999,
+                                                         verbose=True)
         optimizer = paddle.optimizer.SGD(scheduler,
                                          grad_clip=grad_clip,
                                          parameters=[{
-                                             'params': model.parameters(),
-                                             'weight_decay': 0.001,
-                                             'learning_rate': 0.1
+                                             'params':
+                                             model.parameters(),
+                                             'weight_decay':
+                                             0.001,
+                                             'learning_rate':
+                                             0.1
                                          }])
         return optimizer
 
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py
index ad95aceaa2cf9..a3fb8774d78c0 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py
@@ -26,10 +26,12 @@
 
 
 class TestMPClipGrad(TestDistMPTraning):
+
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
-        scheduler = paddle.optimizer.lr.ExponentialDecay(
-            learning_rate=0.001, gamma=0.999, verbose=True)
+        scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.001,
+                                                         gamma=0.999,
+                                                         verbose=True)
         optimizer = paddle.optimizer.SGD(scheduler,
                                          grad_clip=grad_clip,
                                          parameters=model.parameters())
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py
index 3e5eedbec9aea..449a1f18f7848 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py
@@ -23,19 +23,20 @@
 
 
 class TestMPFP16(TestDistMPTraning):
+
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
-        scheduler = paddle.optimizer.lr.ExponentialDecay(
-            learning_rate=0.001, gamma=0.999, verbose=True)
+        scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.001,
+                                                         gamma=0.999,
+                                                         verbose=True)
         optimizer = paddle.optimizer.SGD(scheduler,
                                          grad_clip=grad_clip,
                                          parameters=model.parameters())
 
-        model, optimizer = paddle.amp.decorate(
-            models=model,
-            optimizers=optimizer,
-            level='O2',
-            save_dtype='float32')
+        model, optimizer = paddle.amp.decorate(models=model,
+                                               optimizers=optimizer,
+                                               level='O2',
+                                               save_dtype='float32')
 
         return optimizer
 
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
index 9ae9c14db3fcf..e9567ae80c0ac 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
@@ -35,6 +35,7 @@ def set_random_seed(seed):
 
 
 class ColumnLinearNet(fluid.dygraph.Layer):
+
     def __init__(self, input_size, output_size, global_dtype):
         super(ColumnLinearNet, self).__init__()
         self.parallel_linear = fleet.meta_parallel.ColumnParallelLinear(
@@ -51,6 +52,7 @@ def forward(self, x):
 
 
 class RowLinearNet(fluid.dygraph.Layer):
+
     def __init__(self, input_size, output_size):
         super(RowLinearNet, self).__init__()
         self.parallel_linear = fleet.meta_parallel.RowParallelLinear(
@@ -66,10 +68,11 @@ def forward(self, x):
 
 
 class EmbeddingNet(fluid.dygraph.Layer):
+
     def __init__(self, vocab_size, hidden_size):
         super(EmbeddingNet, self).__init__()
-        self.embedding = fleet.meta_parallel.VocabParallelEmbedding(vocab_size,
-                                                                    hidden_size)
+        self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
+            vocab_size, hidden_size)
 
     def forward(self, x):
         output = self.embedding(x)
@@ -77,6 +80,7 @@ def forward(self, x):
 
 
 class SimpleMatmul(fluid.dygraph.Layer):
+
     def __init__(self, weight, output_size, global_dtype):
         super(SimpleMatmul, self).__init__()
         self.weight = paddle.create_parameter(
@@ -96,6 +100,7 @@ def forward(self, x):
 
 
 class SimpleEmbedding(fluid.dygraph.Layer):
+
     def __init__(self, vocab_size, hidden_size, weight):
         super(SimpleEmbedding, self).__init__()
         self.embedding = paddle.nn.Embedding(
@@ -111,6 +116,7 @@ def forward(self, x):
 
 
 class TestDistTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -211,8 +217,9 @@ def test_row_parallel_layer(self):
             optimizer_a.step()
             optimizer_b.step()
 
-            np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=5e-6)
+            np.testing.assert_allclose(loss_a.numpy(),
+                                       loss_b.numpy(),
+                                       rtol=5e-6)
 
     def test_parallel_embedding(self):
         batch_size = 17
@@ -301,8 +308,9 @@ def test_parallel_cross_entropy(self):
             check_group = dist.new_group(list(range(self.model_parallel_size)))
             integral_data = []
             partial_data = data.clone().detach()
-            paddle.distributed.all_gather(
-                integral_data, partial_data, group=check_group)
+            paddle.distributed.all_gather(integral_data,
+                                          partial_data,
+                                          group=check_group)
             integral_data = paddle.concat(integral_data, axis=-1)
             integral_data = integral_data.detach().clone()
             integral_data.stop_gradient = False
@@ -311,20 +319,23 @@ def test_parallel_cross_entropy(self):
             loss_b = model_b(integral_data, label).sum() / batch_size
             print("loss_a: ", loss_a.numpy(), "loss_b: ", loss_b.numpy())
 
-            np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=1e-6)
+            np.testing.assert_allclose(loss_a.numpy(),
+                                       loss_b.numpy(),
+                                       rtol=1e-6)
 
             loss_a.backward()
             loss_b.backward()
 
             integral_grad = []
             partial_grad = data.grad.clone().detach()
-            paddle.distributed.all_gather(
-                integral_grad, partial_grad, group=check_group)
+            paddle.distributed.all_gather(integral_grad,
+                                          partial_grad,
+                                          group=check_group)
             integral_grad = paddle.concat(integral_grad, axis=-1)
 
-            np.testing.assert_allclose(
-                integral_data.grad.numpy(), integral_grad.numpy(), rtol=1e-6)
+            np.testing.assert_allclose(integral_data.grad.numpy(),
+                                       integral_grad.numpy(),
+                                       rtol=1e-6)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
index f9ec49d88172a..2e8acc7f6d02e 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
@@ -63,6 +63,7 @@ def parallel_matmul(lm_output, logit_weights, parallel_output):
 
 
 class SimpleMPNet(fluid.dygraph.Layer):
+
     def __init__(self, vocab_size, hidden_size, inner_size, output_size, np_fc1,
                  np_fc2, mp_id):
         super(SimpleMPNet, self).__init__()
@@ -113,6 +114,7 @@ def forward(self, x):
 
 
 class SimpleDPNet(fluid.dygraph.Layer):
+
     def __init__(self, vocab_size, hidden_size, inner_size, output_size, np_fc1,
                  np_fc2):
 
@@ -156,6 +158,7 @@ def forward(self, x):
 
 
 class TestDistMPTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -210,13 +213,15 @@ def test_mp_model(self):
         for _ in range(5):
             np_data = np.random.randint(0, vocab_size, (
                 batch_size,
-                seq_length, ))
+                seq_length,
+            ))
             batch = paddle.to_tensor(np_data)
             loss_a = self.train_batch(batch, model_a, optimizer_a, True)
             loss_b = self.train_batch(batch, model_b, optimizer_b, False)
 
-            np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=1e-6)
+            np.testing.assert_allclose(loss_a.numpy(),
+                                       loss_b.numpy(),
+                                       rtol=1e-6)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_random.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_random.py
index 59d24066946aa..32e9fc708980e 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_random.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_random.py
@@ -26,6 +26,7 @@
 
 
 class TestDistTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py
index 71e873b0e2f7c..62747b5e70a5a 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py
@@ -37,6 +37,7 @@ def set_random_seed(seed, dp_id, rank_id):
 
 
 class TestDistPPTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -54,8 +55,9 @@ def setUp(self):
         fleet.init(is_collective=True, strategy=strategy)
 
     def build_optimizer(self, model):
-        scheduler = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                       values=[0.001, 0.002],
+                                                       verbose=True)
         optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                          parameters=model.parameters())
         return scheduler, optimizer
@@ -88,14 +90,15 @@ def test_pp_model(self):
             param.set_value(parameters[idx + pp_id * (param_len // 2)])
 
         # construct reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         for step_id, data in enumerate(train_reader()):
             x_data = np.array([x[0] for x in data]).astype('float32').reshape(
                 batch_size, 1, 28, 28)
-            y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                batch_size, 1)
+            y_data = np.array([x[1] for x in data
+                               ]).astype('int64').reshape(batch_size, 1)
             img = paddle.to_tensor(x_data)
             label = paddle.to_tensor(y_data)
             img.stop_gradient = True
@@ -113,8 +116,9 @@ def test_pp_model(self):
             loss_b = model_b.train_batch([img, label], optimizer_b, scheduler_b)
 
             print("loss: ", loss_a.numpy(), loss_b.numpy())
-            np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=5e-5)
+            np.testing.assert_allclose(loss_a.numpy(),
+                                       loss_b.numpy(),
+                                       rtol=5e-5)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py
index 608bdd7a35d3f..824dd234b704c 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py
@@ -37,6 +37,7 @@ def set_random_seed(seed, dp_id, rank_id):
 
 
 class TestDistPPTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -65,8 +66,9 @@ def test_pp_model(self):
 
         #construct model a
         model_a = AlexNet(10)
-        scheduler_a = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler_a = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                         values=[0.001, 0.002],
+                                                         verbose=True)
         optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a,
                                            grad_clip=grad_clip,
                                            parameters=model_a.parameters())
@@ -80,8 +82,9 @@ def test_pp_model(self):
 
         # construct model b
         model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size)
-        scheduler_b = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler_b = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                         values=[0.001, 0.002],
+                                                         verbose=True)
         optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b,
                                            grad_clip=grad_clip,
                                            parameters=model_b.parameters())
@@ -94,14 +97,15 @@ def test_pp_model(self):
             param.set_value(parameters[idx + pp_id * (param_len // 2)])
 
         # construct reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         for step_id, data in enumerate(train_reader()):
             x_data = np.array([x[0] for x in data]).astype('float32').reshape(
                 batch_size, 1, 28, 28)
-            y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                batch_size, 1)
+            y_data = np.array([x[1] for x in data
+                               ]).astype('int64').reshape(batch_size, 1)
             img = paddle.to_tensor(x_data)
             label = paddle.to_tensor(y_data)
             img.stop_gradient = True
@@ -119,12 +123,15 @@ def test_pp_model(self):
             scheduler_a.step()
 
             with paddle.amp.auto_cast():
-                loss_b = model_b.train_batch(
-                    [img, label], optimizer_b, scheduler_b, scaler=scaler_b)
+                loss_b = model_b.train_batch([img, label],
+                                             optimizer_b,
+                                             scheduler_b,
+                                             scaler=scaler_b)
 
             print("loss: ", loss_a.numpy(), loss_b.numpy())
-            np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=5e-5)
+            np.testing.assert_allclose(loss_a.numpy(),
+                                       loss_b.numpy(),
+                                       rtol=5e-5)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
index 430c6e0884822..38d2bfabef7d9 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
@@ -21,10 +21,12 @@
 
 
 class TestPPClipGrad(TestDistPPTraning):
+
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
-        scheduler = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                       values=[0.001, 0.002],
+                                                       verbose=True)
         optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                          grad_clip=grad_clip,
                                          parameters=model.parameters())
@@ -32,16 +34,18 @@ def build_optimizer(self, model):
 
 
 class TestPPClipGradParamGroup(TestDistPPTraning):
+
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
-        scheduler = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=scheduler,
-            grad_clip=grad_clip,
-            parameters=[{
-                "params": model.parameters()
-            }])
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                       values=[0.001, 0.002],
+                                                       verbose=True)
+        optimizer = paddle.optimizer.Momentum(learning_rate=scheduler,
+                                              grad_clip=grad_clip,
+                                              parameters=[{
+                                                  "params":
+                                                  model.parameters()
+                                              }])
         return scheduler, optimizer
 
 
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py
index d2be0cb80722b..9c077cb70fde8 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_embedding.py
@@ -43,26 +43,29 @@ def set_random_seed(seed, dp_id, rank_id):
 
 
 class SimpleNet(Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
 
         self.softmax_weight = self.create_parameter(
             shape=[hidden_size, vocab_size])
-        self.softmax_bias = self.create_parameter(
-            shape=[vocab_size], is_bias=False)
+        self.softmax_bias = self.create_parameter(shape=[vocab_size],
+                                                  is_bias=False)
 
     def forward(self, x1, x2, y1):
         x_emb = self.word_embeddings(x1)
         fc = fluid.layers.matmul(x_emb, self.softmax_weight)
         fc = fluid.layers.elementwise_add(fc, self.softmax_bias)
         projection = fluid.layers.reshape(fc, shape=[-1, vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=y1, soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=y1,
+                                                       soft_label=False)
         return loss.mean()
 
 
 class EmbeddingNet(Layer):
+
     def __init__(self):
         super(EmbeddingNet, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
@@ -74,6 +77,7 @@ def forward(self, args):
 
 
 class MatmulNet(Layer):
+
     def __init__(self):
         super(MatmulNet, self).__init__()
         self.softmax_weight = self.create_parameter(
@@ -87,6 +91,7 @@ def forward(self, args):
 
 
 class BiasNet(Layer):
+
     def __init__(self):
         super(BiasNet, self).__init__()
         self.softmax_bias = self.create_parameter(shape=[vocab_size])
@@ -99,17 +104,20 @@ def forward(self, args):
 
 
 class LossNet(Layer):
+
     def __init__(self):
         super(LossNet, self).__init__()
 
     def forward(self, args, y1):
         projection, x2 = args
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=y1[0], soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=y1[0],
+                                                       soft_label=False)
         return loss.mean()
 
 
 class SimpleNetPipe(Layer):
+
     def __init__(self):
         super(SimpleNetPipe, self).__init__()
         self.features = Sequential(EmbeddingNet(), MatmulNet(), BiasNet())
@@ -120,6 +128,7 @@ def to_layers(self):
 
 
 class TestDistEmbeddingTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -152,10 +161,9 @@ def test_pp_model(self):
                                            parameters=model_a.parameters())
 
         init_net = SimpleNetPipe()
-        model_b = PipelineLayer(
-            layers=init_net.to_layers(),
-            num_stages=self.pipeline_parallel_size,
-            loss_fn=LossNet())
+        model_b = PipelineLayer(layers=init_net.to_layers(),
+                                num_stages=self.pipeline_parallel_size,
+                                loss_fn=LossNet())
 
         scheduler_b = paddle.optimizer.lr.PiecewiseDecay(
             boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True)
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py
index 4893960345ea7..5ad1573d2c144 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py
@@ -37,6 +37,7 @@ def set_random_seed(seed, dp_id, rank_id):
 
 
 class TestDistPPTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -65,8 +66,9 @@ def test_pp_model(self):
 
         #construct model a
         model_a = AlexNet(10)
-        scheduler_a = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler_a = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                         values=[0.001, 0.002],
+                                                         verbose=True)
         optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a,
                                            grad_clip=grad_clip,
                                            parameters=model_a.parameters())
@@ -75,8 +77,9 @@ def test_pp_model(self):
 
         # construct model b
         model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size)
-        scheduler_b = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler_b = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                         values=[0.001, 0.002],
+                                                         verbose=True)
         optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b,
                                            grad_clip=grad_clip,
                                            parameters=model_b.parameters())
@@ -89,16 +92,14 @@ def test_pp_model(self):
         for idx, param in enumerate(model_b.parameters()):
             param.set_value(parameters[idx + pp_id * (param_len // 2)])
 
-        model_a, optimizer_a = paddle.amp.decorate(
-            models=model_a,
-            optimizers=optimizer_a,
-            level='O2',
-            save_dtype='float32')
-        model_b, optimizer_b = paddle.amp.decorate(
-            models=model_b,
-            optimizers=optimizer_b,
-            level='O2',
-            save_dtype='float32')
+        model_a, optimizer_a = paddle.amp.decorate(models=model_a,
+                                                   optimizers=optimizer_a,
+                                                   level='O2',
+                                                   save_dtype='float32')
+        model_b, optimizer_b = paddle.amp.decorate(models=model_b,
+                                                   optimizers=optimizer_b,
+                                                   level='O2',
+                                                   save_dtype='float32')
 
         model_b = fleet.distributed_model(model_b)
         optimizer_b = fleet.distributed_optimizer(optimizer_b)
@@ -106,14 +107,15 @@ def test_pp_model(self):
         scaler_b = fleet.distributed_scaler(scaler_b)
 
         # construct reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         for step_id, data in enumerate(train_reader()):
             x_data = np.array([x[0] for x in data]).astype('float32').reshape(
                 batch_size, 1, 28, 28)
-            y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                batch_size, 1)
+            y_data = np.array([x[1] for x in data
+                               ]).astype('int64').reshape(batch_size, 1)
             img = paddle.to_tensor(x_data)
             label = paddle.to_tensor(y_data)
             img.stop_gradient = True
@@ -130,12 +132,15 @@ def test_pp_model(self):
             scheduler_a.step()
 
             with paddle.amp.auto_cast(enable=True, level='O2'):
-                loss_b = model_b.train_batch(
-                    [img, label], optimizer_b, scheduler_b, scaler=scaler_b)
+                loss_b = model_b.train_batch([img, label],
+                                             optimizer_b,
+                                             scheduler_b,
+                                             scaler=scaler_b)
 
             print("loss: ", loss_a.numpy(), loss_b.numpy())
-            np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=5e-3)
+            np.testing.assert_allclose(loss_a.numpy(),
+                                       loss_b.numpy(),
+                                       rtol=5e-3)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py
index b30df0e9a2f21..c1609c975e683 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py
@@ -25,6 +25,7 @@
 
 
 class ReshapeHelp(Layer):
+
     def __init__(self, shape):
         super(ReshapeHelp, self).__init__()
         self.shape = shape
@@ -34,30 +35,24 @@ def forward(self, x):
 
 
 class AlexNet(Layer):
+
     def __init__(self, num_classes=10):
         super(AlexNet, self).__init__()
         self.features = Sequential(
-            nn.Conv2D(
-                1, 64, kernel_size=11, stride=4, padding=5),
+            nn.Conv2D(1, 64, kernel_size=11, stride=4, padding=5),
             nn.ReLU(),
-            nn.MaxPool2D(
-                kernel_size=2, stride=2),
-            nn.Conv2D(
-                64, 192, kernel_size=5, padding=2),
+            nn.MaxPool2D(kernel_size=2, stride=2),
+            nn.Conv2D(64, 192, kernel_size=5, padding=2),
             nn.ReLU(),
-            nn.MaxPool2D(
-                kernel_size=2, stride=2),
-            nn.Conv2D(
-                192, 384, kernel_size=3, padding=1),
+            nn.MaxPool2D(kernel_size=2, stride=2),
+            nn.Conv2D(192, 384, kernel_size=3, padding=1),
             nn.ReLU(),
-            nn.Conv2D(
-                384, 256, kernel_size=3, padding=1),
+            nn.Conv2D(384, 256, kernel_size=3, padding=1),
             nn.ReLU(),
-            nn.Conv2D(
-                256, 256, kernel_size=3, padding=1),
+            nn.Conv2D(256, 256, kernel_size=3, padding=1),
             nn.ReLU(),
-            nn.MaxPool2D(
-                kernel_size=2, stride=2), )
+            nn.MaxPool2D(kernel_size=2, stride=2),
+        )
 
         self.reshape_layer = ReshapeHelp(shape=[-1, 256])
         self.classifier = nn.Linear(256, num_classes)
@@ -71,6 +66,7 @@ def forward(self, x, y):
 
 
 class AlexNetPipe(AlexNet):
+
     def to_layers(self):
         feat = [self.features[i] for i in range(len(self.features))]
         loss_fn = [self.reshape_layer, self.classifier]
@@ -79,39 +75,33 @@ def to_layers(self):
 
 
 class AlexNetPipeDesc(PipelineLayer):
+
     def __init__(self, num_classes=10, **kwargs):
         self.num_classes = num_classes
         decs = [
-            LayerDesc(
-                nn.Conv2D, 1, 64, kernel_size=11, stride=4, padding=5),
+            LayerDesc(nn.Conv2D, 1, 64, kernel_size=11, stride=4, padding=5),
             LayerDesc(nn.ReLU),
-            LayerDesc(
-                nn.MaxPool2D, kernel_size=2, stride=2),
-            LayerDesc(
-                nn.Conv2D, 64, 192, kernel_size=5, padding=2),
+            LayerDesc(nn.MaxPool2D, kernel_size=2, stride=2),
+            LayerDesc(nn.Conv2D, 64, 192, kernel_size=5, padding=2),
             F.relu,
-            LayerDesc(
-                nn.MaxPool2D, kernel_size=2, stride=2),
-            LayerDesc(
-                nn.Conv2D, 192, 384, kernel_size=3, padding=1),
+            LayerDesc(nn.MaxPool2D, kernel_size=2, stride=2),
+            LayerDesc(nn.Conv2D, 192, 384, kernel_size=3, padding=1),
             F.relu,
-            LayerDesc(
-                nn.Conv2D, 384, 256, kernel_size=3, padding=1),
+            LayerDesc(nn.Conv2D, 384, 256, kernel_size=3, padding=1),
             F.relu,
-            LayerDesc(
-                nn.Conv2D, 256, 256, kernel_size=3, padding=1),
+            LayerDesc(nn.Conv2D, 256, 256, kernel_size=3, padding=1),
             F.relu,
-            LayerDesc(
-                nn.MaxPool2D, kernel_size=2, stride=2),
-            LayerDesc(
-                ReshapeHelp, shape=[-1, 256]),
+            LayerDesc(nn.MaxPool2D, kernel_size=2, stride=2),
+            LayerDesc(ReshapeHelp, shape=[-1, 256]),
             LayerDesc(nn.Linear, 256, self.num_classes),  # classifier
         ]
-        super(AlexNetPipeDesc, self).__init__(
-            layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
+        super(AlexNetPipeDesc, self).__init__(layers=decs,
+                                              loss_fn=nn.CrossEntropyLoss(),
+                                              **kwargs)
 
 
 class TestPipeLayerAPI(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.pipeline_parallel_size = 2
@@ -129,10 +119,9 @@ def test_pipelayer_desc(self):
 
     def test_pipelayer_sequential(self):
         init_net = AlexNetPipe()
-        pipe_model = PipelineLayer(
-            layers=init_net.to_layers(),
-            num_stages=self.pipeline_parallel_size,
-            loss_fn=nn.CrossEntropyLoss())
+        pipe_model = PipelineLayer(layers=init_net.to_layers(),
+                                   num_stages=self.pipeline_parallel_size,
+                                   loss_fn=nn.CrossEntropyLoss())
         stage_id = self.hcg.get_stage_id()
         init_parameters = init_net.parameters()
         pipe_parameters = pipe_model.parameters()
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py
index ebcac70a3b68a..8e364290bae67 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py
@@ -45,6 +45,7 @@ def set_random_seed(seed, dp_id, rank_id):
 
 
 class EmbeddingNet(Layer):
+
     def __init__(self):
         super(EmbeddingNet, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
@@ -58,6 +59,7 @@ def forward(self, x):
 
 
 class TransformerNet(Layer):
+
     def __init__(self):
         super(TransformerNet, self).__init__()
         self.linear1 = nn.Linear(d_model, dim_feedforward)
@@ -87,17 +89,20 @@ def forward(self, x):
 
 
 class EmbeddingPipe(EmbeddingNet):
+
     def forward(self, x):
         return super().forward(x)
 
 
 class TransformerNetPipe(TransformerNet):
+
     def forward(self, x):
         output = super().forward(x)
         return output
 
 
 class CriterionPipe(Layer):
+
     def __init__(self):
         super(CriterionPipe, self).__init__()
 
@@ -107,6 +112,7 @@ def forward(self, out, label):
 
 
 class ModelPipe(PipelineLayer):
+
     def __init__(self, topology):
         self.descs = []
         self.descs.append(LayerDesc(EmbeddingPipe))
@@ -114,17 +120,17 @@ def __init__(self, topology):
         for x in range(2):
             self.descs.append(LayerDesc(TransformerNetPipe))
 
-        super().__init__(
-            layers=self.descs,
-            loss_fn=CriterionPipe(),
-            topology=topology,
-            seg_method="layer:TransformerNetPipe",
-            recompute_interval=1,
-            recompute_partition=False,
-            recompute_offload=False)
+        super().__init__(layers=self.descs,
+                         loss_fn=CriterionPipe(),
+                         topology=topology,
+                         seg_method="layer:TransformerNetPipe",
+                         recompute_interval=1,
+                         recompute_partition=False,
+                         recompute_offload=False)
 
 
 class TestDistPPTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -151,8 +157,9 @@ def test_pp_model(self):
         set_random_seed(1024, dp_id, rank_id)
 
         model = ModelPipe(topology)
-        scheduler = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                       values=[0.001, 0.002],
+                                                       verbose=True)
         optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                          parameters=model.parameters())
 
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py
index e6e27bbb41a8a..8521ae8b35bb1 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py
@@ -33,6 +33,7 @@
 
 
 class TestDistPPSaveLoadTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -59,8 +60,9 @@ def test_pp_model(self):
         set_random_seed(1024, dp_id, rank_id)
 
         model = ModelPipe(topology)
-        scheduler = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                       values=[0.001, 0.002],
+                                                       verbose=True)
         optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                          parameters=model.parameters())
 
@@ -81,8 +83,9 @@ def test_pp_model(self):
 
         # construct data
         test_steps = 5
-        np_data = np.random.randint(
-            0, vocab_size, size=[test_steps, batch_size, length])
+        np_data = np.random.randint(0,
+                                    vocab_size,
+                                    size=[test_steps, batch_size, length])
 
         origin_loss = []
         for step_id in range(5):
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
index c4c1e565068b2..ffe4a063a9ccf 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
@@ -45,15 +45,15 @@ def set_random_seed(seed, dp_id, rank_id):
 
 
 class EmbeddingNet(Layer):
+
     def __init__(self):
         super(EmbeddingNet, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
         self.position_embeddings = nn.Embedding(vocab_size, hidden_size)
 
     def forward(self, x):
-        attention_mask = paddle.tensor.triu(
-            (paddle.ones(
-                (length, length), dtype="float32") * -1e9), 1)
+        attention_mask = paddle.tensor.triu((paddle.ones(
+            (length, length), dtype="float32") * -1e9), 1)
 
         no_used = paddle.ones((3, 3), dtype="int32")
 
@@ -68,6 +68,7 @@ def forward(self, x):
 
 
 class TransformerNet(Layer):
+
     def __init__(self):
         super(TransformerNet, self).__init__()
         self.linear1 = nn.Linear(d_model, dim_feedforward)
@@ -98,11 +99,13 @@ def forward(self, x, mask):
 
 
 class EmbeddingPipe(EmbeddingNet):
+
     def forward(self, x):
         return super().forward(x)
 
 
 class TransformerNetPipe(TransformerNet):
+
     def forward(self, args):
         x, mask, no_used, p_emb = args[0], args[1], args[2], args[3]
 
@@ -113,6 +116,7 @@ def forward(self, args):
 
 
 class CriterionPipe(Layer):
+
     def __init__(self):
         super(CriterionPipe, self).__init__()
 
@@ -122,6 +126,7 @@ def forward(self, out, label):
 
 
 class ModelPipe(PipelineLayer):
+
     def __init__(self, topology):
         self.descs = []
         self.descs.append(LayerDesc(EmbeddingPipe))
@@ -131,14 +136,14 @@ def __init__(self, topology):
 
         self.descs.append(lambda x: x[0])
 
-        super().__init__(
-            layers=self.descs,
-            loss_fn=CriterionPipe(),
-            topology=topology,
-            seg_method="layer:TransformerNetPipe")
+        super().__init__(layers=self.descs,
+                         loss_fn=CriterionPipe(),
+                         topology=topology,
+                         seg_method="layer:TransformerNetPipe")
 
 
 class TestDistPPTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -165,8 +170,9 @@ def test_pp_model(self):
         set_random_seed(1024, dp_id, rank_id)
 
         model = ModelPipe(topology)
-        scheduler = paddle.optimizer.lr.PiecewiseDecay(
-            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
+                                                       values=[0.001, 0.002],
+                                                       verbose=True)
         optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                          parameters=model.parameters())
 
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py
index 8cb1166cd0d83..63bdcbc4d87a7 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py
@@ -57,6 +57,7 @@ def parallel_matmul(lm_output, logit_weights, parallel_output):
 
 
 class SimpleMPNet(fluid.dygraph.Layer):
+
     def __init__(self, vocab_size, hidden_size, inner_size, output_size, np_fc1,
                  np_fc2, mp_id):
         super(SimpleMPNet, self).__init__()
@@ -107,6 +108,7 @@ def forward(self, x):
 
 
 class SimpleDPNet(fluid.dygraph.Layer):
+
     def __init__(self, vocab_size, hidden_size, inner_size, output_size, np_fc1,
                  np_fc2):
 
@@ -150,6 +152,7 @@ def forward(self, x):
 
 
 class TestDistMPTraning(unittest.TestCase):
+
     def setUp(self):
         random.seed(2021)
         np.random.seed(2021)
@@ -166,7 +169,8 @@ def setUp(self):
         self.data = [
             np.random.randint(0, vocab_size, (
                 batch_size,
-                seq_length, )) for _ in range(STEPS)
+                seq_length,
+            )) for _ in range(STEPS)
         ]
 
     def train_batch(self, batch, model, optimizer):
@@ -228,21 +232,19 @@ def build_model_optimizer(self, Optimizer="adam"):
 
         model_a = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                               np_fc1, np_fc2)
-        optimizer_a = self.build_optimizer(
-            model_a,
-            strategy=self.strategy,
-            is_sharding=True,
-            Optimizer=Optimizer)
+        optimizer_a = self.build_optimizer(model_a,
+                                           strategy=self.strategy,
+                                           is_sharding=True,
+                                           Optimizer=Optimizer)
         model_a = fleet.distributed_model(model_a)
         optimizer_a = fleet.distributed_optimizer(optimizer_a)
 
         model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                               np_fc1, np_fc2)
-        optimizer_b = self.build_optimizer(
-            model_b,
-            strategy=self.strategy,
-            is_sharding=False,
-            Optimizer=Optimizer)
+        optimizer_b = self.build_optimizer(model_b,
+                                           strategy=self.strategy,
+                                           is_sharding=False,
+                                           Optimizer=Optimizer)
 
         return model_a, optimizer_a, model_b, optimizer_b
 
@@ -257,8 +259,8 @@ def sharding_model(self, Optimizer, sharded_accumulators):
 
             if idx == 2 and paddle.distributed.get_rank() == 0:
                 self.assertTrue(
-                    set(optimizer_a._inner_opt._inner_optimizer.state_dict()
-                        .keys()) == sharded_accumulators)
+                    set(optimizer_a._inner_opt._inner_optimizer.state_dict().
+                        keys()) == sharded_accumulators)
 
             if paddle.distributed.get_rank() == 0:
                 batch_sharding = paddle.to_tensor(self.data[idx][:2])
@@ -270,10 +272,9 @@ def sharding_model(self, Optimizer, sharded_accumulators):
             loss_b = self.train_batch(batch_single, model_b, optimizer_b)
 
             for j in range(len(model_a.parameters())):
-                np.testing.assert_allclose(
-                    model_a.parameters()[j].numpy(),
-                    model_b.parameters()[j].numpy(),
-                    rtol=1e-6)
+                np.testing.assert_allclose(model_a.parameters()[j].numpy(),
+                                           model_b.parameters()[j].numpy(),
+                                           rtol=1e-6)
 
     def test_sharding_adam(self):
         sharded_accumulators = set([
@@ -286,16 +287,16 @@ def test_sharding_adam(self):
             'linear_0.w_0_beta2_pow_acc_0', 'linear_1.b_0_beta2_pow_acc_0',
             'linear_2.b_0_beta2_pow_acc_0', 'embedding_0.w_0_beta2_pow_acc_0'
         ])
-        self.sharding_model(
-            Optimizer="adam", sharded_accumulators=sharded_accumulators)
+        self.sharding_model(Optimizer="adam",
+                            sharded_accumulators=sharded_accumulators)
 
     def test_sharding_momentum(self):
         sharded_accumulators = set([
             'linear_6.w_0_velocity_0', 'linear_7.b_0_velocity_0',
             'linear_8.b_0_velocity_0', 'embedding_2.w_0_velocity_0'
         ])
-        self.sharding_model(
-            Optimizer="Momentum", sharded_accumulators=sharded_accumulators)
+        self.sharding_model(Optimizer="Momentum",
+                            sharded_accumulators=sharded_accumulators)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py
index 9253f737bf942..20bdb9f9d6888 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_shared_weight.py
@@ -48,14 +48,15 @@ def set_random_seed(seed, dp_id, rank_id):
 
 
 class SimpleNet(Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
 
         self.softmax_weight = self.create_parameter(
             shape=[hidden_size, vocab_size])
-        self.softmax_bias = self.create_parameter(
-            shape=[vocab_size], is_bias=False)
+        self.softmax_bias = self.create_parameter(shape=[vocab_size],
+                                                  is_bias=False)
 
     def forward(self, x1, x2, y1):
         x_emb = self.word_embeddings(x1)
@@ -65,12 +66,14 @@ def forward(self, x1, x2, y1):
 
         projection = paddle.matmul(projection, self.word_embeddings.weight)
 
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=y1, soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=y1,
+                                                       soft_label=False)
         return loss.mean()
 
 
 class EmbeddingPipe(Layer):
+
     def __init__(self):
         super(EmbeddingPipe, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
@@ -86,6 +89,7 @@ def forward(self, args):
 
 
 class MatmulNet(Layer):
+
     def __init__(self):
         super(MatmulNet, self).__init__()
         self.softmax_weight = self.create_parameter(
@@ -99,6 +103,7 @@ def forward(self, args):
 
 
 class BiasNet(Layer):
+
     def __init__(self):
         super(BiasNet, self).__init__()
         self.softmax_bias = self.create_parameter(shape=[vocab_size])
@@ -111,22 +116,26 @@ def forward(self, args):
 
 
 class LossNet(Layer):
+
     def __init__(self):
         super(LossNet, self).__init__()
 
     def forward(self, args, y1):
         projection = args
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=y1[0], soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=y1[0],
+                                                       soft_label=False)
         return loss.mean()
 
 
 class SimpleNetPipe(PipelineLayer):
+
     def __init__(self, **kwargs):
         self.descs = []
         self.descs.append(
-            SharedLayerDesc(
-                'embed', EmbeddingPipe, shared_weight_attr='embedding_weight'))
+            SharedLayerDesc('embed',
+                            EmbeddingPipe,
+                            shared_weight_attr='embedding_weight'))
         self.descs.append(LayerDesc(MatmulNet))
 
         self.descs.append(LayerDesc(BiasNet))
@@ -135,17 +144,18 @@ def _logits_helper(embedding, output):
             return paddle.matmul(output[0], embedding.embedding_weight)
 
         self.descs.append(
-            SharedLayerDesc(
-                'embed',
-                EmbeddingPipe,
-                forward_func=_logits_helper,
-                shared_weight_attr='embedding_weight'))
+            SharedLayerDesc('embed',
+                            EmbeddingPipe,
+                            forward_func=_logits_helper,
+                            shared_weight_attr='embedding_weight'))
 
-        super(SimpleNetPipe, self).__init__(
-            layers=self.descs, loss_fn=LossNet(), **kwargs)
+        super(SimpleNetPipe, self).__init__(layers=self.descs,
+                                            loss_fn=LossNet(),
+                                            **kwargs)
 
 
 class TestDistEmbeddingTraning(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/python/paddle/fluid/tests/unittests/init_process_group.py b/python/paddle/fluid/tests/unittests/init_process_group.py
index 17887a9d767c1..c1131d101de92 100644
--- a/python/paddle/fluid/tests/unittests/init_process_group.py
+++ b/python/paddle/fluid/tests/unittests/init_process_group.py
@@ -30,6 +30,7 @@
 
 
 class TestProcessGroupFp32(unittest.TestCase):
+
     def setUp(self):
         self.config()
 
diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt
index 09cc6ed5b5fce..976a36b761568 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt
@@ -1,9 +1,46 @@
-file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_INTERP_CASES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}")
 
 foreach(target ${TEST_INTERP_CASES})
-  py_test_modules(${target} MODULES ${target} ENVS FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0)
-  py_test_modules(${target}_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001) 
-  py_test_modules(${target}_fast_gc MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0)
-  py_test_modules(${target}_fast_gc_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001)
+  py_test_modules(
+    ${target}
+    MODULES
+    ${target}
+    ENVS
+    FLAGS_host_trace_level=10
+    FLAGS_static_executor_perfstat_filepath=./perfstat
+    FLAGS_allocator_strategy=auto_growth
+    FLAGS_use_stream_safe_cuda_allocator=true
+    FLAGS_fast_eager_deletion_mode=false
+    FLAGS_eager_delete_tensor_gb=0)
+  py_test_modules(
+    ${target}_non_eager_deletion
+    MODULES
+    ${target}
+    ENVS
+    FLAGS_allocator_strategy=auto_growth
+    FLAGS_use_stream_safe_cuda_allocator=true
+    FLAGS_fast_eager_deletion_mode=false
+    FLAGS_eager_delete_tensor_gb=0.000001)
+  py_test_modules(
+    ${target}_fast_gc
+    MODULES
+    ${target}
+    ENVS
+    FLAGS_allocator_strategy=auto_growth
+    FLAGS_use_stream_safe_cuda_allocator=true
+    FLAGS_fast_eager_deletion_mode=true
+    FLAGS_eager_delete_tensor_gb=0)
+  py_test_modules(
+    ${target}_fast_gc_non_eager_deletion
+    MODULES
+    ${target}
+    ENVS
+    FLAGS_allocator_strategy=auto_growth
+    FLAGS_use_stream_safe_cuda_allocator=true
+    FLAGS_fast_eager_deletion_mode=true
+    FLAGS_eager_delete_tensor_gb=0.000001)
 endforeach()
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py
index 7c1497a48535e..eeddcaa5bb534 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -31,9 +31,10 @@
 #  and new executor twice and check the result.
 #  please override the _get_feeds() and build_prgram()
 class TestCompatibility(unittest.TestCase):
+
     def setUp(self):
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         self.iter_run = 4
 
     def _get_feed(self):
@@ -42,15 +43,20 @@ def _get_feed(self):
         return None
 
     def build_program(self):
+
         def true_func():
-            return layers.fill_constant(
-                shape=[1, 2], dtype='int32', value=1), layers.fill_constant(
-                    shape=[2, 3], dtype='bool', value=True)
+            return layers.fill_constant(shape=[1, 2], dtype='int32',
+                                        value=1), layers.fill_constant(
+                                            shape=[2, 3],
+                                            dtype='bool',
+                                            value=True)
 
         def false_func():
-            return layers.fill_constant(
-                shape=[3, 4], dtype='float32', value=3), layers.fill_constant(
-                    shape=[4, 5], dtype='int64', value=2)
+            return layers.fill_constant(shape=[3, 4], dtype='float32',
+                                        value=3), layers.fill_constant(
+                                            shape=[4, 5],
+                                            dtype='int64',
+                                            value=2)
 
         main_program = Program()
         startup_program = Program()
@@ -101,12 +107,14 @@ def test_with_feed(self):
 
 
 class TestWhile(TestCompatibility):
+
     def _get_feed(self):
         """ return the feeds
         """
         return None
 
     def build_program(self):
+
         def cond(i, ten):
             return i < ten
 
@@ -117,10 +125,10 @@ def body(i, ten):
         main_program = paddle.static.default_main_program()
         startup_program = paddle.static.default_startup_program()
         with paddle.static.program_guard(main_program, startup_program):
-            i = paddle.full(
-                shape=[1], fill_value=0, dtype='int64')  # loop counter
-            ten = paddle.full(
-                shape=[1], fill_value=10, dtype='int64')  # loop length
+            i = paddle.full(shape=[1], fill_value=0,
+                            dtype='int64')  # loop counter
+            ten = paddle.full(shape=[1], fill_value=10,
+                              dtype='int64')  # loop length
             i, ten = paddle.static.nn.while_loop(cond, body, [i, ten])
 
             exe = paddle.static.Executor(paddle.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
index a4dad5f53f14b..7faff7ec18193 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+
 os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true"
 import sys
 import shutil
@@ -29,9 +30,10 @@
 
 
 class LinearTestCase(unittest.TestCase):
+
     def setUp(self):
-        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         self.place = core.Place()
         self.place.set_place(place)
 
@@ -48,33 +50,27 @@ def build_program(self):
 
     def test_interp_base(self):
         startup_program, main_program, c = self.build_program()
-        standaloneexecutor = StandaloneExecutor(
-            self.place, startup_program.desc, main_program.desc, core.Scope())
-        out = standaloneexecutor.run({
-            "a": np.ones(
-                [2, 2], dtype="float32") * 2
-        }, [c.name])
+        standaloneexecutor = StandaloneExecutor(self.place,
+                                                startup_program.desc,
+                                                main_program.desc, core.Scope())
+        out = standaloneexecutor.run(
+            {"a": np.ones([2, 2], dtype="float32") * 2}, [c.name])
         for i in range(10):
-            out = standaloneexecutor.run({
-                "a": np.ones(
-                    [2, 2], dtype="float32") * i
-            }, [c.name])
+            out = standaloneexecutor.run(
+                {"a": np.ones([2, 2], dtype="float32") * i}, [c.name])
 
         for i in range(10):
-            out = standaloneexecutor.run({
-                "a": np.ones(
-                    [2, 2], dtype="float32") * i
-            }, ['a', c.name])
+            out = standaloneexecutor.run(
+                {"a": np.ones([2, 2], dtype="float32") * i}, ['a', c.name])
 
     def test_dry_run(self):
         startup_program, main_program, c = self.build_program()
-        standaloneexecutor = StandaloneExecutor(
-            self.place, startup_program.desc, main_program.desc, core.Scope())
+        standaloneexecutor = StandaloneExecutor(self.place,
+                                                startup_program.desc,
+                                                main_program.desc, core.Scope())
         # test for cost_info
-        cost_info = standaloneexecutor.dry_run({
-            "a": np.ones(
-                [2, 2], dtype="float32")
-        })
+        cost_info = standaloneexecutor.dry_run(
+            {"a": np.ones([2, 2], dtype="float32")})
         self.check_cost_info(cost_info)
 
     def check_cost_info(self, cost_info):
@@ -120,10 +116,11 @@ def build_program():
 
 
 class ExecutorStatisticsTestCase(unittest.TestCase):
+
     def setUp(self):
         self.iter_n = 3
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
 
     def test_standalone_executor_statistics(self):
         if os.getenv("FLAGS_static_executor_perfstat_filepath") is None:
@@ -221,10 +218,11 @@ def test_executor_statistics(self):
 
 
 class MultiStreamModelTestCase(unittest.TestCase):
+
     def setUp(self):
         self.iter_n = 2
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
 
     def test_result(self):
         ground_truths = self.run_raw_executor()
@@ -274,6 +272,7 @@ def run_new_executor_sequential(self):
 
 
 class SwitchExecutorInterfaceTestCase(MultiStreamModelTestCase):
+
     def run_new_executor(self):
         paddle.seed(2020)
         os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
@@ -291,9 +290,10 @@ def run_new_executor(self):
 
 
 class SwitchExecutorInterfaceWithFeed(unittest.TestCase):
+
     def setUp(self):
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         self.iter_run = 2
 
     def build_program(self, is_double=False):
@@ -325,8 +325,8 @@ def _run(self,
 
         if use_compiled:
             main_program = paddle.static.CompiledProgram(
-                main_program).with_data_parallel(
-                    fetch_vars[0].name, places=[self.place])
+                main_program).with_data_parallel(fetch_vars[0].name,
+                                                 places=[self.place])
 
         if use_str:  # test for fetch name
             fetch_vars = [x.name for x in fetch_vars]
@@ -342,11 +342,15 @@ def _run(self,
 
     def run_raw_executor(self, feed, use_compiled=False):
         # run construct program 1
-        out1 = self._run(
-            feed, use_str=False, is_double=False, use_compiled=use_compiled)
+        out1 = self._run(feed,
+                         use_str=False,
+                         is_double=False,
+                         use_compiled=use_compiled)
         # run construct program 2 with same executor
-        out2 = self._run(
-            feed, use_str=True, is_double=True, use_compiled=use_compiled)
+        out2 = self._run(feed,
+                         use_str=True,
+                         is_double=True,
+                         use_compiled=use_compiled)
 
         return [out1, out2]
 
@@ -396,6 +400,7 @@ def test_empty_program(self):
 
 
 class TestException(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CPUPlace()
         self.fetch_vars = None
@@ -407,8 +412,10 @@ def build_program(self):
             w = paddle.rand([10, 3])
             ids = paddle.static.data(name="id", shape=[5], dtype='int64')
             data = paddle.static.data(name="data", shape=[3], dtype='float32')
-            emb = paddle.nn.functional.embedding(
-                x=ids, weight=w, sparse=False, name="embedding")
+            emb = paddle.nn.functional.embedding(x=ids,
+                                                 weight=w,
+                                                 sparse=False,
+                                                 name="embedding")
             emb = emb + data
 
         return main_program, startup_program, emb
@@ -470,6 +477,7 @@ def test_scope_find_temp_var(self):
 
 
 class TestInplaceApiWithDataTransform(unittest.TestCase):
+
     def test_increment(self):
         if paddle.fluid.core.is_compiled_with_cuda():
             with paddle.fluid.device_guard("gpu:0"):
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py
index 5e298fc3dc7a6..8006c59d2ba12 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,6 +29,7 @@
 
 
 class TestMultiplyWrite(TestCompatibility):
+
     def _get_feed(self):
         """ return the feeds
         """
diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
index 4826b37512614..6b709d85d75c3 100644
--- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
@@ -1,15 +1,18 @@
 if(WITH_IPU)
-    file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-    string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+  file(
+    GLOB TEST_OPS
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "test_*.py")
+  string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-    foreach(TEST_OP ${TEST_OPS})
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-        # set all UTs timeout to 200s
-        set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200)    
-    endforeach(TEST_OP)
+  foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+    # set all UTs timeout to 200s
+    set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200)
+  endforeach(TEST_OP)
 
-    set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600)
-    set_tests_properties(test_save_load_ipu PROPERTIES TIMEOUT 600)
+  set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600)
+  set_tests_properties(test_save_load_ipu PROPERTIES TIMEOUT 600)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh
deleted file mode 100644
index 6f491ef107104..0000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/bin/bash
-  
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-
-partition_name=pod64
-vipu_server=10.137.96.62
-allclose_script="
-import sys
-import numpy as np
-data1 = np.loadtxt(\"ipu_res.txt\")
-data2 = np.loadtxt(\"cpu_res.txt\")
-if np.allclose(data1[::16], data2, atol=1e-6):
-    sys.exit(0)
-else:
-    sys.exit(1)
-"
-
-for opt in lamb sgd adam ;
-do
-    for onchip in False True ;
-    do
-        for rts in False True ;
-        do
-            echo "Testcase: opt: ${opt}, onchip: ${onchip}, rts: ${rts}"
-            echo "paddle.distributed.fleet.launch test with IPUs..."
-            python3.7 -m paddle.distributed.launch \
-            --device_num=8 \
-            ipu \
-            --hosts=localhost \
-            --nproc_per_host=2 \
-            --ipus_per_replica=2 \
-            --ipu_partition=${partition_name} \
-            --vipu_server=${vipu_server} \
-            test_dist_data_parallel_ipu.py ${opt} ipu_res.txt ${onchip} ${rts} > ipu.log
-            echo "paddle.distributed.fleet.launch test with IPUs...Done"
-
-            echo "paddle normal test with CPU..."
-            export POPLAR_IPUMODEL=1
-            python3.7 test_dist_data_parallel_ipu.py ${opt} cpu_res.txt > cpu.log
-            unset POPLAR_IPUMODEL
-            echo "paddle normal test with CPU...Done"
-
-            echo "Compare results..."
-            python3.7 -c """${allclose_script}"""
-            if [ $? -eq 0 ];then
-            echo "Compare results...Done"
-            else
-            echo "Error occurs. Please check ipu.log, cpu.log, ipu_res.txt and cpu_res.txt"
-            exit 0
-            fi
-        done
-    done
-done
-
-if [ -f "ipu.log" ]; then
-    rm "ipu.log"
-fi
-if [ -f "cpu.log" ]; then
-    rm "cpu.log"
-fi
-if [ -f "ipu_res.txt" ]; then
-    rm "ipu_res.txt"
-fi
-if [ -f "cpu_res.txt" ]; then
-    rm "cpu_res.txt"
-fi
diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py
deleted file mode 100644
index 6054f2be7579e..0000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py
+++ /dev/null
@@ -1,184 +0,0 @@
-#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import sys
-import os
-import random
-import numpy as np
-import paddle
-import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
-
-mpi_comm = None
-
-
-@unittest.skip('Disable distributed tests on auto CI.')
-class TestBase(IPUOpTest):
-    def set_attrs(self, enable_ipu, optimizer, log, onchip=False, rts=False):
-        self.ipu_options = {
-            "enable_pipelining": True,
-            "batches_per_step": 1,
-            "enable_gradient_accumulation": True,
-            "accumulation_factor": 4,
-            "enable_replicated_graphs": True,
-            "replicated_graph_count": 2,
-            "location_optimizer": {
-                "on_chip": onchip,
-                "use_replicated_tensor_sharding": rts
-            }
-        }
-
-        self.cpu_bs = 16
-        self.ipu_bs = 1
-        self.optimizer = optimizer
-        self.log = log
-        self.enable_ipu = enable_ipu
-
-    def test(self):
-        seed = 2021
-        np.random.seed(seed)
-        random.seed(seed)
-        scope = paddle.static.Scope()
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = seed
-        startup_prog.random_seed = seed
-
-        bs = self.ipu_bs if self.enable_ipu else self.cpu_bs
-        data = np.random.rand(1, 3, 10, 10).astype(np.float32)
-
-        with paddle.static.scope_guard(scope):
-            with paddle.static.program_guard(main_prog, startup_prog):
-                image = paddle.static.data(
-                    name='image', shape=[bs, 3, 10, 10], dtype='float32')
-                with paddle.static.ipu_shard_guard(index=0, stage=0):
-                    conv1 = paddle.static.nn.conv2d(
-                        image, num_filters=3, filter_size=3, bias_attr=False)
-                with paddle.static.ipu_shard_guard(index=1, stage=1):
-                    conv2 = paddle.static.nn.conv2d(
-                        conv1, num_filters=3, filter_size=3, bias_attr=False)
-                    # should consider influence of bs
-                    loss = paddle.mean(conv2)
-
-                if self.optimizer == 'sgd':
-                    opt = paddle.optimizer.SGD(learning_rate=1e-2)
-                elif self.optimizer == 'adam':
-                    opt = paddle.optimizer.Adam(learning_rate=1e-2)
-                elif self.optimizer == 'lamb':
-                    opt = paddle.optimizer.Lamb(learning_rate=1e-2)
-                else:
-                    raise Exception('optimizer must be sgd, adam or lamb')
-
-                opt.minimize(loss)
-
-                if self.enable_ipu:
-                    place = paddle.IPUPlace()
-                else:
-                    place = paddle.CPUPlace()
-                executor = paddle.static.Executor(place)
-                executor.run(startup_prog)
-
-                if self.enable_ipu:
-                    feed_list = [image.name]
-                    fetch_list = [loss.name]
-                    ipu_strategy = paddle.static.IpuStrategy()
-                    ipu_strategy.set_graph_config(
-                        num_ipus=2 * self.ipu_options['replicated_graph_count'],
-                        is_training=True,
-                        enable_manual_shard=True)
-                    ipu_strategy.set_options(self.ipu_options)
-                    ipu_strategy.set_options({
-                        "enable_distribution": True,
-                        "enable_distributed_replicated_graphs": True,
-                        "global_replica_offset":
-                        int(os.environ.get("PADDLE_TRAINER_ID")) * 2,
-                        "global_replication_factor": 4
-                    })
-                    program = paddle.static.IpuCompiledProgram(
-                        main_prog, ipu_strategy=ipu_strategy).compile(
-                            feed_list, fetch_list)
-                    feed = {
-                        "image": np.tile(data, [
-                            self.ipu_options['replicated_graph_count'] *
-                            self.ipu_options['batches_per_step'] *
-                            self.ipu_options['accumulation_factor'], 1, 1, 1
-                        ])
-                    }
-
-                else:
-                    program = main_prog
-                    feed = {"image": np.tile(data, [self.cpu_bs, 1, 1, 1])}
-
-                epoch = 10
-                if not self.enable_ipu:
-                    # global replication factor
-                    epoch *= 4
-                    epoch *= self.ipu_options['batches_per_step']
-                    epoch *= self.ipu_options['accumulation_factor']
-                    epoch = epoch / (self.cpu_bs / self.ipu_bs)
-
-                results = []
-                for i in range(int(epoch)):
-                    res = executor.run(program, feed=feed, fetch_list=[loss])
-                    if self.enable_ipu:
-                        res = mpi_comm.gather(res, root=0)
-                    results.append(res)
-                if self.enable_ipu:
-                    if int(os.environ.get("PADDLE_TRAINER_ID")) == 0:
-                        np.savetxt(self.log, np.array(results).flatten())
-                else:
-                    np.savetxt(self.log, np.array(results).flatten())
-
-
-if __name__ == "__main__":
-    paddle.enable_static()
-    # Run distributed tests    
-    if len(sys.argv) == 5:
-        from mpi4py import MPI
-
-        DISTRIBUTED_COMM = MPI.COMM_WORLD
-
-        def _get_comm():
-            global DISTRIBUTED_COMM
-            if DISTRIBUTED_COMM is None:
-                raise RuntimeError(
-                    "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first."
-                )
-            return DISTRIBUTED_COMM
-
-        mpi_comm = _get_comm()
-
-        optimizer = sys.argv[1]
-        log = sys.argv[2]
-        onchip = True if sys.argv[3] == "True" else False
-        rts = True if sys.argv[4] == "True" else False
-        test = TestBase()
-        test.set_attrs(
-            enable_ipu=True,
-            optimizer=optimizer,
-            log=log,
-            onchip=onchip,
-            rts=rts)
-        test.test()
-    # Run cpu tests for compare
-    elif len(sys.argv) == 3:
-        test = TestBase()
-        test.set_attrs(enable_ipu=False, optimizer=sys.argv[1], log=sys.argv[2])
-        test.test()
-    else:
-        raise ValueError(
-            "Only support 3 or 5 args. 3 for cpu test, 5 for ipu distributed test"
-        )
diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py
deleted file mode 100644
index 44c26d123ba39..0000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-python3.7 -m paddle.distributed.launch \
---device_num=128 \
-ipu \
---hosts=host1,host2 \
---ipus_per_host=2 \
---nproc_per_host=1 \
---ipu_partition=pod128 \
---vipu_server=lr17-1-ctrl \
-python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py
-
-Equal to:
-
-poprun \
---host=localhost,host2 \
---num-instances=2 \
---num-replicas=64 \
---ipus-per-replica=2 \
---print-topology=yes \
---vipu-partition=pod128_bert \
---vipu-server-host=lr17-1-ctrl \
---update-partition=yes \
-python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py
-'''
-
-import os
-import numpy as np
-import paddle
-
-
-def TestDistTraining():
-    paddle.enable_static()
-
-    attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'}
-
-    scope = paddle.fluid.core.Scope()
-    main_prog = paddle.static.Program()
-    startup_prog = paddle.static.Program()
-    main_prog.random_seed = 42
-    startup_prog.random_seed = 42
-
-    np.random.seed(42)
-    input_data = np.random.uniform(0, 127, size=[128, 3, 2, 1]).astype(np.int32)
-
-    with paddle.fluid.scope_guard(scope):
-        with paddle.static.program_guard(main_prog, startup_prog):
-            x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64')
-            with paddle.static.ipu_shard_guard(index=0, stage=0):
-                out = paddle.fluid.layers.embedding(x, **attrs)
-            with paddle.static.ipu_shard_guard(index=1, stage=1):
-                loss = paddle.mean(out)
-            opt = paddle.optimizer.Adam(learning_rate=1e-1)
-            opt.minimize(loss)
-
-            feed_list = ["x"]
-            fetch_list = [loss.name]
-
-            place = paddle.IPUPlace()
-            exe = paddle.static.Executor(place)
-            exe.run(startup_prog)
-
-            ipu_strategy = paddle.static.IpuStrategy()
-            ipu_strategy.set_graph_config(
-                num_ipus=64, is_training=True, enable_manual_shard=True)
-            ipu_strategy.set_pipelining_config(
-                enable_pipelining=True,
-                batches_per_step=1,
-                enable_gradient_accumulation=True,
-                accumulation_factor=4)
-            ipu_strategy.set_options({
-                "enable_distribution": True,
-                "enable_replicated_graphs": True,
-                "replicated_graph_count": 32,
-                "enable_distributed_replicated_graphs": True,
-                "global_replica_offset":
-                # Paddle : int(os.environ.get("PADDLE_TRAINER_ID")) * 32
-                # PopRun : int(os.environ.get("POPDIST_REPLICA_INDEX_OFFSET"))
-                int(os.environ.get("PADDLE_TRAINER_ID")) * 32,
-                "global_replication_factor": 64,
-                "location_optimizer": {
-                    "on_chip": False,
-                    "use_replicated_tensor_sharding": True
-                }
-            })
-
-            ipu_program = paddle.static.IpuCompiledProgram(
-                main_prog, ipu_strategy=ipu_strategy)
-            program = ipu_program.compile(feed_list, fetch_list)
-
-            for i in range(10):
-                res = exe.run(program,
-                              feed={"x": input_data},
-                              fetch_list=fetch_list)
-                print("index: {}, result: {}".format(i, res))
-
-
-if __name__ == "__main__":
-    TestDistTraining()
diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py
deleted file mode 100644
index 6ca9222d914de..0000000000000
--- a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py
+++ /dev/null
@@ -1,177 +0,0 @@
-#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Single host:
-
-python3.7 -m paddle.distributed.launch \
---device_num=4 \
-ipu \
---hosts=localhost \
---nproc_per_host=2 \
---ipus_per_replica=1 \
---ipu_partition=pod64 \
---vipu_server=10.137.96.62 \
-python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py
-
-Equal to:
-
-poprun \
---host=localhost \
---num-instances=2 \
---num-replicas=4 \
---ipus-per-replica=1 \
---print-topology=yes \
-python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py
-'''
-'''
-Multi hosts:
-
-python3.7 -m paddle.distributed.launch \
---device_num=4 \
-ipu \
---hosts=host1,host2 \
---nproc_per_host=1 \
---ipus_per_replica=1 \
---ipu_partition=pod64 \
---vipu_server=10.137.96.62 \
-python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py
-
-Equal to:
-
-poprun \
---host=host1,host2 \
---num-instances=2 \
---num-replicas=4 \
---ipus-per-replica=1 \
---print-topology=yes \
-python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py
-'''
-
-import os
-import sys
-import paddle
-import numpy as np
-
-mpi_comm = None
-
-
-def Test(use_dist, file_name):
-    paddle.enable_static()
-
-    attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'}
-
-    scope = paddle.fluid.core.Scope()
-    main_prog = paddle.static.Program()
-    startup_prog = paddle.static.Program()
-    main_prog.random_seed = 42
-    startup_prog.random_seed = 42
-
-    with paddle.fluid.scope_guard(scope):
-        with paddle.static.program_guard(main_prog, startup_prog):
-            x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64')
-
-            out = paddle.fluid.layers.embedding(x, **attrs)
-            loss = paddle.mean(out)
-            opt = paddle.optimizer.Adam(learning_rate=1e-1)
-            opt.minimize(loss)
-
-            feed_list = ["x"]
-            fetch_list = [loss.name]
-
-            place = paddle.IPUPlace()
-            exe = paddle.static.Executor(place)
-            exe.run(startup_prog)
-
-            ipu_strategy = paddle.static.IpuStrategy()
-            if use_dist:
-                ipu_strategy.set_graph_config(num_ipus=2, is_training=True)
-                # Set distributed envs
-                ipu_strategy.set_options({
-                    "enable_distribution": True,
-                    "enable_replicated_graphs": True,
-                    "replicated_graph_count": 2,
-                    "enable_distributed_replicated_graphs": True,
-                    "global_replica_offset":
-                    int(os.environ.get("PADDLE_TRAINER_ID")) * 2,
-                    "global_replication_factor": 4
-                })
-            else:
-                ipu_strategy.set_graph_config(num_ipus=4, is_training=True)
-                ipu_strategy.set_options({
-                    "enable_replicated_graphs": True,
-                    "replicated_graph_count": 4,
-                })
-
-            ipu_program = paddle.static.IpuCompiledProgram(
-                main_prog, ipu_strategy=ipu_strategy)
-            program = ipu_program.compile(feed_list, fetch_list)
-
-            if use_dist:
-                if os.environ.get("PADDLE_TRAINER_ID") == "0":
-                    input_data = np.concatenate([
-                        np.array([[[1], [3]], [[2], [4]], [[4], [127]]])
-                        .astype(np.int32), np.array(
-                            [[[1], [3]], [[2], [4]], [[4], [127]]]).astype(
-                                np.int32)
-                    ])
-                else:
-                    input_data = np.concatenate([
-                        np.array([[[8], [60]], [[50], [77]],
-                                  [[90], [13]]]).astype(np.int32),
-                        np.array([[[8], [60]], [[50], [77]],
-                                  [[90], [13]]]).astype(np.int32)
-                    ])
-            else:
-                input_data = np.concatenate([
-                    np.array([[[1], [3]], [[2], [4]], [[4], [127]]]).astype(
-                        np.int32), np.array([[[1], [3]], [[2], [4]],
-                                             [[4], [127]]]).astype(np.int32),
-                    np.array([[[8], [60]], [[50], [77]], [[90], [13]]]).astype(
-                        np.int32), np.array([[[8], [60]], [[50], [77]],
-                                             [[90], [13]]]).astype(np.int32)
-                ])
-            feed_data = {"x": input_data}
-
-            for step in range(10):
-                res = exe.run(program, feed=feed_data, fetch_list=fetch_list)
-
-            if use_dist:
-                if os.getenv("PADDLE_TRAINER_ID") == "0":
-                    res = mpi_comm.gather(res, root=0)
-                    np.savetxt(file_name, res)
-            else:
-                np.savetxt(file_name, res)
-
-
-if __name__ == "__main__":
-    file_name = sys.argv[1]
-
-    use_dist = False
-    if 'PADDLE_TRAINER_ID' in os.environ:
-        from mpi4py import MPI
-
-        DISTRIBUTED_COMM = MPI.COMM_WORLD
-
-        def _get_comm():
-            global DISTRIBUTED_COMM
-            if DISTRIBUTED_COMM is None:
-                raise RuntimeError(
-                    "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first."
-                )
-            return DISTRIBUTED_COMM
-
-        mpi_comm = _get_comm()
-        use_dist = True
-
-    Test(use_dist, file_name)
diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
index ad11083b67773..5f2a0d59bb8be 100644
--- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
@@ -55,6 +55,7 @@ class ExecutionMode(IntEnum):
 
 
 class IPUTest(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         # Get random seeds
@@ -87,6 +88,7 @@ def use_ipumodel(cls):
 
     # Decorator for static graph building
     def static_graph(builder):
+
         def wrapper(self, *args, **kwargs):
             self.scope = paddle.static.Scope()
             self.main_prog = paddle.static.Program()
@@ -116,6 +118,7 @@ def cast_model_to_fp16(cls, main_program):
 
 
 class IPUOpTest(IPUTest):
+
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
@@ -181,8 +184,9 @@ def run_op_test(self, exec_mode, ipu_strategy=None):
                 ipu_strategy.set_precision_config(enable_fp16=True)
                 IPUOpTest.cast_model_to_fp16(self.main_prog)
             program = paddle.static.IpuCompiledProgram(
-                self.main_prog, ipu_strategy=ipu_strategy).compile(
-                    self.feed_list, self.fetch_list)
+                self.main_prog,
+                ipu_strategy=ipu_strategy).compile(self.feed_list,
+                                                   self.fetch_list)
         else:
             program = self.main_prog
 
@@ -214,8 +218,10 @@ def check(self, check_shape=False, output_dict=None):
         ipu_fp32 = output_dict[ExecutionMode.IPU_FP32]
         cpu_fp32 = np.asarray(cpu_fp32).astype(np.float32).flatten()
         ipu_fp32 = np.asarray(ipu_fp32).astype(np.float32).flatten()
-        pass_check = np.allclose(
-            ipu_fp32, cpu_fp32, rtol=self.rtol, atol=self.atol)
+        pass_check = np.allclose(ipu_fp32,
+                                 cpu_fp32,
+                                 rtol=self.rtol,
+                                 atol=self.atol)
         if not pass_check:
             max_atol = np.abs(ipu_fp32 - cpu_fp32).max()
             cpu_fp32_abs = np.abs(cpu_fp32)
@@ -231,8 +237,10 @@ def check(self, check_shape=False, output_dict=None):
         if ExecutionMode.IPU_FP16 in output_dict.keys():
             ipu_fp16 = output_dict[ExecutionMode.IPU_FP16]
             ipu_fp16 = np.asarray(ipu_fp16).astype(np.float32).flatten()
-            pass_check = np.allclose(
-                ipu_fp16, cpu_fp32, rtol=self.rtol_fp16, atol=self.atol_fp16)
+            pass_check = np.allclose(ipu_fp16,
+                                     cpu_fp32,
+                                     rtol=self.rtol_fp16,
+                                     atol=self.atol_fp16)
             if not pass_check:
                 max_atol = np.abs(ipu_fp16 - cpu_fp32).max()
                 cpu_fp32_abs = np.abs(cpu_fp32)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py
index b90c3374db96e..19abf74a55683 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestRelu(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_test_op()
@@ -46,8 +47,9 @@ def set_feed_attr(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = self.op(x, **self.op_attrs)
         self.fetch_list = [out.name]
 
@@ -63,24 +65,28 @@ def test(self):
 
 
 class TestTanh(TestRelu):
+
     def set_test_op(self):
         self.op = F.tanh
         self.op_attrs = {}
 
 
 class TestLog(TestRelu):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.log
         self.op_attrs = {}
 
 
 class TestSigmoid(TestRelu):
+
     def set_test_op(self):
         self.op = F.sigmoid
         self.op_attrs = {}
 
 
 class TestSqrt(TestRelu):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.sqrt
         self.op_attrs = {}
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py
index c48ce75ccd9f3..3612656cea354 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.argmax(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -64,6 +66,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axis": 0}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py
index 1239a97f2f653..3b2034ebe836c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -40,8 +41,9 @@ def set_feed_attr(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         x = paddle.assign(x)
         out = paddle.fluid.layers.elementwise_add(x, x)
         self.fetch_list = [out.name]
@@ -58,6 +60,7 @@ def test(self):
 
 
 class TestAssignFp32Value(TestBase):
+
     def set_data_feed(self):
         data = np.random.uniform(size=[2, 3, 1])
         self.feed_fp32 = {'in_0': data.astype(np.float32)}
@@ -68,14 +71,16 @@ def set_data_feed(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         assign = paddle.assign(self.assign_fp32)
         out = paddle.fluid.layers.elementwise_add(x, assign)
         self.fetch_list = [out.name]
 
 
 class TestAssignBoolValue(TestBase):
+
     def set_data_feed(self):
         data = np.random.uniform(size=[2, 3, 1])
         self.feed_fp32 = {'in_0': data.astype(np.float32)}
@@ -85,8 +90,9 @@ def set_data_feed(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         x = paddle.less_than(x, x)
         assign = paddle.assign(self.assign_bool)
         x = paddle.logical_and(x, assign)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py
index cf494034fd86f..3f45bf485b817 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -46,16 +47,25 @@ def set_feed_attr(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        x = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
-        x = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
-        x = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
-        x = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        x = paddle.static.nn.conv2d(x,
+                                    num_filters=3,
+                                    filter_size=3,
+                                    bias_attr=False)
+        x = paddle.static.nn.conv2d(x,
+                                    num_filters=3,
+                                    filter_size=3,
+                                    bias_attr=False)
+        x = paddle.static.nn.conv2d(x,
+                                    num_filters=3,
+                                    filter_size=3,
+                                    bias_attr=False)
+        x = paddle.static.nn.conv2d(x,
+                                    num_filters=3,
+                                    filter_size=3,
+                                    bias_attr=False)
         self.fetch_list = [x.name]
 
     def run_model(self, exec_mode):
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py
index adb2abfc47418..2d2d331543930 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -53,10 +54,13 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        x = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        x = paddle.static.nn.conv2d(x,
+                                    num_filters=3,
+                                    filter_size=3,
+                                    bias_attr=False)
         x = paddle.fluid.layers.batch_norm(x, **self.attrs)
         self.fetch_list = [x.name]
 
@@ -72,6 +76,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_atol(self):
         self.atol = 1e-6
         self.rtol = 1e-6
@@ -86,6 +91,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_atol(self):
         self.atol = 1e-6
         self.rtol = 1e-6
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py
index d7b15a442957d..f361b779bb30b 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -49,10 +50,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0],
-            shape=self.feed_shape[0],
-            dtype=self.feed_dtype[0])
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype=self.feed_dtype[0])
         out = paddle.cast(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -68,6 +68,7 @@ def test(self):
 
 
 class TestEnableFp16(TestBase):
+
     @property
     def fp16_enabled(self):
         return True
@@ -86,6 +87,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         self.feed_fp32 = {
             "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'),
@@ -97,6 +99,7 @@ def set_op_attrs(self):
 
 
 class TestCase3(TestBase):
+
     def set_data_feed(self):
         self.feed_fp32 = {
             "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'),
@@ -108,6 +111,7 @@ def set_op_attrs(self):
 
 
 class TestCase4(TestBase):
+
     def set_data_feed(self):
         self.feed_fp32 = {
             "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'),
@@ -119,6 +123,7 @@ def set_op_attrs(self):
 
 
 class TestCase5(TestBase):
+
     def set_data_feed(self):
         self.feed_fp32 = {
             "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'),
@@ -130,6 +135,7 @@ def set_op_attrs(self):
 
 
 class TestCase6(TestBase):
+
     def set_data_feed(self):
         self.feed_fp32 = {
             "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'),
@@ -142,6 +148,7 @@ def set_op_attrs(self):
 
 @unittest.skip('float64 is not supported')
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {}
         self.attrs['dtype'] = 'float64'
@@ -149,6 +156,7 @@ def set_op_attrs(self):
 
 @unittest.skip('skip float16 to float32')
 class TestCase3(TestBase):
+
     def set_data_feed(self):
         self.feed_fp32 = {
             "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'),
@@ -161,14 +169,16 @@ def set_op_attrs(self):
 
 @unittest.skip('int32 to int8 is not supported')
 class TestCase4(TestBase):
+
     def set_atol(self):
         super().set_atol()
         self.atol = 1
 
     def set_data_feed(self):
         self.feed_fp32 = {
-            "x": np.random.randint(
-                low=1, high=100, size=[1, 3, 3, 3]).astype('int32'),
+            "x":
+            np.random.randint(low=1, high=100, size=[1, 3, 3,
+                                                     3]).astype('int32'),
         }
 
     def set_op_attrs(self):
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py
index a5410ab499082..d0160551b93bd 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -51,10 +52,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = paddle.fluid.layers.concat([x, y], **self.attrs)
         self.fetch_list = [out.name]
 
@@ -70,6 +73,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axis": 1}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py
index e450621b11d34..5a2485e251c96 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -54,8 +55,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         x = paddle.fluid.layers.conv2d(x, **self.attrs)
         self.fetch_list = [x.name]
 
@@ -71,54 +73,63 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['num_filters'] = 1
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['filter_size'] = [3, 3]
 
 
 class TestCase2_1(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['filter_size'] = [3, 2]
 
 
 class TestCase3(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['stride'] = [2, 3]
 
 
 class TestCase4(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['dilation'] = [2, 2]
 
 
 class TestCase5(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['groups'] = 3
 
 
 class TestCase6(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['padding'] = 2
 
 
 class TestCase7(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['padding'] = [2, 3]
 
 
 class TestCase8(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['padding'] = [1, 2, 2, 3]
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py
index d035673e219df..ffd4368c089b5 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -47,20 +48,26 @@ def set_feed_attr(self):
         self.feed_list = list(self.feed_fp32.keys())
 
     def set_op_attrs(self):
-        self.attrs = {'soft_label': False, }
+        self.attrs = {
+            'soft_label': False,
+        }
 
     @IPUOpTest.static_graph
     def build_model(self, on_ipu):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32")
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype="float32")
         if on_ipu:
-            label = paddle.static.data(
-                name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32')
+            label = paddle.static.data(name=self.feed_list[1],
+                                       shape=self.feed_shape[1],
+                                       dtype='int32')
         else:
-            label = paddle.static.data(
-                name=self.feed_list[1], shape=self.feed_shape[1], dtype='int64')
-        out = paddle.fluid.layers.cross_entropy(
-            input=x, label=label, **self.attrs)
+            label = paddle.static.data(name=self.feed_list[1],
+                                       shape=self.feed_shape[1],
+                                       dtype='int64')
+        out = paddle.fluid.layers.cross_entropy(input=x,
+                                                label=label,
+                                                **self.attrs)
         self.fetch_list = [out.name]
 
     def run_model(self, exec_mode):
@@ -77,6 +84,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             'soft_label': False,
@@ -85,6 +93,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[30, 70])
         label = np.arange(30).reshape([30, 1])
@@ -100,8 +109,11 @@ def set_data_feed(self):
 
 @unittest.skip("soft_label=True is not supported")
 class TestCase3(TestBase):
+
     def set_op_attrs(self):
-        self.attrs = {'soft_label': True, }
+        self.attrs = {
+            'soft_label': True,
+        }
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py
index a0a145fb72b35..75cd3c92322ab 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -50,8 +51,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32")
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype="float32")
         out = paddle.fluid.layers.cumsum(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -67,16 +69,19 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"exclusive": True, "reverse": False}
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"exclusive": False, "reverse": True}
 
 
 class TestCase3(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"exclusive": True, "reverse": True}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
index 4e3b03ffca068..be96762549dd4 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -48,8 +49,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         x = paddle.fluid.layers.dropout(x, **self.attrs)
         out = paddle.fluid.layers.elementwise_add(x, x)
         self.fetch_list = [out.name]
@@ -66,6 +68,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "dropout_prob": 0.5,
@@ -75,6 +78,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "dropout_prob": 0.0,
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py
index 24082fe49bae5..f78f446404dcb 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestMul(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -44,10 +45,12 @@ def set_feed_attr(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = self.op(x, y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -124,36 +127,43 @@ def test_case3(self):
 
 
 class TestAdd(TestMul):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_add
 
 
 class TestSub(TestMul):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_sub
 
 
 class TestDiv(TestMul):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_div
 
 
 class TestMin(TestMul):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_min
 
 
 class TestMax(TestMul):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_max
 
 
 class TestPow(TestMul):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.elementwise_pow
 
 
 class TestMod(TestMul):
+
     def set_atol(self):
         self.atol = 1e-7
         self.rtol = 1e-5
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py
index 56b9a73f08009..ad419c2e2bfc5 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -51,10 +52,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = paddle.fluid.layers.equal(x, y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -70,6 +73,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_data_feed(self):
         x = np.ones([1, 10])
         y = np.ones([1, 10])
@@ -78,6 +82,7 @@ def set_data_feed(self):
 
 
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         x = np.ones([1, 10])
         y = np.arange(0, 10).reshape([1, 10])
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py
index 30a4a5370790a..f81f5d7de74d1 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_data_feed()
@@ -58,22 +59,25 @@ def _test_optimizer(self, run_ipu=True):
 
         with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
-                image = paddle.static.data(
-                    name='image', shape=[1, 3, 10, 10], dtype='float32')
-                conv1 = paddle.static.nn.conv2d(
-                    image, num_filters=3, filter_size=3, bias_attr=False)
+                image = paddle.static.data(name='image',
+                                           shape=[1, 3, 10, 10],
+                                           dtype='float32')
+                conv1 = paddle.static.nn.conv2d(image,
+                                                num_filters=3,
+                                                filter_size=3,
+                                                bias_attr=False)
                 loss = paddle.mean(conv1)
 
                 weight_decay = self.attrs['weight_decay']
                 opt = paddle.optimizer.SGD(learning_rate=1e-1,
                                            weight_decay=weight_decay)
                 if self.attrs['optimizer'] == 'adam':
-                    opt = paddle.optimizer.Adam(
-                        learning_rate=1e-1, weight_decay=weight_decay)
+                    opt = paddle.optimizer.Adam(learning_rate=1e-1,
+                                                weight_decay=weight_decay)
                 elif self.attrs['optimizer'] == 'lamb':
 
-                    opt = paddle.optimizer.Lamb(
-                        learning_rate=1e-1, lamb_weight_decay=weight_decay)
+                    opt = paddle.optimizer.Lamb(learning_rate=1e-1,
+                                                lamb_weight_decay=weight_decay)
                 opt.minimize(loss)
 
             if run_ipu:
@@ -90,8 +94,8 @@ def _test_optimizer(self, run_ipu=True):
                 ipu_strategy.set_graph_config(is_training=True)
                 ipu_strategy.set_options({"runtime_options.enable_eval": True})
                 program = paddle.static.IpuCompiledProgram(
-                    main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
-                                                                  fetch_list)
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
@@ -99,9 +103,8 @@ def _test_optimizer(self, run_ipu=True):
             if run_ipu:
                 for epoch in range(200):
                     if epoch == 100:
-                        ipu_strategy.set_options({
-                            "runtime_options.enable_eval": False
-                        })
+                        ipu_strategy.set_options(
+                            {"runtime_options.enable_eval": False})
                     loss_res = exe.run(program,
                                        feed=self.feed,
                                        fetch_list=[loss])
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py
index 211aa4a61a5b8..872f4a4bef160 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32")
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype="float32")
         out = paddle.fluid.layers.expand(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,6 +64,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[2, 2])
         self.feed_fp32 = {"x": x.astype(np.float32)}
@@ -77,12 +80,14 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32")
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype="float32")
         expand_times = paddle.fluid.layers.fill_constant(
             shape=[len(self.feed_shape[0])], dtype="int32", value=2)
-        out = paddle.fluid.layers.expand(
-            x, expand_times=expand_times, **self.attrs)
+        out = paddle.fluid.layers.expand(x,
+                                         expand_times=expand_times,
+                                         **self.attrs)
         self.fetch_list = [out.name]
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py
index b3faabda3cdf2..a6c497433020c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -44,8 +45,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         x_fill = paddle.full_like(x, **self.attrs)
         out = paddle.fluid.layers.elementwise_add(x_fill, x_fill)
         self.fetch_list = [out.name]
@@ -62,6 +64,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {'fill_value': 3, 'dtype': 'int32'}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py
index ce457b7abeb5b..4d4d88351892f 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -65,6 +66,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             'name': 'x',
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py
index a8d530f6b77ad..29dd9510dda40 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.flatten(x=x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,12 +64,14 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {}
         self.attrs['axis'] = 0
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {}
         self.attrs['axis'] = 2
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py
index 1d3b17dbc2dfc..0cfe769225001 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -51,15 +52,22 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        conv1 = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
-        conv2 = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        conv1 = paddle.static.nn.conv2d(x,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
+        conv2 = paddle.static.nn.conv2d(x,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
         add1 = conv1 + conv2
-        conv3 = paddle.static.nn.conv2d(
-            add1, num_filters=8, filter_size=8, bias_attr=False)
+        conv3 = paddle.static.nn.conv2d(add1,
+                                        num_filters=8,
+                                        filter_size=8,
+                                        bias_attr=False)
         out = paddle.fluid.layers.relu(conv3, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -75,6 +83,7 @@ def test(self):
 
 
 class TestIntInput(TestBase):
+
     def set_data_feed(self):
         embedding = np.random.uniform(size=[10, 20])
         indice = np.array([1, 3, 5]).astype(np.int32)
@@ -89,10 +98,12 @@ def set_data_feed(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='int32')
         out = paddle.fluid.layers.gather(x, index=y)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py
index bbf3ec0ffdfe6..42ba6babd7911 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,10 +46,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='int32')
         out = paddle.fluid.layers.gather(x, index=y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -64,6 +67,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[100])
         y = np.array([1, 3, 5])
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py
index e9721463876d0..673c7c0503242 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -44,8 +45,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.gelu(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -61,6 +63,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_atol(self):
         self.atol = 1e-10
         self.rtol = 1e-6
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py
index b7567f60cc3a2..7eea222e5e3c4 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_data_feed()
@@ -61,10 +62,13 @@ def set_training(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        image = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        conv1 = paddle.static.nn.conv2d(
-            image, num_filters=3, filter_size=3, bias_attr=False)
+        image = paddle.static.data(name=self.feed_list[0],
+                                   shape=self.feed_shape[0],
+                                   dtype='float32')
+        conv1 = paddle.static.nn.conv2d(image,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
         loss = paddle.mean(conv1)
         self.fetch_list = [loss.name]
 
@@ -76,13 +80,13 @@ def build_model(self):
                                        weight_decay=weight_decay,
                                        grad_clip=clip)
         elif self.attrs['optimizer'] == 'adam':
-            opt = paddle.optimizer.Adam(
-                learning_rate=1e-1, weight_decay=weight_decay, grad_clip=clip)
+            opt = paddle.optimizer.Adam(learning_rate=1e-1,
+                                        weight_decay=weight_decay,
+                                        grad_clip=clip)
         elif self.attrs['optimizer'] == 'lamb':
-            opt = paddle.optimizer.Lamb(
-                learning_rate=1e-1,
-                lamb_weight_decay=weight_decay,
-                grad_clip=clip)
+            opt = paddle.optimizer.Lamb(learning_rate=1e-1,
+                                        lamb_weight_decay=weight_decay,
+                                        grad_clip=clip)
         else:
             raise ValueError(
                 f"Not supported optimizer {self.attrs['optimizer']} for test")
@@ -100,6 +104,7 @@ def test(self):
 
 
 class TestAdam(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'adam',
@@ -108,6 +113,7 @@ def set_attrs(self):
 
 
 class TestLamb(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'lamb',
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
index c499bb0bd5ff9..eb3c0601dd148 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestGreaterThan(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -36,10 +37,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = self.op(x, y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -113,11 +116,13 @@ def test_case3(self):
 
 
 class TestLessThan(TestGreaterThan):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.less_than
 
 
 class TestEqual(TestGreaterThan):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.equal
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
index bb984a8d90789..4c5098640fdba 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -54,23 +55,30 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         if self.is_training:
             ch = self.feed_shape[0][1]
-            conv1 = paddle.static.nn.conv2d(
-                x, num_filters=ch, filter_size=3, bias_attr=False)
+            conv1 = paddle.static.nn.conv2d(x,
+                                            num_filters=ch,
+                                            filter_size=3,
+                                            bias_attr=False)
             scale = paddle.ParamAttr(trainable=True)
             bias = paddle.ParamAttr(trainable=True)
-            out = paddle.fluid.layers.nn.group_norm(
-                conv1, param_attr=scale, bias_attr=bias, **self.attrs)
+            out = paddle.fluid.layers.nn.group_norm(conv1,
+                                                    param_attr=scale,
+                                                    bias_attr=bias,
+                                                    **self.attrs)
             loss = paddle.mean(out)
             adam = paddle.optimizer.Adam(learning_rate=1e-2)
             adam.minimize(loss)
             self.fetch_list = [loss.name]
         else:
-            out = paddle.fluid.layers.nn.group_norm(
-                x, param_attr=True, bias_attr=True, **self.attrs)
+            out = paddle.fluid.layers.nn.group_norm(x,
+                                                    param_attr=True,
+                                                    bias_attr=True,
+                                                    **self.attrs)
             self.fetch_list = [out.name]
 
     def run_model(self, exec_mode):
@@ -85,6 +93,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "groups": 4,
@@ -94,6 +103,7 @@ def set_op_attrs(self):
 
 
 class TestTrainCase1(TestBase):
+
     def set_training(self):
         self.is_training = True
         self.epoch = 20
@@ -101,6 +111,7 @@ def set_training(self):
 
 @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel")
 class TestTrainCase2(TestBase):
+
     def set_atol(self):
         self.atol = 7e-4
         self.rtol = 1e-6
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py
index 33a63a80e3bc0..18cd5e30e88c1 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_data_feed()
@@ -66,16 +67,14 @@ def _test_save(self):
         with paddle.fluid.unique_name.guard(generator):
             with paddle.static.scope_guard(scope):
                 with paddle.static.program_guard(main_prog, startup_prog):
-                    x = paddle.static.data(
-                        name=self.feed_list[0],
-                        shape=self.feed_shape[0],
-                        dtype='float32')
-                    conv1 = paddle.static.nn.conv2d(
-                        x,
-                        num_filters=3,
-                        filter_size=3,
-                        bias_attr=False,
-                        name='conv2d')
+                    x = paddle.static.data(name=self.feed_list[0],
+                                           shape=self.feed_shape[0],
+                                           dtype='float32')
+                    conv1 = paddle.static.nn.conv2d(x,
+                                                    num_filters=3,
+                                                    filter_size=3,
+                                                    bias_attr=False,
+                                                    name='conv2d')
                     loss = paddle.mean(conv1)
 
                     if self.attrs['is_training']:
@@ -98,8 +97,9 @@ def _test_save(self):
                 ipu_strategy.set_graph_config(
                     is_training=self.attrs['is_training'])
                 program = paddle.static.IpuCompiledProgram(
-                    main_prog, ipu_strategy=ipu_strategy).compile(
-                        self.feed_list, fetch_list)
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(self.feed_list,
+                                                       fetch_list)
 
                 result = []
                 for i in range(self.attrs['steps']):
@@ -108,8 +108,11 @@ def _test_save(self):
                                   fetch_list=fetch_list)
                     result.append(tmp)
 
-                paddle.static.save_inference_model(
-                    self.full_name, x, loss, exe, program=program.org_program)
+                paddle.static.save_inference_model(self.full_name,
+                                                   x,
+                                                   loss,
+                                                   exe,
+                                                   program=program.org_program)
 
     def _test_load(self, run_ipu):
         if run_ipu:
@@ -118,8 +121,8 @@ def _test_load(self, run_ipu):
             place = paddle.CPUPlace()
         exe = paddle.static.Executor(place)
 
-        [inference_program, feed_target_names, fetch_targets] = (
-            paddle.static.load_inference_model(self.full_name, exe))
+        [inference_program, feed_target_names, fetch_targets
+         ] = (paddle.static.load_inference_model(self.full_name, exe))
 
         if run_ipu:
             feed_list = feed_target_names
@@ -146,6 +149,7 @@ def test_base(self):
 
 
 class TestAdam(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
@@ -156,6 +160,7 @@ def set_op_attrs(self):
 
 
 class TestLamb(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py
index fa425cbf9f94a..3828728a567c3 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -50,24 +51,31 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
 
         if self.is_training:
             ch = self.feed_shape[0][1]
-            conv1 = paddle.static.nn.conv2d(
-                x, num_filters=ch, filter_size=3, bias_attr=False)
+            conv1 = paddle.static.nn.conv2d(x,
+                                            num_filters=ch,
+                                            filter_size=3,
+                                            bias_attr=False)
             scale = paddle.ParamAttr(trainable=True)
             bias = paddle.ParamAttr(trainable=True)
-            out = paddle.fluid.layers.nn.instance_norm(
-                conv1, param_attr=scale, bias_attr=bias, **self.attrs)
+            out = paddle.fluid.layers.nn.instance_norm(conv1,
+                                                       param_attr=scale,
+                                                       bias_attr=bias,
+                                                       **self.attrs)
             loss = paddle.mean(out)
             adam = paddle.optimizer.Adam(learning_rate=1e-2)
             adam.minimize(loss)
             self.fetch_list = [loss.name]
         else:
-            out = paddle.fluid.layers.nn.instance_norm(
-                x, param_attr=True, bias_attr=True, **self.attrs)
+            out = paddle.fluid.layers.nn.instance_norm(x,
+                                                       param_attr=True,
+                                                       bias_attr=True,
+                                                       **self.attrs)
             self.fetch_list = [out.name]
 
     def run_model(self, exec_mode):
@@ -82,6 +90,7 @@ def test(self):
 
 
 class TestTrainCase1(TestBase):
+
     def set_training(self):
         self.is_training = True
         self.epoch = 10
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
index 76ab1a2c3f311..13f146f6fd741 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestIpuShard(unittest.TestCase):
+
     def _test(self):
         # build graph
         main_prog = paddle.static.Program()
@@ -61,13 +62,13 @@ def test_ipu_shard(self):
         ipu_index_list = self._test()
         expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
         self.assertTrue(
-            np.allclose(
-                ipu_index_list, expected_ipu_index_list, atol=0))
+            np.allclose(ipu_index_list, expected_ipu_index_list, atol=0))
 
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestIpuPipeline(unittest.TestCase):
+
     def _test(self):
         # build graph
         main_prog = paddle.static.Program()
@@ -106,8 +107,7 @@ def test_ipu_shard(self):
         expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
 
         self.assertTrue(
-            np.allclose(
-                ipu_index_list, expected_ipu_index_list, atol=0))
+            np.allclose(ipu_index_list, expected_ipu_index_list, atol=0))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
index 21a6655406729..14128109029c7 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestIpuStrategy(unittest.TestCase):
+
     def test_set_options(self):
         ipu_strategy = paddle.static.IpuStrategy()
         all_option_names = ipu_strategy._ipu_strategy.get_all_option_names()
@@ -78,14 +79,15 @@ def test_set_other_options(self):
         for k, v in options.items():
             ipu_strategy.set_options({k: v})
             if (isinstance(v, list)):
-                assert v.sort() == ipu_strategy.get_option(k).sort(
-                ), f"set {k} to {v} failed "
+                assert v.sort() == ipu_strategy.get_option(
+                    k).sort(), f"set {k} to {v} failed "
             else:
                 assert v == ipu_strategy.get_option(
                     k), f"set {k} to {v} failed "
 
         # The custom logger need 2 int as inputs
-        logger = lambda progress, total: print(f"compile progrss: {progress}/{total}")
+        logger = lambda progress, total: print(
+            f"compile progrss: {progress}/{total}")
         ipu_strategy.set_options({'compilation_progress_logger': logger})
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py
index cab2fa3fde2cb..e365ffd4e166f 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -57,23 +58,30 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         if self.is_training:
             ch = self.feed_shape[0][1]
-            conv1 = paddle.static.nn.conv2d(
-                x, num_filters=ch, filter_size=3, bias_attr=False)
+            conv1 = paddle.static.nn.conv2d(x,
+                                            num_filters=ch,
+                                            filter_size=3,
+                                            bias_attr=False)
             scale = paddle.ParamAttr(trainable=True)
             bias = paddle.ParamAttr(trainable=True)
-            out = paddle.fluid.layers.nn.layer_norm(
-                conv1, param_attr=scale, bias_attr=bias, **self.attrs)
+            out = paddle.fluid.layers.nn.layer_norm(conv1,
+                                                    param_attr=scale,
+                                                    bias_attr=bias,
+                                                    **self.attrs)
             loss = paddle.mean(out)
             self.fetch_list = [loss.name]
         else:
             scale = self.attrs['scale']
             bias = self.attrs['shift']
-            out = paddle.fluid.layers.nn.layer_norm(
-                x, param_attr=scale, bias_attr=bias, **self.attrs)
+            out = paddle.fluid.layers.nn.layer_norm(x,
+                                                    param_attr=scale,
+                                                    bias_attr=bias,
+                                                    **self.attrs)
             self.fetch_list = [out.name]
 
         if self.is_training:
@@ -83,8 +91,8 @@ def build_model(self):
             elif self.optimizer == 'adam':
                 optimizer = paddle.optimizer.Adam(learning_rate=1e-2)
             elif self.optimizer == 'lamb':
-                optimizer = paddle.optimizer.Lamb(
-                    learning_rate=1e-2, lamb_weight_decay=0.0)
+                optimizer = paddle.optimizer.Lamb(learning_rate=1e-2,
+                                                  lamb_weight_decay=0.0)
             if optimizer is not None:
                 optimizer.minimize(loss)
 
@@ -101,6 +109,7 @@ def test(self):
 
 @unittest.skip('raise error')
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": False,
@@ -112,6 +121,7 @@ def set_op_attrs(self):
 
 @unittest.skip('raise error')
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": True,
@@ -122,6 +132,7 @@ def set_op_attrs(self):
 
 
 class TestCase3(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": True,
@@ -133,6 +144,7 @@ def set_op_attrs(self):
 
 
 class TestTrainCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": True,
@@ -152,6 +164,7 @@ def set_training(self):
 
 
 class TestTrainCase3(TestBase):
+
     def set_atol(self):
         super().set_atol()
         self.atol = 5e-3
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py
index c0e4865b3a627..6711894c7de28 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -47,8 +48,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = F.log_softmax(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -64,6 +66,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_attrs(self):
         self.attrs = {"axis": 1}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py
index 725d2b3429a7f..a406fa128fc5b 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -41,8 +42,9 @@ def set_feed_attr(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype="bool")
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype="bool")
         out = paddle.fluid.layers.logical_not(x)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py
index 55a2c08c1b5e7..71a75db9ab392 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestLogicalAnd(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -40,14 +41,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0],
-            shape=self.feed_shape[0],
-            dtype=self.feed_dtype[0])
-        y = paddle.static.data(
-            name=self.feed_list[1],
-            shape=self.feed_shape[1],
-            dtype=self.feed_dtype[1])
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype=self.feed_dtype[0])
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype=self.feed_dtype[1])
         out = self.op(x, y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -82,6 +81,7 @@ def test_case0(self):
 
 
 class TestLogicalOr(TestLogicalAnd):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.logical_or
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py
index 80636348cfad3..27a70329ca132 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -51,8 +52,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='int64')
         out = paddle.fluid.layers.embedding(x, **self.attrs)
         if self.is_training:
             loss = paddle.mean(out)
@@ -76,6 +78,7 @@ def test(self):
 
 
 class TestTrainCase1(TestBase):
+
     def set_atol(self):
         self.atol = 1e-7
         self.rtol = 1e-6
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py
index 7f021a615afa0..c15eb3a3b8edb 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -51,8 +52,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='int64')
         embedding = paddle.nn.Embedding(**self.attrs)
         out = embedding(x)
         if self.is_training:
@@ -77,6 +79,7 @@ def test(self):
 
 
 class TestTrainCase1(TestBase):
+
     def set_atol(self):
         self.atol = 1e-7
         self.rtol = 1e-6
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py
index 6641efde69473..f7a01b7268ddf 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py
@@ -21,6 +21,7 @@
 
 
 class LR_New(LRScheduler):
+
     def __init__(self, learning_rate=1e-5, last_epoch=-1, verbose=False):
         super(LR_New, self).__init__(learning_rate, last_epoch, verbose)
 
@@ -33,12 +34,16 @@ def get_lr(self):
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestConvNet(IPUOpTest):
+
     @IPUOpTest.static_graph
     def build_model(self):
-        image = paddle.static.data(
-            name='image', shape=[1, 3, 10, 10], dtype='float32')
-        conv1 = paddle.static.nn.conv2d(
-            image, num_filters=3, filter_size=3, bias_attr=False)
+        image = paddle.static.data(name='image',
+                                   shape=[1, 3, 10, 10],
+                                   dtype='float32')
+        conv1 = paddle.static.nn.conv2d(image,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
         loss = paddle.mean(conv1)
 
         opt = paddle.optimizer.Lamb(learning_rate=LR_New())
@@ -58,8 +63,9 @@ def run_model(self, run_ipu=True):
             ipu_strategy = paddle.static.IpuStrategy()
             ipu_strategy.set_graph_config(is_training=True)
             program = paddle.static.IpuCompiledProgram(
-                self.main_prog, ipu_strategy=ipu_strategy).compile(
-                    self.feed_list, self.fetch_list)
+                self.main_prog,
+                ipu_strategy=ipu_strategy).compile(self.feed_list,
+                                                   self.fetch_list)
         else:
             program = self.main_prog
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py
index e7e4c000e16a2..222bb20209750 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -50,10 +51,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
 
         out = paddle.fluid.layers.matmul(x, y, **self.attrs)
         self.fetch_list = [out.name]
@@ -70,6 +73,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "transpose_x": True,
@@ -79,6 +83,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "transpose_x": True,
@@ -94,6 +99,7 @@ def set_atol(self):
 
 
 class TestCase3(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[5, 4, 3, 2])
         y = np.random.uniform(size=[5, 4, 2, 3])
@@ -103,6 +109,7 @@ def set_data_feed(self):
 
 
 class TestCase4(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[4, 3, 2])
         y = np.random.uniform(size=[4, 2, 3])
@@ -112,6 +119,7 @@ def set_data_feed(self):
 
 
 class TestCase5(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[4, 2, 3])
         y = np.random.uniform(size=[3, 2])
@@ -121,6 +129,7 @@ def set_data_feed(self):
 
 
 class TestCase6(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3])
 
@@ -130,6 +139,7 @@ def set_data_feed(self):
 
 @unittest.skip("not supported")
 class TestCase6_2(TestCase6):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3])
 
@@ -145,6 +155,7 @@ def set_op_attrs(self):
 
 
 class TestCase7(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[1, 12, 128, 64])
         y = np.random.uniform(size=[1, 12, 128, 64])
@@ -157,6 +168,7 @@ def set_op_attrs(self):
 
 
 class TestCase8(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3, 1])
         y = np.random.uniform(size=[1, 2])
@@ -167,6 +179,7 @@ def set_data_feed(self):
 
 @unittest.skip("not supported")
 class TestCase8_2(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3])
         y = np.random.uniform(size=[2])
@@ -184,6 +197,7 @@ def set_op_attrs(self):
 
 @unittest.skip("dim > 4 is not supported")
 class TestCase9(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[6, 5, 4, 2, 3])
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py
index 0a273e91dd571..8151c55326500 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py
@@ -29,6 +29,7 @@ def set_serialize_factor(serialize_factor):
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -52,14 +53,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0],
-            shape=self.feed_shape[0],
-            dtype=self.feed_dtype[0])
-        y = paddle.static.data(
-            name=self.feed_list[1],
-            shape=self.feed_shape[1],
-            dtype=self.feed_dtype[1])
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype=self.feed_dtype[0])
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype=self.feed_dtype[1])
         # decrator maybe the best choice, but need to modify api
         out = paddle.matmul(x, y, **self.attrs)
         set_serialize_factor(4)
@@ -89,8 +88,7 @@ def test_base(self):
         res0 = self.run_model(False)
         res1 = self.run_model(True)
         self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+            np.allclose(res0.flatten(), res1.flatten(), atol=self.atol))
         self.assertTrue(res0.shape == res1.shape)
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py
index 725f3243e0f3d..4777c42da138e 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -46,10 +47,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = paddle.matmul(x, y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -65,6 +68,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "transpose_x": True,
@@ -73,6 +77,7 @@ def set_op_attrs(self):
 
 
 class TestCase3(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[5, 4, 2, 3])
         y = np.random.uniform(size=[5, 4, 3, 2])
@@ -82,6 +87,7 @@ def set_data_feed(self):
 
 
 class TestCase4(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[4, 2, 3])
         y = np.random.uniform(size=[4, 3, 2])
@@ -91,6 +97,7 @@ def set_data_feed(self):
 
 
 class TestCase5(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[4, 2, 3])
         y = np.random.uniform(size=[3, 2])
@@ -100,6 +107,7 @@ def set_data_feed(self):
 
 
 class TestCase6(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3])
         y = np.random.uniform(size=[3])
@@ -110,6 +118,7 @@ def set_data_feed(self):
 
 @unittest.skip("not supported")
 class TestCase6_2(TestCase6):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3])
         y = np.random.uniform(size=[3])
@@ -122,6 +131,7 @@ def set_op_attrs(self):
 
 
 class TestCase7(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3, 1])
         y = np.random.uniform(size=[1, 2])
@@ -132,6 +142,7 @@ def set_data_feed(self):
 
 @unittest.skip("dim > 4 is not supported")
 class TestCase8(TestBase):
+
     def set_data_feed(self):
         self.feed = {
             "x": np.random.uniform(size=[6, 5, 4, 2, 3]).astype('float32'),
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
index c0d7dd1fd171d..72c2c9cc3beed 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -44,8 +45,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.mean(x)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py
index 9bdf233556012..ba8f9c7bad51f 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_data_feed()
@@ -58,8 +59,9 @@ def set_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
 
         # using fp32
         x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3)
@@ -110,8 +112,9 @@ def run_model(self, exec_mode):
                 enable_pipelining=self.enable_pipelining,
                 batches_per_step=self.batches_per_step)
             program = paddle.static.IpuCompiledProgram(
-                self.main_prog, ipu_strategy=ipu_strategy).compile(
-                    self.feed_list, self.fetch_list)
+                self.main_prog,
+                ipu_strategy=ipu_strategy).compile(self.feed_list,
+                                                   self.fetch_list)
         else:
             program = self.main_prog
 
@@ -128,13 +131,15 @@ def test(self):
 
 
 class TestPipline(TestBase):
+
     @IPUOpTest.static_graph
     def build_model(self, exec_mode):
         feed_shape = list(self.feed_shape[0])
         if self.is_ipu_mode(exec_mode):
             feed_shape[0] = 1
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=feed_shape, dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=feed_shape,
+                               dtype='float32')
         with paddle.static.ipu_shard_guard(index=0, stage=0):
             # using fp32
             x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3)
@@ -144,8 +149,9 @@ def build_model(self, exec_mode):
         with paddle.static.ipu_shard_guard(index=1, stage=1):
             # using fp16
             with paddle.static.amp.fp16_guard():
-                x = paddle.static.nn.conv2d(
-                    input=x, num_filters=6, filter_size=3)
+                x = paddle.static.nn.conv2d(input=x,
+                                            num_filters=6,
+                                            filter_size=3)
                 x = paddle.static.nn.batch_norm(x, act='relu')
                 x = F.max_pool2d(x, kernel_size=2, stride=2)
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py
index c4ac9cddd7c3f..4fc3b40f9ab8c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -63,8 +64,9 @@ def dtype_check(self, program, to_fp16_var_names):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
 
         # using fp32
         x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3)
@@ -119,8 +121,9 @@ def run_model(self, exec_mode):
                 enable_pipelining=self.enable_pipelining,
                 batches_per_step=self.batches_per_step)
             program = paddle.static.IpuCompiledProgram(
-                self.main_prog, ipu_strategy=ipu_strategy).compile(
-                    self.feed_list, self.fetch_list)
+                self.main_prog,
+                ipu_strategy=ipu_strategy).compile(self.feed_list,
+                                                   self.fetch_list)
         else:
             program = self.main_prog
 
@@ -140,13 +143,15 @@ def test(self):
 
 
 class TestPipline(TestBase):
+
     @IPUOpTest.static_graph
     def build_model(self, exec_mode):
         feed_shape = list(self.feed_shape[0])
         if self.is_ipu_mode(exec_mode):
             feed_shape[0] = 1
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=feed_shape, dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=feed_shape,
+                               dtype='float32')
 
         with paddle.static.ipu_shard_guard(index=0, stage=0):
             # using fp32
@@ -157,8 +162,9 @@ def build_model(self, exec_mode):
         with paddle.static.ipu_shard_guard(index=1, stage=1):
             # using fp16
             with paddle.static.amp.fp16_guard():
-                x = paddle.static.nn.conv2d(
-                    input=x, num_filters=6, filter_size=3)
+                x = paddle.static.nn.conv2d(input=x,
+                                            num_filters=6,
+                                            filter_size=3)
                 x = paddle.static.nn.batch_norm(x, act='relu')
                 x = F.max_pool2d(x, kernel_size=2, stride=2)
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py
index 884162d336f35..81f5295c7dda8 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -60,14 +61,19 @@ def _test_base(self, run_ipu=True):
         bs = self.ipu_bs if run_ipu else self.cpu_bs
         with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
-                image = paddle.static.data(
-                    name='image', shape=[bs, 3, 10, 10], dtype='float32')
+                image = paddle.static.data(name='image',
+                                           shape=[bs, 3, 10, 10],
+                                           dtype='float32')
                 with paddle.static.ipu_shard_guard(index=0):
-                    conv1 = paddle.static.nn.conv2d(
-                        image, num_filters=3, filter_size=3, bias_attr=False)
+                    conv1 = paddle.static.nn.conv2d(image,
+                                                    num_filters=3,
+                                                    filter_size=3,
+                                                    bias_attr=False)
                 with paddle.static.ipu_shard_guard(index=1):
-                    conv2 = paddle.static.nn.conv2d(
-                        conv1, num_filters=3, filter_size=3, bias_attr=False)
+                    conv2 = paddle.static.nn.conv2d(conv1,
+                                                    num_filters=3,
+                                                    filter_size=3,
+                                                    bias_attr=False)
                     # should consider influence of bs
                     loss = paddle.mean(conv2)
 
@@ -126,6 +132,7 @@ def test(self):
 
 
 class TestReplicaInference(TestBase):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -149,6 +156,7 @@ def set_data_feed(self):
 
 
 class TestReplicaCollectiveInference(TestBase):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -179,6 +187,7 @@ def set_data_feed(self):
 
 
 class TestPipelineInference(TestBase):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 2,
@@ -195,12 +204,13 @@ def set_data_feed(self):
         np_image = np.random.rand(1, 3, 10, 10).astype(np.float32)
         self.feed_cpu = {"image": np_image}
         self.feed_ipu = {
-            "image": np.tile(np_image,
-                             [self.ipu_options['batches_per_step'], 1, 1, 1])
+            "image":
+            np.tile(np_image, [self.ipu_options['batches_per_step'], 1, 1, 1])
         }
 
 
 class TestTrainBase(TestBase):
+
     def set_training(self):
         self.is_training = True
         self.epoch = 10
@@ -220,6 +230,7 @@ def set_attrs(self):
 
 
 class TestReplicaTrain(TestTrainBase):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -250,6 +261,7 @@ def test(self):
 
 
 class TestReplicaCollectiveTrain(TestTrainBase):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -287,6 +299,7 @@ def test(self):
 
 
 class TestPipelineTrain(TestTrainBase):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 3,
@@ -315,6 +328,7 @@ def test(self):
 
 
 class TestAdamTrain(TestTrainBase):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -330,6 +344,7 @@ def set_attrs(self):
 
 
 class TestAdamReplicaTrain(TestReplicaTrain):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -345,6 +360,7 @@ def set_attrs(self):
 
 
 class TestAdamPipelineTrain(TestPipelineTrain):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 3,
@@ -360,6 +376,7 @@ def set_attrs(self):
 
 
 class TestAdamRecomputationTrain(TestPipelineTrain):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 3,
@@ -376,6 +393,7 @@ def set_attrs(self):
 
 
 class TestLambTrain(TestAdamTrain):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -391,6 +409,7 @@ def set_attrs(self):
 
 
 class TestLambReplicaTrain(TestAdamReplicaTrain):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -406,6 +425,7 @@ def set_attrs(self):
 
 
 class TestLambPipelineTrain(TestAdamPipelineTrain):
+
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 3,
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py
index 7e70239964002..27538610a42b7 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py
@@ -26,6 +26,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestCastNet(unittest.TestCase):
+
     def _test(self, run_ipu=True):
         scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
@@ -38,14 +39,19 @@ def _test(self, run_ipu=True):
 
         with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
-                image = paddle.static.data(
-                    name='image', shape=[1, 3, 10, 10], dtype='float32')
+                image = paddle.static.data(name='image',
+                                           shape=[1, 3, 10, 10],
+                                           dtype='float32')
                 with paddle.static.ipu_shard_guard(index=0):
-                    conv1 = paddle.static.nn.conv2d(
-                        image, num_filters=3, filter_size=3, bias_attr=False)
+                    conv1 = paddle.static.nn.conv2d(image,
+                                                    num_filters=3,
+                                                    filter_size=3,
+                                                    bias_attr=False)
                 with paddle.static.ipu_shard_guard(index=1):
-                    conv2 = paddle.static.nn.conv2d(
-                        conv1, num_filters=3, filter_size=3, bias_attr=False)
+                    conv2 = paddle.static.nn.conv2d(conv1,
+                                                    num_filters=3,
+                                                    filter_size=3,
+                                                    bias_attr=False)
                     loss = paddle.mean(conv2)
 
             if run_ipu:
@@ -59,8 +65,9 @@ def _test(self, run_ipu=True):
                 feed_list = [image.name]
                 fetch_list = [loss.name]
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.set_graph_config(
-                    num_ipus=2, is_training=False, enable_manual_shard=True)
+                ipu_strategy.set_graph_config(num_ipus=2,
+                                              is_training=False,
+                                              enable_manual_shard=True)
                 ipu_strategy.set_pipelining_config(enable_pipelining=False)
                 program = paddle.static.IpuCompiledProgram(
                     main_prog,
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py
index 583a8941ac62b..50be6420a5569 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -49,10 +50,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = paddle.fluid.layers.mul(x, y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -68,6 +71,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[1, 2, 5])
         y = np.random.uniform(size=[5, 3])
@@ -82,6 +86,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3, 4, 2, 9])
         y = np.random.uniform(size=[3, 6, 1, 2, 3])
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py
index a4365c021ff3c..c796cc7c02b42 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -51,10 +52,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = paddle.fluid.layers.not_equal(x, y, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -70,6 +73,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_data_feed(self):
         x = np.ones([1, 10])
         y = np.ones([1, 10])
@@ -78,6 +82,7 @@ def set_data_feed(self):
 
 
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         x = np.ones([1, 10])
         y = np.arange(0, 10).reshape([1, 10])
@@ -88,6 +93,7 @@ def set_data_feed(self):
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestScalar(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -98,8 +104,12 @@ def setUp(self):
     def set_data_feed(self):
         x = np.ones([1, 10])
         y = 0.5
-        self.feed_fp32 = {"x": x.astype(np.float32), }
-        self.feed_fp16 = {"x": x.astype(np.float16), }
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+        }
 
     def set_feed_attr(self):
         self.feed_shape = [x.shape for x in self.feed_fp32.values()]
@@ -110,8 +120,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = (x != 0.5)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py
index 938654bfafc05..6c8c3b113143a 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -44,8 +45,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='int32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='int32')
         out = paddle.fluid.layers.one_hot(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,6 +64,7 @@ def test(self):
 
 @unittest.skip('does not support allow_out_of_range=True')
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"depth": 4, "allow_out_of_range": True}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py
index ec25f378866aa..8822c352b8ba5 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -44,8 +45,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='int32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='int32')
         out = paddle.fluid.input.one_hot(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,6 +64,7 @@ def test(self):
 
 @unittest.skip('does not support allow_out_of_range=True')
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"depth": 4, "allow_out_of_range": True}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
index 060a69e83112a..5169eddc70307 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
@@ -22,6 +22,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_data_feed()
@@ -58,22 +59,25 @@ def _test_optimizer(self, run_ipu=True):
 
         with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
-                image = paddle.static.data(
-                    name='image', shape=[1, 3, 10, 10], dtype='float32')
-                conv1 = paddle.static.nn.conv2d(
-                    image, num_filters=3, filter_size=3, bias_attr=False)
+                image = paddle.static.data(name='image',
+                                           shape=[1, 3, 10, 10],
+                                           dtype='float32')
+                conv1 = paddle.static.nn.conv2d(image,
+                                                num_filters=3,
+                                                filter_size=3,
+                                                bias_attr=False)
                 loss = paddle.mean(conv1)
 
                 weight_decay = self.attrs['weight_decay']
                 opt = paddle.optimizer.SGD(learning_rate=1e-1,
                                            weight_decay=weight_decay)
                 if self.attrs['optimizer'] == 'adam':
-                    opt = paddle.optimizer.Adam(
-                        learning_rate=1e-1, weight_decay=weight_decay)
+                    opt = paddle.optimizer.Adam(learning_rate=1e-1,
+                                                weight_decay=weight_decay)
                 elif self.attrs['optimizer'] == 'lamb':
 
-                    opt = paddle.optimizer.Lamb(
-                        learning_rate=1e-1, lamb_weight_decay=weight_decay)
+                    opt = paddle.optimizer.Lamb(learning_rate=1e-1,
+                                                lamb_weight_decay=weight_decay)
                 opt.minimize(loss)
 
             if run_ipu:
@@ -88,21 +92,19 @@ def _test_optimizer(self, run_ipu=True):
                 fetch_list = [loss.name]
                 ipu_strategy = paddle.static.IpuStrategy()
                 ipu_strategy.set_graph_config(is_training=True)
-                ipu_strategy.set_options({
-                    'loss_scaling': self.attrs["loss_scaling"]
-                })
+                ipu_strategy.set_options(
+                    {'loss_scaling': self.attrs["loss_scaling"]})
                 if "use_no_bias_optimizer" in self.attrs.keys():
                     ipu_strategy.set_options({
                         "use_no_bias_optimizer":
                         self.attrs["use_no_bias_optimizer"]
                     })
                 if "accl1_type" in self.attrs.keys():
-                    ipu_strategy.set_options({
-                        "accl1_type": self.attrs["accl1_type"]
-                    })
+                    ipu_strategy.set_options(
+                        {"accl1_type": self.attrs["accl1_type"]})
                 program = paddle.static.IpuCompiledProgram(
-                    main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
-                                                                  fetch_list)
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
@@ -123,6 +125,7 @@ def test(self):
 
 @unittest.skip('do not support L2 regularization')
 class TestSGD(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'sgd',
@@ -133,6 +136,7 @@ def set_attrs(self):
 
 @unittest.skip('do not support L2 regularization')
 class TestAdamCase1(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'adam',
@@ -142,6 +146,7 @@ def set_attrs(self):
 
 
 class TestAdamCase2(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'adam',
@@ -152,6 +157,7 @@ def set_attrs(self):
 
 @unittest.skip('cpu do not support AdamNoBias')
 class TestAdamNoBias(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'adam',
@@ -163,6 +169,7 @@ def set_attrs(self):
 
 @unittest.skip('cpu do not support FLOAT16')
 class TestAdamCase3(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'adam',
@@ -174,6 +181,7 @@ def set_attrs(self):
 
 @unittest.skip('seems cpu output wrong')
 class TestLambCase1(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'lamb',
@@ -184,6 +192,7 @@ def set_attrs(self):
 
 @unittest.skip('seems cpu output wrong')
 class TestLamb(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'lamb',
@@ -194,6 +203,7 @@ def set_attrs(self):
 
 @unittest.skip('cpu do not support LambNoBias')
 class TestLambNoBias(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'lamb',
@@ -205,6 +215,7 @@ def set_attrs(self):
 
 @unittest.skip('cpu do not support FLOAT16')
 class TestLambCase2(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'lamb',
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py
index e5df11eb4ef8c..a9ffeb8dc0106 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -54,8 +55,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.pool2d(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -71,36 +73,42 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_size'] = 3
 
 
 class TestCase1_2(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_size'] = [3, 1]
 
 
 class TestCase2(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_stride'] = 2
 
 
 class TestCase2_2(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_stride'] = [2, 1]
 
 
 class TestCase3(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_padding'] = [1, 1]
 
 
 class TestCase3_2(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_padding'] = [1, 1, 2, 2]
@@ -108,6 +116,7 @@ def set_attrs(self):
 
 @unittest.skip('the results has a positional offset')
 class TestCase3_3(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_padding'] = [1, 2, 1, 1]
@@ -115,6 +124,7 @@ def set_attrs(self):
 
 @unittest.skip('paddle output has nan')
 class TestCase3_4(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['pool_size'] = 1
@@ -122,24 +132,28 @@ def set_attrs(self):
 
 
 class TestCase4(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['global_pooling'] = True
 
 
 class TestCase5(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['ceil_mode'] = True
 
 
 class TestCase6(TestBase):
+
     def set_attrs(self):
         super().set_attrs()
         self.attrs['exclusive'] = False
 
 
 class TestAdaptive(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "pool_size": 1,
@@ -149,8 +163,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py
index 41b2b8406dc7e..e9fec9a02326d 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -54,8 +55,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.pool2d(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -71,36 +73,42 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_size'] = 3
 
 
 class TestCase1_2(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_size'] = [3, 1]
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_stride'] = 2
 
 
 class TestCase2_2(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_stride'] = [2, 1]
 
 
 class TestCase3(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_padding'] = [1, 1]
 
 
 class TestCase3_2(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_padding'] = [1, 1, 2, 2]
@@ -108,6 +116,7 @@ def set_op_attrs(self):
 
 @unittest.skip('auto_pad is not currently supported')
 class TestCase3_3(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_padding'] = 'VALID'
@@ -115,30 +124,35 @@ def set_op_attrs(self):
 
 @unittest.skip('auto_pad is not currently supported')
 class TestCase3_4(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['pool_padding'] = 'SAME'
 
 
 class TestCase4(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['global_pooling'] = True
 
 
 class TestCase5(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['ceil_mode'] = True
 
 
 class TestCase6(TestBase):
+
     def set_op_attrs(self):
         super().set_op_attrs()
         self.attrs['exclusive'] = False
 
 
 class TestAdaptive(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "pool_size": 1,
@@ -148,8 +162,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py
index 5ff1223961bb7..3f596f951cd0c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.pow(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,6 +64,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_data_feed(self):
         data1 = np.random.uniform(size=[1, 3, 2, 2])
         data2 = np.array([2.0])
@@ -80,10 +83,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        factor = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        factor = paddle.static.data(name=self.feed_list[1],
+                                    shape=self.feed_shape[1],
+                                    dtype='float32')
         out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
index 3189e060d5837..1c050d1e485b8 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -49,10 +50,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0],
-            shape=self.feed_shape[0],
-            dtype=self.feed_dtype[0])
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype=self.feed_dtype[0])
         out = paddle.fluid.layers.conv2d(x, num_filters=3, filter_size=3)
         out = paddle.fluid.layers.Print(out, **self.attrs)
 
@@ -75,11 +75,13 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"message": "input_data"}
 
 
 class TestTrainCase1(TestBase):
+
     def set_op_attrs(self):
         # "forward" : print forward
         # "backward" : print forward and backward
@@ -93,6 +95,7 @@ def set_training(self):
 
 @unittest.skip("attrs are not supported")
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "first_n": 10,
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py
index 93f96e08fd4b7..ffa3c6d155025 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestMean(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -38,8 +39,9 @@ def set_feed_attr(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = self.op(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -123,21 +125,25 @@ def test_case7(self):
 
 
 class TestMax(TestMean):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_max
 
 
 class TestMin(TestMean):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_min
 
 
 class TestProd(TestMean):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_prod
 
 
 class TestSum(TestMean):
+
     def set_test_op(self):
         self.op = paddle.fluid.layers.reduce_sum
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py
index 35be4d988273a..9a8c127ab650c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -48,8 +49,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         add = paddle.fluid.layers.elementwise_add(x, x)
         out = paddle.fluid.layers.reshape(add, **self.attrs)
         self.fetch_list = [out.name]
@@ -66,6 +68,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "shape": [-1, 0, 10],
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py
index 427e975402344..32cedf0cdda58 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -46,8 +47,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.reshape(x=x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -63,6 +65,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {}
         self.attrs['shape'] = [2, 3, -1, 2]
@@ -70,6 +73,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {}
         self.attrs['shape'] = [-1, 0, 3, 2]
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
index c8f0961baa480..1b39ead9b84a8 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
@@ -26,6 +26,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_data_feed()
@@ -63,16 +64,14 @@ def _test_base(self, save_otherwise_load):
         with paddle.fluid.unique_name.guard(generator):
             with paddle.static.scope_guard(scope):
                 with paddle.static.program_guard(main_prog, startup_prog):
-                    x = paddle.static.data(
-                        name=self.feed_list[0],
-                        shape=self.feed_shape[0],
-                        dtype='float32')
-                    conv1 = paddle.static.nn.conv2d(
-                        x,
-                        num_filters=3,
-                        filter_size=3,
-                        bias_attr=False,
-                        name='conv2d')
+                    x = paddle.static.data(name=self.feed_list[0],
+                                           shape=self.feed_shape[0],
+                                           dtype='float32')
+                    conv1 = paddle.static.nn.conv2d(x,
+                                                    num_filters=3,
+                                                    filter_size=3,
+                                                    bias_attr=False,
+                                                    name='conv2d')
                     loss = paddle.mean(conv1)
 
                     # apply optimizer
@@ -121,59 +120,69 @@ def test_base(self):
         res1 = self._test_base(False)
 
         self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+            np.allclose(res0.flatten(), res1.flatten(), atol=self.atol))
         self.attrs['model_path'].cleanup()
 
 
 class TestMomentum(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Momentum, learning_rate=1e-1)
 
 
 class TestAdam(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adam, learning_rate=1e-1)
 
 
 class TestLamb(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Lamb, learning_rate=1e-1)
 
 
 class TestAdamW(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.AdamW, learning_rate=1e-1)
 
 
 class TestAdamax(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adamax, learning_rate=1e-1)
 
 
 class TestAdagrad(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1)
 
 
 class TestAdadelta(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1)
 
 
 class TestRMSProp(TestBase):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.RMSProp, learning_rate=1e-1)
 
 
 class TestCenteredRMSProp(TestBase):
+
     def set_optimizer(self):
-        self.optimizer = partial(
-            paddle.optimizer.RMSProp, learning_rate=1e-1, centered=True)
+        self.optimizer = partial(paddle.optimizer.RMSProp,
+                                 learning_rate=1e-1,
+                                 centered=True)
 
 
 @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel")
 class TestSGDFP16(TestBase):
+
     def set_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
@@ -186,49 +195,59 @@ def set_optimizer(self):
 
 
 class TestMomentumFp16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Momentum, learning_rate=1e-1)
 
 
 class TestAdamFP16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adam, learning_rate=1e-1)
 
 
 class TestLambFP16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Lamb, learning_rate=1e-1)
 
 
 class TestAdamWFP16FP16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.AdamW, learning_rate=1e-1)
 
 
 class TestAdamaxFP16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adamax, learning_rate=1e-1)
 
 
 class TestAdagradFP16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1)
 
 
 class TestAdadeltaFP16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1)
 
 
 class TestRMSPropFP16(TestSGDFP16):
+
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.RMSProp, learning_rate=1e-1)
 
 
 class TestCenteredRMSPropFP16(TestSGDFP16):
+
     def set_optimizer(self):
-        self.optimizer = partial(
-            paddle.optimizer.RMSProp, learning_rate=1e-1, centered=True)
+        self.optimizer = partial(paddle.optimizer.RMSProp,
+                                 learning_rate=1e-1,
+                                 centered=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py
index f28bcba4cf0d9..8b6b8425b5209 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -53,8 +54,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.scale(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -70,6 +72,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": 5.0,
@@ -79,6 +82,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": 1.0,
@@ -88,6 +92,7 @@ def set_op_attrs(self):
 
 
 class TestCase3(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": 5.0,
@@ -97,6 +102,7 @@ def set_op_attrs(self):
 
 
 class TestCase4(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "scale": 1.0,
@@ -106,6 +112,7 @@ def set_op_attrs(self):
 
 
 class TestCase5(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[3, 3, 10, 10])
         y = np.array([3.0])
@@ -120,10 +127,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = paddle.fluid.layers.scale(x, scale=y, **self.attrs)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py
index 113b316af4ea9..79527f7a13081 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py
@@ -22,6 +22,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -52,18 +53,21 @@ def set_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        image = paddle.static.data(
-            name='image', shape=[1, 3, 10, 10], dtype='float32')
-        conv1 = paddle.static.nn.conv2d(
-            image, num_filters=3, filter_size=3, bias_attr=False)
+        image = paddle.static.data(name='image',
+                                   shape=[1, 3, 10, 10],
+                                   dtype='float32')
+        conv1 = paddle.static.nn.conv2d(image,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
         loss = paddle.mean(conv1)
 
         weight_decay = self.attrs['weight_decay']
-        opt = paddle.optimizer.Adam(
-            learning_rate=1e-1, weight_decay=weight_decay)
+        opt = paddle.optimizer.Adam(learning_rate=1e-1,
+                                    weight_decay=weight_decay)
         if self.attrs['optimizer'] == 'lamb':
-            opt = paddle.optimizer.Lamb(
-                learning_rate=1e-1, lamb_weight_decay=weight_decay)
+            opt = paddle.optimizer.Lamb(learning_rate=1e-1,
+                                        lamb_weight_decay=weight_decay)
         opt.minimize(loss)
         self.feed_list = [image.name]
         self.fetch_list = [loss.name]
@@ -74,7 +78,8 @@ def run_model(self, exec_mode):
         if self.is_ipu_mode(exec_mode):
             if "use_no_bias_optimizer" in self.attrs.keys():
                 ipu_strategy.set_options({
-                    "use_no_bias_optimizer": self.attrs["use_no_bias_optimizer"]
+                    "use_no_bias_optimizer":
+                    self.attrs["use_no_bias_optimizer"]
                 })
             if "scaled_optimizer_state" in self.attrs.keys():
                 ipu_strategy.set_options({
@@ -92,6 +97,7 @@ def test(self):
 
 
 class TestScaledAdam(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'adam',
@@ -107,6 +113,7 @@ def set_atol(self):
 
 @unittest.skip('cpu do not support AdamNoBias')
 class TestScaledAdamNoBias(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'adam',
@@ -118,6 +125,7 @@ def set_attrs(self):
 
 @unittest.skip('cpu do not support LambNoBias')
 class TestScaledLambNoBias(TestBase):
+
     def set_attrs(self):
         self.attrs = {
             "optimizer": 'lamb',
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
index 5c61012cacece..2af8de38377b9 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -50,22 +51,31 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        conv1 = paddle.static.nn.conv2d(
-            x, num_filters=3, filter_size=3, bias_attr=False)
-        conv2 = paddle.static.nn.conv2d(
-            conv1, num_filters=3, filter_size=3, bias_attr=False)
-        conv3 = paddle.static.nn.conv2d(
-            conv2, num_filters=3, filter_size=3, bias_attr=False)
-        conv4 = paddle.static.nn.conv2d(
-            conv3, num_filters=3, filter_size=3, bias_attr=False)
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        conv1 = paddle.static.nn.conv2d(x,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
+        conv2 = paddle.static.nn.conv2d(conv1,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
+        conv3 = paddle.static.nn.conv2d(conv2,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
+        conv4 = paddle.static.nn.conv2d(conv3,
+                                        num_filters=3,
+                                        filter_size=3,
+                                        bias_attr=False)
         self.fetch_list = [conv4.name]
 
     def run_model(self, exec_mode):
         ipu_strategy = paddle.static.IpuStrategy()
-        ipu_strategy.set_graph_config(
-            is_training=self.is_training, micro_batch_size=2)
+        ipu_strategy.set_graph_config(is_training=self.is_training,
+                                      micro_batch_size=2)
         self.run_op_test(exec_mode, ipu_strategy)
 
     def test(self):
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py
index ac8ef3e9d65ad..3a96d4bb0b9f8 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -49,8 +50,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.slice(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -66,6 +68,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             "axes": [0, 1],
@@ -76,6 +79,7 @@ def set_op_attrs(self):
 
 @unittest.skip('dynamic graph is not support on IPU')
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[4, 5, 6])
         s = np.array([0, 0, 2])
@@ -96,14 +100,19 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        starts = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32')
-        ends = paddle.static.data(
-            name=self.feed_list[2], shape=self.feed_shape[2], dtype='int32')
-        out = paddle.fluid.layers.slice(
-            x, starts=starts, ends=ends, **self.attrs)
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        starts = paddle.static.data(name=self.feed_list[1],
+                                    shape=self.feed_shape[1],
+                                    dtype='int32')
+        ends = paddle.static.data(name=self.feed_list[2],
+                                  shape=self.feed_shape[2],
+                                  dtype='int32')
+        out = paddle.fluid.layers.slice(x,
+                                        starts=starts,
+                                        ends=ends,
+                                        **self.attrs)
         self.fetch_list = [out.name]
 
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py
index 0b2d776cf240b..be803e61cf533 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.softmax(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,6 +64,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axis": 2}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py
index cb1ed6ad93044..97b0c25f9380e 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -48,18 +49,23 @@ def set_feed_attr(self):
         self.feed_list = list(self.feed_fp32.keys())
 
     def set_op_attrs(self):
-        self.attrs = {'soft_label': False, }
+        self.attrs = {
+            'soft_label': False,
+        }
 
     @IPUOpTest.static_graph
     def build_model(self, on_ipu):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32")
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype="float32")
         if on_ipu:
-            label = paddle.static.data(
-                name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32')
+            label = paddle.static.data(name=self.feed_list[1],
+                                       shape=self.feed_shape[1],
+                                       dtype='int32')
         else:
-            label = paddle.static.data(
-                name=self.feed_list[1], shape=self.feed_shape[1], dtype='int64')
+            label = paddle.static.data(name=self.feed_list[1],
+                                       shape=self.feed_shape[1],
+                                       dtype='int64')
         out = F.softmax_with_cross_entropy(x, label, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -77,6 +83,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {
             'soft_label': False,
@@ -85,6 +92,7 @@ def set_op_attrs(self):
 
 
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[30, 70])
         label = np.arange(30).reshape([30, 1])
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py
index 63d9584dae37d..76b65a015e95f 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -44,8 +45,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.split(x, **self.attrs)
         self.fetch_list = [fetch.name for fetch in out]
 
@@ -63,6 +65,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"num_or_sections": [2, 8], "axis": 2}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py
index 33950221ad5e8..1afc79b6a6586 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.squeeze(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,11 +64,13 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axes": []}
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axes": [-2]}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py
index 11a827cee0948..1828772c07a51 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -55,12 +56,15 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
-        z = paddle.static.data(
-            name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
+        z = paddle.static.data(name=self.feed_list[2],
+                               shape=self.feed_shape[2],
+                               dtype='float32')
         out = paddle.fluid.layers.stack([x, y, z], **self.attrs)
         self.fetch_list = [out.name]
 
@@ -76,6 +80,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axis": -2}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py
index fdc6ce08b6e15..084c68654239c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -46,10 +47,12 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
         out = paddle.fluid.layers.sum([x, y], **self.attrs)
         self.fetch_list = [out.name]
 
@@ -65,6 +68,7 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_data_feed(self):
         x = np.random.uniform(size=[1, 3, 2, 2])
         y = np.random.uniform(size=[1, 3, 2, 2])
@@ -82,12 +86,15 @@ def set_data_feed(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
-        y = paddle.static.data(
-            name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32')
-        z = paddle.static.data(
-            name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
+        y = paddle.static.data(name=self.feed_list[1],
+                               shape=self.feed_shape[1],
+                               dtype='float32')
+        z = paddle.static.data(name=self.feed_list[2],
+                               shape=self.feed_shape[2],
+                               dtype='float32')
         out = paddle.fluid.layers.sum([x, y, z], **self.attrs)
         self.fetch_list = [out.name]
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
index c5331d43f5e55..417d9c37675c3 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestTopKOp(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -51,14 +52,17 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         if not self.use_k_as_const_variable:
             topk_values, topk_indices = self.op(x, **self.attrs)
         else:
             # !important, popart cannot accept non const tensor
-            K_t = paddle.fluid.layers.fill_constant(
-                shape=[1], dtype='int32', value=self.k, name="in_2")
+            K_t = paddle.fluid.layers.fill_constant(shape=[1],
+                                                    dtype='int32',
+                                                    value=self.k,
+                                                    name="in_2")
             topk_values, topk_indices = self.op(x, K_t, **self.attrs)
         self.fetch_list = [topk_values.name, topk_indices.name]
 
@@ -81,12 +85,14 @@ def test(self):
 
 
 class TestCase2(TestTopKOp):
+
     def set_test_op(self):
         self.op = paddle.topk
 
 
 @unittest.skip("Trying to get data as int64 but it is of type int32")
 class TestCase3(TestTopKOp):
+
     def set_op_attrs(self):
         self.use_k_as_const_variable = True
         self.attrs = {}
@@ -95,6 +101,7 @@ def set_op_attrs(self):
 
 @unittest.skip("Trying to get data as int64 but it is of type int32")
 class TestCase4(TestCase3):
+
     def set_test_op(self):
         self.op = paddle.topk
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py
index d5fef73a31b3e..03068d407b2f3 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.transpose(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,11 +64,13 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"perm": [0, 1, 2, 3]}
 
 
 class TestCase2(TestBase):
+
     def set_data_feed(self):
         data = np.random.uniform(size=[1, 2, 3, 4, 5])
         self.feed_fp32 = {"x": data.astype(np.float32)}
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py
index 54cbc571ec6ff..998eee38b5e59 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -45,8 +46,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='float32')
         out = paddle.fluid.layers.unsqueeze(x, **self.attrs)
         self.fetch_list = [out.name]
 
@@ -62,11 +64,13 @@ def test(self):
 
 
 class TestCase1(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axes": -1}
 
 
 class TestCase2(TestBase):
+
     def set_op_attrs(self):
         self.attrs = {"axes": [1, 2]}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py
index 5cc62432dc635..b3535c8cd5690 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -55,10 +56,9 @@ def _test_base(self, run_ipu=True):
 
         with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
-                x = paddle.static.data(
-                    name=self.feed_list[0],
-                    shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
+                x = paddle.static.data(name=self.feed_list[0],
+                                       shape=self.feed_shape[0],
+                                       dtype=self.feed_dtype[0])
                 add1 = paddle.fluid.layers.elementwise_add(x, x)
                 reshape = paddle.fluid.layers.reshape(add1, **self.attrs)
                 add2 = paddle.fluid.layers.elementwise_add(reshape, reshape)
@@ -76,8 +76,8 @@ def _test_base(self, run_ipu=True):
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
             scale1_out = main_prog.global_block().ops[4].output("Out")[0]
-            main_prog.global_block().ops[4]._rename_output(scale1_out,
-                                                           add2.name)
+            main_prog.global_block().ops[4]._rename_output(
+                scale1_out, add2.name)
             main_prog.global_block().ops[5]._rename_input(scale1_out, add2.name)
 
             if run_ipu:
@@ -98,8 +98,7 @@ def test_base(self):
         res1 = self._test_base(False)
 
         self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1.flatten(), atol=self.atol))
+            np.allclose(res0.flatten(), res1.flatten(), atol=self.atol))
 
         self.assertTrue(res0.shape == res1.shape)
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py
index 5e652ce48334d..630a00f5a7d56 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py
@@ -24,6 +24,7 @@
                  "core is not compiled with IPU")
 @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel")
 class TestBase(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_data_feed()
@@ -50,6 +51,7 @@ def set_attrs(self):
         }
 
     def _test_optimizer(self, run_ipu=True):
+
         def exclude_fn(param):
             return param.name.endswith('.w_0')
 
@@ -62,13 +64,16 @@ def exclude_fn(param):
 
         with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
-                image = paddle.static.data(
-                    name='image', shape=[1, 3, 10, 10], dtype='float32')
+                image = paddle.static.data(name='image',
+                                           shape=[1, 3, 10, 10],
+                                           dtype='float32')
                 bias = paddle.fluid.layers.create_parameter(
                     shape=[1, 3, 10, 10], is_bias=True, dtype='float32')
                 add1 = image + bias
-                conv1 = paddle.static.nn.conv2d(
-                    add1, num_filters=3, filter_size=3, bias_attr=False)
+                conv1 = paddle.static.nn.conv2d(add1,
+                                                num_filters=3,
+                                                filter_size=3,
+                                                bias_attr=False)
 
                 loss = paddle.mean(conv1)
                 opt = paddle.optimizer.Lamb(
@@ -90,12 +95,11 @@ def exclude_fn(param):
                 fetch_list = [loss.name]
                 ipu_strategy = paddle.static.IpuStrategy()
                 ipu_strategy.set_graph_config(is_training=True)
-                ipu_strategy.set_options({
-                    'loss_scaling': self.attrs["loss_scaling"]
-                })
+                ipu_strategy.set_options(
+                    {'loss_scaling': self.attrs["loss_scaling"]})
                 program = paddle.static.IpuCompiledProgram(
-                    main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
-                                                                  fetch_list)
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
             else:
                 program = main_prog
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py
index 30e003917efbd..52e88119af0e9 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
 class TestWeightSharing(IPUOpTest):
+
     def setUp(self):
         self.set_atol()
         self.set_training()
@@ -52,8 +53,9 @@ def set_op_attrs(self):
 
     @IPUOpTest.static_graph
     def build_model(self):
-        x = paddle.static.data(
-            name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64')
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype='int64')
         with paddle.static.ipu_shard_guard(index=0, stage=0):
             y = paddle.fluid.layers.embedding(
                 input=x,
@@ -82,15 +84,15 @@ def run_model(self, run_ipu):
         exe.run(self.startup_prog)
         if run_ipu:
             ipu_strategy = paddle.static.IpuStrategy()
-            ipu_strategy.set_graph_config(
-                num_ipus=2,
-                is_training=self.is_training,
-                enable_manual_shard=True)
-            ipu_strategy.set_pipelining_config(
-                enable_pipelining=True, batches_per_step=3)
+            ipu_strategy.set_graph_config(num_ipus=2,
+                                          is_training=self.is_training,
+                                          enable_manual_shard=True)
+            ipu_strategy.set_pipelining_config(enable_pipelining=True,
+                                               batches_per_step=3)
             program = paddle.static.IpuCompiledProgram(
-                self.main_prog, ipu_strategy=ipu_strategy).compile(
-                    self.feed_list, self.fetch_list)
+                self.main_prog,
+                ipu_strategy=ipu_strategy).compile(self.feed_list,
+                                                   self.fetch_list)
         else:
             program = self.main_prog
 
@@ -103,8 +105,7 @@ def test_base(self):
         res1 = self.run_model(True)
 
         self.assertTrue(
-            np.allclose(
-                res0.flatten(), res1[0].flatten(), atol=self.atol))
+            np.allclose(res0.flatten(), res1[0].flatten(), atol=self.atol))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
index 3d80d92595b17..d34ee9380ead4 100644
--- a/python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
@@ -1,8 +1,13 @@
-file(GLOB TEST_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_IR_PASSES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_IR_PASSES "${TEST_IR_PASSES}")
 
-if(((NOT WITH_GPU) AND (NOT WITH_ROCM)) OR WIN32 OR APPLE)
-  LIST(REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group_pass)
+if(((NOT WITH_GPU) AND (NOT WITH_ROCM))
+   OR WIN32
+   OR APPLE)
+  list(REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group_pass)
 endif()
 
 foreach(target ${TEST_IR_PASSES})
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
index 4717dfa1eab52..3687d09653fd8 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -1,10 +1,19 @@
-file(GLOB TEST_INFERENCE_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_INFERENCE_IR_PASSES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_INFERENCE_IR_PASSES "${TEST_INFERENCE_IR_PASSES}")
 
-file(GLOB TEST_TRT_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_trt_*.py")
+file(
+  GLOB TEST_TRT_IR_PASSES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_trt_*.py")
 string(REPLACE ".py" "" TEST_TRT_IR_PASSES "${TEST_TRT_IR_PASSES}")
 
-file(GLOB TEST_TRT_CONVERTER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_trt_convert_*.py")
+file(
+  GLOB TEST_TRT_CONVERTER
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_trt_convert_*.py")
 string(REPLACE ".py" "" TEST_TRT_CONVERTER "${TEST_TRT_CONVERTER}")
 
 # Only for cpu(mkl + openblas)
@@ -27,7 +36,8 @@ if(WITH_GPU AND TENSORRT_FOUND)
 
   foreach(target ${TEST_TRT_IR_PASSES})
     if(${target} STREQUAL "test_trt_slice_dynamic_plugin")
-      if("${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}" VERSION_GREATER "7.1")
+      if("${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}" VERSION_GREATER
+         "7.1")
         py_test_modules(${target} MODULES ${target})
         set_tests_properties(${target} PROPERTIES TIMEOUT 60)
       endif()
@@ -42,7 +52,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
   endforeach()
 endif()
 
-file(GLOB TEST_MKLDNN_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_mkldnn_*.py")
+file(
+  GLOB TEST_MKLDNN_IR_PASSES
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_mkldnn_*.py")
 string(REPLACE ".py" "" TEST_MKLDNN_IR_PASSES "${TEST_MKLDNN_IR_PASSES}")
 foreach(TEST_INFERENCE_IR_PASS ${TEST_MKLDNN_IR_PASSES})
   list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_INFERENCE_IR_PASS})
@@ -54,95 +67,142 @@ if(WITH_MKLDNN)
   endforeach()
 endif()
 
-if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU)
+if(WITH_MKLDNN
+   AND TENSORRT_FOUND
+   AND WITH_GPU)
   foreach(target ${TEST_INFERENCE_IR_PASSES})
     py_test_modules(${target} MODULES ${target})
   endforeach()
 endif()
 
-if (NOT WITH_MKLDNN AND NOT TENSORRT_FOUND AND NOT WITH_GPU)
+if(NOT WITH_MKLDNN
+   AND NOT TENSORRT_FOUND
+   AND NOT WITH_GPU)
   foreach(target ${TEST_INFERENCE_CPU_UT})
     py_test_modules(${target} MODULES ${target})
   endforeach()
 
-set_tests_properties(test_mul_lstm_fuse_pass PROPERTIES TIMEOUT 300)
-set_tests_properties(test_mul_gru_fuse_pass PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_mul_lstm_fuse_pass PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_mul_gru_fuse_pass PROPERTIES TIMEOUT 300)
 endif()
 
 if(WITH_GPU AND TENSORRT_FOUND)
-set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120)
-set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120)
-set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120)
-#set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200)
-set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120)
-set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60)
-if(WITH_NV_JETSON)
-  set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450)
-  set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450)
-else()
-  set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 300)
-  set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45)
-endif()
-set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60)
-set_tests_properties(test_trt_tile_op PROPERTIES TIMEOUT 60)
-set_tests_properties(test_trt_fc_fuse_quant_dequant_pass PROPERTIES TIMEOUT 100)
-set_tests_properties(test_trt_conv_quant_dequant_pass PROPERTIES TIMEOUT 100)
-set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100)
-set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60)
-set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60)
-set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30)
-set_tests_properties(test_trt_multiclass_nms3_op PROPERTIES TIMEOUT 60 ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) 
-
-if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU)
-  set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 240)
-  set_tests_properties(test_simplify_with_basic_ops_pass_autoscan PROPERTIES TIMEOUT 60)
-  set_tests_properties(test_adaptive_pool2d_convert_global_pass_autoscan PROPERTIES TIMEOUT 100)
-  set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_conv_elementwise_add2_act_fuse_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_conv_elementwise_add_act_fuse_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
-  set_tests_properties(test_squeeze2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
-  set_tests_properties(test_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
-  set_tests_properties(test_trt_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
-  set_tests_properties(test_trt_squeeze2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
-  set_tests_properties(test_trt_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
-  set_tests_properties(test_shuffle_channel_detect_pass PROPERTIES TIMEOUT 120)
-  if (WIN32)
-    set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_map_matmul_v2_to_matmul_pass PROPERTIES TIMEOUT 360)
-    set_tests_properties(test_map_matmul_v2_to_mul_pass PROPERTIES TIMEOUT 360)
-    set_tests_properties(test_map_matmul_to_mul_pass PROPERTIES TIMEOUT 360)
-  else ()
-    set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 60)
-    set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT 60)
-    set_tests_properties(test_map_matmul_v2_to_matmul_pass PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_map_matmul_v2_to_mul_pass PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_map_matmul_to_mul_pass PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120)
+  #set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60)
+  if(WITH_NV_JETSON)
+    set_tests_properties(
+      test_trt_pool_op
+      PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT
+                 450)
+    set_tests_properties(
+      test_trt_pool3d_op
+      PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT
+                 450)
+  else()
+    set_tests_properties(
+      test_trt_pool_op
+      PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT
+                 300)
+    set_tests_properties(
+      test_trt_pool3d_op
+      PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45)
+  endif()
+  set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60)
+  set_tests_properties(test_trt_tile_op PROPERTIES TIMEOUT 60)
+  set_tests_properties(test_trt_fc_fuse_quant_dequant_pass PROPERTIES TIMEOUT
+                                                                      100)
+  set_tests_properties(test_trt_conv_quant_dequant_pass PROPERTIES TIMEOUT 100)
+  set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100)
+  set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60)
+  set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60)
+  set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30)
+  set_tests_properties(
+    test_trt_multiclass_nms3_op PROPERTIES TIMEOUT 60 ENVIRONMENT
+                                           FLAGS_USE_STANDALONE_EXECUTOR=0)
+
+  if(WITH_MKLDNN
+     AND TENSORRT_FOUND
+     AND WITH_GPU)
+    set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT
+                                                                         120)
+    set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 240)
+    set_tests_properties(test_simplify_with_basic_ops_pass_autoscan
+                         PROPERTIES TIMEOUT 60)
+    set_tests_properties(test_adaptive_pool2d_convert_global_pass_autoscan
+                         PROPERTIES TIMEOUT 100)
+    set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_conv_elementwise_add2_act_fuse_pass
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_conv_elementwise_add_act_fuse_pass
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
+    set_tests_properties(test_squeeze2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
+    set_tests_properties(test_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
+    set_tests_properties(test_trt_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT
+                                                                       240)
+    set_tests_properties(test_trt_squeeze2_matmul_fuse_pass PROPERTIES TIMEOUT
+                                                                       240)
+    set_tests_properties(test_trt_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT
+                                                                       240)
+    set_tests_properties(test_shuffle_channel_detect_pass PROPERTIES TIMEOUT
+                                                                     120)
+    if(WIN32)
+      set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 300)
+      set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT
+                                                                     300)
+      set_tests_properties(test_map_matmul_v2_to_matmul_pass PROPERTIES TIMEOUT
+                                                                        360)
+      set_tests_properties(test_map_matmul_v2_to_mul_pass PROPERTIES TIMEOUT
+                                                                     360)
+      set_tests_properties(test_map_matmul_to_mul_pass PROPERTIES TIMEOUT 360)
+    else()
+      set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 60)
+      set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT 60)
+      set_tests_properties(test_map_matmul_v2_to_matmul_pass PROPERTIES TIMEOUT
+                                                                        120)
+      set_tests_properties(test_map_matmul_v2_to_mul_pass PROPERTIES TIMEOUT
+                                                                     120)
+      set_tests_properties(test_map_matmul_to_mul_pass PROPERTIES TIMEOUT 120)
+    endif()
   endif()
-endif()
 
-if (WITH_MKLDNN)
-  set_tests_properties(test_mkldnn_conv_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_mkldnn_depthwise_conv_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_mkldnn_reshape_transpose_matmul_fuse_pass PROPERTIES TIMEOUT 100)
-  set_tests_properties(test_mkldnn_mish_op PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_conv3d_op PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_prelu_op PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_conv_transpose_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 250)
-  set_tests_properties(test_mkldnn_matmul_transpose_reshape_fuse_pass PROPERTIES TIMEOUT 100)
-  set_tests_properties(test_conv_transpose_bn_fuse_pass PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_conv_hard_sigmoid_fuse_pass PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_conv_hard_swish_fuse_pass PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_batch_norm_act_fuse_pass PROPERTIES TIMEOUT 100)
-  set_tests_properties(test_mkldnn_matmul_v2_transpose_reshape_fuse_pass PROPERTIES TIMEOUT 100)
-  set_tests_properties(test_mkldnn_conv_transpose_bias_fuse_pass PROPERTIES TIMEOUT 100)
-  set_tests_properties(test_conv_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_fc_mish_fuse_pass PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_mkldnn_fc_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass PROPERTIES TIMEOUT 60)
-endif()
+  if(WITH_MKLDNN)
+    set_tests_properties(test_mkldnn_conv_elementwise_add_fuse_pass
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_mkldnn_depthwise_conv_pass PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_mkldnn_reshape_transpose_matmul_fuse_pass
+                         PROPERTIES TIMEOUT 100)
+    set_tests_properties(test_mkldnn_mish_op PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_mkldnn_conv3d_op PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_mkldnn_prelu_op PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_conv_transpose_eltwiseadd_bn_fuse_pass
+                         PROPERTIES TIMEOUT 250)
+    set_tests_properties(test_mkldnn_matmul_transpose_reshape_fuse_pass
+                         PROPERTIES TIMEOUT 100)
+    set_tests_properties(test_conv_transpose_bn_fuse_pass PROPERTIES TIMEOUT
+                                                                     300)
+    set_tests_properties(test_mkldnn_conv_hard_sigmoid_fuse_pass
+                         PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_mkldnn_conv_hard_swish_fuse_pass
+                         PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_mkldnn_batch_norm_act_fuse_pass PROPERTIES TIMEOUT
+                                                                         100)
+    set_tests_properties(test_mkldnn_matmul_v2_transpose_reshape_fuse_pass
+                         PROPERTIES TIMEOUT 100)
+    set_tests_properties(test_mkldnn_conv_transpose_bias_fuse_pass
+                         PROPERTIES TIMEOUT 100)
+    set_tests_properties(test_conv_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT
+                                                                      300)
+    set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_mkldnn_fc_mish_fuse_pass PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_mkldnn_fc_elementwise_add_fuse_pass
+                         PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass
+                         PROPERTIES TIMEOUT 60)
+  endif()
 endif()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
index 161c785ef8565..1676763a6d82f 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
@@ -36,22 +36,20 @@
 
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 
-settings.register_profile(
-    "ci",
-    max_examples=100,
-    suppress_health_check=hypothesis.HealthCheck.all(),
-    deadline=None,
-    print_blob=True,
-    derandomize=True,
-    report_multiple_bugs=False)
-settings.register_profile(
-    "dev",
-    max_examples=1000,
-    suppress_health_check=hypothesis.HealthCheck.all(),
-    deadline=None,
-    print_blob=True,
-    derandomize=True,
-    report_multiple_bugs=False)
+settings.register_profile("ci",
+                          max_examples=100,
+                          suppress_health_check=hypothesis.HealthCheck.all(),
+                          deadline=None,
+                          print_blob=True,
+                          derandomize=True,
+                          report_multiple_bugs=False)
+settings.register_profile("dev",
+                          max_examples=1000,
+                          suppress_health_check=hypothesis.HealthCheck.all(),
+                          deadline=None,
+                          print_blob=True,
+                          derandomize=True,
+                          report_multiple_bugs=False)
 if float(os.getenv('TEST_NUM_PERCENT_CASES', default='1.0')) < 1 or \
     os.getenv('HYPOTHESIS_TEST_PROFILE', 'dev') == 'ci':
     settings.load_profile("ci")
@@ -75,6 +73,7 @@ class IgnoreReasons(enum.Enum):
 
 
 class AutoScanTest(unittest.TestCase):
+
     def __init__(self, *args, **kwargs):
         np.random.seed(1024)
         paddle.enable_static()
@@ -102,11 +101,9 @@ def sample_predictor_configs(self):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def add_ignore_check_case(
-            self,
-            teller: [Callable[[ProgramConfig, paddle_infer.Config], bool]],
-            reason: IgnoreReasons,
-            note: str):
+    def add_ignore_check_case(self, teller: [
+        Callable[[ProgramConfig, paddle_infer.Config], bool]
+    ], reason: IgnoreReasons, note: str):
         self.ignore_cases.append((teller, reason, note))
 
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
@@ -134,9 +131,7 @@ def run_test_config(self, model, params, prog_config, pred_config,
         return result
 
     @abc.abstractmethod
-    def assert_tensors_near(self,
-                            atol: float,
-                            rtol: float,
+    def assert_tensors_near(self, atol: float, rtol: float,
                             tensor: Dict[str, np.array],
                             baseline: Dict[str, np.array]):
         for key, arr in tensor.items():
@@ -146,8 +141,7 @@ def assert_tensors_near(self,
                 str(baseline[key].shape) + ', but got ' + str(arr.shape))
             diff = abs(baseline[key] - arr)
             self.assertTrue(
-                np.allclose(
-                    baseline[key], arr, atol=atol, rtol=rtol),
+                np.allclose(baseline[key], arr, atol=atol, rtol=rtol),
                 "Output has diff, Maximum absolute error: {}".format(
                     np.amax(diff)))
 
@@ -155,17 +149,16 @@ def assert_tensors_near(self,
     def run_test(self, quant=False):
         raise NotImplementedError
 
-    def generate_op_config(self,
-                           ops_config: List[Dict[str, Any]]) -> List[OpConfig]:
+    def generate_op_config(self, ops_config: List[Dict[str,
+                                                       Any]]) -> List[OpConfig]:
         ops = []
         for i in range(len(ops_config)):
             op_config = ops_config[i]
             ops.append(
-                OpConfig(
-                    type=op_config['op_type'],
-                    inputs=op_config['op_inputs'],
-                    outputs=op_config['op_outputs'],
-                    attrs=op_config['op_attrs']))
+                OpConfig(type=op_config['op_type'],
+                         inputs=op_config['op_inputs'],
+                         outputs=op_config['op_outputs'],
+                         attrs=op_config['op_attrs']))
         return ops
 
     @abc.abstractmethod
@@ -182,10 +175,10 @@ def success_log(self, msg: str):
 
     @abc.abstractmethod
     def create_inference_config(self,
-                                passes: Optional[List[str]]=None,
-                                use_gpu: bool=False,
-                                use_mkldnn: bool=False,
-                                ir_optim: Optional[bool]=None):
+                                passes: Optional[List[str]] = None,
+                                use_gpu: bool = False,
+                                use_mkldnn: bool = False,
+                                ir_optim: Optional[bool] = None):
         config = paddle_infer.Config()
         config.switch_ir_debug(True)
         config.set_optim_cache_dir(self.cache_dir)
@@ -203,6 +196,7 @@ def create_inference_config(self,
 
 
 class MkldnnAutoScanTest(AutoScanTest):
+
     def __init__(self, *args, **kwargs):
         super(MkldnnAutoScanTest, self).__init__(*args, **kwargs)
 
@@ -243,10 +237,10 @@ def run_test(self, quant=False, *args, **kwargs):
                         ignore_flag = True
                         if ignore_info[
                                 1] == IgnoreReasons.MKLDNN_ACCURACY_ERROR:
-                            self.ignore_log("[MKLDNN_ACCURACY_ERROR] " +
-                                            ignore_info[2] + ' ' + ' vs ' +
-                                            self.inference_config_str(
-                                                pred_config))
+                            self.ignore_log(
+                                "[MKLDNN_ACCURACY_ERROR] " + ignore_info[2] +
+                                ' ' + ' vs ' +
+                                self.inference_config_str(pred_config))
                         else:
                             raise NotImplementedError
                         break
@@ -269,8 +263,9 @@ def run_test(self, quant=False, *args, **kwargs):
                     if not ignore_flag:
                         status = False
                     continue
-                self.success_log('RUN predictor_config ' + self.
-                                 inference_config_str(pred_config) + ' done')
+                self.success_log('RUN predictor_config ' +
+                                 self.inference_config_str(pred_config) +
+                                 ' done')
 
         self.assertTrue(status)
 
@@ -284,6 +279,7 @@ def inference_config_str(self, config) -> str:
 
 
 class PassAutoScanTest(AutoScanTest):
+
     def __init__(self, *args, **kwargs):
         super(PassAutoScanTest, self).__init__(*args, **kwargs)
         self.passes = []
@@ -309,8 +305,8 @@ def assert_op_list(self, op_list_after_fusion):
                                            self.passes[-1] + ".pdmodel")
         if not os.path.exists(last_passed_program):
             raise ValueError(
-                "Cannot find file {}, please make sure that your pass name is correct".
-                format(last_passed_program))
+                "Cannot find file {}, please make sure that your pass name is correct"
+                .format(last_passed_program))
         model_bytes = paddle.static.load_from_file(last_passed_program)
         pg = paddle.static.deserialize_program(model_bytes)
         main_block = pg.desc.block(0)
@@ -322,7 +318,8 @@ def assert_op_list(self, op_list_after_fusion):
         self.assertTrue(
             op_list_after_fusion == after_op_list,
             "Expected operator list after fusion is {}, but now it's {}".format(
-                op_list_after_fusion, after_op_list), )
+                op_list_after_fusion, after_op_list),
+        )
 
     def run_and_statis(self,
                        quant=False,
@@ -344,7 +341,8 @@ def run_and_statis(self,
             deadline=None,
             print_blob=True,
             derandomize=True,
-            report_multiple_bugs=False, )
+            report_multiple_bugs=False,
+        )
         settings.load_profile("ci")
         assert passes is not None, "Parameter of passes must be defined in function run_and_statis."
         self.passes = passes
@@ -372,8 +370,8 @@ def run_test(prog_config):
         logging.info("Number of Ran Programs: {}".format(self.num_ran_programs))
         logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests))
         successful_ran_programs = int(self.num_ran_programs -
-                                      self.num_ignore_tests / max(
-                                          self.num_predictor_kinds, 1))
+                                      self.num_ignore_tests /
+                                      max(self.num_predictor_kinds, 1))
         logging.info(
             "Number of successfully ran programs approximately equal to {}".
             format(successful_ran_programs))
@@ -382,14 +380,14 @@ def run_test(prog_config):
                 "satisfied_programs = ran_programs - num_ignore_tests / num_predictor_kinds"
             )
             logging.error(
-                "At least {} programs need to ran successfully, but now only about {} programs satisfied.".
-                format(min_success_num, successful_ran_programs))
+                "At least {} programs need to ran successfully, but now only about {} programs satisfied."
+                .format(min_success_num, successful_ran_programs))
             assert False
         used_time = time.time() - start_time
         if max_duration > 0 and used_time > max_duration:
             logging.error(
-                "The duration exceeds {} seconds, if this is necessary, try to set a larger number for parameter `max_duration`.".
-                format(max_duration))
+                "The duration exceeds {} seconds, if this is necessary, try to set a larger number for parameter `max_duration`."
+                .format(max_duration))
             assert False
 
     def run_test(self, quant=False, prog_configs=None):
@@ -425,10 +423,10 @@ def run_test(self, quant=False, prog_configs=None):
                         ignore_flag = True
                         self.num_ignore_tests += 1
                         if ignore_info[1] == IgnoreReasons.PASS_ACCURACY_ERROR:
-                            self.ignore_log("[PASS_ACCURACY_ERROR] " +
-                                            ignore_info[2] + ' ' + ' vs ' +
-                                            self.inference_config_str(
-                                                pred_config))
+                            self.ignore_log(
+                                "[PASS_ACCURACY_ERROR] " + ignore_info[2] +
+                                ' ' + ' vs ' +
+                                self.inference_config_str(pred_config))
                         else:
                             raise NotImplementedError
                         break
@@ -443,17 +441,19 @@ def run_test(self, quant=False, prog_configs=None):
                     ir_optim=False, use_gpu=pred_config.use_gpu())
                 try:
                     # baseline
-                    base_result = self.run_test_config(
-                        model, params, prog_config, base_config, feed_data)
+                    base_result = self.run_test_config(model, params,
+                                                       prog_config, base_config,
+                                                       feed_data)
                     self.success_log('RUN_BASELINE ' +
-                                     self.inference_config_str(
-                                         base_config) + ' done')
+                                     self.inference_config_str(base_config) +
+                                     ' done')
 
                     if os.path.exists(self.cache_dir):
                         shutil.rmtree(self.cache_dir)
 
-                    pred_result = self.run_test_config(
-                        model, params, prog_config, pred_config, feed_data)
+                    pred_result = self.run_test_config(model, params,
+                                                       prog_config, pred_config,
+                                                       feed_data)
                     self.assert_tensors_near(atol, rtol, pred_result,
                                              base_result)
                     if not ignore_flag:
@@ -466,8 +466,9 @@ def run_test(self, quant=False, prog_configs=None):
                     if not ignore_flag:
                         status = False
                     continue
-                self.success_log('RUN predictor_config ' + self.
-                                 inference_config_str(pred_config) + ' done')
+                self.success_log('RUN predictor_config ' +
+                                 self.inference_config_str(pred_config) +
+                                 ' done')
 
         status = self.check_op_version() and status
         self.assertTrue(status)
@@ -502,6 +503,7 @@ def create_trt_inference_config(self) -> paddle_infer.Config:
 
 
 class TrtLayerAutoScanTest(AutoScanTest):
+
     class TensorRTParam:
         '''
         TensorRT subgraph engine parameters. 
@@ -539,8 +541,7 @@ def __init__(self, *args, **kwargs):
             use_calib_mode=False)
         self.dynamic_shape = self.DynamicShapeParam({}, {}, {}, False)
         self.num_percent_cases = float(
-            os.getenv(
-                'TEST_NUM_PERCENT_CASES', default='1.0'))
+            os.getenv('TEST_NUM_PERCENT_CASES', default='1.0'))
         # Choose different tests by week
         np.random.seed(int(time.strftime("%W")))
 
@@ -582,12 +583,14 @@ def assert_op_size(self, trt_engine_num, paddle_op_num):
         ]
         trt_engine_size = sum(op_types)
         paddle_op_size = op_size - trt_engine_size
-        self.assertTrue(trt_engine_size == trt_engine_num,
-                        'trt_engine_num is {}, but got {}!'.format(
-                            trt_engine_size, trt_engine_num))
-        self.assertTrue(paddle_op_size == paddle_op_num,
-                        'paddle_op_num is {}, but got {}!'.format(
-                            paddle_op_size, paddle_op_num))
+        self.assertTrue(
+            trt_engine_size == trt_engine_num,
+            'trt_engine_num is {}, but got {}!'.format(trt_engine_size,
+                                                       trt_engine_num))
+        self.assertTrue(
+            paddle_op_size == paddle_op_num,
+            'paddle_op_num is {}, but got {}!'.format(paddle_op_size,
+                                                      paddle_op_num))
 
     def inference_config_str(self, config: paddle_infer.Config) -> str:
         dic = {}
@@ -651,8 +654,8 @@ def run_test(self, quant=False, *args, **kwargs):
                 if isinstance(threshold, float):
                     atol = threshold
                     rtol = 1e-8
-                elif isinstance(threshold, list) or isinstance(threshold,
-                                                               tuple):
+                elif isinstance(threshold, list) or isinstance(
+                        threshold, tuple):
                     atol = threshold[0]
                     rtol = threshold[1]
                 else:
@@ -670,14 +673,14 @@ def run_test(self, quant=False, *args, **kwargs):
                     if ignore_info[0](prog_config, pred_config):
                         ignore_flag = True
                         if ignore_info[1] == IgnoreReasons.TRT_NOT_IMPLEMENTED:
-                            self.ignore_log("[TRT_NOT_IMPLEMENTED] " +
-                                            ignore_info[2] + ' ' + ' vs ' +
-                                            self.inference_config_str(
-                                                pred_config))
+                            self.ignore_log(
+                                "[TRT_NOT_IMPLEMENTED] " + ignore_info[2] +
+                                ' ' + ' vs ' +
+                                self.inference_config_str(pred_config))
                         elif ignore_info[1] == IgnoreReasons.TRT_NOT_SUPPORT:
-                            self.ignore_log("[TRT_NOT_SUPPORT] " + ignore_info[
-                                2] + ' ' + ' vs ' + self.inference_config_str(
-                                    pred_config))
+                            self.ignore_log(
+                                "[TRT_NOT_SUPPORT] " + ignore_info[2] + ' ' +
+                                ' vs ' + self.inference_config_str(pred_config))
                         else:
                             raise NotImplementedError
                         break
@@ -702,15 +705,14 @@ def run_test(self, quant=False, *args, **kwargs):
                     if not ignore_flag:
                         status = False
                     continue
-                self.success_log('RUN predictor_config ' + self.
-                                 inference_config_str(pred_config) + ' done')
+                self.success_log('RUN predictor_config ' +
+                                 self.inference_config_str(pred_config) +
+                                 ' done')
 
         self.assertTrue(status)
 
     # TODO(wilber): just for backward compatible
-    def add_skip_case(
-            self,
-            teller: [Callable[[ProgramConfig, paddle_infer.Config], bool]],
-            reason: IgnoreReasons,
-            note: str):
+    def add_skip_case(self, teller: [
+        Callable[[ProgramConfig, paddle_infer.Config], bool]
+    ], reason: IgnoreReasons, note: str):
         self.ignore_cases.append((teller, reason, note))
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
index 20d9b9d972d8f..91c7a8963c4f6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
@@ -30,6 +30,7 @@
 
 
 class InferencePassTest(unittest.TestCase):
+
     def __init__(self, methodName='runTest'):
         paddle.enable_static()
         super(InferencePassTest, self).__init__(methodName)
@@ -42,7 +43,7 @@ def __init__(self, methodName='runTest'):
         self.enable_mkldnn = False
         self.enable_mkldnn_bfloat16 = False
         self.enable_trt = False
-        self.enable_tensorrt_oss = True
+        self.enable_tensorrt_varseqlen = True
         self.trt_parameters = None
         self.dynamic_shape_params = None
         self.enable_lite = False
@@ -57,8 +58,8 @@ def _get_place(self):
     def _save_models(self, dirname, feeded_var_names, target_vars, executor,
                      program, scope):
         with fluid.scope_guard(scope):
-            # save models as combined to ensure that 
-            # there won't be too many useless files 
+            # save models as combined to ensure that
+            # there won't be too many useless files
             # after finishing a couple of tests.
             fluid.io.save_inference_model(dirname, feeded_var_names,
                                           target_vars, executor, program)
@@ -134,8 +135,8 @@ def _get_analysis_config(self,
                         self.dynamic_shape_params.max_input_shape,
                         self.dynamic_shape_params.optim_input_shape,
                         self.dynamic_shape_params.disable_trt_plugin_fp16)
-                if self.enable_tensorrt_oss:
-                    config.enable_tensorrt_oss()
+                if self.enable_tensorrt_varseqlen:
+                    config.enable_tensorrt_varseqlen()
 
         elif use_mkldnn:
             config.enable_mkldnn()
@@ -173,18 +174,17 @@ def check_output_with_option(self,
         device = "GPU" if use_gpu else "CPU"
         with fluid.scope_guard(scope):
             executor.run(self.startup_program)
-        self._save_models(self.path,
-                          list(self.feeds.keys()), self.fetch_list, executor,
-                          self.main_program, scope)
+        self._save_models(self.path, list(self.feeds.keys()), self.fetch_list,
+                          executor, self.main_program, scope)
         paddle_outs = self._get_paddle_outs(executor, self.main_program, scope)
         inference_outs = self._get_inference_outs(
             self._get_analysis_config(use_gpu=use_gpu))
 
-        # Check whether the results calculated on CPU and on GPU are the same. 
+        # Check whether the results calculated on CPU and on GPU are the same.
         self.assertTrue(
             len(paddle_outs) == len(inference_outs),
-            "The number of outputs is different between inference and training forward at {}".
-            format(device))
+            "The number of outputs is different between inference and training forward at {}"
+            .format(device))
 
         for out, inference_out in zip(paddle_outs, inference_outs):
             paddle_out = np.array(out)
@@ -193,22 +193,21 @@ def check_output_with_option(self,
                 inference_out = inference_out.flatten()
 
             self.assertTrue(
-                np.allclose(
-                    paddle_out, inference_out, atol=atol),
+                np.allclose(paddle_out, inference_out, atol=atol),
                 "Output has diff between inference and training forward at {} ".
                 format(device))
 
-        # Check whether the trt results and the GPU results are the same. 
+        # Check whether the trt results and the GPU results are the same.
         if use_gpu and self.enable_trt:
             tensorrt_outputs = self._get_inference_outs(
-                self._get_analysis_config(
-                    use_gpu=use_gpu, use_trt=self.enable_trt))
+                self._get_analysis_config(use_gpu=use_gpu,
+                                          use_trt=self.enable_trt))
 
             if self.trt_parameters.use_static:
                 #deserialize
                 tensorrt_outputs = self._get_inference_outs(
-                    self._get_analysis_config(
-                        use_gpu=use_gpu, use_trt=self.enable_trt))
+                    self._get_analysis_config(use_gpu=use_gpu,
+                                              use_trt=self.enable_trt))
 
             self.assertTrue(
                 len(tensorrt_outputs) == len(paddle_outs),
@@ -222,15 +221,17 @@ def check_output_with_option(self,
                     tensorrt_output = tensorrt_output.flatten()
 
                 self.assertTrue(
-                    np.allclose(
-                        paddle_out, tensorrt_output, rtol=rtol, atol=atol),
+                    np.allclose(paddle_out,
+                                tensorrt_output,
+                                rtol=rtol,
+                                atol=atol),
                     "Output has diff between GPU and TensorRT. ")
 
-        # Check whether the mkldnn results and the CPU results are the same. 
+        # Check whether the mkldnn results and the CPU results are the same.
         if (not use_gpu) and self.enable_mkldnn:
             mkldnn_outputs = self._get_inference_outs(
-                self._get_analysis_config(
-                    use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn))
+                self._get_analysis_config(use_gpu=use_gpu,
+                                          use_mkldnn=self.enable_mkldnn))
 
             self.assertTrue(
                 len(paddle_outs) == len(mkldnn_outputs),
@@ -240,8 +241,7 @@ def check_output_with_option(self,
                 atol = 0.01
             for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs):
                 self.assertTrue(
-                    np.allclose(
-                        np.array(paddle_out), mkldnn_output, atol=atol),
+                    np.allclose(np.array(paddle_out), mkldnn_output, atol=atol),
                     "Output has diff between CPU and MKLDNN. ")
 
     class TensorRTParam:
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
index a8c43daab731b..e634dd3dca51d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py
@@ -34,9 +34,9 @@ class TensorConfig:
     '''
 
     def __init__(self,
-                 lod: Optional[List[List[int]]]=None,
-                 data_gen: Optional[Callable[..., np.array]]=None,
-                 shape: Optional[List[List[int]]]=None):
+                 lod: Optional[List[List[int]]] = None,
+                 data_gen: Optional[Callable[..., np.array]] = None,
+                 shape: Optional[List[List[int]]] = None):
         '''
         shape: The shape of the tensor.
         dtype: The data type of the tensor.
@@ -71,9 +71,9 @@ def __init__(self,
                  type: str,
                  inputs: Dict[str, List[str]],
                  outputs: Dict[str, List[str]],
-                 attrs: Dict[str, Any]=None,
-                 outputs_var_type: Dict[str, VarType]=None,
-                 outputs_dtype: Dict[str, np.dtype]=None,
+                 attrs: Dict[str, Any] = None,
+                 outputs_var_type: Dict[str, VarType] = None,
+                 outputs_dtype: Dict[str, np.dtype] = None,
                  **kwargs):
         self.type = type
         self.inputs = inputs
@@ -109,9 +109,9 @@ class BlockConfig:
     def __init__(self,
                  ops: List[OpConfig],
                  vars: List[str],
-                 vars_dtype: Dict[str, np.dtype]=None,
-                 vars_var_type: Dict[str, VarType]=None,
-                 vars_lod_level: Dict[str, int]=None):
+                 vars_dtype: Dict[str, np.dtype] = None,
+                 vars_var_type: Dict[str, VarType] = None,
+                 vars_lod_level: Dict[str, int] = None):
         self.ops = ops
         self.vars = vars
         self.vars_dtype = vars_dtype
@@ -165,8 +165,8 @@ def fill_block_desc(self, block_desc):
                     if op_config.outputs_dtype is not None and v in op_config.outputs_dtype.keys(
                     ):
                         var_desc.set_dtype(
-                            convert_np_dtype_to_dtype_(op_config.outputs_dtype[
-                                v]))
+                            convert_np_dtype_to_dtype_(
+                                op_config.outputs_dtype[v]))
             if op_config.type not in _OP_WITHOUT_KERNEL_SET:
                 op_desc.infer_var_type(block_desc)
                 op_desc.infer_shape(block_desc)
@@ -176,11 +176,8 @@ def fill_block_desc(self, block_desc):
 class ProgramConfig:
     '''  A config builder for generating a Program.  '''
 
-    def __init__(self,
-                 ops: List[OpConfig],
-                 weights: Dict[str, TensorConfig],
-                 inputs: Dict[str, TensorConfig],
-                 outputs: List[str]):
+    def __init__(self, ops: List[OpConfig], weights: Dict[str, TensorConfig],
+                 inputs: Dict[str, TensorConfig], outputs: List[str]):
         self.ops = ops
         # if no weight need to save, we create a place_holder to help seriazlie params.
         if not weights:
@@ -260,12 +257,13 @@ def create_fake_model(program_config):
     out_var = util_program.global_block().create_var(
         type=core.VarDesc.VarType.RAW, name="out_var_0")
     out_var.desc.set_persistable(True)
-    util_program.global_block().append_op(
-        type='save_combine',
-        inputs={'X': in_vars},
-        outputs={'Y': out_var},
-        attrs={'file_path': '',
-               'save_to_memory': True})
+    util_program.global_block().append_op(type='save_combine',
+                                          inputs={'X': in_vars},
+                                          outputs={'Y': out_var},
+                                          attrs={
+                                              'file_path': '',
+                                              'save_to_memory': True
+                                          })
     for op_config in program_config.ops:
         op_desc = main_block_desc.append_op()
         op_desc.set_type(op_config.type)
@@ -337,11 +335,10 @@ def create_quant_model(model,
     scope = global_scope()
     exe = paddle.static.Executor(place)
     [inference_program, feed_target_names,
-     fetch_targets] = paddle.static.load_inference_model(
-         path_prefix=None,
-         executor=exe,
-         model_filename=model,
-         params_filename=params)
+     fetch_targets] = paddle.static.load_inference_model(path_prefix=None,
+                                                         executor=exe,
+                                                         model_filename=model,
+                                                         params_filename=params)
     graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
 
     out_scale_op_list = [
@@ -489,18 +486,18 @@ def _get_op_output_var_names(op):
             tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)
 
     if save:
-        fluid.io.save_inference_model(
-            'test_inference_model',
-            feed_target_names,
-            fetch_targets,
-            exe,
-            main_program=main_program)
+        fluid.io.save_inference_model('test_inference_model',
+                                      feed_target_names,
+                                      fetch_targets,
+                                      exe,
+                                      main_program=main_program)
 
     feed_vars = [
         main_program.global_block().var(name) for name in feed_target_names
     ]
-    serialized_program = paddle.static.serialize_program(
-        feed_vars, fetch_targets, program=main_program)
+    serialized_program = paddle.static.serialize_program(feed_vars,
+                                                         fetch_targets,
+                                                         program=main_program)
     serialized_params = paddle.static.serialize_persistables(
         feed_vars, fetch_targets, executor=exe, program=main_program)
     return serialized_program, serialized_params
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py
index 1ca7799963bf8..b42a54e5efee4 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,6 +34,7 @@
 
 
 class QuantDequantTest(unittest.TestCase):
+
     def __init__(self, methodName='runTest'):
         super(QuantDequantTest, self).__init__(methodName)
         paddle.enable_static()
@@ -46,7 +47,7 @@ def __init__(self, methodName='runTest'):
         self.enable_mkldnn = False
         self.enable_mkldnn_bfloat16 = False
         self.enable_trt = False
-        self.enable_tensorrt_oss = True
+        self.enable_tensorrt_varseqlen = True
         self.trt_parameters = None
         self.dynamic_shape_params = None
         self.enable_lite = False
@@ -58,7 +59,7 @@ def __init__(self, methodName='runTest'):
         np.random.seed(1)
         random.seed(1)
 
-    # from Paddle release2.1 
+    # from Paddle release2.1
     def _normalize_program(self, program, feed_vars, fetch_vars):
         if not isinstance(program, Program):
             raise TypeError(
@@ -111,13 +112,12 @@ def _normalize_program(self, program, feed_vars, fetch_vars):
     def _save_models(self, dirname, feeded_var_names, target_vars, executor,
                      program, scope):
         with fluid.scope_guard(scope):
-            fluid.io.save_inference_model(
-                dirname,
-                feeded_var_names,
-                target_vars,
-                executor,
-                program,
-                clip_extra=True)
+            fluid.io.save_inference_model(dirname,
+                                          feeded_var_names,
+                                          target_vars,
+                                          executor,
+                                          program,
+                                          clip_extra=True)
 
     def _get_paddle_outs(self, feed, fetch_list, executor, program, scope):
         '''
@@ -184,8 +184,8 @@ def _get_analysis_config(self,
                         self.dynamic_shape_params.max_input_shape,
                         self.dynamic_shape_params.optim_input_shape,
                         self.dynamic_shape_params.disable_trt_plugin_fp16)
-                if self.enable_tensorrt_oss:
-                    config.enable_tensorrt_oss()
+                if self.enable_tensorrt_varseqlen:
+                    config.enable_tensorrt_varseqlen()
 
         elif use_mkldnn:
             config.enable_mkldnn()
@@ -214,8 +214,8 @@ def check_output_with_option(self,
             executor.run(self.startup_program)
             executor.run(self.test_startup_program)
         main_graph = IrGraph(core.Graph(self.main_program.desc), for_test=False)
-        test_graph = IrGraph(
-            core.Graph(self.test_main_program.desc), for_test=True)
+        test_graph = IrGraph(core.Graph(self.test_main_program.desc),
+                             for_test=True)
 
         transform_pass = QuantizationTransformPass(
             scope=scope,
@@ -240,12 +240,11 @@ def check_output_with_option(self,
 
         iters = 10
         batch_size = 1
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
-        feeder = fluid.DataFeeder(
-            feed_list=[self.data, self.label], place=place)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+                                    batch_size=batch_size)
+        feeder = fluid.DataFeeder(feed_list=[self.data, self.label],
+                                  place=place)
         with fluid.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
@@ -266,23 +265,23 @@ def check_output_with_option(self,
         self.main_program = test_graph.to_program()
 
         with fluid.scope_guard(scope):
-            self.main_program = self._normalize_program(
-                self.main_program, self.data, self.fetch_list)
+            self.main_program = self._normalize_program(self.main_program,
+                                                        self.data,
+                                                        self.fetch_list)
 
-        self._save_models(self.path,
-                          list(self.feeds.keys()), self.fetch_list, executor,
-                          self.main_program, scope)
+        self._save_models(self.path, list(self.feeds.keys()), self.fetch_list,
+                          executor, self.main_program, scope)
 
         paddle_outs = self._get_paddle_outs(self.feeds, self.fetch_list,
                                             executor, self.main_program, scope)
         inference_outs = self._get_inference_outs(
             self._get_analysis_config(use_gpu=use_gpu))
 
-        # Check whether the results calculated on CPU and on GPU are the same. 
+        # Check whether the results calculated on CPU and on GPU are the same.
         self.assertTrue(
             len(paddle_outs) == len(inference_outs),
-            "The number of outputs is different between inference and training forward at {}".
-            format(device))
+            "The number of outputs is different between inference and training forward at {}"
+            .format(device))
 
         for out, inference_out in zip(paddle_outs, inference_outs):
             paddle_out = np.array(out)
@@ -292,22 +291,21 @@ def check_output_with_option(self,
                 inference_out = inference_out.flatten()
 
             self.assertTrue(
-                np.allclose(
-                    paddle_out, inference_out, atol=atol),
+                np.allclose(paddle_out, inference_out, atol=atol),
                 "Output has diff between inference and training forward at {} ".
                 format(device))
 
-        # Check whether the trt results and the GPU results are the same. 
+        # Check whether the trt results and the GPU results are the same.
         if use_gpu and self.enable_trt:
             tensorrt_outputs = self._get_inference_outs(
-                self._get_analysis_config(
-                    use_gpu=use_gpu, use_trt=self.enable_trt))
+                self._get_analysis_config(use_gpu=use_gpu,
+                                          use_trt=self.enable_trt))
 
             if self.trt_parameters.use_static:
                 #deserialize
                 tensorrt_outputs = self._get_inference_outs(
-                    self._get_analysis_config(
-                        use_gpu=use_gpu, use_trt=self.enable_trt))
+                    self._get_analysis_config(use_gpu=use_gpu,
+                                              use_trt=self.enable_trt))
 
             self.assertTrue(
                 len(tensorrt_outputs) == len(paddle_outs),
@@ -322,15 +320,17 @@ def check_output_with_option(self,
                     tensorrt_output = tensorrt_output.flatten()
 
                 self.assertTrue(
-                    np.allclose(
-                        paddle_out, tensorrt_output, rtol=rtol, atol=atol),
+                    np.allclose(paddle_out,
+                                tensorrt_output,
+                                rtol=rtol,
+                                atol=atol),
                     "Output has diff between GPU and TensorRT. ")
 
-        # Check whether the mkldnn results and the CPU results are the same. 
+        # Check whether the mkldnn results and the CPU results are the same.
         if (not use_gpu) and self.enable_mkldnn:
             mkldnn_outputs = self._get_inference_outs(
-                self._get_analysis_config(
-                    use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn))
+                self._get_analysis_config(use_gpu=use_gpu,
+                                          use_mkldnn=self.enable_mkldnn))
 
             self.assertTrue(
                 len(paddle_outs) == len(mkldnn_outputs),
@@ -340,8 +340,7 @@ def check_output_with_option(self,
                 atol = 0.01
             for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs):
                 self.assertTrue(
-                    np.allclose(
-                        np.array(paddle_out), mkldnn_output, atol=atol),
+                    np.allclose(np.array(paddle_out), mkldnn_output, atol=atol),
                     "Output has diff between CPU and MKLDNN. ")
 
     class TensorRTParam:
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py
index a8c3009a5aea1..c24a90d5084dc 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py
@@ -26,53 +26,55 @@
 
 
 class TestAdaptivePool2dConvertGlobalPass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_config(self, draw):
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=4,
+                     max_size=4))
         pooling_type = draw(st.sampled_from(["max", "avg"]))
 
         data_format = "NCHW"  #trt support this format only
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
 
         paddings = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
 
         ceil_mode = draw(st.booleans())
         exclusive = draw(st.booleans())
         global_pooling = draw(st.booleans())
         padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VAILD"]))
 
-        pool_op = OpConfig(
-            "pool2d",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["pool_output"]},
-            ksize=[1, 1],
-            adaptive=True,
-            pooling_type=pooling_type,
-            data_format=data_format,
-            strides=strides,
-            paddings=paddings,
-            ceil_mode=ceil_mode,
-            global_pooling=global_pooling,
-            padding_algorithm=padding_algorithm,
-            exclusive=exclusive)
+        pool_op = OpConfig("pool2d",
+                           inputs={"X": ["input_data"]},
+                           outputs={"Out": ["pool_output"]},
+                           ksize=[1, 1],
+                           adaptive=True,
+                           pooling_type=pooling_type,
+                           data_format=data_format,
+                           strides=strides,
+                           paddings=paddings,
+                           ceil_mode=ceil_mode,
+                           global_pooling=global_pooling,
+                           padding_algorithm=padding_algorithm,
+                           exclusive=exclusive)
         ops = [pool_op]
 
-        program_config = ProgramConfig(
-            ops=ops,
-            weights={},
-            inputs={"input_data": TensorConfig(shape=x_shape), },
-            outputs=["pool_output"])
+        program_config = ProgramConfig(ops=ops,
+                                       weights={},
+                                       inputs={
+                                           "input_data":
+                                           TensorConfig(shape=x_shape),
+                                       },
+                                       outputs=["pool_output"])
 
         return program_config
 
@@ -88,11 +90,10 @@ def sample_predictor_configs(self, program_config):
         yield config, ['pool2d'], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=300,
-            passes=["adaptive_pool2d_convert_global_pass"],
-            min_success_num=40)
+        self.run_and_statis(quant=False,
+                            max_examples=300,
+                            passes=["adaptive_pool2d_convert_global_pass"],
+                            min_success_num=40)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py
index d029bcd6a7f17..1516d1dafd32f 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py
@@ -76,9 +76,9 @@ def is_program_valid(self, prog_config):
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of conv2d
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=5, max_value=100), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=5, max_value=100),
+                     min_size=4,
+                     max_size=4))
         x_shape[1] = draw(st.integers(min_value=5, max_value=10))
 
         # 2. Generate legal attr:data_format of conv2d
@@ -86,9 +86,9 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of conv2d
         f_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=4,
+                     max_size=4))
         if data_format == "NCHW":
             f_shape[1] = x_shape[1]
         else:
@@ -96,37 +96,35 @@ def sample_program_config(self, draw):
 
         # 4. Generate legal attr:strides of conv2d
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=2,
+                     max_size=2))
 
         # 5. Generate legal attr:padding_algorithm of conv2d
         padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
 
         # 6. Generate legal attr:padding of conv2d
         padding = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=4,
+                     max_size=4))
 
         # 7. Generate legal attr:groups of conv2d
         groups = draw(st.integers(min_value=1, max_value=3))
 
         # 8. Generate legal attr:dilations of conv2d
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=2,
+                     max_size=2))
 
         # 9. Generate legal input:ResidualData of conv2d
         res_shape = []
         if draw(st.booleans()):
             res_shape = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=100),
-                    min_size=4,
-                    max_size=4))
+                st.lists(st.integers(min_value=1, max_value=100),
+                         min_size=4,
+                         max_size=4))
 
         # 10. Generate legal shape of input:bias of conv2d
         conv_bias_shape = []
@@ -159,17 +157,16 @@ def sample_program_config(self, draw):
         act_type = draw(
             st.sampled_from(["relu", "leaky_relu", "relu6", "swish"]))
 
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs=inputs,
-            outputs={"Output": ["conv2d_out"]},
-            strides=strides,
-            padding_algorithm=padding_algorithm,
-            paddings=padding,
-            groups=groups,
-            dilations=dilations,
-            data_format=data_format,
-            use_mkldnn=True)
+        conv2d_op = OpConfig("conv2d",
+                             inputs=inputs,
+                             outputs={"Output": ["conv2d_out"]},
+                             strides=strides,
+                             padding_algorithm=padding_algorithm,
+                             paddings=padding,
+                             groups=groups,
+                             dilations=dilations,
+                             data_format=data_format,
+                             use_mkldnn=True)
 
         # 11. Generate legal attr of act
         act_op = None
@@ -177,33 +174,29 @@ def sample_program_config(self, draw):
         if act_type == "relu6":
             self.passes = ["conv_relu6_mkldnn_fuse_pass"]
             threshold = draw(st.floats(min_value=1.0, max_value=10.0))
-            act_op = OpConfig(
-                "relu6",
-                inputs={"X": ["conv2d_out"]},
-                outputs={"Out": ["relu_out"]},
-                threshold=threshold)
+            act_op = OpConfig("relu6",
+                              inputs={"X": ["conv2d_out"]},
+                              outputs={"Out": ["relu_out"]},
+                              threshold=threshold)
         if act_type == "leaky_relu":
             self.passes = ["conv_leaky_relu_mkldnn_fuse_pass"]
             alpha = draw(st.floats(min_value=0.1, max_value=1.0))
-            act_op = OpConfig(
-                "leaky_relu",
-                inputs={"X": ["conv2d_out"]},
-                outputs={"Out": ["relu_out"]},
-                alpha=alpha)
+            act_op = OpConfig("leaky_relu",
+                              inputs={"X": ["conv2d_out"]},
+                              outputs={"Out": ["relu_out"]},
+                              alpha=alpha)
         if act_type == "relu":
             self.passes = ["conv_relu_mkldnn_fuse_pass"]
-            act_op = OpConfig(
-                "relu",
-                inputs={"X": ["conv2d_out"]},
-                outputs={"Out": ["relu_out"]})
+            act_op = OpConfig("relu",
+                              inputs={"X": ["conv2d_out"]},
+                              outputs={"Out": ["relu_out"]})
         if act_type == "swish":
             self.passes = ["conv_swish_mkldnn_fuse_pass"]
             beta = draw(st.floats(min_value=0.1, max_value=1.0))
-            act_op = OpConfig(
-                "swish",
-                inputs={"X": ["conv2d_out"]},
-                outputs={"Out": ["swish_out"]},
-                beta=beta)
+            act_op = OpConfig("swish",
+                              inputs={"X": ["conv2d_out"]},
+                              outputs={"Out": ["swish_out"]},
+                              beta=beta)
 
         ops = [conv2d_op, act_op]
 
@@ -214,7 +207,8 @@ def sample_program_config(self, draw):
                 "input_x": TensorConfig(shape=x_shape),
                 "residualdata": TensorConfig(shape=res_shape)
             },
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
index a0213c5b1f4df..098cec7115932 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
@@ -76,9 +76,9 @@ def is_program_valid(self, prog_config):
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of conv2d
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=5, max_value=100), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=5, max_value=100),
+                     min_size=4,
+                     max_size=4))
         x_shape[1] = draw(st.integers(min_value=5, max_value=10))
 
         # 2. Generate legal attr:data_format of conv2d
@@ -86,9 +86,9 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of conv2d
         f_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=4,
+                     max_size=4))
         if data_format == "NCHW":
             f_shape[1] = x_shape[1]
         else:
@@ -96,27 +96,27 @@ def sample_program_config(self, draw):
 
         # 4. Generate legal attr:strides of conv2d
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
 
         # 5. Generate legal attr:padding_algorithm of conv2d
         padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
 
         # 6. Generate legal attr:padding of conv2d
         padding = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=4,
+                     max_size=4))
 
         # 7. Generate legal attr:groups of conv2d
         groups = draw(st.integers(min_value=1, max_value=3))
 
         # 8. Generate legal attr:dilations of conv2d
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
 
         # 9. Generate legal shape of input:bias of elementwise_add
         bias_shape = [f_shape[0]]
@@ -157,24 +157,24 @@ def sample_program_config(self, draw):
             }
             use_mkldnn = False
 
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs=inputs,
-            outputs={"Output": ["conv2d_out"]},
-            strides=strides,
-            padding_algorithm=padding_algorithm,
-            paddings=padding,
-            groups=groups,
-            dilations=dilations,
-            data_format=data_format,
-            use_mkldnn=use_mkldnn)
-
-        add_op = OpConfig(
-            "elementwise_add",
-            inputs={"X": ["conv2d_out"],
-                    "Y": ["bias"]},
-            outputs={"Out": ["add_out"]},
-            axis=axis)
+        conv2d_op = OpConfig("conv2d",
+                             inputs=inputs,
+                             outputs={"Output": ["conv2d_out"]},
+                             strides=strides,
+                             padding_algorithm=padding_algorithm,
+                             paddings=padding,
+                             groups=groups,
+                             dilations=dilations,
+                             data_format=data_format,
+                             use_mkldnn=use_mkldnn)
+
+        add_op = OpConfig("elementwise_add",
+                          inputs={
+                              "X": ["conv2d_out"],
+                              "Y": ["bias"]
+                          },
+                          outputs={"Out": ["add_out"]},
+                          axis=axis)
 
         ops = [conv2d_op, add_op]
 
@@ -186,10 +186,9 @@ def sample_program_config(self, draw):
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=350,
-            passes=["conv_bias_mkldnn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=350,
+                            passes=["conv_bias_mkldnn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
index 67e97b0a3752e..e23089e7895dd 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
@@ -26,10 +26,10 @@
 
 
 class TestConvBnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # mainly for TRT, which is invalid for current pass test framework!!
         if attrs[0]['data_format'] == "NHWC":
@@ -49,17 +49,17 @@ def sample_program_config(self, draw):
         out_channel = groups * out_channel_factor
         batch_size = draw(st.integers(min_value=1, max_value=4))
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=2),
+                     min_size=2,
+                     max_size=2))
         paddings = draw(
-            st.lists(
-                st.integers(
-                    min_value=0, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=0, max_value=2),
+                     min_size=2,
+                     max_size=2))
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=2),
+                     min_size=2,
+                     max_size=2))
         has_bias = draw(st.booleans())
         use_mkldnn = draw(st.booleans())
         epsilon = draw(st.floats(min_value=0.0, max_value=0.001))
@@ -94,43 +94,41 @@ def generate_bn_Mean():
         def generate_bn_Var():
             return np.random.random(var_shape).astype(np.float32)
 
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs={
-                "Input": ["conv2d_input"],
-                "Filter": ["conv2d_weight"],
-            },
-            outputs={"Output": ["conv2d_out"]},
-            data_format=data_format,
-            dilations=dilations,
-            padding_algorithm=padding_algorithm,
-            groups=groups,
-            paddings=paddings,
-            strides=strides,
-            use_mkldnn=use_mkldnn,
-            has_bias=has_bias,
-            is_test=True)
-        bn_op = OpConfig(
-            "batch_norm",
-            inputs={
-                "X": ["conv2d_out"],
-                "Scale": ["batch_norm_Scale"],
-                "Bias": ["batch_norm_Bias"],
-                "Mean": ["batch_norm_Mean"],
-                "Variance": ["batch_norm_Variance"],
-            },
-            outputs={
-                "Y": ["batch_norm_Y"],
-                "MeanOut": ["batch_norm_Mean"],
-                "VarianceOut": ["batch_norm_Variance"],
-                "SavedMean": ["batch_norm_SavedMean"],
-                "SavedVariance": ["batch_norm_SavedVariance"],
-                "ReserveSpace": ["batch_norm_ReserveSpace"],
-            },
-            epsilon=epsilon,
-            trainable_statistics=False,
-            data_layout=data_format,
-            is_test=True)
+        conv2d_op = OpConfig("conv2d",
+                             inputs={
+                                 "Input": ["conv2d_input"],
+                                 "Filter": ["conv2d_weight"],
+                             },
+                             outputs={"Output": ["conv2d_out"]},
+                             data_format=data_format,
+                             dilations=dilations,
+                             padding_algorithm=padding_algorithm,
+                             groups=groups,
+                             paddings=paddings,
+                             strides=strides,
+                             use_mkldnn=use_mkldnn,
+                             has_bias=has_bias,
+                             is_test=True)
+        bn_op = OpConfig("batch_norm",
+                         inputs={
+                             "X": ["conv2d_out"],
+                             "Scale": ["batch_norm_Scale"],
+                             "Bias": ["batch_norm_Bias"],
+                             "Mean": ["batch_norm_Mean"],
+                             "Variance": ["batch_norm_Variance"],
+                         },
+                         outputs={
+                             "Y": ["batch_norm_Y"],
+                             "MeanOut": ["batch_norm_Mean"],
+                             "VarianceOut": ["batch_norm_Variance"],
+                             "SavedMean": ["batch_norm_SavedMean"],
+                             "SavedVariance": ["batch_norm_SavedVariance"],
+                             "ReserveSpace": ["batch_norm_ReserveSpace"],
+                         },
+                         epsilon=epsilon,
+                         trainable_statistics=False,
+                         data_layout=data_format,
+                         is_test=True)
         if has_bias == True:
             conv2d_op.inputs["Bias"] = ["conv2d_bias"]
         ops = [conv2d_op, bn_op]
@@ -144,10 +142,14 @@ def generate_bn_Var():
             weights={
                 "conv2d_weight":
                 TensorConfig(data_gen=partial(generate_conv2d_Filter)),
-                "batch_norm_Scale": TensorConfig(data_gen=generate_bn_Scale),
-                "batch_norm_Bias": TensorConfig(data_gen=generate_bn_Bias),
-                "batch_norm_Mean": TensorConfig(data_gen=generate_bn_Mean),
-                "batch_norm_Variance": TensorConfig(data_gen=generate_bn_Var),
+                "batch_norm_Scale":
+                TensorConfig(data_gen=generate_bn_Scale),
+                "batch_norm_Bias":
+                TensorConfig(data_gen=generate_bn_Bias),
+                "batch_norm_Mean":
+                TensorConfig(data_gen=generate_bn_Mean),
+                "batch_norm_Variance":
+                TensorConfig(data_gen=generate_bn_Var),
             },
             outputs=["batch_norm_Y"])
         if has_bias == True:
@@ -182,6 +184,7 @@ def sample_predictor_configs(self, program_config):
                 yield config, ['conv2d_fusion'], (1e-5, 1e-5)
 
     def add_ignore_pass_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs[
                     'data_format'] == "NHWC" and not predictor_config.mkldnn_enabled(
@@ -191,8 +194,8 @@ def teller1(program_config, predictor_config):
 
         # mkldnn Output has diff with bias!
         def teller2(program_config, predictor_config):
-            return predictor_config.mkldnn_enabled() and program_config.ops[
-                0].attrs['has_bias'] == True
+            return predictor_config.mkldnn_enabled(
+            ) and program_config.ops[0].attrs['has_bias'] == True
 
         self.add_ignore_check_case(
             teller1, IgnoreReasons.PASS_ACCURACY_ERROR,
@@ -206,7 +209,8 @@ def teller2(program_config, predictor_config):
     def test(self):
         self.run_and_statis(
             quant=False,
-            passes=["conv_bn_fuse_pass"], )
+            passes=["conv_bn_fuse_pass"],
+        )
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
index 9dd41bd1c3939..56ce8f3ea3b6e 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
@@ -92,11 +92,9 @@ def sample_program_config(self, draw):
         while is_not_valid:
             # 1. Generate shape of input:X of conv2d
             x_shape = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=100),
-                    min_size=4,
-                    max_size=4))
+                st.lists(st.integers(min_value=1, max_value=100),
+                         min_size=4,
+                         max_size=4))
             x_shape[1] = draw(st.integers(min_value=1, max_value=10))
 
             # 2. Generate legal attr:data_format of conv2d
@@ -104,11 +102,9 @@ def sample_program_config(self, draw):
 
             # 3. Generate legal shape of input:Y of conv2d
             f_shape = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=7),
-                    min_size=4,
-                    max_size=4))
+                st.lists(st.integers(min_value=1, max_value=7),
+                         min_size=4,
+                         max_size=4))
             if data_format == "NCHW":
                 f_shape[1] = x_shape[1]
             else:
@@ -116,11 +112,9 @@ def sample_program_config(self, draw):
 
             # 4. Generate legal attr:strides of conv2d
             strides = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=5),
-                    min_size=2,
-                    max_size=2))
+                st.lists(st.integers(min_value=1, max_value=5),
+                         min_size=2,
+                         max_size=2))
 
             # 5. Generate legal attr:padding_algorithm of conv2d
             padding_algorithm = draw(
@@ -128,22 +122,18 @@ def sample_program_config(self, draw):
 
             # 6. Generate legal attr:padding of conv2d
             padding = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=5),
-                    min_size=4,
-                    max_size=4))
+                st.lists(st.integers(min_value=1, max_value=5),
+                         min_size=4,
+                         max_size=4))
 
             # 7. Generate legal attr:groups of conv2d
             groups = draw(st.integers(min_value=1, max_value=3))
 
             # 8. Generate legal attr:dilations of conv2d
             dilations = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=5),
-                    min_size=2,
-                    max_size=2))
+                st.lists(st.integers(min_value=1, max_value=5),
+                         min_size=2,
+                         max_size=2))
 
             # 9. Generate legal elemntwise_add: X of conv2d
             bias_2_dict = dict()
@@ -179,35 +169,37 @@ def sample_program_config(self, draw):
             # 12. Generate legal attr:axis of elementwise_add_2
             axis_2 = -1
 
-            conv2d_op = OpConfig(
-                "conv2d",
-                inputs={"Input": ["input_x"],
-                        "Filter": ["filter"]},
-                outputs={"Output": ["conv2d_out"]},
-                strides=strides,
-                padding_algorithm=padding_algorithm,
-                paddings=padding,
-                groups=groups,
-                dilations=dilations,
-                data_format=data_format)
-            add_1_op = OpConfig(
-                "elementwise_add",
-                inputs={"X": ["conv2d_out"],
-                        "Y": ["bias_1"]},
-                outputs={"Out": ["add_1_out"]},
-                axis=axis_1)
-
-            add_2_op = OpConfig(
-                "elementwise_add",
-                inputs={"X": ["bias_2"],
-                        "Y": ["add_1_out"]},
-                outputs={"Out": ["add_out"]},
-                axis=axis_2)
-
-            relu_op = OpConfig(
-                "relu",
-                inputs={"X": ["add_out"]},
-                outputs={"Out": ["relu_out"]})
+            conv2d_op = OpConfig("conv2d",
+                                 inputs={
+                                     "Input": ["input_x"],
+                                     "Filter": ["filter"]
+                                 },
+                                 outputs={"Output": ["conv2d_out"]},
+                                 strides=strides,
+                                 padding_algorithm=padding_algorithm,
+                                 paddings=padding,
+                                 groups=groups,
+                                 dilations=dilations,
+                                 data_format=data_format)
+            add_1_op = OpConfig("elementwise_add",
+                                inputs={
+                                    "X": ["conv2d_out"],
+                                    "Y": ["bias_1"]
+                                },
+                                outputs={"Out": ["add_1_out"]},
+                                axis=axis_1)
+
+            add_2_op = OpConfig("elementwise_add",
+                                inputs={
+                                    "X": ["bias_2"],
+                                    "Y": ["add_1_out"]
+                                },
+                                outputs={"Out": ["add_out"]},
+                                axis=axis_2)
+
+            relu_op = OpConfig("relu",
+                               inputs={"X": ["add_out"]},
+                               outputs={"Out": ["relu_out"]})
 
             ops = [conv2d_op, add_1_op, add_2_op, relu_op]
 
@@ -221,14 +213,14 @@ def sample_program_config(self, draw):
                     "input_x": TensorConfig(shape=x_shape),
                     "bias_2": TensorConfig(shape=bias_2_shape)
                 },
-                outputs=ops[-1].outputs["Out"], )
+                outputs=ops[-1].outputs["Out"],
+            )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=300,
-            passes=["conv_elementwise_add2_act_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=300,
+                            passes=["conv_elementwise_add2_act_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
index 0d93ae9a7d2bb..f1d2192a4c769 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
@@ -81,9 +81,9 @@ def is_program_valid(self, prog_config):
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of conv2d
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=100), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=100),
+                     min_size=4,
+                     max_size=4))
         x_shape[1] = draw(st.integers(min_value=1, max_value=10))
 
         # 2. Generate legal attr:data_format of conv2d
@@ -91,9 +91,9 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of conv2d
         f_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=7), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=7),
+                     min_size=4,
+                     max_size=4))
         if data_format == "NCHW":
             f_shape[1] = x_shape[1]
         else:
@@ -101,37 +101,35 @@ def sample_program_config(self, draw):
 
         # 4. Generate legal attr:strides of conv2d
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=2,
+                     max_size=2))
 
         # 5. Generate legal attr:padding_algorithm of conv2d
         padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
 
         # 6. Generate legal attr:padding of conv2d
         padding = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=4,
+                     max_size=4))
 
         # 7. Generate legal attr:groups of conv2d
         groups = draw(st.integers(min_value=1, max_value=3))
 
         # 8. Generate legal attr:dilations of conv2d
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=2,
+                     max_size=2))
 
         # 9. Generate legal input:ResidualData of conv2d
         res_shape = []
         if draw(st.booleans()):
             res_shape = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=100),
-                    min_size=4,
-                    max_size=4))
+                st.lists(st.integers(min_value=1, max_value=100),
+                         min_size=4,
+                         max_size=4))
 
         # 10. Generate legal shape of input:bias of elementwise_add
         bias_shape = [f_shape[0]]
@@ -139,29 +137,30 @@ def sample_program_config(self, draw):
         # 11. Generate legal attr:axis of elementwise_add
         axis = 1
 
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs={
-                "Input": ["input_x"],
-                "Filter": ["filter"],
-                "ResidualData": ["residualdata"]
-            },
-            outputs={"Output": ["conv2d_out"]},
-            strides=strides,
-            padding_algorithm=padding_algorithm,
-            paddings=padding,
-            groups=groups,
-            dilations=dilations,
-            data_format=data_format)
-        add_op = OpConfig(
-            "elementwise_add",
-            inputs={"X": ["conv2d_out"],
-                    "Y": ["bias"]},
-            outputs={"Out": ["add_out"]},
-            axis=axis)
-
-        relu_op = OpConfig(
-            "relu", inputs={"X": ["add_out"]}, outputs={"Out": ["relu_out"]})
+        conv2d_op = OpConfig("conv2d",
+                             inputs={
+                                 "Input": ["input_x"],
+                                 "Filter": ["filter"],
+                                 "ResidualData": ["residualdata"]
+                             },
+                             outputs={"Output": ["conv2d_out"]},
+                             strides=strides,
+                             padding_algorithm=padding_algorithm,
+                             paddings=padding,
+                             groups=groups,
+                             dilations=dilations,
+                             data_format=data_format)
+        add_op = OpConfig("elementwise_add",
+                          inputs={
+                              "X": ["conv2d_out"],
+                              "Y": ["bias"]
+                          },
+                          outputs={"Out": ["add_out"]},
+                          axis=axis)
+
+        relu_op = OpConfig("relu",
+                           inputs={"X": ["add_out"]},
+                           outputs={"Out": ["relu_out"]})
 
         ops = [conv2d_op, add_op, relu_op]
 
@@ -175,14 +174,14 @@ def sample_program_config(self, draw):
                 "input_x": TensorConfig(shape=x_shape),
                 "residualdata": TensorConfig(shape=res_shape)
             },
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=400,
-            passes=["conv_elementwise_add_act_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=400,
+                            passes=["conv_elementwise_add_act_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
index 0bcee474d1394..5b33a18af8ef4 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
@@ -26,10 +26,10 @@
 
 
 class TestConvEltwiseAddFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['data_format'] == "NHWC" and attrs[1]['axis'] != 3:
@@ -49,17 +49,17 @@ def sample_program_config(self, draw):
         out_channel = groups * out_channel_factor
         batch_size = draw(st.integers(min_value=1, max_value=4))
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=2),
+                     min_size=2,
+                     max_size=2))
         paddings = draw(
-            st.lists(
-                st.integers(
-                    min_value=0, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=0, max_value=2),
+                     min_size=2,
+                     max_size=2))
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=2),
+                     min_size=2,
+                     max_size=2))
 
         x_shape = [
             batch_size, in_channel, 64, 64
@@ -80,26 +80,26 @@ def generate_bias():
         def generate_scale_bias():
             return np.random.random(bias_shape).astype(np.float32)
 
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs={
-                "Input": ["input_data"],
-                "Filter": ["conv2d_weight"],
-            },
-            outputs={"Output": ["conv_output"]},
-            data_format=data_format,
-            dilations=dilations,
-            padding_algorithm=padding_algorithm,
-            groups=groups,
-            paddings=paddings,
-            strides=strides,
-            is_test=True)
-        eltwise_op = OpConfig(
-            "elementwise_add",
-            inputs={"X": ["conv_output"],
-                    "Y": ["conv2d_bias"]},
-            outputs={"Out": ["elementwise_output"]},
-            axis=axis)
+        conv2d_op = OpConfig("conv2d",
+                             inputs={
+                                 "Input": ["input_data"],
+                                 "Filter": ["conv2d_weight"],
+                             },
+                             outputs={"Output": ["conv_output"]},
+                             data_format=data_format,
+                             dilations=dilations,
+                             padding_algorithm=padding_algorithm,
+                             groups=groups,
+                             paddings=paddings,
+                             strides=strides,
+                             is_test=True)
+        eltwise_op = OpConfig("elementwise_add",
+                              inputs={
+                                  "X": ["conv_output"],
+                                  "Y": ["conv2d_bias"]
+                              },
+                              outputs={"Out": ["elementwise_output"]},
+                              axis=axis)
         ops = [conv2d_op, eltwise_op]
 
         program_config = ProgramConfig(
@@ -132,7 +132,7 @@ def sample_predictor_configs(self, program_config):
         yield config, ['conv2d_fusion'], (1e-4, 1e-4)
 
     def add_ignore_pass_case(self):
-        # If the problem has been fixed, the judgment 
+        # If the problem has been fixed, the judgment
         # in is_program_valid needs to be deleted!!!
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs['data_format'] == "NHWC":
@@ -149,7 +149,8 @@ def teller1(program_config, predictor_config):
     def test(self):
         self.run_and_statis(
             quant=False,
-            passes=["conv_elementwise_add_fuse_pass"], )
+            passes=["conv_elementwise_add_fuse_pass"],
+        )
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py
index c8319a5f3d772..4463f95437186 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py
@@ -107,11 +107,9 @@ def is_program_valid(self, prog_config):
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of conv2d
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=10, max_value=100),
-                min_size=4,
-                max_size=4))
+            st.lists(st.integers(min_value=10, max_value=100),
+                     min_size=4,
+                     max_size=4))
         x_shape[1] = draw(st.integers(min_value=1, max_value=10))
 
         # 2. Generate legal attr:data_format of conv2d
@@ -119,9 +117,9 @@ def sample_program_config(self, draw):
 
         # 2. Generate legal shape of input:Y of conv2d
         f_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=7), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=7),
+                     min_size=4,
+                     max_size=4))
         if data_format == "NCHW":
             f_shape[1] = x_shape[1]
         else:
@@ -129,37 +127,35 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal attr:strides of conv2d
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=2,
+                     max_size=2))
 
         # 4. Generate legal attr:padding_algorithm of conv2d
         padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
 
         # 5. Generate legal attr:padding of conv2d
         padding = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=4,
+                     max_size=4))
 
         # 6. Generate legal attr:groups of conv2d
         groups = draw(st.integers(min_value=1, max_value=3))
 
         # 7. Generate legal attr:dilations of conv2d
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=5), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=5),
+                     min_size=2,
+                     max_size=2))
 
         # 9. Generate legal input:ResidualData of conv2d
         res_shape = []
         if draw(st.booleans()):
             res_shape = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=100),
-                    min_size=4,
-                    max_size=4))
+                st.lists(st.integers(min_value=1, max_value=100),
+                         min_size=4,
+                         max_size=4))
 
         # 10. Generate legal shape of input:bias of elementwise_add
         bias_shape = [f_shape[0]]
@@ -183,51 +179,51 @@ def sample_program_config(self, draw):
         epsilon = draw(st.floats(min_value=0.00001, max_value=0.001))
 
         def generate_batch_variance():
-            return (0.1 + (1.0 - 0.1) * np.random.random(bn_variance_shape)
-                    ).astype(np.float32)
-
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs={
-                "Input": ["input_x"],
-                "Filter": ["filter"],
-                "ResidualData": ["residualdata"]
-            },
-            outputs={"Output": ["conv2d_out"]},
-            strides=strides,
-            padding_algorithm=padding_algorithm,
-            paddings=padding,
-            groups=groups,
-            dilations=dilations,
-            data_format=data_format)
-        add_op = OpConfig(
-            "elementwise_add",
-            inputs={"X": ["conv2d_out"],
-                    "Y": ["bias"]},
-            outputs={"Out": ["add_out"]},
-            axis=axis)
-
-        bn_op = OpConfig(
-            "batch_norm",
-            inputs={
-                "X": ["add_out"],
-                "Scale": ["scale_in"],
-                "Bias": ["bias_in"],
-                "Mean": ["mean_in"],
-                "Variance": ["variance_in"]
-            },
-            outputs={
-                "Y": ["y_out"],
-                "MeanOut": ["mean_in"],
-                "VarianceOut": ["variance_in"],
-                "SavedMean": ["SavedMean_out"],
-                "SavedVariance": ["SavedVariance_out"],
-                "ReserveSpace": ["ReserveSpace_out"]
-            },
-            epsilon=epsilon,
-            is_test=True,
-            trainable_statistics=False,
-            data_layout=data_format)
+            return (0.1 +
+                    (1.0 - 0.1) * np.random.random(bn_variance_shape)).astype(
+                        np.float32)
+
+        conv2d_op = OpConfig("conv2d",
+                             inputs={
+                                 "Input": ["input_x"],
+                                 "Filter": ["filter"],
+                                 "ResidualData": ["residualdata"]
+                             },
+                             outputs={"Output": ["conv2d_out"]},
+                             strides=strides,
+                             padding_algorithm=padding_algorithm,
+                             paddings=padding,
+                             groups=groups,
+                             dilations=dilations,
+                             data_format=data_format)
+        add_op = OpConfig("elementwise_add",
+                          inputs={
+                              "X": ["conv2d_out"],
+                              "Y": ["bias"]
+                          },
+                          outputs={"Out": ["add_out"]},
+                          axis=axis)
+
+        bn_op = OpConfig("batch_norm",
+                         inputs={
+                             "X": ["add_out"],
+                             "Scale": ["scale_in"],
+                             "Bias": ["bias_in"],
+                             "Mean": ["mean_in"],
+                             "Variance": ["variance_in"]
+                         },
+                         outputs={
+                             "Y": ["y_out"],
+                             "MeanOut": ["mean_in"],
+                             "VarianceOut": ["variance_in"],
+                             "SavedMean": ["SavedMean_out"],
+                             "SavedVariance": ["SavedVariance_out"],
+                             "ReserveSpace": ["ReserveSpace_out"]
+                         },
+                         epsilon=epsilon,
+                         is_test=True,
+                         trainable_statistics=False,
+                         data_layout=data_format)
 
         ops = [conv2d_op, add_op, bn_op]
 
@@ -255,10 +251,9 @@ def generate_batch_variance():
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=300,
-            passes=["conv_eltwiseadd_bn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=300,
+                            passes=["conv_eltwiseadd_bn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py
index 62515fc2177b8..6ecfa50d65393 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -40,11 +40,10 @@ class TestConvTransposeBnFusePass(PassAutoScanTest):
     '''
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=150,
-            max_duration=250,
-            passes=["conv_transpose_bn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=150,
+                            max_duration=250,
+                            passes=["conv_transpose_bn_fuse_pass"])
 
     def sample_program_config(self, draw):
         # generate random number
@@ -54,26 +53,26 @@ def sample_program_config(self, draw):
         random_input_dim2 = draw(st.integers(min_value=20, max_value=50))
         random_groups = draw(st.integers(min_value=1, max_value=2))
         random_dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=3), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=3),
+                     min_size=2,
+                     max_size=2))
         random_strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_paddings = draw(
-            st.lists(
-                st.integers(
-                    min_value=0, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=0, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_padding_algorithm = draw(
             st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
         random_data_layout = draw(st.sampled_from(["NCHW", "NHWC"]))
         random_use_mkldnn = draw(st.booleans())
         random_output_size = []
         random_filter = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_out_channel = draw(st.integers(min_value=10, max_value=25))
         random_epsilon = draw(st.floats(min_value=0.0, max_value=0.001))
 
@@ -94,24 +93,24 @@ def generate_conv2d_Filter():
             return np.random.random(shape).astype(np.float32)
 
         def generate_batch_norm_Scale():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         def generate_batch_norm_Bias():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         def generate_batch_norm_Mean():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         def generate_batch_norm_Variance():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         # define op
         conv2d_op = OpConfig(
@@ -121,7 +120,9 @@ def generate_batch_norm_Variance():
                 "Filter": ["conv2d_Filter"],
                 #"Bias": ["conv2d_Bias"],
             },
-            outputs={"Output": ["conv2d_Out"], },
+            outputs={
+                "Output": ["conv2d_Out"],
+            },
             attrs={
                 'groups': random_groups,
                 'dilations': random_dilations,
@@ -135,30 +136,31 @@ def generate_batch_norm_Variance():
                 'is_test': True,
             })
 
-        batch_norm_op = OpConfig(
-            type="batch_norm",
-            inputs={
-                "X": ["conv2d_Out"],
-                "Scale": ["batch_norm_Scale"],
-                "Bias": ["batch_norm_Bias"],
-                "Mean": ["batch_norm_Mean"],
-                "Variance": ["batch_norm_Variance"],
-            },
-            outputs={
-                "Y": ["batch_norm_Y"],
-                "MeanOut": ["batch_norm_Mean"],
-                "VarianceOut": ["batch_norm_Variance"],
-                "SavedMean": ["batch_norm_SavedMean"],
-                "SavedVariance": ["batch_norm_SavedVariance"],
-                "ReserveSpace": ["batch_norm_ReserveSpace"],
-            },
-            attrs={
-                'epsilon': random_epsilon,
-                'is_test': True,
-                'trainable_statistics': False,
-                'data_layout': random_data_layout,
-                'use_mkldnn': random_use_mkldnn,
-            })
+        batch_norm_op = OpConfig(type="batch_norm",
+                                 inputs={
+                                     "X": ["conv2d_Out"],
+                                     "Scale": ["batch_norm_Scale"],
+                                     "Bias": ["batch_norm_Bias"],
+                                     "Mean": ["batch_norm_Mean"],
+                                     "Variance": ["batch_norm_Variance"],
+                                 },
+                                 outputs={
+                                     "Y": ["batch_norm_Y"],
+                                     "MeanOut": ["batch_norm_Mean"],
+                                     "VarianceOut": ["batch_norm_Variance"],
+                                     "SavedMean": ["batch_norm_SavedMean"],
+                                     "SavedVariance":
+                                     ["batch_norm_SavedVariance"],
+                                     "ReserveSpace":
+                                     ["batch_norm_ReserveSpace"],
+                                 },
+                                 attrs={
+                                     'epsilon': random_epsilon,
+                                     'is_test': True,
+                                     'trainable_statistics': False,
+                                     'data_layout': random_data_layout,
+                                     'use_mkldnn': random_use_mkldnn,
+                                 })
 
         # define model_net
         model_net = [conv2d_op, batch_norm_op]
@@ -169,7 +171,8 @@ def generate_batch_norm_Variance():
                 "conv2d_Input": TensorConfig(data_gen=generate_conv2d_Input),
             },
             weights={
-                "conv2d_Filter": TensorConfig(data_gen=generate_conv2d_Filter),
+                "conv2d_Filter":
+                TensorConfig(data_gen=generate_conv2d_Filter),
                 "batch_norm_Scale":
                 TensorConfig(data_gen=generate_batch_norm_Scale),
                 "batch_norm_Bias":
@@ -195,8 +198,7 @@ def sample_predictor_configs(self, program_config):
 
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['data_format'] == "NHWC":
@@ -205,6 +207,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def add_ignore_pass_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs['data_format'] == "NHWC":
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py
index 58ae05183a424..29099b9a7a549 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -44,11 +44,10 @@ class TestConvTransposeEltwiseaddBnFusePass(PassAutoScanTest):
     '''
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=150,
-            max_duration=250,
-            passes=["conv_transpose_eltwiseadd_bn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=150,
+                            max_duration=250,
+                            passes=["conv_transpose_eltwiseadd_bn_fuse_pass"])
 
     def sample_program_config(self, draw):
         # generate random number
@@ -58,26 +57,26 @@ def sample_program_config(self, draw):
         random_input_dim2 = draw(st.integers(min_value=20, max_value=50))
         random_groups = draw(st.integers(min_value=1, max_value=2))
         random_dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=3), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=3),
+                     min_size=2,
+                     max_size=2))
         random_strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_paddings = draw(
-            st.lists(
-                st.integers(
-                    min_value=0, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=0, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_padding_algorithm = draw(
             st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
         random_data_layout = draw(st.sampled_from(["NCHW", "NHWC"]))
         random_use_mkldnn = draw(st.booleans())
         random_output_size = []
         random_filter = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_out_channel = draw(st.integers(min_value=20, max_value=25))
         random_epsilon = draw(st.floats(min_value=0.0, max_value=0.001))
 
@@ -98,84 +97,89 @@ def generate_conv2d_Filter():
             return np.random.random(shape).astype(np.float32)
 
         def generate_elementwise_add_Y():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         def generate_batch_norm_Scale():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         def generate_batch_norm_Bias():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         def generate_batch_norm_Mean():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         def generate_batch_norm_Variance():
-            return np.random.random(
-                [random_out_channel * random_groups * random_groups]).astype(
-                    np.float32)
+            return np.random.random([
+                random_out_channel * random_groups * random_groups
+            ]).astype(np.float32)
 
         # define op
-        conv2d_op = OpConfig(
-            type="conv2d_transpose",
-            inputs={
-                "Input": ["conv2d_Input"],
-                "Filter": ["conv2d_Filter"],
-            },
-            outputs={"Output": ["conv2d_Out"], },
-            attrs={
-                'groups': random_groups,
-                'dilations': random_dilations,
-                'strides': random_strides,
-                'paddings': random_paddings,
-                'padding_algorithm': random_padding_algorithm,
-                'data_format': random_data_layout,
-                'output_size': random_output_size,
-                'output_padding': random_output_size,
-                'use_mkldnn': random_use_mkldnn,
-                'is_test': True,
-            })
-
-        elementwise_op = OpConfig(
-            type="elementwise_add",
-            inputs={
-                "X": ["conv2d_Out"],
-                "Y": ["elementwise_add_Y"],
-            },
-            outputs={"Out": ["elementwise_add_Out"], },
-            attrs={'axis': 1, })
-
-        batch_norm_op = OpConfig(
-            type="batch_norm",
-            inputs={
-                "X": ["elementwise_add_Out"],
-                "Scale": ["batch_norm_Scale"],
-                "Bias": ["batch_norm_Bias"],
-                "Mean": ["batch_norm_Mean"],
-                "Variance": ["batch_norm_Variance"],
-            },
-            outputs={
-                "Y": ["batch_norm_Y"],
-                "MeanOut": ["batch_norm_Mean"],
-                "VarianceOut": ["batch_norm_Variance"],
-                "SavedMean": ["batch_norm_SavedMean"],
-                "SavedVariance": ["batch_norm_SavedVariance"],
-                "ReserveSpace": ["batch_norm_ReserveSpace"],
-            },
-            attrs={
-                'epsilon': random_epsilon,
-                'is_test': True,
-                'trainable_statistics': False,
-                'data_layout': random_data_layout,
-                'use_mkldnn': random_use_mkldnn,
-            })
+        conv2d_op = OpConfig(type="conv2d_transpose",
+                             inputs={
+                                 "Input": ["conv2d_Input"],
+                                 "Filter": ["conv2d_Filter"],
+                             },
+                             outputs={
+                                 "Output": ["conv2d_Out"],
+                             },
+                             attrs={
+                                 'groups': random_groups,
+                                 'dilations': random_dilations,
+                                 'strides': random_strides,
+                                 'paddings': random_paddings,
+                                 'padding_algorithm': random_padding_algorithm,
+                                 'data_format': random_data_layout,
+                                 'output_size': random_output_size,
+                                 'output_padding': random_output_size,
+                                 'use_mkldnn': random_use_mkldnn,
+                                 'is_test': True,
+                             })
+
+        elementwise_op = OpConfig(type="elementwise_add",
+                                  inputs={
+                                      "X": ["conv2d_Out"],
+                                      "Y": ["elementwise_add_Y"],
+                                  },
+                                  outputs={
+                                      "Out": ["elementwise_add_Out"],
+                                  },
+                                  attrs={
+                                      'axis': 1,
+                                  })
+
+        batch_norm_op = OpConfig(type="batch_norm",
+                                 inputs={
+                                     "X": ["elementwise_add_Out"],
+                                     "Scale": ["batch_norm_Scale"],
+                                     "Bias": ["batch_norm_Bias"],
+                                     "Mean": ["batch_norm_Mean"],
+                                     "Variance": ["batch_norm_Variance"],
+                                 },
+                                 outputs={
+                                     "Y": ["batch_norm_Y"],
+                                     "MeanOut": ["batch_norm_Mean"],
+                                     "VarianceOut": ["batch_norm_Variance"],
+                                     "SavedMean": ["batch_norm_SavedMean"],
+                                     "SavedVariance":
+                                     ["batch_norm_SavedVariance"],
+                                     "ReserveSpace":
+                                     ["batch_norm_ReserveSpace"],
+                                 },
+                                 attrs={
+                                     'epsilon': random_epsilon,
+                                     'is_test': True,
+                                     'trainable_statistics': False,
+                                     'data_layout': random_data_layout,
+                                     'use_mkldnn': random_use_mkldnn,
+                                 })
 
         # define model_net
         model_net = [conv2d_op, elementwise_op, batch_norm_op]
@@ -187,7 +191,8 @@ def generate_batch_norm_Variance():
                 "conv2d_Input": TensorConfig(data_gen=generate_conv2d_Input),
             },
             weights={
-                "conv2d_Filter": TensorConfig(data_gen=generate_conv2d_Filter),
+                "conv2d_Filter":
+                TensorConfig(data_gen=generate_conv2d_Filter),
                 "elementwise_add_Y":
                 TensorConfig(data_gen=generate_elementwise_add_Y),
                 "batch_norm_Scale":
@@ -215,8 +220,7 @@ def sample_predictor_configs(self, program_config):
 
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['data_format'] == "NHWC":
@@ -225,6 +229,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def add_ignore_pass_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs['data_format'] == "NHWC":
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py
index 7379a8d333b67..8001c76816e65 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py
@@ -57,8 +57,8 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         if program_config.ops[3].attrs['axis'] not in [-1, 2]:
             return False
 
-        if not (program_config.ops[5].attrs['epsilon'] >= 0 and
-                program_config.ops[5].attrs['epsilon'] <= 0.001):
+        if not (program_config.ops[5].attrs['epsilon'] >= 0
+                and program_config.ops[5].attrs['epsilon'] <= 0.001):
             return False
 
         if program_config.ops[5].attrs['begin_norm_axis'] != 2:
@@ -86,17 +86,17 @@ def sample_program_config(self, draw):
 
         def generate_input(attrs):
             if attrs[0]['op_type'] == 'lookup_table':
-                return np.random.randint(
-                    0,
-                    attrs[3]['weight_size'][0],
-                    size=(attrs[3]['batch_size'], attrs[3]['input_dim'],
-                          1)).astype(np.int64)
+                return np.random.randint(0,
+                                         attrs[3]['weight_size'][0],
+                                         size=(attrs[3]['batch_size'],
+                                               attrs[3]['input_dim'],
+                                               1)).astype(np.int64)
             else:
-                return np.random.randint(
-                    0,
-                    attrs[3]['weight_size'][0],
-                    size=(attrs[3]['batch_size'],
-                          attrs[3]['input_dim'])).astype(np.int64)
+                return np.random.randint(0,
+                                         attrs[3]['weight_size'][0],
+                                         size=(attrs[3]['batch_size'],
+                                               attrs[3]['input_dim'])).astype(
+                                                   np.int64)
 
         def generate_weight1(attrs):
             # set embedding weight by attrs
@@ -105,9 +105,9 @@ def generate_weight1(attrs):
         def generate_weight2(attrs):
             # set layernorm weight by attrs
             if attrs[2]['begin_norm_axis'] == 1:
-                return np.random.random(
-                    attrs[3]['input_dim'] *
-                    attrs[3]['weight_size'][1]).astype(np.float32)
+                return np.random.random(attrs[3]['input_dim'] *
+                                        attrs[3]['weight_size'][1]).astype(
+                                            np.float32)
             else:
                 return np.random.random(attrs[3]['weight_size'][1]).astype(
                     np.float32)
@@ -128,68 +128,69 @@ def generate_weight2(attrs):
             'weight_size': weight_size
         }]
 
-        emb_op1 = OpConfig(
-            type=attrs[0]['op_type'],
-            inputs={"Ids": ["input_data1"],
-                    "W": ["embedding_weight1"]},
-            outputs={"Out": ["embedding_output1"]},
-            attrs={
-                'is_sparse': attrs[0]['is_sparse'],
-                'is_distributed': attrs[0]['is_distributed'],
-                'padding_idx': attrs[0]['padding_idx']
-            })
-        emb_op2 = OpConfig(
-            type=attrs[0]['op_type'],
-            inputs={"Ids": ["input_data2"],
-                    "W": ["embedding_weight2"]},
-            outputs={"Out": ["embedding_output2"]},
-            attrs={
-                'is_sparse': attrs[0]['is_sparse'],
-                'is_distributed': attrs[0]['is_distributed'],
-                'padding_idx': attrs[0]['padding_idx']
-            })
-        emb_op3 = OpConfig(
-            type=attrs[0]['op_type'],
-            inputs={"Ids": ["input_data3"],
-                    "W": ["embedding_weight3"]},
-            outputs={"Out": ["embedding_output3"]},
-            attrs={
-                'is_sparse': attrs[0]['is_sparse'],
-                'is_distributed': attrs[0]['is_distributed'],
-                'padding_idx': attrs[0]['padding_idx']
-            })
-        add_op1 = OpConfig(
-            type='elementwise_add',
-            inputs={
-                "X": [emb_op2.outputs["Out"][0]],
-                "Y": [emb_op3.outputs["Out"][0]],
-            },
-            outputs={"Out": ["elementwise_add_output1"]},
-            attrs={"axis": attrs[1]['axis']})
-        add_op2 = OpConfig(
-            type='elementwise_add',
-            inputs={
-                "X": [add_op1.outputs["Out"][0]],
-                "Y": [emb_op1.outputs["Out"][0]],
-            },
-            outputs={"Out": ["elementwise_add_output2"]},
-            attrs={"axis": attrs[1]['axis']})
-        layer_norm_op = OpConfig(
-            type='layer_norm',
-            inputs={
-                "X": [add_op2.outputs["Out"][0]],
-                "Bias": ["layer_norm_bias"],
-                "Scale": ["layer_norm_scale"]
-            },
-            outputs={
-                "Y": ["layer_norm_output1"],
-                "Mean": ["layer_norm_output2"],
-                "Variance": ["layer_norm_output3"]
-            },
-            attrs={
-                'begin_norm_axis': attrs[2]['begin_norm_axis'],
-                'epsilon': attrs[2]['epsilon']
-            })
+        emb_op1 = OpConfig(type=attrs[0]['op_type'],
+                           inputs={
+                               "Ids": ["input_data1"],
+                               "W": ["embedding_weight1"]
+                           },
+                           outputs={"Out": ["embedding_output1"]},
+                           attrs={
+                               'is_sparse': attrs[0]['is_sparse'],
+                               'is_distributed': attrs[0]['is_distributed'],
+                               'padding_idx': attrs[0]['padding_idx']
+                           })
+        emb_op2 = OpConfig(type=attrs[0]['op_type'],
+                           inputs={
+                               "Ids": ["input_data2"],
+                               "W": ["embedding_weight2"]
+                           },
+                           outputs={"Out": ["embedding_output2"]},
+                           attrs={
+                               'is_sparse': attrs[0]['is_sparse'],
+                               'is_distributed': attrs[0]['is_distributed'],
+                               'padding_idx': attrs[0]['padding_idx']
+                           })
+        emb_op3 = OpConfig(type=attrs[0]['op_type'],
+                           inputs={
+                               "Ids": ["input_data3"],
+                               "W": ["embedding_weight3"]
+                           },
+                           outputs={"Out": ["embedding_output3"]},
+                           attrs={
+                               'is_sparse': attrs[0]['is_sparse'],
+                               'is_distributed': attrs[0]['is_distributed'],
+                               'padding_idx': attrs[0]['padding_idx']
+                           })
+        add_op1 = OpConfig(type='elementwise_add',
+                           inputs={
+                               "X": [emb_op2.outputs["Out"][0]],
+                               "Y": [emb_op3.outputs["Out"][0]],
+                           },
+                           outputs={"Out": ["elementwise_add_output1"]},
+                           attrs={"axis": attrs[1]['axis']})
+        add_op2 = OpConfig(type='elementwise_add',
+                           inputs={
+                               "X": [add_op1.outputs["Out"][0]],
+                               "Y": [emb_op1.outputs["Out"][0]],
+                           },
+                           outputs={"Out": ["elementwise_add_output2"]},
+                           attrs={"axis": attrs[1]['axis']})
+        layer_norm_op = OpConfig(type='layer_norm',
+                                 inputs={
+                                     "X": [add_op2.outputs["Out"][0]],
+                                     "Bias": ["layer_norm_bias"],
+                                     "Scale": ["layer_norm_scale"]
+                                 },
+                                 outputs={
+                                     "Y": ["layer_norm_output1"],
+                                     "Mean": ["layer_norm_output2"],
+                                     "Variance": ["layer_norm_output3"]
+                                 },
+                                 attrs={
+                                     'begin_norm_axis':
+                                     attrs[2]['begin_norm_axis'],
+                                     'epsilon': attrs[2]['epsilon']
+                                 })
 
         program_config = ProgramConfig(
             ops=[emb_op1, emb_op2, emb_op3, add_op1, add_op2, layer_norm_op],
@@ -241,36 +242,39 @@ def sample_predictor_configs(self, program_config):
             use_static=False,
             use_calib_mode=False)
         if program_config.ops[0].type == 'lookup_table':
-            config.set_trt_dynamic_shape_info({
-                "input_data1": [1, 4, 1],
-                "input_data2": [1, 4, 1],
-                "input_data3": [1, 4, 1]
-            }, {
-                "input_data1": [4, 512, 1],
-                "input_data2": [4, 512, 1],
-                "input_data3": [4, 512, 1]
-            }, {
-                "input_data1": [2, 128, 1],
-                "input_data2": [2, 128, 1],
-                "input_data3": [2, 128, 1]
-            })
+            config.set_trt_dynamic_shape_info(
+                {
+                    "input_data1": [1, 4, 1],
+                    "input_data2": [1, 4, 1],
+                    "input_data3": [1, 4, 1]
+                }, {
+                    "input_data1": [4, 512, 1],
+                    "input_data2": [4, 512, 1],
+                    "input_data3": [4, 512, 1]
+                }, {
+                    "input_data1": [2, 128, 1],
+                    "input_data2": [2, 128, 1],
+                    "input_data3": [2, 128, 1]
+                })
         else:
-            config.set_trt_dynamic_shape_info({
-                "input_data1": [1, 4],
-                "input_data2": [1, 4],
-                "input_data3": [1, 4]
-            }, {
-                "input_data1": [4, 512],
-                "input_data2": [4, 512],
-                "input_data3": [4, 512]
-            }, {
-                "input_data1": [2, 128],
-                "input_data2": [2, 128],
-                "input_data3": [2, 128]
-            })
+            config.set_trt_dynamic_shape_info(
+                {
+                    "input_data1": [1, 4],
+                    "input_data2": [1, 4],
+                    "input_data3": [1, 4]
+                }, {
+                    "input_data1": [4, 512],
+                    "input_data2": [4, 512],
+                    "input_data3": [4, 512]
+                }, {
+                    "input_data1": [2, 128],
+                    "input_data2": [2, 128],
+                    "input_data3": [2, 128]
+                })
         yield config, ['fused_embedding_eltwise_layernorm'], (1e-5, 1e-5)
 
     def add_ignore_pass_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[3].attrs['axis'] in [
                     -1, 2
@@ -287,11 +291,10 @@ def teller1(program_config, predictor_config):
 
     def test(self):
         # this fuse need to fix, now there's no program can ran successfully
-        self.run_and_statis(
-            quant=False,
-            max_examples=50,
-            passes=["embedding_eltwise_layernorm_fuse_pass"],
-            min_success_num=0)
+        self.run_and_statis(quant=False,
+                            max_examples=50,
+                            passes=["embedding_eltwise_layernorm_fuse_pass"],
+                            min_success_num=0)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py
index 26f91092d2af8..c6be25f9ff0a3 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py
@@ -48,66 +48,74 @@ def sample_predictor_configs(self, program_config):
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of fc
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=5))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=5))
         x_shape = [2, 1]
         x_rank = len(x_shape)
         # 2. Generate attr:in_num_col_dims of fc
         in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1))
         # 3. Generate legal shape of input:W/bias of fc
         w_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         w_shape[0] = int(np.prod(x_shape[in_num_col_dims:]))
         w_shape = [1, 2]
-        fc_bias_shape = [w_shape[1], ]
+        fc_bias_shape = [
+            w_shape[1],
+        ]
         if draw(st.booleans()):
             fc_bias_shape.insert(0, 1)
-        fc_bias_shape = [2, ]
+        fc_bias_shape = [
+            2,
+        ]
         fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:]
         # 4. Generate legal attr:axis/shape of elementwise_add
         add_bias_shape = fc_out_shape[:]
         axis = draw(st.integers(min_value=-1, max_value=0))
         # 5. Generate legal shape of layer_norm
         begin_norm_axis = draw(
-            st.integers(
-                min_value=1, max_value=len(fc_out_shape) - 1))
+            st.integers(min_value=1, max_value=len(fc_out_shape) - 1))
         layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))]
         epsilon = 1e-5
 
         fc_op = OpConfig(
             "fc",
-            inputs={"Input": ["fc_x"],
-                    "W": ["fc_w"],
-                    "Bias": ["fc_bias"]},
+            inputs={
+                "Input": ["fc_x"],
+                "W": ["fc_w"],
+                "Bias": ["fc_bias"]
+            },
             outputs={"Out": ["fc_out"]},
             in_num_col_dims=in_num_col_dims,
             padding_weights=False,
             activation_type="",
             use_quantizer=False,
-            use_mkldnn=False, )
+            use_mkldnn=False,
+        )
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["fc_out"],
-                    "Y": ["add_bias"]},
-            outputs={"Out": ["add_out"]},
-            axis=axis, )
-        layer_norm_op = OpConfig(
-            "layer_norm",
             inputs={
-                "X": ["add_out"],
-                "Scale": ["scale"],
-                "Bias": ["layer_norm_bias"]
-            },
-            outputs={
-                "Y": ["layer_norm_out"],
-                "Mean": ["layer_norm_mean"],
-                "Variance": ["layer_norm_var"]
+                "X": ["fc_out"],
+                "Y": ["add_bias"]
             },
-            begin_norm_axis=begin_norm_axis,
-            epsilon=epsilon)
+            outputs={"Out": ["add_out"]},
+            axis=axis,
+        )
+        layer_norm_op = OpConfig("layer_norm",
+                                 inputs={
+                                     "X": ["add_out"],
+                                     "Scale": ["scale"],
+                                     "Bias": ["layer_norm_bias"]
+                                 },
+                                 outputs={
+                                     "Y": ["layer_norm_out"],
+                                     "Mean": ["layer_norm_mean"],
+                                     "Variance": ["layer_norm_var"]
+                                 },
+                                 begin_norm_axis=begin_norm_axis,
+                                 epsilon=epsilon)
 
         ops = [fc_op, add_op, layer_norm_op]
         program_config = ProgramConfig(
@@ -119,15 +127,17 @@ def sample_program_config(self, draw):
                 "scale": TensorConfig(shape=layer_norm_shape),
                 "layer_norm_bias": TensorConfig(shape=layer_norm_shape),
             },
-            inputs={"fc_x": TensorConfig(shape=x_shape), },
-            outputs=ops[-1].outputs["Y"], )
+            inputs={
+                "fc_x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Y"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=300,
-            passes=["fc_elementwise_layernorm_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=300,
+                            passes=["fc_elementwise_layernorm_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
index dccc29e75f036..86262aaee102a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
@@ -106,19 +106,18 @@ def is_program_valid(self, prog_config):
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of mul
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=4))
         # 2. Generate attr:x_num_col_dims/y_num_col_dims of mul
         x_num_col_dims = draw(
-            st.integers(
-                min_value=1, max_value=len(x_shape) - 1))
+            st.integers(min_value=1, max_value=len(x_shape) - 1))
         y_num_col_dims = 1
         # 3. Generate legal shape of input:Y of mul
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = int(np.prod(x_shape[x_num_col_dims:]))
         # 4. Generate legal attr:axis of elementwise_add
         mul_out_shape = x_shape[:x_num_col_dims] + y_shape[1:]
@@ -131,8 +130,7 @@ def sample_program_config(self, draw):
         else:
             max_bias_rank = 1
             bias_rank = draw(
-                st.integers(
-                    min_value=1, max_value=len(mul_out_shape)))
+                st.integers(min_value=1, max_value=len(mul_out_shape)))
             bias_shape = mul_out_shape[-1 * bias_rank:]
         # 6. Random choose if use broadcast for elementwise_add, e.g [3, 4] -> [1, 4]
         if draw(st.booleans()):
@@ -153,23 +151,28 @@ def sample_program_config(self, draw):
         # Use function `add_skip_pass_case` to ignore the programs even if they cause bug while runing
         mul_op = OpConfig(
             "mul",
-            inputs={"X": ["mul_x"],
-                    "Y": ["mul_y"]},
+            inputs={
+                "X": ["mul_x"],
+                "Y": ["mul_y"]
+            },
             outputs={"Out": ["mul_out"]},
             x_num_col_dims=x_num_col_dims,
-            y_num_col_dims=y_num_col_dims, )
+            y_num_col_dims=y_num_col_dims,
+        )
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["mul_out"],
-                    "Y": ["bias"]},
+            inputs={
+                "X": ["mul_out"],
+                "Y": ["bias"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=axis, )
+            axis=axis,
+        )
         ops = [mul_op, add_op]
         if has_relu:
-            relu_op = OpConfig(
-                "relu",
-                inputs={"X": ["add_out"]},
-                outputs={"Out": ["relu_out"]})
+            relu_op = OpConfig("relu",
+                               inputs={"X": ["add_out"]},
+                               outputs={"Out": ["relu_out"]})
             ops.append(relu_op)
         program_config = ProgramConfig(
             ops=ops,
@@ -177,13 +180,17 @@ def sample_program_config(self, draw):
                 "mul_y": TensorConfig(shape=y_shape),
                 "bias": TensorConfig(shape=bias_shape),
             },
-            inputs={"mul_x": TensorConfig(shape=x_shape), },
-            outputs=ops[-1].outputs["Out"], )
+            inputs={
+                "mul_x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False, max_examples=500, passes=["fc_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=500,
+                            passes=["fc_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
index f7b43470d402f..3da1516e974f9 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
@@ -21,26 +21,28 @@
 
 
 class FcGruFusePassTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             dict_dim, emb_dim = 128, 64
-            data = fluid.data(
-                name='step_data', shape=[None], dtype='int64', lod_level=1)
+            data = fluid.data(name='step_data',
+                              shape=[None],
+                              dtype='int64',
+                              lod_level=1)
             emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim = 512
             x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
-            hidden = fluid.layers.dynamic_gru(
-                input=x,
-                size=hidden_dim,
-                bias_attr=True,
-                origin_mode=False,
-                is_reverse=True)
+            hidden = fluid.layers.dynamic_gru(input=x,
+                                              size=hidden_dim,
+                                              bias_attr=True,
+                                              origin_mode=False,
+                                              is_reverse=True)
 
         batch = 16
         lod_tensor = fluid.LoDTensor()
-        lod_tensor.set(np.random.randint(
-            0, dict_dim, size=[batch]).astype("int64"),
-                       fluid.CPUPlace())
+        lod_tensor.set(
+            np.random.randint(0, dict_dim, size=[batch]).astype("int64"),
+            fluid.CPUPlace())
         lod_tensor.set_lod([[0, batch]])
         self.feeds = {"step_data": lod_tensor}
         self.fetch_list = [hidden]
@@ -52,26 +54,28 @@ def test_check_output(self):
 
 
 class MulGruFusePassTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             dict_dim, emb_dim = 128, 64
-            data = fluid.data(
-                name='step_data', shape=[None], dtype='int64', lod_level=1)
+            data = fluid.data(name='step_data',
+                              shape=[None],
+                              dtype='int64',
+                              lod_level=1)
             emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim = 512
             x = fluid.layers.fc(input=emb, size=hidden_dim * 3, bias_attr=False)
-            hidden = fluid.layers.dynamic_gru(
-                input=x,
-                size=hidden_dim,
-                bias_attr=True,
-                origin_mode=False,
-                is_reverse=True)
+            hidden = fluid.layers.dynamic_gru(input=x,
+                                              size=hidden_dim,
+                                              bias_attr=True,
+                                              origin_mode=False,
+                                              is_reverse=True)
 
         batch = 16
         lod_tensor = fluid.LoDTensor()
-        lod_tensor.set(np.random.randint(
-            0, dict_dim, size=[batch]).astype("int64"),
-                       fluid.CPUPlace())
+        lod_tensor.set(
+            np.random.randint(0, dict_dim, size=[batch]).astype("int64"),
+            fluid.CPUPlace())
         lod_tensor.set_lod([[0, batch]])
         self.feeds = {"step_data": lod_tensor}
         self.fetch_list = [hidden]
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
index fbb4373dae2c4..4db2c5f96cd97 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
@@ -21,23 +21,26 @@
 
 
 class MulLstmFusePassTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             dict_dim, emb_dim = 128, 64
             hidden_dim = 512
 
-            data = fluid.data(
-                name='data', shape=[1], dtype='int64', lod_level=1)
+            data = fluid.data(name='data',
+                              shape=[1],
+                              dtype='int64',
+                              lod_level=1)
             emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
             x = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False)
-            forward, cell = fluid.layers.dynamic_lstm(
-                input=x, size=hidden_dim * 4)
+            forward, cell = fluid.layers.dynamic_lstm(input=x,
+                                                      size=hidden_dim * 4)
 
         batch = 16
         lod_tensor = fluid.LoDTensor()
-        lod_tensor.set(np.random.randint(
-            0, dict_dim, size=[batch]).astype("int64"),
-                       fluid.CPUPlace())
+        lod_tensor.set(
+            np.random.randint(0, dict_dim, size=[batch]).astype("int64"),
+            fluid.CPUPlace())
         lod_tensor.set_lod([[0, batch]])
         self.feeds = {"data": lod_tensor}
         self.fetch_list = [forward, cell]
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py
index ba99ac306c700..181ed89c65ee6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py
@@ -50,9 +50,9 @@ def sample_predictor_configs(self, program_config):
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of flatten2
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=10), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=10),
+                     min_size=4,
+                     max_size=4))
         # [a, b, c, d] => [a, b*c*d]
         flatten_axis = 1
         flatten_shape = [x_shape[0], x_shape[1] * x_shape[2] * x_shape[3]]
@@ -64,15 +64,17 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of matmul
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = flatten_shape[1]
 
         # 4. Generate legal attr:axis of elementwise_add
         axis = draw(st.integers(min_value=-1, max_value=1))
         if axis == 0:
-            bias_shape = [flatten_shape[0], ]
+            bias_shape = [
+                flatten_shape[0],
+            ]
         elif axis == 1:
             bias_shape = [y_shape[1]]
         else:
@@ -82,14 +84,21 @@ def sample_program_config(self, draw):
 
         flatten2_op = OpConfig(
             "flatten2",
-            inputs={"X": ["flatten2_x"], },
+            inputs={
+                "X": ["flatten2_x"],
+            },
             axis=flatten_axis,
-            outputs={"Out": ["flatten2_out"],
-                     "XShape": ["xshape"]}, )
+            outputs={
+                "Out": ["flatten2_out"],
+                "XShape": ["xshape"]
+            },
+        )
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["flatten2_out"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["flatten2_out"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             transpose_X=transpose_X,
@@ -99,14 +108,18 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
+            fused_transpose_Out=[],
+        )
 
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["matmul_out"],
-                    "Y": ["bias"]},
+            inputs={
+                "X": ["matmul_out"],
+                "Y": ["bias"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=axis, )
+            axis=axis,
+        )
 
         ops = [flatten2_op, matmul_op, add_op]
 
@@ -117,8 +130,11 @@ def sample_program_config(self, draw):
                     "matmul_y": TensorConfig(shape=y_shape),
                     "bias": TensorConfig(shape=bias_shape),
                 },
-                inputs={"flatten2_x": TensorConfig(shape=x_shape), },
-                outputs=ops[-1].outputs["Out"], )
+                inputs={
+                    "flatten2_x": TensorConfig(shape=x_shape),
+                },
+                outputs=ops[-1].outputs["Out"],
+            )
         else:
             program_config = ProgramConfig(
                 ops=ops,
@@ -128,15 +144,15 @@ def sample_program_config(self, draw):
                     "matmul_y": TensorConfig(shape=y_shape),
                     "bias": TensorConfig(shape=bias_shape),
                 },
-                outputs=ops[-1].outputs["Out"], )
+                outputs=ops[-1].outputs["Out"],
+            )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=50,
-            max_duration=1000,
-            passes=["gpu_cpu_flatten2_matmul_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=50,
+                            max_duration=1000,
+                            passes=["gpu_cpu_flatten2_matmul_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py
index 8cacb6d29af0d..f7265193a85ba 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py
@@ -21,6 +21,7 @@
 
 
 class TestIdentityScaleCleanPass(PassAutoScanTest):
+
     def sample_predictor_configs(self, program_config):
         config = self.create_trt_inference_config()
         config.enable_tensorrt_engine(
@@ -39,15 +40,15 @@ def sample_program_config(self, draw):
         h = draw(st.integers(min_value=1, max_value=20))
         w = draw(st.integers(min_value=1, max_value=20))
 
-        relu_op = OpConfig(
-            "relu", inputs={"X": ["relu_x"]}, outputs={"Out": ["relu_out"]})
-        scale_op = OpConfig(
-            "scale",
-            inputs={"X": ["relu_out"]},
-            outputs={"Out": ["scale_out"]},
-            bias=0.,
-            scale=1.,
-            bias_after_scale=True)
+        relu_op = OpConfig("relu",
+                           inputs={"X": ["relu_x"]},
+                           outputs={"Out": ["relu_out"]})
+        scale_op = OpConfig("scale",
+                            inputs={"X": ["relu_out"]},
+                            outputs={"Out": ["scale_out"]},
+                            bias=0.,
+                            scale=1.,
+                            bias_after_scale=True)
         program_config = ProgramConfig(
             ops=[relu_op, scale_op],
             weights={},
@@ -56,8 +57,8 @@ def sample_program_config(self, draw):
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            max_examples=25, passes=["identity_scale_op_clean_pass"])
+        self.run_and_statis(max_examples=25,
+                            passes=["identity_scale_op_clean_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_layer_norm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_layer_norm_fuse_pass.py
index 7409bf17f3c12..a56ce98b37aa6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_layer_norm_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_layer_norm_fuse_pass.py
@@ -75,21 +75,21 @@ def teller1(program_config, predictor_config):
         self.add_ignore_check_case(
             teller1,
             IgnoreReasons.PASS_ACCURACY_ERROR,
-            "Use bad case to test pass.", )
+            "Use bad case to test pass.",
+        )
 
     def sample_program_config(self, draw):
-        # 1. Generate shape of input:X 
+        # 1. Generate shape of input:X
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=4, max_size=5))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=4,
+                     max_size=5))
         x_shape_rank = len(x_shape)
         # 2. Generate attrs of reduce_mean
         keep_dim = draw(st.booleans())
         reduce_all = False
         begin_norm_axis = draw(
-            st.integers(
-                min_value=1, max_value=x_shape_rank - 1))
+            st.integers(min_value=1, max_value=x_shape_rank - 1))
         if begin_norm_axis == x_shape_rank - 1 and draw(st.booleans()):
             reduce_mean_dim = [-1]
         else:
@@ -98,7 +98,9 @@ def sample_program_config(self, draw):
         error_test_ratio = draw(st.integers(min_value=1, max_value=10))
         if error_test_ratio > 9:
             keep_dim = True
-            reduce_mean_dim = [1, ]
+            reduce_mean_dim = [
+                1,
+            ]
         elif error_test_ratio > 8:
             keep_dim = True
             begin_norm_axis = 1
@@ -111,20 +113,22 @@ def sample_program_config(self, draw):
         pow_axis = -1
 
         def generate_pow_data():
-            return np.array([2, ], dtype="float32")
+            return np.array([
+                2,
+            ], dtype="float32")
 
         # 5. Generate attrs of elementwise_add
         if keep_dim:
             add_axis = draw(
-                st.integers(
-                    min_value=-1, max_value=x_shape_rank - 1))
+                st.integers(min_value=-1, max_value=x_shape_rank - 1))
         else:
             add_axis = draw(
-                st.integers(
-                    min_value=-1, max_value=begin_norm_axis - 1))
+                st.integers(min_value=-1, max_value=begin_norm_axis - 1))
 
         def generate_epsilon_data():
-            return np.array([1e-5, ], dtype="float32")
+            return np.array([
+                1e-5,
+            ], dtype="float32")
 
         # 6. Generate attrs of elementwise_div
         div_axis = 0
@@ -142,58 +146,85 @@ def generate_epsilon_data():
 
         mean_op1 = OpConfig(
             "reduce_mean",
-            inputs={"X": ["x"], },
+            inputs={
+                "X": ["x"],
+            },
             outputs={"Out": ["mean_out"]},
             dim=reduce_mean_dim,
             keep_dim=keep_dim,
-            reduce_all=reduce_all, )
+            reduce_all=reduce_all,
+        )
         sub_op = OpConfig(
             "elementwise_sub",
-            inputs={"X": ["x"],
-                    "Y": ["mean_out"]},
+            inputs={
+                "X": ["x"],
+                "Y": ["mean_out"]
+            },
             outputs={"Out": ["sub_out"]},
-            axis=sub_axis, )
+            axis=sub_axis,
+        )
         pow_op = OpConfig(
             "elementwise_pow",
-            inputs={"X": ["sub_out"],
-                    "Y": ["pow_y"]},
+            inputs={
+                "X": ["sub_out"],
+                "Y": ["pow_y"]
+            },
             outputs={"Out": ["pow_out"]},
-            axis=pow_axis, )
+            axis=pow_axis,
+        )
         mean_op2 = OpConfig(
             "reduce_mean",
-            inputs={"X": ["pow_out"], },
+            inputs={
+                "X": ["pow_out"],
+            },
             outputs={"Out": ["mean_out2"]},
             dim=reduce_mean_dim,
             keep_dim=keep_dim,
-            reduce_all=reduce_all, )
+            reduce_all=reduce_all,
+        )
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["mean_out2"],
-                    "Y": ["epsilon_var"]},
+            inputs={
+                "X": ["mean_out2"],
+                "Y": ["epsilon_var"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=add_axis, )
+            axis=add_axis,
+        )
         sqrt_op = OpConfig(
             "sqrt",
-            inputs={"X": ["add_out"], },
-            outputs={"Out": ["sqrt_out"]}, )
+            inputs={
+                "X": ["add_out"],
+            },
+            outputs={"Out": ["sqrt_out"]},
+        )
         div_op = OpConfig(
             "elementwise_div",
-            inputs={"X": ["sub_out"],
-                    "Y": ["sqrt_out"]},
+            inputs={
+                "X": ["sub_out"],
+                "Y": ["sqrt_out"]
+            },
             outputs={"Out": ["div_out"]},
-            axis=div_axis, )
+            axis=div_axis,
+        )
         mul_op = OpConfig(
             "elementwise_mul",
-            inputs={"X": ["div_out"],
-                    "Y": ["gamma_var"]},
+            inputs={
+                "X": ["div_out"],
+                "Y": ["gamma_var"]
+            },
             outputs={"Out": ["mul_out"]},
-            axis=mul_axis, )
+            axis=mul_axis,
+        )
         add_op2 = OpConfig(
             "elementwise_add",
-            inputs={"X": ["mul_out"],
-                    "Y": ["beta_var"]},
+            inputs={
+                "X": ["mul_out"],
+                "Y": ["beta_var"]
+            },
             outputs={"Out": ["add_out2"]},
-            axis=add_axis2, )
+            axis=add_axis2,
+        )
 
         ops = [
             mean_op1, sub_op, pow_op, mean_op2, add_op, sqrt_op, div_op, mul_op,
@@ -208,15 +239,19 @@ def generate_epsilon_data():
                 "gamma_var": TensorConfig(shape=gamma_shape),
                 "beta_var": TensorConfig(shape=beta_shape),
             },
-            inputs={"x": TensorConfig(shape=x_shape), },
-            outputs=ops[-1].outputs["Out"], )
+            inputs={
+                "x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
         self.run_and_statis(
             quant=False,
             max_examples=300,
-            passes=["layer_norm_fuse_pass"], )
+            passes=["layer_norm_fuse_pass"],
+        )
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py
index ce695ec2f01bf..2dc0556e9e21f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py
@@ -35,11 +35,15 @@ class TestMapMatmulToMulPass(PassAutoScanTest):
     def sample_predictor_configs(self, program_config):
         # cpu
         config = self.create_inference_config(use_gpu=False)
-        yield config, ["mul", ], (1e-5, 1e-5)
+        yield config, [
+            "mul",
+        ], (1e-5, 1e-5)
 
         # for gpu
         config = self.create_inference_config(use_gpu=True)
-        yield config, ["mul", ], (1e-5, 1e-5)
+        yield config, [
+            "mul",
+        ], (1e-5, 1e-5)
 
         # TRT
         # config = self.create_trt_inference_config()
@@ -75,13 +79,13 @@ def teller1(program_config, predictor_config):
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of matmul
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=5))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=5))
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = x_shape[-1]
         alpha = 1.0
         transpose_X = False
@@ -89,8 +93,10 @@ def sample_program_config(self, draw):
 
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["matmul_x"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["matmul_x"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             transpose_X=transpose_X,
@@ -100,24 +106,31 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
-
-        ops = [matmul_op, ]
-        weights = {"matmul_y": TensorConfig(shape=y_shape), }
-        inputs = {"matmul_x": TensorConfig(shape=x_shape), }
+            fused_transpose_Out=[],
+        )
+
+        ops = [
+            matmul_op,
+        ]
+        weights = {
+            "matmul_y": TensorConfig(shape=y_shape),
+        }
+        inputs = {
+            "matmul_x": TensorConfig(shape=x_shape),
+        }
         program_config = ProgramConfig(
             ops=ops,
             weights=weights,
             inputs=inputs,
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=100,
-            passes=["gpu_cpu_map_matmul_to_mul_pass"],
-            max_duration=180)
+        self.run_and_statis(quant=False,
+                            max_examples=100,
+                            passes=["gpu_cpu_map_matmul_to_mul_pass"],
+                            max_duration=180)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py
index fac8b710c8ca4..2f0de50610f8e 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py
@@ -35,11 +35,15 @@ class TestMapMatmulToMulPass(PassAutoScanTest):
     def sample_predictor_configs(self, program_config):
         # cpu
         config = self.create_inference_config(use_gpu=False)
-        yield config, ["matmul", ], (1e-5, 1e-5)
+        yield config, [
+            "matmul",
+        ], (1e-5, 1e-5)
 
         # for gpu
         config = self.create_inference_config(use_gpu=True)
-        yield config, ["matmul", ], (1e-5, 1e-5)
+        yield config, [
+            "matmul",
+        ], (1e-5, 1e-5)
 
         # TRT
         # config = self.create_trt_inference_config()
@@ -71,13 +75,13 @@ def teller1(program_config, predictor_config):
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of matmul
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=5))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=5))
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         transpose_X = draw(st.booleans())
         transpose_Y = draw(st.booleans())
         if transpose_X:
@@ -96,8 +100,10 @@ def sample_program_config(self, draw):
 
         matmul_op = OpConfig(
             "matmul_v2",
-            inputs={"X": ["matmul_x"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["matmul_x"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             trans_x=transpose_X,
@@ -107,9 +113,12 @@ def sample_program_config(self, draw):
             fused_reshape_X=[],
             fused_reshape_Y=[],
             fused_transpose_X=[],
-            fused_transpose_Y=[], )
+            fused_transpose_Y=[],
+        )
 
-        ops = [matmul_op, ]
+        ops = [
+            matmul_op,
+        ]
         weights = {}
         inputs = {
             "matmul_x": TensorConfig(shape=x_shape),
@@ -120,14 +129,14 @@ def sample_program_config(self, draw):
             ops=ops,
             weights=weights,
             inputs=inputs,
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=100,
-            passes=["gpu_cpu_map_matmul_v2_to_matmul_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=100,
+                            passes=["gpu_cpu_map_matmul_v2_to_matmul_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py
index e8a37ebc7ea09..d8dd7a0eac93d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py
@@ -35,11 +35,15 @@ class TestMapMatmulToMulPass(PassAutoScanTest):
     def sample_predictor_configs(self, program_config):
         # cpu
         config = self.create_inference_config(use_gpu=False)
-        yield config, ["mul", ], (1e-5, 1e-5)
+        yield config, [
+            "mul",
+        ], (1e-5, 1e-5)
 
         # for gpu
         config = self.create_inference_config(use_gpu=True)
-        yield config, ["mul", ], (1e-5, 1e-5)
+        yield config, [
+            "mul",
+        ], (1e-5, 1e-5)
 
         # TRT
         # config = self.create_trt_inference_config()
@@ -71,13 +75,13 @@ def teller1(program_config, predictor_config):
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of matmul
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=5))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=5))
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = x_shape[-1]
         alpha = 1.0
         transpose_X = False
@@ -85,8 +89,10 @@ def sample_program_config(self, draw):
 
         matmul_op = OpConfig(
             "matmul_v2",
-            inputs={"X": ["matmul_x"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["matmul_x"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             trans_x=transpose_X,
@@ -96,23 +102,30 @@ def sample_program_config(self, draw):
             fused_reshape_X=[],
             fused_reshape_Y=[],
             fused_transpose_X=[],
-            fused_transpose_Y=[], )
-
-        ops = [matmul_op, ]
-        weights = {"matmul_y": TensorConfig(shape=y_shape), }
-        inputs = {"matmul_x": TensorConfig(shape=x_shape), }
+            fused_transpose_Y=[],
+        )
+
+        ops = [
+            matmul_op,
+        ]
+        weights = {
+            "matmul_y": TensorConfig(shape=y_shape),
+        }
+        inputs = {
+            "matmul_x": TensorConfig(shape=x_shape),
+        }
         program_config = ProgramConfig(
             ops=ops,
             weights=weights,
             inputs=inputs,
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=100,
-            passes=["gpu_cpu_map_matmul_v2_to_mul_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=100,
+                            passes=["gpu_cpu_map_matmul_v2_to_mul_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py
index 9c10ff18fa1f1..7e3ddf95fb764 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py
@@ -37,25 +37,27 @@ class TestMatmulScaleFusePass(PassAutoScanTest):
     def sample_predictor_configs(self, program_config):
         # cpu
         config = self.create_inference_config(use_gpu=False)
-        yield config, ["matmul", ], (1e-5, 1e-5)
+        yield config, [
+            "matmul",
+        ], (1e-5, 1e-5)
 
         # mkldnn
         config = self.create_inference_config(use_mkldnn=True)
-        yield config, ["matmul", ], (1e-5, 1e-5)
+        yield config, [
+            "matmul",
+        ], (1e-5, 1e-5)
 
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of matmul
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=5))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=5))
         x_shape_rank = len(x_shape)
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8),
-                min_size=x_shape_rank,
-                max_size=x_shape_rank))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=x_shape_rank,
+                     max_size=x_shape_rank))
         y_shape_rank = len(y_shape)
         y_shape[-2] = x_shape[-1]
         for i in range(y_shape_rank - 3, -1, -1):
@@ -73,8 +75,10 @@ def sample_program_config(self, draw):
 
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["matmul_x"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["matmul_x"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             transpose_X=transpose_X,
             transpose_Y=transpose_Y,
@@ -85,25 +89,32 @@ def sample_program_config(self, draw):
             fused_transpose_Y=[],
             fused_reshape_Out=[],
             fused_transpose_Out=[],
-            head_number=1, )
+            head_number=1,
+        )
         is_scale_tensor = draw(st.booleans())
         if is_scale_tensor:
             scale_op = OpConfig(
                 "scale",
-                inputs={"X": ["matmul_out"],
-                        "ScaleTensor": ["scale_tensor"]},
+                inputs={
+                    "X": ["matmul_out"],
+                    "ScaleTensor": ["scale_tensor"]
+                },
                 outputs={"Out": ["scale_out"]},
                 scale=scale_value,
                 bias=0.0,
-                bias_after_scale=draw(st.booleans()), )
+                bias_after_scale=draw(st.booleans()),
+            )
         else:
             scale_op = OpConfig(
                 "scale",
-                inputs={"X": ["matmul_out"], },
+                inputs={
+                    "X": ["matmul_out"],
+                },
                 outputs={"Out": ["scale_out"]},
                 scale=scale_value,
                 bias=0.0,
-                bias_after_scale=draw(st.booleans()), )
+                bias_after_scale=draw(st.booleans()),
+            )
 
         ops = [matmul_op, scale_op]
         weights = {}
@@ -113,7 +124,9 @@ def sample_program_config(self, draw):
                 "matmul_y": TensorConfig(shape=y_shape),
                 "scale_tensor": TensorConfig(shape=scale_shape)
             }
-            inputs = {"matmul_x": TensorConfig(shape=x_shape), }
+            inputs = {
+                "matmul_x": TensorConfig(shape=x_shape),
+            }
         else:
             inputs = {
                 "matmul_x": TensorConfig(shape=x_shape),
@@ -124,14 +137,16 @@ def sample_program_config(self, draw):
             ops=ops,
             weights=weights,
             inputs=inputs,
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
         self.run_and_statis(
             quant=False,
             max_examples=100,
-            passes=["matmul_scale_fuse_pass"], )
+            passes=["matmul_scale_fuse_pass"],
+        )
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py
index 47bd5623646a7..52da377599d33 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py
@@ -43,21 +43,21 @@ def sample_predictor_configs(self, program_config):
 
         # mkldnn
         config = self.create_inference_config(use_mkldnn=True)
-        yield config, ["matmul_v2", ], (1e-5, 1e-5)
+        yield config, [
+            "matmul_v2",
+        ], (1e-5, 1e-5)
 
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of matmul
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=5))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=5))
         x_shape_rank = len(x_shape)
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8),
-                min_size=x_shape_rank,
-                max_size=x_shape_rank))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=x_shape_rank,
+                     max_size=x_shape_rank))
         y_shape_rank = len(y_shape)
         y_shape[-2] = x_shape[-1]
         for i in range(y_shape_rank - 3, -1, -1):
@@ -74,8 +74,10 @@ def sample_program_config(self, draw):
 
         matmul_v2_op = OpConfig(
             "matmul_v2",
-            inputs={"X": ["matmul_x"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["matmul_x"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             trans_x=transpose_X,
             trans_y=transpose_Y,
@@ -84,43 +86,56 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
+            fused_transpose_Out=[],
+        )
         is_scale_tensor = draw(st.booleans())
         if is_scale_tensor:
             scale_op = OpConfig(
                 "scale",
-                inputs={"X": ["matmul_out"],
-                        "ScaleTensor": ["scale_tensor"]},
+                inputs={
+                    "X": ["matmul_out"],
+                    "ScaleTensor": ["scale_tensor"]
+                },
                 outputs={"Out": ["scale_out"]},
                 scale=scale_value,
                 bias=0.0,
-                bias_after_scale=draw(st.booleans()), )
+                bias_after_scale=draw(st.booleans()),
+            )
         else:
             scale_op = OpConfig(
                 "scale",
-                inputs={"X": ["matmul_out"], },
+                inputs={
+                    "X": ["matmul_out"],
+                },
                 outputs={"Out": ["scale_out"]},
                 scale=scale_value,
                 bias=0.0,
-                bias_after_scale=draw(st.booleans()), )
+                bias_after_scale=draw(st.booleans()),
+            )
 
         ops = [matmul_v2_op, scale_op]
-        weights = {"matmul_y": TensorConfig(shape=y_shape), }
+        weights = {
+            "matmul_y": TensorConfig(shape=y_shape),
+        }
         if is_scale_tensor:
             weights["scale_tensor"] = TensorConfig(shape=scale_shape)
-        inputs = {"matmul_x": TensorConfig(shape=x_shape), }
+        inputs = {
+            "matmul_x": TensorConfig(shape=x_shape),
+        }
         program_config = ProgramConfig(
             ops=ops,
             weights=weights,
             inputs=inputs,
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
         self.run_and_statis(
             quant=False,
             max_examples=100,
-            passes=["matmul_v2_scale_fuse_pass"], )
+            passes=["matmul_v2_scale_fuse_pass"],
+        )
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py
index 0012ebb05b162..6c17db2caa476 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestScaleMatmulMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -58,39 +59,40 @@ def generate_input():
         def generate_weight():
             return np.random.random(channel).astype(np.float32)
 
-        batch_norm_op = OpConfig(
-            type="batch_norm",
-            inputs={
-                "X": ["input_data"],
-                "Bias": ["Bias"],
-                "Mean": ["Mean"],
-                "Scale": ["Scale"],
-                "Variance": ["Variance"]
-            },
-            outputs={
-                "Y": ["norm_output"],
-                "MeanOut": ["Mean"],
-                "VarianceOut": ["Variance"],
-                "SavedMean": ["SavedMean"],
-                "SavedVariance": ["SavedVariance"]
-            },
-            attrs={
-                "data_layout": data_layout,
-                "epsilon": epsilon,
-                "fuse_with_relu": fuse_with_relu,
-                "is_test": is_test,
-                "momentum": momentum,
-                "trainable_statistics": trainable_statistics,
-                "use_global_stats": use_global_stats,
-                "use_mkldnn": use_mkldnn1
-            })
-
-        relu_op = OpConfig(
-            type="relu",
-            inputs={"X": ["norm_output"]},
-            outputs={"Out": ["relu_output"]},
-            attrs={"use_cudnn": use_cudnn,
-                   "use_mkldnn": use_mkldnn2})
+        batch_norm_op = OpConfig(type="batch_norm",
+                                 inputs={
+                                     "X": ["input_data"],
+                                     "Bias": ["Bias"],
+                                     "Mean": ["Mean"],
+                                     "Scale": ["Scale"],
+                                     "Variance": ["Variance"]
+                                 },
+                                 outputs={
+                                     "Y": ["norm_output"],
+                                     "MeanOut": ["Mean"],
+                                     "VarianceOut": ["Variance"],
+                                     "SavedMean": ["SavedMean"],
+                                     "SavedVariance": ["SavedVariance"]
+                                 },
+                                 attrs={
+                                     "data_layout": data_layout,
+                                     "epsilon": epsilon,
+                                     "fuse_with_relu": fuse_with_relu,
+                                     "is_test": is_test,
+                                     "momentum": momentum,
+                                     "trainable_statistics":
+                                     trainable_statistics,
+                                     "use_global_stats": use_global_stats,
+                                     "use_mkldnn": use_mkldnn1
+                                 })
+
+        relu_op = OpConfig(type="relu",
+                           inputs={"X": ["norm_output"]},
+                           outputs={"Out": ["relu_output"]},
+                           attrs={
+                               "use_cudnn": use_cudnn,
+                               "use_mkldnn": use_mkldnn2
+                           })
 
         model_net = [batch_norm_op, relu_op]
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py
index ae0ac6a3ecd43..3556e5ef3346d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestConv3dBiasMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -48,8 +49,8 @@ def generate_input1(attrs):
                     [attrs[2]['batch_size'], 64, 32, 64, 48]).astype(np.float32)
 
         def generate_weight1():
-            return np.random.random(
-                [16, int(48 / groups), 3, 3, 3]).astype(np.float32)
+            return np.random.random([16, int(48 / groups), 3, 3,
+                                     3]).astype(np.float32)
 
         def generate_weight2():
             return np.random.random([16]).astype(np.float32)
@@ -104,7 +105,8 @@ def generate_weight2():
         program_config = ProgramConfig(
             ops=ops,
             weights={
-                "conv_weight": TensorConfig(data_gen=partial(generate_weight1)),
+                "conv_weight":
+                TensorConfig(data_gen=partial(generate_weight1)),
                 "elementwise_weight":
                 TensorConfig(data_gen=partial(generate_weight2))
             },
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py
index f6e668ed59097..7f75f8ddf4f7f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv3d_op.py
@@ -26,36 +26,41 @@
 
 
 class TestMkldnnConv3dOp(MkldnnAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input(*args, **kwargs):
             if kwargs["data_format"] == "NCDHW":
-                return np.random.random(
-                    [kwargs["batch_size"], 48, 64, 32, 64]).astype(np.float32)
+                return np.random.random([kwargs["batch_size"], 48, 64, 32,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [kwargs["batch_size"], 64, 32, 64, 48]).astype(np.float32)
+                return np.random.random([kwargs["batch_size"], 64, 32, 64,
+                                         48]).astype(np.float32)
 
         def generate_weight(*args, **kwargs):
-            return np.random.random(
-                [16, int(48 / kwargs["groups"]), 3, 3, 3]).astype(np.float32)
+            return np.random.random([16,
+                                     int(48 / kwargs["groups"]), 3, 3,
+                                     3]).astype(np.float32)
 
-        conv3d_op = OpConfig(
-            type="conv3d",
-            inputs={"Input": ["input_data"],
-                    "Filter": ["conv_weight"]},
-            outputs={"Output": ["conv_output"]},
-            attrs={
-                "data_format": kwargs["data_format"],
-                "dilations": kwargs["dilations"],
-                "padding_algorithm": kwargs["padding_algorithm"],
-                "groups": kwargs["groups"],
-                "paddings": kwargs["paddings"],
-                "strides": kwargs["strides"],
-                "is_test": True
-            })
+        conv3d_op = OpConfig(type="conv3d",
+                             inputs={
+                                 "Input": ["input_data"],
+                                 "Filter": ["conv_weight"]
+                             },
+                             outputs={"Output": ["conv_output"]},
+                             attrs={
+                                 "data_format": kwargs["data_format"],
+                                 "dilations": kwargs["dilations"],
+                                 "padding_algorithm":
+                                 kwargs["padding_algorithm"],
+                                 "groups": kwargs["groups"],
+                                 "paddings": kwargs["paddings"],
+                                 "strides": kwargs["strides"],
+                                 "is_test": True
+                             })
 
         program_config = ProgramConfig(
             ops=[conv3d_op],
@@ -82,8 +87,8 @@ def sample_predictor_configs(self, program_config):
         groups=st.sampled_from([2]),
         paddings=st.sampled_from([[0, 3, 2]]),
         strides=st.sampled_from([[1, 2, 1]]),
-        batch_size=st.integers(
-            min_value=1, max_value=4), )
+        batch_size=st.integers(min_value=1, max_value=4),
+    )
     def test(self, *args, **kwargs):
         self.run_test(*args, **kwargs)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
index 56cb0748a232b..645ca2202648d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
@@ -24,17 +24,18 @@
 
 
 class ConvActivationMkldnnFusePassTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
-            conv_out = fluid.layers.conv2d(
-                data,
-                num_filters=self.conv_num_filters,
-                filter_size=self.conv_filter_size,
-                bias_attr=self.conv_bias_attr,
-                act=self.act)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
+            conv_out = fluid.layers.conv2d(data,
+                                           num_filters=self.conv_num_filters,
+                                           filter_size=self.conv_filter_size,
+                                           bias_attr=self.conv_bias_attr,
+                                           act=self.act)
 
         self.feeds = {
             "data": np.random.random((1, 3, 100, 100)).astype("float32")
@@ -58,6 +59,7 @@ def test_pass_compatible(self):
 
 
 class ConvActivationMkldnnFusePassTest_1(ConvActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.conv_num_filters = 5
         self.conv_filter_size = 5
@@ -67,6 +69,7 @@ def set_params(self):
 
 
 class ConvActivationMkldnnFusePassTest_2(ConvActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.conv_num_filters = 3
         self.conv_filter_size = 3
@@ -76,6 +79,7 @@ def set_params(self):
 
 
 class ConvActivationMkldnnFusePassTest_3(ConvActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.conv_num_filters = 5
         self.conv_filter_size = 5
@@ -85,6 +89,7 @@ def set_params(self):
 
 
 class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.conv_num_filters = 3
         self.conv_filter_size = 3
@@ -94,6 +99,7 @@ def set_params(self):
 
 
 class ConvActivationMkldnnFusePassTest_5(ConvActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.conv_num_filters = 5
         self.conv_filter_size = 5
@@ -103,6 +109,7 @@ def set_params(self):
 
 
 class ConvActivationMkldnnFusePassTest_6(ConvActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.conv_num_filters = 5
         self.conv_filter_size = 5
@@ -112,6 +119,7 @@ def set_params(self):
 
 
 class ConvHardSigmoidOneDNNFusePassTest(ConvActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.conv_num_filters = 5
         self.conv_filter_size = 5
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py
index a35b75e69f812..89595f908171a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestConvAffineChannelFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -41,17 +42,17 @@ def sample_program_config(self, draw):
         out_channel = groups * out_channel_factor
         batch_size = draw(st.integers(min_value=1, max_value=4))
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=2),
+                     min_size=2,
+                     max_size=2))
         paddings = draw(
-            st.lists(
-                st.integers(
-                    min_value=0, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=0, max_value=2),
+                     min_size=2,
+                     max_size=2))
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=2), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=2),
+                     min_size=2,
+                     max_size=2))
         has_bias = draw(st.booleans())
 
         x_shape = [
@@ -73,30 +74,28 @@ def generate_bias():
         def generate_scale_bias():
             return np.random.random(bias_shape).astype(np.float32)
 
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs={
-                "Input": ["input_data"],
-                "Filter": ["conv2d_weight"],
-            },
-            outputs={"Output": ["conv_output"]},
-            data_format=data_format,
-            dilations=dilations,
-            padding_algorithm=padding_algorithm,
-            groups=groups,
-            paddings=paddings,
-            strides=strides,
-            has_bias=has_bias,
-            is_test=True)
-        ac_op = OpConfig(
-            "affine_channel",
-            inputs={
-                "X": ["conv_output"],
-                "Scale": ["affine_channel_scale"],
-                "Bias": ["affine_channel_bias"]
-            },
-            outputs={"Out": ["affine_channel_ouput"]},
-            data_layout=data_format)
+        conv2d_op = OpConfig("conv2d",
+                             inputs={
+                                 "Input": ["input_data"],
+                                 "Filter": ["conv2d_weight"],
+                             },
+                             outputs={"Output": ["conv_output"]},
+                             data_format=data_format,
+                             dilations=dilations,
+                             padding_algorithm=padding_algorithm,
+                             groups=groups,
+                             paddings=paddings,
+                             strides=strides,
+                             has_bias=has_bias,
+                             is_test=True)
+        ac_op = OpConfig("affine_channel",
+                         inputs={
+                             "X": ["conv_output"],
+                             "Scale": ["affine_channel_scale"],
+                             "Bias": ["affine_channel_bias"]
+                         },
+                         outputs={"Out": ["affine_channel_ouput"]},
+                         data_layout=data_format)
         if has_bias == True:
             conv2d_op.inputs["Bias"] = ["conv2d_bias"]
         ops = [conv2d_op, ac_op]
@@ -109,7 +108,8 @@ def generate_scale_bias():
             weights={
                 "conv2d_weight":
                 TensorConfig(data_gen=partial(generate_weight)),
-                "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)),
+                "conv2d_bias":
+                TensorConfig(data_gen=partial(generate_bias)),
                 "affine_channel_scale":
                 TensorConfig(data_gen=partial(generate_scale_bias)),
                 "affine_channel_bias":
@@ -126,7 +126,7 @@ def sample_predictor_configs(self, program_config):
         yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4)
 
     def add_ignore_pass_case(self):
-        # If the problem has been fixed, the judgment 
+        # If the problem has been fixed, the judgment
         # in is_program_valid needs to be deleted!!!
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs['data_format'] == "NHWC":
@@ -135,8 +135,8 @@ def teller1(program_config, predictor_config):
 
         # mkldnn Output has diff with bias!
         def teller2(program_config, predictor_config):
-            return predictor_config.mkldnn_enabled() and program_config.ops[
-                0].attrs['has_bias'] == True
+            return predictor_config.mkldnn_enabled(
+            ) and program_config.ops[0].attrs['has_bias'] == True
 
         self.add_ignore_check_case(
             teller1, IgnoreReasons.PASS_ACCURACY_ERROR,
@@ -151,7 +151,8 @@ def teller2(program_config, predictor_config):
     def test(self):
         self.run_and_statis(
             quant=False,
-            passes=["conv_affine_channel_mkldnn_fuse_pass"], )
+            passes=["conv_affine_channel_mkldnn_fuse_pass"],
+        )
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py
index 6c8b9d4d3a879..20c754aee95d9 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py
@@ -25,19 +25,20 @@
 
 #padding SAME
 class ConvBiasMkldnnFusePassSamePadTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
             param_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.Xavier(uniform=False),
                 learning_rate=0.001)
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                padding="SAME",
-                bias_attr=param_attr)
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=3,
+                                           filter_size=3,
+                                           padding="SAME",
+                                           bias_attr=param_attr)
 
         self.feeds = {
             "data": np.random.random((1, 3, 100, 100)).astype("float32")
@@ -54,19 +55,20 @@ def test_check_output(self):
 
 #padding VALID
 class ConvBiasMkldnnFusePassValidPadTest(ConvBiasMkldnnFusePassSamePadTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
             param_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.Xavier(uniform=False),
                 learning_rate=0.001)
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                padding="VALID",
-                bias_attr=param_attr)
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=3,
+                                           filter_size=3,
+                                           padding="VALID",
+                                           bias_attr=param_attr)
 
         self.feeds = {
             "data": np.random.random((1, 3, 100, 100)).astype("float32")
@@ -77,19 +79,20 @@ def setUp(self):
 
 #padding EXPLICT NUMBER
 class ConvBiasMkldnnFusePassExplictPadTest(ConvBiasMkldnnFusePassSamePadTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
             param_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.Xavier(uniform=False),
                 learning_rate=0.001)
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                padding=[2, 4, 6, 8],
-                bias_attr=param_attr)
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=3,
+                                           filter_size=3,
+                                           padding=[2, 4, 6, 8],
+                                           bias_attr=param_attr)
 
         self.feeds = {
             "data": np.random.random((1, 3, 100, 100)).astype("float32")
@@ -99,23 +102,24 @@ def setUp(self):
 
 
 class ConvBiasMkldnnFusePassGroupTest(ConvBiasMkldnnFusePassSamePadTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
             param_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.Xavier(uniform=False),
                 learning_rate=0.001)
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                padding="VALID",
-                groups=3,
-                bias_attr=param_attr,
-                use_cudnn=False,
-                act="softmax",
-                data_format="NCHW")
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=3,
+                                           filter_size=3,
+                                           padding="VALID",
+                                           groups=3,
+                                           bias_attr=param_attr,
+                                           use_cudnn=False,
+                                           act="softmax",
+                                           data_format="NCHW")
 
         self.feeds = {
             "data": np.random.random((1, 3, 100, 100)).astype("float32")
@@ -126,24 +130,25 @@ def setUp(self):
 
 class ConvBiasMkldnnFusePassDialtionsGroupsTest(
         ConvBiasMkldnnFusePassSamePadTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
             param_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.Xavier(uniform=False),
                 learning_rate=0.001)
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                padding="VALID",
-                dilation=2,
-                groups=3,
-                bias_attr=param_attr,
-                use_cudnn=False,
-                act="softmax",
-                data_format="NCHW")
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=3,
+                                           filter_size=3,
+                                           padding="VALID",
+                                           dilation=2,
+                                           groups=3,
+                                           bias_attr=param_attr,
+                                           use_cudnn=False,
+                                           act="softmax",
+                                           data_format="NCHW")
 
         self.feeds = {
             "data": np.random.random((1, 3, 100, 100)).astype("float32")
@@ -153,20 +158,20 @@ def setUp(self):
 
 
 class ConvTransposeMkldnnFusePassDialtionsGroupsTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[-1, 3, 5, 5], dtype="float32")
             param_attr = fluid.ParamAttr(
                 initializer=fluid.initializer.Xavier(uniform=False),
                 learning_rate=0.001)
-            conv_out = fluid.layers.conv2d_transpose(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                padding="SAME",
-                dilation=1,
-                bias_attr=param_attr,
-                use_cudnn=False)
+            conv_out = fluid.layers.conv2d_transpose(input=data,
+                                                     num_filters=3,
+                                                     filter_size=3,
+                                                     padding="SAME",
+                                                     dilation=1,
+                                                     bias_attr=param_attr,
+                                                     use_cudnn=False)
 
         self.feeds = {"data": np.random.random((1, 3, 5, 5)).astype("float32")}
         self.fetch_list = [conv_out]
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
index 6654fbba264e0..2a313bbdaa1ef 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestConvConcatReluMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -41,15 +42,15 @@ def sample_program_config(self, draw):
 
         def generate_input(attrs):
             if attrs[0]['data_format'] == "NCHW":
-                return np.random.random(
-                    [attrs[2]['batch_size'], 48, 64, 64]).astype(np.float32)
+                return np.random.random([attrs[2]['batch_size'], 48, 64,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [attrs[2]['batch_size'], 64, 64, 48]).astype(np.float32)
+                return np.random.random([attrs[2]['batch_size'], 64, 64,
+                                         48]).astype(np.float32)
 
         def generate_weight():
-            return np.random.random(
-                [16, int(48 / groups), 3, 3]).astype(np.float32)
+            return np.random.random([16, int(48 / groups), 3,
+                                     3]).astype(np.float32)
 
         attrs = [{
             "data_format": data_format,
@@ -142,8 +143,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["conv2d", "conv2d", "concat"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["conv_concat_relu_mkldnn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["conv_concat_relu_mkldnn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py
index 58d09a880619c..44b1e8bf0649f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py
@@ -27,10 +27,10 @@
 
 # the two inputs of elementwise_add are tensor
 class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         if attrs[1]['data_format'] == "NHWC" and attrs[3]['axis'] == 0:
             return False
@@ -50,56 +50,58 @@ def sample_program_config(self, draw):
 
         def generate_input():
             if data_format == "NCHW":
-                return np.random.random(
-                    [batch_size, 48, 64, 64]).astype(np.float32)
+                return np.random.random([batch_size, 48, 64,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [batch_size, 64, 64, 48]).astype(np.float32)
+                return np.random.random([batch_size, 64, 64,
+                                         48]).astype(np.float32)
 
         def generate_weight():
-            return np.random.random(
-                [48, int(48 / groups), 3, 3]).astype(np.float32)
-
-        relu_op = OpConfig(
-            type="relu",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["relu_out"]},
-            attrs={})
-
-        conv2d_op1 = OpConfig(
-            type="conv2d",
-            inputs={"Input": ["relu_out"],
-                    "Filter": ["conv_weight1"]},
-            outputs={"Output": ["conv_output1"]},
-            attrs={
-                "data_format": data_format,
-                "dilations": dilations,
-                "padding_algorithm": padding_algorithm,
-                "groups": groups,
-                "paddings": paddings,
-                "strides": strides
-            })
-
-        conv2d_op2 = OpConfig(
-            type="conv2d",
-            inputs={"Input": ["input_data"],
-                    "Filter": ["conv_weight2"]},
-            outputs={"Output": ["conv_output2"]},
-            attrs={
-                "data_format": data_format,
-                "dilations": dilations,
-                "padding_algorithm": padding_algorithm,
-                "groups": groups,
-                "paddings": paddings,
-                "strides": strides
-            })
-
-        elt_op = OpConfig(
-            type="elementwise_add",
-            inputs={"X": ["conv_output1"],
-                    "Y": ["conv_output2"]},
-            outputs={"Out": ["elementwise_output"]},
-            attrs={'axis': axis})
+            return np.random.random([48, int(48 / groups), 3,
+                                     3]).astype(np.float32)
+
+        relu_op = OpConfig(type="relu",
+                           inputs={"X": ["input_data"]},
+                           outputs={"Out": ["relu_out"]},
+                           attrs={})
+
+        conv2d_op1 = OpConfig(type="conv2d",
+                              inputs={
+                                  "Input": ["relu_out"],
+                                  "Filter": ["conv_weight1"]
+                              },
+                              outputs={"Output": ["conv_output1"]},
+                              attrs={
+                                  "data_format": data_format,
+                                  "dilations": dilations,
+                                  "padding_algorithm": padding_algorithm,
+                                  "groups": groups,
+                                  "paddings": paddings,
+                                  "strides": strides
+                              })
+
+        conv2d_op2 = OpConfig(type="conv2d",
+                              inputs={
+                                  "Input": ["input_data"],
+                                  "Filter": ["conv_weight2"]
+                              },
+                              outputs={"Output": ["conv_output2"]},
+                              attrs={
+                                  "data_format": data_format,
+                                  "dilations": dilations,
+                                  "padding_algorithm": padding_algorithm,
+                                  "groups": groups,
+                                  "paddings": paddings,
+                                  "strides": strides
+                              })
+
+        elt_op = OpConfig(type="elementwise_add",
+                          inputs={
+                              "X": ["conv_output1"],
+                              "Y": ["conv_output2"]
+                          },
+                          outputs={"Out": ["elementwise_output"]},
+                          attrs={'axis': axis})
 
         model_net = [relu_op, conv2d_op1, conv2d_op2, elt_op]
 
@@ -121,8 +123,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["relu", "conv2d", "conv2d"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["conv_elementwise_add_mkldnn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py
index 81bb182802ede..65634972117e2 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py
@@ -27,6 +27,7 @@
 
 
 class TestConvGeluMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -42,15 +43,15 @@ def sample_program_config(self, draw):
 
         def generate_input():
             if data_format == "NCHW":
-                return np.random.random(
-                    [batch_size, 48, 64, 64]).astype(np.float32)
+                return np.random.random([batch_size, 48, 64,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [batch_size, 64, 64, 48]).astype(np.float32)
+                return np.random.random([batch_size, 64, 64,
+                                         48]).astype(np.float32)
 
         def generate_weight():
-            return np.random.random(
-                [16, int(48 / groups), 3, 3]).astype(np.float32)
+            return np.random.random([16, int(48 / groups), 3,
+                                     3]).astype(np.float32)
 
         ops_config = [{
             "op_type": "conv2d",
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py
index 2eb071d6eb83b..d62770bf758d2 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestConvHardSigmoidMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -42,15 +43,15 @@ def sample_program_config(self, draw):
 
         def generate_input():
             if data_format == "NCHW":
-                return np.random.random(
-                    [batch_size, 48, 64, 64]).astype(np.float32)
+                return np.random.random([batch_size, 48, 64,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [batch_size, 64, 64, 48]).astype(np.float32)
+                return np.random.random([batch_size, 64, 64,
+                                         48]).astype(np.float32)
 
         def generate_weight():
-            return np.random.random(
-                [16, int(48 / groups), 3, 3]).astype(np.float32)
+            return np.random.random([16, int(48 / groups), 3,
+                                     3]).astype(np.float32)
 
         ops_config = [{
             "op_type": "conv2d",
@@ -102,8 +103,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["conv2d"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["conv_hard_sigmoid_mkldnn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["conv_hard_sigmoid_mkldnn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py
index 990489c32136a..ad54ca3d91e21 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestConvHardSwishMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -43,15 +44,15 @@ def sample_program_config(self, draw):
 
         def generate_input():
             if data_format == "NCHW":
-                return np.random.random(
-                    [batch_size, 48, 64, 64]).astype(np.float32)
+                return np.random.random([batch_size, 48, 64,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [batch_size, 64, 64, 48]).astype(np.float32)
+                return np.random.random([batch_size, 64, 64,
+                                         48]).astype(np.float32)
 
         def generate_weight():
-            return np.random.random(
-                [16, int(48 / groups), 3, 3]).astype(np.float32)
+            return np.random.random([16, int(48 / groups), 3,
+                                     3]).astype(np.float32)
 
         ops_config = [{
             "op_type": "conv2d",
@@ -104,8 +105,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["conv2d"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["conv_hard_swish_mkldnn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["conv_hard_swish_mkldnn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_mish_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_mish_fuse_pass.py
index b4d2c95087c33..365ba5346e392 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_mish_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_mish_fuse_pass.py
@@ -21,6 +21,7 @@
 
 
 class TestConvMishMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [op.attrs for op in program_config.ops]
         # If the problem has been fixed, the judgment
@@ -41,15 +42,15 @@ def sample_program_config(self, draw):
 
         def generate_input():
             if data_format == "NCHW":
-                return np.random.random(
-                    [batch_size, 48, 64, 64]).astype(np.float32)
+                return np.random.random([batch_size, 48, 64,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [batch_size, 64, 64, 48]).astype(np.float32)
+                return np.random.random([batch_size, 64, 64,
+                                         48]).astype(np.float32)
 
         def generate_weight():
-            return np.random.random(
-                [16, int(48 / groups), 3, 3]).astype(np.float32)
+            return np.random.random([16, int(48 / groups), 3,
+                                     3]).astype(np.float32)
 
         ops_config = [{
             "op_type": "conv2d",
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py
index c5cedac226149..a05cbf5ba357f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py
@@ -26,10 +26,10 @@
 
 
 class TestConvTransposeMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['data_format'] == "NCHW" and attrs[1]["axis"] == 3:
@@ -51,11 +51,11 @@ def sample_program_config(self, draw):
 
         def generate_input():
             if data_format == "NCHW":
-                return np.random.random(
-                    [batch_size, 16, 64, 64]).astype(np.float32)
+                return np.random.random([batch_size, 16, 64,
+                                         64]).astype(np.float32)
             else:
-                return np.random.random(
-                    [batch_size, 64, 64, 16]).astype(np.float32)
+                return np.random.random([batch_size, 64, 64,
+                                         16]).astype(np.float32)
 
         def generate_weight1():
             return np.random.random([16, 16, 3, 3]).astype(np.float32)
@@ -63,29 +63,31 @@ def generate_weight1():
         def generate_weight2():
             return np.random.random([16 * groups]).astype(np.float32)
 
-        conv2d_op = OpConfig(
-            type="conv2d_transpose",
-            inputs={"Input": ["input_data"],
-                    "Filter": ["conv2d_weight"]},
-            outputs={"Output": ["conv_output"]},
-            attrs={
-                "data_format": data_format,
-                "dilations": dilations,
-                "padding_algorithm": padding_algorithm,
-                "groups": groups,
-                "paddings": paddings,
-                "strides": strides,
-                "output_size": [],
-                "output_padding": [],
-                "is_test": True
-            })
-
-        elt_op = OpConfig(
-            type="elementwise_add",
-            inputs={"X": ["conv_output"],
-                    "Y": ["elementwise_weight"]},
-            outputs={"Out": ["elementwise_output"]},
-            attrs={'axis': axis})
+        conv2d_op = OpConfig(type="conv2d_transpose",
+                             inputs={
+                                 "Input": ["input_data"],
+                                 "Filter": ["conv2d_weight"]
+                             },
+                             outputs={"Output": ["conv_output"]},
+                             attrs={
+                                 "data_format": data_format,
+                                 "dilations": dilations,
+                                 "padding_algorithm": padding_algorithm,
+                                 "groups": groups,
+                                 "paddings": paddings,
+                                 "strides": strides,
+                                 "output_size": [],
+                                 "output_padding": [],
+                                 "is_test": True
+                             })
+
+        elt_op = OpConfig(type="elementwise_add",
+                          inputs={
+                              "X": ["conv_output"],
+                              "Y": ["elementwise_weight"]
+                          },
+                          outputs={"Out": ["elementwise_output"]},
+                          attrs={'axis': axis})
 
         model_net = [conv2d_op, elt_op]
 
@@ -109,10 +111,9 @@ def sample_predictor_configs(self, program_config):
         yield config, ['conv2d_transpose'], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_duration=300,
-            passes=["conv_transpose_bias_mkldnn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_duration=300,
+                            passes=["conv_transpose_bias_mkldnn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py
index 4b36e4b742c9d..95996f22a86d7 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py
@@ -20,11 +20,13 @@
 
 
 class TestMKLDNNCpuBfloat16Pass(InferencePassTest):
+
     def setUp(self):
         self.init_data()
         with fluid.program_guard(self.main_program, self.startup_program):
-            x = fluid.data(
-                name='x', shape=[-1] + self.shape_x, dtype=self.d_type)
+            x = fluid.data(name='x',
+                           shape=[-1] + self.shape_x,
+                           dtype=self.d_type)
             out = fluid.layers.transpose(x, perm=[0, 1, 2, 3])
             out = fluid.layers.reshape(out, [0, 0, 0, 0])
             out = fluid.layers.fc(out, size=1)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py
index b484a88273b20..312b77acaa4bc 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -48,24 +48,24 @@ def sample_program_config(self, draw):
 
         random_groups = draw(st.integers(min_value=1, max_value=3))
         random_dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=3), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=3),
+                     min_size=2,
+                     max_size=2))
         random_strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_paddings = draw(
-            st.lists(
-                st.integers(
-                    min_value=0, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=0, max_value=4),
+                     min_size=2,
+                     max_size=2))
         random_padding_algorithm = draw(
             st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
         random_data_layout = draw(st.sampled_from(["NCHW", "NHWC"]))
         random_filter = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
 
         def generate_conv2d_Input():
             shape = [random_input_dim1, random_input_dim2]
@@ -84,22 +84,23 @@ def generate_conv2d_Filter():
             return np.random.random(shape).astype(np.float32)
 
         # define op
-        conv2d_op = OpConfig(
-            type="depthwise_conv2d",
-            inputs={
-                "Input": ["conv2d_Input"],
-                "Filter": ["conv2d_Filter"],
-            },
-            outputs={"Output": ["conv2d_Out"], },
-            attrs={
-                'groups': random_groups,
-                'dilations': random_dilations,
-                'strides': random_strides,
-                'paddings': random_paddings,
-                'padding_algorithm': random_padding_algorithm,
-                'data_format': random_data_layout,
-                'use_mkldnn': True,
-            })
+        conv2d_op = OpConfig(type="depthwise_conv2d",
+                             inputs={
+                                 "Input": ["conv2d_Input"],
+                                 "Filter": ["conv2d_Filter"],
+                             },
+                             outputs={
+                                 "Output": ["conv2d_Out"],
+                             },
+                             attrs={
+                                 'groups': random_groups,
+                                 'dilations': random_dilations,
+                                 'strides': random_strides,
+                                 'paddings': random_paddings,
+                                 'padding_algorithm': random_padding_algorithm,
+                                 'data_format': random_data_layout,
+                                 'use_mkldnn': True,
+                             })
 
         # define model_net
         model_net = [conv2d_op]
@@ -124,8 +125,7 @@ def sample_predictor_configs(self, program_config):
 
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['data_format'] == "NHWC":
@@ -134,6 +134,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def add_ignore_pass_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs['data_format'] == "NHWC":
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
index 893bd3833430c..12f4249a4d66d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
@@ -30,10 +30,12 @@ class ElementwiseActivationMkldnnFusePassTest(InferencePassTest):
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data_A = fluid.data(
-                name="data_A", shape=[-1, 3, 100, 100], dtype="float32")
-            data_B = fluid.data(
-                name="data_B", shape=[-1, 3, 100, 100], dtype="float32")
+            data_A = fluid.data(name="data_A",
+                                shape=[-1, 3, 100, 100],
+                                dtype="float32")
+            data_B = fluid.data(name="data_B",
+                                shape=[-1, 3, 100, 100],
+                                dtype="float32")
             elt_out = self.operand(data_A, data_B)
             if self.act is not None:
                 if self.act_beta is not None:
@@ -64,6 +66,7 @@ def test_pass_compatible(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Relu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.relu
@@ -71,6 +74,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Tanh(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.tanh
@@ -78,6 +82,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act_alpha = 0.2
@@ -86,6 +91,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Swish(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act_alpha = 4
@@ -94,6 +100,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.hard_swish
@@ -101,6 +108,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_SQRT(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.sqrt
@@ -108,6 +116,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_ABS(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.abs
@@ -115,6 +124,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Clip(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.clip
@@ -124,6 +134,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Gelu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.gelu
@@ -131,6 +142,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.gelu
@@ -139,6 +151,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Relu6(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.relu6
@@ -147,6 +160,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_add
         self.act = fluid.layers.sigmoid
@@ -154,6 +168,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Relu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.relu
@@ -161,6 +176,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.tanh
@@ -168,6 +184,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act_alpha = 0.2
@@ -176,6 +193,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Swish(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.swish
@@ -183,6 +201,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.hard_swish
@@ -190,6 +209,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_ABS(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.abs
@@ -197,6 +217,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Clip(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.clip
@@ -206,6 +227,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.gelu
@@ -213,6 +235,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.gelu
@@ -221,6 +244,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Relu6(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.relu6
@@ -229,6 +253,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_sub
         self.act = fluid.layers.sigmoid
@@ -236,6 +261,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Relu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.relu
@@ -243,6 +269,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.tanh
@@ -250,6 +277,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act_alpha = 0.2
@@ -258,6 +286,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Swish(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.swish
@@ -265,6 +294,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.hard_swish
@@ -272,6 +302,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.sqrt
@@ -279,6 +310,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_ABS(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.abs
@@ -286,6 +318,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Clip(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.clip
@@ -295,6 +328,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.gelu
@@ -302,6 +336,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.gelu
@@ -310,6 +345,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Relu6(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.relu6
@@ -318,6 +354,7 @@ def set_params(self):
 
 class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid(
         ElementwiseActivationMkldnnFusePassTest):
+
     def set_params(self):
         self.operand = fluid.layers.elementwise_mul
         self.act = fluid.layers.sigmoid
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py
index 0f5279b0edadd..78393ef59b658 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py
@@ -25,6 +25,7 @@
 
 
 class TestElementWiseAddReluFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -32,8 +33,8 @@ def sample_program_config(self, draw):
         batch_size = draw(st.integers(min_value=1, max_value=4))
 
         def generate_input():
-            return np.random.random(
-                [batch_size, 3, 100, 100]).astype(np.float32)
+            return np.random.random([batch_size, 3, 100,
+                                     100]).astype(np.float32)
 
         ops_config = [{
             "op_type": "elementwise_add",
@@ -74,8 +75,9 @@ def sample_predictor_configs(self, program_config):
         yield config, ["elementwise_add"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["elt_act_mkldnn_fuse_pass"], min_success_num=4)
+        self.run_and_statis(quant=False,
+                            passes=["elt_act_mkldnn_fuse_pass"],
+                            min_success_num=4)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_act_fuse_pass.py
index 66bcca51bed1d..1a30c0f2d3d98 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_act_fuse_pass.py
@@ -26,11 +26,13 @@
 
 
 class FCGeluTanhOneDnnFusePassTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 128, 768], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 128, 768],
+                              dtype="float32")
             fc_out = fluid.layers.fc(input=data, size=3072, num_flatten_dims=2)
             gelu_out = fluid.layers.gelu(fc_out, approximate=False)
 
@@ -47,11 +49,13 @@ def test_check_output(self):
 
 
 class FCGeluErfOneDnnFusePassTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 128, 768], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 128, 768],
+                              dtype="float32")
             fc_out = fluid.layers.fc(input=data, size=3072, num_flatten_dims=2)
             gelu_out = fluid.layers.gelu(fc_out, approximate=True)
 
@@ -69,11 +73,13 @@ def test_check_output(self):
 
 
 class FCTanhOneDnnFusePassTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 128, 768], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 128, 768],
+                              dtype="float32")
             fc_out = fluid.layers.fc(input=data, size=3072, num_flatten_dims=2)
             tanh_out = fluid.layers.tanh(fc_out)
 
@@ -91,11 +97,13 @@ def test_check_output(self):
 
 
 class FCSigmoidOneDnnFusePassTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 128, 768], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 128, 768],
+                              dtype="float32")
             fc_out = fluid.layers.fc(input=data, size=3072, num_flatten_dims=2)
             sigmoid_out = fluid.layers.sigmoid(fc_out)
 
@@ -113,11 +121,13 @@ def test_check_output(self):
 
 
 class FCHardSwishOneDnnFusePassTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 128, 768], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 128, 768],
+                              dtype="float32")
             fc_out = fluid.layers.fc(input=data, size=3072, num_flatten_dims=2)
             hardswish_out = fluid.layers.hard_swish(fc_out)
 
@@ -135,11 +145,13 @@ def test_check_output(self):
 
 
 class FCMishOneDnnFusePassTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 128, 768], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 128, 768],
+                              dtype="float32")
             fc_out = fluid.layers.fc(input=data, size=3072, num_flatten_dims=2)
             mish_out = fluid.layers.mish(fc_out)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py
index 22b8960497beb..61492b1d05df6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_elementwise_add_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestFCElementwiseAddMkldnnFusePass(PassAutoScanTest):
+
     def sample_program_config(self, draw):
         axis = draw(st.sampled_from([-1, 0, 1]))
         fc_as_x = draw(st.sampled_from([True, False]))
@@ -41,37 +42,34 @@ def generate_fc_weight():
         def generate_fc_bias():
             return np.random.random([fc_wei]).astype(np.float32)
 
-        relu_op = OpConfig(
-            type="relu",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["relu_out"]},
-            attrs={})
-
-        fc_op = OpConfig(
-            type="fc",
-            inputs={
-                "Input": ["relu_out"],
-                "W": ["fc_weight"],
-                "Bias": ["fc_bias"]
-            },
-            outputs={"Out": ["fc_output"]},
-            attrs={
-                "use_mkldnn": True,
-                "padding_weights": False,
-                "activation_type": "",
-                "in_num_col_dims": 1,
-            })
+        relu_op = OpConfig(type="relu",
+                           inputs={"X": ["input_data"]},
+                           outputs={"Out": ["relu_out"]},
+                           attrs={})
+
+        fc_op = OpConfig(type="fc",
+                         inputs={
+                             "Input": ["relu_out"],
+                             "W": ["fc_weight"],
+                             "Bias": ["fc_bias"]
+                         },
+                         outputs={"Out": ["fc_output"]},
+                         attrs={
+                             "use_mkldnn": True,
+                             "padding_weights": False,
+                             "activation_type": "",
+                             "in_num_col_dims": 1,
+                         })
 
         if fc_as_x:
             inputs = {"X": ["fc_output"], "Y": ["input_data"]}
         else:
             inputs = {"X": ["input_data"], "Y": ["fc_output"]}
 
-        elt_add_op = OpConfig(
-            type="elementwise_add",
-            inputs=inputs,
-            outputs={"Out": ["elementwise_output"]},
-            attrs={'axis': axis})
+        elt_add_op = OpConfig(type="elementwise_add",
+                              inputs=inputs,
+                              outputs={"Out": ["elementwise_output"]},
+                              attrs={'axis': axis})
 
         model_net = [relu_op, fc_op, elt_add_op]
 
@@ -93,8 +91,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["relu", "fc"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["fc_elementwise_add_mkldnn_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["fc_elementwise_add_mkldnn_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_mish_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_mish_fuse_pass.py
index 20a7cddbeb223..dd9321b6a74be 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_mish_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_fc_mish_fuse_pass.py
@@ -20,16 +20,17 @@
 
 
 class TestFCMishMkldnnFusePass(PassAutoScanTest):
+
     def sample_program_config(self, draw):
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=128), min_size=2, max_size=3))
+            st.lists(st.integers(min_value=1, max_value=128),
+                     min_size=2,
+                     max_size=3))
         in_num_col_dims = len(x_shape) - 1
         w_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=128), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=128),
+                     min_size=2,
+                     max_size=2))
         w_shape[0] = int(np.prod(x_shape[in_num_col_dims:]))
         fc_bias_shape = [w_shape[1]]
 
@@ -62,14 +63,17 @@ def sample_program_config(self, draw):
 
         ops = self.generate_op_config(ops_config)
 
-        program_config = ProgramConfig(
-            ops=ops,
-            weights={
-                "fc_w": TensorConfig(shape=w_shape),
-                "fc_bias": TensorConfig(shape=fc_bias_shape),
-            },
-            inputs={"fc_x": TensorConfig(shape=x_shape), },
-            outputs=["mish_output"])
+        program_config = ProgramConfig(ops=ops,
+                                       weights={
+                                           "fc_w":
+                                           TensorConfig(shape=w_shape),
+                                           "fc_bias":
+                                           TensorConfig(shape=fc_bias_shape),
+                                       },
+                                       inputs={
+                                           "fc_x": TensorConfig(shape=x_shape),
+                                       },
+                                       outputs=["mish_output"])
         return program_config
 
     def sample_predictor_configs(self, program_config):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py
index 4215e56de2cc7..3b7f0162c20e0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py
@@ -25,17 +25,22 @@
 
 
 class MkldnnInplacePassTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             paddle.enable_static()
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
-            conv_out_1 = fluid.layers.conv2d(
-                data, num_filters=3, filter_size=3, bias_attr=False)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
+            conv_out_1 = fluid.layers.conv2d(data,
+                                             num_filters=3,
+                                             filter_size=3,
+                                             bias_attr=False)
             softmax_out = fluid.layers.softmax(conv_out_1)
             relu_out = fluid.layers.relu(conv_out_1)
-            eltwise_out = fluid.layers.elementwise_add(
-                softmax_out, relu_out, axis=-1)
+            eltwise_out = fluid.layers.elementwise_add(softmax_out,
+                                                       relu_out,
+                                                       axis=-1)
 
         self.pass_name = 'mkldnn_inplace_pass'
         self.feeds = {
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py
index 31415f6472587..3d2895cc619d4 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py
@@ -20,6 +20,7 @@
 
 
 class TestInt8ScaleCalculationMkldnnPass(PassAutoScanTest):
+
     def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_gpu=False)
         config.pass_builder().append_pass("int8_scale_calculation_mkldnn_pass")
@@ -56,40 +57,40 @@ def is_program_valid(self, prog_config):
 
     def sample_program_config(self, draw):
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=5, max_value=100), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=5, max_value=100),
+                     min_size=4,
+                     max_size=4))
         x_shape[1] = draw(st.integers(min_value=5, max_value=10))
 
         data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
 
         f_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=4,
+                     max_size=4))
         if data_format == "NCHW":
             f_shape[1] = x_shape[1]
         else:
             f_shape[1] = x_shape[3]
 
         strides = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
 
         padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
 
         padding = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=4,
+                     max_size=4))
 
         groups = draw(st.integers(min_value=1, max_value=3))
 
         dilations = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=2))
 
         bias_shape = [f_shape[0]]
         inputs = dict()
@@ -111,20 +112,21 @@ def sample_program_config(self, draw):
                 "Input": ["input_x"],
                 "Filter": ["filter"],
             }
-            weights = {"filter": TensorConfig(shape=f_shape), }
-
-        conv2d_op = OpConfig(
-            "conv2d",
-            inputs=inputs,
-            outputs={"Output": ["conv2d_out"]},
-            strides=strides,
-            padding_algorithm=padding_algorithm,
-            paddings=padding,
-            groups=groups,
-            dilations=dilations,
-            data_format=data_format,
-            use_mkldnn=use_mkldnn,
-            mkldnn_data_type="int8")
+            weights = {
+                "filter": TensorConfig(shape=f_shape),
+            }
+
+        conv2d_op = OpConfig("conv2d",
+                             inputs=inputs,
+                             outputs={"Output": ["conv2d_out"]},
+                             strides=strides,
+                             padding_algorithm=padding_algorithm,
+                             paddings=padding,
+                             groups=groups,
+                             dilations=dilations,
+                             data_format=data_format,
+                             use_mkldnn=use_mkldnn,
+                             mkldnn_data_type="int8")
 
         ops = [conv2d_op]
 
@@ -136,10 +138,9 @@ def sample_program_config(self, draw):
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=100,
-            passes=["int8_scale_calculation_mkldnn_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=100,
+                            passes=["int8_scale_calculation_mkldnn_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py
index 3dc0623a112f5..929863b42a730 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py
@@ -22,25 +22,26 @@
 
 
 class TestMKLDNNLogSoftmaxOp(MkldnnAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input(*args, **kwargs):
             return np.random.random(kwargs['in_shape']).astype(np.float32)
 
-        logsoftmax_op = OpConfig(
-            type="log_softmax",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["output_data"]},
-            attrs={"axis": kwargs['axis']})
+        logsoftmax_op = OpConfig(type="log_softmax",
+                                 inputs={"X": ["input_data"]},
+                                 outputs={"Out": ["output_data"]},
+                                 attrs={"axis": kwargs['axis']})
 
         program_config = ProgramConfig(
             ops=[logsoftmax_op],
             weights={},
             inputs={
-                "input_data": TensorConfig(data_gen=partial(generate_input,
-                                                            *args, **kwargs)),
+                "input_data":
+                TensorConfig(data_gen=partial(generate_input, *args, **kwargs)),
             },
             outputs=["output_data"])
 
@@ -50,11 +51,10 @@ def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
         yield config, (1e-5, 1e-5)
 
-    @given(
-        axis=st.sampled_from([-2, -1, 0, 1]),
-        in_shape=st.lists(
-            st.integers(
-                min_value=2, max_value=5), min_size=3, max_size=5))
+    @given(axis=st.sampled_from([-2, -1, 0, 1]),
+           in_shape=st.lists(st.integers(min_value=2, max_value=5),
+                             min_size=3,
+                             max_size=5))
     def test(self, *args, **kwargs):
         self.run_test(quant=False, *args, **kwargs)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py
index b1ad5804ebc2c..a22207030c8c9 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py
@@ -22,6 +22,7 @@
 
 
 class TestMKLDNNMatmulFuseOp(InferencePassTest):
+
     def init_data(self):
         self.bs = 8
         self.d_type = np.float32
@@ -31,10 +32,12 @@ def init_data(self):
 
     def make_network(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            x = fluid.data(
-                name='x', shape=[-1] + self.shape_x, dtype=self.d_type)
-            y = fluid.data(
-                name='y', shape=[-1] + self.shape_y, dtype=self.d_type)
+            x = fluid.data(name='x',
+                           shape=[-1] + self.shape_x,
+                           dtype=self.d_type)
+            y = fluid.data(name='y',
+                           shape=[-1] + self.shape_y,
+                           dtype=self.d_type)
             out = fluid.layers.matmul(x, y)
             out = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
             out = fluid.layers.reshape(
@@ -60,6 +63,7 @@ def test_check_output(self):
 
 
 class TestMKLDNNMatmulOtherDimsFuseOp(TestMKLDNNMatmulFuseOp):
+
     def init_data(self):
         self.bs = 8
         self.d_type = np.float32
@@ -69,12 +73,15 @@ def init_data(self):
 
 
 class TestMKLDNNMatmulOpNotFusedWrongTransposeAxis(TestMKLDNNMatmulFuseOp):
+
     def make_network(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            x = fluid.data(
-                name='x', shape=[-1] + self.shape_x, dtype=self.d_type)
-            y = fluid.data(
-                name='y', shape=[-1] + self.shape_y, dtype=self.d_type)
+            x = fluid.data(name='x',
+                           shape=[-1] + self.shape_x,
+                           dtype=self.d_type)
+            y = fluid.data(name='y',
+                           shape=[-1] + self.shape_y,
+                           dtype=self.d_type)
             out = fluid.layers.matmul(x, y)
             out = fluid.layers.transpose(out, perm=[0, 1, 2, 3])
             out = fluid.layers.reshape(out, [0, 0, 0, 0])
@@ -83,6 +90,7 @@ def make_network(self):
 
 
 class TestMKLDNNMatmulOpNotFusedBreakPattern(TestMKLDNNMatmulFuseOp):
+
     def init_data(self):
         self.bs = 7
         self.d_type = np.float32
@@ -92,14 +100,16 @@ def init_data(self):
 
     def make_network(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            x = fluid.data(
-                name='x', shape=[-1] + self.shape_x, dtype=self.d_type)
-            y = fluid.data(
-                name='y', shape=[-1] + self.shape_y, dtype=self.d_type)
+            x = fluid.data(name='x',
+                           shape=[-1] + self.shape_x,
+                           dtype=self.d_type)
+            y = fluid.data(name='y',
+                           shape=[-1] + self.shape_y,
+                           dtype=self.d_type)
             out = fluid.layers.matmul(x, y)
             out = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
-            out = fluid.layers.transpose(
-                out, perm=[0, 1, 2, 3])  # breaks pattern
+            out = fluid.layers.transpose(out, perm=[0, 1, 2,
+                                                    3])  # breaks pattern
             out = fluid.layers.reshape(
                 out, [0, 0, self.shape_y[0] * self.shape_y[2]])
             out = fluid.layers.fc(out, size=1)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
index c0d3ff766b8da..a5471eca2c26e 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
@@ -26,12 +26,12 @@
 
 
 class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
-        # If the problem has been fixed, the judgment 
+        # If the problem has been fixed, the judgment
         # needs to be deleted!!!
         if 0 in attrs[2]['shape']:
             return False
@@ -67,40 +67,39 @@ def generate_input(type):
             else:
                 return np.random.random(shape_y).astype(np.float32)
 
-        matmul_op = OpConfig(
-            type="matmul",
-            inputs={"X": ["input_data1"],
-                    "Y": ["input_data2"]},
-            outputs={"Out": ["matmul_output"]},
-            attrs={
-                "transpose_X": transpose_X,
-                "transpose_Y": transpose_Y,
-                "alpha": alpha,
-                "fused_reshape_X": [],
-                "fused_reshape_Y": [],
-                "fused_transpose_X": [],
-                "fused_transpose_Y": [],
-                "fused_reshape_Out": [],
-                "fused_transpose_Out": []
-            })
-
-        transpose2_op = OpConfig(
-            type="transpose2",
-            inputs={"X": ["matmul_output"]},
-            outputs={
-                "Out": ["transpose2_output"],
-                "XShape": ["transpose2_xshape"]
-            },
-            attrs={'axis': axis})
-
-        reshape2_op = OpConfig(
-            type="reshape2",
-            inputs={"X": ["transpose2_output"]},
-            outputs={
-                "Out": ["reshape2_output"],
-                "XShape": ["reshape2_xshape"]
-            },
-            attrs={'shape': shape})
+        matmul_op = OpConfig(type="matmul",
+                             inputs={
+                                 "X": ["input_data1"],
+                                 "Y": ["input_data2"]
+                             },
+                             outputs={"Out": ["matmul_output"]},
+                             attrs={
+                                 "transpose_X": transpose_X,
+                                 "transpose_Y": transpose_Y,
+                                 "alpha": alpha,
+                                 "fused_reshape_X": [],
+                                 "fused_reshape_Y": [],
+                                 "fused_transpose_X": [],
+                                 "fused_transpose_Y": [],
+                                 "fused_reshape_Out": [],
+                                 "fused_transpose_Out": []
+                             })
+
+        transpose2_op = OpConfig(type="transpose2",
+                                 inputs={"X": ["matmul_output"]},
+                                 outputs={
+                                     "Out": ["transpose2_output"],
+                                     "XShape": ["transpose2_xshape"]
+                                 },
+                                 attrs={'axis': axis})
+
+        reshape2_op = OpConfig(type="reshape2",
+                               inputs={"X": ["transpose2_output"]},
+                               outputs={
+                                   "Out": ["reshape2_output"],
+                                   "XShape": ["reshape2_xshape"]
+                               },
+                               attrs={'shape': shape})
 
         model_net = [matmul_op, transpose2_op, reshape2_op]
 
@@ -122,8 +121,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["matmul"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["matmul_transpose_reshape_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["matmul_transpose_reshape_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
index 3c6560b3b2911..28fe916a6ef02 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py
@@ -26,26 +26,24 @@
 
 
 class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         if program_config.inputs["input_data1"].shape[
-                -4] != 1 and program_config.inputs["input_data2"].shape[
-                    -4] != 1:
+                -4] != 1 and program_config.inputs["input_data2"].shape[-4] != 1:
             if program_config.inputs["input_data1"].shape[
                     -4] != program_config.inputs["input_data2"].shape[-4]:
                 return False
 
         if program_config.inputs["input_data1"].shape[
-                -3] != 1 and program_config.inputs["input_data2"].shape[
-                    -3] != 1:
+                -3] != 1 and program_config.inputs["input_data2"].shape[-3] != 1:
             if program_config.inputs["input_data1"].shape[
                     -3] != program_config.inputs["input_data2"].shape[-3]:
                 return False
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
-        # If the problem has been fixed, the judgment 
+        # If the problem has been fixed, the judgment
         # needs to be deleted!!!
         if 0 in attrs[2]['shape']:
             return False
@@ -82,39 +80,38 @@ def generate_input(type):
             else:
                 return np.random.random(shape_y).astype(np.float32)
 
-        matmul_op = OpConfig(
-            type="matmul_v2",
-            inputs={"X": ["input_data1"],
-                    "Y": ["input_data2"]},
-            outputs={"Out": ["matmul_output"]},
-            attrs={
-                "trans_x": transpose_X,
-                "trans_y": transpose_Y,
-                "fused_reshape_X": [],
-                "fused_reshape_Y": [],
-                "fused_transpose_X": [],
-                "fused_transpose_Y": [],
-                "fused_reshape_Out": [],
-                "fused_transpose_Out": []
-            })
-
-        transpose2_op = OpConfig(
-            type="transpose2",
-            inputs={"X": ["matmul_output"]},
-            outputs={
-                "Out": ["transpose2_output"],
-                "XShape": ["transpose2_xshape"]
-            },
-            attrs={'axis': axis})
-
-        reshape2_op = OpConfig(
-            type="reshape2",
-            inputs={"X": ["transpose2_output"]},
-            outputs={
-                "Out": ["reshape2_output"],
-                "XShape": ["reshape2_xshape"]
-            },
-            attrs={'shape': shape})
+        matmul_op = OpConfig(type="matmul_v2",
+                             inputs={
+                                 "X": ["input_data1"],
+                                 "Y": ["input_data2"]
+                             },
+                             outputs={"Out": ["matmul_output"]},
+                             attrs={
+                                 "trans_x": transpose_X,
+                                 "trans_y": transpose_Y,
+                                 "fused_reshape_X": [],
+                                 "fused_reshape_Y": [],
+                                 "fused_transpose_X": [],
+                                 "fused_transpose_Y": [],
+                                 "fused_reshape_Out": [],
+                                 "fused_transpose_Out": []
+                             })
+
+        transpose2_op = OpConfig(type="transpose2",
+                                 inputs={"X": ["matmul_output"]},
+                                 outputs={
+                                     "Out": ["transpose2_output"],
+                                     "XShape": ["transpose2_xshape"]
+                                 },
+                                 attrs={'axis': axis})
+
+        reshape2_op = OpConfig(type="reshape2",
+                               inputs={"X": ["transpose2_output"]},
+                               outputs={
+                                   "Out": ["reshape2_output"],
+                                   "XShape": ["reshape2_xshape"]
+                               },
+                               attrs={'shape': shape})
 
         model_net = [matmul_op, transpose2_op, reshape2_op]
 
@@ -132,7 +129,7 @@ def generate_input(type):
         return program_config
 
     def sample_predictor_configs(self, program_config):
-        # gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op 
+        # gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op
         fused_op = "matmul_v2"
         input1_dim1 = program_config.inputs["input_data1"].shape[0]
         input2_dim1 = program_config.inputs["input_data2"].shape[0]
@@ -145,8 +142,8 @@ def sample_predictor_configs(self, program_config):
         yield config, [fused_op], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["matmul_v2_transpose_reshape_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["matmul_v2_transpose_reshape_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py
index 9fa98045ef303..c144830904a09 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py
@@ -26,6 +26,7 @@
 
 
 class TestMkldnnMatmulv2Op(MkldnnAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         if len(program_config.inputs["input_data2"].shape) == 4:
             if program_config.inputs["input_data1"].shape[
@@ -36,14 +37,14 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
                     return False
 
         if program_config.inputs["input_data1"].shape[
-                -3] != 1 and program_config.inputs["input_data2"].shape[
-                    -3] != 1:
+                -3] != 1 and program_config.inputs["input_data2"].shape[-3] != 1:
             if program_config.inputs["input_data1"].shape[
                     -3] != program_config.inputs["input_data2"].shape[-3]:
                 return False
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input(type, *args, **kwargs):
             transpose_X = kwargs["transpose_X"]
             transpose_Y = kwargs["transpose_Y"]
@@ -83,30 +84,33 @@ def generate_input(type, *args, **kwargs):
             else:
                 return np.random.random(shape_y).astype(np.float32)
 
-        matmul_op = OpConfig(
-            type="matmul_v2",
-            inputs={"X": ["input_data1"],
-                    "Y": ["input_data2"]},
-            outputs={"Out": ["matmul_output"]},
-            attrs={
-                "trans_x": kwargs["transpose_X"],
-                "trans_y": kwargs["transpose_Y"],
-                "fused_reshape_X": [],
-                "fused_reshape_Y": [],
-                "fused_transpose_X": [],
-                "fused_transpose_Y": [],
-                "fused_reshape_Out": [],
-                "fused_transpose_Out": []
-            })
+        matmul_op = OpConfig(type="matmul_v2",
+                             inputs={
+                                 "X": ["input_data1"],
+                                 "Y": ["input_data2"]
+                             },
+                             outputs={"Out": ["matmul_output"]},
+                             attrs={
+                                 "trans_x": kwargs["transpose_X"],
+                                 "trans_y": kwargs["transpose_Y"],
+                                 "fused_reshape_X": [],
+                                 "fused_reshape_Y": [],
+                                 "fused_transpose_X": [],
+                                 "fused_transpose_Y": [],
+                                 "fused_reshape_Out": [],
+                                 "fused_transpose_Out": []
+                             })
 
         program_config = ProgramConfig(
             ops=[matmul_op],
             weights={},
             inputs={
-                "input_data1": TensorConfig(data_gen=partial(
-                    generate_input, "x", *args, **kwargs)),
-                "input_data2": TensorConfig(data_gen=partial(
-                    generate_input, "y", *args, **kwargs))
+                "input_data1":
+                TensorConfig(
+                    data_gen=partial(generate_input, "x", *args, **kwargs)),
+                "input_data2":
+                TensorConfig(
+                    data_gen=partial(generate_input, "y", *args, **kwargs))
             },
             outputs=["matmul_output"])
 
@@ -116,17 +120,14 @@ def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
         yield config, (1e-5, 1e-5)
 
-    @given(
-        transpose_X=st.booleans(),
-        transpose_Y=st.booleans(),
-        y_dim_len=st.sampled_from([3, 4]),
-        batch_size1=st.integers(
-            min_value=1, max_value=4),
-        batch_size2=st.integers(
-            min_value=1, max_value=4),
-        channel1=st.sampled_from([1, 16, 32, 64]),
-        channel2=st.sampled_from([1, 16, 32, 64]),
-        input_dim=st.sampled_from([16, 32, 64]))
+    @given(transpose_X=st.booleans(),
+           transpose_Y=st.booleans(),
+           y_dim_len=st.sampled_from([3, 4]),
+           batch_size1=st.integers(min_value=1, max_value=4),
+           batch_size2=st.integers(min_value=1, max_value=4),
+           channel1=st.sampled_from([1, 16, 32, 64]),
+           channel2=st.sampled_from([1, 16, 32, 64]),
+           input_dim=st.sampled_from([16, 32, 64]))
     def test(self, *args, **kwargs):
         self.run_test(*args, **kwargs)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_mish_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_mish_op.py
index 83744e0a8bd0f..2b2759cc65151 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_mish_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_mish_op.py
@@ -22,32 +22,33 @@
 
 
 class TestMkldnnMishOp(MkldnnAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         # if mode is channel, and in_shape is 1 rank
-        if len(program_config.inputs['input_data'].
-               shape) == 1 and program_config.ops[0].attrs['mode'] == 'channel':
+        if len(program_config.inputs['input_data'].shape
+               ) == 1 and program_config.ops[0].attrs['mode'] == 'channel':
             return False
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input(*args, **kwargs):
             return np.random.random(kwargs['in_shape']).astype(np.float32)
 
-        mish_op = OpConfig(
-            type="mish",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["output_data"]},
-            attrs={
-                "mode": kwargs['mode'],
-                "data_format": kwargs['data_format']
-            })
+        mish_op = OpConfig(type="mish",
+                           inputs={"X": ["input_data"]},
+                           outputs={"Out": ["output_data"]},
+                           attrs={
+                               "mode": kwargs['mode'],
+                               "data_format": kwargs['data_format']
+                           })
 
         program_config = ProgramConfig(
             ops=[mish_op],
             weights={},
             inputs={
-                "input_data": TensorConfig(data_gen=partial(generate_input,
-                                                            *args, **kwargs)),
+                "input_data":
+                TensorConfig(data_gen=partial(generate_input, *args, **kwargs)),
             },
             outputs=["output_data"])
 
@@ -57,12 +58,11 @@ def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
         yield config, (1e-5, 1e-5)
 
-    @given(
-        mode=st.sampled_from(['all', 'channel', 'element']),
-        data_format=st.sampled_from(['NCHW', 'NHWC']),
-        in_shape=st.lists(
-            st.integers(
-                min_value=1, max_value=32), min_size=1, max_size=4))
+    @given(mode=st.sampled_from(['all', 'channel', 'element']),
+           data_format=st.sampled_from(['NCHW', 'NHWC']),
+           in_shape=st.lists(st.integers(min_value=1, max_value=32),
+                             min_size=1,
+                             max_size=4))
     def test(self, *args, **kwargs):
         self.run_test(quant=False, *args, **kwargs)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py
index 3839c22ca25dc..3dde53d84aa06 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py
@@ -26,14 +26,16 @@
 
 
 class TestMkldnnPreluOp(MkldnnAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         # if mode is channel, and in_shape is 1 rank
-        if len(program_config.inputs['input_data'].
-               shape) == 1 and program_config.ops[0].attrs['mode'] == 'channel':
+        if len(program_config.inputs['input_data'].shape
+               ) == 1 and program_config.ops[0].attrs['mode'] == 'channel':
             return False
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input(*args, **kwargs):
             return np.random.random(kwargs['in_shape']).astype(np.float32)
 
@@ -56,15 +58,16 @@ def generate_alpha(*args, **kwargs):
                     return np.zeros((1)).astype(np.float32)
                 return np.random.random(kwargs['in_shape']).astype(np.float32)
 
-        prelu_op = OpConfig(
-            type="prelu",
-            inputs={"X": ["input_data"],
-                    "Alpha": ["alpha_weight"]},
-            outputs={"Out": ["output_data"]},
-            attrs={
-                "mode": kwargs['mode'],
-                "data_format": kwargs['data_format']
-            })
+        prelu_op = OpConfig(type="prelu",
+                            inputs={
+                                "X": ["input_data"],
+                                "Alpha": ["alpha_weight"]
+                            },
+                            outputs={"Out": ["output_data"]},
+                            attrs={
+                                "mode": kwargs['mode'],
+                                "data_format": kwargs['data_format']
+                            })
 
         program_config = ProgramConfig(
             ops=[prelu_op],
@@ -87,12 +90,11 @@ def sample_predictor_configs(self, program_config):
     def add_skip_pass_case(self):
         pass
 
-    @given(
-        mode=st.sampled_from(['all', 'channel', 'element']),
-        data_format=st.sampled_from(['NCHW', 'NHWC']),
-        in_shape=st.lists(
-            st.integers(
-                min_value=1, max_value=32), min_size=1, max_size=4))
+    @given(mode=st.sampled_from(['all', 'channel', 'element']),
+           data_format=st.sampled_from(['NCHW', 'NHWC']),
+           in_shape=st.lists(st.integers(min_value=1, max_value=32),
+                             min_size=1,
+                             max_size=4))
     def test(self, *args, **kwargs):
         self.add_skip_pass_case()
         self.run_test(quant=False, *args, **kwargs)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
index 952cd27bbaeab..ce1ea51cb848d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py
@@ -29,6 +29,7 @@
 
 
 class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -38,8 +39,8 @@ def sample_program_config(self, draw):
         alpha = draw(st.floats(min_value=0.01, max_value=2))
         axis = draw(st.sampled_from([[0, 2, 1, 3]]))
         shape = draw(
-            st.sampled_from([[0, 64, -1, 32], [0, 32, -1, 64], [-1, 32, 1, 64]
-                             ]))
+            st.sampled_from([[0, 64, -1, 32], [0, 32, -1, 64], [-1, 32, 1,
+                                                                64]]))
         batch_size = draw(st.integers(min_value=1, max_value=4))
         channel = draw(st.integers(min_value=1, max_value=64))
         input_dim = draw(st.sampled_from([32, 64]))
@@ -63,7 +64,7 @@ def generate_input2(attrs):
                     if matmul_shape[i] == -1:
                         matmul_shape[i] = int(abs(input_volume / shape_volume))
 
-            # Only for transpose axis [0, 2, 1, 3]     
+            # Only for transpose axis [0, 2, 1, 3]
             matmul_shape[1], matmul_shape[2] = matmul_shape[2], matmul_shape[1]
 
             if attrs[2]['transpose_X'] and attrs[2]['transpose_Y']:
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py
index caf33156fc1bf..fb8dc034bd56b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py
@@ -25,23 +25,24 @@
 
 
 class TestReshapeTransposeMatmulV2OneDNNFusePass(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         self.tranpose_perm = [0, 2, 1, 3]
         self.pass_name = 'reshape_transpose_matmul_v2_mkldnn_fuse_pass'
 
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=self.data_shape, dtype="float32")
-            weight = fluid.layers.create_parameter(
-                shape=self.weight_shape, dtype="float32")
+            data = fluid.data(name="data",
+                              shape=self.data_shape,
+                              dtype="float32")
+            weight = fluid.layers.create_parameter(shape=self.weight_shape,
+                                                   dtype="float32")
             reshape = fluid.layers.reshape(data, shape=self.reshape_shape)
             transpose = fluid.layers.transpose(reshape, self.tranpose_perm)
-            matmul = paddle.matmul(
-                transpose,
-                weight,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y)
+            matmul = paddle.matmul(transpose,
+                                   weight,
+                                   transpose_x=self.transpose_x,
+                                   transpose_y=self.transpose_y)
 
         self.fetch_list = [matmul]
         self.enable_mkldnn = True
@@ -64,6 +65,7 @@ def test_pass_compatible(self):
 
 class TestReshapeTransposeMatmulV2OneDNNFusePassBroadcast(
         TestReshapeTransposeMatmulV2OneDNNFusePass):
+
     def set_params(self):
         self.data_shape = [2, 64, 16]
         self.weight_shape = [1, 2, 8, 64]
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
index 86acbe615b3a9..f29e20f6b8d06 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestScaleMatmulMkldnnFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py
index 8f5d7823cdf0f..92111062b1298 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py
@@ -22,25 +22,26 @@
 
 
 class TestMkldnnShapeOp(MkldnnAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input(*args, **kwargs):
-            return np.random.random(kwargs['in_shape']).astype(kwargs[
-                'in_dtype'])
+            return np.random.random(kwargs['in_shape']).astype(
+                kwargs['in_dtype'])
 
-        shape_op = OpConfig(
-            type="shape",
-            inputs={"Input": ["input_data"]},
-            outputs={"Out": ["output_data"]})
+        shape_op = OpConfig(type="shape",
+                            inputs={"Input": ["input_data"]},
+                            outputs={"Out": ["output_data"]})
 
         program_config = ProgramConfig(
             ops=[shape_op],
             weights={},
             inputs={
-                "input_data": TensorConfig(data_gen=partial(generate_input,
-                                                            *args, **kwargs)),
+                "input_data":
+                TensorConfig(data_gen=partial(generate_input, *args, **kwargs)),
             },
             outputs=["output_data"])
 
@@ -50,11 +51,10 @@ def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
         yield config, (1e-5, 1e-5)
 
-    @given(
-        in_shape=st.lists(
-            st.integers(
-                min_value=1, max_value=3), min_size=1, max_size=6),
-        in_dtype=st.sampled_from([np.float32, np.uint16, np.int8, np.uint8]))
+    @given(in_shape=st.lists(st.integers(min_value=1, max_value=3),
+                             min_size=1,
+                             max_size=6),
+           in_dtype=st.sampled_from([np.float32, np.uint16, np.int8, np.uint8]))
     def test(self, *args, **kwargs):
         self.run_test(quant=False, *args, **kwargs)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py
index 828e92dc03426..74c3c34212fc0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py
@@ -32,6 +32,7 @@ def product(input):
 
 
 class TestShuffleChannelMKLDNNDetectPass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         input_shape = program_config.inputs['input_data'].shape
         first_reshape2_shape = program_config.ops[0].attrs['shape']
@@ -130,8 +131,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["shuffle_channel"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["shuffle_channel_mkldnn_detect_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["shuffle_channel_mkldnn_detect_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py
index 26655970290cd..d9050b58ee380 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py
@@ -22,25 +22,26 @@
 
 
 class TestMKLDNNShuffleChannelOp(MkldnnAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input(*args, **kwargs):
             return np.random.random(kwargs['in_shape']).astype(np.float32)
 
-        shuffle_channel_op = OpConfig(
-            type="shuffle_channel",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["output_data"]},
-            attrs={"group": kwargs['group']})
+        shuffle_channel_op = OpConfig(type="shuffle_channel",
+                                      inputs={"X": ["input_data"]},
+                                      outputs={"Out": ["output_data"]},
+                                      attrs={"group": kwargs['group']})
 
         program_config = ProgramConfig(
             ops=[shuffle_channel_op],
             weights={},
             inputs={
-                "input_data": TensorConfig(data_gen=partial(generate_input,
-                                                            *args, **kwargs)),
+                "input_data":
+                TensorConfig(data_gen=partial(generate_input, *args, **kwargs)),
             },
             outputs=["output_data"])
 
@@ -50,9 +51,8 @@ def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_mkldnn=True)
         yield config, (1e-5, 1e-5)
 
-    @given(
-        group=st.sampled_from([1, 2, 8, 32, 128]),
-        in_shape=st.sampled_from([[5, 512, 2, 3], [2, 256, 5, 4]]))
+    @given(group=st.sampled_from([1, 2, 8, 32, 128]),
+           in_shape=st.sampled_from([[5, 512, 2, 3], [2, 256, 5, 4]]))
     def test(self, *args, **kwargs):
         self.run_test(quant=False, *args, **kwargs)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py
index 83c095baeff4e..0c25a790138cd 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py
@@ -30,8 +30,9 @@ class SoftplusActivationReluOneDNNFusePassTest(InferencePassTest):
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 100, 100],
+                              dtype="float32")
             softplus_out = fluid.layers.softplus(data)
             if self.fuse_activation_beta is not None:
                 activation_out = self.fuse_activation(
@@ -62,12 +63,14 @@ def test_pass_compatible(self):
 
 class SoftplusActivationTanhOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.tanh
 
 
 class SoftplusActivationLeakyReluOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.leaky_relu
         self.fuse_activation_alpha = 0.3
@@ -75,6 +78,7 @@ def set_params(self):
 
 class SoftplusActivationSwishOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.swish
         self.fuse_activation_alpha = 3
@@ -82,24 +86,28 @@ def set_params(self):
 
 class SoftplusActivationHardSwishOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.hard_swish
 
 
 class SoftplusActivationSqrtOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.hard_swish
 
 
 class SoftplusActivationAbsOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.abs
 
 
 class SoftplusActivationClipOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.clip
         self.fuse_activation_alpha = 1.1
@@ -108,12 +116,14 @@ def set_params(self):
 
 class SoftplusActivationGeluErfOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.gelu
 
 
 class SoftplusActivationGeluTanhOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.gelu
         self.fuse_activation_alpha = True  # simulated "Approximate" attr
@@ -121,12 +131,14 @@ def set_params(self):
 
 class SoftplusActivationRelu6OneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.relu6
 
 
 class SoftplusActivationSigmoidOneDNNFusePassTest(
         SoftplusActivationReluOneDNNFusePassTest):
+
     def set_params(self):
         self.fuse_activation = fluid.layers.sigmoid
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py
index b5a5377043571..b1cf07776efdd 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py
@@ -27,6 +27,7 @@
 
 
 class TestMulGruFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -47,66 +48,66 @@ def generate_input():
         def generate_weight(shape):
             return np.full(shape, 0.0001).astype(np.float32)
 
-        im2sequence_op = OpConfig(
-            type="im2sequence",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["seq_out"]},
-            attrs={
-                "kernels": [6, 1],
-                "out_stride": [1, 1],
-                "paddings": [0, 0, 0, 0],
-                "strides": [1, 1]
-            })
-
-        mul_op = OpConfig(
-            type="mul",
-            inputs={"X": ["seq_out"],
-                    "Y": ["mul_weight"]},
-            outputs={"Out": ["mul_out"]},
-            attrs={"x_num_col_dims": x_col,
-                   "y_num_col_dims": y_col})
+        im2sequence_op = OpConfig(type="im2sequence",
+                                  inputs={"X": ["input_data"]},
+                                  outputs={"Out": ["seq_out"]},
+                                  attrs={
+                                      "kernels": [6, 1],
+                                      "out_stride": [1, 1],
+                                      "paddings": [0, 0, 0, 0],
+                                      "strides": [1, 1]
+                                  })
+
+        mul_op = OpConfig(type="mul",
+                          inputs={
+                              "X": ["seq_out"],
+                              "Y": ["mul_weight"]
+                          },
+                          outputs={"Out": ["mul_out"]},
+                          attrs={
+                              "x_num_col_dims": x_col,
+                              "y_num_col_dims": y_col
+                          })
 
         if has_origin_mode:
-            gru_op = OpConfig(
-                type="gru",
-                inputs={
-                    "Input": ["mul_out"],
-                    "Weight": ["gru_weight"],
-                    "Bias": ["gru_bias"]
-                },
-                outputs={
-                    "BatchGate": ["batch_gate"],
-                    "BatchHidden": ["batch_hidden"],
-                    "BatchResetHiddenPrev": ["batch_reset"],
-                    "Hidden": ["hidden"]
-                },
-                attrs={
-                    'activation': activation,
-                    'is_reverse': is_reverse,
-                    'gate_activation': gate_activation,
-                    'is_test': True,
-                    'origin_mode': origin_mode
-                })
+            gru_op = OpConfig(type="gru",
+                              inputs={
+                                  "Input": ["mul_out"],
+                                  "Weight": ["gru_weight"],
+                                  "Bias": ["gru_bias"]
+                              },
+                              outputs={
+                                  "BatchGate": ["batch_gate"],
+                                  "BatchHidden": ["batch_hidden"],
+                                  "BatchResetHiddenPrev": ["batch_reset"],
+                                  "Hidden": ["hidden"]
+                              },
+                              attrs={
+                                  'activation': activation,
+                                  'is_reverse': is_reverse,
+                                  'gate_activation': gate_activation,
+                                  'is_test': True,
+                                  'origin_mode': origin_mode
+                              })
         else:
-            gru_op = OpConfig(
-                type="gru",
-                inputs={
-                    "Input": ["mul_out"],
-                    "Weight": ["gru_weight"],
-                    "Bias": ["gru_bias"]
-                },
-                outputs={
-                    "BatchGate": ["batch_gate"],
-                    "BatchHidden": ["batch_hidden"],
-                    "BatchResetHiddenPrev": ["batch_reset"],
-                    "Hidden": ["hidden"]
-                },
-                attrs={
-                    'activation': activation,
-                    'is_reverse': is_reverse,
-                    'gate_activation': gate_activation,
-                    'is_test': True
-                })
+            gru_op = OpConfig(type="gru",
+                              inputs={
+                                  "Input": ["mul_out"],
+                                  "Weight": ["gru_weight"],
+                                  "Bias": ["gru_bias"]
+                              },
+                              outputs={
+                                  "BatchGate": ["batch_gate"],
+                                  "BatchHidden": ["batch_hidden"],
+                                  "BatchResetHiddenPrev": ["batch_reset"],
+                                  "Hidden": ["hidden"]
+                              },
+                              attrs={
+                                  'activation': activation,
+                                  'is_reverse': is_reverse,
+                                  'gate_activation': gate_activation,
+                                  'is_test': True
+                              })
 
         model_net = [im2sequence_op, mul_op, gru_op]
 
@@ -132,8 +133,9 @@ def sample_predictor_configs(self, program_config):
         yield config, ["im2sequence", "fusion_gru"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, max_duration=300, passes=["mul_gru_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_duration=300,
+                            passes=["mul_gru_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_lstm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_lstm_fuse_pass.py
index c944abb60c86a..959c75d53db41 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_lstm_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_lstm_fuse_pass.py
@@ -27,6 +27,7 @@
 
 
 class TestMulLstmFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -48,46 +49,47 @@ def generate_input():
         def generate_weight(shape):
             return np.full(shape, 0.0001).astype(np.float32)
 
-        im2sequence_op = OpConfig(
-            type="im2sequence",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["seq_out"]},
-            attrs={
-                "kernels": [6, 1],
-                "out_stride": [1, 1],
-                "paddings": [0, 0, 0, 0],
-                "strides": [1, 1]
-            })
-
-        mul_op = OpConfig(
-            type="mul",
-            inputs={"X": ["seq_out"],
-                    "Y": ["mul_weight"]},
-            outputs={"Out": ["mul_out"]},
-            attrs={"x_num_col_dims": x_col,
-                   "y_num_col_dims": y_col})
-
-        lstm_op = OpConfig(
-            type="lstm",
-            inputs={
-                "Input": ["mul_out"],
-                "Weight": ["lstm_weight"],
-                "Bias": ["lstm_bias"]
-            },
-            outputs={
-                "Hidden": ["lstm_hidden"],
-                "Cell": ["lstm_cell"],
-                "BatchGate": ["lstm_gate"],
-                "BatchCellPreAct": ["lstm_batch_cell"]
-            },
-            attrs={
-                'use_peepholes': use_peepholes,
-                'is_reverse': is_reverse,
-                'gate_activation': gate_activation,
-                'cell_activation': cell_activation,
-                'candidate_activation': candidate_activation,
-                'is_test': True
-            })
+        im2sequence_op = OpConfig(type="im2sequence",
+                                  inputs={"X": ["input_data"]},
+                                  outputs={"Out": ["seq_out"]},
+                                  attrs={
+                                      "kernels": [6, 1],
+                                      "out_stride": [1, 1],
+                                      "paddings": [0, 0, 0, 0],
+                                      "strides": [1, 1]
+                                  })
+
+        mul_op = OpConfig(type="mul",
+                          inputs={
+                              "X": ["seq_out"],
+                              "Y": ["mul_weight"]
+                          },
+                          outputs={"Out": ["mul_out"]},
+                          attrs={
+                              "x_num_col_dims": x_col,
+                              "y_num_col_dims": y_col
+                          })
+
+        lstm_op = OpConfig(type="lstm",
+                           inputs={
+                               "Input": ["mul_out"],
+                               "Weight": ["lstm_weight"],
+                               "Bias": ["lstm_bias"]
+                           },
+                           outputs={
+                               "Hidden": ["lstm_hidden"],
+                               "Cell": ["lstm_cell"],
+                               "BatchGate": ["lstm_gate"],
+                               "BatchCellPreAct": ["lstm_batch_cell"]
+                           },
+                           attrs={
+                               'use_peepholes': use_peepholes,
+                               'is_reverse': is_reverse,
+                               'gate_activation': gate_activation,
+                               'cell_activation': cell_activation,
+                               'candidate_activation': candidate_activation,
+                               'is_test': True
+                           })
 
         model_net = [im2sequence_op, mul_op, lstm_op]
 
@@ -118,8 +120,9 @@ def sample_predictor_configs(self, program_config):
         yield config, ["im2sequence", "fusion_lstm"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, max_duration=300, passes=["mul_lstm_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_duration=300,
+                            passes=["mul_lstm_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
index 1814b53401ed5..f112ccca87853 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
@@ -27,6 +27,7 @@
 
 
 class TestRepeatedFcReluFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -53,55 +54,61 @@ def generate_weight(shape):
             'dim': dim
         }]
 
-        mul_op1 = OpConfig(
-            type="mul",
-            inputs={"X": ["input_data"],
-                    "Y": ["mul1_weight"]},
-            outputs={"Out": ["mul1_output"]},
-            attrs={"x_num_col_dims": x_col,
-                   "y_num_col_dims": y_col})
-
-        elt_op1 = OpConfig(
-            type="elementwise_add",
-            inputs={"X": ["mul1_output"],
-                    "Y": ["elementwise1_weight"]},
-            outputs={"Out": ["elementwise1_output"]},
-            attrs={"axis": axis})
-
-        relu_op1 = OpConfig(
-            type="relu",
-            inputs={"X": ["elementwise1_output"]},
-            outputs={"Out": ["relu1_output"]},
-            attrs={})
-
-        mul_op2 = OpConfig(
-            type="mul",
-            inputs={"X": ["relu1_output"],
-                    "Y": ["mul2_weight"]},
-            outputs={"Out": ["mul2_output"]},
-            attrs={"x_num_col_dims": x_col,
-                   "y_num_col_dims": y_col})
-
-        elt_op2 = OpConfig(
-            type="elementwise_add",
-            inputs={"X": ["mul2_output"],
-                    "Y": ["elementwise2_weight"]},
-            outputs={"Out": ["elementwise2_output"]},
-            attrs={"axis": axis})
-
-        relu_op2 = OpConfig(
-            type="relu",
-            inputs={"X": ["elementwise2_output"]},
-            outputs={"Out": ["relu2_output"]},
-            attrs={})
+        mul_op1 = OpConfig(type="mul",
+                           inputs={
+                               "X": ["input_data"],
+                               "Y": ["mul1_weight"]
+                           },
+                           outputs={"Out": ["mul1_output"]},
+                           attrs={
+                               "x_num_col_dims": x_col,
+                               "y_num_col_dims": y_col
+                           })
+
+        elt_op1 = OpConfig(type="elementwise_add",
+                           inputs={
+                               "X": ["mul1_output"],
+                               "Y": ["elementwise1_weight"]
+                           },
+                           outputs={"Out": ["elementwise1_output"]},
+                           attrs={"axis": axis})
+
+        relu_op1 = OpConfig(type="relu",
+                            inputs={"X": ["elementwise1_output"]},
+                            outputs={"Out": ["relu1_output"]},
+                            attrs={})
+
+        mul_op2 = OpConfig(type="mul",
+                           inputs={
+                               "X": ["relu1_output"],
+                               "Y": ["mul2_weight"]
+                           },
+                           outputs={"Out": ["mul2_output"]},
+                           attrs={
+                               "x_num_col_dims": x_col,
+                               "y_num_col_dims": y_col
+                           })
+
+        elt_op2 = OpConfig(type="elementwise_add",
+                           inputs={
+                               "X": ["mul2_output"],
+                               "Y": ["elementwise2_weight"]
+                           },
+                           outputs={"Out": ["elementwise2_output"]},
+                           attrs={"axis": axis})
+
+        relu_op2 = OpConfig(type="relu",
+                            inputs={"X": ["elementwise2_output"]},
+                            outputs={"Out": ["relu2_output"]},
+                            attrs={})
 
         model_net = [mul_op1, elt_op1, relu_op1, mul_op2, elt_op2, relu_op2]
 
         program_config = ProgramConfig(
             ops=model_net,
             weights={
-                "mul1_weight": TensorConfig(data_gen=partial(generate_weight,
-                                                             [dim, 32])),
+                "mul1_weight":
+                TensorConfig(data_gen=partial(generate_weight, [dim, 32])),
                 "mul2_weight":
                 TensorConfig(data_gen=partial(generate_weight, [32, 128])),
                 "elementwise1_weight":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py
index 9bec34df5b6e1..79652c53e12e0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py
@@ -50,9 +50,9 @@ def sample_predictor_configs(self, program_config):
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of reshape2
         reshape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=10), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=10),
+                     min_size=2,
+                     max_size=2))
         x_shape = reshape + [1, 1]
 
         # 2. Generate attr:transpose_X/transpose_Y/alpha of matmul
@@ -62,9 +62,9 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of matmul
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = x_shape[1]
 
         # 4. Generate legal attr:axis of elementwise_add
@@ -72,13 +72,19 @@ def sample_program_config(self, draw):
         if axis == 0 or axis == -1:
             if draw(st.booleans()):
                 if axis == 0:
-                    bias_shape = [x_shape[0], ]
+                    bias_shape = [
+                        x_shape[0],
+                    ]
                 else:
-                    bias_shape = [y_shape[1], ]
+                    bias_shape = [
+                        y_shape[1],
+                    ]
             else:
                 bias_shape = [x_shape[0], y_shape[1]]
         elif axis == 1:
-            bias_shape = [y_shape[1], ]
+            bias_shape = [
+                y_shape[1],
+            ]
 
         if draw(st.integers(min_value=1, max_value=10)) <= 1:
             bias_shape[-1] = 1
@@ -87,14 +93,21 @@ def sample_program_config(self, draw):
 
         reshape2_op = OpConfig(
             "reshape2",
-            inputs={"X": ["reshape2_x"], },
+            inputs={
+                "X": ["reshape2_x"],
+            },
             shape=reshape,
-            outputs={"Out": ["reshape2_out"],
-                     "XShape": ["xshape"]}, )
+            outputs={
+                "Out": ["reshape2_out"],
+                "XShape": ["xshape"]
+            },
+        )
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["reshape2_out"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["reshape2_out"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             transpose_X=transpose_X,
@@ -104,14 +117,18 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
+            fused_transpose_Out=[],
+        )
 
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["matmul_out"],
-                    "Y": ["bias"]},
+            inputs={
+                "X": ["matmul_out"],
+                "Y": ["bias"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=axis, )
+            axis=axis,
+        )
 
         ops = [reshape2_op, matmul_op, add_op]
 
@@ -122,8 +139,11 @@ def sample_program_config(self, draw):
                     "matmul_y": TensorConfig(shape=y_shape),
                     "bias": TensorConfig(shape=bias_shape),
                 },
-                inputs={"reshape2_x": TensorConfig(shape=x_shape), },
-                outputs=ops[-1].outputs["Out"], )
+                inputs={
+                    "reshape2_x": TensorConfig(shape=x_shape),
+                },
+                outputs=ops[-1].outputs["Out"],
+            )
         else:
             program_config = ProgramConfig(
                 ops=ops,
@@ -133,15 +153,15 @@ def sample_program_config(self, draw):
                     "matmul_y": TensorConfig(shape=y_shape),
                     "bias": TensorConfig(shape=bias_shape),
                 },
-                outputs=ops[-1].outputs["Out"], )
+                outputs=ops[-1].outputs["Out"],
+            )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=50,
-            max_duration=1000,
-            passes=["gpu_cpu_reshape2_matmul_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=50,
+                            max_duration=1000,
+                            passes=["gpu_cpu_reshape2_matmul_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
index c8e939d3926eb..de0aed5e204d2 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
@@ -27,6 +27,7 @@
 
 
 class TestSeqConcatFcFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -48,19 +49,21 @@ def generate_input(shape):
         def generate_weight(shape):
             return np.random.random(shape).astype(np.float32)
 
-        sequence_expand_op1 = OpConfig(
-            type="sequence_expand",
-            inputs={"X": ["input_data1"],
-                    "Y": ["input_data2"]},
-            outputs={"Out": ["seq_exp1_out"]},
-            attrs={"ref_level": ref_level})
-
-        sequence_expand_op2 = OpConfig(
-            type="sequence_expand",
-            inputs={"X": ["input_data1"],
-                    "Y": ["input_data3"]},
-            outputs={"Out": ["seq_exp2_out"]},
-            attrs={"ref_level": ref_level})
+        sequence_expand_op1 = OpConfig(type="sequence_expand",
+                                       inputs={
+                                           "X": ["input_data1"],
+                                           "Y": ["input_data2"]
+                                       },
+                                       outputs={"Out": ["seq_exp1_out"]},
+                                       attrs={"ref_level": ref_level})
+
+        sequence_expand_op2 = OpConfig(type="sequence_expand",
+                                       inputs={
+                                           "X": ["input_data1"],
+                                           "Y": ["input_data3"]
+                                       },
+                                       outputs={"Out": ["seq_exp2_out"]},
+                                       attrs={"ref_level": ref_level})
 
         concat_op = OpConfig(
             type="concat",
@@ -68,27 +71,32 @@ def generate_weight(shape):
             outputs={"Out": ["concat_output"]},
             attrs={'axis': axis1})
 
-        mul_op = OpConfig(
-            type="mul",
-            inputs={"X": ["concat_output"],
-                    "Y": ["mul_weight"]},
-            outputs={"Out": ["mul_out"]},
-            attrs={"x_num_col_dims": x_col,
-                   "y_num_col_dims": y_col})
-
-        elt_op = OpConfig(
-            type="elementwise_add",
-            inputs={"X": ["mul_out"],
-                    "Y": ["elt_weight"]},
-            outputs={"Out": ["elt_out"]},
-            attrs={"axis": axis2})
-
-        act_op = OpConfig(
-            type=act_type,
-            inputs={"X": ["elt_out"]},
-            outputs={"Out": ["act_out"]},
-            attrs={"use_cudnn": use_cudnn,
-                   "use_mkldnn": use_mkldnn})
+        mul_op = OpConfig(type="mul",
+                          inputs={
+                              "X": ["concat_output"],
+                              "Y": ["mul_weight"]
+                          },
+                          outputs={"Out": ["mul_out"]},
+                          attrs={
+                              "x_num_col_dims": x_col,
+                              "y_num_col_dims": y_col
+                          })
+
+        elt_op = OpConfig(type="elementwise_add",
+                          inputs={
+                              "X": ["mul_out"],
+                              "Y": ["elt_weight"]
+                          },
+                          outputs={"Out": ["elt_out"]},
+                          attrs={"axis": axis2})
+
+        act_op = OpConfig(type=act_type,
+                          inputs={"X": ["elt_out"]},
+                          outputs={"Out": ["act_out"]},
+                          attrs={
+                              "use_cudnn": use_cudnn,
+                              "use_mkldnn": use_mkldnn
+                          })
 
         model_net = [
             sequence_expand_op1, sequence_expand_op2, concat_op, mul_op, elt_op,
@@ -104,15 +112,18 @@ def generate_weight(shape):
                 TensorConfig(data_gen=partial(generate_weight, [dim]))
             },
             inputs={
-                "input_data1": TensorConfig(
-                    data_gen=partial(generate_input, [batch_size, 128]),
-                    lod=[[0, 1]]),
-                "input_data2": TensorConfig(
-                    data_gen=partial(generate_input, [batch_size, 128]),
-                    lod=[[0, 1]]),
-                "input_data3": TensorConfig(
-                    data_gen=partial(generate_input, [batch_size, 128]),
-                    lod=[[0, 1]])
+                "input_data1":
+                TensorConfig(data_gen=partial(generate_input,
+                                              [batch_size, 128]),
+                             lod=[[0, 1]]),
+                "input_data2":
+                TensorConfig(data_gen=partial(generate_input,
+                                              [batch_size, 128]),
+                             lod=[[0, 1]]),
+                "input_data3":
+                TensorConfig(data_gen=partial(generate_input,
+                                              [batch_size, 128]),
+                             lod=[[0, 1]])
             },
             outputs=["act_out"])
 
@@ -123,6 +134,7 @@ def sample_predictor_configs(self, program_config):
         yield config, ["fusion_seqexpand_concat_fc"], (1e-5, 1e-5)
 
     def add_ignore_pass_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[-1].type == "relu":
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py
index 769720fb2588c..4140eb32bb85e 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py
@@ -27,6 +27,7 @@
 
 
 class TestSeqconvEltaddReluFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -45,41 +46,41 @@ def generate_input():
         def generate_weight(shape):
             return np.random.random(shape).astype(np.float32)
 
-        im2sequence_op = OpConfig(
-            type="im2sequence",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["seq_out"]},
-            attrs={
-                "kernels": [6, 1],
-                "out_stride": [1, 1],
-                "paddings": [0, 0, 0, 0],
-                "strides": [1, 1]
-            })
-
-        sequence_conv_op = OpConfig(
-            type="sequence_conv",
-            inputs={"X": ["seq_out"],
-                    "Filter": ["conv_weight"]},
-            outputs={"Out": ["conv_out"]},
-            attrs={
-                "contextLength": contextLength,
-                "contextStart": contextStart,
-                "contextStride": contextStride,
-                "paddingTrainable": paddingTrainable
-            })
-
-        elementwise_add_op = OpConfig(
-            type="elementwise_add",
-            inputs={"X": ["conv_out"],
-                    "Y": ["elt_weight"]},
-            outputs={"Out": ["elt_output"]},
-            attrs={'axis': axis})
-
-        relu_op = OpConfig(
-            type="relu",
-            inputs={"X": ["elt_output"]},
-            outputs={"Out": ["relu_output"]},
-            attrs={})
+        im2sequence_op = OpConfig(type="im2sequence",
+                                  inputs={"X": ["input_data"]},
+                                  outputs={"Out": ["seq_out"]},
+                                  attrs={
+                                      "kernels": [6, 1],
+                                      "out_stride": [1, 1],
+                                      "paddings": [0, 0, 0, 0],
+                                      "strides": [1, 1]
+                                  })
+
+        sequence_conv_op = OpConfig(type="sequence_conv",
+                                    inputs={
+                                        "X": ["seq_out"],
+                                        "Filter": ["conv_weight"]
+                                    },
+                                    outputs={"Out": ["conv_out"]},
+                                    attrs={
+                                        "contextLength": contextLength,
+                                        "contextStart": contextStart,
+                                        "contextStride": contextStride,
+                                        "paddingTrainable": paddingTrainable
+                                    })
+
+        elementwise_add_op = OpConfig(type="elementwise_add",
+                                      inputs={
+                                          "X": ["conv_out"],
+                                          "Y": ["elt_weight"]
+                                      },
+                                      outputs={"Out": ["elt_output"]},
+                                      attrs={'axis': axis})
+
+        relu_op = OpConfig(type="relu",
+                           inputs={"X": ["elt_output"]},
+                           outputs={"Out": ["relu_output"]},
+                           attrs={})
 
         model_net = [
             im2sequence_op, sequence_conv_op, elementwise_add_op, relu_op
@@ -88,8 +89,10 @@ def generate_weight(shape):
         program_config = ProgramConfig(
             ops=model_net,
             weights={
-                "conv_weight": TensorConfig(data_gen=partial(
-                    generate_weight, [768 * contextLength, 16])),
+                "conv_weight":
+                TensorConfig(
+                    data_gen=partial(generate_weight, [768 *
+                                                       contextLength, 16])),
                 "elt_weight":
                 TensorConfig(data_gen=partial(generate_weight, [16]))
             },
@@ -102,12 +105,12 @@ def generate_weight(shape):
 
     def sample_predictor_configs(self, program_config):
         config = self.create_inference_config()
-        yield config, ["im2sequence", "fusion_seqconv_eltadd_relu"], (1e-5,
-                                                                      1e-5)
+        yield config, ["im2sequence",
+                       "fusion_seqconv_eltadd_relu"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["seqconv_eltadd_relu_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["seqconv_eltadd_relu_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seqpool_cvm_concat_fuse_pass_py.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seqpool_cvm_concat_fuse_pass_py.py
index 2e403b99ab8ad..53333da57e78d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_seqpool_cvm_concat_fuse_pass_py.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seqpool_cvm_concat_fuse_pass_py.py
@@ -27,6 +27,7 @@
 
 
 class TestSeqpoolCvmConcatFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -51,76 +52,80 @@ def generate_input2():
         def generate_input3():
             return np.random.random([1, 768]).astype(np.float32)
 
-        im2sequence_op = OpConfig(
-            type="im2sequence",
-            inputs={"X": ["input_data1"]},
-            outputs={"Out": ["seq_out"]},
-            attrs={
-                "kernels": [6, 1],
-                "out_stride": [1, 1],
-                "paddings": [0, 0, 0, 0],
-                "strides": [1, 1]
-            })
-
-        sequence_pool_op1 = OpConfig(
-            type="sequence_pool",
-            inputs={"X": ["seq_out"]},
-            outputs={"Out": ["seq_pool1_out"],
-                     "MaxIndex": ["index1_out"]},
-            attrs={
-                "is_test": is_test,
-                "pooltype": pooltype,
-                "pad_value": pad_value1
-            })
-
-        sequence_pool_op2 = OpConfig(
-            type="sequence_pool",
-            inputs={"X": ["seq_out"]},
-            outputs={"Out": ["seq_pool2_out"],
-                     "MaxIndex": ["index2_out"]},
-            attrs={
-                "is_test": is_test,
-                "pooltype": pooltype,
-                "pad_value": pad_value2
-            })
-
-        sequence_pool_op3 = OpConfig(
-            type="sequence_pool",
-            inputs={"X": ["seq_out"]},
-            outputs={"Out": ["seq_pool3_out"],
-                     "MaxIndex": ["index3_out"]},
-            attrs={
-                "is_test": is_test,
-                "pooltype": pooltype,
-                "pad_value": pad_value3
-            })
-
-        cvm_op1 = OpConfig(
-            type="cvm",
-            inputs={"X": ["seq_pool1_out"],
-                    "CVM": ["input_data2"]},
-            outputs={"Y": ["cvm1_out"]},
-            attrs={"use_cvm": use_cvm})
-
-        cvm_op2 = OpConfig(
-            type="cvm",
-            inputs={"X": ["seq_pool2_out"],
-                    "CVM": ["input_data2"]},
-            outputs={"Y": ["cvm2_out"]},
-            attrs={"use_cvm": use_cvm})
-
-        cvm_op3 = OpConfig(
-            type="cvm",
-            inputs={"X": ["seq_pool3_out"],
-                    "CVM": ["input_data2"]},
-            outputs={"Y": ["cvm3_out"]},
-            attrs={"use_cvm": use_cvm})
-
-        concat_op = OpConfig(
-            type="concat",
-            inputs={"X": ["cvm1_out", "cvm2_out", "cvm3_out"]},
-            outputs={"Out": ["concat_output"]},
-            attrs={'axis': axis})
+        im2sequence_op = OpConfig(type="im2sequence",
+                                  inputs={"X": ["input_data1"]},
+                                  outputs={"Out": ["seq_out"]},
+                                  attrs={
+                                      "kernels": [6, 1],
+                                      "out_stride": [1, 1],
+                                      "paddings": [0, 0, 0, 0],
+                                      "strides": [1, 1]
+                                  })
+
+        sequence_pool_op1 = OpConfig(type="sequence_pool",
+                                     inputs={"X": ["seq_out"]},
+                                     outputs={
+                                         "Out": ["seq_pool1_out"],
+                                         "MaxIndex": ["index1_out"]
+                                     },
+                                     attrs={
+                                         "is_test": is_test,
+                                         "pooltype": pooltype,
+                                         "pad_value": pad_value1
+                                     })
+
+        sequence_pool_op2 = OpConfig(type="sequence_pool",
+                                     inputs={"X": ["seq_out"]},
+                                     outputs={
+                                         "Out": ["seq_pool2_out"],
+                                         "MaxIndex": ["index2_out"]
+                                     },
+                                     attrs={
+                                         "is_test": is_test,
+                                         "pooltype": pooltype,
+                                         "pad_value": pad_value2
+                                     })
+
+        sequence_pool_op3 = OpConfig(type="sequence_pool",
+                                     inputs={"X": ["seq_out"]},
+                                     outputs={
+                                         "Out": ["seq_pool3_out"],
+                                         "MaxIndex": ["index3_out"]
+                                     },
+                                     attrs={
+                                         "is_test": is_test,
+                                         "pooltype": pooltype,
+                                         "pad_value": pad_value3
+                                     })
+
+        cvm_op1 = OpConfig(type="cvm",
+                           inputs={
+                               "X": ["seq_pool1_out"],
+                               "CVM": ["input_data2"]
+                           },
+                           outputs={"Y": ["cvm1_out"]},
+                           attrs={"use_cvm": use_cvm})
+
+        cvm_op2 = OpConfig(type="cvm",
+                           inputs={
+                               "X": ["seq_pool2_out"],
+                               "CVM": ["input_data2"]
+                           },
+                           outputs={"Y": ["cvm2_out"]},
+                           attrs={"use_cvm": use_cvm})
+
+        cvm_op3 = OpConfig(type="cvm",
+                           inputs={
+                               "X": ["seq_pool3_out"],
+                               "CVM": ["input_data2"]
+                           },
+                           outputs={"Y": ["cvm3_out"]},
+                           attrs={"use_cvm": use_cvm})
+
+        concat_op = OpConfig(type="concat",
+                             inputs={"X": ["cvm1_out", "cvm2_out", "cvm3_out"]},
+                             outputs={"Out": ["concat_output"]},
+                             attrs={'axis': axis})
 
         model_net = [
             im2sequence_op, sequence_pool_op1, sequence_pool_op2,
@@ -144,8 +149,8 @@ def sample_predictor_configs(self, program_config):
         yield config, ["im2sequence", "fusion_seqpool_cvm_concat"], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False, passes=["seqpool_cvm_concat_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            passes=["seqpool_cvm_concat_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py
index 1781eb5048347..1bcaa4d43e7a1 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py
@@ -26,10 +26,10 @@
 
 
 class TestShuffleChannelDetectPass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['input_shape'] != attrs[2]['shape']:
@@ -49,31 +49,34 @@ def sample_program_config(self, draw):
         def generate_reshape2_Input():
             return np.random.random(x_shape).astype(np.float32)
 
-        reshape2_op1 = OpConfig(
-            "reshape2",
-            inputs={"X": ["reshape2_input1"], },
-            outputs={
-                "Out": ["reshape2_output1"],
-                "XShape": ["reshape2_xshape1"]
-            },
-            shape=shape,
-            input_shape=x_shape)
-        transpose2_op = OpConfig(
-            "transpose2",
-            inputs={"X": ["reshape2_output1"], },
-            outputs={
-                "Out": ["transpose2_output"],
-                "XShape": ["transpose2_xshape"]
-            },
-            axis=axis_v)
-        reshape2_op2 = OpConfig(
-            "reshape2",
-            inputs={"X": ["transpose2_output"], },
-            outputs={
-                "Out": ["reshape2_output2"],
-                "XShape": ["reshape2_xshape2"]
-            },
-            shape=x_shape)
+        reshape2_op1 = OpConfig("reshape2",
+                                inputs={
+                                    "X": ["reshape2_input1"],
+                                },
+                                outputs={
+                                    "Out": ["reshape2_output1"],
+                                    "XShape": ["reshape2_xshape1"]
+                                },
+                                shape=shape,
+                                input_shape=x_shape)
+        transpose2_op = OpConfig("transpose2",
+                                 inputs={
+                                     "X": ["reshape2_output1"],
+                                 },
+                                 outputs={
+                                     "Out": ["transpose2_output"],
+                                     "XShape": ["transpose2_xshape"]
+                                 },
+                                 axis=axis_v)
+        reshape2_op2 = OpConfig("reshape2",
+                                inputs={
+                                    "X": ["transpose2_output"],
+                                },
+                                outputs={
+                                    "Out": ["reshape2_output2"],
+                                    "XShape": ["reshape2_xshape2"]
+                                },
+                                shape=x_shape)
         ops = [reshape2_op1, transpose2_op, reshape2_op2]
 
         program_config = ProgramConfig(
@@ -100,7 +103,8 @@ def sample_predictor_configs(self, program_config):
     def test(self):
         self.run_and_statis(
             quant=False,
-            passes=["shuffle_channel_detect_pass"], )
+            passes=["shuffle_channel_detect_pass"],
+        )
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py
index cb55dc64445ec..1d279a2313f0d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py
@@ -26,6 +26,7 @@
 
 
 class TestSimplifyWithBasicOpsPassUpscale(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -38,32 +39,34 @@ def sample_program_config(self, draw):
         dropout_prob = draw(st.floats(min_value=0.0, max_value=1.0))
         seed = draw(st.integers(min_value=0, max_value=512))
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=4))
         is_test = True
 
-        dropout_op = OpConfig(
-            "dropout",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["dropout_output"],
-                     "Mask": ["mask"]},
-            fix_seed=fix_seed,
-            dropout_implementation=dropout_implementation,
-            dropout_prob=dropout_prob,
-            seed=seed,
-            is_test=is_test)
-        relu_op = OpConfig(
-            "relu",
-            inputs={"X": ["dropout_output"]},
-            outputs={"Out": ["relu_out"]})
+        dropout_op = OpConfig("dropout",
+                              inputs={"X": ["input_data"]},
+                              outputs={
+                                  "Out": ["dropout_output"],
+                                  "Mask": ["mask"]
+                              },
+                              fix_seed=fix_seed,
+                              dropout_implementation=dropout_implementation,
+                              dropout_prob=dropout_prob,
+                              seed=seed,
+                              is_test=is_test)
+        relu_op = OpConfig("relu",
+                           inputs={"X": ["dropout_output"]},
+                           outputs={"Out": ["relu_out"]})
         ops = [dropout_op, relu_op]
 
-        program_config = ProgramConfig(
-            ops=ops,
-            weights={},
-            inputs={"input_data": TensorConfig(shape=x_shape), },
-            outputs=["relu_out"])
+        program_config = ProgramConfig(ops=ops,
+                                       weights={},
+                                       inputs={
+                                           "input_data":
+                                           TensorConfig(shape=x_shape),
+                                       },
+                                       outputs=["relu_out"])
 
         return program_config
 
@@ -83,14 +86,14 @@ def sample_predictor_configs(self, program_config):
         yield config, ['relu'], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=30,
-            passes=["simplify_with_basic_ops_pass"],
-            min_success_num=30)
+        self.run_and_statis(quant=False,
+                            max_examples=30,
+                            passes=["simplify_with_basic_ops_pass"],
+                            min_success_num=30)
 
 
 class TestSimplifyWithBasicOpsPassDowngrade(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -100,32 +103,34 @@ def sample_program_config(self, draw):
         dropout_prob = draw(st.floats(min_value=0.0, max_value=1.0))
         seed = draw(st.integers(min_value=0, max_value=512))
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=4), min_size=2, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=4),
+                     min_size=2,
+                     max_size=4))
         is_test = True
 
-        dropout_op = OpConfig(
-            "dropout",
-            inputs={"X": ["input_data"]},
-            outputs={"Out": ["dropout_output"],
-                     "Mask": ["mask"]},
-            fix_seed=fix_seed,
-            dropout_implementation=dropout_implementation,
-            dropout_prob=dropout_prob,
-            seed=seed,
-            is_test=is_test)
-        relu_op = OpConfig(
-            "relu",
-            inputs={"X": ["dropout_output"]},
-            outputs={"Out": ["relu_out"]})
+        dropout_op = OpConfig("dropout",
+                              inputs={"X": ["input_data"]},
+                              outputs={
+                                  "Out": ["dropout_output"],
+                                  "Mask": ["mask"]
+                              },
+                              fix_seed=fix_seed,
+                              dropout_implementation=dropout_implementation,
+                              dropout_prob=dropout_prob,
+                              seed=seed,
+                              is_test=is_test)
+        relu_op = OpConfig("relu",
+                           inputs={"X": ["dropout_output"]},
+                           outputs={"Out": ["relu_out"]})
         ops = [dropout_op, relu_op]
 
-        program_config = ProgramConfig(
-            ops=ops,
-            weights={},
-            inputs={"input_data": TensorConfig(shape=x_shape), },
-            outputs=["relu_out"])
+        program_config = ProgramConfig(ops=ops,
+                                       weights={},
+                                       inputs={
+                                           "input_data":
+                                           TensorConfig(shape=x_shape),
+                                       },
+                                       outputs=["relu_out"])
 
         return program_config
 
@@ -145,11 +150,10 @@ def sample_predictor_configs(self, program_config):
         yield config, ['scale', 'relu'], (1e-5, 1e-5)
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=30,
-            passes=["simplify_with_basic_ops_pass"],
-            min_success_num=30)
+        self.run_and_statis(quant=False,
+                            max_examples=30,
+                            passes=["simplify_with_basic_ops_pass"],
+                            min_success_num=30)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py
index 64166daa91f1e..4b9ba91da4984 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestSquaredMatSubFusePass(PassAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -52,95 +53,94 @@ def generate_input(type):
             else:
                 return np.random.random(shape_y).astype(np.float32)
 
-        matmul_op1 = OpConfig(
-            type="matmul",
-            inputs={"X": ["input_data1"],
-                    "Y": ["input_data2"]},
-            outputs={"Out": ["matmul1_output"]},
-            attrs={
-                "transpose_X": transpose_X,
-                "transpose_Y": transpose_Y,
-                "alpha": alpha1,
-                "fused_reshape_X": [],
-                "fused_reshape_Y": [],
-                "fused_transpose_X": [],
-                "fused_transpose_Y": [],
-                "fused_reshape_Out": [],
-                "fused_transpose_Out": []
-            })
-
-        square_op1 = OpConfig(
-            type="square",
-            inputs={"X": ["matmul1_output"]},
-            outputs={"Out": ["square1_output"]},
-            attrs={})
-
-        square_op2 = OpConfig(
-            type="square",
-            inputs={"X": ["input_data1"]},
-            outputs={"Out": ["square2_output"]},
-            attrs={})
-
-        square_op3 = OpConfig(
-            type="square",
-            inputs={"X": ["input_data2"]},
-            outputs={"Out": ["square3_output"]},
-            attrs={})
-
-        matmul_op2 = OpConfig(
-            type="matmul",
-            inputs={"X": ["square2_output"],
-                    "Y": ["square3_output"]},
-            outputs={"Out": ["matmul2_output"]},
-            attrs={
-                "transpose_X": transpose_X,
-                "transpose_Y": transpose_Y,
-                "alpha": alpha2,
-                "fused_reshape_X": [],
-                "fused_reshape_Y": [],
-                "fused_transpose_X": [],
-                "fused_transpose_Y": [],
-                "fused_reshape_Out": [],
-                "fused_transpose_Out": []
-            })
-
-        elt_sub_op = OpConfig(
-            type="elementwise_sub",
-            inputs={"X": ["square1_output"],
-                    "Y": ["matmul2_output"]},
-            outputs={"Out": ["sub_out"]},
-            attrs={"axis": axis1})
+        matmul_op1 = OpConfig(type="matmul",
+                              inputs={
+                                  "X": ["input_data1"],
+                                  "Y": ["input_data2"]
+                              },
+                              outputs={"Out": ["matmul1_output"]},
+                              attrs={
+                                  "transpose_X": transpose_X,
+                                  "transpose_Y": transpose_Y,
+                                  "alpha": alpha1,
+                                  "fused_reshape_X": [],
+                                  "fused_reshape_Y": [],
+                                  "fused_transpose_X": [],
+                                  "fused_transpose_Y": [],
+                                  "fused_reshape_Out": [],
+                                  "fused_transpose_Out": []
+                              })
+
+        square_op1 = OpConfig(type="square",
+                              inputs={"X": ["matmul1_output"]},
+                              outputs={"Out": ["square1_output"]},
+                              attrs={})
+
+        square_op2 = OpConfig(type="square",
+                              inputs={"X": ["input_data1"]},
+                              outputs={"Out": ["square2_output"]},
+                              attrs={})
+
+        square_op3 = OpConfig(type="square",
+                              inputs={"X": ["input_data2"]},
+                              outputs={"Out": ["square3_output"]},
+                              attrs={})
+
+        matmul_op2 = OpConfig(type="matmul",
+                              inputs={
+                                  "X": ["square2_output"],
+                                  "Y": ["square3_output"]
+                              },
+                              outputs={"Out": ["matmul2_output"]},
+                              attrs={
+                                  "transpose_X": transpose_X,
+                                  "transpose_Y": transpose_Y,
+                                  "alpha": alpha2,
+                                  "fused_reshape_X": [],
+                                  "fused_reshape_Y": [],
+                                  "fused_transpose_X": [],
+                                  "fused_transpose_Y": [],
+                                  "fused_reshape_Out": [],
+                                  "fused_transpose_Out": []
+                              })
+
+        elt_sub_op = OpConfig(type="elementwise_sub",
+                              inputs={
+                                  "X": ["square1_output"],
+                                  "Y": ["matmul2_output"]
+                              },
+                              outputs={"Out": ["sub_out"]},
+                              attrs={"axis": axis1})
 
         if has_str_value:
-            fill_constant_op = OpConfig(
-                type="fill_constant",
-                inputs={},
-                outputs={"Out": ["constant_out"]},
-                attrs={
-                    "dtype": 5,
-                    "place_type": place_type,
-                    "str_value": str_value,
-                    "value": value,
-                    "shape": shape
-                })
+            fill_constant_op = OpConfig(type="fill_constant",
+                                        inputs={},
+                                        outputs={"Out": ["constant_out"]},
+                                        attrs={
+                                            "dtype": 5,
+                                            "place_type": place_type,
+                                            "str_value": str_value,
+                                            "value": value,
+                                            "shape": shape
+                                        })
         else:
-            fill_constant_op = OpConfig(
-                type="fill_constant",
-                inputs={},
-                outputs={"Out": ["constant_out"]},
-                attrs={
-                    "dtype": 5,
-                    "place_type": place_type,
-                    "value": value,
-                    "shape": shape
-                })
-
-        elt_mul_op = OpConfig(
-            type="elementwise_mul",
-            inputs={"X": ["sub_out"],
-                    "Y": ["constant_out"]},
-            outputs={"Out": ["mul_out"]},
-            attrs={"axis": axis2})
+            fill_constant_op = OpConfig(type="fill_constant",
+                                        inputs={},
+                                        outputs={"Out": ["constant_out"]},
+                                        attrs={
+                                            "dtype": 5,
+                                            "place_type": place_type,
+                                            "value": value,
+                                            "shape": shape
+                                        })
+
+        elt_mul_op = OpConfig(type="elementwise_mul",
+                              inputs={
+                                  "X": ["sub_out"],
+                                  "Y": ["constant_out"]
+                              },
+                              outputs={"Out": ["mul_out"]},
+                              attrs={"axis": axis2})
 
         model_net = [
             matmul_op1, square_op1, square_op2, square_op3, matmul_op2,
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py
index 6d9457f35750b..e06a242395f85 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py
@@ -50,9 +50,9 @@ def sample_predictor_configs(self, program_config):
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of squeeze2
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         # axes of squeeze2 == [2, 3]
         x_shape += [1, 1]
         axes = [2, 3]
@@ -64,9 +64,9 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of matmul
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = x_shape[1]
 
         # 4. Generate legal attr:axis of elementwise_add
@@ -74,13 +74,19 @@ def sample_program_config(self, draw):
         if axis == 0 or axis == -1:
             if draw(st.booleans()):
                 if axis == 0:
-                    bias_shape = [x_shape[0], ]
+                    bias_shape = [
+                        x_shape[0],
+                    ]
                 else:
-                    bias_shape = [y_shape[1], ]
+                    bias_shape = [
+                        y_shape[1],
+                    ]
             else:
                 bias_shape = [x_shape[0], y_shape[1]]
         elif axis == 1:
-            bias_shape = [y_shape[1], ]
+            bias_shape = [
+                y_shape[1],
+            ]
 
         if draw(st.integers(min_value=1, max_value=10)) <= 1:
             bias_shape[-1] = 1
@@ -89,14 +95,21 @@ def sample_program_config(self, draw):
 
         squeeze2_op = OpConfig(
             "squeeze2",
-            inputs={"X": ["squeeze2_x"], },
+            inputs={
+                "X": ["squeeze2_x"],
+            },
             axes=axes,
-            outputs={"Out": ["squeeze2_out"],
-                     "XShape": ["xshape"]}, )
+            outputs={
+                "Out": ["squeeze2_out"],
+                "XShape": ["xshape"]
+            },
+        )
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["squeeze2_out"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["squeeze2_out"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             transpose_X=transpose_X,
@@ -106,14 +119,18 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
+            fused_transpose_Out=[],
+        )
 
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["matmul_out"],
-                    "Y": ["bias"]},
+            inputs={
+                "X": ["matmul_out"],
+                "Y": ["bias"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=axis, )
+            axis=axis,
+        )
 
         ops = [squeeze2_op, matmul_op, add_op]
 
@@ -124,8 +141,11 @@ def sample_program_config(self, draw):
                     "matmul_y": TensorConfig(shape=y_shape),
                     "bias": TensorConfig(shape=bias_shape),
                 },
-                inputs={"squeeze2_x": TensorConfig(shape=x_shape), },
-                outputs=ops[-1].outputs["Out"], )
+                inputs={
+                    "squeeze2_x": TensorConfig(shape=x_shape),
+                },
+                outputs=ops[-1].outputs["Out"],
+            )
         else:
             program_config = ProgramConfig(
                 ops=ops,
@@ -135,15 +155,15 @@ def sample_program_config(self, draw):
                     "matmul_y": TensorConfig(shape=y_shape),
                     "bias": TensorConfig(shape=bias_shape),
                 },
-                outputs=ops[-1].outputs["Out"], )
+                outputs=ops[-1].outputs["Out"],
+            )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=50,
-            max_duration=1000,
-            passes=["gpu_cpu_squeeze2_matmul_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=50,
+                            max_duration=1000,
+                            passes=["gpu_cpu_squeeze2_matmul_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
index 64c3042b63cf8..198c4e5c742dc 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
@@ -37,12 +37,14 @@ class TestTransposeFlattenConcatFusePass(PassAutoScanTest):
     """
 
     def sample_predictor_configs(self, program_config):
-        # TRT  
+        # TRT
         # after tensorrt_subgraph_pass ，The pass needs to be deleted on TRT
 
         # for gpu
         config = self.create_inference_config(use_gpu=True)
-        yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5)
+        yield config, [
+            "fusion_transpose_flatten_concat",
+        ], (1e-5, 1e-5)
 
     def is_program_valid(self, prog_config):
         concat_axis = prog_config.ops[-1].attrs["axis"]
@@ -96,36 +98,39 @@ def sample_program_config(self, draw):
             if draw(st.booleans()):
                 trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j]
         #  Generate axis of flatten
-        flatten_axis = draw(
-            st.integers(
-                min_value=0, max_value=x_shape_rank - 1))
+        flatten_axis = draw(st.integers(min_value=0,
+                                        max_value=x_shape_rank - 1))
         for i in range(times):
             #  Generate x_shape of transpose
             x_shape = draw(
-                st.lists(
-                    st.integers(
-                        min_value=1, max_value=10),
-                    min_size=x_shape_rank,
-                    max_size=x_shape_rank))
+                st.lists(st.integers(min_value=1, max_value=10),
+                         min_size=x_shape_rank,
+                         max_size=x_shape_rank))
 
             str_i = str(i)
             transpose_op = OpConfig(
                 "transpose2",
-                inputs={"X": ["transpose2_x" + str_i], },
+                inputs={
+                    "X": ["transpose2_x" + str_i],
+                },
                 axis=trans_axis,
                 outputs={
                     "Out": ["trans_out" + str_i],
                     "XShape": ["trans_shape" + str_i]
-                }, )
+                },
+            )
             ops.append(transpose_op)
             flatten_op = OpConfig(
                 "flatten2",
-                inputs={"X": ["trans_out" + str_i], },
+                inputs={
+                    "X": ["trans_out" + str_i],
+                },
                 axis=flatten_axis,
                 outputs={
                     "Out": ["flatten2_out" + str_i],
                     "XShape": ["xshape" + str_i]
-                }, )
+                },
+            )
             concat_input.append("flatten2_out" + str_i)
             ops.append(flatten_op)
             inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape)
@@ -137,7 +142,8 @@ def sample_program_config(self, draw):
                 "AxisTensor": [],
             },
             outputs={"Out": ["concat_out"]},
-            axis=concat_axis, )
+            axis=concat_axis,
+        )
 
         ops.append(concat_op)
 
@@ -145,14 +151,14 @@ def sample_program_config(self, draw):
             ops=ops,
             weights={},
             inputs=inputs,
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=300,
-            passes=["transpose_flatten_concat_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=300,
+                            passes=["transpose_flatten_concat_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
index a1f15de488010..7ba824360ad46 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
@@ -24,6 +24,7 @@
 
 
 class TensorRTSubgraphPassActivationTest(InferencePassTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -32,8 +33,9 @@ def setUpTensorRTParam(self):
     def setUp(self):
         self.setUpTensorRTParam()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 32, 32], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 32, 32],
+                              dtype="float32")
             act_out = self.append_act(data)
             out = fluid.layers.batch_norm(act_out, is_test=True)
         self.feeds = {
@@ -58,52 +60,62 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassLeakyReluTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.leaky_relu(x)
 
 
 class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.relu6(x)
 
 
 class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.softmax(x)
 
 
 class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.sigmoid(x)
 
 
 class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.hard_swish(x)
 
 
 class TensorRTSubgraphPassHardSigmoidTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.hard_sigmoid(x)
 
 
-class TensorRTSubgraphPassHardSwishPluginTest(
-        TensorRTSubgraphPassActivationTest):
+class TensorRTSubgraphPassHardSwishPluginTest(TensorRTSubgraphPassActivationTest
+                                              ):
+
     def append_act(self, x):
         return fluid.layers.hard_swish(x, threshold=4.0, scale=8.0)
 
 
 class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.clip(x, 0, 1)
 
 
 class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.tanh(x)
 
 
 class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -115,6 +127,7 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassSwishFp16SerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -126,20 +139,21 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassDynamicSwishFp16SerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.swish(x)
 
 
 class TensorRTSubgraphPassMishTest(TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -151,6 +165,7 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassMishFp16SerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -162,49 +177,53 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassDynamicMishFp16SerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.mish(x)
 
 
 class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.prelu(x, mode='all')
 
 
 class TensorRTSubgraphPassPreluChannelTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.prelu(x, mode='channel')
 
 
 class TensorRTSubgraphPassPreluElementTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.prelu(x, mode='element')
 
 
 class TensorRTSubgraphPassPreluDynamicTest(TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.prelu(x, mode='all')
 
 
 class TensorRTSubgraphPassPreluFp16Test(TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -216,6 +235,7 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassPreluFp16SerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -227,14 +247,14 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassPreluFp16DynamicTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.prelu(x, mode='all')
@@ -242,39 +262,41 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassPreluFp16DynamicSerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.prelu(x, mode='all')
 
 
 class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest):
+
     def append_act(self, x):
         return fluid.layers.gelu(x)
 
 
 class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.gelu(x)
 
 
 class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -286,6 +308,7 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassGeluFp16SerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
@@ -295,16 +318,16 @@ def append_act(self, x):
         return fluid.layers.gelu(x)
 
 
-class TensorRTSubgraphPassGeluFp16DynamicTest(
-        TensorRTSubgraphPassActivationTest):
+class TensorRTSubgraphPassGeluFp16DynamicTest(TensorRTSubgraphPassActivationTest
+                                              ):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.gelu(x)
@@ -312,14 +335,14 @@ def append_act(self, x):
 
 class TensorRTSubgraphPassGeluFp16DynamicSerializeTest(
         TensorRTSubgraphPassActivationTest):
+
     def setUpTensorRTParam(self):
         self.enable_trt = True
         self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
         self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
-            {
-                'data': [1, 6, 8, 8]
-            }, {'data': [1, 6, 128, 128]}, {'data': [1, 6, 64, 64]}, False)
+            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 128, 128]},
+            {'data': [1, 6, 64, 64]}, False)
 
     def append_act(self, x):
         return fluid.layers.gelu(x)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py
index 90cdf784b1fcf..9bfd9edfe984a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py
@@ -25,6 +25,7 @@
 
 
 class TRTAffineChannelTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 2
         self.channel = 8
@@ -36,7 +37,7 @@ def setUp(self):
         self.enable_trt = True
 
     def build(self):
-        # set min_graph_size to 2, 
+        # set min_graph_size to 2,
         # because affine channel doesn't support nhwc format
         self.trt_parameters = InferencePassTest.TensorRTParam(
             1 << 30, self.bs, 2, self.precision, self.serialize, False)
@@ -62,7 +63,9 @@ def build(self):
             out = fluid.layers.batch_norm(affine_channel_out, is_test=True)
 
         shape[0] = self.bs
-        self.feeds = {'in': np.random.random(shape).astype('float32'), }
+        self.feeds = {
+            'in': np.random.random(shape).astype('float32'),
+        }
         self.fetch_list = [out]
 
     def check_output(self):
@@ -99,9 +102,8 @@ def run_test_all(self):
             max_shape = [self.bs, self.height * 2, self.width * 2, self.channel]
             opt_shape = [self.bs, self.height, self.width, self.channel]
 
-        dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
-            'in': min_shape
-        }, {'in': max_shape}, {'in': opt_shape}, False)
+        dynamic_shape_profile = InferencePassTest.DynamicShapeParam(
+            {'in': min_shape}, {'in': max_shape}, {'in': opt_shape}, False)
         dynamic_shape_opt = [None, dynamic_shape_profile]
 
         for precision, serialize, dynamic_shape in itertools.product(
@@ -123,10 +125,10 @@ def test_serialize(self):
         self.run_test()
 
     def test_dynamic(self):
-        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({
-            'in': [self.bs, self.channel, self.height // 2, self.width // 2]
-        }, {'in': [self.bs, self.channel, self.height * 2, self.width * 2]
-            }, {'in': [self.bs, self.channel, self.height, self.width]}, False)
+        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam(
+            {'in': [self.bs, self.channel, self.height // 2, self.width // 2]},
+            {'in': [self.bs, self.channel, self.height * 2, self.width * 2]},
+            {'in': [self.bs, self.channel, self.height, self.width]}, False)
         self.run_test()
 
     def test_nchw_all(self):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py
index 1d6f1c2c45910..e3cecbe4119ee 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py
@@ -25,6 +25,7 @@
 
 
 class TRTAnchorGeneratorBaseTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 1
         self.channel = 16
@@ -49,10 +50,9 @@ def build(self):
             1 << 30, self.bs, min_graph_size, self.precision, self.serialize,
             False)
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name='data',
-                shape=[-1, self.channel, self.height, self.width],
-                dtype='float32')
+            data = fluid.data(name='data',
+                              shape=[-1, self.channel, self.height, self.width],
+                              dtype='float32')
             anchor, var = fluid.layers.detection.anchor_generator(
                 data,
                 anchor_sizes=self.anchor_sizes,
@@ -70,11 +70,12 @@ def run_test(self):
         self.check_output()
 
     def set_dynamic(self):
-        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({
-            'data': [self.bs, self.channel, self.height // 2, self.width // 2]
-        }, {
-            'data': [self.bs, self.channel, self.height, self.width]
-        }, {'data': [self.bs, self.channel, self.height, self.width]}, False)
+        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam(
+            {
+                'data':
+                [self.bs, self.channel, self.height // 2, self.width // 2]
+            }, {'data': [self.bs, self.channel, self.height, self.width]},
+            {'data': [self.bs, self.channel, self.height, self.width]}, False)
 
     def test_base(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py
index 8bca7af3f0d23..2fdacdc1e4ec9 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py
@@ -24,22 +24,23 @@
 
 
 class TensorRTSubgraphPassConv3dTest(InferencePassTest):
+
     def setUp(self):
         self.init_params()
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 6, 32, 32], dtype="float32")
-            conv_out = fluid.layers.conv3d(
-                input=data,
-                num_filters=self.conv_num_filters,
-                filter_size=self.conv_filter_size,
-                groups=self.conv_groups,
-                padding=self.conv_padding,
-                bias_attr=False,
-                use_cudnn=self.use_cudnn,
-                stride=self.stride,
-                act=None)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 6, 32, 32],
+                              dtype="float32")
+            conv_out = fluid.layers.conv3d(input=data,
+                                           num_filters=self.conv_num_filters,
+                                           filter_size=self.conv_filter_size,
+                                           groups=self.conv_groups,
+                                           padding=self.conv_padding,
+                                           bias_attr=False,
+                                           use_cudnn=self.use_cudnn,
+                                           stride=self.stride,
+                                           act=None)
         self.feeds = {
             "data": np.random.random([1, 3, 6, 32, 32]).astype("float32"),
         }
@@ -69,8 +70,9 @@ def test_check_output(self):
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
-class TensorRTSubgraphPassConv3dValidPaddingTest(
-        TensorRTSubgraphPassConv3dTest):
+class TensorRTSubgraphPassConv3dValidPaddingTest(TensorRTSubgraphPassConv3dTest
+                                                 ):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -79,6 +81,7 @@ def set_params(self):
 
 
 class TensorRTSubgraphPassConv3dSamePaddingTest(TensorRTSubgraphPassConv3dTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -87,6 +90,7 @@ def set_params(self):
 
 
 class TensorRTSubgraphPassConv3dPaddingTest(TensorRTSubgraphPassConv3dTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -95,6 +99,7 @@ def set_params(self):
 
 
 class TensorRTSubgraphPassConv3dStrideTest(TensorRTSubgraphPassConv3dTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -104,21 +109,22 @@ def set_params(self):
 
 
 class DynamicShapeTensorRTSubgraphPassConv3dTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, -1, -1, -1], dtype="float32")
-            conv_out = fluid.layers.conv3d(
-                input=data,
-                num_filters=self.conv_num_filters,
-                filter_size=self.conv_filter_size,
-                groups=self.conv_groups,
-                padding=self.conv_padding,
-                bias_attr=False,
-                use_cudnn=self.use_cudnn,
-                stride=self.stride,
-                act=None)
+            data = fluid.data(name="data",
+                              shape=[-1, 6, -1, -1, -1],
+                              dtype="float32")
+            conv_out = fluid.layers.conv3d(input=data,
+                                           num_filters=self.conv_num_filters,
+                                           filter_size=self.conv_filter_size,
+                                           groups=self.conv_groups,
+                                           padding=self.conv_padding,
+                                           bias_attr=False,
+                                           use_cudnn=self.use_cudnn,
+                                           stride=self.stride,
+                                           act=None)
         self.feeds = {
             "data": np.random.random([1, 6, 32, 32, 8]).astype("float32"),
         }
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py
index dfec7ef9b4d7d..6ada9edd18e6a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py
@@ -24,11 +24,13 @@
 
 
 class TensorRTSubgraphPassConv3dTransposeTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 4, 4, 32, 32], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 4, 4, 32, 32],
+                              dtype="float32")
             conv_out = fluid.layers.conv3d_transpose(
                 input=data,
                 num_filters=self.conv_num_filters,
@@ -64,6 +66,7 @@ def test_check_output(self):
 
 class TensorRTSubgraphPassConv3dTransposeSamePaddingTest(
         TensorRTSubgraphPassConv3dTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -74,6 +77,7 @@ def set_params(self):
 
 class TensorRTSubgraphPassConv3dTransposeMultigroupTest(
         TensorRTSubgraphPassConv3dTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -83,11 +87,13 @@ def set_params(self):
 
 
 class DynamicShapeTensorRTSubgraphPassConv3dTransposeTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, -1, -1, -1], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, -1, -1, -1],
+                              dtype="float32")
             conv_out = fluid.layers.conv3d_transpose(
                 input=data,
                 num_filters=self.conv_num_filters,
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
index ebbf724d0b4ea..a934c264e473f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
@@ -24,20 +24,21 @@
 
 
 class TensorRTSubgraphPassConvTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=self.conv_num_filters,
-                filter_size=self.conv_filter_size,
-                groups=self.conv_groups,
-                padding=self.conv_padding,
-                bias_attr=False,
-                use_cudnn=self.use_cudnn,
-                act=None)
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=self.conv_num_filters,
+                                           filter_size=self.conv_filter_size,
+                                           groups=self.conv_groups,
+                                           padding=self.conv_padding,
+                                           bias_attr=False,
+                                           use_cudnn=self.use_cudnn,
+                                           act=None)
         self.feeds = {
             "data": np.random.random([1, 6, 64, 64]).astype("float32"),
         }
@@ -62,6 +63,7 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassConvValidPaddingTest(TensorRTSubgraphPassConvTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -71,6 +73,7 @@ def set_params(self):
 
 
 class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -80,6 +83,7 @@ def set_params(self):
 
 
 class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -89,6 +93,7 @@ def set_params(self):
 
 
 class TensorRTSubgraphPassDepthwiseConv2Test(TensorRTSubgraphPassConvTest):
+
     def set_params(self):
         self.conv_num_filters = 12
         self.conv_filter_size = 6
@@ -98,11 +103,13 @@ def set_params(self):
 
 
 class TensorRTSubgraphPassConvTransposeTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
             conv_out = fluid.layers.conv2d_transpose(
                 input=data,
                 num_filters=self.conv_num_filters,
@@ -137,6 +144,7 @@ def test_check_output(self):
 
 class TensorRTSubgraphPassConvTransposeValidPaddingTest(
         TensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -147,6 +155,7 @@ def set_params(self):
 
 class TensorRTSubgraphPassConvTransposeSamePaddingTest(
         TensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -157,6 +166,7 @@ def set_params(self):
 
 class TensorRTSubgraphPassConvTransposeMultiGroupTest(
         TensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
@@ -167,6 +177,7 @@ def set_params(self):
 
 class TensorRTSubgraphPassConvTranspose2Test(
         TensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 12
         self.conv_filter_size = 4
@@ -177,6 +188,7 @@ def set_params(self):
 
 class TensorRTSubgraphPassDepthwiseConvTransposeTest(
         TensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 4
@@ -186,21 +198,22 @@ def set_params(self):
 
 
 class DynamicShapeTensorRTSubgraphPassConvTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, -1, -1], dtype="float32")
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=self.conv_num_filters,
-                filter_size=self.conv_filter_size,
-                groups=self.conv_groups,
-                padding=self.conv_padding,
-                bias_attr=False,
-                use_cudnn=self.use_cudnn,
-                stride=self.stride,
-                act=None)
+            data = fluid.data(name="data",
+                              shape=[-1, 6, -1, -1],
+                              dtype="float32")
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=self.conv_num_filters,
+                                           filter_size=self.conv_filter_size,
+                                           groups=self.conv_groups,
+                                           padding=self.conv_padding,
+                                           bias_attr=False,
+                                           use_cudnn=self.use_cudnn,
+                                           stride=self.stride,
+                                           act=None)
         self.feeds = {
             "data": np.random.random([32, 6, 64, 64]).astype("float32"),
         }
@@ -241,6 +254,7 @@ def test_check_output(self):
 
 class DynamicShapeTensorRTSubgraphPassDepthwiseConvTransposeTest(
         DynamicShapeTensorRTSubgraphPassConvTest):
+
     def set_params(self):
         self.conv_num_filters = 6
         self.conv_filter_size = 6
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py
index eacdb26968961..f800d2fc3f4de 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py
@@ -25,24 +25,25 @@
 
 
 class QuantDequantTensorRTSubgraphPassConvTest(QuantDequantTest):
+
     def setUp(self):
         self.set_params()
 
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             data_reshape = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14])
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1])
-            conv_out = fluid.layers.conv2d(
-                input=data_reshape,
-                num_filters=self.conv_num_filters,
-                filter_size=self.conv_filter_size,
-                groups=self.conv_groups,
-                padding=self.conv_padding,
-                bias_attr=False,
-                use_cudnn=self.use_cudnn,
-                act=None)
+            conv_out = fluid.layers.conv2d(input=data_reshape,
+                                           num_filters=self.conv_num_filters,
+                                           filter_size=self.conv_filter_size,
+                                           groups=self.conv_groups,
+                                           padding=self.conv_padding,
+                                           bias_attr=False,
+                                           use_cudnn=self.use_cudnn,
+                                           act=None)
             if self.conv_padding == [1, 1]:
                 cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816])
             elif self.conv_padding == 'VALID':
@@ -87,14 +88,17 @@ def set_params(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
+            self.check_output_with_option(use_gpu,
+                                          atol=1e-1,
+                                          flatten=False,
+                                          rtol=1e-1)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
 class QuantDequantTensorRTSubgraphPassConvValidPaddingTest(
         QuantDequantTensorRTSubgraphPassConvTest):
+
     def set_params(self):
         self.conv_num_filters = 64
         self.conv_filter_size = 4
@@ -105,6 +109,7 @@ def set_params(self):
 
 class QuantDequantTensorRTSubgraphPassConvSamePaddingTest(
         QuantDequantTensorRTSubgraphPassConvTest):
+
     def set_params(self):
         self.conv_num_filters = 64
         self.conv_filter_size = 4
@@ -115,6 +120,7 @@ def set_params(self):
 
 class QuantDequantTensorRTSubgraphPassDWConvTest(
         QuantDequantTensorRTSubgraphPassConvTest):
+
     def set_params(self):
         self.conv_num_filters = 64
         self.conv_filter_size = 4
@@ -124,24 +130,25 @@ def set_params(self):
 
 
 class DynamicShapeQuantDequantTensorRTSubgraphPassConvTest(QuantDequantTest):
+
     def setUp(self):
         self.set_params()
 
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             data_reshape = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14])
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1])
-            conv_out = fluid.layers.conv2d(
-                input=data_reshape,
-                num_filters=self.conv_num_filters,
-                filter_size=self.conv_filter_size,
-                groups=self.conv_groups,
-                padding=self.conv_padding,
-                bias_attr=False,
-                use_cudnn=self.use_cudnn,
-                act=None)
+            conv_out = fluid.layers.conv2d(input=data_reshape,
+                                           num_filters=self.conv_num_filters,
+                                           filter_size=self.conv_filter_size,
+                                           groups=self.conv_groups,
+                                           padding=self.conv_padding,
+                                           bias_attr=False,
+                                           use_cudnn=self.use_cudnn,
+                                           act=None)
             cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816])
             result = fluid.layers.relu(cout)
             loss = fluid.layers.cross_entropy(input=result, label=label_shape)
@@ -199,19 +206,23 @@ def set_params(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
+            self.check_output_with_option(use_gpu,
+                                          atol=1e-1,
+                                          flatten=False,
+                                          rtol=1e-1)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
 class QuantDequantTensorRTSubgraphPassConvTransposeTest(QuantDequantTest):
+
     def setUp(self):
         self.set_params()
 
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             data_reshape = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14])
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1])
@@ -268,14 +279,17 @@ def set_params(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
+            self.check_output_with_option(use_gpu,
+                                          atol=1e-1,
+                                          flatten=False,
+                                          rtol=1e-1)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
 class QuantDequantTensorRTSubgraphPassConvTransValidPaddingTest(
         QuantDequantTensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 64
         self.conv_filter_size = 4
@@ -286,6 +300,7 @@ def set_params(self):
 
 class QuantDequantTensorRTSubgraphPassConvTransSamePaddingTest(
         QuantDequantTensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 64
         self.conv_filter_size = 4
@@ -296,6 +311,7 @@ def set_params(self):
 
 class QuantDequantTensorRTSubgraphPassTransDWConvTest(
         QuantDequantTensorRTSubgraphPassConvTransposeTest):
+
     def set_params(self):
         self.conv_num_filters = 64
         self.conv_filter_size = 4
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
index c7f724bdaae3f..a7532ff3e7376 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertActivationTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
             if dims == 1:
                 return np.ones([32]).astype(np.float32)
@@ -58,7 +60,8 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(data_gen=partial(
+                            "input_data":
+                            TensorConfig(data_gen=partial(
                                 generate_input1, dims, batch, dics))
                         },
                         outputs=["output_data"])
@@ -67,6 +70,7 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 1:
                 self.dynamic_shape.min_input_shape = {"input_data": [1]}
@@ -102,8 +106,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -118,11 +121,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py
index 33eb90b9f9123..c5958f93ef837 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertAffineChannelTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(batch, dims, attrs: List[Dict[str, Any]]):
             if dims == 2:
                 return np.ones([batch, 64]).astype(np.float32)
@@ -65,13 +67,16 @@ def generate_weight1(dims, attrs: List[Dict[str, Any]]):
                     program_config = ProgramConfig(
                         ops=ops,
                         weights={
-                            "scale": TensorConfig(data_gen=partial(
-                                generate_weight1, dims, dics)),
-                            "bias": TensorConfig(data_gen=partial(
-                                generate_weight1, dims, dics))
+                            "scale":
+                            TensorConfig(
+                                data_gen=partial(generate_weight1, dims, dics)),
+                            "bias":
+                            TensorConfig(
+                                data_gen=partial(generate_weight1, dims, dics))
                         },
                         inputs={
-                            "input_data": TensorConfig(data_gen=partial(
+                            "input_data":
+                            TensorConfig(data_gen=partial(
                                 generate_input1, batch, dims, dics))
                         },
                         outputs=["output_data"])
@@ -80,6 +85,7 @@ def generate_weight1(dims, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 2:
                 self.dynamic_shape.min_input_shape = {"input_data": [1, 32]}
@@ -119,8 +125,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -135,11 +140,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
index 2dd380c53af44..0a2877b9a2327 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertAnchorGeneratorTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(batch, attrs: List[Dict[str, Any]]):
             return np.random.random([batch, 3, 64, 64]).astype(np.float32)
 
@@ -61,9 +63,9 @@ def generate_input1(batch, attrs: List[Dict[str, Any]]):
                                     ops=ops,
                                     weights={},
                                     inputs={
-                                        "input_data": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             batch, dics))
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, batch, dics))
                                     },
                                     outputs=[
                                         "output_anchors", "output_variances"
@@ -73,6 +75,7 @@ def generate_input1(batch, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -90,8 +93,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 0, 4
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -106,11 +108,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py
index 719e448856995..82ac600fd1e73 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TrtConvertArgMaxTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         input_shape = program_config.inputs["arg_max_input"].shape
         axis = program_config.ops[0].attrs["axis"]
@@ -32,6 +33,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(rank, batch):
             dims = [batch]
             for i in range(rank - 1):
@@ -65,7 +67,8 @@ def generate_input(rank, batch):
                             ops=ops,
                             weights={},
                             inputs={
-                                "arg_max_input": TensorConfig(data_gen=partial(
+                                "arg_max_input":
+                                TensorConfig(data_gen=partial(
                                     generate_input, rank, batch))
                             },
                             outputs=["arg_max_out"])
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py
index 899cf0e263955..fa73ab7c62eac 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertBatchNormTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 if attrs[0]['data_layout'] == "NCHW":
@@ -82,30 +84,40 @@ def generate_MomentumTensor(attrs: List[Dict[str, Any]], batch):
                                     "Variance": ["Variance"]
                                 }]
                                 dics_intputs = [{
-                                    "Bias": TensorConfig(data_gen=partial(
+                                    "Bias":
+                                    TensorConfig(data_gen=partial(
                                         generate_bias, dics, batch)),
-                                    "Mean": TensorConfig(data_gen=partial(
+                                    "Mean":
+                                    TensorConfig(data_gen=partial(
                                         generate_mean, dics, batch)),
-                                    "Scale": TensorConfig(data_gen=partial(
+                                    "Scale":
+                                    TensorConfig(data_gen=partial(
                                         generate_scale, dics, batch)),
-                                    "Variance": TensorConfig(data_gen=partial(
+                                    "Variance":
+                                    TensorConfig(data_gen=partial(
                                         generate_variance, dics, batch)),
                                     "MomentumTensor":
                                     TensorConfig(data_gen=partial(
                                         generate_MomentumTensor, dics, batch)),
                                 }, {
-                                    "Bias": TensorConfig(data_gen=partial(
+                                    "Bias":
+                                    TensorConfig(data_gen=partial(
                                         generate_bias, dics, batch)),
-                                    "Mean": TensorConfig(data_gen=partial(
+                                    "Mean":
+                                    TensorConfig(data_gen=partial(
                                         generate_mean, dics, batch)),
-                                    "Scale": TensorConfig(data_gen=partial(
+                                    "Scale":
+                                    TensorConfig(data_gen=partial(
                                         generate_scale, dics, batch)),
-                                    "Variance": TensorConfig(data_gen=partial(
+                                    "Variance":
+                                    TensorConfig(data_gen=partial(
                                         generate_variance, dics, batch))
                                 }]
                                 ops_config = [{
-                                    "op_type": "batch_norm",
-                                    "op_inputs": dics_intput[num_input],
+                                    "op_type":
+                                    "batch_norm",
+                                    "op_inputs":
+                                    dics_intput[num_input],
                                     "op_outputs": {
                                         "Y": ["batch_norm_out"],
                                         "MeanOut": ["Mean"],
@@ -113,16 +125,17 @@ def generate_MomentumTensor(attrs: List[Dict[str, Any]], batch):
                                         "SavedMean": ["SavedMean"],
                                         "SavedVariance": ["SavedVariance"]
                                     },
-                                    "op_attrs": dics[0]
+                                    "op_attrs":
+                                    dics[0]
                                 }]
                                 ops = self.generate_op_config(ops_config)
                                 program_config = ProgramConfig(
                                     ops=ops,
                                     weights=dics_intputs[num_input],
                                     inputs={
-                                        "batch_norm_input": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             dics, batch))
+                                        "batch_norm_input":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics, batch))
                                     },
                                     outputs=["batch_norm_out"])
 
@@ -130,6 +143,7 @@ def generate_MomentumTensor(attrs: List[Dict[str, Any]], batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 if attrs[0]['data_layout'] == "NCHW":
@@ -182,8 +196,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # for static_shape
         clear_dynamic_shape()
@@ -197,13 +210,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if len(program_config.weights) == 5:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py
index 1277cde011c17..aec2f3efd4f23 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertClipTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
             if dims == 1:
                 return np.ones([32]).astype(np.float32)
@@ -72,13 +74,16 @@ def generate_weight2(attrs: List[Dict[str, Any]]):
                     program_config = ProgramConfig(
                         ops=ops,
                         weights={
-                            "Min_": TensorConfig(data_gen=partial(
-                                generate_weight1, dics)),
-                            "Max_": TensorConfig(data_gen=partial(
-                                generate_weight2, dics))
+                            "Min_":
+                            TensorConfig(
+                                data_gen=partial(generate_weight1, dics)),
+                            "Max_":
+                            TensorConfig(
+                                data_gen=partial(generate_weight2, dics))
                         },
                         inputs={
-                            "input_data": TensorConfig(data_gen=partial(
+                            "input_data":
+                            TensorConfig(data_gen=partial(
                                 generate_input1, dims, batch, dics))
                         },
                         outputs=["output_data"])
@@ -86,6 +91,7 @@ def generate_weight2(attrs: List[Dict[str, Any]]):
                     yield program_config
 
     def sample_predictor_configs(self, program_config):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 1:
                 self.dynamic_shape.min_input_shape = {"input_data": [1]}
@@ -122,8 +128,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -138,11 +143,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py
index ebd2f7724da22..e8c9a65bbfc93 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py
@@ -22,14 +22,14 @@
 
 
 class TrtConvertConcatTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         outputs = program_config.outputs
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         #The input dimension should be less than or equal to the set axis.
         if len(inputs['concat_input1'].shape) <= attrs[0]['axis']:
@@ -38,6 +38,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 return np.ones([batch, 3, 24, 24]).astype(np.float32)
@@ -79,33 +80,36 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
                         self.dims = dims
                         dics = [{"axis": axis}, {}]
                         dics_intput = [{
-                            "X": [
-                                "concat_input1", "concat_input2",
-                                "concat_input3"
-                            ],
+                            "X":
+                            ["concat_input1", "concat_input2", "concat_input3"],
                             "AxisTensor": ["AxisTensor"],
                         }, {
-                            "X": [
-                                "concat_input1", "concat_input2",
-                                "concat_input3"
-                            ]
+                            "X":
+                            ["concat_input1", "concat_input2", "concat_input3"]
                         }]
                         dics_inputs = [{
-                            "concat_input1": TensorConfig(data_gen=partial(
-                                generate_input1, dics, batch)),
-                            "concat_input2": TensorConfig(data_gen=partial(
-                                generate_input2, dics, batch)),
-                            "concat_input3": TensorConfig(data_gen=partial(
-                                generate_input3, dics, batch)),
-                            "AxisTensor": TensorConfig(data_gen=partial(
-                                generate_weight1, dics))
+                            "concat_input1":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, dics, batch)),
+                            "concat_input2":
+                            TensorConfig(
+                                data_gen=partial(generate_input2, dics, batch)),
+                            "concat_input3":
+                            TensorConfig(
+                                data_gen=partial(generate_input3, dics, batch)),
+                            "AxisTensor":
+                            TensorConfig(
+                                data_gen=partial(generate_weight1, dics))
                         }, {
-                            "concat_input1": TensorConfig(data_gen=partial(
-                                generate_input1, dics, batch)),
-                            "concat_input2": TensorConfig(data_gen=partial(
-                                generate_input2, dics, batch)),
-                            "concat_input3": TensorConfig(data_gen=partial(
-                                generate_input3, dics, batch))
+                            "concat_input1":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, dics, batch)),
+                            "concat_input2":
+                            TensorConfig(
+                                data_gen=partial(generate_input2, dics, batch)),
+                            "concat_input3":
+                            TensorConfig(
+                                data_gen=partial(generate_input3, dics, batch))
                         }]
                         ops_config = [{
                             "op_type": "concat",
@@ -126,6 +130,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.num_input == 0:
                 if self.dims == 4:
@@ -285,8 +290,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 0, 5
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # for static_shape
         clear_dynamic_shape()
@@ -300,13 +304,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if len(program_config.inputs) == 4:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
index 84ef5b4da68ab..13b9fb6f5c90b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,16 +22,16 @@
 
 
 class TrtConvertConv2dTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[
-                1] * attrs[0]['groups']:
+        if inputs['input_data'].shape[
+                1] != weights['conv2d_weight'].shape[1] * attrs[0]['groups']:
             return False
 
         ver = paddle_infer.get_trt_compile_version()
@@ -46,8 +46,8 @@ def sample_program_configs(self):
         self.trt_param.workspace_size = 1073741824
 
         def generate_input1(batch, attrs: List[Dict[str, Any]]):
-            return np.ones(
-                [batch, attrs[0]['groups'] * 3, 64, 64]).astype(np.float32)
+            return np.ones([batch, attrs[0]['groups'] * 3, 64,
+                            64]).astype(np.float32)
 
         def generate_weight1(attrs: List[Dict[str, Any]]):
             return np.random.random([24, 3, 3, 3]).astype(np.float32)
@@ -111,6 +111,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             input_groups = attrs[0]['groups'] * 3
             self.dynamic_shape.min_input_shape = {
@@ -135,8 +136,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -154,8 +154,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-5)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py
index 8a9a9909571a4..1a36ea12e86c8 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,16 +22,16 @@
 
 
 class TrtConvertConv2dFusionTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[
-                1] * attrs[0]['groups']:
+        if inputs['input_data'].shape[
+                1] != weights['conv2d_weight'].shape[1] * attrs[0]['groups']:
             return False
 
         if attrs[0]['groups'] <= 1:
@@ -49,8 +49,8 @@ def sample_program_configs(self):
         self.trt_param.workspace_size = 1073741824
 
         def generate_input1(batch, attrs: List[Dict[str, Any]]):
-            return np.ones(
-                [batch, attrs[0]['groups'] * 3, 64, 64]).astype(np.float32)
+            return np.ones([batch, attrs[0]['groups'] * 3, 64,
+                            64]).astype(np.float32)
 
         def generate_weight1(attrs: List[Dict[str, Any]]):
             return np.random.random([24, 3, 3, 3]).astype(np.float32)
@@ -108,9 +108,9 @@ def generate_weight2(attrs: List[Dict[str, Any]]):
                                             "conv2d_weight":
                                             TensorConfig(data_gen=partial(
                                                 generate_weight1, dics)),
-                                            "elementwise_weight": TensorConfig(
-                                                data_gen=partial(
-                                                    generate_weight2, dics))
+                                            "elementwise_weight":
+                                            TensorConfig(data_gen=partial(
+                                                generate_weight2, dics))
                                         },
                                         inputs={
                                             "input_data":
@@ -123,6 +123,7 @@ def generate_weight2(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             input_groups = attrs[0]['groups'] * 3
             self.dynamic_shape.min_input_shape = {
@@ -147,8 +148,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -166,8 +166,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-5)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
index 65fc35f9c56f8..0db051560516d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,16 +22,16 @@
 
 
 class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[
-                1] * attrs[0]['groups']:
+        if inputs['input_data'].shape[
+                1] != weights['conv2d_weight'].shape[1] * attrs[0]['groups']:
             return False
 
         if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[0]:
@@ -54,12 +54,12 @@ def generate_input1(batch, num_channels, attrs: List[Dict[str, Any]]):
 
         def generate_weight1(num_channels, attrs: List[Dict[str, Any]]):
             if attrs[0]['groups'] == 1:
-                return np.random.random(
-                    [num_channels, num_channels, 3, 3]).astype(np.float32)
+                return np.random.random([num_channels, num_channels, 3,
+                                         3]).astype(np.float32)
             else:
                 return np.random.random(
-                    [num_channels, int(num_channels / 2), 3, 3]).astype(
-                        np.float32)
+                    [num_channels, int(num_channels / 2), 3,
+                     3]).astype(np.float32)
 
         for num_channels in [2, 4, 6]:
             for batch in [1, 2, 4]:
@@ -120,6 +120,7 @@ def generate_weight1(num_channels, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.num_channels == 2:
                 self.dynamic_shape.min_input_shape = {
@@ -170,8 +171,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -189,8 +189,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-3)
@@ -199,6 +199,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         #     attrs, True), (1e-5, 1e-5)
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if self.trt_param.precision == paddle_infer.PrecisionType.Int8:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
index c692e92861bc6..d0d8e35fdd0ff 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,23 +22,23 @@
 
 
 class TrtConvertDeformableConvTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        if inputs['input_data'].shape[1] != weights['filter_data'].shape[
-                1] * attrs[0]['groups']:
+        if inputs['input_data'].shape[
+                1] != weights['filter_data'].shape[1] * attrs[0]['groups']:
             return False
 
         return True
 
     def sample_program_configs(self):
-        def compute_output_size(input_size: List[int],
-                                kernel_sizes: List[int],
+
+        def compute_output_size(input_size: List[int], kernel_sizes: List[int],
                                 attrs: List[Dict[str, Any]]):
             strides = attrs[0]['strides']
             paddings = attrs[0]['paddings']
@@ -50,40 +50,40 @@ def compute_output_size(input_size: List[int],
                 output_size.append((i + 2 * p - k) // s + 1)
             return output_size
 
-        def generate_input1(batch: int,
-                            input_size: List[int],
-                            kernel_sizes: List[int],
-                            attrs: List[Dict[str, Any]]):
+        def generate_input1(batch: int, input_size: List[int],
+                            kernel_sizes: List[int], attrs: List[Dict[str,
+                                                                      Any]]):
             return np.random.random([batch, 3] + input_size).astype(np.float32)
 
-        def generate_offset1(batch: int,
-                             input_size: List[int],
-                             kernel_sizes: List[int],
-                             attrs: List[Dict[str, Any]]):
+        def generate_offset1(batch: int, input_size: List[int],
+                             kernel_sizes: List[int], attrs: List[Dict[str,
+                                                                       Any]]):
             output_size = compute_output_size(input_size, kernel_sizes, attrs)
             return np.random.random([batch, 2 * np.prod(kernel_sizes)] +
                                     output_size).astype(np.float32)
 
-        def generate_mask1(batch: int,
-                           input_size: List[int],
-                           kernel_sizes: List[int],
-                           attrs: List[Dict[str, Any]]):
+        def generate_mask1(batch: int, input_size: List[int],
+                           kernel_sizes: List[int], attrs: List[Dict[str,
+                                                                     Any]]):
             output_size = compute_output_size(input_size, kernel_sizes, attrs)
             return np.random.random([batch, np.prod(kernel_sizes)] +
                                     output_size).astype(np.float32)
 
-        def generate_filter1(batch: int,
-                             input_size: List[int],
-                             kernel_sizes: List[int],
-                             attrs: List[Dict[str, Any]]):
+        def generate_filter1(batch: int, input_size: List[int],
+                             kernel_sizes: List[int], attrs: List[Dict[str,
+                                                                       Any]]):
             return np.random.random([6, 3] + kernel_sizes).astype(np.float32)
 
-        for batch in [1, ]:
+        for batch in [
+                1,
+        ]:
             for input_size in [[32, 32]]:
                 for kernel_sizes in [[3, 3]]:
                     for strides in [[1, 1], [2, 2]]:
                         for paddings in [[1, 1], [0, 2]]:
-                            for groups in [1, ]:
+                            for groups in [
+                                    1,
+                            ]:
                                 for dilations in [[1, 1], [2, 2]]:
                                     dics = [{
                                         "strides": strides,
@@ -126,10 +126,10 @@ def generate_filter1(batch: int,
                                         TensorConfig(data_gen=partial(
                                             generate_offset1, batch, input_size,
                                             kernel_sizes, dics)),
-                                        "mask_data": TensorConfig(
-                                            data_gen=partial(
-                                                generate_mask1, batch,
-                                                input_size, kernel_sizes, dics))
+                                        "mask_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_mask1, batch, input_size,
+                                            kernel_sizes, dics))
                                     },
                                     outputs=["output_data"])
 
@@ -137,6 +137,7 @@ def generate_filter1(batch: int,
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def clear_dynamic_shape():
             self.dynamic_shape.min_input_shape = {}
             self.dynamic_shape.max_input_shape = {}
@@ -150,8 +151,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 1, 4
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
index b87b33d355798..f4d6a5f1efa5e 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,16 +22,16 @@
 
 
 class TrtConvertDepthwiseConv2dTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[
-                1] * attrs[0]['groups']:
+        if inputs['input_data'].shape[
+                1] != weights['conv2d_weight'].shape[1] * attrs[0]['groups']:
             return False
 
         return True
@@ -99,6 +99,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if attrs[0]['groups'] == 1:
                 self.dynamic_shape.min_input_shape = {
@@ -149,8 +150,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -168,8 +168,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-5)
@@ -178,6 +178,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             attrs, True), (1e-5, 1e-5)
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs[
                     'padding_algorithm'] == "SAME" or program_config.ops[
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
index 5f77e7de0df42..f32dfdb47c954 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,16 +22,16 @@
 
 
 class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[
-                1] * attrs[0]['groups']:
+        if inputs['input_data'].shape[
+                1] != weights['conv2d_weight'].shape[1] * attrs[0]['groups']:
             return False
 
         if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[1]:
@@ -53,12 +53,12 @@ def sample_program_configs(self):
         self.trt_param.workspace_size = 1073741824
 
         def generate_input1(batch, attrs: List[Dict[str, Any]]):
-            return np.ones(
-                [batch, attrs[0]['groups'], 64, 64]).astype(np.float32)
+            return np.ones([batch, attrs[0]['groups'], 64,
+                            64]).astype(np.float32)
 
         def generate_weight1(attrs: List[Dict[str, Any]]):
-            return np.random.random(
-                [attrs[0]['groups'], 1, 3, 3]).astype(np.float32)
+            return np.random.random([attrs[0]['groups'], 1, 3,
+                                     3]).astype(np.float32)
 
         for batch in [1, 2, 4]:
             for strides in [[1, 1], [2, 2], [1, 2]]:
@@ -111,6 +111,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, attrs[0]['groups'], 32, 32],
@@ -134,8 +135,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -153,8 +153,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-5)
@@ -163,6 +163,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         #     attrs, True), (1e-5, 1e-5)
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if self.trt_param.precision == paddle_infer.PrecisionType.Int8:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py
index f9bb4e66f2ab4..5d8e93ef984f6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertDropoutTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
             if dims == 1:
                 return np.ones([64]).astype(np.float32)
@@ -70,9 +72,9 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
                                     ops=ops,
                                     weights={},
                                     inputs={
-                                        "input_data": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             dims, batch, dics))
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dims, batch, dics))
                                     },
                                     outputs=["dropout_output_data"])
 
@@ -80,6 +82,7 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 1:
                 self.dynamic_shape.min_input_shape = {"input_data": [1]}
@@ -118,8 +121,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -134,11 +136,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
         pass
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py
index 27d8247aded5a..2fabc6013893e 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -59,7 +61,8 @@ def generate_weight():
                                 TensorConfig(data_gen=partial(generate_weight))
                             },
                             inputs={
-                                "input_data": TensorConfig(
+                                "input_data":
+                                TensorConfig(
                                     data_gen=partial(generate_input, shape)),
                             },
                             outputs=["output_data"])
@@ -68,6 +71,7 @@ def generate_weight():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             # The input.dims[1] must be equal to the weight's length.
             if self.dims == 1:
@@ -104,8 +108,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -120,11 +123,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
         pass
@@ -136,6 +139,7 @@ def test(self):
 
 class TrtConvertElementwiseTest_two_input_without_broadcast(
         TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         if len(inputs['input_data1'].shape) == 1:
@@ -144,6 +148,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -172,9 +177,11 @@ def generate_input(shape):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data1": TensorConfig(
+                            "input_data1":
+                            TensorConfig(
                                 data_gen=partial(generate_input, shape)),
-                            "input_data2": TensorConfig(
+                            "input_data2":
+                            TensorConfig(
                                 data_gen=partial(generate_input, shape))
                         },
                         outputs=["output_data"])
@@ -183,6 +190,7 @@ def generate_input(shape):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 1:
                 self.dynamic_shape.min_input_shape = {
@@ -243,8 +251,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -270,6 +277,7 @@ def test(self):
 
 
 class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         if len(inputs['input_data1'].shape) != len(inputs['input_data2'].shape):
@@ -278,6 +286,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -332,9 +341,11 @@ def generate_input(shape):
                             ops=ops,
                             weights={},
                             inputs={
-                                "input_data1": TensorConfig(data_gen=partial(
+                                "input_data1":
+                                TensorConfig(data_gen=partial(
                                     generate_input, input1_shape)),
-                                "input_data2": TensorConfig(data_gen=partial(
+                                "input_data2":
+                                TensorConfig(data_gen=partial(
                                     generate_input, input2_shape))
                             },
                             outputs=["output_data"])
@@ -343,6 +354,7 @@ def generate_input(shape):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             max_shape = [[128], [128, 128], [128, 128, 128],
                          [128, 128, 128, 128]]
@@ -368,8 +380,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -396,10 +407,12 @@ def test(self):
 
 
 class TrtConvertElementwiseTest_one_input_corner_case(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -436,7 +449,8 @@ def generate_weight():
                                 TensorConfig(data_gen=partial(generate_weight))
                             },
                             inputs={
-                                "input_data": TensorConfig(
+                                "input_data":
+                                TensorConfig(
                                     data_gen=partial(generate_input, shape)),
                             },
                             outputs=["output_data"])
@@ -445,6 +459,7 @@ def generate_weight():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             # The input.dims[1] must be equal to the weight's length.
             if self.dims == 1:
@@ -483,8 +498,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -499,13 +513,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             input_x_names = program_config.ops[0].inputs["X"]
             for weight_name in program_config.weights:
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py
index 1eecf9c0497a1..36b63be345392 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,13 +22,15 @@
 
 
 class TrtConvertEmbEltwiseLayernormTest1(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch, input_size):
-            return np.random.randint(
-                0, 7, size=(batch, input_size, 1)).astype(np.int64)
+            return np.random.randint(0, 7, size=(batch, input_size,
+                                                 1)).astype(np.int64)
 
         def generate_weight1(size11, size2):
             return np.random.randn(size11, size2).astype(np.float32)
@@ -75,7 +77,8 @@ def generate_weight4(size2):
                                                 "epsilon": epsilon
                                             }]
                                             ops_config = [{
-                                                "op_type": type,
+                                                "op_type":
+                                                type,
                                                 "op_inputs": {
                                                     "Ids": ["input_data1"],
                                                     "W": ["embedding1_weight"]
@@ -84,11 +87,12 @@ def generate_weight4(size2):
                                                     "Out":
                                                     ["embedding1_output"]
                                                 },
-                                                "op_attrs": dics[0]
-                                                if type == "lookup_table" else
-                                                dics[1]
+                                                "op_attrs":
+                                                dics[0] if type
+                                                == "lookup_table" else dics[1]
                                             }, {
-                                                "op_type": type,
+                                                "op_type":
+                                                type,
                                                 "op_inputs": {
                                                     "Ids": ["input_data2"],
                                                     "W": ["embedding2_weight"]
@@ -97,11 +101,12 @@ def generate_weight4(size2):
                                                     "Out":
                                                     ["embedding2_output"]
                                                 },
-                                                "op_attrs": dics[0]
-                                                if type == "lookup_table" else
-                                                dics[1]
+                                                "op_attrs":
+                                                dics[0] if type
+                                                == "lookup_table" else dics[1]
                                             }, {
-                                                "op_type": type,
+                                                "op_type":
+                                                type,
                                                 "op_inputs": {
                                                     "Ids": ["input_data3"],
                                                     "W": ["embedding3_weight"]
@@ -110,9 +115,9 @@ def generate_weight4(size2):
                                                     "Out":
                                                     ["embedding3_output"]
                                                 },
-                                                "op_attrs": dics[0]
-                                                if type == "lookup_table" else
-                                                dics[1]
+                                                "op_attrs":
+                                                dics[0] if type
+                                                == "lookup_table" else dics[1]
                                             }, {
                                                 "op_type": "elementwise_add",
                                                 "op_inputs": {
@@ -120,39 +125,33 @@ def generate_weight4(size2):
                                                     "Y": ["embedding3_output"]
                                                 },
                                                 "op_outputs": {
-                                                    "Out": [
-                                                        "elementwise_add1_output"
-                                                    ]
+                                                    "Out":
+                                                    ["elementwise_add1_output"]
                                                 },
                                                 "op_attrs": dics[2]
                                             }, {
                                                 "op_type": "elementwise_add",
                                                 "op_inputs": {
-                                                    "X": [
-                                                        "elementwise_add1_output"
-                                                    ],
+                                                    "X":
+                                                    ["elementwise_add1_output"],
                                                     "Y": ["embedding1_output"]
                                                 },
                                                 "op_outputs": {
-                                                    "Out": [
-                                                        "elementwise_add2_output"
-                                                    ]
+                                                    "Out":
+                                                    ["elementwise_add2_output"]
                                                 },
                                                 "op_attrs": dics[3]
                                             }, {
                                                 "op_type": "layer_norm",
                                                 "op_inputs": {
-                                                    "X": [
-                                                        "elementwise_add2_output"
-                                                    ],
-                                                    "Bias":
-                                                    ["layer_norm_bias"],
+                                                    "X":
+                                                    ["elementwise_add2_output"],
+                                                    "Bias": ["layer_norm_bias"],
                                                     "Scale":
                                                     ["layer_norm_scale"]
                                                 },
                                                 "op_outputs": {
-                                                    "Y":
-                                                    ["layer_norm_output1"],
+                                                    "Y": ["layer_norm_output1"],
                                                     "Mean":
                                                     ["layer_norm_output2"],
                                                     "Variance":
@@ -193,15 +192,18 @@ def generate_weight4(size2):
                                                             size2))
                                                 },
                                                 inputs={
-                                                    "input_data1": TensorConfig(
+                                                    "input_data1":
+                                                    TensorConfig(
                                                         data_gen=partial(
                                                             generate_input,
                                                             batch, input_size)),
-                                                    "input_data2": TensorConfig(
+                                                    "input_data2":
+                                                    TensorConfig(
                                                         data_gen=partial(
                                                             generate_input,
                                                             batch, input_size)),
-                                                    "input_data3": TensorConfig(
+                                                    "input_data3":
+                                                    TensorConfig(
                                                         data_gen=partial(
                                                             generate_input,
                                                             batch, input_size))
@@ -212,6 +214,7 @@ def generate_weight4(size2):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data1": [1, 4, 1],
@@ -235,8 +238,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py
index 7b0089ab9ab7f..da947dc35dfde 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertFlattenTest_dim_2(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch):
             return np.random.random([batch, 32]).astype(np.float32)
 
@@ -54,7 +56,8 @@ def generate_input(batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(
+                            "input_data":
+                            TensorConfig(
                                 data_gen=partial(generate_input, batch))
                         },
                         outputs=["output_data"])
@@ -63,6 +66,7 @@ def generate_input(batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 8]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 64]}
@@ -90,8 +94,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -105,21 +108,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
 
 
 class TrtConvertFlattenTest_dim_3(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch):
             return np.random.random([batch, 32, 64]).astype(np.float32)
 
@@ -148,7 +153,8 @@ def generate_input(batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(
+                            "input_data":
+                            TensorConfig(
                                 data_gen=partial(generate_input, batch))
                         },
                         outputs=["output_data"])
@@ -157,6 +163,7 @@ def generate_input(batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 64, 768]}
@@ -184,8 +191,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -199,21 +205,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
 
 
 class TrtConvertFlattenTest_dim_4(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch):
             return np.random.random([batch, 8, 8, 8]).astype(np.float32)
 
@@ -242,7 +250,8 @@ def generate_input(batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(
+                            "input_data":
+                            TensorConfig(
                                 data_gen=partial(generate_input, batch))
                         },
                         outputs=["output_data"])
@@ -251,6 +260,7 @@ def generate_input(batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 64, 64]}
@@ -278,8 +288,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -293,21 +302,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
 
 
 class TrtConvertFlattenTest_dim_5(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch):
             return np.random.random([batch, 8, 8, 8]).astype(np.float32)
 
@@ -336,7 +347,8 @@ def generate_input(batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(
+                            "input_data":
+                            TensorConfig(
                                 data_gen=partial(generate_input, batch))
                         },
                         outputs=["output_data"])
@@ -345,6 +357,7 @@ def generate_input(batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 64, 64]}
@@ -372,8 +385,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -387,11 +399,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py
index a4060349d4bed..406f5e1a13ca8 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertFlattenContiguousRangeTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch):
             return np.random.random([2, batch, 4, 8, 3]).astype(np.float32)
 
@@ -54,7 +56,8 @@ def generate_input(batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(
+                            "input_data":
+                            TensorConfig(
                                 data_gen=partial(generate_input, batch))
                         },
                         outputs=["output_data"])
@@ -62,6 +65,7 @@ def generate_input(batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [2, 1, 4, 8, 3]}
             self.dynamic_shape.max_input_shape = {"input_data": [2, 4, 4, 8, 3]}
@@ -86,8 +90,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -101,11 +104,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py
index 852bb2ffa8412..5405f11465106 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,11 +23,11 @@
 
 
 class TrtConvertGatherTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         if len(inputs['input_data'].shape) <= attrs[0]['axis']:
             return False
@@ -35,6 +35,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -74,16 +75,21 @@ def generate_input3(axis):
                                 ops=ops,
                                 weights={},
                                 inputs={
-                                    "input_data": TensorConfig(data_gen=partial(
+                                    "input_data":
+                                    TensorConfig(data_gen=partial(
                                         generate_input1, shape)),
-                                    "index_data": TensorConfig(data_gen=partial(
+                                    "index_data":
+                                    TensorConfig(data_gen=partial(
                                         generate_input2, index)),
                                 } if len(input) == 2 else {
-                                    "input_data": TensorConfig(data_gen=partial(
+                                    "input_data":
+                                    TensorConfig(data_gen=partial(
                                         generate_input1, shape)),
-                                    "index_data": TensorConfig(data_gen=partial(
+                                    "index_data":
+                                    TensorConfig(data_gen=partial(
                                         generate_input2, index)),
-                                    "axis_data": TensorConfig(data_gen=partial(
+                                    "axis_data":
+                                    TensorConfig(data_gen=partial(
                                         generate_input3, axis)),
                                 },
                                 outputs=["output_data"])
@@ -92,6 +98,7 @@ def generate_input3(axis):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if len(self.shape) == 1:
                 self.dynamic_shape.min_input_shape = {
@@ -161,8 +168,7 @@ def generate_trt_nodes_num(dynamic_shape):
                     return 0, 4
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -188,8 +194,8 @@ def add_skip_trt_case(self):
             def teller1(program_config, predictor_config):
                 if len(self.dynamic_shape.min_input_shape) != 0:
                     inputs = program_config.inputs
-                    if len(inputs['input_data'].shape) == 1 or len(inputs[
-                            'index_data'].shape) == 1:
+                    if len(inputs['input_data'].shape) == 1 or len(
+                            inputs['index_data'].shape) == 1:
                         return True
                 return False
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py
index 6b6a9536d81be..9343f1ebd7cd0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertGatherNdTest_dim_4_1(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1():
             return np.random.random([2, 32, 64, 64]).astype(np.float32)
 
@@ -58,6 +60,7 @@ def generate_input2():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 8, 8, 8],
@@ -78,8 +81,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -101,10 +103,12 @@ def test(self):
 
 
 class TrtConvertGatherNdTest_dim_4_1_2(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1():
             return np.random.random([2, 32, 64, 64]).astype(np.float32)
 
@@ -137,6 +141,7 @@ def generate_input2():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 8, 8, 8],
@@ -157,8 +162,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -180,10 +184,12 @@ def test(self):
 
 
 class TrtConvertGatherNdTest_dim_4_2(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1():
             return np.random.random([2, 32, 64, 64]).astype(np.float32)
 
@@ -216,6 +222,7 @@ def generate_input2():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 8, 8, 8],
@@ -236,8 +243,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -259,10 +265,12 @@ def test(self):
 
 
 class TrtConvertGatherNdTest_dim_4_3(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1():
             return np.random.random([2, 32, 64, 64]).astype(np.float32)
 
@@ -295,6 +303,7 @@ def generate_input2():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 8, 8, 8],
@@ -315,8 +324,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -338,10 +346,12 @@ def test(self):
 
 
 class TrtConvertGatherNdTest_dim_2_2(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1():
             return np.random.random([2, 32]).astype(np.float32)
 
@@ -374,6 +384,7 @@ def generate_input2():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 4],
@@ -394,8 +405,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -417,16 +427,18 @@ def test(self):
 
 
 class TrtConvertGatherNdTest_dim_3_3(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1():
             return np.random.random([16, 32, 256]).astype(np.float32)
 
         def generate_input2():
-            return np.array(
-                [[[2, 5], [3, 8]], [[0, 2], [0, 3]]]).astype(np.int32)
+            return np.array([[[2, 5], [3, 8]], [[0, 2], [0,
+                                                         3]]]).astype(np.int32)
 
         ops_config = [{
             "op_type": "gather_nd",
@@ -454,6 +466,7 @@ def generate_input2():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 4, 4],
@@ -474,8 +487,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py
index 448e4e3e71b02..29f656130f793 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertGeluTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dims, attrs: List[Dict[str, Any]]):
             if dims == 1:
                 return np.ones([32]).astype(np.float32)
@@ -57,8 +59,9 @@ def generate_input1(dims, attrs: List[Dict[str, Any]]):
                     ops=ops,
                     weights={},
                     inputs={
-                        "input_data": TensorConfig(data_gen=partial(
-                            generate_input1, dims, dics))
+                        "input_data":
+                        TensorConfig(
+                            data_gen=partial(generate_input1, dims, dics))
                     },
                     outputs=["output_data"])
 
@@ -66,6 +69,7 @@ def generate_input1(dims, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 1:
                 self.dynamic_shape.min_input_shape = {"input_data": [1]}
@@ -112,8 +116,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -128,11 +131,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py
index 203e86c4b25de..da65c3d21980b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertGroupNormTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(attrs: List[Dict[str, Any]], batch):
             if attrs[0]['data_layout'] == 'NCHW':
                 return np.random.random([batch, 32, 64, 64]).astype(np.float32)
@@ -70,13 +72,16 @@ def generate_bias():
                             program_config = ProgramConfig(
                                 ops=ops,
                                 weights={
-                                    "scale_weight": TensorConfig(
+                                    "scale_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_scale)),
-                                    "bias_weight": TensorConfig(
+                                    "bias_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_bias))
                                 },
                                 inputs={
-                                    "input_data": TensorConfig(data_gen=partial(
+                                    "input_data":
+                                    TensorConfig(data_gen=partial(
                                         generate_input, dics, batch))
                                 },
                                 outputs=["y_output"])
@@ -85,6 +90,7 @@ def generate_bias():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 32, 32]}
             self.dynamic_shape.max_input_shape = {
@@ -107,8 +113,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -130,6 +135,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             attrs, True), (1e-5, 1e-5)
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if len(self.dynamic_shape.min_input_shape) != 0:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py
index b3f118e9fbf52..0980acccb88b5 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertHardSigmoidTest_dim_2(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -51,7 +53,8 @@ def generate_input(shape):
                             ops=ops,
                             weights={},
                             inputs={
-                                "input_data": TensorConfig(
+                                "input_data":
+                                TensorConfig(
                                     data_gen=partial(generate_input, shape))
                             },
                             outputs=["output_data"])
@@ -60,6 +63,7 @@ def generate_input(shape):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.input_dim == 2:
                 self.dynamic_shape.min_input_shape = {"input_data": [1, 8]}
@@ -86,8 +90,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py
index c092d6da86839..5f5664d2aa433 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,12 +22,12 @@
 
 
 class TrtConvertHardSwishTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['threshold'] <= 0 or attrs[0]['scale'] <= 0:
@@ -36,6 +36,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 3, 32, 32]).astype(np.float32)
 
@@ -64,8 +65,9 @@ def generate_input1(attrs: List[Dict[str, Any]]):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(data_gen=partial(
-                                generate_input1, dics))
+                            "input_data":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, dics))
                         },
                         outputs=["hard_swish_output_data"])
 
@@ -73,6 +75,7 @@ def generate_input1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 16, 16]}
             self.dynamic_shape.max_input_shape = {"input_data": [2, 3, 32, 32]}
@@ -87,8 +90,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -103,8 +105,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-5)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py
index acd920ccd57ae..457db86c3236d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,10 +23,10 @@
 
 
 class TrtConvertInstanceNormTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['epsilon'] < 0 or attrs[0]['epsilon'] > 0.001:
@@ -35,6 +35,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], shape_input):
             return np.random.random(shape_input).astype(np.float32)
 
@@ -65,13 +66,16 @@ def generate_input2(attrs: List[Dict[str, Any]], shape_input):
                     program_config = ProgramConfig(
                         ops=ops,
                         weights={
-                            "bias_data": TensorConfig(data_gen=partial(
+                            "bias_data":
+                            TensorConfig(data_gen=partial(
                                 generate_input2, dics, shape_input)),
-                            "scale_data": TensorConfig(data_gen=partial(
+                            "scale_data":
+                            TensorConfig(data_gen=partial(
                                 generate_input2, dics, shape_input))
                         },
                         inputs={
-                            "input_data": TensorConfig(data_gen=partial(
+                            "input_data":
+                            TensorConfig(data_gen=partial(
                                 generate_input1, dics, shape_input))
                         },
                         outputs=["y_data"])
@@ -80,6 +84,7 @@ def generate_input2(attrs: List[Dict[str, Any]], shape_input):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.in_dim == 2:
                 self.dynamic_shape.min_input_shape = {"input_data": [1, 4]}
@@ -113,8 +118,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -129,11 +133,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_layer_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_layer_norm.py
index 13c932d55b827..16c95bff5e33b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_layer_norm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_layer_norm.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,12 +23,12 @@
 
 
 class TrtConvertLayerNormTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['epsilon'] < 0 or attrs[0]['epsilon'] > 0.001:
@@ -40,6 +40,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], shape_input):
             return np.ones(shape_input).astype(np.float32)
 
@@ -76,14 +77,17 @@ def generate_input2(attrs: List[Dict[str, Any]], shape_input):
                 program_config = ProgramConfig(
                     ops=ops,
                     weights={
-                        "bias_data": TensorConfig(data_gen=partial(
-                            generate_input2, dics, shape_input)),
-                        "scale_data": TensorConfig(data_gen=partial(
-                            generate_input2, dics, shape_input))
+                        "bias_data":
+                        TensorConfig(data_gen=partial(generate_input2, dics,
+                                                      shape_input)),
+                        "scale_data":
+                        TensorConfig(data_gen=partial(generate_input2, dics,
+                                                      shape_input))
                     },
                     inputs={
-                        "input_data": TensorConfig(data_gen=partial(
-                            generate_input1, dics, shape_input))
+                        "input_data":
+                        TensorConfig(data_gen=partial(generate_input1, dics,
+                                                      shape_input))
                     },
                     outputs=["y_data"])
 
@@ -91,6 +95,7 @@ def generate_input2(attrs: List[Dict[str, Any]], shape_input):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -110,8 +115,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -126,11 +130,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-2
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-2
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py
index c647849fa7ee4..7f33cfc64a866 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,10 +23,12 @@
 
 
 class TrtConvertLeakyReluTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -50,8 +52,9 @@ def generate_input1(shape):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(data_gen=partial(
-                                generate_input1, shape))
+                            "input_data":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, shape))
                         },
                         outputs=["y_data"])
 
@@ -59,6 +62,7 @@ def generate_input1(shape):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.input_dim == 2:
                 self.dynamic_shape.min_input_shape = {"input_data": [1, 8]}
@@ -90,8 +94,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -109,8 +112,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (1e-5, 1e-5)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py
index c6f2fa205c713..76fcffad4592c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertMatmulTest_static(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -73,9 +75,11 @@ def generate_input(shape):
                             ops=ops,
                             weights={},
                             inputs={
-                                "input1_data": TensorConfig(data_gen=partial(
+                                "input1_data":
+                                TensorConfig(data_gen=partial(
                                     generate_input, input1_shape)),
-                                "input2_data": TensorConfig(data_gen=partial(
+                                "input2_data":
+                                TensorConfig(data_gen=partial(
                                     generate_input, input2_shape))
                             },
                             outputs=["output_data"])
@@ -84,6 +88,7 @@ def generate_input(shape):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             pass
 
@@ -104,10 +109,12 @@ def test(self):
 
 
 class TrtConvertMatmulTest_dynamic(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(shape):
             return np.random.random(shape).astype(np.float32)
 
@@ -154,9 +161,11 @@ def generate_input(shape):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input1_data": TensorConfig(
+                            "input1_data":
+                            TensorConfig(
                                 data_gen=partial(generate_input, input1_shape)),
-                            "input2_data": TensorConfig(
+                            "input2_data":
+                            TensorConfig(
                                 data_gen=partial(generate_input, input2_shape))
                         },
                         outputs=["output_data"])
@@ -165,6 +174,7 @@ def generate_input(shape):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input1_data": [1, 4, 4],
@@ -180,8 +190,7 @@ def generate_dynamic_shape(attrs):
             }
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for dynamic_shape
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py
index d2b6924a9e938..063fbba1a07c1 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertMishTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch, dim1, dim2, dim3):
             shape = [batch]
             if dim1 != 0:
@@ -68,20 +70,28 @@ def generate_input(batch, dim1, dim2, dim3):
                                 ops=ops,
                                 weights={},
                                 inputs={
-                                    "input_data": TensorConfig(
-                                        data_gen=partial(generate_input, batch,
-                                                         dim1, dim2, dim3))
+                                    "input_data":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input, batch, dim1, dim2,
+                                        dim3))
                                 },
                                 outputs=["mish_output_data"])
 
                             yield program_config
 
     def sample_predictor_configs(self, program_config):
+
         def generate_dynamic_shape(attrs):
             if self.dim1 == 0:
-                self.dynamic_shape.min_input_shape = {"input_data": [1], }
-                self.dynamic_shape.max_input_shape = {"input_data": [4], }
-                self.dynamic_shape.opt_input_shape = {"input_data": [2], }
+                self.dynamic_shape.min_input_shape = {
+                    "input_data": [1],
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input_data": [4],
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input_data": [2],
+                }
             else:
                 if self.dim2 == 0 and self.dim3 == 0:
                     self.dynamic_shape.min_input_shape = {
@@ -123,8 +133,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -139,13 +148,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if self.dim1 == 0 and self.dim2 == 0 and self.dim3 == 0:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py
index b6a3f0c9cb1c6..03260a2241660 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py
@@ -22,6 +22,7 @@
 
 
 class TrtConvertMulticlassNMS3Test(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
@@ -58,21 +59,23 @@ def create_inference_config(self, use_trt=True) -> paddle_infer.Config:
             return config
 
     def sample_program_configs(self):
+
         def generate_boxes(batch, num_boxes):
-            return np.arange(
-                batch * num_boxes * 4,
-                dtype=np.float32).reshape([batch, num_boxes, 4])
+            return np.arange(batch * num_boxes * 4,
+                             dtype=np.float32).reshape([batch, num_boxes, 4])
 
         def generate_scores(batch, num_boxes, num_classes):
-            return np.arange(
-                batch * num_classes * num_boxes,
-                dtype=np.float32).reshape([batch, num_classes, num_boxes])
+            return np.arange(batch * num_classes * num_boxes,
+                             dtype=np.float32).reshape(
+                                 [batch, num_classes, num_boxes])
             # return np.random.rand(batch, num_classes, num_boxes).astype(np.float32)
 
         for batch in [1, 2]:
             for num_boxes in [4, 12]:
                 for num_classes in [2, 6]:
-                    for score_threshold in [0.01, ]:
+                    for score_threshold in [
+                            0.01,
+                    ]:
                         ops_config = [{
                             "op_type": "multiclass_nms3",
                             "op_inputs": {
@@ -99,9 +102,11 @@ def generate_scores(batch, num_boxes, num_classes):
                             ops=ops,
                             weights={},
                             inputs={
-                                "input_bboxes": TensorConfig(data_gen=partial(
+                                "input_bboxes":
+                                TensorConfig(data_gen=partial(
                                     generate_boxes, batch, num_boxes)),
-                                "input_scores": TensorConfig(
+                                "input_scores":
+                                TensorConfig(
                                     data_gen=partial(generate_scores, batch,
                                                      num_boxes, num_classes))
                             },
@@ -114,6 +119,7 @@ def generate_scores(batch, num_boxes, num_classes):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def clear_dynamic_shape():
             self.dynamic_shape.min_input_shape = {}
             self.dynamic_shape.max_input_shape = {}
@@ -123,8 +129,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -136,9 +141,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, False), 1e-2
 
-    def assert_tensors_near(self,
-                            atol: float,
-                            rtol: float,
+    def assert_tensors_near(self, atol: float, rtol: float,
                             tensor: Dict[str, np.array],
                             baseline: Dict[str, np.array]):
         # the order of tensorrt outputs are not consistent with paddle
@@ -147,12 +150,10 @@ def assert_tensors_near(self,
                 continue
             if key == "nms_output_boxes":
                 basline_arr = np.array(
-                    sorted(
-                        baseline[key].reshape((-1, 6)),
-                        key=lambda i: [i[0], i[1]]))
+                    sorted(baseline[key].reshape((-1, 6)),
+                           key=lambda i: [i[0], i[1]]))
                 arr = np.array(
-                    sorted(
-                        arr.reshape((-1, 6)), key=lambda i: [i[0], i[1]]))
+                    sorted(arr.reshape((-1, 6)), key=lambda i: [i[0], i[1]]))
             else:
                 basline_arr = np.array(baseline[key].reshape((-1, 1)))
                 arr = np.array(arr.reshape((-1, 1)))
@@ -163,8 +164,7 @@ def assert_tensors_near(self,
                 str(basline_arr.shape) + ', but got ' + str(arr.shape))
             diff = abs(basline_arr - arr)
             self.assertTrue(
-                np.allclose(
-                    basline_arr, arr, atol=atol, rtol=rtol),
+                np.allclose(basline_arr, arr, atol=atol, rtol=rtol),
                 "Output has diff, Maximum absolute error: {}".format(
                     np.amax(diff)))
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
index 26066be7dc787..9fd60e5f3fee1 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertMultiHeadMatmulTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(batch, dim1):
             return np.random.random((batch, dim1, 768)).astype(np.float32)
 
@@ -158,8 +160,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose21_output"],
-                                        "XShape":
-                                        ["transpose21_output_xshape"]
+                                        "XShape": ["transpose21_output_xshape"]
                                     },
                                     "op_attrs": dics[3]
                                 },
@@ -203,8 +204,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose22_output"],
-                                        "XShape":
-                                        ["transpose22_output_xshape"]
+                                        "XShape": ["transpose22_output_xshape"]
                                     },
                                     "op_attrs": dics[7]
                                 },
@@ -248,8 +248,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose23_output"],
-                                        "XShape":
-                                        ["transpose23_output_xshape"]
+                                        "XShape": ["transpose23_output_xshape"]
                                     },
                                     "op_attrs": dics[11]
                                 },
@@ -323,8 +322,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose24_output"],
-                                        "XShape":
-                                        ["transpose24_output_xshape"]
+                                        "XShape": ["transpose24_output_xshape"]
                                     },
                                     "op_attrs": dics[18]
                                 },
@@ -339,7 +337,7 @@ def generate_weight2():
                                     },
                                     "op_attrs": dics[19]
                                 },
-                                # In order to fuse ops with 
+                                # In order to fuse ops with
                                 # multihead_matmul_fuse_pass_v2, the last op
                                 # must be mul.
                                 {
@@ -359,28 +357,35 @@ def generate_weight2():
                             program_config = ProgramConfig(
                                 ops=ops,
                                 weights={
-                                    "mul1_weight": TensorConfig(
+                                    "mul1_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "mul2_weight": TensorConfig(
+                                    "mul2_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "mul3_weight": TensorConfig(
+                                    "mul3_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "mul4_weight": TensorConfig(
+                                    "mul4_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "elementwise_add1_weight": TensorConfig(
+                                    "elementwise_add1_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight2)),
-                                    "elementwise_add2_weight": TensorConfig(
+                                    "elementwise_add2_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight2)),
-                                    "elementwise_add3_weight": TensorConfig(
+                                    "elementwise_add3_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight2)),
                                 },
                                 inputs={
-                                    "input_data1": TensorConfig(
-                                        data_gen=partial(generate_input1, batch,
-                                                         dim1)),
-                                    "input_data2": TensorConfig(
-                                        data_gen=partial(generate_input2,
-                                                         input2_shape)),
+                                    "input_data1":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input1, batch, dim1)),
+                                    "input_data2":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input2, input2_shape)),
                                 },
                                 outputs=["mul4_output"])
 
@@ -388,6 +393,7 @@ def generate_weight2():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             # The last dim of input1 and input2 should be static.
             self.dynamic_shape.min_input_shape = {
@@ -412,8 +418,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -431,6 +436,7 @@ def clear_dynamic_shape():
         yield self.create_inference_config(), (1, 3), (1e-5, 1e-5)
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if self.trt_param.precision == paddle_infer.PrecisionType.Half:
                 return True
@@ -466,7 +472,9 @@ def test(self):
 
 
 class TrtConvertMultiHeadMatmulTestInt8(TrtConvertMultiHeadMatmulTest):
+
     def sample_program_configs(self):
+
         def generate_input1(batch, dim1):
             return np.random.random((batch, dim1, 768)).astype(np.float32)
 
@@ -608,8 +616,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose21_output"],
-                                        "XShape":
-                                        ["transpose21_output_xshape"]
+                                        "XShape": ["transpose21_output_xshape"]
                                     },
                                     "op_attrs": dics[3]
                                 },
@@ -653,8 +660,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose22_output"],
-                                        "XShape":
-                                        ["transpose22_output_xshape"]
+                                        "XShape": ["transpose22_output_xshape"]
                                     },
                                     "op_attrs": dics[7]
                                 },
@@ -698,8 +704,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose23_output"],
-                                        "XShape":
-                                        ["transpose23_output_xshape"]
+                                        "XShape": ["transpose23_output_xshape"]
                                     },
                                     "op_attrs": dics[11]
                                 },
@@ -773,8 +778,7 @@ def generate_weight2():
                                     },
                                     "op_outputs": {
                                         "Out": ["transpose24_output"],
-                                        "XShape":
-                                        ["transpose24_output_xshape"]
+                                        "XShape": ["transpose24_output_xshape"]
                                     },
                                     "op_attrs": dics[18]
                                 },
@@ -789,7 +793,7 @@ def generate_weight2():
                                     },
                                     "op_attrs": dics[19]
                                 },
-                                # In order to fuse ops with 
+                                # In order to fuse ops with
                                 # multihead_matmul_fuse_pass_v2, the last op
                                 # must be mul.
                                 {
@@ -809,28 +813,35 @@ def generate_weight2():
                             program_config = ProgramConfig(
                                 ops=ops,
                                 weights={
-                                    "mul1_weight": TensorConfig(
+                                    "mul1_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "mul2_weight": TensorConfig(
+                                    "mul2_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "mul3_weight": TensorConfig(
+                                    "mul3_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "mul4_weight": TensorConfig(
+                                    "mul4_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight1)),
-                                    "elementwise_add1_weight": TensorConfig(
+                                    "elementwise_add1_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight2)),
-                                    "elementwise_add2_weight": TensorConfig(
+                                    "elementwise_add2_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight2)),
-                                    "elementwise_add3_weight": TensorConfig(
+                                    "elementwise_add3_weight":
+                                    TensorConfig(
                                         data_gen=partial(generate_weight2)),
                                 },
                                 inputs={
-                                    "input_data1": TensorConfig(
-                                        data_gen=partial(generate_input1, batch,
-                                                         dim1)),
-                                    "input_data2": TensorConfig(
-                                        data_gen=partial(generate_input2,
-                                                         input2_shape)),
+                                    "input_data1":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input1, batch, dim1)),
+                                    "input_data2":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input2, input2_shape)),
                                 },
                                 outputs=["mul4_output"])
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py
index 56c0b041da244..7cefcb96a3a6a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,16 +22,16 @@
 
 
 class TrtConvertNearestInterpTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        if attrs[0]['scale'] <= 0 and (attrs[0]['out_h'] <= 0 or
-                                       attrs[0]['out_w'] <= 0):
+        if attrs[0]['scale'] <= 0 and (attrs[0]['out_h'] <= 0
+                                       or attrs[0]['out_w'] <= 0):
             return False
         if (attrs[0]['out_h'] <= 0) ^ (attrs[0]['out_w'] <= 0):
             return False
@@ -39,6 +39,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 3, 64, 64]).astype(np.float32)
 
@@ -73,9 +74,9 @@ def generate_input1(attrs: List[Dict[str, Any]]):
                                     ops=ops,
                                     weights={},
                                     inputs={
-                                        "input_data": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             dics))
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics))
                                     },
                                     outputs=["nearest_interp_output_data"])
 
@@ -83,6 +84,7 @@ def generate_input1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -97,8 +99,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -113,13 +114,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-2
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-2
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if program_config.ops[0].attrs[
                     'scale'] <= 0 and self.dynamic_shape.min_input_shape:
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
index cf8b7b3516b37..2cd4253cb8f94 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertNearestInterpV2Test(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input():
             return np.ones([1, 3, 32, 32]).astype(np.float32)
 
@@ -60,6 +62,7 @@ def generate_input():
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -74,8 +77,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -90,11 +92,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-2
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-2
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pad.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pad.py
index 446f7717e3b50..767854b0fbace 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pad.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pad.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,12 +22,12 @@
 
 
 class TrtConvertPadTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         if attrs[0]['pad_value'] != 0.0:
@@ -39,6 +39,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 3, 64, 64]).astype(np.float32)
 
@@ -46,8 +47,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
             return np.random.random([24, 3, 3, 3]).astype(np.float32)
 
         for pad_value in [0.0, 1.0, 2.0, -100, 100.0]:
-            for paddings in [[0, 0, 0, 0, 1, 1, 1, 1],
-                             [0, 0, 0, 0, 1, 2, 3, 4],
+            for paddings in [[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 0, 1, 2, 3, 4],
                              [0, 0, 1, 1, 1, 1, 1, 1],
                              [0, 0, 0, 0, -1, -1, 1, 1]]:
                 dics = [{"pad_value": pad_value, "paddings": paddings}, {}]
@@ -77,6 +77,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -94,8 +95,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -110,13 +110,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-2
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-2
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             for x in range(len(program_config.ops[0].attrs['paddings']) - 4):
                 if program_config.ops[0].attrs['paddings'][x] != 0:
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
index 89ce1145d74e0..24e80e01e9707 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TrtConvertPool2dTest(TrtLayerAutoScanTest):
+
     def is_paddings_valid(self, program_config: ProgramConfig) -> bool:
         exclusive = program_config.ops[0].attrs['exclusive']
         paddings = program_config.ops[0].attrs['paddings']
@@ -80,14 +81,16 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
                                                 }]
 
                                                 ops_config = [{
-                                                    "op_type": "pool2d",
+                                                    "op_type":
+                                                    "pool2d",
                                                     "op_inputs": {
                                                         "X": ["input_data"],
                                                     },
                                                     "op_outputs": {
                                                         "Out": ["output_data"]
                                                     },
-                                                    "op_attrs": dics[0]
+                                                    "op_attrs":
+                                                    dics[0]
                                                 }]
                                                 ops = self.generate_op_config(
                                                     ops_config)
@@ -108,6 +111,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -122,8 +126,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -138,13 +141,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller(program_config, predictor_config):
             if program_config.ops[0].attrs['pooling_type'] == 'avg' and \
                program_config.ops[0].attrs['global_pooling'] == False and \
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py
index 00e3f7feb6022..49a750f14dd36 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertPreluTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input(batch, dim1, dim2, dim3):
             shape = [batch]
             if dim1 != 0:
@@ -108,16 +110,16 @@ def generate_alpha(attrs: List[Dict[str, Any]], dim1, dim2, dim3):
                                 program_config = ProgramConfig(
                                     ops=ops,
                                     weights={
-                                        "alpha_weight": TensorConfig(
-                                            data_gen=partial(generate_alpha,
-                                                             dics, dim1, dim2,
-                                                             dim3))
+                                        "alpha_weight":
+                                        TensorConfig(data_gen=partial(
+                                            generate_alpha, dics, dim1, dim2,
+                                            dim3))
                                     },
                                     inputs={
-                                        "input_data": TensorConfig(
-                                            data_gen=partial(generate_input,
-                                                             batch, dim1, dim2,
-                                                             dim3)),
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input, batch, dim1, dim2,
+                                            dim3)),
                                     },
                                     outputs=["output_data"])
 
@@ -125,11 +127,18 @@ def generate_alpha(attrs: List[Dict[str, Any]], dim1, dim2, dim3):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dim1 == 0:
-                self.dynamic_shape.min_input_shape = {"input_data": [1], }
-                self.dynamic_shape.max_input_shape = {"input_data": [4], }
-                self.dynamic_shape.opt_input_shape = {"input_data": [2], }
+                self.dynamic_shape.min_input_shape = {
+                    "input_data": [1],
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input_data": [4],
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input_data": [2],
+                }
             else:
                 if self.dim2 == 0 and self.dim3 == 0:
                     self.dynamic_shape.min_input_shape = {
@@ -168,8 +177,7 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         def generate_trt_nodes_num(attrs, dynamic_shape):
@@ -189,11 +197,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
         ver = paddle_infer.get_trt_compile_version()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py
index 2e1e04870b926..dc5696a9b79d8 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,11 +22,11 @@
 
 
 class TrtConvertReduceMeanTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         ## dim should be in (-rank, rank), and not NONE
@@ -45,6 +45,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dtype, attrs: List[Dict[str, Any]]):
             if dtype == -1 or dtype == 5:
                 return np.random.random([1, 3, 64, 64]).astype(np.float32)
@@ -80,7 +81,8 @@ def generate_input1(dtype, attrs: List[Dict[str, Any]]):
                             ops=ops,
                             weights={},
                             inputs={
-                                "input_data": TensorConfig(data_gen=partial(
+                                "input_data":
+                                TensorConfig(data_gen=partial(
                                     generate_input1, out_dtype, dics))
                             },
                             outputs=["reduce_output_data"])
@@ -92,6 +94,7 @@ def generate_input1(dtype, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -115,8 +118,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -131,8 +133,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
             attrs, True), (5e-4, 5e-4)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py
index 2a7e673d4203a..68c3e9bd377db 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,11 +23,11 @@
 
 
 class TrtConvertReduceSumTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         ## dim should be in (-rank, rank), and not NONE
@@ -41,6 +41,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dtype, attrs: List[Dict[str, Any]]):
             if dtype == -1 or dtype == 5:
                 return np.random.random([1, 3, 64, 64]).astype(np.float32)
@@ -76,7 +77,8 @@ def generate_input1(dtype, attrs: List[Dict[str, Any]]):
                             ops=ops,
                             weights={},
                             inputs={
-                                "input_data": TensorConfig(data_gen=partial(
+                                "input_data":
+                                TensorConfig(data_gen=partial(
                                     generate_input1, out_dtype, dics))
                             },
                             outputs=["reduce_output_data"])
@@ -87,6 +89,7 @@ def generate_input1(dtype, attrs: List[Dict[str, Any]]):
                         yield program_config
 
     def sample_predictor_configs(self, program_config):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -110,8 +113,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py
index 4355b83557fc6..e05a78e66b900 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,10 @@
 
 
 class TrtConvertReshapeTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         if self.dims == 1:
             if len(attrs[0]['shape']) != 1:
@@ -45,6 +45,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             if self.dims == 4:
                 return np.ones([1, 2, 4, 6]).astype(np.float32)
@@ -66,10 +67,11 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]):
 
         for dims in [4, 3, 2, 1]:
             for num_input in [0, 1, 2, 3]:
-                for shape in [[1, 6, 8], [1, 2, 4, 6], [1, 1, 0, 12],
-                              [1, 0, 6], [1, -1, 12], [2, -1], [3, 16],
-                              [3, 4, 4], [48]]:
-                    dics = [{"shape": shape, }, {}]
+                for shape in [[1, 6, 8], [1, 2, 4, 6], [1, 1, 0, 12], [1, 0, 6],
+                              [1, -1, 12], [2, -1], [3, 16], [3, 4, 4], [48]]:
+                    dics = [{
+                        "shape": shape,
+                    }, {}]
                     self.num_input = num_input
                     self.dims = dims
                     dics_intput = [{
@@ -89,18 +91,22 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]):
                     dics_weight = [{
                         "shape_data":
                         TensorConfig(data_gen=partial(generate_weight1, dics)),
-                        "shapeT1_data": TensorConfig(data_gen=partial(
-                            generate_shapeT1_data, dics)),
-                        "shapeT2_data": TensorConfig(data_gen=partial(
-                            generate_shapeT2_data, dics))
+                        "shapeT1_data":
+                        TensorConfig(
+                            data_gen=partial(generate_shapeT1_data, dics)),
+                        "shapeT2_data":
+                        TensorConfig(
+                            data_gen=partial(generate_shapeT2_data, dics))
                     }, {
                         "shape_data":
                         TensorConfig(data_gen=partial(generate_weight1, dics))
                     }, {
-                        "shapeT1_data": TensorConfig(data_gen=partial(
-                            generate_shapeT1_data, dics)),
-                        "shapeT2_data": TensorConfig(data_gen=partial(
-                            generate_shapeT2_data, dics))
+                        "shapeT1_data":
+                        TensorConfig(
+                            data_gen=partial(generate_shapeT1_data, dics)),
+                        "shapeT2_data":
+                        TensorConfig(
+                            data_gen=partial(generate_shapeT2_data, dics))
                     }, {}]
 
                     ops_config = [{
@@ -116,8 +122,9 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]):
                         ops=ops,
                         weights=dics_weight[num_input],
                         inputs={
-                            "reshape_input": TensorConfig(data_gen=partial(
-                                generate_input1, dics))
+                            "reshape_input":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, dics))
                         },
                         outputs=["reshape_out"])
 
@@ -125,6 +132,7 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 self.dynamic_shape.min_input_shape = {
@@ -164,8 +172,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         if attrs[0]['shape'][0] > 1 and len(attrs[0]['shape']) > 1:
             pass
@@ -182,13 +189,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if len(program_config.weights) >= 1:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py
index b2d754337fe02..ca12fe876ca39 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertRoiAlignTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             return np.ones([batch, 256, 32, 32]).astype(np.float32)
 
@@ -61,30 +63,34 @@ def generate_input3(attrs: List[Dict[str, Any]], batch):
                                         "ROIs": ["ROIs"]
                                     }]
                                     program_input = [{
-                                        "roi_align_input": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             dics, batch)),
-                                        "ROIs": TensorConfig(data_gen=partial(
+                                        "roi_align_input":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics, batch)),
+                                        "ROIs":
+                                        TensorConfig(data_gen=partial(
                                             generate_input2, dics, batch)),
-                                        "RoisNum": TensorConfig(
-                                            data_gen=partial(generate_input3,
-                                                             dics, batch))
+                                        "RoisNum":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input3, dics, batch))
                                     }, {
-                                        "roi_align_input": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             dics, batch)),
-                                        "ROIs": TensorConfig(
-                                            data_gen=partial(generate_input2,
-                                                             dics, batch),
-                                            lod=[[32, 3]])
+                                        "roi_align_input":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics, batch)),
+                                        "ROIs":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input2, dics, batch),
+                                                     lod=[[32, 3]])
                                     }]
                                     ops_config = [{
-                                        "op_type": "roi_align",
-                                        "op_inputs": dics_input[num_input],
+                                        "op_type":
+                                        "roi_align",
+                                        "op_inputs":
+                                        dics_input[num_input],
                                         "op_outputs": {
                                             "Out": ["roi_align_out"]
                                         },
-                                        "op_attrs": dics[0]
+                                        "op_attrs":
+                                        dics[0]
                                     }]
                                     ops = self.generate_op_config(ops_config)
                                     program_config = ProgramConfig(
@@ -97,6 +103,7 @@ def generate_input3(attrs: List[Dict[str, Any]], batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.num_input == 0:
                 self.dynamic_shape.min_input_shape = {
@@ -145,8 +152,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 0, 4
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -161,13 +167,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if len(program_config.inputs) == 3:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py
index 1b3d38036614f..675054317d9b1 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py
@@ -22,16 +22,17 @@
 
 
 class TrtConvertRollTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 56, 56, 192]).astype(np.float32)
 
@@ -67,6 +68,7 @@ def generate_input1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 56, 56, 192]
@@ -94,8 +96,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -110,11 +111,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-4
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-4
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py
index 75783450e86bf..1765760e15c43 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertScaleTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 return np.ones([batch, 3, 24, 24]).astype(np.float32)
@@ -60,27 +62,30 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
                                     "X": ["scale_input"]
                                 }]
                                 dics_intputs = [{
-                                    "ScaleTensor": TensorConfig(
-                                        data_gen=partial(generate_weight1,
-                                                         dics))
+                                    "ScaleTensor":
+                                    TensorConfig(data_gen=partial(
+                                        generate_weight1, dics))
                                 }, {}]
 
                                 ops_config = [{
-                                    "op_type": "scale",
-                                    "op_inputs": dics_intput[num_input],
+                                    "op_type":
+                                    "scale",
+                                    "op_inputs":
+                                    dics_intput[num_input],
                                     "op_outputs": {
                                         "Out": ["scale_out"]
                                     },
-                                    "op_attrs": dics[0]
+                                    "op_attrs":
+                                    dics[0]
                                 }]
                                 ops = self.generate_op_config(ops_config)
                                 program_config = ProgramConfig(
                                     ops=ops,
                                     weights=dics_intputs[num_input],
                                     inputs={
-                                        "scale_input": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             dics, batch))
+                                        "scale_input":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics, batch))
                                     },
                                     outputs=["scale_out"])
 
@@ -88,6 +93,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 self.dynamic_shape.min_input_shape = {
@@ -121,8 +127,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -137,13 +142,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if self.num_input == 0:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py
index c6a8147236044..9948b29321dc0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertShuffleChannelTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             return np.ones([batch, 6, 24, 24]).astype(np.float32)
 
@@ -47,8 +49,9 @@ def generate_input1(attrs: List[Dict[str, Any]], batch):
                     ops=ops,
                     weights={},
                     inputs={
-                        "shuffle_channel_input": TensorConfig(data_gen=partial(
-                            generate_input1, dics, batch))
+                        "shuffle_channel_input":
+                        TensorConfig(
+                            data_gen=partial(generate_input1, dics, batch))
                     },
                     outputs=["shuffle_channel_out"])
 
@@ -56,6 +59,7 @@ def generate_input1(attrs: List[Dict[str, Any]], batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "shuffle_channel_input": [1, 6, 24, 24]
@@ -79,8 +83,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         self.trt_param.max_batch_size = 9
         # for static_shape
@@ -95,11 +98,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
         pass
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py
index 9f3e7a81777c2..c99c3cef5b6f8 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,20 +22,20 @@
 
 
 class TrtConvertSkipLayernormTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         outputs = program_config.outputs
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         #The input dimension should be less than or equal to the set axis.
         if attrs[0]['begin_norm_axis'] >= 0:
-            if len(inputs['skip_layernorm_inputX_data'].shape) <= attrs[0][
-                    'begin_norm_axis']:
+            if len(inputs['skip_layernorm_inputX_data'].shape
+                   ) <= attrs[0]['begin_norm_axis']:
                 return False
 
         #2D input is not supported.
@@ -44,6 +44,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 return np.ones([batch, 6, 128, 768]).astype(np.float32)
@@ -94,18 +95,20 @@ def generate_weight2(attrs: List[Dict[str, Any]]):
                             program_config = ProgramConfig(
                                 ops=ops,
                                 weights={
-                                    "Bias": TensorConfig(data_gen=partial(
+                                    "Bias":
+                                    TensorConfig(data_gen=partial(
                                         generate_weight1, dics)),
-                                    "Scale": TensorConfig(data_gen=partial(
+                                    "Scale":
+                                    TensorConfig(data_gen=partial(
                                         generate_weight2, dics))
                                 },
                                 inputs={
-                                    "skip_layernorm_inputX_data": TensorConfig(
-                                        data_gen=partial(generate_input1, dics,
-                                                         batch)),
-                                    "skip_layernorm_inputY_data": TensorConfig(
-                                        data_gen=partial(generate_input2, dics,
-                                                         batch))
+                                    "skip_layernorm_inputX_data":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input1, dics, batch)),
+                                    "skip_layernorm_inputY_data":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input2, dics, batch))
                                 },
                                 outputs=["skip_layernorm_out"])
 
@@ -113,6 +116,7 @@ def generate_weight2(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 self.dynamic_shape.min_input_shape = {
@@ -184,8 +188,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 0, 4
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # for static_shape
         clear_dynamic_shape()
@@ -200,11 +203,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
         pass
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py
index 86c52dad23af0..f82acb204f0a2 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,12 +22,12 @@
 
 
 class TrtConvertSliceTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         for x in attrs[0]["decrease_axis"]:
@@ -42,8 +42,8 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
             else:
                 start = attrs[0]["starts"][x]
             if attrs[0]["ends"][x] < 0:
-                end = attrs[0]["ends"][x] + inputs['input_data'].shape[attrs[0][
-                    "axes"][x]]
+                end = attrs[0]["ends"][x] + inputs['input_data'].shape[
+                    attrs[0]["axes"][x]]
             else:
                 end = attrs[0]["ends"][x]
             start = max(0, start)
@@ -54,6 +54,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([6, 6, 64, 64]).astype(np.float32)
 
@@ -86,8 +87,9 @@ def generate_input1(attrs: List[Dict[str, Any]]):
                                 ops=ops,
                                 weights={},
                                 inputs={
-                                    "input_data": TensorConfig(data_gen=partial(
-                                        generate_input1, dics))
+                                    "input_data":
+                                    TensorConfig(
+                                        data_gen=partial(generate_input1, dics))
                                 },
                                 outputs=["slice_output_data"])
 
@@ -95,6 +97,7 @@ def generate_input1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [8, 8, 64, 64]}
@@ -124,8 +127,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         self.trt_param.max_batch_size = 9
         # for static_shape
@@ -140,11 +142,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-4
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-4
 
     def test(self):
         # TODO(inference): fix.
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py
index 7efaebf00cf72..b6cef5ca17bdc 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,14 +22,14 @@
 
 
 class TrtConvertSoftmaxTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         outputs = program_config.outputs
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         #The input dimension should be less than or equal to the set axis.
@@ -39,6 +39,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 return np.ones([batch, 3, 24, 24]).astype(np.float32)
@@ -67,8 +68,9 @@ def generate_input1(attrs: List[Dict[str, Any]], batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "softmax_input": TensorConfig(data_gen=partial(
-                                generate_input1, dics, batch))
+                            "softmax_input":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, dics, batch))
                         },
                         outputs=["softmax_out"])
 
@@ -76,6 +78,7 @@ def generate_input1(attrs: List[Dict[str, Any]], batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 self.dynamic_shape.min_input_shape = {
@@ -111,8 +114,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # for static_shape
         clear_dynamic_shape()
@@ -129,11 +131,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py
index cef84dfbb4e0a..38ca6963e94b2 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,14 +22,14 @@
 
 
 class TrtConvertSplitTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         outputs = program_config.outputs
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # the dimensions of input and axis match
         if len(inputs['split_input'].shape) <= attrs[0]['axis']:
@@ -60,8 +60,9 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
 
         #Test AxisTensor and SectionsTensorList
         if self.num_input == 0:
-            if self.dims == 2 and attrs[0]['sections'] == [10, 14] and len(
-                    outputs) == 2:
+            if self.dims == 2 and attrs[0]['sections'] == [
+                    10, 14
+            ] and len(outputs) == 2:
                 return True
             else:
                 return False
@@ -69,6 +70,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 return np.ones([batch, 3, 3, 24]).astype(np.float32)
@@ -121,22 +123,25 @@ def generate_SectionsTensorList2(attrs: List[Dict[str, Any]]):
                                         "AxisTensor":
                                         TensorConfig(data_gen=partial(
                                             generate_AxisTensor, dics)),
-                                        "SectionsTensorList1": TensorConfig(
-                                            data_gen=partial(
-                                                generate_SectionsTensorList1,
-                                                dics)),
+                                        "SectionsTensorList1":
+                                        TensorConfig(data_gen=partial(
+                                            generate_SectionsTensorList1,
+                                            dics)),
                                         "SectionsTensorList2":
                                         TensorConfig(data_gen=partial(
                                             generate_SectionsTensorList2, dics))
                                     }, {}]
 
                                     ops_config = [{
-                                        "op_type": "split",
-                                        "op_inputs": dics_intput[num_input],
+                                        "op_type":
+                                        "split",
+                                        "op_inputs":
+                                        dics_intput[num_input],
                                         "op_outputs": {
                                             "Out": Out
                                         },
-                                        "op_attrs": dics[0]
+                                        "op_attrs":
+                                        dics[0]
                                     }]
                                     ops = self.generate_op_config(ops_config)
                                     program_config = ProgramConfig(
@@ -153,6 +158,7 @@ def generate_SectionsTensorList2(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 self.dynamic_shape.min_input_shape = {
@@ -195,8 +201,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 0, 5
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         self.trt_param.max_batch_size = 9
         # for static_shape
@@ -211,13 +216,14 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
+
         def teller1(program_config, predictor_config):
             if len(program_config.weights) == 3:
                 return True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py
index 062312b0fab4f..f9641bad34c5b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py
@@ -22,14 +22,14 @@
 
 
 class TrtConvertStackTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         outputs = program_config.outputs
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         #The input dimension should be less than the set axis.
         if len(inputs['stack_input1'].shape) < attrs[0]['axis']:
@@ -38,6 +38,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 return np.ones([batch, 3, 24, 24]).astype(np.float32)
@@ -89,12 +90,15 @@ def generate_input3(attrs: List[Dict[str, Any]], batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "stack_input1": TensorConfig(data_gen=partial(
-                                generate_input1, dics, batch)),
-                            "stack_input2": TensorConfig(data_gen=partial(
-                                generate_input2, dics, batch)),
-                            "stack_input3": TensorConfig(data_gen=partial(
-                                generate_input3, dics, batch))
+                            "stack_input1":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, dics, batch)),
+                            "stack_input2":
+                            TensorConfig(
+                                data_gen=partial(generate_input2, dics, batch)),
+                            "stack_input3":
+                            TensorConfig(
+                                data_gen=partial(generate_input3, dics, batch))
                         },
                         outputs=["stack_output"])
 
@@ -102,6 +106,7 @@ def generate_input3(attrs: List[Dict[str, Any]], batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 self.dynamic_shape.min_input_shape = {
@@ -180,8 +185,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 0, 5
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # for static_shape
         clear_dynamic_shape()
@@ -195,11 +199,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def add_skip_trt_case(self):
         pass
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
index 8bc48047c1397..beea119c79fc0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
@@ -22,16 +22,17 @@
 
 
 class TrtConvertStridedSliceTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 56, 56, 192]).astype(np.float32)
 
@@ -66,9 +67,9 @@ def generate_input1(attrs: List[Dict[str, Any]]):
                                     ops=ops,
                                     weights={},
                                     inputs={
-                                        "input_data": TensorConfig(
-                                            data_gen=partial(generate_input1,
-                                                             dics))
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics))
                                     },
                                     outputs=["slice_output_data"])
 
@@ -76,6 +77,7 @@ def generate_input1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {
                 "input_data": [1, 56, 56, 192]
@@ -109,8 +111,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -122,8 +123,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py
index df97e7542b882..1ae92dc527aa9 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertSwishTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dims, attrs: List[Dict[str, Any]]):
             if dims == 1:
                 return np.ones([3]).astype(np.float32)
@@ -58,8 +60,9 @@ def generate_input1(dims, attrs: List[Dict[str, Any]]):
                     ops=ops,
                     weights={},
                     inputs={
-                        "input_data": TensorConfig(data_gen=partial(
-                            generate_input1, dims, dics))
+                        "input_data":
+                        TensorConfig(
+                            data_gen=partial(generate_input1, dims, dics))
                     },
                     outputs=["output_data"])
 
@@ -67,6 +70,7 @@ def generate_input1(dims, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 1:
                 self.dynamic_shape.min_input_shape = {"input_data": [1]}
@@ -104,8 +108,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -120,11 +123,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py
index cbbd13a7b8003..82c707869f88c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,11 +26,11 @@
 
 
 class TrtConvertTileTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         for x in attrs[0]['repeat_times']:
             if x <= 0:
@@ -39,6 +39,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self, *args, **kwargs):
+
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 2, 3, 4]).astype(np.float32)
 
@@ -60,8 +61,8 @@ def generate_input1(attrs: List[Dict[str, Any]]):
             ops=ops,
             weights={},
             inputs={
-                "input_data": TensorConfig(data_gen=partial(generate_input1,
-                                                            dics))
+                "input_data":
+                TensorConfig(data_gen=partial(generate_input1, dics))
             },
             outputs=["tile_output_data"])
 
@@ -69,6 +70,7 @@ def generate_input1(attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
             self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
@@ -90,8 +92,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -106,11 +107,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-4
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-4
 
     @given(repeat_times=st.sampled_from([[100], [1, 2], [0, 3], [1, 2, 100]]))
     def test(self, *args, **kwargs):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py
index 87e81396ab411..e9604925e4ac5 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,14 +22,14 @@
 
 
 class TrtConvertTransposeTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         inputs = program_config.inputs
         weights = program_config.weights
         outputs = program_config.outputs
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         #The shape of input and axis should be equal.
@@ -39,6 +39,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch):
             if self.dims == 4:
                 return np.ones([batch, 3, 24, 24]).astype(np.float32)
@@ -50,8 +51,8 @@ def generate_input1(attrs: List[Dict[str, Any]], batch):
         for dims in [2, 3, 4]:
             for batch in [1, 2, 4]:
                 for axis in [[0, 1, 3, 2], [0, 3, 2, 1], [3, 2, 0, 1],
-                             [0, 1, 2, 3], [0, 1, 2], [2, 0, 1], [1, 0],
-                             [0, 1]]:
+                             [0, 1, 2, 3], [0, 1, 2], [2, 0, 1], [1, 0], [0,
+                                                                          1]]:
                     self.dims = dims
                     dics = [{"axis": axis}, {}]
                     ops_config = [{
@@ -69,8 +70,9 @@ def generate_input1(attrs: List[Dict[str, Any]], batch):
                         ops=ops,
                         weights={},
                         inputs={
-                            "transpose_input": TensorConfig(data_gen=partial(
-                                generate_input1, dics, batch))
+                            "transpose_input":
+                            TensorConfig(
+                                data_gen=partial(generate_input1, dics, batch))
                         },
                         outputs=["transpose_out"])
 
@@ -78,6 +80,7 @@ def generate_input1(attrs: List[Dict[str, Any]], batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 4:
                 self.dynamic_shape.min_input_shape = {
@@ -125,8 +128,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                     return 0, 3
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # for static_shape
         clear_dynamic_shape()
@@ -140,11 +142,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py
index 2abf0a1acda67..fd4753528ee1e 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,12 @@
 
 
 class TrtConvertActivationTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
             if dims == 1:
                 return np.ones([32]).astype(np.float32)
@@ -58,7 +60,8 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
                         ops=ops,
                         weights={},
                         inputs={
-                            "input_data": TensorConfig(data_gen=partial(
+                            "input_data":
+                            TensorConfig(data_gen=partial(
                                 generate_input1, dims, batch, dics))
                         },
                         outputs=["output_data"])
@@ -67,6 +70,7 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if self.dims == 1:
                 self.dynamic_shape.min_input_shape = {"input_data": [1]}
@@ -102,8 +106,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             return 1, 2
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
         # for static_shape
@@ -118,11 +121,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
 
     def test(self):
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py
index 269523661ee4d..cebede99e6f82 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,17 +22,19 @@
 
 
 class TrtConvertYoloBoxTest(TrtLayerAutoScanTest):
+
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
+
         def generate_input1(attrs: List[Dict[str, Any]], batch, channel):
             if attrs[0]['iou_aware'] == True:
-                return np.ones(
-                    [batch, 3 * (channel + 6), 13, 13]).astype(np.float32)
+                return np.ones([batch, 3 * (channel + 6), 13,
+                                13]).astype(np.float32)
             else:
-                return np.ones(
-                    [batch, 3 * (channel + 5), 13, 13]).astype(np.float32)
+                return np.ones([batch, 3 * (channel + 5), 13,
+                                13]).astype(np.float32)
 
         def generate_input2(attrs: List[Dict[str, Any]], batch):
             return np.random.random([batch, 2]).astype(np.int32)
@@ -47,14 +49,20 @@ def generate_input2(attrs: List[Dict[str, Any]], batch):
                                     for iou_aware in [False, True]:
                                         for iou_aware_factor in [0.5]:
                                             dics = [{
-                                                "class_num": class_num,
-                                                "anchors": anchors,
+                                                "class_num":
+                                                class_num,
+                                                "anchors":
+                                                anchors,
                                                 "downsample_ratio":
                                                 downsample_ratio,
-                                                "conf_thresh": conf_thresh,
-                                                "clip_bbox": clip_bbox,
-                                                "scale_x_y": scale_x_y,
-                                                "iou_aware": iou_aware,
+                                                "conf_thresh":
+                                                conf_thresh,
+                                                "clip_bbox":
+                                                clip_bbox,
+                                                "scale_x_y":
+                                                scale_x_y,
+                                                "iou_aware":
+                                                iou_aware,
                                                 "iou_aware_factor":
                                                 iou_aware_factor
                                             }, {}]
@@ -82,7 +90,8 @@ def generate_input2(attrs: List[Dict[str, Any]], batch):
                                                             generate_input1,
                                                             dics, batch,
                                                             class_num)),
-                                                    "imgsize": TensorConfig(
+                                                    "imgsize":
+                                                    TensorConfig(
                                                         data_gen=partial(
                                                             generate_input2,
                                                             dics, batch))
@@ -93,6 +102,7 @@ def generate_input2(attrs: List[Dict[str, Any]], batch):
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
+
         def generate_dynamic_shape(attrs):
             if attrs[0]['iou_aware'] == True:
                 channel = 3 * (attrs[0]['class_num'] + 6)
@@ -129,8 +139,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                 return 1, 4
 
         attrs = [
-            program_config.ops[i].attrs
-            for i in range(len(program_config.ops))
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
         # for static_shape
         clear_dynamic_shape()
@@ -144,11 +153,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
-                                                                     True), 1e-3
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-3
 
     def add_skip_trt_case(self):
         pass
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py
index ece2d187fb9da..08a09338bf27b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,7 +22,9 @@
 
 
 class TrtConvertYoloBoxHeadTest(TrtLayerAutoScanTest):
+
     def sample_program_configs(self):
+
         def generate_input(attrs: List[Dict[str, Any]], batch, shape):
             gen_shape = shape.copy()
             gen_shape.insert(0, batch)
@@ -53,7 +55,8 @@ def generate_input(attrs: List[Dict[str, Any]], batch, shape):
                     ops=ops,
                     weights={},
                     inputs={
-                        "yolo_box_head_input": TensorConfig(data_gen=partial(
+                        "yolo_box_head_input":
+                        TensorConfig(data_gen=partial(
                             generate_input, attrs_dict, batch, input_shape[i]))
                     },
                     outputs=["yolo_box_head_output"])
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_deformable_conv.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_deformable_conv.py
index 508095fb80175..3bed89e74f595 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_deformable_conv.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_deformable_conv.py
@@ -24,15 +24,19 @@
 
 
 class TRTDeformableConvTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            input = fluid.data(
-                name='input', shape=self.input_size, dtype=self.dtype)
-            offset = fluid.data(
-                name='offset', shape=self.offset_size, dtype=self.dtype)
-            mask = fluid.data(
-                name='mask', shape=self.mask_size, dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=self.input_size,
+                               dtype=self.dtype)
+            offset = fluid.data(name='offset',
+                                shape=self.offset_size,
+                                dtype=self.dtype)
+            mask = fluid.data(name='mask',
+                              shape=self.mask_size,
+                              dtype=self.dtype)
 
             output = fluid.layers.deformable_conv(
                 input,
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py
index a7ae6a635ecdf..8b0595387386f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_dynamic_shape.py
@@ -24,30 +24,33 @@
 
 
 class TRTDynamicShapeTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 16, 16], dtype="float32")
-            out = fluid.layers.conv2d(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                groups=1,
-                padding=[1, 1],
-                bias_attr=False,
-                act=None)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 16, 16],
+                              dtype="float32")
+            out = fluid.layers.conv2d(input=data,
+                                      num_filters=3,
+                                      filter_size=3,
+                                      groups=1,
+                                      padding=[1, 1],
+                                      bias_attr=False,
+                                      act=None)
 
         self.feeds = self.set_feeds()
         self.enable_trt = True
         self.trt_parameters = TRTDynamicShapeTest.TensorRTParam(
             1 << 30, 1, 1, AnalysisConfig.Precision.Float32, False, False)
-        self.dynamic_shape_params = TRTDynamicShapeTest.DynamicShapeParam({
-            'data': [1, 3, 8, 8]
-        }, {'data': [1, 3, 32, 32]}, {'data': [1, 3, 16, 16]}, False)
+        self.dynamic_shape_params = TRTDynamicShapeTest.DynamicShapeParam(
+            {'data': [1, 3, 8, 8]}, {'data': [1, 3, 32, 32]},
+            {'data': [1, 3, 16, 16]}, False)
         self.fetch_list = [out]
 
     def set_feeds(self):
-        return {"data": np.random.random([1, 3, 16, 16]).astype("float32"), }
+        return {
+            "data": np.random.random([1, 3, 16, 16]).astype("float32"),
+        }
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -56,8 +59,11 @@ def test_check_output(self):
 
 
 class TRTDynamicShapeOutOfBound1Test(TRTDynamicShapeTest):
+
     def set_feeds(self):
-        return {"data": np.random.random([1, 3, 64, 16]).astype("float32"), }
+        return {
+            "data": np.random.random([1, 3, 64, 16]).astype("float32"),
+        }
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -71,18 +77,21 @@ def test_check_output(self):
 # class TRTDynamicShapeOutOfBound2Test(TRTDynamicShapeTest):
 #     def set_feeds(self):
 #         return {"data": np.random.random([2, 3, 16, 16]).astype("float32"), }
-# 
+#
 #     def test_check_output(self):
 #         if core.is_compiled_with_cuda():
 #             use_gpu = True
 #             with self.assertRaises(Exception):
 #                 self.check_output_with_option(use_gpu)
-# 
+#
 
 
 class TRTDynamicShapeOutOfBound3Test(TRTDynamicShapeTest):
+
     def set_feeds(self):
-        return {"data": np.random.random([1, 3, 4, 16]).astype("float32"), }
+        return {
+            "data": np.random.random([1, 3, 4, 16]).astype("float32"),
+        }
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py
index b40daba48689b..a989135a64c52 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py
@@ -26,12 +26,15 @@
 
 
 class TensorRTSubgraphPassElementwiseBroadcastTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data1 = fluid.data(
-                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
-            data2 = fluid.data(
-                name="data2", shape=[-1, 3, 64, 1], dtype="float32")
+            data1 = fluid.data(name="data1",
+                               shape=[-1, 3, 64, 64],
+                               dtype="float32")
+            data2 = fluid.data(name="data2",
+                               shape=[-1, 3, 64, 1],
+                               dtype="float32")
             eltwise_out = self.append_eltwise(data1, data2)
             out = fluid.layers.batch_norm(eltwise_out, is_test=True)
         self.feeds = {
@@ -58,18 +61,21 @@ def test_check_output(self):
 
 class TensorRTSubgraphPassElementwiseBroadcastTest1(
         TensorRTSubgraphPassElementwiseBroadcastTest):
+
     def append_eltwise(self, data1, data2):
         return fluid.layers.elementwise_sub(x=data1, y=data2, axis=0)
 
 
 class TensorRTSubgraphPassElementwiseBroadcastTest2(
         TensorRTSubgraphPassElementwiseBroadcastTest):
+
     def append_eltwise(self, data1, data2):
         return fluid.layers.elementwise_mul(x=data1, y=data2, axis=0)
 
 
 class TensorRTSubgraphPassElementwiseBroadcastTest3(
         TensorRTSubgraphPassElementwiseBroadcastTest):
+
     def append_eltwise(self, data1, data2):
         return fluid.layers.elementwise_div(x=data1, y=data2, axis=0)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py
index dd6232fac459e..4b086f995fcbb 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py
@@ -25,10 +25,12 @@
 
 
 class FCFusePassTRTTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[32, 128, 2, 2], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[32, 128, 2, 2],
+                              dtype="float32")
             fc_out1 = fluid.layers.fc(input=data,
                                       size=128,
                                       num_flatten_dims=1,
@@ -38,9 +40,9 @@ def setUp(self):
         self.feeds = {
             "data": np.random.random((32, 128, 2, 2)).astype("float32")
         }
-        # Diff occurred between GPU and TRT. 
-        # In order to provide TRT CI ASAP, this test for trt part 
-        # is disabled temporarily. 
+        # Diff occurred between GPU and TRT.
+        # In order to provide TRT CI ASAP, this test for trt part
+        # is disabled temporarily.
         # self.enable_trt = True
         # self.trt_parameters = FCFusePassTRTTest.TensorRTParam(
         #     1 << 30, 32, 3, AnalysisConfig.Precision.Float32, False, False)
@@ -55,10 +57,12 @@ def test_check_output(self):
 
 
 class FCFusePassTRTStaticDims4Cols1Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[32, 128, 32, 8], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[32, 128, 32, 8],
+                              dtype="float32")
             fc_out1 = fluid.layers.fc(input=data,
                                       size=64,
                                       num_flatten_dims=1,
@@ -82,10 +86,12 @@ def test_check_output(self):
 
 
 class FCFusePassTRTStaticDims4Cols2Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[3, 24, 16, 16], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[3, 24, 16, 16],
+                              dtype="float32")
             fc_out1 = fluid.layers.fc(input=data,
                                       size=32,
                                       num_flatten_dims=2,
@@ -109,6 +115,7 @@ def test_check_output(self):
 
 
 class FCFusePassTRTDynamicDims2Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[32, 128], dtype="float32")
@@ -123,9 +130,7 @@ def setUp(self):
         self.trt_parameters = FCFusePassTRTDynamicDims2Test.TensorRTParam(
             1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = FCFusePassTRTDynamicDims2Test.DynamicShapeParam(
-            {
-                'data': [1, 128]
-            }, {'data': [64, 128]}, {'data': [32, 128]}, False)
+            {'data': [1, 128]}, {'data': [64, 128]}, {'data': [32, 128]}, False)
         self.fetch_list = [out]
 
     def test_check_output(self):
@@ -137,6 +142,7 @@ def test_check_output(self):
 
 
 class FCFusePassTRTDynamicDims3Cols1Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[32, 128, 32], dtype="float32")
@@ -151,9 +157,8 @@ def setUp(self):
         self.trt_parameters = FCFusePassTRTDynamicDims3Cols1Test.TensorRTParam(
             1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = FCFusePassTRTDynamicDims3Cols1Test.DynamicShapeParam(
-            {
-                'data': [1, 128, 32]
-            }, {'data': [64, 128, 32]}, {'data': [32, 128, 32]}, False)
+            {'data': [1, 128, 32]}, {'data': [64, 128, 32]},
+            {'data': [32, 128, 32]}, False)
         self.fetch_list = [out]
 
     def test_check_output(self):
@@ -165,6 +170,7 @@ def test_check_output(self):
 
 
 class FCFusePassTRTDynamicDims3Cols2Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[32, 128, 32], dtype="float32")
@@ -179,9 +185,8 @@ def setUp(self):
         self.trt_parameters = FCFusePassTRTDynamicDims3Cols2Test.TensorRTParam(
             1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = FCFusePassTRTDynamicDims3Cols2Test.DynamicShapeParam(
-            {
-                'data': [1, 32, 32]
-            }, {'data': [64, 256, 32]}, {'data': [32, 128, 32]}, False)
+            {'data': [1, 32, 32]}, {'data': [64, 256, 32]},
+            {'data': [32, 128, 32]}, False)
         self.fetch_list = [out]
 
     def test_check_output(self):
@@ -193,10 +198,12 @@ def test_check_output(self):
 
 
 class FCFusePassTRTDynamicDims4Cols1Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[32, 12, 4, 6], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[32, 12, 4, 6],
+                              dtype="float32")
             fc_out1 = fluid.layers.fc(input=data,
                                       size=64,
                                       num_flatten_dims=1,
@@ -210,9 +217,8 @@ def setUp(self):
         self.trt_parameters = FCFusePassTRTDynamicDims4Cols1Test.TensorRTParam(
             1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = FCFusePassTRTDynamicDims4Cols1Test.DynamicShapeParam(
-            {
-                'data': [1, 12, 4, 6]
-            }, {'data': [64, 12, 4, 6]}, {'data': [32, 12, 4, 6]}, False)
+            {'data': [1, 12, 4, 6]}, {'data': [64, 12, 4, 6]},
+            {'data': [32, 12, 4, 6]}, False)
         self.fetch_list = [out]
 
     def test_check_output(self):
@@ -224,10 +230,12 @@ def test_check_output(self):
 
 
 class FCFusePassTRTDynamicDims4Cols2Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[32, 128, 32, 32], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[32, 128, 32, 32],
+                              dtype="float32")
             fc_out1 = fluid.layers.fc(input=data,
                                       size=64,
                                       num_flatten_dims=2,
@@ -241,9 +249,8 @@ def setUp(self):
         self.trt_parameters = FCFusePassTRTDynamicDims4Cols2Test.TensorRTParam(
             1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = FCFusePassTRTDynamicDims4Cols2Test.DynamicShapeParam(
-            {
-                'data': [1, 64, 32, 32]
-            }, {'data': [64, 256, 32, 32]}, {'data': [32, 128, 32, 32]}, False)
+            {'data': [1, 64, 32, 32]}, {'data': [64, 256, 32, 32]},
+            {'data': [32, 128, 32, 32]}, False)
         self.fetch_list = [out]
 
     def test_check_output(self):
@@ -255,10 +262,12 @@ def test_check_output(self):
 
 
 class FCFusePassTRTDynamicDims4Cols3Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[32, 128, 32, 32], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[32, 128, 32, 32],
+                              dtype="float32")
             fc_out1 = fluid.layers.fc(input=data,
                                       size=64,
                                       num_flatten_dims=3,
@@ -272,9 +281,8 @@ def setUp(self):
         self.trt_parameters = FCFusePassTRTDynamicDims4Cols3Test.TensorRTParam(
             1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False)
         self.dynamic_shape_params = FCFusePassTRTDynamicDims4Cols3Test.DynamicShapeParam(
-            {
-                'data': [1, 128, 32, 32]
-            }, {'data': [64, 128, 32, 32]}, {'data': [32, 128, 32, 32]}, False)
+            {'data': [1, 128, 32, 32]}, {'data': [64, 128, 32, 32]},
+            {'data': [32, 128, 32, 32]}, False)
         self.fetch_list = [out]
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py
index 9e1991ae1ae30..e62b6557844c9 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py
@@ -25,10 +25,13 @@
 
 
 class FCQuantDequantFusePassTRTDims3Cols1Test(QuantDequantTest):
+
     def setUp(self):
+
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             fc_out = fluid.layers.fc(input=self.data,
                                      size=10,
@@ -62,10 +65,13 @@ def network():
             {
                 'data': [1, 28, 28],
                 'reshape2_1.tmp_0': [1, 1, 10]
-            }, {'data': [2, 28, 28],
-                'reshape2_1.tmp_0': [2, 1, 10]},
-            {'data': [1, 28, 28],
-             'reshape2_1.tmp_0': [1, 1, 10]}, False)
+            }, {
+                'data': [2, 28, 28],
+                'reshape2_1.tmp_0': [2, 1, 10]
+            }, {
+                'data': [1, 28, 28],
+                'reshape2_1.tmp_0': [1, 1, 10]
+            }, False)
         self.activation_quantize_type = 'moving_average_abs_max'
         self.weight_quantize_type = 'channel_wise_abs_max'
 
@@ -73,18 +79,23 @@ def test_check_output(self):
         #self.quant_dequant()
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1e-2, flatten=False, rtol=1e-2)
+            self.check_output_with_option(use_gpu,
+                                          atol=1e-2,
+                                          flatten=False,
+                                          rtol=1e-2)
             self.assertTrue(
                 PassVersionChecker.IsCompatible(
                     'quant_conv2d_dequant_fuse_pass'))
 
 
 class FCQuantDequantFusePassTRTDims3Cols2Test(QuantDequantTest):
+
     def setUp(self):
+
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             fc_out = fluid.layers.fc(input=self.data,
                                      size=28,
@@ -119,10 +130,13 @@ def network():
             {
                 'data': [1, 28, 28],
                 'reshape2_0.tmp_0': [1, 784]
-            }, {'data': [4, 28, 28],
-                'reshape2_0.tmp_0':
-                [4, 784]}, {'data': [1, 28, 28],
-                            'reshape2_0.tmp_0': [1, 784]}, False)
+            }, {
+                'data': [4, 28, 28],
+                'reshape2_0.tmp_0': [4, 784]
+            }, {
+                'data': [1, 28, 28],
+                'reshape2_0.tmp_0': [1, 784]
+            }, False)
         self.activation_quantize_type = 'moving_average_abs_max'
         self.weight_quantize_type = 'channel_wise_abs_max'
 
@@ -130,18 +144,23 @@ def test_check_output(self):
         #self.quant_dequant()
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1e-1, flatten=False, rtol=1e-1)
+            self.check_output_with_option(use_gpu,
+                                          atol=1e-1,
+                                          flatten=False,
+                                          rtol=1e-1)
             self.assertTrue(
                 PassVersionChecker.IsCompatible(
                     'quant_conv2d_dequant_fuse_pass'))
 
 
 class FCQuantDequantFusePassTRTDims3Cols3Test(QuantDequantTest):
+
     def setUp(self):
+
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1])
             reshape_out = fluid.layers.reshape(self.data, shape=[1, 14, 14, 4])
@@ -195,8 +214,10 @@ def test_check_output(self):
         #self.quant_dequant()
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1e0, flatten=False, rtol=1e0)
+            self.check_output_with_option(use_gpu,
+                                          atol=1e0,
+                                          flatten=False,
+                                          rtol=1e0)
             self.assertTrue(
                 PassVersionChecker.IsCompatible(
                     'quant_conv2d_dequant_fuse_pass'))
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py
index 9d0f8857e92b4..a02cdb6a34791 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py
@@ -64,14 +64,15 @@ def teller1(program_config, predictor_config):
         self.add_ignore_check_case(
             teller1,
             IgnoreReasons.PASS_ACCURACY_ERROR,
-            "The pass error on TRT while shape of bias is not [out_size].", )
+            "The pass error on TRT while shape of bias is not [out_size].",
+        )
 
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of flatten2
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=10), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=10),
+                     min_size=4,
+                     max_size=4))
         # [a, b, c, d] => [a, b*c*d]
         flatten_axis = 1
         flatten_shape = [x_shape[0], x_shape[1] * x_shape[2] * x_shape[3]]
@@ -83,27 +84,36 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of matmul
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = flatten_shape[1]
 
         # 4. Generate legal attr:axis of elementwise_add
         axis = draw(st.integers(min_value=-1, max_value=1))
         if axis == 0:
             axis = -1
-        bias_shape = [y_shape[1], ]
+        bias_shape = [
+            y_shape[1],
+        ]
 
         flatten2_op = OpConfig(
             "flatten2",
-            inputs={"X": ["flatten2_x"], },
+            inputs={
+                "X": ["flatten2_x"],
+            },
             axis=flatten_axis,
-            outputs={"Out": ["flatten2_out"],
-                     "XShape": ["xshape"]}, )
+            outputs={
+                "Out": ["flatten2_out"],
+                "XShape": ["xshape"]
+            },
+        )
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["flatten2_out"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["flatten2_out"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             transpose_X=transpose_X,
@@ -113,14 +123,18 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
+            fused_transpose_Out=[],
+        )
 
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["matmul_out"],
-                    "Y": ["bias"]},
+            inputs={
+                "X": ["matmul_out"],
+                "Y": ["bias"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=axis, )
+            axis=axis,
+        )
 
         ops = [flatten2_op, matmul_op, add_op]
 
@@ -130,16 +144,18 @@ def sample_program_config(self, draw):
                 "matmul_y": TensorConfig(shape=y_shape),
                 "bias": TensorConfig(shape=bias_shape),
             },
-            inputs={"flatten2_x": TensorConfig(shape=x_shape), },
-            outputs=ops[-1].outputs["Out"], )
+            inputs={
+                "flatten2_x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Out"],
+        )
 
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=50,
-            passes=["trt_flatten2_matmul_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=50,
+                            passes=["trt_flatten2_matmul_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py
index bb28fcf708503..8e5728f63f296 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py
@@ -24,10 +24,12 @@
 
 
 class TRTFlattenTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
             flatten_out = self.append_flatten(data)
             out = fluid.layers.batch_norm(flatten_out, is_test=True)
         self.feeds = {
@@ -50,10 +52,12 @@ def test_check_output(self):
 
 
 class TRTFlattenDynamicTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
             flatten_out = self.append_flatten(data)
             out = fluid.layers.batch_norm(flatten_out, is_test=True)
         self.feeds = {
@@ -62,11 +66,14 @@ def setUp(self):
         self.enable_trt = True
         self.trt_parameters = TRTFlattenDynamicTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
-        self.dynamic_shape_params = TRTFlattenDynamicTest.DynamicShapeParam({
-            'data': [2, 6, 64, 64],
-            'flatten_0.tmp_0': [2, 6 * 64 * 64]
-        }, {'data': [2, 6, 64, 64],
-            'flatten_0.tmp_0': [2, 6 * 64 * 64]}, {
+        self.dynamic_shape_params = TRTFlattenDynamicTest.DynamicShapeParam(
+            {
+                'data': [2, 6, 64, 64],
+                'flatten_0.tmp_0': [2, 6 * 64 * 64]
+            }, {
+                'data': [2, 6, 64, 64],
+                'flatten_0.tmp_0': [2, 6 * 64 * 64]
+            }, {
                 'data': [2, 6, 64, 64],
                 'flatten_0.tmp_0': [2, 6 * 64 * 64]
             }, False)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py
index a9d11f8fd1818..a9a0b0f327d6b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py
@@ -24,6 +24,7 @@
 
 
 class TRTGatherNdTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[-1, 3, 4], dtype="float32")
@@ -33,19 +34,24 @@ def setUp(self):
 
         self.feeds = {
             "data": np.random.random([2, 3, 4]).astype("float32"),
-            "index":
-            np.array([[[0, 1], [1, 0]], [[1, 2], [0, 1]]]).astype("int32"),
+            "index": np.array([[[0, 1], [1, 0]], [[1, 2],
+                                                  [0, 1]]]).astype("int32"),
         }
         self.enable_trt = True
         self.trt_parameters = TRTGatherNdTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.fetch_list = [out]
-        self.dynamic_shape_params = TRTGatherNdTest.DynamicShapeParam({
-            'data': [1, 3, 4],
-            'index': [1, 2, 2]
-        }, {'data': [3, 3, 4],
-            'index': [3, 2, 2]}, {'data': [3, 3, 4],
-                                  'index': [3, 2, 2]}, False)
+        self.dynamic_shape_params = TRTGatherNdTest.DynamicShapeParam(
+            {
+                'data': [1, 3, 4],
+                'index': [1, 2, 2]
+            }, {
+                'data': [3, 3, 4],
+                'index': [3, 2, 2]
+            }, {
+                'data': [3, 3, 4],
+                'index': [3, 2, 2]
+            }, False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -56,10 +62,12 @@ def test_check_output(self):
 
 
 class TRTGatherNdFp16Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 1280, 192], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 1280, 192],
+                              dtype="float32")
             index = fluid.data(name="index", shape=[-1, 1028, 2], dtype="int32")
             gather_nd = fluid.layers.gather_nd(data, index)
             out = fluid.layers.batch_norm(gather_nd, is_test=True)
@@ -73,13 +81,17 @@ def setUp(self):
         self.trt_parameters = TRTGatherNdFp16Test.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
         self.fetch_list = [out]
-        self.dynamic_shape_params = TRTGatherNdFp16Test.DynamicShapeParam({
-            'data': [1, 1280, 192],
-            'index': [1, 1028, 2]
-        }, {'data': [3, 1280, 192],
-            'index':
-            [3, 1028, 2]}, {'data': [3, 1280, 192],
-                            'index': [3, 1028, 2]}, False)
+        self.dynamic_shape_params = TRTGatherNdFp16Test.DynamicShapeParam(
+            {
+                'data': [1, 1280, 192],
+                'index': [1, 1028, 2]
+            }, {
+                'data': [3, 1280, 192],
+                'index': [3, 1028, 2]
+            }, {
+                'data': [3, 1280, 192],
+                'index': [3, 1028, 2]
+            }, False)
 
     def test_check_output(self, atol=1e-3):
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py
index 57c295686f63d..9536c8c4e0800 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py
@@ -24,6 +24,7 @@
 
 
 class TRTGatherTest1(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
@@ -40,12 +41,17 @@ def setUp(self):
         self.enable_trt = True
         self.trt_parameters = TRTGatherTest1.TensorRTParam(
             1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False)
-        self.dynamic_shape_params = TRTGatherTest1.DynamicShapeParam({
-            'data': [1, 1],
-            'index': [1, 1]
-        }, {'data': [32, 128],
-            'index': [3, 1]}, {'data': [32, 128],
-                               'index': [3, 1]}, False)
+        self.dynamic_shape_params = TRTGatherTest1.DynamicShapeParam(
+            {
+                'data': [1, 1],
+                'index': [1, 1]
+            }, {
+                'data': [32, 128],
+                'index': [3, 1]
+            }, {
+                'data': [32, 128],
+                'index': [3, 1]
+            }, False)
         self.fetch_list = [out]
 
     def set_params(self):
@@ -61,6 +67,7 @@ def test_check_output(self):
 
 
 class TRTGatherTest2(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
@@ -77,12 +84,17 @@ def setUp(self):
         self.enable_trt = True
         self.trt_parameters = TRTGatherTest2.TensorRTParam(
             1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False)
-        self.dynamic_shape_params = TRTGatherTest2.DynamicShapeParam({
-            'data': [2, 4],
-            'index': [1]
-        }, {'data': [256, 256],
-            'index': [4]}, {'data': [64, 32],
-                            'index': [2]}, False)
+        self.dynamic_shape_params = TRTGatherTest2.DynamicShapeParam(
+            {
+                'data': [2, 4],
+                'index': [1]
+            }, {
+                'data': [256, 256],
+                'index': [4]
+            }, {
+                'data': [64, 32],
+                'index': [2]
+            }, False)
         self.fetch_list = [out]
 
     def set_params(self):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py
index 1bcbbc38c9762..de59753d976d8 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_group_norm_op.py
@@ -24,10 +24,12 @@
 
 
 class TRTGroupNormTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 512, 12, 12], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 512, 12, 12],
+                              dtype="float32")
             out = self.append_group_norm(data)
 
         self.feeds = {
@@ -36,9 +38,9 @@ def setUp(self):
         self.enable_trt = True
         self.trt_parameters = TRTGroupNormTest.TensorRTParam(
             1 << 30, 1, 1, AnalysisConfig.Precision.Float32, False, False)
-        self.dynamic_shape_params = TRTGroupNormTest.DynamicShapeParam({
-            'data': [1, 512, 12, 12]
-        }, {'data': [1, 512, 12, 12]}, {'data': [1, 512, 12, 12]}, False)
+        self.dynamic_shape_params = TRTGroupNormTest.DynamicShapeParam(
+            {'data': [1, 512, 12, 12]}, {'data': [1, 512, 12, 12]},
+            {'data': [1, 512, 12, 12]}, False)
         self.fetch_list = [out]
 
     def append_group_norm(self, data):
@@ -48,12 +50,11 @@ def append_group_norm(self, data):
         bias_attr = fluid.ParamAttr(
             name='group_norm_bias',
             initializer=fluid.initializer.Constant(value=0.0))
-        return fluid.layers.group_norm(
-            data,
-            groups=32,
-            epsilon=0.000009999999747378752,
-            param_attr=param_attr,
-            bias_attr=bias_attr)
+        return fluid.layers.group_norm(data,
+                                       groups=32,
+                                       epsilon=0.000009999999747378752,
+                                       param_attr=param_attr,
+                                       bias_attr=bias_attr)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py
index 3d4b2dc10c2b6..c69e0c98a4083 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py
@@ -28,19 +28,21 @@
 
 
 class TensorRTInspectorTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[1, 16, 16], dtype="float32")
-            matmul_out = fluid.layers.matmul(
-                x=data,
-                y=data,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y,
-                alpha=self.alpha)
+            matmul_out = fluid.layers.matmul(x=data,
+                                             y=data,
+                                             transpose_x=self.transpose_x,
+                                             transpose_y=self.transpose_y,
+                                             alpha=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
 
-        self.feeds = {"data": np.ones([1, 16, 16]).astype("float32"), }
+        self.feeds = {
+            "data": np.ones([1, 16, 16]).astype("float32"),
+        }
         self.enable_trt = True
         self.trt_parameters = InferencePassTest.TensorRTParam(
             1 << 30, 1, 0, AnalysisConfig.Precision.Float32, False, False, True)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py
index d283465dcba09..67e601cc5209c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py
@@ -27,6 +27,7 @@
 
 
 class TRTInstanceNormTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 4
         self.channel = 4
@@ -47,7 +48,9 @@ def build(self):
             out = fluid.layers.batch_norm(instance_norm_out, is_test=True)
 
         shape[0] = self.bs
-        self.feeds = {'in': np.random.random(shape).astype('float32'), }
+        self.feeds = {
+            'in': np.random.random(shape).astype('float32'),
+        }
         self.fetch_list = [out]
 
     def check_output(self, remove_cache=False):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py
index 99e99a8387784..14b0e9fa1451d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py
@@ -22,19 +22,21 @@
 
 
 class TensorRTMatMulDims2Test(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[24, 24], dtype="float32")
-            matmul_out = fluid.layers.matmul(
-                x=data,
-                y=data,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y,
-                alpha=self.alpha)
+            matmul_out = fluid.layers.matmul(x=data,
+                                             y=data,
+                                             transpose_x=self.transpose_x,
+                                             transpose_y=self.transpose_y,
+                                             alpha=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
 
-        self.feeds = {"data": np.ones([24, 24]).astype("float32"), }
+        self.feeds = {
+            "data": np.ones([24, 24]).astype("float32"),
+        }
         self.enable_trt = True
         self.trt_parameters = TensorRTMatMulDims2Test.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
@@ -54,20 +56,23 @@ def test_check_output(self):
 
 
 class TensorRTMatMulTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 24, 24], dtype="float32")
-            matmul_out = fluid.layers.matmul(
-                x=data,
-                y=data,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y,
-                alpha=self.alpha)
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 24, 24],
+                              dtype="float32")
+            matmul_out = fluid.layers.matmul(x=data,
+                                             y=data,
+                                             transpose_x=self.transpose_x,
+                                             transpose_y=self.transpose_y,
+                                             alpha=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
 
-        self.feeds = {"data": np.ones([1, 6, 24, 24]).astype("float32"), }
+        self.feeds = {
+            "data": np.ones([1, 6, 24, 24]).astype("float32"),
+        }
         self.enable_trt = True
         self.trt_parameters = TensorRTMatMulTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
@@ -87,6 +92,7 @@ def test_check_output(self):
 
 
 class TensorRTMatMulTransposeXTest(TensorRTMatMulTest):
+
     def set_params(self):
         self.transpose_x = True
         self.transpose_y = False
@@ -94,6 +100,7 @@ def set_params(self):
 
 
 class TensorRTMatMulTransposeYTest(TensorRTMatMulTest):
+
     def set_params(self):
         self.transpose_x = False
         self.transpose_y = True
@@ -101,6 +108,7 @@ def set_params(self):
 
 
 class TensorRTMatMulScaleTest(TensorRTMatMulTest):
+
     def set_params(self):
         self.transpose_x = False
         self.transpose_y = False
@@ -108,19 +116,20 @@ def set_params(self):
 
 
 class TensorRTMatMulBroadcastTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         place = fluid.CPUPlace()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data_x = fluid.data(
-                name="data_x", shape=[-1, 6, 24], dtype="float32")
+            data_x = fluid.data(name="data_x",
+                                shape=[-1, 6, 24],
+                                dtype="float32")
             data_y = fluid.data(name="data_y", shape=[24, 16], dtype="float32")
-            matmul_out = fluid.layers.matmul(
-                x=data_x,
-                y=data_y,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y,
-                alpha=self.alpha)
+            matmul_out = fluid.layers.matmul(x=data_x,
+                                             y=data_y,
+                                             transpose_x=self.transpose_x,
+                                             transpose_y=self.transpose_y,
+                                             alpha=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
 
         self.feeds = {
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py
index adf9ce4aead61..01f65b54bd4ae 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py
@@ -23,19 +23,20 @@
 
 
 class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest):
+
     def setUp(self):
         self.set_params()
 
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
-            matmul_out = fluid.layers.matmul(
-                x=self.data,
-                y=self.data,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y,
-                alpha=self.alpha)
+            matmul_out = fluid.layers.matmul(x=self.data,
+                                             y=self.data,
+                                             transpose_x=self.transpose_x,
+                                             transpose_y=self.transpose_y,
+                                             alpha=self.alpha)
             fc_out = fluid.layers.fc(input=matmul_out,
                                      size=10,
                                      num_flatten_dims=1,
@@ -76,14 +77,17 @@ def test_check_output(self):
         #self.quant_dequant()
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1, flatten=False, rtol=1e-1)
+            self.check_output_with_option(use_gpu,
+                                          atol=1,
+                                          flatten=False,
+                                          rtol=1e-1)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
 class TensorRTMatMulQuantDequantDims3TransposeXTest(
         TensorRTMatMulQuantDequantDims3Test):
+
     def set_params(self):
         self.transpose_x = True
         self.transpose_y = False
@@ -92,6 +96,7 @@ def set_params(self):
 
 class TensorRTMatMulQuantDequantDims3TransposeYTest(
         TensorRTMatMulQuantDequantDims3Test):
+
     def set_params(self):
         self.transpose_x = False
         self.transpose_y = True
@@ -100,6 +105,7 @@ def set_params(self):
 
 class TensorRTMatMulQuantDequantDims3TransposeXYTest(
         TensorRTMatMulQuantDequantDims3Test):
+
     def set_params(self):
         self.transpose_x = True
         self.transpose_y = True
@@ -107,20 +113,21 @@ def set_params(self):
 
 
 class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
+
     def setUp(self):
         self.set_params()
 
         def network():
-            self.data = fluid.data(
-                name='data', shape=[1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[1, 28, 28],
+                                   dtype='float32')
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
             reshape_out = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14])
-            matmul_out = fluid.layers.matmul(
-                x=reshape_out,
-                y=reshape_out,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y,
-                alpha=self.alpha)
+            matmul_out = fluid.layers.matmul(x=reshape_out,
+                                             y=reshape_out,
+                                             transpose_x=self.transpose_x,
+                                             transpose_y=self.transpose_y,
+                                             alpha=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
             fc_out = fluid.layers.fc(input=matmul_out,
                                      size=10,
@@ -162,14 +169,17 @@ def test_check_output(self):
         #self.quant_dequant()
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1, flatten=False, rtol=1e-1)
+            self.check_output_with_option(use_gpu,
+                                          atol=1,
+                                          flatten=False,
+                                          rtol=1e-1)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
 class TensorRTMatMulQuantDequantDims4TransposeXTest(
         TensorRTMatMulQuantDequantDims4Test):
+
     def set_params(self):
         self.transpose_x = True
         self.transpose_y = False
@@ -178,6 +188,7 @@ def set_params(self):
 
 class TensorRTMatMulQuantDequantDims4TransposeYTest(
         TensorRTMatMulQuantDequantDims4Test):
+
     def set_params(self):
         self.transpose_x = False
         self.transpose_y = True
@@ -186,6 +197,7 @@ def set_params(self):
 
 class TensorRTMatMulQuantDequantDims4TransposeXYTest(
         TensorRTMatMulQuantDequantDims4Test):
+
     def set_params(self):
         self.transpose_x = True
         self.transpose_y = True
@@ -193,19 +205,20 @@ def set_params(self):
 
 
 class TensorRTMatMulQuantDequantDims3DynamicTest(QuantDequantTest):
+
     def setUp(self):
         self.set_params()
 
         def network():
-            self.data = fluid.data(
-                name='data', shape=[-1, 28, 28], dtype='float32')
+            self.data = fluid.data(name='data',
+                                   shape=[-1, 28, 28],
+                                   dtype='float32')
             self.label = fluid.data(name='label', shape=[1, 1], dtype='int64')
-            matmul_out = fluid.layers.matmul(
-                x=self.data,
-                y=self.data,
-                transpose_x=self.transpose_x,
-                transpose_y=self.transpose_y,
-                alpha=self.alpha)
+            matmul_out = fluid.layers.matmul(x=self.data,
+                                             y=self.data,
+                                             transpose_x=self.transpose_x,
+                                             transpose_y=self.transpose_y,
+                                             alpha=self.alpha)
             out = fluid.layers.batch_norm(matmul_out, is_test=True)
             fc_out = fluid.layers.fc(input=matmul_out,
                                      size=10,
@@ -236,9 +249,8 @@ def network():
         self.trt_parameters = TensorRTMatMulQuantDequantDims3DynamicTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False)
         self.dynamic_shape_params = TensorRTMatMulQuantDequantDims3DynamicTest.DynamicShapeParam(
-            {
-                'data': [1, 28, 28]
-            }, {'data': [4, 28, 28]}, {'data': [3, 28, 28]}, False)
+            {'data': [1, 28, 28]}, {'data': [4, 28, 28]}, {'data': [3, 28, 28]},
+            False)
         self.activation_quantize_type = 'moving_average_abs_max'
         self.weight_quantize_type = 'channel_wise_abs_max'
 
@@ -251,14 +263,17 @@ def test_check_output(self):
         #self.quant_dequant()
         if core.is_compiled_with_cuda():
             use_gpu = True
-            self.check_output_with_option(
-                use_gpu, atol=1, flatten=False, rtol=1e-1)
+            self.check_output_with_option(use_gpu,
+                                          atol=1,
+                                          flatten=False,
+                                          rtol=1e-1)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
 class TensorRTMatMulQuantDequantDims4TransposeXDynamicTest(
         TensorRTMatMulQuantDequantDims3DynamicTest):
+
     def set_params(self):
         self.transpose_x = True
         self.transpose_y = False
@@ -267,6 +282,7 @@ def set_params(self):
 
 class TensorRTMatMulQuantDequantDims4TransposeYDynamicTest(
         TensorRTMatMulQuantDequantDims3DynamicTest):
+
     def set_params(self):
         self.transpose_x = False
         self.transpose_y = True
@@ -275,6 +291,7 @@ def set_params(self):
 
 class TensorRTMatMulQuantDequantDims4TransposeXYDynamicTest(
         TensorRTMatMulQuantDequantDims3DynamicTest):
+
     def set_params(self):
         self.transpose_x = True
         self.transpose_y = True
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py
index ed993ffce7da7..1911155ca707d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py
@@ -131,8 +131,8 @@ class number
                  score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold',
                  nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta,
                  'normalized', normalized)
-        output, index, nms_rois_num = core.ops.multiclass_nms3(bboxes, scores,
-                                                               rois_num, *attrs)
+        output, index, nms_rois_num = core.ops.multiclass_nms3(
+            bboxes, scores, rois_num, *attrs)
         if not return_index:
             index = None
         return output, nms_rois_num, index
@@ -153,19 +153,18 @@ class number
                 dtype='int32')
             outputs['NmsRoisNum'] = nms_rois_num
 
-        helper.append_op(
-            type="multiclass_nms3",
-            inputs=inputs,
-            attrs={
-                'background_label': background_label,
-                'score_threshold': score_threshold,
-                'nms_top_k': nms_top_k,
-                'nms_threshold': nms_threshold,
-                'keep_top_k': keep_top_k,
-                'nms_eta': nms_eta,
-                'normalized': normalized
-            },
-            outputs=outputs)
+        helper.append_op(type="multiclass_nms3",
+                         inputs=inputs,
+                         attrs={
+                             'background_label': background_label,
+                             'score_threshold': score_threshold,
+                             'nms_top_k': nms_top_k,
+                             'nms_threshold': nms_threshold,
+                             'keep_top_k': keep_top_k,
+                             'nms_eta': nms_eta,
+                             'normalized': normalized
+                         },
+                         outputs=outputs)
         output.stop_gradient = True
         index.stop_gradient = True
         if not return_index:
@@ -177,9 +176,10 @@ class number
 
 
 class TensorRTMultiClassNMS3Test(InferencePassTest):
+
     def setUp(self):
         self.enable_trt = True
-        self.enable_tensorrt_oss = True
+        self.enable_tensorrt_varseqlen = True
         self.precision = AnalysisConfig.Precision.Float32
         self.serialize = False
         self.bs = 1
@@ -197,12 +197,12 @@ def setUp(self):
 
     def build(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            boxes = fluid.data(
-                name='bboxes', shape=[-1, self.num_boxes, 4], dtype='float32')
-            scores = fluid.data(
-                name='scores',
-                shape=[-1, self.num_classes, self.num_boxes],
-                dtype='float32')
+            boxes = fluid.data(name='bboxes',
+                               shape=[-1, self.num_boxes, 4],
+                               dtype='float32')
+            scores = fluid.data(name='scores',
+                                shape=[-1, self.num_classes, self.num_boxes],
+                                dtype='float32')
             multiclass_nms_out, _, _ = multiclass_nms(
                 bboxes=boxes,
                 scores=scores,
@@ -244,10 +244,12 @@ def run_test_all(self):
         }
         opt_shape = max_shape
         dynamic_shape_opt = [
-            None, InferencePassTest.DynamicShapeParam({
-                'bboxes': [1, 1, 4],
-                'scores': [1, 1, 1]
-            }, max_shape, opt_shape, False)
+            None,
+            InferencePassTest.DynamicShapeParam(
+                {
+                    'bboxes': [1, 1, 4],
+                    'scores': [1, 1, 1]
+                }, max_shape, opt_shape, False)
         ]
         for precision, serialize, dynamic_shape in itertools.product(
                 precision_opt, serialize_opt, dynamic_shape_opt):
@@ -281,18 +283,19 @@ def test_dynamic(self):
             'scores': [self.bs, self.num_classes, self.num_boxes],
         }
         opt_shape = max_shape
-        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({
-            'bboxes': [1, 1, 4],
-            'scores': [1, 1, 1]
-        }, max_shape, opt_shape, False)
+        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam(
+            {
+                'bboxes': [1, 1, 4],
+                'scores': [1, 1, 1]
+            }, max_shape, opt_shape, False)
         self.run_test()
 
     def test_background(self):
         self.background = 7
         self.run_test()
 
-    def test_disable_oss(self):
-        self.diable_tensorrt_oss = False
+    def test_disable_varseqlen(self):
+        self.enable_tensorrt_varseqlen = False  # overrides setUp's True
         self.run_test()
 
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py
index 045261fabb020..5e04241f14991 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py
@@ -23,9 +23,10 @@
 
 
 class TensorRTMultiClassNMSTest(InferencePassTest):
+
     def setUp(self):
         self.enable_trt = True
-        self.enable_tensorrt_oss = True
+        self.enable_tensorrt_varseqlen = True
         self.precision = AnalysisConfig.Precision.Float32
         self.serialize = False
         self.bs = 1
@@ -42,12 +43,12 @@ def setUp(self):
 
     def build(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            boxes = fluid.data(
-                name='bboxes', shape=[-1, self.num_boxes, 4], dtype='float32')
-            scores = fluid.data(
-                name='scores',
-                shape=[-1, self.num_classes, self.num_boxes],
-                dtype='float32')
+            boxes = fluid.data(name='bboxes',
+                               shape=[-1, self.num_boxes, 4],
+                               dtype='float32')
+            scores = fluid.data(name='scores',
+                                shape=[-1, self.num_classes, self.num_boxes],
+                                dtype='float32')
             multiclass_nms_out = fluid.layers.multiclass_nms(
                 bboxes=boxes,
                 scores=scores,
@@ -88,10 +89,12 @@ def run_test_all(self):
         }
         opt_shape = max_shape
         dynamic_shape_opt = [
-            None, InferencePassTest.DynamicShapeParam({
-                'bboxes': [1, 1, 4],
-                'scores': [1, 1, 1]
-            }, max_shape, opt_shape, False)
+            None,
+            InferencePassTest.DynamicShapeParam(
+                {
+                    'bboxes': [1, 1, 4],
+                    'scores': [1, 1, 1]
+                }, max_shape, opt_shape, False)
         ]
         for precision, serialize, dynamic_shape in itertools.product(
                 precision_opt, serialize_opt, dynamic_shape_opt):
@@ -125,18 +128,19 @@ def test_dynamic(self):
             'scores': [self.bs, self.num_classes, self.num_boxes],
         }
         opt_shape = max_shape
-        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({
-            'bboxes': [1, 1, 4],
-            'scores': [1, 1, 1]
-        }, max_shape, opt_shape, False)
+        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam(
+            {
+                'bboxes': [1, 1, 4],
+                'scores': [1, 1, 1]
+            }, max_shape, opt_shape, False)
         self.run_test()
 
     def test_background(self):
         self.background = 7
         self.run_test()
 
-    def test_disable_oss(self):
-        self.diable_tensorrt_oss = False
+    def test_disable_varseqlen(self):
+        self.enable_tensorrt_varseqlen = False  # overrides setUp's True
         self.run_test()
 
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py
index 04631534adaee..7aba95a03993f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py
@@ -24,6 +24,7 @@
 
 
 class TRTNearestInterpTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
 
@@ -53,7 +54,9 @@ def setUp(self):
                 self.channels
             ]
 
-        self.feeds = {'data': np.random.random(shape).astype('float32'), }
+        self.feeds = {
+            'data': np.random.random(shape).astype('float32'),
+        }
         self.enable_trt = True
         self.trt_parameters = TRTNearestInterpTest.TensorRTParam(
             1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False)
@@ -71,16 +74,14 @@ def set_params(self):
 
     def append_nearest_interp(self, data):
         if self.scale > 0.:
-            return fluid.layers.resize_nearest(
-                data,
-                scale=self.scale,
-                align_corners=self.align_corners,
-                data_format=self.data_layout)
-        return fluid.layers.resize_nearest(
-            data,
-            out_shape=self.resize_shape,
-            align_corners=self.align_corners,
-            data_format=self.data_layout)
+            return fluid.layers.resize_nearest(data,
+                                               scale=self.scale,
+                                               align_corners=self.align_corners,
+                                               data_format=self.data_layout)
+        return fluid.layers.resize_nearest(data,
+                                           out_shape=self.resize_shape,
+                                           align_corners=self.align_corners,
+                                           data_format=self.data_layout)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -91,6 +92,7 @@ def test_check_output(self):
 
 
 class TRTNearestInterpTest1(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
@@ -102,6 +104,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest2(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = 2.
@@ -113,6 +116,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest3(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = 0
@@ -124,6 +128,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest4(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
@@ -135,6 +140,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest5(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
@@ -146,6 +152,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest6(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = 2.
@@ -157,6 +164,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest7(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
@@ -168,6 +176,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest8(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
@@ -179,6 +188,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest9(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py
index 73c1c5d3618bb..1496b96ce2199 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py
@@ -25,6 +25,7 @@
 
 
 class TRTNearestInterpTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
 
@@ -54,7 +55,9 @@ def setUp(self):
                 self.channels
             ]
 
-        self.feeds = {'data': np.random.random(shape).astype('float32'), }
+        self.feeds = {
+            'data': np.random.random(shape).astype('float32'),
+        }
         self.enable_trt = True
         self.trt_parameters = TRTNearestInterpTest.TensorRTParam(
             1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False)
@@ -71,18 +74,16 @@ def set_params(self):
 
     def append_nearest_interp(self, data):
         if self.scale > 0.:
-            return F.interpolate(
-                data,
-                scale_factor=self.scale,
-                align_corners=self.align_corners,
-                mode='nearest',
-                data_format=self.data_layout)
-        return F.interpolate(
-            data,
-            size=self.resize_shape,
-            align_corners=self.align_corners,
-            mode='nearest',
-            data_format=self.data_layout)
+            return F.interpolate(data,
+                                 scale_factor=self.scale,
+                                 align_corners=self.align_corners,
+                                 mode='nearest',
+                                 data_format=self.data_layout)
+        return F.interpolate(data,
+                             size=self.resize_shape,
+                             align_corners=self.align_corners,
+                             mode='nearest',
+                             data_format=self.data_layout)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -93,6 +94,7 @@ def test_check_output(self):
 
 
 class TRTNearestInterpTest1(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = 2.
@@ -104,6 +106,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest2(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
@@ -115,6 +118,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest3(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
@@ -126,6 +130,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest4(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = 2.
@@ -137,6 +142,7 @@ def set_params(self):
 
 
 class TRTNearestInterpTest5(TRTNearestInterpTest):
+
     def set_params(self):
         self.bs = 4
         self.scale = -1
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py
index 060f6c6c5f044..0a61b83b8ce4a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py
@@ -23,10 +23,12 @@
 
 
 class PadOpTRTTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[1, 3, 128, 128], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[1, 3, 128, 128],
+                              dtype="float32")
             pad_out = fluid.layers.pad(x=data,
                                        paddings=[0, 0, 0, 0, 0, 1, 1, 2],
                                        pad_value=0.0)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py
index 6fbddcf5a1fc0..22f278d6d5d18 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py
@@ -26,6 +26,7 @@
 
 
 class TensorRTPool3dTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 1
         self.channel = 3
@@ -43,7 +44,8 @@ def setUp(self):
         self.serialize = False
         self.precision = AnalysisConfig.Precision.Float32
         self.feeds = {
-            'data': np.random.random(
+            'data':
+            np.random.random(
                 [self.bs, self.channel, self.depth, self.height,
                  self.width]).astype('float32'),
         }
@@ -61,15 +63,14 @@ def build_network(self):
                 name='data',
                 shape=[-1, self.channel, self.depth, self.height, self.width],
                 dtype='float32')
-            pool_out = fluid.layers.pool3d(
-                input=data,
-                pool_size=self.pool_size,
-                pool_type=self.pool_type,
-                pool_stride=self.pool_stride,
-                pool_padding=self.pool_padding,
-                global_pooling=self.global_pooling,
-                ceil_mode=self.ceil_mode,
-                exclusive=self.exclusive)
+            pool_out = fluid.layers.pool3d(input=data,
+                                           pool_size=self.pool_size,
+                                           pool_type=self.pool_type,
+                                           pool_stride=self.pool_stride,
+                                           pool_padding=self.pool_padding,
+                                           global_pooling=self.global_pooling,
+                                           ceil_mode=self.ceil_mode,
+                                           exclusive=self.exclusive)
             #out = fluid.layers.batch_norm(pool_out, is_test=True)
             self.fetch_list = [pool_out]
 
@@ -91,25 +92,27 @@ def test(self):
             AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
         ]
         serialize_options = [False, True]
-        dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
-            'data': [
-                self.bs, self.channel, self.depth // 2, self.height // 2,
-                self.width // 2
-            ]
-        }, {
-            'data':
-            [self.bs, self.channel, self.depth, self.height, self.width]
-        }, {
-            'data':
-            [self.bs, self.channel, self.depth, self.height, self.width]
-        }, False)
+        dynamic_shape_profile = InferencePassTest.DynamicShapeParam(
+            {
+                'data': [
+                    self.bs, self.channel, self.depth // 2, self.height // 2,
+                    self.width // 2
+                ]
+            }, {
+                'data':
+                [self.bs, self.channel, self.depth, self.height, self.width]
+            }, {
+                'data':
+                [self.bs, self.channel, self.depth, self.height, self.width]
+            }, False)
         dynamic_shape_options = [None, dynamic_shape_profile]
 
         for precision, serialize, dynamic_shape in itertools.product(
                 precision_options, serialize_options, dynamic_shape_options):
             is_dynamic = True if dynamic_shape_options is not None else False
-            with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
-                              format(precision, serialize, is_dynamic)):
+            with self.subTest(
+                    'Precision: {}, Serialize: {}, Dynamic: {}'.format(
+                        precision, serialize, is_dynamic)):
                 self.precision = precision
                 self.serialize = serialize
                 self.dynamic_shape_params = dynamic_shape
@@ -117,6 +120,7 @@ def test(self):
 
 
 class TensorRTAvgPool3dTest(TensorRTPool3dTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'avg'
@@ -128,6 +132,7 @@ def set_extra_config(self):
 
 
 class TensorRTGlobalPool3dTest(TensorRTPool3dTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -139,6 +144,7 @@ def set_extra_config(self):
 
 
 class TensorRTCeilPool3dTest(TensorRTPool3dTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -150,6 +156,7 @@ def set_extra_config(self):
 
 
 class TensorRTExclusivePool3dTest(TensorRTPool3dTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -161,6 +168,7 @@ def set_extra_config(self):
 
 
 class TensorRTSamePaddingPool3dTest(InferencePassTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -172,6 +180,7 @@ def set_extra_config(self):
 
 
 class TensorRTValidPaddingPool3dTest(InferencePassTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -183,6 +192,7 @@ def set_extra_config(self):
 
 
 class TensorRTAdaptiveAvgPool3DTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 1
         self.channel = 3
@@ -193,7 +203,8 @@ def setUp(self):
         self.serialize = False
         self.precision = AnalysisConfig.Precision.Float32
         self.feeds = {
-            'data': np.random.random(
+            'data':
+            np.random.random(
                 [self.bs, self.channel, self.depth, self.height,
                  self.width]).astype('float32'),
         }
@@ -230,25 +241,27 @@ def test(self):
             AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
         ]
         serialize_options = [False, True]
-        dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
-            'data': [
-                self.bs, self.channel, self.depth // 2, self.height // 2,
-                self.width // 2
-            ]
-        }, {
-            'data':
-            [self.bs, self.channel, self.depth, self.height, self.width]
-        }, {
-            'data':
-            [self.bs, self.channel, self.depth, self.height, self.width]
-        }, False)
+        dynamic_shape_profile = InferencePassTest.DynamicShapeParam(
+            {
+                'data': [
+                    self.bs, self.channel, self.depth // 2, self.height // 2,
+                    self.width // 2
+                ]
+            }, {
+                'data':
+                [self.bs, self.channel, self.depth, self.height, self.width]
+            }, {
+                'data':
+                [self.bs, self.channel, self.depth, self.height, self.width]
+            }, False)
         dynamic_shape_options = [None, dynamic_shape_profile]
 
         for precision, serialize, dynamic_shape in itertools.product(
                 precision_options, serialize_options, dynamic_shape_options):
             is_dynamic = True if dynamic_shape_options is not None else False
-            with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
-                              format(precision, serialize, is_dynamic)):
+            with self.subTest(
+                    'Precision: {}, Serialize: {}, Dynamic: {}'.format(
+                        precision, serialize, is_dynamic)):
                 self.precision = precision
                 self.serialize = serialize
                 self.dynamic_shape_params = dynamic_shape
@@ -256,6 +269,7 @@ def test(self):
 
 
 class TensorRTAdaptiveMaxPool3DTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 1
         self.channel = 3
@@ -266,7 +280,8 @@ def setUp(self):
         self.serialize = False
         self.precision = AnalysisConfig.Precision.Float32
         self.feeds = {
-            'data': np.random.random(
+            'data':
+            np.random.random(
                 [self.bs, self.channel, self.depth, self.height,
                  self.width]).astype('float32'),
         }
@@ -303,25 +318,27 @@ def test(self):
             AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
         ]
         serialize_options = [False, True]
-        dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
-            'data': [
-                self.bs, self.channel, self.depth // 2, self.height // 2,
-                self.width // 2
-            ]
-        }, {
-            'data':
-            [self.bs, self.channel, self.depth, self.height, self.width]
-        }, {
-            'data':
-            [self.bs, self.channel, self.depth, self.height, self.width]
-        }, False)
+        dynamic_shape_profile = InferencePassTest.DynamicShapeParam(
+            {
+                'data': [
+                    self.bs, self.channel, self.depth // 2, self.height // 2,
+                    self.width // 2
+                ]
+            }, {
+                'data':
+                [self.bs, self.channel, self.depth, self.height, self.width]
+            }, {
+                'data':
+                [self.bs, self.channel, self.depth, self.height, self.width]
+            }, False)
         dynamic_shape_options = [None, dynamic_shape_profile]
 
         for precision, serialize, dynamic_shape in itertools.product(
                 precision_options, serialize_options, dynamic_shape_options):
             is_dynamic = True if dynamic_shape_options is not None else False
-            with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
-                              format(precision, serialize, is_dynamic)):
+            with self.subTest(
+                    'Precision: {}, Serialize: {}, Dynamic: {}'.format(
+                        precision, serialize, is_dynamic)):
                 self.precision = precision
                 self.serialize = serialize
                 self.dynamic_shape_params = dynamic_shape
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py
index d71937f986e51..3812642d2a5e4 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py
@@ -25,6 +25,7 @@
 
 
 class TensorRTPoolTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 1
         self.channel = 2
@@ -55,19 +56,17 @@ def build_network(self):
             1 << 30, self.bs, 0, self.precision, self.serialize, False)
 
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name='data',
-                shape=[-1, self.channel, self.height, self.width],
-                dtype='float32')
-            pool_out = fluid.layers.pool2d(
-                input=data,
-                pool_size=self.pool_size,
-                pool_type=self.pool_type,
-                pool_stride=self.pool_stride,
-                pool_padding=self.pool_padding,
-                global_pooling=self.global_pooling,
-                ceil_mode=self.ceil_mode,
-                exclusive=self.exclusive)
+            data = fluid.data(name='data',
+                              shape=[-1, self.channel, self.height, self.width],
+                              dtype='float32')
+            pool_out = fluid.layers.pool2d(input=data,
+                                           pool_size=self.pool_size,
+                                           pool_type=self.pool_type,
+                                           pool_stride=self.pool_stride,
+                                           pool_padding=self.pool_padding,
+                                           global_pooling=self.global_pooling,
+                                           ceil_mode=self.ceil_mode,
+                                           exclusive=self.exclusive)
             out = fluid.layers.batch_norm(pool_out, is_test=True)
             self.fetch_list = [out]
 
@@ -100,8 +99,9 @@ def test(self):
         for precision, serialize, dynamic_shape in itertools.product(
                 precision_options, serialize_options, dynamic_shape_options):
             is_dynamic = True if dynamic_shape_options is not None else False
-            with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
-                              format(precision, serialize, is_dynamic)):
+            with self.subTest(
+                    'Precision: {}, Serialize: {}, Dynamic: {}'.format(
+                        precision, serialize, is_dynamic)):
                 self.precision = precision
                 self.serialize = serialize
                 self.dynamic_shape = dynamic_shape
@@ -109,6 +109,7 @@ def test(self):
 
 
 class TensorRTAvgPoolTest(TensorRTPoolTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'avg'
@@ -120,6 +121,7 @@ def set_extra_config(self):
 
 
 class TensorRTAvgCeilPoolTest(TensorRTPoolTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'avg'
@@ -131,6 +133,7 @@ def set_extra_config(self):
 
 
 class TensorRTGlobalPoolTest(TensorRTPoolTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -142,6 +145,7 @@ def set_extra_config(self):
 
 
 class TensorRTCeilPoolTest(TensorRTPoolTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -153,6 +157,7 @@ def set_extra_config(self):
 
 
 class TensorRTExclusivePoolTest(TensorRTPoolTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -164,6 +169,7 @@ def set_extra_config(self):
 
 
 class TensorRTSamePaddingPoolTest(InferencePassTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
@@ -175,6 +181,7 @@ def set_extra_config(self):
 
 
 class TensorRTValidPaddingPoolTest(InferencePassTest):
+
     def set_extra_config(self):
         self.pool_size = 2
         self.pool_type = 'max'
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_mean_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_mean_op.py
index 7ccbe673fd601..1086e1428e09f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_mean_op.py
@@ -24,12 +24,15 @@
 
 
 class TRTReduceMeanTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, -1, -1], dtype="float32")
-            reduce_mean = fluid.layers.reduce_mean(
-                data, dim=[2, -1], keep_dim=True)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, -1, -1],
+                              dtype="float32")
+            reduce_mean = fluid.layers.reduce_mean(data,
+                                                   dim=[2, -1],
+                                                   keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
         self.feeds = {
@@ -39,9 +42,9 @@ def setUp(self):
         self.trt_parameters = TRTReduceMeanTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.fetch_list = [out]
-        self.dynamic_shape_params = TRTReduceMeanTest.DynamicShapeParam({
-            'data': [1, 3, 16, 16]
-        }, {'data': [3, 3, 56, 56]}, {'data': [3, 3, 56, 56]}, False)
+        self.dynamic_shape_params = TRTReduceMeanTest.DynamicShapeParam(
+            {'data': [1, 3, 16, 16]}, {'data': [3, 3, 56, 56]},
+            {'data': [3, 3, 56, 56]}, False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -52,10 +55,12 @@ def test_check_output(self):
 
 
 class TRTReduceMeanAllNoBatchTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, -1, -1], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, -1, -1],
+                              dtype="float32")
             reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
@@ -67,9 +72,8 @@ def setUp(self):
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.fetch_list = [out]
         self.dynamic_shape_params = TRTReduceMeanAllNoBatchTest.DynamicShapeParam(
-            {
-                'data': [1, 3, 16, 16]
-            }, {'data': [3, 3, 56, 56]}, {'data': [3, 3, 56, 56]}, False)
+            {'data': [1, 3, 16, 16]}, {'data': [3, 3, 56, 56]},
+            {'data': [3, 3, 56, 56]}, False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -80,12 +84,15 @@ def test_check_output(self):
 
 
 class TRTReduceMeanTestFP16(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, -1, -1], dtype="float32")
-            reduce_mean = fluid.layers.reduce_mean(
-                data, dim=[2, -1], keep_dim=True)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, -1, -1],
+                              dtype="float32")
+            reduce_mean = fluid.layers.reduce_mean(data,
+                                                   dim=[2, -1],
+                                                   keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
         self.feeds = {
@@ -95,9 +102,9 @@ def setUp(self):
         self.trt_parameters = TRTReduceMeanTestFP16.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
         self.fetch_list = [out]
-        self.dynamic_shape_params = TRTReduceMeanTestFP16.DynamicShapeParam({
-            'data': [1, 3, 16, 16]
-        }, {'data': [3, 3, 56, 56]}, {'data': [3, 3, 56, 56]}, False)
+        self.dynamic_shape_params = TRTReduceMeanTestFP16.DynamicShapeParam(
+            {'data': [1, 3, 16, 16]}, {'data': [3, 3, 56, 56]},
+            {'data': [3, 3, 56, 56]}, False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -108,10 +115,12 @@ def test_check_output(self):
 
 
 class TRTReduceMeanAllTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 56, 56], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 56, 56],
+                              dtype="float32")
             reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
@@ -122,9 +131,9 @@ def setUp(self):
         self.trt_parameters = TRTReduceMeanAllTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.fetch_list = [out]
-        self.dynamic_shape_params = TRTReduceMeanAllTest.DynamicShapeParam({
-            'data': [1, 3, 56, 56]
-        }, {'data': [3, 3, 56, 56]}, {'data': [3, 3, 56, 56]}, False)
+        self.dynamic_shape_params = TRTReduceMeanAllTest.DynamicShapeParam(
+            {'data': [1, 3, 56, 56]}, {'data': [3, 3, 56, 56]},
+            {'data': [3, 3, 56, 56]}, False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -135,12 +144,15 @@ def test_check_output(self):
 
 
 class TRTReduceMeanTestStatic(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[3, 3, 56, 56], dtype="float32")
-            reduce_mean = fluid.layers.reduce_mean(
-                data, dim=[2, -1], keep_dim=True)
+            data = fluid.data(name="data",
+                              shape=[3, 3, 56, 56],
+                              dtype="float32")
+            reduce_mean = fluid.layers.reduce_mean(data,
+                                                   dim=[2, -1],
+                                                   keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
         self.feeds = {
@@ -160,10 +172,12 @@ def test_check_output(self):
 
 
 class TRTReduceMeanStaticAllTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[4, 3, 56, 56], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[4, 3, 56, 56],
+                              dtype="float32")
             reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
@@ -184,10 +198,12 @@ def test_check_output(self):
 
 
 class TRTReduceMeanStaticFP16(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[4, 3, 56, 56], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[4, 3, 56, 56],
+                              dtype="float32")
             reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
@@ -208,10 +224,12 @@ def test_check_output(self):
 
 
 class TRTReduceMeanFP16Static(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[4, 3, 56, 56], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[4, 3, 56, 56],
+                              dtype="float32")
             reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
             out = fluid.layers.batch_norm(reduce_mean, is_test=True)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py
index fbe944cd7f30d..2e413bde5f700 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py
@@ -24,12 +24,15 @@
 
 
 class TRTReduceSumTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 10, 192], dtype="float32")
-            reduce_sum = fluid.layers.reduce_sum(
-                data, dim=[2, -1], keep_dim=True)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 10, 192],
+                              dtype="float32")
+            reduce_sum = fluid.layers.reduce_sum(data,
+                                                 dim=[2, -1],
+                                                 keep_dim=True)
             out = fluid.layers.batch_norm(reduce_sum, is_test=True)
 
         self.feeds = {
@@ -39,9 +42,9 @@ def setUp(self):
         self.trt_parameters = TRTReduceSumTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.fetch_list = [out]
-        self.dynamic_shape_params = TRTReduceSumTest.DynamicShapeParam({
-            'data': [1, 3, 8, 8]
-        }, {'data': [3, 3, 10, 192]}, {'data': [3, 3, 10, 192]}, False)
+        self.dynamic_shape_params = TRTReduceSumTest.DynamicShapeParam(
+            {'data': [1, 3, 8, 8]}, {'data': [3, 3, 10, 192]},
+            {'data': [3, 3, 10, 192]}, False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -52,10 +55,12 @@ def test_check_output(self):
 
 
 class TRTReduceSumAllTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 10, 192], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 10, 192],
+                              dtype="float32")
             reduce_sum = fluid.layers.reduce_sum(data, keep_dim=True)
             out = fluid.layers.batch_norm(reduce_sum, is_test=True)
 
@@ -66,9 +71,9 @@ def setUp(self):
         self.trt_parameters = TRTReduceSumAllTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.fetch_list = [out]
-        self.dynamic_shape_params = TRTReduceSumAllTest.DynamicShapeParam({
-            'data': [1, 3, 8, 8]
-        }, {'data': [3, 3, 10, 192]}, {'data': [3, 3, 10, 192]}, False)
+        self.dynamic_shape_params = TRTReduceSumAllTest.DynamicShapeParam(
+            {'data': [1, 3, 8, 8]}, {'data': [3, 3, 10, 192]},
+            {'data': [3, 3, 10, 192]}, False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py
index ecfc5c9dac064..d2dca92345ad3 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py
@@ -64,14 +64,15 @@ def teller1(program_config, predictor_config):
         self.add_ignore_check_case(
             teller1,
             IgnoreReasons.PASS_ACCURACY_ERROR,
-            "The pass error on TRT while shape of bias is not [out_size].", )
+            "The pass error on TRT while shape of bias is not [out_size].",
+        )
 
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of reshape2
         reshape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=10), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=10),
+                     min_size=2,
+                     max_size=2))
         x_shape = reshape + [1, 1]
 
         # 2. Generate attr:transpose_X/transpose_Y/alpha of matmul
@@ -81,16 +82,18 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of matmul
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = x_shape[1]
 
         # 4. Generate legal attr:axis of elementwise_add
         axis = draw(st.integers(min_value=-1, max_value=1))
         if axis == 0:
             axis = -1
-        bias_shape = [y_shape[1], ]
+        bias_shape = [
+            y_shape[1],
+        ]
         # if axis == -1:
         #     if draw(st.booleans()):
         #         bias_shape = [y_shape[1], ]
@@ -99,14 +102,21 @@ def sample_program_config(self, draw):
 
         reshape2_op = OpConfig(
             "reshape2",
-            inputs={"X": ["reshape2_x"], },
+            inputs={
+                "X": ["reshape2_x"],
+            },
             shape=reshape,
-            outputs={"Out": ["reshape2_out"],
-                     "XShape": ["xshape"]}, )
+            outputs={
+                "Out": ["reshape2_out"],
+                "XShape": ["xshape"]
+            },
+        )
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["reshape2_out"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["reshape2_out"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             transpose_X=transpose_X,
@@ -116,14 +126,18 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
+            fused_transpose_Out=[],
+        )
 
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["matmul_out"],
-                    "Y": ["bias"]},
+            inputs={
+                "X": ["matmul_out"],
+                "Y": ["bias"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=axis, )
+            axis=axis,
+        )
 
         ops = [reshape2_op, matmul_op, add_op]
 
@@ -133,16 +147,18 @@ def sample_program_config(self, draw):
                 "matmul_y": TensorConfig(shape=y_shape),
                 "bias": TensorConfig(shape=bias_shape),
             },
-            inputs={"reshape2_x": TensorConfig(shape=x_shape), },
-            outputs=ops[-1].outputs["Out"], )
+            inputs={
+                "reshape2_x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Out"],
+        )
 
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=50,
-            passes=["trt_reshape2_matmul_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=50,
+                            passes=["trt_reshape2_matmul_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py
index 0522df3a9219d..8fcf993e2711d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py
@@ -24,6 +24,7 @@
 
 
 class TRTReshapeTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 1
         self.input_shape = [16, 3, 8]
@@ -33,8 +34,9 @@ def setUp(self):
             self.input_shape[2]
         ]
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name='data', shape=self.data_shape, dtype='float32')
+            data = fluid.data(name='data',
+                              shape=self.data_shape,
+                              dtype='float32')
             reshape_out = self.append_reshape(data, self.reshape)
             out = fluid.layers.batch_norm(reshape_out, is_test=True)
         self.feeds = {
@@ -57,6 +59,7 @@ def test_check_output(self):
 
 
 class TRTReshapeTest1(TRTReshapeTest):
+
     def setUp(self):
         self.bs = 2
         self.input_shape = [23, 13, 12]
@@ -66,8 +69,9 @@ def setUp(self):
             self.input_shape[2]
         ]
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name='data', shape=self.data_shape, dtype='float32')
+            data = fluid.data(name='data',
+                              shape=self.data_shape,
+                              dtype='float32')
             reshape_out = self.append_reshape(data, self.reshape)
             out = fluid.layers.batch_norm(reshape_out, is_test=True)
         self.feeds = {
@@ -80,6 +84,7 @@ def setUp(self):
 
 
 class TRTReshapeTest2(TRTReshapeTest):
+
     def setUp(self):
         self.bs = 2
         self.input_shape = [23, 13, 12]
@@ -89,8 +94,9 @@ def setUp(self):
             self.input_shape[2]
         ]
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name='data', shape=self.data_shape, dtype='float32')
+            data = fluid.data(name='data',
+                              shape=self.data_shape,
+                              dtype='float32')
             reshape_out = fluid.layers.reshape(x=data, shape=self.reshape)
             out = fluid.layers.batch_norm(reshape_out, is_test=True)
         self.feeds = {
@@ -103,6 +109,7 @@ def setUp(self):
 
 
 class TRTReshapeTest3(TRTReshapeTest):
+
     def setUp(self):
         self.bs = 1
         self.input_shape = [7, 16, 27]
@@ -112,8 +119,9 @@ def setUp(self):
             self.input_shape[2]
         ]
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name='data', shape=self.data_shape, dtype='float32')
+            data = fluid.data(name='data',
+                              shape=self.data_shape,
+                              dtype='float32')
             bn_out = fluid.layers.batch_norm(data, is_test=True)
             out = self.append_reshape(bn_out, self.reshape)
         self.feeds = {
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py
index 37f17661dbc7c..f644a0954e434 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py
@@ -24,6 +24,7 @@
 
 
 class TRTRoiAlignTest(InferencePassTest):
+
     def setUp(self):
         self.bs = 2
         self.num_rois = 4
@@ -41,8 +42,10 @@ def build(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data_shape = [-1, self.channel, self.height, self.width]
             data = fluid.data(name='data', shape=data_shape, dtype='float32')
-            rois = fluid.data(
-                name='rois', shape=[-1, 4], dtype='float32', lod_level=1)
+            rois = fluid.data(name='rois',
+                              shape=[-1, 4],
+                              dtype='float32',
+                              lod_level=1)
             roi_align_out = fluid.layers.roi_align(data, rois)
             out = fluid.layers.batch_norm(roi_align_out, is_test=True)
 
@@ -75,11 +78,13 @@ def set_dynamic(self):
             self.bs, self.channel, self.height // 2, self.width // 2
         ]
         min_shape_spec['rois'] = [1, 4]
-        max_shape_spec[
-            'data'] = [self.bs, self.channel, self.height * 2, self.width * 2]
+        max_shape_spec['data'] = [
+            self.bs, self.channel, self.height * 2, self.width * 2
+        ]
         max_shape_spec['rois'] = [self.bs * self.num_rois, 4]
-        opt_shape_spec[
-            'data'] = [self.bs, self.channel, self.height, self.width]
+        opt_shape_spec['data'] = [
+            self.bs, self.channel, self.height, self.width
+        ]
         opt_shape_spec['rois'] = [self.bs * self.num_rois, 4]
 
         self.dynamic_shape_params = InferencePassTest.DynamicShapeParam(
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
index 4530e04d8de63..752fe3ac14699 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py
@@ -24,21 +24,26 @@
 
 
 class TRTScaleTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[-1, 512], dtype="float32")
             scale_out = self.append_scale(data)
             out = fluid.layers.batch_norm(scale_out, is_test=True)
 
-        self.feeds = {"data": np.random.random([1, 512]).astype("float32"), }
+        self.feeds = {
+            "data": np.random.random([1, 512]).astype("float32"),
+        }
         self.enable_trt = True
         self.trt_parameters = TRTScaleTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.fetch_list = [out]
 
     def append_scale(self, data):
-        return fluid.layers.scale(
-            x=data, scale=2.0, bias=-1.0, bias_after_scale=False)
+        return fluid.layers.scale(x=data,
+                                  scale=2.0,
+                                  bias=-1.0,
+                                  bias_after_scale=False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -49,10 +54,12 @@ def test_check_output(self):
 
 
 class TRTScaleShape2Test(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 512, 512], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 512, 512],
+                              dtype="float32")
             scale_out = self.append_scale(data)
             out = fluid.layers.batch_norm(scale_out, is_test=True)
 
@@ -65,8 +72,10 @@ def setUp(self):
         self.fetch_list = [out]
 
     def append_scale(self, data):
-        return fluid.layers.scale(
-            x=data, scale=2.0, bias=-1.0, bias_after_scale=False)
+        return fluid.layers.scale(x=data,
+                                  scale=2.0,
+                                  bias=-1.0,
+                                  bias_after_scale=False)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
index e9c304496afcc..ced6c706592be 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
@@ -22,10 +22,12 @@
 
 
 class ShuffleChannelFuseTRTPassTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
             reshape1 = fluid.layers.reshape(x=data, shape=[-1, 2, 3, 64, 64])
             trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4])
             reshape2 = fluid.layers.reshape(x=trans, shape=[-1, 6, 64, 64])
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_dynamic_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_dynamic_plugin.py
index 7b4b84724e8b3..531b4e3df4588 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_dynamic_plugin.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_dynamic_plugin.py
@@ -24,6 +24,7 @@
 
 #normal starts && ends
 class SlicePluginTRTDynamicTest(InferencePassTest):
+
     def setUpSliceParams(self):
         self.params_axes = [1, 3]
         self.params_starts = [0, 1]
@@ -34,9 +35,8 @@ def setUpTensorRTParams(self):
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
         self.enable_trt = True
         self.dynamic_shape_params = SlicePluginTRTDynamicTest.DynamicShapeParam(
-            {
-                'data': [1, 1, 1, 1]
-            }, {'data': [8, 8, 8, 8]}, {'data': [8, 8, 8, 8]}, False)
+            {'data': [1, 1, 1, 1]}, {'data': [8, 8, 8, 8]},
+            {'data': [8, 8, 8, 8]}, False)
 
     def setUp(self):
         self.setUpSliceParams()
@@ -46,8 +46,10 @@ def setUp(self):
             axes = self.params_axes
             starts = self.params_starts
             ends = self.params_ends
-            slice_out = fluid.layers.slice(
-                data, axes=axes, starts=starts, ends=ends)
+            slice_out = fluid.layers.slice(data,
+                                           axes=axes,
+                                           starts=starts,
+                                           ends=ends)
 
         self.feeds = {
             "data": np.random.random((3, 3, 3, 3)).astype("float32"),
@@ -66,6 +68,7 @@ def test_check_output(self):
 
 
 class SlicePluginTRTDynamicBoundTest(SlicePluginTRTDynamicTest):
+
     def setUpSliceParams(self):
         self.params_axes = [1, 3]
         self.params_starts = [0, 1]
@@ -76,12 +79,12 @@ def setUpTensorRTParams(self):
             1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
         self.enable_trt = True
         self.dynamic_shape_params = SlicePluginTRTDynamicBoundTest.DynamicShapeParam(
-            {
-                'data': [1, 1, 1, 1]
-            }, {'data': [8, 8, 8, 8]}, {'data': [8, 8, 8, 8]}, False)
+            {'data': [1, 1, 1, 1]}, {'data': [8, 8, 8, 8]},
+            {'data': [8, 8, 8, 8]}, False)
 
 
 class SlicePluginTRTDynamicNegativeBoundTest(SlicePluginTRTDynamicTest):
+
     def setUpSliceParams(self):
         self.params_axes = [1, 3]
         self.params_starts = [-5, 1]
@@ -92,9 +95,8 @@ def setUpTensorRTParams(self):
             1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
         self.enable_trt = True
         self.dynamic_shape_params = SlicePluginTRTDynamicNegativeBoundTest.DynamicShapeParam(
-            {
-                'data': [1, 1, 1, 1]
-            }, {'data': [8, 8, 8, 8]}, {'data': [8, 8, 8, 8]}, False)
+            {'data': [1, 1, 1, 1]}, {'data': [8, 8, 8, 8]},
+            {'data': [8, 8, 8, 8]}, False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py
index 98232838ee08b..a1249c04c2736 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py
@@ -24,6 +24,7 @@
 
 #normal starts && ends
 class SlicePluginTRTTest(InferencePassTest):
+
     def setUpSliceParams(self):
         self.params_axes = [1, 3]
         self.params_starts = [0, 1]
@@ -42,8 +43,10 @@ def setUp(self):
             axes = self.params_axes
             starts = self.params_starts
             ends = self.params_ends
-            slice_out = fluid.layers.slice(
-                data, axes=axes, starts=starts, ends=ends)
+            slice_out = fluid.layers.slice(data,
+                                           axes=axes,
+                                           starts=starts,
+                                           ends=ends)
             out = fluid.layers.batch_norm(slice_out, is_test=True)
 
         self.feeds = {
@@ -64,6 +67,7 @@ def test_check_output(self):
 
 #negative starts && ends
 class SlicePluginTRTTestNegativeStartsAndEnds(SlicePluginTRTTest):
+
     def setUpSliceParams(self):
         self.params_axes = [2, 3]
         self.params_starts = [-3, -2]
@@ -72,6 +76,7 @@ def setUpSliceParams(self):
 
 #exceeded bound starts && ends
 class SlicePluginTRTTestStartsAndEndsBoundCheck(SlicePluginTRTTest):
+
     def setUpSliceParams(self):
         self.params_axes = [2, 3]
         self.params_starts = [-5, -2]
@@ -80,6 +85,7 @@ def setUpSliceParams(self):
 
 #fp16
 class SlicePluginTRTTestFp16(SlicePluginTRTTest):
+
     def setUpTensorRTParams(self):
         self.trt_parameters = SlicePluginTRTTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
@@ -87,6 +93,7 @@ def setUpTensorRTParams(self):
 
 
 class StaticSlicePluginTRTTestFp16(SlicePluginTRTTest):
+
     def setUpTensorRTParams(self):
         self.trt_parameters = SlicePluginTRTTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Half, True, False)
@@ -94,6 +101,7 @@ def setUpTensorRTParams(self):
 
 
 class StaticSlicePluginTRTTestFp32(SlicePluginTRTTest):
+
     def setUpTensorRTParams(self):
         self.trt_parameters = SlicePluginTRTTest.TensorRTParam(
             1 << 30, 32, 1, AnalysisConfig.Precision.Float32, True, False)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py
index d2791737a1cbf..a52dd0aed8465 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py
@@ -64,14 +64,15 @@ def teller1(program_config, predictor_config):
         self.add_ignore_check_case(
             teller1,
             IgnoreReasons.PASS_ACCURACY_ERROR,
-            "The pass error on TRT while shape of bias is not [out_size].", )
+            "The pass error on TRT while shape of bias is not [out_size].",
+        )
 
     def sample_program_config(self, draw):
         # 1. Generate shape of input:X of squeeze2
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         # axes of squeeze2 == [2, 3]
         x_shape += [1, 1]
         axes = [2, 3]
@@ -83,16 +84,18 @@ def sample_program_config(self, draw):
 
         # 3. Generate legal shape of input:Y of matmul
         y_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=8), min_size=2, max_size=2))
+            st.lists(st.integers(min_value=1, max_value=8),
+                     min_size=2,
+                     max_size=2))
         y_shape[0] = x_shape[1]
 
         # 4. Generate legal attr:axis of elementwise_add
         axis = draw(st.integers(min_value=-1, max_value=1))
         if axis == 0:
             axis = -1
-        bias_shape = [y_shape[1], ]
+        bias_shape = [
+            y_shape[1],
+        ]
         # if axis == -1:
         #     if draw(st.booleans()):
         #         bias_shape = [y_shape[1], ]
@@ -101,14 +104,21 @@ def sample_program_config(self, draw):
 
         squeeze2_op = OpConfig(
             "squeeze2",
-            inputs={"X": ["squeeze2_x"], },
+            inputs={
+                "X": ["squeeze2_x"],
+            },
             axes=axes,
-            outputs={"Out": ["squeeze2_out"],
-                     "XShape": ["xshape"]}, )
+            outputs={
+                "Out": ["squeeze2_out"],
+                "XShape": ["xshape"]
+            },
+        )
         matmul_op = OpConfig(
             "matmul",
-            inputs={"X": ["squeeze2_out"],
-                    "Y": ["matmul_y"]},
+            inputs={
+                "X": ["squeeze2_out"],
+                "Y": ["matmul_y"]
+            },
             outputs={"Out": ["matmul_out"]},
             alpha=alpha,
             transpose_X=transpose_X,
@@ -118,14 +128,18 @@ def sample_program_config(self, draw):
             fused_transpose_X=[],
             fused_transpose_Y=[],
             fused_reshape_Out=[],
-            fused_transpose_Out=[], )
+            fused_transpose_Out=[],
+        )
 
         add_op = OpConfig(
             "elementwise_add",
-            inputs={"X": ["matmul_out"],
-                    "Y": ["bias"]},
+            inputs={
+                "X": ["matmul_out"],
+                "Y": ["bias"]
+            },
             outputs={"Out": ["add_out"]},
-            axis=axis, )
+            axis=axis,
+        )
 
         ops = [squeeze2_op, matmul_op, add_op]
         program_config = ProgramConfig(
@@ -134,16 +148,18 @@ def sample_program_config(self, draw):
                 "matmul_y": TensorConfig(shape=y_shape),
                 "bias": TensorConfig(shape=bias_shape),
             },
-            inputs={"squeeze2_x": TensorConfig(shape=x_shape), },
-            outputs=ops[-1].outputs["Out"], )
+            inputs={
+                "squeeze2_x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Out"],
+        )
 
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=50,
-            passes=["trt_squeeze2_matmul_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=50,
+                            passes=["trt_squeeze2_matmul_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
index 23a3d19140179..2472ff027e3cc 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
@@ -24,10 +24,12 @@
 
 
 class TensorRTSubgraphPassFcTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
             fc_out = fluid.layers.fc(input=[data], act=None, size=1000)
             reshape_out = fluid.layers.reshape(x=fc_out, shape=[1, 1000])
         self.feeds = {
@@ -48,12 +50,15 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassConcatTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data1 = fluid.data(
-                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
-            data2 = fluid.data(
-                name="data2", shape=[-1, 3, 64, 64], dtype="float32")
+            data1 = fluid.data(name="data1",
+                               shape=[-1, 3, 64, 64],
+                               dtype="float32")
+            data2 = fluid.data(name="data2",
+                               shape=[-1, 3, 64, 64],
+                               dtype="float32")
             concat_out = fluid.layers.concat([data1, data2], axis=2)
             out = fluid.layers.batch_norm(concat_out, is_test=True)
         self.feeds = {
@@ -74,10 +79,12 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassSplitTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 64, 64],
+                              dtype="float32")
             split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
             out = fluid.layers.batch_norm(split_out[0], is_test=True)
         self.feeds = {
@@ -97,10 +104,12 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassSplitSerializeTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 64, 64],
+                              dtype="float32")
             split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
             out = fluid.layers.batch_norm(split_out[0], is_test=True)
         self.feeds = {
@@ -122,10 +131,12 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 64, 64],
+                              dtype="float32")
             split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
             out = fluid.layers.batch_norm(split_out[0], is_test=True)
         self.feeds = {
@@ -135,9 +146,8 @@ def setUp(self):
         self.trt_parameters = TensorRTSubgraphPassSplitTest.TensorRTParam(
             1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
         self.dynamic_shape_params = TensorRTSubgraphPassDynamicSplitFp16SerializeTest.DynamicShapeParam(
-            {
-                'data': [1, 3, 8, 64]
-            }, {'data': [1, 3, 512, 64]}, {'data': [1, 3, 256, 64]}, False)
+            {'data': [1, 3, 8, 64]}, {'data': [1, 3, 512, 64]},
+            {'data': [1, 3, 256, 64]}, False)
         self.fetch_list = [out]
 
     def test_check_output(self):
@@ -151,18 +161,21 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassInstanceNormTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 64, 64],
+                              dtype="float32")
             param_attr = fluid.ParamAttr(
                 name='instance_norm_w',
                 initializer=fluid.initializer.Constant(value=1.0))
             bias_attr = fluid.ParamAttr(
                 name='instance_norm_b',
                 initializer=fluid.initializer.Constant(value=0.0))
-            out = fluid.layers.instance_norm(
-                input=data, param_attr=param_attr, bias_attr=bias_attr)
+            out = fluid.layers.instance_norm(input=data,
+                                             param_attr=param_attr,
+                                             bias_attr=bias_attr)
         self.feeds = {
             "data": np.random.random([1, 3, 64, 64]).astype("float32"),
         }
@@ -180,10 +193,12 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassTransposeTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
             transpose_out = self.append_transpose(data)
             out = fluid.layers.batch_norm(transpose_out, is_test=True)
         self.feeds = {
@@ -206,13 +221,15 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassLayerNormTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 64, 64], dtype="float32")
-            out = fluid.layers.layer_norm(
-                data, begin_norm_axis=self.begin_norm_axis)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 64, 64],
+                              dtype="float32")
+            out = fluid.layers.layer_norm(data,
+                                          begin_norm_axis=self.begin_norm_axis)
         self.feeds = {
             "data": np.random.random([1, 3, 64, 64]).astype("float32"),
         }
@@ -233,13 +250,15 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassLayerNormDynamicTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 3, 64, 64], dtype="float32")
-            out = fluid.layers.layer_norm(
-                data, begin_norm_axis=self.begin_norm_axis)
+            data = fluid.data(name="data",
+                              shape=[-1, 3, 64, 64],
+                              dtype="float32")
+            out = fluid.layers.layer_norm(data,
+                                          begin_norm_axis=self.begin_norm_axis)
         self.feeds = {
             "data": np.random.random([1, 3, 64, 64]).astype("float32"),
         }
@@ -253,7 +272,11 @@ def set_trt_params(self):
         self.dynamic_shape_params = TensorRTSubgraphPassLayerNormDynamicTest.DynamicShapeParam(
             {
                 'data': [1, 3, 64, 64],
-            }, {'data': [8, 8, 64, 64], }, {'data': [4, 4, 64, 64], }, False)
+            }, {
+                'data': [8, 8, 64, 64],
+            }, {
+                'data': [4, 4, 64, 64],
+            }, False)
 
     def set_params(self):
         self.begin_norm_axis = 2
@@ -272,6 +295,7 @@ def test_check_output(self):
 
 class TensorRTSubgraphPassLayerNormDynamicFP16Test(
         TensorRTSubgraphPassLayerNormDynamicTest):
+
     def set_params(self):
         self.begin_norm_axis = 2
         self.precision = AnalysisConfig.Precision.Half
@@ -289,23 +313,28 @@ def test_check_output(self):
 
 class TensorRTSubgraphPassLayerNormBeginNormAxis2Test(
         TensorRTSubgraphPassLayerNormTest):
+
     def set_params(self):
         self.begin_norm_axis = 2
 
 
 class TensorRTSubgraphPassLayerNormBeginNormAxis3Test(
         TensorRTSubgraphPassLayerNormTest):
+
     def set_params(self):
         self.begin_norm_axis = 3
 
 
 class TensorRTSubgraphPassElementwiseTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data1 = fluid.data(
-                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
-            data2 = fluid.data(
-                name="data2", shape=[-1, 3, 64, 64], dtype="float32")
+            data1 = fluid.data(name="data1",
+                               shape=[-1, 3, 64, 64],
+                               dtype="float32")
+            data2 = fluid.data(name="data2",
+                               shape=[-1, 3, 64, 64],
+                               dtype="float32")
             eltwise_out = self.append_eltwise(data1, data2)
             out = fluid.layers.batch_norm(eltwise_out, is_test=True)
         self.feeds = {
@@ -328,14 +357,16 @@ def test_check_output(self):
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
-class TensorRTSubgraphPassElementwiseMulTest(
-        TensorRTSubgraphPassElementwiseTest):
+class TensorRTSubgraphPassElementwiseMulTest(TensorRTSubgraphPassElementwiseTest
+                                             ):
+
     def append_eltwise(self, data1, data2):
         return fluid.layers.elementwise_mul(x=data1, y=data2)
 
 
 class TensorRTSubgraphPassElementwiseSerializeTest(
         TensorRTSubgraphPassElementwiseTest):
+
     def setUp(self):
         super(TensorRTSubgraphPassElementwiseSerializeTest, self).setUp()
         self.trt_parameters = TensorRTSubgraphPassElementwiseTest.TensorRTParam(
@@ -349,10 +380,12 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassElementwiseBroadcastDynamicTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data1 = fluid.data(
-                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
+            data1 = fluid.data(name="data1",
+                               shape=[-1, 3, 64, 64],
+                               dtype="float32")
             data2 = fluid.data(name="data2", shape=[64, 64], dtype="float32")
             eltwise_out = self.append_eltwise(data1, data2)
             out = fluid.layers.batch_norm(eltwise_out, is_test=True)
@@ -367,10 +400,13 @@ def setUp(self):
             {
                 'data1': [1, 3, 8, 64],
                 'data2': [8, 64]
-            }, {'data1': [1, 3, 512, 64],
-                'data2':
-                [512, 64]}, {'data1': [1, 3, 256, 64],
-                             'data2': [256, 64]}, False)
+            }, {
+                'data1': [1, 3, 512, 64],
+                'data2': [512, 64]
+            }, {
+                'data1': [1, 3, 256, 64],
+                'data2': [256, 64]
+            }, False)
         self.fetch_list = [out]
 
     def append_eltwise(self, data1, data2):
@@ -387,10 +423,12 @@ def test_check_output(self):
 
 
 class TensorRTSubgraphPassShuffleChannelTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
             sc_out = fluid.layers.shuffle_channel(data, group=3)
             out = fluid.layers.batch_norm(sc_out, is_test=True)
         self.feeds = {
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py
index 8ec6bb5090861..78ef0838ca268 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py
@@ -25,10 +25,12 @@
 
 
 class TRTTileTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[4, 3, 224, 256], dtype="float32")
+            data = fluid.data(name="data",
+                              shape=[4, 3, 224, 256],
+                              dtype="float32")
             tile_out = paddle.tile(x=data, repeat_times=[1, 1, 1, 1])
             out = fluid.layers.batch_norm(tile_out, is_test=True)
 
@@ -49,6 +51,7 @@ def test_check_output(self):
 
 
 class TRTTileExpandTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32")
@@ -72,6 +75,7 @@ def test_check_output(self):
 
 
 class TRTTileExpandStaticTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32")
@@ -95,6 +99,7 @@ def test_check_output(self):
 
 
 class TRTTileExpandHalfTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py
index b15035c3c4dba..3a15f0dcf34a6 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py
@@ -21,19 +21,22 @@
 
 
 class TransposeFlattenConcatFusePassTRTTest(InferencePassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data1 = fluid.data(
-                name="data1", shape=[8, 32, 128], dtype="float32")
-            data2 = fluid.data(
-                name="data2", shape=[8, 32, 128], dtype="float32")
+            data1 = fluid.data(name="data1",
+                               shape=[8, 32, 128],
+                               dtype="float32")
+            data2 = fluid.data(name="data2",
+                               shape=[8, 32, 128],
+                               dtype="float32")
             trans1 = fluid.layers.transpose(data1, perm=[0, 2, 1])
             trans2 = fluid.layers.transpose(data2, perm=[0, 2, 1])
             flatt1 = fluid.layers.flatten(trans1)
             flatt2 = fluid.layers.flatten(trans2)
             concat_out = fluid.layers.concat([flatt1, flatt2], axis=1)
-            # There is no parameters for above structure. 
-            # Hence, append a batch_norm to avoid failure caused by load_combined. 
+            # There are no parameters in the above structure.
+            # Hence, append a batch_norm to avoid failure caused by load_combined.
             reshape_out = fluid.layers.reshape(concat_out, [-1, 0, 1, 1])
             out = fluid.layers.batch_norm(reshape_out, is_test=True)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
index 4a5090fa49802..041676e38e87d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tuned_dynamic_shape.py
@@ -17,12 +17,14 @@
 import unittest
 import numpy as np
 import paddle
+
 paddle.enable_static()
 import paddle.fluid as fluid
 from paddle.inference import Config, Predictor, create_predictor
 
 
 class TRTTunedDynamicShapeTest(unittest.TestCase):
+
     def get_model(self):
         place = fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
@@ -30,16 +32,16 @@ def get_model(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
-            data = fluid.data(
-                name="data", shape=[-1, 6, 64, 64], dtype="float32")
-            conv_out = fluid.layers.conv2d(
-                input=data,
-                num_filters=3,
-                filter_size=3,
-                groups=1,
-                padding=0,
-                bias_attr=False,
-                act=None)
+            data = fluid.data(name="data",
+                              shape=[-1, 6, 64, 64],
+                              dtype="float32")
+            conv_out = fluid.layers.conv2d(input=data,
+                                           num_filters=3,
+                                           filter_size=3,
+                                           groups=1,
+                                           padding=0,
+                                           bias_attr=False,
+                                           act=None)
         exe.run(startup_program)
         serialized_program = paddle.static.serialize_program(
             data, conv_out, program=main_program)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py
index b0124f055b4e1..670d246cd41da 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py
@@ -24,19 +24,22 @@
 
 
 class TRTYoloBoxTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             image_shape = [self.bs, self.channel, self.height, self.width]
             image = fluid.data(name='image', shape=image_shape, dtype='float32')
-            image_size = fluid.data(
-                name='image_size', shape=[self.bs, 2], dtype='int32')
+            image_size = fluid.data(name='image_size',
+                                    shape=[self.bs, 2],
+                                    dtype='int32')
             boxes, scores = self.append_yolobox(image, image_size)
 
         self.feeds = {
-            'image': np.random.random(image_shape).astype('float32'),
-            'image_size': np.random.randint(
-                32, 64, size=(self.bs, 2)).astype('int32'),
+            'image':
+            np.random.random(image_shape).astype('float32'),
+            'image_size':
+            np.random.randint(32, 64, size=(self.bs, 2)).astype('int32'),
         }
         self.enable_trt = True
         self.trt_parameters = TRTYoloBoxTest.TensorRTParam(
@@ -54,13 +57,12 @@ def set_params(self):
         self.downsample_ratio = 32
 
     def append_yolobox(self, image, image_size):
-        return fluid.layers.yolo_box(
-            x=image,
-            img_size=image_size,
-            class_num=self.class_num,
-            anchors=self.anchors,
-            conf_thresh=self.conf_thresh,
-            downsample_ratio=self.downsample_ratio)
+        return fluid.layers.yolo_box(x=image,
+                                     img_size=image_size,
+                                     class_num=self.class_num,
+                                     anchors=self.anchors,
+                                     conf_thresh=self.conf_thresh,
+                                     downsample_ratio=self.downsample_ratio)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -71,13 +73,15 @@ def test_check_output(self):
 
 
 class TRTYoloBoxFP16Test(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             image_shape = [self.bs, self.channel, self.height, self.width]
             image = fluid.data(name='image', shape=image_shape, dtype='float32')
-            image_size = fluid.data(
-                name='image_size', shape=[self.bs, 2], dtype='int32')
+            image_size = fluid.data(name='image_size',
+                                    shape=[self.bs, 2],
+                                    dtype='int32')
             boxes, scores = self.append_yolobox(image, image_size)
 
         self.feeds = {
@@ -100,13 +104,12 @@ def set_params(self):
         self.downsample_ratio = 32
 
     def append_yolobox(self, image, image_size):
-        return fluid.layers.yolo_box(
-            x=image,
-            img_size=image_size,
-            class_num=self.class_num,
-            anchors=self.anchors,
-            conf_thresh=self.conf_thresh,
-            downsample_ratio=self.downsample_ratio)
+        return fluid.layers.yolo_box(x=image,
+                                     img_size=image_size,
+                                     class_num=self.class_num,
+                                     anchors=self.anchors,
+                                     conf_thresh=self.conf_thresh,
+                                     downsample_ratio=self.downsample_ratio)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -117,19 +120,22 @@ def test_check_output(self):
 
 
 class TRTYoloBoxIoUAwareTest(InferencePassTest):
+
     def setUp(self):
         self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             image_shape = [self.bs, self.channel, self.height, self.width]
             image = fluid.data(name='image', shape=image_shape, dtype='float32')
-            image_size = fluid.data(
-                name='image_size', shape=[self.bs, 2], dtype='int32')
+            image_size = fluid.data(name='image_size',
+                                    shape=[self.bs, 2],
+                                    dtype='int32')
             boxes, scores = self.append_yolobox(image, image_size)
 
         self.feeds = {
-            'image': np.random.random(image_shape).astype('float32'),
-            'image_size': np.random.randint(
-                32, 64, size=(self.bs, 2)).astype('int32'),
+            'image':
+            np.random.random(image_shape).astype('float32'),
+            'image_size':
+            np.random.randint(32, 64, size=(self.bs, 2)).astype('int32'),
         }
         self.enable_trt = True
         self.trt_parameters = TRTYoloBoxTest.TensorRTParam(
@@ -149,15 +155,14 @@ def set_params(self):
         self.iou_aware_factor = 0.5
 
     def append_yolobox(self, image, image_size):
-        return fluid.layers.yolo_box(
-            x=image,
-            img_size=image_size,
-            class_num=self.class_num,
-            anchors=self.anchors,
-            conf_thresh=self.conf_thresh,
-            downsample_ratio=self.downsample_ratio,
-            iou_aware=self.iou_aware,
-            iou_aware_factor=self.iou_aware_factor)
+        return fluid.layers.yolo_box(x=image,
+                                     img_size=image_size,
+                                     class_num=self.class_num,
+                                     anchors=self.anchors,
+                                     conf_thresh=self.conf_thresh,
+                                     downsample_ratio=self.downsample_ratio,
+                                     iou_aware=self.iou_aware,
+                                     iou_aware_factor=self.iou_aware_factor)
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py
index 81acd9856cf24..e69091ed855ee 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py
@@ -46,14 +46,16 @@ def sample_predictor_configs(self, program_config):
             precision_mode=paddle_infer.PrecisionType.Float32,
             use_static=False,
             use_calib_mode=False)
-        yield config, ['elementwise_mul', ], (1e-5, 1e-5)
+        yield config, [
+            'elementwise_mul',
+        ], (1e-5, 1e-5)
 
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of mul
         x_shape = draw(
-            st.lists(
-                st.integers(
-                    min_value=1, max_value=10), min_size=4, max_size=4))
+            st.lists(st.integers(min_value=1, max_value=10),
+                     min_size=4,
+                     max_size=4))
         axis = -1
 
         # 2. Generate legal shape and attr of input:Y of unsqueeze2
@@ -68,14 +70,20 @@ def sample_program_config(self, draw):
                 "AxesTensorList": []
             },
             axes=unsqueeze2_axes,
-            outputs={"Out": ["unsqueeze2_out"],
-                     "XShape": ["xshape"]}, )
+            outputs={
+                "Out": ["unsqueeze2_out"],
+                "XShape": ["xshape"]
+            },
+        )
         mul_op = OpConfig(
             "elementwise_mul",
-            inputs={"Y": ["unsqueeze2_out"],
-                    "X": ["mul_x"]},
+            inputs={
+                "Y": ["unsqueeze2_out"],
+                "X": ["mul_x"]
+            },
             axis=axis,
-            outputs={"Out": ["mul_out"]}, )
+            outputs={"Out": ["mul_out"]},
+        )
 
         ops = [
             unsqueeze2_op,
@@ -89,14 +97,14 @@ def sample_program_config(self, draw):
                 "mul_x": TensorConfig(shape=x_shape),
                 "unsqueeze2_x": TensorConfig(shape=y_shape),
             },
-            outputs=ops[-1].outputs["Out"], )
+            outputs=ops[-1].outputs["Out"],
+        )
         return program_config
 
     def test(self):
-        self.run_and_statis(
-            quant=False,
-            max_examples=300,
-            passes=["unsqueeze2_eltwise_fuse_pass"])
+        self.run_and_statis(quant=False,
+                            max_examples=300,
+                            passes=["unsqueeze2_eltwise_fuse_pass"])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py b/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py
index 2fb83fb039215..b009152071e8d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py
@@ -17,6 +17,7 @@
 import paddle
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
+
 paddle.enable_static()
 
 
@@ -48,23 +49,22 @@ def yolo_box_post(box0,
     }
     outputs = {'Out': output, 'NmsRoisNum': nms_rois_num}
 
-    helper.append_op(
-        type="yolo_box_post",
-        inputs=inputs,
-        attrs={
-            'anchors0': anchors0,
-            'anchors1': anchors1,
-            'anchors2': anchors2,
-            'class_num': class_num,
-            'conf_thresh': conf_thresh,
-            'downsample_ratio0': downsample_ratio0,
-            'downsample_ratio1': downsample_ratio1,
-            'downsample_ratio2': downsample_ratio2,
-            'clip_bbox': clip_bbox,
-            'scale_x_y': scale_x_y,
-            'nms_threshold': nms_threshold
-        },
-        outputs=outputs)
+    helper.append_op(type="yolo_box_post",
+                     inputs=inputs,
+                     attrs={
+                         'anchors0': anchors0,
+                         'anchors1': anchors1,
+                         'anchors2': anchors2,
+                         'class_num': class_num,
+                         'conf_thresh': conf_thresh,
+                         'downsample_ratio0': downsample_ratio0,
+                         'downsample_ratio1': downsample_ratio1,
+                         'downsample_ratio2': downsample_ratio2,
+                         'clip_bbox': clip_bbox,
+                         'scale_x_y': scale_x_y,
+                         'nms_threshold': nms_threshold
+                     },
+                     outputs=outputs)
     output.stop_gradient = True
     nms_rois_num.stop_gradient = True
     return output, nms_rois_num
@@ -73,6 +73,7 @@ def yolo_box_post(box0,
 @unittest.skipIf(not paddle.is_compiled_with_cuda(),
                  "only support cuda kernel.")
 class TestYoloBoxPost(unittest.TestCase):
+
     def test_yolo_box_post(self):
         place = paddle.CUDAPlace(0)
         program = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py
index e92821387aed4..56e31aa705ff2 100644
--- a/python/paddle/fluid/tests/unittests/ir/pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py
@@ -28,6 +28,7 @@
 
 
 class PassTest(unittest.TestCase):
+
     @classmethod
     def setUpClass(self):
         self.main_program = fluid.Program()
@@ -184,8 +185,9 @@ def _check_fused_ops(self, program):
         self.assertTrue(
             self.num_fused_ops == acctual_num_fused_ops,
             "Checking of the number of fused operator < {} > failed. "
-            "Expected: {}, Received: {}".format(
-                self.fused_op_type, self.num_fused_ops, acctual_num_fused_ops))
+            "Expected: {}, Received: {}".format(self.fused_op_type,
+                                                self.num_fused_ops,
+                                                acctual_num_fused_ops))
 
     def check_program(self, program=None):
         '''
diff --git a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py
index 711891216b68a..3eab857826005 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py
@@ -24,10 +24,11 @@
 np.random.seed(0)
 
 
-@unittest.skipIf(not paddle.is_compiled_with_cuda() or
-                 paddle.get_cudnn_version() < 8000,
+@unittest.skipIf(not paddle.is_compiled_with_cuda()
+                 or paddle.get_cudnn_version() < 8000,
                  "only support with cuda and cudnn version is at least 8.0.")
 class TestFuseResNetUnit(unittest.TestCase):
+
     def test_fuse_resenet_unit(self):
         place = paddle.CUDAPlace(0)
         program = paddle.static.Program()
@@ -35,10 +36,14 @@ def test_fuse_resenet_unit(self):
         with paddle.static.amp.fp16_guard():
             with paddle.static.program_guard(program, startup_program):
                 x = paddle.static.data("x", [1, 64, 64, 8])
-                conv2d = paddle.nn.Conv2D(
-                    8, 32, 1, bias_attr=False, data_format='NHWC')
-                batch_norm = paddle.nn.BatchNorm(
-                    32, act='relu', data_layout='NHWC')
+                conv2d = paddle.nn.Conv2D(8,
+                                          32,
+                                          1,
+                                          bias_attr=False,
+                                          data_format='NHWC')
+                batch_norm = paddle.nn.BatchNorm(32,
+                                                 act='relu',
+                                                 data_layout='NHWC')
                 out = batch_norm(conv2d(x))
         graph = core.Graph(program.desc)
         core.get_pass("fuse_resnet_unit").apply(graph)
@@ -47,8 +52,9 @@ def test_fuse_resenet_unit(self):
         after_params = paddle.static.amp.cast_model_to_fp16(after_program)
         exe = paddle.static.Executor(place)
         exe.run(startup_program)
-        paddle.static.amp.cast_parameters_to_fp16(
-            place, program, to_fp16_var_names=params)
+        paddle.static.amp.cast_parameters_to_fp16(place,
+                                                  program,
+                                                  to_fp16_var_names=params)
         paddle.static.amp.cast_parameters_to_fp16(
             place, after_program, to_fp16_var_names=after_params)
         feed = {"x": np.random.randn(1, 64, 64, 8).astype("float16")}
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py
index aa31bc2a35d55..6a573e7beacce 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py
@@ -21,81 +21,89 @@
 
 
 class EmbEltwiseLayerNormFusePassTest(PassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            word_id = fluid.layers.data(
-                name="word_id",
-                shape=[1, 128, 1],
-                dtype="int64",
-                append_batch_size=False)
-            pos_id = fluid.layers.data(
-                name="pos_id",
-                shape=[1, 128, 1],
-                dtype="int64",
-                append_batch_size=False)
-            sent_id = fluid.layers.data(
-                name="sent_id",
-                shape=[1, 128, 1],
-                dtype="int64",
-                append_batch_size=False)
-            word_emb = fluid.layers.embedding(
-                input=word_id, size=(128, 768), dtype='float32')
-            pos_emb = fluid.layers.embedding(
-                input=pos_id, size=(128, 768), dtype='float32')
-            sent_emb = fluid.layers.embedding(
-                input=sent_id, size=(128, 768), dtype='float32')
+            word_id = fluid.layers.data(name="word_id",
+                                        shape=[1, 128, 1],
+                                        dtype="int64",
+                                        append_batch_size=False)
+            pos_id = fluid.layers.data(name="pos_id",
+                                       shape=[1, 128, 1],
+                                       dtype="int64",
+                                       append_batch_size=False)
+            sent_id = fluid.layers.data(name="sent_id",
+                                        shape=[1, 128, 1],
+                                        dtype="int64",
+                                        append_batch_size=False)
+            word_emb = fluid.layers.embedding(input=word_id,
+                                              size=(128, 768),
+                                              dtype='float32')
+            pos_emb = fluid.layers.embedding(input=pos_id,
+                                             size=(128, 768),
+                                             dtype='float32')
+            sent_emb = fluid.layers.embedding(input=sent_id,
+                                              size=(128, 768),
+                                              dtype='float32')
             add1 = fluid.layers.elementwise_add(word_emb, pos_emb)
             add2 = fluid.layers.elementwise_add(add1, sent_emb)
             hidden1 = fluid.layers.layer_norm(input=add2, begin_norm_axis=2)
 
-            id1 = fluid.layers.data(
-                name="id1",
-                shape=[1, 128, 1],
-                dtype="int64",
-                append_batch_size=False)
-            id2 = fluid.layers.data(
-                name="id2",
-                shape=[1, 128, 1],
-                dtype="int64",
-                append_batch_size=False)
-            id3 = fluid.layers.data(
-                name="id3",
-                shape=[1, 128, 1],
-                dtype="int64",
-                append_batch_size=False)
-            id4 = fluid.layers.data(
-                name="id4",
-                shape=[1, 128, 1],
-                dtype="int64",
-                append_batch_size=False)
-            emb1 = fluid.layers.embedding(
-                input=id1, size=(128, 768), dtype='float32')
-            emb2 = fluid.layers.embedding(
-                input=id2, size=(128, 768), dtype='float32')
-            emb3 = fluid.layers.embedding(
-                input=id3, size=(128, 768), dtype='float32')
-            emb4 = fluid.layers.embedding(
-                input=id4, size=(128, 768), dtype='float32')
+            id1 = fluid.layers.data(name="id1",
+                                    shape=[1, 128, 1],
+                                    dtype="int64",
+                                    append_batch_size=False)
+            id2 = fluid.layers.data(name="id2",
+                                    shape=[1, 128, 1],
+                                    dtype="int64",
+                                    append_batch_size=False)
+            id3 = fluid.layers.data(name="id3",
+                                    shape=[1, 128, 1],
+                                    dtype="int64",
+                                    append_batch_size=False)
+            id4 = fluid.layers.data(name="id4",
+                                    shape=[1, 128, 1],
+                                    dtype="int64",
+                                    append_batch_size=False)
+            emb1 = fluid.layers.embedding(input=id1,
+                                          size=(128, 768),
+                                          dtype='float32')
+            emb2 = fluid.layers.embedding(input=id2,
+                                          size=(128, 768),
+                                          dtype='float32')
+            emb3 = fluid.layers.embedding(input=id3,
+                                          size=(128, 768),
+                                          dtype='float32')
+            emb4 = fluid.layers.embedding(input=id4,
+                                          size=(128, 768),
+                                          dtype='float32')
             add_1 = fluid.layers.elementwise_add(emb1, emb2)
             add_2 = fluid.layers.elementwise_add(add_1, emb3)
             add_3 = fluid.layers.elementwise_add(add_2, emb4)
             hidden_1 = fluid.layers.layer_norm(input=add_3, begin_norm_axis=2)
 
         self.feeds = {
-            "word_id": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)).astype("int64"),
-            "pos_id": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)).astype("int64"),
-            "sent_id": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)).astype("int64"),
-            "id1": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)).astype("int64"),
-            "id2": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)).astype("int64"),
-            "id3": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)).astype("int64"),
-            "id4": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)).astype("int64"),
+            "word_id":
+            np.random.randint(low=0, high=128,
+                              size=(1, 128, 1)).astype("int64"),
+            "pos_id":
+            np.random.randint(low=0, high=128,
+                              size=(1, 128, 1)).astype("int64"),
+            "sent_id":
+            np.random.randint(low=0, high=128,
+                              size=(1, 128, 1)).astype("int64"),
+            "id1":
+            np.random.randint(low=0, high=128,
+                              size=(1, 128, 1)).astype("int64"),
+            "id2":
+            np.random.randint(low=0, high=128,
+                              size=(1, 128, 1)).astype("int64"),
+            "id3":
+            np.random.randint(low=0, high=128,
+                              size=(1, 128, 1)).astype("int64"),
+            "id4":
+            np.random.randint(low=0, high=128,
+                              size=(1, 128, 1)).astype("int64"),
         }
         self.fetch_list = [hidden1, hidden_1]
         self.pass_names = "embedding_eltwise_layernorm_fuse_pass"
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py
index cb485609b55ec..060d63cc332a5 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py
@@ -21,10 +21,13 @@
 
 
 class FCFusePassTest(PassTest):
+
     def setUp(self):
         with fluid.program_guard(self.main_program, self.startup_program):
-            data = fluid.data(
-                name="data", shape=[32, 128], dtype="float32", lod_level=0)
+            data = fluid.data(name="data",
+                              shape=[32, 128],
+                              dtype="float32",
+                              lod_level=0)
             tmp_0 = fluid.layers.fc(input=data,
                                     size=128,
                                     num_flatten_dims=1,
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
index 84d7bb5c969e6..10b861fad54c4 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
@@ -22,12 +22,12 @@
 
 
 class FusionGroupPassTest(PassTest):
+
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 2)
             self.feed_vars.append(
-                fluid.data(
-                    name="data2", shape=[128, 128], dtype=dtype))
+                fluid.data(name="data2", shape=[128, 128], dtype=dtype))
 
             # subgraph with only 1 op node
             tmp_0 = self.feed_vars[0] * self.feed_vars[1]
@@ -78,6 +78,7 @@ def test_check_output(self):
 
 
 class FusionGroupPassComplicatedTest(FusionGroupPassTest):
+
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([32, 64], dtype, 5)
@@ -96,12 +97,12 @@ def build_program(self, dtype):
 
 
 class FusionGroupPassInplaceTest(FusionGroupPassTest):
+
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
             self.feed_vars.append(
-                fluid.data(
-                    name="data3", shape=[128, 32], dtype=dtype))
+                fluid.data(name="data3", shape=[128, 32], dtype=dtype))
 
             # subgraph with 3 op node
             tmp_0 = self.feed_vars[0] - self.feed_vars[1]
@@ -114,6 +115,7 @@ def build_program(self, dtype):
 
 
 class FusionGroupPassTestFP64(FusionGroupPassTest):
+
     def setUp(self):
         self.build_program("float64")
         self.feeds = self._feed_random_data(self.feed_vars)
@@ -122,12 +124,12 @@ def setUp(self):
 
 
 class FusionGroupPassTestCastAndFP16(FusionGroupPassTest):
+
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 2)
             self.feed_vars.append(
-                fluid.data(
-                    name="data2", shape=[128, 128], dtype=dtype))
+                fluid.data(name="data2", shape=[128, 128], dtype=dtype))
 
             # subgraph with 2 op nodes
             tmp_0 = self.feed_vars[0] * self.feed_vars[1]
@@ -150,12 +152,12 @@ def build_program(self, dtype):
 
 
 class FusionGroupPassSumTest(FusionGroupPassTest):
+
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
             self.feed_vars.append(
-                fluid.data(
-                    name="data3", shape=[128, 128], dtype=dtype))
+                fluid.data(name="data3", shape=[128, 128], dtype=dtype))
 
             # subgraph with 2 op nodes
             tmp_0 = layers.sum(
@@ -172,6 +174,7 @@ def build_program(self, dtype):
 
 
 class FusionGroupPassCastTest(FusionGroupPassTest):
+
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([2, 2], dtype, 2)
@@ -193,14 +196,17 @@ def setUp(self):
 
 
 class FusionGroupPassFillConstantTest(FusionGroupPassTest):
+
     def build_program(self, dtype):
         with fluid.program_guard(self.main_program, self.startup_program):
             self.feed_vars = self._prepare_feed_vars([2, 2], dtype, 2)
 
             tmp_0 = layers.elementwise_add(self.feed_vars[0], self.feed_vars[1])
             tmp_1 = layers.fill_constant(shape=[2, 2], dtype=dtype, value=2.0)
-            tmp_2 = layers.scale(
-                tmp_1, scale=3.0, bias=1.0, bias_after_scale=True)
+            tmp_2 = layers.scale(tmp_1,
+                                 scale=3.0,
+                                 bias=1.0,
+                                 bias_after_scale=True)
             tmp_3 = layers.elementwise_mul(tmp_2, tmp_0)
 
         self.append_gradients(tmp_3)
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py
index 2a7c2768e27cd..7c6ab5d9462ea 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py
@@ -23,7 +23,9 @@
 # 1: relu(X=ewadd(X=mul(X=x, Y=w), Y=b)) => fc(Input=x, W=w, Bias=b)
 @ir.RegisterPass
 def generate_fc_fuse():
+
     def create_pass_pair(with_relu):
+
         def pattern(x, w, b):
             mul = ir.PassDesc.OP.mul(X=x, Y=w)
             ewadd = ir.PassDesc.OP.elementwise_add(X=mul, Y=b)
@@ -34,8 +36,8 @@ def pattern(x, w, b):
 
         def replace(x, w, b):
             fc = ir.PassDesc.OP.fc(Input=x, W=w, Bias=b)
-            fc.Attr("in_num_col_dims").MappedPattern(
-                op="mul", name="x_num_col_dims")
+            fc.Attr("in_num_col_dims").MappedPattern(op="mul",
+                                                     name="x_num_col_dims")
             if with_relu:
                 fc.SetAttr("activation_type", "relu")
             return fc
@@ -55,6 +57,7 @@ def multi_add_to_sum_v1():
 
 @ir.RegisterPass
 def multi_add_to_sum_v2():
+
     def pattern(x, y, z):
         ewadd1 = ir.PassDesc.OP.elementwise_add(X=x, Y=y)
         ewadd2 = ir.PassDesc.OP.elementwise_add(X=ewadd1, Y=z)
@@ -78,6 +81,7 @@ def multi_add_to_sum_v3():
     'y2': InputSpec([32, 48])
 })
 def generate_combine_mul_v1():
+
     def pattern(x, y1, y2):
         mul1 = paddle.matmul(x, y1)
         mul2 = paddle.matmul(x, y2)
@@ -95,6 +99,7 @@ def replace(x, y1, y2):
 
 @ir.RegisterPass
 def generate_combine_mul_v2():
+
     def pattern(x, y1, y2):
         mul1 = ir.PassDesc.OP.matmul_v2(X=x, Y=y1)
         mul2 = ir.PassDesc.OP.matmul_v2(X=x, Y=y2)
@@ -113,6 +118,7 @@ def replace(x, y1, y2):
 # reshape(reshape(x)) => x
 @ir.RegisterPass(input_specs={'x': InputSpec([10, 16, 16])})
 def generate_simplify_inference_v1():
+
     def pattern(x):
         transpose = paddle.transpose(x, [0, 2, 1])
         return paddle.transpose(transpose, [0, 2, 1])
@@ -122,6 +128,7 @@ def pattern(x):
 
 @ir.RegisterPass
 def generate_simplify_inference_v2():
+
     def pattern(x):
         op1 = ir.PassDesc.OP.transpose2
         op2 = ir.PassDesc.OP.transpose2
@@ -133,6 +140,7 @@ def pattern(x):
 
 @ir.RegisterPass
 def generate_layer_norm_fuse_pass():
+
     def pattern(x, gamma, beta):
         gamma.Attr("shape").Size().EQ(1)
         gamma.Attr("shape")[0].EQ(x.Attr("shape")[-1])
@@ -167,6 +175,7 @@ def replace(x, gamma, beta):
 
 @ir.RegisterPass
 def unimplemented_operand_exception():
+
     def pattern(x, y):
         return ir.PassDesc.OP.elementwise_add(X=x, Y=y)
 
@@ -180,6 +189,7 @@ def replace(x, y):
 
 @ir.RegisterPass
 def unimplemented_operation_exception():
+
     def pattern(x, y):
         return ir.PassDesc.OP.elementwise_add(X=x, Y=y)
 
@@ -198,6 +208,7 @@ def get_multi_pass_desc_from_str(s):
 
 
 class TestGeneratePass(unittest.TestCase):
+
     def convert_ops_to_op_dicts(self, ops):
         op_dicts = dict()
         for op in ops:
@@ -226,12 +237,13 @@ def test_exception(self):
             core.get_pass("unimplemented_operation_exception").apply(graph)
 
     def test_generate_fc_fuse(self):
+
         def _check_fc_fuse_pass(pass_desc, with_relu):
             pattern_op_dicts = self.convert_ops_to_op_dicts(pass_desc.pattern)
             replace_op_dicts = self.convert_ops_to_op_dicts(pass_desc.replace)
             self.assertEqual(len(pattern_op_dicts.get("mul", [])), 1)
-            self.assertEqual(
-                len(pattern_op_dicts.get("elementwise_add", [])), 1)
+            self.assertEqual(len(pattern_op_dicts.get("elementwise_add", [])),
+                             1)
             if with_relu:
                 self.assertEqual(len(pattern_op_dicts.get("relu", [])), 1)
                 pattern_op_num = 3  # relu, ewadd, mul
@@ -312,8 +324,9 @@ def test_generate_combine_mul_v1(self):
         }
         before_out1, before_out2 = executor.run(
             program, feed=feed, fetch_list=[out1.name, out2.name])
-        after_out1, after_out2 = executor.run(
-            after_program, feed=feed, fetch_list=[out1.name, out2.name])
+        after_out1, after_out2 = executor.run(after_program,
+                                              feed=feed,
+                                              fetch_list=[out1.name, out2.name])
         self.assertTrue(np.allclose(before_out1, after_out1))
         self.assertTrue(np.allclose(before_out2, after_out2))
 
@@ -368,10 +381,12 @@ def test_generate_layer_norm_fuse_pass(self):
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(program, startup_program):
             x = paddle.static.data("x", [3, 64, 120], "float32")
-            gamma = paddle.static.create_parameter(
-                shape=[120], dtype="float32", is_bias=True)
-            beta = paddle.static.create_parameter(
-                shape=[120], dtype="float32", is_bias=True)
+            gamma = paddle.static.create_parameter(shape=[120],
+                                                   dtype="float32",
+                                                   is_bias=True)
+            beta = paddle.static.create_parameter(shape=[120],
+                                                  dtype="float32",
+                                                  is_bias=True)
 
             x_sub_mean = x - paddle.mean(x, axis=-1, keepdim=True)
             std_dev = paddle.mean(x_sub_mean.pow(2), axis=-1, keepdim=True)
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_graph_to_program_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_graph_to_program_pass.py
index da6cc4ed64f3d..1815fe16fdbde 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_graph_to_program_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_graph_to_program_pass.py
@@ -33,6 +33,7 @@ def IRGraph_to_program(ir_graph):
 
 
 class GraphToProgramPassTest(unittest.TestCase):
+
     def check_vars_equal(self, o_block, c_block):
         o_params = sorted(o_block.all_parameters(), key=lambda p: p.name)
         c_params = sorted(c_block.all_parameters(), key=lambda p: p.name)
@@ -70,11 +71,12 @@ def check_op_attrs_equal(self, o_op, c_op):
             o_attr = o_attrs[attr_idx]
             c_attr = c_attrs[attr_idx]
             self.assertEqual(o_attr, c_attr)
-            self.assertEqual(
-                o_op.desc.attr_type(o_attr), c_op.desc.attr_type(c_attr))
+            self.assertEqual(o_op.desc.attr_type(o_attr),
+                             c_op.desc.attr_type(c_attr))
 
 
 class SingleGraphToProgramPass(GraphToProgramPassTest):
+
     def setUp(self):
         self.origin_program = self.build_program()
         ir_graph = program_to_IRGraph(self.origin_program)
@@ -91,10 +93,10 @@ def build_program():
         return program
 
     def test_check_parameter(self):
-        origin_parameter = sorted(
-            self.origin_program.all_parameters(), key=lambda p: p.name)
-        converted_parameter = sorted(
-            self.converted_program.all_parameters(), key=lambda p: p.name)
+        origin_parameter = sorted(self.origin_program.all_parameters(),
+                                  key=lambda p: p.name)
+        converted_parameter = sorted(self.converted_program.all_parameters(),
+                                     key=lambda p: p.name)
 
         self.assertEqual(len(origin_parameter), len(converted_parameter))
 
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
index 0aac6650f52dd..25b5fa6ffb7c9 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
@@ -22,13 +22,18 @@
 
 
 class SkipLayerNormFusePassTest(PassTest):
+
     def setUp(self):
         paddle.enable_static()
         with fluid.program_guard(self.main_program, self.startup_program):
-            x = fluid.data(
-                name="x", shape=[128, 768], dtype="float32", lod_level=0)
-            y = fluid.data(
-                name="y", shape=[128, 768], dtype="float32", lod_level=0)
+            x = fluid.data(name="x",
+                           shape=[128, 768],
+                           dtype="float32",
+                           lod_level=0)
+            y = fluid.data(name="y",
+                           shape=[128, 768],
+                           dtype="float32",
+                           lod_level=0)
             elementwise_out = fluid.layers.elementwise_add(x=x, y=y)
             out = fluid.layers.layer_norm(input=elementwise_out)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py
index 49ca89a35f4ac..0c9170242e7de 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,10 +29,13 @@
 
 
 class TestQuantizationSubGraph(unittest.TestCase):
+
     def build_graph_with_sub_graph(self):
+
         def linear_fc(num):
-            data = fluid.layers.data(
-                name='image', shape=[1, 32, 32], dtype='float32')
+            data = fluid.layers.data(name='image',
+                                     shape=[1, 32, 32],
+                                     dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             hidden = data
             for _ in six.moves.xrange(num):
@@ -57,7 +60,7 @@ def false_func():
             out = layers.cond(pred, true_func, false_func)
 
         core_graph = core.Graph(main_program.desc)
-        # We should create graph for test, otherwise it will throw a 
+        # We should create graph for test, otherwise it will throw a
         # error that it cannot find the node of "STEP_COUNTER"
         graph = IrGraph(core_graph, for_test=True)
         sub_graph = graph.get_sub_graph(0)
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py
index 02fb890220431..b710436a511a0 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py
@@ -17,6 +17,7 @@
 import paddle
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
+
 paddle.enable_static()
 
 
@@ -36,19 +37,18 @@ def multiclass_nms(bboxes,
     inputs = {'BBoxes': bboxes, 'Scores': scores}
     outputs = {'Out': output, 'Index': index, 'NmsRoisNum': nms_rois_num}
 
-    helper.append_op(
-        type="multiclass_nms3",
-        inputs=inputs,
-        attrs={
-            'background_label': background_label,
-            'score_threshold': score_threshold,
-            'nms_top_k': nms_top_k,
-            'nms_threshold': nms_threshold,
-            'keep_top_k': keep_top_k,
-            'nms_eta': nms_eta,
-            'normalized': normalized
-        },
-        outputs=outputs)
+    helper.append_op(type="multiclass_nms3",
+                     inputs=inputs,
+                     attrs={
+                         'background_label': background_label,
+                         'score_threshold': score_threshold,
+                         'nms_top_k': nms_top_k,
+                         'nms_threshold': nms_threshold,
+                         'keep_top_k': keep_top_k,
+                         'nms_eta': nms_eta,
+                         'normalized': normalized
+                     },
+                     outputs=outputs)
     output.stop_gradient = True
     index.stop_gradient = True
 
@@ -56,6 +56,7 @@ def multiclass_nms(bboxes,
 
 
 class TestYoloBoxPass(unittest.TestCase):
+
     def test_yolo_box_pass(self):
         program = paddle.static.Program()
         with paddle.static.program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
index ea125ccf3fc6c..cc3e1c2961efd 100644
--- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
+++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
@@ -33,12 +33,13 @@
 
 
 class BuildIrMemOptBase(unittest.TestCase):
+
     def setup_reader(self):
         self.batch_size = 32
         self.word_dict = paddle.dataset.imdb.word_dict()
-        self.train_reader = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict),
-            batch_size=self.batch_size)
+        self.train_reader = paddle.batch(paddle.dataset.imdb.train(
+            self.word_dict),
+                                         batch_size=self.batch_size)
 
     def check_network_convergence(self,
                                   network,
@@ -58,8 +59,10 @@ def check_network_convergence(self,
         fluid.default_startup_program().random_seed = 100
         fluid.default_main_program().random_seed = 100
 
-        data = fluid.layers.data(
-            name="words", shape=[1], dtype="int64", lod_level=1)
+        data = fluid.layers.data(name="words",
+                                 shape=[1],
+                                 dtype="int64",
+                                 lod_level=1)
 
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
@@ -78,8 +81,8 @@ def check_network_convergence(self,
         exe.run(fluid.default_startup_program())
 
         train_cp = compiler.CompiledProgram(fluid.default_main_program())
-        train_cp = train_cp.with_data_parallel(
-            loss_name=cost.name, build_strategy=build_strategy)
+        train_cp = train_cp.with_data_parallel(loss_name=cost.name,
+                                               build_strategy=build_strategy)
         fetch_list = [cost.name]
 
         begin = time.time()
@@ -99,8 +102,8 @@ def check_network_convergence(self,
                 break
         end = time.time()
 
-        print("%.4f Instance per second" % (
-            (self.batch_size * iter) / (end - begin)))
+        print("%.4f Instance per second" % ((self.batch_size * iter) /
+                                            (end - begin)))
 
         print(first_loss, last_loss)
         avg_last_loss_val = np.array(last_loss).mean()
@@ -113,6 +116,7 @@ def check_network_convergence(self,
 
 
 class TestIrMemOptBase(BuildIrMemOptBase):
+
     def setUp(self):
         self.network = None
 
@@ -130,11 +134,9 @@ def test_network(self):
                 cur_first_loss, cur_last_loss = self.check_network_convergence(
                     self.network)
 
-                self.assertAlmostEquals(
-                    np.mean(baseline_last_loss),
-                    np.mean(cur_last_loss),
-                    delta=1e-6)
-                self.assertAlmostEquals(
-                    np.mean(baseline_first_loss),
-                    np.mean(cur_first_loss),
-                    delta=1e-6)
+                self.assertAlmostEquals(np.mean(baseline_last_loss),
+                                        np.mean(cur_last_loss),
+                                        delta=1e-6)
+                self.assertAlmostEquals(np.mean(baseline_first_loss),
+                                        np.mean(cur_first_loss),
+                                        delta=1e-6)
diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py
index 0462684440187..d5eb73057b9a6 100644
--- a/python/paddle/fluid/tests/unittests/launch_function_helper.py
+++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py
@@ -59,6 +59,7 @@ def wait(procs, timeout=30):
 
 
 def _find_free_port(port_set):
+
     def __free_port():
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
             s.bind(('', 0))
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
index 69991a446d7a1..7ed1529ea4c6b 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
@@ -1,8 +1,11 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
index 11b8858b6b195..e13a15e35cfbe 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
@@ -21,6 +21,7 @@
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.framework import _global_flags
 from paddle.fluid.framework import _enable_legacy_dygraph
+
 _enable_legacy_dygraph()
 
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
index ab9dc2455af94..6bceff485fda6 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
@@ -20,9 +20,8 @@
 
 
 def __assert_close(test_case, tensor, np_array, msg, atol=1e-4):
-    test_case.assertTrue(
-        np.allclose(
-            np.array(tensor), np_array, atol=atol), msg)
+    test_case.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol),
+                         msg)
 
 
 def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
@@ -37,18 +36,20 @@ def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
     with fluid.program_guard(program):
         block = program.global_block()
         for name in ground_truth:
-            block.create_var(
-                name=name, dtype=np.float32, shape=ground_truth[name].shape)
+            block.create_var(name=name,
+                             dtype=np.float32,
+                             shape=ground_truth[name].shape)
 
-        op = block.append_op(
-            type=op_type,
-            inputs={'X': block.var('x'), },
-            outputs={'Out': block.var('out')},
-            attrs={'use_mkldnn': True})
+        op = block.append_op(type=op_type,
+                             inputs={
+                                 'X': block.var('x'),
+                             },
+                             outputs={'Out': block.var('out')},
+                             attrs={'use_mkldnn': True})
 
         # Generate backward op_desc
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
-                                                                  set(), [])
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, set(), [])
         grad_op_desc = grad_op_desc_list[0]
         new_op_desc = block.desc.append_op()
         new_op_desc.copy_from(grad_op_desc)
@@ -73,8 +74,9 @@ def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
         __assert_close(test_case, x_grad, out[0], 'x@GRAD')
 
 
-def check_if_mkldnn_batchnorm_primitives_exist_in_bwd(
-        test_case, var_dict, place, shape, data_layout):
+def check_if_mkldnn_batchnorm_primitives_exist_in_bwd(test_case, var_dict,
+                                                      place, shape,
+                                                      data_layout):
 
     var_names = [
         'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
@@ -85,8 +87,9 @@ def check_if_mkldnn_batchnorm_primitives_exist_in_bwd(
     with fluid.program_guard(program):
         block = program.global_block()
         for name in ground_truth:
-            block.create_var(
-                name=name, dtype='float32', shape=ground_truth[name].shape)
+            block.create_var(name=name,
+                             dtype='float32',
+                             shape=ground_truth[name].shape)
         bn_op = block.append_op(
             type="batch_norm",
             inputs={
@@ -112,8 +115,9 @@ def check_if_mkldnn_batchnorm_primitives_exist_in_bwd(
                 "fuse_with_relu": test_case.fuse_with_relu,
                 "use_global_stats": test_case.use_global_stats
             })
-        block.create_var(
-            name='y@GRAD', dtype='float32', shape=var_dict['y'].shape)
+        block.create_var(name='y@GRAD',
+                         dtype='float32',
+                         shape=var_dict['y'].shape)
 
         # generate backward op_desc
         grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py
index ac851bf9febf0..aeb40ed9502a1 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py
@@ -28,6 +28,7 @@
 @OpTestTool.skip_if_not_cpu_bf16()
 @six.add_metaclass(abc.ABCMeta)
 class MKLDNNBF16ActivationOp(object):
+
     @abc.abstractmethod
     def config(self):
         pass
@@ -72,6 +73,7 @@ def test_check_grad(self):
 
 
 class TestMKLDNNSigmoidBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "sigmoid"
 
@@ -83,6 +85,7 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNSqrtBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "sqrt"
 
@@ -97,6 +100,7 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNGeluErfBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "gelu"
 
@@ -110,11 +114,13 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNGeluErfDim2BF16Op(TestMKLDNNGeluErfBF16Op):
+
     def init_data(self):
         self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32)
 
 
 class TestMKLDNNGeluTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "gelu"
 
@@ -133,11 +139,13 @@ def set_attrs(self):
 
 
 class TestMKLDNNGeluTanhDim2BF16Op(TestMKLDNNGeluTanhBF16Op):
+
     def init_data(self):
         self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32)
 
 
 class TestMKLDNNReluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "relu"
 
@@ -149,6 +157,7 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNMishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "mish"
 
@@ -156,13 +165,14 @@ def op_forward(self, x):
         return x * np.tanh(np.log(1 + np.exp(x)))
 
     def op_grad(self, dout, x):
-        omega = np.exp(3 * x) + 4 * np.exp(2 * x) + np.exp(x) * (4 * x + 6
-                                                                 ) + 4 * (x + 1)
+        omega = np.exp(
+            3 * x) + 4 * np.exp(2 * x) + np.exp(x) * (4 * x + 6) + 4 * (x + 1)
         delta = np.exp(2 * x) + 2 * np.exp(x) + 2
         return dout * ((np.exp(x) * omega) / delta**2)
 
 
 class TestMKLDNNRelu6BF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "relu6"
 
@@ -174,6 +184,7 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNLeakyReluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "leaky_relu"
 
@@ -189,6 +200,7 @@ def set_attrs(self):
 
 
 class TestMKLDNNSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "swish"
 
@@ -207,6 +219,7 @@ def set_attrs(self):
 
 
 class TestMKLDNNHardSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "hard_swish"
 
@@ -220,6 +233,7 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "tanh"
 
@@ -231,6 +245,7 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNAbsBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "abs"
 
@@ -242,6 +257,7 @@ def op_grad(self, dout, x):
 
 
 class TestMKLDNNEluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "elu"
 
@@ -257,6 +273,7 @@ def set_attrs(self):
 
 
 class TestMKLDNNExpBF16Op(MKLDNNBF16ActivationOp, TestActivation):
+
     def config(self):
         self.op_type = "exp"
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
index 44263b89e1616..6796773ae6597 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -25,6 +25,7 @@
 
 
 class TestMKLDNNReluDim2(TestRelu):
+
     def setUp(self):
         super(TestMKLDNNReluDim2, self).setUp()
 
@@ -35,6 +36,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNRelu6Dim2(TestRelu6):
+
     def setUp(self):
         super(TestMKLDNNRelu6Dim2, self).setUp()
         self.attrs.update({"use_mkldnn": True})
@@ -44,6 +46,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNLeakyReluDim2(TestLeakyRelu):
+
     def setUp(self):
         super(TestMKLDNNLeakyReluDim2, self).setUp()
 
@@ -54,6 +57,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNGeluDim2(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.dtype = np.float32
@@ -67,6 +71,7 @@ def setUp(self):
 
 
 class TestMKLDNNGeluDim2Approx(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.dtype = np.float32
@@ -80,6 +85,7 @@ def setUp(self):
 
 
 class TestMKLDNNTanhDim2(TestTanh):
+
     def setUp(self):
         super(TestMKLDNNTanhDim2, self).setUp()
 
@@ -90,6 +96,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNSqrtDim2(TestSqrt):
+
     def setUp(self):
         super(TestMKLDNNSqrtDim2, self).setUp()
 
@@ -100,6 +107,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNAbsDim2(TestAbs):
+
     def setUp(self):
         super(TestMKLDNNAbsDim2, self).setUp()
         self.attrs = {"use_mkldnn": True}
@@ -109,6 +117,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNSwishDim2(TestSwish):
+
     def setUp(self):
         super(TestMKLDNNSwishDim2, self).setUp()
 
@@ -120,6 +129,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNHardSwishDim2(TestHardSwish):
+
     def setUp(self):
         super(TestMKLDNNHardSwishDim2, self).setUp()
 
@@ -130,12 +140,14 @@ def init_dtype(self):
 
 
 class TestMKLDNNSigmoidDim2(TestSigmoid):
+
     def setUp(self):
         super(TestMKLDNNSigmoidDim2, self).setUp()
         self.attrs = {"use_mkldnn": True}
 
 
 class TestMKLDNNReluDim4(TestRelu):
+
     def setUp(self):
         super(TestMKLDNNReluDim4, self).setUp()
 
@@ -153,6 +165,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNLeakyReluDim4(TestLeakyRelu):
+
     def setUp(self):
         super(TestMKLDNNLeakyReluDim4, self).setUp()
 
@@ -170,6 +183,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNGeluDim4(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.dtype = np.float32
@@ -183,6 +197,7 @@ def setUp(self):
 
 
 class TestMKLDNNGeluDim4Approx(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.dtype = np.float32
@@ -198,6 +213,7 @@ def setUp(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestMKLDNNGeluBf16Dim4(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.dtype = np.uint16
@@ -219,6 +235,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestMKLDNNGeluBf16Dim4Approx(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.dtype = np.uint16
@@ -238,6 +255,7 @@ def test_check_grad(self):
 
 
 class TestMKLDNNTanhDim4(TestTanh):
+
     def setUp(self):
         super(TestMKLDNNTanhDim4, self).setUp()
 
@@ -249,6 +267,7 @@ def setUp(self):
 
 
 class TestMKLDNNSqrtDim4(TestSqrt):
+
     def setUp(self):
         super(TestMKLDNNSqrtDim4, self).setUp()
 
@@ -260,6 +279,7 @@ def setUp(self):
 
 
 class TestMKLDNNAbsDim4(TestAbs):
+
     def setUp(self):
         super(TestMKLDNNAbsDim4, self).setUp()
 
@@ -275,6 +295,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNSwishDim4(TestSwish):
+
     def setUp(self):
         super(TestMKLDNNSwishDim4, self).setUp()
 
@@ -297,6 +318,7 @@ def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
 
 
 class TestMKLDNNHardSwishDim4(TestHardSwish):
+
     def setUp(self):
         super(TestMKLDNNHardSwishDim4, self).setUp()
 
@@ -318,6 +340,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNMish(TestActivation):
+
     def setUp(self):
         self.op_type = "mish"
         self.dtype = np.float32
@@ -331,6 +354,7 @@ def setUp(self):
 
 
 class TestMKLDNNRound(TestActivation):
+
     def setUp(self):
         self.op_type = "round"
 
@@ -343,6 +367,7 @@ def setUp(self):
 
 
 class TestMKLDNNSigmoidDim4(TestSigmoid):
+
     def setUp(self):
         super(TestMKLDNNSigmoidDim4, self).setUp()
 
@@ -354,6 +379,7 @@ def setUp(self):
 
 
 class TestMKLDNNEluDefaultAlpha(TestActivation):
+
     def setUp(self):
         self.op_type = "elu"
         self.set_alpha()
@@ -372,11 +398,13 @@ def set_alpha(self):
 
 
 class TestMKLDNNEluCustomAlpha(TestMKLDNNEluDefaultAlpha):
+
     def set_alpha(self):
         self.alpha = 2.5
 
 
 class TestMKLDNNExpOp(TestActivation):
+
     def setUp(self):
         self.op_type = "exp"
         x = np.random.random((5, 5, 4)).astype("float32")
@@ -388,6 +416,7 @@ def setUp(self):
 
 # Check if primitives already exist in backward
 class TestMKLDNNAbsPrimitivesAlreadyExist(unittest.TestCase):
+
     def setUp(self):
         super(TestMKLDNNAbsPrimitivesAlreadyExist, self).setUp()
 
@@ -403,8 +432,9 @@ def __abs_bwd(self, x, out_grad):
         return out_grad * np.sign(x)
 
     def test_check(self):
-        check_if_mkldnn_primitives_exist_in_bwd(
-            self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
+        check_if_mkldnn_primitives_exist_in_bwd(self, self.op_type, self.x,
+                                                self.out, self.out_grad,
+                                                self.x_grad)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
index 85b398f684237..3c6640822ae45 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
@@ -28,6 +28,7 @@
 
 
 class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_formats = ["NCHW"]
@@ -44,19 +45,23 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
         mean_out = saved_mean * (1. - momentum) + momentum * mean
         variance_out = saved_variance * (1. - momentum) + momentum * variance
         # run backward
-        x_grad, scale_grad, bias_grad = _reference_grad(
-            x, y_grad, scale, saved_mean, saved_variance, epsilon, data_layout)
+        x_grad, scale_grad, bias_grad = _reference_grad(x, y_grad, scale,
+                                                        saved_mean,
+                                                        saved_variance, epsilon,
+                                                        data_layout)
 
         return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
 
 class TestMKLDNNBatchNormOpTraining_NHWC(TestMKLDNNBatchNormOpTraining):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_formats = ["NHWC"]
 
 
 class TestMKLDNNBatchNormOpExistedPrimitives(TestMKLDNNBatchNormOpTraining):
+
     def init_test_case(self):
         TestMKLDNNBatchNormOpTraining.init_test_case(self)
         self.fetch_list = ['y', 'x@GRAD']
@@ -82,11 +87,12 @@ def test_forward_backward(self):
         var_dict['x@GRAD'] = x_grad
         var_dict['scale@GRAD'] = scale_grad
         var_dict['bias@GRAD'] = bias_grad
-        check_if_mkldnn_batchnorm_primitives_exist_in_bwd(self, var_dict, place,
-                                                          shape, data_layout)
+        check_if_mkldnn_batchnorm_primitives_exist_in_bwd(
+            self, var_dict, place, shape, data_layout)
 
 
 class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -97,6 +103,7 @@ def test_check_output(self):
 
 
 class TestMKLDNNBatchNormOpInference_NHWC(TestMKLDNNBatchNormOpInference):
+
     def test_check_output(self):
         place = core.CPUPlace()
         data_format = "NHWC"
@@ -104,6 +111,7 @@ def test_check_output(self):
 
 
 class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.fuse_with_relu = True
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_mkldnn_op.py
index e740efa14c575..707f98d753b4b 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_mkldnn_op.py
@@ -59,9 +59,10 @@ def bilinear_interp_mkldnn_np(input,
             input_h1_w0 = input[:, :, h1, w0]
             input_h0_w1 = input[:, :, h0, w1]
             input_h1_w1 = input[:, :, h1, w1]
-            out[:, :, oh, ow] = input_h0_w0 * (1 - Wh) * (
-                1 - Ww) + input_h1_w0 * Wh * (1 - Ww) + input_h0_w1 * (
-                    1 - Wh) * Ww + input_h1_w1 * Wh * Ww
+            out[:, :, oh,
+                ow] = input_h0_w0 * (1 - Wh) * (1 - Ww) + input_h1_w0 * Wh * (
+                    1 - Ww) + input_h0_w1 * (1 -
+                                             Wh) * Ww + input_h1_w1 * Wh * Ww
 
     if data_layout == "NHWC":
         out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
@@ -71,6 +72,7 @@ def bilinear_interp_mkldnn_np(input,
 
 @skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.")
 class TestBilinearInterpMKLDNNOp(OpTest):
+
     def init_test_case(self):
         pass
 
@@ -129,6 +131,7 @@ def test_check_output(self):
 
 
 class TestBilinearInterpOpMKLDNNNHWC(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 2, 32, 16]
         self.out_h = 27
@@ -138,6 +141,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpMKLDNNCase2(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
@@ -146,6 +150,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpDataLayout(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [2, 4, 4, 5]
         self.out_h = 6
@@ -155,6 +160,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase3(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -163,6 +169,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase4(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 1
@@ -172,6 +179,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase5(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 9, 6]
         self.out_h = 12
@@ -181,6 +189,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase6(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -190,6 +199,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpSame(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [2, 3, 32, 64]
         self.out_h = 32
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py
index e3b0639289ab2..8a9455690f42e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py
@@ -59,9 +59,10 @@ def bilinear_interp_mkldnn_np(input,
             input_h1_w0 = input[:, :, h1, w0]
             input_h0_w1 = input[:, :, h0, w1]
             input_h1_w1 = input[:, :, h1, w1]
-            out[:, :, oh, ow] = input_h0_w0 * (1 - Wh) * (
-                1 - Ww) + input_h1_w0 * Wh * (1 - Ww) + input_h0_w1 * (
-                    1 - Wh) * Ww + input_h1_w1 * Wh * Ww
+            out[:, :, oh,
+                ow] = input_h0_w0 * (1 - Wh) * (1 - Ww) + input_h1_w0 * Wh * (
+                    1 - Ww) + input_h0_w1 * (1 -
+                                             Wh) * Ww + input_h1_w1 * Wh * Ww
 
     if data_layout == "NHWC":
         out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
@@ -71,6 +72,7 @@ def bilinear_interp_mkldnn_np(input,
 
 @skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.")
 class TestBilinearInterpMKLDNNOp(OpTest):
+
     def init_test_case(self):
         pass
 
@@ -146,6 +148,7 @@ def test_check_output(self):
 
 
 class TestBilinearInterpOpMKLDNNNHWC(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 2, 32, 16]
         self.out_h = 27
@@ -155,6 +158,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpMKLDNNCase2(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
@@ -162,6 +166,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase3(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -170,6 +175,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase4(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -179,6 +185,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase5(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 9, 6]
         self.out_h = 12
@@ -187,6 +194,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpCase6(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -196,6 +204,7 @@ def init_test_case(self):
 
 
 class TestBilinearNeighborInterpSame(TestBilinearInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [2, 3, 32, 64]
         self.out_h = 32
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py
index 95de37fdc0251..331fa871897ae 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py
@@ -27,6 +27,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestCastBF16ToFP32MKLDNNOp(OpTest):
+
     def init_data(self):
         self.out = np.random.random(size=[10, 10]).astype("float32")
         self.x = convert_float_to_uint16(self.out)
@@ -35,7 +36,8 @@ def setUp(self):
         self.init_data()
         self.inputs = {'X': self.x}
         self.outputs = {'Out': self.out}
-        prepare_dtype = lambda x: int(core.VarDesc.VarType.BF16 if x.dtype != np.float32 else core.VarDesc.VarType.FP32)
+        prepare_dtype = lambda x: int(core.VarDesc.VarType.BF16 if x.dtype != np
+                                      .float32 else core.VarDesc.VarType.FP32)
         self.attrs = {
             'in_dtype': prepare_dtype(self.x),
             'out_dtype': prepare_dtype(self.out),
@@ -56,18 +58,21 @@ def test_check_grad(self):
 
 
 class TestCastFP32ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp):
+
     def init_data(self):
         self.x = np.random.random(size=[2, 6]).astype("float32")
         self.out = convert_float_to_uint16(self.x)
 
 
 class TestCastBF16ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp):
+
     def init_data(self):
         self.x = np.random.random(size=[6, 13]).astype("uint16")
         self.out = self.x
 
 
 class TestCastFP32ToFP32MKLDNNOp(TestCastBF16ToFP32MKLDNNOp):
+
     def init_data(self):
         self.x = np.random.random(size=[7, 15]).astype("float32")
         self.out = self.x
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_clip_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_clip_mkldnn_op.py
index 97a9137531845..adfd0613bd3f4 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_clip_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_clip_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestClipOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "clip"
         self.set_inputs()
@@ -57,16 +58,19 @@ def test_check_grad(self):
 
 
 class TestClipMinAsInputOneDNNOp(TestClipOneDNNOp):
+
     def set_additional_inputs(self):
         self.inputs['Min'] = np.array([6.8]).astype('float32')
 
 
 class TestClipMaxAsInputOneDNNOp(TestClipOneDNNOp):
+
     def set_additional_inputs(self):
         self.inputs['Max'] = np.array([9.1]).astype('float32')
 
 
 class TestClipMaxAndMinAsInputsOneDNNOp(TestClipOneDNNOp):
+
     def set_additional_inputs(self):
         self.inputs['Max'] = np.array([8.5]).astype('float32')
         self.inputs['Min'] = np.array([7.1]).astype('float32')
@@ -74,8 +78,10 @@ def set_additional_inputs(self):
 
 #   BF16 TESTS
 def create_bf16_test_class(parent):
+
     @OpTestTool.skip_if_not_cpu_bf16()
     class TestClipBF16OneDNNOp(parent):
+
         def set_inputs(self):
             self.x_fp32 = np.random.random((10, 10)).astype(np.float32) * 25
             self.inputs = {'X': convert_float_to_uint16(self.x_fp32)}
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
index e53afaa57be1c..a39f14b0b3adb 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py
@@ -26,6 +26,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestConcatBf16Op(OpTest):
+
     def setUp(self):
         self.op_type = "concat"
         self.use_mkldnn = True
@@ -43,8 +44,8 @@ def setUp(self):
         self.sections = [self.x0.shape[self.axis]] * 2
         self.sections[1] += self.x1.shape[self.axis]
 
-        self.output = np.concatenate(
-            (self.x0, self.x1, self.x2), axis=self.axis).astype(np.uint16)
+        self.output = np.concatenate((self.x0, self.x1, self.x2),
+                                     axis=self.axis).astype(np.uint16)
         self.outputs = {'Out': self.output}
 
     def calculate_grads(self):
@@ -85,6 +86,7 @@ def init_shape(self):
 
 
 class TestAxis1Case(TestConcatBf16Op):
+
     def init_axis(self):
         self.axis = 1
 
@@ -98,6 +100,7 @@ def init_shape(self):
 
 
 class TestAxis2Case(TestConcatBf16Op):
+
     def init_axis(self):
         self.axis = 2
 
@@ -111,6 +114,7 @@ def init_shape(self):
 
 
 class TestAxis3Case(TestConcatBf16Op):
+
     def init_axis(self):
         self.axis = 3
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
index ef2fa1c1cc268..900de9ab9cacd 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
@@ -20,6 +20,7 @@
 
 
 class TestConcatOp(OpTest):
+
     def setUp(self):
         self.op_type = "concat"
         self.use_mkldnn = True
@@ -30,8 +31,8 @@ def setUp(self):
         self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
         self.attrs = {'axis': self.axis, 'use_mkldnn': True}
 
-        self.output = np.concatenate(
-            (self.x0, self.x1, self.x2), axis=self.axis).astype('int')
+        self.output = np.concatenate((self.x0, self.x1, self.x2),
+                                     axis=self.axis).astype('int')
 
         self.outputs = {'Out': self.output}
 
@@ -59,6 +60,7 @@ def init_shape(self):
 
 
 class TestConcatOp2(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = (np.random.randint(0, 100, self.x0_shape)).astype('uint8')
         self.x1 = (np.random.randint(0, 50, self.x1_shape)).astype('uint8')
@@ -78,6 +80,7 @@ def create_test_int8_class(parent):
     #--------------------test concat s8/u8 in with axis 1--------------------
 
     class TestAxis1Case(parent):
+
         def init_axis(self):
             self.axis = 1
 
@@ -89,6 +92,7 @@ def init_shape(self):
 #--------------------test concat s8/u8 in with axis 2--------------------
 
     class TestAxis2Case(parent):
+
         def init_axis(self):
             self.axis = 2
 
@@ -97,9 +101,11 @@ def init_shape(self):
             self.x1_shape = [2, 3, 5, 5]
             self.x2_shape = [2, 3, 6, 5]
 
+
 #--------------------test concat s8/u8 in with axis 3--------------------
 
     class TestAxis3Case(parent):
+
         def init_axis(self):
             self.axis = 3
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
index 7fc8f1d30802c..18b5705ec012a 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
@@ -24,6 +24,7 @@
 
 
 class TestConcatAxis0OneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "concat"
         self.mkldnn_data_type = "float32"
@@ -38,8 +39,8 @@ def setUp(self):
             'mkldnn_data_type': self.mkldnn_data_type
         }
 
-        self.output = np.concatenate(
-            (self.x0, self.x1, self.x2), axis=self.axis).astype(self.dtype)
+        self.output = np.concatenate((self.x0, self.x1, self.x2),
+                                     axis=self.axis).astype(self.dtype)
 
         self.outputs = {'Out': self.output}
 
@@ -70,6 +71,7 @@ def init_shape(self):
 
 
 class TestConcatAxis1OneDNNOp(TestConcatAxis0OneDNNOp):
+
     def init_axis(self):
         self.axis = 1
 
@@ -80,6 +82,7 @@ def init_shape(self):
 
 
 class TestConcatAxis2OneDNNOp(TestConcatAxis0OneDNNOp):
+
     def init_axis(self):
         self.axis = 2
 
@@ -90,6 +93,7 @@ def init_shape(self):
 
 
 class TestConcatAxis3OneDNNOp(TestConcatAxis0OneDNNOp):
+
     def init_axis(self):
         self.axis = 3
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
index 702d26b073b6b..02c6a5c3daeb1 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
@@ -32,6 +32,7 @@ def conv2d_residual_naive(out, residual):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestConv2DBF16Op(TestConv2DOp):
+
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
@@ -63,8 +64,9 @@ def setUp(self):
 
         self.inputs_fp32 = {'Input': self.input, 'Filter': self.filter}
 
-        conv_out, _, _, _, _ = conv2d_forward_naive(
-            self.input, self.filter, self.groups, self.conv2d_param)
+        conv_out, _, _, _, _ = conv2d_forward_naive(self.input, self.filter,
+                                                    self.groups,
+                                                    self.conv2d_param)
         self.conv_output_float = conv_out
 
         if self.fuse_residual:
@@ -88,9 +90,10 @@ def setUp(self):
             self.filter = convert_float_to_uint16(self.filter)
 
         self.inputs = {
-            'Input': self.input,
-            'Filter': OpTest.np_dtype_to_fluid_dtype(
-                self.filter.astype(self.weight_type))
+            'Input':
+            self.input,
+            'Filter':
+            OpTest.np_dtype_to_fluid_dtype(self.filter.astype(self.weight_type))
         }
 
         if self.fuse_residual:
@@ -156,6 +159,7 @@ def init_additional_attrs(self):
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestConv2DWithGradBF16Op(TestConv2DBF16Op):
+
     def init_fuse_relu(self):
         self.fuse_activation = None
 
@@ -233,8 +237,8 @@ def conv_backward(dout, x, w, params):
                         for l in range(W_out):
                             for ic in range(IC):
                                 dweights[oc, ic, i, j] += x_padded[
-                                    n, ic, i + k * stride[0], j + l * stride[
-                                        1]] * dout[n, oc, k, l]
+                                    n, ic, i + k * stride[0],
+                                    j + l * stride[1]] * dout[n, oc, k, l]
 
     dx_padded = np.pad(dx, ((0, ), (0, ), (padding, ), (padding, )), 'constant')
 
@@ -250,9 +254,10 @@ def conv_backward(dout, x, w, params):
                     for kh in range(KH):
                         for kw in range(KW):
                             for ic in range(IC):
-                                dx_padded[n, ic, stride[0] * i + kh, stride[1] *
-                                          j + kw] += dout[n, oc, i, j] * w[
-                                              oc, ic, kh, kw]
+                                dx_padded[n, ic, stride[0] * i + kh,
+                                          stride[1] * j +
+                                          kw] += dout[n, oc, i, j] * w[oc, ic,
+                                                                       kh, kw]
 
     if padding == 0:
         dx = dx_padded
@@ -263,18 +268,21 @@ def conv_backward(dout, x, w, params):
 
 
 class TestConv2DBF16WithPadding1(TestConv2DWithGradBF16Op):
+
     def init_test_case(self):
         TestConv2DWithGradBF16Op.init_test_case(self)
         self.pad = [1, 1]
 
 
 class TestConv2DBF16WithStride2(TestConv2DWithGradBF16Op):
+
     def init_test_case(self):
         TestConv2DWithGradBF16Op.init_test_case(self)
         self.stride = [2, 3]
 
 
 class TestConv2D(TestConv2DBF16Op):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -289,6 +297,7 @@ def init_data_type(self):
 
 
 class TestWithPad(TestConv2D):
+
     def init_test_case(self):
         TestConv2D.init_test_case(self)
         self.pad = [1, 1]
@@ -296,11 +305,13 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestConv2D):
+
     def init_group(self):
         self.groups = 3
 
 
 class TestWithStride(TestConv2DBF16Op):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -315,6 +326,7 @@ def init_data_type(self):
 
 
 class TestWithDilations(TestConv2DBF16Op):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -330,6 +342,7 @@ def init_data_type(self):
 
 
 class TestWith1x1ForceFP32Output(TestConv2DBF16Op):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -346,6 +359,7 @@ def init_fuse_residual(self):
 
 
 class TestWithInput1x1Filter1x1(TestConv2DBF16Op):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
index 6fc01488c7ea0..111def512ee10 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
@@ -30,6 +30,7 @@ def conv2d_forward_refer(input, filter, group, conv_param):
 @unittest.skipIf(not core.supports_int8(),
                  "place does not support int8 computation")
 class TestConv2DInt8Op(TestConv2DOp):
+
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
@@ -71,9 +72,9 @@ def setUp(self):
         scale_output_shift = scale_output_shift / avx_scale
 
         def conv2d_forward_refer_helper(input_):
-            return conv2d_forward_refer(
-                input_.astype(np.int32), filter_int, self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
+            return conv2d_forward_refer(input_.astype(np.int32), filter_int,
+                                        self.groups, conv2d_param).astype(
+                                            np.float32) * scale_output_shift
 
         def residual_helper(init_low, init_high, output_):
             input_residual_ = np.random.randint(
@@ -123,8 +124,7 @@ def residual_helper(init_low, init_high, output_):
         output = np.round(output).astype(self.dsttype)
 
         self.inputs = {
-            'Input':
-            OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
+            'Input': OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
             'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
         }
         if self.fuse_residual:
@@ -154,8 +154,9 @@ def residual_helper(init_low, init_high, output_):
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output_with_place(
-            core.CPUPlace(), atol=0, check_dygraph=False)
+        self.check_output_with_place(core.CPUPlace(),
+                                     atol=0,
+                                     check_dygraph=False)
 
     def test_check_grad(self):
         pass
@@ -194,6 +195,7 @@ def init_fuse_residual(self):
 
 
 class TestConv2D(TestConv2DInt8Op):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -209,6 +211,7 @@ def init_test_case(self):
 
 
 class TestWithHardSwish(TestConv2D):
+
     def init_fuse_activation(self):
         self.fuse_activation = "hard_swish"
         self.fuse_alpha = 0
@@ -216,6 +219,7 @@ def init_fuse_activation(self):
 
 
 class TestWithRelu6(TestConv2D):
+
     def init_fuse_activation(self):
         self.fuse_activation = "relu6"
         self.fuse_alpha = 6
@@ -223,6 +227,7 @@ def init_fuse_activation(self):
 
 
 class TestWithSwish(TestConv2D):
+
     def init_fuse_activation(self):
         self.fuse_activation = "swish"
         self.fuse_alpha = 1
@@ -230,6 +235,7 @@ def init_fuse_activation(self):
 
 
 class TestWithLeakyRelu(TestConv2D):
+
     def init_fuse_activation(self):
         self.fuse_activation = "leaky_relu"
         self.fuse_alpha = 0.02
@@ -237,6 +243,7 @@ def init_fuse_activation(self):
 
 
 class TestWithPad(TestConv2D):
+
     def init_test_case(self):
         TestConv2D.init_test_case(self)
         self.pad = [1, 1]
@@ -244,11 +251,13 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestConv2D):
+
     def init_group(self):
         self.groups = 3
 
 
 class TestWithStride(TestConv2DInt8Op):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -264,6 +273,7 @@ def init_test_case(self):
 
 
 class TestWithDilations(TestConv2DInt8Op):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -280,6 +290,7 @@ def init_test_case(self):
 
 
 class TestWith1x1(TestConv2DInt8Op):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -295,6 +306,7 @@ def init_test_case(self):
 
 
 class TestWithInput1x1Filter1x1(TestConv2DInt8Op):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -325,31 +337,37 @@ def create_test_int8_class(parent):
 
     # --------------------test conv2d s8 in and u8 out--------------------
     class TestS8U8Case(parent):
+
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "relu", False)
 
     # --------------------test conv2d s8 in and s8 out--------------------
     class TestS8S8Case(parent):
+
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", False)
 
     # --------------------test conv2d u8 in and s8 out--------------------
     class TestU8S8Case(parent):
+
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", False)
 
     # --------------------test conv2d u8 in and u8 out without residual fuse--------------------
     class TestU8U8Case(parent):
+
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "relu", False)
 
     # --------------------test conv2d s8 in and s8 out with residual fuse--------------------
     class TestS8S8ResCase(parent):
+
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", True)
 
     # --------------------test conv2d u8 in and s8 out with residual fuse--------------------
     class TestU8S8ResCase(parent):
+
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", True)
 
@@ -358,10 +376,10 @@ def init_data_type(self):
     cls_name_u8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
     cls_name_u8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
 
-    cls_name_s8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
-                                                            "0", "1")
-    cls_name_u8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
-                                                            "0", "1")
+    cls_name_s8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(
+        parent.__name__, "0", "1")
+    cls_name_u8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(
+        parent.__name__, "0", "1")
     TestS8U8Case.__name__ = cls_name_s8u8
     TestS8S8Case.__name__ = cls_name_s8s8
     TestU8S8Case.__name__ = cls_name_u8s8
@@ -379,11 +397,12 @@ def init_data_type(self):
     if os.name != 'nt':
         # --------------------test conv2d s8 in and u8 out with residual fuse--------------------
         class TestS8U8ResCase(parent):
+
             def init_data_type(self):
                 init_data_type_with_fusion(self, np.int8, "relu", True)
 
-        cls_name_s8u8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
-                                                                "1", "1")
+        cls_name_s8u8_re_1 = "{0}_relu_{1}_residual_{2}".format(
+            parent.__name__, "1", "1")
         TestS8U8ResCase.__name__ = cls_name_s8u8_re_1
         globals()[cls_name_s8u8_re_1] = TestS8U8ResCase
 
@@ -398,6 +417,7 @@ def init_data_type(self):
 
 
 class TestConv2DOp_AsyPadding_INT_MKLDNN(TestConv2DInt8Op):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -407,12 +427,14 @@ def init_paddings(self):
 
 
 class TestConv2DOp_Same_INT_MKLDNN(TestConv2DOp_AsyPadding_INT_MKLDNN):
+
     def init_paddings(self):
         self.pad = [0, 0]
         self.padding_algorithm = "SAME"
 
 
 class TestConv2DOp_Valid_INT_MKLDNN(TestConv2DOp_AsyPadding_INT_MKLDNN):
+
     def init_paddings(self):
         self.pad = [1, 1]
         self.padding_algorithm = "VALID"
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
index 39f55fb45b87b..0471c295ad45d 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
@@ -38,6 +38,7 @@ def conv2d_residual_naive(out, residual):
 
 
 class TestConv2DMKLDNNOp(TestConv2DOp):
+
     def init_group(self):
         self.groups = 1
 
@@ -106,6 +107,7 @@ def setUp(self):
 @skip_check_grad_ci(
     reason="Fusion is for inference only, check_grad is not required.")
 class TestWithbreluFusion(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DMKLDNNOp.init_test_case(self)
         self.fuse_activation = "relu6"
@@ -116,6 +118,7 @@ def init_test_case(self):
 @skip_check_grad_ci(
     reason="Fusion is for inference only, check_grad is not required.")
 class TestWithFuse(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -126,6 +129,7 @@ def init_test_case(self):
 
 
 class TestWithPadWithBias(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -133,6 +137,7 @@ def init_test_case(self):
 
 
 class TestWithStride(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -141,6 +146,7 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -154,12 +160,14 @@ def init_group(self):
 
 
 class TestWith1x1(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DMKLDNNOp.init_test_case(self)
         self.filter_size = [40, 3, 1, 1]
 
 
 class TestWithInput1x1Filter1x1(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DMKLDNNOp.init_test_case(self)
         self.input_size = [2, 60, 1, 1]  # NCHW
@@ -172,6 +180,7 @@ def init_group(self):
 
 
 class TestConv2DOp_AsyPadding_MKLDNN(TestConv2DOp_v2):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.dtype = np.float32
@@ -182,18 +191,21 @@ def init_paddings(self):
 
 
 class TestConv2DOp_Same_MKLDNN(TestConv2DOp_AsyPadding_MKLDNN):
+
     def init_paddings(self):
         self.pad = [0, 0]
         self.padding_algorithm = "SAME"
 
 
 class TestConv2DOp_Valid_MKLDNN(TestConv2DOp_AsyPadding_MKLDNN):
+
     def init_paddings(self):
         self.pad = [1, 1]
         self.padding_algorithm = "VALID"
 
 
 class TestConv2DOp_Valid_NHWC_MKLDNN(TestConv2DOp_Valid_MKLDNN):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -203,18 +215,21 @@ def init_test_case_2(self):
 
 
 class TestConv2DOp_Same_NHWC_MKLDNN(TestConv2DOp_Valid_NHWC_MKLDNN):
+
     def init_paddings(self):
         self.pad = [0, 0]
         self.padding_algorithm = "SAME"
 
 
 class TestConv2DOp_AsyPadding_NHWC_MKLDNN(TestConv2DOp_Valid_NHWC_MKLDNN):
+
     def init_paddings(self):
         self.pad = [0, 0, 1, 2]
         self.padding_algorithm = "EXPLICIT"
 
 
 class TestMKLDNNDilations(TestConv2DMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DMKLDNNOp.init_test_case(self)
         self.pad = [0, 0]
@@ -234,6 +249,7 @@ def init_group(self):
 # TODO(chenweihang): To solve the coverage problem, add this unittest,
 # remove this unittest after new executor set to default executor
 class TestConv2dMKLDNNByNewExecutor(TestConv2DMKLDNNOp):
+
     def test_check_output_by_new_executor(self):
         os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
         self.test_check_output()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py
index c6b7c175d9000..1f2fba8d6098e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py
@@ -34,6 +34,7 @@ def conv2d_bias_naive(out, bias):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestConv2DTransposeBF16MKLDNNOp(OpTest):
+
     def test_check_output(self):
         self.check_output_with_place(core.CPUPlace())
 
@@ -133,6 +134,7 @@ def setUp(self):
 
 
 class TestMKLDNNFuseBias(TestConv2DTransposeBF16MKLDNNOp):
+
     def init_test_case(self):
         super(TestMKLDNNFuseBias, self).init_test_case()
         self.pad = [1, 1]
@@ -141,6 +143,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithPad(TestConv2DTransposeBF16MKLDNNOp):
+
     def init_test_case(self):
         super(TestMKLDNNWithPad, self).init_test_case()
         self.pad = [1, 1]
@@ -148,6 +151,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithStride(TestConv2DTransposeBF16MKLDNNOp):
+
     def init_test_case(self):
         super(TestMKLDNNWithStride, self).init_test_case()
         self.pad = [1, 1]
@@ -156,6 +160,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithAsymPad(TestConv2DTransposeBF16MKLDNNOp):
+
     def init_test_case(self):
         super(TestMKLDNNWithAsymPad, self).init_test_case()
         self.pad = [0, 0, 1, 2]
@@ -163,6 +168,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithSamePad(TestConv2DTransposeBF16MKLDNNOp):
+
     def init_test_case(self):
         super(TestMKLDNNWithSamePad, self).init_test_case()
         self.pad = [0, 0]
@@ -170,6 +176,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithValidPad(TestConv2DTransposeBF16MKLDNNOp):
+
     def init_test_case(self):
         super(TestMKLDNNWithValidPad, self).init_test_case()
         self.pad = [1, 1]
@@ -177,6 +184,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithValidPad_NHWC(TestMKLDNNWithValidPad):
+
     def init_test_case(self):
         super(TestMKLDNNWithValidPad_NHWC, self).init_test_case()
         self.data_format = 'NHWC'
@@ -186,6 +194,7 @@ def init_test_case(self):
 
 class TestConv2DTransposeMKLDNNWithDilationsExplicitPad(
         TestConv2DTransposeBF16MKLDNNOp):
+
     def init_test_case(self):
         super(TestConv2DTransposeMKLDNNWithDilationsExplicitPad,
               self).init_test_case()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
index a36fc28013bb4..05c7cf18152b5 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
@@ -31,6 +31,7 @@ def conv2d_bias_naive(out, bias):
 
 
 class TestConv2DTransposeMKLDNNOp(TestConv2DTransposeOp):
+
     def test_check_grad(self):
         return
 
@@ -89,6 +90,7 @@ def setUp(self):
 
 
 class TestMKLDNNFuseBias(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -97,6 +99,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithPad(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -104,6 +107,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithStride(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -112,6 +116,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithAsymPad(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [0, 0, 1, 2]
@@ -119,6 +124,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithSamePad(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [0, 0]
@@ -126,6 +132,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithValidPad(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -133,6 +140,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithValidPad_NHWC(TestMKLDNNWithValidPad):
+
     def init_test_case(self):
         super(TestMKLDNNWithValidPad_NHWC, self).init_test_case()
         self.data_format = "NHWC"
@@ -142,6 +150,7 @@ def init_test_case(self):
 
 class TestConv2DTransposeMKLDNNWithDilationsExplicitPad(
         TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.stride = [2, 1]
@@ -155,6 +164,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithGroups(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
@@ -165,6 +175,7 @@ def init_test_case(self):
 
 
 class TestMKLDNNWithGroups_NHWC(TestConv2DTransposeMKLDNNOp):
+
     def init_test_case(self):
         TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.pad = [1, 1]
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
index dcaee49558ba2..ae2abb18f13fa 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
@@ -20,6 +20,7 @@
 
 
 class TestMKLDNN(TestConv3DOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -27,6 +28,7 @@ def init_kernel_type(self):
 
 
 class TestMKLDNNCase1(TestCase1):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -34,6 +36,7 @@ def init_kernel_type(self):
 
 
 class TestMKLDNNGroup1(TestWithGroup1):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -41,6 +44,7 @@ def init_kernel_type(self):
 
 
 class TestMKLDNNGroup2(TestWithGroup2):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -48,6 +52,7 @@ def init_kernel_type(self):
 
 
 class TestMKLDNNWith1x1(TestWith1x1):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -55,6 +60,7 @@ def init_kernel_type(self):
 
 
 class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -62,6 +68,7 @@ def init_kernel_type(self):
 
 
 class TestConv3DOp_AsyPadding_MKLDNN(TestConv3DOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -73,6 +80,7 @@ def init_paddings(self):
 
 
 class TestConv3DOp_Same_MKLDNN(TestConv3DOp_AsyPadding_MKLDNN):
+
     def init_paddings(self):
         self.pad = [0, 0, 0]
         self.padding_algorithm = "SAME"
@@ -84,6 +92,7 @@ def init_kernel_type(self):
 
 
 class TestConv3DOp_Valid_MKLDNN(TestConv3DOp_AsyPadding_MKLDNN):
+
     def init_paddings(self):
         self.pad = [1, 1, 1]
         self.padding_algorithm = "VALID"
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
index fae52ab833b9d..fcd1f26d72cfc 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
@@ -21,6 +21,7 @@
 
 
 class TestDeQuantizeOp(OpTest):
+
     def setUp(self):
         self.op_type = 'dequantize'
         self.scale = 127.0
@@ -46,12 +47,12 @@ def prepare_input_output_bf16(self):
     def prepare_input_int8(self):
         if self.data_type == 'int8':
             # input data values are integers from interval [-128, 128)
-            self.input = (np.random.randint(0, 256, self.input_size) - 128
-                          ).astype(self.data_type)
+            self.input = (np.random.randint(0, 256, self.input_size) -
+                          128).astype(self.data_type)
         else:
             # input data values are integers from interval [0, 256)
-            self.input = (np.random.randint(
-                0, 256, self.input_size)).astype(self.data_type)
+            self.input = (np.random.randint(0, 256, self.input_size)).astype(
+                self.data_type)
 
         self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(self.input)}
         self.attrs = {'Scale': self.scale, 'Shift': self.shift}
@@ -88,6 +89,7 @@ def set_input_size(self):
 
 
 class TestDeQuantizeOp1(TestDeQuantizeOp):
+
     def set_scale(self):
         self.scale = 1.5
 
@@ -96,6 +98,7 @@ def set_data_type(self):
 
 
 class TestDeQuantizeOp2(TestDeQuantizeOp):
+
     def set_scale(self):
         self.scale = 0.8
 
@@ -104,6 +107,7 @@ def set_data_type(self):
 
 
 class TestDeQuantizeOpBf16(TestDeQuantizeOp):
+
     def set_scale(self):
         self.scale = 1.0
 
@@ -114,6 +118,7 @@ def set_data_type(self):
 # 2-dim input
 # P - positive input, with shift
 class TestDeQuantizeOpShift_2_P(TestDeQuantizeOp):
+
     def set_data_type(self):
         self.data_type = 'uint8'
 
@@ -130,6 +135,7 @@ def set_input_size(self):
 # 2-dim input
 # N - negative input, with shift
 class TestDeQuantizeOpShift_2_N(TestDeQuantizeOpShift_2_P):
+
     def set_data_type(self):
         self.data_type = 'int8'
 
@@ -145,22 +151,26 @@ def set_input_size(self):
 
 # 3-dim input
 class TestDeQuantizeOpShift_3_P(TestDeQuantizeOpShift_2_P):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4]
 
 
 class TestDeQuantizeOpShift_3_N(TestDeQuantizeOpShift_2_N):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4]
 
 
 # 4-dim input
 class TestDeQuantizeOpShift_4_P(TestDeQuantizeOpShift_2_P):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4, 5]
 
 
 class TestDeQuantizeOpShift_4_N(TestDeQuantizeOpShift_2_N):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4, 5]
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py
index 3a20ffde7a1b2..3a9f535a83336 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_bf16_mkldnn_op.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestElementwiseAddBf16MklDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_add"
         self.use_mkldnn = True
@@ -47,32 +48,30 @@ def test_check_output(self):
 
     # elementwise_add grad (no braodcasting) is just passing upper gradients to either X or Y or both
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["X", "Y"],
-            "Out",
-            check_dygraph=False,
-            user_defined_grads=[self.x, self.x],
-            user_defined_grad_outputs=[self.x_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["X", "Y"],
+                                   "Out",
+                                   check_dygraph=False,
+                                   user_defined_grads=[self.x, self.x],
+                                   user_defined_grad_outputs=[self.x_bf16])
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["Y"],
-            "Out",
-            check_dygraph=False,
-            user_defined_grads=[self.y],
-            user_defined_grad_outputs=[self.y_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["Y"],
+                                   "Out",
+                                   check_dygraph=False,
+                                   user_defined_grads=[self.y],
+                                   user_defined_grad_outputs=[self.y_bf16])
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["X"],
-            "Out",
-            check_dygraph=False,
-            user_defined_grads=[self.x],
-            user_defined_grad_outputs=[self.x_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["X"],
+                                   "Out",
+                                   check_dygraph=False,
+                                   user_defined_grads=[self.x],
+                                   user_defined_grad_outputs=[self.x_bf16])
+
 
+class TestElementwiseAddBroadCastingBf16MklDNNOp(TestElementwiseAddBf16MklDNNOp
+                                                 ):
 
-class TestElementwiseAddBroadCastingBf16MklDNNOp(
-        TestElementwiseAddBf16MklDNNOp):
     def generate_data(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(np.float32)
         self.y = np.random.uniform(1, 2, [100]).astype(np.float32)
@@ -90,9 +89,8 @@ def test_check_grad_normal(self):
             core.CPUPlace(), ["X", "Y"],
             "Out",
             check_dygraph=False,
-            user_defined_grads=[
-                self.x, self.compute_reduced_gradients(self.x)
-            ],
+            user_defined_grads=[self.x,
+                                self.compute_reduced_gradients(self.x)],
             user_defined_grad_outputs=[self.x_bf16])
 
     def test_check_grad_ingore_x(self):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
index 585ae38875cc7..2ae717d64a302 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
@@ -21,6 +21,7 @@
 
 
 class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -29,6 +30,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNElementwiseAddOp2(TestMKLDNNElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -36,6 +38,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseAddOp3(TestMKLDNNElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
         self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
@@ -43,6 +46,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseAddOp4(TestMKLDNNElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype)
@@ -57,6 +61,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
@@ -64,6 +69,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -74,6 +80,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestMKLDNNElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 12).astype(self.dtype)
         self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype)
@@ -99,6 +106,7 @@ def test_check_grad_ingore_x(self):
 @skip_check_grad_ci(
     reason="oneDNN's int8 elementwise_ops don't implemend grad kernel.")
 class TestInt8(TestElementwiseAddOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self._cpu_only = True
@@ -132,6 +140,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestInt8Scales(TestInt8):
+
     def quantize(self, tensor, dt="int8"):
         max_int = 127.0 if dt == "int8" else 255.0
         scale = max_int / np.abs(np.amax(tensor))
@@ -156,11 +165,12 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         self.init_scales()
         int_atol = 1  # different quantization techniques
-        self.check_output(
-            check_dygraph=(self.use_mkldnn == False), atol=int_atol)
+        self.check_output(check_dygraph=(self.use_mkldnn == False),
+                          atol=int_atol)
 
 
 class TestUint8Scales(TestInt8Scales):
+
     def init_input_output(self):
         self.x_f = np.random.random((100, )).astype("float")
         self.y_f = np.random.random((100, )).astype("float")
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py
index a3c41d2f03476..55b32e1088c06 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py
@@ -24,6 +24,7 @@
 @OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)),
                     "GPU is not supported")
 class TestMKLDNNElementwiseDivOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.init_dtype()
@@ -65,6 +66,7 @@ def test_check_output(self):
 
 
 class TestMKLDNNElementwiseDivOp2(TestMKLDNNElementwiseDivOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [100]).astype(self.dtype)
         self.y = np.random.uniform(0.1, 1, [100]).astype(self.dtype)
@@ -72,6 +74,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseDivOp3(TestMKLDNNElementwiseDivOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
         self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
@@ -79,6 +82,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseDivOp4(TestMKLDNNElementwiseDivOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype)
@@ -93,6 +97,7 @@ def test_check_grad_ignore_x(self):
 
 
 class TestMKLDNNElementwiseDivOp5(TestMKLDNNElementwiseDivOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
@@ -108,6 +113,7 @@ def test_check_grad_ignore_x(self):
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestBf16(TestMKLDNNElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.init_dtype()
@@ -134,24 +140,23 @@ def test_check_output(self):
         self.check_output_with_place(core.CPUPlace())
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["X", "Y"],
-            "Out",
-            user_defined_grads=[
-                np.divide(self.x, self.y), np.divide(
-                    (np.multiply(-self.x, self.x)), np.multiply(self.y, self.y))
-            ],
-            user_defined_grad_outputs=[self.x_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["X", "Y"],
+                                   "Out",
+                                   user_defined_grads=[
+                                       np.divide(self.x, self.y),
+                                       np.divide((np.multiply(-self.x, self.x)),
+                                                 np.multiply(self.y, self.y))
+                                   ],
+                                   user_defined_grad_outputs=[self.x_bf16])
 
     def test_check_grad_ignore_x(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["Y"],
-            "Out",
-            user_defined_grads=[
-                np.divide((np.multiply(-self.x, self.y)),
-                          np.multiply(self.y, self.y))
-            ],
-            user_defined_grad_outputs=[self.y_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["Y"],
+                                   "Out",
+                                   user_defined_grads=[
+                                       np.divide((np.multiply(-self.x, self.y)),
+                                                 np.multiply(self.y, self.y))
+                                   ],
+                                   user_defined_grad_outputs=[self.y_bf16])
 
     def test_check_grad_ignore_y(self):
         self.check_grad_with_place(
@@ -162,6 +167,7 @@ def test_check_grad_ignore_y(self):
 
 
 class TestBf16Broadcasting(TestBf16):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py
index b67ae17ba3a5a..232c1afef4dd2 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestElementwiseMulBf16MklDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.use_mkldnn = True
@@ -45,14 +46,14 @@ def test_check_output(self):
         self.check_output_with_place(core.CPUPlace())
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["X", "Y"],
-            "Out",
-            check_dygraph=False,
-            user_defined_grads=[
-                np.multiply(self.x, self.y), np.multiply(self.x, self.x)
-            ],
-            user_defined_grad_outputs=[self.x_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["X", "Y"],
+                                   "Out",
+                                   check_dygraph=False,
+                                   user_defined_grads=[
+                                       np.multiply(self.x, self.y),
+                                       np.multiply(self.x, self.x)
+                                   ],
+                                   user_defined_grad_outputs=[self.x_bf16])
 
     def test_check_grad_ingore_x(self):
         self.check_grad_with_place(
@@ -71,8 +72,9 @@ def test_check_grad_ingore_y(self):
             user_defined_grad_outputs=[self.x_bf16])
 
 
-class TestElementwiseMulBroadcastingBf16MklDNNOp(
-        TestElementwiseMulBf16MklDNNOp):
+class TestElementwiseMulBroadcastingBf16MklDNNOp(TestElementwiseMulBf16MklDNNOp
+                                                 ):
+
     def generate_data(self):
         self.x = np.random.uniform(1, 2, [1, 2, 3, 100]).astype(np.float32)
         self.y = np.random.uniform(1, 2, [100]).astype(np.float32)
@@ -85,7 +87,7 @@ def compute_reduced_gradients(self, out_grads):
         part_sum = np.add.reduceat(part_sum, [0], axis=2)
         return part_sum.flatten()
 
-    # TODO(jczaja): elementwise_mul bf16 grad got some potential 
+    # TODO(jczaja): elementwise_mul bf16 grad got some potential
     # accuracy problems that need to be explained
     def test_check_grad_normal(self):
         pass
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
index f2648e5b723ed..f369f8587b8a3 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
@@ -21,6 +21,7 @@
 
 
 class TestMKLDNNElementwiseMulOp(ElementwiseMulOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -29,6 +30,7 @@ def init_dtype(self):
 
 
 class TestMKLDNNElementwiseMulOp2(TestMKLDNNElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -36,6 +38,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseMulOp3(TestMKLDNNElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
         self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
@@ -43,6 +46,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseMulOp4(TestMKLDNNElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype)
@@ -57,6 +61,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestMKLDNNElementwiseMulOp5(TestMKLDNNElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
@@ -79,6 +84,7 @@ def test_check_grad_ingore_x(self):
 @skip_check_grad_ci(
     reason="oneDNN's int8 elementwise_ops don't implemend grad kernel.")
 class TestInt8(ElementwiseMulOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self._cpu_only = True
@@ -112,6 +118,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestInt8Scales(TestInt8):
+
     def quantize(self, tensor, dt="int8"):
         max_int = 127.0 if dt == "int8" else 255.0
         scale = max_int / np.abs(np.amax(tensor))
@@ -136,11 +143,12 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         self.init_scales()
         int_atol = 1  # different quantization techniques
-        self.check_output(
-            check_dygraph=(self.use_mkldnn == False), atol=int_atol)
+        self.check_output(check_dygraph=(self.use_mkldnn == False),
+                          atol=int_atol)
 
 
 class TestUint8Scales(TestInt8Scales):
+
     def init_input_output(self):
         self.x_f = np.random.random((100, )).astype("float")
         self.y_f = np.random.random((100, )).astype("float")
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py
index 62c8c9571b793..e70cc8e377967 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py
@@ -24,6 +24,7 @@
 @OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)),
                     "GPU is not supported")
 class TestMKLDNNElementwiseSubOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.init_dtype()
@@ -65,6 +66,7 @@ def test_check_output(self):
 
 
 class TestMKLDNNElementwiseSubOp2(TestMKLDNNElementwiseSubOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -72,6 +74,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseSubOp3(TestMKLDNNElementwiseSubOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
         self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
@@ -79,6 +82,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseSubOp4(TestMKLDNNElementwiseSubOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype)
@@ -86,6 +90,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
@@ -93,6 +98,7 @@ def init_input_output(self):
 
 
 class TestMKLDNNElementwiseSubOp_broadcast(TestMKLDNNElementwiseSubOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -103,6 +109,7 @@ def init_axis(self):
 
 
 class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 12).astype(self.dtype)
         self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype)
@@ -123,6 +130,7 @@ def test_check_grad_ignore_x(self):
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestBf16(TestMKLDNNElementwiseSubOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.init_dtype()
@@ -149,28 +157,26 @@ def test_check_output(self):
         self.check_output_with_place(core.CPUPlace())
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["X", "Y"],
-            "Out",
-            user_defined_grads=[self.x, -self.x],
-            user_defined_grad_outputs=[self.x_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["X", "Y"],
+                                   "Out",
+                                   user_defined_grads=[self.x, -self.x],
+                                   user_defined_grad_outputs=[self.x_bf16])
 
     def test_check_grad_ignore_x(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["Y"],
-            "Out",
-            user_defined_grads=[-self.y],
-            user_defined_grad_outputs=[self.y_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["Y"],
+                                   "Out",
+                                   user_defined_grads=[-self.y],
+                                   user_defined_grad_outputs=[self.y_bf16])
 
     def test_check_grad_ignore_y(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["X"],
-            "Out",
-            user_defined_grads=[self.x],
-            user_defined_grad_outputs=[self.x_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ["X"],
+                                   "Out",
+                                   user_defined_grads=[self.x],
+                                   user_defined_grad_outputs=[self.x_bf16])
 
 
 class TestBf16Broadcasting(TestBf16):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
         self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
@@ -186,9 +192,8 @@ def test_check_grad_normal(self):
         self.check_grad_with_place(
             core.CPUPlace(), ["X", "Y"],
             "Out",
-            user_defined_grads=[
-                self.x, self.compute_reduced_gradients(self.x)
-            ],
+            user_defined_grads=[self.x,
+                                self.compute_reduced_gradients(self.x)],
             user_defined_grad_outputs=[self.x_bf16])
 
     def test_check_grad_ignore_x(self):
@@ -200,6 +205,7 @@ def test_check_grad_ignore_x(self):
 
 
 class TestInt8(TestMKLDNNElementwiseSubOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
         self._cpu_only = True
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
index 6229b7f559b16..b179571e8f015 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
@@ -24,6 +24,7 @@
 @OpTestTool.skip_if(core.is_compiled_with_cuda(),
                     "CUDA required dygraph so oneDNN UT must be skipped")
 class TestExpandV2OneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "expand_v2"
         self.init_data()
@@ -53,6 +54,7 @@ def test_check_grad(self):
 
 
 class TestExpandV2ExpandDimOneDNNOp(TestExpandV2OneDNNOp):
+
     def init_data(self):
         self.ori_shape = [120]
         self.shape = [2, 120]
@@ -60,6 +62,7 @@ def init_data(self):
 
 
 class TestExpandV2CopyScenarioOneDNNOp(TestExpandV2OneDNNOp):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.shape = (2, 10, 5)
@@ -67,6 +70,7 @@ def init_data(self):
 
 
 class TestExpandV2CopyScenarioShapeNotGivenOneDNNOp(TestExpandV2OneDNNOp):
+
     def init_data(self):
         self.ori_shape = (2, 4, 5, 7)
         self.shape = (-1, -1, -1, -1)
@@ -74,6 +78,7 @@ def init_data(self):
 
 
 class TestExpandV2ExpandShapesTensor1OneDNNOp(TestExpandV2OneDNNOp):
+
     def init_data(self):
         self.ori_shape = [100, 1]
         self.expand_times = [1, 2]
@@ -93,6 +98,7 @@ def set_additional_inputs(self):
 
 class TestExpandV2ExpandShapesTensor2OneDNNOp(
         TestExpandV2ExpandShapesTensor1OneDNNOp):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.expand_times = [1, 1]
@@ -101,6 +107,7 @@ def init_data(self):
 
 
 class TestExpandV2ShapesTensorOneDNNOp(TestExpandV2OneDNNOp):
+
     def init_data(self):
         self.ori_shape = [100]
         self.expand_times = [2, 1]
@@ -113,8 +120,10 @@ def set_additional_inputs(self):
 
 #   BF16 TESTS
 def create_expand_v2_bf16_test_class(parent):
+
     @OpTestTool.skip_if_not_cpu_bf16()
     class TestExpandV2BF16OneDNNOp(parent):
+
         def set_inputs(self):
             self.attrs['mkldnn_data_type'] = 'bfloat16'
             self.inputs = {"X": convert_float_to_uint16(self.x)}
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_bf16_mkldnn_op.py
index 1104372c74148..0cb069dd14b14 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_bf16_mkldnn_op.py
@@ -27,6 +27,7 @@ def fully_connected_naive(input, weights, bias_data):
 
 
 class MatrixGenerate:
+
     def __init__(self, mb, ic, oc, h, w):
         self.input = np.random.random((mb, ic * h * w)).astype(np.float32)
         self.weights = np.random.random((ic * h * w, oc)).astype(np.float32)
@@ -35,6 +36,7 @@ def __init__(self, mb, ic, oc, h, w):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestFcBf16MklDNNOp(OpTest):
+
     def generate_data(self):
         self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
         self.bias = np.random.random(15).astype("float32")
@@ -75,6 +77,7 @@ def test_check_grad_no_weight(self):
 
 
 class TestFCMKLDNNOp1(TestFcBf16MklDNNOp):
+
     def generate_data(self):
         self.matrix = MatrixGenerate(2, 15, 48, 2, 2)
         self.bias = np.random.random(48).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
index e96b8cf8191e3..84de7246965f8 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
@@ -25,12 +25,14 @@ def fully_connected_naive(input, weights, bias_data):
 
 
 class MatrixGenerate:
+
     def __init__(self, mb, ic, oc, h, w):
         self.input = np.random.random((mb, ic * h * w)).astype("float32")
         self.weights = np.random.random((ic * h * w, oc)).astype("float32")
 
 
 class TestFCMKLDNNOp(OpTest):
+
     def create_data(self):
         self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
         self.bias = np.random.random(15).astype("float32")
@@ -49,8 +51,9 @@ def setUp(self):
         self.attrs = {'use_mkldnn': self.use_mkldnn}
 
         self.outputs = {
-            'Out': fully_connected_naive(self.matrix.input, self.matrix.weights,
-                                         self.bias)
+            'Out':
+            fully_connected_naive(self.matrix.input, self.matrix.weights,
+                                  self.bias)
         }
 
     def test_check_output(self):
@@ -65,6 +68,7 @@ def test_check_grad_no_weight(self):
 
 
 class TestFCMKLDNNOp1(TestFCMKLDNNOp):
+
     def create_data(self):
         self.matrix = MatrixGenerate(2, 15, 48, 2, 2)
         self.bias = np.random.random(48).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py
index d729efbb0fb60..27400abcf7f83 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestFillConstant2DOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "fill_constant"
         self.dtype = np.float32
@@ -63,14 +64,16 @@ def test_check_output(self):
         self.check_output()
 
 
-class TestFillZerosLike4DShapeTensorPriorityOneDNNOp(
-        TestFillConstant2DOneDNNOp):
+class TestFillZerosLike4DShapeTensorPriorityOneDNNOp(TestFillConstant2DOneDNNOp
+                                                     ):
+
     def set_inputs(self):
         self.inputs = {'ShapeTensor': np.array([5, 6, 7, 8]).astype("int32")}
 
 
 class TestFillZerosLike4DShapeTensorListPriorityOneDNNOp(
         TestFillConstant2DOneDNNOp):
+
     def set_inputs(self):
         shape = (4, 5, 6, 7)
         self.shape_tensor_list = []
@@ -82,13 +85,15 @@ def set_inputs(self):
 
 
 class TestFillZerosLike2DStringValueInfOneDNNOp(TestFillConstant2DOneDNNOp):
+
     def set_attrs(self):
         self.str_value = "inf"
         self.attrs = {'shape': (10, 13), 'use_mkldnn': True, 'str_value': "inf"}
 
 
-class TestFillZerosLike2DStringValueMinusInfOneDNNOp(
-        TestFillConstant2DOneDNNOp):
+class TestFillZerosLike2DStringValueMinusInfOneDNNOp(TestFillConstant2DOneDNNOp
+                                                     ):
+
     def set_attrs(self):
         self.str_value = "-inf"
         self.attrs = {
@@ -99,6 +104,7 @@ def set_attrs(self):
 
 
 class TestFillZerosLike2DStringValueFloatOneDNNOp(TestFillConstant2DOneDNNOp):
+
     def set_attrs(self):
         self.str_value = "0.123"
         self.attrs = {
@@ -110,6 +116,7 @@ def set_attrs(self):
 
 class TestFillZerosLike2DValueTensorPriorityOneDNNOp(
         TestFillZerosLike2DStringValueFloatOneDNNOp):
+
     def set_inputs(self):
         self.inputs = {'ValueTensor': np.atleast_1d(2.25).astype("float32")}
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py
index 4e52b7b08cf32..a8d2d42ebe382 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py
@@ -23,6 +23,7 @@
 
 
 class TestFlagsUseMkldnn(unittest.TestCase):
+
     def setUp(self):
         self._python_interp = sys.executable
         self._python_interp += " check_flags_mkldnn_ops_on_off.py"
@@ -38,11 +39,10 @@ def setUp(self):
     def flags_use_mkl_dnn_common(self, e):
         cmd = self._python_interp
         env = dict(self.env, **e)
-        proc = subprocess.Popen(
-            cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env)
+        proc = subprocess.Popen(cmd.split(" "),
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                env=env)
 
         out, err = proc.communicate()
         returncode = proc.returncode
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py
index 0974d6357fcda..d86a9467053f7 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py
@@ -23,6 +23,7 @@
 
 
 class TestFlagsUseMkldnn(unittest.TestCase):
+
     def setUp(self):
         self._python_interp = sys.executable
         self._python_interp += " check_flags_use_mkldnn.py"
@@ -47,11 +48,10 @@ def found(self, regex, out, err):
     def test_flags_use_mkl_dnn(self):
         cmd = self._python_interp
 
-        proc = subprocess.Popen(
-            cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=self.env)
+        proc = subprocess.Popen(cmd.split(" "),
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                env=self.env)
 
         out, err = proc.communicate()
         returncode = proc.returncode
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py
index c01f244004eff..dc750335ea576 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py
@@ -24,6 +24,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestFlattenOneDNNOp(OpTest):
+
     def setUp(self):
         self.set_op_type()
         self.init_test_case()
@@ -51,6 +52,7 @@ def init_test_case(self):
 
 
 class TestFlattenOneDNNOp1(TestFlattenOneDNNOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 2, 10)
         self.axis = 0
@@ -58,6 +60,7 @@ def init_test_case(self):
 
 
 class TestFlattenOneDNNOpSixDims(TestFlattenOneDNNOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
@@ -65,23 +68,28 @@ def init_test_case(self):
 
 
 class TestFlatten2OneDNNOp(TestFlattenOneDNNOp):
+
     def set_op_type(self):
         self.op_type = "flatten2"
 
 
 class TestFlatten2OneDNNOp1(TestFlattenOneDNNOp1):
+
     def set_op_type(self):
         self.op_type = "flatten2"
 
 
 class TestFlatten2OneDNNOpSixDims(TestFlattenOneDNNOpSixDims):
+
     def set_op_type(self):
         self.op_type = "flatten2"
 
 
 #   BF16 TESTS
 def create_flatten_bf16_test_classes(parent):
+
     class TestFlatten2BF16OneDNNOp(parent):
+
         def set_inputs(self):
             self.dtype = np.uint16
             self.inputs = {
@@ -93,22 +101,22 @@ def calculate_grads(self):
             self.dx = np.reshape(self.dout, self.ori_shape)
 
         def test_check_output(self):
-            self.check_output_with_place(
-                core.CPUPlace(), no_check_set=["XShape"])
+            self.check_output_with_place(core.CPUPlace(),
+                                         no_check_set=["XShape"])
 
         def test_check_grad(self):
             self.calculate_grads()
-            self.check_grad_with_place(
-                core.CPUPlace(), ["X"],
-                "Out",
-                user_defined_grads=[self.dx],
-                user_defined_grad_outputs=[self.dout])
+            self.check_grad_with_place(core.CPUPlace(), ["X"],
+                                       "Out",
+                                       user_defined_grads=[self.dx],
+                                       user_defined_grad_outputs=[self.dout])
 
     cls_name = "{0}_{1}".format(parent.__name__, "Flatten2_BF16")
     TestFlatten2BF16OneDNNOp.__name__ = cls_name
     globals()[cls_name] = TestFlatten2BF16OneDNNOp
 
     class TestFlattenBF16OneDNNOp(parent):
+
         def set_op_type(self):
             self.dtype = np.uint16
             self.op_type = "flatten"
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
index ef26a27d05e1b..b4b30d1dbcaa4 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
@@ -26,6 +26,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestFusionGRUBF16MKLDNNOp(OpTest):
+
     def set_confs(self):
         pass
 
@@ -76,10 +77,11 @@ def setUp(self):
             N, self.D).astype('float32') if self.with_h0 else np.zeros(
                 (N, self.D), dtype='float32')
 
-        _, _, _, hidden = fusion_gru(
-            x_fp32, self.lod, h0_fp32, wx_fp32, wh_fp32, bias, self.is_reverse,
-            self.origin_mode, ACTIVATION[self.act_state],
-            ACTIVATION[self.act_gate])
+        _, _, _, hidden = fusion_gru(x_fp32, self.lod, h0_fp32, wx_fp32,
+                                     wh_fp32, bias, self.is_reverse,
+                                     self.origin_mode,
+                                     ACTIVATION[self.act_state],
+                                     ACTIVATION[self.act_gate])
 
         hidden_bf16 = convert_float_to_uint16(hidden)
 
@@ -121,16 +123,19 @@ def setUp(self):
 
 
 class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUBF16MKLDNNOp):
+
     def set_confs(self):
         self.origin_mode = False
 
 
 class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp):
+
     def set_confs(self):
         self.with_bias = False
 
 
 class TestFusionGRUINT8MKLDNNBF16WeightsOp(TestFusionGRUBF16MKLDNNOp):
+
     def set_confs(self):
         self.weights_dtype = 'bf16'
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
index 4fda51e9e05f4..fee53dc348366 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
@@ -20,6 +20,7 @@
 
 
 class TestFusionGRUINT8MKLDNNOp(OpTest):
+
     def set_confs(self):
         pass
 
@@ -62,20 +63,19 @@ def setUp(self):
         # Scales shape in oneDNN:   [3, OC]
         s8_max = 127.0
         scale_ur = s8_max / np.max(np.abs(
-            np.concatenate(
-                [
-                    wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC]
-                    .reshape(self.OC, 2 * self.OC)
-                ],
-                axis=0)),
+            np.concatenate([
+                wx[:, :2 * self.OC],
+                wh.flatten()[:2 * self.OC * self.OC].reshape(
+                    self.OC, 2 * self.OC)
+            ],
+                           axis=0)),
                                    axis=0)
         scale_o = s8_max / np.max(np.abs(
-            np.concatenate(
-                [
-                    wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:]
-                    .reshape(self.OC, self.OC)
-                ],
-                axis=0)),
+            np.concatenate([
+                wx[:, 2 * self.OC:],
+                wh.flatten()[2 * self.OC * self.OC:].reshape(self.OC, self.OC)
+            ],
+                           axis=0)),
                                   axis=0)
 
         scale_weights = np.concatenate([scale_ur, scale_o]).astype('float')
@@ -128,21 +128,25 @@ def test_check_output(self):
 
 
 class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUINT8MKLDNNOp):
+
     def set_confs(self):
         self.force_fp32_output = False
 
 
 class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUINT8MKLDNNOp):
+
     def set_confs(self):
         self.origin_mode = False
 
 
 class TestFusionGRUINT8MKLDNNOp4(TestFusionGRUINT8MKLDNNOp):
+
     def set_confs(self):
         self.with_bias = False
 
 
 class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
+
     def set_confs(self):
         self.with_h0 = False
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py
index 3c70380493d9a..2910a2c05c310 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py
@@ -18,35 +18,41 @@
 
 
 class TestFusionGRUMKLDNNOp(TestFusionGRUOp):
+
     def set_confs(self):
         self.use_mkldnn = True
 
 
 class TestFusionGRUMKLDNNOpNoInitial(TestFusionGRUOp):
+
     def set_confs(self):
         self.with_h0 = False
         self.use_mkldnn = True
 
 
 class TestFusionGRUMKLDNNOpNoBias(TestFusionGRUOp):
+
     def set_confs(self):
         self.with_bias = False
         self.use_mkldnn = True
 
 
 class TestFusionGRUMKLDNNOpReverse(TestFusionGRUOp):
+
     def set_confs(self):
         self.is_reverse = True
         self.use_mkldnn = True
 
 
 class TestFusionGRUMKLDNNOpOriginMode(TestFusionGRUOp):
+
     def set_confs(self):
         self.origin_mode = True
         self.use_mkldnn = True
 
 
 class TestFusionGRUMKLDNNOpMD1(TestFusionGRUOp):
+
     def set_confs(self):
         self.M = 36
         self.D = 8
@@ -54,6 +60,7 @@ def set_confs(self):
 
 
 class TestFusionGRUMKLDNNOpMD2(TestFusionGRUOp):
+
     def set_confs(self):
         self.M = 8
         self.D = 8
@@ -61,6 +68,7 @@ def set_confs(self):
 
 
 class TestFusionGRUMKLDNNOpMD3(TestFusionGRUOp):
+
     def set_confs(self):
         self.M = 17
         self.D = 15
@@ -68,6 +76,7 @@ def set_confs(self):
 
 
 class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):
+
     def set_confs(self):
         self.lod = [[3]]
         self.D = 16
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
index d07eda3259960..e094f8a844fe4 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
@@ -26,14 +26,16 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestFusionLSTMBF16ONEDNNOp(OpTest):
+
     def set_confs(self):
         pass
 
     def test_check_output(self):
         for use_seq in {True, False}:
             self.attrs['use_seq'] = use_seq
-            self.check_output(
-                check_dygraph=False, no_check_set=["Cell"], atol=2e-2)
+            self.check_output(check_dygraph=False,
+                              no_check_set=["Cell"],
+                              atol=2e-2)
 
     def setUp(self):
         self.op_type = 'fusion_lstm'
@@ -137,21 +139,25 @@ def setUp(self):
 
 
 class TestFusionLSTMBF16ONEDNNPeepholesOp(TestFusionLSTMBF16ONEDNNOp):
+
     def set_confs(self):
         self.use_peepholes = True
 
 
 class TestFusionLSTMBF16ONEDNNInitializedStateOp(TestFusionLSTMBF16ONEDNNOp):
+
     def set_confs(self):
         self.has_initial_state = True
 
 
 class TestFusionLSTMBF16ONEDNNReverseOp(TestFusionLSTMBF16ONEDNNOp):
+
     def set_confs(self):
         self.is_reverse = True
 
 
 class TestFusionLSTMBF16ONEDNNBF16WeightsOp(TestFusionLSTMBF16ONEDNNOp):
+
     def set_confs(self):
         self.weights_dtype = 'bf16'
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py
index 12f8c01783d9c..8d3b4db171487 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py
@@ -19,6 +19,7 @@
 
 
 class TestFusionLSTMINT8MKLDNNOp(OpTest):
+
     def set_confs(self):
         pass
 
@@ -58,8 +59,7 @@ def setUp(self):
         s8_max = 127.0
 
         scale_weights = s8_max / np.max(
-            np.abs(np.concatenate(
-                [wx[:, :], wh[:, :]], axis=0)), axis=0)
+            np.abs(np.concatenate([wx[:, :], wh[:, :]], axis=0)), axis=0)
 
         scale_weights = scale_weights.astype('float')
 
@@ -80,10 +80,11 @@ def setUp(self):
             h0 = np.zeros((N, self.OC)).astype('float32')
             c0 = np.zeros((N, self.OC)).astype('float32')
 
-        hidden_f32, c = fusion_lstm(
-            x_f32, self.lod, wx, bx, h0, c0, wh, w_b, w_c, self.is_reverse,
-            ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
-            ACTIVATION[self.act_cand])
+        hidden_f32, c = fusion_lstm(x_f32, self.lod, wx, bx, h0, c0, wh, w_b,
+                                    w_c, self.is_reverse,
+                                    ACTIVATION[self.act_gate],
+                                    ACTIVATION[self.act_cell],
+                                    ACTIVATION[self.act_cand])
 
         self.inputs = {
             'X': (x_u8, self.lod),
@@ -128,23 +129,25 @@ def setUp(self):
     def test_check_output(self):
         for use_seq in {True, False}:
             self.attrs['use_seq'] = use_seq
-            self.check_output(
-                check_dygraph=False,
-                no_check_set=["Cell"],
-                atol=self.error_margin)
+            self.check_output(check_dygraph=False,
+                              no_check_set=["Cell"],
+                              atol=self.error_margin)
 
 
 class TestFusionLSTMINT8MKLDNNOp2(TestFusionLSTMINT8MKLDNNOp):
+
     def set_confs(self):
         self.force_fp32_output = True
 
 
 class TestFusionLSTMINT8MKLDNNOp4(TestFusionLSTMINT8MKLDNNOp):
+
     def set_confs(self):
         self.is_reverse = True
 
 
 class TestFusionLSTMINT8MKLDNNOp5(TestFusionLSTMINT8MKLDNNOp):
+
     def set_confs(self):
         self.has_initial_state = True
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py
index 9988a033a7d89..6c48ba9b46a7b 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_mkldnn_op.py
@@ -18,6 +18,7 @@
 
 
 class TestFusionLSTMONEDNNOp(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_mkldnn = True
 
@@ -28,12 +29,14 @@ def test_check_output(self):
 
 
 class TestFusionLSTMONEDNNOpReverse(TestFusionLSTMONEDNNOp):
+
     def set_conf(self):
         self.is_reverse = True
         self.use_mkldnn = True
 
 
 class TestFusionLSTMONEDNNOpInitReverse(TestFusionLSTMONEDNNOp):
+
     def set_conf(self):
         self.has_initial_state = True
         self.is_reverse = True
@@ -41,6 +44,7 @@ def set_conf(self):
 
 
 class TestFusionLSTMONEDNNOpMD1(TestFusionLSTMONEDNNOp):
+
     def set_conf(self):
         self.M = 36
         self.D = 8
@@ -48,6 +52,7 @@ def set_conf(self):
 
 
 class TestFusionLSTMONEDNNOpMD2(TestFusionLSTMONEDNNOp):
+
     def set_conf(self):
         self.M = 8
         self.D = 8
@@ -55,6 +60,7 @@ def set_conf(self):
 
 
 class TestFusionLSTMONEDNNOpMD3(TestFusionLSTMONEDNNOp):
+
     def set_conf(self):
         self.M = 15
         self.D = 3
@@ -62,6 +68,7 @@ def set_conf(self):
 
 
 class TestFusionLSTMONEDNNOpBS1(TestFusionLSTMONEDNNOp):
+
     def set_conf(self):
         self.lod = [[3]]
         self.D = 16
@@ -69,6 +76,7 @@ def set_conf(self):
 
 
 class TestFusionLSTMONEDNNOpPeepholesInit(TestFusionLSTMONEDNNOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.has_initial_state = True
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
index a65efa6deb01f..b0b9ddf879af6 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
@@ -20,11 +20,13 @@
 
 
 class TestMKLDNNGaussianRandomOpSeed10(TestGaussianRandomOp):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestMKLDNNGaussianRandomOpSeed0(TestGaussianRandomOp):
+
     def setUp(self):
         TestGaussianRandomOp.setUp(self)
         self.use_mkldnn = True
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py
index dc881a5752112..2cad7cd8cc79b 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_bf16_mkldnn_op.py
@@ -36,10 +36,10 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestLayerNormBF16MKLDNNOp(TestLayerNormMKLDNNOp):
+
     def __assert_close(self, tensor, np_array, msg, rtol=2e-02, atol=2):
         self.assertTrue(
-            np.allclose(
-                np.array(tensor), np_array, rtol=rtol, atol=atol), msg)
+            np.allclose(np.array(tensor), np_array, rtol=rtol, atol=atol), msg)
 
     def check_forward(self,
                       shape,
@@ -83,15 +83,13 @@ def check_forward(self,
             # scale and bias are fp32 and other vars are of bf16
             for name in ground_truth:
                 if name == 'x_bf16' or name == 'y_bf16':
-                    block.create_var(
-                        name=name,
-                        dtype='uint16',
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype='uint16',
+                                     shape=ground_truth[name].shape)
                 else:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
 
             inputs = {"X": block.var('x_bf16')}
             if with_scale_bias:
@@ -130,8 +128,9 @@ def check_forward(self,
                 self.__assert_close(variance, out[2], "variance", 1e-3)
 
     def test_check_forward_with_is_test(self):
-        self.check_forward(
-            shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True)
+        self.check_forward(shape=[2, 3, 4, 5],
+                           begin_norm_axis=3,
+                           with_is_test=True)
 
     # TODO (jczaja): Enable those to test when enabling training using bf16
     def test_check_forward_with_scale_and_bias(self):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py
index d20fb003ee93b..d36b5cc9e6413 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_layer_norm_mkldnn_op.py
@@ -52,6 +52,7 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
 
 
 class TestLayerNormMKLDNNOp(unittest.TestCase):
+
     def setUp(self):
         self.use_mkldnn = True
 
@@ -95,8 +96,9 @@ def check_forward(self,
             block = program.global_block()
 
             for name in ground_truth:
-                block.create_var(
-                    name=name, dtype='float32', shape=ground_truth[name].shape)
+                block.create_var(name=name,
+                                 dtype='float32',
+                                 shape=ground_truth[name].shape)
 
             inputs = {"X": block.var('x')}
             if with_scale_bias:
@@ -138,12 +140,14 @@ def test_check_forward_with_scale_and_bias(self):
         self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
     def test_check_forward_without_scale_and_bias(self):
-        self.check_forward(
-            shape=[2, 3, 4, 5], begin_norm_axis=3, with_scale_bias=False)
+        self.check_forward(shape=[2, 3, 4, 5],
+                           begin_norm_axis=3,
+                           with_scale_bias=False)
 
     def test_check_forward_with_is_test(self):
-        self.check_forward(
-            shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True)
+        self.check_forward(shape=[2, 3, 4, 5],
+                           begin_norm_axis=3,
+                           with_is_test=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py
index 7477eaf3339b2..89de5198101c2 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestLogSoftmaxOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = 'log_softmax'
         self.set_dtype()
@@ -52,38 +53,45 @@ def test_check_output(self):
 
 
 class TestLogSoftmax1DOneDNNOp(TestLogSoftmaxOneDNNOp):
+
     def set_shape(self):
         self.shape = [100]
 
 
 class TestLogSoftmax3DOneDNNOp(TestLogSoftmaxOneDNNOp):
+
     def set_shape(self):
         self.shape = [12, 10, 3]
 
 
 class TestLogSoftmax5DOneDNNOp(TestLogSoftmaxOneDNNOp):
+
     def set_shape(self):
         self.shape = [2, 3, 4, 5, 6]
 
 
 class TestLogSoftmaxPositiveAxisOneDNNOp(TestLogSoftmaxOneDNNOp):
+
     def set_axis(self):
         self.axis = 2
 
 
 # BF16 TESTS
 class TestLogSoftmax1DBF16OneDNNOp(TestLogSoftmax1DOneDNNOp):
+
     def set_dtype(self):
         self.dtype = np.uint16
 
 
-class TestLogSoftmaxPositiveAxisBF16OneDNNOp(
-        TestLogSoftmaxPositiveAxisOneDNNOp):
+class TestLogSoftmaxPositiveAxisBF16OneDNNOp(TestLogSoftmaxPositiveAxisOneDNNOp
+                                             ):
+
     def set_dtype(self):
         self.dtype = np.uint16
 
 
 class TestLogSoftmax5DBF16OneDNNOp(TestLogSoftmax5DOneDNNOp):
+
     def set_shape(self):
         self.shape = [2, 3, 4, 5, 6]
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
index 088b4fb59057b..9941f567af2fe 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
@@ -20,6 +20,7 @@
 
 
 class TestLRNMKLDNNOp(TestLRNOp):
+
     def get_attrs(self):
         attrs = TestLRNOp.get_attrs(self)
         attrs['use_mkldnn'] = True
@@ -28,26 +29,33 @@ def get_attrs(self):
     def test_check_output(self):
         # We cannot validate MidOut as LRN REF has diffrent meaning in it
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(
-            atol=0.002, no_check_set=['MidOut'], check_dygraph=False)
+        self.check_output(atol=0.002,
+                          no_check_set=['MidOut'],
+                          check_dygraph=False)
 
     def test_check_grad_normal(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.01, check_dygraph=False)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.01,
+                        check_dygraph=False)
 
 
 class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
+
     def get_attrs(self):
         attrs = TestLRNMKLDNNOp.get_attrs(self)
         attrs['is_test'] = True
         return attrs
 
     def test_check_grad_normal(self):
+
         def check_raise_is_test():
             try:
-                self.check_grad(
-                    ['X'], 'Out', max_relative_error=0.01, check_dygraph=False)
+                self.check_grad(['X'],
+                                'Out',
+                                max_relative_error=0.01,
+                                check_dygraph=False)
             except Exception as e:
                 t = \
                 "is_test attribute should be set to False in training phase."
@@ -58,6 +66,7 @@ def check_raise_is_test():
 
 
 class TestLRNMKLDNNOpNHWC(TestLRNMKLDNNOp):
+
     def init_test_case(self):
         self.data_format = 'NHWC'
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py
index dba63be27b438..a16a5f3fdff20 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_bf16_mkldnn_op.py
@@ -25,6 +25,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestMatmulBf16MklDNNOp(OpTest):
+
     def generate_data(self):
         self.x_fp32 = np.random.random((25, 2, 2)).astype(np.float32)
         self.y_fp32 = np.random.random((25, 2, 2)).astype(np.float32)
@@ -83,10 +84,10 @@ def calculate_grads(self):
         x_transpose_axes = [1, 0] if self.x_fp32.ndim == 2 else [0, 2, 1]
         y_transpose_axes = [1, 0] if self.y_fp32.ndim == 2 else [0, 2, 1]
 
-        x = np.transpose(self.x_fp32, x_transpose_axes) if self.attrs[
-            'transpose_X'] is True else self.x_fp32
-        y = np.transpose(self.y_fp32, y_transpose_axes) if self.attrs[
-            'transpose_Y'] is True else self.y_fp32
+        x = np.transpose(self.x_fp32, x_transpose_axes
+                         ) if self.attrs['transpose_X'] is True else self.x_fp32
+        y = np.transpose(self.y_fp32, y_transpose_axes
+                         ) if self.attrs['transpose_Y'] is True else self.y_fp32
 
         dout = self.alpha * np.matmul(x, y)
 
@@ -110,6 +111,7 @@ def calculate_grads(self):
 
 
 class TestDnnlMatMulOpAlpha(TestMatmulBf16MklDNNOp):
+
     def generate_data(self):
         self.x_fp32 = np.random.random((17, 2, 3)).astype(np.float32)
         self.y_fp32 = np.random.random((17, 3, 2)).astype(np.float32)
@@ -118,6 +120,7 @@ def generate_data(self):
 
 
 class TestDnnlMatMulOp2D(TestMatmulBf16MklDNNOp):
+
     def generate_data(self):
         self.x_fp32 = np.random.random((12, 9)).astype(np.float32)
         self.y_fp32 = np.random.random((9, 12)).astype(np.float32)
@@ -125,6 +128,7 @@ def generate_data(self):
 
 
 class TestDnnlMatMulOpTransposeX(TestMatmulBf16MklDNNOp):
+
     def generate_data(self):
         self.x_fp32 = np.random.random((12, 9)).astype(np.float32)
         self.y_fp32 = np.random.random((12, 9)).astype(np.float32)
@@ -140,6 +144,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpTransposeY(TestMatmulBf16MklDNNOp):
+
     def generate_data(self):
         self.x_fp32 = np.random.random((12, 9)).astype(np.float32)
         self.y_fp32 = np.random.random((12, 9)).astype(np.float32)
@@ -155,6 +160,7 @@ def set_attributes(self):
 
 
 class TestMatmulBf16MklDNNForceFp32Output(TestMatmulBf16MklDNNOp):
+
     def generate_data(self):
         self.x_fp32 = np.random.random((12, 9)).astype(np.float32)
         self.y_fp32 = np.random.random((9, 12)).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
index 634288c3e875b..af838d7826edd 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py
@@ -20,6 +20,7 @@
 
 
 class TestDnnlMatMulOp(OpTest):
+
     def generate_data(self):
         self.x = np.random.random((25, 2, 2)).astype("float32")
         self.y = np.random.random((25, 2, 2)).astype("float32")
@@ -48,11 +49,13 @@ def test_check_output(self):
 
 
 class TestDnnlMatMulWithGradOp(TestDnnlMatMulOp):
+
     def test_check_grad(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2)
 
 
 class TestDnnlMatMulOpMixedDims1(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((17, 2, 3)).astype("float32")
         self.y = np.random.random((3, 4)).astype("float32")
@@ -60,6 +63,7 @@ def generate_data(self):
 
 
 class TestDnnlMatMulOpMixedDimsYWiderTransposeY(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((8, 2, 3)).astype("float32")
         self.y = np.random.random((4, 3)).astype("float32")
@@ -70,6 +74,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpMixedDimsYWiderTransposeX(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((8, 3, 2)).astype("float32")
         self.y = np.random.random((3, 4)).astype("float32")
@@ -80,28 +85,31 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpMixedDimsXWiderTransposeXY(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((8, 3, 2)).astype("float32")
         self.y = np.random.random((4, 3)).astype("float32")
-        self.out = np.matmul(
-            np.transpose(self.x, (0, 2, 1)), np.transpose(self.y))
+        self.out = np.matmul(np.transpose(self.x, (0, 2, 1)),
+                             np.transpose(self.y))
 
     def set_attributes(self):
         self.attrs = {'transpose_X': True, 'transpose_Y': True}
 
 
 class TestDnnlMatMulOpMixedDimsYWiderTransposeXY(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((3, 2)).astype("float32")
         self.y = np.random.random((8, 4, 3)).astype("float32")
-        self.out = np.matmul(
-            np.transpose(self.x), np.transpose(self.y, (0, 2, 1)))
+        self.out = np.matmul(np.transpose(self.x),
+                             np.transpose(self.y, (0, 2, 1)))
 
     def set_attributes(self):
         self.attrs = {'transpose_X': True, 'transpose_Y': True}
 
 
 class TestDnnlMatMulOpMixedDimsXWiderTransposeX(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((5, 4)).astype("float32")
         self.y = np.random.random((8, 5, 4)).astype("float32")
@@ -112,6 +120,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpVectorMultiply(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((5)).astype("float32")
         self.y = np.random.random((5)).astype("float32")
@@ -119,6 +128,7 @@ def generate_data(self):
 
 
 class TestDnnlMatMulOpVectorMultiplyTranspose(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((5)).astype("float32")
         x_resized = np.copy(self.x)
@@ -133,6 +143,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpMixedDims2(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((2, 3)).astype("float32")
         self.y = np.random.random((17, 3, 4)).astype("float32")
@@ -140,6 +151,7 @@ def generate_data(self):
 
 
 class TestDnnlMatMulOpAlpha(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((17, 2, 3)).astype("float32")
         self.y = np.random.random((17, 3, 2)).astype("float32")
@@ -148,6 +160,7 @@ def generate_data(self):
 
 
 class TestDnnlMatMulOp2D(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((12, 9)).astype("float32")
         self.y = np.random.random((9, 12)).astype("float32")
@@ -155,6 +168,7 @@ def generate_data(self):
 
 
 class TestDnnlMatMulOpTransposeX(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((12, 9)).astype("float32")
         self.y = np.random.random((12, 9)).astype("float32")
@@ -165,6 +179,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpTransposeY(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((12, 9)).astype("float32")
         self.y = np.random.random((12, 9)).astype("float32")
@@ -175,6 +190,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpTransposeY3D(TestDnnlMatMulWithGradOp):
+
     def generate_data(self):
         self.x = np.random.random((17, 3, 2)).astype("float32")
         self.y = np.random.random((17, 3, 2)).astype("float32")
@@ -185,6 +201,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpInt8NoScales(TestDnnlMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random((12, 9)).astype("int8")
         self.y = np.random.random((9, 12)).astype("int8")
@@ -223,6 +240,7 @@ def test_check_output(self):
 
 
 class TestDnnlMatMulOpInt8ForceFP32(TestDnnlMatMulOpInt8):
+
     def generate_data(self):
         x_float = np.random.random((12, 9)).astype("float32")
         self.x_scale, self.x = self.quantize(x_float)
@@ -242,6 +260,7 @@ def set_attributes(self):
 
 
 class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp):
+
     def generate_data(self):
         self.x = np.random.randint(0, 3, (12, 9)).astype("int8")
         self.y = np.random.randint(0, 3, (9, 12)).astype("int8")
@@ -253,6 +272,7 @@ def set_attributes(self):
 
 @skip_check_grad_ci(reason="DNNL's MatMul doesn't implement grad kernel.")
 class TestReshapeTransposeMatMulOp(OpTest):
+
     def init_data_type(self):
         self.data_type_ = 'float32'
 
@@ -300,6 +320,7 @@ def test_check_output(self):
 
 
 class TestReshapeTransposeMatMulOp4DXFloat(TestReshapeTransposeMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random([2, 128, 768]).astype("float32")
         self.y = np.random.random([2, 128, 768]).astype("float32").reshape(
@@ -314,11 +335,13 @@ def generate_data(self):
 
 
 class TestReshapeTransposeMatMulOp4DXInt8(TestReshapeTransposeMatMulOp4DXFloat):
+
     def init_data_type(self):
         self.data_type_ = 'int8'
 
 
 class TestReshapeTransposeMatMulOp4DYFloat(TestReshapeTransposeMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random([2, 128, 768]).astype("float32").reshape(
             [2, 128, 12, 64]).transpose([0, 2, 1, 3])
@@ -328,15 +351,18 @@ def generate_data(self):
         self.fused_transpose_Y = [0, 2, 1, 3]
         self.fused_reshape_Y = [0, 0, 12, 64]
         self.out = np.matmul(
-            self.x, self.y.reshape([2, 128, 12, 64]).transpose([0, 2, 3, 1]))
+            self.x,
+            self.y.reshape([2, 128, 12, 64]).transpose([0, 2, 3, 1]))
 
 
 class TestReshapeTransposeMatMulOp4DYInt8(TestReshapeTransposeMatMulOp4DYFloat):
+
     def init_data_type(self):
         self.data_type_ = 'int8'
 
 
 class TestReshapeTransposeMatMulOp4DXYFloat(TestReshapeTransposeMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random([2, 128, 768]).astype("float32")
         self.y = np.random.random([2, 128, 768]).astype("float32")
@@ -349,13 +375,15 @@ def generate_data(self):
             self.y.reshape([2, 128, 12, 64]).transpose([0, 2, 3, 1]))
 
 
-class TestReshapeTransposeMatMulOp4DXYInt8(
-        TestReshapeTransposeMatMulOp4DXYFloat):
+class TestReshapeTransposeMatMulOp4DXYInt8(TestReshapeTransposeMatMulOp4DXYFloat
+                                           ):
+
     def init_data_type(self):
         self.data_type_ = 'int8'
 
 
 class TestReshapeTransposeMatMulOp2DXFloat(TestReshapeTransposeMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random([2, 5, 10]).astype("float32")
         self.y = np.random.random([2, 5, 10]).astype("float32").reshape(
@@ -365,16 +393,18 @@ def generate_data(self):
         self.fused_transpose_Y = []
         self.fused_reshape_Y = []
         self.out = np.matmul(
-            self.x.reshape([10, 10]).transpose([1, 0]),
-            self.y.transpose([1, 0]))
+            self.x.reshape([10, 10]).transpose([1, 0]), self.y.transpose([1,
+                                                                          0]))
 
 
 class TestReshapeTransposeMatMulOp2DXInt8(TestReshapeTransposeMatMulOp2DXFloat):
+
     def init_data_type(self):
         self.data_type_ = 'int8'
 
 
 class TestReshapeTransposeMatMulOp2DYFloat(TestReshapeTransposeMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random([2, 5, 10]).astype("float32").reshape(
             [10, 10]).transpose([1, 0])
@@ -387,11 +417,13 @@ def generate_data(self):
 
 
 class TestReshapeTransposeMatMulOp2DYInt8(TestReshapeTransposeMatMulOp2DYFloat):
+
     def init_data_type(self):
         self.data_type_ = 'int8'
 
 
 class TestReshapeTransposeMatMulOp3DXFloat(TestReshapeTransposeMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random([2, 2, 5, 5]).astype("float32")
         self.y = np.random.random([2, 2, 5, 5]).astype("float32").reshape(
@@ -406,11 +438,13 @@ def generate_data(self):
 
 
 class TestReshapeTransposeMatMulOp3DXInt8(TestReshapeTransposeMatMulOp3DXFloat):
+
     def init_data_type(self):
         self.data_type_ = 'int8'
 
 
 class TestReshapeTransposeMatMulOp3DYFloat(TestReshapeTransposeMatMulOp):
+
     def generate_data(self):
         self.x = np.random.random([2, 2, 5, 5]).astype(self.data_type_).reshape(
             [2, 10, 5]).transpose([0, 2, 1])
@@ -423,12 +457,14 @@ def generate_data(self):
 
 
 class TestReshapeTransposeMatMulOp3DYInt8(TestReshapeTransposeMatMulOp3DYFloat):
+
     def init_data_type(self):
         self.data_type_ = 'int8'
 
 
 @skip_check_grad_ci(reason="Tests inference only optimization.")
 class TestMatMulOpTransposeReshapeEmptyFloat(OpTest):
+
     def init_data_type(self):
         self.data_type_ = np.float32
 
@@ -479,12 +515,14 @@ def check_raise_error(self, msg):
 
 class TestMatMulOpTransposeReshapeIntEmptyInt(
         TestMatMulOpTransposeReshapeEmptyFloat):
+
     def init_data_type(self):
         self.data_type_ = np.int8
 
 
 class TestMatMulOpTransposeReshapeBasicFloat(
         TestMatMulOpTransposeReshapeEmptyFloat):
+
     def generate_data(self):
         self.bs = 8
         self.x = np.random.random([self.bs, 12, 128,
@@ -501,12 +539,14 @@ def init_params_and_out(self):
 
 class TestMatMulOpTransposeReshapeBasicInt(
         TestMatMulOpTransposeReshapeBasicFloat):
+
     def init_data_type(self):
         self.data_type_ = np.int8
 
 
 class TestMatMulOpTransposeReshapeOtherDimFloat(
         TestMatMulOpTransposeReshapeBasicFloat):
+
     def generate_data(self):
         self.bs = 11
         self.x = np.random.random([self.bs, 12, 14, 18]).astype(self.data_type_)
@@ -515,6 +555,7 @@ def generate_data(self):
 
 class TestMatMulOpTransposeReshapeOtherDimInt(
         TestMatMulOpTransposeReshapeOtherDimFloat):
+
     def init_data_type(self):
         self.data_type_ = np.int8
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
index 69cee49c3ec61..6f45da4e31ec0 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py
@@ -59,6 +59,7 @@ def reference_matmul(X, Y, transpose_x=False, transpose_y=False):
 
 
 class TestMatMulV2VectorXVectorOneDNNOp(OpTest):
+
     def config(self):
         self.x_shape = (100, )
         self.y_shape = (100, )
@@ -102,6 +103,7 @@ def test_check_grad(self):
 
 class TestMatMulV2VectorXMatrixTransposeYOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (100, )
         self.y_shape = (1, 3, 2, 100)
@@ -110,6 +112,7 @@ def config(self):
 
 
 class TestMatMulV2VectorXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (100, )
         self.y_shape = (1, 1, 100, 2)
@@ -119,6 +122,7 @@ def config(self):
 
 class TestMatMulV2MatrixXVectorTransposeXOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (1, 1, 100, 1)
         self.y_shape = (100, )
@@ -127,6 +131,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (1, 2, 1, 100)
         self.y_shape = (100, )
@@ -135,6 +140,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (1, 1, 2, 100)
         self.y_shape = (1, 1, 100, 1)
@@ -144,6 +150,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrixTransposeYOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (1, 1, 1, 100)
         self.y_shape = (2, 1, 2, 100)
@@ -152,6 +159,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXMatrix2OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (2, 1, 12, 9)
         self.y_shape = (1, 3, 9, 12)
@@ -160,6 +168,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXMatrix3OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (2, 1, 2, 100)
         self.y_shape = (1, 1, 100, 2)
@@ -169,6 +178,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (2, 1, 4, 25)
         self.y_shape = (1, 1, 4, 25)
@@ -178,6 +188,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (2, 2, 7, 4)
         self.y_shape = (2, 2, 7, 5)
@@ -187,6 +198,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (3, 1, 6, 7)
         self.y_shape = (1, 2, 6, 9)
@@ -195,6 +207,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXMatrix4OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (3, 1, 6, 6)
         self.y_shape = (1, 2, 6, 9)
@@ -203,6 +216,7 @@ def config(self):
 
 
 class TestMatMulV2VectorXMatrix5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (100)
         self.y_shape = (1, 2, 2, 100, 2)
@@ -211,6 +225,7 @@ def config(self):
 
 
 class TestMatMulV2Matrix3DXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (2, 1, 100)
         self.y_shape = (100)
@@ -220,6 +235,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (3, 1, 10, 8)
         self.y_shape = (1, 2, 9, 10)
@@ -229,6 +245,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (3, 1, 10, 10)
         self.y_shape = (1, 2, 9, 10)
@@ -238,6 +255,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (1, 3, 1, 10, 10)
         self.y_shape = (3, 1, 2, 9, 10)
@@ -246,6 +264,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXMatrix6Dx2DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (1, 1, 2, 1, 8, 9)
         self.y_shape = (9, 12)
@@ -254,6 +273,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXMatrix2Dx5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (20, 5)
         self.y_shape = (1, 2, 1, 5, 11)
@@ -263,6 +283,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrix4Dx3DTransposeXOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (5, 4, 15, 10)
         self.y_shape = (1, 15, 20)
@@ -272,6 +293,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrix3Dx4DTransposeYOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (2, 10, 15)
         self.y_shape = (4, 2, 20, 15)
@@ -281,6 +303,7 @@ def config(self):
 
 class TestMatMulV2MatrixXMatrix5Dx3DTransposeXTransposeYOneDNNOp(
         TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (4, 3, 2, 15, 10)
         self.y_shape = (1, 20, 15)
@@ -289,6 +312,7 @@ def config(self):
 
 
 class TestMatMulV2MatrixXMatrix3Dx4DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp):
+
     def config(self):
         self.x_shape = (1, 1, 32, 16)
         self.y_shape = (16, 16, 16)
@@ -298,8 +322,10 @@ def config(self):
 
 #   BF16 TESTS
 def create_bf16_test_class(parent):
+
     @OpTestTool.skip_if_not_cpu_bf16()
     class TestMatMulV2Bf16OneDNNOp(parent):
+
         def set_inputs(self, x, y):
             self.inputs = {
                 'X': convert_float_to_uint16(x),
@@ -348,10 +374,10 @@ def calculate_grads(self):
             x_transpose_axes = self.shape_transpose_axes[self.x_fp32.ndim]
             y_transpose_axes = self.shape_transpose_axes[self.y_fp32.ndim]
 
-            x = np.transpose(self.x_fp32, x_transpose_axes) if self.attrs[
-                'trans_x'] is True else self.x_fp32
-            y = np.transpose(self.y_fp32, y_transpose_axes) if self.attrs[
-                'trans_y'] is True else self.y_fp32
+            x = np.transpose(self.x_fp32, x_transpose_axes
+                             ) if self.attrs['trans_x'] is True else self.x_fp32
+            y = np.transpose(self.y_fp32, y_transpose_axes
+                             ) if self.attrs['trans_y'] is True else self.y_fp32
 
             dout = np.matmul(x, y)
 
@@ -383,15 +409,13 @@ def calculate_grads(self):
             if is_broadcast:
                 x_reduce_axis = []
                 y_reduce_axis = []
-                for index, (
-                        first, second
-                ) in enumerate(zip(x_shape[0:-2], self.dx.shape[0:-2])):
+                for index, (first, second) in enumerate(
+                        zip(x_shape[0:-2], self.dx.shape[0:-2])):
                     if first != second:
                         x_reduce_axis.append(index)
 
-                for index, (
-                        first, second
-                ) in enumerate(zip(y_shape[0:-2], self.dy.shape[0:-2])):
+                for index, (first, second) in enumerate(
+                        zip(y_shape[0:-2], self.dy.shape[0:-2])):
                     if first != second:
                         y_reduce_axis.append(index)
 
@@ -438,23 +462,27 @@ def calculate_grads(self):
 
 class TestMatMulV2OpTransposeReshapeEmptyFloat(
         TestMatMulOpTransposeReshapeEmptyFloat):
+
     def set_op_type(self):
         self.op_type = "matmul_v2"
 
 
 class TestMatMulV2OpTransposeReshapeBasicFloat(
         TestMatMulOpTransposeReshapeBasicFloat):
+
     def set_op_type(self):
         self.op_type = "matmul_v2"
 
 
 class TestMatMulV2OpTransposeReshapeOtherDimFloat(
         TestMatMulOpTransposeReshapeOtherDimFloat):
+
     def set_op_type(self):
         self.op_type = "matmul_v2"
 
 
 class TestMatMulV2OpReshapeTranspose(TestReshapeTransposeMatMulOp):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
@@ -462,6 +490,7 @@ def set_op_type_and_transpose_y_name(self):
 
 class TestMatMulV2OpReshapeTranspose4DXFloat(
         TestReshapeTransposeMatMulOp4DXFloat):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
@@ -469,6 +498,7 @@ def set_op_type_and_transpose_y_name(self):
 
 class TestMatMulV2OpReshapeTranspose4DYFloat(
         TestReshapeTransposeMatMulOp4DYFloat):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
@@ -476,6 +506,7 @@ def set_op_type_and_transpose_y_name(self):
 
 class TestMatMulV2OpReshapeTranspose4DXYFloat(
         TestReshapeTransposeMatMulOp4DXYFloat):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
@@ -483,6 +514,7 @@ def set_op_type_and_transpose_y_name(self):
 
 class TestMatMulV2OpReshapeTranspose2DXFloat(
         TestReshapeTransposeMatMulOp2DXFloat):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
@@ -490,6 +522,7 @@ def set_op_type_and_transpose_y_name(self):
 
 class TestMatMulV2OpReshapeTranspose2DYFloat(
         TestReshapeTransposeMatMulOp2DYFloat):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
@@ -497,6 +530,7 @@ def set_op_type_and_transpose_y_name(self):
 
 class TestMatMulV2OpReshapeTranspose3DXFloat(
         TestReshapeTransposeMatMulOp3DXFloat):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
@@ -504,6 +538,7 @@ def set_op_type_and_transpose_y_name(self):
 
 class TestMatMulV2OpReshapeTranspose3DYFloat(
         TestReshapeTransposeMatMulOp3DYFloat):
+
     def set_op_type_and_transpose_y_name(self):
         self.op_type = "matmul_v2"
         self.transpose_y_name = "trans_y"
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
index 9265d5f7edfbb..67d06e7b22c1b 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py
@@ -25,9 +25,11 @@
 
 
 @skip_check_grad_ci(
-    reason="mul_mkldnn_op does not implement grad operator, check_grad is not required."
+    reason=
+    "mul_mkldnn_op does not implement grad operator, check_grad is not required."
 )
 class TestMKLDNNMulOpS8S8(OpTest):
+
     def setUp(self):
         self.op_type = "mul"
         self.init_kernel_type()
@@ -78,8 +80,9 @@ def init_data(self):
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output_with_place(
-            core.CPUPlace(), atol=0, check_dygraph=False)
+        self.check_output_with_place(core.CPUPlace(),
+                                     atol=0,
+                                     check_dygraph=False)
 
 
 '''
@@ -88,6 +91,7 @@ def test_check_output(self):
 
 
 class TestMKLDNNMulOpS8U8(TestMKLDNNMulOpS8S8):
+
     def init_data_type(self):
         self.srctype = np.uint8
         self.dsttype = np.float32 if self.force_fp32 else np.int8
@@ -99,6 +103,7 @@ def init_data_type(self):
 
 
 class TestMKLDNNMulOpS8S8WithFlatten(TestMKLDNNMulOpS8S8):
+
     def setUp(self):
         self.op_type = "mul"
         self.init_kernel_type()
@@ -154,6 +159,7 @@ def init_data(self):
 
 
 class TestMKLDNNMulOpS8U8WithFlatten(TestMKLDNNMulOpS8S8WithFlatten):
+
     def init_data_type(self):
         self.srctype = np.uint8
         self.dsttype = np.float32 if self.force_fp32 else np.int8
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py
index a0581d791209d..f4e7bd78e23d5 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py
@@ -24,6 +24,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestMulOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "mul"
         self.attrs = {'use_mkldnn': True}
@@ -39,9 +40,8 @@ def setUp(self):
 
         self.inputs = {'X': self.x, 'Y': self.y}
 
-        output = np.dot(
-            np.reshape(self.x_fp32, self.np_x_shape),
-            np.reshape(self.y_fp32, self.np_y_shape))
+        output = np.dot(np.reshape(self.x_fp32, self.np_x_shape),
+                        np.reshape(self.y_fp32, self.np_y_shape))
         self.outputs = {'Out': np.reshape(output, self.out_shape)}
 
     def init_shapes_and_attrs(self):
@@ -70,6 +70,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestMulXNumColDims2OneDNNOp(TestMulOneDNNOp):
+
     def init_shapes_and_attrs(self):
         self.x_shape = (6, 7, 5)
         self.y_shape = (5, 21)
@@ -83,6 +84,7 @@ def init_shapes_and_attrs(self):
 
 
 class TestMulYNumColDims2OneDNNOp(TestMulOneDNNOp):
+
     def init_shapes_and_attrs(self):
         self.x_shape = (20, 6)
         self.y_shape = (2, 3, 21)
@@ -96,6 +98,7 @@ def init_shapes_and_attrs(self):
 
 
 class TestMulYAndXNumColDims2OneDNNOp(TestMulOneDNNOp):
+
     def init_shapes_and_attrs(self):
         self.x_shape = (10, 5, 6)
         self.y_shape = (2, 3, 21)
@@ -110,6 +113,7 @@ def init_shapes_and_attrs(self):
 
 
 class TestMulBF16OneDNNOp(TestMulOneDNNOp):
+
     def init_inputs_dtype(self):
         self.x = convert_float_to_uint16(self.x)
         self.y = convert_float_to_uint16(self.y)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_multi_gru_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_multi_gru_mkldnn_op.py
index 04941ef22ac3b..4c11712947540 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_multi_gru_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_multi_gru_mkldnn_op.py
@@ -42,6 +42,7 @@ def multi_gru(
 
 
 class TestMultiGruMkldnnOp(OpTest):
+
     def set_confs(self):
         pass
 
@@ -96,9 +97,8 @@ def setUp(self):
                 wx.append(np.random.rand(IC, 3 * OC).astype('float32'))
                 wh.append(np.random.rand(OC, 3 * OC).astype('float32'))
                 bias.append(
-                    np.random.rand(1, 3 * OC).astype('float32')
-                    if self.with_bias else np.zeros(
-                        (1, 3 * OC), dtype='float32'))
+                    np.random.rand(1, 3 * OC).astype('float32') if self.
+                    with_bias else np.zeros((1, 3 * OC), dtype='float32'))
                 h0.append(np.zeros((N, OC), dtype='float32'))
 
         self.inputs['WeightX'] = [('wx' + str(i), wx[i])
@@ -116,20 +116,20 @@ def setUp(self):
                 OC = self.OCs[layer]
                 for j in range(2):
                     scale_ur = s8_max / np.max(np.abs(
-                        np.concatenate(
-                            [
-                                wx[2 * layer + j][:, :2 * OC], wh[2 * layer + j]
-                                .flatten()[:2 * OC * OC].reshape(OC, 2 * OC)
-                            ],
-                            axis=0)),
+                        np.concatenate([
+                            wx[2 * layer + j][:, :2 * OC],
+                            wh[2 * layer + j].flatten()[:2 * OC * OC].reshape(
+                                OC, 2 * OC)
+                        ],
+                                       axis=0)),
                                                axis=0)
                     scale_o = s8_max / np.max(np.abs(
-                        np.concatenate(
-                            [
-                                wx[2 * layer + j][:, 2 * OC:], wh[2 * layer + j]
-                                .flatten()[2 * OC * OC:].reshape(OC, OC)
-                            ],
-                            axis=0)),
+                        np.concatenate([
+                            wx[2 * layer + j][:, 2 * OC:],
+                            wh[2 * layer + j].flatten()[2 * OC * OC:].reshape(
+                                OC, OC)
+                        ],
+                                       axis=0)),
                                               axis=0)
 
                     scale_weights.append(
@@ -167,11 +167,13 @@ def test_check_output(self):
 
 
 class TestMultiGruMkldnnOpNoBias(TestMultiGruMkldnnOp):
+
     def set_confs(self):
         self.with_bias = False
 
 
 class TestMultiGruMkldnnOpLayers2(TestMultiGruMkldnnOp):
+
     def set_confs(self):
         self.layers = 2
         self.ICs = [2, 6]
@@ -179,6 +181,7 @@ def set_confs(self):
 
 
 class TestMultiGruMkldnnOpLayers3(TestMultiGruMkldnnOp):
+
     def set_confs(self):
         self.layers = 3
         self.ICs = [2, 6, 12]
@@ -186,60 +189,71 @@ def set_confs(self):
 
 
 class TestMultiGruMkldnnOpOriginMode(TestMultiGruMkldnnOp):
+
     def set_confs(self):
         self.origin_mode = True
 
 
 class TestMultiGruMkldnnInt8Op(TestMultiGruMkldnnOp):
+
     def set_dtype(self):
         self.dtype = 'int8'
 
 
 class TestMultiGruMkldnnInt8OpForceFP32Output(TestMultiGruMkldnnInt8Op):
+
     def set_force_fp32_output(self):
         self.force_fp32_output = True
 
 
 class TestMultiGruMkldnnInt8OpNoBias(TestMultiGruMkldnnOpNoBias):
+
     def set_dtype(self):
         self.dtype = 'int8'
 
 
 class TestMultiGruMkldnnInt8OpNoBiasForceFP32Output(
         TestMultiGruMkldnnInt8OpNoBias):
+
     def set_force_fp32_output(self):
         self.force_fp32_output = True
 
 
 class TestMultiGruMkldnnInt8OpLayers2(TestMultiGruMkldnnOpLayers2):
+
     def set_dtype(self):
         self.dtype = 'int8'
 
 
 class TestMultiGruMkldnnInt8OpLayers2ForceFP32Output(
         TestMultiGruMkldnnInt8OpLayers2):
+
     def set_force_fp32_output(self):
         self.force_fp32_output = True
 
 
 class TestMultiGruMkldnnInt8OpLayers3(TestMultiGruMkldnnOpLayers3):
+
     def set_dtype(self):
         self.dtype = 'int8'
 
 
 class TestMultiGruMkldnnInt8OpLayers3ForceFP32Output(
         TestMultiGruMkldnnInt8OpLayers3):
+
     def set_force_fp32_output(self):
         self.force_fp32_output = True
 
 
 class TestMultiGruMkldnnInt8OpOriginMode(TestMultiGruMkldnnOpOriginMode):
+
     def set_dtype(self):
         self.dtype = 'int8'
 
 
 class TestMultiGruMkldnnInt8OpOriginModeForceFP32Output(
         TestMultiGruMkldnnInt8OpOriginMode):
+
     def set_force_fp32_output(self):
         self.force_fp32_output = True
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py
index a802ef4c61285..f2d0dd9101e04 100755
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py
@@ -60,6 +60,7 @@ def nearest_neighbor_interp_mkldnn_np(X,
 
 @skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.")
 class TestNearestInterpMKLDNNOp(OpTest):
+
     def init_test_case(self):
         pass
 
@@ -105,9 +106,10 @@ def setUp(self):
             out_h = self.out_h
             out_w = self.out_w
 
-        output_np = nearest_neighbor_interp_mkldnn_np(
-            input_np, out_h, out_w, self.out_size, self.actual_shape,
-            self.data_layout)
+        output_np = nearest_neighbor_interp_mkldnn_np(input_np, out_h, out_w,
+                                                      self.out_size,
+                                                      self.actual_shape,
+                                                      self.data_layout)
 
         self.inputs = {'X': input_np}
         if self.out_size is not None:
@@ -129,6 +131,7 @@ def test_check_output(self):
 
 
 class TestNearestInterpOpMKLDNNNHWC(TestNearestInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 2, 32, 16]
         self.out_h = 27
@@ -138,6 +141,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpMKLDNNCase2(TestNearestInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
@@ -146,6 +150,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase3(TestNearestInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -154,6 +159,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase4(TestNearestInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -163,6 +169,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpSame(TestNearestInterpMKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [2, 3, 32, 64]
         self.out_h = 32
@@ -171,15 +178,19 @@ def init_test_case(self):
 
 
 def create_test_class(parent):
+
     class TestFp32Case(parent):
+
         def init_data_type(self):
             self.dtype = np.float32
 
     class TestInt8Case(parent):
+
         def init_data_type(self):
             self.dtype = np.int8
 
     class TestUint8Case(parent):
+
         def init_data_type(self):
             self.dtype = np.uint8
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py
index d72a1d53d3aa5..075792e3a5151 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py
@@ -61,6 +61,7 @@ def nearest_neighbor_interp_mkldnn_np(X,
 @skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.")
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestNearestInterpV2MKLDNNOp(OpTest):
+
     def init_test_case(self):
         pass
 
@@ -120,9 +121,10 @@ def setUp(self):
             out_h = self.out_h
             out_w = self.out_w
 
-        output_np = nearest_neighbor_interp_mkldnn_np(
-            input_np, out_h, out_w, self.out_size, self.actual_shape,
-            self.data_layout)
+        output_np = nearest_neighbor_interp_mkldnn_np(input_np, out_h, out_w,
+                                                      self.out_size,
+                                                      self.actual_shape,
+                                                      self.data_layout)
 
         if isinstance(self.scale, float):
             self.scale = [self.scale]
@@ -150,6 +152,7 @@ def test_check_output(self):
 
 
 class TestNearestInterpOpV2MKLDNNNHWC(TestNearestInterpV2MKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 2, 32, 16]
         self.out_h = 27
@@ -159,6 +162,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpV2MKLDNNCase2(TestNearestInterpV2MKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
@@ -166,6 +170,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpV2MKLDNNCase3(TestNearestInterpV2MKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -174,6 +179,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpV2MKLDNNCase4(TestNearestInterpV2MKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [1, 1, 32, 64]
         self.out_h = 64
@@ -183,6 +189,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpV2MKLDNNSame(TestNearestInterpV2MKLDNNOp):
+
     def init_test_case(self):
         self.input_shape = [2, 3, 32, 64]
         self.out_h = 32
@@ -191,19 +198,24 @@ def init_test_case(self):
 
 
 def create_test_class(parent):
+
     class TestFp32Case(parent):
+
         def init_data_type(self):
             self.dtype = np.float32
 
     class TestBf16Case(parent):
+
         def init_data_type(self):
             self.dtype = np.uint16
 
     class TestInt8Case(parent):
+
         def init_data_type(self):
             self.dtype = np.int8
 
     class TestUint8Case(parent):
+
         def init_data_type(self):
             self.dtype = np.uint8
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_bf16_mkldnn_op.py
index 5430c1598f84d..794871ba5c1c0 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_bf16_mkldnn_op.py
@@ -25,6 +25,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestPoolBf16MklDNNOpGrad(TestPool2D_Op_Mixin, OpTest):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -36,10 +37,11 @@ def setUp(self):
         self.attrs['mkldnn_data_type'] = "bfloat16"
         self.x_fp32 = np.random.random(self.shape).astype(np.float32)
 
-        output = self.pool2D_forward_naive(
-            self.x_fp32, self.ksize, self.strides, self.paddings,
-            self.global_pool, self.ceil_mode, self.exclusive, self.adaptive,
-            "float32").astype(np.float32)
+        output = self.pool2D_forward_naive(self.x_fp32, self.ksize,
+                                           self.strides, self.paddings,
+                                           self.global_pool, self.ceil_mode,
+                                           self.exclusive, self.adaptive,
+                                           "float32").astype(np.float32)
 
         self.inputs = {'X': convert_float_to_uint16(self.x_fp32)}
         self.outputs = {'Out': convert_float_to_uint16(output)}
@@ -48,25 +50,27 @@ def test_check_output(self):
         self.check_output_with_place(core.CPUPlace())
 
     def test_check_grad(self):
-        x_grad = pool2d_backward_naive(
-            self.x_fp32,
-            ksize=self.ksize,
-            strides=self.strides,
-            paddings=self.paddings,
-            global_pool=self.global_pool,
-            ceil_mode=False,
-            exclusive=self.exclusive,
-            adaptive=self.adaptive,
-            data_format=self.data_format,
-            pool_type=self.pool_type,
-            padding_algorithm=self.padding_algorithm)
+        x_grad = pool2d_backward_naive(self.x_fp32,
+                                       ksize=self.ksize,
+                                       strides=self.strides,
+                                       paddings=self.paddings,
+                                       global_pool=self.global_pool,
+                                       ceil_mode=False,
+                                       exclusive=self.exclusive,
+                                       adaptive=self.adaptive,
+                                       data_format=self.data_format,
+                                       pool_type=self.pool_type,
+                                       padding_algorithm=self.padding_algorithm)
         x_grad = x_grad / np.prod(self.outputs['Out'].shape)
-        self.check_grad_with_place(
-            core.CPUPlace(), set(['X']), 'Out', user_defined_grads=[x_grad])
+        self.check_grad_with_place(core.CPUPlace(),
+                                   set(['X']),
+                                   'Out',
+                                   user_defined_grads=[x_grad])
 
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestPoolBf16MklDNNOp(TestPool2D_Op_Mixin, OpTest):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -75,10 +79,11 @@ def setUp(self):
         self.dtype = np.uint16
 
         input = np.random.random(self.shape).astype(np.float32)
-        output = (self.pool2D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive,
-            "float32")).astype(np.float32)
+        output = (self.pool2D_forward_naive(input, self.ksize, self.strides,
+                                            self.paddings, self.global_pool,
+                                            self.ceil_mode, self.exclusive,
+                                            self.adaptive,
+                                            "float32")).astype(np.float32)
 
         self.inputs = {'X': convert_float_to_uint16(input)}
         self.outputs = {'Out': convert_float_to_uint16(output)}
@@ -91,6 +96,7 @@ def test_check_grad(self):
 
 
 class TestCase1Avg(TestPoolBf16MklDNNOp):
+
     def init_test_case(self):
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
@@ -105,6 +111,7 @@ def init_exclusive(self):
 
 
 class TestCase2Avg(TestPoolBf16MklDNNOp):
+
     def init_test_case(self):
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
@@ -119,24 +126,28 @@ def init_exclusive(self):
 
 
 class TestCase0Max(TestPoolBf16MklDNNOp):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase1Max(TestCase1Avg):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase2Max(TestCase2Avg):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase1PadZeroExclusiveAvgGrad(TestPoolBf16MklDNNOpGrad):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -155,23 +166,27 @@ def init_exclusive(self):
 
 
 class TestCase2PadOneNonExclusiveAvgGrad(TestCase1PadZeroExclusiveAvgGrad):
+
     def init_exclusive(self):
         self.exclusive = False
 
 
 class TestCase0InitialMaxGrad(TestPoolBf16MklDNNOpGrad):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase1PadZeroExclusiveMaxGrad(TestCase1PadZeroExclusiveAvgGrad):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase2PadOneNonExclusiveMaxGrad(TestCase2PadOneNonExclusiveAvgGrad):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
index 639cb570a8472..30bdbcbe78bc2 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
@@ -24,6 +24,7 @@
 
 
 class TestPool2DMKLDNNInt8_Op(TestPool2D_Op):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -32,26 +33,29 @@ def init_data_type(self):
 
     def setUp(self):
         TestPool2D_Op.setUp(self)
-        assert self.dtype in [np.int8, np.uint8
-                              ], 'Dtype should be int8 or uint8'
+        assert self.dtype in [np.int8,
+                              np.uint8], 'Dtype should be int8 or uint8'
         input = np.random.randint(0, 100, self.shape).astype(self.dtype)
-        output = (self.pool2D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive,
-            self.dtype)).astype(self.dtype)
+        output = (self.pool2D_forward_naive(input, self.ksize, self.strides,
+                                            self.paddings, self.global_pool,
+                                            self.ceil_mode, self.exclusive,
+                                            self.adaptive,
+                                            self.dtype)).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
         self.outputs = {'Out': output}
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output_with_place(
-            core.CPUPlace(), atol=1e-5, check_dygraph=False)
+        self.check_output_with_place(core.CPUPlace(),
+                                     atol=1e-5,
+                                     check_dygraph=False)
 
     def test_check_grad(self):
         pass
 
 
 class TestCase1Avg(TestPool2DMKLDNNInt8_Op):
+
     def init_test_case(self):
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
@@ -66,6 +70,7 @@ def init_exclusive(self):
 
 
 class TestCase2Avg(TestPool2DMKLDNNInt8_Op):
+
     def init_test_case(self):
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
@@ -80,29 +85,35 @@ def init_exclusive(self):
 
 
 class TestCase0Max(TestPool2DMKLDNNInt8_Op):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase1Max(TestCase1Avg):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase2Max(TestCase2Avg):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 def create_test_s8_u8_class(parent):
+
     class TestS8Case(parent):
+
         def init_data_type(self):
             self.dtype = np.int8
 
     class TestU8Case(parent):
+
         def init_data_type(self):
             self.dtype = np.uint8
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
index 3f80bdc1651be..6d39b27b1aee7 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
@@ -20,7 +20,9 @@
 
 
 def create_test_mkldnn_use_ceil_class(parent):
+
     class TestMKLDNNPool2DUseCeilCase(parent):
+
         def init_kernel_type(self):
             self.use_mkldnn = True
 
@@ -41,7 +43,9 @@ def init_data_type(self):
 
 
 def create_test_mkldnn_class(parent):
+
     class TestMKLDNNCase(parent):
+
         def init_kernel_type(self):
             self.use_mkldnn = True
 
@@ -62,6 +66,7 @@ def init_data_type(self):
 
 
 class TestAvgPoolAdaptive(TestPool2D_Op):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -84,6 +89,7 @@ def init_global_pool(self):
 
 
 class TestAvgPoolAdaptive2(TestAvgPoolAdaptive):
+
     def init_test_case(self):
         self.ksize = [2, 3]
         self.strides = [1, 1]
@@ -93,6 +99,7 @@ def init_shape(self):
 
 
 class TestAvgPoolAdaptive3(TestAvgPoolAdaptive):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -102,6 +109,7 @@ def init_shape(self):
 
 
 class TestAsymPad(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -127,68 +135,81 @@ def init_data_type(self):
 
 
 class TestAsymPadCase1(TestAsymPad):
+
     def init_paddings(self):
         self.paddings = [1, 1, 0, 0]
 
 
 class TestAsymPadCase2(TestAsymPad):
+
     def init_paddings(self):
         self.paddings = [1, 0, 1, 2]
 
 
 class TestAsymPadCase3(TestAsymPad):
+
     def init_paddings(self):
         self.paddings = [1, 2, 1, 2]
 
 
 class TestAsymPadCase4(TestAsymPad):
+
     def init_paddings(self):
         self.paddings = [1, 0, 1, 2]
 
 
 class TestAsymPadCase5(TestAsymPad):
+
     def init_paddings(self):
         self.paddings = [2, 2, 1, 2]
 
 
 class TestAsymPadMaxCase1(TestAsymPadCase1):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestAsymPadMaxCase2(TestAsymPadCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestAsymPadMaxCase3(TestAsymPadCase3):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestAsymPadMaxCase4(TestAsymPadCase4):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestAsymPadMaxCase5(TestAsymPadCase5):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestAsymPadSame(TestAsymPad):
+
     def init_paddings(self):
         self.paddings = [0, 0]
         self.padding_algorithm = "SAME"
 
 
 class TestAsymPadValid(TestAsymPad):
+
     def init_paddings(self):
         self.paddings = [0, 0, 0, 0]
         self.padding_algorithm = "VALID"
 
 
 class TestAsymPadValidNHWC(TestAsymPadValid):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_prelu_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_prelu_mkldnn_op.py
index 901aa200a3775..ab6a4f4c06b4e 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_prelu_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_prelu_mkldnn_op.py
@@ -42,6 +42,7 @@ def ref_prelu(x, weight, mode):
 
 
 class TestPReluModeChannelOneDNNOp(OpTest):
+
     def init_attrs(self):
         self.mode = "element"
         self.alpha = np.random.random((1, 4, 5, 5)).astype("float32")
@@ -70,6 +71,7 @@ def test_check_grad(self):
 
 
 class TestPReluModeAllOneDNNOp(TestPReluModeChannelOneDNNOp):
+
     def init_attrs(self):
         self.mode = "all"
         self.alpha = np.random.random((1, 1, 1, 1)).astype("float32")
@@ -81,12 +83,14 @@ def test_check_grad(self):
 
 
 class TestPReluModeElementOneDNNOp(TestPReluModeChannelOneDNNOp):
+
     def init_attrs(self):
         self.mode = "element"
         self.alpha = np.random.random((1, 4, 5, 5)).astype("float32")
 
 
 class TestPReluModeChannel3DOneDNNOp(TestPReluModeChannelOneDNNOp):
+
     def init_attrs(self):
         self.mode = "channel"
         self.x = np.random.random((1, 100, 1)).astype("float32")
@@ -94,6 +98,7 @@ def init_attrs(self):
 
 
 class TestPReluModeChannelAlpha1DOneDNNOp(TestPReluModeChannelOneDNNOp):
+
     def init_attrs(self):
         self.mode = "channel"
         self.x = np.random.random((1, 100, 1)).astype("float32")
@@ -101,6 +106,7 @@ def init_attrs(self):
 
 
 class TestPReluModeAllAlpha1DOneDNNOp(TestPReluModeAllOneDNNOp):
+
     def init_attrs(self):
         self.mode = "channel"
         self.x = np.random.random((1, 1, 100)).astype("float32")
@@ -109,8 +115,10 @@ def init_attrs(self):
 
 #   BF16 TESTS
 def create_bf16_test_class(parent):
+
     @OpTestTool.skip_if_not_cpu_bf16()
     class TestPReluBF16OneDNNOp(parent):
+
         def set_inputs(self, ):
             self.inputs = {
                 'X': convert_float_to_uint16(self.x),
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
index c92d870565fbc..e0c28115d13f6 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
@@ -21,6 +21,7 @@
 
 
 class TestQuantizeOp(OpTest):
+
     def setUp(self):
         self.op_type = 'quantize'
         self.scale = 255.0
@@ -39,12 +40,12 @@ def setUp(self):
     def prepare_input(self):
         if self.is_negative:
             # input data values are from interval [-1.0, 1.0)
-            self.input = (2 * np.random.random_sample(self.input_size) - 1
-                          ).astype('float32')
+            self.input = (2 * np.random.random_sample(self.input_size) -
+                          1).astype('float32')
         else:
             # input data values are from interval [0.0, 1.0)
-            self.input = (
-                np.random.random_sample(self.input_size)).astype('float32')
+            self.input = (np.random.random_sample(
+                self.input_size)).astype('float32')
 
         self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(self.input)}
         self.attrs = {
@@ -56,8 +57,8 @@ def prepare_input(self):
 
     def prepare_output(self):
         input_data_type = 'int8' if self.is_negative else 'uint8'
-        output = np.rint(self.input * self.scale + self.shift).astype(
-            input_data_type)
+        output = np.rint(self.input * self.scale +
+                         self.shift).astype(input_data_type)
         self.outputs = {'Output': output}
 
     def test_check_output(self):
@@ -90,6 +91,7 @@ def set_output_format(self):
 
 
 class TestQuantizeOp1(TestQuantizeOp):
+
     def set_scale(self):
         self.scale = 127.0
 
@@ -98,6 +100,7 @@ def set_is_negative(self):
 
 
 class TestQuantizeOp2(TestQuantizeOp):
+
     def set_scale(self):
         self.scale = 255.0
 
@@ -108,6 +111,7 @@ def set_is_negative(self):
 # 2-dim input
 # P - positive input
 class TestQuantizeOpShift_NCHW_2_P(TestQuantizeOp):
+
     def set_output_format(self):
         self.output_format = 'NCHW'
 
@@ -127,6 +131,7 @@ def set_input_size(self):
 # 2-dim input
 # N - negative input
 class TestQuantizeOpShift_NCHW_2_N(TestQuantizeOpShift_NCHW_2_P):
+
     def set_is_negative(self):
         self.is_nagative = True
 
@@ -138,53 +143,63 @@ def set_shift(self):
 
 
 class TestQuantizeOpShift_NHWC_2_P(TestQuantizeOpShift_NCHW_2_P):
+
     def set_output_format(self):
         self.output_format = 'NHWC'
 
 
 class TestQuantizeOpShift_NHWC_2_N(TestQuantizeOpShift_NCHW_2_N):
+
     def set_output_format(self):
         self.output_format = 'NHWC'
 
 
 # 3-dim input
 class TestQuantizeOpShift_NCHW_3_P(TestQuantizeOpShift_NCHW_2_P):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4]
 
 
 class TestQuantizeOpShift_NCHW_3_N(TestQuantizeOpShift_NCHW_2_N):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4]
 
 
 class TestQuantizeOpShift_NHWC_3_P(TestQuantizeOpShift_NCHW_3_P):
+
     def set_output_format(self):
         self.output_format = 'NHWC'
 
 
 class TestQuantizeOpShift_NHWC_3_N(TestQuantizeOpShift_NCHW_3_N):
+
     def set_output_format(self):
         self.output_format = 'NHWC'
 
 
 # 4-dim input
 class TestQuantizeOpShift_NCHW_4_P(TestQuantizeOpShift_NCHW_2_P):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4, 5]
 
 
 class TestQuantizeOpShift_NCHW_4_N(TestQuantizeOpShift_NCHW_2_N):
+
     def set_input_size(self):
         self.input_size = [2, 3, 4, 5]
 
 
 class TestQuantizeOpShift_NHWC_4_P(TestQuantizeOpShift_NCHW_4_P):
+
     def set_output_format(self):
         self.output_format = 'NHWC'
 
 
 class TestQuantizeOpShift_NHWC_4_N(TestQuantizeOpShift_NCHW_4_N):
+
     def set_output_format(self):
         self.output_format = 'NHWC'
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_bf16_mkldnn_op.py
index d1a657679037d..1176a80564654 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_bf16_mkldnn_op.py
@@ -20,11 +20,13 @@
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 import paddle
+
 paddle.enable_static()
 
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestReduceSumDefaultBF16OneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -70,6 +72,7 @@ def calculate_grads(self):
 
 
 class TestReduceDefaultWithGradBF16OneDNNOp(TestReduceSumDefaultBF16OneDNNOp):
+
     def test_check_grad(self):
         self.calculate_grads()
         self.check_grad_with_place(
@@ -82,6 +85,7 @@ def test_check_grad(self):
 
 class TestReduceSum4DReduceAllDimAttributeBF16OneDNNOp(
         TestReduceDefaultWithGradBF16OneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -94,6 +98,7 @@ def setUp(self):
 
 class TestReduceSum4DReduceAllWithoutReduceAllAttributeNegativeDimsBF16OneDNNOp(
         TestReduceDefaultWithGradBF16OneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -106,6 +111,7 @@ def setUp(self):
 
 class TestReduceSum5DReduceAllKeepDimsBF16OneDNNOp(
         TestReduceDefaultWithGradBF16OneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -116,8 +122,9 @@ def setUp(self):
         self.outputs = {'Out': self.x_fp32.sum(keepdims=self.attrs['keep_dim'])}
 
 
-class TestReduceSum4DReduceAllBF16OneDNNOp(
-        TestReduceDefaultWithGradBF16OneDNNOp):
+class TestReduceSum4DReduceAllBF16OneDNNOp(TestReduceDefaultWithGradBF16OneDNNOp
+                                           ):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -178,6 +185,7 @@ def setUp(self):
 
 
 class TestReduceMean3DBF16OneDNNOp(TestReduceDefaultWithGradBF16OneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.use_mkldnn = True
@@ -189,6 +197,7 @@ def setUp(self):
 
 
 class TestReduceMean4DBF16OneDNNOp(TestReduceDefaultWithGradBF16OneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.use_mkldnn = True
@@ -197,7 +206,8 @@ def setUp(self):
         self.inputs = {'X': self.x_bf16}
         self.attrs = {'use_mkldnn': self.use_mkldnn, 'dim': [0, 1]}
         self.outputs = {
-            'Out': self.x_fp32.sum(axis=tuple(self.attrs['dim'])) /
+            'Out':
+            self.x_fp32.sum(axis=tuple(self.attrs['dim'])) /
             (self.x_fp32.shape[0] * self.x_fp32.shape[1])
         }
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py
index 7b0bb706aece9..23687aec9ef29 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py
@@ -20,6 +20,7 @@
 
 
 class TestReduceSumDefaultOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -32,11 +33,13 @@ def test_check_output(self):
 
 
 class TestReduceDefaultWithGradOneDNNOp(TestReduceSumDefaultOneDNNOp):
+
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
 class TestReduceSum4DOneDNNOp(TestReduceDefaultWithGradOneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -49,6 +52,7 @@ def setUp(self):
 
 class TestReduceSum4DReduceAllDimAttributeBF16OneDNNOp(
         TestReduceDefaultWithGradOneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -60,19 +64,22 @@ def setUp(self):
 
 
 class TestReduceSum5DKeepDimsOneDNNOp(TestReduceDefaultWithGradOneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
         self.inputs = {'X': np.random.random((2, 5, 3, 2, 2)).astype("float32")}
         self.attrs = {'dim': (2, 3, 4), 'keep_dim': True, 'use_mkldnn': True}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                 keepdims=self.attrs['keep_dim'])
         }
 
 
-class TestReduceSum5DReduceAllKeepDimsOneDNNOp(
-        TestReduceDefaultWithGradOneDNNOp):
+class TestReduceSum5DReduceAllKeepDimsOneDNNOp(TestReduceDefaultWithGradOneDNNOp
+                                               ):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -84,6 +91,7 @@ def setUp(self):
 
 
 class TestReduceSum4DReduceAllOneDNNOp(TestReduceDefaultWithGradOneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -95,6 +103,7 @@ def setUp(self):
 @OpTestTool.skip_if_not_cpu()
 class TestReduceSum4DNoReduceSimpleCopyOneDNNOp(
         TestReduceDefaultWithGradOneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.use_mkldnn = True
@@ -153,6 +162,7 @@ def setUp(self):
 
 
 class TestReduceMean3DOneDNNOp(TestReduceDefaultWithGradOneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.use_mkldnn = True
@@ -164,6 +174,7 @@ def setUp(self):
 
 
 class TestReduceMean4DReduceAllOneDNNOp(TestReduceDefaultWithGradOneDNNOp):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.use_mkldnn = True
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py
index 88aebac42e84b..336ee80c1fcf5 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py
@@ -23,6 +23,7 @@
 
 
 class TestReQuantizeOp(OpTest):
+
     def set_input_size(self):
         self.input_size = [1, 1, 10, 10]
         self.format_reorder = format_reorder
@@ -44,12 +45,12 @@ def setUp(self):
     def prepare_input(self):
         if self.input_data_type == 'int8':
             # input data values are integers from interval [-128, 128)
-            self.input = (np.random.randint(0, 256, self.input_size) - 128
-                          ).astype(self.input_data_type)
+            self.input = (np.random.randint(0, 256, self.input_size) -
+                          128).astype(self.input_data_type)
         else:
             # input data values are integers from interval [0, 256)
-            self.input = (np.random.randint(
-                0, 256, self.input_size)).astype(self.input_data_type)
+            self.input = (np.random.randint(0, 256, self.input_size)).astype(
+                self.input_data_type)
 
         self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(self.input)}
         self.attrs = {
@@ -112,24 +113,28 @@ def set_input_data_type(self):
 
 
 class TestReQuantizeOp_S8_SameScales(TestReQuantizeOp):
+
     def set_scales(self):
         self.scale_in = 127.0
         self.scale_out = 127.0
 
 
 class TestReQuantizeOp_S8_DifferentScales_1(TestReQuantizeOp):
+
     def set_scales(self):
         self.scale_in = 127.0
         self.scale_out = 100.0
 
 
 class TestReQuantizeOp_S8_DifferentScales_2(TestReQuantizeOp):
+
     def set_scales(self):
         self.scale_in = 100.0
         self.scale_out = 127.0
 
 
 class TestReQuantizeOp_S8_ZeroInputScale(TestReQuantizeOp):
+
     def set_scales(self):
         self.scale_in = 0.0
         self.scale_out = 127.0
@@ -144,6 +149,7 @@ def test_check_output(self):
 
 
 class TestReQuantizeOp_S8_ZeroOutputScale(TestReQuantizeOp):
+
     def set_scales(self):
         self.scale_in = 127.0
         self.scale_out = 0.0
@@ -161,18 +167,21 @@ def test_check_output(self):
 
 
 class TestReQuantizeOp_U8_SameScales(TestReQuantizeOp_S8_SameScales):
+
     def set_input_data_type(self):
         self.input_data_type = 'uint8'
 
 
 class TestReQuantizeOp_U8_DifferentScales_1(
         TestReQuantizeOp_S8_DifferentScales_1):
+
     def set_input_data_type(self):
         self.input_data_type = 'uint8'
 
 
 class TestReQuantizeOp_U8_DifferentScales_2(
         TestReQuantizeOp_S8_DifferentScales_2):
+
     def set_input_data_type(self):
         self.input_data_type = 'uint8'
 
@@ -181,6 +190,7 @@ def set_input_data_type(self):
 
 
 class TestReQuantizeOp_S8_WithShift(TestReQuantizeOp):
+
     def set_scales(self):
         self.scale_in = 60.0
         self.scale_out = 127.0
@@ -196,6 +206,7 @@ def test_check_output(self):
 
 
 class TestReQuantizeOp_S8_WithOutputShift(TestReQuantizeOp):
+
     def set_scales(self):
         self.scale_in = 127.0
         self.scale_out = 60.0
@@ -209,6 +220,7 @@ def set_shifts(self):
 
 
 class TestReQuantizeOp_U8_SameScales_SameShift(TestReQuantizeOp_U8_SameScales):
+
     def set_shifts(self):
         self.shift_in = 128.0
         self.shift_out = 128.0
@@ -216,6 +228,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_SameScales_DifferentShift_1(
         TestReQuantizeOp_U8_SameScales):
+
     def set_shifts(self):
         self.shift_in = 60.0
         self.shift_out = 128.0
@@ -223,6 +236,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_SameScales_DifferentShift_2(
         TestReQuantizeOp_U8_SameScales):
+
     def set_shifts(self):
         self.shift_in = 128.0
         self.shift_out = 60.0
@@ -230,6 +244,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_DifferentScales_1_SameShift(
         TestReQuantizeOp_U8_DifferentScales_1):
+
     def set_shifts(self):
         self.shift_in = 128.0
         self.shift_out = 128.0
@@ -237,6 +252,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_DifferentScales_2_SameShift(
         TestReQuantizeOp_U8_DifferentScales_2):
+
     def set_shifts(self):
         self.shift_in = 128.0
         self.shift_out = 128.0
@@ -244,6 +260,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_DifferentScales_1_DifferentShift_1(
         TestReQuantizeOp_U8_DifferentScales_1):
+
     def set_shifts(self):
         self.shift_in = 128.0
         self.shift_out = 60.0
@@ -251,6 +268,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_DifferentScales_2_DifferentShift_1(
         TestReQuantizeOp_U8_DifferentScales_2):
+
     def set_shifts(self):
         self.shift_in = 128.0
         self.shift_out = 60.0
@@ -258,6 +276,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_DifferentScales_1_DifferentShift_2(
         TestReQuantizeOp_U8_DifferentScales_1):
+
     def set_shifts(self):
         self.shift_in = 60.0
         self.shift_out = 128.0
@@ -265,6 +284,7 @@ def set_shifts(self):
 
 class TestReQuantizeOp_U8_DifferentScales_2_DifferentShift_2(
         TestReQuantizeOp_U8_DifferentScales_2):
+
     def set_shifts(self):
         self.shift_in = 60.0
         self.shift_out = 128.0
@@ -274,6 +294,7 @@ def set_shifts(self):
 
 
 class TestReQuantizeOp_2DimFormat(TestReQuantizeOp):
+
     def format_reorder_2Dim(self, out, size):
         return out
 
@@ -286,6 +307,7 @@ def set_input_size(self):
 
 
 class TestReQuantizeOpReused(TestReQuantizeOp):
+
     def setUp(self):
         #  self.input_size = [1, 1, 10, 10]
         self.input_size = [1, 1, 2, 2]
@@ -317,18 +339,20 @@ def test_check_output(self):
         with fluid.program_guard(program):
             block = program.global_block()
             for name in variables:
-                block.create_var(
-                    name=name, dtype="int8", shape=variables[name].shape)
-            block.append_op(
-                type="requantize",
-                inputs={'Input': block.var('input'), },
-                outputs={"Output": block.var('output')},
-                attrs={
-                    'Scale_in': self.scale_in,
-                    'Scale_out': self.scale_out,
-                    'Shift_in': self.shift_in,
-                    'Shift_out': self.shift_out
-                })
+                block.create_var(name=name,
+                                 dtype="int8",
+                                 shape=variables[name].shape)
+            block.append_op(type="requantize",
+                            inputs={
+                                'Input': block.var('input'),
+                            },
+                            outputs={"Output": block.var('output')},
+                            attrs={
+                                'Scale_in': self.scale_in,
+                                'Scale_out': self.scale_out,
+                                'Shift_in': self.shift_in,
+                                'Shift_out': self.shift_out
+                            })
             place = core.CPUPlace()
             exe = fluid.Executor(place)
             for i in range(2):
@@ -336,15 +360,15 @@ def test_check_output(self):
                               feed={'input': variables['input']},
                               fetch_list=['output'])
 
-            self.assertTrue(
-                np.allclose(
-                    variables['output'], out[0], atol=1e-4), 'output')
+            self.assertTrue(np.allclose(variables['output'], out[0], atol=1e-4),
+                            'output')
 
 
 # ---------------test reused requantize op, no shift------------------------
 
 
 class TestReQuantizeOpReused_WithShift(TestReQuantizeOpReused):
+
     def set_input_data_type(self):
         self.input_data_type = 'uint8'
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py
index ae844834154fb..fe33593136111 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py
@@ -26,6 +26,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestReshapeBf16Op(OpTest):
+
     def setUp(self):
         self.op_type = "reshape2"
         self.use_mkldnn = False
@@ -58,14 +59,14 @@ def test_check_output(self):
         self.check_output_with_place(core.CPUPlace(), no_check_set=['XShape'])
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ["X"],
-            "Out",
-            check_dygraph=False,
-            user_defined_grads=[self.input_data_fp32],
-            user_defined_grad_outputs=[
-                self.inputs["X"].reshape(self.infered_shape)
-            ])
+        self.check_grad_with_place(core.CPUPlace(), ["X"],
+                                   "Out",
+                                   check_dygraph=False,
+                                   user_defined_grads=[self.input_data_fp32],
+                                   user_defined_grad_outputs=[
+                                       self.inputs["X"].reshape(
+                                           self.infered_shape)
+                                   ])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py
index 78e5af3311b99..828d190735ac7 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py
@@ -24,6 +24,7 @@
 @OpTestTool.skip_if(core.is_compiled_with_cuda(),
                     "CUDA has to be skipped because it forces dygraph")
 class TestReshape2OneDNNOp(OpTest):
+
     def setUp(self):
         self.init_data()
         self.set_op_type()
@@ -64,6 +65,7 @@ def test_check_grad(self):
 
 
 class TestReshape2OneDNNOpDimInfer1(TestReshape2OneDNNOp):
+
     def init_data(self):
         self.ori_shape = (5, 25)
         self.new_shape = (5, -1, 5)
@@ -71,6 +73,7 @@ def init_data(self):
 
 
 class TestReshape2OneDNNOpDimInfer2(TestReshape2OneDNNOp):
+
     def init_data(self):
         self.ori_shape = (6, 20)
         self.new_shape = (0, -1, 20)
@@ -87,6 +90,7 @@ def set_outputs(self):
 
 
 class TestReshape2OneDNNOp_attr_OnlyShape(TestReshape2OneDNNOp):
+
     def set_additional_inputs(self):
         self.inputs["Shape"] = np.array(self.new_shape, dtype="int32")
 
@@ -107,6 +111,7 @@ def init_data(self):
 
 class TestReshape2OneDNNOpDimInfer1_attr_OnlyShape(
         TestReshape2OneDNNOp_attr_OnlyShape):
+
     def init_data(self):
         self.ori_shape = (5, 20)
         self.new_shape = (5, -1, 10)
@@ -115,6 +120,7 @@ def init_data(self):
 
 
 class TestReshape2OneDNNOpDimInfer1_attr_ShapeTensor(TestReshape2OneDNNOp):
+
     def set_additional_inputs(self):
         shape_tensor = []
         for index, ele in enumerate(self.new_shape):
@@ -132,6 +138,7 @@ def init_data(self):
 
 class TestReshape2OneDNNOpDimInfer1_attr_ShapeTensorAndShape(
         TestReshape2OneDNNOpDimInfer1_attr_ShapeTensor):
+
     def set_additional_inputs(self):
         shape_tensor = []
         for index, ele in enumerate(self.new_shape):
@@ -143,6 +150,7 @@ def set_additional_inputs(self):
 
 
 class TestReshapeOneDNNOp(TestReshape2OneDNNOp):
+
     def set_op_type(self):
         self.op_type = "reshape"
 
@@ -154,6 +162,7 @@ def test_check_output(self):
 
 
 class TestReshapeOneDNNOpDimInfer1(TestReshapeOneDNNOp):
+
     def init_data(self):
         self.ori_shape = (5, 25)
         self.new_shape = (5, -1, 5)
@@ -161,6 +170,7 @@ def init_data(self):
 
 
 class TestReshapeOneDNNOp_attr_OnlyShape(TestReshape2OneDNNOp_attr_OnlyShape):
+
     def set_op_type(self):
         self.op_type = "reshape"
 
@@ -173,6 +183,7 @@ def test_check_output(self):
 
 class TestReshapeOneDNNOpDimInfer1_attr_OnlyShape(
         TestReshapeOneDNNOp_attr_OnlyShape):
+
     def init_data(self):
         self.ori_shape = (5, 20)
         self.new_shape = (5, -1, 10)
@@ -182,8 +193,10 @@ def init_data(self):
 
 #   BF16 TESTS
 def create_reshape_bf16_test_classes(parent):
+
     @OpTestTool.skip_if_not_cpu_bf16()
     class TestReshape2BF16OneDNNOp(parent):
+
         def set_inputs(self):
             self.dtype = np.uint16
             self.inputs = {"X": convert_float_to_uint16(self.x)}
@@ -193,22 +206,22 @@ def calculate_grads(self):
             self.dx = np.reshape(self.dout, self.ori_shape)
 
         def test_check_output(self):
-            self.check_output_with_place(
-                core.CPUPlace(), no_check_set=["XShape"])
+            self.check_output_with_place(core.CPUPlace(),
+                                         no_check_set=["XShape"])
 
         def test_check_grad(self):
             self.calculate_grads()
-            self.check_grad_with_place(
-                core.CPUPlace(), ["X"],
-                "Out",
-                user_defined_grads=[self.dx],
-                user_defined_grad_outputs=[self.dout])
+            self.check_grad_with_place(core.CPUPlace(), ["X"],
+                                       "Out",
+                                       user_defined_grads=[self.dx],
+                                       user_defined_grad_outputs=[self.dout])
 
     cls_name = "{0}_{1}".format(parent.__name__, "Reshape2_BF16")
     TestReshape2BF16OneDNNOp.__name__ = cls_name
     globals()[cls_name] = TestReshape2BF16OneDNNOp
 
     class TestReshapeBF16OneDNNOp(TestReshape2BF16OneDNNOp):
+
         def set_op_type(self):
             self.dtype = np.uint16
             self.op_type = "reshape"
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py
index 8e9f989f06c10..496d6c393a451 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_bf16_mkldnn_op.py
@@ -27,6 +27,7 @@
 @unittest.skipIf(core.is_compiled_with_cuda(),
                  "core is compiled with CUDA which has no BF implementation")
 class TestScaleOpBF16(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.x_fp32 = np.random.random((10, 10)).astype(np.float32)
@@ -65,6 +66,7 @@ def test_check_grad(self):
 
 
 class TestScaleOpBF16BiasNotAfterScale(TestScaleOpBF16):
+
     def setUp(self):
         self.op_type = "scale"
         self.x_fp32 = np.random.random((10, 10)).astype(np.float32)
@@ -84,6 +86,7 @@ def setUp(self):
 
 
 class TestScaleOpBF16ScaleTensor(TestScaleOpBF16):
+
     def setUp(self):
         self.op_type = "scale"
         self.scale = -2.3
@@ -99,6 +102,7 @@ def setUp(self):
 
 
 class TestScaleOpBF16ScaleTensorNotBiasAfterScale(TestScaleOpBF16):
+
     def setUp(self):
         self.op_type = "scale"
         self.scale = 1.2
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py
index 528b55dcd873d..50a5e917985a2 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_scale_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 
 class TestScaleOp(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)}
@@ -39,6 +40,7 @@ def test_check_grad(self):
 
 
 class TestScaleOpBiasNotAfterScale(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)}
@@ -61,6 +63,7 @@ def test_check_grad(self):
 
 
 class TestScaleOpScaleTensor(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.scale = -2.3
@@ -79,6 +82,7 @@ def test_check_grad(self):
 
 
 class TestScaleOpScaleTensorNotBiasAfterScale(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.scale = -1.2
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_shape_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_shape_mkldnn_op.py
index 41e6344a0a17f..44f2e30d4fce9 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_shape_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_shape_mkldnn_op.py
@@ -24,6 +24,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestShape3DFP32OneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "shape"
         self.config()
@@ -40,18 +41,21 @@ def test_check_output(self):
 
 
 class TestShape6DBF16OneDNNOp(TestShape3DFP32OneDNNOp):
+
     def config(self):
         self.shape = [10, 2, 3, 4, 5, 2]
         self.dtype = np.uint16
 
 
 class TestShape9DINT8OneDNNOp(TestShape3DFP32OneDNNOp):
+
     def config(self):
         self.shape = [1, 2, 3, 4, 5, 6, 7, 8, 9]
         self.dtype = np.int8
 
 
 class TestShape2DUINT8OneDNNOp(TestShape3DFP32OneDNNOp):
+
     def config(self):
         self.shape = [7, 11]
         self.dtype = np.uint8
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py
index 1d657817503de..edbd19285cac3 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py
@@ -24,6 +24,7 @@
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestShuffleChannelOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "shuffle_channel"
         self.set_dtype()
@@ -48,11 +49,13 @@ def test_check_output(self):
 
 
 class TestShuffleChannelSingleGroupOneDNNOp(TestShuffleChannelOneDNNOp):
+
     def set_group(self):
         self.group = 1
 
 
 class TestShuffleChannelBF16OneDNNOp(TestShuffleChannelOneDNNOp):
+
     def set_dtype(self):
         self.dtype = np.uint16
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py
index 443e4d90c3a8a..6b5bfe2155010 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py
@@ -26,6 +26,7 @@
 @OpTestTool.skip_if(core.is_compiled_with_cuda(),
                     "CUDA required dygraph so oneDNN UT must be skipped")
 class TestSliceOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -62,6 +63,7 @@ def test_check_grad(self):
 
 
 class TestSliceOneDNNOp1(TestSliceOneDNNOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
@@ -72,6 +74,7 @@ def config(self):
 
 
 class TestSliceOneDNNOp2(TestSliceOneDNNOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
@@ -82,6 +85,7 @@ def config(self):
 
 
 class TestSliceDecrease1AxisOneDNNOp(TestSliceOneDNNOp):
+
     def set_attrs(self):
         self.attrs['decrease_axis'] = self.decrease_axis
 
@@ -96,6 +100,7 @@ def config(self):
 
 
 class TestSliceDecrease2AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
@@ -107,6 +112,7 @@ def config(self):
 
 
 class TestSliceDecrease3AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1, 0, 2]
@@ -118,6 +124,7 @@ def config(self):
 
 
 class TestSliceDecrease4AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 7]).astype("float32")
         self.starts = [0, 1, 2, 3]
@@ -129,6 +136,7 @@ def config(self):
 
 
 class TestSlice5DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6, 7]).astype("float32")
         self.starts = [-1]
@@ -140,6 +148,7 @@ def config(self):
 
 
 class TestSlice3DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+
     def config(self):
         self.input = np.random.random([5, 4, 5]).astype("float32")
         self.starts = [-1]
@@ -152,6 +161,7 @@ def config(self):
 
 class TestSliceOneDNNOp_decs_dim_starts_ListTensor(
         TestSliceDecrease1AxisOneDNNOp):
+
     def set_inputs(self):
         starts_tensor = []
         for index, ele in enumerate(self.starts):
@@ -169,6 +179,7 @@ def config(self):
 
 
 class TestSlice4DInferDimsOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+
     def config(self):
         self.input = np.random.random([1, 1, 10, 10]).astype("float32")
         self.starts = [1, 2]
@@ -180,6 +191,7 @@ def config(self):
 
 
 class TestSlice4DInferDimsOneDNNOp2(TestSliceDecrease1AxisOneDNNOp):
+
     def config(self):
         self.input = np.random.random([1, 1, 10, 10]).astype("float32")
         self.starts = [4, 2]
@@ -192,8 +204,10 @@ def config(self):
 
 #   BF16 TESTS
 def create_bf16_test_class(parent):
+
     @OpTestTool.skip_if_not_cpu_bf16()
     class TestSliceBF16OneDNNOp(parent):
+
         def set_inputs(self):
             self.dtype = np.uint16
             self.inputs = {'Input': convert_float_to_uint16(self.input)}
@@ -208,8 +222,8 @@ def calculate_grads(self):
             for i in range(len(self.axes)):
                 begin[self.axes[i]] = self.starts[i]
                 end[self.axes[i]] = self.ends[i]
-            self.dx[begin[0]:end[0], begin[1]:end[1], begin[2]:end[2], begin[3]:
-                    end[3]] = self.dout
+            self.dx[begin[0]:end[0], begin[1]:end[1], begin[2]:end[2],
+                    begin[3]:end[3]] = self.dout
 
         def test_check_output(self):
             self.check_output_with_place(core.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py
index e9b0cafd11495..ca61f961b7a0a 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py
@@ -32,6 +32,7 @@ def stable_softmax(x):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [10, 10]
 
@@ -65,26 +66,31 @@ def init_kernel_type(self):
 
 
 class TestSoftmaxMKLDNNOp2(TestSoftmaxOp2):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp3(TestSoftmaxOp3):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp4(TestSoftmaxOp4):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp5(TestSoftmaxOp5):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp6(TestSoftmaxOp6):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
index 13c1883af6184..ccd43d48bafb8 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py
@@ -30,6 +30,7 @@ def stable_softmax(x):
 
 
 class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [10, 10]
 
@@ -69,46 +70,53 @@ def test_check_grad(self):
         if self.use_cudnn or self.dtype == np.float16:
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place):
-                self.check_grad_with_place(
-                    place, ["X"],
-                    "Out",
-                    max_relative_error=0.01,
-                    check_dygraph=False)
+                self.check_grad_with_place(place, ["X"],
+                                           "Out",
+                                           max_relative_error=0.01,
+                                           check_dygraph=False)
         else:
-            self.check_grad(
-                ["X"], "Out", max_relative_error=0.01, check_dygraph=False)
+            self.check_grad(["X"],
+                            "Out",
+                            max_relative_error=0.01,
+                            check_dygraph=False)
 
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp2(TestSoftmaxOp2):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp3(TestSoftmaxOp3):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp4(TestSoftmaxOp4):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp5(TestSoftmaxOp5):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 class TestSoftmaxMKLDNNOp6(TestSoftmaxOp6):
+
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
 # Check if primitives already exist in backward
 class TestSoftmaxMKLDNNPrimitivesAlreadyExist(unittest.TestCase):
+
     def setUp(self):
         super(TestSoftmaxMKLDNNPrimitivesAlreadyExist, self).setUp()
 
@@ -124,8 +132,9 @@ def __softmax_bwd(self, out, out_grad):
         return out * (out_grad - np.dot(out, out_grad))
 
     def test_check(self):
-        check_if_mkldnn_primitives_exist_in_bwd(
-            self, self.op_type, self.x, self.out, self.out_grad, self.x_grad)
+        check_if_mkldnn_primitives_exist_in_bwd(self, self.op_type, self.x,
+                                                self.out, self.out_grad,
+                                                self.x_grad)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py
index c2911114e4913..23803ae2898b4 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py
@@ -32,6 +32,7 @@ def ref_softplus(x, beta, threshold):
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestSoftplusOneDNNOp(OpTest):
+
     def setUp(self):
         self.op_type = "softplus"
         self.beta = 1
@@ -61,44 +62,52 @@ def test_check_output(self):
 
 
 class TestSoftplus4DOneDNNOp(TestSoftplusOneDNNOp):
+
     def config(self):
         self.x_shape = (10, 5, 4, 2)
 
 
 class TestSoftplus6DOneDNNOp(TestSoftplusOneDNNOp):
+
     def config(self):
         self.x_shape = (3, 2, 2, 5, 4, 2)
 
 
 class TestSoftplus6DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp):
+
     def config(self):
         self.x_shape = (3, 5, 2, 5, 4, 2)
         self.beta = 2.5
 
 
 class TestSoftplus3DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp):
+
     def config(self):
         self.x_shape = (20, 4, 2)
         self.beta = 0.4
 
 
 class TestSoftplusBF16OneDNNOp(TestSoftplusOneDNNOp):
+
     def set_dtype(self):
         self.dtype = np.uint16
 
 
 class TestSoftplus4DBF16OneDNNOp(TestSoftplus4DOneDNNOp):
+
     def set_dtype(self):
         self.dtype = np.uint16
 
 
 class TestSoftplus6DBF16OneDNNOp(TestSoftplus6DOneDNNOp):
+
     def set_dtype(self):
         self.dtype = np.uint16
 
 
 class TestSoftplus3DExtendedFunctorBF16OneDNNOp(
         TestSoftplus3DExtendedFunctorOneDNNOp):
+
     def set_dtype(self):
         self.dtype = np.uint16
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_split_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_split_bf16_mkldnn_op.py
index 4cb559fc15407..f6fbc46075430 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_split_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_split_bf16_mkldnn_op.py
@@ -26,6 +26,7 @@
 @unittest.skipIf(core.is_compiled_with_cuda(),
                  "core is compiled with CUDA which has no BF implementation")
 class TestSplitSectionsBF16OneDNNOp(OpTest):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("uint16")
         self.axis = 1
@@ -74,6 +75,7 @@ def test_check_output(self):
 
 
 class TestSplitNumBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 8, 5, 3)).astype("uint16")
         self.axis = 1
@@ -84,6 +86,7 @@ def init_data(self):
 
 
 class TestSplitNumAxisTensorBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("uint16")
         self.axis = None
@@ -95,6 +98,7 @@ def init_data(self):
 
 
 class TestSplitSectionsTensorBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("uint16")
         self.axis = 1
@@ -109,6 +113,7 @@ def init_data(self):
 
 
 class TestSplitOpUnknownSectionBF16OneDNNOp(TestSplitSectionsBF16OneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("uint16")
         self.axis = 2
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_split_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_split_mkldnn_op.py
index 55b56434f3eb1..c7c4413da6c27 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_split_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_split_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 
 class TestSplitSectionsOneDNNOp(OpTest):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("float32")
         self.axis = 1
@@ -60,6 +61,7 @@ def test_check_grad(self):
 
 # test with attr(num)
 class TestSplitNumOneDNNOp(TestSplitSectionsOneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 8, 5, 3)).astype("float32")
         self.axis = 1
@@ -73,6 +75,7 @@ def test_check_grad(self):
 
 
 class TestSplitNumAxisTensorOneDNNOp(TestSplitSectionsOneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("float32")
         self.axis = None
@@ -85,6 +88,7 @@ def init_data(self):
 
 # attr(sections) is list containing Tensor
 class TestSplitSectionsTensorOneDNNOp(TestSplitSectionsOneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("float32")
         self.axis = 1
@@ -99,6 +103,7 @@ def init_data(self):
 
 
 class TestSplitOpUnknownSectionOneDNNOp(TestSplitSectionsOneDNNOp):
+
     def init_data(self):
         self.x = np.random.random((4, 5, 6)).astype("float32")
         self.axis = 2
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_squeeze2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_squeeze2_mkldnn_op.py
index 489d851038042..61729178a9253 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_squeeze2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_squeeze2_mkldnn_op.py
@@ -24,6 +24,7 @@
 @OpTestTool.skip_if(core.is_compiled_with_cuda(),
                     "CUDA has to be skipped because it forces dygraph")
 class TestSqueeze2OneDNNOp(OpTest):
+
     def set_op_type(self):
         self.op_type = "squeeze2"
 
@@ -60,6 +61,7 @@ def test_check_grad(self):
 
 
 class TestSqueezeOneDNNOp(TestSqueeze2OneDNNOp):
+
     def set_op_type(self):
         self.op_type = "squeeze"
 
@@ -71,6 +73,7 @@ def test_check_output(self):
 
 
 class TestSqueeze2OneDNNOp1(TestSqueeze2OneDNNOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = (0, -2)
@@ -78,6 +81,7 @@ def init_test_case(self):
 
 
 class TestSqueezeOneDNNOp1(TestSqueezeOneDNNOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = (0, -2)
@@ -85,6 +89,7 @@ def init_test_case(self):
 
 
 class TestSqueeze2OneDNNOp2(TestSqueeze2OneDNNOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = ()
@@ -92,6 +97,7 @@ def init_test_case(self):
 
 
 class TestSqueezeOneDNNOp2(TestSqueezeOneDNNOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = ()
@@ -99,6 +105,7 @@ def init_test_case(self):
 
 
 class TestSqueeze2OneDNNOp3(TestSqueeze2OneDNNOp):
+
     def init_test_case(self):
         self.ori_shape = (25, 1, 1, 4, 1)
         self.axes = (1, -1)
@@ -106,6 +113,7 @@ def init_test_case(self):
 
 
 class TestSqueezeOneDNNOp3(TestSqueezeOneDNNOp):
+
     def init_test_case(self):
         self.ori_shape = (25, 1, 1, 4, 1)
         self.axes = (1, -1)
@@ -114,8 +122,10 @@ def init_test_case(self):
 
 #   BF16 TESTS
 def create_squeeze_bf16_test_classes(parent):
+
     @OpTestTool.skip_if_not_cpu_bf16()
     class TestSqueeze2BF16OneDNNOp(parent):
+
         def set_inputs(self):
             self.dtype = np.uint16
             self.inputs = {"X": convert_float_to_uint16(self.x)}
@@ -126,17 +136,17 @@ def calculate_grads(self):
 
         def test_check_grad(self):
             self.calculate_grads()
-            self.check_grad_with_place(
-                core.CPUPlace(), ["X"],
-                "Out",
-                user_defined_grads=[self.dx],
-                user_defined_grad_outputs=[self.dout])
+            self.check_grad_with_place(core.CPUPlace(), ["X"],
+                                       "Out",
+                                       user_defined_grads=[self.dx],
+                                       user_defined_grad_outputs=[self.dout])
 
     cls_name = "{0}_{1}".format(parent.__name__, "Squeeze2_BF16")
     TestSqueeze2BF16OneDNNOp.__name__ = cls_name
     globals()[cls_name] = TestSqueeze2BF16OneDNNOp
 
     class TestSqueezeBF16OneDNNOp(TestSqueeze2BF16OneDNNOp):
+
         def set_op_type(self):
             self.dtype = np.uint16
             self.op_type = "squeeze"
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py
index f7424014c2111..432ceafcfd06c 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 @OpTestTool.skip_if_not_cpu()
 class TestStack2DOneDNNOp(OpTest):
+
     def initDefaultParameters(self):
         self.num_inputs = 4
         self.input_dim = (2, 2)
@@ -65,18 +66,21 @@ def test_check_grad(self):
 
 
 class TestStack1DOneDNNOp(TestStack2DOneDNNOp):
+
     def initParameters(self):
         self.input_dim = (100)
         self.axis = 0
 
 
 class TestStack1DAxis1OneDNNOp(TestStack2DOneDNNOp):
+
     def initParameters(self):
         self.input_dim = (100)
         self.axis = 1
 
 
 class TestStack2DAxisLastOneDNNOp(TestStack2DOneDNNOp):
+
     def initParameters(self):
         self.input_dim = (13, 24)
         self.num_inputs = 5
@@ -84,12 +88,14 @@ def initParameters(self):
 
 
 class TestStack3DAxisNegativeOneDNNOp(TestStack2DOneDNNOp):
+
     def initParameters(self):
         self.input_dim = (10, 128, 128)
         self.axis = -2
 
 
 class TestStack3DOneDNNOp(TestStack2DOneDNNOp):
+
     def initParameters(self):
         self.input_dim = (10, 128, 128)
         self.num_inputs = 3
@@ -97,6 +103,7 @@ def initParameters(self):
 
 
 class TestStack4DOneDNNOp(TestStack2DOneDNNOp):
+
     def initParameters(self):
         self.input_dim = (2, 2, 2, 2)
         self.num_inputs = 3
@@ -104,6 +111,7 @@ def initParameters(self):
 
 
 class TestStack5DOneDNNOp(TestStack2DOneDNNOp):
+
     def initParameters(self):
         self.input_dim = (2, 3, 4, 5, 6)
         self.num_inputs = 6
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py
index c71baad0c7040..34c1c7bc2491f 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py
@@ -26,6 +26,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestSumBF16MKLDNN(TestSumOp):
+
     def setUp(self):
         self.op_type = "sum"
         self.use_mkldnn = True
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
index 1a87b1cea532d..33d9af4e0e2ba 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 
 class TestSumMKLDNN(TestSumOp):
+
     def setUp(self):
         self.op_type = "sum"
         self.init_data_type()
@@ -47,6 +48,7 @@ def test_check_grad(self):
 
 
 class TestMKLDNNSumInplaceOp(unittest.TestCase):
+
     def setUp(self):
         self.op_type = "sum"
         self.init_data_type()
@@ -70,15 +72,16 @@ def test_check_output(self):
                 tensor = var.get_tensor()
                 tensor.set(var_value, place)
 
-        sum_op = fluid_op.Operator(
-            "sum", X=["x0", "x1"], Out=out_var_name, use_mkldnn=True)
+        sum_op = fluid_op.Operator("sum",
+                                   X=["x0", "x1"],
+                                   Out=out_var_name,
+                                   use_mkldnn=True)
         expected_out = np.array(self.x0 + self.x1)
         sum_op.run(scope, place)
         out = scope.find_var("x0").get_tensor()
         out_array = np.array(out)
         self.assertTrue(
-            np.allclose(
-                expected_out, out_array, atol=1e-5),
+            np.allclose(expected_out, out_array, atol=1e-5),
             "Inplace sum_mkldnn_op output has diff with expected output")
 
     def test_check_grad(self):
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py
index 72efa0aa99e7d..45f8aca4f9853 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py
@@ -24,6 +24,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestTransposeOp(OpTest):
+
     def setUp(self):
         self.op_type = "transpose2"
         self.use_mkldnn = True
@@ -57,6 +58,7 @@ def init_test_data(self):
 
 
 class TestBF16Case(TestTransposeOp):
+
     def init_test_case(self):
         self.shape = (2, 4, 6, 8)
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
index 6437226bf4c73..756630913db39 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
@@ -22,6 +22,7 @@
 
 
 class TestTransposeOp(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.initTestCase()
@@ -49,27 +50,31 @@ def init_op_type(self):
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output_with_place(
-            core.CPUPlace(), 1e-5, no_check_set=['XShape'], check_dygraph=False)
+        self.check_output_with_place(core.CPUPlace(),
+                                     1e-5,
+                                     no_check_set=['XShape'],
+                                     check_dygraph=False)
 
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
 
     def initInputData(self):
-        self.input_data = (
-            np.random.randint(0, 100, self.shape) - 50).astype(np.int8)
+        self.input_data = (np.random.randint(0, 100, self.shape) - 50).astype(
+            np.int8)
 
 
 class TestINT8Case(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 4, 6, 8)
 
     def initInputData(self):
-        self.input_data = (
-            np.random.randint(0, 100, self.shape) - 50).astype(np.int8)
+        self.input_data = (np.random.randint(0, 100, self.shape) - 50).astype(
+            np.int8)
 
 
 class TestUINT8Case(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (1, 3, 5, 7)
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
index 0d670898dd76e..18573baa554d3 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
@@ -21,6 +21,7 @@
 
 
 class TestTransposeMKLDNN(TestTransposeOp):
+
     def setUp(self):
         self.init_op_type()
         self.initTestCase()
@@ -53,36 +54,42 @@ def initTestCase(self):
 
 
 class TestCase0MKLDNN(TestTransposeMKLDNN):
+
     def initTestCase(self):
         self.shape = (100, )
         self.axis = (0, )
 
 
 class TestCase1a(TestTransposeMKLDNN):
+
     def initTestCase(self):
         self.shape = (3, 4, 10)
         self.axis = (0, 2, 1)
 
 
 class TestCase1b(TestTransposeMKLDNN):
+
     def initTestCase(self):
         self.shape = (3, 4, 10)
         self.axis = (2, 1, 0)
 
 
 class TestCase2(TestTransposeMKLDNN):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.axis = (0, 2, 3, 1)
 
 
 class TestCase3(TestTransposeMKLDNN):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.axis = (4, 2, 3, 1, 0)
 
 
 class TestCase4(TestTransposeMKLDNN):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6, 1)
         self.axis = (4, 2, 3, 1, 0, 5)
diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt
index 229a2c1792c25..5c680c564f437 100644
--- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt
@@ -1,40 +1,54 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-file(GLOB TEST_DIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_collective_*.py")
+file(
+  GLOB TEST_DIST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_collective_*.py")
 string(REPLACE ".py" "" TEST_DIST_OPS "${TEST_DIST_OPS}")
 
-if (WITH_MLU)
-    foreach(TEST_OP ${TEST_DIST_OPS})
-        LIST(REMOVE_ITEM TEST_OPS ${TEST_OP})
-    endforeach(TEST_OP)
-    LIST(REMOVE_ITEM TEST_OPS "test_spawn_mlu")
+if(WITH_MLU)
+  foreach(TEST_OP ${TEST_DIST_OPS})
+    list(REMOVE_ITEM TEST_OPS ${TEST_OP})
+  endforeach(TEST_OP)
+  list(REMOVE_ITEM TEST_OPS "test_spawn_mlu")
 
-    foreach(TEST_OP ${TEST_OPS})
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-    endforeach(TEST_OP)
+  foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  endforeach(TEST_OP)
 
-    if(WITH_CNCL)
-	LIST(APPEND TEST_DIST_OPS "test_spawn_mlu")
-        foreach(TEST_OP ${TEST_DIST_OPS})
-            py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-        endforeach(TEST_OP)
-        bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        bash_test_modules(test_c_comm_init_op_mlu START_BASH test_c_comm_init_op_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120)
-	set_tests_properties(test_collective_allreduce_sum PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_allreduce_max PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_allreduce_min PROPERTIES TIMEOUT 120)
-	set_tests_properties(test_collective_allreduce_prod PROPERTIES TIMEOUT 120)
-	set_tests_properties(test_collective_allgather PROPERTIES TIMEOUT 120)
-	set_tests_properties(test_collective_reduce_sum PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_reduce_max PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_reduce_min PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_reduce_prod PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_broadcast_api_mlu PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_collective_allreduce_api_mlu PROPERTIES TIMEOUT 120)
-	set_tests_properties(test_collective_allgather_api_mlu PROPERTIES TIMEOUT 120)
-        set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120)
-    endif(WITH_CNCL)
+  if(WITH_CNCL)
+    list(APPEND TEST_DIST_OPS "test_spawn_mlu")
+    foreach(TEST_OP ${TEST_DIST_OPS})
+      py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+    endforeach(TEST_OP)
+    bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh
+                      ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh
+                      ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh
+                      ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    bash_test_modules(
+      test_c_comm_init_op_mlu START_BASH test_c_comm_init_op_mlu.sh ENVS
+      PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+    set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_allreduce_sum PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_allreduce_max PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_allreduce_min PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_allreduce_prod PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_allgather PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_reduce_sum PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_reduce_max PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_reduce_min PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_reduce_prod PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_collective_broadcast_api_mlu PROPERTIES TIMEOUT
+                                                                      120)
+    set_tests_properties(test_collective_allreduce_api_mlu PROPERTIES TIMEOUT
+                                                                      120)
+    set_tests_properties(test_collective_allgather_api_mlu PROPERTIES TIMEOUT
+                                                                      120)
+    set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120)
+  endif(WITH_CNCL)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py
index e91f28e3b1db8..1f343bb532174 100644
--- a/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py
@@ -25,6 +25,7 @@
 
 
 class TestCCommInitOp(unittest.TestCase):
+
     def setUp(self):
         self.endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')
         self.current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
@@ -45,25 +46,23 @@ def test_specifying_devices(self):
             name=fluid.unique_name.generate('cncl_id'),
             persistable=True,
             type=fluid.core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_cncl_id',
-            inputs={},
-            outputs={'Out': cncl_id_var},
-            attrs={
-                'rank': self.rank,
-                'endpoint': self.current_endpoint,
-                'other_endpoints': self.other_endpoints
-            })
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': cncl_id_var},
-            outputs={},
-            attrs={
-                'nranks': self.nranks,
-                'rank': self.rank,
-                'ring_id': 0,
-                'device_id': self.mlu_id
-            })
+        block.append_op(type='c_gen_cncl_id',
+                        inputs={},
+                        outputs={'Out': cncl_id_var},
+                        attrs={
+                            'rank': self.rank,
+                            'endpoint': self.current_endpoint,
+                            'other_endpoints': self.other_endpoints
+                        })
+        block.append_op(type='c_comm_init',
+                        inputs={'X': cncl_id_var},
+                        outputs={},
+                        attrs={
+                            'nranks': self.nranks,
+                            'rank': self.rank,
+                            'ring_id': 0,
+                            'device_id': self.mlu_id
+                        })
         self.exe.run(program)
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_api.py
index 50ae6b1a169d7..b30d055e5f402 100755
--- a/python/paddle/fluid/tests/unittests/mlu/collective_allgather_api.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_api.py
@@ -39,14 +39,16 @@
 
 
 class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
             tensor_list = []
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             paddle.distributed.all_gather(tensor_list, tindata)
             return tensor_list
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allgather_op.py b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_op.py
index f67b3fbcc6a80..591376deb1b81 100755
--- a/python/paddle/fluid/tests/unittests/mlu/collective_allgather_op.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_allgather_op.py
@@ -38,6 +38,7 @@
 
 
 class TestCollectiveAllgather(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,25 +46,26 @@ def get_model(self, main_prog, startup_program, col_type):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofallgather",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_allgather",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_allgather",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'nranks': nranks
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py
index ebe4e71d22fde..51df37d38d45d 100644
--- a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py
@@ -39,13 +39,15 @@
 
 
 class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             paddle.distributed.all_reduce(tindata)
             return [tindata]
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py
index 404ed1235d2ae..c839e3213f4cf 100644
--- a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py
@@ -39,30 +39,30 @@
 
 
 class TestCollectiveAllreduce(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, col_type):
         ring_id = 0
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outof" + col_type,
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_" + col_type,
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_" + col_type,
+                                               inputs={'X': tindata},
+                                               attrs={'ring_id': ring_id},
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py
index 2002909ea2eec..c608dcc616599 100644
--- a/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py
@@ -39,13 +39,15 @@
 
 
 class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype="float32")
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype="float32")
             paddle.distributed.broadcast(tindata, src=1)
             return [tindata]
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_op.py b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_op.py
index 49bc6a6c4bb24..21da1aaa656cc 100755
--- a/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_op.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_op.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveBroadcast(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -46,25 +47,26 @@ def get_model(self, main_prog, startup_program, col_type):
         ring_id = 0
         rootid = 1
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofbroadcast",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_broadcast",
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'root': rootid},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_broadcast",
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'root': rootid
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_reduce_api.py
index f987a71abda72..b1824dcba8a7c 100644
--- a/python/paddle/fluid/tests/unittests/mlu/collective_reduce_api.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_reduce_api.py
@@ -39,13 +39,15 @@
 
 
 class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
     def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             paddle.distributed.reduce(tindata, dst=0)
             return [tindata]
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/mlu/collective_reduce_op.py
index 05fc17a5c7da3..48352dca085b4 100644
--- a/python/paddle/fluid/tests/unittests/mlu/collective_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/mlu/collective_reduce_op.py
@@ -39,6 +39,7 @@
 
 
 class TestCollectiveReduce(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -46,25 +47,26 @@ def get_model(self, main_prog, startup_program, col_type):
         ring_id = 0
         rootid = 1
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outof" + col_type,
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_" + col_type,
-                inputs={'X': tindata},
-                attrs={'ring_id': ring_id,
-                       'root_id': rootid},
-                outputs={'Out': toutdata})
-            main_prog.global_block().append_op(
-                type="c_sync_comm_stream",
-                inputs={'X': toutdata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id})
+            main_prog.global_block().append_op(type="c_" + col_type,
+                                               inputs={'X': tindata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'root_id': rootid
+                                               },
+                                               outputs={'Out': toutdata})
+            main_prog.global_block().append_op(type="c_sync_comm_stream",
+                                               inputs={'X': toutdata},
+                                               outputs={'Out': toutdata},
+                                               attrs={'ring_id': ring_id})
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py
index 9ea550a8452e4..782475ff8cb5e 100644
--- a/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py
@@ -45,7 +45,7 @@ def train_abort(prefix):
 
     if trainer_id == 0:
         try:
-            # train abort 
+            # train abort
             exit(1)
         except SystemExit:
             name = "abort>>> selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py
index 0c33bd6b1ade8..65b2d6f122687 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle.fluid.core as core
@@ -30,6 +31,7 @@
 
 
 class TestAbs(OpTest):
+
     def setUp(self):
         self.op_type = "abs"
         self.set_mlu()
@@ -56,11 +58,12 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], ['Out'], check_eager=False)
+        self.check_grad_with_place(self.place, ['X'], ['Out'],
+                                   check_eager=False)
 
 
 class TestAbsHalf(OpTest):
+
     def setUp(self):
         self.op_type = "abs"
         self.set_mlu()
@@ -87,8 +90,8 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], ['Out'], check_eager=False)
+        self.check_grad_with_place(self.place, ['X'], ['Out'],
+                                   check_eager=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_accuracy_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_accuracy_op_mlu.py
index 5e5c4c9a301e9..e3754224b2424 100755
--- a/python/paddle/fluid/tests/unittests/mlu/test_accuracy_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_accuracy_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestAccuracyOp(OpTest):
+
     def setUp(self):
         self.op_type = "accuracy"
         self.place = paddle.device.MLUPlace(0)
@@ -58,6 +60,7 @@ def test_check_output(self):
 
 
 class TestAccuracyOpFp16(TestAccuracyOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -66,13 +69,15 @@ def test_check_output(self):
 
 
 class TestAccuracyOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of accuracy_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.MLUPlace(0))
-            label = fluid.layers.data(
-                name='label', shape=[-1, 1], dtype="int32")
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.MLUPlace(0))
+            label = fluid.layers.data(name='label',
+                                      shape=[-1, 1],
+                                      dtype="int32")
             self.assertRaises(TypeError, fluid.layers.accuracy, x1, label)
             self.assertRaises(TypeError, paddle.metric.accuracy, x1, label)
             # The input dtype of accuracy_op must be float32 or float64.
@@ -85,13 +90,17 @@ def test_errors(self):
 
 
 class TestAccuracyAPI1(unittest.TestCase):
+
     def setUp(self):
-        self.predictions = paddle.static.data(
-            shape=[2, 5], name="predictions", dtype="float32")
-        self.label = paddle.static.data(
-            shape=[2, 1], name="labels", dtype="int32")
-        self.result = paddle.static.accuracy(
-            input=self.predictions, label=self.label, k=1)
+        self.predictions = paddle.static.data(shape=[2, 5],
+                                              name="predictions",
+                                              dtype="float32")
+        self.label = paddle.static.data(shape=[2, 1],
+                                        name="labels",
+                                        dtype="int32")
+        self.result = paddle.static.accuracy(input=self.predictions,
+                                             label=self.label,
+                                             k=1)
         self.input_predictions = np.array(
             [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]],
             dtype="float32")
@@ -109,6 +118,7 @@ def test_api(self):
 
 
 class TestAccuracyAPI2(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             predictions = paddle.to_tensor(
@@ -121,6 +131,7 @@ def test_api(self):
 
 
 class TestAccuracyAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             predictions = paddle.to_tensor(
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py
index f30a391f65385..4354883a44274 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py
@@ -15,6 +15,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestAdam(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "adam"
@@ -78,6 +80,7 @@ def test_check_output(self):
 
 
 class TestAdamWithEpsilonTensor(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "adam"
@@ -132,6 +135,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithSkipUpdate(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "adam"
@@ -184,6 +188,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithGlobalBetaPow(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "adam"
@@ -241,6 +246,7 @@ def test_check_output(self):
 
 
 class TestNet(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -255,8 +261,9 @@ def _test(self, run_mlu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -280,12 +287,13 @@ def _test(self, run_mlu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py
index d2827725a2058..5c69cdb74093a 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py
@@ -15,6 +15,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestAdamW(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "adamw"
@@ -84,6 +86,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithSkipUpdate(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "adamw"
@@ -136,6 +139,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithoutDecay(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "adamw"
@@ -188,6 +192,7 @@ def test_check_output(self):
 
 
 class TestNet(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -202,8 +207,9 @@ def _test(self, run_mlu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -227,12 +233,13 @@ def _test(self, run_mlu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py
index 57fa56acd6875..9e0fdbbd20810 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestCheckFiniteAndUnscaleOp(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "check_finite_and_unscale"
@@ -54,6 +56,7 @@ def test_check_output(self):
 
 
 class TestCheckFiniteAndUnscaleOpWithNan(TestCheckFiniteAndUnscaleOp):
+
     def init_test_case(self):
         x = np.random.random((129, 129)).astype(self.dtype)
         x[128][128] = np.nan
@@ -66,12 +69,13 @@ def init_test_case(self):
         }
 
     def test_check_output(self):
-        # When input contains nan, do not check the output, 
+        # When input contains nan, do not check the output,
         # since the output may be nondeterministic and will be discarded.
         self.check_output_with_place(self.place, no_check_set=['Out'])
 
 
 class TestCheckFiniteAndUnscaleOpWithInf(TestCheckFiniteAndUnscaleOp):
+
     def init_test_case(self):
         x = np.random.random((129, 129)).astype(self.dtype)
         x[128][128] = np.inf
@@ -84,12 +88,13 @@ def init_test_case(self):
         }
 
     def test_check_output(self):
-        # When input contains inf, do not check the output, 
+        # When input contains inf, do not check the output,
         # since the output may be nondeterministic and will be discarded.
         self.check_output_with_place(self.place, no_check_set=['Out'])
 
 
 class TestCheckFiniteAndUnscaleOpMultiInput(TestCheckFiniteAndUnscaleOp):
+
     def init_test_case(self):
         x0 = np.random.random((129, 129)).astype(self.dtype)
         x1 = np.random.random((129, 129)).astype(self.dtype)
@@ -103,6 +108,7 @@ def init_test_case(self):
 
 
 class TestCheckFiniteAndUnscaleOpMultiInputWithNan(TestCheckFiniteAndUnscaleOp):
+
     def init_test_case(self):
         x0 = np.random.random((129, 129)).astype(self.dtype)
         x0[128][128] = np.nan
@@ -116,12 +122,13 @@ def init_test_case(self):
         }
 
     def test_check_output(self):
-        # When input contains inf, do not check the output, 
+        # When input contains inf, do not check the output,
         # since the output may be nondeterministic and will be discarded.
         self.check_output_with_place(self.place, no_check_set=['Out'])
 
 
 class TestCheckFiniteAndUnscaleOpMultiInputWithInf(TestCheckFiniteAndUnscaleOp):
+
     def init_test_case(self):
         x0 = np.random.random((129, 129)).astype(self.dtype)
         x0[128][128] = np.nan
@@ -136,7 +143,7 @@ def init_test_case(self):
         }
 
     def test_check_output(self):
-        # When input contains inf, do not check the output, 
+        # When input contains inf, do not check the output,
         # since the output may be nondeterministic and will be discarded.
         self.check_output_with_place(self.place, no_check_set=['Out'])
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py
index 85302ad76da8b..8aaba7b258019 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestAssign(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "assign"
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py
index 5ee9d369e0fd9..1bf2504c86310 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy
 import sys
+
 sys.path.append("..")
 
 import op_test
@@ -30,6 +31,7 @@
 
 
 class TestAssignValueMLUOp(op_test.OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "assign_value"
@@ -55,21 +57,24 @@ def test_check_output(self):
 
 
 class TestAssignValueMLUOp2(TestAssignValueMLUOp):
+
     def init_data(self):
         self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32)
         self.attrs["int32_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignValueMLUOp3(TestAssignValueMLUOp):
+
     def init_data(self):
         self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64)
         self.attrs["int64_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignValueMLUOp4(TestAssignValueMLUOp):
+
     def init_data(self):
-        self.value = numpy.random.choice(
-            a=[False, True], size=(2, 5)).astype(numpy.bool)
+        self.value = numpy.random.choice(a=[False, True],
+                                         size=(2, 5)).astype(numpy.bool)
         self.attrs["bool_values"] = [int(v) for v in self.value.flat]
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py
index 4cbff21dfc496..86f044b9d3dad 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py
@@ -22,6 +22,7 @@
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
 import sys
+
 sys.path.append('..')
 from op_test import OpTest, _set_use_system_allocator
 from paddle.fluid.framework import grad_var_name
@@ -159,9 +160,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
         x = np.transpose(x, (0, 2, 3, 1))
         y_grad = np.transpose(y_grad, (0, 2, 3, 1))
 
-    x_grad = scale * (y_grad - np.mean(
-        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
-            y_grad * (x - mean), axis=(0, 1, 2)) /
+    x_grad = scale * (y_grad - np.mean(y_grad, axis=(0, 1, 2)) -
+                      (x - mean) * np.mean(y_grad *
+                                           (x - mean), axis=(0, 1, 2)) /
                       (var + epsilon)) / np.sqrt(var + epsilon)
     grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
                         axis=(0, 1, 2))
@@ -189,6 +190,7 @@ def create_or_get_tensor(scope, var_name, var, place):
 
 
 def set_output_grad(scope, outputs, place, feed_dict=None):
+
     def __set_tensor__(name, data=None):
         out_tensor = scope.find_var(name).get_tensor()
         grad_tensor = scope.var(grad_var_name(name)).get_tensor()
@@ -210,6 +212,7 @@ def __set_tensor__(name, data=None):
 
 
 class TestBatchNormOpInference(unittest.TestCase):
+
     def setUp(self):
         self.dtype = np.float32
         self.fuse_with_relu = False
@@ -254,8 +257,8 @@ def check_with_place(self, place, data_layout, dtype, shape):
                                         OpTest.np_dtype_to_fluid_dtype(x_val),
                                         place)
         scale_tensor = create_or_get_tensor(
-            scope, "scale_val",
-            OpTest.np_dtype_to_fluid_dtype(scale_val), place)
+            scope, "scale_val", OpTest.np_dtype_to_fluid_dtype(scale_val),
+            place)
         bias_tensor = create_or_get_tensor(
             scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place)
         mean_tensor = create_or_get_tensor(scope, "mean",
@@ -297,13 +300,12 @@ def check_with_place(self, place, data_layout, dtype, shape):
         batch_norm_op.run(scope, place)
 
         # check inference result
-        self.__assert_close(
-            y_tensor,
-            y_out,
-            "inference output are different at " + str(place) + ", " +
-            data_layout + ", " + str(np.dtype(dtype)) +
-            str(np.array(y_tensor)) + str(y_out),
-            atol=1e-3)
+        self.__assert_close(y_tensor,
+                            y_out,
+                            "inference output are different at " + str(place) +
+                            ", " + data_layout + ", " + str(np.dtype(dtype)) +
+                            str(np.array(y_tensor)) + str(y_out),
+                            atol=1e-3)
 
     def test_check_output(self):
         places = [core.CPUPlace()]
@@ -321,6 +323,7 @@ def init_kernel_type(self):
 
 
 class TestFP16BatchNormOpInference(TestBatchNormOpInference):
+
     def setUp(self):
         self.dtype = np.float16
         self.fuse_with_relu = False
@@ -339,6 +342,7 @@ def test_check_output(self):
 
 
 class TestBatchNormOpTraining(unittest.TestCase):
+
     def setUp(self):
         self.fuse_with_relu = False
         self.data_formats = ["NCHW", "NHWC"]
@@ -368,8 +372,9 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
         variance_out = var_ref * (1. - momentum) + momentum * variance
         saved_variance = 1. / np.sqrt(var_ref + epsilon)
         # run backward
-        x_grad, scale_grad, bias_grad = _reference_grad(
-            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
+        x_grad, scale_grad, bias_grad = _reference_grad(x, y_grad, scale,
+                                                        saved_mean, var_ref,
+                                                        epsilon, data_layout)
 
         return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
@@ -385,6 +390,7 @@ def set_mean_variance(self, scale_shape, x, data_layout):
         return mean, variance
 
     def test_forward_backward(self):
+
         def test_with_place(place, data_layout, shape):
             # attr
             epsilon = self.epsilon
@@ -423,10 +429,9 @@ def test_with_place(place, data_layout, shape):
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
                 inputs = {
                     "X": block.var('x'),
                     "Scale": block.var('scale'),
@@ -456,11 +461,10 @@ def test_with_place(place, data_layout, shape):
                 }
                 block.create_var(name="reserve_space", dtype='float32')
                 outputs["ReserveSpace"] = block.var('reserve_space')
-                bn_op = block.append_op(
-                    type="batch_norm",
-                    inputs=inputs,
-                    outputs=outputs,
-                    attrs=attrs)
+                bn_op = block.append_op(type="batch_norm",
+                                        inputs=inputs,
+                                        outputs=outputs,
+                                        attrs=attrs)
                 block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
 
                 # generate backward op_desc
@@ -492,8 +496,10 @@ def test_with_place(place, data_layout, shape):
 
             for id, name in enumerate(self.fetch_list):
                 if name == 'variance':
-                    self.__assert_close(
-                        var_dict[name], out[id], name, atol=1e-3)
+                    self.__assert_close(var_dict[name],
+                                        out[id],
+                                        name,
+                                        atol=1e-3)
                     continue
                 self.__assert_close(var_dict[name], out[id], name)
             print("op test forward passed: ", str(place), data_layout)
@@ -512,6 +518,7 @@ def init_kernel_type(self):
 
 
 class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -519,6 +526,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpTrainingCase2(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set()
@@ -530,6 +538,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpTrainingCase3(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set(['x@GRAD'])
@@ -537,6 +546,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_momentum_variable = True
         self.use_global_stats = False
@@ -548,6 +558,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = True
         self.no_grad_set = set()
@@ -602,6 +613,7 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
 
 class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
         TestBatchNormOpFreezeStatsTraining):
+
     def init_test_case(self):
         self.use_global_stats = True
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -609,11 +621,12 @@ def init_test_case(self):
 
 
 class TestBatchNormOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of batch_norm must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.batch_norm, x1)
 
             # the input dtype of batch_norm must be float16 or float32 or float64
@@ -623,12 +636,13 @@ def test_errors(self):
 
 
 class TestDygraphBatchNormAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_norm = fluid.dygraph.BatchNorm(10)
             # the input of BatchNorm must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, batch_norm, x1)
 
             # the input dtype of BatchNorm must be float16 or float32 or float64
@@ -638,6 +652,7 @@ def test_errors(self):
 
 
 class TestDygraphBatchNormTrainableStats(unittest.TestCase):
+
     def test_dygraph(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_mlu():
@@ -686,6 +701,7 @@ def compute(x_np, is_test, trainable_statistics):
 
 
 class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
+
     def test_reservespace(self):
         with program_guard(Program(), Program()):
             paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py
index 7dd9dcdee57f9..b0fec2bdd0f6a 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py
@@ -19,6 +19,7 @@
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, _set_use_system_allocator
 from paddle.fluid.framework import grad_var_name
@@ -30,6 +31,7 @@
 
 
 class TestBatchNorm(unittest.TestCase):
+
     def test_name(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_mlu():
@@ -124,8 +126,9 @@ def compute_v3(x, is_test, trainable_statistics):
 
             def compute_v4(x):
                 with fluid.dygraph.guard(p):
-                    bn = paddle.nn.BatchNorm2D(
-                        shape[1], weight_attr=False, bias_attr=False)
+                    bn = paddle.nn.BatchNorm2D(shape[1],
+                                               weight_attr=False,
+                                               bias_attr=False)
                     y = bn(fluid.dygraph.to_variable(x))
                 return y.numpy()
 
@@ -173,6 +176,7 @@ def compute_v2(x_np):
 
 
 class TestBatchNormChannelLast(unittest.TestCase):
+
     def setUp(self):
         self.original_dtyep = paddle.get_default_dtype()
         paddle.set_default_dtype("float32")
@@ -196,8 +200,7 @@ def test_1d(self):
                 y2 = net2(channel_first_x)
                 y2 = paddle.transpose(y2, [0, 2, 1])
                 self.assertEqual(
-                    np.allclose(
-                        y1.numpy(), y2.numpy(), atol=1e-07), True)
+                    np.allclose(y1.numpy(), y2.numpy(), atol=1e-07), True)
 
     def test_2d(self):
         for p in self.places:
@@ -212,8 +215,7 @@ def test_2d(self):
                 y2 = net2(channel_first_x)
                 y2 = paddle.transpose(y2, [0, 2, 3, 1])
                 self.assertEqual(
-                    np.allclose(
-                        y1.numpy(), y2.numpy(), atol=1e-07), True)
+                    np.allclose(y1.numpy(), y2.numpy(), atol=1e-07), True)
 
     def test_3d(self):
         for p in self.places:
@@ -228,8 +230,7 @@ def test_3d(self):
                 y2 = net2(channel_first_x)
                 y2 = paddle.transpose(y2, [0, 2, 3, 4, 1])
                 self.assertEqual(
-                    np.allclose(
-                        y1.numpy(), y2.numpy(), atol=1e-07), True)
+                    np.allclose(y1.numpy(), y2.numpy(), atol=1e-07), True)
                 # res = np.allclose(y1.numpy(), y2.numpy())
                 # if res == False:
                 #   np.savetxt("./y1.txt", y1.numpy().flatten(), fmt='%.10f', delimiter='\n')
@@ -238,6 +239,7 @@ def test_3d(self):
 
 
 class TestBatchNormUseGlobalStats(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_mlu():
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py
index 10356b124b2ea..6ba62b11499f4 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 
@@ -29,6 +30,7 @@
 
 
 class TestCastOpFp32ToFp16(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -47,6 +49,7 @@ def test_check_output(self):
 
 
 class TestCastOpFp16ToFp32(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float16')}
@@ -64,6 +67,7 @@ def test_check_output(self):
 
 
 class TestCastOpInt32ToInt32(OpTest):
+
     def setUp(self):
         ipt = np.random.randint(1000, size=(10, 10))
         self.inputs = {'X': ipt.astype('int32')}
@@ -81,6 +85,7 @@ def test_check_output(self):
 
 
 class TestCastOpInt32ToFp32(OpTest):
+
     def setUp(self):
         ipt = np.random.randint(1000, size=[10, 10])
         self.inputs = {'X': ipt.astype('int32')}
@@ -98,6 +103,7 @@ def test_check_output(self):
 
 
 class TestCastOpInt16ToFp64(OpTest):
+
     def setUp(self):
         ipt = np.random.randint(1000, size=[10, 10])
         self.inputs = {'X': ipt.astype('int16')}
@@ -115,11 +121,12 @@ def test_check_output(self):
 
 
 class TestCastOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of cast_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.MLUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.MLUPlace(0))
             self.assertRaises(TypeError, fluid.layers.cast, x1, 'int32')
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py
index 854ac0b6826cd..72783e3ca19cd 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 from paddle.fluid import core
@@ -27,6 +28,7 @@
 
 
 class TestAllocContinuousSpace(OpTest):
+
     def setUp(self):
         self.op_type = "coalesce_tensor"
         self.dtype, self.fluid_dtype = self.init_dtype()
@@ -82,13 +84,13 @@ def init_output(self, input_list, set_constant, constant):
         return outputs, coalesce_tensor_var
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=paddle.device.MLUPlace(0),
-            no_check_set=["FusedOutput"],
-            atol=1e-5)
+        self.check_output_with_place(place=paddle.device.MLUPlace(0),
+                                     no_check_set=["FusedOutput"],
+                                     atol=1e-5)
 
 
 class TestAllocContinuousSpace2(TestAllocContinuousSpace):
+
     def init_attr(self):
         return {
             "copy_data": False,
@@ -99,10 +101,9 @@ def init_attr(self):
         }
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=paddle.device.MLUPlace(0),
-            no_check_set=["FusedOutput"],
-            atol=1e-5)
+        self.check_output_with_place(place=paddle.device.MLUPlace(0),
+                                     no_check_set=["FusedOutput"],
+                                     atol=1e-5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather.py
index 09166e15aac81..6590281173294 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather.py
@@ -24,6 +24,7 @@
 
 
 class TestCAllgatherOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather_api_mlu.py
index 576c310cc3ac2..be3dedefc59b8 100755
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather_api_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allgather_api_mlu.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveAllgatherAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py
index 447498b9022d4..8b3accc505051 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveAllreduceAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_max.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_max.py
index bd04e6e2dc6af..02901d2d511b6 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_max.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_max.py
@@ -24,6 +24,7 @@
 
 
 class TestCAllreduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_min.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_min.py
index 4b16146e2ee2e..b8bae97f4f9b6 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_min.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_min.py
@@ -24,6 +24,7 @@
 
 
 class TestCAllreduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_prod.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_prod.py
index 0c6ea566cfa94..519715142fadc 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_prod.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_prod.py
@@ -24,6 +24,7 @@
 
 
 class TestCAllreduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_sum.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_sum.py
index a7a3984f4e55e..04ddff84f3ddd 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_sum.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_sum.py
@@ -24,6 +24,7 @@
 
 
 class TestCAllreduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py
index 9fae73a2540f4..04332b061f885 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py
@@ -41,6 +41,7 @@ def DataTypeCast(date_type):
 
 
 class TestCollectiveAPIRunnerBase(object):
+
     def get_model(self, train_prog, startup_prog, rank, indata=None):
         raise NotImplementedError(
             "get model should be implemented by child class.")
@@ -95,6 +96,7 @@ def runtime_main(test_class, col_type):
 
 
 class TestDistBase(unittest.TestCase):
+
     def setUp(self):
         self._port_set = set()
         self._trainers = 2
@@ -103,6 +105,7 @@ def setUp(self):
         self._python_interp = sys.executable
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -145,18 +148,16 @@ def _run_cluster(self, model_file, envs):
         tr1_cmd = tr_cmd % (self._python_interp, model_file)
         tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
         tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
-        #print(tr0_cmd) 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
-
-        tr1_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
+        #print(tr0_cmd)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=env0)
+
+        tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
@@ -200,8 +201,8 @@ def check_with_place(self,
             required_envs["GLOG_v"] = "3"
             required_envs["GLOG_logtostderr"] = "1"
             required_envs["GLOO_LOG_LEVEL"] = "TRACE"
-        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
-                                                         required_envs)
+        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
+            model_file, required_envs)
         np_data_type = DataTypeCast(data_type)
         np.random.seed(pid0)
         input1 = np.random.random((10, 1000)).astype(np_data_type)
@@ -214,11 +215,9 @@ def check_with_place(self,
         elif col_type == "allreduce":
             need_result = input1 + input2
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "reduce":
             need_result = input1 + input2
             self.assertTrue(np.allclose(tr0_out, need_result))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py
index f63daaf66ac21..4ec1e7f7528bb 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py
@@ -53,6 +53,7 @@ def DataTypeCast(date_type):
 
 
 class TestCollectiveRunnerBase(object):
+
     def get_model(self, train_prog, startup_prog, col_type):
         raise NotImplementedError(
             "get model should be implemented by child class.")
@@ -63,9 +64,8 @@ def wait_server_ready(self, endpoints):
             not_ready_endpoints = []
             for ep in endpoints:
                 ip_port = ep.split(":")
-                with closing(
-                        socket.socket(socket.AF_INET,
-                                      socket.SOCK_STREAM)) as sock:
+                with closing(socket.socket(socket.AF_INET,
+                                           socket.SOCK_STREAM)) as sock:
                     sock.settimeout(2)
                     sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                     if hasattr(socket, 'SO_REUSEPORT'):
@@ -78,13 +78,14 @@ def wait_server_ready(self, endpoints):
                         not_ready_endpoints.append(ep)
             if not all_ok:
                 sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-                sys.stderr.write("not ready endpoints:" + str(
-                    not_ready_endpoints) + "\n")
+                sys.stderr.write("not ready endpoints:" +
+                                 str(not_ready_endpoints) + "\n")
                 sys.stderr.flush()
                 time.sleep(3)
             else:
                 break
 
+
 #endpoints should be ["ip1:port1","ip2:port2"]
 
     def initCommunicator(self, program, rank, nranks, wait_port,
@@ -94,30 +95,27 @@ def initCommunicator(self, program, rank, nranks, wait_port,
         if rank == 0 and wait_port:
             self.wait_server_ready(other_endpoints)
         block = program.global_block()
-        cncl_id_var = block.create_var(
-            name=nameGen.generate('cncl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        block.append_op(
-            type='c_gen_cncl_id',
-            inputs={},
-            outputs={'Out': cncl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints
-            })
-
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': cncl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': self.global_ring_id
-            })
+        cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'),
+                                       persistable=True,
+                                       type=core.VarDesc.VarType.RAW)
+
+        block.append_op(type='c_gen_cncl_id',
+                        inputs={},
+                        outputs={'Out': cncl_id_var},
+                        attrs={
+                            'rank': rank,
+                            'endpoint': current_endpoint,
+                            'other_endpoints': other_endpoints
+                        })
+
+        block.append_op(type='c_comm_init',
+                        inputs={'X': cncl_id_var},
+                        outputs={},
+                        attrs={
+                            'nranks': nranks,
+                            'rank': rank,
+                            'ring_id': self.global_ring_id
+                        })
 
     def run_trainer(self, args):
         train_prog = fluid.Program()
@@ -162,6 +160,7 @@ def runtime_main(test_class):
 
 
 class TestDistBase(unittest.TestCase):
+
     def setUp(self):
         self._port_set = set()
         self._trainers = 2
@@ -170,6 +169,7 @@ def setUp(self):
         self._python_interp = sys.executable
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -210,17 +210,15 @@ def _run_cluster(self, model_file, envs):
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=env0)
 
-        tr1_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
+        tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
@@ -252,8 +250,8 @@ def check_with_place(self,
         if check_error_log:
             required_envs["GLOG_v"] = "3"
             required_envs["GLOG_logtostderr"] = "1"
-        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
-                                                         required_envs)
+        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
+            model_file, required_envs)
         np_data_type = DataTypeCast(data_type)
         np.random.seed(pid0)
         input1 = np.random.random((10, 1000)).astype(np_data_type)
@@ -266,35 +264,27 @@ def check_with_place(self,
         elif col_type == "allreduce_sum":
             need_result = input1 + input2
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "allreduce_prod":
             need_result = input1 * input2
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "allreduce_max":
             need_result = np.maximum(input1, input2)
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "allreduce_min":
             need_result = np.minimum(input1, input2)
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "reduce_sum":
             need_result = input1 + input2
             self.assertTrue(np.allclose(tr1_out, need_result))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast.py
index d9f3aca031442..537f125e41bd2 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast.py
@@ -24,6 +24,7 @@
 
 
 class TestCBroadcastOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py
index 95919f3332869..b85a37841e8ee 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveBroadcastAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_api_mlu.py
index dc4b099330684..43a9728e54302 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_api_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_api_mlu.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveReduceAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_max.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_max.py
index 5da899c581f0b..e341c10dea07a 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_max.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_max.py
@@ -24,6 +24,7 @@
 
 
 class TestCReduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_min.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_min.py
index 21fea55eff7db..932e3a86846f1 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_min.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_min.py
@@ -24,6 +24,7 @@
 
 
 class TestCReduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_prod.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_prod.py
index 86d52a8c32627..335979ee26177 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_prod.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_prod.py
@@ -24,6 +24,7 @@
 
 
 class TestCReduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_sum.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_sum.py
index 7028a0f29e849..2b873f1266f7f 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_sum.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_reduce_sum.py
@@ -24,6 +24,7 @@
 
 
 class TestCReduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py
index 87997acce02a3..ea3b39817e5ef 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,7 +26,9 @@
 
 
 def create_test_class(op_type, typename, callback):
+
     class Cls(OpTest):
+
         def setUp(self):
             self.set_mlu()
             self.place = paddle.MLUPlace(0)
@@ -76,18 +79,22 @@ def test_dynamic_api(self):
         def test_broadcast_api_1(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name='x', shape=[1, 2, 1, 3], dtype=typename)
-                y = paddle.static.data(
-                    name='y', shape=[1, 2, 3], dtype=typename)
+                x = paddle.static.data(name='x',
+                                       shape=[1, 2, 1, 3],
+                                       dtype=typename)
+                y = paddle.static.data(name='y',
+                                       shape=[1, 2, 3],
+                                       dtype=typename)
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = paddle.static.Executor(self.place)
                 input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename)
                 input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(typename)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -95,18 +102,22 @@ def test_broadcast_api_1(self):
         def test_broadcast_api_2(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name='x', shape=[1, 2, 3], dtype=typename)
-                y = paddle.static.data(
-                    name='y', shape=[1, 2, 1, 3], dtype=typename)
+                x = paddle.static.data(name='x',
+                                       shape=[1, 2, 3],
+                                       dtype=typename)
+                y = paddle.static.data(name='y',
+                                       shape=[1, 2, 1, 3],
+                                       dtype=typename)
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = paddle.static.Executor(self.place)
                 input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(typename)
                 input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -122,8 +133,10 @@ def test_broadcast_api_3(self):
                 input_x = np.arange(0, 5).reshape((5)).astype(typename)
                 input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(typename)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py
index ba37fcee15472..d4ebe0d16ef98 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestConcatOp(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "concat"
@@ -43,8 +45,8 @@ def setUp(self):
             self.actual_axis = self.axis
 
         self.outputs = {
-            'Out': np.concatenate(
-                (self.x0, self.x1, self.x2), axis=self.actual_axis)
+            'Out':
+            np.concatenate((self.x0, self.x1, self.x2), axis=self.actual_axis)
         }
 
     def set_mlu(self):
@@ -69,6 +71,7 @@ def init_test_data(self):
 
 
 class TestConcatOp2(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
@@ -79,6 +82,7 @@ def init_test_data(self):
 @skip_check_grad_ci(
     reason="The function 'check_grad' for large inputs is too slow.")
 class TestConcatOp3(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype)
         self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype)
@@ -90,9 +94,11 @@ def test_check_grad(self):
 
 
 @skip_check_grad_ci(
-    reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
+    reason=
+    "This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
 )
 class TestConcatOp4(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
@@ -104,6 +110,7 @@ def test_check_grad(self):
 
 
 class TestConcatOp5(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
@@ -113,7 +120,9 @@ def init_test_data(self):
 
 #----------------Concat Fp16----------------
 def create_test_fp16(parent):
+
     class TestConcatFp16(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -131,7 +140,9 @@ def init_dtype(self):
 
 #----------------Concat Int64----------------
 def create_test_int64(parent):
+
     class TestConcatInt64(parent):
+
         def init_dtype(self):
             self.dtype = np.int64
 
@@ -152,7 +163,9 @@ def test_check_grad(self):
 
 #----------------Concat Int32----------------
 def create_test_int32(parent):
+
     class TestConcatInt32(parent):
+
         def init_dtype(self):
             self.dtype = np.int32
 
@@ -173,7 +186,9 @@ def test_check_grad(self):
 
 #----------------Concat AxisTensor----------------
 def create_test_AxisTensor(parent):
+
     class TestConcatAxisTensor(parent):
+
         def setUp(self):
             self.op_type = "concat"
             self.init_dtype()
@@ -192,8 +207,9 @@ def setUp(self):
                 self.actual_axis = self.axis
 
             self.outputs = {
-                'Out': np.concatenate(
-                    (self.x0, self.x1, self.x2), axis=self.actual_axis)
+                'Out':
+                np.concatenate((self.x0, self.x1, self.x2),
+                               axis=self.actual_axis)
             }
 
             self.place = paddle.device.MLUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py
index b09d892554bab..79200ab572bfc 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 import paddle
 import paddle.fluid.core as core
@@ -29,7 +30,9 @@
 
 
 def create_test_channel_last_class(parent):
+
     class TestChannelLastCase(parent):
+
         def init_data_format(self):
             self.data_format = "NHWC"
 
@@ -43,7 +46,9 @@ def init_test_case_2(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0]
             self.padding_algorithm = "SAME"
@@ -54,7 +59,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1]
             self.padding_algorithm = "VALID"
@@ -65,7 +72,9 @@ def init_paddings(self):
 
 
 def create_test_fp16_class(parent):
+
     class TestFp16Case(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -75,6 +84,7 @@ def init_dtype(self):
 
 
 class TestConv2DOp(OpTest):
+
     def set_mlu(self):
         self.__class__.use_mlu = True
         self.place = paddle.device.MLUPlace(0)
@@ -103,12 +113,11 @@ def setUp(self):
         input = np.random.random(self.input_size).astype(self.dtype)
         filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
 
-        output, _, _, _, _ = conv2d_forward_naive(
-            input,
-            filter,
-            self.groups,
-            conv2d_param,
-            data_format=self.data_format)
+        output, _, _, _, _ = conv2d_forward_naive(input,
+                                                  filter,
+                                                  self.groups,
+                                                  conv2d_param,
+                                                  data_format=self.data_format)
         output = output.astype(self.dtype)
 
         self.inputs = {
@@ -130,31 +139,28 @@ def test_check_output(self):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.03,
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Filter']),
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, ['Filter'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Filter'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Input']),
+                                   numeric_place=paddle.CPUPlace())
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -172,6 +178,7 @@ def init_group(self):
 
 
 class TestWithPad(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -182,6 +189,7 @@ def init_test_case(self):
 
 
 class TestWithStride(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -192,6 +200,7 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -203,6 +212,7 @@ def init_test_case(self):
 
 
 class TestWith1x1(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -213,12 +223,13 @@ def init_test_case(self):
 
     def init_group(self):
         # FIXME: Supporting group = 3 in this case.
-        # NOTE(wangran16): There is an unknown error (acl error code is : 507015) 
+        # NOTE(wangran16): There is an unknown error (acl error code is : 507015)
         # when group = 3, which needs to be fixed.
         self.groups = 1
 
 
 class TestWithDepthWise5x5(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -232,6 +243,7 @@ def init_group(self):
 
 
 class TestWithDepthWise7x7(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -245,6 +257,7 @@ def init_group(self):
 
 
 class TestWithDilation(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -263,6 +276,7 @@ def init_dilation(self):
 
 
 class TestWithInput1x1Filter1x1(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -276,6 +290,7 @@ def init_group(self):
 
 
 class TestConv2DOp_v2(OpTest):
+
     def set_mlu(self):
         self.__class__.use_mlu = True
         self.place = paddle.device.MLUPlace(0)
@@ -300,9 +315,10 @@ def setUp(self):
 
         input = np.random.random(self.input_size).astype(self.dtype)
         filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
-        output, _, _, _, _ = conv2d_forward_naive(
-            input, filter, self.groups, conv2d_param, self.padding_algorithm,
-            self.data_format)
+        output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
+                                                  conv2d_param,
+                                                  self.padding_algorithm,
+                                                  self.data_format)
         output = output.astype(self.dtype)
 
         self.inputs = {
@@ -325,30 +341,27 @@ def test_check_output(self):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.02,
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.02,
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, ['Input'],
-            'Output',
-            max_relative_error=0.02,
-            no_grad_set=set(['Filter']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.02,
+                                   no_grad_set=set(['Filter']),
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, ['Filter'],
-            'Output',
-            no_grad_set=set(['Input']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Filter'],
+                                   'Output',
+                                   no_grad_set=set(['Input']),
+                                   numeric_place=paddle.CPUPlace())
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -379,12 +392,14 @@ def init_test_case_2(self):
 
 
 class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
+
     def init_paddings(self):
         self.pad = [0, 0, 1, 2]
         self.padding_algorithm = "EXPLICIT"
 
 
 class TestWithPad_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -398,6 +413,7 @@ def init_paddings(self):
 
 
 class TestWithStride_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]  # NCHW
@@ -411,6 +427,7 @@ def init_paddings(self):
 
 
 class TestWithGroup_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 2]
@@ -422,6 +439,7 @@ def init_test_case(self):
 
 
 class TestWith1x1_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -438,6 +456,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise3x3_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [3, 4, 10, 10]  # NCHW
@@ -459,6 +478,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise5x5_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 4, 10, 10]  # NCHW
@@ -475,6 +495,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise7x7_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 8, 10, 10]  # NCHW
@@ -491,6 +512,7 @@ def init_paddings(self):
 
 
 class TestWithDilation_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -512,6 +534,7 @@ def init_paddings(self):
 
 
 class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [100, 1, 1, 1]  # NCHW
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_cumsum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_cumsum_op_mlu.py
index 5b7ce30728cbc..3ef23367eeb77 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_cumsum_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_cumsum_op_mlu.py
@@ -26,6 +26,7 @@
 
 
 class TestMLUCumSumOp(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.set_mlu()
@@ -49,17 +50,18 @@ def init_testcase(self):
 
 
 class TestMLUCumSumOp2(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': -1, 'reverse': True}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.outputs = {
-            'Out': np.flip(
-                np.flip(
-                    self.inputs['X'], axis=2).cumsum(axis=2), axis=2)
+            'Out': np.flip(np.flip(self.inputs['X'], axis=2).cumsum(axis=2),
+                           axis=2)
         }
 
 
 class TestMLUCumSumOp3(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 1}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
@@ -67,6 +69,7 @@ def init_testcase(self):
 
 
 class TestMLUCumSumOp4(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 0}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
@@ -74,98 +77,107 @@ def init_testcase(self):
 
 
 class TestMLUCumSumOp5(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.inputs = {'X': np.random.random((5, 20)).astype(self.dtype)}
         self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
 
 
 class TestMLUCumSumOp7(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.inputs = {'X': np.random.random((100)).astype(self.dtype)}
         self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
 
 
 class TestNPUCumSumExclusive1(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 65)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive2(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((1, 1, 888)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive3(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 888)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive4(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((1, 1, 3049)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive5(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 3096)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumReverseExclusive(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, 'reverse': True, "exclusive": True}
         a = np.random.random((4, 5, 6)).astype(self.dtype)
         self.inputs = {'X': a}
         a = np.flip(a, axis=2)
         self.outputs = {
-            'Out': np.concatenate(
-                (np.flip(
-                    a[:, :, :-1].cumsum(axis=2), axis=2), np.zeros(
-                        (4, 5, 1), dtype=self.dtype)),
+            'Out':
+            np.concatenate(
+                (np.flip(a[:, :, :-1].cumsum(axis=2),
+                         axis=2), np.zeros((4, 5, 1), dtype=self.dtype)),
                 axis=2)
         }
 
 
 class TestNPUCumSumWithFlatten1(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'flatten': True}
         self.inputs = {'X': np.random.random((5, 6)).astype(self.dtype)}
@@ -173,6 +185,7 @@ def init_testcase(self):
 
 
 class TestNPUCumSumWithFlatten2(TestMLUCumSumOp):
+
     def init_testcase(self):
         self.attrs = {'flatten': True}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py
index f8984f5c6dfa4..e9d172c89410e 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestDropoutOp(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.set_mlu()
@@ -163,6 +165,7 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestDropoutOpInference2(TestDropoutOpInference):
+
     def setUp(self):
         self.op_type = "dropout"
         self.set_mlu()
@@ -184,8 +187,7 @@ def setUp(self):
         self.init_dtype()
         self.inputs = {
             "X": np.random.random((32, 64)).astype(self.dtype),
-            "Seed": np.asarray(
-                [125], dtype="int32")
+            "Seed": np.asarray([125], dtype="int32")
         }
         self.attrs = {
             'dropout_prob': 0.0,
@@ -210,6 +212,7 @@ def set_mlu(self):
 
 
 class TestDropoutAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace(), paddle.device.MLUPlace(0)]
@@ -217,36 +220,43 @@ def setUp(self):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[40, 40], dtype="float32")
-            res1 = paddle.nn.functional.dropout(
-                x=input, p=0., training=False, mode='upscale_in_train')
-            res2 = paddle.nn.functional.dropout(
-                x=input, p=0., axis=0, training=True, mode='upscale_in_train')
-            res3 = paddle.nn.functional.dropout(
-                x=input, p=0., axis=0, training=False, mode='upscale_in_train')
-            res4 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=True,
-                mode='upscale_in_train')
-            res5 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=False,
-                mode='upscale_in_train')
-            res6 = paddle.nn.functional.dropout(
-                x=input, p=1., training=True, mode='upscale_in_train')
+            res1 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                training=False,
+                                                mode='upscale_in_train')
+            res2 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=True,
+                                                mode='upscale_in_train')
+            res3 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=False,
+                                                mode='upscale_in_train')
+            res4 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=True,
+                                                mode='upscale_in_train')
+            res5 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=False,
+                                                mode='upscale_in_train')
+            res6 = paddle.nn.functional.dropout(x=input,
+                                                p=1.,
+                                                training=True,
+                                                mode='upscale_in_train')
             res7 = paddle.fluid.layers.dropout(
                 x=input,
                 dropout_prob=0.,
                 dropout_implementation='upscale_in_train')
-            res8 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=(0, 1),
-                training=False,
-                mode='upscale_in_train')
+            res8 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=(0, 1),
+                                                training=False,
+                                                mode='upscale_in_train')
 
             in_np = np.random.random([40, 40]).astype("float32")
             res_np = in_np
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_add_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_add_op_mlu.py
index 3dc711c7d75e1..2a0d74d5000cb 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_add_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_add_op_mlu.py
@@ -18,6 +18,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append('..')
 from op_test import OpTest, skip_check_grad_ci
 import paddle.fluid as fluid
@@ -27,6 +28,7 @@
 
 
 class TestElementwiseAddOp(OpTest):
+
     def set_mlu(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -51,26 +53,25 @@ def test_check_output(self):
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', max_relative_error=0.01)
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   max_relative_error=0.01)
 
     def test_check_grad_ingore_x(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            max_relative_error=0.01)
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   max_relative_error=0.01)
 
     def test_check_grad_ingore_y(self):
         if self.dtype == np.float16:
             return
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            max_relative_error=0.01)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set('Y'),
+                                   max_relative_error=0.01)
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -85,6 +86,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -95,6 +97,7 @@ def test_check_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -104,6 +107,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -113,6 +117,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
 class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1, 1).astype(self.dtype)
@@ -122,6 +127,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
 class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1, 1).astype(self.dtype)
@@ -129,6 +135,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -136,6 +143,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -143,6 +151,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -153,6 +162,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -163,6 +173,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -173,6 +184,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -183,6 +195,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -190,6 +203,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -197,6 +211,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 1).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -207,6 +222,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -217,6 +233,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype)
         self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -227,6 +244,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype)
         self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -237,6 +255,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 12).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12).astype(self.dtype)
@@ -244,6 +263,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_5(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 12).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12).astype(self.dtype)
@@ -251,6 +271,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
         self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
@@ -258,6 +279,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
         self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
@@ -265,6 +287,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_6(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
         self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
@@ -272,6 +295,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -282,6 +306,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -294,6 +319,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 1).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -306,6 +332,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 1).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -316,6 +343,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -326,6 +354,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -336,6 +365,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -346,6 +376,7 @@ def init_axis(self):
 
 
 class TestElementwiseFP16AddOp_commonuse_add1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -356,6 +387,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
@@ -366,6 +398,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 12).astype(self.dtype)
         self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype)
@@ -376,6 +409,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_same_shape_ysize_large(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 1, 12).astype(self.dtype)
         self.y = np.random.rand(10, 2, 12).astype(self.dtype)
@@ -386,13 +420,14 @@ def init_axis(self):
 
 
 class TestElementwiseAddOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of elementwise_add must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.MLUPlace(0))
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.MLUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.MLUPlace(0))
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.MLUPlace(0))
             self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
 
             # the input dtype of elementwise_add must be float16 or float32
@@ -402,6 +437,7 @@ def test_errors(self):
 
 
 class TestAddApi(unittest.TestCase):
+
     def _executed_api(self, x, y, name=None):
         return paddle.add(x, y, name)
 
@@ -445,11 +481,13 @@ def test_dygraph(self):
 
 
 class TestAddInplaceApi(TestAddApi):
+
     def _executed_api(self, x, y, name=None):
         return x.add_(y, name)
 
 
 class TestAddInplaceBroadcastSuccess(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 4).astype('float32')
         self.y_numpy = np.random.rand(3, 4).astype('float32')
@@ -466,18 +504,21 @@ def test_broadcast_success(self):
 
 
 class TestAddInplaceBroadcastSuccess2(TestAddInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(1, 2, 3, 1).astype('float32')
         self.y_numpy = np.random.rand(3, 1).astype('float32')
 
 
 class TestAddInplaceBroadcastSuccess3(TestAddInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 1, 5).astype('float32')
         self.y_numpy = np.random.rand(1, 3, 1, 5).astype('float32')
 
 
 class TestAddInplaceBroadcastError(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(3, 4).astype('float32')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float32')
@@ -496,18 +537,21 @@ def broadcast_shape_error():
 
 
 class TestAddInplaceBroadcastError2(TestAddInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 1, 4).astype('float32')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float32')
 
 
 class TestAddInplaceBroadcastError3(TestAddInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(5, 2, 1, 4).astype('float32')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float32')
 
 
 class TestBoolAddFloatElementwiseAddop(unittest.TestCase):
+
     def test_static_add(self):
         paddle.enable_static()
         a = 1.5
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_div_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_div_op_mlu.py
index 8fdac75c4c1a8..c3eadc341f358 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_div_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_div_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestElementwiseDiv(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -56,25 +58,25 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', max_relative_error=0.05)
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   max_relative_error=0.05)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            max_relative_error=0.05,
-            no_grad_set=set("X"))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   max_relative_error=0.05,
+                                   no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            max_relative_error=0.05,
-            no_grad_set=set("Y"))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.05,
+                                   no_grad_set=set("Y"))
 
 
 class TestElementwiseDivFp16(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -107,6 +109,7 @@ def test_check_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestTestElementwiseDiv_scalar(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -118,6 +121,7 @@ def setUp(self):
 
 
 class TestTestElementwiseDiv_Vector(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -129,6 +133,7 @@ def setUp(self):
 
 
 class TestTestElementwiseDiv_broadcast_0(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -139,12 +144,13 @@ def setUp(self):
 
         self.attrs = {'axis': 0}
         self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
+            'Out': np.divide(self.inputs['X'],
+                             self.inputs['Y'].reshape(100, 1, 1))
         }
 
 
 class TestTestElementwiseDiv_broadcast_1(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -155,12 +161,13 @@ def setUp(self):
 
         self.attrs = {'axis': 1}
         self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1))
+            'Out': np.divide(self.inputs['X'],
+                             self.inputs['Y'].reshape(1, 100, 1))
         }
 
 
 class TestTestElementwiseDiv_broadcast_2(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -170,12 +177,13 @@ def setUp(self):
         }
 
         self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100))
+            'Out': np.divide(self.inputs['X'],
+                             self.inputs['Y'].reshape(1, 1, 100))
         }
 
 
 class TestTestElementwiseDiv_broadcast_3(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -192,6 +200,7 @@ def setUp(self):
 
 
 class TestTestElementwiseDiv_broadcast_4(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -203,6 +212,7 @@ def setUp(self):
 
 
 class TestTestElementwiseDiv_broadcast_5(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -214,6 +224,7 @@ def setUp(self):
 
 
 class TestTestElementwiseDiv_commonuse_1(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -225,6 +236,7 @@ def setUp(self):
 
 
 class TestTestElementwiseDiv_commonuse_2(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
@@ -236,6 +248,7 @@ def setUp(self):
 
 
 class TestTestElementwiseDiv_xsize_lessthan_ysize(TestElementwiseDiv):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_div"
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py
index bc8a08c39ffc8..dd7be15b812d4 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py
@@ -24,6 +24,7 @@
 from paddle.fluid.op import Operator
 
 import sys
+
 sys.path.append('..')
 from op_test import OpTest, skip_check_grad_ci
 
@@ -31,6 +32,7 @@
 
 
 class ElementwiseMulOp(OpTest):
+
     def init_kernel_type(self):
         self.__class__.use_mlu = True
         self.place = paddle.device.MLUPlace(0)
@@ -58,12 +60,14 @@ def test_check_grad_normal(self):
         self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'], 'Out', no_grad_set=set("X"))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', no_grad_set=set('Y'))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set('Y'))
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -80,6 +84,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMulOp_scalar(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -91,6 +96,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_Vector(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -102,6 +108,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -112,6 +119,7 @@ def init_axis(self):
 
 
 class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -127,6 +135,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -141,6 +150,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -156,6 +166,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -167,6 +178,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -178,11 +190,13 @@ def setUp(self):
 
 
 class TestElementwiseMulOpFp16(ElementwiseMulOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -194,6 +208,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -205,6 +220,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -221,13 +237,14 @@ def setUp(self):
 
 
 class TestElementwiseMulOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of elementwise_mul must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1, y1)
 
             # the input dtype of elementwise_mul must be float16 or float32 or int32
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py
index 9ca5359e05ff7..a406317a96a12 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestElementwiseSubOp(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -64,23 +66,22 @@ def test_check_grad_normal(self):
         self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set("X"))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   max_relative_error=0.005,
+                                   no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set('Y'))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.005,
+                                   no_grad_set=set('Y'))
 
 
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseSubOp_scalar(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -92,6 +93,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_Vector(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -103,6 +105,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_0(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -117,6 +120,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -131,6 +135,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_2(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -144,6 +149,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -158,6 +164,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_4(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -169,6 +176,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_commonuse_1(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -180,6 +188,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_commonuse_2(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
@@ -191,6 +200,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseSubOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "elementwise_sub"
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_any_like_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_any_like_op_mlu.py
index 065c8072d4ce8..4847a6a42d940 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_fill_any_like_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_any_like_op_mlu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestFillAnyLikeOp(OpTest):
+
     def setUp(self):
         self.init_dtype()
         self.set_mlu()
@@ -54,21 +56,25 @@ def test_check_output(self):
 
 
 class TestFillAnyLikeOp2(TestFillAnyLikeOp):
+
     def set_value(self):
         self.value = -0.0
 
 
 class TestFillAnyLikeOp3(TestFillAnyLikeOp):
+
     def set_value(self):
         self.value = 1.0
 
 
 class TestFillAnyLikeOp4(TestFillAnyLikeOp):
+
     def set_value(self):
         self.value = 1e-9
 
 
 class TestFillAnyLikeOp5(TestFillAnyLikeOp):
+
     def set_value(self):
         if self.dtype == "float16":
             self.value = 0.05
@@ -77,6 +83,7 @@ def set_value(self):
 
 
 class TestFillAnyLikeOpInt32(TestFillAnyLikeOp):
+
     def init_dtype(self):
         self.dtype = np.int32
 
@@ -85,6 +92,7 @@ def set_value(self):
 
 
 class TestFillAnyLikeOpInt64(TestFillAnyLikeOp):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -93,6 +101,7 @@ def set_value(self):
 
 
 class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp):
+
     def init_dtype(self):
         self.dtype = np.float32
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py
index a43b7d0164d7b..604dbf4ddbcce 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest, convert_float_to_uint16
 
@@ -32,6 +33,7 @@
 
 # Situation 1: Attr(shape) is a list(without tensor)
 class TestFillConstantOp1(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -49,6 +51,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp2(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with default value
         '''
@@ -66,6 +69,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp3(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified int64 value
         '''
@@ -83,6 +87,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp4(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified int value
         '''
@@ -100,14 +105,17 @@ def test_check_output(self):
 
 
 class TestFillConstantOpWithSelectedRows(unittest.TestCase):
+
     def check_with_place(self, place):
         scope = core.Scope()
         # create Out Variable
         out = scope.var('Out').get_selected_rows()
 
         # create and run fill_constant_op operator
-        fill_constant_op = Operator(
-            "fill_constant", shape=[123, 92], value=3.8, Out='Out')
+        fill_constant_op = Operator("fill_constant",
+                                    shape=[123, 92],
+                                    value=3.8,
+                                    Out='Out')
         fill_constant_op.run(scope, place)
 
         # get result from Out
@@ -127,6 +135,7 @@ def test_fill_constant_with_selected_rows(self):
 
 # Situation 2: Attr(shape) is a list(with tensor)
 class TestFillConstantOp1_ShapeTensorList(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -154,6 +163,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp2_ShapeTensorList(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with default value
         '''
@@ -180,6 +190,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp3_ShapeTensorList(TestFillConstantOp1_ShapeTensorList):
+
     def init_data(self):
         self.shape = [123, 92]
         self.infer_shape = [123, -1]
@@ -187,6 +198,7 @@ def init_data(self):
 
 
 class TestFillConstantOp4_ShapeTensorList(TestFillConstantOp1_ShapeTensorList):
+
     def init_data(self):
         self.shape = [123, 92]
         self.infer_shape = [123, -1]
@@ -195,6 +207,7 @@ def init_data(self):
 
 # Situation 3: shape is a tensor
 class TestFillConstantOp1_ShapeTensor(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -218,6 +231,7 @@ def test_check_output(self):
 
 # Situation 4: value is a tensor
 class TestFillConstantOp1_ValueTensor(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -246,6 +260,7 @@ def test_check_output(self):
 
 # Situation 5: value is a tensor
 class TestFillConstantOp2_ValueTensor(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -273,43 +288,56 @@ def test_check_output(self):
 
 # Test python API
 class TestFillConstantAPI(unittest.TestCase):
+
     def test_api(self):
 
         positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2)
         positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2)
 
-        shape_tensor_int32 = fluid.data(
-            name="shape_tensor_int32", shape=[2], dtype="int32")
-        shape_tensor_int64 = fluid.data(
-            name="shape_tensor_int64", shape=[2], dtype="int64")
-
-        out_1 = fluid.layers.fill_constant(
-            shape=[1, 2], dtype="float32", value=1.1)
-
-        out_2 = fluid.layers.fill_constant(
-            shape=[1, positive_2_int32], dtype="float32", value=1.1)
-
-        out_3 = fluid.layers.fill_constant(
-            shape=[1, positive_2_int64], dtype="float32", value=1.1)
-
-        out_4 = fluid.layers.fill_constant(
-            shape=shape_tensor_int32, dtype="float32", value=1.1)
-
-        out_5 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype="float32", value=1.1)
-
-        out_6 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=1.1)
-
-        val1 = fluid.layers.fill_constant(
-            shape=[1], dtype=np.float32, value=1.1)
-        val2 = fluid.layers.fill_constant(
-            shape=[1], dtype=np.float64, value=1.1)
-        out_7 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=val1)
-
-        out_8 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=val2)
+        shape_tensor_int32 = fluid.data(name="shape_tensor_int32",
+                                        shape=[2],
+                                        dtype="int32")
+        shape_tensor_int64 = fluid.data(name="shape_tensor_int64",
+                                        shape=[2],
+                                        dtype="int64")
+
+        out_1 = fluid.layers.fill_constant(shape=[1, 2],
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_2 = fluid.layers.fill_constant(shape=[1, positive_2_int32],
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_3 = fluid.layers.fill_constant(shape=[1, positive_2_int64],
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_4 = fluid.layers.fill_constant(shape=shape_tensor_int32,
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_5 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_6 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype=np.float32,
+                                           value=1.1)
+
+        val1 = fluid.layers.fill_constant(shape=[1],
+                                          dtype=np.float32,
+                                          value=1.1)
+        val2 = fluid.layers.fill_constant(shape=[1],
+                                          dtype=np.float64,
+                                          value=1.1)
+        out_7 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype=np.float32,
+                                           value=val1)
+
+        out_8 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype=np.float32,
+                                           value=val2)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run(
@@ -318,9 +346,7 @@ def test_api(self):
                 "shape_tensor_int32": np.array([1, 2]).astype("int32"),
                 "shape_tensor_int64": np.array([1, 2]).astype("int64"),
             },
-            fetch_list=[
-                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
-            ])
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8])
 
         assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32"))
@@ -333,6 +359,7 @@ def test_api(self):
 
 
 class TestFillConstantImperative(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             data1 = np.array([1, 2]).astype('int32')
@@ -341,26 +368,26 @@ def test_api(self):
             shape = fluid.dygraph.to_variable(data1)
             val = fluid.dygraph.to_variable(data2)
             value = fluid.dygraph.to_variable(data3)
-            res1 = fluid.layers.fill_constant(
-                shape=[1, 2], dtype='float32', value=1.1)
-            res2 = fluid.layers.fill_constant(
-                shape=shape, dtype='float32', value=1.1)
-            res3 = fluid.layers.fill_constant(
-                shape=shape, dtype='float32', value=val)
-            res4 = fluid.layers.fill_constant(
-                shape=shape, dtype='int32', value=value)
-            assert np.array_equal(
-                res1.numpy(), np.full(
-                    [1, 2], 1.1, dtype="float32"))
-            assert np.array_equal(
-                res2.numpy(), np.full(
-                    [1, 2], 1.1, dtype="float32"))
-            assert np.array_equal(
-                res3.numpy(), np.full(
-                    [1, 2], 1.1, dtype="float32"))
-            assert np.array_equal(
-                res4.numpy(), np.full(
-                    [1, 2], 88, dtype="int32"))
+            res1 = fluid.layers.fill_constant(shape=[1, 2],
+                                              dtype='float32',
+                                              value=1.1)
+            res2 = fluid.layers.fill_constant(shape=shape,
+                                              dtype='float32',
+                                              value=1.1)
+            res3 = fluid.layers.fill_constant(shape=shape,
+                                              dtype='float32',
+                                              value=val)
+            res4 = fluid.layers.fill_constant(shape=shape,
+                                              dtype='int32',
+                                              value=value)
+            assert np.array_equal(res1.numpy(),
+                                  np.full([1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(res2.numpy(),
+                                  np.full([1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(res3.numpy(),
+                                  np.full([1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(res4.numpy(),
+                                  np.full([1, 2], 88, dtype="int32"))
 
     def test_nan(self):
         with fluid.dygraph.guard():
@@ -380,45 +407,42 @@ def test_ninf(self):
 
 
 class TestFillConstantOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             #for ci coverage
             x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16")
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[1],
-                value=5,
-                dtype='uint4')
-
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[1.1],
-                value=5,
-                dtype='float32',
-                out=x1)
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[1],
+                              value=5,
+                              dtype='uint4')
+
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[1.1],
+                              value=5,
+                              dtype='float32',
+                              out=x1)
 
             # The argument dtype of fill_constant_op must be one of bool, float16,
             #float32, float64, uint8, int16, int32 or int64
             x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32")
 
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[1],
-                value=5,
-                dtype='float64',
-                out=x2)
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[1],
+                              value=5,
+                              dtype='float64',
+                              out=x2)
 
             x3 = np.random.randn(100, 100).astype('int32')
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[100, 100],
-                value=5,
-                dtype='float64',
-                out=x3)
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[100, 100],
+                              value=5,
+                              dtype='float64',
+                              out=x3)
 
             # The argument shape's type of fill_constant_op must be list, tuple or Variable.
             def test_shape_type():
@@ -434,18 +458,22 @@ def test_shape_size():
 
             # The shape dtype of fill_constant_op must be int32 or int64.
             def test_shape_tensor_dtype():
-                shape = fluid.data(
-                    name="shape_tensor", shape=[2], dtype="float32")
-                fluid.layers.fill_constant(
-                    shape=shape, dtype="float32", value=1)
+                shape = fluid.data(name="shape_tensor",
+                                   shape=[2],
+                                   dtype="float32")
+                fluid.layers.fill_constant(shape=shape,
+                                           dtype="float32",
+                                           value=1)
 
             self.assertRaises(TypeError, test_shape_tensor_dtype)
 
             def test_shape_tensor_list_dtype():
-                shape = fluid.data(
-                    name="shape_tensor_list", shape=[1], dtype="bool")
-                fluid.layers.fill_constant(
-                    shape=[shape, 2], dtype="float32", value=1)
+                shape = fluid.data(name="shape_tensor_list",
+                                   shape=[1],
+                                   dtype="bool")
+                fluid.layers.fill_constant(shape=[shape, 2],
+                                           dtype="float32",
+                                           value=1)
 
             self.assertRaises(TypeError, test_shape_tensor_list_dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_flatten2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_flatten2_op_mlu.py
index b5f79a92b190f..df0d6f23a23ec 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_flatten2_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_flatten2_op_mlu.py
@@ -19,12 +19,15 @@
 import paddle.fluid as fluid
 import paddle
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
+
 paddle.enable_static()
 
 
 class TestFlattenOp(OpTest):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -53,6 +56,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.axis = 0
@@ -60,6 +64,7 @@ def init_test_case(self):
 
 
 class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (10, 2, 2, 3)
         self.new_shape = (10, 12)
@@ -69,6 +74,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
@@ -76,6 +82,7 @@ def init_test_case(self):
 
 
 class TestStaticFlattenInferShapePythonAPI(unittest.TestCase):
+
     def execute_api(self, x, axis=1):
         return fluid.layers.flatten(x, axis=axis)
 
@@ -83,13 +90,15 @@ def test_static_api(self):
         paddle.enable_static()
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=[-1, 3, -1, -1], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[-1, 3, -1, -1],
+                                   dtype='float32')
             out = self.execute_api(x, axis=2)
         self.assertTrue((-1, -1) == out.shape)
 
 
 class TestFlatten2OpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input_data = np.random.random((3, 2, 4, 5)).astype("float64")
@@ -102,8 +111,9 @@ def test_Variable():
 
         def test_type():
             # dtype must be float32, float64, int8, int32, int64, uint8.
-            x2 = fluid.layers.data(
-                name='x2', shape=[3, 2, 4, 5], dtype='float16')
+            x2 = fluid.layers.data(name='x2',
+                                   shape=[3, 2, 4, 5],
+                                   dtype='float16')
             fluid.layers.flatten(x2, axis=1)
 
         self.assertRaises(TypeError, test_type)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_flatten_contigous_range_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_flatten_contigous_range_op_mlu.py
index 8b14494ea0947..1474ec35637d6 100755
--- a/python/paddle/fluid/tests/unittests/mlu/test_flatten_contigous_range_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_flatten_contigous_range_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestFlattenOp(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "flatten_contiguous_range"
@@ -66,6 +68,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 1
@@ -80,6 +83,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_2(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -94,6 +98,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_3(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -108,6 +113,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_4(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = -2
@@ -122,6 +128,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_5(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 2
@@ -136,6 +143,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.start_axis = 3
@@ -150,6 +158,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_Float32(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -165,6 +174,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_int32(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -183,6 +193,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_uint8(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -201,6 +212,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_int8(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -219,6 +231,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_int64(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -237,6 +250,7 @@ def test_check_grad(self):
 
 
 class TestFlatten2OpError(unittest.TestCase):
+
     def test_errors(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
@@ -244,22 +258,25 @@ def test_errors(self):
         x = x.astype('float32')
 
         def test_ValueError1():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             out = paddle.flatten(x_var, start_axis=2, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError1)
 
         def test_ValueError2():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=10, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError2)
 
         def test_ValueError3():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=2, stop_axis=10)
 
         self.assertRaises(ValueError, test_ValueError3)
@@ -269,8 +286,9 @@ def test_type():
             x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
                            image_shape[3]).reshape(image_shape) / 100.
             x2 = x2.astype('float16')
-            x2_var = paddle.fluid.data(
-                name='x2', shape=[3, 2, 4, 5], dtype='float16')
+            x2_var = paddle.fluid.data(name='x2',
+                                       shape=[3, 2, 4, 5],
+                                       dtype='float16')
             paddle.flatten(x2_var)
 
         self.assertRaises(TypeError, test_type)
@@ -282,6 +300,7 @@ def test_InputError():
 
 
 class TestStaticFlattenPythonAPI(unittest.TestCase):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return paddle.flatten(x, start_axis, stop_axis)
 
@@ -291,8 +310,9 @@ def test_static_api(self):
 
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=[2, 3, 4, 4], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[2, 3, 4, 4],
+                                   dtype='float32')
             out = self.execute_api(x, start_axis=-2, stop_axis=-1)
 
         exe = paddle.static.Executor(place=paddle.MLUPlace(0))
@@ -301,11 +321,13 @@ def test_static_api(self):
 
 
 class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return x.flatten_(start_axis, stop_axis)
 
 
 class TestFlattenPython(unittest.TestCase):
+
     def test_python_api(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_flatten_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_flatten_op_mlu.py
index a5503de7cca24..5a884cc89cccf 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_flatten_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_flatten_op_mlu.py
@@ -17,15 +17,18 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 
 import paddle
 import paddle.fluid as fluid
+
 paddle.enable_static()
 
 
 class TestFlattenOp(OpTest):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -51,6 +54,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 2, 10)
         self.axis = 0
@@ -58,6 +62,7 @@ def init_test_case(self):
 
 
 class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (10, 2, 2, 3)
         self.new_shape = (10, 12)
@@ -67,6 +72,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py
index f0aff986fa1ff..6c6ddda303d4e 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest, convert_float_to_uint16
 import paddle
@@ -35,6 +36,7 @@ def gather_numpy(x, index, axis):
 
 
 class TestGatherOp(OpTest):
+
     def setUp(self):
         self.op_type = "gather"
         self.place = paddle.MLUPlace(0)
@@ -65,6 +67,7 @@ def config(self):
 
 
 class TestCase1(TestGatherOp):
+
     def config(self):
         """
         For one dimension input
@@ -76,6 +79,7 @@ def config(self):
 
 
 class TestCase2(TestGatherOp):
+
     def config(self):
         """
         For int64_t index type
@@ -87,6 +91,7 @@ def config(self):
 
 
 class API_TestDygraphGather(unittest.TestCase):
+
     def test_out1(self):
         paddle.disable_static()
         input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype('int32')
@@ -124,6 +129,7 @@ def test_zero_index(self):
 
 
 class TestGathertError(unittest.TestCase):
+
     def test_error1(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -132,8 +138,9 @@ def test_error1(self):
             x = paddle.fluid.data(shape=shape, dtype='int8', name='x')
             axis = paddle.fluid.data(shape=[1], dtype='float32', name='axis')
             index = paddle.fluid.data(shape=shape, dtype='int32', name='index')
-            index_float = paddle.fluid.data(
-                shape=shape, dtype='float32', name='index_float')
+            index_float = paddle.fluid.data(shape=shape,
+                                            dtype='float32',
+                                            name='index_float')
 
             def test_x_type():
                 paddle.gather(x, index)
@@ -161,8 +168,9 @@ def test_error2(self):
             shape = [8, 9, 6]
             x = fluid.data(shape=shape, dtype='int8', name='x')
             index = fluid.data(shape=shape, dtype='int32', name='mask')
-            index_float = fluid.data(
-                shape=shape, dtype='float32', name='index_float')
+            index_float = fluid.data(shape=shape,
+                                     dtype='float32',
+                                     name='index_float')
 
             def test_x_type():
                 paddle.fluid.layers.gather(x, index)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py
index 6f64196a586dd..9f755de687234 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py
@@ -22,6 +22,7 @@
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle
@@ -30,6 +31,7 @@
 
 
 class TestGaussianRandomOp(OpTest):
+
     def setUp(self):
         self.op_type = "gaussian_random"
         self.place = paddle.device.MLUPlace(0)
@@ -63,13 +65,12 @@ def verify_output(self, outs):
         hist2, _ = np.histogram(data, range=(-3, 5))
         hist2 = hist2.astype("float32")
         hist2 /= float(outs[0].size)
-        self.assertTrue(
-            np.allclose(
-                hist, hist2, rtol=0, atol=0.01),
-            "hist: " + str(hist) + " hist2: " + str(hist2))
+        self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01),
+                        "hist: " + str(hist) + " hist2: " + str(hist2))
 
 
 class TestMeanStdAreInt(TestGaussianRandomOp):
+
     def set_attrs(self):
         self.mean = 1
         self.std = 2
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py
index c62d30d43c089..2cf89789bfc8b 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py
@@ -18,6 +18,7 @@
 from scipy import special
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -33,6 +34,7 @@ def np_gelu(x):
 
 
 class TestGelu(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "gelu"
@@ -57,11 +59,13 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-3)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', max_relative_error=0.007)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.007)
 
 
 class TestGeluFp16(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "gelu"
@@ -88,6 +92,7 @@ def test_check_output(self):
 
 
 class TestGeluNet(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -102,8 +107,9 @@ def _test(self, run_mlu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
 
@@ -127,12 +133,13 @@ def _test(self, run_mlu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py
index 8b32692020cbf..5df59be28a87b 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py
@@ -23,6 +23,7 @@
 import paddle.nn.functional as F
 from functools import reduce
 import sys
+
 sys.path.append('..')
 from op_test import _set_use_system_allocator
 from paddle.fluid import Program, program_guard
@@ -37,6 +38,7 @@
 
 
 class TestLayerNormOp(unittest.TestCase):
+
     def setUp(self):
         self.use_cudnn = True
         self.place = paddle.device.MLUPlace(0)
@@ -52,6 +54,7 @@ def check_forward_backward(self,
                                has_bias=True,
                                y_grad_scale=1.0,
                                use_mkldnn=False):
+
         def test_with_place(place,
                             shape,
                             begin_norm_axis,
@@ -68,8 +71,8 @@ def test_with_place(place,
                 np.float32) if has_scale else None
             bias = np.random.random_sample(scale_shape).astype(
                 np.float32) if has_bias else None
-            y_grad = (np.random.random_sample(x_shape) *
-                      y_grad_scale).astype(np.float32)
+            y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype(
+                np.float32)
 
             # reference forward & backward
             y, mean, variance = _reference_layer_norm_naive(
@@ -90,10 +93,9 @@ def test_with_place(place,
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
                 inputs = {"X": block.var('x')}
                 fetch_list = [
                     'y',
@@ -163,83 +165,79 @@ def test_with_place(place,
     def test_check_forward_backward_with_scale_and_bias(self):
         self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=True)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=True,
-            has_bias=False)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=False)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=False,
+                                    has_bias=True)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=True,
+                                    has_bias=False)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=False,
+                                    has_bias=False)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
-        self.check_forward_backward(
-            shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 129],
+                                    begin_norm_axis=2,
+                                    y_grad_scale=0.1)
         self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2)
-        self.check_forward_backward(
-            shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=False,
-            has_bias=True,
-            y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=True,
-            has_bias=False,
-            y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=False,
-            has_bias=False,
-            y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    has_scale=False,
+                                    has_bias=True,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    has_scale=True,
+                                    has_bias=False,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    has_scale=False,
+                                    has_bias=False,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[512, 1024],
+                                    begin_norm_axis=1,
+                                    has_scale=True,
+                                    has_bias=True)
 
 
 class TestLayerNormAPI(unittest.TestCase):
+
     def test_case(self):
-        x = fluid.layers.data(
-            name='x',
-            shape=[64, 32, 256],
-            dtype='float32',
-            append_batch_size=False)
-        x = fluid.layers.layer_norm(
-            x,
-            scale=True,
-            shift=True,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr=None,
-            bias_attr=None)
-        x = fluid.layers.layer_norm(
-            x,
-            scale=False,
-            shift=False,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr=None,
-            bias_attr=None)
-        x = fluid.layers.layer_norm(
-            x,
-            scale=False,
-            shift=False,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr="scale",
-            bias_attr="shift")
+        x = fluid.layers.data(name='x',
+                              shape=[64, 32, 256],
+                              dtype='float32',
+                              append_batch_size=False)
+        x = fluid.layers.layer_norm(x,
+                                    scale=True,
+                                    shift=True,
+                                    begin_norm_axis=1,
+                                    epsilon=1e-05,
+                                    param_attr=None,
+                                    bias_attr=None)
+        x = fluid.layers.layer_norm(x,
+                                    scale=False,
+                                    shift=False,
+                                    begin_norm_axis=1,
+                                    epsilon=1e-05,
+                                    param_attr=None,
+                                    bias_attr=None)
+        x = fluid.layers.layer_norm(x,
+                                    scale=False,
+                                    shift=False,
+                                    begin_norm_axis=1,
+                                    epsilon=1e-05,
+                                    param_attr="scale",
+                                    bias_attr="shift")
 
 
 class TestDygraphLayerNormAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             paddle.enable_static()
@@ -255,6 +253,7 @@ def test_errors(self):
 
 
 class TestFP16ScaleBiasLayerNorm(unittest.TestCase):
+
     def check_main(self, x_np, weight_np, bias_np, dtype):
         paddle.disable_static()
 
@@ -297,6 +296,7 @@ def assert_equal(x, y):
 
 
 class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase):
+
     def test_main(self):
         self.assertTrue(_keep_layer_norm_scale_bias_to_fp32())
         _keep_layer_norm_scale_bias_to_fp32(False)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py
index ec2150fceb133..0aad79eb61f92 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from test_activation_op import ref_leaky_relu
@@ -28,6 +29,7 @@
 
 
 class TestLeadyRelu(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "leaky_relu"
@@ -63,28 +65,33 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'], 'Out', max_relative_error=0.006)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.006)
         else:
             self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
 class TestLeadyReluFP16(TestLeadyRelu):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestLeadyRelu2(TestLeadyRelu):
+
     def set_attrs(self):
         self.attrs = {'alpha': 0.5}
 
 
 class TestLeadyRelu3(TestLeadyRelu):
+
     def set_attrs(self):
         self.attrs = {'alpha': -0.5}
 
 
 class TestLeakyReluNet(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -97,8 +104,9 @@ def _test(self, run_mlu=True):
 
         with paddle.static.program_guard(main_prog, startup_prog):
             x = paddle.static.data(name="x", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             y = paddle.nn.functional.leaky_relu(x)
 
@@ -122,8 +130,10 @@ def _test(self, run_mlu=True):
         for epoch in range(100):
 
             pred_res, loss_res = exe.run(main_prog,
-                                         feed={"x": x_np,
-                                               "label": label_np},
+                                         feed={
+                                             "x": x_np,
+                                             "label": label_np
+                                         },
                                          fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py
index dea6391b8bae0..a1d594b93d01d 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py
@@ -41,6 +41,7 @@ def ref_log_softmax_grad(x, axis):
 
 
 class TestLogSoftmaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'log_softmax'
         self.set_mlu()
@@ -69,21 +70,24 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad])
+        self.check_grad_with_place(self.place, ['X'], ['Out'],
+                                   user_defined_grads=[self.x_grad])
 
 
 class TestLogSoftmaxShape(TestLogSoftmaxOp):
+
     def set_attrs(self):
         self.shape = [12, 10]
 
 
 class TestLogSoftmaxAxis(TestLogSoftmaxOp):
+
     def set_attrs(self):
         self.axis = 1
 
 
 class TestNNLogSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.set_mlu()
         self.x_shape = [2, 3, 4, 5]
@@ -118,6 +122,7 @@ def test_check_api(self):
 
 
 class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.set_mlu()
         self.x_shape = [2, 3, 4, 5]
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py
index f9a08ba4c9b14..17ef85dd2bd8a 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestLookupTableV2(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "lookup_table_v2"
@@ -36,8 +38,9 @@ def setUp(self):
         self.init_padding_idx()
         np.random.seed(SEED)
         w = np.random.random([self.vocab, self.dim]).astype(self.dtype)
-        x = np.random.randint(
-            0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype)
+        x = np.random.randint(0, self.vocab,
+                              size=(self.bsz,
+                                    self.seqlen)).astype(self.ids_dtype)
         out = w[x]
         if self.padding_idx != -1:
             out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim)
@@ -77,8 +80,9 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['W'], 'Out', max_relative_error=0.01)
+            self.check_grad_with_place(self.place, ['W'],
+                                       'Out',
+                                       max_relative_error=0.01)
         else:
             self.check_grad_with_place(self.place, ['W'], 'Out')
 
@@ -97,6 +101,7 @@ def set_mlu(self):
 
 
 class TestLookupTableV2Dim32(TestLookupTableV2):
+
     def init_dims(self):
         self.bsz = 6
         self.seqlen = 8
@@ -125,11 +130,13 @@ def set_mlu(self):
 
 
 class TestLookupTableV2WithPadding(TestLookupTableV2):
+
     def init_padding_idx(self):
         self.padding_idx = np.random.randint(0, self.vocab)
 
 
 class TestLookupTableV2WithPadding1(TestLookupTableV2):
+
     def init_padding_idx(self):
         self.padding_idx = np.random.randint(0, self.vocab)
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py
index adfff112e6be2..e8e69440ab40f 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -294,7 +295,9 @@ def config(self):
 
 #--------------------test matmul fp16--------------------
 def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5):
+
     class TestMatMulOpFp16Case(parent):
+
         def init_kernel_type(self):
             self.dtype = np.float16
 
@@ -302,10 +305,9 @@ def test_check_output(self):
             self.check_output_with_place(self.place, atol=atol)
 
         def test_check_grad(self):
-            self.check_grad_with_place(
-                self.place, ['X', 'Y'],
-                'Out',
-                max_relative_error=max_relative_error)
+            self.check_grad_with_place(self.place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=max_relative_error)
 
     cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
     TestMatMulOpFp16Case.__name__ = cls_name
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_matmul_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_matmul_v2_op_mlu.py
index 011769c29dbb3..85c73aa78ce0a 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_matmul_v2_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_matmul_v2_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -291,7 +292,9 @@ def config(self):
 
 
 def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5):
+
     class TestMatMulOpFp16Case(parent):
+
         def init_kernel_type(self):
             self.dtype = np.float16
 
@@ -299,10 +302,9 @@ def test_check_output(self):
             self.check_output_with_place(self.place, atol=atol)
 
         def test_check_grad(self):
-            self.check_grad_with_place(
-                self.place, ['X', 'Y'],
-                'Out',
-                max_relative_error=max_relative_error)
+            self.check_grad_with_place(self.place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=max_relative_error)
 
     cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
     TestMatMulOpFp16Case.__name__ = cls_name
@@ -329,6 +331,7 @@ def test_check_grad(self):
 
 
 class TestMatMulV2API(unittest.TestCase):
+
     def setUp(self):
         self.places = [paddle.CPUPlace()]
         if paddle.is_compiled_with_mlu():
@@ -346,8 +349,10 @@ def check_static_result(self, place):
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"input_x": x_np,
-                                    "input_y": y_np},
+                              feed={
+                                  "input_x": x_np,
+                                  "input_y": y_np
+                              },
                               fetch_list=[result])
 
     def test_static(self):
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py
index 36419327db6b0..2b296b2d7dc3f 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestMean(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.place = paddle.device.MLUPlace(0)
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestMeanFP16(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.place = paddle.MLUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py
index f3699da15b535..31eb98b7a8850 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import sys
+
 sys.path.append('..')
 import unittest
 import paddle
@@ -47,22 +48,21 @@ def run_momentum_op(params,
         }
 
         param_vars = [
-            helper.create_variable(
-                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+            helper.create_variable(persistable=True,
+                                   shape=p.shape,
+                                   dtype=p.dtype) for p in params
         ]
         grad_vars = [
-            helper.create_variable(
-                shape=g.shape, dtype=g.dtype) for g in grads
+            helper.create_variable(shape=g.shape, dtype=g.dtype) for g in grads
         ]
         velocity_vars = [
-            helper.create_variable(
-                persistable=True, shape=v.shape, dtype=v.dtype)
-            for v in velocitys
+            helper.create_variable(persistable=True,
+                                   shape=v.shape,
+                                   dtype=v.dtype) for v in velocitys
         ]
-        lr_var = helper.create_variable(
-            persistable=True,
-            shape=learning_rate.shape,
-            dtype=learning_rate.dtype)
+        lr_var = helper.create_variable(persistable=True,
+                                        shape=learning_rate.shape,
+                                        dtype=learning_rate.dtype)
 
         feed_dict = OrderedDict()
 
@@ -81,14 +81,15 @@ def run_momentum_op(params,
 
         if multi_precision:
             master_param_vars = [
-                helper.create_variable(
-                    persistable=True, shape=p.shape, dtype=p.dtype)
-                for p in master_params
+                helper.create_variable(persistable=True,
+                                       shape=p.shape,
+                                       dtype=p.dtype) for p in master_params
             ]
             feed_dict.update(
-                OrderedDict([(mp_var.name, mp_val)
-                             for mp_var, mp_val in zip(master_param_vars,
-                                                       master_params)]))
+                OrderedDict([
+                    (mp_var.name, mp_val)
+                    for mp_var, mp_val in zip(master_param_vars, master_params)
+                ]))
             # CPUPlace does not use MasterParam
             if isinstance(place, paddle.CUDAPlace):
                 fetch_list = fetch_list + [
@@ -110,8 +111,10 @@ def run_momentum_op(params,
                 if multi_precision:
                     inputs['MasterParam'] = master_param_vars[i]
                     outputs['MasterParamOut'] = master_param_vars[i]
-                helper.append_op(
-                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+                helper.append_op(type=op_type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs)
         else:
             inputs = {
                 'Param': param_vars,
@@ -123,8 +126,10 @@ def run_momentum_op(params,
             if multi_precision:
                 inputs['MasterParam'] = master_param_vars
                 outputs['MasterParamOut'] = master_param_vars
-            helper.append_op(
-                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+            helper.append_op(type=op_type,
+                             inputs=inputs,
+                             outputs=outputs,
+                             attrs=attrs)
 
     exe = paddle.static.Executor(place)
     with paddle.static.scope_guard(paddle.static.Scope()):
@@ -154,22 +159,21 @@ def run_momentum_op2(params,
         helper = LayerHelper(op_type, **locals())
 
         param_vars = [
-            helper.create_variable(
-                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+            helper.create_variable(persistable=True,
+                                   shape=p.shape,
+                                   dtype=p.dtype) for p in params
         ]
         grad_vars = [
-            helper.create_variable(
-                shape=g.shape, dtype=g.dtype) for g in grads
+            helper.create_variable(shape=g.shape, dtype=g.dtype) for g in grads
         ]
         velocity_vars = [
-            helper.create_variable(
-                persistable=True, shape=v.shape, dtype=v.dtype)
-            for v in velocitys
+            helper.create_variable(persistable=True,
+                                   shape=v.shape,
+                                   dtype=v.dtype) for v in velocitys
         ]
-        lr_var = helper.create_variable(
-            persistable=True,
-            shape=learning_rate.shape,
-            dtype=learning_rate.dtype)
+        lr_var = helper.create_variable(persistable=True,
+                                        shape=learning_rate.shape,
+                                        dtype=learning_rate.dtype)
 
         feed_dict = OrderedDict()
 
@@ -188,14 +192,15 @@ def run_momentum_op2(params,
 
         if multi_precision:
             master_param_vars = [
-                helper.create_variable(
-                    persistable=True, shape=p.shape, dtype=p.dtype)
-                for p in master_params
+                helper.create_variable(persistable=True,
+                                       shape=p.shape,
+                                       dtype=p.dtype) for p in master_params
             ]
             feed_dict.update(
-                OrderedDict([(mp_var.name, mp_val)
-                             for mp_var, mp_val in zip(master_param_vars,
-                                                       master_params)]))
+                OrderedDict([
+                    (mp_var.name, mp_val)
+                    for mp_var, mp_val in zip(master_param_vars, master_params)
+                ]))
             # CPUPlace does not use MasterParam
             if isinstance(place, paddle.CUDAPlace):
                 fetch_list = fetch_list + [
@@ -225,8 +230,10 @@ def run_momentum_op2(params,
                     'regularization_method': 'l2_decay',
                     'regularization_coeff': 2.0,
                 }
-                helper.append_op(
-                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+                helper.append_op(type=op_type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs)
         else:
             inputs = {
                 'Param': param_vars,
@@ -239,16 +246,22 @@ def run_momentum_op2(params,
                 inputs['MasterParam'] = master_param_vars
                 outputs['MasterParamOut'] = master_param_vars
             attrs = {
-                'mu': mu,
-                'multi_precision': multi_precision,
-                'rescale_grad': rescale_grad,
-                'use_nesterov': use_nesterov,
+                'mu':
+                mu,
+                'multi_precision':
+                multi_precision,
+                'rescale_grad':
+                rescale_grad,
+                'use_nesterov':
+                use_nesterov,
                 'regularization_method':
                 ['l2_decay' for i in range(len(param_vars))],
                 'regularization_coeff': [2.0 for i in range(len(param_vars))],
             }
-            helper.append_op(
-                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+            helper.append_op(type=op_type,
+                             inputs=inputs,
+                             outputs=outputs,
+                             attrs=attrs)
 
     exe = paddle.static.Executor(place)
     with paddle.static.scope_guard(paddle.static.Scope()):
@@ -257,6 +270,7 @@ def run_momentum_op2(params,
 
 
 class TestMergedMomentum(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
@@ -286,18 +300,17 @@ def check_with_place(self, place, multi_precision):
             self.shapes, multi_precision, self.seed, place)
 
         def run_op(use_merged):
-            # MLU Momentum Op does not support rescale_grad 
+            # MLU Momentum Op does not support rescale_grad
             rescale_grad = 1.0
-            return run_momentum_op(
-                params,
-                grads,
-                velocitys,
-                master_params,
-                learning_rate,
-                place,
-                multi_precision,
-                rescale_grad=rescale_grad,
-                use_merged=use_merged)
+            return run_momentum_op(params,
+                                   grads,
+                                   velocitys,
+                                   master_params,
+                                   learning_rate,
+                                   place,
+                                   multi_precision,
+                                   rescale_grad=rescale_grad,
+                                   use_merged=use_merged)
 
         outs1 = run_op(True)
         outs2 = run_op(False)
@@ -310,6 +323,7 @@ def test_main(self):
 
 
 class TestMergedMomentum2(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
@@ -339,19 +353,18 @@ def check_with_place(self, place, multi_precision):
             self.shapes, multi_precision, self.seed, place)
 
         def run_op(use_nesterov, use_merged):
-            # MLU Momentum Op does not support rescale_grad 
+            # MLU Momentum Op does not support rescale_grad
             rescale_grad = 1.0
-            return run_momentum_op2(
-                params,
-                grads,
-                velocitys,
-                master_params,
-                learning_rate,
-                place,
-                multi_precision,
-                rescale_grad=rescale_grad,
-                use_merged=use_merged,
-                use_nesterov=use_nesterov)
+            return run_momentum_op2(params,
+                                    grads,
+                                    velocitys,
+                                    master_params,
+                                    learning_rate,
+                                    place,
+                                    multi_precision,
+                                    rescale_grad=rescale_grad,
+                                    use_merged=use_merged,
+                                    use_nesterov=use_nesterov)
 
         outs1 = run_op(use_nesterov=True, use_merged=True)
         outs2 = run_op(use_nesterov=True, use_merged=False)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py
index a2cd69fee325a..abe16155d0362 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py
@@ -19,6 +19,7 @@
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle
@@ -30,6 +31,7 @@
 
 
 class TestMomentumOp1(OpTest):
+
     def setUp(self):
         self.op_type = "momentum"
         self.dtype = np.float32
@@ -74,6 +76,7 @@ def test_check_output(self):
 
 
 class TestMomentumOpFp16(TestMomentumOp1):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -121,14 +124,16 @@ def test_check_output(self):
 
 
 class TestMomentumV2(unittest.TestCase):
+
     def test_momentum_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9, parameters=linear.parameters())
+        adam = paddle.optimizer.Momentum(learning_rate=0.01,
+                                         momentum=0.9,
+                                         parameters=linear.parameters())
         out = linear(a)
         out.backward()
         adam.step()
@@ -145,13 +150,13 @@ def test_momentum(self):
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
             avg_cost = fluid.layers.mean(cost)
 
-            rms_optimizer = paddle.optimizer.Momentum(
-                learning_rate=0.1, momentum=0.9)
+            rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1,
+                                                      momentum=0.9)
             rms_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -159,12 +164,14 @@ def test_momentum(self):
                 exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
 
     def test_raise_error(self):
-        self.assertRaises(
-            ValueError, paddle.optimizer.Momentum, learning_rate=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.Momentum,
+                          learning_rate=None)
         self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
 
 
 class TestMomentumOpWithDecay(OpTest):
+
     def setUp(self):
         self.op_type = "momentum"
         self.place = paddle.device.MLUPlace(0)
@@ -219,6 +226,7 @@ def test_check_output(self):
 
 
 class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
+
     def init_config(self):
         self.dtype = np.float16
 
@@ -227,11 +235,13 @@ def test_check_output(self):
 
 
 class TestMomentumOpWithDecay2(TestMomentumOpWithDecay):
+
     def init_config(self):
         self.use_nesterov = False
 
 
 class TestMomentumOpWithDecayAPI(unittest.TestCase):
+
     def _test_momentum_dygraph_common(self, regularization):
         paddle.disable_static()
         inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
@@ -268,8 +278,8 @@ def test_momentum_static(self):
             momentum_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -278,14 +288,17 @@ def test_momentum_static(self):
 
 
 class TestFusedMomentumWithDecayAPI(unittest.TestCase):
+
     def get_program(self, weight_attr, bias_attr=False):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
-        with paddle.static.program_guard(
-                main_program=main_program, startup_program=startup_program):
+        with paddle.static.program_guard(main_program=main_program,
+                                         startup_program=startup_program):
             x = paddle.static.data(name='x', shape=[10, 10])
-            linear = paddle.nn.Linear(
-                10, 10, weight_attr=weight_attr, bias_attr=bias_attr)
+            linear = paddle.nn.Linear(10,
+                                      10,
+                                      weight_attr=weight_attr,
+                                      bias_attr=bias_attr)
             out = linear(x)
             loss = paddle.mean(out)
             optimizer = paddle.optimizer.Momentum(
@@ -349,10 +362,11 @@ def test_param_has_no_regularizer(self):
 
 
 class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
+
     def __update_params(self, momentum, linear):
         for i in range(10):
-            inp = paddle.full(
-                shape=[2, 2], fill_value=i, dtype='float32').astype("float32")
+            inp = paddle.full(shape=[2, 2], fill_value=i,
+                              dtype='float32').astype("float32")
             inp = paddle.to_tensor(inp)
             out = linear(inp)
             loss = paddle.mean(out)
@@ -401,6 +415,7 @@ def test_vs(self, place=fluid.MLUPlace(0)):
 
 
 class TestMomentumV2Group(TestMomentumV2):
+
     def test_momentum_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -408,18 +423,22 @@ def test_momentum_dygraph(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Momentum(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001,
-                'learning_rate': 0.1,
-                'momentum': 0.99
-            }],
-            weight_decay=0.1,
-            momentum=0.9)
+        adam = paddle.optimizer.Momentum(learning_rate=0.01,
+                                         parameters=[{
+                                             'params':
+                                             linear_1.parameters()
+                                         }, {
+                                             'params':
+                                             linear_2.parameters(),
+                                             'weight_decay':
+                                             0.001,
+                                             'learning_rate':
+                                             0.1,
+                                             'momentum':
+                                             0.99
+                                         }],
+                                         weight_decay=0.1,
+                                         momentum=0.9)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
@@ -428,6 +447,7 @@ def test_momentum_dygraph(self):
 
 
 class TestMultiTensorMomentumDygraph(unittest.TestCase):
+
     def _momentum_optimize_dygraph(self,
                                    place,
                                    use_param_attr=False,
@@ -494,8 +514,7 @@ def _check_with_place_amp(self, place, use_amp):
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def _check_with_param_arrt(self, place, use_amp):
         output1, params1 = self._momentum_optimize_dygraph(
@@ -511,8 +530,7 @@ def _check_with_param_arrt(self, place, use_amp):
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def _check_with_param_group(self, place, use_amp):
         output1, params1 = self._momentum_optimize_dygraph(
@@ -528,8 +546,7 @@ def _check_with_param_group(self, place, use_amp):
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def test_main(self):
         for place in self._get_places():
@@ -542,6 +559,7 @@ def test_main(self):
 
 
 class TestMultiTensorMomentumStatic(unittest.TestCase):
+
     def _momentum_optimize_static(self,
                                   place,
                                   use_amp=False,
@@ -554,8 +572,8 @@ def _momentum_optimize_static(self,
         exe = paddle.static.Executor(place=paddle.device.MLUPlace(0))
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
-        optimizer = paddle.optimizer.Momentum(
-            multi_precision=use_amp, use_multi_tensor=use_multi_tensor)
+        optimizer = paddle.optimizer.Momentum(multi_precision=use_amp,
+                                              use_multi_tensor=use_multi_tensor)
         if use_amp:
             optimizer = paddle.static.amp.decorate(
                 optimizer,
@@ -565,11 +583,13 @@ def _momentum_optimize_static(self,
                 use_fp16_guard=False)
         with paddle.static.program_guard(train_program, startup_program):
             if use_amp:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float16')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float16')
             else:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float32')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float32')
             hidden = paddle.static.nn.fc(x=data, size=10)
             loss = paddle.fluid.layers.mean(hidden)
             optimizer.minimize(loss)
@@ -592,14 +612,15 @@ def _get_places(self):
         return places
 
     def _check_with_place_amp(self, place, use_amp):
-        output1 = self._momentum_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=True)
-        output2 = self._momentum_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=False)
+        output1 = self._momentum_optimize_static(place=place,
+                                                 use_amp=use_amp,
+                                                 use_multi_tensor=True)
+        output2 = self._momentum_optimize_static(place=place,
+                                                 use_amp=use_amp,
+                                                 use_multi_tensor=False)
         for idx in range(len(output1)):
             self.assertEqual(
-                np.allclose(
-                    output1[idx], output2[idx], rtol=1e-05), True)
+                np.allclose(output1[idx], output2[idx], rtol=1e-05), True)
 
     def test_main(self):
         for place in self._get_places():
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py
index a56e9ff7558f6..9af31dcf73f04 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py
@@ -18,6 +18,7 @@
 import numpy as np
 import math
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle
@@ -30,6 +31,7 @@
 
 
 class TestOneHotOp(OpTest):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -55,6 +57,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_attr(OpTest):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -82,6 +85,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype(OpTest):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -107,6 +111,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -132,6 +137,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_exception(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -149,18 +155,18 @@ def setUp(self):
     def test_check_output(self):
         program = Program()
         with program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            x = fluid.layers.data(name='x',
+                                  shape=[self.dimension],
+                                  dtype='float32',
+                                  lod_level=1)
             block = program.current_block()
-            one_hot_out = block.create_var(
-                name="one_hot_out",
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                dtype='float32')
-            block.append_op(
-                type='one_hot',
-                inputs={'X': x},
-                attrs={'depth': self.depth},
-                outputs={'Out': one_hot_out})
+            one_hot_out = block.create_var(name="one_hot_out",
+                                           type=core.VarDesc.VarType.LOD_TENSOR,
+                                           dtype='float32')
+            block.append_op(type='one_hot',
+                            inputs={'X': x},
+                            attrs={'depth': self.depth},
+                            outputs={'Out': one_hot_out})
             exe = fluid.Executor(self.place)
 
             def run():
@@ -172,6 +178,7 @@ def run():
 
 
 class TestOneHotOpApi(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -207,12 +214,15 @@ def _run(self, depth):
 
         exe = fluid.Executor(self.place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'label': label_data, },
+        ret = exe.run(feed={
+            'label': label_data,
+        },
                       fetch_list=[one_hot_label],
                       return_numpy=False)
 
 
 class BadInputTestOnehotV2(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -221,11 +231,10 @@ def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
             def test_bad_x():
-                label = fluid.layers.data(
-                    name="label",
-                    shape=[4],
-                    append_batch_size=False,
-                    dtype="float32")
+                label = fluid.layers.data(name="label",
+                                          shape=[4],
+                                          append_batch_size=False,
+                                          dtype="float32")
                 one_hot_label = fluid.one_hot(input=label, depth=4)
 
             self.assertRaises(TypeError, test_bad_x)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py
index 1be3d2d85a422..d33646cbfa32b 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py
@@ -23,6 +23,7 @@
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive, adaptive_start_index, adaptive_end_index
@@ -47,8 +48,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -129,19 +130,19 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
             if pool_type == 'avg':
                 if (exclusive or adaptive):
-                    field_size = (in_h_end - in_h_start) * (
-                        in_w_end - in_w_start)
-                x_grad[:, :, in_h_start:in_h_end, in_w_start:
-                       in_w_end] += 1 / field_size
+                    field_size = (in_h_end - in_h_start) * (in_w_end -
+                                                            in_w_start)
+                x_grad[:, :, in_h_start:in_h_end,
+                       in_w_start:in_w_end] += 1 / field_size
             elif pool_type == 'max':
                 for n in range(N):
                     for c in range(C):
-                        idx = np.argmax(x[n, c, in_h_start:in_h_end, in_w_start:
-                                          in_w_end].flatten())
+                        idx = np.argmax(x[n, c, in_h_start:in_h_end,
+                                          in_w_start:in_w_end].flatten())
                         idx_h = idx // (in_w_end - in_w_start)
                         idx_w = idx % (in_w_end - in_w_start)
-                        x_grad[n, c, in_h_start + idx_h, in_w_start +
-                               idx_w] += 1
+                        x_grad[n, c, in_h_start + idx_h,
+                               in_w_start + idx_w] += 1
 
     if data_format == "NHWC":
         x_grad = x_grad.transpose([0, 2, 3, 1])
@@ -149,6 +150,7 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 
 class TestPool2D_Op_Mixin(object):
+
     def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         self.__class__.use_mlu = True
@@ -166,10 +168,12 @@ def setUp(self):
         self.init_shape()
 
         input = np.random.random(self.shape).astype(self.dtype)
-        output = pool2D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive, self.data_format,
-            self.pool_type, self.padding_algorithm).astype(self.dtype)
+        output = pool2D_forward_naive(input, self.ksize, self.strides,
+                                      self.paddings, self.global_pool,
+                                      self.ceil_mode, self.exclusive,
+                                      self.adaptive, self.data_format,
+                                      self.pool_type,
+                                      self.padding_algorithm).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -191,25 +195,23 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        x_grad = pool2d_backward_navie(
-            self.inputs["X"],
-            ksize=self.ksize,
-            strides=self.strides,
-            paddings=self.paddings,
-            global_pool=self.global_pool,
-            ceil_mode=False,
-            exclusive=self.exclusive,
-            adaptive=self.adaptive,
-            data_format=self.data_format,
-            pool_type=self.pool_type,
-            padding_algorithm=self.padding_algorithm)
+        x_grad = pool2d_backward_navie(self.inputs["X"],
+                                       ksize=self.ksize,
+                                       strides=self.strides,
+                                       paddings=self.paddings,
+                                       global_pool=self.global_pool,
+                                       ceil_mode=False,
+                                       exclusive=self.exclusive,
+                                       adaptive=self.adaptive,
+                                       data_format=self.data_format,
+                                       pool_type=self.pool_type,
+                                       padding_algorithm=self.padding_algorithm)
         x_grad = x_grad / np.prod(self.outputs['Out'].shape)
-        self.check_grad_with_place(
-            self.place,
-            set(['X']),
-            'Out',
-            max_relative_error=0.06,
-            user_defined_grads=[x_grad])
+        self.check_grad_with_place(self.place,
+                                   set(['X']),
+                                   'Out',
+                                   max_relative_error=0.06,
+                                   user_defined_grads=[x_grad])
 
     def init_data_format(self):
         self.data_format = "NCHW"
@@ -250,6 +252,7 @@ class TestPool2D_Op(TestPool2D_Op_Mixin, OpTest):
 
 
 class TestCase1(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -269,6 +272,7 @@ def init_shape(self):
 
 
 class TestCase2(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -288,25 +292,30 @@ def init_shape(self):
 
 
 class TestCase3(TestPool2D_Op):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase4(TestCase1):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase5(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 def create_test_fp16_class(parent):
+
     class TestFp16Case(parent):
+
         def init_data_type(self):
             self.dtype = np.float16
 
@@ -330,7 +339,9 @@ def test_check_output(self):
 
 
 def create_test_use_ceil_class(parent):
+
     class TestPool2DUseCeilCase(parent):
+
         def init_ceil_mode(self):
             self.ceil_mode = True
 
@@ -344,16 +355,19 @@ def init_ceil_mode(self):
 
 
 class TestAvgInclude(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
 
 class TestAvgPoolAdaptive(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
 
 class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -370,6 +384,7 @@ def init_test_case(self):
 
 
 class TestPool2D_AsyPadding(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -380,6 +395,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding(TestCase1):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -390,6 +406,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding(TestCase2):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -400,6 +417,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding(TestCase3):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -410,6 +428,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding(TestCase4):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -420,6 +439,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding((TestCase5)):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -434,6 +454,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
@@ -447,6 +468,7 @@ def init_shape(self):
 
 
 class TestAvgPoolAdaptive_AsyPadding(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -461,6 +483,7 @@ def init_shape(self):
 
 #----------- test channel_last --------------
 class TestPool2D_channel_last(TestPool2D_Op):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -469,6 +492,7 @@ def init_shape(self):
 
 
 class TestCase1_channel_last(TestCase1):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -477,6 +501,7 @@ def init_shape(self):
 
 
 class TestCase2_channel_last(TestCase2):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -485,6 +510,7 @@ def init_shape(self):
 
 
 class TestCase3_channel_last(TestCase3):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -493,6 +519,7 @@ def init_shape(self):
 
 
 class TestCase4_channel_last(TestCase4):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -501,6 +528,7 @@ def init_shape(self):
 
 
 class TestCase5_channel_last(TestCase5):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -513,11 +541,13 @@ def init_shape(self):
 
 
 class TestCase5_Max(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestCase5_channel_last_Max(TestCase5_Max):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -526,16 +556,19 @@ def init_shape(self):
 
 
 class TestAvgInclude_channel_last(TestCase2_channel_last):
+
     def init_exclusive(self):
         self.exclusive = False
 
 
 class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
+
     def init_adaptive(self):
         self.adaptive = True
 
 
 class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -544,6 +577,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -552,6 +586,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -560,6 +595,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -568,6 +604,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -576,6 +613,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -588,6 +626,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -599,7 +638,9 @@ def init_shape(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.paddings = [0, 0]
             self.padding_algorithm = "SAME"
@@ -625,7 +666,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.paddings = [1, 1]
             self.padding_algorithm = "VALID"
@@ -651,6 +694,7 @@ def init_paddings(self):
 
 
 class TestCase1_strides(TestCase1):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 2]
@@ -664,107 +708,96 @@ def init_shape(self):
 
 # ----- test API
 class TestPool2DAPI(unittest.TestCase):
+
     def test_api(self):
         x_NHWC = np.random.random([2, 5, 5, 3]).astype("float32")
         x_NCHW = np.random.random([2, 3, 5, 5]).astype("float32")
 
-        input_NHWC = fluid.layers.data(
-            name="input_NHWC",
-            shape=[2, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NCHW = fluid.layers.data(
-            name="input_NCHW",
-            shape=[2, 3, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NHWC_negetive = fluid.layers.data(
-            name="input_NHWC_negetive",
-            shape=[2, -1, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NCHW_negetive = fluid.layers.data(
-            name="input_NCHW_negetive",
-            shape=[2, 3, -1, -1],
-            append_batch_size=False,
-            dtype="float32")
+        input_NHWC = fluid.layers.data(name="input_NHWC",
+                                       shape=[2, 5, 5, 3],
+                                       append_batch_size=False,
+                                       dtype="float32")
+
+        input_NCHW = fluid.layers.data(name="input_NCHW",
+                                       shape=[2, 3, 5, 5],
+                                       append_batch_size=False,
+                                       dtype="float32")
+
+        input_NHWC_negetive = fluid.layers.data(name="input_NHWC_negetive",
+                                                shape=[2, -1, 5, 3],
+                                                append_batch_size=False,
+                                                dtype="float32")
+
+        input_NCHW_negetive = fluid.layers.data(name="input_NCHW_negetive",
+                                                shape=[2, 3, -1, -1],
+                                                append_batch_size=False,
+                                                dtype="float32")
 
         ksize = [3, 3]
-        out_1 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=ksize,
-            pool_type="max",
-            pool_padding=[1, 1],
-            data_format="NHWC")
-
-        out_2 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            data_format="NHWC")
-
-        out_3 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[[0, 0], [0, 0], [1, 1], [1, 1]],
-            data_format="NCHW")
-
-        out_4 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[1, 2, 1, 0],
-            data_format="NCHW")
+        out_1 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=ksize,
+                                    pool_type="max",
+                                    pool_padding=[1, 1],
+                                    data_format="NHWC")
+
+        out_2 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[[0, 0], [1, 1], [1, 1],
+                                                  [0, 0]],
+                                    data_format="NHWC")
+
+        out_3 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[[0, 0], [0, 0], [1, 1],
+                                                  [1, 1]],
+                                    data_format="NCHW")
+
+        out_4 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[1, 2, 1, 0],
+                                    data_format="NCHW")
         # test VALID
-        out_5 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding="VALID",
-            data_format="NCHW")
-
-        out_6 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=ksize,
-            pool_type="max",
-            pool_padding="VALID",
-            data_format="NHWC")
+        out_5 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding="VALID",
+                                    data_format="NCHW")
+
+        out_6 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=ksize,
+                                    pool_type="max",
+                                    pool_padding="VALID",
+                                    data_format="NHWC")
 
         # test SAME
-        out_7 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=[4, 4],
-            pool_type="avg",
-            pool_padding="SAME",
-            data_format="NCHW")
-
-        out_8 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=[4, 4],
-            pool_type="max",
-            pool_padding="SAME",
-            data_format="NHWC")
+        out_7 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=[4, 4],
+                                    pool_type="avg",
+                                    pool_padding="SAME",
+                                    data_format="NCHW")
+
+        out_8 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=[4, 4],
+                                    pool_type="max",
+                                    pool_padding="SAME",
+                                    data_format="NHWC")
 
         # test negetive
-        out_9 = fluid.layers.pool2d(
-            input=input_NHWC_negetive,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[0, 0],
-            data_format="NHWC")
+        out_9 = fluid.layers.pool2d(input=input_NHWC_negetive,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[0, 0],
+                                    data_format="NHWC")
         assert out_9.shape == (2, -1, 3, 3)
 
-        out_10 = fluid.layers.pool2d(
-            input=input_NCHW_negetive,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[0, 0],
-            data_format="NCHW")
+        out_10 = fluid.layers.pool2d(input=input_NCHW_negetive,
+                                     pool_size=ksize,
+                                     pool_type="avg",
+                                     pool_padding=[0, 0],
+                                     data_format="NCHW")
         assert out_10.shape == (2, 3, -1, -1)
 
         exe = fluid.Executor(place=fluid.MLUPlace(0))
@@ -776,52 +809,44 @@ def test_api(self):
                 "input_NHWC_negetive": x_NHWC,
                 "input_NCHW_negetive": x_NCHW
             },
-            fetch_list=[
-                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
-            ])
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8])
 
         assert np.allclose(
             res_1,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=ksize,
-                pool_type="max",
-                strides=[1, 1],
-                paddings=[1, 1],
-                data_format="NHWC"))
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=ksize,
+                                 pool_type="max",
+                                 strides=[1, 1],
+                                 paddings=[1, 1],
+                                 data_format="NHWC"))
 
         assert np.allclose(
             res_2,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[1, 1, 1, 1],
-                data_format="NHWC"))
-        assert np.allclose(
-            res_3,
-            pool2D_forward_naive(
-                x=x_NCHW,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[1, 1, 1, 1],
-                data_format="NCHW"),
-            rtol=0.07,
-            atol=1e-05)
-
-        assert np.allclose(
-            res_4,
-            pool2D_forward_naive(
-                x=x_NCHW,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[1, 2, 1, 0],
-                data_format="NCHW"),
-            rtol=0.07,
-            atol=1e-05)
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=ksize,
+                                 pool_type="avg",
+                                 strides=[1, 1],
+                                 paddings=[1, 1, 1, 1],
+                                 data_format="NHWC"))
+        assert np.allclose(res_3,
+                           pool2D_forward_naive(x=x_NCHW,
+                                                ksize=ksize,
+                                                pool_type="avg",
+                                                strides=[1, 1],
+                                                paddings=[1, 1, 1, 1],
+                                                data_format="NCHW"),
+                           rtol=0.07,
+                           atol=1e-05)
+
+        assert np.allclose(res_4,
+                           pool2D_forward_naive(x=x_NCHW,
+                                                ksize=ksize,
+                                                pool_type="avg",
+                                                strides=[1, 1],
+                                                paddings=[1, 2, 1, 0],
+                                                data_format="NCHW"),
+                           rtol=0.07,
+                           atol=1e-05)
 
         # VALID
         assert np.allclose(
@@ -838,182 +863,170 @@ def test_api(self):
             atol=1e-05)
         assert np.allclose(
             res_6,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=ksize,
-                pool_type="max",
-                strides=[1, 1],
-                paddings=[10, 20],
-                padding_algorithm="VALID",
-                data_format="NHWC"))
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=ksize,
+                                 pool_type="max",
+                                 strides=[1, 1],
+                                 paddings=[10, 20],
+                                 padding_algorithm="VALID",
+                                 data_format="NHWC"))
         # SAME
-        assert np.allclose(
-            res_7,
-            pool2D_forward_naive(
-                x=x_NCHW,
-                ksize=[4, 4],
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[10, 20],
-                padding_algorithm="SAME",
-                data_format="NCHW"),
-            rtol=0.07,
-            atol=1e-05)
+        assert np.allclose(res_7,
+                           pool2D_forward_naive(x=x_NCHW,
+                                                ksize=[4, 4],
+                                                pool_type="avg",
+                                                strides=[1, 1],
+                                                paddings=[10, 20],
+                                                padding_algorithm="SAME",
+                                                data_format="NCHW"),
+                           rtol=0.07,
+                           atol=1e-05)
 
         assert np.allclose(
             res_8,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=[4, 4],
-                pool_type="max",
-                strides=[1, 1],
-                paddings=[10, 20],
-                padding_algorithm="SAME",
-                data_format="NHWC"))
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=[4, 4],
+                                 pool_type="max",
+                                 strides=[1, 1],
+                                 paddings=[10, 20],
+                                 padding_algorithm="SAME",
+                                 data_format="NHWC"))
 
 
 class TestPool2DAPI_Error(unittest.TestCase):
+
     def test_api(self):
-        input_NHWC = fluid.layers.data(
-            name="input_NHWC",
-            shape=[2, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
+        input_NHWC = fluid.layers.data(name="input_NHWC",
+                                       shape=[2, 5, 5, 3],
+                                       append_batch_size=False,
+                                       dtype="float32")
         ksize = [3, 3]
 
         # data_format value error
         def run_2():
-            out_2 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[1, 1],
-                data_format="NHWCC")
+            out_2 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[1, 1],
+                                        data_format="NHWCC")
 
         self.assertRaises(ValueError, run_2)
 
         # padding str value error
         def run_3():
-            out_3 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding="VALIDSAME",
-                data_format="NHWC")
+            out_3 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding="VALIDSAME",
+                                        data_format="NHWC")
 
         self.assertRaises(ValueError, run_3)
 
         # padding str valid and ceil_mode value error
         def run_4():
-            out_4 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding="VALID",
-                ceil_mode=True,
-                data_format="NHWC")
+            out_4 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding="VALID",
+                                        ceil_mode=True,
+                                        data_format="NHWC")
 
         self.assertRaises(ValueError, run_4)
 
         # padding with 8 ele. value error
         def run_5():
-            out_5 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[[1, 1], [0, 0], [0, 0], [1, 1]],
-                data_format="NHWC")
+            out_5 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[[1, 1], [0, 0], [0, 0],
+                                                      [1, 1]],
+                                        data_format="NHWC")
 
         self.assertRaises(ValueError, run_5)
 
 
 class TestDygraphPool2DAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of Pool2D must be Variable.
             data1 = np.random.random((3, 32, 32, 5)).astype('float32')
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                global_pooling=False)
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='max',
+                                          pool_stride=1,
+                                          global_pooling=False)
             self.assertRaises(TypeError, pool2d, data1)
 
-            # the input dtype of mlu Pool2D must be float16 or float32 
-            data2 = fluid.layers.data(
-                name='x1', shape=[3, 32, 32, 5], dtype="int32")
+            # the input dtype of mlu Pool2D must be float16 or float32
+            data2 = fluid.layers.data(name='x1',
+                                      shape=[3, 32, 32, 5],
+                                      dtype="int32")
             self.assertRaises(TypeError, pool2d, data2)
 
     def test_data_format_error(self):
         with program_guard(Program(), Program()):
             # the data_format must be 'NCHW' or 'NHWC'
             data1 = np.random.random((3, 32, 32, 5)).astype('float32')
-            self.assertRaises(
-                ValueError,
-                fluid.dygraph.Pool2D,
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                global_pooling=False,
-                data_format='NWHC')
+            self.assertRaises(ValueError,
+                              fluid.dygraph.Pool2D,
+                              pool_size=2,
+                              pool_type='max',
+                              pool_stride=1,
+                              global_pooling=False,
+                              data_format='NWHC')
 
 
 class TestDygraphPool2DAPI(unittest.TestCase):
+
     def test_nhwc(self):
         with fluid.dygraph.guard():
             data = np.random.random((3, 32, 32, 5)).astype('float32')
             x = fluid.dygraph.to_variable(data)
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                pool_padding=[0, 0],
-                global_pooling=False,
-                data_format='NHWC')
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='max',
+                                          pool_stride=1,
+                                          pool_padding=[0, 0],
+                                          global_pooling=False,
+                                          data_format='NHWC')
             out1 = pool2d(x)
-            out2 = pool2D_forward_naive(
-                data, [2, 2], [1, 1],
-                paddings=[0, 0],
-                pool_type='max',
-                data_format='NHWC')
+            out2 = pool2D_forward_naive(data, [2, 2], [1, 1],
+                                        paddings=[0, 0],
+                                        pool_type='max',
+                                        data_format='NHWC')
             self.assertTrue(np.allclose(out1.numpy(), out2))
 
     def test_lower_case(self):
         with fluid.dygraph.guard():
             data = np.random.random((3, 32, 32, 5)).astype('float32')
             x = fluid.dygraph.to_variable(data)
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                pool_padding=[0, 0],
-                global_pooling=False,
-                data_format='nhwc')
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='max',
+                                          pool_stride=1,
+                                          pool_padding=[0, 0],
+                                          global_pooling=False,
+                                          data_format='nhwc')
             out1 = pool2d(x)
-            out2 = pool2D_forward_naive(
-                data, [2, 2], [1, 1],
-                paddings=[0, 0],
-                pool_type='max',
-                data_format='NHWC')
+            out2 = pool2D_forward_naive(data, [2, 2], [1, 1],
+                                        paddings=[0, 0],
+                                        pool_type='max',
+                                        data_format='NHWC')
             self.assertTrue(np.allclose(out1.numpy(), out2))
 
     def test_upper_case(self):
         with fluid.dygraph.guard():
             data = np.random.random((3, 32, 32, 5)).astype('float32')
             x = fluid.dygraph.to_variable(data)
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='MAX',
-                pool_stride=1,
-                pool_padding=[0, 0],
-                global_pooling=False,
-                data_format='nhwc')
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='MAX',
+                                          pool_stride=1,
+                                          pool_padding=[0, 0],
+                                          global_pooling=False,
+                                          data_format='nhwc')
             out1 = pool2d(x)
-            out2 = pool2D_forward_naive(
-                data, [2, 2], [1, 1],
-                paddings=[0, 0],
-                pool_type='max',
-                data_format='NHWC')
+            out2 = pool2D_forward_naive(data, [2, 2], [1, 1],
+                                        paddings=[0, 0],
+                                        pool_type='max',
+                                        data_format='NHWC')
             self.assertTrue(np.allclose(out1.numpy(), out2))
 
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py
index ef33719d368e8..372f2bd6ad4d0 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py
@@ -130,8 +130,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP16)
         }
         self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim'])).astype(np.float16)
+            'Out':
+            self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(
+                np.float16)
         }
 
     def init_dtype(self):
@@ -158,8 +159,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
     def init_dtype(self):
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py
index c0be644c79115..5fa30f400f204 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestMeanOp(OpTest):
+
     def set_mlu(self):
         self.__class__.use_mlu = True
         self.place = paddle.device.MLUPlace(0)
@@ -44,6 +46,7 @@ def test_check_grad(self):
 
 
 class TestMeanOp5D(TestMeanOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -54,6 +57,7 @@ def setUp(self):
 
 
 class TestMeanOp6D(TestMeanOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -64,6 +68,7 @@ def setUp(self):
 
 
 class TestMeanOp8D(TestMeanOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -75,6 +80,7 @@ def setUp(self):
 
 
 class Test1DReduce(TestMeanOp):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -83,6 +89,7 @@ def setUp(self):
 
 
 class Test2DReduce0(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -92,6 +99,7 @@ def setUp(self):
 
 
 class Test2DReduce1(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -103,6 +111,7 @@ def setUp(self):
 
 
 class Test3DReduce0(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -114,6 +123,7 @@ def setUp(self):
 
 
 class Test3DReduce1(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -125,6 +135,7 @@ def setUp(self):
 
 
 class Test3DReduce2(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -136,6 +147,7 @@ def setUp(self):
 
 
 class Test3DReduce3(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -147,18 +159,21 @@ def setUp(self):
 
 
 class TestKeepDimReduce(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
         self.attrs = {'dim': [1], 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].mean(axis=tuple(self.attrs['dim']),
+                                  keepdims=self.attrs['keep_dim'])
         }
 
 
 class TestKeepDim8DReduce(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
@@ -167,12 +182,14 @@ def setUp(self):
         }
         self.attrs = {'dim': (3, 4, 5), 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].mean(axis=tuple(self.attrs['dim']),
+                                  keepdims=self.attrs['keep_dim'])
         }
 
 
 class TestReduceAll(Test1DReduce):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reduce_mean"
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py
index 284f8f984c232..a2f8007973c1e 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py
@@ -130,8 +130,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP16)
         }
         self.outputs = {
-            'Out': self.inputs['X'].min(
-                axis=tuple(self.attrs['dim'])).astype(np.float16)
+            'Out':
+            self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(
+                np.float16)
         }
 
     def init_dtype(self):
@@ -158,8 +159,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.outputs = {
-            'Out': self.inputs['X'].min(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
     def init_dtype(self):
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py
index d2729d77abaa7..ab98418744337 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,6 +26,7 @@
 
 
 class TestMLUReduceSumOp(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.initTestCase()
@@ -39,8 +41,9 @@ def setUp(self):
             self.outputs = {'Out': self.inputs['X'].sum()}
         else:
             self.outputs = {
-                'Out': self.inputs['X'].sum(axis=self.axis,
-                                            keepdims=self.attrs['keep_dim'])
+                'Out':
+                self.inputs['X'].sum(axis=self.axis,
+                                     keepdims=self.attrs['keep_dim'])
             }
 
     def set_mlu(self):
@@ -65,66 +68,77 @@ def initTestCase(self):
 
 
 class TestSumOp5D(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (1, 2, 5, 6, 10)
         self.axis = (0, )
 
 
 class TestSumOp6D(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (1, 1, 2, 5, 6, 10)
         self.axis = (0, )
 
 
 class TestSumOp8D(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (1, 3, 1, 2, 1, 4, 3, 10)
         self.axis = (0, 3)
 
 
 class Test1DReduce(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = 120
         self.axis = (0, )
 
 
 class Test2DReduce0(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (20, 10)
         self.axis = (0, )
 
 
 class Test2DReduce1(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (20, 10)
         self.axis = (1, )
 
 
 class Test3DReduce0(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (5, 6, 7)
         self.axis = (1, )
 
 
 class Test3DReduce1(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (5, 6, 7)
         self.axis = (2, )
 
 
 class Test3DReduce2(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (5, 6, 7)
         self.axis = (-2, )
 
 
 class Test3DReduce3(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (5, 6, 7)
         self.axis = (1, 2)
 
 
 class TestKeepDimReduce(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (5, 6, 10)
         self.axis = (1, )
@@ -132,6 +146,7 @@ def initTestCase(self):
 
 
 class TestKeepDim8DReduce(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (2, 5, 3, 2, 2, 3, 4, 2)
         self.axis = (3, 4, 5)
@@ -139,6 +154,7 @@ def initTestCase(self):
 
 
 class TestReduceAll(TestMLUReduceSumOp):
+
     def initTestCase(self):
         self.shape = (5, 6, 2, 10)
         self.axis = (0, )
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py
index 54b1afd036331..ffb6fee30f5e7 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py
@@ -20,6 +20,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 paddle.enable_static()
@@ -34,6 +35,7 @@ def ref_relu6(x, threshold=6.0):
 
 
 class TestRelu6(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "relu6"
@@ -63,6 +65,7 @@ def init_dtype(self):
 
 
 class TestRelu6Float16(TestRelu6):
+
     def set_mlu(self):
         self.__class__.use_mlu = True
         self.__class__.no_need_check_grad = True
@@ -75,6 +78,7 @@ def test_check_output(self):
 
 
 class TestReluNeg(TestRelu6):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "relu6"
@@ -101,6 +105,7 @@ def test_check_output(self):
 
 
 class TestRelu6Net(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -115,8 +120,9 @@ def _test(self, run_mlu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.nn.functional.relu6(sum)
@@ -140,12 +146,13 @@ def _test(self, run_mlu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py
index 25c50f67949e7..495711e5303f3 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestRelu(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "relu"
@@ -52,6 +54,7 @@ def test_check_output(self):
 
 
 class TestReluFp16(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "relu"
@@ -78,6 +81,7 @@ def test_check_output(self):
 
 
 class TestReluNeg(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "relu"
@@ -103,6 +107,7 @@ def test_check_output(self):
 
 
 class TestReluNet(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -117,8 +122,9 @@ def _test(self, run_mlu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.nn.functional.relu(sum)
@@ -142,12 +148,13 @@ def _test(self, run_mlu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reshape2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reshape2_op_mlu.py
index 9cff269913fe9..2fe28af81b161 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_reshape2_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_reshape2_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestReshape2(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "reshape2"
@@ -56,6 +58,7 @@ def test_check_grad_normal(self):
 
 
 class TestReshape2_case2(TestReshape2):
+
     def init_data(self):
         self.ori_shape = (2, 100)
         self.new_shape = (-1, 10)
@@ -63,6 +66,7 @@ def init_data(self):
 
 
 class TestReshape2_case3(TestReshape2):
+
     def init_data(self):
         self.ori_shape = (100, 5, 6)
         self.new_shape = (-1, 0, 3)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py
index 53254c738d985..aed58a352f4dc 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestScaleOp(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.place = paddle.device.MLUPlace(0)
@@ -49,6 +51,7 @@ def test_check_output(self):
 
 
 class TestScaleOpScaleVariable(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.place = paddle.device.MLUPlace(0)
@@ -71,6 +74,7 @@ def test_check_output(self):
 
 
 class TestScaleOpSelectedRows(unittest.TestCase):
+
     def init_dtype_type(self):
         pass
 
@@ -129,7 +133,9 @@ def test_scale_selected_rows_inplace(self):
 
 
 class TestScaleRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.scale([10])
 
@@ -140,6 +146,7 @@ def test_type():
 @unittest.skipIf(not core.is_compiled_with_mlu(),
                  "core is not compiled with MLU")
 class TestScaleFp16Op(TestScaleOp):
+
     def init_dtype_type(self):
         self.dtype = np.float16
 
@@ -150,6 +157,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_mlu(),
                  "core is not compiled with MLU")
 class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows):
+
     def init_dtype_type(self):
         self.dtype = np.float16
 
@@ -163,6 +171,7 @@ def test_scale_selected_rows_inplace(self):
 
 
 class TestScaleApiStatic(unittest.TestCase):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return paddle.scale(x, scale, bias)
 
@@ -180,11 +189,13 @@ def test_api(self):
 
 
 class TestScaleInplaceApiStatic(TestScaleApiStatic):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return x.scale_(scale, bias)
 
 
 class TestScaleApiDygraph(unittest.TestCase):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return paddle.scale(x, scale, bias)
 
@@ -198,6 +209,7 @@ def test_api(self):
 
 
 class TestScaleInplaceApiDygraph(TestScaleApiDygraph):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return x.scale_(scale, bias)
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py
index f4c5612377e1c..5438e3955d3b1 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_sigmoid_op_mlu.py
@@ -26,6 +26,7 @@
 
 
 class TestMLUSigmoid(OpTest):
+
     def setUp(self):
         self.op_type = "sigmoid"
         self.set_mlu()
@@ -42,8 +43,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', max_relative_error=0.01)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.01)
 
     def set_mlu(self):
         self.__class__.use_mlu = True
@@ -54,6 +56,7 @@ def init_dtype(self):
 
 
 class TestMLUSigmoidFp16(TestMLUSigmoid):
+
     def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-3)
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py
index 44532ddceb765..a074a9d91a8bc 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py
@@ -18,6 +18,7 @@
 import numpy as np
 import paddle.fluid.core as core
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -30,6 +31,7 @@
 # Situation 1: starts(list, no tensor), ends(list, no tensor)
 # 1.1 without attr(decrease)
 class TestSliceOp(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_mlu()
@@ -55,8 +57,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
     def set_mlu(self):
         self.__class__.use_mlu = True
@@ -64,6 +67,7 @@ def set_mlu(self):
 
 
 class TestCase1(TestSliceOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
@@ -74,6 +78,7 @@ def config(self):
 
 
 class TestCase2(TestSliceOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
@@ -85,6 +90,7 @@ def config(self):
 
 # 1.2 with attr(decrease)
 class TestSliceOp_decs_dim(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_mlu()
@@ -112,8 +118,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
     def set_mlu(self):
         self.__class__.use_mlu = True
@@ -121,6 +128,7 @@ def set_mlu(self):
 
 
 class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
@@ -132,6 +140,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1, 0, 2]
@@ -143,6 +152,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 7]).astype("float32")
         self.starts = [0, 1, 2, 3]
@@ -154,6 +164,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1]
@@ -165,6 +176,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [0, 1, 2, 3]
@@ -178,6 +190,7 @@ def config(self):
 # Situation 2: starts(list, have tensor), ends(list, no tensor)
 # without attr(decrease)
 class TestSliceOp_starts_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_mlu()
@@ -211,8 +224,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
     def set_mlu(self):
         self.__class__.use_mlu = True
@@ -222,6 +236,7 @@ def set_mlu(self):
 # Situation 2: starts(list, have tensor), ends(list, no tensor)
 #  with attr(decrease)
 class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_mlu()
@@ -258,8 +273,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
     def set_mlu(self):
         self.__class__.use_mlu = True
@@ -268,6 +284,7 @@ def set_mlu(self):
 
 class TestSliceOp_decs_dim_5_starts_ListTensor(
         TestSliceOp_decs_dim_starts_ListTensor):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1]
@@ -283,6 +300,7 @@ def config(self):
 # Situation 3: starts(tensor), ends(list, no tensor)
 # with attr(decrease)
 class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.__class__.use_mlu = True
@@ -290,8 +308,7 @@ def setUp(self):
         self.config()
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int32")
         }
         self.outputs = {'Out': self.out}
         self.attrs = {
@@ -315,13 +332,15 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
 
 # Situation 4: starts(tensor), ends(tensor)
 # without attr(decrease)
 class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.__class__.use_mlu = True
@@ -330,10 +349,8 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int64"),
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int64"),
+            "EndsTensor": np.array(self.ends, dtype="int32")
         }
         self.outputs = {'Out': self.out}
         self.attrs = {
@@ -355,13 +372,15 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
 
 # Situation 5: starts(tensor), ends(tensor)
 #  with attr(decrease)
 class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.__class__.use_mlu = True
@@ -369,10 +388,8 @@ def setUp(self):
         self.config()
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int32"),
+            "EndsTensor": np.array(self.ends, dtype="int32")
         }
         self.outputs = {'Out': self.out}
         self.attrs = {
@@ -396,13 +413,15 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
 
 # Situation 6: starts(tensor), ends(list, have tensor)
 # without attr(decrease)
 class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.__class__.use_mlu = True
@@ -416,8 +435,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
+            "StartsTensor": np.array(self.starts, dtype="int32"),
             'EndsTensorList': ends_tensor
         }
         self.outputs = {'Out': self.out}
@@ -442,12 +460,14 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
 
 # Test float16
 class TestFP16(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.__class__.use_mlu = True
@@ -475,11 +495,13 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-5)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
 
 class TestFP16_2(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.__class__.use_mlu = True
@@ -507,27 +529,24 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-5)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['Input'],
-            'Out',
-            max_relative_error=0.006,
-            numeric_grad_delta=0.5)
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Out',
+                                   max_relative_error=0.006,
+                                   numeric_grad_delta=0.5)
 
 
 class TestSliceApiWithTensor(unittest.TestCase):
+
     def test_starts_ends_is_tensor(self):
         with paddle.fluid.dygraph.guard():
             a = paddle.rand(shape=[4, 5, 6], dtype='float32')
             axes = [0, 1, 2]
             starts = [-3, 0, 2]
             ends = [3, 2, 4]
-            a_1 = paddle.slice(
-                a,
-                axes=axes,
-                starts=paddle.to_tensor(
-                    starts, dtype='int32'),
-                ends=paddle.to_tensor(
-                    ends, dtype='int32'))
+            a_1 = paddle.slice(a,
+                               axes=axes,
+                               starts=paddle.to_tensor(starts, dtype='int32'),
+                               ends=paddle.to_tensor(ends, dtype='int32'))
             a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends)
 
             self.assertTrue(np.array_equal(a_1.numpy(), a_2.numpy()))
@@ -550,6 +569,7 @@ def test_bool_tensor(self):
 
 
 class TestImperativeVarBaseGetItem(unittest.TestCase):
+
     def test_getitem_with_long(self):
         with fluid.dygraph.guard():
             data = np.random.random((2, 80, 16128)).astype('float32')
@@ -561,6 +581,7 @@ def test_getitem_with_long(self):
             self.assertEqual(sliced.shape, [2, 78, 78])
 
     def test_getitem_with_float(self):
+
         def test_float_in_slice_item():
             with fluid.dygraph.guard():
                 data = np.random.random((2, 80, 16128)).astype('float32')
@@ -579,6 +600,7 @@ def test_float_in_index():
 
 
 class TestInferShape(unittest.TestCase):
+
     def test(self):
         x = paddle.ones(shape=[3, 4, 5])
         x.desc.set_shape([3, -1, 5])
@@ -594,7 +616,9 @@ def test_axis_less_than_zero(self):
             x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4])
             x = paddle.to_tensor(x_arr)
 
-            pp_slice = paddle.slice(x, [100, ], [0], [1])
+            pp_slice = paddle.slice(x, [
+                100,
+            ], [0], [1])
             np_slice = x_arr[:, :, 0:1]
             self.assertTrue(np.array_equal(pp_slice, np_slice))
 
@@ -606,13 +630,9 @@ def test_axis_less_than_zero(self):
             x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0)))
 
             starts = paddle.to_tensor(
-                np.reshape(
-                    np.array(
-                        [], dtype=np.int32), (0, )))
+                np.reshape(np.array([], dtype=np.int32), (0, )))
             ends = paddle.to_tensor(
-                np.reshape(
-                    np.array(
-                        [], dtype=np.int32), (0, )))
+                np.reshape(np.array([], dtype=np.int32), (0, )))
 
             with self.assertRaises(ValueError):
                 paddle.slice(x, [-1000000], starts, ends)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py
index 54acafcf0df5e..766b88aa15452 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_softmax_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle.fluid.core as core
@@ -48,6 +49,7 @@ def ref_softmax(x, axis=None, dtype=None):
 
 
 class TestSoftmaxOp(OpTest):
+
     def get_x_shape(self):
         return [10, 10]
 
@@ -68,7 +70,9 @@ def setUp(self):
 
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.outputs = {'Out': out}
-        self.attrs = {'axis': self.axis, }
+        self.attrs = {
+            'axis': self.axis,
+        }
 
     def init_kernel_type(self):
         pass
@@ -77,16 +81,19 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ["X"], "Out", max_relative_error=0.01)
+        self.check_grad_with_place(self.place, ["X"],
+                                   "Out",
+                                   max_relative_error=0.01)
 
 
 class TestSoftmaxOp2(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
 
 class TestSoftmaxOp3(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -95,6 +102,7 @@ def get_axis(self):
 
 
 class TestSoftmaxOp4(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -103,6 +111,7 @@ def get_axis(self):
 
 
 class TestSoftmaxOp5(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -111,6 +120,7 @@ def get_axis(self):
 
 
 class TestSoftmaxOp6(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -119,6 +129,7 @@ def get_axis(self):
 
 
 class TestSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.MLUPlace(0)
         self.x_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype('float32')
@@ -171,16 +182,19 @@ def test_error(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, self.softmax, 1)
             # The input dtype must be float16, float32
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[2, 3], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[2, 3],
+                                        dtype='int32')
             self.assertRaises(TypeError, self.softmax, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[2, 3], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[2, 3],
+                                       dtype='float16')
             self.softmax(x_fp16)
 
 
 class TestSoftmaxInplaceAPI(TestSoftmaxAPI):
+
     def executed_api(self):
         self.softmax = F.softmax_
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py
index e626b6a093766..f112cd6f66fa2 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestSoftmaxWithCrossEntropyOp(OpTest):
+
     def set_mlu(self):
         self.__class__.use_mlu = True
 
@@ -90,14 +92,14 @@ def test_check_grad(self):
         if self.dtype == np.float16:
             return
         # fp32 has low precision, cpu and mlu both need to relax the max_relative_error if using fp32
-        self.check_grad_with_place(
-            self.place, ['Logits'],
-            'Loss',
-            numeric_grad_delta=0.001,
-            max_relative_error=0.5)
+        self.check_grad_with_place(self.place, ['Logits'],
+                                   'Loss',
+                                   numeric_grad_delta=0.001,
+                                   max_relative_error=0.5)
 
 
 class TestPowNet(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -112,8 +114,9 @@ def _test(self, run_mlu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -137,12 +140,13 @@ def _test(self, run_mlu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py
index 773063c7a8ac9..e52b5ee301c5a 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class LinearNet(nn.Layer):
+
     def __init__(self):
         super(LinearNet, self).__init__()
         self._linear1 = nn.Linear(10, 10)
@@ -62,6 +63,7 @@ def train(print_result=False):
 
 
 class TestSpawn(unittest.TestCase):
+
     def test_nprocs_greater_than_device_num_error(self):
         with self.assertRaises(RuntimeError):
             _get_subprocess_env_list(nprocs=100, options=dict())
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py
index b8363545d2288..2728473f55088 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestCase1(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.set_example()
@@ -35,8 +37,9 @@ def setUp(self):
         self.place = paddle.device.MLUPlace(0)
         ipt = self.x.astype(self.dtype)
         axis = self.axis if isinstance(self.axis, int) else int(self.axis[0])
-        tmp_outs = np.split(
-            ipt, axis=axis, indices_or_sections=self.num_or_sections)
+        tmp_outs = np.split(ipt,
+                            axis=axis,
+                            indices_or_sections=self.num_or_sections)
         tmp_outs = [o.astype(self.dtype) for o in tmp_outs]
         self.outputs = {'Out': []}
         self.outs = []
@@ -63,6 +66,7 @@ def set_example(self):
 
 
 class TestCase2(TestCase1):
+
     def set_example(self):
         self.dtype = "float32"
         self.x = np.random.random((20, 4, 50))
@@ -71,6 +75,7 @@ def set_example(self):
 
 
 class TestCase4(TestCase1):
+
     def set_example(self):
         self.dtype = "float16"
         self.x = np.random.random((4, 50, 20))
@@ -80,6 +85,7 @@ def set_example(self):
 
 # Test Sections
 class TestCase5(TestCase1):
+
     def set_example(self):
         super().set_example()
         self.x = np.random.random((2, 10, 4))
@@ -92,6 +98,7 @@ def setUp(self):
 
 
 class API_TestSplit(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.layers.data('data', shape=[-1, 10], dtype='float32')
@@ -106,6 +113,7 @@ def test_out(self):
 
 
 class API_TestSplit2(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.layers.data('data', shape=[-1, 10], dtype='float32')
@@ -120,6 +128,7 @@ def test_out(self):
 
 
 class API_TestDygraphSplit(unittest.TestCase):
+
     def test_out1(self):
         with fluid.dygraph.guard(paddle.MLUPlace(0)):
             input_1 = np.random.random([4, 6, 6]).astype("int32")
@@ -151,6 +160,7 @@ def test_out2(self):
 
 # attr(axis) is Tensor
 class TestSplitOp_AxisTensor(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.dtype = self.get_dtype()
@@ -186,6 +196,7 @@ def test_check_output(self):
 
 
 class TestSplitOp_SectionsTensor(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.dtype = self.get_dtype()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_squeeze2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_squeeze2_op_mlu.py
new file mode 100755
index 0000000000000..6a555ed63b8f6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_squeeze2_op_mlu.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import sys
+
+sys.path.append("..")
+import numpy as np
+import paddle
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+# Correct: General.
+class TestSqueezeOp(OpTest):
+
+    def setUp(self):
+        self.init_test_case()
+        self.set_mlu()
+        self.op_type = "squeeze2"
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }  # XShape is random data; it is listed in no_check_set below
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, no_check_set=['XShape'])
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out')
+
+    def init_test_case(self):  # subclasses override this to vary the case
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, 2)
+        self.new_shape = (3, 40)  # ori_shape without dims 0 and 2
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes}
+
+
+# Correct: There is a negative axis.
+class TestSqueezeOp1(TestSqueezeOp):
+
+    def init_test_case(self):
+        self.ori_shape = (1, 20, 1, 5)
+        self.axes = (0, -2)  # -2 is dim 2 of the 4-D ori_shape
+        self.new_shape = (20, 5)
+
+
+# Correct: No axes input.
+class TestSqueezeOp2(TestSqueezeOp):
+
+    def init_test_case(self):
+        self.ori_shape = (1, 20, 1, 5)
+        self.axes = ()  # empty axes; new_shape drops both size-1 dims
+        self.new_shape = (20, 5)
+
+
+# Correct: Only part of the axes are squeezed.
+class TestSqueezeOp3(TestSqueezeOp):
+
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)  # -1 is dim 5 of the 6-D ori_shape
+        self.new_shape = (6, 5, 1, 4)  # dim 3 is not in axes, so it stays
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_squeeze_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_squeeze_op_mlu.py
new file mode 100644
index 0000000000000..dc60ab96d207c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_squeeze_op_mlu.py
@@ -0,0 +1,128 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import sys
+
+sys.path.append("..")
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+from op_test import OpTest, convert_float_to_uint16
+import paddle.fluid.core as core
+
+paddle.enable_static()
+
+
+# Correct: General.
+class TestSqueezeOp(OpTest):
+
+    def setUp(self):
+        self.op_type = "squeeze"
+        self.init_test_case()
+        self.set_mlu()
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")}
+        self.init_attrs()
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+        }
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")  # NOTE(review): self.place is not passed
+
+    def init_test_case(self):  # subclasses override this to vary the case
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, 2)
+        self.new_shape = (3, 40)  # ori_shape without dims 0 and 2
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes}
+
+
+class TestSqueezeBF16Op(OpTest):
+
+    def setUp(self):
+        self.op_type = "squeeze"
+        self.dtype = np.uint16  # X and Out go through convert_float_to_uint16
+        self.init_test_case()
+        x = np.random.random(self.ori_shape).astype("float32")
+        out = x.reshape(self.new_shape)
+        self.inputs = {"X": convert_float_to_uint16(x)}
+        self.init_attrs()
+        self.outputs = {"Out": convert_float_to_uint16(out)}
+
+    def test_check_output(self):
+        self.check_output()  # NOTE(review): no MLU place is set in this class
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, 2)
+        self.new_shape = (3, 40)  # ori_shape without dims 0 and 2
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes}
+
+
+# Correct: There is a negative axis.
+class TestSqueezeOp1(TestSqueezeOp):
+
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, -2)  # -2 is dim 2 of the 4-D ori_shape
+        self.new_shape = (3, 40)
+
+
+# Correct: No axes input.
+class TestSqueezeOp2(TestSqueezeOp):
+
+    def init_test_case(self):
+        self.ori_shape = (1, 20, 1, 5)
+        self.axes = ()  # empty axes; new_shape drops both size-1 dims
+        self.new_shape = (20, 5)
+
+
+# Correct: Only part of the axes are squeezed.
+class TestSqueezeOp3(TestSqueezeOp):
+
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)  # -1 is dim 5 of the 6-D ori_shape
+        self.new_shape = (6, 5, 1, 4)  # dim 3 is not in axes, so it stays
+
+
+# Correct: An axis whose dimension is not of size 1 remains unchanged.
+class TestSqueezeOp4(TestSqueezeOp):
+
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, 2)  # dim 1 has size 1; dim 2 has size 5
+        self.new_shape = (6, 5, 1, 4, 1)  # only dim 1 is removed
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py
index e9db14de46ab5..428401408bf92 100755
--- a/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestSum1(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.init_dtype()
@@ -54,6 +56,7 @@ def test_check_output(self):
 
 
 class TestSum2(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.init_dtype()
@@ -66,7 +69,7 @@ def setUp(self):
         x3 = np.random.random((3, 3)).astype(self.dtype)
         self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]}
         # There will be a problem if just using `y=x0+x1+x2+x3` to calculate the
-        # summation result as the reference standard result. The reason is that 
+        # summation result as the reference standard result. The reason is that
         # numpy's fp16 data has precision loss when doing `add` operation.
         # For example, the results of `x0+x1+x2+x3` is different from that of
         # `x3+x2+x1+x0` if the dtype is fp16.
@@ -88,6 +91,7 @@ def test_check_output(self):
 
 
 class TestSum3(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.init_dtype()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py
index a5aeeac0ffb9e..e1023a94bec5f 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestTanh(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "tanh"
@@ -58,6 +60,7 @@ def test_check_grad(self):
 
 
 class TestTanhFp16(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "tanh"
@@ -84,6 +87,7 @@ def test_check_output(self):
 
 
 class TestTanhNet(unittest.TestCase):
+
     def _test(self, run_mlu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -98,8 +102,9 @@ def _test(self, run_mlu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
             d = paddle.tanh(c)
@@ -123,12 +128,13 @@ def _test(self, run_mlu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_top_k_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_top_k_op_mlu.py
index 366f783ce0d2a..33caf2ff52207 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_top_k_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_top_k_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestTopkOp(OpTest):
+
     def setUp(self):
         self.variable_k = False
         self.set_args()
@@ -66,6 +68,7 @@ def test_check_output(self):
 
 
 class TestTopkFP16Op(TestTopkOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py
index 8979344bd4505..57081f1a54564 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest
 import paddle
@@ -40,6 +41,7 @@ def numpy_topk(x, k=1, axis=-1, largest=True):
 
 
 class TestTopkOp(OpTest):
+
     def init_args(self):
         self.k = 3
         self.axis = 1
@@ -55,8 +57,10 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
     def test_check_output(self):
@@ -65,6 +69,7 @@ def test_check_output(self):
 
 
 class TestTopkOp1(TestTopkOp):
+
     def init_args(self):
         self.k = 3
         self.axis = 0
@@ -72,6 +77,7 @@ def init_args(self):
 
 
 class TestTopkOp2(TestTopkOp):
+
     def init_args(self):
         self.k = 4
         self.axis = 0
@@ -79,6 +85,7 @@ def init_args(self):
 
 
 class TestTopkOp3(OpTest):
+
     def init_args(self):
         self.k = 6
         self.axis = 1
@@ -91,12 +98,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopkOp4(TestTopkOp):
+
     def init_args(self):
         self.k = 3
         self.axis = 1
@@ -111,12 +121,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopkOp5(TestTopkOp):
+
     def init_args(self):
         self.k = 3
         self.axis = 1
@@ -131,12 +144,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopkOp6(OpTest):
+
     def init_args(self):
         self.k = 100
         self.axis = 1
@@ -151,12 +167,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopKAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.dtype = np.float32
@@ -195,15 +214,16 @@ def run_dygraph(self, place):
         numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False)
         self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
         self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
-        # test case for basic test case 6 for the partial sort 
+        # test case for basic test case 6 for the partial sort
         paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
         numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
         self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
         self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
-        # test case for basic test case 7 for the unsorted 
+        # test case for basic test case 7 for the unsorted
         paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
-        sort_paddle = numpy_topk(
-            np.array(paddle_result[0].numpy()), axis=1, k=2)
+        sort_paddle = numpy_topk(np.array(paddle_result[0].numpy()),
+                                 axis=1,
+                                 k=2)
         numpy_result = numpy_topk(self.input_data, k=2, axis=1)
         self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
 
@@ -211,10 +231,12 @@ def run_static(self, place):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            input_tensor = paddle.static.data(
-                name="x", shape=[6, 7, 8], dtype="float32")
-            large_input_tensor = paddle.static.data(
-                name="large_x", shape=[2, 1030], dtype="float32")
+            input_tensor = paddle.static.data(name="x",
+                                              shape=[6, 7, 8],
+                                              dtype="float32")
+            large_input_tensor = paddle.static.data(name="large_x",
+                                                    shape=[2, 1030],
+                                                    dtype="float32")
             k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
             result1 = paddle.topk(input_tensor, k=2)
             result2 = paddle.topk(input_tensor, k=2, axis=-1)
@@ -228,17 +250,18 @@ def run_static(self, place):
             exe = paddle.static.Executor(place)
             input_data = np.random.rand(10, 20).astype("float32")
             large_input_data = np.random.rand(2, 100).astype("float32")
-            paddle_result = exe.run(
-                feed={
-                    "x": self.input_data,
-                    "large_x": self.large_input_data,
-                    "k": np.array([2]).astype("int32")
-                },
-                fetch_list=[
-                    result1[0], result1[1], result2[0], result2[1], result3[0],
-                    result3[1], result4[0], result4[1], result5[0], result5[1],
-                    result6[0], result6[1], result7[0], result7[1]
-                ])
+            paddle_result = exe.run(feed={
+                "x": self.input_data,
+                "large_x": self.large_input_data,
+                "k": np.array([2]).astype("int32")
+            },
+                                    fetch_list=[
+                                        result1[0], result1[1], result2[0],
+                                        result2[1], result3[0], result3[1],
+                                        result4[0], result4[1], result5[0],
+                                        result5[1], result6[0], result6[1],
+                                        result7[0], result7[1]
+                                    ])
             numpy_result = numpy_topk(self.input_data, k=2)
             self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
@@ -248,12 +271,16 @@ def run_static(self, place):
             numpy_result = numpy_topk(self.input_data, k=2, axis=1)
             self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=1, largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=1,
+                                      largest=False)
             self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=-1, largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=-1,
+                                      largest=False)
             self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
             numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py
index 6f1bda477f07d..bcb41283de91e 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append('..')
 from op_test import OpTest, convert_float_to_uint16
 import paddle
@@ -28,12 +29,15 @@
 
 
 class TestTransposeOp(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.initKernelType()
         self.initTestCase()
         self.inputs = {'X': np.random.random(self.shape).astype("float32")}
-        self.attrs = {'axis': list(self.axis), }
+        self.attrs = {
+            'axis': list(self.axis),
+        }
         self.outputs = {'Out': self.inputs['X'].transpose(self.axis)}
 
     def init_op_type(self):
@@ -55,71 +59,83 @@ def initKernelType(self):
 
 
 class TestCase0(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (100, )
         self.axis = (0, )
 
 
 class TestCase1(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (3, 4, 10)
         self.axis = (0, 2, 1)
 
 
 class TestCase2(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.axis = (0, 2, 3, 1)
 
 
 class TestCase3(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.axis = (4, 2, 3, 1, 0)
 
 
 class TestCase4(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6, 1)
         self.axis = (4, 2, 3, 1, 0, 5)
 
 
 class TestCase5(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 16, 96)
         self.axis = (0, 2, 1)
 
 
 class TestCase6(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 10, 12, 16)
         self.axis = (3, 1, 2, 0)
 
 
 class TestCase7(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 10, 2, 16)
         self.axis = (0, 1, 3, 2)
 
 
 class TestCase8(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (0, 1, 3, 2, 4, 5, 6, 7)
 
 
 class TestCase9(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
 
 
 class TestTransposeOpBool(TestTransposeOp):
+
     def test_check_grad(self):
         pass
 
 
 class TestTransposeOpBool1D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (100, )
         self.axis = (0, )
@@ -128,6 +144,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool2D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (3, 40)
         self.axis = (1, 0)
@@ -136,6 +153,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool3D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (3, 4, 10)
         self.axis = (0, 2, 1)
@@ -144,6 +162,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool4D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.axis = (0, 2, 3, 1)
@@ -152,6 +171,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool5D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.axis = (4, 2, 3, 1, 0)
@@ -160,6 +180,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool6D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6, 1)
         self.axis = (4, 2, 3, 1, 0, 5)
@@ -168,6 +189,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool7D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3)
         self.axis = (0, 1, 3, 2, 4, 5, 6)
@@ -176,6 +198,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool8D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
@@ -184,6 +207,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -216,6 +240,7 @@ def test_each_elem_value_check():
 
 
 class TestTransposeApi(unittest.TestCase):
+
     def test_static_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
@@ -252,6 +277,7 @@ def test_dygraph_out(self):
 
 
 class TestTAPI(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program()):
             data = fluid.data(shape=[10], dtype="float32", name="data")
@@ -318,6 +344,7 @@ def test_x_dimension_check():
 
 
 class TestMoveAxis(unittest.TestCase):
+
     def test_moveaxis1(self):
         x_np = np.random.randn(2, 3, 4, 5, 7).astype('float32')
         expected = np.moveaxis(x_np, [0, 4, 3, 2], [1, 3, 2, 0])
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py
index 3847b010c144c..70289853e8921 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py
@@ -18,6 +18,7 @@
 import subprocess
 import unittest
 import numpy as np
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -40,6 +41,7 @@ def output_hist(out):
 
 
 class TestMLUUniformRandomOp(OpTest):
+
     def setUp(self):
         self.set_mlu()
         self.op_type = "uniform_random"
@@ -69,12 +71,12 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestMLUUniformRandomOpSelectedRows(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_mlu():
@@ -89,19 +91,17 @@ def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[1000, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10)
+        op = Operator("uniform_random",
+                      Out="X",
+                      shape=[1000, 784],
+                      min=-5.0,
+                      max=10.0,
+                      seed=10)
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze2_op_mlu.py
index 0ed5eb7e8a9bc..0dc498bf6e948 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze2_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze2_op_mlu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import numpy as np
@@ -27,6 +28,7 @@
 
 # Correct: General.
 class TestUnsqueezeOp(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.set_mlu()
@@ -59,6 +61,7 @@ def init_attrs(self):
 
 # Correct: Single input index.
 class TestUnsqueezeOp1(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -67,6 +70,7 @@ def init_test_case(self):
 
 # Correct: Mixed input axis.
 class TestUnsqueezeOp2(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -75,6 +79,7 @@ def init_test_case(self):
 
 # Correct: There is duplicated axis.
 class TestUnsqueezeOp3(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -83,6 +88,7 @@ def init_test_case(self):
 
 # Correct: Reversed axes.
 class TestUnsqueezeOp4(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
@@ -91,6 +97,7 @@ def init_test_case(self):
 
 # axes is a list(with tensor)
 class TestUnsqueezeOp_AxesTensorList(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.set_mlu()
@@ -131,6 +138,7 @@ def init_attrs(self):
 
 
 class TestUnsqueezeOp1_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -138,6 +146,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp2_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -145,6 +154,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp3_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -152,6 +162,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp4_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
@@ -160,6 +171,7 @@ def init_test_case(self):
 
 # axes is a Tensor
 class TestUnsqueezeOp_AxesTensor(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.set_mlu()
@@ -195,6 +207,7 @@ def init_attrs(self):
 
 
 class TestUnsqueezeOp1_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -202,6 +215,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp2_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -209,6 +223,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp3_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -216,6 +231,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp4_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze_op_mlu.py
index d75a2f4d21a28..47ab0c472908a 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze_op_mlu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import numpy as np
@@ -27,6 +28,7 @@
 
 # Correct: General.
 class TestUnsqueezeOp(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.set_mlu()
@@ -56,6 +58,7 @@ def init_attrs(self):
 
 # Correct: Single input index.
 class TestUnsqueezeOp1(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -64,6 +67,7 @@ def init_test_case(self):
 
 # Correct: Mixed input axis.
 class TestUnsqueezeOp2(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -72,6 +76,7 @@ def init_test_case(self):
 
 # Correct: There is duplicated axis.
 class TestUnsqueezeOp3(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -80,6 +85,7 @@ def init_test_case(self):
 
 # Correct: Reversed axes.
 class TestUnsqueezeOp4(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py
index a75a6aa1dfcb9..23ed85926a04e 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import unittest
@@ -25,6 +26,7 @@
 
 
 class TestUnStackOpBase(OpTest):
+
     def initDefaultParameters(self):
         self.input_dim = (5, 6, 7)
         self.axis = 0
@@ -74,21 +76,25 @@ def test_check_grad(self):
 
 
 class TestStackOp3(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = -1
 
 
 class TestStackOp4(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = -3
 
 
 class TestStackOp5(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = 1
 
 
 class TestStackOp6(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = 2
 
diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py
index f999ce803a512..fa6b7200f32f9 100644
--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ b/python/paddle/fluid/tests/unittests/multi_process.py
@@ -44,7 +44,7 @@ def train_abort(prefix):
 
     if trainer_id == 0:
         try:
-            # train abort 
+            # train abort
             exit(1)
         except SystemExit:
             name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
diff --git a/python/paddle/fluid/tests/unittests/my_data_generator.py b/python/paddle/fluid/tests/unittests/my_data_generator.py
index ac906b3256163..00fd636467c2f 100644
--- a/python/paddle/fluid/tests/unittests/my_data_generator.py
+++ b/python/paddle/fluid/tests/unittests/my_data_generator.py
@@ -22,7 +22,9 @@
 
 
 class MyDataset(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             elements = line.strip().split()[0:]
             output = [("show", [int(elements[0])]),
diff --git a/python/paddle/fluid/tests/unittests/new_group.py b/python/paddle/fluid/tests/unittests/new_group.py
index c9c4acc3220c7..56ef510c3047f 100644
--- a/python/paddle/fluid/tests/unittests/new_group.py
+++ b/python/paddle/fluid/tests/unittests/new_group.py
@@ -18,6 +18,7 @@
 
 
 class TestNewGroupAPI(object):
+
     def __init__(self):
         paddle.distributed.init_parallel_env()
         d1 = np.array([1, 2, 3])
@@ -32,26 +33,27 @@ def test_all(self):
 
         tmp = np.array([0, 0, 0])
         result = paddle.to_tensor(tmp)
-        paddle.distributed.scatter(
-            result, [self.tensor2, self.tensor1],
-            src=0,
-            group=gp,
-            use_calc_stream=True)
+        paddle.distributed.scatter(result, [self.tensor2, self.tensor1],
+                                   src=0,
+                                   group=gp,
+                                   use_calc_stream=True)
         if gp.rank == 0:
             assert np.array_equal(result, self.tensor2)
         elif gp.rank == 1:
             assert np.array_equal(result, self.tensor1)
         print("test scatter api ok")
 
-        paddle.distributed.broadcast(
-            result, src=1, group=gp, use_calc_stream=True)
+        paddle.distributed.broadcast(result,
+                                     src=1,
+                                     group=gp,
+                                     use_calc_stream=True)
         assert np.array_equal(result, self.tensor1)
         print("test broadcast api ok")
 
         paddle.distributed.reduce(result, dst=0, group=gp, use_calc_stream=True)
         if gp.rank == 0:
-            assert np.array_equal(result,
-                                  paddle.add(self.tensor1, self.tensor1))
+            assert np.array_equal(result, paddle.add(self.tensor1,
+                                                     self.tensor1))
         elif gp.rank == 1:
             assert np.array_equal(result, self.tensor1)
         print("test reduce api ok")
@@ -67,8 +69,10 @@ def test_all(self):
         print("test wait api ok")
 
         result = []
-        paddle.distributed.all_gather(
-            result, self.tensor1, group=gp, use_calc_stream=True)
+        paddle.distributed.all_gather(result,
+                                      self.tensor1,
+                                      group=gp,
+                                      use_calc_stream=True)
         assert np.array_equal(result[0], self.tensor1)
         assert np.array_equal(result[1], self.tensor1)
         print("test all_gather api ok")
diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
index e9d9af5c11366..7498fa72194d9 100644
--- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
@@ -1,26 +1,32 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-if (WITH_ASCEND_CL)
-    foreach(TEST_OP ${TEST_OPS})
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-    endforeach(TEST_OP)
+if(WITH_ASCEND_CL)
+  foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  endforeach(TEST_OP)
 
-    # NOTE: NPU `get_float_status` read the value from register, During the test,
-    # it is found that this register will be overwritten by any program on the card.
-    # In order to prevent the interference of nan/inf in the other unittests, we
-    # need to set the unittests related to `float_status` to exclusive.
-    set_tests_properties(test_amp_check_finite_and_scale_op_npu PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_flags_check_nan_inf_npu PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
-    set_tests_properties(test_float_status_op_npu PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  # NOTE: NPU `get_float_status` reads its value from a register. During testing,
+  # it was found that this register can be overwritten by any program on the card.
+  # To keep nan/inf from other unittests from interfering, the unittests related
+  # to `float_status` must be marked as exclusive.
+  set_tests_properties(test_amp_check_finite_and_scale_op_npu
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_flags_check_nan_inf_npu
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+  set_tests_properties(test_float_status_op_npu PROPERTIES LABELS
+                                                           "RUN_TYPE=EXCLUSIVE")
 
-    # Note: the following test cases has running time more than 120s
-    set_tests_properties(test_nearest_interp_op_npu PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_nearest_interp_v2_op_npu PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_bilinear_interp_v2_op_npu PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300)
-    set_tests_properties(test_elementwise_add_op_npu PROPERTIES TIMEOUT 200)
+  # Note: the following test cases have a running time of more than 120s
+  set_tests_properties(test_nearest_interp_op_npu PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_nearest_interp_v2_op_npu PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_bilinear_interp_v2_op_npu PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_elementwise_add_op_npu PROPERTIES TIMEOUT 200)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/npu/collective_identity_op_npu.py b/python/paddle/fluid/tests/unittests/npu/collective_identity_op_npu.py
index a85bd4fccc3a7..d5e8a5dd55a38 100644
--- a/python/paddle/fluid/tests/unittests/npu/collective_identity_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/collective_identity_op_npu.py
@@ -38,6 +38,7 @@
 
 
 class TestCollectiveIdentity(TestCollectiveRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -45,20 +46,22 @@ def get_model(self, main_prog, startup_program):
         ring_id = 0
         nranks = 2
         with fluid.program_guard(main_prog, startup_program):
-            tindata = layers.data(
-                name="tindata", shape=[10, 1000], dtype='float32')
+            tindata = layers.data(name="tindata",
+                                  shape=[10, 1000],
+                                  dtype='float32')
             toutdata = main_prog.current_block().create_var(
                 name="outofgather",
                 dtype='float32',
                 type=core.VarDesc.VarType.LOD_TENSOR,
                 persistable=False,
                 stop_gradient=False)
-            main_prog.global_block().append_op(
-                type="c_identity",
-                inputs={'X': tindata},
-                outputs={'Out': toutdata},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks})
+            main_prog.global_block().append_op(type="c_identity",
+                                               inputs={'X': tindata},
+                                               outputs={'Out': toutdata},
+                                               attrs={
+                                                   'ring_id': ring_id,
+                                                   'nranks': nranks
+                                               })
             return toutdata
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py
index 37a24885be1bf..88ab49ea9adff 100644
--- a/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py
+++ b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py
@@ -39,6 +39,7 @@ def init_process_group(strategy=None):
 
 
 class TestProcessGroupFp32(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2022)
         random.seed(2022)
@@ -234,6 +235,7 @@ def test_create_process_group_nccl(self):
 
 
 class TestProcessGroupFp16(TestProcessGroupFp32):
+
     def setUp(self):
         paddle.seed(2022)
         random.seed(2022)
diff --git a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py
index 361efebce9175..dd2868af0fea0 100644
--- a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py
@@ -18,6 +18,7 @@
 import argparse
 import os
 import sys
+
 sys.path.append("..")
 import signal
 import time
@@ -43,6 +44,7 @@
 
 
 class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -66,11 +68,10 @@ def get_model(self,
         use_cudnn = False
         with fluid.unique_name.guard():
             with fluid.program_guard(main, startup):
-                data = fluid.layers.data(
-                    name='input',
-                    shape=self.dshape,
-                    dtype=self.dtype,
-                    append_batch_size=False)
+                data = fluid.layers.data(name='input',
+                                         shape=self.dshape,
+                                         dtype=self.dtype,
+                                         append_batch_size=False)
                 conv = fluid.layers.conv2d(
                     input=data,
                     num_filters=32,
diff --git a/python/paddle/fluid/tests/unittests/npu/test_abs_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_abs_op_npu.py
index 3c16a24b33191..48a0761098353 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_abs_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_abs_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestNPUAbs(OpTest):
+
     def setUp(self):
         self.op_type = "abs"
         self.set_npu()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py
index 0f55c8b591487..8c7d6fcfb3eee 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestAccuracy(OpTest):
+
     def setUp(self):
         self.op_type = "accuracy"
         self.set_npu()
@@ -62,6 +64,7 @@ def test_check_output(self):
 
 
 class TestAccuracy2(TestAccuracy):
+
     def setUp(self):
         self.op_type = "accuracy"
         self.set_npu()
@@ -86,6 +89,7 @@ def setUp(self):
 
 
 class TestAccuracyType(TestAccuracy):
+
     def setUp(self):
         self.op_type = "accuracy"
         self.set_npu()
@@ -110,6 +114,7 @@ def setUp(self):
 
 
 class TestAccuracyType2(TestAccuracy):
+
     def setUp(self):
         self.op_type = "accuracy"
         self.set_npu()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py
index 4899938766fbc..92cd3025b07e4 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py
@@ -15,6 +15,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestAdam(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -78,6 +80,7 @@ def test_check_output(self):
 
 
 class TestAdamWithEpsilonTensor(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -132,6 +135,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithSkipUpdate(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -184,6 +188,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithGlobalBetaPow(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -241,6 +246,7 @@ def test_check_output(self):
 
 
 class TestNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -255,8 +261,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -280,12 +287,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
@@ -301,6 +309,7 @@ def test_npu(self):
 
 
 class TestNetWithEpsilonTensor(unittest.TestCase):
+
     def _test(self,
               place,
               use_tensor=True,
@@ -331,8 +340,9 @@ def _test(self,
             with paddle.utils.unique_name.guard():
                 a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
                 b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
-                label = paddle.static.data(
-                    name="label", shape=[2, 1], dtype='int64')
+                label = paddle.static.data(name="label",
+                                           shape=[2, 1],
+                                           dtype='int64')
 
                 sum = paddle.add(a, b)
                 z = paddle.pow(sum, 2.0)
@@ -378,12 +388,11 @@ def _test(self,
                             align_size=256,
                             grad_clip=clip)
                     else:
-                        adam = paddle.optimizer.Adam(
-                            learning_rate=0.01,
-                            beta1=beta1,
-                            beta2=beta2,
-                            epsilon=epsilon,
-                            grad_clip=clip)
+                        adam = paddle.optimizer.Adam(learning_rate=0.01,
+                                                     beta1=beta1,
+                                                     beta2=beta2,
+                                                     epsilon=epsilon,
+                                                     grad_clip=clip)
                 else:
                     if use_fluid_api:
                         adam = fluid.optimizer.Adam(
@@ -396,12 +405,11 @@ def _test(self,
                             align_size=256,
                             grad_clip=clip)
                     else:
-                        adam = fluid.optimizer.Adam(
-                            learning_rate=0.01,
-                            beta1=beta1_init,
-                            beta2=beta2_init,
-                            epsilon=epsilon_init,
-                            grad_clip=clip)
+                        adam = fluid.optimizer.Adam(learning_rate=0.01,
+                                                    beta1=beta1_init,
+                                                    beta2=beta2_init,
+                                                    epsilon=epsilon_init,
+                                                    grad_clip=clip)
 
                 adam.minimize(loss)
 
@@ -412,12 +420,13 @@ def _test(self,
 
             print("Start run on {}".format(place))
             for epoch in range(10):
-                pred_res, loss_res = exe.run(
-                    main_prog,
-                    feed={"a": a_np,
-                          "b": b_np,
-                          "label": label_np},
-                    fetch_list=[prediction, loss])
+                pred_res, loss_res = exe.run(main_prog,
+                                             feed={
+                                                 "a": a_np,
+                                                 "b": b_np,
+                                                 "label": label_np
+                                             },
+                                             fetch_list=[prediction, loss])
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
             paddle.disable_static()
@@ -431,9 +440,10 @@ def _test_with_place(self, place):
             for use_fluid_api in [True, False]:
                 for use_global_beta_pow in [True, False]:
                     for flatten_param_grads in [True, False]:
-                        pred, loss = self._test(
-                            place, use_tensor, use_fluid_api,
-                            use_global_beta_pow, flatten_param_grads)
+                        pred, loss = self._test(place, use_tensor,
+                                                use_fluid_api,
+                                                use_global_beta_pow,
+                                                flatten_param_grads)
                         preds.append(pred)
                         losses.append(loss)
         for pred in preds:
diff --git a/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py
index 78ee572d11fee..8a0966339e871 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py
@@ -15,6 +15,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestAdamW(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -84,6 +86,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithSkipUpdate(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -136,6 +139,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithoutDecay(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -188,6 +192,7 @@ def test_check_output(self):
 
 
 class TestNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -202,8 +207,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -227,12 +233,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
index 604eb32db0a6c..d67b10845799b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestCheckFiniteAndUnscale(unittest.TestCase):
+
     def get_prog(self):
         paddle.enable_static()
         main_program = paddle.static.Program()
@@ -33,8 +35,9 @@ def get_prog(self):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
             scale = paddle.static.data(name="scale", shape=[1], dtype='float32')
-            float_status = paddle.static.data(
-                name="status", shape=[8], dtype='float32')
+            float_status = paddle.static.data(name="status",
+                                              shape=[8],
+                                              dtype='float32')
             main_program.global_block().append_op(
                 type="alloc_float_status",
                 outputs={"FloatStatus": float_status})
@@ -43,8 +46,9 @@ def get_prog(self):
                 inputs={"FloatStatus": float_status},
                 outputs={"FloatStatusOut": float_status})
             c = paddle.fluid.layers.elementwise_div(a, b)
-            out, found_inf = check_finite_and_unscale(
-                [c], scale, float_status=float_status)
+            out, found_inf = check_finite_and_unscale([c],
+                                                      scale,
+                                                      float_status=float_status)
 
         return main_program, out, found_inf, float_status
 
@@ -54,9 +58,11 @@ def run_prog(self, a, b, scale):
         exe = fluid.Executor(place)
         out_, founf_inf_, float_status_ = exe.run(
             main_program,
-            feed={"a": a,
-                  "b": b,
-                  "scale": scale},
+            feed={
+                "a": a,
+                "b": b,
+                "scale": scale
+            },
             fetch_list=[out, found_inf, float_status])
         print(float_status_)
         return out_, founf_inf_
@@ -94,6 +100,7 @@ def test_not_contains_nan_inf(self):
 
 
 class TestCheckFiniteAndUnscaleClearFloatStatus(unittest.TestCase):
+
     def get_prog(self):
         paddle.enable_static()
         main_program = paddle.static.Program()
@@ -101,8 +108,9 @@ def get_prog(self):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
             scale = paddle.static.data(name="scale", shape=[1], dtype='float32')
-            float_status = paddle.static.data(
-                name="status", shape=[8], dtype='float32')
+            float_status = paddle.static.data(name="status",
+                                              shape=[8],
+                                              dtype='float32')
             main_program.global_block().append_op(
                 type="alloc_float_status",
                 outputs={"FloatStatus": float_status})
@@ -111,8 +119,9 @@ def get_prog(self):
                 inputs={"FloatStatus": float_status},
                 outputs={"FloatStatusOut": float_status})
             c = paddle.fluid.layers.elementwise_div(a, b)
-            out, found_inf = check_finite_and_unscale(
-                [c], scale, float_status=float_status)
+            out, found_inf = check_finite_and_unscale([c],
+                                                      scale,
+                                                      float_status=float_status)
             main_program.global_block().append_op(
                 type="alloc_float_status",
                 outputs={"FloatStatus": float_status})
@@ -121,8 +130,9 @@ def get_prog(self):
                 inputs={"FloatStatus": float_status},
                 outputs={"FloatStatusOut": float_status})
             d = paddle.fluid.layers.elementwise_add(a, b)
-            out, found_inf = check_finite_and_unscale(
-                [d], scale, float_status=float_status)
+            out, found_inf = check_finite_and_unscale([d],
+                                                      scale,
+                                                      float_status=float_status)
 
         return main_program, out, found_inf, float_status
 
@@ -132,9 +142,11 @@ def run_prog(self, a, b, scale):
         exe = fluid.Executor(place)
         out_, founf_inf_, float_status_ = exe.run(
             main_program,
-            feed={"a": a,
-                  "b": b,
-                  "scale": scale},
+            feed={
+                "a": a,
+                "b": b,
+                "scale": scale
+            },
             fetch_list=[out, found_inf, float_status])
         print(float_status_)
         return out_, founf_inf_
diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py
index c6135383721e1..12da1794e4cab 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class BaseTestCase(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -52,6 +54,7 @@ def test_check_output(self):
 
 # test argmax, dtype: float16
 class TestArgMaxFloat16Case1(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -60,6 +63,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case2(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -68,6 +72,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case3(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -76,6 +81,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case4(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -84,6 +90,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case5(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -92,6 +99,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case6(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -100,6 +108,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case7(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -108,6 +117,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case8(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (1, )
@@ -116,6 +126,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case9(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (2, )
@@ -124,6 +135,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat16Case10(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, )
@@ -133,6 +145,7 @@ def initTestCase(self):
 
 # test argmax, dtype: float32
 class TestArgMaxFloat32Case1(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -141,6 +154,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case2(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -149,6 +163,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case3(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -157,6 +172,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case4(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -165,6 +181,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case5(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -173,6 +190,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case6(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -181,6 +199,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case7(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -189,6 +208,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case8(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (1, )
@@ -197,6 +217,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case9(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (2, )
@@ -205,6 +226,7 @@ def initTestCase(self):
 
 
 class TestArgMaxFloat32Case10(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, )
@@ -213,6 +235,7 @@ def initTestCase(self):
 
 
 class BaseTestComplex1_1(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -233,8 +256,7 @@ def setUp(self):
             'dtype': int(core.VarDesc.VarType.INT32)
         }
         self.outputs = {
-            'Out': np.argmax(
-                self.x, axis=self.axis).astype("int32")
+            'Out': np.argmax(self.x, axis=self.axis).astype("int32")
         }
 
     def test_check_output(self):
@@ -242,6 +264,7 @@ def test_check_output(self):
 
 
 class BaseTestComplex1_2(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -262,8 +285,7 @@ def setUp(self):
             'dtype': int(core.VarDesc.VarType.INT32)
         }
         self.outputs = {
-            'Out': np.argmax(
-                self.x, axis=self.axis).astype("int32")
+            'Out': np.argmax(self.x, axis=self.axis).astype("int32")
         }
 
     def test_check_output(self):
@@ -271,6 +293,7 @@ def test_check_output(self):
 
 
 class TestArgMaxAPI(unittest.TestCase):
+
     def initTestCase(self):
         self.dims = (3, 4, 5)
         self.dtype = 'float32'
@@ -282,6 +305,7 @@ def setUp(self):
         self.place = [paddle.NPUPlace(0)]
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -289,8 +313,8 @@ def run(place):
             tensor_input = paddle.to_tensor(numpy_input)
             numpy_output = np.argmax(numpy_input, axis=self.axis)
             paddle_output = paddle.argmax(tensor_input, axis=self.axis)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             paddle.enable_static()
 
         for place in self.place:
@@ -298,6 +322,7 @@ def run(place):
 
 
 class TestArgMaxAPI_2(unittest.TestCase):
+
     def initTestCase(self):
         self.dims = (3, 4, 5)
         self.dtype = 'float32'
@@ -310,17 +335,19 @@ def setUp(self):
         self.place = [paddle.NPUPlace(0)]
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
             numpy_input = (np.random.random(self.dims)).astype(self.dtype)
             tensor_input = paddle.to_tensor(numpy_input)
-            numpy_output = np.argmax(
-                numpy_input, axis=self.axis).reshape(1, 4, 5)
-            paddle_output = paddle.argmax(
-                tensor_input, axis=self.axis, keepdim=self.keep_dims)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            numpy_output = np.argmax(numpy_input,
+                                     axis=self.axis).reshape(1, 4, 5)
+            paddle_output = paddle.argmax(tensor_input,
+                                          axis=self.axis,
+                                          keepdim=self.keep_dims)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
 
@@ -329,6 +356,7 @@ def run(place):
 
 
 class TestArgMaxAPI_3(unittest.TestCase):
+
     def initTestCase(self):
         self.dims = (1, 9)
         self.dtype = 'float32'
@@ -339,6 +367,7 @@ def setUp(self):
         self.place = [paddle.NPUPlace(0)]
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -346,8 +375,8 @@ def run(place):
             tensor_input = paddle.to_tensor(numpy_input)
             numpy_output = np.argmax(numpy_input).reshape([1])
             paddle_output = paddle.argmax(tensor_input)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py
index 455f92b8ed6cf..b129c673c32ee 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class BaseTestCase(OpTest):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -51,6 +53,7 @@ def test_check_output(self):
 
 # test argmin, dtype: float16
 class TestArgMinFloat16Case1(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -59,6 +62,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case2(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -67,6 +71,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case3(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -75,6 +80,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case4(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -83,6 +89,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case5(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -91,6 +98,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case6(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -99,6 +107,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case7(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -107,6 +116,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case8(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (1, )
@@ -115,6 +125,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case9(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (2, )
@@ -123,6 +134,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat16Case10(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, )
@@ -132,6 +144,7 @@ def initTestCase(self):
 
 # test argmin, dtype: float32
 class TestArgMinFloat32Case1(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -140,6 +153,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case2(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -148,6 +162,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case3(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -156,6 +171,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case4(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -164,6 +180,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case5(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -172,6 +189,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case6(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -180,6 +198,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case7(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -188,6 +207,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case8(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (1, )
@@ -196,6 +216,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case9(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (2, )
@@ -204,6 +225,7 @@ def initTestCase(self):
 
 
 class TestArgMinFloat32Case10(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, )
@@ -212,6 +234,7 @@ def initTestCase(self):
 
 
 class TestArgMinAPI(unittest.TestCase):
+
     def initTestCase(self):
         self.dims = (3, 4, 5)
         self.dtype = 'float32'
@@ -223,6 +246,7 @@ def setUp(self):
         self.place = [paddle.NPUPlace(0)]
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -230,8 +254,8 @@ def run(place):
             tensor_input = paddle.to_tensor(numpy_input)
             numpy_output = np.argmin(numpy_input, axis=self.axis)
             paddle_output = paddle.argmin(tensor_input, axis=self.axis)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             paddle.enable_static()
 
         for place in self.place:
@@ -239,6 +263,7 @@ def run(place):
 
 
 class TestArgMaxAPI_2(unittest.TestCase):
+
     def initTestCase(self):
         self.dims = (3, 4, 5)
         self.dtype = 'float32'
@@ -251,17 +276,19 @@ def setUp(self):
         self.place = [paddle.NPUPlace(0)]
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
             numpy_input = (np.random.random(self.dims)).astype(self.dtype)
             tensor_input = paddle.to_tensor(numpy_input)
-            numpy_output = np.argmin(
-                numpy_input, axis=self.axis).reshape(1, 4, 5)
-            paddle_output = paddle.argmin(
-                tensor_input, axis=self.axis, keepdim=self.keep_dims)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            numpy_output = np.argmin(numpy_input,
+                                     axis=self.axis).reshape(1, 4, 5)
+            paddle_output = paddle.argmin(tensor_input,
+                                          axis=self.axis,
+                                          keepdim=self.keep_dims)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py
index ebabea93dd05f..59a5f35c99e98 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -32,6 +33,7 @@
 
 
 class TestArgsortOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "argsort"
@@ -50,11 +52,9 @@ def setUp(self):
     def get_output(self):
         if self.descending:
             self.indices = np.flip(
-                np.argsort(
-                    self.x, kind='heapsort', axis=self.axis), self.axis)
+                np.argsort(self.x, kind='heapsort', axis=self.axis), self.axis)
             self.sorted_x = np.flip(
-                np.sort(
-                    self.x, kind='heapsort', axis=self.axis), self.axis)
+                np.sort(self.x, kind='heapsort', axis=self.axis), self.axis)
         else:
             self.indices = np.argsort(self.x, kind='heapsort', axis=self.axis)
             self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis)
@@ -80,68 +80,80 @@ def init_direction(self):
 
 
 class TestArgsortOpAxis0NPU(TestArgsortOp):
+
     def init_axis(self):
         self.axis = 0
 
 
 class TestArgsortOpAxis1NPU(TestArgsortOp):
+
     def init_axis(self):
         self.axis = 1
 
 
 class TestArgsortOpAxis2NPU(TestArgsortOp):
+
     def init_axis(self):
         self.axis = 2
 
 
 class TestArgsortOpAxisNeg1NPU(TestArgsortOp):
+
     def init_axis(self):
         self.axis = -1
 
 
 class TestArgsortOpAxisNeg2NPU(TestArgsortOp):
+
     def init_axis(self):
         self.axis = -2
 
 
 class TestArgsortOpDescendingAxisNPU(TestArgsortOp):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis0NPU(TestArgsortOpAxis0NPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis1NPU(TestArgsortOpAxis1NPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis2NPU(TestArgsortOpAxis2NPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg1NPU(TestArgsortOpAxisNeg1NPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg2NPU(TestArgsortOpAxisNeg2NPU):
+
     def init_direction(self):
         self.descending = True
 
 
-# liurui25: argsort of npu has bug with type fp32, 
-# it will change the type from fp32 to fp16, 
+# liurui25: argsort of npu has bug with type fp32,
+# it will change the type from fp32 to fp16,
 # so the check_output_with_place add thw atol
 # this test is only used to test the grad
 # issue： https://gitee.com/ascend/modelzoo/issues/I44I7K
 
 
 class TestArgsortOpAxis0NPUFP32(TestArgsortOp):
+
     def init_axis(self):
         self.axis = 0
 
@@ -155,62 +167,74 @@ def set_npu(self):
         self.__class__.use_npu = True
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ["X"], "Out", max_relative_error=0.03)
+        self.check_grad_with_place(self.place, ["X"],
+                                   "Out",
+                                   max_relative_error=0.03)
 
 
 class TestArgsortOpAxis1NPUFP32(TestArgsortOpAxis0NPUFP32):
+
     def init_axis(self):
         self.axis = 1
 
 
 class TestArgsortOpAxis2NPUFP32(TestArgsortOpAxis0NPUFP32):
+
     def init_axis(self):
         self.axis = 2
 
 
 class TestArgsortOpAxisNeg1NPUFP32(TestArgsortOpAxis0NPUFP32):
+
     def init_axis(self):
         self.axis = -1
 
 
 class TestArgsortOpAxisNeg2NPUFP32(TestArgsortOpAxis0NPUFP32):
+
     def init_axis(self):
         self.axis = -2
 
 
 class TestArgsortOpDescendingAxisNPUFP32(TestArgsortOpAxis0NPUFP32):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis0NPUFP32(TestArgsortOpAxis0NPUFP32):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis1NPUFP32(TestArgsortOpAxis1NPUFP32):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis2NPUFP32(TestArgsortOpAxis2NPUFP32):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg1NPUFP32(TestArgsortOpAxisNeg1NPUFP32):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg2NPUFP32(TestArgsortOpAxisNeg2NPUFP32):
+
     def init_direction(self):
         self.descending = True
 
 
 # test cases for int64
 class TestArgsortOpAxis0NPUINT64(TestArgsortOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "argsort"
@@ -220,9 +244,10 @@ def setUp(self):
         self.init_axis()
         self.init_direction()
 
-        self.x = np.random.randint(
-            low=-100, high=100, size=self.input_shape,
-            dtype=self.dtype).astype(self.dtype)
+        self.x = np.random.randint(low=-100,
+                                   high=100,
+                                   size=self.input_shape,
+                                   dtype=self.dtype).astype(self.dtype)
         self.inputs = {"X": self.x}
         self.attrs = {"axis": self.axis, "descending": self.descending}
         self.get_output()
@@ -242,51 +267,61 @@ def set_npu(self):
 
 
 class TestArgsortOpAxis1NPUINT64(TestArgsortOpAxis0NPUINT64):
+
     def init_axis(self):
         self.axis = 1
 
 
 class TestArgsortOpAxis2NPUINT64(TestArgsortOpAxis0NPUINT64):
+
     def init_axis(self):
         self.axis = 2
 
 
 class TestArgsortOpAxisNeg1NPUINT64(TestArgsortOpAxis0NPUINT64):
+
     def init_axis(self):
         self.axis = -1
 
 
 class TestArgsortOpAxisNeg2NPUINT64(TestArgsortOpAxis0NPUINT64):
+
     def init_axis(self):
         self.axis = -2
 
 
 class TestArgsortOpDescendingAxisNPUINT64(TestArgsortOpAxis0NPUINT64):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis0NPUINT64(TestArgsortOpAxis0NPUINT64):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis1NPUINT64(TestArgsortOpAxis1NPUINT64):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis2NPUINT64(TestArgsortOpAxis2NPUINT64):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg1NPUINT64(TestArgsortOpAxisNeg1NPUINT64):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg2NPUINT64(TestArgsortOpAxisNeg2NPUINT64):
+
     def init_direction(self):
         self.descending = True
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_assign_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_assign_op_npu.py
index 14133d5a385ff..a070a63e7ede6 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_assign_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_assign_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestAssign(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
index 71d4b45e61b18..808996d355fa0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy
 import sys
+
 sys.path.append("..")
 
 import op_test
@@ -30,6 +31,7 @@
 
 
 class TestAssignValueNPUOp(op_test.OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -56,31 +58,35 @@ def test_forward(self):
 
 
 class TestAssignValueNPUOp2(TestAssignValueNPUOp):
+
     def init_data(self):
         self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32)
         self.attrs["int32_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignValueNPUOp3(TestAssignValueNPUOp):
+
     def init_data(self):
         self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64)
         self.attrs["int64_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignValueNPUOp4(TestAssignValueNPUOp):
+
     def init_data(self):
-        self.value = numpy.random.choice(
-            a=[False, True], size=(2, 5)).astype(numpy.bool)
+        self.value = numpy.random.choice(a=[False, True],
+                                         size=(2, 5)).astype(numpy.bool)
         self.attrs["bool_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignApi(unittest.TestCase):
+
     def setUp(self):
         self.init_dtype()
-        self.value = (
-            -100 + 200 * numpy.random.random(size=(2, 5))).astype(self.dtype)
-        self.place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu(
-        ) else fluid.CPUPlace()
+        self.value = (-100 + 200 * numpy.random.random(size=(2, 5))).astype(
+            self.dtype)
+        self.place = fluid.NPUPlace(
+            0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace()
 
     def init_dtype(self):
         self.dtype = "float32"
@@ -93,29 +99,31 @@ def test_assign(self):
 
         exe = fluid.Executor(self.place)
         [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x])
-        self.assertTrue(
-            numpy.array_equal(fetched_x, self.value),
-            "fetch_x=%s val=%s" % (fetched_x, self.value))
+        self.assertTrue(numpy.array_equal(fetched_x, self.value),
+                        "fetch_x=%s val=%s" % (fetched_x, self.value))
         self.assertEqual(fetched_x.dtype, self.value.dtype)
 
 
 class TestAssignApi2(TestAssignApi):
+
     def init_dtype(self):
         self.dtype = "int32"
 
 
 class TestAssignApi3(TestAssignApi):
+
     def init_dtype(self):
         self.dtype = "int64"
 
 
 class TestAssignApi4(TestAssignApi):
+
     def setUp(self):
         self.init_dtype()
-        self.value = numpy.random.choice(
-            a=[False, True], size=(2, 5)).astype(numpy.bool)
-        self.place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu(
-        ) else fluid.CPUPlace()
+        self.value = numpy.random.choice(a=[False, True],
+                                         size=(2, 5)).astype(numpy.bool)
+        self.place = fluid.NPUPlace(
+            0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace()
 
     def init_dtype(self):
         self.dtype = "bool"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py
index a18b8a03075ef..b06481d3f7350 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestAtan(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "atan"
@@ -74,11 +76,13 @@ def test_check_output(self):
 
 
 class TestAtanShape(TestAtan):
+
     def set_attrs(self):
         self.shape = [12, 23, 10]
 
 
 class TestAtanFloat16(TestAtan):
+
     def set_attrs(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py
index e01b2b691a28a..c6b7fada1fb39 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 import paddle
 import paddle.fluid.core as core
@@ -32,6 +33,7 @@
 
 
 class TestBatchNormOpInference(unittest.TestCase):
+
     def setUp(self):
         self.dtype = np.float32
         self.init_kernel_type()
@@ -81,8 +83,9 @@ def check_with_place(self, place, data_layout, dtype, shape):
         with fluid.program_guard(program):
             block = program.global_block()
             for name in ground_truth:
-                block.create_var(
-                    name=name, dtype="float32", shape=ground_truth[name].shape)
+                block.create_var(name=name,
+                                 dtype="float32",
+                                 shape=ground_truth[name].shape)
             inputs = {
                 "X": block.var("x"),
                 "Scale": block.var("scale"),
@@ -106,8 +109,10 @@ def check_with_place(self, place, data_layout, dtype, shape):
             }
             block.create_var(name="reserve_space", dtype='float32')
             outputs["ReserveSpace"] = block.var('reserve_space')
-            bn_op = block.append_op(
-                type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+            bn_op = block.append_op(type="batch_norm",
+                                    inputs=inputs,
+                                    outputs=outputs,
+                                    attrs=attrs)
 
             program._sync_with_cpp()
 
@@ -132,6 +137,7 @@ def init_kernel_type(self):
 
 
 class TestFP16BatchNormOpInference(TestBatchNormOpInference):
+
     def setUp(self):
         self.dtype = np.float16
         self.init_kernel_type()
@@ -139,6 +145,7 @@ def setUp(self):
 
 
 class TestBatchNormOpTraining(unittest.TestCase):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -177,8 +184,9 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
         variance_out = var_ref * (1. - momentum) + momentum * variance
         saved_variance = 1. / np.sqrt(var_ref + epsilon)
         # run backward
-        x_grad, scale_grad, bias_grad = _reference_grad(
-            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
+        x_grad, scale_grad, bias_grad = _reference_grad(x, y_grad, scale,
+                                                        saved_mean, var_ref,
+                                                        epsilon, data_layout)
 
         return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
@@ -194,6 +202,7 @@ def set_mean_variance(self, scale_shape, x, data_layout):
         return mean, variance
 
     def test_forward_backward(self):
+
         def test_with_place(place, data_layout, shape):
             # attr
             epsilon = self.epsilon
@@ -246,10 +255,9 @@ def test_with_place(place, data_layout, shape):
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
                 inputs = {
                     "X": block.var('x'),
                     "Scale": block.var('scale'),
@@ -279,11 +287,10 @@ def test_with_place(place, data_layout, shape):
                 }
                 block.create_var(name="reserve_space", dtype='float32')
                 outputs["ReserveSpace"] = block.var('reserve_space')
-                bn_op = block.append_op(
-                    type="batch_norm",
-                    inputs=inputs,
-                    outputs=outputs,
-                    attrs=attrs)
+                bn_op = block.append_op(type="batch_norm",
+                                        inputs=inputs,
+                                        outputs=outputs,
+                                        attrs=attrs)
                 block.create_var(name='y@GRAD', dtype=self.dtype, shape=y.shape)
 
                 # generate backward op_desc
@@ -315,8 +322,10 @@ def test_with_place(place, data_layout, shape):
 
             for id, name in enumerate(self.fetch_list):
                 if name == 'variance':
-                    self.__assert_close(
-                        var_dict[name], out[id], name, atol=1e-3)
+                    self.__assert_close(var_dict[name],
+                                        out[id],
+                                        name,
+                                        atol=1e-3)
                     continue
                 self.__assert_close(var_dict[name], out[id], name)
             print("op test forward passed: ", str(place), data_layout)
@@ -330,11 +339,13 @@ def init_kernel_type(self):
 
 
 class TestFP16BatchNormOpTraining(TestBatchNormOpTraining):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -342,6 +353,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_momentum_variable = True
         self.use_global_stats = False
@@ -353,6 +365,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = True
         self.no_grad_set = set()
@@ -436,6 +449,7 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
 
 class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
         TestBatchNormOpFreezeStatsTraining):
+
     def init_test_case(self):
         self.use_global_stats = True
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -443,6 +457,7 @@ def init_test_case(self):
 
 
 class TestDygraphBatchNormTrainableStats(unittest.TestCase):
+
     def test_dygraph(self):
         places = [fluid.NPUPlace(0)]
         for p in places:
diff --git a/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py b/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py
index 7c3d32647aea9..b7a5cd2405e60 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py
@@ -19,6 +19,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 
@@ -33,23 +34,27 @@ def test_static_layer(place,
     prog = paddle.static.Program()
     startup_prog = paddle.static.Program()
     with paddle.static.program_guard(prog, startup_prog):
-        input = paddle.fluid.data(
-            name='input', shape=input_np.shape, dtype='float32')
-        label = paddle.fluid.data(
-            name='label', shape=label_np.shape, dtype='float32')
+        input = paddle.fluid.data(name='input',
+                                  shape=input_np.shape,
+                                  dtype='float32')
+        label = paddle.fluid.data(name='label',
+                                  shape=label_np.shape,
+                                  dtype='float32')
         if weight_np is not None:
-            weight = paddle.fluid.data(
-                name='weight', shape=weight_np.shape, dtype='float32')
-            bce_loss = paddle.nn.loss.BCELoss(
-                weight=weight, reduction=reduction)
+            weight = paddle.fluid.data(name='weight',
+                                       shape=weight_np.shape,
+                                       dtype='float32')
+            bce_loss = paddle.nn.loss.BCELoss(weight=weight,
+                                              reduction=reduction)
         else:
             bce_loss = paddle.nn.loss.BCELoss(reduction=reduction)
         res = bce_loss(input, label)
         exe = paddle.static.Executor(place)
         static_result = exe.run(prog,
-                                feed={"input": input_np,
-                                      "label": label_np}
-                                if weight_np is None else {
+                                feed={
+                                    "input": input_np,
+                                    "label": label_np
+                                } if weight_np is None else {
                                     "input": input_np,
                                     "label": label_np,
                                     "weight": weight_np
@@ -66,23 +71,30 @@ def test_static_functional(place,
     prog = paddle.static.Program()
     startup_prog = paddle.static.Program()
     with paddle.static.program_guard(prog, startup_prog):
-        input = paddle.fluid.data(
-            name='input', shape=input_np.shape, dtype='float32')
-        label = paddle.fluid.data(
-            name='label', shape=label_np.shape, dtype='float32')
+        input = paddle.fluid.data(name='input',
+                                  shape=input_np.shape,
+                                  dtype='float32')
+        label = paddle.fluid.data(name='label',
+                                  shape=label_np.shape,
+                                  dtype='float32')
         if weight_np is not None:
-            weight = paddle.fluid.data(
-                name='weight', shape=weight_np.shape, dtype='float32')
-            res = paddle.nn.functional.binary_cross_entropy(
-                input, label, weight=weight, reduction=reduction)
+            weight = paddle.fluid.data(name='weight',
+                                       shape=weight_np.shape,
+                                       dtype='float32')
+            res = paddle.nn.functional.binary_cross_entropy(input,
+                                                            label,
+                                                            weight=weight,
+                                                            reduction=reduction)
         else:
-            res = paddle.nn.functional.binary_cross_entropy(
-                input, label, reduction=reduction)
+            res = paddle.nn.functional.binary_cross_entropy(input,
+                                                            label,
+                                                            reduction=reduction)
         exe = paddle.static.Executor(place)
         static_result = exe.run(prog,
-                                feed={"input": input_np,
-                                      "label": label_np}
-                                if weight_np is None else {
+                                feed={
+                                    "input": input_np,
+                                    "label": label_np
+                                } if weight_np is None else {
                                     "input": input_np,
                                     "label": label_np,
                                     "weight": weight_np
@@ -119,11 +131,14 @@ def test_dygraph_functional(place,
 
     if weight_np is not None:
         weight = paddle.to_tensor(weight_np)
-        dy_res = paddle.nn.functional.binary_cross_entropy(
-            input, label, weight=weight, reduction=reduction)
+        dy_res = paddle.nn.functional.binary_cross_entropy(input,
+                                                           label,
+                                                           weight=weight,
+                                                           reduction=reduction)
     else:
-        dy_res = paddle.nn.functional.binary_cross_entropy(
-            input, label, reduction=reduction)
+        dy_res = paddle.nn.functional.binary_cross_entropy(input,
+                                                           label,
+                                                           reduction=reduction)
     dy_result = dy_res.numpy()
     paddle.enable_static()
     return dy_result
@@ -148,6 +163,7 @@ def calc_bceloss(input_np, label_np, reduction='mean', weight_np=None):
 
 
 class TestBCELoss(unittest.TestCase):
+
     def test_BCELoss(self):
         input_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float32)
         label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float32)
@@ -165,8 +181,8 @@ def test_BCELoss(self):
                 self.assertTrue(np.allclose(static_result, expected))
                 self.assertTrue(np.allclose(static_result, dy_result))
                 self.assertTrue(np.allclose(dy_result, expected))
-                static_functional = test_static_functional(place, input_np,
-                                                           label_np, reduction)
+                static_functional = test_static_functional(
+                    place, input_np, label_np, reduction)
                 dy_functional = test_dygraph_functional(place, input_np,
                                                         label_np, reduction)
                 self.assertTrue(np.allclose(static_functional, expected))
@@ -174,43 +190,57 @@ def test_BCELoss(self):
                 self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCELoss_weight(self):
-        input_np = np.random.uniform(
-            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float32)
-        label_np = np.random.randint(
-            0, 2, size=(2, 3, 4, 10)).astype(np.float32)
+        input_np = np.random.uniform(0.1, 0.8,
+                                     size=(2, 3, 4, 10)).astype(np.float32)
+        label_np = np.random.randint(0, 2,
+                                     size=(2, 3, 4, 10)).astype(np.float32)
         weight_np = np.random.random(size=(3, 4, 10)).astype(np.float32)
-        place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu(
-        ) else fluid.CPUPlace()
+        place = fluid.NPUPlace(
+            0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace()
         for reduction in ['sum', 'mean', 'none']:
-            static_result = test_static_layer(
-                place, input_np, label_np, reduction, weight_np=weight_np)
-            dy_result = test_dygraph_layer(
-                place, input_np, label_np, reduction, weight_np=weight_np)
-            expected = calc_bceloss(
-                input_np, label_np, reduction, weight_np=weight_np)
+            static_result = test_static_layer(place,
+                                              input_np,
+                                              label_np,
+                                              reduction,
+                                              weight_np=weight_np)
+            dy_result = test_dygraph_layer(place,
+                                           input_np,
+                                           label_np,
+                                           reduction,
+                                           weight_np=weight_np)
+            expected = calc_bceloss(input_np,
+                                    label_np,
+                                    reduction,
+                                    weight_np=weight_np)
             self.assertTrue(np.allclose(static_result, expected))
             self.assertTrue(np.allclose(static_result, dy_result))
             self.assertTrue(np.allclose(dy_result, expected))
-            static_functional = test_static_functional(
-                place, input_np, label_np, reduction, weight_np=weight_np)
-            dy_functional = test_dygraph_functional(
-                place, input_np, label_np, reduction, weight_np=weight_np)
+            static_functional = test_static_functional(place,
+                                                       input_np,
+                                                       label_np,
+                                                       reduction,
+                                                       weight_np=weight_np)
+            dy_functional = test_dygraph_functional(place,
+                                                    input_np,
+                                                    label_np,
+                                                    reduction,
+                                                    weight_np=weight_np)
             self.assertTrue(np.allclose(static_functional, expected))
             self.assertTrue(np.allclose(static_functional, dy_functional))
             self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCELoss_error(self):
         paddle.disable_static(paddle.NPUPlace(0))
-        self.assertRaises(
-            ValueError, paddle.nn.loss.BCELoss, reduction="unsupport reduction")
+        self.assertRaises(ValueError,
+                          paddle.nn.loss.BCELoss,
+                          reduction="unsupport reduction")
         input = paddle.to_tensor([[0.1, 0.3]], dtype='float32')
         label = paddle.to_tensor([[0.0, 1.0]], dtype='float32')
-        self.assertRaises(
-            ValueError,
-            paddle.nn.functional.binary_cross_entropy,
-            input=input,
-            label=label,
-            reduction="unsupport reduction")
+        self.assertRaises(ValueError,
+                          paddle.nn.functional.binary_cross_entropy,
+                          input=input,
+                          label=label,
+                          reduction="unsupport reduction")
         paddle.enable_static()
 
 
@@ -219,6 +249,7 @@ def bce_loss(input, label):
 
 
 class TestBceLossOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.init_test_case()
@@ -245,11 +276,13 @@ def init_test_case(self):
 
 
 class TestBceLossOpCase1(OpTest):
+
     def init_test_cast(self):
         self.shape = [2, 3, 4, 5]
 
 
 class TestBceLossOpCase2(OpTest):
+
     def init_test_cast(self):
         self.shape = [2, 3, 20]
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py
index 647bd29ffaef5..0a45cec0d0c95 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py
@@ -44,37 +44,28 @@ def test_get_set(self):
         # beam_size = 2, end_id = 1
         # start with start_id
         [
-            self.append_lod_tensor(
-                array, [[0, 1, 2], [0, 1, 2]], np.array(
-                    [0, 0], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 1, 2], [0, 1, 2]],
+                                   np.array([0, 0], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 1, 2], [0, 2, 4]],
-                np.array(
-                    [2, 3, 4, 5], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 1, 2], [0, 2, 4]],
+                                   np.array([2, 3, 4, 5], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 2, 2, 4, 4]],
-                np.array(
-                    [3, 1, 5, 4], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 2, 4], [0, 2, 2, 4, 4]],
+                                   np.array([3, 1, 5, 4], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 1, 2, 3, 4]],
-                np.array(
-                    [1, 1, 3, 5], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 2, 4], [0, 1, 2, 3, 4]],
+                                   np.array([1, 1, 3, 5], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 0, 0, 2, 2]],
-                np.array(
-                    [5, 1], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 2, 4], [0, 0, 0, 2, 2]],
+                                   np.array([5, 1], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
 
@@ -90,7 +81,8 @@ def test_get_set(self):
             SentenceIds="sentence_ids",
             SentenceScores="sentence_scores",
             beam_size=2,
-            end_id=1, )
+            end_id=1,
+        )
 
         beam_search_decode_op.run(self.scope, self.place)
 
@@ -101,8 +93,8 @@ def test_get_set(self):
         expected_data = np.array(
             [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64")
         self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
-        self.assertTrue(
-            np.array_equal(np.array(sentence_scores), expected_data))
+        self.assertTrue(np.array_equal(np.array(sentence_scores),
+                                       expected_data))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/npu/test_beam_search_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_beam_search_op_npu.py
index 14e4fbb73fd1b..dcfa60e746269 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_beam_search_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_beam_search_op_npu.py
@@ -16,6 +16,7 @@
 
 import paddle
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import unittest
@@ -26,6 +27,7 @@
 
 
 class TestBeamSearchNPUOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -57,19 +59,18 @@ def init_data(self):
         self.beam_size = 2
         self.is_accumulated = True
         self.pre_ids = np.array([[1], [2], [3], [4]], dtype='int64')
-        self.ids = np.array(
-            [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
+        self.ids = np.array([[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]],
+                            dtype='int64')
         self.lod = [[2, 2], [1, 1, 1, 1]]
         self.out_lod = [[2, 2], [1, 1, 1, 1]]
         self.offset_lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        self.score = np.array(
-            [
-                [0.5, 0.3, 0.2],
-                [0.6, 0.3, 0.1],
-                [0.9, 0.5, 0.1],
-                [0.7, 0.5, 0.1],
-            ],
-            dtype='float32')
+        self.score = np.array([
+            [0.5, 0.3, 0.2],
+            [0.6, 0.3, 0.1],
+            [0.9, 0.5, 0.1],
+            [0.7, 0.5, 0.1],
+        ],
+                              dtype='float32')
         self.pre_score = np.array([[0.1], [0.2], [0.3], [0.4]], dtype='float32')
         self.selected_ids = np.array([4, 2, 3, 8])[:, np.newaxis]
         self.selected_scores = np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]
@@ -80,6 +81,7 @@ def test_check_output(self):
 
 
 class TestBeamSearchNPUOp2(TestBeamSearchNPUOp):
+
     def init_data(self):
         self.beam_size = 2
         self.is_accumulated = True
@@ -88,13 +90,13 @@ def init_data(self):
         self.lod = [[2, 2], [1, 1, 1, 1]]
         self.out_lod = [[2, 2], [2, 0, 1, 1]]
         self.offset_lod = [[0, 2, 4], [0, 2, 2, 3, 4]]
-        self.score = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.1, 0.7],
-            ], dtype='float32')
+        self.score = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.1, 0.7],
+        ],
+                              dtype='float32')
         self.pre_score = np.array([[0.1], [0.2], [0.3], [0.4]], dtype='float32')
         self.selected_ids = np.array([4, 2, 3, 1])[:, np.newaxis]
         self.selected_scores = np.array([0.6, 0.9, 0.9, 0.7])[:, np.newaxis]
@@ -102,6 +104,7 @@ def init_data(self):
 
 
 class TestBeamSearchNPUOp3(TestBeamSearchNPUOp):
+
     def init_data(self):
         # end_id = 0
         self.beam_size = 2
@@ -111,13 +114,13 @@ def init_data(self):
         self.lod = [[2, 2], [1, 1, 1, 1]]
         self.out_lod = [[2, 2], [1, 1, 0, 2]]
         self.offset_lod = [[0, 2, 4], [0, 1, 2, 2, 4]]
-        self.score = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.6, 0.7],
-            ], dtype='float32')
+        self.score = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.6, 0.7],
+        ],
+                              dtype='float32')
         self.pre_score = np.array([[0.1], [1.2], [0.5], [0.4]], dtype='float32')
         self.selected_ids = np.array([2, 0, 8, 1])[:, np.newaxis]
         self.selected_scores = np.array([0.9, 1.2, 0.6, 0.7])[:, np.newaxis]
@@ -125,6 +128,7 @@ def init_data(self):
 
 
 class TestBeamSearchNPUOp4(TestBeamSearchNPUOp):
+
     def init_data(self):
         # is_accumulated = False
         self.beam_size = 2
@@ -134,21 +138,22 @@ def init_data(self):
         self.lod = [[2, 2], [1, 1, 1, 1]]
         self.out_lod = [[2, 2], [0, 2, 1, 1]]
         self.offset_lod = [[0, 2, 4], [0, 0, 2, 3, 4]]
-        self.score = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.1, 0.7],
-            ], dtype='float32')
+        self.score = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.1, 0.7],
+        ],
+                              dtype='float32')
         self.pre_score = np.array([[0.1], [2.2], [0.3], [0.4]], dtype='float32')
         self.selected_ids = np.array([7, 3, 3, 1])[:, np.newaxis]
-        self.selected_scores = np.array(
-            [1.50685, 0.996027, 0.194639, 0.043325])[:, np.newaxis]
+        self.selected_scores = np.array([1.50685, 0.996027, 0.194639,
+                                         0.043325])[:, np.newaxis]
         self.parent_idx = np.array([1, 1, 2, 3])
 
 
 class TestBeamSearchNPUOp5(TestBeamSearchNPUOp):
+
     def init_data(self):
         # beam_size = 1
         self.beam_size = 1
@@ -158,13 +163,13 @@ def init_data(self):
         self.lod = [[1, 1, 1, 1], [1, 1, 1, 1]]
         self.out_lod = [[1, 1, 1, 1], [1, 1, 1, 1]]
         self.offset_lod = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
-        self.score = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.1, 0.7],
-            ], dtype='float32')
+        self.score = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.1, 0.7],
+        ],
+                              dtype='float32')
         self.pre_score = np.array([[0.1], [0.2], [0.3], [0.4]], dtype='float32')
         self.selected_ids = np.array([2, 7, 3, 1])[:, np.newaxis]
         self.selected_scores = np.array([0.9, 0.5, 0.9, 0.7])[:, np.newaxis]
diff --git a/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py
index 6da49b8d84d19..44cf417228a1b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid.core as core
@@ -30,6 +31,7 @@
 
 
 class TestBilinearInterpOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -127,6 +129,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCaseFP16(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCaseFP16, self).init_test_case()
         self.dtype = 'float16'
@@ -134,6 +137,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase1(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCase1, self).init_test_case()
         self.input_shape = [4, 1, 7, 8]
@@ -143,6 +147,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase2(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCase2, self).init_test_case()
         self.input_shape = [3, 3, 9, 6]
@@ -152,6 +157,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase3(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCase3, self).init_test_case()
         self.input_shape = [1, 1, 32, 64]
@@ -161,6 +167,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase4(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCase4, self).init_test_case()
         self.input_shape = [4, 1, 7, 8]
@@ -171,6 +178,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase5(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCase5, self).init_test_case()
         self.input_shape = [3, 3, 9, 6]
@@ -181,6 +189,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase6(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCase6, self).init_test_case()
         self.input_shape = [1, 1, 32, 64]
@@ -191,6 +200,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase7(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpCase7, self).init_test_case()
         self.input_shape = [1, 1, 32, 64]
@@ -200,6 +210,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpSame(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpSame, self).init_test_case()
         self.input_shape = [2, 3, 32, 64]
@@ -209,6 +220,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpActualShape(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpActualShape, self).init_test_case()
         self.input_shape = [3, 2, 32, 16]
@@ -219,6 +231,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpDataLayout(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpDataLayout, self).init_test_case()
         self.input_shape = [2, 5, 5, 3]
@@ -230,24 +243,28 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 1
 
 
 class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = True
         self.align_mode = 0
 
 
 class TestBilinearInterpScale1(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpScale1, self).init_test_case()
         self.input_shape = [2, 3, 5, 7]
@@ -257,6 +274,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpScale2(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpScale2, self).init_test_case()
         self.input_shape = [2, 3, 5, 7]
@@ -266,6 +284,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpZero(TestBilinearInterpOp):
+
     def init_test_case(self):
         super(TestBilinearInterpZero, self).init_test_case()
         self.input_shape = [2, 3, 5, 7]
diff --git a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py
index 4d4d61ace841e..7febcaba45cb4 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 import math
 import paddle
@@ -41,8 +42,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
     pb_y = pb_y.reshape(shape)
 
     if pb_v.ndim == 2:
-        var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else (
-            pb_v.shape[0], 1, pb_v.shape[1])
+        var_shape = (1, pb_v.shape[0],
+                     pb_v.shape[1]) if axis == 0 else (pb_v.shape[0], 1,
+                                                       pb_v.shape[1])
         pb_v = pb_v.reshape(var_shape)
     if pb_v.ndim == 1:
         tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
@@ -112,6 +114,7 @@ def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestBoxCoderOp(OpTest):
+
     def setUp(self):
         self.op_type = "box_coder"
         self.set_npu()
@@ -195,9 +198,10 @@ def set_attrs(self):
             self.attrs['axis'] = self.axis
 
     def set_outputs(self):
-        output_box = batch_box_coder(
-            self.prior_box, self.prior_box_var, self.target_box, self.lod[0],
-            self.code_type, self.box_normalized, self.axis)
+        output_box = batch_box_coder(self.prior_box, self.prior_box_var,
+                                     self.target_box, self.lod[0],
+                                     self.code_type, self.box_normalized,
+                                     self.axis)
         self.outputs = {'OutputBox': output_box.astype(self.dtype)}
 
     def test_check_output(self):
@@ -205,6 +209,7 @@ def test_check_output(self):
 
 
 class TestBoxCoderOpWithoutBoxVar(TestBoxCoderOp):
+
     def set_init_config(self):
         super(TestBoxCoderOpWithoutBoxVar, self).set_init_config()
         self.without_prior_box_var = True
@@ -212,6 +217,7 @@ def set_init_config(self):
 
 
 class TestBoxCoderOpWithLoD(TestBoxCoderOp):
+
     def set_init_config(self):
         super(TestBoxCoderOpWithLoD, self).set_init_config()
         self.M = 20
@@ -222,24 +228,28 @@ def set_init_config(self):
 
 
 class TestBoxCoderOpWithLoDWithVariance(TestBoxCoderOpWithLoD):
+
     def set_init_config(self):
         super(TestBoxCoderOpWithLoDWithVariance, self).set_init_config()
         self.use_variance = True
 
 
 class TestBoxCoderOpWithAxis(TestBoxCoderOp):
+
     def set_init_config(self):
         super(TestBoxCoderOpWithAxis, self).set_init_config()
         self.axis = 1
 
 
 class TestBoxCoderOpWithVariance(TestBoxCoderOp):
+
     def set_init_config(self):
         super(TestBoxCoderOpWithVariance, self).set_init_config()
         self.use_variance = True
 
 
 class TestBoxCoderOpFP16(TestBoxCoderOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py
index 533a3fd12fd52..586aa513c469d 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/npu/test_c_identity_npu.py b/python/paddle/fluid/tests/unittests/npu/test_c_identity_npu.py
index 9ea52a88d9897..59a4f6e8cb605 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_c_identity_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_c_identity_npu.py
@@ -24,13 +24,15 @@
 
 
 class TestIdentityOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
     def test_identity(self, col_type="identity"):
         dist_env = os.environ
-        self.check_with_place(
-            "collective_identity_op_npu.py", col_type, need_envs=dist_env)
+        self.check_with_place("collective_identity_op_npu.py",
+                              col_type,
+                              need_envs=dist_env)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py
index 0d79d9b07233f..7761d2f6ede06 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -29,6 +30,7 @@
 
 @skip_check_grad_ci(reason="[skip NPU cast grad check] not implemented yet.")
 class TestCast1(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "cast"
@@ -52,6 +54,7 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="[skip NPU cast grad check] not implemented yet.")
 class TestCast2(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "cast"
@@ -75,6 +78,7 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="[skip NPU cast grad check] not implemented yet.")
 class TestCast3(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "cast"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py
index d71fc142ade3a..2af5850987473 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py
@@ -20,6 +20,7 @@
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 
@@ -27,6 +28,7 @@
 
 
 class TestClipByNormOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.max_relative_error = 0.006
@@ -35,7 +37,9 @@ def setUp(self):
         input = np.random.random(self.shape).astype(self.dtype)
         input[np.abs(input) < self.max_relative_error] = 0.5
         self.op_type = "clip_by_norm"
-        self.inputs = {'X': input, }
+        self.inputs = {
+            'X': input,
+        }
         self.attrs = {}
         self.attrs['max_norm'] = self.max_norm
         norm = np.sqrt(np.sum(np.square(input)))
@@ -61,24 +65,28 @@ def init_dtype(self):
 
 
 class TestCase1(TestClipByNormOp):
+
     def initTestCase(self):
         self.shape = (100, )
         self.max_norm = 1e20
 
 
 class TestCase2(TestClipByNormOp):
+
     def initTestCase(self):
         self.shape = (16, 16)
         self.max_norm = 0.1
 
 
 class TestCase3(TestClipByNormOp):
+
     def initTestCase(self):
         self.shape = (4, 8, 16)
         self.max_norm = 1.0
 
 
 class TestClipByNormOpFp16(TestClipByNormOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -87,18 +95,21 @@ def test_check_output(self):
 
 
 class TestClipByNormOpFp16Case1(TestClipByNormOpFp16):
+
     def initTestCase(self):
         self.shape = (100, )
         self.max_norm = 1e20
 
 
 class TestClipByNormOpFp16Case2(TestClipByNormOpFp16):
+
     def initTestCase(self):
         self.shape = (16, 16)
         self.max_norm = 0.1
 
 
 class TestClipByNormOpFp16Case3(TestClipByNormOpFp16):
+
     def initTestCase(self):
         self.shape = (4, 8, 16)
         self.max_norm = 1.0
diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py
index 65dcc73aa46d0..cf6af6462d061 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py
@@ -20,11 +20,13 @@
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 
 
 class TestClipOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -75,6 +77,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestClipOp):
+
     def initTestCase(self):
         self.shape = (8, 16, 8)
         self.max = 0.7
@@ -82,6 +85,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestClipOp):
+
     def initTestCase(self):
         self.shape = (8, 16)
         self.max = 1.0
@@ -89,6 +93,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestClipOp):
+
     def initTestCase(self):
         self.shape = (4, 8, 16)
         self.max = 0.7
@@ -96,6 +101,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestClipOp):
+
     def initTestCase(self):
         self.shape = (4, 8, 8)
         self.max = 0.7
@@ -105,6 +111,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestClipOp):
+
     def initTestCase(self):
         self.shape = (4, 8, 16)
         self.max = 0.5
@@ -112,6 +119,7 @@ def initTestCase(self):
 
 
 class TestClipOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -131,6 +139,7 @@ def test_dtype():
 
 
 class TestClipAPI(unittest.TestCase):
+
     def _executed_api(self, x, min=None, max=None):
         return paddle.clip(x, min, max)
 
@@ -142,8 +151,8 @@ def test_clip(self):
         min = fluid.data(name='min', shape=[1], dtype='float32')
         max = fluid.data(name='max', shape=[1], dtype='float32')
 
-        place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu(
-        ) else fluid.CPUPlace()
+        place = fluid.NPUPlace(
+            0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         out_1 = self._executed_api(images, min=min, max=max)
@@ -162,9 +171,7 @@ def test_clip(self):
                 "min": np.array([0.2]).astype('float32'),
                 "max": np.array([0.8]).astype('float32')
             },
-            fetch_list=[
-                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
-            ])
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8])
 
         self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
         self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
@@ -178,8 +185,8 @@ def test_clip(self):
 
     def test_clip_dygraph(self):
         paddle.disable_static()
-        place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu(
-        ) else fluid.CPUPlace()
+        place = fluid.NPUPlace(
+            0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace()
         paddle.disable_static(place)
         data_shape = [1, 9, 9, 4]
         data = np.random.random(data_shape).astype('float32')
@@ -207,6 +214,7 @@ def test_errors(self):
 
 
 class TestInplaceClipAPI(TestClipAPI):
+
     def _executed_api(self, x, min=None, max=None):
         return x.clip_(min, max)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py
index 93a969bf10f03..313ab90c93f27 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestAllocContinuousSpace(OpTest):
+
     def setUp(self):
         self.__class__.use_npu = True
         self.op_type = "coalesce_tensor"
@@ -80,10 +82,12 @@ def test_check_output(self):
         self.check_output_with_place(
             place=paddle.NPUPlace(0),
             no_check_set=["FusedOutput"],
-            atol=1e-5, )
+            atol=1e-5,
+        )
 
 
 class TestAllocContinuousSpace2(TestAllocContinuousSpace):
+
     def init_attr(self):
         return {
             "copy_data": True,
@@ -98,7 +102,8 @@ def test_check_output(self):
         self.check_output_with_place(
             place=paddle.NPUPlace(0),
             no_check_set=["FusedOutput"],
-            atol=1e-5, )
+            atol=1e-5,
+        )
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py
index 774423a8be1b1..69f3b1bcbe41b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py
@@ -31,6 +31,7 @@
 
 
 class TestCollectiveRunnerBase(object):
+
     def get_model(self, train_prog, startup_prog):
         raise NotImplementedError(
             "get model should be implemented by child class.")
@@ -42,9 +43,8 @@ def wait_server_ready(self, endpoints):
             not_ready_endpoints = []
             for ep in endpoints:
                 ip_port = ep.split(":")
-                with closing(
-                        socket.socket(socket.AF_INET,
-                                      socket.SOCK_STREAM)) as sock:
+                with closing(socket.socket(socket.AF_INET,
+                                           socket.SOCK_STREAM)) as sock:
                     sock.settimeout(2)
                     sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                     if hasattr(socket, 'SO_REUSEPORT'):
@@ -57,13 +57,14 @@ def wait_server_ready(self, endpoints):
                         not_ready_endpoints.append(ep)
             if not all_ok:
                 sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-                sys.stderr.write("not ready endpoints:" + str(
-                    not_ready_endpoints) + "\n")
+                sys.stderr.write("not ready endpoints:" +
+                                 str(not_ready_endpoints) + "\n")
                 sys.stderr.flush()
                 time.sleep(3)
             else:
                 break
 
+
 #endpoints should be ["ip1:port1","ip2:port2"]
 
     def initCommunicator(self, program, rank, nranks, wait_port,
@@ -73,29 +74,26 @@ def initCommunicator(self, program, rank, nranks, wait_port,
         if rank == 0 and wait_port:
             self.wait_server_ready(other_endpoints)
         block = program.global_block()
-        hccl_id_var = block.create_var(
-            name=nameGen.generate('hccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_hccl_id',
-            inputs={},
-            outputs={'Out': hccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints
-            })
-        block.append_op(
-            type='c_comm_init_hccl',
-            inputs={'X': hccl_id_var},
-            outputs={},
-            attrs={
-                'rank': rank,
-                'ring_id': self.global_ring_id,
-                'device_id': int(os.getenv("FLAGS_selected_npus")),
-                'rank_ids': nranks
-            })
+        hccl_id_var = block.create_var(name=nameGen.generate('hccl_id'),
+                                       persistable=True,
+                                       type=core.VarDesc.VarType.RAW)
+        block.append_op(type='c_gen_hccl_id',
+                        inputs={},
+                        outputs={'Out': hccl_id_var},
+                        attrs={
+                            'rank': rank,
+                            'endpoint': current_endpoint,
+                            'other_endpoints': other_endpoints
+                        })
+        block.append_op(type='c_comm_init_hccl',
+                        inputs={'X': hccl_id_var},
+                        outputs={},
+                        attrs={
+                            'rank': rank,
+                            'ring_id': self.global_ring_id,
+                            'device_id': int(os.getenv("FLAGS_selected_npus")),
+                            'rank_ids': nranks
+                        })
 
     def run_trainer(self, args):
         train_prog = fluid.Program()
@@ -138,6 +136,7 @@ def runtime_main(test_class, col_type, sub_type):
 
 
 class TestDistBase(unittest.TestCase):
+
     def setUp(self):
         self._port_set = set()
         self._trainers = 2
@@ -146,6 +145,7 @@ def setUp(self):
         self._python_interp = sys.executable
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -185,18 +185,16 @@ def _run_cluster(self, model_file, envs):
         tr1_cmd = tr_cmd % (self._python_interp, model_file)
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
-        #print(tr0_cmd) 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
-
-        tr1_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
+        #print(tr0_cmd)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=env0)
+
+        tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py
index 9b2c6fae15eb4..d3d5ab76a94f7 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py
@@ -16,11 +16,13 @@
 
 import unittest
 import sys
+
 sys.path.append("..")
 from test_parallel_dygraph_dataparallel import TestMultipleGpus
 
 
 class TestProcessGroup(TestMultipleGpus):
+
     def test_process_group_nccl(self):
         self.run_mnist_2gpu('process_group_hccl.py')
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py
index 66ce81756fc9d..ba2e3a083f30f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,7 +26,9 @@
 
 
 def create_test_class(op_type, typename, callback):
+
     class Cls(OpTest):
+
         def setUp(self):
             self.set_npu()
             self.place = paddle.NPUPlace(0)
@@ -76,18 +79,22 @@ def test_dynamic_api(self):
         def test_broadcast_api_1(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name='x', shape=[1, 2, 1, 3], dtype=typename)
-                y = paddle.static.data(
-                    name='y', shape=[1, 2, 3], dtype=typename)
+                x = paddle.static.data(name='x',
+                                       shape=[1, 2, 1, 3],
+                                       dtype=typename)
+                y = paddle.static.data(name='y',
+                                       shape=[1, 2, 3],
+                                       dtype=typename)
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = paddle.static.Executor(self.place)
                 input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename)
                 input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(typename)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -95,18 +102,22 @@ def test_broadcast_api_1(self):
         def test_broadcast_api_2(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name='x', shape=[1, 2, 3], dtype=typename)
-                y = paddle.static.data(
-                    name='y', shape=[1, 2, 1, 3], dtype=typename)
+                x = paddle.static.data(name='x',
+                                       shape=[1, 2, 3],
+                                       dtype=typename)
+                y = paddle.static.data(name='y',
+                                       shape=[1, 2, 1, 3],
+                                       dtype=typename)
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = paddle.static.Executor(self.place)
                 input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(typename)
                 input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -122,8 +133,10 @@ def test_broadcast_api_3(self):
                 input_x = np.arange(0, 5).reshape((5)).astype(typename)
                 input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(typename)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py
index f9eecefdfb237..4fff3ab5fa059 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestConcatOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "concat"
@@ -43,8 +45,8 @@ def setUp(self):
             self.actual_axis = self.axis
 
         self.outputs = {
-            'Out': np.concatenate(
-                (self.x0, self.x1, self.x2), axis=self.actual_axis)
+            'Out':
+            np.concatenate((self.x0, self.x1, self.x2), axis=self.actual_axis)
         }
 
     def set_npu(self):
@@ -69,6 +71,7 @@ def init_test_data(self):
 
 
 class TestConcatOp2(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
@@ -79,6 +82,7 @@ def init_test_data(self):
 @skip_check_grad_ci(
     reason="The function 'check_grad' for large inputs is too slow.")
 class TestConcatOp3(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype)
         self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype)
@@ -90,9 +94,11 @@ def test_check_grad(self):
 
 
 @skip_check_grad_ci(
-    reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
+    reason=
+    "This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
 )
 class TestConcatOp4(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
@@ -104,6 +110,7 @@ def test_check_grad(self):
 
 
 class TestConcatOp5(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
@@ -113,7 +120,9 @@ def init_test_data(self):
 
 #----------------Concat Fp16----------------
 def create_test_fp16(parent):
+
     class TestConcatFp16(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -131,7 +140,9 @@ def init_dtype(self):
 
 #----------------Concat Int64----------------
 def create_test_int64(parent):
+
     class TestConcatInt64(parent):
+
         def init_dtype(self):
             self.dtype = np.int64
 
@@ -170,8 +181,9 @@ def set_program(self, use_fluid_api):
             with fluid.program_guard(self.program):
                 input = fluid.layers.assign(self.x)
                 tensor_array = fluid.layers.create_array(dtype='float32')
-                zero = fluid.layers.fill_constant(
-                    shape=[1], value=0, dtype="int64")
+                zero = fluid.layers.fill_constant(shape=[1],
+                                                  value=0,
+                                                  dtype="int64")
 
                 for i in range(self.iter_num):
                     fluid.layers.array_write(input, zero + i, tensor_array)
@@ -208,9 +220,8 @@ def _run_static_mode(self, use_fluid_api):
         res = exe.run(self.program, fetch_list=self.out_var)
         self.assertTrue(
             np.array_equal(
-                res[0],
-                np.concatenate(
-                    [self.x] * self.iter_num, axis=self.axis)))
+                res[0], np.concatenate([self.x] * self.iter_num,
+                                       axis=self.axis)))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py
index 2e15a1eac2b4b..6c300acfe48f7 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid as fluid
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from test_conv2d_op import conv2d_forward_naive
@@ -31,7 +32,9 @@
 
 
 def create_test_channel_last_class(parent):
+
     class TestChannelLastCase(parent):
+
         def init_data_format(self):
             self.data_format = "NHWC"
 
@@ -45,7 +48,9 @@ def init_test_case_2(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0]
             self.padding_algorithm = "SAME"
@@ -56,7 +61,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1]
             self.padding_algorithm = "VALID"
@@ -67,7 +74,9 @@ def init_paddings(self):
 
 
 def create_test_fp16_class(parent):
+
     class TestFp16Case(parent):
+
         def init_data_type(self):
             self.dtype = np.float16
 
@@ -77,6 +86,7 @@ def init_data_type(self):
 
 
 class TestDepthwiseConvNPU(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "depthwise_conv2d"
@@ -134,47 +144,41 @@ def test_check_output(self):
     def test_check_grad(self):
         if self.dilations[0] == 1 and self.dilations[1] == 1:
             if self.dtype == np.float16:
-                self.check_grad_with_place(
-                    self.place, {'Input', 'Filter'},
-                    'Output',
-                    max_relative_error=0.9)
+                self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                           'Output',
+                                           max_relative_error=0.9)
             else:
-                self.check_grad_with_place(
-                    self.place, {'Input', 'Filter'},
-                    'Output',
-                    max_relative_error=0.03,
-                    numeric_place=paddle.CPUPlace())
+                self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                           'Output',
+                                           max_relative_error=0.03,
+                                           numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['Input'],
-                'Output',
-                no_grad_set=set(['Filter']),
-                max_relative_error=0.9)
+            self.check_grad_with_place(self.place, ['Input'],
+                                       'Output',
+                                       no_grad_set=set(['Filter']),
+                                       max_relative_error=0.9)
         else:
-            self.check_grad_with_place(
-                self.place, ['Input'],
-                'Output',
-                no_grad_set=set(['Filter']),
-                max_relative_error=0.03,
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(self.place, ['Input'],
+                                       'Output',
+                                       no_grad_set=set(['Filter']),
+                                       max_relative_error=0.03,
+                                       numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dilations[0] == 1 and self.dilations[1] == 1:
             if self.dtype == np.float16:
-                self.check_grad_with_place(
-                    self.place, ['Filter'],
-                    'Output',
-                    no_grad_set=set(['Input']),
-                    max_relative_error=0.9)
+                self.check_grad_with_place(self.place, ['Filter'],
+                                           'Output',
+                                           no_grad_set=set(['Input']),
+                                           max_relative_error=0.9)
             else:
-                self.check_grad_with_place(
-                    self.place, ['Filter'],
-                    'Output',
-                    no_grad_set=set(['Input']),
-                    max_relative_error=0.03,
-                    numeric_place=paddle.CPUPlace())
+                self.check_grad_with_place(self.place, ['Filter'],
+                                           'Output',
+                                           no_grad_set=set(['Input']),
+                                           max_relative_error=0.03,
+                                           numeric_place=paddle.CPUPlace())
 
     def init_data_format(self):
         self.data_format = "NCHW"
@@ -187,6 +191,7 @@ def init_test_case_2(self):
 
 
 class TestDepthwiseConvNPU2(TestDepthwiseConvNPU):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.dilations = [1, 1]
@@ -199,6 +204,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvNPU3(TestDepthwiseConvNPU):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.dilations = [1, 1]
@@ -211,6 +217,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvNPU4(TestDepthwiseConvNPU):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.dilations = [1, 1]
@@ -223,6 +230,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvNPU_Padding(OpTest):
+
     def setUp(self):
         self.op_type = "depthwise_conv2d"
         self.dtype = np.float32
@@ -242,9 +250,10 @@ def setUp(self):
         input = np.random.random(self.input_size).astype(self.dtype)
         filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
 
-        output, _, _, _, _ = conv2d_forward_naive(
-            input, filter, self.groups, conv2d_param, self.padding_algorithm,
-            self.data_format)
+        output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
+                                                  conv2d_param,
+                                                  self.padding_algorithm,
+                                                  self.data_format)
         output = output.astype(self.dtype)
 
         self.inputs = {
@@ -281,46 +290,40 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, {'Input', 'Filter'},
-                'Output',
-                max_relative_error=1.2)
+            self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                       'Output',
+                                       max_relative_error=1.2)
         else:
-            self.check_grad_with_place(
-                self.place, {'Input', 'Filter'},
-                'Output',
-                max_relative_error=0.03,
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                       'Output',
+                                       max_relative_error=0.03,
+                                       numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['Input'],
-                'Output',
-                max_relative_error=0.7,
-                no_grad_set=set(['Filter']))
+            self.check_grad_with_place(self.place, ['Input'],
+                                       'Output',
+                                       max_relative_error=0.7,
+                                       no_grad_set=set(['Filter']))
         else:
-            self.check_grad_with_place(
-                self.place, ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']),
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(self.place, ['Input'],
+                                       'Output',
+                                       max_relative_error=0.03,
+                                       no_grad_set=set(['Filter']),
+                                       numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['Filter'],
-                'Output',
-                max_relative_error=0.8,
-                no_grad_set=set(['Input']))
+            self.check_grad_with_place(self.place, ['Filter'],
+                                       'Output',
+                                       max_relative_error=0.8,
+                                       no_grad_set=set(['Input']))
         else:
-            self.check_grad_with_place(
-                self.place, ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']),
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(self.place, ['Filter'],
+                                       'Output',
+                                       max_relative_error=0.03,
+                                       no_grad_set=set(['Input']),
+                                       numeric_place=paddle.CPUPlace())
 
     def init_data_format(self):
         self.data_format = "NCHW"
@@ -337,6 +340,7 @@ def init_test_case_2(self):
 
 
 class TestDepthwiseConvNPU2_Padding(TestDepthwiseConvNPU_Padding):
+
     def init_test_case(self):
         self.pad = [1, 1, 0, 1]
         self.dilations = [1, 1]
@@ -353,6 +357,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConvNPU3_Padding(TestDepthwiseConvNPU_Padding):
+
     def init_test_case(self):
         self.pad = [1, 1, 0, 1]
         self.dilations = [1, 1]
diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py
index 4070d0267d95b..c2244fb9a6eea 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 import paddle
 import paddle.fluid.core as core
@@ -29,7 +30,9 @@
 
 
 def create_test_channel_last_class(parent):
+
     class TestChannelLastCase(parent):
+
         def init_data_format(self):
             self.data_format = "NHWC"
 
@@ -43,7 +46,9 @@ def init_test_case_2(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0]
             self.padding_algorithm = "SAME"
@@ -54,7 +59,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1]
             self.padding_algorithm = "VALID"
@@ -65,7 +72,9 @@ def init_paddings(self):
 
 
 def create_test_fp16_class(parent):
+
     class TestFp16Case(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -75,6 +84,7 @@ def init_dtype(self):
 
 
 class TestConv2DOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -102,12 +112,11 @@ def setUp(self):
         input = np.random.random(self.input_size).astype(self.dtype)
         filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
 
-        output, _, _, _, _ = conv2d_forward_naive(
-            input,
-            filter,
-            self.groups,
-            conv2d_param,
-            data_format=self.data_format)
+        output, _, _, _, _ = conv2d_forward_naive(input,
+                                                  filter,
+                                                  self.groups,
+                                                  conv2d_param,
+                                                  data_format=self.data_format)
         output = output.astype(self.dtype)
 
         self.inputs = {
@@ -127,27 +136,24 @@ def test_check_output(self):
         self.check_output_with_place(fluid.NPUPlace(0), atol=1e-2)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            fluid.NPUPlace(0), {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.03,
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(fluid.NPUPlace(0), {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
-        self.check_grad_with_place(
-            fluid.NPUPlace(0), ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(fluid.NPUPlace(0), ['Input'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Filter']),
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
-        self.check_grad_with_place(
-            fluid.NPUPlace(0), ['Filter'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(fluid.NPUPlace(0), ['Filter'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Input']),
+                                   numeric_place=paddle.CPUPlace())
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -165,6 +171,7 @@ def init_group(self):
 
 
 class TestWithPad(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -175,6 +182,7 @@ def init_test_case(self):
 
 
 class TestWithStride(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -185,6 +193,7 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -196,6 +205,7 @@ def init_test_case(self):
 
 
 class TestWith1x1(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -206,12 +216,13 @@ def init_test_case(self):
 
     def init_group(self):
         # FIXME: Supporting group = 3 in this case.
-        # NOTE(wangran16): There is an unknown error (acl error code is : 507015) 
+        # NOTE(wangran16): There is an unknown error (acl error code is : 507015)
         # when group = 3, which needs to be fixed.
         self.groups = 1
 
 
 class TestWithDepthWise5x5(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -225,6 +236,7 @@ def init_group(self):
 
 
 class TestWithDepthWise7x7(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -238,6 +250,7 @@ def init_group(self):
 
 
 class TestWithDilation(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -254,6 +267,7 @@ def init_group(self):
 
 
 class TestWithInput1x1Filter1x1(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -267,6 +281,7 @@ def init_group(self):
 
 
 class TestConv2DOp_v2(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -293,9 +308,10 @@ def setUp(self):
 
         input = np.random.random(self.input_size).astype(self.dtype)
         filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
-        output, _, _, _, _ = conv2d_forward_naive(
-            input, filter, self.groups, conv2d_param, self.padding_algorithm,
-            self.data_format)
+        output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
+                                                  conv2d_param,
+                                                  self.padding_algorithm,
+                                                  self.data_format)
         output = output.astype(self.dtype)
 
         self.inputs = {
@@ -317,45 +333,39 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                paddle.NPUPlace(0), {'Input', 'Filter'},
-                'Output',
-                max_relative_error=1.1)
+            self.check_grad_with_place(paddle.NPUPlace(0), {'Input', 'Filter'},
+                                       'Output',
+                                       max_relative_error=1.1)
         else:
-            self.check_grad_with_place(
-                paddle.NPUPlace(0), {'Input', 'Filter'},
-                'Output',
-                max_relative_error=0.02,
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(paddle.NPUPlace(0), {'Input', 'Filter'},
+                                       'Output',
+                                       max_relative_error=0.02,
+                                       numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                paddle.NPUPlace(0), ['Input'],
-                'Output',
-                max_relative_error=0.99,
-                no_grad_set=set(['Filter']))
+            self.check_grad_with_place(paddle.NPUPlace(0), ['Input'],
+                                       'Output',
+                                       max_relative_error=0.99,
+                                       no_grad_set=set(['Filter']))
         else:
-            self.check_grad_with_place(
-                paddle.NPUPlace(0), ['Input'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Filter']),
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(paddle.NPUPlace(0), ['Input'],
+                                       'Output',
+                                       max_relative_error=0.02,
+                                       no_grad_set=set(['Filter']),
+                                       numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                paddle.NPUPlace(0), ['Filter'],
-                'Output',
-                max_relative_error=0.99,
-                no_grad_set=set(['Input']))
+            self.check_grad_with_place(paddle.NPUPlace(0), ['Filter'],
+                                       'Output',
+                                       max_relative_error=0.99,
+                                       no_grad_set=set(['Input']))
         else:
-            self.check_grad_with_place(
-                paddle.NPUPlace(0), ['Filter'],
-                'Output',
-                no_grad_set=set(['Input']),
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(paddle.NPUPlace(0), ['Filter'],
+                                       'Output',
+                                       no_grad_set=set(['Input']),
+                                       numeric_place=paddle.CPUPlace())
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -386,12 +396,14 @@ def init_test_case_2(self):
 
 
 class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
+
     def init_paddings(self):
         self.pad = [0, 0, 1, 2]
         self.padding_algorithm = "EXPLICIT"
 
 
 class TestWithPad_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -405,6 +417,7 @@ def init_paddings(self):
 
 
 class TestWithStride_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]  # NCHW
@@ -418,6 +431,7 @@ def init_paddings(self):
 
 
 class TestWithGroup_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 2]
@@ -429,6 +443,7 @@ def init_test_case(self):
 
 
 class TestWith1x1_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -445,6 +460,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise3x3_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [3, 4, 10, 10]  # NCHW
@@ -464,6 +480,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise5x5_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 4, 10, 10]  # NCHW
@@ -480,6 +497,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise7x7_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 8, 10, 10]  # NCHW
@@ -496,6 +514,7 @@ def init_paddings(self):
 
 
 class TestWithDilation_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -515,6 +534,7 @@ def init_paddings(self):
 
 
 class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [100, 1, 1, 1]  # NCHW
diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py
index a603f6c9238c7..c11a583e8539f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py
@@ -20,6 +20,7 @@
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 
@@ -29,6 +30,7 @@
 
 
 class TestConv2DTransposeOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -78,29 +80,26 @@ def test_check_output(self):
 
     def test_check_grad_no_input(self):
         if self.need_check_grad:
-            self.check_grad_with_place(
-                self.place, ['Filter'],
-                'Output',
-                no_grad_set=set(['Input']),
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(self.place, ['Filter'],
+                                       'Output',
+                                       no_grad_set=set(['Input']),
+                                       numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.need_check_grad:
-            self.check_grad_with_place(
-                self.place, ['Input'],
-                'Output',
-                no_grad_set=set(['Filter']),
-                max_relative_error=0.006,
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(self.place, ['Input'],
+                                       'Output',
+                                       no_grad_set=set(['Filter']),
+                                       max_relative_error=0.006,
+                                       numeric_place=paddle.CPUPlace())
 
     def test_check_grad(self):
         if self.need_check_grad:
-            self.check_grad_with_place(
-                self.place,
-                set(['Input', 'Filter']),
-                'Output',
-                max_relative_error=0.02,
-                numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(self.place,
+                                       set(['Input', 'Filter']),
+                                       'Output',
+                                       max_relative_error=0.02,
+                                       numeric_place=paddle.CPUPlace())
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -119,6 +118,7 @@ def init_dtype(self):
 
 
 class TestWithSymmetricPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -130,12 +130,14 @@ def init_test_case(self):
 
 
 class TestWithSymmetricPad_FP16(TestWithSymmetricPad):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithAsymmetricPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -147,12 +149,14 @@ def init_test_case(self):
 
 
 class TestWithAsymmetricPad_FP16(TestWithAsymmetricPad):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithSAMEPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [2, 1]
         self.dilations = [1, 2]
@@ -164,12 +168,14 @@ def init_test_case(self):
 
 
 class TestWithSAMEPad_FP16(TestWithSAMEPad):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithVALIDPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.dilations = [1, 1]
@@ -181,12 +187,14 @@ def init_test_case(self):
 
 
 class TestWithVALIDPad_FP16(TestWithVALIDPad):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithGroups(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -198,12 +206,14 @@ def init_test_case(self):
 
 
 class TestWithGroups_FP16(TestWithGroups):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithStride(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -215,12 +225,14 @@ def init_test_case(self):
 
 
 class TestWithStride_FP16(TestWithStride):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithDilation(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -232,12 +244,14 @@ def init_test_case(self):
 
 
 class TestWithDilation_FP16(TestWithDilation):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithEvenUpsample(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -250,12 +264,14 @@ def init_test_case(self):
 
 
 class TestWithEvenUpsample_FP16(TestWithEvenUpsample):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithEvenUpsampleOutputPadding(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -268,12 +284,14 @@ def init_test_case(self):
 
 
 class TestWithEvenUpsampleOutputPadding_FP16(TestWithEvenUpsampleOutputPadding):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class Test_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -286,12 +304,14 @@ def init_test_case(self):
 
 
 class Test_NHWC_FP16(Test_NHWC):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithSymmetricPad_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -304,12 +324,14 @@ def init_test_case(self):
 
 
 class TestWithSymmetricPad_NHWC_FP16(TestWithSymmetricPad_NHWC):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithAsymmetricPad_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -322,12 +344,14 @@ def init_test_case(self):
 
 
 class TestWithAsymmetricPad_NHWC_FP16(TestWithAsymmetricPad_NHWC):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithGroups_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -340,12 +364,14 @@ def init_test_case(self):
 
 
 class TestWithGroups_NHWC_FP16(TestWithGroups_NHWC):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithStride_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -358,12 +384,14 @@ def init_test_case(self):
 
 
 class TestWithStride_NHWC_FP16(TestWithStride_NHWC):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithDilation_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -376,12 +404,14 @@ def init_test_case(self):
 
 
 class TestWithDilation_NHWC_FP16(TestWithDilation_NHWC):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithEvenUpsample_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -395,12 +425,14 @@ def init_test_case(self):
 
 
 class TestWithEvenUpsample_NHWC_FP16(TestWithEvenUpsample_NHWC):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestWithEvenUpsample_NHWC_output_padding(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -415,64 +447,63 @@ def init_test_case(self):
 
 class TestWithEvenUpsample_NHWC_output_padding_FP16(
         TestWithEvenUpsample_NHWC_output_padding):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.need_check_grad = False
 
 
 class TestConv2DTransposeAPI(unittest.TestCase):
+
     def test_case1(self):
-        data1 = fluid.layers.data(
-            name='data1', shape=[3, 5, 5], dtype='float32')
-        data2 = fluid.layers.data(
-            name='data2', shape=[5, 5, 3], dtype='float32')
-        out1 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            data_format='NCHW')
-        out2 = fluid.layers.conv2d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            data_format='NHWC')
-        out3 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            data_format='NHWC')
-        out4 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=3,
-            num_filters=6,
-            filter_size=3,
-            padding=[[0, 0], [0, 0], [2, 1], [0, 0]],
-            data_format='NCHW')
-        out5 = fluid.layers.conv2d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding='SAME',
-            data_format='NCHW')
-        out6 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding='VALID',
-            data_format='NHWC')
-        out7 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            output_size=[7, 7],
-            padding=[0, 0],
-            data_format='NHWC')
+        data1 = fluid.layers.data(name='data1',
+                                  shape=[3, 5, 5],
+                                  dtype='float32')
+        data2 = fluid.layers.data(name='data2',
+                                  shape=[5, 5, 3],
+                                  dtype='float32')
+        out1 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             data_format='NCHW')
+        out2 = fluid.layers.conv2d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             data_format='NHWC')
+        out3 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding=[[0, 0], [1, 1], [1, 1],
+                                                      [0, 0]],
+                                             data_format='NHWC')
+        out4 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=3,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding=[[0, 0], [0, 0], [2, 1],
+                                                      [0, 0]],
+                                             data_format='NCHW')
+        out5 = fluid.layers.conv2d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding='SAME',
+                                             data_format='NCHW')
+        out6 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding='VALID',
+                                             data_format='NHWC')
+        out7 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             output_size=[7, 7],
+                                             padding=[0, 0],
+                                             data_format='NHWC')
 
         data1_np = np.random.random((2, 3, 5, 5)).astype("float32")
         data2_np = np.random.random((2, 5, 5, 3)).astype("float32")
@@ -480,12 +511,13 @@ def test_case1(self):
         place = core.NPUPlace(0)
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        results = exe.run(
-            fluid.default_main_program(),
-            feed={"data1": data1_np,
-                  "data2": data2_np},
-            fetch_list=[out1, out2, out3, out4, out5, out6, out7],
-            return_numpy=True)
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "data1": data1_np,
+                              "data2": data2_np
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5, out6, out7],
+                          return_numpy=True)
         self.assertIsNotNone(results[0])
         self.assertIsNotNone(results[1])
         self.assertIsNotNone(results[2])
@@ -496,6 +528,7 @@ def test_case1(self):
 
 
 class TestConv2DTransposeRepr(unittest.TestCase):
+
     def test_case(self):
         paddle.disable_static(paddle.NPUPlace(0))
         x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py
index d7821f0766926..779a75dddb412 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 import sys
+
 sys.path.append("..")
 import paddle
 import paddle.fluid.core as core
@@ -30,7 +31,9 @@
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0, 0]
             self.padding_algorithm = "SAME"
@@ -41,7 +44,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1, 1]
             self.padding_algorithm = "VALID"
@@ -52,7 +57,9 @@ def init_paddings(self):
 
 
 def create_test_channel_last_class(parent):
+
     class TestChannelLastCase(parent):
+
         def init_data_format(self):
             self.data_format = "NDHWC"
 
@@ -66,7 +73,9 @@ def init_test_case_2(self):
 
 
 def create_test_fp16_class(parent):
+
     class TestFp16Case(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -76,6 +85,7 @@ def init_dtype(self):
 
 
 class TestConv3DOp(OpTest):
+
     def setUp(self):
         self.op_type = "conv3d"
         self.set_npu()
@@ -97,7 +107,8 @@ def setUp(self):
             input,
             filter,
             self.groups,
-            conv3d_param, ).astype(self.dtype)
+            conv3d_param,
+        ).astype(self.dtype)
 
         self.inputs = {
             'Input': OpTest.np_dtype_to_fluid_dtype(input),
@@ -119,33 +130,30 @@ def test_check_grad(self):
         if self.dtype == np.float16:
             return
 
-        self.check_grad_with_place(
-            self.place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.03,
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
 
-        self.check_grad_with_place(
-            self.place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Filter']),
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
 
-        self.check_grad_with_place(
-            self.place, ['Filter'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Filter'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Input']),
+                                   numeric_place=paddle.CPUPlace())
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -173,6 +181,7 @@ def init_test_case(self):
 
 
 class TestCase1(TestConv3DOp):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -186,6 +195,7 @@ def init_test_case(self):
 
 
 class TestConv3DOp_2(OpTest):
+
     def setUp(self):
         self.op_type = "conv3d"
         self.set_npu()
@@ -231,33 +241,30 @@ def test_check_grad(self):
         if self.dtype == np.float16:
             return
 
-        self.check_grad_with_place(
-            self.place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.03,
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
 
-        self.check_grad_with_place(
-            self.place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Filter']),
+                                   numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
 
-        self.check_grad_with_place(
-            self.place, ['Filter'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']),
-            numeric_place=paddle.CPUPlace())
+        self.check_grad_with_place(self.place, ['Filter'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Input']),
+                                   numeric_place=paddle.CPUPlace())
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -291,6 +298,7 @@ def init_test_case_2(self):
 
 
 class TestConv3DOp_AsyPadding(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 2]
         self.input_size = [2, 3, 4, 4, 4]  # NCDHW
@@ -304,6 +312,7 @@ def init_paddings(self):
 
 
 class TestConv3DOp_DiffDataInDiffDim(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 2]
         self.input_size = [2, 3, 4, 5, 5]  # NCDHW
@@ -317,6 +326,7 @@ def init_paddings(self):
 
 
 class TestCase1_AsyPadding(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 4, 4, 4]  # NCDHW
@@ -331,210 +341,196 @@ def init_paddings(self):
 
 # --------- test python API ---------------
 class TestConv3DAPI(unittest.TestCase):
+
     def test_api(self):
 
-        input_NDHWC = fluid.layers.data(
-            name="input_NDHWC",
-            shape=[2, 5, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NCDHW = fluid.layers.data(
-            name="input_NCDHW",
-            shape=[2, 3, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        fluid.layers.conv3d(
-            input=input_NDHWC,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=0,
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=[1, 2, 1, 0, 1, 0],
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]],
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NDHWC,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]],
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NDHWC")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding="SAME",
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding="VALID",
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
+        input_NDHWC = fluid.layers.data(name="input_NDHWC",
+                                        shape=[2, 5, 5, 5, 3],
+                                        append_batch_size=False,
+                                        dtype="float32")
+
+        input_NCDHW = fluid.layers.data(name="input_NCDHW",
+                                        shape=[2, 3, 5, 5, 3],
+                                        append_batch_size=False,
+                                        dtype="float32")
+
+        fluid.layers.conv3d(input=input_NDHWC,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=0,
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=[1, 2, 1, 0, 1, 0],
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]],
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NDHWC,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]],
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NDHWC")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding="SAME",
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding="VALID",
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
 
 
 class TestConv3DAPI_Error(unittest.TestCase):
+
     def test_api(self):
-        input = fluid.layers.data(
-            name="input",
-            shape=[2, 5, 5, 5, 4],
-            append_batch_size=False,
-            dtype="float32")
+        input = fluid.layers.data(name="input",
+                                  shape=[2, 5, 5, 5, 4],
+                                  append_batch_size=False,
+                                  dtype="float32")
 
         # ValueError: cudnn
         def run_1():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=1,
-                use_cudnn=[0],
-                data_format="NCDHW")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=[0],
+                                data_format="NCDHW")
 
         self.assertRaises(ValueError, run_1)
 
         # ValueError: data_format
         def run_2():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3, 3],
-                stride=[1, 1, 1],
-                padding=0,
-                dilation=[1, 1, 1],
-                groups=1,
-                use_cudnn=False,
-                data_format="NCHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3, 3],
+                                stride=[1, 1, 1],
+                                padding=0,
+                                dilation=[1, 1, 1],
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCHWC")
 
         self.assertRaises(ValueError, run_2)
 
         # ValueError: padding
         def run_3():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding="SAMEE",
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NCDHW")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding="SAMEE",
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCDHW")
 
         self.assertRaises(ValueError, run_3)
 
         def run_4():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=[[0, 1], [0, 0], [0, 1], [0, 1], [0, 1]],
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NCDHW")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=[[0, 1], [0, 0], [0, 1], [0, 1], [0,
+                                                                          1]],
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCDHW")
 
         self.assertRaises(ValueError, run_4)
 
         def run_5():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=0,
-                stride=0,
-                padding=[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1]],
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=0,
+                                stride=0,
+                                padding=[[0, 1], [0, 1], [0, 1], [0, 1], [0,
+                                                                          1]],
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_5)
 
         # ValueError: channel dimmention
-        x = fluid.layers.data(
-            name="x",
-            shape=[2, 5, 5, 5, -1],
-            append_batch_size=False,
-            dtype="float32")
+        x = fluid.layers.data(name="x",
+                              shape=[2, 5, 5, 5, -1],
+                              append_batch_size=False,
+                              dtype="float32")
 
         def run_6():
-            fluid.layers.conv3d(
-                input=x,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=x,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_6)
 
         # ValueError: groups
         def run_7():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=3,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                groups=3,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_7)
 
         # ValueError: filter num
         def run_8():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=0,
-                filter_size=0,
-                stride=0,
-                padding=0,
-                dilation=0,
-                groups=1,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=0,
+                                filter_size=0,
+                                stride=0,
+                                padding=0,
+                                dilation=0,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_8)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py
index a4769442b083e..44baf7a547c00 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestCos(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "cos"
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestCosFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "cos"
@@ -81,6 +84,7 @@ def test_check_output(self):
 
 
 class TestCosNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -95,8 +99,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
             d = paddle.cos(c)
@@ -120,12 +125,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_crop_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_crop_op_npu.py
index 02168aeb71d3e..6398d7d1ed52d 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_crop_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_crop_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestCropOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -74,6 +76,7 @@ def test_check_output(self):
 
 
 class TestCase1(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (16, 8, 32)
         self.crop_shape = [2, 2, 3]
@@ -81,6 +84,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (15, 8)
         self.crop_shape = [15, 8]
@@ -88,6 +92,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (4, 10)
         self.crop_shape = [2, 3]
@@ -96,6 +101,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
@@ -103,6 +109,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
@@ -111,6 +118,7 @@ def initTestCase(self):
 
 
 class TestCase6(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
@@ -121,6 +129,7 @@ def initTestCase(self):
 
 
 class TestCase7(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
@@ -130,6 +139,7 @@ def initTestCase(self):
 
 
 class TestCase8(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
@@ -138,6 +148,7 @@ def initTestCase(self):
 
 
 class TestCase9(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
@@ -146,6 +157,7 @@ def initTestCase(self):
 
 
 class TestCase10(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
diff --git a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py
index 9289da6641e7d..9cf22adbb7591 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py
@@ -26,6 +26,7 @@
 
 
 class TestCumsumOp(unittest.TestCase):
+
     def run_cases(self):
         data_np = np.arange(12).reshape(3, 4)
         data = paddle.to_tensor(data_np)
@@ -96,6 +97,7 @@ def test_name(self):
 
 
 class TestNPUCumSumOp1(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.set_npu()
@@ -119,17 +121,18 @@ def init_testcase(self):
 
 
 class TestNPUCumSumOp2(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': -1, 'reverse': True}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.outputs = {
-            'Out': np.flip(
-                np.flip(
-                    self.inputs['X'], axis=2).cumsum(axis=2), axis=2)
+            'Out': np.flip(np.flip(self.inputs['X'], axis=2).cumsum(axis=2),
+                           axis=2)
         }
 
 
 class TestNPUCumSumOp3(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 1}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
@@ -137,6 +140,7 @@ def init_testcase(self):
 
 
 class TestNPUCumSumOp4(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 0}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
@@ -144,98 +148,107 @@ def init_testcase(self):
 
 
 class TestNPUCumSumOp5(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.inputs = {'X': np.random.random((5, 20)).astype(self.dtype)}
         self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)}
 
 
 class TestNPUCumSumOp7(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.inputs = {'X': np.random.random((100)).astype(self.dtype)}
         self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)}
 
 
 class TestNPUCumSumExclusive1(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 65)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive2(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((1, 1, 888)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive3(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 888)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive4(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((1, 1, 3049)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumExclusive5(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 3096)).astype(self.dtype)
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
 
 class TestNPUCumSumReverseExclusive(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': 2, 'reverse': True, "exclusive": True}
         a = np.random.random((4, 5, 6)).astype(self.dtype)
         self.inputs = {'X': a}
         a = np.flip(a, axis=2)
         self.outputs = {
-            'Out': np.concatenate(
-                (np.flip(
-                    a[:, :, :-1].cumsum(axis=2), axis=2), np.zeros(
-                        (4, 5, 1), dtype=self.dtype)),
+            'Out':
+            np.concatenate(
+                (np.flip(a[:, :, :-1].cumsum(axis=2),
+                         axis=2), np.zeros((4, 5, 1), dtype=self.dtype)),
                 axis=2)
         }
 
 
 class TestNPUCumSumWithFlatten1(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'flatten': True}
         self.inputs = {'X': np.random.random((5, 6)).astype(self.dtype)}
@@ -243,6 +256,7 @@ def init_testcase(self):
 
 
 class TestNPUCumSumWithFlatten2(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'flatten': True}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
@@ -251,21 +265,22 @@ def init_testcase(self):
 
 #----------------Cumsum Int64----------------
 class TestNPUCumSumOpInt64(TestNPUCumSumOp1):
+
     def init_testcase(self):
         self.attrs = {'axis': -1, 'reverse': True}
         self.inputs = {
-            'X': np.random.randint(
-                1, 10000, size=(5, 6, 10)).astype(self.dtype)
+            'X': np.random.randint(1, 10000, size=(5, 6, 10)).astype(self.dtype)
         }
         self.outputs = {
-            'Out': np.flip(
-                np.flip(
-                    self.inputs['X'], axis=2).cumsum(axis=2), axis=2)
+            'Out': np.flip(np.flip(self.inputs['X'], axis=2).cumsum(axis=2),
+                           axis=2)
         }
 
 
 def create_test_int64(parent):
+
     class TestCumSumInt64(parent):
+
         def init_dtype(self):
             self.dtype = np.int64
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py
index a190aa9b6f2be..7271644ce8294 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 import math
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestNpuDensityPriorBoxOp(OpTest):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
@@ -89,8 +91,8 @@ def init_test_params(self):
         if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
             for density in self.densities:
                 if len(self.fixed_ratios) > 0:
-                    self.num_priors += len(self.fixed_ratios) * (pow(density,
-                                                                     2))
+                    self.num_priors += len(self.fixed_ratios) * (pow(
+                        density, 2))
         self.offset = 0.5
         self.atol = 1e-5
 
@@ -149,6 +151,7 @@ def init_test_output(self):
 
 
 class TestNpuDensityPriorBoxFlatten(TestNpuDensityPriorBoxOp):
+
     def set_density(self):
         self.densities = [3, 4]
         self.fixed_sizes = [1.0, 2.0]
@@ -161,6 +164,7 @@ def set_density(self):
 
 
 class TestNpuDensityPriorBoxOp1(TestNpuDensityPriorBoxOp):
+
     def set_density(self):
         super(TestNpuDensityPriorBoxOp1, self).set_density()
         self.layer_w = 1
@@ -168,6 +172,7 @@ def set_density(self):
 
 
 class TestNpuDensityPriorBoxOp2(TestNpuDensityPriorBoxOp):
+
     def set_density(self):
         super(TestNpuDensityPriorBoxOp2, self).set_density()
         self.layer_w = 15
@@ -177,12 +182,14 @@ def set_density(self):
 
 
 class TestNpuDensityPriorBoxOp3(TestNpuDensityPriorBoxOp):
+
     def set_density(self):
         super(TestNpuDensityPriorBoxOp3, self).set_density()
         self.fixed_ratios = [1.0, 4.0]
 
 
 class TestNpuDensityPriorBoxOpFP16(TestNpuDensityPriorBoxOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py
index fea8502f2d766..bca1d631c8e55 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestDropoutOp(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.set_npu()
@@ -164,6 +166,7 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestDropoutOpInference2(TestDropoutOpInference):
+
     def setUp(self):
         self.op_type = "dropout"
         self.set_npu()
@@ -185,8 +188,7 @@ def setUp(self):
         self.init_dtype()
         self.inputs = {
             "X": np.random.random((32, 64)).astype(self.dtype),
-            "Seed": np.asarray(
-                [125], dtype="int32")
+            "Seed": np.asarray([125], dtype="int32")
         }
         self.attrs = {
             'dropout_prob': 0.0,
@@ -211,6 +213,7 @@ def set_npu(self):
 
 
 class TestDropoutAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace(), paddle.NPUPlace(0)]
@@ -218,36 +221,43 @@ def setUp(self):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[40, 40], dtype="float32")
-            res1 = paddle.nn.functional.dropout(
-                x=input, p=0., training=False, mode='upscale_in_train')
-            res2 = paddle.nn.functional.dropout(
-                x=input, p=0., axis=0, training=True, mode='upscale_in_train')
-            res3 = paddle.nn.functional.dropout(
-                x=input, p=0., axis=0, training=False, mode='upscale_in_train')
-            res4 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=True,
-                mode='upscale_in_train')
-            res5 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=False,
-                mode='upscale_in_train')
-            res6 = paddle.nn.functional.dropout(
-                x=input, p=1., training=True, mode='upscale_in_train')
+            res1 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                training=False,
+                                                mode='upscale_in_train')
+            res2 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=True,
+                                                mode='upscale_in_train')
+            res3 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=False,
+                                                mode='upscale_in_train')
+            res4 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=True,
+                                                mode='upscale_in_train')
+            res5 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=False,
+                                                mode='upscale_in_train')
+            res6 = paddle.nn.functional.dropout(x=input,
+                                                p=1.,
+                                                training=True,
+                                                mode='upscale_in_train')
             res7 = paddle.fluid.layers.dropout(
                 x=input,
                 dropout_prob=0.,
                 dropout_implementation='upscale_in_train')
-            res8 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=(0, 1),
-                training=False,
-                mode='upscale_in_train')
+            res8 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=(0, 1),
+                                                training=False,
+                                                mode='upscale_in_train')
 
             in_np = np.random.random([40, 40]).astype("float32")
             res_np = in_np
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
index f24c6c455a0cb..0883fca07943f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
@@ -16,6 +16,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 from paddle.fluid import Program, program_guard
@@ -28,6 +29,7 @@
 
 
 class TestElementwiseAddOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_add"
@@ -73,13 +75,15 @@ def test_check_grad_normal(self):
                 self.place,
                 ['X', 'Y'],
                 'Out',
-                max_relative_error=0.15, )
+                max_relative_error=0.15,
+            )
         else:
             self.check_grad_with_place(
                 self.place,
                 ['X', 'Y'],
                 'Out',
-                max_relative_error=0.006, )
+                max_relative_error=0.006,
+            )
 
     def test_check_grad_ingore_x(self):
         if self.dtype == np.int64:
@@ -91,14 +95,16 @@ def test_check_grad_ingore_x(self):
                 ['Y'],
                 'Out',
                 no_grad_set=set("X"),
-                max_relative_error=0.92, )
+                max_relative_error=0.92,
+            )
         else:
             self.check_grad_with_place(
                 self.place,
                 ['Y'],
                 'Out',
                 no_grad_set=set("X"),
-                max_relative_error=0.006, )
+                max_relative_error=0.006,
+            )
 
     def test_check_grad_ingore_y(self):
         if self.dtype == np.int64:
@@ -110,22 +116,26 @@ def test_check_grad_ingore_y(self):
                 ['X'],
                 'Out',
                 no_grad_set=set("Y"),
-                max_relative_error=0.8, )
+                max_relative_error=0.8,
+            )
         else:
             self.check_grad_with_place(
                 self.place,
                 ['X'],
                 'Out',
                 no_grad_set=set("Y"),
-                max_relative_error=0.006, )
+                max_relative_error=0.006,
+            )
 
 
 class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestINT64ElementwiseAddOp(TestElementwiseAddOp):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -133,6 +143,7 @@ def init_dtype(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -142,6 +153,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -151,6 +163,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
 class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1, 1).astype(self.dtype)
@@ -160,6 +173,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
 class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1, 1).astype(self.dtype)
@@ -167,6 +181,7 @@ def init_input_output(self):
 
 
 class TestAddAPI(unittest.TestCase):
+
     def test_name(self):
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
@@ -191,8 +206,10 @@ def test_static(self):
 
             place = paddle.NPUPlace(0)
             exe = paddle.static.Executor(place)
-            x_value, y_value, z_value = exe.run(feed={"x": x_np,
-                                                      "y": y_np},
+            x_value, y_value, z_value = exe.run(feed={
+                "x": x_np,
+                "y": y_np
+            },
                                                 fetch_list=[x, y, z])
 
             z_expected = np.array([3., 8., 6.])
@@ -211,24 +228,28 @@ def test_static(self):
 
 
 class TestAddError(unittest.TestCase):
+
     def test_errors(self):
         with paddle.static.program_guard(paddle.static.Program()):
             # the input of elementwise_add must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.NPUPlace(0))
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.NPUPlace(0))
             self.assertRaises(TypeError, paddle.add, x1, y1)
 
             # the input dtype must be float16 or float32 or float64 or int32 or int64
-            x2 = paddle.static.data(
-                name='x2', shape=[3, 4, 5, 6], dtype="uint8")
-            y2 = paddle.static.data(
-                name='y2', shape=[3, 4, 5, 6], dtype="uint8")
+            x2 = paddle.static.data(name='x2',
+                                    shape=[3, 4, 5, 6],
+                                    dtype="uint8")
+            y2 = paddle.static.data(name='y2',
+                                    shape=[3, 4, 5, 6],
+                                    dtype="uint8")
             self.assertRaises(TypeError, paddle.add, x2, y2)
 
 
 class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -236,6 +257,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -243,6 +265,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -253,6 +276,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -263,6 +287,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -273,6 +298,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -283,6 +309,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -290,6 +317,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -297,6 +325,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 1).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -307,6 +336,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -317,6 +347,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype)
         self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -327,6 +358,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype)
         self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -337,6 +369,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 12).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12).astype(self.dtype)
@@ -344,6 +377,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_5(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 12).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12).astype(self.dtype)
@@ -351,6 +385,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
         self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
@@ -358,6 +393,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
         self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
@@ -365,6 +401,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_6(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
         self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
@@ -372,6 +409,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -382,6 +420,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -394,6 +433,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 1).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -406,6 +446,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 1).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -416,6 +457,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -426,6 +468,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -436,6 +479,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -446,6 +490,7 @@ def init_axis(self):
 
 
 class TestElementwiseFP16AddOp_commonuse_add1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -456,6 +501,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
@@ -466,6 +512,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 12).astype(self.dtype)
         self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype)
@@ -476,6 +523,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_same_shape_ysize_large(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 1, 12).astype(self.dtype)
         self.y = np.random.rand(10, 2, 12).astype(self.dtype)
@@ -486,13 +534,14 @@ def init_axis(self):
 
 
 class TestElementwiseAddOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of elementwise_add must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.NPUPlace(0))
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.NPUPlace(0))
             self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
 
             # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64
@@ -503,6 +552,7 @@ def test_errors(self):
 
 
 class TestAddApi(unittest.TestCase):
+
     def _executed_api(self, x, y, name=None):
         return paddle.add(x, y, name)
 
@@ -546,11 +596,13 @@ def test_dygraph(self):
 
 
 class TestAddInplaceApi(TestAddApi):
+
     def _executed_api(self, x, y, name=None):
         return x.add_(y, name)
 
 
 class TestAddInplaceBroadcastSuccess(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 4).astype('float')
         self.y_numpy = np.random.rand(3, 4).astype('float')
@@ -567,18 +619,21 @@ def test_broadcast_success(self):
 
 
 class TestAddInplaceBroadcastSuccess2(TestAddInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(1, 2, 3, 1).astype('float')
         self.y_numpy = np.random.rand(3, 1).astype('float')
 
 
 class TestAddInplaceBroadcastSuccess3(TestAddInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 1, 5).astype('float')
         self.y_numpy = np.random.rand(1, 3, 1, 5).astype('float')
 
 
 class TestAddInplaceBroadcastError(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(3, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
@@ -597,12 +652,14 @@ def broadcast_shape_error():
 
 
 class TestAddInplaceBroadcastError2(TestAddInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 1, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
 
 
 class TestAddInplaceBroadcastError3(TestAddInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(5, 2, 1, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py
index 07c22868d5acc..9dcf4aa707ce5 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestElementwiseDiv(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_div"
@@ -60,7 +62,8 @@ def test_check_grad_normal(self):
             self.place,
             ['X', 'Y'],
             'Out',
-            max_relative_error=0.007, )
+            max_relative_error=0.007,
+        )
 
     def test_check_grad_ingore_x(self):
         self.check_grad_with_place(
@@ -68,14 +71,17 @@ def test_check_grad_ingore_x(self):
             ['Y'],
             'Out',
             max_relative_error=0.007,
-            no_grad_set=set("X"), )
+            no_grad_set=set("X"),
+        )
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', no_grad_set=set("Y"))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"))
 
 
 class TestElementwiseDivFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_div"
@@ -106,6 +112,7 @@ def test_check_output(self):
 
 
 class TestElementwiseDivNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -124,8 +131,9 @@ def _test(self, run_npu=True):
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
             c = paddle.static.data(name="c", shape=[32, 32], dtype='float32')
             d = paddle.static.data(name="d", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             e = paddle.multiply(a, b)
             f = paddle.multiply(c, d)
@@ -175,6 +183,7 @@ def test_npu(self):
 
 
 class TestFloatStatus(unittest.TestCase):
+
     def test_overflow(self):
         paddle.disable_static()
         paddle.set_device('npu')
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_floordiv_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_floordiv_op_npu.py
index 36d282a3d06f7..3edf270566dd4 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_floordiv_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_floordiv_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,6 +26,7 @@
 
 
 class TestElementwiseFloorDiv(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_floordiv"
         self.set_npu()
@@ -55,6 +57,7 @@ def test_check_output(self):
 
 
 class TestElementwiseFloorDiv2(TestElementwiseFloorDiv):
+
     def init_dtype(self):
         self.dtype = "int32"
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py
index cbfc07f354479..6d3683615978f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -80,6 +81,7 @@ def ComputeGrad(x, y, out, axis):
 
 
 class TestElementwiseMaxOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_max"
@@ -105,8 +107,8 @@ def init_dtype(self):
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
         sgn = np.random.choice([-1, 1], [13, 17]).astype(self.dtype)
-        self.y = self.x + sgn * np.random.uniform(0.1, 1,
-                                                  [13, 17]).astype(self.dtype)
+        self.y = self.x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype(
+            self.dtype)
         self.out = np.maximum(self.x, self.y)
 
     def init_axis(self):
@@ -119,15 +121,18 @@ def test_check_grad_normal(self):
         self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'], 'Out', no_grad_set=set("X"))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', no_grad_set=set("Y"))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"))
 
 
 class TestElementwiseMaxOp_int32(TestElementwiseMaxOp):
+
     def init_dtype(self):
         self.dtype = np.int32
 
@@ -143,6 +148,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestElementwiseMaxOp_scalar(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.random_integers(-5, 5, [2, 3, 20]).astype(self.dtype)
         self.y = np.array([0.5]).astype(self.dtype)
@@ -150,6 +156,7 @@ def init_input_output(self):
 
 
 class TestElementwiseMaxOp_vector(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -159,6 +166,7 @@ def init_input_output(self):
 
 
 class TestElementwiseMaxOp_broadcast_0(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.5, 1, (100, 5, 2)).astype(self.dtype)
         sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -171,6 +179,7 @@ def init_axis(self):
 
 
 class TestElementwiseMaxOp_broadcast_1(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(self.dtype)
         sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -183,22 +192,21 @@ def init_axis(self):
 
     def test_check_grad_ingore_x(self):
         _, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[dy])
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   user_defined_grads=[dy])
 
     def test_check_grad_ingore_y(self):
         dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            no_grad_set=set("Y"),
-            user_defined_grads=[dx])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"),
+                                   user_defined_grads=[dx])
 
 
 class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(self.dtype)
         sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -208,27 +216,27 @@ def init_input_output(self):
 
     def test_check_grad_normal(self):
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   user_defined_grads=[dx, dy])
 
     def test_check_grad_ingore_x(self):
         _, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[dy])
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   user_defined_grads=[dy])
 
     def test_check_grad_ingore_y(self):
         dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            no_grad_set=set("Y"),
-            user_defined_grads=[dx])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"),
+                                   user_defined_grads=[dx])
 
 
 class TestElementwiseMaxOp_broadcast_3(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.5, 1, (2, 50, 2, 1)).astype(self.dtype)
         sgn = np.random.choice([-1, 1], (50, 2)).astype(self.dtype)
@@ -241,6 +249,7 @@ def init_axis(self):
 
 
 class TestElementwiseMaxOp_broadcast_4(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(self.dtype)
         sgn = np.random.choice([-1, 1], (2, 3, 1, 5)).astype(self.dtype)
@@ -250,6 +259,7 @@ def init_input_output(self):
 
 
 class TestElementwiseMaxOp_broadcast_5(TestElementwiseMaxOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(self.dtype)
         sgn = np.random.choice([-1, 1], (2, 3, 1, 1)).astype(self.dtype)
@@ -259,6 +269,7 @@ def init_input_output(self):
 
 
 class TestElementwiseMaxNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -273,8 +284,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.maximum(a, b)
 
@@ -297,12 +309,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py
index e191224df81ee..2ddd7b4069d59 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestElementwiseMinOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_min"
@@ -51,8 +53,8 @@ def init_input_output(self):
         # to avoid them being too close to each other.
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
         self.sgn = np.random.choice([-1, 1], [13, 17]).astype(self.dtype)
-        self.y = self.x + self.sgn * np.random.uniform(
-            0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = self.x + self.sgn * np.random.uniform(0.1, 1, [13, 17]).astype(
+            self.dtype)
         self.out = np.minimum(self.x, self.y)
         self.axis = -1
 
@@ -64,59 +66,64 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X', 'Y'], 'Out', max_relative_error=0.5)
+            self.check_grad_with_place(self.place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=0.5)
         else:
             self.check_grad_with_place(
                 self.place,
                 ['X', 'Y'],
-                'Out', )
+                'Out',
+            )
 
     def test_check_grad_ingore_x(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['Y'],
-                'Out',
-                no_grad_set=set("X"),
-                max_relative_error=0.9)
+            self.check_grad_with_place(self.place, ['Y'],
+                                       'Out',
+                                       no_grad_set=set("X"),
+                                       max_relative_error=0.9)
         else:
             self.check_grad_with_place(
                 self.place,
                 ['Y'],
                 'Out',
-                no_grad_set=set("X"), )
+                no_grad_set=set("X"),
+            )
 
     def test_check_grad_ingore_y(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'],
-                'Out',
-                no_grad_set=set("Y"),
-                max_relative_error=0.1)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       no_grad_set=set("Y"),
+                                       max_relative_error=0.1)
         else:
             self.check_grad_with_place(
                 self.place,
                 ['X'],
                 'Out',
-                no_grad_set=set("Y"), )
+                no_grad_set=set("Y"),
+            )
 
 
 class TestElementwiseMinOpFp16(TestElementwiseMinOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestElementwiseMinOp_Vector(TestElementwiseMinOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(1, 2, (100, )).astype(self.dtype)
         self.sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
-        self.y = self.x + self.sgn * np.random.uniform(0.1, 1, (
-            100, )).astype(self.dtype)
+        self.y = self.x + self.sgn * np.random.uniform(0.1, 1, (100, )).astype(
+            self.dtype)
         self.out = np.minimum(self.x, self.y)
         self.axis = -1
 
 
 class TestElementwiseMinOpFp16_Vector(TestElementwiseMinOp_Vector):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -124,6 +131,7 @@ def init_dtype(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMinOp_scalar(TestElementwiseMinOp):
+
     def init_input_output(self):
         self.x = np.random.random_integers(-5, 5, [10, 3, 4]).astype(self.dtype)
         self.y = np.array([0.5]).astype(self.dtype)
@@ -134,11 +142,13 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMinOpFp16_scalar(TestElementwiseMinOp_scalar):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestElementwiseMinOp_broadcast(TestElementwiseMinOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(self.dtype)
         self.sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -149,11 +159,13 @@ def init_input_output(self):
 
 
 class TestElementwiseMinOpFp16_broadcast(TestElementwiseMinOp_broadcast):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestElementwiseMinOpNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -168,8 +180,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.minimum(a, b)
 
@@ -192,12 +205,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mod_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mod_op_npu.py
index d6551e84080a9..763f5db52b204 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mod_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mod_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 
@@ -29,6 +30,7 @@
 
 
 class TestElementwiseModOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -68,11 +70,13 @@ def test_check_output(self):
 
 
 class TestElementwiseModOpInt64(TestElementwiseModOp):
+
     def init_dtype(self):
         self.dtype = np.int64
 
 
 class TestElementwiseModOp_scalar(TestElementwiseModOp):
+
     def init_input_output(self):
         scale_x = random.randint(0, 100000000)
         scale_y = random.randint(1, 100000000)
@@ -82,6 +86,7 @@ def init_input_output(self):
 
 
 class TestElementwiseModOpFloat(TestElementwiseModOp):
+
     def init_dtype(self):
         self.dtype = np.float32
 
@@ -95,6 +100,7 @@ def test_check_output(self):
 
 
 class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
+
     def init_dtype(self):
         self.dtype = np.float64
 
@@ -103,6 +109,7 @@ def test_check_output(self):
 
 
 class TestElementwiseModOpFP16(TestElementwiseModOpFloat):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -111,6 +118,7 @@ def test_check_output(self):
 
 
 class TestElementwiseModOp_broadcast_0(TestElementwiseModOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -121,6 +129,7 @@ def init_axis(self):
 
 
 class TestElementwiseModOp_broadcast_1(TestElementwiseModOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -131,6 +140,7 @@ def init_axis(self):
 
 
 class TestElementwiseModOp_broadcast_2(TestElementwiseModOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -141,6 +151,7 @@ def init_axis(self):
 
 
 class TestRemainderOp(unittest.TestCase):
+
     def test_name(self):
         paddle.set_device('npu:0')
         with fluid.program_guard(fluid.Program()):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py
index 92bbc9f536d13..abdf43e98dbd8 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -26,6 +27,7 @@
 
 
 class ElementwiseMulOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -53,12 +55,14 @@ def test_check_grad_normal(self):
         self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'], 'Out', no_grad_set=set("X"))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', no_grad_set=set('Y'))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set('Y'))
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -75,6 +79,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMulOp_scalar(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -86,6 +91,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_Vector(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -97,6 +103,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -107,6 +114,7 @@ def init_axis(self):
 
 
 class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -122,6 +130,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -136,6 +145,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -151,6 +161,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -162,6 +173,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -175,11 +187,13 @@ def setUp(self):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "paddle is not compiled with NPU")
 class TestElementwiseMulOpFp16(ElementwiseMulOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -191,6 +205,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
@@ -202,6 +217,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_mul"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
index 907e149c8b2c3..f197f9bd381c7 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
@@ -20,6 +20,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 paddle.enable_static()
@@ -79,6 +80,7 @@ def ComputeGrad(x, y, out, axis):
 
 
 class TestElementwisePow(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_pow"
@@ -115,27 +117,27 @@ def init_input_output(self):
 
     def test_check_grad_normal(self):
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   user_defined_grads=[dx, dy])
 
     def test_check_grad_ingore_x(self):
         _, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[dy])
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   user_defined_grads=[dy])
 
     def test_check_grad_ingore_y(self):
         dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            no_grad_set=set("Y"),
-            user_defined_grads=[dx])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"),
+                                   user_defined_grads=[dx])
 
 
 class TestElementwisePowFp16(TestElementwisePow):
+
     def init_input_output(self):
         np.random.seed(SEED)
         self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
@@ -154,6 +156,7 @@ def test_check_output(self):
 
 
 class TestElementwisePowDouble(TestElementwisePow):
+
     def init_input_output(self):
         np.random.seed(SEED)
         self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
@@ -172,6 +175,7 @@ def test_check_output(self):
 
 
 class TestElementwisePowOp_broadcast_0(TestElementwisePow):
+
     def init_axis(self):
         self.axis = 1
 
@@ -183,27 +187,27 @@ def init_input_output(self):
 
     def test_check_grad_normal(self):
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   user_defined_grads=[dx, dy])
 
     def test_check_grad_ingore_x(self):
         _, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[dy])
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   user_defined_grads=[dy])
 
     def test_check_grad_ingore_y(self):
         dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            no_grad_set=set("Y"),
-            user_defined_grads=[dx])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"),
+                                   user_defined_grads=[dx])
 
 
 class TestElementwisePowOp_broadcast_1(TestElementwisePow):
+
     def init_axis(self):
         self.axis = 1
 
@@ -215,27 +219,27 @@ def init_input_output(self):
 
     def test_check_grad_normal(self):
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   user_defined_grads=[dx, dy])
 
     def test_check_grad_ingore_x(self):
         _, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[dy])
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   user_defined_grads=[dy])
 
     def test_check_grad_ingore_y(self):
         dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            no_grad_set=set("Y"),
-            user_defined_grads=[dx])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"),
+                                   user_defined_grads=[dx])
 
 
 class TestElementwisePowOp_broadcast_2(TestElementwisePow):
+
     def init_axis(self):
         self.axis = 0
 
@@ -247,27 +251,27 @@ def init_input_output(self):
 
     def test_check_grad_normal(self):
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   user_defined_grads=[dx, dy])
 
     def test_check_grad_ingore_x(self):
         _, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[dy])
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   user_defined_grads=[dy])
 
     def test_check_grad_ingore_y(self):
         dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis)
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            no_grad_set=set("Y"),
-            user_defined_grads=[dx])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   no_grad_set=set("Y"),
+                                   user_defined_grads=[dx])
 
 
 class TestElementwisePowNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -282,8 +286,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.pow(a, b)
 
@@ -306,12 +311,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py
index fac2bc66ff49b..58ccc04a0f47a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestElementwiseSubOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "elementwise_sub"
@@ -91,16 +93,19 @@ def test_check_output(self):
 
 
 class TestElementwiseSubOpInt32(TestElementwiseSubOp):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestElementwiseSubOpInt64(TestElementwiseSubOp):
+
     def init_dtype(self):
         self.dtype = np.int64
 
 
 class TestSubtractAPI(unittest.TestCase):
+
     def test_name(self):
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
@@ -125,8 +130,10 @@ def test_static(self):
 
             place = paddle.NPUPlace(0)
             exe = paddle.static.Executor(place)
-            x_value, y_value, z_value = exe.run(feed={"x": x_np,
-                                                      "y": y_np},
+            x_value, y_value, z_value = exe.run(feed={
+                "x": x_np,
+                "y": y_np
+            },
                                                 fetch_list=[x, y, z])
 
             z_expected = np.array([1., -2., 2.])
@@ -145,24 +152,28 @@ def test_static(self):
 
 
 class TestSubtractError(unittest.TestCase):
+
     def test_errors(self):
         with paddle.static.program_guard(paddle.static.Program()):
             # the input of elementwise_add must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.NPUPlace(0))
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.NPUPlace(0))
             self.assertRaises(TypeError, paddle.subtract, x1, y1)
 
             # the input dtype must be float16 or float32 or float64 or int32 or int64
-            x2 = paddle.static.data(
-                name='x2', shape=[3, 4, 5, 6], dtype="uint8")
-            y2 = paddle.static.data(
-                name='y2', shape=[3, 4, 5, 6], dtype="uint8")
+            x2 = paddle.static.data(name='x2',
+                                    shape=[3, 4, 5, 6],
+                                    dtype="uint8")
+            y2 = paddle.static.data(name='y2',
+                                    shape=[3, 4, 5, 6],
+                                    dtype="uint8")
             self.assertRaises(TypeError, paddle.subtract, x2, y2)
 
 
 class TestSubtractNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -177,8 +188,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             c = paddle.assign(b)
@@ -202,12 +214,13 @@ def _test(self, run_npu=True):
 
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py
index 6be2fe0086b12..288239801a12d 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -31,6 +31,7 @@
 
 
 class TestExpNPUOP(OpTest):
+
     def setUp(self):
 
         self.set_npu()
@@ -63,6 +64,7 @@ def set_npu(self):
 
 
 class TestExpNPUOPFloat64(TestExpNPUOP):
+
     def init_dtype(self):
         self.dtype = np.float64
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py
index 99edc25f7696a..6a1a67645f7cc 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestExpandAsOpRank1(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -50,6 +52,7 @@ def test_check_grad(self):
 
 
 class TestExpandAsOpRank2(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -73,6 +76,7 @@ def test_check_grad(self):
 
 
 class TestExpandAsOpRank3(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -96,6 +100,7 @@ def test_check_grad(self):
 
 
 class TestExpandAsOpRank4(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -120,24 +125,28 @@ def test_check_grad(self):
 
 # Test python API
 class TestExpandAsV2API(unittest.TestCase):
+
     def test_api(self):
         input1 = np.random.random([12, 14]).astype("float32")
         input2 = np.random.random([2, 12, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+        x = fluid.layers.data(name='x',
+                              shape=[12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
-        y = fluid.layers.data(
-            name='target_tensor',
-            shape=[2, 12, 14],
-            append_batch_size=False,
-            dtype="float32")
+        y = fluid.layers.data(name='target_tensor',
+                              shape=[2, 12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
         out_1 = paddle.expand_as(x, y=y)
 
         exe = fluid.Executor(place=fluid.NPUPlace(0))
         res_1 = exe.run(fluid.default_main_program(),
-                        feed={"x": input1,
-                              "target_tensor": input2},
+                        feed={
+                            "x": input1,
+                            "target_tensor": input2
+                        },
                         fetch_list=[out_1])
         assert np.array_equal(res_1[0], np.tile(input1, (2, 1, 1)))
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py
index 83b65630d801a..5613afe18273e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestExpand(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "expand"
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestExpandV2(TestExpand):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "expand"
@@ -82,6 +85,7 @@ def init_dtype(self):
 
 
 class TestExpandNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -94,8 +98,9 @@ def _test(self, run_npu=True):
 
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 1], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             res = paddle.fluid.layers.expand(a, [1, 32])
             loss = res.sum()
@@ -113,8 +118,10 @@ def _test(self, run_npu=True):
         for epoch in range(100):
 
             loss_res = exe.run(main_prog,
-                               feed={"a": a_np,
-                                     "label": label_np},
+                               feed={
+                                   "a": a_np,
+                                   "label": label_np
+                               },
                                fetch_list=[loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Loss: {}".format(epoch, loss))
@@ -134,6 +141,7 @@ def test_npu(self):
 
 
 class TestExpand_expand_times_all_one(TestExpand):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "expand"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py
index fd0b9850308b2..058f146de1225 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py
@@ -16,6 +16,7 @@
 import unittest
 import sys
 import numpy as np
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -29,6 +30,7 @@
 # CANN Op Support X: float16, float32, int32, int8 ,uint8
 # Situation 1: shape is a list(without tensor)
 class TestExpandV2NPUOpRank1(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -57,6 +59,7 @@ def test_check_grad(self):
 
 
 class TestExpandV2OpRank2_DimExpanding(TestExpandV2NPUOpRank1):
+
     def init_data(self):
         self.ori_shape = [120]
         self.shape = [2, 120]
@@ -64,6 +67,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank2(TestExpandV2NPUOpRank1):
+
     def init_data(self):
         self.ori_shape = [1, 140]
         self.shape = [12, 140]
@@ -71,6 +75,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank3_Corner(TestExpandV2NPUOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.shape = (2, 10, 5)
@@ -78,6 +83,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank4(TestExpandV2NPUOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 5, 7)
         self.shape = (-1, -1, -1, -1)
@@ -85,6 +91,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank5(TestExpandV2NPUOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 1, 15)
         self.shape = (2, -1, 4, -1)
@@ -92,6 +99,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank6(TestExpandV2NPUOpRank1):
+
     def init_data(self):
         self.ori_shape = (4, 1, 30)
         self.shape = (2, -1, 4, 30)
@@ -100,6 +108,7 @@ def init_data(self):
 
 # Situation 2: shape is a list(with tensor)
 class TestExpandV2OpNPURank1_tensor_attr(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -135,8 +144,9 @@ def test_check_grad(self):
         self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
-class TestExpandV2OpRank2_Corner_tensor_attr(
-        TestExpandV2OpNPURank1_tensor_attr):
+class TestExpandV2OpRank2_Corner_tensor_attr(TestExpandV2OpNPURank1_tensor_attr
+                                             ):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.expand_times = [1, 1]
@@ -146,6 +156,7 @@ def init_data(self):
 
 # Situation 3: shape is a tensor
 class TestExpandV2NPUOpRank1_tensor(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -179,6 +190,7 @@ def test_check_grad(self):
 # Situation 4: input x is float16
 # skip grad check for float16
 class TestExpandV2OpFloat(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -201,6 +213,7 @@ def test_check_output(self):
 # Situation 5: input x is int32
 # skip grad check for int32
 class TestExpandV2OpInteger(OpTest):
+
     def init_dtype(self):
         self.dtype = 'int32'
 
@@ -209,8 +222,7 @@ def setUp(self):
         self.place = paddle.NPUPlace(0)
         self.op_type = "expand_v2"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 20)).astype(self.dtype)
+            'X': np.random.randint(10, size=(2, 4, 20)).astype(self.dtype)
         }
         self.attrs = {'shape': [2, 4, 20]}
         output = np.tile(self.inputs['X'], (1, 1, 1))
@@ -225,11 +237,13 @@ def test_check_output(self):
 
 
 class TesstExpandV2OpInt64(TestExpandV2OpInteger):
+
     def init_dtype(self):
         self.dtype = 'int64'
 
 
 class TesstExpandV2OpBool(TestExpandV2OpInteger):
+
     def init_dtype(self):
         self.dtype = 'bool'
 
@@ -244,10 +258,11 @@ def setUp(self):
 
 
 class TestExpandV2Error(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], paddle.NPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         paddle.NPUPlace(0))
             shape = [2, 2]
             self.assertRaises(TypeError, paddle.tensor.expand, x1, shape)
             x2 = fluid.layers.data(name='x2', shape=[2], dtype="uint8")
@@ -259,21 +274,20 @@ def test_errors(self):
 
 # Test python API
 class TestExpandV2API(unittest.TestCase):
+
     def test_static(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = np.random.random([12, 14]).astype("float32")
-            x = fluid.layers.data(
-                name='x',
-                shape=[12, 14],
-                append_batch_size=False,
-                dtype="float32")
+            x = fluid.layers.data(name='x',
+                                  shape=[12, 14],
+                                  append_batch_size=False,
+                                  dtype="float32")
 
             positive_2 = fluid.layers.fill_constant([1], "int32", 12)
-            expand_shape = fluid.layers.data(
-                name="expand_shape",
-                shape=[2],
-                append_batch_size=False,
-                dtype="int32")
+            expand_shape = fluid.layers.data(name="expand_shape",
+                                             shape=[2],
+                                             append_batch_size=False,
+                                             dtype="int32")
 
             out_1 = paddle.expand(x, shape=[12, 14])
             out_2 = paddle.expand(x, shape=[positive_2, 14])
@@ -284,7 +298,8 @@ def test_static(self):
             exe = fluid.Executor(place=paddle.NPUPlace(0))
             res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
                                           feed={
-                                              "x": input,
+                                              "x":
+                                              input,
                                               "expand_shape":
                                               np.array([12, 14]).astype("int32")
                                           },
diff --git a/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py
index abe981399a962..210d27c37432f 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestEyeOp(OpTest):
+
     def setUp(self):
         '''
 	    Test eye op with specified shape
@@ -73,35 +75,41 @@ def test_check_output(self):
 
 
 class TestEyeOp1(TestEyeOp):
+
     def initTestCase(self):
         self.num_rows = 50
 
 
 class TestEyeOp2(TestEyeOp):
+
     def initTestCase(self):
         self.num_rows = 50
         self.dtype = np.int32
 
 
 class TestEyeOp3(TestEyeOp):
+
     def initTestCase(self):
         self.num_rows = 50
         self.dtype = np.float16
 
 
 class TestEyeOp4(TestEyeOp):
+
     def initTestCase(self):
         self.num_rows = 1
         self.num_columns = 99
 
 
 class TestEyeOp5(TestEyeOp):
+
     def initTestCase(self):
         self.num_rows = 100
         self.num_columns = 100
 
 
 class TestEyeOp6(TestEyeOp):
+
     def initTestCase(self):
         self.num_rows = 100
         self.num_columns = 100
@@ -109,6 +117,7 @@ def initTestCase(self):
 
 
 class API_TestTensorEye(unittest.TestCase):
+
     def test_out(self):
         with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py
index c3074db1aaff6..5b602ca7c0fa0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestFillAnyLikeNPUOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -52,40 +54,47 @@ def test_check_output(self):
 
 
 class TestFillAnyLikeNPUOpInt32(TestFillAnyLikeNPUOp):
+
     def init(self):
         self.dtype = np.int32
         self.value = -1
 
 
 class TestFillAnyLikeNPUOpInt64(TestFillAnyLikeNPUOp):
+
     def init(self):
         self.dtype = np.int64
         self.value = -1
 
 
 class TestFillAnyLikeNPUOpFloat32(TestFillAnyLikeNPUOp):
+
     def init(self):
         self.dtype = np.float32
         self.value = 0.09
 
 
 class TestFillAnyLikeNPUOpFloat16(TestFillAnyLikeNPUOp):
+
     def init(self):
         self.dtype = np.float16
         self.value = 0.05
 
 
 class TestFillAnyLikeNPUOpValue1(TestFillAnyLikeNPUOp):
+
     def init(self):
         self.value = 1.0
 
 
 class TestFillAnyLikeNPUOpValue2(TestFillAnyLikeNPUOp):
+
     def init(self):
         self.value = 1e-9
 
 
 class TestFillAnyLikeNPUOpShape(TestFillAnyLikeNPUOp):
+
     def init(self):
         self.shape = [12, 10]
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py
index 615fe6f7645f9..01130b682469a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestFillConstantBatchSizeLike(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -84,6 +86,7 @@ def test_check_output(self):
 
 
 class TestFillConstantBatchSizeLike2(TestFillConstantBatchSizeLike):
+
     def init_shape(self):
         # test shape
         self.input_shape = [4, 5, 6, 7]
@@ -92,6 +95,7 @@ def init_shape(self):
 
 
 class TestFillConstantBatchSizeLike3(TestFillConstantBatchSizeLike):
+
     def init_value(self):
         # use 'str_value' rather than 'value'
         self.value = 3.8
@@ -100,6 +104,7 @@ def init_value(self):
 
 
 class TestFillConstantBatchSizeLike4(TestFillConstantBatchSizeLike):
+
     def init_value(self):
         # str_value = 'inf'
         self.value = 3.8
@@ -108,6 +113,7 @@ def init_value(self):
 
 
 class TestFillConstantBatchSizeLike5(TestFillConstantBatchSizeLike):
+
     def init_value(self):
         # str_value = '-inf'
         self.value = 3.8
@@ -116,6 +122,7 @@ def init_value(self):
 
 
 class TestFillConstantBatchSizeLike6(TestFillConstantBatchSizeLike):
+
     def init_dtype(self):
         self.dtype = core.VarDesc.VarType.FP16
         self.output_dtype = np.float16
@@ -125,17 +132,20 @@ def test_check_output(self):
 
 
 class TestFillConstantBatchSizeLike7(TestFillConstantBatchSizeLike):
+
     def init_dtype(self):
         self.dtype = core.VarDesc.VarType.INT32
         self.output_dtype = np.int32
 
 
 class TestFillConstantBatchSizeLike8(TestFillConstantBatchSizeLike):
+
     def init_force_cpu(self):
         self.force_cpu = True
 
 
 class TestFillConstantBatchSizeLike9(TestFillConstantBatchSizeLike):
+
     def init_shape(self):
         self.input_shape = [4, 5]
         self.shape = [123, 92]
diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py
index 152a454805576..d661f953cf9a1 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestFillConstant(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -49,6 +51,7 @@ def test_check_output(self):
 
 
 class TestFillConstantInt(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -73,6 +76,7 @@ def test_check_output(self):
 
 
 class TestFillConstantInt64(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -97,6 +101,7 @@ def test_check_output(self):
 
 
 class TestFillConstantFP16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -121,6 +126,7 @@ def test_check_output(self):
 
 
 class TestFillConstantBool(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -145,6 +151,7 @@ def test_check_output(self):
 
 
 class TestFillConstantWithPlaceType(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py
index e00aa6971ebeb..f9f338a731079 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,6 +26,7 @@
 
 
 class TestFillZerosLikeOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -44,26 +46,31 @@ def test_check_output(self):
 
 
 class TestFillZerosLikeOpBool(TestFillZerosLikeOp):
+
     def init_dtype(self):
         self.dtype = np.bool
 
 
 class TestFillZerosLikeOpFp16(TestFillZerosLikeOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestFillZerosLikeOpFp64(TestFillZerosLikeOp):
+
     def init_dtype(self):
         self.dtype = np.float64
 
 
 class TestFillZerosLikeOpInt32(TestFillZerosLikeOp):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestFillZerosLikeOpInt64(TestFillZerosLikeOp):
+
     def init_dtype(self):
         self.dtype = np.int64
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_flags_check_nan_inf_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flags_check_nan_inf_npu.py
index 66c39062dc7ed..69c586fb2d884 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_flags_check_nan_inf_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_flags_check_nan_inf_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestCheckFiniteAndUnscale(unittest.TestCase):
+
     def setUp(self):
         fluid.set_flags({'FLAGS_check_nan_inf': True})
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py
index acd7ca770164e..a415c8be71c49 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py
@@ -16,15 +16,18 @@
 
 import unittest
 import sys
+
 sys.path.append("..")
 import numpy as np
 import paddle
 import paddle.fluid as fluid
 from op_test import OpTest
+
 paddle.enable_static()
 
 
 class TestFlatten2Op(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "flatten2"
@@ -56,6 +59,7 @@ def init_attrs(self):
 
 
 class TestFlatten2OpWithCornerAxis(TestFlatten2Op):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.axis = 0
@@ -63,6 +67,7 @@ def init_test_case(self):
 
 
 class TestFlatten2OpWithDefaultAxis(TestFlatten2Op):
+
     def init_test_case(self):
         self.in_shape = (10, 2, 2, 3)
         self.new_shape = (10, 12)
@@ -72,6 +77,7 @@ def init_attrs(self):
 
 
 class TestFlatten2OpSixDims(TestFlatten2Op):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
diff --git a/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py
index 742d156c7f5f1..3f90c8b19b478 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestFlattenOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "flatten_contiguous_range"
@@ -65,6 +67,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 1
@@ -79,6 +82,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_2(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -93,6 +97,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_3(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -107,6 +112,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_4(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = -2
@@ -121,6 +127,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_5(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 2
@@ -135,6 +142,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.start_axis = 3
@@ -149,6 +157,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_Float32(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -164,6 +173,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_int32(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -182,6 +192,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_uint8(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -200,6 +211,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_int8(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -218,6 +230,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_int64(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -236,6 +249,7 @@ def test_check_grad(self):
 
 
 class TestFlatten2OpError(unittest.TestCase):
+
     def test_errors(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
@@ -243,22 +257,25 @@ def test_errors(self):
         x = x.astype('float32')
 
         def test_ValueError1():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             out = paddle.flatten(x_var, start_axis=2, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError1)
 
         def test_ValueError2():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=10, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError2)
 
         def test_ValueError3():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=2, stop_axis=10)
 
         self.assertRaises(ValueError, test_ValueError3)
@@ -268,8 +285,9 @@ def test_type():
             x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
                            image_shape[3]).reshape(image_shape) / 100.
             x2 = x2.astype('float16')
-            x2_var = paddle.fluid.data(
-                name='x2', shape=[3, 2, 4, 5], dtype='float16')
+            x2_var = paddle.fluid.data(name='x2',
+                                       shape=[3, 2, 4, 5],
+                                       dtype='float16')
             paddle.flatten(x2_var)
 
         self.assertRaises(TypeError, test_type)
@@ -281,6 +299,7 @@ def test_InputError():
 
 
 class TestStaticFlattenPythonAPI(unittest.TestCase):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return paddle.flatten(x, start_axis, stop_axis)
 
@@ -290,8 +309,9 @@ def test_static_api(self):
 
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=[2, 3, 4, 4], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[2, 3, 4, 4],
+                                   dtype='float32')
             out = self.execute_api(x, start_axis=-2, stop_axis=-1)
 
         exe = paddle.static.Executor(place=paddle.NPUPlace(0))
@@ -300,11 +320,13 @@ def test_static_api(self):
 
 
 class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return x.flatten_(start_axis, stop_axis)
 
 
 class TestFlattenPython(unittest.TestCase):
+
     def test_python_api(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
diff --git a/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py
index 206641dab5c13..71764aad47c22 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_float_status_op_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -22,6 +23,7 @@
 
 
 class TestGetFloatStatusOp(unittest.TestCase):
+
     def setUp(self):
         device = paddle.set_device('npu')
 
@@ -62,6 +64,7 @@ def test_not_contains_nan_inf(self):
 
 
 class TestClearFloatStatusOp(unittest.TestCase):
+
     def setUp(self):
         device = paddle.set_device('npu')
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py
index acb4ffd686fa2..5f33d7358161a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -38,6 +39,7 @@ def gather_nd_grad(x, index):
 
 
 def test_class1(op_type, typename):
+
     class TestGatherNdOpWithEmptyIndex(OpTest):
         #Index has empty element, which means copy entire tensor
 
@@ -72,7 +74,9 @@ def test_check_grad(self):
 
 
 def test_class2(op_type, typename):
+
     class TestGatherNdOpWithIndex1(OpTest):
+
         def setUp(self):
             self.set_npu()
             self.place = paddle.NPUPlace(0)
@@ -99,6 +103,7 @@ def test_check_grad(self):
 
 
 def test_class3(op_type, typename):
+
     class TestGatherNdOpWithLowIndex(OpTest):
         #Index has low rank, X has high rank
 
@@ -123,8 +128,9 @@ def test_check_grad(self):
             if typename == "float16" or typename == "int64":
                 self.__class__.no_need_check_grad = True
             else:
-                self.check_grad_with_place(
-                    self.place, ['X'], 'Out', user_defined_grads=[self.x_grad])
+                self.check_grad_with_place(self.place, ['X'],
+                                           'Out',
+                                           user_defined_grads=[self.x_grad])
 
     cls_name = "{0}_{1}_3".format(op_type, typename)
     TestGatherNdOpWithLowIndex.__name__ = cls_name
@@ -132,6 +138,7 @@ def test_check_grad(self):
 
 
 def test_class4(op_type, typename):
+
     class TestGatherNdOpIndex1(OpTest):
         #Index has low rank, X has high rank
 
@@ -164,6 +171,7 @@ def test_check_grad(self):
 
 
 def test_class5(op_type, typename):
+
     class TestGatherNdOpWithSameIndexAsX(OpTest):
         #Index has same rank as X's rank
 
@@ -195,6 +203,7 @@ def test_check_grad(self):
 
 
 def test_class6(op_type, typename):
+
     class TestGatherNdOpWithHighRankSame(OpTest):
         #Both Index and X have high rank, and Rank(Index) = Rank(X)
 
@@ -204,8 +213,8 @@ def setUp(self):
             self.op_type = "gather_nd"
             shape = (5, 2, 3, 1, 10)
             xnp = np.random.rand(*shape).astype(typename)
-            index = np.vstack([np.random.randint(
-                0, s, size=2) for s in shape]).T
+            index = np.vstack([np.random.randint(0, s, size=2)
+                               for s in shape]).T
 
             self.inputs = {'X': xnp, 'Index': index.astype("int32")}
             self.outputs = {'Out': xnp[tuple(index.T)]}
@@ -228,6 +237,7 @@ def test_check_grad(self):
 
 
 def test_class7(op_type, typename):
+
     class TestGatherNdOpWithHighRankDiff(OpTest):
         #Both Index and X have high rank, Rank(Index) < Rank(X)
 
@@ -238,8 +248,7 @@ def setUp(self):
             shape = (2, 3, 4, 1, 10)
             xnp = np.random.rand(*shape).astype(typename)
             index = np.vstack(
-                [np.random.randint(
-                    0, s, size=200) for s in shape]).T
+                [np.random.randint(0, s, size=200) for s in shape]).T
             index_re = index.reshape([20, 5, 2, 5])
 
             self.inputs = {'X': xnp, 'Index': index_re.astype("int32")}
@@ -263,6 +272,7 @@ def test_check_grad(self):
 
 
 class TestGatherNdAPI(unittest.TestCase):
+
     def test_imperative(self):
         paddle.disable_static()
         input_1 = np.array([[1, 2], [3, 4], [5, 6]])
diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
index daca3d884600a..28b8ab9b25f93 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -35,6 +36,7 @@ def gather_numpy(x, index, axis):
 
 
 class TestGatherOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -58,7 +60,8 @@ def test_check_grad(self):
             self.place,
             ['X'],
             'Out',
-            max_relative_error=0.006, )
+            max_relative_error=0.006,
+        )
 
     def config(self):
         """
@@ -71,6 +74,7 @@ def config(self):
 
 
 class TestCase1(TestGatherOp):
+
     def config(self):
         """
         For one dimension input
@@ -82,6 +86,7 @@ def config(self):
 
 
 class API_TestGather(unittest.TestCase):
+
     def test_out1(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float32')
@@ -91,8 +96,10 @@ def test_out1(self):
             exe = fluid.Executor(place)
             input = np.array([[1, 2], [3, 4], [5, 6]])
             index_1 = np.array([1, 2])
-            result, = exe.run(feed={"data1": input,
-                                    "index": index_1},
+            result, = exe.run(feed={
+                "data1": input,
+                "index": index_1
+            },
                               fetch_list=[out])
             expected_output = np.array([[3, 4], [5, 6]])
         self.assertTrue(np.allclose(result, expected_output))
@@ -107,14 +114,17 @@ def test_out2(self):
             exe = paddle.static.Executor(place)
             x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype('float32')
             index_np = np.array([1, 1]).astype('int32')
-            result, = exe.run(feed={"x": x_np,
-                                    "index": index_np},
+            result, = exe.run(feed={
+                "x": x_np,
+                "index": index_np
+            },
                               fetch_list=[out])
             expected_output = gather_numpy(x_np, index_np, axis=0)
         self.assertTrue(np.allclose(result, expected_output))
 
 
 class TestGatherGrad(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -127,8 +137,9 @@ def _test(self, run_npu=True):
 
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[8192, 768], dtype='float32')
-            index = paddle.static.data(
-                name="index", shape=[1232, 1], dtype='int32')
+            index = paddle.static.data(name="index",
+                                       shape=[1232, 1],
+                                       dtype='int32')
             a.stop_gradient = False
             b = paddle.gather(a, index)
 
@@ -148,8 +159,10 @@ def _test(self, run_npu=True):
         for epoch in range(100):
 
             pred_res, loss_res = exe.run(main_prog,
-                                         feed={"a": a_np,
-                                               "index": index_np},
+                                         feed={
+                                             "a": a_np,
+                                             "index": index_np
+                                         },
                                          fetch_list=[b, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
diff --git a/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py
index 11f64b8fc7d26..470982b9e70eb 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 import numpy as np
+
 sys.path.append("..")
 import paddle
 import paddle.fluid as fluid
@@ -27,6 +28,7 @@
 
 
 class TestNPUGaussianRandomOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "gaussian_random"
@@ -68,10 +70,8 @@ def verify_output(self, outs):
         hist2, _ = np.histogram(data, range=(-3, 5))
         hist2 = hist2.astype("float32")
         hist2 /= float(outs[0].size)
-        self.assertTrue(
-            np.allclose(
-                hist, hist2, rtol=0, atol=0.01),
-            "hist: " + str(hist) + " hist2: " + str(hist2))
+        self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01),
+                        "hist: " + str(hist) + " hist2: " + str(hist2))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py
index 760ce59812ea2..a779e797808a0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py
@@ -18,6 +18,7 @@
 from scipy import special
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -33,6 +34,7 @@ def np_gelu(x):
 
 
 class TestGelu(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "gelu"
@@ -57,11 +59,13 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-3)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', max_relative_error=0.007)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.007)
 
 
 class TestGeluFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "gelu"
@@ -88,6 +92,7 @@ def test_check_output(self):
 
 
 class TestGeluNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -102,8 +107,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
 
@@ -127,12 +133,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py
index 9ab1161be36dd..a5830325c83a0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 
 import sys
+
 sys.path.append("..")
 
 from operator import mul
@@ -46,6 +47,7 @@ def group_norm_naive(x, scale, bias, epsilon, groups, data_layout):
 
 
 class TestGroupNormOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
 
@@ -57,8 +59,9 @@ def test_x_type():
             self.assertRaises(TypeError, test_x_type)
 
             def test_x_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[2, 100, 3, 5], dtype='int32')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[2, 100, 3, 5],
+                                       dtype='int32')
                 groups = 2
                 fluid.layers.group_norm(x2, groups)
 
@@ -66,6 +69,7 @@ def test_x_dtype():
 
 
 class TestGroupNormOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = 'group_norm'
@@ -86,9 +90,10 @@ def setUp(self):
             input = np.transpose(input, (0, 2, 3, 1))
         scale = np.random.random([self.shape[1]]).astype(self.dtype)
         bias = np.random.random([self.shape[1]]).astype(self.dtype)
-        output, mean, var = group_norm_naive(
-            input, scale, bias, self.attrs['epsilon'], self.attrs['groups'],
-            self.data_format)
+        output, mean, var = group_norm_naive(input, scale, bias,
+                                             self.attrs['epsilon'],
+                                             self.attrs['groups'],
+                                             self.data_format)
 
         self.inputs = {
             'X': OpTest.np_dtype_to_fluid_dtype(input),
@@ -130,45 +135,53 @@ def init_test_case(self):
 
 
 class TestGroupNormOp1(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
 
 
 class TestGroupNormOp2(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
 
 
 class TestGroupNormOpBigEps1(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
         self.attrs['epsilon'] = 0.5
 
 
 class TestGroupNormOpBigEps2(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
         self.attrs['epsilon'] = 0.5
 
 
 class TestGroupNormOpBigEps3(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['epsilon'] = 0.5
 
 
 class TestGroupNormOp1_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
         self.data_format = "NHWC"
 
 
 class TestGroupNormOp2_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
         self.data_format = "NHWC"
 
 
 class TestGroupNormOpBigEps1_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
         self.attrs['epsilon'] = 0.5
@@ -176,6 +189,7 @@ def init_test_case(self):
 
 
 class TestGroupNormOpBigEps2_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
         self.attrs['epsilon'] = 0.5
@@ -183,17 +197,20 @@ def init_test_case(self):
 
 
 class TestGroupNormOpBigEps3_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['epsilon'] = 0.5
         self.data_format = "NHWC"
 
 
 class TestGroupNormOpFP16(TestGroupNormOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestGroupNormOpFP16_With_NHWC(TestGroupNormOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -207,8 +224,9 @@ def test_exception(self):
         data = fluid.data(name='data', shape=[None, 3, 3, 4], dtype="float64")
 
         def attr_data_format():
-            out = fluid.layers.group_norm(
-                input=data, groups=2, data_layout="NDHW")
+            out = fluid.layers.group_norm(input=data,
+                                          groups=2,
+                                          data_layout="NDHW")
 
         self.assertRaises(ValueError, attr_data_format)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py
index f1d89cb8d561b..a83618392a1d1 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py
@@ -32,6 +32,7 @@ def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5):
 
 
 class TestNPUHardSigmoid(OpTest):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -74,18 +75,21 @@ def set_attrs(self):
 
 
 class TestNPUHardSigmoid2(TestNPUHardSigmoid):
+
     def set_attrs(self):
         self.slope = 0.2
         self.offset = 0.5
 
 
 class TestNPUHardSigmoid3(TestNPUHardSigmoid):
+
     def set_attrs(self):
         self.slope = 0.2
         self.offset = 0.4
 
 
 class TestNPUHardSigmoidFp16(TestNPUHardSigmoid):
+
     def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-3)
 
@@ -142,12 +146,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.hardsigmoid, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.hardsigmoid, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.hardsigmoid(x_fp16)
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py
index 9495cdb8a55aa..4e83700da78a1 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -33,6 +34,7 @@ def ref_hard_swish_grad(x, threshold=6.0, scale=6.0, offset=3.0):
 
 
 class TestHardSwishNPU(OpTest):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -48,8 +50,10 @@ def setUp(self):
         #the same with TestAbs
         x[np.abs(x + offset) < 0.005] = 0.02
         x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02
-        out = (x * (np.minimum(np.maximum(x + offset, 0.), threshold) /
-                    scale)).astype(self.dtype)
+        out = (
+            x *
+            (np.minimum(np.maximum(x + offset, 0.), threshold) / scale)).astype(
+                self.dtype)
         self.x_grad = ref_hard_swish_grad(x, threshold, scale, offset)
 
         self.inputs = {'X': x}
@@ -67,14 +71,16 @@ def test_check_output(self):
 
     def test_check_grad(self):
         # There is a problem that precision of grad result using float32
-        # can't satisfy the default precision requirement 
-        # when compared with numeric_grads, but the results on 
+        # can't satisfy the default precision requirement
+        # when compared with numeric_grads, but the results on
         # NPU and CPU are same (verified in TestHardSwishNPUWithCPU)
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', user_defined_grads=[self.x_grad])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   user_defined_grads=[self.x_grad])
 
 
 class TestHardSwishNPUFp16(TestHardSwishNPU):
+
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
@@ -84,6 +90,7 @@ def init_dtype(self):
 
 # test the result of hard_swish and hard_swish_grad on CPU and NPU
 class TestHardSwishNPUWithCPU(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py
index a9c195bb8cd29..a8fe42b294f57 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -37,6 +38,7 @@ def huber_loss_forward(val, delta):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestHuberLossOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = 'huber_loss'
@@ -84,36 +86,38 @@ def test_check_grad_normal(self):
         self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            max_relative_error=0.008,
-            no_grad_set=set("residual"))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   max_relative_error=0.008,
+                                   no_grad_set=set("residual"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            max_relative_error=0.008,
-            no_grad_set=set('residual'))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.008,
+                                   no_grad_set=set('residual'))
 
 
 def TestHuberLossOp1(TestHuberLossOp):
+
     def set_shape(self):
         return (64)
 
 
 def TestHuberLossOp2(TestHuberLossOp):
+
     def set_shape(self):
         return (6, 6)
 
 
 def TestHuberLossOp3(TestHuberLossOp):
+
     def set_shape(self):
         return (6, 6, 1)
 
 
 def TestHuberLossOpFP16(TestHuberLossOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -121,6 +125,7 @@ def init_dtype(self):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestHuberLossOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input and label must be Variable
diff --git a/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py
index 626dbfc52a715..e86f562539876 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -30,6 +31,7 @@
 
 
 class TestIncrement(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(NPUPlace)
@@ -56,6 +58,7 @@ def test_check_output(self):
 
 
 class TestIncrementFP16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(NPUPlace)
@@ -82,6 +85,7 @@ def test_check_output(self):
 
 
 class TestIncrementINT64(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(NPUPlace)
@@ -108,6 +112,7 @@ def test_check_output(self):
 
 
 class TestIncrementInplace(unittest.TestCase):
+
     def test_npu(self):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -126,7 +131,9 @@ def test_npu(self):
         exe = paddle.static.Executor(place)
         exe.run(startup_prog)
 
-        b_value = exe.run(main_prog, feed={"a": a_np, }, fetch_list=[b])
+        b_value = exe.run(main_prog, feed={
+            "a": a_np,
+        }, fetch_list=[b])
 
         print('input a id is : {}'.format(id(a)))
         print('input b id is : {}'.format(id(b)))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py
index 9b890d22ada79..6ce647efc2d1b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_index_sample_op_npu.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 import numpy as np
+
 sys.path.append("..")
 
 from op_test import OpTest
@@ -27,6 +28,7 @@
 
 
 class TestIndexSampleOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -35,9 +37,10 @@ def setUp(self):
         self.op_type = "index_sample"
         self.config()
         xnp = np.random.random(self.x_shape).astype(self.dtype)
-        indexnp = np.random.randint(
-            low=0, high=self.x_shape[1],
-            size=self.index_shape).astype(self.index_type)
+        indexnp = np.random.randint(low=0,
+                                    high=self.x_shape[1],
+                                    size=self.index_shape).astype(
+                                        self.index_type)
         self.inputs = {'X': xnp, 'Index': indexnp}
         index_array = []
         for i in range(self.index_shape[0]):
@@ -64,6 +67,7 @@ def config(self):
 
 
 class TestCase1(TestIndexSampleOp):
+
     def config(self):
         """
         For one dimension input
@@ -75,6 +79,7 @@ def config(self):
 
 
 class TestCase2(TestIndexSampleOp):
+
     def config(self):
         """
         For int64_t index type
@@ -86,6 +91,7 @@ def config(self):
 
 
 class TestCase3(TestIndexSampleOp):
+
     def config(self):
         """
         For int index type
@@ -97,6 +103,7 @@ def config(self):
 
 
 class TestCase4(TestIndexSampleOp):
+
     def config(self):
         """
         For int64 index type
@@ -108,6 +115,7 @@ def config(self):
 
 
 class TestCase5(TestIndexSampleOp):
+
     def config(self):
         """
         For float16 x type
@@ -123,6 +131,7 @@ def test_check_grad(self):
 
 
 class TestCase6(TestCase5):
+
     def config(self):
         """
         For int32 x type
@@ -135,6 +144,7 @@ def config(self):
 
 
 class TestCase7(TestCase5):
+
     def config(self):
         """
         For int64 x type
@@ -147,6 +157,7 @@ def config(self):
 
 
 class TestIndexSampleShape(unittest.TestCase):
+
     def test_shape(self):
         paddle.enable_static()
         # create x value
@@ -157,8 +168,8 @@ def test_shape(self):
         # create index value
         index_shape = (2, 3)
         index_type = "int32"
-        index_np = np.random.randint(
-            low=0, high=x_shape[1], size=index_shape).astype(index_type)
+        index_np = np.random.randint(low=0, high=x_shape[1],
+                                     size=index_shape).astype(index_type)
 
         x = fluid.data(name='x', shape=[-1, 5], dtype='float32')
         index = fluid.data(name='index', shape=[-1, 3], dtype='int32')
@@ -173,18 +184,18 @@ def test_shape(self):
 
 
 class TestIndexSampleDynamic(unittest.TestCase):
+
     def test_result(self):
         with fluid.dygraph.guard(paddle.NPUPlace(0)):
-            x = paddle.to_tensor(
-                [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
-                 [9.0, 10.0, 11.0, 12.0]],
-                dtype='float32')
-            index = paddle.to_tensor(
-                [[0, 1, 2], [1, 2, 3], [0, 0, 0]], dtype='int32')
+            x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
+                                  [9.0, 10.0, 11.0, 12.0]],
+                                 dtype='float32')
+            index = paddle.to_tensor([[0, 1, 2], [1, 2, 3], [0, 0, 0]],
+                                     dtype='int32')
             out_z1 = paddle.index_sample(x, index)
 
-            except_output = np.array(
-                [[1.0, 2.0, 3.0], [6.0, 7.0, 8.0], [9.0, 9.0, 9.0]])
+            except_output = np.array([[1.0, 2.0, 3.0], [6.0, 7.0, 8.0],
+                                      [9.0, 9.0, 9.0]])
             assert out_z1.numpy().all() == except_output.all()
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py
index 57293ad5e5633..5428bf1e6571f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestNPUIndexSelect(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -34,11 +36,10 @@ def setUp(self):
         self.config()
 
         x_np = np.random.random(self.x_shape).astype(self.x_type)
-        index_np = np.random.randint(
-            low=0,
-            high=self.x_shape[self.dim],
-            size=self.index_size,
-            dtype=self.index_type)
+        index_np = np.random.randint(low=0,
+                                     high=self.x_shape[self.dim],
+                                     size=self.index_size,
+                                     dtype=self.index_type)
 
         # compute real output as baseline.
         outer_loop = np.prod(self.x_shape[:self.dim])
@@ -77,6 +78,7 @@ def config(self):
 
 
 class TestNPUIndexSelectCase2(TestNPUIndexSelect):
+
     def config(self):
         self.dim = -2
         self.x_type = np.float32
@@ -86,6 +88,7 @@ def config(self):
 
 
 class TestNPUIndexSelectCase3(TestNPUIndexSelect):
+
     def config(self):
         self.dim = 0
         self.x_type = np.float32
@@ -95,6 +98,7 @@ def config(self):
 
 
 class TestNPUIndexSelectCase4(TestNPUIndexSelect):
+
     def config(self):
         self.dim = -1
         self.x_type = np.float32
@@ -104,6 +108,7 @@ def config(self):
 
 
 class TestNPUIndexSelectAPI(unittest.TestCase):
+
     def input_data(self):
         self.data_x = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
                                 [9.0, 10.0, 11.0, 12.0]]).astype('float32')
@@ -120,8 +125,10 @@ def test_index_select_api(self):
             index = paddle.static.data(name='index', shape=[3], dtype='int32')
             z = paddle.index_select(x, index, axis=1)
             exe = paddle.static.Executor(paddle.NPUPlace(0))
-            res, = exe.run(feed={'x': self.data_x,
-                                 'index': self.data_index},
+            res, = exe.run(feed={
+                'x': self.data_x,
+                'index': self.data_index
+            },
                            fetch_list=[z.name],
                            return_numpy=False)
         expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0],
@@ -134,8 +141,10 @@ def test_index_select_api(self):
             index = paddle.static.data(name='index', shape=[3], dtype='int32')
             z = paddle.index_select(x, index)
             exe = paddle.static.Executor(paddle.NPUPlace(0))
-            res, = exe.run(feed={'x': self.data_x,
-                                 'index': self.data_index},
+            res, = exe.run(feed={
+                'x': self.data_x,
+                'index': self.data_index
+            },
                            fetch_list=[z.name],
                            return_numpy=False)
         expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
diff --git a/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py
index 22042ce49200b..80ddda34df0ed 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py
@@ -18,6 +18,7 @@
 import numpy as np
 import numpy.random as random
 import sys
+
 sys.path.append("..")
 import math
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestNpuIouSimilarityOp(OpTest):
+
     def setUp(self):
         self.op_type = "iou_similarity"
         self.set_npu()
@@ -104,6 +106,7 @@ def _compute_iou(self, ):
 
 
 class TestNpuIouSimilarityOpWithLoD(TestNpuIouSimilarityOp):
+
     def set_init_config(self):
         super(TestNpuIouSimilarityOpWithLoD, self).set_init_config()
         self.box_normalized = True
@@ -111,6 +114,7 @@ def set_init_config(self):
 
 
 class TestNpuIouSimilarityOpWithBoxNormalized(TestNpuIouSimilarityOp):
+
     def set_init_config(self):
         super(TestNpuIouSimilarityOpWithBoxNormalized, self).set_init_config()
         self.box_normalized = True
@@ -118,6 +122,7 @@ def set_init_config(self):
 
 
 def TestNpuIouSimilarityOpFp16(TestNpuIouSimilarityOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_is_empty_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_is_empty_op_npu.py
index 09801b0f5ec3e..4f903e106307a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_is_empty_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_is_empty_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestEmpty(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.init_dtype()
@@ -51,6 +53,7 @@ def test_check_output(self):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestNotEmpty(TestEmpty):
+
     def set_data(self):
         self.inputs = {'X': np.array([])}
         self.outputs = {'Out': np.array([True])}
@@ -59,6 +62,7 @@ def set_data(self):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestIsEmptyOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
@@ -73,16 +77,18 @@ def test_Variable():
 
             def test_type():
                 # dtype must be float32, float16 in NPU
-                x3 = paddle.static.data(
-                    name="x3", shape=[4, 32, 32], dtype="bool")
+                x3 = paddle.static.data(name="x3",
+                                        shape=[4, 32, 32],
+                                        dtype="bool")
                 res = paddle.is_empty(x=x3)
 
             self.assertRaises(TypeError, test_type)
 
             def test_name_type():
                 # name type must be string.
-                x4 = paddle.static.data(
-                    name="x4", shape=[3, 2], dtype="float32")
+                x4 = paddle.static.data(name="x4",
+                                        shape=[3, 2],
+                                        dtype="float32")
                 res = paddle.is_empty(x=x4, name=1)
 
             self.assertRaises(TypeError, test_name_type)
@@ -91,6 +97,7 @@ def test_name_type():
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestIsEmptyOpDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static(paddle.NPUPlace(0))
         input = paddle.rand(shape=[4, 32, 32], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py
index 7ed1775fa5e6d..3d9ba6c440779 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestKLDivLossOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -55,11 +57,10 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Loss',
-            no_grad_set=set(["Target"]),
-            max_relative_error=0.15)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Loss',
+                                   no_grad_set=set(["Target"]),
+                                   max_relative_error=0.15)
 
     def initTestCase(self):
         self.x_shape = (4, 5, 5)
@@ -67,24 +68,28 @@ def initTestCase(self):
 
 
 class TestKLDivLossOp2(TestKLDivLossOp):
+
     def initTestCase(self):
         self.x_shape = (3, 2, 7, 7)
         self.reduction = 'none'
 
 
 class TestKLDivLossOp3(TestKLDivLossOp):
+
     def initTestCase(self):
         self.x_shape = (2, 3, 5, 7, 9)
         self.reduction = 'mean'
 
 
 class TestKLDivLossOp4(TestKLDivLossOp):
+
     def initTestCase(self):
         self.x_shape = (5, 20)
         self.reduction = 'sum'
 
 
 class TestKLDivLossOp_fp16(TestKLDivLossOp):
+
     def init_dtype(self):
         self.dtype = 'float16'
 
@@ -94,15 +99,15 @@ def test_check_output(self):
     def test_check_grad(self):
         input_grad = -self.inputs['Target'] * (
             self.inputs['Target'] > 0) / self.inputs['Target'].shape[0]
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Loss',
-            no_grad_set=set(["Target"]),
-            max_relative_error=0.2,
-            user_defined_grads=[input_grad])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Loss',
+                                   no_grad_set=set(["Target"]),
+                                   max_relative_error=0.2,
+                                   user_defined_grads=[input_grad])
 
 
 class TestKLDivLossDygraph(unittest.TestCase):
+
     def run_kl_loss(self, reduction, shape=(5, 20)):
         x = np.random.uniform(-10, 10, shape).astype('float32')
         target = np.random.uniform(-10, 10, shape).astype('float32')
@@ -110,8 +115,8 @@ def run_kl_loss(self, reduction, shape=(5, 20)):
 
         with paddle.fluid.dygraph.guard(paddle.NPUPlace(0)):
             kldiv_criterion = paddle.nn.KLDivLoss(reduction)
-            pred_loss = kldiv_criterion(
-                paddle.to_tensor(x), paddle.to_tensor(target))
+            pred_loss = kldiv_criterion(paddle.to_tensor(x),
+                                        paddle.to_tensor(target))
             self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss))
 
     def test_kl_loss_batchmean(self):
@@ -137,6 +142,7 @@ def test_kl_loss_static_api(self):
 
 
 class TestKLDivLossTypePromotion(unittest.TestCase):
+
     def test_kl_div_promotion(self):
         with paddle.fluid.dygraph.guard(paddle.NPUPlace(0)):
             x1 = paddle.rand([5, 20], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py
index d02ddae461ba5..f298c64d8d2ac 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestLabelSmoothOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "label_smooth"
@@ -78,13 +80,15 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'], 'Out', max_relative_error=0.5)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.5)
         else:
             self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
 class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
+
     def set_inputs(self):
         super(TestLabelSmoothOpWithPriorDist, self).set_inputs()
         label_dim = self.inputs['X'].shape[-1]
@@ -93,33 +97,39 @@ def set_inputs(self):
 
 
 class TestLabelSmoothOp3D(TestLabelSmoothOp):
+
     def set_inputs(self):
         super(TestLabelSmoothOp3D, self).set_inputs()
         self.inputs['X'].reshape([2, -1, self.inputs['X'].shape[-1]])
 
 
 class TestLabelSmoothOpWithPriorDist3D(TestLabelSmoothOpWithPriorDist):
+
     def set_inputs(self):
         super(TestLabelSmoothOpWithPriorDist3D, self).set_inputs()
         self.inputs['X'].reshape([2, -1, self.inputs['X'].shape[-1]])
 
 
 class TestLabelSmoothOpFP16(TestLabelSmoothOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestLabelSmoothOpWithPriorDistFP16(TestLabelSmoothOpWithPriorDist):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestLabelSmoothOp3DFP16(TestLabelSmoothOp3D):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestLabelSmoothOpWithPriorDist3DFP16(TestLabelSmoothOpWithPriorDist3D):
+
     def init_dtype(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py
index 0345ac1f2065b..5295ed50555be 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from functools import reduce
@@ -37,6 +38,7 @@
 
 
 class TestLayerNormOp(unittest.TestCase):
+
     def setUp(self):
         self.use_cudnn = True
         self.set_npu()
@@ -52,9 +54,9 @@ def init_dtype(self):
 
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(
-            np.allclose(
-                np.array(tensor).astype(np_array.dtype), np_array, atol=atol),
-            msg)
+            np.allclose(np.array(tensor).astype(np_array.dtype),
+                        np_array,
+                        atol=atol), msg)
 
     def check_forward_backward(self,
                                shape,
@@ -63,6 +65,7 @@ def check_forward_backward(self,
                                has_bias=True,
                                y_grad_scale=1.0,
                                use_mkldnn=False):
+
         def test_with_place(place,
                             shape,
                             begin_norm_axis,
@@ -79,8 +82,8 @@ def test_with_place(place,
                 np.float32) if has_scale else None
             bias = np.random.random_sample(scale_shape).astype(
                 np.float32) if has_bias else None
-            y_grad = (np.random.random_sample(x_shape) *
-                      y_grad_scale).astype(self.dtype)
+            y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype(
+                self.dtype)
 
             # reference forward & backward
             y, mean, variance = _reference_layer_norm_naive(
@@ -101,10 +104,9 @@ def test_with_place(place,
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype=self.dtype,
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype=self.dtype,
+                                     shape=ground_truth[name].shape)
                 inputs = {"X": block.var('x')}
                 fetch_list = [
                     'y',
@@ -171,25 +173,23 @@ def test_with_place(place,
 
     def test_check_forward_backward_with_scale_and_bias(self):
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=True)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=True,
-            has_bias=False)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=False)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=False,
+                                    has_bias=True)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=True,
+                                    has_bias=False)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=False,
+                                    has_bias=False)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
 
 class TestLayerNormOpFP16(TestLayerNormOp):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.atol = 1e-2
diff --git a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py
index a0472f9611eb0..d285d82f9d99a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from test_activation_op import ref_leaky_relu
@@ -28,6 +29,7 @@
 
 
 class TestLeadyRelu(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "leaky_relu"
@@ -63,28 +65,33 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'], 'Out', max_relative_error=0.006)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.006)
         else:
             self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
 class TestLeadyReluFP16(TestLeadyRelu):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestLeadyRelu2(TestLeadyRelu):
+
     def set_attrs(self):
         self.attrs = {'alpha': 0.5}
 
 
 class TestLeadyRelu3(TestLeadyRelu):
+
     def set_attrs(self):
         self.attrs = {'alpha': -0.5}
 
 
 class TestLeakyReluNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -97,8 +104,9 @@ def _test(self, run_npu=True):
 
         with paddle.static.program_guard(main_prog, startup_prog):
             x = paddle.static.data(name="x", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             y = paddle.nn.functional.leaky_relu(x)
 
@@ -122,8 +130,10 @@ def _test(self, run_npu=True):
         for epoch in range(100):
 
             pred_res, loss_res = exe.run(main_prog,
-                                         feed={"x": x_np,
-                                               "label": label_np},
+                                         feed={
+                                             "x": x_np,
+                                             "label": label_np
+                                         },
                                          fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py
index ff1b0e53dfeb1..2f93b1c223e99 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_log_loss_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -32,6 +33,7 @@ def sigmoid_array(x):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestLogLossOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = 'log_loss'
@@ -78,6 +80,7 @@ def test_check_grad(self):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestLogLossOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program()):
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
index 5da3cb0ce5650..e6724a28354ca 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestLog(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "log"
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestLogFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "log"
@@ -81,6 +84,7 @@ def test_check_output(self):
 
 
 class TestLogNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -95,8 +99,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
             d = paddle.log(c)
@@ -120,12 +125,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py
index 10ec8621ffa58..8971f888b6574 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,6 +16,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestLogSoftmaxNPUOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -63,17 +65,18 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'], ['Out'],
-                user_defined_grads=[self.x_grad],
-                max_relative_error=0.02)
+            self.check_grad_with_place(self.place, ['X'], ['Out'],
+                                       user_defined_grads=[self.x_grad],
+                                       max_relative_error=0.02)
         else:
-            self.check_grad_with_place(
-                self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad])
+            self.check_grad_with_place(self.place, ['X'], ['Out'],
+                                       user_defined_grads=[self.x_grad])
 
 
 def test_class(op_type, typename):
+
     class TestLogSoftmaxShape(TestLogSoftmaxNPUOp):
+
         def set_attrs(self):
             self.shape = [12, 10]
 
@@ -86,7 +89,9 @@ def set_dtype(self):
 
 
 def test_class2(op_type, typename):
+
     class TestLogSoftmaxAxis(TestLogSoftmaxNPUOp):
+
         def set_attrs(self):
             self.axis = 0
 
@@ -105,6 +110,7 @@ def set_dtype(self):
 
 
 class TestNNLogSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32)
@@ -137,6 +143,7 @@ def test_check_api(self):
 
 
 class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_logical_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_logical_op_npu.py
index add7d72582164..bbf9bd2bf0c63 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_logical_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_logical_op_npu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import op_test
 import unittest
@@ -158,10 +159,10 @@ def test(unit_test, use_npu=False, test_error=False):
             META_DATA = dict(TEST_META_WRONG_SHAPE_DATA)
         for shape_data in META_DATA.values():
             for data_type in SUPPORTED_DTYPES:
-                meta_data['x_np'] = np_data_generator(
-                    shape_data['x_shape'], dtype=data_type)
-                meta_data['y_np'] = np_data_generator(
-                    shape_data['y_shape'], dtype=data_type)
+                meta_data['x_np'] = np_data_generator(shape_data['x_shape'],
+                                                      dtype=data_type)
+                meta_data['y_np'] = np_data_generator(shape_data['y_shape'],
+                                                      dtype=data_type)
                 if meta_data['binary_op'] and test_error:
                     # catch C++ Exception
                     unit_test.assertRaises(BaseException, run_static,
@@ -176,11 +177,12 @@ def test(unit_test, use_npu=False, test_error=False):
                 else:
                     np_result = np_op(meta_data['x_np'])
                 unit_test.assertTrue((static_result == np_result).all())
-                unit_test.assertTrue((dygraph_result.numpy() == np_result).all(
-                ))
+                unit_test.assertTrue(
+                    (dygraph_result.numpy() == np_result).all())
 
 
 def test_type_error(unit_test, use_npu, type_str_map):
+
     def check_type(op_str, x, y, binary_op):
         op = getattr(paddle, op_str)
         error_type = ValueError
@@ -215,10 +217,12 @@ def check_type(op_str, x, y, binary_op):
         startup_program = paddle.static.Program()
         main_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            x = paddle.static.data(
-                name='x', shape=[10], dtype=type_str_map['x'])
-            y = paddle.static.data(
-                name='y', shape=[10], dtype=type_str_map['y'])
+            x = paddle.static.data(name='x',
+                                   shape=[10],
+                                   dtype=type_str_map['x'])
+            y = paddle.static.data(name='y',
+                                   shape=[10],
+                                   dtype=type_str_map['y'])
             check_type(meta_data['op_str'], x, y, binary_op)
 
 
@@ -230,6 +234,7 @@ def type_map_factory():
 
 
 class TestCPU(unittest.TestCase):
+
     def test(self):
         test(self)
 
@@ -243,6 +248,7 @@ def test_type_error(self):
 
 
 class TestNPU(unittest.TestCase):
+
     def test(self):
         test(self, True)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
index 8ec9eb1cf3572..100cad468e30e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestLookupTableV2(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "lookup_table_v2"
@@ -37,8 +39,9 @@ def setUp(self):
         self.init_padding_idx()
         np.random.seed(SEED)
         w = np.random.random([self.vocab, self.dim]).astype(self.dtype)
-        x = np.random.randint(
-            0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype)
+        x = np.random.randint(0, self.vocab,
+                              size=(self.bsz,
+                                    self.seqlen)).astype(self.ids_dtype)
         out = w[x]
         if self.padding_idx != -1:
             out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim)
@@ -77,8 +80,9 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['W'], 'Out', max_relative_error=0.01)
+            self.check_grad_with_place(self.place, ['W'],
+                                       'Out',
+                                       max_relative_error=0.01)
         else:
             self.check_grad_with_place(self.place, ['W'], 'Out')
 
@@ -96,6 +100,7 @@ def set_npu(self):
 
 
 class TestLookupTableV2Dim32(TestLookupTableV2):
+
     def init_dims(self):
         self.bsz = 6
         self.seqlen = 8
@@ -123,11 +128,13 @@ def set_npu(self):
 
 
 class TestLookupTableV2WithPadding(TestLookupTableV2):
+
     def init_padding_idx(self):
         self.padding_idx = np.random.randint(0, self.vocab)
 
 
 class TestLookupTableV2WithPadding1(TestLookupTableV2):
+
     def init_padding_idx(self):
         self.padding_idx = np.random.randint(0, self.vocab)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_masked_select_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_masked_select_op_npu.py
index 13078aea6903a..7cd9df1f2ebb4 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_masked_select_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_masked_select_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -34,6 +35,7 @@ def np_masked_select(x, mask):
 
 
 class TestMaskedSelectOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -63,28 +65,33 @@ def init_dtype(self):
 
 
 class TestMaskedSelectOp1(TestMaskedSelectOp):
+
     def init(self):
         self.shape = (6, 8, 9, 18)
 
 
 class TestMaskedSelectOp2(TestMaskedSelectOp):
+
     def init(self):
         self.shape = (168, )
 
 
 class TestMaskedSelectOpFp16(TestMaskedSelectOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
     def test_check_grad(self):
         x_grad = self.inputs['Mask'].astype(self.dtype)
         x_grad = x_grad * (1 / x_grad.sum())
-        self.check_grad_with_place(
-            self.place, ['X'], 'Y', user_defined_grads=[x_grad])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Y',
+                                   user_defined_grads=[x_grad])
 
 
 @skip_check_grad_ci(reason="get_numeric_gradient not support int32")
 class TestMaskedSelectOpInt32(TestMaskedSelectOp):
+
     def init_dtype(self):
         self.dtype = np.int32
 
@@ -94,6 +101,7 @@ def test_check_grad(self):
 
 @skip_check_grad_ci(reason="get_numeric_gradient not support int64")
 class TestMaskedSelectOpInt64(TestMaskedSelectOp):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -102,6 +110,7 @@ def test_check_grad(self):
 
 
 class TestMaskedSelectAPI(unittest.TestCase):
+
     def test_imperative_mode(self):
         paddle.disable_static(paddle.NPUPlace(0))
         shape = (88, 6, 8)
@@ -127,13 +136,16 @@ def test_static_mode(self):
         exe = paddle.static.Executor(place=paddle.NPUPlace(0))
 
         res = exe.run(paddle.static.default_main_program(),
-                      feed={"x": np_x,
-                            "mask": np_mask},
+                      feed={
+                          "x": np_x,
+                          "mask": np_mask
+                      },
                       fetch_list=[out])
         self.assertEqual(np.allclose(res, np_out), True)
 
 
 class TestMaskedSelectError(unittest.TestCase):
+
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -141,8 +153,9 @@ def test_error(self):
             shape = [8, 9, 6]
             x = paddle.fluid.data(shape=shape, dtype='float32', name='x')
             mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask')
-            mask_float = paddle.fluid.data(
-                shape=shape, dtype='float32', name='mask_float')
+            mask_float = paddle.fluid.data(shape=shape,
+                                           dtype='float32',
+                                           name='mask_float')
             np_x = np.random.random(shape).astype('float32')
             np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py
index a8dc0c137c353..841521fecdd2b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -269,7 +270,9 @@ def config(self):
 
 #--------------------test matmul alpha--------------------
 def create_test_alpha_class(parent):
+
     class TestMatMulOpAlphaCase(parent):
+
         def init_alpha(self):
             self.alpha = 0.125
 
@@ -294,7 +297,9 @@ def init_alpha(self):
 
 #--------------------test matmul fp16--------------------
 def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5):
+
     class TestMatMulOpFp16Case(parent):
+
         def init_kernel_type(self):
             self.dtype = np.float16
 
@@ -302,10 +307,9 @@ def test_check_output(self):
             self.check_output_with_place(self.place, atol=atol)
 
         def test_check_grad(self):
-            self.check_grad_with_place(
-                self.place, ['X', 'Y'],
-                'Out',
-                max_relative_error=max_relative_error)
+            self.check_grad_with_place(self.place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=max_relative_error)
 
     cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
     TestMatMulOpFp16Case.__name__ = cls_name
diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py
index 23ca0cf1f492f..a607c3035a918 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -291,7 +292,9 @@ def config(self):
 
 
 def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5):
+
     class TestMatMulOpFp16Case(parent):
+
         def init_kernel_type(self):
             self.dtype = np.float16
 
@@ -299,10 +302,9 @@ def test_check_output(self):
             self.check_output_with_place(self.place, atol=atol)
 
         def test_check_grad(self):
-            self.check_grad_with_place(
-                self.place, ['X', 'Y'],
-                'Out',
-                max_relative_error=max_relative_error)
+            self.check_grad_with_place(self.place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=max_relative_error)
 
     cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
     TestMatMulOpFp16Case.__name__ = cls_name
@@ -329,6 +331,7 @@ def test_check_grad(self):
 
 
 class TestMatMulV2API(unittest.TestCase):
+
     def setUp(self):
         self.places = [paddle.CPUPlace()]
         if paddle.is_compiled_with_npu():
@@ -346,8 +349,10 @@ def check_static_result(self, place):
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"input_x": x_np,
-                                    "input_y": y_np},
+                              feed={
+                                  "input_x": x_np,
+                                  "input_y": y_np
+                              },
                               fetch_list=[result])
 
     def test_static(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
index e69c2fd84dd9d..a6936541f5b09 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestMean(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestMeanFP16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py
index 6500a8c8cd821..d11d83f47cce2 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,51 +30,47 @@
 
 
 class TestMemcpy_FillConstant(unittest.TestCase):
+
     def get_prog(self):
         paddle.enable_static()
         main_program = Program()
         with program_guard(main_program):
             cpu_var_name = "tensor@Cpu"
             npu_var_name = "tensor@Npu"
-            cpu_var = main_program.global_block().create_var(
-                name=cpu_var_name,
-                shape=[10, 10],
-                dtype='float32',
-                persistable=False,
-                stop_gradient=True)
-            npu_var = main_program.global_block().create_var(
-                name=npu_var_name,
-                shape=[10, 10],
-                dtype='float32',
-                persistable=False,
-                stop_gradient=True)
-            main_program.global_block().append_op(
-                type="fill_constant",
-                outputs={"Out": npu_var_name},
-                attrs={
-                    "shape": [10, 10],
-                    "dtype": npu_var.dtype,
-                    "value": 1.0,
-                    "place_type": 4
-                })
-            main_program.global_block().append_op(
-                type="fill_constant",
-                outputs={"Out": cpu_var_name},
-                attrs={
-                    "shape": [10, 10],
-                    "dtype": cpu_var.dtype,
-                    "value": 0.0,
-                    "place_type": 0
-                })
+            cpu_var = main_program.global_block().create_var(name=cpu_var_name,
+                                                             shape=[10, 10],
+                                                             dtype='float32',
+                                                             persistable=False,
+                                                             stop_gradient=True)
+            npu_var = main_program.global_block().create_var(name=npu_var_name,
+                                                             shape=[10, 10],
+                                                             dtype='float32',
+                                                             persistable=False,
+                                                             stop_gradient=True)
+            main_program.global_block().append_op(type="fill_constant",
+                                                  outputs={"Out": npu_var_name},
+                                                  attrs={
+                                                      "shape": [10, 10],
+                                                      "dtype": npu_var.dtype,
+                                                      "value": 1.0,
+                                                      "place_type": 4
+                                                  })
+            main_program.global_block().append_op(type="fill_constant",
+                                                  outputs={"Out": cpu_var_name},
+                                                  attrs={
+                                                      "shape": [10, 10],
+                                                      "dtype": cpu_var.dtype,
+                                                      "value": 0.0,
+                                                      "place_type": 0
+                                                  })
         return main_program, npu_var, cpu_var
 
     def test_npu_cpoy_to_cpu(self):
         main_program, npu_var, cpu_var = self.get_prog()
-        main_program.global_block().append_op(
-            type='memcpy',
-            inputs={'X': npu_var},
-            outputs={'Out': cpu_var},
-            attrs={'dst_place_type': 0})
+        main_program.global_block().append_op(type='memcpy',
+                                              inputs={'X': npu_var},
+                                              outputs={'Out': cpu_var},
+                                              attrs={'dst_place_type': 0})
         place = fluid.NPUPlace(0)
         exe = fluid.Executor(place)
         npu_, cpu_ = exe.run(main_program,
@@ -84,11 +81,10 @@ def test_npu_cpoy_to_cpu(self):
 
     def test_cpu_cpoy_npu(self):
         main_program, npu_var, cpu_var = self.get_prog()
-        main_program.global_block().append_op(
-            type='memcpy',
-            inputs={'X': cpu_var},
-            outputs={'Out': npu_var},
-            attrs={'dst_place_type': 4})
+        main_program.global_block().append_op(type='memcpy',
+                                              inputs={'X': cpu_var},
+                                              outputs={'Out': npu_var},
+                                              attrs={'dst_place_type': 4})
         place = fluid.NPUPlace(0)
         exe = fluid.Executor(place)
         npu_, cpu_ = exe.run(main_program,
diff --git a/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py
index 96a15fc1caac3..dce642cc0634e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import sys
+
 sys.path.append('..')
 import unittest
 import paddle
@@ -47,22 +48,21 @@ def run_momentum_op(params,
         }
 
         param_vars = [
-            helper.create_variable(
-                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+            helper.create_variable(persistable=True,
+                                   shape=p.shape,
+                                   dtype=p.dtype) for p in params
         ]
         grad_vars = [
-            helper.create_variable(
-                shape=g.shape, dtype=g.dtype) for g in grads
+            helper.create_variable(shape=g.shape, dtype=g.dtype) for g in grads
         ]
         velocity_vars = [
-            helper.create_variable(
-                persistable=True, shape=v.shape, dtype=v.dtype)
-            for v in velocitys
+            helper.create_variable(persistable=True,
+                                   shape=v.shape,
+                                   dtype=v.dtype) for v in velocitys
         ]
-        lr_var = helper.create_variable(
-            persistable=True,
-            shape=learning_rate.shape,
-            dtype=learning_rate.dtype)
+        lr_var = helper.create_variable(persistable=True,
+                                        shape=learning_rate.shape,
+                                        dtype=learning_rate.dtype)
 
         feed_dict = OrderedDict()
 
@@ -81,14 +81,15 @@ def run_momentum_op(params,
 
         if multi_precision:
             master_param_vars = [
-                helper.create_variable(
-                    persistable=True, shape=p.shape, dtype=p.dtype)
-                for p in master_params
+                helper.create_variable(persistable=True,
+                                       shape=p.shape,
+                                       dtype=p.dtype) for p in master_params
             ]
             feed_dict.update(
-                OrderedDict([(mp_var.name, mp_val)
-                             for mp_var, mp_val in zip(master_param_vars,
-                                                       master_params)]))
+                OrderedDict([
+                    (mp_var.name, mp_val)
+                    for mp_var, mp_val in zip(master_param_vars, master_params)
+                ]))
             # CPUPlace does not use MasterParam
             if isinstance(place, paddle.CUDAPlace):
                 fetch_list = fetch_list + [
@@ -110,8 +111,10 @@ def run_momentum_op(params,
                 if multi_precision:
                     inputs['MasterParam'] = master_param_vars[i]
                     outputs['MasterParamOut'] = master_param_vars[i]
-                helper.append_op(
-                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+                helper.append_op(type=op_type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs)
         else:
             inputs = {
                 'Param': param_vars,
@@ -123,8 +126,10 @@ def run_momentum_op(params,
             if multi_precision:
                 inputs['MasterParam'] = master_param_vars
                 outputs['MasterParamOut'] = master_param_vars
-            helper.append_op(
-                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+            helper.append_op(type=op_type,
+                             inputs=inputs,
+                             outputs=outputs,
+                             attrs=attrs)
 
     exe = paddle.static.Executor(place)
     with paddle.static.scope_guard(paddle.static.Scope()):
@@ -154,22 +159,21 @@ def run_momentum_op2(params,
         helper = LayerHelper(op_type, **locals())
 
         param_vars = [
-            helper.create_variable(
-                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+            helper.create_variable(persistable=True,
+                                   shape=p.shape,
+                                   dtype=p.dtype) for p in params
         ]
         grad_vars = [
-            helper.create_variable(
-                shape=g.shape, dtype=g.dtype) for g in grads
+            helper.create_variable(shape=g.shape, dtype=g.dtype) for g in grads
         ]
         velocity_vars = [
-            helper.create_variable(
-                persistable=True, shape=v.shape, dtype=v.dtype)
-            for v in velocitys
+            helper.create_variable(persistable=True,
+                                   shape=v.shape,
+                                   dtype=v.dtype) for v in velocitys
         ]
-        lr_var = helper.create_variable(
-            persistable=True,
-            shape=learning_rate.shape,
-            dtype=learning_rate.dtype)
+        lr_var = helper.create_variable(persistable=True,
+                                        shape=learning_rate.shape,
+                                        dtype=learning_rate.dtype)
 
         feed_dict = OrderedDict()
 
@@ -188,14 +192,15 @@ def run_momentum_op2(params,
 
         if multi_precision:
             master_param_vars = [
-                helper.create_variable(
-                    persistable=True, shape=p.shape, dtype=p.dtype)
-                for p in master_params
+                helper.create_variable(persistable=True,
+                                       shape=p.shape,
+                                       dtype=p.dtype) for p in master_params
             ]
             feed_dict.update(
-                OrderedDict([(mp_var.name, mp_val)
-                             for mp_var, mp_val in zip(master_param_vars,
-                                                       master_params)]))
+                OrderedDict([
+                    (mp_var.name, mp_val)
+                    for mp_var, mp_val in zip(master_param_vars, master_params)
+                ]))
             # CPUPlace does not use MasterParam
             if isinstance(place, paddle.CUDAPlace):
                 fetch_list = fetch_list + [
@@ -225,8 +230,10 @@ def run_momentum_op2(params,
                     'regularization_method': 'l2_decay',
                     'regularization_coeff': 2.0,
                 }
-                helper.append_op(
-                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+                helper.append_op(type=op_type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs)
         else:
             inputs = {
                 'Param': param_vars,
@@ -239,16 +246,22 @@ def run_momentum_op2(params,
                 inputs['MasterParam'] = master_param_vars
                 outputs['MasterParamOut'] = master_param_vars
             attrs = {
-                'mu': mu,
-                'multi_precision': multi_precision,
-                'rescale_grad': rescale_grad,
-                'use_nesterov': use_nesterov,
+                'mu':
+                mu,
+                'multi_precision':
+                multi_precision,
+                'rescale_grad':
+                rescale_grad,
+                'use_nesterov':
+                use_nesterov,
                 'regularization_method':
                 ['l2_decay' for i in range(len(param_vars))],
                 'regularization_coeff': [2.0 for i in range(len(param_vars))],
             }
-            helper.append_op(
-                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+            helper.append_op(type=op_type,
+                             inputs=inputs,
+                             outputs=outputs,
+                             attrs=attrs)
 
     exe = paddle.static.Executor(place)
     with paddle.static.scope_guard(paddle.static.Scope()):
@@ -257,6 +270,7 @@ def run_momentum_op2(params,
 
 
 class TestMergedMomentum(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
@@ -286,18 +300,17 @@ def check_with_place(self, place, multi_precision):
             self.shapes, multi_precision, self.seed, place)
 
         def run_op(use_merged):
-            # NPU Momentum Op does not support rescale_grad 
+            # NPU Momentum Op does not support rescale_grad
             rescale_grad = 1.0
-            return run_momentum_op(
-                params,
-                grads,
-                velocitys,
-                master_params,
-                learning_rate,
-                place,
-                multi_precision,
-                rescale_grad=rescale_grad,
-                use_merged=use_merged)
+            return run_momentum_op(params,
+                                   grads,
+                                   velocitys,
+                                   master_params,
+                                   learning_rate,
+                                   place,
+                                   multi_precision,
+                                   rescale_grad=rescale_grad,
+                                   use_merged=use_merged)
 
         outs1 = run_op(True)
         outs2 = run_op(False)
@@ -310,6 +323,7 @@ def test_main(self):
 
 
 class TestMergedMomentum2(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
@@ -339,19 +353,18 @@ def check_with_place(self, place, multi_precision):
             self.shapes, multi_precision, self.seed, place)
 
         def run_op(use_nesterov, use_merged):
-            # NPU Momentum Op does not support rescale_grad 
+            # NPU Momentum Op does not support rescale_grad
             rescale_grad = 1.0
-            return run_momentum_op2(
-                params,
-                grads,
-                velocitys,
-                master_params,
-                learning_rate,
-                place,
-                multi_precision,
-                rescale_grad=rescale_grad,
-                use_merged=use_merged,
-                use_nesterov=use_nesterov)
+            return run_momentum_op2(params,
+                                    grads,
+                                    velocitys,
+                                    master_params,
+                                    learning_rate,
+                                    place,
+                                    multi_precision,
+                                    rescale_grad=rescale_grad,
+                                    use_merged=use_merged,
+                                    use_nesterov=use_nesterov)
 
         outs1 = run_op(use_nesterov=True, use_merged=True)
         outs2 = run_op(use_nesterov=True, use_merged=False)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py
index 39802602bf5e0..a4d388d2ed4f4 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle.fluid as fluid
@@ -27,6 +28,7 @@
 
 
 class TestMeshgridOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "meshgrid"
@@ -71,32 +73,41 @@ def get_x_shape(self):
 @skip_check_grad_ci(
     reason="The backward test is not supported for float16 type on NPU.")
 class TestMeshgridOpFP16(TestMeshgridOp):
+
     def get_dtype(self):
         return "float16"
 
 
 class TestMeshgridOpINT32(TestMeshgridOp):
+
     def get_dtype(self):
         return "int32"
 
 
 class TestMeshgridOpINT64(TestMeshgridOp):
+
     def get_dtype(self):
         return "int64"
 
 
 class TestMeshgridOp2(TestMeshgridOp):
+
     def get_x_shape(self):
         return [100, 300]
 
 
 class TestMeshgridOp3(unittest.TestCase):
+
     def test_api(self):
         x = fluid.data(shape=[100], dtype='int32', name='x')
         y = fluid.data(shape=[200], dtype='int32', name='y')
 
-        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_1 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_1 = np.reshape(input_1, [100, 1])
         out_1 = np.broadcast_to(out_1, [100, 200])
@@ -106,8 +117,10 @@ def test_api(self):
         exe = fluid.Executor(place=fluid.NPUPlace(0))
         grid_x, grid_y = paddle.tensor.meshgrid(x, y)
         res_1, res_2 = exe.run(fluid.default_main_program(),
-                               feed={'x': input_1,
-                                     'y': input_2},
+                               feed={
+                                   'x': input_1,
+                                   'y': input_2
+                               },
                                fetch_list=[grid_x, grid_y])
 
         self.assertTrue(np.allclose(res_1, out_1))
@@ -115,12 +128,17 @@ def test_api(self):
 
 
 class TestMeshgridOp4(unittest.TestCase):
+
     def test_list_input(self):
         x = fluid.data(shape=[100], dtype='int32', name='x')
         y = fluid.data(shape=[200], dtype='int32', name='y')
 
-        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_1 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_1 = np.reshape(input_1, [100, 1])
         out_1 = np.broadcast_to(out_1, [100, 200])
@@ -130,8 +148,10 @@ def test_list_input(self):
         exe = fluid.Executor(place=fluid.NPUPlace(0))
         grid_x, grid_y = paddle.tensor.meshgrid([x, y])
         res_1, res_2 = exe.run(fluid.default_main_program(),
-                               feed={'x': input_1,
-                                     'y': input_2},
+                               feed={
+                                   'x': input_1,
+                                   'y': input_2
+                               },
                                fetch_list=[grid_x, grid_y])
 
         self.assertTrue(np.allclose(res_1, out_1))
@@ -139,12 +159,17 @@ def test_list_input(self):
 
 
 class TestMeshgridOp5(unittest.TestCase):
+
     def test_tuple_input(self):
         x = fluid.data(shape=[100], dtype='int32', name='x')
         y = fluid.data(shape=[200], dtype='int32', name='y')
 
-        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_1 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_1 = np.reshape(input_1, [100, 1])
         out_1 = np.broadcast_to(out_1, [100, 200])
@@ -154,8 +179,10 @@ def test_tuple_input(self):
         exe = fluid.Executor(place=fluid.NPUPlace(0))
         grid_x, grid_y = paddle.tensor.meshgrid((x, y))
         res_1, res_2 = exe.run(fluid.default_main_program(),
-                               feed={'x': input_1,
-                                     'y': input_2},
+                               feed={
+                                   'x': input_1,
+                                   'y': input_2
+                               },
                                fetch_list=[grid_x, grid_y])
 
         self.assertTrue(np.allclose(res_1, out_1))
@@ -163,10 +190,15 @@ def test_tuple_input(self):
 
 
 class TestMeshgridOp6(unittest.TestCase):
+
     def test_api_with_dygraph(self):
         paddle.disable_static(paddle.NPUPlace(0))
-        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_3 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_3 = np.reshape(input_3, [100, 1])
         out_3 = np.broadcast_to(out_3, [100, 200])
@@ -183,10 +215,15 @@ def test_api_with_dygraph(self):
 
 
 class TestMeshgridOp7(unittest.TestCase):
+
     def test_api_with_dygraph_list_input(self):
         paddle.disable_static(paddle.NPUPlace(0))
-        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_3 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_3 = np.reshape(input_3, [100, 1])
         out_3 = np.broadcast_to(out_3, [100, 200])
@@ -203,10 +240,15 @@ def test_api_with_dygraph_list_input(self):
 
 
 class TestMeshgridOp8(unittest.TestCase):
+
     def test_api_with_dygraph_tuple_input(self):
         paddle.disable_static(paddle.NPUPlace(0))
-        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_3 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_3 = np.reshape(input_3, [100, 1])
         out_3 = np.broadcast_to(out_3, [100, 200])
diff --git a/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py
index 193b9eb4e0aca..26a74b7b736b8 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_mixed_precision_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import sys
 import paddle
+
 sys.path.append("..")
 import test_mixed_precision
 
@@ -22,6 +23,7 @@
 
 
 class AMPTestNpu(test_mixed_precision.AMPTest):
+
     def setUp(self):
         self.place = paddle.NPUPlace(0)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py
index b8c261c2555c4..6c2e24bb16382 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestMomentumOp1(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -75,6 +77,7 @@ def test_check_output(self):
 
 
 class TestMomentumOpFp16(TestMomentumOp1):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -83,20 +86,23 @@ def test_check_output(self):
 
 
 class TestMomentumOp2(TestMomentumOp1):
+
     def init_case(self):
         self.shape = (123, 321)
         self.use_nesterov = True
 
 
 class TestMomentumV2(unittest.TestCase):
+
     def test_momentum_dygraph(self):
         paddle.disable_static(place=fluid.NPUPlace(0))
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9, parameters=linear.parameters())
+        adam = paddle.optimizer.Momentum(learning_rate=0.01,
+                                         momentum=0.9,
+                                         parameters=linear.parameters())
         out = linear(a)
         out.backward()
         adam.step()
@@ -113,13 +119,13 @@ def test_momentum(self):
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
             avg_cost = fluid.layers.mean(cost)
 
-            rms_optimizer = paddle.optimizer.Momentum(
-                learning_rate=0.1, momentum=0.9)
+            rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1,
+                                                      momentum=0.9)
             rms_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -127,12 +133,14 @@ def test_momentum(self):
                 exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
 
     def test_raise_error(self):
-        self.assertRaises(
-            ValueError, paddle.optimizer.Momentum, learning_rate=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.Momentum,
+                          learning_rate=None)
         self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
 
 
 class TestMomentumOpWithDecay(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -189,6 +197,7 @@ def test_check_output(self):
 
 
 class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
+
     def init_config(self):
         self.dtype = np.float16
 
@@ -198,11 +207,13 @@ def test_check_output(self):
 
 
 class TestMomentumOpWithDecay2(TestMomentumOpWithDecay):
+
     def init_config(self):
         self.use_nesterov = False
 
 
 class TestMomentumOpWithDecayAPI(unittest.TestCase):
+
     def _test_momentum_dygraph_common(self, regularization):
         paddle.disable_static(fluid.NPUPlace(0))
         inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
@@ -239,8 +250,8 @@ def test_momentum_static(self):
             momentum_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -249,10 +260,11 @@ def test_momentum_static(self):
 
 
 class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
+
     def __update_params(self, momentum, linear):
         for i in range(10):
-            inp = paddle.full(
-                shape=[2, 2], fill_value=i, dtype='float32').astype("float32")
+            inp = paddle.full(shape=[2, 2], fill_value=i,
+                              dtype='float32').astype("float32")
             inp = paddle.to_tensor(inp)
             out = linear(inp)
             loss = paddle.mean(out)
@@ -298,6 +310,7 @@ def test_vs(self, place=fluid.NPUPlace(0)):
 
 
 class TestMomentumV2Group(TestMomentumV2):
+
     def test_momentum_dygraph(self):
         paddle.disable_static(place=fluid.NPUPlace(0))
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -305,18 +318,22 @@ def test_momentum_dygraph(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Momentum(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001,
-                'learning_rate': 0.1,
-                'momentum': 0.99
-            }],
-            weight_decay=0.1,
-            momentum=0.9)
+        adam = paddle.optimizer.Momentum(learning_rate=0.01,
+                                         parameters=[{
+                                             'params':
+                                             linear_1.parameters()
+                                         }, {
+                                             'params':
+                                             linear_2.parameters(),
+                                             'weight_decay':
+                                             0.001,
+                                             'learning_rate':
+                                             0.1,
+                                             'momentum':
+                                             0.99
+                                         }],
+                                         weight_decay=0.1,
+                                         momentum=0.9)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py
index b6e3134439d03..c4adebcda6ff6 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -59,7 +60,8 @@ def test_check_grad_normal(self):
             self.place,
             ['X', 'Y'],
             'Out',
-            max_relative_error=0.0065, )
+            max_relative_error=0.0065,
+        )
 
     def test_check_grad_ingore_x(self):
         self.check_grad_with_place(
@@ -67,7 +69,8 @@ def test_check_grad_ingore_x(self):
             ['Y'],
             'Out',
             no_grad_set=set("X"),
-            max_relative_error=0.0065, )
+            max_relative_error=0.0065,
+        )
 
     def test_check_grad_ingore_y(self):
         self.check_grad_with_place(
@@ -75,12 +78,14 @@ def test_check_grad_ingore_y(self):
             ['X'],
             'Out',
             no_grad_set=set("Y"),
-            max_relative_error=0.0065, )
+            max_relative_error=0.0065,
+        )
 
 
 @skip_check_grad_ci(
     reason="Don't support grad checking for NPU OP with FP16 data type.")
 class TestMulFP16(TestMul):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -119,6 +124,7 @@ def setUp(self):
 @skip_check_grad_ci(
     reason="Don't support grad checking for NPU OP with FP16 data type.")
 class TestMul2FP16(TestMul2):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -157,6 +163,7 @@ def setUp(self):
 @skip_check_grad_ci(
     reason="Don't support grad checking for NPU OP with FP16 data type.")
 class TestMul3FP16(TestMul3):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -195,6 +202,7 @@ def setUp(self):
 @skip_check_grad_ci(
     reason="Don't support grad checking for NPU OP with FP16 data type.")
 class TestMul4FP16(TestMul4):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -209,6 +217,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestMulNet(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = np.float32
 
@@ -230,8 +239,9 @@ def _test(self, run_npu=True):
             b = paddle.static.data(name="b", shape=[2, 3], dtype=self.dtype)
             c = paddle.static.data(name="c", shape=[3, 2], dtype=self.dtype)
             d = paddle.static.data(name="d", shape=[3, 2], dtype=self.dtype)
-            label = paddle.static.data(
-                name="label", shape=[2, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[2, 1],
+                                       dtype='int64')
 
             sum_1 = paddle.add(a, b)
             sum_2 = paddle.add(c, d)
@@ -280,6 +290,7 @@ def test_npu(self):
 
 
 class TestMulNet3_2(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = np.float32
 
@@ -301,8 +312,9 @@ def _test(self, run_npu=True):
             b = paddle.static.data(name="b", shape=[2, 3, 4], dtype=self.dtype)
             c = paddle.static.data(name="c", shape=[12, 5], dtype=self.dtype)
             d = paddle.static.data(name="d", shape=[12, 5], dtype=self.dtype)
-            label = paddle.static.data(
-                name="label", shape=[2, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[2, 1],
+                                       dtype='int64')
 
             sum_1 = paddle.add(a, b)
             sum_2 = paddle.add(c, d)
@@ -346,12 +358,13 @@ def test_npu(self):
         cpu_pred, cpu_loss = self._test(False)
         npu_pred, npu_loss = self._test(True)
 
-        self.assertTrue(np.allclose(
-            npu_pred, cpu_pred, atol=1e-5))  # atol needed on cann 20.3
+        self.assertTrue(np.allclose(npu_pred, cpu_pred,
+                                    atol=1e-5))  # atol needed on cann 20.3
         self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5))
 
 
 class TestMulNet3_2_xc2(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = np.float32
 
@@ -373,8 +386,9 @@ def _test(self, run_npu=True):
             b = paddle.static.data(name="b", shape=[2, 3, 4], dtype=self.dtype)
             c = paddle.static.data(name="c", shape=[4, 5], dtype=self.dtype)
             d = paddle.static.data(name="d", shape=[4, 5], dtype=self.dtype)
-            label = paddle.static.data(
-                name="label", shape=[2, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[2, 1],
+                                       dtype='int64')
 
             sum_1 = paddle.add(a, b)
             sum_2 = paddle.add(c, d)
@@ -424,6 +438,7 @@ def test_npu(self):
 
 
 class TestMulNet4_2(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = np.float32
 
@@ -445,8 +460,9 @@ def _test(self, run_npu=True):
             b = paddle.static.data(name="b", shape=[12, 5], dtype=self.dtype)
             c = paddle.static.data(name="c", shape=[12, 5], dtype=self.dtype)
             d = paddle.static.data(name="d", shape=[12, 5], dtype=self.dtype)
-            label = paddle.static.data(
-                name="label", shape=[2, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[2, 1],
+                                       dtype='int64')
 
             sum_1 = paddle.add(a, b)  # [12, 5]
             sum_2 = paddle.add(c, d)  # [12, 5]
@@ -493,8 +509,8 @@ def test_npu(self):
         cpu_pred, cpu_loss = self._test(False)
         npu_pred, npu_loss = self._test(True)
 
-        self.assertTrue(np.allclose(
-            npu_pred, cpu_pred, atol=1e-5))  # atol needed on cann 20.3
+        self.assertTrue(np.allclose(npu_pred, cpu_pred,
+                                    atol=1e-5))  # atol needed on cann 20.3
         self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5))
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py
index 28833a7dc1dcc..036e6a0a7f957 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py
@@ -19,6 +19,7 @@
 import paddle.fluid as fluid
 from paddle.fluid import core
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import numpy as np
@@ -48,6 +49,7 @@ def sample_output_two_dimension(out, shape):
 
 
 class TestMultinomialOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "multinomial"
@@ -65,8 +67,8 @@ def init_data(self):
         self.attrs = {"num_samples": 100000, "replacement": True}
 
     def test_check_output(self):
-        self.check_output_customized(
-            self.verify_output, custom_place=self.place)
+        self.check_output_customized(self.verify_output,
+                                     custom_place=self.place)
 
     def sample_output(self, out):
         return sample_output_one_dimension(out, 4)
@@ -76,12 +78,12 @@ def verify_output(self, outs):
         prob = self.input_np / self.input_np.sum(axis=-1, keepdims=True)
         sample_prob = self.sample_output(np.array(outs[0]))
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
 
 
 class TestMultinomialOp2(TestMultinomialOp):
+
     def init_data(self):
         # input probability is a matrix
         self.input_np = np.random.rand(3, 4)
@@ -93,6 +95,7 @@ def sample_output(self, out):
 
 
 class TestMultinomialOp3(TestMultinomialOp):
+
     def init_data(self):
         # replacement is False. number of samples must be less than number of categories.
         self.input_np = np.random.rand(1000)
@@ -108,6 +111,7 @@ def verify_output(self, outs):
 
 
 class TestMultinomialApi(unittest.TestCase):
+
     def test_dygraph(self):
         # input probability is a vector, and replacement is True
         paddle.set_device('npu:0')
@@ -119,8 +123,7 @@ def test_dygraph(self):
         sample_prob = sample_output_one_dimension(out.numpy(), 4)
         prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True)
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
         paddle.enable_static()
 
@@ -135,8 +138,7 @@ def test_dygraph2(self):
         sample_prob = sample_output_two_dimension(out.numpy(), [3, 4])
         prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True)
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
         paddle.enable_static()
 
@@ -181,22 +183,24 @@ def test_static(self):
         sample_prob = sample_output_one_dimension(out, 4)
         prob = x_np / x_np.sum(axis=-1, keepdims=True)
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
 
 
 class TestMultinomialAlias(unittest.TestCase):
+
     def test_alias(self):
         paddle.set_device('npu:0')
         x = paddle.rand([4])
         out1 = paddle.multinomial(x, num_samples=10, replacement=True)
         out2 = paddle.tensor.multinomial(x, num_samples=10, replacement=True)
-        out3 = paddle.tensor.random.multinomial(
-            x, num_samples=10, replacement=True)
+        out3 = paddle.tensor.random.multinomial(x,
+                                                num_samples=10,
+                                                replacement=True)
 
 
 class TestMultinomialError(unittest.TestCase):
+
     def setUp(self):
         paddle.set_device('npu:0')
         paddle.disable_static()
@@ -205,6 +209,7 @@ def tearDown(self):
         paddle.enable_static()
 
     def test_num_sample(self):
+
         def test_num_sample_less_than_0():
             x = paddle.rand([4])
             out = paddle.multinomial(x, num_samples=-2)
@@ -212,6 +217,7 @@ def test_num_sample_less_than_0():
         self.assertRaises(ValueError, test_num_sample_less_than_0)
 
     def test_input_probs_dim(self):
+
         def test_dim_larger_than_2():
             x = paddle.rand([2, 3, 3])
             out = paddle.multinomial(x)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py
index c6f85c8dee40c..c17b8461bd17f 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestNearestInterpOp(OpTest):
+
     def setUp(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -52,9 +54,10 @@ def setUp(self):
             out_h = self.out_h
             out_w = self.out_w
 
-        output_np = nearest_neighbor_interp_np(
-            input_np, out_h, out_w, self.out_size, self.actual_shape,
-            self.align_corners, self.data_layout)
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners,
+                                               self.data_layout)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -88,6 +91,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -98,6 +102,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -107,11 +112,14 @@ def init_test_case(self):
         self.align_corners = False
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   in_place=True,
+                                   max_relative_error=0.006)
 
 
 class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -122,6 +130,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -133,6 +142,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -144,6 +154,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -155,6 +166,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpSame(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 3, 32, 64]
@@ -165,6 +177,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -176,6 +189,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 4, 4, 5]
@@ -188,6 +202,7 @@ def init_test_case(self):
 
 
 class TestNearestInterpOpUint8(OpTest):
+
     def setUp(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -196,8 +211,8 @@ def setUp(self):
         self.actual_shape = None
         self.init_test_case()
         self.op_type = "nearest_interp"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
+        input_np = np.random.randint(low=0, high=256,
+                                     size=self.input_shape).astype("uint8")
 
         if self.scale > 0:
             out_h = int(self.input_shape[2] * self.scale)
@@ -234,6 +249,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 3, 32, 64]
@@ -244,6 +260,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -255,6 +272,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -266,6 +284,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 5, 7]
@@ -277,6 +296,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -288,6 +308,7 @@ def init_test_case(self):
 
 
 class TestNearestInterpOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -350,6 +371,7 @@ def init_test_case(self):
 
 # out_size is a tensor list
 class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -362,6 +384,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -375,6 +398,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -387,6 +411,7 @@ def init_test_case(self):
 
 
 class TestNearestAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
         y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32")
@@ -394,19 +419,27 @@ def test_case(self):
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
-
-        out1 = fluid.layers.resize_nearest(
-            y, out_shape=[12, 12], data_format='NHWC', align_corners=False)
-        out2 = fluid.layers.resize_nearest(
-            x, out_shape=[12, dim], align_corners=False)
-        out3 = fluid.layers.resize_nearest(
-            x, out_shape=shape_tensor, align_corners=False)
-        out4 = fluid.layers.resize_nearest(
-            x, out_shape=[4, 4], actual_shape=actual_size, align_corners=False)
-        out5 = fluid.layers.resize_nearest(
-            x, scale=scale_tensor, align_corners=False)
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
+
+        out1 = fluid.layers.resize_nearest(y,
+                                           out_shape=[12, 12],
+                                           data_format='NHWC',
+                                           align_corners=False)
+        out2 = fluid.layers.resize_nearest(x,
+                                           out_shape=[12, dim],
+                                           align_corners=False)
+        out3 = fluid.layers.resize_nearest(x,
+                                           out_shape=shape_tensor,
+                                           align_corners=False)
+        out4 = fluid.layers.resize_nearest(x,
+                                           out_shape=[4, 4],
+                                           actual_shape=actual_size,
+                                           align_corners=False)
+        out5 = fluid.layers.resize_nearest(x,
+                                           scale=scale_tensor,
+                                           align_corners=False)
 
         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
         dim_data = np.array([12]).astype("int32")
@@ -429,8 +462,10 @@ def test_case(self):
                           fetch_list=[out1, out2, out3, out4, out5],
                           return_numpy=True)
 
-        expect_res = nearest_neighbor_interp_np(
-            x_data, out_h=12, out_w=12, align_corners=False)
+        expect_res = nearest_neighbor_interp_np(x_data,
+                                                out_h=12,
+                                                out_w=12,
+                                                align_corners=False)
         self.assertTrue(
             np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1))))
         for i in range(len(results) - 1):
@@ -438,13 +473,15 @@ def test_case(self):
 
 
 class TestNearestInterpException(unittest.TestCase):
+
     def test_exception(self):
         input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32")
 
         def attr_data_format():
             # for 4-D input, data_format can only be NCHW or NHWC
-            out = fluid.layers.resize_nearest(
-                input, out_shape=[4, 8], data_format='NDHWC')
+            out = fluid.layers.resize_nearest(input,
+                                              out_shape=[4, 8],
+                                              data_format='NDHWC')
 
         def attr_scale_type():
             out = fluid.layers.resize_nearest(input, scale='scale')
diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py
index ec51dcf3f8e3e..5c5a05383889c 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid.core as core
@@ -31,6 +32,7 @@
 
 
 class TestNearestInterpOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -68,9 +70,11 @@ def setUp(self):
             output_h = self.out_h
             output_w = self.out_w
 
-        output_np = nearest_neighbor_interp_np(
-            input_np, output_h, output_w, scale_h, scale_w, self.out_size,
-            self.actual_shape, self.align_corners, self.data_layout)
+        output_np = nearest_neighbor_interp_np(input_np, output_h, output_w,
+                                               scale_h, scale_w, self.out_size,
+                                               self.actual_shape,
+                                               self.align_corners,
+                                               self.data_layout)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -97,17 +101,15 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'],
-                'Out',
-                in_place=True,
-                max_relative_error=0.02)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       in_place=True,
+                                       max_relative_error=0.02)
         else:
-            self.check_grad_with_place(
-                self.place, ['X'],
-                'Out',
-                in_place=True,
-                max_relative_error=0.006)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       in_place=True,
+                                       max_relative_error=0.006)
 
     def init_dtype(self):
         self.dtype = np.float32
@@ -123,11 +125,13 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpFP16(TestNearestInterpOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -138,6 +142,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -148,6 +153,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -158,6 +164,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -169,6 +176,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -180,6 +188,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -191,6 +200,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpSame(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 3, 32, 64]
@@ -201,6 +211,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -212,6 +223,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -223,6 +235,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 5, 7]
@@ -234,6 +247,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -245,6 +259,7 @@ def init_test_case(self):
 
 
 class TestNearestInterpOp_attr_tensor(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -323,6 +338,7 @@ def init_test_case(self):
 
 # out_size is a tensor list
 class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -335,6 +351,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -348,6 +365,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -360,6 +378,7 @@ def init_test_case(self):
 
 
 class TestNearestInterpOpAPI_dy(unittest.TestCase):
+
     def test_case(self):
         import paddle
         if core.is_compiled_with_npu():
@@ -371,13 +390,14 @@ def test_case(self):
             scale_np = np.array([2, 2]).astype("int64")
             input_x = paddle.to_tensor(input_data)
             scale = paddle.to_tensor(scale_np)
-            expect_res = nearest_neighbor_interp_np(
-                input_data, out_h=12, out_w=12, align_corners=False)
-            out = interpolate(
-                x=input_x,
-                scale_factor=scale,
-                mode="nearest",
-                align_corners=False)
+            expect_res = nearest_neighbor_interp_np(input_data,
+                                                    out_h=12,
+                                                    out_w=12,
+                                                    align_corners=False)
+            out = interpolate(x=input_x,
+                              scale_factor=scale,
+                              mode="nearest",
+                              align_corners=False)
             self.assertTrue(np.allclose(out.numpy(), expect_res))
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
index 8e28b3fe413b0..3934ea3b9bbf2 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -25,6 +26,7 @@
 
 
 class TestNPUNormOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.set_npu()
@@ -54,11 +56,13 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', max_relative_error=0.006)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.006)
 
 
 class TestNPUNormOp2(TestNPUNormOp):
+
     def init_test_case(self):
         self.shape = [5, 3, 9, 7]
         self.axis = 0
@@ -66,6 +70,7 @@ def init_test_case(self):
 
 
 class TestNPUNormOp3(TestNPUNormOp):
+
     def init_test_case(self):
         self.shape = [5, 3, 2, 7]
         self.axis = -1
@@ -75,6 +80,7 @@ def init_test_case(self):
 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                     "however it is desirable to cover the forward pass")
 class TestNPUNormOp4(TestNPUNormOp):
+
     def init_test_case(self):
         self.shape = [128, 1024, 14, 14]
         self.axis = 2
@@ -87,6 +93,7 @@ def test_check_grad(self):
 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                     "however it is desirable to cover the forward pass")
 class TestNPUNormOp5(TestNPUNormOp):
+
     def init_test_case(self):
         self.shape = [2048, 2048]
         self.axis = 1
@@ -97,6 +104,7 @@ def test_check_grad(self):
 
 
 class API_NormTest(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with fluid.program_guard(fluid.Program()):
@@ -109,6 +117,7 @@ def test_norm_x_type():
 
 
 class TestNPUNormOpFP16(TestNPUNormOp):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.__class__.no_need_check_grad = True
diff --git a/python/paddle/fluid/tests/unittests/npu/test_npu_place.py b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py
index 91e0c29e10609..2d0432204c85e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_npu_place.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py
@@ -23,6 +23,7 @@
 
 
 class TestNpuPlace(unittest.TestCase):
+
     def test(self):
         p = core.Place()
         p.set_place(paddle.NPUPlace(0))
@@ -32,6 +33,7 @@ def test(self):
 
 
 class TestNpuPlaceError(unittest.TestCase):
+
     def test_static(self):
         # NPU is not supported in ParallelExecutor
         prog = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py
index c92fffb2d26cb..0c77eb8217bf2 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 import numpy as np
+
 sys.path.append("..")
 
 from op_test import OpTest
@@ -29,6 +30,7 @@
 
 
 class TestOneHotOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -57,6 +59,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_attr(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -84,6 +87,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -112,6 +116,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -139,6 +144,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_out_of_range(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -162,6 +168,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_dtype_int64(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_one_hot_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_one_hot_v2_op_npu.py
index e511286cc2d67..d250dbfd2baa3 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_one_hot_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_one_hot_v2_op_npu.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 import numpy as np
+
 sys.path.append("..")
 
 from op_test import OpTest
@@ -29,6 +30,7 @@
 
 
 class TestOneHotOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -56,6 +58,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_non_lod(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -79,6 +82,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_attr(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -106,6 +110,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -133,6 +138,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -160,6 +166,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_out_of_range(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -182,6 +189,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_dtype_int64(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -204,6 +212,7 @@ def test_check_output(self):
 
 
 class TestOneHotOpApi(unittest.TestCase):
+
     def test_api(self):
         depth = 10
         self._run(depth)
@@ -230,7 +239,9 @@ def _run(self, depth):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'label': label_data, },
+        ret = exe.run(feed={
+            'label': label_data,
+        },
                       fetch_list=[one_hot_label],
                       return_numpy=False)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py
index a7ca4edc524be..5560b8bbd143a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py
@@ -15,6 +15,7 @@
 import sys
 import unittest
 import numpy as np
+
 sys.path.append("..")
 
 import paddle
@@ -25,6 +26,7 @@
 
 
 class TestPnormOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -51,8 +53,9 @@ def test_check_output(self):
             self.check_output_with_place(paddle.NPUPlace(0))
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            paddle.NPUPlace(0), ['X'], 'Out', user_defined_grads=self.gradient)
+        self.check_grad_with_place(paddle.NPUPlace(0), ['X'],
+                                   'Out',
+                                   user_defined_grads=self.gradient)
 
     def init_test_case(self):
         self.shape = [2, 3, 4, 5]
@@ -95,6 +98,7 @@ def calc_gradient(self):
 
 
 class TestPnormOp2(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -105,6 +109,7 @@ def init_test_case(self):
 
 
 class TestPnormOp3(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -115,6 +120,7 @@ def init_test_case(self):
 
 
 class TestPnormOp4(TestPnormOp3):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -125,6 +131,7 @@ def init_test_case(self):
 
 
 class TestPnormOp5(TestPnormOp3):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -135,6 +142,7 @@ def init_test_case(self):
 
 
 class TestPnormOp6(TestPnormOp3):
+
     def init_test_case(self):
         self.shape = [2, 3, 4, 5]
         self.axis = 1
@@ -145,26 +153,31 @@ def init_test_case(self):
 
 
 class TestPnormOpfp16(TestPnormOp):
+
     def init_dtype(self):
         self.dtype = "float16"
 
 
 class TestPnormOp2fp16(TestPnormOp2):
+
     def init_dtype(self):
         self.dtype = "float16"
 
 
 class TestPnormOp3fp16(TestPnormOp3):
+
     def init_dtype(self):
         self.dtype = "float16"
 
 
 class TestPnormOp4fp16(TestPnormOp4):
+
     def init_dtype(self):
         self.dtype = "float16"
 
 
 class TestPnormOp5fp16(TestPnormOp5):
+
     def init_dtype(self):
         self.dtype = "float16"
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py
index 234ceb2f0b7ec..12ade62af4d98 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 import op_test
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestPad3dNPUOp(op_test.OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.__class__.use_npu = True
@@ -42,11 +44,11 @@ def setUp(self):
         self.attrs = {}
         if self.variable_paddings:
             self.attrs['paddings'] = []
-            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
-                "int32")
+            self.inputs['Paddings'] = np.array(
+                self.paddings).flatten().astype("int32")
         else:
-            self.attrs['paddings'] = np.array(self.paddings).flatten().astype(
-                "int32")
+            self.attrs['paddings'] = np.array(
+                self.paddings).flatten().astype("int32")
         self.attrs['value'] = self.value
         self.attrs['mode'] = self.mode
         self.attrs['data_format'] = self.data_format
@@ -87,6 +89,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestPad3dNPUOp):
+
     def initTestCase(self):
         self.shape = (3, 4, 5, 6, 7)
         self.paddings = [0, 1, 2, 3, 4, 5]
@@ -99,6 +102,7 @@ def test_check_grad(self):
 
 
 class TestCase2(TestPad3dNPUOp):
+
     def initTestCase(self):
         self.shape = (4, 5, 6, 7, 8)
         self.paddings = [1, 1, 1, 1, 1, 1]
@@ -107,6 +111,7 @@ def initTestCase(self):
 
 
 class TestPadAPI(unittest.TestCase):
+
     def _get_numpy_out(self,
                        input_data,
                        pad,
@@ -163,8 +168,8 @@ def _get_numpy_out(self,
 
     def test_static(self):
         paddle.enable_static()
-        self.place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu(
-        ) else fluid.CPUPlace()
+        self.place = fluid.NPUPlace(
+            0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace()
         with program_guard(Program(), Program()):
             input_shape = (1, 2, 3, 4, 5)
             pad = [1, 2, 1, 1, 3, 4]
@@ -187,10 +192,16 @@ def test_static(self):
                               feed={"x": input_data},
                               fetch_list=[result1, result2])
 
-            np_out1 = self._get_numpy_out(
-                input_data, pad, mode, value, data_format="NCDHW")
-            np_out2 = self._get_numpy_out(
-                input_data, pad, mode, value, data_format="NDHWC")
+            np_out1 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          value,
+                                          data_format="NCDHW")
+            np_out2 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          value,
+                                          data_format="NDHWC")
             self.assertTrue(np.allclose(fetches[0], np_out1))
             self.assertTrue(np.allclose(fetches[1], np_out2))
 
@@ -205,10 +216,16 @@ def test_dygraph_1(self):
         input_data = np.random.rand(*input_shape).astype(np.float32)
         tensor_data = paddle.to_tensor(input_data)
 
-        np_out1 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NCDHW")
-        np_out2 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NDHWC")
+        np_out1 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NCDHW")
+        np_out2 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NDHWC")
 
         y1 = F.pad(tensor_data,
                    pad=pad,
@@ -235,10 +252,16 @@ def test_dygraph_2(self):
         input_data = np.random.rand(*input_shape).astype(np.float32)
         tensor_data = paddle.to_tensor(input_data)
 
-        np_out1 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NCHW")
-        np_out2 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NHWC")
+        np_out1 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NCHW")
+        np_out2 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NHWC")
 
         y1 = F.pad(tensor_data,
                    pad=pad,
@@ -265,10 +288,16 @@ def test_dygraph_3(self):
         input_data = np.random.rand(*input_shape).astype(np.float32)
         tensor_data = paddle.to_tensor(input_data)
 
-        np_out1 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NCL")
-        np_out2 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NLC")
+        np_out1 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NCL")
+        np_out2 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NLC")
 
         y1 = F.pad(tensor_data,
                    pad=pad,
@@ -286,6 +315,7 @@ def test_dygraph_3(self):
 
 
 class TestPad1dAPI(unittest.TestCase):
+
     def _get_numpy_out(self,
                        input_data,
                        pad,
@@ -318,26 +348,30 @@ def test_class(self):
         input_data = np.random.rand(*input_shape).astype(np.float32)
 
         pad_constant = nn.Pad1D(padding=pad, mode="constant", value=value)
-        pad_constant_int = nn.Pad1D(
-            padding=pad_int, mode="constant", value=value)
+        pad_constant_int = nn.Pad1D(padding=pad_int,
+                                    mode="constant",
+                                    value=value)
 
         data = paddle.to_tensor(input_data)
 
         output = pad_constant(data)
-        np_out = self._get_numpy_out(
-            input_data, pad, "constant", value=value, data_format="NCL")
+        np_out = self._get_numpy_out(input_data,
+                                     pad,
+                                     "constant",
+                                     value=value,
+                                     data_format="NCL")
         self.assertTrue(np.allclose(output.numpy(), np_out))
 
         output = pad_constant_int(data)
-        np_out = self._get_numpy_out(
-            input_data, [pad_int] * 2,
-            "constant",
-            value=value,
-            data_format="NCL")
+        np_out = self._get_numpy_out(input_data, [pad_int] * 2,
+                                     "constant",
+                                     value=value,
+                                     data_format="NCL")
         self.assertTrue(np.allclose(output.numpy(), np_out))
 
 
 class TestPad2dAPI(unittest.TestCase):
+
     def _get_numpy_out(self,
                        input_data,
                        pad,
@@ -372,26 +406,30 @@ def test_class(self):
         input_data = np.random.rand(*input_shape).astype(np.float32)
 
         pad_constant = nn.Pad2D(padding=pad, mode="constant", value=value)
-        pad_constant_int = nn.Pad2D(
-            padding=pad_int, mode="constant", value=value)
+        pad_constant_int = nn.Pad2D(padding=pad_int,
+                                    mode="constant",
+                                    value=value)
 
         data = paddle.to_tensor(input_data)
 
         output = pad_constant(data)
-        np_out = self._get_numpy_out(
-            input_data, pad, "constant", value=value, data_format="NCHW")
+        np_out = self._get_numpy_out(input_data,
+                                     pad,
+                                     "constant",
+                                     value=value,
+                                     data_format="NCHW")
         self.assertTrue(np.allclose(output.numpy(), np_out))
 
         output = pad_constant_int(data)
-        np_out = self._get_numpy_out(
-            input_data, [pad_int] * 4,
-            "constant",
-            value=value,
-            data_format="NCHW")
+        np_out = self._get_numpy_out(input_data, [pad_int] * 4,
+                                     "constant",
+                                     value=value,
+                                     data_format="NCHW")
         self.assertTrue(np.allclose(output.numpy(), np_out))
 
 
 class TestPad3dAPI(unittest.TestCase):
+
     def _get_numpy_out(self,
                        input_data,
                        pad,
@@ -428,27 +466,32 @@ def test_class(self):
         input_data = np.random.rand(*input_shape).astype(np.float32)
 
         pad_constant = nn.Pad3D(padding=pad, mode="constant", value=value)
-        pad_constant_int = nn.Pad3D(
-            padding=pad_int, mode="constant", value=value)
+        pad_constant_int = nn.Pad3D(padding=pad_int,
+                                    mode="constant",
+                                    value=value)
 
         data = paddle.to_tensor(input_data)
 
         output = pad_constant(data)
-        np_out = self._get_numpy_out(
-            input_data, pad, "constant", value=value, data_format="NCDHW")
+        np_out = self._get_numpy_out(input_data,
+                                     pad,
+                                     "constant",
+                                     value=value,
+                                     data_format="NCDHW")
         self.assertTrue(np.allclose(output.numpy(), np_out))
 
         output = pad_constant_int(data)
-        np_out = self._get_numpy_out(
-            input_data, [pad_int] * 6,
-            "constant",
-            value=value,
-            data_format="NCDHW")
+        np_out = self._get_numpy_out(input_data, [pad_int] * 6,
+                                     "constant",
+                                     value=value,
+                                     data_format="NCDHW")
         self.assertTrue(np.allclose(output.numpy(), np_out))
 
 
 class TestPad3dOpNpuError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_value():
             input_shape = (1, 2, 3, 4, 5)
             data = np.random.rand(*input_shape).astype(np.float32)
@@ -495,12 +538,14 @@ def test_mode_3():
 
 
 class TestPadDataformatError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_ncl():
             input_shape = (1, 2, 3, 4)
             pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
-            data = np.arange(
-                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            data = np.arange(np.prod(input_shape),
+                             dtype=np.float64).reshape(input_shape) + 1
             my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCL")
             data = paddle.to_tensor(data)
             result = my_pad(data)
@@ -508,8 +553,8 @@ def test_ncl():
         def test_nchw():
             input_shape = (1, 2, 4)
             pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
-            data = np.arange(
-                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            data = np.arange(np.prod(input_shape),
+                             dtype=np.float64).reshape(input_shape) + 1
             my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCHW")
             data = paddle.to_tensor(data)
             result = my_pad(data)
@@ -517,10 +562,11 @@ def test_nchw():
         def test_ncdhw():
             input_shape = (1, 2, 3, 4)
             pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
-            data = np.arange(
-                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
-            my_pad = nn.Pad1D(
-                padding=pad, mode="replicate", data_format="NCDHW")
+            data = np.arange(np.prod(input_shape),
+                             dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.Pad1D(padding=pad,
+                              mode="replicate",
+                              data_format="NCDHW")
             data = paddle.to_tensor(data)
             result = my_pad(data)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py
index d1d2e8b3467be..13c99f993f95a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,21 +29,25 @@
 
 
 class TestPadOp(OpTest):
+
     def setUp(self):
         self.op_type = "pad"
         self.set_npu()
         self.init_dtype()
         self.initTestCase()
 
-        self.inputs = {'X': np.random.random(self.shape).astype(self.dtype), }
+        self.inputs = {
+            'X': np.random.random(self.shape).astype(self.dtype),
+        }
         self.attrs = {}
         self.attrs['paddings'] = np.array(self.paddings).flatten()
         self.attrs['pad_value'] = self.pad_value
         self.outputs = {
-            'Out': np.pad(self.inputs['X'],
-                          self.paddings,
-                          mode='constant',
-                          constant_values=self.pad_value)
+            'Out':
+            np.pad(self.inputs['X'],
+                   self.paddings,
+                   mode='constant',
+                   constant_values=self.pad_value)
         }
 
     def test_check_output(self):
@@ -50,8 +55,9 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'], 'Out', max_relative_error=0.6)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.6)
         else:
             self.check_grad_with_place(self.place, ['X'], 'Out')
 
@@ -69,6 +75,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestPadOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.paddings = [(0, 1), (2, 3), (2, 1), (1, 1)]
@@ -76,6 +83,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestPadOp):
+
     def initTestCase(self):
         self.shape = (5, 5, 5)
         self.paddings = [(0, 0), (0, 0), (1, 2)]
@@ -83,6 +91,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestPadOp):
+
     def initTestCase(self):
         self.shape = (100)
         self.paddings = [(0, 1)]
@@ -93,7 +102,9 @@ def initTestCase(self):
 
 
 def create_test_fp16(parent):
+
     class TestPadFp16(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -109,6 +120,7 @@ def init_dtype(self):
 
 
 class TestPadOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.random((2, 2)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/npu/test_parallel_dygraph_mnist_npu.py b/python/paddle/fluid/tests/unittests/npu/test_parallel_dygraph_mnist_npu.py
index 1d09bd93e9b22..76980bf8478e9 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_parallel_dygraph_mnist_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_parallel_dygraph_mnist_npu.py
@@ -17,6 +17,7 @@
 import os
 import sys
 import unittest
+
 sys.path.append("..")
 
 from test_dist_base import TestDistBase
@@ -51,14 +52,19 @@
     os.getenv("ASCEND_AICPU_PATH", "/usr/local/Ascend/nnae/latest"),
     "ASCEND_OPP_PATH":
     os.getenv("ASCEND_OPP_PATH", "/usr/local/Ascend/nnae/latest/opp"),
-    "HCCL_CONNECT_TIMEOUT": "7200",
-    "HCCL_WHITELIST_DISABLE": "1",
-    "HCCL_SECURITY_MODE": "1",
-    "RANK_TABLE_FILE": "rank_table_file.json",
+    "HCCL_CONNECT_TIMEOUT":
+    "7200",
+    "HCCL_WHITELIST_DISABLE":
+    "1",
+    "HCCL_SECURITY_MODE":
+    "1",
+    "RANK_TABLE_FILE":
+    "rank_table_file.json",
 }
 
 
 class TestParallelDygraphMnistNPU(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._hccl_mode = True
@@ -78,6 +84,7 @@ def test_mnist(self):
 
 
 class TestFleetDygraphMnistNPU(TestParallelDygraphMnistNPU):
+
     def _setup_config(self):
         self._sync_mode = False
         self._hccl_mode = True
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
index 4822abc3b25eb..3e7d1fd80eed1 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 import numpy as np
+
 sys.path.append("..")
 
 import paddle
@@ -30,7 +31,9 @@
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.paddings = [0, 0]
             self.padding_algorithm = "SAME"
@@ -41,7 +44,9 @@ def init_paddings(self):
 
 
 def create_test_use_ceil_class(parent):
+
     class TestPool2DUseCeilCase(parent):
+
         def init_ceil_mode(self):
             self.ceil_mode = True
 
@@ -51,7 +56,9 @@ def init_ceil_mode(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.paddings = [1, 1]
             self.padding_algorithm = "VALID"
@@ -62,7 +69,9 @@ def init_paddings(self):
 
 
 def create_test_fp16_class(parent):
+
     class TestFp16Case(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = False
             self.dtype = np.float16
@@ -89,8 +98,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -171,19 +180,19 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
             if pool_type == 'avg':
                 if (exclusive or adaptive):
-                    field_size = (in_h_end - in_h_start) * (
-                        in_w_end - in_w_start)
-                x_grad[:, :, in_h_start:in_h_end, in_w_start:
-                       in_w_end] += 1 / field_size
+                    field_size = (in_h_end - in_h_start) * (in_w_end -
+                                                            in_w_start)
+                x_grad[:, :, in_h_start:in_h_end,
+                       in_w_start:in_w_end] += 1 / field_size
             elif pool_type == 'max':
                 for n in range(N):
                     for c in range(C):
-                        idx = np.argmax(x[n, c, in_h_start:in_h_end, in_w_start:
-                                          in_w_end].flatten())
+                        idx = np.argmax(x[n, c, in_h_start:in_h_end,
+                                          in_w_start:in_w_end].flatten())
                         idx_h = idx // (in_w_end - in_w_start)
                         idx_w = idx % (in_w_end - in_w_start)
-                        x_grad[n, c, in_h_start + idx_h, in_w_start +
-                               idx_w] += 1
+                        x_grad[n, c, in_h_start + idx_h,
+                               in_w_start + idx_w] += 1
 
     if data_format == "NHWC":
         x_grad = x_grad.transpose([0, 2, 3, 1])
@@ -191,6 +200,7 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 
 class TestPool2D_Op(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "pool2d"
@@ -210,12 +220,14 @@ def setUp(self):
 
         input = np.random.random(self.shape).astype(self.dtype)
         if self.pool_type == "max":
-            input = np.array([x for x in range(np.prod(self.shape))]).reshape(
-                self.shape).astype(self.dtype)
-        output = pool2D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive, self.data_format,
-            self.pool_type, self.padding_algorithm).astype(self.dtype)
+            input = np.array([x for x in range(np.prod(self.shape))
+                              ]).reshape(self.shape).astype(self.dtype)
+        output = pool2D_forward_naive(input, self.ksize, self.strides,
+                                      self.paddings, self.global_pool,
+                                      self.ceil_mode, self.exclusive,
+                                      self.adaptive, self.data_format,
+                                      self.pool_type,
+                                      self.padding_algorithm).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -278,28 +290,27 @@ def test_check_output(self):
         self.check_output_with_place(fluid.NPUPlace(0), atol=1e-3)
 
     def test_check_grad(self):
-        x_grad = pool2d_backward_navie(
-            self.inputs["X"],
-            ksize=self.ksize,
-            strides=self.strides,
-            paddings=self.paddings,
-            global_pool=self.global_pool,
-            ceil_mode=False,
-            exclusive=self.exclusive,
-            adaptive=self.adaptive,
-            data_format=self.data_format,
-            pool_type=self.pool_type,
-            padding_algorithm=self.padding_algorithm)
+        x_grad = pool2d_backward_navie(self.inputs["X"],
+                                       ksize=self.ksize,
+                                       strides=self.strides,
+                                       paddings=self.paddings,
+                                       global_pool=self.global_pool,
+                                       ceil_mode=False,
+                                       exclusive=self.exclusive,
+                                       adaptive=self.adaptive,
+                                       data_format=self.data_format,
+                                       pool_type=self.pool_type,
+                                       padding_algorithm=self.padding_algorithm)
         x_grad = x_grad / np.prod(self.outputs['Out'].shape)
-        self.check_grad_with_place(
-            fluid.NPUPlace(0),
-            set(['X']),
-            'Out',
-            max_relative_error=0.06,
-            user_defined_grads=[x_grad])
+        self.check_grad_with_place(fluid.NPUPlace(0),
+                                   set(['X']),
+                                   'Out',
+                                   max_relative_error=0.06,
+                                   user_defined_grads=[x_grad])
 
 
 class TestCase1(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -319,6 +330,7 @@ def init_shape(self):
 
 
 class TestCase2(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -338,29 +350,34 @@ def init_shape(self):
 
 
 class TestCase3(TestPool2D_Op):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase4(TestCase1):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase5(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestAvgInclude(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
 
 class TestAvgPoolAdaptive(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -374,6 +391,7 @@ def init_test_case(self):
 
 
 class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -390,6 +408,7 @@ def init_test_case(self):
 
 #-------test pool2d with asymmetric padding-----
 class TestPool2D_AsyPadding(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -400,6 +419,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding(TestCase1):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -410,6 +430,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding(TestCase2):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -420,6 +441,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding(TestCase3):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -430,6 +452,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding(TestCase4):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -440,6 +463,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding((TestCase5)):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -450,6 +474,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
@@ -463,6 +488,7 @@ def init_shape(self):
 
 
 class TestAvgPoolAdaptive_AsyPadding(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -477,6 +503,7 @@ def init_shape(self):
 
 #----------- test channel_last --------------
 class TestPool2D_channel_last(TestPool2D_Op):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -485,6 +512,7 @@ def init_shape(self):
 
 
 class TestCase1_channel_last(TestCase1):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -493,6 +521,7 @@ def init_shape(self):
 
 
 class TestCase2_channel_last(TestCase2):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -501,6 +530,7 @@ def init_shape(self):
 
 
 class TestCase3_channel_last(TestCase3):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -509,6 +539,7 @@ def init_shape(self):
 
 
 class TestCase4_channel_last(TestCase4):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -517,6 +548,7 @@ def init_shape(self):
 
 
 class TestCase5_channel_last(TestCase5):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -525,11 +557,13 @@ def init_shape(self):
 
 
 class TestCase5_Max(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestCase5_channel_last_Max(TestCase5_Max):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -538,11 +572,13 @@ def init_shape(self):
 
 
 class TestAvgInclude_channel_last(TestCase2_channel_last):
+
     def init_exclusive(self):
         self.exclusive = False
 
 
 class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -555,6 +591,7 @@ def init_test_case(self):
 
 
 class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -563,6 +600,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -571,6 +609,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -579,6 +618,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -587,6 +627,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -595,6 +636,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -603,6 +645,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -610,8 +653,9 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-class TestAvgPoolAdaptive_AsyPadding_channel_last(
-        TestAvgPoolAdaptive_AsyPadding):
+class TestAvgPoolAdaptive_AsyPadding_channel_last(TestAvgPoolAdaptive_AsyPadding
+                                                  ):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -620,6 +664,7 @@ def init_shape(self):
 
 
 class TestCase1_strides(TestCase1):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         # fixme: CANN AvgPoolGradV3 dose not support asymmetric strides
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py
index a188953d70c93..6274ba53781ae 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestPow(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "pow"
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestPowFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "pow"
@@ -81,6 +84,7 @@ def test_check_output(self):
 
 
 class TestPowNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -95,8 +99,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -120,12 +125,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py
index 47b78d308205c..cfd78c2b05b36 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py
@@ -26,6 +26,7 @@
 
 
 class TestNPUPriorBox(OpTest):
+
     def setUp(self):
         self.op_type = "prior_box"
         self.set_npu()
@@ -93,8 +94,8 @@ def init_test_params(self):
         self.flip = True
         self.set_min_max_aspect_ratios_order()
         self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
-        self.aspect_ratios = np.array(
-            self.aspect_ratios, dtype=np.float).flatten()
+        self.aspect_ratios = np.array(self.aspect_ratios,
+                                      dtype=np.float).flatten()
         self.variances = [0.1, 0.1, 0.2, 0.2]
         self.variances = np.array(self.variances, dtype=np.float).flatten()
 
@@ -132,22 +133,22 @@ def init_test_output(self):
                             ar = self.real_aspect_ratios[r]
                             c_w = min_size * math.sqrt(ar) / 2
                             c_h = (min_size / math.sqrt(ar)) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
 
                         if len(self.max_sizes) > 0:
                             max_size = self.max_sizes[s]
                             # second prior: aspect_ratio = 1,
                             c_w = c_h = math.sqrt(min_size * max_size) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
                     else:
                         c_w = c_h = min_size / 2.
@@ -160,11 +161,11 @@ def init_test_output(self):
                             max_size = self.max_sizes[s]
                             # second prior: aspect_ratio = 1,
                             c_w = c_h = math.sqrt(min_size * max_size) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
 
                         # rest of priors
@@ -174,29 +175,31 @@ def init_test_output(self):
                                 continue
                             c_w = min_size * math.sqrt(ar) / 2
                             c_h = (min_size / math.sqrt(ar)) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
 
         # clip the prior's coordidate such that it is within[0, 1]
         if self.clip:
             out_boxes = np.clip(out_boxes, 0.0, 1.0)
         # set the variance.
-        out_var = np.tile(self.variances, (self.layer_h, self.layer_w,
-                                           self.num_priors, 1))
+        out_var = np.tile(self.variances,
+                          (self.layer_h, self.layer_w, self.num_priors, 1))
         self.out_boxes = out_boxes.astype('float32')
         self.out_var = out_var.astype('float32')
 
 
 class TestNPUPriorBoxWithoutMaxSize(TestNPUPriorBox):
+
     def set_max_sizes(self):
         self.max_sizes = []
 
 
 class TestNPUPriorBoxWithoutSpecifiedOutOrder(TestNPUPriorBox):
+
     def set_min_max_aspect_ratios_order(self):
         self.min_max_aspect_ratios_order = False
         self.atol = 1e-1
diff --git a/python/paddle/fluid/tests/unittests/npu/test_randperm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_randperm_op_npu.py
index 4ec353c55deb1..02b2b2caf864e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_randperm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_randperm_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -58,31 +59,36 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         out_np = np.array(outs[0])
-        self.assertTrue(
-            check_randperm_out(self.n, out_np), msg=error_msg(out_np))
+        self.assertTrue(check_randperm_out(self.n, out_np),
+                        msg=error_msg(out_np))
 
 
 class TestRandpermOpN(TestRandpermOp):
+
     def init_attrs(self):
         self.n = 10000
 
 
 class TestRandpermOpInt32(TestRandpermOp):
+
     def init_attrs(self):
         self.dtype = "int32"
 
 
 class TestRandpermOpFloat32(TestRandpermOp):
+
     def init_attrs(self):
         self.dtype = "float32"
 
 
 class TestRandpermOpFloat64(TestRandpermOp):
+
     def init_attrs(self):
         self.dtype = "float64"
 
 
 class TestRandpermOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             self.assertRaises(ValueError, paddle.randperm, -3)
@@ -90,6 +96,7 @@ def test_errors(self):
 
 
 class TestRandpermAPI(unittest.TestCase):
+
     def test_out(self):
         n = 10
         place = paddle.NPUPlace(0)
@@ -107,14 +114,15 @@ def test_out(self):
 
 
 class TestRandpermImperative(unittest.TestCase):
+
     def test_out(self):
         paddle.disable_static(paddle.NPUPlace(0))
         n = 10
         for dtype in ['int32', np.int64, 'float32', 'float64']:
             data_p = paddle.randperm(n, dtype)
             data_np = data_p.numpy()
-            self.assertTrue(
-                check_randperm_out(n, data_np), msg=error_msg(data_np))
+            self.assertTrue(check_randperm_out(n, data_np),
+                            msg=error_msg(data_np))
         paddle.enable_static()
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_range_npu.py b/python/paddle/fluid/tests/unittests/npu/test_range_npu.py
index c6700a19c5239..d9663a3a15172 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_range_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_range_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,6 +26,7 @@
 
 
 class TestRangeOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -40,8 +42,9 @@ def setUp(self):
         }
 
         self.outputs = {
-            'Out': np.arange(self.case[0], self.case[1],
-                             self.case[2]).astype(self.dtype)
+            'Out':
+            np.arange(self.case[0], self.case[1],
+                      self.case[2]).astype(self.dtype)
         }
 
     def init_config(self):
@@ -53,42 +56,49 @@ def test_check_output(self):
 
 
 class TestFloatRangeOpCase0(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.float32
         self.case = (0, 5, 1)
 
 
 class TestInt32RangeOpCase0(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int32
         self.case = (0, 5, 2)
 
 
 class TestInt32RangeOpCase1(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int32
         self.case = (10, 1, -2)
 
 
 class TestInt32RangeOpCase2(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int32
         self.case = (-1, -10, -2)
 
 
 class TestInt64RangeOpCase0(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int64
         self.case = (0, 5, 2)
 
 
 class TestInt64RangeOpCase1(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int64
         self.case = (10, 1, -2)
 
 
 class TestInt64RangeOpCase2(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int64
         self.case = (-1, -10, -2)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py
index 899d4ef43bd86..87e1c488024f6 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,13 +17,16 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
+
 paddle.enable_static()
 
 
 class TestNPUReciprocal(OpTest):
+
     def setUp(self):
         self.op_type = "reciprocal"
         self.set_npu()
@@ -40,8 +43,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', max_relative_error=0.01)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.01)
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -52,6 +56,7 @@ def init_dtype(self):
 
 
 class TestNPUReciprocalFp64(TestNPUReciprocal):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -63,6 +68,7 @@ def init_dtype(self):
 @skip_check_grad_ci(
     reason="The backward test is not supported for float16 type on NPU.")
 class TestNPUReciprocalFp16(TestNPUReciprocal):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py
index 1a30d1395283e..ae871b0998900 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestAny8DOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_any"
@@ -48,6 +50,7 @@ def test_check_output(self):
 
 
 class TestAnyOpWithDim(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_any"
@@ -64,6 +67,7 @@ def test_check_output(self):
 
 
 class TestAny8DOpWithDim(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_any"
@@ -83,6 +87,7 @@ def test_check_output(self):
 
 
 class TestAnyOpWithKeepDim(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_any"
@@ -90,8 +95,8 @@ def setUp(self):
         self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
         self.attrs = {'dim': (1), 'keep_dim': True}
         self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+            'Out':
+            np.expand_dims(self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
         }
 
     def set_npu(self):
@@ -102,6 +107,7 @@ def test_check_output(self):
 
 
 class TestAny8DOpWithKeepDim(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_any"
@@ -112,8 +118,8 @@ def setUp(self):
         }
         self.attrs = {'dim': (1), 'keep_dim': True}
         self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+            'Out':
+            np.expand_dims(self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
         }
 
     def set_npu(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py
index 68a28ea72e1fc..64f66476542da 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py
@@ -194,8 +194,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP16)
         }
         self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim'])).astype(np.float16)
+            'Out':
+            self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(
+                np.float16)
         }
 
     def test_check_output(self):
@@ -219,8 +220,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
 
@@ -241,8 +243,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP64)
         }
         self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim'])).astype(np.float64)
+            'Out':
+            self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(
+                np.float64)
         }
 
 
@@ -263,8 +266,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
     def init_dtype(self):
@@ -288,8 +292,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.INT64)
         }
         self.outputs = {
-            'Out': self.inputs['X'].max(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
     def init_dtype(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py
index ed27c335a4e32..3a2f70f0d376e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestMeanOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -43,6 +45,7 @@ def test_check_grad(self):
 
 
 class TestMeanOp5D(TestMeanOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -53,6 +56,7 @@ def setUp(self):
 
 
 class TestMeanOp6D(TestMeanOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -63,6 +67,7 @@ def setUp(self):
 
 
 class TestMeanOp8D(TestMeanOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -74,6 +79,7 @@ def setUp(self):
 
 
 class Test1DReduce(TestMeanOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -82,6 +88,7 @@ def setUp(self):
 
 
 class Test2DReduce0(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -91,6 +98,7 @@ def setUp(self):
 
 
 class Test2DReduce1(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -102,6 +110,7 @@ def setUp(self):
 
 
 class Test3DReduce0(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -113,6 +122,7 @@ def setUp(self):
 
 
 class Test3DReduce1(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -124,6 +134,7 @@ def setUp(self):
 
 
 class Test3DReduce2(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -135,6 +146,7 @@ def setUp(self):
 
 
 class Test3DReduce3(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -146,18 +158,21 @@ def setUp(self):
 
 
 class TestKeepDimReduce(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
         self.attrs = {'dim': [1], 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].mean(axis=tuple(self.attrs['dim']),
+                                  keepdims=self.attrs['keep_dim'])
         }
 
 
 class TestKeepDim8DReduce(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
@@ -166,12 +181,14 @@ def setUp(self):
         }
         self.attrs = {'dim': (3, 4, 5), 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].mean(axis=tuple(self.attrs['dim']),
+                                  keepdims=self.attrs['keep_dim'])
         }
 
 
 class TestReduceAll(Test1DReduce):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reduce_mean"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py
index bbf23e1be3e0e..85d1fe9478140 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py
@@ -194,8 +194,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP16)
         }
         self.outputs = {
-            'Out': self.inputs['X'].min(
-                axis=tuple(self.attrs['dim'])).astype(np.float16)
+            'Out':
+            self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(
+                np.float16)
         }
 
     def test_check_output(self):
@@ -219,8 +220,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.outputs = {
-            'Out': self.inputs['X'].min(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
 
@@ -241,8 +243,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP64)
         }
         self.outputs = {
-            'Out': self.inputs['X'].min(
-                axis=tuple(self.attrs['dim'])).astype(np.float64)
+            'Out':
+            self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(
+                np.float64)
         }
 
 
@@ -263,8 +266,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.FP32)
         }
         self.outputs = {
-            'Out': self.inputs['X'].min(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
     def init_dtype(self):
@@ -288,8 +292,9 @@ def setUp(self):
             'out_dtype': int(core.VarDesc.VarType.INT64)
         }
         self.outputs = {
-            'Out': self.inputs['X'].min(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
     def init_dtype(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py
index 59f181be5edac..c32e105b02ade 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py
@@ -27,6 +27,7 @@
 
 
 class TestNPUReduceProd(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -50,6 +51,7 @@ def init_dtype(self):
 
 
 class TestNPUReduceProd2(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -61,6 +63,7 @@ def setUp(self):
 
 
 class TestNPUReduceProd3(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -72,6 +75,7 @@ def setUp(self):
 
 
 class TestNPUReduceProd6D(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -87,6 +91,7 @@ def setUp(self):
 
 
 class TestNPUReduceProd8D(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -102,6 +107,7 @@ def setUp(self):
 
 
 class TestReduceAll(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -113,6 +119,7 @@ def setUp(self):
 
 
 class TestNPUReduceProdWithOutDtype_bool(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -127,6 +134,7 @@ def setUp(self):
 
 
 class TestNPUReduceProdWithOutDtype_int16(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -135,12 +143,14 @@ def setUp(self):
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT16)}
         self.outputs = {
-            'Out': self.inputs['X'].prod(
-                axis=tuple(self.attrs['dim'])).astype(np.int16)
+            'Out':
+            self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(
+                np.int16)
         }
 
 
 class TestNPUReduceProdWithOutDtype_int32(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -149,12 +159,14 @@ def setUp(self):
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT32)}
         self.outputs = {
-            'Out': self.inputs['X'].prod(
-                axis=tuple(self.attrs['dim'])).astype(np.int32)
+            'Out':
+            self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(
+                np.int32)
         }
 
 
 class TestNPUReduceProdWithOutDtype_int64(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -163,12 +175,14 @@ def setUp(self):
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT64)}
         self.outputs = {
-            'Out': self.inputs['X'].prod(
-                axis=tuple(self.attrs['dim'])).astype(np.int64)
+            'Out':
+            self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(
+                np.int64)
         }
 
 
 class TestNPUReduceProdWithOutDtype_fp16(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -177,8 +191,9 @@ def setUp(self):
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP16)}
         self.outputs = {
-            'Out': self.inputs['X'].prod(
-                axis=tuple(self.attrs['dim'])).astype(np.float16)
+            'Out':
+            self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(
+                np.float16)
         }
 
     def test_check_output(self):
@@ -186,6 +201,7 @@ def test_check_output(self):
 
 
 class TestNPUReduceProdWithOutDtype_fp32(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -194,12 +210,14 @@ def setUp(self):
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP32)}
         self.outputs = {
-            'Out': self.inputs['X'].prod(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
 
 class TestNPUReduceProdWithOutDtype_fp64(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -208,13 +226,15 @@ def setUp(self):
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP64)}
         self.outputs = {
-            'Out': self.inputs['X'].prod(
-                axis=tuple(self.attrs['dim'])).astype(np.float64)
+            'Out':
+            self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(
+                np.float64)
         }
 
 
 @skip_check_grad_ci(reason="right now not implement grad op")
 class TestNPUReduceProdWithOutDtype_fp32_2(TestNPUReduceProd):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.set_npu()
@@ -223,8 +243,9 @@ def setUp(self):
         self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
         self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP32)}
         self.outputs = {
-            'Out': self.inputs['X'].prod(
-                axis=tuple(self.attrs['dim'])).astype(np.float32)
+            'Out':
+            self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(
+                np.float32)
         }
 
     def init_dtype(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
index bd7ce2a040c93..632defd7f0ede 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestReduceSum(OpTest):
+
     def setUp(self):
         np.random.seed(SEED)
         self.set_npu()
@@ -46,8 +48,9 @@ def setUp(self):
             self.outputs = {'Out': self.inputs['X'].sum()}
         else:
             self.outputs = {
-                'Out': self.inputs['X'].sum(axis=self.axis,
-                                            keepdims=self.attrs['keep_dim'])
+                'Out':
+                self.inputs['X'].sum(axis=self.axis,
+                                     keepdims=self.attrs['keep_dim'])
             }
 
     def set_npu(self):
@@ -78,11 +81,13 @@ def test_check_output(self):
 
 
 class TestReduceSum2(OpTest):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestReduceSumNet(unittest.TestCase):
+
     def set_reduce_sum_function(self, x):
         # keep_dim = False
         return paddle.fluid.layers.reduce_sum(x, dim=-1)
@@ -101,8 +106,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32')
             b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[2, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[2, 1],
+                                       dtype='int64')
 
             a_1 = fluid.layers.fc(input=a, size=4, num_flatten_dims=2, act=None)
             b_1 = fluid.layers.fc(input=b, size=4, num_flatten_dims=2, act=None)
@@ -127,12 +133,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
@@ -148,12 +155,14 @@ def test_npu(self):
 
 
 class TestReduceSumNet2(TestReduceSumNet):
+
     def set_reduce_sum_function(self, x):
         # keep_dim = True
         return paddle.fluid.layers.reduce_sum(x, dim=-1, keep_dim=True)
 
 
 class TestReduceSumNet3(TestReduceSumNet):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -185,8 +194,10 @@ def _test(self, run_npu=True):
         for epoch in range(100):
 
             loss_res = exe.run(main_prog,
-                               feed={"a": a_np,
-                                     "b": b_np},
+                               feed={
+                                   "a": a_np,
+                                   "b": b_np
+                               },
                                fetch_list=[loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Loss: {}".format(epoch, loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py
index b1cb5e02a731f..1bf503a37799a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py
@@ -20,6 +20,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 paddle.enable_static()
@@ -34,6 +35,7 @@ def ref_relu6(x, threshold=6.0):
 
 
 class TestRelu6(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "relu6"
@@ -63,6 +65,7 @@ def init_dtype(self):
 
 
 class TestRelu6Float16(TestRelu6):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.__class__.no_need_check_grad = True
@@ -75,6 +78,7 @@ def test_check_output(self):
 
 
 class TestReluNeg(TestRelu6):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "relu6"
@@ -101,6 +105,7 @@ def test_check_output(self):
 
 
 class TestRelu6Net(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -115,8 +120,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.nn.functional.relu6(sum)
@@ -140,12 +146,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
index c909b14b5141f..f5f95deffba18 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestRelu(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "relu"
@@ -53,18 +55,21 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['X'], 'Out', max_relative_error=0.006)
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.006)
         else:
             self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
 class TestReluFp16(TestRelu):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestReluNeg(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "relu"
@@ -94,6 +99,7 @@ def test_check_output(self):
 
 
 class TestReluNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -108,8 +114,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.nn.functional.relu(sum)
@@ -133,12 +140,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reshape2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reshape2_op_npu.py
index 520de15f4df62..1281925271026 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reshape2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reshape2_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestReshape2(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "reshape2"
@@ -56,6 +58,7 @@ def test_check_grad_normal(self):
 
 
 class TestReshape2_case2(TestReshape2):
+
     def init_data(self):
         self.ori_shape = (2, 100)
         self.new_shape = (-1, 10)
@@ -63,6 +66,7 @@ def init_data(self):
 
 
 class TestReshape2_case3(TestReshape2):
+
     def init_data(self):
         self.ori_shape = (100, 5, 6)
         self.new_shape = (-1, 0, 3)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py
index 8bdf841c5cf18..d71c1453c33f9 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py
@@ -16,6 +16,7 @@
 
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import numpy as np
@@ -29,6 +30,7 @@
 
 
 class TestNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -43,8 +45,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -68,12 +71,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
@@ -89,6 +93,7 @@ def test_npu(self):
 
 
 class TestCenteredNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -103,8 +108,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -128,12 +134,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_roi_align_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_roi_align_op_npu.py
index 9ca2856886e08..1073e645c3e84 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_roi_align_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_roi_align_op_npu.py
@@ -18,6 +18,7 @@
 import numpy as np
 import math
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestROIAlignNPUOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -73,8 +75,8 @@ def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
         bilinear_pos = np.zeros(
             [self.channels, self.pooled_height, self.pooled_width, count, 4],
             np.float32)
-        bilinear_w = np.zeros(
-            [self.pooled_height, self.pooled_width, count, 4], np.float32)
+        bilinear_w = np.zeros([self.pooled_height, self.pooled_width, count, 4],
+                              np.float32)
         for ph in range(self.pooled_width):
             for pw in range(self.pooled_height):
                 c = 0
@@ -195,6 +197,7 @@ def test_check_grad(self):
 
 
 class TestROIAlignOpWithMinusSample(TestROIAlignNPUOp):
+
     def init_test_case(self):
         self.batch_size = 3
         self.channels = 3
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py
index 836d2b6d31189..399ac0d25225a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 from op_test import OpTest, _set_use_system_allocator
@@ -27,6 +28,7 @@
 
 
 class TestSamplingIdShape(unittest.TestCase):
+
     def test_shape(self):
         paddle.enable_static()
         x = fluid.layers.data(name='x', shape=[3], dtype='float32')
@@ -37,8 +39,7 @@ def test_shape(self):
         exe.run(fluid.default_startup_program())
 
         feed = {
-            'x': np.array(
-                [[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32')
+            'x': np.array([[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32')
         }
         output_np = exe.run(feed=feed, fetch_list=[output])[0]
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py b/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py
index 3bdf8146fb228..bb21b1024e3b1 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py
@@ -16,6 +16,7 @@
 
 import unittest
 import sys
+
 sys.path.append("..")
 import paddle
 import paddle.fluid as fluid
@@ -37,54 +38,62 @@
 
 
 class TestNPUSaveLoadBase(TestSaveLoadBase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 class TestNPUSaveLoadPartial(TestSaveLoadPartial):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 class TestNPUSaveLoadSetStateDict(TestSaveLoadSetStateDict):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 class TestNPUProgramStatePartial(TestProgramStatePartial):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 class TestNPULoadFromOldInterface(TestLoadFromOldInterface):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 class TestNPULoadFromOldInterfaceSingleFile(TestLoadFromOldInterfaceSingleFile):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 class TestNPUProgramStateOldSave(TestProgramStateOldSave):
+
     def setUp(self):
         self.test_dygraph = False
 
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 class TestNPUProgramStateOldSaveSingleModel(TestProgramStateOldSaveSingleModel):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_npu(
-        ) else paddle.NPUPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_npu() else paddle.NPUPlace(0)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py
index 424c4ca0ff35d..f8db47345a7e4 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestScale(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "scale"
@@ -34,13 +36,14 @@ def setUp(self):
         self.init_dtype()
 
         self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(
+            'X':
+            OpTest.np_dtype_to_fluid_dtype(
                 np.random.random((10, 10)).astype(self.dtype))
         }
         self.attrs = {'scale': -2.3, 'bias': 0, 'bias_after_scale': True}
         self.outputs = {
-            'Out': (self.inputs['X'] *
-                    self.dtype(self.attrs['scale'])).astype(self.dtype)
+            'Out': (self.inputs['X'] * self.dtype(self.attrs['scale'])).astype(
+                self.dtype)
         }
 
     def set_npu(self):
@@ -54,21 +57,25 @@ def test_check_output(self):
 
 
 class TestFP16Scale(TestScale):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestScaleInt(TestScale):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestScaleInt64(TestScale):
+
     def init_dtype(self):
         self.dtype = np.int64
 
 
 class TestBiasAfterScale(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "scale"
@@ -76,7 +83,8 @@ def setUp(self):
         self.init_dtype()
 
         self.inputs = {
-            'X': OpTest.np_dtype_to_fluid_dtype(
+            'X':
+            OpTest.np_dtype_to_fluid_dtype(
                 np.random.random((10, 10)).astype(self.dtype))
         }
         self.attrs = {'scale': -2.3, 'bias': 0, 'bias_after_scale': False}
diff --git a/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py
index c353654641932..1eb85db274c63 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestCast1_FP32(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "scatter"
@@ -51,6 +53,7 @@ def test_check_output(self):
 
 
 class TestCast_INT32(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "scatter"
@@ -74,6 +77,7 @@ def test_check_output(self):
 
 
 class TestCast2_FP32(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "scatter"
@@ -97,6 +101,7 @@ def test_check_output(self):
 
 
 class TestCast3_FP32(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "scatter"
@@ -121,6 +126,7 @@ def test_check_output(self):
 
 
 class TestCast_INT64(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "scatter"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_seed_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_seed_op_npu.py
index 85a1e0594ba94..37d77e84dba8d 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_seed_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_seed_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestSeedOpFixSeed(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "seed"
@@ -42,6 +44,7 @@ def test_check_output(self):
 
 
 class TestSeedOpDiffSeed(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "seed"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py
index 21440de9fddd1..850dbfa1fd372 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class SequenceMaskTestBase(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -57,13 +59,13 @@ def setUp(self):
     def calc_ground_truth_mask(self):
         maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
         shape = self.x.shape + (maxlen, )
-        index_broadcast = np.broadcast_to(
-            np.reshape(
-                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
-            shape=shape)
-        x_broadcast = np.broadcast_to(
-            np.reshape(
-                self.x, newshape=self.x.shape + (-1, )), shape=shape)
+        index_broadcast = np.broadcast_to(np.reshape(
+            range(maxlen), newshape=[1] * self.x.ndim + [-1]),
+                                          shape=shape)
+        x_broadcast = np.broadcast_to(np.reshape(self.x,
+                                                 newshape=self.x.shape +
+                                                 (-1, )),
+                                      shape=shape)
         return (index_broadcast < x_broadcast).astype(self.mask_dtype)
 
     def test_check_output(self):
@@ -71,36 +73,43 @@ def test_check_output(self):
 
 
 class SequenceMaskTest1(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'bool'
 
 
 class SequenceMaskTest2(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'uint8'
 
 
 class SequenceMaskTest3(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'int32'
 
 
 class SequenceMaskTest4(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'float32'
 
 
 class SequenceMaskTest5(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'float64'
 
 
 class SequenceMaskTest6(SequenceMaskTestBase):
+
     def initParameters(self):
         self.maxlen = -1
 
 
 class SequenceMaskTestBase_tensor_attr(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -128,13 +137,13 @@ def setUp(self):
     def calc_ground_truth_mask(self):
         maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
         shape = self.x.shape + (maxlen, )
-        index_broadcast = np.broadcast_to(
-            np.reshape(
-                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
-            shape=shape)
-        x_broadcast = np.broadcast_to(
-            np.reshape(
-                self.x, newshape=self.x.shape + (-1, )), shape=shape)
+        index_broadcast = np.broadcast_to(np.reshape(
+            range(maxlen), newshape=[1] * self.x.ndim + [-1]),
+                                          shape=shape)
+        x_broadcast = np.broadcast_to(np.reshape(self.x,
+                                                 newshape=self.x.shape +
+                                                 (-1, )),
+                                      shape=shape)
         return (index_broadcast < x_broadcast).astype(self.mask_dtype)
 
     def test_check_output(self):
@@ -142,31 +151,37 @@ def test_check_output(self):
 
 
 class SequenceMaskTest1_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'bool'
 
 
 class SequenceMaskTest2_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'uint8'
 
 
 class SequenceMaskTest3_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'int32'
 
 
 class SequenceMaskTest4_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'float32'
 
 
 class SequenceMaskTest5_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'float64'
 
 
 class TestSequenceMaskOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.uniform(1, 5, [4]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py
index 421ea1df4cff0..969c7ee2fbca9 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,6 +26,7 @@
 
 
 class TestSetValueBase(unittest.TestCase):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -55,6 +57,7 @@ def _get_answer(self):
 
 
 class TestSetValueApi(TestSetValueBase):
+
     def _run_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(self.program):
@@ -80,17 +83,16 @@ def test_api(self):
         self._get_answer()
 
         error_msg = "\nIn {} mode: \nExpected res = \n{}, \n\nbut received : \n{}"
-        self.assertTrue(
-            (self.data == static_out).all(),
-            msg=error_msg.format("static", self.data, static_out))
-        self.assertTrue(
-            (self.data == dynamic_out).all(),
-            msg=error_msg.format("dynamic", self.data, dynamic_out))
+        self.assertTrue((self.data == static_out).all(),
+                        msg=error_msg.format("static", self.data, static_out))
+        self.assertTrue((self.data == dynamic_out).all(),
+                        msg=error_msg.format("dynamic", self.data, dynamic_out))
 
 
 # 1. Test different type of item: int, Python slice, Paddle Tensor
 # 1.1 item is int
 class TestSetValueItemInt(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0] = self.value
 
@@ -101,6 +103,7 @@ def _get_answer(self):
 # 1.2 item is slice
 # 1.2.1 step is 1
 class TestSetValueItemSlice(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:2] = self.value
 
@@ -109,6 +112,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSlice2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:-1] = self.value
 
@@ -117,6 +121,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSlice3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:-1, 0:2] = self.value
 
@@ -125,6 +130,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSlice4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, 1:2, :] = self.value
 
@@ -152,6 +158,7 @@ def _get_answer(self):
 
 # 1.2.2 step > 1
 class TestSetValueItemSliceStep(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [5, 5, 5]
 
@@ -163,6 +170,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceStep2(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [7, 5, 5]
 
@@ -174,6 +182,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceStep3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:-1, 0:2, ::2] = self.value
 
@@ -182,6 +191,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceStep4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, 1:2:2, :] = self.value
 
@@ -191,6 +201,7 @@ def _get_answer(self):
 
 # 1.2.3 step < 0
 class TestSetValueItemSliceNegetiveStep(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [5, 2]
 
@@ -205,6 +216,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceNegetiveStep2(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [5]
 
@@ -219,6 +231,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceNegetiveStep3(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [3]
 
@@ -233,6 +246,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceNegetiveStep4(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [3, 4, 5]
 
@@ -247,6 +261,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis1(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, ..., 1:] = self.value
 
@@ -255,6 +270,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, ...] = self.value
 
@@ -263,6 +279,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[..., 1:] = self.value
 
@@ -271,6 +288,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[...] = self.value
 
@@ -280,6 +298,7 @@ def _get_answer(self):
 
 # 1.4 item is Paddle Tensor
 class TestSetValueItemTensor(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         x[zero] = self.value
@@ -289,6 +308,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor2(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -299,6 +319,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor3(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -309,6 +330,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor4(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -319,6 +341,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor5(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -329,6 +352,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor6(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [3, 4, 5]
 
@@ -343,6 +367,7 @@ def _get_answer(self):
 
 # 1.5 item is None
 class TestSetValueItemNone1(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[None] = self.value
 
@@ -351,6 +376,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, None, 1] = self.value
 
@@ -359,6 +385,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[:, None, None, 1] = self.value
 
@@ -367,6 +394,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, 0, None, 1] = self.value
 
@@ -375,6 +403,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone5(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, None, 0, None, 1] = self.value
 
@@ -383,6 +412,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone6(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[None, 0, 0, None, 0] = self.value
 
@@ -391,6 +421,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone7(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[:, None, 1] = np.zeros(self.shape)[:, None, 0]
 
@@ -399,6 +430,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone8(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[:, 1, None] = np.zeros(self.shape)[:, 0, None]
 
@@ -407,6 +439,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone9(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[None, :, 1, ..., None] = np.zeros(self.shape)[0, 0, :, None]
 
@@ -416,6 +449,7 @@ def _get_answer(self):
 
 # 1.5 item is list or Tensor of bol
 class TestSetValueItemBool1(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[[True, False]] = self.value
 
@@ -424,6 +458,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[[False, False]] = self.value
 
@@ -432,6 +467,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[[False, True]] = np.zeros(self.shape[2])
 
@@ -440,6 +476,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool4(TestSetValueApi):
+
     def _call_setitem(self, x):
         idx = paddle.assign(np.array([False, True]))
         x[idx] = np.zeros(self.shape[2])
@@ -449,17 +486,19 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool5(TestSetValueApi):
+
     def _call_setitem(self, x):
         idx = paddle.assign(
             np.array([[False, True, False], [True, True, False]]))
         x[idx] = self.value
 
     def _get_answer(self):
-        self.data[np.array([[False, True, False], [True, True, False]
-                            ])] = self.value
+        self.data[np.array([[False, True, False], [True, True,
+                                                   False]])] = self.value
 
 
 class TestSetValueItemBool6(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, ...] = 0
         x[x > 0] = self.value
@@ -470,7 +509,9 @@ def _get_answer(self):
 
 
 def create_test_value_int32(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = 7
 
@@ -490,7 +531,9 @@ def set_dtype(self):
 
 
 def create_test_value_int64(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = 7
 
@@ -510,7 +553,9 @@ def set_dtype(self):
 
 
 def create_test_value_tensor_fp32(parent):
+
     class TestValueInt(parent):
+
         def set_dtype(self):
             self.dtype = "float32"
 
@@ -535,6 +580,7 @@ def _get_answer(self):
 
 # 3. Test different shape of value
 class TestSetValueValueShape1(TestSetValueApi):
+
     def set_value(self):
         self.value = np.array([3, 4, 5, 6])  # shape is (4,)
 
@@ -546,6 +592,7 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape2(TestSetValueApi):
+
     def set_value(self):
         self.value = np.array([[3, 4, 5, 6]])  # shape is (1,4)
 
@@ -557,9 +604,10 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape3(TestSetValueApi):
+
     def set_value(self):
-        self.value = np.array(
-            [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]])  # shape is (3,4)
+        self.value = np.array([[1, 1, 1, 1], [2, 2, 2, 2],
+                               [3, 3, 3, 3]])  # shape is (3,4)
 
     def _call_setitem(self, x):
         x[0] = self.value
@@ -569,10 +617,11 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape4(TestSetValueApi):
+
     def set_value(self):
-        self.value = np.array(
-            [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]]).astype(
-                self.dtype)  # shape is (3,4)
+        self.value = np.array([[1, 1, 1, 1], [2, 2, 2, 2],
+                               [3, 3, 3,
+                                3]]).astype(self.dtype)  # shape is (3,4)
 
     def _call_setitem(self, x):
         x[0] = paddle.assign(self.value)  # x is Paddle.Tensor
@@ -582,6 +631,7 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape5(TestSetValueApi):
+
     def set_value(self):
         self.value = np.array([3, 3, 3]).astype(self.dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py
index 99061cba8d270..1a3d0b1dbdff7 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py
@@ -15,6 +15,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -25,6 +26,7 @@
 
 
 class TestSGD(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -52,6 +54,7 @@ def test_check_output(self):
 
 
 class TestNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -66,8 +69,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -91,12 +95,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_shape_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_shape_op_npu.py
index 0adfb69cd63b5..a4f4275058587 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_shape_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_shape_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestShape(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "shape"
@@ -52,21 +54,25 @@ def test_check_output(self):
 
 
 class TestShape_fp16(TestShape):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestShape_double(TestShape):
+
     def init_dtype(self):
         self.dtype = np.float64
 
 
 class TestShape_int32(TestShape):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestShape_int64(TestShape):
+
     def init_dtype(self):
         self.dtype = np.int64
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_shard_index_op.py b/python/paddle/fluid/tests/unittests/npu/test_shard_index_op.py
index ce7e962624a46..afa3e1a5819be 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_shard_index_op.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_shard_index_op.py
@@ -18,6 +18,7 @@
 import numpy as np
 import math
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -25,6 +26,7 @@
 import paddle.fluid.framework as framework
 from paddle.fluid.framework import Program, program_guard
 import paddle
+
 paddle.enable_static()
 SEED = 2021
 
@@ -58,6 +60,7 @@ def common_setup(self, index_num, nshards, shard_id, ignore_value):
 
 
 class TestShardIndexShardId0Op(OpTest):
+
     def setUp(self):
         common_setup(self, 20, 2, 0, -1)
 
@@ -66,16 +69,19 @@ def test_check_output(self):
 
 
 class TestShardIndexShardId1Op(TestShardIndexShardId0Op):
+
     def setUp(self):
         common_setup(self, 20, 2, 1, -1)
 
 
 class TestShardIndexIgnoreValueOp(TestShardIndexShardId0Op):
+
     def setUp(self):
         common_setup(self, 20, 2, 0, -2)
 
 
 class TestShardIndexNotEvenlyDividedOp(TestShardIndexShardId0Op):
+
     def setUp(self):
         common_setup(self, 15, 2, 1, -1)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_cross_entropy_with_logits_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_cross_entropy_with_logits_op_npu.py
index 913633b725b02..777d96afdd8ce 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_cross_entropy_with_logits_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_cross_entropy_with_logits_op_npu.py
@@ -41,11 +41,13 @@ def setUp(self):
         batch_size = 64
         num_classes = 20
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype(self.dtype)),
-            'Label': np.random.randint(0, 2, (batch_size, num_classes))
-            .astype(self.dtype)
+            'X':
+            logit(
+                np.random.uniform(0, 1, (batch_size, num_classes)).astype(
+                    self.dtype)),
+            'Label':
+            np.random.randint(0, 2,
+                              (batch_size, num_classes)).astype(self.dtype)
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
@@ -72,8 +74,8 @@ def init_dtype(self):
 
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
-class TestSigmoidCrossEntropyWithLogitsOp3(
-        TestSigmoidCrossEntropyWithLogitsOp1):
+class TestSigmoidCrossEntropyWithLogitsOp3(TestSigmoidCrossEntropyWithLogitsOp1
+                                           ):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
     """
 
@@ -85,11 +87,13 @@ def setUp(self):
         batch_size = 64
         num_classes = 20
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype(self.dtype)),
-            'Label': np.random.uniform(0, 1, (batch_size, num_classes))
-            .astype(self.dtype)
+            'X':
+            logit(
+                np.random.uniform(0, 1, (batch_size, num_classes)).astype(
+                    self.dtype)),
+            'Label':
+            np.random.uniform(0, 1,
+                              (batch_size, num_classes)).astype(self.dtype)
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
@@ -103,8 +107,8 @@ def setUp(self):
 
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
-class TestSigmoidCrossEntropyWithLogitsOp5(
-        TestSigmoidCrossEntropyWithLogitsOp1):
+class TestSigmoidCrossEntropyWithLogitsOp5(TestSigmoidCrossEntropyWithLogitsOp1
+                                           ):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
     """
 
@@ -116,11 +120,14 @@ def setUp(self):
         batch_size = [10, 10]
         num_classes = 20
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype(self.dtype)),
-            'Label': np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-            .astype(self.dtype)
+            'X':
+            logit(
+                np.random.uniform(0, 1,
+                                  tuple(batch_size + [num_classes])).astype(
+                                      self.dtype)),
+            'Label':
+            np.random.uniform(0, 1, tuple(batch_size + [num_classes])).astype(
+                self.dtype)
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
@@ -134,8 +141,8 @@ def setUp(self):
 
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
-class TestSigmoidCrossEntropyWithLogitsOp6(
-        TestSigmoidCrossEntropyWithLogitsOp1):
+class TestSigmoidCrossEntropyWithLogitsOp6(TestSigmoidCrossEntropyWithLogitsOp1
+                                           ):
     """Test sigmoid_cross_entropy_with_logit_op with binary label
     """
 
@@ -147,11 +154,14 @@ def setUp(self):
         batch_size = [10, 10]
         num_classes = 20
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype(self.dtype)),
-            'Label': np.random.randint(0, 2, tuple(batch_size + [num_classes]))
-            .astype(self.dtype)
+            'X':
+            logit(
+                np.random.uniform(0, 1,
+                                  tuple(batch_size + [num_classes])).astype(
+                                      self.dtype)),
+            'Label':
+            np.random.randint(0, 2, tuple(batch_size + [num_classes])).astype(
+                self.dtype)
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py
index 489f8bfb116a1..4525fc411053c 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py
@@ -28,6 +28,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestNPUSigmoid(OpTest):
+
     def setUp(self):
         self.op_type = "sigmoid"
         self.set_npu()
@@ -44,8 +45,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', max_relative_error=0.01)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.01)
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -58,6 +60,7 @@ def init_dtype(self):
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestNPUSigmoidFp16(TestNPUSigmoid):
+
     def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-3)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py
index 437f5c35e9702..da1fd633a48d6 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -30,7 +30,9 @@
 
 
 def test_class(op_type, typename):
+
     class TestSin(OpTest):
+
         def setUp(self):
             self.op_type = "sin"
             self.__class__.use_npu = True
diff --git a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py
index 80721cbd66a55..76fc5846534ac 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 import paddle
 import paddle.fluid as fluid
@@ -24,6 +25,7 @@
 
 
 class TestSizeOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -46,42 +48,49 @@ def set_npu(self):
 
 
 class TestSizeOp1(TestSizeOp):
+
     def config(self):
         self.shape = [2]
         self.dtype = np.float64
 
 
 class TestSizeOp2(TestSizeOp):
+
     def config(self):
         self.shape = [2, 3]
         self.dtype = np.float32
 
 
 class TestSizeOp3(TestSizeOp):
+
     def config(self):
         self.shape = [2, 3, 100]
         self.dtype = np.float16
 
 
 class TestSizeOp4(TestSizeOp):
+
     def config(self):
         self.shape = [2**10]
         self.dtype = np.bool
 
 
 class TestSizeOp5(TestSizeOp):
+
     def config(self):
         self.shape = [7, 8, 9, 10]
         self.dtype = np.int64
 
 
 class TestSizeOp6(TestSizeOp):
+
     def config(self):
         self.shape = []
         self.dtype = np.int64
 
 
 class TestSizeAPI(unittest.TestCase):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -107,10 +116,12 @@ def test_size_static(self):
                 "x_2": input_2,
             },
                                    fetch_list=[out_1, out_2])
-            assert (np.array_equal(
-                res_1, np.array([np.size(input_1)]).astype("int64")))
-            assert (np.array_equal(
-                res_2, np.array([np.size(input_2)]).astype("int64")))
+            assert (np.array_equal(res_1,
+                                   np.array([np.size(input_1)
+                                             ]).astype("int64")))
+            assert (np.array_equal(res_2,
+                                   np.array([np.size(input_2)
+                                             ]).astype("int64")))
 
     def test_size_imperative(self):
         paddle.disable_static(self.place)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py
index a5b203b6eea2a..e0ad94361ad4c 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestSliceOp(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_npu()
@@ -65,13 +67,15 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['Input'], 'Out', max_relative_error=0.02)
+            self.check_grad_with_place(self.place, ['Input'],
+                                       'Out',
+                                       max_relative_error=0.02)
         else:
             self.check_grad_with_place(self.place, ['Input'], 'Out')
 
 
 class TestSliceOp2(TestSliceOp):
+
     def config(self):
         self.input = np.random.random([10, 5, 6]).astype(self.dtype)
         self.starts = [0]
@@ -82,6 +86,7 @@ def config(self):
 
 
 class TestSliceOpFp16(TestSliceOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -92,6 +97,7 @@ def set_npu(self):
 
 
 class TestSliceOpTensor(TestSliceOp):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_npu()
@@ -120,6 +126,7 @@ def config(self):
 
 
 class TestSliceOpTensor2(TestSliceOpTensor):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_npu()
@@ -148,6 +155,7 @@ def config(self):
 
 
 class TestSliceOpFp16Tensor(TestSliceOpTensor):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -158,6 +166,7 @@ def set_npu(self):
 
 
 class TestSliceOpTensorList(TestSliceOp):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_npu()
@@ -197,6 +206,7 @@ def config(self):
 
 
 class TestSliceOpTensorList2(TestSliceOpTensorList):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_npu()
@@ -236,6 +246,7 @@ def config(self):
 
 
 class TestSliceOpFp16TensorList(TestSliceOpTensorList):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -246,6 +257,7 @@ def set_npu(self):
 
 
 class TestSliceNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -262,8 +274,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=data_shape, dtype='float32')
             b = paddle.static.data(name="b", shape=data_shape, dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[batch_size, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[batch_size, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.slice(sum, axes=[0, 1], starts=[0, 0], ends=[33, 2])
@@ -286,12 +299,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(EPOCH):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
@@ -307,6 +321,7 @@ def test_npu(self):
 
 
 class TestSliceOpDecsDim(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_npu()
@@ -352,18 +367,21 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            self.check_grad_with_place(
-                self.place, ['Input'], 'Out', max_relative_error=0.5)
+            self.check_grad_with_place(self.place, ['Input'],
+                                       'Out',
+                                       max_relative_error=0.5)
         else:
             self.check_grad_with_place(self.place, ['Input'], 'Out')
 
 
 class TestSliceOpDecsDimFp16(TestSliceOpDecsDim):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestSliceOpDecsDim2(TestSliceOpDecsDim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
         self.starts = [1, 0, 2]
@@ -375,6 +393,7 @@ def config(self):
 
 
 class TestSliceOpDecsDim3(TestSliceOpDecsDim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
         self.starts = [-1, 0, 2]
@@ -386,6 +405,7 @@ def config(self):
 
 
 class TestSliceOpDecsDim4(TestSliceOpDecsDim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype)
         self.starts = [0, 1, 2, 3]
@@ -397,6 +417,7 @@ def config(self):
 
 
 class TestSliceOpDecsDim5(TestSliceOpDecsDim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
         self.starts = [-1]
@@ -408,6 +429,7 @@ def config(self):
 
 
 class TestSliceOpDecsDim6(TestSliceOpDecsDim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
         self.starts = [0, 1, 2, 3]
@@ -419,11 +441,11 @@ def config(self):
 
 
 class TestSliceOpDecsDimStartsTensor(TestSliceOpDecsDim):
+
     def set_inputs(self):
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype='int32')
+            "StartsTensor": np.array(self.starts, dtype='int32')
         }
 
     def set_attrs(self):
@@ -446,18 +468,18 @@ def config(self):
 
 
 class TestSliceOpDecsDimStartsTensorFP16(TestSliceOpDecsDimStartsTensor):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestSliceOpDecsDimStartsTensorStartsAndEndsTensor(TestSliceOpDecsDim):
+
     def set_inputs(self):
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype='int64'),
-            "EndsTensor": np.array(
-                self.ends, dtype='int32')
+            "StartsTensor": np.array(self.starts, dtype='int64'),
+            "EndsTensor": np.array(self.ends, dtype='int32')
         }
 
     def set_attrs(self):
@@ -481,11 +503,13 @@ def config(self):
 
 class TestSliceOpDecsDimStartsTensorStartsAndEndsTensorFP16(
         TestSliceOpDecsDimStartsTensorStartsAndEndsTensor):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestSliceOpDecsDimStartsListTensor(TestSliceOpDecsDim):
+
     def set_inputs(self):
         starts_tensor = []
         for index, ele in enumerate(self.starts):
@@ -516,6 +540,7 @@ def config(self):
 
 
 class TestSliceOpDecsDimStartsListTensor2(TestSliceOpDecsDimStartsListTensor):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
         self.starts = [-1]
@@ -528,13 +553,15 @@ def config(self):
         self.starts_infer = [-1]
 
 
-class TestSliceOpDecsDimStartsListTensorFP16(
-        TestSliceOpDecsDimStartsListTensor):
+class TestSliceOpDecsDimStartsListTensorFP16(TestSliceOpDecsDimStartsListTensor
+                                             ):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestSliceOpInt64(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -554,8 +581,8 @@ def setUp(self):
         }
 
     def config(self):
-        self.input = np.random.randint(
-            100, size=(3, 4, 5, 6)).astype(self.dtype)
+        self.input = np.random.randint(100,
+                                       size=(3, 4, 5, 6)).astype(self.dtype)
         self.starts = [1, 0, 2]
         self.ends = [3, 3, 4]
         self.axes = [0, 1, 2]
@@ -570,6 +597,7 @@ def test_check_output(self):
 
 
 class TestSliceOpTensorInt64(TestSliceOpInt64):
+
     def setUp(self):
         self.op_type = "slice"
         self.set_npu()
@@ -589,8 +617,8 @@ def setUp(self):
         }
 
     def config(self):
-        self.input = np.random.randint(
-            100, size=(3, 4, 5, 6)).astype(self.dtype)
+        self.input = np.random.randint(100,
+                                       size=(3, 4, 5, 6)).astype(self.dtype)
         self.starts = np.array([1, 0, 2]).astype('int32')
         self.ends = np.array([3, 3, 4]).astype('int32')
         self.axes = [0, 1, 2]
diff --git a/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
index 8c20f25061b85..1ba4e711c1df0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py
@@ -17,10 +17,12 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
+
 paddle.enable_static()
 
 
@@ -33,6 +35,7 @@ def smooth_l1_loss_forward(val, sigma2):
 
 
 class TestSmoothL1LossOp1(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -60,25 +63,25 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', max_relative_error=0.02)
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   max_relative_error=0.02)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set("X"))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set('Y'))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set('Y'))
 
 
 class TestSmoothL1LossOp2(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -110,32 +113,34 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        self.check_grad_with_place(
-            self.place, ['X', 'Y'], 'Out', max_relative_error=0.03)
+        self.check_grad_with_place(self.place, ['X', 'Y'],
+                                   'Out',
+                                   max_relative_error=0.03)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad_with_place(
-            self.place, ['Y'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set(['X', 'InsideWeight', 'OutsideWeight']))
+        self.check_grad_with_place(self.place, ['Y'],
+                                   'Out',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(
+                                       ['X', 'InsideWeight', 'OutsideWeight']))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set(['Y', 'InsideWeight', 'OutsideWeight']))
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(
+                                       ['Y', 'InsideWeight', 'OutsideWeight']))
 
 
 class TestSmoothL1LossOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             # The input type of accuracy_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.NPUPlace(0))
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.NPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.NPUPlace(0))
+            y1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.NPUPlace(0))
             self.assertRaises(TypeError, fluid.layers.smooth_l1, x1, y1)
             # The input dtype of accuracy_op must be float32 or float64.
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py
index f2a9ef2bee074..9d734eac48be0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestSoftmax(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -53,6 +55,7 @@ def test_check_output(self):
 
 
 class TestSoftmaxNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -67,8 +70,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[4, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[4, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[4, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[4, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
             d = paddle.sqrt(c)
@@ -97,12 +101,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py
index f0ca778834576..f6f3d746d8089 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestSoftmaxWithCrossEntropyOp(OpTest):
+
     def set_npu(self):
         self.__class__.use_npu = True
 
@@ -88,14 +90,14 @@ def test_check_output(self):
 
     def test_check_grad(self):
         # fp32 has low precision, cpu and npu both need to relax the max_relative_error if using fp32
-        self.check_grad_with_place(
-            self.place, ['Logits'],
-            'Loss',
-            numeric_grad_delta=0.001,
-            max_relative_error=0.5)
+        self.check_grad_with_place(self.place, ['Logits'],
+                                   'Loss',
+                                   numeric_grad_delta=0.001,
+                                   max_relative_error=0.5)
 
 
 class TestPowNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -110,8 +112,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             sum = paddle.add(a, b)
             z = paddle.pow(sum, 2.0)
@@ -135,12 +138,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py
index fd48ec958e4a4..3a06e0566d4dc 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -30,6 +31,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
 class TestCase1(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.set_example()
@@ -37,8 +39,9 @@ def setUp(self):
         self.place = paddle.NPUPlace(0)
         ipt = self.x.astype(self.dtype)
         axis = self.axis if isinstance(self.axis, int) else int(self.axis[0])
-        tmp_outs = np.split(
-            ipt, axis=axis, indices_or_sections=self.num_or_sections)
+        tmp_outs = np.split(ipt,
+                            axis=axis,
+                            indices_or_sections=self.num_or_sections)
         tmp_outs = [o.astype(self.dtype) for o in tmp_outs]
         self.outputs = {'Out': []}
         self.outs = []
@@ -68,6 +71,7 @@ def set_example(self):
 
 
 class TestCase2(TestCase1):
+
     def set_example(self):
         self.dtype = "float32"
         self.x = np.random.random((20, 4, 50))
@@ -76,6 +80,7 @@ def set_example(self):
 
 
 class TestCase4(TestCase1):
+
     def set_example(self):
         self.dtype = "float16"
         self.x = np.random.random((4, 50, 20))
@@ -85,6 +90,7 @@ def set_example(self):
 
 # Test Sections
 class TestCase5(TestCase1):
+
     def set_example(self):
         super().set_example()
         self.x = np.random.random((2, 10, 4))
@@ -97,6 +103,7 @@ def setUp(self):
 
 
 class API_TestSplit(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.layers.data('data', shape=[-1, 10], dtype='float32')
@@ -111,6 +118,7 @@ def test_out(self):
 
 
 class API_TestSplit2(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.layers.data('data', shape=[-1, 10], dtype='float32')
@@ -125,6 +133,7 @@ def test_out(self):
 
 
 class API_TestDygraphSplit(unittest.TestCase):
+
     def test_out1(self):
         with fluid.dygraph.guard(paddle.NPUPlace(0)):
             input_1 = np.random.random([4, 6, 6]).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py
index 24b34fa625c63..0ac775135e3b6 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestSqrt(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "sqrt"
@@ -58,6 +60,7 @@ def test_check_grad(self):
 
 
 class TestSqrtFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "sqrt"
@@ -84,6 +87,7 @@ def test_check_output(self):
 
 
 class TestSqrtNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -98,8 +102,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
             d = paddle.sqrt(c)
@@ -123,12 +128,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py
index 170f6b6ca4f93..49dd0c94eb07d 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestSquare(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "square"
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestSquareFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "square"
@@ -81,6 +84,7 @@ def test_check_output(self):
 
 
 class TestSquareNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -95,8 +99,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
             d = paddle.square(c)
@@ -120,12 +125,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py
index d3ee8df1cd106..af2853492085a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py
@@ -18,6 +18,7 @@
 import unittest
 from numpy import linalg as LA
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -47,10 +48,9 @@ def test_check_output(self):
         self.check_output_with_place(place=self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            max_relative_error=self.max_relative_error)
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=self.max_relative_error)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py
index 2e741c8d8a5ef..827fb0344d84b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -30,13 +31,16 @@
 
 
 class TestSqueezeOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "squeeze"
         self.init_test_case()
         self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
         self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape), }
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+        }
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -60,6 +64,7 @@ def init_attrs(self):
 
 
 class TestSqueezeOp1(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 3, 1, 40)
         self.axes = (0, -2)
@@ -70,16 +75,18 @@ def init_test_case(self):
 
 
 class TestSqueezeOp2(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = ()
         self.new_shape = (20, 5)
 
 
-# Correct: Just part of axes be squeezed. 
+# Correct: Just part of axes be squeezed.
 
 
 class TestSqueezeOp3(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, -1)
@@ -90,6 +97,7 @@ def init_test_case(self):
 
 
 class TestSqueezeOp4(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, 2)
@@ -97,12 +105,13 @@ def init_test_case(self):
 
 
 class TestSqueezeOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input type of softmax_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], paddle.NPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         paddle.NPUPlace(0))
             self.assertRaises(TypeError, paddle.squeeze, x1)
             # The input axes of squeeze must be list.
             x2 = paddle.static.data(name='x2', shape=[4], dtype="int32")
@@ -113,6 +122,7 @@ def test_errors(self):
 
 
 class API_TestSqueeze(unittest.TestCase):
+
     def setUp(self):
         self.executed_api()
 
@@ -123,8 +133,9 @@ def test_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            data1 = paddle.static.data(
-                'data1', shape=[-1, 1, 10], dtype='float64')
+            data1 = paddle.static.data('data1',
+                                       shape=[-1, 1, 10],
+                                       dtype='float64')
             result_squeeze = self.squeeze(data1, axis=[1])
             place = paddle.NPUPlace(0)
             exe = paddle.static.Executor(place)
@@ -136,11 +147,13 @@ def test_out(self):
 
 
 class API_TestStaticSqueeze_(API_TestSqueeze):
+
     def executed_api(self):
         self.squeeze = paddle.squeeze_
 
 
 class API_TestDygraphSqueeze(unittest.TestCase):
+
     def setUp(self):
         self.executed_api()
 
@@ -199,12 +212,14 @@ def test_dimension_not_1(self):
 
 
 class API_TestDygraphSqueezeInplace(API_TestDygraphSqueeze):
+
     def executed_api(self):
         self.squeeze = paddle.squeeze_
 
 
 # Correct: General.
 class TestSqueeze2Op(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "squeeze2"
@@ -220,8 +235,8 @@ def set_npu(self):
         self.__class__.use_npu = True
 
     def test_check_output(self):
-        self.check_output_with_place(
-            paddle.NPUPlace(0), no_check_set=['XShape'])
+        self.check_output_with_place(paddle.NPUPlace(0),
+                                     no_check_set=['XShape'])
 
     def test_check_grad(self):
         self.check_grad_with_place(paddle.NPUPlace(0), ["X"], "Out")
@@ -237,6 +252,7 @@ def init_attrs(self):
 
 # Correct: There is mins axis.
 class TestSqueeze2Op1(TestSqueeze2Op):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = (0, -2)
@@ -245,14 +261,16 @@ def init_test_case(self):
 
 # Correct: No axes input.
 class TestSqueeze2Op2(TestSqueeze2Op):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = ()
         self.new_shape = (20, 5)
 
 
-# Correct: Just part of axes be squeezed. 
+# Correct: Just part of axes be squeezed.
 class TestSqueeze2Op3(TestSqueeze2Op):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, -1)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py
index af5648f8f39ef..ae20f642a2802 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestStackOpBase(OpTest):
+
     def initDefaultParameters(self):
         self.num_inputs = 4
         self.input_dim = (5, 6, 7)
@@ -78,41 +80,49 @@ def test_check_grad(self):
 
 
 class TestStackOp1(TestStackOpBase):
+
     def initParameters(self):
         self.num_inputs = 16
 
 
 class TestStackOp2(TestStackOpBase):
+
     def initParameters(self):
         self.num_inputs = 20
 
 
 class TestStackOp3(TestStackOpBase):
+
     def initParameters(self):
         self.axis = -1
 
 
 class TestStackOp4(TestStackOpBase):
+
     def initParameters(self):
         self.axis = -4
 
 
 class TestStackOp5(TestStackOpBase):
+
     def initParameters(self):
         self.axis = 1
 
 
 class TestStackOp6(TestStackOpBase):
+
     def initParameters(self):
         self.axis = 3
 
 
 class TestStackOpINT32(TestStackOpBase):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestStackOpINT64(TestStackOpBase):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -148,9 +158,8 @@ def test_case(self):
         exe = fluid.Executor(self.place)
         res = exe.run(self.program, fetch_list=self.out_var)
         self.assertTrue(
-            np.array_equal(
-                res[0], np.stack(
-                    [self.x] * self.iter_num, axis=self.axis)))
+            np.array_equal(res[0],
+                           np.stack([self.x] * self.iter_num, axis=self.axis)))
 
 
 class TestTensorStackAPIWithLoDTensorArray(unittest.TestCase):
@@ -184,12 +193,12 @@ def test_case(self):
         exe = fluid.Executor(self.place)
         res = exe.run(self.program, fetch_list=self.out_var)
         self.assertTrue(
-            np.array_equal(
-                res[0], np.stack(
-                    [self.x] * self.iter_num, axis=self.axis)))
+            np.array_equal(res[0],
+                           np.stack([self.x] * self.iter_num, axis=self.axis)))
 
 
 class API_test(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[1, 2], dtype='float32')
@@ -201,11 +210,12 @@ def test_out(self):
             input1 = np.random.random([1, 2]).astype('float32')
             input2 = np.random.random([1, 2]).astype('float32')
             input3 = np.random.random([1, 2]).astype('float32')
-            result, = exe.run(
-                feed={"data1": input1,
-                      "data2": input2,
-                      "data3": input3},
-                fetch_list=[result_stack])
+            result, = exe.run(feed={
+                "data1": input1,
+                "data2": input2,
+                "data3": input3
+            },
+                              fetch_list=[result_stack])
             expected_result = np.stack([input1, input2, input3], axis=0)
             self.assertTrue(np.allclose(expected_result, result))
 
@@ -216,6 +226,7 @@ def test_single_tensor_error(self):
 
 
 class API_DygraphTest(unittest.TestCase):
+
     def test_out(self):
         data1 = np.array([[1.0, 2.0]])
         data2 = np.array([[3.0, 4.0]])
diff --git a/python/paddle/fluid/tests/unittests/npu/test_strided_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_strided_slice_op_npu.py
index 1260017da939c..bf32653455c22 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_strided_slice_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_strided_slice_op_npu.py
@@ -14,6 +14,7 @@
 
 import sys
 import numpy as np
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import unittest
@@ -57,13 +58,15 @@ def strided_slice_native_forward(input, axes, starts, ends, strides):
 
 
 class TestStridedSliceOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.set_npu()
         self.place = paddle.NPUPlace(0)
         self.op_type = 'strided_slice'
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
         self.inputs = {'Input': self.input}
         self.outputs = {'Out': self.output}
@@ -94,6 +97,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp1(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(100)
         self.axes = [0]
@@ -104,6 +108,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp2(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(100)
         self.axes = [0]
@@ -114,6 +119,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp3(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(100)
         self.axes = [0]
@@ -124,6 +130,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp4(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 4, 10)
         self.axes = [0, 1, 2]
@@ -134,6 +141,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp5(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(5, 5, 5)
         self.axes = [0, 1, 2]
@@ -144,6 +152,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp6(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(5, 5, 5)
         self.axes = [0, 1, 2]
@@ -154,6 +163,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp7(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(5, 5, 5)
         self.axes = [0, 1, 2]
@@ -164,6 +174,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp8(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(1, 100, 1)
         self.axes = [1]
@@ -174,6 +185,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp9(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(1, 100, 1)
         self.axes = [1]
@@ -184,6 +196,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp10(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(10, 10)
         self.axes = [0, 1]
@@ -194,6 +207,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp11(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4)
         self.axes = [0, 1, 2, 3]
@@ -204,6 +218,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp12(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4, 5)
         self.axes = [0, 1, 2, 3, 4]
@@ -214,6 +229,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp13(TestStridedSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 6, 7, 8)
         self.axes = [0, 1, 2, 3, 4, 5]
@@ -224,11 +240,13 @@ def initTestCase(self):
 
 
 class TestStridedSliceOpBool(TestStridedSliceOp):
+
     def test_check_grad(self):
         pass
 
 
 class TestStridedSliceOpBool1D(TestStridedSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(100).astype("bool")
         self.axes = [0]
@@ -239,6 +257,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOpBool2D(TestStridedSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(10, 10).astype("bool")
         self.axes = [0, 1]
@@ -249,6 +268,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOpBool3D(TestStridedSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 4, 10).astype("bool")
         self.axes = [0, 1, 2]
@@ -259,6 +279,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOpBool4D(TestStridedSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4).astype("bool")
         self.axes = [0, 1, 2, 3]
@@ -269,6 +290,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOpBool5D(TestStridedSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4, 5).astype("bool")
         self.axes = [0, 1, 2, 3, 4]
@@ -279,6 +301,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOpBool6D(TestStridedSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 6, 7, 8).astype("bool")
         self.axes = [0, 1, 2, 3, 4, 5]
@@ -289,6 +312,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp_starts_ListTensor(OpTest):
+
     def setUp(self):
         self.place = paddle.NPUPlace(0)
         self.op_type = "strided_slice"
@@ -320,8 +344,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [1, -1, 1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
         self.starts_infer = [1, 10, 2]
 
@@ -333,6 +358,7 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_ends_ListTensor(OpTest):
+
     def setUp(self):
         self.place = paddle.NPUPlace(0)
         self.op_type = "strided_slice"
@@ -364,8 +390,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 2]
         self.infer_flags = [1, -1, 1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
         self.ends_infer = [3, 1, 4]
 
@@ -377,6 +404,7 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_starts_Tensor(OpTest):
+
     def setUp(self):
         self.place = paddle.NPUPlace(0)
         self.op_type = "strided_slice"
@@ -385,8 +413,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int32")
         }
         self.outputs = {'Out': self.output}
         self.attrs = {
@@ -407,8 +434,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output_with_place(self.place)
@@ -418,6 +446,7 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_ends_Tensor(OpTest):
+
     def setUp(self):
         self.place = paddle.NPUPlace(0)
         self.op_type = "strided_slice"
@@ -426,8 +455,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
+            "EndsTensor": np.array(self.ends, dtype="int32")
         }
         self.outputs = {'Out': self.output}
         self.attrs = {
@@ -448,8 +476,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output_with_place(self.place)
@@ -459,6 +488,7 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_listTensor_Tensor(OpTest):
+
     def setUp(self):
         self.place = paddle.NPUPlace(0)
         self.op_type = "strided_slice"
@@ -472,8 +502,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
+            "StartsTensor": np.array(self.starts, dtype="int32"),
             "EndsTensorList": ends_tensor
         }
         self.outputs = {'Out': self.output}
@@ -495,8 +524,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output_with_place(self.place)
@@ -506,6 +536,7 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_strides_Tensor(OpTest):
+
     def setUp(self):
         self.place = paddle.NPUPlace(0)
         self.op_type = "strided_slice"
@@ -514,8 +545,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "StridesTensor": np.array(
-                self.strides, dtype="int32")
+            "StridesTensor": np.array(self.strides, dtype="int32")
         }
         self.outputs = {'Out': self.output}
         self.attrs = {
@@ -536,8 +566,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, -1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output_with_place(self.place)
@@ -548,42 +579,48 @@ def test_check_grad_normal(self):
 
     # Test python API
 class TestStridedSliceAPI(unittest.TestCase):
+
     def test_1(self):
         input = np.random.random([3, 4, 5, 6]).astype("float64")
         minus_1 = fluid.layers.fill_constant([1], "int32", -1)
         minus_3 = fluid.layers.fill_constant([1], "int32", -3)
-        starts = fluid.layers.data(
-            name='starts', shape=[3], dtype='int32', append_batch_size=False)
-        ends = fluid.layers.data(
-            name='ends', shape=[3], dtype='int32', append_batch_size=False)
-        strides = fluid.layers.data(
-            name='strides', shape=[3], dtype='int32', append_batch_size=False)
-
-        x = fluid.layers.data(
-            name="x",
-            shape=[3, 4, 5, 6],
-            append_batch_size=False,
-            dtype="float64")
-        out_1 = fluid.layers.strided_slice(
-            x,
-            axes=[0, 1, 2],
-            starts=[-3, 0, 2],
-            ends=[3, 100, -1],
-            strides=[1, 1, 1])
-        out_2 = fluid.layers.strided_slice(
-            x,
-            axes=[0, 1, 3],
-            starts=[minus_3, 0, 2],
-            ends=[3, 100, -1],
-            strides=[1, 1, 1])
-        out_3 = fluid.layers.strided_slice(
-            x,
-            axes=[0, 1, 3],
-            starts=[minus_3, 0, 2],
-            ends=[3, 100, minus_1],
-            strides=[1, 1, 1])
-        out_4 = fluid.layers.strided_slice(
-            x, axes=[0, 1, 2], starts=starts, ends=ends, strides=strides)
+        starts = fluid.layers.data(name='starts',
+                                   shape=[3],
+                                   dtype='int32',
+                                   append_batch_size=False)
+        ends = fluid.layers.data(name='ends',
+                                 shape=[3],
+                                 dtype='int32',
+                                 append_batch_size=False)
+        strides = fluid.layers.data(name='strides',
+                                    shape=[3],
+                                    dtype='int32',
+                                    append_batch_size=False)
+
+        x = fluid.layers.data(name="x",
+                              shape=[3, 4, 5, 6],
+                              append_batch_size=False,
+                              dtype="float64")
+        out_1 = fluid.layers.strided_slice(x,
+                                           axes=[0, 1, 2],
+                                           starts=[-3, 0, 2],
+                                           ends=[3, 100, -1],
+                                           strides=[1, 1, 1])
+        out_2 = fluid.layers.strided_slice(x,
+                                           axes=[0, 1, 3],
+                                           starts=[minus_3, 0, 2],
+                                           ends=[3, 100, -1],
+                                           strides=[1, 1, 1])
+        out_3 = fluid.layers.strided_slice(x,
+                                           axes=[0, 1, 3],
+                                           starts=[minus_3, 0, 2],
+                                           ends=[3, 100, minus_1],
+                                           strides=[1, 1, 1])
+        out_4 = fluid.layers.strided_slice(x,
+                                           axes=[0, 1, 2],
+                                           starts=starts,
+                                           ends=ends,
+                                           strides=strides)
 
         out_5 = x[-3:3, 0:100:2, -1:2:-1]
         out_6 = x[minus_3:3:1, 0:100:2, :, minus_1:2:minus_1]
@@ -613,8 +650,11 @@ def test_dygraph_op(self):
         starts = [-3, 0, 2]
         ends = [3, 2, 4]
         strides_1 = [1, 1, 1]
-        sliced_1 = paddle.strided_slice(
-            x, axes=axes, starts=starts, ends=ends, strides=strides_1)
+        sliced_1 = paddle.strided_slice(x,
+                                        axes=axes,
+                                        starts=starts,
+                                        ends=ends,
+                                        strides=strides_1)
         assert sliced_1.shape == (3, 2, 2, 2)
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
index 1ea8504ceec01..eb2594206aeed 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestSum1(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.init_dtype()
@@ -54,6 +56,7 @@ def test_check_output(self):
 
 
 class TestSum2(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.init_dtype()
@@ -66,7 +69,7 @@ def setUp(self):
         x3 = np.random.random((3, 3)).astype(self.dtype)
         self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]}
         # There will be a problem if just using `y=x0+x1+x2+x3` to calculate the
-        # summation result as the reference standard result. The reason is that 
+        # summation result as the reference standard result. The reason is that
         # numpy's fp16 data has precision loss when doing `add` operation.
         # For example, the results of `x0+x1+x2+x3` is different from that of
         # `x3+x2+x1+x0` if the dtype is fp16.
@@ -88,6 +91,7 @@ def test_check_output(self):
 
 
 class TestSum3(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.init_dtype()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_swish_op_npu.py
index c7c488625be9e..3267820eff5ae 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_swish_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_swish_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from paddle.fluid.tests.unittests.op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestSwishOp(OpTest):
+
     def setUp(self):
         self.op_type = "swish"
         self.set_npu()
@@ -49,11 +51,10 @@ def test_check_grad(self):
         dx = beta * out + expit(x) * (1 - beta * out)
         dx = dx / x.size
 
-        self.check_grad_with_place(
-            self.place, ['X'],
-            'Out',
-            max_relative_error=0.01,
-            user_defined_grads=[dx])
+        self.check_grad_with_place(self.place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.01,
+                                   user_defined_grads=[dx])
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -64,6 +65,7 @@ def init_dtype(self):
 
 
 class TestSwishOpFp16(TestSwishOp):
+
     def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-3)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py
index dfd8680c4424e..4d81e97a9d070 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_base_npu.py
@@ -20,6 +20,7 @@
 import os
 import six
 import sys
+
 sys.path.append("..")
 import subprocess
 import traceback
@@ -43,6 +44,7 @@
 
 
 class TestSyncBatchNormRunnerBase(object):
+
     def get_model(self,
                   main,
                   startup,
@@ -61,9 +63,8 @@ def wait_server_ready(self, endpoints):
             not_ready_endpoints = []
             for ep in endpoints:
                 ip_port = ep.split(":")
-                with closing(
-                        socket.socket(socket.AF_INET,
-                                      socket.SOCK_STREAM)) as sock:
+                with closing(socket.socket(socket.AF_INET,
+                                           socket.SOCK_STREAM)) as sock:
                     sock.settimeout(2)
                     sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                     if hasattr(socket, 'SO_REUSEPORT'):
@@ -76,13 +77,14 @@ def wait_server_ready(self, endpoints):
                         not_ready_endpoints.append(ep)
             if not all_ok:
                 sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-                sys.stderr.write("not ready endpoints:" + str(
-                    not_ready_endpoints) + "\n")
+                sys.stderr.write("not ready endpoints:" +
+                                 str(not_ready_endpoints) + "\n")
                 sys.stderr.flush()
                 time.sleep(3)
             else:
                 break
 
+
 #endpoints should be ["ip1:port1","ip2:port2"]
 
     def initCommunicator(self, program, rank, nranks, wait_port,
@@ -92,29 +94,26 @@ def initCommunicator(self, program, rank, nranks, wait_port,
         if rank == 0 and wait_port:
             self.wait_server_ready(other_endpoints)
         block = program.global_block()
-        hccl_id_var = block.create_var(
-            name=nameGen.generate('hccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_hccl_id',
-            inputs={},
-            outputs={'Out': hccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints
-            })
-        block.append_op(
-            type='c_comm_init_hccl',
-            inputs={'X': hccl_id_var},
-            outputs={},
-            attrs={
-                'rank': rank,
-                'ring_id': self.global_ring_id,
-                'device_id': int(os.getenv("FLAGS_selected_npus")),
-                'rank_ids': nranks
-            })
+        hccl_id_var = block.create_var(name=nameGen.generate('hccl_id'),
+                                       persistable=True,
+                                       type=core.VarDesc.VarType.RAW)
+        block.append_op(type='c_gen_hccl_id',
+                        inputs={},
+                        outputs={'Out': hccl_id_var},
+                        attrs={
+                            'rank': rank,
+                            'endpoint': current_endpoint,
+                            'other_endpoints': other_endpoints
+                        })
+        block.append_op(type='c_comm_init_hccl',
+                        inputs={'X': hccl_id_var},
+                        outputs={},
+                        attrs={
+                            'rank': rank,
+                            'ring_id': self.global_ring_id,
+                            'device_id': int(os.getenv("FLAGS_selected_npus")),
+                            'rank_ids': nranks
+                        })
 
     def run_trainer(self, args):
         device_id = int(os.getenv("FLAGS_selected_npus", "0"))
@@ -339,8 +338,8 @@ def _cal_multiple_cards(self, args, data, place, layout, only_forward):
 
         self.initCommunicator(startup_prog, rank, nranks, True,
                               current_endpoint, endpoints)
-        sys.stderr.write("after init, startup_prog: " + startup_prog.to_string(
-            True) + "\n")
+        sys.stderr.write("after init, startup_prog: " +
+                         startup_prog.to_string(True) + "\n")
         train_prog.global_seed(SEED)
         train_prog._sync_with_cpp()
         startup_prog.global_seed(SEED)
@@ -350,8 +349,8 @@ def _cal_multiple_cards(self, args, data, place, layout, only_forward):
         self.rank = rank
         outs = self.get_model(train_prog, startup_prog, place, layout, SEED,
                               True, only_forward)
-        sys.stderr.write("after get_model, train_prog: " + train_prog.to_string(
-            True) + "\n")
+        sys.stderr.write("after get_model, train_prog: " +
+                         train_prog.to_string(True) + "\n")
         sys.stderr.write("after get_model, startup_prog: " +
                          startup_prog.to_string(True) + "\n")
 
@@ -405,6 +404,7 @@ def runtime_main(test_class, col_type, sub_type):
 
 
 class TestDistBase(unittest.TestCase):
+
     def setUp(self):
         self._port_set = set()
         self._trainers = 2
@@ -413,6 +413,7 @@ def setUp(self):
         self._python_interp = sys.executable
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -454,18 +455,16 @@ def _run_cluster(self, model_file, envs):
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
         # print(tr0_cmd)
-        # print(tr1_cmd) 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
-
-        tr1_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
+        # print(tr1_cmd)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=env0)
+
+        tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_baseline.py b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_baseline.py
index 54a78ea2d52a1..27926e032f5a3 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_baseline.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_baseline.py
@@ -18,6 +18,7 @@
 import paddle
 import os
 import sys
+
 sys.path.append("..")
 
 from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator
@@ -29,13 +30,15 @@
 
 
 class TestSyncBatchNormOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
     def test_identity(self, col_type="identity"):
         dist_env = os.environ
-        self.check_with_place(
-            "sync_batch_norm_op_npu.py", col_type, need_envs=dist_env)
+        self.check_with_place("sync_batch_norm_op_npu.py",
+                              col_type,
+                              need_envs=dist_env)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_extra.py b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_extra.py
index bafe45b77daac..8fe46e3f4147b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_extra.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sync_batch_norm_op_npu_extra.py
@@ -18,6 +18,7 @@
 import paddle
 import os
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -33,28 +34,29 @@
 
 
 class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.NPUPlace(0))
             self.assertRaises(TypeError, my_sync_batch_norm, x1)
 
-            # the input dtype of SyncBatchNorm must be float16 or float32 
+            # the input dtype of SyncBatchNorm must be float16 or float32
             # float16 only can be set on GPU place and NPU place
             x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32")
             self.assertRaises(TypeError, my_sync_batch_norm, x2)
 
 
 class TestConvertSyncBatchNorm(unittest.TestCase):
+
     def test_convert(self):
         with program_guard(Program(), Program()):
-            compare_model = paddle.nn.Sequential(
-                paddle.nn.Conv2D(3, 5, 3),
-                paddle.nn.BatchNorm2D(5), paddle.nn.BatchNorm2D(5))
+            compare_model = paddle.nn.Sequential(paddle.nn.Conv2D(3, 5, 3),
+                                                 paddle.nn.BatchNorm2D(5),
+                                                 paddle.nn.BatchNorm2D(5))
             model = paddle.nn.Sequential(
-                paddle.nn.Conv2D(3, 5, 3),
-                paddle.nn.BatchNorm2D(5),
+                paddle.nn.Conv2D(3, 5, 3), paddle.nn.BatchNorm2D(5),
                 paddle.nn.BatchNorm2D(
                     5,
                     weight_attr=fluid.ParamAttr(name='bn.scale'),
@@ -67,8 +69,11 @@ def test_convert(self):
 
 
 class TestConvertSyncBatchNormCast1(unittest.TestCase):
+
     def test_convert(self):
+
         class Net(nn.Layer):
+
             def __init__(self):
                 super(Net, self).__init__()
                 self.conv1 = nn.Conv2D(3, 5, 3)
@@ -93,6 +98,7 @@ def forward(self, x):
 
 
 class TestDygraphSyncBatchNormDataFormatError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.dygraph.guard(fluid.NPUPlace(0)):
             my_sync_batch_norm = paddle.nn.SyncBatchNorm(10, data_format='CN')
diff --git a/python/paddle/fluid/tests/unittests/npu/test_take_along_axis_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_take_along_axis_op_npu.py
index 4aad02f7df06e..450cb54294341 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_take_along_axis_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_take_along_axis_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 @unittest.skip(reason="Skip unsupported ut, need paddle surpport cann 5.0.4+")
 class TestTakeAlongAxisOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.init_data()
@@ -59,13 +61,14 @@ def init_data(self):
         self.x_type = "float64"
         self.x_shape = (5, 5, 5)
         self.index_type = "int32"
-        self.index = np.array(
-            [[[1]], [[1]], [[2]], [[4]], [[3]]]).astype(self.index_type)
+        self.index = np.array([[[1]], [[1]], [[2]], [[4]],
+                               [[3]]]).astype(self.index_type)
         self.axis = 2
         self.axis_type = "int64"
 
 
 class TestCase1(TestTakeAlongAxisOp):
+
     def init_data(self):
         self.x_type = "float64"
         self.x_shape = (5, 5, 5)
@@ -77,6 +80,7 @@ def init_data(self):
 
 @unittest.skip(reason="Skip unsupported ut, need paddle surpport cann 5.0.4+")
 class TestTakeAlongAxisAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [3, 3]
@@ -93,8 +97,10 @@ def test_api_static(self):
             index = paddle.fluid.data('Index', self.index_shape, "int64")
             out = paddle.take_along_axis(x, index, self.axis)
             exe = paddle.static.Executor(self.place)
-            res = exe.run(feed={'X': self.x_np,
-                                'Index': self.index_np},
+            res = exe.run(feed={
+                'X': self.x_np,
+                'Index': self.index_np
+            },
                           fetch_list=[out])
         out_ref = np.array(
             np.take_along_axis(self.x_np, self.index_np, self.axis))
@@ -114,12 +120,13 @@ def test_api_dygraph(self):
 
 @unittest.skip(reason="Skip unsupported ut, need paddle surpport cann 5.0.4+")
 class TestTakeAlongAxisAPICase1(TestTakeAlongAxisAPI):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [2, 2]
         self.index_shape = [4, 2]
-        self.index_np = np.array(
-            [[0, 0], [1, 0], [0, 0], [1, 0]]).astype('int64')
+        self.index_np = np.array([[0, 0], [1, 0], [0, 0], [1,
+                                                           0]]).astype('int64')
         self.x_np = np.random.random(self.shape).astype(np.float32)
         self.place = paddle.NPUPlace(0)
         self.axis = 0
diff --git a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py
index 375eef12291ec..e26f713f00f9d 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestTanh(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "tanh"
@@ -58,6 +60,7 @@ def test_check_grad(self):
 
 
 class TestTanhFp16(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "tanh"
@@ -84,6 +87,7 @@ def test_check_output(self):
 
 
 class TestTanhNet(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -98,8 +102,9 @@ def _test(self, run_npu=True):
         with paddle.static.program_guard(main_prog, startup_prog):
             a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
             b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[32, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[32, 1],
+                                       dtype='int64')
 
             c = paddle.multiply(a, b)
             d = paddle.tanh(c)
@@ -123,12 +128,13 @@ def _test(self, run_npu=True):
         print("Start run on {}".format(place))
         for epoch in range(100):
 
-            pred_res, loss_res = exe.run(
-                main_prog,
-                feed={"a": a_np,
-                      "b": b_np,
-                      "label": label_np},
-                fetch_list=[prediction, loss])
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={
+                                             "a": a_np,
+                                             "b": b_np,
+                                             "label": label_np
+                                         },
+                                         fetch_list=[prediction, loss])
             if epoch % 10 == 0:
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
diff --git a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py
index 0e61fa00fdf28..7caacf738ec84 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -30,6 +31,7 @@
 
 #Situation 1: repeat_times is a list (without tensor)
 class TestTileOpRank1(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -57,36 +59,42 @@ def test_check_grad(self):
 
 #with dimension expanding
 class TestTileOpRank2Expanding(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = [120]
         self.repeat_times = [2, 2]
 
 
 class TestTileOpRank2(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [2, 3]
 
 
 class TestTileOpRank3_Corner(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.repeat_times = (1, 1, 1)
 
 
 class TestTileOpRank3_Corner2(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.repeat_times = (2, 2)
 
 
 class TestTileOpRank3(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 15)
         self.repeat_times = (2, 1, 4)
 
 
 class TestTileOpRank4(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 5, 7)
         self.repeat_times = (3, 2, 1, 2)
@@ -94,6 +102,7 @@ def init_data(self):
 
 # Situation 2: repeat_times is a list (with tensor)
 class TestTileOpRank1_tensor_attr(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -128,6 +137,7 @@ def test_check_grad(self):
 
 
 class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [1, 1]
@@ -135,6 +145,7 @@ def init_data(self):
 
 
 class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [2, 3]
@@ -143,6 +154,7 @@ def init_data(self):
 
 # Situation 3: repeat_times is a tensor
 class TestTileOpRank1_tensor(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -172,6 +184,7 @@ def test_check_grad(self):
 
 
 class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [2, 3]
@@ -179,13 +192,13 @@ def init_data(self):
 
 # Situation 4: input x is Integer
 class TestTileOpInteger(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
         self.op_type = "tile"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(4, 4, 5)).astype("int32")
+            'X': np.random.randint(10, size=(4, 4, 5)).astype("int32")
         }
         self.attrs = {'repeat_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
@@ -200,13 +213,13 @@ def test_check_output(self):
 
 # Situation 5: input x is Integer
 class TestTileOpInt64_t(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
         self.op_type = "tile"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int64")
+            'X': np.random.randint(10, size=(2, 4, 5)).astype("int64")
         }
         self.attrs = {'repeat_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
@@ -221,6 +234,7 @@ def test_check_output(self):
 
 # Situation 6: input x is Bool
 class TestTileOpBool(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -239,6 +253,7 @@ def test_check_output(self):
 
 # Test python API
 class TestTileAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard(paddle.NPUPlace(0)):
             np_x = np.random.random([12, 14]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py
index c8a620d9dbb35..f05e4f19d8e7a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestTopk(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -40,8 +42,8 @@ def setUp(self):
                       [0.96527182, 0.34851612, 0.12959783]]).astype(self.dtype)
 
         self.inputs = {'X': x}
-        np_out = np.array(
-            [[0.88745828], [0.82196718], [0.96527182]]).astype(self.dtype)
+        np_out = np.array([[0.88745828], [0.82196718],
+                           [0.96527182]]).astype(self.dtype)
         np_indices = np.array([[1], [0], [0]])
 
         self.attrs = {'k': 1, "axis": -1}
@@ -59,6 +61,7 @@ def test_check_output(self):
 
 
 class TestTopkV2(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -89,6 +92,7 @@ def test_check_output(self):
 
 
 class TestTopkV3(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
@@ -97,8 +101,10 @@ def setUp(self):
         self.init_dtype()
         self.set_input_data()
         self.set_attrs()
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=True)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=True)
 
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis}
@@ -119,8 +125,8 @@ def set_attrs(self):
         self.axis = 1
 
     def set_input_data(self):
-        self.input_data = np.random.choice(
-            10000, size=(10, 20), replace=False).astype(self.dtype)
+        self.input_data = np.random.choice(10000, size=(10, 20),
+                                           replace=False).astype(self.dtype)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py
index a8242be855c80..86a58cfae097b 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -40,6 +41,7 @@ def numpy_topk(x, k=1, axis=-1, largest=True):
 
 
 class TestTopkV2NPUOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.op_type = "top_k_v2"
@@ -48,8 +50,10 @@ def setUp(self):
         self.set_dtype()
         self.set_input_data()
         self.set_attrs()
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
 
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
@@ -64,8 +68,8 @@ def set_attrs(self):
         self.largest = True
 
     def set_input_data(self):
-        self.input_data = np.random.choice(
-            10000, size=(10, 20), replace=False).astype(self.dtype)
+        self.input_data = np.random.choice(10000, size=(10, 20),
+                                           replace=False).astype(self.dtype)
 
     def test_check_output(self):
         self.__class__.no_need_check_grad = True
@@ -80,6 +84,7 @@ def set_npu(self):
 
 
 class TestTopkV2OpFloat16(TestTopkV2NPUOp):
+
     def set_attrs(self):
         self.k = 3
         self.axis = 1
@@ -93,6 +98,7 @@ def set_input_data(self):
 
 
 class TestTopkV2OP1Int32(TestTopkV2NPUOp):
+
     def set_attrs(self):
         self.k = 3
         self.axis = 0
@@ -100,6 +106,7 @@ def set_attrs(self):
 
 
 class TestTopkV2OP2Int32(TestTopkV2NPUOp):
+
     def set_attrs(self):
         self.k = 4
         self.axis = 0
@@ -107,6 +114,7 @@ def set_attrs(self):
 
 
 class TestTopkV2OP3Int32(TestTopkV2NPUOp):
+
     def set_attrs(self):
         self.k = 6
         self.axis = 1
@@ -114,6 +122,7 @@ def set_attrs(self):
 
 
 class TestTopkV2OP4Int32(TestTopkV2NPUOp):
+
     def set_attrs(self):
         self.k = 3
         self.axis = 1
@@ -121,26 +130,31 @@ def set_attrs(self):
 
 
 class TestTopkV2Op1Int64(TestTopkV2OP1Int32):
+
     def set_dtype(self):
         self.dtype = np.int64
 
 
 class TestTopkV2Op2Int64(TestTopkV2OP2Int32):
+
     def set_dtype(self):
         self.dtype = np.int64
 
 
 class TestTopkV2Op3Int64(TestTopkV2OP3Int32):
+
     def set_dtype(self):
         self.dtype = np.int64
 
 
 class TestTopkV2Op4Int64(TestTopkV2OP4Int32):
+
     def set_dtype(self):
         self.dtype = np.int64
 
 
 class TestTopkV2Op1Float32(TestTopkV2OP1Int32):
+
     def set_dtype(self):
         self.dtype = np.float32
 
@@ -149,6 +163,7 @@ def set_input_data(self):
 
 
 class TestTopkV2Op2Float32(TestTopkV2OP2Int32):
+
     def set_dtype(self):
         self.dtype = np.float32
 
@@ -157,6 +172,7 @@ def set_input_data(self):
 
 
 class TestTopkV2Op3Float32(TestTopkV2OP3Int32):
+
     def set_dtype(self):
         self.dtype = np.float32
 
@@ -165,6 +181,7 @@ def set_input_data(self):
 
 
 class TestTopkV2Op4Float32(TestTopkV2OP4Int32):
+
     def set_dtype(self):
         self.dtype = np.float32
 
@@ -173,6 +190,7 @@ def set_input_data(self):
 
 
 class TestTopkV2Op1Float64(TestTopkV2OP1Int32):
+
     def set_dtype(self):
         self.dtype = np.float64
 
@@ -181,6 +199,7 @@ def set_input_data(self):
 
 
 class TestTopkV2Op2Float64(TestTopkV2OP2Int32):
+
     def set_dtype(self):
         self.dtype = np.float64
 
@@ -189,6 +208,7 @@ def set_input_data(self):
 
 
 class TestTopkV2Op3Float64(TestTopkV2OP3Int32):
+
     def set_dtype(self):
         self.dtype = np.float64
 
@@ -197,6 +217,7 @@ def set_input_data(self):
 
 
 class TestTopkV2Op4Float64(TestTopkV2OP4Int32):
+
     def set_dtype(self):
         self.dtype = np.float64
 
@@ -205,6 +226,7 @@ def set_input_data(self):
 
 
 class TestTopKAPI(unittest.TestCase):
+
     def setUp(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -248,15 +270,16 @@ def run_dygraph(self, place):
         self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
         self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
 
-        # test case for basic test case 6 for the partial sort 
+        # test case for basic test case 6 for the partial sort
         paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
         numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
         self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0]))
         self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1]))
-        # test case for basic test case 7 for the unsorted 
+        # test case for basic test case 7 for the unsorted
         paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
-        sort_paddle = numpy_topk(
-            np.array(paddle_result[0].numpy()), axis=1, k=2)
+        sort_paddle = numpy_topk(np.array(paddle_result[0].numpy()),
+                                 axis=1,
+                                 k=2)
         numpy_result = numpy_topk(self.input_data, k=2, axis=1)
         self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
 
@@ -264,10 +287,12 @@ def run_static(self, place):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            input_tensor = paddle.static.data(
-                name="x", shape=[6, 7, 8], dtype="float64")
-            large_input_tensor = paddle.static.data(
-                name="large_x", shape=[2, 1030], dtype="float64")
+            input_tensor = paddle.static.data(name="x",
+                                              shape=[6, 7, 8],
+                                              dtype="float64")
+            large_input_tensor = paddle.static.data(name="large_x",
+                                                    shape=[2, 1030],
+                                                    dtype="float64")
             k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
             result1 = paddle.topk(input_tensor, k=2)
             result2 = paddle.topk(input_tensor, k=2, axis=-1)
@@ -281,17 +306,18 @@ def run_static(self, place):
             exe = paddle.static.Executor(place)
             input_data = np.random.rand(10, 20).astype("float64")
             large_input_data = np.random.rand(2, 100).astype("float64")
-            paddle_result = exe.run(
-                feed={
-                    "x": self.input_data,
-                    "large_x": self.large_input_data,
-                    "k": np.array([2]).astype("int32")
-                },
-                fetch_list=[
-                    result1[0], result1[1], result2[0], result2[1], result3[0],
-                    result3[1], result4[0], result4[1], result5[0], result5[1],
-                    result6[0], result6[1], result7[0], result7[1]
-                ])
+            paddle_result = exe.run(feed={
+                "x": self.input_data,
+                "large_x": self.large_input_data,
+                "k": np.array([2]).astype("int32")
+            },
+                                    fetch_list=[
+                                        result1[0], result1[1], result2[0],
+                                        result2[1], result3[0], result3[1],
+                                        result4[0], result4[1], result5[0],
+                                        result5[1], result6[0], result6[1],
+                                        result7[0], result7[1]
+                                    ])
             numpy_result = numpy_topk(self.input_data, k=2)
             self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
@@ -304,13 +330,17 @@ def run_static(self, place):
             self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
 
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=1, largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=1,
+                                      largest=False)
             self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
 
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=-1, largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=-1,
+                                      largest=False)
             self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
index b1a6bfcdaaadc..a5548b5ea125a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, _set_use_system_allocator
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestTransposeOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "transpose2"
@@ -55,66 +57,77 @@ def test_check_grad(self):
 
 
 class TestCase0(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (100, )
         self.axis = (0, )
 
 
 class TestCase1(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (3, 4, 10)
         self.axis = (0, 2, 1)
 
 
 class TestCase2(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 3, 4, 5)
         self.axis = (0, 2, 3, 1)
 
 
 class TestCase3(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 3, 4, 5, 6)
         self.axis = (4, 2, 3, 1, 0)
 
 
 class TestCase4(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 3, 4, 5, 6, 1)
         self.axis = (4, 2, 3, 1, 0, 5)
 
 
 class TestCase5(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 16, 96)
         self.axis = (0, 2, 1)
 
 
 class TestCase6(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 10, 12, 16)
         self.axis = (3, 1, 2, 0)
 
 
 class TestCase7(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 10, 2, 16)
         self.axis = (0, 1, 3, 2)
 
 
 class TestCase8(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (0, 1, 3, 2, 4, 5, 6, 7)
 
 
 class TestCase9(TestTransposeOp):
+
     def init_shape_axis(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
 
 
 class TestTransposeOpFP16(TestTransposeOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -123,6 +136,7 @@ def test_check_grad(self):
 
 
 class TestTransposeOpInt64(TestTransposeOp):
+
     def init_dtype(self):
         self.dtype = np.int64
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py
index 8239dd4f3fa89..b3d5fa9a6b5c9 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py
@@ -42,7 +42,8 @@ def setUp(self):
             'lower': True if self.real_op_type == 'tril' else False,
         }
         self.outputs = {
-            'Out': self.real_np_op(self.X, self.diagonal)
+            'Out':
+            self.real_np_op(self.X, self.diagonal)
             if self.diagonal else self.real_np_op(self.X)
         }
 
@@ -78,15 +79,17 @@ def case_generator(op_type, Xshape, diagonal, expected):
     }
 
     class FailureCase(unittest.TestCase):
+
         def test_failure(self):
             paddle.enable_static()
 
             data = fluid.data(shape=Xshape, dtype='float32', name=cls_name)
-            with self.assertRaisesRegexp(
-                    eval(expected.split(':')[-1]), errmsg[expected]):
+            with self.assertRaisesRegexp(eval(expected.split(':')[-1]),
+                                         errmsg[expected]):
                 getattr(tensor, op_type)(x=data, diagonal=diagonal)
 
     class SuccessCase(TestNPUTrilTriu):
+
         def initTestCase(self):
             paddle.enable_static()
 
@@ -100,7 +103,7 @@ def initTestCase(self):
 
 
 ### NOTE: meaningful diagonal is [1 - min(H, W), max(H, W) -1]
-### test the diagonal just at the border, upper/lower the border, 
+### test the diagonal just at the border, upper/lower the border,
 ###     negative/positive integer within range and a zero
 cases = {
     'success': {
@@ -126,8 +129,9 @@ def initTestCase(self):
     for _expected, _params in cases.items():
         for _Xshape, _diaglist in _params.items():
             list(
-                map(lambda _diagonal: case_generator(_op_type, _Xshape, _diagonal, _expected),
-                    _diaglist))
+                map(
+                    lambda _diagonal: case_generator(
+                        _op_type, _Xshape, _diagonal, _expected), _diaglist))
 
 
 class TestTrilTriuOpAPI(unittest.TestCase):
@@ -151,7 +155,8 @@ def test_api(self):
                 tril_out, triu_out = exe.run(
                     fluid.default_main_program(),
                     feed={"x": data},
-                    fetch_list=[tril_out, triu_out], )
+                    fetch_list=[tril_out, triu_out],
+                )
                 self.assertTrue(np.allclose(tril_out, np.tril(data)))
                 self.assertTrue(np.allclose(triu_out, np.triu(data)))
 
@@ -189,6 +194,7 @@ def test_fluid_api(self):
 
 # @skip_check_grad_ci(reason="[NPU does not support grad right now.")
 class TestNPUTrilTriu_bool(TestNPUTrilTriu):
+
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py
index de94e7febaca7..0ce6deb42e097 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -30,6 +31,7 @@
 
 
 class TestTruncatedNormal(unittest.TestCase):
+
     def _test(self, run_npu=True):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -44,10 +46,12 @@ def _test(self, run_npu=True):
             with paddle.static.program_guard(main_prog, startup_prog):
                 weight_attr = paddle.framework.ParamAttr(
                     name="linear_weight",
-                    initializer=paddle.nn.initializer.TruncatedNormal(
-                        mean=0.0, std=2.0))
-                linear = paddle.nn.Linear(
-                    2, 2, weight_attr=weight_attr, bias_attr=False)
+                    initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0,
+                                                                      std=2.0))
+                linear = paddle.nn.Linear(2,
+                                          2,
+                                          weight_attr=weight_attr,
+                                          bias_attr=False)
 
             if run_npu:
                 place = paddle.NPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py
index 0e21c59432bad..7f2c2753b9b98 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py
@@ -18,6 +18,7 @@
 import subprocess
 import unittest
 import numpy as np
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -40,6 +41,7 @@ def output_hist(out):
 
 
 class TestNPUUniformRandomOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "uniform_random"
@@ -69,12 +71,12 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestNPUUniformRandomOpSelectedRows(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_npu():
@@ -89,19 +91,17 @@ def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[1000, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10)
+        op = Operator("uniform_random",
+                      Out="X",
+                      shape=[1000, 784],
+                      min=-5.0,
+                      max=10.0,
+                      seed=10)
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py
index cebfed1629aba..3f7783d4959eb 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -29,6 +30,7 @@
 
 # unsqueeze
 class TestUnsqueezeOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "unsqueeze"
@@ -37,7 +39,9 @@ def setUp(self):
         self.x = np.random.random(self.ori_shape).astype("float32")
         self.inputs = {"X": OpTest.np_dtype_to_fluid_dtype(self.x)}
         self.init_attrs()
-        self.outputs = {"Out": self.x.reshape(self.new_shape), }
+        self.outputs = {
+            "Out": self.x.reshape(self.new_shape),
+        }
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -58,6 +62,7 @@ def init_attrs(self):
 
 
 class TestUnsqueezeOp1(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (3, 40)
         self.axes = (0, -2)
@@ -66,22 +71,25 @@ def init_test_case(self):
 
 # No axes input.
 class TestUnsqueezeOp2(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = ()
         self.new_shape = (1, 20, 5)
 
 
-# Just part of axes be squeezed. 
+# Just part of axes be squeezed.
 class TestUnsqueezeOp3(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 5, 1, 4)
         self.axes = (1, -1)
         self.new_shape = (6, 1, 5, 1, 4, 1)
 
 
-# unsqueeze 2        
+# unsqueeze 2
 class TestUnsqueeze2Op(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "unsqueeze2"
@@ -115,6 +123,7 @@ def init_attrs(self):
 
 # Correct: There is mins axis.
 class TestUnsqueeze2Op1(TestUnsqueeze2Op):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -2)
@@ -123,14 +132,16 @@ def init_test_case(self):
 
 # Correct: No axes input.
 class TestUnsqueeze2Op2(TestUnsqueeze2Op):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = ()
         self.new_shape = (1, 20, 5)
 
 
-# Correct: Just part of axes be squeezed. 
+# Correct: Just part of axes be squeezed.
 class TestUnsqueeze2Op3(TestUnsqueeze2Op):
+
     def init_test_case(self):
         self.ori_shape = (6, 5, 1, 4)
         self.axes = (1, -1)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_unstack_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_unstack_op_npu.py
index 097f31c72467c..32519e3e4b6e7 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_unstack_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_unstack_op_npu.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import unittest
@@ -25,6 +26,7 @@
 
 
 class TestUnStackOpBase(OpTest):
+
     def initDefaultParameters(self):
         self.input_dim = (5, 6, 7)
         self.axis = 0
@@ -74,21 +76,25 @@ def test_check_grad(self):
 
 
 class TestStackOp3(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = -1
 
 
 class TestStackOp4(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = -3
 
 
 class TestStackOp5(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = 1
 
 
 class TestStackOp6(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = 2
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py
index 18e2db7f6b1d9..21be9e295d2e1 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py
@@ -16,6 +16,7 @@
 import numpy as np
 import sys
 import os
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestUpdateLossScalingOpMinLossScalingBad(TestUpdateLossScalingOpBad):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "update_loss_scaling"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py
index 1388adf609ff6..5299369ff1743 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -26,6 +27,7 @@
 
 
 class TestUpdateLossScalingOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "update_loss_scaling"
@@ -73,6 +75,7 @@ def test_check_output(self):
 
 
 class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "update_loss_scaling"
@@ -102,17 +105,21 @@ def setUp(self):
 
 
 class TestUpdateLossScalingLayer(unittest.TestCase):
+
     def loss_scaling_check(self, use_npu=True, scope=fluid.Scope()):
         a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
         b = fluid.data(name="b", shape=[512, 128], dtype='float32')
         x = [a, b]
         found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
-        prev_loss_scaling = fluid.data(
-            name="prev_loss_scaling", shape=[1], dtype='float32')
-        num_good_steps = fluid.data(
-            name="num_good_steps", shape=[1], dtype='int32')
-        num_bad_steps = fluid.data(
-            name="num_bad_steps", shape=[1], dtype='int32')
+        prev_loss_scaling = fluid.data(name="prev_loss_scaling",
+                                       shape=[1],
+                                       dtype='float32')
+        num_good_steps = fluid.data(name="num_good_steps",
+                                    shape=[1],
+                                    dtype='int32')
+        num_bad_steps = fluid.data(name="num_bad_steps",
+                                   shape=[1],
+                                   dtype='int32')
 
         a_v = np.random.random([1024, 1024]).astype('float32')
         b_v = np.random.random([512, 128]).astype('float32')
@@ -126,17 +133,16 @@ def loss_scaling_check(self, use_npu=True, scope=fluid.Scope()):
         incr_ratio = 2
         decr_ratio = 0.8
 
-        result = amp_nn.update_loss_scaling(
-            x,
-            found_inf,
-            prev_loss_scaling,
-            num_good_steps,
-            num_bad_steps,
-            incr_every_n_steps,
-            decr_every_n_nan_or_inf,
-            incr_ratio,
-            decr_ratio,
-            name="update_loss_scaling")
+        result = amp_nn.update_loss_scaling(x,
+                                            found_inf,
+                                            prev_loss_scaling,
+                                            num_good_steps,
+                                            num_bad_steps,
+                                            incr_every_n_steps,
+                                            decr_every_n_nan_or_inf,
+                                            incr_ratio,
+                                            decr_ratio,
+                                            name="update_loss_scaling")
 
         place = paddle.NPUPlace(0) if use_npu else fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -168,12 +174,15 @@ def loss_scaling_check_inf(self, use_npu=True, scope=fluid.Scope()):
         b = fluid.data(name="b", shape=[512, 128], dtype='float32')
         x = [a, b]
         found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
-        prev_loss_scaling = fluid.data(
-            name="prev_loss_scaling", shape=[1], dtype='float32')
-        num_good_steps = fluid.data(
-            name="num_good_steps", shape=[1], dtype='int32')
-        num_bad_steps = fluid.data(
-            name="num_bad_steps", shape=[1], dtype='int32')
+        prev_loss_scaling = fluid.data(name="prev_loss_scaling",
+                                       shape=[1],
+                                       dtype='float32')
+        num_good_steps = fluid.data(name="num_good_steps",
+                                    shape=[1],
+                                    dtype='int32')
+        num_bad_steps = fluid.data(name="num_bad_steps",
+                                   shape=[1],
+                                   dtype='int32')
 
         a_v = np.random.random([1024, 1024]).astype('float32')
         b_v = np.random.random([512, 128]).astype('float32')
@@ -190,17 +199,16 @@ def loss_scaling_check_inf(self, use_npu=True, scope=fluid.Scope()):
         incr_ratio = 2
         decr_ratio = 0.8
 
-        result = amp_nn.update_loss_scaling(
-            x,
-            found_inf,
-            prev_loss_scaling,
-            num_good_steps,
-            num_bad_steps,
-            incr_every_n_steps,
-            decr_every_n_nan_or_inf,
-            incr_ratio,
-            decr_ratio,
-            name="update_loss_scaling")
+        result = amp_nn.update_loss_scaling(x,
+                                            found_inf,
+                                            prev_loss_scaling,
+                                            num_good_steps,
+                                            num_bad_steps,
+                                            incr_every_n_steps,
+                                            decr_every_n_nan_or_inf,
+                                            incr_ratio,
+                                            decr_ratio,
+                                            name="update_loss_scaling")
 
         place = paddle.NPUPlace(0) if use_npu else fluid.CPUPlace()
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py b/python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py
index 20d7fb6879d44..6790afc9af045 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py
@@ -18,6 +18,7 @@
 import unittest
 import paddle
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from paddle.fluid.op import Operator
@@ -28,6 +29,7 @@
 
 
 class TestWhereIndexOp(OpTest):
+
     def setUp(self):
         self.set_npu()
         self.op_type = "where_index"
@@ -38,7 +40,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def init_config(self):
-        self.inputs = {'Condition': np.array([True, False, True]), }
+        self.inputs = {
+            'Condition': np.array([True, False, True]),
+        }
 
         self.outputs = {'Out': np.array([[0], [2]], dtype='int64')}
 
@@ -47,42 +51,54 @@ def set_npu(self):
 
 
 class TestNotBool(TestWhereIndexOp):
+
     def init_config(self):
-        self.inputs = {'Condition': np.array([1, 0, 8]), }
+        self.inputs = {
+            'Condition': np.array([1, 0, 8]),
+        }
 
         self.outputs = {'Out': np.array([[0], [2]], dtype='int64')}
 
 
 class TestAllFalse(TestWhereIndexOp):
+
     def init_config(self):
-        self.inputs = {'Condition': np.array([False, False, False]), }
+        self.inputs = {
+            'Condition': np.array([False, False, False]),
+        }
 
         self.outputs = {'Out': np.array([], dtype='int64')}
 
 
 class TestRank2(TestWhereIndexOp):
+
     def init_config(self):
-        self.inputs = {'Condition': np.array([[True, False], [False, True]]), }
+        self.inputs = {
+            'Condition': np.array([[True, False], [False, True]]),
+        }
 
         self.outputs = {'Out': np.array([[0, 0], [1, 1]], dtype='int64')}
 
 
 class TestRank3(TestWhereIndexOp):
+
     def init_config(self):
         self.inputs = {
-            'Condition': np.array([[[True, False], [False, True]],
-                                   [[False, True], [True, False]],
-                                   [[False, False], [False, True]]]),
+            'Condition':
+            np.array([[[True, False], [False, True]],
+                      [[False, True], [True, False]],
+                      [[False, False], [False, True]]]),
         }
 
         self.outputs = {
-            'Out': np.array(
-                [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]],
-                dtype='int64')
+            'Out':
+            np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]],
+                     dtype='int64')
         }
 
 
 class TestWhereOpError(unittest.TestCase):
+
     def test_api(self):
         with program_guard(Program(), Program()):
             cond = fluid.layers.data(name='cond', shape=[4], dtype='bool')
@@ -95,7 +111,9 @@ def test_api(self):
 
 
 class TestWhereRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.where([10])
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py
index cf877ff2872af..c90bf0cb49398 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -28,6 +29,7 @@
 
 
 class TestNPUWhereOp(OpTest):
+
     def setUp(self):
         self.op_type = "where"
         self.set_npu()
@@ -52,6 +54,7 @@ def test_check_grad_normal(self):
 
 
 class TestNPUWhereOp2(TestNPUWhereOp):
+
     def init_config(self):
         self.x = np.random.uniform(-5, 5, (60, 2)).astype("float64")
         self.y = np.random.uniform(-5, 5, (60, 2)).astype("float64")
@@ -59,6 +62,7 @@ def init_config(self):
 
 
 class TestNPUWhereOp3(TestNPUWhereOp):
+
     def init_config(self):
         self.x = np.random.uniform(-3, 5, (20, 2, 4)).astype("float64")
         self.y = np.random.uniform(-3, 5, (20, 2, 4)).astype("float64")
@@ -66,6 +70,7 @@ def init_config(self):
 
 
 class TestNPUWhereAPI(unittest.TestCase):
+
     def setUp(self):
         self.__class__.use_npu = True
         self.place = paddle.NPUPlace(0)
@@ -90,8 +95,9 @@ def test_api(self):
                 train_prog = fluid.Program()
                 startup = fluid.Program()
                 with fluid.program_guard(train_prog, startup):
-                    cond = fluid.data(
-                        name='cond', shape=self.shape, dtype='bool')
+                    cond = fluid.data(name='cond',
+                                      shape=self.shape,
+                                      dtype='bool')
                     x = fluid.data(name='x', shape=self.shape, dtype='float32')
                     y = fluid.data(name='y', shape=self.shape, dtype='float32')
 
@@ -109,12 +115,13 @@ def test_api(self):
                         fetch_list.append(x.grad_name)
                     if y_stop_gradient is False:
                         fetch_list.append(y.grad_name)
-                    out = exe.run(
-                        train_prog,
-                        feed={'cond': self.cond,
-                              'x': self.x,
-                              'y': self.y},
-                        fetch_list=fetch_list)
+                    out = exe.run(train_prog,
+                                  feed={
+                                      'cond': self.cond,
+                                      'x': self.x,
+                                      'y': self.y
+                                  },
+                                  fetch_list=fetch_list)
                     assert np.array_equal(out[0], self.out)
 
                     if x_stop_gradient is False:
@@ -134,21 +141,24 @@ def test_api_broadcast(self, use_cuda=False):
             x = fluid.layers.data(name='x', shape=[4, 1], dtype='float32')
             y = fluid.layers.data(name='y', shape=[4, 2], dtype='float32')
             x_i = np.array([[0.9383, 0.1983, 3.2, 1.2]]).astype("float32")
-            y_i = np.array([[1.0, 1.0, 1.0, 1.0],
-                            [1.0, 1.0, 1.0, 1.0]]).astype("float32")
+            y_i = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0,
+                                                   1.0]]).astype("float32")
             result = paddle.where(x > 1, x=x, y=y)
 
             exe = fluid.Executor(self.place)
             exe.run(startup)
 
             out = exe.run(train_prog,
-                          feed={'x': x_i,
-                                'y': y_i},
+                          feed={
+                              'x': x_i,
+                              'y': y_i
+                          },
                           fetch_list=[result])
             assert np.array_equal(out[0], np.where(x_i > 1, x_i, y_i))
 
 
 class TestWhereDygraphAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard(paddle.NPUPlace(0)):
             x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float64")
diff --git a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
index a388761d5e384..22918347a2de3 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
@@ -28,13 +28,20 @@
 
 
 class TestWhileOp(unittest.TestCase):
+
     def simple_net(self):
-        d0 = layers.data(
-            "d0", shape=[10], append_batch_size=False, dtype='float32')
-        d1 = layers.data(
-            "d1", shape=[10], append_batch_size=False, dtype='float32')
-        d2 = layers.data(
-            "d2", shape=[10], append_batch_size=False, dtype='float32')
+        d0 = layers.data("d0",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
+        d1 = layers.data("d1",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
+        d2 = layers.data("d2",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
         # fill_constant npu op doesn't support int64
         i = layers.zeros(shape=[1], dtype='int32')
         i = layers.cast(i, 'int64')
@@ -102,9 +109,11 @@ def test_simple_net(self):
             for i in range(3):
                 d.append(numpy.random.random(size=[10]).astype('float32'))
 
-            outs = exe.run(feed={'d0': d[0],
-                                 'd1': d[1],
-                                 'd2': d[2]},
+            outs = exe.run(feed={
+                'd0': d[0],
+                'd1': d[1],
+                'd2': d[2]
+            },
                            fetch_list=[sum_result])
             self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
 
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index f7a3dfa1102b2..ded9f188472dd 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -46,14 +46,16 @@
     create_op,
     set_input,
     append_input_output,
-    append_loss_ops, )
+    append_loss_ops,
+)
 from white_list import (
     op_accuracy_white_list,
     check_shape_white_list,
     compile_vs_runtime_white_list,
     no_check_set_white_list,
     op_threshold_white_list,
-    no_grad_set_white_list, )
+    no_grad_set_white_list,
+)
 
 # For switch new eager mode globally
 g_is_in_eager = _in_eager_without_dygraph_check()
@@ -88,11 +90,12 @@ def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs):
                     shape, dtype = spec
                 else:
                     raise ValueError(
-                        "Value of in_specs[{}] should contains two elements: [shape, dtype]".
-                        format(index))
+                        "Value of in_specs[{}] should contains two elements: [shape, dtype]"
+                        .format(index))
                 input_t.append(
-                    paddle.static.data(
-                        name='data_%s' % index, shape=shape, dtype=dtype))
+                    paddle.static.data(name='data_%s' % index,
+                                       shape=shape,
+                                       dtype=dtype))
 
             out = api_fn(*input_t, **configs)
             out_dtype = fluid.data_feeder.convert_dtype(out.dtype)
@@ -112,8 +115,8 @@ def _set_use_system_allocator(value=None):
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
-    prob = np.random.uniform(
-        0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
+    prob = np.random.uniform(0.1, 1.0,
+                             size=(batch_size, class_num)).astype(dtype)
     prob_sum = prob.sum(axis=1)
     for i in six.moves.xrange(len(prob)):
         prob[i] /= prob_sum[i]
@@ -152,8 +155,9 @@ def product(dim):
     elif tensor_to_check_dtype == core.VarDesc.VarType.COMPLEX128:
         tensor_tp_check_dtype = np.complex128
     else:
-        raise ValueError("Not supported data type " + str(tensor_to_check_dtype)
-                         + ", tensor name : " + str(input_to_check))
+        raise ValueError("Not supported data type " +
+                         str(tensor_to_check_dtype) + ", tensor name : " +
+                         str(input_to_check))
 
     def get_output():
         sum = []
@@ -178,10 +182,10 @@ def __get_elem__(tensor, i):
         elif tensor_to_check._dtype() == core.VarDesc.VarType.BF16:
             numpy_tensor = np.array(tensor).astype(np.uint16)
             numpy_tensor = numpy_tensor.flatten()
-            return struct.unpack('<f',
-                                 struct.pack('<I',
-                                             np.uint32(numpy_tensor[i])
-                                             << np.uint32(16)))[0]
+            return struct.unpack(
+                '<f',
+                struct.pack('<I',
+                            np.uint32(numpy_tensor[i]) << np.uint32(16)))[0]
         elif tensor_to_check_dtype == np.float32:
             return tensor._get_float_element(i)
         elif tensor_to_check_dtype == np.float64:
@@ -263,6 +267,18 @@ def wrapper(cls):
     return wrapper
 
 
+def skip_check_inplace_ci(reason=None):
+    if not isinstance(reason, str):
+        raise AssertionError(
+            "The reason for skipping check_inplace is required.")
+
+    def wrapper(cls):
+        cls.no_need_check_inplace = True
+        return cls
+
+    return wrapper
+
+
 def copy_bits_from_float_to_uint16(f):
     return struct.unpack('<I', struct.pack('<f', f))[0] >> 16
 
@@ -283,13 +299,15 @@ def convert_float_to_uint16(float_list, data_format="NCHW"):
 
 def convert_uint16_to_float(in_list):
     in_list = np.asarray(in_list)
-    out = np.vectorize(
-        lambda x: struct.unpack('<f', struct.pack('<I', np.uint32(x) << np.uint32(16)))[0],
-        otypes=[np.float32])(in_list.flat)
+    out = np.vectorize(lambda x: struct.unpack(
+        '<f', struct.pack('<I',
+                          np.uint32(x) << np.uint32(16)))[0],
+                       otypes=[np.float32])(in_list.flat)
     return np.reshape(out, in_list.shape)
 
 
 class OpTest(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         '''Fix random seeds to remove randomness from tests'''
@@ -391,27 +409,27 @@ def try_call_once(self, data_type):
     def is_bfloat16_op(self):
         # self.dtype is the dtype of inputs, and is set in infer_dtype_from_inputs_outputs.
         # Make sure this function is called after calling infer_dtype_from_inputs_outputs.
-        return self.dtype == np.uint16 or (
-            hasattr(self, 'output_dtype') and
-            self.output_dtype == np.uint16) or (
-                hasattr(self, 'mkldnn_data_type') and
-                getattr(self, 'mkldnn_data_type') == "bfloat16") or (
-                    hasattr(self, 'attrs') and
-                    'mkldnn_data_type' in self.attrs and
-                    self.attrs['mkldnn_data_type'] == 'bfloat16')
+        return self.dtype == np.uint16 or (hasattr(
+            self, 'output_dtype') and self.output_dtype == np.uint16) or (
+                hasattr(self, 'mkldnn_data_type')
+                and getattr(self, 'mkldnn_data_type') == "bfloat16") or (
+                    hasattr(self, 'attrs') and 'mkldnn_data_type' in self.attrs
+                    and self.attrs['mkldnn_data_type'] == 'bfloat16')
 
     def is_mkldnn_op(self):
         return (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or (
-            hasattr(self, "attrs") and "use_mkldnn" in self.attrs and
-            self.attrs["use_mkldnn"] == True)
+            hasattr(self, "attrs") and "use_mkldnn" in self.attrs
+            and self.attrs["use_mkldnn"] == True)
 
     def is_xpu_op(self):
-        return (hasattr(self, "use_xpu") and self.use_xpu == True) or (
-            hasattr(self, "attrs") and "use_xpu" in self.attrs and
-            self.attrs["use_xpu"] == True)
+        return (hasattr(self, "use_xpu")
+                and self.use_xpu == True) or (hasattr(self, "attrs")
+                                              and "use_xpu" in self.attrs
+                                              and self.attrs["use_xpu"] == True)
 
     # set the self.output_dtype .
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
+
         def is_np_data(input):
             return isinstance(input, (np.ndarray, np.generic))
 
@@ -444,9 +462,15 @@ def infer_dtype(numpy_dict, dtype_set):
         input_dtype_set = set()
         infer_dtype(inputs, input_dtype_set)
         dtype_list = [
-            np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16),
-            np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16),
-            np.dtype(np.int16), np.dtype(np.int8), np.dtype(np.uint8),
+            np.dtype(np.float64),
+            np.dtype(np.float32),
+            np.dtype(np.float16),
+            np.dtype(np.int64),
+            np.dtype(np.int32),
+            np.dtype(np.uint16),
+            np.dtype(np.int16),
+            np.dtype(np.int8),
+            np.dtype(np.uint8),
             np.dtype(np.bool)
         ]
         # check the dtype in dtype_list in order, select the first dtype that in dtype_set
@@ -481,8 +505,8 @@ def feed_var(self, input_vars, place):
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
                     tensor.set(self.inputs[var_name][0], place)
-                    tensor.set_recursive_sequence_lengths(self.inputs[var_name][
-                        1])
+                    tensor.set_recursive_sequence_lengths(
+                        self.inputs[var_name][1])
                 else:
                     tensor.set(self.inputs[var_name], place)
                 feed_map[var_name] = tensor
@@ -511,11 +535,10 @@ def _append_ops(self, block):
 
         if hasattr(self, "cache_name_list"):
             for name in self.cache_name_list:
-                inputs[name] = block.create_var(
-                    name=name,
-                    persistable=True,
-                    type=core.VarDesc.VarType.RAW,
-                    stop_gradient=True)
+                inputs[name] = block.create_var(name=name,
+                                                persistable=True,
+                                                type=core.VarDesc.VarType.RAW,
+                                                stop_gradient=True)
 
         op = block.append_op(
             type=self.op_type,
@@ -586,8 +609,9 @@ def lod_has_single_zero(self, lod):
 
     def lod_has_continuous_zero(self, lod):
         for i in range(len(lod) - 3):
-            if lod[i] != 0 and lod[i + 1] == 0 and lod[i + 2] == 0 and lod[
-                    i + 3] != 0:
+            if lod[i] != 0 and lod[i +
+                                   1] == 0 and lod[i +
+                                                   2] == 0 and lod[i + 3] != 0:
                 return True
         return False
 
@@ -616,6 +640,7 @@ def get_sequence_instance_size_0_input(self, lod=None, shape=None):
 
     def append_input_output_for_dygraph(self, op_proto, np_list, is_input,
                                         if_return_inputs_grad_dict, block):
+
         def create_var(np_value, name, is_input, if_return_inputs_grad_dict):
             np_value_temp = np_value
             has_lod = False
@@ -637,12 +662,11 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict):
                     v.value().get_tensor().set_recursive_sequence_lengths(
                         lod_temp)
             else:
-                v = block.create_var(
-                    name=name,
-                    dtype=np_value_temp.dtype,
-                    type=core.VarDesc.VarType.LOD_TENSOR,
-                    persistable=False,
-                    stop_gradient=False)
+                v = block.create_var(name=name,
+                                     dtype=np_value_temp.dtype,
+                                     type=core.VarDesc.VarType.LOD_TENSOR,
+                                     persistable=False,
+                                     stop_gradient=False)
             return v
 
         # prepare variable for input or output
@@ -656,8 +680,8 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict):
                 continue
             if name not in np_list:
                 assert var_proto.intermediate, "{} not found".format(name)
-                v = block.create_var(
-                    dtype='float32', type=core.VarDesc.VarType.LOD_TENSOR)
+                v = block.create_var(dtype='float32',
+                                     type=core.VarDesc.VarType.LOD_TENSOR)
                 var_dict[name].append(v)
                 if if_return_inputs_grad_dict:
                     inputs_grad_dict[name] = v
@@ -705,11 +729,10 @@ def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place):
             np_api = np.array(api_outs[name])
             np_dyg = np.array(dygraph_outs[name])
             self.assertTrue(
-                np.allclose(
-                    np_api, np_dyg, equal_nan=False),
-                "Output (" + name + ") has diff at " + str(place) + "\nExpect "
-                + str(np_dyg) + "\n" + "But Got" + str(np_api) + " in class " +
-                self.__class__.__name__)
+                np.allclose(np_api, np_dyg, equal_nan=False),
+                "Output (" + name + ") has diff at " + str(place) +
+                "\nExpect " + str(np_dyg) + "\n" + "But Got" + str(np_api) +
+                " in class " + self.__class__.__name__)
 
     def _calc_python_api_output(self, place, egr_inps=None, egr_oups=None):
         """ set egr_inps and egr_oups = None if you want to create it by yourself.
@@ -748,18 +771,18 @@ def parse_attri_value(name, op_inputs, op_attrs):
                 elif name in op_inputs:
                     if len(op_inputs[name]) == 1:
                         # why don't use numpy().item() : if the Tensor is float64, we will change it to python.float32, where we loss accuracy: [allclose_op]
-                        # why we reconstruct a tensor: because we want the tensor in cpu. 
-                        return paddle.to_tensor(
-                            op_inputs[name][0].numpy(), place='cpu')
+                        # why we reconstruct a tensor: because we want the tensor in cpu.
+                        return paddle.to_tensor(op_inputs[name][0].numpy(),
+                                                place='cpu')
                     else:
                         # if this is a list (test_unsqueeze2_op): we just pass it into the python api.
                         return op_inputs[name]
                 else:
                     return Empty()
 
-            # NOTE(xiongkun): the logic of constructing parameters: 
-            # for example:  
-            #    python api: cumprod(x, dim, dtype=None, name=None) 
+            # NOTE(xiongkun): the logic of constructing parameters:
+            # for example:
+            #    python api: cumprod(x, dim, dtype=None, name=None)
             #    kernel sig: [["x"], ["dim"], ["out"]]"
             #
             # we will construct a lot of list with the same length : len == len(api_params), here is 4
@@ -767,7 +790,7 @@ def parse_attri_value(name, op_inputs, op_attrs):
             #    api_defaults = [Empty, Empty, None, None]; empty means no defaults.
             #    inputs_and_attrs = ["x", "dim"] , the length may shorter or longer than api_params
             #    input_arguments = [RealValue in self.inputs and self.attrs]
-            # then ,we will loop for the api_params, construct a result list: 
+            # then ,we will loop for the api_params, construct a result list:
             #    if the name in ['name', 'dtype', 'out', 'output'], we will use the default value
             #    else, we will consume a input_arguments. (because the name is not corresponding, so we only use the order)
 
@@ -879,15 +902,17 @@ def cal_python_api(python_api, args, kernel_sig):
                     if self.attrs[attrs_name] is not None:
                         attrs_outputs[attrs_name] = self.attrs[attrs_name]
 
-            kernel_sig = _get_kernel_signature(
-                eager_tensor_inputs, eager_tensor_outputs, attrs_outputs)
+            kernel_sig = _get_kernel_signature(eager_tensor_inputs,
+                                               eager_tensor_outputs,
+                                               attrs_outputs)
             if not kernel_sig:
                 return None
             assert hasattr(
                 self, "python_api"
             ), "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_eager = True" % self.op_type
-            args = prepare_python_api_arguments(
-                self.python_api, eager_tensor_inputs, attrs_outputs, kernel_sig)
+            args = prepare_python_api_arguments(self.python_api,
+                                                eager_tensor_inputs,
+                                                attrs_outputs, kernel_sig)
             """ we directly return the cal_python_api value because the value is already tensor. 
             """
             return cal_python_api(self.python_api, args, kernel_sig)
@@ -900,8 +925,8 @@ def _calc_dygraph_output(self, place, parallel=False, no_check_set=None):
             op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
 
             # prepare input variable
-            inputs = self.append_input_output_for_dygraph(op_proto, self.inputs,
-                                                          True, False, block)
+            inputs = self.append_input_output_for_dygraph(
+                op_proto, self.inputs, True, False, block)
             # prepare output variable
             outputs = self.append_input_output_for_dygraph(
                 op_proto, self.outputs, False, False, block)
@@ -1017,8 +1042,7 @@ def _compare_expect_and_actual_outputs(self,
             actual_out = np.array(actual_outs[i])
             if inplace_atol is not None:
                 self.assertTrue(
-                    np.allclose(
-                        expect_out, actual_out, atol=inplace_atol),
+                    np.allclose(expect_out, actual_out, atol=inplace_atol),
                     "Output (" + name + ") has diff at " + str(place) +
                     " when using and not using inplace" + "\nExpect " +
                     str(expect_out) + "\n" + "But Got" + str(actual_out) +
@@ -1058,12 +1082,11 @@ def _construct_grad_program_from_forward(self, fwd_program, grad_op_desc,
             fwd_var = fwd_program.global_block().vars.get(fwd_var_name)
             assert fwd_var is not None, "{} cannot be found".format(
                 fwd_var_name)
-            grad_var = grad_block.create_var(
-                name=arg,
-                dtype=fwd_var.dtype,
-                shape=fwd_var.shape,
-                type=fwd_var.type,
-                persistable=False)
+            grad_var = grad_block.create_var(name=arg,
+                                             dtype=fwd_var.dtype,
+                                             shape=fwd_var.shape,
+                                             type=fwd_var.type,
+                                             persistable=False)
 
             # Some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op,
             # and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]).
@@ -1175,23 +1198,20 @@ def _check_forward_inplace(self,
                 We return this to construct grad_program and grad_feed_map for grad inplace check.
         """
         # _calc_output() returns in the form tuple(outs, fetch_list, feed_map, program, op_desc) when for_inplace_test=True.
-        expect_res = self._calc_output(
-            place,
-            no_check_set=no_check_set,
-            enable_inplace=False,
-            for_inplace_test=True)
-        actual_res = self._calc_output(
-            place,
-            no_check_set=no_check_set,
-            enable_inplace=True,
-            for_inplace_test=True)
+        expect_res = self._calc_output(place,
+                                       no_check_set=no_check_set,
+                                       enable_inplace=False,
+                                       for_inplace_test=True)
+        actual_res = self._calc_output(place,
+                                       no_check_set=no_check_set,
+                                       enable_inplace=True,
+                                       for_inplace_test=True)
         # compare expect_outs and actual_outs
-        self._compare_expect_and_actual_outputs(
-            place,
-            expect_res[1],
-            expect_res[0],
-            actual_res[0],
-            inplace_atol=inplace_atol)
+        self._compare_expect_and_actual_outputs(place,
+                                                expect_res[1],
+                                                expect_res[0],
+                                                actual_res[0],
+                                                inplace_atol=inplace_atol)
         return expect_res
 
     def _calc_grad_output(self,
@@ -1215,8 +1235,8 @@ def _calc_grad_output(self,
             res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given grad_op_desc.
         """
         fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc = fwd_res
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(fwd_op_desc,
-                                                                  set(), [])
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+            fwd_op_desc, set(), [])
         grad_program = self._construct_grad_program_from_forward(
             fwd_program, grad_op_desc, op_grad_to_var)
         grad_feed_map = self._construct_grad_feed_map_from_forward(
@@ -1228,8 +1248,9 @@ def _calc_grad_output(self,
             build_strategy = fluid.BuildStrategy()
             build_strategy.enable_inplace = enable_inplace
             compiled_program = fluid.CompiledProgram(
-                grad_program).with_data_parallel(
-                    loss_name="", build_strategy=build_strategy, places=place)
+                grad_program).with_data_parallel(loss_name="",
+                                                 build_strategy=build_strategy,
+                                                 places=place)
             program = compiled_program
         outs = exe.run(program,
                        feed=grad_feed_map,
@@ -1259,16 +1280,19 @@ def _check_grad_inplace(self,
             expect_res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given op.
                 We return this to construct grad_program and grad_feed_map for grad inplace check.
         """
-        expect_res = self._calc_grad_output(
-            place, fwd_res, grad_op_desc, enable_inplace=False)
-        actual_res = self._calc_grad_output(
-            place, fwd_res, grad_op_desc, enable_inplace=True)
-        self._compare_expect_and_actual_outputs(
-            place,
-            expect_res[1],
-            expect_res[0],
-            actual_res[0],
-            inplace_atol=inplace_atol)
+        expect_res = self._calc_grad_output(place,
+                                            fwd_res,
+                                            grad_op_desc,
+                                            enable_inplace=False)
+        actual_res = self._calc_grad_output(place,
+                                            fwd_res,
+                                            grad_op_desc,
+                                            enable_inplace=True)
+        self._compare_expect_and_actual_outputs(place,
+                                                expect_res[1],
+                                                expect_res[0],
+                                                actual_res[0],
+                                                inplace_atol=inplace_atol)
         return expect_res
 
     def check_inplace_output_with_place(self,
@@ -1288,11 +1312,15 @@ def check_inplace_output_with_place(self,
         Returns:
             None
         """
+        if getattr(self, "no_need_check_inplace", False):
+            return
+
         has_infer_inplace = fluid.core.has_infer_inplace(self.op_type)
         has_grad_op_maker = fluid.core.has_grad_op_maker(self.op_type)
 
-        fwd_res = self._calc_output(
-            place, no_check_set=no_check_set, for_inplace_test=True)
+        fwd_res = self._calc_output(place,
+                                    no_check_set=no_check_set,
+                                    for_inplace_test=True)
         op_desc = fwd_res[4]
         need_run_ops = self._get_need_run_ops(op_desc)
 
@@ -1309,15 +1337,15 @@ def check_inplace_output_with_place(self,
                         no_check_set=no_check_set,
                         inplace_atol=inplace_atol)
                 else:
-                    res[op_desc] = self._calc_output(
-                        place, no_check_set=no_check_set, for_inplace_test=True)
+                    res[op_desc] = self._calc_output(place,
+                                                     no_check_set=no_check_set,
+                                                     for_inplace_test=True)
             else:
                 # TODO(zhiqiu): enhance inplace_grad test for ops (sum and activation) using mkldnn
                 # skip op that use_mkldnn currently
                 flags_use_mkldnn = fluid.core.globals()["FLAGS_use_mkldnn"]
-                attrs_use_mkldnn = hasattr(
-                    self,
-                    'attrs') and bool(self.attrs.get('use_mkldnn', False))
+                attrs_use_mkldnn = hasattr(self, 'attrs') and bool(
+                    self.attrs.get('use_mkldnn', False))
                 if flags_use_mkldnn or attrs_use_mkldnn:
                     warnings.warn(
                         "check inplace_grad for ops using mkldnn is not supported"
@@ -1328,8 +1356,8 @@ def check_inplace_output_with_place(self,
                     res[op_desc] = self._check_grad_inplace(
                         place, fwd_res, op_desc, inplace_atol=inplace_atol)
                 else:
-                    res[op_desc] = self._calc_grad_output(place, fwd_res,
-                                                          op_desc)
+                    res[op_desc] = self._calc_grad_output(
+                        place, fwd_res, op_desc)
 
     def check_output_with_place(self,
                                 place,
@@ -1339,6 +1367,7 @@ def check_output_with_place(self,
                                 check_dygraph=True,
                                 inplace_atol=None,
                                 check_eager=False):
+
         def find_imperative_actual(target_name, dygraph_outs, place):
             for name in dygraph_outs:
                 if name == target_name:
@@ -1347,8 +1376,9 @@ def find_imperative_actual(target_name, dygraph_outs, place):
                 for i, var in enumerate(var_list):
                     if var.name == target_name:
                         return dygraph_outs[name][i]
-            self.assertTrue(False, "Found failed {} {}".format(
-                dygraph_outs.keys(), target_name))
+            self.assertTrue(
+                False, "Found failed {} {}".format(dygraph_outs.keys(),
+                                                   target_name))
 
         def find_actual(target_name, fetch_list):
             found = [
@@ -1405,9 +1435,8 @@ def _compare_numpy(self, name, actual_np, expect_np):
                         expect_np,
                         atol=atol,
                         rtol=self.rtol if hasattr(self, 'rtol') else 1e-5,
-                        equal_nan=equal_nan),
-                    "Output (" + name + ") has diff at " + str(place) + " in " +
-                    self.checker_name)
+                        equal_nan=equal_nan), "Output (" + name +
+                    ") has diff at " + str(place) + " in " + self.checker_name)
 
             def _compare_list(self, name, actual, expect):
                 """ if expect is a tuple, we need to compare list.
@@ -1439,8 +1468,8 @@ def compare_outputs_with_expects(self):
                                                  type(sub_out))
                         for item in sub_out:
                             sub_out_name, expect = item[0], item[1]
-                            self.compare_single_output_with_expect(sub_out_name,
-                                                                   expect)
+                            self.compare_single_output_with_expect(
+                                sub_out_name, expect)
                     else:
                         expect = self.expects[out_name]
                         self.compare_single_output_with_expect(out_name, expect)
@@ -1456,6 +1485,7 @@ def check(self):
                 self.compare_outputs_with_expects()
 
         class StaticChecker(Checker):
+
             def init(self):
                 self.checker_name = "static checker"
 
@@ -1498,6 +1528,7 @@ def _compare_list(self, name, actual, expect):
                     "Output (" + name + ") has different lod at " + str(place))
 
         class DygraphChecker(Checker):
+
             def init(self):
                 self.checker_name = "dygraph checker"
 
@@ -1509,8 +1540,8 @@ def find_actual_value(self, name):
                 with fluid.dygraph.base.guard(place=place):
                     imperative_actual = find_imperative_actual(
                         name, self.outputs, place)
-                    imperative_actual_t = np.array(imperative_actual.value()
-                                                   .get_tensor())
+                    imperative_actual_t = np.array(
+                        imperative_actual.value().get_tensor())
                     return imperative_actual, imperative_actual_t
 
             def convert_uint16_to_float_ifneed(self, actual_np, expect_np):
@@ -1532,8 +1563,8 @@ def _compare_list(self, name, actual, expect):
                 """
                 with fluid.dygraph.base.guard(place=place):
                     self.op_test.assertListEqual(
-                        actual.value().get_tensor()
-                        .recursive_sequence_lengths(), expect[1],
+                        actual.value().get_tensor().recursive_sequence_lengths(
+                        ), expect[1],
                         "Output (" + name + ") has different lod at " +
                         str(place) + " in dygraph mode")
 
@@ -1555,6 +1586,7 @@ def _compare_numpy(self, name, actual_np, expect_np):
                         " in " + self.checker_name)
 
         class EagerChecker(DygraphChecker):
+
             def init(self):
                 self.checker_name = "eager checker"
 
@@ -1577,8 +1609,8 @@ def _compare_numpy(self, name, actual_np, expect_np):
 
             def convert_uint16_to_float_ifneed(self, actual_np, expect_np):
                 with _test_eager_guard():
-                    return super().convert_uint16_to_float_ifneed(actual_np,
-                                                                  expect_np)
+                    return super().convert_uint16_to_float_ifneed(
+                        actual_np, expect_np)
 
             def find_actual_value(self, name):
                 with _test_eager_guard():
@@ -1598,7 +1630,7 @@ def _is_skip_name(self, name):
                     return True
                 return super()._is_skip_name(name)
 
-        # set some flags by the combination of arguments. 
+        # set some flags by the combination of arguments.
         self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
         if self.dtype == np.float64 and \
             self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST:
@@ -1654,8 +1686,9 @@ def _is_skip_name(self, name):
         if not paddle.is_compiled_with_xpu(
         ) and not paddle.is_compiled_with_npu(
         ) and not paddle.is_compiled_with_mlu():
-            self.check_inplace_output_with_place(
-                place, no_check_set=no_check_set, inplace_atol=inplace_atol)
+            self.check_inplace_output_with_place(place,
+                                                 no_check_set=no_check_set,
+                                                 inplace_atol=inplace_atol)
 
         if check_eager:
             return outs, dygraph_outs, eager_dygraph_outs, fetch_list
@@ -1665,6 +1698,7 @@ def _is_skip_name(self, name):
             return outs, fetch_list
 
     def check_compile_vs_runtime(self, fetch_list, fetch_outs):
+
         def find_fetch_index(target_name, fetch_list):
             found = [
                 i for i, var_name in enumerate(fetch_list)
@@ -1741,14 +1775,13 @@ def check_output(self,
 
         places = self._get_places()
         for place in places:
-            res = self.check_output_with_place(
-                place,
-                atol,
-                no_check_set,
-                equal_nan,
-                check_dygraph,
-                inplace_atol,
-                check_eager=check_eager)
+            res = self.check_output_with_place(place,
+                                               atol,
+                                               no_check_set,
+                                               equal_nan,
+                                               check_dygraph,
+                                               inplace_atol,
+                                               check_eager=check_eager)
             if check_eager:
                 assert check_dygraph == True
                 outs, dygraph_outs, eager_dygraph_outs, fetch_list = res
@@ -1828,18 +1861,17 @@ def check_grad(self,
         self._check_grad_helper()
         places = self._get_places()
         for place in places:
-            self.check_grad_with_place(
-                place,
-                inputs_to_check,
-                output_names,
-                no_grad_set,
-                numeric_grad_delta,
-                in_place,
-                max_relative_error,
-                user_defined_grads,
-                user_defined_grad_outputs,
-                check_dygraph,
-                check_eager=check_eager)
+            self.check_grad_with_place(place,
+                                       inputs_to_check,
+                                       output_names,
+                                       no_grad_set,
+                                       numeric_grad_delta,
+                                       in_place,
+                                       max_relative_error,
+                                       user_defined_grads,
+                                       user_defined_grad_outputs,
+                                       check_dygraph,
+                                       check_eager=check_eager)
 
     def check_grad_with_place(self,
                               place,
@@ -1879,13 +1911,12 @@ def check_grad_with_place(self,
             op_attrs["use_mkldnn"] = False
             use_onednn = True
 
-        self.op = create_op(
-            self.scope,
-            self.op_type,
-            op_inputs,
-            op_outputs,
-            op_attrs,
-            cache_list=cache_list)
+        self.op = create_op(self.scope,
+                            self.op_type,
+                            op_inputs,
+                            op_outputs,
+                            op_attrs,
+                            cache_list=cache_list)
 
         if use_onednn:
             op_attrs["use_mkldnn"] = True
@@ -1894,9 +1925,9 @@ def check_grad_with_place(self,
             no_grad_set = set()
         else:
             if (self.op_type not in no_grad_set_white_list.NEED_TO_FIX_OP_LIST
-                ) and (
-                    self.op_type not in no_grad_set_white_list.NOT_CHECK_OP_LIST
-                ) and (not self.is_bfloat16_op()):
+                ) and (self.op_type
+                       not in no_grad_set_white_list.NOT_CHECK_OP_LIST) and (
+                           not self.is_bfloat16_op()):
                 raise AssertionError("no_grad_set must be None, op_type is " +
                                      self.op_type + " Op.")
 
@@ -1915,15 +1946,15 @@ def check_grad_with_place(self,
             numeric_place = place
 
         numeric_grads = user_defined_grads or [
-            get_numeric_gradient(
-                numeric_place,
-                self.scope,
-                self.op,
-                self.inputs,
-                input_to_check,
-                output_names,
-                delta=numeric_grad_delta,
-                in_place=in_place) for input_to_check in inputs_to_check
+            get_numeric_gradient(numeric_place,
+                                 self.scope,
+                                 self.op,
+                                 self.inputs,
+                                 input_to_check,
+                                 output_names,
+                                 delta=numeric_grad_delta,
+                                 in_place=in_place)
+            for input_to_check in inputs_to_check
         ]
         analytic_grads = self._get_gradient(inputs_to_check, place,
                                             output_names, no_grad_set,
@@ -1954,9 +1985,10 @@ def check_grad_with_place(self,
             # ensure switch into legacy dygraph
             g_enable_legacy_dygraph()
 
-            dygraph_grad = self._get_dygraph_grad(
-                inputs_to_check, place, output_names, user_defined_grad_outputs,
-                no_grad_set, False)
+            dygraph_grad = self._get_dygraph_grad(inputs_to_check, place,
+                                                  output_names,
+                                                  user_defined_grad_outputs,
+                                                  no_grad_set, False)
             fp32_grads = []
             for grad in dygraph_grad:
                 if grad.dtype == np.uint16:
@@ -2024,8 +2056,8 @@ def _get_dygraph_grad(self,
                         attrs_outputs[attrs_name] = self.attrs[attrs_name]
 
             if check_eager:
-                eager_outputs = self._calc_python_api_output(place, inputs,
-                                                             outputs)
+                eager_outputs = self._calc_python_api_output(
+                    place, inputs, outputs)
             # if outputs is None, kernel sig is empty or other error is happens.
             if not check_eager or eager_outputs is None:
                 block.append_op(
@@ -2039,16 +2071,17 @@ def _get_dygraph_grad(self,
             if self.dtype == np.uint16:
                 cast_inputs = self._find_var_in_dygraph(outputs,
                                                         output_names[0])
-                cast_outputs = block.create_var(
-                    dtype="float32", shape=cast_inputs[0].shape)
-                cast_op = block.append_op(
-                    inputs={"X": cast_inputs},
-                    outputs={"Out": cast_outputs},
-                    type="cast",
-                    attrs={
-                        "in_dtype": core.VarDesc.VarType.BF16,
-                        "out_dtype": core.VarDesc.VarType.FP32
-                    })
+                cast_outputs = block.create_var(dtype="float32",
+                                                shape=cast_inputs[0].shape)
+                cast_op = block.append_op(inputs={"X": cast_inputs},
+                                          outputs={"Out": cast_outputs},
+                                          type="cast",
+                                          attrs={
+                                              "in_dtype":
+                                              core.VarDesc.VarType.BF16,
+                                              "out_dtype":
+                                              core.VarDesc.VarType.FP32
+                                          })
                 outputs = {output_names[0]: cast_outputs}
 
             outputs_valid = {}
@@ -2078,11 +2111,10 @@ def _get_dygraph_grad(self,
                             type=core.VarDesc.VarType.LOD_TENSOR,
                             persistable=False,
                             stop_gradient=False)
-                        block.append_op(
-                            type="mean",
-                            inputs={"X": outputs_valid[cur_loss]},
-                            outputs={"Out": [cur_avg_loss]},
-                            attrs=None)
+                        block.append_op(type="mean",
+                                        inputs={"X": outputs_valid[cur_loss]},
+                                        outputs={"Out": [cur_avg_loss]},
+                                        attrs=None)
                         avg_sum.append(cur_avg_loss)
                     loss_sum = block.create_var(
                         dtype=self.dtype,
@@ -2090,22 +2122,20 @@ def _get_dygraph_grad(self,
                         persistable=False,
                         stop_gradient=False,
                         shape=[1])
-                    block.append_op(
-                        type='sum',
-                        inputs={"X": avg_sum},
-                        outputs={"Out": loss_sum},
-                        attrs=None)
+                    block.append_op(type='sum',
+                                    inputs={"X": avg_sum},
+                                    outputs={"Out": loss_sum},
+                                    attrs=None)
                     loss = block.create_var(
                         dtype=self.dtype,
                         type=core.VarDesc.VarType.LOD_TENSOR,
                         persistable=False,
                         stop_gradient=False,
                         shape=[1])
-                    block.append_op(
-                        type='scale',
-                        inputs={"X": loss_sum},
-                        outputs={"Out": loss},
-                        attrs={'scale': 1.0 / float(len(avg_sum))})
+                    block.append_op(type='scale',
+                                    inputs={"X": loss_sum},
+                                    outputs={"Out": loss},
+                                    attrs={'scale': 1.0 / float(len(avg_sum))})
                 loss.backward()
 
                 fetch_list_grad = []
@@ -2120,14 +2150,13 @@ def _get_dygraph_grad(self,
                 grad_outputs = []
                 for grad_out_value in user_defined_grad_outputs:
                     grad_outputs.append(paddle.to_tensor(grad_out_value))
-                # delete the inputs which no need to calculate grad                
+                # delete the inputs which no need to calculate grad
                 for no_grad_val in no_grad_set:
                     del (inputs[no_grad_val])
 
                 if not _in_legacy_dygraph():
-                    core.eager.run_backward(
-                        fluid.layers.utils.flatten(outputs), grad_outputs,
-                        False)
+                    core.eager.run_backward(fluid.layers.utils.flatten(outputs),
+                                            grad_outputs, False)
                     grad_inputs = []
                     for inputs_list in inputs.values():
                         for inp in inputs_list:
@@ -2179,24 +2208,24 @@ def _get_gradient(self,
         if user_defined_grad_outputs is None:
             if self.dtype == np.uint16:
                 cast_inputs = list(map(block.var, output_names))
-                cast_outputs = block.create_var(
-                    dtype="float32", shape=cast_inputs[0].shape)
-                cast_op = block.append_op(
-                    inputs={"X": cast_inputs},
-                    outputs={"Out": cast_outputs},
-                    type="cast",
-                    attrs={
-                        "in_dtype": core.VarDesc.VarType.BF16,
-                        "out_dtype": core.VarDesc.VarType.FP32
-                    })
+                cast_outputs = block.create_var(dtype="float32",
+                                                shape=cast_inputs[0].shape)
+                cast_op = block.append_op(inputs={"X": cast_inputs},
+                                          outputs={"Out": cast_outputs},
+                                          type="cast",
+                                          attrs={
+                                              "in_dtype":
+                                              core.VarDesc.VarType.BF16,
+                                              "out_dtype":
+                                              core.VarDesc.VarType.FP32
+                                          })
                 cast_op.desc.infer_var_type(block.desc)
                 cast_op.desc.infer_shape(block.desc)
                 output_names = [cast_outputs.name]
             loss = append_loss_ops(block, output_names)
-            param_grad_list = append_backward(
-                loss=loss,
-                parameter_list=input_to_check,
-                no_grad_set=no_grad_set)
+            param_grad_list = append_backward(loss=loss,
+                                              parameter_list=input_to_check,
+                                              no_grad_set=no_grad_set)
             fetch_list = [g for p, g in param_grad_list]
         else:
             assert parallel is False, "unsupported parallel mode when giving custom grad outputs."
@@ -2206,10 +2235,9 @@ def _get_gradient(self,
             grad_outputs = []
             for grad_out_value in user_defined_grad_outputs:
                 # `presistable` is used to avoid executor create new var in local scope
-                var = block.create_var(
-                    shape=grad_out_value.shape,
-                    dtype=grad_out_value.dtype,
-                    persistable=True)
+                var = block.create_var(shape=grad_out_value.shape,
+                                       dtype=grad_out_value.dtype,
+                                       persistable=True)
                 true_var = scope.var(var.name)
                 tensor = true_var.get_tensor()
                 tensor.set(grad_out_value, place)
@@ -2231,7 +2259,8 @@ def _get_gradient(self,
             prog = compiled_prog
         executor = fluid.Executor(place)
         return list(
-            map(np.array,
+            map(
+                np.array,
                 executor.run(prog,
                              feed_dict,
                              fetch_list,
@@ -2240,6 +2269,7 @@ def _get_gradient(self,
 
 
 class OpTestTool:
+
     @classmethod
     def skip_if(cls, condition: object, reason: str):
         return unittest.skipIf(condition, reason)
@@ -2247,8 +2277,8 @@ def skip_if(cls, condition: object, reason: str):
     @classmethod
     def skip_if_not_cpu_bf16(cls):
         return OpTestTool.skip_if(
-            not (isinstance(_current_expected_place(), core.CPUPlace) and
-                 core.supports_bfloat16()),
+            not (isinstance(_current_expected_place(), core.CPUPlace)
+                 and core.supports_bfloat16()),
             "Place does not support BF16 evaluation")
 
     @classmethod
diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py
index 4a67af02bcff3..cabfec949fe1e 100644
--- a/python/paddle/fluid/tests/unittests/op_test_xpu.py
+++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py
@@ -42,6 +42,7 @@
 
 
 class XPUOpTest(OpTest):
+
     @classmethod
     def setUpClass(cls):
         '''Fix random seeds to remove randomness from tests'''
@@ -100,8 +101,9 @@ def check_output_with_place(self,
 
         if self.dtype == np.float16:
             atol = 0.1
-        return super().check_output_with_place(
-            place, atol, no_check_set, equal_nan, check_dygraph, inplace_atol)
+        return super().check_output_with_place(place, atol, no_check_set,
+                                               equal_nan, check_dygraph,
+                                               inplace_atol)
 
     def check_grad(self,
                    inputs_to_check,
@@ -210,13 +212,12 @@ def get_grad_with_place(self,
             op_attrs["use_mkldnn"] = False
             use_onednn = True
 
-        self.op = create_op(
-            self.scope,
-            self.op_type,
-            op_inputs,
-            op_outputs,
-            op_attrs,
-            cache_list=cache_list)
+        self.op = create_op(self.scope,
+                            self.op_type,
+                            op_inputs,
+                            op_outputs,
+                            op_attrs,
+                            cache_list=cache_list)
 
         if use_onednn:
             op_attrs["use_mkldnn"] = True
@@ -225,9 +226,9 @@ def get_grad_with_place(self,
             no_grad_set = set()
         else:
             if (self.op_type not in no_grad_set_white_list.NEED_TO_FIX_OP_LIST
-                ) and (
-                    self.op_type not in no_grad_set_white_list.NOT_CHECK_OP_LIST
-                ) and (not self.is_bfloat16_op()):
+                ) and (self.op_type
+                       not in no_grad_set_white_list.NOT_CHECK_OP_LIST) and (
+                           not self.is_bfloat16_op()):
                 raise AssertionError("no_grad_set must be None, op_type is " +
                                      self.op_type + " Op.")
 
diff --git a/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py b/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py
index e1126138eac84..d5815d90191fc 100644
--- a/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py
+++ b/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -44,8 +44,8 @@ def class_center_sample_numpy(label, classes_list, num_samples):
     for i in range(nranks):
         index = np.logical_and(unique_label >= class_interval[i],
                                unique_label < class_interval[i + 1])
-        pos_class_center_per_device.append(unique_label[index] - class_interval[
-            i])
+        pos_class_center_per_device.append(unique_label[index] -
+                                           class_interval[i])
         unique_label_per_device.append(unique_label[index])
 
     num_samples_per_device = []
@@ -67,6 +67,7 @@ def class_center_sample_numpy(label, classes_list, num_samples):
 
 
 class TestParallelClassCenterSampleOp(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         fleet.init(is_collective=True, strategy=strategy)
@@ -90,8 +91,9 @@ def test_class_center_sample(self):
                 classes_list = np.random.randint(10, 15, (nranks, ))
                 num_class = np.sum(classes_list)
 
-                np_label = np.random.randint(
-                    0, num_class, (batch_size, ), dtype=dtype)
+                np_label = np.random.randint(0,
+                                             num_class, (batch_size, ),
+                                             dtype=dtype)
                 label = paddle.to_tensor(np_label, dtype=dtype)
                 np_remapped_label, np_sampled_class_center_per_device = class_center_sample_numpy(
                     np_label, classes_list, num_samples)
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py
index 26c9944abd6c6..a05de58363c6c 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py
@@ -28,14 +28,14 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self, hidden_size, vocab_size, is_sparse=False):
         super(SimpleNet, self).__init__()
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
-        self.embedding = Embedding(
-            size=[self.vocab_size, self.hidden_size],
-            dtype='float32',
-            is_sparse=is_sparse)
+        self.embedding = Embedding(size=[self.vocab_size, self.hidden_size],
+                                   dtype='float32',
+                                   is_sparse=is_sparse)
 
         self.lin_a = paddle.nn.Linear(self.hidden_size, self.vocab_size)
         self.lin_b = paddle.nn.Linear(self.vocab_size, 1)
@@ -60,8 +60,10 @@ def forward(self, input, label, conf):
             projection = paddle.reshape(projection, shape=[-1, 1])
             output = paddle.gather(projection, emb_mask_inds)
             target = paddle.gather(label, emb_mask_inds)
-            loss_box = F.smooth_l1_loss(
-                output, target, reduction='sum', delta=1.0)
+            loss_box = F.smooth_l1_loss(output,
+                                        target,
+                                        reduction='sum',
+                                        delta=1.0)
             loss_box = loss_box / len(conf)
 
         return loss_box
@@ -73,29 +75,33 @@ def forward(self, input, label, conf):
 hidden_size = 5
 vocab_size = 100
 
-conf_dataset = [[0], [0], [0], [0], [1], [0], [1], [0], [0], [1], [0], [1],
-                [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [0], [1]]
+conf_dataset = [[0], [0], [0], [0], [1], [0], [1], [0], [0], [1], [0], [1], [1],
+                [1], [1], [1], [1], [1], [1], [1], [1], [0], [0], [1]]
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.random.randint(0, vocab_size)
             y_data = np.random.random_sample((1, )).astype('float32')
-            conf_data = np.array(conf_dataset[i % len(conf_dataset)]).astype(
-                'int64')
+            conf_data = np.array(
+                conf_dataset[i % len(conf_dataset)]).astype('int64')
             yield x_data, y_data, conf_data
 
     return __reader__
 
 
 class TestSimpleNet(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
-        model = SimpleNet(
-            hidden_size=hidden_size, vocab_size=vocab_size, is_sparse=False)
+        model = SimpleNet(hidden_size=hidden_size,
+                          vocab_size=vocab_size,
+                          is_sparse=False)
 
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py
index 3157d5e4129ee..4f7809253899c 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py
@@ -36,14 +36,15 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
-        self.net_a = paddle.nn.Sequential(
-            paddle.nn.Linear(10, 20),
-            paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5))
-        self.net_b = paddle.nn.Sequential(
-            paddle.nn.Linear(10, 20),
-            paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5))
+        self.net_a = paddle.nn.Sequential(paddle.nn.Linear(10, 20),
+                                          paddle.nn.Linear(20, 20),
+                                          paddle.nn.Linear(20, 5))
+        self.net_b = paddle.nn.Sequential(paddle.nn.Linear(10, 20),
+                                          paddle.nn.Linear(20, 20),
+                                          paddle.nn.Linear(20, 5))
         self.net_unused = Linear(10, 20)
         self.step = 0
 
@@ -57,6 +58,7 @@ def forward(self, x):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.random.random_sample((10, )).astype('float32')
@@ -66,10 +68,12 @@ def __reader__():
 
 
 class TestSimpleNet(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
         model = SimpleNet()
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
         return model, train_reader, optimizer
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py
index 8ce2275868b39..049c3a0858a84 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py
@@ -31,6 +31,7 @@
 
 
 class cus_tanh(PyLayer):
+
     @staticmethod
     def forward(ctx, x):
         y = paddle.tanh(x)
@@ -45,6 +46,7 @@ def backward(ctx, dy):
 
 
 class cus_tanh_eager(EagerPyLayer):
+
     @staticmethod
     def forward(ctx, x):
         y = paddle.tanh(x)
@@ -59,6 +61,7 @@ def backward(ctx, dy):
 
 
 class SimpleNet(paddle.nn.Layer):
+
     def __init__(self, train_id, model_id):
         super(SimpleNet, self).__init__()
         self.w = self.create_parameter(shape=[in_dim, batch], dtype="float32")
@@ -82,6 +85,7 @@ def forward(self, inputs):
 
 
 class TestDistTraning(unittest.TestCase):
+
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py
index 781d606f33b8f..b4f9455950485 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py
@@ -33,16 +33,17 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self, train_id):
         super(SimpleNet, self).__init__()
-        self.w1 = self.create_parameter(
-            shape=[in_dim, out_dim], dtype="float32")
-        self.w2 = self.create_parameter(
-            shape=[in_dim, out_dim], dtype="float32")
+        self.w1 = self.create_parameter(shape=[in_dim, out_dim],
+                                        dtype="float32")
+        self.w2 = self.create_parameter(shape=[in_dim, out_dim],
+                                        dtype="float32")
         self.share_net = Linear(out_dim, 10)
 
-        self.unused_param = self.create_parameter(
-            shape=[out_dim, in_dim], dtype="float64")
+        self.unused_param = self.create_parameter(shape=[out_dim, in_dim],
+                                                  dtype="float64")
 
         # just for test sync_params_buffers
         self.register_buffer("queue", paddle.randn([10, 5]))
@@ -53,8 +54,8 @@ def __init__(self, train_id):
 
     def forward(self, x):
         is_use = (paddle.equal_all(
-            x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and
-                  self.trainer_id == 1)
+            x, paddle.ones(shape=(batch, in_dim))).numpy()[0]
+                  and self.trainer_id == 1)
 
         if is_use:
             tmp = paddle.matmul(x, self.w1)
@@ -65,6 +66,7 @@ def forward(self, x):
 
 
 class TestDistTraning(unittest.TestCase):
+
     def test_multiple_gpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py
index db41236dd5c1d..debc9e90e0775 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -37,16 +37,17 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self, train_id):
         super(SimpleNet, self).__init__()
-        self.w1 = self.create_parameter(
-            shape=[in_dim, out_dim], dtype="float32")
-        self.w2 = self.create_parameter(
-            shape=[in_dim, out_dim], dtype="float32")
+        self.w1 = self.create_parameter(shape=[in_dim, out_dim],
+                                        dtype="float32")
+        self.w2 = self.create_parameter(shape=[in_dim, out_dim],
+                                        dtype="float32")
         self.share_net = Linear(out_dim, 10)
 
-        self.unused_param = self.create_parameter(
-            shape=[out_dim, in_dim], dtype="float64")
+        self.unused_param = self.create_parameter(shape=[out_dim, in_dim],
+                                                  dtype="float64")
 
         # just for test sync_params_buffers
         # self.register_buffer("queue", paddle.randn([10, 5]))
@@ -57,8 +58,8 @@ def __init__(self, train_id):
 
     def forward(self, x):
         is_use = (paddle.equal_all(
-            x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and
-                  self.trainer_id == 1)
+            x, paddle.ones(shape=(batch, in_dim))).numpy()[0]
+                  and self.trainer_id == 1)
 
         if is_use:
             tmp = paddle.matmul(x, self.w1)
@@ -69,6 +70,7 @@ def forward(self, x):
 
 
 class TestDistTraning(unittest.TestCase):
+
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         with _test_eager_guard():
@@ -80,10 +82,12 @@ def test_multiple_gpus(self):
             state_dict = model_a.state_dict()
             model_b.set_state_dict(state_dict)
 
-            model_a = paddle.DataParallel(
-                model_a, find_unused_parameters=True, group=self.pg)
-            model_b = paddle.DataParallel(
-                model_b, find_unused_parameters=True, group=self.pg)
+            model_a = paddle.DataParallel(model_a,
+                                          find_unused_parameters=True,
+                                          group=self.pg)
+            model_b = paddle.DataParallel(model_b,
+                                          find_unused_parameters=True,
+                                          group=self.pg)
 
             ones_input = paddle.ones(shape=(batch, in_dim))
             ones_input.stop_gradient = True
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
index b083e76897cd9..93ca1fa5b56a0 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
@@ -33,6 +33,7 @@
 
 
 class SimpleImgConvPool(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -52,25 +53,23 @@ def __init__(self,
                  bias_attr=None):
         super(SimpleImgConvPool, self).__init__()
 
-        self._conv2d = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
+        self._conv2d = Conv2D(num_channels=num_channels,
+                              num_filters=num_filters,
+                              filter_size=filter_size,
+                              stride=conv_stride,
+                              padding=conv_padding,
+                              dilation=conv_dilation,
+                              groups=conv_groups,
+                              param_attr=None,
+                              bias_attr=None,
+                              use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(pool_size=pool_size,
+                              pool_type=pool_type,
+                              pool_stride=pool_stride,
+                              pool_padding=pool_padding,
+                              global_pooling=global_pooling,
+                              use_cudnn=use_cudnn)
 
     def forward(self, inputs):
         x = self._conv2d(inputs)
@@ -79,25 +78,33 @@ def forward(self, inputs):
 
 
 class MNIST(fluid.dygraph.Layer):
+
     def __init__(self):
         super(MNIST, self).__init__()
 
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            1, 20, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(1,
+                                                         20,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            20, 50, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(20,
+                                                         50,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
         self.pool_2_shape = 50 * 4 * 4
         SIZE = 10
         scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
-        self._fc = Linear(
-            self.pool_2_shape,
-            10,
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.NormalInitializer(
-                    loc=0.0, scale=scale)),
-            act="softmax")
+        self._fc = Linear(self.pool_2_shape,
+                          10,
+                          param_attr=fluid.param_attr.ParamAttr(
+                              initializer=fluid.initializer.NormalInitializer(
+                                  loc=0.0, scale=scale)),
+                          act="softmax")
 
     def forward(self, inputs, label):
         x = self._simple_img_conv_pool_1(inputs)
@@ -110,20 +117,22 @@ def forward(self, inputs, label):
 
 
 class TestMnist(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
         model = MNIST()
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
-        opt = paddle.optimizer.Adam(
-            learning_rate=1e-3, parameters=model.parameters())
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=2,
+                                    drop_last=True)
+        opt = paddle.optimizer.Adam(learning_rate=1e-3,
+                                    parameters=model.parameters())
         return model, train_reader, opt
 
     def run_one_loop(self, model, opt, data):
         batch_size = len(data)
         dy_x_data = np.array([x[0].reshape(1, 28, 28)
                               for x in data]).astype('float32')
-        y_data = np.array(
-            [x[1] for x in data]).astype('int64').reshape(batch_size, 1)
+        y_data = np.array([x[1] for x in data
+                           ]).astype('int64').reshape(batch_size, 1)
         img = to_variable(dy_x_data)
         label = to_variable(y_data)
         label.stop_gradient = True
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py
index 9a3b5ee2f0f3e..5544ad1da16fd 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py
@@ -39,6 +39,7 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.net_a = Linear(input_dim=10, output_dim=20)
@@ -53,10 +54,12 @@ def forward(self, x):
 
 
 class TestNoSync(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
         model = SimpleNet()
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
         return model, train_reader, optimizer
@@ -146,6 +149,7 @@ def model_train(self, args, model, opt, train_reader):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.random.random_sample((10, )).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py
index 8b3e1b9aedde9..b33ef5165fee4 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py
@@ -38,6 +38,7 @@
 
 
 class SimpleNetControlFlow(fluid.Layer):
+
     def __init__(self):
         super(SimpleNetControlFlow, self).__init__()
         self.net_a = Linear(input_dim=10, output_dim=20)
@@ -56,10 +57,12 @@ def forward(self, x):
 
 
 class TestNoSyncControlFlow(TestNoSync):
+
     def get_model(self):
         model = SimpleNetControlFlow()
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
         return model, train_reader, optimizer
@@ -74,6 +77,7 @@ def run_one_loop(self, model, optimizer, batch):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.random.random_sample((10, )).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py
index 642ea14d8a87d..f7da6cb2aae87 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py
@@ -32,16 +32,17 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self, train_id):
         super(SimpleNet, self).__init__()
-        self.w1 = self.create_parameter(
-            shape=[in_dim, out_dim], dtype="float32")
-        self.w2 = self.create_parameter(
-            shape=[in_dim, out_dim], dtype="float32")
+        self.w1 = self.create_parameter(shape=[in_dim, out_dim],
+                                        dtype="float32")
+        self.w2 = self.create_parameter(shape=[in_dim, out_dim],
+                                        dtype="float32")
         self.share_net = Linear(out_dim, 1)
 
-        self.unused_param = self.create_parameter(
-            shape=[out_dim, in_dim], dtype="float32")
+        self.unused_param = self.create_parameter(shape=[out_dim, in_dim],
+                                                  dtype="float32")
 
         # just for test sync_params_buffers
         self.register_buffer("queue", paddle.randn([10, 5]))
@@ -52,8 +53,8 @@ def __init__(self, train_id):
 
     def forward(self, x):
         is_use = (paddle.equal_all(
-            x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and
-                  self.trainer_id == 1)
+            x, paddle.ones(shape=(batch, in_dim))).numpy()[0]
+                  and self.trainer_id == 1)
 
         if is_use:
             tmp = paddle.matmul(x, self.w1)
@@ -64,6 +65,7 @@ def forward(self, x):
 
 
 class TestDistTraning(unittest.TestCase):
+
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py
index 5aecf13bc154d..9f28e2ce4c518 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py
@@ -38,6 +38,7 @@
 
 
 class SimpleNetUnusedParam(fluid.Layer):
+
     def __init__(self):
         super(SimpleNetUnusedParam, self).__init__()
         self.net_a = Linear(input_dim=10, output_dim=20)
@@ -55,10 +56,12 @@ def forward(self, x):
 
 
 class TestNoSyncUnusedParam(TestNoSync):
+
     def get_model(self):
         model = SimpleNetUnusedParam()
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
         return model, train_reader, optimizer
@@ -73,6 +76,7 @@ def run_one_loop(self, model, optimizer, batch):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.random.random_sample((10, )).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py
index fc0246a9720bf..7c8c40850d900 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py
@@ -35,14 +35,15 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
-        self.net_a = paddle.nn.Sequential(
-            paddle.nn.Linear(10, 20),
-            paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5))
-        self.net_b = paddle.nn.Sequential(
-            paddle.nn.Linear(10, 20),
-            paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5))
+        self.net_a = paddle.nn.Sequential(paddle.nn.Linear(10, 20),
+                                          paddle.nn.Linear(20, 20),
+                                          paddle.nn.Linear(20, 5))
+        self.net_b = paddle.nn.Sequential(paddle.nn.Linear(10, 20),
+                                          paddle.nn.Linear(20, 20),
+                                          paddle.nn.Linear(20, 5))
         self.step = 0
 
     def forward(self, x):
@@ -50,6 +51,7 @@ def forward(self, x):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.random.random_sample((10, )).astype('float32')
@@ -59,10 +61,12 @@ def __reader__():
 
 
 class TestSimpleNet(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
         model = SimpleNet()
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
         return model, train_reader, optimizer
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
index 5e50e915d73c7..6ee04dd342b81 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
@@ -68,15 +68,17 @@ def optimizer_setting(params, parameter_list=None):
     num_epochs = params["num_epochs"]
     if fluid._non_static_mode():
         optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.cosine_decay(
-                learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
+            learning_rate=fluid.layers.cosine_decay(learning_rate=lr,
+                                                    step_each_epoch=step,
+                                                    epochs=num_epochs),
             momentum=momentum_rate,
             regularization=fluid.regularizer.L2Decay(l2_decay),
             parameter_list=parameter_list)
     else:
         optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.cosine_decay(
-                learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
+            learning_rate=fluid.layers.cosine_decay(learning_rate=lr,
+                                                    step_each_epoch=step,
+                                                    epochs=num_epochs),
             momentum=momentum_rate,
             regularization=fluid.regularizer.L2Decay(l2_decay))
 
@@ -84,6 +86,7 @@ def optimizer_setting(params, parameter_list=None):
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -93,15 +96,14 @@ def __init__(self,
                  act=None):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=False)
+        self._conv = Conv2D(num_channels=num_channels,
+                            num_filters=num_filters,
+                            filter_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=groups,
+                            act=None,
+                            bias_attr=False)
 
         # disable BatchNorm in multi-card. disable LayerNorm because of complex input_shape
         # self._batch_norm = BatchNorm(num_filters, act=act)
@@ -114,6 +116,7 @@ def forward(self, inputs):
 
 
 class SqueezeExcitation(fluid.dygraph.Layer):
+
     def __init__(self, num_channels, reduction_ratio):
 
         super(SqueezeExcitation, self).__init__()
@@ -144,6 +147,7 @@ def forward(self, input):
 
 
 class BottleneckBlock(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -153,33 +157,29 @@ def __init__(self,
                  shortcut=True):
         super(BottleneckBlock, self).__init__()
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act="relu")
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            groups=cardinality,
-            act="relu")
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * 2,
-            filter_size=1,
-            act=None)
-
-        self.scale = SqueezeExcitation(
-            num_channels=num_filters * 2, reduction_ratio=reduction_ratio)
+        self.conv0 = ConvBNLayer(num_channels=num_channels,
+                                 num_filters=num_filters,
+                                 filter_size=1,
+                                 act="relu")
+        self.conv1 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters,
+                                 filter_size=3,
+                                 stride=stride,
+                                 groups=cardinality,
+                                 act="relu")
+        self.conv2 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters * 2,
+                                 filter_size=1,
+                                 act=None)
+
+        self.scale = SqueezeExcitation(num_channels=num_filters * 2,
+                                       reduction_ratio=reduction_ratio)
 
         if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * 2,
-                filter_size=1,
-                stride=stride)
+            self.short = ConvBNLayer(num_channels=num_channels,
+                                     num_filters=num_filters * 2,
+                                     filter_size=1,
+                                     stride=stride)
 
         self.shortcut = shortcut
 
@@ -201,6 +201,7 @@ def forward(self, inputs):
 
 
 class SeResNeXt(fluid.dygraph.Layer):
+
     def __init__(self, layers=50, class_dim=102):
         super(SeResNeXt, self).__init__()
 
@@ -214,52 +215,53 @@ def __init__(self, layers=50, class_dim=102):
             reduction_ratio = 16
             depth = [3, 4, 6, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=7,
+                                     stride=2,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
         elif layers == 101:
             cardinality = 32
             reduction_ratio = 16
             depth = [3, 4, 23, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=7,
+                                     stride=2,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
         elif layers == 152:
             cardinality = 64
             reduction_ratio = 16
             depth = [3, 8, 36, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=3,
-                stride=2,
-                act='relu')
-            self.conv1 = ConvBNLayer(
-                num_channels=64,
-                num_filters=64,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            self.conv2 = ConvBNLayer(
-                num_channels=64,
-                num_filters=128,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=3,
+                                     stride=2,
+                                     act='relu')
+            self.conv1 = ConvBNLayer(num_channels=64,
+                                     num_filters=64,
+                                     filter_size=3,
+                                     stride=1,
+                                     act='relu')
+            self.conv2 = ConvBNLayer(num_channels=64,
+                                     num_filters=128,
+                                     filter_size=3,
+                                     stride=1,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
 
         self.bottleneck_block_list = []
         num_channels = 64
@@ -268,19 +270,19 @@ def __init__(self, layers=50, class_dim=102):
             for i in range(depth[block]):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        num_channels=num_channels,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        cardinality=cardinality,
-                        reduction_ratio=reduction_ratio,
-                        shortcut=shortcut))
+                    BottleneckBlock(num_channels=num_channels,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    cardinality=cardinality,
+                                    reduction_ratio=reduction_ratio,
+                                    shortcut=shortcut))
                 num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
-        self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg = Pool2D(pool_size=7,
+                                 pool_type='avg',
+                                 global_pooling=True)
         stdv = 1.0 / math.sqrt(2048 * 1.0)
 
         self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1
@@ -310,14 +312,14 @@ def forward(self, inputs):
 
 
 class TestSeResNeXt(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
         model = SeResNeXt()
-        train_reader = paddle.batch(
-            paddle.dataset.flowers.test(use_xmap=False),
-            batch_size=train_parameters["batch_size"],
-            drop_last=True)
-        optimizer = optimizer_setting(
-            train_parameters, parameter_list=model.parameters())
+        train_reader = paddle.batch(paddle.dataset.flowers.test(use_xmap=False),
+                                    batch_size=train_parameters["batch_size"],
+                                    drop_last=True)
+        optimizer = optimizer_setting(train_parameters,
+                                      parameter_list=model.parameters())
         return model, train_reader, optimizer
 
     def run_one_loop(self, model, opt, data):
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py
index facac33e4c60e..a0906383a4fec 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py
@@ -28,6 +28,7 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self):
         # bias is unused parameters, and it share with net_a
         super(SimpleNet, self).__init__()
@@ -44,6 +45,7 @@ def forward(self, x):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.random.random_sample((10, )).astype('float32')
@@ -53,10 +55,12 @@ def __reader__():
 
 
 class TestSimpleNet(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
         model = SimpleNet()
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
         return model, train_reader, optimizer
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py
index 33ae0acf43d12..53cc8b1267769 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py
@@ -25,6 +25,7 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -62,8 +63,9 @@ def forward(self, input, label):
         fc = fluid.layers.matmul(x_emb, self.softmax_weight)
         fc = fluid.layers.elementwise_add(fc, self.softmax_bias)
         projection = fluid.layers.reshape(fc, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -81,6 +83,7 @@ def forward(self, input, label):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.arange(num_steps).astype('int64')
@@ -91,16 +94,17 @@ def __reader__():
 
 
 class TestSparseEmbedding(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
-        model = SimpleNet(
-            hidden_size=hidden_size,
-            vocab_size=vocab_size,
-            num_steps=num_steps,
-            init_scale=init_scale,
-            is_sparse=True)
-
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        model = SimpleNet(hidden_size=hidden_size,
+                          vocab_size=vocab_size,
+                          num_steps=num_steps,
+                          init_scale=init_scale,
+                          is_sparse=True)
+
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py
index b341a227285b1..e6ef94f7bb0be 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py
@@ -19,10 +19,12 @@
 
 from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
 from paddle.nn import Layer, Embedding
+
 paddle.set_default_dtype("float64")
 
 
 class SimpleNet(Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -40,8 +42,8 @@ def __init__(self,
             self.hidden_size,
             sparse=True,
             weight_attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.Uniform(
-                    low=-init_scale, high=init_scale)))
+                initializer=paddle.nn.initializer.Uniform(low=-init_scale,
+                                                          high=init_scale)))
         self.softmax_weight = self.create_parameter(
             attr=paddle.ParamAttr(),
             shape=[self.hidden_size, self.vocab_size],
@@ -85,6 +87,7 @@ def forward(self, input, label):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.arange(num_steps).astype('int64')
@@ -95,16 +98,17 @@ def __reader__():
 
 
 class TestSparseEmbeddingFP64(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
-        model = SimpleNet(
-            hidden_size=hidden_size,
-            vocab_size=vocab_size,
-            num_steps=num_steps,
-            init_scale=init_scale,
-            is_sparse=True)
-
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        model = SimpleNet(hidden_size=hidden_size,
+                          vocab_size=vocab_size,
+                          num_steps=num_steps,
+                          init_scale=init_scale,
+                          is_sparse=True)
+
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py
index 61749a24c9821..7d3ef413f135c 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py
@@ -31,16 +31,17 @@
 
 
 class TestSparseEmbeddingOverHeight(TestSparseEmbedding):
+
     def get_model(self):
-        model = SimpleNet(
-            hidden_size=hidden_size,
-            vocab_size=vocab_size,
-            num_steps=num_steps,
-            init_scale=init_scale,
-            is_sparse=True)
-
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        model = SimpleNet(hidden_size=hidden_size,
+                          vocab_size=vocab_size,
+                          num_steps=num_steps,
+                          init_scale=init_scale,
+                          is_sparse=True)
+
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         optimizer = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=model.parameters())
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
index d525009fbd734..a8e099137a349 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
@@ -33,6 +33,7 @@
 
 
 class TestLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -42,28 +43,27 @@ def __init__(self,
                  act=None):
         super(TestLayer, self).__init__()
 
-        self._conv = Conv2D(
-            in_channels=num_channels,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            bias_attr=False)
+        self._conv = Conv2D(in_channels=num_channels,
+                            out_channels=num_filters,
+                            kernel_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=groups,
+                            bias_attr=False)
 
         self._sync_batch_norm = SyncBatchNorm(num_filters)
 
-        self._conv2 = Conv2D(
-            in_channels=num_filters,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            bias_attr=False)
+        self._conv2 = Conv2D(in_channels=num_filters,
+                             out_channels=num_filters,
+                             kernel_size=filter_size,
+                             stride=stride,
+                             padding=(filter_size - 1) // 2,
+                             groups=groups,
+                             bias_attr=False)
 
-        self._sync_batch_norm2 = SyncBatchNorm(
-            num_filters, weight_attr=False, bias_attr=False)
+        self._sync_batch_norm2 = SyncBatchNorm(num_filters,
+                                               weight_attr=False,
+                                               bias_attr=False)
 
     def forward(self, inputs):
         y = self._conv(inputs)
@@ -75,14 +75,14 @@ def forward(self, inputs):
 
 
 class TestSyncBatchNorm(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
         model = TestLayer(3, 64, 7)
-        train_reader = paddle.batch(
-            paddle.dataset.flowers.test(use_xmap=False),
-            batch_size=32,
-            drop_last=True)
-        opt = fluid.optimizer.Adam(
-            learning_rate=1e-3, parameter_list=model.parameters())
+        train_reader = paddle.batch(paddle.dataset.flowers.test(use_xmap=False),
+                                    batch_size=32,
+                                    drop_last=True)
+        opt = fluid.optimizer.Adam(learning_rate=1e-3,
+                                   parameter_list=model.parameters())
         return model, train_reader, opt
 
     def run_one_loop(self, model, opt, data):
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_transformer.py
index f149637641add..922c424e17827 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_transformer.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_transformer.py
@@ -162,32 +162,38 @@ class ModelHyperParams(object):
 # Names of word embedding table which might be reused for weight sharing.
 word_emb_param_names = (
     "src_word_emb_table",
-    "trg_word_emb_table", )
+    "trg_word_emb_table",
+)
 # Names of position encoding table which will be initialized externally.
 pos_enc_param_names = (
     "src_pos_enc_table",
-    "trg_pos_enc_table", )
+    "trg_pos_enc_table",
+)
 # separated inputs for different usages.
 encoder_data_input_fields = (
     "src_word",
     "src_pos",
-    "src_slf_attn_bias", )
+    "src_slf_attn_bias",
+)
 decoder_data_input_fields = (
     "trg_word",
     "trg_pos",
     "trg_slf_attn_bias",
     "trg_src_attn_bias",
-    "enc_output", )
+    "enc_output",
+)
 label_data_input_fields = (
     "lbl_word",
-    "lbl_weight", )
+    "lbl_weight",
+)
 # In fast decoder, trg_pos (only containing the current time step) is generated
 # by ops and trg_slf_attn_bias is not needed.
 fast_decoder_data_input_fields = (
     "trg_word",
     # "init_score",
     # "init_idx",
-    "trg_src_attn_bias", )
+    "trg_src_attn_bias",
+)
 
 
 def position_encoding_init(n_position, d_pos_vec):
@@ -199,10 +205,10 @@ def position_encoding_init(n_position, d_pos_vec):
     num_timescales = channels // 2
     log_timescale_increment = (np.log(float(1e4) / float(1)) /
                                (num_timescales - 1))
-    inv_timescales = np.exp(np.arange(
-        num_timescales)) * -log_timescale_increment
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
+    inv_timescales = np.exp(
+        np.arange(num_timescales)) * -log_timescale_increment
+    scaled_time = np.expand_dims(position, 1) * np.expand_dims(
+        inv_timescales, 0)
     signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
     signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
     position_enc = signal
@@ -216,6 +222,7 @@ def position_encoding_init(n_position, d_pos_vec):
 
 
 class PrePostProcessLayer(Layer):
+
     def __init__(self, d_model, process_cmd, shape_len=None):
         super(PrePostProcessLayer, self).__init__()
         for cmd in process_cmd:
@@ -244,6 +251,7 @@ def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
 
 
 class PositionwiseFeedForwardLayer(Layer):
+
     def __init__(self, d_inner_hid, d_hid, dropout_rate):
         super(PositionwiseFeedForwardLayer, self).__init__()
         self._i2h = Linear(d_hid, d_inner_hid, act="relu")
@@ -253,16 +261,16 @@ def __init__(self, d_inner_hid, d_hid, dropout_rate):
     def forward(self, x):
         hidden = self._i2h(x)
         if self._dropout_rate:
-            hidden = fluid.layers.dropout(
-                hidden,
-                dropout_prob=self._dropout_rate,
-                seed=ModelHyperParams.dropout_seed,
-                is_test=False)
+            hidden = fluid.layers.dropout(hidden,
+                                          dropout_prob=self._dropout_rate,
+                                          seed=ModelHyperParams.dropout_seed,
+                                          is_test=False)
         out = self._h2o(hidden)
         return out
 
 
 class MultiHeadAttentionLayer(Layer):
+
     def __init__(self,
                  d_key,
                  d_value,
@@ -304,11 +312,10 @@ def forward(self, queries, keys, values, attn_bias):
         transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
 
         # scale dot product attention
-        product = fluid.layers.matmul(
-            x=transpose_q,
-            y=transpose_k,
-            transpose_y=True,
-            alpha=self._d_model**-0.5)
+        product = fluid.layers.matmul(x=transpose_q,
+                                      y=transpose_k,
+                                      transpose_y=True,
+                                      alpha=self._d_model**-0.5)
         if attn_bias is not None:
             product += attn_bias
         weights = fluid.layers.softmax(product)
@@ -337,6 +344,7 @@ def forward(self, queries, keys, values, attn_bias):
 
 
 class EncoderSubLayer(Layer):
+
     def __init__(self,
                  n_head,
                  d_key,
@@ -358,14 +366,16 @@ def __init__(self,
                                                      self._preprocess_cmd, 3)
         self._multihead_attention_layer = MultiHeadAttentionLayer(
             d_key, d_value, d_model, n_head, attention_dropout)
-        self._postprocess_layer = PrePostProcessLayer(
-            d_model, self._postprocess_cmd, None)
+        self._postprocess_layer = PrePostProcessLayer(d_model,
+                                                      self._postprocess_cmd,
+                                                      None)
         self._preprocess_layer2 = PrePostProcessLayer(d_model,
                                                       self._preprocess_cmd, 3)
         self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
             d_inner_hid, d_model, relu_dropout)
-        self._postprocess_layer2 = PrePostProcessLayer(
-            d_model, self._postprocess_cmd, None)
+        self._postprocess_layer2 = PrePostProcessLayer(d_model,
+                                                       self._postprocess_cmd,
+                                                       None)
 
     def forward(self, enc_input, attn_bias):
         pre_process_multihead = self._preprocess_layer(
@@ -385,6 +395,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class EncoderLayer(Layer):
+
     def __init__(self,
                  n_layer,
                  n_head,
@@ -424,6 +435,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class PrepareEncoderDecoderLayer(Layer):
+
     def __init__(self,
                  src_vocab_size,
                  src_emb_dim,
@@ -437,13 +449,13 @@ def __init__(self,
         self._src_emb_dim = src_emb_dim
         self._src_vocab_size = src_vocab_size
         self._dropout_rate = dropout_rate
-        self._input_emb = Embedding(
-            size=[src_vocab_size, src_emb_dim],
-            is_sparse=is_sparse,
-            padding_idx=0,
-            param_attr=fluid.ParamAttr(
-                name=word_emb_param_name,
-                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+        self._input_emb = Embedding(size=[src_vocab_size, src_emb_dim],
+                                    is_sparse=is_sparse,
+                                    padding_idx=0,
+                                    param_attr=fluid.ParamAttr(
+                                        name=word_emb_param_name,
+                                        initializer=fluid.initializer.Normal(
+                                            0., src_emb_dim**-0.5)))
 
         if pos_enc_param_name is pos_enc_param_names[0]:
             pos_inp = pos_inp1
@@ -459,8 +471,8 @@ def __init__(self,
 
     def forward(self, src_word, src_pos):
         src_word_emb = self._input_emb(src_word)
-        src_word_emb = fluid.layers.scale(
-            x=src_word_emb, scale=self._src_emb_dim**0.5)
+        src_word_emb = fluid.layers.scale(x=src_word_emb,
+                                          scale=self._src_emb_dim**0.5)
         # # TODO change this to fit dynamic length input
         src_pos_emb = self._pos_emb(src_pos)
         src_pos_emb.stop_gradient = True
@@ -473,6 +485,7 @@ def forward(self, src_word, src_pos):
 
 
 class WrapEncoderLayer(Layer):
+
     def __init__(self,
                  src_vocab_size,
                  max_length,
@@ -515,6 +528,7 @@ def forward(self, enc_inputs):
 
 
 class DecoderSubLayer(Layer):
+
     def __init__(self,
                  n_head,
                  d_key,
@@ -565,10 +579,11 @@ def __init__(self,
                                                         postprocess_cmd, None)
 
     def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
-        pre_process_rlt = self._pre_process_layer(
-            None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
-        slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
-                                                          None, slf_attn_bias)
+        pre_process_rlt = self._pre_process_layer(None, dec_input,
+                                                  self._preprocess_cmd,
+                                                  self._prepostprcess_dropout)
+        slf_attn_output = self._multihead_attention_layer(
+            pre_process_rlt, None, None, slf_attn_bias)
         slf_attn_output_pp = self._post_process_layer(
             dec_input, slf_attn_output, self._postprocess_cmd,
             self._prepostprcess_dropout)
@@ -577,9 +592,10 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
                                                     self._prepostprcess_dropout)
         enc_attn_output_pp = self._multihead_attention_layer2(
             pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
-        enc_attn_output = self._post_process_layer2(
-            slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
-            self._prepostprcess_dropout)
+        enc_attn_output = self._post_process_layer2(slf_attn_output_pp,
+                                                    enc_attn_output_pp,
+                                                    self._postprocess_cmd,
+                                                    self._prepostprcess_dropout)
         pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
                                                     self._preprocess_cmd,
                                                     self._prepostprcess_dropout)
@@ -591,6 +607,7 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
 
 
 class DecoderLayer(Layer):
+
     def __init__(self,
                  n_layer,
                  n_head,
@@ -616,25 +633,25 @@ def __init__(self,
             self._decoder_sub_layers.append(
                 self.add_sublayer(
                     'dsl_%d' % i,
-                    DecoderSubLayer(
-                        n_head,
-                        d_key,
-                        d_value,
-                        d_model,
-                        d_inner_hid,
-                        prepostprocess_dropout,
-                        attention_dropout,
-                        relu_dropout,
-                        preprocess_cmd,
-                        postprocess_cmd,
-                        cache=None if caches is None else caches[i],
-                        gather_idx=gather_idx)))
+                    DecoderSubLayer(n_head,
+                                    d_key,
+                                    d_value,
+                                    d_model,
+                                    d_inner_hid,
+                                    prepostprocess_dropout,
+                                    attention_dropout,
+                                    relu_dropout,
+                                    preprocess_cmd,
+                                    postprocess_cmd,
+                                    cache=None if caches is None else caches[i],
+                                    gather_idx=gather_idx)))
 
     def forward(self, dec_input, enc_output, dec_slf_attn_bias,
                 dec_enc_attn_bias):
         for i in range(self._n_layer):
-            tmp_dec_output = self._decoder_sub_layers[i](
-                dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
+            tmp_dec_output = self._decoder_sub_layers[i](dec_input, enc_output,
+                                                         dec_slf_attn_bias,
+                                                         dec_enc_attn_bias)
             dec_input = tmp_dec_output
 
         dec_output = self._pre_process_layer(None, tmp_dec_output,
@@ -644,6 +661,7 @@ def forward(self, dec_input, enc_output, dec_slf_attn_bias,
 
 
 class WrapDecoderLayer(Layer):
+
     def __init__(self,
                  trg_vocab_size,
                  max_length,
@@ -675,20 +693,19 @@ def __init__(self,
             is_sparse=is_sparse,
             word_emb_param_name=word_emb_param_names[1],
             pos_enc_param_name=pos_enc_param_names[1])
-        self._decoder_layer = DecoderLayer(
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            caches=caches,
-            gather_idx=gather_idx)
+        self._decoder_layer = DecoderLayer(n_layer,
+                                           n_head,
+                                           d_key,
+                                           d_value,
+                                           d_model,
+                                           d_inner_hid,
+                                           prepostprocess_dropout,
+                                           attention_dropout,
+                                           relu_dropout,
+                                           preprocess_cmd,
+                                           postprocess_cmd,
+                                           caches=caches,
+                                           gather_idx=gather_idx)
         self._weight_sharing = weight_sharing
         if not weight_sharing:
             self._fc = Linear(d_model, trg_vocab_size, bias_attr=False)
@@ -718,6 +735,7 @@ def forward(self, dec_inputs=None, enc_output=None):
 
 
 class TransFormer(Layer):
+
     def __init__(self,
                  src_vocab_size,
                  trg_vocab_size,
@@ -745,38 +763,36 @@ def __init__(self,
             assert src_vocab_size == trg_vocab_size, (
                 "Vocabularies in source and target should be same for weight sharing."
             )
-        self._wrap_encoder_layer = WrapEncoderLayer(
-            src_vocab_size,
-            max_length,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            weight_sharing,
-            is_sparse=is_sparse)
-        self._wrap_decoder_layer = WrapDecoderLayer(
-            trg_vocab_size,
-            max_length,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            weight_sharing,
-            is_sparse=is_sparse)
+        self._wrap_encoder_layer = WrapEncoderLayer(src_vocab_size,
+                                                    max_length,
+                                                    n_layer,
+                                                    n_head,
+                                                    d_key,
+                                                    d_value,
+                                                    d_model,
+                                                    d_inner_hid,
+                                                    prepostprocess_dropout,
+                                                    attention_dropout,
+                                                    relu_dropout,
+                                                    preprocess_cmd,
+                                                    postprocess_cmd,
+                                                    weight_sharing,
+                                                    is_sparse=is_sparse)
+        self._wrap_decoder_layer = WrapDecoderLayer(trg_vocab_size,
+                                                    max_length,
+                                                    n_layer,
+                                                    n_head,
+                                                    d_key,
+                                                    d_value,
+                                                    d_model,
+                                                    d_inner_hid,
+                                                    prepostprocess_dropout,
+                                                    attention_dropout,
+                                                    relu_dropout,
+                                                    preprocess_cmd,
+                                                    postprocess_cmd,
+                                                    weight_sharing,
+                                                    is_sparse=is_sparse)
 
         if weight_sharing:
             self._wrap_decoder_layer._prepare_decoder_layer._input_emb.weight = self._wrap_encoder_layer._prepare_encoder_layer._input_emb.weight
@@ -786,8 +802,8 @@ def forward(self, enc_inputs, dec_inputs, label, weights):
         predict = self._wrap_decoder_layer(dec_inputs, enc_output)
         if self._label_smooth_eps:
             label_out = fluid.layers.label_smooth(
-                label=fluid.layers.one_hot(
-                    input=label, depth=self._trg_vocab_size),
+                label=fluid.layers.one_hot(input=label,
+                                           depth=self._trg_vocab_size),
                 epsilon=self._label_smooth_eps)
 
         cost = fluid.layers.softmax_with_cross_entropy(
@@ -807,32 +823,39 @@ def forward(self, enc_inputs, dec_inputs, label, weights):
 
 
 def fake_data_reader():
+
     def __reader__():
         iteration = TrainTaskConfig.batch_size * batch_num
         for _ in six.moves.range(iteration):
             # random data
             np.random.seed = 90
-            src_word_np = np.arange(1, seq_len + 1).reshape(
-                [seq_len]).astype('int64')
-            src_pos_np = np.random.randint(
-                1, seq_len, size=(seq_len), dtype='int64')
-            src_slf_attn_bias_np = np.random.randn(
-                ModelHyperParams.n_head, seq_len, seq_len).astype('float32')
-
-            trg_word_np = np.arange(1, seq_len + 1).reshape(
-                [seq_len]).astype('int64')
-            trg_pos_np = np.random.randint(
-                1, seq_len, size=(seq_len), dtype='int64')
-            trg_slf_attn_bias_np = np.random.randn(
-                ModelHyperParams.n_head, seq_len, seq_len).astype('float32')
-            trg_src_attn_bias_np = np.random.randn(
-                ModelHyperParams.n_head, seq_len, seq_len).astype('float32')
-
-            lbl_word_np = np.random.randint(
-                1,
-                ModelHyperParams.src_vocab_size - 1,
-                size=(seq_len, 1),
-                dtype='int64')
+            src_word_np = np.arange(1, seq_len + 1).reshape([seq_len
+                                                             ]).astype('int64')
+            src_pos_np = np.random.randint(1,
+                                           seq_len,
+                                           size=(seq_len),
+                                           dtype='int64')
+            src_slf_attn_bias_np = np.random.randn(ModelHyperParams.n_head,
+                                                   seq_len,
+                                                   seq_len).astype('float32')
+
+            trg_word_np = np.arange(1, seq_len + 1).reshape([seq_len
+                                                             ]).astype('int64')
+            trg_pos_np = np.random.randint(1,
+                                           seq_len,
+                                           size=(seq_len),
+                                           dtype='int64')
+            trg_slf_attn_bias_np = np.random.randn(ModelHyperParams.n_head,
+                                                   seq_len,
+                                                   seq_len).astype('float32')
+            trg_src_attn_bias_np = np.random.randn(ModelHyperParams.n_head,
+                                                   seq_len,
+                                                   seq_len).astype('float32')
+
+            lbl_word_np = np.random.randint(1,
+                                            ModelHyperParams.src_vocab_size - 1,
+                                            size=(seq_len, 1),
+                                            dtype='int64')
 
             # Note(chenweihang): weight will introduce diff, so use constant here
             lbl_weight_np = np.ones((seq_len, 1)).astype('int64')
@@ -875,8 +898,9 @@ def np_to_variable(data):
         var_inputs.append(to_variable(data_inputs[i], name=field))
 
     enc_inputs = var_inputs[0:len(encoder_data_input_fields)]
-    dec_inputs = var_inputs[len(encoder_data_input_fields):len(
-        encoder_data_input_fields) + len(decoder_data_input_fields[:-1])]
+    dec_inputs = var_inputs[len(encoder_data_input_fields
+                                ):len(encoder_data_input_fields) +
+                            len(decoder_data_input_fields[:-1])]
     label = var_inputs[-2]
     weights = var_inputs[-1]
 
@@ -887,39 +911,38 @@ def np_to_variable(data):
 
 
 class TestTransformer(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
-        model = TransFormer(
-            ModelHyperParams.src_vocab_size,
-            ModelHyperParams.trg_vocab_size,
-            ModelHyperParams.max_length + 1,
-            ModelHyperParams.n_layer,
-            ModelHyperParams.n_head,
-            ModelHyperParams.d_key,
-            ModelHyperParams.d_value,
-            ModelHyperParams.d_model,
-            ModelHyperParams.d_inner_hid,
-            ModelHyperParams.prepostprocess_dropout,
-            ModelHyperParams.attention_dropout,
-            ModelHyperParams.relu_dropout,
-            ModelHyperParams.preprocess_cmd,
-            ModelHyperParams.postprocess_cmd,
-            ModelHyperParams.weight_sharing,
-            TrainTaskConfig.label_smooth_eps,
-            is_sparse=True)
+        model = TransFormer(ModelHyperParams.src_vocab_size,
+                            ModelHyperParams.trg_vocab_size,
+                            ModelHyperParams.max_length + 1,
+                            ModelHyperParams.n_layer,
+                            ModelHyperParams.n_head,
+                            ModelHyperParams.d_key,
+                            ModelHyperParams.d_value,
+                            ModelHyperParams.d_model,
+                            ModelHyperParams.d_inner_hid,
+                            ModelHyperParams.prepostprocess_dropout,
+                            ModelHyperParams.attention_dropout,
+                            ModelHyperParams.relu_dropout,
+                            ModelHyperParams.preprocess_cmd,
+                            ModelHyperParams.postprocess_cmd,
+                            ModelHyperParams.weight_sharing,
+                            TrainTaskConfig.label_smooth_eps,
+                            is_sparse=True)
         train_reader = paddle.batch(fake_data_reader(),
                                     TrainTaskConfig.batch_size)
         if naive_optimize:
             optimizer = fluid.optimizer.SGD(learning_rate=0.001,
                                             parameter_list=model.parameters())
         else:
-            optimizer = fluid.optimizer.Adam(
-                learning_rate=NoamDecay(ModelHyperParams.d_model,
-                                        TrainTaskConfig.warmup_steps,
-                                        TrainTaskConfig.learning_rate),
-                beta1=TrainTaskConfig.beta1,
-                beta2=TrainTaskConfig.beta2,
-                epsilon=TrainTaskConfig.eps,
-                parameter_list=model.parameters())
+            optimizer = fluid.optimizer.Adam(learning_rate=NoamDecay(
+                ModelHyperParams.d_model, TrainTaskConfig.warmup_steps,
+                TrainTaskConfig.learning_rate),
+                                             beta1=TrainTaskConfig.beta1,
+                                             beta2=TrainTaskConfig.beta2,
+                                             epsilon=TrainTaskConfig.eps,
+                                             parameter_list=model.parameters())
 
         return model, train_reader, optimizer
 
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py
index b4dd03aecfaf3..a88a36838a5d0 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py
@@ -22,6 +22,7 @@
 
 
 class SimpleNet(Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -39,8 +40,8 @@ def __init__(self,
             self.hidden_size,
             sparse=is_sparse,
             weight_attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.Uniform(
-                    low=-init_scale, high=init_scale)))
+                initializer=paddle.nn.initializer.Uniform(low=-init_scale,
+                                                          high=init_scale)))
         self.softmax_weight = self.create_parameter(
             attr=paddle.ParamAttr(),
             shape=[self.hidden_size, self.vocab_size],
@@ -88,6 +89,7 @@ def forward(self, input, label):
 
 
 def fake_sample_reader():
+
     def __reader__():
         for i in range(batch_num):
             x_data = np.arange(num_steps).astype('int64')
@@ -98,16 +100,17 @@ def __reader__():
 
 
 class TestSparseEmbeddingUnusedVars(TestParallelDyGraphRunnerBase):
+
     def get_model(self):
-        model = SimpleNet(
-            hidden_size=hidden_size,
-            vocab_size=vocab_size,
-            num_steps=num_steps,
-            init_scale=init_scale,
-            is_sparse=False)
-
-        train_reader = paddle.batch(
-            fake_sample_reader(), batch_size=batch_size, drop_last=True)
+        model = SimpleNet(hidden_size=hidden_size,
+                          vocab_size=vocab_size,
+                          num_steps=num_steps,
+                          init_scale=init_scale,
+                          is_sparse=False)
+
+        train_reader = paddle.batch(fake_sample_reader(),
+                                    batch_size=batch_size,
+                                    drop_last=True)
 
         optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                          parameters=model.parameters())
diff --git a/python/paddle/fluid/tests/unittests/parallel_embedding_api.py b/python/paddle/fluid/tests/unittests/parallel_embedding_api.py
index 8907adbf46a97..1f3d173228dc9 100644
--- a/python/paddle/fluid/tests/unittests/parallel_embedding_api.py
+++ b/python/paddle/fluid/tests/unittests/parallel_embedding_api.py
@@ -41,6 +41,7 @@
 
 
 class TestParallelEmbeddingAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -54,8 +55,9 @@ def get_model(self, main_prog, startup_program, rank):
             paddle.seed(2020)
             data_in = paddle.randint(0, size[0], shape=(10, 4))
 
-            data = paddle.static.data(
-                name='tindata', shape=[10, 1000], dtype="float32")
+            data = paddle.static.data(name='tindata',
+                                      shape=[10, 1000],
+                                      dtype="float32")
             per_part_size = size[0] // 2
             if rank == 0:
                 param_attr = paddle.fluid.ParamAttr(
@@ -66,12 +68,11 @@ def get_model(self, main_prog, startup_program, rank):
                     initializer=paddle.fluid.initializer.NumpyArrayInitializer(
                         np_array[per_part_size:size[0], :]), )
 
-            emb_out = paddle.distributed.split(
-                data_in,
-                size,
-                operation="embedding",
-                num_partitions=2,
-                weight_attr=param_attr)
+            emb_out = paddle.distributed.split(data_in,
+                                               size,
+                                               operation="embedding",
+                                               num_partitions=2,
+                                               weight_attr=param_attr)
 
             return [data_in, emb_out]
 
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 2633a5992563f..46ab8f8851121 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -32,6 +32,7 @@
 
 
 class TestParallelExecutorBase(unittest.TestCase):
+
     @classmethod
     def check_network_convergence(cls,
                                   method,
@@ -52,6 +53,7 @@ def check_network_convergence(cls,
                                   optimizer=fluid.optimizer.Adam,
                                   use_fast_executor=False,
                                   enable_sequential_execution=False):
+
         def run_executor(exe, binary, feed, fetch_list):
             if feed_data_reader is None:
                 res = exe.run(binary, feed=feed, fetch_list=fetch_list)
@@ -102,17 +104,21 @@ def run_executor(exe, binary, feed, fetch_list):
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
 
         begin = time.time()
-        first_loss, = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
+        first_loss, = run_executor(exe=exe,
+                                   binary=binary,
+                                   feed=feed_dict,
+                                   fetch_list=[loss.name])
         for _ in range(iter):
             run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
-        last_loss, = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
+        last_loss, = run_executor(exe=exe,
+                                  binary=binary,
+                                  feed=feed_dict,
+                                  fetch_list=[loss.name])
         end = time.time()
 
         if batch_size is not None:
-            print("%.4f Instance per second" % (
-                (batch_size * iter + 2) / (end - begin)))
+            print("%.4f Instance per second" % ((batch_size * iter + 2) /
+                                                (end - begin)))
 
         avg_last_loss_val = np.array(last_loss).mean()
         avg_first_loss_val = np.array(first_loss).mean()
diff --git a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py
index 83db08fc61515..b77a04d8eea9c 100644
--- a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py
+++ b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +35,7 @@ def set_random_seed(seed):
 
 
 class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase):
+
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         fleet.init(is_collective=True, strategy=strategy)
@@ -62,31 +63,31 @@ def test_parallel_margin_softmax_cross_entropy(self):
             for num_class_per_card in num_class_per_cards:
 
                 num_class = np.sum(num_class_per_card)
-                for margin1, margin2, margin3, scale in zip(margin1s, margin2s,
-                                                            margin3s, scales):
+                for margin1, margin2, margin3, scale in zip(
+                        margin1s, margin2s, margin3s, scales):
 
                     for _ in range(5):
                         np_label = np.random.randint(0, num_class,
                                                      (batch_size, ))
                         label = paddle.to_tensor(np_label, dtype="int64")
 
-                        input = paddle.randn(
-                            shape=[batch_size, feature_length], dtype=dtype)
+                        input = paddle.randn(shape=[batch_size, feature_length],
+                                             dtype=dtype)
                         input.stop_gradient = False
                         input_l2 = paddle.sqrt(
-                            paddle.sum(
-                                paddle.square(input), axis=1, keepdim=True))
+                            paddle.sum(paddle.square(input),
+                                       axis=1,
+                                       keepdim=True))
                         norm_input = paddle.divide(input, input_l2)
 
                         weight = paddle.randn(
-                            shape=[
-                                feature_length, num_class_per_card[rank_id]
-                            ],
+                            shape=[feature_length, num_class_per_card[rank_id]],
                             dtype=dtype)
                         weight.stop_gradient = False
                         weight_l2 = paddle.sqrt(
-                            paddle.sum(
-                                paddle.square(weight), axis=0, keepdim=True))
+                            paddle.sum(paddle.square(weight),
+                                       axis=0,
+                                       keepdim=True))
                         norm_weight = paddle.divide(weight, weight_l2)
 
                         data = paddle.matmul(norm_input, norm_weight)
@@ -96,12 +97,12 @@ def test_parallel_margin_softmax_cross_entropy(self):
                             num_class_per_card[:rank_id]) if rank_id > 0 else 0
                         end = np.sum(num_class_per_card[:rank_id + 1])
 
-                        integral_data = np.zeros(
-                            (batch_size, num_class), dtype=dtype)
-                        integral_data[:, sta:end] = data.clone().detach().numpy(
-                        )
-                        integral_data = paddle.to_tensor(
-                            integral_data, dtype=dtype)
+                        integral_data = np.zeros((batch_size, num_class),
+                                                 dtype=dtype)
+                        integral_data[:,
+                                      sta:end] = data.clone().detach().numpy()
+                        integral_data = paddle.to_tensor(integral_data,
+                                                         dtype=dtype)
 
                         paddle.distributed.all_reduce(
                             integral_data,
@@ -141,18 +142,17 @@ def test_parallel_margin_softmax_cross_entropy(self):
                             label=paddle.reshape(label, (-1, 1)),
                             return_softmax=True)
 
-                        np.testing.assert_allclose(
-                            loss_a.numpy(),
-                            loss_b.numpy(),
-                            rtol=1e-5,
-                            atol=1e-7)
+                        np.testing.assert_allclose(loss_a.numpy(),
+                                                   loss_b.numpy(),
+                                                   rtol=1e-5,
+                                                   atol=1e-7)
 
-                        integral_prob = np.zeros(
-                            (batch_size, num_class), dtype=dtype)
+                        integral_prob = np.zeros((batch_size, num_class),
+                                                 dtype=dtype)
                         integral_prob[:, sta:end] = softmax_a.clone().detach(
                         ).numpy()
-                        integral_prob = paddle.to_tensor(
-                            integral_prob, dtype=dtype)
+                        integral_prob = paddle.to_tensor(integral_prob,
+                                                         dtype=dtype)
                         paddle.distributed.all_reduce(
                             integral_prob,
                             op=paddle.distributed.ReduceOp.SUM,
@@ -160,32 +160,30 @@ def test_parallel_margin_softmax_cross_entropy(self):
                         integral_prob = integral_prob.detach().clone()
                         integral_prob.stop_gradient = False
 
-                        np.testing.assert_allclose(
-                            integral_prob.numpy(),
-                            softmax_b.numpy(),
-                            rtol=1e-5,
-                            atol=1e-6)
+                        np.testing.assert_allclose(integral_prob.numpy(),
+                                                   softmax_b.numpy(),
+                                                   rtol=1e-5,
+                                                   atol=1e-6)
 
                         loss_a = loss_a.sum() / batch_size
                         loss_b = loss_b.sum() / batch_size
                         loss_a.backward()
                         loss_b.backward()
 
-                        integral_grad = np.zeros(
-                            (batch_size, num_class), dtype=dtype)
+                        integral_grad = np.zeros((batch_size, num_class),
+                                                 dtype=dtype)
                         integral_grad[:, sta:end] = data.grad.clone().detach()
-                        integral_grad = paddle.to_tensor(
-                            integral_grad, dtype=dtype)
+                        integral_grad = paddle.to_tensor(integral_grad,
+                                                         dtype=dtype)
                         paddle.distributed.all_reduce(
                             integral_grad,
                             op=paddle.distributed.ReduceOp.SUM,
                             group=check_group)
 
-                        np.testing.assert_allclose(
-                            integral_data.grad.numpy(),
-                            integral_grad.numpy(),
-                            rtol=1e-5,
-                            atol=1e-7)
+                        np.testing.assert_allclose(integral_data.grad.numpy(),
+                                                   integral_grad.numpy(),
+                                                   rtol=1e-5,
+                                                   atol=1e-7)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
index 37e992c4d1365..90238f56eea24 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
@@ -85,11 +85,13 @@ def cnn_model(data):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         with fluid.device_guard("gpu:0"):
-            images = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+            images = fluid.layers.data(name='pixel',
+                                       shape=[1, 28, 28],
+                                       dtype=DTYPE)
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
             if dist_strategy:
@@ -107,8 +109,9 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Evaluator
         with fluid.device_guard("gpu:1"):
             batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-            batch_acc = fluid.layers.accuracy(
-                input=predict, label=label, total=batch_size_tensor)
+            batch_acc = fluid.layers.accuracy(input=predict,
+                                              label=label,
+                                              total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         base_lr = self.lr
@@ -125,10 +128,10 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         acc_steps = 2  # accumulated steps for pipeline
         if dist_strategy:
             # Reader
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size)
-            test_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size)
+            train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                        batch_size=batch_size)
+            test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                       batch_size=batch_size)
             fleet.init(is_collective=True)
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
@@ -138,16 +141,16 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
                 'schedule_mode': '1F1B',
                 'accumulate_steps': acc_steps
             }
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
             # Reader
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
-            test_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
+            train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                        batch_size=batch_size * acc_steps)
+            test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                       batch_size=batch_size * acc_steps)
 
         if dist_strategy:
             return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
index 7211bd3e92f79..3ec8dfb44850e 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
@@ -85,11 +85,13 @@ def cnn_model(data):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         with fluid.device_guard("gpu:0"):
-            images = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+            images = fluid.layers.data(name='pixel',
+                                       shape=[1, 28, 28],
+                                       dtype=DTYPE)
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
             if dist_strategy:
@@ -107,8 +109,9 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Evaluator
         with fluid.device_guard("gpu:1"):
             batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-            batch_acc = fluid.layers.accuracy(
-                input=predict, label=label, total=batch_size_tensor)
+            batch_acc = fluid.layers.accuracy(input=predict,
+                                              label=label,
+                                              total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         base_lr = self.lr
@@ -125,10 +128,10 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         acc_steps = 2  # accumulated steps for pipeline
         if dist_strategy:
             # Reader
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size)
-            test_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size)
+            train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                        batch_size=batch_size)
+            test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                       batch_size=batch_size)
             fleet.init(is_collective=True)
             strategy = fleet.DistributedStrategy()
             strategy.pipeline = True
@@ -138,16 +141,16 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
                 'schedule_mode': 'F-then-B',
                 'accumulate_steps': acc_steps
             }
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
             # Reader
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
-            test_reader = paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
+            train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                        batch_size=batch_size * acc_steps)
+            test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                       batch_size=batch_size * acc_steps)
 
         if dist_strategy:
             return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
index 41b3ad34103c5..cfc5a4904ac3e 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
@@ -76,14 +76,16 @@ def cnn_model(data):
 
 
 class TestDistMnist2x2(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         device_id = 0
         if dist_strategy:
             fleet.init(is_collective=True)
         with fluid.device_guard("gpu:0"):
-            images = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+            images = fluid.layers.data(name='pixel',
+                                       shape=[1, 28, 28],
+                                       dtype=DTYPE)
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
             if dist_strategy:
@@ -101,8 +103,9 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Evaluator
         with fluid.device_guard("gpu:0"):
             batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-            batch_acc = fluid.layers.accuracy(
-                input=predict, label=label, total=batch_size_tensor)
+            batch_acc = fluid.layers.accuracy(input=predict,
+                                              label=label,
+                                              total=batch_size_tensor)
 
         inference_program = fluid.default_main_program().clone()
         base_lr = self.lr
@@ -114,10 +117,10 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)
 
         # Reader
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                    batch_size=batch_size)
+        test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                                   batch_size=batch_size)
 
         if dist_strategy:
             strategy = fleet.DistributedStrategy()
@@ -126,8 +129,8 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
                 'schedule_mode': 'F-then-B',
                 'micro_batch_size': batch_size
             }
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py
index 9be8a35f1ae1b..f18d73842bdb6 100644
--- a/python/paddle/fluid/tests/unittests/process_group_gloo.py
+++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py
@@ -30,6 +30,7 @@
 
 
 class TestProcessGroupFp32(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2022)
         random.seed(2022)
diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py
index 3667633d3b38d..1635eb6c951bd 100644
--- a/python/paddle/fluid/tests/unittests/process_group_nccl.py
+++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py
@@ -39,6 +39,7 @@ def init_process_group(strategy=None):
 
 
 class TestProcessGroupFp32(unittest.TestCase):
+
     def setUp(self):
         paddle.seed(2022)
         random.seed(2022)
@@ -87,13 +88,15 @@ def test_create_process_group_nccl(self):
             max_result = paddle.maximum(tensor_x, tensor_y)
 
             if pg.rank() == 0:
-                task = dist.all_reduce(
-                    tensor_x, dist.ReduceOp.MAX, use_calc_stream=False)
+                task = dist.all_reduce(tensor_x,
+                                       dist.ReduceOp.MAX,
+                                       use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_x, max_result)
             else:
-                task = dist.all_reduce(
-                    tensor_y, dist.ReduceOp.MAX, use_calc_stream=False)
+                task = dist.all_reduce(tensor_y,
+                                       dist.ReduceOp.MAX,
+                                       use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_y, max_result)
 
@@ -110,13 +113,15 @@ def test_create_process_group_nccl(self):
             min_result = paddle.minimum(tensor_x, tensor_y)
 
             if pg.rank() == 0:
-                task = dist.all_reduce(
-                    tensor_x, dist.ReduceOp.MIN, use_calc_stream=False)
+                task = dist.all_reduce(tensor_x,
+                                       dist.ReduceOp.MIN,
+                                       use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_x, min_result)
             else:
-                task = dist.all_reduce(
-                    tensor_y, dist.ReduceOp.MIN, use_calc_stream=False)
+                task = dist.all_reduce(tensor_y,
+                                       dist.ReduceOp.MIN,
+                                       use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_y, min_result)
 
@@ -133,13 +138,15 @@ def test_create_process_group_nccl(self):
             prod_result = np.multiply(x, y)
 
             if pg.rank() == 0:
-                task = dist.all_reduce(
-                    tensor_x, dist.ReduceOp.PROD, use_calc_stream=False)
+                task = dist.all_reduce(tensor_x,
+                                       dist.ReduceOp.PROD,
+                                       use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_x, prod_result)
             else:
-                task = dist.all_reduce(
-                    tensor_y, dist.ReduceOp.PROD, use_calc_stream=False)
+                task = dist.all_reduce(tensor_y,
+                                       dist.ReduceOp.PROD,
+                                       use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_y, prod_result)
 
@@ -195,10 +202,12 @@ def test_create_process_group_nccl(self):
             # rank 1
             else:
                 tensor_out_list = [
-                    paddle.empty_like(tensor_x), paddle.empty_like(tensor_x)
+                    paddle.empty_like(tensor_x),
+                    paddle.empty_like(tensor_x)
                 ]
-                task = dist.all_gather(
-                    tensor_out_list, tensor_y, use_calc_stream=False)
+                task = dist.all_gather(tensor_out_list,
+                                       tensor_y,
+                                       use_calc_stream=False)
                 paddle.device.cuda.synchronize()
                 tensor_out = paddle.concat(tensor_out_list)
             out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
@@ -215,8 +224,9 @@ def test_create_process_group_nccl(self):
             # rank 1
             else:
                 tensor_out_list = []
-                task = dist.all_gather(
-                    tensor_out_list, tensor_y, use_calc_stream=False)
+                task = dist.all_gather(tensor_out_list,
+                                       tensor_y,
+                                       use_calc_stream=False)
                 paddle.device.cuda.synchronize()
                 tensor_out = paddle.concat(tensor_out_list)
             out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
@@ -322,13 +332,17 @@ def test_create_process_group_nccl(self):
             max_result = paddle.maximum(tensor_x, tensor_y)
 
             if pg.rank() == 0:
-                task = dist.reduce(
-                    tensor_x, 0, dist.ReduceOp.MAX, use_calc_stream=False)
+                task = dist.reduce(tensor_x,
+                                   0,
+                                   dist.ReduceOp.MAX,
+                                   use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_x, max_result)
             else:
-                task = dist.reduce(
-                    tensor_y, 0, dist.ReduceOp.MAX, use_calc_stream=False)
+                task = dist.reduce(tensor_y,
+                                   0,
+                                   dist.ReduceOp.MAX,
+                                   use_calc_stream=False)
                 task.wait()
 
             print("test reduce max api ok")
@@ -344,13 +358,17 @@ def test_create_process_group_nccl(self):
             min_result = paddle.minimum(tensor_x, tensor_y)
 
             if pg.rank() == 0:
-                task = dist.reduce(
-                    tensor_x, 0, dist.ReduceOp.MIN, use_calc_stream=False)
+                task = dist.reduce(tensor_x,
+                                   0,
+                                   dist.ReduceOp.MIN,
+                                   use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_x, min_result)
             else:
-                task = dist.reduce(
-                    tensor_y, 0, dist.ReduceOp.MIN, use_calc_stream=False)
+                task = dist.reduce(tensor_y,
+                                   0,
+                                   dist.ReduceOp.MIN,
+                                   use_calc_stream=False)
                 task.wait()
 
             print("test reduce min api ok")
@@ -366,13 +384,17 @@ def test_create_process_group_nccl(self):
             prod_result = np.multiply(x, y)
 
             if pg.rank() == 0:
-                task = dist.reduce(
-                    tensor_x, 0, dist.ReduceOp.PROD, use_calc_stream=False)
+                task = dist.reduce(tensor_x,
+                                   0,
+                                   dist.ReduceOp.PROD,
+                                   use_calc_stream=False)
                 task.wait()
                 assert np.array_equal(tensor_x, prod_result)
             else:
-                task = dist.reduce(
-                    tensor_y, 0, dist.ReduceOp.PROD, use_calc_stream=False)
+                task = dist.reduce(tensor_y,
+                                   0,
+                                   dist.ReduceOp.PROD,
+                                   use_calc_stream=False)
                 task.wait()
 
             print("test reduce prod api ok")
@@ -386,8 +408,9 @@ def test_create_process_group_nccl(self):
             tensor_y = paddle.to_tensor(y)
             if pg.rank() == 0:
                 in_1, in_2 = paddle.split(tensor_x, 2)
-                task = dist.scatter(
-                    tensor_y, [in_1, in_2], 0, use_calc_stream=True)
+                task = dist.scatter(tensor_y, [in_1, in_2],
+                                    0,
+                                    use_calc_stream=True)
                 #task.wait()
                 paddle.device.cuda.synchronize()
             # rank 1
@@ -440,6 +463,7 @@ def test_create_process_group_nccl(self):
 
 
 class TestProcessGroupFp16(TestProcessGroupFp32):
+
     def setUp(self):
         paddle.seed(2022)
         random.seed(2022)
diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt
index 9af32a8aca741..ab985d73d5387 100755
--- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt
@@ -1,8 +1,11 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-    list(APPEND TEST_OPS ${TEST_OP})
-    set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50)
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  list(APPEND TEST_OPS ${TEST_OP})
+  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50)
 endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/ps/__init__.py b/python/paddle/fluid/tests/unittests/ps/__init__.py
index 1f919f0f05bf8..5a5bd1e0048c4 100644
--- a/python/paddle/fluid/tests/unittests/ps/__init__.py
+++ b/python/paddle/fluid/tests/unittests/ps/__init__.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.p
 
-# Note: On Windows, import form subdirectories such as dirA()->dirB(), current directory 
+# Note: On Windows, import from subdirectories such as dirA()->dirB(), current directory
 # will still be dirA(), But is should be dirB(). So it will ModulNotFoundError
 # please refer to https://stackoverflow.com/questions/8953844/import-module-from-subfolder
diff --git a/python/paddle/fluid/tests/unittests/ps/dataset_generator_A.py b/python/paddle/fluid/tests/unittests/ps/dataset_generator_A.py
new file mode 100755
index 0000000000000..1ab4b3580d6ad
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ps/dataset_generator_A.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid.incubate.data_generator as dg
+
+cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  # not read by this generator
+cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]  # not read by this generator
+cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]  # element-wise cont_max_ - cont_min_; not read here
+hash_dim_ = 1000001  # modulus applied to every hashed categorical value
+continuous_range_ = range(1, 14)  # column indices 1..13; not read by this generator
+categorical_range_ = range(14, 40)  # column indices 14..39 of the tab-separated input line
+
+
+class CriteoDataset(dg.MultiSlotDataGenerator):  # emits only the hashed categorical columns
+
+    def generate_sample(self, line):
+        """
+        Return a generator function that parses one tab-separated input `line`.
+        """
+
+        def reader():
+            """
+            Yield one sample: a list of ("C<k>", [hashed id]) pairs for k = 1..26.
+            """
+            features = line.rstrip('\n').split('\t')
+            feature_name = []
+            sparse_feature = []
+            for idx in categorical_range_:
+                sparse_feature.append(  # one single-element list per categorical column
+                    [hash(str(idx) + features[idx]) % hash_dim_])  # NOTE(review): on Python 3 str hash() differs per process unless PYTHONHASHSEED is fixed
+            for idx in categorical_range_:
+                feature_name.append("C" + str(idx - 13))  # column 14 -> "C1", ..., column 39 -> "C26"
+            yield list(zip(feature_name, sparse_feature))
+
+        return reader
+
+
+d = CriteoDataset()
+d.run_from_stdin()
diff --git a/python/paddle/fluid/tests/unittests/ps/dataset_generator_B.py b/python/paddle/fluid/tests/unittests/ps/dataset_generator_B.py
new file mode 100755
index 0000000000000..76b2468592dff
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ps/dataset_generator_B.py
@@ -0,0 +1,54 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid.incubate.data_generator as dg
+
+cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  # per-column offset subtracted before scaling
+cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]  # not read by this generator
+cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]  # element-wise cont_max_ - cont_min_; the scaling divisor
+hash_dim_ = 1000001  # not read by this generator
+continuous_range_ = range(1, 14)  # column indices 1..13 of the tab-separated input line
+categorical_range_ = range(14, 40)  # column indices 14..39; not read by this generator
+
+
+class CriteoDataset(dg.MultiSlotDataGenerator):  # emits the label and the scaled dense columns
+
+    def generate_sample(self, line):
+        """
+        Return a generator function that parses one tab-separated input `line`.
+        """
+
+        def reader():
+            """
+            Yield one sample with two slots, in order: [label], then 13 scaled dense values.
+            """
+            features = line.rstrip('\n').split('\t')
+            dense_feature = []
+            for idx in continuous_range_:
+                if features[idx] == "":  # an empty column is emitted as 0.0
+                    dense_feature.append(0.0)
+                else:
+                    dense_feature.append(  # (value - min) / (max - min) for this column
+                        (float(features[idx]) - cont_min_[idx - 1]) /
+                        cont_diff_[idx - 1])
+            label = [int(features[0])]  # column 0 holds the integer label
+            feature_name = ["dense_feature"]
+            feature_name.append("label")
+            yield list(zip(feature_name, [label] + [dense_feature]))  # NOTE(review): pairs "dense_feature" with the label and "label" with the dense values; confirm the names are meant to be swapped
+
+        return reader
+
+
+d = CriteoDataset()
+d.run_from_stdin()
diff --git a/python/paddle/fluid/tests/unittests/ps/download_data.sh b/python/paddle/fluid/tests/unittests/ps/download_data.sh
new file mode 100755
index 0000000000000..498d9df9c2b4a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ps/download_data.sh
@@ -0,0 +1,27 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz  # TLS certificate verification is disabled for this download
+tar -zxvf ctr_data.tar.gz  # presumably unpacks ./raw_data and ./test_data (both are renamed below)
+mv ./raw_data ./train_data_full
+mkdir train_data && cd train_data
+cp ../train_data_full/part-0 ../train_data_full/part-1 ./ && cd ..  # small train subset: two parts only
+mv ./test_data ./test_data_full
+mkdir test_data && cd test_data
+cp ../test_data_full/part-220 ./  && cd ..  # small test subset: one part only
+echo "Complete data download."
+echo "Full Train data stored in ./train_data_full "
+echo "Full Test data stored in ./test_data_full "
+echo "Rapid Verification train data stored in ./train_data "
+echo "Rapid Verification test data stored in ./test_data "
diff --git a/python/paddle/fluid/tests/unittests/ps/fl_async_ps_config.yaml b/python/paddle/fluid/tests/unittests/ps/fl_async_ps_config.yaml
new file mode 100755
index 0000000000000..3e02046f71c91
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ps/fl_async_ps_config.yaml
@@ -0,0 +1,39 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# refer to PaddleRec/models/rank/dnn/benchmark.yaml
+
+hyper_parameters:
+  optimizer:
+    class: Adam
+    learning_rate: 0.0001
+    adam_lazy_mode: True
+  sparse_inputs_slots: 27
+  sparse_feature_number: 1000001
+  sparse_feature_dim: 10
+  dense_input_dim: 13
+  fc_sizes: []
+
+runner:
+  sync_mode: "async"  # sync / async / geo / heter
+  is_fl_ps_mode: 1
+  reader_thread_num: 16
+  use_gpu: 0
+  batch_size: 2
+  train_files_path: "./train_data"
+  epoch_num: 4
+  
+  model_path: "../ps_dnn_model.py"
+
+  
diff --git a/python/paddle/fluid/tests/unittests/ps/fl_ps_trainer.py b/python/paddle/fluid/tests/unittests/ps/fl_ps_trainer.py
new file mode 100755
index 0000000000000..6018060bba534
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ps/fl_ps_trainer.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import numpy as np
+import time
+import paddle
+from paddle.distributed.ps.utils.public import ps_log_root_dir, debug_program
+import paddle.distributed.fleet as fleet
+import paddle.fluid as fluid
+
+
+def get_dataset(inputs, config, pipe_cmd, role="worker"):  # returns (dataset, file_list) for the given role
+    dataset = fluid.DatasetFactory().create_dataset()
+    dataset.set_use_var(inputs)
+    dataset.set_pipe_command(pipe_cmd)  # command string the dataset uses to preprocess its input files
+    dataset.set_batch_size(config.get('runner.batch_size'))
+    reader_thread_num = int(config.get('runner.reader_thread_num'))
+    dataset.set_thread(reader_thread_num)
+    train_files_path = config.get('runner.train_files_path')
+    print('train_data_files:{}'.format(train_files_path))
+    file_list = [  # every directory entry is used; no filtering by name or type
+        os.path.join(train_files_path, x) for x in os.listdir(train_files_path)
+    ]
+    if role == "worker":  # any role other than the two handled here keeps the full, unsharded list
+        file_list = fleet.util.get_file_shard(file_list)
+        print("worker file list: {}".format(file_list))
+    elif role == "heter_worker":
+        file_list = fleet.util.get_heter_file_shard(file_list)
+        print("heter worker file list: {}".format(file_list))
+
+    return dataset, file_list  # the file list is returned, not set on the dataset; callers call set_filelist
+
+
+def fl_ps_train():  # build the FL-PS program from fl_async_ps_config.yaml and run this process's role
+    # 0. get role
+    import paddle.distributed.fleet.base.role_maker as role_maker
+    role_maker = role_maker.PaddleCloudRoleMaker()  # rebinds the module alias to the role-maker instance
+    role_maker._generate_role()
+    fleet.util._set_role_maker(role_maker)
+
+    # 1. load yaml-config to dict-config
+    from ps_dnn_trainer import YamlHelper, StaticModel, get_user_defined_strategy
+    yaml_helper = YamlHelper()
+    config_yaml_path = '../ps/fl_async_ps_config.yaml'  # relative path: depends on the current working directory
+    config = yaml_helper.load_yaml(config_yaml_path)
+    #yaml_helper.print_yaml(config)
+
+    # 2. get static model
+    paddle.enable_static()
+    model = StaticModel(config)
+    feeds_list = model.create_feeds()
+    metrics = model.fl_net(feeds_list)  # return value is not used below
+    loss = model._cost
+
+    # 3. compile time - build program_desc
+    user_defined_strategy = get_user_defined_strategy(config)
+    a_sync_configs = user_defined_strategy.a_sync_configs
+    a_sync_configs["launch_barrier"] = True  # modified copy is assigned back on the next line
+    user_defined_strategy.a_sync_configs = a_sync_configs
+    print("launch_barrier: ",
+          user_defined_strategy.a_sync_configs["launch_barrier"])
+    learning_rate = config.get("hyper_parameters.optimizer.learning_rate")
+    inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
+    from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
+    ps_optimizer = ParameterServerOptimizer(inner_optimizer)
+    ps_optimizer._set_basic_info(loss, role_maker, inner_optimizer,
+                                 user_defined_strategy)
+    ps_optimizer.minimize_impl(loss)
+
+    # 4. runtime
+    from paddle.distributed.ps.the_one_ps import TheOnePSRuntime
+    _runtime_handle = TheOnePSRuntime()  # the refactored TheOnePSRuntime that lives under the ps directory
+    _runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs)
+    epoch_num = int(config.get('runner.epoch_num'))
+    # 4.1 run server - build fleet_desc
+    if role_maker._is_server():
+        _runtime_handle._init_server()
+        _runtime_handle._run_server()
+    # 4.2 run worker
+    elif role_maker._is_worker():
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        _runtime_handle._init_worker()
+        print('trainer get dataset')
+        inputs = feeds_list[1:-1]  # party A feeds: everything except the first and the last feed
+        dataset, file_list = get_dataset(inputs, config,
+                                         "python dataset_generator_A.py")
+        print("fluid.default_main_program: {}".format(
+            fluid.default_main_program()._heter_pipeline_opt))
+        for epoch in range(epoch_num):
+            # If party A and party B shuffle at file granularity, both sides must use the same fixed seed
+            dataset.set_filelist(file_list)
+            start_time = time.time()
+            exe.train_from_dataset(program=fluid.default_main_program(),
+                                   dataset=dataset,
+                                   print_period=2,
+                                   debug=False)
+            end_time = time.time()
+            print("trainer epoch %d finished, use time=%d\n" %
+                  ((epoch), end_time - start_time))
+        exe.close()
+        _runtime_handle._stop_worker()
+        print("Fl partyA Trainer Success!")
+    else:  # 4.3 neither server nor worker: runs party B using the "heter_worker" file shard
+        exe = fluid.Executor()
+        exe.run(fluid.default_startup_program())
+        _runtime_handle._init_worker()
+        inputs = [feeds_list[0],
+                  feeds_list[-1]]  # the order must match the slot order emitted by dataset_generator_B.py
+        dataset, file_list = get_dataset(inputs, config,
+                                         "python dataset_generator_B.py",
+                                         "heter_worker")
+        print("fluid.default_main_program: {}".format(
+            fluid.default_main_program()._heter_pipeline_opt))
+        for epoch in range(epoch_num):
+            dataset.set_filelist(file_list)
+            exe.train_from_dataset(program=fluid.default_main_program(),
+                                   dataset=dataset,
+                                   print_period=2,
+                                   debug=False)
+        exe.close()
+        _runtime_handle._stop_worker()
+        print("Fl partB Trainer Success!")
+
+
+if __name__ == '__main__':
+    fl_ps_train()
diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
index 0fd64b0d92305..2d430cac648aa 100755
--- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
+++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
@@ -26,6 +26,7 @@
 import ast
 import numpy as np
 import struct
+
 sys.path.append("..")
 from ps_dnn_model import StaticModel
 
@@ -35,7 +36,7 @@
 
 def is_distributed_env():
     node_role = os.getenv("TRAINING_ROLE")
-    logger.info("-- Role: {} --".format(node_role))
+    print("-- Role: {} --".format(node_role))
     if node_role is None:
         return False
     else:
@@ -43,6 +44,7 @@ def is_distributed_env():
 
 
 class YamlHelper(object):
+
     def load_yaml(self, yaml_file, other_part=None):
         part_list = ["runner", "hyper_parameters"]
         if other_part:
@@ -121,8 +123,8 @@ def pretty_print_envs(self, envs, header=None):
         for k, v in envs.items():
             max_k = max(max_k, len(k))
 
-        h_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " *
-                                                              spacing, max_v)
+        h_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(
+            max_k, " " * spacing, max_v)
         l_format = "    " + "|{{:>{}s}}{{}}{{:^{}s}}|\n".format(max_k, max_v)
         length = max_k + max_v + spacing
 
@@ -167,6 +169,14 @@ def get_user_defined_strategy(config):
     elif sync_mode == "async":
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
+        strategy.is_fl_ps_mode = True if config.get(
+            "runner.is_fl_ps_mode") == 1 else False
+        if strategy.is_fl_ps_mode == True:
+            strategy.pipeline = False
+            micro_num = 1
+            strategy.pipeline_configs = {
+                "accumulate_steps": micro_num
+            }  ## num_microbatches
     elif sync_mode == "geo":
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
@@ -215,13 +225,14 @@ def get_user_defined_strategy(config):
     print("strategy table config:", strategy.sparse_table_configs)
     a_sync_configs = strategy.a_sync_configs
     a_sync_configs["launch_barrier"] = False
+    # a_sync_configs["launch_barrier"] = True
     strategy.a_sync_configs = a_sync_configs
     print("launch_barrier: ", strategy.a_sync_configs["launch_barrier"])
 
     return strategy
 
 
-def get_distributed_strategy(user_defined_strategy):
+def get_distributed_strategy(user_defined_strategy):  # pslib
     from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 
     k_steps = user_defined_strategy.a_sync_configs["k_steps"]
@@ -251,29 +262,45 @@ def get_model(config):
 
 def parse_args():
     parser = argparse.ArgumentParser("PsTest train script")
-    parser.add_argument(
-        '-m', '--config_yaml', type=str, required=True, help='config file path')
-    parser.add_argument(
-        '-bf16',
-        '--pure_bf16',
-        type=ast.literal_eval,
-        default=False,
-        help="whether use bf16")
-
-    parser.add_argument(
-        '--run_minimize', type=int, default=0, help="test single pass")
-    parser.add_argument(
-        '--run_single_pass', type=int, default=0, help="test single pass")
-    parser.add_argument(
-        '--run_the_one_ps', type=int, default=0, help="test the_one_ps")
-    parser.add_argument(
-        '--debug_new_minimize', type=int, default=0, help="test single pass")
-    parser.add_argument(
-        '--debug_new_pass', type=int, default=0, help="test single pass")
-    parser.add_argument(
-        '--applied_pass_name', type=str, default="", help="test single pass")
-    parser.add_argument(
-        '--debug_the_one_ps', type=int, default=0, help="test the_one_ps")
+    parser.add_argument('-m',
+                        '--config_yaml',
+                        type=str,
+                        required=True,
+                        help='config file path')
+    parser.add_argument('-bf16',
+                        '--pure_bf16',
+                        type=ast.literal_eval,
+                        default=False,
+                        help="whether use bf16")
+
+    parser.add_argument('--run_minimize',
+                        type=int,
+                        default=0,
+                        help="test single pass")
+    parser.add_argument('--run_single_pass',
+                        type=int,
+                        default=0,
+                        help="test single pass")
+    parser.add_argument('--run_the_one_ps',
+                        type=int,
+                        default=0,
+                        help="test the_one_ps")
+    parser.add_argument('--debug_new_minimize',
+                        type=int,
+                        default=0,
+                        help="test single pass")
+    parser.add_argument('--debug_new_pass',
+                        type=int,
+                        default=0,
+                        help="test single pass")
+    parser.add_argument('--applied_pass_name',
+                        type=str,
+                        default="",
+                        help="test single pass")
+    parser.add_argument('--debug_the_one_ps',
+                        type=int,
+                        default=0,
+                        help="test the_one_ps")
 
     args = parser.parse_args()
     args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml))
@@ -298,6 +325,7 @@ def bf16_to_fp32(val):
 
 
 class DnnTrainer(object):
+
     def __init__(self, config):
         self.metrics = {}
         self.config = config
@@ -318,14 +346,14 @@ def init_fleet_with_gloo(self, use_gloo=False):
             fleet.init()
 
         if fleet.is_server():
-            logger.info("server: {} started".format(fleet.server_index()))
+            print("server: {} started".format(fleet.server_index()))
         else:
-            logger.info("worker: {} started".format(fleet.worker_index()))
+            print("worker: {} started".format(fleet.worker_index()))
 
     def run_minimize(self):
         self.init_fleet_with_gloo()
         self.model = get_model(self.config)
-        logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
+        print("cpu_num: {}".format(os.getenv("CPU_NUM")))
         self.input_data = self.model.create_feeds()
         self.metrics = self.model.net(self.input_data)
         loss = self.model._cost
@@ -337,14 +365,14 @@ def run_minimize(self):
 
         self.role_maker._generate_role()  # 必要
         if self.config['debug_new_minimize'] == 1:
-            logger.info("entering run_minimize -- new")
+            print("entering run_minimize -- new")
             from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
             ps_optimizer = ParameterServerOptimizer(inner_optimizer)
             ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
                                          user_defined_strategy)
             ps_optimizer.minimize_impl(loss)
         else:
-            logger.info("entering run_minimize -- old")
+            print("entering run_minimize -- old")
             fleet_obj = fleet.distributed_optimizer(
                 inner_optimizer, user_defined_strategy)  ## Fleet 对象
             fleet_obj.minimize(loss)
@@ -359,8 +387,8 @@ def run_minimize(self):
             debug_program(_main_file, loss.block.program)
         elif self.role_maker._is_heter_worker():
             _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str(
-                self.config[
-                    'debug_new_minimize']) + '_heter_worker_main.prototxt'
+                self.config['debug_new_minimize']
+            ) + '_heter_worker_main.prototxt'
             debug_program(_main_file, loss.block.program)
 
     def run_single_pass(self):
@@ -376,7 +404,7 @@ def run_single_pass(self):
         startup_program = paddle.static.default_startup_program()
         inner_optimizer.minimize(loss, startup_program)
         if self.config['debug_new_pass'] == 1:
-            logger.info("entering run {} - new".format(
+            print("entering run {} - new".format(
                 str(config["applied_pass_name"])))
             from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
             ps_optimizer = ParameterServerOptimizer(inner_optimizer)
@@ -390,7 +418,7 @@ def run_single_pass(self):
                                             ps_optimizer.pass_ctx._attrs)
             append_send_ops_pass.apply([_main], [None], ps_optimizer.pass_ctx)
         else:
-            logger.info("entering run {} - old".format(
+            print("entering run {} - old".format(
                 str(config["applied_pass_name"])))
             from paddle.fluid.incubate.fleet.parameter_server.ir import public as public
             dist_strategy = get_distributed_strategy(user_defined_strategy)
@@ -404,14 +432,14 @@ def run_single_pass(self):
             _main = worker.append_send_ops_pass(_main, compiled_config)
 
         if fleet.is_server():
-            _main_file = ps_log_root_dir + sync_mode + "_" + str(config[
-                "applied_pass_name"]) + '_debug:_' + str(self.config[
-                    'debug_new_pass']) + '_server_main.prototxt'
+            _main_file = ps_log_root_dir + sync_mode + "_" + str(
+                config["applied_pass_name"]) + '_debug:_' + str(
+                    self.config['debug_new_pass']) + '_server_main.prototxt'
             debug_program(_main_file, _main)
         elif fleet.is_worker():
-            _main_file = ps_log_root_dir + sync_mode + "_" + str(config[
-                "applied_pass_name"]) + '_debug:_' + str(self.config[
-                    'debug_new_pass']) + '_worker_main.prototxt'
+            _main_file = ps_log_root_dir + sync_mode + "_" + str(
+                config["applied_pass_name"]) + '_debug:_' + str(
+                    self.config['debug_new_pass']) + '_worker_main.prototxt'
             debug_program(_main_file, _main)
 
     def run_the_one_ps(self):
@@ -428,7 +456,7 @@ def run_the_one_ps(self):
 
         self.role_maker._generate_role()  # 必要
         if self.config['debug_the_one_ps'] == 1:
-            logger.info("entering run_the_one_ps -- new")
+            print("entering run_the_one_ps -- new")
 
             from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
             ps_optimizer = ParameterServerOptimizer(inner_optimizer)
@@ -442,20 +470,22 @@ def run_the_one_ps(self):
             if fleet.is_worker():
                 worker_desc = _runtime_handle.ps_desc_builder.build_worker_desc(
                 )
-                with open(ps_log_root_dir + sync_mode + '_' +
-                          'new_worker_ps_desc', 'w') as f:
+                with open(
+                        ps_log_root_dir + sync_mode + '_' +
+                        'new_worker_ps_desc', 'w') as f:
                     f.write(worker_desc)
             if fleet.is_server():
                 server_desc = _runtime_handle.ps_desc_builder.build_server_desc(
                 )
-                with open(ps_log_root_dir + sync_mode + '_' +
-                          'new_server_ps_desc', 'w') as f:
+                with open(
+                        ps_log_root_dir + sync_mode + '_' +
+                        'new_server_ps_desc', 'w') as f:
                     f.write(server_desc)
 
         else:
             pass
         '''          
-            logger.info("entering run_the_one_ps -- old")
+            print("entering run_the_one_ps -- old")
             fleet_obj = fleet.distributed_optimizer(
                 inner_optimizer, user_defined_strategy)  
             fleet_obj.minimize(loss)  
@@ -486,7 +516,7 @@ def run_the_one_ps(self):
 if __name__ == "__main__":
     paddle.enable_static()
     config = parse_args()
-    logger.info(">>>>>>>>>> python process started")
+    print(">>>>>>>>>> python process started")
     os.environ["CPU_NUM"] = str(config.get("runner.thread_num"))
     benchmark_main = DnnTrainer(config)
     if config['run_single_pass'] == 1:
diff --git a/python/paddle/fluid/tests/unittests/ps/test_fl_ps.py b/python/paddle/fluid/tests/unittests/ps/test_fl_ps.py
new file mode 100755
index 0000000000000..c2fc55efdfeb4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ps/test_fl_ps.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import shlex
+from paddle.fluid.tests.unittests.distributed_passes.dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists
+import os
+
+
+class FlPsTest(unittest.TestCase):  # placeholder test case for launching fl_ps_trainer.py
+
+    def test_launch_fl_ps(self):
+        pass  # the launch is disabled: the command below sits inside a string literal and never runs
+        '''
+        cmd = [
+            'python', '-m', 'paddle.distributed.fleet.launch', '--log_dir',
+            '/ps_log/fl_ps', '--servers', "127.0.0.1:8070", '--workers',
+            "127.0.0.1:8080,127.0.0.1:8081", '--heter_workers',
+            "127.0.0.1:8090,127.0.0.1:8091", '--heter_devices', "cpu",
+            '--worker_num', "2", '--heter_worker_num', "2", 'fl_ps_trainer.py'
+        ]
+        cmd = [shlex.quote(c) for c in cmd]
+        prepare_python_path_and_return_module(__file__)
+        exitcode = os.system(' '.join(cmd))
+        '''
+
+
+if __name__ == '__main__':  # script entry: reset log dirs, fetch data if missing, run the tests
+    remove_path_if_exists('/ps_log')
+    remove_path_if_exists('/ps_usr_print_log')
+    if not os.path.exists('./train_data'):  # download the sample data only when it is not already present
+        os.system('sh download_data.sh')
+        os.system('rm -rf ctr_data.tar.gz')
+        os.system('rm -rf train_data_full')  # remove the full-size copies after the download
+        os.system('rm -rf test_data_full')
+    unittest.main()  # NOTE(review): unittest.main() exits the process by default, so the cleanup below is not reached
+    if os.path.exists('./train_data'):
+        os.system('rm -rf train_data')
+        os.system('rm -rf test_data')
diff --git a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py
index 6752ea081a0e1..628d0d94ece12 100755
--- a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py
+++ b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py
@@ -31,6 +31,7 @@
 
 
 class TestTheOnePs(PsPassTestBase):
+
     def setUp(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/ps_dnn_model.py b/python/paddle/fluid/tests/unittests/ps_dnn_model.py
index 8d91e0f4678cb..d54e6cd643da5 100755
--- a/python/paddle/fluid/tests/unittests/ps_dnn_model.py
+++ b/python/paddle/fluid/tests/unittests/ps_dnn_model.py
@@ -17,10 +17,10 @@
 import paddle.nn.functional as F
 import math
 import paddle.distributed.fleet as fleet
-from paddle.distributed.ps.utils.public import logger
 
 
 class DNNLayer(nn.Layer):
+
     def __init__(self,
                  sparse_feature_number,
                  sparse_feature_dim,
@@ -90,7 +90,157 @@ def forward(self, sparse_inputs, dense_inputs):
         return y_dnn
 
 
+class FlDNNLayer(nn.Layer):
+    """Two-branch DNN whose branch outputs are added and fed to a top MLP.
+
+    Part A embeds the sparse slots, part B consumes the dense features; each
+    stage is built under a ``device_guard`` using the device tags set below.
+    """
+
+    def __init__(self,
+                 sparse_feature_number,
+                 sparse_feature_dim,
+                 dense_feature_dim,
+                 sparse_number,
+                 sync_mode=None):
+        super(FlDNNLayer, self).__init__()
+
+        self.PART_A_DEVICE_FlAG = 'gpu:0'
+        self.PART_A_JOINT_OP_DEVICE_FlAG = 'gpu:2'
+        self.PART_B_DEVICE_FlAG = 'gpu:1'
+        self.PART_B_JOINT_OP_DEVICE_FlAG = 'gpu:3'
+
+        self.sync_mode = sync_mode
+        self.sparse_feature_number = sparse_feature_number
+        self.sparse_feature_dim = sparse_feature_dim
+        self.slot_num = sparse_number
+        self.dense_feature_dim = dense_feature_dim
+
+        layer_sizes_a = [self.slot_num * self.sparse_feature_dim, 5,
+                         7]  # for test
+        layer_sizes_b = [self.dense_feature_dim, 6, 7]
+        layer_sizes_top = [7, 2]
+
+        self.embedding = paddle.nn.Embedding(
+            self.sparse_feature_number,
+            self.sparse_feature_dim,
+            sparse=True,
+            weight_attr=paddle.ParamAttr(
+                name="SparseFeatFactors",
+                initializer=paddle.nn.initializer.Uniform()))
+
+        # part_a fc: Linear + ReLU pairs. Sublayer names carry a per-branch
+        # prefix so later branches do not replace earlier registrations.
+        self._mlp_layers_a = []
+        for i in range(len(layer_sizes_a) - 1):
+            linear = paddle.nn.Linear(
+                in_features=layer_sizes_a[i],
+                out_features=layer_sizes_a[i + 1],
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Normal(
+                        std=1.0 / math.sqrt(layer_sizes_a[i]))))
+            self.add_sublayer('linear_a_%d' % i, linear)
+            self._mlp_layers_a.append(linear)
+            act = paddle.nn.ReLU()
+            self.add_sublayer('act_a_%d' % i, act)
+            self._mlp_layers_a.append(act)
+
+        # part_b fc
+        self._mlp_layers_b = []
+        for i in range(len(layer_sizes_b) - 1):
+            linear = paddle.nn.Linear(
+                in_features=layer_sizes_b[i],
+                out_features=layer_sizes_b[i + 1],
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Normal(
+                        std=1.0 / math.sqrt(layer_sizes_b[i]))))
+            self.add_sublayer('linear_b_%d' % i, linear)
+            self._mlp_layers_b.append(linear)
+            act = paddle.nn.ReLU()
+            self.add_sublayer('act_b_%d' % i, act)
+            self._mlp_layers_b.append(act)
+
+        # top fc
+        self._mlp_layers_top = []
+        for i in range(len(layer_sizes_top) - 1):
+            linear = paddle.nn.Linear(
+                in_features=layer_sizes_top[i],
+                out_features=layer_sizes_top[i + 1],
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Normal(
+                        std=1.0 / math.sqrt(layer_sizes_top[i]))))
+            self.add_sublayer('linear_top_%d' % i, linear)
+            self._mlp_layers_top.append(linear)
+            act = paddle.nn.ReLU()
+            self.add_sublayer('act_top_%d' % i, act)
+            self._mlp_layers_top.append(act)
+
+    def bottom_a_layer(self, sparse_inputs):
+        """Embed and concatenate the sparse slots, then run the part-A MLP."""
+        with paddle.fluid.device_guard(self.PART_A_DEVICE_FlAG):
+            sparse_embs = []
+            for s_input in sparse_inputs:
+                emb = self.embedding(s_input)
+                emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
+                sparse_embs.append(emb)
+            y = paddle.concat(x=sparse_embs, axis=1)
+            y = self._mlp_layers_a[0](y)
+            y = self._mlp_layers_a[1](y)
+            y = self._mlp_layers_a[2](y)
+        with paddle.fluid.device_guard(
+                self.PART_A_JOINT_OP_DEVICE_FlAG):  # joint point
+            bottom_a = self._mlp_layers_a[3](y)
+
+        return bottom_a
+
+    def bottom_b_layer(self, dense_inputs):
+        """Run the part-B MLP on the dense input."""
+        with paddle.fluid.device_guard(self.PART_B_DEVICE_FlAG):
+            y = self._mlp_layers_b[0](dense_inputs)
+            y = self._mlp_layers_b[1](y)
+            y = self._mlp_layers_b[2](y)
+            bottom_b = self._mlp_layers_b[3](y)
+
+        return bottom_b
+
+    def interactive_layer(self, bottom_a, bottom_b):
+        """Element-wise add of both bottom outputs (joint point)."""
+        with paddle.fluid.device_guard(
+                self.PART_B_JOINT_OP_DEVICE_FlAG):  # joint point
+            interactive = paddle.fluid.layers.elementwise_add(
+                bottom_a, bottom_b)
+        return interactive
+
+    def top_layer(self, interactive, label_input):
+        """Run the top MLP and return (auc, mean cross-entropy loss)."""
+        with paddle.fluid.device_guard(self.PART_B_DEVICE_FlAG):
+            y = self._mlp_layers_top[0](interactive)
+            y_top = self._mlp_layers_top[1](y)
+            predict_2d = paddle.nn.functional.softmax(y_top)
+            auc, batch_auc, [
+                self.batch_stat_pos, self.batch_stat_neg, self.stat_pos,
+                self.stat_neg
+            ] = paddle.static.auc(input=predict_2d,
+                                  label=label_input,
+                                  num_thresholds=2**12,
+                                  slide_steps=20)
+            cost = paddle.nn.functional.cross_entropy(input=y_top,
+                                                      label=label_input)
+            avg_cost = paddle.mean(x=cost)
+
+        return auc, avg_cost
+
+    def forward(self, sparse_inputs, dense_inputs, label_input):
+        """Run both bottoms, join them, and return (auc, avg_cost)."""
+        bottom_a = self.bottom_a_layer(sparse_inputs)
+        bottom_b = self.bottom_b_layer(dense_inputs)
+        interactive = self.interactive_layer(bottom_a, bottom_b)
+        auc, avg_cost = self.top_layer(interactive, label_input)
+        return auc, avg_cost
+
+
 class StaticModel():
+
     def __init__(self, config):
         self.cost = None
         self.infer_target_var = None
@@ -118,14 +268,14 @@ def _init_hyper_parameters(self):
         self.fc_sizes = self.config.get("hyper_parameters.fc_sizes")
 
     def create_feeds(self, is_infer=False):
-        dense_input = paddle.static.data(
-            name="dense_input",
-            shape=[None, self.dense_input_dim],
-            dtype="float32")
+        dense_input = paddle.static.data(name="dense_input",
+                                         shape=[None, self.dense_input_dim],
+                                         dtype="float32")
 
         sparse_input_ids = [
-            paddle.static.data(
-                name="C" + str(i), shape=[None, 1], dtype="int64")
+            paddle.static.data(name="C" + str(i),
+                               shape=[None, 1],
+                               dtype="int64")
             for i in range(1, self.sparse_inputs_slots)
         ]
 
@@ -140,20 +290,15 @@ def net(self, input, is_infer=False):
         self.dense_input = input[-1]
         sparse_number = self.sparse_inputs_slots - 1
 
-        dnn_model = DNNLayer(
-            self.sparse_feature_number,
-            self.sparse_feature_dim,
-            self.dense_input_dim,
-            sparse_number,
-            self.fc_sizes,
-            sync_mode=self.sync_mode)
-
+        dnn_model = DNNLayer(self.sparse_feature_number,
+                             self.sparse_feature_dim,
+                             self.dense_input_dim,
+                             sparse_number,
+                             self.fc_sizes,
+                             sync_mode=self.sync_mode)
         raw_predict_2d = dnn_model.forward(self.sparse_inputs, self.dense_input)
-
         predict_2d = paddle.nn.functional.softmax(raw_predict_2d)
-
         self.predict = predict_2d
-
         auc, batch_auc, [
             self.batch_stat_pos, self.batch_stat_neg, self.stat_pos,
             self.stat_neg
@@ -166,10 +311,28 @@ def net(self, input, is_infer=False):
             fetch_dict = {'auc': auc}
             return fetch_dict
 
-        cost = paddle.nn.functional.cross_entropy(
-            input=raw_predict_2d, label=self.label_input)
+        cost = paddle.nn.functional.cross_entropy(input=raw_predict_2d,
+                                                  label=self.label_input)
         avg_cost = paddle.mean(x=cost)
         self._cost = avg_cost
 
         fetch_dict = {'cost': avg_cost, 'auc': auc}
         return fetch_dict
+
+    def fl_net(self, input, is_infer=False):
+        """Build the FlDNNLayer graph from ``input``; return cost/auc fetches.
+
+        ``input`` is [label, sparse slots..., dense]; ``is_infer`` is unused.
+        """
+        self.label_input, self.dense_input = input[0], input[-1]
+        self.sparse_inputs = input[1:self.sparse_inputs_slots]
+        self.sparse_number = self.sparse_inputs_slots - 1
+        model = FlDNNLayer(self.sparse_feature_number,
+                           self.sparse_feature_dim,
+                           self.dense_input_dim,
+                           self.sparse_number,
+                           sync_mode=self.sync_mode)
+        auc, avg_cost = model.forward(self.sparse_inputs, self.dense_input,
+                                      self.label_input)
+        self._cost = avg_cost
+        return {'cost': avg_cost, 'auc': auc}
diff --git a/python/paddle/fluid/tests/unittests/py_precise_roi_pool.py b/python/paddle/fluid/tests/unittests/py_precise_roi_pool.py
index aa7b8420f4817..29721c86200aa 100644
--- a/python/paddle/fluid/tests/unittests/py_precise_roi_pool.py
+++ b/python/paddle/fluid/tests/unittests/py_precise_roi_pool.py
@@ -17,6 +17,7 @@
 
 
 class PyPrRoIPool(object):
+
     def __init__(self):
         pass
 
@@ -34,36 +35,32 @@ def _PrRoIPoolingMatCalculation(self, this_data, s_h, s_w, e_h, e_w, y0, x0,
         beta = y0 - float(s_h)
         lim_alpha = x1 - float(s_w)
         lim_beta = y1 - float(s_h)
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
+        tmp = (lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha +
+               0.5 * alpha * alpha) * (lim_beta - 0.5 * lim_beta * lim_beta -
+                                       beta + 0.5 * beta * beta)
         sum_out += self._PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp
 
         alpha = float(e_w) - x1
         lim_alpha = float(e_w) - x0
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
+        tmp = (lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha +
+               0.5 * alpha * alpha) * (lim_beta - 0.5 * lim_beta * lim_beta -
+                                       beta + 0.5 * beta * beta)
         sum_out += self._PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp
 
         alpha = x0 - float(s_w)
         beta = float(e_h) - y1
         lim_alpha = x1 - float(s_w)
         lim_beta = float(e_h) - y0
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
+        tmp = (lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha +
+               0.5 * alpha * alpha) * (lim_beta - 0.5 * lim_beta * lim_beta -
+                                       beta + 0.5 * beta * beta)
         sum_out += self._PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp
 
         alpha = float(e_w) - x1
         lim_alpha = float(e_w) - x0
-        tmp = (
-            lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha + 0.5 * alpha *
-            alpha) * (
-                lim_beta - 0.5 * lim_beta * lim_beta - beta + 0.5 * beta * beta)
+        tmp = (lim_alpha - 0.5 * lim_alpha * lim_alpha - alpha +
+               0.5 * alpha * alpha) * (lim_beta - 0.5 * lim_beta * lim_beta -
+                                       beta + 0.5 * beta * beta)
         sum_out += self._PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp
 
         return sum_out
@@ -141,9 +138,10 @@ def compute(self,
                                         w_iter + 1,
                                         max(win_start_h, float(h_iter)),
                                         max(win_start_w, float(w_iter)),
-                                        min(win_end_h, float(h_iter) + 1.0),
-                                        min(win_end_w, float(w_iter + 1.0)),
-                                        height, width)
+                                        min(win_end_h,
+                                            float(h_iter) + 1.0),
+                                        min(win_end_w,
+                                            float(w_iter + 1.0)), height, width)
 
                             out_data[i, c, ph, pw] = sum_out / win_size
 
diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt
index ffc78d33347b7..35a95749880bd 100644
--- a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt
@@ -1,10 +1,13 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 if(NOT WIN32)
-    set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/rnn/convert.py b/python/paddle/fluid/tests/unittests/rnn/convert.py
index 645f67fca277f..1a3e571269d66 100644
--- a/python/paddle/fluid/tests/unittests/rnn/convert.py
+++ b/python/paddle/fluid/tests/unittests/rnn/convert.py
@@ -53,10 +53,9 @@ def convert_params_for_net_static(np_net, paddle_net, place):
 
 def get_params_for_cell(np_cell, num_layers, idx):
     state = np_cell.parameters
-    weight_list = [
-        ('{}.weight_{}'.format(num_layers, idx), state['weight_ih']),
-        ('{}.weight_{}'.format(num_layers, idx + 1), state['weight_hh'])
-    ]
+    weight_list = [('{}.weight_{}'.format(num_layers, idx), state['weight_ih']),
+                   ('{}.weight_{}'.format(num_layers,
+                                          idx + 1), state['weight_hh'])]
     bias_list = [('{}.bias_{}'.format(num_layers, idx), state['bias_ih']),
                  ('{}.bias_{}'.format(num_layers, idx + 1), state['bias_hh'])]
     return weight_list, bias_list
diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
index dd1e18b89d29d..fbdc3ec8a4854 100644
--- a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
+++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py
@@ -17,11 +17,13 @@
 
 
 class LayerMixin(object):
+
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
 
 
 class LayerListMixin(LayerMixin):
+
     def __init__(self, layers=None):
         self._layers = list(layers) if layers else []
 
@@ -33,6 +35,7 @@ def __iter__(self):
 
 
 class SimpleRNNCell(LayerMixin):
+
     def __init__(self,
                  input_size,
                  hidden_size,
@@ -49,10 +52,10 @@ def __init__(self,
 
         self.parameters = dict()
         std = 1.0 / math.sqrt(hidden_size)
-        self.weight_ih = np.random.uniform(-std, std, (
-            hidden_size, input_size)).astype(dtype)
-        self.weight_hh = np.random.uniform(-std, std, (
-            hidden_size, hidden_size)).astype(dtype)
+        self.weight_ih = np.random.uniform(
+            -std, std, (hidden_size, input_size)).astype(dtype)
+        self.weight_hh = np.random.uniform(
+            -std, std, (hidden_size, hidden_size)).astype(dtype)
         self.parameters['weight_ih'] = self.weight_ih
         self.parameters['weight_hh'] = self.weight_hh
         if bias:
@@ -85,16 +88,17 @@ def forward(self, inputs, hx=None):
 
 
 class GRUCell(LayerMixin):
+
     def __init__(self, input_size, hidden_size, bias=True, dtype="float64"):
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.bias = bias
         self.parameters = dict()
         std = 1.0 / math.sqrt(hidden_size)
-        self.weight_ih = np.random.uniform(-std, std, (
-            3 * hidden_size, input_size)).astype(dtype)
-        self.weight_hh = np.random.uniform(-std, std, (
-            3 * hidden_size, hidden_size)).astype(dtype)
+        self.weight_ih = np.random.uniform(
+            -std, std, (3 * hidden_size, input_size)).astype(dtype)
+        self.weight_hh = np.random.uniform(
+            -std, std, (3 * hidden_size, hidden_size)).astype(dtype)
         self.parameters['weight_ih'] = self.weight_ih
         self.parameters['weight_hh'] = self.weight_hh
         if bias:
@@ -133,16 +137,17 @@ def forward(self, inputs, hx=None):
 
 
 class LSTMCell(LayerMixin):
+
     def __init__(self, input_size, hidden_size, bias=True, dtype="float64"):
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.bias = bias
         self.parameters = dict()
         std = 1.0 / math.sqrt(hidden_size)
-        self.weight_ih = np.random.uniform(-std, std, (
-            4 * hidden_size, input_size)).astype(dtype)
-        self.weight_hh = np.random.uniform(-std, std, (
-            4 * hidden_size, hidden_size)).astype(dtype)
+        self.weight_ih = np.random.uniform(
+            -std, std, (4 * hidden_size, input_size)).astype(dtype)
+        self.weight_hh = np.random.uniform(
+            -std, std, (4 * hidden_size, hidden_size)).astype(dtype)
         self.parameters['weight_ih'] = self.weight_ih
         self.parameters['weight_hh'] = self.weight_hh
         if bias:
@@ -327,6 +332,7 @@ def concat_states(states, bidirectional=False, state_components=1):
 
 
 class RNN(LayerMixin):
+
     def __init__(self, cell, is_reverse=False, time_major=False):
         super(RNN, self).__init__()
         self.cell = cell
@@ -347,6 +353,7 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
 
 
 class BiRNN(LayerMixin):
+
     def __init__(self, cell_fw, cell_bw, time_major=False):
         super(BiRNN, self).__init__()
         self.cell_fw = cell_fw
@@ -371,6 +378,7 @@ def forward(self,
 
 
 class RNNMixin(LayerListMixin):
+
     def forward(self, inputs, initial_states=None, sequence_length=None):
         batch_index = 1 if self.time_major else 0
         batch_size = inputs.shape[batch_index]
@@ -404,6 +412,7 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
 
 
 class SimpleRNN(RNNMixin):
+
     def __init__(self,
                  input_size,
                  hidden_size,
@@ -417,27 +426,36 @@ def __init__(self,
         bidirectional_list = ["bidirectional", "bidirect"]
         if direction in ["forward"]:
             is_reverse = False
-            cell = SimpleRNNCell(
-                input_size, hidden_size, nonlinearity=nonlinearity, dtype=dtype)
+            cell = SimpleRNNCell(input_size,
+                                 hidden_size,
+                                 nonlinearity=nonlinearity,
+                                 dtype=dtype)
             self.append(RNN(cell, is_reverse, time_major))
             for i in range(1, num_layers):
-                cell = SimpleRNNCell(
-                    hidden_size,
-                    hidden_size,
-                    nonlinearity=nonlinearity,
-                    dtype=dtype)
+                cell = SimpleRNNCell(hidden_size,
+                                     hidden_size,
+                                     nonlinearity=nonlinearity,
+                                     dtype=dtype)
                 self.append(RNN(cell, is_reverse, time_major))
         elif direction in bidirectional_list:
-            cell_fw = SimpleRNNCell(
-                input_size, hidden_size, nonlinearity=nonlinearity, dtype=dtype)
-            cell_bw = SimpleRNNCell(
-                input_size, hidden_size, nonlinearity=nonlinearity, dtype=dtype)
+            cell_fw = SimpleRNNCell(input_size,
+                                    hidden_size,
+                                    nonlinearity=nonlinearity,
+                                    dtype=dtype)
+            cell_bw = SimpleRNNCell(input_size,
+                                    hidden_size,
+                                    nonlinearity=nonlinearity,
+                                    dtype=dtype)
             self.append(BiRNN(cell_fw, cell_bw, time_major))
             for i in range(1, num_layers):
-                cell_fw = SimpleRNNCell(
-                    2 * hidden_size, hidden_size, nonlinearity, dtype=dtype)
-                cell_bw = SimpleRNNCell(
-                    2 * hidden_size, hidden_size, nonlinearity, dtype=dtype)
+                cell_fw = SimpleRNNCell(2 * hidden_size,
+                                        hidden_size,
+                                        nonlinearity,
+                                        dtype=dtype)
+                cell_bw = SimpleRNNCell(2 * hidden_size,
+                                        hidden_size,
+                                        nonlinearity,
+                                        dtype=dtype)
                 self.append(BiRNN(cell_fw, cell_bw, time_major))
         else:
             raise ValueError(
@@ -454,6 +472,7 @@ def __init__(self,
 
 
 class LSTM(RNNMixin):
+
     def __init__(self,
                  input_size,
                  hidden_size,
@@ -495,6 +514,7 @@ def __init__(self,
 
 
 class GRU(RNNMixin):
+
     def __init__(self,
                  input_size,
                  hidden_size,
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
index cade4b850cd1d..33dca32b76cd5 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle
+
 paddle.framework.set_default_dtype("float64")
 
 import numpy as np
@@ -23,6 +24,7 @@
 
 
 class TestSimpleRNNCell(unittest.TestCase):
+
     def __init__(self, bias=True, place="cpu"):
         super(TestSimpleRNNCell, self).__init__(methodName="runTest")
         self.bias = bias
@@ -32,8 +34,10 @@ def __init__(self, bias=True, place="cpu"):
     def setUp(self):
         paddle.disable_static(self.place)
         rnn1 = SimpleRNNCell(16, 32, bias=self.bias)
-        rnn2 = paddle.nn.SimpleRNNCell(
-            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        rnn2 = paddle.nn.SimpleRNNCell(16,
+                                       32,
+                                       bias_ih_attr=self.bias,
+                                       bias_hh_attr=self.bias)
         convert_params_for_cell(rnn1, rnn2)
 
         self.rnn1 = rnn1
@@ -61,6 +65,7 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def test_errors(self):
+
         def test_zero_hidden_size():
             cell = paddle.nn.SimpleRNNCell(-1, 0)
 
@@ -73,6 +78,7 @@ def runTest(self):
 
 
 class TestGRUCell(unittest.TestCase):
+
     def __init__(self, bias=True, place="cpu"):
         super(TestGRUCell, self).__init__(methodName="runTest")
         self.bias = bias
@@ -82,8 +88,10 @@ def __init__(self, bias=True, place="cpu"):
     def setUp(self):
         paddle.disable_static(self.place)
         rnn1 = GRUCell(16, 32, bias=self.bias)
-        rnn2 = paddle.nn.GRUCell(
-            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        rnn2 = paddle.nn.GRUCell(16,
+                                 32,
+                                 bias_ih_attr=self.bias,
+                                 bias_hh_attr=self.bias)
         convert_params_for_cell(rnn1, rnn2)
 
         self.rnn1 = rnn1
@@ -111,6 +119,7 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def test_errors(self):
+
         def test_zero_hidden_size():
             cell = paddle.nn.GRUCell(-1, 0)
 
@@ -123,6 +132,7 @@ def runTest(self):
 
 
 class TestLSTMCell(unittest.TestCase):
+
     def __init__(self, bias=True, place="cpu"):
         super(TestLSTMCell, self).__init__(methodName="runTest")
         self.bias = bias
@@ -131,8 +141,10 @@ def __init__(self, bias=True, place="cpu"):
 
     def setUp(self):
         rnn1 = LSTMCell(16, 32, bias=self.bias)
-        rnn2 = paddle.nn.LSTMCell(
-            16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+        rnn2 = paddle.nn.LSTMCell(16,
+                                  32,
+                                  bias_ih_attr=self.bias,
+                                  bias_hh_attr=self.bias)
         convert_params_for_cell(rnn1, rnn2)
 
         self.rnn1 = rnn1
@@ -147,9 +159,9 @@ def test_with_initial_state(self):
         prev_c = np.random.randn(4, 32)
 
         y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
-        y2, (h2, c2) = rnn2(
-            paddle.to_tensor(x),
-            (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
+        y2, (h2,
+             c2) = rnn2(paddle.to_tensor(x),
+                        (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -165,6 +177,7 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
 
     def test_errors(self):
+
         def test_zero_hidden_size():
             cell = paddle.nn.LSTMCell(-1, 0)
 
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py
index bb15b2713496d..b4a5887c593bd 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle
+
 paddle.framework.set_default_dtype("float64")
 paddle.enable_static()
 
@@ -24,6 +25,7 @@
 
 
 class TestSimpleRNNCell(unittest.TestCase):
+
     def __init__(self, bias=True, place="cpu"):
         super(TestSimpleRNNCell, self).__init__(methodName="runTest")
         self.bias = bias
@@ -37,8 +39,10 @@ def setUp(self):
         sp = paddle.static.Program()
         with paddle.fluid.unique_name.guard():
             with paddle.static.program_guard(mp, sp):
-                rnn2 = paddle.nn.SimpleRNNCell(
-                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+                rnn2 = paddle.nn.SimpleRNNCell(16,
+                                               32,
+                                               bias_ih_attr=self.bias,
+                                               bias_hh_attr=self.bias)
 
         place = self.place
         exe = paddle.static.Executor(place)
@@ -119,6 +123,7 @@ def runTest(self):
 
 
 class TestGRUCell(unittest.TestCase):
+
     def __init__(self, bias=True, place="cpu"):
         super(TestGRUCell, self).__init__(methodName="runTest")
         self.bias = bias
@@ -132,8 +137,10 @@ def setUp(self):
         sp = paddle.static.Program()
         with paddle.fluid.unique_name.guard():
             with paddle.static.program_guard(mp, sp):
-                rnn2 = paddle.nn.GRUCell(
-                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+                rnn2 = paddle.nn.GRUCell(16,
+                                         32,
+                                         bias_ih_attr=self.bias,
+                                         bias_hh_attr=self.bias)
 
         place = self.place
         exe = paddle.static.Executor(place)
@@ -215,6 +222,7 @@ def runTest(self):
 
 
 class TestLSTMCell(unittest.TestCase):
+
     def __init__(self, bias=True, place="cpu"):
         super(TestLSTMCell, self).__init__(methodName="runTest")
         self.bias = bias
@@ -228,8 +236,10 @@ def setUp(self):
         sp = paddle.static.Program()
         with paddle.fluid.unique_name.guard():
             with paddle.static.program_guard(mp, sp):
-                rnn2 = paddle.nn.LSTMCell(
-                    16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias)
+                rnn2 = paddle.nn.LSTMCell(16,
+                                          32,
+                                          bias_ih_attr=self.bias,
+                                          bias_hh_attr=self.bias)
 
         place = self.place
         exe = paddle.static.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py
index 0712d5be23e4b..f4dbc3bbbc7c8 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,12 +18,15 @@
 
 def create_model():
     hidden_size = 32
-    bilstm = paddle.nn.LSTM(
-        hidden_size, hidden_size, num_layers=1, direction='bidirectional')
+    bilstm = paddle.nn.LSTM(hidden_size,
+                            hidden_size,
+                            num_layers=1,
+                            direction='bidirectional')
     return bilstm
 
 
 class TestRNNProgramClone(TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -35,14 +38,14 @@ def test_rnn_with_cudnn_clone(self):
         # test a typical case in static graph usage: create two nearly
         # identical program with a shared startup program to share their
         # parameters
-        # 
+        #
         # when creating a parameter, the name is checked. If there is already
         # a parameter with the same name, which is the output of a operator
         # (i.e. its creator), its re-creation is skipped.
-        # 
+        #
         # but if that parameter has been the output of more than one operator,
         # an exception is raised. For special cases, white list is added.
-        # flattening rnn's parameters for the need to call cudnn kernel is such 
+        # flattening rnn's parameters for the need to call cudnn kernel is such
         # a case.
         with paddle.static.program_guard(train_program, startup_prog):
             with paddle.fluid.unique_name.guard():
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
index 263efedc714b2..83a50c2a4472d 100755
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle
+
 paddle.set_default_dtype("float64")
 from paddle.fluid.layers import sequence_mask
 
@@ -26,6 +27,7 @@
 
 
 class TestSimpleRNN(unittest.TestCase):
+
     def __init__(self, time_major=True, direction="forward", place="cpu"):
         super(TestSimpleRNN, self).__init__("runTest")
         self.time_major = time_major
@@ -38,10 +40,16 @@ def setUp(self):
         # `__init__` to avoid using an error device set by another test case.
         place = paddle.set_device(self.place)
         paddle.disable_static(place)
-        rnn1 = SimpleRNN(
-            16, 32, 2, time_major=self.time_major, direction=self.direction)
-        rnn2 = paddle.nn.SimpleRNN(
-            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        rnn1 = SimpleRNN(16,
+                         32,
+                         2,
+                         time_major=self.time_major,
+                         direction=self.direction)
+        rnn2 = paddle.nn.SimpleRNN(16,
+                                   32,
+                                   2,
+                                   time_major=self.time_major,
+                                   direction=self.direction)
         convert_params_for_net(rnn1, rnn2)
 
         self.rnn1 = rnn1
@@ -107,6 +115,7 @@ def runTest(self):
 
 
 class TestGRU(unittest.TestCase):
+
     def __init__(self, time_major=True, direction="forward", place="cpu"):
         super(TestGRU, self).__init__("runTest")
         self.time_major = time_major
@@ -194,6 +203,7 @@ def runTest(self):
 
 
 class TestLSTM(unittest.TestCase):
+
     def __init__(self, time_major=True, direction="forward", place="cpu"):
         super(TestLSTM, self).__init__("runTest")
         self.time_major = time_major
@@ -206,10 +216,16 @@ def setUp(self):
         # `__init__` to avoid using an error device set by another test case.
         place = paddle.set_device(self.place)
         paddle.disable_static(place)
-        rnn1 = LSTM(
-            16, 32, 2, time_major=self.time_major, direction=self.direction)
-        rnn2 = paddle.nn.LSTM(
-            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        rnn1 = LSTM(16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction)
+        rnn2 = paddle.nn.LSTM(16,
+                              32,
+                              2,
+                              time_major=self.time_major,
+                              direction=self.direction)
         convert_params_for_net(rnn1, rnn2)
 
         self.rnn1 = rnn1
@@ -226,9 +242,9 @@ def test_with_initial_state(self):
         prev_c = np.random.randn(2 * self.num_directions, 4, 32)
 
         y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
-        y2, (h2, c2) = rnn2(
-            paddle.to_tensor(x),
-            (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
+        y2, (h2,
+             c2) = rnn2(paddle.to_tensor(x),
+                        (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
@@ -287,6 +303,7 @@ def predict_test_util(place, mode, stop_gradient=True):
     np.random.seed(123)
 
     class Net(paddle.nn.Layer):
+
         def __init__(self):
             super(Net, self).__init__()
             self.rnn = getattr(paddle.nn, mode)(16,
@@ -308,8 +325,8 @@ def forward(self, input):
     y = y * mask
     loss = paddle.mean(y)
     loss.backward()
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=0.1, parameters=rnn.parameters())
+    optimizer = paddle.optimizer.Adam(learning_rate=0.1,
+                                      parameters=rnn.parameters())
     optimizer.step()
     rnn.eval()
     y, _ = rnn(x)
@@ -318,8 +335,7 @@ def forward(self, input):
     rnn.train()
 
     rnn = paddle.jit.to_static(
-        rnn, [paddle.static.InputSpec(
-            shape=[None, None, 16], dtype=x.dtype)])
+        rnn, [paddle.static.InputSpec(shape=[None, None, 16], dtype=x.dtype)])
     paddle.jit.save(rnn, "./inference/%s_infer" % mode)
 
     paddle.enable_static()
@@ -327,9 +343,9 @@ def forward(self, input):
     new_scope = paddle.static.Scope()
     with paddle.static.scope_guard(new_scope):
         exe = paddle.static.Executor(place)
-        [inference_program, feed_target_names,
-         fetch_targets] = paddle.static.load_inference_model(
-             "./inference/%s_infer" % mode, exe)
+        [inference_program, feed_target_names, fetch_targets
+         ] = paddle.static.load_inference_model("./inference/%s_infer" % mode,
+                                                exe)
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: x.numpy()},
                           fetch_list=fetch_targets)
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
index 5de539ebf3939..436bf0b6ea01d 100755
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 import paddle
+
 paddle.set_default_dtype("float64")
 from paddle.fluid.layers import sequence_mask
+
 paddle.enable_static()
 
 import numpy as np
@@ -27,6 +29,7 @@
 
 
 class TestSimpleRNN(unittest.TestCase):
+
     def __init__(self, time_major=True, direction="forward", place="cpu"):
         super(TestSimpleRNN, self).__init__("runTest")
         self.time_major = time_major
@@ -38,19 +41,21 @@ def setUp(self):
         # Since `set_device` is global, set `set_device` in `setUp` rather than
         # `__init__` to avoid using an error device set by another test case.
         place = paddle.set_device(self.place)
-        rnn1 = SimpleRNN(
-            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        rnn1 = SimpleRNN(16,
+                         32,
+                         2,
+                         time_major=self.time_major,
+                         direction=self.direction)
 
         mp = paddle.static.Program()
         sp = paddle.static.Program()
         with paddle.fluid.unique_name.guard():
             with paddle.static.program_guard(mp, sp):
-                rnn2 = paddle.nn.SimpleRNN(
-                    16,
-                    32,
-                    2,
-                    time_major=self.time_major,
-                    direction=self.direction)
+                rnn2 = paddle.nn.SimpleRNN(16,
+                                           32,
+                                           2,
+                                           time_major=self.time_major,
+                                           direction=self.direction)
 
         exe = paddle.static.Executor(place)
         scope = paddle.fluid.Scope()
@@ -171,6 +176,7 @@ def runTest(self):
 
 
 class TestGRU(unittest.TestCase):
+
     def __init__(self, time_major=True, direction="forward", place="cpu"):
         super(TestGRU, self).__init__("runTest")
         self.time_major = time_major
@@ -317,6 +323,7 @@ def runTest(self):
 
 
 class TestLSTM(unittest.TestCase):
+
     def __init__(self, time_major=True, direction="forward", place="cpu"):
         super(TestLSTM, self).__init__("runTest")
         self.time_major = time_major
@@ -328,19 +335,21 @@ def setUp(self):
         # Since `set_device` is global, set `set_device` in `setUp` rather than
         # `__init__` to avoid using an error device set by another test case.
         place = paddle.set_device(self.place)
-        rnn1 = LSTM(
-            16, 32, 2, time_major=self.time_major, direction=self.direction)
+        rnn1 = LSTM(16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction)
 
         mp = paddle.static.Program()
         sp = paddle.static.Program()
         with paddle.fluid.unique_name.guard():
             with paddle.static.program_guard(mp, sp):
-                rnn2 = paddle.nn.LSTM(
-                    16,
-                    32,
-                    2,
-                    time_major=self.time_major,
-                    direction=self.direction)
+                rnn2 = paddle.nn.LSTM(16,
+                                      32,
+                                      2,
+                                      time_major=self.time_major,
+                                      direction=self.direction)
 
         exe = paddle.static.Executor(place)
         scope = paddle.fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py b/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py
index 85aebf86ed9ba..2442e6b7a3b8c 100755
--- a/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import paddle
+
 paddle.set_default_dtype("float64")
 from paddle.fluid.layers import sequence_mask
 
@@ -24,6 +25,7 @@
 
 
 class TestRNNWrapper(unittest.TestCase):
+
     def __init__(self, time_major=True, direction="forward", place="cpu"):
         super(TestRNNWrapper, self).__init__("runTest")
         self.time_major = time_major
@@ -102,6 +104,7 @@ def runTest(self):
 
 
 class TestBiRNNWrapper(unittest.TestCase):
+
     def __init__(self, time_major=True, place="cpu"):
         super(TestBiRNNWrapper, self).__init__("runTest")
         self.time_major = time_major
diff --git a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
index a24c087448211..61d643916f701 100644
--- a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
+++ b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
@@ -41,6 +41,7 @@
 
 
 class TestRowParallelLinearAPI(TestCollectiveAPIRunnerBase):
+
     def __init__(self):
         self.global_ring_id = 0
 
@@ -50,8 +51,9 @@ def get_model(self, main_prog, startup_program, rank):
             np.random.seed(2020)
             np_array = np.random.rand(1000, 16)
 
-            data = paddle.static.data(
-                name='tindata', shape=[10, 1000], dtype="float32")
+            data = paddle.static.data(name='tindata',
+                                      shape=[10, 1000],
+                                      dtype="float32")
             paddle.distributed.broadcast(data, src=0)
             data = paddle.split(data, 2, axis=1)[rank]
             if rank == 0:
@@ -70,7 +72,8 @@ def get_model(self, main_prog, startup_program, rank):
                 axis=0,
                 num_partitions=2,
                 weight_attr=param_attr,
-                bias_attr=True, )
+                bias_attr=True,
+            )
 
             return [linear_out]
 
diff --git a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt
index c6ba82f8cbf0f..5c13f56d44646 100644
--- a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt
@@ -1,8 +1,11 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 set_tests_properties(test_sequence_conv PROPERTIES TIMEOUT 120)
 set_tests_properties(test_sequence_concat PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py
index 34b6f6dc8e545..5dca219899035 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -25,6 +26,7 @@
 
 
 class TestSequenceConcat(OpTest):
+
     def setLoD(self):
         self.lod1 = [7, 3]
         self.lod2 = [12, 8]
@@ -52,6 +54,7 @@ def test_dx(self):
 
 
 class TestSequenceConcatCase2(TestSequenceConcat):
+
     def setLoD(self):
         self.lod1 = [10, 0]
         self.lod2 = [12, 8]
@@ -59,6 +62,7 @@ def setLoD(self):
 
 
 class TestSequenceConcatCase3(TestSequenceConcat):
+
     def setLoD(self):
         self.lod1 = [10, 0]
         self.lod2 = [20, 0]
@@ -66,6 +70,7 @@ def setLoD(self):
 
 
 class TestSequenceConcatCase4(TestSequenceConcat):
+
     def setLoD(self):
         self.lod1 = [0, 10]
         self.lod2 = [0, 20]
@@ -73,6 +78,7 @@ def setLoD(self):
 
 
 class TestSequenceConcatCase5(TestSequenceConcat):
+
     def setLoD(self):
         self.lod1 = [0, 10]
         self.lod2 = [20, 0]
@@ -80,7 +86,9 @@ def setLoD(self):
 
 
 class TestSequenceConcatOpError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_input_list():
             # the input type must be list
             x_data = fluid.layers.data(name='x', shape=[4], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_conv.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_conv.py
index ed804d701a84b..e79b66c9990d2 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_conv.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_conv.py
@@ -18,6 +18,7 @@
 import numpy as np
 import random
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -46,8 +47,8 @@ def seqconv(x,
                     [offset[i] - in_begin, offset[i + 1] - offset[i]])
                 if padding_trainable:
                     sub_w = padding_data[j:j + pad_size, :]
-                    col[offset[i]:offset[i] + pad_size, j * M:(j + 1) *
-                        M] = sub_w
+                    col[offset[i]:offset[i] + pad_size,
+                        j * M:(j + 1) * M] = sub_w
                 out_begin = offset[i] + pad_size
                 in_begin = offset[i]
 
@@ -58,8 +59,8 @@ def seqconv(x,
                     sub_w = padding_data[begin_pad + context_start + j -
                                          pad_size:begin_pad + context_start +
                                          j, :]
-                    col[offset[i + 1] - pad_size:offset[i + 1], j * M:(j + 1) *
-                        M] = sub_w
+                    col[offset[i + 1] - pad_size:offset[i + 1],
+                        j * M:(j + 1) * M] = sub_w
                 in_end = offset[i + 1]
                 out_end = offset[i + 1] - pad_size
             if in_end <= in_begin:
@@ -70,6 +71,7 @@ def seqconv(x,
 
 
 class TestSeqProject(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = 'sequence_conv'
@@ -83,8 +85,8 @@ def setUp(self):
             return
 
         # one level, batch size
-        x = np.random.uniform(0.1, 1, [self.input_size[0],
-                                       self.input_size[1]]).astype('float32')
+        x = np.random.uniform(
+            0.1, 1, [self.input_size[0], self.input_size[1]]).astype('float32')
         w = np.random.uniform(0.1, 1, [
             self.context_length * self.input_size[1], self.output_represention
         ]).astype('float32')
@@ -124,51 +126,48 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.padding_trainable:
-            self.check_grad(
-                set(self.inputs_val), 'Out', max_relative_error=0.05)
+            self.check_grad(set(self.inputs_val),
+                            'Out',
+                            max_relative_error=0.05)
 
     def test_check_grad_input(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.05,
-            no_grad_set=set(self.inputs_val_no_x))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.05,
+                        no_grad_set=set(self.inputs_val_no_x))
 
     def test_check_grad_padding_data(self):
         if self.padding_trainable:
-            self.check_grad(
-                ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter']))
+            self.check_grad(['PaddingData'],
+                            'Out',
+                            no_grad_set=set(['X', 'Filter']))
 
     def test_check_grad_Filter(self):
-        self.check_grad(
-            ['Filter'],
-            'Out',
-            max_relative_error=0.05,
-            no_grad_set=set(self.inputs_val_no_f))
+        self.check_grad(['Filter'],
+                        'Out',
+                        max_relative_error=0.05,
+                        no_grad_set=set(self.inputs_val_no_f))
 
     def test_check_grad_input_filter(self):
         if self.padding_trainable:
-            self.check_grad(
-                ['X', 'Filter'],
-                'Out',
-                max_relative_error=0.05,
-                no_grad_set=set(['PaddingData']))
+            self.check_grad(['X', 'Filter'],
+                            'Out',
+                            max_relative_error=0.05,
+                            no_grad_set=set(['PaddingData']))
 
     def test_check_grad_padding_input(self):
         if self.padding_trainable:
-            self.check_grad(
-                self.inputs_val_no_f,
-                'Out',
-                max_relative_error=0.05,
-                no_grad_set=set(['Filter']))
+            self.check_grad(self.inputs_val_no_f,
+                            'Out',
+                            max_relative_error=0.05,
+                            no_grad_set=set(['Filter']))
 
     def test_check_grad_padding_filter(self):
         if self.padding_trainable:
-            self.check_grad(
-                self.inputs_val_no_x,
-                'Out',
-                max_relative_error=0.05,
-                no_grad_set=set(['X']))
+            self.check_grad(self.inputs_val_no_x,
+                            'Out',
+                            max_relative_error=0.05,
+                            no_grad_set=set(['X']))
 
     def init_test_case(self):
         self.input_row = 11
@@ -187,6 +186,7 @@ def init_test_case(self):
 
 
 class TestSeqProjectCase1(TestSeqProject):
+
     def init_test_case(self):
         self.input_row = 11
         self.context_start = -1
@@ -204,6 +204,7 @@ def init_test_case(self):
 
 
 class TestSeqProjectCase2Len0(TestSeqProject):
+
     def init_test_case(self):
         self.input_row = 11
         self.context_start = -1
@@ -221,6 +222,7 @@ def init_test_case(self):
 
 
 class TestSeqProjectCase3(TestSeqProject):
+
     def init_test_case(self):
         self.input_row = 25
         self.context_start = 2
@@ -241,12 +243,15 @@ def init_test_case(self):
 
 
 class TestSeqConvApi(unittest.TestCase):
+
     def test_api(self):
         import paddle.fluid as fluid
 
         x = fluid.layers.data('x', shape=[32], lod_level=1)
-        y = fluid.layers.sequence_conv(
-            input=x, num_filters=2, filter_size=3, padding_start=None)
+        y = fluid.layers.sequence_conv(input=x,
+                                       num_filters=2,
+                                       filter_size=3,
+                                       padding_start=None)
 
         place = fluid.CPUPlace()
         x_tensor = fluid.create_lod_tensor(
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_enumerate_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_enumerate_op.py
index 9878e6f74139d..c2832127573e0 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_enumerate_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_enumerate_op.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -39,6 +40,7 @@ def sequence_enumerate(input_seq, in_lod, win_size, pad_value):
 
 
 class TestSequenceEnumerateOp(OpTest):
+
     def setUp(self):
         self.op_type = "sequence_enumerate"
         self.init_test_case()
@@ -60,6 +62,7 @@ def init_test_case(self):
 
 
 class TesSequenceEnumerateOpInt64(TestSequenceEnumerateOp):
+
     def init_test_case(self):
         self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
         self.lod = [[9, 4, 11, 6]]
@@ -71,6 +74,7 @@ def init_test_case(self):
 
 
 class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp):
+
     def init_test_case(self):
         self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
         self.lod = [[9, 4, 11, 6]]
@@ -82,6 +86,7 @@ def init_test_case(self):
 
 
 class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp):
+
     def init_test_case(self):
         self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
         self.lod = [[9, 4, 11, 6]]
@@ -93,6 +98,7 @@ def init_test_case(self):
 
 
 class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp):
+
     def init_test_case(self):
         self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
         self.lod = [[9, 4, 11, 6]]
@@ -104,6 +110,7 @@ def init_test_case(self):
 
 
 class TestSequenceEnumerateOpLargePadValueSeqLen0(TestSequenceEnumerateOp):
+
     def init_test_case(self):
         self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
         self.lod = [[0, 14, 0, 16, 0]]
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_erase_op.py
index 9e060201fe8df..6e9023d03c5f4 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_erase_op.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -37,6 +38,7 @@ def sequence_erase(in_seq, lod0, tokens):
 
 
 class TestSequenceEraseOpInt32(OpTest):
+
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
@@ -52,6 +54,7 @@ def test_check_output(self):
 
 
 class TestSequenceEraseOpInt32LoD2(OpTest):
+
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
@@ -67,6 +70,7 @@ def test_check_output(self):
 
 
 class TestSequenceEraseOpInt64(OpTest):
+
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
@@ -82,6 +86,7 @@ def test_check_output(self):
 
 
 class TestSequenceEraseOpInt64SeqLen0(OpTest):
+
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
@@ -97,6 +102,7 @@ def test_check_output(self):
 
 
 class TestSequenceEraseOpEmpty(OpTest):
+
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand.py
index b3d877a0cd6df..3e75b40baf8fc 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand.py
@@ -17,11 +17,13 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class TestSequenceExpand(OpTest):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [3, 40]).astype('float64')
         y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float64')
@@ -81,6 +83,7 @@ def test_check_grad(self):
 
 
 class TestSequenceExpandCase1(TestSequenceExpand):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 20]).astype('float64')
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float64')
@@ -90,6 +93,7 @@ def set_data(self):
 
 
 class TestSequenceExpandCase2(TestSequenceExpand):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [1, 2, 50]).astype('float64')
         x_lod = [[1]]
@@ -100,6 +104,7 @@ def set_data(self):
 
 
 class TestSequenceExpandCase3(TestSequenceExpand):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [4, 25]).astype('float64')
         x_lod = [[1, 1, 1, 1]]
@@ -109,6 +114,7 @@ def set_data(self):
 
 
 class TestSequenceExpandCase4(TestSequenceExpand):
+
     def set_data(self):
         data = np.random.uniform(0.1, 1, [5 * 20, 1])
         x_data = np.array(data).reshape([5, 20]).astype('float64')
@@ -119,6 +125,7 @@ def set_data(self):
 
 
 class TestSequenceExpandCase5(TestSequenceExpand):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [6, 20]).astype('float64')
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float64')
@@ -128,6 +135,7 @@ def set_data(self):
 
 
 class TestSequenceExpandCase6(TestSequenceExpand):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [4, 25]).astype('float64')
         x_lod = [[1, 1, 0, 1, 1]]
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand_as.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand_as.py
index 98996e21e1cd6..2cab179b3c5b4 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand_as.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_expand_as.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 import paddle.fluid as fluid
 from op_test import OpTest
@@ -24,6 +25,7 @@
 
 
 class TestSequenceExpandAs(OpTest):
+
     def setUp(self):
         self.op_type = 'sequence_expand_as'
         self.set_data()
@@ -60,6 +62,7 @@ def test_check_grad(self):
 
 
 class TestSequenceExpandAsCase1(TestSequenceExpandAs):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 20]).astype('float64')
         x_lod = [[2, 3]]
@@ -69,6 +72,7 @@ def set_data(self):
 
 
 class TestSequenceExpandAsCase2(TestSequenceExpandAs):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 20]).astype('float64')
         x_lod = [[2, 3]]
@@ -78,6 +82,7 @@ def set_data(self):
 
 
 class TestSequenceExpandAsCase3(TestSequenceExpandAs):
+
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [1, 2, 50]).astype('float64')
         x_lod = [[1]]
@@ -87,6 +92,7 @@ def set_data(self):
 
 
 class TestSequenceExpandAsOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input x must be Variable
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_first_step.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_first_step.py
index 0e7f9202fde82..8d21ad789eab8 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_first_step.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_first_step.py
@@ -18,11 +18,13 @@
 import copy
 import unittest
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class TestSequenceFirstStepOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -35,12 +37,11 @@ def test_Variable():
 
             def test_input_dtype():
                 # the dtype of input must be int64
-                type_data = fluid.layers.data(
-                    name='type_data',
-                    shape=[7, 1],
-                    append_batch_size=False,
-                    dtype='int64',
-                    lod_level=1)
+                type_data = fluid.layers.data(name='type_data',
+                                              shape=[7, 1],
+                                              append_batch_size=False,
+                                              dtype='int64',
+                                              lod_level=1)
                 fluid.layers.sequence_last_step(type_data)
 
             self.assertRaises(TypeError, test_input_dtype)
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_last_step.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_last_step.py
index ea3a29a832e3d..0e8fe66d74979 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_last_step.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_last_step.py
@@ -18,11 +18,13 @@
 import copy
 import unittest
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class TestSequenceLastStepOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -35,12 +37,11 @@ def test_Variable():
 
             def test_input_dtype():
                 # the dtype of input must be int64
-                type_data = fluid.layers.data(
-                    name='type_data',
-                    shape=[7, 1],
-                    append_batch_size=False,
-                    dtype='int64',
-                    lod_level=1)
+                type_data = fluid.layers.data(name='type_data',
+                                              shape=[7, 1],
+                                              append_batch_size=False,
+                                              dtype='int64',
+                                              lod_level=1)
                 fluid.layers.sequence_last_step(type_data)
 
             self.assertRaises(TypeError, test_input_dtype)
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_mask.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_mask.py
index de41235fd3705..af733edfb61c8 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_mask.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_mask.py
@@ -19,11 +19,13 @@
 import copy
 import unittest
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class SequenceMaskTestBase(OpTest):
+
     def initDefaultParameters(self):
         self.op_type = 'sequence_mask'
         self.maxlen = 10
@@ -49,13 +51,13 @@ def setUp(self):
     def calc_ground_truth_mask(self):
         maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
         shape = self.x.shape + (maxlen, )
-        index_broadcast = np.broadcast_to(
-            np.reshape(
-                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
-            shape=shape)
-        x_broadcast = np.broadcast_to(
-            np.reshape(
-                self.x, newshape=self.x.shape + (-1, )), shape=shape)
+        index_broadcast = np.broadcast_to(np.reshape(
+            range(maxlen), newshape=[1] * self.x.ndim + [-1]),
+                                          shape=shape)
+        x_broadcast = np.broadcast_to(np.reshape(self.x,
+                                                 newshape=self.x.shape +
+                                                 (-1, )),
+                                      shape=shape)
         return (index_broadcast < x_broadcast).astype(self.mask_dtype)
 
     def test_check_output(self):
@@ -63,36 +65,43 @@ def test_check_output(self):
 
 
 class SequenceMaskTest1(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'bool'
 
 
 class SequenceMaskTest2(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'uint8'
 
 
 class SequenceMaskTest3(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'int32'
 
 
 class SequenceMaskTest4(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'float32'
 
 
 class SequenceMaskTest5(SequenceMaskTestBase):
+
     def initParameters(self):
         self.mask_dtype = 'float64'
 
 
 class SequenceMaskTest6(SequenceMaskTestBase):
+
     def initParameters(self):
         self.maxlen = -1
 
 
 class SequenceMaskTestBase_tensor_attr(OpTest):
+
     def initDefaultParameters(self):
         self.op_type = 'sequence_mask'
         self.maxlen = 10
@@ -116,13 +125,13 @@ def setUp(self):
     def calc_ground_truth_mask(self):
         maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
         shape = self.x.shape + (maxlen, )
-        index_broadcast = np.broadcast_to(
-            np.reshape(
-                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
-            shape=shape)
-        x_broadcast = np.broadcast_to(
-            np.reshape(
-                self.x, newshape=self.x.shape + (-1, )), shape=shape)
+        index_broadcast = np.broadcast_to(np.reshape(
+            range(maxlen), newshape=[1] * self.x.ndim + [-1]),
+                                          shape=shape)
+        x_broadcast = np.broadcast_to(np.reshape(self.x,
+                                                 newshape=self.x.shape +
+                                                 (-1, )),
+                                      shape=shape)
         return (index_broadcast < x_broadcast).astype(self.mask_dtype)
 
     def test_check_output(self):
@@ -130,31 +139,37 @@ def test_check_output(self):
 
 
 class SequenceMaskTest1_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'bool'
 
 
 class SequenceMaskTest2_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'uint8'
 
 
 class SequenceMaskTest3_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'int32'
 
 
 class SequenceMaskTest4_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'float32'
 
 
 class SequenceMaskTest5_tensor_attr(SequenceMaskTestBase_tensor_attr):
+
     def initParameters(self):
         self.mask_dtype = 'float64'
 
 
 class TestSequenceMaskOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.uniform(1, 5, [4]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_pad_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_pad_op.py
index 7d2ba834de163..934e0ebe8fd78 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_pad_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_pad_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -23,6 +24,7 @@
 
 
 class TestSequencePadOp(OpTest):
+
     def set_attr(self):
         self.x_shape = [12, 10]
         self.x_len_lod = [[2, 3, 4, 3]]
@@ -53,8 +55,8 @@ def compute(self):
         x_data = self.inputs['X'][0]
         pad_value_data = self.inputs['PadValue']
         if pad_value_data.shape == (1, ):
-            pad_value_data = np.broadcast_to(
-                pad_value_data, shape=x_data.shape[1:])
+            pad_value_data = np.broadcast_to(pad_value_data,
+                                             shape=x_data.shape[1:])
         padded_sequences = []
         start_idx = 0
         for l in x_len_lod_0:
@@ -84,6 +86,7 @@ def test_check_grad(self):
 
 
 class TestSequencePadOp2(TestSequencePadOp):
+
     def set_attr(self):
         self.x_shape = [12, 10]
         self.x_len_lod = [[2, 3, 4, 3]]
@@ -93,6 +96,7 @@ def set_attr(self):
 
 
 class TestSequencePadOp3(TestSequencePadOp):
+
     def set_attr(self):
         self.x_shape = [12, 10]
         self.x_len_lod = [[2, 3, 4, 3]]
@@ -102,6 +106,7 @@ def set_attr(self):
 
 
 class TestSequencePadOp4(TestSequencePadOp):
+
     def set_attr(self):
         self.x_shape = [12, 10]
         self.x_len_lod = [[2, 3, 4, 3]]
@@ -111,6 +116,7 @@ def set_attr(self):
 
 
 class TestSequencePadOp5(TestSequencePadOp):
+
     def set_attr(self):
         self.x_shape = [12, 2, 5]
         self.x_len_lod = [[2, 3, 4, 3]]
@@ -120,6 +126,7 @@ def set_attr(self):
 
 
 class TestSequencePadOp6(TestSequencePadOp):
+
     def set_attr(self):
         self.x_shape = [12, 2, 5]
         self.x_len_lod = [[2, 3, 4, 3]]
@@ -129,6 +136,7 @@ def set_attr(self):
 
 
 class TestSequencePadOp7(TestSequencePadOp):
+
     def set_attr(self):
         self.x_shape = [12, 2, 5]
         self.x_len_lod = [[2, 3, 4, 3]]
@@ -138,6 +146,7 @@ def set_attr(self):
 
 
 class TestSequencePadOp8(TestSequencePadOp):
+
     def set_attr(self):
         self.x_shape = [12, 2, 5]
         self.x_len_lod = [[0, 8, 0, 4, 0]]
@@ -147,29 +156,35 @@ def set_attr(self):
 
 
 class TestSequencePadOpError(unittest.TestCase):
+
     def test_error(self):
+
         def test_x_variable():
             # the input x type must be Variable
             x = np.random.random((2, 4)).astype("float32")
-            pad_value = fluid.layers.assign(input=np.array(
-                [0.0], dtype=np.float32))
+            pad_value = fluid.layers.assign(
+                input=np.array([0.0], dtype=np.float32))
             fluid.layers.sequence_pad(x=x, pad_value=pad_value)
 
         self.assertRaises(TypeError, test_x_variable)
 
         def test_pad_value_variable():
-            x1 = fluid.layers.data(
-                name='x1', shape=[10, 5], dtype='float32', lod_level=1)
+            x1 = fluid.layers.data(name='x1',
+                                   shape=[10, 5],
+                                   dtype='float32',
+                                   lod_level=1)
             pad_value1 = np.array([0.0], dtype=np.float32)
             fluid.layers.sequence_pad(x=x1, pad_value=pad_value1)
 
         self.assertRaises(TypeError, test_pad_value_variable)
 
         def test_dtype():
-            x2 = fluid.layers.data(
-                name='x2', shape=[10, 5], dtype='int16', lod_level=1)
-            pad_value2 = fluid.layers.assign(input=np.array(
-                [0.0], dtype=np.int32))
+            x2 = fluid.layers.data(name='x2',
+                                   shape=[10, 5],
+                                   dtype='int16',
+                                   lod_level=1)
+            pad_value2 = fluid.layers.assign(
+                input=np.array([0.0], dtype=np.int32))
             fluid.layers.sequence_pad(x=x2, pad_value=pad_value2)
 
         self.assertRaises(TypeError, test_dtype)
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py
index 499955df8f10a..eff40454c4e41 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest, skip_check_grad_ci
 from test_reorder_lod_tensor import convert_to_offset
@@ -54,6 +55,7 @@ def compute_seqpool_sqrt(x, offset, out, pad_value=0.0):
 
 
 class TestSeqAvgPool(OpTest):
+
     def set_lod(self):
         return [[11]]
 
@@ -95,54 +97,62 @@ def test_check_grad(self):
 
 
 class TestSeqAvgPoolBatch1(TestSeqAvgPool):
+
     def set_lod(self):
         return [[11]]
 
     def set_lod_data(self):
         lod = self.set_lod()
-        x, _ = self.get_sequence_batch_size_1_input(
-            lod=lod, shape=[lod[0][0], 23])
+        x, _ = self.get_sequence_batch_size_1_input(lod=lod,
+                                                    shape=[lod[0][0], 23])
         return x
 
 
 class TestSeqAvgPoolInstance0(TestSeqAvgPool):
+
     def set_lod(self):
         return [[0, 0, 4, 0, 3, 0, 0, 5, 0, 0]]
 
     def set_lod_data(self):
         lod = self.set_lod()
-        x, _ = self.get_sequence_instance_size_0_input(
-            lod=lod, shape=[sum(lod[0]), 10])
+        x, _ = self.get_sequence_instance_size_0_input(lod=lod,
+                                                       shape=[sum(lod[0]), 10])
         return x
 
 
 class TestSeqAvgPoolLen0(TestSeqAvgPool):
+
     def set_lod(self):
         return [[0, 4, 0, 7, 0]]
 
 
 class TestSeqAvgPoolLen0LoDLevel2(TestSeqAvgPool):
+
     def set_lod(self):
         return [[2, 0, 1, 2], [0, 4, 0, 7, 0]]
 
 
 class TestSeqSumPool(TestSeqAvgPool):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.1, 'pooltype': "SUM"}
         compute_seqpool_sum(x, offset, out, self.attrs["pad_value"])
 
 
 class TestSeqSumPoolLen0(TestSeqSumPool):
+
     def set_lod(self):
         return [[0, 4, 0, 7, 0]]
 
 
 class TestSeqSumPoolLen0LoDLevel2(TestSeqSumPool):
+
     def set_lod(self):
         return [[2, 0, 1, 2], [0, 4, 0, 7, 0]]
 
 
 class TestSeqMaxPool(TestSeqAvgPool):
+
     def set_lod(self):
         return [[13]]
 
@@ -175,32 +185,38 @@ def compute(self, x, offset, out):
 
 
 class TestSeqMaxPoolLen0(TestSeqMaxPool):
+
     def set_lod(self):
         return [[0, 1, 1, 5, 6, 0]]
 
 
 class TestSeqMaxPoolLen0LoDLevel2(TestSeqMaxPool):
+
     def set_lod(self):
         return [[2, 0, 3, 1], [0, 1, 1, 5, 6, 0]]
 
 
 class TestSeqSqrtPool(TestSeqAvgPool):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.0, 'pooltype': "SQRT"}
         compute_seqpool_sqrt(x, offset, out, self.attrs["pad_value"])
 
 
 class TestSeqSqrtPoolLen0(TestSeqSqrtPool):
+
     def set_lod(self):
         return [[0, 7, 0, 2, 2, 0]]
 
 
 class TestSeqSqrtPoolLen0LoDLevel2(TestSeqSqrtPool):
+
     def set_lod(self):
         return [[1, 2, 0, 3], [0, 7, 0, 2, 2, 0]]
 
 
 class TestSeqLastPool(TestSeqAvgPool):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.0, 'pooltype': "LAST"}
         level = len(offset) - 1
@@ -213,16 +229,19 @@ def compute(self, x, offset, out):
 
 
 class TestSeqLastPoolLen0(TestSeqLastPool):
+
     def set_lod(self):
         return [[0, 3, 4, 0, 4, 0]]
 
 
 class TestSeqLastPoolLen0LoDLevel2(TestSeqLastPool):
+
     def set_lod(self):
         return [[1, 0, 2, 3], [0, 3, 4, 0, 4, 0]]
 
 
 class TestSeqFirstPool(TestSeqAvgPool):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.3, 'pooltype': "FIRST"}
         level = len(offset) - 1
@@ -235,16 +254,19 @@ def compute(self, x, offset, out):
 
 
 class TestSeqFirstPoolLen0(TestSeqFirstPool):
+
     def set_lod(self):
         return [[0, 2, 0, 3, 6, 0]]
 
 
 class TestSeqFirstPoolLen0LoDLevel2(TestSeqFirstPool):
+
     def set_lod(self):
         return [[1, 0, 2, 3], [0, 2, 0, 3, 6, 0]]
 
 
 class TestSeqAvgPool2D(TestSeqAvgPool):
+
     def set_lod(self):
         return [[4, 1, 3, 5]]
 
@@ -273,16 +295,19 @@ def compute(self, x, offset, out):
 
 
 class TestSeqAvgPool2DLen0(TestSeqAvgPool2D):
+
     def set_lod(self):
         return [[0, 5, 0, 8, 0]]
 
 
 class TestSeqAvgPool2DLen0LoDLevel2(TestSeqAvgPool2D):
+
     def set_lod(self):
         return [[1, 0, 4], [0, 5, 0, 8, 0]]
 
 
 class TestSeqSumPool2D(TestSeqAvgPool2D):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.2, 'pooltype': "SUM"}
         level = len(offset) - 1
@@ -296,16 +321,19 @@ def compute(self, x, offset, out):
 
 
 class TestSeqSumPool2DLen0(TestSeqSumPool2D):
+
     def set_lod(self):
         return [[0, 8, 0, 5, 0]]
 
 
 class TestSeqSumPool2DLen0LoDLevel2(TestSeqSumPool2D):
+
     def set_lod(self):
         return [[1, 0, 4], [0, 8, 0, 5, 0]]
 
 
 class TestSeqSqrtPool2D(TestSeqAvgPool2D):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.0, 'pooltype': "SQRT"}
         level = len(offset) - 1
@@ -326,21 +354,26 @@ def test_check_grad(self):
             out = out[0]
         self.outputs['MaxIndex'] = \
             np.zeros(out.shape).astype('int32')
-        self.check_grad(
-            ["X"], "Out", max_relative_error=0.06, check_dygraph=False)
+        self.check_grad(["X"],
+                        "Out",
+                        max_relative_error=0.06,
+                        check_dygraph=False)
 
 
 class TestSeqSqrtPool2DLen0(TestSeqSqrtPool2D):
+
     def set_lod(self):
         return [[0, 8, 0, 5, 0]]
 
 
 class TestSeqSqrtPool2DLen0LoDLevel2(TestSeqSqrtPool2D):
+
     def set_lod(self):
         return [[1, 0, 2, 2], [0, 8, 0, 5, 0]]
 
 
 class TestSeqMaxPool2D(TestSeqAvgPool2D):
+
     def set_lod(self):
         return [[4, 1, 3, 5]]
 
@@ -374,11 +407,13 @@ def compute(self, x, offset, out):
 
 
 class TestSeqMaxPool2DLen0(TestSeqMaxPool2D):
+
     def set_lod(self):
         return [[0, 3, 0, 10, 0]]
 
 
 class TestSeqMaxPool2DLen0LoDLevel2(TestSeqMaxPool2D):
+
     def set_lod(self):
         return [[1, 0, 2, 2], [0, 3, 0, 10, 0]]
 
@@ -386,6 +421,7 @@ def set_lod(self):
 @skip_check_grad_ci(reason="Grad computation does not apply to Sequence MAX "
                     "Pool executed when is_test is true.")
 class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 1.0, 'pooltype': "MAX", 'is_test': True}
         level = len(offset) - 1
@@ -404,16 +440,19 @@ def test_check_grad(self):
 
 
 class TestSeqMaxPool2DInferenceLen0(TestSeqMaxPool2DInference):
+
     def set_lod(self):
         return [[0, 3, 0, 10, 0]]
 
 
 class TestSeqMaxPool2DInferenceLen0LoDLevel2(TestSeqMaxPool2DInference):
+
     def set_lod(self):
         return [[1, 0, 2, 2], [0, 3, 0, 10, 0]]
 
 
 class TestSeqLastPool2D(TestSeqAvgPool2D):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.0, 'pooltype': "LAST"}
         level = len(offset) - 1
@@ -427,16 +466,19 @@ def compute(self, x, offset, out):
 
 
 class TestSeqLastPool2DLen0(TestSeqLastPool2D):
+
     def set_lod(self):
         return [[0, 3, 0, 1, 9, 0]]
 
 
 class TestSeqLastPool2DLen0LoDLevel2(TestSeqLastPool2D):
+
     def set_lod(self):
         return [[1, 0, 2, 3], [0, 3, 0, 1, 9, 0]]
 
 
 class TestSeqFirstPool2D(TestSeqAvgPool2D):
+
     def compute(self, x, offset, out):
         self.attrs = {"pad_value": 0.0, 'pooltype': "FIRST"}
         level = len(offset) - 1
@@ -450,11 +492,13 @@ def compute(self, x, offset, out):
 
 
 class TestSeqFirstPool2DLen0(TestSeqFirstPool2D):
+
     def set_lod(self):
         return [[0, 3, 0, 3, 7, 0]]
 
 
 class TestSeqFirstPool2DLen0LoDLevel2(TestSeqFirstPool2D):
+
     def set_lod(self):
         return [[1, 0, 2, 3], [0, 3, 0, 3, 7, 0]]
 
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_reshape.py
index 6540c6a094448..7a20f70c2daee 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_reshape.py
@@ -18,6 +18,7 @@
 import numpy as np
 import math
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -25,6 +26,7 @@
 
 
 class TestSequenceReshape(OpTest):
+
     def init_data(self):
         self.dimension = 12
         self.x_lod = [[4, 1, 3, 3]]
@@ -58,6 +60,7 @@ def test_check_grad(self):
 
 
 class TestSequenceReshape_reduce(TestSequenceReshape):
+
     def init_data(self):
         self.dimension = 24
         self.x_lod = [[4, 2, 2, 4]]
@@ -65,6 +68,7 @@ def init_data(self):
 
 
 class TestSequenceReshape_same(TestSequenceReshape):
+
     def init_data(self):
         self.dimension = 12
         self.x_lod = [[4, 2, 2, 4]]
@@ -72,6 +76,7 @@ def init_data(self):
 
 
 class TestSequenceReshape_reduce_seq_len0(TestSequenceReshape):
+
     def init_data(self):
         self.dimension = 24
         self.x_lod = [[0, 6, 0, 2, 4]]
@@ -79,6 +84,7 @@ def init_data(self):
 
 
 class TestSequenceReshape_reduce_seq_len0_case1(TestSequenceReshape):
+
     def init_data(self):
         self.dimension = 24
         self.x_lod = [[0, 2, 8, 2, 0]]
@@ -86,7 +92,9 @@ def init_data(self):
 
 
 class TestSequenceReshapeOpError(unittest.TestCase):
+
     def test_error(self):
+
         def test_variable():
             x = np.random.random((2, 4)).astype("float32")
             fluid.layers.sequence_reshape(x=x, new_dim=4)
@@ -94,12 +102,11 @@ def test_variable():
         self.assertRaises(TypeError, test_variable)
 
         def test_dtype():
-            x1 = fluid.layers.data(
-                name='x1',
-                shape=[2, 6],
-                append_batch_size=False,
-                dtype='float16',
-                lod_level=1)
+            x1 = fluid.layers.data(name='x1',
+                                   shape=[2, 6],
+                                   append_batch_size=False,
+                                   dtype='float16',
+                                   lod_level=1)
             fluid.layers.sequence_reshape(x=x1, new_dim=4)
 
         self.assertRaises(TypeError, test_dtype)
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_reverse.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_reverse.py
index 4ffec9737af11..8e1b447f92fe5 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_reverse.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_reverse.py
@@ -17,11 +17,13 @@
 import paddle.fluid.core as core
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class TestSequenceReverseBase(OpTest):
+
     def initParameters(self):
         pass
 
@@ -34,8 +36,16 @@ def setUp(self):
         self.x = np.random.random(self.size).astype(self.dtype)
         self.y = self.get_output()
 
-        self.inputs = {'X': (self.x, [self.lod, ]), }
-        self.outputs = {'Y': (self.y, [self.lod, ]), }
+        self.inputs = {
+            'X': (self.x, [
+                self.lod,
+            ]),
+        }
+        self.outputs = {
+            'Y': (self.y, [
+                self.lod,
+            ]),
+        }
 
     def get_output(self):
         tmp_x = np.reshape(self.x, newshape=[self.x.shape[0], -1])
@@ -56,31 +66,37 @@ def test_grad(self):
 
 
 class TestSequenceReserve1(TestSequenceReverseBase):
+
     def initParameters(self):
         self.size = (12, 10)
         self.lod = [4, 5, 3]
 
 
 class TestSequenceReverse2(TestSequenceReverseBase):
+
     def initParameters(self):
         self.size = (12, 10)
         self.lod = [12]
 
 
 class TestSequenceReverse3(TestSequenceReverseBase):
+
     def initParameters(self):
         self.size = (12, 10)
         self.lod = [3, 0, 6, 3]
 
 
 class TestSequenceReverse4(TestSequenceReverseBase):
+
     def initParameters(self):
         self.size = (12, 10)
         self.lod = [0, 2, 10, 0]
 
 
 class TestSequenceReverseOpError(unittest.TestCase):
+
     def test_error(self):
+
         def test_variable():
             # the input type must be Variable
             x_data = np.random.random((2, 4)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_scatter_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_scatter_op.py
index 1cc78c85b5065..2b8e826182909 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_scatter_op.py
@@ -15,11 +15,13 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class TestSequenceScatterOp(OpTest):
+
     def init_lod(self):
         return [[30, 50, 40]]
 
@@ -55,31 +57,37 @@ def test_check_grad(self):
 
 
 class TestSequenceScatterOpSeqLen0(TestSequenceScatterOp):
+
     def init_lod(self):
         return [[60, 60, 00]]
 
 
 class TestSequenceScatterOpSeqLen0Case1(TestSequenceScatterOp):
+
     def init_lod(self):
         return [[0, 60, 60]]
 
 
 class TestSequenceScatterOpSeqLen0Case2(TestSequenceScatterOp):
+
     def init_lod(self):
         return [[60, 0, 60]]
 
 
 class TestSequenceScatterOpSeqLen0Case3(TestSequenceScatterOp):
+
     def init_lod(self):
         return [[120, 0, 0]]
 
 
 class TestSequenceScatterOpSeqLen0Case4(TestSequenceScatterOp):
+
     def init_lod(self):
         return [[0, 120, 0]]
 
 
 class TestSequenceScatterOpSeqLen0Case5(TestSequenceScatterOp):
+
     def init_lod(self):
         return [[0, 0, 120]]
 
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_slice_op.py
index 4d254ea6d4f3e..b961bdc4e8516 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_slice_op.py
@@ -17,11 +17,13 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class TestSequenceSliceOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         # only supprot one level LoD
@@ -61,6 +63,7 @@ def test_check_grad(self):
 
 
 class TestSequenceSliceOpSeqlen0Case0(TestSequenceSliceOp):
+
     def init_test_case(self):
         self.x_dim = (100, 3, 2)
         self.x_lod = [[20, 30, 0, 30, 20]]
@@ -69,6 +72,7 @@ def init_test_case(self):
 
 
 class TestSequenceSliceOpSeqlen0Case1(TestSequenceSliceOp):
+
     def init_test_case(self):
         self.x_dim = (100, 3, 2)
         self.x_lod = [[0, 70, 0, 30, 0]]
@@ -77,6 +81,7 @@ def init_test_case(self):
 
 
 class TestSequenceSliceOpSeqlen0Case2(TestSequenceSliceOp):
+
     def init_test_case(self):
         self.x_dim = (100, 3, 2)
         self.x_lod = [[0, 100, 0, 0, 0]]
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
index cb92a68bde638..db07a0eaa2a7c 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
@@ -18,12 +18,14 @@
 import numpy as np
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 from test_softmax_op import stable_softmax
 
 
 class TestSequenceSoftmaxOp(OpTest):
+
     def setUp(self):
         self.op_type = "sequence_softmax"
         self.use_cudnn = False
@@ -45,7 +47,9 @@ def setUp(self):
 
         self.inputs = {"X": (x, self.lod)}
         self.outputs = {"Out": out}
-        self.attrs = {'use_cudnn': self.use_cudnn, }
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+        }
 
     def init_lod(self):
         self.lod = [[40, 10, 30, 30]]
@@ -72,21 +76,25 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
+
     def init_op_type(self):
         self.use_cudnn = True
 
 
 class TestSequenceSoftmaxOpSeqLen0Case0(TestSequenceSoftmaxOp):
+
     def init_lod(self):
         self.lod = [[40, 0, 40, 30]]
 
 
 class TestSequenceSoftmaxOpSeqLen0Case1(TestSequenceSoftmaxOp):
+
     def init_lod(self):
         self.lod = [[0, 40, 70, 0]]
 
 
 class TestSequenceSoftmaxOpSeqLen0Case2(TestSequenceSoftmaxOp):
+
     def init_lod(self):
         self.lod = [[0, 0, 0, 110]]
 
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_topk_avg_pooling.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_topk_avg_pooling.py
index fe9aa5ad02578..55a0c8d0bc3cb 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_topk_avg_pooling.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_topk_avg_pooling.py
@@ -18,11 +18,13 @@
 import numpy as np
 from copy import deepcopy
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
 
 class TestSequenceTopkAvgPoolingOp(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.set_data()
@@ -72,8 +74,9 @@ def calc_gradient(self, pos_data, topks, channel_num, row, col):
                     for k_idx in range(len(topks)):
                         for k in range(topks[k_idx]):
                             if pos_data[pos_idx + k] != -1:
-                                gradient[in_idx + pos_data[
-                                    pos_idx + k]] += dout_val / topks[k_idx]
+                                gradient[in_idx +
+                                         pos_data[pos_idx +
+                                                  k]] += dout_val / topks[k_idx]
                 in_offset += row_size * col_size
                 pos_offset += row_size * max_k
         return gradient
@@ -109,10 +112,10 @@ def compute(self):
 
                     offset += col_lod[0][idx]
 
-            out_tmp = out_tmp.reshape([channel_num, -1, len(topks)]).transpose(
-                1, 0, 2)
-            pos_tmp = pos_tmp.reshape([channel_num, -1, max_k]).transpose(1, 0,
-                                                                          2)
+            out_tmp = out_tmp.reshape([channel_num, -1,
+                                       len(topks)]).transpose(1, 0, 2)
+            pos_tmp = pos_tmp.reshape([channel_num, -1,
+                                       max_k]).transpose(1, 0, 2)
             out = np.vstack(
                 (out, out_tmp.reshape([-1, len(topks) * channel_num])))
             pos = np.hstack((pos, pos_tmp.flatten()))
@@ -148,6 +151,7 @@ def test_check_grad(self):
 
 
 class TestSequenceTopkAvgPoolingOpCase1(TestSequenceTopkAvgPoolingOp):
+
     def set_data(self):
         topks = [2, 3]
         channel_num = 5
@@ -161,8 +165,11 @@ def test_api(self):
         x = fluid.layers.data(name='x', shape=[1], lod_level=1)
         row = fluid.layers.data(name='row', shape=[10], lod_level=1)
         col = fluid.layers.data(name='col', shape=[10], lod_level=1)
-        topk_avg = fluid.contrib.sequence_topk_avg_pooling(
-            input=x, row=row, col=col, topks=[1, 3, 5], channel_num=5)
+        topk_avg = fluid.contrib.sequence_topk_avg_pooling(input=x,
+                                                           row=row,
+                                                           col=col,
+                                                           topks=[1, 3, 5],
+                                                           channel_num=5)
 
         place = fluid.CPUPlace()
         x_tensor = fluid.create_lod_tensor(
@@ -174,12 +181,13 @@ def test_api(self):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(
-            feed={'x': x_tensor,
-                  'row': row_tensor,
-                  'col': col_tensor},
-            fetch_list=[topk_avg],
-            return_numpy=False)
+        ret = exe.run(feed={
+            'x': x_tensor,
+            'row': row_tensor,
+            'col': col_tensor
+        },
+                      fetch_list=[topk_avg],
+                      return_numpy=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_unpad_op.py
index 1d212296227f7..ab60fbfde33b5 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_unpad_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_unpad_op.py
@@ -16,6 +16,7 @@
 import six
 import numpy as np
 import sys
+
 sys.path.append("../")
 from op_test import OpTest
 
@@ -23,6 +24,7 @@
 
 
 class TestSequenceUnpadOp(OpTest):
+
     def init(self):
         self.length = [2, 3, 4]
         self.x_shape = (3, 40)
@@ -59,6 +61,7 @@ def test_check_grad(self):
 
 
 class TestSequenceUnpadOp2(TestSequenceUnpadOp):
+
     def init(self):
         self.length = [2, 3, 4]
         self.x_shape = (3, 5, 4, 3)
@@ -66,6 +69,7 @@ def init(self):
 
 
 class TestSequenceUnpadOp3(TestSequenceUnpadOp):
+
     def init(self):
         self.length = [5, 2, 3, 4]
         self.x_shape = (4, 5, 3, 3, 6)
@@ -73,6 +77,7 @@ def init(self):
 
 
 class TestSequenceUnpadOp4(TestSequenceUnpadOp):
+
     def init(self):
         self.length = [5, 0, 0, 4]
         self.x_shape = (4, 5, 3, 3, 6)
@@ -80,6 +85,7 @@ def init(self):
 
 
 class TestSequenceUnpadOp5(TestSequenceUnpadOp):
+
     def init(self):
         self.length = [0, 4, 3, 0]
         self.x_shape = (4, 5, 3, 3, 6)
@@ -87,7 +93,9 @@ def init(self):
 
 
 class TestSequenceUnpadOpError(unittest.TestCase):
+
     def test_error(self):
+
         def test_x_variable():
             x = np.random.random((10, 5)).astype("float64")
             len = fluid.data(name='length2', shape=[10], dtype='int64')
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
index 1f02562dcb4fb..b014a079b80e3 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_net.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import paddle.fluid as fluid
+
 fluid.core._set_eager_deletion_mode(-1, -1, False)
 
 import paddle.fluid.layers.ops as ops
@@ -22,6 +23,7 @@
 from seresnext_test_base import DeviceType
 import math
 import os
+
 os.environ['CPU_NUM'] = str(4)
 os.environ['FLAGS_cudnn_deterministic'] = str(1)
 
@@ -48,8 +50,8 @@ def squeeze_excitation(input, num_channels, reduction_ratio):
     #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
     conv = input
     shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    reshape = fluid.layers.reshape(x=conv,
+                                   shape=[-1, shape[1], shape[2] * shape[3]])
     pool = fluid.layers.reduce_mean(input=reshape, dim=2)
 
     squeeze = fluid.layers.fc(input=pool,
@@ -62,18 +64,21 @@ def squeeze_excitation(input, num_channels, reduction_ratio):
     return scale
 
 
-def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+def conv_bn_layer(input,
+                  num_filters,
+                  filter_size,
+                  stride=1,
+                  groups=1,
                   act=None):
-    conv = fluid.layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=stride,
-        padding=(filter_size - 1) // 2,
-        groups=groups,
-        act=None,
-        use_cudnn=(not remove_cudnn_conv),
-        bias_attr=False)
+    conv = fluid.layers.conv2d(input=input,
+                               num_filters=num_filters,
+                               filter_size=filter_size,
+                               stride=stride,
+                               padding=(filter_size - 1) // 2,
+                               groups=groups,
+                               act=None,
+                               use_cudnn=(not remove_cudnn_conv),
+                               bias_attr=False)
     return conv if remove_bn else fluid.layers.batch_norm(
         input=conv, act=act, momentum=0.1)
 
@@ -93,21 +98,23 @@ def shortcut(input, ch_out, stride):
 def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
     # The number of first 1x1 convolutional channels for each bottleneck build block
     # was halved to reduce the compution cost.
-    conv0 = conv_bn_layer(
-        input=input, num_filters=num_filters, filter_size=1, act='relu')
-    conv1 = conv_bn_layer(
-        input=conv0,
-        num_filters=num_filters * 2,
-        filter_size=3,
-        stride=stride,
-        groups=cardinality,
-        act='relu')
-    conv2 = conv_bn_layer(
-        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-    scale = squeeze_excitation(
-        input=conv2,
-        num_channels=num_filters * 2,
-        reduction_ratio=reduction_ratio)
+    conv0 = conv_bn_layer(input=input,
+                          num_filters=num_filters,
+                          filter_size=1,
+                          act='relu')
+    conv1 = conv_bn_layer(input=conv0,
+                          num_filters=num_filters * 2,
+                          filter_size=3,
+                          stride=stride,
+                          groups=cardinality,
+                          act='relu')
+    conv2 = conv_bn_layer(input=conv1,
+                          num_filters=num_filters * 2,
+                          filter_size=1,
+                          act=None)
+    scale = squeeze_excitation(input=conv2,
+                               num_channels=num_filters * 2,
+                               reduction_ratio=reduction_ratio)
 
     short = shortcut(input, num_filters * 2, stride)
 
@@ -122,14 +129,26 @@ def SE_ResNeXt50Small(use_feed):
     img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-    conv = conv_bn_layer(
-        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = fluid.layers.pool2d(
-        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+    conv = conv_bn_layer(input=img,
+                         num_filters=16,
+                         filter_size=3,
+                         stride=2,
+                         act='relu')
+    conv = conv_bn_layer(input=conv,
+                         num_filters=16,
+                         filter_size=3,
+                         stride=1,
+                         act='relu')
+    conv = conv_bn_layer(input=conv,
+                         num_filters=16,
+                         filter_size=3,
+                         stride=1,
+                         act='relu')
+    conv = fluid.layers.pool2d(input=conv,
+                               pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
 
     cardinality = 32
     reduction_ratio = 16
@@ -138,16 +157,15 @@ def SE_ResNeXt50Small(use_feed):
 
     for block in range(len(depth)):
         for i in range(depth[block]):
-            conv = bottleneck_block(
-                input=conv,
-                num_filters=num_filters[block],
-                stride=2 if i == 0 and block != 0 else 1,
-                cardinality=cardinality,
-                reduction_ratio=reduction_ratio)
+            conv = bottleneck_block(input=conv,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    cardinality=cardinality,
+                                    reduction_ratio=reduction_ratio)
 
     shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    reshape = fluid.layers.reshape(x=conv,
+                                   shape=[-1, shape[1], shape[2] * shape[3]])
     pool = fluid.layers.reduce_mean(input=reshape, dim=2)
     dropout = pool if remove_dropout else fluid.layers.dropout(
         x=pool, dropout_prob=0.2, seed=1)
@@ -160,8 +178,9 @@ def SE_ResNeXt50Small(use_feed):
 
 def optimizer(learning_rate=0.01):
     optimizer = fluid.optimizer.Momentum(
-        learning_rate=cosine_decay(
-            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
+        learning_rate=cosine_decay(learning_rate=learning_rate,
+                                   step_each_epoch=2,
+                                   epochs=1),
         momentum=0.9,
         regularization=fluid.regularizer.L2Decay(1e-4))
     return optimizer
@@ -187,10 +206,9 @@ def iter(use_device):
     batch_size=batch_size(use_device=DeviceType.CUDA),
     img_shape=img_shape,
     label_range=999)
-cpu_img, cpu_label = init_data(
-    batch_size=batch_size(use_device=DeviceType.CPU),
-    img_shape=img_shape,
-    label_range=999)
+cpu_img, cpu_label = init_data(batch_size=batch_size(use_device=DeviceType.CPU),
+                               img_shape=img_shape,
+                               label_range=999)
 feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
 feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
 
diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
index bf33adcf48655..f911352013145 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py
@@ -21,6 +21,7 @@
 
 
 class TestResnetBase(TestParallelExecutorBase):
+
     def _compare_result_with_origin_model(self,
                                           check_func,
                                           use_device,
@@ -51,7 +52,9 @@ def _compare_result_with_origin_model(self,
             for loss in zip(func_1_last_loss, func_2_last_loss):
                 self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
         else:
-            self.assertAlmostEquals(
-                np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
-            self.assertAlmostEquals(
-                np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
+            self.assertAlmostEquals(np.mean(func_1_first_loss),
+                                    func_2_first_loss[0],
+                                    delta=1e-5)
+            self.assertAlmostEquals(np.mean(func_1_last_loss),
+                                    func_2_last_loss[0],
+                                    delta=delta2)
diff --git a/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py b/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py
index 737677ccf90af..62cfa5453d4c4 100644
--- a/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py
+++ b/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py
@@ -29,5 +29,6 @@
 
 
 class DatasetSimnetReader(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
         pass
diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py
index 7f22df67d1b94..b9e38d21da831 100644
--- a/python/paddle/fluid/tests/unittests/simple_nets.py
+++ b/python/paddle/fluid/tests/unittests/simple_nets.py
@@ -23,8 +23,8 @@ def simple_fc_net_with_inputs(img, label, class_num=10):
             hidden,
             size=100,
             act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
     prediction = fluid.layers.fc(hidden, size=class_num, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     loss = fluid.layers.mean(loss)
@@ -44,8 +44,8 @@ def batchnorm_fc_with_inputs(img, label, class_num=10):
             hidden,
             size=200,
             act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
 
         hidden = fluid.layers.batch_norm(input=hidden)
 
@@ -73,11 +73,14 @@ def bow_net(use_feed,
     This model is from https://github.com/PaddlePaddle/models:
     fluid/PaddleNLP/text_classification/nets.py
     """
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
+    data = fluid.layers.data(name="words",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    emb = fluid.layers.embedding(input=data,
+                                 is_sparse=is_sparse,
+                                 size=[dict_dim, emb_dim])
     bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
     bow_tanh = fluid.layers.tanh(bow)
     fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py
index 11f8cd559d1a6..42d8f50ea3f77 100644
--- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py
+++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py
@@ -32,6 +32,7 @@ class SpawnAssistTestArgs(object):
 
 
 class TestDistSpawnRunner(unittest.TestCase):
+
     def setUp(self):
         # NOTE(chenweihang): keep consistent with
         # TestDistBase.check_with_place
@@ -43,11 +44,10 @@ def _run(self, model, args):
 
     def _run_parallel(self, model, args):
         args.update_method = "nccl2"
-        context = paddle.distributed.spawn(
-            func=model.run_trainer_with_spawn,
-            args=(args, ),
-            nprocs=self.nprocs,
-            join=True)
+        context = paddle.distributed.spawn(func=model.run_trainer_with_spawn,
+                                           args=(args, ),
+                                           nprocs=self.nprocs,
+                                           join=True)
         result_list = []
         for res_queue in context.return_queues:
             result_list.append(res_queue.get())
@@ -55,10 +55,10 @@ def _run_parallel(self, model, args):
 
     def check_dist_result_with_spawn(self, test_class, delta=1e-3):
         with _test_eager_guard():
-            self.check_dist_result_with_spawn_func(
-                test_class=test_class, delta=delta)
-        self.check_dist_result_with_spawn_func(
-            test_class=test_class, delta=delta)
+            self.check_dist_result_with_spawn_func(test_class=test_class,
+                                                   delta=delta)
+        self.check_dist_result_with_spawn_func(test_class=test_class,
+                                               delta=delta)
 
     def check_dist_result_with_spawn_func(self, test_class, delta=1e-3):
         # 0. prepare model and args
@@ -85,6 +85,7 @@ def check_dist_result_with_spawn_func(self, test_class, delta=1e-3):
                 loss,
                 dist_loss,
                 delta=delta,
-                msg="The results of single-card execution and multi-card execution are inconsistent."
+                msg=
+                "The results of single-card execution and multi-card execution are inconsistent."
                 "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n".
                 format(loss, dist_loss))
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py b/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py
index 6596eca4d3972..e1f8185e70415 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py
@@ -61,14 +61,13 @@ def create_model(data, rank):
         np_bias_part = np_bias[start_col:start_col + OUT_SIZE // 2]
 
         weight_attr, bias_attr = get_param_attr(np_weight_part, np_bias_part)
-        result = paddle.distributed.split(
-            data,
-            size=(IN_SIZE, OUT_SIZE),
-            operation='linear',
-            axis=1,
-            num_partitions=MODEL_PARALLEL_SIZE,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr)
+        result = paddle.distributed.split(data,
+                                          size=(IN_SIZE, OUT_SIZE),
+                                          operation='linear',
+                                          axis=1,
+                                          num_partitions=MODEL_PARALLEL_SIZE,
+                                          weight_attr=weight_attr,
+                                          bias_attr=bias_attr)
     else:
         weight_attr, bias_attr = get_param_attr(np_weight, np_bias)
         result = fluid.layers.fc(data,
@@ -81,10 +80,12 @@ def create_model(data, rank):
 
 
 class TestModelParallel(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
-        data_in = fluid.data(
-            name='data_in', shape=[batch_size, IN_SIZE], dtype=DTYPE)
+        data_in = fluid.data(name='data_in',
+                             shape=[batch_size, IN_SIZE],
+                             dtype=DTYPE)
 
         if dist_strategy:
             data_loader = fluid.io.DataLoader.from_generator(
@@ -104,8 +105,8 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         opt = fluid.optimizer.SGD(0.1)
 
         if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py b/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py
index fd886e16ced5f..26ed65ce5faae 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py
@@ -60,14 +60,13 @@ def create_model(data, rank):
         np_weight_part = np_weight[start_row:start_row + IN_SIZE // 2, :]
 
         weight_attr, bias_attr = get_param_attr(np_weight_part, np_bias)
-        result = paddle.distributed.split(
-            data,
-            size=(IN_SIZE, OUT_SIZE),
-            operation='linear',
-            axis=0,
-            num_partitions=MODEL_PARALLEL_SIZE,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr)
+        result = paddle.distributed.split(data,
+                                          size=(IN_SIZE, OUT_SIZE),
+                                          operation='linear',
+                                          axis=0,
+                                          num_partitions=MODEL_PARALLEL_SIZE,
+                                          weight_attr=weight_attr,
+                                          bias_attr=bias_attr)
     else:
         weight_attr, bias_attr = get_param_attr(np_weight, np_bias)
         result = fluid.layers.fc(
@@ -82,10 +81,12 @@ def create_model(data, rank):
 
 
 class TestModelParallel(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
-        data_in = fluid.data(
-            name='data_in', shape=[batch_size, IN_SIZE], dtype=DTYPE)
+        data_in = fluid.data(name='data_in',
+                             shape=[batch_size, IN_SIZE],
+                             dtype=DTYPE)
 
         if dist_strategy:
             data_loader = fluid.io.DataLoader.from_generator(
@@ -105,8 +106,8 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         opt = fluid.optimizer.SGD(0.1)
 
         if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py b/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py
index 4a98792f8a047..d72e61940f8b1 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py
@@ -55,27 +55,30 @@ def create_model(data, rank):
             operation='linear',
             axis=0,
             num_partitions=MODEL_PARALLEL_SIZE,
-            weight_attr=paddle.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    np_weight_part)),
-            bias_attr=False, )
+            weight_attr=paddle.ParamAttr(initializer=fluid.initializer.
+                                         NumpyArrayInitializer(np_weight_part)),
+            bias_attr=False,
+        )
     else:
         result = fluid.layers.fc(
             data,
             size=OUT_SIZE,
             param_attr=paddle.ParamAttr(
                 initializer=fluid.initializer.NumpyArrayInitializer(np_weight)),
-            bias_attr=False, )
+            bias_attr=False,
+        )
 
     predict = paddle.sum(result)
     return predict
 
 
 class TestModelParallel(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
-        data_in = fluid.data(
-            name='data_in', shape=[batch_size, IN_SIZE], dtype=DTYPE)
+        data_in = fluid.data(name='data_in',
+                             shape=[batch_size, IN_SIZE],
+                             dtype=DTYPE)
 
         if dist_strategy:
             data_loader = fluid.io.DataLoader.from_generator(
@@ -95,8 +98,8 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         opt = fluid.optimizer.SGD(0.1)
 
         if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
index 4dc3fe6eab6be..908af43e00825 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py
@@ -45,6 +45,7 @@ def _set_var_distributed(var):
 
 
 class ParallelFusedMultiHeadAttention(Layer):
+
     def __init__(self,
                  embed_dim,
                  num_heads,
@@ -106,11 +107,10 @@ def __init__(self,
             attr=linear_weight_attr,
             dtype=self._dtype,
             is_bias=False)
-        self.linear_bias = self.create_parameter(
-            shape=[embed_dim],
-            attr=linear_bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self.linear_bias = self.create_parameter(shape=[embed_dim],
+                                                 attr=linear_bias_attr,
+                                                 dtype=self._dtype,
+                                                 is_bias=True)
 
         # tensor model parallel
         if nranks > 1:
@@ -126,8 +126,9 @@ def __init__(self,
                 attr=pre_ln_scale_attr,
                 shape=[embed_dim],
                 default_initializer=Constant(value=1.0))
-            self.pre_ln_bias = self.create_parameter(
-                attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True)
+            self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr,
+                                                     shape=[embed_dim],
+                                                     is_bias=True)
             self.ln_scale = None
             self.ln_bias = None
         else:
@@ -137,8 +138,9 @@ def __init__(self,
                 attr=ln_scale_attr,
                 shape=[embed_dim],
                 default_initializer=Constant(value=1.0))
-            self.ln_bias = self.create_parameter(
-                attr=ln_bias_attr, shape=[embed_dim], is_bias=True)
+            self.ln_bias = self.create_parameter(attr=ln_bias_attr,
+                                                 shape=[embed_dim],
+                                                 is_bias=True)
 
         self.dropout_rate = dropout_rate
         self.attn_dropout_rate = attn_dropout_rate
@@ -187,11 +189,11 @@ def create_model(data, rank):
     np.random.seed(2021)
     pre_ln_w = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
     pre_ln_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
-    qkv_w = np.random.uniform(
-        -1, 1, size=(3, n_head, d_key, hidden)).astype(DTYPE)
+    qkv_w = np.random.uniform(-1, 1,
+                              size=(3, n_head, d_key, hidden)).astype(DTYPE)
     qkv_b = np.random.uniform(-1, 1, size=(3, n_head, d_key)).astype(DTYPE)
-    linear_w = np.random.uniform(
-        -1, 1, size=(n_head * d_key, hidden)).astype(DTYPE)
+    linear_w = np.random.uniform(-1, 1,
+                                 size=(n_head * d_key, hidden)).astype(DTYPE)
     linear_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
 
     data.stop_gradient = False
@@ -206,42 +208,40 @@ def create_model(data, rank):
         qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b)
         linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b)
 
-        attn = ParallelFusedMultiHeadAttention(
-            hidden,
-            n_head,
-            dropout_rate=0.0,
-            attn_dropout_rate=0.0,
-            normalize_before=False,
-            qkv_weight_attr=qkv_w_attr,
-            qkv_bias_attr=qkv_b_attr,
-            linear_weight_attr=linear_w_attr,
-            linear_bias_attr=linear_b_attr,
-            pre_ln_scale_attr=pre_ln_w_attr,
-            pre_ln_bias_attr=pre_ln_b_attr,
-            ln_scale_attr=pre_ln_w_attr,
-            ln_bias_attr=pre_ln_b_attr,
-            nranks=MODEL_PARALLEL_SIZE,
-            ring_id=0)
+        attn = ParallelFusedMultiHeadAttention(hidden,
+                                               n_head,
+                                               dropout_rate=0.0,
+                                               attn_dropout_rate=0.0,
+                                               normalize_before=False,
+                                               qkv_weight_attr=qkv_w_attr,
+                                               qkv_bias_attr=qkv_b_attr,
+                                               linear_weight_attr=linear_w_attr,
+                                               linear_bias_attr=linear_b_attr,
+                                               pre_ln_scale_attr=pre_ln_w_attr,
+                                               pre_ln_bias_attr=pre_ln_b_attr,
+                                               ln_scale_attr=pre_ln_w_attr,
+                                               ln_bias_attr=pre_ln_b_attr,
+                                               nranks=MODEL_PARALLEL_SIZE,
+                                               ring_id=0)
         result = attn(data)
     else:
         pre_ln_w_attr, pre_ln_b_attr = get_param_attr(pre_ln_w, pre_ln_b)
         qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b)
         linear_w_attr, linear_b_attr = get_param_attr(linear_w, linear_b)
 
-        attn = ParallelFusedMultiHeadAttention(
-            hidden,
-            n_head,
-            dropout_rate=0.0,
-            attn_dropout_rate=0.0,
-            normalize_before=False,
-            qkv_weight_attr=qkv_w_attr,
-            qkv_bias_attr=qkv_b_attr,
-            linear_weight_attr=linear_w_attr,
-            linear_bias_attr=linear_b_attr,
-            pre_ln_scale_attr=pre_ln_w_attr,
-            pre_ln_bias_attr=pre_ln_b_attr,
-            ln_scale_attr=pre_ln_w_attr,
-            ln_bias_attr=pre_ln_b_attr)
+        attn = ParallelFusedMultiHeadAttention(hidden,
+                                               n_head,
+                                               dropout_rate=0.0,
+                                               attn_dropout_rate=0.0,
+                                               normalize_before=False,
+                                               qkv_weight_attr=qkv_w_attr,
+                                               qkv_bias_attr=qkv_b_attr,
+                                               linear_weight_attr=linear_w_attr,
+                                               linear_bias_attr=linear_b_attr,
+                                               pre_ln_scale_attr=pre_ln_w_attr,
+                                               pre_ln_bias_attr=pre_ln_b_attr,
+                                               ln_scale_attr=pre_ln_w_attr,
+                                               ln_bias_attr=pre_ln_b_attr)
         result = attn(data)
 
     predict = paddle.sum(result)
@@ -249,11 +249,13 @@ def create_model(data, rank):
 
 
 class TestModelParallel(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         seq_len = 2
-        data_in = fluid.data(
-            name='data_in', shape=[batch_size, seq_len, hidden], dtype=DTYPE)
+        data_in = fluid.data(name='data_in',
+                             shape=[batch_size, seq_len, hidden],
+                             dtype=DTYPE)
 
         if dist_strategy:
             data_loader = fluid.io.DataLoader.from_generator(
@@ -273,8 +275,8 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         opt = fluid.optimizer.SGD(0.1)
 
         if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
index ad570fc0acfb3..a5af3cd877c53 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
@@ -56,7 +56,8 @@ def fused_feedforward(x,
     seed = None
     if mode not in ('downscale_in_infer', 'upscale_in_train'):
         raise ValueError(
-            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
+        )
     mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
 
     helper = LayerHelper("fused_feedforward")
@@ -71,69 +72,68 @@ def fused_feedforward(x,
         'uint8', stop_gradient=True)
     dropout2_mask = helper.create_variable_for_type_inference(
         'uint8', stop_gradient=True)
-    ln1_mean = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln1_variance = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln2_mean = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln2_variance = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    linear1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    dropout1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    dropout2_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
+    ln1_mean = helper.create_variable_for_type_inference(x.dtype,
+                                                         stop_gradient=True)
+    ln1_variance = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
+    ln2_mean = helper.create_variable_for_type_inference(x.dtype,
+                                                         stop_gradient=True)
+    ln2_variance = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
+    linear1_out = helper.create_variable_for_type_inference(x.dtype,
+                                                            stop_gradient=True)
+    ln1_out = helper.create_variable_for_type_inference(x.dtype,
+                                                        stop_gradient=True)
+    dropout1_out = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
+    dropout2_out = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
 
     if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
         seed = helper.main_program.random_seed
 
-    helper.append_op(
-        type='fused_feedforward',
-        inputs={
-            'X': x,
-            'Linear1Weight': linear1_weight,
-            'Linear1Bias': linear1_bias,
-            'Linear2Weight': linear2_weight,
-            'Linear2Bias': linear2_bias,
-            'Ln1Scale': ln1_scale,
-            'Ln1Bias': ln1_bias,
-            'Ln2Scale': ln2_scale,
-            'Ln2Bias': ln2_bias,
-        },
-        outputs={
-            'Out': out,
-            'Dropout1Mask': dropout1_mask,
-            'Dropout2Mask': dropout2_mask,
-            'Ln1Mean': ln1_mean,
-            'Ln1Variance': ln1_variance,
-            'Ln2Mean': ln2_mean,
-            'Ln2Variance': ln2_variance,
-            'Linear1Out': linear1_out,
-            'Ln1Out': ln1_out,
-            'Dropout1Out': dropout1_out,
-            'Dropout2Out': dropout2_out,
-        },
-        attrs={
-            'dropout1_rate': dropout1_rate,
-            'dropout2_rate': dropout2_rate,
-            'act_method': activation,
-            'pre_layer_norm': pre_layer_norm,
-            'ln1_epsilon': ln1_epsilon,
-            'ln2_epsilon': ln2_epsilon,
-            'dropout1_is_test': not training,
-            'dropout2_is_test': not training,
-            'dropout1_fix_seed': seed is not None,
-            'dropout2_fix_seed': seed is not None,
-            'dropout1_seed': seed if seed is not None else 0,
-            'dropout2_seed': seed if seed is not None else 0,
-            'dropout1_implementation': mode,
-            'dropout2_implementation': mode,
-            'ring_id': ring_id,
-        })
+    helper.append_op(type='fused_feedforward',
+                     inputs={
+                         'X': x,
+                         'Linear1Weight': linear1_weight,
+                         'Linear1Bias': linear1_bias,
+                         'Linear2Weight': linear2_weight,
+                         'Linear2Bias': linear2_bias,
+                         'Ln1Scale': ln1_scale,
+                         'Ln1Bias': ln1_bias,
+                         'Ln2Scale': ln2_scale,
+                         'Ln2Bias': ln2_bias,
+                     },
+                     outputs={
+                         'Out': out,
+                         'Dropout1Mask': dropout1_mask,
+                         'Dropout2Mask': dropout2_mask,
+                         'Ln1Mean': ln1_mean,
+                         'Ln1Variance': ln1_variance,
+                         'Ln2Mean': ln2_mean,
+                         'Ln2Variance': ln2_variance,
+                         'Linear1Out': linear1_out,
+                         'Ln1Out': ln1_out,
+                         'Dropout1Out': dropout1_out,
+                         'Dropout2Out': dropout2_out,
+                     },
+                     attrs={
+                         'dropout1_rate': dropout1_rate,
+                         'dropout2_rate': dropout2_rate,
+                         'act_method': activation,
+                         'pre_layer_norm': pre_layer_norm,
+                         'ln1_epsilon': ln1_epsilon,
+                         'ln2_epsilon': ln2_epsilon,
+                         'dropout1_is_test': not training,
+                         'dropout2_is_test': not training,
+                         'dropout1_fix_seed': seed is not None,
+                         'dropout2_fix_seed': seed is not None,
+                         'dropout1_seed': seed if seed is not None else 0,
+                         'dropout2_seed': seed if seed is not None else 0,
+                         'dropout1_implementation': mode,
+                         'dropout2_implementation': mode,
+                         'ring_id': ring_id,
+                     })
     return out
 
 
@@ -151,6 +151,7 @@ def _set_var_distributed(var):
 
 
 class ParallelFusedFeedForward(Layer):
+
     def __init__(self,
                  d_model,
                  dim_feedforward,
@@ -196,11 +197,10 @@ def __init__(self,
             attr=linear1_weight_attr,
             dtype=self._dtype,
             is_bias=False)
-        self._linear1_bias = self.create_parameter(
-            shape=[dim_feedforward],
-            attr=linear1_bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self._linear1_bias = self.create_parameter(shape=[dim_feedforward],
+                                                   attr=linear1_bias_attr,
+                                                   dtype=self._dtype,
+                                                   is_bias=True)
 
         self._linear2_weight = self.create_parameter(
             shape=[dim_feedforward, d_model],
@@ -208,11 +208,10 @@ def __init__(self,
             dtype=self._dtype,
             is_bias=False)
 
-        self._linear2_bias = self.create_parameter(
-            shape=[d_model],
-            attr=linear2_bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self._linear2_bias = self.create_parameter(shape=[d_model],
+                                                   attr=linear2_bias_attr,
+                                                   dtype=self._dtype,
+                                                   is_bias=True)
 
         if nranks > 1:
             assert ring_id != -1
@@ -227,8 +226,9 @@ def __init__(self,
                 attr=ln1_scale_attr,
                 is_bias=False,
                 default_initializer=Constant(1.0))
-            self._ln1_bias = self.create_parameter(
-                shape=[d_model], attr=ln1_bias_attr, is_bias=True)
+            self._ln1_bias = self.create_parameter(shape=[d_model],
+                                                   attr=ln1_bias_attr,
+                                                   is_bias=True)
             self._ln2_scale = None
             self._ln2_bias = None
         else:
@@ -239,31 +239,31 @@ def __init__(self,
                 attr=ln2_scale_attr,
                 is_bias=False,
                 default_initializer=Constant(1.0))
-            self._ln2_bias = self.create_parameter(
-                shape=[d_model], attr=ln2_bias_attr, is_bias=True)
+            self._ln2_bias = self.create_parameter(shape=[d_model],
+                                                   attr=ln2_bias_attr,
+                                                   is_bias=True)
 
         self.name = name
 
     def forward(self, src, cache=None):
-        out = fused_feedforward(
-            src,
-            self._linear1_weight,
-            self._linear2_weight,
-            self._linear1_bias,
-            self._linear2_bias,
-            self._ln1_scale,
-            self._ln1_bias,
-            self._ln2_scale,
-            self._ln2_bias,
-            dropout1_rate=self._act_dropout_rate,
-            dropout2_rate=self._dropout_rate,
-            activation=self._act_method,
-            ln1_epsilon=self._epsilon,
-            ln2_epsilon=self._epsilon,
-            pre_layer_norm=self._normalize_before,
-            training=self.training,
-            ring_id=self._ring_id,
-            name=self.name)
+        out = fused_feedforward(src,
+                                self._linear1_weight,
+                                self._linear2_weight,
+                                self._linear1_bias,
+                                self._linear2_bias,
+                                self._ln1_scale,
+                                self._ln1_bias,
+                                self._ln2_scale,
+                                self._ln2_bias,
+                                dropout1_rate=self._act_dropout_rate,
+                                dropout2_rate=self._dropout_rate,
+                                activation=self._act_method,
+                                ln1_epsilon=self._epsilon,
+                                ln2_epsilon=self._epsilon,
+                                pre_layer_norm=self._normalize_before,
+                                training=self.training,
+                                ring_id=self._ring_id,
+                                name=self.name)
         return out
 
 
@@ -295,20 +295,19 @@ def create_model(data, rank):
         w0_attr, b0_attr = get_param_attr(col_w0, col_b0)
         w1_attr, b1_attr = get_param_attr(row_w1, b1)
 
-        ffn = ParallelFusedFeedForward(
-            IN_SIZE,
-            OUT_SIZE,
-            dropout_rate=0.0,
-            activation='gelu',
-            normalize_before=True,
-            linear1_weight_attr=w0_attr,
-            linear1_bias_attr=b0_attr,
-            linear2_weight_attr=w1_attr,
-            linear2_bias_attr=b1_attr,
-            ln1_scale_attr=ln_w_attr,
-            ln1_bias_attr=ln_b_attr,
-            nranks=MODEL_PARALLEL_SIZE,
-            ring_id=0)
+        ffn = ParallelFusedFeedForward(IN_SIZE,
+                                       OUT_SIZE,
+                                       dropout_rate=0.0,
+                                       activation='gelu',
+                                       normalize_before=True,
+                                       linear1_weight_attr=w0_attr,
+                                       linear1_bias_attr=b0_attr,
+                                       linear2_weight_attr=w1_attr,
+                                       linear2_bias_attr=b1_attr,
+                                       ln1_scale_attr=ln_w_attr,
+                                       ln1_bias_attr=ln_b_attr,
+                                       nranks=MODEL_PARALLEL_SIZE,
+                                       ring_id=0)
         #ffn.eval()
         result = ffn(data)
     else:
@@ -316,18 +315,17 @@ def create_model(data, rank):
         w0_attr, b0_attr = get_param_attr(w0, b0)
         w1_attr, b1_attr = get_param_attr(w1, b1)
 
-        ffn = ParallelFusedFeedForward(
-            IN_SIZE,
-            OUT_SIZE,
-            dropout_rate=0.0,
-            activation='gelu',
-            normalize_before=True,
-            linear1_weight_attr=w0_attr,
-            linear1_bias_attr=b0_attr,
-            linear2_weight_attr=w1_attr,
-            linear2_bias_attr=b1_attr,
-            ln1_scale_attr=ln_w_attr,
-            ln1_bias_attr=ln_b_attr)
+        ffn = ParallelFusedFeedForward(IN_SIZE,
+                                       OUT_SIZE,
+                                       dropout_rate=0.0,
+                                       activation='gelu',
+                                       normalize_before=True,
+                                       linear1_weight_attr=w0_attr,
+                                       linear1_bias_attr=b0_attr,
+                                       linear2_weight_attr=w1_attr,
+                                       linear2_bias_attr=b1_attr,
+                                       ln1_scale_attr=ln_w_attr,
+                                       ln1_bias_attr=ln_b_attr)
         #ffn.eval()
         result = ffn(data)
 
@@ -336,11 +334,13 @@ def create_model(data, rank):
 
 
 class TestModelParallel(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         seq_len = 2
-        data_in = fluid.data(
-            name='data_in', shape=[batch_size, seq_len, IN_SIZE], dtype=DTYPE)
+        data_in = fluid.data(name='data_in',
+                             shape=[batch_size, seq_len, IN_SIZE],
+                             dtype=DTYPE)
 
         if dist_strategy:
             data_loader = fluid.io.DataLoader.from_generator(
@@ -360,8 +360,8 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         opt = fluid.optimizer.SGD(0.1)
 
         if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py
index f9c5d4d78c866..5387580f2cdc6 100644
--- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py
@@ -51,11 +51,11 @@ def create_model(data, rank):
     np.random.seed(2021)
     ln_w = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
     ln_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
-    qkv_w = np.random.uniform(
-        -1, 1, size=(3, num_head, dim_head, hidden)).astype(DTYPE)
+    qkv_w = np.random.uniform(-1, 1, size=(3, num_head, dim_head,
+                                           hidden)).astype(DTYPE)
     qkv_b = np.random.uniform(-1, 1, size=(3, num_head, dim_head)).astype(DTYPE)
-    linear_w = np.random.uniform(
-        -1, 1, size=(num_head * dim_head, hidden)).astype(DTYPE)
+    linear_w = np.random.uniform(-1, 1, size=(num_head * dim_head,
+                                              hidden)).astype(DTYPE)
     linear_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
 
     ffn_ln_w = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE)
@@ -145,11 +145,13 @@ def create_model(data, rank):
 
 
 class TestModelParallel(TestDistRunnerBase):
+
     def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         # Input data
         seq_len = 2
-        data_in = fluid.data(
-            name='data_in', shape=[batch_size, seq_len, hidden], dtype=DTYPE)
+        data_in = fluid.data(name='data_in',
+                             shape=[batch_size, seq_len, hidden],
+                             dtype=DTYPE)
 
         if dist_strategy:
             data_loader = fluid.io.DataLoader.from_generator(
@@ -169,8 +171,8 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
         opt = fluid.optimizer.SGD(0.1)
 
         if dist_strategy:
-            dist_opt = fleet.distributed_optimizer(
-                optimizer=opt, strategy=strategy)
+            dist_opt = fleet.distributed_optimizer(optimizer=opt,
+                                                   strategy=strategy)
             dist_opt.minimize(avg_cost)
         else:
             opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_Tensor_type.py b/python/paddle/fluid/tests/unittests/test_Tensor_type.py
index c40981c073724..176fceb310d5a 100644
--- a/python/paddle/fluid/tests/unittests/test_Tensor_type.py
+++ b/python/paddle/fluid/tests/unittests/test_Tensor_type.py
@@ -22,6 +22,7 @@
 
 
 class TensorTypeTest(unittest.TestCase):
+
     def func_type_totensor(self):
         paddle.disable_static()
         inx = np.array([1, 2])
diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
index 10ab76e4bfb15..a03f4fa088af8 100755
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
@@ -23,6 +23,7 @@
 
 
 class TestAccuracyOp(OpTest):
+
     def setUp(self):
         self.op_type = "accuracy"
         self.dtype = np.float32
@@ -52,6 +53,7 @@ def test_check_output(self):
 
 
 class TestAccuracyOpFp16(TestAccuracyOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -60,13 +62,15 @@ def test_check_output(self):
 
 
 class TestAccuracyOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of accuracy_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            label = fluid.layers.data(
-                name='label', shape=[-1, 1], dtype="int32")
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            label = fluid.layers.data(name='label',
+                                      shape=[-1, 1],
+                                      dtype="int32")
             self.assertRaises(TypeError, fluid.layers.accuracy, x1, label)
             self.assertRaises(TypeError, paddle.metric.accuracy, x1, label)
             # The input dtype of accuracy_op must be float32 or float64.
@@ -79,13 +83,17 @@ def test_errors(self):
 
 
 class TestAccuracyAPI1(unittest.TestCase):
+
     def setUp(self):
-        self.predictions = paddle.static.data(
-            shape=[2, 5], name="predictions", dtype="float32")
-        self.label = paddle.static.data(
-            shape=[2, 1], name="labels", dtype="int64")
-        self.result = paddle.static.accuracy(
-            input=self.predictions, label=self.label, k=1)
+        self.predictions = paddle.static.data(shape=[2, 5],
+                                              name="predictions",
+                                              dtype="float32")
+        self.label = paddle.static.data(shape=[2, 1],
+                                        name="labels",
+                                        dtype="int64")
+        self.result = paddle.static.accuracy(input=self.predictions,
+                                             label=self.label,
+                                             k=1)
         self.input_predictions = np.array(
             [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]],
             dtype="float32")
@@ -103,6 +111,7 @@ def test_api(self):
 
 
 class TestAccuracyAPI2(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             predictions = paddle.to_tensor(
@@ -115,6 +124,7 @@ def test_api(self):
 
 
 class TestAccuracyAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             predictions = paddle.to_tensor(
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index 919ae52447128..b512aef4f9314 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -29,6 +29,7 @@
 
 
 class TestSigmoidTripleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 7, 9]
@@ -39,8 +40,11 @@ def func(self, place):
         y = layers.sigmoid(x)
         x_arr = np.random.random(shape).astype(dtype)
         x_arr[np.abs(x_arr) < 0.005] = 0.002
-        gradient_checker.triple_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.triple_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -52,6 +56,7 @@ def test_grad(self):
 
 
 class TestSigmoidDoubleGradCheck(unittest.TestCase):
+
     def sigmoid_wrapper(self, x):
         return fluid.layers.sigmoid(x[0])
 
@@ -67,8 +72,10 @@ def func(self, place):
         x_arr[np.abs(x_arr) < 0.005] = 0.002
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.double_grad_check_for_dygraph(
             self.sigmoid_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -80,6 +87,7 @@ def test_grad(self):
 
 
 class TestTanhTripleGradCheck(unittest.TestCase):
+
     def tanh_wrapper(self, x):
         return paddle.tanh(x[0])
 
@@ -95,8 +103,10 @@ def func(self, place):
         x_arr[np.abs(x_arr) < 0.005] = 0.002
         gradient_checker.triple_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.triple_grad_check_for_dygraph(
             self.tanh_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -108,6 +118,7 @@ def test_grad(self):
 
 
 class TestTanhDoubleGradCheck(unittest.TestCase):
+
     def tanh_wrapper(self, x):
         return paddle.tanh(x[0])
 
@@ -123,8 +134,10 @@ def func(self, place):
         x_arr[np.abs(x_arr) < 0.005] = 0.002
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.double_grad_check_for_dygraph(
             self.tanh_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -136,6 +149,7 @@ def test_grad(self):
 
 
 class TestAbsDoubleGradCheck(unittest.TestCase):
+
     def abs_wrapper(self, x):
         return paddle.abs(x[0])
 
@@ -151,8 +165,10 @@ def func(self, place):
         x_arr[np.abs(x_arr) < 0.005] = 0.002
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.double_grad_check_for_dygraph(
             self.abs_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -164,6 +180,7 @@ def test_grad(self):
 
 
 class TestReluDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 7, 9]
@@ -176,8 +193,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         x_arr[np.abs(x_arr) < 0.005] = 0.02
 
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -189,6 +209,7 @@ def test_grad(self):
 
 
 class TestLeakyReluDoubleGradCheck(unittest.TestCase):
+
     def leaky_relu_wrapper(self, x):
         return paddle.nn.functional.leaky_relu(x[0], negative_slope=0.2)
 
@@ -206,10 +227,16 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         x_arr[np.abs(x_arr) < 0.005] = 0.02
 
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.leaky_relu_wrapper, [x], y, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.leaky_relu_wrapper,
+                                                       [x],
+                                                       y,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         paddle.enable_static()
@@ -221,6 +248,7 @@ def test_grad(self):
 
 
 class TestELUDoubleGradCheck(unittest.TestCase):
+
     def elu_wrapper(self, x):
         return paddle.nn.functional.elu(x[0], alpha=0.2)
 
@@ -240,8 +268,10 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.double_grad_check_for_dygraph(
             self.elu_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -253,6 +283,7 @@ def test_grad(self):
 
 
 class TestCELUDoubleGradCheck(unittest.TestCase):
+
     def celu_wrapper(self, x):
         return paddle.nn.functional.celu(x[0], alpha=0.2)
 
@@ -272,8 +303,10 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.double_grad_check_for_dygraph(
             self.celu_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -285,6 +318,7 @@ def test_grad(self):
 
 
 class TestSqrtDoubleGradCheck(unittest.TestCase):
+
     def sqrt_wrapper(self, x):
         return paddle.sqrt(x[0])
 
@@ -300,10 +334,15 @@ def func(self, place):
         y = layers.sqrt(x)
         x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.sqrt_wrapper, [x], y, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.sqrt_wrapper, [x],
+                                                       y,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         paddle.enable_static()
@@ -315,6 +354,7 @@ def test_grad(self):
 
 
 class TestRsqrtDoubleGradCheck(unittest.TestCase):
+
     def rsqrt_wrapper(self, x):
         return paddle.rsqrt(x[0])
 
@@ -330,10 +370,15 @@ def func(self, place):
         y = layers.rsqrt(x)
         x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.rsqrt_wrapper, [x], y, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.rsqrt_wrapper, [x],
+                                                       y,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         paddle.enable_static()
@@ -345,6 +390,7 @@ def test_grad(self):
 
 
 class TestSquareDoubleGradCheck(unittest.TestCase):
+
     def square_wrapper(self, x):
         return paddle.square(x[0])
 
@@ -362,8 +408,10 @@ def func(self, place):
 
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.double_grad_check_for_dygraph(
             self.square_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -375,6 +423,7 @@ def test_grad(self):
 
 
 class TestAbsDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -391,8 +440,11 @@ def func(self, place):
         # we should avoid this
         x_arr[np.abs(x_arr) < 0.005] = 0.02
 
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -404,6 +456,7 @@ def test_grad(self):
 
 
 class TestLogDoubleGradCheck(unittest.TestCase):
+
     def log_wrapper(self, x):
         return paddle.log(x[0])
 
@@ -421,8 +474,10 @@ def func(self, place):
 
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.double_grad_check_for_dygraph(
             self.log_wrapper, [x], y, x_init=x_arr, place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 7be3b300d55a1..7dde0483823be 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -31,22 +31,26 @@
 
 
 class TestSqrtOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of sqrt op must be Variable or numpy.ndarray.
             in1 = 1
             self.assertRaises(TypeError, fluid.layers.sqrt, in1)
             # The input dtype of sqrt op must be float16, float32, float64.
-            in2 = fluid.layers.data(
-                name='input2', shape=[12, 10], dtype="int32")
+            in2 = fluid.layers.data(name='input2',
+                                    shape=[12, 10],
+                                    dtype="int32")
             self.assertRaises(TypeError, fluid.layers.sqrt, in2)
 
-            in3 = fluid.layers.data(
-                name='input3', shape=[12, 10], dtype="float16")
+            in3 = fluid.layers.data(name='input3',
+                                    shape=[12, 10],
+                                    dtype="float16")
             fluid.layers.sqrt(x=in3)
 
 
 class TestActivation(OpTest):
+
     def setUp(self):
         self.op_type = "exp"
         self.init_dtype()
@@ -83,6 +87,7 @@ def init_kernel_type(self):
 
 
 class TestExpm1(TestActivation):
+
     def setUp(self):
         self.op_type = "expm1"
         self.python_api = paddle.expm1
@@ -103,6 +108,7 @@ def test_check_output(self):
 
 
 class TestExpm1API(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = 'float64'
         self.shape = [11, 17]
@@ -132,6 +138,7 @@ def run(place):
             run(place)
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             X = paddle.to_tensor(self.x)
@@ -151,6 +158,7 @@ def test_errors(self):
 
 
 class TestParameter(object):
+
     def test_out_name(self):
         with fluid.program_guard(fluid.Program()):
             np_x = np.array([0.1])
@@ -176,6 +184,7 @@ def test_dygraph(self):
 
 
 class TestSigmoid(TestActivation):
+
     def setUp(self):
         self.op_type = "sigmoid"
         self.init_dtype()
@@ -199,6 +208,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSigmoidBF16(OpTest):
+
     def setUp(self):
         self.op_type = "sigmoid"
         self.init_dtype()
@@ -225,6 +235,7 @@ def test_check_grad(self):
 
 
 class TestSilu(TestActivation):
+
     def setUp(self):
         self.op_type = "silu"
         self.init_dtype()
@@ -281,16 +292,19 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.silu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[11, 17], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[11, 17],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.silu, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[11, 17], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[11, 17],
+                                       dtype='float16')
             F.silu(x_fp16)
 
 
 class TestLogSigmoid(TestActivation):
+
     def setUp(self):
         self.op_type = "logsigmoid"
         self.init_dtype()
@@ -356,16 +370,19 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.log_sigmoid, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[11, 17], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[11, 17],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.log_sigmoid, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[11, 17], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[11, 17],
+                                       dtype='float16')
             F.log_sigmoid(x_fp16)
 
 
 class TestTanh(TestActivation, TestParameter):
+
     def setUp(self):
         self.op_type = "tanh"
         self.init_dtype()
@@ -442,12 +459,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, self.tanh, 1)
             # The input dtype must be float16, float32.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, self.tanh, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             self.tanh(x_fp16)
 
 
@@ -458,6 +477,7 @@ def executed_api(self):
 
 
 class TestAtan(TestActivation, TestParameter):
+
     def setUp(self):
         self.op_type = "atan"
         self.init_dtype()
@@ -495,6 +515,7 @@ def test_dygraph(self):
 
 
 class TestSinh(TestActivation):
+
     def setUp(self):
         self.op_type = "sinh"
         self.init_dtype()
@@ -524,11 +545,10 @@ def test_api(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input_x = np.random.uniform(0.1, 1,
                                         test_data_shape).astype("float32")
-            data_x = fluid.layers.data(
-                name="data_x",
-                shape=test_data_shape,
-                append_batch_size=False,
-                dtype="float32")
+            data_x = fluid.layers.data(name="data_x",
+                                       shape=test_data_shape,
+                                       append_batch_size=False,
+                                       dtype="float32")
 
             pd_sinh_out = fluid.layers.sinh(data_x)
             exe = fluid.Executor(place=fluid.CPUPlace())
@@ -554,6 +574,7 @@ def test_backward(self):
 
 
 class TestSinhOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program()):
             # The input type must be Variable.
@@ -567,6 +588,7 @@ def test_errors(self):
 
 
 class TestCosh(TestActivation):
+
     def setUp(self):
         self.op_type = "cosh"
         self.init_dtype()
@@ -596,11 +618,10 @@ def test_api(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input_x = np.random.uniform(0.1, 1,
                                         test_data_shape).astype("float32")
-            data_x = fluid.layers.data(
-                name="data_x",
-                shape=test_data_shape,
-                append_batch_size=False,
-                dtype="float32")
+            data_x = fluid.layers.data(name="data_x",
+                                       shape=test_data_shape,
+                                       append_batch_size=False,
+                                       dtype="float32")
 
             pd_cosh_out = paddle.cosh(data_x)
             exe = fluid.Executor(place=fluid.CPUPlace())
@@ -626,6 +647,7 @@ def test_backward(self):
 
 
 class TestCoshOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program()):
             # The input type must be Variable.
@@ -644,6 +666,7 @@ def ref_tanhshrink(x):
 
 
 class TestTanhshrink(TestActivation):
+
     def setUp(self):
         self.op_type = "tanh_shrink"
         self.init_dtype()
@@ -709,12 +732,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.tanhshrink, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.tanhshrink, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.tanhshrink(x_fp16)
 
 
@@ -725,6 +750,7 @@ def ref_hardshrink(x, threshold):
 
 
 class TestHardShrink(TestActivation):
+
     def setUp(self):
         self.op_type = "hard_shrink"
         self.init_dtype()
@@ -749,6 +775,7 @@ def test_check_grad(self):
 
 
 class TestHardShrink_threshold_negative(TestHardShrink):
+
     def set_attrs(self):
         self.threshold = -0.1
 
@@ -808,12 +835,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.hardshrink, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.hardshrink, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.hardshrink(x_fp16)
 
 
@@ -870,12 +899,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.hardtanh, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.hardtanh, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.hardtanh(x_fp16)
 
 
@@ -887,6 +918,7 @@ def ref_softshrink(x, threshold=0.5):
 
 
 class TestSoftshrink(TestActivation):
+
     def setUp(self):
         self.op_type = "softshrink"
         self.check_eager = True
@@ -957,20 +989,24 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.softshrink, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.softshrink, x_int32)
             # The threshold must be no less than zero
-            x_fp32 = paddle.fluid.data(
-                name='x_fp32', shape=[12, 10], dtype='float32')
+            x_fp32 = paddle.fluid.data(name='x_fp32',
+                                       shape=[12, 10],
+                                       dtype='float32')
             self.assertRaises(ValueError, F.softshrink, x_fp32, -1.0)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.softshrink(x_fp16)
 
 
 class TestSqrt(TestActivation, TestParameter):
+
     def setUp(self):
         self.op_type = "sqrt"
         self.python_api = paddle.sqrt
@@ -995,6 +1031,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSqrtBF16(OpTest):
+
     def setUp(self):
         self.op_type = "sqrt"
         self.python_api = paddle.sqrt
@@ -1022,6 +1059,7 @@ def test_check_grad(self):
 
 
 class TestRsqrt(TestActivation):
+
     def setUp(self):
         self.op_type = "rsqrt"
         self.python_api = paddle.rsqrt
@@ -1037,11 +1075,14 @@ def setUp(self):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.0005, check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.0005,
+                        check_eager=True)
 
 
 class TestAbs(TestActivation):
+
     def setUp(self):
         self.op_type = "abs"
         self.init_dtype()
@@ -1065,6 +1106,7 @@ def test_check_grad(self):
 
 
 class TestCeil(TestActivation):
+
     def setUp(self):
         self.op_type = "ceil"
         self.check_eager = True
@@ -1084,6 +1126,7 @@ def test_check_grad(self):
 
 
 class TestFloor(TestActivation):
+
     def setUp(self):
         self.op_type = "floor"
         self.check_eager = True
@@ -1105,6 +1148,7 @@ def test_check_grad(self):
 
 
 class TestCos(TestActivation):
+
     def setUp(self):
         self.op_type = "cos"
         self.init_dtype()
@@ -1123,6 +1167,7 @@ def test_check_grad(self):
 
 
 class TestTan(TestActivation):
+
     def setUp(self):
         np.random.seed(1024)
         self.op_type = "tan"
@@ -1174,6 +1219,7 @@ def test_backward(self):
 
 
 class TestAcos(TestActivation):
+
     def setUp(self):
         self.op_type = "acos"
         self.init_dtype()
@@ -1192,6 +1238,7 @@ def test_check_grad(self):
 
 
 class TestSin(TestActivation, TestParameter):
+
     def setUp(self):
         self.op_type = "sin"
         self.init_dtype()
@@ -1210,6 +1257,7 @@ def test_check_grad(self):
 
 
 class TestAsin(TestActivation):
+
     def setUp(self):
         self.op_type = "asin"
         self.init_dtype()
@@ -1228,6 +1276,7 @@ def test_check_grad(self):
 
 
 class TestAcosh(TestActivation):
+
     def setUp(self):
         self.op_type = "acosh"
         self.init_dtype()
@@ -1246,6 +1295,7 @@ def test_check_grad(self):
 
 
 class TestAsinh(TestActivation):
+
     def setUp(self):
         self.op_type = "asinh"
         self.init_dtype()
@@ -1264,6 +1314,7 @@ def test_check_grad(self):
 
 
 class TestAtanh(TestActivation):
+
     def setUp(self):
         self.op_type = "atanh"
         self.init_dtype()
@@ -1282,6 +1333,7 @@ def test_check_grad(self):
 
 
 class TestRound(TestActivation):
+
     def setUp(self):
         self.op_type = "round"
         self.check_eager = True
@@ -1300,6 +1352,7 @@ def test_check_grad(self):
 
 
 class TestRelu(TestActivation):
+
     def setUp(self):
         self.op_type = "relu"
         self.init_dtype()
@@ -1368,12 +1421,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, self.relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[10, 12], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[10, 12],
+                                        dtype='int32')
             self.assertRaises(TypeError, self.relu, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[10, 12], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[10, 12],
+                                       dtype='float16')
             self.relu(x_fp16)
 
 
@@ -1390,6 +1445,7 @@ def ref_leaky_relu(x, alpha=0.01):
 
 
 class TestLeakyRelu(TestActivation):
+
     def get_alpha(self):
         return 0.02
 
@@ -1415,16 +1471,19 @@ def test_check_grad(self):
 
 
 class TestLeakyReluAlpha1(TestLeakyRelu):
+
     def get_alpha(self):
         return 2
 
 
 class TestLeakyReluAlpha2(TestLeakyRelu):
+
     def get_alpha(self):
         return -0.01
 
 
 class TestLeakyReluAlpha3(TestLeakyRelu):
+
     def get_alpha(self):
         return -2.0
 
@@ -1485,25 +1544,28 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.leaky_relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.leaky_relu, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.leaky_relu(x_fp16)
 
 
 def gelu(x, approximate):
     if approximate:
-        y_ref = 0.5 * x * (1.0 + np.tanh(
-            np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+        y_ref = 0.5 * x * (
+            1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
     else:
         y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2)))
     return y_ref.astype(x.dtype)
 
 
 class TestGeluApproximate(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.init_dtype()
@@ -1523,6 +1585,7 @@ def test_check_grad(self):
 
 
 class TestGelu(TestActivation):
+
     def setUp(self):
         self.op_type = "gelu"
         self.init_dtype()
@@ -1586,16 +1649,19 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.gelu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[11, 17], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[11, 17],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.gelu, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[11, 17], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[11, 17],
+                                       dtype='float16')
             F.gelu(x_fp16)
 
 
 class TestBRelu(TestActivation):
+
     def setUp(self):
         self.op_type = "brelu"
         self.init_dtype()
@@ -1657,8 +1723,9 @@ def test_errors(self):
             x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
             self.assertRaises(TypeError, fluid.layers.brelu, x_int32)
             # support the input dtype is float16
-            x_fp16 = fluid.layers.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = fluid.layers.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             fluid.layers.brelu(x_fp16)
 
 
@@ -1670,6 +1737,7 @@ def ref_relu6(x, threshold=6.0):
 
 
 class TestRelu6(TestActivation):
+
     def setUp(self):
         self.op_type = "relu6"
         self.init_dtype()
@@ -1738,12 +1806,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.relu6, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.relu6, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.relu6(x_fp16)
 
 
@@ -1753,6 +1823,7 @@ def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
 
 
 class TestHardSwish(TestActivation):
+
     def setUp(self):
         self.op_type = 'hard_swish'
         self.init_dtype()
@@ -1834,12 +1905,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.hardswish, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.hardswish, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.hardswish(x_fp16)
 
     def test_api_eager_dygraph(self):
@@ -1849,6 +1922,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestSoftRelu(TestActivation):
+
     def setUp(self):
         self.op_type = "soft_relu"
         self.init_dtype()
@@ -1875,6 +1949,7 @@ def test_check_grad(self):
 
 
 class TestSoftReluOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program()):
             # The input type must be Variable.
@@ -1893,6 +1968,7 @@ def elu(x, alpha):
 
 
 class TestELU(TestActivation):
+
     def setUp(self):
         self.op_type = "elu"
         self.init_dtype()
@@ -1917,6 +1993,7 @@ def get_alpha(self):
 
 
 class TestELUAlpha(TestELU):
+
     def get_alpha(self):
         return -0.2
 
@@ -1972,12 +2049,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, self.elu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[10, 12], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[10, 12],
+                                        dtype='int32')
             self.assertRaises(TypeError, self.elu, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[10, 12], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[10, 12],
+                                       dtype='float16')
             self.elu(x_fp16)
 
 
@@ -1999,6 +2078,7 @@ def celu(x, alpha):
 
 
 class TestCELU(TestActivation):
+
     def setUp(self):
         self.op_type = "celu"
         self.init_dtype()
@@ -2069,16 +2149,19 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, self.celu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[10, 12], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[10, 12],
+                                        dtype='int32')
             self.assertRaises(TypeError, self.celu, x_int32)
             # The alpha must be not equal 0
-            x_fp32 = paddle.fluid.data(
-                name='x_fp32', shape=[10, 12], dtype='float32')
+            x_fp32 = paddle.fluid.data(name='x_fp32',
+                                       shape=[10, 12],
+                                       dtype='float32')
             self.assertRaises(ZeroDivisionError, F.celu, x_fp32, 0)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[10, 12], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[10, 12],
+                                       dtype='float16')
             self.celu(x_fp16)
 
     def test_api_eager_dygraph(self):
@@ -2088,6 +2171,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestReciprocal(TestActivation):
+
     def setUp(self):
         self.op_type = "reciprocal"
         self.python_api = paddle.reciprocal
@@ -2110,6 +2194,7 @@ def test_check_output(self):
 
 
 class TestLog(TestActivation):
+
     def setUp(self):
         self.op_type = "log"
         self.check_eager = True
@@ -2129,16 +2214,21 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out', check_eager=True)
 
     def test_error(self):
-        in1 = fluid.layers.data(
-            name="in1", shape=[11, 17], append_batch_size=False, dtype="int32")
-        in2 = fluid.layers.data(
-            name="in2", shape=[11, 17], append_batch_size=False, dtype="int64")
+        in1 = fluid.layers.data(name="in1",
+                                shape=[11, 17],
+                                append_batch_size=False,
+                                dtype="int32")
+        in2 = fluid.layers.data(name="in2",
+                                shape=[11, 17],
+                                append_batch_size=False,
+                                dtype="int64")
 
         self.assertRaises(TypeError, fluid.layers.log, in1)
         self.assertRaises(TypeError, fluid.layers.log, in2)
 
 
 class TestLog2(TestActivation):
+
     def setUp(self):
         self.op_type = "log2"
         self.check_eager = True
@@ -2167,8 +2257,9 @@ def test_api(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
             input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float64")
-            data_x = paddle.static.data(
-                name="data_x", shape=[11, 17], dtype="float64")
+            data_x = paddle.static.data(name="data_x",
+                                        shape=[11, 17],
+                                        dtype="float64")
 
             out1 = paddle.log2(data_x)
             exe = paddle.static.Executor(place=fluid.CPUPlace())
@@ -2190,6 +2281,7 @@ def test_api(self):
 
 
 class TestLog10(TestActivation):
+
     def setUp(self):
         self.op_type = "log10"
         self.check_eager = True
@@ -2218,8 +2310,9 @@ def test_api(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
             input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float64")
-            data_x = paddle.static.data(
-                name="data_x", shape=[11, 17], dtype="float64")
+            data_x = paddle.static.data(name="data_x",
+                                        shape=[11, 17],
+                                        dtype="float64")
 
             out1 = paddle.log10(data_x)
             exe = paddle.static.Executor(place=paddle.CPUPlace())
@@ -2241,6 +2334,7 @@ def test_api(self):
 
 
 class TestLog1p(TestActivation):
+
     def setUp(self):
         self.op_type = "log1p"
         self.check_eager = True
@@ -2262,11 +2356,10 @@ def test_check_grad(self):
     def test_api(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float64")
-            data_x = fluid.layers.data(
-                name="data_x",
-                shape=[11, 17],
-                append_batch_size=False,
-                dtype="float64")
+            data_x = fluid.layers.data(name="data_x",
+                                       shape=[11, 17],
+                                       append_batch_size=False,
+                                       dtype="float64")
 
             out1 = paddle.log1p(data_x)
             exe = fluid.Executor(place=fluid.CPUPlace())
@@ -2288,6 +2381,7 @@ def test_api(self):
 
 
 class TestSquare(TestActivation):
+
     def setUp(self):
         self.op_type = "square"
         self.python_api = paddle.square
@@ -2303,8 +2397,10 @@ def setUp(self):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.007, check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.007,
+                        check_eager=True)
 
     def test_check_output(self):
         self.check_output(check_eager=True)
@@ -2313,6 +2409,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSquareBF16(OpTest):
+
     def setUp(self):
         self.op_type = "square"
         self.python_api = paddle.square
@@ -2336,11 +2433,14 @@ def test_check_output(self):
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', numeric_grad_delta=0.5, check_eager=True)
+        self.check_grad_with_place(place, ['X'],
+                                   'Out',
+                                   numeric_grad_delta=0.5,
+                                   check_eager=True)
 
 
 class TestPow(TestActivation):
+
     def setUp(self):
         self.op_type = "pow"
         self.python_api = paddle.pow
@@ -2365,6 +2465,7 @@ def test_check_grad(self):
 
 
 class TestPow_factor_tensor(TestActivation):
+
     def setUp(self):
         self.op_type = "pow"
         self.check_eager = False
@@ -2393,13 +2494,14 @@ def test_check_grad(self):
 
     def test_api(self):
         input = np.random.uniform(1, 2, [11, 17]).astype("float32")
-        x = fluid.layers.data(
-            name="x", shape=[11, 17], append_batch_size=False, dtype="float32")
-        res = fluid.layers.data(
-            name="res",
-            shape=[11, 17],
-            append_batch_size=False,
-            dtype="float32")
+        x = fluid.layers.data(name="x",
+                              shape=[11, 17],
+                              append_batch_size=False,
+                              dtype="float32")
+        res = fluid.layers.data(name="res",
+                                shape=[11, 17],
+                                append_batch_size=False,
+                                dtype="float32")
 
         factor_1 = 2.0
         factor_2 = fluid.layers.fill_constant([1], "float32", 3.0)
@@ -2420,20 +2522,22 @@ def test_api(self):
         assert np.allclose(res_6, np.power(input, 3))
 
     def test_error(self):
-        in1 = fluid.layers.data(
-            name="in1", shape=[11, 17], append_batch_size=False, dtype="int32")
-        in2 = fluid.layers.data(
-            name="in2", shape=[11, 17], append_batch_size=False, dtype="int64")
-        in3 = fluid.layers.data(
-            name="in3",
-            shape=[11, 17],
-            append_batch_size=False,
-            dtype="float32")
-        in4 = fluid.layers.data(
-            name="in4",
-            shape=[11, 17],
-            append_batch_size=False,
-            dtype="float64")
+        in1 = fluid.layers.data(name="in1",
+                                shape=[11, 17],
+                                append_batch_size=False,
+                                dtype="int32")
+        in2 = fluid.layers.data(name="in2",
+                                shape=[11, 17],
+                                append_batch_size=False,
+                                dtype="int64")
+        in3 = fluid.layers.data(name="in3",
+                                shape=[11, 17],
+                                append_batch_size=False,
+                                dtype="float32")
+        in4 = fluid.layers.data(name="in4",
+                                shape=[11, 17],
+                                append_batch_size=False,
+                                dtype="float64")
 
         factor_1 = fluid.layers.fill_constant([1], "float64", 3.0)
 
@@ -2449,6 +2553,7 @@ def ref_stanh(x, scale_a=0.67, scale_b=1.7159):
 
 
 class TestSTanh(TestActivation):
+
     def get_scale_a(self):
         return 0.67
 
@@ -2477,11 +2582,13 @@ def test_check_grad(self):
 
 
 class TestSTanhScaleA(TestSTanh):
+
     def get_scale_a(self):
         return 2.0
 
 
 class TestSTanhScaleB(TestSTanh):
+
     def get_scale_b(self):
         return 0.5
 
@@ -2538,21 +2645,25 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, paddle.stanh, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, paddle.stanh, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             paddle.stanh(x_fp16)
 
 
 class TestSTanhAPIScaleA(TestSTanhAPI):
+
     def get_scale_a(self):
         return 2.0
 
 
 class TestSTanhAPIScaleB(TestSTanhAPI):
+
     def get_scale_b(self):
         return 0.5
 
@@ -2565,6 +2676,7 @@ def ref_softplus(x, beta=1, threshold=20):
 
 
 class TestSoftplus(TestActivation):
+
     def setUp(self):
         self.op_type = "softplus"
         self.init_dtype()
@@ -2588,6 +2700,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftplusBF16(OpTest):
+
     def setUp(self):
         self.op_type = "softplus"
         self.init_dtype()
@@ -2664,12 +2777,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.softplus, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.softplus, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.softplus(x_fp16)
 
 
@@ -2679,6 +2794,7 @@ def ref_softsign(x):
 
 
 class TestSoftsign(TestActivation):
+
     def setUp(self):
         self.op_type = "softsign"
         self.init_dtype()
@@ -2743,12 +2859,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.softsign, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.softsign, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.softsign(x_fp16)
 
 
@@ -2758,6 +2876,7 @@ def ref_thresholded_relu(x, threshold=1.0):
 
 
 class TestThresholdedRelu(TestActivation):
+
     def setUp(self):
         self.op_type = "thresholded_relu"
         self.init_dtype()
@@ -2828,12 +2947,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.thresholded_relu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.thresholded_relu, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.thresholded_relu(x_fp16)
 
 
@@ -2842,6 +2963,7 @@ def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5):
 
 
 class TestHardSigmoid(TestActivation):
+
     def setUp(self):
         self.op_type = "hard_sigmoid"
         self.dtype = 'float64'
@@ -2869,11 +2991,13 @@ def set_attrs(self):
 
 
 class TestHardSigmoidFP32(TestHardSigmoid):
+
     def set_attrs(self):
         self.dtype = 'float32'
 
 
 class TestHardSigmoidSlopeOffset(TestHardSigmoid):
+
     def set_attrs(self):
         self.slope = 0.2
         self.offset = 0.4
@@ -2929,12 +3053,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.hardsigmoid, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.hardsigmoid, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.hardsigmoid(x_fp16)
 
 
@@ -2944,6 +3070,7 @@ def ref_swish(x):
 
 
 class TestSwish(TestActivation):
+
     def setUp(self):
         self.op_type = "swish"
         self.python_api = paddle.nn.functional.swish
@@ -3018,12 +3145,14 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.swish, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.swish, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.swish(x_fp16)
 
 
@@ -3034,6 +3163,7 @@ def ref_mish(x, threshold=20.):
 
 
 class TestMish(TestActivation):
+
     def setUp(self):
         self.op_type = "mish"
         self.python_api = paddle.fluid.layers.nn.mish
@@ -3102,26 +3232,32 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.mish, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.mish, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.mish(x_fp16)
 
 
 #------------------ Test Error Activation----------------------
 def create_test_error_class(op_type):
+
     class TestOpErrors(unittest.TestCase):
+
         def test_errors(self):
             with program_guard(Program(), Program()):
                 op = getattr(fluid.layers, op_type)
                 # The input dtype of op_type must be float32, float64.
-                in1 = fluid.layers.data(
-                    name='input2', shape=[12, 10], dtype="int32")
-                in2 = fluid.layers.data(
-                    name='input3', shape=[12, 10], dtype="int64")
+                in1 = fluid.layers.data(name='input2',
+                                        shape=[12, 10],
+                                        dtype="int32")
+                in2 = fluid.layers.data(name='input3',
+                                        shape=[12, 10],
+                                        dtype="int64")
                 self.assertRaises(TypeError, op, in1)
                 self.assertRaises(TypeError, op, in2)
 
@@ -3150,9 +3286,11 @@ def test_errors(self):
 
 #------------------ Test Cudnn Activation----------------------
 def create_test_act_cudnn_class(parent, atol=1e-3, grad_atol=1e-3):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestActCudnn(parent):
+
         def init_kernel_type(self):
             self.attrs = {"use_cudnn": True}
 
@@ -3172,9 +3310,11 @@ def create_test_act_fp16_class(parent,
                                atol=1e-3,
                                grad_check=True,
                                grad_atol=0.80):
+
     @unittest.skipIf(not paddle.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestActFp16(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -3188,8 +3328,9 @@ def test_check_grad(self):
             place = core.CUDAPlace(0)
             support_fp16 = core.is_float16_supported(place)
             if support_fp16 and grad_check:
-                self.check_grad_with_place(
-                    place, ['X'], 'Out', max_relative_error=grad_atol)
+                self.check_grad_with_place(place, ['X'],
+                                           'Out',
+                                           max_relative_error=grad_atol)
 
     cls_name = "{0}_{1}".format(parent.__name__, "fp16")
     TestActFp16.__name__ = cls_name
@@ -3253,9 +3394,11 @@ def create_test_act_bf16_class(parent,
                                atol=1e-2,
                                grad_check=True,
                                grad_atol=0.80):
+
     @unittest.skipIf(not paddle.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestActBF16(parent):
+
         def init_dtype(self):
             self.dtype = np.uint16
 
@@ -3265,8 +3408,9 @@ def test_check_output(self):
 
         def test_check_grad(self):
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['X'], 'Out', max_relative_error=grad_atol)
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       max_relative_error=grad_atol)
 
     cls_name = "{0}_{1}".format(parent.__name__, "bf16")
     TestActBF16.__name__ = cls_name
diff --git a/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py
index 5c07a544ca156..cbc32bbc4a1e6 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,10 +24,11 @@
 
 
 class TestSparseSquareOp(unittest.TestCase):
+
     def check_with_place(self, place):
         scope = core.Scope()
 
-        # create and initialize Grad Variable   
+        # create and initialize Grad Variable
         height = 10
         rows = [0, 4, 7]
         self.row_numel = 12
@@ -61,10 +62,11 @@ def test_sparse_acti(self):
 
 
 class TestSparseSqrtOp(unittest.TestCase):
+
     def check_with_place(self, place):
         scope = core.Scope()
 
-        # create and initialize Grad Variable   
+        # create and initialize Grad Variable
         height = 10
         rows = [0, 4, 7]
         self.row_numel = 12
diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
index 44dd3d60bdca1..5d96dc38a7103 100644
--- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
@@ -22,6 +22,7 @@
 
 
 class TestAdadeltaOp1(OpTest):
+
     def setUp(self):
         self.op_type = "adadelta"
         param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
@@ -47,8 +48,8 @@ def setUp(self):
             (1 - rho) * np.square(grad)
         update = -np.multiply(
             np.sqrt(
-                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
-                          epsilon)), grad)
+                np.divide(avg_squared_update + epsilon,
+                          avg_squared_grad_out + epsilon)), grad)
 
         avg_squared_update_out = rho * avg_squared_update + \
             (1 - rho) * np.square(update)
@@ -92,8 +93,8 @@ def setUp(self):
             (1 - rho) * np.square(grad)
         update = -np.multiply(
             np.sqrt(
-                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
-                          epsilon)), grad)
+                np.divide(avg_squared_update + epsilon,
+                          avg_squared_grad_out + epsilon)), grad)
 
         avg_squared_update_out = rho * avg_squared_update + \
             (1 - rho) * np.square(update)
@@ -111,16 +112,16 @@ def test_check_output(self):
 
 
 class TestAdadeltaV2(unittest.TestCase):
+
     def test_adadelta_dygraph(self):
         paddle.disable_static(paddle.CPUPlace())
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Adadelta(
-            learning_rate=0.01,
-            parameters=linear.parameters(),
-            weight_decay=0.01)
+        adam = paddle.optimizer.Adadelta(learning_rate=0.01,
+                                         parameters=linear.parameters(),
+                                         weight_decay=0.01)
         out = linear(a)
         out.backward()
         adam.step()
@@ -141,8 +142,8 @@ def test_adadelta(self):
             rms_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -151,16 +152,18 @@ def test_adadelta(self):
 
     def test_raise_error(self):
         self.assertRaises(ValueError, paddle.optimizer.Adadelta, None)
-        self.assertRaises(
-            ValueError, paddle.optimizer.Adadelta, learning_rate=0.1, rho=None)
-        self.assertRaises(
-            ValueError,
-            paddle.optimizer.Adadelta,
-            learning_rate=0.1,
-            epsilon=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.Adadelta,
+                          learning_rate=0.1,
+                          rho=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.Adadelta,
+                          learning_rate=0.1,
+                          epsilon=None)
 
 
 class TestAdadeltaV2Group(TestAdadeltaV2):
+
     def test_adadelta_dygraph(self):
         paddle.disable_static(paddle.CPUPlace())
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -168,15 +171,17 @@ def test_adadelta_dygraph(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 5)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Adadelta(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001,
-            }],
-            weight_decay=0.1)
+        adam = paddle.optimizer.Adadelta(learning_rate=0.01,
+                                         parameters=[{
+                                             'params':
+                                             linear_1.parameters()
+                                         }, {
+                                             'params':
+                                             linear_2.parameters(),
+                                             'weight_decay':
+                                             0.001,
+                                         }],
+                                         weight_decay=0.1)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
index ae047e602d15a..4f290d4befa52 100644
--- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
@@ -86,10 +86,11 @@ def test_check_output(self):
 
 
 class TestSparseAdagradOp(unittest.TestCase):
+
     def check_with_place(self, place):
         scope = core.Scope()
 
-        # create and initialize Grad Variable   
+        # create and initialize Grad Variable
         height = 10
         rows = [0, 4, 7, 4]
         row_numel = 12
@@ -120,15 +121,14 @@ def check_with_place(self, place):
         moment.set(moment_np_array, place)
 
         # create and run sgd operator
-        adagrad_op = Operator(
-            "adagrad",
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            Moment='Moment',
-            MomentOut='Moment',
-            LearningRate='LearningRate',
-            epsilon=2.0)
+        adagrad_op = Operator("adagrad",
+                              Param='Param',
+                              Grad='Grad',
+                              ParamOut='Param',
+                              Moment='Moment',
+                              MomentOut='Moment',
+                              LearningRate='LearningRate',
+                              epsilon=2.0)
 
         adagrad_op.run(scope, place)
 
@@ -152,34 +152,31 @@ def check_with_place(self, place):
         def get_out(param, lr, grad, m, epsilon):
             return param - lr * grad / (math.sqrt(m) + epsilon)
 
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
-            result_array[rows[0], 0],
-            places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
-            result_array[rows[0], 2],
-            places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[1, 0], places=5)
+        self.assertAlmostEqual(get_out(5.0, 2.0, 2.0, 6.0, 2.0),
+                               result_array[rows[0], 0],
+                               places=5)
+        self.assertAlmostEqual(get_out(5.0, 2.0, 1.0, 3.0, 2.0),
+                               result_array[rows[0], 2],
+                               places=5)
+        self.assertAlmostEqual(get_out(5.0, 2.0, 0.0, 2.0, 2.0),
+                               result_array[1, 0],
+                               places=5)
 
         # grad_merge = 1.0 + 1.0
         # m = 6.0
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
-            result_array[rows[1], 10],
-            places=5)
-
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[5, 8], places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
-            result_array[rows[2], 1],
-            places=5)
-        self.assertAlmostEqual(
-            get_out(5.0, 2.0, 4.0, 18.0, 2.0),
-            result_array[rows[2], 8],
-            places=5)
+        self.assertAlmostEqual(get_out(5.0, 2.0, 2.0, 6.0, 2.0),
+                               result_array[rows[1], 10],
+                               places=5)
+
+        self.assertAlmostEqual(get_out(5.0, 2.0, 0.0, 2.0, 2.0),
+                               result_array[5, 8],
+                               places=5)
+        self.assertAlmostEqual(get_out(5.0, 2.0, 1.0, 3.0, 2.0),
+                               result_array[rows[2], 1],
+                               places=5)
+        self.assertAlmostEqual(get_out(5.0, 2.0, 4.0, 18.0, 2.0),
+                               result_array[rows[2], 8],
+                               places=5)
 
     def test_sparse_adagrad(self):
         places = [core.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py b/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py
index c6a69c0723ce9..3096dc33a1166 100644
--- a/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py
@@ -24,20 +24,22 @@
 
 
 class TestAdagradOpV2(unittest.TestCase):
+
     def test_v20_coverage(self):
         paddle.disable_static()
         inp = paddle.rand(shape=[10, 10])
         linear = paddle.nn.Linear(10, 10)
         out = linear(inp)
         loss = paddle.mean(out)
-        adagrad = paddle.optimizer.Adagrad(
-            learning_rate=0.1, parameters=linear.parameters())
+        adagrad = paddle.optimizer.Adagrad(learning_rate=0.1,
+                                           parameters=linear.parameters())
         out.backward()
         adagrad.step()
         adagrad.clear_grad()
 
 
 class TestAdagradOpV2Group(TestAdagradOpV2):
+
     def test_v20_coverage(self):
         paddle.disable_static()
         inp = paddle.rand(shape=[10, 10])
@@ -46,15 +48,17 @@ def test_v20_coverage(self):
         out = linear_1(inp)
         out = linear_2(out)
         loss = paddle.mean(out)
-        adagrad = paddle.optimizer.Adagrad(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001,
-            }],
-            weight_decay=0.1)
+        adagrad = paddle.optimizer.Adagrad(learning_rate=0.01,
+                                           parameters=[{
+                                               'params':
+                                               linear_1.parameters()
+                                           }, {
+                                               'params':
+                                               linear_2.parameters(),
+                                               'weight_decay':
+                                               0.001,
+                                           }],
+                                           weight_decay=0.1)
         out.backward()
         adagrad.step()
         adagrad.clear_grad()
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index d254cd286e666..61597562a4ab0 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -25,6 +25,7 @@
 
 
 class TestAdamOp1(OpTest):
+
     def setUp(self):
         '''Test Adam Op with supplied attributes
         '''
@@ -70,6 +71,7 @@ def test_check_output(self):
 
 
 class TestAdamOp2(OpTest):
+
     def set_shape(self):
         self.shape = (102, 105)
 
@@ -119,11 +121,13 @@ def test_check_output(self):
 
 
 class TestAdamOnlyTailOp(TestAdamOp2):
+
     def set_shape(self):
         self.shape = (3)
 
 
 class TestAdamOpMultipleSteps(OpTest):
+
     def setUp(self):
         '''Test Adam Operator with supplied attributes
         '''
@@ -294,13 +298,13 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
     param_out = np.zeros(shape=[height, row_numel])
 
     def update_row(row_id, update_value):
-        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * update_value
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 -
+                                                         beta1) * update_value
         moment2_out[row_id] = beta2 * moment2[row_id] + (
             1 - beta2) * np.square(update_value)
         lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
-        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
-            np.sqrt(moment2_out[row_id]) + epsilon))
+        param_out[row_id] = param[row_id] - lr_t * (
+            moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon))
 
     if lazy_mode:
         for idx, row_id in enumerate(rows):
@@ -316,6 +320,7 @@ def update_row(row_id, update_value):
 
 
 class TestSparseAdamOp(unittest.TestCase):
+
     def setup(self, scope, place, lazy_mode):
         beta1 = 0.78
         beta2 = 0.836
@@ -409,6 +414,7 @@ def test_sparse_adam(self):
 
 
 class TestAdamOpBetaVariable(OpTest):
+
     def setUp(self):
         '''Test Adam Op with beta as Variable
         '''
@@ -456,6 +462,7 @@ def test_check_output(self):
 
 
 class TestAdamOpBetaEpsilonVariable(OpTest):
+
     def setUp(self):
         '''Test Adam Op with beta/epsilon as Variable
         '''
@@ -504,6 +511,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithGlobalBetaPow(OpTest):
+
     def setUp(self):
         '''Test Adam Op with global_beta_pow
         '''
@@ -555,6 +563,7 @@ def test_check_output(self):
 
 
 class TestAdamOpWithSkipUpdate(OpTest):
+
     def setUp(self):
         '''Test Adam Op with global_beta_pow
         '''
@@ -604,6 +613,7 @@ def test_check_output(self):
 
 
 class TestAdamOpV2(unittest.TestCase):
+
     def test_adam_op(self):
         place = fluid.CPUPlace()
         shape = [2, 3, 8, 8]
@@ -616,17 +626,20 @@ def test_adam_op(self):
                 conv = fluid.layers.conv2d(data, 8, 3)
                 loss = fluid.layers.reduce_mean(conv)
 
-                beta1 = fluid.layers.create_global_var(
-                    shape=[1], value=0.85, dtype='float32', persistable=True)
-                beta2 = fluid.layers.create_global_var(
-                    shape=[1], value=0.95, dtype='float32', persistable=True)
+                beta1 = fluid.layers.create_global_var(shape=[1],
+                                                       value=0.85,
+                                                       dtype='float32',
+                                                       persistable=True)
+                beta2 = fluid.layers.create_global_var(shape=[1],
+                                                       value=0.95,
+                                                       dtype='float32',
+                                                       persistable=True)
                 betas = [beta1, beta2]
-                opt = paddle.optimizer.Adam(
-                    learning_rate=1e-5,
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01,
-                    epsilon=1e-8)
+                opt = paddle.optimizer.Adam(learning_rate=1e-5,
+                                            beta1=beta1,
+                                            beta2=beta2,
+                                            weight_decay=0.01,
+                                            epsilon=1e-8)
                 opt.minimize(loss)
 
         exe.run(startup)
@@ -640,8 +653,8 @@ def test_adam_op_dygraph(self):
         a = fluid.dygraph.to_variable(value)
         linear = fluid.Linear(13, 5, dtype="float32")
 
-        adam = paddle.optimizer.Adam(
-            learning_rate=0.01, parameters=linear.parameters())
+        adam = paddle.optimizer.Adam(learning_rate=0.01,
+                                     parameters=linear.parameters())
         out = linear(a)
         out.backward()
         adam.step()
@@ -672,8 +685,8 @@ def test_adam_op_with_state_dict(self):
         with self.assertRaises(TypeError):
             learning_rate = np.array([0.01]).astype("float32")
             learning_rate = paddle.to_tensor(learning_rate)
-            adam = paddle.optimizer.Adam(
-                learning_rate=learning_rate, parameters=emb.parameters())
+            adam = paddle.optimizer.Adam(learning_rate=learning_rate,
+                                         parameters=emb.parameters())
 
         params = adam.get_opti_var_name_list()
         assert (params is not None)
@@ -685,8 +698,9 @@ def test_adam_with_grad_clip(self):
         a = fluid.dygraph.to_variable(value)
         linear = fluid.Linear(13, 5, dtype="float32")
         clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
-        adam = paddle.optimizer.Adam(
-            0.1, parameters=linear.parameters(), grad_clip=clip)
+        adam = paddle.optimizer.Adam(0.1,
+                                     parameters=linear.parameters(),
+                                     grad_clip=clip)
         out = linear(a)
         out.backward()
         adam.step()
@@ -703,8 +717,9 @@ def test_adam_op_with_set_lr(self):
         cur_lr = adam.get_lr()
         assert (lr == cur_lr)
         with self.assertRaises(TypeError):
-            lr_var = paddle.fluid.layers.create_global_var(
-                shape=[1], value=lr, dtype='float32')
+            lr_var = paddle.fluid.layers.create_global_var(shape=[1],
+                                                           value=lr,
+                                                           dtype='float32')
             adam.set_lr(lr_var)
         paddle.enable_static()
 
@@ -712,14 +727,17 @@ def test_adam_op_invalid_input(self):
         paddle.disable_static()
         linear = paddle.nn.Linear(10, 10)
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.Adam(
-                0.1, beta1=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.Adam(0.1,
+                                         beta1=-1,
+                                         parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.Adam(
-                0.1, beta2=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.Adam(0.1,
+                                         beta2=-1,
+                                         parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.Adam(
-                0.1, epsilon=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.Adam(0.1,
+                                         epsilon=-1,
+                                         parameters=linear.parameters())
         paddle.enable_static()
 
     def test_adam_op_with_sparse_input_and_weight_decay(self):
@@ -728,8 +746,9 @@ def test_adam_op_with_sparse_input_and_weight_decay(self):
         x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
         x = paddle.to_tensor(x_data, stop_gradient=False)
         emb = paddle.nn.Embedding(10, 10, sparse=True)
-        adam = paddle.optimizer.Adam(
-            0.001, parameters=emb.parameters(), weight_decay=0.01)
+        adam = paddle.optimizer.Adam(0.001,
+                                     parameters=emb.parameters(),
+                                     weight_decay=0.01)
 
         with self.assertRaises(RuntimeError):
             out = emb(x)
@@ -747,6 +766,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestAdamOptimizer(unittest.TestCase):
+
     def _test(self,
               place,
               use_tensor=True,
@@ -777,8 +797,9 @@ def _test(self,
             with paddle.utils.unique_name.guard():
                 a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
                 b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
-                label = paddle.static.data(
-                    name="label", shape=[2, 1], dtype='int64')
+                label = paddle.static.data(name="label",
+                                           shape=[2, 1],
+                                           dtype='int64')
 
                 sum = paddle.add(a, b)
                 z = paddle.pow(sum, 2.0)
@@ -824,12 +845,11 @@ def _test(self,
                             align_size=256,
                             grad_clip=clip)
                     else:
-                        adam = paddle.optimizer.Adam(
-                            learning_rate=0.01,
-                            beta1=beta1,
-                            beta2=beta2,
-                            epsilon=epsilon,
-                            grad_clip=clip)
+                        adam = paddle.optimizer.Adam(learning_rate=0.01,
+                                                     beta1=beta1,
+                                                     beta2=beta2,
+                                                     epsilon=epsilon,
+                                                     grad_clip=clip)
                 else:
                     if use_fluid_api:
                         adam = fluid.optimizer.Adam(
@@ -842,12 +862,11 @@ def _test(self,
                             align_size=256,
                             grad_clip=clip)
                     else:
-                        adam = fluid.optimizer.Adam(
-                            learning_rate=0.01,
-                            beta1=beta1_init,
-                            beta2=beta2_init,
-                            epsilon=epsilon_init,
-                            grad_clip=clip)
+                        adam = fluid.optimizer.Adam(learning_rate=0.01,
+                                                    beta1=beta1_init,
+                                                    beta2=beta2_init,
+                                                    epsilon=epsilon_init,
+                                                    grad_clip=clip)
 
                 adam.minimize(loss)
 
@@ -858,12 +877,13 @@ def _test(self,
 
             print("Start run on {}".format(place))
             for epoch in range(10):
-                pred_res, loss_res = exe.run(
-                    main_prog,
-                    feed={"a": a_np,
-                          "b": b_np,
-                          "label": label_np},
-                    fetch_list=[prediction, loss])
+                pred_res, loss_res = exe.run(main_prog,
+                                             feed={
+                                                 "a": a_np,
+                                                 "b": b_np,
+                                                 "label": label_np
+                                             },
+                                             fetch_list=[prediction, loss])
                 print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                     epoch, pred_res[0], loss_res))
             paddle.disable_static()
@@ -877,9 +897,10 @@ def _test_with_place(self, place):
             for use_fluid_api in [True, False]:
                 for use_global_beta_pow in [True, False]:
                     for flatten_param_grads in [True, False]:
-                        pred, loss = self._test(
-                            place, use_tensor, use_fluid_api,
-                            use_global_beta_pow, flatten_param_grads)
+                        pred, loss = self._test(place, use_tensor,
+                                                use_fluid_api,
+                                                use_global_beta_pow,
+                                                flatten_param_grads)
                         preds.append(pred)
                         losses.append(loss)
         for pred in preds:
@@ -913,8 +934,9 @@ def test_adam_flatten_param_grads_with_regularizer(self):
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
             avg_cost = fluid.layers.mean(cost)
 
-            adam = fluid.optimizer.AdamOptimizer(
-                0.01, flatten_param_grads=True, align_size=256)
+            adam = fluid.optimizer.AdamOptimizer(0.01,
+                                                 flatten_param_grads=True,
+                                                 align_size=256)
             adam.minimize(avg_cost)
             paddle.disable_static()
 
@@ -937,14 +959,13 @@ def test_adam_exception(self):
         adam = fluid.optimizer.Adam(use_global_beta_pow=True)
         adam.minimize(loss)
         self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
-        adam._add_global_accumulator(
-            'tmp', type=core.VarDesc.VarType.LOD_TENSOR)
+        adam._add_global_accumulator('tmp',
+                                     type=core.VarDesc.VarType.LOD_TENSOR)
         adam._get_global_accumulator('tmp')
-        self.assertRaises(
-            Exception,
-            adam._add_global_accumulator,
-            adam._beta1_pow_acc_str,
-            type=core.VarDesc.VarType.LOD_TENSOR)
+        self.assertRaises(Exception,
+                          adam._add_global_accumulator,
+                          adam._beta1_pow_acc_str,
+                          type=core.VarDesc.VarType.LOD_TENSOR)
         paddle.disable_static()
 
     def test_adam_save_load(self):
@@ -955,12 +976,12 @@ def test_adam_save_load(self):
         state_dict = linear.state_dict()
         fluid.save_dygraph(state_dict, "paddle_dy")
 
-        scheduler = paddle.optimizer.lr.NoamDecay(
-            d_model=0.01, warmup_steps=100, verbose=True)
-        adam = paddle.fluid.optimizer.Adam(
-            learning_rate=scheduler,
-            parameter_list=linear.parameters(),
-            use_global_beta_pow=True)
+        scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
+                                                  warmup_steps=100,
+                                                  verbose=True)
+        adam = paddle.fluid.optimizer.Adam(learning_rate=scheduler,
+                                           parameter_list=linear.parameters(),
+                                           use_global_beta_pow=True)
         adam.minimize(b)
         state_dict = adam.state_dict()
         fluid.save_dygraph(state_dict, "paddle_dy")
@@ -981,8 +1002,9 @@ def get_opt(dtype, shape):
                 state_dict = linear.state_dict()
                 fluid.save_dygraph(state_dict, "paddle_dy")
 
-                scheduler = paddle.optimizer.lr.NoamDecay(
-                    d_model=0.01, warmup_steps=100, verbose=True)
+                scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
+                                                          warmup_steps=100,
+                                                          verbose=True)
                 adam = paddle.fluid.optimizer.Adam(
                     learning_rate=scheduler,
                     parameter_list=linear.parameters(),
@@ -1001,13 +1023,14 @@ def get_opt(dtype, shape):
         self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict)
 
         adam3 = get_opt('float32', [10, 10])  # shape not match
-        opt_state_dict['beta1_pow_acc_0'] = np.array(
-            [0.9, 0.9], dtype='float32')
+        opt_state_dict['beta1_pow_acc_0'] = np.array([0.9, 0.9],
+                                                     dtype='float32')
         self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict)
         paddle.enable_static()
 
 
 class TestAdamOpV2Group(TestAdamOpV2):
+
     def test_adam_op(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -1015,17 +1038,16 @@ def test_adam_op(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Adam(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001,
-                'beta1': 0.1,
-                'beta2': 0.99
-            }],
-            weight_decay=0.1)
+        adam = paddle.optimizer.Adam(learning_rate=0.01,
+                                     parameters=[{
+                                         'params': linear_1.parameters()
+                                     }, {
+                                         'params': linear_2.parameters(),
+                                         'weight_decay': 0.001,
+                                         'beta1': 0.1,
+                                         'beta2': 0.99
+                                     }],
+                                     weight_decay=0.1)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
@@ -1034,6 +1056,7 @@ def test_adam_op(self):
 
 
 class TestMultiTensorAdam(unittest.TestCase):
+
     def _adam_optimize_dygraph(self,
                                place,
                                use_param_attr=False,
@@ -1056,20 +1079,22 @@ def _adam_optimize_dygraph(self,
             model = paddle.nn.Linear(5, 5)
 
         if not use_param_group:
-            optimizer = paddle.optimizer.Adam(
-                parameters=model.parameters(),
-                use_multi_tensor=use_multi_tensor,
-                multi_precision=use_amp)
+            optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
+                                              use_multi_tensor=use_multi_tensor,
+                                              multi_precision=use_amp)
         else:
-            optimizer = paddle.optimizer.Adam(
-                parameters=[{
-                    'params': model.parameters(),
-                    'weight_decay': 0.001,
-                    'beta1': 0.1,
-                    'beta2': 0.99
-                }],
-                use_multi_tensor=use_multi_tensor,
-                multi_precision=use_amp)
+            optimizer = paddle.optimizer.Adam(parameters=[{
+                'params':
+                model.parameters(),
+                'weight_decay':
+                0.001,
+                'beta1':
+                0.1,
+                'beta2':
+                0.99
+            }],
+                                              use_multi_tensor=use_multi_tensor,
+                                              multi_precision=use_amp)
 
         for idx in range(2):
             if place == 'gpu' and use_amp == True:
@@ -1105,8 +1130,8 @@ def _adam_optimize_static(self,
         exe = paddle.static.Executor(place=place)
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
-        optimizer = paddle.optimizer.Adam(
-            multi_precision=use_amp, use_multi_tensor=use_multi_tensor)
+        optimizer = paddle.optimizer.Adam(multi_precision=use_amp,
+                                          use_multi_tensor=use_multi_tensor)
         if use_amp:
             optimizer = paddle.static.amp.decorate(
                 optimizer,
@@ -1116,11 +1141,13 @@ def _adam_optimize_static(self,
                 use_fp16_guard=False)
         with paddle.static.program_guard(train_program, startup_program):
             if use_amp:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float16')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float16')
             else:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float32')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float32')
             hidden = paddle.static.nn.fc(x=data, size=10)
             loss = paddle.fluid.layers.mean(hidden)
             optimizer.minimize(loss)
@@ -1151,59 +1178,54 @@ def _check_with_place_amp(self, place, use_amp):
         output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph(
             place=place, use_amp=use_amp, use_multi_tensor=False)
         self.assertEqual(
-            np.allclose(
-                output_dygraph1, output_dygraph2, rtol=1e-05), True)
+            np.allclose(output_dygraph1, output_dygraph2, rtol=1e-05), True)
         for idx in range(len(params_dygraph1)):
             self.assertEqual(
-                np.allclose(
-                    params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05),
-                True)
+                np.allclose(params_dygraph1[idx],
+                            params_dygraph2[idx],
+                            rtol=1e-05), True)
         # test static mode
-        output_static1 = self._adam_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=True)
-        output_static2 = self._adam_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=False)
+        output_static1 = self._adam_optimize_static(place=place,
+                                                    use_amp=use_amp,
+                                                    use_multi_tensor=True)
+        output_static2 = self._adam_optimize_static(place=place,
+                                                    use_amp=use_amp,
+                                                    use_multi_tensor=False)
         for idx in range(len(output_static1)):
             self.assertEqual(
-                np.allclose(
-                    output_static1[idx], output_static2[idx], rtol=1e-05),
-                True)
+                np.allclose(output_static1[idx],
+                            output_static2[idx],
+                            rtol=1e-05), True)
 
     def _check_with_param_arrt(self, place, use_amp):
-        output1, params1 = self._adam_optimize_dygraph(
-            place=place,
-            use_amp=use_amp,
-            use_param_attr=True,
-            use_multi_tensor=True)
-        output2, params2 = self._adam_optimize_dygraph(
-            place=place,
-            use_amp=use_amp,
-            use_param_attr=True,
-            use_multi_tensor=False)
+        output1, params1 = self._adam_optimize_dygraph(place=place,
+                                                       use_amp=use_amp,
+                                                       use_param_attr=True,
+                                                       use_multi_tensor=True)
+        output2, params2 = self._adam_optimize_dygraph(place=place,
+                                                       use_amp=use_amp,
+                                                       use_param_attr=True,
+                                                       use_multi_tensor=False)
 
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def _check_with_param_group(self, place, use_amp):
-        output1, params1 = self._adam_optimize_dygraph(
-            place=place,
-            use_amp=use_amp,
-            use_param_group=True,
-            use_multi_tensor=True)
-        output2, params2 = self._adam_optimize_dygraph(
-            place=place,
-            use_amp=use_amp,
-            use_param_group=True,
-            use_multi_tensor=False)
+        output1, params1 = self._adam_optimize_dygraph(place=place,
+                                                       use_amp=use_amp,
+                                                       use_param_group=True,
+                                                       use_multi_tensor=True)
+        output2, params2 = self._adam_optimize_dygraph(place=place,
+                                                       use_amp=use_amp,
+                                                       use_param_group=True,
+                                                       use_multi_tensor=False)
 
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def test_main(self):
         for place in self._get_places():
diff --git a/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py b/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py
index 5ad83179e3cff..cc57293a7fa04 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py
@@ -39,8 +39,8 @@ def main_test_func(place, dtype):
             adam_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = fluid.io.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = fluid.io.batch(paddle.dataset.uci_housing.train(),
+                                          batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -49,6 +49,7 @@ def main_test_func(place, dtype):
 
 
 class AdamFp32Test(unittest.TestCase):
+
     def setUp(self):
         self.dtype = 'float32'
 
@@ -58,6 +59,7 @@ def test_main(self):
 
 
 class AdamFp64Test(AdamFp32Test):
+
     def setUp(self):
         self.dtype = 'float64'
 
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py
index 1698ac90a9f2d..dc8f1f969e760 100644
--- a/python/paddle/fluid/tests/unittests/test_adamax_api.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -23,15 +23,15 @@
 
 
 class TestAdamaxAPI(unittest.TestCase):
+
     def func_adamax_api_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
-        adam = paddle.optimizer.Adamax(
-            learning_rate=0.01,
-            parameters=linear.parameters(),
-            weight_decay=0.01)
+        adam = paddle.optimizer.Adamax(learning_rate=0.01,
+                                       parameters=linear.parameters(),
+                                       weight_decay=0.01)
         out = linear(a)
         out.backward()
         adam.step()
@@ -56,12 +56,11 @@ def func_adamax_api(self):
                 loss = paddle.mean(conv)
                 beta1 = 0.85
                 beta2 = 0.95
-                opt = paddle.optimizer.Adamax(
-                    learning_rate=1e-5,
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01,
-                    epsilon=1e-8)
+                opt = paddle.optimizer.Adamax(learning_rate=1e-5,
+                                              beta1=beta1,
+                                              beta2=beta2,
+                                              weight_decay=0.01,
+                                              epsilon=1e-8)
                 opt.minimize(loss)
 
         exe.run(startup)
@@ -76,6 +75,7 @@ def test_adamax_api(self):
 
 
 class TestAdamaxAPIGroup(TestAdamaxAPI):
+
     def func_adamax_api_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -83,17 +83,21 @@ def func_adamax_api_dygraph(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Adamax(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001,
-                'beta1': 0.1,
-                'beta2': 0.99
-            }],
-            weight_decay=0.1)
+        adam = paddle.optimizer.Adamax(learning_rate=0.01,
+                                       parameters=[{
+                                           'params':
+                                           linear_1.parameters()
+                                       }, {
+                                           'params':
+                                           linear_2.parameters(),
+                                           'weight_decay':
+                                           0.001,
+                                           'beta1':
+                                           0.1,
+                                           'beta2':
+                                           0.99
+                                       }],
+                                       weight_decay=0.1)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py
index 8ce7656acfae7..3c8be0529d1f3 100644
--- a/python/paddle/fluid/tests/unittests/test_adamax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py
@@ -20,6 +20,7 @@
 
 
 class TestAdamaxOp1(OpTest):
+
     def setUp(self):
         '''Test Adamax Operator with supplied attributes
         '''
@@ -47,8 +48,8 @@ def setUp(self):
 
         self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
 
-        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
-                                                          self.attrs)
+        param_out, moment_out, inf_norm_out = adamax_step(
+            self.inputs, self.attrs)
 
         self.outputs = {
             'ParamOut': param_out,
@@ -101,6 +102,7 @@ def test_check_output(self):
 
 
 class TestAdamaxOpMultipleSteps(OpTest):
+
     def setUp(self):
         '''Test Adamax Operator with supplied attributes
         '''
@@ -132,8 +134,8 @@ def setUp(self):
 
     def test_check_output(self):
         for _ in range(self.num_steps):
-            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
-                                                              self.attrs)
+            param_out, moment_out, inf_norm_out = adamax_step(
+                self.inputs, self.attrs)
 
             self.outputs = {
                 'ParamOut': param_out,
@@ -185,19 +187,23 @@ def adamax_step(inputs, attributes):
 
 
 class TestAdamaxOpV2(unittest.TestCase):
+
     def test_adamax_op_invalid_input(self):
         import paddle
         paddle.disable_static()
         linear = paddle.nn.Linear(10, 10)
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.Adamax(
-                0.1, beta1=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.Adamax(0.1,
+                                           beta1=-1,
+                                           parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.Adamax(
-                0.1, beta2=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.Adamax(0.1,
+                                           beta2=-1,
+                                           parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.Adamax(
-                0.1, epsilon=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.Adamax(0.1,
+                                           epsilon=-1,
+                                           parameters=linear.parameters())
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index 225bd35a8ec9d..2ece3d2d8ddf0 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -60,6 +60,7 @@ def adamw_step(inputs, attributes):
 
 
 class TestAdamW(OpTest):
+
     def setUp(self):
         '''Test AdamW Op with supplied attributes
         '''
@@ -113,6 +114,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestAdamW2(OpTest):
+
     def setUp(self):
         '''Test AdamW Op with supplied attributes
         '''
@@ -149,8 +151,8 @@ def setUp(self):
             "with_decay": True
         }
 
-        param_out, moment1_out, moment2_out = adamw_step(self.inputs,
-                                                         self.attrs)
+        param_out, moment1_out, moment2_out = adamw_step(
+            self.inputs, self.attrs)
 
         self.outputs = {
             'Moment1Out': moment1_out,
@@ -165,16 +167,16 @@ def test_check_output(self):
 
 
 class TestAdamWOp(unittest.TestCase):
+
     def test_adamw_op_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
-        adam = paddle.optimizer.AdamW(
-            learning_rate=0.01,
-            parameters=linear.parameters(),
-            apply_decay_param_fun=lambda name: True,
-            weight_decay=0.01)
+        adam = paddle.optimizer.AdamW(learning_rate=0.01,
+                                      parameters=linear.parameters(),
+                                      apply_decay_param_fun=lambda name: True,
+                                      weight_decay=0.01)
 
         for _ in range(2):
             out = linear(a)
@@ -187,11 +189,10 @@ def test_adamw_op_coverage(self):
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
-        adam = paddle.optimizer.AdamW(
-            learning_rate=0.0,
-            parameters=linear.parameters(),
-            apply_decay_param_fun=lambda name: True,
-            weight_decay=0.01)
+        adam = paddle.optimizer.AdamW(learning_rate=0.0,
+                                      parameters=linear.parameters(),
+                                      apply_decay_param_fun=lambda name: True,
+                                      weight_decay=0.01)
         assert (adam.__str__() is not None)
 
     def test_adamw_op(self):
@@ -207,17 +208,20 @@ def test_adamw_op(self):
                 conv = fluid.layers.conv2d(data, 8, 3)
                 loss = paddle.mean(conv)
 
-                beta1 = fluid.layers.create_global_var(
-                    shape=[1], value=0.85, dtype='float32', persistable=True)
-                beta2 = fluid.layers.create_global_var(
-                    shape=[1], value=0.95, dtype='float32', persistable=True)
+                beta1 = fluid.layers.create_global_var(shape=[1],
+                                                       value=0.85,
+                                                       dtype='float32',
+                                                       persistable=True)
+                beta2 = fluid.layers.create_global_var(shape=[1],
+                                                       value=0.95,
+                                                       dtype='float32',
+                                                       persistable=True)
                 betas = [beta1, beta2]
-                opt = paddle.optimizer.AdamW(
-                    learning_rate=1e-5,
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01,
-                    epsilon=1e-8)
+                opt = paddle.optimizer.AdamW(learning_rate=1e-5,
+                                             beta1=beta1,
+                                             beta2=beta2,
+                                             weight_decay=0.01,
+                                             epsilon=1e-8)
                 opt.minimize(loss)
 
         exe.run(startup)
@@ -230,14 +234,17 @@ def test_adamw_op_invalid_input(self):
         paddle.disable_static()
         linear = paddle.nn.Linear(10, 10)
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.AdamW(
-                0.1, beta1=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.AdamW(0.1,
+                                          beta1=-1,
+                                          parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.AdamW(
-                0.1, beta2=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.AdamW(0.1,
+                                          beta2=-1,
+                                          parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.AdamW(
-                0.1, epsilon=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.AdamW(0.1,
+                                          epsilon=-1,
+                                          parameters=linear.parameters())
 
     def test_api_eager_dygraph(self):
         with _test_eager_guard():
@@ -246,22 +253,25 @@ def test_api_eager_dygraph(self):
 
 
 class TestAdamWOpGroup(TestAdamWOp):
+
     def test_adamw_op_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
-        adam = paddle.optimizer.AdamW(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001
-            }],
-            apply_decay_param_fun=lambda name: True,
-            weight_decay=0.01)
+        adam = paddle.optimizer.AdamW(learning_rate=0.01,
+                                      parameters=[{
+                                          'params':
+                                          linear_1.parameters()
+                                      }, {
+                                          'params':
+                                          linear_2.parameters(),
+                                          'weight_decay':
+                                          0.001
+                                      }],
+                                      apply_decay_param_fun=lambda name: True,
+                                      weight_decay=0.01)
 
         for _ in range(2):
             out = linear_1(a)
@@ -272,6 +282,7 @@ def test_adamw_op_dygraph(self):
 
 
 class TestAdamWOpMultiPrecison(unittest.TestCase):
+
     def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False):
         paddle.disable_static()
         paddle.seed(10)
@@ -281,14 +292,17 @@ def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False):
 
         model = paddle.nn.Linear(5, 5)
 
-        optimizer = paddle.optimizer.AdamW(
-            parameters=[{
-                'params': model.parameters(),
-                'weight_decay': 0.001,
-                'beta1': 0.1,
-                'beta2': 0.99
-            }],
-            multi_precision=use_amp)
+        optimizer = paddle.optimizer.AdamW(parameters=[{
+            'params':
+            model.parameters(),
+            'weight_decay':
+            0.001,
+            'beta1':
+            0.1,
+            'beta2':
+            0.99
+        }],
+                                           multi_precision=use_amp)
 
         for idx in range(2):
             if place == 'gpu' and use_amp == True:
@@ -324,19 +338,19 @@ def test_main(self):
 
 
 class TestAdamWOpError(unittest.TestCase):
+
     def test_api_errors(self):
+
         def test_weight_decay_dtype():
             linear = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.AdamW(
-                learning_rate=0.01,
-                parameters=linear.parameters(),
-                weight_decay=1)
+            adam = paddle.optimizer.AdamW(learning_rate=0.01,
+                                          parameters=linear.parameters(),
+                                          weight_decay=1)
 
         def test_parameters_dtype1():
-            adam = paddle.optimizer.AdamW(
-                learning_rate=0.01,
-                parameters=paddle.randn((5, 5)),
-                weight_decay=0.1)
+            adam = paddle.optimizer.AdamW(learning_rate=0.01,
+                                          parameters=paddle.randn((5, 5)),
+                                          weight_decay=0.1)
 
         def test_parameters_dtype2():
             linear = paddle.nn.Linear(13, 5)
@@ -346,8 +360,9 @@ def test_parameters_dtype2():
                 weight_decay=0.1)
 
         def test_parameters_dtype3():
-            adam = paddle.optimizer.AdamW(
-                learning_rate=0.01, parameters=None, weight_decay=0.1)
+            adam = paddle.optimizer.AdamW(learning_rate=0.01,
+                                          parameters=None,
+                                          weight_decay=0.1)
 
         def test_parameters_dtype4():
             linear = paddle.nn.Linear(13, 5)
@@ -358,18 +373,16 @@ def test_parameters_dtype4():
 
         def test_learning_rate_dtype():
             linear = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.AdamW(
-                learning_rate=1,
-                parameters=linear.parameters(),
-                weight_decay=0.1)
+            adam = paddle.optimizer.AdamW(learning_rate=1,
+                                          parameters=linear.parameters(),
+                                          weight_decay=0.1)
 
         def test_grad_clip_dtype():
             linear = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.AdamW(
-                learning_rate=0.01,
-                parameters=linear.parameters(),
-                weight_decay=0.1,
-                grad_clip=0.1)
+            adam = paddle.optimizer.AdamW(learning_rate=0.01,
+                                          parameters=linear.parameters(),
+                                          weight_decay=0.1,
+                                          grad_clip=0.1)
 
         self.assertRaises(TypeError, test_weight_decay_dtype)
         self.assertRaises(TypeError, test_parameters_dtype1)
@@ -381,6 +394,7 @@ def test_grad_clip_dtype():
 
 
 class TestAdamWOpGroupWithLR(TestAdamWOp):
+
     def test_adamw_op_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -422,6 +436,7 @@ def simple_lr_setting(param, decay_rate, n_layers):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestAdamWOpLayerwiseLR(TestAdamWOp):
+
     def setUp(self):
         random.seed(2022)
         np.random.seed(2022)
@@ -460,16 +475,15 @@ def test_adamw_op_dygraph(self):
         beta1 = 0.9
         beta2 = 0.999
 
-        opt = paddle.optimizer.AdamW(
-            learning_rate=learning_rate,
-            parameters=[{
-                'params': linear1.parameters()
-            }, {
-                'params': linear2.parameters(),
-            }],
-            apply_decay_param_fun=lambda name: True,
-            weight_decay=weight_decay,
-            lr_ratio=simple_lr_fun)
+        opt = paddle.optimizer.AdamW(learning_rate=learning_rate,
+                                     parameters=[{
+                                         'params': linear1.parameters()
+                                     }, {
+                                         'params': linear2.parameters(),
+                                     }],
+                                     apply_decay_param_fun=lambda name: True,
+                                     weight_decay=weight_decay,
+                                     lr_ratio=simple_lr_fun)
 
         def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
             np_inputs = {
@@ -490,8 +504,8 @@ def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
                 "coeff": weight_decay,
                 "with_decay": True
             }
-            param_out, moment1_out, moment2_out = adamw_step(np_inputs,
-                                                             np_attrs)
+            param_out, moment1_out, moment2_out = adamw_step(
+                np_inputs, np_attrs)
             return param_out, moment1_out, moment2_out
 
         for i in range(5):
@@ -503,20 +517,16 @@ def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
             out.backward()
 
             fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output(
-                fc1_w,
-                np.array(linear1.weight.grad), fc1_w_mon1, fc1_w_mon2,
+                fc1_w, np.array(linear1.weight.grad), fc1_w_mon1, fc1_w_mon2,
                 simple_lr_fun(linear1.weight), i + 1)
             fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output(
-                fc1_b,
-                np.array(linear1.bias.grad), fc1_b_mon1, fc1_b_mon2,
+                fc1_b, np.array(linear1.bias.grad), fc1_b_mon1, fc1_b_mon2,
                 simple_lr_fun(linear1.bias), i + 1)
             fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output(
-                fc2_w,
-                np.array(linear2.weight.grad), fc2_w_mon1, fc2_w_mon2,
+                fc2_w, np.array(linear2.weight.grad), fc2_w_mon1, fc2_w_mon2,
                 simple_lr_fun(linear2.weight), i + 1)
             fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output(
-                fc2_b,
-                np.array(linear2.bias.grad), fc2_b_mon1, fc2_b_mon2,
+                fc2_b, np.array(linear2.bias.grad), fc2_b_mon1, fc2_b_mon2,
                 simple_lr_fun(linear2.bias), i + 1)
 
             opt.step()
@@ -552,10 +562,14 @@ def test_adamw_op(self):
                 bias_attr2 = paddle.framework.ParamAttr(
                     name="linear_1.b_0",
                     initializer=paddle.nn.initializer.Constant(value=1.0))
-                linear1 = paddle.nn.Linear(
-                    10, 32, weight_attr=weight_attr1, bias_attr=bias_attr1)
-                linear2 = paddle.nn.Linear(
-                    32, 1, weight_attr=weight_attr2, bias_attr=bias_attr2)
+                linear1 = paddle.nn.Linear(10,
+                                           32,
+                                           weight_attr=weight_attr1,
+                                           bias_attr=bias_attr1)
+                linear2 = paddle.nn.Linear(32,
+                                           1,
+                                           weight_attr=weight_attr2,
+                                           bias_attr=bias_attr2)
 
                 out = linear1(x)
                 out = linear2(out)
@@ -572,16 +586,16 @@ def test_adamw_op(self):
                 cost = fluid.layers.square_error_cost(input=out, label=y)
                 avg_cost = fluid.layers.mean(cost)
 
-                simple_lr_fun = partial(
-                    simple_lr_setting, decay_rate=0.8, n_layers=2)
+                simple_lr_fun = partial(simple_lr_setting,
+                                        decay_rate=0.8,
+                                        n_layers=2)
 
-                opt = paddle.optimizer.AdamW(
-                    learning_rate=learning_rate,
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=weight_decay,
-                    epsilon=epsilon,
-                    lr_ratio=simple_lr_fun)
+                opt = paddle.optimizer.AdamW(learning_rate=learning_rate,
+                                             beta1=beta1,
+                                             beta2=beta2,
+                                             weight_decay=weight_decay,
+                                             epsilon=epsilon,
+                                             lr_ratio=simple_lr_fun)
                 opt.minimize(avg_cost)
 
         def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
@@ -603,8 +617,8 @@ def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
                 "coeff": weight_decay,
                 "with_decay": True
             }
-            param_out, moment1_out, moment2_out = adamw_step(np_inputs,
-                                                             np_attrs)
+            param_out, moment1_out, moment2_out = adamw_step(
+                np_inputs, np_attrs)
             return param_out, moment1_out, moment2_out
 
         fetch_list1 = [
@@ -625,12 +639,16 @@ def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
             outputs = np.random.random(size=[8, 1]).astype('float32')
 
             param = exe.run(test_prog,
-                            feed={"x": inputs,
-                                  "y": outputs},
+                            feed={
+                                "x": inputs,
+                                "y": outputs
+                            },
                             fetch_list=fetch_list1)
             params_and_gras = exe.run(train_prog,
-                                      feed={"x": inputs,
-                                            "y": outputs},
+                                      feed={
+                                          "x": inputs,
+                                          "y": outputs
+                                      },
                                       fetch_list=fetch_list2)
 
             fc1_w = param[0]
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
index 47658518551f2..204a16668ad85 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
@@ -47,8 +47,8 @@ def avg_pool1D_forward_naive(x,
     if adaptive:
         L_out = ksize[0]
     else:
-        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
 
     out = np.zeros((N, C, L_out))
@@ -64,15 +64,16 @@ def avg_pool1D_forward_naive(x,
         field_size = (r_end - r_start) \
             if (exclusive or adaptive) else (ksize[0])
         if data_type == np.int8 or data_type == np.uint8:
-            out[:, :, i] = (np.rint(
-                np.sum(x_masked, axis=(2, 3)) / field_size)).astype(data_type)
+            out[:, :, i] = (np.rint(np.sum(x_masked, axis=(2, 3)) /
+                                    field_size)).astype(data_type)
         else:
-            out[:, :, i] = (np.sum(x_masked, axis=(2)) /
-                            field_size).astype(data_type)
+            out[:, :,
+                i] = (np.sum(x_masked, axis=(2)) / field_size).astype(data_type)
     return out
 
 
 class TestPool1D_API(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -84,8 +85,11 @@ def check_adaptive_avg_dygraph_results(self, place):
             input_np = np.random.random([2, 3, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
             result = F.adaptive_avg_pool1d(input, output_size=16)
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+            result_np = avg_pool1D_forward_naive(input_np,
+                                                 ksize=[16],
+                                                 strides=[0],
+                                                 paddings=[0],
+                                                 adaptive=True)
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -94,8 +98,9 @@ def check_adaptive_avg_dygraph_results(self, place):
             result = ada_max_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            result = paddle.nn.functional.common.interpolate(
-                input, mode="area", size=16)
+            result = paddle.nn.functional.common.interpolate(input,
+                                                             mode="area",
+                                                             size=16)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
     def check_adaptive_avg_static_results(self, place):
@@ -104,8 +109,11 @@ def check_adaptive_avg_static_results(self, place):
             result = F.adaptive_avg_pool1d(input, output_size=16)
 
             input_np = np.random.random([2, 3, 32]).astype("float32")
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+            result_np = avg_pool1D_forward_naive(input_np,
+                                                 ksize=[16],
+                                                 strides=[2],
+                                                 paddings=[0],
+                                                 adaptive=True)
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
index 2b104041f9468..2531834b217f8 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
@@ -33,7 +33,9 @@ def adaptive_end_index(index, input_size, output_size):
     return int(np.ceil((index + 1) * input_size / output_size))
 
 
-def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
+def adaptive_pool2d_forward(x,
+                            output_size,
+                            data_format='NCHW',
                             pool_type="avg"):
 
     N = x.shape[0]
@@ -68,16 +70,16 @@ def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
             if data_format == 'NCHW':
                 x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
                 if pool_type == 'avg':
-                    field_size = (
-                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    field_size = ((in_h_end - in_h_start) *
+                                  (in_w_end - in_w_start))
                     out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
                 elif pool_type == 'max':
                     out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
             elif data_format == 'NHWC':
                 x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
                 if pool_type == 'avg':
-                    field_size = (
-                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    field_size = ((in_h_end - in_h_start) *
+                                  (in_w_end - in_w_start))
                     out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
                 elif pool_type == 'max':
                     out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
@@ -85,25 +87,29 @@ def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
 
 
 class TestAdaptiveAvgPool2DAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[3, 3], pool_type="avg")
+        self.res_1_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[3, 3],
+                                                pool_type="avg")
 
-        self.res_2_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=5, pool_type="avg")
+        self.res_2_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="avg")
 
-        self.res_3_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[2, 5], pool_type="avg")
+        self.res_3_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[2, 5],
+                                                pool_type="avg")
 
-        self.res_4_np = adaptive_pool2d_forward(
-            x=self.x_np,
-            output_size=[3, 3],
-            pool_type="avg",
-            data_format="NHWC")
+        self.res_4_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[3, 3],
+                                                pool_type="avg",
+                                                data_format="NHWC")
 
-        self.res_5_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[None, 3], pool_type="avg")
+        self.res_5_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[None, 3],
+                                                pool_type="avg")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
@@ -112,25 +118,26 @@ def test_static_graph(self):
             paddle.enable_static()
             x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
 
-            out_1 = paddle.nn.functional.adaptive_avg_pool2d(
-                x=x, output_size=[3, 3])
+            out_1 = paddle.nn.functional.adaptive_avg_pool2d(x=x,
+                                                             output_size=[3, 3])
 
             out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5)
 
-            out_3 = paddle.nn.functional.adaptive_avg_pool2d(
-                x=x, output_size=[2, 5])
+            out_3 = paddle.nn.functional.adaptive_avg_pool2d(x=x,
+                                                             output_size=[2, 5])
 
-            out_4 = paddle.nn.functional.adaptive_avg_pool2d(
-                x=x, output_size=[3, 3], data_format="NHWC")
+            out_4 = paddle.nn.functional.adaptive_avg_pool2d(x=x,
+                                                             output_size=[3, 3],
+                                                             data_format="NHWC")
 
             out_5 = paddle.nn.functional.adaptive_avg_pool2d(
                 x=x, output_size=[None, 3])
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_4, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+            [res_1, res_2, res_3, res_4,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_4, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
@@ -149,22 +156,24 @@ def test_dynamic_graph(self):
             paddle.disable_static(place=place)
             x = paddle.to_tensor(self.x_np)
 
-            out_1 = paddle.nn.functional.adaptive_avg_pool2d(
-                x=x, output_size=[3, 3])
+            out_1 = paddle.nn.functional.adaptive_avg_pool2d(x=x,
+                                                             output_size=[3, 3])
 
             out_2 = paddle.nn.functional.adaptive_avg_pool2d(x=x, output_size=5)
 
-            out_3 = paddle.nn.functional.adaptive_avg_pool2d(
-                x=x, output_size=[2, 5])
+            out_3 = paddle.nn.functional.adaptive_avg_pool2d(x=x,
+                                                             output_size=[2, 5])
 
-            out_4 = paddle.nn.functional.adaptive_avg_pool2d(
-                x=x, output_size=[3, 3], data_format="NHWC")
+            out_4 = paddle.nn.functional.adaptive_avg_pool2d(x=x,
+                                                             output_size=[3, 3],
+                                                             data_format="NHWC")
 
             out_5 = paddle.nn.functional.adaptive_avg_pool2d(
                 x=x, output_size=[None, 3])
 
-            out_6 = paddle.nn.functional.interpolate(
-                x=x, mode="area", size=[2, 5])
+            out_6 = paddle.nn.functional.interpolate(x=x,
+                                                     mode="area",
+                                                     size=[2, 5])
 
             assert np.allclose(out_1.numpy(), self.res_1_np)
 
@@ -180,25 +189,29 @@ def test_dynamic_graph(self):
 
 
 class TestAdaptiveAvgPool2DClassAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[3, 3], pool_type="avg")
+        self.res_1_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[3, 3],
+                                                pool_type="avg")
 
-        self.res_2_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=5, pool_type="avg")
+        self.res_2_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="avg")
 
-        self.res_3_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[2, 5], pool_type="avg")
+        self.res_3_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[2, 5],
+                                                pool_type="avg")
 
-        self.res_4_np = adaptive_pool2d_forward(
-            x=self.x_np,
-            output_size=[3, 3],
-            pool_type="avg",
-            data_format="NHWC")
+        self.res_4_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[3, 3],
+                                                pool_type="avg",
+                                                data_format="NHWC")
 
-        self.res_5_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[None, 3], pool_type="avg")
+        self.res_5_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[None, 3],
+                                                pool_type="avg")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
@@ -216,8 +229,8 @@ def test_static_graph(self):
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[2, 5])
             out_3 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
-                output_size=[3, 3], data_format="NHWC")
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[3, 3],
+                                                            data_format="NHWC")
             out_4 = adaptive_avg_pool(x=x)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
@@ -225,10 +238,10 @@ def test_static_graph(self):
             out_5 = adaptive_avg_pool(x=x)
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_4, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+            [res_1, res_2, res_3, res_4,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_4, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
@@ -256,8 +269,8 @@ def test_dynamic_graph(self):
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[2, 5])
             out_3 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
-                output_size=[3, 3], data_format="NHWC")
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[3, 3],
+                                                            data_format="NHWC")
             out_4 = adaptive_avg_pool(x=x)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
index deb45da8a0189..98258b3558fad 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
@@ -76,19 +76,19 @@ def adaptive_pool3d_forward(x,
                 w_end = adaptive_end_index(j, W, output_size[2])
 
                 if data_format == 'NCDHW':
-                    x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
-                                 w_end]
+                    x_masked = x[:, :, d_start:d_end, h_start:h_end,
+                                 w_start:w_end]
                     if pool_type == 'avg':
                         field_size = (d_end - d_start) * (h_end - h_start) * (
                             w_end - w_start)
-                        out[:, :, k, i, j] = np.sum(x_masked,
-                                                    axis=(2, 3, 4)) / field_size
+                        out[:, :, k, i,
+                            j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size
                     elif pool_type == 'max':
                         out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
 
                 elif data_format == 'NDHWC':
-                    x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
-                                 w_end, :]
+                    x_masked = x[:, d_start:d_end, h_start:h_end,
+                                 w_start:w_end, :]
                     if pool_type == 'avg':
                         field_size = (d_end - d_start) * (h_end - h_start) * (
                             w_end - w_start)
@@ -100,33 +100,38 @@ def adaptive_pool3d_forward(x,
 
 
 class TestAdaptiveAvgPool3DAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[3, 3, 3], pool_type="avg")
+        self.res_1_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[3, 3, 3],
+                                                pool_type="avg")
 
-        self.res_2_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=5, pool_type="avg")
+        self.res_2_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="avg")
 
-        self.res_3_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[2, 3, 5], pool_type="avg")
+        self.res_3_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[2, 3, 5],
+                                                pool_type="avg")
 
-        self.res_4_np = adaptive_pool3d_forward(
-            x=self.x_np,
-            output_size=[3, 3, 3],
-            pool_type="avg",
-            data_format="NDHWC")
+        self.res_4_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[3, 3, 3],
+                                                pool_type="avg",
+                                                data_format="NDHWC")
 
-        self.res_5_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[None, 3, None], pool_type="avg")
+        self.res_5_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[None, 3, None],
+                                                pool_type="avg")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(
-                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(name="x",
+                                  shape=[2, 3, 5, 7, 7],
+                                  dtype="float32")
 
             out_1 = paddle.nn.functional.adaptive_avg_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -143,10 +148,10 @@ def test_static_graph(self):
                 x=x, output_size=[None, 3, None])
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_4, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+            [res_1, res_2, res_3, res_4,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_4, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
@@ -179,8 +184,9 @@ def test_dynamic_graph(self):
             out_5 = paddle.nn.functional.adaptive_avg_pool3d(
                 x=x, output_size=[None, 3, None])
 
-            out_6 = paddle.nn.functional.interpolate(
-                x=x, mode="area", size=[2, 3, 5])
+            out_6 = paddle.nn.functional.interpolate(x=x,
+                                                     mode="area",
+                                                     size=[2, 3, 5])
 
             assert np.allclose(out_1.numpy(), self.res_1_np)
 
@@ -196,33 +202,38 @@ def test_dynamic_graph(self):
 
 
 class TestAdaptiveAvgPool3DClassAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[3, 3, 3], pool_type="avg")
+        self.res_1_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[3, 3, 3],
+                                                pool_type="avg")
 
-        self.res_2_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=5, pool_type="avg")
+        self.res_2_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="avg")
 
-        self.res_3_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[2, 3, 5], pool_type="avg")
+        self.res_3_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[2, 3, 5],
+                                                pool_type="avg")
 
-        self.res_4_np = adaptive_pool3d_forward(
-            x=self.x_np,
-            output_size=[3, 3, 3],
-            pool_type="avg",
-            data_format="NDHWC")
+        self.res_4_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[3, 3, 3],
+                                                pool_type="avg",
+                                                data_format="NDHWC")
 
-        self.res_5_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[None, 3, None], pool_type="avg")
+        self.res_5_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[None, 3, None],
+                                                pool_type="avg")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(
-                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(name="x",
+                                  shape=[2, 3, 5, 7, 7],
+                                  dtype="float32")
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[3, 3, 3])
@@ -244,10 +255,10 @@ def test_static_graph(self):
             out_5 = adaptive_avg_pool(x=x)
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_4, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_4, out_5])
+            [res_1, res_2, res_3, res_4,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_4, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
index 2a0415722be74..db577ec53787d 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
@@ -45,8 +45,8 @@ def max_pool1D_forward_naive(x,
     if adaptive:
         L_out = ksize[0]
     else:
-        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
 
     out = np.zeros((N, C, L_out))
@@ -64,6 +64,7 @@ def max_pool1D_forward_naive(x,
 
 
 class TestPool1D_API(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -76,8 +77,11 @@ def check_adaptive_max_dygraph_results(self, place):
             input = fluid.dygraph.to_variable(input_np)
             result = F.adaptive_max_pool1d(input, output_size=16)
 
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
+            result_np = max_pool1D_forward_naive(input_np,
+                                                 ksize=[16],
+                                                 strides=[0],
+                                                 paddings=[0],
+                                                 adaptive=True)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
             ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1D(
@@ -91,8 +95,11 @@ def check_adaptive_max_static_results(self, place):
             result = F.adaptive_max_pool1d(input, output_size=16)
 
             input_np = np.random.random([2, 3, 32]).astype("float32")
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[16], strides=[2], paddings=[0], adaptive=True)
+            result_np = max_pool1D_forward_naive(input_np,
+                                                 ksize=[16],
+                                                 strides=[2],
+                                                 paddings=[0],
+                                                 adaptive=True)
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -107,14 +114,14 @@ def test_adaptive_max_pool1d(self):
 
 
 class TestOutDtype(unittest.TestCase):
+
     def test_max_pool(self):
         api_fn = F.adaptive_max_pool1d
         shape = [1, 3, 32]
-        check_out_dtype(
-            api_fn,
-            in_specs=[(shape, )],
-            expect_dtypes=['float32', 'float64'],
-            output_size=16)
+        check_out_dtype(api_fn,
+                        in_specs=[(shape, )],
+                        expect_dtypes=['float32', 'float64'],
+                        output_size=16)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
index 037475e166948..f92b47a8d6d12 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
@@ -34,7 +34,9 @@ def adaptive_end_index(index, input_size, output_size):
     return int(np.ceil((index + 1) * input_size / output_size))
 
 
-def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
+def adaptive_pool2d_forward(x,
+                            output_size,
+                            data_format='NCHW',
                             pool_type="max"):
 
     N = x.shape[0]
@@ -69,16 +71,16 @@ def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
             if data_format == 'NCHW':
                 x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
                 if pool_type == 'avg':
-                    field_size = (
-                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    field_size = ((in_h_end - in_h_start) *
+                                  (in_w_end - in_w_start))
                     out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
                 elif pool_type == 'max':
                     out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
             elif data_format == 'NHWC':
                 x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
                 if pool_type == 'avg':
-                    field_size = (
-                        (in_h_end - in_h_start) * (in_w_end - in_w_start))
+                    field_size = ((in_h_end - in_h_start) *
+                                  (in_w_end - in_w_start))
                     out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
                 elif pool_type == 'max':
                     out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
@@ -86,16 +88,20 @@ def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
 
 
 class TestAdaptiveMaxPool2DAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[3, 3], pool_type="max")
+        self.res_1_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[3, 3],
+                                                pool_type="max")
 
-        self.res_2_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=5, pool_type="max")
+        self.res_2_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="max")
 
-        self.res_3_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[2, 5], pool_type="max")
+        self.res_3_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[2, 5],
+                                                pool_type="max")
         """
         self.res_4_np = adaptive_pool2d_forward(
             x=self.x_np,
@@ -103,8 +109,9 @@ def setUp(self):
             pool_type="max",
             data_format="NHWC")
         """
-        self.res_5_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[None, 3], pool_type="max")
+        self.res_5_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[None, 3],
+                                                pool_type="max")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
@@ -113,13 +120,13 @@ def test_static_graph(self):
             paddle.enable_static()
             x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
 
-            out_1 = paddle.nn.functional.adaptive_max_pool2d(
-                x=x, output_size=[3, 3])
+            out_1 = paddle.nn.functional.adaptive_max_pool2d(x=x,
+                                                             output_size=[3, 3])
 
             out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5)
 
-            out_3 = paddle.nn.functional.adaptive_max_pool2d(
-                x=x, output_size=[2, 5])
+            out_3 = paddle.nn.functional.adaptive_max_pool2d(x=x,
+                                                             output_size=[2, 5])
 
             #out_4 = paddle.nn.functional.adaptive_max_pool2d(
             #    x=x, output_size=[3, 3], data_format="NHWC")
@@ -128,10 +135,10 @@ def test_static_graph(self):
                 x=x, output_size=[None, 3])
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_5])
+            [res_1, res_2, res_3,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
@@ -150,13 +157,14 @@ def test_dynamic_graph(self):
             paddle.disable_static(place=place)
             x = paddle.to_tensor(self.x_np)
 
-            out_1 = paddle.nn.functional.adaptive_max_pool2d(
-                x=x, return_mask=False, output_size=[3, 3])
+            out_1 = paddle.nn.functional.adaptive_max_pool2d(x=x,
+                                                             return_mask=False,
+                                                             output_size=[3, 3])
 
             out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5)
 
-            out_3 = paddle.nn.functional.adaptive_max_pool2d(
-                x=x, output_size=[2, 5])
+            out_3 = paddle.nn.functional.adaptive_max_pool2d(x=x,
+                                                             output_size=[2, 5])
 
             #out_4 = paddle.nn.functional.adaptive_max_pool2d(
             #    x=x, output_size=[3, 3], data_format="NHWC")
@@ -176,16 +184,20 @@ def test_dynamic_graph(self):
 
 
 class TestAdaptiveMaxPool2DClassAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[3, 3], pool_type="max")
+        self.res_1_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[3, 3],
+                                                pool_type="max")
 
-        self.res_2_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=5, pool_type="max")
+        self.res_2_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="max")
 
-        self.res_3_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[2, 5], pool_type="max")
+        self.res_3_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[2, 5],
+                                                pool_type="max")
 
         #self.res_4_np = adaptive_pool2d_forward(
         #    x=self.x_np,
@@ -193,8 +205,9 @@ def setUp(self):
         #    pool_type="max",
         #    data_format="NHWC")
 
-        self.res_5_np = adaptive_pool2d_forward(
-            x=self.x_np, output_size=[None, 3], pool_type="max")
+        self.res_5_np = adaptive_pool2d_forward(x=self.x_np,
+                                                output_size=[None, 3],
+                                                pool_type="max")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
@@ -221,10 +234,10 @@ def test_static_graph(self):
             out_5 = adaptive_max_pool(x=x)
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_5])
+            [res_1, res_2, res_3,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
@@ -272,14 +285,14 @@ def test_dynamic_graph(self):
 
 
 class TestOutDtype(unittest.TestCase):
+
     def test_max_pool(self):
         api_fn = F.adaptive_max_pool2d
         shape = [1, 3, 32, 32]
-        check_out_dtype(
-            api_fn,
-            in_specs=[(shape, )],
-            expect_dtypes=['float32', 'float64'],
-            output_size=16)
+        check_out_dtype(api_fn,
+                        in_specs=[(shape, )],
+                        expect_dtypes=['float32', 'float64'],
+                        output_size=16)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
index 2a8fe51ae7f44..0f4a89c476168 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
@@ -77,19 +77,19 @@ def adaptive_pool3d_forward(x,
                 w_end = adaptive_end_index(j, W, output_size[2])
 
                 if data_format == 'NCDHW':
-                    x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
-                                 w_end]
+                    x_masked = x[:, :, d_start:d_end, h_start:h_end,
+                                 w_start:w_end]
                     if pool_type == 'avg':
                         field_size = (d_end - d_start) * (h_end - h_start) * (
                             w_end - w_start)
-                        out[:, :, k, i, j] = np.sum(x_masked,
-                                                    axis=(2, 3, 4)) / field_size
+                        out[:, :, k, i,
+                            j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size
                     elif pool_type == 'max':
                         out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
 
                 elif data_format == 'NDHWC':
-                    x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
-                                 w_end, :]
+                    x_masked = x[:, d_start:d_end, h_start:h_end,
+                                 w_start:w_end, :]
                     if pool_type == 'avg':
                         field_size = (d_end - d_start) * (h_end - h_start) * (
                             w_end - w_start)
@@ -101,33 +101,38 @@ def adaptive_pool3d_forward(x,
 
 
 class TestAdaptiveMaxPool3DAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[3, 3, 3], pool_type="max")
+        self.res_1_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[3, 3, 3],
+                                                pool_type="max")
 
-        self.res_2_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=5, pool_type="max")
+        self.res_2_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="max")
 
-        self.res_3_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[2, 3, 5], pool_type="max")
+        self.res_3_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[2, 3, 5],
+                                                pool_type="max")
 
-        self.res_4_np = adaptive_pool3d_forward(
-            x=self.x_np,
-            output_size=[3, 3, 3],
-            pool_type="max",
-            data_format="NDHWC")
+        self.res_4_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[3, 3, 3],
+                                                pool_type="max",
+                                                data_format="NDHWC")
 
-        self.res_5_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[None, 3, None], pool_type="max")
+        self.res_5_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[None, 3, None],
+                                                pool_type="max")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(
-                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(name="x",
+                                  shape=[2, 3, 5, 7, 7],
+                                  dtype="float32")
 
             out_1 = paddle.nn.functional.adaptive_max_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -144,10 +149,10 @@ def test_static_graph(self):
                 x=x, output_size=[None, 3, None])
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_5])
+            [res_1, res_2, res_3,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
@@ -192,16 +197,20 @@ def test_dynamic_graph(self):
 
 
 class TestAdaptiveMaxPool3DClassAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
-        self.res_1_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[3, 3, 3], pool_type="max")
+        self.res_1_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[3, 3, 3],
+                                                pool_type="max")
 
-        self.res_2_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=5, pool_type="max")
+        self.res_2_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=5,
+                                                pool_type="max")
 
-        self.res_3_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[2, 3, 5], pool_type="max")
+        self.res_3_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[2, 3, 5],
+                                                pool_type="max")
 
         # self.res_4_np = adaptive_pool3d_forward(
         #     x=self.x_np,
@@ -209,16 +218,18 @@ def setUp(self):
         #     pool_type="max",
         #     data_format="NDHWC")
 
-        self.res_5_np = adaptive_pool3d_forward(
-            x=self.x_np, output_size=[None, 3, None], pool_type="max")
+        self.res_5_np = adaptive_pool3d_forward(x=self.x_np,
+                                                output_size=[None, 3, None],
+                                                pool_type="max")
 
     def test_static_graph(self):
         for use_cuda in ([False, True]
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(
-                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(name="x",
+                                  shape=[2, 3, 5, 7, 7],
+                                  dtype="float32")
 
             adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
                 output_size=[3, 3, 3])
@@ -240,10 +251,10 @@ def test_static_graph(self):
             out_5 = adaptive_max_pool(x=x)
 
             exe = paddle.static.Executor(place=place)
-            [res_1, res_2, res_3, res_5] = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_np},
-                fetch_list=[out_1, out_2, out_3, out_5])
+            [res_1, res_2, res_3,
+             res_5] = exe.run(fluid.default_main_program(),
+                              feed={"x": self.x_np},
+                              fetch_list=[out_1, out_2, out_3, out_5])
 
             assert np.allclose(res_1, self.res_1_np)
 
@@ -293,14 +304,14 @@ def test_dynamic_graph(self):
 
 
 class TestOutDtype(unittest.TestCase):
+
     def test_max_pool(self):
         api_fn = F.adaptive_max_pool3d
         shape = [1, 3, 32, 32, 32]
-        check_out_dtype(
-            api_fn,
-            in_specs=[(shape, )],
-            expect_dtypes=['float32', 'float64'],
-            output_size=16)
+        check_out_dtype(api_fn,
+                        in_specs=[(shape, )],
+                        expect_dtypes=['float32', 'float64'],
+                        output_size=16)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
index 5424a1447b862..14c201d76062a 100644
--- a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
@@ -32,8 +32,9 @@ def add_position_encoding(input, alpha=1.0, beta=1.0):
     for i in range(batch_size):
         for j in range(max_length):
             for k in range(half_shape):
-                val = j / pow(10000.0, k * 1.0 / (
-                    half_shape - 1)) if half_shape > 1 else j / 10000.0
+                val = j / pow(
+                    10000.0, k * 1.0 /
+                    (half_shape - 1)) if half_shape > 1 else j / 10000.0
                 out[i, j, k] = \
                     input[i, j, k] * alpha + math.sin(val) * beta
                 out[i, j, half_shape + k] = \
@@ -54,7 +55,9 @@ def setUp(self):
         self.dtype = np.float64
         self.init_input_output()
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+        }
         self.outputs = {'Out': self.out}
         self.attrs = {'alpha': self.alpha, 'beta': self.beta}
 
@@ -94,7 +97,9 @@ def setUp(self):
         self.dtype = np.float64
         self.init_input_output()
 
-        self.inputs = {'X': (self.x, self.lod), }
+        self.inputs = {
+            'X': (self.x, self.lod),
+        }
         self.outputs = {'Out': (self.out, self.lod)}
         self.attrs = {'alpha': self.alpha, 'beta': self.beta}
 
@@ -129,8 +134,9 @@ def init_input_output(self):
             max_length = self.lod[0][i]
             for j in range(max_length):
                 for k in range(half_shape):
-                    val = j / pow(10000.0, k * 1.0 / (
-                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    val = j / pow(
+                        10000.0, k * 1.0 /
+                        (half_shape - 1)) if half_shape > 1 else j / 10000.0
                     pos = start + j
                     self.out[pos, k] = \
                         self.x[pos, k] * self.alpha + math.sin(val) * self.beta
@@ -140,19 +146,22 @@ def init_input_output(self):
 
 
 class TestAddPositionEncodingOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.random((4, 16, 8)).astype("float32")
 
             def test_Variable():
                 # the input type must be Variable
-                fluid.layers.add_position_encoding(
-                    input=input_data, alpha=1.0, beta=1.0)
+                fluid.layers.add_position_encoding(input=input_data,
+                                                   alpha=1.0,
+                                                   beta=1.0)
 
             self.assertRaises(TypeError, test_Variable)
 
 
 class TestAddPositionEncodingOpDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
         tensor = np.random.randn(16, 32, 64)
diff --git a/python/paddle/fluid/tests/unittests/test_add_reader_dependency.py b/python/paddle/fluid/tests/unittests/test_add_reader_dependency.py
index a1205f7092a01..c7479e059b48f 100644
--- a/python/paddle/fluid/tests/unittests/test_add_reader_dependency.py
+++ b/python/paddle/fluid/tests/unittests/test_add_reader_dependency.py
@@ -21,15 +21,15 @@
 
 def inplace_add(x, bias):
     helper = LayerHelper('scale', **locals())
-    helper.append_op(
-        type='scale',
-        inputs={'X': [x]},
-        outputs={'Out': [x]},
-        attrs={'bias': bias})
+    helper.append_op(type='scale',
+                     inputs={'X': [x]},
+                     outputs={'Out': [x]},
+                     attrs={'bias': bias})
     return x
 
 
 class TestAddReaderDependency(unittest.TestCase):
+
     def setUp(self):
         self.batch_num = 3
         self.sleep_time = 2
@@ -54,11 +54,12 @@ def run_main(self, place):
                 def data_source():
                     for _ in range(self.batch_num):
                         time.sleep(self.sleep_time)  # sleep some times
-                        yield np.random.uniform(
-                            low=-1, high=1, size=[1]).astype('float32'),
+                        yield np.random.uniform(low=-1, high=1,
+                                                size=[1]).astype('float32'),
 
-                persistable_in = fluid.data(
-                    name='persistable_in', dtype='float32', shape=[1])
+                persistable_in = fluid.data(name='persistable_in',
+                                            dtype='float32',
+                                            shape=[1])
                 persistable_in.persistable = True
 
                 persistable_in = inplace_add(persistable_in, bias=1)
@@ -97,6 +98,7 @@ def data_source():
 
 
 class TestAddReaderDependencyWithoutDoubleBuffer(TestAddReaderDependency):
+
     def setUp(self):
         self.batch_num = 3
         self.sleep_time = 2
diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py
index bea7588acd3d0..da2e2335f7f47 100644
--- a/python/paddle/fluid/tests/unittests/test_addmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py
@@ -65,102 +65,91 @@ def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of addmm_op must be Variable.
 
-            input = fluid.create_lod_tensor(
-                np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
-            x2 = fluid.create_lod_tensor(
-                np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
+            input = fluid.create_lod_tensor(np.array([[-1, -1], [-1, -1]]),
+                                            [[2]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1, -1], [-1, -1]]), [[2]],
+                                         fluid.CPUPlace())
+            x2 = fluid.create_lod_tensor(np.array([[-1, -1], [-1, -1]]), [[2]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, paddle.addmm, input, x1, x2)
 
             # The input dtype of mul_op must be float32 or float64.
-            input = fluid.layers.data(
-                name='input',
-                shape=[4, 4],
-                dtype="int32",
-                append_batch_size=False)
-            x3 = fluid.layers.data(
-                name='x3', shape=[4, 4], dtype="int32", append_batch_size=False)
-            x4 = fluid.layers.data(
-                name='x4', shape=[4, 4], dtype="int32", append_batch_size=False)
+            input = fluid.layers.data(name='input',
+                                      shape=[4, 4],
+                                      dtype="int32",
+                                      append_batch_size=False)
+            x3 = fluid.layers.data(name='x3',
+                                   shape=[4, 4],
+                                   dtype="int32",
+                                   append_batch_size=False)
+            x4 = fluid.layers.data(name='x4',
+                                   shape=[4, 4],
+                                   dtype="int32",
+                                   append_batch_size=False)
             self.assertRaises(TypeError, paddle.addmm, input, x3, x4)
             # x and y dimension mismatch
-            x5 = fluid.layers.data(
-                name='x5',
-                shape=[4, 5],
-                dtype="float32",
-                append_batch_size=False)
-            x6 = fluid.layers.data(
-                name='x6',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
+            x5 = fluid.layers.data(name='x5',
+                                   shape=[4, 5],
+                                   dtype="float32",
+                                   append_batch_size=False)
+            x6 = fluid.layers.data(name='x6',
+                                   shape=[4, 4],
+                                   dtype="float32",
+                                   append_batch_size=False)
             self.assertRaises(ValueError, paddle.addmm, input, x5, x6)
             # input and x are not broadcastable
-            x7 = fluid.layers.data(
-                name='x7',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            x8 = fluid.layers.data(
-                name='x8',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            input1 = fluid.layers.data(
-                name='input1',
-                shape=[2, 4],
-                dtype="float32",
-                append_batch_size=False)
+            x7 = fluid.layers.data(name='x7',
+                                   shape=[4, 4],
+                                   dtype="float32",
+                                   append_batch_size=False)
+            x8 = fluid.layers.data(name='x8',
+                                   shape=[4, 4],
+                                   dtype="float32",
+                                   append_batch_size=False)
+            input1 = fluid.layers.data(name='input1',
+                                       shape=[2, 4],
+                                       dtype="float32",
+                                       append_batch_size=False)
             self.assertRaises(ValueError, paddle.addmm, input1, x7, x8)
             # input and x are not broadcastable
-            x9 = fluid.layers.data(
-                name='x9',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            x10 = fluid.layers.data(
-                name='x10',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            input2 = fluid.layers.data(
-                name='input2',
-                shape=[1, 2],
-                dtype="float32",
-                append_batch_size=False)
+            x9 = fluid.layers.data(name='x9',
+                                   shape=[4, 4],
+                                   dtype="float32",
+                                   append_batch_size=False)
+            x10 = fluid.layers.data(name='x10',
+                                    shape=[4, 4],
+                                    dtype="float32",
+                                    append_batch_size=False)
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[1, 2],
+                                       dtype="float32",
+                                       append_batch_size=False)
             self.assertRaises(ValueError, paddle.addmm, input2, x9, x10)
-            x11 = fluid.layers.data(
-                name='x11',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            x12 = fluid.layers.data(
-                name='x12',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            input3 = fluid.layers.data(
-                name='input3',
-                shape=[4, 2],
-                dtype="float32",
-                append_batch_size=False)
+            x11 = fluid.layers.data(name='x11',
+                                    shape=[4, 4],
+                                    dtype="float32",
+                                    append_batch_size=False)
+            x12 = fluid.layers.data(name='x12',
+                                    shape=[4, 4],
+                                    dtype="float32",
+                                    append_batch_size=False)
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[4, 2],
+                                       dtype="float32",
+                                       append_batch_size=False)
             self.assertRaises(ValueError, paddle.addmm, input3, x11, x12)
-            x13 = fluid.layers.data(
-                name='x13',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            x14 = fluid.layers.data(
-                name='x14',
-                shape=[4, 4],
-                dtype="float32",
-                append_batch_size=False)
-            input4 = fluid.layers.data(
-                name='input4',
-                shape=[3, 1],
-                dtype="float32",
-                append_batch_size=False)
+            x13 = fluid.layers.data(name='x13',
+                                    shape=[4, 4],
+                                    dtype="float32",
+                                    append_batch_size=False)
+            x14 = fluid.layers.data(name='x14',
+                                    shape=[4, 4],
+                                    dtype="float32",
+                                    append_batch_size=False)
+            input4 = fluid.layers.data(name='input4',
+                                       shape=[3, 1],
+                                       dtype="float32",
+                                       append_batch_size=False)
             self.assertRaises(ValueError, paddle.addmm, input4, x13, x14)
 
 
@@ -259,6 +248,7 @@ def test_check_grad_input(self):
 
 
 class TestAddMMOp5(unittest.TestCase):
+
     def test_api_with_dygraph(self):
         np_input = np.random.random((20, 30)).astype(np.float32)
         np_x = np.random.random((20, 6)).astype(np.float32)
@@ -273,6 +263,7 @@ def test_api_with_dygraph(self):
 
 
 class TestAddMMAPI(unittest.TestCase):
+
     def test_api_error(self):
         data_x = np.ones((2, 2)).astype(np.float32)
         data_y = np.ones((2, 2)).astype(np.float32)
@@ -285,8 +276,11 @@ def test_error1():
             x = paddle.to_tensor(data_x_wrong)
             y = paddle.to_tensor(data_y)
             input = paddle.to_tensor(data_input)
-            out = paddle.tensor.addmm(
-                input=input, x=x, y=y, beta=0.5, alpha=5.0)
+            out = paddle.tensor.addmm(input=input,
+                                      x=x,
+                                      y=y,
+                                      beta=0.5,
+                                      alpha=5.0)
 
         self.assertRaises(ValueError, test_error1)
 
@@ -295,8 +289,11 @@ def test_error2():
             x = paddle.to_tensor(data_x_wrong)
             y = paddle.to_tensor(data_y)
             input = paddle.to_tensor(data_input)
-            out = paddle.tensor.addmm(
-                input=input, x=x, y=y, beta=0.5, alpha=5.0)
+            out = paddle.tensor.addmm(input=input,
+                                      x=x,
+                                      y=y,
+                                      beta=0.5,
+                                      alpha=5.0)
 
         self.assertRaises(ValueError, test_error2)
 
@@ -305,8 +302,11 @@ def test_error3():
             x = paddle.to_tensor(data_x)
             y = paddle.to_tensor(data_y)
             input = paddle.to_tensor(data_input_wrong)
-            out = paddle.tensor.addmm(
-                input=input, x=x, y=y, beta=0.5, alpha=5.0)
+            out = paddle.tensor.addmm(input=input,
+                                      x=x,
+                                      y=y,
+                                      beta=0.5,
+                                      alpha=5.0)
 
         self.assertRaises(ValueError, test_error3)
 
@@ -315,8 +315,11 @@ def test_error4():
             x = paddle.to_tensor(data_x)
             y = paddle.to_tensor(data_y)
             input = paddle.to_tensor(data_input_wrong)
-            out = paddle.tensor.addmm(
-                input=input, x=x, y=y, beta=0.5, alpha=5.0)
+            out = paddle.tensor.addmm(input=input,
+                                      x=x,
+                                      y=y,
+                                      beta=0.5,
+                                      alpha=5.0)
 
         self.assertRaises(ValueError, test_error4)
 
@@ -334,10 +337,13 @@ def test_api_normal_1(self):
         x = paddle.to_tensor(data_x)
         y = paddle.to_tensor(data_y)
         input = paddle.to_tensor(data_input)
-        paddle_output = paddle.tensor.addmm(
-            input=input, x=x, y=y, beta=data_beta, alpha=data_alpha)
-        numpy_output = data_beta * data_input + data_alpha * np.dot(data_x,
-                                                                    data_y)
+        paddle_output = paddle.tensor.addmm(input=input,
+                                            x=x,
+                                            y=y,
+                                            beta=data_beta,
+                                            alpha=data_alpha)
+        numpy_output = data_beta * data_input + data_alpha * np.dot(
+            data_x, data_y)
 
         self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True)
 
@@ -355,10 +361,13 @@ def test_api_normal_2(self):
         x = paddle.to_tensor(data_x)
         y = paddle.to_tensor(data_y)
         input = paddle.to_tensor(data_input)
-        paddle_output = paddle.tensor.addmm(
-            input=input, x=x, y=y, beta=data_beta, alpha=data_alpha)
-        numpy_output = data_beta * data_input + data_alpha * np.dot(data_x,
-                                                                    data_y)
+        paddle_output = paddle.tensor.addmm(input=input,
+                                            x=x,
+                                            y=y,
+                                            beta=data_beta,
+                                            alpha=data_alpha)
+        numpy_output = data_beta * data_input + data_alpha * np.dot(
+            data_x, data_y)
 
         self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True)
 
@@ -376,10 +385,13 @@ def test_api_normal_3(self):
         x = paddle.to_tensor(data_x)
         y = paddle.to_tensor(data_y)
         input = paddle.to_tensor(data_input)
-        paddle_output = paddle.tensor.addmm(
-            input=input, x=x, y=y, beta=data_beta, alpha=data_alpha)
-        numpy_output = data_beta * data_input + data_alpha * np.dot(data_x,
-                                                                    data_y)
+        paddle_output = paddle.tensor.addmm(input=input,
+                                            x=x,
+                                            y=y,
+                                            beta=data_beta,
+                                            alpha=data_alpha)
+        numpy_output = data_beta * data_input + data_alpha * np.dot(
+            data_x, data_y)
 
         self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
index 6157314b1f060..e22f53a2393a5 100644
--- a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
@@ -36,6 +36,7 @@ def affine_channel(x, scale, bias, layout):
 
 
 class TestAffineChannelOp(OpTest):
+
     def setUp(self):
         self.op_type = "affine_channel"
         self.init_test_case()
@@ -69,6 +70,7 @@ def init_test_case(self):
 
 
 class TestAffineChannelOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program()):
 
@@ -79,28 +81,32 @@ def test_x_type():
             self.assertRaises(TypeError, test_x_type)
 
             def test_x_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[None, 1, 2, 2], dtype='int32')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[None, 1, 2, 2],
+                                       dtype='int32')
                 fluid.layers.affine_channel(x2)
 
             self.assertRaises(TypeError, test_x_dtype)
 
             def test_scale_type():
-                x3 = fluid.layers.data(
-                    name='x3', shape=[None, 1, 2, 2], dtype='float32')
+                x3 = fluid.layers.data(name='x3',
+                                       shape=[None, 1, 2, 2],
+                                       dtype='float32')
                 fluid.layers.affine_channel(x3, scale=1)
 
             self.assertRaises(TypeError, test_scale_type)
 
             def test_bias_type():
-                x4 = fluid.layers.data(
-                    name='x4', shape=[None, 1, 2, 2], dtype='float32')
+                x4 = fluid.layers.data(name='x4',
+                                       shape=[None, 1, 2, 2],
+                                       dtype='float32')
                 fluid.layers.affine_channel(x4, bias=1)
 
             self.assertRaises(TypeError, test_bias_type)
 
 
 class TestAffineChannelNHWC(TestAffineChannelOp):
+
     def init_test_case(self):
         self.shape = [2, 3, 3, 100]
         self.C = 100
@@ -114,6 +120,7 @@ def test_check_grad_stopgrad_dscale_dbias(self):
 
 
 class TestAffineChannel2D(TestAffineChannelOp):
+
     def init_test_case(self):
         self.shape = [2, 100]
         self.C = 100
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_function.py b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py
index 6ca13c7a7290e..61ecc6a8f1212 100644
--- a/python/paddle/fluid/tests/unittests/test_affine_grid_function.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py
@@ -22,6 +22,7 @@
 
 
 class AffineGridTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  theta_shape=(20, 2, 3),
@@ -48,8 +49,9 @@ def fluid_layer(self, place):
         start = fluid.Program()
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
-                theta_var = fluid.data(
-                    "input", self.theta_shape, dtype=self.dtype)
+                theta_var = fluid.data("input",
+                                       self.theta_shape,
+                                       dtype=self.dtype)
                 y_var = fluid.layers.affine_grid(theta_var, self.output_shape)
         feed_dict = {"input": self.theta}
         exe = fluid.Executor(place)
@@ -63,12 +65,12 @@ def functional(self, place):
         start = fluid.Program()
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
-                theta_var = fluid.data(
-                    "input", self.theta_shape, dtype=self.dtype)
-                y_var = F.affine_grid(
-                    theta_var,
-                    self.output_shape,
-                    align_corners=self.align_corners)
+                theta_var = fluid.data("input",
+                                       self.theta_shape,
+                                       dtype=self.dtype)
+                y_var = F.affine_grid(theta_var,
+                                      self.output_shape,
+                                      align_corners=self.align_corners)
         feed_dict = {"input": self.theta}
         exe = fluid.Executor(place)
         exe.run(start)
@@ -80,10 +82,11 @@ def paddle_dygraph_layer(self):
         theta_var = dg.to_variable(
             self.theta) if not self.invalid_theta else "invalid"
         output_shape = dg.to_variable(
-            self.
-            output_shape) if self.variable_output_shape else self.output_shape
-        y_var = F.affine_grid(
-            theta_var, output_shape, align_corners=self.align_corners)
+            self.output_shape
+        ) if self.variable_output_shape else self.output_shape
+        y_var = F.affine_grid(theta_var,
+                              output_shape,
+                              align_corners=self.align_corners)
         y_np = y_var.numpy()
         return y_np
 
@@ -106,6 +109,7 @@ def runTest(self):
 
 
 class AffineGridErrorTestCase(AffineGridTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -119,21 +123,18 @@ def add_cases(suite):
 
     suite.addTest(AffineGridTestCase(methodName='runTest', align_corners=False))
     suite.addTest(
-        AffineGridTestCase(
-            methodName='runTest', variable_output_shape=True))
+        AffineGridTestCase(methodName='runTest', variable_output_shape=True))
 
     suite.addTest(
-        AffineGridTestCase(
-            methodName='runTest',
-            theta_shape=(20, 2, 3),
-            output_shape=[20, 1, 7, 7],
-            align_corners=True))
+        AffineGridTestCase(methodName='runTest',
+                           theta_shape=(20, 2, 3),
+                           output_shape=[20, 1, 7, 7],
+                           align_corners=True))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        AffineGridErrorTestCase(
-            methodName='runTest', output_shape="not_valid"))
+        AffineGridErrorTestCase(methodName='runTest', output_shape="not_valid"))
     suite.addTest(
         AffineGridErrorTestCase(
             methodName='runTest',
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
index 8277256009e72..9c5b2e9971e70 100644
--- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
@@ -26,14 +26,12 @@ def AffineGrid(theta, size, align_corners):
     if not align_corners:
         h_factor = (h - 1) / float(h)
         w_factor = (w - 1) / float(w)
-    h_idx = np.repeat(
-        np.linspace(-1, 1, h)[np.newaxis, :], w,
-        axis=0).T[:, :, np.newaxis] * h_factor
-    w_idx = np.repeat(
-        np.linspace(-1, 1, w)[np.newaxis, :], h,
-        axis=0)[:, :, np.newaxis] * w_factor
-    grid = np.concatenate(
-        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
+    h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w,
+                      axis=0).T[:, :, np.newaxis] * h_factor
+    w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h,
+                      axis=0)[:, :, np.newaxis] * w_factor
+    grid = np.concatenate([w_idx, h_idx, np.ones([h, w, 1])],
+                          axis=2)  # h * w * 3
     grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
 
     ret = np.zeros([n, h * w, 2])
@@ -41,11 +39,13 @@ def AffineGrid(theta, size, align_corners):
     for i in range(len(theta)):
         ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
 
-#    print ret.reshape([h * w, 2]).astype("float32")    
+
+#    print ret.reshape([h * w, 2]).astype("float32")
     return ret.reshape([n, h, w, 2]).astype("float32")
 
 
 class TestAffineGridOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = "affine_grid"
@@ -78,6 +78,7 @@ def initTestCase(self):
 
 
 class TestAffineGridOpCase1(TestAffineGridOp):
+
     def initTestCase(self):
         self.theta_shape = (20, 2, 3)
         self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
@@ -89,6 +90,7 @@ def initTestCase(self):
 
 
 class TestAffineGridOpCase2(TestAffineGridOp):
+
     def initTestCase(self):
         self.theta_shape = (20, 2, 3)
         self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
@@ -98,6 +100,7 @@ def initTestCase(self):
 
 
 class TestAffineGridOpCase3(TestAffineGridOp):
+
     def initTestCase(self):
         self.theta_shape = (20, 2, 3)
         self.output_shape = np.array([20, 2, 5, 7]).astype("int32")
@@ -107,6 +110,7 @@ def initTestCase(self):
 
 
 class TestAffineGridOpCase4(TestAffineGridOp):
+
     def initTestCase(self):
         self.theta_shape = (25, 2, 3)
         self.output_shape = np.array([25, 2, 5, 6]).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/test_allclose_layer.py b/python/paddle/fluid/tests/unittests/test_allclose_layer.py
index 1e080c80367f0..66afbcfe20947 100644
--- a/python/paddle/fluid/tests/unittests/test_allclose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_allclose_layer.py
@@ -20,16 +20,28 @@
 
 
 class TestAllcloseLayer(unittest.TestCase):
+
     def allclose_check(self, use_cuda, dtype='float32'):
         a = fluid.data(name="a", shape=[2], dtype=dtype)
         b = fluid.data(name="b", shape=[2], dtype=dtype)
 
-        result = paddle.allclose(
-            a, b, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan")
-        result_nan = paddle.allclose(
-            a, b, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan")
-        result_corner = paddle.allclose(
-            a, b, rtol=0.01, atol=0.0, name="corner_case")
+        result = paddle.allclose(a,
+                                 b,
+                                 rtol=1e-05,
+                                 atol=1e-08,
+                                 equal_nan=False,
+                                 name="ignore_nan")
+        result_nan = paddle.allclose(a,
+                                     b,
+                                     rtol=1e-05,
+                                     atol=1e-08,
+                                     equal_nan=True,
+                                     name="equal_nan")
+        result_corner = paddle.allclose(a,
+                                        b,
+                                        rtol=0.01,
+                                        atol=0.0,
+                                        name="corner_case")
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -37,24 +49,30 @@ def allclose_check(self, use_cuda, dtype='float32'):
 
         x = np.array([10000., 1e-07]).astype(dtype)
         y = np.array([10000.1, 1e-08]).astype(dtype)
-        result_v, result_nan_v = exe.run(feed={'a': x,
-                                               'b': y},
+        result_v, result_nan_v = exe.run(feed={
+            'a': x,
+            'b': y
+        },
                                          fetch_list=[result, result_nan])
         self.assertEqual(result_v[0], False)
         self.assertEqual(result_nan_v[0], False)
 
         x = np.array([10000., 1e-08]).astype(dtype)
         y = np.array([10000.1, 1e-09]).astype(dtype)
-        result_v, result_nan_v = exe.run(feed={'a': x,
-                                               'b': y},
+        result_v, result_nan_v = exe.run(feed={
+            'a': x,
+            'b': y
+        },
                                          fetch_list=[result, result_nan])
         self.assertEqual(result_v[0], True)
         self.assertEqual(result_nan_v[0], True)
 
         x = np.array([1.0, float('nan')]).astype(dtype)
         y = np.array([1.0, float('nan')]).astype(dtype)
-        result_v, result_nan_v = exe.run(feed={'a': x,
-                                               'b': y},
+        result_v, result_nan_v = exe.run(feed={
+            'a': x,
+            'b': y
+        },
                                          fetch_list=[result, result_nan])
         self.assertEqual(result_v[0], False)
         self.assertEqual(result_nan_v[0], True)
@@ -111,68 +129,68 @@ def func_dygraph_mode(self):
         with fluid.dygraph.guard():
             x_v_1 = paddle.to_tensor(x_1)
             y_v_1 = paddle.to_tensor(y_1)
-            ret_1 = paddle.allclose(
-                x_v_1,
-                y_v_1,
-                rtol=1e-05,
-                atol=1e-08,
-                equal_nan=False,
-                name='test_1')
+            ret_1 = paddle.allclose(x_v_1,
+                                    y_v_1,
+                                    rtol=1e-05,
+                                    atol=1e-08,
+                                    equal_nan=False,
+                                    name='test_1')
             self.assertEqual(ret_1.numpy()[0], False)
-            ret_1 = paddle.allclose(
-                x_v_1,
-                y_v_1,
-                rtol=1e-05,
-                atol=1e-08,
-                equal_nan=True,
-                name='test_2')
+            ret_1 = paddle.allclose(x_v_1,
+                                    y_v_1,
+                                    rtol=1e-05,
+                                    atol=1e-08,
+                                    equal_nan=True,
+                                    name='test_2')
             self.assertEqual(ret_1.numpy()[0], False)
             x_v_2 = paddle.to_tensor(x_2)
             y_v_2 = paddle.to_tensor(y_2)
-            ret_2 = paddle.allclose(
-                x_v_2,
-                y_v_2,
-                rtol=1e-05,
-                atol=1e-08,
-                equal_nan=False,
-                name='test_3')
+            ret_2 = paddle.allclose(x_v_2,
+                                    y_v_2,
+                                    rtol=1e-05,
+                                    atol=1e-08,
+                                    equal_nan=False,
+                                    name='test_3')
             self.assertEqual(ret_2.numpy()[0], True)
-            ret_2 = paddle.allclose(
-                x_v_2,
-                y_v_2,
-                rtol=1e-05,
-                atol=1e-08,
-                equal_nan=True,
-                name='test_4')
+            ret_2 = paddle.allclose(x_v_2,
+                                    y_v_2,
+                                    rtol=1e-05,
+                                    atol=1e-08,
+                                    equal_nan=True,
+                                    name='test_4')
             self.assertEqual(ret_2.numpy()[0], True)
             x_v_3 = paddle.to_tensor(x_3)
             y_v_3 = paddle.to_tensor(y_3)
-            ret_3 = paddle.allclose(
-                x_v_3,
-                y_v_3,
-                rtol=1e-05,
-                atol=1e-08,
-                equal_nan=False,
-                name='test_5')
+            ret_3 = paddle.allclose(x_v_3,
+                                    y_v_3,
+                                    rtol=1e-05,
+                                    atol=1e-08,
+                                    equal_nan=False,
+                                    name='test_5')
             self.assertEqual(ret_3.numpy()[0], False)
-            ret_3 = paddle.allclose(
-                x_v_3,
-                y_v_3,
-                rtol=1e-05,
-                atol=1e-08,
-                equal_nan=True,
-                name='test_6')
+            ret_3 = paddle.allclose(x_v_3,
+                                    y_v_3,
+                                    rtol=1e-05,
+                                    atol=1e-08,
+                                    equal_nan=True,
+                                    name='test_6')
             self.assertEqual(ret_3.numpy()[0], True)
             # for corner case
             x_v_4 = paddle.to_tensor(x_4)
             y_v_4 = paddle.to_tensor(y_4)
-            ret_4 = paddle.allclose(
-                x_v_4, y_v_4, rtol=0.01, atol=0.0, name='test_7')
+            ret_4 = paddle.allclose(x_v_4,
+                                    y_v_4,
+                                    rtol=0.01,
+                                    atol=0.0,
+                                    name='test_7')
             self.assertEqual(ret_4.numpy()[0], False)
             x_v_5 = paddle.to_tensor(x_5)
             y_v_5 = paddle.to_tensor(y_5)
-            ret_5 = paddle.allclose(
-                x_v_5, y_v_5, rtol=0.015, atol=0.0, name='test_8')
+            ret_5 = paddle.allclose(x_v_5,
+                                    y_v_5,
+                                    rtol=0.015,
+                                    atol=0.0,
+                                    name='test_8')
             self.assertEqual(ret_5.numpy()[0], True)
 
     def test_dygraph_mode(self):
diff --git a/python/paddle/fluid/tests/unittests/test_allclose_op.py b/python/paddle/fluid/tests/unittests/test_allclose_op.py
index ec1c5363fcde1..26351abe802dc 100644
--- a/python/paddle/fluid/tests/unittests/test_allclose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_allclose_op.py
@@ -19,6 +19,7 @@
 
 
 class TestAllcloseOp(OpTest):
+
     def set_args(self):
         self.input = np.array([10000., 1e-07]).astype("float32")
         self.other = np.array([10000.1, 1e-08]).astype("float32")
@@ -38,13 +39,13 @@ def setUp(self):
         }
         self.attrs = {'equal_nan': self.equal_nan}
         self.outputs = {
-            'Out': np.array([
-                np.allclose(
-                    self.inputs['Input'],
-                    self.inputs['Other'],
-                    rtol=self.rtol,
-                    atol=self.atol,
-                    equal_nan=self.equal_nan)
+            'Out':
+            np.array([
+                np.allclose(self.inputs['Input'],
+                            self.inputs['Other'],
+                            rtol=self.rtol,
+                            atol=self.atol,
+                            equal_nan=self.equal_nan)
             ])
         }
 
@@ -53,7 +54,9 @@ def test_check_output(self):
 
 
 class TestAllcloseOpException(TestAllcloseOp):
+
     def test_check_output(self):
+
         def test_rtol_num():
             self.inputs['Rtol'] = np.array([1e-05, 1e-05]).astype("float64")
             self.inputs['Atol'] = np.array([1e-08]).astype("float64")
@@ -84,6 +87,7 @@ def test_atol_type():
 
 
 class TestAllcloseOpSmallNum(TestAllcloseOp):
+
     def set_args(self):
         self.input = np.array([10000., 1e-08]).astype("float32")
         self.other = np.array([10000.1, 1e-09]).astype("float32")
@@ -93,6 +97,7 @@ def set_args(self):
 
 
 class TestAllcloseOpNanFalse(TestAllcloseOp):
+
     def set_args(self):
         self.input = np.array([1.0, float('nan')]).astype("float32")
         self.other = np.array([1.0, float('nan')]).astype("float32")
@@ -102,6 +107,7 @@ def set_args(self):
 
 
 class TestAllcloseOpNanTrue(TestAllcloseOp):
+
     def set_args(self):
         self.input = np.array([1.0, float('nan')]).astype("float32")
         self.other = np.array([1.0, float('nan')]).astype("float32")
@@ -111,6 +117,7 @@ def set_args(self):
 
 
 class TestAllcloseDygraph(unittest.TestCase):
+
     def test_api_case(self):
         paddle.disable_static()
         x_data = np.random.rand(10, 10)
@@ -124,7 +131,9 @@ def test_api_case(self):
 
 
 class TestAllcloseError(unittest.TestCase):
+
     def test_input_dtype(self):
+
         def test_x_dtype():
             with paddle.static.program_guard(paddle.static.Program(),
                                              paddle.static.Program()):
@@ -164,6 +173,7 @@ def test_equal_nan():
 
 
 class TestAllcloseOpFloat32(TestAllcloseOp):
+
     def set_args(self):
         self.input = np.array([10.1]).astype("float32")
         self.other = np.array([10]).astype("float32")
@@ -173,6 +183,7 @@ def set_args(self):
 
 
 class TestAllcloseOpFloat64(TestAllcloseOp):
+
     def set_args(self):
         self.input = np.array([10.1]).astype("float64")
         self.other = np.array([10]).astype("float64")
@@ -182,6 +193,7 @@ def set_args(self):
 
 
 class TestAllcloseOpLargeDimInput(TestAllcloseOp):
+
     def set_args(self):
         self.input = np.array(np.zeros([2048, 1024])).astype("float64")
         self.other = np.array(np.zeros([2048, 1024])).astype("float64")
diff --git a/python/paddle/fluid/tests/unittests/test_allgather.py b/python/paddle/fluid/tests/unittests/test_allgather.py
index 9bb34d3db4388..ed7e531ffad47 100644
--- a/python/paddle/fluid/tests/unittests/test_allgather.py
+++ b/python/paddle/fluid/tests/unittests/test_allgather.py
@@ -23,6 +23,7 @@
 
 
 class TestAllGatherOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_allreduce.py b/python/paddle/fluid/tests/unittests/test_allreduce.py
index 660f559535cd8..d3e783b9fe3ef 100644
--- a/python/paddle/fluid/tests/unittests/test_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_allreduce.py
@@ -23,6 +23,7 @@
 
 
 class TestAllReduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py
index fbacaa3d5ce10..d5ea02e6570bf 100644
--- a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py
@@ -19,6 +19,7 @@
 
 
 class TestCheckFiniteAndUnscaleOp(OpTest):
+
     def setUp(self):
         self.op_type = "check_finite_and_unscale"
         self.init_dtype()
@@ -39,6 +40,7 @@ def test_check_output(self):
 
 
 class TestCheckFiniteAndUnscaleOpWithNan(OpTest):
+
     def setUp(self):
         self.op_type = "check_finite_and_unscale"
         self.init_dtype()
@@ -56,12 +58,13 @@ def init_dtype(self):
         self.dtype = np.float32
 
     def test_check_output(self):
-        # When input contains nan, do not check the output, 
+        # When input contains nan, do not check the output,
         # since the output may be nondeterministic and will be discarded.
         self.check_output(no_check_set=['Out'])
 
 
 class TestCheckFiniteAndUnscaleOpWithInf(OpTest):
+
     def setUp(self):
         self.op_type = "check_finite_and_unscale"
         self.init_dtype()
@@ -79,7 +82,7 @@ def init_dtype(self):
         self.dtype = np.float32
 
     def test_check_output(self):
-        # When input contains inf, do not check the output, 
+        # When input contains inf, do not check the output,
         # since the output may be nondeterministic and will be discarded.
         self.check_output(no_check_set=['Out'])
 
diff --git a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
index d31eaa0114c3b..4cc77beef8c43 100644
--- a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
+++ b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
@@ -46,10 +46,11 @@ def anchor_generator_in_python(input_feat, anchor_sizes, aspect_ratios,
                     scale_h = anchor_size / stride[1]
                     w = scale_w * base_w
                     h = scale_h * base_h
-                    out_anchors[h_idx, w_idx, idx, :] = [
-                        (x_ctr - 0.5 * (w - 1)), (y_ctr - 0.5 * (h - 1)),
-                        (x_ctr + 0.5 * (w - 1)), (y_ctr + 0.5 * (h - 1))
-                    ]
+                    out_anchors[h_idx, w_idx,
+                                idx, :] = [(x_ctr - 0.5 * (w - 1)),
+                                           (y_ctr - 0.5 * (h - 1)),
+                                           (x_ctr + 0.5 * (w - 1)),
+                                           (y_ctr + 0.5 * (h - 1))]
                     idx += 1
 
     # set the variance.
@@ -60,6 +61,7 @@ def anchor_generator_in_python(input_feat, anchor_sizes, aspect_ratios,
 
 
 class TestAnchorGeneratorOp(OpTest):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
diff --git a/python/paddle/fluid/tests/unittests/test_angle_op.py b/python/paddle/fluid/tests/unittests/test_angle_op.py
index 05397c2434d8c..d21eb61b77dd9 100644
--- a/python/paddle/fluid/tests/unittests/test_angle_op.py
+++ b/python/paddle/fluid/tests/unittests/test_angle_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 import paddle
 from paddle.fluid import dygraph
 from paddle import static
+
 paddle.enable_static()
 
 
@@ -39,6 +40,7 @@ def angle_grad_element(xi, douti):
 
 
 class TestAngleOpFloat(OpTest):
+
     def setUp(self):
         self.op_type = "angle"
         self.dtype = "float64"
@@ -51,15 +53,16 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[
-                angle_grad(self.x, np.ones_like(self.x) / self.x.size)
-            ])
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[
+                            angle_grad(self.x,
+                                       np.ones_like(self.x) / self.x.size)
+                        ])
 
 
 class TestAngleOpComplex(OpTest):
+
     def setUp(self):
         self.op_type = "angle"
         self.dtype = "complex128"
@@ -74,15 +77,16 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[
-                angle_grad(self.x, np.ones_like(self.x) / self.x.size)
-            ])
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[
+                            angle_grad(self.x,
+                                       np.ones_like(self.x) / self.x.size)
+                        ])
 
 
 class TestAngleAPI(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randn(2, 3) + 1j * np.random.randn(2, 3)
         self.out = np.angle(self.x)
diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py
index 85fe8b76e0298..54a83d2a5ec75 100644
--- a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py
+++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,8 +26,9 @@ def get_resnet50_model():
     main = paddle.static.Program()
     startup = paddle.static.Program()
     with paddle.static.program_guard(main, startup):
-        image = paddle.static.data(
-            name="image", shape=[None, 3, 224, 224], dtype="float32")
+        image = paddle.static.data(name="image",
+                                   shape=[None, 3, 224, 224],
+                                   dtype="float32")
         label = paddle.static.data(name="label", shape=[None, 1], dtype="int64")
         model = resnet50()
         loss_fn = CrossEntropyLoss()
@@ -47,6 +48,7 @@ def global_block_contains_op(program, op_type):
 
 
 class TestApplyPassToProgram(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -72,6 +74,7 @@ def test_case(self):
 
 
 class TestIRPassBase(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         if paddle.is_compiled_with_cuda():
@@ -183,12 +186,13 @@ def test_main(self):
 
         for idx in range(batch_num):
             feed = {
-                image.name: np.random.rand(*image_shape).astype('float32'),
-                label.name: np.random.randint(
-                    low=0,
-                    high=self.num_classes,
-                    size=label_shape,
-                    dtype='int64'),
+                image.name:
+                np.random.rand(*image_shape).astype('float32'),
+                label.name:
+                np.random.randint(low=0,
+                                  high=self.num_classes,
+                                  size=label_shape,
+                                  dtype='int64'),
             }
             with paddle.static.scope_guard(scope1):
                 loss_value1 = self.executor.run(main1,
diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py
index d62c08b072b10..b6236033f8bab 100644
--- a/python/paddle/fluid/tests/unittests/test_arange.py
+++ b/python/paddle/fluid/tests/unittests/test_arange.py
@@ -23,6 +23,7 @@
 
 
 class TestArangeOp(OpTest):
+
     def setUp(self):
         self.op_type = "range"
         self.init_config()
@@ -33,8 +34,9 @@ def setUp(self):
         }
 
         self.outputs = {
-            'Out': np.arange(self.case[0], self.case[1],
-                             self.case[2]).astype(self.dtype)
+            'Out':
+            np.arange(self.case[0], self.case[1],
+                      self.case[2]).astype(self.dtype)
         }
 
     def init_config(self):
@@ -46,42 +48,48 @@ def test_check_output(self):
 
 
 class TestFloatArangeOp(TestArangeOp):
+
     def init_config(self):
         self.dtype = np.float32
         self.case = (0, 5, 1)
 
 
 class TestInt32ArangeOp(TestArangeOp):
+
     def init_config(self):
         self.dtype = np.int32
         self.case = (0, 5, 2)
 
 
 class TestFloat64ArangeOp(TestArangeOp):
+
     def init_config(self):
         self.dtype = np.float64
         self.case = (10, 1, -2)
 
 
 class TestInt64ArangeOp(TestArangeOp):
+
     def init_config(self):
         self.dtype = np.int64
         self.case = (-1, -10, -2)
 
 
 class TestArangeOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             self.assertRaises(TypeError, paddle.arange, 10, dtype='int8')
 
 
 class TestArangeAPI(unittest.TestCase):
+
     def test_out(self):
         with program_guard(Program(), Program()):
             x1 = paddle.arange(0, 5, 1, 'float32')
 
-            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             out = exe.run(fetch_list=[x1])
 
@@ -90,9 +98,10 @@ def test_out(self):
 
 
 class TestArangeImperative(unittest.TestCase):
+
     def test_out(self):
-        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         paddle.disable_static(place)
         x1 = paddle.arange(0, 5, 1)
         x2 = paddle.tensor.arange(5)
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
index cbcb4af926951..6056f8f210631 100644
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
@@ -24,6 +24,7 @@
 
 
 class BaseTestCase(OpTest):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4, 5)
@@ -45,6 +46,7 @@ def test_check_output(self):
 
 
 class TestCase0(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -53,6 +55,7 @@ def initTestCase(self):
 
 
 class TestCase1(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -61,6 +64,7 @@ def initTestCase(self):
 
 
 class TestCase2(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -71,6 +75,7 @@ def initTestCase(self):
 @unittest.skipIf(not paddle.is_compiled_with_cuda(),
                  "FP16 test runs only on GPU")
 class TestCase0FP16(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4, 5)
@@ -81,6 +86,7 @@ def initTestCase(self):
 @unittest.skipIf(not paddle.is_compiled_with_cuda(),
                  "FP16 test runs only on GPU")
 class TestCase1FP16(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (3, 4)
@@ -89,6 +95,7 @@ def initTestCase(self):
 
 
 class TestCase2_1(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, 4)
@@ -97,6 +104,7 @@ def initTestCase(self):
 
 
 class TestCase3(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, )
@@ -105,6 +113,7 @@ def initTestCase(self):
 
 
 class TestCase4(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (1, )
@@ -113,6 +122,7 @@ def initTestCase(self):
 
 
 class TestCase3_(BaseTestCase):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (3, )
@@ -120,6 +130,7 @@ def initTestCase(self):
 
 
 class BaseTestComplex1_1(OpTest):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (4, 5, 6)
@@ -134,17 +145,16 @@ def setUp(self):
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         if self.op_type == "arg_min":
             self.outputs = {
-                'Out': np.argmin(
-                    self.x, axis=self.axis).asdtype("int32")
+                'Out': np.argmin(self.x, axis=self.axis).asdtype("int32")
             }
         else:
             self.outputs = {
-                'Out': np.argmax(
-                    self.x, axis=self.axis).asdtype("int32")
+                'Out': np.argmax(self.x, axis=self.axis).asdtype("int32")
             }
 
 
 class BaseTestComplex1_2(OpTest):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (4, 5, 6)
@@ -159,17 +169,16 @@ def setUp(self):
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         if self.op_type == "arg_min":
             self.outputs = {
-                'Out': np.argmin(
-                    self.x, axis=self.axis).asdtype("int32")
+                'Out': np.argmin(self.x, axis=self.axis).asdtype("int32")
             }
         else:
             self.outputs = {
-                'Out': np.argmax(
-                    self.x, axis=self.axis).asdtype("int32")
+                'Out': np.argmax(self.x, axis=self.axis).asdtype("int32")
             }
 
 
 class BaseTestComplex2_1(OpTest):
+
     def initTestCase(self):
         self.op_type = 'arg_max'
         self.dims = (4, 5, 6)
@@ -185,17 +194,20 @@ def setUp(self):
         self.attrs = {'keep_dims': True}
         if self.op_type == "arg_min":
             self.outputs = {
-                'Out': np.argmin(
-                    self.x, axis=self.axis).asdtype("int32").reshape(4, 5, 1)
+                'Out':
+                np.argmin(self.x,
+                          axis=self.axis).asdtype("int32").reshape(4, 5, 1)
             }
         else:
             self.outputs = {
-                'Out': np.argmax(
-                    self.x, axis=self.axis).asdtype("int32").reshape(4, 5, 1)
+                'Out':
+                np.argmax(self.x,
+                          axis=self.axis).asdtype("int32").reshape(4, 5, 1)
             }
 
 
 class BaseTestComplex2_2(OpTest):
+
     def initTestCase(self):
         self.op_type = 'arg_min'
         self.dims = (4, 5, 6)
@@ -211,13 +223,15 @@ def setUp(self):
         self.attrs = {'keep_dims': True}
         if self.op_type == "arg_min":
             self.outputs = {
-                'Out': np.argmin(
-                    self.x, axis=self.axis).asdtype("int32").reshape(4, 5, 1)
+                'Out':
+                np.argmin(self.x,
+                          axis=self.axis).asdtype("int32").reshape(4, 5, 1)
             }
         else:
             self.outputs = {
-                'Out': np.argmax(
-                    self.x, axis=self.axis).asdtype("int32").reshape(4, 5, 1)
+                'Out':
+                np.argmax(self.x,
+                          axis=self.axis).asdtype("int32").reshape(4, 5, 1)
             }
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
index 74f76030a29d2..83d49acf88f6f 100644
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py
@@ -24,7 +24,9 @@
 
 
 def create_kernel_case(op_type, numpy_op_type):
+
     class ArgMinMaxKernelBaseCase(OpTest):
+
         def initTestCase(self):
             self.op_type = op_type
             self.numpy_op_type = numpy_op_type
@@ -46,30 +48,35 @@ def test_check_output(self):
             self.check_output()
 
     class ArgMinMaxKernelCase0(ArgMinMaxKernelBaseCase):
+
         def initTestCase(self):
             self.op_type = op_type
             self.numpy_op_type = numpy_op_type
             self.axis = 1
 
     class ArgMinMaxKernelCase1(ArgMinMaxKernelBaseCase):
+
         def initTestCase(self):
             self.op_type = op_type
             self.numpy_op_type = numpy_op_type
             self.axis = 2
 
     class ArgMinMaxKernelCase2(ArgMinMaxKernelBaseCase):
+
         def initTestCase(self):
             self.op_type = op_type
             self.numpy_op_type = numpy_op_type
             self.axis = -1
 
     class ArgMinMaxKernelCase3(ArgMinMaxKernelBaseCase):
+
         def initTestCase(self):
             self.op_type = op_type
             self.numpy_op_type = numpy_op_type
             self.axis = -2
 
     class ArgMinMaxKernelCase4(ArgMinMaxKernelBaseCase):
+
         def setUp(self):
             self.initTestCase()
             self.dims = (4, 5, 6)
@@ -79,11 +86,11 @@ def setUp(self):
             self.attrs = {"axis": self.axis, "keepdims": True}
             self.numpy_op = eval("np.%s" % (numpy_op_type))
             self.outputs = {
-                'Out': self.numpy_op(
-                    self.x, axis=self.axis).reshape((1, 5, 6))
+                'Out': self.numpy_op(self.x, axis=self.axis).reshape((1, 5, 6))
             }
 
     class ArgMinMaxKernelCase5(ArgMinMaxKernelBaseCase):
+
         def setUp(self):
             self.initTestCase()
             self.dims = (4)
@@ -93,11 +100,11 @@ def setUp(self):
             self.attrs = {"axis": self.axis, "flatten": True}
             self.numpy_op = eval("np.%s" % (numpy_op_type))
             self.outputs = {
-                'Out': self.numpy_op(
-                    self.x.flatten(), axis=self.axis)
+                'Out': self.numpy_op(self.x.flatten(), axis=self.axis)
             }
 
     class ArgMinMaxKernelCase6(ArgMinMaxKernelBaseCase):
+
         def setUp(self):
             self.initTestCase()
             self.dims = (4)
@@ -107,9 +114,7 @@ def setUp(self):
             self.attrs = {"axis": self.axis, "flatten": True, "keepdims": True}
             self.numpy_op = eval("np.%s" % (numpy_op_type))
             self.outputs = {
-                'Out':
-                np.array(self.numpy_op(
-                    self.x.flatten(), axis=self.axis))
+                'Out': np.array(self.numpy_op(self.x.flatten(), axis=self.axis))
             }
 
     cls_name = "ArgMinMaxKernelBaseCase_%s" % (op_type)
@@ -150,7 +155,9 @@ def setUp(self):
 
 
 def create_test_case(op_type):
+
     class ArgMaxMinTestCase(unittest.TestCase):
+
         def setUp(self):
             np.random.seed(123)
             self.input_data = np.random.rand(10, 10).astype("float32")
@@ -164,8 +171,9 @@ def setUp(self):
         def run_static(self, place):
             paddle.enable_static()
             with paddle.static.program_guard(paddle.static.Program()):
-                data_var = paddle.static.data(
-                    name="data", shape=[10, 10], dtype="float32")
+                data_var = paddle.static.data(name="data",
+                                              shape=[10, 10],
+                                              dtype="float32")
                 op = eval("paddle.%s" % (op_type))
                 result = op(data_var)
                 exe = paddle.static.Executor(place)
@@ -176,8 +184,9 @@ def run_static(self, place):
                                 True)
 
             with paddle.static.program_guard(paddle.static.Program()):
-                data_var = paddle.static.data(
-                    name="data", shape=[10, 10], dtype="float32")
+                data_var = paddle.static.data(name="data",
+                                              shape=[10, 10],
+                                              dtype="float32")
                 op = eval("paddle.%s" % (op_type))
                 result = op(data_var, axis=1)
                 exe = paddle.static.Executor(place)
@@ -187,8 +196,9 @@ def run_static(self, place):
                 self.assertTrue((result_data == expected_data).all(), True)
 
             with paddle.static.program_guard(paddle.static.Program()):
-                data_var = paddle.static.data(
-                    name="data", shape=[10, 10], dtype="float32")
+                data_var = paddle.static.data(name="data",
+                                              shape=[10, 10],
+                                              dtype="float32")
                 op = eval("paddle.%s" % (op_type))
                 result = op(data_var, axis=-1)
                 exe = paddle.static.Executor(place)
@@ -198,22 +208,24 @@ def run_static(self, place):
                 self.assertTrue((result_data == expected_data).all(), True)
 
             with paddle.static.program_guard(paddle.static.Program()):
-                data_var = paddle.static.data(
-                    name="data", shape=[10, 10], dtype="float32")
+                data_var = paddle.static.data(name="data",
+                                              shape=[10, 10],
+                                              dtype="float32")
 
                 op = eval("paddle.%s" % (op_type))
                 result = op(data_var, axis=-1, keepdim=True)
                 exe = paddle.static.Executor(place)
                 result_data = exe.run(feed={"data": self.input_data},
                                       fetch_list=[result])
-                expected_data = self.numpy_op(
-                    self.input_data, axis=-1).reshape((10, 1))
+                expected_data = self.numpy_op(self.input_data, axis=-1).reshape(
+                    (10, 1))
                 self.assertTrue((result_data == expected_data).all(), True)
 
             with paddle.static.program_guard(paddle.static.Program()):
                 op = eval("paddle.%s" % (op_type))
-                data_var = paddle.static.data(
-                    name="data", shape=[10, 10], dtype="float32")
+                data_var = paddle.static.data(name="data",
+                                              shape=[10, 10],
+                                              dtype="float32")
                 result = op(data_var, axis=-1, name="test_arg_api")
                 self.assertTrue("test_arg_api" in result.name)
 
@@ -222,28 +234,28 @@ def run_dygraph(self, place):
             op = eval("paddle.%s" % (op_type))
             data_tensor = paddle.to_tensor(self.input_data)
 
-            #case 1 
+            #case 1
             result_data = op(data_tensor)
             excepted_data = self.numpy_op(self.input_data)
             self.assertTrue((result_data.numpy() == excepted_data).all(), True)
 
-            #case 2 
+            #case 2
             result_data = op(data_tensor, axis=1)
             excepted_data = self.numpy_op(self.input_data, axis=1)
             self.assertTrue((result_data.numpy() == excepted_data).all(), True)
 
-            #case 3 
+            #case 3
             result_data = op(data_tensor, axis=-1)
             excepted_data = self.numpy_op(self.input_data, axis=-1)
             self.assertTrue((result_data.numpy() == excepted_data).all(), True)
 
-            #case 4 
+            #case 4
             result_data = op(data_tensor, axis=-1, keepdim=True)
             excepted_data = self.numpy_op(self.input_data, axis=-1)
             excepted_data = excepted_data.reshape((10, 1))
             self.assertTrue((result_data.numpy() == excepted_data).all(), True)
 
-            #case 5 
+            #case 5
             result_data = op(data_tensor, axis=-1, keepdim=True, dtype="int32")
             self.assertTrue(result_data.numpy().dtype == np.int32)
 
@@ -278,6 +290,7 @@ def test_case(self):
 
 
 class TestArgMinMaxOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -295,43 +308,49 @@ def test_argmin_x_type():
             self.assertRaises(TypeError, test_argmin_x_type)
 
             def test_argmax_attr_type():
-                data = paddle.static.data(
-                    name="test_argmax", shape=[10], dtype="float32")
+                data = paddle.static.data(name="test_argmax",
+                                          shape=[10],
+                                          dtype="float32")
                 output = paddle.argmax(x=data, dtype="float32")
 
             self.assertRaises(TypeError, test_argmax_attr_type)
 
             def test_argmin_attr_type():
-                data = paddle.static.data(
-                    name="test_argmax", shape=[10], dtype="float32")
+                data = paddle.static.data(name="test_argmax",
+                                          shape=[10],
+                                          dtype="float32")
                 output = paddle.argmin(x=data, dtype="float32")
 
             self.assertRaises(TypeError, test_argmin_attr_type)
 
             def test_argmax_axis_type():
-                data = paddle.static.data(
-                    name="test_argmax", shape=[10], dtype="float32")
+                data = paddle.static.data(name="test_argmax",
+                                          shape=[10],
+                                          dtype="float32")
                 output = paddle.argmax(x=data, axis=1.2)
 
             self.assertRaises(TypeError, test_argmax_axis_type)
 
             def test_argmin_axis_type():
-                data = paddle.static.data(
-                    name="test_argmin", shape=[10], dtype="float32")
+                data = paddle.static.data(name="test_argmin",
+                                          shape=[10],
+                                          dtype="float32")
                 output = paddle.argmin(x=data, axis=1.2)
 
             self.assertRaises(TypeError, test_argmin_axis_type)
 
             def test_argmax_dtype_type():
-                data = paddle.static.data(
-                    name="test_argmax", shape=[10], dtype="float32")
+                data = paddle.static.data(name="test_argmax",
+                                          shape=[10],
+                                          dtype="float32")
                 output = paddle.argmax(x=data, dtype=None)
 
             self.assertRaises(ValueError, test_argmax_dtype_type)
 
             def test_argmin_dtype_type():
-                data = paddle.static.data(
-                    name="test_argmin", shape=[10], dtype="float32")
+                data = paddle.static.data(name="test_argmin",
+                                          shape=[10],
+                                          dtype="float32")
                 output = paddle.argmin(x=data, dtype=None)
 
             self.assertRaises(ValueError, test_argmin_dtype_type)
diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py
index 874d66112bdbb..50350e887957b 100644
--- a/python/paddle/fluid/tests/unittests/test_argsort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py
@@ -31,6 +31,7 @@
 
 
 class PyArgsort(object):
+
     def __init__(self, input_shape, axis, descending, dtype):
         self.x = np.random.random(input_shape).astype(dtype)
         self.label = np.random.random(input_shape).astype(dtype)
@@ -43,20 +44,17 @@ def __init__(self, input_shape, axis, descending, dtype):
     def forward(self):
         if self.descending:
             self.indices = np.flip(
-                np.argsort(
-                    self.x, kind='quicksort', axis=self.axis), self.axis)
+                np.argsort(self.x, kind='quicksort', axis=self.axis), self.axis)
             self.sorted_x = np.flip(
-                np.sort(
-                    self.x, kind='quicksort', axis=self.axis), self.axis)
+                np.sort(self.x, kind='quicksort', axis=self.axis), self.axis)
         else:
             self.indices = np.argsort(self.x, kind='quicksort', axis=self.axis)
             self.sorted_x = np.sort(self.x, kind='quicksort', axis=self.axis)
         self.loss = self.sorted_x * self.label
         self.loss = np.sum(self.loss)
-        out = (np.array(
-            self.indices, dtype=self.indices.dtype), np.array(
-                self.sorted_x, dtype=self.sorted_x.dtype), np.array(
-                    [self.loss], dtype=self.loss.dtype))
+        out = (np.array(self.indices, dtype=self.indices.dtype),
+               np.array(self.sorted_x, dtype=self.sorted_x.dtype),
+               np.array([self.loss], dtype=self.loss.dtype))
         return out
 
 
@@ -67,6 +65,7 @@ def create_tensor(np_data, place):
 
 
 class TestArgsortOpCPU(unittest.TestCase):
+
     def setup_program(self):
         self.main_program = Program()
         self.startup_program = Program()
@@ -86,11 +85,13 @@ def setUp(self):
                                     self.descending, self.dtype)
 
         with fluid.program_guard(self.main_program, self.startup_program):
-            x = fluid.layers.data(
-                name="x", shape=self.input_shape, dtype=self.dtype)
+            x = fluid.layers.data(name="x",
+                                  shape=self.input_shape,
+                                  dtype=self.dtype)
             x.stop_gradient = False
-            label = fluid.layers.data(
-                name="label", shape=self.input_shape, dtype=self.dtype)
+            label = fluid.layers.data(name="label",
+                                      shape=self.input_shape,
+                                      dtype=self.dtype)
             self.sorted_x, self.index = fluid.layers.argsort(
                 input=x, axis=self.axis, descending=self.descending)
             self.sorted_x.stop_gradient = False
@@ -133,12 +134,12 @@ def test_backward(self, numeric_grad_delta=1e-5, max_relative_error=1e-7):
         ana_grad = [np.array(x) for x in self.backward()]
 
         num_grad = self.get_numerical_gradient(delta=numeric_grad_delta)
-        self.assert_is_close(
-            num_grad,
-            ana_grad,
-            'x',
-            max_relative_error=max_relative_error,
-            msg_prefix="Gradient Check On %s" % str(self.place))
+        self.assert_is_close(num_grad,
+                             ana_grad,
+                             'x',
+                             max_relative_error=max_relative_error,
+                             msg_prefix="Gradient Check On %s" %
+                             str(self.place))
 
     def check_forward(self):
         pd_outputs = self.forward()
@@ -146,8 +147,7 @@ def check_forward(self):
         for pd_output, py_output in zip(pd_outputs, py_outputs):
             self.assertEqual(pd_output.shape, py_output.shape)
             self.assertTrue(
-                np.allclose(
-                    pd_output, py_output, atol=0, equal_nan=False))
+                np.allclose(pd_output, py_output, atol=0, equal_nan=False))
 
     def get_numerical_gradient(self, delta=1e-7):
         if self.dtype == 'float16':
@@ -204,6 +204,7 @@ def init_place(self):
 
 
 class TestArgsortOpGPU(TestArgsortOpCPU):
+
     def init_place(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -212,120 +213,144 @@ def init_place(self):
 
 
 class TestArgsortOpAxis0CPU(TestArgsortOpCPU):
+
     def init_axis(self):
         self.axis = 0
 
 
 class TestArgsortOpAxis0GPU(TestArgsortOpGPU):
+
     def init_axis(self):
         self.axis = 0
 
 
 class TestArgsortOpAxis1CPU(TestArgsortOpCPU):
+
     def init_axis(self):
         self.axis = 1
 
 
 class TestArgsortOpAxis1GPU(TestArgsortOpGPU):
+
     def init_axis(self):
         self.axis = 1
 
 
 class TestArgsortOpAxis2CPU(TestArgsortOpCPU):
+
     def init_axis(self):
         self.axis = 2
 
 
 class TestArgsortOpAxis2GPU(TestArgsortOpGPU):
+
     def init_axis(self):
         self.axis = 2
 
 
 class TestArgsortOpAxisNeg1CPU(TestArgsortOpCPU):
+
     def init_axis(self):
         self.axis = -1
 
 
 class TestArgsortOpAxisNeg1GPU(TestArgsortOpGPU):
+
     def init_axis(self):
         self.axis = -1
 
 
 class TestArgsortOpAxisNeg2CPU(TestArgsortOpCPU):
+
     def init_axis(self):
         self.axis = -2
 
 
 class TestArgsortOpAxisNeg2GPU(TestArgsortOpGPU):
+
     def init_axis(self):
         self.axis = -2
 
 
 class TestArgsortOpDescendingAxisCPU(TestArgsortOpCPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisGPU(TestArgsortOpGPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis0CPU(TestArgsortOpAxis0CPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis0GPU(TestArgsortOpAxis0GPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis1CPU(TestArgsortOpAxis1CPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis1GPU(TestArgsortOpAxis1GPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis2CPU(TestArgsortOpAxis2CPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxis2GPU(TestArgsortOpAxis2GPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg1CPU(TestArgsortOpAxisNeg1CPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg1GPU(TestArgsortOpAxisNeg1GPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg2CPU(TestArgsortOpAxisNeg2CPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortOpDescendingAxisNeg2GPU(TestArgsortOpAxisNeg2GPU):
+
     def init_direction(self):
         self.descending = True
 
 
 class TestArgsortErrorOnCPU(unittest.TestCase):
+
     def setUp(self):
         self.place = core.CPUPlace()
 
     def test_error(self):
+
         def test_fluid_var_type():
             with fluid.program_guard(fluid.Program()):
                 x = [1]
@@ -340,6 +365,7 @@ def test_paddle_var_type():
 
 
 class TestArgsortErrorOnGPU(TestArgsortErrorOnCPU):
+
     def setUp(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -348,8 +374,11 @@ def setUp(self):
 
 
 class TestArgsort(unittest.TestCase):
+
     def init(self):
-        self.input_shape = [10000, ]
+        self.input_shape = [
+            10000,
+        ]
         self.axis = 0
 
     def setUp(self):
@@ -362,8 +391,9 @@ def setUp(self):
 
     def test_api(self):
         with fluid.program_guard(fluid.Program()):
-            input = fluid.data(
-                name="input", shape=self.input_shape, dtype="float64")
+            input = fluid.data(name="input",
+                               shape=self.input_shape,
+                               dtype="float64")
 
             output = paddle.argsort(input, axis=self.axis)
             output2 = paddle.argsort(input, axis=self.axis, descending=True)
@@ -380,26 +410,32 @@ def test_api(self):
 
 
 class TestArgsort2(TestArgsort):
+
     def init(self):
         self.input_shape = [10000, 1]
         self.axis = 0
 
 
 class TestArgsort3(TestArgsort):
+
     def init(self):
         self.input_shape = [1, 10000]
         self.axis = 1
 
 
 class TestArgsort4(TestArgsort):
+
     def init(self):
         self.input_shape = [2, 3, 4]
         self.axis = 1
 
 
 class TestArgsortImperative(unittest.TestCase):
+
     def init(self):
-        self.input_shape = [10000, ]
+        self.input_shape = [
+            10000,
+        ]
         self.axis = 0
 
     def setUp(self):
@@ -425,24 +461,28 @@ def test_api(self):
 
 
 class TestArgsortImperative2(TestArgsortImperative):
+
     def init(self):
         self.input_shape = [10000, 1]
         self.axis = 0
 
 
 class TestArgsortImperative3(TestArgsortImperative):
+
     def init(self):
         self.input_shape = [1, 10000]
         self.axis = 1
 
 
 class TestArgsortImperative4(TestArgsortImperative):
+
     def init(self):
         self.input_shape = [2, 3, 4]
         self.axis = 1
 
 
 class TestArgsortWithInputNaN(unittest.TestCase):
+
     def init(self):
         self.axis = 0
 
diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
index b02cf67f4b221..8ed220daf035a 100644
--- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
+++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
@@ -60,12 +60,12 @@ def _test_read_write(x):
 
 
 class TestArrayReadWrite(unittest.TestCase):
+
     def test_read_write(self):
         x = [
-            layers.data(
-                name='x0', shape=[100]), layers.data(
-                    name='x1', shape=[100]), layers.data(
-                        name='x2', shape=[100])
+            layers.data(name='x0', shape=[100]),
+            layers.data(name='x1', shape=[100]),
+            layers.data(name='x2', shape=[100])
         ]
         for each_x in x:
             each_x.stop_gradient = False
@@ -75,9 +75,11 @@ def test_read_write(self):
 
         place = core.CPUPlace()
         exe = Executor(place)
-        outs = exe.run(feed={'x0': tensor,
-                             'x1': tensor,
-                             'x2': tensor},
+        outs = exe.run(feed={
+            'x0': tensor,
+            'x1': tensor,
+            'x2': tensor
+        },
                        fetch_list=[a_sum, x_sum],
                        scope=core.Scope())
         self.assertEqual(outs[0], outs[1])
@@ -91,12 +93,12 @@ def test_read_write(self):
             map(default_main_program().global_block().var,
                 [each_x.name + "@GRAD" for each_x in x]))
         g_out = [
-            item.sum()
-            for item in exe.run(
-                feed={'x0': tensor,
-                      'x1': tensor,
-                      'x2': tensor},
-                fetch_list=g_vars)
+            item.sum() for item in exe.run(feed={
+                'x0': tensor,
+                'x1': tensor,
+                'x2': tensor
+            },
+                                           fetch_list=g_vars)
         ]
         g_out_sum = numpy.array(g_out).sum()
 
@@ -117,8 +119,8 @@ def test_read_write(self):
 
             total_sum_dygraph = layers.sums(
                 input=[a_sum_dygraph, x_sum_dygraph])
-            total_sum_scaled_dygraph = layers.scale(
-                x=total_sum_dygraph, scale=1 / 6.0)
+            total_sum_scaled_dygraph = layers.scale(x=total_sum_dygraph,
+                                                    scale=1 / 6.0)
             total_sum_scaled_dygraph.backward()
             g_out_dygraph = [
                 item._grad_ivar().numpy().sum() for item in x_dygraph
@@ -129,32 +131,40 @@ def test_read_write(self):
 
 
 class TestArrayReadWriteOpError(unittest.TestCase):
+
     def _test_errors(self, use_fluid_api=True):
         if use_fluid_api:
             with program_guard(Program(), Program()):
                 x1 = numpy.random.randn(2, 4).astype('int32')
-                x2 = fluid.layers.fill_constant(
-                    shape=[1], dtype='int32', value=1)
+                x2 = fluid.layers.fill_constant(shape=[1],
+                                                dtype='int32',
+                                                value=1)
                 x3 = numpy.random.randn(2, 4).astype('int32')
 
-                self.assertRaises(
-                    TypeError, fluid.layers.array_read, array=x1, i=x2)
-                self.assertRaises(
-                    TypeError, fluid.layers.array_write, array=x1, i=x2, out=x3)
+                self.assertRaises(TypeError,
+                                  fluid.layers.array_read,
+                                  array=x1,
+                                  i=x2)
+                self.assertRaises(TypeError,
+                                  fluid.layers.array_write,
+                                  array=x1,
+                                  i=x2,
+                                  out=x3)
         else:
             with program_guard(Program(), Program()):
                 x1 = numpy.random.randn(2, 4).astype('int32')
                 x2 = paddle.ones(shape=[1], dtype='int32')
                 x3 = numpy.random.randn(2, 4).astype('int32')
 
-                self.assertRaises(
-                    TypeError, paddle.tensor.array_read, array=x1, i=x2)
-                self.assertRaises(
-                    TypeError,
-                    paddle.tensor.array_write,
-                    array=x1,
-                    i=x2,
-                    out=x3)
+                self.assertRaises(TypeError,
+                                  paddle.tensor.array_read,
+                                  array=x1,
+                                  i=x2)
+                self.assertRaises(TypeError,
+                                  paddle.tensor.array_write,
+                                  array=x1,
+                                  i=x2,
+                                  out=x3)
 
     def test_fluid_api(self):
         self._test_errors(use_fluid_api=True)
@@ -164,6 +174,7 @@ def test_paddle_api(self):
 
 
 class TestArrayReadWriteApi(unittest.TestCase):
+
     def test_api(self):
         paddle.disable_static()
         arr = paddle.tensor.create_array(dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/test_ascend_trigger.py b/python/paddle/fluid/tests/unittests/test_ascend_trigger.py
index 644b550bc426e..eb55962d6064d 100644
--- a/python/paddle/fluid/tests/unittests/test_ascend_trigger.py
+++ b/python/paddle/fluid/tests/unittests/test_ascend_trigger.py
@@ -28,11 +28,10 @@ def test_ascend_trigger_op(self):
         with fluid.program_guard(program):
             x = fluid.data(name='x', shape=[1], dtype='int64', lod_level=0)
             y = fluid.data(name='y', shape=[1], dtype='int64', lod_level=0)
-            block.append_op(
-                type="ascend_trigger",
-                inputs={"FeedList": [x]},
-                outputs={"FetchList": [y]},
-                attrs={'graph_idx': 0})
+            block.append_op(type="ascend_trigger",
+                            inputs={"FeedList": [x]},
+                            outputs={"FetchList": [y]},
+                            attrs={'graph_idx': 0})
 
         exe = paddle.static.Executor(paddle.CPUPlace())
         try:
diff --git a/python/paddle/fluid/tests/unittests/test_assert_op.py b/python/paddle/fluid/tests/unittests/test_assert_op.py
index f7ab991de56d2..5c6cbba0c2ddf 100644
--- a/python/paddle/fluid/tests/unittests/test_assert_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assert_op.py
@@ -21,6 +21,7 @@
 
 
 class TestAssertOp(unittest.TestCase):
+
     def run_network(self, net_func):
         main_program = fluid.Program()
         startup_program = fluid.Program()
@@ -30,32 +31,39 @@ def run_network(self, net_func):
         exe.run(main_program)
 
     def test_assert_true(self):
+
         def net_func():
-            condition = layers.fill_constant(
-                shape=[1], dtype='bool', value=True)
+            condition = layers.fill_constant(shape=[1],
+                                             dtype='bool',
+                                             value=True)
             layers.Assert(condition, [])
 
         self.run_network(net_func)
 
     def test_assert_false(self):
+
         def net_func():
-            condition = layers.fill_constant(
-                shape=[1], dtype='bool', value=False)
+            condition = layers.fill_constant(shape=[1],
+                                             dtype='bool',
+                                             value=False)
             layers.Assert(condition)
 
         with self.assertRaises(ValueError):
             self.run_network(net_func)
 
     def test_assert_cond_numel_error(self):
+
         def net_func():
-            condition = layers.fill_constant(
-                shape=[1, 2], dtype='bool', value=True)
+            condition = layers.fill_constant(shape=[1, 2],
+                                             dtype='bool',
+                                             value=True)
             layers.Assert(condition, [])
 
         with self.assertRaises(ValueError):
             self.run_network(net_func)
 
     def test_assert_print_data(self):
+
         def net_func():
             zero = layers.fill_constant(shape=[1], dtype='int64', value=0)
             one = layers.fill_constant(shape=[1], dtype='int64', value=1)
@@ -67,6 +75,7 @@ def net_func():
             self.run_network(net_func)
 
     def test_assert_summary(self):
+
         def net_func():
             x = layers.fill_constant(shape=[10], dtype='float32', value=2.0)
             condition = layers.reduce_max(x) < 1.0
@@ -77,6 +86,7 @@ def net_func():
             self.run_network(net_func)
 
     def test_assert_summary_greater_than_size(self):
+
         def net_func():
             x = layers.fill_constant(shape=[2, 3], dtype='float32', value=2.0)
             condition = layers.reduce_max(x) < 1.0
diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py
index bfe23c621270d..c35d7940a8a1c 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_op.py
@@ -26,6 +26,7 @@
 
 
 class TestAssignOp(op_test.OpTest):
+
     def setUp(self):
         self.python_api = paddle.assign
         self.op_type = "assign"
@@ -34,13 +35,18 @@ def setUp(self):
         self.outputs = {'Out': x}
 
     def test_forward(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         self.check_output(check_eager=True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_backward(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         self.check_grad(['X'], 'Out', check_eager=True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 class TestAssignFP16Op(op_test.OpTest):
+
     def setUp(self):
         self.python_api = paddle.assign
         self.op_type = "assign"
@@ -49,21 +55,28 @@ def setUp(self):
         self.outputs = {'Out': x}
 
     def test_forward(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         self.check_output(check_eager=True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_backward(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         self.check_grad(['X'], 'Out', check_eager=True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 class TestAssignOpWithLoDTensorArray(unittest.TestCase):
+
     def test_assign_LoDTensorArray(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         main_program = Program()
         startup_program = Program()
         with program_guard(main_program):
             x = fluid.data(name='x', shape=[100, 10], dtype='float32')
             x.stop_gradient = False
-            y = fluid.layers.fill_constant(
-                shape=[100, 10], dtype='float32', value=1)
+            y = fluid.layers.fill_constant(shape=[100, 10],
+                                           dtype='float32',
+                                           value=1)
             z = fluid.layers.elementwise_add(x=x, y=y)
             i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
             init_array = fluid.layers.array_write(x=z, i=i)
@@ -71,9 +84,10 @@ def test_assign_LoDTensorArray(self):
             sums = fluid.layers.array_read(array=init_array, i=i)
             mean = fluid.layers.mean(sums)
             append_backward(mean)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         feed_x = np.random.random(size=(100, 10)).astype('float32')
         ones = np.ones((100, 10)).astype('float32')
@@ -86,11 +100,12 @@ def test_assign_LoDTensorArray(self):
 
 
 class TestAssignOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The type of input must be Variable or numpy.ndarray.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.assign, x1)
             # When the type of input is numpy.ndarray, the dtype of input must be float32, int32.
             x2 = np.array([[2.5, 2.5]], dtype='uint8')
@@ -98,14 +113,16 @@ def test_errors(self):
 
 
 class TestAssignOApi(unittest.TestCase):
+
     def test_assign_LoDTensorArray(self):
         main_program = Program()
         startup_program = Program()
         with program_guard(main_program):
             x = fluid.data(name='x', shape=[100, 10], dtype='float32')
             x.stop_gradient = False
-            y = fluid.layers.fill_constant(
-                shape=[100, 10], dtype='float32', value=1)
+            y = fluid.layers.fill_constant(shape=[100, 10],
+                                           dtype='float32',
+                                           value=1)
             z = fluid.layers.elementwise_add(x=x, y=y)
             i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
             init_array = fluid.layers.array_write(x=z, i=i)
@@ -114,8 +131,8 @@ def test_assign_LoDTensorArray(self):
             mean = fluid.layers.mean(sums)
             append_backward(mean)
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         feed_x = np.random.random(size=(100, 10)).astype('float32')
         ones = np.ones((100, 10)).astype('float32')
@@ -173,6 +190,7 @@ def test_assign_BasicTypes(self):
 
     def test_clone(self):
         paddle.disable_static()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         self.python_api = paddle.clone
 
         x = paddle.ones([2])
@@ -185,6 +203,7 @@ def test_clone(self):
         self.assertTrue(np.array_equal(x, [1, 1]), True)
         self.assertTrue(np.array_equal(clone_x.grad.numpy(), [3, 3]), True)
         self.assertTrue(np.array_equal(x.grad.numpy(), [3, 3]), True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
         paddle.enable_static()
 
         with program_guard(Program(), Program()):
@@ -200,15 +219,18 @@ def test_clone(self):
 
 
 class TestAssignOpErrorApi(unittest.TestCase):
+
     def test_errors(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with program_guard(Program(), Program()):
             # The type of input must be Variable or numpy.ndarray.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, paddle.assign, x1)
             # When the type of input is numpy.ndarray, the dtype of input must be float32, int32.
             x2 = np.array([[2.5, 2.5]], dtype='uint8')
             self.assertRaises(TypeError, paddle.assign, x2)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_type_error(self):
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_assign_pos_op.py b/python/paddle/fluid/tests/unittests/test_assign_pos_op.py
index 46761063b8af2..3458ce64eca84 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_pos_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_pos_op.py
@@ -68,6 +68,7 @@ def assert_allclose(res, out, cum_count):
 
 
 def get_redefined_allclose(cum_count):
+
     def redefined_allclose(x, y, *args, **kwargs):
         return assert_allclose(x, y, cum_count)
 
@@ -77,6 +78,7 @@ def redefined_allclose(x, y, *args, **kwargs):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestAssignPosOpInt64(op_test.OpTest):
+
     def setUp(self):
         x = np.random.randint(0, 16, size=(100, 2)).astype("int64")
         y = count(x, 16)
@@ -98,6 +100,7 @@ def test_forward(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestAssignPosAPI(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randint(0, 16, size=(100, 2)).astype("int64")
         y = count(self.x, 16)
@@ -109,12 +112,15 @@ def test_api_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.fluid.data('x', self.x.shape, dtype="int64")
-            cum_count = paddle.fluid.data(
-                'cum_count', self.cum_count.shape, dtype="int64")
+            cum_count = paddle.fluid.data('cum_count',
+                                          self.cum_count.shape,
+                                          dtype="int64")
             out = utils._assign_pos(x, cum_count)
             exe = paddle.static.Executor(self.place)
-            res = exe.run(feed={'x': self.x,
-                                "cum_count": self.cum_count},
+            res = exe.run(feed={
+                'x': self.x,
+                "cum_count": self.cum_count
+            },
                           fetch_list=[out])
             assert_allclose(res[0], self.out, self.cum_count)
 
diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
index 2abdbdc5940f7..423f70085b8e6 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
@@ -24,6 +24,7 @@
 
 
 class TestAssignValueOp(op_test.OpTest):
+
     def setUp(self):
         self.op_type = "assign_value"
         self.inputs = {}
@@ -43,31 +44,35 @@ def test_forward(self):
 
 
 class TestAssignValueOp2(TestAssignValueOp):
+
     def init_data(self):
         self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32)
         self.attrs["int32_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignValueOp3(TestAssignValueOp):
+
     def init_data(self):
         self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64)
         self.attrs["int64_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignValueOp4(TestAssignValueOp):
+
     def init_data(self):
-        self.value = numpy.random.choice(
-            a=[False, True], size=(2, 5)).astype(numpy.bool)
+        self.value = numpy.random.choice(a=[False, True],
+                                         size=(2, 5)).astype(numpy.bool)
         self.attrs["bool_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignApi(unittest.TestCase):
+
     def setUp(self):
         self.init_dtype()
-        self.value = (
-            -100 + 200 * numpy.random.random(size=(2, 5))).astype(self.dtype)
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.value = (-100 + 200 * numpy.random.random(size=(2, 5))).astype(
+            self.dtype)
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
     def init_dtype(self):
         self.dtype = "float32"
@@ -80,29 +85,31 @@ def test_assign(self):
 
         exe = fluid.Executor(self.place)
         [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x])
-        self.assertTrue(
-            numpy.array_equal(fetched_x, self.value),
-            "fetch_x=%s val=%s" % (fetched_x, self.value))
+        self.assertTrue(numpy.array_equal(fetched_x, self.value),
+                        "fetch_x=%s val=%s" % (fetched_x, self.value))
         self.assertEqual(fetched_x.dtype, self.value.dtype)
 
 
 class TestAssignApi2(TestAssignApi):
+
     def init_dtype(self):
         self.dtype = "int32"
 
 
 class TestAssignApi3(TestAssignApi):
+
     def init_dtype(self):
         self.dtype = "int64"
 
 
 class TestAssignApi4(TestAssignApi):
+
     def setUp(self):
         self.init_dtype()
-        self.value = numpy.random.choice(
-            a=[False, True], size=(2, 5)).astype(numpy.bool)
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.value = numpy.random.choice(a=[False, True],
+                                         size=(2, 5)).astype(numpy.bool)
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
     def init_dtype(self):
         self.dtype = "bool"
diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
index 17507c70d90d2..9dee8088ecd96 100644
--- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
@@ -38,21 +38,19 @@ def convolutional_neural_network(use_py_reader):
                 iterable=False,
                 use_double_buffer=False)
 
-        conv_pool_1 = fluid.nets.simple_img_conv_pool(
-            input=img,
-            filter_size=5,
-            num_filters=20,
-            pool_size=2,
-            pool_stride=2,
-            act="relu")
+        conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                      filter_size=5,
+                                                      num_filters=20,
+                                                      pool_size=2,
+                                                      pool_stride=2,
+                                                      act="relu")
         conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-        conv_pool_2 = fluid.nets.simple_img_conv_pool(
-            input=conv_pool_1,
-            filter_size=5,
-            num_filters=50,
-            pool_size=2,
-            pool_stride=2,
-            act="relu")
+        conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                      filter_size=5,
+                                                      num_filters=50,
+                                                      pool_size=2,
+                                                      pool_stride=2,
+                                                      act="relu")
 
         prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
         loss = fluid.layers.cross_entropy(input=prediction, label=label)
@@ -69,8 +67,8 @@ def test():
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
 
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                               batch_size=BATCH_SIZE)
 
     array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
         use_py_reader=False)
@@ -113,10 +111,9 @@ def train(use_cuda, thread_num, cpu_num):
     optimizer.minimize(avg_loss)
     print("Adam optimizer minimize done.")
 
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
+    train_reader = paddle.batch(paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+                                batch_size=BATCH_SIZE)
     print("declared train reader done.")
 
     place = fluid.CPUPlace()
@@ -138,12 +135,11 @@ def train(use_cuda, thread_num, cpu_num):
     exec_strategy.num_iteration_per_run = 10
 
     main_program = fluid.default_main_program()
-    pe = fluid.ParallelExecutor(
-        use_cuda=False,
-        loss_name=avg_loss.name,
-        main_program=main_program,
-        build_strategy=build_strategy,
-        exec_strategy=exec_strategy)
+    pe = fluid.ParallelExecutor(use_cuda=False,
+                                loss_name=avg_loss.name,
+                                main_program=main_program,
+                                build_strategy=build_strategy,
+                                exec_strategy=exec_strategy)
     print("declare parallel executor done.")
 
     py_reader.set_sample_list_generator(train_reader)
@@ -172,24 +168,24 @@ def train(use_cuda, thread_num, cpu_num):
 
 
 class TestAsyncSSAGraphExecutor(unittest.TestCase):
+
     def test_check_async_ssa_exe_train(self):
         step_list = []
         for cpu_num in [1, 2, 4]:
             print("run cpu_num -> " + str(cpu_num))
             with fluid.scope_guard(fluid.core.Scope()):
-                with fluid.program_guard(
-                        main_program=fluid.Program(),
-                        startup_program=fluid.Program()):
+                with fluid.program_guard(main_program=fluid.Program(),
+                                         startup_program=fluid.Program()):
                     start_time = time.time()
-                    step = train(
-                        use_cuda=False, thread_num=cpu_num, cpu_num=cpu_num)
+                    step = train(use_cuda=False,
+                                 thread_num=cpu_num,
+                                 cpu_num=cpu_num)
                     end_time = time.time()
                     step_list.append(step)
                 print("cpu_num -> " + str(cpu_num) + " step -> " + str(step) +
                       " time -> " + str(end_time - start_time))
-                with fluid.program_guard(
-                        main_program=fluid.Program(),
-                        startup_program=fluid.Program()):
+                with fluid.program_guard(main_program=fluid.Program(),
+                                         startup_program=fluid.Program()):
                     test()
         assert abs(int(step_list[0] / 2) - int(step_list[1])) < 5
         assert abs(int(step_list[1] / 2) - int(step_list[2])) < 5
diff --git a/python/paddle/fluid/tests/unittests/test_atan2_op.py b/python/paddle/fluid/tests/unittests/test_atan2_op.py
index ca0e2d2ba6dda..90e2a37453fc3 100644
--- a/python/paddle/fluid/tests/unittests/test_atan2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_atan2_op.py
@@ -34,6 +34,7 @@ def atan2_grad(x1, x2, dout):
 
 
 class TestAtan2(OpTest):
+
     def setUp(self):
         self.op_type = "atan2"
         self.python_api = paddle.atan2
@@ -57,36 +58,40 @@ def init_dtype(self):
 
 
 class TestAtan2_float(TestAtan2):
+
     def init_dtype(self):
         self.dtype = np.float32
 
     def test_check_grad(self):
         if self.dtype not in [np.int32, np.int64]:
-            self.check_grad(
-                ['X1', 'X2'],
-                'Out',
-                user_defined_grads=atan2_grad(self.inputs['X1'],
-                                              self.inputs['X2'],
-                                              1 / self.inputs['X1'].size),
-                check_eager=True)
+            self.check_grad(['X1', 'X2'],
+                            'Out',
+                            user_defined_grads=atan2_grad(
+                                self.inputs['X1'], self.inputs['X2'],
+                                1 / self.inputs['X1'].size),
+                            check_eager=True)
 
 
 class TestAtan2_float16(TestAtan2_float):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestAtan2_int32(TestAtan2_float):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestAtan2_int64(TestAtan2_float):
+
     def init_dtype(self):
         self.dtype = np.int64
 
 
 class TestAtan2API(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = 'float64'
         self.shape = [11, 17]
@@ -117,6 +122,7 @@ def run(place):
             run(place)
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             X1 = paddle.to_tensor(self.x1)
diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
index a5fb80b09702b..053b716e95fc4 100644
--- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
@@ -46,8 +46,8 @@ def attention_lstm(
     start_offset = 0
     for bid in range(N):
         seq_len = lod[0][bid]
-        xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len,
-                                                                        M)
+        xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(
+            seq_len, M)
         prev_cell = np.copy(c0[bid]).reshape([1, D])
         prev_hidden = np.copy(h0[bid]).reshape([1, D])
         for step in range(seq_len):
@@ -88,6 +88,7 @@ def attention_lstm(
 
 
 class TestAttentionLSTMOp(OpTest):
+
     def set_conf(self):
         pass
 
@@ -156,11 +157,13 @@ def test_check_output(self):
 
 
 class TestAttentionOpNonInit(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.has_initial_hidden = False
 
 
 class TestAttentionOpAct(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.M = 3
         self.D = 2
@@ -170,24 +173,28 @@ def set_conf(self):
 
 
 class TestAttentionOpMD1(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.M = 36
         self.D = 8
 
 
 class TestAttentionOpMD2(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.M = 8
         self.D = 8
 
 
 class TestAttentionOpMD3(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.M = 15
         self.D = 30
 
 
 class TestAttentionOpBS1(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.lod = [[5]]
         self.M = 16
@@ -195,11 +202,13 @@ def set_conf(self):
 
 
 class TestAttentionOpBS2(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.lod = [[3, 6]]
 
 
 class TestAttentionOpBS5(TestAttentionLSTMOp):
+
     def set_conf(self):
         self.lod = [[3, 2, 4, 7, 5]]
 
diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py
index 6568da5d00cbd..c2c206905e3ac 100644
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -22,6 +22,7 @@
 
 
 class TestAucOp(OpTest):
+
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
@@ -29,10 +30,10 @@ def setUp(self):
         num_thresholds = 200
         slide_steps = 1
 
-        stat_pos = np.zeros((1 + slide_steps) * (num_thresholds + 1) + 1,
-                            ).astype("int64")
-        stat_neg = np.zeros((1 + slide_steps) * (num_thresholds + 1) + 1,
-                            ).astype("int64")
+        stat_pos = np.zeros(
+            (1 + slide_steps) * (num_thresholds + 1) + 1, ).astype("int64")
+        stat_neg = np.zeros(
+            (1 + slide_steps) * (num_thresholds + 1) + 1, ).astype("int64")
 
         self.inputs = {
             'Predict': pred,
@@ -66,6 +67,7 @@ def test_check_output(self):
 
 
 class TestGlobalAucOp(OpTest):
+
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
@@ -106,6 +108,7 @@ def test_check_output(self):
 
 
 class TestAucOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
 
@@ -117,8 +120,9 @@ def test_type1():
             self.assertRaises(TypeError, test_type1)
 
             def test_type2():
-                data2 = fluid.data(
-                    name="input2", shape=[-1, 2], dtype="float32")
+                data2 = fluid.data(name="input2",
+                                   shape=[-1, 2],
+                                   dtype="float32")
                 label2 = fluid.data(name="label2", shape=[-1], dtype="float32")
                 result2 = fluid.layers.auc(input=data2, label=label2)
 
diff --git a/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py b/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py
index 5093dc1f990a9..aba58e3593d50 100644
--- a/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py
@@ -21,6 +21,7 @@
 
 
 class TestAucSinglePredOp(OpTest):
+
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
@@ -29,10 +30,10 @@ def setUp(self):
         num_thresholds = 200
         slide_steps = 1
 
-        stat_pos = np.zeros((1 + slide_steps) * (num_thresholds + 1) + 1,
-                            ).astype("int64")
-        stat_neg = np.zeros((1 + slide_steps) * (num_thresholds + 1) + 1,
-                            ).astype("int64")
+        stat_pos = np.zeros(
+            (1 + slide_steps) * (num_thresholds + 1) + 1, ).astype("int64")
+        stat_neg = np.zeros(
+            (1 + slide_steps) * (num_thresholds + 1) + 1, ).astype("int64")
 
         self.inputs = {
             'Predict': pred0,
@@ -68,6 +69,7 @@ def test_check_output(self):
 
 
 class TestAucGlobalSinglePredOp(OpTest):
+
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
index 3faf7f6862058..ce1dfa743645a 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
@@ -36,6 +36,7 @@
 
 
 class AutoCheckPointACLBase(AutoCheckpointBase):
+
     def setUp(self):
         get_logger()
         logger.info("enter tests")
@@ -203,6 +204,7 @@ def _test_corner_epoch_no(self, break_epoch_no):
 
 
 class AutoCheckpointTest(AutoCheckPointACLBase):
+
     def setUp(self):
         get_logger()
         logger.info("enter tests")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
index fca1baf85e56e..c9172e74f28a3 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
@@ -37,6 +37,7 @@
 
 
 class AutoCheckpointTest1(AutoCheckPointACLBase):
+
     def setUp(self):
         get_logger()
         logger.info("enter tests")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
index 0c17807a689e6..22b3c15053a41 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
@@ -37,6 +37,7 @@
 
 
 class AutoCheckpointTest2(AutoCheckPointACLBase):
+
     def setUp(self):
         get_logger()
         logger.info("enter tests")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
index ca103be59b967..8d847fe9704e0 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
@@ -37,6 +37,7 @@
 
 
 class AutoCheckpointTest3(AutoCheckPointACLBase):
+
     def setUp(self):
         get_logger()
         logger.info("enter tests")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
index 3eeff91ff2d83..c0aa13aa03f0d 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
@@ -37,6 +37,7 @@
 
 
 class AutoCheckpointTestDist(AutoCheckPointACLBase):
+
     def setUp(self):
         get_logger()
         logger.info("enter tests")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
index f8c12f8905112..da7f2af169d92 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
@@ -37,6 +37,7 @@
 
 
 class AutoCheckpointTestMul(AutoCheckPointACLBase):
+
     def setUp(self):
         get_logger()
         logger.info("enter tests")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py b/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py
index 3ff67a923a209..948aa58990d0c 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py
@@ -23,6 +23,7 @@
 
 
 class TestBase(unittest.TestCase):
+
     def setUp(self):
         if fluid.is_compiled_with_cuda():
             self._limit = fluid.core.globals()['FLAGS_gpu_memory_limit_mb']
@@ -35,8 +36,7 @@ def test_allocate(self):
 
         place = fluid.CUDAPlace(0)
         t = fluid.LoDTensor()
-        t.set(np.ndarray(
-            [int(self._limit / 2), other_dim], dtype='float32'),
+        t.set(np.ndarray([int(self._limit / 2), other_dim], dtype='float32'),
               place)
         del t
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py
index 7d94139e9a881..f4a02679b3220 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py
@@ -29,6 +29,7 @@
 
 
 class SimpleNet(nn.Layer):
+
     def __init__(self, vocab_size=128, hidden_size=4):
         super(SimpleNet, self).__init__()
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
@@ -37,10 +38,11 @@ def __init__(self, vocab_size=128, hidden_size=4):
 
     def forward(self, x, y):
         # Test shard_tensor interface with dist_attr arg
-        x = dist.shard_tensor(
-            x,
-            dist_attr={"process_mesh": process_mesh1,
-                       "dims_mapping": [0, -1]})
+        x = dist.shard_tensor(x,
+                              dist_attr={
+                                  "process_mesh": process_mesh1,
+                                  "dims_mapping": [0, -1]
+                              })
         emb_out = self.word_embeddings(x)
         # Test shard_tensor interface with no dist_attr arg
         y = dist.shard_tensor(y)
@@ -51,15 +53,18 @@ def forward(self, x, y):
 
 
 class TestAutoParallelAPI(unittest.TestCase):
+
     def test_api(self):
         dist_context = get_default_distributed_context()
 
         net = SimpleNet()
         data1 = fluid.layers.fill_constant(shape=[2, 4], value=1, dtype="int64")
-        data2 = fluid.layers.fill_constant(
-            shape=[2, 4], value=2, dtype="float32")
-        data3 = fluid.layers.fill_constant(
-            shape=[2, 4], value=4, dtype="float32")
+        data2 = fluid.layers.fill_constant(shape=[2, 4],
+                                           value=2,
+                                           dtype="float32")
+        data3 = fluid.layers.fill_constant(shape=[2, 4],
+                                           value=4,
+                                           dtype="float32")
 
         x, y = net.forward(data1, data2)
 
@@ -86,17 +91,16 @@ def test_api(self):
         # Test shard_op interface with dist_attr
         dims_mapping1 = [0, 1]
         dims_mapping2 = [-1, 0]
-        dist_add = dist.shard_op(
-            paddle.add,
-            dist_attr={
-                data2: {
-                    "process_mesh": process_mesh2,
-                    "dims_mapping": dims_mapping1
-                },
-                data3: {
-                    "dims_mapping": dims_mapping2
-                }
-            })
+        dist_add = dist.shard_op(paddle.add,
+                                 dist_attr={
+                                     data2: {
+                                         "process_mesh": process_mesh2,
+                                         "dims_mapping": dims_mapping1
+                                     },
+                                     data3: {
+                                         "dims_mapping": dims_mapping2
+                                     }
+                                 })
         results = dist_add(data2, data3)
         ops = paddle.static.default_main_program().block(0).ops
         last_op = ops[-1]
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_autoconvert.py
index 131f2d299b5d7..0390176fb5857 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_autoconvert.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_autoconvert.py
@@ -21,6 +21,7 @@
 
 
 class TestAutoParallelAutoConvert(TestMultipleGpus):
+
     def test_auto_parallel_autoconvert(self):
         self.run_mnist_2gpu('auto_parallel_autoconvert.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py
index 55b3665443713..7ef5516bc047e 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py
@@ -200,6 +200,7 @@
 
 
 class TestAutoParallelCluster(unittest.TestCase):
+
     def test_cluster(self):
         cluster_json_file = ""
         cluster_json_object = json.loads(cluster_json)
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
index bc4f1671f4e20..393d79557a927 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
@@ -33,6 +33,7 @@
 from paddle.distributed.auto_parallel.utils import append_distributed_attr_suffix
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed.auto_parallel.dist_context import set_default_distributed_context
+
 paddle.enable_static()
 _global_parallel_strategy = None
 _global_process_mesh = None
@@ -40,6 +41,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -48,57 +50,55 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
 
     def forward(self, input):
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
         elif _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh2,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh2,
+                                  "dims_mapping": [1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -115,36 +115,33 @@ def mlp_pretrain_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="input",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
 
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       dropout_ratio=0.1,
+                       initializer_range=0.02)
         out = mlp(input)
     return train_program, start_program
 
 
 class TestMLPAutoCompletion(unittest.TestCase):
+
     def test_mlp_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
@@ -153,8 +150,8 @@ def test_mlp_dp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = mlp_pretrain_forward(train_program,
-                                                            start_program)
+        train_program, start_program = mlp_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -169,8 +166,8 @@ def test_mlp_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = mlp_pretrain_forward(train_program,
-                                                            start_program)
+        train_program, start_program = mlp_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -186,8 +183,8 @@ def test_mlp_dp_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = mlp_pretrain_forward(train_program,
-                                                            start_program)
+        train_program, start_program = mlp_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -245,6 +242,7 @@ def test_mlp_dp_mp(self):
 
 
 class AttentionLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  sequence_len=512,
@@ -266,34 +264,40 @@ def __init__(self,
         self.initializer_range = initializer_range
         self.training = True
         self.attn_mask = None
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.q_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.k_proj = nn.Linear(
-            self.kdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.v_proj = nn.Linear(
-            self.vdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.out_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
+        self.q_proj = nn.Linear(self.embed_dim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.k_proj = nn.Linear(self.kdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.v_proj = nn.Linear(self.vdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.out_proj = nn.Linear(self.embed_dim,
+                                  self.embed_dim,
+                                  weight_attr,
+                                  bias_attr=bias_attr)
 
     def forward(self, input):
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
 
         q = self.q_proj(input)
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
@@ -303,43 +307,37 @@ def forward(self, input):
         v = self.v_proj(input)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -347,8 +345,10 @@ def forward(self, input):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
 
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.head_dim**-0.5)
 
         if self.attn_mask is not None:
             product = product + self.attn_mask
@@ -356,11 +356,10 @@ def forward(self, input):
         weights = F.softmax(product)
 
         if self.dropout_ratio:
-            weights = F.dropout(
-                weights,
-                self.dropout_ratio,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout_ratio,
+                                training=self.training,
+                                mode="upscale_in_train")
 
         out = tensor.matmul(weights, v)
 
@@ -371,19 +370,17 @@ def forward(self, input):
         # project to output
         out = self.out_proj(out)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         return out
 
@@ -394,23 +391,22 @@ def attn_pretrain_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="query",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
-        attn = AttentionLayer(
-            hidden_size=hidden_size,
-            sequence_len=sequence_len,
-            intermediate_size=4 * hidden_size,
-            num_heads=16,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        input = static.data(name="query",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
+        attn = AttentionLayer(hidden_size=hidden_size,
+                              sequence_len=sequence_len,
+                              intermediate_size=4 * hidden_size,
+                              num_heads=16,
+                              dropout_ratio=0.1,
+                              initializer_range=0.02)
         out = attn(input)
 
     return train_program, start_program
 
 
 class TestAttentionAutoCompletion(unittest.TestCase):
+
     def test_attn_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
@@ -419,8 +415,8 @@ def test_attn_dp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = attn_pretrain_forward(train_program,
-                                                             start_program)
+        train_program, start_program = attn_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -437,8 +433,8 @@ def test_attn_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = attn_pretrain_forward(train_program,
-                                                             start_program)
+        train_program, start_program = attn_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -454,8 +450,8 @@ def test_attn_dp_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = attn_pretrain_forward(train_program,
-                                                             start_program)
+        train_program, start_program = attn_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -463,6 +459,7 @@ def test_attn_dp_mp(self):
 
 
 class DecoderLayer(nn.Layer):
+
     def __init__(self,
                  vocab_size=32768,
                  hidden_size=1024,
@@ -492,29 +489,37 @@ def __init__(self,
         self.word_embeddings = nn.Embedding(
             self.vocab_size,
             self.hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="word_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=self.initializer_range)))
+            weight_attr=paddle.ParamAttr(name="word_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0,
+                                             std=self.initializer_range)))
         self.position_embeddings = nn.Embedding(
             self.max_position_embeddings,
             self.hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="pos_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=self.initializer_range)))
+            weight_attr=paddle.ParamAttr(name="pos_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0,
+                                             std=self.initializer_range)))
 
         weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
             mean=0.0, std=self.initializer_range))
         bias_attr = None
-        self.q_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.k_proj = nn.Linear(
-            self.kdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.v_proj = nn.Linear(
-            self.vdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.out_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
+        self.q_proj = nn.Linear(self.embed_dim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.k_proj = nn.Linear(self.kdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.v_proj = nn.Linear(self.vdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.out_proj = nn.Linear(self.embed_dim,
+                                  self.embed_dim,
+                                  weight_attr,
+                                  bias_attr=bias_attr)
 
         intermediate_size = 4 * self.hidden_size
         d_model = self.hidden_size
@@ -522,10 +527,14 @@ def __init__(self,
         weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
             mean=0.0, std=self.initializer_range))
         bias_attr = None
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout1 = nn.Dropout(self.dropout_ratio)
@@ -534,37 +543,33 @@ def __init__(self,
 
     def forward(self, input_ids, position_ids):
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
 
         input_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout1(embeddings)
@@ -581,43 +586,37 @@ def forward(self, input_ids, position_ids):
         v = self.v_proj(target)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -625,8 +624,10 @@ def forward(self, input_ids, position_ids):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
 
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.head_dim**-0.5)
 
         if self.attn_mask is not None:
             product = product + self.attn_mask
@@ -634,11 +635,10 @@ def forward(self, input_ids, position_ids):
         weights = F.softmax(product)
 
         if self.dropout_ratio:
-            weights = F.dropout(
-                weights,
-                self.dropout_ratio,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout_ratio,
+                                training=self.training,
+                                mode="upscale_in_train")
 
         out = tensor.matmul(weights, v)
 
@@ -650,19 +650,17 @@ def forward(self, input_ids, position_ids):
         out = self.out_proj(out)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         # Add residual
         residual = embeddings + self.dropout2(out)
@@ -676,31 +674,27 @@ def forward(self, input_ids, position_ids):
         out3 = self.linear1(out2)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         # Add residual
         final = residual + self.dropout3(out3)
@@ -713,27 +707,27 @@ def decoder_pretrain_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input_ids = static.data(
-            name="input_ids", shape=[batch_size, sequence_len], dtype='int64')
-        position_ids = static.data(
-            name="position_ids",
-            shape=[batch_size, sequence_len],
-            dtype='int64')
-        decoder = DecoderLayer(
-            vocab_size=32768,
-            hidden_size=hidden_size,
-            sequence_len=sequence_len,
-            max_position_embeddings=512,
-            intermediate_size=4 * hidden_size,
-            num_heads=16,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        input_ids = static.data(name="input_ids",
+                                shape=[batch_size, sequence_len],
+                                dtype='int64')
+        position_ids = static.data(name="position_ids",
+                                   shape=[batch_size, sequence_len],
+                                   dtype='int64')
+        decoder = DecoderLayer(vocab_size=32768,
+                               hidden_size=hidden_size,
+                               sequence_len=sequence_len,
+                               max_position_embeddings=512,
+                               intermediate_size=4 * hidden_size,
+                               num_heads=16,
+                               dropout_ratio=0.1,
+                               initializer_range=0.02)
         out = decoder(input_ids, position_ids)
 
     return train_program, start_program
 
 
 class TestDecoderLayerAutoCompletion(unittest.TestCase):
+
     def test_decoder_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
@@ -742,8 +736,8 @@ def test_decoder_dp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = decoder_pretrain_forward(train_program,
-                                                                start_program)
+        train_program, start_program = decoder_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -758,8 +752,8 @@ def test_decoder_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = decoder_pretrain_forward(train_program,
-                                                                start_program)
+        train_program, start_program = decoder_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -775,8 +769,8 @@ def test_decoder_dp_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = decoder_pretrain_forward(train_program,
-                                                                start_program)
+        train_program, start_program = decoder_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
index 1a9f70b352859..ab110c929f5c5 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
@@ -78,17 +78,27 @@ def __init__(self,
             if self.fuse:
                 assert self.kdim == embed_dim
                 assert self.vdim == embed_dim
-                self.qkv_proj = nn.Linear(
-                    embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr)
+                self.qkv_proj = nn.Linear(embed_dim,
+                                          3 * embed_dim,
+                                          weight_attr,
+                                          bias_attr=bias_attr)
             else:
-                self.q_proj = nn.Linear(
-                    embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
-                self.k_proj = nn.Linear(
-                    self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
-                self.v_proj = nn.Linear(
-                    self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
-            self.out_proj = nn.Linear(
-                embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
+                self.q_proj = nn.Linear(embed_dim,
+                                        embed_dim,
+                                        weight_attr,
+                                        bias_attr=bias_attr)
+                self.k_proj = nn.Linear(self.kdim,
+                                        embed_dim,
+                                        weight_attr,
+                                        bias_attr=bias_attr)
+                self.v_proj = nn.Linear(self.vdim,
+                                        embed_dim,
+                                        weight_attr,
+                                        bias_attr=bias_attr)
+            self.out_proj = nn.Linear(embed_dim,
+                                      embed_dim,
+                                      weight_attr,
+                                      bias_attr=bias_attr)
 
     def _fuse_prepare_qkv(self, query):
         mix_layer = self.qkv_proj(query)
@@ -107,19 +117,17 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         q = self.q_proj(query)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
         q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
@@ -152,36 +160,32 @@ def compute_kv(self, key, value):
         k = self.k_proj(key)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         v = self.v_proj(value)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -237,19 +241,20 @@ def forward(self,
             q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,
                                                cache)
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.head_dim**-0.5)
 
         if attn_mask is not None:
             product = product + attn_mask
 
         weights = F.softmax(product)
         if self.dropout:
-            weights = F.dropout(
-                weights,
-                self.dropout,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout,
+                                training=self.training,
+                                mode="upscale_in_train")
 
         out = tensor.matmul(weights, v)
 
@@ -261,19 +266,17 @@ def forward(self,
         out = self.out_proj(out)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         outs = [out]
         if self.need_weights:
@@ -395,24 +398,21 @@ def __init__(self,
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
 
-        self.self_attn = MultiHeadAttention(
-            d_model,
-            nhead,
-            dropout=attn_dropout,
-            weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0],
-            topo=topo)
+        self.self_attn = MultiHeadAttention(d_model,
+                                            nhead,
+                                            dropout=attn_dropout,
+                                            weight_attr=weight_attrs[0],
+                                            bias_attr=bias_attrs[0],
+                                            topo=topo)
         if topo is None or topo.mp_info.size == 1:
-            self.linear1 = nn.Linear(
-                d_model,
-                dim_feedforward,
-                weight_attrs[2],
-                bias_attr=bias_attrs[2])
-            self.linear2 = nn.Linear(
-                dim_feedforward,
-                d_model,
-                weight_attrs[2],
-                bias_attr=bias_attrs[2])
+            self.linear1 = nn.Linear(d_model,
+                                     dim_feedforward,
+                                     weight_attrs[2],
+                                     bias_attr=bias_attrs[2])
+            self.linear2 = nn.Linear(dim_feedforward,
+                                     d_model,
+                                     weight_attrs[2],
+                                     bias_attr=bias_attrs[2])
 
         self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
@@ -440,34 +440,30 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
             tgt = self.norm2(tgt)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         # tgt = self.dropout2(
         #     self.linear2(F.gelu(
@@ -483,8 +479,8 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
         return tgt if use_cache is False else (tgt, incremental_cache)
 
     def gen_cache(self, memory):
-        incremental_cache = self.self_attn.gen_cache(
-            memory, type=self.self_attn.Cache)
+        incremental_cache = self.self_attn.gen_cache(memory,
+                                                     type=self.self_attn.Cache)
         return incremental_cache
 
 
@@ -506,17 +502,16 @@ def __init__(self,
             self.word_embeddings = nn.Embedding(
                 vocab_size,
                 hidden_size,
-                weight_attr=paddle.ParamAttr(
-                    name="word_embeddings",
-                    initializer=nn.initializer.Normal(
-                        mean=0.0, std=initializer_range)))
+                weight_attr=paddle.ParamAttr(name="word_embeddings",
+                                             initializer=nn.initializer.Normal(
+                                                 mean=0.0,
+                                                 std=initializer_range)))
         self.position_embeddings = nn.Embedding(
             max_position_embeddings,
             hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="pos_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=initializer_range)))
+            weight_attr=paddle.ParamAttr(name="pos_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0, std=initializer_range)))
 
         self.dropout = nn.Dropout(hidden_dropout_prob)
 
@@ -529,19 +524,17 @@ def forward(self, input_ids, position_ids=None):
         input_embedings = self.word_embeddings(input_ids)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         position_embeddings = self.position_embeddings(position_ids)
         embeddings = input_embedings + position_embeddings
@@ -580,37 +573,36 @@ def __init__(self,
         if self.pipline_mode:
             self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size
 
-        self.embeddings = GPTEmbeddings(
-            vocab_size, hidden_size, hidden_dropout_prob,
-            max_position_embeddings, type_vocab_size, self.initializer_range,
-            topo)
+        self.embeddings = GPTEmbeddings(vocab_size, hidden_size,
+                                        hidden_dropout_prob,
+                                        max_position_embeddings,
+                                        type_vocab_size, self.initializer_range,
+                                        topo)
 
         decoder_layers = nn.LayerList()
         for i in range(num_hidden_layers):
             DecoderLayer = TransformerDecoderLayer
             decoder_layers.append(
-                DecoderLayer(
-                    d_model=hidden_size,
-                    nhead=num_attention_heads,
-                    dim_feedforward=intermediate_size,
-                    dropout=hidden_dropout_prob,
-                    activation=hidden_act,
-                    attn_dropout=attention_probs_dropout_prob,
-                    act_dropout=hidden_dropout_prob,
-                    weight_attr=paddle.ParamAttr(
-                        initializer=nn.initializer.Normal(
-                            mean=0.0, std=self.initializer_range)),
-                    bias_attr=None,
-                    topo=topo))
+                DecoderLayer(d_model=hidden_size,
+                             nhead=num_attention_heads,
+                             dim_feedforward=intermediate_size,
+                             dropout=hidden_dropout_prob,
+                             activation=hidden_act,
+                             attn_dropout=attention_probs_dropout_prob,
+                             act_dropout=hidden_dropout_prob,
+                             weight_attr=paddle.ParamAttr(
+                                 initializer=nn.initializer.Normal(
+                                     mean=0.0, std=self.initializer_range)),
+                             bias_attr=None,
+                             topo=topo))
 
         Decoder = TransformerDecoder
 
-        self.decoder = Decoder(
-            decoder_layers,
-            num_hidden_layers,
-            norm="LayerNorm",
-            hidden_size=hidden_size,
-            topo=topo)
+        self.decoder = Decoder(decoder_layers,
+                               num_hidden_layers,
+                               norm="LayerNorm",
+                               hidden_size=hidden_size,
+                               topo=topo)
 
         self.checkpoints = []
 
@@ -625,29 +617,27 @@ def forward(self,
             length = paddle.shape(input_ids)[1]
             # Use bool mask
             attention_mask = paddle.tensor.tril(
-                paddle.ones(
-                    (length, length),
-                    dtype=self.embeddings.word_embeddings.weight.dtype))
+                paddle.ones((length, length),
+                            dtype=self.embeddings.word_embeddings.weight.dtype))
         if position_ids is None:
             past_length = 0
             if cache is not None:
                 past_length = paddle.shape(cache[0].k)[-2]
-            position_ids = paddle.arange(
-                past_length,
-                paddle.shape(input_ids)[-1] + past_length,
-                dtype='int64')
+            position_ids = paddle.arange(past_length,
+                                         paddle.shape(input_ids)[-1] +
+                                         past_length,
+                                         dtype='int64')
             position_ids = position_ids.unsqueeze(0)
             # .expand_as(input_ids)
-            position_ids = paddle.fluid.layers.expand_as(position_ids,
-                                                         input_ids)
-        embedding_output = self.embeddings(
-            input_ids=input_ids, position_ids=position_ids)
+            position_ids = paddle.fluid.layers.expand_as(
+                position_ids, input_ids)
+        embedding_output = self.embeddings(input_ids=input_ids,
+                                           position_ids=position_ids)
 
         # TODO, use registered buffer
-        causal_mask = paddle.tensor.triu(
-            paddle.ones((paddle.shape(input_ids)[-1],
-                         paddle.shape(input_ids)[-1])) * -1e9,
-            diagonal=1)
+        causal_mask = paddle.tensor.triu(paddle.ones(
+            (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e9,
+                                         diagonal=1)
 
         if attention_mask is not None:
             attention_mask = attention_mask + causal_mask
@@ -657,12 +647,11 @@ def forward(self,
         # The tensor returned by triu not in static graph.
         attention_mask.stop_gradient = True
 
-        encoder_outputs = self.decoder(
-            embedding_output,
-            memory=None,
-            tgt_mask=attention_mask,
-            use_cache=use_cache,
-            cache=cache)
+        encoder_outputs = self.decoder(embedding_output,
+                                       memory=None,
+                                       tgt_mask=attention_mask,
+                                       use_cache=use_cache,
+                                       cache=cache)
         self.checkpoints.extend(self.decoder.checkpoints)
         return encoder_outputs
 
@@ -686,8 +675,9 @@ def parallel_matmul(self, lm_output, logit_weights, parallel_output, topo):
             input_parallel = paddle.distributed.collective._c_identity(
                 lm_output, group=None)
 
-            logits = paddle.matmul(
-                input_parallel, logit_weights, transpose_y=True)
+            logits = paddle.matmul(input_parallel,
+                                   logit_weights,
+                                   transpose_y=True)
 
             if parallel_output:
                 return logits
@@ -750,50 +740,49 @@ def gpt_pretrain_forward(train_program, start_program):
                               start_program), utils.unique_name.guard():
         batch_size = 16
         sequence_len = 512
-        input_ids = static.data(
-            name="input_ids", shape=[batch_size, sequence_len], dtype='int64')
-        position_ids = static.data(
-            name="position_ids",
-            shape=[batch_size, sequence_len],
-            dtype='int64')
+        input_ids = static.data(name="input_ids",
+                                shape=[batch_size, sequence_len],
+                                dtype='int64')
+        position_ids = static.data(name="position_ids",
+                                   shape=[batch_size, sequence_len],
+                                   dtype='int64')
         attention_mask = static.data(
             name="attention_mask",
             shape=[batch_size, 1, sequence_len, sequence_len],
             dtype='float64')
-        labels = static.data(
-            name="labels", shape=[batch_size, sequence_len], dtype='int64')
-        loss_mask = static.data(
-            name="loss_mask", shape=[batch_size, sequence_len], dtype='float64')
+        labels = static.data(name="labels",
+                             shape=[batch_size, sequence_len],
+                             dtype='int64')
+        loss_mask = static.data(name="loss_mask",
+                                shape=[batch_size, sequence_len],
+                                dtype='float64')
 
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
-
-        gpt = GPTModel(
-            vocab_size=32768,
-            hidden_size=1024,
-            num_hidden_layers=2,
-            num_attention_heads=16,
-            intermediate_size=4096,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=1024,
-            type_vocab_size=16,
-            initializer_range=0.02,
-            pad_token_id=0,
-            topo=None)
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
+
+        gpt = GPTModel(vocab_size=32768,
+                       hidden_size=1024,
+                       num_hidden_layers=2,
+                       num_attention_heads=16,
+                       intermediate_size=4096,
+                       hidden_act="gelu",
+                       hidden_dropout_prob=0.1,
+                       attention_probs_dropout_prob=0.1,
+                       max_position_embeddings=1024,
+                       type_vocab_size=16,
+                       initializer_range=0.02,
+                       pad_token_id=0,
+                       topo=None)
 
         model = GPTForPretraining(gpt)
 
@@ -807,6 +796,7 @@ def gpt_pretrain_forward(train_program, start_program):
 
 
 class TestGPTAutoCompletion(unittest.TestCase):
+
     def test_gpt_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
@@ -816,8 +806,8 @@ def test_gpt_dp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = gpt_pretrain_forward(train_program,
-                                                            start_program)
+        train_program, start_program = gpt_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -832,8 +822,8 @@ def test_gpt_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = gpt_pretrain_forward(train_program,
-                                                            start_program)
+        train_program, start_program = gpt_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
@@ -849,8 +839,8 @@ def test_gpt_dp_mp(self):
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        train_program, start_program = gpt_pretrain_forward(train_program,
-                                                            start_program)
+        train_program, start_program = gpt_pretrain_forward(
+            train_program, start_program)
         completer = Completer(dist_context)
         complete_train_program = completer.complete_forward_annotation(
             train_program)
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
index d05e49387933d..bb8642d569e42 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
@@ -46,6 +46,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=256,
                  intermediate_size=4 * 256,
@@ -54,28 +55,34 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
         self.is_distributed = is_distributed
 
     def forward(self, input):
         if self.is_distributed:
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={"process_mesh": PP_MESH_0,
-                           "dims_mapping": [-1, 1]})
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={"process_mesh": PP_MESH_1,
-                           "dims_mapping": [1, -1]})
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -89,8 +96,9 @@ def get_single_node_data():
     train_program = paddle.static.Program()
     startup_program = paddle.static.Program()
 
-    loss, train_program, startup_program = mlp_forward(
-        train_program, startup_program, is_distributed=False)
+    loss, train_program, startup_program = mlp_forward(train_program,
+                                                       startup_program,
+                                                       is_distributed=False)
 
     cost_model = core.CostModel()
     cost_data = cost_model.profile_measure(train_program, startup_program,
@@ -112,31 +120,36 @@ def mlp_forward(train_program, start_program, is_distributed=True):
         hidden_size = 256
         sequence_len = 128
         if is_distributed:
-            input = static.data(
-                name="input", shape=[batch_size, hidden_size], dtype='float32')
-            label = static.data(
-                name="label", shape=[batch_size, 1], dtype='float32')
+            input = static.data(name="input",
+                                shape=[batch_size, hidden_size],
+                                dtype='float32')
+            label = static.data(name="label",
+                                shape=[batch_size, 1],
+                                dtype='float32')
         else:
-            input = paddle.ones(
-                name="input", shape=[batch_size, hidden_size], dtype='float32')
-            label = paddle.ones(
-                name="label", shape=[batch_size, 1], dtype='float32')
+            input = paddle.ones(name="input",
+                                shape=[batch_size, hidden_size],
+                                dtype='float32')
+            label = paddle.ones(name="label",
+                                shape=[batch_size, 1],
+                                dtype='float32')
 
         if is_distributed:
-            auto.shard_tensor(
-                input,
-                dist_attr={"process_mesh": PP_MESH_0,
-                           "dims_mapping": [0, -1]})
-            auto.shard_tensor(
-                label,
-                dist_attr={"process_mesh": PP_MESH_1,
-                           "dims_mapping": [0, -1]})
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02,
-            is_distributed=is_distributed)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [0, -1]
+                              })
+            auto.shard_tensor(label,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [0, -1]
+                              })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02,
+                       is_distributed=is_distributed)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -160,13 +173,12 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
         train_program)
     dist_context.block_state.parse_forward_blocks(complete_train_program)
 
-    params_grads = parallelizer._generate_backward(
-        complete_train_program,
-        startup_program,
-        loss,
-        parameter_list=None,
-        no_grad_set=None,
-        callbacks=None)
+    params_grads = parallelizer._generate_backward(complete_train_program,
+                                                   startup_program,
+                                                   loss,
+                                                   parameter_list=None,
+                                                   no_grad_set=None,
+                                                   callbacks=None)
 
     # logical partition
     partitioner = Partitioner(dist_context, rank_id)
@@ -207,18 +219,18 @@ def check_empty_program_memory(cost):
 
 
 class TestCostModel(unittest.TestCase):
+
     def test_empty_program_cost_model(self):
         empty_program = paddle.static.Program()
         startup_program = paddle.static.Program()
         standalone_cost_data = [{}]
         empty_pp_cfg = None
         cluster = None
-        cost = estimate_cost(
-            [empty_program],
-            cluster=cluster,
-            pipeline_config=empty_pp_cfg,
-            standalone_cost_data=standalone_cost_data,
-            batch_size=1)
+        cost = estimate_cost([empty_program],
+                             cluster=cluster,
+                             pipeline_config=empty_pp_cfg,
+                             standalone_cost_data=standalone_cost_data,
+                             batch_size=1)
 
         self.assertTrue(check_empty_program_runtime(cost))
         self.assertTrue(check_empty_program_memory(cost))
@@ -237,12 +249,11 @@ def test_auto_parallel_cost_model(self):
             resharder.reshard()
             dist_program.append(distributed_program)
         cluster = None
-        cost = estimate_cost(
-            dist_program,
-            cluster=cluster,
-            pipeline_config=pp_cfg,
-            standalone_cost_data=standalone_cost_data,
-            batch_size=4)
+        cost = estimate_cost(dist_program,
+                             cluster=cluster,
+                             pipeline_config=pp_cfg,
+                             standalone_cost_data=standalone_cost_data,
+                             batch_size=4)
         self.assertTrue(check_runtime_estimation(cost))
         self.assertTrue(check_memory_estimation(cost))
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py
index 6cc953dfdee9a..c8753002aa603 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py
@@ -21,6 +21,7 @@
 
 
 class TestAutoParallelDataUnshard(TestMultipleGpus):
+
     def test_auto_parallel_data_unshard(self):
         self.run_mnist_2gpu('auto_parallel_data_unshard.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py
index 29575dc76c2a1..ca69535049c3b 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py
@@ -49,13 +49,12 @@ def get_dist_prog(train_program,
     ) if complete_train_program is None else complete_train_program
     dist_context.block_state.parse_forward_blocks(complete_train_program)
 
-    params_grads = parallelizer._generate_backward(
-        complete_train_program,
-        startup_program,
-        loss,
-        parameter_list=None,
-        no_grad_set=None,
-        callbacks=None)
+    params_grads = parallelizer._generate_backward(complete_train_program,
+                                                   startup_program,
+                                                   loss,
+                                                   parameter_list=None,
+                                                   no_grad_set=None,
+                                                   callbacks=None)
 
     # logical partition
     partitioner = Partitioner(dist_context, rank_id)
@@ -69,6 +68,7 @@ def get_dist_prog(train_program,
 
 
 class TestDistributedTensor(unittest.TestCase):
+
     def test_new_local_tensor(self):
         test_auto_parallel_reshard._global_process_mesh = auto.ProcessMesh(
             mesh=[0, 1])
@@ -130,40 +130,46 @@ def test_static_method(self):
         topology = [2, 3]
         global_sizes = [6, 6]
 
-        # rank 0 [(0, 2), (0, 3)]  
+        # rank 0 [(0, 2), (0, 3)]
         # rank 1 [(2, 4), (0, 3)]
         # rank 4 [(2, 4), (3, 6)]
         rank = 0
-        local_sizes = DistributedTensor.get_local_sizes(
-            global_sizes, dims_mapping, topology, processes)
+        local_sizes = DistributedTensor.get_local_sizes(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes)
         self.assertEqual(local_sizes, [2, 3])
         local_offsets = DistributedTensor.get_local_offsets(
             global_sizes, dims_mapping, topology, processes, rank)
         self.assertEqual(local_offsets, [0, 0])
-        local_shard = DistributedTensor.get_local_shard(
-            global_sizes, dims_mapping, topology, processes, rank)
+        local_shard = DistributedTensor.get_local_shard(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes, rank)
         self.assertEqual(local_shard, [(0, 2), (0, 3)])
 
         rank = 1
-        local_sizes = DistributedTensor.get_local_sizes(
-            global_sizes, dims_mapping, topology, processes)
+        local_sizes = DistributedTensor.get_local_sizes(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes)
         self.assertEqual(local_sizes, [2, 3])
         local_offsets = DistributedTensor.get_local_offsets(
             global_sizes, dims_mapping, topology, processes, rank)
         self.assertEqual(local_offsets, [2, 0])
-        local_shard = DistributedTensor.get_local_shard(
-            global_sizes, dims_mapping, topology, processes, rank)
+        local_shard = DistributedTensor.get_local_shard(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes, rank)
         self.assertEqual(local_shard, [(2, 4), (0, 3)])
 
         rank = 4
-        local_sizes = DistributedTensor.get_local_sizes(
-            global_sizes, dims_mapping, topology, processes)
+        local_sizes = DistributedTensor.get_local_sizes(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes)
         self.assertEqual(local_sizes, [2, 3])
         local_offsets = DistributedTensor.get_local_offsets(
             global_sizes, dims_mapping, topology, processes, rank)
         self.assertEqual(local_offsets, [2, 3])
-        local_shard = DistributedTensor.get_local_shard(
-            global_sizes, dims_mapping, topology, processes, rank)
+        local_shard = DistributedTensor.get_local_shard(global_sizes,
+                                                        dims_mapping, topology,
+                                                        processes, rank)
         self.assertEqual(local_shard, [(2, 4), (3, 6)])
 
         # global sizes
@@ -177,11 +183,12 @@ def test_instance_method(self):
         tensor_dist_attr.dims_mapping = [1, 0]
         tensor_dist_attr.process_mesh = auto.ProcessMesh(
             mesh=[[0, 1, 2], [3, 4, 5]])
-        serial_tensor = paddle.static.data(
-            name="data", shape=[6, 6], dtype='float32')
+        serial_tensor = paddle.static.data(name="data",
+                                           shape=[6, 6],
+                                           dtype='float32')
         dist_tensor = DistributedTensor(serial_tensor, tensor_dist_attr)
 
-        # rank 0 [(0, 2), (0, 3)]  
+        # rank 0 [(0, 2), (0, 3)]
         # rank 1 [(2, 4), (0, 3)]
         # rank 4 [(2, 4), (3, 6)]
         rank = 0
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py
index bbf7e3a46729e..3d69924fd6f30 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py
@@ -23,6 +23,7 @@
 
 
 class TestAutoParallelGraph(unittest.TestCase):
+
     def test_graph(self):
         graph = Graph(name="foo")
         self.assertEqual(graph.attrs["name"], "foo")
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py
index 45b9defeb7c2f..a147b0f1f376a 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py
@@ -375,6 +375,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=64,
                  intermediate_size=4 * 64,
@@ -392,54 +393,57 @@ def __init__(self,
         weight_attr2 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr2))
         weight_attr3 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr3))
         bias_attr = None
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr0,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr1,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
-        self.linear2 = nn.Linear(
-            d_model, dim_feedforward, weight_attr2, bias_attr=bias_attr)
-        self.linear3 = nn.Linear(
-            dim_feedforward, d_model, weight_attr3, bias_attr=bias_attr)
+        self.linear2 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr2,
+                                 bias_attr=bias_attr)
+        self.linear3 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr3,
+                                 bias_attr=bias_attr)
 
     def forward(self, input):
         if _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh[0],
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh[0],
-                    "dims_mapping": [1, -1]
-                })
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh[1],
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.linear3.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh[1],
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh[0],
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh[0],
+                                  "dims_mapping": [1, -1]
+                              })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh[1],
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear3.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh[1],
+                                  "dims_mapping": [1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
         out = F.gelu(out, approximate=True)
         out = self.linear1(out)
 
-        auto.shard_tensor(
-            out,
-            dist_attr={
-                "process_mesh": _global_process_mesh[1],
-                "dims_mapping": [0, -1]
-            })
+        auto.shard_tensor(out,
+                          dist_attr={
+                              "process_mesh": _global_process_mesh[1],
+                              "dims_mapping": [0, -1]
+                          })
         out = self.linear2(out)
         out = F.gelu(out, approximate=True)
         out = self.linear3(out)
@@ -451,22 +455,22 @@ def mlp_forward(train_program, start_program):
         utils.unique_name.guard():
         batch_size = 4
         hidden_size = 64
-        input = static.data(
-            name="input", shape=[batch_size, hidden_size], dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
 
         if _global_parallel_strategy == "dp_mp_pp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh[0],
-                    "dims_mapping": [0, -1]
-                })
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh[0],
+                                  "dims_mapping": [0, -1]
+                              })
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
         loss = paddle.mean(error_cost)
@@ -487,13 +491,12 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     complete_train_program = completer.complete_forward_annotation(
         train_program)
     dist_context.block_state.parse_forward_blocks(complete_train_program)
-    params_grads = parallelizer._generate_backward(
-        complete_train_program,
-        startup_program,
-        loss,
-        parameter_list=None,
-        no_grad_set=None,
-        callbacks=None)
+    params_grads = parallelizer._generate_backward(complete_train_program,
+                                                   startup_program,
+                                                   loss,
+                                                   parameter_list=None,
+                                                   no_grad_set=None,
+                                                   callbacks=None)
 
     partitioner = Partitioner(dist_context, rank_id)
     dist_train_program, dist_startup_prog, dist_params_grads = partitioner.partition(
@@ -523,6 +526,7 @@ def get_device_local_ids(machine):
 
 
 class TestAutoParallelMapper(unittest.TestCase):
+
     def test_mapper_dp_mp_pp(self):
         cluster_json_file = ""
         cluster_json_object = json.loads(cluster_json)
@@ -564,8 +568,8 @@ def test_mapper_dp_mp_pp(self):
                 self.assertTrue(is_in_machine(device_ids[0], machine))
                 machine_mapped_ranks.add(rank)
                 machine_mapped_device_local_ids.add(device_ids[0])
-            self.assertEqual(
-                len(machine_mapped_ranks), len(machine_mapped_device_local_ids))
+            self.assertEqual(len(machine_mapped_ranks),
+                             len(machine_mapped_device_local_ids))
             all_mapped_ranks.update(machine_mapped_ranks)
         self.assertEqual(set(processes), all_mapped_ranks)
 
@@ -596,24 +600,30 @@ def test_mapper_misc(self):
             broadcast_op = train_program.global_block().append_op(
                 type="c_broadcast",
                 inputs={'X': input},
-                attrs={'ring_id': ring_id,
-                       'root': root_id},
+                attrs={
+                    'ring_id': ring_id,
+                    'root': root_id
+                },
                 outputs={'Out': output})
             self.assertEqual(get_comm_volume(broadcast_op, 0, 1), 400)
             self.assertEqual(get_comm_volume(broadcast_op, 1, 0), None)
             allgather_op = train_program.global_block().append_op(
                 type="c_allgather",
                 inputs={'X': input},
-                attrs={'ring_id': ring_id,
-                       'nranks': nranks},
+                attrs={
+                    'ring_id': ring_id,
+                    'nranks': nranks
+                },
                 outputs={'Out': output})
             self.assertEqual(get_comm_volume(allgather_op, 0, 1), 400)
             self.assertEqual(get_comm_volume(allgather_op, 0, 0), None)
             reduce_op = train_program.global_block().append_op(
                 type="c_reduce_sum",
                 inputs={'X': input},
-                attrs={'ring_id': ring_id,
-                       'root_id': root_id},
+                attrs={
+                    'ring_id': ring_id,
+                    'root_id': root_id
+                },
                 outputs={'Out': output})
             self.assertEqual(get_comm_volume(reduce_op, 0, 1), None)
             self.assertEqual(get_comm_volume(reduce_op, 1, 0), 400)
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
index ef8780a020f33..80135b6288531 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
@@ -96,13 +96,13 @@ def initialization_check(mode, dist_context, dist_startup_prog,
                          serial_startup_prog, var_need_broadcast, process_mesh,
                          mp_parallel_axis, dp_parallel_axis):
     if 'mp' in mode:
-        group_ranks = _get_comm_group(
-            process_mesh.processes, process_mesh.topology, mp_parallel_axis, 3)
+        group_ranks = _get_comm_group(process_mesh.processes,
+                                      process_mesh.topology, mp_parallel_axis,
+                                      3)
         mp_ring_id = new_process_group(group_ranks).id
         broadcast_ops = [
-            op for op in dist_startup_prog.global_block().ops
-            if (op.type == "c_broadcast" and op.desc.attr("ring_id") ==
-                mp_ring_id)
+            op for op in dist_startup_prog.global_block().ops if
+            (op.type == "c_broadcast" and op.desc.attr("ring_id") == mp_ring_id)
         ]
         broadcast_varnames = sorted(
             [op.desc.output_arg_names()[0] for op in broadcast_ops])
@@ -110,14 +110,14 @@ def initialization_check(mode, dist_context, dist_startup_prog,
             return False
 
     if 'dp' in mode:
-        group_ranks = _get_comm_group(
-            process_mesh.processes, process_mesh.topology, dp_parallel_axis, 3)
+        group_ranks = _get_comm_group(process_mesh.processes,
+                                      process_mesh.topology, dp_parallel_axis,
+                                      3)
         dp_ring_id = new_process_group(group_ranks).id
         nparam = len(serial_startup_prog.all_parameters())
         nbroadcast_dp = len([
-            op for op in dist_startup_prog.global_block().ops
-            if (op.type == "c_broadcast" and op.desc.attr("ring_id") ==
-                dp_ring_id)
+            op for op in dist_startup_prog.global_block().ops if
+            (op.type == "c_broadcast" and op.desc.attr("ring_id") == dp_ring_id)
         ])
         if nparam != nbroadcast_dp:
             return False
@@ -226,7 +226,7 @@ def distributed_attr_check_for_dist_op(serial_main_prog, dist_main_prog,
             equal = check_equal_var_dist_attr(serial_out_dist_attr,
                                               out_dist_attr)
 
-        # check op's dist_attr 
+        # check op's dist_attr
         equal = check_equal_dist_op_attr(dist_context, dist_main_prog,
                                          serial_op, dist_ops, dist_op_idx[i])
 
@@ -251,6 +251,7 @@ def distributed_attr_check_for_program(dist_main_prog, dist_context):
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -259,57 +260,55 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
 
     def forward(self, input):
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
         else:
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -326,36 +325,33 @@ def mlp_pretrain_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="input",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
 
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       dropout_ratio=0.1,
+                       initializer_range=0.02)
         out = mlp(input)
     return train_program, start_program
 
 
 class TestMLPAutoPartitioner(unittest.TestCase):
+
     def test_mlp_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
@@ -379,18 +375,17 @@ def test_mlp_dp(self):
         dist_ops = [op.type for op in dist_ops]
         self.assertTrue(serial_ops == dist_ops)
 
-        # parameter initialization 
+        # parameter initialization
         var_need_broadcast = []
         self.assertTrue(
-            initialization_check(
-                _global_parallel_strategy,
-                dist_context,
-                dist_startup_prog,
-                serial_startup_prog,
-                var_need_broadcast,
-                _global_process_mesh,
-                mp_parallel_axis=None,
-                dp_parallel_axis=0))
+            initialization_check(_global_parallel_strategy,
+                                 dist_context,
+                                 dist_startup_prog,
+                                 serial_startup_prog,
+                                 var_need_broadcast,
+                                 _global_process_mesh,
+                                 mp_parallel_axis=None,
+                                 dp_parallel_axis=0))
 
     def test_mlp_mp(self):
         global _global_parallel_strategy
@@ -430,19 +425,18 @@ def test_mlp_mp(self):
         ]
         self.assertTrue(dist_ops == ref_ops)
 
-        # parameter initialization 
+        # parameter initialization
         var_need_broadcast = sorted(
             ['layer_norm_0.b_0', 'layer_norm_0.w_0', 'linear_1.b_0'])
         self.assertTrue(
-            initialization_check(
-                _global_parallel_strategy,
-                dist_context,
-                dist_startup_prog,
-                serial_startup_prog,
-                var_need_broadcast,
-                _global_process_mesh,
-                mp_parallel_axis=0,
-                dp_parallel_axis=None))
+            initialization_check(_global_parallel_strategy,
+                                 dist_context,
+                                 dist_startup_prog,
+                                 serial_startup_prog,
+                                 var_need_broadcast,
+                                 _global_process_mesh,
+                                 mp_parallel_axis=0,
+                                 dp_parallel_axis=None))
 
         # check var and op all have dist_attr in dist_main_program
         self.assertTrue(
@@ -498,15 +492,14 @@ def test_mlp_dp_mp(self):
         var_need_broadcast = sorted(
             ['layer_norm_0.b_0', 'layer_norm_0.w_0', 'linear_1.b_0'])
         self.assertTrue(
-            initialization_check(
-                _global_parallel_strategy,
-                dist_context,
-                dist_startup_prog,
-                serial_startup_prog,
-                var_need_broadcast,
-                _global_process_mesh,
-                mp_parallel_axis=1,
-                dp_parallel_axis=0))
+            initialization_check(_global_parallel_strategy,
+                                 dist_context,
+                                 dist_startup_prog,
+                                 serial_startup_prog,
+                                 var_need_broadcast,
+                                 _global_process_mesh,
+                                 mp_parallel_axis=1,
+                                 dp_parallel_axis=0))
 
         # check var and op all have dist_attr in dist_main_program
         self.assertTrue(
@@ -521,6 +514,7 @@ def test_mlp_dp_mp(self):
 
 
 class AttentionLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  sequence_len=512,
@@ -542,34 +536,40 @@ def __init__(self,
         self.initializer_range = initializer_range
         self.training = True
         self.attn_mask = None
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.q_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.k_proj = nn.Linear(
-            self.kdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.v_proj = nn.Linear(
-            self.vdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.out_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
+        self.q_proj = nn.Linear(self.embed_dim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.k_proj = nn.Linear(self.kdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.v_proj = nn.Linear(self.vdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.out_proj = nn.Linear(self.embed_dim,
+                                  self.embed_dim,
+                                  weight_attr,
+                                  bias_attr=bias_attr)
 
     def forward(self, input):
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1, -1]
+                              })
 
         q = self.q_proj(input)
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
@@ -579,43 +579,37 @@ def forward(self, input):
         v = self.v_proj(input)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -623,8 +617,10 @@ def forward(self, input):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
 
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.head_dim**-0.5)
 
         if self.attn_mask is not None:
             product = product + self.attn_mask
@@ -632,11 +628,10 @@ def forward(self, input):
         weights = F.softmax(product)
 
         if self.dropout_ratio:
-            weights = F.dropout(
-                weights,
-                self.dropout_ratio,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout_ratio,
+                                training=self.training,
+                                mode="upscale_in_train")
 
         out = tensor.matmul(weights, v)
 
@@ -647,19 +642,17 @@ def forward(self, input):
         # project to output
         out = self.out_proj(out)
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         return out
 
@@ -670,23 +663,22 @@ def attn_pretrain_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="query",
-            shape=[batch_size, sequence_len, hidden_size],
-            dtype='float32')
-        attn = AttentionLayer(
-            hidden_size=hidden_size,
-            sequence_len=sequence_len,
-            intermediate_size=4 * hidden_size,
-            num_heads=16,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        input = static.data(name="query",
+                            shape=[batch_size, sequence_len, hidden_size],
+                            dtype='float32')
+        attn = AttentionLayer(hidden_size=hidden_size,
+                              sequence_len=sequence_len,
+                              intermediate_size=4 * hidden_size,
+                              num_heads=16,
+                              dropout_ratio=0.1,
+                              initializer_range=0.02)
         out = attn(input)
 
     return train_program, start_program
 
 
 class TestAttentionAutoPartitioner(unittest.TestCase):
+
     def test_attn_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
@@ -709,18 +701,17 @@ def test_attn_dp(self):
         dist_ops = [op.type for op in dist_ops]
         self.assertTrue(serial_ops == dist_ops)
 
-        # parameter initialization 
+        # parameter initialization
         var_need_broadcast = []
         self.assertTrue(
-            initialization_check(
-                _global_parallel_strategy,
-                dist_context,
-                dist_startup_prog,
-                serial_startup_prog,
-                var_need_broadcast,
-                _global_process_mesh,
-                mp_parallel_axis=None,
-                dp_parallel_axis=0))
+            initialization_check(_global_parallel_strategy,
+                                 dist_context,
+                                 dist_startup_prog,
+                                 serial_startup_prog,
+                                 var_need_broadcast,
+                                 _global_process_mesh,
+                                 mp_parallel_axis=None,
+                                 dp_parallel_axis=0))
 
     def test_attn_mp(self):
         global _global_parallel_strategy
@@ -765,18 +756,17 @@ def test_attn_mp(self):
         ]
         self.assertTrue(dist_ops == ref_ops)
 
-        # parameter initialization 
+        # parameter initialization
         var_need_broadcast = ['linear_3.b_0']
         self.assertTrue(
-            initialization_check(
-                _global_parallel_strategy,
-                dist_context,
-                dist_startup_prog,
-                serial_startup_prog,
-                var_need_broadcast,
-                _global_process_mesh,
-                mp_parallel_axis=0,
-                dp_parallel_axis=None))
+            initialization_check(_global_parallel_strategy,
+                                 dist_context,
+                                 dist_startup_prog,
+                                 serial_startup_prog,
+                                 var_need_broadcast,
+                                 _global_process_mesh,
+                                 mp_parallel_axis=0,
+                                 dp_parallel_axis=None))
 
         # check var and op all have dist_attr in dist_main_program
         self.assertTrue(
@@ -833,18 +823,17 @@ def test_attn_dp_mp(self):
         ]
         self.assertTrue(dist_ops == ref_ops)
 
-        # parameter initialization 
+        # parameter initialization
         var_need_broadcast = ['linear_3.b_0']
         self.assertTrue(
-            initialization_check(
-                _global_parallel_strategy,
-                dist_context,
-                dist_startup_prog,
-                serial_startup_prog,
-                var_need_broadcast,
-                _global_process_mesh,
-                mp_parallel_axis=1,
-                dp_parallel_axis=0))
+            initialization_check(_global_parallel_strategy,
+                                 dist_context,
+                                 dist_startup_prog,
+                                 serial_startup_prog,
+                                 var_need_broadcast,
+                                 _global_process_mesh,
+                                 mp_parallel_axis=1,
+                                 dp_parallel_axis=0))
 
         # check var and op all have dist_attr in dist_main_program
         self.assertTrue(
@@ -859,6 +848,7 @@ def test_attn_dp_mp(self):
 
 
 class DecoderLayer(nn.Layer):
+
     def __init__(self,
                  vocab_size=32768,
                  hidden_size=1024,
@@ -888,29 +878,37 @@ def __init__(self,
         self.word_embeddings = nn.Embedding(
             self.vocab_size,
             self.hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="word_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=self.initializer_range)))
+            weight_attr=paddle.ParamAttr(name="word_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0,
+                                             std=self.initializer_range)))
         self.position_embeddings = nn.Embedding(
             self.max_position_embeddings,
             self.hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="pos_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=self.initializer_range)))
+            weight_attr=paddle.ParamAttr(name="pos_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0,
+                                             std=self.initializer_range)))
 
         weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
             mean=0.0, std=self.initializer_range))
         bias_attr = None
-        self.q_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.k_proj = nn.Linear(
-            self.kdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.v_proj = nn.Linear(
-            self.vdim, self.embed_dim, weight_attr, bias_attr=bias_attr)
-        self.out_proj = nn.Linear(
-            self.embed_dim, self.embed_dim, weight_attr, bias_attr=bias_attr)
+        self.q_proj = nn.Linear(self.embed_dim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.k_proj = nn.Linear(self.kdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.v_proj = nn.Linear(self.vdim,
+                                self.embed_dim,
+                                weight_attr,
+                                bias_attr=bias_attr)
+        self.out_proj = nn.Linear(self.embed_dim,
+                                  self.embed_dim,
+                                  weight_attr,
+                                  bias_attr=bias_attr)
 
         intermediate_size = 4 * self.hidden_size
         d_model = self.hidden_size
@@ -918,10 +916,14 @@ def __init__(self,
         weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
             mean=0.0, std=self.initializer_range))
         bias_attr = None
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
         self.dropout1 = nn.Dropout(self.dropout_ratio)
         self.dropout2 = nn.Dropout(self.dropout_ratio, mode="upscale_in_train")
@@ -929,37 +931,33 @@ def __init__(self,
 
     def forward(self, input_ids, position_ids):
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
 
         input_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout1(embeddings)
@@ -976,43 +974,37 @@ def forward(self, input_ids, position_ids):
         v = self.v_proj(target)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -1020,8 +1012,10 @@ def forward(self, input_ids, position_ids):
         v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
 
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.head_dim**-0.5)
 
         if self.attn_mask is not None:
             product = product + self.attn_mask
@@ -1029,11 +1023,10 @@ def forward(self, input_ids, position_ids):
         weights = F.softmax(product)
 
         if self.dropout_ratio:
-            weights = F.dropout(
-                weights,
-                self.dropout_ratio,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout_ratio,
+                                training=self.training,
+                                mode="upscale_in_train")
 
         out = tensor.matmul(weights, v)
 
@@ -1045,26 +1038,23 @@ def forward(self, input_ids, position_ids):
         out = self.out_proj(out)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
         else:
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
 
         # Add residual
         residual = embeddings + self.dropout2(out)
@@ -1078,31 +1068,27 @@ def forward(self, input_ids, position_ids):
         out3 = self.linear1(out2)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         # Add residual
         final = residual + self.dropout3(out3)
@@ -1115,27 +1101,27 @@ def decoder_pretrain_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input_ids = static.data(
-            name="input_ids", shape=[batch_size, sequence_len], dtype='int64')
-        position_ids = static.data(
-            name="position_ids",
-            shape=[batch_size, sequence_len],
-            dtype='int64')
-        decoder = DecoderLayer(
-            vocab_size=32768,
-            hidden_size=hidden_size,
-            sequence_len=sequence_len,
-            max_position_embeddings=512,
-            intermediate_size=4 * hidden_size,
-            num_heads=16,
-            dropout_ratio=0.1,
-            initializer_range=0.02)
+        input_ids = static.data(name="input_ids",
+                                shape=[batch_size, sequence_len],
+                                dtype='int64')
+        position_ids = static.data(name="position_ids",
+                                   shape=[batch_size, sequence_len],
+                                   dtype='int64')
+        decoder = DecoderLayer(vocab_size=32768,
+                               hidden_size=hidden_size,
+                               sequence_len=sequence_len,
+                               max_position_embeddings=512,
+                               intermediate_size=4 * hidden_size,
+                               num_heads=16,
+                               dropout_ratio=0.1,
+                               initializer_range=0.02)
         out = decoder(input_ids, position_ids)
 
     return train_program, start_program
 
 
 class TestDecoderLayerPartitioner(unittest.TestCase):
+
     def test_decoder_dp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp_mp"
@@ -1191,21 +1177,20 @@ def test_decoder_dp_mp(self):
         ]
         self.assertTrue(dist_ops == ref_ops)
 
-        # parameter initialization 
+        # parameter initialization
         var_need_broadcast = sorted([
             'linear_3.b_0', 'pos_embeddings', 'layer_norm_0.b_0',
             'layer_norm_0.w_0', 'linear_5.b_0'
         ])
         self.assertTrue(
-            initialization_check(
-                _global_parallel_strategy,
-                dist_context,
-                dist_startup_prog,
-                serial_startup_prog,
-                var_need_broadcast,
-                _global_process_mesh,
-                mp_parallel_axis=1,
-                dp_parallel_axis=0))
+            initialization_check(_global_parallel_strategy,
+                                 dist_context,
+                                 dist_startup_prog,
+                                 serial_startup_prog,
+                                 var_need_broadcast,
+                                 _global_process_mesh,
+                                 mp_parallel_axis=1,
+                                 dp_parallel_axis=0))
 
         # check var and op all have dist_attr in dist_main_program
         self.assertTrue(
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
index 07d94d1b76fa8..96738a466626e 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
@@ -125,17 +125,27 @@ def __init__(self,
             if self.fuse:
                 assert self.kdim == embed_dim
                 assert self.vdim == embed_dim
-                self.qkv_proj = nn.Linear(
-                    embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr)
+                self.qkv_proj = nn.Linear(embed_dim,
+                                          3 * embed_dim,
+                                          weight_attr,
+                                          bias_attr=bias_attr)
             else:
-                self.q_proj = nn.Linear(
-                    embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
-                self.k_proj = nn.Linear(
-                    self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
-                self.v_proj = nn.Linear(
-                    self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
-            self.out_proj = nn.Linear(
-                embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
+                self.q_proj = nn.Linear(embed_dim,
+                                        embed_dim,
+                                        weight_attr,
+                                        bias_attr=bias_attr)
+                self.k_proj = nn.Linear(self.kdim,
+                                        embed_dim,
+                                        weight_attr,
+                                        bias_attr=bias_attr)
+                self.v_proj = nn.Linear(self.vdim,
+                                        embed_dim,
+                                        weight_attr,
+                                        bias_attr=bias_attr)
+            self.out_proj = nn.Linear(embed_dim,
+                                      embed_dim,
+                                      weight_attr,
+                                      bias_attr=bias_attr)
 
     def _fuse_prepare_qkv(self, query):
         mix_layer = self.qkv_proj(query)
@@ -154,19 +164,17 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         q = self.q_proj(query)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.q_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.q_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
         q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
@@ -199,36 +207,32 @@ def compute_kv(self, key, value):
         k = self.k_proj(key)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.k_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.k_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         v = self.v_proj(value)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.v_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.v_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -284,19 +288,20 @@ def forward(self,
             q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,
                                                cache)
         # scale dot product attention
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = layers.matmul(x=q,
+                                y=k,
+                                transpose_y=True,
+                                alpha=self.head_dim**-0.5)
 
         if attn_mask is not None:
             product = product + attn_mask
 
         weights = F.softmax(product)
         if self.dropout:
-            weights = F.dropout(
-                weights,
-                self.dropout,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout,
+                                training=self.training,
+                                mode="upscale_in_train")
 
         out = tensor.matmul(weights, v)
 
@@ -308,19 +313,17 @@ def forward(self,
         out = self.out_proj(out)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.out_proj.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.out_proj.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         outs = [out]
         if self.need_weights:
@@ -442,24 +445,21 @@ def __init__(self,
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
 
-        self.self_attn = MultiHeadAttention(
-            d_model,
-            nhead,
-            dropout=attn_dropout,
-            weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0],
-            topo=topo)
+        self.self_attn = MultiHeadAttention(d_model,
+                                            nhead,
+                                            dropout=attn_dropout,
+                                            weight_attr=weight_attrs[0],
+                                            bias_attr=bias_attrs[0],
+                                            topo=topo)
         if topo is None or topo.mp_info.size == 1:
-            self.linear1 = nn.Linear(
-                d_model,
-                dim_feedforward,
-                weight_attrs[2],
-                bias_attr=bias_attrs[2])
-            self.linear2 = nn.Linear(
-                dim_feedforward,
-                d_model,
-                weight_attrs[2],
-                bias_attr=bias_attrs[2])
+            self.linear1 = nn.Linear(d_model,
+                                     dim_feedforward,
+                                     weight_attrs[2],
+                                     bias_attr=bias_attrs[2])
+            self.linear2 = nn.Linear(dim_feedforward,
+                                     d_model,
+                                     weight_attrs[2],
+                                     bias_attr=bias_attrs[2])
 
         self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
         self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
@@ -487,34 +487,30 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
             tgt = self.norm2(tgt)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 0]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 0]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, 1]
-                })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, 1]
+                              })
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.linear2.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.linear2.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         # tgt = self.dropout2(
         #     self.linear2(F.gelu(
@@ -530,8 +526,8 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
         return tgt if use_cache is False else (tgt, incremental_cache)
 
     def gen_cache(self, memory):
-        incremental_cache = self.self_attn.gen_cache(
-            memory, type=self.self_attn.Cache)
+        incremental_cache = self.self_attn.gen_cache(memory,
+                                                     type=self.self_attn.Cache)
         return incremental_cache
 
 
@@ -553,17 +549,16 @@ def __init__(self,
             self.word_embeddings = nn.Embedding(
                 vocab_size,
                 hidden_size,
-                weight_attr=paddle.ParamAttr(
-                    name="word_embeddings",
-                    initializer=nn.initializer.Normal(
-                        mean=0.0, std=initializer_range)))
+                weight_attr=paddle.ParamAttr(name="word_embeddings",
+                                             initializer=nn.initializer.Normal(
+                                                 mean=0.0,
+                                                 std=initializer_range)))
         self.position_embeddings = nn.Embedding(
             max_position_embeddings,
             hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="pos_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=initializer_range)))
+            weight_attr=paddle.ParamAttr(name="pos_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0, std=initializer_range)))
 
         self.dropout = nn.Dropout(hidden_dropout_prob)
 
@@ -576,19 +571,17 @@ def forward(self, input_ids, position_ids=None):
         input_embedings = self.word_embeddings(input_ids)
 
         if _global_parallel_strategy == "mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                self.word_embeddings.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [1, -1]
-                })
+            auto.shard_tensor(self.word_embeddings.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [1, -1]
+                              })
 
         position_embeddings = self.position_embeddings(position_ids)
         embeddings = input_embedings + position_embeddings
@@ -627,37 +620,36 @@ def __init__(self,
         if self.pipline_mode:
             self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size
 
-        self.embeddings = GPTEmbeddings(
-            vocab_size, hidden_size, hidden_dropout_prob,
-            max_position_embeddings, type_vocab_size, self.initializer_range,
-            topo)
+        self.embeddings = GPTEmbeddings(vocab_size, hidden_size,
+                                        hidden_dropout_prob,
+                                        max_position_embeddings,
+                                        type_vocab_size, self.initializer_range,
+                                        topo)
 
         decoder_layers = nn.LayerList()
         for i in range(num_hidden_layers):
             DecoderLayer = TransformerDecoderLayer
             decoder_layers.append(
-                DecoderLayer(
-                    d_model=hidden_size,
-                    nhead=num_attention_heads,
-                    dim_feedforward=intermediate_size,
-                    dropout=hidden_dropout_prob,
-                    activation=hidden_act,
-                    attn_dropout=attention_probs_dropout_prob,
-                    act_dropout=hidden_dropout_prob,
-                    weight_attr=paddle.ParamAttr(
-                        initializer=nn.initializer.Normal(
-                            mean=0.0, std=self.initializer_range)),
-                    bias_attr=None,
-                    topo=topo))
+                DecoderLayer(d_model=hidden_size,
+                             nhead=num_attention_heads,
+                             dim_feedforward=intermediate_size,
+                             dropout=hidden_dropout_prob,
+                             activation=hidden_act,
+                             attn_dropout=attention_probs_dropout_prob,
+                             act_dropout=hidden_dropout_prob,
+                             weight_attr=paddle.ParamAttr(
+                                 initializer=nn.initializer.Normal(
+                                     mean=0.0, std=self.initializer_range)),
+                             bias_attr=None,
+                             topo=topo))
 
         Decoder = TransformerDecoder
 
-        self.decoder = Decoder(
-            decoder_layers,
-            num_hidden_layers,
-            norm="LayerNorm",
-            hidden_size=hidden_size,
-            topo=topo)
+        self.decoder = Decoder(decoder_layers,
+                               num_hidden_layers,
+                               norm="LayerNorm",
+                               hidden_size=hidden_size,
+                               topo=topo)
 
         self.checkpoints = []
 
@@ -672,29 +664,27 @@ def forward(self,
             length = paddle.shape(input_ids)[1]
             # Use bool mask
             attention_mask = paddle.tensor.tril(
-                paddle.ones(
-                    (length, length),
-                    dtype=self.embeddings.word_embeddings.weight.dtype))
+                paddle.ones((length, length),
+                            dtype=self.embeddings.word_embeddings.weight.dtype))
         if position_ids is None:
             past_length = 0
             if cache is not None:
                 past_length = paddle.shape(cache[0].k)[-2]
-            position_ids = paddle.arange(
-                past_length,
-                paddle.shape(input_ids)[-1] + past_length,
-                dtype='int64')
+            position_ids = paddle.arange(past_length,
+                                         paddle.shape(input_ids)[-1] +
+                                         past_length,
+                                         dtype='int64')
             position_ids = position_ids.unsqueeze(0)
             # .expand_as(input_ids)
-            position_ids = paddle.fluid.layers.expand_as(position_ids,
-                                                         input_ids)
-        embedding_output = self.embeddings(
-            input_ids=input_ids, position_ids=position_ids)
+            position_ids = paddle.fluid.layers.expand_as(
+                position_ids, input_ids)
+        embedding_output = self.embeddings(input_ids=input_ids,
+                                           position_ids=position_ids)
 
         # TODO, use registered buffer
-        causal_mask = paddle.tensor.triu(
-            paddle.ones((paddle.shape(input_ids)[-1],
-                         paddle.shape(input_ids)[-1])) * -1e9,
-            diagonal=1)
+        causal_mask = paddle.tensor.triu(paddle.ones(
+            (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e9,
+                                         diagonal=1)
 
         if attention_mask is not None:
             attention_mask = attention_mask + causal_mask
@@ -704,12 +694,11 @@ def forward(self,
         # The tensor returned by triu not in static graph.
         attention_mask.stop_gradient = True
 
-        encoder_outputs = self.decoder(
-            embedding_output,
-            memory=None,
-            tgt_mask=attention_mask,
-            use_cache=use_cache,
-            cache=cache)
+        encoder_outputs = self.decoder(embedding_output,
+                                       memory=None,
+                                       tgt_mask=attention_mask,
+                                       use_cache=use_cache,
+                                       cache=cache)
         self.checkpoints.extend(self.decoder.checkpoints)
         return encoder_outputs
 
@@ -733,8 +722,9 @@ def parallel_matmul(self, lm_output, logit_weights, parallel_output, topo):
             input_parallel = paddle.distributed.collective._c_identity(
                 lm_output, group=None)
 
-            logits = paddle.matmul(
-                input_parallel, logit_weights, transpose_y=True)
+            logits = paddle.matmul(input_parallel,
+                                   logit_weights,
+                                   transpose_y=True)
 
             if parallel_output:
                 return logits
@@ -797,50 +787,49 @@ def gpt_pretrain_forward(train_program, startup_program):
                               startup_program), utils.unique_name.guard():
         batch_size = 16
         sequence_len = 512
-        input_ids = static.data(
-            name="input_ids", shape=[batch_size, sequence_len], dtype='int64')
-        position_ids = static.data(
-            name="position_ids",
-            shape=[batch_size, sequence_len],
-            dtype='int64')
+        input_ids = static.data(name="input_ids",
+                                shape=[batch_size, sequence_len],
+                                dtype='int64')
+        position_ids = static.data(name="position_ids",
+                                   shape=[batch_size, sequence_len],
+                                   dtype='int64')
         attention_mask = static.data(
             name="attention_mask",
             shape=[batch_size, 1, sequence_len, sequence_len],
             dtype='float64')
-        labels = static.data(
-            name="labels", shape=[batch_size, sequence_len], dtype='int64')
-        loss_mask = static.data(
-            name="loss_mask", shape=[batch_size, sequence_len], dtype='float64')
+        labels = static.data(name="labels",
+                             shape=[batch_size, sequence_len],
+                             dtype='int64')
+        loss_mask = static.data(name="loss_mask",
+                                shape=[batch_size, sequence_len],
+                                dtype='float64')
 
         if _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         elif _global_parallel_strategy == "dp_mp":
-            auto.shard_tensor(
-                input_ids,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
-
-        gpt = GPTModel(
-            vocab_size=32768,
-            hidden_size=768,
-            num_hidden_layers=2,
-            num_attention_heads=12,
-            intermediate_size=4096,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=1024,
-            type_vocab_size=16,
-            initializer_range=0.02,
-            pad_token_id=0,
-            topo=None)
+            auto.shard_tensor(input_ids,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
+
+        gpt = GPTModel(vocab_size=32768,
+                       hidden_size=768,
+                       num_hidden_layers=2,
+                       num_attention_heads=12,
+                       intermediate_size=4096,
+                       hidden_act="gelu",
+                       hidden_dropout_prob=0.1,
+                       attention_probs_dropout_prob=0.1,
+                       max_position_embeddings=1024,
+                       type_vocab_size=16,
+                       initializer_range=0.02,
+                       pad_token_id=0,
+                       topo=None)
 
         model = GPTForPretraining(gpt)
 
@@ -854,18 +843,21 @@ def gpt_pretrain_forward(train_program, startup_program):
 
 
 class FakeStrategy(object):
+
     def __init__(self):
         self.amp = False
         self.recompute = False
 
 
 class FakeFleet(object):
+
     def __init__(self):
         self.user_defined_optimizer = None
         self._user_defined_strategy = FakeStrategy()
 
 
 class TestGPTPartitioner(unittest.TestCase):
+
     def test_gpt_dp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp_mp"
@@ -888,13 +880,12 @@ def test_gpt_dp_mp(self):
         dist_context.block_state.parse_forward_blocks(complete_train_program)
 
         # serial backward pass
-        params_grads = parallelizer._generate_backward(
-            complete_train_program,
-            startup_program,
-            loss,
-            parameter_list=None,
-            no_grad_set=None,
-            callbacks=None)
+        params_grads = parallelizer._generate_backward(complete_train_program,
+                                                       startup_program,
+                                                       loss,
+                                                       parameter_list=None,
+                                                       no_grad_set=None,
+                                                       callbacks=None)
 
         rank_id = 3
         partitioner = Partitioner(dist_context, rank_id)
@@ -919,7 +910,7 @@ def test_gpt_dp_mp(self):
         #     from paddle.distributed.auto_parallel.completion import Completer
         #     completer = Completer()
         #     completer.complete_forward_annotation(auto_parallel_main_prog)
-        #     fw.write(str(auto_parallel_main_prog))       
+        #     fw.write(str(auto_parallel_main_prog))
         nrank = 4
         # col parallel
         weights = [
@@ -953,25 +944,27 @@ def test_gpt_dp_mp(self):
         mp_parallel_axis = 1
         dp_parallel_axis = 0
 
-        group_ranks = _get_comm_group(
-            process_mesh.processes, process_mesh.topology, mp_parallel_axis, 3)
+        group_ranks = _get_comm_group(process_mesh.processes,
+                                      process_mesh.topology, mp_parallel_axis,
+                                      3)
         mp_ring_id = new_process_group(group_ranks).id
 
-        group_ranks = _get_comm_group(
-            process_mesh.processes, process_mesh.topology, dp_parallel_axis, 3)
+        group_ranks = _get_comm_group(process_mesh.processes,
+                                      process_mesh.topology, dp_parallel_axis,
+                                      3)
         dp_ring_id = new_process_group(group_ranks).id
 
         tensor_parallel_allreduce_vars = sorted([
             op.desc.output_arg_names()[0].split("@")[0]
             for op in auto_parallel_main_prog.global_block().ops
-            if (op.type == "c_allreduce_sum" and op.attr('op_role') == 1 and
-                op.desc.attr("ring_id") == mp_ring_id)
+            if (op.type == "c_allreduce_sum" and op.attr('op_role') == 1
+                and op.desc.attr("ring_id") == mp_ring_id)
         ])
         data_parallel_allreduce_vars = sorted([
             op.desc.output_arg_names()[0].split("@")[0]
             for op in auto_parallel_main_prog.global_block().ops
-            if (op.type == "c_allreduce_sum" and op.desc.attr("ring_id") ==
-                dp_ring_id)
+            if (op.type == "c_allreduce_sum"
+                and op.desc.attr("ring_id") == dp_ring_id)
         ])
 
         self.assertTrue(all_params == data_parallel_allreduce_vars)
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
index 9888d2c68f195..93c5ded0c10ea 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
@@ -39,6 +39,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -46,43 +47,43 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         else:
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -98,43 +99,40 @@ def mlp_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="input", shape=[batch_size, hidden_size], dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
 
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                label,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(label,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         else:
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -164,16 +162,15 @@ def get_dist_prog(train_program,
     if change_process_mesh:
         global PP_MESH_1
         dist_context.get_tensor_dist_attr_for_program(
-            train_program.global_block().vars[
-                "gelu_0.tmp_0"]).process_mesh = PP_MESH_1
+            train_program.global_block(
+            ).vars["gelu_0.tmp_0"]).process_mesh = PP_MESH_1
 
-    params_grads = parallelizer._generate_backward(
-        complete_train_program,
-        startup_program,
-        loss,
-        parameter_list=None,
-        no_grad_set=None,
-        callbacks=None)
+    params_grads = parallelizer._generate_backward(complete_train_program,
+                                                   startup_program,
+                                                   loss,
+                                                   parameter_list=None,
+                                                   no_grad_set=None,
+                                                   callbacks=None)
 
     # logical partition
     partitioner = Partitioner(dist_context, rank_id)
@@ -227,8 +224,7 @@ def check_send_recv_result(dist_main_prog, rank_id):
         for idx, op in enumerate(ops):
             if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names:
                 send_result = True
-            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[
-                    0]:
+            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[0]:
                 recv_result = True
 
     return send_result and recv_result
@@ -269,6 +265,7 @@ def check_initialization_for_dp(dist_startup_prog):
 
 
 class TestMLPReshard(unittest.TestCase):
+
     def test_complete_backward_annotation(self):
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
index 62f25c5d4a0e6..7544ff4571cce 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
@@ -38,6 +38,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -45,25 +46,31 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
-        auto.shard_tensor(
-            self.linear0.weight,
-            dist_attr={"process_mesh": PP_MESH_0,
-                       "dims_mapping": [-1, 1]})
-        auto.shard_tensor(
-            self.linear1.weight,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [1, -1]})
+        auto.shard_tensor(self.linear0.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [-1, 1]
+                          })
+        auto.shard_tensor(self.linear1.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [1, -1]
+                          })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -79,24 +86,27 @@ def mlp_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="input", shape=[batch_size, hidden_size], dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
-
-        auto.shard_tensor(
-            input,
-            dist_attr={"process_mesh": PP_MESH_0,
-                       "dims_mapping": [0, -1]})
-        auto.shard_tensor(
-            label,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [0, -1]})
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+        input = static.data(name="input",
+                            shape=[batch_size, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
+
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [0, -1]
+                          })
+        auto.shard_tensor(label,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [0, -1]
+                          })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -121,13 +131,12 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     complete_train_program = completer.complete_forward_annotation(
         train_program)
     dist_context.block_state.parse_forward_blocks(complete_train_program)
-    params_grads = parallelizer._generate_backward(
-        complete_train_program,
-        startup_program,
-        loss,
-        parameter_list=None,
-        no_grad_set=None,
-        callbacks=None)
+    params_grads = parallelizer._generate_backward(complete_train_program,
+                                                   startup_program,
+                                                   loss,
+                                                   parameter_list=None,
+                                                   no_grad_set=None,
+                                                   callbacks=None)
 
     # logical partition
     partitioner = Partitioner(dist_context, rank_id)
@@ -155,8 +164,7 @@ def check_send_recv_result(dist_main_prog, rank_id):
         for idx, op in enumerate(ops):
             if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names:
                 send_result = True
-            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[
-                    0]:
+            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[0]:
                 recv_result = True
 
     return send_result and recv_result
@@ -172,6 +180,7 @@ def check_initialization_for_dpmppp(dist_startup_prog):
 
 
 class TestMLPReshard(unittest.TestCase):
+
     def test_mlp_dpmppp(self):
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
index 5f9c2ec2371a5..0e647a3db5b64 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
@@ -38,6 +38,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -45,42 +46,51 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
         self.word_embeddings = nn.Embedding(
             hidden_size,
             hidden_size,
-            weight_attr=paddle.ParamAttr(
-                name="word_embeddings",
-                initializer=nn.initializer.Normal(
-                    mean=0.0, std=initializer_range)))
-
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
-        self.linear2 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+            weight_attr=paddle.ParamAttr(name="word_embeddings",
+                                         initializer=nn.initializer.Normal(
+                                             mean=0.0, std=initializer_range)))
+
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear2 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
 
     def forward(self, input):
-        auto.shard_tensor(
-            self.word_embeddings.weight,
-            dist_attr={"process_mesh": PP_MESH_0,
-                       "dims_mapping": [0, -1]})
-        auto.shard_tensor(
-            self.linear0.weight,
-            dist_attr={"process_mesh": PP_MESH_0,
-                       "dims_mapping": [-1, 0]})
-        auto.shard_tensor(
-            self.linear1.weight,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [0, -1]})
-        auto.shard_tensor(
-            self.linear2.weight,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [0, -1]})
+        auto.shard_tensor(self.word_embeddings.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [0, -1]
+                          })
+        auto.shard_tensor(self.linear0.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [-1, 0]
+                          })
+        auto.shard_tensor(self.linear1.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [0, -1]
+                          })
+        auto.shard_tensor(self.linear2.weight,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [0, -1]
+                          })
         w_out = self.word_embeddings(input)
         out = self.linear0(w_out)
         gelu_out = F.gelu(out, approximate=True)
@@ -98,21 +108,24 @@ def mlp_forward(train_program, start_program):
         hidden_size = 1024
         sequence_len = 512
         input = static.data(name="input", shape=[batch_size], dtype='int32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
-
-        auto.shard_tensor(
-            input, dist_attr={"process_mesh": PP_MESH_0,
-                              "dims_mapping": [-1]})
-        auto.shard_tensor(
-            label,
-            dist_attr={"process_mesh": PP_MESH_1,
-                       "dims_mapping": [-1, -1]})
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
+
+        auto.shard_tensor(input,
+                          dist_attr={
+                              "process_mesh": PP_MESH_0,
+                              "dims_mapping": [-1]
+                          })
+        auto.shard_tensor(label,
+                          dist_attr={
+                              "process_mesh": PP_MESH_1,
+                              "dims_mapping": [-1, -1]
+                          })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -137,13 +150,12 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     complete_train_program = completer.complete_forward_annotation(
         train_program)
     dist_context.block_state.parse_forward_blocks(complete_train_program)
-    params_grads = parallelizer._generate_backward(
-        complete_train_program,
-        startup_program,
-        loss,
-        parameter_list=None,
-        no_grad_set=None,
-        callbacks=None)
+    params_grads = parallelizer._generate_backward(complete_train_program,
+                                                   startup_program,
+                                                   loss,
+                                                   parameter_list=None,
+                                                   no_grad_set=None,
+                                                   callbacks=None)
 
     # logical partition
     partitioner = Partitioner(dist_context, rank_id)
@@ -171,8 +183,7 @@ def check_send_recv_result(dist_main_prog, rank_id):
             if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names[
                     0]:
                 send_result = True
-            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[
-                    0]:
+            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[0]:
                 recv_result = True
 
     return send_result and recv_result
@@ -206,6 +217,7 @@ def check_allgather(dist_main_program):
 
 
 class TestMLPReshard(unittest.TestCase):
+
     def test_mlp_mppp(self):
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
@@ -230,20 +242,18 @@ def test_allgather(self):
         process_mesh = auto.ProcessMesh(mesh=[0, 3])
         with static.program_guard(train_program, startup_program):
             x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
-            x = auto.shard_tensor(
-                x,
-                dist_attr={
-                    "process_mesh": process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            x = auto.shard_tensor(x,
+                                  dist_attr={
+                                      "process_mesh": process_mesh,
+                                      "dims_mapping": [0, -1]
+                                  })
 
             w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
-            w = auto.shard_tensor(
-                w,
-                dist_attr={
-                    "process_mesh": process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            w = auto.shard_tensor(w,
+                                  dist_attr={
+                                      "process_mesh": process_mesh,
+                                      "dims_mapping": [-1, -1]
+                                  })
 
             # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
             #     x.name: [-1, -1],
@@ -251,17 +261,16 @@ def test_allgather(self):
             # }, **{"x": x,
             #       "y": w})[0]
 
-            y = paddle.distributed.shard_op(
-                paddle.matmul,
-                dist_attr={
-                    "process_mesh": process_mesh,
-                    x: {
-                        "dims_mapping": [-1, -1]
-                    },
-                    w: {
-                        "dims_mapping": [-1, -1]
-                    }
-                })(x, w)[0]
+            y = paddle.distributed.shard_op(paddle.matmul,
+                                            dist_attr={
+                                                "process_mesh": process_mesh,
+                                                x: {
+                                                    "dims_mapping": [-1, -1]
+                                                },
+                                                w: {
+                                                    "dims_mapping": [-1, -1]
+                                                }
+                                            })(x, w)[0]
 
         rank_id = 0
         dist_context = DistributedContext()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
index ac6b06b9ca1ea..64ff030f5b1e2 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
@@ -38,6 +38,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -45,43 +46,43 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         else:
-            auto.shard_tensor(
-                self.linear0.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                self.linear1.weight,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(self.linear0.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(self.linear1.weight,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -97,43 +98,40 @@ def mlp_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="input", shape=[batch_size, hidden_size], dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
 
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": PP_MESH_0,
-                    "dims_mapping": [-1, -1]
-                })
-            auto.shard_tensor(
-                label,
-                dist_attr={
-                    "process_mesh": PP_MESH_1,
-                    "dims_mapping": [-1, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_0,
+                                  "dims_mapping": [-1, -1]
+                              })
+            auto.shard_tensor(label,
+                              dist_attr={
+                                  "process_mesh": PP_MESH_1,
+                                  "dims_mapping": [-1, -1]
+                              })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [0, -1]
-                })
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [0, -1]
+                              })
         else:
-            auto.shard_tensor(
-                input,
-                dist_attr={
-                    "process_mesh": _global_process_mesh,
-                    "dims_mapping": [-1, -1]
-                })
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+            auto.shard_tensor(input,
+                              dist_attr={
+                                  "process_mesh": _global_process_mesh,
+                                  "dims_mapping": [-1, -1]
+                              })
+
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -158,12 +156,11 @@ def get_dist_prog_with_parallelizer(train_program, startup_program,
     loss, train_program, startup_program = mlp_forward(train_program,
                                                        startup_program)
 
-    optimizer = paddle.fluid.optimizer.AdamOptimizer(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None)
+    optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001,
+                                                     beta1=0.9,
+                                                     beta2=0.999,
+                                                     epsilon=1e-08,
+                                                     grad_clip=None)
     optimizer = fleet.distributed_optimizer(optimizer)
 
     _, _, distributed_startup_program, distributed_main_program = optimizer.minimize(
@@ -187,14 +184,14 @@ def check_send_recv_result(dist_main_prog, rank_id):
         for idx, op in enumerate(ops):
             if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names:
                 send_result = True
-            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[
-                    0]:
+            if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[0]:
                 recv_result = True
 
     return send_result and recv_result
 
 
 class TestMLPReshard(unittest.TestCase):
+
     def test_mlp_serial(self):
         global _global_parallel_strategy
         _global_parallel_strategy = None
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_save_load.py
index b96b51e556772..7de26902011a3 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_save_load.py
@@ -21,6 +21,7 @@
 
 
 class TestAutoParallelSaveLoad(TestMultipleGpus):
+
     def test_auto_parallel_save_load(self):
         self.run_mnist_2gpu('auto_parallel_save_load.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
index 78ad64b1dd852..5d6119d23f321 100755
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
@@ -40,6 +40,7 @@
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -47,14 +48,18 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
@@ -73,15 +78,16 @@ def mlp_forward(train_program, start_program):
         batch_size = 4
         hidden_size = 1024
         sequence_len = 512
-        input = static.data(
-            name="input", shape=[batch_size, hidden_size], dtype='float32')
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
+        input = static.data(name="input",
+                            shape=[batch_size, hidden_size],
+                            dtype='float32')
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
         loss_func = paddle.nn.CrossEntropyLoss(reduction="none")
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = loss_func(predict, label)
@@ -100,8 +106,8 @@ def set_default_dist_attr(program, dist_context, process_mesh):
             tensor_dist_attr = TensorDistributedAttribute()
             tensor_dist_attr.process_mesh = process_mesh
             tensor_dist_attr.dims_mapping = [-1 for i in vars[var_name].shape]
-            dist_context.set_tensor_dist_attr_for_program(vars[var_name],
-                                                          tensor_dist_attr)
+            dist_context.set_tensor_dist_attr_for_program(
+                vars[var_name], tensor_dist_attr)
             op_dist_attr.set_input_dims_mapping(var_name,
                                                 tensor_dist_attr.dims_mapping)
 
@@ -109,8 +115,8 @@ def set_default_dist_attr(program, dist_context, process_mesh):
             tensor_dist_attr = TensorDistributedAttribute()
             tensor_dist_attr.process_mesh = process_mesh
             tensor_dist_attr.dims_mapping = [-1 for i in vars[var_name].shape]
-            dist_context.set_tensor_dist_attr_for_program(vars[var_name],
-                                                          tensor_dist_attr)
+            dist_context.set_tensor_dist_attr_for_program(
+                vars[var_name], tensor_dist_attr)
             op_dist_attr.set_output_dims_mapping(var_name,
                                                  tensor_dist_attr.dims_mapping)
         dist_context.set_op_dist_attr_for_program(op, op_dist_attr)
@@ -143,6 +149,7 @@ def check_nonpipeline_enumerater(program, process_mesh_topology):
 
 
 class TestMLPSearcher(unittest.TestCase):
+
     def test_update(self):
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
@@ -170,8 +177,7 @@ def test_update(self):
 
                     dist_op.dist_attr.set_output_dims_mapping(
                         op.output_arg_names[0], [0] + [
-                            -1
-                            for i in range(
+                            -1 for i in range(
                                 1, len(vars[op.output_arg_names[0]].shape))
                         ])
                     try:
@@ -187,8 +193,7 @@ def test_update(self):
 
                     dist_op.dist_attr.set_output_dims_mapping(
                         op.output_arg_names[0], [0] + [
-                            -1
-                            for i in range(
+                            -1 for i in range(
                                 1, len(vars[op.output_arg_names[0]].shape))
                         ])
                     try:
diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py
index 8c5913c66a70d..76bcf0f194816 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py
@@ -30,11 +30,13 @@
 from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext
 from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
 from paddle.distributed.auto_parallel.dist_op import DistributedOperator
+
 paddle.enable_static()
 device = "gpu" if core.is_compiled_with_cuda() else "cpu"
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -42,14 +44,18 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
@@ -76,17 +82,16 @@ def mlp_forward(train_program, start_program):
         input = embedding(input)
         input = paddle.reshape(input, [hidden_size, batch_size])
         input = paddle.transpose(input, perm=[1, 0])
-        matmulinput = static.data(
-            name="matmulinput",
-            shape=[hidden_size, hidden_size],
-            dtype='float32')
+        matmulinput = static.data(name="matmulinput",
+                                  shape=[hidden_size, hidden_size],
+                                  dtype='float32')
         input = layers.matmul(x=input, y=matmulinput)
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -97,6 +102,7 @@ def mlp_forward(train_program, start_program):
 
 
 class TestCompatible(unittest.TestCase):
+
     def test_matmulv2_matmul_2_compatible(self):
         valid_op_dist_attr_list = []
         program = paddle.static.Program()
@@ -105,16 +111,20 @@ def test_matmulv2_matmul_2_compatible(self):
 
         with static.program_guard(program,
                                   start_program), utils.unique_name.guard():
-            matmulx3 = static.data(
-                name="matmulx3", shape=[6, 2, 6], dtype='float32')
-            matmuly3 = static.data(
-                name="matmuly3", shape=[6, 6], dtype='float32')
+            matmulx3 = static.data(name="matmulx3",
+                                   shape=[6, 2, 6],
+                                   dtype='float32')
+            matmuly3 = static.data(name="matmuly3",
+                                   shape=[6, 6],
+                                   dtype='float32')
             output1 = paddle.matmul(x=matmulx3, y=matmuly3)
             output_1 = layers.matmul(x=matmulx3, y=matmuly3)
-            matmulx4 = static.data(
-                name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
-            matmuly4 = static.data(
-                name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
+            matmulx4 = static.data(name="matmulx4",
+                                   shape=[6, 6, 2, 6],
+                                   dtype='float32')
+            matmuly4 = static.data(name="matmuly4",
+                                   shape=[6, 6, 6, 6],
+                                   dtype='float32')
             output2 = paddle.matmul(x=matmulx4, y=matmuly4)
             output_2 = layers.matmul(x=matmulx4, y=matmuly4)
         ops = program.global_block().ops
@@ -202,16 +212,20 @@ def test_matmulv2_matmul_1_compatible(self):
         loss, program, start_program = mlp_forward(program, startup_program)
         with static.program_guard(program,
                                   start_program), utils.unique_name.guard():
-            matmulx3 = static.data(
-                name="matmulx3", shape=[6, 2, 6], dtype='float32')
-            matmuly3 = static.data(
-                name="matmuly3", shape=[6, 6], dtype='float32')
+            matmulx3 = static.data(name="matmulx3",
+                                   shape=[6, 2, 6],
+                                   dtype='float32')
+            matmuly3 = static.data(name="matmuly3",
+                                   shape=[6, 6],
+                                   dtype='float32')
             output1 = paddle.matmul(x=matmulx3, y=matmuly3)
             output_1 = layers.matmul(x=matmulx3, y=matmuly3)
-            matmulx4 = static.data(
-                name="matmulx4", shape=[6, 6, 6, 6], dtype='float32')
-            matmuly4 = static.data(
-                name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
+            matmulx4 = static.data(name="matmulx4",
+                                   shape=[6, 6, 6, 6],
+                                   dtype='float32')
+            matmuly4 = static.data(name="matmuly4",
+                                   shape=[6, 6, 6, 6],
+                                   dtype='float32')
             output2 = paddle.matmul(x=matmulx4, y=matmuly4)
             output_2 = layers.matmul(x=matmulx4, y=matmuly4)
         ops = program.global_block().ops
@@ -289,16 +303,20 @@ def test_matmulv2_matmul_0_compatible(self):
         loss, program, start_program = mlp_forward(program, startup_program)
         with static.program_guard(program,
                                   start_program), utils.unique_name.guard():
-            matmulx3 = static.data(
-                name="matmulx3", shape=[6, 2, 6], dtype='float32')
-            matmuly3 = static.data(
-                name="matmuly3", shape=[6, 6], dtype='float32')
+            matmulx3 = static.data(name="matmulx3",
+                                   shape=[6, 2, 6],
+                                   dtype='float32')
+            matmuly3 = static.data(name="matmuly3",
+                                   shape=[6, 6],
+                                   dtype='float32')
             output1 = paddle.matmul(x=matmulx3, y=matmuly3)
             output_1 = layers.matmul(x=matmulx3, y=matmuly3)
-            matmulx4 = static.data(
-                name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
-            matmuly4 = static.data(
-                name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
+            matmulx4 = static.data(name="matmulx4",
+                                   shape=[6, 6, 2, 6],
+                                   dtype='float32')
+            matmuly4 = static.data(name="matmuly4",
+                                   shape=[6, 6, 6, 6],
+                                   dtype='float32')
             output2 = paddle.matmul(x=matmulx4, y=matmuly4)
             output_2 = layers.matmul(x=matmulx4, y=matmuly4)
         ops = program.global_block().ops
diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py
index 4cb58eac7cc41..568856244c648 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py
@@ -30,11 +30,13 @@
 from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext
 from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
 from paddle.distributed.auto_parallel.dist_op import DistributedOperator
+
 paddle.enable_static()
 device = "gpu" if core.is_compiled_with_cuda() else "cpu"
 
 
 class MLPLayer(nn.Layer):
+
     def __init__(self,
                  hidden_size=1024,
                  intermediate_size=4 * 1024,
@@ -42,14 +44,18 @@ def __init__(self,
         super(MLPLayer, self).__init__()
         d_model = hidden_size
         dim_feedforward = intermediate_size
-        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
-            mean=0.0, std=initializer_range))
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range))
         bias_attr = None
 
-        self.linear0 = nn.Linear(
-            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
-        self.linear1 = nn.Linear(
-            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear0 = nn.Linear(d_model,
+                                 dim_feedforward,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
+        self.linear1 = nn.Linear(dim_feedforward,
+                                 d_model,
+                                 weight_attr,
+                                 bias_attr=bias_attr)
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
@@ -76,17 +82,16 @@ def mlp_forward(train_program, start_program):
         input = embedding(input)
         input = paddle.reshape(input, [hidden_size, batch_size])
         input = paddle.transpose(input, perm=[1, 0])
-        matmulinput = static.data(
-            name="matmulinput",
-            shape=[hidden_size, hidden_size],
-            dtype='float32')
+        matmulinput = static.data(name="matmulinput",
+                                  shape=[hidden_size, hidden_size],
+                                  dtype='float32')
         input = layers.matmul(x=input, y=matmulinput)
-        label = static.data(
-            name="label", shape=[batch_size, 1], dtype='float32')
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            initializer_range=0.02)
+        label = static.data(name="label",
+                            shape=[batch_size, 1],
+                            dtype='float32')
+        mlp = MLPLayer(hidden_size=hidden_size,
+                       intermediate_size=4 * hidden_size,
+                       initializer_range=0.02)
 
         predict = mlp(input)
         error_cost = paddle.nn.functional.square_error_cost(predict, label)
@@ -97,6 +102,7 @@ def mlp_forward(train_program, start_program):
 
 
 class TestCompatible(unittest.TestCase):
+
     def test_reshape_remove_compatible(self):
         valid_op_dist_attr_list = []
         program = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py b/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py
index 8572572f14636..e49895ca77dc7 100644
--- a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py
+++ b/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py
@@ -19,6 +19,7 @@
 
 
 class TestAvoidTwiceInitialization(unittest.TestCase):
+
     def test_avoid_twice_initialization(self):
         cur_program = fluid.Program()
         cur_block = cur_program.current_block()
@@ -27,18 +28,18 @@ def test_avoid_twice_initialization(self):
             shape=[2, 2],
             dtype='float32',
             name='var_a')
-        cur_block.append_op(
-            type="c_broadcast",
-            inputs={"X": [var]},
-            outputs={"Out": [var]},
-            attrs={'root': 0,
-                   'ring_id': 0,
-                   'use_calc_stream': False})
-        cur_block.append_op(
-            type="c_sync_comm_stream",
-            inputs={'X': [var]},
-            outputs={'Out': [var]},
-            attrs={'ring_id': 0})
+        cur_block.append_op(type="c_broadcast",
+                            inputs={"X": [var]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                'root': 0,
+                                'ring_id': 0,
+                                'use_calc_stream': False
+                            })
+        cur_block.append_op(type="c_sync_comm_stream",
+                            inputs={'X': [var]},
+                            outputs={'Out': [var]},
+                            attrs={'ring_id': 0})
         var2 = cur_block.create_parameter(
             initializer=fluid.initializer.Constant(value=0.01),
             shape=[2, 2],
diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py
index e0d6a606e2569..a6c9caacc7806 100644
--- a/python/paddle/fluid/tests/unittests/test_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_backward.py
@@ -58,8 +58,8 @@ class TestBackward(unittest.TestCase):
     """
 
     def _check_all(self, net):
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         main = fluid.Program()
@@ -113,8 +113,8 @@ def _check_op_path(self, root_block, outputs, inputs=[], no_grad_dict=None):
             block_no_grad_set = None
         else:
             block_no_grad_set = set(
-                map(fluid.backward._strip_grad_suffix_, no_grad_dict[
-                    self.global_block_idx]))
+                map(fluid.backward._strip_grad_suffix_,
+                    no_grad_dict[self.global_block_idx]))
         op_path = fluid.backward._find_op_path_(root_block, outputs, inputs,
                                                 block_no_grad_set)
         op_types = [op.type for op in op_path]
@@ -131,8 +131,8 @@ def _check_find_no_grad_vars(self, root_block, op_path, targets,
         return no_grad_vars
 
     def _check_error_param_list(self, net, parameter_list):
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         main = fluid.Program()
@@ -146,8 +146,8 @@ def _check_error_param_list(self, net, parameter_list):
             exe.run(feed=net.init_data())
 
     def _check_error_no_grad_set(self, net, no_grad_set):
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         main = fluid.Program()
@@ -162,6 +162,7 @@ def _check_error_no_grad_set(self, net, no_grad_set):
 
 
 class SimpleNet(BackwardNet):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.stop_gradient_grad_vars = set([
@@ -201,20 +202,25 @@ def build_model(self):
         x = fluid.data(name='x_no_grad', shape=self.shape, dtype='int64')
         x2 = fluid.data(name='x2_no_grad', shape=self.shape, dtype='int64')
         x3 = fluid.data(name='x3_no_grad', shape=self.shape, dtype='int64')
-        label = fluid.data(
-            name='label_no_grad', shape=[self.shape[0], 1], dtype='float32')
+        label = fluid.data(name='label_no_grad',
+                           shape=[self.shape[0], 1],
+                           dtype='float32')
         # shared layer, the grad of 'w2v' will be summed and renamed.
         # To test  _addup_repetitive_outputs_
-        x_emb = fluid.embedding(
-            x, size=[100, 64], param_attr=fluid.ParamAttr(name='w2v'))
-        x2_emb = fluid.embedding(
-            x2, size=[100, 64], param_attr=fluid.ParamAttr(name='w2v'))
-        x3_emb = fluid.embedding(
-            x3, size=[100, 64], param_attr=fluid.ParamAttr(name='w2v'))
+        x_emb = fluid.embedding(x,
+                                size=[100, 64],
+                                param_attr=fluid.ParamAttr(name='w2v'))
+        x2_emb = fluid.embedding(x2,
+                                 size=[100, 64],
+                                 param_attr=fluid.ParamAttr(name='w2v'))
+        x3_emb = fluid.embedding(x3,
+                                 size=[100, 64],
+                                 param_attr=fluid.ParamAttr(name='w2v'))
         # merge layers
         x_merge = fluid.layers.elementwise_add(x_emb, x2_emb, name='x_add_x2')
-        x2_merge = fluid.layers.elementwise_add(
-            x2_emb, x3_emb, name='x2_add_x3')
+        x2_merge = fluid.layers.elementwise_add(x2_emb,
+                                                x3_emb,
+                                                name='x2_add_x3')
         # shared fc_w
         predict = fluid.layers.fc(input=x_merge,
                                   size=1,
@@ -235,6 +241,7 @@ def build_model(self):
 
 
 class TestSimpleNet(TestBackward):
+
     def test_backward(self):
         """
         Instantiate each NetClass to test backward.
@@ -245,6 +252,7 @@ def test_backward(self):
 
 
 class TestGradientsError(unittest.TestCase):
+
     def test_error(self):
         x = fluid.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
         x.stop_gradient = False
@@ -265,6 +273,7 @@ def test_error(self):
 
 
 class TestSimpleNetWithErrorParamList(TestBackward):
+
     def test_parameter_list_type_error(self):
         self.global_block_idx = 0
         self.net = SimpleNet()
@@ -278,6 +287,7 @@ def test_parameter_list_type_error(self):
 
 
 class TestSimpleNetWithErrorNoGradSet(TestBackward):
+
     def test_no_grad_set_type_error(self):
         self.global_block_idx = 0
         self.net = SimpleNet()
@@ -291,6 +301,7 @@ def test_no_grad_set_type_error(self):
 
 
 class TestAppendBackwardWithError(unittest.TestCase):
+
     def build_net(self):
         x = fluid.data(name='x', shape=[None, 13], dtype='int64')
         y = fluid.data(name='y', shape=[None, 1], dtype='float32')
@@ -317,8 +328,8 @@ def test_loss_type_error(self):
     def test_parameter_list_type_error(self):
         with self.assertRaises(TypeError):
             self.param_names[0] = np.random.random([10])
-            fluid.backward.append_backward(
-                loss=self.avg_loss, parameter_list=self.param_names)
+            fluid.backward.append_backward(loss=self.avg_loss,
+                                           parameter_list=self.param_names)
 
     def test_callback_type_error(self):
         with self.assertRaises(TypeError):
@@ -326,11 +337,12 @@ def test_callback_type_error(self):
             def callback(block, context):
                 return
 
-            fluid.backward.append_backward(
-                loss=self.avg_loss, callbacks=callback)
+            fluid.backward.append_backward(loss=self.avg_loss,
+                                           callbacks=callback)
 
 
 class TestGradientsWithOptimizer(unittest.TestCase):
+
     def _check_grad_op_name(self, forward_list, optimiezed_list):
         backward_list = [op + "_grad" for op in reversed(forward_list)]
         idx = optimiezed_list.index(backward_list[0], len(backward_list))
@@ -361,6 +373,7 @@ def test_gradient_with_optimizer(self):
 
 # TODO(Aurelius84): add conditional network test
 class ConditionalNet(BackwardNet):
+
     def __init__(self):
         super(ConditionalNet, self).__init__()
 
diff --git a/python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py b/python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py
index a0cd6fca57339..0c0a2419cffb7 100644
--- a/python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,11 +22,14 @@
 
 
 class TestBackwardInferVarDataTypeShape(unittest.TestCase):
+
     def test_backward_infer_var_data_type_shape(self):
         paddle.enable_static()
         program = fluid.default_main_program()
-        dy = program.global_block().create_var(
-            name="Tmp@GRAD", shape=[1, 1], dtype=np.float32, persistable=True)
+        dy = program.global_block().create_var(name="Tmp@GRAD",
+                                               shape=[1, 1],
+                                               dtype=np.float32,
+                                               persistable=True)
         # invoke warning
         fluid.backward._infer_var_data_type_shape_("Tmp@GRAD",
                                                    program.global_block())
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index 3bdd03b321276..bb5c691a6e074 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -24,20 +24,26 @@
 
 
 class L1(fluid.Layer):
+
     def __init__(self):
         super(L1, self).__init__()
         self._param_attr = fluid.ParamAttr(
             initializer=fluid.initializer.Constant(value=0.1))
-        self.w1 = self.create_parameter(
-            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)
-        self.w2 = self.create_parameter(
-            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)
+        self.w1 = self.create_parameter(attr=self._param_attr,
+                                        shape=[2, 2],
+                                        dtype='float32',
+                                        is_bias=False)
+        self.w2 = self.create_parameter(attr=self._param_attr,
+                                        shape=[2, 2],
+                                        dtype='float32',
+                                        is_bias=False)
 
     def forward(self):
         return self.w1 + self.w2
 
 
 class L2(fluid.Layer):
+
     def __init__(self):
         super(L2, self).__init__()
         self.layer1 = L1()
@@ -48,6 +54,7 @@ def forward(self):
 
 
 class L3(fluid.Layer):
+
     def __init__(self):
         super(L3, self).__init__()
         self.layer1 = L2()
@@ -58,6 +65,7 @@ def forward(self):
 
 
 class TestBaseLayer(unittest.TestCase):
+
     def func_test_one_level(self):
         with fluid.dygraph.guard():
             l = L1()
@@ -131,6 +139,7 @@ def test_add_parameter_with_error(self):
 
 
 class BufferLayer(fluid.Layer):
+
     def __init__(self):
         super(BufferLayer, self).__init__()
         buffer_var = to_variable(np.zeros([2, 4]).astype('int32'))
@@ -141,11 +150,13 @@ def forward(self):
 
 
 class BufferNet(fluid.Layer):
+
     def __init__(self):
         super(BufferNet, self).__init__()
         self.buffer_layer = BufferLayer()
-        self.w1 = self.create_parameter(
-            shape=[2, 2], dtype='float32', is_bias=False)
+        self.w1 = self.create_parameter(shape=[2, 2],
+                                        dtype='float32',
+                                        is_bias=False)
         buffer_var = to_variable(np.ones([2, 4]).astype('int32'))
         self.register_buffer("net_buffer", buffer_var)
 
@@ -156,7 +167,9 @@ def forward(self):
 
 
 class TestBuffer(unittest.TestCase):
+
     def func_test_buffers_and_named_buffers(self):
+
         def names(named_buffers):
             return [name for name, _ in named_buffers]
 
@@ -173,9 +186,8 @@ def names(named_buffers):
                 ['net_buffer', 'new_buffer', 'buffer_layer.layer_buffer'])
 
             self.assertEqual(len(net.buffers(include_sublayers=False)), 2)
-            self.assertEqual(
-                names(net.named_buffers(include_sublayers=False)),
-                ['net_buffer', 'new_buffer'])
+            self.assertEqual(names(net.named_buffers(include_sublayers=False)),
+                             ['net_buffer', 'new_buffer'])
 
     def test_buffers_and_named_buffers(self):
         with _test_eager_guard():
@@ -363,6 +375,7 @@ def assert_var_base_equal(self, var1, var2):
 
 
 class BufferNetWithModification(paddle.nn.Layer):
+
     def __init__(self, shape):
         super(BufferNetWithModification, self).__init__()
 
@@ -380,6 +393,7 @@ def forward(self, x):
 
 
 class TestModifiedBuffer(unittest.TestCase):
+
     def funcsetUp(self):
         paddle.disable_static()
         self.prog_trans = ProgramTranslator()
@@ -410,6 +424,7 @@ def test_modified(self):
 
 
 class TestLayerTo(unittest.TestCase):
+
     def funcsetUp(self):
         paddle.disable_static()
         self.linear = paddle.nn.Linear(2, 2)
@@ -454,8 +469,8 @@ def func_test_to_api(self):
             self.assertEqual(self.linear.weight.place.gpu_device_id(), 0)
             self.assertTrue(self.linear.buf_name.place.is_gpu_place())
             self.assertEqual(self.linear.buf_name.place.gpu_device_id(), 0)
-            self.assertTrue(self.linear.weight._grad_ivar().place.is_gpu_place(
-            ))
+            self.assertTrue(
+                self.linear.weight._grad_ivar().place.is_gpu_place())
             self.assertEqual(
                 self.linear.weight._grad_ivar().place.gpu_device_id(), 0)
 
@@ -464,8 +479,8 @@ def func_test_to_api(self):
             self.assertEqual(self.linear.weight.place.gpu_device_id(), 0)
             self.assertTrue(self.linear.buf_name.place.is_gpu_place())
             self.assertEqual(self.linear.buf_name.place.gpu_device_id(), 0)
-            self.assertTrue(self.linear.weight._grad_ivar().place.is_gpu_place(
-            ))
+            self.assertTrue(
+                self.linear.weight._grad_ivar().place.is_gpu_place())
             self.assertEqual(
                 self.linear.weight._grad_ivar().place.gpu_device_id(), 0)
             for p in self.linear.parameters():
diff --git a/python/paddle/fluid/tests/unittests/test_basic_gru_api.py b/python/paddle/fluid/tests/unittests/test_basic_gru_api.py
index ee8a1b7af24f3..2a06f19277707 100644
--- a/python/paddle/fluid/tests/unittests/test_basic_gru_api.py
+++ b/python/paddle/fluid/tests/unittests/test_basic_gru_api.py
@@ -56,6 +56,7 @@ def gru_np(input,
            batch_first=False,
            is_bidirect=False,
            sequence_length=None):
+
     def step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b):
         concat_1 = np.concatenate([step_in, pre_hidden], 1)
 
@@ -66,8 +67,8 @@ def step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b):
 
         r_hidden = r * pre_hidden
 
-        candidate = np.matmul(
-            np.concatenate([step_in, r_hidden], 1), candidate_w)
+        candidate = np.matmul(np.concatenate([step_in, r_hidden], 1),
+                              candidate_w)
 
         candidate += candidate_b
         c = tanh(candidate)
@@ -95,8 +96,8 @@ def step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b):
     if is_bidirect:
         direc_num = 2
     if init_h:
-        init_h = np.reshape(
-            init_h, shape=[num_layers, direc_num, -1, hidden_size])
+        init_h = np.reshape(init_h,
+                            shape=[num_layers, direc_num, -1, hidden_size])
     else:
         init_h = np.zeros([num_layers, direc_num, batch_size, hidden_size])
 
@@ -141,8 +142,9 @@ def get_single_direction_output(rnn_input, mask=None, direc_index=0):
 
         return rnn_out, last_hidden_out
 
-    fw_rnn_out, fw_last_hidden = get_single_direction_output(
-        input, mask, direc_index=0)
+    fw_rnn_out, fw_last_hidden = get_single_direction_output(input,
+                                                             mask,
+                                                             direc_index=0)
 
     if is_bidirect:
         bw_input = input[::-1]
@@ -150,8 +152,9 @@ def get_single_direction_output(rnn_input, mask=None, direc_index=0):
         if mask is not None:
             bw_mask = mask[::-1]
 
-        bw_rnn_out, bw_last_hidden = get_single_direction_output(
-            bw_input, bw_mask, direc_index=1)
+        bw_rnn_out, bw_last_hidden = get_single_direction_output(bw_input,
+                                                                 bw_mask,
+                                                                 direc_index=1)
 
         bw_rnn_out = bw_rnn_out[::-1]
 
@@ -175,6 +178,7 @@ def get_single_direction_output(rnn_input, mask=None, direc_index=0):
 
 
 class TestBasicGRUApi(unittest.TestCase):
+
     def setUp(self):
         self.hidden_size = 10
         self.batch_size = 5
@@ -184,12 +188,12 @@ def setUp(self):
         self.batch_first = False
 
     def test_run(self):
-        x = layers.data(
-            name='x',
-            shape=[-1, self.batch_size, self.hidden_size],
-            dtype='float32')
-        sequence_length = layers.data(
-            name="sequence_length", shape=[-1], dtype='float32')
+        x = layers.data(name='x',
+                        shape=[-1, self.batch_size, self.hidden_size],
+                        dtype='float32')
+        sequence_length = layers.data(name="sequence_length",
+                                      shape=[-1],
+                                      dtype='float32')
 
         rnn_out, last_hidden = basic_gru( x, None, self.hidden_size, num_layers=self.num_layers, \
                 batch_first = self.batch_first, bidirectional=self.is_bidirect, sequence_length=sequence_length )
@@ -221,29 +225,29 @@ def test_run(self):
             candidate_b_name = "basic_gru_layers_" + str(
                 i) + "/BasicGRUUnit_0.b_1"
 
-            gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                              .get_tensor())
-            gate_w = np.random.uniform(
-                -0.1, 0.1, size=gate_w.shape).astype('float32')
-            fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                        place)
-
-            gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                              .get_tensor())
-            gate_b = np.random.uniform(
-                -0.1, 0.1, size=gate_b.shape).astype('float32')
-            fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                        place)
-
-            candidate_w = np.array(fluid.global_scope().find_var(
-                candidate_w_name).get_tensor())
+            gate_w = np.array(
+                fluid.global_scope().find_var(gate_w_name).get_tensor())
+            gate_w = np.random.uniform(-0.1, 0.1,
+                                       size=gate_w.shape).astype('float32')
+            fluid.global_scope().find_var(gate_w_name).get_tensor().set(
+                gate_w, place)
+
+            gate_b = np.array(
+                fluid.global_scope().find_var(gate_b_name).get_tensor())
+            gate_b = np.random.uniform(-0.1, 0.1,
+                                       size=gate_b.shape).astype('float32')
+            fluid.global_scope().find_var(gate_b_name).get_tensor().set(
+                gate_b, place)
+
+            candidate_w = np.array(
+                fluid.global_scope().find_var(candidate_w_name).get_tensor())
             candidate_w = np.random.uniform(
                 -0.1, 0.1, size=candidate_w.shape).astype('float32')
             fluid.global_scope().find_var(candidate_w_name).get_tensor().set(
                 candidate_w, place)
 
-            candidate_b = np.array(fluid.global_scope().find_var(
-                candidate_b_name).get_tensor())
+            candidate_b = np.array(
+                fluid.global_scope().find_var(candidate_b_name).get_tensor())
             candidate_b = np.random.uniform(
                 -0.1, 0.1, size=candidate_b.shape).astype('float32')
             fluid.global_scope().find_var(candidate_b_name).get_tensor().set(
@@ -265,17 +269,17 @@ def test_run(self):
                 candidate_b_name = "basic_gru_reverse_layers_" + str(
                     i) + "/BasicGRUUnit_0.b_1"
 
-                gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                                  .get_tensor())
-                gate_w = np.random.uniform(
-                    -0.1, 0.1, size=gate_w.shape).astype('float32')
+                gate_w = np.array(
+                    fluid.global_scope().find_var(gate_w_name).get_tensor())
+                gate_w = np.random.uniform(-0.1, 0.1,
+                                           size=gate_w.shape).astype('float32')
                 fluid.global_scope().find_var(gate_w_name).get_tensor().set(
                     gate_w, place)
 
-                gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                                  .get_tensor())
-                gate_b = np.random.uniform(
-                    -0.1, 0.1, size=gate_b.shape).astype('float32')
+                gate_b = np.array(
+                    fluid.global_scope().find_var(gate_b_name).get_tensor())
+                gate_b = np.random.uniform(-0.1, 0.1,
+                                           size=gate_b.shape).astype('float32')
                 fluid.global_scope().find_var(gate_b_name).get_tensor().set(
                     gate_b, place)
 
@@ -283,53 +287,53 @@ def test_run(self):
                     candidate_w_name).get_tensor())
                 candidate_w = np.random.uniform(
                     -0.1, 0.1, size=candidate_w.shape).astype('float32')
-                fluid.global_scope().find_var(candidate_w_name).get_tensor(
-                ).set(candidate_w, place)
+                fluid.global_scope().find_var(
+                    candidate_w_name).get_tensor().set(candidate_w, place)
 
                 candidate_b = np.array(fluid.global_scope().find_var(
                     candidate_b_name).get_tensor())
                 candidate_b = np.random.uniform(
                     -0.1, 0.1, size=candidate_b.shape).astype('float32')
-                fluid.global_scope().find_var(candidate_b_name).get_tensor(
-                ).set(candidate_b, place)
+                fluid.global_scope().find_var(
+                    candidate_b_name).get_tensor().set(candidate_b, place)
 
                 gate_weight.append(gate_w)
                 gate_bias.append(gate_b)
                 candidate_weight.append(candidate_w)
                 candidate_bias.append(candidate_b)
 
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.seq_len, self.batch_size, self.hidden_size)).astype('float32')
+        step_input_np = np.random.uniform(
+            -0.1, 0.1,
+            (self.seq_len, self.batch_size, self.hidden_size)).astype('float32')
         sequence_length_np = np.random.randint(
             self.seq_len // 2, self.seq_len,
             size=(self.batch_size)).astype('int64')
 
-        out = exe.run(
-            feed={'x': step_input_np,
-                  'sequence_length': sequence_length_np},
-            fetch_list=[rnn_out, last_hidden])
+        out = exe.run(feed={
+            'x': step_input_np,
+            'sequence_length': sequence_length_np
+        },
+                      fetch_list=[rnn_out, last_hidden])
 
         api_rnn_out = out[0]
         api_last_hidden = out[1]
 
-        np_out = gru_np(
-            step_input_np,
-            None,
-            self.hidden_size,
-            gate_weight,
-            gate_bias,
-            candidate_weight,
-            candidate_bias,
-            num_layers=self.num_layers,
-            batch_first=self.batch_first,
-            is_bidirect=self.is_bidirect,
-            sequence_length=sequence_length_np)
+        np_out = gru_np(step_input_np,
+                        None,
+                        self.hidden_size,
+                        gate_weight,
+                        gate_bias,
+                        candidate_weight,
+                        candidate_bias,
+                        num_layers=self.num_layers,
+                        batch_first=self.batch_first,
+                        is_bidirect=self.is_bidirect,
+                        sequence_length=sequence_length_np)
 
         self.assertTrue(np.allclose(api_rnn_out, np_out[0], rtol=1e-4, atol=0))
 
         self.assertTrue(
-            np.allclose(
-                api_last_hidden, np_out[1], rtol=1e-4, atol=0))
+            np.allclose(api_last_hidden, np_out[1], rtol=1e-4, atol=0))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py
index 597d8306b0155..7c4c8ff5aeebd 100644
--- a/python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py
@@ -66,14 +66,16 @@ def step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b):
 
 
 class TestBasicGRUUnit(unittest.TestCase):
+
     def setUp(self):
         self.hidden_size = 5
         self.batch_size = 5
 
     def test_run(self):
         x = layers.data(name='x', shape=[-1, self.hidden_size], dtype='float32')
-        pre_hidden = layers.data(
-            name="pre_hidden", shape=[-1, self.hidden_size], dtype='float32')
+        pre_hidden = layers.data(name="pre_hidden",
+                                 shape=[-1, self.hidden_size],
+                                 dtype='float32')
         gru_unit = BasicGRUUnit("gru_unit", self.hidden_size)
 
         new_hidden = gru_unit(x, pre_hidden)
@@ -97,41 +99,43 @@ def test_run(self):
         candidate_w_name = "gru_unit/BasicGRUUnit_0.w_1"
         candidate_b_name = "gru_unit/BasicGRUUnit_0.b_1"
 
-        gate_w = np.array(fluid.global_scope().find_var(gate_w_name).get_tensor(
-        ))
-        gate_w = np.random.uniform(
-            -0.1, 0.1, size=gate_w.shape).astype('float32')
-        fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                    place)
-
-        gate_b = np.array(fluid.global_scope().find_var(gate_b_name).get_tensor(
-        ))
-        gate_b = np.random.uniform(
-            -0.1, 0.1, size=gate_b.shape).astype('float32')
-        fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                    place)
-
-        candidate_w = np.array(fluid.global_scope().find_var(candidate_w_name)
-                               .get_tensor())
+        gate_w = np.array(
+            fluid.global_scope().find_var(gate_w_name).get_tensor())
+        gate_w = np.random.uniform(-0.1, 0.1,
+                                   size=gate_w.shape).astype('float32')
+        fluid.global_scope().find_var(gate_w_name).get_tensor().set(
+            gate_w, place)
+
+        gate_b = np.array(
+            fluid.global_scope().find_var(gate_b_name).get_tensor())
+        gate_b = np.random.uniform(-0.1, 0.1,
+                                   size=gate_b.shape).astype('float32')
+        fluid.global_scope().find_var(gate_b_name).get_tensor().set(
+            gate_b, place)
+
+        candidate_w = np.array(
+            fluid.global_scope().find_var(candidate_w_name).get_tensor())
         candidate_w = np.random.uniform(
             -0.1, 0.1, size=candidate_w.shape).astype('float32')
         fluid.global_scope().find_var(candidate_w_name).get_tensor().set(
             candidate_w, place)
 
-        candidate_b = np.array(fluid.global_scope().find_var(candidate_b_name)
-                               .get_tensor())
+        candidate_b = np.array(
+            fluid.global_scope().find_var(candidate_b_name).get_tensor())
         candidate_b = np.random.uniform(
             -0.1, 0.1, size=candidate_b.shape).astype('float32')
         fluid.global_scope().find_var(candidate_b_name).get_tensor().set(
             candidate_b, place)
 
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-        pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
+        step_input_np = np.random.uniform(
+            -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')
+        pre_hidden_np = np.random.uniform(
+            -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')
 
-        out = exe.run(feed={'x': step_input_np,
-                            'pre_hidden': pre_hidden_np},
+        out = exe.run(feed={
+            'x': step_input_np,
+            'pre_hidden': pre_hidden_np
+        },
                       fetch_list=[new_hidden])
 
         api_out = out[0]
diff --git a/python/paddle/fluid/tests/unittests/test_basic_lstm_api.py b/python/paddle/fluid/tests/unittests/test_basic_lstm_api.py
index bedba672edf95..abe0d6f8d56dc 100644
--- a/python/paddle/fluid/tests/unittests/test_basic_lstm_api.py
+++ b/python/paddle/fluid/tests/unittests/test_basic_lstm_api.py
@@ -56,6 +56,7 @@ def lstm_np(input,
             is_bidirect=False,
             sequence_length=None,
             forget_bias=1.0):
+
     def step(step_in, pre_hidden, pre_cell, gate_w, gate_b):
         concat_1 = np.concatenate([step_in, pre_hidden], 1)
 
@@ -187,6 +188,7 @@ def get_single_direction_output(rnn_input, mask=None, direc_index=0):
 
 
 class TestBasicLSTMApi(unittest.TestCase):
+
     def setUp(self):
         self.hidden_size = 10
         self.batch_size = 5
@@ -197,12 +199,12 @@ def setUp(self):
         self.forget_bias = 1.0
 
     def test_run(self):
-        x = layers.data(
-            name='x',
-            shape=[-1, self.batch_size, self.hidden_size],
-            dtype='float32')
-        sequence_length = layers.data(
-            name="sequence_length", shape=[-1], dtype='float32')
+        x = layers.data(name='x',
+                        shape=[-1, self.batch_size, self.hidden_size],
+                        dtype='float32')
+        sequence_length = layers.data(name="sequence_length",
+                                      shape=[-1],
+                                      dtype='float32')
 
         rnn_out, last_hidden, last_cell = basic_lstm( x, None, None, self.hidden_size, num_layers=self.num_layers, \
                 batch_first = self.batch_first, bidirectional=self.is_bidirect, sequence_length=sequence_length, forget_bias = self.forget_bias )
@@ -227,19 +229,19 @@ def test_run(self):
             gate_w_name = "basic_lstm_layers_" + str(i) + "/BasicLSTMUnit_0.w_0"
             gate_b_name = "basic_lstm_layers_" + str(i) + "/BasicLSTMUnit_0.b_0"
 
-            gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                              .get_tensor())
-            gate_w = np.random.uniform(
-                -0.1, 0.1, size=gate_w.shape).astype('float32')
-            fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                        place)
+            gate_w = np.array(
+                fluid.global_scope().find_var(gate_w_name).get_tensor())
+            gate_w = np.random.uniform(-0.1, 0.1,
+                                       size=gate_w.shape).astype('float32')
+            fluid.global_scope().find_var(gate_w_name).get_tensor().set(
+                gate_w, place)
 
-            gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                              .get_tensor())
-            gate_b = np.random.uniform(
-                -0.1, 0.1, size=gate_b.shape).astype('float32')
-            fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                        place)
+            gate_b = np.array(
+                fluid.global_scope().find_var(gate_b_name).get_tensor())
+            gate_b = np.random.uniform(-0.1, 0.1,
+                                       size=gate_b.shape).astype('float32')
+            fluid.global_scope().find_var(gate_b_name).get_tensor().set(
+                gate_b, place)
 
             gate_weight.append(gate_w)
             gate_bias.append(gate_b)
@@ -251,57 +253,56 @@ def test_run(self):
                 gate_b_name = "basic_lstm_reverse_layers_" + str(
                     i) + "/BasicLSTMUnit_0.b_0"
 
-                gate_w = np.array(fluid.global_scope().find_var(gate_w_name)
-                                  .get_tensor())
-                gate_w = np.random.uniform(
-                    -0.1, 0.1, size=gate_w.shape).astype('float32')
+                gate_w = np.array(
+                    fluid.global_scope().find_var(gate_w_name).get_tensor())
+                gate_w = np.random.uniform(-0.1, 0.1,
+                                           size=gate_w.shape).astype('float32')
                 fluid.global_scope().find_var(gate_w_name).get_tensor().set(
                     gate_w, place)
 
-                gate_b = np.array(fluid.global_scope().find_var(gate_b_name)
-                                  .get_tensor())
-                gate_b = np.random.uniform(
-                    -0.1, 0.1, size=gate_b.shape).astype('float32')
+                gate_b = np.array(
+                    fluid.global_scope().find_var(gate_b_name).get_tensor())
+                gate_b = np.random.uniform(-0.1, 0.1,
+                                           size=gate_b.shape).astype('float32')
                 fluid.global_scope().find_var(gate_b_name).get_tensor().set(
                     gate_b, place)
 
                 gate_weight.append(gate_w)
                 gate_bias.append(gate_b)
 
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.seq_len, self.batch_size, self.hidden_size)).astype('float32')
+        step_input_np = np.random.uniform(
+            -0.1, 0.1,
+            (self.seq_len, self.batch_size, self.hidden_size)).astype('float32')
         sequence_length_np = np.random.randint(
             self.seq_len // 2, self.seq_len,
             size=(self.batch_size)).astype('int64')
 
-        out = exe.run(
-            feed={'x': step_input_np,
-                  'sequence_length': sequence_length_np},
-            fetch_list=[rnn_out, last_hidden, last_cell])
+        out = exe.run(feed={
+            'x': step_input_np,
+            'sequence_length': sequence_length_np
+        },
+                      fetch_list=[rnn_out, last_hidden, last_cell])
 
         api_rnn_out = out[0]
         api_last_hidden = out[1]
         api_last_cell = out[2]
 
-        np_out = lstm_np(
-            step_input_np,
-            None,
-            None,
-            self.hidden_size,
-            gate_weight,
-            gate_bias,
-            num_layers=self.num_layers,
-            batch_first=self.batch_first,
-            is_bidirect=self.is_bidirect,
-            sequence_length=sequence_length_np)
+        np_out = lstm_np(step_input_np,
+                         None,
+                         None,
+                         self.hidden_size,
+                         gate_weight,
+                         gate_bias,
+                         num_layers=self.num_layers,
+                         batch_first=self.batch_first,
+                         is_bidirect=self.is_bidirect,
+                         sequence_length=sequence_length_np)
 
         self.assertTrue(np.allclose(api_rnn_out, np_out[0], rtol=1e-4, atol=0))
         self.assertTrue(
-            np.allclose(
-                api_last_hidden, np_out[1], rtol=1e-4, atol=0))
-        self.assertTrue(
-            np.allclose(
-                api_last_cell, np_out[2], rtol=1e-4, atol=0))
+            np.allclose(api_last_hidden, np_out[1], rtol=1e-4, atol=0))
+        self.assertTrue(np.allclose(api_last_cell, np_out[2], rtol=1e-4,
+                                    atol=0))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py
index b94ac1db665c4..9f76d7d736f8c 100644
--- a/python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py
@@ -59,16 +59,19 @@ def step(step_in, pre_hidden, pre_cell, gate_w, gate_b, forget_bias=1.0):
 
 
 class TestBasicGRUUnit(unittest.TestCase):
+
     def setUp(self):
         self.hidden_size = 5
         self.batch_size = 5
 
     def test_run(self):
         x = layers.data(name='x', shape=[-1, self.hidden_size], dtype='float32')
-        pre_hidden = layers.data(
-            name="pre_hidden", shape=[-1, self.hidden_size], dtype='float32')
-        pre_cell = layers.data(
-            name="pre_cell", shape=[-1, self.hidden_size], dtype='float32')
+        pre_hidden = layers.data(name="pre_hidden",
+                                 shape=[-1, self.hidden_size],
+                                 dtype='float32')
+        pre_cell = layers.data(name="pre_cell",
+                               shape=[-1, self.hidden_size],
+                               dtype='float32')
 
         lstm_unit = BasicLSTMUnit("lstm_unit", self.hidden_size)
 
@@ -92,26 +95,26 @@ def test_run(self):
         gate_w_name = "lstm_unit/BasicLSTMUnit_0.w_0"
         gate_b_name = "lstm_unit/BasicLSTMUnit_0.b_0"
 
-        gate_w = np.array(fluid.global_scope().find_var(gate_w_name).get_tensor(
-        ))
-        gate_w = np.random.uniform(
-            -0.1, 0.1, size=gate_w.shape).astype('float32')
-        fluid.global_scope().find_var(gate_w_name).get_tensor().set(gate_w,
-                                                                    place)
-
-        gate_b = np.array(fluid.global_scope().find_var(gate_b_name).get_tensor(
-        ))
-        gate_b = np.random.uniform(
-            -0.1, 0.1, size=gate_b.shape).astype('float32')
-        fluid.global_scope().find_var(gate_b_name).get_tensor().set(gate_b,
-                                                                    place)
-
-        step_input_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-        pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
-        pre_cell_np = np.random.uniform(-0.1, 0.1, (
-            self.batch_size, self.hidden_size)).astype('float32')
+        gate_w = np.array(
+            fluid.global_scope().find_var(gate_w_name).get_tensor())
+        gate_w = np.random.uniform(-0.1, 0.1,
+                                   size=gate_w.shape).astype('float32')
+        fluid.global_scope().find_var(gate_w_name).get_tensor().set(
+            gate_w, place)
+
+        gate_b = np.array(
+            fluid.global_scope().find_var(gate_b_name).get_tensor())
+        gate_b = np.random.uniform(-0.1, 0.1,
+                                   size=gate_b.shape).astype('float32')
+        fluid.global_scope().find_var(gate_b_name).get_tensor().set(
+            gate_b, place)
+
+        step_input_np = np.random.uniform(
+            -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')
+        pre_hidden_np = np.random.uniform(
+            -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')
+        pre_cell_np = np.random.uniform(
+            -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')
 
         out = exe.run( feed={ 'x' : step_input_np, 'pre_hidden' : pre_hidden_np, \
                               'pre_cell' : pre_cell_np },
@@ -124,11 +127,9 @@ def test_run(self):
                                           pre_cell_np, gate_w, gate_b)
 
         self.assertTrue(
-            np.allclose(
-                api_hidden_out, np_hidden_out, rtol=1e-4, atol=0))
+            np.allclose(api_hidden_out, np_hidden_out, rtol=1e-4, atol=0))
         self.assertTrue(
-            np.allclose(
-                api_cell_out, np_cell_out, rtol=1e-4, atol=0))
+            np.allclose(api_cell_out, np_cell_out, rtol=1e-4, atol=0))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_basic_rnn_name.py b/python/paddle/fluid/tests/unittests/test_basic_rnn_name.py
index 303ff9c86a6c6..34ee0e1693d85 100644
--- a/python/paddle/fluid/tests/unittests/test_basic_rnn_name.py
+++ b/python/paddle/fluid/tests/unittests/test_basic_rnn_name.py
@@ -27,6 +27,7 @@
 
 
 class TestBasicGRUApiName(unittest.TestCase):
+
     def setUp(self):
         self.name_set = set([
             "test1_fw_w_0_gate", "test1_fw_w_0_candidate", "test1_fw_b_0_gate",
@@ -45,14 +46,15 @@ def test_name(self):
         batch_first = False
 
         with new_program_scope():
-            input = layers.data(
-                name="input",
-                shape=[-1, batch_size, input_size],
-                dtype='float32')
-            pre_hidden = layers.data(
-                name="pre_hidden", shape=[-1, hidden_size], dtype='float32')
-            sequence_length = layers.data(
-                name="sequence_length", shape=[-1], dtype='int32')
+            input = layers.data(name="input",
+                                shape=[-1, batch_size, input_size],
+                                dtype='float32')
+            pre_hidden = layers.data(name="pre_hidden",
+                                     shape=[-1, hidden_size],
+                                     dtype='float32')
+            sequence_length = layers.data(name="sequence_length",
+                                          shape=[-1],
+                                          dtype='int32')
 
 
             rnn_out, last_hidden = basic_gru( input, pre_hidden, hidden_size, num_layers = num_layers, \
@@ -67,6 +69,7 @@ def test_name(self):
 
 
 class TestBasicLSTMApiName(unittest.TestCase):
+
     def setUp(self):
         self.name_set = set([
             "test1_fw_w_0", "test1_fw_b_0", "test1_fw_w_1", "test1_fw_b_1",
@@ -83,16 +86,18 @@ def test_name(self):
         batch_first = False
 
         with new_program_scope():
-            input = layers.data(
-                name="input",
-                shape=[-1, batch_size, input_size],
-                dtype='float32')
-            pre_hidden = layers.data(
-                name="pre_hidden", shape=[-1, hidden_size], dtype='float32')
-            pre_cell = layers.data(
-                name="pre_cell", shape=[-1, hidden_size], dtype='float32')
-            sequence_length = layers.data(
-                name="sequence_length", shape=[-1], dtype='int32')
+            input = layers.data(name="input",
+                                shape=[-1, batch_size, input_size],
+                                dtype='float32')
+            pre_hidden = layers.data(name="pre_hidden",
+                                     shape=[-1, hidden_size],
+                                     dtype='float32')
+            pre_cell = layers.data(name="pre_cell",
+                                   shape=[-1, hidden_size],
+                                   dtype='float32')
+            sequence_length = layers.data(name="sequence_length",
+                                          shape=[-1],
+                                          dtype='int32')
 
             rnn_out, last_hidden, last_cell = basic_lstm( input, pre_hidden, pre_cell, \
                 hidden_size, num_layers = num_layers, \
diff --git a/python/paddle/fluid/tests/unittests/test_batch_fc_op.py b/python/paddle/fluid/tests/unittests/test_batch_fc_op.py
index 56631d8d3b4ad..00c743eded512 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_fc_op.py
@@ -35,6 +35,7 @@ def np_cal_batchfc(input, w, bias):
 
 
 class TestBatchFCOp(OpTest):
+
     def config(self):
         self.slot_pairs_num = 10
         self.batch_size = 5
@@ -46,10 +47,10 @@ def setUp(self):
         self.config()
         self.input = np.random.random((self.slot_pairs_num, self.batch_size,
                                        self.in_dim)).astype(self.dtype)
-        self.w = np.random.random((self.slot_pairs_num, self.in_dim,
-                                   self.out_dim)).astype(self.dtype)
-        self.bias = np.random.random((self.slot_pairs_num,
-                                      self.out_dim)).astype(self.dtype)
+        self.w = np.random.random(
+            (self.slot_pairs_num, self.in_dim, self.out_dim)).astype(self.dtype)
+        self.bias = np.random.random(
+            (self.slot_pairs_num, self.out_dim)).astype(self.dtype)
         self.op_type = "batch_fc"
         np_out = np_cal_batchfc(self.input, self.w, self.bias)
         np_out = np_out.astype(self.dtype)
@@ -62,11 +63,12 @@ def test_check_output_gpu(self):
 
     def test_check_grad_gpu(self):
         if core.is_compiled_with_cuda():
-            self.check_grad_with_place(
-                core.CUDAPlace(0), ["Bias", "W", "Input"], "Out")
+            self.check_grad_with_place(core.CUDAPlace(0),
+                                       ["Bias", "W", "Input"], "Out")
 
 
 class TestBatchFCOp1(OpTest):
+
     def config(self):
         self.slot_pairs_num = 10
         self.batch_size = 5
@@ -78,10 +80,10 @@ def setUp(self):
         self.config()
         self.input = np.random.random((self.slot_pairs_num, self.batch_size,
                                        self.in_dim)).astype(self.dtype)
-        self.w = np.random.random((self.slot_pairs_num, self.in_dim,
-                                   self.out_dim)).astype(self.dtype)
-        self.bias = np.random.random((self.slot_pairs_num,
-                                      self.out_dim)).astype(self.dtype)
+        self.w = np.random.random(
+            (self.slot_pairs_num, self.in_dim, self.out_dim)).astype(self.dtype)
+        self.bias = np.random.random(
+            (self.slot_pairs_num, self.out_dim)).astype(self.dtype)
         self.op_type = "batch_fc"
         np_out = np_cal_batchfc(self.input, self.w, self.bias)
         np_out = np_out.astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index b02df024518a8..b312baea932a3 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -156,9 +156,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
         x = np.transpose(x, (0, 2, 3, 1))
         y_grad = np.transpose(y_grad, (0, 2, 3, 1))
 
-    x_grad = scale * (y_grad - np.mean(
-        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
-            y_grad * (x - mean), axis=(0, 1, 2)) /
+    x_grad = scale * (y_grad - np.mean(y_grad, axis=(0, 1, 2)) -
+                      (x - mean) * np.mean(y_grad *
+                                           (x - mean), axis=(0, 1, 2)) /
                       (var + epsilon)) / np.sqrt(var + epsilon)
     grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
                         axis=(0, 1, 2))
@@ -186,6 +186,7 @@ def create_or_get_tensor(scope, var_name, var, place):
 
 
 def set_output_grad(scope, outputs, place, feed_dict=None):
+
     def __set_tensor__(name, data=None):
         out_tensor = scope.find_var(name).get_tensor()
         grad_tensor = scope.var(grad_var_name(name)).get_tensor()
@@ -207,6 +208,7 @@ def __set_tensor__(name, data=None):
 
 
 class TestBatchNormOpInference(unittest.TestCase):
+
     def setUp(self):
         self.dtype = np.float32
         self.use_mkldnn = False
@@ -252,8 +254,8 @@ def check_with_place(self, place, data_layout, dtype, shape):
                                         OpTest.np_dtype_to_fluid_dtype(x_val),
                                         place)
         scale_tensor = create_or_get_tensor(
-            scope, "scale_val",
-            OpTest.np_dtype_to_fluid_dtype(scale_val), place)
+            scope, "scale_val", OpTest.np_dtype_to_fluid_dtype(scale_val),
+            place)
         bias_tensor = create_or_get_tensor(
             scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place)
         mean_tensor = create_or_get_tensor(scope, "mean",
@@ -300,7 +302,7 @@ def check_with_place(self, place, data_layout, dtype, shape):
         # of memory descripting. So we need to convert NCHW
         # dims into NHWC.
         if data_layout == "NHWC" and self.use_mkldnn == True:
-            # Create executor to have MKL-DNN cache 
+            # Create executor to have MKL-DNN cache
             # cleared after NHWC unit test
             place = core.CPUPlace()
             exe = fluid.Executor(place)
@@ -310,13 +312,12 @@ def check_with_place(self, place, data_layout, dtype, shape):
             y_tensor._set_dims(dims)
 
         # check inference result
-        self.__assert_close(
-            y_tensor,
-            y_out,
-            "inference output are different at " + str(place) + ", " +
-            data_layout + ", " + str(np.dtype(dtype)) +
-            str(np.array(y_tensor)) + str(y_out),
-            atol=1e-3)
+        self.__assert_close(y_tensor,
+                            y_out,
+                            "inference output are different at " + str(place) +
+                            ", " + data_layout + ", " + str(np.dtype(dtype)) +
+                            str(np.array(y_tensor)) + str(y_out),
+                            atol=1e-3)
 
     def test_check_output(self):
         places = [core.CPUPlace()]
@@ -334,6 +335,7 @@ def init_kernel_type(self):
 
 
 class TestFP16BatchNormOpInference(TestBatchNormOpInference):
+
     def setUp(self):
         self.dtype = np.float16
         self.use_mkldnn = False
@@ -355,6 +357,7 @@ def test_check_output(self):
 
 
 class TestBatchNormOpTraining(unittest.TestCase):
+
     def setUp(self):
         self.use_mkldnn = False
         self.fuse_with_relu = False
@@ -385,8 +388,9 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
         variance_out = var_ref * (1. - momentum) + momentum * variance
         saved_variance = 1. / np.sqrt(var_ref + epsilon)
         # run backward
-        x_grad, scale_grad, bias_grad = _reference_grad(
-            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
+        x_grad, scale_grad, bias_grad = _reference_grad(x, y_grad, scale,
+                                                        saved_mean, var_ref,
+                                                        epsilon, data_layout)
 
         return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
@@ -402,6 +406,7 @@ def set_mean_variance(self, scale_shape, x, data_layout):
         return mean, variance
 
     def test_forward_backward(self):
+
         def test_with_place(place, data_layout, shape):
             # attr
             epsilon = self.epsilon
@@ -440,10 +445,9 @@ def test_with_place(place, data_layout, shape):
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
                 inputs = {
                     "X": block.var('x'),
                     "Scale": block.var('scale'),
@@ -473,11 +477,10 @@ def test_with_place(place, data_layout, shape):
                 }
                 block.create_var(name="reserve_space", dtype='float32')
                 outputs["ReserveSpace"] = block.var('reserve_space')
-                bn_op = block.append_op(
-                    type="batch_norm",
-                    inputs=inputs,
-                    outputs=outputs,
-                    attrs=attrs)
+                bn_op = block.append_op(type="batch_norm",
+                                        inputs=inputs,
+                                        outputs=outputs,
+                                        attrs=attrs)
                 block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
 
                 # generate backward op_desc
@@ -509,8 +512,10 @@ def test_with_place(place, data_layout, shape):
 
             for id, name in enumerate(self.fetch_list):
                 if name == 'variance':
-                    self.__assert_close(
-                        var_dict[name], out[id], name, atol=1e-3)
+                    self.__assert_close(var_dict[name],
+                                        out[id],
+                                        name,
+                                        atol=1e-3)
                     continue
                 self.__assert_close(var_dict[name], out[id], name)
             print("op test forward passed: ", str(place), data_layout)
@@ -529,6 +534,7 @@ def init_kernel_type(self):
 
 
 class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -536,6 +542,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpTrainingCase2(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set()
@@ -547,6 +554,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpTrainingCase3(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set(['x@GRAD'])
@@ -554,6 +562,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_momentum_variable = True
         self.use_global_stats = False
@@ -565,6 +574,7 @@ def init_test_case(self):
 
 
 class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
+
     def init_test_case(self):
         self.use_global_stats = True
         self.no_grad_set = set()
@@ -619,6 +629,7 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
 
 class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
         TestBatchNormOpFreezeStatsTraining):
+
     def init_test_case(self):
         self.use_global_stats = True
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -626,11 +637,12 @@ def init_test_case(self):
 
 
 class TestBatchNormOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of batch_norm must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.batch_norm, x1)
 
             # the input dtype of batch_norm must be float16 or float32 or float64
@@ -640,12 +652,13 @@ def test_errors(self):
 
 
 class TestDygraphBatchNormAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_norm = fluid.dygraph.BatchNorm(10)
             # the input of BatchNorm must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, batch_norm, x1)
 
             # the input dtype of BatchNorm must be float16 or float32 or float64
@@ -655,6 +668,7 @@ def test_errors(self):
 
 
 class TestDygraphBatchNormTrainableStats(unittest.TestCase):
+
     def test_dygraph(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -703,6 +717,7 @@ def compute(x_np, is_test, trainable_statistics):
 
 
 class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
+
     def test_reservespace(self):
         with program_guard(Program(), Program()):
             paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
index ac09d9f5fdfd0..9db95f094a7e3 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -26,6 +26,7 @@
 
 
 class TestBatchNorm(unittest.TestCase):
+
     def test_name(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -159,8 +160,9 @@ def compute_v3(x, is_test, trainable_statistics):
 
             def compute_v4(x):
                 with fluid.dygraph.guard(p):
-                    bn = paddle.nn.BatchNorm2D(
-                        shape[1], weight_attr=False, bias_attr=False)
+                    bn = paddle.nn.BatchNorm2D(shape[1],
+                                               weight_attr=False,
+                                               bias_attr=False)
                     y = bn(paddle.to_tensor(x))
                 return y.numpy()
 
@@ -208,6 +210,7 @@ def compute_v2(x_np):
 
 
 class TestBatchNormChannelLast(unittest.TestCase):
+
     def setUp(self):
         self.original_dtyep = paddle.get_default_dtype()
         # MIOPEN not support data type of double
@@ -237,8 +240,7 @@ def test_1d(self):
                 if core.is_compiled_with_rocm():
                     # HIP will fail if no atol
                     self.assertEqual(
-                        np.allclose(
-                            y1.numpy(), y2.numpy(), atol=1e-07), True)
+                        np.allclose(y1.numpy(), y2.numpy(), atol=1e-07), True)
                 else:
                     self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True)
 
@@ -257,8 +259,7 @@ def test_2d(self):
                 if core.is_compiled_with_rocm():
                     # HIP will fail if no atol
                     self.assertEqual(
-                        np.allclose(
-                            y1.numpy(), y2.numpy(), atol=1e-07), True)
+                        np.allclose(y1.numpy(), y2.numpy(), atol=1e-07), True)
                 else:
                     self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True)
 
@@ -277,13 +278,13 @@ def test_3d(self):
                 if core.is_compiled_with_rocm():
                     # HIP will fail if no atol
                     self.assertEqual(
-                        np.allclose(
-                            y1.numpy(), y2.numpy(), atol=1e-07), True)
+                        np.allclose(y1.numpy(), y2.numpy(), atol=1e-07), True)
                 else:
                     self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True)
 
 
 class TestBatchNormUseGlobalStats(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
index 4c5338314afb1..279176e0b5709 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
@@ -26,6 +26,7 @@
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, sample_num, class_num):
         self.sample_num = sample_num
         self.class_num = class_num
@@ -41,6 +42,7 @@ def __len__(self):
 
 
 class TestSampler(unittest.TestCase):
+
     def test_main(self):
         dataset = RandomDataset(100, 10)
         sampler = Sampler(dataset)
@@ -52,6 +54,7 @@ def test_main(self):
 
 
 class TestSequenceSampler(unittest.TestCase):
+
     def test_main(self):
         dataset = RandomDataset(100, 10)
         sampler = SequenceSampler(dataset)
@@ -62,6 +65,7 @@ def test_main(self):
 
 
 class TestRandomSampler(unittest.TestCase):
+
     def test_main(self):
         dataset = RandomDataset(100, 10)
         sampler = RandomSampler(dataset)
@@ -96,8 +100,10 @@ def test_with_generator(self):
     def test_with_generator_num_samples(self):
         dataset = RandomDataset(100, 10)
         generator = iter(range(0, 60))
-        sampler = RandomSampler(
-            dataset, generator=generator, num_samples=50, replacement=True)
+        sampler = RandomSampler(dataset,
+                                generator=generator,
+                                num_samples=50,
+                                replacement=True)
         assert len(sampler) == 50
 
         rets = []
@@ -107,6 +113,7 @@ def test_with_generator_num_samples(self):
 
 
 class TestBatchSampler(unittest.TestCase):
+
     def setUp(self):
         self.num_samples = 1000
         self.num_classes = 10
@@ -116,11 +123,10 @@ def setUp(self):
 
     def init_batch_sampler(self):
         dataset = RandomDataset(self.num_samples, self.num_classes)
-        bs = BatchSampler(
-            dataset=dataset,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle,
-            drop_last=self.drop_last)
+        bs = BatchSampler(dataset=dataset,
+                          batch_size=self.batch_size,
+                          shuffle=self.shuffle,
+                          drop_last=self.drop_last)
         return bs
 
     def test_main(self):
@@ -140,6 +146,7 @@ def test_main(self):
 
 
 class TestBatchSamplerDropLast(TestBatchSampler):
+
     def setUp(self):
         self.num_samples = 1000
         self.num_classes = 10
@@ -149,6 +156,7 @@ def setUp(self):
 
 
 class TestBatchSamplerShuffle(TestBatchSampler):
+
     def setUp(self):
         self.num_samples = 1000
         self.num_classes = 10
@@ -158,17 +166,18 @@ def setUp(self):
 
 
 class TestBatchSamplerWithSampler(TestBatchSampler):
+
     def init_batch_sampler(self):
         dataset = RandomDataset(1000, 10)
         sampler = SequenceSampler(dataset)
-        bs = BatchSampler(
-            sampler=sampler,
-            batch_size=self.batch_size,
-            drop_last=self.drop_last)
+        bs = BatchSampler(sampler=sampler,
+                          batch_size=self.batch_size,
+                          drop_last=self.drop_last)
         return bs
 
 
 class TestBatchSamplerWithSamplerDropLast(unittest.TestCase):
+
     def setUp(self):
         self.num_samples = 1000
         self.num_classes = 10
@@ -178,6 +187,7 @@ def setUp(self):
 
 
 class TestBatchSamplerWithSamplerShuffle(unittest.TestCase):
+
     def setUp(self):
         self.num_samples = 1000
         self.num_classes = 10
@@ -189,17 +199,17 @@ def test_main(self):
         try:
             dataset = RandomDataset(self.num_samples, self.num_classes)
             sampler = RandomSampler(dataset)
-            bs = BatchSampler(
-                sampler=sampler,
-                shuffle=self.shuffle,
-                batch_size=self.batch_size,
-                drop_last=self.drop_last)
+            bs = BatchSampler(sampler=sampler,
+                              shuffle=self.shuffle,
+                              batch_size=self.batch_size,
+                              drop_last=self.drop_last)
             self.assertTrue(False)
         except AssertionError:
             pass
 
 
 class TestWeightedRandomSampler(unittest.TestCase):
+
     def init_probs(self, total, pos):
         pos_probs = np.random.random((pos, )).astype('float32')
         probs = np.zeros((total, )).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py
index 1051fa9c1aefa..b1f16a4cecd1b 100644
--- a/python/paddle/fluid/tests/unittests/test_bce_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py
@@ -27,23 +27,27 @@ def test_static_layer(place,
     prog = paddle.static.Program()
     startup_prog = paddle.static.Program()
     with paddle.static.program_guard(prog, startup_prog):
-        input = paddle.fluid.data(
-            name='input', shape=input_np.shape, dtype='float64')
-        label = paddle.fluid.data(
-            name='label', shape=label_np.shape, dtype='float64')
+        input = paddle.fluid.data(name='input',
+                                  shape=input_np.shape,
+                                  dtype='float64')
+        label = paddle.fluid.data(name='label',
+                                  shape=label_np.shape,
+                                  dtype='float64')
         if weight_np is not None:
-            weight = paddle.fluid.data(
-                name='weight', shape=weight_np.shape, dtype='float64')
-            bce_loss = paddle.nn.loss.BCELoss(
-                weight=weight, reduction=reduction)
+            weight = paddle.fluid.data(name='weight',
+                                       shape=weight_np.shape,
+                                       dtype='float64')
+            bce_loss = paddle.nn.loss.BCELoss(weight=weight,
+                                              reduction=reduction)
         else:
             bce_loss = paddle.nn.loss.BCELoss(reduction=reduction)
         res = bce_loss(input, label)
         exe = paddle.static.Executor(place)
         static_result = exe.run(prog,
-                                feed={"input": input_np,
-                                      "label": label_np}
-                                if weight_np is None else {
+                                feed={
+                                    "input": input_np,
+                                    "label": label_np
+                                } if weight_np is None else {
                                     "input": input_np,
                                     "label": label_np,
                                     "weight": weight_np
@@ -60,23 +64,30 @@ def test_static_functional(place,
     prog = paddle.static.Program()
     startup_prog = paddle.static.Program()
     with paddle.static.program_guard(prog, startup_prog):
-        input = paddle.fluid.data(
-            name='input', shape=input_np.shape, dtype='float64')
-        label = paddle.fluid.data(
-            name='label', shape=label_np.shape, dtype='float64')
+        input = paddle.fluid.data(name='input',
+                                  shape=input_np.shape,
+                                  dtype='float64')
+        label = paddle.fluid.data(name='label',
+                                  shape=label_np.shape,
+                                  dtype='float64')
         if weight_np is not None:
-            weight = paddle.fluid.data(
-                name='weight', shape=weight_np.shape, dtype='float64')
-            res = paddle.nn.functional.binary_cross_entropy(
-                input, label, weight=weight, reduction=reduction)
+            weight = paddle.fluid.data(name='weight',
+                                       shape=weight_np.shape,
+                                       dtype='float64')
+            res = paddle.nn.functional.binary_cross_entropy(input,
+                                                            label,
+                                                            weight=weight,
+                                                            reduction=reduction)
         else:
-            res = paddle.nn.functional.binary_cross_entropy(
-                input, label, reduction=reduction)
+            res = paddle.nn.functional.binary_cross_entropy(input,
+                                                            label,
+                                                            reduction=reduction)
         exe = paddle.static.Executor(place)
         static_result = exe.run(prog,
-                                feed={"input": input_np,
-                                      "label": label_np}
-                                if weight_np is None else {
+                                feed={
+                                    "input": input_np,
+                                    "label": label_np
+                                } if weight_np is None else {
                                     "input": input_np,
                                     "label": label_np,
                                     "weight": weight_np
@@ -113,11 +124,14 @@ def test_dygraph_functional(place,
 
     if weight_np is not None:
         weight = paddle.to_tensor(weight_np)
-        dy_res = paddle.nn.functional.binary_cross_entropy(
-            input, label, weight=weight, reduction=reduction)
+        dy_res = paddle.nn.functional.binary_cross_entropy(input,
+                                                           label,
+                                                           weight=weight,
+                                                           reduction=reduction)
     else:
-        dy_res = paddle.nn.functional.binary_cross_entropy(
-            input, label, reduction=reduction)
+        dy_res = paddle.nn.functional.binary_cross_entropy(input,
+                                                           label,
+                                                           reduction=reduction)
     dy_result = dy_res.numpy()
     paddle.enable_static()
     return dy_result
@@ -142,6 +156,7 @@ def calc_bceloss(input_np, label_np, reduction='mean', weight_np=None):
 
 
 class TestBCELoss(unittest.TestCase):
+
     def test_BCELoss(self):
         input_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64)
         label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64)
@@ -159,8 +174,8 @@ def test_BCELoss(self):
                 self.assertTrue(np.allclose(static_result, expected))
                 self.assertTrue(np.allclose(static_result, dy_result))
                 self.assertTrue(np.allclose(dy_result, expected))
-                static_functional = test_static_functional(place, input_np,
-                                                           label_np, reduction)
+                static_functional = test_static_functional(
+                    place, input_np, label_np, reduction)
                 dy_functional = test_dygraph_functional(place, input_np,
                                                         label_np, reduction)
                 self.assertTrue(np.allclose(static_functional, expected))
@@ -168,43 +183,57 @@ def test_BCELoss(self):
                 self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCELoss_weight(self):
-        input_np = np.random.uniform(
-            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
-        label_np = np.random.randint(
-            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        input_np = np.random.uniform(0.1, 0.8,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(0, 2,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
         weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         for reduction in ['sum', 'mean', 'none']:
-            static_result = test_static_layer(
-                place, input_np, label_np, reduction, weight_np=weight_np)
-            dy_result = test_dygraph_layer(
-                place, input_np, label_np, reduction, weight_np=weight_np)
-            expected = calc_bceloss(
-                input_np, label_np, reduction, weight_np=weight_np)
+            static_result = test_static_layer(place,
+                                              input_np,
+                                              label_np,
+                                              reduction,
+                                              weight_np=weight_np)
+            dy_result = test_dygraph_layer(place,
+                                           input_np,
+                                           label_np,
+                                           reduction,
+                                           weight_np=weight_np)
+            expected = calc_bceloss(input_np,
+                                    label_np,
+                                    reduction,
+                                    weight_np=weight_np)
             self.assertTrue(np.allclose(static_result, expected))
             self.assertTrue(np.allclose(static_result, dy_result))
             self.assertTrue(np.allclose(dy_result, expected))
-            static_functional = test_static_functional(
-                place, input_np, label_np, reduction, weight_np=weight_np)
-            dy_functional = test_dygraph_functional(
-                place, input_np, label_np, reduction, weight_np=weight_np)
+            static_functional = test_static_functional(place,
+                                                       input_np,
+                                                       label_np,
+                                                       reduction,
+                                                       weight_np=weight_np)
+            dy_functional = test_dygraph_functional(place,
+                                                    input_np,
+                                                    label_np,
+                                                    reduction,
+                                                    weight_np=weight_np)
             self.assertTrue(np.allclose(static_functional, expected))
             self.assertTrue(np.allclose(static_functional, dy_functional))
             self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCELoss_error(self):
         paddle.disable_static()
-        self.assertRaises(
-            ValueError, paddle.nn.loss.BCELoss, reduction="unsupport reduction")
+        self.assertRaises(ValueError,
+                          paddle.nn.loss.BCELoss,
+                          reduction="unsupport reduction")
         input = paddle.to_tensor([[0.1, 0.3]], dtype='float32')
         label = paddle.to_tensor([[0.0, 1.0]], dtype='float32')
-        self.assertRaises(
-            ValueError,
-            paddle.nn.functional.binary_cross_entropy,
-            input=input,
-            label=label,
-            reduction="unsupport reduction")
+        self.assertRaises(ValueError,
+                          paddle.nn.functional.binary_cross_entropy,
+                          input=input,
+                          label=label,
+                          reduction="unsupport reduction")
         paddle.enable_static()
 
 
@@ -213,6 +242,7 @@ def bce_loss(input, label):
 
 
 class TestBceLossOp(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "bce_loss"
@@ -234,11 +264,13 @@ def init_test_case(self):
 
 
 class TestBceLossOpCase1(OpTest):
+
     def init_test_cast(self):
         self.shape = [2, 3, 4, 5]
 
 
 class TestBceLossOpCase2(OpTest):
+
     def init_test_cast(self):
         self.shape = [2, 3, 20]
 
diff --git a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py
index ea6d82d15ce0c..de78c4edcf54b 100644
--- a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py
@@ -20,10 +20,14 @@
 from paddle.fluid.framework import _test_eager_guard
 
 
-def call_bce_layer(logit, label, weight=None, reduction='mean',
+def call_bce_layer(logit,
+                   label,
+                   weight=None,
+                   reduction='mean',
                    pos_weight=None):
-    bce_logit_loss = paddle.nn.loss.BCEWithLogitsLoss(
-        weight=weight, reduction=reduction, pos_weight=pos_weight)
+    bce_logit_loss = paddle.nn.loss.BCEWithLogitsLoss(weight=weight,
+                                                      reduction=reduction,
+                                                      pos_weight=pos_weight)
     res = bce_logit_loss(logit, label)
     return res
 
@@ -49,21 +53,25 @@ def test_static(place,
     prog = paddle.static.Program()
     startup_prog = paddle.static.Program()
     with paddle.static.program_guard(prog, startup_prog):
-        logit = paddle.fluid.data(
-            name='logit', shape=logit_np.shape, dtype='float64')
-        label = paddle.fluid.data(
-            name='label', shape=label_np.shape, dtype='float64')
+        logit = paddle.fluid.data(name='logit',
+                                  shape=logit_np.shape,
+                                  dtype='float64')
+        label = paddle.fluid.data(name='label',
+                                  shape=label_np.shape,
+                                  dtype='float64')
         feed_dict = {"logit": logit_np, "label": label_np}
 
         pos_weight = None
         weight = None
         if pos_weight_np is not None:
-            pos_weight = paddle.fluid.data(
-                name='pos_weight', shape=pos_weight_np.shape, dtype='float64')
+            pos_weight = paddle.fluid.data(name='pos_weight',
+                                           shape=pos_weight_np.shape,
+                                           dtype='float64')
             feed_dict["pos_weight"] = pos_weight_np
         if weight_np is not None:
-            weight = paddle.fluid.data(
-                name='weight', shape=weight_np.shape, dtype='float64')
+            weight = paddle.fluid.data(name='weight',
+                                       shape=weight_np.shape,
+                                       dtype='float64')
             feed_dict["weight"] = weight_np
         if functional:
             res = call_bce_functional(logit, label, weight, reduction,
@@ -124,6 +132,7 @@ def calc_bce_with_logits_loss(logit_np,
 
 
 class TestBCEWithLogitsLoss(unittest.TestCase):
+
     def test_BCEWithLogitsLoss(self):
         logit_np = np.random.uniform(0.1, 0.8, size=(20, 30)).astype(np.float64)
         label_np = np.random.randint(0, 2, size=(20, 30)).astype(np.float64)
@@ -133,35 +142,36 @@ def test_BCEWithLogitsLoss(self):
         reductions = ['sum', 'mean', 'none']
         for place in places:
             for reduction in reductions:
-                static_result = test_static(
-                    place, logit_np, label_np, reduction=reduction)
-                dy_result = test_dygraph(
-                    place, logit_np, label_np, reduction=reduction)
+                static_result = test_static(place,
+                                            logit_np,
+                                            label_np,
+                                            reduction=reduction)
+                dy_result = test_dygraph(place,
+                                         logit_np,
+                                         label_np,
+                                         reduction=reduction)
                 expected = calc_bce_with_logits_loss(logit_np, label_np,
                                                      reduction)
                 self.assertTrue(np.allclose(static_result, expected))
                 self.assertTrue(np.allclose(static_result, dy_result))
                 self.assertTrue(np.allclose(dy_result, expected))
-                static_functional = test_static(
-                    place,
-                    logit_np,
-                    label_np,
-                    reduction=reduction,
-                    functional=True)
-                dy_functional = test_dygraph(
-                    place,
-                    logit_np,
-                    label_np,
-                    reduction=reduction,
-                    functional=True)
+                static_functional = test_static(place,
+                                                logit_np,
+                                                label_np,
+                                                reduction=reduction,
+                                                functional=True)
+                dy_functional = test_dygraph(place,
+                                             logit_np,
+                                             label_np,
+                                             reduction=reduction,
+                                             functional=True)
 
                 with _test_eager_guard():
-                    eager_functional = test_dygraph(
-                        place,
-                        logit_np,
-                        label_np,
-                        reduction=reduction,
-                        functional=True)
+                    eager_functional = test_dygraph(place,
+                                                    logit_np,
+                                                    label_np,
+                                                    reduction=reduction,
+                                                    functional=True)
 
                 self.assertTrue(np.allclose(static_functional, expected))
                 self.assertTrue(np.allclose(static_functional, dy_functional))
@@ -169,58 +179,56 @@ def test_BCEWithLogitsLoss(self):
                 self.assertTrue(np.allclose(eager_functional, expected))
 
     def test_BCEWithLogitsLoss_weight(self):
-        logit_np = np.random.uniform(
-            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
-        label_np = np.random.randint(
-            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        logit_np = np.random.uniform(0.1, 0.8,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(0, 2,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
         weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         for reduction in ['sum', 'mean', 'none']:
-            static_result = test_static(
-                place,
-                logit_np,
-                label_np,
-                weight_np=weight_np,
-                reduction=reduction)
-            dy_result = test_dygraph(
-                place,
-                logit_np,
-                label_np,
-                weight_np=weight_np,
-                reduction=reduction)
-            expected = calc_bce_with_logits_loss(
-                logit_np, label_np, reduction, weight_np=weight_np)
+            static_result = test_static(place,
+                                        logit_np,
+                                        label_np,
+                                        weight_np=weight_np,
+                                        reduction=reduction)
+            dy_result = test_dygraph(place,
+                                     logit_np,
+                                     label_np,
+                                     weight_np=weight_np,
+                                     reduction=reduction)
+            expected = calc_bce_with_logits_loss(logit_np,
+                                                 label_np,
+                                                 reduction,
+                                                 weight_np=weight_np)
             self.assertTrue(np.allclose(static_result, expected))
             self.assertTrue(np.allclose(static_result, dy_result))
             self.assertTrue(np.allclose(dy_result, expected))
-            static_functional = test_static(
-                place,
-                logit_np,
-                label_np,
-                weight_np=weight_np,
-                reduction=reduction,
-                functional=True)
-            dy_functional = test_dygraph(
-                place,
-                logit_np,
-                label_np,
-                weight_np=weight_np,
-                reduction=reduction,
-                functional=True)
+            static_functional = test_static(place,
+                                            logit_np,
+                                            label_np,
+                                            weight_np=weight_np,
+                                            reduction=reduction,
+                                            functional=True)
+            dy_functional = test_dygraph(place,
+                                         logit_np,
+                                         label_np,
+                                         weight_np=weight_np,
+                                         reduction=reduction,
+                                         functional=True)
             self.assertTrue(np.allclose(static_functional, expected))
             self.assertTrue(np.allclose(static_functional, dy_functional))
             self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCEWithLogitsLoss_pos_weight(self):
-        logit_np = np.random.uniform(
-            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
-        label_np = np.random.randint(
-            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        logit_np = np.random.uniform(0.1, 0.8,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(0, 2,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
         pos_weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)
         weight_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         reduction = "mean"
         static_result = test_static(place, logit_np, label_np, weight_np,
                                     reduction, pos_weight_np)
@@ -231,40 +239,36 @@ def test_BCEWithLogitsLoss_pos_weight(self):
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
         self.assertTrue(np.allclose(dy_result, expected))
-        static_functional = test_static(
-            place,
-            logit_np,
-            label_np,
-            weight_np,
-            reduction,
-            pos_weight_np,
-            functional=True)
-        dy_functional = test_dygraph(
-            place,
-            logit_np,
-            label_np,
-            weight_np,
-            reduction,
-            pos_weight_np,
-            functional=True)
+        static_functional = test_static(place,
+                                        logit_np,
+                                        label_np,
+                                        weight_np,
+                                        reduction,
+                                        pos_weight_np,
+                                        functional=True)
+        dy_functional = test_dygraph(place,
+                                     logit_np,
+                                     label_np,
+                                     weight_np,
+                                     reduction,
+                                     pos_weight_np,
+                                     functional=True)
         self.assertTrue(np.allclose(static_functional, expected))
         self.assertTrue(np.allclose(static_functional, dy_functional))
         self.assertTrue(np.allclose(dy_functional, expected))
 
     def test_BCEWithLogitsLoss_error(self):
         paddle.disable_static()
-        self.assertRaises(
-            ValueError,
-            paddle.nn.BCEWithLogitsLoss,
-            reduction="unsupport reduction")
+        self.assertRaises(ValueError,
+                          paddle.nn.BCEWithLogitsLoss,
+                          reduction="unsupport reduction")
         logit = paddle.to_tensor([[0.1, 0.3]], dtype='float32')
         label = paddle.to_tensor([[0.0, 1.0]], dtype='float32')
-        self.assertRaises(
-            ValueError,
-            paddle.nn.functional.binary_cross_entropy_with_logits,
-            logit=logit,
-            label=label,
-            reduction="unsupport reduction")
+        self.assertRaises(ValueError,
+                          paddle.nn.functional.binary_cross_entropy_with_logits,
+                          logit=logit,
+                          label=label,
+                          reduction="unsupport reduction")
         paddle.enable_static()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index cc3fab9056d55..b3206e385f498 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -43,37 +43,28 @@ def test_get_set(self):
         # beam_size = 2, end_id = 1
         # start with start_id
         [
-            self.append_lod_tensor(
-                array, [[0, 1, 2], [0, 1, 2]], np.array(
-                    [0, 0], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 1, 2], [0, 1, 2]],
+                                   np.array([0, 0], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 1, 2], [0, 2, 4]],
-                np.array(
-                    [2, 3, 4, 5], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 1, 2], [0, 2, 4]],
+                                   np.array([2, 3, 4, 5], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 2, 2, 4, 4]],
-                np.array(
-                    [3, 1, 5, 4], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 2, 4], [0, 2, 2, 4, 4]],
+                                   np.array([3, 1, 5, 4], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 1, 2, 3, 4]],
-                np.array(
-                    [1, 1, 3, 5], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 2, 4], [0, 1, 2, 3, 4]],
+                                   np.array([1, 1, 3, 5], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
         [
-            self.append_lod_tensor(
-                array, [[0, 2, 4], [0, 0, 0, 2, 2]],
-                np.array(
-                    [5, 1], dtype=dtype))
+            self.append_lod_tensor(array, [[0, 2, 4], [0, 0, 0, 2, 2]],
+                                   np.array([5, 1], dtype=dtype))
             for array, dtype in ((ids, "int64"), (scores, "float32"))
         ]
 
@@ -89,7 +80,8 @@ def test_get_set(self):
             SentenceIds="sentence_ids",
             SentenceScores="sentence_scores",
             beam_size=2,
-            end_id=1, )
+            end_id=1,
+        )
 
         beam_search_decode_op.run(self.scope, self.place)
 
@@ -100,19 +92,21 @@ def test_get_set(self):
         expected_data = np.array(
             [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64")
         self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
-        self.assertTrue(
-            np.array_equal(np.array(sentence_scores), expected_data))
+        self.assertTrue(np.array_equal(np.array(sentence_scores),
+                                       expected_data))
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
+
     def setUp(self):
         self.scope = core.Scope()
         self.place = core.CUDAPlace(0)
 
 
 class TestBeamSearchDecodeOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -120,8 +114,10 @@ def test_id_Variable():
                 # the input pre_ids must be Variable
                 test_ids = np.random.randint(1, 5, [5, 1]).astype("int64")
                 scores = fluid.layers.create_array(dtype='float32')
-                fluid.layers.beam_search_decode(
-                    test_ids, scores, beam_size=5, end_id=0)
+                fluid.layers.beam_search_decode(test_ids,
+                                                scores,
+                                                beam_size=5,
+                                                end_id=0)
 
             self.assertRaises(TypeError, test_id_Variable)
 
@@ -129,8 +125,10 @@ def test_score_Variable():
                 # the input pre_scores must be Variable
                 ids = fluid.layers.create_array(dtype='int64')
                 test_scores = np.random.uniform(1, 5, [5, 1]).astype("float32")
-                fluid.layers.beam_search_decode(
-                    ids, test_scores, beam_size=5, end_id=0)
+                fluid.layers.beam_search_decode(ids,
+                                                test_scores,
+                                                beam_size=5,
+                                                end_id=0)
 
             self.assertRaises(TypeError, test_score_Variable)
 
@@ -138,8 +136,10 @@ def test_id_dtype():
                 # the dtype of input pre_ids must be int64
                 type_ids = fluid.layers.create_array(dtype='float32')
                 scores = fluid.layers.create_array(dtype='float32')
-                fluid.layers.beam_search_decode(
-                    type_ids, scores, beam_size=5, end_id=0)
+                fluid.layers.beam_search_decode(type_ids,
+                                                scores,
+                                                beam_size=5,
+                                                end_id=0)
 
             self.assertRaises(TypeError, test_id_dtype)
 
@@ -147,8 +147,10 @@ def test_score_dtype():
                 # the dtype of input pre_scores must be float32
                 ids = fluid.layers.create_array(dtype='int64')
                 type_scores = fluid.layers.create_array(dtype='int64')
-                fluid.layers.beam_search_decode(
-                    ids, type_scores, beam_size=5, end_id=0)
+                fluid.layers.beam_search_decode(ids,
+                                                type_scores,
+                                                beam_size=5,
+                                                end_id=0)
 
             self.assertRaises(TypeError, test_score_dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index 99ca5779a6979..e4fe6580ea17d 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -44,19 +44,18 @@ def setUp(self):
         self.scope.var('parent_idx').get_tensor()
 
     def test_run(self):
-        op = Operator(
-            'beam_search',
-            pre_ids='pre_ids',
-            pre_scores='pre_scores',
-            ids='ids',
-            scores='scores',
-            selected_ids='selected_ids',
-            selected_scores='selected_scores',
-            parent_idx='parent_idx',
-            level=0,
-            beam_size=self.beam_size,
-            end_id=0,
-            is_accumulated=self.is_accumulated)
+        op = Operator('beam_search',
+                      pre_ids='pre_ids',
+                      pre_scores='pre_scores',
+                      ids='ids',
+                      scores='scores',
+                      selected_ids='selected_ids',
+                      selected_scores='selected_scores',
+                      parent_idx='parent_idx',
+                      level=0,
+                      beam_size=self.beam_size,
+                      end_id=0,
+                      is_accumulated=self.is_accumulated)
         op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
         selected_scores = self.scope.find_var("selected_scores").get_tensor()
@@ -78,20 +77,19 @@ def _create_pre_scores(self):
 
     def _create_ids(self):
         self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        np_data = np.array(
-            [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
+        np_data = np.array([[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]],
+                           dtype='int64')
         tensor = create_tensor(self.scope, "ids", np_data)
         tensor.set_lod(self.lod)
 
     def _create_scores(self):
-        np_data = np.array(
-            [
-                [0.5, 0.3, 0.2],
-                [0.6, 0.3, 0.1],
-                [0.9, 0.5, 0.1],
-                [0.7, 0.5, 0.1],
-            ],
-            dtype='float32')
+        np_data = np.array([
+            [0.5, 0.3, 0.2],
+            [0.6, 0.3, 0.1],
+            [0.9, 0.5, 0.1],
+            [0.7, 0.5, 0.1],
+        ],
+                           dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
         tensor.set_lod(self.lod)
 
@@ -105,6 +103,7 @@ def set_outputs(self):
 
 
 class BeamSearchOpTester2(BeamSearchOpTester):
+
     def _create_pre_ids(self):
         np_data = np.array([[1], [2], [3], [4]], dtype='int64')
         tensor = create_tensor(self.scope, 'pre_ids', np_data)
@@ -120,13 +119,13 @@ def _create_ids(self):
         tensor.set_lod(self.lod)
 
     def _create_scores(self):
-        np_data = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.1, 0.7],
-            ], dtype='float32')
+        np_data = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.1, 0.7],
+        ],
+                           dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
         tensor.set_lod(self.lod)
 
@@ -156,13 +155,13 @@ def _create_ids(self):
         tensor.set_lod(self.lod)
 
     def _create_scores(self):
-        np_data = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.6, 0.7],
-            ], dtype='float32')
+        np_data = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.6, 0.7],
+        ],
+                           dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
         tensor.set_lod(self.lod)
 
@@ -192,13 +191,13 @@ def _create_ids(self):
         tensor.set_lod(self.lod)
 
     def _create_scores(self):
-        np_data = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.6, 0.7],
-            ], dtype='float32')
+        np_data = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.6, 0.7],
+        ],
+                           dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
         tensor.set_lod(self.lod)
 
@@ -228,13 +227,13 @@ def _create_ids(self):
         tensor.set_lod(self.lod)
 
     def _create_scores(self):
-        np_data = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.1, 0.7],
-            ], dtype='float32')
+        np_data = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.1, 0.7],
+        ],
+                           dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
         tensor.set_lod(self.lod)
 
@@ -242,8 +241,8 @@ def set_outputs(self):
         self.beam_size = 2
         self.is_accumulated = False
         self.output_ids = np.array([7, 3, 3, 1])[:, np.newaxis]
-        self.output_scores = np.array(
-            [1.50685, 0.996027, 0.194639, 0.043325])[:, np.newaxis]
+        self.output_scores = np.array([1.50685, 0.996027, 0.194639,
+                                       0.043325])[:, np.newaxis]
         self.output_lod = [[0, 2, 4], [0, 0, 2, 3, 4]]
         self.output_parent_idx = np.array([1, 1, 2, 3])
 
@@ -265,13 +264,13 @@ def _create_ids(self):
         tensor.set_lod(self.lod)
 
     def _create_scores(self):
-        np_data = np.array(
-            [
-                [0.6, 0.9],
-                [0.5, 0.3],
-                [0.9, 0.5],
-                [0.1, 0.7],
-            ], dtype='float32')
+        np_data = np.array([
+            [0.6, 0.9],
+            [0.5, 0.3],
+            [0.9, 0.5],
+            [0.1, 0.7],
+        ],
+                           dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
         tensor.set_lod(self.lod)
 
@@ -285,30 +284,33 @@ def set_outputs(self):
 
 
 class TestBeamSearchOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            pre_ids = fluid.data(
-                name='pre_id', shape=[1], lod_level=2, dtype='int64')
-            pre_scores = fluid.data(
-                name='pre_scores', shape=[1], lod_level=2, dtype='float32')
+            pre_ids = fluid.data(name='pre_id',
+                                 shape=[1],
+                                 lod_level=2,
+                                 dtype='int64')
+            pre_scores = fluid.data(name='pre_scores',
+                                    shape=[1],
+                                    lod_level=2,
+                                    dtype='float32')
             probs = fluid.data(name='probs', shape=[10000], dtype='float32')
             topk_scores, topk_indices = fluid.layers.topk(probs, k=4)
             accu_scores = fluid.layers.elementwise_add(
                 x=fluid.layers.log(x=topk_scores),
-                y=fluid.layers.reshape(
-                    pre_scores, shape=[-1]),
+                y=fluid.layers.reshape(pre_scores, shape=[-1]),
                 axis=0)
 
             def test_preids_Variable():
                 # the input pre_ids must be Variable
                 preids_data = np.random.randint(1, 5, [5, 1]).astype("int64")
-                fluid.layers.beam_search(
-                    pre_ids=preids_data,
-                    pre_scores=pre_scores,
-                    ids=topk_indices,
-                    scores=accu_scores,
-                    beam_size=4,
-                    end_id=1)
+                fluid.layers.beam_search(pre_ids=preids_data,
+                                         pre_scores=pre_scores,
+                                         ids=topk_indices,
+                                         scores=accu_scores,
+                                         beam_size=4,
+                                         end_id=1)
 
             self.assertRaises(TypeError, test_preids_Variable)
 
@@ -316,73 +318,66 @@ def test_prescores_Variable():
                 # the input pre_scores must be Variable
                 prescores_data = np.random.uniform(1, 5,
                                                    [5, 1]).astype("float32")
-                fluid.layers.beam_search(
-                    pre_ids=pre_ids,
-                    pre_scores=prescores_data,
-                    ids=topk_indices,
-                    scores=accu_scores,
-                    beam_size=4,
-                    end_id=1)
+                fluid.layers.beam_search(pre_ids=pre_ids,
+                                         pre_scores=prescores_data,
+                                         ids=topk_indices,
+                                         scores=accu_scores,
+                                         beam_size=4,
+                                         end_id=1)
 
             self.assertRaises(TypeError, test_prescores_Variable)
 
             def test_ids_Variable():
                 # the input ids must be Variable or None
                 ids_data = np.random.randint(1, 5, [5, 1]).astype("int64")
-                fluid.layers.beam_search(
-                    pre_ids=pre_ids,
-                    pre_scores=pre_scores,
-                    ids=ids_data,
-                    scores=accu_scores,
-                    beam_size=4,
-                    end_id=1)
+                fluid.layers.beam_search(pre_ids=pre_ids,
+                                         pre_scores=pre_scores,
+                                         ids=ids_data,
+                                         scores=accu_scores,
+                                         beam_size=4,
+                                         end_id=1)
 
             self.assertRaises(TypeError, test_ids_Variable)
 
             def test_scores_Variable():
                 # the input scores must be Variable
                 scores_data = np.random.uniform(1, 5, [5, 1]).astype("float32")
-                fluid.layers.beam_search(
-                    pre_ids=pre_ids,
-                    pre_scores=pre_scores,
-                    ids=topk_indices,
-                    scores=scores_data,
-                    beam_size=4,
-                    end_id=1)
+                fluid.layers.beam_search(pre_ids=pre_ids,
+                                         pre_scores=pre_scores,
+                                         ids=topk_indices,
+                                         scores=scores_data,
+                                         beam_size=4,
+                                         end_id=1)
 
             self.assertRaises(TypeError, test_scores_Variable)
 
             def test_preids_dtype():
                 # the dtype of input pre_ids must be int64
-                preids_type_data = fluid.data(
-                    name='preids_type_data',
-                    shape=[1],
-                    lod_level=2,
-                    dtype='float32')
-                fluid.layers.beam_search(
-                    pre_ids=preids_type_data,
-                    pre_scores=pre_scores,
-                    ids=topk_indices,
-                    scores=accu_scores,
-                    beam_size=4,
-                    end_id=1)
+                preids_type_data = fluid.data(name='preids_type_data',
+                                              shape=[1],
+                                              lod_level=2,
+                                              dtype='float32')
+                fluid.layers.beam_search(pre_ids=preids_type_data,
+                                         pre_scores=pre_scores,
+                                         ids=topk_indices,
+                                         scores=accu_scores,
+                                         beam_size=4,
+                                         end_id=1)
 
             self.assertRaises(TypeError, test_preids_dtype)
 
             def test_prescores_dtype():
                 # the dtype of input pre_scores must be float32
-                prescores_type_data = fluid.data(
-                    name='prescores_type_data',
-                    shape=[1],
-                    lod_level=2,
-                    dtype='int64')
-                fluid.layers.beam_search(
-                    pre_ids=pre_ids,
-                    pre_scores=prescores_type_data,
-                    ids=topk_indices,
-                    scores=accu_scores,
-                    beam_size=4,
-                    end_id=1)
+                prescores_type_data = fluid.data(name='prescores_type_data',
+                                                 shape=[1],
+                                                 lod_level=2,
+                                                 dtype='int64')
+                fluid.layers.beam_search(pre_ids=pre_ids,
+                                         pre_scores=prescores_type_data,
+                                         ids=topk_indices,
+                                         scores=accu_scores,
+                                         beam_size=4,
+                                         end_id=1)
 
             self.assertRaises(TypeError, test_prescores_dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
index fc4ee13384b2d..4982ed451cd8c 100644
--- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py
@@ -30,6 +30,7 @@ def output_hist(out):
 
 
 class TestBernoulliOp(OpTest):
+
     def setUp(self):
         self.op_type = "bernoulli"
         self.inputs = {"X": np.random.uniform(size=(1000, 784))}
@@ -41,21 +42,20 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestBernoulliApi(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
         x = paddle.rand([1024, 1024])
         out = paddle.bernoulli(x)
         paddle.enable_static()
         hist, prob = output_hist(out.numpy())
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
     def test_static(self):
         x = paddle.rand([1024, 1024])
@@ -64,12 +64,12 @@ def test_static(self):
         out = exe.run(paddle.static.default_main_program(),
                       fetch_list=[out.name])
         hist, prob = output_hist(out[0])
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestRandomValue(unittest.TestCase):
+
     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_bfgs.py b/python/paddle/fluid/tests/unittests/test_bfgs.py
index 8a9f9f72aa068..08ec4a2380674 100644
--- a/python/paddle/fluid/tests/unittests/test_bfgs.py
+++ b/python/paddle/fluid/tests/unittests/test_bfgs.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -45,10 +45,13 @@ def test_static_graph_H0(func, x0, H0, dtype='float32'):
     startup = paddle.static.Program()
     with paddle.static.program_guard(main, startup):
         X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype)
-        H = paddle.static.data(
-            name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype)
-        Y = minimize_bfgs(
-            func, X, initial_inverse_hessian_estimate=H, dtype=dtype)
+        H = paddle.static.data(name='h',
+                               shape=[H0.shape[0], H0.shape[1]],
+                               dtype=dtype)
+        Y = minimize_bfgs(func,
+                          X,
+                          initial_inverse_hessian_estimate=H,
+                          dtype=dtype)
 
     exe = paddle.static.Executor()
     exe.run(startup)
@@ -64,15 +67,15 @@ def test_dynamic_graph(func,
     x0 = paddle.to_tensor(x0)
     if H0 is not None:
         H0 = paddle.to_tensor(H0)
-    return minimize_bfgs(
-        func,
-        x0,
-        initial_inverse_hessian_estimate=H0,
-        line_search_fn=line_search_fn,
-        dtype=dtype)
+    return minimize_bfgs(func,
+                         x0,
+                         initial_inverse_hessian_estimate=H0,
+                         line_search_fn=line_search_fn,
+                         dtype=dtype)
 
 
 class TestBfgs(unittest.TestCase):
+
     def test_quadratic_nd(self):
         for dimension in [1, 10]:
             minimum = np.random.random(size=[dimension]).astype('float32')
@@ -106,10 +109,11 @@ def func(x):
         self.assertFalse(results[0][0])
 
     def test_multi_minima(self):
+
         def func(x):
             # df = 12(x + 1.1)(x - 0.2)(x - 0.8)
             # f = 3*x^4+0.4*x^3-5.46*x^2+2.112*x
-            # minimum = -1.1 or 0.8. 
+            # minimum = -1.1 or 0.8.
             # All these minima may be reached from appropriate starting points.
             return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x
 
@@ -143,6 +147,7 @@ def test_rosenbrock(self):
         self.func_rosenbrock()
 
     def test_exception(self):
+
         def func(x):
             return paddle.dot(x, x)
 
@@ -159,12 +164,11 @@ def func(x):
         self.assertRaises(ValueError, test_dynamic_graph, func, x0, H0=H1)
 
         # test line_search_fn is bad
-        self.assertRaises(
-            NotImplementedError,
-            test_static_graph,
-            func,
-            x0,
-            line_search_fn='other')
+        self.assertRaises(NotImplementedError,
+                          test_static_graph,
+                          func,
+                          x0,
+                          line_search_fn='other')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py
index 8d7dd0d81180e..281d6811c6219 100644
--- a/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py
@@ -110,17 +110,20 @@ def bicubic_interp_np(input,
                         coefficients[ii] = cubic_interp1d(
                             input[i, j, access_y, access_x_0],
                             input[i, j, access_y, access_x_1],
-                            input[i, j, access_y, access_x_2],
-                            input[i, j, access_y, access_x_3], x_t)
-                    out[i, j, k, l] = cubic_interp1d(
-                        coefficients[0], coefficients[1], coefficients[2],
-                        coefficients[3], y_t)
+                            input[i, j, access_y,
+                                  access_x_2], input[i, j, access_y,
+                                                     access_x_3], x_t)
+                    out[i, j, k,
+                        l] = cubic_interp1d(coefficients[0], coefficients[1],
+                                            coefficients[2], coefficients[3],
+                                            y_t)
     if data_layout == "NHWC":
         out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
     return out.astype(input.dtype)
 
 
 class TestBicubicInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -171,8 +174,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'bicubic'
@@ -185,6 +190,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase1(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [4, 1, 7, 8]
@@ -195,6 +201,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase2(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [3, 3, 9, 6]
@@ -205,6 +212,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase3(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [1, 1, 32, 64]
@@ -215,6 +223,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase4(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [4, 1, 7, 8]
@@ -226,6 +235,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase5(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [3, 3, 9, 6]
@@ -237,6 +247,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase6(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [1, 1, 32, 64]
@@ -248,6 +259,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpSame(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [2, 3, 32, 64]
@@ -258,6 +270,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpDataLayout(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [2, 5, 5, 3]
@@ -270,6 +283,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpOpAPI(unittest.TestCase):
+
     def test_case(self):
         np.random.seed(200)
         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
@@ -280,34 +294,44 @@ def test_case(self):
 
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
 
         with fluid.program_guard(prog, startup_prog):
 
             x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
 
             dim = fluid.data(name="dim", shape=[1], dtype="int32")
-            shape_tensor = fluid.data(
-                name="shape_tensor", shape=[2], dtype="int32")
-            actual_size = fluid.data(
-                name="actual_size", shape=[2], dtype="int32")
-            scale_tensor = fluid.data(
-                name="scale_tensor", shape=[1], dtype="float32")
-
-            out1 = interpolate(
-                x, size=[12, 12], mode='bicubic', align_corners=False)
-            out2 = interpolate(
-                x, size=[12, dim], mode='bicubic', align_corners=False)
-            out3 = interpolate(
-                x, size=shape_tensor, mode='bicubic', align_corners=False)
-            out4 = interpolate(
-                x, size=[12, 12], mode='bicubic', align_corners=False)
-            out5 = interpolate(
-                x,
-                scale_factor=scale_tensor,
-                mode='bicubic',
-                align_corners=False)
+            shape_tensor = fluid.data(name="shape_tensor",
+                                      shape=[2],
+                                      dtype="int32")
+            actual_size = fluid.data(name="actual_size",
+                                     shape=[2],
+                                     dtype="int32")
+            scale_tensor = fluid.data(name="scale_tensor",
+                                      shape=[1],
+                                      dtype="float32")
+
+            out1 = interpolate(x,
+                               size=[12, 12],
+                               mode='bicubic',
+                               align_corners=False)
+            out2 = interpolate(x,
+                               size=[12, dim],
+                               mode='bicubic',
+                               align_corners=False)
+            out3 = interpolate(x,
+                               size=shape_tensor,
+                               mode='bicubic',
+                               align_corners=False)
+            out4 = interpolate(x,
+                               size=[12, 12],
+                               mode='bicubic',
+                               align_corners=False)
+            out5 = interpolate(x,
+                               scale_factor=scale_tensor,
+                               mode='bicubic',
+                               align_corners=False)
 
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -322,40 +346,51 @@ def test_case(self):
                               fetch_list=[out1, out2, out3, out4, out5],
                               return_numpy=True)
 
-            expect_res = bicubic_interp_np(
-                x_data, out_h=12, out_w=12, align_corners=False)
+            expect_res = bicubic_interp_np(x_data,
+                                           out_h=12,
+                                           out_w=12,
+                                           align_corners=False)
             for res in results:
                 self.assertTrue(np.allclose(res, expect_res))
 
         with fluid.dygraph.guard():
             x = fluid.dygraph.to_variable(x_data)
-            interp = interpolate(
-                x, size=[12, 12], mode='bicubic', align_corners=False)
+            interp = interpolate(x,
+                                 size=[12, 12],
+                                 mode='bicubic',
+                                 align_corners=False)
             dy_result = interp.numpy()
-            expect = bicubic_interp_np(
-                x_data, out_h=12, out_w=12, align_corners=False)
+            expect = bicubic_interp_np(x_data,
+                                       out_h=12,
+                                       out_w=12,
+                                       align_corners=False)
             self.assertTrue(np.allclose(dy_result, expect))
 
 
 class TestBicubicOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of interpoalte must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, interpolate, x1)
 
             def test_mode_type():
                 # mode must be "BILINEAR" "TRILINEAR" "NEAREST" "BICUBIC"
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
 
-                out = interpolate(
-                    x, size=[12, 12], mode='UNKONWN', align_corners=False)
+                out = interpolate(x,
+                                  size=[12, 12],
+                                  mode='UNKONWN',
+                                  align_corners=False)
 
             def test_input_shape():
                 x = fluid.data(name="x", shape=[2], dtype="float32")
-                out = interpolate(
-                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+                out = interpolate(x,
+                                  size=[12, 12],
+                                  mode='BICUBIC',
+                                  align_corners=False)
 
             def test_align_corcers():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
@@ -363,76 +398,77 @@ def test_align_corcers():
 
             def test_out_shape():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x, size=[12], mode='bicubic', align_corners=False)
+                out = interpolate(x,
+                                  size=[12],
+                                  mode='bicubic',
+                                  align_corners=False)
 
             def test_attr_data_format():
                 # for 5-D input, data_format only can be NCDHW or NDHWC
-                input = fluid.data(
-                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
-                out = interpolate(
-                    input,
-                    size=[4, 8, 4, 5],
-                    mode='trilinear',
-                    data_format='NHWC')
+                input = fluid.data(name="input",
+                                   shape=[2, 3, 6, 9, 4],
+                                   dtype="float32")
+                out = interpolate(input,
+                                  size=[4, 8, 4, 5],
+                                  mode='trilinear',
+                                  data_format='NHWC')
 
             def test_actual_shape():
                 # the actual_shape  must be Variable.
-                x = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-                out = interpolate(
-                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+                x = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                            [[1, 1, 1, 1]], fluid.CPUPlace())
+                out = interpolate(x,
+                                  size=[12, 12],
+                                  mode='BICUBIC',
+                                  align_corners=False)
 
             def test_scale_value():
                 # the scale must be greater than zero.
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='BICUBIC',
-                    align_corners=False,
-                    scale_factor=-2.0)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='BICUBIC',
+                                  align_corners=False,
+                                  scale_factor=-2.0)
 
             def test_attr_5D_input():
                 # for 5-D input, data_format only can be NCDHW or NDHWC
-                input = fluid.data(
-                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
-                out = interpolate(
-                    input,
-                    size=[4, 8, 4, 5],
-                    mode='trilinear',
-                    data_format='NDHWC')
+                input = fluid.data(name="input",
+                                   shape=[2, 3, 6, 9, 4],
+                                   dtype="float32")
+                out = interpolate(input,
+                                  size=[4, 8, 4, 5],
+                                  mode='trilinear',
+                                  data_format='NDHWC')
 
             def test_scale_type():
                 # the scale must be greater than zero.
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                scale = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='bicubic',
-                    align_corners=False,
-                    scale_factor=scale)
+                scale = fluid.create_lod_tensor(np.array([-1, 3, 5,
+                                                          5]), [[1, 1, 1, 1]],
+                                                fluid.CPUPlace())
+                out = interpolate(x,
+                                  size=None,
+                                  mode='bicubic',
+                                  align_corners=False,
+                                  scale_factor=scale)
 
             def test_align_mode():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='nearest',
-                    align_corners=False,
-                    align_mode=2,
-                    scale_factor=1.0)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='nearest',
+                                  align_corners=False,
+                                  align_mode=2,
+                                  scale_factor=1.0)
 
             def test_outshape_and_scale():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='bicubic',
-                    align_corners=False,
-                    scale_factor=None)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='bicubic',
+                                  align_corners=False,
+                                  scale_factor=None)
 
             self.assertRaises(ValueError, test_mode_type)
             self.assertRaises(ValueError, test_input_shape)
diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py
index d5c3aee2f4372..30a175d69d0ae 100644
--- a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py
@@ -118,17 +118,20 @@ def bicubic_interp_np(input,
                         coefficients[ii] = cubic_interp1d(
                             input[i, j, access_y, access_x_0],
                             input[i, j, access_y, access_x_1],
-                            input[i, j, access_y, access_x_2],
-                            input[i, j, access_y, access_x_3], x_t)
-                    out[i, j, k, l] = cubic_interp1d(
-                        coefficients[0], coefficients[1], coefficients[2],
-                        coefficients[3], y_t)
+                            input[i, j, access_y,
+                                  access_x_2], input[i, j, access_y,
+                                                     access_x_3], x_t)
+                    out[i, j, k,
+                        l] = cubic_interp1d(coefficients[0], coefficients[1],
+                                            coefficients[2], coefficients[3],
+                                            y_t)
     if data_layout == "NHWC":
         out = np.transpose(out, (0, 2, 3, 1))  # NCHW => NHWC
     return out.astype(input.dtype)
 
 
 class TestBicubicInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -195,8 +198,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'bicubic'
@@ -209,6 +214,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase1(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [4, 1, 7, 8]
@@ -219,6 +225,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase2(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [3, 3, 9, 6]
@@ -229,6 +236,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase3(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [1, 1, 32, 64]
@@ -239,6 +247,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase4(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [4, 1, 7, 8]
@@ -250,6 +259,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase5(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [3, 3, 9, 6]
@@ -261,6 +271,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpCase6(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [1, 1, 32, 64]
@@ -272,6 +283,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpSame(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [2, 3, 32, 64]
@@ -282,6 +294,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpScale(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [2, 3, 32, 64]
@@ -292,6 +305,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpDataLayout(TestBicubicInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bicubic'
         self.input_shape = [2, 5, 5, 3]
@@ -304,6 +318,7 @@ def init_test_case(self):
 
 
 class TestBicubicInterpOpAPI(unittest.TestCase):
+
     def test_case(self):
         np.random.seed(200)
         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
@@ -314,38 +329,52 @@ def test_case(self):
 
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
 
         with fluid.program_guard(prog, startup_prog):
 
             x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
 
             dim = fluid.data(name="dim", shape=[1], dtype="int32")
-            shape_tensor = fluid.data(
-                name="shape_tensor", shape=[2], dtype="int32")
-            actual_size = fluid.data(
-                name="actual_size", shape=[2], dtype="int32")
-            scale_tensor = fluid.data(
-                name="scale_tensor", shape=[1], dtype="float32")
-
-            out1 = interpolate(
-                x, size=[12, 12], mode='bicubic', align_corners=False)
-            out2 = interpolate(
-                x, size=[12, dim], mode='bicubic', align_corners=False)
-            out3 = interpolate(
-                x, size=shape_tensor, mode='bicubic', align_corners=False)
-            out4 = interpolate(
-                x, size=[12, 12], mode='bicubic', align_corners=False)
-            out5 = interpolate(
-                x,
-                scale_factor=scale_tensor,
-                mode='bicubic',
-                align_corners=False)
-            out6 = interpolate(
-                x, scale_factor=2.0, mode='bicubic', align_corners=False)
-            out7 = interpolate(
-                x, scale_factor=[2.0, 2.0], mode='bicubic', align_corners=False)
+            shape_tensor = fluid.data(name="shape_tensor",
+                                      shape=[2],
+                                      dtype="int32")
+            actual_size = fluid.data(name="actual_size",
+                                     shape=[2],
+                                     dtype="int32")
+            scale_tensor = fluid.data(name="scale_tensor",
+                                      shape=[1],
+                                      dtype="float32")
+
+            out1 = interpolate(x,
+                               size=[12, 12],
+                               mode='bicubic',
+                               align_corners=False)
+            out2 = interpolate(x,
+                               size=[12, dim],
+                               mode='bicubic',
+                               align_corners=False)
+            out3 = interpolate(x,
+                               size=shape_tensor,
+                               mode='bicubic',
+                               align_corners=False)
+            out4 = interpolate(x,
+                               size=[12, 12],
+                               mode='bicubic',
+                               align_corners=False)
+            out5 = interpolate(x,
+                               scale_factor=scale_tensor,
+                               mode='bicubic',
+                               align_corners=False)
+            out6 = interpolate(x,
+                               scale_factor=2.0,
+                               mode='bicubic',
+                               align_corners=False)
+            out7 = interpolate(x,
+                               scale_factor=[2.0, 2.0],
+                               mode='bicubic',
+                               align_corners=False)
 
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -361,40 +390,51 @@ def test_case(self):
                 fetch_list=[out1, out2, out3, out4, out5, out6, out7],
                 return_numpy=True)
 
-            expect_res = bicubic_interp_np(
-                x_data, out_h=12, out_w=12, align_corners=False)
+            expect_res = bicubic_interp_np(x_data,
+                                           out_h=12,
+                                           out_w=12,
+                                           align_corners=False)
             for res in results:
                 self.assertTrue(np.allclose(res, expect_res))
 
         with fluid.dygraph.guard():
             x = fluid.dygraph.to_variable(x_data)
-            interp = interpolate(
-                x, size=[12, 12], mode='bicubic', align_corners=False)
+            interp = interpolate(x,
+                                 size=[12, 12],
+                                 mode='bicubic',
+                                 align_corners=False)
             dy_result = interp.numpy()
-            expect = bicubic_interp_np(
-                x_data, out_h=12, out_w=12, align_corners=False)
+            expect = bicubic_interp_np(x_data,
+                                       out_h=12,
+                                       out_w=12,
+                                       align_corners=False)
             self.assertTrue(np.allclose(dy_result, expect))
 
 
 class TestBicubicOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of interpoalte must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, interpolate, x1)
 
             def test_mode_type():
                 # mode must be "BILINEAR" "TRILINEAR" "NEAREST" "BICUBIC"
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
 
-                out = interpolate(
-                    x, size=[12, 12], mode='UNKONWN', align_corners=False)
+                out = interpolate(x,
+                                  size=[12, 12],
+                                  mode='UNKONWN',
+                                  align_corners=False)
 
             def test_input_shape():
                 x = fluid.data(name="x", shape=[2], dtype="float32")
-                out = interpolate(
-                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+                out = interpolate(x,
+                                  size=[12, 12],
+                                  mode='BICUBIC',
+                                  align_corners=False)
 
             def test_align_corcers():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
@@ -402,132 +442,133 @@ def test_align_corcers():
 
             def test_out_shape():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x, size=[12], mode='bicubic', align_corners=False)
+                out = interpolate(x,
+                                  size=[12],
+                                  mode='bicubic',
+                                  align_corners=False)
 
             def test_attr_data_format():
                 # for 5-D input, data_format only can be NCDHW or NDHWC
-                input = fluid.data(
-                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
-                out = interpolate(
-                    input,
-                    size=[4, 8, 4, 5],
-                    mode='trilinear',
-                    data_format='NHWC')
+                input = fluid.data(name="input",
+                                   shape=[2, 3, 6, 9, 4],
+                                   dtype="float32")
+                out = interpolate(input,
+                                  size=[4, 8, 4, 5],
+                                  mode='trilinear',
+                                  data_format='NHWC')
 
             def test_actual_shape():
                 # the actual_shape  must be Variable.
-                x = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-                out = interpolate(
-                    x, size=[12, 12], mode='BICUBIC', align_corners=False)
+                x = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                            [[1, 1, 1, 1]], fluid.CPUPlace())
+                out = interpolate(x,
+                                  size=[12, 12],
+                                  mode='BICUBIC',
+                                  align_corners=False)
 
             def test_scale_value():
                 # the scale must be greater than zero.
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='BICUBIC',
-                    align_corners=False,
-                    scale_factor=-2.0)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='BICUBIC',
+                                  align_corners=False,
+                                  scale_factor=-2.0)
 
             def test_attr_5D_input():
                 # for 5-D input, data_format only can be NCDHW or NDHWC
-                input = fluid.data(
-                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
-                out = interpolate(
-                    input,
-                    size=[4, 8, 4, 5],
-                    mode='trilinear',
-                    data_format='NDHWC')
+                input = fluid.data(name="input",
+                                   shape=[2, 3, 6, 9, 4],
+                                   dtype="float32")
+                out = interpolate(input,
+                                  size=[4, 8, 4, 5],
+                                  mode='trilinear',
+                                  data_format='NDHWC')
 
             def test_scale_type():
                 # the scale must be greater than zero.
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                scale = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='bicubic',
-                    align_corners=False,
-                    scale_factor=scale)
+                scale = fluid.create_lod_tensor(np.array([-1, 3, 5,
+                                                          5]), [[1, 1, 1, 1]],
+                                                fluid.CPUPlace())
+                out = interpolate(x,
+                                  size=None,
+                                  mode='bicubic',
+                                  align_corners=False,
+                                  scale_factor=scale)
 
             def test_align_mode():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='nearest',
-                    align_corners=False,
-                    align_mode=2,
-                    scale_factor=1.0)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='nearest',
+                                  align_corners=False,
+                                  align_mode=2,
+                                  scale_factor=1.0)
 
             def test_outshape_and_scale():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='bicubic',
-                    align_corners=False,
-                    scale_factor=None)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='bicubic',
+                                  align_corners=False,
+                                  scale_factor=None)
 
             def test_align_corners_and_nearest():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='nearest',
-                    align_corners=True,
-                    scale_factor=None)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='nearest',
+                                  align_corners=True,
+                                  scale_factor=None)
 
             def test_scale_shape():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='nearest',
-                    align_corners=False,
-                    scale_factor=[1, 2, 2])
+                out = interpolate(x,
+                                  size=None,
+                                  mode='nearest',
+                                  align_corners=False,
+                                  scale_factor=[1, 2, 2])
 
             def test_scale_value_1():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='bicubic',
-                    align_corners=False,
-                    scale_factor=[1, 2, 2])
+                out = interpolate(x,
+                                  size=None,
+                                  mode='bicubic',
+                                  align_corners=False,
+                                  scale_factor=[1, 2, 2])
 
             def test_size_and_scale():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=None,
-                    mode='bicubic',
-                    align_corners=False,
-                    scale_factor=None)
+                out = interpolate(x,
+                                  size=None,
+                                  mode='bicubic',
+                                  align_corners=False,
+                                  scale_factor=None)
 
             def test_size_and_scale2():
-                x = fluid.data(
-                    name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
-                out = interpolate(
-                    x,
-                    size=[2, 2, 2],
-                    mode='trilinear',
-                    align_corners=False,
-                    scale_factor=2.0)
+                x = fluid.data(name="input",
+                               shape=[2, 3, 6, 9, 4],
+                               dtype="float32")
+                out = interpolate(x,
+                                  size=[2, 2, 2],
+                                  mode='trilinear',
+                                  align_corners=False,
+                                  scale_factor=2.0)
 
             def test_size_type():
                 x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
-                out = interpolate(
-                    x, size={2, 2}, mode='bicubic', align_corners=False)
+                out = interpolate(x,
+                                  size={2, 2},
+                                  mode='bicubic',
+                                  align_corners=False)
 
             def test_input_shape_1():
                 x = fluid.data(name="x", shape=[2, 1, 0, 0], dtype="float32")
-                out = interpolate(
-                    x, size=[3, 3], mode="bicubic", align_corners=False)
+                out = interpolate(x,
+                                  size=[3, 3],
+                                  mode="bicubic",
+                                  align_corners=False)
 
             self.assertRaises(ValueError, test_mode_type)
             self.assertRaises(ValueError, test_input_shape)
diff --git a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
index c0d622d7ea187..976e7df60b80c 100644
--- a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
@@ -20,6 +20,7 @@
 
 
 class Gsz:
+
     def __init__(self, h, w, gd, gh, gw, input_chans):
         self.h = h
         self.w = w
@@ -98,7 +99,10 @@ def naive_bilateral_slice_forward(output, grid, guide, input, gsz, has_offset,
                         wz = weight_z(zz + 0.5 - gz)
                         c_ = coeff_stride * out_c + in_c
 
-                        coeff_sample += grid[int(b), int(c_), int(z_), int(y_),
+                        coeff_sample += grid[int(b),
+                                             int(c_),
+                                             int(z_),
+                                             int(y_),
                                              int(x_)] * wx * wy * wz
 
             if in_c < input_chans:
@@ -136,6 +140,7 @@ def naive_bilateral_slice(x, guide, grid, has_offset):
 @unittest.skipIf(not paddle.fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestBilateralSliceOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = 'bilateral_slice'
@@ -153,7 +158,9 @@ def setUp(self):
         output_np = naive_bilateral_slice(x, guide, grid, self.has_offset)
 
         self.inputs = {'X': x, 'Grid': grid, 'Guide': guide}
-        self.attrs = {'has_offset': self.has_offset, }
+        self.attrs = {
+            'has_offset': self.has_offset,
+        }
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
@@ -173,19 +180,24 @@ def initTestCase(self):
 @unittest.skipIf(not paddle.fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestBilateralSliceOp1(TestBilateralSliceOp):
+
     def initTestCase(self):
         self.has_offset = True
         self.data_type = 'float32'
 
 
 class TestBilateralSliceApi(unittest.TestCase):
+
     def test_api(self):
-        x = paddle.fluid.data(
-            name='x', shape=[None, 3, 25, 15], dtype='float32')
-        guide = paddle.fluid.data(
-            name='guide', shape=[None, 25, 15], dtype='float32')
-        grid = paddle.fluid.data(
-            name='grid', shape=[None, None, 8, 5, 3], dtype='float32')
+        x = paddle.fluid.data(name='x',
+                              shape=[None, 3, 25, 15],
+                              dtype='float32')
+        guide = paddle.fluid.data(name='guide',
+                                  shape=[None, 25, 15],
+                                  dtype='float32')
+        grid = paddle.fluid.data(name='grid',
+                                 shape=[None, None, 8, 5, 3],
+                                 dtype='float32')
         paddle.fluid.contrib.layers.bilateral_slice(x, guide, grid, False)
 
         if not paddle.fluid.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_api.py b/python/paddle/fluid/tests/unittests/test_bilinear_api.py
index 24eae4797de85..01a5eb70522b3 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_api.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_api.py
@@ -24,6 +24,7 @@
 
 
 class TestBilinearAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.program_guard(fluid.default_startup_program(),
                                  fluid.default_main_program()):
@@ -39,24 +40,29 @@ def test_api(self):
             layer1 = np.random.random((5, 5)).astype('float32')
             layer2 = np.random.random((5, 4)).astype('float32')
 
-            bilinear = paddle.nn.Bilinear(
-                in1_features=5, in2_features=4, out_features=1000)
+            bilinear = paddle.nn.Bilinear(in1_features=5,
+                                          in2_features=4,
+                                          out_features=1000)
             ret = bilinear(data1, data2)
 
             exe.run(fluid.default_startup_program())
-            ret_fetch = exe.run(feed={'X1': layer1,
-                                      'X2': layer2},
+            ret_fetch = exe.run(feed={
+                'X1': layer1,
+                'X2': layer2
+            },
                                 fetch_list=[ret.name])
             self.assertEqual(ret_fetch[0].shape, (5, 1000))
 
 
 class TestBilinearAPIDygraph(unittest.TestCase):
+
     def test_api(self):
         paddle.disable_static()
         layer1 = np.random.random((5, 5)).astype('float32')
         layer2 = np.random.random((5, 4)).astype('float32')
-        bilinear = paddle.nn.Bilinear(
-            in1_features=5, in2_features=4, out_features=1000)
+        bilinear = paddle.nn.Bilinear(in1_features=5,
+                                      in2_features=4,
+                                      out_features=1000)
         ret = bilinear(paddle.to_tensor(layer1), paddle.to_tensor(layer2))
         self.assertEqual(ret.shape, [5, 1000])
 
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index 1817ef160c70a..fa80b8ac0f8e2 100755
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -20,6 +20,7 @@
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 import paddle
+
 paddle.enable_static()
 
 
@@ -96,6 +97,7 @@ def bilinear_interp_np(input,
 
 
 class TestBilinearInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -147,8 +149,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'bilinear'
@@ -162,6 +166,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase1(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -173,6 +178,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase2(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -184,6 +190,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase3(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [1, 1, 32, 64]
@@ -195,6 +202,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase4(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -207,6 +215,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase5(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -219,6 +228,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase6(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [1, 1, 32, 64]
@@ -231,6 +241,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpSame(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 32, 64]
@@ -242,6 +253,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpActualShape(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 2, 32, 16]
@@ -254,6 +266,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpDataLayout(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 5, 5, 3]
@@ -267,14 +280,15 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
         self.init_test_case()
         self.op_type = "bilinear_interp"
         self.check_eager = True
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
+        input_np = np.random.randint(low=0, high=256,
+                                     size=self.input_shape).astype("uint8")
 
         if self.scale > 0:
             out_h = int(self.input_shape[2] * self.scale)
@@ -302,8 +316,9 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=core.CPUPlace(), atol=1, check_eager=self.check_eager)
+        self.check_output_with_place(place=core.CPUPlace(),
+                                     atol=1,
+                                     check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'bilinear'
@@ -316,6 +331,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 32, 64]
@@ -327,6 +343,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -339,24 +356,28 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 1
 
 
 class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = True
         self.align_mode = 0
 
 
 class TestBilinearInterpScale1(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -368,6 +389,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpScale2(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -379,6 +401,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpScale3(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -390,6 +413,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpZero(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -401,6 +425,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -448,8 +473,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'bilinear'
@@ -463,6 +490,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -475,6 +503,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 2, 32, 16]
@@ -488,6 +517,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 2, 32, 16]
@@ -500,20 +530,23 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOpAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
 
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
 
         out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12])
         out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim])
         out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_bilinear(
-            x, out_shape=[4, 4], actual_shape=actual_size)
+        out4 = fluid.layers.resize_bilinear(x,
+                                            out_shape=[4, 4],
+                                            actual_shape=actual_size)
         out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor)
 
         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
@@ -539,8 +572,10 @@ def test_case(self):
                           fetch_list=[out1, out2, out3, out4, out5],
                           return_numpy=True)
 
-        expect_res = bilinear_interp_np(
-            x_data, out_h=12, out_w=12, align_corners=True)
+        expect_res = bilinear_interp_np(x_data,
+                                        out_h=12,
+                                        out_w=12,
+                                        align_corners=True)
         for res in results:
             self.assertTrue(np.allclose(res, expect_res))
 
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py
index 2ff32b2f95bb4..788bd0fc4119d 100755
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py
@@ -104,6 +104,7 @@ def bilinear_interp_np(input,
 
 
 class TestBilinearInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -135,9 +136,10 @@ def setUp(self):
             out_h = self.out_h
             out_w = self.out_w
 
-        output_np = bilinear_interp_np(
-            input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape,
-            self.align_corners, self.align_mode, self.data_layout)
+        output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0,
+                                       self.out_size, self.actual_shape,
+                                       self.align_corners, self.align_mode,
+                                       self.data_layout)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -179,6 +181,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase1(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -190,6 +193,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase2(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -201,6 +205,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase3(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [1, 1, 32, 64]
@@ -212,6 +217,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase4(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -224,6 +230,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase5(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -236,6 +243,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase6(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [1, 1, 32, 64]
@@ -248,6 +256,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase7(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [1, 1, 32, 64]
@@ -259,6 +268,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpSame(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 32, 64]
@@ -270,6 +280,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpActualShape(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 2, 32, 16]
@@ -282,6 +293,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpDataLayout(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 5, 5, 3]
@@ -295,13 +307,14 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
         self.init_test_case()
         self.op_type = "bilinear_interp_v2"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
+        input_np = np.random.randint(low=0, high=256,
+                                     size=self.input_shape).astype("uint8")
 
         if self.scale:
             if isinstance(self.scale, float) or isinstance(self.scale, int):
@@ -355,6 +368,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 32, 64]
@@ -366,6 +380,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [4, 1, 7, 8]
@@ -378,24 +393,28 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 1
 
 
 class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = True
         self.align_mode = 0
 
 
 class TestBilinearInterpScale1(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -407,6 +426,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpScale2(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -418,6 +438,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpScale3(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -429,6 +450,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpScale4(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -440,6 +462,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpZero(TestBilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [2, 3, 5, 7]
@@ -451,6 +474,7 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -524,6 +548,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 3, 9, 6]
@@ -536,6 +561,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 2, 32, 16]
@@ -549,6 +575,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'bilinear'
         self.input_shape = [3, 2, 32, 16]
@@ -561,20 +588,23 @@ def init_test_case(self):
 
 
 class TestBilinearInterpOpAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
 
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
 
         out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12])
         out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim])
         out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_bilinear(
-            x, out_shape=[4, 4], actual_shape=actual_size)
+        out4 = fluid.layers.resize_bilinear(x,
+                                            out_shape=[4, 4],
+                                            actual_shape=actual_size)
         out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor)
 
         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
@@ -600,13 +630,16 @@ def test_case(self):
                           fetch_list=[out1, out2, out3, out4, out5],
                           return_numpy=True)
 
-        expect_res = bilinear_interp_np(
-            x_data, out_h=12, out_w=12, align_corners=True)
+        expect_res = bilinear_interp_np(x_data,
+                                        out_h=12,
+                                        out_w=12,
+                                        align_corners=True)
         for res in results:
             self.assertTrue(np.allclose(res, expect_res))
 
 
 class TestBilinearInterpOpAPI_dy(unittest.TestCase):
+
     def test_case(self):
         import paddle
         if core.is_compiled_with_cuda():
@@ -616,14 +649,19 @@ def test_case(self):
         with fluid.dygraph.guard(place):
             input_data = np.random.random((2, 3, 6, 6)).astype("float32")
             input_x = paddle.to_tensor(input_data)
-            expect_res = bilinear_interp_np(
-                input_data, out_h=12, out_w=12, align_corners=False)
-            out = interpolate(
-                x=input_x, size=[12, 12], mode="bilinear", align_corners=False)
+            expect_res = bilinear_interp_np(input_data,
+                                            out_h=12,
+                                            out_w=12,
+                                            align_corners=False)
+            out = interpolate(x=input_x,
+                              size=[12, 12],
+                              mode="bilinear",
+                              align_corners=False)
             self.assertTrue(np.allclose(out.numpy(), expect_res))
 
 
 class TestBilinearInterpOpAPI_dy2(unittest.TestCase):
+
     def test_case(self):
         import paddle
         if core.is_compiled_with_cuda():
@@ -635,14 +673,19 @@ def test_case(self):
             size_np = np.array([12, 12]).astype("int64")
             input_x = paddle.to_tensor(input_data)
             size = paddle.to_tensor(size_np)
-            expect_res = bilinear_interp_np(
-                input_data, out_h=12, out_w=12, align_corners=False)
-            out = interpolate(
-                x=input_x, size=size, mode="bilinear", align_corners=False)
+            expect_res = bilinear_interp_np(input_data,
+                                            out_h=12,
+                                            out_w=12,
+                                            align_corners=False)
+            out = interpolate(x=input_x,
+                              size=size,
+                              mode="bilinear",
+                              align_corners=False)
             self.assertTrue(np.allclose(out.numpy(), expect_res))
 
 
 class TestBilinearInterpOpAPI_dy3(unittest.TestCase):
+
     def test_case(self):
         import paddle
         if core.is_compiled_with_cuda():
@@ -654,17 +697,19 @@ def test_case(self):
             size_1 = np.array([12]).astype("int64")
             input_x = paddle.to_tensor(input_data)
             size = paddle.to_tensor(size_1)
-            expect_res = bilinear_interp_np(
-                input_data, out_h=12, out_w=12, align_corners=False)
-            out = interpolate(
-                x=input_x,
-                size=[size, size],
-                mode="bilinear",
-                align_corners=False)
+            expect_res = bilinear_interp_np(input_data,
+                                            out_h=12,
+                                            out_w=12,
+                                            align_corners=False)
+            out = interpolate(x=input_x,
+                              size=[size, size],
+                              mode="bilinear",
+                              align_corners=False)
             self.assertTrue(np.allclose(out.numpy(), expect_res))
 
 
 class TestBilinearInterpOpAPI_dy4(unittest.TestCase):
+
     def test_case(self):
         import paddle
         if core.is_compiled_with_cuda():
@@ -676,13 +721,14 @@ def test_case(self):
             scale_np = np.array([2, 2]).astype("int64")
             input_x = paddle.to_tensor(input_data)
             scale = paddle.to_tensor(scale_np)
-            expect_res = bilinear_interp_np(
-                input_data, out_h=12, out_w=12, align_corners=False)
-            out = interpolate(
-                x=input_x,
-                scale_factor=scale,
-                mode="bilinear",
-                align_corners=False)
+            expect_res = bilinear_interp_np(input_data,
+                                            out_h=12,
+                                            out_w=12,
+                                            align_corners=False)
+            out = interpolate(x=input_x,
+                              scale_factor=scale,
+                              mode="bilinear",
+                              align_corners=False)
             self.assertTrue(np.allclose(out.numpy(), expect_res))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
index 60e9d0a26b380..5301924927087 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
@@ -21,13 +21,15 @@
 
 
 class TestDygraphBilinearTensorProductAPIError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            layer = fluid.dygraph.nn.BilinearTensorProduct(
-                input1_dim=5, input2_dim=4, output_dim=1000)
+            layer = fluid.dygraph.nn.BilinearTensorProduct(input1_dim=5,
+                                                           input2_dim=4,
+                                                           output_dim=1000)
             # the input must be Variable.
-            x0 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x0 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, layer, x0)
             # the input dtype must be float32 or float64
             x1 = fluid.data(name='x1', shape=[-1, 5], dtype="float16")
@@ -36,6 +38,7 @@ def test_errors(self):
 
 
 class TestBilinearTensorProductOp(OpTest):
+
     def setUp(self):
         self.op_type = "bilinear_tensor_product"
         batch_size = 6
diff --git a/python/paddle/fluid/tests/unittests/test_bincount_op.py b/python/paddle/fluid/tests/unittests/test_bincount_op.py
index 17b04b954afe8..2b99c92191150 100644
--- a/python/paddle/fluid/tests/unittests/test_bincount_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bincount_op.py
@@ -43,14 +43,15 @@ def test_static_graph(self):
             img = np.array([0, 1, 1, 3, 2, 1, 7]).astype(np.int64)
             w = np.array([0, 1, 1, 2, 2, 1, 0]).astype(np.int64)
             res = exe.run(train_program,
-                          feed={'input': img,
-                                'weights': w},
+                          feed={
+                              'input': img,
+                              'weights': w
+                          },
                           fetch_list=[output])
             actual = np.array(res[0])
             expected = np.bincount(img, weights=w)
-            self.assertTrue(
-                (actual == expected).all(),
-                msg='bincount output is wrong, out =' + str(actual))
+            self.assertTrue((actual == expected).all(),
+                            msg='bincount output is wrong, out =' + str(actual))
 
     def test_dygraph(self):
         with fluid.dygraph.guard():
@@ -153,12 +154,12 @@ def setUp(self):
 
     def init_test_case(self):
         self.minlength = 0
-        self.np_weights = np.random.randint(
-            low=0, high=20, size=10).astype(np.float32)
+        self.np_weights = np.random.randint(low=0, high=20,
+                                            size=10).astype(np.float32)
         self.np_input = np.random.randint(low=0, high=20, size=10)
-        self.Out = np.bincount(
-            self.np_input, weights=self.np_weights,
-            minlength=self.minlength).astype(np.float32)
+        self.Out = np.bincount(self.np_input,
+                               weights=self.np_weights,
+                               minlength=self.minlength).astype(np.float32)
 
 
 class TestCase2(TestBincountOp):
@@ -175,8 +176,9 @@ def init_test_case(self):
         self.minlength = 0
         self.np_weights = np.random.randint(low=0, high=20, size=10)
         self.np_input = np.random.randint(low=0, high=20, size=10)
-        self.Out = np.bincount(
-            self.np_input, weights=self.np_weights, minlength=self.minlength)
+        self.Out = np.bincount(self.np_input,
+                               weights=self.np_weights,
+                               minlength=self.minlength)
 
 
 class TestCase3(TestBincountOp):
@@ -191,8 +193,8 @@ class TestCase4(TestBincountOp):
     # with input(INT32)
     def init_test_case(self):
         self.minlength = 0
-        self.np_input = np.random.randint(
-            low=0, high=20, size=10).astype(np.int32)
+        self.np_input = np.random.randint(low=0, high=20,
+                                          size=10).astype(np.int32)
         self.Out = np.bincount(self.np_input, minlength=self.minlength)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index cc2b1165ec304..b99892c65e19f 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -84,6 +84,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
 
 
 class TestBipartiteMatchOpWithLoD(OpTest):
+
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[5, 6, 12]]
@@ -101,6 +102,7 @@ def test_check_output(self):
 
 
 class TestBipartiteMatchOpWithoutLoD(OpTest):
+
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[8]]
@@ -118,6 +120,7 @@ def test_check_output(self):
 
 
 class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest):
+
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[300]]
@@ -135,12 +138,13 @@ def test_check_output(self):
 
 
 class TestBipartiteMatchOpWithPerPredictionType(OpTest):
+
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[5, 6, 12]]
         dist = np.random.random((23, 237)).astype('float32')
-        match_indices, match_dist = batch_bipartite_match(dist, lod[0],
-                                                          'per_prediction', 0.5)
+        match_indices, match_dist = batch_bipartite_match(
+            dist, lod[0], 'per_prediction', 0.5)
 
         self.inputs = {'DistMat': (dist, lod)}
         self.outputs = {
@@ -157,6 +161,7 @@ def test_check_output(self):
 
 
 class TestBipartiteMatchOpWithEmptyLoD(OpTest):
+
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[5, 6, 0, 12]]
diff --git a/python/paddle/fluid/tests/unittests/test_bitwise_op.py b/python/paddle/fluid/tests/unittests/test_bitwise_op.py
index ead78d75c3dc4..c387555ccda64 100644
--- a/python/paddle/fluid/tests/unittests/test_bitwise_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bitwise_op.py
@@ -22,16 +22,21 @@
 
 ################## TEST OP: BitwiseAnd ##################
 class TestBitwiseAnd(OpTest):
+
     def setUp(self):
         self.op_type = "bitwise_and"
         self.init_dtype()
         self.init_shape()
         self.init_bound()
 
-        x = np.random.randint(
-            self.low, self.high, self.x_shape, dtype=self.dtype)
-        y = np.random.randint(
-            self.low, self.high, self.y_shape, dtype=self.dtype)
+        x = np.random.randint(self.low,
+                              self.high,
+                              self.x_shape,
+                              dtype=self.dtype)
+        y = np.random.randint(self.low,
+                              self.high,
+                              self.y_shape,
+                              dtype=self.dtype)
         out = np.bitwise_and(x, y)
 
         self.inputs = {'X': x, 'Y': y}
@@ -56,6 +61,7 @@ def init_bound(self):
 
 
 class TestBitwiseAndUInt8(TestBitwiseAnd):
+
     def init_dtype(self):
         self.dtype = np.uint8
 
@@ -65,6 +71,7 @@ def init_bound(self):
 
 
 class TestBitwiseAndInt8(TestBitwiseAnd):
+
     def init_dtype(self):
         self.dtype = np.int8
 
@@ -74,6 +81,7 @@ def init_shape(self):
 
 
 class TestBitwiseAndInt16(TestBitwiseAnd):
+
     def init_dtype(self):
         self.dtype = np.int16
 
@@ -83,6 +91,7 @@ def init_shape(self):
 
 
 class TestBitwiseAndInt64(TestBitwiseAnd):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -92,6 +101,7 @@ def init_shape(self):
 
 
 class TestBitwiseAndBool(TestBitwiseAnd):
+
     def setUp(self):
         self.op_type = "bitwise_and"
         self.init_shape()
@@ -106,16 +116,21 @@ def setUp(self):
 
 ################## TEST OP: BitwiseOr ##################
 class TestBitwiseOr(OpTest):
+
     def setUp(self):
         self.op_type = "bitwise_or"
         self.init_dtype()
         self.init_shape()
         self.init_bound()
 
-        x = np.random.randint(
-            self.low, self.high, self.x_shape, dtype=self.dtype)
-        y = np.random.randint(
-            self.low, self.high, self.y_shape, dtype=self.dtype)
+        x = np.random.randint(self.low,
+                              self.high,
+                              self.x_shape,
+                              dtype=self.dtype)
+        y = np.random.randint(self.low,
+                              self.high,
+                              self.y_shape,
+                              dtype=self.dtype)
         out = np.bitwise_or(x, y)
 
         self.inputs = {'X': x, 'Y': y}
@@ -140,6 +155,7 @@ def init_bound(self):
 
 
 class TestBitwiseOrUInt8(TestBitwiseOr):
+
     def init_dtype(self):
         self.dtype = np.uint8
 
@@ -149,6 +165,7 @@ def init_bound(self):
 
 
 class TestBitwiseOrInt8(TestBitwiseOr):
+
     def init_dtype(self):
         self.dtype = np.int8
 
@@ -158,6 +175,7 @@ def init_shape(self):
 
 
 class TestBitwiseOrInt16(TestBitwiseOr):
+
     def init_dtype(self):
         self.dtype = np.int16
 
@@ -167,6 +185,7 @@ def init_shape(self):
 
 
 class TestBitwiseOrInt64(TestBitwiseOr):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -176,6 +195,7 @@ def init_shape(self):
 
 
 class TestBitwiseOrBool(TestBitwiseOr):
+
     def setUp(self):
         self.op_type = "bitwise_or"
         self.init_shape()
@@ -190,16 +210,21 @@ def setUp(self):
 
 ################## TEST OP: BitwiseXor ##################
 class TestBitwiseXor(OpTest):
+
     def setUp(self):
         self.op_type = "bitwise_xor"
         self.init_dtype()
         self.init_shape()
         self.init_bound()
 
-        x = np.random.randint(
-            self.low, self.high, self.x_shape, dtype=self.dtype)
-        y = np.random.randint(
-            self.low, self.high, self.y_shape, dtype=self.dtype)
+        x = np.random.randint(self.low,
+                              self.high,
+                              self.x_shape,
+                              dtype=self.dtype)
+        y = np.random.randint(self.low,
+                              self.high,
+                              self.y_shape,
+                              dtype=self.dtype)
         out = np.bitwise_xor(x, y)
 
         self.inputs = {'X': x, 'Y': y}
@@ -224,6 +249,7 @@ def init_bound(self):
 
 
 class TestBitwiseXorUInt8(TestBitwiseXor):
+
     def init_dtype(self):
         self.dtype = np.uint8
 
@@ -233,6 +259,7 @@ def init_bound(self):
 
 
 class TestBitwiseXorInt8(TestBitwiseXor):
+
     def init_dtype(self):
         self.dtype = np.int8
 
@@ -242,6 +269,7 @@ def init_shape(self):
 
 
 class TestBitwiseXorInt16(TestBitwiseXor):
+
     def init_dtype(self):
         self.dtype = np.int16
 
@@ -251,6 +279,7 @@ def init_shape(self):
 
 
 class TestBitwiseXorInt64(TestBitwiseXor):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -260,6 +289,7 @@ def init_shape(self):
 
 
 class TestBitwiseXorBool(TestBitwiseXor):
+
     def setUp(self):
         self.op_type = "bitwise_xor"
         self.init_shape()
@@ -274,14 +304,17 @@ def setUp(self):
 
 ##################  TEST OP: BitwiseNot ##################
 class TestBitwiseNot(OpTest):
+
     def setUp(self):
         self.op_type = "bitwise_not"
         self.init_dtype()
         self.init_shape()
         self.init_bound()
 
-        x = np.random.randint(
-            self.low, self.high, self.x_shape, dtype=self.dtype)
+        x = np.random.randint(self.low,
+                              self.high,
+                              self.x_shape,
+                              dtype=self.dtype)
         out = np.bitwise_not(x)
 
         self.inputs = {'X': x}
@@ -305,6 +338,7 @@ def init_bound(self):
 
 
 class TestBitwiseNotUInt8(TestBitwiseNot):
+
     def init_dtype(self):
         self.dtype = np.uint8
 
@@ -314,6 +348,7 @@ def init_bound(self):
 
 
 class TestBitwiseNotInt8(TestBitwiseNot):
+
     def init_dtype(self):
         self.dtype = np.int8
 
@@ -322,6 +357,7 @@ def init_shape(self):
 
 
 class TestBitwiseNotInt16(TestBitwiseNot):
+
     def init_dtype(self):
         self.dtype = np.int16
 
@@ -331,6 +367,7 @@ def init_shape(self):
 
 
 class TestBitwiseNotInt64(TestBitwiseNot):
+
     def init_dtype(self):
         self.dtype = np.int64
 
@@ -339,6 +376,7 @@ def init_shape(self):
 
 
 class TestBitwiseNotBool(TestBitwiseNot):
+
     def setUp(self):
         self.op_type = "bitwise_not"
         self.init_shape()
diff --git a/python/paddle/fluid/tests/unittests/test_bmm_op.py b/python/paddle/fluid/tests/unittests/test_bmm_op.py
index a1c8266842087..b9a5853c492f5 100644
--- a/python/paddle/fluid/tests/unittests/test_bmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bmm_op.py
@@ -24,6 +24,7 @@
 
 
 class TestBmmOp(OpTest):
+
     def setUp(self):
         self.op_type = "bmm"
         X = np.random.random((10, 3, 4)).astype("float64")
@@ -40,25 +41,31 @@ def test_checkout_grad(self):
 
 
 class API_TestBmm(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            data1 = fluid.layers.data(
-                'data1', shape=[-1, 3, 4], dtype='float64')
-            data2 = fluid.layers.data(
-                'data2', shape=[-1, 4, 5], dtype='float64')
+            data1 = fluid.layers.data('data1',
+                                      shape=[-1, 3, 4],
+                                      dtype='float64')
+            data2 = fluid.layers.data('data2',
+                                      shape=[-1, 4, 5],
+                                      dtype='float64')
             result_bmm = paddle.bmm(data1, data2)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input1 = np.random.random([10, 3, 4]).astype('float64')
             input2 = np.random.random([10, 4, 5]).astype('float64')
-            result, = exe.run(feed={"data1": input1,
-                                    "data2": input2},
+            result, = exe.run(feed={
+                "data1": input1,
+                "data2": input2
+            },
                               fetch_list=[result_bmm])
             expected_result = np.matmul(input1, input2)
         self.assertTrue(np.allclose(expected_result, result))
 
 
 class API_TestDygraphBmm(unittest.TestCase):
+
     def test_out(self):
         input1 = np.array([[[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]],
                            [[3.0, 3.0, 3.0], [4.0, 4.0, 4.0]]])
@@ -74,6 +81,7 @@ def test_out(self):
 
 
 class TestBmmAPIError(unittest.TestCase):
+
     def test_api_error(self):
         x_data = np.arange(24, dtype='float32').reshape((2, 3, 4))
         y_data = np.arange(16, dtype='float32').reshape((2, 4, 2))
diff --git a/python/paddle/fluid/tests/unittests/test_box_clip_op.py b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
index b2b0598f31dd2..1324f251ee937 100644
--- a/python/paddle/fluid/tests/unittests/test_box_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
@@ -25,14 +25,14 @@
 def box_clip(input_box, im_info, output_box):
     im_w = round(im_info[1] / im_info[2])
     im_h = round(im_info[0] / im_info[2])
-    output_box[:, :, 0] = np.maximum(
-        np.minimum(input_box[:, :, 0], im_w - 1), 0)
-    output_box[:, :, 1] = np.maximum(
-        np.minimum(input_box[:, :, 1], im_h - 1), 0)
-    output_box[:, :, 2] = np.maximum(
-        np.minimum(input_box[:, :, 2], im_w - 1), 0)
-    output_box[:, :, 3] = np.maximum(
-        np.minimum(input_box[:, :, 3], im_h - 1), 0)
+    output_box[:, :, 0] = np.maximum(np.minimum(input_box[:, :, 0], im_w - 1),
+                                     0)
+    output_box[:, :, 1] = np.maximum(np.minimum(input_box[:, :, 1], im_h - 1),
+                                     0)
+    output_box[:, :, 2] = np.maximum(np.minimum(input_box[:, :, 2], im_w - 1),
+                                     0)
+    output_box[:, :, 3] = np.maximum(np.minimum(input_box[:, :, 3], im_h - 1),
+                                     0)
 
 
 def batch_box_clip(input_boxes, im_info, lod):
@@ -49,6 +49,7 @@ def batch_box_clip(input_boxes, im_info, lod):
 
 
 class TestBoxClipOp(OpTest):
+
     def test_check_output(self):
         self.check_output()
 
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 220bffebe8392..63df37f912259 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -34,8 +34,9 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
     pb_y = pb_y.reshape(shape)
 
     if pb_v.ndim == 2:
-        var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else (
-            pb_v.shape[0], 1, pb_v.shape[1])
+        var_shape = (1, pb_v.shape[0],
+                     pb_v.shape[1]) if axis == 0 else (pb_v.shape[0], 1,
+                                                       pb_v.shape[1])
         pb_v = pb_v.reshape(var_shape)
     if pb_v.ndim == 1:
         tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
@@ -102,6 +103,7 @@ def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0):
 
 
 class TestBoxCoderOp(OpTest):
+
     def test_check_output(self):
         self.check_output()
 
@@ -128,6 +130,7 @@ def setUp(self):
 
 
 class TestBoxCoderOpWithoutBoxVar(OpTest):
+
     def test_check_output(self):
         self.check_output()
 
@@ -154,6 +157,7 @@ def setUp(self):
 
 
 class TestBoxCoderOpWithLoD(OpTest):
+
     def test_check_output(self):
         self.check_output()
 
@@ -178,6 +182,7 @@ def setUp(self):
 
 
 class TestBoxCoderOpWithAxis(OpTest):
+
     def test_check_output(self):
         self.check_output()
 
@@ -207,6 +212,7 @@ def setUp(self):
 
 
 class TestBoxCoderOpWithVariance(OpTest):
+
     def test_check_output(self):
         self.check_output()
 
diff --git a/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py b/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py
index b0afc2a2e4ad7..00f84dc9496a4 100644
--- a/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py
@@ -64,6 +64,7 @@ def box_decoder_and_assign(deltas, weights, boxes, box_score, box_clip):
 
 
 class TestBoxDecoderAndAssignOpWithLoD(OpTest):
+
     def test_check_output(self):
         self.check_output()
 
diff --git a/python/paddle/fluid/tests/unittests/test_boxps.py b/python/paddle/fluid/tests/unittests/test_boxps.py
index d1340bb1ce7d6..ea98b6daf286b 100644
--- a/python/paddle/fluid/tests/unittests/test_boxps.py
+++ b/python/paddle/fluid/tests/unittests/test_boxps.py
@@ -38,42 +38,38 @@ def test_transpile(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
         t = self.get_transpile("single_process_multi_thread")
-        t.transpile(
-            trainer_id=0,
-            startup_program=startup_program,
-            trainers="127.0.0.1:6174",
-            program=main_program)
+        t.transpile(trainer_id=0,
+                    startup_program=startup_program,
+                    trainers="127.0.0.1:6174",
+                    program=main_program)
         t = self.get_transpile("grad_allreduce")
         try:
-            t.transpile(
-                trainer_id=0,
-                startup_program=startup_program,
-                trainers="127.0.0.1:6174",
-                program=main_program)
+            t.transpile(trainer_id=0,
+                        startup_program=startup_program,
+                        trainers="127.0.0.1:6174",
+                        program=main_program)
         except ValueError as e:
             print(e)
 
     def test_single_trainers(self):
         transpiler = collective.GradAllReduce(0)
         try:
-            transpiler.transpile(
-                startup_program=fluid.Program(),
-                main_program=fluid.Program(),
-                rank=1,
-                endpoints="127.0.0.1:6174",
-                current_endpoint="127.0.0.1:6174",
-                wait_port="6174")
+            transpiler.transpile(startup_program=fluid.Program(),
+                                 main_program=fluid.Program(),
+                                 rank=1,
+                                 endpoints="127.0.0.1:6174",
+                                 current_endpoint="127.0.0.1:6174",
+                                 wait_port="6174")
         except ValueError as e:
             print(e)
         transpiler = collective.LocalSGD(0)
         try:
-            transpiler.transpile(
-                startup_program=fluid.Program(),
-                main_program=fluid.Program(),
-                rank=1,
-                endpoints="127.0.0.1:6174",
-                current_endpoint="127.0.0.1:6174",
-                wait_port="6174")
+            transpiler.transpile(startup_program=fluid.Program(),
+                                 main_program=fluid.Program(),
+                                 rank=1,
+                                 endpoints="127.0.0.1:6174",
+                                 current_endpoint="127.0.0.1:6174",
+                                 wait_port="6174")
         except ValueError as e:
             print(e)
 
@@ -95,10 +91,14 @@ def test_pull_box_sparse_op(self):
         paddle.enable_static()
         program = fluid.Program()
         with fluid.program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[1], dtype='int64', lod_level=0)
-            y = fluid.layers.data(
-                name='y', shape=[1], dtype='int64', lod_level=0)
+            x = fluid.layers.data(name='x',
+                                  shape=[1],
+                                  dtype='int64',
+                                  lod_level=0)
+            y = fluid.layers.data(name='y',
+                                  shape=[1],
+                                  dtype='int64',
+                                  lod_level=0)
             emb_x, emb_y = _pull_box_sparse([x, y], size=1)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast.py b/python/paddle/fluid/tests/unittests/test_broadcast.py
index 8b8cdb1235ce3..159e3be7ff1ed 100644
--- a/python/paddle/fluid/tests/unittests/test_broadcast.py
+++ b/python/paddle/fluid/tests/unittests/test_broadcast.py
@@ -23,6 +23,7 @@
 
 
 class TestCBroadcastOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_error.py b/python/paddle/fluid/tests/unittests/test_broadcast_error.py
index 517de67fd6ddd..bc1d026815856 100644
--- a/python/paddle/fluid/tests/unittests/test_broadcast_error.py
+++ b/python/paddle/fluid/tests/unittests/test_broadcast_error.py
@@ -19,6 +19,7 @@
 
 
 class TestBroadcastOpCpu(OpTest):
+
     def setUp(self):
         self.op_type = "broadcast"
         input = np.random.random((100, 2)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_shape.py b/python/paddle/fluid/tests/unittests/test_broadcast_shape.py
index b4ac096a69685..8046a02c9abdb 100644
--- a/python/paddle/fluid/tests/unittests/test_broadcast_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_broadcast_shape.py
@@ -18,6 +18,7 @@
 
 
 class TestBroadcastShape(unittest.TestCase):
+
     def test_result(self):
         shape = paddle.broadcast_shape([2, 1, 3], [1, 3, 1])
         self.assertEqual(shape, [2, 3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py
index f60e4067a09e5..20e0ead8b3fa3 100644
--- a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py
+++ b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py
@@ -22,6 +22,7 @@
 from test_collective_base import TestDistBase
 
 import random
+
 random.seed(2021)
 
 paddle.enable_static()
@@ -82,6 +83,7 @@ def gen_mixed_tensors_test(dtype):
 
 
 class TestCPUBroadcastTensorsOp(OpTest):
+
     def set_place(self):
         self.place = core.CPUPlace()
 
@@ -105,22 +107,25 @@ def run_test(self, test_func, args):
                 test_func(**args)
 
     def test_check_output(self):
-        self.run_test(self.check_output_with_place,
-                      {"place": self.place,
-                       "atol": 1e-1})
-
-    def test_check_grad_normal(self):
-        self.run_test(self.check_grad_with_place, {
+        self.run_test(self.check_output_with_place, {
             "place": self.place,
-            "inputs_to_check": ['x0', 'x1'],
-            "output_names": ['out0', 'out1'],
-            "max_relative_error": 0.05,
+            "atol": 1e-1
         })
 
+    def test_check_grad_normal(self):
+        self.run_test(
+            self.check_grad_with_place, {
+                "place": self.place,
+                "inputs_to_check": ['x0', 'x1'],
+                "output_names": ['out0', 'out1'],
+                "max_relative_error": 0.05,
+            })
+
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDABroadcastTensorsOp(TestCPUBroadcastTensorsOp):
+
     def set_place(self):
         self.place = core.CUDAPlace(0)
 
@@ -131,13 +136,17 @@ def set_dtypes(self):
 
 
 class TestBroadcastTensorsAPI(unittest.TestCase):
+
     def test_api(self):
+
         def test_static():
             inputs = [
-                paddle.fluid.layers.data(
-                    shape=[4, 1, 4, 1], dtype='float32', name="x0"),
-                paddle.fluid.layers.data(
-                    shape=[1, 4, 1, 4], dtype='float32', name="x1")
+                paddle.fluid.layers.data(shape=[4, 1, 4, 1],
+                                         dtype='float32',
+                                         name="x0"),
+                paddle.fluid.layers.data(shape=[1, 4, 1, 4],
+                                         dtype='float32',
+                                         name="x1")
             ]
             paddle.broadcast_tensors(inputs)
 
@@ -159,31 +168,39 @@ def test_dynamic():
 
 
 class TestRaiseBroadcastTensorsError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             inputs = [
-                paddle.fluid.layers.data(
-                    shape=[1, 1, 1, 1], dtype='float32', name="x4"),
-                paddle.fluid.layers.data(
-                    shape=[1, 4, 1, 1], dtype='float64', name="x5")
+                paddle.fluid.layers.data(shape=[1, 1, 1, 1],
+                                         dtype='float32',
+                                         name="x4"),
+                paddle.fluid.layers.data(shape=[1, 4, 1, 1],
+                                         dtype='float64',
+                                         name="x5")
             ]
             paddle.broadcast_tensors(inputs)
 
         def test_dtype():
             inputs = [
-                paddle.fluid.layers.data(
-                    shape=[1, 1, 1, 1], dtype='int8', name="x6"),
-                paddle.fluid.layers.data(
-                    shape=[1, 4, 1, 1], dtype='int8', name="x7")
+                paddle.fluid.layers.data(shape=[1, 1, 1, 1],
+                                         dtype='int8',
+                                         name="x6"),
+                paddle.fluid.layers.data(shape=[1, 4, 1, 1],
+                                         dtype='int8',
+                                         name="x7")
             ]
             paddle.broadcast_tensors(inputs)
 
         def test_bcast_semantics():
             inputs = [
-                paddle.fluid.layers.data(
-                    shape=[1, 3, 1, 1], dtype='float32', name="x9"),
-                paddle.fluid.layers.data(
-                    shape=[1, 8, 1, 1], dtype='float32', name="x10")
+                paddle.fluid.layers.data(shape=[1, 3, 1, 1],
+                                         dtype='float32',
+                                         name="x9"),
+                paddle.fluid.layers.data(shape=[1, 8, 1, 1],
+                                         dtype='float32',
+                                         name="x10")
             ]
             paddle.broadcast_tensors(inputs)
 
@@ -193,37 +210,33 @@ def test_bcast_semantics():
 
 
 class TestRaiseBroadcastTensorsErrorDyGraph(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             inputs = [
                 paddle.to_tensor(
-                    np.ones(
-                        shape=[1, 1, 1, 1], dtype='float32', name="x4")),
+                    np.ones(shape=[1, 1, 1, 1], dtype='float32', name="x4")),
                 paddle.to_tensor(
-                    np.ones(
-                        shape=[1, 4, 1, 1], dtype='float64', name="x5"))
+                    np.ones(shape=[1, 4, 1, 1], dtype='float64', name="x5"))
             ]
             paddle.broadcast_tensors(inputs)
 
         def test_dtype():
             inputs = [
                 paddle.to_tensor(
-                    np.ones(
-                        shape=[1, 1, 1, 1], dtype='int8', name="x6")),
+                    np.ones(shape=[1, 1, 1, 1], dtype='int8', name="x6")),
                 paddle.to_tensor(
-                    np.ones(
-                        shape=[1, 4, 1, 1], dtype='int8', name="x7"))
+                    np.ones(shape=[1, 4, 1, 1], dtype='int8', name="x7"))
             ]
             paddle.broadcast_tensors(inputs)
 
         def test_bcast_semantics():
             inputs = [
                 paddle.to_tensor(
-                    np.ones(
-                        shape=[1, 3, 1, 1], dtype='float32', name="x9")),
+                    np.ones(shape=[1, 3, 1, 1], dtype='float32', name="x9")),
                 paddle.to_tensor(
-                    np.ones(
-                        shape=[1, 8, 1, 1], dtype='float32', name="x10"))
+                    np.ones(shape=[1, 8, 1, 1], dtype='float32', name="x10"))
             ]
             paddle.broadcast_tensors(inputs)
 
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_to_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_to_op.py
index 80f4c7a2698c6..3de96959eb4d6 100644
--- a/python/paddle/fluid/tests/unittests/test_broadcast_to_op.py
+++ b/python/paddle/fluid/tests/unittests/test_broadcast_to_op.py
@@ -25,10 +25,11 @@
 
 
 class TestBroadcastToError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             shape = [2, 2]
             self.assertRaises(TypeError, paddle.tensor.broadcast_to, x1, shape)
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
@@ -40,17 +41,19 @@ def test_errors(self):
 
 # Test python API
 class TestBroadcastToAPI(unittest.TestCase):
+
     def test_api(self):
         input = np.random.random([12, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+        x = fluid.layers.data(name='x',
+                              shape=[12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
         positive_2 = fluid.layers.fill_constant([1], "int32", 12)
-        expand_shape = fluid.layers.data(
-            name="expand_shape",
-            shape=[2],
-            append_batch_size=False,
-            dtype="int32")
+        expand_shape = fluid.layers.data(name="expand_shape",
+                                         shape=[2],
+                                         append_batch_size=False,
+                                         dtype="int32")
 
         out_1 = paddle.broadcast_to(x, shape=[12, 14])
         out_2 = paddle.broadcast_to(x, shape=[positive_2, 14])
@@ -61,7 +64,8 @@ def test_api(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
                                       feed={
-                                          "x": input,
+                                          "x":
+                                          input,
                                           "expand_shape":
                                           np.array([12, 14]).astype("int32")
                                       },
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
index eda7c3caaeb08..ffc173184728e 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
@@ -24,13 +24,16 @@
 batch_size = 32
 
 feed_dict = {
-    'image': np.random.random([batch_size, 784]).astype('float32'),
-    'label': np.random.random_integers(
-        low=0, high=9, size=[batch_size, 1]).astype('int64')
+    'image':
+    np.random.random([batch_size, 784]).astype('float32'),
+    'label':
+    np.random.random_integers(low=0, high=9, size=[batch_size,
+                                                   1]).astype('int64')
 }
 
 
 class InplaceTestBase(unittest.TestCase):
+
     def initParameter(self):
         self.use_cuda = True
         self.fuse_all_optimizer_ops = False
@@ -60,8 +63,8 @@ def build_program_and_scope(self):
 
                 with fluid.scope_guard(scope):
                     exe = fluid.Executor(
-                        fluid.CUDAPlace(0)
-                        if self.use_cuda else fluid.CPUPlace())
+                        fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace(
+                        ))
                     exe.run(startup_program)
 
         return main_program, scope, exe, loss
@@ -101,7 +104,7 @@ def check_single_card_fetch_var(self):
 
         all_vars_name = self.get_all_vars(prog1)
         repeated_var_names = all_vars_name * 2
-        random.shuffle(repeated_var_names)  # add some random 
+        random.shuffle(repeated_var_names)  # add some random
 
         for fetch_var in repeated_var_names:
             for _ in range(4):
@@ -117,8 +120,8 @@ def check_single_card_fetch_var(self):
                                               fetch_list=[fetch_var])
                         self.assertTrue(
                             np.array_equal(fetch_val1, fetch_val2),
-                            "error var name: {}, fetch_val1: {}, fetch_val2: {}".
-                            format(
+                            "error var name: {}, fetch_val1: {}, fetch_val2: {}"
+                            .format(
                                 fetch_var,
                                 fetch_val1[~np.equal(fetch_val1, fetch_val2)],
                                 fetch_val2[~np.equal(fetch_val1, fetch_val2)]))
@@ -145,14 +148,13 @@ def check_multi_card_fetch_var(self):
                 build_strategy.enable_inplace = enable_inplace
                 build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops
                 compiled_program = fluid.CompiledProgram(
-                    prog).with_data_parallel(
-                        loss_name=loss.name,
-                        build_strategy=build_strategy,
-                        places=places)
+                    prog).with_data_parallel(loss_name=loss.name,
+                                             build_strategy=build_strategy,
+                                             places=places)
                 compiled_programs.append(compiled_program)
 
         repeated_var_names = self.get_all_vars(prog1) * 2
-        random.shuffle(repeated_var_names)  # add some random 
+        random.shuffle(repeated_var_names)  # add some random
 
         for fetch_var in repeated_var_names:
             for _ in range(4):
@@ -175,6 +177,7 @@ def check_multi_card_fetch_var(self):
 
 
 class CUDAInplaceTest(InplaceTestBase):
+
     def initParameter(self):
         self.use_cuda = True
         self.fuse_all_optimizer_ops = False
@@ -187,6 +190,7 @@ def test_single_card_fetch_var(self):
 
 
 class CPUInplaceTest(InplaceTestBase):
+
     def initParameter(self):
         self.use_cuda = False
         self.fuse_all_optimizer_ops = False
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
index e9e62bee00680..6ce5e64b0ee57 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py
@@ -17,6 +17,7 @@
 
 
 class CUDAInplaceTestWithFuseOptimizationOps(InplaceTestBase):
+
     def initParameter(self):
         self.use_cuda = True
         self.fuse_all_optimizer_ops = True
@@ -30,6 +31,7 @@ def test_single_card_fetch_var(self):
 
 
 class CPUInplaceTestWithFuseOptimizationOps(InplaceTestBase):
+
     def initParameter(self):
         self.use_cuda = False
         self.fuse_all_optimizer_ops = True
diff --git a/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py b/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py
index 1405bf9d70b04..a5daa6917316f 100644
--- a/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py
@@ -21,6 +21,7 @@
 
 
 class FusionGroupPaddingRNNTest(PaddingRNNTestBase):
+
     def set_customed_config(self):
         self.build_strategy.enable_auto_fusion = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py b/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py
index a7f4a15381b42..2082bc7ca4ef7 100644
--- a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py
+++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py
@@ -21,6 +21,7 @@
 
 
 class TestCCommInitAllOp(unittest.TestCase):
+
     def setUp(self):
         self.place = fluid.CUDAPlace(0)
         self.exe = fluid.Executor(self.place)
@@ -41,9 +42,11 @@ def test_init_with_same_ring_id(self):
     def test_specifying_devices(self):
         program = fluid.Program()
         block = program.global_block()
-        block.append_op(
-            type='c_comm_init_all', attrs={'devices': [0],
-                                           'ring_id': 1})
+        block.append_op(type='c_comm_init_all',
+                        attrs={
+                            'devices': [0],
+                            'ring_id': 1
+                        })
         self.exe.run(program)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_c_concat.py b/python/paddle/fluid/tests/unittests/test_c_concat.py
index 20f166af14c9c..17469367a8537 100644
--- a/python/paddle/fluid/tests/unittests/test_c_concat.py
+++ b/python/paddle/fluid/tests/unittests/test_c_concat.py
@@ -23,6 +23,7 @@
 
 
 class TestConcatOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_c_identity.py b/python/paddle/fluid/tests/unittests/test_c_identity.py
index c780f800d1ed5..4697f7358c9fc 100644
--- a/python/paddle/fluid/tests/unittests/test_c_identity.py
+++ b/python/paddle/fluid/tests/unittests/test_c_identity.py
@@ -23,6 +23,7 @@
 
 
 class TestIdentityOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_c_split.py b/python/paddle/fluid/tests/unittests/test_c_split.py
index 0a5d91e0625e2..24ed6b5757206 100644
--- a/python/paddle/fluid/tests/unittests/test_c_split.py
+++ b/python/paddle/fluid/tests/unittests/test_c_split.py
@@ -23,6 +23,7 @@
 
 
 class TestSplitOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
index 63ba16c57e09b..53c578fc6c1e8 100644
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -23,6 +23,7 @@
 
 
 class TestCalcGradient(unittest.TestCase):
+
     def test_calc_gradient(self):
         main = fluid.Program()
         startup = fluid.Program()
@@ -40,6 +41,7 @@ def test_calc_gradient(self):
 
 
 class TestDoubleGrad(unittest.TestCase):
+
     def test1(self):
         main = fluid.Program()
         startup = fluid.Program()
@@ -83,6 +85,7 @@ def test2(self):
 
 
 class TestGradientWithPrune(unittest.TestCase):
+
     def test_prune(self):
         with paddle.fluid.scope_guard(paddle.static.Scope()):
             x = fluid.data(name='x', shape=[3], dtype='float32')
@@ -101,6 +104,7 @@ def test_prune(self):
 
 
 class TestDoubleGradient(unittest.TestCase):
+
     def build_program(self):
         start_prog = paddle.static.Program()
         main_prog = paddle.static.Program()
@@ -135,6 +139,7 @@ def test_calc_gradient(self):
 
 
 class TestDoubleGradient2(unittest.TestCase):
+
     def build_program(self):
         start_prog = paddle.static.Program()
         main_prog = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_case.py b/python/paddle/fluid/tests/unittests/test_case.py
index 6391435cc8095..ed633c758b540 100644
--- a/python/paddle/fluid/tests/unittests/test_case.py
+++ b/python/paddle/fluid/tests/unittests/test_case.py
@@ -26,7 +26,9 @@
 
 
 class TestAPICase(unittest.TestCase):
+
     def test_return_single_var(self):
+
         def fn_1():
             return layers.fill_constant(shape=[4, 2], dtype='int32', value=1)
 
@@ -46,16 +48,16 @@ def fn_3():
             pred_1 = layers.less_than(z, x)  # true: 0.2 < 0.3
 
             # call fn_1
-            out_0 = layers.case(
-                pred_fn_pairs=[(pred_1, fn_1), (pred_1, fn_2)], default=fn_3)
+            out_0 = layers.case(pred_fn_pairs=[(pred_1, fn_1), (pred_1, fn_2)],
+                                default=fn_3)
 
             # call fn_2
-            out_1 = layers.case(
-                pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)], default=fn_3)
+            out_1 = layers.case(pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)],
+                                default=fn_3)
 
             # call default fn_3
-            out_2 = layers.case(
-                pred_fn_pairs=((pred_2, fn_1), (pred_2, fn_2)), default=fn_3)
+            out_2 = layers.case(pred_fn_pairs=((pred_2, fn_1), (pred_2, fn_2)),
+                                default=fn_3)
 
             # no default, call fn_2
             out_3 = layers.case(pred_fn_pairs=[(pred_1, fn_2)])
@@ -63,8 +65,8 @@ def fn_3():
             # no default, call fn_2. but pred_2 is false
             out_4 = layers.case(pred_fn_pairs=[(pred_2, fn_2)])
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             res = exe.run(main_program,
@@ -77,20 +79,27 @@ def fn_3():
             self.assertTrue(np.allclose(res[4], 2))
 
     def test_return_var_tuple(self):
+
         def fn_1():
-            return layers.fill_constant(
-                shape=[1, 2], dtype='int32', value=1), layers.fill_constant(
-                    shape=[2, 3], dtype='float32', value=2)
+            return layers.fill_constant(shape=[1, 2], dtype='int32',
+                                        value=1), layers.fill_constant(
+                                            shape=[2, 3],
+                                            dtype='float32',
+                                            value=2)
 
         def fn_2():
-            return layers.fill_constant(
-                shape=[3, 4], dtype='int32', value=3), layers.fill_constant(
-                    shape=[4, 5], dtype='float32', value=4)
+            return layers.fill_constant(shape=[3, 4], dtype='int32',
+                                        value=3), layers.fill_constant(
+                                            shape=[4, 5],
+                                            dtype='float32',
+                                            value=4)
 
         def fn_3():
-            return layers.fill_constant(
-                shape=[5], dtype='int32', value=5), layers.fill_constant(
-                    shape=[5, 6], dtype='float32', value=6)
+            return layers.fill_constant(shape=[5], dtype='int32',
+                                        value=5), layers.fill_constant(
+                                            shape=[5, 6],
+                                            dtype='float32',
+                                            value=6)
 
         main_program = Program()
         startup_program = Program()
@@ -104,46 +113,54 @@ def fn_3():
 
             out = layers.case(((pred_1, fn_1), (pred_2, fn_2)), fn_3)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             ret = exe.run(main_program, fetch_list=out)
 
             self.assertTrue(
                 np.allclose(np.asarray(ret[0]), np.full((1, 2), 1, np.int32)))
             self.assertTrue(
-                np.allclose(
-                    np.asarray(ret[1]), np.full((2, 3), 2, np.float32)))
+                np.allclose(np.asarray(ret[1]), np.full((2, 3), 2, np.float32)))
 
 
 class TestAPICase_Nested(unittest.TestCase):
+
     def test_nested_case(self):
+
         def fn_1(x=1):
             var_5 = layers.fill_constant(shape=[1], dtype='int32', value=5)
             var_6 = layers.fill_constant(shape=[1], dtype='int32', value=6)
-            out = layers.case(pred_fn_pairs=[(var_5 < var_6, partial(
-                layers.fill_constant, shape=[1], dtype='int32', value=x)),
-                                             (var_5 == var_6, partial(
-                                                 layers.fill_constant,
-                                                 shape=[2],
-                                                 dtype='int32',
-                                                 value=x))])
+            out = layers.case(pred_fn_pairs=[
+                (var_5 < var_6,
+                 partial(
+                     layers.fill_constant, shape=[1], dtype='int32', value=x)),
+                (var_5 == var_6,
+                 partial(
+                     layers.fill_constant, shape=[2], dtype='int32', value=x))
+            ])
             return out
 
         def fn_2(x=2):
             var_5 = layers.fill_constant(shape=[1], dtype='int32', value=5)
             var_6 = layers.fill_constant(shape=[1], dtype='int32', value=6)
-            out = layers.case(pred_fn_pairs=[(var_5 < var_6, partial(
-                fn_1, x=x)), (var_5 == var_6, partial(
-                    layers.fill_constant, shape=[2], dtype='int32', value=x))])
+            out = layers.case(pred_fn_pairs=[
+                (var_5 < var_6, partial(fn_1, x=x)),
+                (var_5 == var_6,
+                 partial(
+                     layers.fill_constant, shape=[2], dtype='int32', value=x))
+            ])
             return out
 
         def fn_3():
             var_5 = layers.fill_constant(shape=[1], dtype='int32', value=5)
             var_6 = layers.fill_constant(shape=[1], dtype='int32', value=6)
-            out = layers.case(pred_fn_pairs=[(var_5 < var_6, partial(
-                fn_2, x=3)), (var_5 == var_6, partial(
-                    layers.fill_constant, shape=[2], dtype='int32', value=7))])
+            out = layers.case(pred_fn_pairs=[
+                (var_5 < var_6, partial(fn_2, x=3)),
+                (var_5 == var_6,
+                 partial(
+                     layers.fill_constant, shape=[2], dtype='int32', value=7))
+            ])
             return out
 
         main_program = Program()
@@ -155,17 +172,17 @@ def fn_3():
             pred_2 = layers.less_than(x, y)  # false: 0.3 < 0.1
             pred_1 = layers.less_than(z, x)  # true: 0.2 < 0.3
 
-            out_1 = layers.case(
-                pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
+            out_1 = layers.case(pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)],
+                                default=fn_3)
 
-            out_2 = layers.case(
-                pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)], default=fn_3)
+            out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)],
+                                default=fn_3)
 
-            out_3 = layers.case(
-                pred_fn_pairs=[(x == y, fn_1), (x == z, fn_2)], default=fn_3)
+            out_3 = layers.case(pred_fn_pairs=[(x == y, fn_1), (x == z, fn_2)],
+                                default=fn_3)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             res = exe.run(main_program, fetch_list=[out_1, out_2, out_3])
@@ -176,7 +193,9 @@ def fn_3():
 
 
 class TestAPICase_Error(unittest.TestCase):
+
     def test_error(self):
+
         def fn_1():
             return layers.fill_constant(shape=[4, 2], dtype='int32', value=1)
 
@@ -226,15 +245,18 @@ def type_error_default():
 
 # when optimizer in case
 class TestMutiTask(unittest.TestCase):
+
     def test_optimizer_in_case(self):
         BATCH_SIZE = 1
         INPUT_SIZE = 784
         EPOCH_NUM = 2
 
-        x = fluid.data(
-            name='x', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32')
-        y = fluid.data(
-            name='y', shape=[BATCH_SIZE, INPUT_SIZE], dtype='float32')
+        x = fluid.data(name='x',
+                       shape=[BATCH_SIZE, INPUT_SIZE],
+                       dtype='float32')
+        y = fluid.data(name='y',
+                       shape=[BATCH_SIZE, INPUT_SIZE],
+                       dtype='float32')
 
         switch_id = fluid.data(name='switch_id', shape=[1], dtype='int32')
 
diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py
index a828eca4f4ba7..6e9c9bcd147f1 100644
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
@@ -26,6 +26,7 @@
 
 
 class TestCastOpFp32ToFp64(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -44,6 +45,7 @@ def test_grad(self):
 
 
 class TestCastOpFp16ToFp32(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float16')}
@@ -60,6 +62,7 @@ def test_check_output(self):
 
 
 class TestCastOpFp32ToFp16(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -76,6 +79,7 @@ def test_check_output(self):
 
 
 class TestCastOpBf16ToFp32(OpTest):
+
     def setUp(self):
         ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16')
         self.inputs = {'X': ipt}
@@ -92,6 +96,7 @@ def test_check_output(self):
 
 
 class TestCastOpFp32ToBf16(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10]).astype('float32')
         self.inputs = {'X': ipt}
@@ -108,15 +113,17 @@ def test_check_output(self):
 
 
 class TestCastOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of cast_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.cast, x1, 'int32')
 
 
 class TestCastOpEager(unittest.TestCase):
+
     def test_eager(self):
         with paddle.fluid.dygraph.base.guard():
             with _test_eager_guard():
@@ -124,7 +131,8 @@ def test_eager(self):
                 x.stop_gradient = False
                 out = paddle.cast(x, "float32")
                 self.assertTrue(
-                    np.array_equal(out, np.ones([2, 2]).astype("float32")))
+                    np.array_equal(out,
+                                   np.ones([2, 2]).astype("float32")))
                 out.backward()
                 self.assertTrue(np.array_equal(x.gradient(), x.numpy()))
                 self.assertTrue(x.gradient().dtype == np.float16)
diff --git a/python/paddle/fluid/tests/unittests/test_center_loss.py b/python/paddle/fluid/tests/unittests/test_center_loss.py
index 07175579fdd60..fc64f37a26d8c 100644
--- a/python/paddle/fluid/tests/unittests/test_center_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_center_loss.py
@@ -22,6 +22,7 @@
 
 
 class TestCenterLossOp(OpTest):
+
     def setUp(self):
         self.op_type = "center_loss"
         self.dtype = np.float64
@@ -88,18 +89,21 @@ def test_check_grad(self):
 
 
 class TestCenterLossOpNoUpdate(TestCenterLossOp):
+
     def config(self):
         self.need_update = False
 
 
 class BadInputTestCenterLoss(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
             def test_bad_x():
                 data = [[1, 2, 3, 4], [5, 6, 7, 8]]
-                label = fluid.layers.data(
-                    name='label', shape=[2, 1], dtype='int32')
+                label = fluid.layers.data(name='label',
+                                          shape=[2, 1],
+                                          dtype='int32')
                 res = fluid.layers.center_loss(
                     data,
                     label,
@@ -111,8 +115,9 @@ def test_bad_x():
             self.assertRaises(TypeError, test_bad_x)
 
             def test_bad_y():
-                data = fluid.layers.data(
-                    name='data', shape=[2, 32], dtype='float32')
+                data = fluid.layers.data(name='data',
+                                         shape=[2, 32],
+                                         dtype='float32')
                 label = [[2], [3]]
                 res = fluid.layers.center_loss(
                     data,
@@ -125,21 +130,18 @@ def test_bad_y():
             self.assertRaises(TypeError, test_bad_y)
 
             def test_bad_alpha():
-                data = fluid.layers.data(
-                    name='data2',
-                    shape=[2, 32],
-                    dtype='float32',
-                    append_batch_size=False)
-                label = fluid.layers.data(
-                    name='label2',
-                    shape=[2, 1],
-                    dtype='int32',
-                    append_batch_size=False)
-                alpha = fluid.layers.data(
-                    name='alpha',
-                    shape=[1],
-                    dtype='int64',
-                    append_batch_size=False)
+                data = fluid.layers.data(name='data2',
+                                         shape=[2, 32],
+                                         dtype='float32',
+                                         append_batch_size=False)
+                label = fluid.layers.data(name='label2',
+                                          shape=[2, 1],
+                                          dtype='int32',
+                                          append_batch_size=False)
+                alpha = fluid.layers.data(name='alpha',
+                                          shape=[1],
+                                          dtype='int64',
+                                          append_batch_size=False)
                 res = fluid.layers.center_loss(
                     data,
                     label,
diff --git a/python/paddle/fluid/tests/unittests/test_channel_shuffle.py b/python/paddle/fluid/tests/unittests/test_channel_shuffle.py
index b4a3fc387068c..eaccb2193dc14 100644
--- a/python/paddle/fluid/tests/unittests/test_channel_shuffle.py
+++ b/python/paddle/fluid/tests/unittests/test_channel_shuffle.py
@@ -42,6 +42,7 @@ def channel_shuffle_np(x, groups, data_format="NCHW"):
 
 
 class TestChannelShuffleOp(OpTest):
+
     def setUp(self):
         self.op_type = "channel_shuffle"
         self.init_data_format()
@@ -72,11 +73,13 @@ def test_check_grad(self):
 
 
 class TestChannelLast(TestChannelShuffleOp):
+
     def init_data_format(self):
         self.format = "NHWC"
 
 
 class TestChannelShuffleAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
         self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
@@ -89,10 +92,12 @@ def test_static_graph_functional(self):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=[2, 9, 4, 4], dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=[2, 9, 4, 4],
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=[2, 4, 4, 9],
+                                    dtype="float64")
             out_1 = F.channel_shuffle(x_1, 3)
             out_2 = F.channel_shuffle(x_2, 3, "NHWC")
 
@@ -117,10 +122,12 @@ def test_static_graph_layer(self):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=[2, 9, 4, 4], dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=[2, 9, 4, 4],
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=[2, 4, 4, 9],
+                                    dtype="float64")
             # init instance
             ps_1 = paddle.nn.ChannelShuffle(3)
             ps_2 = paddle.nn.ChannelShuffle(3, "NHWC")
@@ -162,14 +169,14 @@ def run_dygraph(self, groups, data_format):
 
             paddle.disable_static(place=place)
 
-            channel_shuffle = paddle.nn.ChannelShuffle(
-                groups, data_format=data_format)
+            channel_shuffle = paddle.nn.ChannelShuffle(groups,
+                                                       data_format=data_format)
             result = channel_shuffle(paddle.to_tensor(x))
 
             self.assertTrue(np.allclose(result.numpy(), npresult))
 
-            result_functional = F.channel_shuffle(
-                paddle.to_tensor(x), 3, data_format)
+            result_functional = F.channel_shuffle(paddle.to_tensor(x), 3,
+                                                  data_format)
             self.assertTrue(np.allclose(result_functional.numpy(), npresult))
 
             channel_shuffle_str = 'groups={}'.format(groups)
@@ -185,7 +192,9 @@ def test_dygraph2(self):
 
 
 class TestChannelShuffleError(unittest.TestCase):
+
     def test_error_functional(self):
+
         def error_input():
             with paddle.fluid.dygraph.guard():
                 x = np.random.random([9, 4, 4]).astype("float64")
@@ -210,12 +219,13 @@ def error_groups_2():
         def error_data_format():
             with paddle.fluid.dygraph.guard():
                 x = np.random.random([2, 9, 4, 4]).astype("float64")
-                channel_shuffle = F.channel_shuffle(
-                    paddle.to_tensor(x), 3, "WOW")
+                channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3,
+                                                    "WOW")
 
         self.assertRaises(ValueError, error_data_format)
 
     def test_error_layer(self):
+
         def error_input_layer():
             with paddle.fluid.dygraph.guard():
                 x = np.random.random([9, 4, 4]).astype("float64")
diff --git a/python/paddle/fluid/tests/unittests/test_check_import_scipy.py b/python/paddle/fluid/tests/unittests/test_check_import_scipy.py
index 080d786cd62b4..c5e12e84c11ca 100644
--- a/python/paddle/fluid/tests/unittests/test_check_import_scipy.py
+++ b/python/paddle/fluid/tests/unittests/test_check_import_scipy.py
@@ -21,6 +21,7 @@ def my_import(name, globals=None, locals=None, fromlist=(), level=0):
 
 
 class importTest(unittest.TestCase):
+
     def test_import(self):
         testOsName = 'nt'
         old_import = builtins.__import__
diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
index 4c1b1e0f0bf90..1f90270630be1 100644
--- a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
@@ -26,6 +26,7 @@
 
 
 class CheckpointerSaverTest(unittest.TestCase):
+
     def test(self):
         fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
         dir_path = "./checkpointsaver_test"
diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
index 633aa2cd613b6..ac6525e7a478c 100644
--- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
@@ -26,7 +26,8 @@
 
 
 @skip_check_grad_ci(
-    reason="The input of cholesky_op should always be symmetric positive-definite. "
+    reason=
+    "The input of cholesky_op should always be symmetric positive-definite. "
     "However, OpTest calculates the numeric gradient of each element in input "
     "via small finite difference, which makes the input no longer symmetric "
     "positive-definite thus can not compute the Cholesky decomposition. "
@@ -34,13 +35,15 @@
     "check of cholesky_op, since it supports check gradient with a program "
     "and we can construct symmetric positive-definite matrices in the program")
 class TestCholeskyOp(OpTest):
+
     def setUp(self):
         self.op_type = "cholesky"
         self._input_shape = (2, 32, 32)
         self._upper = True
         self.init_config()
         self.trans_dims = list(range(len(self._input_shape) - 2)) + [
-            len(self._input_shape) - 1, len(self._input_shape) - 2
+            len(self._input_shape) - 1,
+            len(self._input_shape) - 2
         ]
         self.root_data = np.random.random(self._input_shape).astype("float64")
         # construct symmetric positive-definite matrice
@@ -69,8 +72,8 @@ def func(self, place):
         root_data = self.root_data[..., :3, :3]
         prog = fluid.Program()
         with fluid.program_guard(prog):
-            root = layers.create_parameter(
-                dtype=root_data.dtype, shape=root_data.shape)
+            root = layers.create_parameter(dtype=root_data.dtype,
+                                           shape=root_data.shape)
             root_t = layers.transpose(root, self.trans_dims)
             x = layers.matmul(x=root, y=root_t) + 1e-05
             out = paddle.cholesky(x, upper=self.attrs["upper"])
@@ -81,16 +84,19 @@ def init_config(self):
 
 
 class TestCholeskyOpLower(TestCholeskyOp):
+
     def init_config(self):
         self._upper = False
 
 
 class TestCholeskyOp2D(TestCholeskyOp):
+
     def init_config(self):
         self._input_shape = (64, 64)
 
 
 class TestDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         if core.is_compiled_with_rocm():
             paddle.disable_static(place=fluid.CPUPlace())
@@ -104,6 +110,7 @@ def test_dygraph(self):
 
 
 class TestCholeskySingularAPI(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and (not core.is_compiled_with_rocm()):
diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py
index bb45a52566211..d03cfed9697c7 100644
--- a/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py
@@ -20,6 +20,7 @@
 import scipy.linalg
 
 import sys
+
 sys.path.append("..")
 import paddle
 from op_test import OpTest
@@ -40,18 +41,17 @@ def cholesky_solution(X, B, upper=True):
         L = A
         U = A.T
     return scipy.linalg.solve_triangular(
-        U, scipy.linalg.solve_triangular(
-            L, B, lower=True))
+        U, scipy.linalg.solve_triangular(L, B, lower=True))
 
 
 #cholesky_solve implement 2
 def scipy_cholesky_solution(X, B, upper=True):
     if upper:
         umat = np.triu(X)
-        A = umat.T @umat
+        A = umat.T @ umat
     else:
         umat = np.tril(X)
-        A = umat @umat.T
+        A = umat @ umat.T
     K = scipy.linalg.cho_factor(A)
     return scipy.linalg.cho_solve(K, B)
 
@@ -88,7 +88,7 @@ def scipy_cholesky_solution_batch(bumat, bB, upper=True):
         batch *= d
     bx = []
     for b in range(batch):
-        # x = scipy_cholesky_solution(bumat[b], bB[b], upper)   #large matrix result error 
+        # x = scipy_cholesky_solution(bumat[b], bB[b], upper)   #large matrix result error
         x = cholesky_solution(bumat[b], bB[b], upper)
         bx.append(x)
     return np.array(bx).reshape(bshape)
@@ -111,8 +111,9 @@ def config(self):
     #get scipy result
     def set_output(self):
         umat = self.inputs['Y']
-        self.output = scipy_cholesky_solution_batch(
-            umat, self.inputs['X'], upper=self.upper)
+        self.output = scipy_cholesky_solution_batch(umat,
+                                                    self.inputs['X'],
+                                                    upper=self.upper)
 
     def setUp(self):
         self.op_type = "cholesky_solve"
@@ -155,6 +156,7 @@ def config(self):
 
 #API function test
 class TestCholeskySolveAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(2021)
         self.place = [paddle.CPUPlace()]
@@ -181,8 +183,10 @@ def check_static_result(self, place):
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"x": x_np,
-                                    "y": umat},
+                              feed={
+                                  "x": x_np,
+                                  "y": umat
+                              },
                               fetch_list=[z])
             self.assertTrue(np.allclose(fetches[0], z_np))
 
@@ -193,6 +197,7 @@ def test_static(self):
 
     #test in dynamic mode
     def test_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             x_np = np.random.random([20, 2]).astype(self.dtype)
@@ -212,6 +217,7 @@ def run(place):
 
     #test input with broadcast
     def test_broadcast(self):
+
         def run(place):
             paddle.disable_static()
             x_np = np.random.random([1, 30, 2]).astype(self.dtype)
@@ -232,17 +238,18 @@ def run(place):
 
 #test condition out of bounds
 class TestCholeskySolveOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input type of solve_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x1, y1)
 
-            # The data type of input must be float32 or float64.        
+            # The data type of input must be float32 or float64.
             x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool")
             y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool")
             self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x2, y2)
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
index 2ebf6070c8269..ec98b254e061b 100644
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
@@ -23,6 +23,7 @@
 
 
 class Segment(object):
+
     def __init__(self, chunk_type, start_idx, end_idx):
         self.chunk_type = chunk_type
         self.start_idx = start_idx
@@ -49,15 +50,15 @@ def fill_with_chunks(self, data, chunks):
         for chunk in chunks:
             if self.scheme == 'IOB':
                 data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types
-                data[chunk.start_idx + 1:
-                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                data[chunk.start_idx + 1:chunk.
+                     end_idx] = chunk.chunk_type * self.num_tag_types + (
                          self.num_tag_types - 1)
                 data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
                     self.num_tag_types - 1
                 ) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx]
             elif self.scheme == 'IOE':
-                data[chunk.start_idx:
-                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types
+                data[chunk.start_idx:chunk.
+                     end_idx] = chunk.chunk_type * self.num_tag_types
                 data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
                     self.num_tag_types - 1)
 
@@ -67,15 +68,15 @@ def rand_chunks(self, starts, num_chunks):
         chunks = []
         # generate chunk beginnings
         chunk_begins = sorted(
-            np.random.choice(
-                list(range(starts[-1])), num_chunks, replace=False))
+            np.random.choice(list(range(starts[-1])), num_chunks,
+                             replace=False))
         seq_chunk_begins = []
         begin_idx = 0
         # divide chunks into sequences
         for i in range(len(starts) - 1):
             tmp_chunk_begins = []
-            while begin_idx < len(chunk_begins) and chunk_begins[
-                    begin_idx] < starts[i + 1]:
+            while begin_idx < len(
+                    chunk_begins) and chunk_begins[begin_idx] < starts[i + 1]:
                 tmp_chunk_begins.append(chunk_begins[begin_idx])
                 begin_idx += 1
             seq_chunk_begins.append(tmp_chunk_begins)
@@ -84,8 +85,9 @@ def rand_chunks(self, starts, num_chunks):
         for i in range(len(seq_chunk_begins)):
             for j in range(len(seq_chunk_begins[i])):
                 low = seq_chunk_begins[i][j]
-                high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[
-                    i]) - 1 else starts[i + 1]
+                high = seq_chunk_begins[i][
+                    j + 1] if j < len(seq_chunk_begins[i]) - 1 else starts[i +
+                                                                           1]
                 chunk_ends.append(np.random.randint(low, high))
         # generate chunks
         for chunk_pos in zip(chunk_begins, chunk_ends):
@@ -94,11 +96,12 @@ def rand_chunks(self, starts, num_chunks):
         return chunks
 
     def gen_chunks(self, infer, label, starts):
-        chunks = self.rand_chunks(starts,
-                                  self.num_infer_chunks + self.num_label_chunks
-                                  - self.num_correct_chunks)
-        correct_chunks = np.random.choice(
-            list(range(len(chunks))), self.num_correct_chunks, replace=False)
+        chunks = self.rand_chunks(
+            starts, self.num_infer_chunks + self.num_label_chunks -
+            self.num_correct_chunks)
+        correct_chunks = np.random.choice(list(range(len(chunks))),
+                                          self.num_correct_chunks,
+                                          replace=False)
         infer_chunks = np.random.choice(
             [x for x in range(len(chunks)) if x not in correct_chunks],
             self.num_infer_chunks - self.num_correct_chunks,
@@ -142,10 +145,9 @@ def set_data(self):
         infer = np.zeros((self.batch_size, )).astype('int64')
         infer.fill(self.num_chunk_types * self.num_tag_types)
         label = np.copy(infer)
-        starts = np.random.choice(
-            list(range(1, self.batch_size)),
-            self.num_sequences - 1,
-            replace=False).tolist()
+        starts = np.random.choice(list(range(1, self.batch_size)),
+                                  self.num_sequences - 1,
+                                  replace=False).tolist()
         starts.extend([0, self.batch_size])
         starts = sorted(starts)
         self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
@@ -162,18 +164,15 @@ def set_data(self):
         f1 = float(2 * precision * recall) / (
             precision + recall) if self.num_correct_chunks else 0
         self.outputs = {
-            'Precision': np.asarray(
-                [precision], dtype='float32'),
-            'Recall': np.asarray(
-                [recall], dtype='float32'),
-            'F1-Score': np.asarray(
-                [f1], dtype='float32'),
-            'NumInferChunks': np.asarray(
-                [self.num_infer_chunks], dtype='int64'),
-            'NumLabelChunks': np.asarray(
-                [self.num_label_chunks], dtype='int64'),
-            'NumCorrectChunks': np.asarray(
-                [self.num_correct_chunks], dtype='int64')
+            'Precision': np.asarray([precision], dtype='float32'),
+            'Recall': np.asarray([recall], dtype='float32'),
+            'F1-Score': np.asarray([f1], dtype='float32'),
+            'NumInferChunks': np.asarray([self.num_infer_chunks],
+                                         dtype='int64'),
+            'NumLabelChunks': np.asarray([self.num_label_chunks],
+                                         dtype='int64'),
+            'NumCorrectChunks': np.asarray([self.num_correct_chunks],
+                                           dtype='int64')
         }
 
     def set_input(self, infer, label, lod):
@@ -189,6 +188,7 @@ def test_check_output(self):
 
 
 class TestChunkEvalOpWithExclude(TestChunkEvalOp):
+
     def set_confs(self):
         # Use the IOE scheme and labels with 3 chunk types
         self.scheme = 'IOE'
@@ -205,6 +205,7 @@ def set_confs(self):
 
 
 class TestChunkEvalOpWithTensorInput(TestChunkEvalOp):
+
     def set_input(self, infer, label, lod):
         max_len = np.max(lod)
         pad_infer = []
@@ -233,41 +234,41 @@ def set_input(self, infer, label, lod):
 
 
 class TestChunkEvalOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_input():
                 input_data = np.random.random(1, 1).astype("int64")
                 label_data = np.random.random(1).astype("int64")
-                fluid.layers.chunk_eval(
-                    input=input_data,
-                    label=label_data,
-                    chunk_scheme="IOB",
-                    num_chunk_types=3)
+                fluid.layers.chunk_eval(input=input_data,
+                                        label=label_data,
+                                        chunk_scheme="IOB",
+                                        num_chunk_types=3)
 
             self.assertRaises(TypeError, test_input)
 
             def test_label():
-                input_ = fluid.data(
-                    name="input", shape=[None, 1], dtype="int64")
+                input_ = fluid.data(name="input",
+                                    shape=[None, 1],
+                                    dtype="int64")
                 label_data = np.random.random(1).astype("int64")
-                fluid.layers.chunk_eval(
-                    input=input_,
-                    label=label_data,
-                    chunk_scheme="IOB",
-                    num_chunk_types=3)
+                fluid.layers.chunk_eval(input=input_,
+                                        label=label_data,
+                                        chunk_scheme="IOB",
+                                        num_chunk_types=3)
 
             self.assertRaises(TypeError, test_label)
 
             def test_type():
-                in_data = fluid.data(
-                    name="input_", shape=[None, 1], dtype="int32")
+                in_data = fluid.data(name="input_",
+                                     shape=[None, 1],
+                                     dtype="int32")
                 label = fluid.data(name="label_", shape=[1], dtype="int64")
-                fluid.layers.chunk_eval(
-                    input=in_data,
-                    label=label,
-                    chunk_scheme="IOB",
-                    num_chunk_types=3)
+                fluid.layers.chunk_eval(input=in_data,
+                                        label=label,
+                                        chunk_scheme="IOB",
+                                        num_chunk_types=3)
 
             self.assertRaises(TypeError, test_type)
 
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_op.py b/python/paddle/fluid/tests/unittests/test_chunk_op.py
index 8488bfe773f83..d7362430f1a35 100644
--- a/python/paddle/fluid/tests/unittests/test_chunk_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_op.py
@@ -23,6 +23,7 @@
 
 
 class TestChunkOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The type of axis in chunk_op should be int or Variable.
@@ -55,6 +56,7 @@ def test_axis_type_tensor():
 
 
 class API_TestChunk(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = paddle.fluid.data('data1', shape=[4, 6, 6], dtype='float64')
@@ -64,8 +66,10 @@ def test_out(self):
             exe = paddle.static.Executor(place)
             input1 = np.random.random([4, 6, 6]).astype('float64')
             input2 = np.array([2]).astype('int32')
-            r0, r1, r2, = exe.run(feed={"data1": input1,
-                                        "data2": input2},
+            r0, r1, r2, = exe.run(feed={
+                "data1": input1,
+                "data2": input2
+            },
                                   fetch_list=[x0, x1, x2])
             ex_x0, ex_x1, ex_x2 = np.array_split(input1, 3, axis=2)
             self.assertTrue(np.allclose(ex_x0, r0))
@@ -74,6 +78,7 @@ def test_out(self):
 
 
 class API_TestChunk1(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = paddle.fluid.data('data1', shape=[4, 6, 6], dtype='float64')
@@ -90,6 +95,7 @@ def test_out(self):
 
 
 class API_TestDygraphChunk(unittest.TestCase):
+
     def test_out1(self):
         with fluid.dygraph.guard():
             input_1 = np.random.random([4, 6, 6]).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py b/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py
index eb7d05df492ec..492dae47f2acb 100644
--- a/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py
+++ b/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py
@@ -34,8 +34,8 @@ def class_center_sample_numpy(label, classes_list, num_samples):
     for i in range(nranks):
         index = np.logical_and(unique_label >= class_interval[i],
                                unique_label < class_interval[i + 1])
-        pos_class_center_per_device.append(unique_label[index] - class_interval[
-            i])
+        pos_class_center_per_device.append(unique_label[index] -
+                                           class_interval[i])
         unique_label_per_device.append(unique_label[index])
 
     num_samples_per_device = []
@@ -57,6 +57,7 @@ def class_center_sample_numpy(label, classes_list, num_samples):
 
 
 class TestClassCenterSampleOp(OpTest):
+
     def initParams(self):
         self.op_type = "class_center_sample"
         self.batch_size = 20
@@ -74,8 +75,9 @@ def setUp(self):
         self.initParams()
         self.init_dtype()
         self.init_fix_seed()
-        label = np.random.randint(
-            0, self.num_classes, (self.batch_size, ), dtype=self.dtype)
+        label = np.random.randint(0,
+                                  self.num_classes, (self.batch_size, ),
+                                  dtype=self.dtype)
 
         remapped_label, sampled_class_center = class_center_sample_numpy(
             label, [self.num_classes], self.num_samples)
@@ -98,16 +100,19 @@ def test_check_output(self):
 
 
 class TestClassCenterSampleOpINT32(TestClassCenterSampleOp):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestClassCenterSampleOpFixSeed(TestClassCenterSampleOp):
+
     def init_fix_seed(self):
         self.fix_seed = True
 
 
 class TestClassCenterSampleV2(unittest.TestCase):
+
     def setUp(self):
         self.initParams()
         np.random.seed(self.seed)
@@ -132,21 +137,23 @@ def test_static(self):
 
     def check_static_result(self, place):
         with program_guard(Program(), Program()):
-            label_np = np.random.randint(
-                0, self.num_classes, (self.batch_size, ), dtype=self.dtype)
+            label_np = np.random.randint(0,
+                                         self.num_classes, (self.batch_size, ),
+                                         dtype=self.dtype)
 
-            label = paddle.static.data(
-                name='label', shape=[self.batch_size], dtype=self.dtype)
+            label = paddle.static.data(name='label',
+                                       shape=[self.batch_size],
+                                       dtype=self.dtype)
             remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(
                 label, self.num_classes, self.num_samples)
 
             remapped_label_np, sampled_class_center_np = class_center_sample_numpy(
                 label_np, [self.num_classes], self.num_samples)
             exe = paddle.fluid.Executor(place)
-            [remapped_label_res, sampled_class_index_res] = exe.run(
-                paddle.fluid.default_main_program(),
-                feed={'label': label_np},
-                fetch_list=[remapped_label, sampled_class_index])
+            [remapped_label_res, sampled_class_index_res
+             ] = exe.run(paddle.fluid.default_main_program(),
+                         feed={'label': label_np},
+                         fetch_list=[remapped_label, sampled_class_index])
             np.testing.assert_allclose(remapped_label_res, remapped_label_np)
             np.testing.assert_allclose(
                 sampled_class_index_res[:len(sampled_class_center_np[0])],
@@ -158,8 +165,9 @@ def test_dynamic(self):
 
     def check_dynamic_result(self, place):
         with paddle.fluid.dygraph.guard(place):
-            label_np = np.random.randint(
-                0, self.num_classes, (self.batch_size, ), dtype=self.dtype)
+            label_np = np.random.randint(0,
+                                         self.num_classes, (self.batch_size, ),
+                                         dtype=self.dtype)
             label = paddle.to_tensor(label_np, dtype=self.dtype)
 
             remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(
@@ -177,11 +185,13 @@ def check_dynamic_result(self, place):
 
 
 class TestClassCenterSampleV2INT32(TestClassCenterSampleV2):
+
     def init_dtype(self):
         self.dtype = np.int32
 
 
 class TestClassCenterSampleAPIError(unittest.TestCase):
+
     def setUp(self):
         self.initParams()
         np.random.seed(self.seed)
@@ -200,13 +210,14 @@ def init_dtype(self):
         self.dtype = np.int64
 
     def test_dynamic_errors(self):
+
         def test_num_samples():
             for place in self.places:
                 with paddle.fluid.dygraph.guard(place):
-                    label_np = np.random.randint(
-                        0,
-                        self.num_classes, (self.batch_size, ),
-                        dtype=self.dtype)
+                    label_np = np.random.randint(0,
+                                                 self.num_classes,
+                                                 (self.batch_size, ),
+                                                 dtype=self.dtype)
                     label = paddle.to_tensor(label_np)
 
                     remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(
@@ -216,6 +227,7 @@ def test_num_samples():
 
 
 class TestClassCenterSampleAPIError1(unittest.TestCase):
+
     def setUp(self):
         self.initParams()
         np.random.seed(self.seed)
@@ -234,6 +246,7 @@ def init_dtype(self):
         self.dtype = np.int64
 
     def test_dynamic_errors(self):
+
         def test_empty_label():
             for place in self.places:
                 with paddle.fluid.dygraph.guard(place):
@@ -245,10 +258,10 @@ def test_empty_label():
         def test_group_value():
             for place in self.places:
                 with paddle.fluid.dygraph.guard(place):
-                    label_np = np.random.randint(
-                        0,
-                        self.num_classes, (self.batch_size, ),
-                        dtype=self.dtype)
+                    label_np = np.random.randint(0,
+                                                 self.num_classes,
+                                                 (self.batch_size, ),
+                                                 dtype=self.dtype)
                     label = paddle.to_tensor(label_np)
 
                     remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(
diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
index 7f137cf137146..8eb4c7a8be965 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
@@ -24,6 +24,7 @@
 
 
 class TestClipByNormOp(OpTest):
+
     def setUp(self):
         self.max_relative_error = 0.006
         self.init_dtype()
@@ -31,7 +32,9 @@ def setUp(self):
         input = np.random.random(self.shape).astype(self.dtype)
         input[np.abs(input) < self.max_relative_error] = 0.5
         self.op_type = "clip_by_norm"
-        self.inputs = {'X': input, }
+        self.inputs = {
+            'X': input,
+        }
         self.attrs = {}
         self.attrs['max_norm'] = self.max_norm
         norm = np.sqrt(np.sum(np.square(input)))
@@ -53,24 +56,28 @@ def init_dtype(self):
 
 
 class TestCase1(TestClipByNormOp):
+
     def initTestCase(self):
         self.shape = (100, )
         self.max_norm = 1e20
 
 
 class TestCase2(TestClipByNormOp):
+
     def initTestCase(self):
         self.shape = (16, 16)
         self.max_norm = 0.1
 
 
 class TestCase3(TestClipByNormOp):
+
     def initTestCase(self):
         self.shape = (4, 8, 16)
         self.max_norm = 1.0
 
 
 class TestClipByNormOpFp16(TestClipByNormOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -82,24 +89,28 @@ def test_check_output(self):
 
 
 class TestClipByNormOpFp16Case1(TestClipByNormOpFp16):
+
     def initTestCase(self):
         self.shape = (100, )
         self.max_norm = 1e20
 
 
 class TestClipByNormOpFp16Case2(TestClipByNormOpFp16):
+
     def initTestCase(self):
         self.shape = (16, 16)
         self.max_norm = 0.1
 
 
 class TestClipByNormOpFp16Case3(TestClipByNormOpFp16):
+
     def initTestCase(self):
         self.shape = (4, 8, 16)
         self.max_norm = 1.0
 
 
 class TestClipByNormOpWithSelectedRows(unittest.TestCase):
+
     def check_with_place(self, place):
         self.config_test_case()
         scope = core.Scope()
@@ -116,8 +127,10 @@ def check_with_place(self, place):
         out_selected_rows = scope.var('Out').get_selected_rows()
 
         # run clip_by_norm_op
-        clip_by_norm_op = fluid.op.Operator(
-            "clip_by_norm", max_norm=self.max_norm, X='X', Out='Out')
+        clip_by_norm_op = fluid.op.Operator("clip_by_norm",
+                                            max_norm=self.max_norm,
+                                            X='X',
+                                            Out='Out')
         clip_by_norm_op.run(scope, place)
 
         # check output
@@ -133,8 +146,10 @@ def check_with_place(self, place):
         else:
             output = y_np
         self.assertTrue(
-            np.allclose(
-                np.array(out_tensor), output, atol=1e-5, equal_nan=False))
+            np.allclose(np.array(out_tensor),
+                        output,
+                        atol=1e-5,
+                        equal_nan=False))
 
     def test_clip_by_norm_with_selected_ros(self):
         places = [core.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
index 121b91d741546..61ff4a63befd5 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -24,6 +24,7 @@
 
 
 class TestClipOp(OpTest):
+
     def setUp(self):
         self.max_relative_error = 0.006
         self.python_api = paddle.clip
@@ -71,6 +72,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestClipOp):
+
     def initTestCase(self):
         self.dtype = np.float32
         self.shape = (8, 16, 8)
@@ -79,6 +81,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestClipOp):
+
     def initTestCase(self):
         self.dtype = np.float32
         self.shape = (8, 16)
@@ -87,6 +90,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestClipOp):
+
     def initTestCase(self):
         self.dtype = np.float32
         self.shape = (4, 8, 16)
@@ -95,6 +99,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestClipOp):
+
     def initTestCase(self):
         self.dtype = np.float32
         self.shape = (4, 8, 8)
@@ -105,6 +110,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestClipOp):
+
     def initTestCase(self):
         self.dtype = np.float32
         self.shape = (4, 8, 16)
@@ -113,6 +119,7 @@ def initTestCase(self):
 
 
 class TestCase6(TestClipOp):
+
     def initTestCase(self):
         self.dtype == np.float16
         self.shape = (4, 8, 8)
@@ -123,6 +130,7 @@ def initTestCase(self):
 
 
 class TestClipOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -142,6 +150,7 @@ def test_dtype():
 
 
 class TestClipAPI(unittest.TestCase):
+
     def _executed_api(self, x, min=None, max=None):
         return paddle.clip(x, min, max)
 
@@ -153,8 +162,8 @@ def test_clip(self):
         min = fluid.data(name='min', shape=[1], dtype='float32')
         max = fluid.data(name='max', shape=[1], dtype='float32')
 
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         out_1 = self._executed_api(images, min=min, max=max)
@@ -165,12 +174,15 @@ def test_clip(self):
         out_6 = self._executed_api(images, max=max)
         out_7 = self._executed_api(images, max=-1.)
         out_8 = self._executed_api(images)
-        out_9 = self._executed_api(
-            paddle.cast(images, 'float64'), min=0.2, max=0.9)
-        out_10 = self._executed_api(
-            paddle.cast(images * 10, 'int32'), min=2, max=8)
-        out_11 = self._executed_api(
-            paddle.cast(images * 10, 'int64'), min=2, max=8)
+        out_9 = self._executed_api(paddle.cast(images, 'float64'),
+                                   min=0.2,
+                                   max=0.9)
+        out_10 = self._executed_api(paddle.cast(images * 10, 'int32'),
+                                    min=2,
+                                    max=8)
+        out_11 = self._executed_api(paddle.cast(images * 10, 'int64'),
+                                    min=2,
+                                    max=8)
 
         res1, res2, res3, res4, res5, res6, res7, res8, res9, res10, res11 = exe.run(
             fluid.default_main_program(),
@@ -193,7 +205,8 @@ def test_clip(self):
         self.assertTrue(np.allclose(res7, data.clip(max=-1)))
         self.assertTrue(np.allclose(res8, data))
         self.assertTrue(
-            np.allclose(res9, data.astype(np.float64).clip(0.2, 0.9)))
+            np.allclose(res9,
+                        data.astype(np.float64).clip(0.2, 0.9)))
         self.assertTrue(
             np.allclose(res10, (data * 10).astype(np.int32).clip(2, 8)))
         self.assertTrue(
@@ -202,8 +215,8 @@ def test_clip(self):
 
     def func_clip_dygraph(self):
         paddle.disable_static()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         paddle.disable_static(place)
         data_shape = [1, 9, 9, 4]
         data = np.random.random(data_shape).astype('float32')
@@ -217,10 +230,12 @@ def func_clip_dygraph(self):
         images = paddle.to_tensor(data, dtype='float32')
         out_3 = self._executed_api(images, min=v_min, max=v_max)
 
-        out_4 = self._executed_api(
-            paddle.cast(images * 10, 'int32'), min=2, max=8)
-        out_5 = self._executed_api(
-            paddle.cast(images * 10, 'int64'), min=2, max=8)
+        out_4 = self._executed_api(paddle.cast(images * 10, 'int32'),
+                                   min=2,
+                                   max=8)
+        out_5 = self._executed_api(paddle.cast(images * 10, 'int64'),
+                                   min=2,
+                                   max=8)
         # test with numpy.generic
         out_6 = self._executed_api(images, min=np.abs(0.2), max=np.abs(0.8))
 
@@ -267,6 +282,7 @@ def test_errors(self):
 
 
 class TestInplaceClipAPI(TestClipAPI):
+
     def _executed_api(self, x, min=None, max=None):
         return x.clip_(min, max)
 
diff --git a/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py b/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py
index 868a72334247d..495d405c46bad 100644
--- a/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py
@@ -25,6 +25,7 @@
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestAllocContinuousSpace(OpTest):
+
     def setUp(self):
         self.op_type = "coalesce_tensor"
         self.dtype, self.fluid_dtype = self.init_dtype()
@@ -80,13 +81,15 @@ def init_output(self, input_list, set_constant, constant):
         return outputs, coalesce_tensor_var
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=core.CUDAPlace(0), no_check_set=["FusedOutput"], atol=1e-5)
+        self.check_output_with_place(place=core.CUDAPlace(0),
+                                     no_check_set=["FusedOutput"],
+                                     atol=1e-5)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestAllocContinuousSpace2(TestAllocContinuousSpace):
+
     def init_attr(self):
         return {
             "copy_data": False,
@@ -97,8 +100,9 @@ def init_attr(self):
         }
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=core.CUDAPlace(0), no_check_set=["FusedOutput"], atol=1e-5)
+        self.check_output_with_place(place=core.CUDAPlace(0),
+                                     no_check_set=["FusedOutput"],
+                                     atol=1e-5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
index a2f56c428012c..5f854ccefcb97 100644
--- a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
@@ -22,11 +22,12 @@
 
 
 class TestCollectFPNProposalstOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
-        self.scores_input = [('y%d' % i,
-                              (self.scores[i].reshape(-1, 1), self.rois_lod[i]))
+        self.scores_input = [('y%d' % i, (self.scores[i].reshape(-1, 1),
+                                          self.rois_lod[i]))
                              for i in range(self.num_level)]
         self.rois, self.lod = self.calc_rois_collect()
         inputs_x = [('x%d' % i, (self.roi_inputs[i][:, 1:], self.rois_lod[i]))
@@ -36,7 +37,9 @@ def set_data(self):
             "MultiLevelScores": self.scores_input,
             'MultiLevelRoIsNum': []
         }
-        self.attrs = {'post_nms_topN': self.post_nms_top_n, }
+        self.attrs = {
+            'post_nms_topN': self.post_nms_top_n,
+        }
         self.outputs = {
             'FpnRois': (self.rois, [self.lod]),
             'RoisNum': np.array(self.lod).astype('int32')
@@ -101,26 +104,28 @@ def test_check_output(self):
 
 
 class TestCollectFPNProposalstOpWithRoisNum(TestCollectFPNProposalstOp):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
-        self.scores_input = [('y%d' % i,
-                              (self.scores[i].reshape(-1, 1), self.rois_lod[i]))
+        self.scores_input = [('y%d' % i, (self.scores[i].reshape(-1, 1),
+                                          self.rois_lod[i]))
                              for i in range(self.num_level)]
         self.rois, self.lod = self.calc_rois_collect()
         inputs_x = [('x%d' % i, (self.roi_inputs[i][:, 1:], self.rois_lod[i]))
                     for i in range(self.num_level)]
-        rois_num_per_level = [
-            ('rois%d' % i, np.array(self.rois_lod[i][0]).astype('int32'))
-            for i in range(self.num_level)
-        ]
+        rois_num_per_level = [('rois%d' % i,
+                               np.array(self.rois_lod[i][0]).astype('int32'))
+                              for i in range(self.num_level)]
 
         self.inputs = {
             'MultiLevelRois': inputs_x,
             "MultiLevelScores": self.scores_input,
             'MultiLevelRoIsNum': rois_num_per_level
         }
-        self.attrs = {'post_nms_topN': self.post_nms_top_n, }
+        self.attrs = {
+            'post_nms_topN': self.post_nms_top_n,
+        }
         self.outputs = {
             'FpnRois': (self.rois, [self.lod]),
             'RoisNum': np.array(self.lod).astype('int32')
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
index dbf77fafcc47d..ebc52ded8bc72 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveAllgatherAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
index eed2388f36ffe..5ec08aa72e29f 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveAllreduceAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/test_collective_alltoall_api.py
index bb6a8c29bc508..2fe1252846cb3 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_alltoall_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_alltoall_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveAllToAllAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
@@ -30,11 +31,10 @@ def test_alltoall_nccl(self):
         self.check_with_place("collective_alltoall_api.py", "alltoall", "nccl")
 
     def test_alltoall_nccl_dygraph(self):
-        self.check_with_place(
-            "collective_alltoall_api_dygraph.py",
-            "alltoall",
-            "nccl",
-            static_mode="0")
+        self.check_with_place("collective_alltoall_api_dygraph.py",
+                              "alltoall",
+                              "nccl",
+                              static_mode="0")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index a4e71db3d3850..46cf0f4fcadb8 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -31,6 +31,7 @@
 
 
 class TestCollectiveAPIRunnerBase(object):
+
     def get_model(self, train_prog, startup_prog, rank, indata=None):
         raise NotImplementedError(
             "get model should be implemented by child class.")
@@ -90,6 +91,7 @@ def runtime_main(test_class, col_type):
 
 
 class TestDistBase(unittest.TestCase):
+
     def setUp(self):
         self._port_set = set()
         self._trainers = 2
@@ -98,6 +100,7 @@ def setUp(self):
         self._python_interp = sys.executable
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -157,18 +160,16 @@ def _run_cluster(self, model_file, envs):
         tr1_cmd = tr_cmd % (self._python_interp, model_file)
         tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
         tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
-        #print(tr0_cmd) 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
-
-        tr1_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
+        #print(tr0_cmd)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=env0)
+
+        tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
@@ -223,8 +224,8 @@ def check_with_place(self,
         else:
             required_envs["FLAGS_enable_eager_mode"] = "%d" % 0
 
-        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
-                                                         required_envs)
+        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
+            model_file, required_envs)
         np.random.seed(pid0)
         input1 = np.random.random((10, 1000))
         np.random.seed(pid1)
@@ -251,11 +252,9 @@ def check_with_place(self,
         elif col_type == "allreduce":
             need_result = input1 + input2
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "parallel_embedding":
             result_data = tr0_out[0]
             np.random.seed(2020)
@@ -263,24 +262,23 @@ def check_with_place(self,
             for i in range(result_data.shape[0]):
                 for j in range(result_data.shape[1]):
                     data = result_data[i][j]
-                    assert np.allclose(
-                        tr0_out[1][i][j], need_result[data], atol=1e-08)
+                    assert np.allclose(tr0_out[1][i][j],
+                                       need_result[data],
+                                       atol=1e-08)
         elif col_type == "row_parallel_linear":
             result_data = tr0_out[0]
             np.random.seed(2020)
             weight = np.random.rand(1000, 16)
             need_result = np.matmul(input1, weight)
             self.assertTrue(
-                np.allclose(
-                    result_data, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(result_data, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "column_parallel_linear":
             result_data = tr0_out[0]
             np.random.seed(2020)
             weight = np.random.rand(1000, 16)
             need_result = np.matmul(input1, weight)
             self.assertTrue(
-                np.allclose(
-                    result_data, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(result_data, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "alltoall":
             need_result1 = np.vstack((input1[0:input1.shape[0] // 2, :],
                                       input2[0:input2.shape[0] // 2, :]))
@@ -289,16 +287,13 @@ def check_with_place(self,
             tr0_out = np.vstack(tr0_out)
             tr1_out = np.vstack(tr1_out)
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result1, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result1, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result2, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result2, rtol=1e-05, atol=1e-05))
         elif col_type == "sendrecv":
             result_data = tr1_out[0]
             self.assertTrue(
-                np.allclose(
-                    input1, result_data, rtol=1e-05, atol=1e-05))
+                np.allclose(input1, result_data, rtol=1e-05, atol=1e-05))
         elif col_type == "global_gather":
             in_feat = 2
             n_expert = 2
@@ -375,15 +370,13 @@ def check_with_place(self,
             if result1 == []:
                 output1 = np.array([])
             else:
-                output1 = np.concatenate(
-                    result1, axis=0).reshape(
-                        sum(local_expert_count1), in_feat)
+                output1 = np.concatenate(result1, axis=0).reshape(
+                    sum(local_expert_count1), in_feat)
             if result2 == []:
                 output2 = np.array([])
             else:
-                output2 = np.concatenate(
-                    result2, axis=0).reshape(
-                        sum(local_expert_count2), in_feat)
+                output2 = np.concatenate(result2, axis=0).reshape(
+                    sum(local_expert_count2), in_feat)
 
             if tr0_out[0] is None or tr0_out[0].shape[0] == 0:
                 tr0_out[0] = np.array([])
@@ -392,24 +385,20 @@ def check_with_place(self,
                 tr1_out[0] = np.array([])
 
             self.assertTrue(
-                np.allclose(
-                    tr0_out[0], output1, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out[0], output1, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out[0], output2, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out[0], output2, rtol=1e-05, atol=1e-05))
             if static_mode == 0:
                 self.assertTrue(
-                    np.allclose(
-                        tr0_out[1],
-                        2 * local_input_buf1,
-                        rtol=1e-05,
-                        atol=1e-05))
+                    np.allclose(tr0_out[1],
+                                2 * local_input_buf1,
+                                rtol=1e-05,
+                                atol=1e-05))
                 self.assertTrue(
-                    np.allclose(
-                        tr1_out[1],
-                        2 * local_input_buf2,
-                        rtol=1e-05,
-                        atol=1e-05))
+                    np.allclose(tr1_out[1],
+                                2 * local_input_buf2,
+                                rtol=1e-05,
+                                atol=1e-05))
 
         elif col_type == "global_scatter":
             np.random.seed(pid0)
@@ -463,23 +452,19 @@ def check_with_place(self,
                 tr1_out[0] = np.array([])
 
             self.assertTrue(
-                np.allclose(
-                    tr0_out[0], output1, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out[0], output1, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out[0], output2, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out[0], output2, rtol=1e-05, atol=1e-05))
             if static_mode == 0:
                 self.assertTrue(
-                    np.allclose(
-                        tr0_out[1],
-                        2 * local_input_buf1,
-                        rtol=1e-05,
-                        atol=1e-05))
+                    np.allclose(tr0_out[1],
+                                2 * local_input_buf1,
+                                rtol=1e-05,
+                                atol=1e-05))
                 self.assertTrue(
-                    np.allclose(
-                        tr1_out[1],
-                        2 * local_input_buf2,
-                        rtol=1e-05,
-                        atol=1e-05))
+                    np.allclose(tr1_out[1],
+                                2 * local_input_buf2,
+                                rtol=1e-05,
+                                atol=1e-05))
         else:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
index d0a67baa61e69..873ae77f08ec1 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveBarrierAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py
index 1b55395ede5f6..55a009b369198 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_base.py
@@ -30,6 +30,7 @@
 
 
 class TestCollectiveRunnerBase(object):
+
     def get_model(self, train_prog, startup_prog):
         raise NotImplementedError(
             "get model should be implemented by child class.")
@@ -40,9 +41,8 @@ def wait_server_ready(self, endpoints):
             not_ready_endpoints = []
             for ep in endpoints:
                 ip_port = ep.split(":")
-                with closing(
-                        socket.socket(socket.AF_INET,
-                                      socket.SOCK_STREAM)) as sock:
+                with closing(socket.socket(socket.AF_INET,
+                                           socket.SOCK_STREAM)) as sock:
                     sock.settimeout(2)
                     sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                     if hasattr(socket, 'SO_REUSEPORT'):
@@ -55,13 +55,14 @@ def wait_server_ready(self, endpoints):
                         not_ready_endpoints.append(ep)
             if not all_ok:
                 sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-                sys.stderr.write("not ready endpoints:" + str(
-                    not_ready_endpoints) + "\n")
+                sys.stderr.write("not ready endpoints:" +
+                                 str(not_ready_endpoints) + "\n")
                 sys.stderr.flush()
                 time.sleep(3)
             else:
                 break
 
+
 #endpoints should be ["ip1:port1","ip2:port2"]
 
     def initCommunicator(self, program, rank, nranks, wait_port,
@@ -71,30 +72,27 @@ def initCommunicator(self, program, rank, nranks, wait_port,
         if rank == 0 and wait_port:
             self.wait_server_ready(other_endpoints)
         block = program.global_block()
-        nccl_id_var = block.create_var(
-            name=nameGen.generate('nccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
+        nccl_id_var = block.create_var(name=nameGen.generate('nccl_id'),
+                                       persistable=True,
+                                       type=core.VarDesc.VarType.RAW)
 
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints
-            })
+        block.append_op(type='c_gen_nccl_id',
+                        inputs={},
+                        outputs={'Out': nccl_id_var},
+                        attrs={
+                            'rank': rank,
+                            'endpoint': current_endpoint,
+                            'other_endpoints': other_endpoints
+                        })
 
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': self.global_ring_id
-            })
+        block.append_op(type='c_comm_init',
+                        inputs={'X': nccl_id_var},
+                        outputs={},
+                        attrs={
+                            'nranks': nranks,
+                            'rank': rank,
+                            'ring_id': self.global_ring_id
+                        })
 
     def run_trainer(self, args):
         train_prog = fluid.Program()
@@ -138,6 +136,7 @@ def runtime_main(test_class, col_type, sub_type):
 
 
 class TestDistBase(unittest.TestCase):
+
     def setUp(self):
         self._port_set = set()
         self._trainers = 2
@@ -146,6 +145,7 @@ def setUp(self):
         self._python_interp = sys.executable
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -185,18 +185,16 @@ def _run_cluster(self, model_file, envs):
         tr1_cmd = tr_cmd % (self._python_interp, model_file)
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
-        #print(tr0_cmd) 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
+        #print(tr0_cmd)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=env0)
 
-        tr1_proc = subprocess.Popen(
-            tr0_cmd.strip().split(),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
+        tr1_proc = subprocess.Popen(tr0_cmd.strip().split(),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
@@ -227,8 +225,8 @@ def check_with_place(self,
         if check_error_log:
             required_envs["GLOG_v"] = "3"
             required_envs["GLOG_logtostderr"] = "1"
-        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file,
-                                                         required_envs)
+        tr0_out, tr1_out, pid0, pid1 = self._run_cluster(
+            model_file, required_envs)
         np.random.seed(pid0)
         input1 = np.random.random((10, 1000))
         np.random.seed(pid1)
@@ -253,26 +251,21 @@ def check_with_place(self,
         elif col_type == "allreduce":
             need_result = input1 + input2
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "reduce_scatter":
             tmp = input1 + input2
             need_result1 = tmp[0:tmp.shape[0] // 2]
             need_result2 = tmp[tmp.shape[0] // 2:]
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result1, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result1, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result2, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result2, rtol=1e-05, atol=1e-05))
         elif col_type == "sendrecv":
             need_result = input1
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "identity":
             need_result1 = input1
             need_result2 = input2
@@ -291,28 +284,24 @@ def check_with_place(self,
         elif col_type == "concat":
             need_result = np.concatenate((input1, input2), axis=1)
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "split":
             need_result1 = np.split(input1, 2, axis=1)[0]
             need_result2 = np.split(input2, 2, axis=1)[1]
             self.assertTrue(
-                np.allclose(
-                    tr0_out, need_result1, rtol=1e-05, atol=1e-05))
+                np.allclose(tr0_out, need_result1, rtol=1e-05, atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out, need_result2, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out, need_result2, rtol=1e-05, atol=1e-05))
         elif col_type == "sendrecv_array":
             need_result1 = np.array([[0, 1, 2]])
             need_result2 = np.array([[3, 4, 5]])
             self.assertTrue(
-                np.allclose(
-                    tr1_out[0][0], need_result1, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out[0][0], need_result1, rtol=1e-05,
+                            atol=1e-05))
             self.assertTrue(
-                np.allclose(
-                    tr1_out[0][1], need_result2, rtol=1e-05, atol=1e-05))
+                np.allclose(tr1_out[0][1], need_result2, rtol=1e-05,
+                            atol=1e-05))
         else:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
index 702e04311570e..289cb7152ac36 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveBroadcastAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_cpu_barrier_with_gloo.py b/python/paddle/fluid/tests/unittests/test_collective_cpu_barrier_with_gloo.py
index 438e360f60e22..bf503d804ca23 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_cpu_barrier_with_gloo.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_cpu_barrier_with_gloo.py
@@ -28,7 +28,9 @@
 
 
 class CollectiveCPUBarrierWithGlooTest(unittest.TestCase):
+
     def find_free_port(self):
+
         def _free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -97,9 +99,9 @@ def test_barrier_func_with_multiprocess(self):
         procs_out_dict = manager.dict()
         jobs = []
         for id in range(num_of_ranks):
-            p = multiprocessing.Process(
-                target=self.barrier_func,
-                args=(id, num_of_ranks, ep_str, procs_out_dict, sleep_time))
+            p = multiprocessing.Process(target=self.barrier_func,
+                                        args=(id, num_of_ranks, ep_str,
+                                              procs_out_dict, sleep_time))
             jobs.append(p)
             p.start()
         for proc in jobs:
@@ -117,9 +119,9 @@ def test_barrier_op_with_multiprocess(self):
         procs_out_dict = manager.dict()
         jobs = []
         for id in range(num_of_ranks):
-            p = multiprocessing.Process(
-                target=self.barrier_op,
-                args=(id, num_of_ranks, ep_str, procs_out_dict, sleep_time))
+            p = multiprocessing.Process(target=self.barrier_op,
+                                        args=(id, num_of_ranks, ep_str,
+                                              procs_out_dict, sleep_time))
             jobs.append(p)
             p.start()
         for proc in jobs:
diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_gather.py b/python/paddle/fluid/tests/unittests/test_collective_global_gather.py
index 6809f3970f683..949c4562ec922 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_global_gather.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_global_gather.py
@@ -22,6 +22,7 @@
 
 
 class TestCollectiveGlobalGatherAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
@@ -31,20 +32,18 @@ def test_global_gather_nccl(self):
                               "nccl")
 
     def test_global_gather_nccl_dygraph(self):
-        self.check_with_place(
-            "collective_global_gather_dygraph.py",
-            "global_gather",
-            "nccl",
-            static_mode="0",
-            eager_mode=False)
+        self.check_with_place("collective_global_gather_dygraph.py",
+                              "global_gather",
+                              "nccl",
+                              static_mode="0",
+                              eager_mode=False)
 
     def test_global_gather_nccl_dygraph_eager(self):
-        self.check_with_place(
-            "collective_global_gather_dygraph.py",
-            "global_gather",
-            "nccl",
-            static_mode="0",
-            eager_mode=True)
+        self.check_with_place("collective_global_gather_dygraph.py",
+                              "global_gather",
+                              "nccl",
+                              static_mode="0",
+                              eager_mode=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py
index 1485bafa387f5..9bd112d906f84 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py
@@ -22,6 +22,7 @@
 
 
 class TestCollectiveSelectScatterAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
@@ -31,20 +32,18 @@ def test_global_scatter_nccl(self):
                               "nccl")
 
     def test_global_scatter_nccl_dygraph(self):
-        self.check_with_place(
-            "collective_global_scatter_dygraph.py",
-            "global_scatter",
-            "nccl",
-            static_mode="0",
-            eager_mode=False)
+        self.check_with_place("collective_global_scatter_dygraph.py",
+                              "global_scatter",
+                              "nccl",
+                              static_mode="0",
+                              eager_mode=False)
 
     def test_global_scatter_nccl_dygraph_eager(self):
-        self.check_with_place(
-            "collective_global_scatter_dygraph.py",
-            "global_scatter",
-            "nccl",
-            static_mode="0",
-            eager_mode=True)
+        self.check_with_place("collective_global_scatter_dygraph.py",
+                              "global_scatter",
+                              "nccl",
+                              static_mode="0",
+                              eager_mode=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_collective_optimizer.py b/python/paddle/fluid/tests/unittests/test_collective_optimizer.py
index c91586b4d50d6..182f2b5c32f3e 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_optimizer.py
@@ -32,6 +32,7 @@
 
 
 class CollectiveOptimizerTest(unittest.TestCase):
+
     def test_ds_as_None(self):
         optimizer = fluid.optimizer.AdamOptimizer()
         dist_optimizer = CollectiveOptimizer(optimizer, strategy=None)
diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/test_collective_process_group.py
index e00f90f4b0d5f..5355c58753e6f 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_process_group.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_process_group.py
@@ -19,6 +19,7 @@
 
 
 class TestProcessGroup(TestMultipleGpus):
+
     def test_process_group_nccl(self):
         self.run_mnist_2gpu('process_group_nccl.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce.py b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
index c062746742810..306fb7beb8ac4 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_reduce.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
@@ -23,6 +23,7 @@
 
 
 class TestCReduceOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
index 721f446c9f094..2da70f5a94dfd 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveReduceAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
index ea34d1cab5a5a..aa6676cb94127 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_scatter.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
@@ -23,6 +23,7 @@
 
 
 class TestCScatterOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
index 3a37da52b8e92..18c720c562814 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveScatterAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py b/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py
index d3bcd0a7e6985..4df303d1b3c60 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py
@@ -23,6 +23,7 @@
 
 
 class TestSendRecvOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/test_collective_sendrecv_api.py
index f1d5ec1300e0e..c0a14f7e2860c 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_sendrecv_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_sendrecv_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveSendRecvAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
@@ -33,11 +34,10 @@ def _setup_config(self):
 
     def test_sendrecv_nccl_dygraph(self):
         if paddle.fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "collective_sendrecv_api_dygraph.py",
-                "sendrecv",
-                "nccl",
-                static_mode='0')
+            self.check_with_place("collective_sendrecv_api_dygraph.py",
+                                  "sendrecv",
+                                  "nccl",
+                                  static_mode='0')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_col_linear.py b/python/paddle/fluid/tests/unittests/test_collective_split_col_linear.py
index a88d3f119911d..632c38cc1ce91 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_split_col_linear.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_split_col_linear.py
@@ -23,6 +23,7 @@
 
 
 class TestColParallelLinearAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_embedding.py b/python/paddle/fluid/tests/unittests/test_collective_split_embedding.py
index f13ef81f036f3..58424984f7aef 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_split_embedding.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_split_embedding.py
@@ -23,6 +23,7 @@
 
 
 class TestParallelEmbeddingAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py b/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py
index 955adf08c4824..af10878ddaca2 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py
@@ -22,12 +22,15 @@
 
 
 class TestCollectiveSplitAssert(unittest.TestCase):
+
     def network(self):
         fleet.init()
-        data = paddle.static.data(
-            name='tindata', shape=[10, 1000], dtype="float32")
-        emb_out = paddle.distributed.split(
-            data, (7, 8), operation="embedding", num_partitions=2)
+        data = paddle.static.data(name='tindata',
+                                  shape=[10, 1000],
+                                  dtype="float32")
+        emb_out = paddle.distributed.split(data, (7, 8),
+                                           operation="embedding",
+                                           num_partitions=2)
 
     def test_assert(self):
         with self.assertRaises(AssertionError):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_row_linear.py b/python/paddle/fluid/tests/unittests/test_collective_split_row_linear.py
index 08aedb1feac16..b8240e8d991fe 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_split_row_linear.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_split_row_linear.py
@@ -23,6 +23,7 @@
 
 
 class TestRowParallelLinearAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_collective_wait.py b/python/paddle/fluid/tests/unittests/test_collective_wait.py
index b34ace80723d7..c15a2d56d248c 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_wait.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_wait.py
@@ -23,14 +23,14 @@
 
 
 class TestCWaitOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
     def test_allreduce_wait(self):
-        self.check_with_place(
-            "collective_allreduce_op_wait.py",
-            "allreduce",
-            check_error_log=True)
+        self.check_with_place("collective_allreduce_op_wait.py",
+                              "allreduce",
+                              check_error_log=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_async.py b/python/paddle/fluid/tests/unittests/test_communicator_async.py
index 5e67fe3e446f4..f6fd89dc37dae 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py
@@ -21,6 +21,7 @@
 import numpy
 
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -29,6 +30,7 @@
 
 
 class TestCommunicator(unittest.TestCase):
+
     def net(self):
         x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index d9c6406422277..c3f2566d6f7f4 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -34,6 +34,7 @@
 
 
 class TestCommunicatorGeoEnd2End(unittest.TestCase):
+
     def net(self):
         x = fluid.layers.data(name='x', shape=[13], dtype='float32')
         x1 = fluid.layers.data(name='x1', shape=[1], dtype='int64', lod_level=1)
@@ -56,6 +57,7 @@ def net(self):
         return avg_cost, x, x1, y
 
     def fake_reader(self):
+
         def reader():
             for i in range(10000):
                 x = numpy.random.random((1, 13)).astype('float32')
@@ -168,10 +170,9 @@ def runTest(self):
 
         ps_cmd = "{} {}".format(_python, server_file)
 
-        ps_proc = subprocess.Popen(
-            ps_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        ps_proc = subprocess.Popen(ps_cmd.strip().split(" "),
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
 
         time.sleep(5)
 
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
index 5a126bfa66acd..c4a7edc21f92b 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
@@ -31,6 +31,7 @@
 
 
 class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
+
     def net(self):
         x = fluid.layers.data(name='x', shape=[13], dtype='float32')
         y_predict = fluid.layers.fc(input=x, size=1, act=None)
@@ -41,6 +42,7 @@ def net(self):
         return avg_cost, x, y
 
     def fake_reader(self):
+
         def reader():
             for i in range(10000):
                 x = numpy.random.random((1, 13)).astype('float32')
@@ -140,10 +142,9 @@ def runTest(self):
         _python = sys.executable
 
         ps_cmd = "{} {}".format(_python, server_file)
-        ps_proc = subprocess.Popen(
-            ps_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        ps_proc = subprocess.Popen(ps_cmd.strip().split(" "),
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
 
         os.environ["http_proxy"] = ""
         os.environ["https_proxy"] = ""
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
index 7b0c28e64bccf..f32cc4f5c9327 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
@@ -21,6 +21,7 @@
 import numpy
 
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -29,6 +30,7 @@
 
 
 class TestCommunicator(unittest.TestCase):
+
     def test_communicator_ps_gpu(self):
         with open("test_communicator_ps_gpu.txt", "w") as f:
             data = "1 0.6 1 0.7\n"
@@ -70,8 +72,10 @@ def test_communicator_ps_gpu(self):
         optimizer.minimize(avg_cost)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=1,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset.set_filelist(["test_communicator_ps_gpu.txt"])
         dataset.set_date("20211111")
         dataset.load_into_memory(is_shuffle=True)
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
index 8f52414f8cb29..f13cfd885765a 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
@@ -19,6 +19,7 @@
 
 import os
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -28,6 +29,7 @@
 
 
 class TestCommunicator(unittest.TestCase):
+
     def net(self):
         x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index bd9ec6b663f60..06432e4b00720 100755
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -25,7 +25,9 @@
 
 
 def create_test_class(op_type, typename, callback):
+
     class Cls(op_test.OpTest):
+
         def setUp(self):
             a = numpy.random.random(size=(10, 7)).astype(typename)
             b = numpy.random.random(size=(10, 7)).astype(typename)
@@ -45,12 +47,11 @@ def test_errors(self):
                 y = fluid.layers.data(name='y', shape=[2], dtype='int32')
                 a = fluid.layers.data(name='a', shape=[2], dtype='int16')
                 if self.op_type == "less_than":
-                    self.assertRaises(
-                        TypeError,
-                        fluid.layers.less_than,
-                        x=x,
-                        y=y,
-                        force_cpu=1)
+                    self.assertRaises(TypeError,
+                                      fluid.layers.less_than,
+                                      x=x,
+                                      y=y,
+                                      force_cpu=1)
                 op = eval("fluid.layers.%s" % self.op_type)
                 self.assertRaises(TypeError, op, x=x, y=y, cond=1)
                 self.assertRaises(TypeError, op, x=x, y=a)
@@ -74,7 +75,9 @@ def test_errors(self):
 
 
 def create_paddle_case(op_type, callback):
+
     class PaddleCls(unittest.TestCase):
+
         def setUp(self):
             self.op_type = op_type
             self.input_x = np.array([1, 2, 3, 4]).astype(np.int64)
@@ -92,8 +95,10 @@ def test_api(self):
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = fluid.Executor(self.place)
-                res, = exe.run(feed={"x": self.input_x,
-                                     "y": self.input_y},
+                res, = exe.run(feed={
+                    "x": self.input_x,
+                    "y": self.input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == self.real_result).all(), True)
 
@@ -106,8 +111,10 @@ def test_api_float(self):
                     op = eval("paddle.%s" % (self.op_type))
                     out = op(x, y)
                     exe = fluid.Executor(self.place)
-                    res, = exe.run(feed={"x": self.input_x,
-                                         "y": 1.0},
+                    res, = exe.run(feed={
+                        "x": self.input_x,
+                        "y": 1.0
+                    },
                                    fetch_list=[out])
                 self.real_result = np.array([1, 0, 0, 0]).astype(np.int64)
                 self.assertEqual((res == self.real_result).all(), True)
@@ -144,10 +151,10 @@ def test_dynamic_api_float(self):
         def test_not_equal(self):
             if self.op_type == "not_equal":
                 paddle.disable_static()
-                x = paddle.to_tensor(
-                    np.array([1.2e-8, 2, 2, 1]), dtype="float32")
-                y = paddle.to_tensor(
-                    np.array([1.1e-8, 2, 2, 1]), dtype="float32")
+                x = paddle.to_tensor(np.array([1.2e-8, 2, 2, 1]),
+                                     dtype="float32")
+                y = paddle.to_tensor(np.array([1.1e-8, 2, 2, 1]),
+                                     dtype="float32")
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 self.real_result = np.array([0, 0, 0, 0]).astype(np.int64)
@@ -155,6 +162,7 @@ def test_not_equal(self):
                 paddle.enable_static()
 
         def test_assert(self):
+
             def test_dynamic_api_string(self):
                 if self.op_type == "equal":
                     paddle.disable_static()
@@ -178,8 +186,9 @@ def test_dynamic_api_bool(self):
         def test_broadcast_api_1(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name='x', shape=[1, 2, 1, 3], dtype='int32')
+                x = paddle.static.data(name='x',
+                                       shape=[1, 2, 1, 3],
+                                       dtype='int32')
                 y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32')
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
@@ -187,8 +196,10 @@ def test_broadcast_api_1(self):
                 input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
                 input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -196,16 +207,19 @@ def test_broadcast_api_2(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
                 x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32')
-                y = paddle.static.data(
-                    name='y', shape=[1, 2, 1, 3], dtype='int32')
+                y = paddle.static.data(name='y',
+                                       shape=[1, 2, 1, 3],
+                                       dtype='int32')
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = paddle.static.Executor(self.place)
                 input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
                 input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -220,8 +234,10 @@ def test_broadcast_api_3(self):
                 input_x = np.arange(0, 5).reshape((5)).astype(np.int32)
                 input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -236,8 +252,10 @@ def test_bool_api_4(self):
                 input_x = np.array([True, False, True]).astype(np.bool)
                 input_y = np.array([True, True, False]).astype(np.bool)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -252,8 +270,10 @@ def test_bool_broadcast_api_4(self):
                 input_x = np.array([True, False, True]).astype(np.bool)
                 input_y = np.array([True]).astype(np.bool)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -280,17 +300,19 @@ def test_attr_name(self):
 
 
 class TestCompareOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input x and y of compare_op must be Variable.
             x = fluid.layers.data(name='x', shape=[1], dtype="float32")
-            y = fluid.create_lod_tensor(
-                numpy.array([[-1]]), [[1]], fluid.CPUPlace())
+            y = fluid.create_lod_tensor(numpy.array([[-1]]), [[1]],
+                                        fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.greater_equal, x, y)
 
 
 class API_TestElementwise_Equal(unittest.TestCase):
+
     def test_api(self):
         paddle.enable_static()
         with fluid.program_guard(fluid.Program(), fluid.Program()):
@@ -313,6 +335,7 @@ def test_api(self):
 
 
 class TestCompareOpPlace(unittest.TestCase):
+
     def test_place_1(self):
         paddle.enable_static()
         place = paddle.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py
index 29e3436948e98..5ee1ac07e8ae4 100644
--- a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py
@@ -23,7 +23,9 @@
 
 
 def create_test_not_equal_class(op_type, typename, callback):
+
     class Cls(op_test.OpTest):
+
         def setUp(self):
             x = np.random.random(size=(10, 7)).astype(typename)
             y = np.random.random(size=(10, 7)).astype(typename)
@@ -42,7 +44,9 @@ def test_output(self):
 
 
 def create_test_not_shape_equal_class(op_type, typename, callback):
+
     class Cls(op_test.OpTest):
+
         def setUp(self):
             x = np.random.random(size=(10, 7)).astype(typename)
             y = np.random.random(size=(10)).astype(typename)
@@ -61,7 +65,9 @@ def test_output(self):
 
 
 def create_test_equal_class(op_type, typename, callback):
+
     class Cls(op_test.OpTest):
+
         def setUp(self):
             x = y = np.random.random(size=(10, 7)).astype(typename)
             z = callback(x, y)
@@ -79,7 +85,9 @@ def test_output(self):
 
 
 def create_test_dim1_class(op_type, typename, callback):
+
     class Cls(op_test.OpTest):
+
         def setUp(self):
             x = y = np.random.random(size=(1)).astype(typename)
             x = np.array([True, False, True]).astype(typename)
@@ -107,6 +115,7 @@ def test_output(self):
 
 
 class TestEqualReduceAPI(unittest.TestCase):
+
     def test_name(self):
         x = fluid.layers.assign(np.array([3, 4], dtype="int32"))
         y = fluid.layers.assign(np.array([3, 4], dtype="int32"))
diff --git a/python/paddle/fluid/tests/unittests/test_compat.py b/python/paddle/fluid/tests/unittests/test_compat.py
index 7f26582889de6..59dbb81889855 100644
--- a/python/paddle/fluid/tests/unittests/test_compat.py
+++ b/python/paddle/fluid/tests/unittests/test_compat.py
@@ -19,6 +19,7 @@
 
 
 class TestCompatible(unittest.TestCase):
+
     def test_type(self):
         self.assertEqual(cpt.int_type, int)
         self.assertEqual(cpt.long_type, int)
diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py b/python/paddle/fluid/tests/unittests/test_compiled_program.py
index 79ee383f3f9ef..e16ac4881c761 100644
--- a/python/paddle/fluid/tests/unittests/test_compiled_program.py
+++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py
@@ -24,24 +24,29 @@
 
 
 class TestCompiledProgram(unittest.TestCase):
+
     def setUp(self):
         self.seed = 100
         self.img = np.random.random(size=(16, 784)).astype('float32')
-        self.label = np.random.randint(
-            low=0, high=10, size=[16, 1], dtype=np.int64)
+        self.label = np.random.randint(low=0,
+                                       high=10,
+                                       size=[16, 1],
+                                       dtype=np.int64)
         with new_program_scope():
             paddle.seed(self.seed)
             paddle.framework.random._manual_program_seed(self.seed)
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             loss = simple_fc_net()
             exe.run(fluid.default_startup_program())
 
             loss_data, = exe.run(fluid.default_main_program(),
-                                 feed={"image": self.img,
-                                       "label": self.label},
+                                 feed={
+                                     "image": self.img,
+                                     "label": self.label
+                                 },
                                  fetch_list=[loss.name])
             self.loss = loss_data[0]
 
@@ -49,8 +54,8 @@ def test_compiled_program_base(self):
         with new_program_scope():
             paddle.seed(self.seed)
             paddle.framework.random._manual_program_seed(self.seed)
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             loss = simple_fc_net()
@@ -58,8 +63,10 @@ def test_compiled_program_base(self):
             compiled_prog = fluid.CompiledProgram(fluid.default_main_program())
 
             loss_data, = exe.run(compiled_prog,
-                                 feed={"image": self.img,
-                                       "label": self.label},
+                                 feed={
+                                     "image": self.img,
+                                     "label": self.label
+                                 },
                                  fetch_list=[loss.name])
             self.assertTrue(np.array_equal(loss_data[0], self.loss))
 
@@ -67,30 +74,34 @@ def test_compiled_program_with_data_parallel(self):
         with new_program_scope():
             paddle.seed(self.seed)
             paddle.framework.random._manual_program_seed(self.seed)
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             loss = simple_fc_net()
             exe.run(fluid.default_startup_program())
-            compiled_prog = fluid.CompiledProgram(fluid.default_main_program(
-            )).with_data_parallel(
-                loss_name=loss.name, places=[place])
+            compiled_prog = fluid.CompiledProgram(
+                fluid.default_main_program()).with_data_parallel(
+                    loss_name=loss.name, places=[place])
 
             loss_data, = exe.run(compiled_prog,
-                                 feed={"image": self.img,
-                                       "label": self.label},
+                                 feed={
+                                     "image": self.img,
+                                     "label": self.label
+                                 },
                                  fetch_list=[loss.name])
             self.assertTrue(np.array_equal(loss_data[0], self.loss))
 
 
 class TestCompiledProgramError(unittest.TestCase):
+
     def test_program_or_graph_error(self):
         self.assertRaises(TypeError, fluid.CompiledProgram, "program")
 
     def build_simple_model(self):
-        img = fluid.layers.data(
-            name='image', shape=[1, 28, 28], dtype='float32')
+        img = fluid.layers.data(name='image',
+                                shape=[1, 28, 28],
+                                dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
         prediction = fluid.layers.fc(input=img, size=10, act='softmax')
         loss = fluid.layers.cross_entropy(input=prediction, label=label)
diff --git a/python/paddle/fluid/tests/unittests/test_complex_abs.py b/python/paddle/fluid/tests/unittests/test_complex_abs.py
index a29d9baadead0..6c90e09d7cade 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_abs.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_abs.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class TestComplexAbsOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.python_api = paddle.abs
@@ -49,15 +50,15 @@ def test_check_output(self):
         self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
 
 class TestComplexAbsOpZeroValues(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.op_type = "abs"
@@ -71,8 +72,8 @@ def setUp(self):
         self.outputs = {'Out': self.out}
 
     def init_input_output(self):
-        self.x = np.zeros(self.shape).astype(self.dtype) + 1J * np.zeros(
-            self.shape).astype(self.dtype)
+        self.x = np.zeros(self.shape).astype(
+            self.dtype) + 1J * np.zeros(self.shape).astype(self.dtype)
         self.out = np.abs(self.x)
 
     def init_grad_input_output(self):
@@ -83,15 +84,15 @@ def test_check_output(self):
         self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
 
 class TestAbs(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
@@ -112,6 +113,7 @@ def test_eager(self):
 
 
 class TestRealAbsOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.python_api = paddle.abs
@@ -136,12 +138,11 @@ def test_check_output(self):
         self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_complex_cast.py b/python/paddle/fluid/tests/unittests/test_complex_cast.py
index 5da49ca62d90c..21db0a78e7296 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_cast.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_cast.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestComplexCastOp(unittest.TestCase):
+
     def test_complex_to_real(self):
         r = np.random.random(size=[10, 10]) * 10
         i = np.random.random(size=[10, 10])
diff --git a/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py b/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py
index d187d6d710bec..c110339bf586c 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py
@@ -30,6 +30,7 @@
 
 
 class TestComplexElementwiseLayers(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
@@ -51,14 +52,14 @@ def assert_check(self, pd_result, np_result, place):
 
     def compare_by_basic_api(self, x, y):
         for place in self._places:
-            self.assert_check(
-                self.paddle_calc(x, y, "add", place), x + y, place)
-            self.assert_check(
-                self.paddle_calc(x, y, "sub", place), x - y, place)
-            self.assert_check(
-                self.paddle_calc(x, y, "mul", place), x * y, place)
-            self.assert_check(
-                self.paddle_calc(x, y, "div", place), x / y, place)
+            self.assert_check(self.paddle_calc(x, y, "add", place), x + y,
+                              place)
+            self.assert_check(self.paddle_calc(x, y, "sub", place), x - y,
+                              place)
+            self.assert_check(self.paddle_calc(x, y, "mul", place), x * y,
+                              place)
+            self.assert_check(self.paddle_calc(x, y, "div", place), x / y,
+                              place)
 
     def compare_op_by_basic_api(self, x, y):
         for place in self._places:
@@ -72,18 +73,18 @@ def compare_op_by_basic_api(self, x, y):
 
     def test_complex_xy(self):
         for dtype in self._dtypes:
-            x = rand([2, 3, 4, 5]).astype(dtype) + 1j * rand(
-                [2, 3, 4, 5]).astype(dtype)
-            y = rand([2, 3, 4, 5]).astype(dtype) + 1j * rand(
-                [2, 3, 4, 5]).astype(dtype)
+            x = rand([2, 3, 4, 5
+                      ]).astype(dtype) + 1j * rand([2, 3, 4, 5]).astype(dtype)
+            y = rand([2, 3, 4, 5
+                      ]).astype(dtype) + 1j * rand([2, 3, 4, 5]).astype(dtype)
 
             self.compare_by_basic_api(x, y)
             self.compare_op_by_basic_api(x, y)
 
     def test_complex_x_real_y(self):
         for dtype in self._dtypes:
-            x = rand([2, 3, 4, 5]).astype(dtype) + 1j * rand(
-                [2, 3, 4, 5]).astype(dtype)
+            x = rand([2, 3, 4, 5
+                      ]).astype(dtype) + 1j * rand([2, 3, 4, 5]).astype(dtype)
             y = rand([4, 5]).astype(dtype)
 
             # promote types cases
diff --git a/python/paddle/fluid/tests/unittests/test_complex_getitem.py b/python/paddle/fluid/tests/unittests/test_complex_getitem.py
index 5c181515f45ba..e399dea1ed9cb 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_getitem.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_getitem.py
@@ -21,6 +21,7 @@
 
 
 class TestComplexGetitemLayer(unittest.TestCase):
+
     def setUp(self):
         self._places = [fluid.CPUPlace()]
         if fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_complex_grad_accumulated.py b/python/paddle/fluid/tests/unittests/test_complex_grad_accumulated.py
index ab40d7c006229..63f98efcfa61a 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_grad_accumulated.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_grad_accumulated.py
@@ -24,24 +24,27 @@
 
 
 class Optimization_ex1(paddle.nn.Layer):
+
     def __init__(self,
                  shape,
                  dtype,
-                 param_attr=paddle.nn.initializer.Uniform(
-                     low=-5., high=5.)):
+                 param_attr=paddle.nn.initializer.Uniform(low=-5., high=5.)):
         super(Optimization_ex1, self).__init__()
 
-        self.theta0 = self.create_parameter(
-            shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
-        self.theta1 = self.create_parameter(
-            shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
+        self.theta0 = self.create_parameter(shape=shape,
+                                            attr=param_attr,
+                                            dtype=dtype,
+                                            is_bias=False)
+        self.theta1 = self.create_parameter(shape=shape,
+                                            attr=param_attr,
+                                            dtype=dtype,
+                                            is_bias=False)
         self.A = paddle.to_tensor(
-            np.random.random((4, 4)).astype(dtype) + np.random.random((4, 4))
-            .astype(dtype) * 1j)
-        self.B = paddle.to_tensor(
-            np.random.random((4, 4)).astype(dtype) + np.random.random(
-                (4, 4)).astype(dtype) * 1j,
-            stop_gradient=False)
+            np.random.random((4, 4)).astype(dtype) +
+            np.random.random((4, 4)).astype(dtype) * 1j)
+        self.B = paddle.to_tensor(np.random.random(
+            (4, 4)).astype(dtype) + np.random.random((4, 4)).astype(dtype) * 1j,
+                                  stop_gradient=False)
 
     def forward(self, mode=1):
         jj = paddle.to_tensor(np.array([1j]).astype(np.complex64))
@@ -58,14 +61,15 @@ def forward(self, mode=1):
             return loss.real()
         elif mode == 3:
             # run without param
-            loss = paddle.sum(self.A + self.B) * (
-                paddle.sum(self.A + self.B).conj())
+            loss = paddle.sum(self.A + self.B) * (paddle.sum(self.A +
+                                                             self.B).conj())
             return loss.real()
         else:
             raise NotImplementedError
 
 
 class TestComplexGradAccumulated(unittest.TestCase):
+
     def setUp(self):
         self.devices = ['cpu']
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_complex_kron.py b/python/paddle/fluid/tests/unittests/test_complex_kron.py
index 24109357a4660..4f15256a8c596 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_kron.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_kron.py
@@ -21,6 +21,7 @@
 
 
 class ComplexKronTestCase(unittest.TestCase):
+
     def __init__(self, methodName='runTest', x=None, y=None):
         super(ComplexKronTestCase, self).__init__(methodName)
         self.x = x
@@ -53,27 +54,23 @@ def load_tests(loader, standard_tests, pattern):
     suite = unittest.TestSuite()
     for dtype in ["float32", "float64"]:
         suite.addTest(
-            ComplexKronTestCase(
-                x=np.random.randn(2, 2).astype(dtype) + 1j * np.random.randn(
-                    2, 2).astype(dtype),
-                y=np.random.randn(3, 3).astype(dtype) + 1j * np.random.randn(
-                    3, 3).astype(dtype)))
+            ComplexKronTestCase(x=np.random.randn(2, 2).astype(dtype) +
+                                1j * np.random.randn(2, 2).astype(dtype),
+                                y=np.random.randn(3, 3).astype(dtype) +
+                                1j * np.random.randn(3, 3).astype(dtype)))
         suite.addTest(
-            ComplexKronTestCase(
-                x=np.random.randn(2, 2).astype(dtype),
-                y=np.random.randn(3, 3).astype(dtype) + 1j * np.random.randn(
-                    3, 3).astype(dtype)))
+            ComplexKronTestCase(x=np.random.randn(2, 2).astype(dtype),
+                                y=np.random.randn(3, 3).astype(dtype) +
+                                1j * np.random.randn(3, 3).astype(dtype)))
         suite.addTest(
-            ComplexKronTestCase(
-                x=np.random.randn(2, 2).astype(dtype) + 1j * np.random.randn(
-                    2, 2).astype(dtype),
-                y=np.random.randn(3, 3).astype(dtype)))
+            ComplexKronTestCase(x=np.random.randn(2, 2).astype(dtype) +
+                                1j * np.random.randn(2, 2).astype(dtype),
+                                y=np.random.randn(3, 3).astype(dtype)))
 
         suite.addTest(
-            ComplexKronTestCase(
-                x=np.random.randn(2, 2).astype(dtype) + 1j * np.random.randn(
-                    2, 2).astype(dtype),
-                y=np.random.randn(2, 2, 3).astype(dtype)))
+            ComplexKronTestCase(x=np.random.randn(2, 2).astype(dtype) +
+                                1j * np.random.randn(2, 2).astype(dtype),
+                                y=np.random.randn(2, 2, 3).astype(dtype)))
 
     return suite
 
diff --git a/python/paddle/fluid/tests/unittests/test_complex_matmul.py b/python/paddle/fluid/tests/unittests/test_complex_matmul.py
index dac4e36ea673b..9be7933e9264f 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_matmul.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_matmul.py
@@ -21,6 +21,7 @@
 
 
 class TestComplexMatMulLayer(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [fluid.CPUPlace()]
@@ -36,9 +37,9 @@ def compare_by_basic_api(self, x, y, np_result):
                 pd_result = result.numpy()
                 self.assertTrue(
                     np.allclose(pd_result, np_result),
-                    "\nplace: {}\npaddle diff result:\n {}\nnumpy diff result:\n {}\n".
-                    format(place, pd_result[~np.isclose(pd_result, np_result)],
-                           np_result[~np.isclose(pd_result, np_result)]))
+                    "\nplace: {}\npaddle diff result:\n {}\nnumpy diff result:\n {}\n"
+                    .format(place, pd_result[~np.isclose(pd_result, np_result)],
+                            np_result[~np.isclose(pd_result, np_result)]))
 
     def compare_op_by_basic_api(self, x, y, np_result):
         for place in self._places:
@@ -49,9 +50,9 @@ def compare_op_by_basic_api(self, x, y, np_result):
                 pd_result = result.numpy()
                 self.assertTrue(
                     np.allclose(pd_result, np_result),
-                    "\nplace: {}\npaddle diff result:\n {}\nnumpy diff result:\n {}\n".
-                    format(place, pd_result[~np.isclose(pd_result, np_result)],
-                           np_result[~np.isclose(pd_result, np_result)]))
+                    "\nplace: {}\npaddle diff result:\n {}\nnumpy diff result:\n {}\n"
+                    .format(place, pd_result[~np.isclose(pd_result, np_result)],
+                            np_result[~np.isclose(pd_result, np_result)]))
 
     def test_complex_xy(self):
         for dtype in self._dtypes:
diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py
index bd759f7a00f27..1faef17a2ade3 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -50,6 +50,7 @@ def ref_complex_grad(x, y, dout):
 
 
 class TestComplexOp(OpTest):
+
     def init_spec(self):
         self.x_shape = [10, 10]
         self.y_shape = [10, 10]
@@ -73,12 +74,11 @@ def test_check_grad(self):
         dout = self.out_grad
         dx, dy = ref_complex_grad(self.inputs['X'], self.inputs['Y'],
                                   self.out_grad)
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[dx, dy],
-            user_defined_grad_outputs=[dout],
-            check_eager=True)
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[dx, dy],
+                        user_defined_grad_outputs=[dout],
+                        check_eager=True)
 
     def test_check_grad_ignore_x(self):
         dout = self.out_grad
@@ -86,28 +86,27 @@ def test_check_grad_ignore_x(self):
                                   self.out_grad)
         self.assertTupleEqual(dx.shape, tuple(self.x_shape))
         self.assertTupleEqual(dy.shape, tuple(self.y_shape))
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set('X'),
-            user_defined_grads=[dy],
-            user_defined_grad_outputs=[dout],
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set('X'),
+                        user_defined_grads=[dy],
+                        user_defined_grad_outputs=[dout],
+                        check_eager=True)
 
     def test_check_grad_ignore_y(self):
         dout = self.out_grad
         dx, dy = ref_complex_grad(self.inputs['X'], self.inputs['Y'],
                                   self.out_grad)
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[dx],
-            user_defined_grad_outputs=[dout],
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[dx],
+                        user_defined_grad_outputs=[dout],
+                        check_eager=True)
 
 
 class TestComplexOpBroadcast1(TestComplexOp):
+
     def init_spec(self):
         self.x_shape = [10, 3, 1, 4]
         self.y_shape = [100, 1]
@@ -115,6 +114,7 @@ def init_spec(self):
 
 
 class TestComplexOpBroadcast2(TestComplexOp):
+
     def init_spec(self):
         self.x_shape = [100, 1]
         self.y_shape = [10, 3, 1, 4]
@@ -122,6 +122,7 @@ def init_spec(self):
 
 
 class TestComplexOpBroadcast3(TestComplexOp):
+
     def init_spec(self):
         self.x_shape = [1, 100]
         self.y_shape = [100]
@@ -129,6 +130,7 @@ def init_spec(self):
 
 
 class TestComplexAPI(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randn(10, 10)
         self.y = np.random.randn(10, 10)
@@ -151,8 +153,10 @@ def test_static(self):
         exe = static.Executor()
         exe.run(sp)
         [out_np] = exe.run(mp,
-                           feed={"x": self.x,
-                                 "y": self.y},
+                           feed={
+                               "x": self.x,
+                               "y": self.y
+                           },
                            fetch_list=[out])
         self.assertTrue(np.allclose(self.out, out_np))
 
diff --git a/python/paddle/fluid/tests/unittests/test_complex_reshape.py b/python/paddle/fluid/tests/unittests/test_complex_reshape.py
index dccfcf2e04576..c80970b33a735 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_reshape.py
@@ -21,6 +21,7 @@
 
 
 class TestComplexReshape(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
@@ -30,8 +31,8 @@ def setUp(self):
     def test_shape_norm_dims(self):
         for dtype in self._dtypes:
             x_np = np.random.randn(
-                2, 3, 4).astype(dtype) + 1j * np.random.randn(2, 3,
-                                                              4).astype(dtype)
+                2, 3,
+                4).astype(dtype) + 1j * np.random.randn(2, 3, 4).astype(dtype)
             shape = (2, -1)
             for place in self._places:
                 with dg.guard(place):
@@ -43,8 +44,8 @@ def test_shape_norm_dims(self):
     def test_shape_omit_dims(self):
         for dtype in self._dtypes:
             x_np = np.random.randn(
-                2, 3, 4).astype(dtype) + 1j * np.random.randn(2, 3,
-                                                              4).astype(dtype)
+                2, 3,
+                4).astype(dtype) + 1j * np.random.randn(2, 3, 4).astype(dtype)
             shape = (0, -1)
             shape_ = (2, 12)
             for place in self._places:
diff --git a/python/paddle/fluid/tests/unittests/test_complex_simplenet.py b/python/paddle/fluid/tests/unittests/test_complex_simplenet.py
index 4191a0487c7d1..21bc886837a9d 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_simplenet.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_simplenet.py
@@ -24,15 +24,17 @@
 
 
 class Optimization_ex1(paddle.nn.Layer):
+
     def __init__(self,
                  shape,
-                 param_attr=paddle.nn.initializer.Uniform(
-                     low=-5., high=5.),
+                 param_attr=paddle.nn.initializer.Uniform(low=-5., high=5.),
                  dtype='float32'):
         super(Optimization_ex1, self).__init__()
 
-        self.theta = self.create_parameter(
-            shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
+        self.theta = self.create_parameter(shape=shape,
+                                           attr=param_attr,
+                                           dtype=dtype,
+                                           is_bias=False)
         self.A = paddle.to_tensor(
             np.random.randn(4, 4) + np.random.randn(4, 4) * 1j)
 
@@ -42,6 +44,7 @@ def forward(self):
 
 
 class TestComplexSimpleNet(unittest.TestCase):
+
     def setUp(self):
         self.devices = ['cpu']
         if core.is_compiled_with_cuda():
@@ -54,8 +57,8 @@ def train(self, device):
         paddle.set_device(device)
 
         myLayer = Optimization_ex1(self.theta_size)
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=self.learning_rate, parameters=myLayer.parameters())
+        optimizer = paddle.optimizer.Adam(learning_rate=self.learning_rate,
+                                          parameters=myLayer.parameters())
 
         for itr in range(self.iter):
             loss = myLayer()
diff --git a/python/paddle/fluid/tests/unittests/test_complex_sum_layer.py b/python/paddle/fluid/tests/unittests/test_complex_sum_layer.py
index a2f6d42dcb7fc..3c43dbd4582c9 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_sum_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_sum_layer.py
@@ -23,6 +23,7 @@
 
 
 class TestComplexSumLayer(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
@@ -31,8 +32,9 @@ def setUp(self):
 
     def test_complex_basic_api(self):
         for dtype in self._dtypes:
-            input = rand([2, 10, 10]).astype(dtype) + 1j * rand(
-                [2, 10, 10]).astype(dtype)
+            input = rand([
+                2, 10, 10
+            ]).astype(dtype) + 1j * rand([2, 10, 10]).astype(dtype)
             for place in self._places:
                 with dg.guard(place):
                     var_x = dg.to_variable(input)
diff --git a/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py b/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py
index fcbab29b5d07a..1618d20da2e34 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py
@@ -23,6 +23,7 @@
 
 
 class TestComplexTraceLayer(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [fluid.CPUPlace()]
@@ -31,13 +32,14 @@ def setUp(self):
 
     def test_basic_api(self):
         for dtype in self._dtypes:
-            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
-                [2, 20, 2, 3]).astype(dtype)
+            input = rand([
+                2, 20, 2, 3
+            ]).astype(dtype) + 1j * rand([2, 20, 2, 3]).astype(dtype)
             for place in self._places:
                 with dg.guard(place):
                     var_x = dg.to_variable(input)
-                    result = tensor.trace(
-                        var_x, offset=1, axis1=0, axis2=2).numpy()
+                    result = tensor.trace(var_x, offset=1, axis1=0,
+                                          axis2=2).numpy()
                     target = np.trace(input, offset=1, axis1=0, axis2=2)
                     self.assertTrue(np.allclose(result, target))
 
diff --git a/python/paddle/fluid/tests/unittests/test_complex_transpose.py b/python/paddle/fluid/tests/unittests/test_complex_transpose.py
index cc7c778a0cea8..bcbeabf8714a5 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_transpose.py
@@ -21,6 +21,7 @@
 
 
 class TestComplexTransposeLayer(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_complex_variable.py b/python/paddle/fluid/tests/unittests/test_complex_variable.py
index a4e2da894d23e..c9ebf27cc4d2a 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_variable.py
@@ -23,6 +23,7 @@
 
 
 class TestComplexVariable(unittest.TestCase):
+
     def compare(self):
         a = np.array([[1.0 + 1.0j, 2.0 + 1.0j],
                       [3.0 + 1.0j, 4.0 + 1.0j]]).astype(self._dtype)
@@ -45,18 +46,16 @@ def test_attrs(self):
         self.compare()
 
     def test_convert_np_dtype_to_dtype(self):
-        self.assertEqual(
-            convert_np_dtype_to_dtype_(np.complex64),
-            core.VarDesc.VarType.COMPLEX64)
-        self.assertEqual(
-            convert_np_dtype_to_dtype_(np.complex64),
-            core.VarDesc.VarType.COMPLEX64)
+        self.assertEqual(convert_np_dtype_to_dtype_(np.complex64),
+                         core.VarDesc.VarType.COMPLEX64)
+        self.assertEqual(convert_np_dtype_to_dtype_(np.complex64),
+                         core.VarDesc.VarType.COMPLEX64)
 
     def test_convert_dtype(self):
-        self.assertEqual(
-            convert_dtype(core.VarDesc.VarType.COMPLEX64), "complex64")
-        self.assertEqual(
-            convert_dtype(core.VarDesc.VarType.COMPLEX128), "complex128")
+        self.assertEqual(convert_dtype(core.VarDesc.VarType.COMPLEX64),
+                         "complex64")
+        self.assertEqual(convert_dtype(core.VarDesc.VarType.COMPLEX128),
+                         "complex128")
 
     def test_eager(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_complex_view_op.py b/python/paddle/fluid/tests/unittests/test_complex_view_op.py
index 11f43c02a8217..6b224209edcc5 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_view_op.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_view_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 from paddle.fluid import dygraph
 from paddle import static
 from paddle.fluid.framework import _test_eager_guard
+
 paddle.enable_static()
 
 
@@ -35,13 +36,13 @@ def ref_view_as_real(x):
 
 
 class TestViewAsComplexOp(OpTest):
+
     def setUp(self):
         self.op_type = "as_complex"
         x = np.random.randn(10, 10, 2).astype("float64")
         out_ref = ref_view_as_complex(x)
         self.out_grad = np.ones(
-            [10, 10], dtype="float64") + 1j * np.ones(
-                [10, 10], dtype="float64")
+            [10, 10], dtype="float64") + 1j * np.ones([10, 10], dtype="float64")
         self.inputs = {'X': x}
         self.outputs = {'Out': out_ref}
 
@@ -49,15 +50,15 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[ref_view_as_real(self.out_grad)],
-            user_defined_grad_outputs=[self.out_grad],
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[ref_view_as_real(self.out_grad)],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
 
 
 class TestViewAsRealOp(OpTest):
+
     def setUp(self):
         self.op_type = "as_real"
         real = np.random.randn(10, 10).astype("float64")
@@ -72,15 +73,15 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[ref_view_as_complex(self.out_grad)],
-            user_defined_grad_outputs=[self.out_grad],
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[ref_view_as_complex(self.out_grad)],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
 
 
 class TestViewAsComplexAPI(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randn(10, 10, 2)
         self.out = ref_view_as_complex(self.x)
@@ -108,6 +109,7 @@ def test_eager(self):
 
 
 class TestViewAsRealAPI(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randn(10, 10) + 1j * np.random.randn(10, 10)
         self.out = ref_view_as_real(self.x)
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 629ddb31d7b62..130a7e8833b4a 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -24,6 +24,7 @@
 
 
 class TestConcatOp(OpTest):
+
     def setUp(self):
         self.op_type = "concat"
         self.python_api = paddle.concat
@@ -38,8 +39,8 @@ def setUp(self):
             self.actual_axis = self.axis
 
         self.outputs = {
-            'Out': np.concatenate(
-                (self.x0, self.x1, self.x2), axis=self.actual_axis)
+            'Out':
+            np.concatenate((self.x0, self.x1, self.x2), axis=self.actual_axis)
         }
 
     def get_dtype(self):
@@ -79,6 +80,7 @@ def init_test_data(self):
 
 
 class TestConcatOp2(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
@@ -89,6 +91,7 @@ def init_test_data(self):
 @skip_check_grad_ci(
     reason="The function 'check_grad' for large inputs is too slow.")
 class TestConcatOp3(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype)
         self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype)
@@ -100,9 +103,11 @@ def test_check_grad(self):
 
 
 @skip_check_grad_ci(
-    reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
+    reason=
+    "This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
 )
 class TestConcatOp4(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
@@ -114,6 +119,7 @@ def test_check_grad(self):
 
 
 class TestConcatOp5(TestConcatOp):
+
     def init_test_data(self):
         self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
         self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
@@ -122,6 +128,7 @@ def init_test_data(self):
 
 
 class TestConcatOp6(TestConcatOp):
+
     def setUp(self):
         self.op_type = "concat"
         self.dtype = self.get_dtype()
@@ -158,7 +165,9 @@ def init_test_data(self):
 
 
 def create_test_AxisTensor(parent):
+
     class TestConcatAxisTensor(parent):
+
         def setUp(self):
             self.op_type = "concat"
             self.python_api = paddle.concat
@@ -178,8 +187,9 @@ def setUp(self):
                 self.actual_axis = self.axis
 
             self.outputs = {
-                'Out': np.concatenate(
-                    (self.x0, self.x1, self.x2), axis=self.actual_axis)
+                'Out':
+                np.concatenate((self.x0, self.x1, self.x2),
+                               axis=self.actual_axis)
             }
 
     cls_name = "{0}_{1}".format(parent.__name__, "AxisTensor")
@@ -198,7 +208,9 @@ def setUp(self):
 
 
 def create_test_fp16(parent):
+
     class TestConcatFp16(parent):
+
         def get_dtype(self):
             return np.float16
 
@@ -217,9 +229,11 @@ def get_dtype(self):
 
 #----------------Concat Bf16----------------
 def create_test_bf16(parent):
+
     @unittest.skipIf(not paddle.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestConcatBf16(parent):
+
         def get_dtype(self):
             return np.uint16
 
@@ -232,16 +246,17 @@ def get_dtype(self):
 
 
 class TestConcatOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of concat_op should be list.
             x1 = fluid.layers.data(shape=[4], dtype='int32', name='x1')
             fluid.layers.concat(x1)
             # The item in input must be Variable.
-            x2 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            x3 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x2 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            x3 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.concat, [x2])
             # The input dtype of concat_op must be float16, float32, float64, int32, int64.
             x4 = fluid.layers.data(shape=[4], dtype='uint8', name='x4')
@@ -265,6 +280,7 @@ def test_input_same_dtype():
 
 
 class TestConcatAPI(unittest.TestCase):
+
     def test_fluid_api(self):
         paddle.enable_static()
         x_1 = fluid.data(shape=[None, 1, 4, 5], dtype='int32', name='x_1')
@@ -281,20 +297,22 @@ def test_fluid_api(self):
         out_3 = fluid.layers.concat(input=[x_2, x_3], axis=positive_1_int64)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
-        [res_1, res_2, res_3] = exe.run(
-            fluid.default_main_program(),
-            feed={"x_1": input_2,
-                  "x_2": input_2,
-                  "x_3": input_3},
-            fetch_list=[out_1, out_2, out_3])
+        [res_1, res_2, res_3] = exe.run(fluid.default_main_program(),
+                                        feed={
+                                            "x_1": input_2,
+                                            "x_2": input_2,
+                                            "x_3": input_3
+                                        },
+                                        fetch_list=[out_1, out_2, out_3])
         assert np.array_equal(res_1, np.concatenate((input_2, input_3), axis=1))
         assert np.array_equal(res_2, np.concatenate((input_2, input_3), axis=1))
         assert np.array_equal(res_3, np.concatenate((input_2, input_3), axis=1))
 
     def test_api(self):
         paddle.enable_static()
-        x_1 = paddle.fluid.data(
-            shape=[None, 1, 4, 5], dtype='int32', name='x_1')
+        x_1 = paddle.fluid.data(shape=[None, 1, 4, 5],
+                                dtype='int32',
+                                name='x_1')
         paddle.concat([x_1, x_1], 0)
 
         input_2 = np.random.random([2, 1, 4, 5]).astype("int32")
@@ -310,12 +328,14 @@ def test_api(self):
         out_4 = paddle.concat(x=[x_2, x_3], axis=negative_int64)
 
         exe = paddle.static.Executor(place=paddle.CPUPlace())
-        [res_1, res_2, res_3, res_4] = exe.run(
-            paddle.static.default_main_program(),
-            feed={"x_1": input_2,
-                  "x_2": input_2,
-                  "x_3": input_3},
-            fetch_list=[out_1, out_2, out_3, out_4])
+        [res_1, res_2, res_3,
+         res_4] = exe.run(paddle.static.default_main_program(),
+                          feed={
+                              "x_1": input_2,
+                              "x_2": input_2,
+                              "x_3": input_3
+                          },
+                          fetch_list=[out_1, out_2, out_3, out_4])
         assert np.array_equal(res_1, np.concatenate((input_2, input_3), axis=1))
         assert np.array_equal(res_2, np.concatenate((input_2, input_3), axis=1))
         assert np.array_equal(res_3, np.concatenate((input_2, input_3), axis=1))
@@ -346,10 +366,10 @@ def test_eager(self):
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The item in input must be Variable.
-            x2 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            x3 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x2 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            x3 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, paddle.concat, [x2])
             # The input dtype of concat_op must be float16, float32, float64, int32, int64.
             x4 = paddle.fluid.data(shape=[4], dtype='uint8', name='x4')
@@ -393,8 +413,9 @@ def set_program(self, use_fluid_api):
             with fluid.program_guard(self.program):
                 input = fluid.layers.assign(self.x)
                 tensor_array = fluid.layers.create_array(dtype='float32')
-                zero = fluid.layers.fill_constant(
-                    shape=[1], value=0, dtype="int64")
+                zero = fluid.layers.fill_constant(shape=[1],
+                                                  value=0,
+                                                  dtype="int64")
 
                 for i in range(self.iter_num):
                     fluid.layers.array_write(input, zero + i, tensor_array)
@@ -428,9 +449,8 @@ def _run_static_mode(self, use_fluid_api):
         res = exe.run(self.program, fetch_list=self.out_var)
         self.assertTrue(
             np.array_equal(
-                res[0],
-                np.concatenate(
-                    [self.x] * self.iter_num, axis=self.axis)))
+                res[0], np.concatenate([self.x] * self.iter_num,
+                                       axis=self.axis)))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py
index d9cb0ccf48209..1680461305188 100644
--- a/python/paddle/fluid/tests/unittests/test_cond.py
+++ b/python/paddle/fluid/tests/unittests/test_cond.py
@@ -30,6 +30,7 @@
 
 
 class TestCondInputOutput(unittest.TestCase):
+
     def test_return_single_var(self):
         """
         pseudocode:
@@ -55,8 +56,8 @@ def false_func():
             out = layers.cond(pred, true_func, false_func)
             # out is one tensor
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         ret = exe.run(main_program, fetch_list=[out.name])
         self.assertTrue(
@@ -73,14 +74,18 @@ def test_return_var_tuple(self):
         """
 
         def true_func():
-            return layers.fill_constant(
-                shape=[1, 2], dtype='int32', value=1), layers.fill_constant(
-                    shape=[2, 3], dtype='bool', value=True)
+            return layers.fill_constant(shape=[1, 2], dtype='int32',
+                                        value=1), layers.fill_constant(
+                                            shape=[2, 3],
+                                            dtype='bool',
+                                            value=True)
 
         def false_func():
-            return layers.fill_constant(
-                shape=[3, 4], dtype='float32', value=3), layers.fill_constant(
-                    shape=[4, 5], dtype='int64', value=2)
+            return layers.fill_constant(shape=[3, 4], dtype='float32',
+                                        value=3), layers.fill_constant(
+                                            shape=[4, 5],
+                                            dtype='int64',
+                                            value=2)
 
         main_program = Program()
         startup_program = Program()
@@ -89,8 +94,8 @@ def false_func():
             out = layers.cond(pred, true_func, false_func)
             # out is a tuple containing 2 tensors
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         ret = exe.run(main_program, fetch_list=out)
         self.assertTrue(
@@ -125,8 +130,8 @@ def false_func(a, i):
             pred = ((i % 2) == 0)
             a = layers.cond(pred, lambda: true_func(a, i),
                             lambda: false_func(a, i))
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         for feed_i in range(5):
             expected_a = 7 * (feed_i + 1) if feed_i % 2 == 0 else 8 - feed_i
@@ -134,8 +139,8 @@ def false_func(a, i):
                           feed={'i': np.full((1), feed_i, np.int32)},
                           fetch_list=[a])
             self.assertTrue(
-                np.allclose(
-                    np.asarray(ret), np.full((3, 2, 1), expected_a, np.int32)))
+                np.allclose(np.asarray(ret),
+                            np.full((3, 2, 1), expected_a, np.int32)))
 
     def test_return_none(self):
         """
@@ -161,8 +166,8 @@ def false_func():
             out1 = layers.cond(pred, true_func, false_func)
             out2 = layers.cond(pred, None, false_func)
             out3 = layers.cond(pred, true_func, None)
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         for feed_i in range(5):
             # Test that output is None is runnable
@@ -183,9 +188,11 @@ def func_return_one_tensor():
             return layers.fill_constant(shape=[2, 7], dtype='int32', value=3)
 
         def func_return_two_tensors():
-            return layers.fill_constant(
-                shape=[3, 1], dtype='int32', value=7), layers.fill_constant(
-                    shape=[3, 1], dtype='int32', value=8)
+            return layers.fill_constant(shape=[3, 1], dtype='int32',
+                                        value=7), layers.fill_constant(
+                                            shape=[3, 1],
+                                            dtype='int32',
+                                            value=8)
 
         main_program = Program()
         startup_program = Program()
@@ -223,17 +230,19 @@ def test_extremely_simple_net_with_op_in_condition(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
-            a = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=1.23)
+            a = fluid.layers.fill_constant(shape=[1],
+                                           dtype='float32',
+                                           value=1.23)
             a.stop_gradient = False
-            b = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=1.25)
+            b = fluid.layers.fill_constant(shape=[1],
+                                           dtype='float32',
+                                           value=1.25)
             b.stop_gradient = False
             out = layers.cond(a - b < -1.0, lambda: a, lambda: b)
         append_backward(out)
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         ret = exe.run(main_program,
                       fetch_list=[out, b, a.grad_name, b.grad_name])
@@ -245,6 +254,7 @@ def test_extremely_simple_net_with_op_in_condition(self):
 
 
 class TestCondNestedControlFlow(unittest.TestCase):
+
     def test_cond_inside_cond(self):
         """
         pseudocode:
@@ -280,8 +290,8 @@ def greater_equal_branch(i, a):
             mean = layers.mean(out)
             append_backward(mean)
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         for feed_i in range(0, 10):
             expected_a = 2.0 * feed_i
@@ -302,30 +312,34 @@ def test_cond_op_in_condition(self):
         startup_program = fluid.Program()
 
         with fluid.program_guard(main_program, startup_program):
-            a = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=1.23)
+            a = fluid.layers.fill_constant(shape=[1],
+                                           dtype='float32',
+                                           value=1.23)
             a.stop_gradient = False
-            b = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=1.24)
+            b = fluid.layers.fill_constant(shape=[1],
+                                           dtype='float32',
+                                           value=1.24)
             b.stop_gradient = False
             out = fluid.layers.cond(
-                a < b,
-                lambda: fluid.layers.cond(a - b < -1.0, lambda: fluid.layers.elementwise_add(a, b), lambda: fluid.layers.elementwise_mul(a, b)),
-                lambda: fluid.layers.cond(a == b, lambda: fluid.layers.elementwise_sub(a, b), lambda: fluid.layers.elementwise_pow(a, b))
-            )
+                a < b, lambda: fluid.layers.cond(
+                    a - b < -1.0, lambda: fluid.layers.elementwise_add(a, b),
+                    lambda: fluid.layers.elementwise_mul(a, b)), lambda:
+                fluid.layers.cond(a == b, lambda: fluid.layers.elementwise_sub(
+                    a, b), lambda: fluid.layers.elementwise_pow(a, b)))
             append_backward(out)
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         ret = exe.run(main_program, fetch_list=[out, a.grad_name, b.grad_name])
-        # Note: fill_constant has loss of precision, so we assertAlmostEqual.    
+        # Note: fill_constant has loss of precision, so we assertAlmostEqual.
         self.assertAlmostEqual(ret[0][0], 1.5252)
         self.assertAlmostEqual(ret[1][0], 1.24)
         self.assertAlmostEqual(ret[2][0], 1.23)
 
 
 class TestCondBackward(unittest.TestCase):
+
     def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe):
         """
         Helper function that compares calculated backward value is close to dy/dx
@@ -348,25 +362,24 @@ def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe):
         num_devices = 1
         if use_parallel_exe:
             os.environ['CPU_NUM'] = str(2)
-            exe = fluid.ParallelExecutor(
-                use_cuda=use_cuda,
-                main_program=main_program,
-                loss_name=loss.name)
+            exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                         main_program=main_program,
+                                         loss_name=loss.name)
             num_devices = exe.device_count
 
         delta = 0.005
         for feed_i in range(0, 10):
             feed_img = np.random.random(size=[1, 9]).astype(np.float32)
-            feed_label = np.random.randint(
-                low=0, high=10, size=[1, 1], dtype=np.int64)
+            feed_label = np.random.randint(low=0,
+                                           high=10,
+                                           size=[1, 1],
+                                           dtype=np.int64)
             if use_parallel_exe:
                 img_grad, loss_value = exe.run(
                     feed={
                         'i': np.full((num_devices), feed_i, np.int32),
-                        'image': np.repeat(
-                            feed_img, num_devices, axis=0),
-                        'label': np.repeat(
-                            feed_label, num_devices, axis=0)
+                        'image': np.repeat(feed_img, num_devices, axis=0),
+                        'label': np.repeat(feed_label, num_devices, axis=0)
                     },
                     fetch_list=[img.grad_name, loss.name])
             else:
@@ -385,15 +398,16 @@ def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe):
                 feed_img_delta[0][j] = feed_img[0][j] + delta
                 if use_parallel_exe:
                     loss_delta = exe.run(feed={
-                        'i': np.full((num_devices), feed_i, np.int32),
-                        'image': np.repeat(
-                            feed_img_delta, num_devices, axis=0),
-                        'label': np.repeat(
-                            feed_label, num_devices, axis=0)
+                        'i':
+                        np.full((num_devices), feed_i, np.int32),
+                        'image':
+                        np.repeat(feed_img_delta, num_devices, axis=0),
+                        'label':
+                        np.repeat(feed_label, num_devices, axis=0)
                     },
                                          fetch_list=[loss.name])
-                    multi_device_grad = (
-                        loss_delta[0] - loss_value[0]) / delta / num_devices
+                    multi_device_grad = (loss_delta[0] -
+                                         loss_value[0]) / delta / num_devices
                     for d in range(num_devices):
                         numerical_grad[d][j] = multi_device_grad[d]
                 else:
@@ -405,12 +419,12 @@ def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe):
                                              'label': feed_label
                                          },
                                          fetch_list=[loss.name])
-                    numerical_grad[0][j] = (
-                        loss_delta[0] - loss_value[0]) / delta
+                    numerical_grad[0][j] = (loss_delta[0] -
+                                            loss_value[0]) / delta
                 feed_img_delta[0][j] = feed_img[0][j]
             self.assertTrue(
-                np.isclose(
-                    img_grad, numerical_grad, atol=0.05, rtol=0.05).all())
+                np.isclose(img_grad, numerical_grad, atol=0.05,
+                           rtol=0.05).all())
 
     def add_optimizer_helper(self, cond_func, use_cuda, use_parallel_exe):
         """
@@ -431,23 +445,22 @@ def add_optimizer_helper(self, cond_func, use_cuda, use_parallel_exe):
         exe.run(startup_program)
         if use_parallel_exe:
             os.environ['CPU_NUM'] = str(2)
-            exe = fluid.ParallelExecutor(
-                use_cuda=use_cuda,
-                main_program=main_program,
-                loss_name=loss.name)
+            exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                         main_program=main_program,
+                                         loss_name=loss.name)
             num_devices = exe.device_count
 
         for feed_i in range(0, 10):
             feed_img = np.random.random(size=[16, 784]).astype(np.float32)
-            feed_label = np.random.randint(
-                low=0, high=10, size=[16, 1], dtype=np.int64)
+            feed_label = np.random.randint(low=0,
+                                           high=10,
+                                           size=[16, 1],
+                                           dtype=np.int64)
             if use_parallel_exe:
                 exe.run(feed={
                     'i': np.full((num_devices), feed_i, np.int32),
-                    'image': np.repeat(
-                        feed_img, num_devices, axis=0),
-                    'label': np.repeat(
-                        feed_label, num_devices, axis=0)
+                    'image': np.repeat(feed_img, num_devices, axis=0),
+                    'label': np.repeat(feed_label, num_devices, axis=0)
                 },
                         fetch_list=[loss.name])
             else:
@@ -460,11 +473,13 @@ def add_optimizer_helper(self, cond_func, use_cuda, use_parallel_exe):
                         fetch_list=[loss])
 
     def test_cond_backward(self):
+
         def cond_func(i, img, label):
             predicate = ((i % 2) == 0)
-            return layers.cond(predicate,
-                               lambda: simple_fc_net_with_inputs(img, label, class_num=10),
-                               lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
+            return layers.cond(
+                predicate,
+                lambda: simple_fc_net_with_inputs(img, label, class_num=10),
+                lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
 
         for use_parallel_exe in [False, True]:
             if use_parallel_exe and os.name == "nt":
@@ -473,17 +488,18 @@ def cond_func(i, img, label):
                 )
                 continue
 
-            self.backward_value_helper(cond_func,
-                                       core.is_compiled_with_cuda(),
+            self.backward_value_helper(cond_func, core.is_compiled_with_cuda(),
                                        use_parallel_exe)
-            self.add_optimizer_helper(cond_func,
-                                      core.is_compiled_with_cuda(),
+            self.add_optimizer_helper(cond_func, core.is_compiled_with_cuda(),
                                       use_parallel_exe)
 
     def test_half_nested_cond_backward(self):
+
         def branch(i, img, label):
-            return layers.cond((i % 2) == 0, lambda: simple_fc_net_with_inputs(img, label, class_num=10),
-                               lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
+            return layers.cond(
+                (i % 2) == 0,
+                lambda: simple_fc_net_with_inputs(img, label, class_num=10),
+                lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
 
         def cond_func_simple_net_at_true(i, img, label):
             return layers.cond(i < 5, lambda: branch(i, img, label),
@@ -514,13 +530,16 @@ def cond_func_simple_net_at_false(i, img, label):
                                       use_parallel_exe)
 
     def test_nested_cond_backward(self):
+
         def branch(i, img, label, mod_two):
             if mod_two:
                 predicate = ((i % 2) == 0)
             else:
                 predicate = ((i % 2) != 0)
-            return layers.cond(predicate, lambda: simple_fc_net_with_inputs(img, label, class_num=10),
-                               lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
+            return layers.cond(
+                predicate,
+                lambda: simple_fc_net_with_inputs(img, label, class_num=10),
+                lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
 
         def cond_func(i, img, label):
             return layers.cond(i < 5, lambda: branch(i, img, label, True),
@@ -532,15 +551,14 @@ def cond_func(i, img, label):
                     "Skip use_parallel_exe=True in Windows because of flaky test when using PE under old Windows machine"
                 )
                 continue
-            self.backward_value_helper(cond_func,
-                                       core.is_compiled_with_cuda(),
+            self.backward_value_helper(cond_func, core.is_compiled_with_cuda(),
                                        use_parallel_exe)
-            self.add_optimizer_helper(cond_func,
-                                      core.is_compiled_with_cuda(),
+            self.add_optimizer_helper(cond_func, core.is_compiled_with_cuda(),
                                       use_parallel_exe)
 
 
 class TestCondWithError(unittest.TestCase):
+
     def test_input_type_error(self):
         main_program = framework.Program()
         startup_program = framework.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
index 6a71d396b48b0..64980115d9ea6 100644
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
@@ -25,6 +25,7 @@
 
 
 class ConditionalBlockTest(unittest.TestCase):
+
     def test_forward(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
@@ -55,6 +56,7 @@ def test_forward(self):
 
 
 class TestConditionalBlockOpInferShape(unittest.TestCase):
+
     def test_infer_shape(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
@@ -64,19 +66,23 @@ def test_infer_shape(self):
             main_program._rollback()
             step_scope = global_block.create_var(
                 type=core.VarDesc.VarType.STEP_SCOPES)
-            cond_var = layers.fill_constant(
-                shape=[1], dtype='bool', value=False)
+            cond_var = layers.fill_constant(shape=[1],
+                                            dtype='bool',
+                                            value=False)
 
-            op = global_block.append_op(
-                type='conditional_block',
-                inputs={
-                    'Cond': [cond_var],
-                    'Input': [],
-                },
-                outputs={'Out': [],
-                         'Scope': [step_scope]},
-                attrs={'sub_block': sub_block,
-                       'is_scalar_condition': True})
+            op = global_block.append_op(type='conditional_block',
+                                        inputs={
+                                            'Cond': [cond_var],
+                                            'Input': [],
+                                        },
+                                        outputs={
+                                            'Out': [],
+                                            'Scope': [step_scope]
+                                        },
+                                        attrs={
+                                            'sub_block': sub_block,
+                                            'is_scalar_condition': True
+                                        })
             op.desc.infer_shape(global_block.desc)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py b/python/paddle/fluid/tests/unittests/test_conj_op.py
index fe9efc301fea7..a3b3f24326034 100644
--- a/python/paddle/fluid/tests/unittests/test_conj_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conj_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from paddle.fluid import Program, program_guard
@@ -30,6 +31,7 @@
 
 
 class TestConjOp(OpTest):
+
     def setUp(self):
         self.op_type = "conj"
         self.python_api = paddle.tensor.conj
@@ -57,15 +59,15 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[self.grad_in],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.grad_in],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=True)
 
 
 class TestComplexConjOp(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
@@ -74,8 +76,9 @@ def setUp(self):
 
     def test_conj_api(self):
         for dtype in self._dtypes:
-            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
-                [2, 20, 2, 3]).astype(dtype)
+            input = rand([
+                2, 20, 2, 3
+            ]).astype(dtype) + 1j * rand([2, 20, 2, 3]).astype(dtype)
             for place in self._places:
                 with dg.guard(place):
                     var_x = paddle.to_tensor(input)
@@ -85,8 +88,9 @@ def test_conj_api(self):
 
     def test_conj_operator(self):
         for dtype in self._dtypes:
-            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
-                [2, 20, 2, 3]).astype(dtype)
+            input = rand([
+                2, 20, 2, 3
+            ]).astype(dtype) + 1j * rand([2, 20, 2, 3]).astype(dtype)
             for place in self._places:
                 with dg.guard(place):
                     var_x = paddle.to_tensor(input)
@@ -95,9 +99,11 @@ def test_conj_operator(self):
                     self.assertTrue(np.array_equal(result, target))
 
     def test_conj_static_mode(self):
+
         def init_input_output(dtype):
-            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
-                [2, 20, 2, 3]).astype(dtype)
+            input = rand([
+                2, 20, 2, 3
+            ]).astype(dtype) + 1j * rand([2, 20, 2, 3]).astype(dtype)
             return {'x': input}, np.conj(input)
 
         for dtype in self._dtypes:
@@ -105,8 +111,9 @@ def init_input_output(dtype):
             for place in self._places:
                 with static.program_guard(static.Program()):
                     x_dtype = np.complex64 if dtype == "float32" else np.complex128
-                    x = static.data(
-                        name="x", shape=[2, 20, 2, 3], dtype=x_dtype)
+                    x = static.data(name="x",
+                                    shape=[2, 20, 2, 3],
+                                    dtype=x_dtype)
                     out = paddle.conj(x)
 
                     exe = static.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_const_value.py b/python/paddle/fluid/tests/unittests/test_const_value.py
index 0b2431d7726e8..3ae68fb5042bf 100644
--- a/python/paddle/fluid/tests/unittests/test_const_value.py
+++ b/python/paddle/fluid/tests/unittests/test_const_value.py
@@ -19,6 +19,7 @@
 
 
 class ConstantTest(unittest.TestCase):
+
     def test_const_value(self):
         self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
         self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
diff --git a/python/paddle/fluid/tests/unittests/test_context_manager.py b/python/paddle/fluid/tests/unittests/test_context_manager.py
index bd5e1b2355c32..93c1d7fa1a499 100644
--- a/python/paddle/fluid/tests/unittests/test_context_manager.py
+++ b/python/paddle/fluid/tests/unittests/test_context_manager.py
@@ -17,8 +17,9 @@
 
 
 class TestContextManagerRaiseException(unittest.TestCase):
-    # When exception raised in 'with' context, we should safely exit the context 
+    # When exception raised in 'with' context, we should safely exit the context
     def test_func1(self):
+
         def foo():
             with fluid.dygraph.guard():
                 print("raise error in context manager")
@@ -27,7 +28,7 @@ def foo():
         self.assertRaises(TypeError, foo)
 
     def test_func2(self):
-        # After test_func1 executed, if fluid.dygraph.guard() in test_func1 safely exited, 
+        # After test_func1 executed, if fluid.dygraph.guard() in test_func1 safely exited,
         # fluid._non_static_mode() should be false.
         self.assertEqual(fluid._non_static_mode(), False)
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
index ca77177125fcd..a7ee1141358fb 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -22,6 +22,7 @@
 
 
 class Conv1DTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  batch_size=4,
@@ -67,8 +68,8 @@ def setUp(self):
             filter_size = self.filter_size
         self.weight_shape = weight_shape = (self.num_filters, self.num_channels
                                             // self.groups) + tuple(filter_size)
-        self.weight = np.random.uniform(
-            -1, 1, size=weight_shape).astype(self.dtype)
+        self.weight = np.random.uniform(-1, 1,
+                                        size=weight_shape).astype(self.dtype)
         if not self.no_bias:
             self.bias = np.random.uniform(
                 -1, 1, size=(self.num_filters, )).astype(self.dtype)
@@ -84,19 +85,19 @@ def functional(self, place):
                                -1) if not self.channel_last else (
                                    -1, -1, self.num_channels)
                 x_var = fluid.data("input", input_shape, dtype=self.dtype)
-                w_var = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
-                b_var = fluid.data(
-                    "bias", (self.num_filters, ), dtype=self.dtype)
-                y_var = F.conv1d(
-                    x_var,
-                    w_var,
-                    b_var if not self.no_bias else None,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                w_var = fluid.data("weight",
+                                   self.weight_shape,
+                                   dtype=self.dtype)
+                b_var = fluid.data("bias", (self.num_filters, ),
+                                   dtype=self.dtype)
+                y_var = F.conv1d(x_var,
+                                 w_var,
+                                 b_var if not self.no_bias else None,
+                                 padding=self.padding,
+                                 stride=self.stride,
+                                 dilation=self.dilation,
+                                 groups=self.groups,
+                                 data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
             feed_dict["bias"] = self.bias
@@ -107,16 +108,15 @@ def functional(self, place):
 
     def paddle_nn_layer(self):
         x_var = paddle.to_tensor(self.input)
-        conv = nn.Conv1D(
-            self.num_channels,
-            self.num_filters,
-            self.filter_size,
-            padding=self.padding,
-            padding_mode=self.padding_mode,
-            stride=self.stride,
-            dilation=self.dilation,
-            groups=self.groups,
-            data_format=self.data_format)
+        conv = nn.Conv1D(self.num_channels,
+                         self.num_filters,
+                         self.filter_size,
+                         padding=self.padding,
+                         padding_mode=self.padding_mode,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -140,6 +140,7 @@ def runTest(self):
 
 
 class Conv1DErrorTestCase(Conv1DTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -148,6 +149,7 @@ def runTest(self):
 
 
 class Conv1DTypeErrorTestCase(Conv1DTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -160,65 +162,58 @@ def add_cases(suite):
     suite.addTest(Conv1DTestCase(methodName='runTest', stride=[1], dilation=2))
     suite.addTest(Conv1DTestCase(methodName='runTest', stride=2, dilation=(1)))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest', padding="same", no_bias=True))
+        Conv1DTestCase(methodName='runTest', padding="same", no_bias=True))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest', filter_size=3, padding='valid'))
+        Conv1DTestCase(methodName='runTest', filter_size=3, padding='valid'))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest', num_filters=512, padding='valid'))
+        Conv1DTestCase(methodName='runTest', num_filters=512, padding='valid'))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest', num_filters=512, padding=[1, 2]))
+        Conv1DTestCase(methodName='runTest', num_filters=512, padding=[1, 2]))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest', padding=2, data_format='NLC'))
+        Conv1DTestCase(methodName='runTest', padding=2, data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1]))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1, 2]))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest', padding=[1, 2], data_format='NLC'))
+        Conv1DTestCase(methodName='runTest', padding=[1, 2], data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=2))
     suite.addTest(Conv1DTestCase(methodName='runTest'))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest', groups=2, padding="valid"))
+        Conv1DTestCase(methodName='runTest', groups=2, padding="valid"))
     suite.addTest(
-        Conv1DTestCase(
-            methodName='runTest',
-            num_filters=6,
-            num_channels=3,
-            groups=3,
-            padding="valid",
-            data_format='NLC'))
+        Conv1DTestCase(methodName='runTest',
+                       num_filters=6,
+                       num_channels=3,
+                       groups=3,
+                       padding="valid",
+                       data_format='NLC'))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv1DTypeErrorTestCase(
-            methodName='runTest', padding_mode="reflect", padding="valid"))
-    suite.addTest(
-        Conv1DErrorTestCase(
-            methodName='runTest', data_format="VALID"))
+        Conv1DTypeErrorTestCase(methodName='runTest',
+                                padding_mode="reflect",
+                                padding="valid"))
+    suite.addTest(Conv1DErrorTestCase(methodName='runTest',
+                                      data_format="VALID"))
     suite.addTest(
-        Conv1DErrorTestCase(
-            methodName='runTest', padding_mode="VALID"))
+        Conv1DErrorTestCase(methodName='runTest', padding_mode="VALID"))
     suite.addTest(
-        Conv1DErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2))
+        Conv1DErrorTestCase(methodName='runTest', num_channels=5, groups=2))
     suite.addTest(
-        Conv1DErrorTestCase(
-            methodName='runTest', num_filters=8, num_channels=15, groups=3))
+        Conv1DErrorTestCase(methodName='runTest',
+                            num_filters=8,
+                            num_channels=15,
+                            groups=3))
     suite.addTest(
-        Conv1DErrorTestCase(
-            methodName='runTest', padding=[1, 2, 3, 4, 5]))
+        Conv1DErrorTestCase(methodName='runTest', padding=[1, 2, 3, 4, 5]))
     suite.addTest(
-        Conv1DErrorTestCase(
-            methodName='runTest', padding=[1, 2, 3, 4, 5], data_format='NLC'))
+        Conv1DErrorTestCase(methodName='runTest',
+                            padding=[1, 2, 3, 4, 5],
+                            data_format='NLC'))
     suite.addTest(
-        Conv1DErrorTestCase(
-            methodName='runTest', num_filters=512, padding=[1, 2, 3, 4, 5]))
+        Conv1DErrorTestCase(methodName='runTest',
+                            num_filters=512,
+                            padding=[1, 2, 3, 4, 5]))
     suite.addTest(Conv1DErrorTestCase(methodName='runTest', dilation=-10))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
index 40b7074ed3914..493cda0c92461 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
@@ -22,6 +22,7 @@
 
 
 class Conv1DTransposeTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  batch_size=4,
@@ -62,7 +63,8 @@ def setUp(self):
                        self.spartial_shape) if not self.channel_last else (
                            self.batch_size,
                            self.spartial_shape,
-                           self.in_channels, )
+                           self.in_channels,
+                       )
         self.input = np.random.randn(*input_shape).astype(self.dtype)
 
         if isinstance(self.filter_size, int):
@@ -71,8 +73,8 @@ def setUp(self):
             filter_size = self.filter_size
         self.weight_shape = weight_shape = (self.in_channels, self.out_channels
                                             // self.groups) + tuple(filter_size)
-        self.weight = np.random.uniform(
-            -1, 1, size=weight_shape).astype(self.dtype)
+        self.weight = np.random.uniform(-1, 1,
+                                        size=weight_shape).astype(self.dtype)
         if not self.no_bias:
             self.bias = np.random.uniform(
                 -1, 1, size=(self.out_channels, )).astype(self.dtype)
@@ -88,21 +90,21 @@ def functional(self, place):
                                -1) if not self.channel_last else (
                                    -1, -1, self.in_channels)
                 x_var = fluid.data("input", input_shape, dtype=self.dtype)
-                w_var = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
-                b_var = fluid.data(
-                    "bias", (self.out_channels, ), dtype=self.dtype)
-                y_var = F.conv1d_transpose(
-                    x_var,
-                    w_var,
-                    None if self.no_bias else b_var,
-                    output_size=self.output_size,
-                    padding=self.padding,
-                    output_padding=self.output_padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                w_var = fluid.data("weight",
+                                   self.weight_shape,
+                                   dtype=self.dtype)
+                b_var = fluid.data("bias", (self.out_channels, ),
+                                   dtype=self.dtype)
+                y_var = F.conv1d_transpose(x_var,
+                                           w_var,
+                                           None if self.no_bias else b_var,
+                                           output_size=self.output_size,
+                                           padding=self.padding,
+                                           output_padding=self.output_padding,
+                                           stride=self.stride,
+                                           dilation=self.dilation,
+                                           groups=self.groups,
+                                           data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
             feed_dict["bias"] = self.bias
@@ -113,16 +115,15 @@ def functional(self, place):
 
     def paddle_nn_layer(self):
         x_var = paddle.to_tensor(self.input)
-        conv = nn.Conv1DTranspose(
-            self.in_channels,
-            self.out_channels,
-            self.filter_size,
-            padding=self.padding,
-            output_padding=self.output_padding,
-            stride=self.stride,
-            dilation=self.dilation,
-            groups=self.groups,
-            data_format=self.data_format)
+        conv = nn.Conv1DTranspose(self.in_channels,
+                                  self.out_channels,
+                                  self.filter_size,
+                                  padding=self.padding,
+                                  output_padding=self.output_padding,
+                                  stride=self.stride,
+                                  dilation=self.dilation,
+                                  groups=self.groups,
+                                  data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -146,6 +147,7 @@ def runTest(self):
 
 
 class Conv1DTransposeErrorTestCase(Conv1DTransposeTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -156,67 +158,64 @@ def runTest(self):
 def add_cases(suite):
     suite.addTest(Conv1DTransposeTestCase(methodName='runTest'))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', stride=[2], no_bias=True, dilation=2))
-    suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest',
-            filter_size=(3),
-            output_size=[36],
-            stride=[2],
-            dilation=2))
-    suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', stride=2, dilation=(2)))
+        Conv1DTransposeTestCase(methodName='runTest',
+                                stride=[2],
+                                no_bias=True,
+                                dilation=2))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', padding="valid"))
+        Conv1DTransposeTestCase(methodName='runTest',
+                                filter_size=(3),
+                                output_size=[36],
+                                stride=[2],
+                                dilation=2))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', padding='valid'))
+        Conv1DTransposeTestCase(methodName='runTest', stride=2, dilation=(2)))
+    suite.addTest(Conv1DTransposeTestCase(methodName='runTest',
+                                          padding="valid"))
+    suite.addTest(Conv1DTransposeTestCase(methodName='runTest',
+                                          padding='valid'))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', filter_size=1, padding=3))
+        Conv1DTransposeTestCase(methodName='runTest', filter_size=1, padding=3))
     suite.addTest(Conv1DTransposeTestCase(methodName='runTest', padding=[2]))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', data_format="NLC"))
+        Conv1DTransposeTestCase(methodName='runTest', data_format="NLC"))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', groups=2, padding="valid"))
+        Conv1DTransposeTestCase(methodName='runTest', groups=2,
+                                padding="valid"))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest',
-            out_channels=6,
-            in_channels=3,
-            groups=3,
-            padding="valid"))
+        Conv1DTransposeTestCase(methodName='runTest',
+                                out_channels=6,
+                                in_channels=3,
+                                groups=3,
+                                padding="valid"))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest',
-            data_format="NLC",
-            spartial_shape=16,
-            output_size=18))
+        Conv1DTransposeTestCase(methodName='runTest',
+                                data_format="NLC",
+                                spartial_shape=16,
+                                output_size=18))
     suite.addTest(
-        Conv1DTransposeTestCase(
-            methodName='runTest', data_format="NLC", stride=3,
-            output_padding=2))
+        Conv1DTransposeTestCase(methodName='runTest',
+                                data_format="NLC",
+                                stride=3,
+                                output_padding=2))
     suite.addTest(Conv1DTransposeTestCase(methodName='runTest', padding=[1, 2]))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv1DTransposeErrorTestCase(
-            methodName='runTest', data_format="not_valid"))
+        Conv1DTransposeErrorTestCase(methodName='runTest',
+                                     data_format="not_valid"))
     suite.addTest(
-        Conv1DTransposeErrorTestCase(
-            methodName='runTest', in_channels=5, groups=2))
+        Conv1DTransposeErrorTestCase(methodName='runTest',
+                                     in_channels=5,
+                                     groups=2))
     suite.addTest(
-        Conv1DTransposeErrorTestCase(
-            methodName='runTest', stride=2, output_padding=3))
+        Conv1DTransposeErrorTestCase(methodName='runTest',
+                                     stride=2,
+                                     output_padding=3))
     suite.addTest(
-        Conv1DTransposeErrorTestCase(
-            methodName='runTest', output_size="not_valid"))
+        Conv1DTransposeErrorTestCase(methodName='runTest',
+                                     output_size="not_valid"))
 
 
 def load_tests(loader, standard_tests, pattern):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_api.py b/python/paddle/fluid/tests/unittests/test_conv2d_api.py
index cb7fd8fe1bc28..5ea256efbb415 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_api.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 import paddle
+
 paddle.enable_static()
 import paddle.fluid.core as core
 import paddle.fluid as fluid
@@ -26,272 +27,251 @@
 
 
 class TestConv2DAPI(unittest.TestCase):
+
     def test_api(self):
 
-        input_NHWC = fluid.layers.data(
-            name="input_NHWC",
-            shape=[2, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NCHW = fluid.layers.data(
-            name="input_NCHW",
-            shape=[2, 3, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
-
-        fluid.layers.conv2d(
-            input=input_NHWC,
-            num_filters=3,
-            filter_size=[3, 3],
-            stride=[1, 1],
-            padding=0,
-            dilation=[1, 1],
-            groups=1,
-            data_format="NCHW")
-
-        fluid.layers.conv2d(
-            input=input_NCHW,
-            num_filters=3,
-            filter_size=[3, 3],
-            stride=[1, 1],
-            padding=[1, 2, 1, 0],
-            dilation=[1, 1],
-            groups=1,
-            data_format="NCHW")
-
-        fluid.layers.conv2d(
-            input=input_NCHW,
-            num_filters=3,
-            filter_size=[3, 3],
-            stride=[1, 1],
-            padding=[[0, 0], [0, 0], [1, 1], [1, 1]],
-            dilation=[1, 1],
-            groups=1,
-            data_format="NCHW")
-
-        fluid.layers.conv2d(
-            input=input_NHWC,
-            num_filters=3,
-            filter_size=[3, 3],
-            stride=[1, 1],
-            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            dilation=[1, 1],
-            groups=1,
-            data_format="NHWC")
-
-        fluid.layers.conv2d(
-            input=input_NCHW,
-            num_filters=3,
-            filter_size=[3, 3],
-            stride=[1, 1],
-            padding="SAME",
-            dilation=[1, 1],
-            groups=1,
-            data_format="NCHW")
-
-        fluid.layers.conv2d(
-            input=input_NCHW,
-            num_filters=3,
-            filter_size=[3, 3],
-            stride=[1, 1],
-            padding="VALID",
-            dilation=[1, 1],
-            groups=1,
-            data_format="NCHW")
+        input_NHWC = fluid.layers.data(name="input_NHWC",
+                                       shape=[2, 5, 5, 3],
+                                       append_batch_size=False,
+                                       dtype="float32")
+
+        input_NCHW = fluid.layers.data(name="input_NCHW",
+                                       shape=[2, 3, 5, 5],
+                                       append_batch_size=False,
+                                       dtype="float32")
+
+        fluid.layers.conv2d(input=input_NHWC,
+                            num_filters=3,
+                            filter_size=[3, 3],
+                            stride=[1, 1],
+                            padding=0,
+                            dilation=[1, 1],
+                            groups=1,
+                            data_format="NCHW")
+
+        fluid.layers.conv2d(input=input_NCHW,
+                            num_filters=3,
+                            filter_size=[3, 3],
+                            stride=[1, 1],
+                            padding=[1, 2, 1, 0],
+                            dilation=[1, 1],
+                            groups=1,
+                            data_format="NCHW")
+
+        fluid.layers.conv2d(input=input_NCHW,
+                            num_filters=3,
+                            filter_size=[3, 3],
+                            stride=[1, 1],
+                            padding=[[0, 0], [0, 0], [1, 1], [1, 1]],
+                            dilation=[1, 1],
+                            groups=1,
+                            data_format="NCHW")
+
+        fluid.layers.conv2d(input=input_NHWC,
+                            num_filters=3,
+                            filter_size=[3, 3],
+                            stride=[1, 1],
+                            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+                            dilation=[1, 1],
+                            groups=1,
+                            data_format="NHWC")
+
+        fluid.layers.conv2d(input=input_NCHW,
+                            num_filters=3,
+                            filter_size=[3, 3],
+                            stride=[1, 1],
+                            padding="SAME",
+                            dilation=[1, 1],
+                            groups=1,
+                            data_format="NCHW")
+
+        fluid.layers.conv2d(input=input_NCHW,
+                            num_filters=3,
+                            filter_size=[3, 3],
+                            stride=[1, 1],
+                            padding="VALID",
+                            dilation=[1, 1],
+                            groups=1,
+                            data_format="NCHW")
 
     def test_depthwise_conv2d(self):
         x_var = paddle.uniform((2, 8, 8, 4), dtype='float32', min=-1., max=1.)
-        conv = paddle.nn.Conv2D(
-            in_channels=4,
-            out_channels=4,
-            kernel_size=(3, 3),
-            groups=4,
-            data_format='NHWC')
+        conv = paddle.nn.Conv2D(in_channels=4,
+                                out_channels=4,
+                                kernel_size=(3, 3),
+                                groups=4,
+                                data_format='NHWC')
         y_var = conv(x_var)
 
 
 class TestConv2DAPI_Error(unittest.TestCase):
+
     def test_api(self):
-        input = fluid.layers.data(
-            name="input",
-            shape=[2, 5, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
+        input = fluid.layers.data(name="input",
+                                  shape=[2, 5, 5, 5],
+                                  append_batch_size=False,
+                                  dtype="float32")
 
         # ValueError: cudnn
         def run_1():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding=0,
-                dilation=[1, 1],
-                groups=1,
-                use_cudnn=[0],
-                data_format="NCHW")
+            fluid.layers.conv2d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3],
+                                stride=[1, 1],
+                                padding=0,
+                                dilation=[1, 1],
+                                groups=1,
+                                use_cudnn=[0],
+                                data_format="NCHW")
 
         self.assertRaises(ValueError, run_1)
 
         # ValueError: data_format
         def run_2():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding=0,
-                dilation=[1, 1],
-                groups=1,
-                use_cudnn=False,
-                data_format="NCHWC")
+            fluid.layers.conv2d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3],
+                                stride=[1, 1],
+                                padding=0,
+                                dilation=[1, 1],
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCHWC")
 
         self.assertRaises(ValueError, run_2)
 
         # ValueError: padding
         def run_3():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding="SAMEE",
-                dilation=[1, 1],
-                groups=1,
-                use_cudnn=False,
-                data_format="NCHW")
+            fluid.layers.conv2d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3],
+                                stride=[1, 1],
+                                padding="SAMEE",
+                                dilation=[1, 1],
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCHW")
 
         self.assertRaises(ValueError, run_3)
 
         def run_4():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding=[[0, 1], [0, 1], [0, 1], [0, 1]],
-                dilation=[1, 1],
-                groups=1,
-                use_cudnn=False,
-                data_format="NCHW")
+            fluid.layers.conv2d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3],
+                                stride=[1, 1],
+                                padding=[[0, 1], [0, 1], [0, 1], [0, 1]],
+                                dilation=[1, 1],
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCHW")
 
         self.assertRaises(ValueError, run_4)
 
         def run_5():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding=[[0, 1], [0, 1], [0, 1], [0, 1]],
-                dilation=[1, 1],
-                groups=1,
-                use_cudnn=False,
-                data_format="NHWC")
+            fluid.layers.conv2d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3],
+                                stride=[1, 1],
+                                padding=[[0, 1], [0, 1], [0, 1], [0, 1]],
+                                dilation=[1, 1],
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NHWC")
 
         self.assertRaises(ValueError, run_5)
 
         # ValueError: channel dimmention
-        x = fluid.layers.data(
-            name="x",
-            shape=[2, 5, 5, -1],
-            append_batch_size=False,
-            dtype="float32")
+        x = fluid.layers.data(name="x",
+                              shape=[2, 5, 5, -1],
+                              append_batch_size=False,
+                              dtype="float32")
 
         def run_6():
-            fluid.layers.conv2d(
-                input=x,
-                num_filters=3,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding=0,
-                dilation=[1, 1],
-                groups=1,
-                use_cudnn=False,
-                data_format="NHWC")
+            fluid.layers.conv2d(input=x,
+                                num_filters=3,
+                                filter_size=[3, 3],
+                                stride=[1, 1],
+                                padding=0,
+                                dilation=[1, 1],
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NHWC")
 
         self.assertRaises(ValueError, run_6)
 
         # ValueError: groups
         def run_7():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding=0,
-                dilation=[1, 1],
-                groups=3,
-                use_cudnn=False,
-                data_format="NHWC")
+            fluid.layers.conv2d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3],
+                                stride=[1, 1],
+                                padding=0,
+                                dilation=[1, 1],
+                                groups=3,
+                                use_cudnn=False,
+                                data_format="NHWC")
 
         self.assertRaises(ValueError, run_7)
 
         # ValueError: filter num
         def run_8():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=0,
-                filter_size=0,
-                stride=0,
-                padding=0,
-                dilation=0,
-                groups=1,
-                use_cudnn=False,
-                data_format="NCHW")
+            fluid.layers.conv2d(input=input,
+                                num_filters=0,
+                                filter_size=0,
+                                stride=0,
+                                padding=0,
+                                dilation=0,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCHW")
 
         self.assertRaises(ValueError, run_8)
 
         # ValueError: groups
         def run_9():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=0,
-                filter_size=0,
-                stride=0,
-                padding=0,
-                dilation=0,
-                groups=0,
-                use_cudnn=False,
-                data_format="NCHW")
+            fluid.layers.conv2d(input=input,
+                                num_filters=0,
+                                filter_size=0,
+                                stride=0,
+                                padding=0,
+                                dilation=0,
+                                groups=0,
+                                use_cudnn=False,
+                                data_format="NCHW")
 
         self.assertRaises(ValueError, run_9)
 
-        # ValueError: stride 
+        # ValueError: stride
         def run_10():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=1,
-                filter_size=1,
-                stride=0,
-                padding=0,
-                dilation=0,
-                groups=1,
-                use_cudnn=False,
-                data_format="NCHW")
+            fluid.layers.conv2d(input=input,
+                                num_filters=1,
+                                filter_size=1,
+                                stride=0,
+                                padding=0,
+                                dilation=0,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCHW")
 
         self.assertRaises(ValueError, run_10)
 
     def test_api_with_error_input(self):
-        input = fluid.layers.data(
-            name="error_input",
-            shape=[1],
-            append_batch_size=False,
-            dtype="float32")
+        input = fluid.layers.data(name="error_input",
+                                  shape=[1],
+                                  append_batch_size=False,
+                                  dtype="float32")
 
         # ValueError: cudnn
         def run_1():
-            fluid.layers.conv2d(
-                input=input,
-                num_filters=0,
-                filter_size=0,
-                stride=0,
-                padding=0,
-                dilation=0,
-                groups=0,
-                use_cudnn=False,
-                data_format="NCHW")
+            fluid.layers.conv2d(input=input,
+                                num_filters=0,
+                                filter_size=0,
+                                stride=0,
+                                padding=0,
+                                dilation=0,
+                                groups=0,
+                                use_cudnn=False,
+                                data_format="NCHW")
 
         self.assertRaises(ValueError, run_1)
 
@@ -301,22 +281,21 @@ def run_1():
     not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()),
     "core is not compiled with CUDA or ROCM")
 class TestConv2DEnviron(unittest.TestCase):
+
     def run1(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            inputs = fluid.layers.data(
-                shape=[2, 3, 5, 5],
-                append_batch_size=False,
-                name="inputs",
-                dtype="float32")
-            result = fluid.layers.conv2d(
-                input=inputs,
-                num_filters=4,
-                filter_size=[3, 3],
-                stride=[1, 1],
-                padding=0,
-                dilation=[1, 1],
-                groups=1,
-                data_format="NCHW")
+            inputs = fluid.layers.data(shape=[2, 3, 5, 5],
+                                       append_batch_size=False,
+                                       name="inputs",
+                                       dtype="float32")
+            result = fluid.layers.conv2d(input=inputs,
+                                         num_filters=4,
+                                         filter_size=[3, 3],
+                                         stride=[1, 1],
+                                         padding=0,
+                                         dilation=[1, 1],
+                                         groups=1,
+                                         data_format="NCHW")
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
             fetches = exe.run(fluid.default_main_program(),
@@ -326,11 +305,10 @@ def run1(self, place):
     def run2(self, place):
         with fluid.dygraph.guard(place):
             inputs = fluid.dygraph.to_variable(self.input_np)
-            conv = paddle.nn.Conv2D(
-                in_channels=3,
-                out_channels=4,
-                kernel_size=(3, 3),
-                data_format="NCHW")
+            conv = paddle.nn.Conv2D(in_channels=3,
+                                    out_channels=4,
+                                    kernel_size=(3, 3),
+                                    data_format="NCHW")
             result = conv(inputs)
 
     def run3(self, place):
@@ -339,7 +317,8 @@ def run3(self, place):
             conv = paddle.fluid.dygraph.nn.Conv2D(
                 num_channels=3,
                 num_filters=4,
-                filter_size=(3, 3), )
+                filter_size=(3, 3),
+            )
             result = conv(inputs)
 
     def run_all(self, place):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
index 8ef2660cac2de..2a3d509e2bf11 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
@@ -24,7 +24,9 @@
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSAMECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0]
             self.padding_algorithm = "SAME"
@@ -35,7 +37,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1]
             self.padding_algorithm = "VALID"
@@ -46,6 +50,7 @@ def init_paddings(self):
 
 
 class TestConv2DFusionOp(OpTest):
+
     def setUp(self):
         self.op_type = "conv2d_fusion"
         self.exhaustive_search = False
@@ -75,9 +80,11 @@ def setUp(self):
         filter = np.random.random(self.filter_size).astype(self.dtype)
         bias = np.random.random(self.filter_size[0]).astype(self.dtype)
 
-        self.output, _, _, _, _ = conv2d_forward_naive(
-            input, filter, self.groups, conv2d_param, self.padding_algorithm,
-            self.data_format)
+        self.output, _, _, _, _ = conv2d_forward_naive(input, filter,
+                                                       self.groups,
+                                                       conv2d_param,
+                                                       self.padding_algorithm,
+                                                       self.data_format)
 
         self.output = self.output.astype(self.dtype)
 
@@ -158,27 +165,32 @@ def init_paddings(self):
 
 
 class TestWithoutResidual(TestConv2DFusionOp):
+
     def init_residual(self):
         self.add_residual_data = False
 
 
 class TestIdentityActivation(TestConv2DFusionOp):
+
     def init_activation(self):
         self.activation = 'identity'
 
 
 class TestIdentityActivation1(TestConv2DFusionOp):
+
     def init_activation(self):
         self.activation = 'identity'
         self.add_residual_data = False
 
 
 class TestWithGroup(TestConv2DFusionOp):
+
     def init_group(self):
         self.groups = 3
 
 
 class TestWithDilation(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -195,11 +207,13 @@ def init_group(self):
 
 
 class TestCUDNNExhaustiveSearch(TestConv2DFusionOp):
+
     def set_search_method(self):
         self.exhaustive_search = True
 
 
 class TestMultipleOutputs(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -216,12 +230,14 @@ def set_outputs(self):
 
 
 class TestAsyPadding(TestConv2DFusionOp):
+
     def init_paddings(self):
         self.pad = [0, 0, 1, 2]
         self.padding_algorithm = "EXPLICIT"
 
 
 class TestWithPad_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -235,6 +251,7 @@ def init_paddings(self):
 
 
 class TestWithStride_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]  # NCHW
@@ -248,6 +265,7 @@ def init_paddings(self):
 
 
 class TestWith1x1_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -264,11 +282,13 @@ def init_paddings(self):
 
 
 class TestWithGroup_AsyPadding(TestConv2DFusionOp):
+
     def init_group(self):
         self.groups = 3
 
 
 class TestWithDepthWise3x3_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [3, 4, 10, 10]  # NCHW
@@ -288,6 +308,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise5x5_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 4, 10, 10]  # NCHW
@@ -304,6 +325,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise7x7_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 8, 10, 10]  # NCHW
@@ -320,6 +342,7 @@ def init_paddings(self):
 
 
 class TestWithDilation_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -339,6 +362,7 @@ def init_paddings(self):
 
 
 class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DFusionOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 1, 1]  # NCHW
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
index 508bd7b1e64d8..b8c6f1dfa2fdc 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
@@ -27,6 +27,7 @@ def _reverse_repeat_list(t, n):
 
 
 class Conv2DTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  batch_size=4,
@@ -51,8 +52,8 @@ def __init__(self,
 
         self.padding = padding
         if padding_mode in {'reflect', 'replicate', 'circular'}:
-            _paired_padding = fluid.layers.utils.convert_to_list(padding, 2,
-                                                                 'padding')
+            _paired_padding = fluid.layers.utils.convert_to_list(
+                padding, 2, 'padding')
             self._reversed_padding_repeated_twice = _reverse_repeat_list(
                 _paired_padding, 2)
         self.padding_mode = padding_mode
@@ -69,8 +70,8 @@ def setUp(self):
             input_shape = (self.batch_size, ) + self.spartial_shape + (
                 self.num_channels, )
         else:
-            input_shape = (self.batch_size, self.num_channels
-                           ) + self.spartial_shape
+            input_shape = (self.batch_size,
+                           self.num_channels) + self.spartial_shape
         self.input = np.random.randn(*input_shape).astype(self.dtype)
 
         if isinstance(self.filter_size, int):
@@ -79,8 +80,8 @@ def setUp(self):
             filter_size = self.filter_size
         self.weight_shape = weight_shape = (self.num_filters, self.num_channels
                                             // self.groups) + tuple(filter_size)
-        self.weight = np.random.uniform(
-            -1, 1, size=weight_shape).astype(self.dtype)
+        self.weight = np.random.uniform(-1, 1,
+                                        size=weight_shape).astype(self.dtype)
         if not self.no_bias:
             self.bias = np.random.uniform(
                 -1, 1, size=(self.num_filters, )).astype(self.dtype)
@@ -109,17 +110,16 @@ def fluid_layer(self, place):
                 else:
                     padding = self.padding
 
-                y_var = fluid.layers.conv2d(
-                    x_var,
-                    self.num_filters,
-                    self.filter_size,
-                    padding=padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    param_attr=weight_attr,
-                    bias_attr=bias_attr,
-                    data_format=self.data_format)
+                y_var = fluid.layers.conv2d(x_var,
+                                            self.num_filters,
+                                            self.filter_size,
+                                            padding=padding,
+                                            stride=self.stride,
+                                            dilation=self.dilation,
+                                            groups=self.groups,
+                                            param_attr=weight_attr,
+                                            bias_attr=bias_attr,
+                                            data_format=self.data_format)
 
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
@@ -135,10 +135,11 @@ def functional(self, place):
                 input_shape = (-1, -1, -1,self.num_channels) \
                     if self.channel_last else (-1, self.num_channels, -1, -1)
                 x_var = fluid.data("input", input_shape, dtype=self.dtype)
-                w_var = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
-                b_var = fluid.data(
-                    "bias", (self.num_filters, ), dtype=self.dtype)
+                w_var = fluid.data("weight",
+                                   self.weight_shape,
+                                   dtype=self.dtype)
+                b_var = fluid.data("bias", (self.num_filters, ),
+                                   dtype=self.dtype)
 
                 if self.padding_mode != 'zeros':
                     x_var = F.pad(x_var,
@@ -149,15 +150,14 @@ def functional(self, place):
                 else:
                     padding = self.padding
 
-                y_var = F.conv2d(
-                    x_var,
-                    w_var,
-                    b_var if not self.no_bias else None,
-                    padding=padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y_var = F.conv2d(x_var,
+                                 w_var,
+                                 b_var if not self.no_bias else None,
+                                 padding=padding,
+                                 stride=self.stride,
+                                 dilation=self.dilation,
+                                 groups=self.groups,
+                                 data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
             feed_dict["bias"] = self.bias
@@ -169,16 +169,15 @@ def functional(self, place):
     def paddle_nn_layer(self):
         x_var = paddle.to_tensor(self.input)
         x_var.stop_gradient = False
-        conv = nn.Conv2D(
-            self.num_channels,
-            self.num_filters,
-            self.filter_size,
-            padding=self.padding,
-            padding_mode=self.padding_mode,
-            stride=self.stride,
-            dilation=self.dilation,
-            groups=self.groups,
-            data_format=self.data_format)
+        conv = nn.Conv2D(self.num_channels,
+                         self.num_filters,
+                         self.filter_size,
+                         padding=self.padding,
+                         padding_mode=self.padding_mode,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -211,6 +210,7 @@ def runTest(self):
 
 
 class Conv2DErrorTestCase(Conv2DTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -221,68 +221,63 @@ def runTest(self):
 def add_cases(suite):
     suite.addTest(Conv2DTestCase(methodName='runTest'))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest', stride=[1, 2], dilation=2))
+        Conv2DTestCase(methodName='runTest', stride=[1, 2], dilation=2))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest', stride=2, dilation=(2, 1)))
+        Conv2DTestCase(methodName='runTest', stride=2, dilation=(2, 1)))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest', padding="same", no_bias=True))
+        Conv2DTestCase(methodName='runTest', padding="same", no_bias=True))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest', filter_size=(3, 3), padding='valid'))
+        Conv2DTestCase(methodName='runTest',
+                       filter_size=(3, 3),
+                       padding='valid'))
     suite.addTest(Conv2DTestCase(methodName='runTest', padding=(2, 3)))
     suite.addTest(Conv2DTestCase(methodName='runTest', padding=[1, 2, 2, 1]))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest', padding=[[0, 0], [0, 0], [1, 2], [2, 1]]))
+        Conv2DTestCase(methodName='runTest',
+                       padding=[[0, 0], [0, 0], [1, 2], [2, 1]]))
     suite.addTest(Conv2DTestCase(methodName='runTest', data_format="NHWC"))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest',
-            data_format="NHWC",
-            padding=[[0, 0], [1, 1], [2, 2], [0, 0]]))
+        Conv2DTestCase(methodName='runTest',
+                       data_format="NHWC",
+                       padding=[[0, 0], [1, 1], [2, 2], [0, 0]]))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest', groups=2, padding="valid"))
+        Conv2DTestCase(methodName='runTest', groups=2, padding="valid"))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest',
-            num_filters=6,
-            num_channels=3,
-            groups=3,
-            padding="valid"))
+        Conv2DTestCase(methodName='runTest',
+                       num_filters=6,
+                       num_channels=3,
+                       groups=3,
+                       padding="valid"))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest',
-            filter_size=(3, 3),
-            padding=1,
-            padding_mode='reflect'))
+        Conv2DTestCase(methodName='runTest',
+                       filter_size=(3, 3),
+                       padding=1,
+                       padding_mode='reflect'))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest',
-            filter_size=(3, 3),
-            padding=1,
-            padding_mode='replicate'))
+        Conv2DTestCase(methodName='runTest',
+                       filter_size=(3, 3),
+                       padding=1,
+                       padding_mode='replicate'))
     suite.addTest(
-        Conv2DTestCase(
-            methodName='runTest',
-            filter_size=(3, 3),
-            padding=1,
-            padding_mode='circular'))
+        Conv2DTestCase(methodName='runTest',
+                       filter_size=(3, 3),
+                       padding=1,
+                       padding_mode='circular'))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv2DErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2))
+        Conv2DErrorTestCase(methodName='runTest', num_channels=5, groups=2))
     suite.addTest(
-        Conv2DErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2, stride=0))
+        Conv2DErrorTestCase(methodName='runTest',
+                            num_channels=5,
+                            groups=2,
+                            stride=0))
     suite.addTest(
-        Conv2DErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2, padding=[-1, -1]))
+        Conv2DErrorTestCase(methodName='runTest',
+                            num_channels=5,
+                            groups=2,
+                            padding=[-1, -1]))
 
 
 def load_tests(loader, standard_tests, pattern):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index fdb93e1f1afdd..0d38a1571e0c2 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -21,8 +21,9 @@
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
-from paddle.fluid.tests.unittests.op_test import (
-    OpTest, convert_float_to_uint16, get_numeric_gradient)
+from paddle.fluid.tests.unittests.op_test import (OpTest,
+                                                  convert_float_to_uint16,
+                                                  get_numeric_gradient)
 from paddle.fluid.tests.unittests.testsuite import create_op
 from paddle.fluid import Program, program_guard
 
@@ -64,8 +65,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -94,14 +95,14 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
     d_bolck_h = (dilation[0] * (f_h - 1) + 1)
     d_bolck_w = (dilation[1] * (f_w - 1) + 1)
 
-    input_pad = np.pad(input, ((0, 0), (0, 0), (pad_h_0, pad_h_1),
-                               (pad_w_0, pad_w_1)),
+    input_pad = np.pad(input,
+                       ((0, 0), (0, 0), (pad_h_0, pad_h_1), (pad_w_0, pad_w_1)),
                        mode='constant',
                        constant_values=0)
 
     filter_dilation = np.zeros((f_n, f_c, d_bolck_h, d_bolck_w))
-    filter_dilation[:, :, 0:d_bolck_h:dilation[0], 0:d_bolck_w:dilation[
-        1]] = filter
+    filter_dilation[:, :, 0:d_bolck_h:dilation[0],
+                    0:d_bolck_w:dilation[1]] = filter
 
     for i in range(out_h):
         for j in range(out_w):
@@ -126,9 +127,11 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 
 def create_test_cudnn_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -140,9 +143,11 @@ def init_kernel_type(self):
 
 
 def create_test_cudnn_fp16_class(parent, grad_check=True):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestConv2DCUDNNFp16(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float16
@@ -156,14 +161,16 @@ def test_check_output(self):
         def test_check_grad_no_filter(self):
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place) and grad_check:
-                self.check_grad_with_place(
-                    place, ['Input'], 'Output', no_grad_set=set(['Filter']))
+                self.check_grad_with_place(place, ['Input'],
+                                           'Output',
+                                           no_grad_set=set(['Filter']))
 
         def test_check_grad_no_input(self):
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place) and grad_check:
-                self.check_grad_with_place(
-                    place, ['Filter'], 'Output', no_grad_set=set(['Input']))
+                self.check_grad_with_place(place, ['Filter'],
+                                           'Output',
+                                           no_grad_set=set(['Input']))
 
     cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16")
     TestConv2DCUDNNFp16.__name__ = cls_name
@@ -171,11 +178,13 @@ def test_check_grad_no_input(self):
 
 
 def create_test_cudnn_bf16_class(parent):
+
     @unittest.skipIf(
-        not core.is_compiled_with_cuda() or
-        not core.is_bfloat16_supported(core.CUDAPlace(0)),
+        not core.is_compiled_with_cuda()
+        or not core.is_bfloat16_supported(core.CUDAPlace(0)),
         "core is not compiled with CUDA and do not support bfloat16")
     class TestConv2DCUDNNBF16(parent):
+
         def get_numeric_grad(self, place, check_name):
             scope = core.Scope()
             self._check_grad_helper()
@@ -196,20 +205,18 @@ def test_check_output(self):
         def test_check_grad_no_filter(self):
             place = core.CUDAPlace(0)
             numeric_grads = self.get_numeric_grad(place, 'Input')
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Output',
-                no_grad_set=set(['Filter']),
-                user_defined_grads=[numeric_grads])
+            self.check_grad_with_place(place, ['Input'],
+                                       'Output',
+                                       no_grad_set=set(['Filter']),
+                                       user_defined_grads=[numeric_grads])
 
         def test_check_grad_no_input(self):
             place = core.CUDAPlace(0)
             numeric_grads = self.get_numeric_grad(place, 'Filter')
-            self.check_grad_with_place(
-                place, ['Filter'],
-                'Output',
-                no_grad_set=set(['Input']),
-                user_defined_grads=[numeric_grads])
+            self.check_grad_with_place(place, ['Filter'],
+                                       'Output',
+                                       no_grad_set=set(['Input']),
+                                       user_defined_grads=[numeric_grads])
 
     cls_name = "{0}_{1}".format(parent.__name__, "CUDNNBF16")
     TestConv2DCUDNNBF16.__name__ = cls_name
@@ -217,7 +224,9 @@ def test_check_grad_no_input(self):
 
 
 def create_test_channel_last_class(parent):
+
     class TestChannelLastCase(parent):
+
         def init_data_format(self):
             self.data_format = "NHWC"
 
@@ -231,9 +240,11 @@ def init_test_case_2(self):
 
 
 def create_test_cudnn_channel_last_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCudnnChannelLastCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -252,9 +263,11 @@ def init_test_case_2(self):
 
 
 def create_test_cudnn_channel_last_fp16_class(parent, grad_check=True):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCudnnChannelLastFp16(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float16
@@ -268,14 +281,16 @@ def test_check_output(self):
         def test_check_grad_no_filter(self):
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place) and grad_check:
-                self.check_grad_with_place(
-                    place, ['Input'], 'Output', no_grad_set=set(['Filter']))
+                self.check_grad_with_place(place, ['Input'],
+                                           'Output',
+                                           no_grad_set=set(['Filter']))
 
         def test_check_grad_no_input(self):
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place) and grad_check:
-                self.check_grad_with_place(
-                    place, ['Filter'], 'Output', no_grad_set=set(['Input']))
+                self.check_grad_with_place(place, ['Filter'],
+                                           'Output',
+                                           no_grad_set=set(['Input']))
 
         def init_data_format(self):
             self.data_format = "NHWC"
@@ -290,7 +305,9 @@ def init_test_case_2(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0]
             self.padding_algorithm = "SAME"
@@ -301,7 +318,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1]
             self.padding_algorithm = "VALID"
@@ -312,9 +331,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_SAME_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingSMAECase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -330,9 +351,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_VALID_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingVALIDCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -348,6 +371,7 @@ def init_paddings(self):
 
 
 class TestConv2DOp(OpTest):
+
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
@@ -422,51 +446,49 @@ def setUp(self):
         self.outputs = {'Output': output}
 
     def has_cuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
+        return core.is_compiled_with_cuda() and (self.use_cudnn
+                                                 or self.use_cuda)
 
     def test_check_output(self):
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output_with_place(
-            place, atol=1e-5, check_dygraph=(self.use_mkldnn == False))
+        self.check_output_with_place(place,
+                                     atol=1e-5,
+                                     check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad(self):
-        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and
-                                        self.no_need_check_grad == True):
+        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad")
+                                        and self.no_need_check_grad == True):
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad_with_place(
-            place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.02,
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.02,
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_no_filter(self):
-        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and
-                                        self.no_need_check_grad == True):
+        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad")
+                                        and self.no_need_check_grad == True):
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad_with_place(
-            place, ['Input'],
-            'Output',
-            max_relative_error=0.02,
-            no_grad_set=set(['Filter']),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.02,
+                                   no_grad_set=set(['Filter']),
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_no_input(self):
-        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and
-                                        self.no_need_check_grad == True):
+        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad")
+                                        and self.no_need_check_grad == True):
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad_with_place(
-            place, ['Filter'],
-            'Output',
-            no_grad_set=set(['Input']),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, ['Filter'],
+                                   'Output',
+                                   no_grad_set=set(['Input']),
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -490,6 +512,7 @@ def init_kernel_type(self):
 
 
 class TestWithPad(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -500,6 +523,7 @@ def init_test_case(self):
 
 
 class TestWithStride(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -510,6 +534,7 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -521,6 +546,7 @@ def init_test_case(self):
 
 
 class TestWith1x1(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -534,6 +560,7 @@ def init_group(self):
 
 
 class TestWithDepthWise3x3(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -550,6 +577,7 @@ def init_group(self):
 
 
 class TestWithDepthWise5x5(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -563,6 +591,7 @@ def init_group(self):
 
 
 class TestWithDepthWise7x7(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -576,6 +605,7 @@ def init_group(self):
 
 
 class TestWithDilation(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -592,6 +622,7 @@ def init_group(self):
 
 
 class TestWithInput1x1Filter1x1(TestConv2DOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -633,6 +664,7 @@ def init_group(self):
 
 
 class TestCUDNNExhaustiveSearch(TestConv2DOp):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
@@ -640,13 +672,14 @@ def init_kernel_type(self):
 
 
 class TestConv2DOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 # the input of conv2d must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.CPUPlace())
                 fluid.layers.conv2d(x1, 1, 1)
 
             self.assertRaises(TypeError, test_Variable)
@@ -654,8 +687,9 @@ def test_Variable():
             def test_dtype():
                 # the input dtype of conv2d must be float16 or float32 or float64
                 # float16 only can be set on GPU place
-                x2 = fluid.layers.data(
-                    name='x2', shape=[3, 4, 5, 6], dtype="int32")
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[3, 4, 5, 6],
+                                       dtype="int32")
                 fluid.layers.conv2d(x2, 1, 1)
 
             self.assertRaises(TypeError, test_dtype)
@@ -671,6 +705,7 @@ def test_dtype():
 
 
 class TestConv2DOp_v2(OpTest):
+
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
@@ -704,9 +739,10 @@ def setUp(self):
         else:
             input2 = input
         filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
-        output, _, _, _, _ = conv2d_forward_naive(
-            input2, filter, self.groups, conv2d_param, self.padding_algorithm,
-            self.data_format)
+        output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups,
+                                                  conv2d_param,
+                                                  self.padding_algorithm,
+                                                  self.data_format)
         output = output.astype(self.dtype)
 
         self.inputs = {
@@ -729,48 +765,46 @@ def setUp(self):
         self.outputs = {'Output': output}
 
     def has_cuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
+        return core.is_compiled_with_cuda() and (self.use_cudnn
+                                                 or self.use_cuda)
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_output_with_place(
-            place, atol=1e-5, check_dygraph=(self.use_mkldnn == False))
+        self.check_output_with_place(place,
+                                     atol=1e-5,
+                                     check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.02,
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.02,
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_no_filter(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Input'],
-            'Output',
-            max_relative_error=0.02,
-            no_grad_set=set(['Filter']),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.02,
+                                   no_grad_set=set(['Filter']),
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_no_input(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Filter'],
-            'Output',
-            no_grad_set=set(['Input']),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, ['Filter'],
+                                   'Output',
+                                   no_grad_set=set(['Input']),
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -801,12 +835,14 @@ def init_test_case_2(self):
 
 
 class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
+
     def init_paddings(self):
         self.pad = [0, 0, 1, 2]
         self.padding_algorithm = "EXPLICIT"
 
 
 class TestWithPad_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -820,6 +856,7 @@ def init_paddings(self):
 
 
 class TestWithStride_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]  # NCHW
@@ -833,6 +870,7 @@ def init_paddings(self):
 
 
 class TestWithGroup_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 2]
@@ -844,6 +882,7 @@ def init_test_case(self):
 
 
 class TestWith1x1_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -860,6 +899,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise3x3_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [3, 4, 10, 10]  # NCHW
@@ -879,6 +919,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise5x5_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 4, 10, 10]  # NCHW
@@ -895,6 +936,7 @@ def init_paddings(self):
 
 
 class TestWithDepthWise7x7_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 8, 10, 10]  # NCHW
@@ -911,6 +953,7 @@ def init_paddings(self):
 
 
 class TestWithDilation_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -930,6 +973,7 @@ def init_paddings(self):
 
 
 class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [40, 3, 1, 1]  # NCHW
@@ -990,16 +1034,16 @@ def init_paddings(self):
 create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding)
 
-create_test_cudnn_channel_last_fp16_class(
-    TestConv2DOp_AsyPadding, grad_check=False)
-create_test_cudnn_channel_last_fp16_class(
-    TestWithPad_AsyPadding, grad_check=False)
-create_test_cudnn_channel_last_fp16_class(
-    TestWithStride_AsyPadding, grad_check=False)
-create_test_cudnn_channel_last_fp16_class(
-    TestWithGroup_AsyPadding, grad_check=False)
-create_test_cudnn_channel_last_fp16_class(
-    TestWithDilation_AsyPadding, grad_check=False)
+create_test_cudnn_channel_last_fp16_class(TestConv2DOp_AsyPadding,
+                                          grad_check=False)
+create_test_cudnn_channel_last_fp16_class(TestWithPad_AsyPadding,
+                                          grad_check=False)
+create_test_cudnn_channel_last_fp16_class(TestWithStride_AsyPadding,
+                                          grad_check=False)
+create_test_cudnn_channel_last_fp16_class(TestWithGroup_AsyPadding,
+                                          grad_check=False)
+create_test_cudnn_channel_last_fp16_class(TestWithDilation_AsyPadding,
+                                          grad_check=False)
 
 if __name__ == '__main__':
     paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op_depthwise_conv.py b/python/paddle/fluid/tests/unittests/test_conv2d_op_depthwise_conv.py
index 1b680c5a06be6..8e43e4d48de16 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op_depthwise_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op_depthwise_conv.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 import paddle
+
 paddle.enable_static()
 import paddle.fluid.core as core
 import paddle.fluid as fluid
@@ -29,6 +30,7 @@
 
 
 class TestDepthwiseConv(TestConv2DOp):
+
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -42,6 +44,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConv2(TestConv2DOp):
+
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -55,6 +58,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConv3(TestConv2DOp):
+
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -68,6 +72,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvWithDilation(TestConv2DOp):
+
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -82,6 +87,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvWithDilation2(TestConv2DOp):
+
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -96,6 +102,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvandFuse(TestConv2DOp):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -110,6 +117,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConv2andFuse(TestConv2DOp):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -124,6 +132,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConv3andFuse(TestConv2DOp):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -138,6 +147,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvWithDilationandFuse(TestConv2DOp):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -153,6 +163,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvWithDilation2andFuse(TestConv2DOp):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -168,6 +179,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConv_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.use_cuda = True
         self.stride = [2, 2]
@@ -184,6 +196,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConv2_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.use_cuda = True
         self.stride = [1, 1]
@@ -200,6 +213,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConv3_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.use_cuda = True
         self.stride = [1, 1]
@@ -216,6 +230,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConvWithDilation_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -234,6 +249,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConvWithDilation2_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -252,6 +268,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConvandFuse_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -270,6 +287,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConv2andFuse_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -288,6 +306,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConv3andFuse_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -306,6 +325,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConvWithDilationandFuse_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -325,6 +345,7 @@ def init_paddings(self):
 
 
 class TestDepthwiseConvWithDilation2andFuse_AsyPadding(TestConv2DOp_v2):
+
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
index 83d2734318961..74d50c545c658 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
@@ -21,6 +21,7 @@
 
 
 class Conv2DTransposeTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  batch_size=4,
@@ -60,8 +61,8 @@ def setUp(self):
             input_shape = (self.batch_size, ) + self.spartial_shape + (
                 self.num_channels, )
         else:
-            input_shape = (self.batch_size, self.num_channels
-                           ) + self.spartial_shape
+            input_shape = (self.batch_size,
+                           self.num_channels) + self.spartial_shape
         self.input = np.random.randn(*input_shape).astype(self.dtype)
 
         if isinstance(self.filter_size, int):
@@ -70,8 +71,8 @@ def setUp(self):
             filter_size = self.filter_size
         self.weight_shape = weight_shape = (self.num_channels, self.num_filters
                                             // self.groups) + tuple(filter_size)
-        self.weight = np.random.uniform(
-            -1, 1, size=weight_shape).astype(self.dtype)
+        self.weight = np.random.uniform(-1, 1,
+                                        size=weight_shape).astype(self.dtype)
         if not self.no_bias:
             self.bias = np.random.uniform(
                 -1, 1, size=(self.num_filters, )).astype(self.dtype)
@@ -118,27 +119,27 @@ def functional(self, place):
                 input_shape = (-1, -1, -1,self.num_channels) \
                     if self.channel_last else (-1, self.num_channels, -1, -1)
                 x_var = fluid.data("input", input_shape, dtype=self.dtype)
-                w_var = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
-                b_var = fluid.data(
-                    "bias", (self.num_filters, ), dtype=self.dtype)
+                w_var = fluid.data("weight",
+                                   self.weight_shape,
+                                   dtype=self.dtype)
+                b_var = fluid.data("bias", (self.num_filters, ),
+                                   dtype=self.dtype)
 
                 if self.output_padding != 0:
                     output_size = None
                 else:
                     output_size = self.output_size
 
-                y_var = F.conv2d_transpose(
-                    x_var,
-                    w_var,
-                    None if self.no_bias else b_var,
-                    output_size=output_size,
-                    padding=self.padding,
-                    output_padding=self.output_padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y_var = F.conv2d_transpose(x_var,
+                                           w_var,
+                                           None if self.no_bias else b_var,
+                                           output_size=output_size,
+                                           padding=self.padding,
+                                           output_padding=self.output_padding,
+                                           stride=self.stride,
+                                           dilation=self.dilation,
+                                           groups=self.groups,
+                                           data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
             feed_dict["bias"] = self.bias
@@ -155,16 +156,15 @@ def paddle_nn_layer(self):
         else:
             output_size = self.output_size
 
-        conv = nn.Conv2DTranspose(
-            self.num_channels,
-            self.num_filters,
-            self.filter_size,
-            padding=self.padding,
-            output_padding=self.output_padding,
-            stride=self.stride,
-            dilation=self.dilation,
-            groups=self.groups,
-            data_format=self.data_format)
+        conv = nn.Conv2DTranspose(self.num_channels,
+                                  self.num_filters,
+                                  self.filter_size,
+                                  padding=self.padding,
+                                  output_padding=self.output_padding,
+                                  stride=self.stride,
+                                  dilation=self.dilation,
+                                  groups=self.groups,
+                                  data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -194,6 +194,7 @@ def runTest(self):
 
 
 class Conv2DTransposeErrorTestCase(Conv2DTransposeTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -204,49 +205,46 @@ def runTest(self):
 def add_cases(suite):
     suite.addTest(Conv2DTransposeTestCase(methodName='runTest'))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', stride=[1, 2], no_bias=True, dilation=2))
-    suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest',
-            filter_size=(3, 3),
-            output_size=[20, 36],
-            stride=[1, 2],
-            dilation=2))
+        Conv2DTransposeTestCase(methodName='runTest',
+                                stride=[1, 2],
+                                no_bias=True,
+                                dilation=2))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', stride=2, dilation=(2, 1)))
+        Conv2DTransposeTestCase(methodName='runTest',
+                                filter_size=(3, 3),
+                                output_size=[20, 36],
+                                stride=[1, 2],
+                                dilation=2))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', padding="valid"))
+        Conv2DTransposeTestCase(methodName='runTest', stride=2,
+                                dilation=(2, 1)))
+    suite.addTest(Conv2DTransposeTestCase(methodName='runTest',
+                                          padding="valid"))
     suite.addTest(Conv2DTransposeTestCase(methodName='runTest', padding="same"))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', filter_size=1, padding=(2, 3)))
+        Conv2DTransposeTestCase(methodName='runTest',
+                                filter_size=1,
+                                padding=(2, 3)))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', padding=[1, 2, 2, 1]))
+        Conv2DTransposeTestCase(methodName='runTest', padding=[1, 2, 2, 1]))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', padding=[[0, 0], [0, 0], [1, 2], [2, 1]]))
+        Conv2DTransposeTestCase(methodName='runTest',
+                                padding=[[0, 0], [0, 0], [1, 2], [2, 1]]))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', data_format="NHWC"))
+        Conv2DTransposeTestCase(methodName='runTest', data_format="NHWC"))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest',
-            data_format="NHWC",
-            padding=[[0, 0], [1, 1], [2, 2], [0, 0]]))
+        Conv2DTransposeTestCase(methodName='runTest',
+                                data_format="NHWC",
+                                padding=[[0, 0], [1, 1], [2, 2], [0, 0]]))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest', groups=2, padding="valid"))
+        Conv2DTransposeTestCase(methodName='runTest', groups=2,
+                                padding="valid"))
     suite.addTest(
-        Conv2DTransposeTestCase(
-            methodName='runTest',
-            num_filters=6,
-            num_channels=3,
-            groups=3,
-            padding="valid"))
+        Conv2DTransposeTestCase(methodName='runTest',
+                                num_filters=6,
+                                num_channels=3,
+                                groups=3,
+                                padding="valid"))
     suite.addTest(
         Conv2DTransposeTestCase(
             methodName='runTest',
@@ -258,16 +256,18 @@ def add_cases(suite):
             padding=2,
             stride=2,
             output_size=[14, 14],
-            output_padding=[1, 1], ))
+            output_padding=[1, 1],
+        ))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv2DTransposeErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2))
+        Conv2DTransposeErrorTestCase(methodName='runTest',
+                                     num_channels=5,
+                                     groups=2))
     suite.addTest(
-        Conv2DTransposeErrorTestCase(
-            methodName='runTest', output_size="not_valid"))
+        Conv2DTransposeErrorTestCase(methodName='runTest',
+                                     output_size="not_valid"))
 
 
 def load_tests(loader, standard_tests, pattern):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 89125dc326d15..c10d71baf32de 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -19,6 +19,7 @@
 
 import paddle
 import paddle.nn as nn
+
 paddle.enable_static()
 import paddle.fluid.core as core
 import paddle.fluid as fluid
@@ -47,11 +48,12 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     # update pad and dilation
     def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
         padding = []
-        for input_size, filter_size, stride_size in zip(
-                input_shape, kernel_size, kernel_stride):
+        for input_size, filter_size, stride_size in zip(input_shape,
+                                                        kernel_size,
+                                                        kernel_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -85,8 +87,8 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
     if 'output_padding' in attrs:
         out_pad_h = attrs['output_padding'][0]
         out_pad_w = attrs['output_padding'][1]
-    out = np.zeros(
-        (in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), dtype=input_.dtype)
+    out = np.zeros((in_n, out_c, out_h + out_pad_h, out_w + out_pad_w),
+                   dtype=input_.dtype)
 
     for n in range(in_n):
         for i in range(in_h):
@@ -104,17 +106,18 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
                             axis=0)
                         i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
                         j1, j2 = j * stride[1], j * stride[1] + d_bolck_w
-                        out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
-                            dilations[1]] += tmp_out
+                        out[n, g * f_out_c + k, i1:i2:dilations[0],
+                            j1:j2:dilations[1]] += tmp_out
 
-    out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h, pad_w_0:out_w - pad_w_1
-              + out_pad_w]
+    out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h,
+              pad_w_0:out_w - pad_w_1 + out_pad_w]
     if attrs['data_format'] == 'NHWC':
         out = np.transpose(out, [0, 2, 3, 1])
     return out
 
 
 class TestConv2DTransposeOp(OpTest):
+
     def setUp(self):
         # init as conv transpose
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
@@ -169,37 +172,39 @@ def test_check_grad_no_input(self):
         if self.need_check_grad:
             if self.use_cudnn:
                 place = core.CUDAPlace(0)
-                self.check_grad_with_place(
-                    place, ['Filter'],
-                    'Output',
-                    max_relative_error=0.02,
-                    no_grad_set=set(['Input']))
+                self.check_grad_with_place(place, ['Filter'],
+                                           'Output',
+                                           max_relative_error=0.02,
+                                           no_grad_set=set(['Input']))
             else:
-                self.check_grad(
-                    ['Filter'], 'Output', no_grad_set=set(['Input']))
+                self.check_grad(['Filter'],
+                                'Output',
+                                no_grad_set=set(['Input']))
 
     def test_check_grad_no_filter(self):
         if self.need_check_grad:
             if self.use_cudnn:
                 place = core.CUDAPlace(0)
-                self.check_grad_with_place(
-                    place, ['Input'], 'Output', no_grad_set=set(['Filter']))
+                self.check_grad_with_place(place, ['Input'],
+                                           'Output',
+                                           no_grad_set=set(['Filter']))
             else:
-                self.check_grad(
-                    ['Input'], 'Output', no_grad_set=set(['Filter']))
+                self.check_grad(['Input'],
+                                'Output',
+                                no_grad_set=set(['Filter']))
 
     def test_check_grad(self):
         if self.need_check_grad:
             if self.use_cudnn:
                 place = core.CUDAPlace(0)
-                self.check_grad_with_place(
-                    place,
-                    set(['Input', 'Filter']),
-                    'Output',
-                    max_relative_error=0.02)
+                self.check_grad_with_place(place,
+                                           set(['Input', 'Filter']),
+                                           'Output',
+                                           max_relative_error=0.02)
             else:
-                self.check_grad(
-                    set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+                self.check_grad(set(['Input', 'Filter']),
+                                'Output',
+                                max_relative_error=0.02)
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -215,6 +220,7 @@ def init_op_type(self):
 
 
 class TestWithSymmetricPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -226,6 +232,7 @@ def init_test_case(self):
 
 
 class TestWithAsymmetricPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -237,6 +244,7 @@ def init_test_case(self):
 
 
 class TestWithSAMEPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [2, 1]
         self.dilations = [1, 2]
@@ -248,6 +256,7 @@ def init_test_case(self):
 
 
 class TestWithVALIDPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.dilations = [1, 1]
@@ -259,6 +268,7 @@ def init_test_case(self):
 
 
 class TestWithGroups(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -270,6 +280,7 @@ def init_test_case(self):
 
 
 class TestWithStride(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -281,6 +292,7 @@ def init_test_case(self):
 
 
 class TestWithDilation(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -292,6 +304,7 @@ def init_test_case(self):
 
 
 class TestWithEvenUpsample(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -304,6 +317,7 @@ def init_test_case(self):
 
 
 class TestWithEvenUpsampleOutputPadding(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -316,6 +330,7 @@ def init_test_case(self):
 
 
 class Test_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -328,6 +343,7 @@ def init_test_case(self):
 
 
 class TestWithSymmetricPad_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -340,6 +356,7 @@ def init_test_case(self):
 
 
 class TestWithAsymmetricPad_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -352,6 +369,7 @@ def init_test_case(self):
 
 
 class TestWithGroups_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -364,6 +382,7 @@ def init_test_case(self):
 
 
 class TestWithStride_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -376,6 +395,7 @@ def init_test_case(self):
 
 
 class TestWithDilation_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -388,6 +408,7 @@ def init_test_case(self):
 
 
 class TestWithEvenUpsample_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -401,6 +422,7 @@ def init_test_case(self):
 
 
 class TestWithEvenUpsample_NHWC_output_padding(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -417,6 +439,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNN(TestConv2DTransposeOp):
+
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv2d_transpose"
@@ -425,6 +448,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithSymmetricPad(TestWithSymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -442,6 +466,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -459,6 +484,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithSAMEPad(TestWithSAMEPad):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 2]
@@ -476,6 +502,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithVALIDPad(TestWithVALIDPad):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -493,6 +520,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -510,6 +538,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -528,6 +557,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithEvenUpsample(TestWithEvenUpsample):
+
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv2d_transpose"
@@ -551,6 +581,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNN_NHWC(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -569,6 +600,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -587,6 +619,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithAsymmetricPad_NHWC(TestWithSymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 0, 2, 3]
         self.stride = [2, 2]
@@ -605,6 +638,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithStride_NHWC(TestWithStride):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -623,6 +657,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithGroups_NHWC(TestWithGroups):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -641,6 +676,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithEvenUpsample_NHWC(TestWithEvenUpsample):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -660,6 +696,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNN_FP16(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [1, 1]
@@ -687,6 +724,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNN_NHWC_FP16(TestCUDNN_FP16):
+
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [0, 0]
@@ -702,6 +740,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithSymmetricPad_NHWC_FP16(TestCUDNN_FP16):
+
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [1, 1]
@@ -717,6 +756,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithAsymmetricPad_NHWC_FP16(TestCUDNN_FP16):
+
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [1, 0, 2, 3]
@@ -732,6 +772,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithStride_NHWC_FP16(TestCUDNN_FP16):
+
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [1, 1]
@@ -747,6 +788,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithGroups_NHWC_FP16(TestCUDNN_FP16):
+
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [1, 1]
@@ -762,6 +804,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithEvenUpsample_NHWC_FP16(TestCUDNN_FP16):
+
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [2, 2]
@@ -776,58 +819,56 @@ def init_test_case(self):
 
 
 class TestConv2DTransposeAPI(unittest.TestCase):
+
     def test_case1(self):
-        data1 = fluid.layers.data(
-            name='data1', shape=[3, 5, 5], dtype='float32')
-        data2 = fluid.layers.data(
-            name='data2', shape=[5, 5, 3], dtype='float32')
-        out1 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            data_format='NCHW')
-        out2 = fluid.layers.conv2d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            data_format='NHWC')
-        out3 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            data_format='NHWC')
-        out4 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=3,
-            num_filters=6,
-            filter_size=3,
-            padding=[[0, 0], [0, 0], [2, 1], [0, 0]],
-            data_format='NCHW')
-        out5 = fluid.layers.conv2d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding='SAME',
-            data_format='NCHW')
-        out6 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding='VALID',
-            data_format='NHWC')
-        out7 = fluid.layers.conv2d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            output_size=[7, 7],
-            padding=[0, 0],
-            data_format='NHWC')
+        data1 = fluid.layers.data(name='data1',
+                                  shape=[3, 5, 5],
+                                  dtype='float32')
+        data2 = fluid.layers.data(name='data2',
+                                  shape=[5, 5, 3],
+                                  dtype='float32')
+        out1 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             data_format='NCHW')
+        out2 = fluid.layers.conv2d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             data_format='NHWC')
+        out3 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding=[[0, 0], [1, 1], [1, 1],
+                                                      [0, 0]],
+                                             data_format='NHWC')
+        out4 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=3,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding=[[0, 0], [0, 0], [2, 1],
+                                                      [0, 0]],
+                                             data_format='NCHW')
+        out5 = fluid.layers.conv2d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding='SAME',
+                                             data_format='NCHW')
+        out6 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding='VALID',
+                                             data_format='NHWC')
+        out7 = fluid.layers.conv2d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             output_size=[7, 7],
+                                             padding=[0, 0],
+                                             data_format='NHWC')
 
         data1_np = np.random.random((2, 3, 5, 5)).astype("float32")
         data2_np = np.random.random((2, 5, 5, 3)).astype("float32")
@@ -838,12 +879,13 @@ def test_case1(self):
             place = core.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        results = exe.run(
-            fluid.default_main_program(),
-            feed={"data1": data1_np,
-                  "data2": data2_np},
-            fetch_list=[out1, out2, out3, out4, out5, out6, out7],
-            return_numpy=True)
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "data1": data1_np,
+                              "data2": data2_np
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5, out6, out7],
+                          return_numpy=True)
         self.assertIsNotNone(results[0])
         self.assertIsNotNone(results[1])
         self.assertIsNotNone(results[2])
@@ -854,71 +896,73 @@ def test_case1(self):
 
 
 class TestConv2DTransposeOpException(unittest.TestCase):
+
     def test_exception(self):
         data = fluid.layers.data(name='data', shape=[3, 5, 5], dtype="float32")
 
         def attr_data_format():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                data_format="NCDHW")
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                data_format="NCDHW")
 
         self.assertRaises(ValueError, attr_data_format)
 
         def attr_padding_str():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                padding='Vald')
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                padding='Vald')
 
         self.assertRaises(ValueError, attr_padding_str)
 
         def attr_padding_list():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                padding=[[1, 1], [1, 1], [0, 0], [0, 0]])
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                padding=[[1, 1], [1, 1], [0, 0],
+                                                         [0, 0]])
 
         self.assertRaises(ValueError, attr_padding_list)
 
         def attr_padding_with_data_format():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                padding=[[1, 1], [0, 0], [0, 0], [1, 1]],
-                data_format='NHWC')
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                padding=[[1, 1], [0, 0], [0, 0],
+                                                         [1, 1]],
+                                                data_format='NHWC')
 
         self.assertRaises(ValueError, attr_padding_with_data_format)
 
-        error_input = fluid.layers.data(
-            name='error_data', shape=[1], dtype="float32")
+        error_input = fluid.layers.data(name='error_data',
+                                        shape=[1],
+                                        dtype="float32")
 
         def error_input_size():
-            out = fluid.layers.conv2d_transpose(
-                input=error_input, groups=1, num_filters=6, filter_size=3)
+            out = fluid.layers.conv2d_transpose(input=error_input,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3)
 
         self.assertRaises(ValueError, error_input_size)
 
         def error_groups():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=0,
-                num_filters=6,
-                filter_size=3,
-                data_format='NHWC')
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=0,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                data_format='NHWC')
 
         self.assertRaises(ValueError, error_groups)
 
 
 class TestConv2DTransposeRepr(unittest.TestCase):
+
     def test_case(self):
         paddle.disable_static()
         x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op_depthwise_conv.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op_depthwise_conv.py
index 65c5d35fe53dd..665413ee4cff4 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op_depthwise_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op_depthwise_conv.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 import paddle
+
 paddle.enable_static()
 import paddle.fluid.core as core
 import paddle.fluid as fluid
@@ -26,6 +27,7 @@
 
 
 class TestDepthwiseConvTranspose(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -39,6 +41,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvTransposeAsymmetricPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1, 1, 2]
         self.stride = [1, 1]
@@ -53,6 +56,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvTransposeSAMEPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.dilations = [1, 1]
@@ -66,6 +70,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvTransposeVALIDPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.dilations = [1, 1]
@@ -79,6 +84,7 @@ def init_test_case(self):
 
 
 class TestDepthwiseConvTranspose_NHWC_3x3kernel(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
index dd6dcf6d5e9ae..42c23eb64fdd3 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
@@ -23,6 +23,7 @@
 
 
 class Conv3DTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  batch_size=4,
@@ -58,8 +59,8 @@ def setUp(self):
             input_shape = (self.batch_size, ) + self.spartial_shape + (
                 self.num_channels, )
         else:
-            input_shape = (self.batch_size, self.num_channels
-                           ) + self.spartial_shape
+            input_shape = (self.batch_size,
+                           self.num_channels) + self.spartial_shape
         self.input = np.random.randn(*input_shape).astype(self.dtype)
 
         if isinstance(self.filter_size, int):
@@ -68,8 +69,8 @@ def setUp(self):
             filter_size = self.filter_size
         self.weight_shape = weight_shape = (self.num_filters, self.num_channels
                                             // self.groups) + tuple(filter_size)
-        self.weight = np.random.uniform(
-            -1, 1, size=weight_shape).astype(self.dtype)
+        self.weight = np.random.uniform(-1, 1,
+                                        size=weight_shape).astype(self.dtype)
         if not self.no_bias:
             self.bias = np.random.uniform(
                 -1, 1, size=(self.num_filters, )).astype(self.dtype)
@@ -89,17 +90,16 @@ def fluid_layer(self, place):
                     bias_attr = False
                 else:
                     bias_attr = I.NumpyArrayInitializer(self.bias)
-                y_var = fluid.layers.conv3d(
-                    x_var,
-                    self.num_filters,
-                    self.filter_size,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    param_attr=weight_attr,
-                    bias_attr=bias_attr,
-                    data_format=self.data_format)
+                y_var = fluid.layers.conv3d(x_var,
+                                            self.num_filters,
+                                            self.filter_size,
+                                            padding=self.padding,
+                                            stride=self.stride,
+                                            dilation=self.dilation,
+                                            groups=self.groups,
+                                            param_attr=weight_attr,
+                                            bias_attr=bias_attr,
+                                            data_format=self.data_format)
         feed_dict = {"input": self.input}
         exe = fluid.Executor(place)
         exe.run(start)
@@ -114,19 +114,19 @@ def functional(self, place):
                 input_shape = (-1, -1, -1, -1, self.num_channels) \
                     if self.channel_last else (-1, self.num_channels, -1, -1, -1)
                 x_var = fluid.data("input", input_shape, dtype=self.dtype)
-                w_var = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
-                b_var = fluid.data(
-                    "bias", (self.num_filters, ), dtype=self.dtype)
-                y_var = F.conv3d(
-                    x_var,
-                    w_var,
-                    None if self.no_bias else b_var,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                w_var = fluid.data("weight",
+                                   self.weight_shape,
+                                   dtype=self.dtype)
+                b_var = fluid.data("bias", (self.num_filters, ),
+                                   dtype=self.dtype)
+                y_var = F.conv3d(x_var,
+                                 w_var,
+                                 None if self.no_bias else b_var,
+                                 padding=self.padding,
+                                 stride=self.stride,
+                                 dilation=self.dilation,
+                                 groups=self.groups,
+                                 data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
             feed_dict["bias"] = self.bias
@@ -138,15 +138,14 @@ def functional(self, place):
     def paddle_nn_layer(self):
         x_var = paddle.to_tensor(self.input)
         x_var.stop_gradient = False
-        conv = nn.Conv3D(
-            self.num_channels,
-            self.num_filters,
-            self.filter_size,
-            padding=self.padding,
-            stride=self.stride,
-            dilation=self.dilation,
-            groups=self.groups,
-            data_format=self.data_format)
+        conv = nn.Conv3D(self.num_channels,
+                         self.num_filters,
+                         self.filter_size,
+                         padding=self.padding,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -179,6 +178,7 @@ def runTest(self):
 
 
 class Conv3DErrorTestCase(Conv3DTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -189,50 +189,44 @@ def runTest(self):
 def add_cases(suite):
     suite.addTest(Conv3DTestCase(methodName='runTest'))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest', stride=[1, 2, 1], dilation=2))
+        Conv3DTestCase(methodName='runTest', stride=[1, 2, 1], dilation=2))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest', stride=2, dilation=(2, 1, 2)))
+        Conv3DTestCase(methodName='runTest', stride=2, dilation=(2, 1, 2)))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest', padding="same", no_bias=True))
+        Conv3DTestCase(methodName='runTest', padding="same", no_bias=True))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest', filter_size=(3, 2, 3), padding='valid'))
+        Conv3DTestCase(methodName='runTest',
+                       filter_size=(3, 2, 3),
+                       padding='valid'))
     suite.addTest(Conv3DTestCase(methodName='runTest', padding=(2, 3, 1)))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest', padding=[1, 2, 2, 1, 2, 3]))
+        Conv3DTestCase(methodName='runTest', padding=[1, 2, 2, 1, 2, 3]))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest',
-            padding=[[0, 0], [0, 0], [1, 2], [2, 1], [2, 2]]))
+        Conv3DTestCase(methodName='runTest',
+                       padding=[[0, 0], [0, 0], [1, 2], [2, 1], [2, 2]]))
     suite.addTest(Conv3DTestCase(methodName='runTest', data_format="NDHWC"))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest',
-            data_format="NDHWC",
-            padding=[[0, 0], [1, 1], [3, 3], [2, 2], [0, 0]]))
+        Conv3DTestCase(methodName='runTest',
+                       data_format="NDHWC",
+                       padding=[[0, 0], [1, 1], [3, 3], [2, 2], [0, 0]]))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest', groups=2, padding="valid"))
+        Conv3DTestCase(methodName='runTest', groups=2, padding="valid"))
     suite.addTest(
-        Conv3DTestCase(
-            methodName='runTest',
-            num_filters=6,
-            num_channels=3,
-            groups=3,
-            padding="valid"))
+        Conv3DTestCase(methodName='runTest',
+                       num_filters=6,
+                       num_channels=3,
+                       groups=3,
+                       padding="valid"))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv3DErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2))
+        Conv3DErrorTestCase(methodName='runTest', num_channels=5, groups=2))
     suite.addTest(
-        Conv3DErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2, padding=[-1, 1, 3]))
+        Conv3DErrorTestCase(methodName='runTest',
+                            num_channels=5,
+                            groups=2,
+                            padding=[-1, 1, 3]))
 
 
 def load_tests(loader, standard_tests, pattern):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index 8cf779ccfdd42..370a4820ddaf0 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -63,8 +63,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -106,8 +106,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                        constant_values=0)
 
     filter_dilation = np.zeros((f_n, f_c, d_bolck_d, d_bolck_h, d_bolck_w))
-    filter_dilation[:, :, 0:d_bolck_d:dilation[0], 0:d_bolck_h:dilation[1], 0:
-                    d_bolck_w:dilation[2]] = filter
+    filter_dilation[:, :, 0:d_bolck_d:dilation[0], 0:d_bolck_h:dilation[1],
+                    0:d_bolck_w:dilation[2]] = filter
 
     for d in range(out_d):
         for i in range(out_h):
@@ -131,9 +131,11 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 
 def create_test_cudnn_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -145,7 +147,9 @@ def init_kernel_type(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0, 0]
             self.padding_algorithm = "SAME"
@@ -156,7 +160,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1, 1]
             self.padding_algorithm = "VALID"
@@ -167,9 +173,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_SAME_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingSMAECase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -185,9 +193,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_VALID_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingVALIDCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -203,7 +213,9 @@ def init_paddings(self):
 
 
 def create_test_channel_last_class(parent):
+
     class TestChannelLastCase(parent):
+
         def init_data_format(self):
             self.data_format = "NDHWC"
 
@@ -217,9 +229,11 @@ def init_test_case_2(self):
 
 
 def create_test_cudnn_channel_last_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCudnnChannelLastCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float32 if core.is_compiled_with_rocm(
@@ -238,6 +252,7 @@ def init_test_case_2(self):
 
 
 class TestConv3DOp(OpTest):
+
     def setUp(self):
         self.op_type = "conv3d"
         self.use_cudnn = False
@@ -261,7 +276,8 @@ def setUp(self):
             input,
             filter,
             self.groups,
-            conv3d_param, ).astype(self.dtype)
+            conv3d_param,
+        ).astype(self.dtype)
 
         self.inputs = {
             'Input': OpTest.np_dtype_to_fluid_dtype(input),
@@ -284,43 +300,41 @@ def has_cudnn(self):
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_output_with_place(
-            place, atol=1e-5, check_dygraph=(self.use_mkldnn == False))
+        self.check_output_with_place(place,
+                                     atol=1e-5,
+                                     check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad_with_place(
-            place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.03,
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad_with_place(
-            place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Filter']),
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad_with_place(
-            place, ['Filter'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, ['Filter'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Input']),
+                                   check_dygraph=(self.use_mkldnn == False))
 
     def init_test_case(self):
         self.pad = [0, 0, 0]
@@ -344,6 +358,7 @@ def init_kernel_type(self):
 
 
 class TestCase1(TestConv3DOp):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -354,16 +369,19 @@ def init_test_case(self):
 
 
 class TestWithGroup1(TestConv3DOp):
+
     def init_group(self):
         self.groups = 3
 
 
 class TestWithGroup2(TestCase1):
+
     def init_group(self):
         self.groups = 3
 
 
 class TestWith1x1(TestConv3DOp):
+
     def init_test_case(self):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
@@ -380,6 +398,7 @@ def init_group(self):
 
 
 class TestWithInput1x1Filter1x1(TestConv3DOp):
+
     def init_test_case(self):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
@@ -396,6 +415,7 @@ def init_group(self):
 
 
 class TestWithDilation(TestConv3DOp):
+
     def init_test_case(self):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
@@ -417,6 +437,7 @@ def init_group(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNN(TestConv3DOp):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
@@ -425,6 +446,7 @@ def init_kernel_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16CUDNN(TestConv3DOp):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float16
@@ -439,6 +461,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestWithGroup1CUDNN(TestWithGroup1):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
@@ -447,6 +470,7 @@ def init_kernel_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16WithGroup1CUDNN(TestWithGroup1):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float16
@@ -461,6 +485,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestWithGroup2CUDNN(TestWithGroup2):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
@@ -469,6 +494,7 @@ def init_kernel_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16WithGroup2CUDNN(TestWithGroup2):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float16
@@ -483,6 +509,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestWith1x1CUDNN(TestWith1x1):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
@@ -491,6 +518,7 @@ def init_kernel_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16With1x1CUDNN(TestWith1x1):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float16
@@ -505,6 +533,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
@@ -513,6 +542,7 @@ def init_kernel_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float16
@@ -525,6 +555,7 @@ def test_check_output(self):
 
 
 class TestCUDNNExhaustiveSearch(TestCUDNN):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
@@ -535,6 +566,7 @@ def init_kernel_type(self):
 
 
 class TestConv3DOp_2(OpTest):
+
     def setUp(self):
         self.op_type = "conv3d"
         self.use_cudnn = False
@@ -589,28 +621,27 @@ def test_check_grad(self):
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03)
+        self.check_grad_with_place(place, {'Input', 'Filter'},
+                                   'Output',
+                                   max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']))
+        self.check_grad_with_place(place, ['Input'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
-        self.check_grad_with_place(
-            place, ['Filter'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']))
+        self.check_grad_with_place(place, ['Filter'],
+                                   'Output',
+                                   max_relative_error=0.03,
+                                   no_grad_set=set(['Input']))
 
     def init_test_case(self):
         self.stride = [1, 1, 1]
@@ -640,6 +671,7 @@ def init_data_format(self):
 
 
 class TestConv3DOp_AsyPadding(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 2]
         self.input_size = [2, 3, 4, 4, 4]  # NCDHW
@@ -653,6 +685,7 @@ def init_paddings(self):
 
 
 class TestConv3DOp_DiffDataInDiffDim(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 2]
         self.input_size = [2, 3, 4, 5, 5]  # NCDHW
@@ -671,6 +704,7 @@ def init_paddings(self):
 
 
 class TestCase1_AsyPadding(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 4, 4, 4]  # NCDHW
@@ -684,6 +718,7 @@ def init_paddings(self):
 
 
 class TestWithGroup1_AsyPadding(TestConv3DOp_2):
+
     def init_group(self):
         self.groups = 3
 
@@ -693,6 +728,7 @@ def init_paddings(self):
 
 
 class TestWithGroup2_AsyPadding(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 4, 4, 4]  # NCDHW
@@ -709,6 +745,7 @@ def init_paddings(self):
 
 
 class TestWith1x1_AsyPadding(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 4, 4, 4]
@@ -728,6 +765,7 @@ def init_paddings(self):
 
 
 class TestWithDilation_AsyPadding(TestConv3DOp_2):
+
     def init_test_case(self):
         self.stride = [1, 1, 1]
         self.input_size = [2, 3, 6, 6, 6]
@@ -793,210 +831,196 @@ def init_paddings(self):
 
 # --------- test python API ---------------
 class TestConv3DAPI(unittest.TestCase):
+
     def test_api(self):
 
-        input_NDHWC = fluid.layers.data(
-            name="input_NDHWC",
-            shape=[2, 5, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NCDHW = fluid.layers.data(
-            name="input_NCDHW",
-            shape=[2, 3, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        fluid.layers.conv3d(
-            input=input_NDHWC,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=0,
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=[1, 2, 1, 0, 1, 0],
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]],
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NDHWC,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]],
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NDHWC")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding="SAME",
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
-
-        fluid.layers.conv3d(
-            input=input_NCDHW,
-            num_filters=3,
-            filter_size=[3, 3, 3],
-            stride=[1, 1, 1],
-            padding="VALID",
-            dilation=[1, 1, 1],
-            groups=1,
-            data_format="NCDHW")
+        input_NDHWC = fluid.layers.data(name="input_NDHWC",
+                                        shape=[2, 5, 5, 5, 3],
+                                        append_batch_size=False,
+                                        dtype="float32")
+
+        input_NCDHW = fluid.layers.data(name="input_NCDHW",
+                                        shape=[2, 3, 5, 5, 3],
+                                        append_batch_size=False,
+                                        dtype="float32")
+
+        fluid.layers.conv3d(input=input_NDHWC,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=0,
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=[1, 2, 1, 0, 1, 0],
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]],
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NDHWC,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]],
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NDHWC")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding="SAME",
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
+
+        fluid.layers.conv3d(input=input_NCDHW,
+                            num_filters=3,
+                            filter_size=[3, 3, 3],
+                            stride=[1, 1, 1],
+                            padding="VALID",
+                            dilation=[1, 1, 1],
+                            groups=1,
+                            data_format="NCDHW")
 
 
 class TestConv3DAPI_Error(unittest.TestCase):
+
     def test_api(self):
-        input = fluid.layers.data(
-            name="input",
-            shape=[2, 5, 5, 5, 4],
-            append_batch_size=False,
-            dtype="float32")
+        input = fluid.layers.data(name="input",
+                                  shape=[2, 5, 5, 5, 4],
+                                  append_batch_size=False,
+                                  dtype="float32")
 
         # ValueError: cudnn
         def run_1():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=1,
-                use_cudnn=[0],
-                data_format="NCDHW")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=[0],
+                                data_format="NCDHW")
 
         self.assertRaises(ValueError, run_1)
 
         # ValueError: data_format
         def run_2():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=[3, 3, 3],
-                stride=[1, 1, 1],
-                padding=0,
-                dilation=[1, 1, 1],
-                groups=1,
-                use_cudnn=False,
-                data_format="NCHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=[3, 3, 3],
+                                stride=[1, 1, 1],
+                                padding=0,
+                                dilation=[1, 1, 1],
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCHWC")
 
         self.assertRaises(ValueError, run_2)
 
         # ValueError: padding
         def run_3():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding="SAMEE",
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NCDHW")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding="SAMEE",
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCDHW")
 
         self.assertRaises(ValueError, run_3)
 
         def run_4():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=[[0, 1], [0, 0], [0, 1], [0, 1], [0, 1]],
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NCDHW")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=[[0, 1], [0, 0], [0, 1], [0, 1], [0,
+                                                                          1]],
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NCDHW")
 
         self.assertRaises(ValueError, run_4)
 
         def run_5():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=0,
-                stride=0,
-                padding=[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1]],
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=0,
+                                stride=0,
+                                padding=[[0, 1], [0, 1], [0, 1], [0, 1], [0,
+                                                                          1]],
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_5)
 
         # ValueError: channel dimmention
-        x = fluid.layers.data(
-            name="x",
-            shape=[2, 5, 5, 5, -1],
-            append_batch_size=False,
-            dtype="float32")
+        x = fluid.layers.data(name="x",
+                              shape=[2, 5, 5, 5, -1],
+                              append_batch_size=False,
+                              dtype="float32")
 
         def run_6():
-            fluid.layers.conv3d(
-                input=x,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=1,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=x,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_6)
 
         # ValueError: groups
         def run_7():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=3,
-                filter_size=3,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=3,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=3,
+                                filter_size=3,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                groups=3,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_7)
 
         # ValueError: filter num
         def run_8():
-            fluid.layers.conv3d(
-                input=input,
-                num_filters=0,
-                filter_size=0,
-                stride=0,
-                padding=0,
-                dilation=0,
-                groups=1,
-                use_cudnn=False,
-                data_format="NDHWC")
+            fluid.layers.conv3d(input=input,
+                                num_filters=0,
+                                filter_size=0,
+                                stride=0,
+                                padding=0,
+                                dilation=0,
+                                groups=1,
+                                use_cudnn=False,
+                                data_format="NDHWC")
 
         self.assertRaises(ValueError, run_8)
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
index 19249fcfeb3a6..9ad3eaaccfcf2 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py
@@ -21,6 +21,7 @@
 
 
 class Conv3DTransposeTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  batch_size=2,
@@ -58,8 +59,8 @@ def setUp(self):
             input_shape = (self.batch_size, ) + self.spartial_shape + (
                 self.num_channels, )
         else:
-            input_shape = (self.batch_size, self.num_channels
-                           ) + self.spartial_shape
+            input_shape = (self.batch_size,
+                           self.num_channels) + self.spartial_shape
         self.input = np.random.randn(*input_shape).astype(self.dtype)
 
         if isinstance(self.filter_size, int):
@@ -68,8 +69,8 @@ def setUp(self):
             filter_size = self.filter_size
         self.weight_shape = weight_shape = (self.num_channels, self.num_filters
                                             // self.groups) + tuple(filter_size)
-        self.weight = np.random.uniform(
-            -1, 1, size=weight_shape).astype(self.dtype)
+        self.weight = np.random.uniform(-1, 1,
+                                        size=weight_shape).astype(self.dtype)
         if self.no_bias:
             self.bias = None
         else:
@@ -115,20 +116,20 @@ def functional(self, place):
                 input_shape = (-1, -1, -1, -1, self.num_channels) \
                     if self.channel_last else (-1, self.num_channels, -1, -1, -1)
                 x_var = fluid.data("input", input_shape, dtype=self.dtype)
-                w_var = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
-                b_var = fluid.data(
-                    "bias", (self.num_filters, ), dtype=self.dtype)
-                y_var = F.conv3d_transpose(
-                    x_var,
-                    w_var,
-                    None if self.no_bias else b_var,
-                    output_size=self.output_size,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                w_var = fluid.data("weight",
+                                   self.weight_shape,
+                                   dtype=self.dtype)
+                b_var = fluid.data("bias", (self.num_filters, ),
+                                   dtype=self.dtype)
+                y_var = F.conv3d_transpose(x_var,
+                                           w_var,
+                                           None if self.no_bias else b_var,
+                                           output_size=self.output_size,
+                                           padding=self.padding,
+                                           stride=self.stride,
+                                           dilation=self.dilation,
+                                           groups=self.groups,
+                                           data_format=self.data_format)
         feed_dict = {"input": self.input, "weight": self.weight}
         if self.bias is not None:
             feed_dict["bias"] = self.bias
@@ -139,15 +140,14 @@ def functional(self, place):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv3DTranspose(
-            self.num_channels,
-            self.num_filters,
-            self.filter_size,
-            padding=self.padding,
-            stride=self.stride,
-            dilation=self.dilation,
-            groups=self.groups,
-            data_format=self.data_format)
+        conv = nn.Conv3DTranspose(self.num_channels,
+                                  self.num_filters,
+                                  self.filter_size,
+                                  padding=self.padding,
+                                  stride=self.stride,
+                                  dilation=self.dilation,
+                                  groups=self.groups,
+                                  data_format=self.data_format)
         conv.weight.set_value(self.weight)
         if not self.no_bias:
             conv.bias.set_value(self.bias)
@@ -174,6 +174,7 @@ def runTest(self):
 
 
 class Conv3DTransposeErrorTestCase(Conv3DTransposeTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -184,63 +185,65 @@ def runTest(self):
 def add_cases(suite):
     suite.addTest(Conv3DTransposeTestCase(methodName='runTest'))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', stride=[1, 2, 1], dilation=2, no_bias=True))
-    suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest',
-            output_size=[12, 19, 12],
-            stride=[1, 2, 1],
-            dilation=2))
-    suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', stride=2, dilation=(2, 1, 2)))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                stride=[1, 2, 1],
+                                dilation=2,
+                                no_bias=True))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', padding="valid"))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                output_size=[12, 19, 12],
+                                stride=[1, 2, 1],
+                                dilation=2))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', padding='valid'))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                stride=2,
+                                dilation=(2, 1, 2)))
+    suite.addTest(Conv3DTransposeTestCase(methodName='runTest',
+                                          padding="valid"))
+    suite.addTest(Conv3DTransposeTestCase(methodName='runTest',
+                                          padding='valid'))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', filter_size=1, padding=(2, 3, 1)))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                filter_size=1,
+                                padding=(2, 3, 1)))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', padding=[1, 2, 2, 3, 2, 1]))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                padding=[1, 2, 2, 3, 2, 1]))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest',
-            padding=[[0, 0], [0, 0], [2, 3], [1, 2], [2, 1]]))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                padding=[[0, 0], [0, 0], [2, 3], [1, 2], [2,
+                                                                          1]]))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', data_format="NDHWC"))
+        Conv3DTransposeTestCase(methodName='runTest', data_format="NDHWC"))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest',
-            data_format="NDHWC",
-            padding=[[0, 0], [1, 1], [2, 2], [3, 3], [0, 0]]))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                data_format="NDHWC",
+                                padding=[[0, 0], [1, 1], [2, 2], [3, 3], [0,
+                                                                          0]]))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest', groups=2, padding="valid"))
+        Conv3DTransposeTestCase(methodName='runTest', groups=2,
+                                padding="valid"))
     suite.addTest(
-        Conv3DTransposeTestCase(
-            methodName='runTest',
-            num_filters=6,
-            num_channels=3,
-            groups=3,
-            padding="valid"))
+        Conv3DTransposeTestCase(methodName='runTest',
+                                num_filters=6,
+                                num_channels=3,
+                                groups=3,
+                                padding="valid"))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv3DTransposeErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2))
+        Conv3DTransposeErrorTestCase(methodName='runTest',
+                                     num_channels=5,
+                                     groups=2))
     suite.addTest(
-        Conv3DTransposeErrorTestCase(
-            methodName='runTest', output_size="not_valid"))
+        Conv3DTransposeErrorTestCase(methodName='runTest',
+                                     output_size="not_valid"))
     suite.addTest(
-        Conv3DTransposeErrorTestCase(
-            methodName='runTest', num_channels=5, groups=2, padding=[-1, 1, 3]))
+        Conv3DTransposeErrorTestCase(methodName='runTest',
+                                     num_channels=5,
+                                     groups=2,
+                                     padding=[-1, 1, 3]))
 
 
 def load_tests(loader, standard_tests, pattern):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index 1e4d09c509e6c..0042585aef8ed 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 import paddle
+
 paddle.enable_static()
 import paddle.fluid.core as core
 import paddle.fluid as fluid
@@ -45,11 +46,12 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs):
 
     def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
         padding = []
-        for input_size, filter_size, stride_size in zip(
-                input_shape, kernel_size, kernel_stride):
+        for input_size, filter_size, stride_size in zip(input_shape,
+                                                        kernel_size,
+                                                        kernel_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -85,31 +87,34 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
             for i in range(in_h):
                 for j in range(in_w):
                     for g in range(groups):
-                        input_masked = input_[n, g * sub_in_c:(g + 1
-                                                               ) * sub_in_c, d,
-                                              i, j]  # (c)
+                        input_masked = input_[n,
+                                              g * sub_in_c:(g + 1) * sub_in_c,
+                                              d, i, j]  # (c)
                         input_masked = np.reshape(input_masked,
                                                   (sub_in_c, 1, 1, 1))
                         input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
 
                         for k in range(f_out_c):
-                            tmp_out = np.sum(input_masked * filter_[
-                                g * sub_in_c:(g + 1) * sub_in_c, k, :, :, :],
+                            tmp_out = np.sum(input_masked *
+                                             filter_[g * sub_in_c:(g + 1) *
+                                                     sub_in_c, k, :, :, :],
                                              axis=0)
                             d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
                             i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
                             j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
-                            out[n, g * f_out_c + k, d1:d2:dilations[0], i1:i2:
-                                dilations[1], j1:j2:dilations[2]] += tmp_out
+                            out[n, g * f_out_c + k, d1:d2:dilations[0],
+                                i1:i2:dilations[1],
+                                j1:j2:dilations[2]] += tmp_out
 
-    out = out[:, :, pad_d_0:out_d - pad_d_1, pad_h_0:out_h - pad_h_1, pad_w_0:
-              out_w - pad_w_1]
+    out = out[:, :, pad_d_0:out_d - pad_d_1, pad_h_0:out_h - pad_h_1,
+              pad_w_0:out_w - pad_w_1]
     if attrs['data_format'] == 'NHWC':
         out = np.transpose(out, [0, 2, 3, 4, 1])
     return out
 
 
 class TestConv3DTransposeOp(OpTest):
+
     def setUp(self):
         # init as conv transpose
         self.use_cudnn = False
@@ -150,44 +155,40 @@ def test_check_output(self):
     def test_check_grad(self):
         if self.use_cudnn:
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['Input', 'Filter']),
-                'Output',
-                max_relative_error=0.03)
+            self.check_grad_with_place(place,
+                                       set(['Input', 'Filter']),
+                                       'Output',
+                                       max_relative_error=0.03)
         else:
-            self.check_grad(
-                set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+            self.check_grad(set(['Input', 'Filter']),
+                            'Output',
+                            max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
         if self.use_cudnn:
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
+            self.check_grad_with_place(place, ['Input'],
+                                       'Output',
+                                       max_relative_error=0.03,
+                                       no_grad_set=set(['Filter']))
         elif self.check_no_filter:
-            self.check_grad(
-                ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
+            self.check_grad(['Input'],
+                            'Output',
+                            max_relative_error=0.03,
+                            no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
         if self.use_cudnn:
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
+            self.check_grad_with_place(place, ['Filter'],
+                                       'Output',
+                                       max_relative_error=0.03,
+                                       no_grad_set=set(['Input']))
         elif self.check_no_input:
-            self.check_grad(
-                ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
+            self.check_grad(['Filter'],
+                            'Output',
+                            max_relative_error=0.03,
+                            no_grad_set=set(['Input']))
 
     def init_test_case(self):
         self.pad = [0, 0, 0]
@@ -203,6 +204,7 @@ def init_op_type(self):
 
 
 class TestWithSymmetricPad(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.check_no_input = True
         self.pad = [1, 1, 1]
@@ -215,6 +217,7 @@ def init_test_case(self):
 
 
 class TestWithAsymmetricPad(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 0, 1, 2]
         self.stride = [1, 1, 1]
@@ -226,6 +229,7 @@ def init_test_case(self):
 
 
 class TestWithSAMEPad(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.stride = [1, 1, 2]
         self.dilations = [1, 2, 1]
@@ -237,6 +241,7 @@ def init_test_case(self):
 
 
 class TestWithVALIDPad(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.stride = [2, 1, 1]
         self.dilations = [1, 1, 1]
@@ -248,6 +253,7 @@ def init_test_case(self):
 
 
 class TestWithStride(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.check_no_filter = True
         self.pad = [1, 1, 1]
@@ -260,6 +266,7 @@ def init_test_case(self):
 
 
 class TestWithGroups(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -271,6 +278,7 @@ def init_test_case(self):
 
 
 class TestWithDilation(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -282,6 +290,7 @@ def init_test_case(self):
 
 
 class Test_NHWC(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
@@ -297,6 +306,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNN(TestConv3DTransposeOp):
+
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv3d_transpose"
@@ -305,6 +315,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithSymmetricPad(TestWithSymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -322,6 +333,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 1, 1, 0, 0, 2]
         self.stride = [1, 1, 1]
@@ -339,6 +351,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithSAMEPad(TestWithSAMEPad):
+
     def init_test_case(self):
         self.stride = [1, 1, 2]
         self.dilations = [1, 2, 1]
@@ -356,6 +369,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithVALIDPad(TestWithVALIDPad):
+
     def init_test_case(self):
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
@@ -373,6 +387,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
@@ -390,6 +405,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -422,6 +438,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNN_NHWC(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
@@ -440,6 +457,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -458,6 +476,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 0, 0, 2]
         self.stride = [1, 1, 1]
@@ -476,6 +495,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithStride_NHWC(TestWithStride):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
@@ -494,6 +514,7 @@ def init_op_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNWithGroups_NHWC(TestWithGroups):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py
index d597045641913..74122abf77b43 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py
@@ -24,6 +24,7 @@
 
 
 class TestWithSymmetricPad_NHWC(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
@@ -36,6 +37,7 @@ def init_test_case(self):
 
 
 class TestWithAsymmetricPad_NHWC(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 0, 1, 2]
         self.stride = [1, 1, 1]
@@ -48,6 +50,7 @@ def init_test_case(self):
 
 
 class TestWithGroups_NHWC(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.check_no_filter = True
         self.pad = [1, 1, 1]
@@ -61,6 +64,7 @@ def init_test_case(self):
 
 
 class TestWithStride_NHWC(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
@@ -73,6 +77,7 @@ def init_test_case(self):
 
 
 class TestWithDilation_NHWC(TestConv3DTransposeOp):
+
     def init_test_case(self):
         self.check_no_input = True
         self.pad = [1, 1, 1]
@@ -86,59 +91,57 @@ def init_test_case(self):
 
 
 class TestConv3DTransposeAPI(unittest.TestCase):
+
     def test_case1(self):
-        data1 = fluid.layers.data(
-            name='data1', shape=[3, 5, 5, 5], dtype='float32')
-        data2 = fluid.layers.data(
-            name='data2', shape=[5, 5, 5, 3], dtype='float32')
-
-        out1 = fluid.layers.conv3d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            data_format='NCDHW')
-        out2 = fluid.layers.conv3d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            data_format='NDHWC')
-        out3 = fluid.layers.conv3d_transpose(
-            input=data1,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding=[[0, 0], [0, 0], [1, 1], [0, 0], [1, 1]],
-            data_format='NCDHW')
-        out4 = fluid.layers.conv3d_transpose(
-            input=data2,
-            groups=3,
-            num_filters=6,
-            filter_size=3,
-            padding=[[0, 0], [0, 0], [1, 1], [1, 2], [0, 0]],
-            data_format='NDHWC')
-        out5 = fluid.layers.conv3d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding='SAME',
-            data_format='NCDHW')
-        out6 = fluid.layers.conv3d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            filter_size=3,
-            padding='VALID',
-            data_format='NDHWC')
-        out7 = fluid.layers.conv3d_transpose(
-            input=data2,
-            groups=1,
-            num_filters=6,
-            output_size=[7, 7, 7],
-            padding=[0, 0, 0],
-            data_format='NDHWC')
+        data1 = fluid.layers.data(name='data1',
+                                  shape=[3, 5, 5, 5],
+                                  dtype='float32')
+        data2 = fluid.layers.data(name='data2',
+                                  shape=[5, 5, 5, 3],
+                                  dtype='float32')
+
+        out1 = fluid.layers.conv3d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             data_format='NCDHW')
+        out2 = fluid.layers.conv3d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             data_format='NDHWC')
+        out3 = fluid.layers.conv3d_transpose(input=data1,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding=[[0, 0], [0, 0], [1, 1],
+                                                      [0, 0], [1, 1]],
+                                             data_format='NCDHW')
+        out4 = fluid.layers.conv3d_transpose(input=data2,
+                                             groups=3,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding=[[0, 0], [0, 0], [1, 1],
+                                                      [1, 2], [0, 0]],
+                                             data_format='NDHWC')
+        out5 = fluid.layers.conv3d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding='SAME',
+                                             data_format='NCDHW')
+        out6 = fluid.layers.conv3d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             filter_size=3,
+                                             padding='VALID',
+                                             data_format='NDHWC')
+        out7 = fluid.layers.conv3d_transpose(input=data2,
+                                             groups=1,
+                                             num_filters=6,
+                                             output_size=[7, 7, 7],
+                                             padding=[0, 0, 0],
+                                             data_format='NDHWC')
 
         data1_np = np.random.random((2, 3, 5, 5, 5)).astype("float32")
         data2_np = np.random.random((2, 5, 5, 5, 3)).astype("float32")
@@ -149,12 +152,13 @@ def test_case1(self):
             place = core.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        results = exe.run(
-            fluid.default_main_program(),
-            feed={"data1": data1_np,
-                  "data2": data2_np},
-            fetch_list=[out1, out2, out3, out4, out5, out6, out7],
-            return_numpy=True)
+        results = exe.run(fluid.default_main_program(),
+                          feed={
+                              "data1": data1_np,
+                              "data2": data2_np
+                          },
+                          fetch_list=[out1, out2, out3, out4, out5, out6, out7],
+                          return_numpy=True)
         self.assertIsNotNone(results[0])
         self.assertIsNotNone(results[1])
         self.assertIsNotNone(results[2])
@@ -165,48 +169,48 @@ def test_case1(self):
 
 
 class TestConv3DTransposeOpException(unittest.TestCase):
+
     def test_exception(self):
-        data = fluid.layers.data(
-            name='data', shape=[3, 5, 5, 5], dtype="float32")
+        data = fluid.layers.data(name='data',
+                                 shape=[3, 5, 5, 5],
+                                 dtype="float32")
 
         def attr_data_format():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                data_format="NCDW")
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                data_format="NCDW")
 
         self.assertRaises(ValueError, attr_data_format)
 
         def attr_padding_str():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                padding='Vald')
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                padding='Vald')
 
         self.assertRaises(ValueError, attr_padding_str)
 
         def attr_padding_list():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                padding=[[1, 1], [1, 1], [0, 0], [0, 0], [1, 1]])
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                padding=[[1, 1], [1, 1], [0, 0],
+                                                         [0, 0], [1, 1]])
 
         self.assertRaises(ValueError, attr_padding_list)
 
         def attr_padding_with_data_format():
-            out = fluid.layers.conv2d_transpose(
-                input=data,
-                groups=1,
-                num_filters=6,
-                filter_size=3,
-                padding=[[1, 1], [0, 0], [0, 0], [1, 0], [1, 1]],
-                data_format='NDHWC')
+            out = fluid.layers.conv2d_transpose(input=data,
+                                                groups=1,
+                                                num_filters=6,
+                                                filter_size=3,
+                                                padding=[[1, 1], [0, 0], [0, 0],
+                                                         [1, 0], [1, 1]],
+                                                data_format='NDHWC')
 
         self.assertRaises(ValueError, attr_padding_with_data_format)
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
index 5bff8b3142106..2bee23cbdbdd1 100644
--- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py
@@ -27,6 +27,7 @@
 
 
 class TestConvDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 4, 3, 3]
@@ -40,8 +41,11 @@ def func(self, place):
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -53,6 +57,7 @@ def test_grad(self):
 
 
 class TestConvDoubleGradCheckTest0(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 4, 3, 3]
@@ -66,8 +71,11 @@ def func(self, place):
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -78,6 +86,7 @@ def test_grad(self):
 
 
 class TestConvDoubleGradCheckTest1(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 3, 3, 3]
@@ -91,8 +100,11 @@ def func(self, place):
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -103,6 +115,7 @@ def test_grad(self):
 
 
 class TestConv3DDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 4, 3, 4, 2]
@@ -116,8 +129,11 @@ def func(self, place):
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         #places = [fluid.CPUPlace()]
@@ -129,6 +145,7 @@ def test_grad(self):
 
 
 class TestConv3DDoubleGradCheckTest1(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 4, 5, 3, 2]
@@ -142,8 +159,11 @@ def func(self, place):
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -154,27 +174,30 @@ def test_grad(self):
 
 
 class TestConv2DoubleGradCheck_AsyPadding(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 3, 3]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 0, 1],
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv2d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding=[1, 0, 0, 1],
+                          bias_attr=False,
+                          use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -185,27 +208,30 @@ def test_grad(self):
 
 
 class TestConv2DoubleGradCheck_PaddingSAME(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 3, 3]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="SAME",
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv2d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding="SAME",
+                          bias_attr=False,
+                          use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -216,27 +242,30 @@ def test_grad(self):
 
 
 class TestConv2DoubleGradCheck_PaddingVALID(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 3, 3]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="VALID",
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv2d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding="VALID",
+                          bias_attr=False,
+                          use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -247,29 +276,32 @@ def test_grad(self):
 
 
 class TestConv2DoubleGradCheck_ChannelLast(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 3, 3]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 1],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NHWC")
+        y = layers.conv2d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding=[1, 1],
+                          bias_attr=False,
+                          use_cudnn=True,
+                          groups=1,
+                          data_format="NHWC")
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -280,29 +312,32 @@ def test_grad(self):
 
 
 class TestConv2DoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 3, 3]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 1, 0],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NHWC")
+        y = layers.conv2d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding=[1, 0, 1, 0],
+                          bias_attr=False,
+                          use_cudnn=True,
+                          groups=1,
+                          data_format="NHWC")
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -313,27 +348,30 @@ def test_grad(self):
 
 
 class TestConv3DDoubleGradCheck_AsyPadding(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 2, 2, 2]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 0, 1, 1, 2],
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv3d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding=[1, 0, 0, 1, 1, 2],
+                          bias_attr=False,
+                          use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -344,28 +382,31 @@ def test_grad(self):
 
 
 class TestConv3DoubleGradCheck_PaddingSAME(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 2, 2, 2]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="SAME",
-            groups=1,
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv3d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding="SAME",
+                          groups=1,
+                          bias_attr=False,
+                          use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -376,27 +417,30 @@ def test_grad(self):
 
 
 class TestConv3DoubleGradCheck_PaddingVALID(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 3, 3, 2]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="VALID",
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv3d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding="VALID",
+                          bias_attr=False,
+                          use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -407,29 +451,32 @@ def test_grad(self):
 
 
 class TestConv3DDoubleGradCheck_ChannelLast(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 2, 2, 3]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 1, 1],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NDHWC")
+        y = layers.conv3d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding=[1, 1, 1],
+                          bias_attr=False,
+                          use_cudnn=True,
+                          groups=1,
+                          data_format="NDHWC")
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -440,29 +487,32 @@ def test_grad(self):
 
 
 class TestConv3DDoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 2, 2, 2, 3]
         eps = 0.005
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 1, 0, 1, 0],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NDHWC")
+        y = layers.conv3d(input=x,
+                          num_filters=2,
+                          filter_size=1,
+                          padding=[1, 0, 1, 0, 1, 0],
+                          bias_attr=False,
+                          use_cudnn=True,
+                          groups=1,
+                          data_format="NDHWC")
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -473,6 +523,7 @@ def test_grad(self):
 
 
 class TestDepthWiseConvDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [2, 4, 3, 3]
@@ -480,20 +531,27 @@ def func(self, place):
         dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
         x = layers.data('x', shape, False, dtype)
 
-        # condition of depthwise conv: 
+        # condition of depthwise conv:
         # use_cudnn == False
         # groups == filters
         # num_filters % num_channels == 0
-        y = layers.conv2d(
-            x, shape[1], 1, groups=shape[1], bias_attr=False, use_cudnn=False)
+        y = layers.conv2d(x,
+                          shape[1],
+                          1,
+                          groups=shape[1],
+                          bias_attr=False,
+                          use_cudnn=False)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
         w_arr = []
         for p in w:
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x] + w,
+                                           y,
+                                           x_init=[x_arr] + w_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = []
@@ -504,6 +562,7 @@ def test_grad(self):
 
 
 class TestDepthWiseConvDoubleGradCheckCase1(unittest.TestCase):
+
     def depthwise_conv2d_wrapper(self, x):
         return paddle.nn.functional.conv2d(x[0], x[1], groups=4)
 
@@ -516,7 +575,7 @@ def func(self, place):
         x = layers.data('x', x_shape, False, dtype)
         w = layers.data('w', w_shape, False, dtype)
 
-        # condition of depthwise conv: 
+        # condition of depthwise conv:
         # use_cudnn == False
         # groups == filters
         # num_filters % num_channels == 0
@@ -525,8 +584,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
         w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, w],
+                                           y,
+                                           x_init=[x_arr, w_arr],
+                                           place=place,
+                                           eps=eps)
         gradient_checker.double_grad_check_for_dygraph(
             self.depthwise_conv2d_wrapper, [x, w],
             y,
@@ -542,6 +604,7 @@ def test_grad(self):
 
 
 class TestConv3DDoubleGradCheck_NN(unittest.TestCase):
+
     def conv3d_wrapper(self, x):
         return paddle.nn.functional.conv3d(x[0], x[1])
 
@@ -559,10 +622,16 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
         w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.conv3d_wrapper, [x, w], y, x_init=[x_arr, w_arr], place=place)
+        gradient_checker.double_grad_check([x, w],
+                                           y,
+                                           x_init=[x_arr, w_arr],
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.conv3d_wrapper,
+                                                       [x, w],
+                                                       y,
+                                                       x_init=[x_arr, w_arr],
+                                                       place=place)
 
     def test_grad(self):
         places = []
diff --git a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
index 59241a408697d..4718d94ba4fb0 100644
--- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
@@ -31,6 +31,7 @@ def conv_shift_forward(x, y):
 
 
 class TestConvShiftOp(OpTest):
+
     def setUp(self):
         self.op_type = "conv_shift"
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py
index b9e9224b9e402..45b0a2b991ea6 100644
--- a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py
@@ -27,6 +27,7 @@
 
 
 class TestConvTransposeDoubleGradCheck(unittest.TestCase):
+
     def conv_transpose_wrapper(self, x):
         return paddle.nn.functional.conv2d_transpose(x[0], x[1], groups=1)
 
@@ -38,8 +39,11 @@ def func(self, place):
         if core.is_compiled_with_rocm():
             dtype = np.float32
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d_transpose(
-            x, 2, filter_size=1, groups=1, bias_attr=False)
+        y = layers.conv2d_transpose(x,
+                                    2,
+                                    filter_size=1,
+                                    groups=1,
+                                    bias_attr=False)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
@@ -48,16 +52,18 @@ def func(self, place):
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
         if core.is_compiled_with_rocm():
             # HIP will sometimes fail if no atol
-            gradient_checker.double_grad_check(
-                [x] + w,
-                y,
-                x_init=[x_arr] + w_arr,
-                place=place,
-                eps=eps,
-                atol=1e-4)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps,
+                                               atol=1e-4)
         else:
-            gradient_checker.double_grad_check(
-                [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps)
         gradient_checker.double_grad_check_for_dygraph(
             self.conv_transpose_wrapper, [x] + w,
             y,
@@ -75,9 +81,12 @@ def test_grad(self):
 
 class TestConvTranspose2DoubleGradCheck_AsyPadding(
         TestConvTransposeDoubleGradCheck):
+
     def conv_transpose_wrapper(self, x):
-        return paddle.nn.functional.conv2d_transpose(
-            x[0], x[1], groups=1, padding=[1, 0, 0, 1])
+        return paddle.nn.functional.conv2d_transpose(x[0],
+                                                     x[1],
+                                                     groups=1,
+                                                     padding=[1, 0, 0, 1])
 
     @prog_scope()
     def func(self, place):
@@ -87,13 +96,12 @@ def func(self, place):
         if core.is_compiled_with_rocm():
             dtype = np.float32
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d_transpose(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 0, 1],
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv2d_transpose(input=x,
+                                    num_filters=2,
+                                    filter_size=1,
+                                    padding=[1, 0, 0, 1],
+                                    bias_attr=False,
+                                    use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
@@ -102,16 +110,18 @@ def func(self, place):
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
         if core.is_compiled_with_rocm():
             # HIP will sometimes fail if no atol
-            gradient_checker.double_grad_check(
-                [x] + w,
-                y,
-                x_init=[x_arr] + w_arr,
-                place=place,
-                eps=eps,
-                atol=1e-4)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps,
+                                               atol=1e-4)
         else:
-            gradient_checker.double_grad_check(
-                [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps)
         gradient_checker.double_grad_check_for_dygraph(
             self.conv_transpose_wrapper, [x] + w,
             y,
@@ -121,9 +131,12 @@ def func(self, place):
 
 class TestConvTranspose2DoubleGradCheck_PaddingSAME(
         TestConvTransposeDoubleGradCheck):
+
     def conv_transpose_wrapper(self, x):
-        return paddle.nn.functional.conv2d_transpose(
-            x[0], x[1], groups=1, padding="SAME")
+        return paddle.nn.functional.conv2d_transpose(x[0],
+                                                     x[1],
+                                                     groups=1,
+                                                     padding="SAME")
 
     @prog_scope()
     def func(self, place):
@@ -133,13 +146,12 @@ def func(self, place):
         if core.is_compiled_with_rocm():
             dtype = np.float32
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d_transpose(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="SAME",
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv2d_transpose(input=x,
+                                    num_filters=2,
+                                    filter_size=1,
+                                    padding="SAME",
+                                    bias_attr=False,
+                                    use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
@@ -148,16 +160,18 @@ def func(self, place):
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
         if core.is_compiled_with_rocm():
             # HIP will sometimes fail if no atol
-            gradient_checker.double_grad_check(
-                [x] + w,
-                y,
-                x_init=[x_arr] + w_arr,
-                place=place,
-                eps=eps,
-                atol=1e-4)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps,
+                                               atol=1e-4)
         else:
-            gradient_checker.double_grad_check(
-                [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps)
         gradient_checker.double_grad_check_for_dygraph(
             self.conv_transpose_wrapper, [x] + w,
             y,
@@ -167,9 +181,12 @@ def func(self, place):
 
 class TestConvTranspose2DoubleGradCheck_PaddingVALID(
         TestConvTransposeDoubleGradCheck):
+
     def conv_transpose_wrapper(self, x):
-        return paddle.nn.functional.conv2d_transpose(
-            x[0], x[1], groups=1, padding="VALID")
+        return paddle.nn.functional.conv2d_transpose(x[0],
+                                                     x[1],
+                                                     groups=1,
+                                                     padding="VALID")
 
     @prog_scope()
     def func(self, place):
@@ -179,13 +196,12 @@ def func(self, place):
         if core.is_compiled_with_rocm():
             dtype = np.float32
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d_transpose(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="VALID",
-            bias_attr=False,
-            use_cudnn=True)
+        y = layers.conv2d_transpose(input=x,
+                                    num_filters=2,
+                                    filter_size=1,
+                                    padding="VALID",
+                                    bias_attr=False,
+                                    use_cudnn=True)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
@@ -194,16 +210,18 @@ def func(self, place):
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
         if core.is_compiled_with_rocm():
             # HIP will sometimes fail if no atol
-            gradient_checker.double_grad_check(
-                [x] + w,
-                y,
-                x_init=[x_arr] + w_arr,
-                place=place,
-                eps=eps,
-                atol=1e-4)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps,
+                                               atol=1e-4)
         else:
-            gradient_checker.double_grad_check(
-                [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps)
         gradient_checker.double_grad_check_for_dygraph(
             self.conv_transpose_wrapper, [x] + w,
             y,
@@ -213,9 +231,13 @@ def func(self, place):
 
 class TestConvTranspose2DoubleGradCheck_ChannelLast(
         TestConvTransposeDoubleGradCheck):
+
     def conv_transpose_wrapper(self, x):
-        return paddle.nn.functional.conv2d_transpose(
-            x[0], x[1], groups=1, padding=[1, 1], data_format="NHWC")
+        return paddle.nn.functional.conv2d_transpose(x[0],
+                                                     x[1],
+                                                     groups=1,
+                                                     padding=[1, 1],
+                                                     data_format="NHWC")
 
     @prog_scope()
     def func(self, place):
@@ -225,15 +247,14 @@ def func(self, place):
         if core.is_compiled_with_rocm():
             dtype = np.float32
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d_transpose(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 1],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NHWC")
+        y = layers.conv2d_transpose(input=x,
+                                    num_filters=2,
+                                    filter_size=1,
+                                    padding=[1, 1],
+                                    bias_attr=False,
+                                    use_cudnn=True,
+                                    groups=1,
+                                    data_format="NHWC")
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
         w = fluid.default_main_program().global_block().all_parameters()
@@ -242,16 +263,18 @@ def func(self, place):
             w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
         if core.is_compiled_with_rocm():
             # HIP will sometimes fail if no atol
-            gradient_checker.double_grad_check(
-                [x] + w,
-                y,
-                x_init=[x_arr] + w_arr,
-                place=place,
-                eps=eps,
-                atol=1e-4)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps,
+                                               atol=1e-4)
         else:
-            gradient_checker.double_grad_check(
-                [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+            gradient_checker.double_grad_check([x] + w,
+                                               y,
+                                               x_init=[x_arr] + w_arr,
+                                               place=place,
+                                               eps=eps)
         gradient_checker.double_grad_check_for_dygraph(
             self.conv_transpose_wrapper, [x] + w,
             y,
diff --git a/python/paddle/fluid/tests/unittests/test_corr.py b/python/paddle/fluid/tests/unittests/test_corr.py
index 1e1dd3b369584..6a9d931e22daf 100644
--- a/python/paddle/fluid/tests/unittests/test_corr.py
+++ b/python/paddle/fluid/tests/unittests/test_corr.py
@@ -31,6 +31,7 @@ def numpy_corr(np_arr, rowvar=True, dtype='float64'):
 
 
 class Corr_Test(unittest.TestCase):
+
     def setUp(self):
         self.shape = [4, 5]
 
@@ -52,8 +53,7 @@ def test_tensor_corr_default(self):
                 np_corr = numpy_corr(np_arr, rowvar=True, dtype=dtype)
                 if dtype == 'float32':
                     self.assertTrue(
-                        np.allclose(
-                            np_corr, corr.numpy(), atol=1.e-5))
+                        np.allclose(np_corr, corr.numpy(), atol=1.e-5))
                 else:
                     self.assertTrue(np.allclose(np_corr, corr.numpy()))
 
@@ -76,29 +76,32 @@ def test_tensor_corr_rowvar(self):
                 np_corr = numpy_corr(np_arr, rowvar=False, dtype=dtype)
                 if dtype == 'float32':
                     self.assertTrue(
-                        np.allclose(
-                            np_corr, corr.numpy(), atol=1.e-5))
+                        np.allclose(np_corr, corr.numpy(), atol=1.e-5))
                 else:
                     self.assertTrue(np.allclose(np_corr, corr.numpy()))
 
 
 # Input(x) only support N-D (1<=N<=2) tensor
 class Corr_Test2(Corr_Test):
+
     def setUp(self):
         self.shape = [10]
 
 
 class Corr_Test3(Corr_Test):
+
     def setUp(self):
         self.shape = [4, 5]
 
 
 # Input(x) only support N-D (1<=N<=2) tensor
 class Corr_Test4(unittest.TestCase):
+
     def setUp(self):
         self.shape = [2, 5, 2]
 
     def test_errors(self):
+
         def test_err():
             np_arr = np.random.rand(*self.shape).astype('float64')
             tensor = paddle.to_tensor(np_arr)
@@ -109,6 +112,7 @@ def test_err():
 
 # test unsupported complex input
 class Corr_Comeplex_Test(unittest.TestCase):
+
     def setUp(self):
         self.dtype = 'complex128'
 
@@ -120,6 +124,7 @@ def test_errors(self):
 
 
 class Corr_Test5(Corr_Comeplex_Test):
+
     def setUp(self):
         self.dtype = 'complex64'
 
diff --git a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
index 23b0fcc691a9a..8e5d30d274377 100644
--- a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
@@ -22,6 +22,7 @@
 
 
 class TestCosSimOp(OpTest):
+
     def setUp(self):
         self.op_type = "cos_sim"
         self.inputs = {
@@ -45,15 +46,20 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.06,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.06,
+                        no_grad_set=set('Y'))
 
 
 class TestCosSimOp2(TestCosSimOp):
+
     def setUp(self):
         self.op_type = "cos_sim"
         self.inputs = {
@@ -72,6 +78,7 @@ def setUp(self):
 
 
 class TestCosSimOp3(TestCosSimOp):
+
     def setUp(self):
         self.op_type = "cos_sim"
         self.inputs = {
@@ -90,6 +97,7 @@ def setUp(self):
 
 
 class TestCosSimOp4(TestCosSimOp):
+
     def setUp(self):
         self.op_type = "cos_sim"
         self.inputs = {
@@ -108,13 +116,14 @@ def setUp(self):
 
 
 class TestCosSimOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of batch_norm must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-            x2 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
+            x2 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.cos_sim, x1, x2)
 
             # the input dtype of batch_norm must be float32
diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
index 0b6e5b444caf7..45000c3aef8a8 100644
--- a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
+++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
@@ -24,6 +24,7 @@
 
 
 class TestCosineSimilarityAPI(unittest.TestCase):
+
     def setUp(self):
         self.places = [paddle.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -53,8 +54,10 @@ def check_static_result(self, place):
             result = F.cosine_similarity(x1, x2, axis=axis, eps=eps)
             exe = Executor(place)
             fetches = exe.run(default_main_program(),
-                              feed={"x1": np_x1,
-                                    "x2": np_x2},
+                              feed={
+                                  "x1": np_x1,
+                                  "x2": np_x2
+                              },
                               fetch_list=[result])
 
             np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
diff --git a/python/paddle/fluid/tests/unittests/test_cost_model.py b/python/paddle/fluid/tests/unittests/test_cost_model.py
index 79e2b78792142..7557093285855 100644
--- a/python/paddle/fluid/tests/unittests/test_cost_model.py
+++ b/python/paddle/fluid/tests/unittests/test_cost_model.py
@@ -26,6 +26,7 @@
 
 
 class TestCostModel(unittest.TestCase):
+
     def test_profiler_measure_empty_program(self):
         cost_model = core.CostModel()
         empty_program = paddle.static.Program()
@@ -71,16 +72,16 @@ def test_static_op_benchmark_cost_model(self):
         print("conv2d_op_time:", conv2d_op_time)
         print("conv2d_op_config:", conv2d_op_config)
 
-        conv2d_backward_op_cost = cost_model.get_static_op_time(
-            "conv2d", forward=False)
+        conv2d_backward_op_cost = cost_model.get_static_op_time("conv2d",
+                                                                forward=False)
         conv2d_backward_op_time = conv2d_backward_op_cost["op_time"]
         conv2d_backward_op_config = conv2d_backward_op_cost["config"]
         self.assertGreater(float(conv2d_backward_op_time), 0)
         print("conv2d_backward_op_time:", conv2d_backward_op_time)
         print("conv2d_backward_op_config:", conv2d_backward_op_config)
 
-        conv2d_fp16_op_cost = cost_model.get_static_op_time(
-            "conv2d", dtype="float16")
+        conv2d_fp16_op_cost = cost_model.get_static_op_time("conv2d",
+                                                            dtype="float16")
         conv2d_fp16_op_time = conv2d_fp16_op_cost["op_time"]
         conv2d_fp16_op_config = conv2d_fp16_op_cost["config"]
         self.assertGreater(float(conv2d_fp16_op_time), 0)
diff --git a/python/paddle/fluid/tests/unittests/test_cov.py b/python/paddle/fluid/tests/unittests/test_cov.py
index 5c4b9cbab2790..c67b2c2d35770 100644
--- a/python/paddle/fluid/tests/unittests/test_cov.py
+++ b/python/paddle/fluid/tests/unittests/test_cov.py
@@ -29,6 +29,7 @@ def numpy_cov(np_arr, rowvar=True, ddof=1, fweights=None, aweights=None):
 
 
 class Cov_Test(unittest.TestCase):
+
     def setUp(self):
         self.shape = [20, 10]
         self.weightshape = [10]
@@ -53,8 +54,11 @@ def func_test_tensor_cov_default(self):
                                         ddof=True,
                                         fweights=None,
                                         aweights=None)
-                np_cov = numpy_cov(
-                    np_arr, rowvar=True, ddof=1, fweights=None, aweights=None)
+                np_cov = numpy_cov(np_arr,
+                                   rowvar=True,
+                                   ddof=1,
+                                   fweights=None,
+                                   aweights=None)
                 self.assertTrue(np.allclose(np_cov, cov.numpy()))
 
     def test_tensor_cov_default(self):
@@ -82,8 +86,11 @@ def func_test_tensor_cov_rowvar(self):
                                         ddof=True,
                                         fweights=None,
                                         aweights=None)
-                np_cov = numpy_cov(
-                    np_arr, rowvar=False, ddof=1, fweights=None, aweights=None)
+                np_cov = numpy_cov(np_arr,
+                                   rowvar=False,
+                                   ddof=1,
+                                   fweights=None,
+                                   aweights=None)
                 self.assertTrue(np.allclose(np_cov, cov.numpy()))
 
     def test_tensor_cov_rowvar(self):
@@ -111,8 +118,11 @@ def func_test_tensor_cov_ddof(self):
                                         ddof=False,
                                         fweights=None,
                                         aweights=None)
-                np_cov = numpy_cov(
-                    np_arr, rowvar=True, ddof=0, fweights=None, aweights=None)
+                np_cov = numpy_cov(np_arr,
+                                   rowvar=True,
+                                   ddof=0,
+                                   fweights=None,
+                                   aweights=None)
                 self.assertTrue(np.allclose(np_cov, cov.numpy()))
 
     def test_tensor_cov_ddof(self):
@@ -134,8 +144,8 @@ def func_test_tensor_cov_fweights(self):
 
             for dtype in typelist:
                 np_arr = np.random.rand(*self.shape).astype(dtype)
-                np_fw = np.random.randint(
-                    10, size=self.weightshape).astype('int32')
+                np_fw = np.random.randint(10,
+                                          size=self.weightshape).astype('int32')
                 tensor = paddle.to_tensor(np_arr, place=p)
                 fweights = paddle.to_tensor(np_fw, place=p)
                 cov = paddle.linalg.cov(tensor,
@@ -143,8 +153,11 @@ def func_test_tensor_cov_fweights(self):
                                         ddof=True,
                                         fweights=fweights,
                                         aweights=None)
-                np_cov = numpy_cov(
-                    np_arr, rowvar=True, ddof=1, fweights=np_fw, aweights=None)
+                np_cov = numpy_cov(np_arr,
+                                   rowvar=True,
+                                   ddof=1,
+                                   fweights=np_fw,
+                                   aweights=None)
                 self.assertTrue(np.allclose(np_cov, cov.numpy()))
 
     def test_tensor_cov_fweights(self):
@@ -166,8 +179,8 @@ def func_test_tensor_cov_aweights(self):
 
             for dtype in typelist:
                 np_arr = np.random.rand(*self.shape).astype(dtype)
-                np_aw = np.random.randint(
-                    10, size=self.weightshape).astype('int32')
+                np_aw = np.random.randint(10,
+                                          size=self.weightshape).astype('int32')
                 tensor = paddle.to_tensor(np_arr, place=p)
                 aweights = paddle.to_tensor(np_aw, place=p)
                 cov = paddle.linalg.cov(tensor,
@@ -175,8 +188,11 @@ def func_test_tensor_cov_aweights(self):
                                         ddof=True,
                                         fweights=None,
                                         aweights=aweights)
-                np_cov = numpy_cov(
-                    np_arr, rowvar=True, ddof=1, fweights=None, aweights=np_aw)
+                np_cov = numpy_cov(np_arr,
+                                   rowvar=True,
+                                   ddof=1,
+                                   fweights=None,
+                                   aweights=np_aw)
                 self.assertTrue(np.allclose(np_cov, cov.numpy()))
 
     def test_tensor_cov_aweights(self):
@@ -198,8 +214,8 @@ def func_test_tensor_cov_weights(self):
 
             for dtype in typelist:
                 np_arr = np.random.rand(*self.shape).astype(dtype)
-                np_fw = np.random.randint(
-                    10, size=self.weightshape).astype('int64')
+                np_fw = np.random.randint(10,
+                                          size=self.weightshape).astype('int64')
                 np_aw = np.random.rand(*self.weightshape).astype('float64')
                 tensor = paddle.to_tensor(np_arr, place=p)
                 fweights = paddle.to_tensor(np_fw, place=p)
@@ -209,8 +225,11 @@ def func_test_tensor_cov_weights(self):
                                         ddof=True,
                                         fweights=fweights,
                                         aweights=aweights)
-                np_cov = numpy_cov(
-                    np_arr, rowvar=True, ddof=1, fweights=np_fw, aweights=np_aw)
+                np_cov = numpy_cov(np_arr,
+                                   rowvar=True,
+                                   ddof=1,
+                                   fweights=np_fw,
+                                   aweights=np_aw)
                 self.assertTrue(np.allclose(np_cov, cov.numpy()))
 
     def test_tensor_cov_weights(self):
@@ -220,6 +239,7 @@ def test_tensor_cov_weights(self):
 
 
 class Cov_Test2(Cov_Test):
+
     def setUp(self):
         self.shape = [10]
         self.weightshape = [10]
@@ -227,6 +247,7 @@ def setUp(self):
 
 # Input(x) only support N-D (1<=N<=2) tensor
 class Cov_Test3(unittest.TestCase):
+
     def setUp(self):
         self.shape = [2, 5, 10]
         self.fweightshape = [10]
@@ -235,12 +256,13 @@ def setUp(self):
         self.aw_s = 1.
 
     def func_test_errors(self):
+
         def test_err():
             np_arr = np.random.rand(*self.shape).astype('float64')
-            np_fw = self.fw_s * np.random.rand(
-                *self.fweightshape).astype('int32')
-            np_aw = self.aw_s * np.random.rand(
-                *self.aweightshape).astype('float64')
+            np_fw = self.fw_s * np.random.rand(*self.fweightshape).astype(
+                'int32')
+            np_aw = self.aw_s * np.random.rand(*self.aweightshape).astype(
+                'float64')
             tensor = paddle.to_tensor(np_arr)
             fweights = paddle.to_tensor(np_fw)
             aweights = paddle.to_tensor(np_aw)
@@ -260,6 +282,7 @@ def test_errors(self):
 
 #Input(fweights) only support N-D (N<=1) tensor
 class Cov_Test4(Cov_Test3):
+
     def setUp(self):
         self.shape = [5, 10]
         self.fweightshape = [2, 10]
@@ -270,6 +293,7 @@ def setUp(self):
 
 #The number of Input(fweights) should equal to x's dim[1]
 class Cov_Test5(Cov_Test3):
+
     def setUp(self):
         self.shape = [5, 10]
         self.fweightshape = [5]
@@ -280,6 +304,7 @@ def setUp(self):
 
 #The value of Input(fweights) cannot be negtive
 class Cov_Test6(Cov_Test3):
+
     def setUp(self):
         self.shape = [5, 10]
         self.fweightshape = [10]
@@ -290,6 +315,7 @@ def setUp(self):
 
 #Input(aweights) only support N-D (N<=1) tensor
 class Cov_Test7(Cov_Test3):
+
     def setUp(self):
         self.shape = [5, 10]
         self.fweightshape = [10]
@@ -300,6 +326,7 @@ def setUp(self):
 
 #The number of Input(aweights) should equal to x's dim[1]
 class Cov_Test8(Cov_Test3):
+
     def setUp(self):
         self.shape = [5, 10]
         self.fweightshape = [10]
@@ -310,6 +337,7 @@ def setUp(self):
 
 #The value of Input(aweights) cannot be negtive
 class Cov_Test9(Cov_Test3):
+
     def setUp(self):
         self.shape = [5, 10]
         self.fweightshape = [10]
diff --git a/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py b/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py
index 1def2ffd82ad7..4b351b9f19919 100644
--- a/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py
+++ b/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class LinearNet(nn.Layer):
+
     def __init__(self):
         super(LinearNet, self).__init__()
         self._linear1 = nn.Linear(10, 10)
@@ -58,6 +59,7 @@ def train(print_result=False):
 
 
 class TestSpawn(unittest.TestCase):
+
     def test_spawn(self):
         dist.spawn(train, backend='gloo', nprocs=4)
 
diff --git a/python/paddle/fluid/tests/unittests/test_create_global_var.py b/python/paddle/fluid/tests/unittests/test_create_global_var.py
index 39fb0355190c6..1517ce6422282 100644
--- a/python/paddle/fluid/tests/unittests/test_create_global_var.py
+++ b/python/paddle/fluid/tests/unittests/test_create_global_var.py
@@ -20,6 +20,7 @@
 
 
 class TestCreateGlobalVarError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -33,8 +34,8 @@ def test_shape_item():
 
             self.assertRaises(TypeError, test_shape_item)
 
-            # Since create_global_var support all dtype in convert_dtype(). 
-            # Hence, assertRaises ValueError not TypeError. 
+            # Since create_global_var support all dtype in convert_dtype().
+            # Hence, assertRaises ValueError not TypeError.
             def test_dtype():
                 fluid.layers.create_global_var([1, 2, 3], 2.0, np.complex128)
 
diff --git a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
index fd34c8fc9390b..207bef9ed8143 100644
--- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
+++ b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
@@ -19,6 +19,7 @@
 
 
 class TestDocString(unittest.TestCase):
+
     def test_layer_doc_string(self):
         print(layers.dropout.__doc__)
 
diff --git a/python/paddle/fluid/tests/unittests/test_create_parameter.py b/python/paddle/fluid/tests/unittests/test_create_parameter.py
index fb4b5e4b6fa88..85a3045881fb4 100644
--- a/python/paddle/fluid/tests/unittests/test_create_parameter.py
+++ b/python/paddle/fluid/tests/unittests/test_create_parameter.py
@@ -22,6 +22,7 @@
 
 
 class TestCreateParameterError(unittest.TestCase):
+
     def func_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -37,16 +38,18 @@ def test_shape_item():
             self.assertRaises(TypeError, test_shape_item)
 
             def test_attr():
-                fluid.layers.create_parameter(
-                    [1, 2, 3], np.float32, attr=np.array([i for i in range(6)]))
+                fluid.layers.create_parameter([1, 2, 3],
+                                              np.float32,
+                                              attr=np.array(
+                                                  [i for i in range(6)]))
 
             self.assertRaises(TypeError, test_attr)
 
             def test_default_initializer():
-                fluid.layers.create_parameter(
-                    [1, 2, 3],
-                    np.float32,
-                    default_initializer=np.array([i for i in range(6)]))
+                fluid.layers.create_parameter([1, 2, 3],
+                                              np.float32,
+                                              default_initializer=np.array(
+                                                  [i for i in range(6)]))
 
             self.assertRaises(TypeError, test_default_initializer)
 
diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
index 6f594d16074b4..6724c327b6020 100644
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
@@ -22,6 +22,7 @@
 
 
 class CRFDecoding(object):
+
     def __init__(self, emission_weights, transition_weights,
                  seq_start_positions):
         assert (emission_weights.shape[0] == sum(seq_start_positions))
@@ -35,10 +36,10 @@ def __init__(self, emission_weights, transition_weights,
         self.b = transition_weights[1, :]
         self.w = transition_weights[2:, :]
 
-        self.track = np.zeros(
-            (sum(seq_start_positions), self.tag_num), dtype="int64")
-        self.decoded_path = np.zeros(
-            (sum(seq_start_positions), 1), dtype="int64")
+        self.track = np.zeros((sum(seq_start_positions), self.tag_num),
+                              dtype="int64")
+        self.decoded_path = np.zeros((sum(seq_start_positions), 1),
+                                     dtype="int64")
 
     def _decode_one_sequence(self, decoded_path, x):
         seq_len, tag_num = x.shape
@@ -137,19 +138,19 @@ def setUp(self):
 
         self.init_lod()
         total_len = sum(self.lod[-1])
-        transition = np.repeat(
-            np.arange(
-                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            TAG_NUM + 2,
-            axis=0)
-        emission = np.repeat(
-            np.arange(
-                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            total_len,
-            axis=0)
-
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64")
+        transition = np.repeat(np.arange(TAG_NUM,
+                                         dtype="float64").reshape(1, TAG_NUM),
+                               TAG_NUM + 2,
+                               axis=0)
+        emission = np.repeat(np.arange(TAG_NUM,
+                                       dtype="float64").reshape(1, TAG_NUM),
+                             total_len,
+                             axis=0)
+
+        labels = np.random.randint(low=0,
+                                   high=TAG_NUM,
+                                   size=(total_len, 1),
+                                   dtype="int64")
         predicted_labels = np.ones(
             (total_len, 1), dtype="int64") * (TAG_NUM - 1)
         expected_output = (labels == predicted_labels).astype("int64")
@@ -167,11 +168,13 @@ def test_check_output(self):
 
 
 class TestCRFDecodingOp3(TestCRFDecodingOp2):
+
     def init_lod(self):
         self.lod = [[1, 0, 0, 4]]
 
 
 class TestCRFDecodingOp4(TestCRFDecodingOp2):
+
     def init_lod(self):
         self.lod = [[0, 2, 3, 0]]
 
@@ -228,6 +231,7 @@ def test_check_output(self):
 
 
 class TestCRFDecodingOp6(OpTest):
+
     def init_lod(self):
         self.lod = [[1, 2, 3, 4]]
 
@@ -237,19 +241,19 @@ def setUp(self):
 
         self.init_lod()
         total_len = sum(self.lod[-1])
-        transition = np.repeat(
-            np.arange(
-                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            TAG_NUM + 2,
-            axis=0)
-        emission = np.repeat(
-            np.arange(
-                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            total_len,
-            axis=0)
-
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64")
+        transition = np.repeat(np.arange(TAG_NUM,
+                                         dtype="float64").reshape(1, TAG_NUM),
+                               TAG_NUM + 2,
+                               axis=0)
+        emission = np.repeat(np.arange(TAG_NUM,
+                                       dtype="float64").reshape(1, TAG_NUM),
+                             total_len,
+                             axis=0)
+
+        labels = np.random.randint(low=0,
+                                   high=TAG_NUM,
+                                   size=(total_len, 1),
+                                   dtype="int64")
         predicted_labels = np.ones(
             (total_len, 1), dtype="int64") * (TAG_NUM - 1)
         expected_output = (labels == predicted_labels).astype("int64")
diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py
index acb652ad6f9e8..29d0bdde6e9d8 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
@@ -20,6 +20,7 @@
 
 
 def crop(data, offsets, crop_shape):
+
     def indexOf(shape, index):
         result = []
         for dim in reversed(shape):
@@ -41,6 +42,7 @@ def indexOf(shape, index):
 
 
 class TestCropOp(OpTest):
+
     def setUp(self):
         self.op_type = "crop"
         self.crop_by_input = False
@@ -78,6 +80,7 @@ def test_check_grad_normal(self):
 
 
 class TestCase1(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (16, 8, 32)
         self.crop_shape = [2, 2, 3]
@@ -85,6 +88,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (15, 8)
         self.crop_shape = [15, 8]
@@ -92,6 +96,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (4, 8, 16)
         self.crop_shape = [2, 2, 3]
@@ -100,6 +105,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 10)
         self.crop_shape = [10, 10]
@@ -108,6 +114,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (3, 4, 10)
         self.crop_shape = [2, 2, 3]
@@ -116,6 +123,7 @@ def initTestCase(self):
 
 
 class TestCase6(TestCropOp):
+
     def initTestCase(self):
         self.x_shape = (10, 9, 14)
         self.crop_shape = [3, 3, 5]
diff --git a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
index 04e47bd30ce24..49805c578bf47 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
@@ -22,6 +22,7 @@
 
 
 def crop(data, offsets, crop_shape):
+
     def indexOf(shape, index):
         result = []
         for dim in reversed(shape):
@@ -43,6 +44,7 @@ def indexOf(shape, index):
 
 
 class TestCropTensorOp(OpTest):
+
     def setUp(self):
         self.op_type = "crop_tensor"
         self.shape_by_input = False
@@ -85,6 +87,7 @@ def test_check_grad_normal(self):
 
 
 class TestCase1(TestCropTensorOp):
+
     def initTestCase(self):
         self.x_shape = (100)
         self.crop_shape = [64]
@@ -92,6 +95,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestCropTensorOp):
+
     def initTestCase(self):
         self.x_shape = (12, 24)
         self.crop_shape = [-1, 8]
@@ -99,6 +103,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestCropTensorOp):
+
     def initTestCase(self):
         self.x_shape = (4, 8, 16)
         self.crop_shape = [2, 2, 3]
@@ -107,6 +112,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestCropTensorOp):
+
     def initTestCase(self):
         self.x_shape = (8, 3, 6, 6)
         self.crop_shape = [-1, 3, -1, 4]
@@ -115,6 +121,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestCropTensorOp):
+
     def initTestCase(self):
         self.x_shape = (2, 4, 5, 8, 8)
         self.crop_shape = [1, 1, 2, 4, 4]
@@ -123,6 +130,7 @@ def initTestCase(self):
 
 
 class TestCase6(TestCropTensorOp):
+
     def initTestCase(self):
         self.x_shape = (2, 2, 4, 4, 4, 2)
         self.crop_shape = [1, 1, 4, 2, 2, 2]
@@ -132,6 +140,7 @@ def initTestCase(self):
 
 
 class TestCropTensorOpTensorAttr(OpTest):
+
     def setUp(self):
         self.op_type = "crop_tensor"
         self.OffsetsTensor = False
@@ -183,6 +192,7 @@ def test_check_grad_normal(self):
 
 
 class TestCropTensorOpTensorAttrCase1(TestCropTensorOpTensorAttr):
+
     def initTestCase(self):
         self.x_shape = (16, 8, 32)
         self.crop_shape = [-1, -1, 3]
@@ -191,6 +201,7 @@ def initTestCase(self):
 
 
 class TestCropTensorOpTensorAttrCase2(TestCropTensorOpTensorAttr):
+
     def initTestCase(self):
         self.x_shape = (4, 8, 16, 8)
         self.crop_shape = [2, 2, 3, 4]
@@ -199,6 +210,7 @@ def initTestCase(self):
 
 
 class TestCropTensorOpTensorAttrCase3(TestCropTensorOpTensorAttr):
+
     def initTestCase(self):
         self.x_shape = (16, 8, 32)
         self.crop_shape = [2, 2, 3]
@@ -209,6 +221,7 @@ def initTestCase(self):
 
 
 class TestCropTensorOpTensorAttrCase4(TestCropTensorOpTensorAttr):
+
     def initTestCase(self):
         self.x_shape = (16, 8, 32)
         self.crop_shape = [2, 2, 3]
@@ -219,6 +232,7 @@ def initTestCase(self):
 
 
 class TestCropTensorException(unittest.TestCase):
+
     def test_exception(self):
         input1 = fluid.data(name="input1", shape=[2, 3, 6, 6], dtype="float32")
         input2 = fluid.data(name="input2", shape=[2, 3, 6, 6], dtype="float16")
@@ -241,12 +255,14 @@ def attr_offsets_type():
             out = paddle.crop(input1, shape=[2, 2, 3, 3], offsets=0)
 
         def attr_offsets_dtype():
-            out = paddle.crop(
-                input1, shape=[2, 2, 3, 3], offsets=[0, 1.0, 0, 0])
+            out = paddle.crop(input1,
+                              shape=[2, 2, 3, 3],
+                              offsets=[0, 1.0, 0, 0])
 
         def attr_offsets_value():
-            out = paddle.crop(
-                input1, shape=[2, 2, 3, 3], offsets=[0, -1, offset, 0])
+            out = paddle.crop(input1,
+                              shape=[2, 2, 3, 3],
+                              offsets=[0, -1, offset, 0])
 
         def input_dtype():
             out = paddle.crop(input2, shape=[2, 2, 3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
index e9f7e6ef050ea..f332800bdd4c5 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
@@ -19,6 +19,7 @@
 
 
 class CrossEntropy2OpTestBase(OpTest):
+
     def initParameters(self):
         return [32, 64], 'float64', -100, False
 
@@ -39,10 +40,11 @@ def setUp(self):
         feature_size = int(self.shape[-1])
         batch_size = int(np.prod(self.shape) / feature_size)
         logits = (np.random.random(size=self.shape) + 1).astype(self.dtype)
-        label_shape = self.shape[0:-1] if self.drop_last_dim else self.shape[
-            0:-1] + [1]
-        label = np.random.random_integers(
-            low=0, high=feature_size - 1, size=label_shape).astype('int64')
+        label_shape = self.shape[
+            0:-1] if self.drop_last_dim else self.shape[0:-1] + [1]
+        label = np.random.random_integers(low=0,
+                                          high=feature_size - 1,
+                                          size=label_shape).astype('int64')
         outputs, match_x = self.calc_output(
             np.reshape(logits, [batch_size, feature_size]),
             np.reshape(label, [batch_size, 1]), self.ignore_index)
@@ -51,8 +53,7 @@ def setUp(self):
         self.outputs = {
             'Y': np.reshape(outputs, out_shape),
             'MatchX': np.reshape(match_x, self.shape[:-1] + [1]),
-            'XShape': np.zeros(
-                shape=logits.shape, dtype=logits.dtype)
+            'XShape': np.zeros(shape=logits.shape, dtype=logits.dtype)
         }
         self.attrs = {'ignore_index': self.ignore_index}
 
@@ -60,33 +61,37 @@ def test_check_output(self):
         self.check_output(no_check_set=['XShape'])
 
     def test_check_grad(self):
-        self.check_grad(
-            inputs_to_check=['X'],
-            output_names=['Y'],
-            no_grad_set=['XShape', 'MatchX', 'Label'])
+        self.check_grad(inputs_to_check=['X'],
+                        output_names=['Y'],
+                        no_grad_set=['XShape', 'MatchX', 'Label'])
 
 
 class CrossEntropy2OpTest2(CrossEntropy2OpTestBase):
+
     def initParameters(self):
         return [32, 64], 'float64', 3, False
 
 
 class CrossEntropy2OpTest2RemoveLastDim(CrossEntropy2OpTestBase):
+
     def initParameters(self):
         return [32, 64], 'float64', 3, True
 
 
 class CrossEntropy2OpTest3(CrossEntropy2OpTestBase):
+
     def initParameters(self):
         return [4, 8, 16, 32], 'float64', -100, False
 
 
 class CrossEntropy2OpTest3RemoveLastDim(CrossEntropy2OpTestBase):
+
     def initParameters(self):
         return [4, 8, 16, 32], 'float64', -100, True
 
 
 class CrossEntropy2OpTest4(CrossEntropy2OpTestBase):
+
     def initParameters(self):
         return [4, 8, 16, 32], 'float64', 3, False
 
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
index 4402d875a41f6..4982ae59d43ba 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -83,8 +83,8 @@ def cross_entropy_loss_2d(input,
                     continue
                 cur_weight = weight[cur_target] if weight is not None else 1
                 total_weight += cur_weight
-                out[i][h][w] = -log_softmax_out[i][h][w][
-                    cur_target] * cur_weight
+                out[i][h][
+                    w] = -log_softmax_out[i][h][w][cur_target] * cur_weight
     if reduction == 'sum':
         return np.sum(out), np.array([total_weight]).astype('float64')
     elif reduction == 'mean':
@@ -187,6 +187,7 @@ def cross_entropy_soft_2d(softmax,
 
 
 class CrossEntropyLoss(unittest.TestCase):
+
     def setUp(self):
         self.dtype = 'float32' if fluid.core.is_compiled_with_rocm(
         ) else 'float64'
@@ -213,14 +214,13 @@ def test_softmax_with_cross_entropy(self):
         self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
         self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
 
-        expected = cross_entropy_soft(
-            softmax,
-            self.labels,
-            self.axis,
-            self.N,
-            weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
+        expected = cross_entropy_soft(softmax,
+                                      self.labels,
+                                      self.axis,
+                                      self.N,
+                                      weight=self.weight,
+                                      reduction=self.reduction,
+                                      ignore_index=self.ignore_index)
 
         paddle.set_device("cpu")
 
@@ -266,14 +266,13 @@ def test_cross_entropy_loss_soft_1d(self):
         self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
         self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
 
-        expected = cross_entropy_soft(
-            softmax,
-            self.labels,
-            self.axis,
-            self.N,
-            weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
+        expected = cross_entropy_soft(softmax,
+                                      self.labels,
+                                      self.axis,
+                                      self.N,
+                                      weight=self.weight,
+                                      reduction=self.reduction,
+                                      ignore_index=self.ignore_index)
 
         paddle.set_device("cpu")
 
@@ -293,13 +292,15 @@ def test_cross_entropy_loss_soft_1d(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[self.N, self.C], dtype=self.dtype)
-            label = fluid.data(
-                name='label', shape=[self.N, self.C], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
+            label = fluid.data(name='label',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
 
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction=self.reduction, soft_label=True)
@@ -344,18 +345,19 @@ def test_cross_entropy_loss_soft_1d_weight(self):
         else:
             axis_dim = self.shape[self.axis]
             self.shape[self.axis] = 1
-            self.labels = np.random.randint(
-                0, axis_dim, self.shape, dtype="int64")
+            self.labels = np.random.randint(0,
+                                            axis_dim,
+                                            self.shape,
+                                            dtype="int64")
 
         #1. numpy
-        expected = cross_entropy_soft(
-            softmax,
-            self.labels,
-            self.axis,
-            self.N,
-            weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
+        expected = cross_entropy_soft(softmax,
+                                      self.labels,
+                                      self.axis,
+                                      self.N,
+                                      weight=self.weight,
+                                      reduction=self.reduction,
+                                      ignore_index=self.ignore_index)
 
         paddle.set_device("cpu")
 
@@ -374,13 +376,15 @@ def test_cross_entropy_loss_soft_1d_weight(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[self.N, self.C], dtype=self.dtype)
-            label = fluid.data(
-                name='label', shape=[self.N, self.C], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
+            label = fluid.data(name='label',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
             weight = fluid.data(name='weight', shape=[self.C], dtype=self.dtype)
 
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -424,14 +428,13 @@ def test_cross_entropy_loss_soft_1d_mean(self):
         self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
 
         #1. numpy
-        expected = cross_entropy_soft(
-            softmax,
-            self.labels,
-            self.axis,
-            self.N,
-            weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
+        expected = cross_entropy_soft(softmax,
+                                      self.labels,
+                                      self.axis,
+                                      self.N,
+                                      weight=self.weight,
+                                      reduction=self.reduction,
+                                      ignore_index=self.ignore_index)
 
         paddle.set_device("cpu")
 
@@ -450,24 +453,27 @@ def test_cross_entropy_loss_soft_1d_mean(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[self.N, self.C], dtype=self.dtype)
-            label = fluid.data(
-                name='label', shape=[self.N, self.C], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
+            label = fluid.data(name='label',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
 
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction=self.reduction, soft_label=True)
             ret = cross_entropy_loss(input, label)
 
             exe = fluid.Executor(place)
-            static_ret = exe.run(
-                prog,
-                feed={'input': self.logits,
-                      'label': self.labels},
-                fetch_list=[ret])
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': self.logits,
+                                     'label': self.labels
+                                 },
+                                 fetch_list=[ret])
             self.assertIsNotNone(static_ret)
         paddle.disable_static()
 
@@ -497,14 +503,13 @@ def test_cross_entropy_loss_soft_1d_weight_mean(self):
         self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
 
         #1. numpy
-        expected = cross_entropy_soft(
-            softmax,
-            self.labels,
-            self.axis,
-            self.N,
-            weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
+        expected = cross_entropy_soft(softmax,
+                                      self.labels,
+                                      self.axis,
+                                      self.N,
+                                      weight=self.weight,
+                                      reduction=self.reduction,
+                                      ignore_index=self.ignore_index)
 
         paddle.set_device("cpu")
         paddle.disable_static()
@@ -523,13 +528,15 @@ def test_cross_entropy_loss_soft_1d_weight_mean(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[self.N, self.C], dtype=self.dtype)
-            label = fluid.data(
-                name='label', shape=[self.N, self.C], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
+            label = fluid.data(name='label',
+                               shape=[self.N, self.C],
+                               dtype=self.dtype)
             weight = fluid.data(name='weight', shape=[self.C], dtype=self.dtype)
 
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -574,16 +581,15 @@ def test_cross_entropy_loss_soft_2d(self):
         self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
 
         #1. numpy
-        expected = cross_entropy_soft_2d(
-            softmax,
-            self.labels,
-            self.axis,
-            self.N,
-            self.H,
-            self.W,
-            weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
+        expected = cross_entropy_soft_2d(softmax,
+                                         self.labels,
+                                         self.axis,
+                                         self.N,
+                                         self.H,
+                                         self.W,
+                                         weight=self.weight,
+                                         reduction=self.reduction,
+                                         ignore_index=self.ignore_index)
 
         paddle.set_device("cpu")
         paddle.disable_static()
@@ -603,17 +609,15 @@ def test_cross_entropy_loss_soft_2d(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input',
-                shape=[self.N, self.H, self.W, self.C],
-                dtype=self.dtype)
-            label = fluid.data(
-                name='label',
-                shape=[self.N, self.H, self.W, self.C],
-                dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[self.N, self.H, self.W, self.C],
+                               dtype=self.dtype)
+            label = fluid.data(name='label',
+                               shape=[self.N, self.H, self.W, self.C],
+                               dtype=self.dtype)
 
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction=self.reduction, soft_label=True)
@@ -657,16 +661,15 @@ def test_cross_entropy_loss_soft_2d_weight_mean(self):
         self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True)
 
         #1. numpy
-        expected = cross_entropy_soft_2d(
-            softmax,
-            self.labels,
-            self.axis,
-            self.N,
-            self.H,
-            self.W,
-            weight=self.weight,
-            reduction=self.reduction,
-            ignore_index=self.ignore_index)
+        expected = cross_entropy_soft_2d(softmax,
+                                         self.labels,
+                                         self.axis,
+                                         self.N,
+                                         self.H,
+                                         self.W,
+                                         weight=self.weight,
+                                         reduction=self.reduction,
+                                         ignore_index=self.ignore_index)
 
         paddle.set_device("cpu")
         paddle.disable_static()
@@ -685,17 +688,15 @@ def test_cross_entropy_loss_soft_2d_weight_mean(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input',
-                shape=[self.N, self.H, self.W, self.C],
-                dtype=self.dtype)
-            label = fluid.data(
-                name='label',
-                shape=[self.N, self.H, self.W, self.C],
-                dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[self.N, self.H, self.W, self.C],
+                               dtype=self.dtype)
+            label = fluid.data(name='label',
+                               shape=[self.N, self.H, self.W, self.C],
+                               dtype=self.dtype)
             weight = fluid.data(name='weight', shape=[self.C], dtype=self.dtype)
 
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -724,8 +725,8 @@ def test_cross_entropy_loss_1d_with_mean_ignore(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[2, 4], dtype=self.dtype)
             label = fluid.data(name='label', shape=[2], dtype='int64')
@@ -743,11 +744,10 @@ def test_cross_entropy_loss_1d_with_mean_ignore(self):
         expected = cross_entropy_loss_1d(input_np, label_np)[0]
 
         with fluid.dygraph.guard():
-            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
-                axis=1, ignore_index=0)
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(axis=1,
+                                                                 ignore_index=0)
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = cross_entropy_loss_1d(input_np, label_np, ignore_index=0)[0]
@@ -763,8 +763,8 @@ def test_cross_entropy_loss_1d_with_mean_ignore_negative(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[N, C], dtype=self.dtype)
             label = fluid.data(name='label', shape=[N], dtype='int64')
@@ -783,9 +783,8 @@ def test_cross_entropy_loss_1d_with_mean_ignore_negative(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 axis=1, ignore_index=-1)
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = cross_entropy_loss_1d(input_np, label_np, ignore_index=-1)[0]
@@ -803,16 +802,15 @@ def test_cross_entropy_loss_1d_with_weight_mean_ignore(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[N, C], dtype=self.dtype)
             label = fluid.data(name='label', shape=[N], dtype='int64')
-            weight = fluid.data(
-                name='weight', shape=[C],
-                dtype=self.dtype)  #weight for each class
-            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
-                weight=weight, ignore_index=0)
+            weight = fluid.data(name='weight', shape=[C],
+                                dtype=self.dtype)  #weight for each class
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight,
+                                                                 ignore_index=0)
             ret = cross_entropy_loss(input, label)
 
             exe = fluid.Executor(place)
@@ -830,13 +828,14 @@ def test_cross_entropy_loss_1d_with_weight_mean_ignore(self):
                 weight=fluid.dygraph.to_variable(weight_np),
                 axis=1,
                 ignore_index=0)
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np, ignore_index=0)[0]
+        expected = cross_entropy_loss_1d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         ignore_index=0)[0]
 
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
@@ -853,13 +852,14 @@ def test_cross_entropy_loss_1d_with_weight_mean_ignore_exceedlabel(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), ignore_index=255)
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np, ignore_index=255)[0]
+        expected = cross_entropy_loss_1d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         ignore_index=255)[0]
 
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
@@ -870,14 +870,13 @@ def test_cross_entropy_loss_1d_with_weight_mean(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[2, 4], dtype=self.dtype)
             label = fluid.data(name='label', shape=[2], dtype='int64')
-            weight = fluid.data(
-                name='weight', shape=[4],
-                dtype=self.dtype)  #weight for each class
+            weight = fluid.data(name='weight', shape=[4],
+                                dtype=self.dtype)  #weight for each class
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight)
             ret = cross_entropy_loss(input, label)
 
@@ -890,19 +889,18 @@ def test_cross_entropy_loss_1d_with_weight_mean(self):
                                  },
                                  fetch_list=[ret])
             self.assertIsNotNone(static_ret)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np)[0]
+        expected = cross_entropy_loss_1d(input_np, label_np,
+                                         weight=weight_np)[0]
 
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), axis=1)
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np)[0]
+        expected = cross_entropy_loss_1d(input_np, label_np,
+                                         weight=weight_np)[0]
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
@@ -914,8 +912,8 @@ def test_cross_entropy_loss_1d_with_weight_sum(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype)
             label = fluid.data(name='label', shape=[100], dtype='int64')
@@ -936,13 +934,14 @@ def test_cross_entropy_loss_1d_with_weight_sum(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), reduction='sum')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        expected = cross_entropy_loss_1d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         reduction='sum')[0]
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
@@ -955,8 +954,8 @@ def test_cross_entropy_loss_1d_with_weight_none(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype)
             label = fluid.data(name='label', shape=[100], dtype='int64')
@@ -979,14 +978,15 @@ def test_cross_entropy_loss_1d_with_weight_none(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), reduction='none')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             dy_ret_value = np.squeeze(dy_ret_value)
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np, reduction='none')
+        expected = cross_entropy_loss_1d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         reduction='none')
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
@@ -998,14 +998,16 @@ def test_cross_entropy_loss_1d_with_weight_none_func(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype)
             label = fluid.data(name='label', shape=[100], dtype='int64')
             weight = fluid.data(name='weight', shape=[200], dtype=self.dtype)
-            ret = paddle.nn.functional.cross_entropy(
-                input, label, weight=weight, reduction='none')
+            ret = paddle.nn.functional.cross_entropy(input,
+                                                     label,
+                                                     weight=weight,
+                                                     reduction='none')
 
             exe = fluid.Executor(place)
             static_ret = exe.run(prog,
@@ -1026,8 +1028,10 @@ def test_cross_entropy_loss_1d_with_weight_none_func(self):
             dy_ret_value = dy_ret.numpy()
             dy_ret_value = np.squeeze(dy_ret_value)
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_1d(
-            input_np, label_np, weight=weight_np, reduction='none')
+        expected = cross_entropy_loss_1d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         reduction='none')
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
@@ -1038,8 +1042,8 @@ def test_cross_entropy_loss_1d_mean(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype)
             label = fluid.data(name='label', shape=[100], dtype='int64')
@@ -1047,15 +1051,16 @@ def test_cross_entropy_loss_1d_mean(self):
             ret = cross_entropy_loss(input, label)
             exe = fluid.Executor(place)
             static_ret = exe.run(prog,
-                                 feed={'input': input_np,
-                                       'label': label_np},
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np
+                                 },
                                  fetch_list=[ret])
             self.assertIsNotNone(static_ret)
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss()
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = cross_entropy_loss_1d(input_np, label_np)[0]
@@ -1069,8 +1074,8 @@ def test_cross_entropy_loss_1d_sum(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype)
             label = fluid.data(name='label', shape=[100], dtype='int64')
@@ -1079,16 +1084,17 @@ def test_cross_entropy_loss_1d_sum(self):
             ret = cross_entropy_loss(input, label)
             exe = fluid.Executor(place)
             static_ret = exe.run(prog,
-                                 feed={'input': input_np,
-                                       'label': label_np},
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np
+                                 },
                                  fetch_list=[ret])
             self.assertIsNotNone(static_ret)
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='sum')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = cross_entropy_loss_1d(input_np, label_np, reduction='sum')[0]
@@ -1102,8 +1108,8 @@ def test_cross_entropy_loss_1d_none(self):
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype)
             label = fluid.data(name='label', shape=[100], dtype='int64')
@@ -1112,17 +1118,18 @@ def test_cross_entropy_loss_1d_none(self):
             ret = cross_entropy_loss(input, label)
             exe = fluid.Executor(place)
             static_ret = exe.run(prog,
-                                 feed={'input': input_np,
-                                       'label': label_np},
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np
+                                 },
                                  fetch_list=[ret])
             static_ret = np.squeeze(static_ret)
             self.assertIsNotNone(static_ret)
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='none')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             dy_ret_value = np.squeeze(dy_ret_value)
             self.assertIsNotNone(dy_ret_value)
@@ -1133,18 +1140,19 @@ def test_cross_entropy_loss_1d_none(self):
 
     def test_cross_entropy_loss_2d_with_weight_none(self):
         input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype)  #NHWC
-        label_np = np.random.randint(
-            0, 3, size=(2, 2, 2)).astype(np.int64)  #NHW1
+        label_np = np.random.randint(0, 3,
+                                     size=(2, 2, 2)).astype(np.int64)  #NHW1
         weight_np = np.random.random(size=(3, )).astype(self.dtype)  #C
 
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[2, 2, 2, 3], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[2, 2, 2, 3],
+                               dtype=self.dtype)
             label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype=self.dtype)
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -1164,32 +1172,34 @@ def test_cross_entropy_loss_2d_with_weight_none(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), reduction='none')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             dy_ret_value = np.squeeze(dy_ret_value)
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_2d(
-            input_np, label_np, weight=weight_np, reduction='none')
+        expected = cross_entropy_loss_2d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         reduction='none')
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
     def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self):
         input_np = np.random.random(size=(2, 3, 2, 2)).astype(self.dtype)  #NCHW
-        label_np = np.random.randint(
-            0, 3, size=(2, 2, 2)).astype(np.int64)  #NHW
+        label_np = np.random.randint(0, 3,
+                                     size=(2, 2, 2)).astype(np.int64)  #NHW
         weight_np = np.random.random(size=(3, )).astype(self.dtype)  #C
 
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[2, 3, 2, 2], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[2, 3, 2, 2],
+                               dtype=self.dtype)
             label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype=self.dtype)
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -1212,16 +1222,14 @@ def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self):
                 weight=fluid.dygraph.to_variable(weight_np),
                 reduction='mean',
                 axis=1)
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_2d(
-            np.transpose(input_np, [0, 2, 3, 1]),
-            label_np,
-            weight=weight_np,
-            reduction='mean')[0]
+        expected = cross_entropy_loss_2d(np.transpose(input_np, [0, 2, 3, 1]),
+                                         label_np,
+                                         weight=weight_np,
+                                         reduction='mean')[0]
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
@@ -1238,28 +1246,30 @@ def test_cross_entropy_loss_2d_with_weight_mean_ignore_exceedlabel(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), ignore_index=255)
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_2d(
-            input_np, label_np, weight=weight_np, ignore_index=255)[0]
+        expected = cross_entropy_loss_2d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         ignore_index=255)[0]
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
     def test_cross_entropy_loss_2d_with_weight_mean(self):
         input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype)  #NHWC
-        label_np = np.random.randint(
-            0, 3, size=(2, 2, 2)).astype(np.int64)  #NHW
+        label_np = np.random.randint(0, 3,
+                                     size=(2, 2, 2)).astype(np.int64)  #NHW
         weight_np = np.random.random(size=(3, )).astype(self.dtype)  #C
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[2, 2, 2, 3], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[2, 2, 2, 3],
+                               dtype=self.dtype)
             label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype=self.dtype)
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -1278,31 +1288,33 @@ def test_cross_entropy_loss_2d_with_weight_mean(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), reduction='mean')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_2d(
-            input_np, label_np, weight=weight_np, reduction='mean')[0]
+        expected = cross_entropy_loss_2d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         reduction='mean')[0]
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
     def test_cross_entropy_loss_2d_with_weight_sum(self):
         input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype)  #NHWC
-        label_np = np.random.randint(
-            0, 3, size=(2, 2, 2)).astype(np.int64)  #NHW
+        label_np = np.random.randint(0, 3,
+                                     size=(2, 2, 2)).astype(np.int64)  #NHW
         weight_np = np.random.random(size=(3, )).astype(self.dtype)  #C
         paddle.enable_static()
 
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[2, 2, 2, 3], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[2, 2, 2, 3],
+                               dtype=self.dtype)
             label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype=self.dtype)
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
@@ -1321,29 +1333,31 @@ def test_cross_entropy_loss_2d_with_weight_sum(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=fluid.dygraph.to_variable(weight_np), reduction='sum')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_2d(
-            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        expected = cross_entropy_loss_2d(input_np,
+                                         label_np,
+                                         weight=weight_np,
+                                         reduction='sum')[0]
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
     def test_cross_entropy_loss_2d_none(self):
         input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype)  #NHWC
-        label_np = np.random.randint(
-            0, 3, size=(2, 2, 2)).astype(np.int64)  #NHW
+        label_np = np.random.randint(0, 3,
+                                     size=(2, 2, 2)).astype(np.int64)  #NHW
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[2, 2, 2, 3], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[2, 2, 2, 3],
+                               dtype=self.dtype)
             label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64')
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='none')
@@ -1360,9 +1374,8 @@ def test_cross_entropy_loss_2d_none(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='none')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             dy_ret_value = np.squeeze(dy_ret_value)
             self.assertIsNotNone(dy_ret_value)
@@ -1373,16 +1386,17 @@ def test_cross_entropy_loss_2d_none(self):
 
     def test_cross_entropy_loss_2d_mean(self):
         input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype)  #NHWC
-        label_np = np.random.randint(
-            0, 3, size=(2, 2, 2)).astype(np.int64)  #NHW
+        label_np = np.random.randint(0, 3,
+                                     size=(2, 2, 2)).astype(np.int64)  #NHW
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[2, 2, 2, 3], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[2, 2, 2, 3],
+                               dtype=self.dtype)
             label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64')
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='mean')
@@ -1399,29 +1413,29 @@ def test_cross_entropy_loss_2d_mean(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='mean')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
-        expected = cross_entropy_loss_2d(
-            input_np, label_np, reduction='mean')[0]
+        expected = cross_entropy_loss_2d(input_np, label_np,
+                                         reduction='mean')[0]
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
         self.assertTrue(np.allclose(static_ret, expected))
         self.assertTrue(np.allclose(dy_ret_value, expected))
 
     def test_cross_entropy_loss_2d_sum(self):
         input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype)  #NHWC
-        label_np = np.random.randint(
-            0, 3, size=(2, 2, 2)).astype(np.int64)  #NHW
+        label_np = np.random.randint(0, 3,
+                                     size=(2, 2, 2)).astype(np.int64)  #NHW
         paddle.enable_static()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[2, 2, 2, 3], dtype=self.dtype)
+            input = fluid.data(name='input',
+                               shape=[2, 2, 2, 3],
+                               dtype=self.dtype)
             label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64')
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='sum')
@@ -1438,9 +1452,8 @@ def test_cross_entropy_loss_2d_sum(self):
         with fluid.dygraph.guard():
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 reduction='sum')
-            dy_ret = cross_entropy_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = cross_entropy_loss(fluid.dygraph.to_variable(input_np),
+                                        fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = cross_entropy_loss_2d(input_np, label_np, reduction='sum')[0]
@@ -1487,47 +1500,51 @@ def test_other_dygraph_final_state_api(self):
 
 
 class TestCrossEntropyFAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_WeightLength_NotEqual():
                 input_data = paddle.rand(shape=[20, 100])
-                label_data = paddle.randint(
-                    0, 100, shape=[20, 1], dtype="int64")
+                label_data = paddle.randint(0,
+                                            100,
+                                            shape=[20, 1],
+                                            dtype="int64")
                 weight_data = paddle.rand([100 + 1])
-                paddle.nn.functional.cross_entropy(
-                    input=input_data,
-                    label=label_data,
-                    weight=weight_data,
-                    ignore_index=-100)
+                paddle.nn.functional.cross_entropy(input=input_data,
+                                                   label=label_data,
+                                                   weight=weight_data,
+                                                   ignore_index=-100)
 
             self.assertRaises(ValueError, test_WeightLength_NotEqual)
 
             def test_LabelValue_ExceedMax():
                 input_data = paddle.rand(shape=[20, 100])
-                label_data = paddle.randint(
-                    0, 100, shape=[20, 1], dtype="int64")
+                label_data = paddle.randint(0,
+                                            100,
+                                            shape=[20, 1],
+                                            dtype="int64")
                 label_data[0] = 100
                 weight_data = paddle.rand([100])
-                paddle.nn.functional.cross_entropy(
-                    input=input_data,
-                    label=label_data,
-                    weight=weight_data,
-                    ignore_index=-100)
+                paddle.nn.functional.cross_entropy(input=input_data,
+                                                   label=label_data,
+                                                   weight=weight_data,
+                                                   ignore_index=-100)
 
             self.assertRaises(ValueError, test_LabelValue_ExceedMax)
 
             def test_LabelValue_ExceedMin():
                 input_data = paddle.rand(shape=[20, 100])
-                label_data = paddle.randint(
-                    0, 100, shape=[20, 1], dtype="int64")
+                label_data = paddle.randint(0,
+                                            100,
+                                            shape=[20, 1],
+                                            dtype="int64")
                 label_data[0] = -1
                 weight_data = paddle.rand([100])
-                paddle.nn.functional.cross_entropy(
-                    input=input_data,
-                    label=label_data,
-                    weight=weight_data,
-                    ignore_index=-100)
+                paddle.nn.functional.cross_entropy(input=input_data,
+                                                   label=label_data,
+                                                   weight=weight_data,
+                                                   ignore_index=-100)
 
             self.assertRaises(ValueError, test_LabelValue_ExceedMin)
 
@@ -1541,12 +1558,13 @@ def static_test_WeightLength_NotEqual():
                 place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
                 ) else fluid.CPUPlace()
                 with fluid.program_guard(prog, startup_prog):
-                    input = fluid.data(
-                        name='input', shape=[2, 4], dtype='float32')
+                    input = fluid.data(name='input',
+                                       shape=[2, 4],
+                                       dtype='float32')
                     label = fluid.data(name='label', shape=[2], dtype='int64')
-                    weight = fluid.data(
-                        name='weight', shape=[3],
-                        dtype='float32')  #weight for each class
+                    weight = fluid.data(name='weight',
+                                        shape=[3],
+                                        dtype='float32')  #weight for each class
                     cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                         weight=weight)
                     ret = cross_entropy_loss(input, label)
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
index ba39b072303fe..35d73759be552 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
@@ -49,12 +49,14 @@ def setUp(self):
         }
 
     def init_x(self):
-        self.x = randomize_probability(
-            self.batch_size, self.class_num, dtype=self.dtype)
+        self.x = randomize_probability(self.batch_size,
+                                       self.class_num,
+                                       dtype=self.dtype)
 
     def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size, 1), dtype="int64")
+        self.label = np.random.randint(0,
+                                       self.class_num, (self.batch_size, 1),
+                                       dtype="int64")
 
     def get_cross_entropy(self):
         self.cross_entropy = np.asmatrix(
@@ -83,15 +85,13 @@ class TestCrossEntropyOpRemoveLastDim(TestCrossEntropyOp):
     """
 
     def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size), dtype="int64")
+        self.label = np.random.randint(0,
+                                       self.class_num, (self.batch_size),
+                                       dtype="int64")
 
     def get_cross_entropy(self):
         self.cross_entropy = np.asmatrix(
-            [
-                -np.log(self.x[i][self.label[i]])
-                for i in range(self.x.shape[0])
-            ],
+            [-np.log(self.x[i][self.label[i]]) for i in range(self.x.shape[0])],
             dtype="float64")
 
 
@@ -119,8 +119,10 @@ def init_bs_class_num(self):
         self.class_num = 37
 
     def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+        self.check_grad(["X"],
+                        "Y",
+                        max_relative_error=0.05,
+                        numeric_grad_delta=0.001)
 
 
 class TestCrossEntropyOp3(TestCrossEntropyOp):
@@ -149,8 +151,10 @@ def init_bs_class_num(self):
         self.class_num = 27
 
     def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+        self.check_grad(["X"],
+                        "Y",
+                        max_relative_error=0.05,
+                        numeric_grad_delta=0.001)
 
 
 class TestCrossEntropyOp4(TestCrossEntropyOp):
@@ -165,8 +169,9 @@ def init_x(self):
         self.x = self.X_2d.reshape(self.shape + [self.class_num])
 
     def init_label(self):
-        self.label_2d = np.random.randint(
-            0, self.class_num, (self.ins_num, 1), dtype="int64")
+        self.label_2d = np.random.randint(0,
+                                          self.class_num, (self.ins_num, 1),
+                                          dtype="int64")
         self.label = self.label_2d.reshape(self.shape + [1])
 
     def get_cross_entropy(self):
@@ -191,8 +196,9 @@ class TestCrossEntropyOp4RemoveLastDim(TestCrossEntropyOp4):
     """
 
     def init_label(self):
-        self.label_2d = np.random.randint(
-            0, self.class_num, (self.ins_num, 1), dtype="int64")
+        self.label_2d = np.random.randint(0,
+                                          self.class_num, (self.ins_num, 1),
+                                          dtype="int64")
         self.label = self.label_2d.reshape(self.shape)
 
     def get_cross_entropy(self):
@@ -235,8 +241,10 @@ def init_bs_class_num(self):
         self.class_num = 37
 
     def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+        self.check_grad(["X"],
+                        "Y",
+                        max_relative_error=0.05,
+                        numeric_grad_delta=0.001)
 
 
 class TestCrossEntropyOp6(TestCrossEntropyOp):
@@ -251,8 +259,9 @@ def init_x(self):
         self.x = self.X_2d.reshape(self.shape + [self.class_num])
 
     def init_label(self):
-        self.label_index_2d = np.random.randint(
-            0, self.class_num, (self.ins_num), dtype="int64")
+        self.label_index_2d = np.random.randint(0,
+                                                self.class_num, (self.ins_num),
+                                                dtype="int64")
         label_2d = np.zeros(self.X_2d.shape)
         label_2d[np.arange(self.ins_num), self.label_index_2d] = 1
         self.label = label_2d.reshape(self.shape + [self.class_num]).astype(
@@ -262,8 +271,9 @@ def get_cross_entropy(self):
         cross_entropy_2d = np.asmatrix(
             [[-np.log(self.X_2d[i][self.label_index_2d[i]])]
              for i in range(self.X_2d.shape[0])])
-        self.cross_entropy = np.array(cross_entropy_2d).reshape(
-            self.shape + [1]).astype(self.dtype)
+        self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
+                                                                [1]).astype(
+                                                                    self.dtype)
 
     def init_attr_type(self):
         self.soft_label = True
@@ -275,8 +285,10 @@ def init_bs_class_num(self):
         self.class_num = 17
 
     def test_check_grad(self):
-        self.check_grad(
-            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+        self.check_grad(["X"],
+                        "Y",
+                        max_relative_error=0.05,
+                        numeric_grad_delta=0.001)
 
 
 class TestCrossEntropyOp7(TestCrossEntropyOp):
@@ -284,8 +296,9 @@ class TestCrossEntropyOp7(TestCrossEntropyOp):
     """
 
     def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size, 1), dtype="int64")
+        self.label = np.random.randint(0,
+                                       self.class_num, (self.batch_size, 1),
+                                       dtype="int64")
 
     def get_cross_entropy(self):
         self.cross_entropy = np.asmatrix(
@@ -310,8 +323,9 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7):
     """
 
     def init_label(self):
-        self.label = np.random.randint(
-            0, self.class_num, (self.batch_size), dtype="int64")
+        self.label = np.random.randint(0,
+                                       self.class_num, (self.batch_size),
+                                       dtype="int64")
 
     def get_cross_entropy(self):
         self.cross_entropy = np.asmatrix(
@@ -324,9 +338,11 @@ def get_cross_entropy(self):
 
 # Add Fp16 test
 def create_test_class(parent, cls_name):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCrossEntropyFP16Op(parent):
+
         def init_dtype_type(self):
             return np.float16
 
@@ -338,8 +354,9 @@ def test_check_output(self):
         def test_check_grad(self):
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place):
-                self.check_grad_with_place(
-                    place, ['X'], 'Y', max_relative_error=0.9)
+                self.check_grad_with_place(place, ['X'],
+                                           'Y',
+                                           max_relative_error=0.9)
 
     cls_name = "{0}".format(cls_name)
     TestCrossEntropyFP16Op.__name__ = cls_name
@@ -360,15 +377,16 @@ def test_check_grad(self):
 
 
 class TestCrossEntropyOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 # the input of cross_entropy must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-                lab1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.CPUPlace())
+                lab1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                               [[1, 1, 1, 1]], fluid.CPUPlace())
                 fluid.layers.cross_entropy(x1, lab1)
 
             self.assertRaises(TypeError, test_Variable)
@@ -376,10 +394,12 @@ def test_Variable():
             def test_dtype():
                 # the input dtype of cross_entropy must be float16 or float32 or float64
                 # float16 only can be set on GPU place
-                x2 = fluid.layers.data(
-                    name='x2', shape=[3, 4, 5, 6], dtype="int32")
-                lab2 = fluid.layers.data(
-                    name='lab2', shape=[3, 4, 5, 6], dtype="int32")
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[3, 4, 5, 6],
+                                       dtype="int32")
+                lab2 = fluid.layers.data(name='lab2',
+                                         shape=[3, 4, 5, 6],
+                                         dtype="int32")
                 fluid.layers.cross_entropy(x2, lab2)
 
             self.assertRaises(TypeError, test_dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_cross_op.py b/python/paddle/fluid/tests/unittests/test_cross_op.py
index 8b884583646a7..b54883975a6cc 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_op.py
@@ -24,6 +24,7 @@
 
 
 class TestCrossOp(OpTest):
+
     def setUp(self):
         self.op_type = "cross"
         self.python_api = paddle.cross
@@ -55,6 +56,7 @@ def test_check_grad_normal(self):
 
 
 class TestCrossOpCase1(TestCrossOp):
+
     def initTestCase(self):
         self.shape = (2048, 3)
         self.dtype = np.float32
@@ -67,11 +69,12 @@ def init_output(self):
 
 
 class TestCrossAPI(unittest.TestCase):
+
     def input_data(self):
-        self.data_x = np.array(
-            [[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [3.0, 3.0, 3.0]])
-        self.data_y = np.array(
-            [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])
+        self.data_x = np.array([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0],
+                                [3.0, 3.0, 3.0]])
+        self.data_y = np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
+                                [1.0, 1.0, 1.0]])
 
     def test_cross_api(self):
         self.input_data()
@@ -82,8 +85,10 @@ def test_cross_api(self):
             y = fluid.layers.data(name='y', shape=[-1, 3])
             z = paddle.cross(x, y, axis=1)
             exe = fluid.Executor(fluid.CPUPlace())
-            res, = exe.run(feed={'x': self.data_x,
-                                 'y': self.data_y},
+            res, = exe.run(feed={
+                'x': self.data_x,
+                'y': self.data_y
+            },
                            fetch_list=[z.name],
                            return_numpy=False)
         expect_out = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
@@ -96,8 +101,10 @@ def test_cross_api(self):
             y = fluid.layers.data(name='y', shape=[-1, 3])
             z = paddle.cross(x, y)
             exe = fluid.Executor(fluid.CPUPlace())
-            res, = exe.run(feed={'x': self.data_x,
-                                 'y': self.data_y},
+            res, = exe.run(feed={
+                'x': self.data_x,
+                'y': self.data_y
+            },
                            fetch_list=[z.name],
                            return_numpy=False)
         expect_out = np.array([[-1.0, -1.0, -1.0], [2.0, 2.0, 2.0],
diff --git a/python/paddle/fluid/tests/unittests/test_crypto.py b/python/paddle/fluid/tests/unittests/test_crypto.py
index 2a9bed7acbb88..54db1f8bfbbfe 100644
--- a/python/paddle/fluid/tests/unittests/test_crypto.py
+++ b/python/paddle/fluid/tests/unittests/test_crypto.py
@@ -20,6 +20,7 @@
 
 
 class CipherUtilsTestCase(unittest.TestCase):
+
     def test_gen_key(self):
         key1 = CipherUtils.gen_key(256)
         key2 = CipherUtils.gen_key_to_file(256, "paddle_aes_test.keyfile")
@@ -31,6 +32,7 @@ def test_gen_key(self):
 
 
 class CipherTestCase(unittest.TestCase):
+
     def test_aes_cipher(self):
         plaintext = "hello world"
         key = CipherUtils.gen_key(256)
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index ffc5bc184efc2..ee22e227228e4 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -32,8 +32,8 @@ def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None):
             prev_token = -1
             for j in range(cur_offset, cur_offset + lod0[i]):
                 token = input[j][0]
-                if (token != blank) and not (merge_repeated and
-                                             token == prev_token):
+                if (token != blank) and not (merge_repeated
+                                             and token == prev_token):
                     result.append(token)
                 prev_token = token
             cur_offset += lod0[i]
@@ -48,38 +48,41 @@ def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None):
             prev_token = -1
             for j in range(input_length[i][0]):
                 token = input[i][j]
-                if (token != blank) and not (merge_repeated and
-                                             token == prev_token):
+                if (token != blank) and not (merge_repeated
+                                             and token == prev_token):
                     result[i].append(token)
                 prev_token = token
             start = len(result[i])
             output_length.append([start])
             for j in range(start, len(input[i])):
                 result[i].append(padding)
-        result = np.array(result).reshape(
-            [len(input), len(input[0])]).astype("int32")
-        output_length = np.array(output_length).reshape(
-            [len(input), 1]).astype("int32")
+        result = np.array(result).reshape([len(input),
+                                           len(input[0])]).astype("int32")
+        output_length = np.array(output_length).reshape([len(input),
+                                                         1]).astype("int32")
 
     return result, output_length
 
 
 class TestCTCAlignOp(OpTest):
+
     def config(self):
         self.op_type = "ctc_align"
         self.input_lod = [[11, 7]]
         self.blank = 0
         self.merge_repeated = False
         self.input = np.array(
-            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape(
-                [18, 1]).astype("int32")
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7,
+             0]).reshape([18, 1]).astype("int32")
 
     def setUp(self):
         self.config()
         output = CTCAlign(self.input, self.input_lod, self.blank,
                           self.merge_repeated)
 
-        self.inputs = {"Input": (self.input, self.input_lod), }
+        self.inputs = {
+            "Input": (self.input, self.input_lod),
+        }
         self.outputs = {"Output": output}
         self.attrs = {
             "blank": self.blank,
@@ -92,17 +95,19 @@ def test_check_output(self):
 
 
 class TestCTCAlignOpCase1(TestCTCAlignOp):
+
     def config(self):
         self.op_type = "ctc_align"
         self.input_lod = [[11, 8]]
         self.blank = 0
         self.merge_repeated = True
         self.input = np.array(
-            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0, 0]).reshape(
-                [19, 1]).astype("int32")
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0,
+             0]).reshape([19, 1]).astype("int32")
 
 
 class TestCTCAlignOpCase2(TestCTCAlignOp):
+
     def config(self):
         self.op_type = "ctc_align"
         self.input_lod = [[4]]
@@ -112,6 +117,7 @@ def config(self):
 
 
 class TestCTCAlignPaddingOp(OpTest):
+
     def config(self):
         self.op_type = "ctc_align"
         self.input_lod = []
@@ -119,8 +125,8 @@ def config(self):
         self.padding_value = 0
         self.merge_repeated = True
         self.input = np.array([[0, 2, 4, 4, 0, 6, 3, 6, 6, 0, 0],
-                               [1, 1, 3, 0, 0, 4, 5, 6, 0, 0, 0]]).reshape(
-                                   [2, 11]).astype("int32")
+                               [1, 1, 3, 0, 0, 4, 5, 6, 0, 0,
+                                0]]).reshape([2, 11]).astype("int32")
         self.input_length = np.array([[9], [8]]).reshape([2, 1]).astype("int32")
 
     def setUp(self):
@@ -144,6 +150,7 @@ def test_check_output(self):
 
 
 class TestCTCAlignOpCase3(TestCTCAlignPaddingOp):
+
     def config(self):
         self.op_type = "ctc_align"
         self.blank = 0
@@ -151,8 +158,8 @@ def config(self):
         self.merge_repeated = True
         self.padding_value = 0
         self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 7, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
+                               [0, 7, 7, 7, 0, 0]]).reshape([3,
+                                                             6]).astype("int32")
         self.input_length = np.array([[6], [5],
                                       [4]]).reshape([3, 1]).astype("int32")
 
@@ -169,13 +176,14 @@ def config(self):
         self.merge_repeated = False
         self.padding_value = 0
         self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 7, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
+                               [0, 7, 7, 7, 0, 0]]).reshape([3,
+                                                             6]).astype("int32")
         self.input_length = np.array([[6], [5],
                                       [4]]).reshape([3, 1]).astype("int32")
 
 
 class TestCTCAlignOpCase5(TestCTCAlignPaddingOp):
+
     def config(self):
         self.op_type = "ctc_align"
         self.blank = 0
@@ -183,13 +191,14 @@ def config(self):
         self.merge_repeated = False
         self.padding_value = 1
         self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
-                               [0, 7, 1, 7, 0, 0]]).reshape(
-                                   [3, 6]).astype("int32")
+                               [0, 7, 1, 7, 0, 0]]).reshape([3,
+                                                             6]).astype("int32")
         self.input_length = np.array([[6], [5],
                                       [4]]).reshape([3, 1]).astype("int32")
 
 
 class TestCTCAlignOpApi(unittest.TestCase):
+
     def test_api(self):
         x = fluid.layers.data('x', shape=[4], dtype='float32')
         y = fluid.layers.ctc_greedy_decoder(x, blank=0)
@@ -219,6 +228,7 @@ def test_api(self):
 
 
 class BadInputTestCTCAlignr(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_cudnn_version.py b/python/paddle/fluid/tests/unittests/test_cuda_cudnn_version.py
index d8229247a817f..36637971f9e48 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_cudnn_version.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_cudnn_version.py
@@ -17,6 +17,7 @@
 
 
 class TestCPUVersion(unittest.TestCase):
+
     def test_cuda_cudnn_version_in_cpu_package(self):
         if not paddle.is_compiled_with_cuda():
             self.assertEqual(paddle.version.cuda(), 'False')
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_device_count.py b/python/paddle/fluid/tests/unittests/test_cuda_device_count.py
index f4114c9d451b3..482a3413caf46 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_device_count.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_device_count.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 
 
 class TestDeviceCount(unittest.TestCase):
+
     def test_device_count(self):
         s = paddle.device.cuda.device_count()
         self.assertIsNotNone(s)
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py b/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py
index 88f71f28412e3..0d749c5d17729 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 
 
 class TestDeviceName(unittest.TestCase):
+
     def test_device_name_default(self):
         if paddle.is_compiled_with_cuda():
             name = paddle.device.cuda.get_device_name()
@@ -34,6 +35,7 @@ def test_device_name_CUDAPlace(self):
 
 
 class TestDeviceCapability(unittest.TestCase):
+
     def test_device_capability_default(self):
         if paddle.is_compiled_with_cuda():
             capability = paddle.device.cuda.get_device_capability()
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py b/python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py
index 4aefb234bbfc1..0ec066eb7cdce 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py
@@ -17,6 +17,7 @@
 
 
 class TestEmptyCache(unittest.TestCase):
+
     def test_empty_cache(self):
         x = paddle.randn((2, 10, 12)).astype('float32')
         del x
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py
index 66228856effe4..fda3fa79ef664 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,6 +29,7 @@ def can_use_cuda_graph():
 
 
 class TestCUDAGraph(unittest.TestCase):
+
     def setUp(self):
         if can_use_cuda_graph():
             paddle.set_flags({
@@ -40,8 +41,7 @@ def setUp(self):
 
     def random_tensor(self, shape):
         return paddle.to_tensor(
-            np.random.randint(
-                low=0, high=10, size=shape).astype("float32"))
+            np.random.randint(low=0, high=10, size=shape).astype("float32"))
 
     @switch_to_static_graph
     def test_cuda_graph_static_graph(self):
@@ -49,8 +49,8 @@ def test_cuda_graph_static_graph(self):
             return
 
         seed = 100
-        loss_cuda_graph = self.cuda_graph_static_graph_main(
-            seed, use_cuda_graph=True)
+        loss_cuda_graph = self.cuda_graph_static_graph_main(seed,
+                                                            use_cuda_graph=True)
         loss_no_cuda_graph = self.cuda_graph_static_graph_main(
             seed, use_cuda_graph=False)
         self.assertEqual(loss_cuda_graph, loss_no_cuda_graph)
@@ -66,10 +66,12 @@ def cuda_graph_static_graph_main(self, seed, use_cuda_graph):
         startup = paddle.static.Program()
         main = paddle.static.Program()
         with paddle.static.program_guard(main, startup):
-            image = paddle.static.data(
-                name="image", shape=image_shape, dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=label_shape, dtype='int64')
+            image = paddle.static.data(name="image",
+                                       shape=image_shape,
+                                       dtype='float32')
+            label = paddle.static.data(name="label",
+                                       shape=label_shape,
+                                       dtype='int64')
             image.persistable = True
             label.persistable = True
             loss = simple_fc_net_with_inputs(image, label, class_num)
@@ -88,10 +90,9 @@ def cuda_graph_static_graph_main(self, seed, use_cuda_graph):
             build_strategy.fix_op_run_order = True
             build_strategy.fuse_all_optimizer_ops = True
             compiled_program = paddle.static.CompiledProgram(
-                main).with_data_parallel(
-                    loss_name=loss.name,
-                    build_strategy=build_strategy,
-                    places=place)
+                main).with_data_parallel(loss_name=loss.name,
+                                         build_strategy=build_strategy,
+                                         places=place)
             image_t = scope.var(image.name).get_tensor()
             label_t = scope.var(label.name).get_tensor()
             loss_t = scope.var(loss.name).get_tensor()
@@ -102,9 +103,11 @@ def cuda_graph_static_graph_main(self, seed, use_cuda_graph):
             for batch_id in range(20):
                 image_t.set(
                     np.random.rand(*image_shape).astype('float32'), place)
-                label_t.set(np.random.randint(
-                    low=0, high=class_num, size=label_shape, dtype='int64'),
-                            place)
+                label_t.set(
+                    np.random.randint(low=0,
+                                      high=class_num,
+                                      size=label_shape,
+                                      dtype='int64'), place)
 
                 if batch_id == 1 and use_cuda_graph:
                     cuda_graph = CUDAGraph(place, mode="global")
@@ -193,6 +196,7 @@ def test_dataloader(self):
             return
 
         class AutoIncDataset(paddle.io.Dataset):
+
             def __init__(self, n, dtype):
                 self.n = n
                 self.dtype = dtype
@@ -206,8 +210,10 @@ def __getitem__(self, idx):
         n = 100
         dtype = 'int64'
         dataset = AutoIncDataset(n, dtype)
-        data_loader = paddle.io.DataLoader(
-            dataset, batch_size=1, num_workers=2, use_buffer_reader=True)
+        data_loader = paddle.io.DataLoader(dataset,
+                                           batch_size=1,
+                                           num_workers=2,
+                                           use_buffer_reader=True)
         x = None
         y = None
 
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph.py
new file mode 100644
index 0000000000000..b0e6878e3fef2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import unittest
+import numpy as np
+from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported
+
+
+class SimpleModel(nn.Layer):
+
+    def __init__(self, in_size, out_size):
+        super(SimpleModel, self).__init__()
+        self.linear = nn.Linear(in_size, out_size)
+        self.dropout_1 = paddle.nn.Dropout(0.1)
+        self.relu = nn.ReLU()
+        self.dropout_2 = paddle.nn.Dropout(0.5)
+        self.gelu = nn.GELU()
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = self.dropout_1(x)
+        x = self.relu(x)
+        x = self.dropout_2(x)
+        x = self.gelu(x)
+        return x
+
+
+class TestSimpleModel(unittest.TestCase):
+
+    def setUp(self):
+        paddle.set_flags({'FLAGS_eager_delete_tensor_gb': 0.0})
+
+    def run_base(self, func, use_cuda_graph, memory_pool="default", seed=10):
+        paddle.seed(seed)
+        is_layer = isinstance(func, paddle.nn.Layer)
+        if use_cuda_graph:
+            func = wrap_cuda_graph(func, memory_pool=memory_pool)
+
+        for _ in range(10):
+            x = paddle.randn([3, 10], dtype='float32')
+            x.stop_gradient = False
+            y = x * x + 100
+            loss = func(y).mean()
+            loss.backward()
+            if is_layer:
+                func.clear_gradients()
+
+        return func, x.grad.numpy()
+
+    def check(self, func):
+        if not is_cuda_graph_supported():
+            return
+
+        _, value1 = self.run_base(func, False)
+        layer, value2 = self.run_base(func, True, "default")
+        _, value3 = self.run_base(func, True, "new")
+        _, value4 = self.run_base(func, True, layer)
+        self.assertTrue(np.array_equal(value1, value2))
+        self.assertTrue(np.array_equal(value1, value3))
+        self.assertTrue(np.array_equal(value1, value4))
+
+    def test_layer(self):
+        self.check(SimpleModel(10, 20))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
index ae8bdeed1ef7a..7b8c6e9d22efe 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class TestMaxMemoryAllocated(unittest.TestCase):
+
     def func_test_max_memory_allocated(self, device=None):
         if core.is_compiled_with_cuda():
             alloc_time = 100
@@ -58,7 +59,8 @@ def test_max_memory_allocated_for_all_places(self):
     def func_test_max_memory_allocated_exception(self):
         if core.is_compiled_with_cuda():
             wrong_device = [
-                core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
+                core.CPUPlace(),
+                device_count() + 1, -2, 0.5, "gpu1", "npu"
             ]
             for device in wrong_device:
                 with self.assertRaises(BaseException):
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py
index e64e02bb7f0f3..936a084abb704 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class TestMaxMemoryreserved(unittest.TestCase):
+
     def test_max_memory_reserved(self, device=None):
         if core.is_compiled_with_cuda():
             alloc_time = 100
@@ -47,7 +48,8 @@ def test_max_memory_reserved_for_all_places(self):
     def test_max_memory_reserved_exception(self):
         if core.is_compiled_with_cuda():
             wrong_device = [
-                core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
+                core.CPUPlace(),
+                device_count() + 1, -2, 0.5, "gpu1", "npu"
             ]
             for device in wrong_device:
                 with self.assertRaises(BaseException):
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py b/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py
index af45537b6d489..4922b8df1fcd0 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class TestMemoryAllocated(unittest.TestCase):
+
     def test_memory_allocated(self, device=None):
         if core.is_compiled_with_cuda():
             tensor = paddle.zeros(shape=[256])
@@ -39,7 +40,8 @@ def test_memory_allocated_for_all_places(self):
     def test_memory_allocated_exception(self):
         if core.is_compiled_with_cuda():
             wrong_device = [
-                core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
+                core.CPUPlace(),
+                device_count() + 1, -2, 0.5, "gpu1", "npu"
             ]
             for device in wrong_device:
                 with self.assertRaises(BaseException):
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py
index ca551ab4a3f28..c434691398010 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class TestMemoryreserved(unittest.TestCase):
+
     def func_test_memory_reserved(self, device=None):
         if core.is_compiled_with_cuda():
             tensor = paddle.zeros(shape=[256])
@@ -50,7 +51,8 @@ def test_memory_reserved_for_all_places(self):
     def func_test_memory_reserved_exception(self):
         if core.is_compiled_with_cuda():
             wrong_device = [
-                core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
+                core.CPUPlace(),
+                device_count() + 1, -2, 0.5, "gpu1", "npu"
             ]
             for device in wrong_device:
                 with self.assertRaises(BaseException):
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
index 14a91b0c2c5fe..ef886d2067a51 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py
@@ -40,12 +40,18 @@ def test_gen_dropout_dygraph(self):
         gen.manual_seed(111111111)
         st = paddle.get_cuda_rng_state()
 
-        x = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
-        x_again = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
-        x_third = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
+        x = fluid.layers.uniform_random([2, 10],
+                                        dtype="float32",
+                                        min=0.0,
+                                        max=1.0)
+        x_again = fluid.layers.uniform_random([2, 10],
+                                              dtype="float32",
+                                              min=0.0,
+                                              max=1.0)
+        x_third = fluid.layers.uniform_random([2, 10],
+                                              dtype="float32",
+                                              min=0.0,
+                                              max=1.0)
         print("x: {}".format(x.numpy()))
         print("x_again: {}".format(x_again.numpy()))
         x = x + x_again + x_third
@@ -53,12 +59,18 @@ def test_gen_dropout_dygraph(self):
 
         paddle.set_cuda_rng_state(st)
 
-        x1 = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
-        x1_again = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
-        x1_third = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
+        x1 = fluid.layers.uniform_random([2, 10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
+        x1_again = fluid.layers.uniform_random([2, 10],
+                                               dtype="float32",
+                                               min=0.0,
+                                               max=1.0)
+        x1_third = fluid.layers.uniform_random([2, 10],
+                                               dtype="float32",
+                                               min=0.0,
+                                               max=1.0)
         x1 = x1 + x1_again + x1_third
         y1 = fluid.layers.dropout(x1, 0.5)
         y_np = y.numpy()
@@ -125,13 +137,13 @@ def test_gen_TruncatedNormal_initializer(self):
             result_1 = fluid.layers.fc(
                 input=x,
                 size=10,
-                param_attr=fluid.initializer.TruncatedNormal(
-                    loc=0.0, scale=2.0))
+                param_attr=fluid.initializer.TruncatedNormal(loc=0.0,
+                                                             scale=2.0))
             result_2 = fluid.layers.fc(
                 input=x,
                 size=10,
-                param_attr=fluid.initializer.TruncatedNormal(
-                    loc=0.0, scale=2.0))
+                param_attr=fluid.initializer.TruncatedNormal(loc=0.0,
+                                                             scale=2.0))
 
             exe = fluid.Executor(fluid.CPUPlace())
             exe.run(startup_program)
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py
index 30bc00c9d9427..8063331fe39c6 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class TestCurrentStream(unittest.TestCase):
+
     def test_current_stream(self):
         if paddle.is_compiled_with_cuda():
             s = cuda.current_stream()
@@ -38,6 +39,7 @@ def test_current_stream(self):
 
 
 class TestSynchronize(unittest.TestCase):
+
     def test_synchronize(self):
         if paddle.is_compiled_with_cuda():
             self.assertIsNone(cuda.synchronize())
@@ -48,6 +50,7 @@ def test_synchronize(self):
 
 
 class TestCUDAStream(unittest.TestCase):
+
     def test_cuda_stream(self):
         if paddle.is_compiled_with_cuda():
             s = paddle.device.cuda.Stream()
@@ -85,6 +88,7 @@ def test_cuda_stream_wait_event_and_record_event(self):
 
 
 class TestCUDAEvent(unittest.TestCase):
+
     def test_cuda_event(self):
         if paddle.is_compiled_with_cuda():
             e = paddle.device.cuda.Event(True, False, False)
@@ -158,6 +162,7 @@ def test_set_current_stream_raise_error(self):
 
 
 class TestRawStream(unittest.TestCase):
+
     def test_cuda_stream(self):
         if paddle.is_compiled_with_cuda():
             cuda_stream = paddle.device.cuda.current_stream().cuda_stream
diff --git a/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py b/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
index 2335293b22e7e..3b7093db39111 100644
--- a/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
+++ b/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
@@ -79,6 +79,7 @@ def non_cudnn_step(step_in, pre_hidden, gate_w, gate_b, candidate_w,
 
 
 class TestCudnnGRU(unittest.TestCase):
+
     def setUp(self):
         self.input_size = 100
         self.hidden_size = 200
@@ -115,8 +116,8 @@ def test_run(self):
             named_param_list[weight_ih_name].set_value(weight_ih)
 
             bias_ih = param_list[bias_ih_name].numpy()
-            bias_ih = np.random.uniform(
-                -0.1, 0.1, size=bias_ih.shape).astype('float64')
+            bias_ih = np.random.uniform(-0.1, 0.1,
+                                        size=bias_ih.shape).astype('float64')
             param_list[bias_ih_name].set_value(bias_ih)
             named_param_list[bias_ih_name].set_value(bias_ih)
 
@@ -127,15 +128,16 @@ def test_run(self):
             named_param_list[weight_hh_name].set_value(weight_hh)
 
             bias_hh = param_list[bias_hh_name].numpy()
-            bias_hh = np.random.uniform(
-                -0.1, 0.1, size=bias_hh.shape).astype('float64')
+            bias_hh = np.random.uniform(-0.1, 0.1,
+                                        size=bias_hh.shape).astype('float64')
             param_list[bias_hh_name].set_value(bias_hh)
             named_param_list[bias_hh_name].set_value(bias_hh)
 
-            step_input_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.input_size)).astype('float64')
-            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.hidden_size)).astype('float64')
+            step_input_np = np.random.uniform(
+                -0.1, 0.1, (self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(
+                -0.1, 0.1,
+                (self.batch_size, self.hidden_size)).astype('float64')
 
             step_input_var = fluid.dygraph.to_variable(step_input_np)
             pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
@@ -147,11 +149,11 @@ def test_run(self):
 
         self.assertTrue(np.allclose(api_out.numpy(), np_out, rtol=1e-5, atol=0))
         self.assertTrue(
-            np.allclose(
-                named_api_out.numpy(), np_out, rtol=1e-5, atol=0))
+            np.allclose(named_api_out.numpy(), np_out, rtol=1e-5, atol=0))
 
 
 class TestNonCudnnGRU(unittest.TestCase):
+
     def setUp(self):
         self.input_size = 100
         self.hidden_size = 200
@@ -167,14 +169,14 @@ def test_run(self):
         with fluid.dygraph.guard(place):
             param_attr = fluid.ParamAttr(name="param_attr")
             bias_attr = fluid.ParamAttr(name="bias_attr")
-            named_non_cudnn_gru = GRUCell(
-                self.hidden_size,
-                self.input_size,
-                param_attr,
-                bias_attr,
-                use_cudnn_impl=False)
-            non_cudnn_gru = GRUCell(
-                self.hidden_size, self.input_size, use_cudnn_impl=False)
+            named_non_cudnn_gru = GRUCell(self.hidden_size,
+                                          self.input_size,
+                                          param_attr,
+                                          bias_attr,
+                                          use_cudnn_impl=False)
+            non_cudnn_gru = GRUCell(self.hidden_size,
+                                    self.input_size,
+                                    use_cudnn_impl=False)
 
             param_list = non_cudnn_gru.state_dict()
             named_param_list = named_non_cudnn_gru.state_dict()
@@ -187,14 +189,14 @@ def test_run(self):
             candidate_b_name = "_candidate_bias"
 
             gate_w = param_list[gate_w_name].numpy()
-            gate_w = np.random.uniform(
-                -0.1, 0.1, size=gate_w.shape).astype('float64')
+            gate_w = np.random.uniform(-0.1, 0.1,
+                                       size=gate_w.shape).astype('float64')
             param_list[gate_w_name].set_value(gate_w)
             named_param_list[gate_w_name].set_value(gate_w)
 
             gate_b = param_list[gate_b_name].numpy()
-            gate_b = np.random.uniform(
-                -0.1, 0.1, size=gate_b.shape).astype('float64')
+            gate_b = np.random.uniform(-0.1, 0.1,
+                                       size=gate_b.shape).astype('float64')
             param_list[gate_b_name].set_value(gate_b)
             named_param_list[gate_b_name].set_value(gate_b)
 
@@ -210,10 +212,11 @@ def test_run(self):
             param_list[candidate_b_name].set_value(candidate_b)
             named_param_list[candidate_b_name].set_value(candidate_b)
 
-            step_input_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.input_size)).astype('float64')
-            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.hidden_size)).astype('float64')
+            step_input_np = np.random.uniform(
+                -0.1, 0.1, (self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(
+                -0.1, 0.1,
+                (self.batch_size, self.hidden_size)).astype('float64')
 
             step_input_var = fluid.dygraph.to_variable(step_input_np)
             pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
@@ -225,8 +228,7 @@ def test_run(self):
 
         self.assertTrue(np.allclose(api_out.numpy(), np_out, rtol=1e-5, atol=0))
         self.assertTrue(
-            np.allclose(
-                named_api_out.numpy(), np_out, rtol=1e-5, atol=0))
+            np.allclose(named_api_out.numpy(), np_out, rtol=1e-5, atol=0))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py b/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py
index ddba6bc69d25e..36b563a97c732 100644
--- a/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py
+++ b/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py
@@ -80,6 +80,7 @@ def cudnn_step(step_input_np, pre_hidden_np, pre_cell_np, weight_ih, bias_ih,
 
 
 class TestCudnnLSTM(unittest.TestCase):
+
     def setUp(self):
         self.input_size = 100
         self.hidden_size = 200
@@ -114,8 +115,8 @@ def test_run(self):
             named_param_list[weight_ih_name].set_value(weight_ih)
 
             bias_ih = param_list[bias_ih_name].numpy()
-            bias_ih = np.random.uniform(
-                -0.1, 0.1, size=bias_ih.shape).astype('float64')
+            bias_ih = np.random.uniform(-0.1, 0.1,
+                                        size=bias_ih.shape).astype('float64')
             param_list[bias_ih_name].set_value(bias_ih)
             named_param_list[bias_ih_name].set_value(bias_ih)
 
@@ -126,17 +127,19 @@ def test_run(self):
             named_param_list[weight_hh_name].set_value(weight_hh)
 
             bias_hh = param_list[bias_hh_name].numpy()
-            bias_hh = np.random.uniform(
-                -0.1, 0.1, size=bias_hh.shape).astype('float64')
+            bias_hh = np.random.uniform(-0.1, 0.1,
+                                        size=bias_hh.shape).astype('float64')
             param_list[bias_hh_name].set_value(bias_hh)
             named_param_list[bias_hh_name].set_value(bias_hh)
 
-            step_input_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.input_size)).astype('float64')
-            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.hidden_size)).astype('float64')
-            pre_cell_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.hidden_size)).astype('float64')
+            step_input_np = np.random.uniform(
+                -0.1, 0.1, (self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(
+                -0.1, 0.1,
+                (self.batch_size, self.hidden_size)).astype('float64')
+            pre_cell_np = np.random.uniform(
+                -0.1, 0.1,
+                (self.batch_size, self.hidden_size)).astype('float64')
 
             step_input_var = fluid.dygraph.to_variable(step_input_np)
             pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
@@ -150,27 +153,34 @@ def test_run(self):
             named_api_hidden_out = named_api_out[0]
             named_api_cell_out = named_api_out[1]
 
-            np_hidden_out, np_cell_out = cudnn_step(
-                step_input_np, pre_hidden_np, pre_cell_np, weight_ih, bias_ih,
-                weight_hh, bias_hh)
+            np_hidden_out, np_cell_out = cudnn_step(step_input_np,
+                                                    pre_hidden_np, pre_cell_np,
+                                                    weight_ih, bias_ih,
+                                                    weight_hh, bias_hh)
             self.assertTrue(
-                np.allclose(
-                    api_hidden_out.numpy(), np_hidden_out, rtol=1e-5, atol=0))
+                np.allclose(api_hidden_out.numpy(),
+                            np_hidden_out,
+                            rtol=1e-5,
+                            atol=0))
             self.assertTrue(
-                np.allclose(
-                    api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+                np.allclose(api_cell_out.numpy(),
+                            np_cell_out,
+                            rtol=1e-5,
+                            atol=0))
             self.assertTrue(
-                np.allclose(
-                    named_api_hidden_out.numpy(),
-                    np_hidden_out,
-                    rtol=1e-5,
-                    atol=0))
+                np.allclose(named_api_hidden_out.numpy(),
+                            np_hidden_out,
+                            rtol=1e-5,
+                            atol=0))
             self.assertTrue(
-                np.allclose(
-                    named_api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+                np.allclose(named_api_cell_out.numpy(),
+                            np_cell_out,
+                            rtol=1e-5,
+                            atol=0))
 
 
 class TestNonCudnnLSTM(unittest.TestCase):
+
     def setUp(self):
         self.input_size = 100
         self.hidden_size = 200
@@ -185,14 +195,14 @@ def test_run(self):
         with fluid.dygraph.guard(place):
             param_attr = fluid.ParamAttr(name="param_attr")
             bias_attr = fluid.ParamAttr(name="bias_attr")
-            named_cudnn_lstm = LSTMCell(
-                self.hidden_size,
-                self.input_size,
-                param_attr,
-                bias_attr,
-                use_cudnn_impl=False)
-            cudnn_lstm = LSTMCell(
-                self.hidden_size, self.input_size, use_cudnn_impl=False)
+            named_cudnn_lstm = LSTMCell(self.hidden_size,
+                                        self.input_size,
+                                        param_attr,
+                                        bias_attr,
+                                        use_cudnn_impl=False)
+            cudnn_lstm = LSTMCell(self.hidden_size,
+                                  self.input_size,
+                                  use_cudnn_impl=False)
 
             param_list = cudnn_lstm.state_dict()
             named_param_list = named_cudnn_lstm.state_dict()
@@ -203,23 +213,25 @@ def test_run(self):
             gate_b_name = "_bias"
 
             gate_w = param_list[gate_w_name].numpy()
-            gate_w = np.random.uniform(
-                -0.1, 0.1, size=gate_w.shape).astype('float64')
+            gate_w = np.random.uniform(-0.1, 0.1,
+                                       size=gate_w.shape).astype('float64')
             param_list[gate_w_name].set_value(gate_w)
             named_param_list[gate_w_name].set_value(gate_w)
 
             gate_b = param_list[gate_b_name].numpy()
-            gate_b = np.random.uniform(
-                -0.1, 0.1, size=gate_b.shape).astype('float64')
+            gate_b = np.random.uniform(-0.1, 0.1,
+                                       size=gate_b.shape).astype('float64')
             param_list[gate_b_name].set_value(gate_b)
             named_param_list[gate_b_name].set_value(gate_b)
 
-            step_input_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.input_size)).astype('float64')
-            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.hidden_size)).astype('float64')
-            pre_cell_np = np.random.uniform(-0.1, 0.1, (
-                self.batch_size, self.hidden_size)).astype('float64')
+            step_input_np = np.random.uniform(
+                -0.1, 0.1, (self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(
+                -0.1, 0.1,
+                (self.batch_size, self.hidden_size)).astype('float64')
+            pre_cell_np = np.random.uniform(
+                -0.1, 0.1,
+                (self.batch_size, self.hidden_size)).astype('float64')
 
             step_input_var = fluid.dygraph.to_variable(step_input_np)
             pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
@@ -233,24 +245,31 @@ def test_run(self):
             named_api_hidden_out = named_api_out[0]
             named_api_cell_out = named_api_out[1]
 
-            np_hidden_out, np_cell_out = non_cudnn_step(
-                step_input_np, pre_hidden_np, pre_cell_np, gate_w, gate_b)
+            np_hidden_out, np_cell_out = non_cudnn_step(step_input_np,
+                                                        pre_hidden_np,
+                                                        pre_cell_np, gate_w,
+                                                        gate_b)
 
             self.assertTrue(
-                np.allclose(
-                    api_hidden_out.numpy(), np_hidden_out, rtol=1e-5, atol=0))
+                np.allclose(api_hidden_out.numpy(),
+                            np_hidden_out,
+                            rtol=1e-5,
+                            atol=0))
             self.assertTrue(
-                np.allclose(
-                    api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+                np.allclose(api_cell_out.numpy(),
+                            np_cell_out,
+                            rtol=1e-5,
+                            atol=0))
             self.assertTrue(
-                np.allclose(
-                    named_api_hidden_out.numpy(),
-                    np_hidden_out,
-                    rtol=1e-5,
-                    atol=0))
+                np.allclose(named_api_hidden_out.numpy(),
+                            np_hidden_out,
+                            rtol=1e-5,
+                            atol=0))
             self.assertTrue(
-                np.allclose(
-                    named_api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+                np.allclose(named_api_cell_out.numpy(),
+                            np_cell_out,
+                            rtol=1e-5,
+                            atol=0))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_cumprod_op.py b/python/paddle/fluid/tests/unittests/test_cumprod_op.py
index 681b8d6cc0bdf..66b4a60197344 100644
--- a/python/paddle/fluid/tests/unittests/test_cumprod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumprod_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -61,6 +61,7 @@ def cumprod_grad(x, y, dy, dx, shape, dim):
 
 # test function.
 class TestCumprod(OpTest):
+
     def init_params(self):
         self.shape = (2, 3, 4, 5)
         self.zero_nums = [0, 10, 20, 30, int(np.prod(self.shape))]
@@ -122,34 +123,37 @@ def test_check_grad(self):
                 if self.dtype == np.float64:
                     self.check_grad(['X'], 'Out', check_eager=True)
                 else:
-                    self.check_grad(
-                        ['X'],
-                        'Out',
-                        user_defined_grads=[self.grad_x],
-                        user_defined_grad_outputs=[self.grad_out],
-                        check_eager=True)
+                    self.check_grad(['X'],
+                                    'Out',
+                                    user_defined_grads=[self.grad_x],
+                                    user_defined_grad_outputs=[self.grad_out],
+                                    check_eager=True)
 
 
 # test float32 case.
 class TestCumprod_float32(TestCumprod):
+
     def init_dtype(self):
         self.dtype = np.float32
 
 
 # test complex64 case.
 class TestCumprod_complex64(TestCumprod):
+
     def init_dtype(self):
         self.dtype = np.complex64
 
 
 # test complex128 case.
 class TestCumprod_complex128(TestCumprod):
+
     def init_dtype(self):
         self.dtype = np.complex128
 
 
 # test api.
 class TestCumprodAPI(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = 'float64'
         self.shape = [2, 3, 10, 10]
@@ -182,6 +186,7 @@ def run(place):
 
     # test dynamic graph api.
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             x = paddle.to_tensor(self.x)
diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
index 818e15bb319b1..7e11ad647d963 100644
--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
@@ -24,6 +24,7 @@
 
 
 class TestCumsumOp(unittest.TestCase):
+
     def run_cases(self):
         data_np = np.arange(12).reshape(3, 4)
         data = paddle.to_tensor(data_np)
@@ -105,6 +106,7 @@ def test_name(self):
 
 
 class TestSumOp1(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 2}
@@ -119,14 +121,14 @@ def test_check_grad(self):
 
 
 class TestSumOp2(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': -1, 'reverse': True}
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
         self.outputs = {
-            'Out': np.flip(
-                np.flip(
-                    self.inputs['X'], axis=2).cumsum(axis=2), axis=2)
+            'Out': np.flip(np.flip(self.inputs['X'], axis=2).cumsum(axis=2),
+                           axis=2)
         }
 
     def test_check_output(self):
@@ -137,6 +139,7 @@ def test_check_grad(self):
 
 
 class TestSumOp3(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 1}
@@ -151,6 +154,7 @@ def test_check_grad(self):
 
 
 class TestSumOp4(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 0}
@@ -165,6 +169,7 @@ def test_check_grad(self):
 
 
 class TestSumOp5(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.inputs = {'X': np.random.random((5, 20)).astype("float64")}
@@ -178,6 +183,7 @@ def test_check_grad(self):
 
 
 class TestSumOp7(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.inputs = {'X': np.random.random((100)).astype("float64")}
@@ -191,16 +197,17 @@ def test_check_grad(self):
 
 
 class TestSumOpExclusive1(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 65)).astype("float64")
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
     def test_check_output(self):
@@ -208,16 +215,17 @@ def test_check_output(self):
 
 
 class TestSumOpExclusive2(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((1, 1, 888)).astype("float64")
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (1, 1, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (1, 1, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
     def test_check_output(self):
@@ -225,16 +233,17 @@ def test_check_output(self):
 
 
 class TestSumOpExclusive3(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 888)).astype("float32")
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
     def test_check_output(self):
@@ -242,16 +251,17 @@ def test_check_output(self):
 
 
 class TestSumOpExclusive4(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((1, 1, 3049)).astype("float64")
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (1, 1, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (1, 1, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
     def test_check_output(self):
@@ -259,16 +269,17 @@ def test_check_output(self):
 
 
 class TestSumOpExclusive5(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 2, "exclusive": True}
         a = np.random.random((4, 5, 3096)).astype("float64")
         self.inputs = {'X': a}
         self.outputs = {
-            'Out': np.concatenate(
-                (np.zeros(
-                    (4, 5, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
-                axis=2)
+            'Out':
+            np.concatenate((np.zeros(
+                (4, 5, 1), dtype=np.float64), a[:, :, :-1].cumsum(axis=2)),
+                           axis=2)
         }
 
     def test_check_output(self):
@@ -276,6 +287,7 @@ def test_check_output(self):
 
 
 class TestSumOpReverseExclusive(OpTest):
+
     def setUp(self):
         self.op_type = "cumsum"
         self.attrs = {'axis': 2, 'reverse': True, "exclusive": True}
@@ -283,10 +295,10 @@ def setUp(self):
         self.inputs = {'X': a}
         a = np.flip(a, axis=2)
         self.outputs = {
-            'Out': np.concatenate(
-                (np.flip(
-                    a[:, :, :-1].cumsum(axis=2), axis=2), np.zeros(
-                        (4, 5, 1), dtype=np.float64)),
+            'Out':
+            np.concatenate(
+                (np.flip(a[:, :, :-1].cumsum(axis=2),
+                         axis=2), np.zeros((4, 5, 1), dtype=np.float64)),
                 axis=2)
         }
 
@@ -295,6 +307,7 @@ def test_check_output(self):
 
 
 class BadInputTest(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
index 83a25b71626e1..2d12243de52c0 100644
--- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
+++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class TestTensorBackward(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
@@ -56,6 +57,7 @@ def test_tensor_backward(self):
 
 
 class TestBackwardAPI(unittest.TestCase):
+
     def setUp(self):
         self._dtypes = ["float32", "float64"]
         self._places = [paddle.CPUPlace()]
@@ -138,9 +140,15 @@ def test_backward_none_grad_tensor(self):
 
     def func_backward_accumulator_with_init_grad(self):
         for dtype in self._dtypes:
-            x = np.random.random([10, ]).astype(dtype)
-            y_grad = np.random.random([10, ]).astype(dtype)
-            z_grad = np.random.random([10, ]).astype(dtype)
+            x = np.random.random([
+                10,
+            ]).astype(dtype)
+            y_grad = np.random.random([
+                10,
+            ]).astype(dtype)
+            z_grad = np.random.random([
+                10,
+            ]).astype(dtype)
             self._places = [paddle.CPUPlace()]
             for place in self._places:
                 with dg.guard(place):
diff --git a/python/paddle/fluid/tests/unittests/test_cvm_op.py b/python/paddle/fluid/tests/unittests/test_cvm_op.py
index 276d00bb2bfcf..a1db1a0c6b4fb 100644
--- a/python/paddle/fluid/tests/unittests/test_cvm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cvm_op.py
@@ -66,9 +66,10 @@ def setUp(self):
         lod = [[1]]
         self.inputs = {
             'X': (np.random.uniform(
-                0, 1, [self.batch_size, self.item_width]).astype("float32"),
-                  lod),
-            'CVM': np.array([[0.6, 0.4]]).astype("float32"),
+                0, 1,
+                [self.batch_size, self.item_width]).astype("float32"), lod),
+            'CVM':
+            np.array([[0.6, 0.4]]).astype("float32"),
         }
         self.attrs = {'use_cvm': False}
         out = []
@@ -85,8 +86,10 @@ def test_check_grad(self):
                 (self.batch_size, self.item_width)).astype("float32")
         user_grads[:, :2] = self.inputs['CVM'].reshape(self.batch_size, 2)
         user_grads = [user_grads]
-        self.check_grad(
-            ['X'], 'Y', user_defined_grads=user_grads, check_dygraph=False)
+        self.check_grad(['X'],
+                        'Y',
+                        user_defined_grads=user_grads,
+                        check_dygraph=False)
 
 
 class TestCVMOpWithOutLodTensor1(OpTest):
@@ -120,8 +123,10 @@ def test_check_grad(self):
             (self.batch_size, self.item_width)).astype("float32")
         user_grads[:, :2] = self.inputs['CVM'].reshape(self.batch_size, 2)
         user_grads = [user_grads]
-        self.check_grad(
-            ['X'], 'Y', user_defined_grads=user_grads, check_dygraph=False)
+        self.check_grad(['X'],
+                        'Y',
+                        user_defined_grads=user_grads,
+                        check_dygraph=False)
 
 
 class TestCVMOpWithOutLodTensor2(OpTest):
@@ -156,8 +161,10 @@ def test_check_grad(self):
                 (self.batch_size, self.item_width)).astype("float32")
         user_grads[:, :2] = self.inputs['CVM'].reshape(self.batch_size, 2)
         user_grads = [user_grads]
-        self.check_grad(
-            ['X'], 'Y', user_defined_grads=user_grads, check_dygraph=False)
+        self.check_grad(['X'],
+                        'Y',
+                        user_defined_grads=user_grads,
+                        check_dygraph=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py b/python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py
index 01a588c4058a4..e014a25ab791b 100644
--- a/python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py
@@ -17,6 +17,7 @@
 
 
 class TestCifar10(unittest.TestCase):
+
     def test_main(self):
         reader = paddle.dataset.cifar.train10(cycle=False)
         sample_num = 0
diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py
index 98739f6e1631e..ebbcab6d9dc62 100644
--- a/python/paddle/fluid/tests/unittests/test_data.py
+++ b/python/paddle/fluid/tests/unittests/test_data.py
@@ -24,6 +24,7 @@
 
 
 class TestApiDataError(unittest.TestCase):
+
     def test_fluid_data(self):
         with program_guard(Program(), Program()):
 
@@ -56,6 +57,7 @@ def test_shape_type():
 
 
 class TestApiStaticDataError(unittest.TestCase):
+
     def test_fluid_dtype(self):
         with program_guard(Program(), Program()):
             x1 = paddle.static.data(name="x1", shape=[2, 25])
@@ -100,14 +102,19 @@ def test_shape_type():
 
 
 class TestApiErrorWithDynamicMode(unittest.TestCase):
+
     def test_error(self):
         with program_guard(Program(), Program()):
             paddle.disable_static()
             self.assertRaises(AssertionError, fluid.data, 'a', [2, 25])
-            self.assertRaises(
-                AssertionError, fluid.layers.data, 'b', shape=[2, 25])
-            self.assertRaises(
-                AssertionError, paddle.static.data, 'c', shape=[2, 25])
+            self.assertRaises(AssertionError,
+                              fluid.layers.data,
+                              'b',
+                              shape=[2, 25])
+            self.assertRaises(AssertionError,
+                              paddle.static.data,
+                              'c',
+                              shape=[2, 25])
             paddle.enable_static()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_data_generator.py b/python/paddle/fluid/tests/unittests/test_data_generator.py
index 69d8e01fd464a..62e8f607367af 100644
--- a/python/paddle/fluid/tests/unittests/test_data_generator.py
+++ b/python/paddle/fluid/tests/unittests/test_data_generator.py
@@ -19,7 +19,9 @@
 
 
 class MyMultiSlotDataGenerator(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -30,7 +32,9 @@ def data_iter():
 
 
 class MyMultiSlotStringDataGenerator(fleet.MultiSlotStringDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -41,7 +45,9 @@ def data_iter():
 
 
 class MyMultiSlotDataGenerator_error(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -52,7 +58,9 @@ def data_iter():
 
 
 class MyMultiSlotDataGenerator_error_2(fleet.MultiSlotStringDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -63,7 +71,9 @@ def data_iter():
 
 
 class MyMultiSlotDataGenerator_error_3(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -74,7 +84,9 @@ def data_iter():
 
 
 class MyMultiSlotDataGenerator_error_4(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -85,7 +97,9 @@ def data_iter():
 
 
 class MyMultiSlotDataGenerator_error_5(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -96,7 +110,9 @@ def data_iter():
 
 
 class MyMultiSlotStringDataGenerator_zip(fleet.MultiSlotStringDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -109,7 +125,9 @@ def data_iter():
 
 
 class MyMultiSlotDataGenerator_zip(fleet.MultiSlotDataGenerator):
+
     def generate_sample(self, line):
+
         def data_iter():
             for i in range(40):
                 if i == 1:
@@ -122,6 +140,7 @@ def data_iter():
 
 
 class TestMultiSlotDataGenerator(unittest.TestCase):
+
     def test_MultiSlotDataGenerator_basic(self):
         my_ms_dg = MyMultiSlotDataGenerator()
         my_ms_dg.set_batch(1)
@@ -129,6 +148,7 @@ def test_MultiSlotDataGenerator_basic(self):
 
 
 class TestMultiSlotStringDataGenerator(unittest.TestCase):
+
     def test_MyMultiSlotStringDataGenerator_basic(self):
         my_ms_dg = MyMultiSlotStringDataGenerator()
         my_ms_dg.set_batch(1)
@@ -136,6 +156,7 @@ def test_MyMultiSlotStringDataGenerator_basic(self):
 
 
 class TestMultiSlotDataGenerator_error(unittest.TestCase):
+
     def test_MultiSlotDataGenerator_error(self):
         with self.assertRaises(ValueError):
             my_ms_dg = MyMultiSlotDataGenerator_error()
@@ -144,6 +165,7 @@ def test_MultiSlotDataGenerator_error(self):
 
 
 class TestMultiSlotDataGenerator_error_2(unittest.TestCase):
+
     def test_MultiSlotDataGenerator_error(self):
         with self.assertRaises(ValueError):
             my_ms_dg = MyMultiSlotDataGenerator_error_2()
@@ -152,6 +174,7 @@ def test_MultiSlotDataGenerator_error(self):
 
 
 class TestMultiSlotDataGenerator_error_3(unittest.TestCase):
+
     def test_MultiSlotDataGenerator_error(self):
         with self.assertRaises(ValueError):
             my_ms_dg = MyMultiSlotDataGenerator_error_3()
@@ -160,6 +183,7 @@ def test_MultiSlotDataGenerator_error(self):
 
 
 class TestMultiSlotDataGenerator_error_4(unittest.TestCase):
+
     def test_MultiSlotDataGenerator_error(self):
         with self.assertRaises(ValueError):
             my_ms_dg = MyMultiSlotDataGenerator_error_4()
@@ -168,6 +192,7 @@ def test_MultiSlotDataGenerator_error(self):
 
 
 class TestMultiSlotDataGenerator_error_5(unittest.TestCase):
+
     def test_MultiSlotDataGenerator_error(self):
         with self.assertRaises(ValueError):
             my_ms_dg = MyMultiSlotDataGenerator_error_5()
@@ -176,6 +201,7 @@ def test_MultiSlotDataGenerator_error(self):
 
 
 class TestMultiSlotStringDataGeneratorZip(unittest.TestCase):
+
     def test_MultiSlotStringDataGenerator_zip(self):
         my_ms_dg = MyMultiSlotStringDataGenerator_zip()
         my_ms_dg.set_batch(1)
@@ -183,6 +209,7 @@ def test_MultiSlotStringDataGenerator_zip(self):
 
 
 class TestMultiSlotDataGeneratorZip(unittest.TestCase):
+
     def test_MultiSlotDataGenerator_zip(self):
         my_ms_dg = MyMultiSlotDataGenerator_zip()
         my_ms_dg.set_batch(1)
diff --git a/python/paddle/fluid/tests/unittests/test_data_norm_op.py b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
index cefef9ff9183e..650ca5ca1341e 100644
--- a/python/paddle/fluid/tests/unittests/test_data_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
@@ -43,8 +43,8 @@ def _reference_testing(x, batch_size, batch_sum, batch_square_sum, slot_dim=-1):
             for j in range(0, x_shape[1], slot_dim):
                 if x[i][j] <= -min_precision or x[i][j] >= min_precision:
                     for k in range(0, slot_dim):
-                        y[i][j + k] = (
-                            x[i][j + k] - means_arr[j + k]) * scales_arr[j + k]
+                        y[i][j + k] = (x[i][j + k] -
+                                       means_arr[j + k]) * scales_arr[j + k]
     return y
 
 
@@ -120,11 +120,11 @@ def check_with_place(self,
                                         OpTest.np_dtype_to_fluid_dtype(x_val),
                                         place)
         batch_size_tensor = create_or_get_tensor(
-            scope, "batch_size",
-            OpTest.np_dtype_to_fluid_dtype(batch_size), place)
+            scope, "batch_size", OpTest.np_dtype_to_fluid_dtype(batch_size),
+            place)
         batch_sum_tensor = create_or_get_tensor(
-            scope, "batch_sum",
-            OpTest.np_dtype_to_fluid_dtype(batch_sum), place)
+            scope, "batch_sum", OpTest.np_dtype_to_fluid_dtype(batch_sum),
+            place)
         batch_square_sum_tensor = create_or_get_tensor(
             scope, "batch_square_sum",
             OpTest.np_dtype_to_fluid_dtype(batch_square_sum), place)
@@ -155,8 +155,8 @@ def check_with_place(self,
             scale_w = np.ones(scale_shape).astype(np.float32)
             bias = np.zeros(scale_shape).astype(np.float32)
             scale_w_tensor = create_or_get_tensor(
-                scope, "scale_w",
-                OpTest.np_dtype_to_fluid_dtype(scale_w), place)
+                scope, "scale_w", OpTest.np_dtype_to_fluid_dtype(scale_w),
+                place)
             bias_tensor = create_or_get_tensor(
                 scope, "bias", OpTest.np_dtype_to_fluid_dtype(bias), place)
             data_norm_op = Operator(
@@ -181,13 +181,12 @@ def check_with_place(self,
         data_norm_op.run(scope, place)
 
         # check inference result
-        self.__assert_close(
-            y_tensor,
-            y_out,
-            "inference output are different at " + str(place) + ", " +
-            data_layout + ", " + str(np.dtype(dtype)) +
-            str(np.array(y_tensor)) + str(y_out),
-            atol=1e-3)
+        self.__assert_close(y_tensor,
+                            y_out,
+                            "inference output are different at " + str(place) +
+                            ", " + data_layout + ", " + str(np.dtype(dtype)) +
+                            str(np.array(y_tensor)) + str(y_out),
+                            atol=1e-3)
 
     def test_check_output(self):
         """
@@ -495,12 +494,14 @@ def test_check_grad(self):
 
 
 class TestDataNormOpErrorr(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x2 = fluid.layers.data(name='x2', shape=[3, 4], dtype="int32")
             #self.assertRaises(TypeError, fluid.data_norm, x2)
-            fluid.layers.data_norm(
-                input=x2, param_attr={}, enable_scale_and_shift=True)
+            fluid.layers.data_norm(input=x2,
+                                   param_attr={},
+                                   enable_scale_and_shift=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py b/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py
index 7348783bd6748..a12ccf79dd2e9 100755
--- a/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py
@@ -26,6 +26,7 @@
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, num_samples):
         self.num_samples = num_samples
 
@@ -39,6 +40,7 @@ def __len__(self):
 
 
 class SimpleNet(nn.Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.fc = nn.Linear(10, 10)
@@ -48,6 +50,7 @@ def forward(self, image):
 
 
 class TestAutoTune(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 1
         self.dataset = RandomDataset(10)
@@ -58,8 +61,9 @@ def test_dataloader_use_autotune(self):
                 "enable": True,
                 "tuning_steps": 1,
             }})
-        loader = DataLoader(
-            self.dataset, batch_size=self.batch_size, num_workers=0)
+        loader = DataLoader(self.dataset,
+                            batch_size=self.batch_size,
+                            num_workers=0)
 
     def test_dataloader_disable_autotune(self):
         config = {"dataloader": {"enable": False, "tuning_steps": 1}}
@@ -68,8 +72,9 @@ def test_dataloader_disable_autotune(self):
         tfile.close()
         paddle.incubate.autotune.set_config(tfile.name)
         os.remove(tfile.name)
-        loader = DataLoader(
-            self.dataset, batch_size=self.batch_size, num_workers=2)
+        loader = DataLoader(self.dataset,
+                            batch_size=self.batch_size,
+                            num_workers=2)
         if (sys.platform == 'darwin' or sys.platform == 'win32'):
             self.assertEqual(loader.num_workers, 0)
         else:
@@ -83,11 +88,13 @@ def test_distributer_batch_sampler_autotune(self):
             }})
         batch_sampler = paddle.io.DistributedBatchSampler(
             self.dataset, batch_size=self.batch_size)
-        loader = DataLoader(
-            self.dataset, batch_sampler=batch_sampler, num_workers=2)
+        loader = DataLoader(self.dataset,
+                            batch_sampler=batch_sampler,
+                            num_workers=2)
 
 
 class TestAutoTuneAPI(unittest.TestCase):
+
     def test_set_config_warnings(self):
         with warnings.catch_warnings(record=True) as w:
             config = {"kernel": {"enable": 1, "tuning_range": True}}
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
index 786d04272e3eb..81e52d5175d72 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
@@ -26,6 +26,7 @@
 
 
 class TestDatasetAbstract(unittest.TestCase):
+
     def func_test_main(self):
         dataset = Dataset()
         try:
@@ -47,15 +48,20 @@ def test_main(self):
 
 
 class TestDatasetWithDiffOutputPlace(unittest.TestCase):
+
     def get_dataloader(self, num_workers):
         dataset = paddle.vision.datasets.MNIST(
             mode='test',
             transform=transforms.Compose([
-                transforms.CenterCrop(20), transforms.RandomResizedCrop(14),
-                transforms.Normalize(), transforms.ToTensor()
+                transforms.CenterCrop(20),
+                transforms.RandomResizedCrop(14),
+                transforms.Normalize(),
+                transforms.ToTensor()
             ]))
-        loader = paddle.io.DataLoader(
-            dataset, batch_size=32, num_workers=num_workers, shuffle=True)
+        loader = paddle.io.DataLoader(dataset,
+                                      batch_size=32,
+                                      num_workers=num_workers,
+                                      shuffle=True)
         return loader
 
     def run_check_on_cpu(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py b/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py
index 02501d51c4975..82f92bd633e9e 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py
@@ -26,6 +26,7 @@ def infinite_reader():
 
 
 class TestDataLoaderEarlyReset(unittest.TestCase):
+
     def setUp(self):
         self.stop_batch = 10
         self.iterable = True
@@ -45,8 +46,9 @@ def get_place(self):
 
     def create_data_loader(self):
         self.x = fluid.data(name='x', shape=[None, 32], dtype='float32')
-        return fluid.io.DataLoader.from_generator(
-            feed_list=[self.x], capacity=10, iterable=self.iterable)
+        return fluid.io.DataLoader.from_generator(feed_list=[self.x],
+                                                  capacity=10,
+                                                  iterable=self.iterable)
 
     def test_main(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
@@ -88,6 +90,7 @@ def run_network(self):
 
 
 class TestDataLoaderEarlyReset2(TestDataLoaderEarlyReset):
+
     def setUp(self):
         self.stop_batch = 20
         self.iterable = False
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
index 6e8ee5589db77..6f18c87623318 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
@@ -20,6 +20,7 @@
 
 
 def create_reader(shape, batch_number):
+
     def __impl__():
         idx = 0
         for _ in six.moves.range(batch_number):
@@ -30,6 +31,7 @@ def __impl__():
 
 
 class DataLoaderKeepOrderTestBase(unittest.TestCase):
+
     def initParameters(self):
         self.iterable = False
         self.break_num = 100
@@ -42,15 +44,15 @@ def setUp(self):
 
     def build_network(self, places):
         input_data = fluid.data(shape=self.shape, dtype='float32', name="input")
-        loader = fluid.io.DataLoader.from_generator(
-            capacity=16, feed_list=[input_data], iterable=self.iterable)
+        loader = fluid.io.DataLoader.from_generator(capacity=16,
+                                                    feed_list=[input_data],
+                                                    iterable=self.iterable)
 
         fc = fluid.layers.fc(input_data, size=10)
         loss = fluid.layers.reduce_mean(fc)
 
-        loader.set_batch_generator(
-            create_reader(self.shape, self.batch_num),
-            places=places if loader.iterable else None)
+        loader.set_batch_generator(create_reader(self.shape, self.batch_num),
+                                   places=places if loader.iterable else None)
 
         return input_data, loss, loader
 
@@ -64,9 +66,8 @@ def assertInputData(self, batch_id, input_data, dev_cnt):
                 self.assertTrue((input_tensor == start_val).all())
                 start_val += 1
         else:
-            self.assertEqual(
-                list(input_data.shape),
-                [self.shape[0] * dev_cnt] + self.shape[1:])
+            self.assertEqual(list(input_data.shape),
+                             [self.shape[0] * dev_cnt] + self.shape[1:])
             start_val = dev_cnt * batch_id
             for idx in six.moves.range(dev_cnt):
                 data_part = input_data[idx * self.shape[0]:(idx + 1) *
@@ -81,7 +82,8 @@ def get_places(self):
                 place_list.extend([fluid.cuda_places(0)])
             else:
                 place_list.extend(
-                    [fluid.cuda_places(0), fluid.cuda_places([0, 1])])
+                    [fluid.cuda_places(0),
+                     fluid.cuda_places([0, 1])])
         return place_list
 
     def test_main(self):
@@ -106,8 +108,8 @@ def run_main_with_place(self, places, use_compiled_program=True):
                 main_program = fluid.default_main_program()
                 if use_compiled_program:
                     main_program = fluid.CompiledProgram(
-                        main_program).with_data_parallel(
-                            loss_name=loss.name, places=places)
+                        main_program).with_data_parallel(loss_name=loss.name,
+                                                         places=places)
 
                 max_batch_num = min(self.break_num,
                                     int(self.batch_num / dev_cnt))
@@ -153,30 +155,35 @@ def run_main_with_place(self, places, use_compiled_program=True):
 
 
 class IterableDataLoaderKeepOrderTest2(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = True
         self.break_num = 100
 
 
 class IterableDataLoaderKeepOrderTest3(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = False
         self.break_num = 2
 
 
 class IterableDataLoaderKeepOrderTest4(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = True
         self.break_num = 2
 
 
 class IterableDataLoaderKeepOrderTest5(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = False
         self.break_num = 0
 
 
 class IterableDataLoaderKeepOrderTest6(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = True
         self.break_num = 0
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py b/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py
index f779d762fb302..94cc701b598eb 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_unkeep_order.py
@@ -23,6 +23,7 @@
 
 
 def create_reader(shape, batch_number):
+
     def __impl__():
         idx = 0
         for _ in six.moves.range(batch_number):
@@ -33,6 +34,7 @@ def __impl__():
 
 
 class DataLoaderKeepOrderTestBase(unittest.TestCase):
+
     def initParameters(self):
         self.iterable = False
         self.break_num = 10000
@@ -48,19 +50,22 @@ def clear_visited(self):
 
     def build_network(self, places):
         input_data = fluid.data(shape=self.shape, dtype='float32', name="input")
-        loader = fluid.io.DataLoader.from_generator(
-            capacity=16, feed_list=[input_data], iterable=self.iterable)
+        loader = fluid.io.DataLoader.from_generator(capacity=16,
+                                                    feed_list=[input_data],
+                                                    iterable=self.iterable)
 
         fc = fluid.layers.fc(input_data, size=10)
         loss = fluid.layers.reduce_mean(fc)
 
-        loader.set_batch_generator(
-            create_reader(self.shape, self.batch_num),
-            places=places if loader.iterable else None)
+        loader.set_batch_generator(create_reader(self.shape, self.batch_num),
+                                   places=places if loader.iterable else None)
 
         return input_data, loss, loader
 
-    def assertInputData(self, batch_id, input_data, dev_cnt,
+    def assertInputData(self,
+                        batch_id,
+                        input_data,
+                        dev_cnt,
                         check_visited=True):
         if isinstance(input_data, list):
             self.assertTrue(len(input_data), dev_cnt)
@@ -78,9 +83,8 @@ def assertInputData(self, batch_id, input_data, dev_cnt,
 
                 start_val += 1
         else:
-            self.assertEqual(
-                list(input_data.shape),
-                [self.shape[0] * dev_cnt] + self.shape[1:])
+            self.assertEqual(list(input_data.shape),
+                             [self.shape[0] * dev_cnt] + self.shape[1:])
             start_val = dev_cnt * batch_id
             for idx in six.moves.range(dev_cnt):
                 data_part = input_data[idx * self.shape[0]:(idx + 1) *
@@ -100,7 +104,8 @@ def get_places(self):
                 place_list.extend([fluid.cuda_places(0)])
             else:
                 place_list.extend(
-                    [fluid.cuda_places(0), fluid.cuda_places([0, 1])])
+                    [fluid.cuda_places(0),
+                     fluid.cuda_places([0, 1])])
         return place_list
 
     def test_main(self):
@@ -125,8 +130,8 @@ def run_main_with_place(self, places, use_compiled_program=True):
                 main_program = fluid.default_main_program()
                 if use_compiled_program:
                     main_program = fluid.CompiledProgram(
-                        main_program).with_data_parallel(
-                            loss_name=loss.name, places=places)
+                        main_program).with_data_parallel(loss_name=loss.name,
+                                                         places=places)
 
                 max_batch_num = min(self.break_num,
                                     int(self.batch_num / dev_cnt))
@@ -141,8 +146,10 @@ def run_main_with_place(self, places, use_compiled_program=True):
                             if batch_id >= self.break_num:
                                 early_break = True
                                 break
-                            self.assertInputData(
-                                batch_id, data, dev_cnt, check_visited=False)
+                            self.assertInputData(batch_id,
+                                                 data,
+                                                 dev_cnt,
+                                                 check_visited=False)
                             fetch_val, = exe.run(program=main_program,
                                                  feed=data,
                                                  fetch_list=fetch_list)
@@ -181,30 +188,35 @@ def run_main_with_place(self, places, use_compiled_program=True):
 
 
 class IterableDataLoaderKeepOrderTest2(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = True
         self.break_num = 10000
 
 
 class IterableDataLoaderKeepOrderTest3(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = False
         self.break_num = 2
 
 
 class IterableDataLoaderKeepOrderTest4(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = True
         self.break_num = 2
 
 
 class IterableDataLoaderKeepOrderTest5(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = False
         self.break_num = 0
 
 
 class IterableDataLoaderKeepOrderTest6(DataLoaderKeepOrderTestBase):
+
     def initParameters(self):
         self.iterable = True
         self.break_num = 0
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index 5ef5a1016cc8b..e31baf9fe2e70 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -103,19 +103,22 @@ def test_run_with_dump(self):
         slots = ["slot1", "slot2", "slot3", "slot4"]
         slots_vars = []
         for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
+            var = fluid.layers.data(name=slot,
+                                    shape=[1],
+                                    dtype="int64",
+                                    lod_level=1)
             slots_vars.append(var)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=3,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset.update_settings(pipe_command="cat1")
-        dataset._init_distributed_settings(
-            parse_ins_id=True,
-            parse_content=True,
-            fea_eval=True,
-            candidate_size=10000)
+        dataset._init_distributed_settings(parse_ins_id=True,
+                                           parse_content=True,
+                                           fea_eval=True,
+                                           candidate_size=10000)
         dataset.set_filelist([dump_a_path, dump_b_path])
         dataset.load_into_memory()
         dataset.local_shuffle()
@@ -186,17 +189,18 @@ def test_set_download_cmd(self):
         slots = ["slot1", "slot2", "slot3", "slot4"]
         slots_vars = []
         for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
+            var = fluid.layers.data(name=slot,
+                                    shape=[1],
+                                    dtype="int64",
+                                    lod_level=1)
             slots_vars.append(var)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=32,
-            thread_num=3,
-            pipe_command="cat",
-            download_cmd="cat",
-            use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=3,
+                     pipe_command="cat",
+                     download_cmd="cat",
+                     use_var=slots_vars)
         dataset.set_filelist([filename1, filename2])
         dataset.load_into_memory()
         paddle.enable_static()
@@ -207,9 +211,8 @@ def test_set_download_cmd(self):
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(startup_program)
         if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
+            data_loader = fluid.io.DataLoader.from_dataset(
+                dataset, fluid.cpu_places(), self.drop_last)
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(main_program, feed=data)
@@ -242,13 +245,17 @@ def test_in_memory_dataset_run(self):
         slots = ["slot1", "slot2", "slot3", "slot4"]
         slots_vars = []
         for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
+            var = fluid.layers.data(name=slot,
+                                    shape=[1],
+                                    dtype="int64",
+                                    lod_level=1)
             slots_vars.append(var)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=3,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset._init_distributed_settings(fea_eval=True, candidate_size=1)
         dataset.set_filelist([
             "test_in_memory_dataset_run_a.txt",
@@ -262,9 +269,8 @@ def test_in_memory_dataset_run(self):
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())
         if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
+            data_loader = fluid.io.DataLoader.from_dataset(
+                dataset, fluid.cpu_places(), self.drop_last)
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(fluid.default_main_program(), feed=data)
@@ -307,17 +313,23 @@ def test_in_memory_dataset_masterpatch(self):
         startup_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
             for slot in slots[:2]:
-                var = fluid.layers.data(
-                    name=slot, shape=[1], dtype="int64", lod_level=1)
+                var = fluid.layers.data(name=slot,
+                                        shape=[1],
+                                        dtype="int64",
+                                        lod_level=1)
                 slots_vars.append(var)
             for slot in slots[2:]:
-                var = fluid.layers.data(
-                    name=slot, shape=[1], dtype="float32", lod_level=1)
+                var = fluid.layers.data(name=slot,
+                                        shape=[1],
+                                        dtype="float32",
+                                        lod_level=1)
                 slots_vars.append(var)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=1,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset._init_distributed_settings(parse_ins_id=True)
         dataset.set_filelist([
             "test_in_memory_dataset_masterpatch_a.txt",
@@ -370,19 +382,29 @@ def test_in_memory_dataset_masterpatch1(self):
         train_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
-            var1 = fluid.layers.data(
-                name="slot1", shape=[1], dtype="int64", lod_level=0)
-            var2 = fluid.layers.data(
-                name="slot2", shape=[1], dtype="int64", lod_level=0)
-            var3 = fluid.layers.data(
-                name="slot3", shape=[1], dtype="float32", lod_level=0)
-            var4 = fluid.layers.data(
-                name="slot4", shape=[1], dtype="float32", lod_level=0)
+            var1 = fluid.layers.data(name="slot1",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=0)
+            var2 = fluid.layers.data(name="slot2",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=0)
+            var3 = fluid.layers.data(name="slot3",
+                                     shape=[1],
+                                     dtype="float32",
+                                     lod_level=0)
+            var4 = fluid.layers.data(name="slot4",
+                                     shape=[1],
+                                     dtype="float32",
+                                     lod_level=0)
             slots_vars = [var1, var2, var3, var4]
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=1,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset._init_distributed_settings(parse_ins_id=True)
         dataset.set_filelist([
             "test_in_memory_dataset_masterpatch1_a.txt",
@@ -429,13 +451,17 @@ def test_in_memory_dataset_run_2(self):
         slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
         slots_vars = []
         for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="float32", lod_level=1)
+            var = fluid.layers.data(name=slot,
+                                    shape=[1],
+                                    dtype="float32",
+                                    lod_level=1)
             slots_vars.append(var)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=3,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset.set_filelist([
             "test_in_memory_dataset_run_a.txt",
             "test_in_memory_dataset_run_b.txt"
@@ -443,32 +469,36 @@ def test_in_memory_dataset_run_2(self):
         dataset.load_into_memory()
         dataset.local_shuffle()
 
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         exe.run(fluid.default_startup_program())
 
         for i in range(2):
             try:
                 exe.train_from_dataset(fluid.default_main_program(), dataset)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=1)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=2)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=2)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=3)
-                exe.train_from_dataset(
-                    fluid.default_main_program(), dataset, thread=4)
+                exe.train_from_dataset(fluid.default_main_program(),
+                                       dataset,
+                                       thread=1)
+                exe.train_from_dataset(fluid.default_main_program(),
+                                       dataset,
+                                       thread=2)
+                exe.train_from_dataset(fluid.default_main_program(),
+                                       dataset,
+                                       thread=2)
+                exe.train_from_dataset(fluid.default_main_program(),
+                                       dataset,
+                                       thread=3)
+                exe.train_from_dataset(fluid.default_main_program(),
+                                       dataset,
+                                       thread=4)
             except ImportError as e:
                 pass
             except Exception as e:
                 self.assertTrue(False)
 
         if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
+            data_loader = fluid.io.DataLoader.from_dataset(
+                dataset, fluid.cpu_places(), self.drop_last)
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(fluid.default_main_program(), feed=data)
@@ -492,21 +522,20 @@ def test_in_memory_dataset_run_2(self):
         dataset._set_parse_ins_id(False)
         dataset.load_into_memory()
         dataset.dataset.merge_by_lineid()
-        dataset.update_settings(
-            batch_size=1,
-            thread_num=2,
-            input_type=1,
-            pipe_command="cat",
-            use_var=[],
-            fs_name="",
-            fs_ugi="",
-            download_cmd="cat",
-            merge_size=-1,
-            parse_ins_id=False,
-            parse_content=False,
-            fleet_send_batch_size=2,
-            fleet_send_sleep_seconds=2,
-            fea_eval=True)
+        dataset.update_settings(batch_size=1,
+                                thread_num=2,
+                                input_type=1,
+                                pipe_command="cat",
+                                use_var=[],
+                                fs_name="",
+                                fs_ugi="",
+                                download_cmd="cat",
+                                merge_size=-1,
+                                parse_ins_id=False,
+                                parse_content=False,
+                                fleet_send_batch_size=2,
+                                fleet_send_sleep_seconds=2,
+                                fea_eval=True)
         fleet_ptr = fluid.core.Fleet()
         fleet_ptr.set_client2client_config(1, 1, 1)
         fleet_ptr.get_cache_threshold(0)
@@ -533,22 +562,25 @@ def test_queue_dataset_run(self):
         slots = ["slot1", "slot2", "slot3", "slot4"]
         slots_vars = []
         for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
+            var = fluid.layers.data(name=slot,
+                                    shape=[1],
+                                    dtype="int64",
+                                    lod_level=1)
             slots_vars.append(var)
 
         dataset = paddle.distributed.QueueDataset()
-        dataset.init(
-            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=3,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset.set_filelist(
             ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
 
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())
         if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
+            data_loader = fluid.io.DataLoader.from_dataset(
+                dataset, fluid.cpu_places(), self.drop_last)
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(fluid.default_main_program(), feed=data)
@@ -561,8 +593,10 @@ def test_queue_dataset_run(self):
                     self.assertTrue(False)
 
         dataset2 = paddle.distributed.QueueDataset()
-        dataset2.init(
-            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset2.init(batch_size=32,
+                      thread_num=3,
+                      pipe_command="cat",
+                      use_var=slots_vars)
         dataset.set_filelist([])
         try:
             exe.train_from_dataset(fluid.default_main_program(), dataset2)
@@ -597,23 +631,26 @@ def test_queue_dataset_run_2(self):
         slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
         slots_vars = []
         for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="float32", lod_level=1)
+            var = fluid.layers.data(name=slot,
+                                    shape=[1],
+                                    dtype="float32",
+                                    lod_level=1)
             slots_vars.append(var)
 
         dataset = paddle.distributed.QueueDataset()
-        dataset.init(
-            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset.init(batch_size=32,
+                     thread_num=3,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset.set_filelist(
             ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
 
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         exe.run(fluid.default_startup_program())
         if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
+            data_loader = fluid.io.DataLoader.from_dataset(
+                dataset, fluid.cpu_places(), self.drop_last)
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(fluid.default_main_program(), feed=data)
@@ -652,28 +689,28 @@ def test_queue_dataset_run_3(self):
         slots = ["slot1", "slot2", "slot3", "slot4"]
         slots_vars = []
         for slot in slots:
-            var = fluid.data(
-                name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+            var = fluid.data(name=slot,
+                             shape=[None, 1],
+                             dtype="int64",
+                             lod_level=1)
             slots_vars.append(var)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=1,
-            thread_num=2,
-            input_type=1,
-            pipe_command="cat",
-            use_var=slots_vars)
+        dataset.init(batch_size=1,
+                     thread_num=2,
+                     input_type=1,
+                     pipe_command="cat",
+                     use_var=slots_vars)
         dataset.set_filelist(
             ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
         dataset.load_into_memory()
 
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         exe.run(fluid.default_startup_program())
         if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
+            data_loader = fluid.io.DataLoader.from_dataset(
+                dataset, fluid.cpu_places(), self.drop_last)
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(fluid.default_main_program(), feed=data)
@@ -718,8 +755,10 @@ def net(self):
         slots_vars = []
         poolings = []
         for slot in slots:
-            data = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
+            data = fluid.layers.data(name=slot,
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             var = fluid.layers.cast(x=data, dtype='float32')
             pool = fluid.layers.sequence_pool(input=var, pool_type='AVERAGE')
 
@@ -739,8 +778,10 @@ def get_dataset(self, inputs, files):
             files(list): files of  get_dataset
         """
         dataset = paddle.distributed.QueueDataset()
-        dataset.init(
-            batch_size=32, thread_num=3, pipe_command="cat", use_var=inputs)
+        dataset.init(batch_size=32,
+                     thread_num=3,
+                     pipe_command="cat",
+                     use_var=inputs)
         dataset.set_filelist(files)
         return dataset
 
@@ -822,10 +863,9 @@ def test_fetch_handler(self):
         fh.help()
 
         try:
-            exe.train_from_dataset(
-                program=fluid.default_main_program(),
-                dataset=dataset,
-                fetch_handler=fh)
+            exe.train_from_dataset(program=fluid.default_main_program(),
+                                   dataset=dataset,
+                                   fetch_handler=fh)
         except ImportError as e:
             print("warning: we skip trainer_desc_pb2 import problem in windows")
         except RuntimeError as e:
@@ -895,11 +935,10 @@ def test_dataset_fleet(self):
             exe.run(startup_program)
             dataset = paddle.distributed.InMemoryDataset()
 
-            dataset.init(
-                batch_size=32,
-                thread_num=3,
-                pipe_command="cat",
-                use_var=slots_vars)
+            dataset.init(batch_size=32,
+                         thread_num=3,
+                         pipe_command="cat",
+                         use_var=slots_vars)
             dataset.set_filelist([
                 "test_in_memory_dataset2_run_a.txt",
                 "test_in_memory_dataset2_run_b.txt"
@@ -950,14 +989,17 @@ def test_dataset_fleet2(self):
                 print("warning: no mpi4py")
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
             try:
-                adam = fleet.distributed_optimizer(
-                    adam,
-                    strategy={
-                        "fs_uri": "fs_uri_xxx",
-                        "fs_user": "fs_user_xxx",
-                        "fs_passwd": "fs_passwd_xxx",
-                        "fs_hadoop_bin": "fs_hadoop_bin_xxx"
-                    })
+                adam = fleet.distributed_optimizer(adam,
+                                                   strategy={
+                                                       "fs_uri":
+                                                       "fs_uri_xxx",
+                                                       "fs_user":
+                                                       "fs_user_xxx",
+                                                       "fs_passwd":
+                                                       "fs_passwd_xxx",
+                                                       "fs_hadoop_bin":
+                                                       "fs_hadoop_bin_xxx"
+                                                   })
                 adam.minimize([fake_cost], [scope])
             except AttributeError as e:
                 print("warning: no mpi")
@@ -965,11 +1007,10 @@ def test_dataset_fleet2(self):
                 print("warning: no mpi4py")
             exe.run(startup_program)
             dataset = paddle.distributed.InMemoryDataset()
-            dataset.init(
-                batch_size=32,
-                thread_num=3,
-                pipe_command="cat",
-                use_var=slots_vars)
+            dataset.init(batch_size=32,
+                         thread_num=3,
+                         pipe_command="cat",
+                         use_var=slots_vars)
             dataset.set_filelist([
                 "test_in_memory_dataset2_run2_a.txt",
                 "test_in_memory_dataset2_run2_b.txt"
@@ -1074,14 +1115,17 @@ def test_bosps_dataset_fleet2(self):
                 print("warning: no mpi4py")
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
             try:
-                adam = fleet.distributed_optimizer(
-                    adam,
-                    strategy={
-                        "fs_uri": "fs_uri_xxx",
-                        "fs_user": "fs_user_xxx",
-                        "fs_passwd": "fs_passwd_xxx",
-                        "fs_hadoop_bin": "fs_hadoop_bin_xxx"
-                    })
+                adam = fleet.distributed_optimizer(adam,
+                                                   strategy={
+                                                       "fs_uri":
+                                                       "fs_uri_xxx",
+                                                       "fs_user":
+                                                       "fs_user_xxx",
+                                                       "fs_passwd":
+                                                       "fs_passwd_xxx",
+                                                       "fs_hadoop_bin":
+                                                       "fs_hadoop_bin_xxx"
+                                                   })
                 adam.minimize([fake_cost], [scope])
             except AttributeError as e:
                 print("warning: no mpi")
@@ -1089,11 +1133,10 @@ def test_bosps_dataset_fleet2(self):
                 print("warning: no mpi4py")
             exe.run(startup_program)
             dataset = paddle.distributed.fleet.BoxPSDataset()
-            dataset.init(
-                batch_size=32,
-                thread_num=3,
-                pipe_command="cat",
-                use_var=slots_vars)
+            dataset.init(batch_size=32,
+                         thread_num=3,
+                         pipe_command="cat",
+                         use_var=slots_vars)
             dataset.set_filelist([
                 "test_in_memory_dataset2_run2_a.txt",
                 "test_in_memory_dataset2_run2_b.txt"
@@ -1106,15 +1149,14 @@ def test_bosps_dataset_fleet2(self):
             fleet._opt_info = None
             fleet._fleet_ptr = None
             dataset = paddle.distributed.fleet.BoxPSDataset()
-            dataset.init(
-                rank_offset="",
-                pv_batch_size=1,
-                fs_name="",
-                fs_ugi="",
-                data_feed_type="MultiSlotInMemoryDataFeed",
-                parse_logkey=True,
-                merge_by_sid=True,
-                enable_pv_merge=True)
+            dataset.init(rank_offset="",
+                         pv_batch_size=1,
+                         fs_name="",
+                         fs_ugi="",
+                         data_feed_type="MultiSlotInMemoryDataFeed",
+                         parse_logkey=True,
+                         merge_by_sid=True,
+                         enable_pv_merge=True)
             d = paddle.distributed.fleet.DatasetBase()
             try:
                 dataset._set_feed_type("MultiSlotInMemoryDataFeed")
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py
index 911bee69e8b77..ed4ff10758ea5 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py
@@ -45,10 +45,12 @@
 
 
 class CTRDataset(dg.MultiSlotDataGenerator):
+
     def __init__(self, mode):
         self.test = mode
 
     def generate_sample(self, line):
+
         def reader():
             ins = line.strip().split(';')
             label_pos_num = int(ins[1].split(' ')[0])
@@ -296,64 +298,74 @@ def test_var_consistency_insepection(self):
             f.write(data)
 
         slot_data = []
-        label = fluid.layers.data(
-            name="click",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=0,
-            append_batch_size=False)
+        label = fluid.layers.data(name="click",
+                                  shape=[-1, 1],
+                                  dtype="int64",
+                                  lod_level=0,
+                                  append_batch_size=False)
         slot_data.append(label)
 
         # sprase_query_feat_names
         len_sparse_query = 19
         for feat_name in range(1, len_sparse_query + 1):
             slot_data.append(
-                fluid.layers.data(
-                    name=str(feat_name), shape=[1], dtype='int64', lod_level=1))
+                fluid.layers.data(name=str(feat_name),
+                                  shape=[1],
+                                  dtype='int64',
+                                  lod_level=1))
 
-        # sparse_url_feat_names 
+        # sparse_url_feat_names
         for feat_name in range(len_sparse_query + 1, len_sparse_query + 5):
             slot_data.append(
-                fluid.layers.data(
-                    name=str(feat_name), shape=[1], dtype='int64', lod_level=1))
+                fluid.layers.data(name=str(feat_name),
+                                  shape=[1],
+                                  dtype='int64',
+                                  lod_level=1))
 
         # dense_feat_names
         for feat_name in range(len_sparse_query + 5, len_sparse_query + 16):
             slot_data.append(
-                fluid.layers.data(
-                    name=str(feat_name), shape=[1], dtype='float32'))
+                fluid.layers.data(name=str(feat_name),
+                                  shape=[1],
+                                  dtype='float32'))
 
         # context_feat_namess
         for feat_name in range(len_sparse_query + 16, len_sparse_query + 18):
             slot_data.append(
-                fluid.layers.data(
-                    name=str(feat_name), shape=[1], dtype='float32'))
+                fluid.layers.data(name=str(feat_name),
+                                  shape=[1],
+                                  dtype='float32'))
 
-        # neg sparse_url_feat_names 
+        # neg sparse_url_feat_names
         for feat_name in range(len_sparse_query + 18, len_sparse_query + 22):
             slot_data.append(
-                fluid.layers.data(
-                    name=str(feat_name), shape=[1], dtype='int64', lod_level=1))
+                fluid.layers.data(name=str(feat_name),
+                                  shape=[1],
+                                  dtype='int64',
+                                  lod_level=1))
 
         # neg dense_feat_names
         for feat_name in range(len_sparse_query + 22, len_sparse_query + 33):
             slot_data.append(
-                fluid.layers.data(
-                    name=str(feat_name), shape=[1], dtype='float32'))
+                fluid.layers.data(name=str(feat_name),
+                                  shape=[1],
+                                  dtype='float32'))
 
         # neg context_feat_namess
         for feat_name in range(len_sparse_query + 33, len_sparse_query + 35):
             slot_data.append(
-                fluid.layers.data(
-                    name=str(feat_name), shape=[1], dtype='float32'))
+                fluid.layers.data(name=str(feat_name),
+                                  shape=[1],
+                                  dtype='float32'))
 
         dataset = paddle.distributed.InMemoryDataset()
 
         print("========================================")
         generator_class = CTRDataset(mode=0)
         try:
-            dataset._check_use_var_with_data_generator(
-                slot_data, generator_class, dump_a_path)
+            dataset._check_use_var_with_data_generator(slot_data,
+                                                       generator_class,
+                                                       dump_a_path)
             print("case 1: check passed!")
         except Exception as e:
             print("warning: catch expected error")
@@ -364,8 +376,9 @@ def test_var_consistency_insepection(self):
         print("========================================")
         generator_class = CTRDataset(mode=2)
         try:
-            dataset._check_use_var_with_data_generator(
-                slot_data, generator_class, dump_a_path)
+            dataset._check_use_var_with_data_generator(slot_data,
+                                                       generator_class,
+                                                       dump_a_path)
         except Exception as e:
             print("warning: case 2 catch expected error")
             print(e)
@@ -375,8 +388,9 @@ def test_var_consistency_insepection(self):
         print("========================================")
         generator_class = CTRDataset(mode=3)
         try:
-            dataset._check_use_var_with_data_generator(
-                slot_data, generator_class, dump_a_path)
+            dataset._check_use_var_with_data_generator(slot_data,
+                                                       generator_class,
+                                                       dump_a_path)
         except Exception as e:
             print("warning: case 3 catch expected error")
             print(e)
@@ -386,8 +400,9 @@ def test_var_consistency_insepection(self):
         print("========================================")
         generator_class = CTRDataset(mode=4)
         try:
-            dataset._check_use_var_with_data_generator(
-                slot_data, generator_class, dump_a_path)
+            dataset._check_use_var_with_data_generator(slot_data,
+                                                       generator_class,
+                                                       dump_a_path)
         except Exception as e:
             print("warning: case 4 catch expected error")
             print(e)
@@ -397,8 +412,9 @@ def test_var_consistency_insepection(self):
         print("========================================")
         generator_class = CTRDataset(mode=5)
         try:
-            dataset._check_use_var_with_data_generator(
-                slot_data, generator_class, dump_a_path)
+            dataset._check_use_var_with_data_generator(slot_data,
+                                                       generator_class,
+                                                       dump_a_path)
         except Exception as e:
             print("warning: case 5 catch expected error")
             print(e)
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
index 9195ac277b93a..8d949bf51a7da 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
@@ -52,7 +52,9 @@ def write_reader_data_to_file(filename, reader):
     with open(filename, 'w') as fid:
         for instance_list in reader():
             for i, instance in enumerate(instance_list):
-                instance = np.reshape(instance, [instance.size, ])
+                instance = np.reshape(instance, [
+                    instance.size,
+                ])
                 fid.write(str(instance.size) + ' ')
                 fid.write(' '.join(map(str, instance)))
                 fid.write(' ')
@@ -61,19 +63,21 @@ def write_reader_data_to_file(filename, reader):
 
 
 def fake_reader(batch_size=BATCH_SIZE, batch_num=BATCH_NUM):
+
     def __reader__():
         iteration = BATCH_SIZE * BATCH_NUM
         iteration = int(iteration + BATCH_SIZE / 2)
         for _ in six.moves.range(iteration):
             image = np.random.random(size=IMAGE_SHAPE).astype('float32')
-            label = np.random.random_integers(
-                size=LABEL_SHAPE, low=0, high=9).astype('int64')
+            label = np.random.random_integers(size=LABEL_SHAPE, low=0,
+                                              high=9).astype('int64')
             yield image, label
 
     return __reader__
 
 
 class DatasetLoaderTestBase(unittest.TestCase):
+
     def setUp(self):
         self.dataset_name = "QueueDataset"
         self.drop_last = False
@@ -86,10 +90,12 @@ def build_network(self):
         main_prog = fluid.Program()
         startup_prog = fluid.Program()
         with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=IMAGE_SHAPE, dtype='float32')
-            label = fluid.layers.data(
-                name='label', shape=LABEL_SHAPE, dtype='int64')
+            image = fluid.layers.data(name='image',
+                                      shape=IMAGE_SHAPE,
+                                      dtype='float32')
+            label = fluid.layers.data(name='label',
+                                      shape=LABEL_SHAPE,
+                                      dtype='int64')
 
             simple_fc_net_with_inputs(image, label)
 
@@ -135,8 +141,9 @@ def check_batch_number(self, place, randomize_batch_num=False):
         if self.dataset_name == 'InMemoryDataset':
             dataset.load_into_memory()
 
-        dataloader = fluid.io.DataLoader.from_dataset(
-            dataset=dataset, places=places, drop_last=self.drop_last)
+        dataloader = fluid.io.DataLoader.from_dataset(dataset=dataset,
+                                                      places=places,
+                                                      drop_last=self.drop_last)
         prog = fluid.CompiledProgram(main_prog).with_data_parallel()
         exe = fluid.Executor(place)
 
@@ -159,23 +166,22 @@ def check_batch_number(self, place, randomize_batch_num=False):
                             batch_size = BATCH_SIZE
 
                     self.assertEquals(image.shape()[1:], IMAGE_SHAPE)
-                    self.assertTrue(
-                        image._place()._equals(places[idx]),
-                        msg=get_place_string(image._place()) + ' vs ' +
-                        get_place_string(places[idx]))
+                    self.assertTrue(image._place()._equals(places[idx]),
+                                    msg=get_place_string(image._place()) +
+                                    ' vs ' + get_place_string(places[idx]))
                     if self.drop_last:
                         self.assertEquals(image.shape()[0], BATCH_SIZE)
                     else:
-                        self.assertTrue(image.shape()[0] == BATCH_SIZE or
-                                        image.shape()[0] == BATCH_SIZE / 2)
+                        self.assertTrue(image.shape()[0] == BATCH_SIZE
+                                        or image.shape()[0] == BATCH_SIZE / 2)
 
                     self.assertEquals(label.shape()[1:], LABEL_SHAPE)
                     self.assertTrue(label._place()._equals(places[idx]))
                     if self.drop_last:
                         self.assertEquals(label.shape()[0], BATCH_SIZE)
                     else:
-                        self.assertTrue(label.shape()[0] == BATCH_SIZE or
-                                        label.shape()[0] == BATCH_SIZE / 2)
+                        self.assertTrue(label.shape()[0] == BATCH_SIZE
+                                        or label.shape()[0] == BATCH_SIZE / 2)
 
                     self.assertEquals(image.shape()[0], label.shape()[0])
 
@@ -204,18 +210,21 @@ def test_batch_number_with_different_length_files(self):
 
 
 class QueueDatasetTestWithoutDropLast(DatasetLoaderTestBase):
+
     def setUp(self):
         self.dataset_name = "QueueDataset"
         self.drop_last = True
 
 
 class InMemoryDatasetTestWithoutDropLast(DatasetLoaderTestBase):
+
     def setUp(self):
         self.dataset_name = "InMemoryDataset"
         self.drop_last = False
 
 
 class InMemoryDatasetTestWithDropLast(DatasetLoaderTestBase):
+
     def setUp(self):
         self.dataset_name = "InMemoryDataset"
         self.drop_last = True
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_download.py b/python/paddle/fluid/tests/unittests/test_dataset_download.py
index f1fba215b931f..06f015edf9561 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset_download.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_download.py
@@ -18,6 +18,7 @@
 
 
 class TestDataSetDownload(unittest.TestCase):
+
     def setUp(self):
         flower_path = DATA_HOME + "/flowers/imagelabels.mat"
 
diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py
index f4c9466d63a20..884b58a4acc17 100644
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
@@ -22,36 +22,42 @@
 
 
 class TestDebugger(unittest.TestCase):
+
     def test_debug_str(self):
         p = Program()
         b = p.current_block()
 
         #selected_rows
-        b.create_var(
-            name='selected_rows',
-            dtype="float32",
-            shape=[5, 10],
-            type=core.VarDesc.VarType.SELECTED_ROWS)
+        b.create_var(name='selected_rows',
+                     dtype="float32",
+                     shape=[5, 10],
+                     type=core.VarDesc.VarType.SELECTED_ROWS)
 
         #tensor array
-        b.create_var(
-            name='tensor_array',
-            shape=[5, 10],
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        b.create_var(name='tensor_array',
+                     shape=[5, 10],
+                     type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
 
         #operator
-        mul_x = b.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = b.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = b.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        b.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
+        mul_x = b.create_parameter(dtype="float32",
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="mul.x")
+        mul_y = b.create_var(dtype="float32",
+                             shape=[10, 8],
+                             lod_level=0,
+                             name="mul.y")
+        mul_out = b.create_var(dtype="float32",
+                               shape=[5, 8],
+                               lod_level=0,
+                               name="mul.out")
+        b.append_op(type="mul",
+                    inputs={
+                        "X": mul_x,
+                        "Y": mul_y
+                    },
+                    outputs={"Out": mul_out},
+                    attrs={"x_num_col_dims": 1})
 
         print(debugger.pprint_program_codes(p))
 
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
index 0be329ac959f0..75dc36f9bb938 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -41,14 +41,14 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
 
     with fluid.unique_name.guard():
         with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=[784], dtype='float32')
+            image = fluid.layers.data(name='image',
+                                      shape=[784],
+                                      dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            py_reader = fluid.io.PyReader(
-                feed_list=[image, label],
-                capacity=4,
-                iterable=not use_legacy_py_reader,
-                use_double_buffer=use_double_buffer)
+            py_reader = fluid.io.PyReader(feed_list=[image, label],
+                                          capacity=4,
+                                          iterable=not use_legacy_py_reader,
+                                          use_double_buffer=use_double_buffer)
             hidden = image
             for hidden_size in [10, 20, 30]:
                 hidden = fluid.layers.fc(
@@ -62,8 +62,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
                                             size=CLASS_NUM,
                                             act='softmax')
             loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
+                fluid.layers.cross_entropy(input=predict_label, label=label))
 
             optimizer = fluid.optimizer.Adam()
             optimizer.minimize(loss)
@@ -71,6 +70,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
 
 
 class TestBase(unittest.TestCase):
+
     def run_main(self, use_legacy_py_reader, with_data_parallel, places,
                  use_double_buffer):
         scope = fluid.Scope()
@@ -90,8 +90,8 @@ def run_main(self, use_legacy_py_reader, with_data_parallel, places,
 
             prog = fluid.CompiledProgram(main_prog)
             if with_data_parallel:
-                prog = prog.with_data_parallel(
-                    loss_name=loss.name, places=places)
+                prog = prog.with_data_parallel(loss_name=loss.name,
+                                               places=places)
 
             step = 0
             step_list = []
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
index b2cb3141aad48..e2062238b1161 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
@@ -20,6 +20,7 @@
 
 
 class TestClass(unittest.TestCase):
+
     def setUp(self):
         self.use_double_buffer = True
         self.use_py_reader = True
@@ -48,10 +49,12 @@ def fake_reader():
             main_prog = fluid.Program()
             startup_prog = fluid.Program()
             with fluid.program_guard(main_prog, startup_prog):
-                img = fluid.layers.data(
-                    shape=img_shape, dtype='float32', name='image')
-                label = fluid.layers.data(
-                    shape=label_shape, dtype='int64', name='label')
+                img = fluid.layers.data(shape=img_shape,
+                                        dtype='float32',
+                                        name='image')
+                label = fluid.layers.data(shape=label_shape,
+                                          dtype='int64',
+                                          name='label')
 
                 feeder = fluid.DataFeeder(feed_list=[img, label], place=p)
 
@@ -66,16 +69,15 @@ def fake_reader():
                         capacity=4,
                         iterable=True,
                         use_double_buffer=use_double_buffer)
-                    py_reader.decorate_sample_list_generator(
-                        batch_reader, places=p)
+                    py_reader.decorate_sample_list_generator(batch_reader,
+                                                             places=p)
                 else:
                     py_reader = fluid.io.DataLoader.from_generator(
                         feed_list=[img, label],
                         capacity=4,
                         iterable=True,
                         use_double_buffer=use_double_buffer
-                    ).set_sample_list_generator(
-                        batch_reader, places=p)
+                    ).set_sample_list_generator(batch_reader, places=p)
 
                 for break_beforehand in [True, False]:
                     for epoch_id in six.moves.range(10):
@@ -95,8 +97,8 @@ def fake_reader():
                             self.assertTrue(np.array_equal(L1, L2))
 
                             batch_id += 1
-                            if break_beforehand and batch_id >= int(batch_num /
-                                                                    2):
+                            if break_beforehand and batch_id >= int(
+                                    batch_num / 2):
                                 break
 
                         if break_beforehand:
@@ -106,18 +108,21 @@ def fake_reader():
 
 
 class TestClass2(TestClass):
+
     def setUp(self):
         self.use_double_buffer = False
         self.use_py_reader = True
 
 
 class TestClass3(TestClass):
+
     def setUp(self):
         self.use_double_buffer = True
         self.use_py_reader = False
 
 
 class TestClass4(TestClass):
+
     def setUp(self):
         self.use_double_buffer = False
         self.use_py_reader = False
diff --git a/python/paddle/fluid/tests/unittests/test_default_dtype.py b/python/paddle/fluid/tests/unittests/test_default_dtype.py
index 29ca9a9398597..378f3eb7e8f13 100644
--- a/python/paddle/fluid/tests/unittests/test_default_dtype.py
+++ b/python/paddle/fluid/tests/unittests/test_default_dtype.py
@@ -23,6 +23,7 @@
 
 
 class TestDefaultType(unittest.TestCase):
+
     def check_default(self):
         self.assertEqual("float32", get_default_dtype())
 
@@ -49,6 +50,7 @@ def test_api(self):
 
 
 class TestRaiseError(unittest.TestCase):
+
     def test_error(self):
         self.assertRaises(TypeError, set_default_dtype, "int32")
         self.assertRaises(TypeError, set_default_dtype, np.int32)
diff --git a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
index 01a7b6824885b..be52e033011ec 100644
--- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
+++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
@@ -19,6 +19,7 @@
 
 
 class TestDefaultScopeFuncs(unittest.TestCase):
+
     def test_cur_scope(self):
         self.assertIsNotNone(get_cur_scope())
 
@@ -34,6 +35,7 @@ def test_create_var_get_var(self):
         leave_local_scope()
 
     def test_var_get_int(self):
+
         def __new_scope__():
             i = var("var_i")
             self.assertFalse(i.is_int())
diff --git a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py
index f5f1479d07d2f..90b2feccd3907 100644
--- a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py
@@ -47,11 +47,11 @@ def prepare(self):
         self.filter_shape = filter_shape
 
         self.weight = np.random.uniform(
-            -1, 1, (self.out_channels, self.in_channels // self.groups
-                    ) + filter_shape).astype(self.dtype)
+            -1, 1, (self.out_channels, self.in_channels // self.groups) +
+            filter_shape).astype(self.dtype)
         if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (
-                self.out_channels, )).astype(self.dtype)
+            self.bias = np.random.uniform(-1, 1, (self.out_channels, )).astype(
+                self.dtype)
 
         def out_size(in_size, pad_size, dilation_size, kernel_size,
                      stride_size):
@@ -66,8 +66,8 @@ def out_size(in_size, pad_size, dilation_size, kernel_size,
                      self.kernel_size[1], self.stride[1]))
         out_shape = (out_h, out_w)
 
-        self.input_shape = (self.batch_size, self.in_channels
-                            ) + self.spatial_shape
+        self.input_shape = (self.batch_size,
+                            self.in_channels) + self.spatial_shape
 
         self.offset_shape = (self.batch_size, self.deformable_groups * 2 *
                              filter_shape[0] * filter_shape[1]) + out_shape
@@ -88,8 +88,8 @@ def static_graph_case_dcn(self):
         start = paddle.static.Program()
         paddle.enable_static()
         with paddle.static.program_guard(main, start):
-            x = paddle.static.data(
-                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype)
+            x = paddle.static.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
             offset = paddle.static.data(
                 "offset", (-1, self.deformable_groups * 2 *
                            self.filter_shape[0] * self.filter_shape[1], -1, -1),
@@ -215,11 +215,11 @@ def prepare(self):
         self.filter_shape = filter_shape
 
         self.weight = np.random.uniform(
-            -1, 1, (self.out_channels, self.in_channels // self.groups
-                    ) + filter_shape).astype(self.dtype)
+            -1, 1, (self.out_channels, self.in_channels // self.groups) +
+            filter_shape).astype(self.dtype)
         if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (
-                self.out_channels, )).astype(self.dtype)
+            self.bias = np.random.uniform(-1, 1, (self.out_channels, )).astype(
+                self.dtype)
 
         def out_size(in_size, pad_size, dilation_size, kernel_size,
                      stride_size):
@@ -234,8 +234,8 @@ def out_size(in_size, pad_size, dilation_size, kernel_size,
                      self.kernel_size[1], self.stride[1]))
         out_shape = (out_h, out_w)
 
-        self.input_shape = (self.batch_size, self.in_channels
-                            ) + self.spatial_shape
+        self.input_shape = (self.batch_size,
+                            self.in_channels) + self.spatial_shape
 
         self.offset_shape = (self.batch_size, self.deformable_groups * 2 *
                              filter_shape[0] * filter_shape[1]) + out_shape
@@ -256,8 +256,8 @@ def static_graph_case_dcn(self):
         start = paddle.static.Program()
         paddle.enable_static()
         with paddle.static.program_guard(main, start):
-            x = paddle.static.data(
-                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype)
+            x = paddle.static.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
             offset = paddle.static.data(
                 "offset", (-1, self.deformable_groups * 2 *
                            self.filter_shape[0] * self.filter_shape[1], -1, -1),
@@ -326,7 +326,8 @@ def dygraph_case_dcn(self):
             padding=self.padding,
             dilation=self.dilation,
             deformable_groups=self.deformable_groups,
-            groups=self.groups, )
+            groups=self.groups,
+        )
 
         y_v2 = paddle.vision.ops.deform_conv2d(
             x=x,
@@ -338,7 +339,8 @@ def dygraph_case_dcn(self):
             padding=self.padding,
             dilation=self.dilation,
             deformable_groups=self.deformable_groups,
-            groups=self.groups, )
+            groups=self.groups,
+        )
 
         out_v1 = y_v1.numpy()
         out_v2 = y_v2.numpy()
@@ -350,8 +352,8 @@ def new_api_static_graph_case_dcn(self):
         start = paddle.static.Program()
         paddle.enable_static()
         with paddle.static.program_guard(main, start):
-            x = paddle.static.data(
-                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype)
+            x = paddle.static.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
             offset = paddle.static.data(
                 "offset", (-1, self.deformable_groups * 2 *
                            self.filter_shape[0] * self.filter_shape[1], -1, -1),
@@ -361,8 +363,9 @@ def new_api_static_graph_case_dcn(self):
                          self.filter_shape[1], -1, -1),
                 dtype=self.dtype)
 
-            weight = paddle.static.data(
-                "weight", list(self.weight.shape), dtype=self.dtype)
+            weight = paddle.static.data("weight",
+                                        list(self.weight.shape),
+                                        dtype=self.dtype)
 
             if not self.no_bias:
                 bias = paddle.static.data("bias", [-1], dtype=self.dtype)
@@ -376,7 +379,8 @@ def new_api_static_graph_case_dcn(self):
                 padding=self.padding,
                 dilation=self.dilation,
                 deformable_groups=self.deformable_groups,
-                groups=self.groups, )
+                groups=self.groups,
+            )
 
             y_v2 = paddle.vision.ops.deform_conv2d(
                 x=x,
@@ -388,7 +392,8 @@ def new_api_static_graph_case_dcn(self):
                 padding=self.padding,
                 dilation=self.dilation,
                 deformable_groups=self.deformable_groups,
-                groups=self.groups, )
+                groups=self.groups,
+            )
 
         exe = paddle.static.Executor(self.place)
         exe.run(start)
@@ -430,6 +435,7 @@ def test_identity_with_eager_guard(self):
 
 # testcases for DeformConv2D
 class TestDeformConv2DWithPadding(TestDeformConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -443,6 +449,7 @@ def setUp(self):
 
 
 class TestDeformConv2DWithBias(TestDeformConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -456,6 +463,7 @@ def setUp(self):
 
 
 class TestDeformConv2DWithAsynPadding(TestDeformConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -469,6 +477,7 @@ def setUp(self):
 
 
 class TestDeformConv2DWithDilation(TestDeformConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -482,6 +491,7 @@ def setUp(self):
 
 
 class TestDeformConv2DWithStride(TestDeformConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -495,6 +505,7 @@ def setUp(self):
 
 
 class TestDeformConv2DWithDeformable_Groups(TestDeformConv2D):
+
     def setUp(self):
         self.in_channels = 5
         self.out_channels = 5
@@ -508,6 +519,7 @@ def setUp(self):
 
 
 class TestDeformConv2DWithGroups(TestDeformConv2D):
+
     def setUp(self):
         self.in_channels = 5
         self.out_channels = 5
@@ -522,6 +534,7 @@ def setUp(self):
 
 # testcases for deform_conv2d
 class TestDeformConv2DFunctionalWithPadding(TestDeformConv2DFunctional):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -535,6 +548,7 @@ def setUp(self):
 
 
 class TestDeformConv2DFunctionalWithBias(TestDeformConv2DFunctional):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -548,6 +562,7 @@ def setUp(self):
 
 
 class TestDeformConv2DFunctionalWithAsynPadding(TestDeformConv2DFunctional):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -561,6 +576,7 @@ def setUp(self):
 
 
 class TestDeformConv2DFunctionalWithDilation(TestDeformConv2DFunctional):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -574,6 +590,7 @@ def setUp(self):
 
 
 class TestDeformConv2DFunctionalWithStride(TestDeformConv2DFunctional):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -586,8 +603,9 @@ def setUp(self):
         self.no_bias = False
 
 
-class TestDeformConv2DFunctionalWithDeformable_Groups(
-        TestDeformConv2DFunctional):
+class TestDeformConv2DFunctionalWithDeformable_Groups(TestDeformConv2DFunctional
+                                                      ):
+
     def setUp(self):
         self.in_channels = 5
         self.out_channels = 5
@@ -601,6 +619,7 @@ def setUp(self):
 
 
 class TestDeformConv2DFunctionalWithGroups(TestDeformConv2DFunctional):
+
     def setUp(self):
         self.in_channels = 5
         self.out_channels = 5
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
index 5fc849575b659..d653e7a99e4e3 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py
@@ -96,8 +96,8 @@ def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param):
                                 val = dmc_bilinear(input[n, c], in_h, in_w,
                                                    im_h, im_w)
                             val_out = val * mask_table[kh, kw]
-                            col_buffer[n, c * f_h * f_w + kh * f_w + kw, h *
-                                       in_w + w] = val_out
+                            col_buffer[n, c * f_h * f_w + kh * f_w + kw,
+                                       h * in_w + w] = val_out
 
     out = np.zeros((in_n, group, int(out_c // group), out_h * out_w))
     weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w)
@@ -126,6 +126,7 @@ def deform_conv2d_wrapper(x,
 
 
 class TestModulatedDeformableConvOp(OpTest):
+
     def setUp(self):
         self.python_api = deform_conv2d_wrapper
         self.op_type = "deformable_conv"
@@ -169,11 +170,10 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            {'Input', 'Offset', 'Mask', 'Filter'},
-            'Output',
-            max_relative_error=0.05,
-            check_eager=True)
+        self.check_grad({'Input', 'Offset', 'Mask', 'Filter'},
+                        'Output',
+                        max_relative_error=0.05,
+                        check_eager=True)
 
     def init_test_case(self):
         self.pad = [1, 1]
@@ -207,6 +207,7 @@ def init_type(self):
 
 
 class TestWithStride(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [3, 3]
         self.stride = [2, 2]
@@ -229,6 +230,7 @@ def init_test_case(self):
 
 
 class TestWithDilation(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [1, 1]
@@ -254,6 +256,7 @@ def init_dilation(self):
 
 
 class TestWith3x3(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -276,11 +279,13 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestModulatedDeformableConvOp):
+
     def init_group(self):
         self.groups = 2
 
 
 class TestWithDouble(TestModulatedDeformableConvOp):
+
     def init_type(self):
         self.dtype = np.float64
 
@@ -307,42 +312,61 @@ def init_test_case(self):
 
 
 class TestModulatedDeformableConvInvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_input():
             paddle.enable_static()
             input = [1, 3, 32, 32]
-            offset = fluid.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32')
-            mask = fluid.data(
-                name='mask', shape=[None, 3, 32, 32], dtype='float32')
-            loss = fluid.layers.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=1)
+            offset = fluid.data(name='offset',
+                                shape=[None, 3, 32, 32],
+                                dtype='float32')
+            mask = fluid.data(name='mask',
+                              shape=[None, 3, 32, 32],
+                              dtype='float32')
+            loss = fluid.layers.deformable_conv(input,
+                                                offset,
+                                                mask,
+                                                num_filters=4,
+                                                filter_size=1)
 
         self.assertRaises(TypeError, test_invalid_input)
 
         def test_invalid_offset():
             paddle.enable_static()
-            input = fluid.data(
-                name='input', shape=[None, 3, 32, 32], dtype='int32')
-            offset = fluid.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32')
-            mask = fluid.data(
-                name='mask', shape=[None, 3, 32, 32], dtype='float32')
-            loss = fluid.layers.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=1)
+            input = fluid.data(name='input',
+                               shape=[None, 3, 32, 32],
+                               dtype='int32')
+            offset = fluid.data(name='offset',
+                                shape=[None, 3, 32, 32],
+                                dtype='float32')
+            mask = fluid.data(name='mask',
+                              shape=[None, 3, 32, 32],
+                              dtype='float32')
+            loss = fluid.layers.deformable_conv(input,
+                                                offset,
+                                                mask,
+                                                num_filters=4,
+                                                filter_size=1)
 
         self.assertRaises(TypeError, test_invalid_offset)
 
         def test_invalid_filter():
             paddle.enable_static()
-            input = fluid.data(
-                name='input_filter', shape=[None, 3, 32, 32], dtype='float32')
-            offset = fluid.data(
-                name='offset_filter', shape=[None, 3, 32, 32], dtype='float32')
-            mask = fluid.data(
-                name='mask_filter', shape=[None, 3, 32, 32], dtype='float32')
-            loss = fluid.layers.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=0)
+            input = fluid.data(name='input_filter',
+                               shape=[None, 3, 32, 32],
+                               dtype='float32')
+            offset = fluid.data(name='offset_filter',
+                                shape=[None, 3, 32, 32],
+                                dtype='float32')
+            mask = fluid.data(name='mask_filter',
+                              shape=[None, 3, 32, 32],
+                              dtype='float32')
+            loss = fluid.layers.deformable_conv(input,
+                                                offset,
+                                                mask,
+                                                num_filters=4,
+                                                filter_size=0)
 
         self.assertRaises(ValueError, test_invalid_filter)
 
@@ -352,15 +376,22 @@ def test_error_with_eager_guard(self):
 
 
 class TestDeformConv2DAPI(unittest.TestCase):
+
     def test_api(self):
+
         def test_deform_conv2d_v1():
             paddle.enable_static()
-            input = paddle.static.data(
-                name='input_v1', shape=[None, 3, 32, 32], dtype='float32')
-            offset = paddle.static.data(
-                name='offset_v1', shape=[None, 4, 32, 32], dtype='float32')
-            out = paddle.static.nn.deform_conv2d(
-                input, offset, None, num_filters=4, filter_size=1)
+            input = paddle.static.data(name='input_v1',
+                                       shape=[None, 3, 32, 32],
+                                       dtype='float32')
+            offset = paddle.static.data(name='offset_v1',
+                                        shape=[None, 4, 32, 32],
+                                        dtype='float32')
+            out = paddle.static.nn.deform_conv2d(input,
+                                                 offset,
+                                                 None,
+                                                 num_filters=4,
+                                                 filter_size=1)
 
             assert (out.shape == (-1, 4, 32, 32))
 
@@ -368,14 +399,20 @@ def test_deform_conv2d_v1():
 
         def test_deform_conv2d_v2():
             paddle.enable_static()
-            input = paddle.static.data(
-                name='input_v2', shape=[None, 3, 32, 32], dtype='float32')
-            offset = paddle.static.data(
-                name='offset_v2', shape=[None, 4, 32, 32], dtype='float32')
-            mask = paddle.static.data(
-                name='mask_v2', shape=[None, 2, 32, 32], dtype='float32')
-            out = paddle.static.nn.deform_conv2d(
-                input, offset, mask, num_filters=4, filter_size=1)
+            input = paddle.static.data(name='input_v2',
+                                       shape=[None, 3, 32, 32],
+                                       dtype='float32')
+            offset = paddle.static.data(name='offset_v2',
+                                        shape=[None, 4, 32, 32],
+                                        dtype='float32')
+            mask = paddle.static.data(name='mask_v2',
+                                      shape=[None, 2, 32, 32],
+                                      dtype='float32')
+            out = paddle.static.nn.deform_conv2d(input,
+                                                 offset,
+                                                 mask,
+                                                 num_filters=4,
+                                                 filter_size=1)
 
             assert (out.shape == (-1, 4, 32, 32))
 
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index 304a151c4d3bf..a60881e8dded2 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -92,8 +92,8 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param):
                                                    im_h, im_w)
                             val_out = val
 
-                            col_buffer[n, c * f_h * f_w + kh * f_w + kw, h *
-                                       in_w + w] = val_out
+                            col_buffer[n, c * f_h * f_w + kh * f_w + kw,
+                                       h * in_w + w] = val_out
 
     out = np.zeros((in_n, group, int(out_c // group), out_h * out_w))
     weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w)
@@ -122,6 +122,7 @@ def deform_conv2d_wrapper(x,
 
 
 class TestModulatedDeformableConvOp(OpTest):
+
     def setUp(self):
         self.python_api = deform_conv2d_wrapper
         self.op_type = "deformable_conv_v1"
@@ -162,19 +163,17 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'Offset', 'Filter'],
-            'Output',
-            max_relative_error=0.05,
-            check_eager=True)
+        self.check_grad(['Input', 'Offset', 'Filter'],
+                        'Output',
+                        max_relative_error=0.05,
+                        check_eager=True)
 
     def test_check_grad_no_filter(self):
-        self.check_grad(
-            ['Input', 'Offset'],
-            'Output',
-            max_relative_error=0.1,
-            no_grad_set=set(['Filter']),
-            check_eager=True)
+        self.check_grad(['Input', 'Offset'],
+                        'Output',
+                        max_relative_error=0.1,
+                        no_grad_set=set(['Filter']),
+                        check_eager=True)
 
     def init_test_case(self):
         self.pad = [1, 1]
@@ -203,6 +202,7 @@ def init_type(self):
 
 
 class TestWithStride(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [3, 3]
         self.stride = [2, 2]
@@ -220,6 +220,7 @@ def init_test_case(self):
 
 
 class TestWithDilation(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [1, 1]
@@ -240,6 +241,7 @@ def init_dilation(self):
 
 
 class TestWith1x1(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -257,6 +259,7 @@ def init_test_case(self):
 
 
 class TestWithGroup(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -278,38 +281,42 @@ def init_group(self):
 
 
 class TestWithDouble(TestModulatedDeformableConvOp):
+
     def init_type(self):
         self.dtype = np.float64
 
 
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_input():
             input = [1, 3, 32, 32]
-            offset = fluid.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32')
-            loss = fluid.layers.deformable_conv(
-                input,
-                offset,
-                mask=None,
-                num_filters=4,
-                filter_size=1,
-                modulated=False)
+            offset = fluid.data(name='offset',
+                                shape=[None, 3, 32, 32],
+                                dtype='float32')
+            loss = fluid.layers.deformable_conv(input,
+                                                offset,
+                                                mask=None,
+                                                num_filters=4,
+                                                filter_size=1,
+                                                modulated=False)
 
         self.assertRaises(TypeError, test_invalid_input)
 
         def test_invalid_offset():
-            input = fluid.data(
-                name='input', shape=[None, 3, 32, 32], dtype='int32')
-            offset = fluid.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32')
-            loss = fluid.layers.deformable_conv(
-                input,
-                offset,
-                mask=None,
-                num_filters=4,
-                filter_size=1,
-                modulated=False)
+            input = fluid.data(name='input',
+                               shape=[None, 3, 32, 32],
+                               dtype='int32')
+            offset = fluid.data(name='offset',
+                                shape=[None, 3, 32, 32],
+                                dtype='float32')
+            loss = fluid.layers.deformable_conv(input,
+                                                offset,
+                                                mask=None,
+                                                num_filters=4,
+                                                filter_size=1,
+                                                modulated=False)
 
         self.assertRaises(TypeError, test_invalid_offset)
 
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py b/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py
index 20d72f2d95f3a..f1cd04bd3f58b 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py
@@ -52,6 +52,7 @@ def set_outputs(output, top_count):
 
 
 class TestDeformablePSROIPoolOp(OpTest):
+
     def set_data(self):
         self.start_test1()
         self.start_test2()
@@ -369,130 +370,131 @@ def test_check_grad(self):
 
 
 class TestDeformablePSROIPoolOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            input1 = fluid.data(
-                name="input1", shape=[2, 192, 64, 64], dtype='float32')
-            rois1 = fluid.data(
-                name="rois1", shape=[-1, 4], dtype='float32', lod_level=1)
-            trans1 = fluid.data(
-                name="trans1", shape=[2, 384, 64, 64], dtype='float32')
+            input1 = fluid.data(name="input1",
+                                shape=[2, 192, 64, 64],
+                                dtype='float32')
+            rois1 = fluid.data(name="rois1",
+                               shape=[-1, 4],
+                               dtype='float32',
+                               lod_level=1)
+            trans1 = fluid.data(name="trans1",
+                                shape=[2, 384, 64, 64],
+                                dtype='float32')
 
             # The `input` must be Variable and the data type of `input` Tensor must be one of float32 and float64.
             def test_input_type():
-                fluid.layers.deformable_roi_pooling(
-                    input=[3, 4],
-                    rois=rois1,
-                    trans=trans1,
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=(8, 8),
-                    sample_per_part=4,
-                    position_sensitive=True)
+                fluid.layers.deformable_roi_pooling(input=[3, 4],
+                                                    rois=rois1,
+                                                    trans=trans1,
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=(8, 8),
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_input_type)
 
             def test_input_tensor_dtype():
-                input2 = fluid.data(
-                    name="input2", shape=[2, 192, 64, 64], dtype='int32')
-                fluid.layers.deformable_roi_pooling(
-                    input=input2,
-                    rois=rois1,
-                    trans=trans1,
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=(8, 8),
-                    sample_per_part=4,
-                    position_sensitive=True)
+                input2 = fluid.data(name="input2",
+                                    shape=[2, 192, 64, 64],
+                                    dtype='int32')
+                fluid.layers.deformable_roi_pooling(input=input2,
+                                                    rois=rois1,
+                                                    trans=trans1,
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=(8, 8),
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_input_tensor_dtype)
 
             # The `rois` must be Variable and the data type of `rois` Tensor must be one of float32 and float64.
             def test_rois_type():
-                fluid.layers.deformable_roi_pooling(
-                    input=input1,
-                    rois=2,
-                    trans=trans1,
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=(8, 8),
-                    sample_per_part=4,
-                    position_sensitive=True)
+                fluid.layers.deformable_roi_pooling(input=input1,
+                                                    rois=2,
+                                                    trans=trans1,
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=(8, 8),
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_rois_type)
 
             def test_rois_tensor_dtype():
-                rois2 = fluid.data(
-                    name="rois2", shape=[-1, 4], dtype='int32', lod_level=1)
-                fluid.layers.deformable_roi_pooling(
-                    input=input1,
-                    rois=rois2,
-                    trans=trans1,
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=(8, 8),
-                    sample_per_part=4,
-                    position_sensitive=True)
+                rois2 = fluid.data(name="rois2",
+                                   shape=[-1, 4],
+                                   dtype='int32',
+                                   lod_level=1)
+                fluid.layers.deformable_roi_pooling(input=input1,
+                                                    rois=rois2,
+                                                    trans=trans1,
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=(8, 8),
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_rois_tensor_dtype)
 
             # The `trans` must be Variable and the data type of `trans` Tensor must be one of float32 and float64.
             def test_trans_type():
-                fluid.layers.deformable_roi_pooling(
-                    input=input1,
-                    rois=rois1,
-                    trans=[2],
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=(8, 8),
-                    sample_per_part=4,
-                    position_sensitive=True)
+                fluid.layers.deformable_roi_pooling(input=input1,
+                                                    rois=rois1,
+                                                    trans=[2],
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=(8, 8),
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_trans_type)
 
             def test_trans_tensor_dtype():
-                trans2 = fluid.data(
-                    name="trans2", shape=[2, 384, 64, 64], dtype='int32')
-                fluid.layers.deformable_roi_pooling(
-                    input=input1,
-                    rois=rois1,
-                    trans=trans2,
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=(8, 8),
-                    sample_per_part=4,
-                    position_sensitive=True)
+                trans2 = fluid.data(name="trans2",
+                                    shape=[2, 384, 64, 64],
+                                    dtype='int32')
+                fluid.layers.deformable_roi_pooling(input=input1,
+                                                    rois=rois1,
+                                                    trans=trans2,
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=(8, 8),
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_trans_tensor_dtype)
 
             # The `group_size` must be one of list and tuple.
             # Each element must be int.
             def test_group_size_type():
-                fluid.layers.deformable_roi_pooling(
-                    input=input1,
-                    rois=rois1,
-                    trans=trans1,
-                    group_size=1,
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=(8, 8),
-                    sample_per_part=4,
-                    position_sensitive=True)
+                fluid.layers.deformable_roi_pooling(input=input1,
+                                                    rois=rois1,
+                                                    trans=trans1,
+                                                    group_size=1,
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=(8, 8),
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_group_size_type)
 
             # The `part_size` must be one of list, tuple and None.
             # Each element must be int.
             def test_part_size_type():
-                fluid.layers.deformable_roi_pooling(
-                    input=input1,
-                    rois=rois1,
-                    trans=trans1,
-                    pooled_height=8,
-                    pooled_width=8,
-                    part_size=8,
-                    sample_per_part=4,
-                    position_sensitive=True)
+                fluid.layers.deformable_roi_pooling(input=input1,
+                                                    rois=rois1,
+                                                    trans=trans1,
+                                                    pooled_height=8,
+                                                    pooled_width=8,
+                                                    part_size=8,
+                                                    sample_per_part=4,
+                                                    position_sensitive=True)
 
             self.assertRaises(TypeError, test_part_size_type)
 
diff --git a/python/paddle/fluid/tests/unittests/test_deg2rad.py b/python/paddle/fluid/tests/unittests/test_deg2rad.py
index 31219d5ab97af..c3e77c0ac5d8b 100644
--- a/python/paddle/fluid/tests/unittests/test_deg2rad.py
+++ b/python/paddle/fluid/tests/unittests/test_deg2rad.py
@@ -26,10 +26,11 @@
 
 
 class TestDeg2radAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_dtype = 'float64'
-        self.x_np = np.array(
-            [180.0, -180.0, 360.0, -360.0, 90.0, -90.0]).astype(np.float64)
+        self.x_np = np.array([180.0, -180.0, 360.0, -360.0, 90.0,
+                              -90.0]).astype(np.float64)
         self.x_shape = [6]
         self.out_np = np.deg2rad(self.x_np)
 
@@ -40,8 +41,8 @@ def test_static_graph(self):
             x = fluid.data(name='input', dtype=self.x_dtype, shape=self.x_shape)
             out = paddle.deg2rad(x)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             res = exe.run(fluid.default_main_program(),
                           feed={'input': self.x_np},
diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
index 4b0bc1dcf85fb..c00e75882943f 100644
--- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
@@ -22,6 +22,7 @@
 
 
 class TestDensityPriorBoxOp(OpTest):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
@@ -76,8 +77,8 @@ def init_test_params(self):
         if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
             for density in self.densities:
                 if len(self.fixed_ratios) > 0:
-                    self.num_priors += len(self.fixed_ratios) * (pow(density,
-                                                                     2))
+                    self.num_priors += len(self.fixed_ratios) * (pow(
+                        density, 2))
         self.offset = 0.5
 
     def init_test_input(self):
@@ -135,6 +136,7 @@ def init_test_output(self):
 
 
 class TestDensityPriorBox(TestDensityPriorBoxOp):
+
     def set_density(self):
         self.densities = [3, 4]
         self.fixed_sizes = [1.0, 2.0]
diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
index 654397b6c201f..dc9991c3836f7 100755
--- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
@@ -24,6 +24,7 @@
 import warnings
 import paddle.utils.deprecated as deprecated
 from paddle import _C_ops
+
 LOWEST_WARNING_POSTION = 3
 ERROR_WARNING_POSTION = sys.maxsize
 
@@ -81,7 +82,7 @@ def test_fluid_data(self):
         # expected
         expected = LOWEST_WARNING_POSTION
 
-        # captured        
+        # captured
         captured = get_warning_index(fluid.data)
         paddle.disable_static()
 
@@ -104,7 +105,7 @@ def test_fluid_elementwise_mul(self):
         # expected
         expected = LOWEST_WARNING_POSTION
 
-        # captured   
+        # captured
         captured = get_warning_index(fluid.layers.elementwise_mul)
 
         # testting
@@ -124,7 +125,7 @@ def test_new_multiply(self):
         # expected
         expected = LOWEST_WARNING_POSTION
 
-        # captured        
+        # captured
         captured = get_warning_index(paddle.multiply)
 
         # testting
@@ -145,7 +146,7 @@ def test_ops_elementwise_mul(self):
         # expected
         expected = LOWEST_WARNING_POSTION
 
-        # captured        
+        # captured
         captured = get_warning_index(fluid.layers.elementwise_mul)
 
         # testting
@@ -175,8 +176,8 @@ def test_softmax_with_cross_entropy(self):
         x = linear(data)
 
         with warnings.catch_warnings(record=True) as w:
-            out = paddle.nn.functional.softmax_with_cross_entropy(
-                logits=x, label=label)
+            out = paddle.nn.functional.softmax_with_cross_entropy(logits=x,
+                                                                  label=label)
             assert (
                 'API "paddle.nn.functional.loss.softmax_with_cross_entropy" is '
                 'deprecated since 2.0.0') in str(w[-1].message)
diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py b/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py
index c3a21ba0bcbb6..bd91e14e34d38 100644
--- a/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py
+++ b/python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py
@@ -18,6 +18,7 @@
 
 
 class DeprecatedMemoryOptimizationInterfaceTest(unittest.TestCase):
+
     def setUp(self):
         self.method = fluid.memory_optimize
 
@@ -60,6 +61,7 @@ def test_main(self):
 
 
 class ReleaseMemoryTest(DeprecatedMemoryOptimizationInterfaceTest):
+
     def setUp(self):
         self.method = fluid.release_memory
 
diff --git a/python/paddle/fluid/tests/unittests/test_dequantize_abs_max_op.py b/python/paddle/fluid/tests/unittests/test_dequantize_abs_max_op.py
index 696a60787b754..7750a41701e1c 100644
--- a/python/paddle/fluid/tests/unittests/test_dequantize_abs_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dequantize_abs_max_op.py
@@ -32,6 +32,7 @@ def dequantize_max_abs(x, scale, max_range):
 
 
 class TestDequantizeMaxAbsOp(OpTest):
+
     def set_args(self):
         self.num_bits = 8
         self.max_range = math.pow(2, self.num_bits - 1) - 1
@@ -56,6 +57,7 @@ def test_check_output(self):
 
 
 class TestDequantizeMaxAbsOp5Bits(TestDequantizeMaxAbsOp):
+
     def set_args(self):
         self.num_bits = 5
         self.max_range = math.pow(2, self.num_bits - 1) - 1
@@ -63,6 +65,7 @@ def set_args(self):
 
 
 class TestDequantizeMaxAbsOpInt16(TestDequantizeMaxAbsOp):
+
     def set_args(self):
         self.num_bits = 16
         self.max_range = math.pow(2, self.num_bits - 1) - 1
diff --git a/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py b/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py
index 3ad1f05f92d58..8b7b5df656add 100644
--- a/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py
@@ -33,6 +33,7 @@ def dequantize_log(x, dict_data):
 
 
 class TestDequantizeLogOp(OpTest):
+
     def setUp(self):
         self.op_type = "dequantize_log"
         x = np.random.randint(low=-128, high=127, size=(20, 10)).astype('int8')
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
index b63c4f55dbcb1..c82ba2bc8cb8e 100644
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -39,20 +39,18 @@
 # random seed must set before configuring the network.
 # fluid.default_startup_program().random_seed = SEED
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=data,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=data,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
 
     # TODO(dzhwinter) : refine the initializer and random seed settting
     SIZE = 10
@@ -66,8 +64,8 @@ def cnn_model(data):
         size=SIZE,
         act="softmax",
         param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale)))
+            initializer=fluid.initializer.NormalInitializer(loc=0.0,
+                                                            scale=scale)))
     return predict
 
 
@@ -83,19 +81,21 @@ def get_model(batch_size):
 
     # Evaluator
     batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
+    batch_acc = fluid.layers.accuracy(input=predict,
+                                      label=label,
+                                      total=batch_size_tensor)
 
     inference_program = fluid.default_main_program().clone()
     # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
+    opt = fluid.optimizer.AdamOptimizer(learning_rate=0.001,
+                                        beta1=0.9,
+                                        beta2=0.999)
 
     # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=batch_size)
+    train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                batch_size=batch_size)
+    test_reader = paddle.batch(paddle.dataset.mnist.test(),
+                               batch_size=batch_size)
     opt.minimize(avg_cost)
     return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
 
@@ -156,8 +156,8 @@ def program_equal(a, b):
         elif k == 'blocks':
             for i in range(0, len(a.blocks)):
                 if not block_equal(a.blocks[i], b.blocks[i]):
-                    raise ValueError("In operator_equal not equal:{0}\n".format(
-                        k))
+                    raise ValueError(
+                        "In operator_equal not equal:{0}\n".format(k))
                     return False
             assert (len(a.blocks) == len(b.blocks))
         elif k == '_auto_checkpoint_name':
@@ -169,6 +169,7 @@ def program_equal(a, b):
 
 
 class TestCloneWithStopGradient(unittest.TestCase):
+
     def test_clone_with_stop_gradient(self):
         train_program = fluid.Program()
         startup_program = fluid.Program()
@@ -179,8 +180,7 @@ def test_clone_with_stop_gradient(self):
             hidden2 = fluid.layers.dropout(hidden1, dropout_prob=0.5)
             loss = fluid.layers.cross_entropy(
                 input=fluid.layers.fc(hidden2, size=10, act='softmax'),
-                label=fluid.layers.data(
-                    name='label', shape=[1], dtype='int64'))
+                label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
             avg_loss = fluid.layers.mean(loss)
             test_program = train_program.clone(for_test=False)
 
@@ -191,6 +191,7 @@ def test_clone_with_stop_gradient(self):
 
 
 class TestCloneWithStopGradientInSubBlock(unittest.TestCase):
+
     def test_clone_with_stop_gradient(self):
         train_program = fluid.Program()
         startup_program = fluid.Program()
@@ -215,8 +216,7 @@ def false_fn():
 
             loss = fluid.layers.cross_entropy(
                 input=fluid.layers.fc(hidden2, size=10, act='softmax'),
-                label=fluid.layers.data(
-                    name='label', shape=[1], dtype='int64'))
+                label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
             avg_loss = fluid.layers.mean(loss)
             test_program = train_program.clone(for_test=False)
 
@@ -231,6 +231,7 @@ def false_fn():
 
 
 class TestCloneWithRaise(unittest.TestCase):
+
     def test_clone_with_stop_gradient(self):
         train_program = fluid.Program()
         startup_program = fluid.Program()
@@ -254,8 +255,7 @@ def false_fn():
             hidden2 = fluid.layers.cond(cond, true_fn, false_fn)
             loss = fluid.layers.cross_entropy(
                 input=fluid.layers.fc(hidden2, size=10, act='softmax'),
-                label=fluid.layers.data(
-                    name='label', shape=[1], dtype='int64'))
+                label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
             avg_loss = fluid.layers.mean(loss)
             test_program = train_program.clone(for_test=False)
 
diff --git a/python/paddle/fluid/tests/unittests/test_detach.py b/python/paddle/fluid/tests/unittests/test_detach.py
index 8d19a1d3f65cd..9950aa65c01bc 100644
--- a/python/paddle/fluid/tests/unittests/test_detach.py
+++ b/python/paddle/fluid/tests/unittests/test_detach.py
@@ -25,9 +25,10 @@
 
 
 class Test_Detach(unittest.TestCase):
+
     def generate_Data(self):
-        data = np.array(
-            [[1, 8, 3, 9], [7, 20, 9, 6], [4, 6, 8, 10]]).astype('float32')
+        data = np.array([[1, 8, 3, 9], [7, 20, 9, 6], [4, 6, 8,
+                                                       10]]).astype('float32')
         return data
 
     def no_detach_multi(self):
@@ -37,29 +38,26 @@ def no_detach_multi(self):
                 initializer=fluid.initializer.Constant(5.0))
             linear_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(6.0))
-            linear = Linear(
-                4,
-                10,
-                param_attr=linear_w_param_attrs,
-                bias_attr=linear_b_param_attrs)
+            linear = Linear(4,
+                            10,
+                            param_attr=linear_w_param_attrs,
+                            bias_attr=linear_b_param_attrs)
             linear1_w_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(7.0))
             linear1_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(8.0))
-            linear1 = Linear(
-                10,
-                1,
-                param_attr=linear1_w_param_attrs,
-                bias_attr=linear1_b_param_attrs)
+            linear1 = Linear(10,
+                             1,
+                             param_attr=linear1_w_param_attrs,
+                             bias_attr=linear1_b_param_attrs)
             linear2_w_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(9.0))
             linear2_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(10.0))
-            linear2 = Linear(
-                10,
-                1,
-                param_attr=linear2_w_param_attrs,
-                bias_attr=linear2_b_param_attrs)
+            linear2 = Linear(10,
+                             1,
+                             param_attr=linear2_w_param_attrs,
+                             bias_attr=linear2_b_param_attrs)
             data = to_variable(data)
             x = linear(data)
             x1 = linear1(x)
@@ -76,20 +74,18 @@ def no_detach_single(self):
                 initializer=fluid.initializer.Constant(5.0))
             linear_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(6.0))
-            linear = Linear(
-                4,
-                10,
-                param_attr=linear_w_param_attrs,
-                bias_attr=linear_b_param_attrs)
+            linear = Linear(4,
+                            10,
+                            param_attr=linear_w_param_attrs,
+                            bias_attr=linear_b_param_attrs)
             linear1_w_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(7.0))
             linear1_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(8.0))
-            linear1 = Linear(
-                10,
-                1,
-                param_attr=linear1_w_param_attrs,
-                bias_attr=linear1_b_param_attrs)
+            linear1 = Linear(10,
+                             1,
+                             param_attr=linear1_w_param_attrs,
+                             bias_attr=linear1_b_param_attrs)
             data = to_variable(data)
             x = linear(data)
             x1 = linear1(x)
@@ -105,29 +101,26 @@ def detach_multi(self):
                 initializer=fluid.initializer.Constant(5.0))
             linear_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(6.0))
-            linear = Linear(
-                4,
-                10,
-                param_attr=linear_w_param_attrs,
-                bias_attr=linear_b_param_attrs)
+            linear = Linear(4,
+                            10,
+                            param_attr=linear_w_param_attrs,
+                            bias_attr=linear_b_param_attrs)
             linear1_w_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(7.0))
             linear1_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(8.0))
-            linear1 = Linear(
-                10,
-                1,
-                param_attr=linear1_w_param_attrs,
-                bias_attr=linear1_b_param_attrs)
+            linear1 = Linear(10,
+                             1,
+                             param_attr=linear1_w_param_attrs,
+                             bias_attr=linear1_b_param_attrs)
             linear2_w_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(9.0))
             linear2_b_param_attrs = fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(10.0))
-            linear2 = Linear(
-                10,
-                1,
-                param_attr=linear2_w_param_attrs,
-                bias_attr=linear2_b_param_attrs)
+            linear2 = Linear(10,
+                             1,
+                             param_attr=linear2_w_param_attrs,
+                             bias_attr=linear2_b_param_attrs)
             data = to_variable(data)
             x = linear(data)
             x_detach = x.detach()
@@ -139,10 +132,12 @@ def detach_multi(self):
             return x.gradient()
 
     def test_NoDetachMulti_DetachMulti(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         array_no_detach_multi = self.no_detach_multi()
         array_detach_multi = self.detach_multi()
 
         assert not np.array_equal(array_no_detach_multi, array_detach_multi)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_NoDetachSingle_DetachMulti(self):
         array_no_detach_single = self.no_detach_single()
@@ -151,6 +146,7 @@ def test_NoDetachSingle_DetachMulti(self):
 
 
 class TestInplace(unittest.TestCase):
+
     def test_forward_version(self):
         with paddle.fluid.dygraph.guard():
             var = paddle.to_tensor(np.ones((4, 2, 3)).astype(np.float32))
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index 93ab4a73906a0..c545484bff3fb 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -25,6 +25,7 @@
 
 
 class TestDetectionMAPOp(OpTest):
+
     def set_data(self):
         self.class_num = 4
         self.init_test_case()
@@ -34,8 +35,8 @@ def set_data(self):
         self.mAP = np.array(self.mAP).astype('float32')
 
         if len(self.class_pos_count) > 0:
-            self.class_pos_count = np.array(self.class_pos_count).astype(
-                'int32')
+            self.class_pos_count = np.array(
+                self.class_pos_count).astype('int32')
             self.true_pos = np.array(self.true_pos).astype('float32')
             self.false_pos = np.array(self.false_pos).astype('float32')
             self.has_state = np.array([1]).astype('int32')
@@ -61,8 +62,8 @@ def set_data(self):
             'class_num': self.class_num
         }
 
-        self.out_class_pos_count = np.array(self.out_class_pos_count).astype(
-            'int')
+        self.out_class_pos_count = np.array(
+            self.out_class_pos_count).astype('int')
         self.out_true_pos = np.array(self.out_true_pos).astype('float32')
         self.out_false_pos = np.array(self.out_false_pos).astype('float32')
 
@@ -85,12 +86,13 @@ def init_test_case(self):
 
         # label score xmin ymin xmax ymax difficult
         self.detect_lod = [[3, 4]]
-        self.detect = [
-            [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3],
-            [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4],
-            [2, 0.1, 0.4, 0.3, 0.7, 0.5], [1, 0.2, 0.8, 0.1, 1.0, 0.3],
-            [3, 0.2, 0.8, 0.1, 1.0, 0.3]
-        ]
+        self.detect = [[1, 0.3, 0.1, 0.0, 0.4,
+                        0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3],
+                       [1, 0.9, 0.7, 0.6, 0.8,
+                        0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4],
+                       [2, 0.1, 0.4, 0.3, 0.7,
+                        0.5], [1, 0.2, 0.8, 0.1, 1.0, 0.3],
+                       [3, 0.2, 0.8, 0.1, 1.0, 0.3]]
 
         # label score true_pos false_pos
         self.tf_pos_lod = [[3, 4]]
@@ -247,6 +249,7 @@ def test_check_output(self):
 
 
 class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
+
     def init_test_case(self):
         super(TestDetectionMAPOpSkipDiff, self).init_test_case()
 
@@ -259,6 +262,7 @@ def init_test_case(self):
 
 
 class TestDetectionMAPOpWithoutDiff(TestDetectionMAPOp):
+
     def init_test_case(self):
         super(TestDetectionMAPOpWithoutDiff, self).init_test_case()
 
@@ -268,6 +272,7 @@ def init_test_case(self):
 
 
 class TestDetectionMAPOp11Point(TestDetectionMAPOp):
+
     def init_test_case(self):
         super(TestDetectionMAPOp11Point, self).init_test_case()
 
@@ -275,6 +280,7 @@ def init_test_case(self):
 
 
 class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
+
     def init_test_case(self):
         super(TestDetectionMAPOpMultiBatch, self).init_test_case()
         self.class_pos_count = [0, 2, 1, 0]
@@ -285,6 +291,7 @@ def init_test_case(self):
 
 
 class TestDetectionMAPOp11PointWithClassNoTP(TestDetectionMAPOp):
+
     def init_test_case(self):
         self.overlap_threshold = 0.3
         self.evaluate_difficult = True
diff --git a/python/paddle/fluid/tests/unittests/test_determinant_op.py b/python/paddle/fluid/tests/unittests/test_determinant_op.py
index d447d213f3c81..7a799ad377606 100644
--- a/python/paddle/fluid/tests/unittests/test_determinant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_determinant_op.py
@@ -28,6 +28,7 @@
 
 
 class TestDeterminantOp(OpTest):
+
     def setUp(self):
         self.python_api = paddle.linalg.det
         self.init_data()
@@ -48,6 +49,7 @@ def init_data(self):
 
 
 class TestDeterminantOpCase1(TestDeterminantOp):
+
     def init_data(self):
         np.random.seed(0)
         self.case = np.random.rand(10, 10).astype('float32')
@@ -56,6 +58,7 @@ def init_data(self):
 
 
 class TestDeterminantOpCase2(TestDeterminantOp):
+
     def init_data(self):
         np.random.seed(0)
         # not invertible matrix
@@ -65,6 +68,7 @@ def init_data(self):
 
 
 class TestDeterminantAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [3, 3, 5, 5]
@@ -97,6 +101,7 @@ def test_eager(self):
 
 
 class TestSlogDeterminantOp(OpTest):
+
     def setUp(self):
         self.op_type = "slogdeterminant"
         self.init_data()
@@ -117,6 +122,7 @@ def init_data(self):
 
 
 class TestSlogDeterminantOpCase1(TestSlogDeterminantOp):
+
     def init_data(self):
         np.random.seed(0)
         self.case = np.random.rand(2, 2, 5, 5).astype(np.float32)
@@ -125,6 +131,7 @@ def init_data(self):
 
 
 class TestSlogDeterminantAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [3, 3, 5, 5]
diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py
index fc3734c78743a..eff2bf490dfa3 100644
--- a/python/paddle/fluid/tests/unittests/test_device.py
+++ b/python/paddle/fluid/tests/unittests/test_device.py
@@ -23,6 +23,7 @@
 
 
 class TestStaticDeviceManage(unittest.TestCase):
+
     def _test_device(self, device_name, device_class):
         paddle.set_device(device_name)
 
@@ -55,6 +56,7 @@ def test_npu_device(self):
 
 
 class TestImperativeDeviceManage(unittest.TestCase):
+
     def test_cpu(self):
         with fluid.dygraph.guard():
             paddle.set_device('cpu')
diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py
index e547c786feb11..911c6c4a2d56e 100644
--- a/python/paddle/fluid/tests/unittests/test_device_guard.py
+++ b/python/paddle/fluid/tests/unittests/test_device_guard.py
@@ -43,14 +43,17 @@ def get_vaild_warning_num(warning, w):
 
 
 class TestDeviceGuard(unittest.TestCase):
+
     def test_device_guard(self):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            data1 = paddle.full(
-                shape=[1, 3, 8, 8], fill_value=0.5, dtype='float32')
-            data2 = paddle.full(
-                shape=[1, 3, 5, 5], fill_value=0.5, dtype='float32')
+            data1 = paddle.full(shape=[1, 3, 8, 8],
+                                fill_value=0.5,
+                                dtype='float32')
+            data2 = paddle.full(shape=[1, 3, 5, 5],
+                                fill_value=0.5,
+                                dtype='float32')
             shape = paddle.shape(data2)
             with paddle.static.device_guard("cpu"):
                 shape = paddle.slice(shape, axes=[0], starts=[0], ends=[4])
@@ -71,10 +74,12 @@ def test_device_guard_with_id(self):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            data1 = paddle.full(
-                shape=[1, 3, 8, 8], fill_value=0.5, dtype='float32')
-            data2 = paddle.full(
-                shape=[1, 3, 5, 5], fill_value=0.5, dtype='float32')
+            data1 = paddle.full(shape=[1, 3, 8, 8],
+                                fill_value=0.5,
+                                dtype='float32')
+            data2 = paddle.full(shape=[1, 3, 5, 5],
+                                fill_value=0.5,
+                                dtype='float32')
             shape = paddle.shape(data2)
             with paddle.static.device_guard("cpu"):
                 shape = paddle.slice(shape, axes=[0], starts=[0], ends=[4])
@@ -95,13 +100,16 @@ def test_cpu_only_op(self):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            x = paddle.full(
-                shape=[2, 255, 13, 13], fill_value=0.3, dtype='float32')
-            gt_box = paddle.full(
-                shape=[2, 6, 4], fill_value=0.5, dtype='float32')
+            x = paddle.full(shape=[2, 255, 13, 13],
+                            fill_value=0.3,
+                            dtype='float32')
+            gt_box = paddle.full(shape=[2, 6, 4],
+                                 fill_value=0.5,
+                                 dtype='float32')
             gt_label = paddle.full(shape=[2, 6], fill_value=1.0, dtype='int32')
-            gt_score = paddle.full(
-                shape=[2, 6], fill_value=0.5, dtype='float32')
+            gt_score = paddle.full(shape=[2, 6],
+                                   fill_value=0.5,
+                                   dtype='float32')
             anchors = [
                 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156,
                 198, 373, 326
@@ -109,16 +117,15 @@ def test_cpu_only_op(self):
             anchor_mask = [0, 1, 2]
             with paddle.static.device_guard("gpu"):
                 # yolov3_loss only has cpu kernel, so its cpu kernel will be executed
-                loss = fluid.layers.yolov3_loss(
-                    x=x,
-                    gt_box=gt_box,
-                    gt_label=gt_label,
-                    gt_score=gt_score,
-                    anchors=anchors,
-                    anchor_mask=anchor_mask,
-                    class_num=80,
-                    ignore_thresh=0.7,
-                    downsample_ratio=32)
+                loss = fluid.layers.yolov3_loss(x=x,
+                                                gt_box=gt_box,
+                                                gt_label=gt_label,
+                                                gt_score=gt_score,
+                                                anchors=anchors,
+                                                anchor_mask=anchor_mask,
+                                                class_num=80,
+                                                ignore_thresh=0.7,
+                                                downsample_ratio=32)
 
         execute(main_program, startup_program)
 
@@ -151,6 +158,7 @@ def test_without_kernel_op(self):
         execute(main_program, startup_program)
 
     def test_error(self):
+
         def device_attr():
             with paddle.static.device_guard("cpu1"):
                 out = paddle.full(shape=[1], fill_value=0.2, dtype='float32')
@@ -167,10 +175,12 @@ def test_op_descs_device_attr(self):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            data1 = paddle.static.data(
-                name="data_1", shape=[4, 2], dtype="float32")
-            label = paddle.static.data(
-                name="label", shape=[4, 1], dtype="int64")
+            data1 = paddle.static.data(name="data_1",
+                                       shape=[4, 2],
+                                       dtype="float32")
+            label = paddle.static.data(name="label",
+                                       shape=[4, 1],
+                                       dtype="int64")
             fc1 = paddle.static.nn.fc(x=data1, size=10)
             fc2 = paddle.static.nn.fc(x=fc1, size=10)
             with paddle.static.device_guard("gpu"):
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py b/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
index 39558d95a6e0c..d827c50099553 100644
--- a/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dgc_momentum_op.py
@@ -22,6 +22,7 @@
 
 
 class TestDGCMomentumOp1(unittest.TestCase):
+
     def get_tensor(self, name, value, place=None):
         tensor = self.scope.var(name).get_tensor()
         tensor.set(value, self.place if place is None else place)
@@ -49,14 +50,14 @@ def setup(self, place, step=0.0):
         # get tensor
         self.param_name, self.param_tensor = self.get_tensor('Param', param)
         self.grad_name, self.grad_tensor = self.get_tensor('Grad', grad)
-        self.velocity_name, self.velocity_tensor = self.get_tensor('Velocity',
-                                                                   velocity)
+        self.velocity_name, self.velocity_tensor = self.get_tensor(
+            'Velocity', velocity)
         self.learning_rate_name, self.learning_rate_tensor = self.get_tensor(
             'LearningRate', learning_rate)
         self.current_step_name, self.current_step_tensor = self.get_tensor(
             'current_step', current_step, core.CPUPlace())
-        self.nranks_name, self.nranks_tensor = self.get_tensor('nranks', nranks,
-                                                               core.CPUPlace())
+        self.nranks_name, self.nranks_tensor = self.get_tensor(
+            'nranks', nranks, core.CPUPlace())
 
         self.kwargs = {
             # inputs
@@ -95,10 +96,9 @@ def setup(self, place, step=0.0):
 
     def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
         self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
+            np.allclose(actual_t, expect_t, atol=atol),
+            "Output (" + out_name + ") has diff at " + str(place) +
+            "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t))
 
     def check_momentum_step(self, place):
         self.setup(place=place)
@@ -106,13 +106,11 @@ def check_momentum_step(self, place):
         dgc_momentum_op = Operator(self.op_type, **self.kwargs)
         dgc_momentum_op.run(self.scope, self.place)
 
-        self.check(
-            np.array(self.param_tensor), self.outputs['ParamOut'], self.place,
-            self.param_name)
+        self.check(np.array(self.param_tensor), self.outputs['ParamOut'],
+                   self.place, self.param_name)
 
-        self.check(
-            np.array(self.velocity_tensor), self.outputs['VelocityOut'],
-            self.place, self.velocity_name)
+        self.check(np.array(self.velocity_tensor), self.outputs['VelocityOut'],
+                   self.place, self.velocity_name)
 
     def check_sgd_step(self, place):
         self.setup(place=place, step=15.0)
@@ -120,9 +118,8 @@ def check_sgd_step(self, place):
         dgc_momentum_op = Operator(self.op_type, **self.kwargs)
         dgc_momentum_op.run(self.scope, self.place)
 
-        self.check(
-            np.array(self.param_tensor), self.outputs['SGDOut'], self.place,
-            self.param_name)
+        self.check(np.array(self.param_tensor), self.outputs['SGDOut'],
+                   self.place, self.param_name)
 
     def test_cuda_place(self):
         if not core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_op.py b/python/paddle/fluid/tests/unittests/test_dgc_op.py
index 634fd64bc72c6..0ab710b8cbbb4 100644
--- a/python/paddle/fluid/tests/unittests/test_dgc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dgc_op.py
@@ -25,6 +25,7 @@
 
 
 class TestDGCOp(unittest.TestCase):
+
     def setup(self, place, array_size=g_array_size):
         size = array_size
         np.random.seed(5)  # fix seed
@@ -59,7 +60,7 @@ def setup(self, place, array_size=g_array_size):
         self.k = np.full((1), 0.0).astype("float32")
         self.gather_buff_name = "GatherBuff"
 
-        # scope data 
+        # scope data
         self.u_tensor = self.scope.var(self.u_name).get_tensor()
         self.u_tensor.set(self.u, place)
 
@@ -90,10 +91,9 @@ def setup(self, place, array_size=g_array_size):
 
     def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
         self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
+            np.allclose(actual_t, expect_t, atol=atol),
+            "Output (" + out_name + ") has diff at " + str(place) +
+            "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t))
 
     def test_run_and_check(self):
         self.setup(place=core.CUDAPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
index f3878dfa2bc76..06488c8f59bd7 100644
--- a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py
@@ -23,11 +23,14 @@
 import paddle.fluid.clip as clip
 import paddle.compat as cpt
 from paddle.fluid.backward import append_backward
+
 paddle.enable_static()
 
 
 class TestDGCMomentumOptimizer(unittest.TestCase):
+
     class MockDGCMomentum(optimizer.DGCMomentumOptimizer):
+
         def get_accumulators(self):
             return self._accumulators
 
@@ -50,22 +53,21 @@ def check_dgc_momentum_optimizer(self,
             optimize_attr={'learning_rate': 1.1},
             regularizer=None if regularization is not None else
             regularizer.L2DecayRegularizer(2e-4))
-        mul_y = block.create_var(
-            dtype="float32",
-            shape=[dims[1], dims[2]],
-            lod_level=0,
-            name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32",
-            shape=[dims[0], dims[2]],
-            lod_level=0,
-            name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[dims[1], dims[2]],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[dims[0], dims[2]],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
         learning_rate = 0.01
 
         dgc_momentum_optimizer = self.MockDGCMomentum(
@@ -83,10 +85,13 @@ def check_dgc_momentum_optimizer(self,
             dgc_momentum_optimizer.get_accumulators = dgc_momentum_optimizer._optimizer.get_accumulators
             dgc_momentum_optimizer.get_velocity_str = dgc_momentum_optimizer._optimizer.get_velocity_str
 
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         # params_grads = append_backward(mean_out)
         params_grads = dgc_momentum_optimizer.backward(
             mean_out, startup_program=init_program)
@@ -96,8 +101,8 @@ def check_dgc_momentum_optimizer(self,
 
         accumulator_count = 1 if name == "momentum" else 2
         self.assertEqual(len(params_grads), 1)
-        self.assertEqual(
-            len(dgc_momentum_optimizer.get_accumulators()), accumulator_count)
+        self.assertEqual(len(dgc_momentum_optimizer.get_accumulators()),
+                         accumulator_count)
 
         self.assertEqual(len(opts), 2)
         sgd_op = opts[-1]
@@ -152,8 +157,8 @@ def test_momentum_with_dgc(self):
             regularization=regularizer.L2Decay(1e-4))
 
         # check param.regularizer in dgc
-        self.check_dgc_momentum_optimizer(
-            dims=[16, 1024, 8], name="dgc_momentum")
+        self.check_dgc_momentum_optimizer(dims=[16, 1024, 8],
+                                          name="dgc_momentum")
 
     def test_momentum_with_dgc_recompute(self):
         # 16 * 1024 = 16384, use dgc momentum
diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py
index 29f5a90726d8f..507f98a613cc6 100644
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ b/python/paddle/fluid/tests/unittests/test_diag.py
@@ -24,6 +24,7 @@
 
 
 class TestDiagOp(OpTest):
+
     def setUp(self):
         self.op_type = "diag"
         self.init_config()
@@ -40,11 +41,13 @@ def init_config(self):
 
 
 class TestDiagOpCase1(TestDiagOp):
+
     def init_config(self):
         self.case = np.array([3], dtype='int32')
 
 
 class TestDiagError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_diag_embed.py b/python/paddle/fluid/tests/unittests/test_diag_embed.py
index 9df8fc7d5750c..c7f933d23ea21 100644
--- a/python/paddle/fluid/tests/unittests/test_diag_embed.py
+++ b/python/paddle/fluid/tests/unittests/test_diag_embed.py
@@ -24,6 +24,7 @@
 
 
 class TestDiagEmbedOp(OpTest):
+
     def setUp(self):
         self.op_type = "diag_embed"
         self.init_config()
@@ -40,6 +41,7 @@ def init_config(self):
 
 
 class TestDiagEmbedOpCase1(TestDiagEmbedOp):
+
     def init_config(self):
         self.case = np.random.randn(2, 3).astype('float32')
         self.inputs = {'Input': self.case}
@@ -49,6 +51,7 @@ def init_config(self):
 
 
 class TestDiagEmbedAPICase(unittest.TestCase):
+
     def test_case1(self):
         diag_embed = np.random.randn(2, 3, 4).astype('float32')
         data1 = fluid.data(name='data1', shape=[2, 3, 4], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py
index 4047ccb8782c8..aaae8e65730a0 100644
--- a/python/paddle/fluid/tests/unittests/test_diag_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py
@@ -25,6 +25,7 @@
 
 
 class TestDiagV2Op(OpTest):
+
     def setUp(self):
         self.op_type = "diag_v2"
         self.python_api = paddle.diag
@@ -54,24 +55,28 @@ def init_config(self):
 
 
 class TestDiagV2OpCase1(TestDiagV2Op):
+
     def init_config(self):
         self.offset = 1
         self.out = np.diag(self.x, self.offset)
 
 
 class TestDiagV2OpCase2(TestDiagV2Op):
+
     def init_config(self):
         self.offset = -1
         self.out = np.diag(self.x, self.offset)
 
 
 class TestDiagV2OpCase3(TestDiagV2Op):
+
     def init_config(self):
         self.x = np.random.randint(-10, 10, size=(10, 10)).astype("float64")
         self.out = np.diag(self.x, self.offset)
 
 
 class TestDiagV2OpCase4(TestDiagV2Op):
+
     def init_config(self):
         self.x = np.random.rand(100)
         self.padding_value = 2
@@ -81,6 +86,7 @@ def init_config(self):
 
 
 class TestDiagV2Error(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -101,6 +107,7 @@ def test_diag_v2_type():
 
 
 class TestDiagV2API(unittest.TestCase):
+
     def setUp(self):
         self.input_np = np.random.random(size=(10, 10)).astype(np.float32)
         self.expected0 = np.diag(self.input_np)
@@ -190,11 +197,13 @@ def run_static(self, use_gpu=False):
         x = paddle.static.data(name='input', shape=[10, 10], dtype='float32')
         x2 = paddle.static.data(name='input2', shape=[100], dtype='float64')
         x3 = paddle.static.data(name='input3', shape=[100], dtype='int64')
-        x4 = paddle.static.data(
-            name='input4', shape=[2000, 2000], dtype='float32')
+        x4 = paddle.static.data(name='input4',
+                                shape=[2000, 2000],
+                                dtype='float32')
         x5 = paddle.static.data(name='input5', shape=[2000], dtype='float32')
-        x6 = paddle.static.data(
-            name='input6', shape=[2000, 1500], dtype='float32')
+        x6 = paddle.static.data(name='input6',
+                                shape=[2000, 1500],
+                                dtype='float32')
         result0 = paddle.diag(x)
         result1 = paddle.diag(x, offset=1)
         result2 = paddle.diag(x, offset=-1)
diff --git a/python/paddle/fluid/tests/unittests/test_diagflat.py b/python/paddle/fluid/tests/unittests/test_diagflat.py
index ec74855ba2523..98f8c3d434f69 100644
--- a/python/paddle/fluid/tests/unittests/test_diagflat.py
+++ b/python/paddle/fluid/tests/unittests/test_diagflat.py
@@ -21,6 +21,7 @@
 
 
 class TestDiagFlatError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -36,6 +37,7 @@ def test_diagflat_type():
 
 
 class TestDiagFlatAPI(unittest.TestCase):
+
     def setUp(self):
         self.input_np = np.random.random(size=(10, 10)).astype(np.float64)
         self.expected0 = np.diagflat(self.input_np)
@@ -77,10 +79,11 @@ def run_static(self, use_gpu=False):
         place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()
         exe = paddle.static.Executor(place)
         exe.run(paddle.static.default_startup_program())
-        res0, res3 = exe.run(
-            feed={"input": self.input_np,
-                  'input2': self.input_np2},
-            fetch_list=[result0, result3])
+        res0, res3 = exe.run(feed={
+            "input": self.input_np,
+            'input2': self.input_np2
+        },
+                             fetch_list=[result0, result3])
 
         self.assertTrue(np.allclose(res0, self.expected0))
         self.assertTrue(np.allclose(res3, self.expected3))
diff --git a/python/paddle/fluid/tests/unittests/test_diagonal_op.py b/python/paddle/fluid/tests/unittests/test_diagonal_op.py
index 7db5fcb9625a6..b5600f21b788c 100644
--- a/python/paddle/fluid/tests/unittests/test_diagonal_op.py
+++ b/python/paddle/fluid/tests/unittests/test_diagonal_op.py
@@ -28,6 +28,7 @@
 
 
 class TestDiagonalOp(OpTest):
+
     def setUp(self):
         self.op_type = "diagonal"
         self.python_api = paddle.diagonal
@@ -44,63 +45,62 @@ def init_config(self):
         self.case = np.random.randn(10, 5, 2).astype('float64')
         self.inputs = {'Input': self.case}
         self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1}
-        self.target = np.diagonal(
-            self.inputs['Input'],
-            offset=self.attrs['offset'],
-            axis1=self.attrs['axis1'],
-            axis2=self.attrs['axis2'])
+        self.target = np.diagonal(self.inputs['Input'],
+                                  offset=self.attrs['offset'],
+                                  axis1=self.attrs['axis1'],
+                                  axis2=self.attrs['axis2'])
 
 
 class TestDiagonalOpCase1(TestDiagonalOp):
+
     def init_config(self):
         self.case = np.random.randn(4, 2, 4, 4).astype('float32')
         self.inputs = {'Input': self.case}
         self.attrs = {'offset': -2, 'axis1': 3, 'axis2': 0}
-        self.target = np.diagonal(
-            self.inputs['Input'],
-            offset=self.attrs['offset'],
-            axis1=self.attrs['axis1'],
-            axis2=self.attrs['axis2'])
+        self.target = np.diagonal(self.inputs['Input'],
+                                  offset=self.attrs['offset'],
+                                  axis1=self.attrs['axis1'],
+                                  axis2=self.attrs['axis2'])
 
 
 class TestDiagonalOpCase2(TestDiagonalOp):
+
     def init_config(self):
         self.case = np.random.randn(100, 100).astype('int64')
         self.inputs = {'Input': self.case}
         self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1}
-        self.target = np.diagonal(
-            self.inputs['Input'],
-            offset=self.attrs['offset'],
-            axis1=self.attrs['axis1'],
-            axis2=self.attrs['axis2'])
+        self.target = np.diagonal(self.inputs['Input'],
+                                  offset=self.attrs['offset'],
+                                  axis1=self.attrs['axis1'],
+                                  axis2=self.attrs['axis2'])
         self.grad_x = np.eye(100).astype('int64')
         self.grad_out = np.ones(100).astype('int64')
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input'],
-            'Out',
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+        self.check_grad(['Input'],
+                        'Out',
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=True)
 
 
 class TestDiagonalOpCase3(TestDiagonalOp):
+
     def init_config(self):
         self.case = np.random.randint(0, 2, (4, 2, 4, 4)).astype('bool')
         self.inputs = {'Input': self.case}
         self.attrs = {'offset': -2, 'axis1': 3, 'axis2': 0}
-        self.target = np.diagonal(
-            self.inputs['Input'],
-            offset=self.attrs['offset'],
-            axis1=self.attrs['axis1'],
-            axis2=self.attrs['axis2'])
+        self.target = np.diagonal(self.inputs['Input'],
+                                  offset=self.attrs['offset'],
+                                  axis1=self.attrs['axis1'],
+                                  axis2=self.attrs['axis2'])
 
     def test_check_grad(self):
         pass
 
 
 class TestDiagonalAPI(unittest.TestCase):
+
     def setUp(self):
         self.shape = [10, 3, 4]
         self.x = np.random.random((10, 3, 4)).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py
index b435975452009..dad8bcd70c127 100644
--- a/python/paddle/fluid/tests/unittests/test_diff_op.py
+++ b/python/paddle/fluid/tests/unittests/test_diff_op.py
@@ -23,6 +23,7 @@
 
 
 class TestDiffOp(unittest.TestCase):
+
     def set_args(self):
         self.input = np.array([1, 4, 5, 2]).astype('float32')
         self.n = 1
@@ -32,18 +33,21 @@ def set_args(self):
 
     def get_output(self):
         if self.prepend is not None and self.append is not None:
-            self.output = np.diff(
-                self.input,
-                n=self.n,
-                axis=self.axis,
-                prepend=self.prepend,
-                append=self.append)
+            self.output = np.diff(self.input,
+                                  n=self.n,
+                                  axis=self.axis,
+                                  prepend=self.prepend,
+                                  append=self.append)
         elif self.prepend is not None:
-            self.output = np.diff(
-                self.input, n=self.n, axis=self.axis, prepend=self.prepend)
+            self.output = np.diff(self.input,
+                                  n=self.n,
+                                  axis=self.axis,
+                                  prepend=self.prepend)
         elif self.append is not None:
-            self.output = np.diff(
-                self.input, n=self.n, axis=self.axis, append=self.append)
+            self.output = np.diff(self.input,
+                                  n=self.n,
+                                  axis=self.axis,
+                                  append=self.append)
         else:
             self.output = np.diff(self.input, n=self.n, axis=self.axis)
 
@@ -62,12 +66,11 @@ def func_dygraph(self):
                 self.prepend = paddle.to_tensor(self.prepend, place=place)
             if self.append is not None:
                 self.append = paddle.to_tensor(self.append, place=place)
-            out = paddle.diff(
-                x,
-                n=self.n,
-                axis=self.axis,
-                prepend=self.prepend,
-                append=self.append)
+            out = paddle.diff(x,
+                              n=self.n,
+                              axis=self.axis,
+                              prepend=self.prepend,
+                              append=self.append)
             self.assertTrue((out.numpy() == self.output).all(), True)
 
     def test_dygraph(self):
@@ -84,29 +87,29 @@ def test_static(self):
             places.append(fluid.CUDAPlace(0))
         for place in places:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
-                x = paddle.fluid.data(
-                    name="input",
-                    shape=self.input.shape,
-                    dtype=self.input.dtype)
+                x = paddle.fluid.data(name="input",
+                                      shape=self.input.shape,
+                                      dtype=self.input.dtype)
                 has_pend = False
                 prepend = None
                 append = None
                 if self.prepend is not None:
                     has_pend = True
-                    prepend = paddle.fluid.data(
-                        name="prepend",
-                        shape=self.prepend.shape,
-                        dtype=self.prepend.dtype)
+                    prepend = paddle.fluid.data(name="prepend",
+                                                shape=self.prepend.shape,
+                                                dtype=self.prepend.dtype)
                 if self.append is not None:
                     has_pend = True
-                    append = paddle.fluid.data(
-                        name="append",
-                        shape=self.append.shape,
-                        dtype=self.append.dtype)
+                    append = paddle.fluid.data(name="append",
+                                               shape=self.append.shape,
+                                               dtype=self.append.dtype)
 
                 exe = fluid.Executor(place)
-                out = paddle.diff(
-                    x, n=self.n, axis=self.axis, prepend=prepend, append=append)
+                out = paddle.diff(x,
+                                  n=self.n,
+                                  axis=self.axis,
+                                  prepend=prepend,
+                                  append=append)
                 fetches = exe.run(fluid.default_main_program(),
                                   feed={
                                       "input": self.input,
@@ -123,12 +126,11 @@ def func_grad(self):
                 self.prepend = paddle.to_tensor(self.prepend, place=place)
             if self.append is not None:
                 self.append = paddle.to_tensor(self.append, place=place)
-            out = paddle.diff(
-                x,
-                n=self.n,
-                axis=self.axis,
-                prepend=self.prepend,
-                append=self.append)
+            out = paddle.diff(x,
+                              n=self.n,
+                              axis=self.axis,
+                              prepend=self.prepend,
+                              append=self.append)
             try:
                 out.backward()
                 x_grad = x.grad
@@ -144,6 +146,7 @@ def test_grad(self):
 
 
 class TestDiffOpAxis(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32')
         self.n = 1
@@ -153,6 +156,7 @@ def set_args(self):
 
 
 class TestDiffOpNDim(TestDiffOp):
+
     def set_args(self):
         self.input = np.random.rand(10, 10).astype('float32')
         self.n = 1
@@ -162,6 +166,7 @@ def set_args(self):
 
 
 class TestDiffOpBool(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([0, 1, 1, 0, 1, 0]).astype('bool')
         self.n = 1
@@ -171,6 +176,7 @@ def set_args(self):
 
 
 class TestDiffOpPrepend(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32')
         self.n = 1
@@ -180,16 +186,18 @@ def set_args(self):
 
 
 class TestDiffOpPrependAxis(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32')
         self.n = 1
         self.axis = 0
-        self.prepend = np.array(
-            [[0, 2, 3, 4], [1, 3, 5, 7], [2, 5, 8, 0]]).astype('float32')
+        self.prepend = np.array([[0, 2, 3, 4], [1, 3, 5, 7],
+                                 [2, 5, 8, 0]]).astype('float32')
         self.append = None
 
 
 class TestDiffOpAppend(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32')
         self.n = 1
@@ -199,6 +207,7 @@ def set_args(self):
 
 
 class TestDiffOpAppendAxis(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32')
         self.n = 1
@@ -208,6 +217,7 @@ def set_args(self):
 
 
 class TestDiffOpPreAppend(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32')
         self.n = 1
@@ -217,6 +227,7 @@ def set_args(self):
 
 
 class TestDiffOpPreAppendAxis(TestDiffOp):
+
     def set_args(self):
         self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32')
         self.n = 1
diff --git a/python/paddle/fluid/tests/unittests/test_digamma_op.py b/python/paddle/fluid/tests/unittests/test_digamma_op.py
index 4897becf61144..27ba710a96d69 100644
--- a/python/paddle/fluid/tests/unittests/test_digamma_op.py
+++ b/python/paddle/fluid/tests/unittests/test_digamma_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class TestDigammaOp(OpTest):
+
     def setUp(self):
         # switch to static
         paddle.enable_static()
@@ -49,6 +50,7 @@ def test_check_grad_normal(self):
 
 
 class TestDigammaOpFp32(TestDigammaOp):
+
     def init_dtype_type(self):
         self.dtype = np.float32
 
@@ -57,6 +59,7 @@ def test_check_grad_normal(self):
 
 
 class TestDigammaAPI(unittest.TestCase):
+
     def setUp(self):
         # switch to static
         paddle.enable_static()
@@ -68,6 +71,7 @@ def setUp(self):
         self._shape = [8, 3, 32, 32]
 
     def test_in_static_mode(self):
+
         def init_input_output(dtype):
             input = np.random.random(self._shape).astype(dtype)
             return {'x': input}, psi(input)
@@ -82,8 +86,7 @@ def init_input_output(dtype):
                     exe = static.Executor(place)
                     out_value = exe.run(feed=input_dict, fetch_list=[out.name])
                     self.assertEqual(
-                        np.allclose(
-                            out_value[0], sc_res, rtol=1e-5), True)
+                        np.allclose(out_value[0], sc_res, rtol=1e-5), True)
 
     def test_in_dynamic_mode(self):
         for dtype in self.dtypes:
diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py
index 2ec16a9dcab6e..727fcb28cc211 100644
--- a/python/paddle/fluid/tests/unittests/test_directory_migration.py
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -24,6 +24,7 @@
 
 
 class TestDirectory(unittest.TestCase):
+
     def get_import_command(self, module):
         paths = module.split('.')
         if len(paths) == 1:
@@ -86,10 +87,9 @@ def test_new_directory(self):
         _python = sys.executable
 
         ps_cmd = "{} {}".format(_python, import_file)
-        ps_proc = subprocess.Popen(
-            ps_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        ps_proc = subprocess.Popen(ps_cmd.strip().split(" "),
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
         stdout, stderr = ps_proc.communicate()
 
         self.assertFalse("Error" in str(stderr),
@@ -169,10 +169,9 @@ def test_old_directory(self):
         _python = sys.executable
 
         ps_cmd = "{} {}".format(_python, import_file)
-        ps_proc = subprocess.Popen(
-            ps_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        ps_proc = subprocess.Popen(ps_cmd.strip().split(" "),
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
         stdout, stderr = ps_proc.communicate()
 
         self.assertFalse("Error" in str(stdout), bytes.decode(stdout))
diff --git a/python/paddle/fluid/tests/unittests/test_disable_signal_handler.py b/python/paddle/fluid/tests/unittests/test_disable_signal_handler.py
index dbe9dcb7f823d..655c2fbfb79be 100644
--- a/python/paddle/fluid/tests/unittests/test_disable_signal_handler.py
+++ b/python/paddle/fluid/tests/unittests/test_disable_signal_handler.py
@@ -27,15 +27,15 @@
 
 
 class TestSignOpError(unittest.TestCase):
+
     def test_errors(self):
         try:
             for sig in SignalsToTest:
-                output = subprocess.check_output(
-                    [
-                        "python", "-c",
-                        f"import paddle; import signal,os; paddle.disable_signal_handler(); os.kill(os.getpid(), {sig})"
-                    ],
-                    stderr=subprocess.STDOUT)
+                output = subprocess.check_output([
+                    "python", "-c",
+                    f"import paddle; import signal,os; paddle.disable_signal_handler(); os.kill(os.getpid(), {sig})"
+                ],
+                                                 stderr=subprocess.STDOUT)
         except Exception as e:
             # If paddle signal handler is enabled
             # One would expect "paddle::framework::SignalHandle" in STDERR
diff --git a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
index 2adf6e4193181..3c3a5b047f3b5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
@@ -21,6 +21,7 @@
 
 
 class TestDistMnistNCCL2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 4f21b3220a9d3..3e03634987ad4 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -59,6 +59,7 @@ def eprint(*args, **kwargs):
 
 
 class TestDistRunnerBase(object):
+
     def get_model(self,
                   batch_size=DEFAULT_BATCH_SIZE,
                   lr=0.1,
@@ -88,13 +89,12 @@ def get_transpiler(trainer_id,
             config.nccl_comm_num = nccl_comm_num
         # config.runtime_split_send_recv = True
         t = fluid.DistributeTranspiler(config=config)
-        t.transpile(
-            trainer_id=trainer_id,
-            program=main_program,
-            pservers=pserver_endpoints,
-            trainers=trainers,
-            sync_mode=sync_mode,
-            current_endpoint=current_endpoint)
+        t.transpile(trainer_id=trainer_id,
+                    program=main_program,
+                    pservers=pserver_endpoints,
+                    trainers=trainers,
+                    sync_mode=sync_mode,
+                    current_endpoint=current_endpoint)
         return t
 
     @staticmethod
@@ -111,14 +111,13 @@ def run_pserver(self, args):
         self.get_model(batch_size=args.batch_size)
         # NOTE: pserver should not call memory optimize
 
-        t = self.get_transpiler(
-            trainer_id=args.trainer_id,
-            main_program=fluid.default_main_program(),
-            pserver_endpoints=args.endpoints,
-            trainers=args.trainers,
-            sync_mode=args.sync_mode,
-            dc_asgd=args.dc_asgd,
-            hogwild_mode=args.hogwild)
+        t = self.get_transpiler(trainer_id=args.trainer_id,
+                                main_program=fluid.default_main_program(),
+                                pserver_endpoints=args.endpoints,
+                                trainers=args.trainers,
+                                sync_mode=args.sync_mode,
+                                dc_asgd=args.dc_asgd,
+                                hogwild_mode=args.hogwild)
         pserver_prog = t.get_pserver_program(args.current_endpoint)
         startup_prog = t.get_startup_program(args.current_endpoint,
                                              pserver_prog)
@@ -195,8 +194,8 @@ def run_use_fleet_api_20_trainer(self, args):
         eprint(type(self).__name__, "run worker startup program done.")
 
         feed_var_list = [
-            var
-            for var in fluid.default_main_program().global_block().vars.values()
+            var for var in
+            fluid.default_main_program().global_block().vars.values()
             if var.is_data
         ]
 
@@ -366,14 +365,13 @@ def run_trainer(self, args):
             print_to_err(
                 type(self).__name__,
                 "begin to run transpile on trainer with pserver mode")
-            t = self.get_transpiler(
-                trainer_id=args.trainer_id,
-                main_program=fluid.default_main_program(),
-                pserver_endpoints=args.endpoints,
-                trainers=args.trainers,
-                sync_mode=args.sync_mode,
-                dc_asgd=args.dc_asgd,
-                hogwild_mode=args.hogwild)
+            t = self.get_transpiler(trainer_id=args.trainer_id,
+                                    main_program=fluid.default_main_program(),
+                                    pserver_endpoints=args.endpoints,
+                                    trainers=args.trainers,
+                                    sync_mode=args.sync_mode,
+                                    dc_asgd=args.dc_asgd,
+                                    hogwild_mode=args.hogwild)
 
             trainer_prog = t.get_trainer_program()
             print_to_err(
@@ -391,12 +389,11 @@ def run_trainer(self, args):
                 type(self).__name__,
                 "begin to run transpile on trainer with nccl2 mode")
             nccl2_t = fluid.DistributeTranspiler(config=config)
-            nccl2_t.transpile(
-                args.trainer_id,
-                program=fluid.default_main_program(),
-                startup_program=fluid.default_startup_program(),
-                trainers=args.endpoints,
-                current_endpoint=args.current_endpoint)
+            nccl2_t.transpile(args.trainer_id,
+                              program=fluid.default_main_program(),
+                              startup_program=fluid.default_startup_program(),
+                              trainers=args.endpoints,
+                              current_endpoint=args.current_endpoint)
             print_to_err(
                 type(self).__name__,
                 "get trainer program done. with nccl2 mode")
@@ -502,6 +499,7 @@ def get_data():
 
 
 class TestParallelDyGraphRunnerBase(object):
+
     def get_model(self):
         raise NotImplementedError(
             "get_model should be implemented by child classes.")
@@ -517,9 +515,9 @@ def _get_data(self, batch, args):
         elif args.update_method != "local":
             new_batch = []
 
-            # NOTE(@xiongkun03) args.diff_batch means batch length is different: 
-            # such as : batch = [2,3,4,5], then the first rank will get [2]  and 
-            # the second rank will get [3,4,5]. 
+            # NOTE(@xiongkun03): args.diff_batch means batch lengths differ per
+            # rank, e.g. for batch = [2,3,4,5] the first rank gets [2] and the
+            # second rank gets [3,4,5].
             # this function is for test sparse_embedding_differ_length
             if hasattr(args, "diff_batch") and args.diff_batch:
                 assert len(
@@ -700,17 +698,18 @@ def run_use_fleet_api_trainer(self, args):
 
 def runtime_main(test_class):
     parser = argparse.ArgumentParser(description='Run dist test.')
-    parser.add_argument(
-        '--role', type=str, required=True, choices=['pserver', 'trainer'])
+    parser.add_argument('--role',
+                        type=str,
+                        required=True,
+                        choices=['pserver', 'trainer'])
     parser.add_argument('--endpoints', type=str, required=False, default="")
-    parser.add_argument(
-        '--update_method',
-        type=str,
-        default="local",
-        choices=[
-            "pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer", "gloo",
-            "hccl"
-        ])
+    parser.add_argument('--update_method',
+                        type=str,
+                        default="local",
+                        choices=[
+                            "pserver", "nccl2", "bkcl", "local",
+                            "nccl2_reduce_layer", "gloo", "hccl"
+                        ])
     parser.add_argument('--trainer_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
@@ -722,10 +721,14 @@ def runtime_main(test_class):
     parser.add_argument('--use_local_sgd', action='store_true')
     parser.add_argument('--diff_batch', action='store_true')
     parser.add_argument('--ut4grad_allreduce', action='store_true')
-    parser.add_argument(
-        '--hallreduce_inter_nranks', type=int, required=False, default=2)
-    parser.add_argument(
-        '--current_endpoint', type=str, required=False, default="")
+    parser.add_argument('--hallreduce_inter_nranks',
+                        type=int,
+                        required=False,
+                        default=2)
+    parser.add_argument('--current_endpoint',
+                        type=str,
+                        required=False,
+                        default="")
     parser.add_argument('--sync_mode', action='store_true')
     parser.add_argument('--use_cuda', action='store_true')
     parser.add_argument('--use_cpu', action='store_true')
@@ -738,23 +741,24 @@ def runtime_main(test_class):
     parser.add_argument('--dc_asgd', action='store_true')
     parser.add_argument('--hogwild', action='store_true')
     parser.add_argument('--save_model', action='store_true')
-    parser.add_argument(
-        '--use_reader_alloc', action='store_true', required=False)
+    parser.add_argument('--use_reader_alloc',
+                        action='store_true',
+                        required=False)
     parser.add_argument('--batch_size', required=False, type=int, default=2)
     parser.add_argument('--lr', required=False, type=float, default=0.001)
-    parser.add_argument(
-        '--batch_merge_repeat', required=False, type=int, default=1)
-    parser.add_argument(
-        '--nccl2_reduce_layer_local_run',
-        required=False,
-        type=bool,
-        default=False)
+    parser.add_argument('--batch_merge_repeat',
+                        required=False,
+                        type=int,
+                        default=1)
+    parser.add_argument('--nccl2_reduce_layer_local_run',
+                        required=False,
+                        type=bool,
+                        default=False)
     parser.add_argument('--sync_batch_norm', action='store_true')
-    parser.add_argument(
-        '--fuse_all_reduce',
-        required=False,
-        type=ast.literal_eval,
-        default=None)
+    parser.add_argument('--fuse_all_reduce',
+                        required=False,
+                        type=ast.literal_eval,
+                        default=None)
 
     args = parser.parse_args()
 
@@ -780,6 +784,7 @@ def runtime_main(test_class):
 
 
 class TestDistBase(unittest.TestCase):
+
     def _setup_config(self):
         raise NotImplementedError("tests should have _setup_config implemented")
 
@@ -868,6 +873,7 @@ def setUp(self):
         self._after_setup_config()
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -913,17 +919,15 @@ def start_pserver(self,
         ps1_pipe = open(log_name + "_ps1_err.log", "wb")
 
         print_to_err(type(self).__name__, "going to start pserver process 0")
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps0_pipe,
-            env=required_envs)
+        ps0_proc = subprocess.Popen(ps0_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=ps0_pipe,
+                                    env=required_envs)
         print_to_err(type(self).__name__, "going to start pserver process 1")
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps1_pipe,
-            env=required_envs)
+        ps1_proc = subprocess.Popen(ps1_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=ps1_pipe,
+                                    env=required_envs)
 
         return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
 
@@ -991,17 +995,15 @@ def _run_local(self,
 
         if check_error_log:
             err_log = open(log_name + "_local.log", "wb")
-            local_proc = subprocess.Popen(
-                cmd.split(" "),
-                stdout=subprocess.PIPE,
-                stderr=err_log,
-                env=env_local)
+            local_proc = subprocess.Popen(cmd.split(" "),
+                                          stdout=subprocess.PIPE,
+                                          stderr=err_log,
+                                          env=env_local)
         else:
-            local_proc = subprocess.Popen(
-                cmd.split(" "),
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                env=env_local)
+            local_proc = subprocess.Popen(cmd.split(" "),
+                                          stdout=subprocess.PIPE,
+                                          stderr=subprocess.PIPE,
+                                          env=env_local)
 
         local_out, local_err = local_proc.communicate()
 
@@ -1030,8 +1032,10 @@ def _run_local_gloo(self,
 
     def _run_cluster(self, model, envs, check_error_log, log_name):
         # Run dist train to compare with local results
-        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(
-            model, check_error_log, envs, log_name=log_name)
+        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
+                                                          check_error_log,
+                                                          envs,
+                                                          log_name=log_name)
 
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
 
@@ -1080,17 +1084,15 @@ def _run_cluster(self, model, envs, check_error_log, log_name):
         tr1_pipe = open(log_name + "_tr1_err.log", "wb")
 
         print_to_err(type(self).__name__, "going to start trainer process 0")
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=env0)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=env0)
         print_to_err(type(self).__name__, "going to start trainer process 1")
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=env1)
+        tr1_proc = subprocess.Popen(tr1_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=env1)
 
         # Wait until trainer process terminate
         while True:
@@ -1285,8 +1287,10 @@ def _run_cluster_gloo(self, model, envs, update_method, check_error_log,
         procs = []
         pipes = []
         for i in range(0, trainer_num):
-            tr_cmd, tr_env = self._get_gloo_trainer_cmd(
-                model, worker_endpoints[i], update_method, i, trainer_num)
+            tr_cmd, tr_env = self._get_gloo_trainer_cmd(model,
+                                                        worker_endpoints[i],
+                                                        update_method, i,
+                                                        trainer_num)
             tr_env.update(envs)
             tr_env["GLOG_vmodule"] = 'gloo_context=4'
             tr_env["GLOG_v"] = '3'
@@ -1298,11 +1302,10 @@ def _run_cluster_gloo(self, model, envs, update_method, check_error_log,
             print_to_err(
                 type(self).__name__,
                 "going to start process {} with nccl2".format(i))
-            tr_proc = subprocess.Popen(
-                tr_cmd.strip().split(" "),
-                stdout=subprocess.PIPE,
-                stderr=tr_pipe,
-                env=tr_env)
+            tr_proc = subprocess.Popen(tr_cmd.strip().split(" "),
+                                       stdout=subprocess.PIPE,
+                                       stderr=tr_pipe,
+                                       env=tr_env)
 
             procs.append(tr_proc)
             pipes.append(tr_pipe)
@@ -1360,11 +1363,10 @@ def _run_cluster_nccl2(self, model, envs, update_method, check_error_log,
             print_to_err(
                 type(self).__name__,
                 "going to start process {} with nccl2".format(i))
-            tr_proc = subprocess.Popen(
-                tr_cmd.strip().split(" "),
-                stdout=subprocess.PIPE,
-                stderr=tr_pipe,
-                env=tr_env)
+            tr_proc = subprocess.Popen(tr_cmd.strip().split(" "),
+                                       stdout=subprocess.PIPE,
+                                       stderr=tr_pipe,
+                                       env=tr_env)
 
             procs.append(tr_proc)
             pipes.append(tr_pipe)
@@ -1406,11 +1408,10 @@ def _run_pipeline(self, model, envs, check_error_log, log_name):
             print_to_err(
                 type(self).__name__,
                 "going to start process {} with nccl2".format(i))
-            tr_proc = subprocess.Popen(
-                tr_cmd.strip().split(" "),
-                stdout=subprocess.PIPE,
-                stderr=tr_pipe,
-                env=tr_env)
+            tr_proc = subprocess.Popen(tr_cmd.strip().split(" "),
+                                       stdout=subprocess.PIPE,
+                                       stderr=tr_pipe,
+                                       env=tr_env)
 
             procs.append(tr_proc)
             pipes.append(tr_pipe)
@@ -1463,26 +1464,23 @@ def check_with_place(self,
         if self._dygraph and (self._gloo_mode or self._nccl2_mode):
             need_envs.update({"FLAGS_enable_eager_mode": "1"})
             with _test_eager_guard():
-                self.check_with_place_func(
-                    model_file=model_file,
-                    delta=delta,
-                    check_error_log=check_error_log,
-                    need_envs=need_envs,
-                    log_name=log_name)
+                self.check_with_place_func(model_file=model_file,
+                                           delta=delta,
+                                           check_error_log=check_error_log,
+                                           need_envs=need_envs,
+                                           log_name=log_name)
             need_envs.update({"FLAGS_enable_eager_mode": "0"})
-            self.check_with_place_func(
-                model_file=model_file,
-                delta=delta,
-                check_error_log=check_error_log,
-                need_envs=need_envs,
-                log_name=log_name)
+            self.check_with_place_func(model_file=model_file,
+                                       delta=delta,
+                                       check_error_log=check_error_log,
+                                       need_envs=need_envs,
+                                       log_name=log_name)
         else:
-            self.check_with_place_func(
-                model_file=model_file,
-                delta=delta,
-                check_error_log=check_error_log,
-                need_envs=need_envs,
-                log_name=log_name)
+            self.check_with_place_func(model_file=model_file,
+                                       delta=delta,
+                                       check_error_log=check_error_log,
+                                       need_envs=need_envs,
+                                       log_name=log_name)
 
     def check_with_place_func(self,
                               model_file,
@@ -1540,11 +1538,15 @@ def check_with_place_func(self,
                 log_name=log_name)
 
         elif self._pipeline_mode:
-            tr0_losses, tr1_losses = self._run_pipeline(
-                model_file, required_envs, check_error_log, log_name=log_name)
+            tr0_losses, tr1_losses = self._run_pipeline(model_file,
+                                                        required_envs,
+                                                        check_error_log,
+                                                        log_name=log_name)
         else:
-            tr0_losses, tr1_losses = self._run_cluster(
-                model_file, required_envs, check_error_log, log_name=log_name)
+            tr0_losses, tr1_losses = self._run_cluster(model_file,
+                                                       required_envs,
+                                                       check_error_log,
+                                                       log_name=log_name)
 
         for step_id in range(RUN_STEP):
             local_loss = local_losses[step_id]
@@ -1570,20 +1572,19 @@ def check_with_place_multi_cards(self,
         required_envs = self._get_required_envs(check_error_log, need_envs)
 
         if self._use_dgc:
-            multi_cards_losses = self._run_local(
-                model_file,
-                required_envs,
-                check_error_log,
-                log_name=log_name + "_dgc_2cards",
-                devices="0,1")
+            multi_cards_losses = self._run_local(model_file,
+                                                 required_envs,
+                                                 check_error_log,
+                                                 log_name=log_name +
+                                                 "_dgc_2cards",
+                                                 devices="0,1")
 
             self._use_dgc = False
-            base_losses = self._run_local(
-                model_file,
-                required_envs,
-                check_error_log,
-                log_name=log_name + "_base_2cards",
-                devices="0,1")
+            base_losses = self._run_local(model_file,
+                                          required_envs,
+                                          check_error_log,
+                                          log_name=log_name + "_base_2cards",
+                                          devices="0,1")
 
             self._use_dgc = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_dygraph_apis.py b/python/paddle/fluid/tests/unittests/test_dist_dygraph_apis.py
index 8e6fb99ae9355..d64c2acae7ebe 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_dygraph_apis.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_dygraph_apis.py
@@ -19,6 +19,7 @@
 
 
 class TestDygraphFleetApi(TestMultipleGpus):
+
     def test_dygraph_fleet_api(self):
         self.run_mnist_2gpu('dygraph_fleet_api.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
index 92dbf9f2c8ce7..38fea7f2413c7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -24,6 +24,7 @@
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_PSERVER_NUMS"] = "2"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
index 35577c2712169..3e683b0d693c0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
@@ -22,6 +22,7 @@
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_PSERVER_NUMS"] = "2"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
@@ -43,15 +44,16 @@ def test_a_sync_optimizer1(self):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
index 36ba8f38c9958..d2ed6ad7ff1de 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
@@ -23,6 +24,7 @@
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_PSERVER_NUMS"] = "2"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
@@ -44,12 +46,11 @@ def test_a_sync_optimizer3(self):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[-1, 1],
+                                           dtype="int64",
+                                           lod_level=1,
+                                           append_batch_size=False)
         x_embedding = paddle.fluid.layers.embedding(
             is_distributed=False,
             input=input_x,
@@ -63,8 +64,8 @@ def test_a_sync_optimizer3(self):
         fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
index 60fd1c525c11b..707f072060a80 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
@@ -22,6 +23,7 @@
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_PSERVER_NUMS"] = "2"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
@@ -47,14 +49,15 @@ def test_a_sync_optimizer2(self):
         input_x = paddle.fluid.layers.data(name="x", shape=[1], dtype='int64')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
-        emb = paddle.fluid.layers.embedding(
-            input=input_x, size=[100, 10], is_sparse=True)
+        emb = paddle.fluid.layers.embedding(input=input_x,
+                                            size=[100, 10],
+                                            is_sparse=True)
 
         fc_1 = paddle.fluid.layers.fc(input=emb, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
         os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
index 6c8ce0a5acc3a..51eb9b81619b7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
@@ -22,6 +23,7 @@
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_PSERVER_NUMS"] = "2"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
@@ -43,15 +45,16 @@ def test_a_sync_optimizer_trainer(self):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -75,15 +78,16 @@ def test_a_sync_optimizer_pserver(self):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
index 4b1f0ee85d944..3d7aa1b3fee0d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -23,6 +23,7 @@
 
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_PSERVER_NUMS"] = "2"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index 72f8a117ea95a..ac1bf48618236 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -38,6 +38,7 @@
 import unittest
 
 import paddle
+
 paddle.enable_static()
 
 __all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main']
@@ -104,9 +105,12 @@ def build_strategy(self, args):
         # TODO(update strategy to support dump params)
         if False:  # debug:
             self.strategy.set_debug_opt({
-                "dump_param": self.dump_param,
-                "dump_fields": self.dump_fields,
-                "dump_fields_path": self.dump_fields_path
+                "dump_param":
+                self.dump_param,
+                "dump_fields":
+                self.dump_fields,
+                "dump_fields_path":
+                self.dump_fields_path
             })
 
         return self.strategy
@@ -226,6 +230,7 @@ def setUp(self):
         self._setup_config()
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -258,17 +263,15 @@ def _start_pserver(self, cmd, required_envs):
         ps0_out = open(ps0_out_log, "wb+")
         ps1_out = open(ps1_out_log, "wb+")
 
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "),
-            stdout=ps0_out,
-            stderr=ps0_err,
-            env=required_envs)
+        ps0_proc = subprocess.Popen(ps0_cmd.strip().split(" "),
+                                    stdout=ps0_out,
+                                    stderr=ps0_err,
+                                    env=required_envs)
 
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "),
-            stdout=ps1_out,
-            stderr=ps1_err,
-            env=required_envs)
+        ps1_proc = subprocess.Popen(ps1_cmd.strip().split(" "),
+                                    stdout=ps1_out,
+                                    stderr=ps1_err,
+                                    env=required_envs)
 
         return ((ps0_proc, ps0_out, ps0_err, ps0_out_log, ps0_err_log),
                 (ps1_proc, ps1_out, ps1_err, ps1_out_log, ps1_err_log))
@@ -293,17 +296,15 @@ def _start_trainer(self, cmd, required_envs):
         tr0_out = open(tr0_out_log, "wb+")
         tr1_out = open(tr1_out_log, "wb+")
 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(" "),
-            stdout=tr0_out,
-            stderr=tr0_err,
-            env=required_envs)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(" "),
+                                    stdout=tr0_out,
+                                    stderr=tr0_err,
+                                    env=required_envs)
 
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.strip().split(" "),
-            stdout=tr1_out,
-            stderr=tr1_err,
-            env=required_envs)
+        tr1_proc = subprocess.Popen(tr1_cmd.strip().split(" "),
+                                    stdout=tr1_out,
+                                    stderr=tr1_err,
+                                    env=required_envs)
 
         return ((tr0_proc, tr0_out, tr0_err, tr0_out_log, tr0_err_log),
                 (tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log))
@@ -397,10 +398,10 @@ def catlog(logx):
                 print("find parameter server port bind failed, skip the error")
                 tr0_ret, tr1_ret = 0, 0
             else:
-                for out, err in [
-                    (ps0_out_log, ps0_err_log), (ps1_out_log, ps1_err_log),
-                    (tr0_out_log, tr0_err_log), (tr1_out_log, tr1_err_log)
-                ]:
+                for out, err in [(ps0_out_log, ps0_err_log),
+                                 (ps1_out_log, ps1_err_log),
+                                 (tr0_out_log, tr0_err_log),
+                                 (tr1_out_log, tr1_err_log)]:
                     catlog(out)
                     catlog(err)
 
@@ -441,17 +442,23 @@ def check_with_place(self,
 
 def runtime_main(test_class):
     parser = argparse.ArgumentParser(description='Run Fleet test.')
-    parser.add_argument(
-        '--role', type=str, required=True, choices=['pserver', 'trainer'])
+    parser.add_argument('--role',
+                        type=str,
+                        required=True,
+                        choices=['pserver', 'trainer'])
     parser.add_argument('--endpoints', type=str, required=False, default="")
-    parser.add_argument(
-        '--trainer_endpoints', type=str, required=False, default="")
+    parser.add_argument('--trainer_endpoints',
+                        type=str,
+                        required=False,
+                        default="")
     parser.add_argument('--gloo_path', type=str, required=False, default="")
     parser.add_argument('--current_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument('--mode', type=str, required=False, default='geo')
-    parser.add_argument(
-        '--geo_sgd_need_push_nums', type=int, required=False, default=2)
+    parser.add_argument('--geo_sgd_need_push_nums',
+                        type=int,
+                        required=False,
+                        default=2)
     parser.add_argument('--reader', type=str, required=False, default='dataset')
     parser.add_argument('--test', type=int, required=False, default=0)
     parser.add_argument('--model_dir', type=str, required=False, default="")
@@ -464,11 +471,10 @@ def runtime_main(test_class):
     if args.test and args.model_dir != "":
         avg_cost = model.net(args, is_train=False)
         dist_infer = DistributedInfer()
-        dist_infer.init_distributed_infer_env(
-            exe=model.get_executor(),
-            loss=model.avg_cost,
-            role_maker=role,
-            dirname=args.model_dir)
+        dist_infer.init_distributed_infer_env(exe=model.get_executor(),
+                                              loss=model.avg_cost,
+                                              role_maker=role,
+                                              dirname=args.model_dir)
 
         if fleet.is_worker():
             with paddle.static.program_guard(
@@ -501,9 +507,8 @@ def runtime_main(test_class):
                     startup_program=test_startup_program):
                 with paddle.utils.unique_name.guard():
                     avg_cost = model.net(args, is_train=False)
-            dist_infer = DistributedInfer(
-                main_program=test_origin_program,
-                startup_program=test_startup_program)
+            dist_infer = DistributedInfer(main_program=test_origin_program,
+                                          startup_program=test_startup_program)
             with paddle.static.program_guard(
                     main_program=dist_infer.get_dist_infer_program()):
                 model.do_distributed_testing(fleet)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
index 09d64a318d6d8..59d6ce70ddc9b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@@ -21,6 +21,7 @@
 
 
 class TestDistMnistAsyncInMemoryDataset2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         #self._reader = "pyreader"
@@ -53,11 +54,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 class TestDistMnistAsync2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -87,11 +90,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 class TestDistCtrHalfAsync2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -124,8 +129,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py
index e73eff2acc967..ecffd1ca76e82 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py
@@ -22,6 +22,7 @@
 
 @unittest.skip(reason="Skip unstable ut, need paddle sync mode fix")
 class TestDistMnistSync2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "sync"
         self._reader = "pyreader"
@@ -52,12 +53,14 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 # @unittest.skip(reason="Skip unstable ut, reader need to be rewrite")
 class TestDistMnistAsyncDataset2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "dataset"
@@ -91,8 +94,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_decay.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_decay.py
index f52cace4cf3bd..0fa3552a09127 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_decay.py
@@ -19,6 +19,7 @@
 import os
 import unittest
 import paddle
+
 paddle.enable_static()
 
 # For Net
@@ -33,14 +34,18 @@
 
 
 class TestNoamDecay(unittest.TestCase):
+
     def net(self):
-        input_data = paddle.static.data(
-            name="sparse_input", shape=[None, 1], dtype="int64")
-        input_label = paddle.static.data(
-            name="label", shape=[None, 1], dtype="int64")
+        input_data = paddle.static.data(name="sparse_input",
+                                        shape=[None, 1],
+                                        dtype="int64")
+        input_label = paddle.static.data(name="label",
+                                         shape=[None, 1],
+                                         dtype="int64")
         label = paddle.cast(input_label, dtype="float32")
-        embedding = paddle.static.nn.embedding(
-            input_data, is_sparse=True, size=[1000, 128])
+        embedding = paddle.static.nn.embedding(input_data,
+                                               is_sparse=True,
+                                               size=[1000, 128])
 
         fc1 = paddle.static.nn.fc(embedding, size=1024, activation="relu")
         fc2 = paddle.static.nn.fc(fc1, size=512, activation="relu")
@@ -57,16 +62,16 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.WORKER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss = self.net()
-        scheduler = paddle.optimizer.lr.NoamDecay(
-            d_model=0.01, warmup_steps=100, verbose=True)
+        scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
+                                                  warmup_steps=100,
+                                                  verbose=True)
         optimizer = fluid.optimizer.Adam(scheduler)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
index 052dec6981e32..164694de8d59b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
@@ -24,10 +25,12 @@
 
 from test_dist_fleet_base import TestFleetBase
 from dist_fleet_simnet_bow import train_network
+
 paddle.enable_static()
 
 
 class TestDistGeoCtr_2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "geo"
         self._reader = "pyreader"
@@ -57,11 +60,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 class TestGeoSgdTranspiler(unittest.TestCase):
+
     def test_pserver(self):
         role = role_maker.UserDefinedRoleMaker(
             current_id=0,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py
index b4bc0d8dadce4..c01314389e89c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_gloo.py
@@ -31,6 +31,7 @@
 
 
 class TestDistGloo_2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "sync"
         self._reader = "pyreader"
@@ -51,20 +52,18 @@ def _start_pserver(self, cmd, required_envs):
         required_envs["POD_IP"] = "127.0.0.1"
         required_envs["PADDLE_PSERVER_ID"] = "0"
         required_envs["PADDLE_PORT"] = "36011"
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps0_pipe,
-            env=required_envs)
+        ps0_proc = subprocess.Popen(ps0_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=ps0_pipe,
+                                    env=required_envs)
         print("PADDLE_PSERVER_ID=0:")
         print(required_envs)
         required_envs["PADDLE_PSERVER_ID"] = "1"
         required_envs["PADDLE_PORT"] = "36012"
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps1_pipe,
-            env=required_envs)
+        ps1_proc = subprocess.Popen(ps1_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=ps1_pipe,
+                                    env=required_envs)
         print("PADDLE_PSERVER_ID=1:")
         print(required_envs)
         return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
@@ -78,19 +77,17 @@ def _start_trainer(self, cmd, required_envs):
         tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
         tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")
         required_envs["PADDLE_TRAINER_ID"] = "0"
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr0_pipe,
-            env=required_envs)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr0_pipe,
+                                    env=required_envs)
         print("PADDLE_TRAINER_ID=0:")
         print(required_envs)
         required_envs["PADDLE_TRAINER_ID"] = "1"
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=tr1_pipe,
-            env=required_envs)
+        tr1_proc = subprocess.Popen(tr1_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=tr1_pipe,
+                                    env=required_envs)
         print("PADDLE_TRAINER_ID=1:")
         print(required_envs)
         return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
@@ -182,8 +179,9 @@ def check_with_place(self,
 
     def test_dist_train(self):
         print("path is not delete", os.path.exists("./tmp4"))
-        self.check_with_place(
-            "dist_fleet_debug_gloo.py", delta=1e-5, check_error_log=True)
+        self.check_with_place("dist_fleet_debug_gloo.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
index 7807646dca3a5..f929bc2ae1edd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
@@ -20,6 +20,7 @@
 
 
 class TestDistGeoClipByGlobalNorm(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "geo"
         self._reader = "dataset"
@@ -43,8 +44,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
     def _setup_config1(self):
         self._sync_mode = False
@@ -52,6 +54,7 @@ def _setup_config1(self):
 
 
 class TestDistASyncClipByValue(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "dataset"
@@ -74,11 +77,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 class TestDistASyncClipByNorm(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "dataset"
@@ -101,11 +106,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 class TestDistASyncClipByGlobalNorm(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "dataset"
@@ -128,8 +135,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
index 6111d40c7d640..560cfb0b36da9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
@@ -222,6 +222,7 @@ def setUp(self):
         self._setup_config()
 
     def _find_free_port(self):
+
         def __free_port():
             with closing(socket.socket(socket.AF_INET,
                                        socket.SOCK_STREAM)) as s:
@@ -240,16 +241,14 @@ def _start_pserver(self, cmd, required_envs):
         ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+")
         ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+")
 
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps0_pipe,
-            env=required_envs)
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
-            stderr=ps1_pipe,
-            env=required_envs)
+        ps0_proc = subprocess.Popen(ps0_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=ps0_pipe,
+                                    env=required_envs)
+        ps1_proc = subprocess.Popen(ps1_cmd.strip().split(" "),
+                                    stdout=subprocess.PIPE,
+                                    stderr=ps1_pipe,
+                                    env=required_envs)
         return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
 
     def _start_trainer(self, cmd, required_envs):
@@ -261,16 +260,14 @@ def _start_trainer(self, cmd, required_envs):
         tr0_out = open(tempfile.gettempdir() + "/tr0_out.log", "wb+")
         tr1_out = open(tempfile.gettempdir() + "/tr1_out.log", "wb+")
 
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.strip().split(" "),
-            stdout=tr0_out,
-            stderr=tr0_pipe,
-            env=required_envs)
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.strip().split(" "),
-            stdout=tr1_out,
-            stderr=tr1_pipe,
-            env=required_envs)
+        tr0_proc = subprocess.Popen(tr0_cmd.strip().split(" "),
+                                    stdout=tr0_out,
+                                    stderr=tr0_pipe,
+                                    env=required_envs)
+        tr1_proc = subprocess.Popen(tr1_cmd.strip().split(" "),
+                                    stdout=tr1_out,
+                                    stderr=tr1_pipe,
+                                    env=required_envs)
 
         return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
 
@@ -287,26 +284,22 @@ def _start_heter_trainer(self, cmd, required_envs):
         heter2_out = open(tempfile.gettempdir() + "/heter2_out.log", "wb+")
         heter3_out = open(tempfile.gettempdir() + "/heter3_out.log", "wb+")
 
-        heter0_proc = subprocess.Popen(
-            heter0_cmd.strip().split(" "),
-            stdout=heter0_out,
-            stderr=heter0_pipe,
-            env=required_envs)
-        heter1_proc = subprocess.Popen(
-            heter1_cmd.strip().split(" "),
-            stdout=heter1_out,
-            stderr=heter1_pipe,
-            env=required_envs)
-        heter2_proc = subprocess.Popen(
-            heter2_cmd.strip().split(" "),
-            stdout=heter2_out,
-            stderr=heter2_pipe,
-            env=required_envs)
-        heter3_proc = subprocess.Popen(
-            heter3_cmd.strip().split(" "),
-            stdout=heter3_out,
-            stderr=heter3_pipe,
-            env=required_envs)
+        heter0_proc = subprocess.Popen(heter0_cmd.strip().split(" "),
+                                       stdout=heter0_out,
+                                       stderr=heter0_pipe,
+                                       env=required_envs)
+        heter1_proc = subprocess.Popen(heter1_cmd.strip().split(" "),
+                                       stdout=heter1_out,
+                                       stderr=heter1_pipe,
+                                       env=required_envs)
+        heter2_proc = subprocess.Popen(heter2_cmd.strip().split(" "),
+                                       stdout=heter2_out,
+                                       stderr=heter2_pipe,
+                                       env=required_envs)
+        heter3_proc = subprocess.Popen(heter3_cmd.strip().split(" "),
+                                       stdout=heter3_out,
+                                       stderr=heter3_pipe,
+                                       env=required_envs)
 
         return heter0_proc, heter1_proc, heter2_proc, heter3_proc, heter0_pipe, heter1_pipe, heter2_pipe, heter3_pipe
 
@@ -414,25 +407,32 @@ def check_with_place(self,
 
 def runtime_main(test_class):
     parser = argparse.ArgumentParser(description='Run Fleet test.')
-    parser.add_argument(
-        '--role',
-        type=str,
-        required=True,
-        choices=['pserver', 'trainer', 'heter_trainer'])
+    parser.add_argument('--role',
+                        type=str,
+                        required=True,
+                        choices=['pserver', 'trainer', 'heter_trainer'])
     parser.add_argument('--endpoints', type=str, required=False, default="")
-    parser.add_argument(
-        '--trainer_endpoints', type=str, required=False, default="")
-    parser.add_argument(
-        '--heter_trainer_endpoints', type=str, required=False, default="")
-    parser.add_argument(
-        '--heter_trainer_device', type=str, required=False, default="gpu")
+    parser.add_argument('--trainer_endpoints',
+                        type=str,
+                        required=False,
+                        default="")
+    parser.add_argument('--heter_trainer_endpoints',
+                        type=str,
+                        required=False,
+                        default="")
+    parser.add_argument('--heter_trainer_device',
+                        type=str,
+                        required=False,
+                        default="gpu")
     parser.add_argument('--gloo_path', type=str, required=False, default="")
     parser.add_argument('--current_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument('--stage_id', type=int, required=False, default=1)
     parser.add_argument('--mode', type=str, required=False, default='async')
-    parser.add_argument(
-        '--geo_sgd_need_push_nums', type=int, required=False, default=2)
+    parser.add_argument('--geo_sgd_need_push_nums',
+                        type=int,
+                        required=False,
+                        default=2)
     parser.add_argument('--reader', type=str, required=False, default='dataset')
     args = parser.parse_args()
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
index 2ed331c628424..eaae0eff55abd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
@@ -24,6 +24,7 @@
 
 
 class TestDistHeterDatasetAsync2x2(TestFleetHeterBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "dataset"
@@ -51,10 +52,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_heter_pipeline_ctr.py",
-            delta=1e-5,
-            check_error_log=True)
+        self.check_with_place("dist_fleet_heter_pipeline_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
index 61f15e7dffff2..db5f5bccdac91 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
@@ -26,6 +26,7 @@
 
 
 class TestDistFleetHeterProgram(unittest.TestCase):
+
     def build_role(self):
         environs = {}
         environs[
@@ -63,13 +64,15 @@ def build_strategy(self):
         return self.strategy
 
     def build_input(self):
-        dense_input = fluid.layers.data(
-            name="dense_input", shape=[10], dtype="float32")
+        dense_input = fluid.layers.data(name="dense_input",
+                                        shape=[10],
+                                        dtype="float32")
 
         sparse_input_ids = [
-            fluid.layers.data(
-                name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
-            for i in range(1, 27)
+            fluid.layers.data(name="C" + str(i),
+                              shape=[1],
+                              lod_level=1,
+                              dtype="int64") for i in range(1, 27)
         ]
 
         label = fluid.layers.data(name="label", shape=[1], dtype="float32")
@@ -78,6 +81,7 @@ def build_input(self):
         return inputs
 
     def build_net(self, inputs):
+
         def embedding_layer(input):
             return fluid.layers.embedding(
                 input=input,
@@ -85,7 +89,8 @@ def embedding_layer(input):
                 size=[100001, 10],
                 param_attr=fluid.ParamAttr(
                     name="SparseFeatFactors",
-                    initializer=fluid.initializer.Uniform()), )
+                    initializer=fluid.initializer.Uniform()),
+            )
 
         sparse_embed_seq = list(map(embedding_layer, inputs[1:-1]))
 
@@ -101,22 +106,22 @@ def embedding_layer(input):
                 name="fc1")
 
         with fluid.device_guard("cpu"):
-            fc2 = fluid.layers.fc(input=fc1,
-                                  size=400,
-                                  act="relu",
-                                  param_attr=fluid.ParamAttr(
-                                      initializer=fluid.initializer.Normal(
-                                          scale=1 / math.sqrt(fc1.shape[1]))),
-                                  name="fc2")
+            fc2 = fluid.layers.fc(
+                input=fc1,
+                size=400,
+                act="relu",
+                param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                    scale=1 / math.sqrt(fc1.shape[1]))),
+                name="fc2")
 
         with fluid.device_guard("gpu"):
-            fc3 = fluid.layers.fc(input=fc2,
-                                  size=400,
-                                  act="relu",
-                                  param_attr=fluid.ParamAttr(
-                                      initializer=fluid.initializer.Normal(
-                                          scale=1 / math.sqrt(fc2.shape[1]))),
-                                  name="fc3")
+            fc3 = fluid.layers.fc(
+                input=fc2,
+                size=400,
+                act="relu",
+                param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                    scale=1 / math.sqrt(fc2.shape[1]))),
+                name="fc3")
 
         with fluid.device_guard("cpu"):
             predict = fluid.layers.fc(
@@ -124,7 +129,8 @@ def embedding_layer(input):
                 size=2,
                 act="softmax",
                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-                    scale=1 / math.sqrt(fc3.shape[1]))), )
+                    scale=1 / math.sqrt(fc3.shape[1]))),
+            )
 
         with fluid.device_guard("gpu"):
             labels = fluid.layers.cast(inputs[-1], dtype="int64")
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py
index 82a3d73da2c71..6febcd9478bfe 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py
@@ -24,6 +24,7 @@
 
 
 class TestDistCtrInfer(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -57,22 +58,25 @@ def check_with_place(self,
     def test_dist_infer(self):
         model_dirname = tempfile.mkdtemp()
 
-        self.check_with_place(
-            "dist_fleet_ctr.py",
-            delta=1e-5,
-            check_error_log=False,
-            need_envs={"SAVE_DIRNAME": model_dirname, })
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False,
+                              need_envs={
+                                  "SAVE_DIRNAME": model_dirname,
+                              })
 
         self._need_test = 1
         self._model_dir = model_dirname
 
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
         shutil.rmtree(model_dirname)
 
 
 class TestDistCtrTrainInfer(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -106,8 +110,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train_infer(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
index fbd58e015c17e..7e3e5258aed60 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
@@ -18,6 +18,7 @@
 import unittest
 
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -36,30 +37,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -68,8 +73,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = True
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.layers.embedding(
             input=q,
@@ -95,8 +102,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.layers.embedding(
             input=pt,
@@ -121,8 +130,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.layers.embedding(
             input=nt,
@@ -160,11 +171,10 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = fleet.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = fleet.UserDefinedRoleMaker(current_id=0,
+                                          role=role_maker.Role.SERVER,
+                                          worker_num=2,
+                                          server_endpoints=endpoints)
 
         fleet.init(role)
         loss, acc, _ = self.net()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py
index 3fa4cc1c1b6fd..65fc10031cc7d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
@@ -35,14 +36,18 @@
 
 
 class TestExponentialDecay(unittest.TestCase):
+
     def net(self):
-        input_data = paddle.static.data(
-            name="sparse_input", shape=[None, 1], dtype="int64")
-        input_label = paddle.static.data(
-            name="label", shape=[None, 1], dtype="int64")
+        input_data = paddle.static.data(name="sparse_input",
+                                        shape=[None, 1],
+                                        dtype="int64")
+        input_label = paddle.static.data(name="label",
+                                         shape=[None, 1],
+                                         dtype="int64")
         label = paddle.cast(input_label, dtype="float32")
-        embedding = paddle.static.nn.embedding(
-            input_data, is_sparse=True, size=[1000, 128])
+        embedding = paddle.static.nn.embedding(input_data,
+                                               is_sparse=True,
+                                               size=[1000, 128])
 
         fc1 = paddle.static.nn.fc(embedding, size=1024, activation="relu")
         fc2 = paddle.static.nn.fc(fc1, size=512, activation="relu")
@@ -59,16 +64,16 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.SERVER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss = self.net()
-        scheduler = paddle.optimizer.lr.InverseTimeDecay(
-            learning_rate=base_lr, gamma=0.999, verbose=True)
+        scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=base_lr,
+                                                         gamma=0.999,
+                                                         verbose=True)
         optimizer = fluid.optimizer.Adam(scheduler)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
index cad7d067e9019..171889ae917e0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
@@ -38,30 +38,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -70,8 +74,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = True
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.contrib.layers.sparse_embedding(
             input=q,
@@ -95,8 +101,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.contrib.layers.sparse_embedding(
             input=pt,
@@ -119,8 +127,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.contrib.layers.sparse_embedding(
             input=nt,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
index 14ed9dc04277d..65e4381bc2a3f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 
 import unittest
@@ -40,30 +41,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -72,8 +77,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = True
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.contrib.layers.sparse_embedding(
             input=q,
@@ -97,8 +104,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.contrib.layers.sparse_embedding(
             input=pt,
@@ -121,8 +130,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.contrib.layers.sparse_embedding(
             input=nt,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
index 858b1acb4fde1..243023b4fe1c6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 
 import unittest
@@ -40,30 +41,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -72,8 +77,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = True
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.contrib.layers.sparse_embedding(
             input=q,
@@ -97,8 +104,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.contrib.layers.sparse_embedding(
             input=pt,
@@ -121,8 +130,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.contrib.layers.sparse_embedding(
             input=nt,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
index aa7975d2b8bef..b8ff052c192cd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
@@ -18,6 +18,7 @@
 import unittest
 
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -36,30 +37,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -68,8 +73,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = False
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.layers.embedding(
             input=q,
@@ -95,8 +102,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.layers.embedding(
             input=pt,
@@ -121,8 +130,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.layers.embedding(
             input=nt,
@@ -160,11 +171,10 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = fleet.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = fleet.UserDefinedRoleMaker(current_id=0,
+                                          role=role_maker.Role.WORKER,
+                                          worker_num=2,
+                                          server_endpoints=endpoints)
 
         fleet.init(role)
         loss, acc, _ = self.net()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
index ca8f5261045f7..32af1959f25db 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
@@ -18,6 +18,7 @@
 import unittest
 
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -36,30 +37,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -68,8 +73,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = True
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.contrib.layers.sparse_embedding(
             input=q,
@@ -93,8 +100,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.contrib.layers.sparse_embedding(
             input=pt,
@@ -117,8 +126,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.contrib.layers.sparse_embedding(
             input=nt,
@@ -154,11 +165,10 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.SERVER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss, acc, _ = self.net()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
index 2812cb4b3d633..63ea8f639aae4 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
@@ -18,6 +18,7 @@
 import unittest
 
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -36,30 +37,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -68,8 +73,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = True
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.layers.embedding(
             input=q,
@@ -95,8 +102,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.layers.embedding(
             input=pt,
@@ -121,8 +130,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.layers.embedding(
             input=nt,
@@ -160,21 +171,19 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.SERVER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss, acc, _ = self.net()
 
         optimizer = fluid.optimizer.Adam(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=base_lr,
-                decay_steps=500,
-                decay_rate=0.969,
-                staircase=True))
+            learning_rate=fluid.layers.exponential_decay(learning_rate=base_lr,
+                                                         decay_steps=500,
+                                                         decay_rate=0.969,
+                                                         staircase=True))
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
index 902870789e8a5..692f586a43546 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
@@ -18,6 +18,7 @@
 import unittest
 
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -36,30 +37,34 @@
 
 
 class TestPSPassWithBow(unittest.TestCase):
+
     def net(self):
+
         def get_acc(cos_q_nt, cos_q_pt, batch_size):
             cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
             cond = fluid.layers.cast(cond, dtype='float64')
             cond_3 = fluid.layers.reduce_sum(cond)
-            acc = fluid.layers.elementwise_div(
-                cond_3,
-                fluid.layers.fill_constant(
-                    shape=[1], value=batch_size * 1.0, dtype='float64'),
-                name="simnet_acc")
+            acc = fluid.layers.elementwise_div(cond_3,
+                                               fluid.layers.fill_constant(
+                                                   shape=[1],
+                                                   value=batch_size * 1.0,
+                                                   dtype='float64'),
+                                               name="simnet_acc")
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
             loss_op1 = fluid.layers.elementwise_sub(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt,
-                    shape=[-1, 1],
-                    value=margin,
-                    dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
+                                                           shape=[-1, 1],
+                                                           value=margin,
+                                                           dtype='float32'),
                 cos_q_pt)
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
             loss_op3 = fluid.layers.elementwise_max(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                fluid.layers.fill_constant_batch_size_like(input=loss_op2,
+                                                           shape=[-1, 1],
+                                                           value=0.0,
+                                                           dtype='float32'),
                 loss_op2)
             avg_cost = fluid.layers.mean(loss_op3)
             return avg_cost
@@ -68,8 +73,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         is_sparse = True
 
         # query
-        q = fluid.layers.data(
-            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        q = fluid.layers.data(name="query_ids",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
         # embedding
         q_emb = fluid.contrib.layers.sparse_embedding(
             input=q,
@@ -93,8 +100,10 @@ def get_loss(cos_q_pt, cos_q_nt):
         # label data
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         # pt
-        pt = fluid.layers.data(
-            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        pt = fluid.layers.data(name="pos_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         pt_emb = fluid.contrib.layers.sparse_embedding(
             input=pt,
@@ -117,8 +126,10 @@ def get_loss(cos_q_pt, cos_q_nt):
                 learning_rate=base_lr),
             bias_attr=fluid.ParamAttr(name="__fc_b__"))
         # nt
-        nt = fluid.layers.data(
-            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
         # embedding
         nt_emb = fluid.contrib.layers.sparse_embedding(
             input=nt,
@@ -154,11 +165,10 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.SERVER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss, acc, _ = self.net()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps7.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps7.py
index b63301b87dcdf..466ceb5c6dbe9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps7.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps7.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 
 import paddle.distributed.fleet as fleet
@@ -21,6 +22,7 @@
 import paddle.fluid as fluid
 import unittest
 import paddle
+
 paddle.enable_static()
 
 # For Net
@@ -35,14 +37,18 @@
 
 
 class TestNaturalExpDecay(unittest.TestCase):
+
     def net(self):
-        input_data = paddle.static.data(
-            name="sparse_input", shape=[None, 1], dtype="int64")
-        input_label = paddle.static.data(
-            name="label", shape=[None, 1], dtype="int64")
+        input_data = paddle.static.data(name="sparse_input",
+                                        shape=[None, 1],
+                                        dtype="int64")
+        input_label = paddle.static.data(name="label",
+                                         shape=[None, 1],
+                                         dtype="int64")
         label = paddle.cast(input_label, dtype="float32")
-        embedding = paddle.static.nn.embedding(
-            input_data, is_sparse=True, size=[1000, 128])
+        embedding = paddle.static.nn.embedding(input_data,
+                                               is_sparse=True,
+                                               size=[1000, 128])
 
         fc1 = paddle.static.nn.fc(embedding, size=1024, activation="relu")
         fc2 = paddle.static.nn.fc(fc1, size=512, activation="relu")
@@ -59,16 +65,16 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.SERVER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss = self.net()
-        scheduler = paddle.optimizer.lr.NaturalExpDecay(
-            learning_rate=base_lr, gamma=0.999, verbose=True)
+        scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=base_lr,
+                                                        gamma=0.999,
+                                                        verbose=True)
         optimizer = fluid.optimizer.Adam(scheduler)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps8.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps8.py
index d213014da9afb..834f7d1273fd3 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps8.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps8.py
@@ -14,12 +14,14 @@
 
 from __future__ import print_function
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
 import unittest
 import paddle
+
 paddle.enable_static()
 
 # For Net
@@ -34,14 +36,18 @@
 
 
 class TestNoamDecay(unittest.TestCase):
+
     def net(self):
-        input_data = paddle.static.data(
-            name="sparse_input", shape=[None, 1], dtype="int64")
-        input_label = paddle.static.data(
-            name="label", shape=[None, 1], dtype="int64")
+        input_data = paddle.static.data(name="sparse_input",
+                                        shape=[None, 1],
+                                        dtype="int64")
+        input_label = paddle.static.data(name="label",
+                                         shape=[None, 1],
+                                         dtype="int64")
         label = paddle.cast(input_label, dtype="float32")
-        embedding = paddle.static.nn.embedding(
-            input_data, is_sparse=True, size=[1000, 128])
+        embedding = paddle.static.nn.embedding(input_data,
+                                               is_sparse=True,
+                                               size=[1000, 128])
 
         fc1 = paddle.static.nn.fc(embedding, size=1024, activation="relu")
         fc2 = paddle.static.nn.fc(fc1, size=512, activation="relu")
@@ -58,16 +64,16 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.SERVER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss = self.net()
-        scheduler = paddle.optimizer.lr.NoamDecay(
-            d_model=0.01, warmup_steps=100, verbose=True)
+        scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01,
+                                                  warmup_steps=100,
+                                                  verbose=True)
         optimizer = fluid.optimizer.Adam(scheduler)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps9.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps9.py
index 926789f4fba1b..a1bd087cbe8b5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps9.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps9.py
@@ -14,12 +14,14 @@
 
 from __future__ import print_function
 import os
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
 import unittest
 import paddle
+
 paddle.enable_static()
 
 # For Net
@@ -34,14 +36,18 @@
 
 
 class TestExponentialDecay(unittest.TestCase):
+
     def net(self):
-        input_data = paddle.static.data(
-            name="sparse_input", shape=[None, 1], dtype="int64")
-        input_label = paddle.static.data(
-            name="label", shape=[None, 1], dtype="int64")
+        input_data = paddle.static.data(name="sparse_input",
+                                        shape=[None, 1],
+                                        dtype="int64")
+        input_label = paddle.static.data(name="label",
+                                         shape=[None, 1],
+                                         dtype="int64")
         label = paddle.cast(input_label, dtype="float32")
-        embedding = paddle.static.nn.embedding(
-            input_data, is_sparse=True, size=[1000, 128])
+        embedding = paddle.static.nn.embedding(input_data,
+                                               is_sparse=True,
+                                               size=[1000, 128])
 
         fc1 = paddle.static.nn.fc(embedding, size=1024, activation="relu")
         fc2 = paddle.static.nn.fc(fc1, size=512, activation="relu")
@@ -58,16 +64,16 @@ def test(self):
             "127.0.0.1:36007"
         ]
 
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_num=2,
-            server_endpoints=endpoints)
+        role = role_maker.UserDefinedRoleMaker(current_id=0,
+                                               role=role_maker.Role.SERVER,
+                                               worker_num=2,
+                                               server_endpoints=endpoints)
 
         fleet.init(role)
         loss = self.net()
-        scheduler = paddle.optimizer.lr.ExponentialDecay(
-            learning_rate=base_lr, gamma=0.999, verbose=True)
+        scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=base_lr,
+                                                         gamma=0.999,
+                                                         verbose=True)
         optimizer = fluid.optimizer.Adam(scheduler)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps_gpu_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps_gpu_ctr.py
index 9308a3e4792f3..d29ea0daad693 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps_gpu_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps_gpu_ctr.py
@@ -21,6 +21,7 @@
 
 
 class TestPsGPUAsyncDataset2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "dataset"
@@ -50,8 +51,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
+        self.check_with_place("dist_fleet_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py
index e729bfe053752..b49a2599b76a5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py
@@ -22,6 +22,7 @@
 
 
 class TestFleetMetaOptimizerPrecision(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -34,11 +35,10 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_fleet_raw_program_optimizer.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("dist_fleet_raw_program_optimizer.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer_fuse_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer_fuse_allreduce.py
index 21b921c52c8cf..be85ea71040ab 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer_fuse_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer_fuse_allreduce.py
@@ -22,6 +22,7 @@
 
 
 class TestFleetMetaOptimizerAllReduceFusePrecision(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
index e0fa590db2abd..6e45b142399a8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
@@ -24,6 +24,7 @@
 
 
 class TestDistSimnetASync2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -51,8 +52,9 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_simnet_bow.py", delta=1e-5, check_error_log=True)
+        self.check_with_place("dist_fleet_simnet_bow.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py
index 4e0241c1e9c52..17d50f988a6a8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py
@@ -30,6 +30,7 @@
 
 @unittest.skip(reason="Skip unstable ut, need paddle sync mode fix")
 class TestDistMnistSync2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "sync"
         self._reader = "pyreader"
@@ -59,13 +60,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_sparse_embedding_ctr.py",
-            delta=1e-5,
-            check_error_log=True)
+        self.check_with_place("dist_fleet_sparse_embedding_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 class TestDistMnistAsync2x2(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -95,13 +96,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_sparse_embedding_ctr.py",
-            delta=1e-5,
-            check_error_log=True)
+        self.check_with_place("dist_fleet_sparse_embedding_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 class TestDistMnistAsync2x2WithDecay(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -132,13 +133,13 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_sparse_embedding_ctr.py",
-            delta=1e-5,
-            check_error_log=True)
+        self.check_with_place("dist_fleet_sparse_embedding_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 class TestDistMnistAsync2x2WithUnifrom(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
@@ -169,19 +170,20 @@ def check_with_place(self,
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_sparse_embedding_ctr.py",
-            delta=1e-5,
-            check_error_log=True)
+        self.check_with_place("dist_fleet_sparse_embedding_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 @unittest.skip(reason="Skip unstable ut, need tensor table to enhance")
 class TestDistMnistAsync2x2WithGauss(TestFleetBase):
+
     def _setup_config(self):
         self._mode = "async"
         self._reader = "pyreader"
 
     def _run_local_infer(self, model_file):
+
         def net():
             """
             network definition
@@ -194,24 +196,21 @@ def net():
             """
             dnn_input_dim, lr_input_dim = 10, 10
 
-            dnn_data = fluid.layers.data(
-                name="dnn_data",
-                shape=[-1, 1],
-                dtype="int64",
-                lod_level=1,
-                append_batch_size=False)
-            lr_data = fluid.layers.data(
-                name="lr_data",
-                shape=[-1, 1],
-                dtype="int64",
-                lod_level=1,
-                append_batch_size=False)
-            label = fluid.layers.data(
-                name="click",
-                shape=[-1, 1],
-                dtype="int64",
-                lod_level=0,
-                append_batch_size=False)
+            dnn_data = fluid.layers.data(name="dnn_data",
+                                         shape=[-1, 1],
+                                         dtype="int64",
+                                         lod_level=1,
+                                         append_batch_size=False)
+            lr_data = fluid.layers.data(name="lr_data",
+                                        shape=[-1, 1],
+                                        dtype="int64",
+                                        lod_level=1,
+                                        append_batch_size=False)
+            label = fluid.layers.data(name="click",
+                                      shape=[-1, 1],
+                                      dtype="int64",
+                                      lod_level=0,
+                                      append_batch_size=False)
 
             datas = [dnn_data, lr_data, label]
 
@@ -223,10 +222,10 @@ def net():
                 input=dnn_data,
                 size=[dnn_input_dim, dnn_layer_dims[0]],
                 is_test=inference,
-                param_attr=fluid.ParamAttr(
-                    name="deep_embedding", initializer=init))
-            dnn_pool = fluid.layers.sequence_pool(
-                input=dnn_embedding, pool_type="sum")
+                param_attr=fluid.ParamAttr(name="deep_embedding",
+                                           initializer=init))
+            dnn_pool = fluid.layers.sequence_pool(input=dnn_embedding,
+                                                  pool_type="sum")
             dnn_out = dnn_pool
             for i, dim in enumerate(dnn_layer_dims[1:]):
                 fc = fluid.layers.fc(
@@ -247,8 +246,8 @@ def net():
                     name="wide_embedding",
                     initializer=fluid.initializer.Constant(value=0.01)))
 
-            lr_pool = fluid.layers.sequence_pool(
-                input=lr_embbding, pool_type="sum")
+            lr_pool = fluid.layers.sequence_pool(input=lr_embbding,
+                                                 pool_type="sum")
             merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
             predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
             return datas, predict
@@ -296,10 +295,9 @@ def check_with_place(self,
         shutil.rmtree(model_dir)
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_sparse_embedding_ctr.py",
-            delta=1e-5,
-            check_error_log=True)
+        self.check_with_place("dist_fleet_sparse_embedding_ctr.py",
+                              delta=1e-5,
+                              check_error_log=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py
index 574a6888fdb64..d692528f5bb34 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py
@@ -15,6 +15,7 @@
 import os
 import time
 import unittest
+
 os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle
 import paddle.distributed.fleet.base.role_maker as role_maker
@@ -24,6 +25,7 @@
 
 
 class TestDistStrategyTrainerDescConfig(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_PSERVER_NUMS"] = "2"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
diff --git a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
index 11ac301b72a00..88d9e46370bd3 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
@@ -21,11 +21,13 @@
 import paddle.fluid.core as core
 
 import paddle
+
 paddle.enable_static()
 
 
 @unittest.skip("do not need currently")
 class TestLookupTableFuseOp(unittest.TestCase):
+
     def test_fuse(self):
         places = [core.CPUPlace()]
         # currently only support CPU
@@ -73,25 +75,25 @@ def check_with_place(self, place):
             outputs=None,
             attrs={"large_scale_metas": metas})
 
-        init_program.global_block().append_op(
-            type="lookup_sparse_table_read",
-            inputs={"Ids": ids},
-            outputs={"Out": output},
-            attrs={
-                "tablename": "embedding_1.block0",
-                "init": True,
-                "value_names": ["Param"],
-            })
-
-        init_program.global_block().append_op(
-            type="lookup_sparse_table_read",
-            inputs={"Ids": ids},
-            outputs={"Out": output},
-            attrs={
-                "tablename": "embedding_2.block0",
-                "init": True,
-                "value_names": ["Param"],
-            })
+        init_program.global_block().append_op(type="lookup_sparse_table_read",
+                                              inputs={"Ids": ids},
+                                              outputs={"Out": output},
+                                              attrs={
+                                                  "tablename":
+                                                  "embedding_1.block0",
+                                                  "init": True,
+                                                  "value_names": ["Param"],
+                                              })
+
+        init_program.global_block().append_op(type="lookup_sparse_table_read",
+                                              inputs={"Ids": ids},
+                                              outputs={"Out": output},
+                                              attrs={
+                                                  "tablename":
+                                                  "embedding_2.block0",
+                                                  "init": True,
+                                                  "value_names": ["Param"],
+                                              })
 
         executor = fluid.Executor(place)
         executor.run(init_program)
@@ -150,8 +152,10 @@ def check_with_place(self, place):
                 "Beta1Pow": beta1,
                 "Beta2Pow": beta2,
             },
-            outputs={"Beta1PowOut": beta1,
-                     "Beta2PowOut": beta2},
+            outputs={
+                "Beta1PowOut": beta1,
+                "Beta2PowOut": beta2
+            },
             attrs={
                 "is_entry": False,
                 "tablename": "embedding_1.block0",
@@ -160,8 +164,10 @@ def check_with_place(self, place):
 
         training_program.global_block().append_op(
             type="lookup_sparse_table_fuse_sgd",
-            inputs={"Grad": grads,
-                    "LearningRate": lr},
+            inputs={
+                "Grad": grads,
+                "LearningRate": lr
+            },
             attrs={
                 "is_entry": False,
                 "tablename": "embedding_2.block0",
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
index 23a2b8fd30607..257f5f0db9cf3 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
@@ -21,6 +21,7 @@
 
 
 class TestDistMnistNCCL2BackWardDeps(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
index 4cf2cf5f36754..483d47577606c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
@@ -24,6 +24,7 @@
 
 
 class TestDistMnist2x2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -52,20 +53,18 @@ def check_with_place(self,
                 "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
             required_envs["GLOG_logtostderr"] = "1"
 
-        no_merge_losses = self._run_local(
-            model_file,
-            required_envs,
-            check_error_log=check_error_log,
-            batch_size=4,
-            log_name=flag_name)
+        no_merge_losses = self._run_local(model_file,
+                                          required_envs,
+                                          check_error_log=check_error_log,
+                                          batch_size=4,
+                                          log_name=flag_name)
 
-        batch_merge_losses = self._run_local(
-            model_file,
-            required_envs,
-            check_error_log=check_error_log,
-            batch_size=2,
-            batch_merge_repeat=2,
-            log_name=flag_name)
+        batch_merge_losses = self._run_local(model_file,
+                                             required_envs,
+                                             check_error_log=check_error_log,
+                                             batch_size=2,
+                                             batch_merge_repeat=2,
+                                             log_name=flag_name)
         # Ensure both result have values.
         self.assertGreater(len(no_merge_losses), 1)
         self.assertEqual(len(no_merge_losses), len(batch_merge_losses))
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
index eae19afb2ef86..f62ce85032bc9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
@@ -38,6 +38,7 @@ def count_of_sparse_all_reduce_calls(file_name):
 
 
 class TestDistMnistNCCL2DGC(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -48,11 +49,10 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_mnist.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("dist_mnist.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
     def tearDown(self):
         import paddle.fluid as fluid
@@ -67,6 +67,7 @@ def tearDown(self):
 
 
 class TestDistMnistNCCL2DGCMultiCards(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -77,11 +78,10 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place_multi_cards(
-                "dist_mnist.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place_multi_cards("dist_mnist.py",
+                                              delta=1e-5,
+                                              check_error_log=True,
+                                              log_name=flag_name)
 
     def tearDown(self):
         import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py
index 1cecb99620245..fc7d004672cc3 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py
@@ -23,6 +23,7 @@
 
 
 class TestDistMnistFleetSave(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
index 3b15b06b5efa8..265e59ff94919 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
@@ -21,6 +21,7 @@
 
 
 class TestDistMnistNCCL2FleetApi(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -40,6 +41,7 @@ def test_dist_train(self):
 
 
 class FleetCollectiveTest(unittest.TestCase):
+
     def test_open_sync_batch_norm(self):
         import paddle.fluid as fluid
         import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -61,8 +63,8 @@ def test_open_sync_batch_norm(self):
         dist_strategy = DistributedStrategy()
         dist_strategy.sync_batch_norm = True
 
-        dist_optimizer = fleet.distributed_optimizer(
-            optimizer, strategy=dist_strategy)
+        dist_optimizer = fleet.distributed_optimizer(optimizer,
+                                                     strategy=dist_strategy)
         dist_optimizer.minimize(loss)
 
         self.assertEqual(dist_strategy.exec_strategy.num_threads, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
index d74d08681c18c..3a249c929e378 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
@@ -18,6 +18,7 @@
 
 
 class TestDistMnist2x2FP16AllReduce(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_gradient_merge.py
index 8056ab8633380..7e4453ca4c44d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_gradient_merge.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_gradient_merge.py
@@ -22,6 +22,7 @@
 
 
 class TestDistMnistGradMerge(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -29,14 +30,14 @@ def _setup_config(self):
 
     def test_dist_train(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_mnist_gradient_merge.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("dist_mnist_gradient_merge.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestDistMnistGradMergeNoFuse(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -45,14 +46,14 @@ def _setup_config(self):
 
     def test_dist_train(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_mnist_gradient_merge.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name + "_no_fuse")
+            self.check_with_place("dist_mnist_gradient_merge.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name + "_no_fuse")
 
 
 class TestDistMnistGradMergeRawOptimizerBase(TestDistBase):
+
     def _setup_config(self):
         self._use_reader_alloc = False
         self._nccl2_mode = True
@@ -66,19 +67,19 @@ def test_dist_train(self):
         if fluid.core.is_compiled_with_cuda():
             avg = str(self.enable_avg())
             log_name = flag_name + "_raw_optimizer_gm_avg_" + avg
-            self.check_with_place(
-                "dist_mnist_gradient_merge_raw_optimizer.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=log_name,
-                need_envs={
-                    'FLAGS_apply_pass_to_program': '1',
-                    'enable_gm_avg': avg,
-                })
+            self.check_with_place("dist_mnist_gradient_merge_raw_optimizer.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=log_name,
+                                  need_envs={
+                                      'FLAGS_apply_pass_to_program': '1',
+                                      'enable_gm_avg': avg,
+                                  })
 
 
 class TestDistMnistGradMergeRawOptimizerAvg(
         TestDistMnistGradMergeRawOptimizerBase):
+
     def enable_avg(self):
         return True
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
index e1fbbebe171fc..c615b75316943 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
@@ -24,6 +24,7 @@
 
 
 class TestDistMnistNCCL2HAllreduce(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -35,11 +36,10 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_mnist.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("dist_mnist.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py
index 53c7527fdafbd..f714a8ad00f77 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py
@@ -18,6 +18,7 @@
 
 
 class TestDistMnist2x2Lars(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
index d9e6be8609d27..b520f03153d2f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
@@ -24,6 +24,7 @@
 
 
 class TestDistMnistNCCL2MultiNCCLComm(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -34,11 +35,10 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_mnist.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("dist_mnist.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
index 28ef31875dbde..cbf5972e6dbc0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
@@ -21,6 +21,7 @@
 
 
 class TestDistMnistNCCL2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -30,13 +31,12 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_mnist.py",
-                delta=1,
-                need_envs={
-                    "FLAGS_enable_parallel_graph": "1",
-                    "FLAGS_sync_nccl_allreduce": "1"
-                })
+            self.check_with_place("dist_mnist.py",
+                                  delta=1,
+                                  need_envs={
+                                      "FLAGS_enable_parallel_graph": "1",
+                                      "FLAGS_sync_nccl_allreduce": "1"
+                                  })
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
index 4436064dc28ed..30d651ed0d480 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
@@ -21,6 +21,7 @@
 
 
 class TestDistMnistNCCL2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_train.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_train.py
index a5bcada14d8b0..f881847164736 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_train.py
@@ -17,59 +17,60 @@
 from test_dist_base import TestDistBase
 
 import os
+
 flag_name = os.path.splitext(__file__)[0]
 
 
 class TestDistMnist2x2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_mnist.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_mnist.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestDistMnist2x2WithMemopt(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._mem_opt = True
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_mnist.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_mnist.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestDistMnistAsync(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._use_reduce = False
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_mnist.py",
-            delta=200,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_mnist.py",
+                              delta=200,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestDistMnistDcAsgd(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._dc_asgd = True
 
     def test_se_resnext(self):
-        self.check_with_place(
-            "dist_mnist.py",
-            delta=200,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_mnist.py",
+                              delta=200,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
index 0ee6740ac2357..fd58564479153 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
@@ -21,6 +21,7 @@
 
 
 class TestDistMnistLocalSGDFleetApi(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -36,6 +37,7 @@ def test_dist_train(self):
 
 
 class TestDistMnistGradAllReduceFleetApi(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_oneps.py b/python/paddle/fluid/tests/unittests/test_dist_oneps.py
index 2493c7aab5510..7704a4c715efc 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_oneps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_oneps.py
@@ -15,12 +15,14 @@
 import unittest
 
 import paddle
+
 paddle.enable_static()
 
 from paddle.distributed.fleet.runtime.the_one_ps import Table
 
 
 class TestTable(unittest.TestCase):
+
     def test_table_tensor(self):
         table = Table()
         table.id = 1001
diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py
index ad999c3feae42..255431544f9f0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_op.py
@@ -35,6 +35,7 @@ def dist(x, y, p):
 
 
 class TestDistOp(OpTest):
+
     def setUp(self):
         self.op_type = 'dist'
         self.python_api = paddle.dist
@@ -110,14 +111,14 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ["X", "Y"],
-            "Out",
-            user_defined_grads=self.gradient,
-            check_eager=True)
+        self.check_grad(["X", "Y"],
+                        "Out",
+                        user_defined_grads=self.gradient,
+                        check_eager=True)
 
 
 class TestDistOpCase1(TestDistOp):
+
     def init_case(self):
         self.x_shape = (3, 5, 5, 6)
         self.y_shape = (5, 5, 6)
@@ -125,6 +126,7 @@ def init_case(self):
 
 
 class TestDistOpCase2(TestDistOp):
+
     def init_case(self):
         self.x_shape = (10, 10)
         self.y_shape = (4, 10, 10)
@@ -132,6 +134,7 @@ def init_case(self):
 
 
 class TestDistOpCase3(TestDistOp):
+
     def init_case(self):
         self.x_shape = (15, 10)
         self.y_shape = (15, 10)
@@ -139,6 +142,7 @@ def init_case(self):
 
 
 class TestDistOpCase4(TestDistOp):
+
     def init_case(self):
         self.x_shape = (2, 3, 4, 5, 8)
         self.y_shape = (3, 1, 5, 8)
@@ -146,6 +150,7 @@ def init_case(self):
 
 
 class TestDistOpCase5(TestDistOp):
+
     def init_case(self):
         self.x_shape = (4, 1, 4, 8)
         self.y_shape = (2, 2, 1, 4, 4, 8)
@@ -153,6 +158,7 @@ def init_case(self):
 
 
 class TestDistAPI(unittest.TestCase):
+
     def init_data_type(self):
         self.data_type = 'float32' if core.is_compiled_with_rocm(
         ) else 'float64'
@@ -168,12 +174,14 @@ def test_api(self):
             x_i = np.random.random((2, 3, 4, 5)).astype(self.data_type)
             y_i = np.random.random((3, 1, 5)).astype(self.data_type)
             result = paddle.dist(x, y, p)
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             out = exe.run(fluid.default_main_program(),
-                          feed={'x': x_i,
-                                'y': y_i},
+                          feed={
+                              'x': x_i,
+                              'y': y_i
+                          },
                           fetch_list=[result])
             self.assertTrue(np.allclose(dist(x_i, y_i, p), out[0]))
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
index ed71a3897562c..08e8adaa9322d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
@@ -23,10 +23,12 @@
 from test_dist_base import TestDistBase, RUN_STEP
 
 import os
+
 flag_name = os.path.splitext(__file__)[0]
 
 
 class TestDistSaveLoadDense2x2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._enforce_place = "CPU"
@@ -64,8 +66,10 @@ def check_with_place(self,
         cluster_env.update(required_envs)
 
         local_var = self._run_local(model_file, local_env, check_error_log)
-        tr0_var, tr1_var = self._run_cluster(
-            model_file, cluster_env, check_error_log, log_name=flag_name)
+        tr0_var, tr1_var = self._run_cluster(model_file,
+                                             cluster_env,
+                                             check_error_log,
+                                             log_name=flag_name)
 
         shutil.rmtree(model_dir)
 
@@ -84,14 +88,14 @@ def test_dist(self):
             'IS_SELF_CONTAINED_LR': '1',
             'SAVE_MODE': 'LOCAL',
         }
-        self.check_with_place(
-            "dist_save_load.py",
-            delta=0,
-            check_error_log=False,
-            need_envs=need_envs)
+        self.check_with_place("dist_save_load.py",
+                              delta=0,
+                              check_error_log=False,
+                              need_envs=need_envs)
 
 
 class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._enforce_place = "CPU"
@@ -124,15 +128,19 @@ def check_with_place(self,
         save_env["MODEL_DIR"] = model_dir
         save_env.update(required_envs)
 
-        tr0_var_1, tr1_var_1 = self._run_cluster(
-            model_file, save_env, check_error_log, log_name=flag_name)
+        tr0_var_1, tr1_var_1 = self._run_cluster(model_file,
+                                                 save_env,
+                                                 check_error_log,
+                                                 log_name=flag_name)
 
         load_env = {}
         load_env["LOAD"] = "1"
         load_env["MODEL_DIR"] = model_dir
         load_env.update(required_envs)
-        tr0_var_2, tr1_var_2 = self._run_cluster(
-            model_file, load_env, check_error_log, log_name=flag_name)
+        tr0_var_2, tr1_var_2 = self._run_cluster(model_file,
+                                                 load_env,
+                                                 check_error_log,
+                                                 log_name=flag_name)
 
         shutil.rmtree(model_dir)
 
@@ -153,12 +161,11 @@ def test_dist(self):
             'OPTIMIZER': 'ADAM',
             'SKIP_STEPS': str(np.random.randint(2, 6))
         }
-        self.check_with_place(
-            "dist_save_load.py",
-            delta=0,
-            check_error_log=True,
-            need_envs=need_envs,
-            log_name=flag_name)
+        self.check_with_place("dist_save_load.py",
+                              delta=0,
+                              check_error_log=True,
+                              need_envs=need_envs,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py
index b48ec89a2afc9..86101cf9fe4db 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_dgc.py
@@ -18,10 +18,12 @@
 import os
 
 import os
+
 flag_name = os.path.splitext(__file__)[0]
 
 
 class TestDistSeResnetNCCL2DGC(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -33,11 +35,10 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_se_resnext.py",
-                delta=30,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("dist_se_resnext.py",
+                                  delta=30,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
index 64217135be735..21d002ef318a7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
@@ -25,6 +25,7 @@
 
 
 class TestDistSeResneXtNCCL(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reader_alloc = False
@@ -33,14 +34,14 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_se_resnext.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("dist_se_resnext.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestDistSeResneXtNCCLMP(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reader_alloc = False
@@ -50,12 +51,11 @@ def _setup_config(self):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "dist_se_resnext.py",
-                delta=1e-5,
-                check_error_log=True,
-                need_envs={"NCCL_P2P_DISABLE": "1"},
-                log_name=flag_name)
+            self.check_with_place("dist_se_resnext.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  need_envs={"NCCL_P2P_DISABLE": "1"},
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py
index cb4d07b4ccb66..3c1dc9d989f4b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py
@@ -18,21 +18,22 @@
 import os
 
 import os
+
 flag_name = os.path.splitext(__file__)[0]
 
 
 class TestDistSeResneXt2x2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reader_alloc = False
 
     @unittest.skip(reason="Skip unstable ci")
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_se_resnext.py",
-            delta=1e-7,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_se_resnext.py",
+                              delta=1e-7,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_sharding_save.py b/python/paddle/fluid/tests/unittests/test_dist_sharding_save.py
index 051bb7724ebea..9f78f229612d6 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_sharding_save.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sharding_save.py
@@ -23,6 +23,7 @@
 
 
 class TestDistMnistFleetSave(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py
index 0044be23260ca..56bc41690e7f0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py
@@ -41,7 +41,8 @@ def net(self, emb_array, fc_array):
                 param_attr=fluid.ParamAttr(
                     name="embedding",
                     initializer=fluid.initializer.NumpyArrayInitializer(
-                        emb_array)), )
+                        emb_array)),
+            )
 
             fc1 = fluid.layers.fc(
                 input=emb,
@@ -72,6 +73,7 @@ def save_origin_model(self, emb_array, fc_array):
 
 @unittest.skip(reason="Skip unstable ut, need rewrite with new implement")
 class TestSparseLoadOpCase1(SparseLoadOp):
+
     def test_2ps_0_load(self):
         # init No.0 server env
         env = {}
@@ -110,8 +112,8 @@ def test_2ps_0_load(self):
 
         fc_w = np.array(fluid.global_scope().find_var("fc").get_tensor())
 
-        emb = np.array(fluid.global_scope().find_var("embedding.block0")
-                       .get_tensor())
+        emb = np.array(
+            fluid.global_scope().find_var("embedding.block0").get_tensor())
 
         assert fc_w.all() == fc_array.all()
         assert emb.all() == emb_array[::2].all()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py
index b06d718e598de..6ae1afa7dc587 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps1.py
@@ -29,6 +29,7 @@
 
 @unittest.skip(reason="Skip unstable ut, need rewrite with new implement")
 class TestSparseLoadOpCase2(SparseLoadOp):
+
     def test_2ps_0_load(self):
         # init No.1 server env
         env = {}
@@ -66,8 +67,8 @@ def test_2ps_0_load(self):
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(loss)
         fleet.init_server(model_path)
-        emb = np.array(fluid.global_scope().find_var("embedding.block1")
-                       .get_tensor())
+        emb = np.array(
+            fluid.global_scope().find_var("embedding.block1").get_tensor())
         assert emb.all() == emb_array[1::2].all()
         shutil.rmtree(model_path)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py
index 17bff651c4489..416a629071509 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py
@@ -51,8 +51,9 @@ def net(self):
             with fluid.program_guard(train_program, startup_program):
                 with fluid.unique_name.guard():
                     inputs = fluid.data('input', shape=[None, 1], dtype="int64")
-                    emb = fluid.layers.embedding(
-                        inputs, is_sparse=True, size=[10000, 128])
+                    emb = fluid.layers.embedding(inputs,
+                                                 is_sparse=True,
+                                                 size=[10000, 128])
                     fc1 = fluid.layers.fc(input=emb, size=128, act="relu")
                     fc2 = fluid.layers.fc(input=fc1, size=64, act="relu")
                     loss = fluid.layers.reduce_mean(fc2)
@@ -60,6 +61,7 @@ def net(self):
 
 
 class TestSparseLoadProgramSGD(TestSparseLoadProgram):
+
     def test_server_init(self):
         scope, train_program, startup_program, loss = self.net()
         with fluid.scope_guard(scope):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
index d49ea3372e5f3..78264228a0f0f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
@@ -18,33 +18,34 @@
 from test_dist_base import TestDistBase
 
 import os
+
 flag_name = os.path.splitext(__file__)[0]
 
 
 class TestDistTextClassification2x2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._enforce_place = "CPU"
 
     def test_text_classification(self):
-        self.check_with_place(
-            "dist_text_classification.py",
-            delta=1e-6,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_text_classification.py",
+                              delta=1e-6,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestDistTextClassification2x2Async(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._enforce_place = "CPU"
 
     def test_se_resnext(self):
-        self.check_with_place(
-            "dist_text_classification.py",
-            delta=100,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_text_classification.py",
+                              delta=100,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index e9f39f1090411..bdaee7665435f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -38,6 +38,7 @@
 
 
 class TestSendOp(unittest.TestCase):
+
     def test_send(self):
         remove_ps_flag(os.getpid())
         # Run init_serv in a thread
@@ -80,16 +81,14 @@ def init_serv(self, place):
         with fluid.program_guard(main):
             serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
             with serv.do():
-                out_var = main.global_block().create_var(
-                    name="scale_0.tmp_0",
-                    psersistable=True,
-                    dtype="float32",
-                    shape=[32, 32])
-                x = layers.data(
-                    shape=[32, 32],
-                    dtype='float32',
-                    name="X",
-                    append_batch_size=False)
+                out_var = main.global_block().create_var(name="scale_0.tmp_0",
+                                                         psersistable=True,
+                                                         dtype="float32",
+                                                         shape=[32, 32])
+                x = layers.data(shape=[32, 32],
+                                dtype='float32',
+                                name="X",
+                                append_batch_size=False)
                 fluid.initializer.Constant(value=1.0)(x, main.global_block())
                 ops._scale(x=x, scale=10.0, out=out_var)
 
@@ -99,20 +98,20 @@ def init_serv(self, place):
     def init_client(self, place, port):
         main = fluid.Program()
         with fluid.program_guard(main):
-            main.global_block().append_op(
-                type="fetch_barrier",
-                inputs={},
-                outputs={"Out": []},
-                attrs={
-                    "endpoints": ["127.0.0.1:{0}".format(port)],
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                })
-
-            x = layers.data(
-                shape=[32, 32],
-                dtype='float32',
-                name='X',
-                append_batch_size=False)
+            main.global_block().append_op(type="fetch_barrier",
+                                          inputs={},
+                                          outputs={"Out": []},
+                                          attrs={
+                                              "endpoints":
+                                              ["127.0.0.1:{0}".format(port)],
+                                              RPC_OP_ROLE_ATTR_NAME:
+                                              RPC_OP_ROLE_ATTR_VALUE
+                                          })
+
+            x = layers.data(shape=[32, 32],
+                            dtype='float32',
+                            name='X',
+                            append_batch_size=False)
             x.persistable = True
             fluid.initializer.Constant(value=2.3)(x, main.global_block())
 
@@ -123,13 +122,13 @@ def init_client(self, place, port):
                 shape=[32, 32])
             fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
 
-            # NOTE(zjl): `Send` is async send, which means that the sent 
-            # variable would be needed even though `Send` op runs. 
+            # NOTE(zjl): `Send` is an async send, which means that the sent
+            # variable is still needed even after the `Send` op has run.
             # Is it a right design? If I do not set `x.persistable = True`,
-            # this unittest would hang in rpc client after x is deleted. 
+            # this unittest would hang in rpc client after x is deleted.
             #
-            # BTW, `Send` is not a public API to users. So I set 
-            # `x.persistable = True` to be a hot fix of this unittest. 
+            # BTW, `Send` is not a public API for users, so I set
+            # `x.persistable = True` as a hot fix for this unittest.
             Send("127.0.0.1:%d" % port, [x])
             o = Recv("127.0.0.1:%d" % port, [get_var])
 
@@ -139,11 +138,10 @@ def init_client(self, place, port):
     def run_local(self, place):
         main = fluid.Program()
         with fluid.program_guard(main):
-            x = layers.data(
-                shape=[32, 32],
-                dtype='float32',
-                name='X',
-                append_batch_size=False)
+            x = layers.data(shape=[32, 32],
+                            dtype='float32',
+                            name='X',
+                            append_batch_size=False)
             fluid.initializer.Constant(value=2.3)(x, main.global_block())
             o = layers.scale(x=x, scale=10.0)
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index 3307caa8b2d62..073cab807b67b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -56,23 +56,27 @@ def download_files():
 
 
 class TestDistTransformer2x2Sync(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
 
     def test_dist_train(self):
         download_files()
-        self.check_with_place(
-            "dist_transformer.py", delta=1e-5, check_error_log=False)
+        self.check_with_place("dist_transformer.py",
+                              delta=1e-5,
+                              check_error_log=False)
 
 
 class TestDistTransformer2x2Async(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
 
     def test_dist_train(self):
         download_files()
-        self.check_with_place(
-            "dist_transformer.py", delta=1.0, check_error_log=False)
+        self.check_with_place("dist_transformer.py",
+                              delta=1.0,
+                              check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 13a36f4a81e1f..5905b682d8941 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -30,6 +30,7 @@
 
 
 class TranspilerTest(unittest.TestCase):
+
     def setUp(self):
         self.trainer_id = 0
         self.trainers = 2
@@ -85,12 +86,11 @@ def _transpiler_instance(self, config=None, sync_mode=True):
         if not self.transpiler:
             main = self.get_main_program()
             self.transpiler = fluid.DistributeTranspiler(config=config)
-            self.transpiler.transpile(
-                self.trainer_id,
-                program=main,
-                pservers=self.pserver_eps,
-                trainers=self.trainers,
-                sync_mode=sync_mode)
+            self.transpiler.transpile(self.trainer_id,
+                                      program=main,
+                                      pservers=self.pserver_eps,
+                                      trainers=self.trainers,
+                                      sync_mode=sync_mode)
 
         return self.transpiler
 
@@ -112,6 +112,7 @@ def test_transpiler(self):
 
 
 class TestBasicModel(TranspilerTest):
+
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         pserver2, startup2 = self.get_pserver(self.pserver2_ep)
@@ -174,6 +175,7 @@ def transpiler_test_impl(self):
 
 
 class TestBasicModelWithLargeBlockSize(TranspilerTest):
+
     def transpiler_test_impl(self):
         config = fluid.DistributeTranspilerConfig()
         config.min_block_size = 1048576
@@ -225,6 +227,7 @@ def transpiler_test_impl(self):
 
 
 class TestNoSliceVar(TranspilerTest):
+
     def setUp(self):
         super(TestNoSliceVar, self).setUp()
 
@@ -244,6 +247,7 @@ def transpiler_test_impl(self):
 
 
 class TestLRDecay(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(input=x,
@@ -255,11 +259,10 @@ def net_conf(self):
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
         avg_cost = fluid.layers.mean(cost)
         sgd_optimizer = fluid.optimizer.SGD(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=1.0,
-                decay_steps=2100,
-                decay_rate=0.1,
-                staircase=True))
+            learning_rate=fluid.layers.exponential_decay(learning_rate=1.0,
+                                                         decay_steps=2100,
+                                                         decay_rate=0.1,
+                                                         staircase=True))
         sgd_optimizer.minimize(avg_cost)
 
     def transpiler_test_impl(self):
@@ -276,15 +279,22 @@ def transpiler_test_impl(self):
 
 
 class TestFakeInit(TranspilerTest):
+
     def net_conf(self):
         dict_size, embedding_size, neg_num = 10000, 8, 5
 
-        input_word = fluid.layers.data(
-            name="input_word", shape=[1], dtype='int64', lod_level=1)
-        true_word = fluid.layers.data(
-            name='true_label', shape=[1], dtype='int64', lod_level=1)
-        neg_word = fluid.layers.data(
-            name="neg_label", shape=[1], dtype='int64', lod_level=1)
+        input_word = fluid.layers.data(name="input_word",
+                                       shape=[1],
+                                       dtype='int64',
+                                       lod_level=1)
+        true_word = fluid.layers.data(name='true_label',
+                                      shape=[1],
+                                      dtype='int64',
+                                      lod_level=1)
+        neg_word = fluid.layers.data(name="neg_label",
+                                     shape=[1],
+                                     dtype='int64',
+                                     lod_level=1)
         inputs = [input_word, true_word, neg_word]
 
         init_width = 0.5 / embedding_size
@@ -292,9 +302,9 @@ def net_conf(self):
             input=inputs[0],
             is_sparse=True,
             size=[dict_size, embedding_size],
-            param_attr=fluid.ParamAttr(
-                name='emb',
-                initializer=fluid.initializer.Uniform(-init_width, init_width)))
+            param_attr=fluid.ParamAttr(name='emb',
+                                       initializer=fluid.initializer.Uniform(
+                                           -init_width, init_width)))
 
         true_emb_w = fluid.layers.embedding(
             input=inputs[1],
@@ -315,62 +325,59 @@ def net_conf(self):
         neg_word_reshape = fluid.layers.reshape(inputs[2], shape=[-1, 1])
         neg_word_reshape.stop_gradient = True
 
-        neg_emb_w = fluid.layers.embedding(
-            input=neg_word_reshape,
-            is_sparse=True,
-            size=[dict_size, embedding_size],
-            param_attr=fluid.ParamAttr(
-                name='emb_w', learning_rate=1.0))
+        neg_emb_w = fluid.layers.embedding(input=neg_word_reshape,
+                                           is_sparse=True,
+                                           size=[dict_size, embedding_size],
+                                           param_attr=fluid.ParamAttr(
+                                               name='emb_w', learning_rate=1.0))
 
-        neg_emb_w_re = fluid.layers.reshape(
-            neg_emb_w, shape=[-1, neg_num, embedding_size])
+        neg_emb_w_re = fluid.layers.reshape(neg_emb_w,
+                                            shape=[-1, neg_num, embedding_size])
 
-        neg_emb_b = fluid.layers.embedding(
-            input=neg_word_reshape,
-            is_sparse=True,
-            size=[dict_size, 1],
-            param_attr=fluid.ParamAttr(
-                name='emb_b', learning_rate=1.0))
+        neg_emb_b = fluid.layers.embedding(input=neg_word_reshape,
+                                           is_sparse=True,
+                                           size=[dict_size, 1],
+                                           param_attr=fluid.ParamAttr(
+                                               name='emb_b', learning_rate=1.0))
 
         neg_emb_b_vec = fluid.layers.reshape(neg_emb_b, shape=[-1, neg_num])
 
         true_logits = fluid.layers.elementwise_add(
-            fluid.layers.reduce_sum(
-                fluid.layers.elementwise_mul(input_emb, true_emb_w),
-                dim=1,
-                keep_dim=True),
-            true_emb_b)
+            fluid.layers.reduce_sum(fluid.layers.elementwise_mul(
+                input_emb, true_emb_w),
+                                    dim=1,
+                                    keep_dim=True), true_emb_b)
 
-        input_emb_re = fluid.layers.reshape(
-            input_emb, shape=[-1, 1, embedding_size])
+        input_emb_re = fluid.layers.reshape(input_emb,
+                                            shape=[-1, 1, embedding_size])
 
-        neg_matmul = fluid.layers.matmul(
-            input_emb_re, neg_emb_w_re, transpose_y=True)
+        neg_matmul = fluid.layers.matmul(input_emb_re,
+                                         neg_emb_w_re,
+                                         transpose_y=True)
         neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num])
         neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
         # nce loss
-        label_ones = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, 1], value=1.0, dtype='float32')
+        label_ones = fluid.layers.fill_constant_batch_size_like(true_logits,
+                                                                shape=[-1, 1],
+                                                                value=1.0,
+                                                                dtype='float32')
         label_zeros = fluid.layers.fill_constant_batch_size_like(
             true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
 
-        true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
-                                                                   label_ones)
-        neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(neg_logits,
-                                                                  label_zeros)
+        true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(
+            true_logits, label_ones)
+        neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(
+            neg_logits, label_zeros)
         cost = fluid.layers.elementwise_add(
-            fluid.layers.reduce_sum(
-                true_xent, dim=1),
-            fluid.layers.reduce_sum(
-                neg_xent, dim=1))
+            fluid.layers.reduce_sum(true_xent, dim=1),
+            fluid.layers.reduce_sum(neg_xent, dim=1))
         avg_cost = fluid.layers.reduce_mean(cost)
 
         sgd_optimizer = fluid.optimizer.SGD(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=1.0,
-                decay_steps=2100,
-                decay_rate=0.1,
-                staircase=True))
+            learning_rate=fluid.layers.exponential_decay(learning_rate=1.0,
+                                                         decay_steps=2100,
+                                                         decay_rate=0.1,
+                                                         staircase=True))
         sgd_optimizer.minimize(avg_cost)
 
     def transpiler_test_impl(self):
@@ -385,6 +392,7 @@ def transpiler_test_impl(self):
 
 
 class TestDecayedAdagrad(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(input=x,
@@ -404,6 +412,7 @@ def transpiler_test_impl(self):
 
 
 class TestFtrl(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(input=x,
@@ -423,6 +432,7 @@ def transpiler_test_impl(self):
 
 
 class TestLRDecayConditional(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(input=x,
@@ -469,14 +479,15 @@ def transpiler_test_impl(self):
 
 
 class TestL2Decay(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(
             input=x,
             size=1000,
             act=None,
-            param_attr=fluid.ParamAttr(
-                name='fc_w', regularizer=fluid.regularizer.L2Decay()),
+            param_attr=fluid.ParamAttr(name='fc_w',
+                                       regularizer=fluid.regularizer.L2Decay()),
             bias_attr=fluid.ParamAttr(name='fc_b'))
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
@@ -502,6 +513,7 @@ def transpiler_test_impl(self):
 
 
 class TestL2DecayWithPiecewise(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(input=x,
@@ -516,8 +528,8 @@ def net_conf(self):
         bd = [1, 10, 20, 30]
         lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
         sgd_optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd, values=lr),
+            learning_rate=fluid.layers.piecewise_decay(boundaries=bd,
+                                                       values=lr),
             momentum=0.9,
             regularization=fluid.regularizer.L2Decay(1e-4))
         sgd_optimizer.minimize(avg_cost)
@@ -545,6 +557,7 @@ def transpiler_test_impl(self):
 
 
 class TestEmptyPserverOptimizeBlocks(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         # only one parameter
@@ -570,33 +583,39 @@ def transpiler_test_impl(self):
 
 
 class TestDistLookupTableBase(TranspilerTest):
+
     def network_with_table(self, is_sparse, is_distributed):
         self.table_size = 1000
         self.emb_size = 64
         self.lookup_table_name = 'shared_w'
 
         def emb_pool(ids, table_name, is_distributed):
-            emb = fluid.layers.embedding(
-                input=ids,
-                size=[self.table_size, self.emb_size],
-                dtype='float32',
-                param_attr=table_name,
-                is_sparse=is_sparse,
-                is_distributed=is_distributed)
+            emb = fluid.layers.embedding(input=ids,
+                                         size=[self.table_size, self.emb_size],
+                                         dtype='float32',
+                                         param_attr=table_name,
+                                         is_sparse=is_sparse,
+                                         is_distributed=is_distributed)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
             return pool
 
-        title_ids = fluid.layers.data(
-            name='title_ids', shape=[1], dtype='int64', lod_level=1)
-        brand_ids = fluid.layers.data(
-            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
-        profile_ids = fluid.layers.data(
-            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
+        title_ids = fluid.layers.data(name='title_ids',
+                                      shape=[1],
+                                      dtype='int64',
+                                      lod_level=1)
+        brand_ids = fluid.layers.data(name='brand_ids',
+                                      shape=[1],
+                                      dtype='int64',
+                                      lod_level=1)
+        profile_ids = fluid.layers.data(name='brand_ids',
+                                        shape=[1],
+                                        dtype='int64',
+                                        lod_level=1)
         title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
         brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
         profile_emb = emb_pool(profile_ids, "profile_emb", False)
-        fc0 = fluid.layers.concat(
-            input=[title_emb, brand_emb, profile_emb], axis=1)
+        fc0 = fluid.layers.concat(input=[title_emb, brand_emb, profile_emb],
+                                  axis=1)
         predict = fluid.layers.fc(input=fc0,
                                   size=2,
                                   act=None,
@@ -611,6 +630,7 @@ def emb_pool(ids, table_name, is_distributed):
 
 
 class TestLocalLookupTable(TestDistLookupTableBase):
+
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=False)
 
@@ -649,6 +669,7 @@ def transpiler_test_impl(self):
 
 
 class TestDistLookupTable(TestDistLookupTableBase):
+
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=True)
 
@@ -699,6 +720,7 @@ def transpiler_test_impl(self):
 
 
 class TestAsyncLocalLookupTable(TestDistLookupTableBase):
+
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=False)
 
@@ -736,6 +758,7 @@ def transpiler_test_impl(self):
 
 
 class TestAsyncDistLookupTable(TestDistLookupTableBase):
+
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=True)
 
@@ -786,6 +809,7 @@ def transpiler_test_impl(self):
 
 
 class TestDistLookupTableSliceSize(TestDistLookupTableBase):
+
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=True)
 
@@ -802,6 +826,7 @@ def transpiler_test_impl(self):
 
 
 class TestDistArgsInProgram(TestDistLookupTableBase):
+
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=True)
 
@@ -817,6 +842,7 @@ def transpiler_test_impl(self):
 
 
 class TestRMSPropOptimizer(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(input=x,
@@ -846,6 +872,7 @@ def transpiler_test_impl(self):
 
 
 class TestLoadSliceVar(TranspilerTest):
+
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
         y_predict = fluid.layers.fc(input=x,
@@ -900,6 +927,7 @@ def transpiler_test_impl(self):
 
 
 class TestNCCL2Transpile(TranspilerTest):
+
     def test_nccl2_transpile(self):
         if fluid.core.is_compiled_with_cuda():  # test nccl2 only with cuda
             main = fluid.Program()
@@ -911,11 +939,10 @@ def test_nccl2_transpile(self):
             config.mode = "nccl2"
             config.wait_port = False
             t = fluid.DistributeTranspiler(config=config)
-            t.transpile(
-                0,
-                trainers="127.0.0.1:6174,127.0.0.1:6175",
-                current_endpoint="127.0.0.1:6174",
-                startup_program=startup)
+            t.transpile(0,
+                        trainers="127.0.0.1:6174,127.0.0.1:6175",
+                        current_endpoint="127.0.0.1:6174",
+                        startup_program=startup)
             print([op.type for op in startup.global_block().ops])
             self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
             self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
@@ -926,6 +953,7 @@ def test_nccl2_transpile(self):
 
 # test for remote prefetch
 class TestRemoteLookupTable(TestDistLookupTableBase):
+
     def net_conf(self):
         import os
         os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
@@ -967,6 +995,7 @@ def transpiler_test_impl(self):
 
 # test for remote prefetch
 class TestRemoteNce(TestDistLookupTableBase):
+
     def network_with_table(self, is_sparse, is_distributed):
 
         num_total_classes = 20
@@ -1029,16 +1058,19 @@ def transpiler_test_impl(self):
 
 # test for remote prefetch
 class TestRemoteHsigmoid(TestDistLookupTableBase):
+
     def network_with_table(self, is_sparse, is_distributed):
 
         num_total_classes = 3
 
         input = fluid.layers.data(name="input", shape=[1], dtype="float32")
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-        path_table = fluid.layers.data(
-            name='path_table', shape=[3], dtype='int64')
-        path_code = fluid.layers.data(
-            name='path_code', shape=[3], dtype='int64')
+        path_table = fluid.layers.data(name='path_table',
+                                       shape=[3],
+                                       dtype='int64')
+        path_code = fluid.layers.data(name='path_code',
+                                      shape=[3],
+                                      dtype='int64')
         w_param = fluid.default_main_program().global_block().create_parameter(
             shape=[num_total_classes, 10],
             dtype='float32',
@@ -1057,14 +1089,13 @@ def network_with_table(self, is_sparse, is_distributed):
             param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                 scale=1 / math.sqrt(num_total_classes))))
 
-        cost = fluid.layers.hsigmoid(
-            input=emb,
-            label=label,
-            num_classes=num_total_classes,
-            path_table=path_table,
-            path_code=path_code,
-            is_custom=True,
-            is_sparse=is_sparse)
+        cost = fluid.layers.hsigmoid(input=emb,
+                                     label=label,
+                                     num_classes=num_total_classes,
+                                     path_table=path_table,
+                                     path_code=path_code,
+                                     is_custom=True,
+                                     is_sparse=is_sparse)
         avg_cost = fluid.layers.mean(cost)
         # optimizer
         optimizer = fluid.optimizer.SGD(learning_rate=0.003)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_tree_index.py b/python/paddle/fluid/tests/unittests/test_dist_tree_index.py
index 6ea15319204f2..18500d9b1520b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_tree_index.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_tree_index.py
@@ -18,26 +18,36 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle
+
 paddle.enable_static()
 
 
 def create_feeds():
-    user_input = fluid.layers.data(
-        name="item_id", shape=[1], dtype="int64", lod_level=1)
-
-    item = fluid.layers.data(
-        name="unit_id", shape=[1], dtype="int64", lod_level=1)
-
-    label = fluid.layers.data(
-        name="label", shape=[1], dtype="int64", lod_level=1)
-    labels = fluid.layers.data(
-        name="labels", shape=[1], dtype="int64", lod_level=1)
+    user_input = fluid.layers.data(name="item_id",
+                                   shape=[1],
+                                   dtype="int64",
+                                   lod_level=1)
+
+    item = fluid.layers.data(name="unit_id",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
+
+    label = fluid.layers.data(name="label",
+                              shape=[1],
+                              dtype="int64",
+                              lod_level=1)
+    labels = fluid.layers.data(name="labels",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
 
     feed_list = [user_input, item, label, labels]
     return feed_list
 
 
 class TestTreeIndex(unittest.TestCase):
+
     def test_tree_index(self):
         path = download(
             "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
@@ -102,6 +112,7 @@ def test_tree_index(self):
 
 
 class TestIndexSampler(unittest.TestCase):
+
     def test_layerwise_sampler(self):
         path = download(
             "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb",
@@ -123,11 +134,10 @@ def test_layerwise_sampler(self):
             slots_vars.append(var)
 
         dataset = paddle.distributed.InMemoryDataset()
-        dataset.init(
-            batch_size=1,
-            pipe_command="cat",
-            download_cmd="cat",
-            use_var=slots_vars)
+        dataset.init(batch_size=1,
+                     pipe_command="cat",
+                     download_cmd="cat",
+                     use_var=slots_vars)
         dataset.set_filelist([file_name])
         #dataset.update_settings(pipe_command="cat")
         #dataset._init_distributed_settings(
@@ -137,14 +147,13 @@ def test_layerwise_sampler(self):
         #    candidate_size=10000)
 
         dataset.load_into_memory()
-        dataset.tdm_sample(
-            'demo',
-            tree_path=path,
-            tdm_layer_counts=tdm_layer_counts,
-            start_sample_layer=1,
-            with_hierachy=False,
-            seed=0,
-            id_slot=2)
+        dataset.tdm_sample('demo',
+                           tree_path=path,
+                           tdm_layer_counts=tdm_layer_counts,
+                           start_sample_layer=1,
+                           with_hierachy=False,
+                           seed=0,
+                           id_slot=2)
         self.assertTrue(dataset.get_shuffle_data_size() == 8)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
index 9385d42c5590b..ef4d3c8f169a5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -17,47 +17,48 @@
 from test_dist_base import TestDistBase
 
 import os
+
 flag_name = os.path.splitext(__file__)[0]
 
 
 class TestDistW2V2x2(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._enforce_place = "CPU"
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_word2vec.py",
-            delta=1e-4,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_word2vec.py",
+                              delta=1e-4,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestDistW2V2x2WithMemOpt(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._mem_opt = True
         self._enforce_place = "CPU"
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_word2vec.py",
-            delta=1e-4,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_word2vec.py",
+                              delta=1e-4,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestDistW2V2x2Async(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._enforce_place = "CPU"
 
     def test_dist_train(self):
-        self.check_with_place(
-            "dist_word2vec.py",
-            delta=100,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("dist_word2vec.py",
+                              delta=100,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
index 2cd7889d6e3aa..06cdaed1988cc 100644
--- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
@@ -22,6 +22,7 @@
 
 
 class TestDistributeFPNProposalsOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -84,8 +85,8 @@ def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max):
             sub_lod = self.get_sub_lod(rois[idx_lvl, 0])
             rois_fpn.append((rois[idx_lvl, 1:], [sub_lod]))
             rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
-        rois_idx_restore = np.argsort(rois_idx_order).astype(
-            np.int32, copy=False)
+        rois_idx_restore = np.argsort(rois_idx_order).astype(np.int32,
+                                                             copy=False)
         return rois_fpn, rois_idx_restore
 
     def calc_rois_distribute(self):
@@ -122,6 +123,7 @@ def test_check_output(self):
 
 
 class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -139,10 +141,9 @@ def set_data(self):
         }
         output = [('out%d' % i, self.rois_fpn[i])
                   for i in range(len(self.rois_fpn))]
-        rois_num_per_level = [
-            ('rois_num%d' % i, np.array(self.rois_fpn[i][1][0]).astype('int32'))
-            for i in range(len(self.rois_fpn))
-        ]
+        rois_num_per_level = [('rois_num%d' % i,
+                               np.array(self.rois_fpn[i][1][0]).astype('int32'))
+                              for i in range(len(self.rois_fpn))]
 
         self.outputs = {
             'MultiFpnRois': output,
@@ -153,6 +154,7 @@ def set_data(self):
 
 class TestDistributeFPNProposalsOpNoOffset(
         TestDistributeFPNProposalsOpWithRoisNum):
+
     def init_test_case(self):
         self.roi_max_level = 5
         self.roi_min_level = 2
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
index 315580dd31ad7..324da95f37d80 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -72,6 +72,7 @@ def run_test(clip_after_allreduce=True,
 
 
 class TestDistributedFusedLambWithClip(unittest.TestCase):
+
     def test_1(self):
         run_test(clip_after_allreduce=True, max_global_norm=0.01)
 
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
index 1822b77d0d0e5..c2089b1d97db6 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,11 +17,11 @@
 
 
 class TestDistributedFusedLambGradientMerge(unittest.TestCase):
+
     def test_gm(self):
-        run_test(
-            clip_after_allreduce=True,
-            max_global_norm=-1.0,
-            gradient_merge_steps=2)
+        run_test(clip_after_allreduce=True,
+                 max_global_norm=-1.0,
+                 gradient_merge_steps=2)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_without_clip.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_without_clip.py
index dbd2d72fd2f6b..8d4dfa84d2f1b 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_without_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_without_clip.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 
 
 class TestDistributedFusedLambWithoutClip(unittest.TestCase):
+
     def test_1(self):
         run_test(clip_after_allreduce=True, max_global_norm=-1.0)
 
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py
index df32912b0c291..491555907ec40 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py
@@ -22,6 +22,7 @@
 
 
 class TestStrategyFactor(unittest.TestCase):
+
     def test_sync_strategy(self):
         os.environ['CPU_NUM'] = "2"
         strategy = StrategyFactory.create_sync_strategy()
@@ -108,8 +109,9 @@ def test_async_strategy(self):
         self.assertEqual(strategy._build_strategy.async_mode, True)
 
         trainer_runtime_config = strategy.get_trainer_runtime_config()
-        self.assertEqual(trainer_runtime_config.runtime_configs[
-            'communicator_send_queue_size'], '100')
+        self.assertEqual(
+            trainer_runtime_config.
+            runtime_configs['communicator_send_queue_size'], '100')
 
         # test set_trainer_runtime_config using dict
         trainer_runtime_config_dict = dict()
@@ -193,6 +195,7 @@ def test_half_async_strategy(self):
 
 
 class TestCreateDefaultStrategy(unittest.TestCase):
+
     def test_default_strategy(self):
         role = role_maker.UserDefinedRoleMaker(
             current_id=0,
@@ -209,6 +212,7 @@ def type_error_optimizer():
 
 
 class TestHalfAsyncStrategy(unittest.TestCase):
+
     def test_half_async_strategy(self):
         role = role_maker.UserDefinedRoleMaker(
             current_id=0,
@@ -228,6 +232,7 @@ def test_half_async_strategy(self):
 
 
 class TestDebugInfo(unittest.TestCase):
+
     def test_debug_info(self):
         x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_distributions.py b/python/paddle/fluid/tests/unittests/test_distributions.py
index 838744171831a..daf2adacb4168 100644
--- a/python/paddle/fluid/tests/unittests/test_distributions.py
+++ b/python/paddle/fluid/tests/unittests/test_distributions.py
@@ -43,6 +43,7 @@ def log_prob(self, value):
 
 
 class UniformNumpy(DistributionNumpy):
+
     def __init__(self, low, high):
         self.low = np.array(low).astype('float32')
         self.high = np.array(high).astype('float32')
@@ -62,6 +63,7 @@ def entropy(self):
 
 
 class NormalNumpy(DistributionNumpy):
+
     def __init__(self, loc, scale):
         self.loc = np.array(loc).astype('float32')
         self.scale = np.array(scale).astype('float32')
@@ -73,12 +75,13 @@ def sample(self, shape):
     def log_prob(self, value):
         var = self.scale * self.scale
         log_scale = np.log(self.scale)
-        return -((value - self.loc) * (value - self.loc)) / (
-            2. * var) - log_scale - math.log(math.sqrt(2. * math.pi))
+        return -((value - self.loc) *
+                 (value - self.loc)) / (2. * var) - log_scale - math.log(
+                     math.sqrt(2. * math.pi))
 
     def entropy(self):
-        return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype(
-            'float32')) + np.log(self.scale)
+        return 0.5 + 0.5 * np.log(np.array(
+            2. * math.pi).astype('float32')) + np.log(self.scale)
 
     def kl_divergence(self, other):
         var_ratio = (self.scale / other.scale)
@@ -89,6 +92,7 @@ def kl_divergence(self, other):
 
 
 class CategoricalNumpy(DistributionNumpy):
+
     def __init__(self, logits):
         self.logits = np.array(logits).astype('float32')
 
@@ -113,6 +117,7 @@ def kl_divergence(self, other):
 
 
 class MultivariateNormalDiagNumpy(DistributionNumpy):
+
     def __init__(self, loc, scale):
         self.loc = np.array(loc).astype('float32')
         self.scale = np.array(scale).astype('float32')
@@ -135,8 +140,8 @@ def _inv(self, value):
 
     def entropy(self):
         return 0.5 * (self.scale.shape[0] *
-                      (1.0 + np.log(np.array(2 * math.pi).astype('float32'))
-                       ) + np.log(self._det(self.scale)))
+                      (1.0 + np.log(np.array(2 * math.pi).astype('float32'))) +
+                      np.log(self._det(self.scale)))
 
     def kl_divergence(self, other):
         tr_cov_matmul = np.sum(self._inv(other.scale) * self.scale)
@@ -151,6 +156,7 @@ def kl_divergence(self, other):
 
 
 class DistributionTest(unittest.TestCase):
+
     def setUp(self, use_gpu=False):
         self.use_gpu = use_gpu
         if not use_gpu:
@@ -169,10 +175,12 @@ def build_normal_program(self, test_program, batch_size, dims, loc_float,
             loc = layers.data(name='loc', shape=[dims], dtype='float32')
             scale = layers.data(name='scale', shape=[dims], dtype='float32')
 
-            other_loc = layers.data(
-                name='other_loc', shape=[dims], dtype='float32')
-            other_scale = layers.data(
-                name='other_scale', shape=[dims], dtype='float32')
+            other_loc = layers.data(name='other_loc',
+                                    shape=[dims],
+                                    dtype='float32')
+            other_scale = layers.data(name='other_scale',
+                                      shape=[dims],
+                                      dtype='float32')
 
             values = layers.data(name='values', shape=[dims], dtype='float32')
 
@@ -264,8 +272,8 @@ def test_normal_distribution(self, batch_size=2, dims=3, tolerance=1e-6):
         np_normal_float = NormalNumpy(loc_float, scale_float)
         np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float)
         np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np)
-        np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float,
-                                                         other_scale_np)
+        np_other_normal_float_np_broadcast = NormalNumpy(
+            other_loc_float, other_scale_np)
         np_normal = NormalNumpy(loc_np, scale_np)
         np_other_normal = NormalNumpy(other_loc_np, other_scale_np)
 
@@ -295,60 +303,66 @@ def test_normal_distribution(self, batch_size=2, dims=3, tolerance=1e-6):
                               feed=feed_vars,
                               fetch_list=fetch_list)
 
-        np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float_np_broadcast.shape,
-            gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float,
-            gt_entropy_float,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float_np_broadcast,
-            gt_entropy_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_float_np_broadcast,
-            gt_lp_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_float_np_broadcast,
-            gt_kl_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_np, gt_kl, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(output_sample_float.shape,
+                                   gt_sample_float.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_sample_float_np_broadcast.shape,
+                                   gt_sample_float_np_broadcast.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_sample_np.shape,
+                                   gt_sample_np.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_sample_variable.shape,
+                                   gt_sample_np.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_float,
+                                   gt_entropy_float,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_float_np_broadcast,
+                                   gt_entropy_float_np_broadcast,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_np,
+                                   gt_entropy,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_variable,
+                                   gt_entropy,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_lp_float_np_broadcast,
+                                   gt_lp_float_np_broadcast,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_lp_np,
+                                   gt_lp,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_lp_variable,
+                                   gt_lp,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_kl_float,
+                                   gt_kl_float,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_kl_float_np_broadcast,
+                                   gt_kl_float_np_broadcast,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_kl_np,
+                                   gt_kl,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_kl_variable,
+                                   gt_kl,
+                                   rtol=tolerance,
+                                   atol=tolerance)
 
     def build_uniform_program(self, test_program, batch_size, dims, low_float,
                               high_float, high_np, low_np, values_np):
@@ -428,49 +442,50 @@ def test_uniform_distribution(self, batch_size=2, dims=3, tolerance=1e-6):
                               feed=feed_vars,
                               fetch_list=fetch_list)
 
-        np.testing.assert_allclose(
-            output_sample_float.shape,
-            gt_sample_float.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_float_np_broadcast.shape,
-            gt_sample_float_np_broadcast.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_np.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_sample_variable.shape,
-            gt_sample_np.shape,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float,
-            gt_entropy_float,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_float_np_broadcast,
-            gt_entropy_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_float_np_broadcast,
-            gt_lp_float_np_broadcast,
-            rtol=tolerance,
-            atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_np, gt_lp, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(output_sample_float.shape,
+                                   gt_sample_float.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_sample_float_np_broadcast.shape,
+                                   gt_sample_float_np_broadcast.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_sample_np.shape,
+                                   gt_sample_np.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_sample_variable.shape,
+                                   gt_sample_np.shape,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_float,
+                                   gt_entropy_float,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_float_np_broadcast,
+                                   gt_entropy_float_np_broadcast,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_np,
+                                   gt_entropy,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_entropy_variable,
+                                   gt_entropy,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_lp_float_np_broadcast,
+                                   gt_lp_float_np_broadcast,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_lp_np,
+                                   gt_lp,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_lp_variable,
+                                   gt_lp,
+                                   rtol=tolerance,
+                                   atol=tolerance)
 
     def test_categorical_distribution(self,
                                       batch_size=2,
@@ -483,8 +498,9 @@ def test_categorical_distribution(self,
 
         with fluid.program_guard(test_program):
             logits = layers.data(name='logits', shape=[dims], dtype='float32')
-            other_logits = layers.data(
-                name='other_logits', shape=[dims], dtype='float32')
+            other_logits = layers.data(name='other_logits',
+                                       shape=[dims],
+                                       dtype='float32')
 
             categorical_np = Categorical(logits_np)
             other_categorical_np = Categorical(other_logits_np)
@@ -504,10 +520,14 @@ def test_categorical_distribution(self,
          output_kl_np] = self.executor.run(program=test_program,
                                            feed={'logits': logits_np},
                                            fetch_list=[entropy_np, kl_np])
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy_np, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_np, gt_kl_np, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(output_entropy_np,
+                                   gt_entropy_np,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_kl_np,
+                                   gt_kl_np,
+                                   rtol=tolerance,
+                                   atol=tolerance)
 
     def test_multivariateNormalDiag_distribution(self,
                                                  batch_size=2,
@@ -517,34 +537,34 @@ def test_multivariateNormalDiag_distribution(self,
         loc_np = np.random.random(batch_size, ).astype('float32')
         scale_np = np.diag(np.random.random(batch_size, )).astype('float32')
         other_loc_np = np.random.random(batch_size, ).astype('float32')
-        other_scale_np = np.diag(np.random.random(batch_size, )).astype(
-            'float32')
+        other_scale_np = np.diag(np.random.random(
+            batch_size, )).astype('float32')
 
         with fluid.program_guard(test_program):
-            loc = layers.data(
-                name='loc',
-                shape=[batch_size, ],
-                dtype='float32',
-                append_batch_size=False)
-            scale = layers.data(
-                name='scale',
-                shape=[batch_size, batch_size],
-                dtype='float32',
-                append_batch_size=False)
-            other_loc = layers.data(
-                name='other_loc',
-                shape=[batch_size, ],
-                dtype='float32',
-                append_batch_size=False)
-            other_scale = layers.data(
-                name='other_scale',
-                shape=[batch_size, batch_size],
-                dtype='float32',
-                append_batch_size=False)
+            loc = layers.data(name='loc',
+                              shape=[
+                                  batch_size,
+                              ],
+                              dtype='float32',
+                              append_batch_size=False)
+            scale = layers.data(name='scale',
+                                shape=[batch_size, batch_size],
+                                dtype='float32',
+                                append_batch_size=False)
+            other_loc = layers.data(name='other_loc',
+                                    shape=[
+                                        batch_size,
+                                    ],
+                                    dtype='float32',
+                                    append_batch_size=False)
+            other_scale = layers.data(name='other_scale',
+                                      shape=[batch_size, batch_size],
+                                      dtype='float32',
+                                      append_batch_size=False)
 
             multivariate_np = MultivariateNormalDiag(loc, scale)
-            other_multivariate_np = MultivariateNormalDiag(other_loc,
-                                                           other_scale)
+            other_multivariate_np = MultivariateNormalDiag(
+                other_loc, other_scale)
 
             entropy_np = multivariate_np.entropy()
             other_entropy_np = other_multivariate_np.entropy()
@@ -553,8 +573,8 @@ def test_multivariateNormalDiag_distribution(self,
         self.executor.run(fluid.default_main_program())
 
         np_multivariate = MultivariateNormalDiagNumpy(loc_np, scale_np)
-        np_other_multivariate = MultivariateNormalDiagNumpy(other_loc_np,
-                                                            other_scale_np)
+        np_other_multivariate = MultivariateNormalDiagNumpy(
+            other_loc_np, other_scale_np)
         gt_entropy_np = np_multivariate.entropy()
         gt_kl_np = np_multivariate.kl_divergence(np_other_multivariate)
 
@@ -568,13 +588,18 @@ def test_multivariateNormalDiag_distribution(self,
                                                'other_scale': other_scale_np
                                            },
                                            fetch_list=[entropy_np, kl_np])
-        np.testing.assert_allclose(
-            output_entropy_np, gt_entropy_np, rtol=tolerance, atol=tolerance)
-        np.testing.assert_allclose(
-            output_kl_np, gt_kl_np, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(output_entropy_np,
+                                   gt_entropy_np,
+                                   rtol=tolerance,
+                                   atol=tolerance)
+        np.testing.assert_allclose(output_kl_np,
+                                   gt_kl_np,
+                                   rtol=tolerance,
+                                   atol=tolerance)
 
 
 class DistributionTestError(unittest.TestCase):
+
     def test_normal_error(self):
         loc = int(1)
         scale = int(1)
diff --git a/python/paddle/fluid/tests/unittests/test_dot_op.py b/python/paddle/fluid/tests/unittests/test_dot_op.py
index a92104a5a6f49..536f8fd8d8af7 100644
--- a/python/paddle/fluid/tests/unittests/test_dot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dot_op.py
@@ -24,6 +24,7 @@
 
 
 class DotOp(OpTest):
+
     def setUp(self):
         self.op_type = "dot"
         self.init_dtype()
@@ -50,21 +51,19 @@ def test_check_grad_normal(self):
 
     def test_check_grad_ingore_x(self):
         if core.is_compiled_with_rocm():
-            self.check_grad(
-                ['Y'],
-                'Out',
-                no_grad_set=set("X"),
-                user_defined_grads=[self.inputs['X']])
+            self.check_grad(['Y'],
+                            'Out',
+                            no_grad_set=set("X"),
+                            user_defined_grads=[self.inputs['X']])
         else:
             self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
         if core.is_compiled_with_rocm():
-            self.check_grad(
-                ['X'],
-                'Out',
-                no_grad_set=set('Y'),
-                user_defined_grads=[self.inputs['Y']])
+            self.check_grad(['X'],
+                            'Out',
+                            no_grad_set=set('Y'),
+                            user_defined_grads=[self.inputs['Y']])
         else:
             self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
 
@@ -78,11 +77,12 @@ def init_dtype(self):
 
 
 class DotOpBatch(DotOp):
+
     def init_input_output(self):
-        self.x = np.random.uniform(0.1, 1, [132]).astype(self.dtype).reshape(
-            [11, 12])
-        self.y = np.random.uniform(1, 3, [132]).astype(self.dtype).reshape(
-            [11, 12])
+        self.x = np.random.uniform(0.1, 1,
+                                   [132]).astype(self.dtype).reshape([11, 12])
+        self.y = np.random.uniform(1, 3,
+                                   [132]).astype(self.dtype).reshape([11, 12])
         self.out = np.sum(self.x * self.y, axis=1).reshape([11, 1])
 
     def test_check_grad_normal(self):
@@ -96,6 +96,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestDotOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -115,6 +116,7 @@ def test_errors(self):
 
 
 class TestDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         with fluid.dygraph.guard():
             x1 = fluid.dygraph.to_variable(np.array([1, 3]).astype(np.float32))
@@ -132,6 +134,7 @@ def test_dygraph(self):
 
 
 class TestComplexDotOp(OpTest):
+
     def setUp(self):
         self.op_type = "dot"
         self.init_base_dtype()
@@ -164,30 +167,28 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out])
 
 
 class TestComplexDotOp2D(OpTest):
+
     def setUp(self):
         self.op_type = "dot"
         self.init_base_dtype()
@@ -229,27 +230,24 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_downpoursgd.py b/python/paddle/fluid/tests/unittests/test_downpoursgd.py
index f22a956b6876e..030af8f809e3e 100644
--- a/python/paddle/fluid/tests/unittests/test_downpoursgd.py
+++ b/python/paddle/fluid/tests/unittests/test_downpoursgd.py
@@ -54,8 +54,9 @@ def test_device_work_use_cvm(self):
                     cache_path)
                 os.system(cmd)
             x = fluid.layers.data(name='x', shape=[1], dtype='int64')
-            x_emb = fluid.layers.embedding(
-                input=x, size=[1, 2], is_distributed=True)
+            x_emb = fluid.layers.embedding(input=x,
+                                           size=[1, 2],
+                                           is_distributed=True)
             y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
             y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
@@ -113,8 +114,9 @@ def test_device_work(self):
                     cache_path)
                 os.system(cmd)
             x = fluid.layers.data(name='x', shape=[1], dtype='int64')
-            x_emb = fluid.layers.embedding(
-                input=x, size=[1, 2], is_distributed=True)
+            x_emb = fluid.layers.embedding(input=x,
+                                           size=[1, 2],
+                                           is_distributed=True)
             y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
             y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
@@ -170,8 +172,9 @@ def test_downpour_opt_work(self):
                     cache_path)
                 os.system(cmd)
             x = fluid.layers.data(name='x', shape=[1], dtype='int64')
-            x_emb = fluid.layers.embedding(
-                input=x, size=[1, 2], is_distributed=True)
+            x_emb = fluid.layers.embedding(input=x,
+                                           size=[1, 2],
+                                           is_distributed=True)
             y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
             y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
diff --git a/python/paddle/fluid/tests/unittests/test_dpsgd_op.py b/python/paddle/fluid/tests/unittests/test_dpsgd_op.py
index 35a922b78205f..bc0de9c89522e 100644
--- a/python/paddle/fluid/tests/unittests/test_dpsgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dpsgd_op.py
@@ -20,6 +20,7 @@
 
 
 class TestDpsgdOp(OpTest):
+
     def setUp(self):
         '''Test Dpsgd Operator with supplied attributes
         '''
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index e8d4fc260b87a..33992b1881ec4 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -29,6 +29,7 @@
 
 
 class TestDropoutOp(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
@@ -46,6 +47,7 @@ def test_check_grad_normal(self):
 
 
 class TestDropoutOpInput1d(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((2000, )).astype("float32")}
@@ -63,6 +65,7 @@ def test_check_grad_normal(self):
 
 
 class TestDropoutOp2(TestDropoutOp):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
@@ -74,6 +77,7 @@ def setUp(self):
 
 
 class TestDropoutOp3(TestDropoutOp):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
@@ -86,6 +90,7 @@ def setUp(self):
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestDropoutOp4(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
@@ -100,6 +105,7 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestDropoutOp5(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
@@ -113,6 +119,7 @@ def test_check_output(self):
 
 
 class TestDropoutOp6(TestDropoutOp):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
@@ -129,6 +136,7 @@ def setUp(self):
 
 
 class TestDropoutOp7(TestDropoutOp):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
@@ -146,6 +154,7 @@ def setUp(self):
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestDropoutOp8(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
@@ -163,6 +172,7 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestDropoutOp9(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
@@ -178,14 +188,16 @@ def test_check_output(self):
 
 
 class TestDropoutOpWithSeed(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {
             "X": np.random.random((32, 64)).astype("float32"),
-            "Seed": np.asarray(
-                [125], dtype="int32")
+            "Seed": np.asarray([125], dtype="int32")
+        }
+        self.attrs = {
+            'dropout_prob': 0.0,
         }
-        self.attrs = {'dropout_prob': 0.0, }
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64)).astype('uint8')
@@ -198,11 +210,13 @@ def test_check_grad_normal(self):
         self.check_grad(['X'], 'Out', max_relative_error=0.05)
 
 
-@unittest.skipIf(
-    not core.is_compiled_with_cuda() or not core.op_support_gpu("dropout"),
-    "core is not compiled with CUDA or core is not support dropout")
+@unittest.skipIf(not core.is_compiled_with_cuda()
+                 or not core.op_support_gpu("dropout"),
+                 "core is not compiled with CUDA or core is not support dropout"
+                 )
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestFP16DropoutOp(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.init_test_case()
@@ -226,11 +240,13 @@ def test_check_output(self):
         self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
 
 
-@unittest.skipIf(
-    not core.is_compiled_with_cuda() or not core.op_support_gpu("dropout"),
-    "core is not compiled with CUDA or core is not support dropout")
+@unittest.skipIf(not core.is_compiled_with_cuda()
+                 or not core.op_support_gpu("dropout"),
+                 "core is not compiled with CUDA or core is not support dropout"
+                 )
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
 class TestFP16DropoutOp2(TestFP16DropoutOp):
+
     def init_test_case(self):
         self.input_size = [32, 64, 3]
         self.prob = 0.75
@@ -238,6 +254,7 @@ def init_test_case(self):
 
 
 class TestBF16DropoutOp(OpTest):
+
     def setUp(self):
         self.op_type = "dropout"
         self.dtype = np.uint16
@@ -259,6 +276,7 @@ def test_check_grad_normal(self):
 
 
 class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
+
     def test_seed_cpu_place(self):
         paddle.enable_static()
         main_program = Program()
@@ -280,12 +298,11 @@ def test_seed_cpu_place(self):
                 dtype='float32',
                 persistable=False,
                 stop_gradient=True)
-            x_var = main_program.global_block().create_var(
-                name=x_var_name,
-                shape=[40, 40],
-                dtype='float32',
-                persistable=False,
-                stop_gradient=True)
+            x_var = main_program.global_block().create_var(name=x_var_name,
+                                                           shape=[40, 40],
+                                                           dtype='float32',
+                                                           persistable=False,
+                                                           stop_gradient=True)
             mask_var = main_program.global_block().create_var(
                 name=mask_var_name,
                 shape=[1],
@@ -293,28 +310,32 @@ def test_seed_cpu_place(self):
                 persistable=False,
                 stop_gradient=True)
 
-            main_program.global_block().append_op(
-                type="fill_constant",
-                outputs={"Out": x_var_name},
-                attrs={
-                    "shape": [40, 40],
-                    "dtype": x_var.dtype,
-                    "value": 1.0,
-                    "place_type": 0
-                })
+            main_program.global_block().append_op(type="fill_constant",
+                                                  outputs={"Out": x_var_name},
+                                                  attrs={
+                                                      "shape": [40, 40],
+                                                      "dtype": x_var.dtype,
+                                                      "value": 1.0,
+                                                      "place_type": 0
+                                                  })
             main_program.global_block().append_op(
                 type='seed',
                 inputs={},
                 outputs={'Out': seed_input_var},
-                attrs={'seed': 1,
-                       'force_cpu': True})
-            main_program.global_block().append_op(
-                type='dropout',
-                inputs={'X': x_var,
-                        'Seed': seed_input_var},
-                attrs={'dropout_prob': 0.},
-                outputs={'Out': x_out_var,
-                         'Mask': mask_var})
+                attrs={
+                    'seed': 1,
+                    'force_cpu': True
+                })
+            main_program.global_block().append_op(type='dropout',
+                                                  inputs={
+                                                      'X': x_var,
+                                                      'Seed': seed_input_var
+                                                  },
+                                                  attrs={'dropout_prob': 0.},
+                                                  outputs={
+                                                      'Out': x_out_var,
+                                                      'Mask': mask_var
+                                                  })
             place = fluid.CPUPlace()
             if core.is_compiled_with_cuda():
                 place = fluid.CUDAPlace(0)
@@ -328,13 +349,14 @@ def test_seed_cpu_place(self):
 
 
 class TestDropoutOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 # the input of dropout must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.CPUPlace())
                 fluid.layers.dropout(x1, dropout_prob=0.5)
 
             self.assertRaises(TypeError, test_Variable)
@@ -342,14 +364,16 @@ def test_Variable():
             def test_dtype():
                 # the input dtype of dropout must be float16 or float32 or float64
                 # float16 only can be set on GPU place
-                x2 = fluid.layers.data(
-                    name='x2', shape=[3, 4, 5, 6], dtype="int32")
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[3, 4, 5, 6],
+                                       dtype="int32")
                 fluid.layers.dropout(x2, dropout_prob=0.5)
 
             self.assertRaises(TypeError, test_dtype)
 
 
 class TestDropoutFAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -360,53 +384,59 @@ def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[-1, -1], dtype="float32")
             res1 = paddle.nn.functional.dropout(x=input, p=0., training=False)
-            res2 = paddle.nn.functional.dropout(
-                x=input, p=0., axis=0, training=True, mode='upscale_in_train')
-            res3 = paddle.nn.functional.dropout(
-                x=input, p=0., axis=0, training=True, mode='downscale_in_infer')
-            res4 = paddle.nn.functional.dropout(
-                x=input, p=0., axis=0, training=False, mode='upscale_in_train')
-            res5 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=0,
-                training=False,
-                mode='downscale_in_infer')
-            res6 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=True,
-                mode='upscale_in_train')
-            res7 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=True,
-                mode='downscale_in_infer')
-            res8 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=False,
-                mode='upscale_in_train')
-            res9 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=[0, 1],
-                training=False,
-                mode='downscale_in_infer')
+            res2 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=True,
+                                                mode='upscale_in_train')
+            res3 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=True,
+                                                mode='downscale_in_infer')
+            res4 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=False,
+                                                mode='upscale_in_train')
+            res5 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=0,
+                                                training=False,
+                                                mode='downscale_in_infer')
+            res6 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=True,
+                                                mode='upscale_in_train')
+            res7 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=True,
+                                                mode='downscale_in_infer')
+            res8 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=False,
+                                                mode='upscale_in_train')
+            res9 = paddle.nn.functional.dropout(x=input,
+                                                p=0.,
+                                                axis=[0, 1],
+                                                training=False,
+                                                mode='downscale_in_infer')
             res10 = paddle.nn.functional.dropout(x=input, p=1., training=True)
             res11 = paddle.fluid.layers.dropout(x=input, dropout_prob=0.)
-            res12 = paddle.nn.functional.dropout(
-                x=input,
-                p=0.,
-                axis=(0, 1),
-                training=False,
-                mode='upscale_in_train')
-
-            res13 = paddle.nn.functional.dropout(
-                x=input, p=0.7, axis=1, training=True, mode='upscale_in_train')
+            res12 = paddle.nn.functional.dropout(x=input,
+                                                 p=0.,
+                                                 axis=(0, 1),
+                                                 training=False,
+                                                 mode='upscale_in_train')
+
+            res13 = paddle.nn.functional.dropout(x=input,
+                                                 p=0.7,
+                                                 axis=1,
+                                                 training=True,
+                                                 mode='upscale_in_train')
 
             in_np = np.ones([40, 40]).astype("float32")
             res_np = in_np
@@ -442,72 +472,64 @@ def test_dygraph(self):
                 res_np2 = np.zeros_like(in_np)
                 input = fluid.dygraph.to_variable(in_np)
 
-                res1 = paddle.nn.functional.dropout(
-                    x=input, p=0., training=False)
-                res2 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=0,
-                    training=True,
-                    mode='upscale_in_train')
-                res3 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=0,
-                    training=True,
-                    mode='downscale_in_infer')
-                res4 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=0,
-                    training=False,
-                    mode='upscale_in_train')
-                res5 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=0,
-                    training=False,
-                    mode='downscale_in_infer')
-                res6 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=[0, 1],
-                    training=True,
-                    mode='upscale_in_train')
-                res7 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=[0, 1],
-                    training=True,
-                    mode='downscale_in_infer')
-                res8 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=[0, 1],
-                    training=False,
-                    mode='upscale_in_train')
-                res9 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=[0, 1],
-                    training=False,
-                    mode='downscale_in_infer')
-                res10 = paddle.nn.functional.dropout(
-                    x=input, p=1., training=True)
+                res1 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    training=False)
+                res2 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=0,
+                                                    training=True,
+                                                    mode='upscale_in_train')
+                res3 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=0,
+                                                    training=True,
+                                                    mode='downscale_in_infer')
+                res4 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=0,
+                                                    training=False,
+                                                    mode='upscale_in_train')
+                res5 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=0,
+                                                    training=False,
+                                                    mode='downscale_in_infer')
+                res6 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=[0, 1],
+                                                    training=True,
+                                                    mode='upscale_in_train')
+                res7 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=[0, 1],
+                                                    training=True,
+                                                    mode='downscale_in_infer')
+                res8 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=[0, 1],
+                                                    training=False,
+                                                    mode='upscale_in_train')
+                res9 = paddle.nn.functional.dropout(x=input,
+                                                    p=0.,
+                                                    axis=[0, 1],
+                                                    training=False,
+                                                    mode='downscale_in_infer')
+                res10 = paddle.nn.functional.dropout(x=input,
+                                                     p=1.,
+                                                     training=True)
                 dropout = paddle.fluid.dygraph.Dropout(p=0, )
                 res11 = dropout(input)
-                res12 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.,
-                    axis=(0, 1),
-                    training=False,
-                    mode='upscale_in_train')
-                res13 = paddle.nn.functional.dropout(
-                    x=input,
-                    p=0.5,
-                    axis=1,
-                    training=True,
-                    mode='upscale_in_train')
+                res12 = paddle.nn.functional.dropout(x=input,
+                                                     p=0.,
+                                                     axis=(0, 1),
+                                                     training=False,
+                                                     mode='upscale_in_train')
+                res13 = paddle.nn.functional.dropout(x=input,
+                                                     p=0.5,
+                                                     axis=1,
+                                                     training=True,
+                                                     mode='upscale_in_train')
 
             res_list = [
                 res1, res2, res3, res4, res5, res6, res7, res8, res9, res11,
@@ -519,21 +541,22 @@ def test_dygraph(self):
 
 
 class TestDropoutFAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 # the input of dropout must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.CPUPlace())
                 paddle.nn.functional.dropout(x1, p=0.5)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_Variable2():
                 # the input of dropout must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.CPUPlace())
                 paddle.nn.functional.dropout(x1, p=0.5, axis=0)
 
             self.assertRaises(TypeError, test_Variable2)
@@ -597,6 +620,7 @@ def test_axis_len():
 
 
 class TestDropoutCAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -616,6 +640,7 @@ def test_dygraph(self):
 
 
 class TestDropout2DFAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -624,12 +649,17 @@ def setUp(self):
 
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(
-                name="input", shape=[2, 3, 4, 5], dtype="float32")
-            res1 = paddle.nn.functional.dropout2d(
-                x=input, p=0., training=False, data_format='NCHW')
-            res2 = paddle.nn.functional.dropout2d(
-                x=input, p=0., training=False, data_format='NHWC')
+            input = fluid.data(name="input",
+                               shape=[2, 3, 4, 5],
+                               dtype="float32")
+            res1 = paddle.nn.functional.dropout2d(x=input,
+                                                  p=0.,
+                                                  training=False,
+                                                  data_format='NCHW')
+            res2 = paddle.nn.functional.dropout2d(x=input,
+                                                  p=0.,
+                                                  training=False,
+                                                  data_format='NHWC')
 
             in_np = np.random.random([2, 3, 4, 5]).astype("float32")
             res_np = in_np
@@ -653,10 +683,14 @@ def test_dygraph(self):
                 res_np = in_np
                 input = fluid.dygraph.to_variable(in_np)
 
-                res1 = paddle.nn.functional.dropout2d(
-                    x=input, p=0., training=False, data_format='NCHW')
-                res2 = paddle.nn.functional.dropout2d(
-                    x=input, p=0., training=False, data_format='NHWC')
+                res1 = paddle.nn.functional.dropout2d(x=input,
+                                                      p=0.,
+                                                      training=False,
+                                                      data_format='NCHW')
+                res2 = paddle.nn.functional.dropout2d(x=input,
+                                                      p=0.,
+                                                      training=False,
+                                                      data_format='NHWC')
 
             res_list = [res1, res2]
             for res in res_list:
@@ -664,6 +698,7 @@ def test_dygraph(self):
 
 
 class TestDropout2DFAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -683,6 +718,7 @@ def test_dataformat():
 
 
 class TestDropout2DCAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -702,6 +738,7 @@ def test_dygraph(self):
 
 
 class TestDropout3DFAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -710,12 +747,17 @@ def setUp(self):
 
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(
-                name="input", shape=[2, 3, 4, 5, 6], dtype="float32")
-            res1 = paddle.nn.functional.dropout3d(
-                x=input, p=0., training=False, data_format='NCDHW')
-            res2 = paddle.nn.functional.dropout3d(
-                x=input, p=0., training=False, data_format='NDHWC')
+            input = fluid.data(name="input",
+                               shape=[2, 3, 4, 5, 6],
+                               dtype="float32")
+            res1 = paddle.nn.functional.dropout3d(x=input,
+                                                  p=0.,
+                                                  training=False,
+                                                  data_format='NCDHW')
+            res2 = paddle.nn.functional.dropout3d(x=input,
+                                                  p=0.,
+                                                  training=False,
+                                                  data_format='NDHWC')
 
             in_np = np.random.random([2, 3, 4, 5, 6]).astype("float32")
             res_np = in_np
@@ -739,10 +781,14 @@ def test_dygraph(self):
                 res_np = in_np
                 input = fluid.dygraph.to_variable(in_np)
 
-                res1 = paddle.nn.functional.dropout3d(
-                    x=input, p=0., training=False, data_format='NCDHW')
-                res2 = paddle.nn.functional.dropout3d(
-                    x=input, p=0., training=False, data_format='NDHWC')
+                res1 = paddle.nn.functional.dropout3d(x=input,
+                                                      p=0.,
+                                                      training=False,
+                                                      data_format='NCDHW')
+                res2 = paddle.nn.functional.dropout3d(x=input,
+                                                      p=0.,
+                                                      training=False,
+                                                      data_format='NDHWC')
 
             res_list = [res1, res2]
             for res in res_list:
@@ -750,6 +796,7 @@ def test_dygraph(self):
 
 
 class TestDropout3DFAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -769,6 +816,7 @@ def test_dataformat():
 
 
 class TestDropout3DCAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -788,6 +836,7 @@ def test_dygraph(self):
 
 
 class TestAlphaDropoutFAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -798,8 +847,9 @@ def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = fluid.data(name="input", shape=[40, 40], dtype="float32")
             res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.)
-            res2 = paddle.nn.functional.alpha_dropout(
-                x=input, p=0., training=False)
+            res2 = paddle.nn.functional.alpha_dropout(x=input,
+                                                      p=0.,
+                                                      training=False)
             res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.)
 
             in_np = np.random.random([40, 40]).astype("float32")
@@ -831,8 +881,9 @@ def test_dygraph(self):
                 input = fluid.dygraph.to_variable(in_np)
 
                 res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.)
-                res2 = paddle.nn.functional.alpha_dropout(
-                    x=input, p=0., training=False)
+                res2 = paddle.nn.functional.alpha_dropout(x=input,
+                                                          p=0.,
+                                                          training=False)
                 res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.)
 
             res_list = [res1, res2]
@@ -842,13 +893,14 @@ def test_dygraph(self):
 
 
 class TestAlphaDropoutFAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 # the input of dropout must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.CPUPlace())
                 paddle.nn.functional.alpha_dropout(x1, p=0.5)
 
             self.assertRaises(TypeError, test_Variable)
@@ -876,6 +928,7 @@ def test_pvalue():
 
 
 class TestAlphaDropoutCAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -895,6 +948,7 @@ def test_dygraph(self):
 
 
 class TestDropoutWithDeterminateSeedGenerator(unittest.TestCase):
+
     def setUp(self):
         paddle.framework.random.set_random_seed_generator('seed0', 123)
         paddle.framework.random.set_random_seed_generator('seed1', 123)
@@ -908,18 +962,16 @@ def check_static_result(self, place):
         from paddle.distributed.fleet.meta_parallel.parallel_layers.random import dropout
         with static.program_guard(static.Program(), static.Program()):
             input = static.data(name="input", shape=[40, 40], dtype="float32")
-            res1 = dropout(
-                input,
-                p=0.3,
-                training=True,
-                mode='upscale_in_train',
-                rng_name='seed0')
-            res2 = dropout(
-                input,
-                p=0.3,
-                training=True,
-                mode='upscale_in_train',
-                rng_name='seed1')
+            res1 = dropout(input,
+                           p=0.3,
+                           training=True,
+                           mode='upscale_in_train',
+                           rng_name='seed0')
+            res2 = dropout(input,
+                           p=0.3,
+                           training=True,
+                           mode='upscale_in_train',
+                           rng_name='seed1')
             res3 = dropout(input, p=0.3)
 
             in_np = np.random.random([40, 40]).astype("float32")
@@ -938,6 +990,7 @@ def test_static(self):
 
 
 class TestDropoutBackward(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -961,8 +1014,9 @@ def test_backward_downscale_in_infer(self):
                 out.backward()
 
                 self.assertTrue(
-                    np.array_equal(input.gradient(
-                    ), self.cal_grad_downscale_in_infer(mask.numpy())))
+                    np.array_equal(
+                        input.gradient(),
+                        self.cal_grad_downscale_in_infer(mask.numpy())))
 
     def test_backward_downscale_in_infer_eager(self):
         for place in self.places:
@@ -974,8 +1028,9 @@ def test_backward_downscale_in_infer_eager(self):
                         input, None, 0.5, False, "downgrade_in_infer", 0, False)
                     out.backward()
                     self.assertTrue(
-                        np.array_equal(input.gradient(
-                        ), self.cal_grad_downscale_in_infer(mask.numpy())))
+                        np.array_equal(
+                            input.gradient(),
+                            self.cal_grad_downscale_in_infer(mask.numpy())))
 
     def test_backward_upscale_train(self):
         _enable_legacy_dygraph()
@@ -991,8 +1046,9 @@ def test_backward_upscale_train(self):
                 out.backward()
 
                 self.assertTrue(
-                    np.allclose(input.gradient(
-                    ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+                    np.allclose(input.gradient(),
+                                self.cal_grad_upscale_train(mask.numpy(),
+                                                            prob)))
 
     def test_backward_upscale_train_eager(self):
         for place in self.places:
@@ -1006,8 +1062,9 @@ def test_backward_upscale_train_eager(self):
                     out.backward()
 
                     self.assertTrue(
-                        np.allclose(input.gradient(
-                        ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+                        np.allclose(
+                            input.gradient(),
+                            self.cal_grad_upscale_train(mask.numpy(), prob)))
 
     def test_backward_upscale_train_2(self):
         _enable_legacy_dygraph()
@@ -1023,8 +1080,9 @@ def test_backward_upscale_train_2(self):
                 out.backward()
 
                 self.assertTrue(
-                    np.allclose(input.gradient(
-                    ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+                    np.allclose(input.gradient(),
+                                self.cal_grad_upscale_train(mask.numpy(),
+                                                            prob)))
 
     def test_backward_upscale_train_2_eager(self):
         for place in self.places:
@@ -1040,11 +1098,13 @@ def test_backward_upscale_train_2_eager(self):
                     out.backward()
 
                     self.assertTrue(
-                        np.allclose(input.gradient(
-                        ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+                        np.allclose(
+                            input.gradient(),
+                            self.cal_grad_upscale_train(mask.numpy(), prob)))
 
 
 class TestRandomValue(unittest.TestCase):
+
     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py
index 6c2516d6c11ef..f77f54a636ee7 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py
@@ -23,6 +23,7 @@
 
 
 class SimpleImgConvPool(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -43,27 +44,25 @@ def __init__(self,
                  bias_attr=None):
         super(SimpleImgConvPool, self).__init__()
 
-        self._conv2d = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            use_cudnn=use_cudnn,
-            dtype=dtype,
-            act=act)
-
-        self._pool2d = Pool2D(
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
+        self._conv2d = Conv2D(num_channels=num_channels,
+                              num_filters=num_filters,
+                              filter_size=filter_size,
+                              stride=conv_stride,
+                              padding=conv_padding,
+                              dilation=conv_dilation,
+                              groups=conv_groups,
+                              param_attr=param_attr,
+                              bias_attr=bias_attr,
+                              use_cudnn=use_cudnn,
+                              dtype=dtype,
+                              act=act)
+
+        self._pool2d = Pool2D(pool_size=pool_size,
+                              pool_type=pool_type,
+                              pool_stride=pool_stride,
+                              pool_padding=pool_padding,
+                              global_pooling=global_pooling,
+                              use_cudnn=use_cudnn)
 
     def forward(self, inputs):
         x = self._conv2d(inputs)
@@ -72,28 +71,27 @@ def forward(self, inputs):
 
 
 class MNIST(fluid.dygraph.Layer):
+
     def __init__(self, dtype="float32"):
         super(MNIST, self).__init__()
 
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            num_channels=3,
-            num_filters=20,
-            filter_size=5,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            dtype=dtype,
-            use_cudnn=True)
-
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            num_channels=20,
-            num_filters=50,
-            filter_size=5,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            dtype=dtype,
-            use_cudnn=True)
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(num_channels=3,
+                                                         num_filters=20,
+                                                         filter_size=5,
+                                                         pool_size=2,
+                                                         pool_stride=2,
+                                                         act="relu",
+                                                         dtype=dtype,
+                                                         use_cudnn=True)
+
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(num_channels=20,
+                                                         num_filters=50,
+                                                         filter_size=5,
+                                                         pool_size=2,
+                                                         pool_stride=2,
+                                                         act="relu",
+                                                         dtype=dtype,
+                                                         use_cudnn=True)
 
         self.pool_2_shape = 50 * 53 * 53
         SIZE = 10
@@ -102,8 +100,8 @@ def __init__(self, dtype="float32"):
             self.pool_2_shape,
             10,
             param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.NormalInitializer(
-                    loc=0.0, scale=scale)),
+                initializer=fluid.initializer.NormalInitializer(loc=0.0,
+                                                                scale=scale)),
             act="softmax",
             dtype=dtype)
 
@@ -118,6 +116,7 @@ def forward(self, inputs, label):
 
 
 class TestMnist(unittest.TestCase):
+
     def func_mnist_fp16(self):
         if not fluid.is_compiled_with_cuda():
             return
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py b/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py
index 739a0fbbfd323..8e1761e9cd72d 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py
@@ -19,10 +19,12 @@
 
 
 class TestDygraphModeOfUnittest(unittest.TestCase):
+
     def test_dygraph_mode(self):
-        self.assertTrue(paddle.in_dynamic_mode(
-        ), 'Default Mode of Unittest should be dygraph mode, but get static mode.'
-                        )
+        self.assertTrue(
+            paddle.in_dynamic_mode(),
+            'Default Mode of Unittest should be dygraph mode, but get static mode.'
+        )
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
index a1165f3358415..2487bc15660e2 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
@@ -31,6 +31,7 @@
 
 
 class SimpleImgConvPool(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -50,25 +51,23 @@ def __init__(self,
                  bias_attr=None):
         super(SimpleImgConvPool, self).__init__()
 
-        self._conv2d = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
+        self._conv2d = Conv2D(num_channels=num_channels,
+                              num_filters=num_filters,
+                              filter_size=filter_size,
+                              stride=conv_stride,
+                              padding=conv_padding,
+                              dilation=conv_dilation,
+                              groups=conv_groups,
+                              param_attr=None,
+                              bias_attr=None,
+                              use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(pool_size=pool_size,
+                              pool_type=pool_type,
+                              pool_stride=pool_stride,
+                              pool_padding=pool_padding,
+                              global_pooling=global_pooling,
+                              use_cudnn=use_cudnn)
 
     def forward(self, inputs):
         x = self._conv2d(inputs)
@@ -77,25 +76,33 @@ def forward(self, inputs):
 
 
 class MNIST(fluid.dygraph.Layer):
+
     def __init__(self):
         super(MNIST, self).__init__()
 
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            1, 20, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(1,
+                                                         20,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            20, 50, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(20,
+                                                         50,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
         self.pool_2_shape = 50 * 4 * 4
         SIZE = 100  #10
         scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
-        self._fc = Linear(
-            self.pool_2_shape,
-            SIZE,
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.NormalInitializer(
-                    loc=0.0, scale=scale)),
-            act="softmax")
+        self._fc = Linear(self.pool_2_shape,
+                          SIZE,
+                          param_attr=fluid.param_attr.ParamAttr(
+                              initializer=fluid.initializer.NormalInitializer(
+                                  loc=0.0, scale=scale)),
+                          act="softmax")
 
     def forward(self, inputs):
         x = self._simple_img_conv_pool_1(inputs)
@@ -106,6 +113,7 @@ def forward(self, inputs):
 
 
 class TestDygraphMultiForward(unittest.TestCase):
+
     def test_mnist_forward_float32(self):
         epoch_num = 1
 
@@ -113,20 +121,21 @@ def test_mnist_forward_float32(self):
             paddle.seed(SEED)
             paddle.framework.random._manual_program_seed(SEED)
             mnist = MNIST()
-            sgd = SGDOptimizer(
-                learning_rate=1e-3, parameter_list=mnist.parameters())
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+            sgd = SGDOptimizer(learning_rate=1e-3,
+                               parameter_list=mnist.parameters())
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=128,
+                                        drop_last=True)
 
             dy_param_init_value = {}
             mnist.eval()
             for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
-                    dy_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+                    dy_x_data = np.array([
+                        x[0].reshape(1, 28, 28) for x in data
+                    ]).astype('float32')
+                    y_data = np.array([x[1] for x in data
+                                       ]).astype('int64').reshape(128, 1)
 
                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
@@ -150,11 +159,13 @@ def test_mnist_forward_float32(self):
 
             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=128,
+                                        drop_last=True)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[1, 28, 28],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             cost = mnist(img)
             loss = fluid.layers.cross_entropy(cost, label)
@@ -174,18 +185,19 @@ def test_mnist_forward_float32(self):
 
             for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
-                    static_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape([128, 1])
+                    static_x_data = np.array([
+                        x[0].reshape(1, 28, 28) for x in data
+                    ]).astype('float32')
+                    y_data = np.array([x[1] for x in data
+                                       ]).astype('int64').reshape([128, 1])
 
                     fetch_list = [avg_loss.name]
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
+                    out = exe.run(fluid.default_main_program(),
+                                  feed={
+                                      "pixel": static_x_data,
+                                      "label": y_data
+                                  },
+                                  fetch_list=fetch_list)
 
                     static_out = out[0]
 
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py
index fa9ea5d086c03..799555a7b03d8 100755
--- a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,27 +29,28 @@
 def get_fc_block(block_idx, input_size, is_last=False):
     block_name = "block_" + str(block_idx)
     block = paddle.nn.Sequential(
-        (block_name + "_fc_0", paddle.nn.Linear(
-            input_size, input_size, bias_attr=False)),
+        (block_name + "_fc_0",
+         paddle.nn.Linear(input_size, input_size, bias_attr=False)),
         (block_name + "_dropout", paddle.nn.Dropout(p=0.5)),
         (block_name + "_relu_1", paddle.nn.ReLU()),
-        (block_name + "_fc_1", paddle.nn.Linear(
-            input_size, input_size, bias_attr=False)),
-        (block_name + "_relu_2", paddle.nn.ReLU()), )
+        (block_name + "_fc_1",
+         paddle.nn.Linear(input_size, input_size, bias_attr=False)),
+        (block_name + "_relu_2", paddle.nn.ReLU()),
+    )
     if is_last:
-        block.add_sublayer(
-            block_name + "_fc_2",
-            paddle.nn.Linear(
-                input_size, 1, bias_attr=False))  # add sublayer
+        block.add_sublayer(block_name + "_fc_2",
+                           paddle.nn.Linear(input_size, 1,
+                                            bias_attr=False))  # add sublayer
     else:
-        block.add_sublayer(
-            block_name + "_fc_2",
-            paddle.nn.Linear(
-                input_size, input_size, bias_attr=False))  # add sublayer
+        block.add_sublayer(block_name + "_fc_2",
+                           paddle.nn.Linear(input_size,
+                                            input_size,
+                                            bias_attr=False))  # add sublayer
     return block
 
 
 class Naive_fc_net(paddle.nn.Layer):
+
     def __init__(self,
                  input_size=10,
                  recompute_blocks=[1, 3],
@@ -103,10 +104,9 @@ def run_model(recompute_block=[],
     random.seed(10)
 
     batch_size, input_size = 1, 10
-    model = Naive_fc_net(
-        input_size,
-        recompute_blocks=recompute_block,
-        recompute_kwargs=recompute_kwargs)
+    model = Naive_fc_net(input_size,
+                         recompute_blocks=recompute_block,
+                         recompute_kwargs=recompute_kwargs)
     loss_fn = paddle.nn.MSELoss(reduction='mean')
     optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                      parameters=model.parameters())
@@ -142,7 +142,9 @@ def run_model(recompute_block=[],
 
 
 class TestPyLayer(unittest.TestCase):
+
     def test_base_case(self, enable_autocast=False, pure_fp16=False):
+
         def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad):
             self.assertEqual(loss_ref, loss)
             self.assertEqual(param_ref, param)
@@ -155,31 +157,27 @@ def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad):
             pure_fp16=pure_fp16)
 
         # recompute second block
-        loss, param, grad = run_model(
-            recompute_block=[1],
-            enable_autocast=enable_autocast,
-            pure_fp16=pure_fp16)
+        loss, param, grad = run_model(recompute_block=[1],
+                                      enable_autocast=enable_autocast,
+                                      pure_fp16=pure_fp16)
         check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
 
         # recompute fourth block
-        loss, param, grad = run_model(
-            recompute_block=[3],
-            enable_autocast=enable_autocast,
-            pure_fp16=pure_fp16)
+        loss, param, grad = run_model(recompute_block=[3],
+                                      enable_autocast=enable_autocast,
+                                      pure_fp16=pure_fp16)
         check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
 
         # recompute second to fourth block
-        loss, param, grad = run_model(
-            recompute_block=[1, 2, 3],
-            enable_autocast=enable_autocast,
-            pure_fp16=pure_fp16)
+        loss, param, grad = run_model(recompute_block=[1, 2, 3],
+                                      enable_autocast=enable_autocast,
+                                      pure_fp16=pure_fp16)
         check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
 
         # recompute second & fourth block
-        loss, param, grad = run_model(
-            recompute_block=[1, 3],
-            enable_autocast=enable_autocast,
-            pure_fp16=pure_fp16)
+        loss, param, grad = run_model(recompute_block=[1, 3],
+                                      enable_autocast=enable_autocast,
+                                      pure_fp16=pure_fp16)
         check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
 
     def test_fc_net_with_dropout(self):
@@ -214,8 +212,8 @@ def test_recompute_kwargs(self):
         paddle.set_device("gpu")
         kwargs = {"is_test": False}
         with self.assertRaises(ValueError):
-            loss_ref, param_ref, grad_ref = run_model(
-                recompute_block=[2], recompute_kwargs=kwargs)
+            loss_ref, param_ref, grad_ref = run_model(recompute_block=[2],
+                                                      recompute_kwargs=kwargs)
 
     def test_recompute_cpu_rng(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py
index 50e1985138610..e76b4ab3674d6 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py
@@ -24,8 +24,8 @@ class TestDygraphShardingOptimizerStage2(TestMultipleGpus):
 
     # check sharding logic as well as the accuracy with single mode
     def test_dygraph_sharding_optimizer_stage2(self):
-        self.run_mnist_2gpu(
-            'dygraph_sharding_optimizer_stage2.py', eager_mode=False)
+        self.run_mnist_2gpu('dygraph_sharding_optimizer_stage2.py',
+                            eager_mode=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
index 866577ea7aa8c..9d842d8719fe3 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
@@ -30,8 +30,8 @@ def test_dygraph_sharding_stage2(self):
 
     def test_dygraph_sharding_stage2_offload(self):
         self.run_mnist_2gpu('dygraph_group_sharded_stage2_offload.py')
-        self.run_mnist_2gpu(
-            'dygraph_sharding_stage2_offload.py', eager_mode=False)
+        self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py',
+                            eager_mode=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
index c1f5e06f42b53..6175634e70013 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
@@ -30,8 +30,8 @@ def test_dygraph_sharding_stage3(self):
 
     def test_dygraph_sharding_stage3_offload(self):
         self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py')
-        self.run_mnist_2gpu(
-            'dygraph_sharding_stage3_offload.py', eager_mode=False)
+        self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py',
+                            eager_mode=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py
index ef220ba101617..9ca53d9a925fd 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py
@@ -23,6 +23,7 @@
 
 
 class TestDygraphSpectralNorm(unittest.TestCase):
+
     def setUp(self):
         self.init_test_case()
         self.set_data()
@@ -39,8 +40,8 @@ def set_data(self):
         for desc in self.data_desc:
             data_name = desc[0]
             data_shape = desc[1]
-            data_value = np.random.random(
-                size=[self.batch_size] + data_shape).astype('float32')
+            data_value = np.random.random(size=[self.batch_size] +
+                                          data_shape).astype('float32')
             self.data[data_name] = data_value
 
     def spectral_normalize(self, weight, u, v, dim, power_iters, eps):
@@ -77,11 +78,10 @@ def test_check_output(self):
         else:
             self.dim = (self.dim + len(before_weight)) % len(before_weight)
 
-        sn = spectral_norm(
-            linear,
-            n_power_iterations=self.n_power_iterations,
-            eps=self.eps,
-            dim=self.dim)
+        sn = spectral_norm(linear,
+                           n_power_iterations=self.n_power_iterations,
+                           eps=self.eps,
+                           dim=self.dim)
         u = sn.weight_u.numpy().copy()
         v = sn.weight_v.numpy().copy()
         outputs = []
@@ -90,16 +90,17 @@ def test_check_output(self):
             outputs.append(output.numpy())
         self.actual_outputs = linear.weight.numpy()
 
-        expect_output = self.spectral_normalize(
-            before_weight, u, v, self.dim, self.n_power_iterations, self.eps)
+        expect_output = self.spectral_normalize(before_weight, u, v, self.dim,
+                                                self.n_power_iterations,
+                                                self.eps)
 
         for expect, actual in zip(expect_output, self.actual_outputs):
             self.assertTrue(
-                np.allclose(
-                    np.array(actual), np.array(expect), atol=0.001))
+                np.allclose(np.array(actual), np.array(expect), atol=0.001))
 
 
 class TestDygraphWeightNormCase(TestDygraphSpectralNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
@@ -109,6 +110,7 @@ def init_test_case(self):
 
 
 class TestDygraphWeightNormWithIterations(TestDygraphSpectralNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
@@ -118,6 +120,7 @@ def init_test_case(self):
 
 
 class TestDygraphWeightNormWithDim(TestDygraphSpectralNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
@@ -127,6 +130,7 @@ def init_test_case(self):
 
 
 class TestDygraphWeightNormWithEps(TestDygraphSpectralNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
index 27d82fcc8903b..6ca02794a8ab8 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
@@ -25,6 +25,7 @@
 
 
 class TestDygraphWeightNorm(unittest.TestCase):
+
     def setUp(self):
         self.init_test_case()
         self.set_data()
@@ -39,8 +40,8 @@ def set_data(self):
         for desc in self.data_desc:
             data_name = desc[0]
             data_shape = desc[1]
-            data_value = numpy.random.random(
-                size=[self.batch_size] + data_shape).astype('float32')
+            data_value = numpy.random.random(size=[self.batch_size] +
+                                             data_shape).astype('float32')
             self.data[data_name] = data_value
 
     def norm_except_dim(self, w, dim=None):
@@ -95,11 +96,9 @@ def weight_normalize(self, w, dim=None):
             p_matrix = numpy.reshape(
                 p_transposed, (p_transposed.shape[0],
                                transposed_shape_numel // p_transposed.shape[0]))
-            v_norm = v / numpy.expand_dims(
-                numpy.expand_dims(
-                    numpy.linalg.norm(
-                        p_matrix, axis=1, keepdims=True), axis=0),
-                axis=(ndims - 1))
+            v_norm = v / numpy.expand_dims(numpy.expand_dims(
+                numpy.linalg.norm(p_matrix, axis=1, keepdims=True), axis=0),
+                                           axis=(ndims - 1))
             v_norm = numpy.reshape(v_norm, transposed_shape)
             v_norm = numpy.transpose(v_norm, perm)
             g = numpy.squeeze(g, axis=1)
@@ -107,11 +106,10 @@ def weight_normalize(self, w, dim=None):
                 eaxis = 2
             elif dim == 2:
                 eaxis = 1
-            g_mul = numpy.expand_dims(
-                numpy.expand_dims(
-                    numpy.expand_dims(
-                        g, axis=0), axis=eaxis),
-                axis=(ndims - 1))
+            g_mul = numpy.expand_dims(numpy.expand_dims(numpy.expand_dims(
+                g, axis=0),
+                                                        axis=eaxis),
+                                      axis=(ndims - 1))
         w = g_mul * v_norm
         return g, v
 
@@ -136,11 +134,11 @@ def test_check_output(self):
 
         for expect, actual in zip(expect_output, self.actual_outputs):
             self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual), expect, atol=0.001))
+                numpy.allclose(numpy.array(actual), expect, atol=0.001))
 
 
 class TestDygraphWeightNormCase1(TestDygraphWeightNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
@@ -148,6 +146,7 @@ def init_test_case(self):
 
 
 class TestDygraphWeightNormCase2(TestDygraphWeightNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
@@ -155,6 +154,7 @@ def init_test_case(self):
 
 
 class TestDygraphWeightNormCase3(TestDygraphWeightNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
@@ -162,6 +162,7 @@ def init_test_case(self):
 
 
 class TestDygraphWeightNormCase4(TestDygraphWeightNorm):
+
     def init_test_case(self):
         self.batch_size = 3
         self.data_desc = (['x', [2, 3, 3]], )
@@ -169,6 +170,7 @@ def init_test_case(self):
 
 
 class TestDygraphRemoveWeightNorm(unittest.TestCase):
+
     def setUp(self):
         self.init_test_case()
 
@@ -185,8 +187,9 @@ def test_check_output(self):
         rwn = remove_weight_norm(linear)
         after_weight = linear.weight
         self.assertTrue(
-            numpy.allclose(
-                before_weight.numpy(), after_weight.numpy(), atol=0.001))
+            numpy.allclose(before_weight.numpy(),
+                           after_weight.numpy(),
+                           atol=0.001))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index 1cf0c145f830d..0698a8b40df59 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -31,6 +31,7 @@
 
 
 class TestDynamicRNN(unittest.TestCase):
+
     def setUp(self):
         self.word_dict_len = 5147
         self.BATCH_SIZE = 2
@@ -82,10 +83,13 @@ def test_plain_while_op(self):
         startup_program = fluid.Program()
 
         with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=1)
-            sent_emb = fluid.layers.embedding(
-                input=sentence, size=[self.word_dict_len, 32], dtype='float32')
+            sentence = fluid.layers.data(name='word',
+                                         shape=[1],
+                                         dtype='int64',
+                                         lod_level=1)
+            sent_emb = fluid.layers.embedding(input=sentence,
+                                              size=[self.word_dict_len, 32],
+                                              dtype='float32')
 
             rank_table = lod_rank_table(x=sent_emb)
             sent_emb_array = lod_tensor_to_array(x=sent_emb, table=rank_table)
@@ -95,8 +99,7 @@ def test_plain_while_op(self):
             i.stop_gradient = False
 
             boot_mem = fluid.layers.fill_constant_batch_size_like(
-                input=fluid.layers.array_read(
-                    array=sent_emb_array, i=i),
+                input=fluid.layers.array_read(array=sent_emb_array, i=i),
                 value=0,
                 shape=[-1, 100],
                 dtype='float32')
@@ -126,8 +129,8 @@ def test_plain_while_op(self):
 
             logits = fluid.layers.fc(input=last, size=1, act=None)
             label = fluid.layers.data(name='label', shape=[1], dtype='float32')
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits,
+                                                                  label=label)
             loss = fluid.layers.mean(loss)
             sgd = fluid.optimizer.SGD(1e-4)
             sgd.minimize(loss=loss)
@@ -135,13 +138,12 @@ def test_plain_while_op(self):
         # Check for lod_level set in compile-time.
         self.assertEqual(sent_emb.lod_level, result_all_timesteps.lod_level)
 
-        self._train(
-            main_program=main_program,
-            startup_program=startup_program,
-            feed_list=[sentence, label],
-            fetch_list=[sent_emb, result_all_timesteps, loss],
-            is_nested=False,
-            max_iters=1)
+        self._train(main_program=main_program,
+                    startup_program=startup_program,
+                    feed_list=[sentence, label],
+                    fetch_list=[sent_emb, result_all_timesteps, loss],
+                    is_nested=False,
+                    max_iters=1)
 
     def test_train_dynamic_rnn(self):
         main_program = fluid.Program()
@@ -149,10 +151,13 @@ def test_train_dynamic_rnn(self):
         main_program.random_seed = 10
         startup_program.random_seed = 10
         with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=1)
-            sent_emb = fluid.layers.embedding(
-                input=sentence, size=[self.word_dict_len, 32], dtype='float32')
+            sentence = fluid.layers.data(name='word',
+                                         shape=[1],
+                                         dtype='int64',
+                                         lod_level=1)
+            sent_emb = fluid.layers.embedding(input=sentence,
+                                              size=[self.word_dict_len, 32],
+                                              dtype='float32')
 
             drnn = fluid.layers.DynamicRNN()
             with drnn.block():
@@ -167,8 +172,8 @@ def test_train_dynamic_rnn(self):
             logits = fluid.layers.fc(input=last, size=1, act=None)
 
             label = fluid.layers.data(name='label', shape=[1], dtype='float32')
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits,
+                                                                  label=label)
             loss = fluid.layers.mean(loss)
             sgd = fluid.optimizer.Adam(1e-3)
             sgd.minimize(loss=loss)
@@ -176,13 +181,12 @@ def test_train_dynamic_rnn(self):
         # Check for lod_level set in compile-time.
         self.assertEqual(sent_emb.lod_level, drnn_result.lod_level)
 
-        self._train(
-            main_program=main_program,
-            startup_program=startup_program,
-            feed_list=[sentence, label],
-            fetch_list=[sent_emb, drnn_result, loss],
-            is_nested=False,
-            max_iters=100)
+        self._train(main_program=main_program,
+                    startup_program=startup_program,
+                    feed_list=[sentence, label],
+                    fetch_list=[sent_emb, drnn_result, loss],
+                    is_nested=False,
+                    max_iters=100)
 
     def _fake_reader(self):
         seq_len, label = [[2, 2]], [0, 1]
@@ -203,17 +207,22 @@ def test_train_nested_dynamic_rnn(self):
         main_program.random_seed = 10
         startup_program.random_seed = 10
         with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=2)
-            label = fluid.layers.data(
-                name='label', shape=[1], dtype='float32', lod_level=1)
+            sentence = fluid.layers.data(name='word',
+                                         shape=[1],
+                                         dtype='int64',
+                                         lod_level=2)
+            label = fluid.layers.data(name='label',
+                                      shape=[1],
+                                      dtype='float32',
+                                      lod_level=1)
 
             drnn0 = fluid.layers.DynamicRNN()
             with drnn0.block():
                 in_0 = drnn0.step_input(sentence)
                 assert in_0.lod_level == 1, "the lod level of in_ should be 1"
-                sentence_emb = fluid.layers.embedding(
-                    input=in_0, size=[len(word_dict), 32], dtype='float32')
+                sentence_emb = fluid.layers.embedding(input=in_0,
+                                                      size=[len(word_dict), 32],
+                                                      dtype='float32')
                 out_0 = fluid.layers.fc(input=sentence_emb,
                                         size=100,
                                         act='tanh')
@@ -231,21 +240,20 @@ def test_train_nested_dynamic_rnn(self):
 
             last = drnn0()
             logits = fluid.layers.fc(input=last, size=1, act=None)
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits,
+                                                                  label=label)
             loss = fluid.layers.mean(loss)
             sgd = fluid.optimizer.SGD(1e-3)
             sgd.minimize(loss=loss)
 
         train_data_orig = self.train_data
         self.train_data = paddle.batch(self._fake_reader, batch_size=2)
-        self._train(
-            main_program=main_program,
-            startup_program=startup_program,
-            feed_list=[sentence, label],
-            fetch_list=[loss],
-            is_nested=True,
-            max_iters=100)
+        self._train(main_program=main_program,
+                    startup_program=startup_program,
+                    feed_list=[sentence, label],
+                    fetch_list=[loss],
+                    is_nested=True,
+                    max_iters=100)
         self.train_data = train_data_orig
 
     # this unit test is just used to the two layer nested dyn_rnn.
@@ -258,10 +266,14 @@ def test_train_nested_dynamic_rnn2(self):
         main_program.random_seed = 10
         startup_program.random_seed = 10
         with fluid.program_guard(main_program, startup_program):
-            sentence = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=2)
-            label = fluid.layers.data(
-                name='label', shape=[1], dtype='float32', lod_level=1)
+            sentence = fluid.layers.data(name='word',
+                                         shape=[1],
+                                         dtype='int64',
+                                         lod_level=2)
+            label = fluid.layers.data(name='label',
+                                      shape=[1],
+                                      dtype='float32',
+                                      lod_level=1)
 
             drnn0 = fluid.layers.DynamicRNN()
             with drnn0.block():
@@ -274,10 +286,9 @@ def test_train_nested_dynamic_rnn2(self):
                                                      size=hidden_size * 4,
                                                      act=None,
                                                      bias_attr=False)
-                forward, _ = fluid.layers.dynamic_lstm(
-                    input=input_forward_proj,
-                    size=hidden_size * 4,
-                    use_peepholes=False)
+                forward, _ = fluid.layers.dynamic_lstm(input=input_forward_proj,
+                                                       size=hidden_size * 4,
+                                                       use_peepholes=False)
 
                 drnn1 = fluid.layers.DynamicRNN()
                 with drnn1.block():
@@ -290,31 +301,33 @@ def test_train_nested_dynamic_rnn2(self):
 
             last = drnn0()
             logits = fluid.layers.fc(input=last, size=1, act=None)
-            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=logits, label=label)
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits,
+                                                                  label=label)
             loss = fluid.layers.mean(loss)
             sgd = fluid.optimizer.SGD(1e-3)
             sgd.minimize(loss=loss)
 
         train_data_orig = self.train_data
         self.train_data = paddle.batch(self._fake_reader, batch_size=2)
-        self._train(
-            main_program=main_program,
-            startup_program=startup_program,
-            feed_list=[sentence, label],
-            fetch_list=[loss],
-            is_nested=True,
-            max_iters=100)
+        self._train(main_program=main_program,
+                    startup_program=startup_program,
+                    feed_list=[sentence, label],
+                    fetch_list=[loss],
+                    is_nested=True,
+                    max_iters=100)
         self.train_data = train_data_orig
 
 
 class TestDynamicRNNErrors(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             init = fluid.layers.zeros(shape=[1], dtype='float32')
             shape = 'shape'
-            sentence = fluid.data(
-                name='sentence', shape=[None, 32], dtype='float32', lod_level=1)
+            sentence = fluid.data(name='sentence',
+                                  shape=[None, 32],
+                                  dtype='float32',
+                                  lod_level=1)
 
             # The type of Input(shape) in API(memory) must be list or tuple
             def input_shape_type_of_memory():
diff --git a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
index 243ad4c082ab0..167748c5a98be 100644
--- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py
@@ -27,10 +27,14 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
     x = layers.assign(
         np.random.rand(batch_size, beam_size, 32).astype("float32"))
     indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices")
-    step_idx = layers.fill_constant(
-        shape=[1], dtype="int64", value=0, force_cpu=True)
-    max_len = layers.fill_constant(
-        shape=[1], dtype="int64", value=10, force_cpu=True)
+    step_idx = layers.fill_constant(shape=[1],
+                                    dtype="int64",
+                                    value=0,
+                                    force_cpu=True)
+    max_len = layers.fill_constant(shape=[1],
+                                   dtype="int64",
+                                   value=10,
+                                   force_cpu=True)
     cond = layers.less_than(x=step_idx, y=max_len)
     while_op = layers.While(cond)
     scores = layers.array_write(x, step_idx)
@@ -40,9 +44,8 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
             bs = layers.cast(bs, 'int64')
         bs.stop_gradient = stop_gradient
         batch_pos = layers.expand(
-            layers.unsqueeze(
-                layers.range(
-                    0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size])
+            layers.unsqueeze(layers.range(0, bs, 1, dtype=bs.dtype), [1]),
+            [1, beam_size])
         topk_coordinates = layers.stack([batch_pos, indices], axis=2)
         topk_coordinates.stop_gradient = stop_gradient
         score = layers.gather_nd(x, topk_coordinates)
@@ -56,14 +59,17 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
     opt = fluid.optimizer.Adam(0.01)
     opt.minimize(loss)
     exe = fluid.Executor(place)
-    data = np.random.random_integers(
-        low=0, high=beam_size - 1, size=(batch_size, beam_size)).astype("int64")
+    data = np.random.random_integers(low=0,
+                                     high=beam_size - 1,
+                                     size=(batch_size,
+                                           beam_size)).astype("int64")
     loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss])
 
     return loss_val
 
 
 class TestDynRNNStopGradient(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 20
         self.beam_size = 64
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 5328f73b31513..0d6fa635a8fd4 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -23,6 +23,7 @@
 
 
 class Memory(object):
+
     def __init__(self, shape, dtype='float32'):
         self.ex = numpy.zeros(shape=shape, dtype=dtype)
         self.cur = None
@@ -45,6 +46,7 @@ def reset(self):
 
 
 class Output(object):
+
     def __init__(self):
         self.outs = []
 
@@ -59,6 +61,7 @@ def last(self):
 
 
 class BaseRNN(object):
+
     def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15):
         self.num_seq = num_seq
         self.inputs = collections.defaultdict(list)
@@ -211,6 +214,7 @@ def _exe_mean_out_(self):
 
 
 class SeedFixedTestCase(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         """Fix random seeds to remove randomness from tests"""
@@ -235,17 +239,17 @@ class TestSimpleMul(SeedFixedTestCase):
     OUT_NAME = 'Out'
 
     class SimpleMul(BaseRNN):
+
         def __init__(self):
             base = TestSimpleMul
-            super(base.SimpleMul, self).__init__({
-                base.DATA_NAME: {
-                    'shape': [base.DATA_WIDTH]
-                }
-            }, {}, {
-                base.PARAM_NAME: {
-                    'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH]
-                }
-            }, [base.OUT_NAME])
+            super(base.SimpleMul,
+                  self).__init__({base.DATA_NAME: {
+                      'shape': [base.DATA_WIDTH]
+                  }}, {}, {
+                      base.PARAM_NAME: {
+                          'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH]
+                      }
+                  }, [base.OUT_NAME])
 
         def step(self, X, W, Out):
             Out.out(numpy.matmul(X, W))
@@ -255,8 +259,9 @@ def step(self, X, W, Out):
     @prog_scope()
     def test_forward_backward(self):
         py_rnn = TestSimpleMul.SimpleMul()
-        dat = fluid.layers.data(
-            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        dat = fluid.layers.data(name=self.DATA_NAME,
+                                shape=[self.DATA_WIDTH],
+                                lod_level=1)
         dat.stop_gradient = False
 
         rnn = fluid.layers.DynamicRNN()
@@ -277,11 +282,12 @@ def test_forward_backward(self):
         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
         out, w_g, i_g = list(
-            map(numpy.array,
+            map(
+                numpy.array,
                 exe.run(feed=py_rnn.to_feed(cpu),
                         fetch_list=[
-                            out, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
-                            "@GRAD"
+                            out, self.PARAM_NAME + "@GRAD",
+                            self.DATA_NAME + "@GRAD"
                         ],
                         return_numpy=False)))
         out_by_python = py_rnn.exe()[self.OUT_NAME]
@@ -301,21 +307,23 @@ class TestSimpleMulWithMemory(SeedFixedTestCase):
     PARAM_NAME = 'W'
 
     class SimpleMulWithMemory(BaseRNN):
+
         def __init__(self):
-            super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({
-                TestSimpleMulWithMemory.DATA_NAME: {
-                    'shape': [TestSimpleMulWithMemory.DATA_WIDTH]
-                }
-            }, {'Mem': {
-                'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH]
-            }}, {
-                TestSimpleMulWithMemory.PARAM_NAME: {
-                    'shape': [
-                        TestSimpleMulWithMemory.DATA_WIDTH,
-                        TestSimpleMulWithMemory.HIDDEN_WIDTH
-                    ]
-                }
-            }, ['Out'])
+            super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__(
+                {
+                    TestSimpleMulWithMemory.DATA_NAME: {
+                        'shape': [TestSimpleMulWithMemory.DATA_WIDTH]
+                    }
+                }, {'Mem': {
+                    'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH]
+                }}, {
+                    TestSimpleMulWithMemory.PARAM_NAME: {
+                        'shape': [
+                            TestSimpleMulWithMemory.DATA_WIDTH,
+                            TestSimpleMulWithMemory.HIDDEN_WIDTH
+                        ]
+                    }
+                }, ['Out'])
 
         def step(self, X, Mem, W, Out):
             o = numpy.matmul(X, W)
@@ -330,8 +338,9 @@ def step(self, X, Mem, W, Out):
     @prog_scope()
     def test_forward_backward(self):
         py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory()
-        data = fluid.layers.data(
-            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        data = fluid.layers.data(name=self.DATA_NAME,
+                                 shape=[self.DATA_WIDTH],
+                                 lod_level=1)
         data.stop_gradient = False
         rnn = fluid.layers.DynamicRNN()
         with rnn.block():
@@ -355,11 +364,12 @@ def test_forward_backward(self):
         exe = fluid.Executor(cpu)
         feed = py_rnn.to_feed(cpu)
         last_np, w_g, i_g = list(
-            map(numpy.array,
+            map(
+                numpy.array,
                 exe.run(feed=feed,
                         fetch_list=[
-                            last, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
-                            "@GRAD"
+                            last, self.PARAM_NAME + "@GRAD",
+                            self.DATA_NAME + "@GRAD"
                         ],
                         return_numpy=False)))
         last_by_py, = list(py_rnn.exe().values())
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index 698f914f89984..07f7fa818aa0e 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -29,6 +29,7 @@
 
 
 class TestDyRnnStaticInput(unittest.TestCase):
+
     def setUp(self):
         self._delta = 0.005
         self._max_sequence_len = 3
@@ -60,8 +61,10 @@ def prepare_static_input_tensor(self):
 
     def fetch_value(self, var):
         fetch_outs = self.exe.run(feed={
-            'x_tensor': self.x_tensor,
-            'static_input_tensor': self.static_input_tensor
+            'x_tensor':
+            self.x_tensor,
+            'static_input_tensor':
+            self.static_input_tensor
         },
                                   fetch_list=[var],
                                   return_numpy=False)
@@ -75,11 +78,10 @@ def _lodtensor_to_ndarray(self, lod_tensor):
         return ndarray, lod_tensor.recursive_sequence_lengths()
 
     def build_graph(self, only_forward=False):
-        x_tensor = fluid.layers.data(
-            name='x_tensor',
-            shape=[self.x_tensor_dim],
-            dtype='float32',
-            lod_level=1)
+        x_tensor = fluid.layers.data(name='x_tensor',
+                                     shape=[self.x_tensor_dim],
+                                     dtype='float32',
+                                     lod_level=1)
         x_tensor.stop_gradient = False
 
         static_input_tensor = fluid.layers.data(
@@ -101,20 +103,20 @@ def build_graph(self, only_forward=False):
             step_x = rnn.step_input(x_tensor)
             step_static_input = rnn.static_input(static_input_tensor)
             if only_forward:
-                fluid.layers.array_write(
-                    x=step_static_input,
-                    i=rnn.step_idx,
-                    array=static_input_out_array)
-            last = fluid.layers.sequence_pool(
-                input=step_static_input, pool_type='last')
+                fluid.layers.array_write(x=step_static_input,
+                                         i=rnn.step_idx,
+                                         array=static_input_out_array)
+            last = fluid.layers.sequence_pool(input=step_static_input,
+                                              pool_type='last')
             projected = fluid.layers.fc(input=[step_x, last],
                                         size=self.output_dim)
             rnn.output(projected)
 
         if only_forward:
             static_input_step_outs = []
-            step_idx = fluid.layers.fill_constant(
-                shape=[1], dtype='int64', value=0)
+            step_idx = fluid.layers.fill_constant(shape=[1],
+                                                  dtype='int64',
+                                                  value=0)
             step_idx.stop_gradient = True
 
             for i in range(self._max_sequence_len):
@@ -144,8 +146,9 @@ def get_expected_static_step_outs(self):
         static_sliced = []
         cur_offset = 0
         for i in range(len(static_lod[0])):
-            static_sliced.append(self.static_input_data[cur_offset:(
-                cur_offset + static_lod[0][i])])
+            static_sliced.append(
+                self.static_input_data[cur_offset:(cur_offset +
+                                                   static_lod[0][i])])
             cur_offset += static_lod[0][i]
         static_seq_len = static_lod[0]
         static_reordered = []
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
index de85c76351448..4bf8faf25ef44 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py
@@ -14,6 +14,7 @@
 
 import os
 import numpy as np
+
 os.environ['FLAGS_use_mkldnn'] = '0'
 os.environ['CPU_NUM'] = '4'
 
@@ -24,6 +25,7 @@
 from functools import reduce
 
 import paddle
+
 paddle.enable_static()
 
 fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
@@ -38,8 +40,8 @@ def simple_fc_net():
             hidden,
             size=200,
             act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
     prediction = fluid.layers.fc(hidden, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     loss = fluid.layers.mean(loss)
@@ -64,6 +66,7 @@ def get_persistables_and_non_persistables(prog, fetch_list):
 
 
 class TestExecutor(unittest.TestCase):
+
     def test_executor_main(self):
         places = [fluid.CPUPlace()]
         if fluid.core.is_compiled_with_cuda():
@@ -89,8 +92,8 @@ def prepare_feed(self, image, label, dev_cnt=1):
         label_shape = (batch_size, ) + tuple(label.shape[1:])
 
         image_np = np.random.random(size=image_shape).astype('float32')
-        label_np = np.random.random_integers(
-            low=0, high=9, size=label_shape).astype('int64')
+        label_np = np.random.random_integers(low=0, high=9,
+                                             size=label_shape).astype('int64')
 
         return image_np, label_np
 
@@ -111,10 +114,10 @@ def assertScopeVar(self, scope, persitables, non_persistables):
             if t._is_initialized():
                 outline_np_vars.append(name)
 
-        print('Non-alive persistable vars {} in {}'.format(outline_p_vars,
-                                                           persitables))
-        print('Alive non-persistable vars {} in {}'.format(outline_np_vars,
-                                                           non_persistables))
+        print('Non-alive persistable vars {} in {}'.format(
+            outline_p_vars, persitables))
+        print('Alive non-persistable vars {} in {}'.format(
+            outline_np_vars, non_persistables))
         self.assertEqual(len(outline_p_vars), 0)
         self.assertEqual(len(outline_np_vars), 0)
 
@@ -144,14 +147,14 @@ def executor_main(self):
 
         for _ in six.moves.range(10):
             image_np, label_np = self.prepare_feed(image, label)
-            fluid.global_scope().var(image.name).get_tensor().set(image_np,
-                                                                  self.place)
-            fluid.global_scope().var(label.name).get_tensor().set(label_np,
-                                                                  self.place)
+            fluid.global_scope().var(image.name).get_tensor().set(
+                image_np, self.place)
+            fluid.global_scope().var(label.name).get_tensor().set(
+                label_np, self.place)
             # exe.run would not create local scope
             # so that we can detect whether gc clears temporary variables
-            exe.run(fluid.default_main_program().desc,
-                    fluid.global_scope(), 0, False, True, [loss.name])
+            exe.run(fluid.default_main_program().desc, fluid.global_scope(), 0,
+                    False, True, [loss.name])
             self.assertScopeVar(fluid.global_scope(), persistables,
                                 non_persistables)
 
@@ -173,9 +176,9 @@ def pe_main(self):
         build_strategy.memory_optimize = False
         build_strategy.enable_inplace = False
 
-        prog = fluid.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            loss_name=loss.name, exec_strategy=exec_strategy)
+        prog = fluid.CompiledProgram(
+            fluid.default_main_program()).with_data_parallel(
+                loss_name=loss.name, exec_strategy=exec_strategy)
 
         dev_cnt = fluid.core.get_cuda_device_count() if isinstance(self.place, fluid.CUDAPlace)    \
             else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
index e4bde606ca670..2f67627a73e50 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+
 os.environ['CPU_NUM'] = '2'
 
 import six
@@ -41,8 +42,10 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
     reader = fake_imdb_reader(word_dict_size, batch_size * 40)
     train_reader = paddle.batch(reader, batch_size=batch_size)
 
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
+    data = fluid.layers.data(name="words",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
 
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
@@ -53,8 +56,8 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-    reader = feeder.decorate_reader(
-        train_reader, multi_devices=use_parallel_executor)
+    reader = feeder.decorate_reader(train_reader,
+                                    multi_devices=use_parallel_executor)
 
     exe = fluid.Executor(place)
     fluid.default_startup_program().random_seed = 1
@@ -63,8 +66,9 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
 
     train_cp = fluid.default_main_program()
     if use_parallel_executor:
-        train_cp = compiler.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(loss_name=cost.name)
+        train_cp = compiler.CompiledProgram(
+            fluid.default_main_program()).with_data_parallel(
+                loss_name=cost.name)
         fetch_list = [cost.name]
     else:
         fetch_list = [cost]
@@ -81,6 +85,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
 
 
 class TestBase(unittest.TestCase):
+
     def setUp(self):
         self.net = None
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
index 1023c18f410fb..39dc0caefd335 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
@@ -43,6 +43,7 @@ def gru_net(data,
 
 
 class GRUTest(TestBase):
+
     def setUp(self):
         self.net = gru_net
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
index 6784edb9d7b2e..07f78d3b84568 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
@@ -32,8 +32,9 @@ def lstm_net(data,
         size=[dict_dim, emb_dim],
         param_attr=fluid.ParamAttr(learning_rate=emb_lr))
     fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
+    lstm_h, c = fluid.layers.dynamic_lstm(input=fc0,
+                                          size=hid_dim * 4,
+                                          is_reverse=False)
     lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
     lstm_max_tanh = fluid.layers.tanh(lstm_max)
     fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
@@ -44,6 +45,7 @@ def lstm_net(data,
 
 
 class LSTMTest(TestBase):
+
     def setUp(self):
         self.net = lstm_net
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
index ecdf9efa45174..d44a74ccb5718 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
@@ -18,7 +18,7 @@
 
 fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 
-# FIXME(zjl): It seems that this unittest fails randomly 
+# FIXME(zjl): It seems that this unittest fails randomly
 # when comparing all reduce last loss and reduce last loss
 # e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta
 # Disable it temporarily.
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
index ff99a06e49e78..180e1229514ee 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
@@ -33,6 +33,7 @@
 
 
 class RNNConfig(object):
+
     def __init__(self, model_type, rnn_model):
         self.model_type = model_type
         self.rnn_model = rnn_model
@@ -99,14 +100,13 @@ def __init__(self, model_type, rnn_model):
 
 # Fake data reader for test
 class Reader(object):
+
     def get_data_iter(self, rnn_config):
         for i in range(rnn_config.max_epoch):
-            x = np.zeros(
-                shape=(rnn_config.batch_size, rnn_config.num_steps),
-                dtype='int64')
-            y = np.ones(
-                shape=(rnn_config.batch_size, rnn_config.num_steps),
-                dtype='int64')
+            x = np.zeros(shape=(rnn_config.batch_size, rnn_config.num_steps),
+                         dtype='int64')
+            y = np.ones(shape=(rnn_config.batch_size, rnn_config.num_steps),
+                        dtype='int64')
             yield (x, y)
 
 
@@ -119,6 +119,7 @@ def lm_model(hidden_size,
              init_scale=0.1,
              dropout=None,
              rnn_model='static'):
+
     def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
         weight_1_arr = []
         weight_2_arr = []
@@ -141,10 +142,14 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
                 default_initializer=fluid.initializer.Constant(0.0))
             bias_arr.append(bias_1)
 
-            pre_hidden = layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
+            pre_hidden = layers.slice(init_hidden,
+                                      axes=[0],
+                                      starts=[i],
+                                      ends=[i + 1])
+            pre_cell = layers.slice(init_cell,
+                                    axes=[0],
+                                    starts=[i],
+                                    ends=[i + 1])
             pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
             pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
             hidden_array.append(pre_hidden)
@@ -165,23 +170,22 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
                 gate_input = layers.matmul(x=nn, y=weight_1)
 
                 gate_input = layers.elementwise_add(gate_input, bias)
-                i = layers.slice(
-                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
-                j = layers.slice(
-                    gate_input,
-                    axes=[1],
-                    starts=[hidden_size],
-                    ends=[hidden_size * 2])
-                f = layers.slice(
-                    gate_input,
-                    axes=[1],
-                    starts=[hidden_size * 2],
-                    ends=[hidden_size * 3])
-                o = layers.slice(
-                    gate_input,
-                    axes=[1],
-                    starts=[hidden_size * 3],
-                    ends=[hidden_size * 4])
+                i = layers.slice(gate_input,
+                                 axes=[1],
+                                 starts=[0],
+                                 ends=[hidden_size])
+                j = layers.slice(gate_input,
+                                 axes=[1],
+                                 starts=[hidden_size],
+                                 ends=[hidden_size * 2])
+                f = layers.slice(gate_input,
+                                 axes=[1],
+                                 starts=[hidden_size * 2],
+                                 ends=[hidden_size * 3])
+                o = layers.slice(gate_input,
+                                 axes=[1],
+                                 starts=[hidden_size * 3],
+                                 ends=[hidden_size * 4])
 
                 c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                     i) * layers.tanh(j)
@@ -212,11 +216,15 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
             c = rnnout[i * 2 + 1]
             m.stop_gradient = True
             c.stop_gradient = True
-            last_h = layers.slice(
-                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
+            last_h = layers.slice(m,
+                                  axes=[0],
+                                  starts=[num_steps - 1],
+                                  ends=[num_steps])
             last_hidden_array.append(last_h)
-            last_c = layers.slice(
-                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
+            last_c = layers.slice(c,
+                                  axes=[0],
+                                  starts=[num_steps - 1],
+                                  ends=[num_steps])
             last_cell_array.append(last_c)
         real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
         last_hidden = layers.concat(last_hidden_array, 0)
@@ -224,7 +232,9 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
 
         return real_res, last_hidden, last_cell
 
-    def encoder_static(input_embedding, len=3, init_hidden=None,
+    def encoder_static(input_embedding,
+                       len=3,
+                       init_hidden=None,
                        init_cell=None):
 
         weight_1_arr = []
@@ -248,20 +258,27 @@ def encoder_static(input_embedding, len=3, init_hidden=None,
                 default_initializer=fluid.initializer.Constant(0.0))
             bias_arr.append(bias_1)
 
-            pre_hidden = layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = layers.reshape(
-                pre_hidden, shape=[-1, hidden_size], inplace=True)
-            pre_cell = layers.reshape(
-                pre_cell, shape=[-1, hidden_size], inplace=True)
+            pre_hidden = layers.slice(init_hidden,
+                                      axes=[0],
+                                      starts=[i],
+                                      ends=[i + 1])
+            pre_cell = layers.slice(init_cell,
+                                    axes=[0],
+                                    starts=[i],
+                                    ends=[i + 1])
+            pre_hidden = layers.reshape(pre_hidden,
+                                        shape=[-1, hidden_size],
+                                        inplace=True)
+            pre_cell = layers.reshape(pre_cell,
+                                      shape=[-1, hidden_size],
+                                      inplace=True)
             hidden_array.append(pre_hidden)
             cell_array.append(pre_cell)
 
         res = []
-        sliced_inputs = layers.split(
-            input_embedding, num_or_sections=len, dim=1)
+        sliced_inputs = layers.split(input_embedding,
+                                     num_or_sections=len,
+                                     dim=1)
 
         for index in range(len):
             input = sliced_inputs[index]
@@ -295,52 +312,50 @@ def encoder_static(input_embedding, len=3, init_hidden=None,
             res.append(input)
 
         last_hidden = layers.concat(hidden_array, 1)
-        last_hidden = layers.reshape(
-            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
+        last_hidden = layers.reshape(last_hidden,
+                                     shape=[-1, num_layers, hidden_size],
+                                     inplace=True)
         last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])
 
         last_cell = layers.concat(cell_array, 1)
-        last_cell = layers.reshape(
-            last_cell, shape=[-1, num_layers, hidden_size])
+        last_cell = layers.reshape(last_cell,
+                                   shape=[-1, num_layers, hidden_size])
         last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])
 
         real_res = layers.concat(res, 0)
-        real_res = layers.reshape(
-            real_res, shape=[len, -1, hidden_size], inplace=True)
+        real_res = layers.reshape(real_res,
+                                  shape=[len, -1, hidden_size],
+                                  inplace=True)
         real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
 
         return real_res, last_hidden, last_cell
 
     batch_size_each = batch_size
-    x = layers.data(
-        name="x",
-        shape=[batch_size_each, num_steps, 1],
-        dtype='int64',
-        append_batch_size=False)
-    y = layers.data(
-        name="y",
-        shape=[batch_size_each * num_steps, 1],
-        dtype='int64',
-        append_batch_size=False)
-
-    init_hidden = layers.data(
-        name="init_hidden",
-        shape=[num_layers, batch_size_each, hidden_size],
-        dtype='float32',
-        append_batch_size=False)
-    init_cell = layers.data(
-        name="init_cell",
-        shape=[num_layers, batch_size_each, hidden_size],
-        dtype='float32',
-        append_batch_size=False)
+    x = layers.data(name="x",
+                    shape=[batch_size_each, num_steps, 1],
+                    dtype='int64',
+                    append_batch_size=False)
+    y = layers.data(name="y",
+                    shape=[batch_size_each * num_steps, 1],
+                    dtype='int64',
+                    append_batch_size=False)
+
+    init_hidden = layers.data(name="init_hidden",
+                              shape=[num_layers, batch_size_each, hidden_size],
+                              dtype='float32',
+                              append_batch_size=False)
+    init_cell = layers.data(name="init_cell",
+                            shape=[num_layers, batch_size_each, hidden_size],
+                            dtype='float32',
+                            append_batch_size=False)
 
     init_cell.persistable = True
     init_hidden.persistable = True
 
-    init_hidden_reshape = layers.reshape(
-        init_hidden, shape=[num_layers, -1, hidden_size])
-    init_cell_reshape = layers.reshape(
-        init_cell, shape=[num_layers, -1, hidden_size])
+    init_hidden_reshape = layers.reshape(init_hidden,
+                                         shape=[num_layers, -1, hidden_size])
+    init_cell_reshape = layers.reshape(init_cell,
+                                       shape=[num_layers, -1, hidden_size])
 
     x_emb = layers.embedding(
         input=x,
@@ -349,16 +364,16 @@ def encoder_static(input_embedding, len=3, init_hidden=None,
         is_sparse=False,
         param_attr=fluid.ParamAttr(
             name='embedding_para',
-            initializer=fluid.initializer.UniformInitializer(
-                low=-init_scale, high=init_scale)))
+            initializer=fluid.initializer.UniformInitializer(low=-init_scale,
+                                                             high=init_scale)))
 
-    x_emb = layers.reshape(
-        x_emb, shape=[-1, num_steps, hidden_size], inplace=True)
+    x_emb = layers.reshape(x_emb,
+                           shape=[-1, num_steps, hidden_size],
+                           inplace=True)
     if dropout != None and dropout > 0.0:
-        x_emb = layers.dropout(
-            x_emb,
-            dropout_prob=dropout,
-            dropout_implementation='upscale_in_train')
+        x_emb = layers.dropout(x_emb,
+                               dropout_prob=dropout,
+                               dropout_implementation='upscale_in_train')
 
     if rnn_model == "padding":
         rnn_out, last_hidden, last_cell = padding_rnn(
@@ -395,8 +410,9 @@ def encoder_static(input_embedding, len=3, init_hidden=None,
         print("type not support")
         return
 
-    rnn_out = layers.reshape(
-        rnn_out, shape=[-1, num_steps, hidden_size], inplace=True)
+    rnn_out = layers.reshape(rnn_out,
+                             shape=[-1, num_steps, hidden_size],
+                             inplace=True)
 
     softmax_weight = layers.create_parameter(
         [hidden_size, vocab_size],
@@ -413,11 +429,13 @@ def encoder_static(input_embedding, len=3, init_hidden=None,
 
     projection = layers.matmul(rnn_out, softmax_weight)
     projection = layers.elementwise_add(projection, softmax_bias)
-    projection = layers.reshape(
-        projection, shape=[-1, vocab_size], inplace=True)
+    projection = layers.reshape(projection,
+                                shape=[-1, vocab_size],
+                                inplace=True)
 
-    loss = layers.softmax_with_cross_entropy(
-        logits=projection, label=y, soft_label=False)
+    loss = layers.softmax_with_cross_entropy(logits=projection,
+                                             label=y,
+                                             soft_label=False)
 
     loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
     loss = layers.reduce_mean(loss, dim=[0])
@@ -439,6 +457,7 @@ def encoder_static(input_embedding, len=3, init_hidden=None,
 
 
 class PaddingRNNTestBase(unittest.TestCase):
+
     def setUp(self):
         self.reader = Reader()
         self.device_count = 1
@@ -471,15 +490,14 @@ def _prepare_program(self, config, parallel=True):
         self.startup_program = fluid.Program()
         with fluid.program_guard(self.main_program, self.startup_program):
             with fluid.unique_name.guard():
-                res_vars = lm_model(
-                    config.hidden_size,
-                    config.vocab_size,
-                    config.batch_size,
-                    num_layers=config.num_layers,
-                    num_steps=config.num_steps,
-                    init_scale=config.init_scale,
-                    dropout=config.dropout,
-                    rnn_model=config.rnn_model)
+                res_vars = lm_model(config.hidden_size,
+                                    config.vocab_size,
+                                    config.batch_size,
+                                    num_layers=config.num_layers,
+                                    num_steps=config.num_steps,
+                                    init_scale=config.init_scale,
+                                    dropout=config.dropout,
+                                    rnn_model=config.rnn_model)
                 self.loss, self.last_hidden, self.last_cell, self.feed_order = res_vars
 
                 fluid.clip.set_gradient_clip(
@@ -509,14 +527,12 @@ def _prepare_program(self, config, parallel=True):
             self.train_program = self.main_program
 
     def _generate_init_data(self):
-        init_hidden = np.zeros(
-            (self.config.num_layers, self.config.batch_size,
-             self.config.hidden_size),
-            dtype='float32')
-        init_cell = np.zeros(
-            (self.config.num_layers, self.config.batch_size,
-             self.config.hidden_size),
-            dtype='float32')
+        init_hidden = np.zeros((self.config.num_layers, self.config.batch_size,
+                                self.config.hidden_size),
+                               dtype='float32')
+        init_cell = np.zeros((self.config.num_layers, self.config.batch_size,
+                              self.config.hidden_size),
+                             dtype='float32')
         return init_hidden, init_cell
 
     def _generate_new_lr(self, epoch_id=0, device_count=1):
@@ -596,7 +612,8 @@ def train(self, config, parallel=True, use_program_cache=True):
             ppl = np.append(ppl, train_ppl)
         return ppl
 
-    def compare_padding_static_mode(self, parallel=True,
+    def compare_padding_static_mode(self,
+                                    parallel=True,
                                     use_program_cache=True):
         '''
         Test that train ppl of padding mode is same to that of static mode 
@@ -608,11 +625,11 @@ def compare_padding_static_mode(self, parallel=True,
         with fluid.scope_guard(fluid.Scope()):
             static_rnn_ppl = self.train(config, parallel, use_program_cache)
         self.assertTrue(
-            np.isclose(
-                padding_rnn_ppl, static_rnn_ppl, rtol=0.001).all())
+            np.isclose(padding_rnn_ppl, static_rnn_ppl, rtol=0.001).all())
 
 
 class EagerDeletionPaddingRNNTest(PaddingRNNTestBase):
+
     def test_padding_mode_no_eager_deletion(self):
         '''
         Test that train ppl of padding mode is same to that of static mode without eager deletion
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
index 01d8cbc5b7dd1..907e167b5f1d4 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py
@@ -27,6 +27,7 @@
 from paddle.fluid.executor import Executor
 from paddle.fluid.backward import append_backward
 import paddle
+
 paddle.enable_static()
 
 np.random.seed(123)
@@ -35,6 +36,7 @@
 
 
 class PyRNNBase(object):
+
     def __init__(self, input_shape, output_shape):
         self.x = np.ones(shape=input_shape).astype("float32")
         self.y = np.zeros(shape=output_shape).astype("float32")
@@ -52,6 +54,7 @@ def segment_inputs(self):
 
 
 class PySimpleRNN1(PyRNNBase):
+
     def __init__(self, input_shape, output_shape):
         super(PySimpleRNN1, self).__init__(input_shape, output_shape)
 
@@ -73,6 +76,7 @@ def step(self, step_id, x):
 
 
 class PySimpleRNN2(PyRNNBase):
+
     def __init__(self, input_shape, output_shape):
         super(PySimpleRNN2, self).__init__(input_shape, output_shape)
 
@@ -139,14 +143,14 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
+        h_boot = layers.data(shape=[self.input_dim],
+                             dtype='float32',
+                             name='h_boot')
         h_boot.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -154,10 +158,8 @@ def create_rnn_op(self):
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
 
-            h = layers.scale(
-                x=layers.elementwise_add(
-                    x=h_pre, y=x_t),
-                scale=self.py_rnn.scale)
+            h = layers.scale(x=layers.elementwise_add(x=h_pre, y=x_t),
+                             scale=self.py_rnn.scale)
 
             rnn.update_memory(h_pre, h)
             rnn.output(h)
@@ -211,8 +213,7 @@ def test_backward(self, rtol=0.01):
         for idx, name in enumerate(self.data_field):
             self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape)
             self.assertTrue(
-                np.isclose(
-                    num_grad[idx], ana_grad[idx], rtol=rtol).all(),
+                np.isclose(num_grad[idx], ana_grad[idx], rtol=rtol).all(),
                 "num_grad (" + name + ") has diff at " + str(self.place) +
                 "\nExpect " + str(num_grad[idx]) + "\n" + "But Got" +
                 str(ana_grad[idx]) + " in class " + self.__class__.__name__)
@@ -276,14 +277,14 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
+        h_boot = layers.data(shape=[self.input_dim],
+                             dtype='float32',
+                             name='h_boot')
         h_boot.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -333,6 +334,7 @@ class EagerDeletionRecurrentOpMultipleMemoryTest(EagerDeletionRecurrentOpTest1):
     '''
 
     class PySimpleRNN3(PyRNNBase):
+
         def __init__(self, input_shape, output_shape):
             super(EagerDeletionRecurrentOpMultipleMemoryTest.PySimpleRNN3,
                   self).__init__(input_shape, output_shape)
@@ -376,23 +378,20 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
-        h_boot1 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot1',
-            append_batch_size=False)
+        h_boot1 = layers.data(shape=[self.batch_size, self.input_dim],
+                              dtype='float32',
+                              name='h_boot1',
+                              append_batch_size=False)
         h_boot1.stop_gradient = False
-        h_boot2 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot2',
-            append_batch_size=False)
+        h_boot2 = layers.data(shape=[self.batch_size, self.input_dim],
+                              dtype='float32',
+                              name='h_boot2',
+                              append_batch_size=False)
         h_boot2.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -427,6 +426,7 @@ class EagerDeletionRecurrentOpNoMemBootTest(EagerDeletionRecurrentOpTest1):
     '''
 
     class PySimpleRNN4(PyRNNBase):
+
         def __init__(self, input_shape, output_shape):
             super(EagerDeletionRecurrentOpNoMemBootTest.PySimpleRNN4,
                   self).__init__(input_shape, output_shape)
@@ -459,11 +459,10 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -497,6 +496,7 @@ class EagerDeletionTwoRecurrentOpsTest(EagerDeletionRecurrentOpTest1):
     '''
 
     class PySimpleRNN5(PyRNNBase):
+
         def __init__(self, input_shape, output_shape):
             super(EagerDeletionTwoRecurrentOpsTest.PySimpleRNN5,
                   self).__init__(input_shape, output_shape)
@@ -536,11 +536,10 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
 
         rnn_0 = layers.StaticRNN()
@@ -564,8 +563,8 @@ def create_rnn_op(self):
         return rnn_1()
 
 
-class EagerDeletionRecurrentOpParallelExecutorTest(
-        EagerDeletionRecurrentOpTest1):
+class EagerDeletionRecurrentOpParallelExecutorTest(EagerDeletionRecurrentOpTest1
+                                                   ):
     '''
     Test RNNOp with ParallelExecutor
     equation:
@@ -587,11 +586,10 @@ def forward(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.enable_inplace = True
         exec_strategy = fluid.ExecutionStrategy()
-        parallel_exe = fluid.ParallelExecutor(
-            use_cuda=False,
-            main_program=self.main_program,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
+        parallel_exe = fluid.ParallelExecutor(use_cuda=False,
+                                              main_program=self.main_program,
+                                              build_strategy=build_strategy,
+                                              exec_strategy=exec_strategy)
         out = parallel_exe.run(feed=self.feed_map, fetch_list=[self.output])
         return out[0]
 
@@ -608,12 +606,11 @@ def backward(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.enable_inplace = True
         exec_strategy = fluid.ExecutionStrategy()
-        parallel_exe = fluid.ParallelExecutor(
-            use_cuda=False,
-            loss_name=self.output.name,
-            main_program=self.main_program,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
+        parallel_exe = fluid.ParallelExecutor(use_cuda=False,
+                                              loss_name=self.output.name,
+                                              main_program=self.main_program,
+                                              build_strategy=build_strategy,
+                                              exec_strategy=exec_strategy)
         return parallel_exe.run(feed=self.feed_map,
                                 fetch_list=fetch_list,
                                 return_numpy=False)
@@ -640,8 +637,9 @@ def setUp(self):
                 name='x',
                 append_batch_size=False)
             x.stop_gradient = False
-            h_boot = layers.data(
-                shape=[self.input_dim], dtype='float32', name='h_boot')
+            h_boot = layers.data(shape=[self.input_dim],
+                                 dtype='float32',
+                                 name='h_boot')
             h_boot.stop_gradient = False
 
             forward_only_rnn = layers.StaticRNN()
@@ -649,10 +647,8 @@ def setUp(self):
                 h_pre = forward_only_rnn.memory(init=h_boot)
                 x_t = forward_only_rnn.step_input(x)
 
-                h = layers.scale(
-                    x=layers.elementwise_add(
-                        x=h_pre, y=x_t),
-                    scale=self.py_rnn.scale)
+                h = layers.scale(x=layers.elementwise_add(x=h_pre, y=x_t),
+                                 scale=self.py_rnn.scale)
 
                 forward_only_rnn.update_memory(h_pre, h)
                 forward_only_rnn.output(h)
@@ -665,10 +661,8 @@ def setUp(self):
                 h_pre = rnn.memory(init=h_boot)
                 x_t = rnn.step_input(x)
 
-                h = layers.scale(
-                    x=layers.elementwise_add(
-                        x=h_pre, y=x_t),
-                    scale=self.py_rnn.scale)
+                h = layers.scale(x=layers.elementwise_add(x=h_pre, y=x_t),
+                                 scale=self.py_rnn.scale)
 
                 rnn.update_memory(h_pre, h)
                 rnn.output(h)
@@ -693,8 +687,7 @@ def check_forward(self):
         self.assertEqual(forward_only_output.shape, py_output.shape)
         self.assertEqual(pd_output.shape, py_output.shape)
         self.assertTrue(
-            np.isclose(
-                forward_only_output, py_output, rtol=0.01).all)
+            np.isclose(forward_only_output, py_output, rtol=0.01).all())
         self.assertTrue(np.isclose(pd_output, py_output, rtol=0.01).all())
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
index 936651d8324fc..41685fa4254bf 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import os
+
 os.environ['CPU_NUM'] = '2'
 
 import unittest
@@ -28,13 +29,17 @@
 import multiprocessing
 
 import paddle
+
 paddle.enable_static()
 fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 
 
 class TestEagerDeletionWhileOpBase(unittest.TestCase):
+
     def test_main(self):
-        places = [core.CPUPlace(), ]
+        places = [
+            core.CPUPlace(),
+        ]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
 
@@ -48,8 +53,8 @@ def run_main(self, place, with_data_parallel):
         self.place = place
         self.with_data_parallel = with_data_parallel
 
-        if not core.is_compiled_with_cuda() and isinstance(self.place,
-                                                           core.CUDAPlace):
+        if not core.is_compiled_with_cuda() and isinstance(
+                self.place, core.CUDAPlace):
             return
 
         if isinstance(self.place, core.CUDAPlace):
@@ -57,15 +62,21 @@ def run_main(self, place, with_data_parallel):
             ) if self.with_data_parallel else 1
         else:
             device_cnt = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count(
-                ))) if self.with_data_parallel else 1
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count())
+            ) if self.with_data_parallel else 1
 
-        d0 = layers.data(
-            "d0", shape=[10], append_batch_size=False, dtype='float32')
-        d1 = layers.data(
-            "d1", shape=[10], append_batch_size=False, dtype='float32')
-        d2 = layers.data(
-            "d2", shape=[10], append_batch_size=False, dtype='float32')
+        d0 = layers.data("d0",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
+        d1 = layers.data("d1",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
+        d2 = layers.data("d2",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
 
         i = layers.zeros(shape=[1], dtype='int64')
         i.stop_gradient = True
@@ -136,8 +147,9 @@ def run_main(self, place, with_data_parallel):
 
         prog = fluid.default_main_program()
         if self.with_data_parallel:
-            prog = compiler.CompiledProgram(fluid.default_main_program(
-            )).with_data_parallel(loss_name=loss.name)
+            prog = compiler.CompiledProgram(
+                fluid.default_main_program()).with_data_parallel(
+                    loss_name=loss.name)
 
         for _ in range(5):
             d = []
@@ -149,9 +161,11 @@ def run_main(self, place, with_data_parallel):
                     d.append(numpy.array([tmp] * device_cnt))
 
             outs = exe.run(program=prog,
-                           feed={'d0': d[0],
-                                 'd1': d[1],
-                                 'd2': d[2]},
+                           feed={
+                               'd0': d[0],
+                               'd1': d[1],
+                               'd2': d[2]
+                           },
                            fetch_list=[sum_result])
             self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_dist_api.py b/python/paddle/fluid/tests/unittests/test_eager_dist_api.py
index e00f90f4b0d5f..5355c58753e6f 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_dist_api.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_dist_api.py
@@ -19,6 +19,7 @@
 
 
 class TestProcessGroup(TestMultipleGpus):
+
     def test_process_group_nccl(self):
         self.run_mnist_2gpu('process_group_nccl.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_run_program.py b/python/paddle/fluid/tests/unittests/test_eager_run_program.py
index 0253f9a21c6ad..8d3ebcfbac5ac 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_run_program.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_run_program.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -59,17 +59,16 @@ def _create_out(var):
     var_desc = var.desc
     varbase = None
     if _in_legacy_dygraph():
-        var_base = core.VarBase(var_desc.dtype(),
-                                var_desc.shape(),
+        var_base = core.VarBase(var_desc.dtype(), var_desc.shape(),
                                 var_desc.name(), var_desc.type(), False)
     else:
-        var_base = core.eager.Tensor(var_desc.dtype(),
-                                     var_desc.shape(),
+        var_base = core.eager.Tensor(var_desc.dtype(), var_desc.shape(),
                                      var_desc.name(), var_desc.type(), False)
     return var_base
 
 
 class TestRunProgram(unittest.TestCase):
+
     def test_eager(self):
         paddle.set_device('cpu')
         paddle.enable_static()
@@ -104,7 +103,7 @@ def test_eager(self):
                      'is_test', False, 'program_id', _hash_with_id(program))
 
             _C_ops.run_program([x_t, y_t], [fake_var], [out_t], [scope],
-                               [fake_var], *attrs)
+                               [fake_var], None, *attrs)
 
             loss = paddle.mean(out_t)
             loss.backward()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py
index b67dbd0ba622d..1266e1c9a6a6e 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py
@@ -26,21 +26,26 @@
 
 
 class TestEagerTraceOp(unittest.TestCase):
+
     def test_branches(self):
         with _test_eager_guard():
             data = np.random.random([1, 1]).astype(np.float32)
             x = paddle.to_tensor(data)
 
             paddle.fluid.framework._dygraph_tracer().trace_op(
-                'broadcast_tensors', {'X': [x, x],
-                                      'Out': [x, x]}, {'Out': [x, x]}, {})
+                'broadcast_tensors', {
+                    'X': [x, x],
+                    'Out': [x, x]
+                }, {'Out': [x, x]}, {})
             paddle.fluid.framework._dygraph_tracer().trace_op(
                 'scale', {'X': x}, {'Out': x}, {'scale': 0.5})
 
             scale = paddle.to_tensor(np.random.random([1]).astype(np.float32))
             paddle.fluid.framework._dygraph_tracer().trace_op(
-                'instance_norm', {'Scale': [scale],
-                                  'X': [x]}, {'Y': [x]}, {})
+                'instance_norm', {
+                    'Scale': [scale],
+                    'X': [x]
+                }, {'Y': [x]}, {})
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
index ba48b143a8e43..561a379b6fa62 100644
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
@@ -51,6 +51,7 @@ def Levenshtein(hyp, ref):
 
 
 class TestEditDistanceOp(OpTest):
+
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = False
@@ -86,6 +87,7 @@ def test_check_output(self):
 
 
 class TestEditDistanceOpNormalizedCase0(OpTest):
+
     def reset_config(self):
         pass
 
@@ -134,18 +136,21 @@ def test_check_output(self):
 
 
 class TestEditDistanceOpNormalizedCase1(TestEditDistanceOpNormalizedCase0):
+
     def reset_config(self):
         self.x1_lod = [0, 6, 0]
         self.x2_lod = [2, 1, 2]
 
 
 class TestEditDistanceOpNormalizedCase2(TestEditDistanceOpNormalizedCase0):
+
     def reset_config(self):
         self.x1_lod = [0, 0, 6]
         self.x2_lod = [2, 2, 1]
 
 
 class TestEditDistanceOpNormalizedTensor(OpTest):
+
     def reset_config(self):
         self.x1 = np.array([[10, 3, 0, 0], [6, 5, 8, 2]], dtype=np.int64)
         self.x2 = np.array([[10, 4, 0], [6, 7, 8]], dtype=np.int64)
@@ -163,9 +168,8 @@ def setUp(self):
         sequence_num = np.array(num_strs).astype("int64")
 
         for i in range(0, num_strs):
-            distance[i] = Levenshtein(
-                hyp=self.x1[i][0:self.x1_lod[i]],
-                ref=self.x2[i][0:self.x2_lod[i]])
+            distance[i] = Levenshtein(hyp=self.x1[i][0:self.x1_lod[i]],
+                                      ref=self.x2[i][0:self.x2_lod[i]])
             if normalized is True:
                 len_ref = self.x2_lod[i]
                 distance[i] = distance[i] / len_ref
diff --git a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py
index 45cb7e785bc5e..4afbe2d715592 100644
--- a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py
+++ b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class EagerOpAPIGenerateTestCase(unittest.TestCase):
+
     def test_elementwise_add(self):
         with _test_eager_guard():
             paddle.set_device("cpu")
@@ -35,8 +36,8 @@ def test_elementwise_add(self):
 
     def test_sum(self):
         with _test_eager_guard():
-            x_data = np.array(
-                [[0.2, 0.3, 0.5, 0.9], [0.1, 0.2, 0.6, 0.7]]).astype('float32')
+            x_data = np.array([[0.2, 0.3, 0.5, 0.9], [0.1, 0.2, 0.6,
+                                                      0.7]]).astype('float32')
             x = paddle.to_tensor(x_data, 'float32')
             out = paddle.sum(x, axis=0)
             out_arr = out.numpy()
@@ -61,8 +62,8 @@ def test_sigmoid(self):
             out = paddle.nn.functional.sigmoid(x)
             out_arr = out.numpy()
             out_arr_expected = np.array(
-                [0.40131234, 0.450166, 0.52497919, 0.57444252]).astype(
-                    'float32')
+                [0.40131234, 0.450166, 0.52497919,
+                 0.57444252]).astype('float32')
             self.assertTrue(np.allclose(out_arr, out_arr_expected))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
index bb8c6346eb5a5..7fe755225f41a 100644
--- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py
+++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class EagerScaleTestCase(unittest.TestCase):
+
     def test_scale_base(self):
         with _test_eager_guard():
             paddle.set_device("cpu")
@@ -85,6 +86,7 @@ def test_retain_grad_and_run_backward_raises(self):
 
 
 class EagerDtypeTestCase(unittest.TestCase):
+
     def check_to_tesnsor_and_numpy(self, dtype, proto_dtype):
         with _test_eager_guard():
             arr = np.random.random([4, 16, 16, 32]).astype(dtype)
@@ -110,6 +112,7 @@ def test_dtype_base(self):
 
 
 class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
+
     def constructor(self, place):
         egr_tensor = core.eager.Tensor()
         self.assertEqual(egr_tensor.persistable, False)
@@ -170,8 +173,8 @@ def constructor(self, place):
         self.assertTrue(
             egr_tensor4.place._equals(
                 paddle.fluid.framework._current_expected_place()))
-        self.assertTrue(
-            np.array_equal(egr_tensor4.numpy(), egr_tensor3.numpy()))
+        self.assertTrue(np.array_equal(egr_tensor4.numpy(),
+                                       egr_tensor3.numpy()))
 
         arr4 = np.random.rand(4, 16, 16, 32).astype('float32')
         egr_tensor5 = core.eager.Tensor(arr4, place)
@@ -190,8 +193,8 @@ def constructor(self, place):
         self.assertEqual(egr_tensor6.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor6.stop_gradient, True)
         self.assertEqual(egr_tensor6.place.is_cpu_place(), True)
-        self.assertTrue(
-            np.array_equal(egr_tensor6.numpy(), egr_tensor5.numpy()))
+        self.assertTrue(np.array_equal(egr_tensor6.numpy(),
+                                       egr_tensor5.numpy()))
 
         egr_tensor7 = core.eager.Tensor(arr4, place, True)
         self.assertEqual(egr_tensor7.persistable, True)
@@ -209,8 +212,8 @@ def constructor(self, place):
         self.assertEqual(egr_tensor8.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor8.stop_gradient, True)
         self.assertTrue(egr_tensor8.place._equals(place))
-        self.assertTrue(
-            np.array_equal(egr_tensor8.numpy(), egr_tensor5.numpy()))
+        self.assertTrue(np.array_equal(egr_tensor8.numpy(),
+                                       egr_tensor5.numpy()))
 
         egr_tensor9 = core.eager.Tensor(arr4, place, True, True)
         self.assertEqual(egr_tensor9.persistable, True)
@@ -279,8 +282,9 @@ def constructor(self, place):
                 "The type of trainable MUST be bool, but the type is /*"):
             eager_param.trainable = "False"
 
-        eager_param_2 = EagerParamBase(
-            shape=paddle.shape(paddle.to_tensor([1, 2, 3, 4])), dtype="float32")
+        eager_param_2 = EagerParamBase(shape=paddle.shape(
+            paddle.to_tensor([1, 2, 3, 4])),
+                                       dtype="float32")
         self.assertTrue(eager_param_2.trainable)
         eager_param_2.trainable = False
         self.assertFalse(eager_param_2.trainable)
@@ -329,8 +333,9 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor2.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor2.stop_gradient, True)
 
-        egr_tensor3 = core.eager.Tensor(
-            arr, place=place, name="new_eager_tensor")
+        egr_tensor3 = core.eager.Tensor(arr,
+                                        place=place,
+                                        name="new_eager_tensor")
         self.assertEqual(egr_tensor3.persistable, False)
         self.assertTrue("new_eager_tensor" in egr_tensor3.name)
         self.assertEqual(egr_tensor3.shape, [4, 16, 16, 32])
@@ -338,8 +343,10 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor3.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor3.stop_gradient, True)
 
-        egr_tensor4 = core.eager.Tensor(
-            arr, place=place, persistable=True, name="new_eager_tensor")
+        egr_tensor4 = core.eager.Tensor(arr,
+                                        place=place,
+                                        persistable=True,
+                                        name="new_eager_tensor")
         self.assertEqual(egr_tensor4.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor4.name)
         self.assertEqual(egr_tensor4.shape, [4, 16, 16, 32])
@@ -347,12 +354,11 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor4.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor4.stop_gradient, True)
 
-        egr_tensor5 = core.eager.Tensor(
-            arr,
-            core.CPUPlace(),
-            persistable=True,
-            name="new_eager_tensor",
-            zero_copy=True)
+        egr_tensor5 = core.eager.Tensor(arr,
+                                        core.CPUPlace(),
+                                        persistable=True,
+                                        name="new_eager_tensor",
+                                        zero_copy=True)
         self.assertEqual(egr_tensor5.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor5.name)
         self.assertEqual(egr_tensor5.shape, [4, 16, 16, 32])
@@ -360,12 +366,11 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor5.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor5.stop_gradient, True)
 
-        egr_tensor6 = core.eager.Tensor(
-            arr,
-            place=core.CPUPlace(),
-            persistable=True,
-            name="new_eager_tensor",
-            zero_copy=True)
+        egr_tensor6 = core.eager.Tensor(arr,
+                                        place=core.CPUPlace(),
+                                        persistable=True,
+                                        name="new_eager_tensor",
+                                        zero_copy=True)
         self.assertEqual(egr_tensor6.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor6.name)
         self.assertEqual(egr_tensor6.shape, [4, 16, 16, 32])
@@ -373,12 +378,11 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor6.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor6.stop_gradient, True)
 
-        egr_tensor7 = core.eager.Tensor(
-            arr,
-            place=place,
-            persistable=True,
-            name="new_eager_tensor",
-            zero_copy=True)
+        egr_tensor7 = core.eager.Tensor(arr,
+                                        place=place,
+                                        persistable=True,
+                                        name="new_eager_tensor",
+                                        zero_copy=True)
         self.assertEqual(egr_tensor7.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor7.name)
         self.assertEqual(egr_tensor7.shape, [4, 16, 16, 32])
@@ -386,13 +390,12 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor7.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor7.stop_gradient, True)
 
-        egr_tensor8 = core.eager.Tensor(
-            arr,
-            place=place,
-            persistable=True,
-            name="new_eager_tensor",
-            zero_copy=True,
-            stop_gradient=False)
+        egr_tensor8 = core.eager.Tensor(arr,
+                                        place=place,
+                                        persistable=True,
+                                        name="new_eager_tensor",
+                                        zero_copy=True,
+                                        stop_gradient=False)
         self.assertEqual(egr_tensor8.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor8.name)
         self.assertEqual(egr_tensor8.shape, [4, 16, 16, 32])
@@ -400,8 +403,12 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor8.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor8.stop_gradient, False)
 
-        egr_tensor9 = core.eager.Tensor(
-            arr, place, True, True, "new_eager_tensor", stop_gradient=False)
+        egr_tensor9 = core.eager.Tensor(arr,
+                                        place,
+                                        True,
+                                        True,
+                                        "new_eager_tensor",
+                                        stop_gradient=False)
         self.assertEqual(egr_tensor9.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor9.name)
         self.assertEqual(egr_tensor9.shape, [4, 16, 16, 32])
@@ -409,13 +416,12 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor9.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor9.stop_gradient, False)
 
-        egr_tensor10 = core.eager.Tensor(
-            arr,
-            place,
-            True,
-            True,
-            name="new_eager_tensor",
-            stop_gradient=False)
+        egr_tensor10 = core.eager.Tensor(arr,
+                                         place,
+                                         True,
+                                         True,
+                                         name="new_eager_tensor",
+                                         stop_gradient=False)
         self.assertEqual(egr_tensor10.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor10.name)
         self.assertEqual(egr_tensor10.shape, [4, 16, 16, 32])
@@ -423,13 +429,12 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor10.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor10.stop_gradient, False)
 
-        egr_tensor11 = core.eager.Tensor(
-            arr,
-            place,
-            True,
-            zero_copy=True,
-            name="new_eager_tensor",
-            stop_gradient=False)
+        egr_tensor11 = core.eager.Tensor(arr,
+                                         place,
+                                         True,
+                                         zero_copy=True,
+                                         name="new_eager_tensor",
+                                         stop_gradient=False)
         self.assertEqual(egr_tensor11.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor11.name)
         self.assertEqual(egr_tensor11.shape, [4, 16, 16, 32])
@@ -437,13 +442,12 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor11.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor11.stop_gradient, False)
 
-        egr_tensor12 = core.eager.Tensor(
-            arr,
-            place,
-            persistable=True,
-            zero_copy=True,
-            name="new_eager_tensor",
-            stop_gradient=False)
+        egr_tensor12 = core.eager.Tensor(arr,
+                                         place,
+                                         persistable=True,
+                                         zero_copy=True,
+                                         name="new_eager_tensor",
+                                         stop_gradient=False)
         self.assertEqual(egr_tensor12.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor12.name)
         self.assertEqual(egr_tensor12.shape, [4, 16, 16, 32])
@@ -451,13 +455,12 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32)
         self.assertEqual(egr_tensor12.stop_gradient, False)
 
-        egr_tensor13 = core.eager.Tensor(
-            value=arr,
-            place=place,
-            persistable=True,
-            zero_copy=True,
-            name="new_eager_tensor",
-            stop_gradient=False)
+        egr_tensor13 = core.eager.Tensor(value=arr,
+                                         place=place,
+                                         persistable=True,
+                                         zero_copy=True,
+                                         name="new_eager_tensor",
+                                         stop_gradient=False)
         self.assertEqual(egr_tensor13.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor13.name)
         self.assertEqual(egr_tensor13.shape, [4, 16, 16, 32])
@@ -466,12 +469,11 @@ def constructor_with_kwargs(self, place):
         self.assertEqual(egr_tensor13.stop_gradient, False)
 
         # special case
-        egr_tensor14 = core.eager.Tensor(
-            dtype=core.VarDesc.VarType.FP32,
-            dims=[4, 16, 16, 32],
-            name="special_eager_tensor",
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            persistable=True)
+        egr_tensor14 = core.eager.Tensor(dtype=core.VarDesc.VarType.FP32,
+                                         dims=[4, 16, 16, 32],
+                                         name="special_eager_tensor",
+                                         type=core.VarDesc.VarType.LOD_TENSOR,
+                                         persistable=True)
         self.assertEqual(egr_tensor14.persistable, True)
         self.assertEqual(egr_tensor14.name, "special_eager_tensor")
         self.assertEqual(egr_tensor14.shape, [4, 16, 16, 32])
@@ -490,8 +492,8 @@ def constructor_with_kwargs(self, place):
         self.assertTrue(
             np.array_equal(egr_tensor15.numpy(), egr_tensor4.numpy()))
 
-        egr_tensor16 = core.eager.Tensor(
-            value=egr_tensor4, name="new_eager_tensor")
+        egr_tensor16 = core.eager.Tensor(value=egr_tensor4,
+                                         name="new_eager_tensor")
         self.assertEqual(egr_tensor16.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor16.name)
         self.assertEqual(egr_tensor16.shape, egr_tensor4.shape)
@@ -506,7 +508,8 @@ def constructor_with_kwargs(self, place):
         egr_tensor17 = core.eager.Tensor(
             value=egr_tensor4,
             place=place,
-            name="new_eager_tensor", )
+            name="new_eager_tensor",
+        )
         self.assertEqual(egr_tensor17.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor17.name)
         self.assertEqual(egr_tensor17.shape, egr_tensor4.shape)
@@ -519,7 +522,8 @@ def constructor_with_kwargs(self, place):
         egr_tensor18 = core.eager.Tensor(
             egr_tensor4,
             place=place,
-            name="new_eager_tensor", )
+            name="new_eager_tensor",
+        )
         self.assertEqual(egr_tensor18.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor18.name)
         self.assertEqual(egr_tensor18.shape, egr_tensor4.shape)
@@ -532,7 +536,8 @@ def constructor_with_kwargs(self, place):
         egr_tensor19 = core.eager.Tensor(
             egr_tensor4,
             place,
-            name="new_eager_tensor", )
+            name="new_eager_tensor",
+        )
         self.assertEqual(egr_tensor19.persistable, True)
         self.assertTrue("new_eager_tensor" in egr_tensor19.name)
         self.assertEqual(egr_tensor19.shape, egr_tensor4.shape)
@@ -584,8 +589,9 @@ def constructor_with_kwargs(self, place):
         self.assertTrue(egr_tensor23.place._equals(place))
         self.assertTrue(np.array_equal(egr_tensor23.numpy(), x))
 
-        egr_tensor24 = core.eager.Tensor(
-            value=t, place=place, name="from_framework_tensor")
+        egr_tensor24 = core.eager.Tensor(value=t,
+                                         place=place,
+                                         name="from_framework_tensor")
         self.assertEqual(egr_tensor24.persistable, False)
         self.assertTrue("from_framework_tensor" in egr_tensor24.name)
         self.assertEqual(egr_tensor24.shape, [3, 3])
@@ -596,7 +602,7 @@ def constructor_with_kwargs(self, place):
 
         # Bad usage
         # SyntaxError: positional argument follows keyword argument
-        # egr_tensor25 = core.eager.Tensor(value=t, place) 
+        # egr_tensor25 = core.eager.Tensor(value=t, place)
 
     def test_constructor_with_kwargs(self):
         print("Test_constructor_with_kwargs")
@@ -770,14 +776,14 @@ def test_place_guard(self):
             paddle.set_device("gpu:0")
             with paddle.fluid.framework._dygraph_place_guard(core.CPUPlace()):
                 self.assertTrue(
-                    isinstance(_current_expected_place(), type(core.CPUPlace(
-                    ))))
+                    isinstance(_current_expected_place(),
+                               type(core.CPUPlace())))
         else:
             paddle.set_device("cpu")
             with paddle.fluid.framework._dygraph_place_guard(core.CPUPlace()):
                 self.assertTrue(
-                    isinstance(_current_expected_place(), type(core.CPUPlace(
-                    ))))
+                    isinstance(_current_expected_place(),
+                               type(core.CPUPlace())))
 
     def test_value(self):
         with _test_eager_guard():
@@ -819,8 +825,7 @@ def test_set_value(self):
     def test_sharding_related_api(self):
         with _test_eager_guard():
             arr0 = np.random.rand(4, 16, 16, 32).astype('float32')
-            egr_tensor1 = core.eager.Tensor(arr0,
-                                            core.CPUPlace(), True, False,
+            egr_tensor1 = core.eager.Tensor(arr0, core.CPUPlace(), True, False,
                                             "numpy_tensor1", False)
             self.assertEqual(egr_tensor1._numel(), 32768)
             self.assertEqual(egr_tensor1._slice(0, 2)._numel(), 16384)
@@ -846,6 +851,7 @@ def test_clear(self):
 
 
 class EagerParamBaseUsageTestCase(unittest.TestCase):
+
     def test_print(self):
         with _test_eager_guard():
             linear = paddle.nn.Linear(3, 3, bias_attr=False)
@@ -881,8 +887,10 @@ def func_fp16_initilaizer(self):
             bias_attr=False,
             weight_attr=paddle.fluid.initializer.MSRAInitializer())
         res = [
-            linear1.weight.numpy(), linear2.weight.numpy(),
-            linear3.weight.numpy(), linear4.weight.numpy()
+            linear1.weight.numpy(),
+            linear2.weight.numpy(),
+            linear3.weight.numpy(),
+            linear4.weight.numpy()
         ]
         paddle.set_default_dtype("float32")
         return res
@@ -900,8 +908,8 @@ def test_fp16_initializer(self):
             self.assertTrue(np.array_equal(res1[i], res2[i]))
 
     def func_layer_helper_base(self, value):
-        base = paddle.fluid.layer_helper_base.LayerHelperBase("test_layer",
-                                                              "test_layer")
+        base = paddle.fluid.layer_helper_base.LayerHelperBase(
+            "test_layer", "test_layer")
         return base.to_variable(value).numpy()
 
     def func_base_to_variable(self, value):
@@ -950,6 +958,7 @@ def test_set_value(self):
 
 
 class EagerGuardTestCase(unittest.TestCase):
+
     def test__test_eager_guard(self):
         tracer = paddle.fluid.dygraph.tracer.Tracer()
         with _test_eager_guard(tracer):
diff --git a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py
index def5f569b8f4c..3b5ec683bc7bd 100644
--- a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py
+++ b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class EagerStringTensorTestCase(unittest.TestCase):
+
     def setUp(self):
         self.str_arr = np.array([
             ["15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很美观，做工也相当不错"
@@ -40,9 +41,7 @@ def test_constructor_with_args(self):
             self.assertEqual(ST2.name, "ST2")
             self.assertEqual(ST2.shape, shape)
             self.assertTrue(
-                np.array_equal(
-                    ST2.numpy(), np.empty(
-                        shape, dtype=np.unicode_)))
+                np.array_equal(ST2.numpy(), np.empty(shape, dtype=np.unicode_)))
 
             ST3 = core.eager.StringTensor(self.str_arr, "ST3")  # constructor 3
             self.assertEqual(ST3.name, "ST3")
@@ -71,17 +70,15 @@ def test_constructor_with_args(self):
     def test_constructor_with_kwargs(self):
         with _test_eager_guard():
             shape = [2, 3]
-            ST1 = core.eager.StringTensor(
-                dims=shape, name="ST1")  # constructor 2
+            ST1 = core.eager.StringTensor(dims=shape,
+                                          name="ST1")  # constructor 2
             self.assertEqual(ST1.name, "ST1")
             self.assertEqual(ST1.shape, shape)
             self.assertTrue(
-                np.array_equal(
-                    ST1.numpy(), np.empty(
-                        shape, dtype=np.unicode_)))
+                np.array_equal(ST1.numpy(), np.empty(shape, dtype=np.unicode_)))
 
-            ST2 = core.eager.StringTensor(
-                self.str_arr, name="ST2")  # constructor 3
+            ST2 = core.eager.StringTensor(self.str_arr,
+                                          name="ST2")  # constructor 3
             self.assertEqual(ST2.name, "ST2")
             self.assertEqual(ST2.shape, list(self.str_arr.shape))
             self.assertTrue(np.array_equal(ST2.numpy(), self.str_arr))
@@ -91,8 +88,8 @@ def test_constructor_with_kwargs(self):
             self.assertEqual(ST3.shape, list(self.str_arr.shape))
             self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr))
 
-            ST4 = core.eager.StringTensor(
-                value=ST2, name="ST4")  # constructor 6
+            ST4 = core.eager.StringTensor(value=ST2,
+                                          name="ST4")  # constructor 6
             self.assertEqual(ST4.name, "ST4")
             self.assertEqual(ST4.shape, list(self.str_arr.shape))
             self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr))
diff --git a/python/paddle/fluid/tests/unittests/test_eig_op.py b/python/paddle/fluid/tests/unittests/test_eig_op.py
index bb83de7d0dd67..b4044c9e7991c 100644
--- a/python/paddle/fluid/tests/unittests/test_eig_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eig_op.py
@@ -59,6 +59,7 @@ def eig_backward(w, v, grad_w, grad_v):
 
 
 class TestEigOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         paddle.device.set_device("cpu")
@@ -142,26 +143,28 @@ def init_grad(self):
                                    self.grad_v)
 
     def test_check_output(self):
-        self.check_output_with_place_customized(
-            checker=self.checker, place=core.CPUPlace())
+        self.check_output_with_place_customized(checker=self.checker,
+                                                place=core.CPUPlace())
 
     def test_check_grad(self):
         self.init_grad()
-        self.check_grad(
-            ['X'], ['Eigenvalues', 'Eigenvectors'],
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_w, self.grad_v])
+        self.check_grad(['X'], ['Eigenvalues', 'Eigenvectors'],
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_w, self.grad_v])
 
 
 class TestComplex128(TestEigOp):
+
     def set_dtype(self):
         self.dtype = np.complex128
 
 
 @skip_check_grad_ci(
-    reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
+    reason=
+    "For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
 )
 class TestDouble(TestEigOp):
+
     def set_dtype(self):
         self.dtype = np.float64
 
@@ -170,9 +173,11 @@ def test_check_grad(self):
 
 
 @skip_check_grad_ci(
-    reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
+    reason=
+    "For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
 )
 class TestEigBatchMarices(TestEigOp):
+
     def set_dtype(self):
         self.dtype = np.float64
 
@@ -184,9 +189,11 @@ def test_check_grad(self):
 
 
 @skip_check_grad_ci(
-    reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
+    reason=
+    "For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
 )
 class TestFloat(TestEigOp):
+
     def set_dtype(self):
         self.dtype = np.float32
 
@@ -195,6 +202,7 @@ def test_check_grad(self):
 
 
 class TestEigStatic(TestEigOp):
+
     def test_check_output_with_place(self):
         paddle.enable_static()
         place = core.CPUPlace()
@@ -209,17 +217,18 @@ def test_check_output_with_place(self):
                                            feed={"input": input_np},
                                            fetch_list=[act_val, act_vec])
         self.assertTrue(
-            np.allclose(expect_val, fetch_val, 1e-6, 1e-6),
-            "The eigen values have diff: \nExpected " + str(expect_val) + "\n" +
-            "But got: " + str(fetch_val))
+            np.allclose(expect_val, fetch_val, 1e-6,
+                        1e-6), "The eigen values have diff: \nExpected " +
+            str(expect_val) + "\n" + "But got: " + str(fetch_val))
         self.assertTrue(
-            np.allclose(np.abs(expect_vec), np.abs(fetch_vec), 1e-6, 1e-6),
-            "The eigen vectors have diff: \nExpected " +
+            np.allclose(np.abs(expect_vec), np.abs(fetch_vec), 1e-6,
+                        1e-6), "The eigen vectors have diff: \nExpected " +
             str(np.abs(expect_vec)) + "\n" + "But got: " +
             str(np.abs(fetch_vec)))
 
 
 class TestEigWrongDimsError(unittest.TestCase):
+
     def test_error(self):
         paddle.device.set_device("cpu")
         paddle.disable_static()
@@ -229,6 +238,7 @@ def test_error(self):
 
 
 class TestEigNotSquareError(unittest.TestCase):
+
     def test_error(self):
         paddle.device.set_device("cpu")
         paddle.disable_static()
@@ -238,6 +248,7 @@ def test_error(self):
 
 
 class TestEigUnsupportedDtypeError(unittest.TestCase):
+
     def test_error(self):
         paddle.device.set_device("cpu")
         paddle.disable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_eigh_op.py b/python/paddle/fluid/tests/unittests/test_eigh_op.py
index 2abbcc98a6b7e..cc5fdcca6e1c5 100644
--- a/python/paddle/fluid/tests/unittests/test_eigh_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eigh_op.py
@@ -56,7 +56,7 @@ def valid_single_eigh_result(A, eigh_value, eigh_vector, uplo):
     T = np.diag(eigh_value)
 
     # A = Q*T*Q'
-    residual = A - (eigh_vector @T @np.linalg.inv(eigh_vector))
+    residual = A - (eigh_vector @ T @ np.linalg.inv(eigh_vector))
 
     # ||A - Q*T*Q'|| / (N*||A||) < rtol
     np.testing.assert_array_less(
@@ -64,11 +64,12 @@ def valid_single_eigh_result(A, eigh_value, eigh_vector, uplo):
         rtol)
 
     # ||I - Q*Q'|| / M < rtol
-    residual = np.eye(M) - eigh_vector @np.linalg.inv(eigh_vector)
+    residual = np.eye(M) - eigh_vector @ np.linalg.inv(eigh_vector)
     np.testing.assert_array_less(np.linalg.norm(residual, np.inf) / M, rtol)
 
 
 class TestEighOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.op_type = "eigh"
@@ -96,11 +97,13 @@ def test_grad(self):
 
 
 class TestEighUPLOCase(TestEighOp):
+
     def init_config(self):
         self.UPLO = 'U'
 
 
 class TestEighGPUCase(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [32, 32]
         self.dtype = "float32"
@@ -113,11 +116,12 @@ def test_check_output_gpu(self):
             paddle.disable_static(place=paddle.CUDAPlace(0))
             input_real_data = paddle.to_tensor(self.x_np)
             actual_w, actual_v = paddle.linalg.eigh(input_real_data, self.UPLO)
-            valid_eigh_result(self.x_np,
-                              actual_w.numpy(), actual_v.numpy(), self.UPLO)
+            valid_eigh_result(self.x_np, actual_w.numpy(), actual_v.numpy(),
+                              self.UPLO)
 
 
 class TestEighAPI(unittest.TestCase):
+
     def setUp(self):
         self.init_input_data()
         self.UPLO = 'L'
@@ -147,8 +151,9 @@ def check_static_float_result(self):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, startup_prog):
-            input_x = paddle.static.data(
-                'input_x', shape=self.x_shape, dtype=self.dtype)
+            input_x = paddle.static.data('input_x',
+                                         shape=self.x_shape,
+                                         dtype=self.dtype)
             output_w, output_v = paddle.linalg.eigh(input_x)
             exe = paddle.static.Executor(self.place)
             actual_w, actual_v = exe.run(main_prog,
@@ -161,8 +166,9 @@ def check_static_complex_result(self):
         startup_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, startup_prog):
             x_dtype = np.complex64 if self.dtype == "float32" else np.complex128
-            input_x = paddle.static.data(
-                'input_x', shape=self.x_shape, dtype=x_dtype)
+            input_x = paddle.static.data('input_x',
+                                         shape=self.x_shape,
+                                         dtype=x_dtype)
             output_w, output_v = paddle.linalg.eigh(input_x)
             exe = paddle.static.Executor(self.place)
             actual_w, actual_v = exe.run(main_prog,
@@ -179,55 +185,61 @@ def test_in_dynamic_mode(self):
         paddle.disable_static()
         input_real_data = paddle.to_tensor(self.real_data)
         actual_w, actual_v = paddle.linalg.eigh(input_real_data)
-        valid_eigh_result(self.real_data,
-                          actual_w.numpy(), actual_v.numpy(), self.UPLO)
+        valid_eigh_result(self.real_data, actual_w.numpy(), actual_v.numpy(),
+                          self.UPLO)
 
         input_complex_data = paddle.to_tensor(self.complex_symm)
         actual_w, actual_v = paddle.linalg.eigh(input_complex_data)
-        valid_eigh_result(self.complex_symm,
-                          actual_w.numpy(), actual_v.numpy(), self.UPLO)
+        valid_eigh_result(self.complex_symm, actual_w.numpy(), actual_v.numpy(),
+                          self.UPLO)
 
     def test_eigh_grad(self):
         paddle.disable_static()
         x = paddle.to_tensor(self.complex_symm, stop_gradient=False)
         w, v = paddle.linalg.eigh(x)
         (w.sum() + paddle.abs(v).sum()).backward()
-        np.testing.assert_allclose(
-            abs(x.grad.numpy()),
-            abs(x.grad.numpy().conj().transpose(self.trans_dims)),
-            rtol=self.rtol,
-            atol=self.atol)
+        np.testing.assert_allclose(abs(x.grad.numpy()),
+                                   abs(x.grad.numpy().conj().transpose(
+                                       self.trans_dims)),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
 
 class TestEighBatchAPI(TestEighAPI):
+
     def init_input_shape(self):
         self.x_shape = [2, 5, 5]
 
 
 class TestEighAPIError(unittest.TestCase):
+
     def test_error(self):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, startup_prog):
             #input maxtrix must greater than 2 dimensions
-            input_x = paddle.static.data(
-                name='x_1', shape=[12], dtype='float32')
+            input_x = paddle.static.data(name='x_1',
+                                         shape=[12],
+                                         dtype='float32')
             self.assertRaises(ValueError, paddle.linalg.eigh, input_x)
 
             #input matrix must be square matrix
-            input_x = paddle.static.data(
-                name='x_2', shape=[12, 32], dtype='float32')
+            input_x = paddle.static.data(name='x_2',
+                                         shape=[12, 32],
+                                         dtype='float32')
             self.assertRaises(ValueError, paddle.linalg.eigh, input_x)
 
             #uplo must be in 'L' or 'U'
-            input_x = paddle.static.data(
-                name='x_3', shape=[4, 4], dtype="float32")
+            input_x = paddle.static.data(name='x_3',
+                                         shape=[4, 4],
+                                         dtype="float32")
             uplo = 'R'
             self.assertRaises(ValueError, paddle.linalg.eigh, input_x, uplo)
 
             #x_data cannot be integer
-            input_x = paddle.static.data(
-                name='x_4', shape=[4, 4], dtype="int32")
+            input_x = paddle.static.data(name='x_4',
+                                         shape=[4, 4],
+                                         dtype="int32")
             self.assertRaises(TypeError, paddle.linalg.eigh, input_x)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_eigvals_op.py b/python/paddle/fluid/tests/unittests/test_eigvals_op.py
index eff9d4ea6e801..6d52d7fa4d156 100644
--- a/python/paddle/fluid/tests/unittests/test_eigvals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eigvals_op.py
@@ -33,6 +33,7 @@ def np_eigvals(a):
 
 
 class TestEigvalsOp(OpTest):
+
     def setUp(self):
         np.random.seed(0)
         paddle.enable_static()
@@ -57,14 +58,14 @@ def set_input_data(self):
             self.input_data = np.random.random(self.input_dims).astype(
                 self.dtype)
         else:
-            self.input_data = (
-                np.random.random(self.input_dims) +
-                np.random.random(self.input_dims) * 1j).astype(self.dtype)
+            self.input_data = (np.random.random(self.input_dims) +
+                               np.random.random(self.input_dims) * 1j).astype(
+                                   self.dtype)
 
     def test_check_output(self):
         self.__class__.no_need_check_grad = True
-        self.check_output_with_place_customized(
-            checker=self.verify_output, place=core.CPUPlace())
+        self.check_output_with_place_customized(checker=self.verify_output,
+                                                place=core.CPUPlace())
 
     def verify_output(self, outs):
         actual_outs = np.sort(np.array(outs[0]))
@@ -75,9 +76,8 @@ def verify_output(self, outs):
             str(actual_outs.shape) + " in class " + self.__class__.__name__)
 
         n_dim = actual_outs.shape[-1]
-        for actual_row, expect_row in zip(
-                actual_outs.reshape((-1, n_dim)),
-                expect_outs.reshape((-1, n_dim))):
+        for actual_row, expect_row in zip(actual_outs.reshape((-1, n_dim)),
+                                          expect_outs.reshape((-1, n_dim))):
             is_mapped_index = np.zeros((n_dim, ))
             for i in range(n_dim):
                 is_mapped = False
@@ -98,56 +98,67 @@ def verify_output(self, outs):
 
 
 class TestEigvalsOpFloat64(TestEigvalsOp):
+
     def set_dtype(self):
         self.dtype = np.float64
 
 
 class TestEigvalsOpComplex64(TestEigvalsOp):
+
     def set_dtype(self):
         self.dtype = np.complex64
 
 
 class TestEigvalsOpComplex128(TestEigvalsOp):
+
     def set_dtype(self):
         self.dtype = np.complex128
 
 
 class TestEigvalsOpLargeScare(TestEigvalsOp):
+
     def set_input_dims(self):
         self.input_dims = (128, 128)
 
 
 class TestEigvalsOpLargeScareFloat64(TestEigvalsOpLargeScare):
+
     def set_dtype(self):
         self.dtype = np.float64
 
 
 class TestEigvalsOpLargeScareComplex64(TestEigvalsOpLargeScare):
+
     def set_dtype(self):
         self.dtype = np.complex64
 
 
 class TestEigvalsOpLargeScareComplex128(TestEigvalsOpLargeScare):
+
     def set_dtype(self):
         self.dtype = np.complex128
 
 
 class TestEigvalsOpBatch1(TestEigvalsOp):
+
     def set_input_dims(self):
         self.input_dims = (1, 2, 3, 4, 4)
 
 
 class TestEigvalsOpBatch2(TestEigvalsOp):
+
     def set_input_dims(self):
         self.input_dims = (3, 1, 4, 5, 5)
 
 
 class TestEigvalsOpBatch3(TestEigvalsOp):
+
     def set_input_dims(self):
         self.input_dims = (6, 2, 9, 6, 6)
 
 
 class TestEigvalsAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(0)
 
@@ -177,9 +188,9 @@ def set_input_data(self):
             self.input_data = np.random.random(self.input_dims).astype(
                 self.dtype)
         else:
-            self.input_data = (
-                np.random.random(self.input_dims) +
-                np.random.random(self.input_dims) * 1j).astype(self.dtype)
+            self.input_data = (np.random.random(self.input_dims) +
+                               np.random.random(self.input_dims) * 1j).astype(
+                                   self.dtype)
 
     def verify_output(self, actural_outs, expect_outs):
         actual_outs = np.array(actural_outs)
@@ -190,9 +201,8 @@ def verify_output(self, actural_outs, expect_outs):
             str(actual_outs.shape) + " in class " + self.__class__.__name__)
 
         n_dim = actual_outs.shape[-1]
-        for actual_row, expect_row in zip(
-                actual_outs.reshape((-1, n_dim)),
-                expect_outs.reshape((-1, n_dim))):
+        for actual_row, expect_row in zip(actual_outs.reshape((-1, n_dim)),
+                                          expect_outs.reshape((-1, n_dim))):
             is_mapped_index = np.zeros((n_dim, ))
             for i in range(n_dim):
                 is_mapped = False
@@ -234,19 +244,22 @@ def run_static(self, place):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            small_input_tensor = paddle.static.data(
-                name='small_x', shape=self.small_dims, dtype=self.dtype)
-            large_input_tensor = paddle.static.data(
-                name='large_x', shape=self.large_dims, dtype=self.dtype)
-            batch_input_tensor = paddle.static.data(
-                name='batch_x', shape=self.batch_dims, dtype=self.dtype)
-
-            small_outs = paddle.linalg.eigvals(
-                small_input_tensor, name='small_x')
-            large_outs = paddle.linalg.eigvals(
-                large_input_tensor, name='large_x')
-            batch_outs = paddle.linalg.eigvals(
-                batch_input_tensor, name='batch_x')
+            small_input_tensor = paddle.static.data(name='small_x',
+                                                    shape=self.small_dims,
+                                                    dtype=self.dtype)
+            large_input_tensor = paddle.static.data(name='large_x',
+                                                    shape=self.large_dims,
+                                                    dtype=self.dtype)
+            batch_input_tensor = paddle.static.data(name='batch_x',
+                                                    shape=self.batch_dims,
+                                                    dtype=self.dtype)
+
+            small_outs = paddle.linalg.eigvals(small_input_tensor,
+                                               name='small_x')
+            large_outs = paddle.linalg.eigvals(large_input_tensor,
+                                               name='large_x')
+            batch_outs = paddle.linalg.eigvals(batch_input_tensor,
+                                               name='batch_x')
 
             exe = paddle.static.Executor(place)
 
@@ -289,16 +302,19 @@ def test_error(self):
 
 
 class TestEigvalsAPIFloat64(TestEigvalsAPI):
+
     def set_dtype(self):
         self.dtype = np.float64
 
 
 class TestEigvalsAPIComplex64(TestEigvalsAPI):
+
     def set_dtype(self):
         self.dtype = np.complex64
 
 
 class TestEigvalsAPIComplex128(TestEigvalsAPI):
+
     def set_dtype(self):
         self.dtype = np.complex128
 
diff --git a/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py
index 8b7ca9189e1c4..e518491588d51 100644
--- a/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py
@@ -47,6 +47,7 @@ def valid_eigenvalues(actual, expected):
 
 
 class TestEigvalshOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.op_type = "eigvalsh"
@@ -75,11 +76,13 @@ def test_grad(self):
 
 
 class TestEigvalshUPLOCase(TestEigvalshOp):
+
     def init_config(self):
         self.UPLO = 'U'
 
 
 class TestEigvalshGPUCase(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [32, 32]
         self.dtype = "float32"
@@ -96,6 +99,7 @@ def test_check_output_gpu(self):
 
 
 class TestEigvalshAPI(unittest.TestCase):
+
     def setUp(self):
         self.dtype = "float32"
         self.UPLO = 'L'
@@ -124,8 +128,9 @@ def check_static_float_result(self):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, startup_prog):
-            input_x = paddle.static.data(
-                'input_x', shape=self.x_shape, dtype=self.dtype)
+            input_x = paddle.static.data('input_x',
+                                         shape=self.x_shape,
+                                         dtype=self.dtype)
             output_w = paddle.linalg.eigvalsh(input_x)
             exe = paddle.static.Executor(self.place)
             actual_w = exe.run(main_prog,
@@ -140,8 +145,9 @@ def check_static_complex_result(self):
         startup_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, startup_prog):
             x_dtype = np.complex64 if self.dtype == "float32" else np.complex128
-            input_x = paddle.static.data(
-                'input_x', shape=self.x_shape, dtype=x_dtype)
+            input_x = paddle.static.data('input_x',
+                                         shape=self.x_shape,
+                                         dtype=x_dtype)
             output_w = paddle.linalg.eigvalsh(input_x)
             exe = paddle.static.Executor(self.place)
             actual_w = exe.run(main_prog,
@@ -172,42 +178,48 @@ def test_eigvalsh_grad(self):
         x = paddle.to_tensor(self.complex_symm, stop_gradient=False)
         w = paddle.linalg.eigvalsh(x)
         (w.sum()).backward()
-        np.testing.assert_allclose(
-            abs(x.grad.numpy()),
-            abs(x.grad.numpy().conj().transpose(self.trans_dims)),
-            rtol=self.rtol,
-            atol=self.atol)
+        np.testing.assert_allclose(abs(x.grad.numpy()),
+                                   abs(x.grad.numpy().conj().transpose(
+                                       self.trans_dims)),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
 
 class TestEigvalshBatchAPI(TestEigvalshAPI):
+
     def init_input_shape(self):
         self.x_shape = [2, 5, 5]
 
 
 class TestEigvalshAPIError(unittest.TestCase):
+
     def test_error(self):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, startup_prog):
             #input maxtrix must greater than 2 dimensions
-            input_x = paddle.static.data(
-                name='x_1', shape=[12], dtype='float32')
+            input_x = paddle.static.data(name='x_1',
+                                         shape=[12],
+                                         dtype='float32')
             self.assertRaises(ValueError, paddle.linalg.eigvalsh, input_x)
 
             #input matrix must be square matrix
-            input_x = paddle.static.data(
-                name='x_2', shape=[12, 32], dtype='float32')
+            input_x = paddle.static.data(name='x_2',
+                                         shape=[12, 32],
+                                         dtype='float32')
             self.assertRaises(ValueError, paddle.linalg.eigvalsh, input_x)
 
             #uplo must be in 'L' or 'U'
-            input_x = paddle.static.data(
-                name='x_3', shape=[4, 4], dtype="float32")
+            input_x = paddle.static.data(name='x_3',
+                                         shape=[4, 4],
+                                         dtype="float32")
             uplo = 'R'
             self.assertRaises(ValueError, paddle.linalg.eigvalsh, input_x, uplo)
 
             #x_data cannot be integer
-            input_x = paddle.static.data(
-                name='x_4', shape=[4, 4], dtype="int32")
+            input_x = paddle.static.data(name='x_4',
+                                         shape=[4, 4],
+                                         dtype="int32")
             self.assertRaises(TypeError, paddle.linalg.eigvalsh, input_x)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py
index 26aaf0f44f1d2..9ba4869786c20 100644
--- a/python/paddle/fluid/tests/unittests/test_einsum.py
+++ b/python/paddle/fluid/tests/unittests/test_einsum.py
@@ -19,10 +19,12 @@
 from paddle.fluid import core
 
 import os
+
 os.environ['FLAGS_new_einsum'] = "0"
 
 
 class TestErrors(unittest.TestCase):
+
     def setUp(self):
         pass
 
@@ -45,50 +47,62 @@ def test_param_errors(self):
         with self.assertRaisesRegex(AssertionError,
                                     ('At least one operand is expected.')):
             paddle.einsum('ijk')
-        with self.assertRaisesRegex(AssertionError, (
-                'Invalid equation: multiple `->` were found.')):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ('Invalid equation: multiple `->` were found.')):
             paddle.einsum('i -> j -> k', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the number of operands is 2, "
-                "but found 3 segments in the label equation.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the number of operands is 2, "
+             "but found 3 segments in the label equation.")):
             paddle.einsum('i,j,k', a, a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the number of operands is 2, "
-                "but found 1 segments in the label equation.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the number of operands is 2, "
+             "but found 1 segments in the label equation.")):
             paddle.einsum('ij -> k', a, a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the number of operands is 1, "
-                "but found 2 segments in the label equation.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the number of operands is 1, "
+             "but found 2 segments in the label equation.")):
             paddle.einsum('i, -> k', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the label string '' misses dimensions.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the label string '' misses dimensions.")):
             paddle.einsum('->', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the label string 'i' misses dimensions.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the label string 'i' misses dimensions.")):
             paddle.einsum('i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: _ is not a valid label, "
-                "which should be letters.")):
+        with self.assertRaisesRegex(
+                AssertionError, ("Invalid equation: _ is not a valid label, "
+                                 "which should be letters.")):
             paddle.einsum('i_', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: `.` is found outside of an ellipsis.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: `.` is found outside of an ellipsis.")):
             paddle.einsum('i..j', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: `.` is found outside of an ellipsis.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: `.` is found outside of an ellipsis.")):
             paddle.einsum('...k...', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: missing ellipsis in output labels.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: missing ellipsis in output labels.")):
             paddle.einsum('i...->i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: duplicate output labels are found.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: duplicate output labels are found.")):
             paddle.einsum('i...->i...i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid operands: label i "
-                "corresponds to non-broadcastable dimensions.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid operands: label i "
+             "corresponds to non-broadcastable dimensions.")):
             paddle.einsum('ij...,ji...', a, a)
 
 
 class TestEinsum(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         np.random.seed(12345)
@@ -122,8 +136,7 @@ def _get_place(self, force_to_use_cpu=False):
     def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8):
         error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}'
         self.assertTrue(
-            np.allclose(
-                actual, expect, rtol=rtol, atol=atol),
+            np.allclose(actual, expect, rtol=rtol, atol=atol),
             error_msg.format(paddle.get_device(), expect, actual,
                              self.__class__.__name__))
 
@@ -150,136 +163,163 @@ def test_forward(self):
 
 
 class TestEinsumVectorDot(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i,i->", "data": ["x", "x"]}
 
 
 class TestEinsumVectorMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i,i->i", "data": ["x", "x"]}
 
 
 class TestEinsumVectorOuter(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i,j->ij", "data": ["x", "y"]}
 
 
 class TestEinsumMatrixTranspose(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij->ji", "data": ["A"]}
 
 
 class TestEinsumMatrixRowSum(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij->j", "data": ["A"]}
 
 
 class TestEinsumMatrixColSum(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij->i", "data": ["A"]}
 
 
 class TestEinsumMatrixEleMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,ij->ij", "data": ["A", "A"]}
 
 
 class TestEinsumDegenerateMatrixVecMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,j", "data": ["a", "b"]}
 
 
 class TestEinsumMatrixVecMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,j->i", "data": ["A", "x"]}
 
 
 class TestEinsumMatrixMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,kj->ik", "data": ["A", "B"]}
 
 
 class TestEinsumMatrixOuter(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,kl->ijkl", "data": ["A", "C"]}
 
 
 class TestEinsumTensorBMM(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "bij,bjk->bik", "data": ["D", "E"]}
 
 
 class TestEinsumTensorContract1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,jk->i", "data": ["D", "A"]}
 
 
 class TestEinsumTensorContract2(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,lk->ijl", "data": ["D", "B"]}
 
 
 class TestEinsumTensorContract3(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "abcd,dfg->abcfg", "data": ["F", "D"]}
 
 
 class TestEinsumTensorContract4(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,jk->ik", "data": ["D", "A"]}
 
 
 class TestEinsumTensorContract5(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,jk->ij", "data": ["D", "A"]}
 
 
 class TestEinsumTensorContract6(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ik, ijk->j", "data": ["A", "G"]}
 
 
 class TestEinsumTensorContract7(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk, ik->jk", "data": ["G", "A"]}
 
 
 class TestEinsumEllipsis1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i...->...", "data": ["G"]}
 
 
 class TestEinsumEllipsis2(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,...i->j...", "data": ["A", "H"]}
 
 
 class TestEinsumEllipsis3(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "k...,jk", "data": ["F", "I"]}
 
 
 class TestEinsumTestEinsumBilinear(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "bn,anm,bm->ba", "data": ["B", "E", "I"]}
 
 
 class TestEinsumTestEinsumOthers1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijkl, lmn->kmn", "data": ["F", "H"]}
 
 
 class TestEinsumTestEinsumOthers2(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijkl, lmn->ijn", "data": ["F", "H"]}
 
 
 class TestEinsumBatch1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "blq,bhlk->bhlqk", "data": ["J", "K"]}
 
 
 class TestNumpyTests(unittest.TestCase):
+
     def setUp(self):
         pass
 
@@ -294,8 +334,7 @@ def _get_place(self, force_to_use_cpu=False):
     def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8):
         error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}'
         self.assertTrue(
-            np.allclose(
-                actual, expect, rtol=rtol, atol=atol),
+            np.allclose(actual, expect, rtol=rtol, atol=atol),
             error_msg.format(paddle.get_device(), expect, actual,
                              self.__class__.__name__))
 
@@ -409,16 +448,21 @@ def test_static_graph(self):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
-            a = paddle.static.data(
-                name='a', shape=[3, None, None, None], dtype='float')
-            b = paddle.static.data(
-                name='b', shape=[2, None, None, None], dtype='float')
-            c = paddle.static.data(
-                name='c', shape=[None, None, 2, None], dtype='float')
-            d = paddle.static.data(
-                name='d', shape=[None, None, 5], dtype='float')
-            e = paddle.static.data(
-                name='e', shape=[None, 2, None], dtype='float')
+            a = paddle.static.data(name='a',
+                                   shape=[3, None, None, None],
+                                   dtype='float')
+            b = paddle.static.data(name='b',
+                                   shape=[2, None, None, None],
+                                   dtype='float')
+            c = paddle.static.data(name='c',
+                                   shape=[None, None, 2, None],
+                                   dtype='float')
+            d = paddle.static.data(name='d',
+                                   shape=[None, None, 5],
+                                   dtype='float')
+            e = paddle.static.data(name='e',
+                                   shape=[None, 2, None],
+                                   dtype='float')
 
             outs = []
             outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b))
diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py
index 1a4ae54afefe2..c36950b6922fe 100644
--- a/python/paddle/fluid/tests/unittests/test_einsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py
@@ -21,6 +21,7 @@
 
 
 class TestEinsumBinary(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.op_type = "einsum"
@@ -35,7 +36,8 @@ def setUp(self):
         self.inputs = {"Operands": self.operands}
         self.attrs = {"equation": self.equation}
         self.outputs = {
-            'Out': out,
+            'Out':
+            out,
             "InnerCache": [('cache_' + str(i), np.array([1.0]))
                            for i in range(len(self.operands))]
         }
@@ -61,6 +63,7 @@ def test_grad(self):
 
 
 class TestEinsum1(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(20, 3, 3), (20, 3, 3)]
         self.types = [np.float64, np.float64]
@@ -68,6 +71,7 @@ def set_mandatory(self):
 
 
 class TestEinsum2(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(20, 3, 3), (20, 3, 3)]
         self.types = [np.float64, np.float64]
@@ -75,6 +79,7 @@ def set_mandatory(self):
 
 
 class TestEinsum3(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(10, 10), (10, 10)]
         self.types = [np.float64, np.float64]
@@ -82,6 +87,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithReduction(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(10, 3, 5), (5, 30)]
         self.types = [np.float64, np.float64]
@@ -89,6 +95,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithReduction1(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(10, 3, 3, 5), (10, 5, 10, 10)]
         self.types = [np.float64, np.float64]
@@ -96,6 +103,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithUnary(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(10, 10, 3, 5)]
         self.types = [np.float64]
@@ -103,6 +111,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithUnary1(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(5, 10, 3, 3), (3, 6, 3, 10)]
         self.types = [np.float64, np.float64]
@@ -110,6 +119,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithBroadcast1(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(5, 10, 3, 3)]
         self.types = [np.float64]
@@ -117,6 +127,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithBroadcast2(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(10, 11), (3, 4, 5, 10)]
         self.types = [np.float64, np.float64]
@@ -124,6 +135,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithBroadcast3(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(10, 3, 2, 3, 4), (12, 10)]
         self.types = [np.float64, np.float64]
@@ -131,6 +143,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithBroadcast4(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(10, 3, 2, 3, 4), (12, 10)]
         self.types = [np.float64, np.float64]
@@ -138,6 +151,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithBroadcast5(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(3, 2, 2, 10), (10, 3, 2, 2)]
         self.types = [np.float64, np.float64]
@@ -145,6 +159,7 @@ def set_mandatory(self):
 
 
 class TestEinsumWithBroadcast6(TestEinsumBinary):
+
     def set_mandatory(self):
         self.shapes = [(100), (100)]
         self.types = [np.float64, np.float64]
diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py
index b33a943c9f27e..97f3eef51a5bf 100644
--- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py
@@ -19,6 +19,7 @@
 from paddle.fluid import core
 
 import os
+
 os.environ['FLAGS_new_einsum'] = "1"
 
 
@@ -36,6 +37,7 @@ def error_trans(func, *args, **kargs):
 
 
 class TestErrors(unittest.TestCase):
+
     def setUp(self):
         pass
 
@@ -59,50 +61,62 @@ def test_param_errors(self):
                 AssertionError,
             ("Required at least one operand in Einsum API, but received 0 ")):
             paddle.einsum('ijk')
-        with self.assertRaisesRegex(AssertionError, (
-                'Invalid equation: multiple `->` were found.')):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ('Invalid equation: multiple `->` were found.')):
             paddle.einsum('i -> j -> k', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the number of operands is 2, "
-                "but found 3 segments in the label equation.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the number of operands is 2, "
+             "but found 3 segments in the label equation.")):
             paddle.einsum('i,j,k', a, a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the number of operands is 2, "
-                "but found 1 segments in the label equation.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the number of operands is 2, "
+             "but found 1 segments in the label equation.")):
             paddle.einsum('ij -> k', a, a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the number of operands is 1, "
-                "but found 2 segments in the label equation.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the number of operands is 1, "
+             "but found 2 segments in the label equation.")):
             paddle.einsum('i, -> k', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the label string '' misses dimensions.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the label string '' misses dimensions.")):
             paddle.einsum('->', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: the label string 'i' misses dimensions.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: the label string 'i' misses dimensions.")):
             paddle.einsum('i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: _ is not a valid label, "
-                "which should be letters.")):
+        with self.assertRaisesRegex(
+                AssertionError, ("Invalid equation: _ is not a valid label, "
+                                 "which should be letters.")):
             paddle.einsum('i_', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: `.` is found outside of an ellipsis.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: `.` is found outside of an ellipsis.")):
             paddle.einsum('i..j', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: `.` is found outside of an ellipsis.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: `.` is found outside of an ellipsis.")):
             paddle.einsum('...k...', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: missing ellipsis in output labels.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: missing ellipsis in output labels.")):
             paddle.einsum('i...->i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid equation: duplicate output labels are found.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid equation: duplicate output labels are found.")):
             paddle.einsum('i...->i...i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                "Invalid operands: label i "
-                "corresponds to non-broadcastable dimensions.")):
+        with self.assertRaisesRegex(
+                AssertionError,
+            ("Invalid operands: label i "
+             "corresponds to non-broadcastable dimensions.")):
             error_trans(paddle.einsum, 'ij...,ji...', a, a)
 
 
 class TestEinsum(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         np.random.seed(12345)
@@ -136,8 +150,7 @@ def _get_place(self, force_to_use_cpu=False):
     def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8):
         error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}'
         self.assertTrue(
-            np.allclose(
-                actual, expect, rtol=rtol, atol=atol),
+            np.allclose(actual, expect, rtol=rtol, atol=atol),
             error_msg.format(paddle.get_device(), expect, actual,
                              self.__class__.__name__))
 
@@ -164,136 +177,163 @@ def test_forward(self):
 
 
 class TestEinsumVectorDot(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i,i->", "data": ["x", "x"]}
 
 
 class TestEinsumVectorMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i,i->i", "data": ["x", "x"]}
 
 
 class TestEinsumVectorOuter(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i,j->ij", "data": ["x", "y"]}
 
 
 class TestEinsumMatrixTranspose(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij->ji", "data": ["A"]}
 
 
 class TestEinsumMatrixRowSum(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij->j", "data": ["A"]}
 
 
 class TestEinsumMatrixColSum(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij->i", "data": ["A"]}
 
 
 class TestEinsumMatrixEleMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,ij->ij", "data": ["A", "A"]}
 
 
 class TestEinsumDegenerateMatrixVecMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,j", "data": ["a", "b"]}
 
 
 class TestEinsumMatrixVecMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,j->i", "data": ["A", "x"]}
 
 
 class TestEinsumMatrixMul(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,kj->ik", "data": ["A", "B"]}
 
 
 class TestEinsumMatrixOuter(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,kl->ijkl", "data": ["A", "C"]}
 
 
 class TestEinsumTensorBMM(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "bij,bjk->bik", "data": ["D", "E"]}
 
 
 class TestEinsumTensorContract1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,jk->i", "data": ["D", "A"]}
 
 
 class TestEinsumTensorContract2(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,lk->ijl", "data": ["D", "B"]}
 
 
 class TestEinsumTensorContract3(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "abcd,dfg->abcfg", "data": ["F", "D"]}
 
 
 class TestEinsumTensorContract4(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,jk->ik", "data": ["D", "A"]}
 
 
 class TestEinsumTensorContract5(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk,jk->ij", "data": ["D", "A"]}
 
 
 class TestEinsumTensorContract6(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ik, ijk->j", "data": ["A", "G"]}
 
 
 class TestEinsumTensorContract7(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijk, ik->jk", "data": ["G", "A"]}
 
 
 class TestEinsumEllipsis1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "i...->...", "data": ["G"]}
 
 
 class TestEinsumEllipsis2(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ij,...i->j...", "data": ["A", "H"]}
 
 
 class TestEinsumEllipsis3(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "k...,jk", "data": ["F", "I"]}
 
 
 class TestEinsumTestEinsumBilinear(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "bn,anm,bm->ba", "data": ["B", "E", "I"]}
 
 
 class TestEinsumTestEinsumOthers1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijkl, lmn->kmn", "data": ["F", "H"]}
 
 
 class TestEinsumTestEinsumOthers2(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "ijkl, lmn->ijn", "data": ["F", "H"]}
 
 
 class TestEinsumBatch1(TestEinsum):
+
     def setUp(self):
         self.sample = {"paradigm": "blq,bhlk->bhlqk", "data": ["J", "K"]}
 
 
 class TestNumpyTests(unittest.TestCase):
+
     def setUp(self):
         pass
 
@@ -308,8 +348,7 @@ def _get_place(self, force_to_use_cpu=False):
     def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8):
         error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}'
         self.assertTrue(
-            np.allclose(
-                actual, expect, rtol=rtol, atol=atol),
+            np.allclose(actual, expect, rtol=rtol, atol=atol),
             error_msg.format(paddle.get_device(), expect, actual,
                              self.__class__.__name__))
 
@@ -428,16 +467,21 @@ def test_static_graph(self):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
-            a = paddle.static.data(
-                name='a', shape=[3, None, None, None], dtype='float')
-            b = paddle.static.data(
-                name='b', shape=[2, None, None, None], dtype='float')
-            c = paddle.static.data(
-                name='c', shape=[None, None, 2, None], dtype='float')
-            d = paddle.static.data(
-                name='d', shape=[None, None, 5], dtype='float')
-            e = paddle.static.data(
-                name='e', shape=[None, 2, None], dtype='float')
+            a = paddle.static.data(name='a',
+                                   shape=[3, None, None, None],
+                                   dtype='float')
+            b = paddle.static.data(name='b',
+                                   shape=[2, None, None, None],
+                                   dtype='float')
+            c = paddle.static.data(name='c',
+                                   shape=[None, None, 2, None],
+                                   dtype='float')
+            d = paddle.static.data(name='d',
+                                   shape=[None, None, 5],
+                                   dtype='float')
+            e = paddle.static.data(name='e',
+                                   shape=[None, 2, None],
+                                   dtype='float')
 
             outs = []
             outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b))
@@ -465,6 +509,7 @@ def test_static_graph(self):
 
 
 class TestStaticGraphShape(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 22787a23feadf..714ef764a9262 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -23,6 +23,7 @@
 
 
 class TestElementwiseAddOp(OpTest):
+
     def init_kernel_type(self):
         self.use_mkldnn = False
 
@@ -46,41 +47,37 @@ def check_eager(self):
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(
-            check_dygraph=(self.use_mkldnn == False),
-            check_eager=self.check_eager())
+        self.check_output(check_dygraph=(self.use_mkldnn == False),
+                          check_eager=self.check_eager())
 
     def test_check_grad_normal(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.dtype == np.float16:
             return
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            check_dygraph=(self.use_mkldnn == False),
-            check_eager=self.check_eager())
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        check_dygraph=(self.use_mkldnn == False),
+                        check_eager=self.check_eager())
 
     def test_check_grad_ingore_x(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.dtype == np.float16:
             return
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            check_dygraph=(self.use_mkldnn == False),
-            check_eager=self.check_eager())
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        check_dygraph=(self.use_mkldnn == False),
+                        check_eager=self.check_eager())
 
     def test_check_grad_ingore_y(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.dtype == np.float16:
             return
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            check_dygraph=(self.use_mkldnn == False),
-            check_eager=self.check_eager())
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        check_dygraph=(self.use_mkldnn == False),
+                        check_eager=self.check_eager())
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -97,6 +94,7 @@ def init_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -113,6 +111,7 @@ def test_check_output(self):
     not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
     "core is not compiled with CUDA and cudnn version need larger than 8.1.0")
 class TestBF16ElementwiseAddOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_add"
         self.dtype = np.uint16
@@ -126,8 +125,7 @@ def setUp(self):
         self.inputs = {
             'X':
             OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)),
-            'Y':
-            OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y))
+            'Y': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y))
         }
         self.attrs = {'axis': self.axis, 'use_mkldnn': False}
         self.outputs = {'Out': convert_float_to_uint16(self.out)}
@@ -142,18 +140,23 @@ def test_check_grad_normal(self):
 
     def test_check_grad_ingore_x(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['Y'], 'Out', no_grad_set=set("X"), check_eager=False)
+        self.check_grad_with_place(place, ['Y'],
+                                   'Out',
+                                   no_grad_set=set("X"),
+                                   check_eager=False)
 
     def test_check_grad_ingore_y(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', no_grad_set=set('Y'), check_eager=False)
+        self.check_grad_with_place(place, ['X'],
+                                   'Out',
+                                   no_grad_set=set('Y'),
+                                   check_eager=False)
 
 
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -163,6 +166,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -172,6 +176,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
 class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1, 1).astype(self.dtype)
@@ -181,6 +186,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
 class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1, 1).astype(self.dtype)
@@ -188,6 +194,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -195,6 +202,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -202,6 +210,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -212,6 +221,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -222,6 +232,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -232,6 +243,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -242,6 +254,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -249,6 +262,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -256,6 +270,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 1).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -266,6 +281,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -276,6 +292,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype)
         self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -286,6 +303,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype)
         self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -296,6 +314,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 12).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12).astype(self.dtype)
@@ -303,6 +322,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_5(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 12).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12).astype(self.dtype)
@@ -310,6 +330,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
         self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
@@ -317,6 +338,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
         self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
@@ -324,6 +346,7 @@ def init_input_output(self):
 
 
 class TestFP16ElementwiseAddOp_broadcast_6(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
         self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
@@ -331,6 +354,7 @@ def init_input_output(self):
 
 
 class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -341,6 +365,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -353,6 +378,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 1).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -365,6 +391,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 1).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -375,6 +402,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -385,6 +413,7 @@ def init_axis(self):
 
 
 class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -395,6 +424,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -405,6 +435,7 @@ def init_axis(self):
 
 
 class TestElementwiseFP16AddOp_commonuse_add1(TestFP16ElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -415,6 +446,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
@@ -425,6 +457,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 12).astype(self.dtype)
         self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype)
@@ -435,6 +468,7 @@ def init_axis(self):
 
 
 class TestElementwiseAddOp_same_shape_ysize_large(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 1, 12).astype(self.dtype)
         self.y = np.random.rand(10, 2, 12).astype(self.dtype)
@@ -445,13 +479,14 @@ def init_axis(self):
 
 
 class TestElementwiseAddOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of elementwise_add must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
 
             # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64
@@ -462,6 +497,7 @@ def test_errors(self):
 
 
 class TestAddApi(unittest.TestCase):
+
     def _executed_api(self, x, y, name=None):
         return paddle.add(x, y, name)
 
@@ -505,11 +541,13 @@ def test_dygraph(self):
 
 
 class TestAddInplaceApi(TestAddApi):
+
     def _executed_api(self, x, y, name=None):
         return x.add_(y, name)
 
 
 class TestAddInplaceBroadcastSuccess(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 4).astype('float')
         self.y_numpy = np.random.rand(3, 4).astype('float')
@@ -526,18 +564,21 @@ def test_broadcast_success(self):
 
 
 class TestAddInplaceBroadcastSuccess2(TestAddInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(1, 2, 3, 1).astype('float')
         self.y_numpy = np.random.rand(3, 1).astype('float')
 
 
 class TestAddInplaceBroadcastSuccess3(TestAddInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 1, 5).astype('float')
         self.y_numpy = np.random.rand(1, 3, 1, 5).astype('float')
 
 
 class TestAddInplaceBroadcastError(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(3, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
@@ -556,18 +597,21 @@ def broadcast_shape_error():
 
 
 class TestAddInplaceBroadcastError2(TestAddInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 1, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
 
 
 class TestAddInplaceBroadcastError3(TestAddInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(5, 2, 1, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
 
 
 class TestComplexElementwiseAddOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_add"
         self.dtype = np.float64
@@ -593,8 +637,8 @@ def init_input_output(self):
         self.out = self.x + self.y
 
     def init_grad_input_output(self):
-        self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
-            self.shape, self.dtype)
+        self.grad_out = np.ones(
+            self.shape, self.dtype) + 1J * np.ones(self.shape, self.dtype)
         self.grad_x = self.grad_out
         self.grad_y = self.grad_out
 
@@ -602,30 +646,28 @@ def test_check_output(self):
         self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out])
 
 
 class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random(self.shape).astype(self.dtype)
         self.y = np.random.random(self.shape).astype(
@@ -633,13 +675,14 @@ def init_input_output(self):
         self.out = self.x + self.y
 
     def init_grad_input_output(self):
-        self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
-            self.shape, self.dtype)
+        self.grad_out = np.ones(
+            self.shape, self.dtype) + 1J * np.ones(self.shape, self.dtype)
         self.grad_x = np.real(self.grad_out)
         self.grad_y = self.grad_out
 
 
 class TestBoolAddFloatElementwiseAddop(unittest.TestCase):
+
     def test_static_add(self):
         paddle.enable_static()
         a = 1.5
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index 27dbd3752b550..d522a9d0cde8f 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -22,6 +22,7 @@
 
 
 class ElementwiseDivOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -49,21 +50,26 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.05,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.05,
+                        no_grad_set=set('Y'))
 
     def init_dtype(self):
         pass
 
 
-@unittest.skipIf(not core.is_compiled_with_cuda() or
-                 not core.is_bfloat16_supported(core.CUDAPlace(0)),
+@unittest.skipIf(not core.is_compiled_with_cuda()
+                 or not core.is_bfloat16_supported(core.CUDAPlace(0)),
                  "core is not compiled with CUDA and not support the bfloat16")
 class TestElementwiseDivOpBF16(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -100,6 +106,7 @@ def test_check_grad_ingore_y(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseDivOp_scalar(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -111,6 +118,7 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_Vector(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -122,6 +130,7 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -132,12 +141,13 @@ def setUp(self):
 
         self.attrs = {'axis': 0}
         self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
+            'Out': np.divide(self.inputs['X'],
+                             self.inputs['Y'].reshape(100, 1, 1))
         }
 
 
 class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -148,12 +158,13 @@ def setUp(self):
 
         self.attrs = {'axis': 1}
         self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1))
+            'Out': np.divide(self.inputs['X'],
+                             self.inputs['Y'].reshape(1, 100, 1))
         }
 
 
 class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -163,12 +174,13 @@ def setUp(self):
         }
 
         self.outputs = {
-            'Out':
-            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100))
+            'Out': np.divide(self.inputs['X'],
+                             self.inputs['Y'].reshape(1, 1, 100))
         }
 
 
 class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -185,6 +197,7 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_broadcast_4(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -196,6 +209,7 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -207,6 +221,7 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_commonuse_1(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -218,6 +233,7 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_commonuse_2(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -229,6 +245,7 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_xsize_lessthan_ysize(ElementwiseDivOp):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -243,16 +260,15 @@ def setUp(self):
 
 
 class TestElementwiseDivOp_INT(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
         self.dtype = np.int32
         self.init_dtype()
         self.inputs = {
-            'X': np.random.randint(
-                1, 5, size=[13, 17]).astype(self.dtype),
-            'Y': np.random.randint(
-                1, 5, size=[13, 17]).astype(self.dtype)
+            'X': np.random.randint(1, 5, size=[13, 17]).astype(self.dtype),
+            'Y': np.random.randint(1, 5, size=[13, 17]).astype(self.dtype)
         }
         self.outputs = {'Out': self.inputs['X'] // self.inputs['Y']}
 
@@ -266,6 +282,7 @@ def init_dtype(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestElementwiseDivOpFp16(ElementwiseDivOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -273,19 +290,25 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=1)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=1,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=1,
+                        no_grad_set=set('Y'))
 
 
 class TestElementwiseDivBroadcast(unittest.TestCase):
+
     def test_shape_with_batch_sizes(self):
         with fluid.program_guard(fluid.Program()):
-            x_var = fluid.data(
-                name='x', dtype='float32', shape=[None, 3, None, None])
+            x_var = fluid.data(name='x',
+                               dtype='float32',
+                               shape=[None, 3, None, None])
             one = 2.
             out = one / x_var
             exe = fluid.Executor(fluid.CPUPlace())
@@ -295,6 +318,7 @@ def test_shape_with_batch_sizes(self):
 
 
 class TestDivideOp(unittest.TestCase):
+
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
             x = fluid.data(name="x", shape=[2, 3], dtype="float32")
@@ -316,6 +340,7 @@ def test_dygraph(self):
 
 
 class TestComplexElementwiseDivOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_div"
         self.python_api = paddle.divide
@@ -352,30 +377,28 @@ def test_check_output(self):
         self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out])
 
 
 class TestRealComplexElementwiseDivOp(TestComplexElementwiseDivOp):
+
     def init_input_output(self):
         self.x = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.y = np.random.random(
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
index 6ea24b4543f3f..6a74acd89b075 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -24,6 +24,7 @@
 
 
 class TestElementwiseModOp(OpTest):
+
     def init_kernel_type(self):
         self.use_mkldnn = False
 
@@ -60,6 +61,7 @@ def init_axis(self):
 
 
 class TestElementwiseModOp_scalar(TestElementwiseModOp):
+
     def init_input_output(self):
         scale_x = random.randint(0, 100000000)
         scale_y = random.randint(1, 100000000)
@@ -69,6 +71,7 @@ def init_input_output(self):
 
 
 class TestElementwiseModOpInverse(TestElementwiseModOp):
+
     def init_input_output(self):
         self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
         self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
@@ -76,6 +79,7 @@ def init_input_output(self):
 
 
 class TestFloorDivideOp(unittest.TestCase):
+
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
             x = fluid.data(name="x", shape=[2, 3], dtype="int64")
@@ -96,7 +100,7 @@ def test_dygraph(self):
             self.assertEqual((np_z == z_expected).all(), True)
 
         with fluid.dygraph.guard(fluid.CPUPlace()):
-            # divide by zero 
+            # divide by zero
             np_x = np.array([2, 3, 4])
             np_y = np.array([0])
             x = paddle.to_tensor(np_x)
@@ -106,7 +110,7 @@ def test_dygraph(self):
             except Exception as e:
                 print("Error: Divide by zero encounter in floor_divide\n")
 
-            # divide by zero 
+            # divide by zero
             np_x = np.array([2])
             np_y = np.array([0, 0, 0])
             x = paddle.to_tensor(np_x, dtype="int32")
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
index 9f452ffde74ee..6c300ce24d3d1 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
@@ -21,10 +21,12 @@
 
 
 class TestElementWiseAddOp(unittest.TestCase):
+
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
     def check_forward_backward(self):
+
         def test_with_place(place):
             out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
             x_grad = out_grad
@@ -47,18 +49,21 @@ def test_with_place(place):
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
-                elementwise_add_op = block.append_op(
-                    type="elementwise_add",
-                    inputs={
-                        "X": block.var('x'),
-                        "Y": block.var('y'),
-                    },
-                    outputs={"Out": block.var('out'), },
-                    attrs={"axis": self.axis, })
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
+                elementwise_add_op = block.append_op(type="elementwise_add",
+                                                     inputs={
+                                                         "X": block.var('x'),
+                                                         "Y": block.var('y'),
+                                                     },
+                                                     outputs={
+                                                         "Out":
+                                                         block.var('out'),
+                                                     },
+                                                     attrs={
+                                                         "axis": self.axis,
+                                                     })
 
                 # generate backward op_desc
                 grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py
index 8a8e74e28ec72..73d110ce132a5 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py
@@ -19,6 +19,7 @@
 
 
 class TestElementwiseOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_heaviside"
         x = np.random.random((13, 17)).astype("float64")
@@ -40,6 +41,7 @@ def test_check_grad_ingore_y(self):
 
 
 class TestHeavisideBroadcast(unittest.TestCase):
+
     def setUp(self):
         self.input_1 = np.random.rand(2, 100, 13, 17).astype("float32")
         self.input_2 = np.random.rand(100, 13, 17).astype("float32")
@@ -78,6 +80,7 @@ def test_broadcast(self):
 
 
 class TestHeavisideAPI_float64(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.random((13, 17)).astype("float64")
         self.y_np = np.random.random((13, 17)).astype("float64")
@@ -92,10 +95,12 @@ def test_static(self):
             paddle.enable_static()
             prog = paddle.static.Program()
             with paddle.static.program_guard(prog):
-                x = paddle.static.data(
-                    name=f"x_{self.dtype}", shape=[13, 17], dtype=self.dtype)
-                y = paddle.static.data(
-                    name=f"y_{self.dtype}", shape=[13, 17], dtype=self.dtype)
+                x = paddle.static.data(name=f"x_{self.dtype}",
+                                       shape=[13, 17],
+                                       dtype=self.dtype)
+                y = paddle.static.data(name=f"y_{self.dtype}",
+                                       shape=[13, 17],
+                                       dtype=self.dtype)
                 out = paddle.heaviside(x, y)
 
             exe = paddle.static.Executor(place=place)
@@ -114,13 +119,14 @@ def test_dygraph(self):
                          if paddle.device.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            result = paddle.heaviside(
-                paddle.to_tensor(self.x_np), paddle.to_tensor(self.y_np))
+            result = paddle.heaviside(paddle.to_tensor(self.x_np),
+                                      paddle.to_tensor(self.y_np))
 
             self.assertTrue(np.allclose(result.numpy(), self.out_np))
 
 
 class TestHeavisideAPI_float32(TestHeavisideAPI_float64):
+
     def setUp(self):
         self.x_np = np.random.random((13, 17)).astype("float32")
         self.y_np = np.random.random((13, 17)).astype("float32")
@@ -129,6 +135,7 @@ def setUp(self):
 
 
 class TestHeavisideAPI_int64(TestHeavisideAPI_float64):
+
     def setUp(self):
         self.x_np = np.random.random((13, 17)).astype("int64")
         self.y_np = np.random.random((13, 17)).astype("int64")
@@ -137,6 +144,7 @@ def setUp(self):
 
 
 class TestHeavisideAPI_int32(TestHeavisideAPI_float64):
+
     def setUp(self):
         self.x_np = np.random.random((13, 17)).astype("int32")
         self.y_np = np.random.random((13, 17)).astype("int32")
@@ -145,6 +153,7 @@ def setUp(self):
 
 
 class TestHeavisideError(unittest.TestCase):
+
     def test_input(self):
         paddle.disable_static()
 
@@ -159,8 +168,8 @@ def test_input_y():
         self.assertRaises(ValueError, test_input_y)
 
         def test_input_xy():
-            paddle.heaviside(
-                paddle.randn([100], 'float32'), paddle.randn([100], 'float64'))
+            paddle.heaviside(paddle.randn([100], 'float32'),
+                             paddle.randn([100], 'float64'))
 
         self.assertRaises(ValueError, test_input_xy)
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
index 21b0595b6dc86..1ab1bf07b0d5c 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
@@ -24,6 +24,7 @@
 
 
 class TestElementwiseOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -49,18 +50,23 @@ def test_check_grad_normal(self):
             self.check_grad(['X', 'Y'], 'Out', check_eager=True)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set('Y'))
 
 
 @unittest.skipIf(
     core.is_compiled_with_cuda() and core.cudnn_version() < 8100,
     "run test when gpu is availble and the minimum cudnn version is 8.1.0.")
 class TestElementwiseBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -99,6 +105,7 @@ def test_check_grad_ingore_y(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMaxOp_scalar(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -109,6 +116,7 @@ def setUp(self):
 
 
 class TestElementwiseMaxOp_Vector(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -120,6 +128,7 @@ def setUp(self):
 
 
 class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -131,12 +140,13 @@ def setUp(self):
 
         self.attrs = {'axis': 0}
         self.outputs = {
-            'Out':
-            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
+            'Out': np.maximum(self.inputs['X'],
+                              self.inputs['Y'].reshape(100, 1, 1))
         }
 
 
 class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -148,12 +158,13 @@ def setUp(self):
 
         self.attrs = {'axis': 1}
         self.outputs = {
-            'Out':
-            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1))
+            'Out': np.maximum(self.inputs['X'],
+                              self.inputs['Y'].reshape(1, 100, 1))
         }
 
 
 class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -164,12 +175,13 @@ def setUp(self):
         self.inputs = {'X': x, 'Y': y}
 
         self.outputs = {
-            'Out':
-            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100))
+            'Out': np.maximum(self.inputs['X'],
+                              self.inputs['Y'].reshape(1, 1, 100))
         }
 
 
 class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
@@ -187,6 +199,7 @@ def setUp(self):
 
 
 class TestElementwiseMaxOp_broadcast_4(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_max"
         self.python_api = paddle.maximum
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
index f8dc9602c35a5..e23662483919f 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
@@ -25,6 +25,7 @@
 
 
 class TestElementwiseOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -50,17 +51,22 @@ def test_check_grad_normal(self):
             self.check_grad(['X', 'Y'], 'Out', check_eager=True)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set('Y'))
 
 
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMinOp_scalar(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -71,6 +77,7 @@ def setUp(self):
 
 
 class TestElementwiseMinOp_Vector(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -82,6 +89,7 @@ def setUp(self):
 
 
 class TestElementwiseMinOp_broadcast_0(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -93,12 +101,13 @@ def setUp(self):
 
         self.attrs = {'axis': 0}
         self.outputs = {
-            'Out':
-            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
+            'Out': np.minimum(self.inputs['X'],
+                              self.inputs['Y'].reshape(100, 1, 1))
         }
 
 
 class TestElementwiseMinOp_broadcast_1(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -110,12 +119,13 @@ def setUp(self):
 
         self.attrs = {'axis': 1}
         self.outputs = {
-            'Out':
-            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 100, 1))
+            'Out': np.minimum(self.inputs['X'],
+                              self.inputs['Y'].reshape(1, 100, 1))
         }
 
 
 class TestElementwiseMinOp_broadcast_2(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -126,12 +136,13 @@ def setUp(self):
         self.inputs = {'X': x, 'Y': y}
 
         self.outputs = {
-            'Out':
-            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 100))
+            'Out': np.minimum(self.inputs['X'],
+                              self.inputs['Y'].reshape(1, 1, 100))
         }
 
 
 class TestElementwiseMinOp_broadcast_3(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -149,6 +160,7 @@ def setUp(self):
 
 
 class TestElementwiseMinOp_broadcast_4(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_min"
         self.python_api = paddle.minimum
@@ -162,6 +174,7 @@ def setUp(self):
 
 
 class TestElementwiseMinOpFP16(unittest.TestCase):
+
     def get_out_and_grad(self, x_np, y_np, axis, place, use_fp32=False):
         assert x_np.dtype == np.float16
         assert y_np.dtype == np.float16
@@ -194,10 +207,10 @@ def check_main(self, x_shape, y_shape, axis=-1):
                                                   False)
         z_2, x_g_2, y_g_2 = self.get_out_and_grad(x_np, y_np, axis, place, True)
         self.assertTrue(np.array_equal(z_1, z_2), "{} vs {}".format(z_1, z_2))
-        self.assertTrue(
-            np.array_equal(x_g_1, x_g_2), "{} vs {}".format(x_g_1, x_g_2))
-        self.assertTrue(
-            np.array_equal(y_g_1, y_g_2), "{} vs {}".format(y_g_1, y_g_2))
+        self.assertTrue(np.array_equal(x_g_1, x_g_2),
+                        "{} vs {}".format(x_g_1, x_g_2))
+        self.assertTrue(np.array_equal(y_g_1, y_g_2),
+                        "{} vs {}".format(y_g_1, y_g_2))
 
     def test_main(self):
         self.check_main((13, 17), (13, 17))
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
index c6973255f2644..436ce466be384 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -24,6 +24,7 @@
 
 
 class TestElementwiseModOp(OpTest):
+
     def init_kernel_type(self):
         self.use_mkldnn = False
 
@@ -62,6 +63,7 @@ def init_axis(self):
 
 
 class TestElementwiseModOp_scalar(TestElementwiseModOp):
+
     def init_input_output(self):
         scale_x = random.randint(0, 100000000)
         scale_y = random.randint(1, 100000000)
@@ -71,6 +73,7 @@ def init_input_output(self):
 
 
 class TestElementwiseModOpFloat(TestElementwiseModOp):
+
     def init_dtype(self):
         self.dtype = np.float32
 
@@ -87,11 +90,13 @@ def test_check_output(self):
 
 
 class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
+
     def init_dtype(self):
         self.dtype = np.float64
 
 
 class TestRemainderOp(unittest.TestCase):
+
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
             x = fluid.data(name="x", shape=[2, 3], dtype="int64")
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index b35b2840ed30a..7035f3b1ca7a5 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -27,6 +27,7 @@
 
 
 class ElementwiseMulOp(OpTest):
+
     def init_kernel_type(self):
         self.use_mkldnn = False
 
@@ -52,24 +53,23 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False))
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_ingore_x(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad_ingore_y(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        check_dygraph=(self.use_mkldnn == False))
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -84,6 +84,7 @@ def init_axis(self):
 
 
 class TestBF16ElementwiseMulOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.dtype = np.uint16
@@ -97,8 +98,7 @@ def setUp(self):
         self.inputs = {
             'X':
             OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)),
-            'Y':
-            OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y))
+            'Y': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y))
         }
         self.outputs = {'Out': convert_float_to_uint16(self.out)}
         self.attrs = {'axis': self.axis, 'use_mkldnn': False}
@@ -119,6 +119,7 @@ def test_check_grad_ingore_y(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseMulOp_scalar(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -130,6 +131,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_Vector(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -141,6 +143,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -151,6 +154,7 @@ def init_axis(self):
 
 
 class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -166,6 +170,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -180,6 +185,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -195,6 +201,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -206,6 +213,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -219,11 +227,13 @@ def setUp(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestElementwiseMulOpFp16(ElementwiseMulOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -235,6 +245,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -246,6 +257,7 @@ def setUp(self):
 
 
 class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
@@ -262,13 +274,14 @@ def setUp(self):
 
 
 class TestElementwiseMulOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of elementwise_mul must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1, y1)
 
             # the input dtype of elementwise_mul must be float16 or float32 or float64 or int32 or int64
@@ -279,6 +292,7 @@ def test_errors(self):
 
 
 class TestComplexElementwiseMulOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.init_base_dtype()
@@ -314,30 +328,28 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out])
 
 
 class TestRealComplexElementwiseMulOp(TestComplexElementwiseMulOp):
+
     def init_input_output(self):
         self.x = np.random.random((2, 3, 4, 5)).astype(self.dtype)
         self.y = np.random.random(
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
index ccfed61185f0c..f06d90d27d42d 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py
@@ -27,6 +27,7 @@
 
 
 class TestElementwiseMulDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -42,8 +43,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -55,6 +59,7 @@ def test_grad(self):
 
 
 class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -70,8 +75,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -83,6 +91,7 @@ def test_grad(self):
 
 
 class TestElementwiseAddDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -98,8 +107,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -111,6 +123,7 @@ def test_grad(self):
 
 
 class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -126,8 +139,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -139,6 +155,7 @@ def test_grad(self):
 
 
 class TestElementwiseSubDoubleGradCheck(unittest.TestCase):
+
     def subtract_wrapper(self, x):
         return paddle.subtract(x[0], x[1])
 
@@ -157,13 +174,16 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.subtract_wrapper, [x, y],
-            out,
-            x_init=[x_arr, y_arr],
-            place=place)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.subtract_wrapper,
+                                                       [x, y],
+                                                       out,
+                                                       x_init=[x_arr, y_arr],
+                                                       place=place)
 
     def test_grad(self):
         paddle.enable_static()
@@ -175,6 +195,7 @@ def test_grad(self):
 
 
 class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -190,8 +211,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -203,6 +227,7 @@ def test_grad(self):
 
 
 class TestElementwiseDivDoubleGradCheck(unittest.TestCase):
+
     def divide_wrapper(self, x):
         return paddle.divide(x[0], x[1])
 
@@ -222,14 +247,18 @@ def func(self, place):
         y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr[np.abs(y_arr) < 0.005] = 0.02
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.divide_wrapper, [x, y],
-            out,
-            x_init=[x_arr, y_arr],
-            place=place,
-            atol=1e-3)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps,
+                                           atol=1e-3)
+        gradient_checker.double_grad_check_for_dygraph(self.divide_wrapper,
+                                                       [x, y],
+                                                       out,
+                                                       x_init=[x_arr, y_arr],
+                                                       place=place,
+                                                       atol=1e-3)
 
     def test_grad(self):
         paddle.enable_static()
@@ -241,6 +270,7 @@ def test_grad(self):
 
 
 class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -257,8 +287,12 @@ def func(self, place):
         y_arr = np.random.uniform(-1, 1, shape[1:-1]).astype(dtype)
         y_arr[np.abs(y_arr) < 0.005] = 0.02
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps,
+                                           atol=1e-3)
 
     def test_grad(self):
         paddle.enable_static()
@@ -270,6 +304,7 @@ def test_grad(self):
 
 
 class TestElementwiseAddTripleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -285,8 +320,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -298,6 +336,7 @@ def test_grad(self):
 
 
 class TestElementwiseAddBroadcastTripleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -313,8 +352,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
 
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -326,6 +368,7 @@ def test_grad(self):
 
 
 class TestElementwiseMulTripleGradCheck(unittest.TestCase):
+
     def multiply_wrapper(self, x):
         return paddle.multiply(x[0], x[1])
 
@@ -346,11 +389,13 @@ def func(self, place):
 
         gradient_checker.triple_grad_check(
             [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         gradient_checker.triple_grad_check_for_dygraph(
             self.multiply_wrapper, [x, y],
             out,
             x_init=[x_arr, y_arr],
             place=place)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_grad(self):
         paddle.enable_static()
@@ -362,6 +407,7 @@ def test_grad(self):
 
 
 class TestElementwiseMulBroadcastTripleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -377,8 +423,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype)
 
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
index 3c9e350360dd1..12f2a21736084 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
@@ -21,6 +21,7 @@
 
 
 class TestElementwisePowOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -44,6 +45,7 @@ def test_check_grad_normal(self):
 
 
 class TestElementwisePowOp_big_shape_1(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -55,6 +57,7 @@ def setUp(self):
 
 
 class TestElementwisePowOp_big_shape_2(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -68,6 +71,7 @@ def setUp(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwisePowOp_scalar(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -79,6 +83,7 @@ def setUp(self):
 
 
 class TestElementwisePowOp_tensor(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -90,6 +95,7 @@ def setUp(self):
 
 
 class TestElementwisePowOp_broadcast_0(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -101,6 +107,7 @@ def setUp(self):
 
 
 class TestElementwisePowOp_broadcast_1(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -115,6 +122,7 @@ def setUp(self):
 
 
 class TestElementwisePowOp_broadcast_2(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -124,12 +132,13 @@ def setUp(self):
         }
         self.attrs = {'axis': 0}
         self.outputs = {
-            'Out':
-            np.power(self.inputs['X'], self.inputs['Y'].reshape(100, 1, 1))
+            'Out': np.power(self.inputs['X'],
+                            self.inputs['Y'].reshape(100, 1, 1))
         }
 
 
 class TestElementwisePowOp_broadcast_3(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -139,12 +148,13 @@ def setUp(self):
         }
         self.attrs = {'axis': 1}
         self.outputs = {
-            'Out': np.power(self.inputs['X'], self.inputs['Y'].reshape(1, 20, 5,
-                                                                       1))
+            'Out': np.power(self.inputs['X'],
+                            self.inputs['Y'].reshape(1, 20, 5, 1))
         }
 
 
 class TestElementwisePowOp_broadcast_4(TestElementwisePowOp):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -156,6 +166,7 @@ def setUp(self):
 
 
 class TestElementwisePowOpInt(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
@@ -170,6 +181,7 @@ def test_check_output(self):
 
 
 class TestElementwisePowGradOpInt(unittest.TestCase):
+
     def setUp(self):
         self.x = np.asarray([1, 3, 6])
         self.y = np.asarray([1, 1, 1])
@@ -185,6 +197,7 @@ def setUp(self):
         print(self.grad_res, self.grad_x, self.grad_y)
 
     def test_grad(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         places = [fluid.CPUPlace()]
         if fluid.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
@@ -200,6 +213,7 @@ def test_grad(self):
                 self.assertTrue(np.array_equal(res.gradient(), self.grad_res))
                 self.assertTrue(np.array_equal(x.gradient(), self.grad_x))
                 self.assertTrue(np.array_equal(y.gradient(), self.grad_y))
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
index 6801a4bc5f30b..0c5fc98397850 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -22,6 +22,7 @@
 
 
 class TestElementwiseOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -37,15 +38,20 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set('Y'))
 
 
 class TestBF16ElementwiseOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.dtype = np.uint16
@@ -75,6 +81,7 @@ def test_check_grad_ingore_y(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseSubOp_scalar(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -85,6 +92,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_Vector(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -95,6 +103,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -109,6 +118,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_1(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -123,6 +133,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -136,6 +147,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_3(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -150,6 +162,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_broadcast_4(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -160,6 +173,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_commonuse_1(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -170,6 +184,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_commonuse_2(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -180,6 +195,7 @@ def setUp(self):
 
 
 class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
@@ -195,6 +211,7 @@ def setUp(self):
 
 
 class TestComplexElementwiseSubOp(OpTest):
+
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.dtype = np.float64
@@ -220,8 +237,8 @@ def init_input_output(self):
         self.out = self.x - self.y
 
     def init_grad_input_output(self):
-        self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
-            self.shape, self.dtype)
+        self.grad_out = np.ones(
+            self.shape, self.dtype) + 1J * np.ones(self.shape, self.dtype)
         self.grad_x = self.grad_out
         self.grad_y = -self.grad_out
 
@@ -229,30 +246,28 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out])
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out])
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out])
 
 
 class TestRealComplexElementwiseSubOp(TestComplexElementwiseSubOp):
+
     def init_input_output(self):
         self.x = np.random.random(self.shape).astype(self.dtype)
         self.y = np.random.random(self.shape).astype(
@@ -260,13 +275,14 @@ def init_input_output(self):
         self.out = self.x - self.y
 
     def init_grad_input_output(self):
-        self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
-            self.shape, self.dtype)
+        self.grad_out = np.ones(
+            self.shape, self.dtype) + 1J * np.ones(self.shape, self.dtype)
         self.grad_x = np.real(self.grad_out)
         self.grad_y = -self.grad_out
 
 
 class TestSubtractApi(unittest.TestCase):
+
     def _executed_api(self, x, y, name=None):
         return paddle.subtract(x, y, name)
 
@@ -309,11 +325,13 @@ def test_dygraph(self):
 
 
 class TestSubtractInplaceApi(TestSubtractApi):
+
     def _executed_api(self, x, y, name=None):
         return x.subtract_(y, name)
 
 
 class TestSubtractInplaceBroadcastSuccess(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 4).astype('float')
         self.y_numpy = np.random.rand(3, 4).astype('float')
@@ -330,18 +348,21 @@ def test_broadcast_success(self):
 
 
 class TestSubtractInplaceBroadcastSuccess2(TestSubtractInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(1, 2, 3, 1).astype('float')
         self.y_numpy = np.random.rand(3, 1).astype('float')
 
 
 class TestSubtractInplaceBroadcastSuccess3(TestSubtractInplaceBroadcastSuccess):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 3, 1, 5).astype('float')
         self.y_numpy = np.random.rand(1, 3, 1, 5).astype('float')
 
 
 class TestSubtractInplaceBroadcastError(unittest.TestCase):
+
     def init_data(self):
         self.x_numpy = np.random.rand(3, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
@@ -360,12 +381,14 @@ def broadcast_shape_error():
 
 
 class TestSubtractInplaceBroadcastError2(TestSubtractInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(2, 1, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
 
 
 class TestSubtractInplaceBroadcastError3(TestSubtractInplaceBroadcastError):
+
     def init_data(self):
         self.x_numpy = np.random.rand(5, 2, 1, 4).astype('float')
         self.y_numpy = np.random.rand(2, 3, 4).astype('float')
diff --git a/python/paddle/fluid/tests/unittests/test_ema.py b/python/paddle/fluid/tests/unittests/test_ema.py
index ec992a8132aa7..ae0dff4edf9e3 100644
--- a/python/paddle/fluid/tests/unittests/test_ema.py
+++ b/python/paddle/fluid/tests/unittests/test_ema.py
@@ -20,6 +20,7 @@
 
 
 class TestExponentialMovingAverage(unittest.TestCase):
+
     def setUp(self):
         self._places = [fluid.CPUPlace()]
         if fluid.core.is_compiled_with_cuda():
@@ -63,8 +64,8 @@ def train(self, place):
                 params.append(tmp_param)
 
         with self._ema.apply(exe):
-            final_ema = np.array(fluid.global_scope().find_var(self._param_name)
-                                 .get_tensor())
+            final_ema = np.array(fluid.global_scope().find_var(
+                self._param_name).get_tensor())
             data = np.random.random(size=(10, 5)).astype('float32')
             exe.run(program=self._test_program, feed={'x': data})
         return params, final_ema
@@ -75,8 +76,8 @@ def test_check_ema(self):
             manu_ema = np.zeros_like(final_ema)
             if len(params) > 0:
                 for param in params:
-                    manu_ema = self._ema_decay * manu_ema + (1 - self._ema_decay
-                                                             ) * param
+                    manu_ema = self._ema_decay * manu_ema + (
+                        1 - self._ema_decay) * param
                 manu_ema = manu_ema / (1.0 - self._ema_decay**len(params))
             self.assertTrue(np.allclose(manu_ema, final_ema))
 
diff --git a/python/paddle/fluid/tests/unittests/test_ema_fleet.py b/python/paddle/fluid/tests/unittests/test_ema_fleet.py
index e0526deb59af8..c08f811a1783d 100644
--- a/python/paddle/fluid/tests/unittests/test_ema_fleet.py
+++ b/python/paddle/fluid/tests/unittests/test_ema_fleet.py
@@ -26,6 +26,7 @@ def gen_data():
 
 
 class TestFleetStaticEMA(unittest.TestCase):
+
     def setUp(self):
         self._places = [paddle.CPUPlace()]
         if paddle.device.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
index 120880a5fc969..a1a4a263d936a 100644
--- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py
@@ -20,6 +20,7 @@
 
 
 class TestEmbeddingIdStopGradientBase(unittest.TestCase):
+
     def setUp(self):
         self.reshape_times = 1
         self.iteration = 10
@@ -70,15 +71,17 @@ def run_program(self, place, stop_gradient=False):
 
                 fetch_val = None
                 for _ in six.moves.range(self.iteration):
-                    fetch_val = exe.run(
-                        feed={x_1.name: x1_data,
-                              x_2.name: x2_data},
-                        fetch_list=[emb])[0]
+                    fetch_val = exe.run(feed={
+                        x_1.name: x1_data,
+                        x_2.name: x2_data
+                    },
+                                        fetch_list=[emb])[0]
 
                 return fetch_val
 
 
 class TestEmbeddingIdStopGradient2(TestEmbeddingIdStopGradientBase):
+
     def setUp(self):
         self.reshape_times = 100
         self.iteration = 10
diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
index 385a0c0b6e84c..ea37f6a6d1ac0 100644
--- a/python/paddle/fluid/tests/unittests/test_empty_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
@@ -24,16 +24,17 @@
 
 
 class TestEmptyLikeAPICommon(unittest.TestCase):
+
     def __check_out__(self, out):
         data_type = convert_dtype(out.dtype)
-        self.assertEqual(data_type, self.dst_dtype,
-                         'dtype should be %s, but get %s' %
-                         (self.dst_dtype, data_type))
+        self.assertEqual(
+            data_type, self.dst_dtype,
+            'dtype should be %s, but get %s' % (self.dst_dtype, data_type))
 
         shape = out.shape
-        self.assertTupleEqual(shape, self.dst_shape,
-                              'shape should be %s, but get %s' %
-                              (self.dst_shape, shape))
+        self.assertTupleEqual(
+            shape, self.dst_shape,
+            'shape should be %s, but get %s' % (self.dst_shape, shape))
 
         if data_type in ['float32', 'float64', 'int32', 'int64']:
             max_value = np.nanmax(out)
@@ -53,6 +54,7 @@ def __check_out__(self, out):
 
 
 class TestEmptyLikeAPI(TestEmptyLikeAPICommon):
+
     def setUp(self):
         self.init_config()
 
@@ -70,6 +72,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI2(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("float64")
         self.dtype = self.x.dtype
@@ -78,6 +81,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI3(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("int")
         self.dtype = self.x.dtype
@@ -86,6 +90,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI4(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("int64")
         self.dtype = self.x.dtype
@@ -94,6 +99,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI5(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("bool")
         self.dtype = self.x.dtype
@@ -102,6 +108,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI6(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("float64")
         self.dtype = "float32"
@@ -110,6 +117,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI7(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("int")
         self.dtype = "float32"
@@ -118,6 +126,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI8(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("int64")
         self.dtype = "float32"
@@ -126,6 +135,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI9(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("bool")
         self.dtype = "float32"
@@ -134,6 +144,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI10(TestEmptyLikeAPI):
+
     def init_config(self):
         self.x = np.random.random((200, 3)).astype("float32")
         self.dtype = "bool"
@@ -142,6 +153,7 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI_Static(TestEmptyLikeAPICommon):
+
     def setUp(self):
         self.init_config()
 
@@ -155,13 +167,14 @@ def test_static_graph(self):
 
         with program_guard(train_program, startup_program):
             x = np.random.random(self.x_shape).astype(dtype)
-            data_x = paddle.static.data(
-                'x', shape=self.data_x_shape, dtype=dtype)
+            data_x = paddle.static.data('x',
+                                        shape=self.data_x_shape,
+                                        dtype=dtype)
 
             out = paddle.empty_like(data_x)
 
-        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         exe = paddle.static.Executor(place)
         res = exe.run(train_program, feed={'x': x}, fetch_list=[out])
 
@@ -177,13 +190,16 @@ def init_config(self):
 
 
 class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static):
+
     def init_config(self):
         self.x_shape = (3, 200, 3)
         self.data_x_shape = [-1, 200, 3]
 
 
 class TestEmptyError(unittest.TestCase):
+
     def test_attr(self):
+
         def test_dtype():
             x = np.random.random((200, 3)).astype("float64")
             dtype = 'uint8'
diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py
index 371c59a1b8cce..50580cded909b 100644
--- a/python/paddle/fluid/tests/unittests/test_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_empty_op.py
@@ -25,6 +25,7 @@
 
 # Situation 1: Attr(shape) is a list(without tensor)
 class TestEmptyOp(OpTest):
+
     def setUp(self):
         self.op_type = "empty"
         self.init_config()
@@ -61,6 +62,7 @@ def init_config(self):
 
 
 class TestEmptyOp2(TestEmptyOp):
+
     def init_config(self):
         shape = [500, 3]
         dtype = 'float64'
@@ -71,6 +73,7 @@ def init_config(self):
 
 
 class TestEmptyOp3(TestEmptyOp):
+
     def init_config(self):
         shape = [500, 3]
         dtype = 'int32'
@@ -81,6 +84,7 @@ def init_config(self):
 
 
 class TestEmptyOp4(TestEmptyOp):
+
     def init_config(self):
         shape = [500, 3]
         dtype = 'int64'
@@ -91,6 +95,7 @@ def init_config(self):
 
 
 class TestEmptyOp5(TestEmptyOp):
+
     def init_config(self):
         shape = [500, 3]
         dtype = 'bool'
@@ -102,6 +107,7 @@ def init_config(self):
 
 # Situation 2: shape is a tensor
 class TestEmptyOp_ShapeTensor(OpTest):
+
     def setUp(self):
         self.op_type = "empty"
         self.init_config()
@@ -139,6 +145,7 @@ def verify_output(self, outs):
 
 # Situation 3: Attr(shape) is a list(with tensor)
 class TestEmptyOp_ShapeTensorList(OpTest):
+
     def setUp(self):
         self.op_type = "empty"
         self.init_config()
@@ -183,6 +190,7 @@ def verify_output(self, outs):
 
 
 class TestEmptyAPI(unittest.TestCase):
+
     def __check_out__(self, out, dtype='float32'):
         max_value = np.nanmax(np.array(out))
         min_value = np.nanmin(np.array(out))
@@ -228,12 +236,15 @@ def test_static_graph(self):
         positive_2_int32 = fluid.layers.fill_constant([1], "int32", 3)
         positive_2_int64 = fluid.layers.fill_constant([1], "int64", 3)
 
-        shape_tensor_int32 = fluid.data(
-            name="shape_tensor_int32", shape=[2], dtype="int32")
-        shape_tensor_int64 = fluid.data(
-            name="shape_tensor_int64", shape=[2], dtype="int64")
-        shape_tensor_unknown = fluid.data(
-            name="shape_tensor_unknown", shape=[-1], dtype="int64")
+        shape_tensor_int32 = fluid.data(name="shape_tensor_int32",
+                                        shape=[2],
+                                        dtype="int32")
+        shape_tensor_int64 = fluid.data(name="shape_tensor_int64",
+                                        shape=[2],
+                                        dtype="int64")
+        shape_tensor_unknown = fluid.data(name="shape_tensor_unknown",
+                                          shape=[-1],
+                                          dtype="int64")
 
         out_1 = paddle.empty(shape=[200, 3], dtype=dtype)
         out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype)
@@ -262,7 +273,9 @@ def test_static_graph(self):
 
 
 class TestEmptyError(unittest.TestCase):
+
     def test_attr(self):
+
         def test_dtype():
             shape = [200, 3]
             dtype = 'uint8'
diff --git a/python/paddle/fluid/tests/unittests/test_entry_attr.py b/python/paddle/fluid/tests/unittests/test_entry_attr.py
index bdfe95560e594..e963fbd81bc80 100644
--- a/python/paddle/fluid/tests/unittests/test_entry_attr.py
+++ b/python/paddle/fluid/tests/unittests/test_entry_attr.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import paddle
+
 paddle.enable_static()
 
 import unittest
@@ -23,6 +24,7 @@
 
 
 class EntryAttrChecks(unittest.TestCase):
+
     def base(self):
         with self.assertRaises(NotImplementedError):
             from paddle.distributed.entry_attr import EntryAttr
@@ -62,12 +64,11 @@ def spaese_layer(self):
 
         with fluid.scope_guard(scope):
             with fluid.program_guard(prog):
-                input = fluid.layers.data(
-                    name="dnn_data",
-                    shape=[-1, 1],
-                    dtype="int64",
-                    lod_level=1,
-                    append_batch_size=False)
+                input = fluid.layers.data(name="dnn_data",
+                                          shape=[-1, 1],
+                                          dtype="int64",
+                                          lod_level=1,
+                                          append_batch_size=False)
                 prob = ProbabilityEntry(0.5)
                 emb = paddle.static.nn.sparse_embedding(
                     input=input,
@@ -93,6 +94,7 @@ def spaese_layer(self):
 
 
 class TestEntryAttrs(EntryAttrChecks):
+
     def test_base(self):
         self.base()
 
diff --git a/python/paddle/fluid/tests/unittests/test_entry_attr2.py b/python/paddle/fluid/tests/unittests/test_entry_attr2.py
index 96301c4a878d3..87d8bb70f3841 100644
--- a/python/paddle/fluid/tests/unittests/test_entry_attr2.py
+++ b/python/paddle/fluid/tests/unittests/test_entry_attr2.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import paddle
+
 paddle.enable_static()
 
 import unittest
@@ -24,18 +25,18 @@
 
 
 class EntryAttrChecks(unittest.TestCase):
+
     def embedding_layer(self):
         prog = fluid.Program()
         scope = fluid.core.Scope()
 
         with fluid.scope_guard(scope):
             with fluid.program_guard(prog):
-                input = fluid.layers.data(
-                    name="dnn_data",
-                    shape=[-1, 1],
-                    dtype="int64",
-                    lod_level=1,
-                    append_batch_size=False)
+                input = fluid.layers.data(name="dnn_data",
+                                          shape=[-1, 1],
+                                          dtype="int64",
+                                          lod_level=1,
+                                          append_batch_size=False)
                 emb = fluid.layers.embedding(
                     input=input,
                     size=[100, 10],
@@ -56,6 +57,7 @@ def embedding_layer(self):
 
 
 class TestEntryAttrs(EntryAttrChecks):
+
     def test_embedding_layer(self):
         self.embedding_layer()
 
diff --git a/python/paddle/fluid/tests/unittests/test_erf_op.py b/python/paddle/fluid/tests/unittests/test_erf_op.py
index 964e704c6a2cc..c7d7b3abc9a37 100644
--- a/python/paddle/fluid/tests/unittests/test_erf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_erf_op.py
@@ -25,6 +25,7 @@
 
 
 class TestErfOp(OpTest):
+
     def setUp(self):
         self.op_type = "erf"
         self.dtype = self._init_dtype()
@@ -45,6 +46,7 @@ def test_check_grad(self):
 
 
 class TestErfLayer(unittest.TestCase):
+
     def _test_case(self, place):
         x = np.random.uniform(-1, 1, size=(11, 17)).astype(np.float64)
         y_ref = erf(x)
diff --git a/python/paddle/fluid/tests/unittests/test_erfinv_op.py b/python/paddle/fluid/tests/unittests/test_erfinv_op.py
index 5b5a7c0384316..4f10f1daaf716 100644
--- a/python/paddle/fluid/tests/unittests/test_erfinv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_erfinv_op.py
@@ -26,6 +26,7 @@
 
 
 class TestErfinv(OpTest):
+
     def setUp(self):
         self.op_type = "erfinv"
         self.python_api = paddle.erfinv
@@ -46,19 +47,20 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[self.gradient],
-            user_defined_grad_outputs=self.grad_out)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.gradient],
+                        user_defined_grad_outputs=self.grad_out)
 
 
 class TestErfinvFP32(TestErfinv):
+
     def init_dtype(self):
         self.dtype = np.float32
 
 
 class TestErfinvAPI(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = 'float32'
 
@@ -86,6 +88,7 @@ def run(place):
             run(place)
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             x = paddle.to_tensor(self.x)
@@ -97,6 +100,7 @@ def run(place):
             run(place)
 
     def test_inplace_api(self):
+
         def run(place):
             paddle.disable_static(place)
             x = paddle.to_tensor(self.x)
diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py
index adc7386bdeba6..6e826dacf7ca5 100644
--- a/python/paddle/fluid/tests/unittests/test_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_exception.py
@@ -24,6 +24,7 @@
 
 
 class TestException(unittest.TestCase):
+
     def test_exception(self):
         exception = None
         try:
@@ -37,6 +38,7 @@ def test_exception(self):
 
 
 class TestExceptionNoCStack(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         # test no C++ stack format
@@ -60,8 +62,10 @@ def test_exception_in_static_mode(self):
 
         with self.assertRaises(ValueError):
             exe.run(fluid.default_main_program(),
-                    feed={'X': x,
-                          'Y': y},
+                    feed={
+                        'X': x,
+                        'Y': y
+                    },
                     fetch_list=[avg_loss.name])
 
     def test_exception_in_dynamic_mode(self):
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
index ebe820cb90ae2..1f3394b6019e8 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
@@ -23,17 +23,17 @@
 
 
 class TestExecutor(unittest.TestCase):
+
     def test_mul(self):
         i = zeros(shape=[1], dtype='int64')
         a = data(name='a', shape=[784], dtype='float32')
         array = array_write(x=a, i=i)
 
         i = increment(i)
-        b = data(
-            name='b',
-            shape=[784, 100],
-            dtype='float32',
-            append_batch_size=False)
+        b = data(name='b',
+                 shape=[784, 100],
+                 dtype='float32',
+                 append_batch_size=False)
         array_write(x=b, i=i, array=array)
 
         i = increment(i)
@@ -44,8 +44,10 @@ def test_mul(self):
         b_np = numpy.random.random((784, 100)).astype('float32')
 
         exe = Executor()
-        res, res_array = exe.run(feed={'a': a_np,
-                                       'b': b_np},
+        res, res_array = exe.run(feed={
+            'a': a_np,
+            'b': b_np
+        },
                                  fetch_list=[out, array])
 
         self.assertEqual((100, 100), res.shape)
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py b/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
index 96d23174071f2..ad7a319f9c2e1 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
@@ -23,16 +23,16 @@
 
 
 class TestExecutor(unittest.TestCase):
+
     def test_mul(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
             a = fluid.layers.data(name='a', shape=[784], dtype='float32')
-            b = fluid.layers.data(
-                name='b',
-                shape=[784, 100],
-                dtype='float32',
-                append_batch_size=False)
+            b = fluid.layers.data(name='b',
+                                  shape=[784, 100],
+                                  dtype='float32',
+                                  append_batch_size=False)
             output = fluid.layers.mul(x=a, y=b)
 
         # Compute with numpy
@@ -50,8 +50,10 @@ def _train(use_program_cache, max_iters=1):
             for i in range(max_iters):
                 begin = time.time()
                 outs = exe.run(program=main_program,
-                               feed={'a': a_np,
-                                     'b': b_np},
+                               feed={
+                                   'a': a_np,
+                                   'b': b_np
+                               },
                                fetch_list=[output.name],
                                use_program_cache=use_program_cache)
                 end = time.time()
@@ -62,24 +64,25 @@ def _train(use_program_cache, max_iters=1):
             return run_time
 
         max_iters = 3
-        run_time_with_cache = _train(
-            use_program_cache=True, max_iters=max_iters)
+        run_time_with_cache = _train(use_program_cache=True,
+                                     max_iters=max_iters)
         print("run time with program cache: %f" % run_time_with_cache)
 
-        run_time_without_cache = _train(
-            use_program_cache=False, max_iters=max_iters)
+        run_time_without_cache = _train(use_program_cache=False,
+                                        max_iters=max_iters)
         print("run time without program cache: %f" % run_time_without_cache)
 
-        run_time_with_cache = _train(
-            use_program_cache=True, max_iters=max_iters)
+        run_time_with_cache = _train(use_program_cache=True,
+                                     max_iters=max_iters)
         print("run time with program cache: %f" % run_time_with_cache)
 
-        run_time_with_cache = _train(
-            use_program_cache=True, max_iters=max_iters)
+        run_time_with_cache = _train(use_program_cache=True,
+                                     max_iters=max_iters)
         print("run time with program cache: %f" % run_time_with_cache)
 
 
 class ExecutorPaddingRNNTest(PaddingRNNTestBase):
+
     def train_and_save_inference_program(self,
                                          rnn_model="static",
                                          parallel=True,
@@ -98,8 +101,9 @@ def train_and_save_inference_program(self,
     def test_inference_output(self):
         for rnn_model in ["static", "padding"]:
             # Set parallel to False to use the default executor.
-            self.train_and_save_inference_program(
-                rnn_model=rnn_model, parallel=True, use_program_cache=True)
+            self.train_and_save_inference_program(rnn_model=rnn_model,
+                                                  parallel=True,
+                                                  use_program_cache=True)
 
             x_np = numpy.random.random(
                 (self.config.batch_size, self.config.num_steps,
@@ -134,14 +138,14 @@ def test_inference_output(self):
                         results_with_cache = results
                     else:
                         results_without_cache = results
-            self.assertEqual(
-                len(results_with_cache), len(results_without_cache))
+            self.assertEqual(len(results_with_cache),
+                             len(results_without_cache))
             for i in range(len(results_with_cache)):
                 self.assertEqual(results_with_cache[i].shape,
                                  results_without_cache[i].shape)
                 self.assertTrue(
-                    numpy.allclose(results_with_cache[i], results_without_cache[
-                        i]))
+                    numpy.allclose(results_with_cache[i],
+                                   results_without_cache[i]))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
index 6b1e3c5a28a54..a35ebfbab173e 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py
@@ -22,6 +22,7 @@
 
 
 class TestExecutor(unittest.TestCase):
+
     def net(self):
         lr = fluid.data(name="lr", shape=[1], dtype='float32')
         x = fluid.data(name="x", shape=[None, 1], dtype='float32')
@@ -50,8 +51,10 @@ def test_program_check_feed(self):
                 y_true = [[2.0], [4.0], [6.0], [8.0]]
                 a = 0
                 with self.assertRaises(ValueError):
-                    exe.run(feed={'x': train_data,
-                                  'lr': a},
+                    exe.run(feed={
+                        'x': train_data,
+                        'lr': a
+                    },
                             fetch_list=[lr, cost],
                             return_numpy=False,
                             use_prune=True)
@@ -73,8 +76,10 @@ def test_compiled_program_check_feed(self):
                 a = 0
                 with self.assertRaises(ValueError):
                     exe.run(compiled_prog,
-                            feed={'x': train_data,
-                                  'lr': a},
+                            feed={
+                                'x': train_data,
+                                'lr': a
+                            },
                             fetch_list=[lr, cost],
                             return_numpy=False,
                             use_prune=True)
diff --git a/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py b/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py
index 1af2009f217e3..9d1c902fdc29d 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestCheckFetchList(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.feed = {"x": np.array([[0], [0], [1], [0]], dtype='float32')}
@@ -33,8 +34,10 @@ def build_program(self):
         main_program = paddle.static.Program()
         with paddle.static.program_guard(main_program):
             x = paddle.static.data(name='x', shape=[4, 1], dtype='float32')
-            output = paddle.unique_consecutive(
-                x, return_inverse=True, return_counts=True, axis=0)
+            output = paddle.unique_consecutive(x,
+                                               return_inverse=True,
+                                               return_counts=True,
+                                               axis=0)
 
         self.main_program = main_program
         self.fetch_list = output
diff --git a/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
index 23c4191f6cfd8..05676c34e6def 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
@@ -22,6 +22,7 @@
 
 
 class TestExecutor(unittest.TestCase):
+
     def net(self):
         lr = fluid.data(name="lr", shape=[1], dtype='float32')
         x = fluid.data(name="x", shape=[None, 1], dtype='float32')
@@ -46,14 +47,16 @@ def test_program_feed_float(self):
                 exe = fluid.Executor(cpu)
                 lr, cost = self.net()
                 exe.run(startup_program)
-                train_data = numpy.array(
-                    [[1.0], [2.0], [3.0], [4.0]]).astype('float32')
-                y_true = numpy.array(
-                    [[2.0], [4.0], [6.0], [8.0]]).astype('float32')
+                train_data = numpy.array([[1.0], [2.0], [3.0],
+                                          [4.0]]).astype('float32')
+                y_true = numpy.array([[2.0], [4.0], [6.0],
+                                      [8.0]]).astype('float32')
                 a = 0.01
-                _lr, _ = exe.run(feed={'x': train_data,
-                                       'y': y_true,
-                                       'lr': a},
+                _lr, _ = exe.run(feed={
+                    'x': train_data,
+                    'y': y_true,
+                    'lr': a
+                },
                                  fetch_list=[lr, cost],
                                  return_numpy=False)
             self.assertEqual(_lr._dtype(), lr.dtype)
@@ -70,14 +73,16 @@ def test_program_feed_int(self):
                 exe = fluid.Executor(cpu)
                 lr, cost = self.net()
                 exe.run(startup_program)
-                train_data = numpy.array(
-                    [[1.0], [2.0], [3.0], [4.0]]).astype('float32')
-                y_true = numpy.array(
-                    [[2.0], [4.0], [6.0], [8.0]]).astype('float32')
+                train_data = numpy.array([[1.0], [2.0], [3.0],
+                                          [4.0]]).astype('float32')
+                y_true = numpy.array([[2.0], [4.0], [6.0],
+                                      [8.0]]).astype('float32')
                 a = 0
-                _lr, _ = exe.run(feed={'x': train_data,
-                                       'y': y_true,
-                                       'lr': a},
+                _lr, _ = exe.run(feed={
+                    'x': train_data,
+                    'y': y_true,
+                    'lr': a
+                },
                                  fetch_list=[lr, cost],
                                  return_numpy=False)
             self.assertEqual(_lr._dtype(), lr.dtype)
@@ -97,9 +102,11 @@ def test_program_feed_list(self):
                 train_data = [[1.0], [2.0], [3.0], [4.0]]
                 y_true = [[2.0], [4.0], [6.0], [8.0]]
                 a = 0
-                _lr, _ = exe.run(feed={'x': train_data,
-                                       'y': y_true,
-                                       'lr': a},
+                _lr, _ = exe.run(feed={
+                    'x': train_data,
+                    'y': y_true,
+                    'lr': a
+                },
                                  fetch_list=[lr, cost],
                                  return_numpy=False)
             self.assertEqual(_lr._dtype(), lr.dtype)
@@ -118,15 +125,17 @@ def test_compiled_program_feed_scalar(self):
                 exe.run(startup_program)
                 compiled_prog = fluid.CompiledProgram(
                     main_program).with_data_parallel(loss_name=cost.name)
-                train_data = numpy.array(
-                    [[1.0], [2.0], [3.0], [4.0]]).astype('float32')
-                y_true = numpy.array(
-                    [[2.0], [4.0], [6.0], [8.0]]).astype('float32')
+                train_data = numpy.array([[1.0], [2.0], [3.0],
+                                          [4.0]]).astype('float32')
+                y_true = numpy.array([[2.0], [4.0], [6.0],
+                                      [8.0]]).astype('float32')
                 a = 0.01
                 _lr, _ = exe.run(compiled_prog,
-                                 feed={'x': train_data,
-                                       'y': y_true,
-                                       'lr': a},
+                                 feed={
+                                     'x': train_data,
+                                     'y': y_true,
+                                     'lr': a
+                                 },
                                  fetch_list=[lr, cost],
                                  return_numpy=False)
                 self.assertEqual(_lr._dtype(), lr.dtype)
@@ -135,6 +144,7 @@ def test_compiled_program_feed_scalar(self):
 
 
 class TestAsLodTensor(unittest.TestCase):
+
     def test_as_lodtensor_int32(self):
         cpu = fluid.CPUPlace()
         tensor = fluid.executor._as_lodtensor(1.0, cpu,
diff --git a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
index a7ee6b31b09f8..81bc702128052 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py
@@ -22,6 +22,7 @@
 
 @skip_check_grad_ci(reason="Not op test but call the method of class OpTest.")
 class TestExecutorReturnTensorNotOverwritingWithOptest(OpTest):
+
     def setUp(self):
         pass
 
@@ -68,6 +69,7 @@ def test_executor_run_twice(self):
 
 
 class TestExecutorReturnTensorNotOverOverwritingWithLayers(unittest.TestCase):
+
     def setUp(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_op.py
index 150aff78508c6..aa4f0b2f3caf0 100755
--- a/python/paddle/fluid/tests/unittests/test_expand_as_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_as_op.py
@@ -31,6 +31,7 @@ def bcast(x, target_tensor):
 
 
 class TestExpandAsOpRank1(OpTest):
+
     def setUp(self):
         self.op_type = "expand_as"
         x = np.random.rand(100).astype("float64")
@@ -49,6 +50,7 @@ def test_check_grad(self):
 
 
 class TestExpandAsOpRank2(OpTest):
+
     def setUp(self):
         self.op_type = "expand_as"
         x = np.random.rand(10, 12).astype("float64")
@@ -67,6 +69,7 @@ def test_check_grad(self):
 
 
 class TestExpandAsOpRank3(OpTest):
+
     def setUp(self):
         self.op_type = "expand_as"
         x = np.random.rand(2, 3, 20).astype("float64")
@@ -85,6 +88,7 @@ def test_check_grad(self):
 
 
 class TestExpandAsOpRank4(OpTest):
+
     def setUp(self):
         self.op_type = "expand_as"
         x = np.random.rand(1, 1, 7, 16).astype("float64")
@@ -104,6 +108,7 @@ def test_check_grad(self):
 
 # Test dygraph API
 class TestExpandAsDygraphAPI(unittest.TestCase):
+
     def test_api(self):
         import paddle
         paddle.disable_static()
@@ -119,24 +124,28 @@ def test_api(self):
 
 # Test python API
 class TestExpandAsAPI(unittest.TestCase):
+
     def test_api(self):
         input1 = np.random.random([12, 14]).astype("float32")
         input2 = np.random.random([48, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+        x = fluid.layers.data(name='x',
+                              shape=[12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
-        y = fluid.layers.data(
-            name='target_tensor',
-            shape=[48, 14],
-            append_batch_size=False,
-            dtype="float32")
+        y = fluid.layers.data(name='target_tensor',
+                              shape=[48, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
         out_1 = fluid.layers.expand_as(x, target_tensor=y)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1 = exe.run(fluid.default_main_program(),
-                        feed={"x": input1,
-                              "target_tensor": input2},
+                        feed={
+                            "x": input1,
+                            "target_tensor": input2
+                        },
                         fetch_list=[out_1])
         assert np.array_equal(res_1[0], np.tile(input1, (4, 1)))
 
diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py
index 3bf6868fed9c9..f107fec1c4e4e 100755
--- a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py
@@ -22,6 +22,7 @@
 
 
 class TestExpandAsBasic(OpTest):
+
     def setUp(self):
         self.op_type = "expand_as_v2"
         self.python_api = paddle.expand_as
@@ -41,6 +42,7 @@ def test_check_grad(self):
 
 
 class TestExpandAsOpRank2(TestExpandAsBasic):
+
     def setUp(self):
         self.op_type = "expand_as_v2"
         self.python_api = paddle.expand_as
@@ -54,6 +56,7 @@ def setUp(self):
 
 
 class TestExpandAsOpRank3(TestExpandAsBasic):
+
     def setUp(self):
         self.op_type = "expand_as_v2"
         self.python_api = paddle.expand_as
@@ -67,6 +70,7 @@ def setUp(self):
 
 
 class TestExpandAsOpRank4(TestExpandAsBasic):
+
     def setUp(self):
         self.op_type = "expand_as_v2"
         self.python_api = paddle.expand_as
@@ -80,6 +84,7 @@ def setUp(self):
 
 
 class TestExpandAsV2Error(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             x1 = fluid.layers.data(name='x1', shape=[4], dtype="uint8")
@@ -92,24 +97,28 @@ def test_errors(self):
 
 # Test python API
 class TestExpandAsV2API(unittest.TestCase):
+
     def test_api(self):
         input1 = np.random.random([12, 14]).astype("float32")
         input2 = np.random.random([2, 12, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+        x = fluid.layers.data(name='x',
+                              shape=[12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
-        y = fluid.layers.data(
-            name='target_tensor',
-            shape=[2, 12, 14],
-            append_batch_size=False,
-            dtype="float32")
+        y = fluid.layers.data(name='target_tensor',
+                              shape=[2, 12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
         out_1 = paddle.expand_as(x, y=y)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1 = exe.run(fluid.default_main_program(),
-                        feed={"x": input1,
-                              "target_tensor": input2},
+                        feed={
+                            "x": input1,
+                            "target_tensor": input2
+                        },
                         fetch_list=[out_1])
         assert np.array_equal(res_1[0], np.tile(input1, (2, 1, 1)))
 
diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py
index edda6da655ddd..d0d9a1f7e21fe 100644
--- a/python/paddle/fluid/tests/unittests/test_expand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_op.py
@@ -24,6 +24,7 @@
 
 # Situation 1: expand_times is a list(without tensor)
 class TestExpandOpRank1(OpTest):
+
     def setUp(self):
         self.op_type = "expand"
         self.init_data()
@@ -47,30 +48,35 @@ def test_check_grad(self):
 
 
 class TestExpandOpRank2_Corner(TestExpandOpRank1):
+
     def init_data(self):
         self.ori_shape = [120]
         self.expand_times = [2]
 
 
 class TestExpandOpRank2(TestExpandOpRank1):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.expand_times = [2, 3]
 
 
 class TestExpandOpRank3_Corner(TestExpandOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.expand_times = (1, 1, 1)
 
 
 class TestExpandOpRank3(TestExpandOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 15)
         self.expand_times = (2, 1, 4)
 
 
 class TestExpandOpRank4(TestExpandOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 5, 7)
         self.expand_times = (3, 2, 1, 2)
@@ -78,6 +84,7 @@ def init_data(self):
 
 # Situation 2: expand_times is a list(with tensor)
 class TestExpandOpRank1_tensor_attr(OpTest):
+
     def setUp(self):
         self.op_type = "expand"
         self.init_data()
@@ -110,6 +117,7 @@ def test_check_grad(self):
 
 
 class TestExpandOpRank2_Corner_tensor_attr(TestExpandOpRank1_tensor_attr):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.expand_times = [1, 1]
@@ -117,6 +125,7 @@ def init_data(self):
 
 
 class TestExpandOpRank2_attr_tensor(TestExpandOpRank1_tensor_attr):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.expand_times = [2, 3]
@@ -125,6 +134,7 @@ def init_data(self):
 
 # Situation 3: expand_times is a tensor
 class TestExpandOpRank1_tensor(OpTest):
+
     def setUp(self):
         self.op_type = "expand"
         self.init_data()
@@ -151,6 +161,7 @@ def test_check_grad(self):
 
 
 class TestExpandOpRank2_tensor(TestExpandOpRank1_tensor):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.expand_times = [2, 3]
@@ -158,11 +169,11 @@ def init_data(self):
 
 # Situation 4: input x is Integer
 class TestExpandOpInteger(OpTest):
+
     def setUp(self):
         self.op_type = "expand"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int32")
+            'X': np.random.randint(10, size=(2, 4, 5)).astype("int32")
         }
         self.attrs = {'expand_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
@@ -174,6 +185,7 @@ def test_check_output(self):
 
 # Situation 5: input x is Bool
 class TestExpandOpBoolean(OpTest):
+
     def setUp(self):
         self.op_type = "expand"
         self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
@@ -187,11 +199,11 @@ def test_check_output(self):
 
 # Situation 56: input x is Integer
 class TestExpandOpInt64_t(OpTest):
+
     def setUp(self):
         self.op_type = "expand"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int64")
+            'X': np.random.randint(10, size=(2, 4, 5)).astype("int64")
         }
         self.attrs = {'expand_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
@@ -202,10 +214,11 @@ def test_check_output(self):
 
 
 class TestExpandError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             expand_times = [2, 2]
             self.assertRaises(TypeError, fluid.layers.expand, x1, expand_times)
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
@@ -217,14 +230,18 @@ def test_errors(self):
 
 # Test python API
 class TestExpandAPI(unittest.TestCase):
+
     def test_api(self):
         input = np.random.random([12, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+        x = fluid.layers.data(name='x',
+                              shape=[12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
         positive_2 = fluid.layers.fill_constant([1], "int32", 2)
-        expand_times = fluid.layers.data(
-            name="expand_times", shape=[2], append_batch_size=False)
+        expand_times = fluid.layers.data(name="expand_times",
+                                         shape=[2],
+                                         append_batch_size=False)
 
         out_1 = fluid.layers.expand(x, expand_times=[2, 3])
         out_2 = fluid.layers.expand(x, expand_times=[positive_2, 3])
@@ -235,7 +252,8 @@ def test_api(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
                                       feed={
-                                          "x": input,
+                                          "x":
+                                          input,
                                           "expand_times":
                                           np.array([1, 3]).astype("int32")
                                       },
@@ -246,13 +264,14 @@ def test_api(self):
 
 
 class TestExpandDygraphAPI(unittest.TestCase):
+
     def test_expand_times_is_tensor(self):
         with paddle.fluid.dygraph.guard():
             a = paddle.rand([2, 5])
             b = paddle.fluid.layers.expand(a, expand_times=[2, 3])
-            c = paddle.fluid.layers.expand(
-                a, expand_times=paddle.to_tensor(
-                    [2, 3], dtype='int32'))
+            c = paddle.fluid.layers.expand(a,
+                                           expand_times=paddle.to_tensor(
+                                               [2, 3], dtype='int32'))
             self.assertTrue(
                 np.array_equal(b.numpy(), np.tile(a.numpy(), [2, 3])))
             self.assertTrue(
diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
index 4932ea8a1b5c9..52b9234263d96 100644
--- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py
@@ -25,6 +25,7 @@
 
 # Situation 1: shape is a list(without tensor)
 class TestExpandV2OpRank1(OpTest):
+
     def setUp(self):
         self.op_type = "expand_v2"
         self.init_data()
@@ -48,6 +49,7 @@ def test_check_grad(self):
 
 
 class TestExpandV2OpRank2_DimExpanding(TestExpandV2OpRank1):
+
     def init_data(self):
         self.ori_shape = [120]
         self.shape = [2, 120]
@@ -55,6 +57,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank2(TestExpandV2OpRank1):
+
     def init_data(self):
         self.ori_shape = [1, 140]
         self.shape = [12, 140]
@@ -62,6 +65,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank3_Corner(TestExpandV2OpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.shape = (2, 10, 5)
@@ -69,6 +73,7 @@ def init_data(self):
 
 
 class TestExpandV2OpRank4(TestExpandV2OpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 5, 7)
         self.shape = (-1, -1, -1, -1)
@@ -77,6 +82,7 @@ def init_data(self):
 
 # Situation 2: shape is a list(with tensor)
 class TestExpandV2OpRank1_tensor_attr(OpTest):
+
     def setUp(self):
         self.op_type = "expand_v2"
         self.init_data()
@@ -107,6 +113,7 @@ def test_check_grad(self):
 
 
 class TestExpandV2OpRank2_Corner_tensor_attr(TestExpandV2OpRank1_tensor_attr):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.expand_times = [1, 1]
@@ -116,6 +123,7 @@ def init_data(self):
 
 # Situation 3: shape is a tensor
 class TestExpandV2OpRank1_tensor(OpTest):
+
     def setUp(self):
         self.op_type = "expand_v2"
         self.init_data()
@@ -142,11 +150,11 @@ def test_check_grad(self):
 
 # Situation 4: input x is Integer
 class TestExpandV2OpInteger(OpTest):
+
     def setUp(self):
         self.op_type = "expand_v2"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int32")
+            'X': np.random.randint(10, size=(2, 4, 5)).astype("int32")
         }
         self.attrs = {'shape': [2, 4, 5]}
         output = np.tile(self.inputs['X'], (1, 1, 1))
@@ -158,6 +166,7 @@ def test_check_output(self):
 
 # Situation 5: input x is Bool
 class TestExpandV2OpBoolean(OpTest):
+
     def setUp(self):
         self.op_type = "expand_v2"
         self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
@@ -171,11 +180,11 @@ def test_check_output(self):
 
 # Situation 56: input x is Integer
 class TestExpandV2OpInt64_t(OpTest):
+
     def setUp(self):
         self.op_type = "expand_v2"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int64")
+            'X': np.random.randint(10, size=(2, 4, 5)).astype("int64")
         }
         self.attrs = {'shape': [2, 4, 5]}
         output = np.tile(self.inputs['X'], (1, 1, 1))
@@ -186,10 +195,11 @@ def test_check_output(self):
 
 
 class TestExpandV2Error(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             shape = [2, 2]
             self.assertRaises(TypeError, paddle.tensor.expand, x1, shape)
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
@@ -201,17 +211,19 @@ def test_errors(self):
 
 # Test python API
 class TestExpandV2API(unittest.TestCase):
+
     def test_api(self):
         input = np.random.random([12, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+        x = fluid.layers.data(name='x',
+                              shape=[12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
         positive_2 = fluid.layers.fill_constant([1], "int32", 12)
-        expand_shape = fluid.layers.data(
-            name="expand_shape",
-            shape=[2],
-            append_batch_size=False,
-            dtype="int32")
+        expand_shape = fluid.layers.data(name="expand_shape",
+                                         shape=[2],
+                                         append_batch_size=False,
+                                         dtype="int32")
 
         out_1 = paddle.expand(x, shape=[12, 14])
         out_2 = paddle.expand(x, shape=[positive_2, 14])
@@ -222,7 +234,8 @@ def test_api(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
                                       feed={
-                                          "x": input,
+                                          "x":
+                                          input,
                                           "expand_shape":
                                           np.array([12, 14]).astype("int32")
                                       },
@@ -233,19 +246,22 @@ def test_api(self):
 
 
 class TestExpandInferShape(unittest.TestCase):
+
     def test_shape_with_var(self):
         with program_guard(Program(), Program()):
             x = paddle.static.data(shape=[-1, 1, 3], name='x')
             fake_var = paddle.randn([2, 3])
             target_shape = [
-                -1, paddle.shape(fake_var)[0], paddle.shape(fake_var)[1]
+                -1, paddle.shape(fake_var)[0],
+                paddle.shape(fake_var)[1]
             ]
             out = paddle.expand(x, shape=target_shape)
             self.assertListEqual(list(out.shape), [-1, -1, -1])
 
 
-# Test python Dygraph API 
+# Test python Dygraph API
 class TestExpandV2DygraphAPI(unittest.TestCase):
+
     def test_expand_times_is_tensor(self):
         with paddle.fluid.dygraph.guard():
             with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py
index c8f4101ea5d6b..57c4fb02d858a 100644
--- a/python/paddle/fluid/tests/unittests/test_exponential_op.py
+++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py
@@ -23,6 +23,7 @@
 
 
 class TestExponentialOp1(OpTest):
+
     def setUp(self):
         self.op_type = "exponential"
         self.config()
@@ -48,29 +49,28 @@ def verify_output(self, outs):
         hist2 = hist2.astype("float32")
         hist2 = hist2 / float(data_np.size)
 
-        self.assertTrue(
-            np.allclose(
-                hist1, hist2, rtol=0.02),
-            "actual: {}, expected: {}".format(hist1, hist2))
+        self.assertTrue(np.allclose(hist1, hist2, rtol=0.02),
+                        "actual: {}, expected: {}".format(hist1, hist2))
 
     def test_check_grad_normal(self):
         self.check_grad(
             ['X'],
             'Out',
-            user_defined_grads=[np.zeros(
-                [1024, 1024], dtype=self.dtype)],
+            user_defined_grads=[np.zeros([1024, 1024], dtype=self.dtype)],
             user_defined_grad_outputs=[
                 np.random.rand(1024, 1024).astype(self.dtype)
             ])
 
 
 class TestExponentialOp2(TestExponentialOp1):
+
     def config(self):
         self.lam = 0.25
         self.dtype = "float32"
 
 
 class TestExponentialAPI(unittest.TestCase):
+
     def test_static(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_eye_op.py b/python/paddle/fluid/tests/unittests/test_eye_op.py
index 704762d809414..d74cabb4275ad 100644
--- a/python/paddle/fluid/tests/unittests/test_eye_op.py
+++ b/python/paddle/fluid/tests/unittests/test_eye_op.py
@@ -24,6 +24,7 @@
 
 
 class TestEyeOp(OpTest):
+
     def setUp(self):
         '''
 	Test eye op with specified shape
@@ -44,6 +45,7 @@ def test_check_output(self):
 
 
 class TestEyeOp1(OpTest):
+
     def setUp(self):
         '''
 	Test eye op with default parameters
@@ -60,6 +62,7 @@ def test_check_output(self):
 
 
 class TestEyeOp2(OpTest):
+
     def setUp(self):
         '''
         Test eye op with specified shape
@@ -76,6 +79,7 @@ def test_check_output(self):
 
 
 class API_TestTensorEye(unittest.TestCase):
+
     def test_out(self):
         with paddle.static.program_guard(paddle.static.Program()):
             data = paddle.eye(10)
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 07f3eaa04ad2e..adfd15f2dd176 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -75,6 +75,7 @@ def channel_wise_dequantize_max_abs(x,
 
 
 class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
+
     def set_args(self):
         self.quant_bits = [8, 8]
         self.activation_scale = 0.7861
@@ -92,10 +93,11 @@ def setUp(self):
                                               self.activation_scale)
 
         self.inputs = {
-            'X': yq,
-            'Scales': [("scales0", np.array(scales).astype(self.dtype)),
-                       ("scales1",
-                        np.array([self.activation_scale]).astype(self.dtype))]
+            'X':
+            yq,
+            'Scales':
+            [("scales0", np.array(scales).astype(self.dtype)),
+             ("scales1", np.array([self.activation_scale]).astype(self.dtype))]
         }
         self.attrs = {'quant_bits': self.quant_bits}
         self.outputs = {'Out': ydq}
@@ -106,6 +108,7 @@ def test_check_output(self):
 
 class TestFakeChannelWiseDequantizeMaxAbsOpTwoScalesFloat16(
         TestFakeChannelWiseDequantizeMaxAbsOpTwoScales):
+
     def set_dtype(self):
         self.dtype = np.float16
 
@@ -114,6 +117,7 @@ def test_check_output(self):
 
 
 class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
+
     def set_args(self):
         self.quant_bits = [8]
         self.quant_axis = 0
@@ -147,6 +151,7 @@ def test_check_output(self):
 
 class TestFakeChannelWiseDequantizeMaxAbsOpOneScale1(
         TestFakeChannelWiseDequantizeMaxAbsOpOneScale):
+
     def set_args(self):
         self.quant_bits = [8]
         self.quant_axis = 1
@@ -154,6 +159,7 @@ def set_args(self):
 
 class TestFakeChannelWiseDequantizeMaxAbsOpOneScaleFloat16(
         TestFakeChannelWiseDequantizeMaxAbsOpOneScale):
+
     def set_dtype(self):
         self.dtype = np.float16
 
@@ -163,6 +169,7 @@ def test_check_output(self):
 
 class TestFakeChannelWiseDequantizeMaxAbsOpOneScale1Float16(
         TestFakeChannelWiseDequantizeMaxAbsOpOneScale1):
+
     def set_dtype(self):
         self.dtype = np.float16
 
@@ -171,6 +178,7 @@ def test_check_output(self):
 
 
 class TestFakeDequantizeMaxAbsOp(OpTest):
+
     def set_args(self):
         self.num_bits = 8
         self.max_range = math.pow(2, self.num_bits - 1) - 1
@@ -195,17 +203,20 @@ def test_check_output(self):
 
 
 class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp):
+
     def set_dtype(self):
         self.dtype = np.float64
 
 
 class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp):
+
     def set_args(self):
         self.num_bits = 5
         self.max_range = math.pow(2, self.num_bits - 1) - 1
 
 
 class TestFakeDequantizeMaxAbsOpFloat16(TestFakeDequantizeMaxAbsOp):
+
     def set_dtype(self):
         self.dtype = np.float16
 
@@ -214,6 +225,7 @@ def test_check_output(self):
 
 
 class TestChannelWiseDequantizeOp(OpTest):
+
     def set_args(self):
         self.bit_length = 8
         self.data_type = "float32"
@@ -242,6 +254,7 @@ def test_check_output(self):
 
 
 class TestChannelWiseDequantizeOp1(TestChannelWiseDequantizeOp):
+
     def set_args(self):
         self.bit_length = 8
         self.data_type = "float32"
@@ -249,6 +262,7 @@ def set_args(self):
 
 
 class TestDequantizeOp(OpTest):
+
     def set_args(self):
         self.bit_length = 8
         self.quant_axis = -1
@@ -276,6 +290,7 @@ def test_check_output(self):
 
 
 class TestDequantizeOpDouble(TestDequantizeOp):
+
     def set_args(self):
         self.bit_length = 8
         self.max_range = math.pow(2, self.bit_length - 1) - 1
@@ -284,6 +299,7 @@ def set_args(self):
 
 
 class TestDequantizeOp5Bits(TestDequantizeOp):
+
     def set_args(self):
         self.bit_length = 5
         self.max_range = math.pow(2, self.bit_length - 1) - 1
diff --git a/python/paddle/fluid/tests/unittests/test_fake_init_op.py b/python/paddle/fluid/tests/unittests/test_fake_init_op.py
index a62b7aed66b59..e094b82c41a88 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_init_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_init_op.py
@@ -21,13 +21,14 @@
 
 
 class TestFakeInitOpSelectedRows(unittest.TestCase):
+
     def check_with_place(self, place, is_selected_rows):
         scope = core.Scope()
 
         out_var_name = 'Out'
         if is_selected_rows:
-            out_tensor = scope.var(out_var_name).get_selected_rows().get_tensor(
-            )
+            out_tensor = scope.var(
+                out_var_name).get_selected_rows().get_tensor()
         else:
             out_tensor = scope.var(out_var_name).get_tensor()
 
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 0c8e115d7cebf..3693ba615d94f 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -41,6 +41,7 @@ def get_compute_type(dtype):
 
 
 class TestFakeQuantizeAbsMaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'fake_quantize_abs_max'
         self.attrs = {'bit_length': 8}
@@ -72,6 +73,7 @@ def test_fake_quantize_abs_max_underflow2(self):
 
 
 class TestFakeChannelWiseQuantizeAbsMaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'fake_channel_wise_quantize_abs_max'
         self.attrs = {'bit_length': 8}
@@ -82,8 +84,8 @@ def _fake_channel_wise_quantize_abs_max(self, dtype, input_shape,
         input_data = distribution(input_shape).astype(dtype)
         compute_type = get_compute_type(dtype)
         bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
-        compute_axis = tuple(
-            i for i in range(len(input_shape)) if i != quant_axis)
+        compute_axis = tuple(i for i in range(len(input_shape))
+                             if i != quant_axis)
         scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True)
         output_data = round_c(bnt * input_data.astype(compute_type) /
                               scale_broadcast)
@@ -105,14 +107,15 @@ def test_fake_channel_wise_quantize_abs_max(self):
         for dtype, input_shape_quant_axis in itertools.product(
                 dtype_options, input_shape_quant_axis_options):
             input_shape, quant_axis = input_shape_quant_axis
-            with self.subTest(
-                    dtype=dtype, input_shape=input_shape,
-                    quant_axis=quant_axis):
+            with self.subTest(dtype=dtype,
+                              input_shape=input_shape,
+                              quant_axis=quant_axis):
                 self._fake_channel_wise_quantize_abs_max(
                     dtype, input_shape, quant_axis, np.random.random)
 
 
 class TestFakeQuantizeRangeAbsMaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'fake_quantize_range_abs_max'
         self.attrs = {'bit_length': 5, 'window_size': 1}
@@ -162,6 +165,7 @@ def test_fake_quantize_range_abs_max(self):
 
 
 class TestMovingAverageAbsMaxScaleOp(OpTest):
+
     def setUp(self):
         self.op_type = 'moving_average_abs_max_scale'
         self.attrs = {'moving_rate': float(0.9), 'is_test': False}
@@ -194,6 +198,7 @@ def test_moving_average_abs_max(self):
 
 
 class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'fake_quantize_moving_average_abs_max'
         self.attrs = {'bit_length': 5, 'moving_rate': 0.9, 'is_test': False}
@@ -252,14 +257,14 @@ def test_fake_quantize_moving_average_abs_max_float16(self):
                                                    np.random.random)
 
     def test_fake_quantize_dequantize_moving_average_abs_max(self):
-        self._fake_quantize_moving_average_abs_max(
-            np.float32, (8, 16, 7, 7),
-            np.random.random,
-            dequantize=True,
-            with_gradient=True)
+        self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7),
+                                                   np.random.random,
+                                                   dequantize=True,
+                                                   with_gradient=True)
 
 
 class TestFakeQuantizeDequantizeAbsMaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'fake_quantize_dequantize_abs_max'
         self.attrs = {'bit_length': 8}
@@ -286,22 +291,24 @@ def test_fake_quantize_dequantize_abs_max(self):
 
 
 class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'fake_channel_wise_quantize_dequantize_abs_max'
         self.attrs = {'bit_length': 8}
 
-    def _fake_channel_wise_quantize_dequantize_abs_max(
-            self, dtype, input_shape, quant_axis, distribution):
+    def _fake_channel_wise_quantize_dequantize_abs_max(self, dtype, input_shape,
+                                                       quant_axis,
+                                                       distribution):
         assert quant_axis in [0, 1], 'quant_axis should be 0 or 1.'
         input_data = distribution(input_shape).astype(dtype)
         compute_type = get_compute_type(dtype)
         bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
         output_data = input_data.copy().astype(compute_type)
-        compute_axis = tuple(
-            i for i in range(len(input_shape)) if i != quant_axis)
+        compute_axis = tuple(i for i in range(len(input_shape))
+                             if i != quant_axis)
         scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True)
-        output_data = round_c(bnt * output_data /
-                              scale_broadcast) * scale_broadcast / bnt
+        output_data = round_c(
+            bnt * output_data / scale_broadcast) * scale_broadcast / bnt
         if quant_axis == 1:
             scale_broadcast = np.transpose(scale_broadcast,
                                            (1, ) + compute_axis)
@@ -315,8 +322,9 @@ def _fake_channel_wise_quantize_dequantize_abs_max(
         self.check_grad(['X'], 'Out', user_defined_grads=gradient)
 
     def test_channel_wise_fake_quant_dequant_abs_max(self):
-        input_shape_quant_axis_options = [[(3, 4, 64, 64), 0], [(
-            15, 20, 5, 5), 1], [(30, 15), 0], [(30, 15), 1]]
+        input_shape_quant_axis_options = [[(3, 4, 64, 64), 0],
+                                          [(15, 20, 5, 5), 1], [(30, 15), 0],
+                                          [(30, 15), 1]]
         for input_shape, quant_axis in input_shape_quant_axis_options:
             with self.subTest(input_shape=input_shape, quant_axis=quant_axis):
                 self._fake_channel_wise_quantize_dequantize_abs_max(
@@ -348,6 +356,7 @@ def channel_wise_quantize_max_abs(x, quant_bit=8, quant_axis=0):
 
 
 class TestChannelWiseQuantizeOp(OpTest):
+
     def set_args(self):
         self.bit_length = 8
         self.data_type = "float32"
@@ -374,6 +383,7 @@ def test_check_output(self):
 
 
 class TestChannelWiseQuantizeOp1(TestChannelWiseQuantizeOp):
+
     def set_args(self):
         self.bit_length = 8
         self.data_type = "float32"
@@ -381,6 +391,7 @@ def set_args(self):
 
 
 class TestChannelWiseQuantizeOpTrain(OpTest):
+
     def set_args(self):
         self.bit_length = 8
         self.data_type = "float32"
@@ -409,6 +420,7 @@ def test_check_output(self):
 
 
 class TestquantizeOp(OpTest):
+
     def set_args(self):
         self.bit_length = 8
         self.quant_axis = -1
@@ -435,6 +447,7 @@ def test_check_output(self):
 
 
 class TestquantizeOpTrain(TestquantizeOp):
+
     def set_args(self):
         self.bit_length = 8
         self.quant_axis = -1
diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py
index a460c5f252777..d6ccec25a43f8 100755
--- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py
+++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py
@@ -27,6 +27,7 @@
 from paddle import _C_ops
 
 import sys
+
 sys.path.append("./tokenizer")
 from tokenizer.bert_tokenizer import BertTokenizer
 
@@ -63,6 +64,7 @@ def to_map_tensor(string_dict, name):
 
 
 class FasterTokenizer(nn.Layer):
+
     def __init__(self, vocab_dict):
         super(FasterTokenizer, self).__init__()
         vocab_tensor = to_map_tensor(vocab_dict, "vocab")
@@ -92,28 +94,33 @@ def forward(self,
         input_ids = helper.create_variable_for_type_inference(dtype="int64")
         seg_ids = helper.create_variable_for_type_inference(dtype="int64")
         if text_pair is None:
-            helper.append_op(
-                type='faster_tokenizer',
-                inputs={'Vocab': self.vocab,
-                        'Text': text},
-                outputs={'InputIds': input_ids,
-                         'SegmentIds': seg_ids},
-                attrs=attrs)
+            helper.append_op(type='faster_tokenizer',
+                             inputs={
+                                 'Vocab': self.vocab,
+                                 'Text': text
+                             },
+                             outputs={
+                                 'InputIds': input_ids,
+                                 'SegmentIds': seg_ids
+                             },
+                             attrs=attrs)
         else:
-            helper.append_op(
-                type='faster_tokenizer',
-                inputs={
-                    'Vocab': self.vocab,
-                    'Text': text,
-                    'TextPair': text_pair
-                },
-                outputs={'InputIds': input_ids,
-                         'SegmentIds': seg_ids},
-                attrs=attrs)
+            helper.append_op(type='faster_tokenizer',
+                             inputs={
+                                 'Vocab': self.vocab,
+                                 'Text': text,
+                                 'TextPair': text_pair
+                             },
+                             outputs={
+                                 'InputIds': input_ids,
+                                 'SegmentIds': seg_ids
+                             },
+                             attrs=attrs)
         return input_ids, seg_ids
 
 
 class Predictor(object):
+
     def __init__(self, model_dir):
         model_file = os.path.join(model_dir, "inference.pdmodel")
         params_file = os.path.join(model_dir, "inference.pdiparams")
@@ -148,6 +155,7 @@ def predict(self, data):
 
 
 class TestBertTokenizerOp(unittest.TestCase):
+
     def setUp(self):
         self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
         self.save_path = os.path.join(DATA_HOME, "fast_tokenizer")
@@ -199,12 +207,11 @@ def run_padding(self):
             pad_to_max_seq_len=self.pad_to_max_seq_len,
             is_split_into_words=self.is_split_into_words)
         py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
-        py_token_type_ids = np.array(encoded_inputs[0][
-            "token_type_ids"]).reshape([1, -1])
+        py_token_type_ids = np.array(
+            encoded_inputs[0]["token_type_ids"]).reshape([1, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
         # case 2: only one text and one text_pair (batch_size = 1)
         input_ids, token_type_ids = self.faster_tokenizer(
@@ -224,12 +231,11 @@ def run_padding(self):
             pad_to_max_seq_len=self.pad_to_max_seq_len,
             is_split_into_words=self.is_split_into_words)
         py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
-        py_token_type_ids = np.array(encoded_inputs[0][
-            "token_type_ids"]).reshape([1, -1])
+        py_token_type_ids = np.array(
+            encoded_inputs[0]["token_type_ids"]).reshape([1, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
         # case 3: only texts (batch_size = 3)
         input_ids, token_type_ids = self.faster_tokenizer(
@@ -252,8 +258,7 @@ def run_padding(self):
         py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
         # case 4: texts and text pairs (batch_size = 3)
         input_ids, token_type_ids = self.faster_tokenizer(
@@ -278,8 +283,7 @@ def run_padding(self):
         py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
     def test_padding(self):
         with _test_eager_guard():
@@ -308,12 +312,11 @@ def run_no_padding(self):
             pad_to_max_seq_len=self.pad_to_max_seq_len,
             is_split_into_words=self.is_split_into_words)
         py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
-        py_token_type_ids = np.array(encoded_inputs[0][
-            "token_type_ids"]).reshape([1, -1])
+        py_token_type_ids = np.array(
+            encoded_inputs[0]["token_type_ids"]).reshape([1, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
         # case 2: only one text and one text_pair (batch_size = 1)
         input_ids, token_type_ids = self.faster_tokenizer(
@@ -333,12 +336,11 @@ def run_no_padding(self):
             pad_to_max_seq_len=self.pad_to_max_seq_len,
             is_split_into_words=self.is_split_into_words)
         py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
-        py_token_type_ids = np.array(encoded_inputs[0][
-            "token_type_ids"]).reshape([1, -1])
+        py_token_type_ids = np.array(
+            encoded_inputs[0]["token_type_ids"]).reshape([1, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
     def test_no_padding(self):
         with _test_eager_guard():
@@ -362,8 +364,7 @@ def run_is_split_into_words(self):
             [1, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
     def test_is_split_into_words(self):
         with _test_eager_guard():
@@ -391,18 +392,18 @@ def test_inference(self):
 
         encoded_inputs = self.bert_tokenizer(self.text)
         py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
-        py_token_type_ids = np.array(encoded_inputs[0][
-            "token_type_ids"]).reshape([1, -1])
+        py_token_type_ids = np.array(
+            encoded_inputs[0]["token_type_ids"]).reshape([1, -1])
         self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
         self.assertTrue(
-            np.allclose(
-                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
+            np.allclose(token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
 
     def test_feed_string_var(self):
         self.init_data()
         paddle.enable_static()
-        x = paddle.static.data(
-            name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS)
+        x = paddle.static.data(name="x",
+                               shape=[-1],
+                               dtype=core.VarDesc.VarType.STRINGS)
         exe = paddle.static.Executor(paddle.framework.CPUPlace())
         exe.run(paddle.static.default_main_program(), feed={'x': self.text})
         paddle.disable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
index 22126ce41d05c..439296e4d8f84 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -43,6 +43,7 @@ def fc_refer(matrix, with_bias, with_relu=False):
 
 
 class MatrixGenerate:
+
     def __init__(self, mb, ic, oc, h, w, bias_dims=2):
         self.input = np.random.random((mb, ic, h, w)).astype("float32")
         self.weights = np.random.random((ic * h * w, oc)).astype("float32")
@@ -53,6 +54,7 @@ def __init__(self, mb, ic, oc, h, w, bias_dims=2):
 
 
 class TestFCOp(OpTest):
+
     def config(self):
         self.with_bias = True
         self.with_relu = True
@@ -86,6 +88,7 @@ def test_check_output(self):
 
 
 class TestFCOpNoBias1(TestFCOp):
+
     def config(self):
         self.with_bias = False
         self.with_relu = False
@@ -93,6 +96,7 @@ def config(self):
 
 
 class TestFCOpNoBias2(TestFCOp):
+
     def config(self):
         self.with_bias = False
         self.with_relu = False
@@ -100,6 +104,7 @@ def config(self):
 
 
 class TestFCOpNoBias4(TestFCOp):
+
     def config(self):
         self.with_bias = False
         self.with_relu = False
@@ -107,6 +112,7 @@ def config(self):
 
 
 class TestFCOpWithBias1(TestFCOp):
+
     def config(self):
         self.with_bias = True
         self.with_relu = False
@@ -114,6 +120,7 @@ def config(self):
 
 
 class TestFCOpWithBias2(TestFCOp):
+
     def config(self):
         self.with_bias = True
         self.with_relu = True
@@ -121,6 +128,7 @@ def config(self):
 
 
 class TestFCOpWithBias3(TestFCOp):
+
     def config(self):
         self.with_bias = True
         self.with_relu = True
@@ -128,6 +136,7 @@ def config(self):
 
 
 class TestFCOpWithPadding(TestFCOp):
+
     def config(self):
         self.with_bias = True
         self.with_relu = True
@@ -135,7 +144,9 @@ def config(self):
 
 
 class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase):
+
     def test_api(self):
+
         def run_program(num_flatten_dims):
             paddle.seed(SEED)
             np.random.seed(SEED)
@@ -144,18 +155,17 @@ def run_program(num_flatten_dims):
 
             with program_guard(main_program, startup_program):
                 input = np.random.random([2, 2, 25]).astype("float32")
-                x = fluid.layers.data(
-                    name="x",
-                    shape=[2, 2, 25],
-                    append_batch_size=False,
-                    dtype="float32")
+                x = fluid.layers.data(name="x",
+                                      shape=[2, 2, 25],
+                                      append_batch_size=False,
+                                      dtype="float32")
 
                 out = paddle.static.nn.fc(x=x,
                                           size=1,
                                           num_flatten_dims=num_flatten_dims)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
             exe = fluid.Executor(place=place)
             exe.run(startup_program)
             out = exe.run(main_program, feed={"x": input}, fetch_list=[out])
@@ -167,6 +177,7 @@ def run_program(num_flatten_dims):
 
 
 class TestFCOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.random((2, 4)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py
index 3bbc4cc2904b8..f3fe43e315212 100644
--- a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py
@@ -62,16 +62,15 @@ def _simple_fc_net(self, in_size, label_size, class_num, hidden_sizes):
 
         predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax')
         loss = fluid.layers.mean(
-            fluid.layers.cross_entropy(
-                input=predict_label, label=label))
+            fluid.layers.cross_entropy(input=predict_label, label=label))
 
         optimizer = fluid.optimizer.Adam()
         optimizer.minimize(loss)
         return in_data, label, loss
 
     def test(self):
-        for use_cuda in [True,
-                         False] if core.is_compiled_with_cuda() else [False]:
+        for use_cuda in [True, False
+                         ] if core.is_compiled_with_cuda() else [False]:
             for use_parallel_executor in [False, True]:
                 print('Test Parameters:'),
                 print({
@@ -85,7 +84,7 @@ def test(self):
                                                       use_parallel_executor)
                 self._test_feed_lod_tensor(use_cuda, use_parallel_executor)
 
-                # Test exception message when feeding with error 
+                # Test exception message when feeding with error
                 in_shape_tuple = (-1, 3, 4, 8)
                 error_shape_list = [self.data_batch_size, 3, 4, 5]
 
@@ -114,31 +113,34 @@ def _test_feed_data_dtype_mismatch(self, use_cuda, use_parallel_executor):
         feed_in_data = np.random.uniform(
             size=[feed_batch_size, 3, 4, 5]).astype(np.float32)
         label_size = [self.data_batch_size, 1]
-        feed_label = np.random.randint(
-            low=0, high=self.class_num,
-            size=[feed_batch_size, 1]).astype(np.float64)
+        feed_label = np.random.randint(low=0,
+                                       high=self.class_num,
+                                       size=[feed_batch_size,
+                                             1]).astype(np.float64)
         self._feed_data_in_executor(in_size, label_size, feed_in_data,
                                     feed_label, use_cuda, use_parallel_executor)
 
     def _test_feed_data_shape_mismatch(self, use_cuda, use_parallel_executor):
         batch_size = self._get_feed_batch_size(use_cuda, use_parallel_executor)
         in_size = [None, 3, 4, 8]
-        feed_in_data = np.random.uniform(
-            size=[batch_size, 3, 4, 5]).astype(np.float32)
+        feed_in_data = np.random.uniform(size=[batch_size, 3, 4, 5]).astype(
+            np.float32)
         label_size = [-1, 1]
-        feed_label = np.random.randint(
-            low=0, high=self.class_num, size=[batch_size, 1]).astype(np.int64)
+        feed_label = np.random.randint(low=0,
+                                       high=self.class_num,
+                                       size=[batch_size, 1]).astype(np.int64)
         self._feed_data_in_executor(in_size, label_size, feed_in_data,
                                     feed_label, use_cuda, use_parallel_executor)
 
     def _test_feed_data_contains_neg_one(self, use_cuda, use_parallel_executor):
         batch_size = self._get_feed_batch_size(use_cuda, use_parallel_executor)
         in_size = [-1, 3, 4, 5]
-        feed_in_data = np.random.uniform(
-            size=[batch_size, 3, 4, 5]).astype(np.float32)
+        feed_in_data = np.random.uniform(size=[batch_size, 3, 4, 5]).astype(
+            np.float32)
         label_size = (None, 1)
-        feed_label = np.random.randint(
-            low=0, high=self.class_num, size=[batch_size, 1]).astype(np.int64)
+        feed_label = np.random.randint(low=0,
+                                       high=self.class_num,
+                                       size=[batch_size, 1]).astype(np.int64)
         self._feed_data_in_executor(in_size, label_size, feed_in_data,
                                     feed_label, use_cuda, use_parallel_executor)
 
@@ -149,9 +151,10 @@ def _test_feed_data_match_shape_type(self, use_cuda, use_parallel_executor):
         feed_in_data = np.random.uniform(
             size=[feed_batch_size, 3, 4, 5]).astype(np.float32)
         label_size = [self.data_batch_size, 1]
-        feed_label = np.random.randint(
-            low=0, high=self.class_num,
-            size=[feed_batch_size, 1]).astype(np.int64)
+        feed_label = np.random.randint(low=0,
+                                       high=self.class_num,
+                                       size=[feed_batch_size,
+                                             1]).astype(np.int64)
         self._feed_data_in_executor(in_size, label_size, feed_in_data,
                                     feed_label, use_cuda, use_parallel_executor)
 
@@ -163,16 +166,17 @@ def _test_feed_lod_tensor(self, use_cuda, use_parallel_executor):
         # sum from 1 to device_count
         sum_length = int((device_count + 1) * device_count / 2)
 
-        feed_in_data = np.random.uniform(
-            size=[sum_length, 3, 4, 5]).astype(np.float32)
+        feed_in_data = np.random.uniform(size=[sum_length, 3, 4, 5]).astype(
+            np.float32)
         feed_data_tensor = fluid.LoDTensor()
         feed_data_tensor.set(feed_in_data, fluid.CPUPlace())
         feed_data_tensor.set_recursive_sequence_lengths(sequence_lengths)
 
         label_size = [device_count, 1]
         feed_label_tensor = fluid.LoDTensor()
-        feed_label = np.random.randint(
-            low=0, high=self.class_num, size=[sum_length, 1]).astype(np.int64)
+        feed_label = np.random.randint(low=0,
+                                       high=self.class_num,
+                                       size=[sum_length, 1]).astype(np.int64)
         feed_label_tensor.set(feed_label, fluid.CPUPlace())
         feed_label_tensor.set_recursive_sequence_lengths(sequence_lengths)
 
@@ -187,8 +191,9 @@ def _feed_data_in_executor(self, in_size, label_size, feed_in_data,
         main_program = fluid.Program()
 
         with fluid.program_guard(main_program, startup_program):
-            in_data, label, loss = self._simple_fc_net(
-                in_size, label_size, self.class_num, self.hidden_sizes)
+            in_data, label, loss = self._simple_fc_net(in_size, label_size,
+                                                       self.class_num,
+                                                       self.hidden_sizes)
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
@@ -201,11 +206,12 @@ def _feed_data_in_executor(self, in_size, label_size, feed_in_data,
                 main_program).with_data_parallel(loss_name=loss.name)
 
         for i in range(self.iterations):
-            fetches = exe.run(
-                train_program,
-                feed={in_data.name: feed_in_data,
-                      label.name: feed_label},
-                fetch_list=[loss.name])
+            fetches = exe.run(train_program,
+                              feed={
+                                  in_data.name: feed_in_data,
+                                  label.name: feed_label
+                              },
+                              fetch_list=[loss.name])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
index d1842001379ec..282f9bbb6f889 100644
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
@@ -20,6 +20,7 @@
 
 
 class TestFeedFetch(unittest.TestCase):
+
     def test_feed_fetch(self):
         scope = core.Scope()
         place = core.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_handler.py b/python/paddle/fluid/tests/unittests/test_fetch_handler.py
index de9e456f68cd5..f5e1b3c687a0b 100644
--- a/python/paddle/fluid/tests/unittests/test_fetch_handler.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_handler.py
@@ -24,6 +24,7 @@
 
 
 class TestFetchHandler(unittest.TestCase):
+
     @unittest.skip(reason="Skip unstable ci")
     def test_fetch_handler(self):
         place = core.CPUPlace()
@@ -37,6 +38,7 @@ def test_fetch_handler(self):
         var_emb3 = block.create_var(name='emb3', type=core.VarDesc.VarType.FP32)
 
         class FH(fluid.executor.FetchHandler):
+
             def handler(self, fetch_dict):
                 assert len(fetch_dict) == 1
 
@@ -49,13 +51,14 @@ def handler(self, fetch_dict):
         time.sleep(3)
         fm.stop()
 
-        default_fh = fluid.executor.FetchHandler(
-            var_dict={'emb': var_emb,
-                      'emb2': None,
-                      'emb3': var_emb3},
-            period_secs=1)
-        default_fm = fluid.trainer_factory.FetchHandlerMonitor(scope,
-                                                               default_fh)
+        default_fh = fluid.executor.FetchHandler(var_dict={
+            'emb': var_emb,
+            'emb2': None,
+            'emb3': var_emb3
+        },
+                                                 period_secs=1)
+        default_fm = fluid.trainer_factory.FetchHandlerMonitor(
+            scope, default_fh)
         default_fm.start()
         time.sleep(5)
         default_fm.stop()
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
index 50ad2a4087afe..ee168cc36c18f 100644
--- a/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
@@ -23,6 +23,7 @@
 
 
 class TestFetchLoDTensorArray(unittest.TestCase):
+
     def build_program(self, main_program, startup_program):
         with fluid.unique_name.guard():
             with fluid.program_guard(main_program, startup_program):
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py
index 37d269e3369bf..2e48157f950f8 100644
--- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py
@@ -24,24 +24,23 @@
 
 
 class TestFetchUnmerged(unittest.TestCase):
+
     def conv_net(self, img, label):
-        conv_pool_1 = fluid.nets.simple_img_conv_pool(
-            input=img,
-            filter_size=5,
-            num_filters=8,
-            pool_size=2,
-            pool_stride=2,
-            pool_type='max',
-            act="relu")
+        conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                      filter_size=5,
+                                                      num_filters=8,
+                                                      pool_size=2,
+                                                      pool_stride=2,
+                                                      pool_type='max',
+                                                      act="relu")
         conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-        conv_pool_2 = fluid.nets.simple_img_conv_pool(
-            input=conv_pool_1,
-            filter_size=5,
-            num_filters=16,
-            pool_size=2,
-            pool_stride=2,
-            pool_type='avg',
-            act="relu")
+        conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                      filter_size=5,
+                                                      num_filters=16,
+                                                      pool_size=2,
+                                                      pool_stride=2,
+                                                      pool_type='avg',
+                                                      act="relu")
         hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu')
         prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
         loss = fluid.layers.cross_entropy(input=prediction, label=label)
@@ -51,10 +50,12 @@ def conv_net(self, img, label):
     def build_program(self, main, startup, is_test):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, startup):
-                img = fluid.layers.data(
-                    name='image', shape=[1, 28, 28], dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=[1], dtype='int64')
+                img = fluid.layers.data(name='image',
+                                        shape=[1, 28, 28],
+                                        dtype='float32')
+                label = fluid.layers.data(name='label',
+                                          shape=[1],
+                                          dtype='int64')
                 loss, prediction = self.conv_net(img, label)
                 if not is_test:
                     opt = fluid.optimizer.Adam(learning_rate=0.001)
@@ -77,10 +78,9 @@ def fetch_unmerged(self, use_cuda=True):
 
         iters = 2
         batch_size = 16
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=500),
-            batch_size=batch_size)
+        train_reader = paddle.batch(paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+                                    batch_size=batch_size)
         feeder = fluid.DataFeeder(feed_list=feeds, place=place)
 
         device_num = fluid.core.get_cuda_device_count() if use_cuda else 2
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py
index d78b27566ebc3..2a0d29be47dad 100644
--- a/python/paddle/fluid/tests/unittests/test_fetch_var.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py
@@ -22,6 +22,7 @@
 
 
 class TestFetchVar(unittest.TestCase):
+
     def set_input(self):
         self.val = numpy.array([1, 3, 5]).astype(numpy.int32)
 
@@ -32,13 +33,13 @@ def test_fetch_var(self):
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
         fetched_x = fluid.executor._fetch_var("x")
-        self.assertTrue(
-            numpy.array_equal(fetched_x, self.val),
-            "fetch_x=%s val=%s" % (fetched_x, self.val))
+        self.assertTrue(numpy.array_equal(fetched_x, self.val),
+                        "fetch_x=%s val=%s" % (fetched_x, self.val))
         self.assertEqual(fetched_x.dtype, self.val.dtype)
 
 
 class TestFetchNullVar(TestFetchVar):
+
     def set_input(self):
         self.val = numpy.array([]).astype(numpy.int32)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
index 95537d4332739..1e7d097134925 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
@@ -25,6 +25,7 @@
 
 
 class TestFillAnyLikeOp(OpTest):
+
     def setUp(self):
         self.op_type = "fill_any_like"
         self.dtype = np.int32
@@ -42,6 +43,7 @@ def test_check_output(self):
 
 
 class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp):
+
     def init(self):
         self.dtype = np.float32
         self.value = 0.0
@@ -50,6 +52,7 @@ def init(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFillAnyLikeOpBfloat16(OpTest):
+
     def setUp(self):
         self.op_type = "fill_any_like"
         self.dtype = np.uint16
@@ -67,21 +70,25 @@ def test_check_output(self):
 
 
 class TestFillAnyLikeOpValue1(TestFillAnyLikeOp):
+
     def init(self):
         self.value = 1.0
 
 
 class TestFillAnyLikeOpValue2(TestFillAnyLikeOp):
+
     def init(self):
         self.value = 1e-10
 
 
 class TestFillAnyLikeOpValue3(TestFillAnyLikeOp):
+
     def init(self):
         self.value = 1e-100
 
 
 class TestFillAnyLikeOpType(TestFillAnyLikeOp):
+
     def setUp(self):
         self.op_type = "fill_any_like"
         self.dtype = np.int32
@@ -99,6 +106,7 @@ def setUp(self):
 
 
 class TestFillAnyLikeOpFloat16(TestFillAnyLikeOp):
+
     def init(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_op.py
index 2066084753631..1262c28edda84 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_any_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_any_op.py
@@ -22,6 +22,7 @@
 
 
 class TestFillAnyOp(OpTest):
+
     def setUp(self):
         self.op_type = "fill_any"
         self.dtype = 'float64'
@@ -48,23 +49,27 @@ def test_check_grad(self):
 
 
 class TestFillAnyOpFloat32(TestFillAnyOp):
+
     def init(self):
         self.dtype = np.float32
         self.value = 0.0
 
 
 class TestFillAnyOpFloat16(TestFillAnyOp):
+
     def init(self):
         self.dtype = np.float16
 
 
 class TestFillAnyOpvalue1(TestFillAnyOp):
+
     def init(self):
         self.dtype = np.float32
         self.value = 111111555
 
 
 class TestFillAnyOpvalue2(TestFillAnyOp):
+
     def init(self):
         self.dtype = np.float32
         self.value = 11111.1111
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 15071b2b6aa69..bd87181ebcc91 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -28,6 +28,7 @@
 
 # Situation 1: Attr(shape) is a list(without tensor)
 class TestFillConstantOp1(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -42,6 +43,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp2(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with default value
         '''
@@ -56,6 +58,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp3(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified int64 value
         '''
@@ -70,6 +73,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp4(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified int value
         '''
@@ -86,6 +90,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFillConstantBF16Op(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -105,14 +110,17 @@ def test_check_output(self):
 
 
 class TestFillConstantOpWithSelectedRows(unittest.TestCase):
+
     def check_with_place(self, place):
         scope = core.Scope()
         # create Out Variable
         out = scope.var('Out').get_selected_rows()
 
         # create and run fill_constant_op operator
-        fill_constant_op = Operator(
-            "fill_constant", shape=[123, 92], value=3.8, Out='Out')
+        fill_constant_op = Operator("fill_constant",
+                                    shape=[123, 92],
+                                    value=3.8,
+                                    Out='Out')
         fill_constant_op.run(scope, place)
 
         # get result from Out
@@ -132,6 +140,7 @@ def test_fill_constant_with_selected_rows(self):
 
 # Situation 2: Attr(shape) is a list(with tensor)
 class TestFillConstantOp1_ShapeTensorList(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -156,6 +165,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp2_ShapeTensorList(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with default value
         '''
@@ -179,6 +189,7 @@ def test_check_output(self):
 
 
 class TestFillConstantOp3_ShapeTensorList(TestFillConstantOp1_ShapeTensorList):
+
     def init_data(self):
         self.shape = [123, 92]
         self.infer_shape = [123, -1]
@@ -186,6 +197,7 @@ def init_data(self):
 
 
 class TestFillConstantOp4_ShapeTensorList(TestFillConstantOp1_ShapeTensorList):
+
     def init_data(self):
         self.shape = [123, 92]
         self.infer_shape = [123, -1]
@@ -194,6 +206,7 @@ def init_data(self):
 
 # Situation 3: shape is a tensor
 class TestFillConstantOp1_ShapeTensor(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -214,6 +227,7 @@ def test_check_output(self):
 
 # Situation 4: value is a tensor
 class TestFillConstantOp1_ValueTensor(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -238,6 +252,7 @@ def test_check_output(self):
 
 # Situation 5: value is a tensor
 class TestFillConstantOp2_ValueTensor(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -262,43 +277,56 @@ def test_check_output(self):
 
 # Test python API
 class TestFillConstantAPI(unittest.TestCase):
+
     def test_api(self):
 
         positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2)
         positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2)
 
-        shape_tensor_int32 = fluid.data(
-            name="shape_tensor_int32", shape=[2], dtype="int32")
-        shape_tensor_int64 = fluid.data(
-            name="shape_tensor_int64", shape=[2], dtype="int64")
-
-        out_1 = fluid.layers.fill_constant(
-            shape=[1, 2], dtype="float32", value=1.1)
-
-        out_2 = fluid.layers.fill_constant(
-            shape=[1, positive_2_int32], dtype="float32", value=1.1)
-
-        out_3 = fluid.layers.fill_constant(
-            shape=[1, positive_2_int64], dtype="float32", value=1.1)
-
-        out_4 = fluid.layers.fill_constant(
-            shape=shape_tensor_int32, dtype="float32", value=1.1)
-
-        out_5 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype="float32", value=1.1)
-
-        out_6 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=1.1)
-
-        val1 = fluid.layers.fill_constant(
-            shape=[1], dtype=np.float32, value=1.1)
-        val2 = fluid.layers.fill_constant(
-            shape=[1], dtype=np.float64, value=1.1)
-        out_7 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=val1)
-
-        out_8 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=val2)
+        shape_tensor_int32 = fluid.data(name="shape_tensor_int32",
+                                        shape=[2],
+                                        dtype="int32")
+        shape_tensor_int64 = fluid.data(name="shape_tensor_int64",
+                                        shape=[2],
+                                        dtype="int64")
+
+        out_1 = fluid.layers.fill_constant(shape=[1, 2],
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_2 = fluid.layers.fill_constant(shape=[1, positive_2_int32],
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_3 = fluid.layers.fill_constant(shape=[1, positive_2_int64],
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_4 = fluid.layers.fill_constant(shape=shape_tensor_int32,
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_5 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype="float32",
+                                           value=1.1)
+
+        out_6 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype=np.float32,
+                                           value=1.1)
+
+        val1 = fluid.layers.fill_constant(shape=[1],
+                                          dtype=np.float32,
+                                          value=1.1)
+        val2 = fluid.layers.fill_constant(shape=[1],
+                                          dtype=np.float64,
+                                          value=1.1)
+        out_7 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype=np.float32,
+                                           value=val1)
+
+        out_8 = fluid.layers.fill_constant(shape=shape_tensor_int64,
+                                           dtype=np.float32,
+                                           value=val2)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run(
@@ -307,9 +335,7 @@ def test_api(self):
                 "shape_tensor_int32": np.array([1, 2]).astype("int32"),
                 "shape_tensor_int64": np.array([1, 2]).astype("int64"),
             },
-            fetch_list=[
-                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
-            ])
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8])
 
         assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32"))
@@ -322,6 +348,7 @@ def test_api(self):
 
 
 class TestFillConstantImperative(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             data1 = np.array([1, 2]).astype('int32')
@@ -330,26 +357,26 @@ def test_api(self):
             shape = fluid.dygraph.to_variable(data1)
             val = fluid.dygraph.to_variable(data2)
             value = fluid.dygraph.to_variable(data3)
-            res1 = fluid.layers.fill_constant(
-                shape=[1, 2], dtype='float32', value=1.1)
-            res2 = fluid.layers.fill_constant(
-                shape=shape, dtype='float32', value=1.1)
-            res3 = fluid.layers.fill_constant(
-                shape=shape, dtype='float32', value=val)
-            res4 = fluid.layers.fill_constant(
-                shape=shape, dtype='int32', value=value)
-            assert np.array_equal(
-                res1.numpy(), np.full(
-                    [1, 2], 1.1, dtype="float32"))
-            assert np.array_equal(
-                res2.numpy(), np.full(
-                    [1, 2], 1.1, dtype="float32"))
-            assert np.array_equal(
-                res3.numpy(), np.full(
-                    [1, 2], 1.1, dtype="float32"))
-            assert np.array_equal(
-                res4.numpy(), np.full(
-                    [1, 2], 88, dtype="int32"))
+            res1 = fluid.layers.fill_constant(shape=[1, 2],
+                                              dtype='float32',
+                                              value=1.1)
+            res2 = fluid.layers.fill_constant(shape=shape,
+                                              dtype='float32',
+                                              value=1.1)
+            res3 = fluid.layers.fill_constant(shape=shape,
+                                              dtype='float32',
+                                              value=val)
+            res4 = fluid.layers.fill_constant(shape=shape,
+                                              dtype='int32',
+                                              value=value)
+            assert np.array_equal(res1.numpy(),
+                                  np.full([1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(res2.numpy(),
+                                  np.full([1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(res3.numpy(),
+                                  np.full([1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(res4.numpy(),
+                                  np.full([1, 2], 88, dtype="int32"))
 
     def test_nan(self):
         with fluid.dygraph.guard():
@@ -369,45 +396,42 @@ def test_ninf(self):
 
 
 class TestFillConstantOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             #for ci coverage
             x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16")
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[1],
-                value=5,
-                dtype='uint4')
-
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[1.1],
-                value=5,
-                dtype='float32',
-                out=x1)
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[1],
+                              value=5,
+                              dtype='uint4')
+
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[1.1],
+                              value=5,
+                              dtype='float32',
+                              out=x1)
 
             # The argument dtype of fill_constant_op must be one of bool, float16,
             #float32, float64, uint8, int16, int32 or int64
             x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32")
 
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[1],
-                value=5,
-                dtype='float64',
-                out=x2)
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[1],
+                              value=5,
+                              dtype='float64',
+                              out=x2)
 
             x3 = np.random.randn(100, 100).astype('int32')
-            self.assertRaises(
-                TypeError,
-                fluid.layers.fill_constant,
-                shape=[100, 100],
-                value=5,
-                dtype='float64',
-                out=x3)
+            self.assertRaises(TypeError,
+                              fluid.layers.fill_constant,
+                              shape=[100, 100],
+                              value=5,
+                              dtype='float64',
+                              out=x3)
 
             # The argument shape's type of fill_constant_op must be list, tuple or Variable.
             def test_shape_type():
@@ -423,23 +447,28 @@ def test_shape_size():
 
             # The shape dtype of fill_constant_op must be int32 or int64.
             def test_shape_tensor_dtype():
-                shape = fluid.data(
-                    name="shape_tensor", shape=[2], dtype="float32")
-                fluid.layers.fill_constant(
-                    shape=shape, dtype="float32", value=1)
+                shape = fluid.data(name="shape_tensor",
+                                   shape=[2],
+                                   dtype="float32")
+                fluid.layers.fill_constant(shape=shape,
+                                           dtype="float32",
+                                           value=1)
 
             self.assertRaises(TypeError, test_shape_tensor_dtype)
 
             def test_shape_tensor_list_dtype():
-                shape = fluid.data(
-                    name="shape_tensor_list", shape=[1], dtype="bool")
-                fluid.layers.fill_constant(
-                    shape=[shape, 2], dtype="float32", value=1)
+                shape = fluid.data(name="shape_tensor_list",
+                                   shape=[1],
+                                   dtype="bool")
+                fluid.layers.fill_constant(shape=[shape, 2],
+                                           dtype="float32",
+                                           value=1)
 
             self.assertRaises(TypeError, test_shape_tensor_list_dtype)
 
 
 class TestFillConstantOp_ValueTensorBf16(OpTest):
+
     def setUp(self):
         '''Test fill_constant op with specified value
         '''
@@ -447,7 +476,8 @@ def setUp(self):
         self.init_data()
 
         self.inputs = {
-            "ShapeTensor": np.array(self.shape).astype("int32"),
+            "ShapeTensor":
+            np.array(self.shape).astype("int32"),
             'ValueTensor':
             convert_float_to_uint16(np.array([self.value]).astype("float32"))
         }
diff --git a/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py b/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py
index 8ac7a9586cb42..c1a187d7bbaaf 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py
@@ -83,6 +83,7 @@ def fill_gt(x, y, offset, dim1, dim2):
 
 
 class TensorFillDiagTensor_Test(OpTest):
+
     def setUp(self):
         self.op_type = "fill_diagonal_tensor"
         self.init_kernel_type()
@@ -108,6 +109,7 @@ def test_check_grad(self):
 
 
 class TensorFillDiagTensor_Test2(TensorFillDiagTensor_Test):
+
     def setUp(self):
         self.op_type = "fill_diagonal_tensor"
         self.init_kernel_type()
@@ -127,6 +129,7 @@ def init_kernel_type(self):
 
 
 class TensorFillDiagTensor_Test3(TensorFillDiagTensor_Test):
+
     def setUp(self):
         self.op_type = "fill_diagonal_tensor"
         self.init_kernel_type()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_op.py b/python/paddle/fluid/tests/unittests/test_fill_op.py
index 7c8587dc40020..fdf4ec85627d5 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_op.py
@@ -22,6 +22,7 @@
 
 
 class TestFillOp1(OpTest):
+
     def setUp(self):
         self.op_type = "fill"
         val = np.random.random(size=[100, 200])
@@ -39,6 +40,7 @@ def test_check_output(self):
 
 
 class TestFillOp2(OpTest):
+
     def setUp(self):
         self.op_type = "fill"
         val = np.random.random(size=[100, 200])
@@ -56,6 +58,7 @@ def test_check_output(self):
 
 
 class TestFillOp3(unittest.TestCase):
+
     def check_with_place(self, place, f_cpu):
         scope = core.Scope()
         # create Out Variable
@@ -63,13 +66,12 @@ def check_with_place(self, place, f_cpu):
 
         # create and run fill_op operator
         val = np.random.random(size=[300, 200])
-        fill_op = Operator(
-            "fill",
-            value=val.flatten(),
-            shape=[300, 200],
-            dtype=int(core.VarDesc.VarType.FP32),
-            force_cpu=f_cpu,
-            Out='Out')
+        fill_op = Operator("fill",
+                           value=val.flatten(),
+                           shape=[300, 200],
+                           dtype=int(core.VarDesc.VarType.FP32),
+                           force_cpu=f_cpu,
+                           Out='Out')
         fill_op.run(scope, place)
 
         # get result from Out
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py
index 46590bf187a86..1371f202cb6b6 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py
@@ -22,6 +22,7 @@
 
 
 class TestFillZerosLike2Op(OpTest):
+
     def setUp(self):
         self.op_type = "fill_zeros_like2"
         self.dtype = np.float32
@@ -38,17 +39,21 @@ def test_check_output(self):
 
 
 class TestFillZerosLike2OpFp16(TestFillZerosLike2Op):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestFillZerosLike2OpFp64(TestFillZerosLike2Op):
+
     def init_dtype(self):
         self.dtype = np.float64
 
 
 class TestZerosError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_zeros_like_type_error():
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 fluid.layers.zeros_like([10], dtype="float")
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
index 20f1a110c35d6..1b23078e7d455 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
@@ -20,6 +20,7 @@
 
 
 class TestFillZerosLikeOp(OpTest):
+
     def setUp(self):
         self.op_type = "fill_zeros_like"
         self.dtype = np.float32
@@ -35,6 +36,7 @@ def test_check_output(self):
 
 
 class TestFillZerosLikeOpFp16(TestFillZerosLikeOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
diff --git a/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py b/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py
index ecd2e2cd6c3cf..32aa5c15997fe 100644
--- a/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py
+++ b/python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py
@@ -28,6 +28,7 @@
 
 
 class TestFilterByInstagOp(OpTest):
+
     def setUp(self):
         self.op_type = 'filter_by_instag'
         x1 = np.zeros((36, 4), dtype=np.float64)
@@ -55,8 +56,8 @@ def setUp(self):
                     out[ln, k] = cur
                 ln += 1
 
-        mmap = np.array(
-            [[0, 1, 2], [2, 6, 4], [6, 15, 6], [12, 28, 8]]).astype('int64')
+        mmap = np.array([[0, 1, 2], [2, 6, 4], [6, 15, 6], [12, 28,
+                                                            8]]).astype('int64')
         mmap_lod = [[1, 1, 1, 1]]
 
         loss_weight = np.array([[1], [1], [1], [1]]).astype('double')
@@ -78,14 +79,16 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
+        self.check_grad(['Ins'],
+                        'Out',
+                        no_grad_set=set(['Ins_tag', 'Filter_tag']))
 
 
 """This is Test Case 2"""
 
 
 class TestFilterByInstagOp2(OpTest):
+
     def setUp(self):
         self.op_type = 'filter_by_instag'
 
@@ -123,14 +126,16 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
+        self.check_grad(['Ins'],
+                        'Out',
+                        no_grad_set=set(['Ins_tag', 'Filter_tag']))
 
 
 """This is Test Case 3"""
 
 
 class TestFilterByInstagOp3(OpTest):
+
     def setUp(self):
         self.op_type = 'filter_by_instag'
 
@@ -165,14 +170,16 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
+        self.check_grad(['Ins'],
+                        'Out',
+                        no_grad_set=set(['Ins_tag', 'Filter_tag']))
 
 
 """This is Test Case 4"""
 
 
 class TestFilterByInstagOp4(OpTest):
+
     def setUp(self):
         self.op_type = 'filter_by_instag'
 
@@ -206,11 +213,13 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Ins'], 'Out', no_grad_set=set(['Ins_tag', 'Filter_tag']))
+        self.check_grad(['Ins'],
+                        'Out',
+                        no_grad_set=set(['Ins_tag', 'Filter_tag']))
 
 
 class TestFilterByInstagOp6(OpTest):
+
     def setUp(self):
         self.op_type = 'filter_by_instag'
 
@@ -248,6 +257,7 @@ def test_check_grad(self):
 
 
 class TestFilterByInstagOp7(OpTest):
+
     def setUp(self):
         self.op_type = 'filter_by_instag'
 
diff --git a/python/paddle/fluid/tests/unittests/test_flatten2_op.py b/python/paddle/fluid/tests/unittests/test_flatten2_op.py
index 42b43cc46a69b..b0e821c269379 100644
--- a/python/paddle/fluid/tests/unittests/test_flatten2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten2_op.py
@@ -22,6 +22,7 @@
 
 
 class TestFlattenOp(OpTest):
+
     def setUp(self):
         self.op_type = "flatten2"
         self.init_test_case()
@@ -48,6 +49,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.axis = 0
@@ -55,6 +57,7 @@ def init_test_case(self):
 
 
 class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (10, 2, 2, 3)
         self.new_shape = (10, 12)
@@ -64,6 +67,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
@@ -71,6 +75,7 @@ def init_test_case(self):
 
 
 class TestStaticFlattenInferShapePythonAPI(unittest.TestCase):
+
     def execute_api(self, x, axis=1):
         return fluid.layers.flatten(x, axis=axis)
 
@@ -78,13 +83,15 @@ def test_static_api(self):
         paddle.enable_static()
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=[-1, 3, -1, -1], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[-1, 3, -1, -1],
+                                   dtype='float32')
             out = self.execute_api(x, axis=2)
         self.assertTrue((-1, -1) == out.shape)
 
 
 class TestFlatten2OpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input_data = np.random.random((3, 2, 4, 5)).astype("float64")
@@ -97,8 +104,9 @@ def test_Variable():
 
         def test_type():
             # dtype must be float32, float64, int8, int32, int64, uint8.
-            x2 = fluid.layers.data(
-                name='x2', shape=[3, 2, 4, 5], dtype='float16')
+            x2 = fluid.layers.data(name='x2',
+                                   shape=[3, 2, 4, 5],
+                                   dtype='float16')
             fluid.layers.flatten(x2, axis=1)
 
         self.assertRaises(TypeError, test_type)
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
index ac352fcdf87ea..bcb9a99e7d80c 100644
--- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
@@ -22,6 +22,7 @@
 
 
 class TestFlattenOp(OpTest):
+
     def setUp(self):
         self.python_api = paddle.flatten
         self.python_out_sig = ["Out"]
@@ -56,6 +57,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 1
@@ -70,6 +72,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_2(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -84,6 +87,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_3(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -98,6 +102,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_4(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = -2
@@ -112,6 +117,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_5(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 2
@@ -126,6 +132,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.start_axis = 3
@@ -140,6 +147,7 @@ def init_attrs(self):
 
 
 class TestFlatten2OpError(unittest.TestCase):
+
     def test_errors(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
@@ -147,22 +155,25 @@ def test_errors(self):
         x = x.astype('float32')
 
         def test_ValueError1():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             out = paddle.flatten(x_var, start_axis=2, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError1)
 
         def test_ValueError2():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=10, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError2)
 
         def test_ValueError3():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=2, stop_axis=10)
 
         self.assertRaises(ValueError, test_ValueError3)
@@ -172,8 +183,9 @@ def test_type():
             x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
                            image_shape[3]).reshape(image_shape) / 100.
             x2 = x2.astype('float16')
-            x2_var = paddle.fluid.data(
-                name='x2', shape=[3, 2, 4, 5], dtype='float16')
+            x2_var = paddle.fluid.data(name='x2',
+                                       shape=[3, 2, 4, 5],
+                                       dtype='float16')
             paddle.flatten(x2_var)
 
         self.assertRaises(TypeError, test_type)
@@ -185,6 +197,7 @@ def test_InputError():
 
 
 class TestStaticFlattenPythonAPI(unittest.TestCase):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return paddle.flatten(x, start_axis, stop_axis)
 
@@ -194,8 +207,9 @@ def test_static_api(self):
 
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=[2, 3, 4, 4], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[2, 3, 4, 4],
+                                   dtype='float32')
             out = self.execute_api(x, start_axis=-2, stop_axis=-1)
 
         exe = paddle.static.Executor(place=paddle.CPUPlace())
@@ -204,6 +218,7 @@ def test_static_api(self):
 
 
 class TestStaticFlattenInferShapePythonAPI(unittest.TestCase):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return paddle.flatten(x, start_axis, stop_axis)
 
@@ -211,18 +226,21 @@ def test_static_api(self):
         paddle.enable_static()
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=[-1, 3, -1, -1], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[-1, 3, -1, -1],
+                                   dtype='float32')
             out = self.execute_api(x, start_axis=2, stop_axis=3)
         self.assertTrue((-1, 3, -1) == out.shape)
 
 
 class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return x.flatten_(start_axis, stop_axis)
 
 
 class TestFlattenPython(unittest.TestCase):
+
     def test_python_api(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
@@ -245,6 +263,7 @@ def test_Negative():
 
 
 class TestDygraphInplaceFlattenPython(unittest.TestCase):
+
     def test_python_api(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py
index a5b24debaee7f..91e2ba89dc0e9 100644
--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -21,6 +21,7 @@
 
 
 class TestFlattenOp(OpTest):
+
     def setUp(self):
         self.op_type = "flatten"
         self.init_test_case()
@@ -44,6 +45,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 2, 10)
         self.axis = 0
@@ -51,6 +53,7 @@ def init_test_case(self):
 
 
 class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (10, 2, 2, 3)
         self.new_shape = (10, 12)
@@ -60,6 +63,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
index a9a6b9c0660b4..a7df64c1d9211 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
@@ -26,8 +26,7 @@
 def gen_data():
     return {
         "x": np.random.random(size=(128, 32)).astype('float32'),
-        "y": np.random.randint(
-            2, size=(128, 1)).astype('int64')
+        "y": np.random.randint(2, size=(128, 1)).astype('int64')
     }
 
 
@@ -43,6 +42,7 @@ def mlp(input_x, input_y, hid_dim=128, label_dim=2):
 
 
 class TestFleetAMPInit(unittest.TestCase):
+
     def test_fleet_amp_init(self):
         if not fluid.core.is_compiled_with_cuda():
             return
@@ -54,10 +54,12 @@ def test_fleet_amp_init(self):
         fleet.init(role)
 
         with paddle.static.program_guard(main_program, startup_program):
-            input_x = paddle.static.data(
-                name="x", shape=[None, 32], dtype='float32')
-            input_y = paddle.static.data(
-                name="y", shape=[None, 1], dtype='int64')
+            input_x = paddle.static.data(name="x",
+                                         shape=[None, 32],
+                                         dtype='float32')
+            input_y = paddle.static.data(name="y",
+                                         shape=[None, 1],
+                                         dtype='int64')
 
             cost = mlp(input_x, input_y)
             optimizer = paddle.optimizer.Momentum(
@@ -95,10 +97,12 @@ def test_fleet_amp_meta_optimizer_init(self):
         fleet.init(role)
 
         with paddle.static.program_guard(main_program, startup_program):
-            input_x = paddle.static.data(
-                name="x", shape=[None, 32], dtype='float32')
-            input_y = paddle.static.data(
-                name="y", shape=[None, 1], dtype='int64')
+            input_x = paddle.static.data(name="x",
+                                         shape=[None, 32],
+                                         dtype='float32')
+            input_y = paddle.static.data(name="y",
+                                         shape=[None, 1],
+                                         dtype='int64')
 
             cost = mlp(input_x, input_y)
             optimizer = paddle.optimizer.Momentum(
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index 982ec4eb5c7a0..6b05e63482b5d 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -25,13 +25,14 @@
 
 
 class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
+
     def test_amp_optimizer_backward(self):
         """ test amp optimizer backward """
         train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = AMPOptimizer(opt)
 
         self.set_strategy(strategy, 'amp')
@@ -48,8 +49,8 @@ def test_amp_optimizer_backward_gradients(self):
         train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = AMPOptimizer(opt)
 
         self.set_strategy(strategy, 'amp')
@@ -68,8 +69,8 @@ def test_amp_optimizer_backward_optimize(self):
         train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = AMPOptimizer(opt)
 
         self.set_strategy(strategy, 'amp')
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py
index 9ca2b7c567c24..139ce121ad587 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py
@@ -28,6 +28,7 @@
 
 
 class DistributeTranspilerConfigTest(unittest.TestCase):
+
     def set_runtime_split_send_recv(self, config, value):
         config.runtime_split_send_recv = value
 
@@ -48,6 +49,7 @@ def testConfig(self):
 
 
 class FleetTest(unittest.TestCase):
+
     def testInvalidInputs(self):
         self.assertRaises(Exception, fleet.split_files, "files")
         self.assertRaises(Exception, fleet.init, "pserver")
@@ -60,43 +62,40 @@ def testInvalidInputs(self):
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         pe = fluid.ParallelExecutor(use_cuda=False, loss_name=loss.name)
-        self.assertRaises(
-            Exception,
-            fleet.save_inference_model,
-            dirname='/tmp/',
-            feeded_var_names=['X'],
-            target_vars=[loss],
-            executor=pe)
-        self.assertRaises(
-            Exception,
-            fleet.save_inference_model,
-            dirname='/tmp/',
-            feeded_var_names=['X'],
-            target_vars=[loss],
-            executor="executor")
+        self.assertRaises(Exception,
+                          fleet.save_inference_model,
+                          dirname='/tmp/',
+                          feeded_var_names=['X'],
+                          target_vars=[loss],
+                          executor=pe)
+        self.assertRaises(Exception,
+                          fleet.save_inference_model,
+                          dirname='/tmp/',
+                          feeded_var_names=['X'],
+                          target_vars=[loss],
+                          executor="executor")
         compiled_prog = fluid.compiler.CompiledProgram(
             fluid.default_main_program())
-        self.assertRaises(
-            Exception,
-            fleet.save_inference_model,
-            dirname='/tmp/',
-            feeded_var_names=['X'],
-            target_vars=[loss],
-            executor=exe,
-            main_program=compiled_prog)
-        self.assertRaises(
-            Exception, fleet.save_persistables, executor=pe, dirname='/tmp/')
-        self.assertRaises(
-            Exception,
-            fleet.save_persistables,
-            executor="executor",
-            dirname='/tmp/')
-        self.assertRaises(
-            Exception,
-            fleet.save_persistables,
-            executor=exe,
-            dirname='/tmp/',
-            main_program=compiled_prog)
+        self.assertRaises(Exception,
+                          fleet.save_inference_model,
+                          dirname='/tmp/',
+                          feeded_var_names=['X'],
+                          target_vars=[loss],
+                          executor=exe,
+                          main_program=compiled_prog)
+        self.assertRaises(Exception,
+                          fleet.save_persistables,
+                          executor=pe,
+                          dirname='/tmp/')
+        self.assertRaises(Exception,
+                          fleet.save_persistables,
+                          executor="executor",
+                          dirname='/tmp/')
+        self.assertRaises(Exception,
+                          fleet.save_persistables,
+                          executor=exe,
+                          dirname='/tmp/',
+                          main_program=compiled_prog)
         self.assertRaises(Exception, fleet._transpile, "config")
 
     def set_program(self, avg_cost, strategy):
@@ -147,6 +146,7 @@ def test_transpile(self):
 
 
 class TranspilerOptimizerTest(unittest.TestCase):
+
     def testInvalidInputs(self):
         self.assertRaises(Exception, TranspilerOptimizer, "Adam", None)
         self.assertRaises(Exception, TranspilerOptimizer,
@@ -157,11 +157,14 @@ def testInvalidInputs(self):
         data = fluid.layers.data(name='X', shape=[1], dtype='float32')
         hidden = fluid.layers.fc(input=data, size=10)
         loss = fluid.layers.mean(hidden)
-        self.assertRaises(
-            Exception, transpiler.minimize, loss=loss.name, startup_program=[])
+        self.assertRaises(Exception,
+                          transpiler.minimize,
+                          loss=loss.name,
+                          startup_program=[])
 
 
 class UserDefinedRoleMakerTest(unittest.TestCase):
+
     def createRoleMaker(self,
                         current_id=0,
                         role=Role.WORKER,
@@ -179,19 +182,19 @@ def testRoleMaker(self):
         self.assertRaises(
             Exception, self.createRoleMaker,
             server_endpoints=[])  # server_endpoints can't be empty
-        self.assertRaises(
-            Exception, self.createRoleMaker, server_endpoints=[
-                3, []
-            ])  # element in server_endpoints must be as string
-        self.assertRaises(
-            Exception,
-            self.createRoleMaker,
-            server_endpoints=["127.0.0.1:8080", "127.0.0.1:8080"]
-        )  # element in server_endpoints can't be duplicate
+        self.assertRaises(Exception,
+                          self.createRoleMaker,
+                          server_endpoints=[
+                              3, []
+                          ])  # element in server_endpoints must be as string
+        self.assertRaises(Exception,
+                          self.createRoleMaker,
+                          server_endpoints=[
+                              "127.0.0.1:8080", "127.0.0.1:8080"
+                          ])  # element in server_endpoints can't be duplicate
         # test all invalid current_id
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            current_id="0")  # current_id must be as int
+        self.assertRaises(Exception, self.createRoleMaker,
+                          current_id="0")  # current_id must be as int
         self.assertRaises(
             Exception, self.createRoleMaker,
             current_id=-1)  # current_id must be greater than or equal to 0
@@ -203,12 +206,10 @@ def testRoleMaker(self):
             server_endpoints=["127.0.0.1:8080"]
         )  # if role is server, current_id must be less than len(server_endpoints)
         # test all invalid worker_num
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_num="1")  # worker_num must be as int
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_num=0)  # worker_num must be greater than 0
+        self.assertRaises(Exception, self.createRoleMaker,
+                          worker_num="1")  # worker_num must be as int
+        self.assertRaises(Exception, self.createRoleMaker,
+                          worker_num=0)  # worker_num must be greater than 0
         # test all invalid role
         self.assertRaises(
             Exception, self.createRoleMaker,
@@ -216,7 +217,9 @@ def testRoleMaker(self):
 
 
 class UserDefinedCollectiveRoleMakerTest(unittest.TestCase):
-    def createRoleMaker(self, current_id=0,
+
+    def createRoleMaker(self,
+                        current_id=0,
                         worker_endpoints=["127.0.0.1:8080"]):
         role = UserDefinedCollectiveRoleMaker(current_id, worker_endpoints)
 
@@ -229,19 +232,19 @@ def testRoleMaker(self):
         self.assertRaises(
             Exception, self.createRoleMaker,
             worker_endpoints=[])  # worker_endpoints can't be empty
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            worker_endpoints=[3,
-                              []])  # element worker_endpoints must be as string
-        self.assertRaises(
-            Exception,
-            self.createRoleMaker,
-            worker_endpoints=["127.0.0.1:8080", "127.0.0.1:8080"]
-        )  # element in worker_endpoints can't be duplicate
+        self.assertRaises(Exception,
+                          self.createRoleMaker,
+                          worker_endpoints=[
+                              3, []
+                          ])  # element worker_endpoints must be as string
+        self.assertRaises(Exception,
+                          self.createRoleMaker,
+                          worker_endpoints=[
+                              "127.0.0.1:8080", "127.0.0.1:8080"
+                          ])  # element in worker_endpoints can't be duplicate
         # test all invalid current_id
-        self.assertRaises(
-            Exception, self.createRoleMaker,
-            current_id="0")  # current_id must be as int
+        self.assertRaises(Exception, self.createRoleMaker,
+                          current_id="0")  # current_id must be as int
         self.assertRaises(
             Exception, self.createRoleMaker,
             current_id=-1)  # current_id must be greater than or equal to 0
@@ -249,11 +252,13 @@ def testRoleMaker(self):
             Exception,
             self.createRoleMaker,
             current_id=1,
-            worker_endpoints=["127.0.0.1:8080"]
-        )  # current_id must be less than len(worker_endpoints)
+            worker_endpoints=[
+                "127.0.0.1:8080"
+            ])  # current_id must be less than len(worker_endpoints)
 
 
 class CollectiveOptimizerTest(unittest.TestCase):
+
     def test_ds_as_None(self):
         optimizer = fluid.optimizer.AdamOptimizer()
         dist_optimizer = CollectiveOptimizer(optimizer, strategy=None)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
index b9d88a8e1155e..d81406387054d 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,11 +25,15 @@
 import paddle.distributed.fleet.ascend_utils as ascend_utils
 
 RANK_TABLE_JSON = {
-    "status": "completed",
-    "version": "1.0",
-    "server_count": "1",
+    "status":
+    "completed",
+    "version":
+    "1.0",
+    "server_count":
+    "1",
     "server_list": [{
-        "server_id": "127.0.0.1",
+        "server_id":
+        "127.0.0.1",
         "device": [{
             "device_id": "0",
             "device_ip": "192.1.184.23",
@@ -44,6 +48,7 @@
 
 
 class TestAscendUtil(unittest.TestCase):
+
     def test_get_cloud_cluster(self):
         cluster, pod = ascend_utils.get_cloud_cluster()
         self.assertTrue(cluster)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py
index 3e5b479fab559..460ef27f63c18 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py
@@ -22,6 +22,7 @@
 
 
 class TestDistributedStrategyAuto(unittest.TestCase):
+
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
@@ -31,15 +32,16 @@ def setUp(self):
 
     def test_distributed_strategy_auto(self):
         fleet.init(is_collective=True)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 99986043ec70e..46263d1a10ec0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -23,6 +23,7 @@
 
 
 class TestFleetBase(unittest.TestCase):
+
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36000"
@@ -59,8 +60,8 @@ def test_is_worker(self):
     def test_worker_endpoints(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        self.assertEqual(
-            "127.0.0.1:36000", fleet.worker_endpoints(to_string=True))
+        self.assertEqual("127.0.0.1:36000",
+                         fleet.worker_endpoints(to_string=True))
         self.assertEqual(["127.0.0.1:36000"], fleet.worker_endpoints())
 
     def test_server_num(self):
@@ -90,9 +91,8 @@ def test_server_endpoints(self):
         role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
         if fleet.is_server():
-            self.assertEqual(
-                "127.0.0.1:36001,127.0.0.2:36002",
-                fleet.server_endpoints(to_string=True))
+            self.assertEqual("127.0.0.1:36001,127.0.0.2:36002",
+                             fleet.server_endpoints(to_string=True))
             self.assertEqual(["127.0.0.1:36001", "127.0.0.2:36002"],
                              fleet.server_endpoints())
 
@@ -144,6 +144,7 @@ def test_exception(self):
 
 
 class TestFleetDygraph(unittest.TestCase):
+
     def setUp(self):
         os.environ[
             "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213,127.0.0.1:36214"
@@ -156,15 +157,15 @@ def test_dygraph_method(self):
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = fluid.dygraph.to_variable(value)
         layer = paddle.nn.Linear(13, 5)
-        adam = paddle.optimizer.Adam(
-            learning_rate=0.01, parameters=layer.parameters())
+        adam = paddle.optimizer.Adam(learning_rate=0.01,
+                                     parameters=layer.parameters())
         # remove init cause this UT cannot launch distributed task
         adam = fleet.distributed_optimizer(adam)
         try:
             dp_layer = fleet.distributed_model(layer)
         except Exception as e:
-            # This is just for testing the interface, 
-            # and will not actually be called. Therefore, 
+            # This is just for testing the interface,
+            # and will not actually be called. Therefore,
             # use "try-except" to avoid errors.
             lr = 0.001
             adam.set_lr(lr)
@@ -177,20 +178,22 @@ def test_dygraph_method(self):
 
 
 class TestFleetBaseSingleError(unittest.TestCase):
+
     def setUp(self):
         os.environ.pop("PADDLE_TRAINER_ENDPOINTS")
 
     def gen_data(self):
         return {
             "x": np.random.random(size=(128, 32)).astype('float32'),
-            "y": np.random.randint(
-                2, size=(128, 1)).astype('int64')
+            "y": np.random.randint(2, size=(128, 1)).astype('int64')
         }
 
     def test_single_run_collective_minimize(self):
+
         def test_single_error():
-            input_x = paddle.static.data(
-                name="x", shape=[-1, 32], dtype='float32')
+            input_x = paddle.static.data(name="x",
+                                         shape=[-1, 32],
+                                         dtype='float32')
             input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
 
             fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
index 9675a77d6766b..529e9995bd54b 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
@@ -14,6 +14,7 @@
 
 import unittest
 import paddle
+
 paddle.enable_static()
 
 import os
@@ -21,6 +22,7 @@
 
 
 class TestFleetBase(unittest.TestCase):
+
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
         os.environ["PADDLE_PORT"] = "36000"
@@ -35,20 +37,23 @@ def test_ps_minimize(self):
         os.environ["TRAINING_ROLE"] = "TRAINER"
         os.environ["PADDLE_TRAINER_ID"] = "1"
 
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_slot = paddle.fluid.layers.data(
-            name="slot", shape=[1], dtype='int64')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
+        input_slot = paddle.fluid.layers.data(name="slot",
+                                              shape=[1],
+                                              dtype='int64')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
-        emb = paddle.fluid.layers.embedding(
-            input=input_slot, size=[10, 9], is_sparse=True)
+        emb = paddle.fluid.layers.embedding(input=input_slot,
+                                            size=[10, 9],
+                                            is_sparse=True)
         input_x = paddle.concat(x=[input_x, emb], axis=1)
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         role = fleet.PaddleCloudRoleMaker(is_collective=False)
@@ -71,8 +76,9 @@ def test_ps_minimize(self):
 
         fleet.init_worker()
         fleet.fleet.save(dirname="/tmp", feed=['x', 'y'], fetch=[avg_cost])
-        fleet.fleet.save(
-            dirname="/tmp", feed=[input_x, input_y], fetch=[avg_cost])
+        fleet.fleet.save(dirname="/tmp",
+                         feed=[input_x, input_y],
+                         fetch=[avg_cost])
         fleet.fleet.save(dirname="/tmp")
 
         fleet.load_model(path="/tmp", mode=0)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
index 8dcacafabbbf2..5e6aabe308ec1 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py
@@ -18,10 +18,12 @@
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
+
 paddle.enable_static()
 
 
 class TestFleetBase_1(unittest.TestCase):
+
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
@@ -30,15 +32,16 @@ def setUp(self):
                        "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_collective_minimize(self):
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
@@ -50,6 +53,7 @@ def test_collective_minimize(self):
 
 
 class TestFleetBase(unittest.TestCase):
+
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
@@ -58,15 +62,16 @@ def setUp(self):
                        "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_fleet_get_applied_optimizer(self):
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         fleet.init(is_collective=True)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
index dba409ec9200e..fa154285f21a3 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_4.py
@@ -22,6 +22,7 @@
 
 
 class TestFleetBase(unittest.TestCase):
+
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_single.py b/python/paddle/fluid/tests/unittests/test_fleet_base_single.py
index ff54035045b2e..a782bf3842d82 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_single.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_single.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 import os
+
 cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
 if cuda_visible_devices is None or cuda_visible_devices == "":
     os.environ['CUDA_VISIBLE_DEVICES'] = '0'
@@ -28,6 +29,7 @@
 
 
 class LinearNet(nn.Layer):
+
     def __init__(self):
         super(LinearNet, self).__init__()
         self._linear1 = nn.Linear(10, 10)
@@ -38,6 +40,7 @@ def forward(self, x):
 
 
 class TestFleetDygraphSingle(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
         os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
@@ -50,8 +53,8 @@ def test_dygraph_single(self):
 
         layer = LinearNet()
         loss_fn = nn.MSELoss()
-        adam = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=layer.parameters())
+        adam = paddle.optimizer.Adam(learning_rate=0.001,
+                                     parameters=layer.parameters())
 
         adam = fleet.distributed_optimizer(adam)
         dp_layer = fleet.distributed_model(layer)
@@ -66,14 +69,14 @@ def test_dygraph_single(self):
 
 
 class TestFleetBaseSingleRunCollective(unittest.TestCase):
+
     def setUp(self):
         pass
 
     def gen_data(self):
         return {
             "x": np.random.random(size=(128, 32)).astype('float32'),
-            "y": np.random.randint(
-                2, size=(128, 1)).astype('int64')
+            "y": np.random.randint(2, size=(128, 1)).astype('int64')
         }
 
     def test_single_run_collective_minimize(self):
@@ -91,8 +94,8 @@ def test_single_run_collective_minimize(self):
         optimizer = fleet.distributed_optimizer(optimizer)
         optimizer.minimize(avg_cost)
 
-        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if paddle.fluid.is_compiled_with_cuda() else fluid.CPUPlace()
 
         exe = fluid.Executor(place)
         exe.run(paddle.static.default_startup_program())
@@ -103,14 +106,14 @@ def test_single_run_collective_minimize(self):
 
 
 class TestFleetBaseSingleRunPS(unittest.TestCase):
+
     def setUp(self):
         pass
 
     def gen_data(self):
         return {
             "x": np.random.random(size=(128, 32)).astype('float32'),
-            "y": np.random.randint(
-                2, size=(128, 1)).astype('int64')
+            "y": np.random.randint(2, size=(128, 1)).astype('int64')
         }
 
     def test_single_run_ps_minimize(self):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
index fc57602b445dd..f48b166f97035 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
@@ -26,6 +26,7 @@
 
 
 class FleetTest(unittest.TestCase):
+
     def _test_checkpoint(self, fs, dir_path):
         file_name = "persistables"
 
@@ -38,8 +39,8 @@ def _test_checkpoint(self, fs, dir_path):
 
         image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
         label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-        feeder = fluid.DataFeeder(
-            feed_list=[image, label], place=fluid.CPUPlace())
+        feeder = fluid.DataFeeder(feed_list=[image, label],
+                                  place=fluid.CPUPlace())
         predict = fluid.layers.fc(input=image, size=10, act='softmax')
         loss = fluid.layers.cross_entropy(input=predict, label=label)
         avg_loss = fluid.layers.mean(loss)
@@ -53,21 +54,26 @@ def _test_checkpoint(self, fs, dir_path):
 
         status = ExeTrainStatus()
         status.epoch_no = 2
-        _, n1 = fleet.save_checkpoint(
-            exe, dir_path, trainer_id=0, train_status=status, fs=fs)
+        _, n1 = fleet.save_checkpoint(exe,
+                                      dir_path,
+                                      trainer_id=0,
+                                      train_status=status,
+                                      fs=fs)
 
         status2 = ExeTrainStatus()
-        fleet.load_checkpoint(
-            exe, dir_path, trainer_id=0, fs=fs, train_status=status2)
+        fleet.load_checkpoint(exe,
+                              dir_path,
+                              trainer_id=0,
+                              fs=fs,
+                              train_status=status2)
         self.assertEqual(status2, status)
 
-        _, n2 = fleet.save_checkpoint(
-            exe,
-            dir_path,
-            trainer_id=0,
-            train_status=status,
-            fs=fs,
-            remain_all_checkpoint=False)
+        _, n2 = fleet.save_checkpoint(exe,
+                                      dir_path,
+                                      trainer_id=0,
+                                      train_status=status,
+                                      fs=fs,
+                                      remain_all_checkpoint=False)
         self.assertEqual(n2, n1 + 1)
 
         c = CheckpointSaver(fs)
@@ -75,40 +81,37 @@ def _test_checkpoint(self, fs, dir_path):
         assert len(cp_nos) == 1  # cleanup all others
 
         # unnormal
-        # test remain_all_checkpoint 
-        fleet.save_checkpoint(
-            exe,
-            dir_path,
-            trainer_id=0,
-            train_status=status,
-            fs=fs,
-            remain_all_checkpoint=False)
+        # test remain_all_checkpoint
+        fleet.save_checkpoint(exe,
+                              dir_path,
+                              trainer_id=0,
+                              train_status=status,
+                              fs=fs,
+                              remain_all_checkpoint=False)
 
         # can't save under a file
         fs = LocalFS()
         cache_path = "./.load_cache"
         fs.touch(cache_path)
         try:
-            fleet.save_checkpoint(
-                exe,
-                dir_path,
-                trainer_id=0,
-                train_status=status,
-                fs=fs,
-                cache_path=cache_path)
+            fleet.save_checkpoint(exe,
+                                  dir_path,
+                                  trainer_id=0,
+                                  train_status=status,
+                                  fs=fs,
+                                  cache_path=cache_path)
             self.assertFalse(True)
         except:
             pass
 
         # can't load under a file
         try:
-            fleet.load_checkpoint(
-                exe,
-                dir_path,
-                trainer_id=0,
-                train_status=status2,
-                fs=fs,
-                cache_path=cache_path)
+            fleet.load_checkpoint(exe,
+                                  dir_path,
+                                  trainer_id=0,
+                                  train_status=status2,
+                                  fs=fs,
+                                  cache_path=cache_path)
             self.assertFalse(True)
         except:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
index 3a64c1818ccc6..522b563bc5683 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py
@@ -25,14 +25,15 @@
 
 
 class TestFleetDGCOptimizer(TestFleetMetaOptimizer):
+
     def test_dgc_optimizer_backward(self):
         """ test dgc optimizer backward """
         train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
         self.set_strategy(strategy, 'dgc')
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         dgc_opt = DGCOptimizer(opt)
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
@@ -47,8 +48,8 @@ def test_dgc_optimizer_gradients(self):
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
         self.set_strategy(strategy, 'dgc')
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         dgc_opt = DGCOptimizer(opt)
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
@@ -66,8 +67,8 @@ def test_dgc_optimizer_optimize(self):
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
         self.set_strategy(strategy, 'dgc')
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         dgc_opt = DGCOptimizer(opt)
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         dgc_opt._set_basic_info(avg_cost, role, opt, strategy)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index ffc3f2b21a476..455a7a30cfd18 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -18,6 +18,7 @@
 
 
 class TestStrategyConfig(unittest.TestCase):
+
     def test_amp(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.amp = True
@@ -266,10 +267,12 @@ def test_sparse_table_configs(self):
             [1, 2]
         }
         strategy.sparse_table_configs = configs
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.embed_sgd_param.adagrad.learning_rate, 0.05)
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.table_accessor_save_param[0].param, 1)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
+            learning_rate, 0.05)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.
+            table_accessor_save_param[0].param, 1)
 
         strategy.adam_d2sum = True
         self.assertEqual(strategy.adam_d2sum, True)
@@ -286,22 +289,25 @@ def test_fleet_desc_configs(self):
         configs = {}
         configs['emb'] = {"sparse_optimizer": "adagrad"}
         strategy.fleet_desc_configs = configs
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.embed_sgd_param.adagrad.learning_rate, 0.05)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
+            learning_rate, 0.05)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {}
         configs['emb'] = {"sparse_optimizer": "naive"}
         strategy.fleet_desc_configs = configs
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.embed_sgd_param.naive.learning_rate, 0.05)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.embed_sgd_param.naive.
+            learning_rate, 0.05)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {}
         configs['emb'] = {"sparse_optimizer": "adam"}
         strategy.fleet_desc_configs = configs
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.embed_sgd_param.adam.beta1_decay_rate, 0.9)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adam.
+            beta1_decay_rate, 0.9)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {}
@@ -310,10 +316,12 @@ def test_fleet_desc_configs(self):
             "embed_sparse_optimizer": "std_adagrad"
         }
         strategy.fleet_desc_configs = configs
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.ctr_accessor_param.show_scale, False)
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.embed_sgd_param.adagrad.initial_range, 0)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.ctr_accessor_param.
+            show_scale, False)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
+            initial_range, 0)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {}
@@ -322,9 +330,9 @@ def test_fleet_desc_configs(self):
             "embed_sparse_optimizer": "std_adagrad"
         }
         strategy.fleet_desc_configs = configs
-        self.assertEqual(strategy.sparse_table_configs[0]
-                         .accessor.embed_sgd_param.adagrad.initial_range,
-                         0.0001)
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
+            initial_range, 0.0001)
 
     def test_trainer_desc_configs(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py b/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py
index 2d2f019c5ed09..3bc5d886011e4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py
@@ -32,6 +32,7 @@
 
 
 class TestCollectiveLauncher(unittest.TestCase):
+
     def setUp(self):
         file_dir = os.path.dirname(os.path.abspath(__file__))
 
@@ -40,6 +41,7 @@ def setUp(self):
             f.write(fake_python_code)
 
     def test_launch(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -78,6 +80,7 @@ class Argument:
             pass
 
     def test_stop(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_elastic_init.py b/python/paddle/fluid/tests/unittests/test_fleet_elastic_init.py
index 10028d2d98f67..b7310ab4486d0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_elastic_init.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_elastic_init.py
@@ -25,7 +25,9 @@
 
 
 class TestElasticInit(unittest.TestCase):
+
     def setUp(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py b/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py
index 6dc9f69d03f7c..61d84151b68b2 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py
@@ -26,11 +26,13 @@
 
 
 class MockLease():
+
     def refresh(self):
         pass
 
 
 class MockEtcdClient:
+
     def __init__(self, lease=None):
         self._lease = lease
 
@@ -69,10 +71,12 @@ def lease(self, ttl):
 
 
 class TestElasticManager(unittest.TestCase):
+
     def setUp(self):
         self.etcd_client = MockEtcdClient()
 
     def test_elastic_manager_init(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -89,6 +93,7 @@ class Argument:
         args = Argument()
 
         class _MockLease():
+
             def refresh(self):
                 raise ValueError("valid error, this only for unittest")
 
@@ -96,6 +101,7 @@ def refresh(self):
         elastic = ElasticManager(args, etcd_client=etcd_client)
 
     def test_match_faulttolerance(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -126,6 +132,7 @@ class Argument:
         self.assertEqual(elastic._match(hosts), False)
 
     def test_match_elastic(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -177,6 +184,7 @@ class Argument:
         #self.assertEqual(elastic._match(hosts), True)
 
     def test_update_hosts_for_faulttolerance(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -253,8 +261,8 @@ class Argument:
         elastic._update_hosts()
         #self.assertEqual(elastic.all_host_endpoints,
         #                 ["10.10.10.1:6001", "10.10.10.2:6001", "10.10.10.3:6001"])
-        self.assertEqual(
-            os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.2,10.10.10.3")
+        self.assertEqual(os.getenv('PADDLE_TRAINERS'),
+                         "10.10.10.1,10.10.10.2,10.10.10.3")
 
         #######################
         # elastic, scale in #
@@ -279,11 +287,10 @@ class Argument:
         elastic._update_hosts()
         #self.assertEqual(elastic.all_host_endpoints,
         #                 ["10.10.10.3:6001", "10.10.10.1:6001", "10.10.10.2:6001"])
-        self.assertEqual(
-            os.getenv('PADDLE_TRAINERS'), "10.10.10.3,10.10.10.1,10.10.10.2")
-        self.assertEqual(
-            os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
-            "10.10.10.3:6001,10.10.10.1:6001,10.10.10.2:6001")
+        self.assertEqual(os.getenv('PADDLE_TRAINERS'),
+                         "10.10.10.3,10.10.10.1,10.10.10.2")
+        self.assertEqual(os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
+                         "10.10.10.3:6001,10.10.10.1:6001,10.10.10.2:6001")
 
         ############
         os.environ[
@@ -305,11 +312,11 @@ class Argument:
         #self.assertEqual(elastic.all_host_endpoints,
         #                 ["10.10.10.1:6001", "10.10.10.1:6001"])
         self.assertEqual(os.getenv('PADDLE_TRAINERS'), "10.10.10.1,10.10.10.1")
-        self.assertEqual(
-            os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
-            "10.10.10.1:6001,10.10.10.1:6003")
+        self.assertEqual(os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS'),
+                         "10.10.10.1:6001,10.10.10.1:6003")
 
     def test_exit(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -328,6 +335,7 @@ class Argument:
         elastic.exit()
 
     def test_pre_hook(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -350,6 +358,7 @@ class Argument:
         elastic.pre_hook()
 
     def test_watch(self):
+
         class Argument:
             elastic_server = "127.0.0.1:2379"
             job_id = "test_job_id_123"
@@ -365,6 +374,7 @@ class Argument:
             elastic_pre_hook = None
 
         class ElasticLauncher:
+
             def watch(self):
                 return ELASTIC_AUTO_PARALLEL_EXIT_CODE
 
@@ -378,11 +388,14 @@ def stop(self):
         elastic.watch()
 
     def test_launcher_interface_check_procs(self):
+
         class Proc:
+
             def poll(self):
                 return ELASTIC_AUTO_PARALLEL_EXIT_CODE
 
         class ProcList:
+
             def __init__(self):
                 self.proc = Proc()
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
index 544fe4dd43e6b..0c672f1ff1ec1 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
@@ -22,6 +22,7 @@
 
 
 class TestDistModelRun(unittest.TestCase):
+
     def test_dist_model_run(self):
         # step 0: declare folder to save the model and params
         folder = './dist_model_run_test/'
@@ -39,8 +40,10 @@ def test_dist_model_run(self):
         x_data = np.random.randn(28, 28).astype('float32')
         y_data = np.random.randint(0, 9, size=[28, 1]).astype('int64')
         exe.run(paddle.static.default_main_program(),
-                feed={'x': x_data,
-                      'y': y_data},
+                feed={
+                    'x': x_data,
+                    'y': y_data
+                },
                 fetch_list=[avg_loss])
         paddle.static.save_inference_model(path_prefix, [x, y], [avg_loss], exe)
         print('save model to', path_prefix)
@@ -63,11 +66,13 @@ def test_dist_model_run(self):
         print("dist model rst:", dist_model_rst)
 
         # step 4: use framework's api to inference with fake data
-        [inference_program, feed_target_names, fetch_targets] = (
-            paddle.static.load_inference_model(path_prefix, exe))
+        [inference_program, feed_target_names,
+         fetch_targets] = (paddle.static.load_inference_model(path_prefix, exe))
         results = exe.run(inference_program,
-                          feed={'x': x_tensor,
-                                'y': y_tensor},
+                          feed={
+                              'x': x_tensor,
+                              'y': y_tensor
+                          },
                           fetch_list=fetch_targets)
         load_inference_model_rst = results[0]
         print("load inference model api rst:", load_inference_model_rst)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py
index 2d4fe92f05156..98affdfa54071 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py
@@ -22,12 +22,13 @@
 
 
 class TestDistModelTensor(unittest.TestCase):
+
     def test_dist_model_tensor(self):
         tensor_32 = np.random.randint(10, 20, size=[20, 2]).astype('int32')
         dist_tensor32 = DistModelTensor(tensor_32, '32_tensor')
         self.assertEqual(dist_tensor32.dtype, DistModelDataType.INT32)
-        self.assertEqual(
-            dist_tensor32.data.tolist('int32'), tensor_32.ravel().tolist())
+        self.assertEqual(dist_tensor32.data.tolist('int32'),
+                         tensor_32.ravel().tolist())
         # the length is how many byte the data contains
         self.assertEqual(dist_tensor32.data.length(), 40 * 4)
         self.assertEqual(dist_tensor32.name, '32_tensor')
@@ -38,8 +39,8 @@ def test_dist_model_tensor(self):
         tensor_64 = np.random.randint(10, 20, size=[20, 2]).astype('int64')
         dist_tensor64 = DistModelTensor(tensor_64, '64_tensor')
         self.assertEqual(dist_tensor64.dtype, DistModelDataType.INT64)
-        self.assertEqual(
-            dist_tensor64.data.tolist('int64'), tensor_64.ravel().tolist())
+        self.assertEqual(dist_tensor64.data.tolist('int64'),
+                         tensor_64.ravel().tolist())
         self.assertEqual(dist_tensor64.data.length(), 40 * 8)
         self.assertEqual(dist_tensor64.name, '64_tensor')
         dist_tensor64.data.reset(tensor_64)
@@ -49,9 +50,8 @@ def test_dist_model_tensor(self):
         tensor_float = np.random.randn(20, 2).astype('float32')
         dist_tensor_float = DistModelTensor(tensor_float, 'float_tensor')
         self.assertEqual(dist_tensor_float.dtype, DistModelDataType.FLOAT32)
-        self.assertEqual(
-            dist_tensor_float.data.tolist('float32'),
-            tensor_float.ravel().tolist())
+        self.assertEqual(dist_tensor_float.data.tolist('float32'),
+                         tensor_float.ravel().tolist())
         self.assertEqual(dist_tensor_float.data.length(), 40 * 4)
         self.assertEqual(dist_tensor_float.name, 'float_tensor')
         dist_tensor_float.data.reset(tensor_float)
@@ -62,9 +62,8 @@ def test_dist_model_tensor(self):
         dist_tensor_float_16 = DistModelTensor(tensor_float_16,
                                                'float_tensor_16')
         self.assertEqual(dist_tensor_float_16.dtype, DistModelDataType.FLOAT16)
-        self.assertEqual(
-            dist_tensor_float_16.data.tolist('float16'),
-            tensor_float_16.ravel().tolist())
+        self.assertEqual(dist_tensor_float_16.data.tolist('float16'),
+                         tensor_float_16.ravel().tolist())
         self.assertEqual(dist_tensor_float_16.data.length(), 40 * 2)
         self.assertEqual(dist_tensor_float_16.name, 'float_tensor_16')
         dist_tensor_float_16.data.reset(tensor_float_16)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py
index 8b73a714bbbc5..b824df45e3e1f 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py
@@ -21,6 +21,7 @@
 
 
 class TestFleetExecutor(unittest.TestCase):
+
     def fake_fleet_opt(self):
         # TODO: Fake for coverage will be removed in the future
         import paddle.distributed.fleet as fleet
@@ -42,10 +43,12 @@ def run_fleet_executor(self, place, x_data, y_data):
         exe = paddle.static.Executor(place)
         empty_program = paddle.static.Program()
         with fluid.program_guard(empty_program, empty_program):
-            x = fluid.layers.data(
-                name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = fluid.layers.data(
-                name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = fluid.layers.data(name='x',
+                                  shape=x_data.shape,
+                                  dtype=x_data.dtype)
+            y = fluid.layers.data(name='y',
+                                  shape=y_data.shape,
+                                  dtype=y_data.dtype)
             z = x + y
             a = 2 * x + 3 * y
             loss = paddle.mean(a)
@@ -54,8 +57,8 @@ def run_fleet_executor(self, place, x_data, y_data):
             steps_per_pass = 10
             bd = [steps_per_pass * p for p in passes]
             lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-            lr_val = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr)
+            lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                        values=lr)
             opt = paddle.optimizer.AdamW(
                 learning_rate=lr_val,
                 grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
@@ -66,8 +69,10 @@ def run_fleet_executor(self, place, x_data, y_data):
             "section_program": empty_program
         }
         res = exe.run(empty_program,
-                      feed={'x': x_data,
-                            'y': y_data},
+                      feed={
+                          'x': x_data,
+                          'y': y_data
+                      },
                       fetch_list=[z.name, a.name])
         return res
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py
index fb82c71b2ff7f..c21549c3ce334 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py
@@ -22,6 +22,7 @@
 
 
 class TestFleetExecutor(unittest.TestCase):
+
     def run_fleet_executor(self, place, fleet_opt=dict()):
         exe = paddle.static.Executor(place)
         empty_program = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
index 4bbb3bff07f97..295530d9c9d43 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
@@ -21,6 +21,7 @@
 
 
 class TestFleetExecutor(unittest.TestCase):
+
     def fake_fleet_opt(self):
         # TODO: Fake for coverage will be removed in the future
         import paddle.distributed.fleet as fleet
@@ -42,10 +43,12 @@ def run_fleet_executor(self, place, x_data, y_data):
         exe = paddle.static.Executor(place)
         empty_program = paddle.static.Program()
         with fluid.program_guard(empty_program, empty_program):
-            x = fluid.layers.data(
-                name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = fluid.layers.data(
-                name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = fluid.layers.data(name='x',
+                                  shape=x_data.shape,
+                                  dtype=x_data.dtype)
+            y = fluid.layers.data(name='y',
+                                  shape=y_data.shape,
+                                  dtype=y_data.dtype)
             z = x + y
             a = 2 * x + 3 * y
             loss = paddle.mean(a)
@@ -54,8 +57,8 @@ def run_fleet_executor(self, place, x_data, y_data):
             steps_per_pass = 10
             bd = [steps_per_pass * p for p in passes]
             lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-            lr_val = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr)
+            lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                        values=lr)
             opt = paddle.optimizer.AdamW(
                 learning_rate=lr_val,
                 grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
@@ -66,8 +69,10 @@ def run_fleet_executor(self, place, x_data, y_data):
             "section_program": empty_program
         }
         res = exe.run(empty_program,
-                      feed={'x': x_data,
-                            'y': y_data},
+                      feed={
+                          'x': x_data,
+                          'y': y_data
+                      },
                       fetch_list=[z.name, a.name])
         return res
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py
index 3dae8a5bf6b95..0830782c86d42 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class TestFleetExecutorTaskNode(unittest.TestCase):
+
     def test_task_node(self):
         program = paddle.static.Program()
         task_node_0 = core.TaskNode(program.desc, 0, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py
index 61064175266bb..f531b85c3ddc5 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py
@@ -22,14 +22,17 @@
 
 
 class TestFleetExecutor(unittest.TestCase):
+
     def run_fleet_executor(self, place, x_data, y_data):
         exe = paddle.static.Executor(place)
         empty_program = paddle.static.Program()
         with fluid.program_guard(empty_program, empty_program):
-            x = fluid.layers.data(
-                name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = fluid.layers.data(
-                name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = fluid.layers.data(name='x',
+                                  shape=x_data.shape,
+                                  dtype=x_data.dtype)
+            y = fluid.layers.data(name='y',
+                                  shape=y_data.shape,
+                                  dtype=y_data.dtype)
             z = x + y
             a = 2 * x + 3 * y
             loss = paddle.mean(a)
@@ -38,8 +41,8 @@ def run_fleet_executor(self, place, x_data, y_data):
             steps_per_pass = 10
             bd = [steps_per_pass * p for p in passes]
             lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-            lr_val = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr)
+            lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                        values=lr)
             opt = paddle.optimizer.AdamW(
                 learning_rate=lr_val,
                 grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
@@ -61,8 +64,10 @@ def run_fleet_executor(self, place, x_data, y_data):
             "section_program": empty_program
         }
         res = exe.run(empty_program,
-                      feed={'x': x_data,
-                            'y': y_data},
+                      feed={
+                          'x': x_data,
+                          'y': y_data
+                      },
                       fetch_list=[z.name, a.name])
         return res
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
index efffa9fa88fde..d7de5ef3d40eb 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
@@ -23,24 +23,27 @@
 
 
 class TestFleetFP16CompressOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
 
     def net(self, main_prog, startup_prog, dtype='float32'):
         with fluid.program_guard(main_prog, startup_prog):
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype=dtype)
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype=dtype)
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
 
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
             strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
index efe62a32fc3f7..0f8b36e3f89b3 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -24,6 +24,7 @@
 
 
 class TestFleetGradientMergeMetaOptimizer(TestFleetMetaOptimizer):
+
     def test_gradient_merge_optimizer(self):
         train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
         )
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_scale.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_scale.py
index d64b534398ddf..7fd6211b33b39 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_scale.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_scale.py
@@ -24,6 +24,7 @@
 
 
 class TestGradientScale(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
@@ -34,16 +35,15 @@ def mlp(self, input_x, input_y, hid_dim=128, label_dim=2):
         prediction = paddle.static.nn.fc(x=[fc_2],
                                          size=label_dim,
                                          activation='softmax')
-        cost = paddle.nn.functional.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.nn.functional.cross_entropy(input=prediction,
+                                                  label=input_y)
         avg_cost = paddle.mean(x=cost)
         return avg_cost
 
     def gen_data(self):
         return {
             "x": np.random.random(size=(128, 32)).astype('float32'),
-            "y": np.random.randint(
-                2, size=(128, 1)).astype('int64')
+            "y": np.random.randint(2, size=(128, 1)).astype('int64')
         }
 
     def test_single_gpu(self):
@@ -55,10 +55,12 @@ def test_single_gpu(self):
         strategy.gradient_scale_configs = {'scale_strategy': 'sum'}
         with fluid.program_guard(main_program, startup_program):
             with fluid.unique_name.guard():
-                input_x = paddle.static.data(
-                    name="x", shape=[None, 32], dtype='float32')
-                input_y = paddle.static.data(
-                    name="y", shape=[None, 1], dtype='int64')
+                input_x = paddle.static.data(name="x",
+                                             shape=[None, 32],
+                                             dtype='float32')
+                input_y = paddle.static.data(name="y",
+                                             shape=[None, 1],
+                                             dtype='int64')
                 cost = self.mlp(input_x=input_x, input_y=input_y)
                 output_name = cost.name
                 optimizer = fleet.distributed_optimizer(fluid.optimizer.Adam(),
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
index 393de688aa5e0..6ca078cdde7f5 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
@@ -21,6 +21,7 @@
 
 
 class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         try:
             self._dist_ut_port_0 = int(os.environ["PADDLE_DIST_UT_PORT"])
@@ -33,46 +34,58 @@ def test_graph_execution_optimizer_not_apply(self):
         port_a = self._dist_ut_port_0
         port_b = self._dist_ut_port_1
         node_a = {
-            "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "0",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_a),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         node_b = {
-            "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "1",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_b),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         def node_func():
             import paddle.distributed.fleet as fleet
             fleet.init(is_collective=True)
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
 
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
             strategy = paddle.distributed.fleet.DistributedStrategy()
             optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
             exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
@@ -89,48 +102,60 @@ def test_graph_execution_optimizer(self):
         port_b = self._dist_ut_port_1 + 2
 
         node_a = {
-            "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "0",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_a),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         node_b = {
-            "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "1",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_b),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         def node_func():
             import paddle.distributed.fleet as fleet
             fleet.init(is_collective=True)
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
 
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
             strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
             strategy.sync_nccl_allreduce = True
             optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
             exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
             exe.run(paddle.fluid.default_startup_program())
@@ -140,8 +165,7 @@ def node_func():
             def gen_data():
                 return {
                     "x": np.random.random(size=(128, 32)).astype('float32'),
-                    "y": np.random.randint(
-                        2, size=(128, 1)).astype('int64')
+                    "y": np.random.randint(2, size=(128, 1)).astype('int64')
                 }
 
             for i in range(10):
@@ -158,46 +182,58 @@ def test_graph_execution_optimizer_not_apply_v2(self):
         port_a = self._dist_ut_port_0 + 4
         port_b = self._dist_ut_port_1 + 4
         node_a = {
-            "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "0",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_a),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         node_b = {
-            "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "1",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_b),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         def node_func():
             import paddle.distributed.fleet as fleet
             fleet.init(is_collective=True)
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
 
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
             strategy = paddle.distributed.fleet.DistributedStrategy()
             optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
 
             exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
@@ -213,48 +249,60 @@ def test_graph_execution_optimizer_v2(self):
         port_a = self._dist_ut_port_0 + 6
         port_b = self._dist_ut_port_1 + 6
         node_a = {
-            "PADDLE_TRAINER_ID": "0",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_a),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "0",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_a),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         node_b = {
-            "PADDLE_TRAINER_ID": "1",
-            "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:{}".format(port_b),
-            "PADDLE_TRAINERS_NUM": "2",
+            "PADDLE_TRAINER_ID":
+            "1",
+            "PADDLE_CURRENT_ENDPOINT":
+            "127.0.0.1:{}".format(port_b),
+            "PADDLE_TRAINERS_NUM":
+            "2",
             "PADDLE_TRAINER_ENDPOINTS":
             "127.0.0.1:{},127.0.0.1:{}".format(port_a, port_b),
-            "http_proxy": "",
-            "https_proxy": ""
+            "http_proxy":
+            "",
+            "https_proxy":
+            ""
         }
 
         def node_func():
             import paddle.distributed.fleet as fleet
             fleet.init(is_collective=True)
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
 
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
             strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
             strategy.sync_nccl_allreduce = True
             optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
             exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
             exe.run(paddle.fluid.default_startup_program())
@@ -264,8 +312,7 @@ def node_func():
             def gen_data():
                 return {
                     "x": np.random.random(size=(128, 32)).astype('float32'),
-                    "y": np.random.randint(
-                        2, size=(128, 1)).astype('int64')
+                    "y": np.random.randint(2, size=(128, 1)).astype('int64')
                 }
 
             for i in range(10):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
index 628f1db80d2d4..2afe4af3645f2 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py
@@ -21,6 +21,7 @@
 
 
 class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
+
     def test_graph_execution_optimizer(self):
         node_a = {
             "PADDLE_TRAINER_ID": "0",
@@ -43,26 +44,28 @@ def test_graph_execution_optimizer(self):
         def node_func():
             role = role_maker.PaddleCloudRoleMaker(is_collective=True)
             fleet.init(role)
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
 
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
             strategy = paddle.distributed.fleet.DistributedStrategy()
             strategy.nccl_comm_num = 2
             strategy.sync_nccl_allreduce = True
             optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-            optimizer = fleet.distributed_optimizer(
-                optimizer, strategy=strategy)
+            optimizer = fleet.distributed_optimizer(optimizer,
+                                                    strategy=strategy)
             optimizer.minimize(avg_cost)
             exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace())
             exe.run(paddle.fluid.default_startup_program())
@@ -72,8 +75,7 @@ def node_func():
             def gen_data():
                 return {
                     "x": np.random.random(size=(128, 32)).astype('float32'),
-                    "y": np.random.randint(
-                        2, size=(128, 1)).astype('int64')
+                    "y": np.random.randint(2, size=(128, 1)).astype('int64')
                 }
 
             for i in range(5):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py
index 35b74eac4b075..928ea06a611d4 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py
@@ -24,6 +24,7 @@
 
 
 class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "3"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = \
@@ -193,8 +194,11 @@ def test_opt_sharding_with_pp_amp_gclip(self):
         strategy.fuse_grad_size_in_MB = 32
         clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
 
-        self.optimizer(
-            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       grad_clip=clip)
         train_prog = train_prog._pipeline_opt['section_program']
         startup_prog = startup_prog._pipeline_opt['startup_program']
         self.debug_program(train_prog, startup_prog)
@@ -267,8 +271,11 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self):
         strategy.fuse_grad_merge = True
         clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
 
-        self.optimizer(
-            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       grad_clip=clip)
         train_prog = train_prog._pipeline_opt['section_program']
         startup_prog = startup_prog._pipeline_opt['startup_program']
         self.debug_program(train_prog, startup_prog)
@@ -325,7 +332,9 @@ def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self):
 
         self.set_strategy(strategy, 'pipeline')
         self.set_strategy(strategy, 'amp')
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.recompute = True
         strategy.recompute_configs = {
             "checkpoints":
@@ -397,6 +406,7 @@ def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self):
 
 
 class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "3"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = \
@@ -430,8 +440,11 @@ def test_opt_sharding_with_pp_amp_gclip_boundary(self):
         strategy.fuse_grad_size_in_MB = 32
         clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
 
-        self.optimizer(
-            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       grad_clip=clip)
         train_prog = train_prog._pipeline_opt['section_program']
         startup_prog = startup_prog._pipeline_opt['startup_program']
         self.debug_program(train_prog, startup_prog)
@@ -491,8 +504,11 @@ def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self):
         strategy.fuse_grad_size_in_MB = 32
         clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
 
-        self.optimizer(
-            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       grad_clip=clip)
         train_prog = train_prog._pipeline_opt['section_program']
         startup_prog = startup_prog._pipeline_opt['startup_program']
         self.debug_program(train_prog, startup_prog)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
index 022e0b99ce871..f6f3f50be0dee 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
@@ -23,6 +23,7 @@
 
 
 class TestFleetLambMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
@@ -31,10 +32,12 @@ def setUp(self):
     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
             with fluid.unique_name.guard():
-                input_x = paddle.fluid.layers.data(
-                    name="x", shape=[32], dtype='float32')
-                input_y = paddle.fluid.layers.data(
-                    name="y", shape=[1], dtype='int64')
+                input_x = paddle.fluid.layers.data(name="x",
+                                                   shape=[32],
+                                                   dtype='float32')
+                input_y = paddle.fluid.layers.data(name="y",
+                                                   shape=[1],
+                                                   dtype='int64')
 
                 fc_1 = paddle.fluid.layers.fc(input=input_x,
                                               size=64,
@@ -43,8 +46,8 @@ def net(self, main_prog, startup_prog):
                 prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                     size=2,
                                                     act='softmax')
-                cost = paddle.fluid.layers.cross_entropy(
-                    input=prediction, label=input_y)
+                cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                         label=input_y)
                 avg_cost = paddle.fluid.layers.mean(x=cost)
 
                 strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -75,8 +78,8 @@ def test_lamb_not_apply_with_momentum(self):
         startup_prog = fluid.Program()
         train_prog = fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.1, momentum=0.9)
+        optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.1,
+                                                    momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -107,15 +110,16 @@ def test_lamb_exclude_fn(self):
     def test_lamb_apply_with_amp(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
index bee6acf732460..b4f0c93d09ccc 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
@@ -23,6 +23,7 @@
 
 
 class TestFleetLarsMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
@@ -31,10 +32,12 @@ def setUp(self):
     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
             with fluid.unique_name.guard():
-                input_x = paddle.fluid.layers.data(
-                    name="x", shape=[32], dtype='float32')
-                input_y = paddle.fluid.layers.data(
-                    name="y", shape=[1], dtype='int64')
+                input_x = paddle.fluid.layers.data(name="x",
+                                                   shape=[32],
+                                                   dtype='float32')
+                input_y = paddle.fluid.layers.data(name="y",
+                                                   shape=[1],
+                                                   dtype='int64')
 
                 fc_1 = paddle.fluid.layers.fc(input=input_x,
                                               size=64,
@@ -43,8 +46,8 @@ def net(self, main_prog, startup_prog):
                 prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                     size=2,
                                                     act='softmax')
-                cost = paddle.fluid.layers.cross_entropy(
-                    input=prediction, label=input_y)
+                cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                         label=input_y)
                 avg_cost = paddle.fluid.layers.mean(x=cost)
 
                 strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -64,8 +67,8 @@ def test_lars_optimizer(self):
         startup_prog = fluid.Program()
         train_prog = fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9)
+        optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01,
+                                                    momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
@@ -91,8 +94,8 @@ def test_lars_exclude_fn(self):
         startup_prog = fluid.Program()
         train_prog = fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9)
+        optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01,
+                                                    momentum=0.9)
 
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
@@ -108,15 +111,16 @@ def test_lars_exclude_fn(self):
     def test_lars_apply_with_amp(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -139,8 +143,8 @@ def test_lars_apply_with_amp(self):
             "exclude_from_weight_decay": ["batch_norm", ".b"],
         }
 
-        optimizer = paddle.fluid.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9)
+        optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01,
+                                                    momentum=0.9)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
index bafb2419123b0..ac7b203d5ee4f 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
@@ -26,6 +26,7 @@
 
 
 class TestFleetLocalSGDMetaOptimizer(TestFleetMetaOptimizer):
+
     def test_localsgd_optimizer(self):
         train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
@@ -63,6 +64,7 @@ def test_localsgd_amp_optimizer(self):
 
 
 class TestFleetAdaptiveLocalSGDMetaOptimizer(TestFleetMetaOptimizer):
+
     def test_adaptive_localsgd_optimizer(self):
         train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
index dfea848aadfc4..f39f916dbbe64 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py
@@ -22,15 +22,18 @@
 
 
 class TestFleetMetaOptimizerBase(unittest.TestCase):
+
     def net(main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
             with fluid.unique_name.guard():
                 role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                 fleet.init(role)
-                input_x = paddle.fluid.layers.data(
-                    name="x", shape=[32], dtype='float32')
-                input_y = paddle.fluid.layers.data(
-                    name="y", shape=[1], dtype='int64')
+                input_x = paddle.fluid.layers.data(name="x",
+                                                   shape=[32],
+                                                   dtype='float32')
+                input_y = paddle.fluid.layers.data(name="y",
+                                                   shape=[1],
+                                                   dtype='int64')
 
                 fc_1 = paddle.fluid.layers.fc(input=input_x,
                                               size=64,
@@ -39,8 +42,8 @@ def net(main_prog, startup_prog):
                 prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                     size=2,
                                                     act='softmax')
-                cost = paddle.fluid.layers.cross_entropy(
-                    input=prediction, label=input_y)
+                cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                         label=input_y)
                 avg_cost = paddle.fluid.layers.mean(x=cost)
 
                 optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_metric.py b/python/paddle/fluid/tests/unittests/test_fleet_metric.py
index 5dce59ac23d92..ae2313518886e 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_metric.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_metric.py
@@ -34,6 +34,7 @@ def setUp(self):
         """Set up, set envs."""
 
         class FakeUtil(UtilBase):
+
             def __init__(self, fake_fleet):
                 super(FakeUtil, self).__init__()
                 self.fleet = fake_fleet
@@ -80,18 +81,16 @@ def test_metric_1(self):
         train = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(train, startup):
-            t = fluid.layers.create_global_var(
-                shape=[1, 1],
-                value=1,
-                dtype='int64',
-                persistable=True,
-                force_cpu=True)
-            t1 = fluid.layers.create_global_var(
-                shape=[1, 1],
-                value=1,
-                dtype='int64',
-                persistable=True,
-                force_cpu=True)
+            t = fluid.layers.create_global_var(shape=[1, 1],
+                                               value=1,
+                                               dtype='int64',
+                                               persistable=True,
+                                               force_cpu=True)
+            t1 = fluid.layers.create_global_var(shape=[1, 1],
+                                                value=1,
+                                                dtype='int64',
+                                                persistable=True,
+                                                force_cpu=True)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         scope = fluid.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
index e6138296a6c33..2dccf8bca8273 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_nocvm_1.py
@@ -64,13 +64,13 @@ def test_pslib_1(self):
             cost = fluid.layers.log_loss(fc, label_cast)
         try:
             adam = fluid.optimizer.Adam(learning_rate=0.000005)
-            adam = fleet.distributed_optimizer(
-                adam,
-                strategy={
-                    "embedding": {
-                        "sparse_accessor_class": "DownpourCtrAccessor"
-                    }
-                })
+            adam = fleet.distributed_optimizer(adam,
+                                               strategy={
+                                                   "embedding": {
+                                                       "sparse_accessor_class":
+                                                       "DownpourCtrAccessor"
+                                                   }
+                                               })
             adam.minimize([cost], [scope])
             fleet.run_server()
         except:
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
index 263c578a57127..d9bc0c7a5f39c 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py
@@ -24,6 +24,7 @@
 
 
 class TestFleetMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
@@ -31,12 +32,15 @@ def setUp(self):
 
     def net(self):
         with static.device_guard("gpu:0"):
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
-            input_z = paddle.fluid.layers.data(
-                name="z", shape=[1], dtype="float32")
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
+            input_z = paddle.fluid.layers.data(name="z",
+                                               shape=[1],
+                                               dtype="float32")
             with static.device_guard("gpu:all"):
                 input_z = input_z * 1.0
                 input_z.stop_gradient = True
@@ -51,8 +55,8 @@ def net(self):
             prediction = paddle.fluid.layers.fc(input=[fc_2],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
         return avg_cost
 
@@ -73,8 +77,8 @@ def test_pipeline_optimizer(self):
                 avg_cost = self.net()
 
                 optimizer = paddle.fluid.optimizer.Adam(0.01)
-                optimizer = fleet.distributed_optimizer(
-                    optimizer, strategy=strategy)
+                optimizer = fleet.distributed_optimizer(optimizer,
+                                                        strategy=strategy)
                 optimizer.minimize(avg_cost)
 
     def test_pipeline_amp_optimizer(self):
@@ -96,8 +100,8 @@ def test_pipeline_amp_optimizer(self):
                 avg_cost = self.net()
 
                 optimizer = paddle.fluid.optimizer.Adam(0.01)
-                optimizer = fleet.distributed_optimizer(
-                    optimizer, strategy=strategy)
+                optimizer = fleet.distributed_optimizer(optimizer,
+                                                        strategy=strategy)
                 optimizer.minimize(avg_cost)
 
         ops = train_prog._pipeline_opt['section_program'].global_block().ops
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py
index f67b26e0aef65..5c086a5994f0b 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py
@@ -20,6 +20,7 @@
 
 
 class TestFleetMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
@@ -31,10 +32,12 @@ def test_pipeline_optimizer(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         with paddle.fluid.device_guard("gpu:0"):
-            input_x = paddle.fluid.layers.data(
-                name="x", shape=[32], dtype='float32')
-            input_y = paddle.fluid.layers.data(
-                name="y", shape=[1], dtype='int64')
+            input_x = paddle.fluid.layers.data(name="x",
+                                               shape=[32],
+                                               dtype='float32')
+            input_y = paddle.fluid.layers.data(name="y",
+                                               shape=[1],
+                                               dtype='int64')
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             fc_3 = paddle.fluid.layers.fc(input=fc_2, size=64, act='tanh')
@@ -47,8 +50,8 @@ def test_pipeline_optimizer(self):
             prediction = paddle.fluid.layers.fc(input=[fc_7],
                                                 size=2,
                                                 act='softmax')
-            cost = paddle.fluid.layers.cross_entropy(
-                input=prediction, label=input_y)
+            cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                     label=input_y)
             avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
index beec6d7f51c4f..063cda8aa9de4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py
@@ -20,7 +20,9 @@
 
 
 class TestFleetPrivateFunction(unittest.TestCase):
+
     def test_wait_port(self):
+
         def init_server(port):
             import time
             time.sleep(5)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_fleet_ps.py
index 04d1616399a26..5ad8785956062 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_ps.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_ps.py
@@ -17,10 +17,12 @@
 import unittest
 from paddle.fluid.framework import default_main_program
 from paddle.fluid.incubate.fleet.parameter_server.ir.pserver_pass import _get_optimizer_input_shape
+
 main_program = default_main_program()
 
 
 class TestFleetPS(unittest.TestCase):
+
     def test_version(self):
         from paddle.fluid.incubate.fleet.parameter_server import version
         transpiler = version.is_transpiler()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py b/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py
index 91e9cddd2a8dc..d22fc3a1b8c43 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py
@@ -20,6 +20,7 @@
 
 
 class TestPyramidHashOpApi(unittest.TestCase):
+
     def test_dist_geo_server_transpiler(self):
         num_voc = 128
         embed_dim = 64
@@ -40,10 +41,12 @@ def test_dist_geo_server_transpiler(self):
             lr=0.002,
             param_attr=fluid.ParamAttr(
                 name="PyramidHash_emb_0",
-                learning_rate=0, ),
+                learning_rate=0,
+            ),
             param_attr_wl=fluid.ParamAttr(
                 name="Filter",
-                learning_rate=0, ),
+                learning_rate=0,
+            ),
             param_attr_bl=None,
             distribute_update_vars=["PyramidHash_emb_0"],
             name=None)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py
index 604109b262d6c..05c3391565ea2 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py
@@ -20,6 +20,7 @@
 
 
 class TestFleetMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
@@ -30,15 +31,16 @@ def test_pipeline_optimizer(self):
         import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
+        input_x = paddle.fluid.layers.data(name="x",
+                                           shape=[32],
+                                           dtype='float32')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
 
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.fluid.layers.cross_entropy(input=prediction,
+                                                 label=input_y)
         avg_cost = paddle.fluid.layers.mean(x=cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
index 59a8fa48d94f7..230cad18361a8 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
@@ -23,14 +23,15 @@
 
 
 class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
+
     def test_recompute_optimizer_backward(self):
         """ test recompute optimizer backward """
         train_prog, startup_prog = fluid.Program(), fluid.Program()
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
         self.set_strategy(strategy, 'recompute')
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = RecomputeOptimizer(opt)
         opt.user_defined_strategy = strategy
         params_grads = opt.backward(avg_cost, startup_prog)
@@ -46,8 +47,8 @@ def test_recompute_optimizer_backward_gradients(self):
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
         self.set_strategy(strategy, 'recompute')
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = RecomputeOptimizer(opt)
         opt.user_defined_strategy = strategy
         params_grads = opt.backward(avg_cost, startup_prog)
@@ -65,8 +66,8 @@ def test_recompute_optimizer_backward_optimize(self):
         avg_cost, strategy = self.net(train_prog, startup_prog)
 
         self.set_strategy(strategy, 'recompute')
-        opt = fluid.optimizer.MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.001,
+                                                momentum=0.9)
         opt = RecomputeOptimizer(opt)
         opt.user_defined_strategy = strategy
         params_grads = opt.backward(avg_cost, startup_prog)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
index 396d705508b34..9636efdbfcbc3 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_3.py
@@ -43,10 +43,9 @@ def test_pslib_1(self):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
         os.environ["PADDLE_TRAINER_ID"] = "0"
-        role_maker = GeneralRoleMaker(
-            init_timeout_seconds=100,
-            run_timeout_seconds=100,
-            http_ip_port="127.0.0.1:36003")
+        role_maker = GeneralRoleMaker(init_timeout_seconds=100,
+                                      run_timeout_seconds=100,
+                                      http_ip_port="127.0.0.1:36003")
         #role_maker.generate_role()
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
index 86ee0db30ef8c..7fc68ec15636a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -104,8 +104,8 @@ def test_ps_rolemaker(self):
         os.environ["POD_IP"] = "127.0.0.1"
         os.environ["PADDLE_PORT"] = "36001"
 
-        ro = role_maker.PaddleCloudRoleMaker(
-            is_collective=False, init_gloo=False)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False,
+                                             init_gloo=False)
         self.assertEqual(ro._server_index(), 0)
         self.assertFalse(ro._is_worker())
         self.assertTrue(ro._is_server())
@@ -161,6 +161,7 @@ def test_tr_rolemaker(self):
 
 
 class TestGlooWithCloudRoleMaker(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINERS_NUM"] = "1"
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
@@ -443,8 +444,8 @@ def net():
             x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
             y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
             y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
-            cost = paddle.fluid.layers.square_error_cost(
-                input=y_predict, label=y)
+            cost = paddle.fluid.layers.square_error_cost(input=y_predict,
+                                                         label=y)
             avg_cost = paddle.fluid.layers.mean(cost)
             return avg_cost
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
index 80109716a54e5..19c407bf57f83 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py
@@ -18,6 +18,7 @@
 
 
 class TestFleetRuntime(unittest.TestCase):
+
     def test_fleet_runtime_base(self):
         import paddle.distributed.fleet.runtime
         base = paddle.distributed.fleet.runtime.runtime_base.RuntimeBase()
@@ -44,19 +45,18 @@ def test_fleet_ps_runtime(self):
         ps_runtime = paddle.distributed.fleet.runtime.ParameterServerRuntime()
         self.assertRaises(Exception, ps_runtime._get_optimizer_status,
                           "test_op", None)
-        reshaped_names, origin_names = ps_runtime._get_optimizer_status("adam",
-                                                                        "param")
+        reshaped_names, origin_names = ps_runtime._get_optimizer_status(
+            "adam", "param")
         self.assertTrue(
-            len(reshaped_names) == 2 and
-            reshaped_names[0] == 'param_moment1_0' and
-            reshaped_names[1] == 'param_moment2_0')
+            len(reshaped_names) == 2 and reshaped_names[0] == 'param_moment1_0'
+            and reshaped_names[1] == 'param_moment2_0')
         self.assertTrue(
-            len(origin_names) == 2 and
-            origin_names[0] == 'param_beta1_pow_acc_0' and
-            origin_names[1] == 'param_beta2_pow_acc_0')
+            len(origin_names) == 2
+            and origin_names[0] == 'param_beta1_pow_acc_0'
+            and origin_names[1] == 'param_beta2_pow_acc_0')
 
-        reshaped_names, origin_names = ps_runtime._get_optimizer_status("sgd",
-                                                                        "param")
+        reshaped_names, origin_names = ps_runtime._get_optimizer_status(
+            "sgd", "param")
         self.assertTrue(len(reshaped_names) == 0 and len(origin_names) == 0)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
index 28e03fdfd70e1..20eace7cce3c0 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
@@ -27,6 +27,7 @@
 
 
 class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
+
     def test_sharding_optimizer(self):
         train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
         )
@@ -244,12 +245,11 @@ def test_sharding_weight_decay(self):
         avg_cost, strategy = self.net(train_prog, startup_prog)
         self.set_strategy(strategy, 'sharding')
         regularization = paddle.fluid.regularizer.L2Decay(0.0001)
-        self.optimizer(
-            avg_cost,
-            strategy,
-            train_prog,
-            startup_prog,
-            regularization=regularization)
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       regularization=regularization)
         parameters = [
             x.name for x in train_prog.list_vars() if x.persistable == True
         ]
@@ -285,8 +285,11 @@ def test_sharding_gradient_clip(self):
         avg_cost, strategy = self.net(train_prog, startup_prog)
         self.set_strategy(strategy, 'sharding')
         clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
-        self.optimizer(
-            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       grad_clip=clip)
         parameters = [
             x.name for x in train_prog.list_vars() if x.persistable == True
         ]
@@ -340,6 +343,7 @@ def test_sharding_clone_for_test(self):
 
 
 class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "3"
         os.environ[
@@ -382,8 +386,8 @@ def test_sharding_with_mp(self):
         # check correctness of MP group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 sharding_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003'])
@@ -391,8 +395,8 @@ def test_sharding_with_mp(self):
         # check correctness of sharding group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_1":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_1":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -429,8 +433,8 @@ def test_sharding_hybrid_dp(self):
         # check correctness of sharding group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 sharding_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003'])
@@ -438,8 +442,8 @@ def test_sharding_hybrid_dp(self):
         # check correctness of dp group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_1":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_1":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
 
@@ -501,8 +505,8 @@ def test_sharding_hybrid_dp_gm(self):
         # check correctness of sharding group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 sharding_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003'])
@@ -510,8 +514,8 @@ def test_sharding_hybrid_dp_gm(self):
         # check correctness of dp group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_1":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_1":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
 
@@ -618,8 +622,8 @@ def test_sharding_with_pp(self):
         # check correctness of pp group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 sharding_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003'])
@@ -627,8 +631,8 @@ def test_sharding_with_pp(self):
         # check correctness of sharding group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_1":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_1":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -682,8 +686,11 @@ def test_hybrid_with_mp_pp_amp_gclip(self):
             "accumulate_steps": 4,
         }
         clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
-        self.optimizer(
-            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip)
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       grad_clip=clip)
         train_prog = train_prog._pipeline_opt['section_program']
         startup_prog = startup_prog._pipeline_opt['startup_program']
 
@@ -757,8 +764,8 @@ def test_hybrid_with_mp_pp_amp_gclip(self):
         # check correctness of pp group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 mp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(mp_group_waiting_ports, ['127.0.0.1:36003'])
@@ -766,8 +773,8 @@ def test_hybrid_with_mp_pp_amp_gclip(self):
         # check correctness of sharding group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_1":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_1":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -791,13 +798,12 @@ def test_hybrid_with_mp_pp_amp_gclip_for_optimizer(self):
             "accumulate_steps": 4,
         }
         clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
-        self.optimizer(
-            avg_cost,
-            strategy,
-            train_prog,
-            startup_prog,
-            grad_clip=clip,
-            name="adamw")
+        self.optimizer(avg_cost,
+                       strategy,
+                       train_prog,
+                       startup_prog,
+                       grad_clip=clip,
+                       name="adamw")
         train_prog = train_prog._pipeline_opt['section_program']
         startup_prog = startup_prog._pipeline_opt['startup_program']
 
@@ -876,8 +882,8 @@ def test_hybrid_with_mp_pp_amp_gclip_for_optimizer(self):
         # check correctness of pp group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 mp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(mp_group_waiting_ports, ['127.0.0.1:36003'])
@@ -885,8 +891,8 @@ def test_hybrid_with_mp_pp_amp_gclip_for_optimizer(self):
         # check correctness of sharding group
         sharding_group_waiting_port = None
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_1":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_1":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -896,7 +902,9 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self):
         )
         avg_cost, strategy = self.pp_net(train_prog, startup_prog)
         strategy.amp = True
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.sharding = True
         strategy.sharding_configs = {
             "sharding_degree": 1,
@@ -970,16 +978,16 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self):
 
         # check correctness of pp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003'])
 
         # check correctness of dp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_3":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_3":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -989,7 +997,9 @@ def test_hybrid_with_sharding_pp_amp_fp16allreduce_in_optimize(self):
         )
         avg_cost, strategy = self.pp_net(train_prog, startup_prog)
         strategy.amp = True
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.sharding = True
         strategy.sharding_configs = {
             "segment_broadcast_MB": 0.1,
@@ -1043,16 +1053,16 @@ def test_hybrid_with_sharding_pp_amp_fp16allreduce_in_optimize(self):
 
         # check correctness of sharding group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 sharding_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003'])
 
         # check correctness of pp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_1":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_1":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -1062,7 +1072,9 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self):
         )
         avg_cost, strategy = self.pp_net(train_prog, startup_prog)
         strategy.amp = True
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.sharding = True
         strategy.sharding_configs = {
             "sharding_degree": 1,
@@ -1139,16 +1151,16 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self):
 
         # check correctness of pp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003'])
 
         # check correctness of dp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_3":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_3":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -1158,7 +1170,9 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self):
         )
         avg_cost, strategy = self.pp_net(train_prog, startup_prog)
         strategy.amp = True
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.sharding = True
         strategy.sharding_configs = {
             "sharding_degree": 1,
@@ -1238,16 +1252,16 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self):
 
         # check correctness of pp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003'])
 
         # check correctness of dp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_3":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_3":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -1258,7 +1272,9 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse(
         )
         avg_cost, strategy = self.pp_net(train_prog, startup_prog)
         strategy.amp = True
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.sharding = True
         strategy.sharding_configs = {
             "sharding_degree": 1,
@@ -1334,16 +1350,16 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse(
 
         # check correctness of pp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003'])
 
         # check correctness of dp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_3":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_3":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -1353,7 +1369,9 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self):
         )
         avg_cost, strategy = self.pp_net(train_prog, startup_prog)
         strategy.amp = True
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.sharding = True
         strategy.sharding_configs = {
             "sharding_degree": 1,
@@ -1425,16 +1443,16 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self):
 
         # check correctness of pp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_0":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_0":
                 pp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003'])
 
         # check correctness of dp group
         for op in startup_prog_ops:
-            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
-                    0] == "comm_id_3":
+            if op.type == "c_gen_nccl_id" and op.desc.output_arg_names(
+            )[0] == "comm_id_3":
                 dp_group_waiting_ports = op.desc.attr("other_endpoints")
 
         self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
@@ -1444,7 +1462,9 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse_and_avg_after_sum(self):
         )
         avg_cost, strategy = self.pp_net(train_prog, startup_prog)
         strategy.amp = True
-        strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
+        strategy.amp_configs = {
+            'custom_black_varnames': ['fc_6.b_0'],
+        }
         strategy.sharding = True
         strategy.sharding_configs = {
             "sharding_degree": 1,
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_static_mp_layers.py b/python/paddle/fluid/tests/unittests/test_fleet_static_mp_layers.py
index ed64c7421d0c1..10d80bc434b38 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_static_mp_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_static_mp_layers.py
@@ -30,6 +30,7 @@
 
 
 class ColumnLinearNet(fluid.dygraph.Layer):
+
     def __init__(self, input_size, output_size):
         super(ColumnLinearNet, self).__init__()
         self.parallel_linear = fleet.meta_parallel.ColumnParallelLinear(
@@ -46,6 +47,7 @@ def forward(self, x):
 
 
 class RowLinearNet(fluid.dygraph.Layer):
+
     def __init__(self, input_size, output_size):
         super(RowLinearNet, self).__init__()
         self.parallel_linear = fleet.meta_parallel.RowParallelLinear(
@@ -61,10 +63,11 @@ def forward(self, x):
 
 
 class EmbeddingNet(fluid.dygraph.Layer):
+
     def __init__(self, vocab_size, hidden_size):
         super(EmbeddingNet, self).__init__()
-        self.embedding = fleet.meta_parallel.VocabParallelEmbedding(vocab_size,
-                                                                    hidden_size)
+        self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
+            vocab_size, hidden_size)
 
     def forward(self, x):
         output = self.embedding(x)
@@ -72,6 +75,7 @@ def forward(self, x):
 
 
 class TestDistTraning(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "2"
         os.environ[
@@ -104,13 +108,13 @@ def test_column_parallel_layer(self):
             ops = main_program.global_block().ops
             ops = [op.type for op in ops]
             self.assertEqual(
-                ops,
-                ['c_identity', 'matmul_v2', 'elementwise_add', 'c_concat'])
+                ops, ['c_identity', 'matmul_v2', 'elementwise_add', 'c_concat'])
 
             weight = model_a.parallel_linear.weight
             bias = model_a.parallel_linear.bias
-            self.assertEqual(weight.shape, (input_size, output_size //
-                                            self.model_parallel_size))
+            self.assertEqual(
+                weight.shape,
+                (input_size, output_size // self.model_parallel_size))
             self.assertEqual(bias.shape,
                              (output_size // self.model_parallel_size, ))
 
@@ -132,8 +136,9 @@ def test_row_parallel_layer(self):
 
             weight = model_a.parallel_linear.weight
             bias = model_a.parallel_linear.bias
-            self.assertEqual(weight.shape, (
-                input_size // self.model_parallel_size, output_size))
+            self.assertEqual(
+                weight.shape,
+                (input_size // self.model_parallel_size, output_size))
             self.assertEqual(bias.shape, (output_size, ))
 
     def test_parallel_embedding(self):
@@ -145,8 +150,9 @@ def test_parallel_embedding(self):
             # model_a
             model_a = EmbeddingNet(vocab_size, hidden_size)
 
-            x = paddle.static.data(
-                name='x', shape=[None, seq_len], dtype='int64')
+            x = paddle.static.data(name='x',
+                                   shape=[None, seq_len],
+                                   dtype='int64')
             y = model_a(x)
 
             #print(main_program)
@@ -155,8 +161,9 @@ def test_parallel_embedding(self):
             self.assertEqual(ops, ['c_embedding', 'c_allreduce_sum'])
 
             weight = model_a.embedding.weight
-            self.assertEqual(weight.shape, (
-                vocab_size // self.model_parallel_size, hidden_size))
+            self.assertEqual(
+                weight.shape,
+                (vocab_size // self.model_parallel_size, hidden_size))
 
     def test_parallel_cross_entropy(self):
         main_program, startup_program = self.get_program()
@@ -171,8 +178,9 @@ def test_parallel_cross_entropy(self):
 
             x = paddle.static.data(
                 name='x', shape=[batch_size, seq_length, class_size_per_card])
-            label = paddle.static.data(
-                name='label', shape=[batch_size, seq_length], dtype='int64')
+            label = paddle.static.data(name='label',
+                                       shape=[batch_size, seq_length],
+                                       dtype='int64')
             loss_a = model_a(x, label)
 
             #print(main_program)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index a3a526aaa6158..91d9e062b9a14 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -64,6 +64,7 @@ def test_set_user_defined_util(self):
         import paddle.distributed.fleet as fleet
 
         class UserDefinedUtil(fleet.UtilBase):
+
             def __init__(self):
                 super(UserDefinedUtil, self).__init__()
 
@@ -166,8 +167,7 @@ class config:
         results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(
-            results[0], np.array(
-                [[3.0590223e-07]], dtype=np.float32))
+            results[0], np.array([[3.0590223e-07]], dtype=np.float32))
 
         # test feed_var's shape
         conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
@@ -178,8 +178,7 @@ class config:
         results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(
-            results[0], np.array(
-                [[3.0590223e-07]], dtype=np.float32))
+            results[0], np.array([[3.0590223e-07]], dtype=np.float32))
 
         # test correct case without feed_vars_filelist
         conf.feed_config.feeded_vars_filelist = None
@@ -215,8 +214,8 @@ class config:
 
         # test match
         conf.pruned_prog_path = os.path.join(
-            data_dir,
-            os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
+            data_dir, os.path.join(self.pruned_dir,
+                                   "pruned_main_program.pbtxt"))
         if sys.platform == 'win32' or sys.platform == 'sys.platform':
             conf.draw = False
         else:
@@ -232,8 +231,8 @@ def test_visualize(self):
         else:
             data_dir = self.download_files()
             program_path = os.path.join(
-                data_dir,
-                os.path.join(self.train_dir, "join_main_program.pbtxt"))
+                data_dir, os.path.join(self.train_dir,
+                                       "join_main_program.pbtxt"))
             is_text = True
             program = fleet.util._load_program(program_path, is_text)
             output_dir = os.path.join(data_dir, self.train_dir)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_utils.py
index 09de4867ef9f4..be3376a1d9afd 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_utils.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_utils.py
@@ -67,8 +67,8 @@ def test_program_type_trans(self):
     def test_parse_program_proto(self):
         data_dir = self.download_files()
         parse_program_file_path = os.path.join(
-            data_dir,
-            os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
+            data_dir, os.path.join(self.pruned_dir,
+                                   "pruned_main_program.pbtxt"))
         is_text_parse_program = True
         parse_output_dir = os.path.join(data_dir, self.pruned_dir)
         fleet_util = FleetUtil()
@@ -119,8 +119,7 @@ class config:
         results = fleet_util.check_vars_and_dump(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(
-            results[0], np.array(
-                [[3.0590223e-07]], dtype=np.float32))
+            results[0], np.array([[3.0590223e-07]], dtype=np.float32))
 
         # test feed_var's shape
         conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
@@ -131,8 +130,7 @@ class config:
         results = fleet_util.check_vars_and_dump(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(
-            results[0], np.array(
-                [[3.0590223e-07]], dtype=np.float32))
+            results[0], np.array([[3.0590223e-07]], dtype=np.float32))
 
         # test correct case without feed_vars_filelist
         conf.feed_config.feeded_vars_filelist = None
@@ -168,8 +166,8 @@ class config:
 
         # test match
         conf.pruned_prog_path = os.path.join(
-            data_dir,
-            os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
+            data_dir, os.path.join(self.pruned_dir,
+                                   "pruned_main_program.pbtxt"))
         if sys.platform == 'win32' or sys.platform == 'sys.platform':
             conf.draw = False
         else:
@@ -184,8 +182,8 @@ def test_draw_program(self):
         else:
             data_dir = self.download_files()
             program_path = os.path.join(
-                data_dir,
-                os.path.join(self.train_dir, "join_main_program.pbtxt"))
+                data_dir, os.path.join(self.train_dir,
+                                       "join_main_program.pbtxt"))
             is_text = True
             program = utils.load_program(program_path, is_text)
             output_dir = os.path.join(data_dir, self.train_dir)
diff --git a/python/paddle/fluid/tests/unittests/test_flip.py b/python/paddle/fluid/tests/unittests/test_flip.py
index 010d23bca51d7..a933595be8783 100644
--- a/python/paddle/fluid/tests/unittests/test_flip.py
+++ b/python/paddle/fluid/tests/unittests/test_flip.py
@@ -46,9 +46,8 @@ def test_static_graph(self):
                           fetch_list=[output])
             out_np = np.array(res[0])
             out_ref = np.array([[3, 2, 1], [6, 5, 4]]).astype(np.float32)
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='flip output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='flip output is wrong, out =' + str(out_np))
 
     def test_dygraph(self):
         img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
@@ -65,6 +64,7 @@ def test_dygraph(self):
 
 
 class TestFlipOp(OpTest):
+
     def setUp(self):
         self.op_type = 'flip'
         self.python_api = paddle.tensor.flip
@@ -96,36 +96,42 @@ def calc_ref_res(self):
 
 
 class TestFlipOpAxis1(TestFlipOp):
+
     def init_test_case(self):
         self.in_shape = (2, 4, 4)
         self.axis = [0]
 
 
 class TestFlipOpAxis2(TestFlipOp):
+
     def init_test_case(self):
         self.in_shape = (4, 4, 6, 3)
         self.axis = [0, 2]
 
 
 class TestFlipOpAxis3(TestFlipOp):
+
     def init_test_case(self):
         self.in_shape = (4, 3, 1)
         self.axis = [0, 1, 2]
 
 
 class TestFlipOpAxis4(TestFlipOp):
+
     def init_test_case(self):
         self.in_shape = (6, 4, 2, 2)
         self.axis = [0, 1, 2, 3]
 
 
 class TestFlipOpEmptyAxis(TestFlipOp):
+
     def init_test_case(self):
         self.in_shape = (6, 4, 2, 2)
         self.axis = []
 
 
 class TestFlipOpNegAxis(TestFlipOp):
+
     def init_test_case(self):
         self.in_shape = (6, 4, 2, 2)
         self.axis = [-1]
diff --git a/python/paddle/fluid/tests/unittests/test_fmax_op.py b/python/paddle/fluid/tests/unittests/test_fmax_op.py
index 608d97b68ac22..359b98c4b492d 100644
--- a/python/paddle/fluid/tests/unittests/test_fmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fmax_op.py
@@ -52,8 +52,10 @@ def test_static_api(self):
             data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
             result_fmax = paddle.fmax(data_x, data_y)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "y": self.input_y},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "y": self.input_y
+            },
                            fetch_list=[result_fmax])
         self.assertTrue(np.allclose(res, self.np_expected1))
 
@@ -63,8 +65,10 @@ def test_static_api(self):
             data_z = paddle.static.data("z", shape=[15], dtype="float32")
             result_fmax = paddle.fmax(data_x, data_z)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "z": self.input_z},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "z": self.input_z
+            },
                            fetch_list=[result_fmax])
         self.assertTrue(np.allclose(res, self.np_expected2))
 
@@ -74,8 +78,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_fmax = paddle.fmax(data_a, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"a": self.input_a,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "a": self.input_a,
+                "c": self.input_c
+            },
                            fetch_list=[result_fmax])
         self.assertTrue(np.allclose(res, self.np_expected3))
 
@@ -85,8 +91,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_fmax = paddle.fmax(data_b, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"b": self.input_b,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "b": self.input_b,
+                "c": self.input_c
+            },
                            fetch_list=[result_fmax])
         self.assertTrue(np.allclose(res, self.np_expected4))
 
@@ -145,21 +153,19 @@ def test_check_grad_normal(self):
 
     def test_check_grad_ingore_x(self):
         """test_check_grad_ingore_x"""
-        self.check_grad(
-            ['Y'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set("X"),
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set("X"),
+                        check_eager=True)
 
     def test_check_grad_ingore_y(self):
         """test_check_grad_ingore_y"""
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set('Y'),
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set('Y'),
+                        check_eager=True)
 
 
 class TestElementwiseFmax2Op(OpTest):
@@ -190,18 +196,16 @@ def test_check_grad_normal(self):
 
     def test_check_grad_ingore_x(self):
         """test_check_grad_ingore_x"""
-        self.check_grad(
-            ['Y'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set("X"),
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set("X"),
+                        check_eager=True)
 
     def test_check_grad_ingore_y(self):
         """test_check_grad_ingore_y"""
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set('Y'),
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set('Y'),
+                        check_eager=True)
diff --git a/python/paddle/fluid/tests/unittests/test_fmin_op.py b/python/paddle/fluid/tests/unittests/test_fmin_op.py
index b9d26827988cd..88542ba936563 100644
--- a/python/paddle/fluid/tests/unittests/test_fmin_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fmin_op.py
@@ -54,8 +54,10 @@ def test_static_api(self):
             data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
             result_fmin = paddle.fmin(data_x, data_y)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "y": self.input_y},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "y": self.input_y
+            },
                            fetch_list=[result_fmin])
         self.assertTrue(np.allclose(res, self.np_expected1))
 
@@ -65,8 +67,10 @@ def test_static_api(self):
             data_z = paddle.static.data("z", shape=[15], dtype="float32")
             result_fmin = paddle.fmin(data_x, data_z)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "z": self.input_z},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "z": self.input_z
+            },
                            fetch_list=[result_fmin])
         self.assertTrue(np.allclose(res, self.np_expected2))
 
@@ -76,8 +80,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_fmin = paddle.fmin(data_a, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"a": self.input_a,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "a": self.input_a,
+                "c": self.input_c
+            },
                            fetch_list=[result_fmin])
         self.assertTrue(np.allclose(res, self.np_expected3))
 
@@ -87,8 +93,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_fmin = paddle.fmin(data_b, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"b": self.input_b,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "b": self.input_b,
+                "c": self.input_c
+            },
                            fetch_list=[result_fmin])
         self.assertTrue(np.allclose(res, self.np_expected4))
 
@@ -147,21 +155,19 @@ def test_check_grad_normal(self):
 
     def test_check_grad_ingore_x(self):
         """test_check_grad_ingore_x"""
-        self.check_grad(
-            ['Y'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set("X"),
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set("X"),
+                        check_eager=True)
 
     def test_check_grad_ingore_y(self):
         """test_check_grad_ingore_y"""
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set('Y'),
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set('Y'),
+                        check_eager=True)
 
 
 class TestElementwiseFmin2Op(OpTest):
@@ -192,21 +198,19 @@ def test_check_grad_normal(self):
 
     def test_check_grad_ingore_x(self):
         """test_check_grad_ingore_x"""
-        self.check_grad(
-            ['Y'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set("X"),
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set("X"),
+                        check_eager=True)
 
     def test_check_grad_ingore_y(self):
         """test_check_grad_ingore_y"""
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.005,
-            no_grad_set=set('Y'),
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        no_grad_set=set('Y'),
+                        check_eager=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py
index 44b94cd3b66ee..a919cac6b7d94 100644
--- a/python/paddle/fluid/tests/unittests/test_fold_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fold_op.py
@@ -71,8 +71,8 @@ def calc_fold(self):
                                     w_offset * self.dilations[1])
                         if (h_out >= 0 and h_out < self.output_sizes[0]) and (
                                 w_out >= 0 and w_out < self.output_sizes[1]):
-                            output[b, c_out, h_out, w_out] += self.x[
-                                b, c, w + col_width * h]
+                            output[b, c_out, h_out,
+                                   w_out] += self.x[b, c, w + col_width * h]
 
         self.outputs = output
 
@@ -125,6 +125,7 @@ def test_info(self):
 
 
 class TestFoldOpError(unittest.TestCase):
+
     def test_errors(self):
         from paddle.nn.functional import fold
         from paddle.fluid.framework import Program, program_guard
@@ -143,61 +144,59 @@ def test_kernel_shape():
             def test_padding_shape():
                 # padding_size must be 2 or 4
                 x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(
-                    x,
-                    output_sizes=[2, 3],
-                    kernel_sizes=[2, 2],
-                    paddings=[2, 2, 3])
+                out = fold(x,
+                           output_sizes=[2, 3],
+                           kernel_sizes=[2, 2],
+                           paddings=[2, 2, 3])
 
             def test_dilations_shape():
-                # dialtions_size must be 2 
+                # dilations_size must be 2
                 x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(
-                    x,
-                    output_sizes=[2, 3],
-                    kernel_sizes=[2, 2],
-                    dilations=[2, 2, 3])
+                out = fold(x,
+                           output_sizes=[2, 3],
+                           kernel_sizes=[2, 2],
+                           dilations=[2, 2, 3])
 
             def test_strides_shape():
                 # strids_size must be 2
                 x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(
-                    x,
-                    output_sizes=[2, 3],
-                    kernel_sizes=[2, 2],
-                    strides=[2, 2, 3])
+                out = fold(x,
+                           output_sizes=[2, 3],
+                           kernel_sizes=[2, 2],
+                           strides=[2, 2, 3])
 
             def test_output_size():
                 # im_h * im_w must be L
                 x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(
-                    x, output_sizes=[6, 6], kernel_sizes=[2, 2],
-                    strides=[1, 1])
+                out = fold(x,
+                           output_sizes=[6, 6],
+                           kernel_sizes=[2, 2],
+                           strides=[1, 1])
 
             def test_output_size_2():
                 # out_size must GT 1
                 x = paddle.randn(shape=[2, 6, 6], dtype="float32")
-                out = fold(
-                    x,
-                    output_sizes=[0.1, 0.2],
-                    kernel_sizes=[2, 2],
-                    strides=[1, 1])
+                out = fold(x,
+                           output_sizes=[0.1, 0.2],
+                           kernel_sizes=[2, 2],
+                           strides=[1, 1])
 
             def test_block_h_w():
                 # test_block_h_w GT 0
                 x = paddle.randn(shape=[2, 1, 1], dtype="float32")
-                out = fold(
-                    x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1)
+                out = fold(x,
+                           output_sizes=[1, 1],
+                           kernel_sizes=[2, 2],
+                           strides=1)
 
             def test_GT_0():
                 x = paddle.randn(shape=[2, 1, 1], dtype="float32")
-                out = fold(
-                    x,
-                    output_sizes=[0, 0],
-                    kernel_sizes=[0, 0],
-                    dilations=0,
-                    paddings=[0, 0],
-                    strides=0)
+                out = fold(x,
+                           output_sizes=[0, 0],
+                           kernel_sizes=[0, 0],
+                           dilations=0,
+                           paddings=[0, 0],
+                           strides=0)
 
             self.assertRaises(AssertionError, test_input_shape)
             self.assertRaises(AssertionError, test_kernel_shape)
diff --git a/python/paddle/fluid/tests/unittests/test_frame_op.py b/python/paddle/fluid/tests/unittests/test_frame_op.py
index f26662dcf4f26..528446f3eb474 100644
--- a/python/paddle/fluid/tests/unittests/test_frame_op.py
+++ b/python/paddle/fluid/tests/unittests/test_frame_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -44,6 +44,7 @@ def frame_from_librosa(x, frame_length, hop_length, axis=-1):
 
 
 class TestFrameOp(OpTest):
+
     def setUp(self):
         self.op_type = "frame"
         self.shape, self.type, self.attrs = self.initTestCase()
@@ -51,8 +52,7 @@ def setUp(self):
             'X': np.random.random(size=self.shape).astype(self.type),
         }
         self.outputs = {
-            'Out': frame_from_librosa(
-                x=self.inputs['X'], **self.attrs)
+            'Out': frame_from_librosa(x=self.inputs['X'], **self.attrs)
         }
 
     def initTestCase(self):
@@ -77,6 +77,7 @@ def test_check_grad_normal(self):
 
 
 class TestCase1(TestFrameOp):
+
     def initTestCase(self):
         input_shape = (150, )
         input_type = 'float64'
@@ -89,6 +90,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestFrameOp):
+
     def initTestCase(self):
         input_shape = (8, 150)
         input_type = 'float64'
@@ -101,6 +103,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestFrameOp):
+
     def initTestCase(self):
         input_shape = (150, 8)
         input_type = 'float64'
@@ -113,6 +116,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestFrameOp):
+
     def initTestCase(self):
         input_shape = (4, 2, 150)
         input_type = 'float64'
@@ -125,6 +129,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestFrameOp):
+
     def initTestCase(self):
         input_shape = (150, 4, 2)
         input_type = 'float64'
diff --git a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
index 6511b56b5e823..420c7c55149cf 100644
--- a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
+++ b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
@@ -19,6 +19,7 @@
 
 
 class TestDebugStringFramework(unittest.TestCase):
+
     def test_debug_str(self):
         p = Program()
         p.current_block().create_var(name='t', shape=[0, 1])
diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py
index 581fa9738116d..56341fa489812 100644
--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -24,6 +24,7 @@
 
 
 class FSTest(unittest.TestCase):
+
     def _test_method(self, func):
         if sys.version_info[0] <= 2:
             args = inspect.getargspec(func).args
diff --git a/python/paddle/fluid/tests/unittests/test_fsp_op.py b/python/paddle/fluid/tests/unittests/test_fsp_op.py
index 7864f4efcdf24..0f7eb4ad95d04 100644
--- a/python/paddle/fluid/tests/unittests/test_fsp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fsp_op.py
@@ -29,15 +29,16 @@ def fsp_matrix(a, b):
     a_t = a.transpose([0, 2, 3, 1])
     a_t = a_t.reshape([batch, h * w, a_channel])
     b_t = b.transpose([0, 2, 3, 1]).reshape([batch, h * w, b_channel])
-    a_r = a_t.repeat(
-        b_channel, axis=1).reshape(
-            [batch, h * w, b_channel, a_channel]).transpose([0, 1, 3, 2])
-    b_r = b_t.repeat(
-        a_channel, axis=1).reshape([batch, h * w, a_channel, b_channel])
+    a_r = a_t.repeat(b_channel,
+                     axis=1).reshape([batch, h * w, b_channel,
+                                      a_channel]).transpose([0, 1, 3, 2])
+    b_r = b_t.repeat(a_channel,
+                     axis=1).reshape([batch, h * w, a_channel, b_channel])
     return np.mean(a_r * b_r, axis=1)
 
 
 class TestFSPOp(OpTest):
+
     def setUp(self):
         self.op_type = "fsp"
         self.initTestCase()
@@ -60,22 +61,25 @@ def test_check_grad_normal(self):
 
 
 class BadInputTest(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
             def test_bad_x():
                 data = fluid.layers.data(name='data', shape=[3, 32, 32])
                 feature_map_0 = [1, 2, 3]
-                feature_map_1 = fluid.layers.conv2d(
-                    data, num_filters=2, filter_size=3)
+                feature_map_1 = fluid.layers.conv2d(data,
+                                                    num_filters=2,
+                                                    filter_size=3)
                 loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1)
 
             self.assertRaises(TypeError, test_bad_x)
 
             def test_bad_y():
                 data = fluid.layers.data(name='data', shape=[3, 32, 32])
-                feature_map_0 = fluid.layers.conv2d(
-                    data, num_filters=2, filter_size=3)
+                feature_map_0 = fluid.layers.conv2d(data,
+                                                    num_filters=2,
+                                                    filter_size=3)
                 feature_map_1 = [1, 2, 3]
                 loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1)
 
diff --git a/python/paddle/fluid/tests/unittests/test_ftrl_op.py b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
index 1826fdc3c0604..d35f6dadac6a2 100644
--- a/python/paddle/fluid/tests/unittests/test_ftrl_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
@@ -35,8 +35,8 @@ def ftrl_step(param, grad, rows, sq_accum, lin_accum, lr, l1, l2, lr_power):
             (np.sqrt(new_accum) - np.sqrt(sq_accum_hit)) / lr) * param_hit
     else:
         lin_accum_updated = lin_accum_hit + grad - (
-            (np.power(new_accum, -lr_power) - np.power(sq_accum_hit, -lr_power)
-             ) / lr) * param_hit
+            (np.power(new_accum, -lr_power) - np.power(sq_accum_hit, -lr_power))
+            / lr) * param_hit
 
     x = l1 * np.sign(lin_accum_updated) - lin_accum_updated
     if lr_power == -0.5:
@@ -65,6 +65,7 @@ def ftrl_step(param, grad, rows, sq_accum, lin_accum, lr, l1, l2, lr_power):
 
 
 class TestFTRLOp(OpTest):
+
     def setUp(self):
         self.op_type = "ftrl"
         rows = 102
@@ -105,6 +106,7 @@ def test_check_output(self):
 
 
 class TestSparseFTRLOp(unittest.TestCase):
+
     def setUp(self):
         self.lr_power = -0.5
 
@@ -154,19 +156,18 @@ def check_with_place(self, place):
             l1, l2, lr_power)
 
         # create and run operator
-        op = Operator(
-            "ftrl",
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            SquaredAccumulator='SquaredAccumulator',
-            SquaredAccumOut='SquaredAccumulator',
-            LinearAccumulator='LinearAccumulator',
-            LinearAccumOut='LinearAccumulator',
-            LearningRate='LearningRate',
-            l1=l1,
-            l2=l2,
-            lr_power=lr_power)
+        op = Operator("ftrl",
+                      Param='Param',
+                      Grad='Grad',
+                      ParamOut='Param',
+                      SquaredAccumulator='SquaredAccumulator',
+                      SquaredAccumOut='SquaredAccumulator',
+                      LinearAccumulator='LinearAccumulator',
+                      LinearAccumOut='LinearAccumulator',
+                      LearningRate='LearningRate',
+                      l1=l1,
+                      l2=l2,
+                      lr_power=lr_power)
 
         op.run(scope, place)
 
@@ -177,12 +178,15 @@ def check_with_place(self, place):
 
         for i in range(height):
             for j in range(row_numel):
-                self.assertAlmostEqual(
-                    param_out[i][j], param_array[i][j], places=4)
-                self.assertAlmostEqual(
-                    sq_accum_out[i][j], sq_accum_array[i][j], places=4)
-                self.assertAlmostEqual(
-                    lin_accum_out[i][j], lin_accum_array[i][j], places=4)
+                self.assertAlmostEqual(param_out[i][j],
+                                       param_array[i][j],
+                                       places=4)
+                self.assertAlmostEqual(sq_accum_out[i][j],
+                                       sq_accum_array[i][j],
+                                       places=4)
+                self.assertAlmostEqual(lin_accum_out[i][j],
+                                       lin_accum_array[i][j],
+                                       places=4)
 
     def init_kernel(self):
         pass
@@ -196,6 +200,7 @@ def test_sparse_ftrl(self):
 
 
 class TestSparseFTRLOp2(TestSparseFTRLOp):
+
     def init_kernel(self):
         self.lr_power = -0.6
 
diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py
index d3fea677a47d9..7a55125d1b4b5 100644
--- a/python/paddle/fluid/tests/unittests/test_full_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py
@@ -33,8 +33,9 @@ def test_attr_tensor_API(self):
         train_program = Program()
         with program_guard(train_program, startup_program):
             fill_value = 2.0
-            input = paddle.fluid.data(
-                name='input', dtype='float32', shape=[2, 3])
+            input = paddle.fluid.data(name='input',
+                                      dtype='float32',
+                                      shape=[2, 3])
             output = paddle.full_like(input, fill_value)
             output_dtype = paddle.full_like(input, fill_value, dtype='float32')
 
@@ -51,9 +52,9 @@ def test_attr_tensor_API(self):
                           fetch_list=[output])
 
             out_np = np.array(res[0])
-            self.assertTrue(
-                not (out_np - np.full_like(img, fill_value)).any(),
-                msg="full_like output is wrong, out = " + str(out_np))
+            self.assertTrue(not (out_np - np.full_like(img, fill_value)).any(),
+                            msg="full_like output is wrong, out = " +
+                            str(out_np))
 
     def test_full_like_imperative(self):
         paddle.disable_static()
@@ -75,23 +76,24 @@ def test_full_like_fill_inf(self):
 
 
 class TestFullOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             #for ci coverage
 
-            input_data = paddle.fluid.data(
-                name='input', dtype='float32', shape=[2, 3])
+            input_data = paddle.fluid.data(name='input',
+                                           dtype='float32',
+                                           shape=[2, 3])
             output = paddle.full_like(input_data, 2.0)
 
             def test_input_dtype():
                 paddle.full_like
 
-            self.assertRaises(
-                TypeError,
-                paddle.full_like,
-                x=input_data,
-                fill_value=2,
-                dtype='uint4')
+            self.assertRaises(TypeError,
+                              paddle.full_like,
+                              x=input_data,
+                              fill_value=2,
+                              dtype='uint4')
 
 
 class TestFullLikeOp1(OpTest):
@@ -121,6 +123,7 @@ def test_check_output(self):
 
 
 class TestFullLikeOp2(TestFullLikeOp1):
+
     def init_data(self):
         self.fill_value = 1000
         self.shape = [1024, 1024]
@@ -128,6 +131,7 @@ def init_data(self):
 
 
 class TestFullLikeOp3(TestFullLikeOp1):
+
     def init_data(self):
         self.fill_value = 8888
         self.shape = [5000, 5000]
@@ -137,11 +141,12 @@ def init_data(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFullLikeOp4(unittest.TestCase):
+
     def test_skip_data_transform(self):
         paddle.disable_static()
         with _test_eager_guard():
-            x = paddle.to_tensor(
-                [1., 2., 3., 4.], place=paddle.CUDAPinnedPlace())
+            x = paddle.to_tensor([1., 2., 3., 4.],
+                                 place=paddle.CUDAPinnedPlace())
             out = paddle.full_like(x, 1.)
             self.assertTrue(
                 (out.numpy() == np.ones([4]).astype(np.float32)).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py
index 723c4609bc96b..c0aba3ff3660c 100644
--- a/python/paddle/fluid/tests/unittests/test_full_op.py
+++ b/python/paddle/fluid/tests/unittests/test_full_op.py
@@ -28,36 +28,45 @@
 
 # Test python API
 class TestFullAPI(unittest.TestCase):
+
     def test_api(self):
         positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2)
 
         positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2)
-        shape_tensor_int32 = fluid.data(
-            name="shape_tensor_int32", shape=[2], dtype="int32")
+        shape_tensor_int32 = fluid.data(name="shape_tensor_int32",
+                                        shape=[2],
+                                        dtype="int32")
 
-        shape_tensor_int64 = fluid.data(
-            name="shape_tensor_int64", shape=[2], dtype="int64")
+        shape_tensor_int64 = fluid.data(name="shape_tensor_int64",
+                                        shape=[2],
+                                        dtype="int64")
 
         out_1 = paddle.full(shape=[1, 2], dtype="float32", fill_value=1.1)
 
-        out_2 = paddle.full(
-            shape=[1, positive_2_int32], dtype="float32", fill_value=1.1)
+        out_2 = paddle.full(shape=[1, positive_2_int32],
+                            dtype="float32",
+                            fill_value=1.1)
 
-        out_3 = paddle.full(
-            shape=[1, positive_2_int64], dtype="float32", fill_value=1.1)
+        out_3 = paddle.full(shape=[1, positive_2_int64],
+                            dtype="float32",
+                            fill_value=1.1)
 
-        out_4 = paddle.full(
-            shape=shape_tensor_int32, dtype="float32", fill_value=1.2)
+        out_4 = paddle.full(shape=shape_tensor_int32,
+                            dtype="float32",
+                            fill_value=1.2)
 
-        out_5 = paddle.full(
-            shape=shape_tensor_int64, dtype="float32", fill_value=1.1)
+        out_5 = paddle.full(shape=shape_tensor_int64,
+                            dtype="float32",
+                            fill_value=1.1)
 
-        out_6 = paddle.full(
-            shape=shape_tensor_int64, dtype=np.float32, fill_value=1.1)
+        out_6 = paddle.full(shape=shape_tensor_int64,
+                            dtype=np.float32,
+                            fill_value=1.1)
 
         val = fluid.layers.fill_constant(shape=[1], dtype=np.float32, value=1.1)
-        out_7 = paddle.full(
-            shape=shape_tensor_int64, dtype=np.float32, fill_value=val)
+        out_7 = paddle.full(shape=shape_tensor_int64,
+                            dtype=np.float32,
+                            fill_value=val)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3, res_4, res_5, res_6, res_7 = exe.run(
@@ -84,83 +93,83 @@ def test_api_eager(self):
                 positive_4_int64 = fluid.layers.fill_constant([1], "int64", 4,
                                                               True)
 
-                out_1 = paddle.full(
-                    shape=[1, 2], dtype="float32", fill_value=1.1)
+                out_1 = paddle.full(shape=[1, 2],
+                                    dtype="float32",
+                                    fill_value=1.1)
 
-                out_2 = paddle.full(
-                    shape=[1, positive_2_int32.item()],
-                    dtype="float32",
-                    fill_value=1.1)
+                out_2 = paddle.full(shape=[1, positive_2_int32.item()],
+                                    dtype="float32",
+                                    fill_value=1.1)
 
-                out_3 = paddle.full(
-                    shape=[1, positive_2_int64.item()],
-                    dtype="float32",
-                    fill_value=1.1)
+                out_3 = paddle.full(shape=[1, positive_2_int64.item()],
+                                    dtype="float32",
+                                    fill_value=1.1)
 
-                out_4 = paddle.full(
-                    shape=[1, 2], dtype="float32", fill_value=1.2)
+                out_4 = paddle.full(shape=[1, 2],
+                                    dtype="float32",
+                                    fill_value=1.2)
 
-                out_5 = paddle.full(
-                    shape=[1, 2], dtype="float32", fill_value=1.1)
+                out_5 = paddle.full(shape=[1, 2],
+                                    dtype="float32",
+                                    fill_value=1.1)
 
-                out_6 = paddle.full(
-                    shape=[1, 2], dtype=np.float32, fill_value=1.1)
+                out_6 = paddle.full(shape=[1, 2],
+                                    dtype=np.float32,
+                                    fill_value=1.1)
 
-                val = fluid.layers.fill_constant(
-                    shape=[1], dtype=np.float32, value=1.1)
-                out_7 = paddle.full(
-                    shape=[1, 2], dtype=np.float32, fill_value=val)
+                val = fluid.layers.fill_constant(shape=[1],
+                                                 dtype=np.float32,
+                                                 value=1.1)
+                out_7 = paddle.full(shape=[1, 2],
+                                    dtype=np.float32,
+                                    fill_value=val)
 
-                out_8 = paddle.full(
-                    shape=positive_2_int32, dtype="float32", fill_value=1.1)
+                out_8 = paddle.full(shape=positive_2_int32,
+                                    dtype="float32",
+                                    fill_value=1.1)
 
-                out_9 = paddle.full(
-                    shape=[
-                        positive_2_int32, positive_2_int64, positive_4_int64
-                    ],
-                    dtype="float32",
-                    fill_value=1.1)
+                out_9 = paddle.full(shape=[
+                    positive_2_int32, positive_2_int64, positive_4_int64
+                ],
+                                    dtype="float32",
+                                    fill_value=1.1)
 
                 # test for numpy.float64 as fill_value
-                out_10 = paddle.full_like(
-                    out_7, dtype=np.float32, fill_value=np.abs(1.1))
-
-                assert np.array_equal(
-                    out_1, np.full(
-                        [1, 2], 1.1, dtype="float32"))
-                assert np.array_equal(
-                    out_2, np.full(
-                        [1, 2], 1.1, dtype="float32"))
-                assert np.array_equal(
-                    out_3, np.full(
-                        [1, 2], 1.1, dtype="float32"))
-                assert np.array_equal(
-                    out_4, np.full(
-                        [1, 2], 1.2, dtype="float32"))
-                assert np.array_equal(
-                    out_5, np.full(
-                        [1, 2], 1.1, dtype="float32"))
-                assert np.array_equal(
-                    out_6, np.full(
-                        [1, 2], 1.1, dtype="float32"))
-                assert np.array_equal(
-                    out_7, np.full(
-                        [1, 2], 1.1, dtype="float32"))
+                out_10 = paddle.full_like(out_7,
+                                          dtype=np.float32,
+                                          fill_value=np.abs(1.1))
+
+                assert np.array_equal(out_1,
+                                      np.full([1, 2], 1.1, dtype="float32"))
+                assert np.array_equal(out_2,
+                                      np.full([1, 2], 1.1, dtype="float32"))
+                assert np.array_equal(out_3,
+                                      np.full([1, 2], 1.1, dtype="float32"))
+                assert np.array_equal(out_4,
+                                      np.full([1, 2], 1.2, dtype="float32"))
+                assert np.array_equal(out_5,
+                                      np.full([1, 2], 1.1, dtype="float32"))
+                assert np.array_equal(out_6,
+                                      np.full([1, 2], 1.1, dtype="float32"))
+                assert np.array_equal(out_7,
+                                      np.full([1, 2], 1.1, dtype="float32"))
                 assert np.array_equal(out_8, np.full([2], 1.1, dtype="float32"))
-                assert np.array_equal(
-                    out_9, np.full(
-                        [2, 2, 4], 1.1, dtype="float32"))
-                assert np.array_equal(
-                    out_10, np.full(
-                        [1, 2], 1.1, dtype="float32"))
+                assert np.array_equal(out_9,
+                                      np.full([2, 2, 4], 1.1, dtype="float32"))
+                assert np.array_equal(out_10,
+                                      np.full([1, 2], 1.1, dtype="float32"))
 
 
 class TestFullOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             #for ci coverage
-            self.assertRaises(
-                TypeError, paddle.full, shape=[1], fill_value=5, dtype='uint4')
+            self.assertRaises(TypeError,
+                              paddle.full,
+                              shape=[1],
+                              fill_value=5,
+                              dtype='uint4')
 
             # The argument dtype of full must be one of bool, float16,
             #float32, float64, uint8, int16, int32 or int64
@@ -179,15 +188,17 @@ def test_shape_size():
 
             # The shape dtype of full op must be int32 or int64.
             def test_shape_tensor_dtype():
-                shape = fluid.data(
-                    name="shape_tensor", shape=[2], dtype="float32")
+                shape = fluid.data(name="shape_tensor",
+                                   shape=[2],
+                                   dtype="float32")
                 paddle.full(shape=shape, dtype="float32", fill_value=1)
 
             self.assertRaises(TypeError, test_shape_tensor_dtype)
 
             def test_shape_tensor_list_dtype():
-                shape = fluid.data(
-                    name="shape_tensor_list", shape=[1], dtype="bool")
+                shape = fluid.data(name="shape_tensor_list",
+                                   shape=[1],
+                                   dtype="bool")
                 paddle.full(shape=[shape, 2], dtype="float32", fill_value=1)
 
             self.assertRaises(TypeError, test_shape_tensor_list_dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_function_hook.py b/python/paddle/fluid/tests/unittests/test_function_hook.py
index 55981b01c4084..8c88ee06c1efd 100644
--- a/python/paddle/fluid/tests/unittests/test_function_hook.py
+++ b/python/paddle/fluid/tests/unittests/test_function_hook.py
@@ -24,6 +24,7 @@
 
 
 class TestCapture:
+
     def __init__(self):
         self.list = []
 
@@ -42,6 +43,7 @@ def grad_hook(grad):
 
 
 class TestBakcwardFunctionHookError(unittest.TestCase):
+
     def func_hook(self):
         input_data = np.ones([4, 4]).astype('float32')
 
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv1d.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d.py
index b803835d107d4..88dd98f1f3a32 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv1d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d.py
@@ -23,6 +23,7 @@
 
 
 class TestFunctionalConv1DError(TestCase):
+
     def setUp(self):
         self.input = []
         self.filter = []
@@ -39,15 +40,14 @@ def dygraph_case(self):
             w = dg.to_variable(self.filter, dtype=paddle.float32)
             b = None if self.bias is None else dg.to_variable(
                 self.bias, dtype=paddle.float32)
-            y = F.conv1d(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv1d(x,
+                         w,
+                         b,
+                         padding=self.padding,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
 
     def test_exception(self):
         with self.assertRaises(ValueError):
@@ -55,6 +55,7 @@ def test_exception(self):
 
 
 class TestFunctionalConv1DErrorCase1(TestFunctionalConv1DError):
+
     def setUp(self):
         self.input = np.random.randn(1, 3, 3)
         self.filter = np.random.randn(3, 3, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py
index 4284ab48827e0..3d719de367492 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py
@@ -23,6 +23,7 @@
 
 
 class TestFunctionalConv1DError(TestCase):
+
     def setUp(self):
         self.input = []
         self.filter = []
@@ -39,15 +40,14 @@ def dygraph_case(self):
             w = dg.to_variable(self.filter, dtype=paddle.float32)
             b = None if self.bias is None else dg.to_variable(
                 self.bias, dtype=paddle.float32)
-            y = F.conv1d_transpose(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv1d_transpose(x,
+                                   w,
+                                   b,
+                                   padding=self.padding,
+                                   stride=self.stride,
+                                   dilation=self.dilation,
+                                   groups=self.groups,
+                                   data_format=self.data_format)
 
     def test_exception(self):
         with self.assertRaises(ValueError):
@@ -55,6 +55,7 @@ def test_exception(self):
 
 
 class TestFunctionalConv1DErrorCase1(TestFunctionalConv1DError):
+
     def setUp(self):
         self.input = np.random.randn(1, 3, 3)
         self.filter = np.random.randn(3, 3, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
index 8e0a744ecdbda..6c0f526f236cb 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py
@@ -46,19 +46,19 @@ def prepare(self):
             filter_shape = tuple(self.filter_shape)
 
         self.weight = np.random.uniform(
-            -1, 1, (self.out_channels, self.in_channels // self.groups
-                    ) + filter_shape).astype(self.dtype)
+            -1, 1, (self.out_channels, self.in_channels // self.groups) +
+            filter_shape).astype(self.dtype)
         if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (
-                self.out_channels, )).astype(self.dtype)
+            self.bias = np.random.uniform(-1, 1, (self.out_channels, )).astype(
+                self.dtype)
 
         self.channel_last = (self.data_format == "NHWC")
         if self.channel_last:
             self.input_shape = (self.batch_size, ) + self.spatial_shape + (
                 self.in_channels, )
         else:
-            self.input_shape = (self.batch_size, self.in_channels
-                                ) + self.spatial_shape
+            self.input_shape = (self.batch_size,
+                                self.in_channels) + self.spatial_shape
 
         self.input = np.random.uniform(-1, 1,
                                        self.input_shape).astype(self.dtype)
@@ -69,13 +69,11 @@ def static_graph_case_1(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = fluid.data(
-                        "input", (-1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, -1, -1, self.in_channels),
+                                   dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
                 y = fluid.layers.conv2d(
                     x,
                     self.out_channels,
@@ -100,26 +98,24 @@ def static_graph_case_2(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input", (-1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight.shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight.shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv2d(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv2d(x,
+                             weight,
+                             None if self.no_bias else bias,
+                             padding=self.padding,
+                             stride=self.stride,
+                             dilation=self.dilation,
+                             groups=self.groups,
+                             data_format=self.data_format)
 
                 if self.act == 'sigmoid':
                     y = F.sigmoid(y)
@@ -137,15 +133,14 @@ def dygraph_case(self):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv2d(
-                x,
-                weight,
-                bias,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv2d(x,
+                         weight,
+                         bias,
+                         padding=self.padding,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
 
             if self.act == 'sigmoid':
                 y = F.sigmoid(y)
@@ -199,8 +194,8 @@ def prepare(self):
             filter_shape = (self.filter_shape, ) * 2
         else:
             filter_shape = tuple(self.filter_shape)
-        self.weight_shape = (self.out_channels, self.in_channels // self.groups
-                             ) + filter_shape
+        self.weight_shape = (self.out_channels,
+                             self.in_channels // self.groups) + filter_shape
         self.bias_shape = (self.out_channels, )
 
     def static_graph_case(self):
@@ -210,29 +205,28 @@ def static_graph_case(self):
             with fluid.program_guard(main, start):
                 self.channel_last = self.data_format == "NHWC"
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input", (-1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight_shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv2d(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv2d(x,
+                             weight,
+                             None if self.no_bias else bias,
+                             padding=self.padding,
+                             stride=self.stride,
+                             dilation=self.dilation,
+                             groups=self.groups,
+                             data_format=self.data_format)
 
 
 class TestFunctionalConv2DCase2(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -248,6 +242,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase3(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -263,6 +258,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase4(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -278,6 +274,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase5(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -293,6 +290,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase6(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -308,6 +306,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase7(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 6
         self.out_channels = 8
@@ -323,6 +322,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase8(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 6
         self.out_channels = 12
@@ -338,6 +338,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -353,6 +354,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 4
@@ -368,6 +370,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 3
@@ -383,6 +386,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -398,6 +402,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -413,6 +418,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = -5
         self.out_channels = 5
@@ -428,6 +434,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase10(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 4
@@ -443,6 +450,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase11(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -458,6 +466,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase12(TestCase):
+
     def setUp(self):
         self.input = np.array([])
         self.filter = np.array([])
@@ -476,19 +485,19 @@ def static_graph_case(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 x = fluid.data("input", self.input.shape, dtype=paddle.float32)
-                y = fluid.layers.conv2d(
-                    x,
-                    self.num_filters,
-                    self.filter_size,
-                    stride=self.stride,
-                    padding=self.padding,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    param_attr=I.NumpyArrayInitializer(self.filter),
-                    bias_attr=False if self.bias is None else
-                    I.NumpyArrayInitializer(self.bias),
-                    act=None,
-                    data_format=self.data_format)
+                y = fluid.layers.conv2d(x,
+                                        self.num_filters,
+                                        self.filter_size,
+                                        stride=self.stride,
+                                        padding=self.padding,
+                                        dilation=self.dilation,
+                                        groups=self.groups,
+                                        param_attr=I.NumpyArrayInitializer(
+                                            self.filter),
+                                        bias_attr=False if self.bias is None
+                                        else I.NumpyArrayInitializer(self.bias),
+                                        act=None,
+                                        data_format=self.data_format)
         exe = fluid.Executor()
         exe.run(start)
         out, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
@@ -500,15 +509,14 @@ def dygraph_case(self):
             w = dg.to_variable(self.filter, dtype=paddle.float32)
             b = None if self.bias is None else dg.to_variable(
                 self.bias, dtype=paddle.float32)
-            y = F.conv2d(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv2d(x,
+                         w,
+                         b,
+                         padding=self.padding,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
 
     def test_dygraph_exception(self):
         with self.assertRaises(ValueError):
@@ -520,6 +528,7 @@ def test_static_exception(self):
 
 
 class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12):
+
     def setUp(self):
         self.input = np.random.randn(1, 3, 3, 3)
         self.filter = np.random.randn(3, 3, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
index 781169d70c17c..d1b9c68925747 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py
@@ -47,19 +47,19 @@ def prepare(self):
             filter_shape = tuple(self.filter_shape)
 
         self.weight = np.random.uniform(
-            -1, 1, (self.in_channels, self.out_channels // self.groups
-                    ) + filter_shape).astype(self.dtype)
+            -1, 1, (self.in_channels, self.out_channels // self.groups) +
+            filter_shape).astype(self.dtype)
         if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (
-                self.out_channels, )).astype(self.dtype)
+            self.bias = np.random.uniform(-1, 1, (self.out_channels, )).astype(
+                self.dtype)
 
         self.channel_last = (self.data_format == "NHWC")
         if self.channel_last:
             self.input_shape = (self.batch_size, ) + self.spatial_shape + (
                 self.in_channels, )
         else:
-            self.input_shape = (self.batch_size, self.in_channels
-                                ) + self.spatial_shape
+            self.input_shape = (self.batch_size,
+                                self.in_channels) + self.spatial_shape
 
         self.input = np.random.uniform(-1, 1,
                                        self.input_shape).astype(self.dtype)
@@ -70,13 +70,11 @@ def static_graph_case_1(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = fluid.data(
-                        "input", (-1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, -1, -1, self.in_channels),
+                                   dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
                 y = fluid.layers.conv2d_transpose(
                     x,
                     self.out_channels,
@@ -101,27 +99,25 @@ def static_graph_case_2(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input", (-1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight.shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight.shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv2d_transpose(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    output_size=self.output_size,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv2d_transpose(x,
+                                       weight,
+                                       None if self.no_bias else bias,
+                                       output_size=self.output_size,
+                                       padding=self.padding,
+                                       stride=self.stride,
+                                       dilation=self.dilation,
+                                       groups=self.groups,
+                                       data_format=self.data_format)
         exe = fluid.Executor(self.place)
         exe.run(start)
         feed_dict = {"input": self.input, "weight": self.weight}
@@ -135,16 +131,15 @@ def dygraph_case(self):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv2d_transpose(
-                x,
-                weight,
-                bias,
-                output_size=self.output_size,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv2d_transpose(x,
+                                   weight,
+                                   bias,
+                                   output_size=self.output_size,
+                                   padding=self.padding,
+                                   stride=self.stride,
+                                   dilation=self.dilation,
+                                   groups=self.groups,
+                                   data_format=self.data_format)
             out = y.numpy()
         return out
 
@@ -204,8 +199,8 @@ def prepare(self):
             filter_shape = (self.filter_shape, ) * 2
         else:
             filter_shape = tuple(self.filter_shape)
-        self.weight_shape = (self.in_channels, self.out_channels // self.groups
-                             ) + filter_shape
+        self.weight_shape = (self.in_channels,
+                             self.out_channels // self.groups) + filter_shape
         self.bias_shape = (self.out_channels, )
 
     def static_graph_case(self):
@@ -215,30 +210,29 @@ def static_graph_case(self):
             with fluid.program_guard(main, start):
                 self.channel_last = self.data_format == "NHWC"
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input", (-1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight_shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv2d_transpose(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    output_size=self.output_size,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv2d_transpose(x,
+                                       weight,
+                                       None if self.no_bias else bias,
+                                       output_size=self.output_size,
+                                       padding=self.padding,
+                                       stride=self.stride,
+                                       dilation=self.dilation,
+                                       groups=self.groups,
+                                       data_format=self.data_format)
 
 
 class TestFunctionalConv2DCase2(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -252,6 +246,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase3(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -265,6 +260,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase4(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -278,6 +274,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase5(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -291,6 +288,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase6(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -304,6 +302,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase7(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 4
@@ -317,6 +316,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase8(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 4
@@ -331,6 +331,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase9(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -344,6 +345,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase10(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -357,6 +359,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase11(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -370,6 +373,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DCase12(TestFunctionalConv2D):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -383,6 +387,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -396,6 +401,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -409,6 +415,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -422,6 +429,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase5(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = -2
         self.out_channels = 5
@@ -435,6 +443,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 5
@@ -449,6 +458,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 5
@@ -462,6 +472,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 4
@@ -475,6 +486,7 @@ def setUp(self):
 
 
 class TestFunctionalConv2DErrorCase10(TestCase):
+
     def setUp(self):
         self.input = np.array([])
         self.filter = np.array([])
@@ -493,19 +505,19 @@ def static_graph_case(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 x = fluid.data("input", self.input.shape, dtype=paddle.float32)
-                y = fluid.layers.conv2d(
-                    x,
-                    self.num_filters,
-                    self.filter_size,
-                    stride=self.stride,
-                    padding=self.padding,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    param_attr=I.NumpyArrayInitializer(self.filter),
-                    bias_attr=False if self.bias is None else
-                    I.NumpyArrayInitializer(self.bias),
-                    act=None,
-                    data_format=self.data_format)
+                y = fluid.layers.conv2d(x,
+                                        self.num_filters,
+                                        self.filter_size,
+                                        stride=self.stride,
+                                        padding=self.padding,
+                                        dilation=self.dilation,
+                                        groups=self.groups,
+                                        param_attr=I.NumpyArrayInitializer(
+                                            self.filter),
+                                        bias_attr=False if self.bias is None
+                                        else I.NumpyArrayInitializer(self.bias),
+                                        act=None,
+                                        data_format=self.data_format)
         exe = fluid.Executor()
         exe.run(start)
         out, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
@@ -517,15 +529,14 @@ def dygraph_case(self):
             w = dg.to_variable(self.filter, dtype=paddle.float32)
             b = None if self.bias is None else dg.to_variable(
                 self.bias, dtype=paddle.float32)
-            y = F.conv2d_transpose(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv2d_transpose(x,
+                                   w,
+                                   b,
+                                   padding=self.padding,
+                                   stride=self.stride,
+                                   dilation=self.dilation,
+                                   groups=self.groups,
+                                   data_format=self.data_format)
 
     def test_dygraph_exception(self):
         with self.assertRaises(ValueError):
@@ -541,6 +552,7 @@ def test_static_exception(self):
 
 
 class TestFunctionalConv2DErrorCase11(TestFunctionalConv2DErrorCase10):
+
     def setUp(self):
         self.input = np.random.randn(1, 3, 3, 3)
         self.filter = np.random.randn(3, 3, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
index 6c20816065882..9ecbf2bf46c5d 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py
@@ -46,19 +46,19 @@ def prepare(self):
             filter_shape = tuple(self.filter_shape)
 
         self.weight = np.random.uniform(
-            -1, 1, (self.out_channels, self.in_channels // self.groups
-                    ) + filter_shape).astype(self.dtype)
+            -1, 1, (self.out_channels, self.in_channels // self.groups) +
+            filter_shape).astype(self.dtype)
         if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (
-                self.out_channels, )).astype(self.dtype)
+            self.bias = np.random.uniform(-1, 1, (self.out_channels, )).astype(
+                self.dtype)
 
         self.channel_last = (self.data_format == "NDHWC")
         if self.channel_last:
             self.input_shape = (self.batch_size, ) + self.spatial_shape + (
                 self.in_channels, )
         else:
-            self.input_shape = (self.batch_size, self.in_channels
-                                ) + self.spatial_shape
+            self.input_shape = (self.batch_size,
+                                self.in_channels) + self.spatial_shape
 
         self.input = np.random.uniform(-1, 1,
                                        self.input_shape).astype(self.dtype)
@@ -69,13 +69,11 @@ def static_graph_case_1(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = fluid.data(
-                        "input", (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, -1, -1, -1, self.in_channels),
+                                   dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1, -1),
+                                   dtype=self.dtype)
                 y = fluid.layers.conv3d(
                     x,
                     self.out_channels,
@@ -100,26 +98,25 @@ def static_graph_case_2(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input",
+                                       (-1, -1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight.shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight.shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv3d(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv3d(x,
+                             weight,
+                             None if self.no_bias else bias,
+                             padding=self.padding,
+                             stride=self.stride,
+                             dilation=self.dilation,
+                             groups=self.groups,
+                             data_format=self.data_format)
 
                 if self.act == 'sigmoid':
                     y = F.sigmoid(y)
@@ -137,15 +134,14 @@ def dygraph_case(self):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv3d(
-                x,
-                weight,
-                bias,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv3d(x,
+                         weight,
+                         bias,
+                         padding=self.padding,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
 
             if self.act == 'sigmoid':
                 y = F.sigmoid(y)
@@ -199,8 +195,8 @@ def prepare(self):
             filter_shape = (self.filter_shape, ) * 3
         else:
             filter_shape = tuple(self.filter_shape)
-        self.weight_shape = (self.out_channels, self.in_channels // self.groups
-                             ) + filter_shape
+        self.weight_shape = (self.out_channels,
+                             self.in_channels // self.groups) + filter_shape
         self.bias_shape = (self.out_channels, )
 
     def static_graph_case(self):
@@ -210,32 +206,32 @@ def static_graph_case(self):
             with fluid.program_guard(main, start):
                 self.channel_last = self.data_format == "NDHWC"
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input",
+                                       (-1, -1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight_shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv3d(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv3d(x,
+                             weight,
+                             None if self.no_bias else bias,
+                             padding=self.padding,
+                             stride=self.stride,
+                             dilation=self.dilation,
+                             groups=self.groups,
+                             data_format=self.data_format)
 
                 if self.act == 'sigmoid':
                     y = F.sigmoid(y)
 
 
 class TestFunctionalConv3DCase2(TestFunctionalConv3D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -250,6 +246,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DCase3(TestFunctionalConv3D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -264,6 +261,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DCase4(TestFunctionalConv3D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -278,6 +276,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DCase5(TestFunctionalConv3D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -292,6 +291,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DCase6(TestFunctionalConv3D):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -306,6 +306,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DCase7(TestFunctionalConv3D):
+
     def setUp(self):
         self.in_channels = 6
         self.out_channels = 8
@@ -320,6 +321,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DCase8(TestFunctionalConv3D):
+
     def setUp(self):
         self.in_channels = 6
         self.out_channels = 12
@@ -335,6 +337,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase2(TestFunctionalConv3DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -349,6 +352,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase3(TestFunctionalConv3DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 4
@@ -363,6 +367,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase4(TestFunctionalConv3DError):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 3
@@ -377,6 +382,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase7(TestFunctionalConv3DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -391,6 +397,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase8(TestFunctionalConv3DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -405,6 +412,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase9(TestFunctionalConv3DError):
+
     def setUp(self):
         self.in_channels = -5
         self.out_channels = 5
@@ -419,6 +427,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase10(TestFunctionalConv3DError):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 4
@@ -433,6 +442,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DErrorCase11(TestCase):
+
     def setUp(self):
         self.input = np.array([])
         self.filter = np.array([])
@@ -451,19 +461,19 @@ def static_graph_case(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 x = fluid.data("input", self.input.shape, dtype=paddle.float32)
-                y = fluid.layers.conv3d(
-                    x,
-                    self.num_filters,
-                    self.filter_size,
-                    stride=self.stride,
-                    padding=self.padding,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    param_attr=I.NumpyArrayInitializer(self.filter),
-                    bias_attr=False if self.bias is None else
-                    I.NumpyArrayInitializer(self.bias),
-                    act=None,
-                    data_format=self.data_format)
+                y = fluid.layers.conv3d(x,
+                                        self.num_filters,
+                                        self.filter_size,
+                                        stride=self.stride,
+                                        padding=self.padding,
+                                        dilation=self.dilation,
+                                        groups=self.groups,
+                                        param_attr=I.NumpyArrayInitializer(
+                                            self.filter),
+                                        bias_attr=False if self.bias is None
+                                        else I.NumpyArrayInitializer(self.bias),
+                                        act=None,
+                                        data_format=self.data_format)
         exe = fluid.Executor()
         exe.run(start)
         out, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
@@ -475,15 +485,14 @@ def dygraph_case(self):
             w = dg.to_variable(self.filter, dtype=paddle.float32)
             b = None if self.bias is None else dg.to_variable(
                 self.bias, dtype=paddle.float32)
-            y = F.conv3d(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv3d(x,
+                         w,
+                         b,
+                         padding=self.padding,
+                         stride=self.stride,
+                         dilation=self.dilation,
+                         groups=self.groups,
+                         data_format=self.data_format)
 
     def test_dygraph_exception(self):
         with self.assertRaises(ValueError):
@@ -495,6 +504,7 @@ def test_static_exception(self):
 
 
 class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11):
+
     def setUp(self):
         self.input = np.random.randn(1, 3, 3, 3, 3)
         self.filter = np.random.randn(3, 3, 1, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
index 6f25d65aac227..0190779a021c9 100644
--- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py
@@ -48,19 +48,19 @@ def prepare(self):
             filter_shape = tuple(self.filter_shape)
 
         self.weight = np.random.uniform(
-            -1, 1, (self.in_channels, self.out_channels // self.groups
-                    ) + filter_shape).astype(self.dtype)
+            -1, 1, (self.in_channels, self.out_channels // self.groups) +
+            filter_shape).astype(self.dtype)
         if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (
-                self.out_channels, )).astype(self.dtype)
+            self.bias = np.random.uniform(-1, 1, (self.out_channels, )).astype(
+                self.dtype)
 
         self.channel_last = (self.data_format == "NDHWC")
         if self.channel_last:
             self.input_shape = (self.batch_size, ) + self.spatial_shape + (
                 self.in_channels, )
         else:
-            self.input_shape = (self.batch_size, self.in_channels
-                                ) + self.spatial_shape
+            self.input_shape = (self.batch_size,
+                                self.in_channels) + self.spatial_shape
 
         self.input = np.random.uniform(-1, 1,
                                        self.input_shape).astype(self.dtype)
@@ -71,13 +71,11 @@ def static_graph_case_1(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = fluid.data(
-                        "input", (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, -1, -1, -1, self.in_channels),
+                                   dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1, -1),
+                                   dtype=self.dtype)
                 y = fluid.layers.conv3d_transpose(
                     x,
                     self.out_channels,
@@ -103,27 +101,26 @@ def static_graph_case_2(self):
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input",
+                                       (-1, -1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight.shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight.shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
-                y = F.conv3d_transpose(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    output_size=self.output_size,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv3d_transpose(x,
+                                       weight,
+                                       None if self.no_bias else bias,
+                                       output_size=self.output_size,
+                                       padding=self.padding,
+                                       stride=self.stride,
+                                       dilation=self.dilation,
+                                       groups=self.groups,
+                                       data_format=self.data_format)
                 if self.act == 'sigmoid':
                     y = F.sigmoid(y)
         exe = fluid.Executor(self.place)
@@ -139,16 +136,15 @@ def dygraph_case(self):
             x = dg.to_variable(self.input)
             weight = dg.to_variable(self.weight)
             bias = None if self.no_bias else dg.to_variable(self.bias)
-            y = F.conv3d_transpose(
-                x,
-                weight,
-                bias,
-                output_size=self.output_size,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv3d_transpose(x,
+                                   weight,
+                                   bias,
+                                   output_size=self.output_size,
+                                   padding=self.padding,
+                                   stride=self.stride,
+                                   dilation=self.dilation,
+                                   groups=self.groups,
+                                   data_format=self.data_format)
             if self.act == 'sigmoid':
                 y = F.sigmoid(y)
             out = y.numpy()
@@ -211,8 +207,8 @@ def prepare(self):
             filter_shape = (self.filter_shape, ) * 3
         else:
             filter_shape = tuple(self.filter_shape)
-        self.weight_shape = (self.in_channels, self.out_channels // self.groups
-                             ) + filter_shape
+        self.weight_shape = (self.in_channels,
+                             self.out_channels // self.groups) + filter_shape
         self.bias_shape = (self.out_channels, )
 
     def static_graph_case(self):
@@ -222,32 +218,32 @@ def static_graph_case(self):
             with fluid.program_guard(main, start):
                 self.channel_last = self.data_format == "NDHWC"
                 if self.channel_last:
-                    x = x = fluid.data(
-                        "input", (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype)
+                    x = x = fluid.data("input",
+                                       (-1, -1, -1, -1, self.in_channels),
+                                       dtype=self.dtype)
                 else:
-                    x = fluid.data(
-                        "input", (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype)
-                weight = fluid.data(
-                    "weight", self.weight_shape, dtype=self.dtype)
+                    x = fluid.data("input", (-1, self.in_channels, -1, -1, -1),
+                                   dtype=self.dtype)
+                weight = fluid.data("weight",
+                                    self.weight_shape,
+                                    dtype=self.dtype)
                 if not self.no_bias:
                     bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
-                y = F.conv3d_transpose(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    output_size=self.output_size,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format)
+                y = F.conv3d_transpose(x,
+                                       weight,
+                                       None if self.no_bias else bias,
+                                       output_size=self.output_size,
+                                       padding=self.padding,
+                                       stride=self.stride,
+                                       dilation=self.dilation,
+                                       groups=self.groups,
+                                       data_format=self.data_format)
                 if self.act == 'sigmoid':
                     y = F.sigmoid(y)
 
 
 class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -262,6 +258,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase3(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -276,6 +273,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase4(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -290,6 +288,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase5(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -304,6 +303,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase6(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 4
@@ -318,6 +318,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase7(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 4
@@ -333,6 +334,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase8(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -347,6 +349,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase9(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -361,6 +364,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase10(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -375,6 +379,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeCase11(TestFunctionalConv3DTranspose):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 6
@@ -388,8 +393,9 @@ def setUp(self):
         self.data_format = "NCDHW"
 
 
-class TestFunctionalConv3DTransposeErrorCase2(
-        TestFunctionalConv3DTransposeError):
+class TestFunctionalConv3DTransposeErrorCase2(TestFunctionalConv3DTransposeError
+                                              ):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -403,8 +409,9 @@ def setUp(self):
         self.data_format = "NDHWC"
 
 
-class TestFunctionalConv3DTransposeErrorCase3(
-        TestFunctionalConv3DTransposeError):
+class TestFunctionalConv3DTransposeErrorCase3(TestFunctionalConv3DTransposeError
+                                              ):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -418,8 +425,9 @@ def setUp(self):
         self.data_format = "NDHWC"
 
 
-class TestFunctionalConv3DTransposeErrorCase4(
-        TestFunctionalConv3DTransposeError):
+class TestFunctionalConv3DTransposeErrorCase4(TestFunctionalConv3DTransposeError
+                                              ):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 5
@@ -433,8 +441,9 @@ def setUp(self):
         self.data_format = "NCDHW"
 
 
-class TestFunctionalConv3DTransposeErrorCase5(
-        TestFunctionalConv3DTransposeError):
+class TestFunctionalConv3DTransposeErrorCase5(TestFunctionalConv3DTransposeError
+                                              ):
+
     def setUp(self):
         self.in_channels = -2
         self.out_channels = 5
@@ -448,8 +457,9 @@ def setUp(self):
         self.data_format = "NCDHW"
 
 
-class TestFunctionalConv3DTransposeErrorCase7(
-        TestFunctionalConv3DTransposeError):
+class TestFunctionalConv3DTransposeErrorCase7(TestFunctionalConv3DTransposeError
+                                              ):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 5
@@ -464,8 +474,9 @@ def setUp(self):
         self.data_format = "NCDHW"
 
 
-class TestFunctionalConv3DTransposeErrorCase8(
-        TestFunctionalConv3DTransposeError):
+class TestFunctionalConv3DTransposeErrorCase8(TestFunctionalConv3DTransposeError
+                                              ):
+
     def setUp(self):
         self.in_channels = 4
         self.out_channels = 5
@@ -479,8 +490,9 @@ def setUp(self):
         self.data_format = "not_valid"
 
 
-class TestFunctionalConv3DTransposeErrorCase9(
-        TestFunctionalConv3DTransposeError):
+class TestFunctionalConv3DTransposeErrorCase9(TestFunctionalConv3DTransposeError
+                                              ):
+
     def setUp(self):
         self.in_channels = 3
         self.out_channels = 4
@@ -495,6 +507,7 @@ def setUp(self):
 
 
 class TestFunctionalConv3DTransposeErrorCase10(TestCase):
+
     def setUp(self):
         self.input = np.array([])
         self.filter = np.array([])
@@ -537,15 +550,14 @@ def dygraph_case(self):
             w = dg.to_variable(self.filter, dtype=paddle.float32)
             b = None if self.bias is None else dg.to_variable(
                 self.bias, dtype=paddle.float32)
-            y = F.conv3d_transpose(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format)
+            y = F.conv3d_transpose(x,
+                                   w,
+                                   b,
+                                   padding=self.padding,
+                                   stride=self.stride,
+                                   dilation=self.dilation,
+                                   groups=self.groups,
+                                   data_format=self.data_format)
 
     def test_dygraph_exception(self):
         with self.assertRaises(ValueError):
@@ -562,6 +574,7 @@ def test_static_exception(self):
 
 class TestFunctionalConv3DTransposeErrorCase11(
         TestFunctionalConv3DTransposeErrorCase10):
+
     def setUp(self):
         self.input = np.random.randn(1, 3, 3, 3, 3)
         self.filter = np.random.randn(3, 3, 1, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index e3a2566133742..c3e8a51397f98 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -26,6 +26,7 @@
 
 
 class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -77,13 +78,13 @@ def optimizer(self, learning_rate=1e-3):
 
 
 class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
+
     def _decorate_compare_fused_all_reduce(self, model, use_device):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_device,
-            init_feed_dict=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True)
+        self.compare_fuse_all_reduce_ops(model,
+                                         use_device,
+                                         init_feed_dict=init_data,
+                                         optimizer=self.optimizer,
+                                         fuse_all_optimizer_ops=True)
 
     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
@@ -101,16 +102,17 @@ def test_batchnorm_fc_with_fuse_all_reduce(self):
 
 
 class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
+
     def _decorate_compare_fused_all_reduce(self, model, use_device):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_device,
-            init_feed_dict=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True)
+        self.compare_fuse_all_reduce_ops(model,
+                                         use_device,
+                                         init_feed_dict=init_data,
+                                         optimizer=self.optimizer,
+                                         fuse_all_optimizer_ops=True)
 
 
 class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
index 6a1700e758e57..c8106db13300f 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
@@ -18,34 +18,34 @@
 
 
 class TestFuseBatchNormActPass(unittest.TestCase):
+
     def build_program(self, main_program, startup_program, use_cuda, seed=1):
         with fluid.program_guard(main_program, startup_program):
             x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
             y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-            hidden1 = fluid.layers.conv2d(
-                input=x,
-                filter_size=3,
-                num_filters=16,
-                stride=1,
-                padding=1,
-                act=None,
-                bias_attr=False,
-                data_format='NHWC')
+            hidden1 = fluid.layers.conv2d(input=x,
+                                          filter_size=3,
+                                          num_filters=16,
+                                          stride=1,
+                                          padding=1,
+                                          act=None,
+                                          bias_attr=False,
+                                          data_format='NHWC')
             param_attr = fluid.ParamAttr(
                 name='batch_norm_w',
                 initializer=fluid.initializer.Constant(value=1.0))
             bias_attr = fluid.ParamAttr(
                 name='batch_norm_b',
                 initializer=fluid.initializer.Constant(value=0.0))
-            hidden2 = fluid.layers.batch_norm(
-                input=hidden1,
-                param_attr=param_attr,
-                bias_attr=bias_attr,
-                act='relu',
-                data_layout='NHWC')
+            hidden2 = fluid.layers.batch_norm(input=hidden1,
+                                              param_attr=param_attr,
+                                              bias_attr=bias_attr,
+                                              act='relu',
+                                              data_layout='NHWC')
             hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu')
-            hidden4 = fluid.layers.batch_norm(
-                input=hidden3, act='relu', data_layout='NHWC')
+            hidden4 = fluid.layers.batch_norm(input=hidden3,
+                                              act='relu',
+                                              data_layout='NHWC')
             prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax')
             loss = fluid.layers.cross_entropy(input=prediction, label=y)
             loss = fluid.layers.mean(loss)
@@ -72,8 +72,8 @@ def check(self, place, use_cuda):
         build_strategy.fuse_bn_act_ops = False
         binary = fluid.CompiledProgram(main_program).with_data_parallel(
             loss_name=loss.name, build_strategy=build_strategy)
-        train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                    batch_size=batch_size)
         loss_vals = []
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
@@ -90,8 +90,8 @@ def check(self, place, use_cuda):
         build_strategy_fused.fuse_bn_act_ops = True
         binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
             loss_name=loss.name, build_strategy=build_strategy_fused)
-        train_reader_fused = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
+        train_reader_fused = paddle.batch(paddle.dataset.mnist.train(),
+                                          batch_size=batch_size)
         loss_vals_fused = []
         scope_fused = fluid.Scope()
         with fluid.scope_guard(scope_fused):
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
index f4cb53b31c574..59b85530f10da 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
@@ -27,6 +27,7 @@
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "Paddle core is not compiled with CUDA")
 class TestFusedBnAddActAPI(unittest.TestCase):
+
     def setUp(self):
         self.conv_param_attr1 = fluid.ParamAttr(
             name='conv2d_1.weight',
@@ -60,32 +61,29 @@ def build_fused_program(self,
         with fluid.program_guard(main_program, startup_program):
             x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
             y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-            conv1_1 = fluid.layers.conv2d(
-                input=x,
-                filter_size=3,
-                num_filters=32,
-                stride=1,
-                padding=1,
-                act=None,
-                param_attr=self.conv_param_attr1,
-                bias_attr=False,
-                data_format='NHWC')
-            conv1_2 = fluid.layers.conv2d(
-                input=x,
-                filter_size=3,
-                num_filters=32,
-                stride=1,
-                padding=1,
-                act=None,
-                param_attr=self.conv_param_attr2,
-                bias_attr=False,
-                data_format='NHWC')
-            bn = fluid.layers.batch_norm(
-                input=conv1_1,
-                param_attr=self.bn_param_attr1,
-                bias_attr=self.bn_bias_attr1,
-                act=None,
-                data_layout='NHWC')
+            conv1_1 = fluid.layers.conv2d(input=x,
+                                          filter_size=3,
+                                          num_filters=32,
+                                          stride=1,
+                                          padding=1,
+                                          act=None,
+                                          param_attr=self.conv_param_attr1,
+                                          bias_attr=False,
+                                          data_format='NHWC')
+            conv1_2 = fluid.layers.conv2d(input=x,
+                                          filter_size=3,
+                                          num_filters=32,
+                                          stride=1,
+                                          padding=1,
+                                          act=None,
+                                          param_attr=self.conv_param_attr2,
+                                          bias_attr=False,
+                                          data_format='NHWC')
+            bn = fluid.layers.batch_norm(input=conv1_1,
+                                         param_attr=self.bn_param_attr1,
+                                         bias_attr=self.bn_bias_attr1,
+                                         act=None,
+                                         data_layout='NHWC')
             fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(
                 conv1_2,
                 bn,
@@ -112,37 +110,33 @@ def build_origin_program(self,
         with fluid.program_guard(main_program, startup_program):
             x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
             y = fluid.layers.data(name="y", shape=[1], dtype='int64')
-            conv1_1 = fluid.layers.conv2d(
-                input=x,
-                filter_size=3,
-                num_filters=32,
-                stride=1,
-                padding=1,
-                act=None,
-                param_attr=self.conv_param_attr1,
-                bias_attr=False,
-                data_format='NHWC')
-            bn1 = fluid.layers.batch_norm(
-                input=conv1_1,
-                param_attr=self.bn_param_attr1,
-                bias_attr=self.bn_bias_attr1,
-                act=None,
-                data_layout='NHWC')
-            conv1_2 = fluid.layers.conv2d(
-                input=conv1_1,
-                filter_size=1,
-                num_filters=32,
-                stride=1,
-                act=None,
-                param_attr=self.conv_param_attr2,
-                bias_attr=False,
-                data_format='NHWC')
-            bn2 = fluid.layers.batch_norm(
-                input=conv1_1,
-                param_attr=self.bn_param_attr2,
-                bias_attr=self.bn_bias_attr2,
-                act=None,
-                data_layout='NHWC')
+            conv1_1 = fluid.layers.conv2d(input=x,
+                                          filter_size=3,
+                                          num_filters=32,
+                                          stride=1,
+                                          padding=1,
+                                          act=None,
+                                          param_attr=self.conv_param_attr1,
+                                          bias_attr=False,
+                                          data_format='NHWC')
+            bn1 = fluid.layers.batch_norm(input=conv1_1,
+                                          param_attr=self.bn_param_attr1,
+                                          bias_attr=self.bn_bias_attr1,
+                                          act=None,
+                                          data_layout='NHWC')
+            conv1_2 = fluid.layers.conv2d(input=conv1_1,
+                                          filter_size=1,
+                                          num_filters=32,
+                                          stride=1,
+                                          act=None,
+                                          param_attr=self.conv_param_attr2,
+                                          bias_attr=False,
+                                          data_format='NHWC')
+            bn2 = fluid.layers.batch_norm(input=conv1_1,
+                                          param_attr=self.bn_param_attr2,
+                                          bias_attr=self.bn_bias_attr2,
+                                          act=None,
+                                          data_layout='NHWC')
             out = bn1 + bn2
             out = fluid.layers.relu(out)
             prediction = fluid.layers.fc(input=out,
@@ -186,8 +180,10 @@ def check(self, place, use_cuda):
                 x_data.append(x)
                 y_data.append(y)
                 loss_v = exe.run(binary_fused,
-                                 feed={"x": x,
-                                       "y": y},
+                                 feed={
+                                     "x": x,
+                                     "y": y
+                                 },
                                  fetch_list=[loss])
                 loss_vals_fused.append(loss_v[0][0])
 
@@ -202,8 +198,10 @@ def check(self, place, use_cuda):
             exe.run(startup_program)
             for i in range(iters):
                 loss_v = exe.run(binary,
-                                 feed={"x": x_data[i],
-                                       "y": y_data[i]},
+                                 feed={
+                                     "x": x_data[i],
+                                     "y": y_data[i]
+                                 },
                                  fetch_list=[loss])
                 loss_vals.append(loss_v[0][0])
 
@@ -220,8 +218,9 @@ def test_fuse_bn_add_act_API(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
         place = fluid.CUDAPlace(0)
-        x, y, loss = self.build_fused_program(
-            main_program, startup_program, use_cuda=True)
+        x, y, loss = self.build_fused_program(main_program,
+                                              startup_program,
+                                              use_cuda=True)
         exe = fluid.Executor(place)
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
@@ -230,8 +229,10 @@ def test_fuse_bn_add_act_API(self):
                 x = np.random.random((4, 1, 28, 28)).astype("float32")
                 y = np.random.random((4, 1)).astype("int64")
                 loss_v = exe.run(main_program,
-                                 feed={"x": x,
-                                       "y": y},
+                                 feed={
+                                     "x": x,
+                                     "y": y
+                                 },
                                  fetch_list=[loss])
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index 6c3fa9e61d240..97fa40a89de41 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -21,6 +21,7 @@
 
 
 class TestMNIST(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -43,8 +44,10 @@ def _optimizer(learning_rate=1e-6):
         # add enable_inplace=False here to force pass the unittest
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             fuse_elewise_add_act_ops=False,
             use_ir_memory_optimize=False,
@@ -52,8 +55,10 @@ def _optimizer(learning_rate=1e-6):
             optimizer=_optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             fuse_elewise_add_act_ops=True,
             use_ir_memory_optimize=False,
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
index 00d91b1fab0f1..29bfca4dd786b 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
@@ -47,6 +47,7 @@ def verify_node_count(graph, node_name, target_count):
 
 
 class MultiFCLayer(paddle.nn.Layer):
+
     def __init__(self, hidden, Activation):
         super(MultiFCLayer, self).__init__()
         self.linear1 = paddle.nn.Linear(hidden, 4 * hidden)
@@ -76,6 +77,7 @@ def forward(self, x, matmul_y, ele_y):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueFWDBase(unittest.TestCase):
+
     def setUp(self):
         self.batch = 64
         self.seqlen = 128
@@ -87,16 +89,17 @@ def setUp(self):
         self.startup_prog = paddle.static.Program()
 
         with paddle.static.program_guard(self.main_prog, self.startup_prog):
-            data = paddle.static.data(
-                name="_data",
-                shape=[-1, self.seqlen, self.hidden],
-                dtype='float32')
-            matmul_y = paddle.static.data(
-                name="_matmul_y",
-                shape=[1, self.hidden, self.hidden],
-                dtype='float32')
-            ele_y = paddle.static.data(
-                name="_ele_y", shape=[self.hidden, ], dtype='float32')
+            data = paddle.static.data(name="_data",
+                                      shape=[-1, self.seqlen, self.hidden],
+                                      dtype='float32')
+            matmul_y = paddle.static.data(name="_matmul_y",
+                                          shape=[1, self.hidden, self.hidden],
+                                          dtype='float32')
+            ele_y = paddle.static.data(name="_ele_y",
+                                       shape=[
+                                           self.hidden,
+                                       ],
+                                       dtype='float32')
 
             multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0])
             with paddle.static.amp.fp16_guard():
@@ -131,10 +134,9 @@ def _test_output(self):
         build_strategy = paddle.static.BuildStrategy()
         build_strategy.fuse_gemm_epilogue = True
         program = paddle.static.CompiledProgram(self.main_prog)
-        program = program.with_data_parallel(
-            loss_name=self.loss.name,
-            build_strategy=build_strategy,
-            places=paddle.static.cuda_places())
+        program = program.with_data_parallel(loss_name=self.loss.name,
+                                             build_strategy=build_strategy,
+                                             places=paddle.static.cuda_places())
 
         result = self.exe.run(program,
                               feed=self.feed,
@@ -144,8 +146,8 @@ def _test_output(self):
             "[{}] outputs are miss-matched.".format(type(self).__name__))
         self.assertTrue(
             verify_node_count(program._graph, "fused_gemm_epilogue", 3),
-            "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.".
-            format(type(self).__name__))
+            "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph."
+            .format(type(self).__name__))
         act_fwd_name = self._get_act_type()[1]
         self.assertTrue(
             verify_node_count(program._graph, act_fwd_name, 1),
@@ -163,6 +165,7 @@ def _get_act_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueReluFWDFP32(TestFuseGemmEpilogueFWDBase):
+
     def _pre_test_hooks(self):
         self.atol = 1e-3
         self.rtol = 1e-2
@@ -177,6 +180,7 @@ def test_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueReluFWDFP16(TestFuseGemmEpilogueReluFWDFP32):
+
     def _pre_test_hooks(self):
         self.atol = 1e-3
         self.rtol = 1e-2
@@ -193,6 +197,7 @@ def _pre_test_hooks(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGeluFWDFP32(TestFuseGemmEpilogueFWDBase):
+
     def _pre_test_hooks(self):
         self.atol = 1e-4
         self.rtol = 1e-3
@@ -207,6 +212,7 @@ def test_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGeluFWDFP16(TestFuseGemmEpilogueGeluFWDFP32):
+
     def _pre_test_hooks(self):
         self.atol = 1e-3
         self.rtol = 1e-2
@@ -223,6 +229,7 @@ def _pre_test_hooks(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueBWDBase(unittest.TestCase):
+
     def setUp(self):
         self.batch = 64
         self.seqlen = 128
@@ -234,16 +241,17 @@ def setUp(self):
         self.startup_prog = paddle.static.Program()
 
         with paddle.static.program_guard(self.main_prog, self.startup_prog):
-            data = paddle.static.data(
-                name="_data",
-                shape=[-1, self.seqlen, self.hidden],
-                dtype='float32')
-            matmul_y = paddle.static.data(
-                name="_matmul_y",
-                shape=[1, self.hidden, self.hidden],
-                dtype='float32')
-            ele_y = paddle.static.data(
-                name="_ele_y", shape=[self.hidden, ], dtype='float32')
+            data = paddle.static.data(name="_data",
+                                      shape=[-1, self.seqlen, self.hidden],
+                                      dtype='float32')
+            matmul_y = paddle.static.data(name="_matmul_y",
+                                          shape=[1, self.hidden, self.hidden],
+                                          dtype='float32')
+            ele_y = paddle.static.data(name="_ele_y",
+                                       shape=[
+                                           self.hidden,
+                                       ],
+                                       dtype='float32')
 
             multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0])
             with paddle.static.amp.fp16_guard():
@@ -289,10 +297,9 @@ def _test_output(self):
         build_strategy = paddle.static.BuildStrategy()
         build_strategy.fuse_gemm_epilogue = True
         program = paddle.static.CompiledProgram(self.main_prog)
-        program = program.with_data_parallel(
-            loss_name=self.loss.name,
-            build_strategy=build_strategy,
-            places=paddle.static.cuda_places())
+        program = program.with_data_parallel(loss_name=self.loss.name,
+                                             build_strategy=build_strategy,
+                                             places=paddle.static.cuda_places())
 
         outs_res = self.exe.run(program, feed=self.feed, fetch_list=self.fetch)
 
@@ -303,12 +310,12 @@ def _test_output(self):
 
         self.assertTrue(
             verify_node_count(program._graph, "fused_gemm_epilogue", 3),
-            "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.".
-            format(type(self).__name__))
+            "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph."
+            .format(type(self).__name__))
         self.assertTrue(
             verify_node_count(program._graph, "fused_gemm_epilogue_grad", 3),
-            "[{}] The number of fused_gemm_epilogue_grad is miss-matched in the computing graph.".
-            format(type(self).__name__))
+            "[{}] The number of fused_gemm_epilogue_grad is miss-matched in the computing graph."
+            .format(type(self).__name__))
         _, act_fwd_name, act_bwd_name = self._get_act_type()
         self.assertTrue(
             verify_node_count(program._graph, act_fwd_name, 1),
@@ -330,6 +337,7 @@ def _get_act_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueReLUBWDFP32(TestFuseGemmEpilogueBWDBase):
+
     def _pre_test_hooks(self):
         self.atol = 1e-4
         self.rtol = 1e-3
@@ -344,6 +352,7 @@ def test_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueReLUBWDFP16(TestFuseGemmEpilogueReLUBWDFP32):
+
     def _pre_test_hooks(self):
         self.atol = 1e-3
         self.rtol = 1e-2
@@ -360,6 +369,7 @@ def _pre_test_hooks(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGeLUBWDFP32(TestFuseGemmEpilogueBWDBase):
+
     def _pre_test_hooks(self):
         self.atol = 5e-4
         self.rtol = 1e-3
@@ -374,6 +384,7 @@ def test_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGeLUBWDFP16(TestFuseGemmEpilogueGeLUBWDFP32):
+
     def _pre_test_hooks(self):
         self.atol = 1e-3
         self.rtol = 1e-2
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
index 51c06bb79d728..b1451e83f9ce7 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -24,6 +24,7 @@
 
 
 class TestFuseOptimizationOps(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -63,36 +64,41 @@ def _compare_fused_optimizer_ops(self,
 
     def _decorate_compare_fused_optimizer_ops(self, model, use_device,
                                               optimizer):
-        self._compare_fused_optimizer_ops(
-            model,
-            use_device,
-            feed_dict=self._get_feed_dict(),
-            optimizer=optimizer)
+        self._compare_fused_optimizer_ops(model,
+                                          use_device,
+                                          feed_dict=self._get_feed_dict(),
+                                          optimizer=optimizer)
 
 
 class TestFuseAdamOps(TestFuseOptimizationOps):
+
     def optimizer(self, learning_rate=1e-4):
         return fluid.optimizer.Adam(learning_rate=learning_rate)
 
     def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CUDA,
+                                                   optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CPU,
+                                                   optimizer=self.optimizer)
 
 
 class TestFuseSGDOps(TestFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)
 
 
 class TestFuseMomentumOps(TestFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
+        return fluid.optimizer.Momentum(learning_rate=learning_rate,
+                                        momentum=0.1)
 
 
 class TestSpareFuseAdamOps(TestFuseOptimizationOps):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -120,24 +126,29 @@ def optimizer(self, learning_rate=1e-4):
 
     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.CUDA, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.CPU, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(model,
+                                                   DeviceType.CUDA,
+                                                   optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(model,
+                                                   DeviceType.CPU,
+                                                   optimizer=self.optimizer)
 
 
 class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)
 
 
 class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
+        return fluid.optimizer.Momentum(learning_rate=learning_rate,
+                                        momentum=0.1)
 
 
 class TestPassConflictBase(TestFuseAdamOps):
+
     def _compare_fused_optimizer_ops(self,
                                      model,
                                      use_device,
@@ -147,36 +158,40 @@ def _compare_fused_optimizer_ops(self,
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
 
-        self.check_pass_conflict(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_device=use_device,
-            fuse_all_optimizer_ops=True,
-            optimizer=optimizer,
-            enable_sequential_execution=True)
+        self.check_pass_conflict(model,
+                                 feed_dict=feed_dict,
+                                 get_data_from_feeder=get_data_from_feeder,
+                                 use_device=use_device,
+                                 fuse_all_optimizer_ops=True,
+                                 optimizer=optimizer,
+                                 enable_sequential_execution=True)
 
 
 class TestFuseAdamOpsPassConflict(TestPassConflictBase):
+
     def optimizer(self, learning_rate=1e-4):
         return fluid.optimizer.Adam(learning_rate=learning_rate)
 
     def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CPU,
+                                                   optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CUDA,
+                                                   optimizer=self.optimizer)
 
 
 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
+
     def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)
 
 
 class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):
+
     def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
+        return fluid.optimizer.Momentum(learning_rate=learning_rate,
+                                        momentum=0.1)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
index d391b04aa4772..a86ca3e31f694 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -28,21 +28,25 @@ def norm(*args, **kargs):
 
 def sep_conv(input, channel, stride, filter, dilation=1, act=None):
     # with scope('depthwise'):
-    input = fluid.layers.conv2d(
-        input,
-        input.shape[1],
-        filter,
-        stride,
-        groups=input.shape[1],
-        padding=(filter // 2) * dilation,
-        dilation=dilation,
-        use_cudnn=False,
-        bias_attr=False)
+    input = fluid.layers.conv2d(input,
+                                input.shape[1],
+                                filter,
+                                stride,
+                                groups=input.shape[1],
+                                padding=(filter // 2) * dilation,
+                                dilation=dilation,
+                                use_cudnn=False,
+                                bias_attr=False)
     input = norm(input)
     if act: input = act(input)
     # with scope('pointwise'):
-    input = fluid.layers.conv2d(
-        input, channel, 1, 1, groups=1, padding=0, bias_attr=False)
+    input = fluid.layers.conv2d(input,
+                                channel,
+                                1,
+                                1,
+                                groups=1,
+                                padding=0,
+                                bias_attr=False)
     input = norm(input)
     if act: input = act(input)
     return input
@@ -63,6 +67,7 @@ def simple_depthwise_net(use_feed):
 
 
 class TestMNIST(TestParallelExecutorBase):
+
     def _init_data(self, random=True):
         np.random.seed(5)
         if random:
@@ -88,16 +93,20 @@ def _optimizer(learning_rate=1e-6):
 
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             fuse_relu_depthwise_conv=True,
             use_ir_memory_optimize=True,
             optimizer=_optimizer)
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             fuse_relu_depthwise_conv=False,
             optimizer=_optimizer)
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
index 445620f9e1cb1..6507cc1ee3258 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
@@ -27,12 +27,14 @@
 import unittest
 from op_test import OpTest
 from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph
+
 _enable_legacy_dygraph()
 
 default_main_program().random_seed = 42
 
 
 class TestFusedAttentionOp(OpTest):
+
     def setUp(self):
         self.config()
         self.generate_input_data()
@@ -52,26 +54,22 @@ def setUp(self):
         self.__class__.op_type = "fused_attention"
         # use autograd to check grad in this unittest.
         self.__class__.no_need_check_grad = True
-        self.q_proj = Linear(
-            self.embed_dim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
-        self.k_proj = Linear(
-            self.kdim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
-        self.v_proj = Linear(
-            self.vdim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
-        self.out_proj = Linear(
-            self.embed_dim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
+        self.q_proj = Linear(self.embed_dim,
+                             self.embed_dim,
+                             self.weight_attr,
+                             bias_attr=self.bias_attr)
+        self.k_proj = Linear(self.kdim,
+                             self.embed_dim,
+                             self.weight_attr,
+                             bias_attr=self.bias_attr)
+        self.v_proj = Linear(self.vdim,
+                             self.embed_dim,
+                             self.weight_attr,
+                             bias_attr=self.bias_attr)
+        self.out_proj = Linear(self.embed_dim,
+                               self.embed_dim,
+                               self.weight_attr,
+                               bias_attr=self.bias_attr)
         paddle.set_default_dtype(np.float32)
         self.norm1 = LayerNorm(self.embed_dim)
         self.norm2 = LayerNorm(self.embed_dim)
@@ -116,10 +114,9 @@ def generate_input_data(self):
 
         if self.has_attn_mask:
             # [B, n_head, seq_len, out_seq_len]
-            self.attn_mask = np.ones(
-                (self.batch_size, self.num_heads, self.query_length,
-                 out_seq_len),
-                dtype=self.attn_mask_type)
+            self.attn_mask = np.ones((self.batch_size, self.num_heads,
+                                      self.query_length, out_seq_len),
+                                     dtype=self.attn_mask_type)
             if self.attn_mask_type == np.int64:
                 self.attn_mask = np.tril(self.attn_mask)
             elif self.attn_mask_type == np.float64:
@@ -174,8 +171,10 @@ def GetBaselineOut(self):
 
         # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
         # --> [B, n_head, seq_len, out_seq_len]
-        qk_out = layers.matmul(
-            x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5)
+        qk_out = layers.matmul(x=q_out,
+                               y=k_out,
+                               transpose_y=True,
+                               alpha=self.head_dim**-0.5)
 
         if attn_mask is not None:
             attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
@@ -185,11 +184,10 @@ def GetBaselineOut(self):
             softmax_out = F.softmax(qk_out)
 
         if self.dropout_prob:
-            dropout_out = F.dropout(
-                softmax_out,
-                self.dropout_prob,
-                training=self.training,
-                mode="upscale_in_train")
+            dropout_out = F.dropout(softmax_out,
+                                    self.dropout_prob,
+                                    training=self.training,
+                                    mode="upscale_in_train")
             # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
             # --> [B, n_head, seq_len, head_dim]
             qktv_out = tensor.matmul(dropout_out, v_out)
@@ -210,37 +208,37 @@ def GetBaselineOut(self):
         if self.has_cache_kv:
             return final_out
 
-        paddle.autograd.backward(
-            [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
+        paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
+                                 retain_graph=True)
         return final_out, tensor_query.grad
 
     def GetFusedAttentionOut(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
-        q_proj_weight = paddle.to_tensor(
-            self.q_proj.weight, stop_gradient=False)
-        k_proj_weight = paddle.to_tensor(
-            self.k_proj.weight, stop_gradient=False)
-        v_proj_weight = paddle.to_tensor(
-            self.v_proj.weight, stop_gradient=False)
-        out_linear_weight = paddle.to_tensor(
-            self.out_proj.weight, stop_gradient=False)
+        q_proj_weight = paddle.to_tensor(self.q_proj.weight,
+                                         stop_gradient=False)
+        k_proj_weight = paddle.to_tensor(self.k_proj.weight,
+                                         stop_gradient=False)
+        v_proj_weight = paddle.to_tensor(self.v_proj.weight,
+                                         stop_gradient=False)
+        out_linear_weight = paddle.to_tensor(self.out_proj.weight,
+                                             stop_gradient=False)
 
         if self.bias_attr is False:
             qkv_bias_tensor = None
             out_linear_bias = None
         else:
-            q_proj_bias = paddle.to_tensor(
-                self.q_proj.bias, stop_gradient=False)
-            k_proj_bias = paddle.to_tensor(
-                self.k_proj.bias, stop_gradient=False)
-            v_proj_bias = paddle.to_tensor(
-                self.v_proj.bias, stop_gradient=False)
+            q_proj_bias = paddle.to_tensor(self.q_proj.bias,
+                                           stop_gradient=False)
+            k_proj_bias = paddle.to_tensor(self.k_proj.bias,
+                                           stop_gradient=False)
+            v_proj_bias = paddle.to_tensor(self.v_proj.bias,
+                                           stop_gradient=False)
             qkv_bias = np.concatenate(
                 (q_proj_bias.numpy(), k_proj_bias.numpy(), v_proj_bias.numpy()))
             qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim))
             qkv_bias_tensor = paddle.to_tensor(qkv_bias, stop_gradient=False)
-            out_linear_bias = paddle.to_tensor(
-                self.out_proj.bias, stop_gradient=False)
+            out_linear_bias = paddle.to_tensor(self.out_proj.bias,
+                                               stop_gradient=False)
 
         ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
         ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
@@ -278,32 +276,39 @@ def GetFusedAttentionOut(self):
         if self.has_cache_kv:
             return final_out[0], final_out[1]
 
-        paddle.autograd.backward(
-            [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
+        paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
+                                 retain_graph=True)
         return final_out, x.grad
 
     def test_fused_attention_op(self):
         final_out_ref, x_grad_ref = self.GetBaselineOut()
         final_out, x_grad = self.GetFusedAttentionOut()
-        np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol)
-        np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol)
+        np.testing.assert_allclose(final_out_ref,
+                                   final_out.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
+        np.testing.assert_allclose(x_grad_ref,
+                                   x_grad.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
 
 class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp):
+
     def config(self):
         super().config()
         self.bias_attr = False
 
 
 class TestFusedAttentionOpPreLn(TestFusedAttentionOp):
+
     def config(self):
         super().config()
         self.pre_layer_norm = True
 
 
 class TestFusedAttentionOpNoneAttnMask(TestFusedAttentionOp):
+
     def config(self):
         super().config()
         self.pre_layer_norm = True
@@ -311,6 +316,7 @@ def config(self):
 
 
 class TestFusedAttentionOpFp16(TestFusedAttentionOp):
+
     def config(self):
         super().config()
         self.x_type = np.float16
@@ -318,13 +324,18 @@ def config(self):
     def test_fused_attention_op(self):
         final_out_ref, x_grad_ref = self.GetBaselineOut()
         final_out, x_grad = self.GetFusedAttentionOut()
-        np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol)
-        np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol)
+        np.testing.assert_allclose(final_out_ref,
+                                   final_out.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
+        np.testing.assert_allclose(x_grad_ref,
+                                   x_grad.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
 
 class TestFusedAttentionOpCacheKV(TestFusedAttentionOp):
+
     def config(self):
         super().config()
         self.has_cache_kv = True
@@ -336,11 +347,10 @@ def test_fused_attention_op(self):
         with paddle.no_grad():
             final_out_ref = self.GetBaselineOut()
             final_out, cache_kv_out = self.GetFusedAttentionOut()
-            np.testing.assert_allclose(
-                final_out_ref,
-                final_out.numpy(),
-                rtol=self.rtol,
-                atol=self.atol)
+            np.testing.assert_allclose(final_out_ref,
+                                       final_out.numpy(),
+                                       rtol=self.rtol,
+                                       atol=self.atol)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
index 74dc9351a25b4..89689942a0274 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
@@ -44,8 +44,8 @@ def softmax(x):
 def batch_matmul(x, y):
     assert x.shape[0] == y.shape[0]
     assert x.shape[1] == y.shape[1]
-    retval = np.zeros(
-        (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64)
+    retval = np.zeros((x.shape[0], x.shape[1], x.shape[2], y.shape[3]),
+                      dtype=np.float64)
     for i in range(x.shape[0]):
         for j in range(x.shape[1]):
             retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :])
@@ -90,8 +90,9 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
     head_dim = qkv_weight.shape[2]
     # embed_dim, 3, num_heads, self.head_dim
     qkv_weight = qkv_weight.transpose((3, 0, 1, 2))
-    qkv_weight = qkv_weight.reshape(qkv_weight.shape[0], qkv_weight.shape[1] *
-                                    qkv_weight.shape[2] * qkv_weight.shape[3])
+    qkv_weight = qkv_weight.reshape(
+        qkv_weight.shape[0],
+        qkv_weight.shape[1] * qkv_weight.shape[2] * qkv_weight.shape[3])
 
     if qkv_bias is not None:
         qkv_bias = qkv_bias.reshape(qkv_bias.shape[0] * qkv_bias.shape[1] *
@@ -165,6 +166,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
 
 
 class TestFusedAttentionAPI(unittest.TestCase):
+
     def setUp(self):
         self.setXType()
         self.setPreLn()
@@ -218,10 +220,9 @@ def generate_input_data(self):
         self.query = np.random.rand(self.batch_size, self.query_length,
                                     self.embed_dim).astype(self.x_type)
         if self.has_attn_mask:
-            self.attn_mask = np.ones(
-                (self.batch_size, self.num_heads, self.query_length,
-                 self.key_length),
-                dtype=self.attn_mask_type)
+            self.attn_mask = np.ones((self.batch_size, self.num_heads,
+                                      self.query_length, self.key_length),
+                                     dtype=self.attn_mask_type)
             if self.attn_mask_type == np.int64:
                 self.attn_mask = np.tril(self.attn_mask)
             elif self.attn_mask_type == np.float64:
@@ -238,18 +239,19 @@ def run_imperative(self):
             attn_mask_tensor = paddle.to_tensor(self.attn_mask)
         else:
             attn_mask_tensor = None
-        fused_attn = FusedMultiHeadAttention(
-            self.embed_dim, self.num_heads, self.dropout_prob,
-            self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
-            self.need_weight, self.weight_attr, self.bias_attr)
+        fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads,
+                                             self.dropout_prob,
+                                             self.attn_dropout_prob, self.kdim,
+                                             self.vdim, self.pre_layer_norm,
+                                             self.need_weight, self.weight_attr,
+                                             self.bias_attr)
         if self.bias_attr is not False:
-            qkv_bias = np.random.random(fused_attn.qkv_bias.shape).astype(
-                'float32')
+            qkv_bias = np.random.random(
+                fused_attn.qkv_bias.shape).astype('float32')
             fused_attn.qkv_bias.set_value(paddle.to_tensor(qkv_bias))
-        out = fused_attn(
-            paddle.to_tensor(self.query),
-            paddle.to_tensor(self.query),
-            paddle.to_tensor(self.query), attn_mask_tensor)
+        out = fused_attn(paddle.to_tensor(self.query),
+                         paddle.to_tensor(self.query),
+                         paddle.to_tensor(self.query), attn_mask_tensor)
 
         fused_attn_qkv_bias = None
         fused_attn_linear_bias = None
@@ -267,27 +269,31 @@ def run_imperative(self):
             fused_attn.ln_scale.numpy(), fused_attn_ln_bias,
             fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias,
             fused_attn.linear_weight.numpy(), fused_attn_linear_bias)
-        np.testing.assert_allclose(
-            ref_out, out.numpy(), rtol=self.rtol, atol=self.atol)
+        np.testing.assert_allclose(ref_out,
+                                   out.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
     def run_static(self):
-        fused_attn = FusedMultiHeadAttention(
-            self.embed_dim, self.num_heads, self.dropout_prob,
-            self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
-            self.need_weight, self.weight_attr, self.bias_attr)
+        fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads,
+                                             self.dropout_prob,
+                                             self.attn_dropout_prob, self.kdim,
+                                             self.vdim, self.pre_layer_norm,
+                                             self.need_weight, self.weight_attr,
+                                             self.bias_attr)
 
         x = paddle.static.data(
             name='X',
             shape=[self.batch_size, self.query_length, self.embed_dim],
             dtype=self.x_type)
         if self.has_attn_mask:
-            attn_mask = paddle.static.data(
-                name='SrcMask',
-                shape=[
-                    self.batch_size, self.num_heads, self.query_length,
-                    self.key_length
-                ],
-                dtype=self.attn_mask_type)
+            attn_mask = paddle.static.data(name='SrcMask',
+                                           shape=[
+                                               self.batch_size, self.num_heads,
+                                               self.query_length,
+                                               self.key_length
+                                           ],
+                                           dtype=self.attn_mask_type)
             final_out = fused_attn(x, x, x, attn_mask)
         else:
             final_out = fused_attn(x, x, x)
@@ -304,8 +310,10 @@ def run_static(self):
             if self.bias_attr is False:
                 out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
                     paddle.static.default_main_program(),
-                    feed={"X": self.query,
-                          "SrcMask": self.attn_mask},
+                    feed={
+                        "X": self.query,
+                        "SrcMask": self.attn_mask
+                    },
                     fetch_list=[
                         final_out, fused_attn.qkv_weight,
                         fused_attn.linear_weight, fused_attn.pre_ln_scale,
@@ -314,8 +322,10 @@ def run_static(self):
             else:
                 out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
                     paddle.static.default_main_program(),
-                    feed={"X": self.query,
-                          "SrcMask": self.attn_mask},
+                    feed={
+                        "X": self.query,
+                        "SrcMask": self.attn_mask
+                    },
                     fetch_list=[
                         final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
                         fused_attn.linear_weight, fused_attn.linear_bias,
@@ -326,7 +336,9 @@ def run_static(self):
             if self.bias_attr is False:
                 out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run(
                     paddle.static.default_main_program(),
-                    feed={"X": self.query, },
+                    feed={
+                        "X": self.query,
+                    },
                     fetch_list=[
                         final_out, fused_attn.qkv_weight,
                         fused_attn.linear_weight, fused_attn.pre_ln_scale,
@@ -335,7 +347,9 @@ def run_static(self):
             else:
                 out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
                     paddle.static.default_main_program(),
-                    feed={"X": self.query, },
+                    feed={
+                        "X": self.query,
+                    },
                     fetch_list=[
                         final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
                         fused_attn.linear_weight, fused_attn.linear_bias,
@@ -361,6 +375,7 @@ def test_dynamic_api(self):
 
 
 class TestFusedAttentionAPINoneAttnMask(TestFusedAttentionAPI):
+
     def setAttnMask(self):
         self.has_attn_mask = False
 
@@ -369,6 +384,7 @@ def setPreLn(self):
 
 
 class TestFusedAttentionAPIBiasIsNone(TestFusedAttentionAPI):
+
     def setBiasAttr(self):
         self.bias_attr = False
 
diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py
index d47450837a455..92c815a246f73 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py
@@ -32,6 +32,7 @@
 
 
 class TestFusedBiasDropoutResidualLayerNormOp(OpTest):
+
     def setUp(self):
         self.config()
         self.generate_input_data()
@@ -67,12 +68,12 @@ def generate_input_data(self):
         if self.bias_attr is False:
             self.tensor_linear_bias = None
         else:
-            self.tensor_linear_bias = paddle.to_tensor(
-                self.linear_bias, stop_gradient=False)
+            self.tensor_linear_bias = paddle.to_tensor(self.linear_bias,
+                                                       stop_gradient=False)
 
         self.tensor_x = paddle.to_tensor(self.x, stop_gradient=False)
-        self.tensor_residual = paddle.to_tensor(
-            self.residual, stop_gradient=False)
+        self.tensor_residual = paddle.to_tensor(self.residual,
+                                                stop_gradient=False)
 
     def GetBaselineOut(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
@@ -85,8 +86,8 @@ def GetBaselineOut(self):
         residual_out = self.tensor_residual + self.dropout(out)
         final_out = self.norm1(residual_out)
 
-        paddle.autograd.backward(
-            [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
+        paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
+                                 retain_graph=True)
 
         if self.tensor_linear_bias is not None:
             tensor_linear_bias_grad = self.tensor_linear_bias.grad
@@ -105,8 +106,8 @@ def GetFusedBiasDropoutResidualLayerNormOut(self):
             self.tensor_x, self.tensor_residual, self.tensor_linear_bias,
             ln_scale, ln_bias, self.dropout_prob, epsilon)
 
-        paddle.autograd.backward(
-            [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
+        paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
+                                 retain_graph=True)
         if self.tensor_linear_bias is not None:
             tensor_linear_bias_grad = self.tensor_linear_bias.grad
         else:
@@ -118,22 +119,28 @@ def test_fused_op(self):
         )
         out, x_grad, residual_grad, linear_bias_grad = self.GetFusedBiasDropoutResidualLayerNormOut(
         )
-        np.testing.assert_allclose(
-            out_ref, out.numpy(), rtol=1e-5, atol=self.atol)
-        np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=self.atol)
-        np.testing.assert_allclose(
-            residual_grad_ref, residual_grad.numpy(), rtol=1e-5, atol=self.atol)
+        np.testing.assert_allclose(out_ref,
+                                   out.numpy(),
+                                   rtol=1e-5,
+                                   atol=self.atol)
+        np.testing.assert_allclose(x_grad_ref,
+                                   x_grad.numpy(),
+                                   rtol=1e-5,
+                                   atol=self.atol)
+        np.testing.assert_allclose(residual_grad_ref,
+                                   residual_grad.numpy(),
+                                   rtol=1e-5,
+                                   atol=self.atol)
         if linear_bias_grad_ref is not None:
-            np.testing.assert_allclose(
-                linear_bias_grad_ref,
-                linear_bias_grad.numpy(),
-                rtol=1e-5,
-                atol=self.atol)
+            np.testing.assert_allclose(linear_bias_grad_ref,
+                                       linear_bias_grad.numpy(),
+                                       rtol=1e-5,
+                                       atol=self.atol)
 
 
 class TestFusedBiasDropoutResidualLayerNormOpBiasIsNone(
         TestFusedBiasDropoutResidualLayerNormOp):
+
     def config(self):
         super().config()
         self.bias_attr = False
@@ -141,6 +148,7 @@ def config(self):
 
 class TestFusedBiasDropoutResidualLayerNormOpFp16(
         TestFusedBiasDropoutResidualLayerNormOp):
+
     def config(self):
         super().config()
         self.x_type = np.float16
diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py
index 19fc3972e58d4..f0c6bd83d40dd 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py
@@ -66,6 +66,7 @@ def compute_reference(x, residual, ln_scale, ln_bias, linear_bias):
 
 
 class TestFusedBiasDropoutResidualLayerNormAPI(unittest.TestCase):
+
     def setUp(self):
         self.setXType()
         self.setBiasAttr()
@@ -99,12 +100,13 @@ def run_imperative(self):
 
         linear_bias = None
         if self.bias_attr is not False:
-            linear_bias = np.random.random(fused_bias_dropout_residual_ln.
-                                           linear_bias.shape).astype('float32')
+            linear_bias = np.random.random(
+                fused_bias_dropout_residual_ln.linear_bias.shape).astype(
+                    'float32')
             fused_bias_dropout_residual_ln.linear_bias.set_value(
                 paddle.to_tensor(linear_bias))
-        out = fused_bias_dropout_residual_ln(
-            paddle.to_tensor(self.x), paddle.to_tensor(self.residual))
+        out = fused_bias_dropout_residual_ln(paddle.to_tensor(self.x),
+                                             paddle.to_tensor(self.residual))
 
         ln_bias = None
         if self.bias_attr is not False:
@@ -112,12 +114,16 @@ def run_imperative(self):
         ln_scale = fused_bias_dropout_residual_ln.ln_scale.numpy(),
         ref_out = compute_reference(self.x, self.residual, ln_scale, ln_bias,
                                     linear_bias)
-        np.testing.assert_allclose(
-            ref_out, out.numpy(), rtol=1e-5, atol=self.atol)
+        np.testing.assert_allclose(ref_out,
+                                   out.numpy(),
+                                   rtol=1e-5,
+                                   atol=self.atol)
 
     def run_static(self):
-        fused_op = FusedBiasDropoutResidualLayerNorm(
-            self.embed_dim, self.dropout_prob, self.weight_attr, self.bias_attr)
+        fused_op = FusedBiasDropoutResidualLayerNorm(self.embed_dim,
+                                                     self.dropout_prob,
+                                                     self.weight_attr,
+                                                     self.bias_attr)
 
         x = paddle.static.data(
             name='X',
@@ -136,16 +142,19 @@ def run_static(self):
         linear_bias = None
         ln_bias = None
         if self.bias_attr is False:
-            out, ln_scale = exe.run(
-                paddle.static.default_main_program(),
-                feed={"X": self.x,
-                      "Residual": self.residual},
-                fetch_list=[final_out, fused_op.ln_scale])
+            out, ln_scale = exe.run(paddle.static.default_main_program(),
+                                    feed={
+                                        "X": self.x,
+                                        "Residual": self.residual
+                                    },
+                                    fetch_list=[final_out, fused_op.ln_scale])
         else:
             out, linear_bias, ln_scale, ln_bias = exe.run(
                 paddle.static.default_main_program(),
-                feed={"X": self.x,
-                      "Residual": self.residual},
+                feed={
+                    "X": self.x,
+                    "Residual": self.residual
+                },
                 fetch_list=[
                     final_out, fused_op.linear_bias, fused_op.ln_scale,
                     fused_op.ln_bias
@@ -167,6 +176,7 @@ def test_dynamic_api(self):
 
 class TestFusedBiasDropoutResidualLayerNormAPIBiasIsNone(
         TestFusedBiasDropoutResidualLayerNormAPI):
+
     def setBiasAttr(self):
         self.bias_attr = False
 
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
index ba9e05470e3d8..07a2e28f678b1 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
@@ -38,7 +38,9 @@ def create_test_class(test_case,
                       attrs,
                       dtype=np.float32,
                       grad_chek=True):
+
     class TestFusedElementwiseActivationOp_base(OpTest):
+
         def setUp(self):
             self.op_type = "fused_elemwise_activation"
             self.dtype = dtype
@@ -73,7 +75,9 @@ def init_output(self):
                 callback(self.x, self.y, self.x, self.y)
 
         def init_attr(self):
-            self.attrs = {'axis': self.axis, }
+            self.attrs = {
+                'axis': self.axis,
+            }
             for key in attrs.keys():
                 self.attrs[key] = attrs[key]
 
@@ -98,50 +102,50 @@ def test_check_grad_ingore_x(self):
             if not grad_chek:
                 return
             if self.attrs["save_intermediate_out"]:
-                self.check_grad(
-                    ['Y'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("X"))
+                self.check_grad(['Y'], ['Out'],
+                                max_relative_error=0.005,
+                                no_grad_set=set("X"))
             else:
-                self.check_grad(
-                    ['Y'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("X"))
+                self.check_grad(['Y'], ['Out'],
+                                max_relative_error=0.005,
+                                no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
             if not grad_chek:
                 return
             if self.attrs["save_intermediate_out"]:
-                self.check_grad(
-                    ['X'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("Y"))
+                self.check_grad(['X'], ['Out'],
+                                max_relative_error=0.005,
+                                no_grad_set=set("Y"))
             else:
-                self.check_grad(
-                    ['X'], ['Out'],
-                    max_relative_error=0.005,
-                    no_grad_set=set("Y"))
+                self.check_grad(['X'], ['Out'],
+                                max_relative_error=0.005,
+                                no_grad_set=set("Y"))
 
     class TestFusedElementwiseActivationOp_scalar(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(1).astype(self.dtype)
 
     class TestFusedElementwiseActivationOp_scalar2(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(1, 1).astype(self.dtype)
 
     class TestFusedElementwiseActivationOp_Vector(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.random((32, )).astype(self.dtype)
             self.y = np.random.random((32, )).astype(self.dtype)
 
     class TestFusedElementwiseActivationOp_broadcast_0(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(2).astype(self.dtype)
@@ -153,6 +157,7 @@ def init_output(self):
 
     class TestFusedElementwiseActivationOp_broadcast_1(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(3).astype(self.dtype)
@@ -164,6 +169,7 @@ def init_output(self):
 
     class TestFusedElementwiseActivationOp_broadcast_2(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(4).astype(self.dtype)
@@ -174,6 +180,7 @@ def init_output(self):
 
     class TestFusedElementwiseActivationOp_broadcast_3(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
             self.y = np.random.rand(3, 4).astype(self.dtype)
@@ -185,6 +192,7 @@ def init_output(self):
 
     class TestFusedElementwiseActivationOp_broadcast_4(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
             self.y = np.random.rand(2, 1).astype(self.dtype)
@@ -196,6 +204,7 @@ def init_output(self):
 
     class TestFusedElementwiseActivationOp_rowwise_add_0(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(3, 4).astype(self.dtype)
@@ -207,6 +216,7 @@ def init_output(self):
 
     class TestFusedElementwiseActivationOp_rowwise_add_1(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(2, 1).astype(self.dtype)
             self.y = np.random.rand(1).astype(self.dtype)
@@ -218,6 +228,7 @@ def init_output(self):
 
     class TestFusedElementwiseActivationOp_channelwise_add(
             TestFusedElementwiseActivationOp_base):
+
         def init_input(self):
             self.x = np.random.rand(3, 20, 20).astype(self.dtype)
             self.y = np.random.rand(3, 1, 1).astype(self.dtype)
@@ -253,9 +264,9 @@ def init_input(self):
               "_rowwise_add_0"] = TestFusedElementwiseActivationOp_rowwise_add_0
     globals()[test_case +
               "_rowwise_add_1"] = TestFusedElementwiseActivationOp_rowwise_add_1
-    globals(
-    )[test_case +
-      "_channelwise_add"] = TestFusedElementwiseActivationOp_channelwise_add
+    globals()[
+        test_case +
+        "_channelwise_add"] = TestFusedElementwiseActivationOp_channelwise_add
 
 
 def scale_add_func(x, y, x_bcast, y_bcast, scale, mode=0):
@@ -330,33 +341,39 @@ def gelu_add_func(x, y, x_bcast, y_bcast, mode=0):
     for save_intermediate_out in {True, False}:
         suffix = ("_save_intermediate_out" if save_intermediate_out else "") \
                  + ("_mode_"+ str(mode))
-        create_test_class('scale_add' + suffix, scale_add_func, {
-            'scale': scale,
-            'functor_list': ["scale", "elementwise_add"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('add_scale' + suffix, add_scale_func, {
-            'scale': scale,
-            'functor_list': ["elementwise_add", "scale"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('add_relu' + suffix, add_relu_func, {
-            'functor_list': ["elementwise_add", "relu"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('relu_add' + suffix, relu_add_func, {
-            'functor_list': ["relu", "elementwise_add"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('mul_scale' + suffix, mul_scale_func, {
-            'scale': scale,
-            'functor_list': ["elementwise_mul", "scale"],
-            'save_intermediate_out': save_intermediate_out,
-        })
-        create_test_class('gelu_add' + suffix, gelu_add_func, {
-            'functor_list': ["gelu", "elementwise_add"],
-            'save_intermediate_out': save_intermediate_out,
-        })
+        create_test_class(
+            'scale_add' + suffix, scale_add_func, {
+                'scale': scale,
+                'functor_list': ["scale", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            })
+        create_test_class(
+            'add_scale' + suffix, add_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_add", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            })
+        create_test_class(
+            'add_relu' + suffix, add_relu_func, {
+                'functor_list': ["elementwise_add", "relu"],
+                'save_intermediate_out': save_intermediate_out,
+            })
+        create_test_class(
+            'relu_add' + suffix, relu_add_func, {
+                'functor_list': ["relu", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            })
+        create_test_class(
+            'mul_scale' + suffix, mul_scale_func, {
+                'scale': scale,
+                'functor_list': ["elementwise_mul", "scale"],
+                'save_intermediate_out': save_intermediate_out,
+            })
+        create_test_class(
+            'gelu_add' + suffix, gelu_add_func, {
+                'functor_list': ["gelu", "elementwise_add"],
+                'save_intermediate_out': save_intermediate_out,
+            })
 
         if core.is_compiled_with_cuda():
             create_test_class(
diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
index d756394535a9e..7d06ae3e1345b 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
@@ -28,6 +28,7 @@
 @skip_check_grad_ci(reason="check_grad is called when ver.mkl() == ON"
                     "and 'Linux' in platform.platform().")
 class TestFusedEmbeddingSeqPoolOp(OpTest):
+
     def setUp(self):
         self.op_type = "fused_embedding_seq_pool"
         self.emb_size = 6
@@ -39,7 +40,8 @@ def setUp(self):
         self.attrs = {'is_sparse': True}
         self.inputs = {'W': self.table, 'Ids': (ids_expand, self.lod)}
         self.outputs = {
-            'Out': np.reshape(
+            'Out':
+            np.reshape(
                 np.array([
                     self.table[[4, 3]] + self.table[[4, 3]] +
                     self.table[[2, 1]], self.table[[16, 1]]
@@ -54,11 +56,14 @@ def test_check_grad(self):
         # TODO(wangzhongpu): support lod in dygraph mode
         if ver.mkl() == "ON" and 'Linux' in platform.platform():
             self.attrs = {'is_sparse': False}
-            self.check_grad(
-                ['W'], 'Out', no_grad_set=['Ids'], check_dygraph=False)
+            self.check_grad(['W'],
+                            'Out',
+                            no_grad_set=['Ids'],
+                            check_dygraph=False)
 
 
 class TestLookupTableOpWithPadding(TestFusedEmbeddingSeqPoolOp):
+
     def test_check_output(self):
         if ver.mkl() == "ON" and 'Linux' in platform.platform():
             ids = np.squeeze(self.ids, axis=2)
@@ -75,8 +80,9 @@ def test_check_output(self):
                 output.append(np.sum(out, 0))
                 index += count
             self.outputs = {
-                'Out': np.reshape(
-                    np.array(output), [len(self.lod[0]), 2 * self.emb_size])
+                'Out':
+                np.reshape(np.array(output),
+                           [len(self.lod[0]), 2 * self.emb_size])
             }
             self.attrs = {'padding_idx': int(padding_idx)}
             # TODO(wangzhongpu): support lod in dygraph mode
@@ -88,18 +94,23 @@ def test_check_grad(self):
             padding_idx = np.random.choice(ids.flatten(), 1)[0]
             self.attrs = {'padding_idx': int(padding_idx), 'is_sparse': False}
             # TODO(wangzhongpu): support lod in dygraph mode
-            self.check_grad(
-                ['W'], 'Out', no_grad_set=['Ids'], check_dygraph=False)
+            self.check_grad(['W'],
+                            'Out',
+                            no_grad_set=['Ids'],
+                            check_dygraph=False)
 
 
 class TestFusedEmbeddingSeqPoolApi(unittest.TestCase):
+
     def test_api(self):
         if ver.mkl() == "ON" and 'Linux' in platform.platform():
             import paddle.fluid as fluid
 
             dict_size = 20
-            data_t = fluid.layers.data(
-                name='word', shape=[1], dtype='int64', lod_level=1)
+            data_t = fluid.layers.data(name='word',
+                                       shape=[1],
+                                       dtype='int64',
+                                       lod_level=1)
             padding_idx = np.random.randint(1, 10)
             out = fluid.contrib.fused_embedding_seq_pool(
                 input=data_t,
diff --git a/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
index 7988c66c17240..9957b16a1b12e 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
@@ -43,12 +43,12 @@ def fused_embedded_fc_lstm(
     T = ids.shape[0]
     M = embeddings.shape[1]
     x = embeddings[ids].reshape([T, M])
-    return lstm(
-        fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
-        act_cell, act_cand)
+    return lstm(fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
+                act_cell, act_cand)
 
 
 class TestFusionLSTMOp(OpTest):
+
     def set_conf(self):
         pass
 
@@ -56,7 +56,7 @@ def setUp(self):
         self.op_type = 'fused_embedding_fc_lstm'
         self.lod = [[2, 3, 5, 4]]
         self.M = 8  # Embedding size
-        self.D = 16  # Hidden size 
+        self.D = 16  # Hidden size
         self.dict_size = 18
         self.has_initial_state = False
         self.use_peepholes = False
@@ -82,8 +82,8 @@ def setUp(self):
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None
 
         # low is 0 , high is voc_size - 1
-        ids = np.random.randint(
-            low=0, high=self.dict_size - 1, size=(T, 1)).astype("int64")
+        ids = np.random.randint(low=0, high=self.dict_size - 1,
+                                size=(T, 1)).astype("int64")
         # embeddings as they were trained , so each entry is of M size
         embeddings = np.random.random(
             (self.dict_size, self.M)).astype("float32")
@@ -109,10 +109,11 @@ def setUp(self):
 
         wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')
 
-        h, c = fused_embedded_fc_lstm(
-            ids, self.lod, embeddings, wx, bx, h0, c0, wh, w_b, w_c,
-            self.is_reverse, ACTIVATION[self.act_gate],
-            ACTIVATION[self.act_cell], ACTIVATION[self.act_cand])
+        h, c = fused_embedded_fc_lstm(ids, self.lod, embeddings, wx, bx, h0, c0,
+                                      wh, w_b, w_c, self.is_reverse,
+                                      ACTIVATION[self.act_gate],
+                                      ACTIVATION[self.act_cell],
+                                      ACTIVATION[self.act_cand])
 
         self.inputs = {
             'Ids': (ids, self.lod),
@@ -144,63 +145,74 @@ def test_check_output(self):
 
 
 class TestFusionLSTMOpInit(TestFusionLSTMOp):
+
     def set_conf(self):
         self.has_initial_state = True
 
 
 class TestFusionLSTMOpReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.is_reverse = True
 
 
 class TestFusionLSTMOpInitReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.has_initial_state = True
         self.is_reverse = True
 
 
 class TestFusionLSTMOpMD1(TestFusionLSTMOp):
+
     def set_conf(self):
         self.M = 36
         self.D = 8
 
 
 class TestFusionLSTMOpMD2(TestFusionLSTMOp):
+
     def set_conf(self):
         self.M = 8
         self.D = 8
 
 
 class TestFusionLSTMOpMD3(TestFusionLSTMOp):
+
     def set_conf(self):
         self.M = 15
         self.D = 3
 
 
 class TestFusionLSTMOpBS1(TestFusionLSTMOp):
+
     def set_conf(self):
         self.lod = [[3]]
         self.D = 16
 
 
 class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
 
 
 class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.has_initial_state = True
 
 
 class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.is_reverse = True
 
 
 class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.has_initial_state = True
@@ -208,6 +220,7 @@ def set_conf(self):
 
 
 class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.lod = [[2]]
diff --git a/python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py b/python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py
index 9604201e04e1d..cd2f6b6e66c1e 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py
@@ -27,6 +27,7 @@
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "Paddle core is not compiled with CUDA")
 class TestFusedFCElementwiseLayerNormOp(OpTest):
+
     def config(self):
         self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2)
         self.y_shape = [1, 15]
@@ -72,6 +73,7 @@ def test_check_output(self):
 
 
 class TestFusedFCElementwiseLayerNormOp2(TestFusedFCElementwiseLayerNormOp):
+
     def config(self):
         self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
         self.y_shape = [4, 6]
diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
index 25336efd6a7fb..43d39224287e6 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
@@ -24,10 +24,12 @@
 import unittest
 from op_test import OpTest
 from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph
+
 _enable_legacy_dygraph()
 
 
 class TestFusedFFNOp(OpTest):
+
     def getDtype(self):
         self.dtype = "float32"
         self.layer_norm_dtype = "float32"
@@ -71,16 +73,14 @@ def setUp(self):
             self.weight_attr, 2)
         self.bias_attrs = transformer._convert_param_attr_to_list(
             self.bias_attr, 2)
-        self.linear1 = Linear(
-            self.d_model,
-            self.dim_feedforward,
-            self.weight_attrs[1],
-            bias_attr=self.bias_attrs[1])
-        self.linear2 = Linear(
-            self.dim_feedforward,
-            self.d_model,
-            self.weight_attrs[1],
-            bias_attr=self.bias_attrs[1])
+        self.linear1 = Linear(self.d_model,
+                              self.dim_feedforward,
+                              self.weight_attrs[1],
+                              bias_attr=self.bias_attrs[1])
+        self.linear2 = Linear(self.dim_feedforward,
+                              self.d_model,
+                              self.weight_attrs[1],
+                              bias_attr=self.bias_attrs[1])
 
         paddle.set_default_dtype(self.layer_norm_dtype)
         self.norm1 = LayerNorm(self.d_model)
@@ -118,31 +118,30 @@ def Base(self):
 
     def FusedFFN(self):
         paddle.disable_static()
-        linear1_weight = paddle.to_tensor(
-            self.linear1.weight, stop_gradient=False)
+        linear1_weight = paddle.to_tensor(self.linear1.weight,
+                                          stop_gradient=False)
         linear1_bias = paddle.to_tensor(self.linear1.bias, stop_gradient=False)
-        linear2_weight = paddle.to_tensor(
-            self.linear2.weight, stop_gradient=False)
+        linear2_weight = paddle.to_tensor(self.linear2.weight,
+                                          stop_gradient=False)
         linear2_bias = paddle.to_tensor(self.linear2.bias, stop_gradient=False)
         ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
         ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
         ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False)
         ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False)
         x = paddle.to_tensor(self.src, stop_gradient=False)
-        out = incubate_f.fused_feedforward(
-            x,
-            linear1_weight,
-            linear2_weight,
-            linear1_bias,
-            linear2_bias,
-            ln1_scale,
-            ln1_bias,
-            ln2_scale,
-            ln2_bias,
-            0.0,
-            0.0,
-            activation=self.act_method,
-            pre_layer_norm=self.pre_layer_norm)
+        out = incubate_f.fused_feedforward(x,
+                                           linear1_weight,
+                                           linear2_weight,
+                                           linear1_bias,
+                                           linear2_bias,
+                                           ln1_scale,
+                                           ln1_bias,
+                                           ln2_scale,
+                                           ln2_bias,
+                                           0.0,
+                                           0.0,
+                                           activation=self.act_method,
+                                           pre_layer_norm=self.pre_layer_norm)
         paddle.autograd.backward([out], [paddle.to_tensor(self.dout)])
         return out, x.grad
 
@@ -150,16 +149,18 @@ def test_out_and_grad(self):
         default_main_program().random_seed = 42
         base_out, base_grad = self.Base()
         fused_out, fused_grad = self.FusedFFN()
-        np.testing.assert_allclose(
-            base_out.numpy(), fused_out.numpy(), rtol=self.rtol, atol=self.atol)
-        np.testing.assert_allclose(
-            base_grad.numpy(),
-            fused_grad.numpy(),
-            rtol=self.rtol,
-            atol=self.atol)
+        np.testing.assert_allclose(base_out.numpy(),
+                                   fused_out.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
+        np.testing.assert_allclose(base_grad.numpy(),
+                                   fused_grad.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
 
 class TestFusedFFNOpFp16(TestFusedFFNOp):
+
     def getDtype(self):
         self.dtype = "float16"
         self.layer_norm_dtype = "float32"
@@ -176,17 +177,20 @@ def getShape(self):
 
 
 class TestFusedFFNOpFp64(TestFusedFFNOp):
+
     def getDtype(self):
         self.dtype = "float64"
         self.layer_norm_dtype = "float64"
 
 
 class TestFusedFFNOpActivation(TestFusedFFNOp):
+
     def getActivation(self):
         self.act_method = "relu"
 
 
 class TestFusedFFNOpNormalizeBefore(TestFusedFFNOp):
+
     def getNormalizeBefore(self):
         self.pre_layer_norm = True
 
@@ -198,6 +202,7 @@ def getShape(self):
 
 
 class APITestStaticFusedFFN(unittest.TestCase):
+
     def test_static(self):
         paddle.enable_static()
         default_main_program().random_seed = 42
@@ -207,38 +212,36 @@ def test_static(self):
         d_model = 8
         dim_feedforward = 8
 
-        x = paddle.static.data(
-            name='x', shape=[batch_size, d_model, dim_feedforward], dtype=dtype)
-        linear1_weight = paddle.static.data(
-            name='linear1_weight',
-            shape=[d_model, dim_feedforward],
-            dtype=dtype)
-        linear1_bias = paddle.static.data(
-            name='linear1_bias', shape=[dim_feedforward])
-        linear2_weight = paddle.static.data(
-            name='linear2_weight',
-            shape=[dim_feedforward, d_model],
-            dtype=dtype)
+        x = paddle.static.data(name='x',
+                               shape=[batch_size, d_model, dim_feedforward],
+                               dtype=dtype)
+        linear1_weight = paddle.static.data(name='linear1_weight',
+                                            shape=[d_model, dim_feedforward],
+                                            dtype=dtype)
+        linear1_bias = paddle.static.data(name='linear1_bias',
+                                          shape=[dim_feedforward])
+        linear2_weight = paddle.static.data(name='linear2_weight',
+                                            shape=[dim_feedforward, d_model],
+                                            dtype=dtype)
         linear2_bias = paddle.static.data(name='linear2_bias', shape=[d_model])
         ln1_scale = paddle.static.data(name='ln1_scale', shape=[d_model])
         ln1_bias = paddle.static.data(name='ln1_scale', shape=[d_model])
         ln2_scale = paddle.static.data(name='ln2_scale', shape=[d_model])
         ln2_bias = paddle.static.data(name='ln2_scale', shape=[d_model])
 
-        fused_out = incubate_f.fused_feedforward(
-            x,
-            linear1_weight,
-            linear2_weight,
-            linear1_bias,
-            linear2_bias,
-            ln1_scale,
-            ln1_bias,
-            ln2_scale,
-            ln2_bias,
-            0.0,
-            0.0,
-            activation="relu",
-            pre_layer_norm=False)
+        fused_out = incubate_f.fused_feedforward(x,
+                                                 linear1_weight,
+                                                 linear2_weight,
+                                                 linear1_bias,
+                                                 linear2_bias,
+                                                 ln1_scale,
+                                                 ln1_bias,
+                                                 ln2_scale,
+                                                 ln2_bias,
+                                                 0.0,
+                                                 0.0,
+                                                 activation="relu",
+                                                 pre_layer_norm=False)
 
         ######base ffn######
         linear1_out = F.linear(x, linear1_weight, linear1_bias)
@@ -246,11 +249,10 @@ def test_static(self):
         dropout1_out = F.dropout(x=act_out, p=0.0, training=False)
         linear2_out = F.linear(dropout1_out, linear2_weight, linear2_bias)
         dropout2_out = x + F.dropout(x=linear2_out, p=0.0, training=False)
-        ln_out = F.layer_norm(
-            dropout2_out,
-            normalized_shape=list([d_model]),
-            weight=ln2_scale,
-            bias=ln2_bias)
+        ln_out = F.layer_norm(dropout2_out,
+                              normalized_shape=list([d_model]),
+                              weight=ln2_scale,
+                              bias=ln2_bias)
         ######base ffn######
 
         exe = paddle.static.Executor(paddle.CUDAPlace(0))
@@ -286,62 +288,79 @@ def test_static(self):
             },
                             fetch_list=[res])
             real_res.append(fetch)
-        self.assertTrue(
-            np.allclose(
-                real_res[0], real_res[1], atol=1e-3),
-            "two value is check diff")
+        self.assertTrue(np.allclose(real_res[0], real_res[1], atol=1e-3),
+                        "two value is check diff")
 
 
 class TestFusedFFNOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
 
             def test_dtype():
-                x = paddle.static.data(
-                    name='x', shape=[1, 10, 10], dtype="int32")
-                linear1_weight = paddle.static.data(
-                    name='linear1_weight', shape=[1, 10, 10], dtype="float32")
-                linear2_weight = paddle.static.data(
-                    name='linear2_weight', shape=[1, 10, 10], dtype="float32")
+                x = paddle.static.data(name='x',
+                                       shape=[1, 10, 10],
+                                       dtype="int32")
+                linear1_weight = paddle.static.data(name='linear1_weight',
+                                                    shape=[1, 10, 10],
+                                                    dtype="float32")
+                linear2_weight = paddle.static.data(name='linear2_weight',
+                                                    shape=[1, 10, 10],
+                                                    dtype="float32")
                 incubate_f.fused_feedforward(x, linear1_weight, linear2_weight)
 
             self.assertRaises(TypeError, test_dtype)
 
             def test_dropout_rate_type():
-                x = paddle.static.data(
-                    name='x1', shape=[1, 10, 10], dtype="float32")
-                linear1_weight = paddle.static.data(
-                    name='linear1_weight1', shape=[10, 10], dtype="float32")
-                linear2_weight = paddle.static.data(
-                    name='linear2_weight1', shape=[10, 10], dtype="float32")
-                incubate_f.fused_feedforward(
-                    x, linear1_weight, linear2_weight, dropout1_rate="a")
+                x = paddle.static.data(name='x1',
+                                       shape=[1, 10, 10],
+                                       dtype="float32")
+                linear1_weight = paddle.static.data(name='linear1_weight1',
+                                                    shape=[10, 10],
+                                                    dtype="float32")
+                linear2_weight = paddle.static.data(name='linear2_weight1',
+                                                    shape=[10, 10],
+                                                    dtype="float32")
+                incubate_f.fused_feedforward(x,
+                                             linear1_weight,
+                                             linear2_weight,
+                                             dropout1_rate="a")
 
             self.assertRaises(TypeError, test_dropout_rate_type)
 
             def test_dropout_rate_value():
-                x = paddle.static.data(
-                    name='x2', shape=[1, 10, 10], dtype="float32")
-                linear1_weight = paddle.static.data(
-                    name='linear1_weight2', shape=[10, 10], dtype="float32")
-                linear2_weight = paddle.static.data(
-                    name='linear2_weight2', shape=[10, 10], dtype="float32")
-                incubate_f.fused_feedforward(
-                    x, linear1_weight, linear2_weight, dropout2_rate=-1)
+                x = paddle.static.data(name='x2',
+                                       shape=[1, 10, 10],
+                                       dtype="float32")
+                linear1_weight = paddle.static.data(name='linear1_weight2',
+                                                    shape=[10, 10],
+                                                    dtype="float32")
+                linear2_weight = paddle.static.data(name='linear2_weight2',
+                                                    shape=[10, 10],
+                                                    dtype="float32")
+                incubate_f.fused_feedforward(x,
+                                             linear1_weight,
+                                             linear2_weight,
+                                             dropout2_rate=-1)
 
             self.assertRaises(ValueError, test_dropout_rate_value)
 
             def test_dropout_mode():
-                x = paddle.static.data(
-                    name='x3', shape=[1, 10, 10], dtype="float32")
-                linear1_weight = paddle.static.data(
-                    name='linear1_weight3', shape=[10, 10], dtype="float32")
-                linear2_weight = paddle.static.data(
-                    name='linear2_weight3', shape=[10, 10], dtype="float32")
-                incubate_f.fused_feedforward(
-                    x, linear1_weight, linear2_weight, mode='test')
+                x = paddle.static.data(name='x3',
+                                       shape=[1, 10, 10],
+                                       dtype="float32")
+                linear1_weight = paddle.static.data(name='linear1_weight3',
+                                                    shape=[10, 10],
+                                                    dtype="float32")
+                linear2_weight = paddle.static.data(name='linear2_weight3',
+                                                    shape=[10, 10],
+                                                    dtype="float32")
+                incubate_f.fused_feedforward(x,
+                                             linear1_weight,
+                                             linear2_weight,
+                                             mode='test')
 
             self.assertRaises(ValueError, test_dropout_mode)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py
index 6f9ba5f5e4e57..2d6243955478c 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py
@@ -28,6 +28,7 @@
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "Paddle is not compiled with CUDA")
 class TestFusedGateAttentionOp(OpTest):
+
     def setUp(self):
         self.__class__.op_type = "fused_gate_attention"
         # use autograd to check grad in this unittest.
@@ -51,6 +52,7 @@ def config(self):
         self.bias_attr = True
 
     def generate_input_data(self):
+
         def _random(shape):
             if self.dtype == "bfloat16":
                 data = np.random.random(shape).astype("float32")
@@ -97,8 +99,8 @@ def get_reference_out(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
 
         query = paddle.to_tensor(self.query, stop_gradient=False)
-        key = query if self.merge_qkv else paddle.to_tensor(
-            self.key, stop_gradient=False)
+        key = query if self.merge_qkv else paddle.to_tensor(self.key,
+                                                            stop_gradient=False)
         q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False)
         k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False)
         v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False)
@@ -112,12 +114,12 @@ def get_reference_out(self):
         # [batch_size, msa_len, num_heads, m_size, key_dim]
         v = paddle.einsum('nbka,ahc->nbkhc', key, v_weight)
 
-        # [batch_size, msa_len, num_heads, res_len, m_size] 
+        # [batch_size, msa_len, num_heads, res_len, m_size]
         logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k)  # qk_out
         logits = logits + src_mask
         if self.bias_attr:
-            nonbatched_bias = paddle.to_tensor(
-                self.nonbatched_bias, stop_gradient=False)
+            nonbatched_bias = paddle.to_tensor(self.nonbatched_bias,
+                                               stop_gradient=False)
             logits = logits + nonbatched_bias
 
         weights = nn.functional.softmax(logits)  # softmax_out
@@ -136,8 +138,8 @@ def get_reference_out(self):
 
         out = paddle.einsum('nbqhc,hco->nbqo', weighted_avg,
                             output_w) + output_b
-        paddle.autograd.backward(
-            [out], [paddle.to_tensor(self.dout)], retain_graph=True)
+        paddle.autograd.backward([out], [paddle.to_tensor(self.dout)],
+                                 retain_graph=True)
         if self.merge_qkv:
             return out, query.grad, None
         else:
@@ -163,8 +165,8 @@ def get_fused_gate_attention_out(self):
         src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
 
         if self.bias_attr:
-            nonbatched_bias = paddle.to_tensor(
-                self.nonbatched_bias, stop_gradient=False)
+            nonbatched_bias = paddle.to_tensor(self.nonbatched_bias,
+                                               stop_gradient=False)
         else:
             nonbatched_bias = None
         if self.has_gating:
@@ -182,8 +184,8 @@ def get_fused_gate_attention_out(self):
             nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b,
             'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv)
 
-        paddle.autograd.backward(
-            [out], [paddle.to_tensor(self.dout)], retain_graph=True)
+        paddle.autograd.backward([out], [paddle.to_tensor(self.dout)],
+                                 retain_graph=True)
         if key is not None:
             return out, query.grad, key.grad
         else:
@@ -193,17 +195,22 @@ def check_output_and_grad(self, atol, rtol):
         out_ref, query_grad_ref, key_grad_ref = self.get_reference_out()
         out, query_grad, key_grad = self.get_fused_gate_attention_out()
         np.testing.assert_allclose(out_ref, out.numpy(), atol=atol, rtol=rtol)
-        np.testing.assert_allclose(
-            query_grad_ref, query_grad.numpy(), atol=atol, rtol=rtol)
+        np.testing.assert_allclose(query_grad_ref,
+                                   query_grad.numpy(),
+                                   atol=atol,
+                                   rtol=rtol)
         if key_grad_ref is not None and key_grad is not None:
-            np.testing.assert_allclose(
-                key_grad_ref, key_grad.numpy(), atol=atol, rtol=rtol)
+            np.testing.assert_allclose(key_grad_ref,
+                                       key_grad.numpy(),
+                                       atol=atol,
+                                       rtol=rtol)
 
     def test_output_and_grad(self):
         self.check_output_and_grad(atol=1e-5, rtol=1e-5)
 
 
 class TestSeparatedQKVCase(TestFusedGateAttentionOp):
+
     def config(self):
         self.dtype = "float32"
         self.has_gating = False
@@ -220,6 +227,7 @@ def config(self):
 
 
 class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
+
     def config(self):
         super().config()
         self.has_gating = False
@@ -227,6 +235,7 @@ def config(self):
 
 
 class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
+
     def config(self):
         super().config()
         self.dtype = "float16"
@@ -240,6 +249,7 @@ def test_output_and_grad(self):
     "core is not compiled with CUDA and cuda version need larger than or equal to 11.3"
 )
 class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
+
     def config(self):
         super().config()
         self.dtype = "bfloat16"
diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py
index 2ea1bf2e9cb81..b536b0d7e66f5 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,6 +34,7 @@ def get_outputs(DOut, X, Y):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDXYBiasFP16(OpTest):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue_grad"
         self.place = core.CUDAPlace(0)
@@ -67,6 +68,7 @@ def test_check_output(self):
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDXYBiasFP32(
         TestFuseGemmEpilogueGradOpDXYBiasFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -77,6 +79,7 @@ def init_dtype_type(self):
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDXYBiasFP64(
         TestFuseGemmEpilogueGradOpDXYBiasFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -86,6 +89,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDYBiasFP16(OpTest):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue_grad"
         self.place = core.CUDAPlace(0)
@@ -117,8 +121,9 @@ def test_check_output(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueGradOpDYBiasFP32(
-        TestFuseGemmEpilogueGradOpDYBiasFP16):
+class TestFuseGemmEpilogueGradOpDYBiasFP32(TestFuseGemmEpilogueGradOpDYBiasFP16
+                                           ):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -127,8 +132,9 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueGradOpDYBiasFP64(
-        TestFuseGemmEpilogueGradOpDYBiasFP16):
+class TestFuseGemmEpilogueGradOpDYBiasFP64(TestFuseGemmEpilogueGradOpDYBiasFP16
+                                           ):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -138,6 +144,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDYFP16(OpTest):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue_grad"
         self.place = core.CUDAPlace(0)
@@ -170,6 +177,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDYFP32(TestFuseGemmEpilogueGradOpDYFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -179,6 +187,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDYFP64(TestFuseGemmEpilogueGradOpDYFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -188,6 +197,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDXYFP16(OpTest):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue_grad"
         self.place = core.CUDAPlace(0)
@@ -220,6 +230,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDXYFP32(TestFuseGemmEpilogueGradOpDXYFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -229,11 +240,13 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueGradOpDXYFP64(TestFuseGemmEpilogueGradOpDXYFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     np.random.seed(0)
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
index f826898f9e5dd..bd29ebbf12a35 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,7 +19,7 @@
 import numpy as np
 import paddle
 import paddle.fluid.core as core
-from op_test import OpTest, skip_check_grad_ci
+from op_test import OpTest, skip_check_grad_ci, skip_check_inplace_ci
 
 
 def gelu(x):
@@ -43,10 +43,16 @@ def get_output(X, Y, bias, act):
         return out
 
 
+@skip_check_inplace_ci(reason="no inplace op")
+class TestFuseGemmBase(OpTest):
+    pass
+
+
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpReluMMFP16(OpTest):
+class TestFuseGemmEpilogueOpReluMMFP16(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -58,8 +64,9 @@ def setUp(self):
             'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
         }
         self.outputs = {
-            'Out': get_output(self.inputs['X'], self.inputs['Y'],
-                              self.inputs['Bias'], 'relu')
+            'Out':
+            get_output(self.inputs['X'], self.inputs['Y'], self.inputs['Bias'],
+                       'relu')
         }
         self.attrs = {"activation": 'relu'}
 
@@ -78,6 +85,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMMFP32(TestFuseGemmEpilogueOpReluMMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -87,6 +95,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMMFP64(TestFuseGemmEpilogueOpReluMMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -95,7 +104,8 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpReluMTMFP16(OpTest):
+class TestFuseGemmEpilogueOpReluMTMFP16(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -107,8 +117,9 @@ def setUp(self):
             'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
         }
         self.outputs = {
-            'Out': get_output(self.inputs['X'].T, self.inputs['Y'],
-                              self.inputs['Bias'], 'relu')
+            'Out':
+            get_output(self.inputs['X'].T, self.inputs['Y'],
+                       self.inputs['Bias'], 'relu')
         }
         self.attrs = {'trans_x': True, "activation": 'relu'}
 
@@ -127,6 +138,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMTMFP32(TestFuseGemmEpilogueOpReluMTMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -136,6 +148,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMTMFP64(TestFuseGemmEpilogueOpReluMTMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -144,7 +157,8 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpReluMMTFP16(OpTest):
+class TestFuseGemmEpilogueOpReluMMTFP16(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -156,8 +170,9 @@ def setUp(self):
             'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
         }
         self.outputs = {
-            'Out': get_output(self.inputs['X'], self.inputs['Y'].T,
-                              self.inputs['Bias'], 'relu')
+            'Out':
+            get_output(self.inputs['X'], self.inputs['Y'].T,
+                       self.inputs['Bias'], 'relu')
         }
         self.attrs = {'trans_y': True, "activation": 'relu'}
 
@@ -176,6 +191,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMMTFP32(TestFuseGemmEpilogueOpReluMMTFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -185,6 +201,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMMTFP64(TestFuseGemmEpilogueOpReluMMTFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -193,7 +210,8 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpReluMTMTFP16(OpTest):
+class TestFuseGemmEpilogueOpReluMTMTFP16(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -205,8 +223,9 @@ def setUp(self):
             'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
         }
         self.outputs = {
-            'Out': get_output(self.inputs['X'].T, self.inputs['Y'].T,
-                              self.inputs['Bias'], 'relu')
+            'Out':
+            get_output(self.inputs['X'].T, self.inputs['Y'].T,
+                       self.inputs['Bias'], 'relu')
         }
         self.attrs = {'trans_x': True, 'trans_y': True, "activation": 'relu'}
 
@@ -225,6 +244,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMTMTFP32(TestFuseGemmEpilogueOpReluMTMTFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -234,6 +254,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMTMTFP64(TestFuseGemmEpilogueOpReluMTMTFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -242,7 +263,8 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(OpTest):
+class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -254,9 +276,9 @@ def setUp(self):
             'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
         }
         self.outputs = {
-            'Out': get_output(self.inputs['X'].reshape(
-                (-1, 4)), self.inputs['Y'], self.inputs['Bias'],
-                              'relu').reshape((2, 2, 8, 128))
+            'Out':
+            get_output(self.inputs['X'].reshape((-1, 4)), self.inputs['Y'],
+                       self.inputs['Bias'], 'relu').reshape((2, 2, 8, 128))
         }
         self.attrs = {"activation": 'relu'}
 
@@ -276,6 +298,7 @@ def test_check_output(self):
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMMFP32MultiDimX(
         TestFuseGemmEpilogueOpReluMMFP16MultiDimX):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -286,6 +309,7 @@ def init_dtype_type(self):
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMMFP64MultiDimX(
         TestFuseGemmEpilogueOpReluMMFP16MultiDimX):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -294,7 +318,8 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(OpTest):
+class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -306,9 +331,9 @@ def setUp(self):
             'Bias': np.random.random((128, )).astype(self.dtype) - 0.5
         }
         self.outputs = {
-            'Out': get_output(self.inputs['X'].reshape(
-                (4, -1)).T, self.inputs['Y'], self.inputs['Bias'],
-                              'relu').reshape((2, 2, 8, 128))
+            'Out':
+            get_output(self.inputs['X'].reshape((4, -1)).T, self.inputs['Y'],
+                       self.inputs['Bias'], 'relu').reshape((2, 2, 8, 128))
         }
         self.attrs = {'trans_x': True, "activation": 'relu'}
 
@@ -328,6 +353,7 @@ def test_check_output(self):
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMTMFP32MultiDimX(
         TestFuseGemmEpilogueOpReluMTMFP16MultiDimX):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -338,6 +364,7 @@ def init_dtype_type(self):
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpReluMTMFP64MultiDimX(
         TestFuseGemmEpilogueOpReluMTMFP16MultiDimX):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -346,7 +373,8 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpGeluMMFP16(OpTest):
+class TestFuseGemmEpilogueOpGeluMMFP16(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -361,8 +389,9 @@ def setUp(self):
         self.attrs = {"activation": 'gelu'}
 
         self.outputs = {
-            'Out': get_output(self.inputs['X'], self.inputs['Y'],
-                              self.inputs['Bias'], 'gelu')
+            'Out':
+            get_output(self.inputs['X'], self.inputs['Y'], self.inputs['Bias'],
+                       'gelu')
         }
 
     def init_dtype_type(self):
@@ -380,6 +409,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpGeluMMFP32(TestFuseGemmEpilogueOpGeluMMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -389,6 +419,7 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpGeluMMFP64(TestFuseGemmEpilogueOpGeluMMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
@@ -397,7 +428,8 @@ def init_dtype_type(self):
 @skip_check_grad_ci(reason="no grap op")
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestFuseGemmEpilogueOpNoneMMFP16(OpTest):
+class TestFuseGemmEpilogueOpNoneMMFP16(TestFuseGemmBase):
+
     def setUp(self):
         self.op_type = "fused_gemm_epilogue"
         self.place = core.CUDAPlace(0)
@@ -412,8 +444,9 @@ def setUp(self):
         self.attrs = {"activation": 'none'}
 
         self.outputs = {
-            'Out': get_output(self.inputs['X'], self.inputs['Y'],
-                              self.inputs['Bias'], 'none')
+            'Out':
+            get_output(self.inputs['X'], self.inputs['Y'], self.inputs['Bias'],
+                       'none')
         }
 
     def init_dtype_type(self):
@@ -431,6 +464,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpNoneMMFP32(TestFuseGemmEpilogueOpNoneMMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.single
         self.atol = 1e-6
@@ -440,11 +474,13 @@ def init_dtype_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFuseGemmEpilogueOpNoneMMFP64(TestFuseGemmEpilogueOpNoneMMFP16):
+
     def init_dtype_type(self):
         self.dtype = np.double
         self.atol = 1e-6
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     np.random.seed(0)
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py
new file mode 100644
index 0000000000000..f2f56e42543c6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid.core as core
+import unittest
+import numpy as np
+from paddle.incubate.nn.functional import fused_matmul_bias, fused_linear
+from paddle.incubate.nn import FusedLinear
+
+
+def is_fused_matmul_bias_supported():
+    """Tell whether this is a CUDA (non-ROCm) build exposing the fused op."""
+    if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
+        return False
+    return hasattr(core.ops, 'fused_gemm_epilogue')
+
+
+def matmul(x, y, bias, trans_x, trans_y):
+    """NumPy reference: matmul of x and y, each optionally transposed first.
+
+    Returns the product alone when bias is None, else the product plus bias.
+    """
+    lhs = np.array(x)
+    if trans_x:
+        lhs = np.ascontiguousarray(lhs.T)
+    rhs = np.ascontiguousarray(np.transpose(y)) if trans_y else y
+    product = np.matmul(lhs, rhs)
+    return product if bias is None else product + bias
+
+
+def matmul_grad(x, y, bias, dz, trans_x, trans_y):
+    """NumPy reference gradients of matmul(x, y, bias, trans_x, trans_y).
+
+    dz is the gradient of the output. Returns (dx, dy, dbias), where dbias is
+    None when bias is None and otherwise dz summed over axis 0.
+    """
+    if trans_x and trans_y:
+        dx = matmul(y, dz, None, True, True)
+        dy = matmul(dz, x, None, True, True)
+    elif trans_x:
+        dx = matmul(y, dz, None, False, True)
+        dy = matmul(x, dz, None, False, False)
+    elif trans_y:
+        dx = matmul(dz, y, None, False, False)
+        dy = matmul(dz, x, None, True, False)
+    else:
+        dx = matmul(dz, y, None, False, True)
+        dy = matmul(x, dz, None, True, False)
+    dbias = None if bias is None else np.sum(dz, axis=0, keepdims=False)
+    return dx, dy, dbias
+
+
+@unittest.skipIf(
+    not is_fused_matmul_bias_supported(),
+    "fused_gemm_epilogue is only supported when CUDA version >= 11.6")
+class TestFusedMatmulBias(unittest.TestCase):
+    """Check fused_matmul_bias and its gradients against NumPy references."""
+
+    def setUp(self):
+        paddle.set_device('gpu')
+
+    def rand_data(self, shape, dtype):
+        """Return random integers in [-20, 20) of `shape`, cast to `dtype`."""
+        return np.random.randint(low=-20, high=20, size=shape).astype(dtype)
+
+    def rand_test_base(self, m, n, k, trans_x, trans_y, need_bias, dtype, seed):
+        """Check the output and x/y/bias gradients for one configuration."""
+        np.random.seed(seed)
+        x_shape = [k, m] if trans_x else [m, k]
+        y_shape = [n, k] if trans_y else [k, n]
+        bias_shape = [n]
+
+        x_np = self.rand_data(x_shape, dtype)
+        x = paddle.to_tensor(x_np, stop_gradient=False)
+
+        y_np = self.rand_data(y_shape, dtype)
+        y = paddle.to_tensor(y_np, stop_gradient=False)
+
+        bias_np = bias = None
+        if need_bias:
+            bias_np = self.rand_data(bias_shape, dtype)
+            bias = paddle.to_tensor(bias_np, stop_gradient=False)
+
+        z = fused_matmul_bias(x, y, bias, trans_x, trans_y)
+        z_np = matmul(x_np, y_np, bias_np, trans_x, trans_y)
+        self.assertTrue(np.array_equal(z.numpy(), z_np))
+
+        z_grad_np = self.rand_data(z_np.shape, dtype)
+        paddle.autograd.backward(z, grad_tensors=[paddle.to_tensor(z_grad_np)])
+
+        x_grad_np, y_grad_np, bias_grad_np = matmul_grad(
+            x_np, y_np, bias_np, z_grad_np, trans_x, trans_y)
+        self.assertTrue(np.array_equal(x.grad.numpy(), x_grad_np))
+        self.assertEqual(y_grad_np.shape, y_np.shape)
+        self.assertTrue(np.array_equal(y.grad.numpy(), y_grad_np))
+
+        if need_bias:
+            self.assertTrue(np.array_equal(bias.grad.numpy(), bias_grad_np))
+        else:
+            self.assertIsNone(bias_grad_np)
+
+    def rand_test(self, m, n, k, dtype):
+        """Run every trans_x/trans_y/need_bias combination with one seed."""
+        # No `size`: int() on a 1-element array is deprecated since NumPy 1.25.
+        seed = int(np.random.randint(low=0, high=1000))
+        for trans_x in [False, True]:
+            for trans_y in [False, True]:
+                for need_bias in [False, True]:
+                    self.rand_test_base(m, n, k, trans_x, trans_y, need_bias,
+                                        dtype, seed)
+
+    def test_fp32(self):
+        self.rand_test(30, 40, 50, np.float32)
+
+    def test_fp16(self):
+        self.rand_test(4, 5, 7, np.float16)
+
+
+@unittest.skipIf(
+    not is_fused_matmul_bias_supported(),
+    "fused_gemm_epilogue is only supported when CUDA version >= 11.6")
+class TestFusedLinear(unittest.TestCase):
+    """Check that the FusedLinear layer matches a direct fused_linear call."""
+
+    def check_fused_linear(self, transpose):
+        inputs = paddle.randn([30, 40])
+        layer = FusedLinear(40, 50, transpose_weight=transpose)
+        direct = fused_linear(inputs, layer.weight, layer.bias, transpose)
+        self.assertTrue(np.array_equal(layer(inputs).numpy(), direct.numpy()))
+
+    def test_non_transpose(self):
+        self.check_fused_linear(False)
+
+    def test_transpose(self):
+        self.check_fused_linear(True)
+
+
+@unittest.skipIf(
+    not is_fused_matmul_bias_supported(),
+    "fused_gemm_epilogue is only supported when CUDA version >= 11.6")
+class TestStaticGraph(unittest.TestCase):
+    """Check that FusedLinear can be built in static-graph mode."""
+
+    def test_static_graph(self):
+        paddle.enable_static()
+        self.addCleanup(paddle.disable_static)  # runs even if the test fails
+        x = paddle.static.data(name='x', dtype='float32', shape=[-1, 100])
+        y = FusedLinear(100, 300)(x)
+        self.assertEqual(list(y.shape), [-1, 300])
+
+
+if __name__ == "__main__":
+    unittest.main()  # run the TestCase classes of this module as a script
diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
index 67f382a439d8c..ffe6fa8d41aa0 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py
@@ -39,6 +39,7 @@
 
 
 class TestFusedMultiTransformerOp(OpTest):
+
     def setUp(self):
         self.config()
         self.generate_input_data()
@@ -61,39 +62,33 @@ def setUp(self):
 
         bias_attr = paddle.fluid.ParamAttr(
             initializer=paddle.fluid.initializer.Constant(value=0.0005))
-        self.q_proj = Linear(
-            self.embed_dim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=bias_attr)
+        self.q_proj = Linear(self.embed_dim,
+                             self.embed_dim,
+                             self.weight_attr,
+                             bias_attr=bias_attr)
         #bias_attr=self.bias_attr)
 
-        self.k_proj = Linear(
-            self.kdim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
-        self.v_proj = Linear(
-            self.vdim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
-        self.out_proj = Linear(
-            self.embed_dim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
-
-        self.ffn1_proj = Linear(
-            self.embed_dim,
-            4 * self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
-        self.ffn2_proj = Linear(
-            4 * self.embed_dim,
-            self.embed_dim,
-            self.weight_attr,
-            bias_attr=self.bias_attr)
+        self.k_proj = Linear(self.kdim,
+                             self.embed_dim,
+                             self.weight_attr,
+                             bias_attr=self.bias_attr)
+        self.v_proj = Linear(self.vdim,
+                             self.embed_dim,
+                             self.weight_attr,
+                             bias_attr=self.bias_attr)
+        self.out_proj = Linear(self.embed_dim,
+                               self.embed_dim,
+                               self.weight_attr,
+                               bias_attr=self.bias_attr)
+
+        self.ffn1_proj = Linear(self.embed_dim,
+                                4 * self.embed_dim,
+                                self.weight_attr,
+                                bias_attr=self.bias_attr)
+        self.ffn2_proj = Linear(4 * self.embed_dim,
+                                self.embed_dim,
+                                self.weight_attr,
+                                bias_attr=self.bias_attr)
 
         paddle.set_default_dtype(np.float32)
         self.norm = LayerNorm(self.embed_dim)
@@ -234,8 +229,10 @@ def GetBaselineOut(self):
 
             # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
             # --> [B, n_head, seq_len, out_seq_len]
-            qk_out = layers.matmul(
-                x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5)
+            qk_out = layers.matmul(x=q_out,
+                                   y=k_out,
+                                   transpose_y=True,
+                                   alpha=self.head_dim**-0.5)
 
             if self.debug:
                 print('qk out is')
@@ -255,11 +252,10 @@ def GetBaselineOut(self):
                 print('softmax out is')
                 print(softmax_out[0][0][0])
             if self.dropout_prob:
-                dropout_out = F.dropout(
-                    softmax_out,
-                    self.dropout_prob,
-                    training=self.training,
-                    mode="upscale_in_train")
+                dropout_out = F.dropout(softmax_out,
+                                        self.dropout_prob,
+                                        training=self.training,
+                                        mode="upscale_in_train")
                 # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
                 # --> [B, n_head, seq_len, head_dim]
                 qktv_out = tensor.matmul(dropout_out, v_out)
@@ -271,8 +267,7 @@ def GetBaselineOut(self):
                 print('fmha out is')
                 print(fmha_out[0][0][0])
             out_linear_in = tensor.reshape(
-                x=fmha_out,
-                shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
+                x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
             out = self.out_proj(out_linear_in)
 
             residual_out = residual + self.dropout(out)
@@ -302,44 +297,44 @@ def GetBaselineOut(self):
 
     def GetFusedMultiTransformerOut(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
-        q_proj_weight = paddle.to_tensor(
-            self.q_proj.weight, stop_gradient=False)
-        k_proj_weight = paddle.to_tensor(
-            self.k_proj.weight, stop_gradient=False)
-        v_proj_weight = paddle.to_tensor(
-            self.v_proj.weight, stop_gradient=False)
-        out_linear_weight = paddle.to_tensor(
-            self.out_proj.weight, stop_gradient=False)
-        ffn1_weight = paddle.to_tensor(
-            self.ffn1_proj.weight, stop_gradient=False)
-        ffn2_weight = paddle.to_tensor(
-            self.ffn2_proj.weight, stop_gradient=False)
+        q_proj_weight = paddle.to_tensor(self.q_proj.weight,
+                                         stop_gradient=False)
+        k_proj_weight = paddle.to_tensor(self.k_proj.weight,
+                                         stop_gradient=False)
+        v_proj_weight = paddle.to_tensor(self.v_proj.weight,
+                                         stop_gradient=False)
+        out_linear_weight = paddle.to_tensor(self.out_proj.weight,
+                                             stop_gradient=False)
+        ffn1_weight = paddle.to_tensor(self.ffn1_proj.weight,
+                                       stop_gradient=False)
+        ffn2_weight = paddle.to_tensor(self.ffn2_proj.weight,
+                                       stop_gradient=False)
 
         if self.bias_attr is False:
             qkv_bias_tensor = None
             out_linear_bias = None
         else:
-            q_proj_bias = paddle.to_tensor(
-                self.q_proj.bias, stop_gradient=False)
-            k_proj_bias = paddle.to_tensor(
-                self.k_proj.bias, stop_gradient=False)
-            v_proj_bias = paddle.to_tensor(
-                self.v_proj.bias, stop_gradient=False)
+            q_proj_bias = paddle.to_tensor(self.q_proj.bias,
+                                           stop_gradient=False)
+            k_proj_bias = paddle.to_tensor(self.k_proj.bias,
+                                           stop_gradient=False)
+            v_proj_bias = paddle.to_tensor(self.v_proj.bias,
+                                           stop_gradient=False)
             qkv_bias = np.concatenate(
                 (q_proj_bias.numpy(), k_proj_bias.numpy(), v_proj_bias.numpy()))
             qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim))
             qkv_bias_tensor = paddle.to_tensor(qkv_bias, stop_gradient=False)
-            out_linear_bias = paddle.to_tensor(
-                self.out_proj.bias, stop_gradient=False)
-            ffn1_bias = paddle.to_tensor(
-                self.ffn1_proj.bias, stop_gradient=False)
-            ffn2_bias = paddle.to_tensor(
-                self.ffn2_proj.bias, stop_gradient=False)
+            out_linear_bias = paddle.to_tensor(self.out_proj.bias,
+                                               stop_gradient=False)
+            ffn1_bias = paddle.to_tensor(self.ffn1_proj.bias,
+                                         stop_gradient=False)
+            ffn2_bias = paddle.to_tensor(self.ffn2_proj.bias,
+                                         stop_gradient=False)
 
         ln_scale = paddle.to_tensor(self.norm.weight, stop_gradient=False)
         ln_bias = paddle.to_tensor(self.norm.bias, stop_gradient=False)
-        ffn_ln_scale = paddle.to_tensor(
-            self.ffn_norm.weight, stop_gradient=False)
+        ffn_ln_scale = paddle.to_tensor(self.ffn_norm.weight,
+                                        stop_gradient=False)
         ffn_ln_bias = paddle.to_tensor(self.ffn_norm.bias, stop_gradient=False)
 
         q_proj_weight = q_proj_weight.numpy().transpose((1, 0))
@@ -357,12 +352,11 @@ def GetFusedMultiTransformerOut(self):
             cache_kvs = []
 
             max_seq_length = (self.cache_length + 128) // 128 * 128
-            cache_kv = np.zeros(
-                [
-                    2, self.batch_size, self.num_heads, max_seq_length,
-                    self.head_dim
-                ],
-                dtype=self.x_type)
+            cache_kv = np.zeros([
+                2, self.batch_size, self.num_heads, max_seq_length,
+                self.head_dim
+            ],
+                                dtype=self.x_type)
 
             elems = 4
             if self.x_type is np.float16:
@@ -390,8 +384,9 @@ def GetFusedMultiTransformerOut(self):
                 assert self.query_length == self.cache_length
                 cache_kv[:] = 0
             else:
-                time_step = paddle.to_tensor(
-                    [self.cache_length], dtype='int32', place=paddle.CPUPlace())
+                time_step = paddle.to_tensor([self.cache_length],
+                                             dtype='int32',
+                                             place=paddle.CPUPlace())
         if self.has_attn_mask:
             attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
         else:
@@ -423,31 +418,29 @@ def GetFusedMultiTransformerOut(self):
             ffn_ln_scales.append(ffn_ln_scale)
             ffn_ln_biases.append(ffn_ln_bias)
             if self.has_cache_kv:
-                cache_kvs.append(
-                    paddle.to_tensor(
-                        cache_kv, stop_gradient=False))
-
-        final_out = fused_multi_transformer(
-            x,
-            ln_scales,
-            ln_biases,
-            qkv_weights,
-            qkv_biases,
-            out_weights,
-            out_biases,
-            ffn_ln_scales,
-            ffn_ln_biases,
-            ffn1_weights,
-            ffn1_biases,
-            ffn2_weights,
-            ffn2_biases,
-            pre_layer_norm=self.pre_layer_norm,
-            epsilon=epsilon,
-            cache_kvs=cache_kvs,
-            time_step=time_step,
-            attn_mask=attn_mask,
-            dropout_rate=self.dropout_prob,
-            training=self.training)
+                cache_kvs.append(paddle.to_tensor(cache_kv,
+                                                  stop_gradient=False))
+
+        final_out = fused_multi_transformer(x,
+                                            ln_scales,
+                                            ln_biases,
+                                            qkv_weights,
+                                            qkv_biases,
+                                            out_weights,
+                                            out_biases,
+                                            ffn_ln_scales,
+                                            ffn_ln_biases,
+                                            ffn1_weights,
+                                            ffn1_biases,
+                                            ffn2_weights,
+                                            ffn2_biases,
+                                            pre_layer_norm=self.pre_layer_norm,
+                                            epsilon=epsilon,
+                                            cache_kvs=cache_kvs,
+                                            time_step=time_step,
+                                            attn_mask=attn_mask,
+                                            dropout_rate=self.dropout_prob,
+                                            training=self.training)
 
         if self.has_cache_kv:
             return final_out[0], final_out[1]
@@ -469,9 +462,9 @@ def test_fused_multi_transformer_op(self):
 
             if self.debug:
                 print("cache_k out timestep=128")
-                print(cache_kv_out[0].reshape([
-                    2, bsz, num_head, v_elems, max_seq_len, elems
-                ])[0, 0, 0, :, self.cache_length, :])
+                print(cache_kv_out[0].reshape(
+                    [2, bsz, num_head, v_elems, max_seq_len,
+                     elems])[0, 0, 0, :, self.cache_length, :])
 
                 print("cache_v out timestep=128")
                 print(cache_kv_out[0][1, 0, 0, self.cache_length, :])
@@ -492,18 +485,25 @@ def test_fused_multi_transformer_op(self):
 
                     cache_v = cache_kv_out[i][1, :, :, :self.cache_length, :]
 
-                    np.testing.assert_allclose(
-                        cache_k_ref, cache_k, rtol=self.rtol, atol=self.atol)
-                    np.testing.assert_allclose(
-                        cache_v_ref, cache_v, rtol=self.rtol, atol=self.atol)
+                    np.testing.assert_allclose(cache_k_ref,
+                                               cache_k,
+                                               rtol=self.rtol,
+                                               atol=self.atol)
+                    np.testing.assert_allclose(cache_v_ref,
+                                               cache_v,
+                                               rtol=self.rtol,
+                                               atol=self.atol)
                     if i == 0:
                         break
 
-        np.testing.assert_allclose(
-            final_out_ref, final_out, rtol=self.rtol, atol=self.atol)
+        np.testing.assert_allclose(final_out_ref,
+                                   final_out,
+                                   rtol=self.rtol,
+                                   atol=self.atol)
 
 
 class TestFusedMultiTransformerOpFp16(TestFusedMultiTransformerOp):
+
     def config(self):
         super().config()
         self.x_type = np.float16
@@ -511,6 +511,7 @@ def config(self):
 
 
 class TestFusedMultiTransformerOpCacheKV(TestFusedMultiTransformerOp):
+
     def config(self):
         super().config()
         self.has_cache_kv = True
@@ -520,6 +521,7 @@ def config(self):
 
 
 class TestFusedMultiTransformerOpCacheKVFp16(TestFusedMultiTransformerOp):
+
     def config(self):
         super().config()
         self.has_cache_kv = True
@@ -529,6 +531,7 @@ def config(self):
 
 
 class TestFusedMultiTransformerOpGenCacheKV(TestFusedMultiTransformerOp):
+
     def config(self):
         super().config()
         self.has_cache_kv = True
@@ -536,6 +539,7 @@ def config(self):
 
 
 class TestFusedMultiTransformerOpGenCacheKVFp16(TestFusedMultiTransformerOp):
+
     def config(self):
         super().config()
         self.has_cache_kv = True
diff --git a/python/paddle/fluid/tests/unittests/test_fused_multihead_matmul_op.py b/python/paddle/fluid/tests/unittests/test_fused_multihead_matmul_op.py
index d78e929fb60a1..0b05a660243cf 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_multihead_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_multihead_matmul_op.py
@@ -35,6 +35,7 @@ def stable_softmax(x):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "Paddle core is not compiled with CUDA")
 class TestFusedMultiheadMatmulOp(OpTest):
+
     def config(self):
         self.seq_len = 128
         self.size_per_head = 64
@@ -113,6 +114,7 @@ def test_check_output(self):
 
 
 class TestFusedMultiHeadMatmulOp2(TestFusedMultiheadMatmulOp):
+
     def config(self):
         self.seq_len = 256
         self.size_per_head = 32
diff --git a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
index 843b495e85b9a..882258239d09e 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py
@@ -21,6 +21,7 @@
 
 
 class TestFusedTransformerEncoderLayer(unittest.TestCase):
+
     def setActivation(self):
         self.activation = 'gelu'
 
@@ -60,7 +61,8 @@ def setUp(self):
     def fused_weight(self, weight, num_head):
         a = paddle.transpose(weight, perm=[1, 0])
         return paddle.reshape(
-            a, shape=[1, num_head, int(a.shape[0] / num_head), a.shape[1]])
+            a, shape=[1, num_head,
+                      int(a.shape[0] / num_head), a.shape[1]])
 
     def fused_qkv(self, q, k, v, num_head):
         fq = self.fused_weight(q, num_head)
@@ -80,10 +82,9 @@ def test_out(self):
                              self.embed_dim).astype(self.dtype)
 
         if self.has_attn_mask:
-            attn_mask = np.ones(
-                (self.batch_size, self.num_heads, self.query_length,
-                 self.key_length),
-                dtype=self.attn_mask_type)
+            attn_mask = np.ones((self.batch_size, self.num_heads,
+                                 self.query_length, self.key_length),
+                                dtype=self.attn_mask_type)
             attn_mask_tensor = paddle.to_tensor(attn_mask)
         else:
             attn_mask = None
@@ -91,9 +92,8 @@ def test_out(self):
 
         dout = np.random.random(src.shape).astype(self.dtype)
 
-        base_out = base_encoder(
-            paddle.to_tensor(
-                src, stop_gradient=False), attn_mask_tensor)
+        base_out = base_encoder(paddle.to_tensor(src, stop_gradient=False),
+                                attn_mask_tensor)
         paddle.autograd.backward([base_out], [paddle.to_tensor(dout)], True)
 
         fused_encoder = FusedTransformerEncoderLayer(
@@ -138,12 +138,12 @@ def test_out(self):
         tmp = paddle.concat(x=[q_bias, k_bias, v_bias], axis=0)
         qkv_bias = paddle.reshape(
             tmp,
-            shape=[3, self.num_heads, int(tmp.shape[0] / 3 / self.num_heads)])
+            shape=[3, self.num_heads,
+                   int(tmp.shape[0] / 3 / self.num_heads)])
         fused_encoder.fused_attn.qkv_bias.set_value(qkv_bias)
 
-        fused_out = fused_encoder(
-            paddle.to_tensor(
-                src, stop_gradient=False), attn_mask_tensor)
+        fused_out = fused_encoder(paddle.to_tensor(src, stop_gradient=False),
+                                  attn_mask_tensor)
         paddle.autograd.backward([fused_out], [paddle.to_tensor(dout)], True)
 
         correct_ffn_str = 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}'.format(
@@ -158,35 +158,40 @@ def test_out(self):
             self.pre_layer_norm, False, self.dtype)
         self.assertTrue(fused_encoder.fused_attn.extra_repr(), correct_attn_str)
 
-        np.testing.assert_allclose(
-            fused_out.numpy(), base_out.numpy(), rtol=self.rtol, atol=self.atol)
+        np.testing.assert_allclose(fused_out.numpy(),
+                                   base_out.numpy(),
+                                   rtol=self.rtol,
+                                   atol=self.atol)
         self.assertTrue(
-            np.allclose(
-                fused_out.grad.numpy(),
-                base_out.grad.numpy(),
-                rtol=self.rtol,
-                atol=self.atol))
+            np.allclose(fused_out.grad.numpy(),
+                        base_out.grad.numpy(),
+                        rtol=self.rtol,
+                        atol=self.atol))
 
 
 class TestFusedTransformerEncoderLayerAct(TestFusedTransformerEncoderLayer):
+
     def setActivation(self):
         self.activation = 'relu'
 
 
 class TestFusedTransformerEncoderLayerPreLayerNorm(
         TestFusedTransformerEncoderLayer):
+
     def setPreLayerNorm(self):
         self.pre_layer_norm = True
 
 
 class TestFusedTransformerEncoderLayerAttnMaskIsNone(
         TestFusedTransformerEncoderLayer):
+
     def setAttnMask(self):
         self.has_attn_mask = False
 
 
 class TestFusedTransformerEncoderLayerPreLnTrueAttnMaskIsNone(
         TestFusedTransformerEncoderLayer):
+
     def setPreLayerNorm(self):
         self.pre_layer_norm = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
index c241fc65d9b82..dd4707c3fc366 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
@@ -37,8 +37,7 @@ def fusion_gru(
                lod,
                h0,
                wh,
-               np.zeros(
-                   (1, wh.shape[1]), dtype='float32'),
+               np.zeros((1, wh.shape[1]), dtype='float32'),
                is_reverse,
                act_state,
                act_gate,
@@ -46,6 +45,7 @@ def fusion_gru(
 
 
 class TestFusionGRUOp(OpTest):
+
     def set_confs(self):
         pass
 
@@ -76,9 +76,10 @@ def setUp(self):
             N, self.D).astype('float32') if self.with_h0 else np.zeros(
                 (N, self.D), dtype='float32')
 
-        _, _, _, hidden = fusion_gru(
-            x, self.lod, h0, wx, wh, bias, self.is_reverse, self.origin_mode,
-            ACTIVATION[self.act_state], ACTIVATION[self.act_gate])
+        _, _, _, hidden = fusion_gru(x, self.lod, h0, wx, wh, bias,
+                                     self.is_reverse, self.origin_mode,
+                                     ACTIVATION[self.act_state],
+                                     ACTIVATION[self.act_gate])
 
         self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh}
 
@@ -105,39 +106,46 @@ def test_check_output(self):
 
 
 class TestFusionGRUOpNoInitial(TestFusionGRUOp):
+
     def set_confs(self):
         self.with_h0 = False
 
 
 class TestFusionGRUOpNoBias(TestFusionGRUOp):
+
     def set_confs(self):
         self.with_bias = False
 
 
 class TestFusionGRUOpReverse(TestFusionGRUOp):
+
     def set_confs(self):
         self.is_reverse = True
 
 
 class TestFusionGRUOpMD1(TestFusionGRUOp):
+
     def set_confs(self):
         self.M = 36
         self.D = 8
 
 
 class TestFusionGRUOpMD2(TestFusionGRUOp):
+
     def set_confs(self):
         self.M = 8
         self.D = 8
 
 
 class TestFusionGRUOpMD3(TestFusionGRUOp):
+
     def set_confs(self):
         self.M = 17
         self.D = 15
 
 
 class TestFusionGRUOpBS1(TestFusionGRUOp):
+
     def set_confs(self):
         self.lod = [[3]]
         self.D = 16
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
index 4899927a7694f..c7dfaa1d90747 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -38,12 +38,12 @@ def fusion_lstm(
         act_gate=None,
         act_cell=None,
         act_cand=None):
-    return lstm(
-        fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
-        act_cell, act_cand)
+    return lstm(fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
+                act_cell, act_cand)
 
 
 class TestFusionLSTMOp(OpTest):
+
     def set_conf(self):
         pass
 
@@ -122,63 +122,74 @@ def test_check_output(self):
 
 
 class TestFusionLSTMOpInit(TestFusionLSTMOp):
+
     def set_conf(self):
         self.has_initial_state = True
 
 
 class TestFusionLSTMOpReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.is_reverse = True
 
 
 class TestFusionLSTMOpInitReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.has_initial_state = True
         self.is_reverse = True
 
 
 class TestFusionLSTMOpMD1(TestFusionLSTMOp):
+
     def set_conf(self):
         self.M = 36
         self.D = 8
 
 
 class TestFusionLSTMOpMD2(TestFusionLSTMOp):
+
     def set_conf(self):
         self.M = 8
         self.D = 8
 
 
 class TestFusionLSTMOpMD3(TestFusionLSTMOp):
+
     def set_conf(self):
         self.M = 15
         self.D = 3
 
 
 class TestFusionLSTMOpBS1(TestFusionLSTMOp):
+
     def set_conf(self):
         self.lod = [[3]]
         self.D = 16
 
 
 class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
 
 
 class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.has_initial_state = True
 
 
 class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.is_reverse = True
 
 
 class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.has_initial_state = True
@@ -186,6 +197,7 @@ def set_conf(self):
 
 
 class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
+
     def set_conf(self):
         self.use_peepholes = True
         self.lod = [[2]]
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py b/python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py
index aa2440803490c..c32d1db5e5d98 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py
@@ -21,6 +21,7 @@
 
 
 class TestFusionRepeatedFCReluOp(OpTest):
+
     def setUp(self):
         self.bs = 3
         self.ic = 9
@@ -38,20 +39,21 @@ def setUp(self):
         i = 0
         matrix = MatrixGenerate(self.bs, ics[i], self.oc[i], 1, 1)
         inp = np.reshape(matrix.input, [self.bs, ics[i]])
-        weights.append(('W_{0}'.format(i), np.reshape(matrix.weights,
-                                                      [ics[i], self.oc[i]])))
+        weights.append(
+            ('W_{0}'.format(i), np.reshape(matrix.weights,
+                                           [ics[i], self.oc[i]])))
         biases.append(('B_{0}'.format(i), matrix.bias))
         outs.append(
-            np.reshape(
-                np.maximum(fc_refer(matrix, True), 0), [self.bs, self.oc[i]]))
+            np.reshape(np.maximum(fc_refer(matrix, True), 0),
+                       [self.bs, self.oc[i]]))
 
         for i in range(sz - 1):
             matrix = MatrixGenerate(self.bs, ics[i + 1], self.oc[i + 1], 1, 1)
             matrix.input = np.reshape(outs[i], [self.bs, ics[i + 1], 1, 1])
             out = fc_refer(matrix, True)
-            weights.append(
-                ('W_{0}'.format(i + 1),
-                 np.reshape(matrix.weights, [ics[i + 1], self.oc[i + 1]])))
+            weights.append(('W_{0}'.format(i + 1),
+                            np.reshape(matrix.weights,
+                                       [ics[i + 1], self.oc[i + 1]])))
             biases.append(('B_{0}'.format(i + 1), matrix.bias))
             outs.append(
                 np.reshape(np.maximum(out, 0), [self.bs, self.oc[i + 1]]))
@@ -76,6 +78,7 @@ def set_conf(self):
 
 
 class TestFusionRepeatedFCReluOpBS1(TestFusionRepeatedFCReluOp):
+
     def set_conf(self):
         self.bs = 1
         self.oc = [4, 2, 7, 5, 512, 1024]
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
index b6d643c357140..fc40d6dc21d8d 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py
@@ -22,6 +22,7 @@
 
 
 class TestSeqConvEltAddRelu(OpTest):
+
     def set_conf(self):
         pass
 
@@ -40,8 +41,8 @@ def setUp(self):
         T = sum(self.lod[0])
         x = np.random.uniform(-1, 1, [T, self.in_fea_size]).astype('float32')
         w = np.random.uniform(
-            -1, 1, [self.in_fea_size * self.context_length,
-                    self.out_fea_size]).astype('float32')
+            -1, 1, [self.in_fea_size * self.context_length, self.out_fea_size
+                    ]).astype('float32')
         b = np.random.uniform(-2, 1, [1, self.out_fea_size]).astype('float32')
         out = seqconv(x, self.lod, w, self.context_length, self.context_start)
         out = np.maximum(out + b, 0)
@@ -59,16 +60,19 @@ def test_check_output(self):
 
 
 class TestSeqConvEltAddReluBS1(TestSeqConvEltAddRelu):
+
     def set_conf(self):
         self.lod = [[10]]
 
 
 class TestSeqConvEltAddReluBS1Case2(TestSeqConvEltAddRelu):
+
     def set_conf(self):
         self.lod = [[2]]
 
 
 class TestSeqConvEltAddReluCase1(TestSeqConvEltAddRelu):
+
     def set_conf(self):
         self.lod = [[3, 5, 1, 6]]
         self.context_length = 3
@@ -76,6 +80,7 @@ def set_conf(self):
 
 
 class TestSeqConvEltAddReluCase2(TestSeqConvEltAddRelu):
+
     def set_conf(self):
         self.lod = [[10, 1, 2, 4, 1, 5, 6]]
         self.in_fea_size = 2
@@ -84,6 +89,7 @@ def set_conf(self):
 
 
 class TestSeqConvEltAddReluCase3(TestSeqConvEltAddRelu):
+
     def set_conf(self):
         self.lod = [[10, 1, 2, 4, 1, 5, 6]]
         self.context_length = 5
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
index 702545d2ee4ca..d519d3eee2d82 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py
@@ -47,6 +47,7 @@ def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act):
 
 
 class TestFusionSeqExpandConcatFCOp(OpTest):
+
     def set_conf(self):
         pass
 
@@ -73,8 +74,8 @@ def setUp(self):
         # fc weight and bias
         w = np.random.normal(size=(sum(self.inputs_M),
                                    self.D)).astype('float32')
-        b = np.random.normal(size=(
-            1, self.D)).astype('float32') if self.with_bias else np.zeros(
+        b = np.random.normal(
+            size=(1, self.D)).astype('float32') if self.with_bias else np.zeros(
                 (1, self.D)).astype('float32')
 
         out = fusion_seqexpand_concat_fc(xs, self.lod, w, b,
@@ -96,40 +97,47 @@ def test_check_output(self):
 
 
 class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp):
+
     def set_conf(self):
         self.with_bias = False
 
 
 class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp):
+
     def set_conf(self):
         self.fc_act = 'identity'
 
 
 class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp):
+
     def set_conf(self):
         self.inputs_M = [3, 4, 2, 1, 5]
         self.D = 8
 
 
 class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp):
+
     def set_conf(self):
         self.lod = [[5, 6]]
         self.inputs_M = [1, 1]
 
 
 class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp):
+
     def set_conf(self):
         self.lod = [[1]]
         self.inputs_M = [3, 4, 2]
 
 
 class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp):
+
     def set_conf(self):
         self.lod = [[1]]
         self.inputs_M = [3, 4]
 
 
 class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp):
+
     def set_conf(self):
         self.lod = [[5]]
         self.inputs_M = [6, 3]
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
index fa42f5d09b86a..34ce7beea22ff 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
@@ -22,6 +22,7 @@
 
 
 class TestFusionSeqPoolConcatOp(OpTest):
+
     def setUp(self):
         self.w = 11
         self.lods = [[[2, 3, 5]], [[1, 5, 2]]]
@@ -69,22 +70,26 @@ def test_check_output(self):
 
 
 class TestFusionSeqPoolConcatOpCase1(TestFusionSeqPoolConcatOp):
+
     def set_conf(self):
         self.lods = [[[1]]]
 
 
 class TestFusionSeqPoolConcatOpCase2(TestFusionSeqPoolConcatOp):
+
     def set_conf(self):
         self.lods = [[[1]], [[1]], [[1]]]
 
 
 class TestFusionSeqPoolConcatOpCase3(TestFusionSeqPoolConcatOp):
+
     def set_conf(self):
         self.lods = [[[1, 3, 4, 6]]]
         self.w = 10
 
 
 class TestFusionSeqPoolConcatOpCase4(TestFusionSeqPoolConcatOp):
+
     def set_conf(self):
         self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
         self.w = 3
@@ -92,11 +97,14 @@ def set_conf(self):
 
 ## test avg pool and sqrt
 def create_test_avg_sqrt_class(parent):
+
     class TestSeqPoolAvgCase(parent):
+
         def set_pooltype(self):
             self.pooltype = "AVERAGE"
 
     class TestSeqPoolSqrtCase(parent):
+
         def set_pooltype(self):
             self.pooltype = "SQRT"
 
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py
index eb681b1f167ad..8d3ac3e19ad59 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py
@@ -23,6 +23,7 @@
 
 
 class TestFusionSeqPoolCVMConcatOp(OpTest):
+
     def setUp(self):
         self.w = 11
         self.use_cvm = True
@@ -76,22 +77,26 @@ def test_check_output(self):
 
 
 class TestFusionSeqPoolCVMConcatOpCase1(TestFusionSeqPoolCVMConcatOp):
+
     def set_conf(self):
         self.lods = [[[1]]]
 
 
 class TestFusionSeqPoolCVMConcatOpCase2(TestFusionSeqPoolCVMConcatOp):
+
     def set_conf(self):
         self.lods = [[[1]], [[1]], [[1]]]
 
 
 class TestFusionSeqPoolCVMConcatOpCase3(TestFusionSeqPoolCVMConcatOp):
+
     def set_conf(self):
         self.lods = [[[1, 3, 4, 6]]]
         self.w = 10
 
 
 class TestFusionSeqPoolCVMConcatOpCase4(TestFusionSeqPoolCVMConcatOp):
+
     def set_conf(self):
         self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
         self.w = 3
@@ -99,11 +104,14 @@ def set_conf(self):
 
 ## test avg pool and sqrt
 def create_test_avg_sqrt_class(parent):
+
     class TestSeqPoolAvgCase(parent):
+
         def set_pooltype(self):
             self.pooltype = "AVERAGE"
 
     class TestSeqPoolSqrtCase(parent):
+
         def set_pooltype(self):
             self.pooltype = "SQRT"
 
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py b/python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py
index a097d3d9a20f0..6bf1e308585d0 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py
@@ -20,6 +20,7 @@
 
 
 class TestFusionSquaredMatSubOp(OpTest):
+
     def setUp(self):
         self.op_type = 'fusion_squared_mat_sub'
         self.m = 11
@@ -35,7 +36,9 @@ def setUp(self):
             'Out':
             (np.dot(matx, maty)**2 - np.dot(matx**2, maty**2)) * self.scalar
         }
-        self.attrs = {'scalar': self.scalar, }
+        self.attrs = {
+            'scalar': self.scalar,
+        }
 
     def set_conf(self):
         pass
@@ -45,6 +48,7 @@ def test_check_output(self):
 
 
 class TestFusionSquaredMatSubOpCase1(TestFusionSquaredMatSubOp):
+
     def set_conf(self):
         self.scalar = -0.3
 
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
index 9fe1df39d3a5e..dc827f27ab411 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
@@ -23,6 +23,7 @@
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFusionTransposeFlattenConcationOp(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "fusion_transpose_flatten_concat"
@@ -63,6 +64,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCase1(TestFusionTransposeFlattenConcationOp):
+
     def init_test_case(self):
         self.shapes = [(3, 4, 18, 17), (3, 8, 18, 7), (6, 12, 9, 5)]
         self.trans_axis = (0, 2, 3, 1)
@@ -73,6 +75,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCase2(TestFusionTransposeFlattenConcationOp):
+
     def init_test_case(self):
         self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
         self.trans_axis = (0, 2, 3, 1)
@@ -83,6 +86,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCase3(TestFusionTransposeFlattenConcationOp):
+
     def init_test_case(self):
         self.shapes = [(3, 8, 20, 17), (3, 8, 19, 17), (3, 8, 40, 17)]
         self.trans_axis = (0, 3, 2, 1)
@@ -93,6 +97,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCase4(TestFusionTransposeFlattenConcationOp):
+
     def init_test_case(self):
         self.shapes = [(3, 8, 9, 17), (8, 3, 9, 17), (4, 6, 9, 17)]
         self.trans_axis = (0, 2, 1, 3)
@@ -103,6 +108,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCase5(TestFusionTransposeFlattenConcationOp):
+
     def init_test_case(self):
         self.shapes = [(3, 8, 9, 17, 2), (3, 8, 2, 17, 9), (3, 17, 9, 8, 2)]
         self.trans_axis = (0, 2, 1, 4, 3)
diff --git a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py
index 8404c563274b1..1d37558bcfaef 100644
--- a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py
+++ b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py
@@ -22,6 +22,7 @@
 
 
 class GastNodeTransformer(gast.NodeTransformer):
+
     def __init__(self, root):
         self.root = root
 
@@ -123,6 +124,7 @@ def code_ast(source):
 
 
 class TestPythonCompatibility(unittest.TestCase):
+
     def _check_compatibility(self, source, target):
         source_dump = code_gast_ast(source)
         target_dump = code_ast(target)
@@ -144,12 +146,12 @@ def foo(x_new, y_new):
         self._check_compatibility(source, target)
 
     # The 0.3.3 version of gast has a bug in python3.8 that
-    # would cause the following tests to fail. But this 
-    # problem doesn't affect the use of Paddle's related 
-    # functions, therefore, the following tests would be 
+    # would cause the following tests to fail. But this
+    # problem doesn't affect the use of Paddle's related
+    # functions, therefore, the following tests would be
     # disable in python3.8.
     #
-    # This problem had been fixed and updated to version 
+    # This problem had been fixed and updated to version
     # 0.4.1 of gast.
     #
     # More information please refer to:
diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
index ac2d980f7fd38..31cf8cdc3a744 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
@@ -41,6 +41,7 @@ def test_check_grad(self):
 
 
 class TestGatherNdOpWithIndex1(OpTest):
+
     def setUp(self):
         self.op_type = "gather_nd"
         self.python_api = paddle.gather_nd
@@ -157,9 +158,11 @@ def test_check_grad(self):
 
 #Test Python API
 class TestGatherNdOpAPI(unittest.TestCase):
+
     def test_case1(self):
-        x1 = fluid.layers.data(
-            name='x1', shape=[30, 40, 50, 60], dtype='float32')
+        x1 = fluid.layers.data(name='x1',
+                               shape=[30, 40, 50, 60],
+                               dtype='float32')
         index1 = fluid.layers.data(name='index1', shape=[2, 4], dtype='int32')
         output1 = fluid.layers.gather_nd(x1, index1)
 
@@ -176,13 +179,17 @@ def test_case3(self):
 
 #Test Raise Index Error
 class TestGatherNdOpRaise(unittest.TestCase):
+
     def test_check_raise(self):
+
         def check_raise_is_test():
             try:
-                x = fluid.layers.data(
-                    name='x', shape=[3, 4, 5], dtype='float32')
-                index = fluid.layers.data(
-                    name='index', shape=[2, 10], dtype='int32')
+                x = fluid.layers.data(name='x',
+                                      shape=[3, 4, 5],
+                                      dtype='float32')
+                index = fluid.layers.data(name='index',
+                                          shape=[2, 10],
+                                          dtype='int32')
                 output = fluid.layers.gather_nd(x, index)
             except Exception as e:
                 t = \
@@ -194,6 +201,7 @@ def check_raise_is_test():
 
 
 class TestGatherNdError(unittest.TestCase):
+
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -201,8 +209,9 @@ def test_error(self):
             shape = [8, 9, 6]
             x = paddle.fluid.data(shape=shape, dtype='float32', name='x')
             index = paddle.fluid.data(shape=shape, dtype='bool', name='index')
-            index_float = paddle.fluid.data(
-                shape=shape, dtype='float32', name='index_float')
+            index_float = paddle.fluid.data(shape=shape,
+                                            dtype='float32',
+                                            name='index_float')
             np_x = np.random.random(shape).astype('float32')
             np_index = np.array(np.random.randint(2, size=shape, dtype=bool))
 
@@ -223,6 +232,7 @@ def test_index_dtype():
 
 
 class TestGatherNdAPI2(unittest.TestCase):
+
     def test_static(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64')
@@ -232,8 +242,10 @@ def test_static(self):
             exe = fluid.Executor(place)
             input = np.array([[1, 2], [3, 4], [5, 6]])
             index_1 = np.array([[1]])
-            result, = exe.run(feed={"data1": input,
-                                    "index": index_1},
+            result, = exe.run(feed={
+                "data1": input,
+                "index": index_1
+            },
                               fetch_list=[out])
             expected_output = np.array([[3, 4]])
         self.assertTrue(np.allclose(result, expected_output))
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 3d7dc2da052f3..0c356f4bc38cd 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -31,6 +31,7 @@ def gather_numpy(x, index, axis):
 
 
 class TestGatherOp(OpTest):
+
     def setUp(self):
         self.op_type = "gather"
         self.python_api = paddle.gather
@@ -59,6 +60,7 @@ def config(self):
 
 
 class TestCase1(TestGatherOp):
+
     def config(self):
         """
         For one dimension input
@@ -70,6 +72,7 @@ def config(self):
 
 
 class TestCase2(TestGatherOp):
+
     def config(self):
         """
         For int64_t index type
@@ -81,6 +84,7 @@ def config(self):
 
 
 class TestCase3(TestGatherOp):
+
     def config(self):
         """
         For other input type
@@ -92,6 +96,7 @@ def config(self):
 
 
 class TestCase4(TestGatherOp):
+
     def config(self):
         self.x_shape = (10, 20)
         self.attrs = {'overwrite': False}
@@ -101,6 +106,7 @@ def config(self):
 
 
 class TestCase5(TestGatherOp):
+
     def config(self):
         self.x_shape = (10, 20)
         self.attrs = {'overwrite': False}
@@ -110,6 +116,7 @@ def config(self):
 
 
 class TestCase6(TestGatherOp):
+
     def config(self):
         self.x_shape = (10, 20)
         self.attrs = {'overwrite': True}
@@ -119,6 +126,7 @@ def config(self):
 
 
 class TestGatherBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "gather"
         self.python_api = paddle.gather
@@ -153,6 +161,7 @@ def config(self):
 
 
 class TestGatherOp1(OpTest):
+
     def setUp(self):
         self.op_type = "gather"
         self.python_api = paddle.gather
@@ -183,6 +192,7 @@ def config(self):
 
 
 class TestGatherOp2(TestGatherOp1):
+
     def config(self):
         """
         For multi-dimension input
@@ -196,6 +206,7 @@ def config(self):
 
 
 class TestGatherOp3(TestGatherOp1):
+
     def config(self):
         """
         For multi-dimension input
@@ -209,6 +220,7 @@ def config(self):
 
 
 class TestGatherOp4(TestGatherOp1):
+
     def config(self):
         """
         For multi-dimension input
@@ -223,6 +235,7 @@ def config(self):
 
 
 class API_TestGather(unittest.TestCase):
+
     def test_out1(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64')
@@ -232,8 +245,10 @@ def test_out1(self):
             exe = fluid.Executor(place)
             input = np.array([[1, 2], [3, 4], [5, 6]])
             index_1 = np.array([1, 2])
-            result, = exe.run(feed={"data1": input,
-                                    "index": index_1},
+            result, = exe.run(feed={
+                "data1": input,
+                "index": index_1
+            },
                               fetch_list=[out])
             expected_output = np.array([[3, 4], [5, 6]])
         self.assertTrue(np.allclose(result, expected_output))
@@ -250,16 +265,18 @@ def test_out2(self):
             x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype('float64')
             index_np = np.array([1, 1]).astype('int32')
             axis_np = np.array([1]).astype('int32')
-            result, = exe.run(
-                feed={"x": x_np,
-                      "index": index_np,
-                      'axis': axis_np},
-                fetch_list=[out])
+            result, = exe.run(feed={
+                "x": x_np,
+                "index": index_np,
+                'axis': axis_np
+            },
+                              fetch_list=[out])
             expected_output = gather_numpy(x_np, index_np, axis_np[0])
         self.assertTrue(np.allclose(result, expected_output))
 
 
 class API_TestDygraphGather(unittest.TestCase):
+
     def test_out1(self):
         paddle.disable_static()
         input_1 = np.array([[1, 2], [3, 4], [5, 6]])
@@ -304,8 +321,8 @@ def test_large_data(self):
 
         def test_dygraph():
             with fluid.dygraph.guard():
-                gpu_out = paddle.gather(
-                    paddle.to_tensor(x), paddle.to_tensor(index))
+                gpu_out = paddle.gather(paddle.to_tensor(x),
+                                        paddle.to_tensor(index))
                 return gpu_out.numpy()
 
         @switch_to_static_graph
@@ -313,8 +330,9 @@ def test_static_graph():
             with paddle.static.program_guard(paddle.static.Program(),
                                              paddle.static.Program()):
                 x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape)
-                index_t = paddle.static.data(
-                    name="index", dtype=index.dtype, shape=index.shape)
+                index_t = paddle.static.data(name="index",
+                                             dtype=index.dtype,
+                                             shape=index.shape)
                 out_t = paddle.gather(x_t, index_t)
                 feed = {x_t.name: x, index_t.name: index}
                 fetch = [out_t]
@@ -327,6 +345,7 @@ def test_static_graph():
 
 
 class TestGathertError(unittest.TestCase):
+
     def test_error1(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -335,8 +354,9 @@ def test_error1(self):
             x = paddle.fluid.data(shape=shape, dtype='int8', name='x')
             axis = paddle.fluid.data(shape=[1], dtype='float32', name='axis')
             index = paddle.fluid.data(shape=shape, dtype='int32', name='index')
-            index_float = paddle.fluid.data(
-                shape=shape, dtype='float32', name='index_float')
+            index_float = paddle.fluid.data(shape=shape,
+                                            dtype='float32',
+                                            name='index_float')
 
             def test_x_type():
                 paddle.gather(x, index)
@@ -364,8 +384,9 @@ def test_error2(self):
             shape = [8, 9, 6]
             x = fluid.data(shape=shape, dtype='int8', name='x')
             index = fluid.data(shape=shape, dtype='int32', name='mask')
-            index_float = fluid.data(
-                shape=shape, dtype='float32', name='index_float')
+            index_float = fluid.data(shape=shape,
+                                     dtype='float32',
+                                     name='index_float')
 
             def test_x_type():
                 paddle.fluid.layers.gather(x, index)
@@ -379,6 +400,7 @@ def test_index_type():
 
 
 class TestCheckOutType(unittest.TestCase):
+
     def test_out_type(self):
         data = paddle.static.data(shape=[16, 10], dtype='int64', name='x')
         index = paddle.static.data(shape=[4], dtype='int64', name='index')
diff --git a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py
index 6fe68c5d34ffa..f3a5acc048404 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py
@@ -23,14 +23,17 @@
 
 
 class TestGatherTreeOp(OpTest):
+
     def setUp(self):
         self.op_type = "gather_tree"
         self.python_api = paddle.nn.functional.gather_tree
         max_length, batch_size, beam_size = 5, 2, 2
-        ids = np.random.randint(
-            0, high=10, size=(max_length, batch_size, beam_size))
-        parents = np.random.randint(
-            0, high=beam_size, size=(max_length, batch_size, beam_size))
+        ids = np.random.randint(0,
+                                high=10,
+                                size=(max_length, batch_size, beam_size))
+        parents = np.random.randint(0,
+                                    high=beam_size,
+                                    size=(max_length, batch_size, beam_size))
         self.inputs = {"Ids": ids, "Parents": parents}
         self.outputs = {'Out': self.backtrace(ids, parents)}
 
@@ -53,40 +56,41 @@ def backtrace(ids, parents):
 
 
 class TestGatherTreeOpAPI(unittest.TestCase):
+
     def test_case(self):
         paddle.enable_static()
-        ids = fluid.layers.data(
-            name='ids', shape=[5, 2, 2], dtype='int64', append_batch_size=False)
-        parents = fluid.layers.data(
-            name='parents',
-            shape=[5, 2, 2],
-            dtype='int64',
-            append_batch_size=False)
+        ids = fluid.layers.data(name='ids',
+                                shape=[5, 2, 2],
+                                dtype='int64',
+                                append_batch_size=False)
+        parents = fluid.layers.data(name='parents',
+                                    shape=[5, 2, 2],
+                                    dtype='int64',
+                                    append_batch_size=False)
         final_sequences = fluid.layers.gather_tree(ids, parents)
         paddle.disable_static()
 
     def test_case2(self):
-        ids = paddle.to_tensor(
-            [[[2, 2], [6, 1]], [[3, 9], [6, 1]], [[0, 1], [9, 0]]])
-        parents = paddle.to_tensor(
-            [[[0, 0], [1, 1]], [[1, 0], [1, 0]], [[0, 0], [0, 1]]])
+        ids = paddle.to_tensor([[[2, 2], [6, 1]], [[3, 9], [6, 1]],
+                                [[0, 1], [9, 0]]])
+        parents = paddle.to_tensor([[[0, 0], [1, 1]], [[1, 0], [1, 0]],
+                                    [[0, 0], [0, 1]]])
         final_sequences = paddle.nn.functional.gather_tree(ids, parents)
 
 
 class TestGatherTreeOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
-            ids = fluid.layers.data(
-                name='ids',
-                shape=[5, 2, 2],
-                dtype='int64',
-                append_batch_size=False)
-            parents = fluid.layers.data(
-                name='parents',
-                shape=[5, 2, 2],
-                dtype='int64',
-                append_batch_size=False)
+            ids = fluid.layers.data(name='ids',
+                                    shape=[5, 2, 2],
+                                    dtype='int64',
+                                    append_batch_size=False)
+            parents = fluid.layers.data(name='parents',
+                                        shape=[5, 2, 2],
+                                        dtype='int64',
+                                        append_batch_size=False)
 
             def test_Variable_ids():
                 # the input type must be Variable
@@ -104,22 +108,20 @@ def test_Variable_parents():
 
             def test_type_ids():
                 # dtype must be int32 or int64
-                bad_ids = fluid.layers.data(
-                    name='bad_ids',
-                    shape=[5, 2, 2],
-                    dtype='float32',
-                    append_batch_size=False)
+                bad_ids = fluid.layers.data(name='bad_ids',
+                                            shape=[5, 2, 2],
+                                            dtype='float32',
+                                            append_batch_size=False)
                 fluid.layers.gather_tree(bad_ids, parents)
 
             self.assertRaises(TypeError, test_type_ids)
 
             def test_type_parents():
                 # dtype must be int32 or int64
-                bad_parents = fluid.layers.data(
-                    name='bad_parents',
-                    shape=[5, 2, 2],
-                    dtype='float32',
-                    append_batch_size=False)
+                bad_parents = fluid.layers.data(name='bad_parents',
+                                                shape=[5, 2, 2],
+                                                dtype='float32',
+                                                append_batch_size=False)
                 fluid.layers.gather_tree(ids, bad_parents)
 
             self.assertRaises(TypeError, test_type_parents)
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
index 4140ce44648fa..43eaa7bf6a125 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
@@ -28,6 +28,7 @@
 
 
 class TestGaussianRandomOp(OpTest):
+
     def setUp(self):
         self.op_type = "gaussian_random"
         self.python_api = paddle.normal
@@ -65,15 +66,14 @@ def verify_output(self, outs):
         hist2, _ = np.histogram(data, range=(-3, 5))
         hist2 = hist2.astype("float32")
         hist2 /= float(outs[0].size)
-        self.assertTrue(
-            np.allclose(
-                hist, hist2, rtol=0, atol=0.01),
-            "hist: " + str(hist) + " hist2: " + str(hist2))
+        self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01),
+                        "hist: " + str(hist) + " hist2: " + str(hist2))
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestGaussianRandomBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "gaussian_random"
         self.python_api = paddle.normal
@@ -97,8 +97,8 @@ def set_attrs(self):
         self.std = 2.
 
     def test_check_output(self):
-        self.check_output_with_place_customized(
-            self.verify_output, place=core.CUDAPlace(0))
+        self.check_output_with_place_customized(self.verify_output,
+                                                place=core.CUDAPlace(0))
 
     def test_eager(self):
         with _test_eager_guard():
@@ -114,13 +114,12 @@ def verify_output(self, outs):
         hist2, _ = np.histogram(data, range=(-3, 5))
         hist2 = hist2.astype("float32")
         hist2 /= float(outs[0].size)
-        self.assertTrue(
-            np.allclose(
-                hist, hist2, rtol=0, atol=0.05),
-            "hist: " + str(hist) + " hist2: " + str(hist2))
+        self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.05),
+                        "hist: " + str(hist) + " hist2: " + str(hist2))
 
 
 class TestMeanStdAreInt(TestGaussianRandomOp):
+
     def set_attrs(self):
         self.mean = 1
         self.std = 2
@@ -128,6 +127,7 @@ def set_attrs(self):
 
 # Situation 2: Attr(shape) is a list(with tensor)
 class TestGaussianRandomOp_ShapeTensorList(TestGaussianRandomOp):
+
     def setUp(self):
         '''Test gaussian_random op with specified value
         '''
@@ -161,8 +161,9 @@ def test_check_output(self):
         self.check_output_customized(self.verify_output)
 
 
-class TestGaussianRandomOp2_ShapeTensorList(
-        TestGaussianRandomOp_ShapeTensorList):
+class TestGaussianRandomOp2_ShapeTensorList(TestGaussianRandomOp_ShapeTensorList
+                                            ):
+
     def init_data(self):
         self.shape = [123, 92]
         self.infer_shape = [-1, -1]
@@ -172,8 +173,9 @@ def init_data(self):
         self.seed = 10
 
 
-class TestGaussianRandomOp3_ShapeTensorList(
-        TestGaussianRandomOp_ShapeTensorList):
+class TestGaussianRandomOp3_ShapeTensorList(TestGaussianRandomOp_ShapeTensorList
+                                            ):
+
     def init_data(self):
         self.shape = [123, 92]
         self.infer_shape = [123, -1]
@@ -183,8 +185,9 @@ def init_data(self):
         self.seed = 10
 
 
-class TestGaussianRandomOp4_ShapeTensorList(
-        TestGaussianRandomOp_ShapeTensorList):
+class TestGaussianRandomOp4_ShapeTensorList(TestGaussianRandomOp_ShapeTensorList
+                                            ):
+
     def init_data(self):
         self.shape = [123, 92]
         self.infer_shape = [123, -1]
@@ -196,6 +199,7 @@ def init_data(self):
 
 # Situation 3: shape is a tensor
 class TestGaussianRandomOp1_ShapeTensor(TestGaussianRandomOp):
+
     def setUp(self):
         '''Test gaussian_random op with specified value
         '''
@@ -222,53 +226,54 @@ def init_data(self):
 
 # Test python API
 class TestGaussianRandomAPI(unittest.TestCase):
+
     def test_api(self):
         positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2000)
 
         positive_2_int64 = fluid.layers.fill_constant([1], "int64", 500)
-        shape_tensor_int32 = fluid.data(
-            name="shape_tensor_int32", shape=[2], dtype="int32")
-
-        shape_tensor_int64 = fluid.data(
-            name="shape_tensor_int64", shape=[2], dtype="int64")
-
-        out_1 = fluid.layers.gaussian_random(
-            shape=[2000, 500], dtype="float32", mean=0.0, std=1.0, seed=10)
-
-        out_2 = fluid.layers.gaussian_random(
-            shape=[2000, positive_2_int32],
-            dtype="float32",
-            mean=0.,
-            std=1.0,
-            seed=10)
-
-        out_3 = fluid.layers.gaussian_random(
-            shape=[2000, positive_2_int64],
-            dtype="float32",
-            mean=0.,
-            std=1.0,
-            seed=10)
-
-        out_4 = fluid.layers.gaussian_random(
-            shape=shape_tensor_int32,
-            dtype="float32",
-            mean=0.,
-            std=1.0,
-            seed=10)
-
-        out_5 = fluid.layers.gaussian_random(
-            shape=shape_tensor_int64,
-            dtype="float32",
-            mean=0.,
-            std=1.0,
-            seed=10)
-
-        out_6 = fluid.layers.gaussian_random(
-            shape=shape_tensor_int64,
-            dtype=np.float32,
-            mean=0.,
-            std=1.0,
-            seed=10)
+        shape_tensor_int32 = fluid.data(name="shape_tensor_int32",
+                                        shape=[2],
+                                        dtype="int32")
+
+        shape_tensor_int64 = fluid.data(name="shape_tensor_int64",
+                                        shape=[2],
+                                        dtype="int64")
+
+        out_1 = fluid.layers.gaussian_random(shape=[2000, 500],
+                                             dtype="float32",
+                                             mean=0.0,
+                                             std=1.0,
+                                             seed=10)
+
+        out_2 = fluid.layers.gaussian_random(shape=[2000, positive_2_int32],
+                                             dtype="float32",
+                                             mean=0.,
+                                             std=1.0,
+                                             seed=10)
+
+        out_3 = fluid.layers.gaussian_random(shape=[2000, positive_2_int64],
+                                             dtype="float32",
+                                             mean=0.,
+                                             std=1.0,
+                                             seed=10)
+
+        out_4 = fluid.layers.gaussian_random(shape=shape_tensor_int32,
+                                             dtype="float32",
+                                             mean=0.,
+                                             std=1.0,
+                                             seed=10)
+
+        out_5 = fluid.layers.gaussian_random(shape=shape_tensor_int64,
+                                             dtype="float32",
+                                             mean=0.,
+                                             std=1.0,
+                                             seed=10)
+
+        out_6 = fluid.layers.gaussian_random(shape=shape_tensor_int64,
+                                             dtype=np.float32,
+                                             mean=0.,
+                                             std=1.0,
+                                             seed=10)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         res_1, res_2, res_3, res_4, res_5, res_6 = exe.run(
@@ -318,6 +323,7 @@ def test_default_fp64():
 
 
 class TestStandardNormalDtype(unittest.TestCase):
+
     def test_default_dtype(self):
         paddle.disable_static()
 
@@ -344,6 +350,7 @@ def test_default_fp64():
 
 
 class TestRandomValue(unittest.TestCase):
+
     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_gcd.py b/python/paddle/fluid/tests/unittests/test_gcd.py
index 820216dc56cd6..b3ada9cdaa6d1 100644
--- a/python/paddle/fluid/tests/unittests/test_gcd.py
+++ b/python/paddle/fluid/tests/unittests/test_gcd.py
@@ -26,6 +26,7 @@
 
 
 class TestGcdAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = 12
         self.y_np = 20
@@ -40,15 +41,17 @@ def test_static_graph(self):
             y = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
             out = paddle.gcd(x, y)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             res = exe.run(fluid.default_main_program(),
-                          feed={'input1': self.x_np,
-                                'input2': self.y_np},
+                          feed={
+                              'input1': self.x_np,
+                              'input2': self.y_np
+                          },
                           fetch_list=[out])
-            self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np)
-                             ).all())
+            self.assertTrue((np.array(res[0]) == np.gcd(self.x_np,
+                                                        self.y_np)).all())
 
     def test_dygraph(self):
         paddle.disable_static()
@@ -62,6 +65,7 @@ def test_dygraph(self):
 
 
 class TestGcdAPI2(TestGcdAPI):
+
     def setUp(self):
         self.x_np = np.arange(6).astype(np.int32)
         self.y_np = np.array([20]).astype(np.int32)
@@ -70,6 +74,7 @@ def setUp(self):
 
 
 class TestGcdAPI3(TestGcdAPI):
+
     def setUp(self):
         self.x_np = 0
         self.y_np = 20
@@ -78,6 +83,7 @@ def setUp(self):
 
 
 class TestGcdAPI4(TestGcdAPI):
+
     def setUp(self):
         self.x_np = 0
         self.y_np = 0
@@ -86,6 +92,7 @@ def setUp(self):
 
 
 class TestGcdAPI5(TestGcdAPI):
+
     def setUp(self):
         self.x_np = 12
         self.y_np = -20
diff --git a/python/paddle/fluid/tests/unittests/test_gelu_op.py b/python/paddle/fluid/tests/unittests/test_gelu_op.py
index abfb65c27a951..f6fa4e2da5967 100644
--- a/python/paddle/fluid/tests/unittests/test_gelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gelu_op.py
@@ -26,14 +26,15 @@
 
 def gelu(x, approximate):
     if approximate:
-        y_ref = 0.5 * x * (1.0 + np.tanh(
-            np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+        y_ref = 0.5 * x * (
+            1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
     else:
         y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2)))
     return y_ref.astype(x.dtype)
 
 
 class TestGeluOp(unittest.TestCase):
+
     def _test_case1_cpu(self, approximate):
         x = np.random.uniform(-1, 1, size=(11, 17)).astype(np.float32)
         y_ref = gelu(x, approximate)
@@ -89,8 +90,7 @@ def run_gelu_op(approximate):
         self.assertTrue(np.allclose(y_ref, y_fast_math, rtol=1e-5, atol=5e-4))
 
         self.assertTrue(
-            np.allclose(
-                x_g_ref, x_g_fast_math, rtol=1e-5, atol=5e-4))
+            np.allclose(x_g_ref, x_g_fast_math, rtol=1e-5, atol=5e-4))
 
     def test_fast_math_eager(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py b/python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py
index c5e48e27a75d5..8543912f04f2e 100644
--- a/python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gen_nccl_id_op.py
@@ -67,6 +67,7 @@ def run_gen_ncc_id(attr):
 
 
 class TestGenNcclIdOp(unittest.TestCase):
+
     def setUp(self):
         try:
             self._dist_ut_port_0 = int(os.environ["PADDLE_DIST_UT_PORT"])
diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
index 1d7ce33ea7ca2..8414cd941c207 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
@@ -285,6 +285,7 @@ def trans_lod(lod):
 
 
 class TestGenerateMaskLabels(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_generate_proposal_labels_out()
@@ -362,8 +363,9 @@ def generate_gt_polys(self):
                 lod1.append(poly_num)
                 pts = []
                 for j in range(poly_num):
-                    poly_size = np.random.randint(
-                        min_poly_size, max_poly_size, size=1)[0]
+                    poly_size = np.random.randint(min_poly_size,
+                                                  max_poly_size,
+                                                  size=1)[0]
                     x = np.random.rand(poly_size, 1) * w
                     y = np.random.rand(poly_size, 1) * h
                     xy = np.concatenate((x, y), axis=1)
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
index 2e9a5229e2ee0..d1bf246b5a75a 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
@@ -50,11 +50,12 @@ def generate_proposal_labels_in_python(rpn_rois,
 
     for im_i in range(len(im_info)):
         max_overlap = max_overlaps[im_i] if is_cascade_rcnn else None
-        frcn_blobs = _sample_rois(
-            rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i],
-            im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh,
-            bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
-            use_random, is_cls_agnostic, is_cascade_rcnn, max_overlap)
+        frcn_blobs = _sample_rois(rpn_rois[im_i], gt_classes[im_i],
+                                  is_crowd[im_i], gt_boxes[im_i], im_info[im_i],
+                                  batch_size_per_im, fg_fraction, fg_thresh,
+                                  bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
+                                  class_nums, use_random, is_cls_agnostic,
+                                  is_cascade_rcnn, max_overlap)
         lod.append(frcn_blobs['rois'].shape[0])
         rois.append(frcn_blobs['rois'])
         labels_int32.append(frcn_blobs['labels_int32'])
@@ -100,11 +101,11 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
     overlaps_max = proposal_to_gt_overlaps.max(axis=1)
     # Boxes which with non-zero overlap with gt boxes
     overlapped_boxes_ind = np.where(overlaps_max > 0)[0]
-    overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[
-        overlapped_boxes_ind]]
-    gt_overlaps[overlapped_boxes_ind,
-                overlapped_boxes_gt_classes] = overlaps_max[
-                    overlapped_boxes_ind]
+    overlapped_boxes_gt_classes = gt_classes[
+        overlaps_argmax[overlapped_boxes_ind]]
+    gt_overlaps[
+        overlapped_boxes_ind,
+        overlapped_boxes_gt_classes] = overlaps_max[overlapped_boxes_ind]
     box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[
         overlapped_boxes_ind]
 
@@ -116,8 +117,8 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
     if is_cascade_rcnn:
         # Cascade RCNN Decode Filter
         fg_inds = np.where(max_overlaps >= fg_thresh)[0]
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
+        bg_inds = np.where((max_overlaps < bg_thresh_hi)
+                           & (max_overlaps >= bg_thresh_lo))[0]
         fg_rois_per_this_image = fg_inds.shape[0]
         bg_rois_per_this_image = bg_inds.shape[0]
     else:
@@ -126,19 +127,21 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
         fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
         # Sample foreground if there are too many
         if (fg_inds.shape[0] > fg_rois_per_this_image) and use_random:
-            fg_inds = np.random.choice(
-                fg_inds, size=fg_rois_per_this_image, replace=False)
+            fg_inds = np.random.choice(fg_inds,
+                                       size=fg_rois_per_this_image,
+                                       replace=False)
         fg_inds = fg_inds[:fg_rois_per_this_image]
         # Background
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
+        bg_inds = np.where((max_overlaps < bg_thresh_hi)
+                           & (max_overlaps >= bg_thresh_lo))[0]
         bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
         bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
                                             bg_inds.shape[0])
         # Sample background if there are too many
         if (bg_inds.shape[0] > bg_rois_per_this_image) and use_random:
-            bg_inds = np.random.choice(
-                bg_inds, size=bg_rois_per_this_image, replace=False)
+            bg_inds = np.random.choice(bg_inds,
+                                       size=bg_rois_per_this_image,
+                                       replace=False)
         bg_inds = bg_inds[:bg_rois_per_this_image]
 
     keep_inds = np.append(fg_inds, bg_inds)
@@ -152,19 +155,18 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
                                           sampled_labels, bbox_reg_weights)
     bbox_targets, bbox_inside_weights = _expand_bbox_targets(
         bbox_label_targets, class_nums, is_cls_agnostic)
-    bbox_outside_weights = np.array(
-        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)
+    bbox_outside_weights = np.array(bbox_inside_weights > 0,
+                                    dtype=bbox_inside_weights.dtype)
     # Scale rois
     sampled_rois = sampled_boxes * im_scale
 
     # Faster RCNN blobs
-    frcn_blobs = dict(
-        rois=sampled_rois,
-        labels_int32=sampled_labels,
-        bbox_targets=bbox_targets,
-        bbox_inside_weights=bbox_inside_weights,
-        bbox_outside_weights=bbox_outside_weights,
-        max_overlap=sampled_max_overlap)
+    frcn_blobs = dict(rois=sampled_rois,
+                      labels_int32=sampled_labels,
+                      bbox_targets=bbox_targets,
+                      bbox_inside_weights=bbox_inside_weights,
+                      bbox_outside_weights=bbox_outside_weights,
+                      max_overlap=sampled_max_overlap)
     return frcn_blobs
 
 
@@ -198,11 +200,12 @@ def _compute_targets(roi_boxes, gt_boxes, labels, bbox_reg_weights):
 
     targets = np.zeros(roi_boxes.shape)
     bbox_reg_weights = np.asarray(bbox_reg_weights)
-    targets = _box_to_delta(
-        ex_boxes=roi_boxes, gt_boxes=gt_boxes, weights=bbox_reg_weights)
+    targets = _box_to_delta(ex_boxes=roi_boxes,
+                            gt_boxes=gt_boxes,
+                            weights=bbox_reg_weights)
 
-    return np.hstack([labels[:, np.newaxis], targets]).astype(
-        np.float32, copy=False)
+    return np.hstack([labels[:, np.newaxis], targets]).astype(np.float32,
+                                                              copy=False)
 
 
 def _box_to_delta(ex_boxes, gt_boxes, weights):
@@ -232,8 +235,8 @@ def _expand_bbox_targets(bbox_targets_input, class_nums, is_cls_agnostic):
     #	class_labels = [1 if ll > 0 else 0 for ll in class_labels]
     #    class_labels = np.array(class_labels, dtype=np.int32)
     #	class_nums = 2
-    bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums
-                             if not is_cls_agnostic else 4 * 2))
+    bbox_targets = np.zeros((class_labels.shape[0],
+                             4 * class_nums if not is_cls_agnostic else 4 * 2))
     bbox_inside_weights = np.zeros(bbox_targets.shape)
     for ind in fg_inds:
         class_label = int(class_labels[ind]) if not is_cls_agnostic else 1
@@ -245,6 +248,7 @@ def _expand_bbox_targets(bbox_targets_input, class_nums, is_cls_agnostic):
 
 
 class TestGenerateProposalLabelsOp(OpTest):
+
     def set_data(self):
         #self.use_random = False
         self.init_use_random()
@@ -320,8 +324,8 @@ def init_test_input(self):
             self.im_info[i, 1] = images_shape[i][1]
             self.im_info[i, 2] = 0.8  #scale
 
-        self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape,
-                                                               proposal_nums)
+        self.rpn_rois, self.rpn_rois_lod = _generate_proposals(
+            images_shape, proposal_nums)
         ground_truth, self.gts_lod = _generate_groundtruth(
             images_shape, self.class_nums, gt_nums)
 
@@ -350,6 +354,7 @@ def init_test_output(self):
 
 
 class TestCascade(TestGenerateProposalLabelsOp):
+
     def init_test_cascade(self):
         self.is_cascade_rcnn = True
         roi_num = len(self.rpn_rois[0])
@@ -361,6 +366,7 @@ def init_test_cascade(self):
 
 
 class TestUseRandom(TestGenerateProposalLabelsOp):
+
     def init_use_random(self):
         self.use_random = True
         self.is_cascade_rcnn = False
@@ -383,6 +389,7 @@ def init_test_params(self):
 
 
 class TestClsAgnostic(TestCascade):
+
     def init_test_params(self):
         self.batch_size_per_im = 512
         self.fg_fraction = 0.25
@@ -395,6 +402,7 @@ def init_test_params(self):
 
 
 class TestOnlyGT(TestCascade):
+
     def init_test_input(self):
         np.random.seed(0)
         gt_nums = 6  # Keep same with batch_size_per_im for unittest
@@ -417,6 +425,7 @@ def init_test_input(self):
 
 
 class TestOnlyGT2(TestCascade):
+
     def init_test_cascade(self):
         self.is_cascade_rcnn = True
         roi_num = len(self.rpn_rois[0])
@@ -443,14 +452,13 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums):
     num_gts = 0
     for i, image_shape in enumerate(images_shape):
         # Avoid background
-        gt_classes = np.random.randint(
-            low=1, high=class_nums, size=gt_nums).astype(np.int32)
+        gt_classes = np.random.randint(low=1, high=class_nums,
+                                       size=gt_nums).astype(np.int32)
         gt_boxes = _generate_boxes(image_shape, gt_nums)
         is_crowd = np.zeros((gt_nums), dtype=np.int32)
         is_crowd[0] = 1
         ground_truth.append(
-            dict(
-                gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
+            dict(gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
         num_gts += len(gt_classes)
         gts_lod.append(num_gts)
     return ground_truth, [gts_lod]
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index 6b9eeaa0867c1..460f58d87b7d7 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -126,7 +126,7 @@ def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True):
     anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
     anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]
 
-    #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height 
+    #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height
     pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
     if variances is not None:
         for i in range(bbox_deltas.shape[0]):
@@ -142,10 +142,12 @@ def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True):
                     1000 / 16.0))) * anchor_loc[i, 1]
     else:
         for i in range(bbox_deltas.shape[0]):
-            pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
-                i, 2]
-            pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
-                i, 3]
+            pred_bbox[i,
+                      0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[i,
+                                                                             2]
+            pred_bbox[i,
+                      1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[i,
+                                                                             3]
             pred_bbox[i, 2] = math.exp(
                 min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i,
                                                                             0]
@@ -169,17 +171,21 @@ def clip_tiled_boxes(boxes, im_shape, pixel_offset=True):
     )
     offset = 1 if pixel_offset else 0
     # x1 >= 0
-    boxes[:, 0::4] = np.maximum(
-        np.minimum(boxes[:, 0::4], im_shape[1] - offset), 0)
+    boxes[:,
+          0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - offset),
+                             0)
     # y1 >= 0
-    boxes[:, 1::4] = np.maximum(
-        np.minimum(boxes[:, 1::4], im_shape[0] - offset), 0)
+    boxes[:,
+          1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - offset),
+                             0)
     # x2 < im_shape[1]
-    boxes[:, 2::4] = np.maximum(
-        np.minimum(boxes[:, 2::4], im_shape[1] - offset), 0)
+    boxes[:,
+          2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - offset),
+                             0)
     # y2 < im_shape[0]
-    boxes[:, 3::4] = np.maximum(
-        np.minimum(boxes[:, 3::4], im_shape[0] - offset), 0)
+    boxes[:,
+          3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - offset),
+                             0)
     return boxes
 
 
@@ -197,9 +203,9 @@ def filter_boxes(boxes, min_size, im_info, pixel_offset=True):
         hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
         x_ctr = boxes[:, 0] + ws / 2.
         y_ctr = boxes[:, 1] + hs / 2.
-        keep = np.where((ws_orig_scale >= min_size) & (
-            hs_orig_scale >= min_size) & (x_ctr < im_info[1]) & (y_ctr <
-                                                                 im_info[0]))[0]
+        keep = np.where((ws_orig_scale >= min_size)
+                        & (hs_orig_scale >= min_size) & (x_ctr < im_info[1])
+                        & (y_ctr < im_info[0]))[0]
     else:
         keep = np.where((ws >= min_size) & (hs >= min_size))[0]
     return keep
@@ -275,6 +281,7 @@ def nms(boxes, scores, nms_threshold, eta=1.0, pixel_offset=True):
 
 
 class TestGenerateProposalsOp(OpTest):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
@@ -343,6 +350,7 @@ def init_test_output(self):
 
 
 class TestGenerateProposalsOutLodOp(TestGenerateProposalsOp):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
@@ -367,12 +375,12 @@ def set_data(self):
         self.outputs = {
             'RpnRois': (self.rpn_rois[0], [self.rois_num]),
             'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
-            'RpnRoisNum': (np.asarray(
-                self.rois_num, dtype=np.int32))
+            'RpnRoisNum': (np.asarray(self.rois_num, dtype=np.int32))
         }
 
 
 class TestGenerateProposalsOpNoBoxLeft(TestGenerateProposalsOp):
+
     def init_test_params(self):
         self.pre_nms_topN = 12000  # train 12000, test 2000
         self.post_nms_topN = 5000  # train 6000, test 1000
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py
index 0a67004518771..32d7d308e5392 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py
@@ -124,14 +124,15 @@ def filter_boxes(boxes, min_size, im_shape, pixel_offset=True):
     if pixel_offset:
         x_ctr = boxes[:, 0] + ws / 2.
         y_ctr = boxes[:, 1] + hs / 2.
-        keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[
-            1]) & (y_ctr < im_shape[0]))[0]
+        keep = np.where((ws >= min_size) & (hs >= min_size)
+                        & (x_ctr < im_shape[1]) & (y_ctr < im_shape[0]))[0]
     else:
         keep = np.where((ws >= min_size) & (hs >= min_size))[0]
     return keep
 
 
 class TestGenerateProposalsV2Op(OpTest):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
@@ -202,6 +203,7 @@ def init_test_output(self):
 
 
 class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
@@ -226,12 +228,12 @@ def set_data(self):
         self.outputs = {
             'RpnRois': (self.rpn_rois[0], [self.rois_num]),
             'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
-            'RpnRoisNum': (np.asarray(
-                self.rois_num, dtype=np.int32))
+            'RpnRoisNum': (np.asarray(self.rois_num, dtype=np.int32))
         }
 
 
 class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op):
+
     def init_test_params(self):
         self.pre_nms_topN = 12000  # train 12000, test 2000
         self.post_nms_topN = 5000  # train 6000, test 1000
@@ -242,6 +244,7 @@ def init_test_params(self):
 
 
 class TestGenerateProposalsV2OpNoOffset(TestGenerateProposalsV2Op):
+
     def init_test_params(self):
         self.pre_nms_topN = 12000  # train 12000, test 2000
         self.post_nms_topN = 5000  # train 6000, test 1000
diff --git a/python/paddle/fluid/tests/unittests/test_generator.py b/python/paddle/fluid/tests/unittests/test_generator.py
index ef9a305053e12..7335718f0f563 100644
--- a/python/paddle/fluid/tests/unittests/test_generator.py
+++ b/python/paddle/fluid/tests/unittests/test_generator.py
@@ -36,8 +36,9 @@ def test_basic_generator(self):
 
     def test_basic_generator_error(self):
         if paddle.fluid.core.is_compiled_with_cuda():
-            self.assertRaises(
-                ValueError, generator.Generator, place=paddle.CUDAPlace(0))
+            self.assertRaises(ValueError,
+                              generator.Generator,
+                              place=paddle.CUDAPlace(0))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
index c36550fca8cab..674c0b4d12fe4 100644
--- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py
@@ -42,8 +42,9 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
 
     with fluid.unique_name.guard():
         with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=[784], dtype='float32')
+            image = fluid.layers.data(name='image',
+                                      shape=[784],
+                                      dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             py_reader = fluid.io.DataLoader.from_generator(
                 feed_list=[image, label],
@@ -63,8 +64,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
                                             size=CLASS_NUM,
                                             act='softmax')
             loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
+                fluid.layers.cross_entropy(input=predict_label, label=label))
 
             optimizer = fluid.optimizer.Adam()
             optimizer.minimize(loss)
@@ -72,6 +72,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
 
 
 class TestBase(unittest.TestCase):
+
     def run_main(self, use_legacy_py_reader, with_data_parallel, places,
                  use_double_buffer):
         scope = fluid.Scope()
@@ -91,8 +92,8 @@ def run_main(self, use_legacy_py_reader, with_data_parallel, places,
 
             prog = fluid.CompiledProgram(main_prog)
             if with_data_parallel:
-                prog = prog.with_data_parallel(
-                    loss_name=loss.name, places=places)
+                prog = prog.with_data_parallel(loss_name=loss.name,
+                                               places=places)
 
             step = 0
             step_list = []
@@ -176,6 +177,7 @@ def test_main(self):
 
 
 class TestDataLoaderBaseAbstract(unittest.TestCase):
+
     def test_main(self):
         loader = DataLoaderBase()
         try:
diff --git a/python/paddle/fluid/tests/unittests/test_get_device_properties.py b/python/paddle/fluid/tests/unittests/test_get_device_properties.py
index 4cfb91bfae93e..750a257b0d960 100644
--- a/python/paddle/fluid/tests/unittests/test_get_device_properties.py
+++ b/python/paddle/fluid/tests/unittests/test_get_device_properties.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class TestGetDeviceProperties(unittest.TestCase):
+
     def test_get_device_properties_default(self):
         if core.is_compiled_with_cuda():
             props = get_device_properties()
@@ -44,6 +45,7 @@ def test_get_device_properties_CUDAPlace(self):
 
 
 class TestGetDevicePropertiesError(unittest.TestCase):
+
     def test_error_api(self):
         if core.is_compiled_with_cuda():
 
diff --git a/python/paddle/fluid/tests/unittests/test_get_inputs_outputs_in_block.py b/python/paddle/fluid/tests/unittests/test_get_inputs_outputs_in_block.py
index 9e82057959408..1896f0a4bf993 100644
--- a/python/paddle/fluid/tests/unittests/test_get_inputs_outputs_in_block.py
+++ b/python/paddle/fluid/tests/unittests/test_get_inputs_outputs_in_block.py
@@ -23,6 +23,7 @@
 
 
 class TestGetInputsOutputsInBlock(unittest.TestCase):
+
     def test_ordered(self):
         # Program variable names may be different when test order is different
         # This helper makes the test ordered.
@@ -68,7 +69,7 @@ def _test_cond(self):
         sub_block = main_program.block(1)
         inner_inputs, inner_outputs = utils.get_inputs_outputs_in_block(
             sub_block)
-        #'fill_constant_1.tmp_0', 'tmp_3' are names of a, c 
+        #'fill_constant_1.tmp_0', 'tmp_3' are names of a, c
         self.assertTrue(inner_inputs == {'fill_constant_1.tmp_0', 'tmp_3'})
         #'_generated_var_1', is name of a + c
         self.assertTrue(inner_outputs == {'_generated_var_1'})
diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py
index a6deeab457c09..1e0c99bac084b 100644
--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
@@ -22,6 +22,7 @@
 
 
 class TestGetPlaces(unittest.TestCase):
+
     @prog_scope()
     def check_get_cpu_places(self):
         places = get_places()
diff --git a/python/paddle/fluid/tests/unittests/test_get_set_flags.py b/python/paddle/fluid/tests/unittests/test_get_set_flags.py
index e2761ff4358e3..80300eb7dfcb4 100644
--- a/python/paddle/fluid/tests/unittests/test_get_set_flags.py
+++ b/python/paddle/fluid/tests/unittests/test_get_set_flags.py
@@ -17,6 +17,7 @@
 
 
 class TestGetAndSetFlags(unittest.TestCase):
+
     def test_api(self):
         flags = {
             'FLAGS_eager_delete_tensor_gb': 1.0,
@@ -37,6 +38,7 @@ def test_api(self):
 
 
 class TestGetAndSetFlagsErrors(unittest.TestCase):
+
     def test_errors(self):
         flags_list = ['FLAGS_eager_delete_tensor_gb', 'FLAGS_check_nan_inf']
         flag = 1
diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
index 2f6c87aefaa9a..2540fa78d62d8 100644
--- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
@@ -42,6 +42,7 @@ def test_SELECTED_ROWS():
 
 
 class TestGetTensorFromSelectedRows(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py b/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py
index 3394a08de8b19..90fe9988ac2d8 100644
--- a/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py
+++ b/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py
@@ -17,6 +17,7 @@
 
 
 class VarInfo(object):
+
     def __init__(self, var_name, var_type, writable):
         self.name = var_name
         self.type = var_type
@@ -24,6 +25,7 @@ def __init__(self, var_name, var_type, writable):
 
 
 class TestGlobalVarGetterSetter(unittest.TestCase):
+
     def test_main(self):
         var_infos = [
             VarInfo("FLAGS_free_idle_chunk", bool, False),
diff --git a/python/paddle/fluid/tests/unittests/test_glu.py b/python/paddle/fluid/tests/unittests/test_glu.py
index 25f1975db0c52..c8f0098456cbd 100644
--- a/python/paddle/fluid/tests/unittests/test_glu.py
+++ b/python/paddle/fluid/tests/unittests/test_glu.py
@@ -32,6 +32,7 @@ def glu(x, dim=-1):
 
 
 class TestGLUCase(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randn(5, 20)
         self.dim = -1
@@ -52,6 +53,7 @@ def test_case(self):
 
 
 class TestGLUV2(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.randn(5, 20)
         self.dim = -1
diff --git a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py
index e528e742a277a..d4dc21e7646d6 100644
--- a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py
+++ b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py
@@ -24,6 +24,7 @@
 
 
 class TestGPUPackagePaddle(unittest.TestCase):
+
     def test_import_paddle(self):
         if core.is_compiled_with_cuda():
             if core.is_compiled_with_rocm():
@@ -43,11 +44,10 @@ def test_import_paddle(self):
             _python = sys.executable
 
             ps_cmd = '{} {}'.format(_python, test_file)
-            ps_proc = subprocess.Popen(
-                ps_cmd.strip().split(" "),
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                env=os.environ)
+            ps_proc = subprocess.Popen(ps_cmd.strip().split(" "),
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       env=os.environ)
             stdout, stderr = ps_proc.communicate()
 
             assert 'CPU device will be used by default' in str(
diff --git a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
index 39a5b9391e0b5..15009ea8c58aa 100644
--- a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
+++ b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
@@ -29,6 +29,7 @@
 
 
 class TestGradClipByGlobalNorm(unittest.TestCase):
+
     def init_value(self):
         self.max_global_norm = 5.0
         self.init_scale = 1.0
@@ -103,6 +104,7 @@ def test_clip_by_global_norm_2(self):
 
 
 class TestGradClipByNorm(unittest.TestCase):
+
     def init_value(self):
         self.max_norm = 5.0
         self.init_scale = 1.0
@@ -173,6 +175,7 @@ def test_clip_by_norm_2(self):
 
 
 class TestGradClipByValue(unittest.TestCase):
+
     def init_value(self):
         self.max_value = 0.8
         self.min_value = -0.1
@@ -200,8 +203,8 @@ def get_numpy_clip_result(self):
 
     def get_dygrap_clip_result(self):
         with fluid.dygraph.guard():
-            value_clip = GradientClipByValue(
-                max=self.max_value, min=self.min_value)
+            value_clip = GradientClipByValue(max=self.max_value,
+                                             min=self.min_value)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index 20a55af15c441..dfdb3c32dc232 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -38,8 +38,9 @@ def bow_net(data,
     This model is from https://github.com/PaddlePaddle/models:
     fluid/PaddleNLP/text_classification/nets.py
     """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=True, size=[dict_dim, emb_dim])
+    emb = fluid.layers.embedding(input=data,
+                                 is_sparse=True,
+                                 size=[dict_dim, emb_dim])
     bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
     bow_tanh = fluid.layers.tanh(bow)
     fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
@@ -52,6 +53,7 @@ def bow_net(data,
 
 
 class TestGradientClip(unittest.TestCase):
+
     def setUp(self):
         self.word_dict_len = 5147
         self.BATCH_SIZE = 2
@@ -75,8 +77,8 @@ def check_clip_result(self, out, out_clip):
     def check_gradient_clip(self, place, dtype='float32'):
         prog = fluid.Program()
         startup_program = fluid.Program()
-        with fluid.program_guard(
-                main_program=prog, startup_program=startup_program):
+        with fluid.program_guard(main_program=prog,
+                                 startup_program=startup_program):
             image = fluid.data(name="a", shape=[-1, 784], dtype='float32')
             label = fluid.data(name="b", shape=[-1, 1], dtype='int64')
             if dtype != 'float32':
@@ -97,8 +99,8 @@ def check_gradient_clip(self, place, dtype='float32'):
 
         p_g = sorted(p_g, key=lambda x: x[0].name)
         p_g_clip = sorted(p_g_clip, key=lambda x: x[0].name)
-        with fluid.program_guard(
-                main_program=prog_clip, startup_program=startup_program):
+        with fluid.program_guard(main_program=prog_clip,
+                                 startup_program=startup_program):
             p_g_clip = self.clip_gradient(p_g_clip)
 
         grad_list = [elem[1] for elem in p_g]
@@ -119,10 +121,12 @@ def check_gradient_clip(self, place, dtype='float32'):
     def check_sparse_gradient_clip(self, place):
         prog = fluid.Program()
         startup_program = fluid.Program()
-        with fluid.program_guard(
-                main_program=prog, startup_program=startup_program):
-            data = fluid.data(
-                name="words", shape=[-1, 1], dtype="int64", lod_level=1)
+        with fluid.program_guard(main_program=prog,
+                                 startup_program=startup_program):
+            data = fluid.data(name="words",
+                              shape=[-1, 1],
+                              dtype="int64",
+                              lod_level=1)
             label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
             cost = bow_net(data, label, self.word_dict_len)
 
@@ -142,6 +146,7 @@ def backward_and_optimize(self, cost):
 
 
 class TestGradientClipByGlobalNorm(TestGradientClip):
+
     def init(self):
         self.clip_norm = 0.2
 
@@ -157,13 +162,13 @@ def check_clip_result(self, out, out_clip):
 
         for u, v in zip(out, out_clip):
             self.assertTrue(
-                np.allclose(
-                    a=u, b=v, rtol=1e-5, atol=1e-8),
-                "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}".
-                format(u, v, u - v))
+                np.allclose(a=u, b=v, rtol=1e-5, atol=1e-8),
+                "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}"
+                .format(u, v, u - v))
 
     # test whether the output is right when use 'set_gradient_clip'
     def test_old_gradient_clip(self):
+
         def func(params_grads):
             clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
             fluid.clip.set_gradient_clip(clip)
@@ -174,6 +179,7 @@ def func(params_grads):
 
     # test whether the output is right when use grad_clip
     def test_new_gradient_clip(self):
+
         def func(params_grads):
             clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
             return clip(params_grads)
@@ -183,6 +189,7 @@ def func(params_grads):
 
     # test whether the output is right when use grad_clip under float64
     def test_new_gradient_clip_fp64(self):
+
         def func(params_grads):
             clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
             return clip(params_grads)
@@ -192,6 +199,7 @@ def func(params_grads):
 
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
+
         def backward_func(cost):
             clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
             fluid.clip.set_gradient_clip(clip)
@@ -233,8 +241,8 @@ def test_none_grad_fp16(self):
     def _test_none_grad_helper(self, dtype):
         prog = fluid.Program()
         startup_program = fluid.Program()
-        with fluid.program_guard(
-                main_program=prog, startup_program=startup_program):
+        with fluid.program_guard(main_program=prog,
+                                 startup_program=startup_program):
             clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
             x = fluid.default_main_program().global_block().create_parameter(
                 name="x", shape=[2, 3], dtype=dtype)
@@ -254,6 +262,7 @@ def _test_none_grad_helper(self, dtype):
 
 
 class TestGradientClipByNorm(TestGradientClip):
+
     def init(self):
         self.clip_norm = 0.2
 
@@ -262,13 +271,12 @@ def check_clip_result(self, out, out_clip):
             norm = np.sqrt(np.sum(np.power(u, 2)))
             scale = self.clip_norm / np.maximum(self.clip_norm, norm)
             u = u * scale
-            self.assertTrue(
-                np.allclose(
-                    a=u, b=v, rtol=1e-5, atol=1e-8),
-                "gradient clip by norm has wrong results!")
+            self.assertTrue(np.allclose(a=u, b=v, rtol=1e-5, atol=1e-8),
+                            "gradient clip by norm has wrong results!")
 
     # test whether the output is right when use grad_clip
     def test_gradient_clip(self):
+
         def func(params_grads):
             clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
             return clip(params_grads)
@@ -297,6 +305,7 @@ def test_none_grad(self):
 
 
 class TestGradientClipByValue(TestGradientClip):
+
     def init(self):
         self.max = 0.2
         self.min = 0.1
@@ -306,13 +315,12 @@ def check_clip_result(self, out, out_clip):
             out[i] = np.clip(v, self.min, self.max)
         for u, v in zip(out, out_clip):
             u = np.clip(u, self.min, self.max)
-            self.assertTrue(
-                np.allclose(
-                    a=u, b=v, rtol=1e-6, atol=1e-8),
-                "gradient clip by value has wrong results!")
+            self.assertTrue(np.allclose(a=u, b=v, rtol=1e-6, atol=1e-8),
+                            "gradient clip by value has wrong results!")
 
     # test whether the output is right when use grad_clip
     def test_gradient_clip(self):
+
         def func(params_grads):
             clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
             return clip(params_grads)
@@ -341,11 +349,12 @@ def test_none_grad(self):
 
 
 class TestDygraphGradientClip(unittest.TestCase):
+
     def test_gradient_clip(self):
         with fluid.dygraph.guard():
             linear = fluid.dygraph.Linear(5, 5)
-            inputs = fluid.layers.uniform_random(
-                [16, 5], min=-10, max=10).astype('float32')
+            inputs = fluid.layers.uniform_random([16, 5], min=-10,
+                                                 max=10).astype('float32')
             out = linear(fluid.dygraph.to_variable(inputs))
             loss = fluid.layers.reduce_mean(out)
             loss.backward()
@@ -360,6 +369,7 @@ def check_clip_result(self, loss, optimizer):
 
 
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
+
     def setUp(self):
         self.clip_norm = 0.8
         self.clip1 = fluid.clip.GradientClipByGlobalNorm(
@@ -369,10 +379,10 @@ def setUp(self):
 
     def check_clip_result(self, loss, optimizer):
         # if grad is None
-        x = fluid.dygraph.to_variable(
-            np.array([2, 3]).astype("float32"), name="x")
-        y = fluid.dygraph.to_variable(
-            np.array([3, 4]).astype("float32"), name="y")
+        x = fluid.dygraph.to_variable(np.array([2, 3]).astype("float32"),
+                                      name="x")
+        y = fluid.dygraph.to_variable(np.array([3, 4]).astype("float32"),
+                                      name="y")
         assert len(self.clip1([(x, x), (x, y), (x, None)])) == 2
         # get params and grads from network
         opt, params_grads = optimizer.minimize(loss)
@@ -395,13 +405,13 @@ def check_clip_result(self, loss, optimizer):
         a = np.minimum(global_norm, self.clip_norm)
         b = global_norm_clip
         self.assertTrue(
-            np.isclose(
-                a=a, b=b, rtol=1e-6, atol=1e-8),
+            np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
             "gradient clip by global norm has wrong results, expetcd:%f, but received:%f"
             % (a, b))
 
 
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
+
     def setUp(self):
         self.clip_norm = 0.8
         self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
@@ -424,13 +434,13 @@ def check_clip_result(self, loss, optimizer):
             a = np.minimum(a, self.clip_norm)
             b = np.sqrt(np.sum(np.power(v, 2)))
             self.assertTrue(
-                np.isclose(
-                    a=a, b=b, rtol=1e-6, atol=1e-8),
+                np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
                 "gradient clip by norm has wrong results, expetcd:%f, but received:%f"
                 % (a, b))
 
 
 class TestDygraphGradientClipByValue(TestDygraphGradientClip):
+
     def setUp(self):
         self.max = 0.2
         self.min = 0.1
@@ -448,13 +458,12 @@ def check_clip_result(self, loss, optimizer):
         for u, v in zip(grads, grads_clip):
             u = np.clip(u.numpy(), self.min, self.max)
             v = v.numpy()
-            self.assertTrue(
-                np.allclose(
-                    a=u, b=v, rtol=1e-6, atol=1e-8),
-                "gradient clip by value has wrong results!")
+            self.assertTrue(np.allclose(a=u, b=v, rtol=1e-6, atol=1e-8),
+                            "gradient clip by value has wrong results!")
 
 
 class SimpleNet(paddle.nn.Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.linear = paddle.nn.Linear(5, 5)
@@ -467,6 +476,7 @@ def forward(self, x):
 
 
 class TestDygraphGradientClipFP16(unittest.TestCase):
+
     def test_gradient_clip(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.dygraph.guard():
@@ -477,8 +487,8 @@ def test_gradient_clip(self):
                 model, sgd_optimizer = paddle.amp.decorate(
                     models=model, optimizers=sgd_optimizer, level='O2')
                 scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-                inputs = fluid.layers.uniform_random(
-                    [1, 5], min=-10, max=10).astype('float32')
+                inputs = fluid.layers.uniform_random([1, 5], min=-10,
+                                                     max=10).astype('float32')
                 with paddle.amp.auto_cast(level='O2'):
                     out = model(fluid.dygraph.to_variable(inputs))
                     loss = fluid.layers.reduce_mean(out)
@@ -497,7 +507,7 @@ def test_gradient_clip(self):
                 clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8)
                 params_grads = clip(params_grads)
                 _, grads_clip = zip(*params_grads)
-                # param update                      
+                # param update
                 scaler.step(sgd_optimizer)
                 scaler.update()
 
@@ -515,17 +525,17 @@ def test_gradient_clip(self):
                 a = np.minimum(global_norm, 0.8)
                 b = global_norm_clip
                 self.assertTrue(
-                    np.isclose(
-                        a=a, b=b, rtol=1e-3, atol=1e-8),
+                    np.isclose(a=a, b=b, rtol=1e-3, atol=1e-8),
                     "gradient clip by global norm has wrong results, expetcd:%f, but received:%f"
                     % (a, b))
 
 
 class TestDygraphGradientClipFP64(unittest.TestCase):
+
     def test_gradient_clip(self):
         with fluid.dygraph.guard():
-            inputs = fluid.layers.uniform_random(
-                [16, 5], min=-10, max=10).astype('float64')
+            inputs = fluid.layers.uniform_random([16, 5], min=-10,
+                                                 max=10).astype('float64')
             linear = fluid.dygraph.Linear(5, 5, dtype="float64")
             out = linear(fluid.dygraph.to_variable(inputs))
             loss = fluid.layers.reduce_mean(out)
@@ -561,13 +571,13 @@ def test_gradient_clip(self):
             b = global_norm_clip
 
             self.assertTrue(
-                np.isclose(
-                    a=a, b=b, rtol=1e-6, atol=1e-8),
+                np.isclose(a=a, b=b, rtol=1e-6, atol=1e-8),
                 "gradient clip by global norm has wrong results, expetcd:%f, but received:%f"
                 % (a, b))
 
 
 class TestPureFP16ClipGradByGlobalNorm(unittest.TestCase):
+
     def check_main(self, expected_has_cast_op):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -578,10 +588,12 @@ def check_main(self, expected_has_cast_op):
             param_and_grads = []
             main_block = main_prog.global_block()
             for name, shape in zip(names, shapes):
-                p = main_block.create_parameter(
-                    name=name, shape=shape, dtype='float16')
-                g = main_block.create_parameter(
-                    name=p.name + '@GRAD', shape=p.shape, dtype=p.dtype)
+                p = main_block.create_parameter(name=name,
+                                                shape=shape,
+                                                dtype='float16')
+                g = main_block.create_parameter(name=p.name + '@GRAD',
+                                                shape=p.shape,
+                                                dtype=p.dtype)
                 param_and_grads.append((p, g))
 
             clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
diff --git a/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py b/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py
index 6e6175d669515..57b8209d8d69b 100644
--- a/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py
+++ b/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py
@@ -19,6 +19,7 @@
 
 
 class TestGraphKhopSampler(unittest.TestCase):
+
     def setUp(self):
         num_nodes = 20
         edges = np.random.randint(num_nodes, size=(100, 2))
@@ -41,8 +42,8 @@ def setUp(self):
         self.row = sorted_edges[:, 0].astype("int64")
         self.colptr = colptr.astype("int64")
         self.sorted_eid = sorted_eid.astype("int64")
-        self.nodes = np.unique(np.random.randint(
-            num_nodes, size=5)).astype("int64")
+        self.nodes = np.unique(np.random.randint(num_nodes,
+                                                 size=5)).astype("int64")
         self.sample_sizes = [5, 5]
         self.dst_src_dict = dst_src_dict
 
@@ -73,8 +74,8 @@ def func_sample_result(self):
             self.assertTrue(
                 edge_src_n.shape[0] == paddle.unique(edge_src_n).shape[0])
             # Ensure the correct sample size.
-            self.assertTrue(edge_src_n.shape[0] == self.sample_sizes[0] or
-                            edge_src_n.shape[0] == len(self.dst_src_dict[n]))
+            self.assertTrue(edge_src_n.shape[0] == self.sample_sizes[0]
+                            or edge_src_n.shape[0] == len(self.dst_src_dict[n]))
             in_neighbors = np.isin(edge_src_n.numpy(), self.dst_src_dict[n])
             # Ensure the correct sample neighbors.
             self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0])
@@ -121,8 +122,8 @@ def func_uva_sample_result(self):
                 self.assertTrue(
                     edge_src_n.shape[0] == paddle.unique(edge_src_n).shape[0])
                 self.assertTrue(
-                    edge_src_n.shape[0] == self.sample_sizes[0] or
-                    edge_src_n.shape[0] == len(self.dst_src_dict[n]))
+                    edge_src_n.shape[0] == self.sample_sizes[0]
+                    or edge_src_n.shape[0] == len(self.dst_src_dict[n]))
                 in_neighbors = np.isin(edge_src_n.numpy(), self.dst_src_dict[n])
                 self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0])
 
@@ -134,16 +135,18 @@ def test_uva_sample_result(self):
     def test_sample_result_static_with_eids(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
-            row = paddle.static.data(
-                name="row", shape=self.row.shape, dtype=self.row.dtype)
-            sorted_eids = paddle.static.data(
-                name="eids",
-                shape=self.sorted_eid.shape,
-                dtype=self.sorted_eid.dtype)
-            colptr = paddle.static.data(
-                name="colptr", shape=self.colptr.shape, dtype=self.colptr.dtype)
-            nodes = paddle.static.data(
-                name="nodes", shape=self.nodes.shape, dtype=self.nodes.dtype)
+            row = paddle.static.data(name="row",
+                                     shape=self.row.shape,
+                                     dtype=self.row.dtype)
+            sorted_eids = paddle.static.data(name="eids",
+                                             shape=self.sorted_eid.shape,
+                                             dtype=self.sorted_eid.dtype)
+            colptr = paddle.static.data(name="colptr",
+                                        shape=self.colptr.shape,
+                                        dtype=self.colptr.dtype)
+            nodes = paddle.static.data(name="nodes",
+                                       shape=self.nodes.shape,
+                                       dtype=self.nodes.dtype)
 
             edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \
                 paddle.incubate.graph_khop_sampler(row, colptr,
@@ -174,20 +177,23 @@ def test_sample_result_static_with_eids(self):
                 self.assertTrue(
                     edge_src_n.shape[0] == np.unique(edge_src_n).shape[0])
                 self.assertTrue(
-                    edge_src_n.shape[0] == self.sample_sizes[0] or
-                    edge_src_n.shape[0] == len(self.dst_src_dict[n]))
+                    edge_src_n.shape[0] == self.sample_sizes[0]
+                    or edge_src_n.shape[0] == len(self.dst_src_dict[n]))
                 in_neighbors = np.isin(edge_src_n, self.dst_src_dict[n])
                 self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0])
 
     def test_sample_result_static_without_eids(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
-            row = paddle.static.data(
-                name="row", shape=self.row.shape, dtype=self.row.dtype)
-            colptr = paddle.static.data(
-                name="colptr", shape=self.colptr.shape, dtype=self.colptr.dtype)
-            nodes = paddle.static.data(
-                name="nodes", shape=self.nodes.shape, dtype=self.nodes.dtype)
+            row = paddle.static.data(name="row",
+                                     shape=self.row.shape,
+                                     dtype=self.row.dtype)
+            colptr = paddle.static.data(name="colptr",
+                                        shape=self.colptr.shape,
+                                        dtype=self.colptr.dtype)
+            nodes = paddle.static.data(name="nodes",
+                                       shape=self.nodes.shape,
+                                       dtype=self.nodes.dtype)
             edge_src, edge_dst, sample_index, reindex_nodes = \
                 paddle.incubate.graph_khop_sampler(row, colptr,
                                                        nodes, self.sample_sizes)
@@ -214,8 +220,8 @@ def test_sample_result_static_without_eids(self):
                 self.assertTrue(
                     edge_src_n.shape[0] == np.unique(edge_src_n).shape[0])
                 self.assertTrue(
-                    edge_src_n.shape[0] == self.sample_sizes[0] or
-                    edge_src_n.shape[0] == len(self.dst_src_dict[n]))
+                    edge_src_n.shape[0] == self.sample_sizes[0]
+                    or edge_src_n.shape[0] == len(self.dst_src_dict[n]))
                 in_neighbors = np.isin(edge_src_n, self.dst_src_dict[n])
                 self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0])
 
diff --git a/python/paddle/fluid/tests/unittests/test_graph_reindex.py b/python/paddle/fluid/tests/unittests/test_graph_reindex.py
index 52abbbe81aef9..1323aaeb02bb6 100644
--- a/python/paddle/fluid/tests/unittests/test_graph_reindex.py
+++ b/python/paddle/fluid/tests/unittests/test_graph_reindex.py
@@ -19,6 +19,7 @@
 
 
 class TestGraphReindex(unittest.TestCase):
+
     def setUp(self):
         self.x = np.arange(5).astype("int64")
         self.neighbors = np.random.randint(100, size=20).astype("int64")
@@ -62,21 +63,81 @@ def test_reindex_result(self):
         self.assertTrue(np.allclose(self.reindex_dst, reindex_dst))
         self.assertTrue(np.allclose(self.out_nodes, out_nodes))
 
+    def test_heter_reindex_result(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x)
+        neighbors = paddle.to_tensor(self.neighbors)
+        neighbors = paddle.concat([neighbors, neighbors])
+        count = paddle.to_tensor(self.count)
+        count = paddle.concat([count, count])
+
+        reindex_src, reindex_dst, out_nodes = \
+            paddle.incubate.graph_reindex(x, neighbors, count)
+        self.assertTrue(
+            np.allclose(self.reindex_src,
+                        reindex_src[:self.neighbors.shape[0]]))
+        self.assertTrue(
+            np.allclose(self.reindex_src,
+                        reindex_src[self.neighbors.shape[0]:]))
+        self.assertTrue(
+            np.allclose(self.reindex_dst,
+                        reindex_dst[:self.neighbors.shape[0]]))
+        self.assertTrue(
+            np.allclose(self.reindex_dst,
+                        reindex_dst[self.neighbors.shape[0]:]))
+        self.assertTrue(np.allclose(self.out_nodes, out_nodes))
+
+    def test_heter_reindex_result_v2(self):
+        paddle.disable_static()
+        x = np.arange(5).astype("int64")
+        neighbors1 = np.random.randint(100, size=20).astype("int64")
+        count1 = np.array([2, 8, 4, 3, 3], dtype="int32")
+        neighbors2 = np.random.randint(100, size=20).astype("int64")
+        count2 = np.array([4, 5, 1, 6, 4], dtype="int32")
+        neighbors = np.concatenate([neighbors1, neighbors2])
+        counts = np.concatenate([count1, count2])
+
+        # Get numpy result.
+        out_nodes = list(x)
+        for neighbor in neighbors:
+            if neighbor not in out_nodes:
+                out_nodes.append(neighbor)
+        out_nodes = np.array(out_nodes, dtype="int64")
+        reindex_dict = {node: ind for ind, node in enumerate(out_nodes)}
+        reindex_src = np.array([reindex_dict[node] for node in neighbors])
+        reindex_dst = []
+        for count in [count1, count2]:
+            for node, c in zip(x, count):
+                for i in range(c):
+                    reindex_dst.append(reindex_dict[node])
+        reindex_dst = np.array(reindex_dst, dtype="int64")
+
+        reindex_src_, reindex_dst_, out_nodes_ = \
+            paddle.incubate.graph_reindex(paddle.to_tensor(x),
+                                          paddle.to_tensor(neighbors),
+                                          paddle.to_tensor(counts))
+        self.assertTrue(np.allclose(reindex_src, reindex_src_))
+        self.assertTrue(np.allclose(reindex_dst, reindex_dst_))
+        self.assertTrue(np.allclose(out_nodes, out_nodes_))
+
     def test_reindex_result_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=self.x.shape, dtype=self.x.dtype)
-            neighbors = paddle.static.data(
-                name="neighbors",
-                shape=self.neighbors.shape,
-                dtype=self.neighbors.dtype)
-            count = paddle.static.data(
-                name="count", shape=self.count.shape, dtype=self.count.dtype)
-            value_buffer = paddle.static.data(
-                name="value_buffer", shape=[self.num_nodes], dtype="int32")
-            index_buffer = paddle.static.data(
-                name="index_buffer", shape=[self.num_nodes], dtype="int32")
+            x = paddle.static.data(name="x",
+                                   shape=self.x.shape,
+                                   dtype=self.x.dtype)
+            neighbors = paddle.static.data(name="neighbors",
+                                           shape=self.neighbors.shape,
+                                           dtype=self.neighbors.dtype)
+            count = paddle.static.data(name="count",
+                                       shape=self.count.shape,
+                                       dtype=self.count.dtype)
+            value_buffer = paddle.static.data(name="value_buffer",
+                                              shape=[self.num_nodes],
+                                              dtype="int32")
+            index_buffer = paddle.static.data(name="index_buffer",
+                                              shape=[self.num_nodes],
+                                              dtype="int32")
 
             reindex_src_1, reindex_dst_1, out_nodes_1 = \
                 paddle.incubate.graph_reindex(x, neighbors, count)
@@ -87,13 +148,16 @@ def test_reindex_result_static(self):
 
             exe = paddle.static.Executor(paddle.CPUPlace())
             ret = exe.run(feed={
-                'x': self.x,
-                'neighbors': self.neighbors,
-                'count': self.count,
-                'value_buffer': np.full(
-                    [self.num_nodes], -1, dtype="int32"),
-                'index_buffer': np.full(
-                    [self.num_nodes], -1, dtype="int32")
+                'x':
+                self.x,
+                'neighbors':
+                self.neighbors,
+                'count':
+                self.count,
+                'value_buffer':
+                np.full([self.num_nodes], -1, dtype="int32"),
+                'index_buffer':
+                np.full([self.num_nodes], -1, dtype="int32")
             },
                           fetch_list=[
                               reindex_src_1, reindex_dst_1, out_nodes_1,
diff --git a/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py b/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py
index 675a3429ab55f..f84513506b324 100644
--- a/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py
+++ b/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py
@@ -19,6 +19,7 @@
 
 
 class TestGraphSampleNeighbors(unittest.TestCase):
+
     def setUp(self):
         num_nodes = 20
         edges = np.random.randint(num_nodes, size=(100, 2))
@@ -39,8 +40,8 @@ def setUp(self):
 
         self.row = sorted_edges[:, 0].astype("int64")
         self.colptr = colptr.astype("int64")
-        self.nodes = np.unique(np.random.randint(
-            num_nodes, size=5)).astype("int64")
+        self.nodes = np.unique(np.random.randint(num_nodes,
+                                                 size=5)).astype("int64")
         self.sample_size = 5
         self.dst_src_dict = dst_src_dict
 
@@ -57,12 +58,12 @@ def test_sample_result(self):
             if i == 0:
                 neighbors = out_neighbors[0:out_count_cumsum[i]]
             else:
-                neighbors = out_neighbors[out_count_cumsum[i - 1]:
-                                          out_count_cumsum[i]]
+                neighbors = out_neighbors[
+                    out_count_cumsum[i - 1]:out_count_cumsum[i]]
             # Ensure the correct sample size.
             self.assertTrue(
-                out_count[i] == self.sample_size or
-                out_count[i] == len(self.dst_src_dict[self.nodes[i]]))
+                out_count[i] == self.sample_size
+                or out_count[i] == len(self.dst_src_dict[self.nodes[i]]))
             # Ensure no repetitive sample neighbors.
             self.assertTrue(
                 neighbors.shape[0] == paddle.unique(neighbors).shape[0])
@@ -91,12 +92,12 @@ def test_sample_result_fisher_yates_sampling(self):
                 if i == 0:
                     neighbors = out_neighbors[0:out_count_cumsum[i]]
                 else:
-                    neighbors = out_neighbors[out_count_cumsum[i - 1]:
-                                              out_count_cumsum[i]]
+                    neighbors = out_neighbors[
+                        out_count_cumsum[i - 1]:out_count_cumsum[i]]
                 # Ensure the correct sample size.
                 self.assertTrue(
-                    out_count[i] == self.sample_size or
-                    out_count[i] == len(self.dst_src_dict[self.nodes[i]]))
+                    out_count[i] == self.sample_size
+                    or out_count[i] == len(self.dst_src_dict[self.nodes[i]]))
                 # Ensure no repetitive sample neighbors.
                 self.assertTrue(
                     neighbors.shape[0] == paddle.unique(neighbors).shape[0])
@@ -108,12 +109,15 @@ def test_sample_result_fisher_yates_sampling(self):
     def test_sample_result_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
-            row = paddle.static.data(
-                name="row", shape=self.row.shape, dtype=self.row.dtype)
-            colptr = paddle.static.data(
-                name="colptr", shape=self.colptr.shape, dtype=self.colptr.dtype)
-            nodes = paddle.static.data(
-                name="nodes", shape=self.nodes.shape, dtype=self.nodes.dtype)
+            row = paddle.static.data(name="row",
+                                     shape=self.row.shape,
+                                     dtype=self.row.dtype)
+            colptr = paddle.static.data(name="colptr",
+                                        shape=self.colptr.shape,
+                                        dtype=self.colptr.dtype)
+            nodes = paddle.static.data(name="nodes",
+                                       shape=self.nodes.shape,
+                                       dtype=self.nodes.dtype)
 
             out_neighbors, out_count = paddle.incubate.graph_sample_neighbors(
                 row, colptr, nodes, sample_size=self.sample_size)
@@ -129,8 +133,8 @@ def test_sample_result_static(self):
             out_neighbors = np.split(out_neighbors, out_count_cumsum)[:-1]
             for neighbors, node, count in zip(out_neighbors, self.nodes,
                                               out_count):
-                self.assertTrue(count == self.sample_size or
-                                count == len(self.dst_src_dict[node]))
+                self.assertTrue(count == self.sample_size
+                                or count == len(self.dst_src_dict[node]))
                 self.assertTrue(
                     neighbors.shape[0] == np.unique(neighbors).shape[0])
                 in_neighbors = np.isin(neighbors, self.dst_src_dict[node])
@@ -143,20 +147,18 @@ def test_raise_errors(self):
         nodes = paddle.to_tensor(self.nodes)
 
         def check_eid_error():
-            paddle.incubate.graph_sample_neighbors(
-                row,
-                colptr,
-                nodes,
-                sample_size=self.sample_size,
-                return_eids=True)
+            paddle.incubate.graph_sample_neighbors(row,
+                                                   colptr,
+                                                   nodes,
+                                                   sample_size=self.sample_size,
+                                                   return_eids=True)
 
         def check_perm_buffer_error():
-            paddle.incubate.graph_sample_neighbors(
-                row,
-                colptr,
-                nodes,
-                sample_size=self.sample_size,
-                flag_perm_buffer=True)
+            paddle.incubate.graph_sample_neighbors(row,
+                                                   colptr,
+                                                   nodes,
+                                                   sample_size=self.sample_size,
+                                                   flag_perm_buffer=True)
 
         self.assertRaises(ValueError, check_eid_error)
         self.assertRaises(ValueError, check_perm_buffer_error)
@@ -189,14 +191,18 @@ def test_sample_result_with_eids(self):
 
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
-            row = paddle.static.data(
-                name="row", shape=self.row.shape, dtype=self.row.dtype)
-            colptr = paddle.static.data(
-                name="colptr", shape=self.colptr.shape, dtype=self.colptr.dtype)
-            nodes = paddle.static.data(
-                name="nodes", shape=self.nodes.shape, dtype=self.nodes.dtype)
-            eids = paddle.static.data(
-                name="eids", shape=self.edges_id.shape, dtype=self.nodes.dtype)
+            row = paddle.static.data(name="row",
+                                     shape=self.row.shape,
+                                     dtype=self.row.dtype)
+            colptr = paddle.static.data(name="colptr",
+                                        shape=self.colptr.shape,
+                                        dtype=self.colptr.dtype)
+            nodes = paddle.static.data(name="nodes",
+                                       shape=self.nodes.shape,
+                                       dtype=self.nodes.dtype)
+            eids = paddle.static.data(name="eids",
+                                      shape=self.edges_id.shape,
+                                      dtype=self.nodes.dtype)
 
             out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors(
                 row,
diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
index c233606c053d8..c0fdb134f16d6 100644
--- a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
@@ -33,6 +33,7 @@ def graph_send_recv_wrapper(x,
 
 
 class TestGraphSendRecvMaxOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.python_api = graph_send_recv_wrapper
@@ -47,19 +48,22 @@ def setUp(self):
 
         self.attrs = {'pool_type': 'MAX'}
 
-        out, self.gradient = compute_graph_send_recv_for_min_max(self.inputs,
-                                                                 self.attrs)
+        out, self.gradient = compute_graph_send_recv_for_min_max(
+            self.inputs, self.attrs)
         self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', user_defined_grads=[self.gradient], check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.gradient],
+                        check_eager=True)
 
 
 class TestGraphSendRecvMinOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.python_api = graph_send_recv_wrapper
@@ -74,8 +78,8 @@ def setUp(self):
 
         self.attrs = {'pool_type': 'MIN'}
 
-        out, self.gradient = compute_graph_send_recv_for_min_max(self.inputs,
-                                                                 self.attrs)
+        out, self.gradient = compute_graph_send_recv_for_min_max(
+            self.inputs, self.attrs)
 
         self.outputs = {'Out': out}
 
@@ -83,11 +87,14 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', user_defined_grads=[self.gradient], check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.gradient],
+                        check_eager=True)
 
 
 class TestGraphSendRecvSumOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.python_api = graph_send_recv_wrapper
@@ -114,6 +121,7 @@ def test_check_grad(self):
 
 
 class TestGraphSendRecvMeanOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.python_api = graph_send_recv_wrapper
@@ -128,8 +136,8 @@ def setUp(self):
 
         self.attrs = {'pool_type': 'MEAN'}
 
-        out, dst_count = compute_graph_send_recv_for_sum_mean(self.inputs,
-                                                              self.attrs)
+        out, dst_count = compute_graph_send_recv_for_sum_mean(
+            self.inputs, self.attrs)
 
         self.outputs = {'Out': out, 'Dst_count': dst_count}
 
@@ -182,7 +190,7 @@ def compute_graph_send_recv_for_min_max(inputs, attributes):
     results = np.zeros(target_shape, dtype=x.dtype)
     gradient = np.zeros_like(x)
 
-    # Calculate forward output 
+    # Calculate forward output
     if pool_type == "MAX":
         first_set = set()
         for index, s_id in enumerate(dst_index):
@@ -209,13 +217,14 @@ def compute_graph_send_recv_for_min_max(inputs, attributes):
     for i in range(index_size):
         forward_src_idx = src_index[i]
         forward_dst_idx = dst_index[i]
-        gradient[forward_src_idx] += 1 * (
-            x[forward_src_idx] == results[forward_dst_idx])
+        gradient[forward_src_idx] += 1 * (x[forward_src_idx]
+                                          == results[forward_dst_idx])
 
     return results, gradient / results.size
 
 
 class API_GraphSendRecvOpTest(unittest.TestCase):
+
     def test_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
@@ -237,32 +246,32 @@ def test_static(self):
             data2 = np.array([0, 1, 2, 0], dtype="int32")
             data3 = np.array([1, 2, 1, 0], dtype="int32")
 
-            np_sum = np.array(
-                [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32")
-            np_mean = np.array(
-                [[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32")
-            np_max = np.array(
-                [[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32")
-            np_min = np.array(
-                [[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32")
-
-            ret = exe.run(feed={'x': data1,
-                                'src': data2,
-                                'dst': data3},
+            np_sum = np.array([[0, 2, 3], [2, 8, 10], [1, 4, 5]],
+                              dtype="float32")
+            np_mean = np.array([[0, 2, 3], [1, 4, 5], [1, 4, 5]],
+                               dtype="float32")
+            np_max = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]],
+                              dtype="float32")
+            np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]],
+                              dtype="float32")
+
+            ret = exe.run(feed={
+                'x': data1,
+                'src': data2,
+                'dst': data3
+            },
                           fetch_list=[res_sum, res_mean, res_max, res_min])
 
         for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret):
             self.assertTrue(
-                np.allclose(
-                    np_res, ret_res, atol=1e-6),
-                "two value is\
+                np.allclose(np_res, ret_res, atol=1e-6), "two value is\
                 {}\n{}, check diff!".format(np_res, ret_res))
 
     def test_dygraph(self):
         device = paddle.CPUPlace()
         with paddle.fluid.dygraph.guard(device):
-            x = paddle.to_tensor(
-                np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32")
+            x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]),
+                                 dtype="float32")
             src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32")
             dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32")
             res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index,
@@ -274,33 +283,31 @@ def test_dygraph(self):
             res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index,
                                                       "min")
 
-            np_sum = np.array(
-                [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32")
-            np_mean = np.array(
-                [[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32")
-            np_max = np.array(
-                [[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32")
-            np_min = np.array(
-                [[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32")
+            np_sum = np.array([[0, 2, 3], [2, 8, 10], [1, 4, 5]],
+                              dtype="float32")
+            np_mean = np.array([[0, 2, 3], [1, 4, 5], [1, 4, 5]],
+                               dtype="float32")
+            np_max = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]],
+                              dtype="float32")
+            np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]],
+                              dtype="float32")
 
             ret = [res_sum, res_mean, res_max, res_min]
 
         for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret):
             self.assertTrue(
-                np.allclose(
-                    np_res, ret_res, atol=1e-6),
-                "two value is\
+                np.allclose(np_res, ret_res, atol=1e-6), "two value is\
                 {}\n{}, check diff!".format(np_res, ret_res))
 
     def test_int32_input(self):
         device = paddle.CPUPlace()
         with paddle.fluid.dygraph.guard(device):
-            x = paddle.to_tensor(
-                np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="int32")
-            src_index = paddle.to_tensor(
-                np.array([0, 1, 2, 0, 1]), dtype="int32")
-            dst_index = paddle.to_tensor(
-                np.array([1, 2, 1, 0, 1]), dtype="int32")
+            x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]),
+                                 dtype="int32")
+            src_index = paddle.to_tensor(np.array([0, 1, 2, 0, 1]),
+                                         dtype="int32")
+            dst_index = paddle.to_tensor(np.array([1, 2, 1, 0, 1]),
+                                         dtype="int32")
             res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index,
                                                       "sum")
             res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index,
@@ -310,8 +317,8 @@ def test_int32_input(self):
             res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index,
                                                       "min")
 
-            np_sum = np.array(
-                [[0, 2, 3], [3, 12, 14], [1, 4, 5]], dtype="int32")
+            np_sum = np.array([[0, 2, 3], [3, 12, 14], [1, 4, 5]],
+                              dtype="int32")
             np_mean = np.array([[0, 2, 3], [1, 4, 4], [1, 4, 5]], dtype="int32")
             np_max = np.array([[0, 2, 3], [2, 6, 6], [1, 4, 5]], dtype="int32")
             np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="int32")
@@ -320,15 +327,13 @@ def test_int32_input(self):
 
         for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret):
             self.assertTrue(
-                np.allclose(
-                    np_res, ret_res, atol=1e-6),
-                "two value is\
+                np.allclose(np_res, ret_res, atol=1e-6), "two value is\
                 {}\n{}, check diff!".format(np_res, ret_res))
 
     def test_set_outsize_gpu(self):
         if paddle.fluid.core.is_compiled_with_cuda():
-            x = paddle.to_tensor(
-                np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="float32")
+            x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]),
+                                 dtype="float32")
             src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32")
             dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32")
             res = paddle.incubate.graph_send_recv(x, src_index, dst_index,
@@ -337,22 +342,19 @@ def test_set_outsize_gpu(self):
             res_set_outsize = paddle.incubate.graph_send_recv(
                 x, src_index, dst_index, "sum", out_size)
 
-            np_res = np.array(
-                [[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32")
-            np_res_set_outsize = np.array(
-                [[0, 2, 3], [1, 6, 8]], dtype="float32")
+            np_res = np.array([[0, 2, 3], [1, 6, 8], [0, 0, 0]],
+                              dtype="float32")
+            np_res_set_outsize = np.array([[0, 2, 3], [1, 6, 8]],
+                                          dtype="float32")
 
             self.assertTrue(
-                np.allclose(
-                    np_res, res, atol=1e-6),
-                "two value is\
+                np.allclose(np_res, res, atol=1e-6), "two value is\
                 {}\n{}, check diff!".format(np_res, res))
             self.assertTrue(
-                np.allclose(
-                    np_res_set_outsize, res_set_outsize, atol=1e-6),
+                np.allclose(np_res_set_outsize, res_set_outsize, atol=1e-6),
                 "two value is\
-                {}\n{}, check diff!"
-                .format(np_res_set_outsize, res_set_outsize))
+                {}\n{}, check diff!".format(np_res_set_outsize,
+                                            res_set_outsize))
 
     def test_api_eager_dygraph(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
index 9ad0309a70e31..90e80e013ece1 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
@@ -21,6 +21,7 @@
 
 
 class GridSampleTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  x_shape=[2, 2, 3, 3],
@@ -48,12 +49,11 @@ def static_functional(self, place):
             with fluid.program_guard(main, start):
                 x = fluid.data("x", self.x_shape, dtype=self.dtype)
                 grid = fluid.data("grid", self.grid_shape, dtype=self.dtype)
-                y_var = F.grid_sample(
-                    x,
-                    grid,
-                    mode=self.mode,
-                    padding_mode=self.padding_mode,
-                    align_corners=self.align_corners)
+                y_var = F.grid_sample(x,
+                                      grid,
+                                      mode=self.mode,
+                                      padding_mode=self.padding_mode,
+                                      align_corners=self.align_corners)
         feed_dict = {"x": self.x, "grid": self.grid}
         exe = fluid.Executor(place)
         exe.run(start)
@@ -63,12 +63,11 @@ def static_functional(self, place):
     def dynamic_functional(self):
         x_t = paddle.to_tensor(self.x)
         grid_t = paddle.to_tensor(self.grid)
-        y_t = F.grid_sample(
-            x_t,
-            grid_t,
-            mode=self.mode,
-            padding_mode=self.padding_mode,
-            align_corners=self.align_corners)
+        y_t = F.grid_sample(x_t,
+                            grid_t,
+                            mode=self.mode,
+                            padding_mode=self.padding_mode,
+                            align_corners=self.align_corners)
         y_np = y_t.numpy()
         return y_np
 
@@ -88,6 +87,7 @@ def runTest(self):
 
 
 class GridSampleErrorTestCase(GridSampleTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with self.assertRaises(ValueError):
@@ -97,26 +97,22 @@ def runTest(self):
 def add_cases(suite):
     suite.addTest(GridSampleTestCase(methodName='runTest'))
     suite.addTest(
-        GridSampleTestCase(
-            methodName='runTest',
-            mode='bilinear',
-            padding_mode='reflection',
-            align_corners=True))
+        GridSampleTestCase(methodName='runTest',
+                           mode='bilinear',
+                           padding_mode='reflection',
+                           align_corners=True))
     suite.addTest(
-        GridSampleTestCase(
-            methodName='runTest',
-            mode='bilinear',
-            padding_mode='zeros',
-            align_corners=True))
+        GridSampleTestCase(methodName='runTest',
+                           mode='bilinear',
+                           padding_mode='zeros',
+                           align_corners=True))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        GridSampleErrorTestCase(
-            methodName='runTest', padding_mode="VALID"))
+        GridSampleErrorTestCase(methodName='runTest', padding_mode="VALID"))
     suite.addTest(
-        GridSampleErrorTestCase(
-            methodName='runTest', align_corners="VALID"))
+        GridSampleErrorTestCase(methodName='runTest', align_corners="VALID"))
     suite.addTest(GridSampleErrorTestCase(methodName='runTest', mode="VALID"))
 
 
@@ -128,6 +124,7 @@ def load_tests(loader, standard_tests, pattern):
 
 
 class TestGridSampleAPI(unittest.TestCase):
+
     def test_errors(self):
         with self.assertRaises(ValueError):
             x = paddle.randn([1, 1, 3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
index 531aa1dcc3c47..f32387d07a9fd 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -17,6 +17,7 @@
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest, skip_check_grad_ci
+
 paddle.enable_static()
 
 
@@ -24,12 +25,12 @@ def AffineGrid(theta, grid_shape):
     n = grid_shape[0]
     h = grid_shape[1]
     w = grid_shape[2]
-    h_idx = np.repeat(
-        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
-    w_idx = np.repeat(
-        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
-    grid = np.concatenate(
-        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
+    h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w,
+                      axis=0).T[:, :, np.newaxis]
+    w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h,
+                      axis=0)[:, :, np.newaxis]
+    grid = np.concatenate([w_idx, h_idx, np.ones([h, w, 1])],
+                          axis=2)  # h * w * 3
     grid = np.repeat(grid[np.newaxis, :], n, axis=0)  # n * h * w *3
 
     ret = np.zeros([n, h * w, 2])
@@ -71,8 +72,8 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
     if align_corners:
         grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * max_val)
     else:
-        grid_slice = 0.5 * (
-            (grid_slice.astype('float64') + 1.0) * (max_val + 1)) - 0.5
+        grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) *
+                            (max_val + 1)) - 0.5
 
     if padding_mode == "border":
         grid_slice = clip(grid_slice, 0, max_val)
@@ -82,8 +83,8 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
                                                                    0.5)
         extra = grid_abs - np.floor(grid_abs / double_range) * double_range
         grid_slice = np.minimum(extra, double_range - extra)
-        grid_slice = grid_slice if align_corners else clip(grid_slice - 0.5, 0,
-                                                           max_val)
+        grid_slice = grid_slice if align_corners else clip(
+            grid_slice - 0.5, 0, max_val)
     return grid_slice
 
 
@@ -138,6 +139,7 @@ def GridSampler(data,
 
 
 class TestGridSamplerOp(OpTest):
+
     def setUp(self):
         self.use_cudnn = False
         self.numeric_grad_delta = 0.0001
@@ -163,19 +165,19 @@ def setUp(self):
             "mode": self.mode
         }
         self.outputs = {
-            'Output': GridSampler(x, grid, self.align_corners, self.mode,
-                                  self.padding_mode)
+            'Output':
+            GridSampler(x, grid, self.align_corners, self.mode,
+                        self.padding_mode)
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Grid'],
-            'Output',
-            max_relative_error=0.01,
-            numeric_grad_delta=self.numeric_grad_delta)
+        self.check_grad(['X', 'Grid'],
+                        'Output',
+                        max_relative_error=0.01,
+                        numeric_grad_delta=self.numeric_grad_delta)
 
     def initTestCase(self):
         self.x_shape = (2, 3, 8, 8)
@@ -188,6 +190,7 @@ def initTestCase(self):
 
 
 class Case1(TestGridSamplerOp):
+
     def initTestCase(self):
         self.x_shape = (2, 3, 5, 6)
         self.grid_shape = (2, 8, 9, 2)
@@ -198,6 +201,7 @@ def initTestCase(self):
 
 
 class Case1_(TestGridSamplerOp):
+
     def initTestCase(self):
         self.x_shape = (2, 3, 5, 6)
         self.grid_shape = (2, 8, 9, 2)
@@ -208,6 +212,7 @@ def initTestCase(self):
 
 
 class Case2(TestGridSamplerOp):
+
     def initTestCase(self):
         self.x_shape = (2, 3, 5, 6)
         self.grid_shape = (2, 8, 9, 2)
@@ -218,6 +223,7 @@ def initTestCase(self):
 
 
 class Case3(TestGridSamplerOp):
+
     def initTestCase(self):
         self.x_shape = (2, 3, 5, 6)
         self.grid_shape = (2, 8, 9, 2)
@@ -228,6 +234,7 @@ def initTestCase(self):
 
 
 class Case4(TestGridSamplerOp):
+
     def initTestCase(self):
         self.x_shape = (2, 3, 5, 6)
         self.grid_shape = (2, 8, 9, 2)
@@ -241,6 +248,7 @@ def initTestCase(self):
 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                     "however it is desirable to cover the forward pass")
 class LargeInputCase(TestGridSamplerOp):
+
     def get_places(self):
         places = []
         if core.is_compiled_with_cuda():
@@ -263,6 +271,7 @@ def test_check_grad_normal(self):
 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                     "however it is desirable to cover the forward pass")
 class Case5(LargeInputCase):
+
     def initTestCase(self):
         self.no_need_check_grad = True
         self.x_shape = (2, 3, 128, 128)
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
index 61a51d9b5dd86..94793ad85cf30 100644
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
@@ -41,6 +41,7 @@ def group_norm_naive(x, scale, bias, epsilon, groups, data_layout):
 
 
 class TestGroupNormOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
 
@@ -52,8 +53,9 @@ def test_x_type():
             self.assertRaises(TypeError, test_x_type)
 
             def test_x_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[2, 100, 3, 5], dtype='int32')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[2, 100, 3, 5],
+                                       dtype='int32')
                 groups = 2
                 fluid.layers.group_norm(x2, groups)
 
@@ -61,6 +63,7 @@ def test_x_dtype():
 
 
 class TestGroupNormOp(OpTest):
+
     def setUp(self):
         self.op_type = "group_norm"
         self.data_format = "NCHW"
@@ -75,9 +78,10 @@ def setUp(self):
             input = np.transpose(input, (0, 2, 3, 1))
         scale = np.random.random([self.shape[1]]).astype(self.dtype)
         bias = np.random.random([self.shape[1]]).astype(self.dtype)
-        output, mean, var = group_norm_naive(
-            input, scale, bias, self.attrs['epsilon'], self.attrs['groups'],
-            self.data_format)
+        output, mean, var = group_norm_naive(input, scale, bias,
+                                             self.attrs['epsilon'],
+                                             self.attrs['groups'],
+                                             self.data_format)
 
         self.inputs = {
             'X': OpTest.np_dtype_to_fluid_dtype(input),
@@ -97,15 +101,16 @@ def test_check_output(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
             # group_norm uses AtomicAdd on CUDAPlace, which do not ensure
-            # computation order when multiple threads write the same address. So the 
+            # computation order when multiple threads write the same address. So the
             # result of group_norm is non-deterministic when datatype is float.
             # When inplace_atol is not None, the inplace check uses numpy.allclose
             # to check inplace result instead of numpy.array_equal.
             # Set to inplace_atol to 0, which means the absolute error is 0, and the
             # relative error is 1e-05 in numpy.allclose by default.
             # Reference: https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html
-            self.check_output_with_place(
-                place, atol=atol, inplace_atol=inplace_atol)
+            self.check_output_with_place(place,
+                                         atol=atol,
+                                         inplace_atol=inplace_atol)
 
     def do_compare_between_place(self):
         if not core.is_compiled_with_cuda(): return
@@ -138,44 +143,52 @@ def test_check_grad(self):
             self.check_grad_with_place(
                 place,
                 set(['X', 'Scale', 'Bias']),
-                'Y', )
+                'Y',
+            )
 
     def init_test_case(self):
         pass
 
 
 class TestGroupNormOp1(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
 
 
 class TestGroupNormOp2(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
 
 
 class TestGroupNormOpBigEps1(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
         self.attrs['epsilon'] = 0.5
 
 
 class TestGroupNormOpBigEps2(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
         self.attrs['epsilon'] = 0.5
 
 
 class TestGroupNormOpBigEps3(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['epsilon'] = 0.5
 
 
 @skip_check_grad_ci(
-    reason='''This test case is used to ensure whether the gradient checking results between CPU and GPU  
+    reason=
+    '''This test case is used to ensure whether the gradient checking results between CPU and GPU  
             are consistent when using the same inputs, thus, it doesn't need to call check_grad.'''
 )
 class TestGroupNormOpLargeData(TestGroupNormOp):
+
     def init_test_case(self):
         self.shape = (2, 32, 64, 64)
         self.attrs['groups'] = 8
@@ -183,18 +196,21 @@ def init_test_case(self):
 
 
 class TestGroupNormOp1_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
         self.data_format = "NHWC"
 
 
 class TestGroupNormOp2_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
         self.data_format = "NHWC"
 
 
 class TestGroupNormOpBigEps1_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 1
         self.attrs['epsilon'] = 0.5
@@ -202,6 +218,7 @@ def init_test_case(self):
 
 
 class TestGroupNormOpBigEps2_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['groups'] = 4
         self.attrs['epsilon'] = 0.5
@@ -209,16 +226,19 @@ def init_test_case(self):
 
 
 class TestGroupNormOpBigEps3_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.attrs['epsilon'] = 0.5
         self.data_format = "NHWC"
 
 
 @skip_check_grad_ci(
-    reason='''This test case is used to ensure whether the gradient checking results between CPU and GPU  
+    reason=
+    '''This test case is used to ensure whether the gradient checking results between CPU and GPU  
             are consistent when using the same inputs, thus, it doesn't need to call check_grad.'''
 )
 class TestGroupNormOpLargeData_With_NHWC(TestGroupNormOp):
+
     def init_test_case(self):
         self.shape = (2, 64, 32, 32)  # NCHW
         self.attrs['groups'] = 8
@@ -227,13 +247,16 @@ def init_test_case(self):
 
 
 class TestGroupNormAPI_With_NHWC(unittest.TestCase):
+
     def test_case1(self):
         data1 = fluid.data(name='data1', shape=[None, 3, 3, 4], dtype='float64')
-        out1 = fluid.layers.group_norm(
-            input=data1, groups=2, data_layout="NHWC")
+        out1 = fluid.layers.group_norm(input=data1,
+                                       groups=2,
+                                       data_layout="NHWC")
         data2 = fluid.data(name='data2', shape=[None, 4, 3, 3], dtype='float64')
-        out2 = fluid.layers.group_norm(
-            input=data2, groups=2, data_layout="NCHW")
+        out2 = fluid.layers.group_norm(input=data2,
+                                       groups=2,
+                                       data_layout="NCHW")
 
         data1_np = np.random.random((2, 3, 3, 4)).astype("float64")
         data2_np = np.random.random((2, 4, 3, 3)).astype("float64")
@@ -243,14 +266,24 @@ def test_case1(self):
         place = core.CPUPlace()
         exe = fluid.Executor(place)
         results = exe.run(fluid.default_main_program(),
-                          feed={"data1": data1_np,
-                                "data2": data2_np},
+                          feed={
+                              "data1": data1_np,
+                              "data2": data2_np
+                          },
                           fetch_list=[out1, out2],
                           return_numpy=True)
-        expect_res1 = group_norm_naive(
-            data1_np, scale, bias, epsilon=1e-5, groups=2, data_layout="NHWC")
-        expect_res2 = group_norm_naive(
-            data2_np, scale, bias, epsilon=1e-5, groups=2, data_layout="NCHW")
+        expect_res1 = group_norm_naive(data1_np,
+                                       scale,
+                                       bias,
+                                       epsilon=1e-5,
+                                       groups=2,
+                                       data_layout="NHWC")
+        expect_res2 = group_norm_naive(data2_np,
+                                       scale,
+                                       bias,
+                                       epsilon=1e-5,
+                                       groups=2,
+                                       data_layout="NCHW")
         self.assertTrue(np.allclose(results[0], expect_res1[0]))
         self.assertTrue(np.allclose(results[1], expect_res2[0]))
 
@@ -261,8 +294,9 @@ def test_exception(self):
         data = fluid.data(name='data', shape=[None, 3, 3, 4], dtype="float64")
 
         def attr_data_format():
-            out = fluid.layers.group_norm(
-                input=data, groups=2, data_layout="NDHW")
+            out = fluid.layers.group_norm(input=data,
+                                          groups=2,
+                                          data_layout="NDHW")
 
         self.assertRaises(ValueError, attr_data_format)
 
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
index 2f3adbe861a1f..c6bc44ebd2f24 100644
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
@@ -41,6 +41,7 @@ def group_norm_naive_for_general_dimension(x, scale, bias, epsilon, groups):
 
 
 class TestDygraphGroupNormv2(unittest.TestCase):
+
     def test_dygraph(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"):
@@ -63,18 +64,18 @@ def compute_v2(x):
 
             def test_weight_bias_false():
                 with fluid.dygraph.guard(p):
-                    gn = paddle.nn.GroupNorm(
-                        num_channels=2,
-                        num_groups=2,
-                        weight_attr=False,
-                        bias_attr=False)
+                    gn = paddle.nn.GroupNorm(num_channels=2,
+                                             num_groups=2,
+                                             weight_attr=False,
+                                             bias_attr=False)
 
             def test_nn_exception():
                 with fluid.dygraph.guard(p):
 
                     def attr_data_format():
-                        out = paddle.nn.GroupNorm(
-                            num_groups=2, num_channels=2, data_format="NHWC")
+                        out = paddle.nn.GroupNorm(num_groups=2,
+                                                  num_channels=2,
+                                                  data_format="NHWC")
 
                     self.assertRaises(ValueError, attr_data_format)
 
@@ -125,10 +126,11 @@ def compute_v2(x_np):
 
 
 class TestGroupNormAPIV2_With_General_Dimensions(unittest.TestCase):
+
     def test_numerical_accuracy(self):
         paddle.disable_static()
-        shapes = [(2, 6), (2, 6, 4), (2, 6, 4, 4), (2, 6, 6, 6, 2), (2, 6, 6, 6,
-                                                                     2, 3)]
+        shapes = [(2, 6), (2, 6, 4), (2, 6, 4, 4), (2, 6, 6, 6, 2),
+                  (2, 6, 6, 6, 2, 3)]
         np.random.seed(10)
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"):
@@ -154,7 +156,9 @@ def test_numerical_accuracy(self):
 
 
 class TestGroupNormDimException(unittest.TestCase):
+
     def test_exception(self):
+
         def test_empty_input_static_API():
             x = paddle.to_tensor([], dtype='float32')
             paddle.static.nn.group_norm(x, 3)
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index 7740cc0b03b49..1006a43b2e96a 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -35,6 +35,7 @@ def gru(
         act_gate,
         dtype='float32',
         origin_mode=False):
+
     def _seq_to_batch(lod, is_reverse):
         idx_in_seq_list = []
         seq_lens = lod[0]
@@ -50,8 +51,8 @@ def _seq_to_batch(lod, is_reverse):
             for i in range(len(seq_lens)):
                 if seq_lens[sorted_seqs[i]] <= batch_idx:
                     break
-                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
-                       ) if is_reverse else (
+                idx = (seq_starts[sorted_seqs[i] + 1] - 1 -
+                       batch_idx) if is_reverse else (
                            seq_starts[sorted_seqs[i]] + batch_idx)
                 idx_in_seq.append(idx)
             idx_in_seq_list.append(idx_in_seq)
@@ -103,6 +104,7 @@ def _step(x, h_p, w, b, act_state, act_gate):
 
 
 class TestGRUOp(OpTest):
+
     def set_confs(self):
         pass
 
@@ -127,11 +129,11 @@ def setUp(self):
         N = len(self.lod[0])
         input = np.random.rand(T, 3 * self.D).astype(self.dtype)
         weight = np.random.rand(self.D, 3 * self.D).astype(self.dtype)
-        bias = np.random.rand(
-            1, 3 * self.D).astype(self.dtype) if self.with_bias else np.zeros(
+        bias = np.random.rand(1, 3 * self.D).astype(
+            self.dtype) if self.with_bias else np.zeros(
                 (1, 3 * self.D), dtype=self.dtype)
-        h0 = np.random.rand(
-            N, self.D).astype(self.dtype) if self.with_h0 else np.zeros(
+        h0 = np.random.rand(N, self.D).astype(
+            self.dtype) if self.with_h0 else np.zeros(
                 (N, self.D), dtype=self.dtype)
 
         batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru(
@@ -165,33 +167,38 @@ def test_check_output(self):
         self.check_output(atol=1e-8, check_dygraph=False)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'H0', 'Weight', 'Bias'], ['Hidden'], check_dygraph=False)
+        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'],
+                        check_dygraph=False)
 
 
 class TestGRUOriginMode(TestGRUOp):
+
     def set_confs(self):
         self.origin_mode = True
 
 
 class TestGRUOp2(TestGRUOp):
+
     def set_confs(self):
         self.dtype = 'float64'
 
 
 class TestGRUOp2Len0(TestGRUOp):
+
     def set_confs(self):
         self.lod = [[2, 0, 4]]
         self.dtype = 'float64'
 
 
 class TestGRUOp2OriginMode(TestGRUOp):
+
     def set_confs(self):
         self.dtype = 'float64'
         self.origin_mode = True
 
 
 class TestGRUOp2OriginModeLen0(TestGRUOp):
+
     def set_confs(self):
         self.lod = [[0, 3, 4]]
         self.dtype = 'float64'
@@ -199,6 +206,7 @@ def set_confs(self):
 
 
 class TestGRUOp2OriginModeLastLen0(TestGRUOp):
+
     def set_confs(self):
         self.lod = [[0, 3, 0]]
         self.dtype = 'float64'
@@ -206,35 +214,40 @@ def set_confs(self):
 
 
 class TestGRUOpNoInitial(TestGRUOp):
+
     def set_confs(self):
         self.with_h0 = False
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Hidden'], check_dygraph=False)
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'],
+                        check_dygraph=False)
 
 
 class TestGRUOpNoBias(TestGRUOp):
+
     def set_confs(self):
         self.with_bias = False
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'H0', 'Weight'], ['Hidden'], check_dygraph=False)
+        self.check_grad(['Input', 'H0', 'Weight'], ['Hidden'],
+                        check_dygraph=False)
 
 
 class TestGRUOpReverse(TestGRUOp):
+
     def set_confs(self):
         self.is_reverse = True
 
 
 class TestGRUOpReverseOriginMode(TestGRUOp):
+
     def set_confs(self):
         self.is_reverse = True
         self.origin_mode = True
 
 
 class TestGRUOpInference(TestGRUOp):
+
     def set_is_test(self):
         self.is_test = True
 
@@ -250,6 +263,7 @@ def test_check_grad(self):
 
 
 class TestGruOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -260,8 +274,9 @@ def test_Variable():
             self.assertRaises(TypeError, test_Variable)
 
             def test_h_0():
-                in_data = fluid.data(
-                    name="input", shape=[None, 1536], dtype="float32")
+                in_data = fluid.data(name="input",
+                                     shape=[None, 1536],
+                                     dtype="float32")
                 h = fluid.data(name="h", shape=[None, 512], dtype="int32")
                 fluid.layers.dynamic_gru(input=in_data, size=512, h_0=h)
 
diff --git a/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py b/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py
index 77b88161d3a72..abce0e1127884 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py
@@ -23,15 +23,18 @@
 import paddle.fluid.layers as layers
 import random
 import sys
+
 sys.path.append("./rnn")
 from rnn_numpy import GRU
 from convert import get_params_for_net
+
 random.seed(2)
 np.set_printoptions(threshold=np.inf)
 paddle.enable_static()
 
 
 class TestGRUOp(OpTest):
+
     def get_weight_names(self):
         weight_names = []
         for i in range(self.num_layers):
@@ -46,8 +49,7 @@ def setUp(self):
         self.op_type = "rnn"
         self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
         self.sequence_length = None if core.is_compiled_with_rocm(
-        ) else np.array(
-            [12, 11, 10, 9, 8, 7, 6, 5], dtype=np.int32)
+        ) else np.array([12, 11, 10, 9, 8, 7, 6, 5], dtype=np.int32)
         self.num_layers = 1
         self.is_bidirec = False
         self.is_test = False
@@ -62,9 +64,10 @@ def setUp(self):
         self.direction_num = 2 if self.is_bidirec else 1
         direction = "bidirectional" if self.is_bidirec else "forward"
 
-        input = np.random.uniform(
-            low=-0.1, high=0.1,
-            size=(seq_length, batch_size, input_size)).astype(self.dtype)
+        input = np.random.uniform(low=-0.1,
+                                  high=0.1,
+                                  size=(seq_length, batch_size,
+                                        input_size)).astype(self.dtype)
 
         if self.sequence_length is not None:
             input[3][1:][:] = 0
@@ -140,23 +143,27 @@ def test_grad(self):
 
 
 class TestGRUOp1(TestGRUOp):
+
     def set_attrs(self):
         self.sequence_length = None
 
 
 class TestGRUOp2(TestGRUOp):
+
     def set_attrs(self):
         self.sequence_length = None
         self.is_bidirec = True
 
 
 class TestGRUOp3(TestGRUOp):
+
     def set_attrs(self):
         self.sequence_length = None
         self.is_test = True
 
 
 class TestGRUOp4(TestGRUOp):
+
     def set_attrs(self):
         self.sequence_length = None
         self.is_bidirec = True
@@ -164,6 +171,7 @@ def set_attrs(self):
 
 
 class TestGRUOpAvx(TestGRUOp):
+
     def set_attrs(self):
         self.dtype = "float32"
         self.hidden_size = 8
diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
index 74afa7db2899b..ac70901c2eb35 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
@@ -25,13 +25,14 @@
 
 
 class TestGRUUnitAPIError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             D = 5
             layer = fluid.dygraph.nn.GRUUnit(size=D * 3)
             # the input must be Variable.
-            x0 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x0 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, layer, x0)
             # the input dtype must be float32 or float64
             x = fluid.data(name='x', shape=[-1, D * 3], dtype='float16')
@@ -63,14 +64,17 @@ def relu(x):
 
 
 class TestGRUUnitOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_size = 5
             hidden_dim = 40
-            input = fluid.data(
-                name='input', shape=[None, hidden_dim * 3], dtype='float32')
-            pre_hidden = fluid.data(
-                name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
+            input = fluid.data(name='input',
+                               shape=[None, hidden_dim * 3],
+                               dtype='float32')
+            pre_hidden = fluid.data(name='pre_hidden',
+                                    shape=[None, hidden_dim],
+                                    dtype='float32')
             np_input = np.random.uniform(
                 -0.1, 0.1, (batch_size, hidden_dim * 3)).astype('float64')
             np_pre_hidden = np.random.uniform(
@@ -87,19 +91,17 @@ def test_pre_hidden_Variable():
             self.assertRaises(TypeError, test_pre_hidden_Variable)
 
             def test_input_type():
-                error_input = fluid.data(
-                    name='error_input',
-                    shape=[None, hidden_dim * 3],
-                    dtype='int32')
+                error_input = fluid.data(name='error_input',
+                                         shape=[None, hidden_dim * 3],
+                                         dtype='int32')
                 gru_unit(error_input, pre_hidden, hidden_dim * 3)
 
             self.assertRaises(TypeError, test_input_type)
 
             def test_pre_hidden_type():
-                error_pre_hidden = fluid.data(
-                    name='error_pre_hidden',
-                    shape=[None, hidden_dim],
-                    dtype='int32')
+                error_pre_hidden = fluid.data(name='error_pre_hidden',
+                                              shape=[None, hidden_dim],
+                                              dtype='int32')
                 gru_unit(input, error_pre_hidden, hidden_dim * 3)
 
             self.assertRaises(TypeError, test_pre_hidden_type)
@@ -120,13 +122,16 @@ def set_inputs(self, origin_mode=False):
         frame_size = self.frame_size
         self.op_type = 'gru_unit'
         self.inputs = {
-            'Input': np.random.uniform(
-                -0.1, 0.1, (batch_size, frame_size * 3)).astype(self.dtype),
-            'HiddenPrev': np.random.uniform(
-                -0.1, 0.1, (batch_size, frame_size)).astype(self.dtype),
-            'Weight': np.random.uniform(
-                -1. / math.sqrt(frame_size), 1. / math.sqrt(frame_size),
-                (frame_size, frame_size * 3)).astype(self.dtype),
+            'Input':
+            np.random.uniform(-0.1, 0.1,
+                              (batch_size, frame_size * 3)).astype(self.dtype),
+            'HiddenPrev':
+            np.random.uniform(-0.1, 0.1,
+                              (batch_size, frame_size)).astype(self.dtype),
+            'Weight':
+            np.random.uniform(-1. / math.sqrt(frame_size),
+                              1. / math.sqrt(frame_size),
+                              (frame_size, frame_size * 3)).astype(self.dtype),
         }
         self.attrs = {
             'activation': GRUActivationType.tanh,
@@ -146,8 +151,9 @@ def set_outputs(self, origin_mode=False):
         g = x + np.tile(b, (batch_size, 1))
         w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
             (frame_size, frame_size * 2))
-        u_r = self.activate[self.attrs['gate_activation']](np.dot(
-            h_p, w_u_r) + g[:, :frame_size * 2])
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(h_p, w_u_r) +
+                                                           g[:, :frame_size *
+                                                             2])
         u = u_r[:, :frame_size]
         r = u_r[:, frame_size:frame_size * 2]
         r_h_p = r * h_p
@@ -180,6 +186,7 @@ def test_check_grad(self):
 
 
 class TestGRUUnitOpOriginMode(TestGRUUnitOp):
+
     def setUp(self):
         self.dtype = 'float32' if fluid.core.is_compiled_with_rocm(
         ) else 'float64'
@@ -188,6 +195,7 @@ def setUp(self):
 
 
 class TestGRUUnitOpWithBias(TestGRUUnitOp):
+
     def set_inputs(self, origin_mode=False):
         batch_size = self.batch_size
         frame_size = self.frame_size
@@ -204,12 +212,12 @@ def test_check_grad(self):
         self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
 
     def test_check_grad_ingore_input(self):
-        self.check_grad(
-            ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
-            no_grad_set=set('Input'))
+        self.check_grad(['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
+                        no_grad_set=set('Input'))
 
 
 class TestGRUUnitOpWithBiasOriginMode(TestGRUUnitOpWithBias):
+
     def setUp(self):
         self.dtype = 'float32' if fluid.core.is_compiled_with_rocm(
         ) else 'float64'
diff --git a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py
index 7c706eabd1d7a..650626883c735 100644
--- a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py
@@ -18,10 +18,12 @@
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 from paddle.fluid.framework import _test_eager_guard
+
 paddle.enable_static()
 
 
 class TestGumbelSoftmaxOp(OpTest):
+
     def init_attrs(self):
         self.shape = [20, 10]
         self.attrs = {"hard": True, "axis": -1}
@@ -51,6 +53,7 @@ def test_check_grad(self):
 
 
 class TestGumbelSoftmaxOp2(TestGumbelSoftmaxOp):
+
     def init_attrs(self):
         self.shape = [20, 10]
         self.attrs = {"hard": True, "axis": 0}
@@ -59,6 +62,7 @@ def init_attrs(self):
 
 
 class TestGumbelSoftmaxOp3(TestGumbelSoftmaxOp):
+
     def init_attrs(self):
         self.shape = [100]
         self.attrs = {"hard": True, "axis": -1}
@@ -67,6 +71,7 @@ def init_attrs(self):
 
 
 class TestGumbelSoftmaxOp4(TestGumbelSoftmaxOp):
+
     def init_attrs(self):
         self.shape = [20, 10, 5]
         self.attrs = {"hard": True, "axis": -1}
@@ -75,6 +80,7 @@ def init_attrs(self):
 
 
 class TestGumbelSoftmaxOp5(TestGumbelSoftmaxOp):
+
     def init_attrs(self):
         self.shape = [20, 10, 5]
         self.attrs = {"hard": True, "axis": 1}
@@ -83,6 +89,7 @@ def init_attrs(self):
 
 
 class TestGumbelSoftmaxOpSampleDistribution(OpTest):
+
     def softmax(self, x):
         x_row_max = x.max(axis=-1)
         x_row_max = x_row_max.reshape(list(x.shape)[:-1] + [1])
@@ -118,10 +125,10 @@ def test_check_output(self):
         # Experiment should result in batch num .
         self.assertEqual(self.counts.sum(), self.shape[0])
 
-        # Treat the probability from softmax as 
+        # Treat the probability from softmax as
         # the probability of binomial distribution.
         # Samples from gumbel softmax meet this binomial distribution.
-        # Construct statistics z for samples and 
+        # Construct statistics z for samples and
         # z is approximately N(0,1) for unbiased count
         expected = self.probs * self.shape[0]
         z = (self.counts - expected) / np.sqrt((expected * (1 - self.probs)))
@@ -134,6 +141,7 @@ def test_check_grad(self):
 
 
 class TestGumbelSoftmaxOpGrad(unittest.TestCase):
+
     def init_attrs(self):
         self.shape = [20, 10]
         self.dtype = "float64"
@@ -153,12 +161,13 @@ def test_dygraph_check(self):
         out_hard.sum().backward()
         out_soft.sum().backward()
 
-        self.assertEqual(
-            np.allclose(x_hard.grad.numpy(), x_soft.grad.numpy()), True)
+        self.assertEqual(np.allclose(x_hard.grad.numpy(), x_soft.grad.numpy()),
+                         True)
         paddle.enable_static()
 
 
 class TestGumbelSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32)
@@ -192,12 +201,13 @@ def test_check_api(self):
 
 
 class TestGumbelSoftmaxOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.disable_static()
 
         def test_Variable():
-            x1 = fluid.create_lod_tensor(
-                np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.zeros((100, 784)),
+                                         [[10, 10, 10, 70]], fluid.CPUPlace())
             paddle.nn.functional.gumbel_softmax(x1)
 
         self.assertRaises(ValueError, test_Variable)
@@ -224,8 +234,9 @@ def test_argument2():
 
         def test_dtype():
             with paddle.static.program_guard(paddle.static.Program()):
-                x_int32 = paddle.fluid.data(
-                    name='x_int32', shape=[2, 3], dtype='int32')
+                x_int32 = paddle.fluid.data(name='x_int32',
+                                            shape=[2, 3],
+                                            dtype='int32')
                 paddle.nn.functional.gumbel_softmax(x_int32)
 
         self.assertRaises(TypeError, test_dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
index 3fe8bca2f192e..fe0762909488a 100644
--- a/python/paddle/fluid/tests/unittests/test_hash_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hash_op.py
@@ -19,6 +19,7 @@
 
 
 class TestHashOp(OpTest):
+
     def setUp(self):
         self.op_type = "hash"
         self.init_test_case()
@@ -40,6 +41,7 @@ def test_check_output(self):
 
 
 class TestHashNotLoDOp(TestHashOp):
+
     def setUp(self):
         self.op_type = "hash"
         self.init_test_case()
@@ -96,14 +98,15 @@ def setUp(self):
 
     def init_test_case(self):
         self.in_seq = np.array([10, 5]).reshape((2, 1)).astype("int64")
-        self.out_seq = np.array(
-            [1204014882, 393011615, 3586283837, 2814821595]).reshape((2, 2, 1))
+        self.out_seq = np.array([1204014882, 393011615, 3586283837,
+                                 2814821595]).reshape((2, 2, 1))
 
     def test_check_output(self):
         self.check_output()
 
 
 class TestHashOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input_data = np.random.randint(0, 10, (8, 1)).astype("int32")
@@ -116,24 +119,30 @@ def test_Variable():
 
             def test_type():
                 # dtype must be int32, int64.
-                x2 = fluid.layers.data(
-                    name='x2', shape=[1], dtype="float32", lod_level=1)
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[1],
+                                       dtype="float32",
+                                       lod_level=1)
                 fluid.layers.hash(input=x2, hash_size=2**32)
 
             self.assertRaises(TypeError, test_type)
 
             def test_hash_size_type():
                 # hash_size dtype must be int32, int64.
-                x3 = fluid.layers.data(
-                    name='x3', shape=[1], dtype="int32", lod_level=1)
+                x3 = fluid.layers.data(name='x3',
+                                       shape=[1],
+                                       dtype="int32",
+                                       lod_level=1)
                 fluid.layers.hash(input=x3, hash_size=1024.5)
 
             self.assertRaises(TypeError, test_hash_size_type)
 
             def test_num_hash_type():
                 # num_hash dtype must be int32, int64.
-                x4 = fluid.layers.data(
-                    name='x4', shape=[1], dtype="int32", lod_level=1)
+                x4 = fluid.layers.data(name='x4',
+                                       shape=[1],
+                                       dtype="int32",
+                                       lod_level=1)
                 fluid.layers.hash(input=x4, hash_size=2**32, num_hash=2.5)
 
             self.assertRaises(TypeError, test_num_hash_type)
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py
index 65d12c31e39ab..2fa312fc20be9 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs1.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py
@@ -26,12 +26,12 @@
 
 
 class FSTest1(FSTestBase):
+
     def test_timeout(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=6 * 1000,
-            sleep_inter=100)
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
+                        None,
+                        time_out=6 * 1000,
+                        sleep_inter=100)
         src = "hdfs_test_timeout"
         dst = "new_hdfs_test_timeout"
         fs.delete(dst)
@@ -42,8 +42,8 @@ def test_timeout(self):
         cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
         try:
             fs.mv(src, dst, test_exists=False)
-            self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd,
-                                                                        output))
+            self.assertFalse(
+                1, "can't execute cmd:{} output:{}".format(cmd, output))
         except FSTimeOut as e:
             print("execute mv {} to {} timeout".format(src, dst))
 
@@ -52,11 +52,10 @@ def test_timeout(self):
         print("second mv ret:{} output:{}".format(ret, output))
 
     def test_is_dir(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=6 * 1000,
-            sleep_inter=100)
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
+                        None,
+                        time_out=6 * 1000,
+                        sleep_inter=100)
         self.assertFalse(fs.is_dir("./test_hdfs.py"))
         s = """
 java.io.IOException: Input/output error
@@ -78,18 +77,16 @@ def test_is_dir(self):
 
     def test_config(self):
         config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            config,
-            time_out=6 * 1000,
-            sleep_inter=100)
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
+                        config,
+                        time_out=6 * 1000,
+                        sleep_inter=100)
 
     def test_exists(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=6 * 1000,
-            sleep_inter=100)
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
+                        None,
+                        time_out=6 * 1000,
+                        sleep_inter=100)
         self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
         self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
         self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py
index a74fc558382fe..a77368d11a163 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs2.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py
@@ -26,12 +26,12 @@
 
 
 class FSTest2(FSTestBase):
+
     def test_hdfs(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=5 * 1000,
-            sleep_inter=100)
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
+                        None,
+                        time_out=5 * 1000,
+                        sleep_inter=100)
         self._test_rm(fs)
         self._test_touch(fs)
         self._test_dirs(fs)
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py
index 57b0b1ba45f24..450aceb5a747c 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs3.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py
@@ -26,12 +26,12 @@
 
 
 class FSTest3(FSTestBase):
+
     def test_hdfs(self):
-        fs = HDFSClient(
-            "/usr/local/hadoop-2.7.7/",
-            None,
-            time_out=5 * 1000,
-            sleep_inter=100)
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/",
+                        None,
+                        time_out=5 * 1000,
+                        sleep_inter=100)
         self._test_mkdirs(fs)
         self._test_list_dir(fs)
         self._test_try_upload(fs)
diff --git a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py
index 91c1b45cbca41..9f281e6bf3931 100644
--- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py
@@ -34,6 +34,7 @@ def calc_hinge_embedding_loss(input, label, margin=1.0, reduction='mean'):
 
 
 class TestFunctionalHingeEmbeddingLoss(unittest.TestCase):
+
     def setUp(self):
         self.margin = 1.0
         self.shape = (10, 10, 5)
@@ -51,37 +52,45 @@ def run_dynamic_check(self, place=paddle.CPUPlace()):
         self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, [1])
 
-        dy_result = paddle.nn.functional.hinge_embedding_loss(
-            input, label, reduction='sum')
-        expected = calc_hinge_embedding_loss(
-            self.input_np, self.label_np, reduction='sum')
+        dy_result = paddle.nn.functional.hinge_embedding_loss(input,
+                                                              label,
+                                                              reduction='sum')
+        expected = calc_hinge_embedding_loss(self.input_np,
+                                             self.label_np,
+                                             reduction='sum')
         self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, [1])
 
-        dy_result = paddle.nn.functional.hinge_embedding_loss(
-            input, label, reduction='none')
-        expected = calc_hinge_embedding_loss(
-            self.input_np, self.label_np, reduction='none')
+        dy_result = paddle.nn.functional.hinge_embedding_loss(input,
+                                                              label,
+                                                              reduction='none')
+        expected = calc_hinge_embedding_loss(self.input_np,
+                                             self.label_np,
+                                             reduction='none')
         self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, self.shape)
 
     def run_static_check(self, place=paddle.CPUPlace):
         paddle.enable_static()
         for reduction in ['none', 'mean', 'sum']:
-            expected = calc_hinge_embedding_loss(
-                self.input_np, self.label_np, reduction=reduction)
+            expected = calc_hinge_embedding_loss(self.input_np,
+                                                 self.label_np,
+                                                 reduction=reduction)
             with program_guard(Program(), Program()):
-                input = paddle.static.data(
-                    name="input", shape=self.shape, dtype=paddle.float64)
-                label = paddle.static.data(
-                    name="label", shape=self.shape, dtype=paddle.float64)
+                input = paddle.static.data(name="input",
+                                           shape=self.shape,
+                                           dtype=paddle.float64)
+                label = paddle.static.data(name="label",
+                                           shape=self.shape,
+                                           dtype=paddle.float64)
                 st_result = paddle.nn.functional.hinge_embedding_loss(
                     input, label, reduction=reduction)
                 exe = paddle.static.Executor(place)
-                result_numpy, = exe.run(
-                    feed={"input": self.input_np,
-                          "label": self.label_np},
-                    fetch_list=[st_result])
+                result_numpy, = exe.run(feed={
+                    "input": self.input_np,
+                    "label": self.label_np
+                },
+                                        fetch_list=[st_result])
                 self.assertTrue(np.allclose(result_numpy, expected))
 
     def test_cpu(self):
@@ -96,6 +105,7 @@ def test_gpu(self):
 
     # test case the raise message
     def test_reduce_errors(self):
+
         def test_value_error():
             loss = paddle.nn.functional.hinge_embedding_loss(
                 self.input_np, self.label_np, reduction='reduce_mean')
@@ -104,6 +114,7 @@ def test_value_error():
 
 
 class TestClassHingeEmbeddingLoss(unittest.TestCase):
+
     def setUp(self):
         self.margin = 1.0
         self.shape = (10, 10, 5)
@@ -124,37 +135,43 @@ def run_dynamic_check(self, place=paddle.CPUPlace()):
         hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(
             reduction='sum')
         dy_result = hinge_embedding_loss(input, label)
-        expected = calc_hinge_embedding_loss(
-            self.input_np, self.label_np, reduction='sum')
+        expected = calc_hinge_embedding_loss(self.input_np,
+                                             self.label_np,
+                                             reduction='sum')
         self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, [1])
 
         hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(
             reduction='none')
         dy_result = hinge_embedding_loss(input, label)
-        expected = calc_hinge_embedding_loss(
-            self.input_np, self.label_np, reduction='none')
+        expected = calc_hinge_embedding_loss(self.input_np,
+                                             self.label_np,
+                                             reduction='none')
         self.assertTrue(np.allclose(dy_result.numpy(), expected))
         self.assertTrue(dy_result.shape, self.shape)
 
     def run_static_check(self, place=paddle.CPUPlace):
         paddle.enable_static()
         for reduction in ['none', 'mean', 'sum']:
-            expected = calc_hinge_embedding_loss(
-                self.input_np, self.label_np, reduction=reduction)
+            expected = calc_hinge_embedding_loss(self.input_np,
+                                                 self.label_np,
+                                                 reduction=reduction)
             with program_guard(Program(), Program()):
-                input = paddle.static.data(
-                    name="input", shape=self.shape, dtype=paddle.float64)
-                label = paddle.static.data(
-                    name="label", shape=self.shape, dtype=paddle.float64)
+                input = paddle.static.data(name="input",
+                                           shape=self.shape,
+                                           dtype=paddle.float64)
+                label = paddle.static.data(name="label",
+                                           shape=self.shape,
+                                           dtype=paddle.float64)
                 hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(
                     reduction=reduction)
                 st_result = hinge_embedding_loss(input, label)
                 exe = paddle.static.Executor(place)
-                result_numpy, = exe.run(
-                    feed={"input": self.input_np,
-                          "label": self.label_np},
-                    fetch_list=[st_result])
+                result_numpy, = exe.run(feed={
+                    "input": self.input_np,
+                    "label": self.label_np
+                },
+                                        fetch_list=[st_result])
                 self.assertTrue(np.allclose(result_numpy, expected))
 
     def test_cpu(self):
@@ -169,6 +186,7 @@ def test_gpu(self):
 
     # test case the raise message
     def test_reduce_errors(self):
+
         def test_value_error():
             hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss(
                 reduction='reduce_mean')
diff --git a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
index 2e2d0d2ea4878..60ea132961e3c 100644
--- a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
@@ -20,6 +20,7 @@
 
 
 class TestHingeLossOp(OpTest):
+
     def setUp(self):
         self.op_type = 'hinge_loss'
         samples_num = 100
diff --git a/python/paddle/fluid/tests/unittests/test_histogram_op.py b/python/paddle/fluid/tests/unittests/test_histogram_op.py
index 819029c5fcd9d..17b7b95942fe2 100644
--- a/python/paddle/fluid/tests/unittests/test_histogram_op.py
+++ b/python/paddle/fluid/tests/unittests/test_histogram_op.py
@@ -62,10 +62,9 @@ def test_dygraph(self):
                 inputs_np = np.array([[2, 4, 2], [2, 5, 4]]).astype(np.int64)
                 inputs = paddle.to_tensor(inputs_np)
                 actual = paddle.histogram(inputs, bins=5, min=1, max=5)
-                self.assertTrue(
-                    (actual.numpy() == expected).all(),
-                    msg='histogram output is wrong, out =' +
-                    str(actual.numpy()))
+                self.assertTrue((actual.numpy() == expected).all(),
+                                msg='histogram output is wrong, out =' +
+                                str(actual.numpy()))
 
 
 class TestHistogramOpError(unittest.TestCase):
@@ -83,8 +82,9 @@ def test_bins_error(self):
         """Test bins should be greater than or equal to 1."""
 
         def net_func():
-            input_value = paddle.fluid.layers.fill_constant(
-                shape=[3, 4], dtype='float32', value=3.0)
+            input_value = paddle.fluid.layers.fill_constant(shape=[3, 4],
+                                                            dtype='float32',
+                                                            value=3.0)
             paddle.histogram(input=input_value, bins=-1, min=1, max=5)
 
         with self.assertRaises(IndexError):
@@ -94,8 +94,9 @@ def test_min_max_error(self):
         """Test max must be larger or equal to min."""
 
         def net_func():
-            input_value = paddle.fluid.layers.fill_constant(
-                shape=[3, 4], dtype='float32', value=3.0)
+            input_value = paddle.fluid.layers.fill_constant(shape=[3, 4],
+                                                            dtype='float32',
+                                                            value=3.0)
             paddle.histogram(input=input_value, bins=1, min=5, max=1)
 
         with self.assertRaises(ValueError):
@@ -105,8 +106,9 @@ def test_min_max_range_error(self):
         """Test range of min, max is not finite"""
 
         def net_func():
-            input_value = paddle.fluid.layers.fill_constant(
-                shape=[3, 4], dtype='float32', value=3.0)
+            input_value = paddle.fluid.layers.fill_constant(shape=[3, 4],
+                                                            dtype='float32',
+                                                            value=3.0)
             paddle.histogram(input=input_value, bins=1, min=-np.inf, max=5)
 
         with self.assertRaises(ValueError):
@@ -115,15 +117,24 @@ def net_func():
     def test_type_errors(self):
         with program_guard(Program()):
             # The input type must be Variable.
-            self.assertRaises(
-                TypeError, paddle.histogram, 1, bins=5, min=1, max=5)
+            self.assertRaises(TypeError,
+                              paddle.histogram,
+                              1,
+                              bins=5,
+                              min=1,
+                              max=5)
             # The input type must be 'int32', 'int64', 'float32', 'float64'
             x_bool = fluid.data(name='x_bool', shape=[4, 3], dtype='bool')
-            self.assertRaises(
-                TypeError, paddle.histogram, x_bool, bins=5, min=1, max=5)
+            self.assertRaises(TypeError,
+                              paddle.histogram,
+                              x_bool,
+                              bins=5,
+                              min=1,
+                              max=5)
 
 
 class TestHistogramOp(OpTest):
+
     def setUp(self):
         self.op_type = "histogram"
         self.init_test_case()
@@ -131,8 +142,9 @@ def setUp(self):
         self.python_api = paddle.histogram
         self.inputs = {"X": np_input}
         self.init_attrs()
-        Out, _ = np.histogram(
-            np_input, bins=self.bins, range=(self.min, self.max))
+        Out, _ = np.histogram(np_input,
+                              bins=self.bins,
+                              range=(self.min, self.max))
         self.outputs = {"Out": Out.astype(np.int64)}
 
     def init_test_case(self):
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index 51ff8ec943d01..fc8b0d114d5ac 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -34,6 +34,7 @@ def find_latest_set(num):
 
 
 class CodeTable(object):
+
     def __init__(self, num_classes, code):
         self.c = num_classes + code
 
@@ -48,6 +49,7 @@ def cal_bit(self, bit):
 
 
 class CodeTableWithCustomTree(object):
+
     def __init__(self, path_table, path_code, index):
         self.ptable_ = path_table
         self.pcode_ = path_code
@@ -171,6 +173,7 @@ def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
 
 
 class TestHSigmoidOp(OpTest):
+
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
         num_classes = 101
@@ -193,14 +196,16 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X', 'W', 'Bias'], ['Out'], user_defined_grads=self.user_grads)
+        self.check_grad(['X', 'W', 'Bias'], ['Out'],
+                        user_defined_grads=self.user_grads)
 
 
 @skip_check_grad_ci(
-    reason="For 'TestHSigmoidOpSparse', check_grad is separately calculated by 'TestHSigmoidOpWithSparseGrad'."
+    reason=
+    "For 'TestHSigmoidOpSparse', check_grad is separately calculated by 'TestHSigmoidOpWithSparseGrad'."
 )
 class TestHSigmoidOpSparse(OpTest):
+
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
         num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
@@ -210,13 +215,13 @@ def setUp(self):
         w = np.random.random((num_classes - 1, feature_size))
         label = np.array([0, 1, 4, 5]).astype('int64')
         path_table = np.array([
-            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), (0, 2, -1,
-                                                                       -1, -1)
+            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
+            (0, 2, -1, -1, -1)
         ]).astype(
             'int64')  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array(
-            [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
-             (0, 1, -1, -1, -1)]).astype('int64')  #np.array to store 
+        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1),
+                              (1, 0, 0, -1, -1), (0, 1, -1, -1, -1)
+                              ]).astype('int64')  #np.array to store
         bias = np.random.random((num_classes - 1, 1))
         self.attrs = {'num_classes': num_classes, 'is_sparse': True}
         self.inputs = {
@@ -236,12 +241,15 @@ def test_check_output(self):
 
 
 class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
+
     def hs_net_conf(self, is_sparse):
         input_word = fluid.layers.data(name="x", shape=[1], dtype='int64')
-        path_table = fluid.layers.data(
-            name='path_table', shape=[3], dtype='int64')
-        path_code = fluid.layers.data(
-            name='path_code', shape=[3], dtype='int64')
+        path_table = fluid.layers.data(name='path_table',
+                                       shape=[3],
+                                       dtype='int64')
+        path_code = fluid.layers.data(name='path_code',
+                                      shape=[3],
+                                      dtype='int64')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
         data_list = [input_word, path_table, path_code, label]
@@ -253,15 +261,14 @@ def hs_net_conf(self, is_sparse):
             param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                 scale=1 / math.sqrt(3))))
 
-        cost = fluid.layers.hsigmoid(
-            input=emb,
-            label=label,
-            bias_attr=True,
-            num_classes=3,
-            path_table=path_table,
-            path_code=path_code,
-            is_custom=True,
-            is_sparse=is_sparse)
+        cost = fluid.layers.hsigmoid(input=emb,
+                                     label=label,
+                                     bias_attr=True,
+                                     num_classes=3,
+                                     path_table=path_table,
+                                     path_code=path_code,
+                                     is_custom=True,
+                                     is_sparse=is_sparse)
 
         avg_cost = fluid.layers.reduce_mean(cost)
 
@@ -304,9 +311,11 @@ def test_hs_grad_with_sparse(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] The huffman tree is structed separately. It will be complicated if use large shape."
+    reason=
+    "[skip shape check] The huffman tree is structed separately. It will be complicated if use large shape."
 )
 class TestHSigmoidOpWithCostumTree(OpTest):
+
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
         num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
@@ -316,13 +325,13 @@ def setUp(self):
         w = np.random.uniform(-1, 1, (num_classes - 1, feature_size))
         label = np.array([0, 1, 4, 5]).astype('int64')
         path_table = np.array([
-            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), (0, 2, -1,
-                                                                       -1, -1)
+            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
+            (0, 2, -1, -1, -1)
         ]).astype(
             'int64')  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array(
-            [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
-             (0, 1, -1, -1, -1)]).astype('int64')  #np.array to store 
+        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1),
+                              (1, 0, 0, -1, -1), (0, 1, -1, -1, -1)
+                              ]).astype('int64')  #np.array to store
         bias = np.random.random((num_classes - 1, 1))
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
@@ -345,9 +354,11 @@ def test_check_grad(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] The huffman tree is structed separately. It will be complicated if use large shape."
+    reason=
+    "[skip shape check] The huffman tree is structed separately. It will be complicated if use large shape."
 )
 class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
+
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
         num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
@@ -357,13 +368,13 @@ def setUp(self):
         w = np.random.uniform(-1, 1, (num_classes - 1, feature_size))
         label = np.array([0, 1, 4, 5]).astype('int64')
         path_table = np.array([
-            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), (0, 2, -1,
-                                                                       -1, -1)
+            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
+            (0, 2, -1, -1, -1)
         ]).astype(
             'int64')  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array(
-            [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
-             (0, 1, -1, -1, -1)]).astype('int64')  #np.array to store 
+        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1),
+                              (1, 0, 0, -1, -1), (0, 1, -1, -1, -1)
+                              ]).astype('int64')  #np.array to store
         # bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
@@ -373,14 +384,13 @@ def setUp(self):
             'PathCode': path_code,
             'Label': label,
         }
-        pre_output, out = hsigmoidWithCustomTree(
-            x=x,
-            w=w,
-            path_table=path_table,
-            path_code=path_code,
-            label=label,
-            bias=None,
-            num_classes=num_classes)
+        pre_output, out = hsigmoidWithCustomTree(x=x,
+                                                 w=w,
+                                                 path_table=path_table,
+                                                 path_code=path_code,
+                                                 label=label,
+                                                 bias=None,
+                                                 num_classes=num_classes)
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
@@ -404,12 +414,13 @@ def setUp(self):
 
         self.x_np = np.random.uniform(
             -1, 1, [self.batch_size, self.feature_size]).astype(self.dtype)
-        self.labels_np = np.random.randint(
-            self.num_classes, size=(self.batch_size, 1), dtype='int64')
+        self.labels_np = np.random.randint(self.num_classes,
+                                           size=(self.batch_size, 1),
+                                           dtype='int64')
         self.weight_np = np.random.uniform(
             -1, 1, [self.num_classes - 1, self.feature_size]).astype(self.dtype)
-        self.bias_np = np.random.uniform(-1, 1, (
-            self.num_classes - 1, )).astype(self.dtype)
+        self.bias_np = np.random.uniform(
+            -1, 1, (self.num_classes - 1, )).astype(self.dtype)
         self.path_table_np = None
         self.path_code_np = None
         _, self.out_np = hsigmoid(self.x_np, self.weight_np, self.labels_np,
@@ -417,10 +428,12 @@ def setUp(self):
         self.set_attrs()
 
         if self.is_custom:
-            _, self.out_np = hsigmoidWithCustomTree(
-                self.x_np, self.weight_np, self.path_table_np,
-                self.path_code_np, self.labels_np,
-                self.bias_np.reshape(-1, 1), self.num_classes)
+            _, self.out_np = hsigmoidWithCustomTree(self.x_np, self.weight_np,
+                                                    self.path_table_np,
+                                                    self.path_code_np,
+                                                    self.labels_np,
+                                                    self.bias_np.reshape(-1, 1),
+                                                    self.num_classes)
 
     def set_attrs(self):
         pass
@@ -456,7 +469,9 @@ def test_static_api(self):
             x = paddle.static.data('x', [-1, self.feature_size])
             labels = paddle.static.data('labels', [-1, 1], 'int64')
             weight = paddle.static.data('weight', [-1, self.feature_size])
-            bias = paddle.static.data('bias', [-1, ])
+            bias = paddle.static.data('bias', [
+                -1,
+            ])
             path_table = None
             path_code = None
             if self.is_custom:
@@ -544,36 +559,33 @@ def test_errors(self):
                               weight_int32)
 
             bias_int32 = paddle.static.data('bias_int32', [7], 'int32')
-            self.assertRaises(
-                TypeError,
-                F.hsigmoid_loss,
-                x,
-                label,
-                8,
-                weight,
-                bias=bias_int32)
+            self.assertRaises(TypeError,
+                              F.hsigmoid_loss,
+                              x,
+                              label,
+                              8,
+                              weight,
+                              bias=bias_int32)
 
             path_table_int32 = paddle.static.data('path_table_int32', [7],
                                                   'int32')
-            self.assertRaises(
-                TypeError,
-                F.hsigmoid_loss,
-                x,
-                label,
-                8,
-                weight,
-                path_table=path_table_int32)
+            self.assertRaises(TypeError,
+                              F.hsigmoid_loss,
+                              x,
+                              label,
+                              8,
+                              weight,
+                              path_table=path_table_int32)
 
             path_code_int32 = paddle.static.data('path_code_int32', [7],
                                                  'int32')
-            self.assertRaises(
-                TypeError,
-                F.hsigmoid_loss,
-                x,
-                label,
-                8,
-                weight,
-                path_code=path_code_int32)
+            self.assertRaises(TypeError,
+                              F.hsigmoid_loss,
+                              x,
+                              label,
+                              8,
+                              weight,
+                              path_code=path_code_int32)
 
         # test paddle.nn.HSigmoidLoss
         paddle.disable_static(self.place)
@@ -611,12 +623,15 @@ def test_errors(self):
 
 
 class TestHSigmoidLossAPICustom(TestHSigmoidLossAPI):
+
     def set_attrs(self):
         self.is_custom = True
-        self.path_table_np = np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (
-            0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64)
-        self.path_code_np = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64)
+        self.path_table_np = np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1),
+                                       (0, 1, 4, -1, -1),
+                                       (0, 2, -1, -1, -1)]).astype(np.int64)
+        self.path_code_np = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1),
+                                      (1, 0, 0, -1, -1),
+                                      (0, 1, -1, -1, -1)]).astype(np.int64)
 
     def test_errors(self):
         pass
diff --git a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
index 9a0437ad2f556..13460af90eded 100644
--- a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
@@ -31,6 +31,7 @@ def huber_loss_forward(val, delta):
 
 
 class TestHuberLossOp(OpTest):
+
     def setUp(self):
         self.op_type = 'huber_loss'
         self.python_api = paddle.fluid.layers.huber_loss
@@ -61,30 +62,38 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', check_eager=True)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.008, no_grad_set=set("residual"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.008,
+                        no_grad_set=set("residual"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.008,
+                        no_grad_set=set('residual'))
 
 
 def TestHuberLossOp1(TestHuberLossOp):
+
     def set_shape(self):
         return (64)
 
 
 def TestHuberLossOp2(TestHuberLossOp):
+
     def set_shape(self):
         return (6, 6)
 
 
 def TestHuberLossOp3(TestHuberLossOp):
+
     def set_shape(self):
         return (6, 6, 1)
 
 
 class TestHuberLossOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input and label must be Variable
diff --git a/python/paddle/fluid/tests/unittests/test_hybrid_parallel_inference_helper.py b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_inference_helper.py
index c7c3c87fadce3..d10673829f3e3 100644
--- a/python/paddle/fluid/tests/unittests/test_hybrid_parallel_inference_helper.py
+++ b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_inference_helper.py
@@ -21,6 +21,7 @@
 
 
 class TestHybridParallelInferenceHelper(TestMultipleGpus):
+
     def test_hybrid_parallel_inference_helper(self):
         self.run_mnist_2gpu('hybrid_parallel_inference_helper.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py
index e8300113ddc42..e3d341bcc0201 100644
--- a/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py
+++ b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py
@@ -21,6 +21,7 @@
 
 
 class TestCommunicateTopology(unittest.TestCase):
+
     def test_topology(self):
         topo = fleet.CommunicateTopology(["dp", "mp", "pp"], [2, 2, 2])
 
@@ -151,20 +152,20 @@ def test_topology_4D(self):
 
         # test get_axis_list
         self.assertEqual(topo.get_axis_list("dp", 0), [0, 1, 2, 3, 4, 5, 6, 7])
-        self.assertEqual(
-            topo.get_axis_list("dp", 1), [8, 9, 10, 11, 12, 13, 14, 15])
-        self.assertEqual(
-            topo.get_axis_list("mp", 0), [0, 2, 4, 6, 8, 10, 12, 14])
-        self.assertEqual(
-            topo.get_axis_list("mp", 1), [1, 3, 5, 7, 9, 11, 13, 15])
-        self.assertEqual(
-            topo.get_axis_list("pp", 0), [0, 1, 2, 3, 8, 9, 10, 11])
-        self.assertEqual(
-            topo.get_axis_list("pp", 1), [4, 5, 6, 7, 12, 13, 14, 15])
-        self.assertEqual(
-            topo.get_axis_list("sharding", 0), [0, 1, 4, 5, 8, 9, 12, 13])
-        self.assertEqual(
-            topo.get_axis_list("sharding", 1), [2, 3, 6, 7, 10, 11, 14, 15])
+        self.assertEqual(topo.get_axis_list("dp", 1),
+                         [8, 9, 10, 11, 12, 13, 14, 15])
+        self.assertEqual(topo.get_axis_list("mp", 0),
+                         [0, 2, 4, 6, 8, 10, 12, 14])
+        self.assertEqual(topo.get_axis_list("mp", 1),
+                         [1, 3, 5, 7, 9, 11, 13, 15])
+        self.assertEqual(topo.get_axis_list("pp", 0),
+                         [0, 1, 2, 3, 8, 9, 10, 11])
+        self.assertEqual(topo.get_axis_list("pp", 1),
+                         [4, 5, 6, 7, 12, 13, 14, 15])
+        self.assertEqual(topo.get_axis_list("sharding", 0),
+                         [0, 1, 4, 5, 8, 9, 12, 13])
+        self.assertEqual(topo.get_axis_list("sharding", 1),
+                         [2, 3, 6, 7, 10, 11, 14, 15])
 
         # test get_dim_size
         self.assertEqual(topo.get_dim_size("dp"), 2)
diff --git a/python/paddle/fluid/tests/unittests/test_identity_op.py b/python/paddle/fluid/tests/unittests/test_identity_op.py
index 5c2ff2138ee40..17174b0d8e9ec 100644
--- a/python/paddle/fluid/tests/unittests/test_identity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_identity_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class TestIdentityAPI(unittest.TestCase):
+
     def setUp(self):
         self.shape = [4, 4]
         self.x = np.random.random((4, 4)).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
index c540531e7cffe..bdfcf739b8f53 100644
--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
@@ -87,10 +87,9 @@ def im2col(attrs, im, col):
                         im_col_offset = col_col_idx * stride_width \
                             + filter_col_idx - padding_width
 
-                        if (im_row_offset < 0 or
-                                im_row_offset >= input_height or
-                                im_col_offset < 0 or
-                                im_col_offset >= input_width):
+                        if (im_row_offset < 0 or im_row_offset >= input_height
+                                or im_col_offset < 0
+                                or im_col_offset >= input_width):
                             col[col_row_idx][col_col_idx][channel][\
                                 filter_row_idx][filter_col_idx] = 0.0
                         else:
@@ -125,6 +124,7 @@ def Im2Sequence(inputs, img_real_size, attrs):
 
 
 class TestBlockExpandOp(OpTest):
+
     def config(self):
         self.batch_size = 1
         self.img_channels = 3
@@ -155,6 +155,7 @@ def test_check_grad_normal(self):
 
 
 class TestBlockExpandOpCase2(TestBlockExpandOp):
+
     def config(self):
         self.batch_size = 2
         self.img_channels = 3
@@ -168,6 +169,7 @@ def config(self):
 
 
 class TestBlockExpandOpCase3(TestBlockExpandOp):
+
     def config(self):
         self.batch_size = 6
         self.img_channels = 1
@@ -181,6 +183,7 @@ def config(self):
 
 
 class TestBlockExpandOpCase4(TestBlockExpandOp):
+
     def config(self):
         self.batch_size = 6
         self.img_channels = 2
@@ -194,9 +197,11 @@ def config(self):
 
 
 @skip_check_grad_ci(
-    reason="Since 'real_size' is used just in forward computation, we don't test the gradient here."
+    reason=
+    "Since 'real_size' is used just in forward computation, we don't test the gradient here."
 )
 class TestBlockExpandOpCase5(OpTest):
+
     def config(self):
         self.batch_size = 1
         self.img_channels = 3
@@ -225,6 +230,7 @@ def test_check_output(self):
 
 
 class TestBlockExpandOpCase6(TestBlockExpandOpCase5):
+
     def config(self):
         self.batch_size = 3
         self.img_channels = 1
@@ -240,6 +246,7 @@ def config(self):
 
 
 class TestBlockExpandOpCase7(TestBlockExpandOpCase6):
+
     def config(self):
         self.batch_size = 2
         self.img_channels = 2
diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
index 405637969af6f..a4404e0093913 100644
--- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
@@ -22,25 +22,26 @@
 
 
 def conv_block(input, num_filter, groups, dropouts):
-    return nets.img_conv_group(
-        input=input,
-        pool_size=2,
-        pool_stride=2,
-        conv_num_filter=[num_filter] * groups,
-        conv_filter_size=3,
-        conv_act='relu',
-        conv_with_batchnorm=True,
-        conv_batchnorm_drop_rate=dropouts,
-        pool_type='max')
+    return nets.img_conv_group(input=input,
+                               pool_size=2,
+                               pool_stride=2,
+                               conv_num_filter=[num_filter] * groups,
+                               conv_filter_size=3,
+                               conv_act='relu',
+                               conv_with_batchnorm=True,
+                               conv_batchnorm_drop_rate=dropouts,
+                               pool_type='max')
 
 
 class TestLayer(unittest.TestCase):
+
     def test_batch_norm_layer(self):
         main_program = Program()
         startup_program = Program()
         with fluid.program_guard(main_program, startup_program):
-            images = fluid.layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
+            images = fluid.layers.data(name='pixel',
+                                       shape=[3, 48, 48],
+                                       dtype='float32')
             hidden1 = fluid.layers.batch_norm(input=images)
             hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
             fluid.layers.batch_norm(input=hidden2)
@@ -51,8 +52,9 @@ def test_dropout_layer(self):
         main_program = Program()
         startup_program = Program()
         with fluid.program_guard(main_program, startup_program):
-            images = fluid.layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
+            images = fluid.layers.data(name='pixel',
+                                       shape=[3, 48, 48],
+                                       dtype='float32')
             fluid.layers.dropout(x=images, dropout_prob=0.5)
 
         print(str(main_program))
@@ -62,8 +64,9 @@ def test_img_conv_group(self):
         startup_program = Program()
 
         with fluid.program_guard(main_program, startup_program):
-            images = fluid.layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
+            images = fluid.layers.data(name='pixel',
+                                       shape=[3, 48, 48],
+                                       dtype='float32')
             conv1 = conv_block(images, 64, 2, [0.3, 0])
             conv_block(conv1, 256, 3, [0.4, 0.4, 0])
 
@@ -73,10 +76,12 @@ def test_elementwise_add_with_act(self):
         main_program = Program()
         startup_program = Program()
         with fluid.program_guard(main_program, startup_program):
-            image1 = fluid.layers.data(
-                name='pixel1', shape=[3, 48, 48], dtype='float32')
-            image2 = fluid.layers.data(
-                name='pixel2', shape=[3, 48, 48], dtype='float32')
+            image1 = fluid.layers.data(name='pixel1',
+                                       shape=[3, 48, 48],
+                                       dtype='float32')
+            image2 = fluid.layers.data(name='pixel2',
+                                       shape=[3, 48, 48],
+                                       dtype='float32')
             fluid.layers.elementwise_add(x=image1, y=image2, act='relu')
         print(main_program)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
index d200b77eea83f..f06bb96ae9298 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
@@ -28,6 +28,7 @@
 
 
 class SimpleConv(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -36,22 +37,22 @@ def __init__(self,
                  groups=1,
                  act=None):
         super(SimpleConv, self).__init__()
-        self._conv = fluid.dygraph.Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=None,
-            use_cudnn=True)
+        self._conv = fluid.dygraph.Conv2D(num_channels=num_channels,
+                                          num_filters=num_filters,
+                                          filter_size=filter_size,
+                                          stride=stride,
+                                          padding=(filter_size - 1) // 2,
+                                          groups=groups,
+                                          act=None,
+                                          bias_attr=None,
+                                          use_cudnn=True)
 
     def forward(self, inputs):
         return self._conv(inputs)
 
 
 class TestAutoCast(unittest.TestCase):
+
     def amp_guard_white_op(self):
         data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
         with fluid.dygraph.guard():
@@ -92,31 +93,30 @@ def custom_op_list(self):
             tracer = fluid.framework._dygraph_tracer()
             base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST
             base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST
-            with fluid.dygraph.amp_guard(
-                    custom_white_list=["log"], custom_black_list=["conv2d"]):
+            with fluid.dygraph.amp_guard(custom_white_list=["log"],
+                                         custom_black_list=["conv2d"]):
                 white_list, black_list = tracer._get_amp_op_list()
                 self.assertTrue(
-                    set(white_list) ==
-                    (set(base_white_list) | {"log"}) - {"conv2d"})
+                    set(white_list) == (set(base_white_list) | {"log"}) -
+                    {"conv2d"})
 
                 self.assertTrue(
-                    set(black_list) ==
-                    (set(base_black_list) - {"log"}) | {"conv2d"})
+                    set(black_list) == (set(base_black_list) - {"log"})
+                    | {"conv2d"})
 
             base_white_list = fluid.dygraph.amp.auto_cast.PURE_FP16_WHITE_LIST
             base_black_list = fluid.dygraph.amp.auto_cast.PURE_FP16_BLACK_LIST
-            with fluid.dygraph.amp_guard(
-                    custom_white_list=["log"],
-                    custom_black_list=["conv2d"],
-                    level='O2'):
+            with fluid.dygraph.amp_guard(custom_white_list=["log"],
+                                         custom_black_list=["conv2d"],
+                                         level='O2'):
                 white_list, black_list = tracer._get_amp_op_list()
                 self.assertTrue(
-                    set(white_list) ==
-                    (set(base_white_list) | {"log"}) - {"conv2d"})
+                    set(white_list) == (set(base_white_list) | {"log"}) -
+                    {"conv2d"})
 
                 self.assertTrue(
-                    set(black_list) ==
-                    (set(base_black_list) - {"log"}) | {"conv2d"})
+                    set(black_list) == (set(base_black_list) - {"log"})
+                    | {"conv2d"})
 
     def test_custom_op_list(self):
         with _test_eager_guard():
@@ -128,15 +128,13 @@ def custom_op_list_exception(self):
 
         def func():
             with fluid.dygraph.guard():
-                model = SimpleConv(
-                    num_channels=3,
-                    num_filters=64,
-                    filter_size=7,
-                    stride=2,
-                    act='relu')
-                with fluid.dygraph.amp_guard(
-                        custom_white_list=["conv2d"],
-                        custom_black_list=["conv2d"]):
+                model = SimpleConv(num_channels=3,
+                                   num_filters=64,
+                                   filter_size=7,
+                                   stride=2,
+                                   act='relu')
+                with fluid.dygraph.amp_guard(custom_white_list=["conv2d"],
+                                             custom_black_list=["conv2d"]):
                     inp = fluid.dygraph.to_variable(inp_np)
                     out = model(inp)
 
@@ -177,11 +175,15 @@ def test_amp_guard_upsupported_fp16_op(self):
         self.amp_guard_upsupported_fp16_op()
 
     def mode_exception(self):
+
         def func():
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             with fluid.dygraph.guard():
-                conv2d = fluid.dygraph.Conv2D(
-                    3, 2, 3, bias_attr=False, act=None)
+                conv2d = fluid.dygraph.Conv2D(3,
+                                              2,
+                                              3,
+                                              bias_attr=False,
+                                              act=None)
                 data = fluid.dygraph.to_variable(data)
                 with fluid.dygraph.amp_guard(level='O'):
                     out = conv2d(data)
@@ -195,13 +197,15 @@ def test_mode_exception(self):
 
 
 class TestAmpScaler(unittest.TestCase):
+
     def scale(self):
         with fluid.dygraph.guard():
             data = paddle.rand([10, 1024])
             scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024)
             scaled_data = scaler.scale(data)
             self.assertEqual(
-                np.array_equal(scaled_data.numpy(), data.numpy() * 1024), True)
+                np.array_equal(scaled_data.numpy(),
+                               data.numpy() * 1024), True)
 
     def test_scale(self):
         with _test_eager_guard():
@@ -215,12 +219,11 @@ def run_simple_conv(inp_np, use_scaler=True):
             paddle.seed(10)
             paddle.framework.random._manual_program_seed(10)
             with fluid.dygraph.guard():
-                model = SimpleConv(
-                    num_channels=3,
-                    num_filters=64,
-                    filter_size=7,
-                    stride=2,
-                    act='relu')
+                model = SimpleConv(num_channels=3,
+                                   num_filters=64,
+                                   filter_size=7,
+                                   stride=2,
+                                   act='relu')
                 optimizer = fluid.optimizer.SGDOptimizer(
                     learning_rate=0.01, parameter_list=model.parameters())
                 scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
@@ -232,8 +235,8 @@ def run_simple_conv(inp_np, use_scaler=True):
                     print('use scaler')
                     scaled_loss = scaler.scale(loss)
                     scaled_loss.backward()
-                    optimize_ops, params_grads = scaler.minimize(optimizer,
-                                                                 scaled_loss)
+                    optimize_ops, params_grads = scaler.minimize(
+                        optimizer, scaled_loss)
                 else:
                     print('use no scaler')
                     loss.backward()
@@ -269,12 +272,11 @@ def run_simple_conv(inp_np, use_scaler=True):
             paddle.seed(10)
             paddle.framework.random._manual_program_seed(10)
             with fluid.dygraph.guard():
-                model = SimpleConv(
-                    num_channels=3,
-                    num_filters=64,
-                    filter_size=7,
-                    stride=2,
-                    act='relu')
+                model = SimpleConv(num_channels=3,
+                                   num_filters=64,
+                                   filter_size=7,
+                                   stride=2,
+                                   act='relu')
                 optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                                  parameters=model.parameters())
                 scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
@@ -312,12 +314,11 @@ def nan_inf(self):
         inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
         inp_np[0][1][2][3] = np.nan
         with fluid.dygraph.guard():
-            model = SimpleConv(
-                num_channels=3,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
+            model = SimpleConv(num_channels=3,
+                               num_filters=64,
+                               filter_size=7,
+                               stride=2,
+                               act='relu')
             params_init = {}
             for param in model.parameters():
                 params_init[param.name] = param.numpy()
@@ -344,6 +345,7 @@ def test_nan_inf(self):
         self.nan_inf()
 
     def step_update_exception(self):
+
         def func1():
             model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
             optimizer = paddle.optimizer.SGD(learning_rate=0.01,
@@ -396,14 +398,13 @@ def test_step_update_exception(self):
 
     def test_get_and_set(self):
         with fluid.dygraph.guard():
-            scaler = paddle.amp.GradScaler(
-                enable=True,
-                init_loss_scaling=1024,
-                incr_ratio=2.0,
-                decr_ratio=0.5,
-                incr_every_n_steps=1000,
-                decr_every_n_nan_or_inf=2,
-                use_dynamic_loss_scaling=True)
+            scaler = paddle.amp.GradScaler(enable=True,
+                                           init_loss_scaling=1024,
+                                           incr_ratio=2.0,
+                                           decr_ratio=0.5,
+                                           incr_every_n_steps=1000,
+                                           decr_every_n_nan_or_inf=2,
+                                           use_dynamic_loss_scaling=True)
             self.assertEqual(scaler.is_enable() == True, True)
             self.assertEqual(scaler.get_init_loss_scaling() == 1024, True)
             self.assertEqual(scaler.get_incr_ratio() == 2.0, True)
@@ -424,14 +425,13 @@ def test_get_and_set(self):
 
     def test_state_dict_and_load_state_dict(self):
         with fluid.dygraph.guard():
-            scaler1 = paddle.amp.GradScaler(
-                enable=True,
-                init_loss_scaling=14,
-                incr_ratio=233.0,
-                decr_ratio=0.523,
-                incr_every_n_steps=1090,
-                decr_every_n_nan_or_inf=20,
-                use_dynamic_loss_scaling=True)
+            scaler1 = paddle.amp.GradScaler(enable=True,
+                                            init_loss_scaling=14,
+                                            incr_ratio=233.0,
+                                            decr_ratio=0.523,
+                                            incr_every_n_steps=1090,
+                                            decr_every_n_nan_or_inf=20,
+                                            use_dynamic_loss_scaling=True)
             scaler_state = scaler1.state_dict()
             scaler2 = paddle.amp.GradScaler(enable=True)
             scaler2.load_state_dict(scaler_state)
@@ -446,6 +446,7 @@ def test_state_dict_and_load_state_dict(self):
             self.assertEqual(scaler3.is_enable() == False, True)
 
     def test_state_dict_and_load_state_dict_error(self):
+
         def test_error():
             state_empty = {}
             scaler = paddle.amp.GradScaler(enable=True)
@@ -455,6 +456,7 @@ def test_error():
 
 
 def reader_decorator(reader):
+
     def __reader__():
         for item in reader():
             img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
@@ -465,6 +467,7 @@ def __reader__():
 
 
 class TestGradScalerStateDict(unittest.TestCase):
+
     def train_resnet(self,
                      enable_amp=True,
                      use_data_loader=True,
@@ -478,8 +481,8 @@ def train_resnet(self,
         paddle.framework.random._manual_program_seed(seed)
 
         resnet = ResNet(use_cudnn=True)
-        optimizer = optimizer_setting(
-            train_parameters, parameter_list=resnet.parameters())
+        optimizer = optimizer_setting(train_parameters,
+                                      parameter_list=resnet.parameters())
         np.random.seed(seed)
         train_reader = paddle.batch(
             paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
@@ -489,14 +492,14 @@ def train_resnet(self,
             dy_param_init_value[param.name] = param.numpy()
 
         program = None
-        scaler = paddle.amp.GradScaler(
-            enable=enable_amp, init_loss_scaling=2.**10)
+        scaler = paddle.amp.GradScaler(enable=enable_amp,
+                                       init_loss_scaling=2.**10)
 
         if use_data_loader:
-            train_reader = paddle.batch(
-                reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
-                batch_size=batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(reader_decorator(
+                paddle.dataset.flowers.train(use_xmap=False)),
+                                        batch_size=batch_size,
+                                        drop_last=True)
             train_loader = fluid.io.DataLoader.from_generator(
                 capacity=4,
                 use_double_buffer=True,
@@ -516,8 +519,8 @@ def train_resnet(self,
                 if len(np.array([x[1]
                                  for x in data]).astype('int64')) != batch_size:
                     continue
-                y_data = np.array(
-                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(-1, 1)
 
                 img = paddle.to_tensor(dy_x_data)
                 label = paddle.to_tensor(y_data)
@@ -540,8 +543,8 @@ def train_resnet(self,
             for param in resnet.parameters():
                 if param.trainable:
                     np_array = np.array(param._grad_ivar().value().get_tensor())
-                    dy_grad_value[param.name + fluid.core.grad_var_suffix(
-                    )] = np_array
+                    dy_grad_value[param.name +
+                                  fluid.core.grad_var_suffix()] = np_array
 
             resnet.clear_gradients()
 
@@ -558,12 +561,15 @@ def train_resnet(self,
         return dy_out, dy_param_value, dy_grad_value
 
     def test_with_state_dict(self):
+
         def func_isinstance():
             with fluid.dygraph.guard():
-                out_use_state_dict = self.train_resnet(
-                    enable_amp=True, use_data_loader=True, use_save_load=True)
-                out_no_state_dict = self.train_resnet(
-                    enable_amp=True, use_data_loader=True, use_save_load=False)
+                out_use_state_dict = self.train_resnet(enable_amp=True,
+                                                       use_data_loader=True,
+                                                       use_save_load=True)
+                out_no_state_dict = self.train_resnet(enable_amp=True,
+                                                      use_data_loader=True,
+                                                      use_save_load=False)
             print('save_load:', out_use_state_dict[0], out_no_state_dict[0])
             self.assertTrue(
                 np.allclose(out_use_state_dict[0], out_no_state_dict[0]))
@@ -574,19 +580,25 @@ def func_isinstance():
 
 
 class TestAmpDecorator(unittest.TestCase):
+
     def test_mode_exception(self):
+
         def func():
             with fluid.dygraph.guard():
                 model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
                 opt = paddle.optimizer.SGD(parameters=model.parameters())
-                model, opt = paddle.amp.decorate(
-                    models=model, optimizers=opt, level='O')
+                model, opt = paddle.amp.decorate(models=model,
+                                                 optimizers=opt,
+                                                 level='O')
 
         self.assertRaises(ValueError, func)
 
     def test_input_type_exception(self):
+
         def test_error_model():
+
             class MyModel(object):
+
                 def __init__(self):
                     print("A fake Model")
 
@@ -605,7 +617,9 @@ def test_error_distributed_model():
         self.assertRaises(RuntimeError, test_error_distributed_model)
 
         def test_error_optimizer():
+
             class MyOptimizer(object):
+
                 def __init__(self):
                     print("A fake Optimizer")
 
@@ -618,51 +632,48 @@ def __init__(self):
 
     def test_set_master_weight(self):
         model1 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
-        opt1 = paddle.optimizer.Adam(
-            learning_rate=0.0001,
-            parameters=model1.parameters(),
-            multi_precision=True)
+        opt1 = paddle.optimizer.Adam(learning_rate=0.0001,
+                                     parameters=model1.parameters(),
+                                     multi_precision=True)
 
         model2 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
-        opt2 = paddle.optimizer.Adam(
-            learning_rate=0.0001,
-            parameters=model2.parameters(),
-            multi_precision=False)
-
-        model1, opt1 = paddle.amp.decorate(
-            models=model1, optimizers=opt1, level='O2', master_weight=None)
+        opt2 = paddle.optimizer.Adam(learning_rate=0.0001,
+                                     parameters=model2.parameters(),
+                                     multi_precision=False)
+
+        model1, opt1 = paddle.amp.decorate(models=model1,
+                                           optimizers=opt1,
+                                           level='O2',
+                                           master_weight=None)
         self.assertEqual(opt1._multi_precision, True)
 
-        models, opt2 = paddle.amp.decorate(
-            models=[model1, model2],
-            optimizers=opt2,
-            level='O2',
-            master_weight=None)
+        models, opt2 = paddle.amp.decorate(models=[model1, model2],
+                                           optimizers=opt2,
+                                           level='O2',
+                                           master_weight=None)
         self.assertEqual(opt2._multi_precision, True)
 
         model3 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
-        opt3 = paddle.optimizer.Adam(
-            learning_rate=0.0001, parameters=model3.parameters())
+        opt3 = paddle.optimizer.Adam(learning_rate=0.0001,
+                                     parameters=model3.parameters())
 
         model4 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
-        opt4 = paddle.optimizer.Adam(
-            learning_rate=0.0001, parameters=model4.parameters())
-
-        model3, opts = paddle.amp.decorate(
-            models=model3,
-            optimizers=[opt3, opt4],
-            level='O2',
-            master_weight=True)
+        opt4 = paddle.optimizer.Adam(learning_rate=0.0001,
+                                     parameters=model4.parameters())
+
+        model3, opts = paddle.amp.decorate(models=model3,
+                                           optimizers=[opt3, opt4],
+                                           level='O2',
+                                           master_weight=True)
         self.assertEqual(opts[0]._multi_precision, True)
         self.assertEqual(opts[1]._multi_precision, True)
 
         models = [model3, model4]
         optimizers = [opt3, opt4]
-        models, optimizers = paddle.amp.decorate(
-            models=models,
-            optimizers=optimizers,
-            level='O2',
-            master_weight=False)
+        models, optimizers = paddle.amp.decorate(models=models,
+                                                 optimizers=optimizers,
+                                                 level='O2',
+                                                 master_weight=False)
         self.assertEqual(optimizers[0]._multi_precision, False)
         self.assertEqual(optimizers[1]._multi_precision, False)
 
@@ -694,13 +705,17 @@ def test_skip_BatchNorm_Layer_norm(self):
 
 
 class TestPureFp16SaveLoad(unittest.TestCase):
+
     def test_save_dtype_exception(self):
+
         def func():
             paddle.disable_static()
             model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
             opt = paddle.optimizer.SGD(parameters=model.parameters())
-            paddle.amp.decorate(
-                models=model, optimizers=opt, level='O2', save_dtype='int')
+            paddle.amp.decorate(models=model,
+                                optimizers=opt,
+                                level='O2',
+                                save_dtype='int')
 
         self.assertRaises(ValueError, func)
 
@@ -717,8 +732,8 @@ def train_resnet(self,
         paddle.framework.random._manual_program_seed(seed)
 
         resnet = ResNet(use_cudnn=True)
-        optimizer = optimizer_setting(
-            train_parameters, parameter_list=resnet.parameters())
+        optimizer = optimizer_setting(train_parameters,
+                                      parameter_list=resnet.parameters())
         np.random.seed(seed)
         train_reader = paddle.batch(
             paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
@@ -728,14 +743,14 @@ def train_resnet(self,
             dy_param_init_value[param.name] = param.numpy()
 
         program = None
-        scaler = paddle.amp.GradScaler(
-            enable=enable_amp, init_loss_scaling=2.**10)
+        scaler = paddle.amp.GradScaler(enable=enable_amp,
+                                       init_loss_scaling=2.**10)
 
         if use_data_loader:
-            train_reader = paddle.batch(
-                reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
-                batch_size=batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(reader_decorator(
+                paddle.dataset.flowers.train(use_xmap=False)),
+                                        batch_size=batch_size,
+                                        drop_last=True)
             train_loader = fluid.io.DataLoader.from_generator(
                 capacity=4,
                 use_double_buffer=True,
@@ -745,11 +760,10 @@ def train_resnet(self,
             train_reader = train_loader
 
         if enable_amp:
-            resnet, optimizer = paddle.amp.decorate(
-                models=resnet,
-                optimizers=optimizer,
-                level='O2',
-                save_dtype='float32')
+            resnet, optimizer = paddle.amp.decorate(models=resnet,
+                                                    optimizers=optimizer,
+                                                    level='O2',
+                                                    save_dtype='float32')
 
         for batch_id, data in enumerate(train_reader()):
             if batch_id >= batch_num:
@@ -762,8 +776,8 @@ def train_resnet(self,
                 if len(np.array([x[1]
                                  for x in data]).astype('int64')) != batch_size:
                     continue
-                y_data = np.array(
-                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(-1, 1)
 
                 img = paddle.to_tensor(dy_x_data)
                 label = paddle.to_tensor(y_data)
@@ -787,8 +801,8 @@ def train_resnet(self,
             for param in resnet.parameters():
                 if param.trainable:
                     np_array = np.array(param._grad_ivar().value().get_tensor())
-                    dy_grad_value[param.name + fluid.core.grad_var_suffix(
-                    )] = np_array
+                    dy_grad_value[param.name +
+                                  fluid.core.grad_var_suffix()] = np_array
 
             resnet.clear_gradients()
 
@@ -813,23 +827,25 @@ def train_resnet(self,
                 resnet.set_state_dict(obj_load['model'])
                 optimizer.set_state_dict(obj_load['opt'])
                 scaler.load_state_dict(obj_load['scaler'])
-                resnet, optimizer = paddle.amp.decorate(
-                    models=resnet,
-                    optimizers=optimizer,
-                    level='O2',
-                    save_dtype='float32')
+                resnet, optimizer = paddle.amp.decorate(models=resnet,
+                                                        optimizers=optimizer,
+                                                        level='O2',
+                                                        save_dtype='float32')
 
         if use_data_loader:
             train_reader._reset()
         return dy_out, dy_param_value, dy_grad_value
 
     def test_with_save_load(self):
+
         def func_isinstance():
             with fluid.dygraph.guard():
-                out_use_save_load = self.train_resnet(
-                    enable_amp=True, use_data_loader=True, use_save_load=True)
-                out_no_save_load = self.train_resnet(
-                    enable_amp=True, use_data_loader=True, use_save_load=False)
+                out_use_save_load = self.train_resnet(enable_amp=True,
+                                                      use_data_loader=True,
+                                                      use_save_load=True)
+                out_no_save_load = self.train_resnet(enable_amp=True,
+                                                     use_data_loader=True,
+                                                     use_save_load=False)
             print('save_load:', out_use_save_load[0], out_no_save_load[0])
             self.assertTrue(
                 np.allclose(out_use_save_load[0], out_no_save_load[0]))
@@ -840,6 +856,7 @@ def func_isinstance():
 
 
 class TestPureFp16InferenceSaveLoad(unittest.TestCase):
+
     def inference_save_load(self):
         BATCH_SIZE = 16
         BATCH_NUM = 4
@@ -849,6 +866,7 @@ def inference_save_load(self):
 
         # define a random dataset
         class RandomDataset(paddle.io.Dataset):
+
             def __init__(self, num_samples):
                 self.num_samples = num_samples
 
@@ -862,6 +880,7 @@ def __len__(self):
                 return self.num_samples
 
         class LinearNet(nn.Layer):
+
             def __init__(self):
                 super(LinearNet, self).__init__()
                 self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
@@ -872,11 +891,10 @@ def forward(self, x):
         def train(layer, loader, loss_fn, opt):
             for epoch_id in range(EPOCH_NUM):
                 for batch_id, (image, label) in enumerate(loader()):
-                    with paddle.amp.auto_cast(
-                            enable=True,
-                            custom_white_list=None,
-                            custom_black_list=None,
-                            level='O2'):
+                    with paddle.amp.auto_cast(enable=True,
+                                              custom_white_list=None,
+                                              custom_black_list=None,
+                                              level='O2'):
                         out = layer(image)
                         loss = loss_fn(out, label)
                     loss.backward()
@@ -885,28 +903,27 @@ def train(layer, loader, loss_fn, opt):
 
         # train
         layer = LinearNet()
-        adam = paddle.optimizer.Adam(
-            learning_rate=0.001,
-            parameters=layer.parameters(),
-            multi_precision=True)
+        adam = paddle.optimizer.Adam(learning_rate=0.001,
+                                     parameters=layer.parameters(),
+                                     multi_precision=True)
         loss_fn = nn.CrossEntropyLoss()
-        layer, adam = paddle.amp.decorate(
-            models=layer, optimizers=adam, save_dtype='float32')
+        layer, adam = paddle.amp.decorate(models=layer,
+                                          optimizers=adam,
+                                          save_dtype='float32')
         dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
-        loader = paddle.io.DataLoader(
-            dataset,
-            batch_size=BATCH_SIZE,
-            shuffle=True,
-            drop_last=True,
-            num_workers=2)
+        loader = paddle.io.DataLoader(dataset,
+                                      batch_size=BATCH_SIZE,
+                                      shuffle=True,
+                                      drop_last=True,
+                                      num_workers=2)
 
         train(layer, loader, loss_fn, adam)
 
-        # save 
+        # save
         path = "example_model/linear"
-        paddle.jit.save(
-            layer, path, input_spec=[InputSpec(
-                shape=[IMAGE_SIZE], name='x')])
+        paddle.jit.save(layer,
+                        path,
+                        input_spec=[InputSpec(shape=[IMAGE_SIZE], name='x')])
 
         # jit.load
         loaded_layer = paddle.jit.load(path)
@@ -920,8 +937,8 @@ def train(layer, loader, loss_fn, opt):
         # load_inference_model
         paddle.enable_static()
         exe = paddle.static.Executor()
-        [inference_program, feed_target_names, fetch_targets] = (
-            paddle.static.load_inference_model(path, exe))
+        [inference_program, feed_target_names,
+         fetch_targets] = (paddle.static.load_inference_model(path, exe))
         tensor_img = x
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: tensor_img},
@@ -970,15 +987,18 @@ def train_resnet(self,
             # NOTE(zhiqiu): The Membership test operations(in / not in) calls "is" and "equal",
             # see details: https://docs.python.org/3/reference/expressions.html#membership-test-operations.
             # So do not use other_params =  [p for p in resnet.parameters() if p not in conv_params]
-            optimizer = paddle.optimizer.Momentum(
-                parameters=[{
-                    'params': conv_params,
-                    'learning_rate': 0.01
-                }, {
-                    'params': other_params,
-                    'learning_rate': 0.001
-                }],
-                multi_precision=True)
+            optimizer = paddle.optimizer.Momentum(parameters=[{
+                'params':
+                conv_params,
+                'learning_rate':
+                0.01
+            }, {
+                'params':
+                other_params,
+                'learning_rate':
+                0.001
+            }],
+                                                  multi_precision=True)
         else:
             optimizer = paddle.optimizer.SGD(parameters=resnet.parameters())
 
@@ -991,14 +1011,14 @@ def train_resnet(self,
             dy_param_init_value[param.name] = param.numpy()
 
         program = None
-        scaler = paddle.amp.GradScaler(
-            enable=enable_amp, init_loss_scaling=2.**10)
+        scaler = paddle.amp.GradScaler(enable=enable_amp,
+                                       init_loss_scaling=2.**10)
 
         if use_data_loader:
-            train_reader = paddle.batch(
-                reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
-                batch_size=batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(reader_decorator(
+                paddle.dataset.flowers.train(use_xmap=False)),
+                                        batch_size=batch_size,
+                                        drop_last=True)
             train_loader = fluid.io.DataLoader.from_generator(
                 capacity=4,
                 use_double_buffer=True,
@@ -1016,13 +1036,13 @@ def train_resnet(self,
             if use_data_loader:
                 img, label = data
             else:
-                dy_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+                dy_x_data = np.array([x[0].reshape(3, 224, 224)
+                                      for x in data]).astype('float32')
                 if len(np.array([x[1]
                                  for x in data]).astype('int64')) != batch_size:
                     continue
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    -1, 1)
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(-1, 1)
 
                 img = paddle.to_tensor(dy_x_data)
                 label = paddle.to_tensor(y_data)
@@ -1047,8 +1067,8 @@ def train_resnet(self,
             for param in resnet.parameters():
                 if param.trainable:
                     np_array = np.array(param._grad_ivar().value().get_tensor())
-                    dy_grad_value[param.name + fluid.core.grad_var_suffix(
-                    )] = np_array
+                    dy_grad_value[param.name +
+                                  fluid.core.grad_var_suffix()] = np_array
 
             resnet.clear_gradients()
 
@@ -1060,6 +1080,7 @@ def train_resnet(self,
         return dy_out, dy_param_value, dy_grad_value
 
     def test_resnet(self):
+
         def func_isinstance():
             with fluid.dygraph.guard():
                 out_fp32 = self.train_resnet(enable_amp=False)
@@ -1068,51 +1089,50 @@ def func_isinstance():
             print(out_fp32[0], out_amp[0], out_pure_fp16[0])
             self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
             self.assertTrue(
-                np.allclose(
-                    out_fp32[0], out_pure_fp16[0], atol=1.e-2))
+                np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
 
         with _test_eager_guard():
             func_isinstance()
         func_isinstance()
 
     def test_with_data_loader(self):
+
         def func_isinstance():
             with fluid.dygraph.guard():
-                out_fp32 = self.train_resnet(
-                    enable_amp=False, use_data_loader=True)
-                out_amp = self.train_resnet(
-                    enable_amp=True, use_data_loader=True)
-                out_pure_fp16 = self.train_resnet(
-                    enable_amp=True, use_data_loader=True, level='O2')
+                out_fp32 = self.train_resnet(enable_amp=False,
+                                             use_data_loader=True)
+                out_amp = self.train_resnet(enable_amp=True,
+                                            use_data_loader=True)
+                out_pure_fp16 = self.train_resnet(enable_amp=True,
+                                                  use_data_loader=True,
+                                                  level='O2')
             print(out_fp32[0], out_amp[0], out_pure_fp16[0])
             self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
             self.assertTrue(
-                np.allclose(
-                    out_fp32[0], out_pure_fp16[0], atol=1.e-2))
+                np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
 
         with _test_eager_guard():
             func_isinstance()
         func_isinstance()
 
     def test_param_group(self):
+
         def func_isinstance():
             with fluid.dygraph.guard():
-                out_fp32 = self.train_resnet(
-                    enable_amp=False,
-                    use_data_loader=True,
-                    use_param_group=True)
-                out_amp = self.train_resnet(
-                    enable_amp=True, use_data_loader=True, use_param_group=True)
-                out_pure_fp16 = self.train_resnet(
-                    enable_amp=True,
-                    use_data_loader=True,
-                    use_param_group=True,
-                    level='O2')
+                out_fp32 = self.train_resnet(enable_amp=False,
+                                             use_data_loader=True,
+                                             use_param_group=True)
+                out_amp = self.train_resnet(enable_amp=True,
+                                            use_data_loader=True,
+                                            use_param_group=True)
+                out_pure_fp16 = self.train_resnet(enable_amp=True,
+                                                  use_data_loader=True,
+                                                  use_param_group=True,
+                                                  level='O2')
             print(out_fp32[0], out_amp[0], out_pure_fp16[0])
             self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
             self.assertTrue(
-                np.allclose(
-                    out_fp32[0], out_pure_fp16[0], atol=1.e-2))
+                np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
 
         with _test_eager_guard():
             func_isinstance()
@@ -1135,8 +1155,8 @@ def train_resnet(self, enable_amp=True, level='O1'):
             paddle.framework.random._manual_program_seed(seed)
 
             resnet = ResNet(use_cudnn=True)
-            optimizer = optimizer_setting(
-                train_parameters, parameter_list=resnet.parameters())
+            optimizer = optimizer_setting(train_parameters,
+                                          parameter_list=resnet.parameters())
             optimizer = paddle.optimizer.Momentum(
                 parameters=resnet.parameters(), multi_precision=True)
             np.random.seed(seed)
@@ -1149,8 +1169,8 @@ def train_resnet(self, enable_amp=True, level='O1'):
                 dy_param_init_value[param.name] = param.numpy()
 
             program = None
-            scaler = paddle.fluid.dygraph.AmpScaler(
-                enable=enable_amp, init_loss_scaling=2.**10)
+            scaler = paddle.fluid.dygraph.AmpScaler(enable=enable_amp,
+                                                    init_loss_scaling=2.**10)
 
             if enable_amp and (level == 'O2'):
                 resnet, optimizer = paddle.fluid.dygraph.amp_decorate(
@@ -1159,18 +1179,18 @@ def train_resnet(self, enable_amp=True, level='O1'):
             for batch_id, data in enumerate(train_reader()):
                 if batch_id >= batch_num:
                     break
-                dy_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+                dy_x_data = np.array([x[0].reshape(3, 224, 224)
+                                      for x in data]).astype('float32')
                 if len(np.array([x[1]
                                  for x in data]).astype('int64')) != batch_size:
                     continue
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    -1, 1)
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(-1, 1)
                 img = fluid.dygraph.to_variable(dy_x_data)
                 label = fluid.dygraph.to_variable(y_data)
                 label.stop_gradient = True
-                with paddle.fluid.dygraph.amp_guard(
-                        enable=enable_amp, level=level):
+                with paddle.fluid.dygraph.amp_guard(enable=enable_amp,
+                                                    level=level):
                     out = resnet(img)
 
                 loss = fluid.layers.cross_entropy(input=out, label=label)
@@ -1186,10 +1206,10 @@ def train_resnet(self, enable_amp=True, level='O1'):
                 dy_grad_value = {}
                 for param in resnet.parameters():
                     if param.trainable:
-                        np_array = np.array(param._grad_ivar().value()
-                                            .get_tensor())
-                        dy_grad_value[param.name + fluid.core.grad_var_suffix(
-                        )] = np_array
+                        np_array = np.array(
+                            param._grad_ivar().value().get_tensor())
+                        dy_grad_value[param.name +
+                                      fluid.core.grad_var_suffix()] = np_array
 
                 resnet.clear_gradients()
 
@@ -1200,6 +1220,7 @@ def train_resnet(self, enable_amp=True, level='O1'):
         return dy_out, dy_param_value, dy_grad_value
 
     def test_resnet(self):
+
         def func_isinstance():
             out_fp32 = self.train_resnet(enable_amp=False)
             out_amp = self.train_resnet(enable_amp=True)
@@ -1207,8 +1228,7 @@ def func_isinstance():
             print(out_fp32[0], out_amp[0], out_pure_fp16[0])
             self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2))
             self.assertTrue(
-                np.allclose(
-                    out_fp32[0], out_pure_fp16[0], atol=1.e-1))
+                np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1))
 
         with _test_eager_guard():
             func_isinstance()
@@ -1221,6 +1241,7 @@ class TestLayerNormFp16(unittest.TestCase):
     '''
 
     def test_layer_norm_fp16(self):
+
         def func_isinstance():
             if fluid.is_compiled_with_cuda():
                 with fluid.dygraph.guard(fluid.CUDAPlace(0)):
@@ -1246,13 +1267,15 @@ def train(self, enable_amp=True, amp_level='O1'):
         paddle.seed(100)
         input = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
         conv = paddle.nn.Conv2D(4, 6, (3, 3))
-        with paddle.amp.auto_cast(
-                enable=enable_amp, level=amp_level, dtype='bfloat16'):
+        with paddle.amp.auto_cast(enable=enable_amp,
+                                  level=amp_level,
+                                  dtype='bfloat16'):
             output = conv(input)
         output = output.cast('float32')
         return output.numpy()
 
     def test_bf16(self):
+
         def func_isinstance():
             if fluid.core.is_compiled_with_cuda(
             ) and fluid.core.is_bfloat16_supported(paddle.CUDAPlace(0)):
@@ -1260,11 +1283,9 @@ def func_isinstance():
                 out_bf16_O1 = self.train(enable_amp=True, amp_level='O1')
                 out_bf16_O2 = self.train(enable_amp=True, amp_level='O2')
                 self.assertTrue(
-                    np.allclose(
-                        out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1))
+                    np.allclose(out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1))
                 self.assertTrue(
-                    np.allclose(
-                        out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1))
+                    np.allclose(out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1))
 
         with _test_eager_guard():
             func_isinstance()
@@ -1272,8 +1293,11 @@ def func_isinstance():
 
 
 class TestAmpWithPyLyer(unittest.TestCase):
+
     def test_pylayer(self):
+
         class MyMM(PyLayer):
+
             @staticmethod
             def forward(ctx, a, b):
                 ctx.save_for_backward(a, b)
@@ -1298,7 +1322,9 @@ def backward(ctx, grad):
 
 
 class TestAmpWithHook(unittest.TestCase):
+
     def test_hook_change_dtype(self):
+
         def func_isinstance():
             with paddle.fluid.dygraph.guard():
                 v = paddle.rand([3, 3])
@@ -1321,6 +1347,7 @@ def foo(grad):
         func_isinstance()
 
     def test_hook_change_place(self):
+
         def func_isinstance():
             with paddle.fluid.dygraph.guard():
                 v = paddle.rand([3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index 39b79dd4ba26b..4dee7cf963348 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -19,6 +19,7 @@
 
 
 class AutoPruneLayer0(fluid.Layer):
+
     def __init__(self, input_size):
         super(AutoPruneLayer0, self).__init__()
         self.linear1 = fluid.dygraph.Linear(
@@ -41,6 +42,7 @@ def forward(self, x, y):
 
 
 class AutoPruneLayer1(fluid.Layer):
+
     def __init__(self, input_size):
         super(AutoPruneLayer1, self).__init__()
         self.linear1 = fluid.dygraph.Linear(
@@ -64,6 +66,7 @@ def forward(self, x, y):
 
 
 class AutoPruneLayer2(fluid.Layer):
+
     def __init__(self, input_size):
         super(AutoPruneLayer2, self).__init__()
         self.linear = fluid.dygraph.Linear(input_size, 10, act=None)
@@ -81,14 +84,16 @@ def forward(self, x, label):
 
 
 class AutoPruneLayer3(fluid.Layer):
+
     def __init__(self, input_size):
         super(AutoPruneLayer3, self).__init__()
         self.linear = fluid.dygraph.Linear(input_size, 20, act=None)
 
     def forward(self, x, label, test_num):
         feature = self.linear(x)
-        part1, part2 = fluid.layers.split(
-            feature, num_or_sections=[10, 10], dim=1)
+        part1, part2 = fluid.layers.split(feature,
+                                          num_or_sections=[10, 10],
+                                          dim=1)
         # Note that: part2 is not used.
         loss = fluid.layers.cross_entropy(input=part1, label=label)
         loss = fluid.layers.mean(loss)
@@ -99,6 +104,7 @@ def forward(self, x, label, test_num):
 
 
 class MyLayer(fluid.Layer):
+
     def __init__(self, input_size, vocab_size, size, dtype="float32"):
         super(MyLayer, self).__init__(dtype=dtype)
         self.embed0 = fluid.Embedding(size=(vocab_size, size))
@@ -121,6 +127,7 @@ def embed_linear0(self, x):
 
 
 class MyLayer2(fluid.Layer):
+
     def __init__(self, input_size, vocab_size, size, dtype="float32"):
         super(MyLayer2, self).__init__(dtype=dtype)
         self.embed0 = fluid.Embedding(size=(vocab_size, size))
@@ -132,8 +139,8 @@ def forward(self, indices):
         # mind the difference with MyLayer
         # In this example, the forward method involes all params
         loss = fluid.layers.reduce_mean(
-            self.linear_0(self.embed0(indices)) + self.linear_1(
-                self.embed1(indices)))
+            self.linear_0(self.embed0(indices)) +
+            self.linear_1(self.embed1(indices)))
         return loss
 
     def linear0(self, x):
@@ -146,6 +153,7 @@ def embed_linear0(self, x):
 
 
 class TestImperativeAutoPrune(unittest.TestCase):
+
     def func_auto_prune(self):
         with fluid.dygraph.guard():
             case1 = AutoPruneLayer0(input_size=5)
@@ -195,9 +203,11 @@ def func_auto_prune3(self):
             self.assertTrue((part2.gradient() == 0).all())
 
     def test_auto_prune3(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_auto_prune3()
         self.func_auto_prune3()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_auto_prune4(self):
         with fluid.dygraph.guard():
@@ -212,9 +222,11 @@ def func_auto_prune4(self):
             self.assertTrue((part2.gradient() == 1).all())
 
     def test_auto_prune4(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_auto_prune4()
         self.func_auto_prune4()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_auto_prune5(self):
         with fluid.dygraph.guard():
@@ -229,9 +241,11 @@ def func_auto_prune5(self):
             self.assertTrue((part2.gradient() == 0).all())
 
     def test_auto_prune5(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_auto_prune5()
         self.func_auto_prune5()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_auto_prune6(self):
         with fluid.dygraph.guard():
@@ -331,8 +345,8 @@ def func_auto_prune9(self):
             optimizer.minimize(out2)
             self.assertTrue(
                 np.array_equal(linear2_origin, linear2.weight.numpy()))
-            self.assertTrue(
-                np.array_equal(linear_origin, linear.weight.numpy()))
+            self.assertTrue(np.array_equal(linear_origin,
+                                           linear.weight.numpy()))
             try:
                 linear2.weight.gradient()
             except ValueError as e:
@@ -373,8 +387,8 @@ def func_auto_prune_with_optimizer(self):
         size = 20
         batch_size = 16
 
-        indices = np.random.randint(
-            low=0, high=100, size=(batch_size, 1)).astype("int64")
+        indices = np.random.randint(low=0, high=100,
+                                    size=(batch_size, 1)).astype("int64")
         embed = np.random.randn(batch_size, size).astype("float32")
 
         place = fluid.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index ebbf681f3dcef..e67bae46a53a7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -28,6 +28,7 @@
 
 
 class MyLayer(fluid.Layer):
+
     def __init__(self):
         super(MyLayer, self).__init__()
 
@@ -40,22 +41,23 @@ def forward(self, inputs):
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, input_size):
         super(MLP, self).__init__()
         self._linear1 = Linear(
             input_size,
             3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)))
         self._linear2 = Linear(
             3,
             4,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)))
 
     def forward(self, inputs):
         x = self._linear1(inputs)
@@ -65,6 +67,7 @@ def forward(self, inputs):
 
 
 class SimpleRNNCell(fluid.Layer):
+
     def __init__(self, step_input_size, hidden_size, output_size, param_attr):
         super(SimpleRNNCell, self).__init__()
         self.step_input_size = step_input_size
@@ -77,21 +80,18 @@ def __init__(self, step_input_size, hidden_size, output_size, param_attr):
         h2h_param_shape = [self.hidden_size, self.hidden_size]
         h2o_param_shape = [self.output_size, self.hidden_size]
         self._i2h_w = None
-        self._i2h_w = self.create_parameter(
-            attr=self.param_attr,
-            shape=i2h_param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-        self._h2h_w = self.create_parameter(
-            attr=self.param_attr,
-            shape=h2h_param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-        self._h2o_w = self.create_parameter(
-            attr=self.param_attr,
-            shape=h2o_param_shape,
-            dtype=self._dtype,
-            is_bias=False)
+        self._i2h_w = self.create_parameter(attr=self.param_attr,
+                                            shape=i2h_param_shape,
+                                            dtype=self._dtype,
+                                            is_bias=False)
+        self._h2h_w = self.create_parameter(attr=self.param_attr,
+                                            shape=h2h_param_shape,
+                                            dtype=self._dtype,
+                                            is_bias=False)
+        self._h2o_w = self.create_parameter(attr=self.param_attr,
+                                            shape=h2o_param_shape,
+                                            dtype=self._dtype,
+                                            is_bias=False)
 
     def forward(self, input, pre_hidden):
         tmp_i2h = paddle.fluid.layers.nn.mul(input, self._i2h_w)
@@ -105,29 +105,29 @@ def forward(self, input, pre_hidden):
 
 
 class SimpleRNN(fluid.Layer):
+
     def __init__(self):
         super(SimpleRNN, self).__init__()
         self.seq_len = 4
         self._cell = SimpleRNNCell(
-            3,
-            3,
-            3,
+            3, 3, 3,
             fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.1)))
 
     def forward(self, inputs):
         outs = list()
         pre_hiddens = list()
 
-        init_hidden = self.create_parameter(
-            attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            shape=[1, 3],
-            dtype='float32',
-            is_bias=False)
+        init_hidden = self.create_parameter(attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.1)),
+                                            shape=[1, 3],
+                                            dtype='float32',
+                                            is_bias=False)
         pre_hidden = init_hidden
         for i in range(self.seq_len):
-            input = fluid.layers.slice(
-                inputs, axes=[1], starts=[i], ends=[i + 1])
+            input = fluid.layers.slice(inputs,
+                                       axes=[1],
+                                       starts=[i],
+                                       ends=[i + 1])
             input = fluid.layers.reshape(input, shape=[1, 3])
             out_softmax, pre_hidden = self._cell(input, pre_hidden)
             outs.append(out_softmax)
@@ -136,6 +136,7 @@ def forward(self, inputs):
 
 
 class TestImperative(unittest.TestCase):
+
     def functional_dygraph_context(self):
         self.assertFalse(fluid.dygraph.enabled())
         fluid.enable_dygraph()
@@ -220,8 +221,8 @@ def func_create_varbase(self):
         t = fluid.Tensor()
         t.set(x, fluid.CPUPlace())
         if not _in_legacy_dygraph():
-            egr_tmp = fluid.core.eager.Tensor(
-                value=x, place=fluid.core.CPUPlace())
+            egr_tmp = fluid.core.eager.Tensor(value=x,
+                                              place=fluid.core.CPUPlace())
             egr_tmp2 = fluid.core.eager.Tensor(y, fluid.core.CPUPlace())
             egr_tmp3 = paddle.to_tensor(x)
             egr_tmp4 = fluid.core.eager.Tensor(y)
@@ -359,11 +360,13 @@ def func_empty_var(self):
             cur_block = cur_program.current_block()
             # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good.
             if _in_legacy_dygraph():
-                new_variable = cur_block.create_var(
-                    name="X", shape=[-1, 23, 48], dtype='float32')
+                new_variable = cur_block.create_var(name="X",
+                                                    shape=[-1, 23, 48],
+                                                    dtype='float32')
             else:
-                new_variable = cur_block.create_var(
-                    name="X", shape=[1, 23, 48], dtype='float32')
+                new_variable = cur_block.create_var(name="X",
+                                                    shape=[1, 23, 48],
+                                                    dtype='float32')
             try:
                 new_variable.numpy()
             except Exception as e:
@@ -398,11 +401,13 @@ def func_empty_grad(self):
             cur_block = cur_program.current_block()
             # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good.
             if _in_legacy_dygraph():
-                new_variable = cur_block.create_var(
-                    name="X", shape=[-1, 23, 48], dtype='float32')
+                new_variable = cur_block.create_var(name="X",
+                                                    shape=[-1, 23, 48],
+                                                    dtype='float32')
             else:
-                new_variable = cur_block.create_var(
-                    name="X", shape=[1, 23, 48], dtype='float32')
+                new_variable = cur_block.create_var(name="X",
+                                                    shape=[1, 23, 48],
+                                                    dtype='float32')
             try:
                 new_variable.gradient()
             except Exception as e:
@@ -460,8 +465,9 @@ def func_layer_in_out(self):
             dy_grad2 = l2._x_for_debug.gradient()
 
         with new_program_scope():
-            inp = fluid.layers.data(
-                name="inp", shape=[3], append_batch_size=False)
+            inp = fluid.layers.data(name="inp",
+                                    shape=[3],
+                                    append_batch_size=False)
             l = MyLayer()
             x = l(inp)[0]
             param_grads = fluid.backward.append_backward(
@@ -479,9 +485,11 @@ def func_layer_in_out(self):
         self.assertTrue(np.array_equal(dy_grad2, static_grad))
 
     def test_layer_in_out(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_layer_in_out()
         self.func_layer_in_out()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_mlp(self):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
@@ -503,8 +511,9 @@ def func_mlp(self):
             dy_grad2 = mlp2._linear1.weight.gradient()
 
         with new_program_scope():
-            inp = fluid.layers.data(
-                name="inp", shape=[2, 2], append_batch_size=False)
+            inp = fluid.layers.data(name="inp",
+                                    shape=[2, 2],
+                                    append_batch_size=False)
             mlp = MLP(input_size=2)
             out = mlp(inp)
             param_grads = fluid.backward.append_backward(
@@ -540,6 +549,7 @@ def test_mlp(self):
         self.func_mlp()
 
     def test_gradient_accumulation(self):
+
         def test_single_api(sort_sum_gradient):
             fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient})
             x = paddle.to_tensor(5., stop_gradient=False)
@@ -581,7 +591,7 @@ def fun(x, y, z):
 
             loss = fun(x, y, z)
             loss.backward(retain_graph=True)
-            # x.grad = 2*x*y + z + 2*y = 27 
+            # x.grad = 2*x*y + z + 2*y = 27
             self.assertTrue(np.array_equal(x.grad.numpy(), [27]))
 
             loss.backward(retain_graph=True)
@@ -619,14 +629,14 @@ def test_mlp(sort_sum_gradient):
                 detach_x = x.detach()
                 clear_loss = mlp2(detach_x)
                 clear_loss.backward()
-                expected_weight1_grad = (
-                    expected_weight1_grad + mlp2._linear1.weight.grad.numpy())
-                expected_bias1_grad = (
-                    expected_bias1_grad + mlp2._linear1.bias.grad.numpy())
-                expected_weight2_grad = (
-                    expected_weight2_grad + mlp2._linear2.weight.grad.numpy())
-                expected_bias2_grad = (
-                    expected_bias2_grad + mlp2._linear2.bias.grad.numpy())
+                expected_weight1_grad = (expected_weight1_grad +
+                                         mlp2._linear1.weight.grad.numpy())
+                expected_bias1_grad = (expected_bias1_grad +
+                                       mlp2._linear1.bias.grad.numpy())
+                expected_weight2_grad = (expected_weight2_grad +
+                                         mlp2._linear2.weight.grad.numpy())
+                expected_bias2_grad = (expected_bias2_grad +
+                                       mlp2._linear2.bias.grad.numpy())
 
                 loss = mlp1(x)
                 loss.backward()
@@ -687,17 +697,19 @@ def func_dygraph_vs_static(self):
 
         # static graph
         with new_program_scope():
-            inp_data1 = fluid.layers.data(
-                name='inp1', shape=[3, 3], dtype=np.float32)
-            inp_data2 = fluid.layers.data(
-                name='inp2', shape=[3, 3], dtype=np.float32)
+            inp_data1 = fluid.layers.data(name='inp1',
+                                          shape=[3, 3],
+                                          dtype=np.float32)
+            inp_data2 = fluid.layers.data(name='inp2',
+                                          shape=[3, 3],
+                                          dtype=np.float32)
 
             a = fluid.layers.expand(
-                fluid.layers.reshape(
-                    fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1])
+                fluid.layers.reshape(fluid.layers.reduce_sum(inp_data1),
+                                     [1, 1]), [4, 1])
             b = fluid.layers.expand(
-                fluid.layers.reshape(
-                    fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1])
+                fluid.layers.reshape(fluid.layers.reduce_sum(inp_data2),
+                                     [1, 1]), [4, 1])
             cond = fluid.layers.less_than(x=a, y=b)
 
             ie = fluid.layers.IfElse(cond)
@@ -717,8 +729,10 @@ def func_dygraph_vs_static(self):
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             static_result = exe.run(fluid.default_main_program(),
-                                    feed={'inp1': np_inp1,
-                                          'inp2': np_inp2},
+                                    feed={
+                                        'inp1': np_inp1,
+                                        'inp2': np_inp2
+                                    },
                                     fetch_list=out)[0]
         self.assertTrue(np.allclose(dygraph_result, static_result))
 
@@ -756,8 +770,9 @@ def func_rnn(self):
             dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient()
 
         with new_program_scope():
-            inp = fluid.layers.data(
-                name="inp", shape=[1, 4, 3], append_batch_size=False)
+            inp = fluid.layers.data(name="inp",
+                                    shape=[1, 4, 3],
+                                    append_batch_size=False)
             simple_rnn = SimpleRNN()
             outs, pre_hiddens = simple_rnn(inp)
             param_grads = fluid.backward.append_backward(outs[3])
@@ -811,6 +826,7 @@ def test_layer_attrs(self):
 
 
 class TestDygraphUtils(unittest.TestCase):
+
     def func_append_activation_in_dygraph_exception(self):
         with new_program_scope():
             np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32)
@@ -868,8 +884,9 @@ def test_append_activation_in_dygraph3(self):
 
     def func_append_activation_in_dygraph_use_mkldnn(self):
         a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32)
-        helper = LayerHelper(
-            fluid.unique_name.generate("test"), act="relu", use_mkldnn=True)
+        helper = LayerHelper(fluid.unique_name.generate("test"),
+                             act="relu",
+                             use_mkldnn=True)
         func = helper.append_activation
         with fluid.dygraph.guard():
             a = paddle.to_tensor(a_np)
@@ -929,6 +946,7 @@ def test_append_bias_in_dygraph(self):
 
 
 class TestDygraphGuardWithError(unittest.TestCase):
+
     def func_without_guard(self):
         with fluid.dygraph.guard():
             x = paddle.to_tensor(np.zeros([10, 10]))
@@ -943,6 +961,7 @@ def test_without_guard(self):
 
 
 class TestMetaclass(unittest.TestCase):
+
     def func_metaclass(self):
         self.assertEqual(type(MyLayer).__name__, 'type')
         self.assertNotEqual(type(MyLayer).__name__, 'pybind11_type')
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py b/python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py
index d624495f71df7..fd6c5f33119ad 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestLayerDict(unittest.TestCase):
+
     def func_layer_dict(self):
         layers = OrderedDict([
             ('conv1d', paddle.nn.Conv1D(3, 2, 3)),
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py
index cf7fc9ba96b9b..18e0bff411c48 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py
@@ -22,6 +22,7 @@
 
 
 class MyLayer(fluid.Layer):
+
     def __init__(self, layerlist):
         super(MyLayer, self).__init__()
         self.layerlist = layerlist
@@ -33,6 +34,7 @@ def forward(self, x):
 
 
 class TestImperativeContainer(unittest.TestCase):
+
     def fluid_dygraph_list(self):
         return fluid.dygraph.LayerList(
             [fluid.dygraph.Linear(2**i, 2**(i + 1)) for i in range(6)])
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py b/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py
index 349f18fe79985..97101c619ce6d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py
@@ -23,6 +23,7 @@
 
 
 class MyLayer(fluid.Layer):
+
     def __init__(self, num_stacked_param, use_fluid_api):
         super(MyLayer, self).__init__()
         # create ParameterList with iterable Parameters
@@ -33,13 +34,13 @@ def __init__(self, num_stacked_param, use_fluid_api):
 
     def fluid_dygraph_ParameterList(self, num_stacked_param):
         return fluid.dygraph.ParameterList(
-            [fluid.layers.create_parameter(
-                shape=[2, 2], dtype='float32')] * num_stacked_param)
+            [fluid.layers.create_parameter(shape=[2, 2], dtype='float32')] *
+            num_stacked_param)
 
     def paddle_imperative_ParameterList(self, num_stacked_param):
         return paddle.nn.ParameterList(
-            [fluid.layers.create_parameter(
-                shape=[2, 2], dtype='float32')] * num_stacked_param)
+            [fluid.layers.create_parameter(shape=[2, 2], dtype='float32')] *
+            num_stacked_param)
 
     def forward(self, x):
         for i, p in enumerate(self.params):
@@ -48,6 +49,7 @@ def forward(self, x):
 
 
 class TestImperativeContainerParameterList(unittest.TestCase):
+
     def paramter_list(self, use_fluid_api):
         data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32')
         with fluid.dygraph.guard():
@@ -65,8 +67,7 @@ def paramter_list(self, use_fluid_api):
             res = model(x)
             self.assertListEqual(res.shape, [5, 3])
             model.params.append(
-                fluid.layers.create_parameter(
-                    shape=[3, 4], dtype='float32'))
+                fluid.layers.create_parameter(shape=[3, 4], dtype='float32'))
             self.assertEqual(len(model.params), num_stacked_param + 1)
             res = model(x)
             self.assertListEqual(res.shape, [5, 4])
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py b/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py
index dcf4e8de5e441..dc0ce69644783 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py
@@ -21,12 +21,13 @@
 
 
 class TestImperativeContainerSequential(unittest.TestCase):
+
     def func_sequential(self):
         data = np.random.uniform(-1, 1, [5, 10]).astype('float32')
         with fluid.dygraph.guard():
             data = fluid.dygraph.to_variable(data)
-            model1 = fluid.dygraph.Sequential(
-                fluid.Linear(10, 1), fluid.Linear(1, 2))
+            model1 = fluid.dygraph.Sequential(fluid.Linear(10, 1),
+                                              fluid.Linear(1, 2))
             res1 = model1(data)
             self.assertListEqual(res1.shape, [5, 2])
             model1[1] = fluid.Linear(1, 3)
@@ -65,8 +66,8 @@ def func_sequential_list_params(self):
         data = np.random.uniform(-1, 1, [5, 10]).astype('float32')
         with fluid.dygraph.guard():
             data = fluid.dygraph.to_variable(data)
-            model1 = fluid.dygraph.Sequential(
-                fluid.Linear(10, 1), fluid.Linear(1, 2))
+            model1 = fluid.dygraph.Sequential(fluid.Linear(10, 1),
+                                              fluid.Linear(1, 2))
             res1 = model1(data)
             self.assertListEqual(res1.shape, [5, 2])
             model1[1] = fluid.Linear(1, 3)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
index 6f0876dcfc32e..4ef5f423d4d76 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_base.py
@@ -28,6 +28,7 @@ def get_random_images_and_labels(image_shape, label_shape):
 
 
 def sample_generator_creator(batch_size, batch_num):
+
     def __reader__():
         for _ in range(batch_num * batch_size):
             image, label = get_random_images_and_labels([784], [1])
@@ -37,6 +38,7 @@ def __reader__():
 
 
 class TestDygraphDataLoader(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 8
         self.batch_num = 4
@@ -53,12 +55,13 @@ def iter_loader_data(self, loader):
 
     def func_test_single_process_loader(self):
         with fluid.dygraph.guard():
-            loader = fluid.io.DataLoader.from_generator(
-                capacity=self.capacity, iterable=False, use_multiprocess=False)
-            loader.set_sample_generator(
-                sample_generator_creator(self.batch_size, self.batch_num),
-                batch_size=self.batch_size,
-                places=fluid.CPUPlace())
+            loader = fluid.io.DataLoader.from_generator(capacity=self.capacity,
+                                                        iterable=False,
+                                                        use_multiprocess=False)
+            loader.set_sample_generator(sample_generator_creator(
+                self.batch_size, self.batch_num),
+                                        batch_size=self.batch_size,
+                                        places=fluid.CPUPlace())
             self.iter_loader_data(loader)
 
     def test_single_process_loader(self):
@@ -68,12 +71,12 @@ def test_single_process_loader(self):
 
     def func_test_multi_process_loader(self):
         with fluid.dygraph.guard():
-            loader = fluid.io.DataLoader.from_generator(
-                capacity=self.capacity, use_multiprocess=True)
-            loader.set_sample_generator(
-                sample_generator_creator(self.batch_size, self.batch_num),
-                batch_size=self.batch_size,
-                places=fluid.CPUPlace())
+            loader = fluid.io.DataLoader.from_generator(capacity=self.capacity,
+                                                        use_multiprocess=True)
+            loader.set_sample_generator(sample_generator_creator(
+                self.batch_size, self.batch_num),
+                                        batch_size=self.batch_size,
+                                        places=fluid.CPUPlace())
             self.iter_loader_data(loader)
 
     def test_multi_process_loader(self):
@@ -84,9 +87,9 @@ def test_multi_process_loader(self):
     def func_test_generator_no_places(self):
         with fluid.dygraph.guard():
             loader = fluid.io.DataLoader.from_generator(capacity=self.capacity)
-            loader.set_sample_generator(
-                sample_generator_creator(self.batch_size, self.batch_num),
-                batch_size=self.batch_size)
+            loader.set_sample_generator(sample_generator_creator(
+                self.batch_size, self.batch_num),
+                                        batch_size=self.batch_size)
             self.iter_loader_data(loader)
 
     def test_generator_no_places(self):
@@ -97,12 +100,13 @@ def test_generator_no_places(self):
     def func_test_set_pin_memory(self):
         with fluid.dygraph.guard():
             use_pinned_memory(False)
-            loader = fluid.io.DataLoader.from_generator(
-                capacity=self.capacity, iterable=False, use_multiprocess=False)
-            loader.set_sample_generator(
-                sample_generator_creator(self.batch_size, self.batch_num),
-                batch_size=self.batch_size,
-                places=fluid.CPUPlace())
+            loader = fluid.io.DataLoader.from_generator(capacity=self.capacity,
+                                                        iterable=False,
+                                                        use_multiprocess=False)
+            loader.set_sample_generator(sample_generator_creator(
+                self.batch_size, self.batch_num),
+                                        batch_size=self.batch_size,
+                                        places=fluid.CPUPlace())
             self.iter_loader_data(loader)
             use_pinned_memory(True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py
index 4ab58919fdb6e..034d38c4e8ae3 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exception.py
@@ -29,6 +29,7 @@ def get_random_images_and_labels(image_shape, label_shape):
 
 
 class TestDygraphDataLoaderWithException(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 8
         self.batch_num = 4
@@ -47,7 +48,9 @@ def test_not_capacity(self):
         self.func_test_not_capacity()
 
     def func_test_single_process_with_thread_expection(self):
+
         def error_sample_genarator(batch_num):
+
             def __reader__():
                 for _ in range(batch_num):
                     yield [[[1, 2], [1]]]
@@ -55,10 +58,11 @@ def __reader__():
             return __reader__
 
         with fluid.dygraph.guard():
-            loader = fluid.io.DataLoader.from_generator(
-                capacity=self.capacity, iterable=False, use_multiprocess=False)
-            loader.set_batch_generator(
-                error_sample_genarator(self.batch_num), places=fluid.CPUPlace())
+            loader = fluid.io.DataLoader.from_generator(capacity=self.capacity,
+                                                        iterable=False,
+                                                        use_multiprocess=False)
+            loader.set_batch_generator(error_sample_genarator(self.batch_num),
+                                       places=fluid.CPUPlace())
             exception = None
             try:
                 for _ in loader():
@@ -75,7 +79,9 @@ def test_single_process_with_thread_expection(self):
         self.func_test_single_process_with_thread_expection()
 
     def func_test_multi_process_with_process_expection(self):
+
         def error_sample_genarator(batch_num):
+
             def __reader__():
                 for _ in range(batch_num):
                     yield [[[1, 2], [1]]]
@@ -83,10 +89,10 @@ def __reader__():
             return __reader__
 
         with fluid.dygraph.guard():
-            loader = fluid.io.DataLoader.from_generator(
-                capacity=self.capacity, use_multiprocess=True)
-            loader.set_batch_generator(
-                error_sample_genarator(self.batch_num), places=fluid.CPUPlace())
+            loader = fluid.io.DataLoader.from_generator(capacity=self.capacity,
+                                                        use_multiprocess=True)
+            loader.set_batch_generator(error_sample_genarator(self.batch_num),
+                                       places=fluid.CPUPlace())
             exception = None
             try:
                 for _ in loader():
@@ -101,7 +107,9 @@ def test_multi_process_with_process_expection(self):
         self.func_test_multi_process_with_process_expection()
 
     def func_test_multi_process_with_get_timeout(self):
+
         def slow_batch_generator_creator(batch_size, batch_num):
+
             def __reader__():
                 for _ in range(batch_num):
                     time.sleep(80)
@@ -112,11 +120,11 @@ def __reader__():
             return __reader__
 
         with fluid.dygraph.guard():
-            loader = fluid.io.DataLoader.from_generator(
-                capacity=self.capacity, use_multiprocess=True)
-            loader.set_batch_generator(
-                slow_batch_generator_creator(self.batch_size, self.batch_num),
-                places=fluid.CPUPlace())
+            loader = fluid.io.DataLoader.from_generator(capacity=self.capacity,
+                                                        use_multiprocess=True)
+            loader.set_batch_generator(slow_batch_generator_creator(
+                self.batch_size, self.batch_num),
+                                       places=fluid.CPUPlace())
             exception = None
             try:
                 for _ in range(self.epoch_num):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py
index e83d6210f8499..021637ec3a60b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_exit_func.py
@@ -33,6 +33,7 @@
 
 
 class TestDygraphDataLoaderCleanUpFunc(unittest.TestCase):
+
     def setUp(self):
         self.capacity = 10
 
@@ -70,8 +71,8 @@ def test_not_callable_func(self):
         self.func_test_not_callable_func()
 
     def func_test_old_handler_for_sigint(self):
-        CleanupFuncRegistrar.register(
-            function=self.none_func, signals=[signal.SIGINT])
+        CleanupFuncRegistrar.register(function=self.none_func,
+                                      signals=[signal.SIGINT])
 
     def test_old_handler_for_sigint(self):
         with _test_eager_guard():
@@ -83,8 +84,8 @@ def func_test_signal_wrapper_by_sigchld(self):
         def __test_process__():
             pass
 
-        CleanupFuncRegistrar.register(
-            function=self.none_func, signals=[signal.SIGCHLD])
+        CleanupFuncRegistrar.register(function=self.none_func,
+                                      signals=[signal.SIGCHLD])
 
         exception = None
         try:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py
index 0ef2e19c44b19..bb7bb89d781b8 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_fds_clear.py
@@ -28,6 +28,7 @@ def get_random_images_and_labels(image_shape, label_shape):
 
 
 def batch_generator_creator(batch_size, batch_num):
+
     def __reader__():
         for _ in range(batch_num):
             batch_image, batch_label = get_random_images_and_labels(
@@ -38,6 +39,7 @@ def __reader__():
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -52,6 +54,7 @@ def __len__(self):
 
 
 class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 8
         self.batch_num = 100
@@ -59,11 +62,11 @@ def setUp(self):
         self.capacity = 50
 
     def prepare_data_loader(self):
-        loader = fluid.io.DataLoader.from_generator(
-            capacity=self.capacity, use_multiprocess=True)
-        loader.set_batch_generator(
-            batch_generator_creator(self.batch_size, self.batch_num),
-            places=fluid.CPUPlace())
+        loader = fluid.io.DataLoader.from_generator(capacity=self.capacity,
+                                                    use_multiprocess=True)
+        loader.set_batch_generator(batch_generator_creator(
+            self.batch_size, self.batch_num),
+                                   places=fluid.CPUPlace())
         return loader
 
     def run_one_epoch_with_break(self, loader):
@@ -101,16 +104,16 @@ def test_data_loader_continue_break(self):
 
 
 class TestMultiProcessDataLoaderMmapFdsClear(TestDygraphDataLoaderMmapFdsClear):
+
     def prepare_data_loader(self):
         place = fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             dataset = RandomDataset(self.batch_size * self.batch_num)
-            loader = DataLoader(
-                dataset,
-                places=place,
-                batch_size=self.batch_size,
-                drop_last=True,
-                num_workers=2)
+            loader = DataLoader(dataset,
+                                places=place,
+                                batch_size=self.batch_size,
+                                drop_last=True,
+                                num_workers=2)
             return loader
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
index 0eb5aa55eb38e..c860e5ed09754 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
@@ -34,6 +34,7 @@ def get_random_images_and_labels(image_shape, label_shape):
 
 
 def batch_generator_creator(batch_size, batch_num):
+
     def __reader__():
         for _ in range(batch_num):
             batch_image, batch_label = get_random_images_and_labels(
@@ -46,6 +47,7 @@ def __reader__():
 # NOTE: coverage CI can't cover child process code, so need these test.
 # Here test child process loop function in main process
 class TestDygraphDataLoaderProcess(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 8
         self.batch_num = 4
@@ -64,9 +66,9 @@ def __clear_process__(util_queue):
         with fluid.dygraph.guard():
             loader = fluid.io.DataLoader.from_generator(
                 capacity=self.batch_num + 1, use_multiprocess=True)
-            loader.set_batch_generator(
-                batch_generator_creator(self.batch_size, self.batch_num),
-                places=fluid.CPUPlace())
+            loader.set_batch_generator(batch_generator_creator(
+                self.batch_size, self.batch_num),
+                                       places=fluid.CPUPlace())
             loader._data_queue = queue.Queue(self.batch_num + 1)
             _reader_process_loop(loader._batch_reader, loader._data_queue)
             # For clean memory mapped files
@@ -76,8 +78,8 @@ def __clear_process__(util_queue):
                 util_queue.put(data)
 
             # Clean up memory mapped files
-            clear_process = multiprocessing.Process(
-                target=__clear_process__, args=(util_queue, ))
+            clear_process = multiprocessing.Process(target=__clear_process__,
+                                                    args=(util_queue, ))
             clear_process.start()
 
     def test_reader_process_loop(self):
@@ -86,7 +88,9 @@ def test_reader_process_loop(self):
         self.func_test_reader_process_loop()
 
     def func_test_reader_process_loop_simple_none(self):
+
         def none_sample_genarator(batch_num):
+
             def __reader__():
                 for _ in range(batch_num):
                     yield None
@@ -96,8 +100,8 @@ def __reader__():
         with fluid.dygraph.guard():
             loader = fluid.io.DataLoader.from_generator(
                 capacity=self.batch_num + 1, use_multiprocess=True)
-            loader.set_batch_generator(
-                none_sample_genarator(self.batch_num), places=fluid.CPUPlace())
+            loader.set_batch_generator(none_sample_genarator(self.batch_num),
+                                       places=fluid.CPUPlace())
             loader._data_queue = queue.Queue(self.batch_num + 1)
             exception = None
             try:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
index d645a0a5ceb60..8e9c3c280f466 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py
@@ -28,6 +28,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -41,6 +42,7 @@ def forward(self, inputs):
 
 
 class TestDataParallelStateDict(unittest.TestCase):
+
     def test_data_parallel_state_dict(self):
         with fluid.dygraph.guard():
             strategy = dygraph.parallel.prepare_context()
@@ -51,8 +53,8 @@ def test_data_parallel_state_dict(self):
             parallel_state = parallel_mlp.state_dict()
 
             base_para = {}
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
             for k, v in single_state.items():
                 self.assertTrue(k in parallel_state)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
index 38c7de24b7734..f783b18b2b2e9 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py
@@ -23,6 +23,7 @@
 
 
 class TestTracerMode(unittest.TestCase):
+
     def setUp(self):
         self.init_mode = True
 
@@ -59,8 +60,8 @@ def need_no_grad_func(a, b=1):
 
             decorated_func = fluid.dygraph.no_grad(need_no_grad_func)
             self.assertTrue(
-                str(inspect.getfullargspec(decorated_func)) ==
-                str(inspect.getfullargspec(need_no_grad_func)))
+                str(inspect.getfullargspec(decorated_func)) == str(
+                    inspect.getfullargspec(need_no_grad_func)))
 
             self.assertEqual(self.tracer._train_mode, self.init_mode)
 
@@ -78,11 +79,13 @@ def test_main(self):
 
 
 class TestTracerMode2(TestTracerMode):
+
     def setUp(self):
         self.init_mode = False
 
 
 class TestNoGradClass(unittest.TestCase):
+
     @paddle.no_grad()
     def no_grad_func(self, a):
         self.assertEqual(self.tracer._train_mode, True)
@@ -102,9 +105,8 @@ def need_no_grad_func(a, b=1):
             return a + b
 
         decorated_func = paddle.no_grad()(need_no_grad_func)
-        self.assertEqual(
-            str(inspect.getfullargspec(decorated_func)),
-            str(inspect.getfullargspec(need_no_grad_func)))
+        self.assertEqual(str(inspect.getfullargspec(decorated_func)),
+                         str(inspect.getfullargspec(need_no_grad_func)))
 
         def test_gen():
             for i in range(3):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index 3e222e3c658ec..822a0fcc449dd 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -35,6 +35,7 @@
 
 
 class DMF(fluid.Layer):
+
     def __init__(self):
         super(DMF, self).__init__()
         self._user_latent = Linear(1000, 256)
@@ -47,17 +48,15 @@ def __init__(self):
             self._user_layers.append(
                 self.add_sublayer(
                     'user_layer_%d' % i,
-                    Linear(
-                        256 if i == 0 else self._hid_sizes[i - 1],
-                        self._hid_sizes[i],
-                        act='relu')))
+                    Linear(256 if i == 0 else self._hid_sizes[i - 1],
+                           self._hid_sizes[i],
+                           act='relu')))
             self._item_layers.append(
                 self.add_sublayer(
                     'item_layer_%d' % i,
-                    Linear(
-                        256 if i == 0 else self._hid_sizes[i - 1],
-                        self._hid_sizes[i],
-                        act='relu')))
+                    Linear(256 if i == 0 else self._hid_sizes[i - 1],
+                           self._hid_sizes[i],
+                           act='relu')))
 
     def forward(self, users, items):
         users = self._user_latent(users)
@@ -70,6 +69,7 @@ def forward(self, users, items):
 
 
 class MLP(fluid.Layer):
+
     def __init__(self):
         super(MLP, self).__init__()
         self._user_latent = Linear(1000, 256)
@@ -80,22 +80,22 @@ def __init__(self):
             self._match_layers.append(
                 self.add_sublayer(
                     'match_layer_%d' % i,
-                    Linear(
-                        256 * 2 if i == 0 else self._hid_sizes[i - 1],
-                        self._hid_sizes[i],
-                        act='relu')))
+                    Linear(256 * 2 if i == 0 else self._hid_sizes[i - 1],
+                           self._hid_sizes[i],
+                           act='relu')))
 
     def forward(self, users, items):
         users = self._user_latent(users)
         items = self._item_latent(items)
-        match_vec = fluid.layers.concat(
-            [users, items], axis=len(users.shape) - 1)
+        match_vec = fluid.layers.concat([users, items],
+                                        axis=len(users.shape) - 1)
         for l in self._match_layers:
             match_vec = l(match_vec)
         return match_vec
 
 
 class DeepCF(fluid.Layer):
+
     def __init__(self, num_users, num_items, matrix):
         super(DeepCF, self).__init__()
         self._num_users = num_users
@@ -123,9 +123,8 @@ def forward(self, users, items):
 
         mlp_predictive = self._mlp(users_emb, items_emb)
         dmf_predictive = self._dmf(users_emb, items_emb)
-        predictive = fluid.layers.concat(
-            [mlp_predictive, dmf_predictive],
-            axis=len(mlp_predictive.shape) - 1)
+        predictive = fluid.layers.concat([mlp_predictive, dmf_predictive],
+                                         axis=len(mlp_predictive.shape) - 1)
         prediction = self._match_fc(predictive)
         return prediction
 
@@ -199,6 +198,7 @@ def load_data(DATA_PATH):
 
 
 class TestDygraphDeepCF(unittest.TestCase):
+
     def test_deefcf(self):
         seed = 90
         if DATA_PATH:
@@ -259,9 +259,9 @@ def test_deefcf(self):
                         to_variable(users_np[slice:slice + BATCH_SIZE]),
                         to_variable(items_np[slice:slice + BATCH_SIZE]))
                     loss = fluid.layers.reduce_sum(
-                        fluid.layers.log_loss(prediction,
-                                              to_variable(labels_np[
-                                                  slice:slice + BATCH_SIZE])))
+                        fluid.layers.log_loss(
+                            prediction,
+                            to_variable(labels_np[slice:slice + BATCH_SIZE])))
                     loss.backward()
                     adam.minimize(loss)
                     deepcf.clear_gradients()
@@ -285,9 +285,9 @@ def test_deefcf(self):
                         to_variable(users_np[slice:slice + BATCH_SIZE]),
                         to_variable(items_np[slice:slice + BATCH_SIZE]))
                     loss2 = fluid.layers.reduce_sum(
-                        fluid.layers.log_loss(prediction2,
-                                              to_variable(labels_np[
-                                                  slice:slice + BATCH_SIZE])))
+                        fluid.layers.log_loss(
+                            prediction2,
+                            to_variable(labels_np[slice:slice + BATCH_SIZE])))
                     loss2.backward()
                     adam2.minimize(loss2)
                     deepcf2.clear_gradients()
@@ -315,10 +315,10 @@ def test_deefcf(self):
                             to_variable(users_np[slice:slice + BATCH_SIZE]),
                             to_variable(items_np[slice:slice + BATCH_SIZE]))
                         loss = fluid.layers.reduce_sum(
-                            fluid.layers.log_loss(prediction,
-                                                  to_variable(
-                                                      labels_np[slice:slice +
-                                                                BATCH_SIZE])))
+                            fluid.layers.log_loss(
+                                prediction,
+                                to_variable(labels_np[slice:slice +
+                                                      BATCH_SIZE])))
                         loss.backward()
                         adam.minimize(loss)
                         deepcf.clear_gradients()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
index 00b192b2a057b..5e9374bac0551 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -25,6 +25,7 @@
 
 
 def _dygraph_guard_(func):
+
     def __impl__(*args, **kwargs):
         if fluid._non_static_mode():
             return func(*args, **kwargs)
@@ -44,6 +45,7 @@ def random_var(size, low=-1, high=1, dtype='float32'):
 
 
 class TestEagerGrad(TestCase):
+
     def func_simple_example_eager_grad(self):
         np.random.seed(2021)
         paddle.set_device('cpu')
@@ -166,6 +168,7 @@ def test_simple_example_eager_grad_duplicate_output(self):
 
 
 class TestDygraphDoubleGrad(TestCase):
+
     def setUp(self):
         self.sort_sum_gradient = False
         self.shape = [5, 10]
@@ -179,14 +182,13 @@ def grad(self,
              create_graph=False,
              allow_unused=False):
         fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient})
-        return fluid.dygraph.grad(
-            outputs=outputs,
-            inputs=inputs,
-            grad_outputs=grad_outputs,
-            no_grad_vars=no_grad_vars,
-            retain_graph=retain_graph,
-            create_graph=create_graph,
-            allow_unused=allow_unused)
+        return fluid.dygraph.grad(outputs=outputs,
+                                  inputs=inputs,
+                                  grad_outputs=grad_outputs,
+                                  no_grad_vars=no_grad_vars,
+                                  retain_graph=retain_graph,
+                                  create_graph=create_graph,
+                                  allow_unused=allow_unused)
 
     @dygraph_guard
     def func_exception(self):
@@ -212,8 +214,8 @@ def func_exception(self):
                       [random_var(shape)], [random_var(shape)])
 
         with self.assertRaises(AssertionError):
-            self.grad(
-                [random_var(shape)], [random_var(shape)], no_grad_vars=[1])
+            self.grad([random_var(shape)], [random_var(shape)],
+                      no_grad_vars=[1])
 
         with self.assertRaises(AssertionError):
             self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
@@ -230,24 +232,27 @@ def func_simple_example(self):
         y = x + 1
 
         for create_graph in [False, True]:
-            dx, = self.grad(
-                [x], [x], create_graph=create_graph, retain_graph=True)
+            dx, = self.grad([x], [x],
+                            create_graph=create_graph,
+                            retain_graph=True)
             self.assertEqual(dx.shape, x.shape)
             self.assertTrue(np.all(dx.numpy() == 1))
             self.assertNotEqual(dx.stop_gradient, create_graph)
 
-            dx_mul_2, = self.grad(
-                [y, x], [x], create_graph=create_graph, retain_graph=True)
+            dx_mul_2, = self.grad([y, x], [x],
+                                  create_graph=create_graph,
+                                  retain_graph=True)
             self.assertEqual(dx_mul_2.shape, x.shape)
             self.assertTrue(np.all(dx_mul_2.numpy() == 2))
             self.assertNotEqual(dx_mul_2.stop_gradient, create_graph)
 
-            none_grad, = self.grad(
-                [x], [y], create_graph=create_graph, allow_unused=True)
+            none_grad, = self.grad([x], [y],
+                                   create_graph=create_graph,
+                                   allow_unused=True)
             self.assertTrue(none_grad is None)
 
-            grad_with_none_and_not_none, = self.grad(
-                [x, y], [y], create_graph=create_graph)
+            grad_with_none_and_not_none, = self.grad([x, y], [y],
+                                                     create_graph=create_graph)
             self.assertTrue(grad_with_none_and_not_none.shape, x.shape)
             self.assertTrue(np.all(grad_with_none_and_not_none.numpy() == 1))
             self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
@@ -273,8 +278,9 @@ def func_example_no_grad_vars(self):
         w_mean = fluid.layers.reduce_mean(w)
         del y1, z, w
 
-        dx_actual, = self.grad(
-            [w_mean], [x], create_graph=True, no_grad_vars=[y2])
+        dx_actual, = self.grad([w_mean], [x],
+                               create_graph=True,
+                               no_grad_vars=[y2])
 
         self.assertFalse(y2.stop_gradient)
         self.assertFalse(dx_actual.stop_gradient)
@@ -297,10 +303,11 @@ def func_none_one_initial_gradient(self):
 
         half_numel = int(numel / 2)
         half_x_positive = np.random.uniform(low=1, high=2, size=[half_numel])
-        half_x_negative = np.random.uniform(
-            low=-2, high=-1, size=[numel - half_numel])
-        x_np = np.array(list(half_x_positive) + list(half_x_negative)).astype(
-            'float32')
+        half_x_negative = np.random.uniform(low=-2,
+                                            high=-1,
+                                            size=[numel - half_numel])
+        x_np = np.array(list(half_x_positive) +
+                        list(half_x_negative)).astype('float32')
         np.random.shuffle(x_np)
 
         x = fluid.dygraph.to_variable(x_np)
@@ -329,12 +336,11 @@ def func_none_one_initial_gradient(self):
         for grad_y in [random_grad_y]:
             for grad_z in [random_grad_z]:
                 for create_graph in [False, True]:
-                    dx_actual, = self.grad(
-                        outputs=[y, z],
-                        inputs=[x],
-                        grad_outputs=[grad_y, grad_z],
-                        create_graph=create_graph,
-                        retain_graph=True)
+                    dx_actual, = self.grad(outputs=[y, z],
+                                           inputs=[x],
+                                           grad_outputs=[grad_y, grad_z],
+                                           create_graph=create_graph,
+                                           retain_graph=True)
 
                     grad_y_np = ones_grad_y if grad_y is None else grad_y.numpy(
                     )
@@ -397,9 +403,10 @@ def func_example_with_gradient_accumulation_and_create_graph(self):
         for i in range(5):
             loss.backward(retain_graph=True)
             x_grad_actual = x.gradient()
-            x_grad_expected = (i + 2) * (2.0 / float(numel) * (
-                x_np + dx_expected *
-                (x_np > 0) * 2 / float(numel))).astype('float32')
+            x_grad_expected = (
+                i + 2) * (2.0 / float(numel) *
+                          (x_np + dx_expected *
+                           (x_np > 0) * 2 / float(numel))).astype('float32')
             self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
 
     def test_example_with_gradient_accumulation_and_create_graph(self):
@@ -422,11 +429,10 @@ def func_example_with_gradient_accumulation_and_no_grad_vars(self):
         w_mean = fluid.layers.reduce_mean(w)
         del y1, z, w
 
-        dx_actual, = self.grad(
-            [w_mean], [x],
-            retain_graph=True,
-            create_graph=True,
-            no_grad_vars=[y2])
+        dx_actual, = self.grad([w_mean], [x],
+                               retain_graph=True,
+                               create_graph=True,
+                               no_grad_vars=[y2])
 
         self.assertFalse(y2.stop_gradient)
         self.assertFalse(dx_actual.stop_gradient)
@@ -487,12 +493,14 @@ def test_example_with_gradient_accumulation_and_not_create_graph(self):
 
 
 class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
+
     def setUp(self):
         self.sort_sum_gradient = True
         self.shape = [5, 10]
 
 
 class TestDygraphDoubleGradVisitedUniq(TestCase):
+
     def func_compare(self):
         value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2,
                                                           5).astype("float32")
@@ -516,12 +524,11 @@ def model_f(input):
 
             out = model_f(a)
 
-            dx = fluid.dygraph.grad(
-                outputs=[out],
-                inputs=[a],
-                create_graph=False,
-                only_inputs=True,
-                allow_unused=False)
+            dx = fluid.dygraph.grad(outputs=[out],
+                                    inputs=[a],
+                                    create_graph=False,
+                                    only_inputs=True,
+                                    allow_unused=False)
 
             grad_1 = dx[0].numpy()
 
@@ -545,15 +552,17 @@ def test_compare(self):
 
 
 class TestRaiseNoDoubleGradOp(TestCase):
+
     def raise_no_grad_op(self):
         with fluid.dygraph.guard():
             x = fluid.layers.ones(shape=[2, 3, 2, 2], dtype='float32')
             x.stop_gradient = False
             y = paddle.fluid.layers.group_norm(x, groups=1)
 
-            dx = fluid.dygraph.grad(
-                outputs=[y], inputs=[x], create_graph=True,
-                retain_graph=True)[0]
+            dx = fluid.dygraph.grad(outputs=[y],
+                                    inputs=[x],
+                                    create_graph=True,
+                                    retain_graph=True)[0]
 
             loss = fluid.layers.reduce_mean(dx)
             loss.backward()
@@ -563,6 +572,7 @@ def test_raise(self):
 
 
 class TestDoubleGradResNet(TestCase):
+
     def setUp(self):
         paddle.seed(123)
         paddle.framework.random._manual_program_seed(123)
@@ -589,8 +599,8 @@ def test_resnet_resnet50(self):
         data.stop_gradient = False
         out = model(data)
         preds = paddle.argmax(out, axis=1)
-        label_onehot = paddle.nn.functional.one_hot(
-            paddle.to_tensor(preds), num_classes=out.shape[1])
+        label_onehot = paddle.nn.functional.one_hot(paddle.to_tensor(preds),
+                                                    num_classes=out.shape[1])
         target = paddle.sum(out * label_onehot, axis=1)
 
         g = paddle.grad(outputs=target, inputs=out)[0]
@@ -621,8 +631,8 @@ def test_resnet_resnet101(self):
         data.stop_gradient = False
         out = model(data)
         preds = paddle.argmax(out, axis=1)
-        label_onehot = paddle.nn.functional.one_hot(
-            paddle.to_tensor(preds), num_classes=out.shape[1])
+        label_onehot = paddle.nn.functional.one_hot(paddle.to_tensor(preds),
+                                                    num_classes=out.shape[1])
         target = paddle.sum(out * label_onehot, axis=1)
 
         g = paddle.grad(outputs=target, inputs=out)[0]
@@ -634,19 +644,24 @@ def test_resnet_resnet101(self):
 
 
 class TestDoubleGradBasics(TestCase):
+
     def test_matmul(self):
         input_numpy = np.ones([3, 3]) * 2
         with _test_eager_guard():
-            x = paddle.to_tensor(
-                input_numpy, stop_gradient=False, dtype='float32')
-            y = paddle.to_tensor(
-                input_numpy, stop_gradient=False, dtype='float32')
-            grad_out = paddle.to_tensor(
-                np.ones([3, 3]), stop_gradient=False, dtype='float32')
+            x = paddle.to_tensor(input_numpy,
+                                 stop_gradient=False,
+                                 dtype='float32')
+            y = paddle.to_tensor(input_numpy,
+                                 stop_gradient=False,
+                                 dtype='float32')
+            grad_out = paddle.to_tensor(np.ones([3, 3]),
+                                        stop_gradient=False,
+                                        dtype='float32')
 
             out = paddle.matmul(x, y, False, False)
-            new_x_g, new_y_g = paddle.grad(
-                [out], [x, y], [grad_out], retain_graph=True, create_graph=True)
+            new_x_g, new_y_g = paddle.grad([out], [x, y], [grad_out],
+                                           retain_graph=True,
+                                           create_graph=True)
             new_x_g.backward()
 
             out_ref = np.ones([3, 3]) * 12.0
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_framework.py b/python/paddle/fluid/tests/unittests/test_imperative_framework.py
index 2d900d65976e7..5e1c59dba578b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_framework.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_framework.py
@@ -22,22 +22,23 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, input_size):
         super(MLP, self).__init__()
         self._linear1 = fluid.dygraph.Linear(
             input_size,
             3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)))
         self._linear2 = fluid.dygraph.Linear(
             3,
             4,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.1)))
 
     def forward(self, inputs):
         x = self._linear1(inputs)
@@ -47,11 +48,14 @@ def forward(self, inputs):
 
 
 class TestDygraphFramework(unittest.TestCase):
+
     def func_test_dygraph_backward(self):
         with new_program_scope():
             mlp = MLP(input_size=2)
-            var_inp = fluid.layers.data(
-                "input", shape=[2, 2], dtype="float32", append_batch_size=False)
+            var_inp = fluid.layers.data("input",
+                                        shape=[2, 2],
+                                        dtype="float32",
+                                        append_batch_size=False)
             out = mlp(var_inp)
             try:
                 out.backward()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 39b7f941c4bba..e724421d1db77 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -29,6 +29,7 @@
 
 
 class Discriminator(fluid.Layer):
+
     def __init__(self):
         super(Discriminator, self).__init__()
         self._fc1 = Linear(1, 32, act='elu')
@@ -41,6 +42,7 @@ def forward(self, inputs):
 
 
 class Generator(fluid.Layer):
+
     def __init__(self):
         super(Generator, self).__init__()
         self._fc1 = Linear(2, 64, act='elu')
@@ -55,6 +57,7 @@ def forward(self, inputs):
 
 
 class TestDygraphGAN(unittest.TestCase):
+
     def func_test_gan_float32(self):
         seed = 90
         paddle.seed(1)
@@ -64,29 +67,34 @@ def func_test_gan_float32(self):
         generate_p = fluid.Program()
 
         scope = fluid.core.Scope()
-        with new_program_scope(
-                main=discriminate_p, startup=startup, scope=scope):
+        with new_program_scope(main=discriminate_p,
+                               startup=startup,
+                               scope=scope):
             discriminator = Discriminator()
             generator = Generator()
 
-            img = fluid.layers.data(
-                name="img", shape=[2, 1], append_batch_size=False)
-            noise = fluid.layers.data(
-                name="noise", shape=[2, 2], append_batch_size=False)
+            img = fluid.layers.data(name="img",
+                                    shape=[2, 1],
+                                    append_batch_size=False)
+            noise = fluid.layers.data(name="noise",
+                                      shape=[2, 2],
+                                      append_batch_size=False)
 
             d_real = discriminator(img)
             d_loss_real = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_real,
-                    label=fluid.layers.fill_constant(
-                        shape=[2, 1], dtype='float32', value=1.0)))
+                    label=fluid.layers.fill_constant(shape=[2, 1],
+                                                     dtype='float32',
+                                                     value=1.0)))
 
             d_fake = discriminator(generator(noise))
             d_loss_fake = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_fake,
-                    label=fluid.layers.fill_constant(
-                        shape=[2, 1], dtype='float32', value=0.0)))
+                    label=fluid.layers.fill_constant(shape=[2, 1],
+                                                     dtype='float32',
+                                                     value=0.0)))
 
             d_loss = d_loss_real + d_loss_fake
 
@@ -97,29 +105,33 @@ def func_test_gan_float32(self):
             discriminator = Discriminator()
             generator = Generator()
 
-            noise = fluid.layers.data(
-                name="noise", shape=[2, 2], append_batch_size=False)
+            noise = fluid.layers.data(name="noise",
+                                      shape=[2, 2],
+                                      append_batch_size=False)
 
             d_fake = discriminator(generator(noise))
             g_loss = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_fake,
-                    label=fluid.layers.fill_constant(
-                        shape=[2, 1], dtype='float32', value=1.0)))
+                    label=fluid.layers.fill_constant(shape=[2, 1],
+                                                     dtype='float32',
+                                                     value=1.0)))
 
             sgd = SGDOptimizer(learning_rate=1e-3)
             sgd.minimize(g_loss)
 
-        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0))
+        exe = fluid.Executor(fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
         static_params = dict()
         with fluid.scope_guard(scope):
             img = np.ones([2, 1], np.float32)
             noise = np.ones([2, 2], np.float32)
             exe.run(startup)
             static_d_loss = exe.run(discriminate_p,
-                                    feed={'img': img,
-                                          'noise': noise},
+                                    feed={
+                                        'img': img,
+                                        'noise': noise
+                                    },
                                     fetch_list=[d_loss])[0]
             static_g_loss = exe.run(generate_p,
                                     feed={'noise': noise},
@@ -137,10 +149,9 @@ def func_test_gan_float32(self):
 
             discriminator = Discriminator()
             generator = Generator()
-            sgd = SGDOptimizer(
-                learning_rate=1e-3,
-                parameter_list=(
-                    discriminator.parameters() + generator.parameters()))
+            sgd = SGDOptimizer(learning_rate=1e-3,
+                               parameter_list=(discriminator.parameters() +
+                                               generator.parameters()))
 
             d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
             d_loss_real = fluid.layers.reduce_mean(
@@ -181,10 +192,9 @@ def func_test_gan_float32(self):
             paddle.framework.random._manual_program_seed(1)
             discriminator2 = Discriminator()
             generator2 = Generator()
-            sgd2 = SGDOptimizer(
-                learning_rate=1e-3,
-                parameter_list=(
-                    discriminator2.parameters() + generator2.parameters()))
+            sgd2 = SGDOptimizer(learning_rate=1e-3,
+                                parameter_list=(discriminator2.parameters() +
+                                                generator2.parameters()))
 
             d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
             d_loss_real2 = fluid.layers.reduce_mean(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
index a5a90461551ff..6acab36221fa2 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -31,6 +31,7 @@ def gen_data():
 
 
 class GraphConv(fluid.Layer):
+
     def __init__(self, name_scope, in_features, out_features):
         super(GraphConv, self).__init__(name_scope)
 
@@ -40,8 +41,9 @@ def __init__(self, name_scope, in_features, out_features):
             attr=None,
             dtype='float32',
             shape=[self._in_features, self._out_features])
-        self.bias = self.create_parameter(
-            attr=None, dtype='float32', shape=[self._out_features])
+        self.bias = self.create_parameter(attr=None,
+                                          dtype='float32',
+                                          shape=[self._out_features])
 
     def forward(self, features, adj):
         support = fluid.layers.matmul(features, self.weight)
@@ -50,6 +52,7 @@ def forward(self, features, adj):
 
 
 class GCN(fluid.Layer):
+
     def __init__(self, name_scope, num_hidden):
         super(GCN, self).__init__(name_scope)
         self.gc = GraphConv(self.full_name(), num_hidden, 32)
@@ -61,6 +64,7 @@ def forward(self, x, adj):
 
 
 class TestDygraphGNN(unittest.TestCase):
+
     def func_gnn_float32(self):
         paddle.seed(90)
         paddle.framework.random._manual_program_seed(90)
@@ -69,22 +73,19 @@ def func_gnn_float32(self):
 
         scope = fluid.core.Scope()
         with new_program_scope(main=main, startup=startup, scope=scope):
-            features = fluid.layers.data(
-                name='features',
-                shape=[1, 100, 50],
-                dtype='float32',
-                append_batch_size=False)
+            features = fluid.layers.data(name='features',
+                                         shape=[1, 100, 50],
+                                         dtype='float32',
+                                         append_batch_size=False)
             # Use selected rows when it's supported.
-            adj = fluid.layers.data(
-                name='adj',
-                shape=[1, 100, 100],
-                dtype='float32',
-                append_batch_size=False)
-            labels = fluid.layers.data(
-                name='labels',
-                shape=[100, 1],
-                dtype='int64',
-                append_batch_size=False)
+            adj = fluid.layers.data(name='adj',
+                                    shape=[1, 100, 100],
+                                    dtype='float32',
+                                    append_batch_size=False)
+            labels = fluid.layers.data(name='labels',
+                                       shape=[100, 1],
+                                       dtype='int64',
+                                       append_batch_size=False)
 
             model = GCN('test_gcn', 50)
             logits = model(features, adj)
@@ -100,12 +101,12 @@ def func_gnn_float32(self):
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             exe.run(startup)
             static_loss = exe.run(feed={
-                'features': np.ones(
-                    [1, 100, 50], dtype=np.float32),
-                'adj': np.ones(
-                    [1, 100, 100], dtype=np.float32),
-                'labels': np.ones(
-                    [100, 1], dtype=np.int64)
+                'features':
+                np.ones([1, 100, 50], dtype=np.float32),
+                'adj':
+                np.ones([1, 100, 100], dtype=np.float32),
+                'labels':
+                np.ones([100, 1], dtype=np.int64)
             },
                                   fetch_list=[loss])[0]
 
@@ -126,12 +127,12 @@ def func_gnn_float32(self):
             logits = fluid.layers.reshape(logits, logits.shape[1:])
             # In other example, it's nll with log_softmax. However, paddle's
             # log_loss only supports binary classification now.
-            loss = fluid.layers.softmax_with_cross_entropy(logits,
-                                                           to_variable(labels))
+            loss = fluid.layers.softmax_with_cross_entropy(
+                logits, to_variable(labels))
             loss = fluid.layers.reduce_sum(loss)
             loss.backward()
-            adam = AdamOptimizer(
-                learning_rate=1e-3, parameter_list=model.parameters())
+            adam = AdamOptimizer(learning_rate=1e-3,
+                                 parameter_list=model.parameters())
 
             adam.minimize(loss)
             model.clear_gradients()
@@ -156,8 +157,8 @@ def func_gnn_float32(self):
                 logits2, to_variable(labels2))
             loss2 = fluid.layers.reduce_sum(loss2)
             loss2.backward()
-            adam2 = AdamOptimizer(
-                learning_rate=1e-3, parameter_list=model2.parameters())
+            adam2 = AdamOptimizer(learning_rate=1e-3,
+                                  parameter_list=model2.parameters())
             adam2.minimize(loss2)
             model2.clear_gradients()
             loss2_value = loss2.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_group.py b/python/paddle/fluid/tests/unittests/test_imperative_group.py
index 994ae27a290a3..ca7af2d6d49f3 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_group.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_group.py
@@ -30,6 +30,7 @@
 
 
 class TestDataParallelGroup(unittest.TestCase):
+
     def create_varbase(self, dtype, shape):
         return paddle.rand(shape=shape, dtype=dtype)
 
@@ -118,7 +119,8 @@ def test_construct_group6(self):
         var_list = []
         var_list.append(self.create_varbase(
             "float32",
-            [1, 50], ))
+            [1, 50],
+        ))
         var_list.append(self.create_varbase("float64", [1, 25]))
         var_list.append(self.create_varbase("float32", [1, 50]))
         var_list.append(self.create_varbase("float64", [1, 25]))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
index 4c457e9345c5d..87d0d8e81b03c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py
@@ -80,13 +80,12 @@ def func_forward_hook_return_value(self):
                 input1 = base.to_variable(input_word1)
                 y = base.to_variable(y_data)
 
-                simplenet = SimpleNet(
-                    hidden_size=20,
-                    vocab_size=32,
-                    num_steps=3,
-                    init_scale=0.1,
-                    is_sparse=False,
-                    dtype="float32")
+                simplenet = SimpleNet(hidden_size=20,
+                                      vocab_size=32,
+                                      num_steps=3,
+                                      init_scale=0.1,
+                                      is_sparse=False,
+                                      dtype="float32")
 
                 # origin, don't register any hook
                 outs_origin = simplenet(input, y)
@@ -149,13 +148,12 @@ def func_forward_hook(self):
                 input = base.to_variable(input_word)
                 y = base.to_variable(y_data)
 
-                simplenet = SimpleNet(
-                    hidden_size=20,
-                    vocab_size=32,
-                    num_steps=3,
-                    init_scale=0.1,
-                    is_sparse=False,
-                    dtype="float32")
+                simplenet = SimpleNet(hidden_size=20,
+                                      vocab_size=32,
+                                      num_steps=3,
+                                      init_scale=0.1,
+                                      is_sparse=False,
+                                      dtype="float32")
 
                 # origin, don't register any hook
                 outs_origin = simplenet(input, y)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
index 0bc56294876d3..127aed8cabc2f 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
@@ -25,24 +25,21 @@
 
 
 class LeNetDygraph(fluid.dygraph.Layer):
+
     def __init__(self, num_classes=10, classifier_activation='softmax'):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
-        self.features = nn.Sequential(
-            nn.Conv2D(
-                1, 6, 3, stride=1, padding=1),
-            nn.ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2),
-            nn.Conv2D(
-                6, 16, 5, stride=1, padding=0),
-            nn.ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2))
+        self.features = nn.Sequential(nn.Conv2D(1, 6, 3, stride=1, padding=1),
+                                      nn.ReLU(),
+                                      paddle.fluid.dygraph.Pool2D(2, 'max', 2),
+                                      nn.Conv2D(6, 16, 5, stride=1, padding=0),
+                                      nn.ReLU(),
+                                      paddle.fluid.dygraph.Pool2D(2, 'max', 2))
 
         if num_classes > 0:
-            self.fc = nn.Sequential(
-                nn.Linear(400, 120),
-                nn.Linear(120, 84), nn.Linear(84, 10),
-                nn.Softmax())  #Todo: accept any activation
+            self.fc = nn.Sequential(nn.Linear(400, 120), nn.Linear(120, 84),
+                                    nn.Linear(84, 10),
+                                    nn.Softmax())  #Todo: accept any activation
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -55,22 +52,27 @@ def forward(self, inputs):
 
 def init_weights(layer):
     if type(layer) == nn.Linear:
-        new_weight = paddle.fluid.layers.fill_constant(
-            layer.weight.shape, layer.weight.dtype, value=0.9)
+        new_weight = paddle.fluid.layers.fill_constant(layer.weight.shape,
+                                                       layer.weight.dtype,
+                                                       value=0.9)
         layer.weight.set_value(new_weight)
-        new_bias = paddle.fluid.layers.fill_constant(
-            layer.bias.shape, layer.bias.dtype, value=-0.1)
+        new_bias = paddle.fluid.layers.fill_constant(layer.bias.shape,
+                                                     layer.bias.dtype,
+                                                     value=-0.1)
         layer.bias.set_value(new_bias)
     elif type(layer) == nn.Conv2D:
-        new_weight = paddle.fluid.layers.fill_constant(
-            layer.weight.shape, layer.weight.dtype, value=0.7)
+        new_weight = paddle.fluid.layers.fill_constant(layer.weight.shape,
+                                                       layer.weight.dtype,
+                                                       value=0.7)
         layer.weight.set_value(new_weight)
-        new_bias = paddle.fluid.layers.fill_constant(
-            layer.bias.shape, layer.bias.dtype, value=-0.2)
+        new_bias = paddle.fluid.layers.fill_constant(layer.bias.shape,
+                                                     layer.bias.dtype,
+                                                     value=-0.2)
         layer.bias.set_value(new_bias)
 
 
 class TestLayerApply(unittest.TestCase):
+
     def func_apply_init_weight(self):
         with fluid.dygraph.guard():
             net = LeNetDygraph()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py
index 0cce1efd1f873..7d9c6e1dc4eba 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py
@@ -25,17 +25,15 @@
 
 
 class LeNetDygraph(fluid.dygraph.Layer):
+
     def __init__(self):
         super(LeNetDygraph, self).__init__()
-        self.features = nn.Sequential(
-            nn.Conv2D(
-                1, 6, 3, stride=1, padding=1),
-            nn.ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2),
-            nn.Conv2D(
-                6, 16, 5, stride=1, padding=0),
-            nn.ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2))
+        self.features = nn.Sequential(nn.Conv2D(1, 6, 3, stride=1, padding=1),
+                                      nn.ReLU(),
+                                      paddle.fluid.dygraph.Pool2D(2, 'max', 2),
+                                      nn.Conv2D(6, 16, 5, stride=1, padding=0),
+                                      nn.ReLU(),
+                                      paddle.fluid.dygraph.Pool2D(2, 'max', 2))
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -43,6 +41,7 @@ def forward(self, inputs):
 
 
 class TestLayerChildren(unittest.TestCase):
+
     def func_apply_init_weight(self):
         with fluid.dygraph.guard():
             net = LeNetDygraph()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
index b0dcfd653fb75..c359d99c81946 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
@@ -21,6 +21,7 @@
 
 
 class TestImperativeLayerTrainable(unittest.TestCase):
+
     def func_set_trainable(self):
         with fluid.dygraph.guard():
             label = np.random.uniform(-1, 1, [10, 10]).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layers.py b/python/paddle/fluid/tests/unittests/test_imperative_layers.py
index 15dada8c8239b..4f7e8c1ac1672 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layers.py
@@ -19,6 +19,7 @@
 
 
 class TestLayerPrint(unittest.TestCase):
+
     def func_test_layer_str(self):
         module = nn.ELU(0.2)
         self.assertEqual(str(module), 'ELU(alpha=0.2)')
@@ -39,8 +40,8 @@ def func_test_layer_str(self):
         self.assertEqual(str(module), 'Tanh(name=Tanh)')
 
         module = nn.Hardtanh(name="Hardtanh")
-        self.assertEqual(
-            str(module), 'Hardtanh(min=-1.0, max=1.0, name=Hardtanh)')
+        self.assertEqual(str(module),
+                         'Hardtanh(min=-1.0, max=1.0, name=Hardtanh)')
 
         module = nn.PReLU(1, 0.25, name="PReLU", data_format="NCHW")
         self.assertEqual(
@@ -125,8 +126,8 @@ def func_test_layer_str(self):
         )
 
         module = nn.Dropout(p=0.5)
-        self.assertEqual(
-            str(module), 'Dropout(p=0.5, axis=None, mode=upscale_in_train)')
+        self.assertEqual(str(module),
+                         'Dropout(p=0.5, axis=None, mode=upscale_in_train)')
 
         module = nn.Dropout2D(p=0.5)
         self.assertEqual(str(module), 'Dropout2D(p=0.5, data_format=NCHW)')
@@ -149,8 +150,8 @@ def func_test_layer_str(self):
         )
 
         module = nn.ZeroPad2D(padding=[1, 0, 1, 2])
-        self.assertEqual(
-            str(module), 'ZeroPad2D(padding=[1, 0, 1, 2], data_format=NCHW)')
+        self.assertEqual(str(module),
+                         'ZeroPad2D(padding=[1, 0, 1, 2], data_format=NCHW)')
 
         module = nn.Pad3D(padding=[1, 0, 1, 2, 0, 0], mode='constant')
         self.assertEqual(
@@ -165,8 +166,8 @@ def func_test_layer_str(self):
         self.assertEqual(str(module), 'Embedding(10, 3, sparse=True)')
 
         module = nn.Conv1D(3, 2, 3)
-        self.assertEqual(
-            str(module), 'Conv1D(3, 2, kernel_size=[3], data_format=NCL)')
+        self.assertEqual(str(module),
+                         'Conv1D(3, 2, kernel_size=[3], data_format=NCL)')
 
         module = nn.Conv1DTranspose(2, 1, 2)
         self.assertEqual(
@@ -174,8 +175,8 @@ def func_test_layer_str(self):
             'Conv1DTranspose(2, 1, kernel_size=[2], data_format=NCL)')
 
         module = nn.Conv2D(4, 6, (3, 3))
-        self.assertEqual(
-            str(module), 'Conv2D(4, 6, kernel_size=[3, 3], data_format=NCHW)')
+        self.assertEqual(str(module),
+                         'Conv2D(4, 6, kernel_size=[3, 3], data_format=NCHW)')
 
         module = nn.Conv2DTranspose(4, 6, (3, 3))
         self.assertEqual(
@@ -196,16 +197,16 @@ def func_test_layer_str(self):
         self.assertEqual(str(module), 'PairwiseDistance(p=2.0)')
 
         module = nn.InstanceNorm1D(2)
-        self.assertEqual(
-            str(module), 'InstanceNorm1D(num_features=2, epsilon=1e-05)')
+        self.assertEqual(str(module),
+                         'InstanceNorm1D(num_features=2, epsilon=1e-05)')
 
         module = nn.InstanceNorm2D(2)
-        self.assertEqual(
-            str(module), 'InstanceNorm2D(num_features=2, epsilon=1e-05)')
+        self.assertEqual(str(module),
+                         'InstanceNorm2D(num_features=2, epsilon=1e-05)')
 
         module = nn.InstanceNorm3D(2)
-        self.assertEqual(
-            str(module), 'InstanceNorm3D(num_features=2, epsilon=1e-05)')
+        self.assertEqual(str(module),
+                         'InstanceNorm3D(num_features=2, epsilon=1e-05)')
 
         module = nn.GroupNorm(num_channels=6, num_groups=6)
         self.assertEqual(
@@ -244,28 +245,28 @@ def func_test_layer_str(self):
             'LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=1.0)')
 
         module = nn.AvgPool1D(kernel_size=2, stride=2, padding=0)
-        self.assertEqual(
-            str(module), 'AvgPool1D(kernel_size=2, stride=2, padding=0)')
+        self.assertEqual(str(module),
+                         'AvgPool1D(kernel_size=2, stride=2, padding=0)')
 
         module = nn.AvgPool2D(kernel_size=2, stride=2, padding=0)
-        self.assertEqual(
-            str(module), 'AvgPool2D(kernel_size=2, stride=2, padding=0)')
+        self.assertEqual(str(module),
+                         'AvgPool2D(kernel_size=2, stride=2, padding=0)')
 
         module = nn.AvgPool3D(kernel_size=2, stride=2, padding=0)
-        self.assertEqual(
-            str(module), 'AvgPool3D(kernel_size=2, stride=2, padding=0)')
+        self.assertEqual(str(module),
+                         'AvgPool3D(kernel_size=2, stride=2, padding=0)')
 
         module = nn.MaxPool1D(kernel_size=2, stride=2, padding=0)
-        self.assertEqual(
-            str(module), 'MaxPool1D(kernel_size=2, stride=2, padding=0)')
+        self.assertEqual(str(module),
+                         'MaxPool1D(kernel_size=2, stride=2, padding=0)')
 
         module = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
-        self.assertEqual(
-            str(module), 'MaxPool2D(kernel_size=2, stride=2, padding=0)')
+        self.assertEqual(str(module),
+                         'MaxPool2D(kernel_size=2, stride=2, padding=0)')
 
         module = nn.MaxPool3D(kernel_size=2, stride=2, padding=0)
-        self.assertEqual(
-            str(module), 'MaxPool3D(kernel_size=2, stride=2, padding=0)')
+        self.assertEqual(str(module),
+                         'MaxPool3D(kernel_size=2, stride=2, padding=0)')
 
         module = nn.AdaptiveAvgPool1D(output_size=16)
         self.assertEqual(str(module), 'AdaptiveAvgPool1D(output_size=16)')
@@ -277,16 +278,16 @@ def func_test_layer_str(self):
         self.assertEqual(str(module), 'AdaptiveAvgPool3D(output_size=3)')
 
         module = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True)
-        self.assertEqual(
-            str(module), 'AdaptiveMaxPool1D(output_size=16, return_mask=True)')
+        self.assertEqual(str(module),
+                         'AdaptiveMaxPool1D(output_size=16, return_mask=True)')
 
         module = nn.AdaptiveMaxPool2D(output_size=3, return_mask=True)
-        self.assertEqual(
-            str(module), 'AdaptiveMaxPool2D(output_size=3, return_mask=True)')
+        self.assertEqual(str(module),
+                         'AdaptiveMaxPool2D(output_size=3, return_mask=True)')
 
         module = nn.AdaptiveMaxPool3D(output_size=3, return_mask=True)
-        self.assertEqual(
-            str(module), 'AdaptiveMaxPool3D(output_size=3, return_mask=True)')
+        self.assertEqual(str(module),
+                         'AdaptiveMaxPool3D(output_size=3, return_mask=True)')
 
         module = nn.SimpleRNNCell(16, 32)
         self.assertEqual(str(module), 'SimpleRNNCell(16, 32)')
@@ -332,14 +333,9 @@ def func_test_layer_str(self):
 
         module2 = nn.Sequential(
             nn.Conv3DTranspose(4, 6, (3, 3, 3)),
-            nn.AvgPool3D(
-                kernel_size=2, stride=2, padding=0),
-            nn.Tanh(name="Tanh"),
-            module1,
-            nn.Conv3D(4, 6, (3, 3, 3)),
-            nn.MaxPool3D(
-                kernel_size=2, stride=2, padding=0),
-            nn.GELU(True))
+            nn.AvgPool3D(kernel_size=2, stride=2, padding=0),
+            nn.Tanh(name="Tanh"), module1, nn.Conv3D(4, 6, (3, 3, 3)),
+            nn.MaxPool3D(kernel_size=2, stride=2, padding=0), nn.GELU(True))
         self.assertEqual(
             str(module2),
             'Sequential(\n  '\
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py
index e4faa7e259a67..2c860a0a6243b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py
@@ -20,6 +20,7 @@
 
 
 class TestDygraphLoadStatic(unittest.TestCase):
+
     def testLoadStaticModel(self):
         # static mode
         a = fluid.data(name="a", shape=[10, 10])
@@ -28,20 +29,30 @@ def testLoadStaticModel(self):
         fc_out1 = fluid.layers.fc(a, 10)
         fc_out2 = fluid.layers.fc(a, 20)
 
-        conv_out_1 = fluid.layers.conv2d(
-            conv_in, num_filters=10, filter_size=5, act="relu")
-        conv_out_2 = fluid.layers.conv2d(
-            conv_in, num_filters=10, filter_size=5, act="relu")
-
-        conv3d_in = fluid.data(
-            name='conv3d_in', shape=[None, 3, 12, 32, 32], dtype='float32')
-        conv3d_out_1 = fluid.layers.conv3d(
-            input=conv3d_in, num_filters=2, filter_size=3, act="relu")
-        conv3d_out_2 = fluid.layers.conv3d(
-            input=conv3d_in, num_filters=2, filter_size=3, act="relu")
-
-        batchnorm_in = fluid.data(
-            name="batchnorm_in", shape=[None, 10], dtype='float32')
+        conv_out_1 = fluid.layers.conv2d(conv_in,
+                                         num_filters=10,
+                                         filter_size=5,
+                                         act="relu")
+        conv_out_2 = fluid.layers.conv2d(conv_in,
+                                         num_filters=10,
+                                         filter_size=5,
+                                         act="relu")
+
+        conv3d_in = fluid.data(name='conv3d_in',
+                               shape=[None, 3, 12, 32, 32],
+                               dtype='float32')
+        conv3d_out_1 = fluid.layers.conv3d(input=conv3d_in,
+                                           num_filters=2,
+                                           filter_size=3,
+                                           act="relu")
+        conv3d_out_2 = fluid.layers.conv3d(input=conv3d_in,
+                                           num_filters=2,
+                                           filter_size=3,
+                                           act="relu")
+
+        batchnorm_in = fluid.data(name="batchnorm_in",
+                                  shape=[None, 10],
+                                  dtype='float32')
         batchnorm_out_1 = fluid.layers.batch_norm(batchnorm_in)
         batchnorm_out_2 = fluid.layers.batch_norm(batchnorm_in)
 
@@ -54,45 +65,53 @@ def testLoadStaticModel(self):
         layernorm_2 = fluid.layers.layer_norm(layernorm)
 
         nce_in = fluid.data(name="nce_in", shape=[None, 100], dtype='float32')
-        nce_label = fluid.data(
-            name="nce_label", shape=[None, 10], dtype='int64')
+        nce_label = fluid.data(name="nce_label",
+                               shape=[None, 10],
+                               dtype='int64')
         nce_out_1 = fluid.layers.nce(nce_in, nce_label, 10000)
         nce_out_2 = fluid.layers.nce(nce_in, nce_label, 10000)
 
-        prelu_in = fluid.data(
-            name="prelu_in", shape=[None, 5, 10, 10], dtype='float32')
+        prelu_in = fluid.data(name="prelu_in",
+                              shape=[None, 5, 10, 10],
+                              dtype='float32')
         prelu_out_1 = fluid.layers.prelu(prelu_in, "channel")
         prelu_out_2 = fluid.layers.prelu(prelu_in, "channel")
 
-        bilinear_tensor_pro_x = fluid.data(
-            "t1", shape=[None, 5], dtype="float32")
-        bilinear_tensor_pro_y = fluid.data(
-            "t2", shape=[None, 4], dtype="float32")
+        bilinear_tensor_pro_x = fluid.data("t1",
+                                           shape=[None, 5],
+                                           dtype="float32")
+        bilinear_tensor_pro_y = fluid.data("t2",
+                                           shape=[None, 4],
+                                           dtype="float32")
 
         bilinear_tensor_pro_out_1 = fluid.layers.bilinear_tensor_product(
             x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000)
         bilinear_tensor_pro_out_2 = fluid.layers.bilinear_tensor_product(
             x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000)
 
-        conv2d_trans_in = fluid.data(
-            name="conv2d_trans_in", shape=[None, 10, 10, 10])
-
-        conv2d_trans_out_1 = fluid.layers.conv2d_transpose(
-            conv2d_trans_in, num_filters=10, filter_size=5, act="relu")
-        conv2d_trans_out_2 = fluid.layers.conv2d_transpose(
-            conv2d_trans_in, num_filters=10, filter_size=5, act="relu")
-
-        conv3d_trans_in = fluid.data(
-            name='conv3d_trans_in',
-            shape=[None, 3, 12, 32, 32],
-            dtype='float32')
+        conv2d_trans_in = fluid.data(name="conv2d_trans_in",
+                                     shape=[None, 10, 10, 10])
+
+        conv2d_trans_out_1 = fluid.layers.conv2d_transpose(conv2d_trans_in,
+                                                           num_filters=10,
+                                                           filter_size=5,
+                                                           act="relu")
+        conv2d_trans_out_2 = fluid.layers.conv2d_transpose(conv2d_trans_in,
+                                                           num_filters=10,
+                                                           filter_size=5,
+                                                           act="relu")
+
+        conv3d_trans_in = fluid.data(name='conv3d_trans_in',
+                                     shape=[None, 3, 12, 32, 32],
+                                     dtype='float32')
         conv3d_trans_out_1 = fluid.layers.conv3d_transpose(
             input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu")
         conv3d_trans_out_2 = fluid.layers.conv3d_transpose(
             input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu")
 
-        groupnorm_in = fluid.data(
-            name='groupnorm_in', shape=[None, 8, 32, 32], dtype='float32')
+        groupnorm_in = fluid.data(name='groupnorm_in',
+                                  shape=[None, 8, 32, 32],
+                                  dtype='float32')
         groupnorm_out1 = fluid.layers.group_norm(input=groupnorm_in, groups=4)
         groupnorm_out2 = fluid.layers.group_norm(input=groupnorm_in, groups=4)
         '''
@@ -101,19 +120,23 @@ def testLoadStaticModel(self):
         spe_norm_out_2 = fluid.layers.spectral_norm(weight=spec_norm, dim=1, power_iters=2)
         '''
 
-        nodes_vector = fluid.data(
-            name='vectors', shape=[None, 10, 5], dtype='float32')
-        edge_set = fluid.data(
-            name='edge_set', shape=[None, 10, 2], dtype='float32')
+        nodes_vector = fluid.data(name='vectors',
+                                  shape=[None, 10, 5],
+                                  dtype='float32')
+        edge_set = fluid.data(name='edge_set',
+                              shape=[None, 10, 2],
+                              dtype='float32')
         tree_conv_out1 = fluid.contrib.layers.tree_conv(nodes_vector, edge_set,
                                                         6, 1, 2)
         tree_conv_out2 = fluid.contrib.layers.tree_conv(nodes_vector, edge_set,
                                                         6, 1, 2)
 
-        para1 = fluid.layers.create_parameter(
-            [100, 100], 'float32', name="weight_test_1")
-        para2 = fluid.layers.create_parameter(
-            [20, 200], 'float32', name="weight_test_2")
+        para1 = fluid.layers.create_parameter([100, 100],
+                                              'float32',
+                                              name="weight_test_1")
+        para2 = fluid.layers.create_parameter([20, 200],
+                                              'float32',
+                                              name="weight_test_2")
 
         para_list = fluid.default_main_program().list_vars()
 
@@ -137,33 +160,30 @@ def testLoadStaticModel(self):
         with fluid.dygraph.guard():
 
             class MyTest(fluid.dygraph.Layer):
+
                 def __init__(self):
                     super(MyTest, self).__init__()
 
                     self.linear1 = Linear(10, 10)
                     self.lienar2 = Linear(10, 20)
 
-                    self.conv2d_1 = Conv2D(
-                        num_channels=10,
-                        num_filters=10,
-                        filter_size=5,
-                        act="relu")
-                    self.conv2d_2 = Conv2D(
-                        num_channels=10,
-                        num_filters=10,
-                        filter_size=5,
-                        act="relu")
-
-                    self.conv3d_1 = Conv3D(
-                        num_channels=3,
-                        num_filters=2,
-                        filter_size=3,
-                        act="relu")
-                    self.conv3d_2 = Conv3D(
-                        num_channels=3,
-                        num_filters=2,
-                        filter_size=3,
-                        act="relu")
+                    self.conv2d_1 = Conv2D(num_channels=10,
+                                           num_filters=10,
+                                           filter_size=5,
+                                           act="relu")
+                    self.conv2d_2 = Conv2D(num_channels=10,
+                                           num_filters=10,
+                                           filter_size=5,
+                                           act="relu")
+
+                    self.conv3d_1 = Conv3D(num_channels=3,
+                                           num_filters=2,
+                                           filter_size=3,
+                                           act="relu")
+                    self.conv3d_2 = Conv3D(num_channels=3,
+                                           num_filters=2,
+                                           filter_size=3,
+                                           act="relu")
 
                     self.batch_norm_1 = BatchNorm(10)
                     self.batch_norm_2 = BatchNorm(10)
@@ -183,10 +203,12 @@ def __init__(self):
                     self.group_norm1 = GroupNorm(8, 4)
                     self.gourp_norm2 = GroupNorm(8, 4)
 
-                    self.w_1 = self.create_parameter(
-                        [100, 100], dtype='float32', attr="weight_test_1")
-                    self.w_2 = self.create_parameter(
-                        [20, 200], dtype='float32', attr="weight_test_2")
+                    self.w_1 = self.create_parameter([100, 100],
+                                                     dtype='float32',
+                                                     attr="weight_test_1")
+                    self.w_2 = self.create_parameter([20, 200],
+                                                     dtype='float32',
+                                                     attr="weight_test_2")
 
             my_test = MyTest()
             my_test.set_dict(new_dict, use_structured_name=False)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
index 110bb961bbe12..f9306d0cfebd7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
@@ -30,6 +30,7 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -60,13 +61,13 @@ def __init__(self,
     def forward(self, input, label):
         x_emb = self.embedding(input)
         projection = fluid.layers.matmul(
-            x_emb, fluid.layers.transpose(
-                self.embedding.weight, perm=[1, 0]))
+            x_emb, fluid.layers.transpose(self.embedding.weight, perm=[1, 0]))
         projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+        projection = fluid.layers.reshape(projection,
+                                          shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -75,6 +76,7 @@ def forward(self, input, label):
 
 
 class TestDygraphSimpleNet(unittest.TestCase):
+
     def func_simple_net(self):
         for is_sparse in [True, False]:
             dtype_list = ["float32"]
@@ -107,25 +109,22 @@ def simple_net_float32(self, is_sparse, dtype):
                     paddle.seed(seed)
                     paddle.framework.random._manual_program_seed(seed)
 
-                    simple_net = SimpleNet(
-                        hidden_size=hidden_size,
-                        vocab_size=vocab_size,
-                        num_steps=num_steps,
-                        init_scale=init_scale,
-                        is_sparse=is_sparse,
-                        dtype=dtype)
-
-                    sgd = SGDOptimizer(
-                        learning_rate=1e-3,
-                        parameter_list=simple_net.parameters())
+                    simple_net = SimpleNet(hidden_size=hidden_size,
+                                           vocab_size=vocab_size,
+                                           num_steps=num_steps,
+                                           init_scale=init_scale,
+                                           is_sparse=is_sparse,
+                                           dtype=dtype)
+
+                    sgd = SGDOptimizer(learning_rate=1e-3,
+                                       parameter_list=simple_net.parameters())
                     dy_param_updated = dict()
                     dy_param_init = dict()
                     dy_loss = None
 
                     helper = DyGraphProgramDescTracerTestHelper(self)
-                    fluid.set_flags({
-                        'FLAGS_sort_sum_gradient': is_sort_sum_gradient
-                    })
+                    fluid.set_flags(
+                        {'FLAGS_sort_sum_gradient': is_sort_sum_gradient})
 
                     for i in range(batch_num):
                         x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -152,17 +151,17 @@ def simple_net_float32(self, is_sparse, dtype):
                     paddle.seed(seed)
                     paddle.framework.random._manual_program_seed(seed)
 
-                    simple_net = SimpleNet(
-                        hidden_size=hidden_size,
-                        vocab_size=vocab_size,
-                        num_steps=num_steps,
-                        is_sparse=is_sparse,
-                        dtype=dtype)
+                    simple_net = SimpleNet(hidden_size=hidden_size,
+                                           vocab_size=vocab_size,
+                                           num_steps=num_steps,
+                                           is_sparse=is_sparse,
+                                           dtype=dtype)
 
                     exe = fluid.Executor(place)
                     sgd = SGDOptimizer(learning_rate=1e-3)
-                    x = fluid.layers.data(
-                        name="x", shape=[-1, num_steps], dtype='int64')
+                    x = fluid.layers.data(name="x",
+                                          shape=[-1, num_steps],
+                                          dtype='int64')
                     y = fluid.layers.data(name="y", shape=[-1, 1], dtype=dtype)
 
                     static_loss = simple_net(x, y)
@@ -186,8 +185,10 @@ def simple_net_float32(self, is_sparse, dtype):
                         fetch_list = [static_loss]
                         fetch_list.extend(static_param_name_list)
                         out = exe.run(fluid.default_main_program(),
-                                      feed={"x": x_data,
-                                            "y": y_data},
+                                      feed={
+                                          "x": x_data,
+                                          "y": y_data
+                                      },
                                       fetch_list=fetch_list)
                         static_loss_value = out[0]
 
@@ -197,13 +198,12 @@ def simple_net_float32(self, is_sparse, dtype):
                                     k - 1]] = out[k]
 
                 self.assertTrue(
-                    np.allclose(
-                        static_loss_value, dy_loss_value, rtol=1e-3))
+                    np.allclose(static_loss_value, dy_loss_value, rtol=1e-3))
                 for key, value in six.iteritems(static_param_init):
                     self.assertTrue(np.array_equal(value, dy_param_init[key]))
                 for key, value in six.iteritems(static_param_updated):
-                    self.assertTrue(
-                        np.array_equal(value, dy_param_updated[key]))
+                    self.assertTrue(np.array_equal(value,
+                                                   dy_param_updated[key]))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index f9bd5e4597121..aeead6ff74745 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -31,6 +31,7 @@
 
 
 class SimpleImgConvPool(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -50,25 +51,23 @@ def __init__(self,
                  bias_attr=None):
         super(SimpleImgConvPool, self).__init__()
 
-        self._conv2d = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=conv_stride,
-            padding=conv_padding,
-            dilation=conv_dilation,
-            groups=conv_groups,
-            param_attr=None,
-            bias_attr=None,
-            use_cudnn=use_cudnn)
-
-        self._pool2d = Pool2D(
-            pool_size=pool_size,
-            pool_type=pool_type,
-            pool_stride=pool_stride,
-            pool_padding=pool_padding,
-            global_pooling=global_pooling,
-            use_cudnn=use_cudnn)
+        self._conv2d = Conv2D(num_channels=num_channels,
+                              num_filters=num_filters,
+                              filter_size=filter_size,
+                              stride=conv_stride,
+                              padding=conv_padding,
+                              dilation=conv_dilation,
+                              groups=conv_groups,
+                              param_attr=None,
+                              bias_attr=None,
+                              use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(pool_size=pool_size,
+                              pool_type=pool_type,
+                              pool_stride=pool_stride,
+                              pool_padding=pool_padding,
+                              global_pooling=global_pooling,
+                              use_cudnn=use_cudnn)
 
     def forward(self, inputs):
         x = self._conv2d(inputs)
@@ -77,25 +76,33 @@ def forward(self, inputs):
 
 
 class MNIST(fluid.dygraph.Layer):
+
     def __init__(self):
         super(MNIST, self).__init__()
 
-        self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            1, 20, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(1,
+                                                         20,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            20, 50, 5, 2, 2, act="relu")
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(20,
+                                                         50,
+                                                         5,
+                                                         2,
+                                                         2,
+                                                         act="relu")
 
         self.pool_2_shape = 50 * 4 * 4
         SIZE = 10
         scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
-        self._fc = Linear(
-            self.pool_2_shape,
-            10,
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.NormalInitializer(
-                    loc=0.0, scale=scale)),
-            act="softmax")
+        self._fc = Linear(self.pool_2_shape,
+                          10,
+                          param_attr=fluid.param_attr.ParamAttr(
+                              initializer=fluid.initializer.NormalInitializer(
+                                  loc=0.0, scale=scale)),
+                          act="softmax")
 
     def forward(self, inputs):
         x = self._simple_img_conv_pool_1(inputs)
@@ -106,7 +113,9 @@ def forward(self, inputs):
 
 
 class TestImperativeMnist(unittest.TestCase):
+
     def reader_decorator(self, reader):
+
         def _reader_imple():
             for item in reader():
                 image = np.array(item[0]).reshape(1, 28, 28)
@@ -128,15 +137,15 @@ def func_test_mnist_float32(self):
             fluid.default_main_program().random_seed = seed
 
             mnist = MNIST()
-            sgd = SGDOptimizer(
-                learning_rate=1e-3, parameter_list=mnist.parameters())
+            sgd = SGDOptimizer(learning_rate=1e-3,
+                               parameter_list=mnist.parameters())
 
             batch_py_reader = fluid.io.PyReader(capacity=1)
             batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
+                paddle.batch(self.reader_decorator(
+                    paddle.dataset.mnist.train()),
+                             batch_size=batch_size,
+                             drop_last=True),
                 places=fluid.CPUPlace())
 
             mnist.train()
@@ -194,13 +203,13 @@ def func_test_mnist_float32(self):
 
             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(),
-                batch_size=batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=batch_size,
+                                        drop_last=True)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[1, 28, 28],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             cost = mnist(img)
             loss = fluid.layers.cross_entropy(cost, label)
@@ -223,12 +232,12 @@ def func_test_mnist_float32(self):
                 for batch_id, data in enumerate(train_reader()):
                     if batch_id >= batch_num:
                         break
-                    static_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(
-                            [batch_size, 1])
+                    static_x_data = np.array([
+                        x[0].reshape(1, 28, 28) for x in data
+                    ]).astype('float32')
+                    y_data = np.array([x[1]
+                                       for x in data]).astype('int64').reshape(
+                                           [batch_size, 1])
 
                     fetch_list = [avg_loss.name]
                     fetch_list.extend(static_param_name_list)
@@ -236,17 +245,18 @@ def func_test_mnist_float32(self):
                     if traced_layer is not None:
                         traced_layer([static_x_data])
 
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
+                    out = exe.run(fluid.default_main_program(),
+                                  feed={
+                                      "pixel": static_x_data,
+                                      "label": y_data
+                                  },
+                                  fetch_list=fetch_list)
 
                     static_param_value = {}
                     static_out = out[0]
                     for i in range(1, len(out)):
-                        static_param_value[static_param_name_list[i - 1]] = out[
-                            i]
+                        static_param_value[static_param_name_list[i -
+                                                                  1]] = out[i]
 
         self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
index 8e3cbaf9488bd..23af23a4286ea 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py
@@ -30,6 +30,7 @@
 
 
 class TestImperativeMnistSortGradient(unittest.TestCase):
+
     def func_test_mnist_sort_gradient_float32(self):
         seed = 90
         epoch_num = 1
@@ -40,20 +41,21 @@ def func_test_mnist_sort_gradient_float32(self):
             fluid.set_flags({'FLAGS_sort_sum_gradient': True})
 
             mnist2 = MNIST()
-            sgd2 = SGDOptimizer(
-                learning_rate=1e-3, parameter_list=mnist2.parameters())
-            train_reader2 = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+            sgd2 = SGDOptimizer(learning_rate=1e-3,
+                                parameter_list=mnist2.parameters())
+            train_reader2 = paddle.batch(paddle.dataset.mnist.train(),
+                                         batch_size=128,
+                                         drop_last=True)
 
             mnist2.train()
             dy_param_init_value2 = {}
             for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader2()):
-                    dy_x_data2 = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data2 = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+                    dy_x_data2 = np.array([
+                        x[0].reshape(1, 28, 28) for x in data
+                    ]).astype('float32')
+                    y_data2 = np.array([x[1] for x in data
+                                        ]).astype('int64').reshape(128, 1)
 
                     img2 = to_variable(dy_x_data2)
                     label2 = to_variable(y_data2)
@@ -88,11 +90,13 @@ def func_test_mnist_sort_gradient_float32(self):
 
             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=128,
+                                        drop_last=True)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[1, 28, 28],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             cost = mnist(img)
             loss = fluid.layers.cross_entropy(cost, label)
@@ -113,25 +117,26 @@ def func_test_mnist_sort_gradient_float32(self):
 
             for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
-                    static_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape([128, 1])
+                    static_x_data = np.array([
+                        x[0].reshape(1, 28, 28) for x in data
+                    ]).astype('float32')
+                    y_data = np.array([x[1] for x in data
+                                       ]).astype('int64').reshape([128, 1])
 
                     fetch_list = [avg_loss.name]
                     fetch_list.extend(static_param_name_list)
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
+                    out = exe.run(fluid.default_main_program(),
+                                  feed={
+                                      "pixel": static_x_data,
+                                      "label": y_data
+                                  },
+                                  fetch_list=fetch_list)
 
                     static_param_value = {}
                     static_out = out[0]
                     for i in range(1, len(out)):
-                        static_param_value[static_param_name_list[i - 1]] = out[
-                            i]
+                        static_param_value[static_param_name_list[i -
+                                                                  1]] = out[i]
                     if batch_id == 20:
                         break
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py
index 223ccd3a3d5c2..c3b052edeace0 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py
@@ -20,6 +20,7 @@
 
 
 class MyLayer(fluid.Layer):
+
     def __init__(self, num_channel, dim, num_filter=5):
         super(MyLayer, self).__init__()
         self.fc = fluid.dygraph.Linear(dim, dim)
@@ -32,6 +33,7 @@ def forward(self, x):
 
 
 class TestImperativeNamedSubLayers(unittest.TestCase):
+
     def func_test_named_sublayers(self):
         with fluid.dygraph.guard():
             fc1 = fluid.Linear(10, 3)
@@ -43,14 +45,16 @@ def func_test_named_sublayers(self):
 
             expected_sublayers = [fc1, fc2, custom, custom.fc, custom.conv]
             self.assertEqual(len(list_named_sublayers), len(expected_sublayers))
-            for (name, sublayer), expected_sublayer in zip(list_named_sublayers,
-                                                           expected_sublayers):
+            for (name,
+                 sublayer), expected_sublayer in zip(list_named_sublayers,
+                                                     expected_sublayers):
                 self.assertEqual(sublayer, expected_sublayer)
 
             list_sublayers = list(model.sublayers())
             self.assertEqual(len(list_named_sublayers), len(list_sublayers))
-            for (name, sublayer), expected_sublayer in zip(list_named_sublayers,
-                                                           list_sublayers):
+            for (name,
+                 sublayer), expected_sublayer in zip(list_named_sublayers,
+                                                     list_sublayers):
                 self.assertEqual(sublayer, expected_sublayer)
 
             self.assertListEqual(
@@ -64,6 +68,7 @@ def test_named_sublayers(self):
 
 
 class TestImperativeNamedParameters(unittest.TestCase):
+
     def func_test_named_parameters(self):
         with fluid.dygraph.guard():
             fc1 = fluid.Linear(10, 3)
@@ -90,6 +95,7 @@ def func_test_dir_layer(self):
         with fluid.dygraph.guard():
 
             class Mymodel(fluid.dygraph.Layer):
+
                 def __init__(self):
                     super(Mymodel, self).__init__()
                     self.linear1 = fluid.dygraph.Linear(10, 10)
@@ -98,11 +104,10 @@ def __init__(self):
                     self.embedding = fluid.dygraph.Embedding(size=[128, 16])
                     self.h_0 = fluid.dygraph.to_variable(
                         np.zeros([10, 10]).astype('float32'))
-                    self.weight = self.create_parameter(
-                        shape=[2, 3],
-                        attr=fluid.ParamAttr(),
-                        dtype="float32",
-                        is_bias=False)
+                    self.weight = self.create_parameter(shape=[2, 3],
+                                                        attr=fluid.ParamAttr(),
+                                                        dtype="float32",
+                                                        is_bias=False)
 
             model = Mymodel()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py
index 158f71cc300c6..c0287668a3195 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py
@@ -20,6 +20,7 @@
 
 
 class TestImperativeNumpyBridge(unittest.TestCase):
+
     def func_tensor_from_numpy(self):
         data_np = np.array([[2, 3, 1]]).astype('float32')
         with fluid.dygraph.guard(fluid.CPUPlace()):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
index fd53b42450d33..064f0948cade5 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
@@ -60,6 +60,7 @@ class Config(object):
 
 
 class ConvBNPool(fluid.dygraph.Layer):
+
     def __init__(self,
                  group,
                  out_ch,
@@ -81,34 +82,31 @@ def __init__(self,
         conv_param_1 = fluid.ParamAttr(
             initializer=fluid.initializer.Normal(0.0, conv_std_1))
 
-        self.conv_0_layer = Conv2D(
-            channels[0],
-            out_ch[0],
-            3,
-            padding=1,
-            param_attr=conv_param_0,
-            bias_attr=False,
-            act=None,
-            use_cudnn=use_cudnn)
+        self.conv_0_layer = Conv2D(channels[0],
+                                   out_ch[0],
+                                   3,
+                                   padding=1,
+                                   param_attr=conv_param_0,
+                                   bias_attr=False,
+                                   act=None,
+                                   use_cudnn=use_cudnn)
         self.bn_0_layer = BatchNorm(out_ch[0], act=act, is_test=is_test)
-        self.conv_1_layer = Conv2D(
-            out_ch[0],
-            num_filters=out_ch[1],
-            filter_size=3,
-            padding=1,
-            param_attr=conv_param_1,
-            bias_attr=False,
-            act=None,
-            use_cudnn=use_cudnn)
+        self.conv_1_layer = Conv2D(out_ch[0],
+                                   num_filters=out_ch[1],
+                                   filter_size=3,
+                                   padding=1,
+                                   param_attr=conv_param_1,
+                                   bias_attr=False,
+                                   act=None,
+                                   use_cudnn=use_cudnn)
         self.bn_1_layer = BatchNorm(out_ch[1], act=act, is_test=is_test)
 
         if self.pool:
-            self.pool_layer = Pool2D(
-                pool_size=2,
-                pool_type='max',
-                pool_stride=2,
-                use_cudnn=use_cudnn,
-                ceil_mode=True)
+            self.pool_layer = Pool2D(pool_size=2,
+                                     pool_type='max',
+                                     pool_stride=2,
+                                     use_cudnn=use_cudnn,
+                                     ceil_mode=True)
 
     def forward(self, inputs):
         conv_0 = self.conv_0_layer(inputs)
@@ -122,19 +120,22 @@ def forward(self, inputs):
 
 
 class OCRConv(fluid.dygraph.Layer):
+
     def __init__(self, is_test=False, use_cudnn=True):
         super(OCRConv, self).__init__()
-        self.conv_bn_pool_1 = ConvBNPool(
-            2, [8, 8], [1, 8], is_test=is_test, use_cudnn=use_cudnn)
-        self.conv_bn_pool_2 = ConvBNPool(
-            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn)
-        self.conv_bn_pool_3 = ConvBNPool(
-            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn)
-        self.conv_bn_pool_4 = ConvBNPool(
-            2, [16, 16], [8, 16],
-            is_test=is_test,
-            pool=False,
-            use_cudnn=use_cudnn)
+        self.conv_bn_pool_1 = ConvBNPool(2, [8, 8], [1, 8],
+                                         is_test=is_test,
+                                         use_cudnn=use_cudnn)
+        self.conv_bn_pool_2 = ConvBNPool(2, [8, 8], [8, 8],
+                                         is_test=is_test,
+                                         use_cudnn=use_cudnn)
+        self.conv_bn_pool_3 = ConvBNPool(2, [8, 8], [8, 8],
+                                         is_test=is_test,
+                                         use_cudnn=use_cudnn)
+        self.conv_bn_pool_4 = ConvBNPool(2, [16, 16], [8, 16],
+                                         is_test=is_test,
+                                         pool=False,
+                                         use_cudnn=use_cudnn)
 
     def forward(self, inputs):
         inputs_1 = self.conv_bn_pool_1(inputs)
@@ -146,6 +147,7 @@ def forward(self, inputs):
 
 
 class DynamicGRU(fluid.dygraph.Layer):
+
     def __init__(self,
                  size,
                  param_attr=None,
@@ -157,13 +159,12 @@ def __init__(self,
                  origin_mode=False):
         super(DynamicGRU, self).__init__()
 
-        self.gru_unit = GRUUnit(
-            size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode)
+        self.gru_unit = GRUUnit(size * 3,
+                                param_attr=param_attr,
+                                bias_attr=bias_attr,
+                                activation=candidate_activation,
+                                gate_activation=gate_activation,
+                                origin_mode=origin_mode)
 
         self.h_0 = h_0
         self.is_reverse = is_reverse
@@ -175,13 +176,15 @@ def forward(self, inputs):
         for i in range(inputs.shape[1]):
             if self.is_reverse:
                 i = inputs.shape[1] - 1 - i
-            input_ = fluid.layers.slice(
-                inputs, axes=[1], starts=[i], ends=[i + 1])
-            input_ = fluid.layers.reshape(
-                input_, [-1, input_.shape[2]], inplace=False)
+            input_ = fluid.layers.slice(inputs,
+                                        axes=[1],
+                                        starts=[i],
+                                        ends=[i + 1])
+            input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]],
+                                          inplace=False)
             hidden, reset, gate = self.gru_unit(input_, hidden)
-            hidden_ = fluid.layers.reshape(
-                hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            hidden_ = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]],
+                                           inplace=False)
             if self.is_reverse:
                 res = [hidden_] + res
             else:
@@ -191,19 +194,21 @@ def forward(self, inputs):
 
 
 class EncoderNet(fluid.dygraph.Layer):
+
     def __init__(self,
                  rnn_hidden_size=Config.encoder_size,
                  is_test=False,
                  use_cudnn=True):
         super(EncoderNet, self).__init__()
         self.rnn_hidden_size = rnn_hidden_size
-        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
-                                                                         0.02))
-        bias_attr = fluid.ParamAttr(
-            initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
+        para_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, 0.02))
+        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
+            0.0, 0.02),
+                                    learning_rate=2.0)
         if fluid.framework._non_static_mode():
-            h_0 = np.zeros(
-                (Config.batch_size, rnn_hidden_size), dtype="float32")
+            h_0 = np.zeros((Config.batch_size, rnn_hidden_size),
+                           dtype="float32")
             h_0 = to_variable(h_0)
         else:
             h_0 = fluid.layers.fill_constant(
@@ -212,26 +217,29 @@ def __init__(self,
                 value=0)
         self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)
 
-        self.fc_1_layer = Linear(
-            32, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
-        self.fc_2_layer = Linear(
-            32, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
-        self.gru_forward_layer = DynamicGRU(
-            size=rnn_hidden_size,
-            h_0=h_0,
-            param_attr=para_attr,
-            bias_attr=bias_attr,
-            candidate_activation='relu')
-        self.gru_backward_layer = DynamicGRU(
-            size=rnn_hidden_size,
-            h_0=h_0,
-            param_attr=para_attr,
-            bias_attr=bias_attr,
-            candidate_activation='relu',
-            is_reverse=True)
-
-        self.encoded_proj_fc = Linear(
-            rnn_hidden_size * 2, Config.decoder_size, bias_attr=False)
+        self.fc_1_layer = Linear(32,
+                                 rnn_hidden_size * 3,
+                                 param_attr=para_attr,
+                                 bias_attr=False)
+        self.fc_2_layer = Linear(32,
+                                 rnn_hidden_size * 3,
+                                 param_attr=para_attr,
+                                 bias_attr=False)
+        self.gru_forward_layer = DynamicGRU(size=rnn_hidden_size,
+                                            h_0=h_0,
+                                            param_attr=para_attr,
+                                            bias_attr=bias_attr,
+                                            candidate_activation='relu')
+        self.gru_backward_layer = DynamicGRU(size=rnn_hidden_size,
+                                             h_0=h_0,
+                                             param_attr=para_attr,
+                                             bias_attr=bias_attr,
+                                             candidate_activation='relu',
+                                             is_reverse=True)
+
+        self.encoded_proj_fc = Linear(rnn_hidden_size * 2,
+                                      Config.decoder_size,
+                                      bias_attr=False)
 
     def forward(self, inputs):
         conv_features = self.ocr_convs(inputs)
@@ -240,22 +248,21 @@ def forward(self, inputs):
         #    stride=[1, 1],
         #    filter_size=[conv_features.shape[2], 1])
 
-        transpose_conv_features = fluid.layers.transpose(
-            conv_features, perm=[0, 3, 1, 2])
-        sliced_feature = fluid.layers.reshape(
-            transpose_conv_features, [
-                -1, 8, transpose_conv_features.shape[2] *
-                transpose_conv_features.shape[3]
-            ],
-            inplace=False)
+        transpose_conv_features = fluid.layers.transpose(conv_features,
+                                                         perm=[0, 3, 1, 2])
+        sliced_feature = fluid.layers.reshape(transpose_conv_features, [
+            -1, 8,
+            transpose_conv_features.shape[2] * transpose_conv_features.shape[3]
+        ],
+                                              inplace=False)
         fc_1 = self.fc_1_layer(sliced_feature)
         fc_2 = self.fc_2_layer(sliced_feature)
         gru_forward = self.gru_forward_layer(fc_1)
 
         gru_backward = self.gru_backward_layer(fc_2)
 
-        encoded_vector = fluid.layers.concat(
-            input=[gru_forward, gru_backward], axis=2)
+        encoded_vector = fluid.layers.concat(input=[gru_forward, gru_backward],
+                                             axis=2)
 
         encoded_proj = self.encoded_proj_fc(encoded_vector)
 
@@ -263,11 +270,14 @@ def forward(self, inputs):
 
 
 class SimpleAttention(fluid.dygraph.Layer):
+
     def __init__(self, decoder_size):
         super(SimpleAttention, self).__init__()
 
-        self.fc_1 = Linear(
-            decoder_size, decoder_size, act=None, bias_attr=False)
+        self.fc_1 = Linear(decoder_size,
+                           decoder_size,
+                           act=None,
+                           bias_attr=False)
         self.fc_2 = Linear(decoder_size, 1, act=None, bias_attr=False)
 
     def forward(self, encoder_vec, encoder_proj, decoder_state):
@@ -288,26 +298,33 @@ def forward(self, encoder_vec, encoder_proj, decoder_state):
             inplace=False)
 
         weights_reshape = fluid.layers.softmax(weights_reshape)
-        scaled = fluid.layers.elementwise_mul(
-            x=encoder_vec, y=weights_reshape, axis=0)
+        scaled = fluid.layers.elementwise_mul(x=encoder_vec,
+                                              y=weights_reshape,
+                                              axis=0)
         context = fluid.layers.reduce_sum(scaled, dim=1)
 
         return context
 
 
 class GRUDecoderWithAttention(fluid.dygraph.Layer):
+
     def __init__(self, decoder_size, num_classes):
         super(GRUDecoderWithAttention, self).__init__()
         self.simple_attention = SimpleAttention(decoder_size)
 
-        self.fc_1_layer = Linear(
-            Config.encoder_size * 2, decoder_size * 3, bias_attr=False)
-        self.fc_2_layer = Linear(
-            decoder_size, decoder_size * 3, bias_attr=False)
-        self.gru_unit = GRUUnit(
-            size=decoder_size * 3, param_attr=None, bias_attr=None)
-        self.out_layer = Linear(
-            decoder_size, num_classes + 2, bias_attr=None, act='softmax')
+        self.fc_1_layer = Linear(Config.encoder_size * 2,
+                                 decoder_size * 3,
+                                 bias_attr=False)
+        self.fc_2_layer = Linear(decoder_size,
+                                 decoder_size * 3,
+                                 bias_attr=False)
+        self.gru_unit = GRUUnit(size=decoder_size * 3,
+                                param_attr=None,
+                                bias_attr=None)
+        self.out_layer = Linear(decoder_size,
+                                num_classes + 2,
+                                bias_attr=None,
+                                act='softmax')
 
         self.decoder_size = decoder_size
 
@@ -316,10 +333,13 @@ def forward(self, target_embedding, encoder_vec, encoder_proj,
         res = []
         hidden_mem = decoder_boot
         for i in range(target_embedding.shape[1]):
-            current_word = fluid.layers.slice(
-                target_embedding, axes=[1], starts=[i], ends=[i + 1])
-            current_word = fluid.layers.reshape(
-                current_word, [-1, current_word.shape[2]], inplace=False)
+            current_word = fluid.layers.slice(target_embedding,
+                                              axes=[1],
+                                              starts=[i],
+                                              ends=[i + 1])
+            current_word = fluid.layers.reshape(current_word,
+                                                [-1, current_word.shape[2]],
+                                                inplace=False)
 
             context = self.simple_attention(encoder_vec, encoder_proj,
                                             hidden_mem)
@@ -338,14 +358,14 @@ def forward(self, target_embedding, encoder_vec, encoder_proj,
 
 
 class OCRAttention(fluid.dygraph.Layer):
+
     def __init__(self):
         super(OCRAttention, self).__init__()
         self.encoder_net = EncoderNet()
-        self.fc = Linear(
-            Config.encoder_size,
-            Config.decoder_size,
-            bias_attr=False,
-            act='relu')
+        self.fc = Linear(Config.encoder_size,
+                         Config.decoder_size,
+                         bias_attr=False,
+                         act='relu')
         self.embedding = Embedding(
             [Config.num_classes + 2, Config.word_vector_dim], dtype='float32')
         self.gru_decoder_with_attention = GRUDecoderWithAttention(
@@ -353,10 +373,13 @@ def __init__(self):
 
     def forward(self, inputs, label_in):
         gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
-        backward_first = fluid.layers.slice(
-            gru_backward, axes=[1], starts=[0], ends=[1])
-        backward_first = fluid.layers.reshape(
-            backward_first, [-1, backward_first.shape[2]], inplace=False)
+        backward_first = fluid.layers.slice(gru_backward,
+                                            axes=[1],
+                                            starts=[0],
+                                            ends=[1])
+        backward_first = fluid.layers.reshape(backward_first,
+                                              [-1, backward_first.shape[2]],
+                                              inplace=False)
         decoder_boot = self.fc(backward_first)
         label_in = fluid.layers.reshape(label_in, [-1], inplace=False)
         trg_embedding = self.embedding(label_in)
@@ -365,13 +388,15 @@ def forward(self, inputs, label_in):
             trg_embedding, [-1, Config.max_length, trg_embedding.shape[1]],
             inplace=False)
 
-        prediction = self.gru_decoder_with_attention(
-            trg_embedding, encoded_vector, encoded_proj, decoder_boot)
+        prediction = self.gru_decoder_with_attention(trg_embedding,
+                                                     encoded_vector,
+                                                     encoded_proj, decoder_boot)
 
         return prediction
 
 
 class TestDygraphOCRAttention(unittest.TestCase):
+
     def test_ocr_test(self):
         seed = 90
         epoch_num = 1
@@ -383,23 +408,23 @@ def test_ocr_test(self):
         image_np = np.random.randn(Config.batch_size, Config.DATA_SHAPE[0],
                                    Config.DATA_SHAPE[1],
                                    Config.DATA_SHAPE[2]).astype('float32')
-        label_in_np = np.arange(
-            0, Config.max_length,
-            dtype='int64').reshape([1, Config.max_length])
+        label_in_np = np.arange(0, Config.max_length,
+                                dtype='int64').reshape([1, Config.max_length])
         for i in range(2, Config.batch_size + 1):
-            label_in_np = np.vstack((label_in_np, np.arange(
-                (i - 1) * Config.max_length,
-                i * Config.max_length,
-                dtype='int64').reshape([1, Config.max_length])))
-
-        label_out_np = np.arange(
-            0, Config.max_length,
-            dtype='int64').reshape([1, Config.max_length])
+            label_in_np = np.vstack(
+                (label_in_np,
+                 np.arange((i - 1) * Config.max_length,
+                           i * Config.max_length,
+                           dtype='int64').reshape([1, Config.max_length])))
+
+        label_out_np = np.arange(0, Config.max_length,
+                                 dtype='int64').reshape([1, Config.max_length])
         for i in range(2, Config.batch_size + 1):
-            label_out_np = np.vstack((label_out_np, np.arange(
-                (i - 1) * Config.max_length,
-                i * Config.max_length,
-                dtype='int64').reshape([1, Config.max_length])))
+            label_out_np = np.vstack(
+                (label_out_np,
+                 np.arange((i - 1) * Config.max_length,
+                           i * Config.max_length,
+                           dtype='int64').reshape([1, Config.max_length])))
 
         def run_dygraph():
             fluid.set_flags({'FLAGS_sort_sum_gradient': True})
@@ -424,12 +449,12 @@ def run_dygraph():
                     label_out.stop_gradient = True
                     img = to_variable(image_np)
                     dy_prediction = ocr_attention(img, label_in)
-                    label_out = fluid.layers.reshape(
-                        label_out, [-1, 1], inplace=False)
+                    label_out = fluid.layers.reshape(label_out, [-1, 1],
+                                                     inplace=False)
                     dy_prediction = fluid.layers.reshape(
                         dy_prediction, [label_out.shape[0], -1], inplace=False)
-                    loss = fluid.layers.cross_entropy(
-                        input=dy_prediction, label=label_out)
+                    loss = fluid.layers.cross_entropy(input=dy_prediction,
+                                                      label=label_out)
                     avg_loss = fluid.layers.reduce_sum(loss)
 
                     dy_out = avg_loss.numpy()
@@ -442,10 +467,10 @@ def run_dygraph():
                     dy_grad_value = {}
                     for param in ocr_attention.parameters():
                         if param.trainable:
-                            np_array = np.array(param._grad_ivar().value()
-                                                .get_tensor())
-                            dy_grad_value[param.name + core.grad_var_suffix(
-                            )] = np_array
+                            np_array = np.array(
+                                param._grad_ivar().value().get_tensor())
+                            dy_grad_value[param.name +
+                                          core.grad_var_suffix()] = np_array
 
                     optimizer.minimize(avg_loss)
                     ocr_attention.clear_gradients()
@@ -478,12 +503,17 @@ def run_dygraph():
 
             optimizer = fluid.optimizer.SGD(learning_rate=0.001)
 
-            images = fluid.layers.data(
-                name='pixel', shape=Config.DATA_SHAPE, dtype='float32')
-            static_label_in = fluid.layers.data(
-                name='label_in', shape=[1], dtype='int64', lod_level=0)
-            static_label_out = fluid.layers.data(
-                name='label_out', shape=[1], dtype='int64', lod_level=0)
+            images = fluid.layers.data(name='pixel',
+                                       shape=Config.DATA_SHAPE,
+                                       dtype='float32')
+            static_label_in = fluid.layers.data(name='label_in',
+                                                shape=[1],
+                                                dtype='int64',
+                                                lod_level=0)
+            static_label_out = fluid.layers.data(name='label_out',
+                                                 shape=[1],
+                                                 dtype='int64',
+                                                 lod_level=0)
             static_label_out.stop_gradient = True
             static_label_out.trainable = False
 
@@ -492,8 +522,8 @@ def run_dygraph():
             static_prediction = fluid.layers.reshape(
                 static_prediction, shape=[-1, Config.num_classes + 2])
 
-            cost = fluid.layers.cross_entropy(
-                input=static_prediction, label=static_label_out)
+            cost = fluid.layers.cross_entropy(input=static_prediction,
+                                              label=static_label_out)
             static_avg_loss = fluid.layers.reduce_sum(cost)
             # param_grad_list = fluid.backward.append_backward(static_avg_loss)
             optimizer.minimize(static_avg_loss)
@@ -532,8 +562,8 @@ def run_dygraph():
                     static_grad_value = {}
                     static_out = out[0]
                     for i in range(1, len(static_param_name_list) + 1):
-                        static_param_value[static_param_name_list[i - 1]] = out[
-                            i]
+                        static_param_value[static_param_name_list[i -
+                                                                  1]] = out[i]
                     grad_start_pos = len(static_param_name_list) + 1
                     for i in range(grad_start_pos,
                                    len(static_grad_name_list) + grad_start_pos):
@@ -556,8 +586,7 @@ def run_dygraph():
 
         for key, value in six.iteritems(static_param_value):
             self.assertTrue(
-                np.allclose(
-                    value, eager_param_value[key], rtol=1e-05))
+                np.allclose(value, eager_param_value[key], rtol=1e-05))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index 950416996874d..d7b55215ae703 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -35,6 +35,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -48,6 +49,7 @@ def forward(self, inputs):
 
 
 class TestImperativeOptimizerBase(unittest.TestCase):
+
     def setUp(self):
         self.batch_num = 20
 
@@ -58,6 +60,7 @@ def get_optimizer(self):
         raise NotImplementedError()
 
     def reader_decorator(self, reader):
+
         def _reader_imple():
             for item in reader():
                 image = np.array(item[0]).reshape(1, 784)
@@ -70,8 +73,8 @@ def _check_exception(self, exception_message, place=None):
         seed = 90
         batch_size = 128
         if place == None:
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
         with fluid.dygraph.guard(place):
             try:
@@ -88,8 +91,8 @@ def _check_mlp(self, place=None):
         batch_size = 128
 
         if place == None:
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
         with fluid.dygraph.guard(place):
             paddle.seed(seed)
@@ -101,10 +104,10 @@ def _check_mlp(self, place=None):
 
             batch_py_reader = fluid.io.PyReader(capacity=1)
             batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
+                paddle.batch(self.reader_decorator(
+                    paddle.dataset.mnist.train()),
+                             batch_size=batch_size,
+                             drop_last=True),
                 places=fluid.CPUPlace())
 
             dy_param_init_value = {}
@@ -137,18 +140,20 @@ def _check_mlp(self, place=None):
             paddle.framework.random._manual_program_seed(seed)
 
             if place == None:
-                place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-                ) else fluid.CUDAPlace(0)
+                place = fluid.CPUPlace(
+                ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
             exe = fluid.Executor(place)
 
             mlp = MLP()
             optimizer = self.get_optimizer()
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=128,
+                                        drop_last=True)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[1, 28, 28],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             img = fluid.layers.reshape(img, shape=[batch_size, 784])
             cost = mlp(img)
@@ -173,14 +178,16 @@ def _check_mlp(self, place=None):
 
                 static_x_data = np.array(
                     [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [128, 1])
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape([128, 1])
 
                 fetch_list = [avg_loss.name]
                 fetch_list.extend(static_param_name_list)
                 out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
+                              feed={
+                                  "pixel": static_x_data,
+                                  "label": y_data
+                              },
                               fetch_list=fetch_list)
 
                 static_param_value = {}
@@ -199,20 +206,18 @@ def _check_mlp(self, place=None):
         for key, value in six.iteritems(static_param_value):
             if core.is_compiled_with_rocm():
                 self.assertTrue(
-                    np.allclose(
-                        value, dy_param_value[key], atol=1e-3))
+                    np.allclose(value, dy_param_value[key], atol=1e-3))
             else:
                 self.assertTrue(np.allclose(value, dy_param_value[key]))
 
 
 class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         bd = [3, 6, 9]
-        optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd,
-                values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
-            parameter_list=parameter_list)
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
+            boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
+                                 parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -231,22 +236,22 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
-    def get_optimizer_dygraph(self, parameter_list):
-        optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.natural_exp_decay(
-                learning_rate=0.1,
-                decay_steps=10000,
-                decay_rate=0.5,
-                staircase=True),
-            parameter_list=parameter_list)
-        return optimizer
 
-    def get_optimizer(self):
+    def get_optimizer_dygraph(self, parameter_list):
         optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
             learning_rate=0.1,
             decay_steps=10000,
             decay_rate=0.5,
-            staircase=True))
+            staircase=True),
+                                 parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.natural_exp_decay(learning_rate=0.1,
+                                                         decay_steps=10000,
+                                                         decay_rate=0.5,
+                                                         staircase=True))
         return optimizer
 
     def func_test_sgd(self):
@@ -259,22 +264,22 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
-    def get_optimizer_dygraph(self, parameter_list):
-        optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=0.1,
-                decay_steps=10000,
-                decay_rate=0.5,
-                staircase=True),
-            parameter_list=parameter_list)
-        return optimizer
 
-    def get_optimizer(self):
+    def get_optimizer_dygraph(self, parameter_list):
         optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
             learning_rate=0.1,
             decay_steps=10000,
             decay_rate=0.5,
-            staircase=True))
+            staircase=True),
+                                 parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(
+            learning_rate=fluid.layers.exponential_decay(learning_rate=0.1,
+                                                         decay_steps=10000,
+                                                         decay_rate=0.5,
+                                                         staircase=True))
         return optimizer
 
     def func_test_sgd(self):
@@ -287,22 +292,22 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
-    def get_optimizer_dygraph(self, parameter_list):
-        optimizer = Adam(
-            learning_rate=fluid.layers.inverse_time_decay(
-                learning_rate=0.1,
-                decay_steps=10000,
-                decay_rate=0.5,
-                staircase=True),
-            parameter_list=parameter_list)
-        return optimizer
 
-    def get_optimizer(self):
+    def get_optimizer_dygraph(self, parameter_list):
         optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
             learning_rate=0.1,
             decay_steps=10000,
             decay_rate=0.5,
-            staircase=True))
+            staircase=True),
+                         parameter_list=parameter_list)
+        return optimizer
+
+    def get_optimizer(self):
+        optimizer = Adam(
+            learning_rate=fluid.layers.inverse_time_decay(learning_rate=0.1,
+                                                          decay_steps=10000,
+                                                          decay_rate=0.5,
+                                                          staircase=True))
         return optimizer
 
     def func_test_adam(self):
@@ -315,11 +320,11 @@ def test_adam(self):
 
 
 class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.polynomial_decay(
-                learning_rate=0.1, decay_steps=5, cycle=self.cycle),
-            parameter_list=parameter_list)
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
+            learning_rate=0.1, decay_steps=5, cycle=self.cycle),
+                                 parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -347,11 +352,11 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.cosine_decay(
-                learning_rate=0.1, step_each_epoch=10000, epochs=120),
-            parameter_list=parameter_list)
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
+            learning_rate=0.1, step_each_epoch=10000, epochs=120),
+                                 parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -369,11 +374,11 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = SGDOptimizer(
-            learning_rate=fluid.layers.noam_decay(
-                d_model=512, warmup_steps=8000),
-            parameter_list=parameter_list)
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
+            d_model=512, warmup_steps=8000),
+                                 parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -391,6 +396,7 @@ def test_sgd(self):
 
 
 class TestOptimizerLearningRate(unittest.TestCase):
+
     def func_test_constant_lr(self):
         with fluid.dygraph.guard():
             a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
@@ -403,12 +409,12 @@ def func_test_constant_lr(self):
 
             loss = fluid.layers.reduce_mean(b)
 
-            adam = fluid.optimizer.Adam(
-                0.001, parameter_list=linear.parameters())
+            adam = fluid.optimizer.Adam(0.001,
+                                        parameter_list=linear.parameters())
 
             self.assertTrue(
-                np.allclose(
-                    adam.current_step_lr(), 0.001, rtol=1e-06, atol=0.0))
+                np.allclose(adam.current_step_lr(), 0.001, rtol=1e-06,
+                            atol=0.0))
 
             for i in range(10):
                 adam.minimize(loss)
@@ -436,13 +442,12 @@ def func_test_lr_decay(self):
             bd = [2, 4, 6, 8]
             value = [0.2, 0.4, 0.6, 0.8, 1.0]
 
-            adam = fluid.optimizer.Adam(
-                fluid.dygraph.PiecewiseDecay(bd, value, 0),
-                parameter_list=linear.parameters())
+            adam = fluid.optimizer.Adam(fluid.dygraph.PiecewiseDecay(
+                bd, value, 0),
+                                        parameter_list=linear.parameters())
 
             self.assertTrue(
-                np.allclose(
-                    adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0))
+                np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0))
 
             ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
             for i in range(12):
@@ -469,17 +474,15 @@ def func_test_lr_decay_natural_exp(self):
             loss = fluid.layers.reduce_mean(b)
             base_lr = 1.0
 
-            adam = fluid.optimizer.Adam(
-                fluid.dygraph.NaturalExpDecay(
-                    learning_rate=base_lr,
-                    decay_steps=3,
-                    decay_rate=0.5,
-                    staircase=True),
-                parameter_list=linear.parameters())
+            adam = fluid.optimizer.Adam(fluid.dygraph.NaturalExpDecay(
+                learning_rate=base_lr,
+                decay_steps=3,
+                decay_rate=0.5,
+                staircase=True),
+                                        parameter_list=linear.parameters())
 
             self.assertTrue(
-                np.allclose(
-                    adam.current_step_lr(), 1.0, rtol=1e-06, atol=0.0))
+                np.allclose(adam.current_step_lr(), 1.0, rtol=1e-06, atol=0.0))
 
             ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)]
             for i in range(5):
@@ -513,24 +516,23 @@ def func_test_set_lr(self):
                 adam.minimize(loss)
                 lr = adam.current_step_lr()
                 self.assertTrue(
-                    np.allclose(
-                        lr, lr_list[i], rtol=1e-06, atol=0.0))
+                    np.allclose(lr, lr_list[i], rtol=1e-06, atol=0.0))
 
-            lr_var = fluid.layers.create_global_var(
-                shape=[1], value=0.7, dtype='float32')
+            lr_var = fluid.layers.create_global_var(shape=[1],
+                                                    value=0.7,
+                                                    dtype='float32')
             adam.set_lr(lr_var)
             adam.minimize(loss)
             lr = adam.current_step_lr()
             self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0))
 
             with self.assertRaises(RuntimeError):
-                adam = fluid.optimizer.Adam(
-                    fluid.dygraph.NaturalExpDecay(
-                        learning_rate=0.1,
-                        decay_steps=3,
-                        decay_rate=0.5,
-                        staircase=True),
-                    parameter_list=linear.parameters())
+                adam = fluid.optimizer.Adam(fluid.dygraph.NaturalExpDecay(
+                    learning_rate=0.1,
+                    decay_steps=3,
+                    decay_rate=0.5,
+                    staircase=True),
+                                            parameter_list=linear.parameters())
                 adam.set_lr(0.01)
 
     def test_set_lr(self):
@@ -540,9 +542,11 @@ def test_set_lr(self):
 
 
 class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        optimizer = MomentumOptimizer(learning_rate=0.001,
+                                      momentum=0.9,
+                                      parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -559,9 +563,11 @@ def test_momentum(self):
 
 
 class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = LarsMomentumOptimizer(
-            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        optimizer = LarsMomentumOptimizer(learning_rate=0.001,
+                                          momentum=0.9,
+                                          parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -578,9 +584,10 @@ def test_larsmomentum(self):
 
 
 class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = AdagradOptimizer(
-            learning_rate=0.2, parameter_list=parameter_list)
+        optimizer = AdagradOptimizer(learning_rate=0.2,
+                                     parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -597,9 +604,10 @@ def test_adagrad(self):
 
 
 class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = AdamaxOptimizer(
-            learning_rate=0.2, parameter_list=parameter_list)
+        optimizer = AdamaxOptimizer(learning_rate=0.2,
+                                    parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -616,19 +624,21 @@ def test_adamax(self):
 
 
 class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = DpsgdOptimizer(
-            learning_rate=0.01,
-            clip=10.0,
-            batch_size=16.0,
-            sigma=1.0,
-            parameter_list=parameter_list)
+        optimizer = DpsgdOptimizer(learning_rate=0.01,
+                                   clip=10.0,
+                                   batch_size=16.0,
+                                   sigma=1.0,
+                                   parameter_list=parameter_list)
         optimizer._seed = 100
         return optimizer
 
     def get_optimizer(self):
-        optimizer = DpsgdOptimizer(
-            learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
+        optimizer = DpsgdOptimizer(learning_rate=0.01,
+                                   clip=10.0,
+                                   batch_size=16.0,
+                                   sigma=1.0)
         optimizer._seed = 100
         return optimizer
 
@@ -642,9 +652,10 @@ def test_dpsgd(self):
 
 
 class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = DecayedAdagradOptimizer(
-            learning_rate=0.2, parameter_list=parameter_list)
+        optimizer = DecayedAdagradOptimizer(learning_rate=0.2,
+                                            parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -661,17 +672,18 @@ def test_decayadagrad(self):
 
 
 class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = AdadeltaOptimizer(
-            learning_rate=0.0003,
-            epsilon=1.0e-6,
-            rho=0.95,
-            parameter_list=parameter_list)
+        optimizer = AdadeltaOptimizer(learning_rate=0.0003,
+                                      epsilon=1.0e-6,
+                                      rho=0.95,
+                                      parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
-        optimizer = AdadeltaOptimizer(
-            learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+        optimizer = AdadeltaOptimizer(learning_rate=0.0003,
+                                      epsilon=1.0e-6,
+                                      rho=0.95)
         return optimizer
 
     def func_test_adadelta(self):
@@ -684,9 +696,10 @@ def test_adadelta(self):
 
 
 class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = RMSPropOptimizer(
-            learning_rate=0.1, parameter_list=parameter_list)
+        optimizer = RMSPropOptimizer(learning_rate=0.1,
+                                     parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -703,9 +716,10 @@ def test_rmsprop(self):
 
 
 class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = FtrlOptimizer(
-            learning_rate=0.1, parameter_list=parameter_list)
+        optimizer = FtrlOptimizer(learning_rate=0.1,
+                                  parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -726,16 +740,16 @@ def exclude_fn(param):
 
 
 class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = LambOptimizer(
-            learning_rate=0.002,
-            exclude_from_weight_decay_fn=exclude_fn,
-            parameter_list=parameter_list)
+        optimizer = LambOptimizer(learning_rate=0.002,
+                                  exclude_from_weight_decay_fn=exclude_fn,
+                                  parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
-        optimizer = LambOptimizer(
-            learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn)
+        optimizer = LambOptimizer(learning_rate=0.002,
+                                  exclude_from_weight_decay_fn=exclude_fn)
         return optimizer
 
     # should fix: may fail in CI-windows
@@ -744,9 +758,11 @@ def _test_lamb(self):
 
 
 class TestImperativeModelAverage(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = ModelAverage(
-            0.15, min_average_window=10000, max_average_window=12500)
+        optimizer = ModelAverage(0.15,
+                                 min_average_window=10000,
+                                 max_average_window=12500)
         return optimizer
 
     def func_test_modelaverage(self):
@@ -760,13 +776,13 @@ def test_modelaverage(self):
 
 
 class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = DGCMomentumOptimizer(
-            learning_rate=0.0001,
-            momentum=0.9,
-            rampup_step=1000,
-            rampup_begin_step=1252,
-            sparsity=[0.999, 0.999])
+        optimizer = DGCMomentumOptimizer(learning_rate=0.0001,
+                                         momentum=0.9,
+                                         rampup_step=1000,
+                                         rampup_begin_step=1252,
+                                         sparsity=[0.999, 0.999])
         return optimizer
 
     def func_test_dgcmomentum(self):
@@ -780,6 +796,7 @@ def test_dgcmomentum(self):
 
 
 class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = ExponentialMovingAverage(0.999)
         return optimizer
@@ -795,6 +812,7 @@ def test_exponentialmoving(self):
 
 
 class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = fluid.optimizer.SGD(learning_rate=0.5,
                                         parameter_list=parameter_list)
@@ -812,6 +830,7 @@ def test_pipline(self):
 
 
 class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = fluid.optimizer.SGD(learning_rate=0.5,
                                         parameter_list=parameter_list)
@@ -829,6 +848,7 @@ def test_lookahead(self):
 
 
 class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = fluid.optimizer.SGD(learning_rate=0.5,
                                         parameter_list=parameter_list)
@@ -846,15 +866,16 @@ def test_recompute(self):
 
 
 class TestImperativeOptimizerList(unittest.TestCase):
+
     def func_test_parameter_list(self):
         with fluid.dygraph.guard():
             linear_1 = Linear(10, 10)
             linear_2 = Linear(10, 10)
 
-            sgd = SGDOptimizer(
-                1.0,
-                parameter_list=itertools.chain(linear_1.parameters(),
-                                               linear_2.parameters()))
+            sgd = SGDOptimizer(1.0,
+                               parameter_list=itertools.chain(
+                                   linear_1.parameters(),
+                                   linear_2.parameters()))
 
             in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             in_data = fluid.dygraph.to_variable(in_np)
@@ -866,8 +887,8 @@ def func_test_parameter_list(self):
             sgd.minimize(loss)
 
             self.assertTrue(
-                len(sgd._parameter_list) ==
-                len(linear_1.parameters() + linear_2.parameters()))
+                len(sgd._parameter_list) == len(linear_1.parameters() +
+                                                linear_2.parameters()))
 
     def test_parameter_list(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
index b27ce6bb01f86..2bcf0b97bf859 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py
@@ -35,6 +35,7 @@
 
 
 class MLP(fluid.Layer):
+
     def __init__(self, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()
 
@@ -48,6 +49,7 @@ def forward(self, inputs):
 
 
 class TestImperativeOptimizerBase(unittest.TestCase):
+
     def setUp(self):
         self.batch_num = 20
 
@@ -58,6 +60,7 @@ def get_optimizer(self):
         raise NotImplementedError()
 
     def reader_decorator(self, reader):
+
         def _reader_imple():
             for item in reader():
                 image = np.array(item[0]).reshape(1, 784)
@@ -70,8 +73,8 @@ def _check_exception(self, exception_message, place=None):
         seed = 90
         batch_size = 128
         if place == None:
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
         try:
             paddle.disable_static()
@@ -90,8 +93,8 @@ def _check_mlp(self, place=None):
         batch_size = 128
 
         if place == None:
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
         paddle.disable_static(place)
         paddle.seed(seed)
@@ -101,12 +104,11 @@ def _check_mlp(self, place=None):
         optimizer = self.get_optimizer_dygraph(parameter_list=mlp.parameters())
 
         batch_py_reader = fluid.io.PyReader(capacity=1)
-        batch_py_reader.decorate_sample_list_generator(
-            paddle.batch(
-                self.reader_decorator(paddle.dataset.mnist.train()),
-                batch_size=batch_size,
-                drop_last=True),
-            places=fluid.CPUPlace())
+        batch_py_reader.decorate_sample_list_generator(paddle.batch(
+            self.reader_decorator(paddle.dataset.mnist.train()),
+            batch_size=batch_size,
+            drop_last=True),
+                                                       places=fluid.CPUPlace())
 
         dy_param_init_value = {}
         for batch_id, data in enumerate(batch_py_reader()):
@@ -147,18 +149,20 @@ def _check_mlp(self, place=None):
             paddle.framework.random._manual_program_seed(seed)
 
             if place == None:
-                place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-                ) else fluid.CUDAPlace(0)
+                place = fluid.CPUPlace(
+                ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
             exe = fluid.Executor(place)
 
             mlp = MLP()
             optimizer = self.get_optimizer()
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=128,
+                                        drop_last=True)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[1, 28, 28],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             img = fluid.layers.reshape(img, shape=[batch_size, 784])
             cost = mlp(img)
@@ -183,14 +187,16 @@ def _check_mlp(self, place=None):
 
                 static_x_data = np.array(
                     [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [128, 1])
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape([128, 1])
 
                 fetch_list = [avg_loss.name]
                 fetch_list.extend(static_param_name_list)
                 out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
+                              feed={
+                                  "pixel": static_x_data,
+                                  "label": y_data
+                              },
                               fetch_list=fetch_list)
                 if isinstance(optimizer._learning_rate,
                               paddle.optimizer.lr.LRScheduler):
@@ -216,13 +222,13 @@ def _check_mlp(self, place=None):
         for key, value in six.iteritems(static_param_value):
             if core.is_compiled_with_rocm():
                 self.assertTrue(
-                    np.allclose(
-                        value, dy_param_value[key], atol=1e-3))
+                    np.allclose(value, dy_param_value[key], atol=1e-3))
             else:
                 self.assertTrue(np.allclose(value, dy_param_value[key]))
 
 
 class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         bd = [3, 6, 9]
         optimizer = paddle.optimizer.SGD(
@@ -250,17 +256,18 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.NaturalExpDecay(
-                learning_rate=0.5, gamma=0.9),
+            learning_rate=paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5,
+                                                              gamma=0.9),
             parameters=parameter_list)
         return optimizer
 
     def get_optimizer(self):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.NaturalExpDecay(
-                learning_rate=0.5, gamma=0.9))
+            learning_rate=paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5,
+                                                              gamma=0.9))
         return optimizer
 
     def func_test_sgd(self):
@@ -273,6 +280,7 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
             learning_rate=paddle.optimizer.lr.ExponentialDecay(
@@ -296,6 +304,7 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.Adam(
             learning_rate=paddle.optimizer.lr.InverseTimeDecay(
@@ -319,10 +328,12 @@ def test_adam(self):
 
 
 class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.PolynomialDecay(
-                learning_rate=0.5, decay_steps=5, cycle=self.cycle),
+            learning_rate=paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5,
+                                                              decay_steps=5,
+                                                              cycle=self.cycle),
             parameters=parameter_list)
         return optimizer
 
@@ -352,6 +363,7 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerCosineAnnealingDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
             learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
@@ -375,17 +387,19 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.NoamDecay(
-                d_model=0.01, warmup_steps=100, verbose=True),
+            learning_rate=paddle.optimizer.lr.NoamDecay(d_model=0.01,
+                                                        warmup_steps=100,
+                                                        verbose=True),
             parameters=parameter_list)
         return optimizer
 
     def get_optimizer(self):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.NoamDecay(
-                d_model=0.01, warmup_steps=100))
+            learning_rate=paddle.optimizer.lr.NoamDecay(d_model=0.01,
+                                                        warmup_steps=100))
         return optimizer
 
     def func_test_sgd(self):
@@ -398,6 +412,7 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerLambdaDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
             learning_rate=paddle.optimizer.lr.LambdaDecay(
@@ -421,21 +436,23 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerLinearWarmup(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.LinearWarmup(
-                learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5),
+            learning_rate=paddle.optimizer.lr.LinearWarmup(learning_rate=0.5,
+                                                           warmup_steps=20,
+                                                           start_lr=0,
+                                                           end_lr=0.5),
             parameters=parameter_list)
         return optimizer
 
     def get_optimizer(self):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.LinearWarmup(
-                learning_rate=0.5,
-                warmup_steps=20,
-                start_lr=0,
-                end_lr=0.5,
-                verbose=True))
+            learning_rate=paddle.optimizer.lr.LinearWarmup(learning_rate=0.5,
+                                                           warmup_steps=20,
+                                                           start_lr=0,
+                                                           end_lr=0.5,
+                                                           verbose=True))
         return optimizer
 
     def func_test_sgd(self):
@@ -448,6 +465,7 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerMultiStepDecay(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
             learning_rate=paddle.optimizer.lr.MultiStepDecay(
@@ -471,10 +489,12 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerStepLR(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
-            learning_rate=paddle.optimizer.lr.StepDecay(
-                learning_rate=0.5, step_size=5, gamma=0.8),
+            learning_rate=paddle.optimizer.lr.StepDecay(learning_rate=0.5,
+                                                        step_size=5,
+                                                        gamma=0.8),
             parameters=parameter_list)
         return optimizer
 
@@ -494,6 +514,7 @@ def test_sgd(self):
 
 
 class TestImperativeOptimizerReduceOnPlateau(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(
             learning_rate=paddle.optimizer.lr.ReduceOnPlateau(
@@ -517,6 +538,7 @@ def test_sgd(self):
 
 
 class TestOptimizerLearningRate(unittest.TestCase):
+
     def func_test_constant_lr(self):
         with fluid.dygraph.guard():
             a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
@@ -532,8 +554,7 @@ def func_test_constant_lr(self):
             adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())
 
             self.assertTrue(
-                np.allclose(
-                    adam.get_lr(), 0.001, rtol=1e-06, atol=0.0))
+                np.allclose(adam.get_lr(), 0.001, rtol=1e-06, atol=0.0))
 
             for i in range(10):
                 adam.minimize(loss)
@@ -562,12 +583,11 @@ def func_test_lr_decay(self):
             value = [0.2, 0.4, 0.6, 0.8, 1.0]
 
             scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value)
-            adam = paddle.optimizer.Adam(
-                scheduler, parameters=linear.parameters())
+            adam = paddle.optimizer.Adam(scheduler,
+                                         parameters=linear.parameters())
 
             self.assertTrue(
-                np.allclose(
-                    adam.get_lr(), 0.2, rtol=1e-06, atol=0.0))
+                np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0))
 
             ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
             for i in range(12):
@@ -593,12 +613,11 @@ def func_test_lr_scheduler_natural_exp(self):
             base_lr = 1.0
 
             scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5)
-            adam = paddle.optimizer.Adam(
-                scheduler, parameters=linear.parameters())
+            adam = paddle.optimizer.Adam(scheduler,
+                                         parameters=linear.parameters())
 
             self.assertTrue(
-                np.allclose(
-                    adam.get_lr(), 1.0, rtol=1e-06, atol=0.0))
+                np.allclose(adam.get_lr(), 1.0, rtol=1e-06, atol=0.0))
 
             ret = [1.0, np.exp(-0.5), np.exp(-1)]
             for i in range(3):
@@ -632,18 +651,18 @@ def func_test_set_lr(self):
                 adam.minimize(loss)
                 lr = adam.get_lr()
                 self.assertTrue(
-                    np.allclose(
-                        lr, lr_list[i], rtol=1e-06, atol=0.0))
+                    np.allclose(lr, lr_list[i], rtol=1e-06, atol=0.0))
 
             with self.assertRaises(TypeError):
-                lr_var = fluid.layers.create_global_var(
-                    shape=[1], value=0.7, dtype='float32')
+                lr_var = fluid.layers.create_global_var(shape=[1],
+                                                        value=0.7,
+                                                        dtype='float32')
                 adam.set_lr(lr_var)
 
             with self.assertRaises(RuntimeError):
                 adam = paddle.optimizer.Adam(
-                    paddle.optimizer.lr.NaturalExpDecay(
-                        learning_rate=0.1, gamma=0.5),
+                    paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.1,
+                                                        gamma=0.5),
                     parameters=linear.parameters())
                 adam.set_lr(0.01)
 
@@ -654,9 +673,11 @@ def test_set_lr(self):
 
 
 class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = MomentumOptimizer(
-            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        optimizer = MomentumOptimizer(learning_rate=0.001,
+                                      momentum=0.9,
+                                      parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -673,9 +694,11 @@ def test_momentum(self):
 
 
 class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = LarsMomentumOptimizer(
-            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list)
+        optimizer = LarsMomentumOptimizer(learning_rate=0.001,
+                                          momentum=0.9,
+                                          parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -692,9 +715,10 @@ def test_larsmomentum(self):
 
 
 class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = AdagradOptimizer(
-            learning_rate=0.2, parameter_list=parameter_list)
+        optimizer = AdagradOptimizer(learning_rate=0.2,
+                                     parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -711,9 +735,10 @@ def test_adagrad(self):
 
 
 class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = AdamaxOptimizer(
-            learning_rate=0.2, parameter_list=parameter_list)
+        optimizer = AdamaxOptimizer(learning_rate=0.2,
+                                    parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -730,19 +755,21 @@ def test_adamax(self):
 
 
 class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = DpsgdOptimizer(
-            learning_rate=0.01,
-            clip=10.0,
-            batch_size=16.0,
-            sigma=1.0,
-            parameter_list=parameter_list)
+        optimizer = DpsgdOptimizer(learning_rate=0.01,
+                                   clip=10.0,
+                                   batch_size=16.0,
+                                   sigma=1.0,
+                                   parameter_list=parameter_list)
         optimizer._seed = 100
         return optimizer
 
     def get_optimizer(self):
-        optimizer = DpsgdOptimizer(
-            learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
+        optimizer = DpsgdOptimizer(learning_rate=0.01,
+                                   clip=10.0,
+                                   batch_size=16.0,
+                                   sigma=1.0)
         optimizer._seed = 100
         return optimizer
 
@@ -756,9 +783,10 @@ def test_dpsgd(self):
 
 
 class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = DecayedAdagradOptimizer(
-            learning_rate=0.2, parameter_list=parameter_list)
+        optimizer = DecayedAdagradOptimizer(learning_rate=0.2,
+                                            parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -775,17 +803,18 @@ def test_decayadagrad(self):
 
 
 class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = AdadeltaOptimizer(
-            learning_rate=0.0003,
-            epsilon=1.0e-6,
-            rho=0.95,
-            parameter_list=parameter_list)
+        optimizer = AdadeltaOptimizer(learning_rate=0.0003,
+                                      epsilon=1.0e-6,
+                                      rho=0.95,
+                                      parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
-        optimizer = AdadeltaOptimizer(
-            learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+        optimizer = AdadeltaOptimizer(learning_rate=0.0003,
+                                      epsilon=1.0e-6,
+                                      rho=0.95)
         return optimizer
 
     def func_test_adadelta(self):
@@ -798,9 +827,10 @@ def test_adadelta(self):
 
 
 class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = RMSPropOptimizer(
-            learning_rate=0.1, parameter_list=parameter_list)
+        optimizer = RMSPropOptimizer(learning_rate=0.1,
+                                     parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -817,9 +847,10 @@ def test_rmsprop(self):
 
 
 class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = FtrlOptimizer(
-            learning_rate=0.1, parameter_list=parameter_list)
+        optimizer = FtrlOptimizer(learning_rate=0.1,
+                                  parameter_list=parameter_list)
         return optimizer
 
     def get_optimizer(self):
@@ -840,6 +871,7 @@ def exclude_fn(param):
 
 
 class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.Lamb(
             learning_rate=0.002,
@@ -858,9 +890,11 @@ def _test_lamb(self):
 
 
 class TestImperativeModelAverage(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = ModelAverage(
-            0.15, min_average_window=10000, max_average_window=12500)
+        optimizer = ModelAverage(0.15,
+                                 min_average_window=10000,
+                                 max_average_window=12500)
         return optimizer
 
     def func_test_modelaverage(self):
@@ -874,13 +908,13 @@ def test_modelaverage(self):
 
 
 class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
-        optimizer = DGCMomentumOptimizer(
-            learning_rate=0.0001,
-            momentum=0.9,
-            rampup_step=1000,
-            rampup_begin_step=1252,
-            sparsity=[0.999, 0.999])
+        optimizer = DGCMomentumOptimizer(learning_rate=0.0001,
+                                         momentum=0.9,
+                                         rampup_step=1000,
+                                         rampup_begin_step=1252,
+                                         sparsity=[0.999, 0.999])
         return optimizer
 
     def func_test_dgcmomentum(self):
@@ -894,6 +928,7 @@ def test_dgcmomentum(self):
 
 
 class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = ExponentialMovingAverage(0.999)
         return optimizer
@@ -909,6 +944,7 @@ def test_exponentialmoving(self):
 
 
 class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
                                          parameters=parameter_list)
@@ -926,6 +962,7 @@ def test_pipline(self):
 
 
 class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
                                          parameters=parameter_list)
@@ -943,6 +980,7 @@ def test_lookahead(self):
 
 
 class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
+
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
                                          parameters=parameter_list)
@@ -960,6 +998,7 @@ def test_recompute(self):
 
 
 class TestImperativeOptimizerList(unittest.TestCase):
+
     def func_test_parameter_list(self):
         with fluid.dygraph.guard():
             linear_1 = Linear(10, 10)
@@ -980,8 +1019,8 @@ def func_test_parameter_list(self):
             sgd.minimize(loss)
 
             self.assertTrue(
-                len(sgd._parameter_list) ==
-                len(linear_1.parameters() + linear_2.parameters()))
+                len(sgd._parameter_list) == len(linear_1.parameters() +
+                                                linear_2.parameters()))
 
     def test_parameter_list(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_parallel_coalesce_split.py b/python/paddle/fluid/tests/unittests/test_imperative_parallel_coalesce_split.py
index 480df7482e305..54da2becfde48 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_parallel_coalesce_split.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_parallel_coalesce_split.py
@@ -26,6 +26,7 @@
 
 
 class MyLayer(fluid.Layer):
+
     def __init__(self, name_scope):
         super(MyLayer, self).__init__(name_scope)
 
@@ -37,6 +38,7 @@ def forward(self, inputs):
 
 
 class TestImperativeParallelCoalesceSplit(unittest.TestCase):
+
     def test_coalesce_split(self):
         with fluid.dygraph.guard():
             test_layer = MyLayer("test_layer")
@@ -47,8 +49,8 @@ def test_coalesce_split(self):
             vars = []
             vars.append(to_variable(np.random.random([2, 3]).astype("float32")))
             vars.append(to_variable(np.random.random([4, 9]).astype("float32")))
-            vars.append(
-                to_variable(np.random.random([10, 1]).astype("float32")))
+            vars.append(to_variable(
+                np.random.random([10, 1]).astype("float32")))
             var_groups = OrderedDict()
             var_groups.setdefault(0, vars)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
index cd31b13083de4..b20dcffa8a0f3 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
@@ -21,6 +21,7 @@
 
 
 class TestImperativePartitialBackward(unittest.TestCase):
+
     def func_partitial_backward(self):
         with fluid.dygraph.guard():
             x = np.random.randn(2, 4, 5).astype("float32")
@@ -39,13 +40,12 @@ def func_partitial_backward(self):
             for param in linear2.parameters():
                 self.assertIsNone(param._grad_ivar())
 
-            optimizer = fluid.optimizer.AdamOptimizer(parameter_list=(
-                linear1.parameters() + linear2.parameters()))
+            optimizer = fluid.optimizer.AdamOptimizer(
+                parameter_list=(linear1.parameters() + linear2.parameters()))
             _, params_grads = optimizer.minimize(loss)
 
-            self.assertListEqual(
-                sorted([p.name for p in linear1.parameters()]),
-                sorted([p_g[0].name for p_g in params_grads]))
+            self.assertListEqual(sorted([p.name for p in linear1.parameters()]),
+                                 sorted([p_g[0].name for p_g in params_grads]))
 
             linear1.clear_gradients()
             linear2.clear_gradients()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 234e327935a50..e5e26111381a7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -31,6 +31,7 @@
 
 
 class SimpleLSTMRNN(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  num_steps,
@@ -78,23 +79,29 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.hidden_array = []
 
         for i in range(self._num_layers):
-            pre_hidden = fluid.layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = fluid.layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = fluid.layers.reshape(
-                pre_hidden, shape=[-1, self._hidden_size])
-            pre_cell = fluid.layers.reshape(
-                pre_cell, shape=[-1, self._hidden_size])
+            pre_hidden = fluid.layers.slice(init_hidden,
+                                            axes=[0],
+                                            starts=[i],
+                                            ends=[i + 1])
+            pre_cell = fluid.layers.slice(init_cell,
+                                          axes=[0],
+                                          starts=[i],
+                                          ends=[i + 1])
+            pre_hidden = fluid.layers.reshape(pre_hidden,
+                                              shape=[-1, self._hidden_size])
+            pre_cell = fluid.layers.reshape(pre_cell,
+                                            shape=[-1, self._hidden_size])
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)
 
         res = []
         for index in range(self._num_steps):
-            self._input = fluid.layers.slice(
-                input_embedding, axes=[1], starts=[index], ends=[index + 1])
-            self._input = fluid.layers.reshape(
-                self._input, shape=[-1, self._hidden_size])
+            self._input = fluid.layers.slice(input_embedding,
+                                             axes=[1],
+                                             starts=[index],
+                                             ends=[index + 1])
+            self._input = fluid.layers.reshape(self._input,
+                                               shape=[-1, self._hidden_size])
             for k in range(self._num_layers):
                 pre_hidden = self.hidden_array[k]
                 pre_cell = self.cell_array[k]
@@ -105,8 +112,9 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                 gate_input = fluid.layers.matmul(x=nn, y=weight_1)
 
                 gate_input = fluid.layers.elementwise_add(gate_input, bias)
-                i, j, f, o = fluid.layers.split(
-                    gate_input, num_or_sections=4, dim=-1)
+                i, j, f, o = fluid.layers.split(gate_input,
+                                                num_or_sections=4,
+                                                dim=-1)
                 c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
                     i) * fluid.layers.tanh(j)
                 m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
@@ -120,8 +128,8 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                         dropout_prob=self._dropout,
                         dropout_implementation='upscale_in_train')
             res.append(
-                fluid.layers.reshape(
-                    self._input, shape=[1, -1, self._hidden_size]))
+                fluid.layers.reshape(self._input,
+                                     shape=[1, -1, self._hidden_size]))
         real_res = fluid.layers.concat(res, 0)
         real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
         last_hidden = fluid.layers.concat(self.hidden_array, 1)
@@ -136,6 +144,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
 
 
 class PtbModel(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -151,12 +160,11 @@ def __init__(self,
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        self.simple_lstm_rnn = SimpleLSTMRNN(
-            hidden_size,
-            num_steps,
-            num_layers=num_layers,
-            init_scale=init_scale,
-            dropout=dropout)
+        self.simple_lstm_rnn = SimpleLSTMRNN(hidden_size,
+                                             num_steps,
+                                             num_layers=num_layers,
+                                             init_scale=init_scale,
+                                             dropout=dropout)
         self.embedding = Embedding(
             size=[vocab_size, hidden_size],
             dtype='float32',
@@ -193,16 +201,17 @@ def forward(self, input, label, init_hidden, init_cell):
                 x_emb,
                 dropout_prob=self.drop_out,
                 dropout_implementation='upscale_in_train')
-        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
-                                                               init_c)
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
+            x_emb, init_h, init_c)
         rnn_out = fluid.layers.reshape(
             rnn_out, shape=[-1, self.num_steps, self.hidden_size])
         projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
         projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+        projection = fluid.layers.reshape(projection,
+                                          shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -211,6 +220,7 @@ def forward(self, input, label, init_hidden, init_cell):
 
 
 class TestDygraphPtbRnn(unittest.TestCase):
+
     def func_test_ptb_rnn(self):
         for is_sparse in [True, False]:
             self.ptb_rnn_cpu_float32(is_sparse)
@@ -235,16 +245,15 @@ def ptb_rnn_cpu_float32(self, is_sparse):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale,
-                is_sparse=is_sparse)
-
-            sgd = SGDOptimizer(
-                learning_rate=1e-3, parameter_list=ptb_model.parameters())
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale,
+                                 is_sparse=is_sparse)
+
+            sgd = SGDOptimizer(learning_rate=1e-3,
+                               parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -260,8 +269,8 @@ def ptb_rnn_cpu_float32(self, is_sparse):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
@@ -302,24 +311,26 @@ def ptb_rnn_cpu_float32(self, is_sparse):
         with new_program_scope():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale,
-                is_sparse=is_sparse)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale,
+                                 is_sparse=is_sparse)
 
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             sgd = SGDOptimizer(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -344,8 +355,8 @@ def ptb_rnn_cpu_float32(self, is_sparse):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 fetch_list.extend(static_param_name_list)
                 out = exe.run(fluid.default_main_program(),
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
index f659d83435433..06bca877c8775 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py
@@ -30,6 +30,7 @@
 
 
 class TestDygraphPtbRnnSortGradient(unittest.TestCase):
+
     def func_ptb_rnn_sort_gradient(self):
         for is_sparse in [True, False]:
             self.ptb_rnn_sort_gradient_cpu_float32(is_sparse)
@@ -50,16 +51,15 @@ def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
             paddle.framework.random._manual_program_seed(seed)
 
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale,
-                is_sparse=is_sparse)
-
-            sgd = SGDOptimizer(
-                learning_rate=1e-3, parameter_list=ptb_model.parameters())
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale,
+                                 is_sparse=is_sparse)
+
+            sgd = SGDOptimizer(learning_rate=1e-3,
+                               parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -73,14 +73,14 @@ def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -99,24 +99,26 @@ def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
 
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale,
-                is_sparse=is_sparse)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale,
+                                 is_sparse=is_sparse)
 
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             sgd = SGDOptimizer(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps, 1], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps, 1],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -141,8 +143,8 @@ def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 fetch_list.extend(static_param_name_list)
                 out = exe.run(fluid.default_main_program(),
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
index f12ca0a93ffd9..f59256f25f8ff 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py
@@ -29,6 +29,7 @@
 
 
 class RecurrentTest(fluid.Layer):
+
     def __init__(self, name_scope):
         super(RecurrentTest, self).__init__(name_scope)
 
@@ -39,12 +40,14 @@ def forward(self, in1, in2):
 
 
 class TestRecurrentFeed(unittest.TestCase):
+
     def test_recurrent_feed(self):
 
         seed = 90
         original_np1 = np.arange(1, 5).reshape(2, 2).astype("float32")
         original_np2 = np.arange(5, 9).reshape(2, 2).astype("float32")
         with fluid.dygraph.guard():
+            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
             original_in1 = to_variable(original_np1)
@@ -61,8 +64,10 @@ def test_recurrent_feed(self):
                 dyout = out.gradient()
                 original_in1.stop_gradient = True
                 rt.clear_gradients()
+            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
         with fluid.dygraph.guard():
+            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
             with _test_eager_guard():
                 fluid.default_startup_program().random_seed = seed
                 fluid.default_main_program().random_seed = seed
@@ -80,14 +85,17 @@ def test_recurrent_feed(self):
                     eager_dyout = out.gradient()
                     original_in1.stop_gradient = True
                     rt.clear_gradients()
+            fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            in1 = fluid.layers.data(
-                name="inp1", shape=[2, 2], append_batch_size=False)
-            in2 = fluid.layers.data(
-                name="inp2", shape=[2, 2], append_batch_size=False)
+            in1 = fluid.layers.data(name="inp1",
+                                    shape=[2, 2],
+                                    append_batch_size=False)
+            in2 = fluid.layers.data(name="inp2",
+                                    shape=[2, 2],
+                                    append_batch_size=False)
             rt1 = RecurrentTest("RecurrentTest")
             static_sum_out, static_out = rt1(in1, in2)
             fluid.backward.append_backward(static_sum_out)
@@ -98,11 +106,12 @@ def test_recurrent_feed(self):
                 0)._find_var_recursive(static_out.name + "@GRAD")
             fetch_list = [static_sum_out, static_out, static_dout]
             for i in range(3):
-                out = exe.run(
-                    fluid.default_main_program(),
-                    feed={"inp1": original_np1,
-                          "inp2": original_np2},
-                    fetch_list=fetch_list)
+                out = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "inp1": original_np1,
+                                  "inp2": original_np2
+                              },
+                              fetch_list=fetch_list)
                 static_out_value = out[1]
                 static_sum_out = out[0]
                 static_dout = out[2]
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
index 08320d04d9996..ac41f84be34cb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py
@@ -31,6 +31,7 @@
 
 
 class Policy(fluid.dygraph.Layer):
+
     def __init__(self, input_size):
         super(Policy, self).__init__()
 
@@ -51,6 +52,7 @@ def forward(self, inputs):
 
 
 class TestImperativeMnist(unittest.TestCase):
+
     def test_mnist_float32(self):
         seed = 90
         epoch_num = 1
@@ -87,8 +89,8 @@ def run_dygraph():
             loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
             loss = fluid.layers.reduce_sum(loss_probs)
 
-            sgd = SGDOptimizer(
-                learning_rate=1e-3, parameter_list=policy.parameters())
+            sgd = SGDOptimizer(learning_rate=1e-3,
+                               parameter_list=policy.parameters())
 
             dy_param_init_value = {}
 
@@ -126,12 +128,15 @@ def run_dygraph():
 
             st_sgd = SGDOptimizer(learning_rate=1e-3)
 
-            st_state = fluid.layers.data(
-                name='st_state', shape=[4], dtype='float32')
-            st_reward = fluid.layers.data(
-                name='st_reward', shape=[1], dtype='float32')
-            st_mask = fluid.layers.data(
-                name='st_mask', shape=[2], dtype='float32')
+            st_state = fluid.layers.data(name='st_state',
+                                         shape=[4],
+                                         dtype='float32')
+            st_reward = fluid.layers.data(name='st_reward',
+                                          shape=[1],
+                                          dtype='float32')
+            st_mask = fluid.layers.data(name='st_mask',
+                                        shape=[2],
+                                        dtype='float32')
 
             st_loss_probs = policy(st_state)
 
@@ -139,8 +144,8 @@ def run_dygraph():
             st_loss_probs = fluid.layers.elementwise_mul(st_loss_probs, st_mask)
             st_loss_probs = fluid.layers.reduce_sum(st_loss_probs, dim=-1)
 
-            st_loss_probs = fluid.layers.elementwise_mul(st_reward,
-                                                         st_loss_probs)
+            st_loss_probs = fluid.layers.elementwise_mul(
+                st_reward, st_loss_probs)
             st_loss = fluid.layers.reduce_sum(st_loss_probs)
 
             st_sgd.minimize(st_loss)
@@ -160,12 +165,13 @@ def run_dygraph():
             fetch_list = [st_loss.name]
             fetch_list.extend(static_param_name_list)
 
-            out = exe.run(
-                fluid.default_main_program(),
-                feed={"st_state": state,
-                      "st_reward": reward,
-                      "st_mask": mask},
-                fetch_list=fetch_list)
+            out = exe.run(fluid.default_main_program(),
+                          feed={
+                              "st_state": state,
+                              "st_reward": reward,
+                              "st_mask": mask
+                          },
+                          fetch_list=fetch_list)
 
             static_param_value = {}
             static_out = out[0]
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index e48e75c661fd1..69ebf875b3d0b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -78,6 +78,7 @@ def optimizer_setting(params, parameter_list=None):
 
 
 class ConvBNLayer(fluid.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -88,16 +89,15 @@ def __init__(self,
                  use_cudnn=False):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=False,
-            use_cudnn=use_cudnn)
+        self._conv = Conv2D(num_channels=num_channels,
+                            num_filters=num_filters,
+                            filter_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=groups,
+                            act=None,
+                            bias_attr=False,
+                            use_cudnn=use_cudnn)
 
         self._batch_norm = BatchNorm(num_filters, act=act)
 
@@ -109,6 +109,7 @@ def forward(self, inputs):
 
 
 class BottleneckBlock(fluid.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -117,33 +118,29 @@ def __init__(self,
                  use_cudnn=False):
         super(BottleneckBlock, self).__init__()
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu',
-            use_cudnn=use_cudnn)
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu',
-            use_cudnn=use_cudnn)
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * 4,
-            filter_size=1,
-            act=None,
-            use_cudnn=use_cudnn)
+        self.conv0 = ConvBNLayer(num_channels=num_channels,
+                                 num_filters=num_filters,
+                                 filter_size=1,
+                                 act='relu',
+                                 use_cudnn=use_cudnn)
+        self.conv1 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters,
+                                 filter_size=3,
+                                 stride=stride,
+                                 act='relu',
+                                 use_cudnn=use_cudnn)
+        self.conv2 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters * 4,
+                                 filter_size=1,
+                                 act=None,
+                                 use_cudnn=use_cudnn)
 
         if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * 4,
-                filter_size=1,
-                stride=stride,
-                use_cudnn=use_cudnn)
+            self.short = ConvBNLayer(num_channels=num_channels,
+                                     num_filters=num_filters * 4,
+                                     filter_size=1,
+                                     stride=stride,
+                                     use_cudnn=use_cudnn)
 
         self.shortcut = shortcut
 
@@ -164,6 +161,7 @@ def forward(self, inputs):
 
 
 class ResNet(fluid.Layer):
+
     def __init__(self, layers=50, class_dim=102, use_cudnn=True):
         super(ResNet, self).__init__()
 
@@ -181,15 +179,16 @@ def __init__(self, layers=50, class_dim=102, use_cudnn=True):
         num_channels = [64, 256, 512, 1024]
         num_filters = [64, 128, 256, 512]
 
-        self.conv = ConvBNLayer(
-            num_channels=3,
-            num_filters=64,
-            filter_size=7,
-            stride=2,
-            act='relu',
-            use_cudnn=use_cudnn)
-        self.pool2d_max = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+        self.conv = ConvBNLayer(num_channels=3,
+                                num_filters=64,
+                                filter_size=7,
+                                stride=2,
+                                act='relu',
+                                use_cudnn=use_cudnn)
+        self.pool2d_max = Pool2D(pool_size=3,
+                                 pool_stride=2,
+                                 pool_padding=1,
+                                 pool_type='max')
 
         self.bottleneck_block_list = []
         for block in range(len(depth)):
@@ -197,18 +196,18 @@ def __init__(self, layers=50, class_dim=102, use_cudnn=True):
             for i in range(depth[block]):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        num_channels=num_channels[block]
-                        if i == 0 else num_filters[block] * 4,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        shortcut=shortcut,
-                        use_cudnn=use_cudnn))
+                    BottleneckBlock(num_channels=num_channels[block]
+                                    if i == 0 else num_filters[block] * 4,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    shortcut=shortcut,
+                                    use_cudnn=use_cudnn))
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
-        self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg = Pool2D(pool_size=7,
+                                 pool_type='avg',
+                                 global_pooling=True)
 
         self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1
 
@@ -234,7 +233,9 @@ def forward(self, inputs):
 
 
 class TestDygraphResnet(unittest.TestCase):
+
     def reader_decorator(self, reader):
+
         def _reader_imple():
             for item in reader():
                 doc = np.array(item[0]).reshape(3, 224, 224)
@@ -256,8 +257,8 @@ def func_test_resnet_float32(self):
             paddle.framework.random._manual_program_seed(seed)
 
             resnet = ResNet()
-            optimizer = optimizer_setting(
-                train_parameters, parameter_list=resnet.parameters())
+            optimizer = optimizer_setting(train_parameters,
+                                          parameter_list=resnet.parameters())
             np.random.seed(seed)
 
             train_reader = paddle.batch(
@@ -275,10 +276,10 @@ def func_test_resnet_float32(self):
                 if batch_id >= batch_num:
                     break
 
-                dy_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    batch_size, 1)
+                dy_x_data = np.array([x[0].reshape(3, 224, 224)
+                                      for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(batch_size, 1)
 
                 img = to_variable(dy_x_data)
                 label = to_variable(y_data)
@@ -322,10 +323,10 @@ def func_test_resnet_float32(self):
                 dy_grad_value = {}
                 for param in resnet.parameters():
                     if param.trainable:
-                        np_array = np.array(param._grad_ivar().value()
-                                            .get_tensor())
-                        dy_grad_value[param.name + core.grad_var_suffix(
-                        )] = np_array
+                        np_array = np.array(
+                            param._grad_ivar().value().get_tensor())
+                        dy_grad_value[param.name +
+                                      core.grad_var_suffix()] = np_array
 
                 optimizer.minimize(avg_loss)
                 resnet.clear_gradients()
@@ -349,8 +350,9 @@ def func_test_resnet_float32(self):
                 paddle.dataset.flowers.train(use_xmap=False),
                 batch_size=batch_size)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[3, 224, 224], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[3, 224, 224],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             out = resnet(img)
             loss = fluid.layers.cross_entropy(input=out, label=label)
@@ -380,8 +382,8 @@ def func_test_resnet_float32(self):
 
                 static_x_data = np.array(
                     [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [batch_size, 1])
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape([batch_size, 1])
 
                 if traced_layer is not None:
                     traced_layer([static_x_data])
@@ -390,8 +392,10 @@ def func_test_resnet_float32(self):
                 fetch_list.extend(static_param_name_list)
                 fetch_list.extend(static_grad_name_list)
                 out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
+                              feed={
+                                  "pixel": static_x_data,
+                                  "label": y_data
+                              },
                               fetch_list=fetch_list)
 
                 static_param_value = {}
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
index 93a2b96df52a3..0a1d1c0cfb315 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py
@@ -72,6 +72,7 @@ def optimizer_setting(params, parameter_list=None):
 
 
 class TestDygraphResnetSortGradient(unittest.TestCase):
+
     def func_test_resnet_sort_gradient_float32(self):
         seed = 90
 
@@ -83,8 +84,8 @@ def func_test_resnet_sort_gradient_float32(self):
             paddle.framework.random._manual_program_seed(seed)
 
             resnet = ResNet()
-            optimizer = optimizer_setting(
-                train_parameters, parameter_list=resnet.parameters())
+            optimizer = optimizer_setting(train_parameters,
+                                          parameter_list=resnet.parameters())
             np.random.seed(seed)
             import random
             random.seed = seed
@@ -100,10 +101,10 @@ def func_test_resnet_sort_gradient_float32(self):
                 if batch_id >= batch_num:
                     break
 
-                dy_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    batch_size, 1)
+                dy_x_data = np.array([x[0].reshape(3, 224, 224)
+                                      for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape(batch_size, 1)
 
                 img = to_variable(dy_x_data)
                 label = to_variable(y_data)
@@ -125,10 +126,10 @@ def func_test_resnet_sort_gradient_float32(self):
                 dy_grad_value = {}
                 for param in resnet.parameters():
                     if param.trainable:
-                        np_array = np.array(param._grad_ivar().value()
-                                            .get_tensor())
-                        dy_grad_value[param.name + core.grad_var_suffix(
-                        )] = np_array
+                        np_array = np.array(
+                            param._grad_ivar().value().get_tensor())
+                        dy_grad_value[param.name +
+                                      core.grad_var_suffix()] = np_array
 
                 optimizer.minimize(avg_loss)
                 resnet.clear_gradients()
@@ -154,8 +155,9 @@ def func_test_resnet_sort_gradient_float32(self):
                 paddle.dataset.flowers.train(use_xmap=False),
                 batch_size=batch_size)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[3, 224, 224], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[3, 224, 224],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             out = resnet(img)
             loss = fluid.layers.cross_entropy(input=out, label=label)
@@ -185,15 +187,17 @@ def func_test_resnet_sort_gradient_float32(self):
 
                 static_x_data = np.array(
                     [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [batch_size, 1])
+                y_data = np.array([x[1] for x in data
+                                   ]).astype('int64').reshape([batch_size, 1])
 
                 fetch_list = [avg_loss.name]
                 fetch_list.extend(static_param_name_list)
                 fetch_list.extend(static_grad_name_list)
                 out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
+                              feed={
+                                  "pixel": static_x_data,
+                                  "label": y_data
+                              },
                               fetch_list=fetch_list)
 
                 static_param_value = {}
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
index 160c94a549c91..593c046212276 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@@ -31,6 +31,7 @@
 
 
 class SimpleLSTMRNN(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  num_steps,
@@ -75,23 +76,29 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.hidden_array = []
 
         for i in range(self._num_layers):
-            pre_hidden = fluid.layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = fluid.layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = fluid.layers.reshape(
-                pre_hidden, shape=[-1, self._hidden_size])
-            pre_cell = fluid.layers.reshape(
-                pre_cell, shape=[-1, self._hidden_size])
+            pre_hidden = fluid.layers.slice(init_hidden,
+                                            axes=[0],
+                                            starts=[i],
+                                            ends=[i + 1])
+            pre_cell = fluid.layers.slice(init_cell,
+                                          axes=[0],
+                                          starts=[i],
+                                          ends=[i + 1])
+            pre_hidden = fluid.layers.reshape(pre_hidden,
+                                              shape=[-1, self._hidden_size])
+            pre_cell = fluid.layers.reshape(pre_cell,
+                                            shape=[-1, self._hidden_size])
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)
 
         res = []
         for index in range(self._num_steps):
-            self._input = fluid.layers.slice(
-                input_embedding, axes=[1], starts=[index], ends=[index + 1])
-            self._input = fluid.layers.reshape(
-                self._input, shape=[-1, self._hidden_size])
+            self._input = fluid.layers.slice(input_embedding,
+                                             axes=[1],
+                                             starts=[index],
+                                             ends=[index + 1])
+            self._input = fluid.layers.reshape(self._input,
+                                               shape=[-1, self._hidden_size])
             for k in range(self._num_layers):
                 pre_hidden = self.hidden_array[k]
                 pre_cell = self.cell_array[k]
@@ -102,8 +109,9 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                 gate_input = fluid.layers.matmul(x=nn, y=weight_1)
 
                 gate_input = fluid.layers.elementwise_add(gate_input, bias)
-                i, j, f, o = fluid.layers.split(
-                    gate_input, num_or_sections=4, dim=-1)
+                i, j, f, o = fluid.layers.split(gate_input,
+                                                num_or_sections=4,
+                                                dim=-1)
                 c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
                     i) * fluid.layers.tanh(j)
                 m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
@@ -117,8 +125,8 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                         dropout_prob=self._dropout,
                         dropout_implementation='upscale_in_train')
             res.append(
-                fluid.layers.reshape(
-                    self._input, shape=[1, -1, self._hidden_size]))
+                fluid.layers.reshape(self._input,
+                                     shape=[1, -1, self._hidden_size]))
         real_res = fluid.layers.concat(res, 0)
         real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
         last_hidden = fluid.layers.concat(self.hidden_array, 1)
@@ -133,6 +141,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
 
 
 class PtbModel(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -147,12 +156,11 @@ def __init__(self,
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        self.simple_lstm_rnn = SimpleLSTMRNN(
-            hidden_size,
-            num_steps,
-            num_layers=num_layers,
-            init_scale=init_scale,
-            dropout=dropout)
+        self.simple_lstm_rnn = SimpleLSTMRNN(hidden_size,
+                                             num_steps,
+                                             num_layers=num_layers,
+                                             init_scale=init_scale,
+                                             dropout=dropout)
         self.embedding = Embedding(
             size=[vocab_size, hidden_size],
             dtype='float32',
@@ -190,17 +198,18 @@ def forward(self, input, label, init_hidden, init_cell):
                 x_emb,
                 dropout_prob=self.drop_out,
                 dropout_implementation='upscale_in_train')
-        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
-                                                               init_c)
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
+            x_emb, init_h, init_c)
         rnn_out = fluid.layers.reshape(
             rnn_out, shape=[-1, self.num_steps, self.hidden_size])
 
         projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
         projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+        projection = fluid.layers.reshape(projection,
+                                          shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -209,6 +218,7 @@ def forward(self, input, label, init_hidden, init_cell):
 
 
 class TestDygraphPtbRnn(unittest.TestCase):
+
     def func_setUp(self):
         seed = 90
         hidden_size = 10
@@ -223,12 +233,11 @@ def func_setUp(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -238,12 +247,11 @@ def func_setUp(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameter_list=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr_arr),
+                        parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -256,14 +264,14 @@ def func_setUp(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -309,12 +317,11 @@ def func_testLoadAndSetVarBase(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -324,12 +331,11 @@ def func_testLoadAndSetVarBase(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameter_list=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr_arr),
+                        parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -342,14 +348,14 @@ def func_testLoadAndSetVarBase(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -419,12 +425,11 @@ def func_testSetVariable(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -434,12 +439,11 @@ def func_testSetVariable(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameter_list=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr_arr),
+                        parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -452,14 +456,14 @@ def func_testSetVariable(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -526,12 +530,11 @@ def func_testSetNumpy(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -541,12 +544,11 @@ def func_testSetNumpy(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                parameter_list=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr_arr),
+                        parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -559,14 +561,14 @@ def func_testSetNumpy(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -637,20 +639,18 @@ def func_testSetVariableBeforeTrain(self):
 
         with fluid.dygraph.guard():
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
-
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=0.0,
-                beta1=0.8,
-                beta2=0.6,
-                parameter_list=ptb_model.parameters())
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
+
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=0.0,
+                        beta1=0.8,
+                        beta2=0.6,
+                        parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -666,14 +666,14 @@ def func_testSetVariableBeforeTrain(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
 
                 dy_loss.backward()
                 adam.minimize(dy_loss)
@@ -687,12 +687,12 @@ def func_testSetVariableBeforeTrain(self):
 
                 if k.find("beta1_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta1))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta1))
                 if k.find("beta2_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta2))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta2))
 
             state_dict = ptb_model.state_dict()
 
@@ -716,12 +716,11 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [0.0]
@@ -732,13 +731,12 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
                 new_lr = 0.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=0.0,
-                beta1=0.8,
-                beta2=0.6,
-                parameter_list=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=0.0,
+                        beta1=0.8,
+                        beta2=0.6,
+                        parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -755,14 +753,14 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
 
                 dy_loss.backward()
                 adam.minimize(dy_loss)
@@ -776,12 +774,12 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
 
                 if k.find("beta1_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta1))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta1))
                 if k.find("beta2_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta2))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta2))
 
             # check parameter
 
@@ -808,12 +806,11 @@ def func_testSetNumpyBeforeTrain(self):
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
 
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [0.0]
@@ -824,14 +821,13 @@ def func_testSetNumpyBeforeTrain(self):
                 new_lr = 0.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=fluid.layers.piecewise_decay(
-                    boundaries=bd, values=lr_arr),
-                beta1=0.8,
-                beta2=0.6,
-                parameter_list=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr_arr),
+                        beta1=0.8,
+                        beta2=0.6,
+                        parameter_list=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -858,14 +854,14 @@ def func_testSetNumpyBeforeTrain(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
 
                 dy_loss.backward()
                 adam.minimize(dy_loss)
@@ -879,12 +875,12 @@ def func_testSetNumpyBeforeTrain(self):
 
                 if k.find("beta1_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta1))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta1))
                 if k.find("beta2_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta2))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta2))
 
             # check parameter
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
index 7e7b2e2fd5206..91bb1b7e94fda 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
@@ -31,6 +31,7 @@
 
 
 class SimpleLSTMRNN(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  num_steps,
@@ -75,23 +76,29 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.hidden_array = []
 
         for i in range(self._num_layers):
-            pre_hidden = fluid.layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = fluid.layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = fluid.layers.reshape(
-                pre_hidden, shape=[-1, self._hidden_size])
-            pre_cell = fluid.layers.reshape(
-                pre_cell, shape=[-1, self._hidden_size])
+            pre_hidden = fluid.layers.slice(init_hidden,
+                                            axes=[0],
+                                            starts=[i],
+                                            ends=[i + 1])
+            pre_cell = fluid.layers.slice(init_cell,
+                                          axes=[0],
+                                          starts=[i],
+                                          ends=[i + 1])
+            pre_hidden = fluid.layers.reshape(pre_hidden,
+                                              shape=[-1, self._hidden_size])
+            pre_cell = fluid.layers.reshape(pre_cell,
+                                            shape=[-1, self._hidden_size])
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)
 
         res = []
         for index in range(self._num_steps):
-            self._input = fluid.layers.slice(
-                input_embedding, axes=[1], starts=[index], ends=[index + 1])
-            self._input = fluid.layers.reshape(
-                self._input, shape=[-1, self._hidden_size])
+            self._input = fluid.layers.slice(input_embedding,
+                                             axes=[1],
+                                             starts=[index],
+                                             ends=[index + 1])
+            self._input = fluid.layers.reshape(self._input,
+                                               shape=[-1, self._hidden_size])
             for k in range(self._num_layers):
                 pre_hidden = self.hidden_array[k]
                 pre_cell = self.cell_array[k]
@@ -102,8 +109,9 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                 gate_input = fluid.layers.matmul(x=nn, y=weight_1)
 
                 gate_input = fluid.layers.elementwise_add(gate_input, bias)
-                i, j, f, o = fluid.layers.split(
-                    gate_input, num_or_sections=4, dim=-1)
+                i, j, f, o = fluid.layers.split(gate_input,
+                                                num_or_sections=4,
+                                                dim=-1)
                 c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
                     i) * fluid.layers.tanh(j)
                 m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
@@ -117,8 +125,8 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                         dropout_prob=self._dropout,
                         dropout_implementation='upscale_in_train')
             res.append(
-                fluid.layers.reshape(
-                    self._input, shape=[1, -1, self._hidden_size]))
+                fluid.layers.reshape(self._input,
+                                     shape=[1, -1, self._hidden_size]))
         real_res = fluid.layers.concat(res, 0)
         real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
         last_hidden = fluid.layers.concat(self.hidden_array, 1)
@@ -133,6 +141,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
 
 
 class PtbModel(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -147,12 +156,11 @@ def __init__(self,
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        self.simple_lstm_rnn = SimpleLSTMRNN(
-            hidden_size,
-            num_steps,
-            num_layers=num_layers,
-            init_scale=init_scale,
-            dropout=dropout)
+        self.simple_lstm_rnn = SimpleLSTMRNN(hidden_size,
+                                             num_steps,
+                                             num_layers=num_layers,
+                                             init_scale=init_scale,
+                                             dropout=dropout)
         self.embedding = Embedding(
             size=[vocab_size, hidden_size],
             dtype='float32',
@@ -190,17 +198,18 @@ def forward(self, input, label, init_hidden, init_cell):
                 x_emb,
                 dropout_prob=self.drop_out,
                 dropout_implementation='upscale_in_train')
-        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
-                                                               init_c)
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
+            x_emb, init_h, init_c)
         rnn_out = fluid.layers.reshape(
             rnn_out, shape=[-1, self.num_steps, self.hidden_size])
 
         projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
         projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+        projection = fluid.layers.reshape(projection,
+                                          shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -209,6 +218,7 @@ def forward(self, input, label, init_hidden, init_cell):
 
 
 class TestDygraphPtbRnn(unittest.TestCase):
+
     def func_setUp(self):
         seed = 90
         hidden_size = 10
@@ -223,12 +233,11 @@ def func_setUp(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -238,12 +247,12 @@ def func_setUp(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            scheduler = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr_arr)
-            adam = Adam(
-                learning_rate=scheduler, parameters=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                           values=lr_arr)
+            adam = Adam(learning_rate=scheduler,
+                        parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -256,14 +265,14 @@ def func_setUp(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -312,12 +321,11 @@ def func_testLoadAndSetVarBase(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -327,12 +335,12 @@ def func_testLoadAndSetVarBase(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            scheduler = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr_arr)
-            adam = Adam(
-                learning_rate=scheduler, parameters=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                           values=lr_arr)
+            adam = Adam(learning_rate=scheduler,
+                        parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -345,14 +353,14 @@ def func_testLoadAndSetVarBase(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -420,12 +428,11 @@ def func_testSetVariable(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -435,12 +442,12 @@ def func_testSetVariable(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            scheduler = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr_arr)
-            adam = Adam(
-                learning_rate=scheduler, parameters=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                           values=lr_arr)
+            adam = Adam(learning_rate=scheduler,
+                        parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -453,14 +460,14 @@ def func_testSetVariable(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -528,12 +535,11 @@ def func_testSetNumpy(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [1.0]
@@ -543,12 +549,12 @@ def func_testSetNumpy(self):
                 new_lr = 1.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            scheduler = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr_arr)
-            adam = Adam(
-                learning_rate=scheduler, parameters=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                           values=lr_arr)
+            adam = Adam(learning_rate=scheduler,
+                        parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -561,14 +567,14 @@ def func_testSetNumpy(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
                         dy_param_init[param.name] = param.numpy()
@@ -642,20 +648,18 @@ def func_testSetVariableBeforeTrain(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
-
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=0.0,
-                beta1=0.8,
-                beta2=0.6,
-                parameters=ptb_model.parameters())
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
+
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=0.0,
+                        beta1=0.8,
+                        beta2=0.6,
+                        parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -671,14 +675,14 @@ def func_testSetVariableBeforeTrain(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
 
                 dy_loss.backward()
                 adam.minimize(dy_loss)
@@ -692,12 +696,12 @@ def func_testSetVariableBeforeTrain(self):
 
                 if k.find("beta1_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta1))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta1))
                 if k.find("beta2_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta2))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta2))
 
             state_dict = ptb_model.state_dict()
 
@@ -721,12 +725,11 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [0.0]
@@ -737,13 +740,12 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
                 new_lr = 0.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            adam = Adam(
-                learning_rate=0.0,
-                beta1=0.8,
-                beta2=0.6,
-                parameters=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            adam = Adam(learning_rate=0.0,
+                        beta1=0.8,
+                        beta2=0.6,
+                        parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -760,14 +762,14 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
 
                 dy_loss.backward()
                 adam.minimize(dy_loss)
@@ -781,12 +783,12 @@ def func_testLoadAndSetVarBaseBeforeTrain(self):
 
                 if k.find("beta1_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta1))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta1))
                 if k.find("beta2_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta2))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta2))
 
             # check parameter
 
@@ -812,12 +814,11 @@ def func_testSetNumpyBeforeTrain(self):
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
             # TODO: marsyang1993 Change seed to
-            ptb_model = PtbModel(
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel(hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             bd = []
             lr_arr = [0.0]
@@ -828,15 +829,14 @@ def func_testSetNumpyBeforeTrain(self):
                 new_lr = 0.0
                 lr_arr.append(new_lr)
 
-            place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            scheduler = paddle.optimizer.lr.PiecewiseDecay(
-                boundaries=bd, values=lr_arr)
-            adam = Adam(
-                learning_rate=scheduler,
-                beta1=0.8,
-                beta2=0.6,
-                parameters=ptb_model.parameters())
+            place = fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
+            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd,
+                                                           values=lr_arr)
+            adam = Adam(learning_rate=scheduler,
+                        beta1=0.8,
+                        beta2=0.6,
+                        parameters=ptb_model.parameters())
             dy_param_updated = dict()
             dy_param_init = dict()
             dy_loss = None
@@ -863,14 +863,14 @@ def func_testSetNumpyBeforeTrain(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 x = to_variable(x_data)
                 y = to_variable(y_data)
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
-                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
-                                                            init_cell)
+                dy_loss, last_hidden, last_cell = ptb_model(
+                    x, y, init_hidden, init_cell)
 
                 dy_loss.backward()
                 scheduler.step()
@@ -881,17 +881,17 @@ def func_testSetNumpyBeforeTrain(self):
             for k, v in opti_dict.items():
                 if k == "LR_Scheduler":
                     self.assertTrue(
-                        np.array_equal(v['last_epoch'], self.base_opti[k][
-                            'last_epoch'] + 1))
+                        np.array_equal(v['last_epoch'],
+                                       self.base_opti[k]['last_epoch'] + 1))
 
                 if k.find("beta1_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta1))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta1))
                 if k.find("beta2_pow_acc_0") > 0:
                     self.assertTrue(
-                        np.array_equal(v.numpy(), self.base_opti[v.name] *
-                                       adam._beta2))
+                        np.array_equal(v.numpy(),
+                                       self.base_opti[v.name] * adam._beta2))
 
             # check parameter
 
@@ -930,8 +930,9 @@ def func_test_state_shape_mismatch(self):
             state_dict = emb.state_dict()
             paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict = paddle.load(
-                os.path.join('saved_dy', 'emb_dy.pdparams'), return_numpy=True)
+            para_state_dict = paddle.load(os.path.join('saved_dy',
+                                                       'emb_dy.pdparams'),
+                                          return_numpy=True)
             para_state_dict['weight'] = np.expand_dims(
                 para_state_dict['weight'], axis=-1)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index 9890dfa43a4e3..245982c71ccc2 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -70,6 +70,7 @@ def optimizer_setting(params, parameter_list=None):
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -79,15 +80,14 @@ def __init__(self,
                  act=None):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=None)
+        self._conv = Conv2D(num_channels=num_channels,
+                            num_filters=num_filters,
+                            filter_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=groups,
+                            act=None,
+                            bias_attr=None)
 
         self._batch_norm = BatchNorm(num_filters, act=act)
 
@@ -99,6 +99,7 @@ def forward(self, inputs):
 
 
 class SqueezeExcitation(fluid.dygraph.Layer):
+
     def __init__(self, num_channels, reduction_ratio):
 
         super(SqueezeExcitation, self).__init__()
@@ -107,14 +108,14 @@ def __init__(self, num_channels, reduction_ratio):
         self._squeeze = Linear(
             num_channels,
             num_channels // reduction_ratio,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.05)),
             act='relu')
         self._excitation = Linear(
             num_channels // reduction_ratio,
             num_channels,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.05)),
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=0.05)),
             act='sigmoid')
 
     def forward(self, input):
@@ -127,6 +128,7 @@ def forward(self, input):
 
 
 class BottleneckBlock(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -136,29 +138,27 @@ def __init__(self,
                  shortcut=True):
         super(BottleneckBlock, self).__init__()
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels, num_filters=num_filters, filter_size=1)
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            groups=cardinality)
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * 4,
-            filter_size=1,
-            act='relu')
-
-        self.scale = SqueezeExcitation(
-            num_channels=num_filters * 4, reduction_ratio=reduction_ratio)
+        self.conv0 = ConvBNLayer(num_channels=num_channels,
+                                 num_filters=num_filters,
+                                 filter_size=1)
+        self.conv1 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters,
+                                 filter_size=3,
+                                 stride=stride,
+                                 groups=cardinality)
+        self.conv2 = ConvBNLayer(num_channels=num_filters,
+                                 num_filters=num_filters * 4,
+                                 filter_size=1,
+                                 act='relu')
+
+        self.scale = SqueezeExcitation(num_channels=num_filters * 4,
+                                       reduction_ratio=reduction_ratio)
 
         if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * 4,
-                filter_size=1,
-                stride=stride)
+            self.short = ConvBNLayer(num_channels=num_channels,
+                                     num_filters=num_filters * 4,
+                                     filter_size=1,
+                                     stride=stride)
 
         self.shortcut = shortcut
 
@@ -183,6 +183,7 @@ def forward(self, inputs):
 
 
 class SeResNeXt(fluid.dygraph.Layer):
+
     def __init__(self, layers=50, class_dim=102):
         super(SeResNeXt, self).__init__()
 
@@ -196,52 +197,53 @@ def __init__(self, layers=50, class_dim=102):
             reduction_ratio = 16
             depth = [3, 4, 6, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=7,
+                                     stride=2,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
         elif layers == 101:
             cardinality = 32
             reduction_ratio = 16
             depth = [3, 4, 23, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=7,
-                stride=2,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=7,
+                                     stride=2,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
         elif layers == 152:
             cardinality = 64
             reduction_ratio = 16
             depth = [3, 8, 36, 3]
             num_filters = [128, 256, 512, 1024]
-            self.conv0 = ConvBNLayer(
-                num_channels=3,
-                num_filters=64,
-                filter_size=3,
-                stride=2,
-                act='relu')
-            self.conv1 = ConvBNLayer(
-                num_channels=64,
-                num_filters=64,
-                filter_size=3,
-                stride=2,
-                act='relu')
-            self.conv2 = ConvBNLayer(
-                num_channels=64,
-                num_filters=128,
-                filter_size=3,
-                stride=1,
-                act='relu')
-            self.pool = Pool2D(
-                pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+            self.conv0 = ConvBNLayer(num_channels=3,
+                                     num_filters=64,
+                                     filter_size=3,
+                                     stride=2,
+                                     act='relu')
+            self.conv1 = ConvBNLayer(num_channels=64,
+                                     num_filters=64,
+                                     filter_size=3,
+                                     stride=2,
+                                     act='relu')
+            self.conv2 = ConvBNLayer(num_channels=64,
+                                     num_filters=128,
+                                     filter_size=3,
+                                     stride=1,
+                                     act='relu')
+            self.pool = Pool2D(pool_size=3,
+                               pool_stride=2,
+                               pool_padding=1,
+                               pool_type='max')
 
         self.bottleneck_block_list = []
         num_channels = 64
@@ -252,19 +254,19 @@ def __init__(self, layers=50, class_dim=102):
             for i in range(depth[block]):
                 bottleneck_block = self.add_sublayer(
                     'bb_%d_%d' % (block, i),
-                    BottleneckBlock(
-                        num_channels=num_channels,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        cardinality=cardinality,
-                        reduction_ratio=reduction_ratio,
-                        shortcut=shortcut))
+                    BottleneckBlock(num_channels=num_channels,
+                                    num_filters=num_filters[block],
+                                    stride=2 if i == 0 and block != 0 else 1,
+                                    cardinality=cardinality,
+                                    reduction_ratio=reduction_ratio,
+                                    shortcut=shortcut))
                 num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
 
-        self.pool2d_avg = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        self.pool2d_avg = Pool2D(pool_size=7,
+                                 pool_type='avg',
+                                 global_pooling=True)
         import math
         stdv = 1.0 / math.sqrt(2048 * 1.0)
 
@@ -296,7 +298,9 @@ def forward(self, inputs):
 
 
 class TestImperativeResneXt(unittest.TestCase):
+
     def reader_decorator(self, reader):
+
         def _reader_imple():
             for item in reader():
                 doc = np.array(item[0]).reshape(3, 224, 224)
@@ -323,11 +327,10 @@ def run_dygraph():
 
             batch_py_reader = fluid.io.PyReader(capacity=1)
             batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(
-                        paddle.dataset.flowers.train(use_xmap=False)),
-                    batch_size=batch_size,
-                    drop_last=True),
+                paddle.batch(self.reader_decorator(
+                    paddle.dataset.flowers.train(use_xmap=False)),
+                             batch_size=batch_size,
+                             drop_last=True),
                 places=fluid.CPUPlace())
 
             dy_param_init_value = {}
@@ -346,8 +349,8 @@ def run_dygraph():
 
                     out = se_resnext(img)
                     softmax_out = fluid.layers.softmax(out, use_cudnn=False)
-                    loss = fluid.layers.cross_entropy(
-                        input=softmax_out, label=label)
+                    loss = fluid.layers.cross_entropy(input=softmax_out,
+                                                      label=label)
                     avg_loss = fluid.layers.mean(x=loss)
 
                     dy_out = avg_loss.numpy()
@@ -361,10 +364,10 @@ def run_dygraph():
                     dy_grad_value = {}
                     for param in se_resnext.parameters():
                         if param.trainable:
-                            np_array = np.array(param._grad_ivar().value()
-                                                .get_tensor())
-                            dy_grad_value[param.name + core.grad_var_suffix(
-                            )] = np_array
+                            np_array = np.array(
+                                param._grad_ivar().value().get_tensor())
+                            dy_grad_value[param.name +
+                                          core.grad_var_suffix()] = np_array
 
                     optimizer.minimize(avg_loss)
                     se_resnext.clear_gradients()
@@ -400,8 +403,9 @@ def run_dygraph():
                 batch_size=batch_size,
                 drop_last=True)
 
-            img = fluid.layers.data(
-                name='pixel', shape=[3, 224, 224], dtype='float32')
+            img = fluid.layers.data(name='pixel',
+                                    shape=[3, 224, 224],
+                                    dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             out = se_resnext(img)
             softmax_out = fluid.layers.softmax(out, use_cudnn=False)
@@ -430,21 +434,22 @@ def run_dygraph():
                     if batch_id >= batch_num and batch_num != -1:
                         break
 
-                    static_x_data = np.array(
-                        [x[0].reshape(3, 224, 224)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(
-                            [batch_size, 1])
+                    static_x_data = np.array([
+                        x[0].reshape(3, 224, 224) for x in data
+                    ]).astype('float32')
+                    y_data = np.array([x[1]
+                                       for x in data]).astype('int64').reshape(
+                                           [batch_size, 1])
 
                     fetch_list = [avg_loss.name]
                     fetch_list.extend(static_param_name_list)
                     fetch_list.extend(static_grad_name_list)
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
+                    out = exe.run(fluid.default_main_program(),
+                                  feed={
+                                      "pixel": static_x_data,
+                                      "label": y_data
+                                  },
+                                  fetch_list=fetch_list)
 
                     static_param_value = {}
                     static_grad_value = {}
@@ -497,8 +502,8 @@ def run_dygraph():
             np.allclose(static_out, eager_out),
             "\nstatic_out: {}\neager_out: {}".format(static_out, eager_out))
 
-        self.assertEqual(
-            len(eager_param_init_value), len(static_param_init_value))
+        self.assertEqual(len(eager_param_init_value),
+                         len(static_param_init_value))
 
         for key, value in six.iteritems(static_param_init_value):
             self.assertTrue(np.allclose(value, eager_param_init_value[key]))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
index 8bb4088dc3bf9..962b6e2b0af5d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
@@ -26,13 +26,13 @@
 
 
 class SimpleNet(paddle.nn.Layer):
+
     def __init__(self, vocab_size, hidden_size, dtype):
         super(SimpleNet, self).__init__()
-        self.emb = fluid.dygraph.Embedding(
-            size=[vocab_size, hidden_size],
-            dtype=dtype,
-            param_attr='emb.w',
-            is_sparse=True)
+        self.emb = fluid.dygraph.Embedding(size=[vocab_size, hidden_size],
+                                           dtype=dtype,
+                                           param_attr='emb.w',
+                                           is_sparse=True)
 
     def forward(self, input):
         input_emb = self.emb(input)
@@ -40,6 +40,7 @@ def forward(self, input):
 
 
 class TestSimpleNet(unittest.TestCase):
+
     def func_selectedrows_gradient1(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -49,19 +50,17 @@ def func_selectedrows_gradient1(self):
             for dtype in ["float32", "float64"]:
                 for sort_sum_gradient in [True, False]:
                     paddle.disable_static(place)
-                    fluid.set_flags({
-                        'FLAGS_sort_sum_gradient': sort_sum_gradient
-                    })
+                    fluid.set_flags(
+                        {'FLAGS_sort_sum_gradient': sort_sum_gradient})
                     # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                     input = paddle.to_tensor(input_word)
 
                     simplenet = SimpleNet(20, 32, dtype)
-                    adam = SGDOptimizer(
-                        learning_rate=0.001,
-                        parameter_list=simplenet.parameters(
-                        ))  # grad_clip=grad_clip
+                    adam = SGDOptimizer(learning_rate=0.001,
+                                        parameter_list=simplenet.parameters()
+                                        )  # grad_clip=grad_clip
                     input_emb, emb = simplenet(input)
 
                     self.assertTrue(emb.weight.gradient() is None)
@@ -79,9 +78,11 @@ def func_selectedrows_gradient1(self):
                     paddle.enable_static()
 
     def test_selectedrows_gradient1(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_selectedrows_gradient1()
         self.func_selectedrows_gradient1()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_selectedrows_gradient2(self):
         places = [fluid.CPUPlace()]
@@ -91,19 +92,17 @@ def func_selectedrows_gradient2(self):
         for place in places:
             for sort_sum_gradient in [True, False]:
                 with fluid.dygraph.guard(place):
-                    fluid.set_flags({
-                        'FLAGS_sort_sum_gradient': sort_sum_gradient
-                    })
+                    fluid.set_flags(
+                        {'FLAGS_sort_sum_gradient': sort_sum_gradient})
                     grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                     input = to_variable(input_word)
 
                     simplenet = SimpleNet(20, 32, "float32")
-                    adam = SGDOptimizer(
-                        learning_rate=0.001,
-                        parameter_list=simplenet.parameters(),
-                        grad_clip=grad_clip)
+                    adam = SGDOptimizer(learning_rate=0.001,
+                                        parameter_list=simplenet.parameters(),
+                                        grad_clip=grad_clip)
                     input_emb, emb = simplenet(input)
 
                     self.assertTrue(emb.weight.gradient() is None)
@@ -120,9 +119,11 @@ def func_selectedrows_gradient2(self):
                     self.assertTrue(input_emb.gradient() is not None)
 
     def test_selectedrows_gradient2(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_selectedrows_gradient2()
         self.func_selectedrows_gradient2()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
index eaf63436ae088..9f01315720500 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py
@@ -29,6 +29,7 @@
 
 
 class SimpleNet(fluid.Layer):
+
     def __init__(self,
                  hidden_size,
                  vocab_size,
@@ -67,12 +68,12 @@ def forward(self, input, label):
         fc = fluid.layers.matmul(x_emb, self.softmax_weight)
         fc = fluid.layers.elementwise_add(fc, self.softmax_bias)
         projection = fluid.layers.matmul(
-            fc, fluid.layers.transpose(
-                self.embedding.weight, perm=[1, 0]))
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+            fc, fluid.layers.transpose(self.embedding.weight, perm=[1, 0]))
+        projection = fluid.layers.reshape(projection,
+                                          shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -81,6 +82,7 @@ def forward(self, input, label):
 
 
 class TestDygraphSimpleNet(unittest.TestCase):
+
     def func_simple_net(self):
         for is_sparse in [True, False]:
             dtype_list = ["float32"]
@@ -114,24 +116,21 @@ def simple_net_float(self, is_sparse, dtype):
                     paddle.seed(seed)
                     paddle.framework.random._manual_program_seed(seed)
 
-                    simple_net = SimpleNet(
-                        hidden_size=hidden_size,
-                        vocab_size=vocab_size,
-                        num_steps=num_steps,
-                        init_scale=init_scale,
-                        is_sparse=is_sparse,
-                        dtype=dtype)
-
-                    sgd = SGDOptimizer(
-                        learning_rate=1e-3,
-                        parameter_list=simple_net.parameters())
+                    simple_net = SimpleNet(hidden_size=hidden_size,
+                                           vocab_size=vocab_size,
+                                           num_steps=num_steps,
+                                           init_scale=init_scale,
+                                           is_sparse=is_sparse,
+                                           dtype=dtype)
+
+                    sgd = SGDOptimizer(learning_rate=1e-3,
+                                       parameter_list=simple_net.parameters())
                     dy_param_updated = dict()
                     dy_param_init = dict()
                     dy_loss = None
 
-                    fluid.set_flags({
-                        'FLAGS_sort_sum_gradient': is_sort_sum_gradient
-                    })
+                    fluid.set_flags(
+                        {'FLAGS_sort_sum_gradient': is_sort_sum_gradient})
 
                     for i in range(batch_num):
                         x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -158,17 +157,17 @@ def simple_net_float(self, is_sparse, dtype):
                     paddle.seed(seed)
                     paddle.framework.random._manual_program_seed(seed)
 
-                    simple_net = SimpleNet(
-                        hidden_size=hidden_size,
-                        vocab_size=vocab_size,
-                        num_steps=num_steps,
-                        is_sparse=is_sparse,
-                        dtype=dtype)
+                    simple_net = SimpleNet(hidden_size=hidden_size,
+                                           vocab_size=vocab_size,
+                                           num_steps=num_steps,
+                                           is_sparse=is_sparse,
+                                           dtype=dtype)
 
                     exe = fluid.Executor(place)
                     sgd = SGDOptimizer(learning_rate=1e-3)
-                    x = fluid.layers.data(
-                        name="x", shape=[-1, num_steps], dtype='int64')
+                    x = fluid.layers.data(name="x",
+                                          shape=[-1, num_steps],
+                                          dtype='int64')
                     y = fluid.layers.data(name="y", shape=[-1, 1], dtype=dtype)
 
                     static_loss = simple_net(x, y)
@@ -192,8 +191,10 @@ def simple_net_float(self, is_sparse, dtype):
                         fetch_list = [static_loss]
                         fetch_list.extend(static_param_name_list)
                         out = exe.run(fluid.default_main_program(),
-                                      feed={"x": x_data,
-                                            "y": y_data},
+                                      feed={
+                                          "x": x_data,
+                                          "y": y_data
+                                      },
                                       fetch_list=fetch_list)
                         static_loss_value = out[0]
 
@@ -202,13 +203,13 @@ def simple_net_float(self, is_sparse, dtype):
                                 static_param_updated[static_param_name_list[
                                     k - 1]] = out[k]
 
-                self.assertTrue(
-                    np.array_equal(static_loss_value, dy_loss_value))
+                self.assertTrue(np.array_equal(static_loss_value,
+                                               dy_loss_value))
                 for key, value in six.iteritems(static_param_init):
                     self.assertTrue(np.array_equal(value, dy_param_init[key]))
                 for key, value in six.iteritems(static_param_updated):
-                    self.assertTrue(
-                        np.array_equal(value, dy_param_updated[key]))
+                    self.assertTrue(np.array_equal(value,
+                                                   dy_param_updated[key]))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
index 8aadb155b0c0a..3f2a897b6b3e0 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
@@ -39,7 +39,9 @@ def __handler__(signum, frame):
 
 
 class DygraphDataLoaderSingalHandler(unittest.TestCase):
+
     def func_child_process_exit_with_error(self):
+
         def __test_process__():
             core._set_process_signal_handler()
             sys.exit(1)
@@ -72,6 +74,7 @@ def test_child_process_exit_with_error(self):
         self.func_child_process_exit_with_error()
 
     def func_child_process_killed_by_sigsegv(self):
+
         def __test_process__():
             core._set_process_signal_handler()
             os.kill(os.getpid(), signal.SIGSEGV)
@@ -105,6 +108,7 @@ def test_child_process_killed_by_sigsegv(self):
         self.func_child_process_killed_by_sigsegv()
 
     def func_child_process_killed_by_sigbus(self):
+
         def __test_process__():
             core._set_process_signal_handler()
             os.kill(os.getpid(), signal.SIGBUS)
@@ -137,6 +141,7 @@ def test_child_process_killed_by_sigbus(self):
         self.func_child_process_killed_by_sigbus()
 
     def func_child_process_killed_by_sigterm(self):
+
         def __test_process__():
             core._set_process_signal_handler()
             time.sleep(10)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
index 4e542fb13cd76..092478bbf2ae1 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
@@ -24,6 +24,7 @@
 
 
 class Config(object):
+
     def __init__(self, place, sort_sum_gradient=True):
         self.place = place
 
@@ -59,6 +60,7 @@ def __init__(self, place, sort_sum_gradient=True):
 
 
 def create_mnist_dataset(cfg):
+
     def create_target_label(label):
         return label
         # return (label + 1) % cfg.c_dim # fake label target
@@ -105,6 +107,7 @@ def __impl__():
 
 
 class InstanceNorm(fluid.dygraph.Layer):
+
     def __init__(self, num_channels, epsilon=1e-5):
         super(InstanceNorm, self).__init__()
         self.epsilon = epsilon
@@ -126,6 +129,7 @@ def forward(self, input):
 
 
 class Conv2DLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters=64,
@@ -136,13 +140,12 @@ def __init__(self,
                  use_bias=False,
                  relufactor=None):
         super(Conv2DLayer, self).__init__()
-        self._conv = fluid.dygraph.Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            bias_attr=None if use_bias else False)
+        self._conv = fluid.dygraph.Conv2D(num_channels=num_channels,
+                                          num_filters=num_filters,
+                                          filter_size=filter_size,
+                                          stride=stride,
+                                          padding=padding,
+                                          bias_attr=None if use_bias else False)
 
         if norm is not None:
             self._norm = InstanceNorm(num_filters)
@@ -164,6 +167,7 @@ def forward(self, input):
 
 
 class Deconv2DLayer(fluid.dygraph.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters=64,
@@ -203,25 +207,24 @@ def forward(self, input):
 
 
 class ResidualBlock(fluid.dygraph.Layer):
+
     def __init__(self, num_channels, num_filters):
         super(ResidualBlock, self).__init__()
-        self._conv0 = Conv2DLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=1,
-            padding=1,
-            norm=True,
-            relufactor=0)
-
-        self._conv1 = Conv2DLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=1,
-            padding=1,
-            norm=True,
-            relufactor=None)
+        self._conv0 = Conv2DLayer(num_channels=num_channels,
+                                  num_filters=num_filters,
+                                  filter_size=3,
+                                  stride=1,
+                                  padding=1,
+                                  norm=True,
+                                  relufactor=0)
+
+        self._conv1 = Conv2DLayer(num_channels=num_filters,
+                                  num_filters=num_filters,
+                                  filter_size=3,
+                                  stride=1,
+                                  padding=1,
+                                  norm=True,
+                                  relufactor=None)
 
     def forward(self, input):
         conv0 = self._conv0(input)
@@ -230,28 +233,27 @@ def forward(self, input):
 
 
 class Generator(fluid.dygraph.Layer):
+
     def __init__(self, cfg, num_channels=3):
         super(Generator, self).__init__()
-        conv_base = Conv2DLayer(
-            num_channels=cfg.c_dim + num_channels,
-            num_filters=cfg.g_base_dims,
-            filter_size=7,
-            stride=1,
-            padding=3,
-            norm=True,
-            relufactor=0)
+        conv_base = Conv2DLayer(num_channels=cfg.c_dim + num_channels,
+                                num_filters=cfg.g_base_dims,
+                                filter_size=7,
+                                stride=1,
+                                padding=3,
+                                norm=True,
+                                relufactor=0)
 
         sub_layers = [conv_base]
         cur_channels = cfg.g_base_dims
         for i in range(2):
-            sub_layer = Conv2DLayer(
-                num_channels=cur_channels,
-                num_filters=cur_channels * 2,
-                filter_size=4,
-                stride=2,
-                padding=1,
-                norm=True,
-                relufactor=0)
+            sub_layer = Conv2DLayer(num_channels=cur_channels,
+                                    num_filters=cur_channels * 2,
+                                    filter_size=4,
+                                    stride=2,
+                                    padding=1,
+                                    norm=True,
+                                    relufactor=0)
 
             cur_channels *= 2
             sub_layers.append(sub_layer)
@@ -261,8 +263,8 @@ def __init__(self, cfg, num_channels=3):
         repeat_num = cfg.g_repeat_num
         sub_layers = []
         for i in range(repeat_num):
-            res_block = ResidualBlock(
-                num_channels=cur_channels, num_filters=cfg.g_base_dims * 4)
+            res_block = ResidualBlock(num_channels=cur_channels,
+                                      num_filters=cfg.g_base_dims * 4)
             sub_layers.append(res_block)
 
         self._res_block = fluid.dygraph.Sequential(*sub_layers)
@@ -271,26 +273,24 @@ def __init__(self, cfg, num_channels=3):
         sub_layers = []
         for i in range(2):
             rate = 2**(1 - i)
-            deconv = Deconv2DLayer(
-                num_channels=cur_channels,
-                num_filters=cfg.g_base_dims * rate,
-                filter_size=4,
-                stride=2,
-                padding=1,
-                relufactor=0,
-                norm=True)
+            deconv = Deconv2DLayer(num_channels=cur_channels,
+                                   num_filters=cfg.g_base_dims * rate,
+                                   filter_size=4,
+                                   stride=2,
+                                   padding=1,
+                                   relufactor=0,
+                                   norm=True)
             cur_channels = cfg.g_base_dims * rate
             sub_layers.append(deconv)
 
         self._deconv = fluid.dygraph.Sequential(*sub_layers)
 
-        self._conv1 = Conv2DLayer(
-            num_channels=cur_channels,
-            num_filters=3,
-            filter_size=7,
-            stride=1,
-            padding=3,
-            relufactor=None)
+        self._conv1 = Conv2DLayer(num_channels=cur_channels,
+                                  num_filters=3,
+                                  filter_size=7,
+                                  stride=1,
+                                  padding=3,
+                                  relufactor=None)
 
     def forward(self, input, label_trg):
         shape = input.shape
@@ -310,29 +310,28 @@ def forward(self, input, label_trg):
 
 
 class Discriminator(fluid.dygraph.Layer):
+
     def __init__(self, cfg, num_channels=3):
         super(Discriminator, self).__init__()
 
         cur_dim = cfg.d_base_dims
 
-        conv_base = Conv2DLayer(
-            num_channels=num_channels,
-            num_filters=cur_dim,
-            filter_size=4,
-            stride=2,
-            padding=1,
-            relufactor=0.2)
+        conv_base = Conv2DLayer(num_channels=num_channels,
+                                num_filters=cur_dim,
+                                filter_size=4,
+                                stride=2,
+                                padding=1,
+                                relufactor=0.2)
 
         repeat_num = cfg.d_repeat_num
         sub_layers = [conv_base]
         for i in range(1, repeat_num):
-            sub_layer = Conv2DLayer(
-                num_channels=cur_dim,
-                num_filters=cur_dim * 2,
-                filter_size=4,
-                stride=2,
-                padding=1,
-                relufactor=0.2)
+            sub_layer = Conv2DLayer(num_channels=cur_dim,
+                                    num_filters=cur_dim * 2,
+                                    filter_size=4,
+                                    stride=2,
+                                    padding=1,
+                                    relufactor=0.2)
             cur_dim *= 2
             sub_layers.append(sub_layer)
 
@@ -340,17 +339,15 @@ def __init__(self, cfg, num_channels=3):
 
         kernel_size = int(cfg.image_size / np.power(2, repeat_num))
 
-        self._conv1 = Conv2DLayer(
-            num_channels=cur_dim,
-            num_filters=1,
-            filter_size=3,
-            stride=1,
-            padding=1)
+        self._conv1 = Conv2DLayer(num_channels=cur_dim,
+                                  num_filters=1,
+                                  filter_size=3,
+                                  stride=1,
+                                  padding=1)
 
-        self._conv2 = Conv2DLayer(
-            num_channels=cur_dim,
-            num_filters=cfg.c_dim,
-            filter_size=kernel_size)
+        self._conv2 = Conv2DLayer(num_channels=cur_dim,
+                                  num_filters=cfg.c_dim,
+                                  filter_size=kernel_size)
 
     def forward(self, input):
         conv = self._conv0(input)
@@ -361,8 +358,8 @@ def forward(self, input):
 
 def loss_cls(cls, label, cfg):
     cls_shape = cls.shape
-    cls = fluid.layers.reshape(
-        cls, [-1, cls_shape[1] * cls_shape[2] * cls_shape[3]])
+    cls = fluid.layers.reshape(cls,
+                               [-1, cls_shape[1] * cls_shape[2] * cls_shape[3]])
     return fluid.layers.reduce_sum(
         fluid.layers.sigmoid_cross_entropy_with_logits(cls,
                                                        label)) / cfg.batch_size
@@ -370,21 +367,25 @@ def loss_cls(cls, label, cfg):
 
 def calc_gradients(outputs, inputs, no_grad_set):
     if fluid._non_static_mode():
-        return fluid.dygraph.grad(
-            outputs=outputs,
-            inputs=inputs,
-            no_grad_vars=no_grad_set,
-            create_graph=True)
+        return fluid.dygraph.grad(outputs=outputs,
+                                  inputs=inputs,
+                                  no_grad_vars=no_grad_set,
+                                  create_graph=True)
     else:
-        return fluid.gradients(
-            targets=outputs, inputs=inputs, no_grad_set=no_grad_set)
+        return fluid.gradients(targets=outputs,
+                               inputs=inputs,
+                               no_grad_set=no_grad_set)
 
 
 def gradient_penalty(f, real, fake, no_grad_set, cfg):
+
     def _interpolate(a, b):
         shape = [a.shape[0]]
-        alpha = fluid.layers.uniform_random_batch_size_like(
-            input=a, shape=shape, min=0.1, max=1.0, seed=cfg.seed)
+        alpha = fluid.layers.uniform_random_batch_size_like(input=a,
+                                                            shape=shape,
+                                                            min=0.1,
+                                                            max=1.0,
+                                                            seed=cfg.seed)
 
         inner = fluid.layers.elementwise_mul(
             b, 1.0 - alpha, axis=0) + fluid.layers.elementwise_mul(
@@ -396,8 +397,9 @@ def _interpolate(a, b):
     if isinstance(pred, tuple):
         pred = pred[0]
 
-    gradient = calc_gradients(
-        outputs=[pred], inputs=[x], no_grad_set=no_grad_set)
+    gradient = calc_gradients(outputs=[pred],
+                              inputs=[x],
+                              no_grad_set=no_grad_set)
 
     if gradient is None:
         return None
@@ -410,8 +412,7 @@ def _interpolate(a, b):
 
     epsilon = 1e-16
     norm = fluid.layers.sqrt(
-        fluid.layers.reduce_sum(
-            fluid.layers.square(gradient), dim=1) + epsilon)
+        fluid.layers.reduce_sum(fluid.layers.square(gradient), dim=1) + epsilon)
 
     gp = fluid.layers.reduce_mean(fluid.layers.square(norm - 1.0))
     return gp
@@ -455,20 +456,21 @@ def build_optimizer(layer, cfg, loss=None):
     beta1 = 0.5
     beta2 = 0.999
     if fluid._non_static_mode():
-        return fluid.optimizer.Adam(
-            learning_rate=learning_rate,
-            beta1=beta1,
-            beta2=beta2,
-            parameter_list=layer.parameters())
+        return fluid.optimizer.Adam(learning_rate=learning_rate,
+                                    beta1=beta1,
+                                    beta2=beta2,
+                                    parameter_list=layer.parameters())
     else:
-        optimizer = fluid.optimizer.Adam(
-            learning_rate=learning_rate, beta1=beta1, beta2=beta2)
+        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
+                                         beta1=beta1,
+                                         beta2=beta2)
 
         optimizer.minimize(loss, parameter_list=layer.parameters())
         return optimizer
 
 
 class DyGraphTrainModel(object):
+
     def __init__(self, cfg):
         paddle.seed(1)
         paddle.framework.random._manual_program_seed(1)
@@ -517,6 +519,7 @@ def run(self, image_real, label_org, label_trg):
 
 
 class StaticGraphTrainModel(object):
+
     def __init__(self, cfg):
         self.cfg = cfg
 
@@ -525,10 +528,12 @@ def create_data_layer():
                 shape=[None, 3, cfg.image_size, cfg.image_size],
                 dtype='float32',
                 name='image_real')
-            label_org = fluid.data(
-                shape=[None, cfg.c_dim], dtype='float32', name='label_org')
-            label_trg = fluid.data(
-                shape=[None, cfg.c_dim], dtype='float32', name='label_trg')
+            label_org = fluid.data(shape=[None, cfg.c_dim],
+                                   dtype='float32',
+                                   name='label_org')
+            label_trg = fluid.data(shape=[None, cfg.c_dim],
+                                   dtype='float32',
+                                   name='label_trg')
             return image_real, label_org, label_trg
 
         paddle.seed(cfg.seed)
@@ -584,6 +589,7 @@ def run(self, image_real, label_org, label_trg):
 
 
 class TestStarGANWithGradientPenalty(unittest.TestCase):
+
     def func_main(self):
         self.place_test(fluid.CPUPlace())
 
@@ -614,8 +620,9 @@ def place_test(self, place):
                                                    label_trg)
                     eager_dygraph_loss.append(loss)
 
-        for (g_loss_f, d_loss_f), (g_loss_e, d_loss_e) in zip(
-                fluid_dygraph_loss, eager_dygraph_loss):
+        for (g_loss_f, d_loss_f), (g_loss_e,
+                                   d_loss_e) in zip(fluid_dygraph_loss,
+                                                    eager_dygraph_loss):
             self.assertEqual(g_loss_f, g_loss_e)
             self.assertEqual(d_loss_f, d_loss_e)
 
@@ -624,6 +631,7 @@ def test_all_cases(self):
 
 
 class TestStarGANWithGradientPenaltyLegacy(unittest.TestCase):
+
     def func_main(self):
         self.place_test(fluid.CPUPlace())
 
@@ -651,8 +659,8 @@ def place_test(self, place):
                 loss = dygraph_model.run(image_real, label_org, label_trg)
                 dygraph_loss.append(loss)
 
-        for (g_loss_s, d_loss_s), (g_loss_d, d_loss_d) in zip(static_loss,
-                                                              dygraph_loss):
+        for (g_loss_s, d_loss_s), (g_loss_d,
+                                   d_loss_d) in zip(static_loss, dygraph_loss):
             self.assertEqual(g_loss_s, g_loss_d)
             self.assertEqual(d_loss_s, d_loss_d)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
index 8a0d92fa415c4..619e1ba37d60c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py
@@ -31,21 +31,19 @@
 
 
 def convolutional_neural_network(img):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
     return prediction
 
@@ -63,6 +61,7 @@ def static_train_net(img, label):
 
 
 class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
+
     def setUp(self):
         self.seed = 90
         self.epoch_num = 1
@@ -70,6 +69,7 @@ def setUp(self):
         self.batch_num = 50
 
     def reader_decorator(self, reader):
+
         def _reader_impl():
             for item in reader():
                 image = np.array(item[0]).reshape(1, 28, 28)
@@ -83,24 +83,24 @@ def train_and_save_model(self):
             startup_program = fluid.default_startup_program()
             main_program = fluid.default_main_program()
 
-            img = fluid.data(
-                name='img', shape=[None, 1, 28, 28], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 1, 28, 28],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
 
             prediction, avg_loss = static_train_net(img, label)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
             exe = fluid.Executor(place)
 
             feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
             exe.run(startup_program)
 
-            train_reader = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.mnist.train(), buf_size=100),
-                batch_size=self.batch_size)
+            train_reader = paddle.batch(paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=100),
+                                        batch_size=self.batch_size)
 
             for _ in range(0, self.epoch_num):
                 for batch_id, data in enumerate(train_reader()):
@@ -111,16 +111,16 @@ def train_and_save_model(self):
                     if batch_id > self.batch_num:
                         break
 
-            fluid.io.save_inference_model(
-                self.save_dirname, ["img"], [prediction],
-                exe,
-                model_filename=self.model_filename,
-                params_filename=self.params_filename,
-                clip_extra=False)
+            fluid.io.save_inference_model(self.save_dirname, ["img"],
+                                          [prediction],
+                                          exe,
+                                          model_filename=self.model_filename,
+                                          params_filename=self.params_filename,
+                                          clip_extra=False)
 
     def load_and_train_dygraph(self):
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             fluid.default_startup_program().random_seed = self.seed
             fluid.default_main_program().random_seed = self.seed
@@ -141,10 +141,10 @@ def load_and_train_dygraph(self):
             sgd = fluid.optimizer.SGD(learning_rate=0.001,
                                       parameter_list=mnist.parameters())
 
-            train_reader = paddle.batch(
-                self.reader_decorator(paddle.dataset.mnist.train()),
-                batch_size=self.batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(self.reader_decorator(
+                paddle.dataset.mnist.train()),
+                                        batch_size=self.batch_size,
+                                        drop_last=True)
             train_loader = fluid.io.DataLoader.from_generator(capacity=10)
             train_loader.set_sample_list_generator(train_reader, places=place)
 
@@ -182,23 +182,23 @@ def load_and_train_static(self):
             fluid.default_startup_program().random_seed = self.seed
             fluid.default_main_program().random_seed = self.seed
 
-            img = fluid.data(
-                name='img', shape=[None, 1, 28, 28], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 1, 28, 28],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
 
             prediction, avg_loss = static_train_net(img, label)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
 
-            fluid.io.load_params(
-                exe,
-                self.save_dirname,
-                main_program=fluid.default_main_program(),
-                filename=self.params_filename)
+            fluid.io.load_params(exe,
+                                 self.save_dirname,
+                                 main_program=fluid.default_main_program(),
+                                 filename=self.params_filename)
 
             static_param_init_value = {}
             static_param_name_list = []
@@ -207,23 +207,25 @@ def load_and_train_static(self):
                 static_param_init_value[param.name] = fluid.executor._fetch_var(
                     param.name)
 
-            train_reader = paddle.batch(
-                self.reader_decorator(paddle.dataset.mnist.train()),
-                batch_size=self.batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(self.reader_decorator(
+                paddle.dataset.mnist.train()),
+                                        batch_size=self.batch_size,
+                                        drop_last=True)
 
             for epoch in range(self.epoch_num):
                 for batch_id, data in enumerate(train_reader()):
                     static_x_data = np.array([x[0] for x in data])
-                    y_data = np.array([x[1] for x in data]).reshape(
-                        [self.batch_size, 1])
+                    y_data = np.array([x[1] for x in data
+                                       ]).reshape([self.batch_size, 1])
 
                     fetch_list = [avg_loss.name]
                     fetch_list.extend(static_param_name_list)
 
                     out = exe.run(fluid.default_main_program(),
-                                  feed={"img": static_x_data,
-                                        "label": y_data},
+                                  feed={
+                                      "img": static_x_data,
+                                      "label": y_data
+                                  },
                                   fetch_list=fetch_list)
 
                     if batch_id >= self.batch_num:
@@ -237,18 +239,18 @@ def load_and_train_static(self):
         return static_x_data, static_out, static_param_init_value, static_param_value
 
     def load_and_infer_dygraph(self):
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             fluid.default_main_program().random_seed = self.seed
 
             mnist = fluid.dygraph.static_runner.StaticModelRunner(
                 model_dir=self.save_dirname, model_filename=self.model_filename)
 
-            train_reader = paddle.batch(
-                self.reader_decorator(paddle.dataset.mnist.test()),
-                batch_size=self.batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(self.reader_decorator(
+                paddle.dataset.mnist.test()),
+                                        batch_size=self.batch_size,
+                                        drop_last=True)
             train_loader = fluid.io.DataLoader.from_generator(capacity=10)
             train_loader.set_sample_list_generator(train_reader, places=place)
 
@@ -268,19 +270,18 @@ def load_and_infer_dygraph(self):
 
     def load_and_infer_static(self):
         with new_program_scope():
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
             exe = fluid.Executor(place)
-            [infer_program, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(self.save_dirname,
-                                                            exe)
+            [infer_program, feed_target_names, fetch_targets
+             ] = fluid.io.load_inference_model(self.save_dirname, exe)
             infer_program.random_seed = self.seed
 
-            train_reader = paddle.batch(
-                self.reader_decorator(paddle.dataset.mnist.test()),
-                batch_size=self.batch_size,
-                drop_last=True)
+            train_reader = paddle.batch(self.reader_decorator(
+                paddle.dataset.mnist.test()),
+                                        batch_size=self.batch_size,
+                                        drop_last=True)
 
             for batch_id, data in enumerate(train_reader()):
                 static_x_data = np.array([x[0] for x in data])
@@ -332,7 +333,7 @@ def test_mnist_train_with_params_filename(self):
         # Phase 1. run and save static model
         self.train_and_save_model()
 
-        # Phase 2. load model & train dygraph        
+        # Phase 2. load model & train dygraph
         dy_x_data, dy_out, dy_param_init_value, dy_param_value, dict_old_new_init= \
             self.load_and_train_dygraph()
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
index 13ed7a4d334bd..6c90b8348714c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py
@@ -33,6 +33,7 @@
 
 
 def while_softmax_regression(img):
+
     def cond(i, times, pred):
         return i < times
 
@@ -44,12 +45,14 @@ def body(i, times, pred):
     i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
     times = fluid.layers.fill_constant(shape=[1], dtype='int64', value=5)
     pred = fluid.layers.fc(input=img, size=10, act='softmax')
-    i, times, pred = fluid.layers.while_loop(
-        cond=cond, body=body, loop_vars=[i, times, pred])
+    i, times, pred = fluid.layers.while_loop(cond=cond,
+                                             body=body,
+                                             loop_vars=[i, times, pred])
     return pred
 
 
 class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
+
     def setUp(self):
         self.seed = 90
         self.batch_size = 32
@@ -59,6 +62,7 @@ def setUp(self):
         self.params_filename = None
 
     def _random_batch_reader(self):
+
         def _get_random_images_and_labels(image_shape, label_shape):
             image = np.random.random(size=image_shape).astype('float32')
             label = np.random.random(size=label_shape).astype('int64')
@@ -87,29 +91,29 @@ def train_and_save_model(self):
         optimizer = fluid.optimizer.SGD(learning_rate=0.001)
         optimizer.minimize(avg_loss)
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
         exe = fluid.Executor(place)
         exe.run(startup_program)
 
-        loader = fluid.io.DataLoader.from_generator(
-            feed_list=[img, label], capacity=5, iterable=True)
+        loader = fluid.io.DataLoader.from_generator(feed_list=[img, label],
+                                                    capacity=5,
+                                                    iterable=True)
         loader.set_batch_generator(self._random_batch_reader(), places=place)
 
         for data in loader():
             exe.run(main_program, feed=data, fetch_list=[avg_loss])
 
-        fluid.io.save_inference_model(
-            self.save_dirname, ["img"], [pred],
-            exe,
-            model_filename=self.model_filename,
-            params_filename=self.params_filename,
-            clip_extra=False)
+        fluid.io.save_inference_model(self.save_dirname, ["img"], [pred],
+                                      exe,
+                                      model_filename=self.model_filename,
+                                      params_filename=self.params_filename,
+                                      clip_extra=False)
 
     def load_and_train_dygraph(self):
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             fluid.default_startup_program().random_seed = self.seed
             fluid.default_main_program().random_seed = self.seed
@@ -127,8 +131,8 @@ def load_and_train_dygraph(self):
                                       parameter_list=while_net.parameters())
 
             train_loader = fluid.io.DataLoader.from_generator(capacity=10)
-            train_loader.set_batch_generator(
-                self._random_batch_reader(), places=place)
+            train_loader.set_batch_generator(self._random_batch_reader(),
+                                             places=place)
 
             while_net.train()
 
@@ -170,17 +174,16 @@ def load_and_train_static(self):
             optimizer = fluid.optimizer.SGD(learning_rate=0.001)
             optimizer.minimize(avg_loss)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
 
-            fluid.io.load_params(
-                exe,
-                self.save_dirname,
-                main_program=fluid.default_main_program(),
-                filename=self.params_filename)
+            fluid.io.load_params(exe,
+                                 self.save_dirname,
+                                 main_program=fluid.default_main_program(),
+                                 filename=self.params_filename)
 
             static_param_init_value = {}
             static_param_name_list = []
@@ -189,10 +192,11 @@ def load_and_train_static(self):
                 static_param_init_value[param.name] = fluid.executor._fetch_var(
                     param.name)
 
-            loader = fluid.io.DataLoader.from_generator(
-                feed_list=[img, label], capacity=5, iterable=True)
-            loader.set_batch_generator(
-                self._random_batch_reader(), places=place)
+            loader = fluid.io.DataLoader.from_generator(feed_list=[img, label],
+                                                        capacity=5,
+                                                        iterable=True)
+            loader.set_batch_generator(self._random_batch_reader(),
+                                       places=place)
 
             for data in loader():
                 fetch_list = [avg_loss.name]
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_tensor_clear_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_tensor_clear_gradient.py
index 7f34bf43688f1..55879293734ad 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_tensor_clear_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_tensor_clear_gradient.py
@@ -21,6 +21,7 @@
 
 
 def _dygraph_guard_(func):
+
     def __impl__(*args, **kwargs):
         if fluid._non_static_mode():
             return func(*args, **kwargs)
@@ -35,6 +36,7 @@ def __impl__(*args, **kwargs):
 
 
 class TestDygraphClearGradient(TestCase):
+
     def setUp(self):
         self.input_shape = [10, 2]
 
@@ -63,7 +65,7 @@ def test_tensor_method_clear_gradient_case2(self):
         # default arg set_to_zero is true
         # so, False means real clear gradient
         linear.weight.clear_gradient(False)
-        # before ._gradient_set_empty(False), 
+        # before ._gradient_set_empty(False),
         # the return of ._is_gradient_set_empty() should be True
         if not fluid.framework.in_dygraph_mode():
             self.assertTrue(linear.weight._is_gradient_set_empty())
@@ -73,7 +75,7 @@ def test_tensor_method_clear_gradient_case2(self):
         # reset, because ClearGradient will call SetIsEmpty(True), but this is not our expectation.
         if not fluid.framework.in_dygraph_mode():
             linear.weight._gradient_set_empty(False)
-            # after ._gradient_set_empty(False), 
+            # after ._gradient_set_empty(False),
             # the return of ._is_gradient_set_empty() should be False
             self.assertFalse(linear.weight._is_gradient_set_empty())
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py
index b9b3158515aa9..5e8a54ca39014 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class SimpleNet(nn.Layer):
+
     def __init__(self, in_dim, out_dim):
         super(SimpleNet, self).__init__()
         self.fc = nn.Linear(in_dim, out_dim)
@@ -31,6 +32,7 @@ def forward(self, x):
 
 
 class TestCases(unittest.TestCase):
+
     @paddle.no_grad()
     def thread_1_main(self):
         time.sleep(8)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py
index a621105f5084c..8a7fa967897eb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py
@@ -20,6 +20,7 @@
 
 
 class SimpleFCLayer(fluid.dygraph.Layer):
+
     def __init__(self, feature_size, batch_size, fc_size):
         super(SimpleFCLayer, self).__init__()
         self._linear = fluid.dygraph.Linear(feature_size, fc_size)
@@ -32,6 +33,7 @@ def forward(self, x):
 
 
 class TestTracedLayerRecordNonPersistableInput(unittest.TestCase):
+
     def test_main(self):
         if fluid.framework.in_dygraph_mode():
             return
@@ -51,8 +53,8 @@ def test_main(self):
 
             for _ in six.moves.range(10):
                 in_x = fluid.dygraph.to_variable(
-                    np.random.random((batch_size, feature_size)).astype(
-                        'float32'))
+                    np.random.random(
+                        (batch_size, feature_size)).astype('float32'))
                 if traced_layer is None:
                     dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                         layer, [in_x])
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
index 531c89fb19ec6..7f60d6c64acb7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py
@@ -25,6 +25,7 @@
 from paddle.fluid import core
 import numpy as np
 import six
+
 np.set_printoptions(suppress=True)
 
 from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
@@ -153,10 +154,10 @@ def position_encoding_init(n_position, d_pos_vec):
     num_timescales = channels // 2
     log_timescale_increment = (np.log(float(1e4) / float(1)) /
                                (num_timescales - 1))
-    inv_timescales = np.exp(np.arange(
-        num_timescales)) * -log_timescale_increment
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
+    inv_timescales = np.exp(
+        np.arange(num_timescales)) * -log_timescale_increment
+    scaled_time = np.expand_dims(position, 1) * np.expand_dims(
+        inv_timescales, 0)
     signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
     signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
     position_enc = signal
@@ -172,18 +173,15 @@ def create_data(is_static=False):
         ]
     else:
         enc_inputs = [
-            to_variable(
-                src_word_np, name='src_word'), to_variable(
-                    src_pos_np, name='src_pos'), to_variable(
-                        src_slf_attn_bias_np, name='src_slf_attn_bias')
+            to_variable(src_word_np, name='src_word'),
+            to_variable(src_pos_np, name='src_pos'),
+            to_variable(src_slf_attn_bias_np, name='src_slf_attn_bias')
         ]
         dec_inputs = [
-            to_variable(
-                trg_word_np, name='trg_word'), to_variable(
-                    trg_pos_np, name='trg_pos'), to_variable(
-                        trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
-            to_variable(
-                trg_src_attn_bias_np, name='trg_src_attn_bias')
+            to_variable(trg_word_np, name='trg_word'),
+            to_variable(trg_pos_np, name='trg_pos'),
+            to_variable(trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
+            to_variable(trg_src_attn_bias_np, name='trg_src_attn_bias')
         ]
         label = to_variable(lbl_word_np, name='lbl_word')
         weight = to_variable(lbl_weight_np, name='lbl_weight')
@@ -209,13 +207,12 @@ def make_all_inputs(input_fields):
     """
     inputs = []
     for input_field in input_fields:
-        input_var = fluid.layers.data(
-            name=input_field,
-            shape=input_descs[input_field][0],
-            dtype=input_descs[input_field][1],
-            lod_level=input_descs[input_field][2]
-            if len(input_descs[input_field]) == 3 else 0,
-            append_batch_size=False)
+        input_var = fluid.layers.data(name=input_field,
+                                      shape=input_descs[input_field][0],
+                                      dtype=input_descs[input_field][1],
+                                      lod_level=input_descs[input_field][2] if
+                                      len(input_descs[input_field]) == 3 else 0,
+                                      append_batch_size=False)
         inputs.append(input_var)
     return inputs
 
@@ -240,8 +237,8 @@ def make_all_inputs(input_fields):
     # encoder.
     # The actual data shape of src_slf_attn_bias is:
     # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
+    "src_slf_attn_bias":
+    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
     # The actual data shape of trg_word is:
     # [batch_size, max_trg_len_in_batch]
     "trg_word": [(batch_size, seq_len), "int64",
@@ -253,14 +250,14 @@ def make_all_inputs(input_fields):
     # subsequent words in the decoder.
     # The actual data shape of trg_slf_attn_bias is:
     # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
+    "trg_slf_attn_bias":
+    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
     # This input is used to remove attention weights on paddings of the source
     # input in the encoder-decoder attention.
     # The actual data shape of trg_src_attn_bias is:
     # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
-                           seq_len), "float32"],
+    "trg_src_attn_bias":
+    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
     # This input is used in independent decoder program for inference.
     # The actual data shape of enc_output is:
     # [batch_size, max_src_len_in_batch, d_model]
@@ -282,32 +279,38 @@ def make_all_inputs(input_fields):
 # Names of word embedding table which might be reused for weight sharing.
 word_emb_param_names = (
     "src_word_emb_table",
-    "trg_word_emb_table", )
+    "trg_word_emb_table",
+)
 # Names of position encoding table which will be initialized externally.
 pos_enc_param_names = (
     "src_pos_enc_table",
-    "trg_pos_enc_table", )
+    "trg_pos_enc_table",
+)
 # separated inputs for different usages.
 encoder_data_input_fields = (
     "src_word",
     "src_pos",
-    "src_slf_attn_bias", )
+    "src_slf_attn_bias",
+)
 decoder_data_input_fields = (
     "trg_word",
     "trg_pos",
     "trg_slf_attn_bias",
     "trg_src_attn_bias",
-    "enc_output", )
+    "enc_output",
+)
 label_data_input_fields = (
     "lbl_word",
-    "lbl_weight", )
+    "lbl_weight",
+)
 # In fast decoder, trg_pos (only containing the current time step) is generated
 # by ops and trg_slf_attn_bias is not needed.
 fast_decoder_data_input_fields = (
     "trg_word",
     "init_score",
     "init_idx",
-    "trg_src_attn_bias", )
+    "trg_src_attn_bias",
+)
 # if we use py_reader
 use_py_reader = False
 
@@ -320,16 +323,20 @@ def make_all_inputs(input_fields):
 np.random.seed(90)
 src_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
     [TrainTaskConfig.batch_size, seq_len]).astype('int64')
-src_pos_np = np.random.randint(
-    1, seq_len, size=(TrainTaskConfig.batch_size, seq_len), dtype='int64')
+src_pos_np = np.random.randint(1,
+                               seq_len,
+                               size=(TrainTaskConfig.batch_size, seq_len),
+                               dtype='int64')
 src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
                                        ModelHyperParams.n_head, seq_len,
                                        seq_len).astype('float32')
 
 trg_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
     [TrainTaskConfig.batch_size, seq_len]).astype('int64')
-trg_pos_np = np.random.randint(
-    1, seq_len, size=(TrainTaskConfig.batch_size, seq_len), dtype='int64')
+trg_pos_np = np.random.randint(1,
+                               seq_len,
+                               size=(TrainTaskConfig.batch_size, seq_len),
+                               dtype='int64')
 trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
                                        ModelHyperParams.n_head, seq_len,
                                        seq_len).astype('float32')
@@ -337,11 +344,10 @@ def make_all_inputs(input_fields):
                                        ModelHyperParams.n_head, seq_len,
                                        seq_len).astype('float32')
 
-lbl_word_np = np.random.randint(
-    1,
-    ModelHyperParams.src_vocab_size - 1,
-    size=(TrainTaskConfig.batch_size * seq_len, 1),
-    dtype='int64')
+lbl_word_np = np.random.randint(1,
+                                ModelHyperParams.src_vocab_size - 1,
+                                size=(TrainTaskConfig.batch_size * seq_len, 1),
+                                dtype='int64')
 lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
                                 1).astype('float32')
 
@@ -352,6 +358,7 @@ def make_all_inputs(input_fields):
 
 
 class PrePostProcessLayer(Layer):
+
     def __init__(self, d_model, process_cmd, shape_len=None):
         super(PrePostProcessLayer, self).__init__()
         for cmd in process_cmd:
@@ -380,6 +387,7 @@ def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
 
 
 class PositionwiseFeedForwardLayer(Layer):
+
     def __init__(self, d_inner_hid, d_hid, dropout_rate):
         super(PositionwiseFeedForwardLayer, self).__init__()
         self._i2h = Linear(d_hid, d_inner_hid, act="relu")
@@ -389,16 +397,16 @@ def __init__(self, d_inner_hid, d_hid, dropout_rate):
     def forward(self, x):
         hidden = self._i2h(x)
         if self._dropout_rate:
-            hidden = fluid.layers.dropout(
-                hidden,
-                dropout_prob=self._dropout_rate,
-                seed=ModelHyperParams.dropout_seed,
-                is_test=False)
+            hidden = fluid.layers.dropout(hidden,
+                                          dropout_prob=self._dropout_rate,
+                                          seed=ModelHyperParams.dropout_seed,
+                                          is_test=False)
         out = self._h2o(hidden)
         return out
 
 
 class MultiHeadAttentionLayer(Layer):
+
     def __init__(self,
                  d_key,
                  d_value,
@@ -440,11 +448,10 @@ def forward(self, queries, keys, values, attn_bias):
         transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
 
         # scale dot product attention
-        product = fluid.layers.matmul(
-            x=transpose_q,
-            y=transpose_k,
-            transpose_y=True,
-            alpha=self._d_model**-0.5)
+        product = fluid.layers.matmul(x=transpose_q,
+                                      y=transpose_k,
+                                      transpose_y=True,
+                                      alpha=self._d_model**-0.5)
         if attn_bias is not None:
             product += attn_bias
         weights = fluid.layers.softmax(product)
@@ -473,6 +480,7 @@ def forward(self, queries, keys, values, attn_bias):
 
 
 class EncoderSubLayer(Layer):
+
     def __init__(self,
                  n_head,
                  d_key,
@@ -494,14 +502,16 @@ def __init__(self,
                                                      self._preprocess_cmd, 3)
         self._multihead_attention_layer = MultiHeadAttentionLayer(
             d_key, d_value, d_model, n_head, attention_dropout)
-        self._postprocess_layer = PrePostProcessLayer(
-            d_model, self._postprocess_cmd, None)
+        self._postprocess_layer = PrePostProcessLayer(d_model,
+                                                      self._postprocess_cmd,
+                                                      None)
         self._preprocess_layer2 = PrePostProcessLayer(d_model,
                                                       self._preprocess_cmd, 3)
         self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
             d_inner_hid, d_model, relu_dropout)
-        self._postprocess_layer2 = PrePostProcessLayer(
-            d_model, self._postprocess_cmd, None)
+        self._postprocess_layer2 = PrePostProcessLayer(d_model,
+                                                       self._postprocess_cmd,
+                                                       None)
 
     def forward(self, enc_input, attn_bias):
         pre_process_multihead = self._preprocess_layer(
@@ -521,6 +531,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class EncoderLayer(Layer):
+
     def __init__(self,
                  n_layer,
                  n_head,
@@ -560,6 +571,7 @@ def forward(self, enc_input, attn_bias):
 
 
 class PrepareEncoderDecoderLayer(Layer):
+
     def __init__(self,
                  src_vocab_size,
                  src_emb_dim,
@@ -573,13 +585,13 @@ def __init__(self,
         self._src_emb_dim = src_emb_dim
         self._src_vocab_size = src_vocab_size
         self._dropout_rate = dropout_rate
-        self._input_emb = Embedding(
-            size=[src_vocab_size, src_emb_dim],
-            is_sparse=is_sparse,
-            padding_idx=0,
-            param_attr=fluid.ParamAttr(
-                name=word_emb_param_name,
-                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+        self._input_emb = Embedding(size=[src_vocab_size, src_emb_dim],
+                                    is_sparse=is_sparse,
+                                    padding_idx=0,
+                                    param_attr=fluid.ParamAttr(
+                                        name=word_emb_param_name,
+                                        initializer=fluid.initializer.Normal(
+                                            0., src_emb_dim**-0.5)))
 
         if pos_enc_param_name is pos_enc_param_names[0]:
             pos_inp = pos_inp1
@@ -599,8 +611,8 @@ def __init__(self,
 
     def forward(self, src_word, src_pos):
         src_word_emb = self._input_emb(src_word)
-        src_word_emb = fluid.layers.scale(
-            x=src_word_emb, scale=self._src_emb_dim**0.5)
+        src_word_emb = fluid.layers.scale(x=src_word_emb,
+                                          scale=self._src_emb_dim**0.5)
         # # TODO change this to fit dynamic length input
         src_pos_emb = self._pos_emb(src_pos)
         src_pos_emb.stop_gradient = True
@@ -613,6 +625,7 @@ def forward(self, src_word, src_pos):
 
 
 class WrapEncoderLayer(Layer):
+
     def __init__(self,
                  src_vocab_size,
                  max_length,
@@ -655,6 +668,7 @@ def forward(self, enc_inputs):
 
 
 class DecoderSubLayer(Layer):
+
     def __init__(self,
                  n_head,
                  d_key,
@@ -705,10 +719,11 @@ def __init__(self,
                                                         postprocess_cmd, None)
 
     def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
-        pre_process_rlt = self._pre_process_layer(
-            None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
-        slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
-                                                          None, slf_attn_bias)
+        pre_process_rlt = self._pre_process_layer(None, dec_input,
+                                                  self._preprocess_cmd,
+                                                  self._prepostprcess_dropout)
+        slf_attn_output = self._multihead_attention_layer(
+            pre_process_rlt, None, None, slf_attn_bias)
         slf_attn_output_pp = self._post_process_layer(
             dec_input, slf_attn_output, self._postprocess_cmd,
             self._prepostprcess_dropout)
@@ -717,9 +732,10 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
                                                     self._prepostprcess_dropout)
         enc_attn_output_pp = self._multihead_attention_layer2(
             pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
-        enc_attn_output = self._post_process_layer2(
-            slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
-            self._prepostprcess_dropout)
+        enc_attn_output = self._post_process_layer2(slf_attn_output_pp,
+                                                    enc_attn_output_pp,
+                                                    self._postprocess_cmd,
+                                                    self._prepostprcess_dropout)
         pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
                                                     self._preprocess_cmd,
                                                     self._prepostprcess_dropout)
@@ -731,6 +747,7 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
 
 
 class DecoderLayer(Layer):
+
     def __init__(self,
                  n_layer,
                  n_head,
@@ -756,25 +773,25 @@ def __init__(self,
             self._decoder_sub_layers.append(
                 self.add_sublayer(
                     'dsl_%d' % i,
-                    DecoderSubLayer(
-                        n_head,
-                        d_key,
-                        d_value,
-                        d_model,
-                        d_inner_hid,
-                        prepostprocess_dropout,
-                        attention_dropout,
-                        relu_dropout,
-                        preprocess_cmd,
-                        postprocess_cmd,
-                        cache=None if caches is None else caches[i],
-                        gather_idx=gather_idx)))
+                    DecoderSubLayer(n_head,
+                                    d_key,
+                                    d_value,
+                                    d_model,
+                                    d_inner_hid,
+                                    prepostprocess_dropout,
+                                    attention_dropout,
+                                    relu_dropout,
+                                    preprocess_cmd,
+                                    postprocess_cmd,
+                                    cache=None if caches is None else caches[i],
+                                    gather_idx=gather_idx)))
 
     def forward(self, dec_input, enc_output, dec_slf_attn_bias,
                 dec_enc_attn_bias):
         for i in range(self._n_layer):
-            tmp_dec_output = self._decoder_sub_layers[i](
-                dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
+            tmp_dec_output = self._decoder_sub_layers[i](dec_input, enc_output,
+                                                         dec_slf_attn_bias,
+                                                         dec_enc_attn_bias)
             dec_input = tmp_dec_output
 
         dec_output = self._pre_process_layer(None, tmp_dec_output,
@@ -784,6 +801,7 @@ def forward(self, dec_input, enc_output, dec_slf_attn_bias,
 
 
 class WrapDecoderLayer(Layer):
+
     def __init__(self,
                  trg_vocab_size,
                  max_length,
@@ -815,20 +833,19 @@ def __init__(self,
             is_sparse=is_sparse,
             word_emb_param_name=word_emb_param_names[1],
             pos_enc_param_name=pos_enc_param_names[1])
-        self._decoder_layer = DecoderLayer(
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            caches=caches,
-            gather_idx=gather_idx)
+        self._decoder_layer = DecoderLayer(n_layer,
+                                           n_head,
+                                           d_key,
+                                           d_value,
+                                           d_model,
+                                           d_inner_hid,
+                                           prepostprocess_dropout,
+                                           attention_dropout,
+                                           relu_dropout,
+                                           preprocess_cmd,
+                                           postprocess_cmd,
+                                           caches=caches,
+                                           gather_idx=gather_idx)
         self._weight_sharing = weight_sharing
         if not weight_sharing:
             self._fc = Linear(d_model, trg_vocab_size, bias_attr=False)
@@ -858,6 +875,7 @@ def forward(self, dec_inputs=None, enc_output=None):
 
 
 class TransFormer(Layer):
+
     def __init__(self,
                  src_vocab_size,
                  trg_vocab_size,
@@ -885,38 +903,36 @@ def __init__(self,
             assert src_vocab_size == trg_vocab_size, (
                 "Vocabularies in source and target should be same for weight sharing."
             )
-        self._wrap_encoder_layer = WrapEncoderLayer(
-            src_vocab_size,
-            max_length,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            weight_sharing,
-            is_sparse=is_sparse)
-        self._wrap_decoder_layer = WrapDecoderLayer(
-            trg_vocab_size,
-            max_length,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            weight_sharing,
-            is_sparse=is_sparse)
+        self._wrap_encoder_layer = WrapEncoderLayer(src_vocab_size,
+                                                    max_length,
+                                                    n_layer,
+                                                    n_head,
+                                                    d_key,
+                                                    d_value,
+                                                    d_model,
+                                                    d_inner_hid,
+                                                    prepostprocess_dropout,
+                                                    attention_dropout,
+                                                    relu_dropout,
+                                                    preprocess_cmd,
+                                                    postprocess_cmd,
+                                                    weight_sharing,
+                                                    is_sparse=is_sparse)
+        self._wrap_decoder_layer = WrapDecoderLayer(trg_vocab_size,
+                                                    max_length,
+                                                    n_layer,
+                                                    n_head,
+                                                    d_key,
+                                                    d_value,
+                                                    d_model,
+                                                    d_inner_hid,
+                                                    prepostprocess_dropout,
+                                                    attention_dropout,
+                                                    relu_dropout,
+                                                    preprocess_cmd,
+                                                    postprocess_cmd,
+                                                    weight_sharing,
+                                                    is_sparse=is_sparse)
 
         if weight_sharing:
             self._wrap_decoder_layer._prepare_decoder_layer._input_emb.weight = self._wrap_encoder_layer._prepare_encoder_layer._input_emb.weight
@@ -926,8 +942,8 @@ def forward(self, enc_inputs, dec_inputs, label, weights):
         predict = self._wrap_decoder_layer(dec_inputs, enc_output)
         if self._label_smooth_eps:
             label_out = fluid.layers.label_smooth(
-                label=fluid.layers.one_hot(
-                    input=label, depth=self._trg_vocab_size),
+                label=fluid.layers.one_hot(input=label,
+                                           depth=self._trg_vocab_size),
                 epsilon=self._label_smooth_eps)
 
         cost = fluid.layers.softmax_with_cross_entropy(
@@ -943,6 +959,7 @@ def forward(self, enc_inputs, dec_inputs, label, weights):
 
 
 class TestDygraphTransformerSortGradient(unittest.TestCase):
+
     def test_transformer_sort_gradient(self):
         for is_sparse in [True, False]:
             self.transformer_sort_gradient_float32(is_sparse)
@@ -955,26 +972,25 @@ def run_dygraph():
             fluid.set_flags({'FLAGS_new_executor_use_inplace': False})
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
-            transformer = TransFormer(
-                ModelHyperParams.src_vocab_size,
-                ModelHyperParams.trg_vocab_size,
-                ModelHyperParams.max_length + 1,
-                ModelHyperParams.n_layer,
-                ModelHyperParams.n_head,
-                ModelHyperParams.d_key,
-                ModelHyperParams.d_value,
-                ModelHyperParams.d_model,
-                ModelHyperParams.d_inner_hid,
-                ModelHyperParams.prepostprocess_dropout,
-                ModelHyperParams.attention_dropout,
-                ModelHyperParams.relu_dropout,
-                ModelHyperParams.preprocess_cmd,
-                ModelHyperParams.postprocess_cmd,
-                ModelHyperParams.weight_sharing,
-                TrainTaskConfig.label_smooth_eps,
-                use_py_reader=use_py_reader,
-                is_test=False,
-                is_sparse=is_sparse)
+            transformer = TransFormer(ModelHyperParams.src_vocab_size,
+                                      ModelHyperParams.trg_vocab_size,
+                                      ModelHyperParams.max_length + 1,
+                                      ModelHyperParams.n_layer,
+                                      ModelHyperParams.n_head,
+                                      ModelHyperParams.d_key,
+                                      ModelHyperParams.d_value,
+                                      ModelHyperParams.d_model,
+                                      ModelHyperParams.d_inner_hid,
+                                      ModelHyperParams.prepostprocess_dropout,
+                                      ModelHyperParams.attention_dropout,
+                                      ModelHyperParams.relu_dropout,
+                                      ModelHyperParams.preprocess_cmd,
+                                      ModelHyperParams.postprocess_cmd,
+                                      ModelHyperParams.weight_sharing,
+                                      TrainTaskConfig.label_smooth_eps,
+                                      use_py_reader=use_py_reader,
+                                      is_test=False,
+                                      is_sparse=is_sparse)
             if sync:
                 lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                     ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
@@ -1048,26 +1064,25 @@ def run_dygraph():
         with new_program_scope():
             paddle.seed(seed)
             paddle.framework.random._manual_program_seed(seed)
-            transformer = TransFormer(
-                ModelHyperParams.src_vocab_size,
-                ModelHyperParams.trg_vocab_size,
-                ModelHyperParams.max_length + 1,
-                ModelHyperParams.n_layer,
-                ModelHyperParams.n_head,
-                ModelHyperParams.d_key,
-                ModelHyperParams.d_value,
-                ModelHyperParams.d_model,
-                ModelHyperParams.d_inner_hid,
-                ModelHyperParams.prepostprocess_dropout,
-                ModelHyperParams.attention_dropout,
-                ModelHyperParams.relu_dropout,
-                ModelHyperParams.preprocess_cmd,
-                ModelHyperParams.postprocess_cmd,
-                ModelHyperParams.weight_sharing,
-                TrainTaskConfig.label_smooth_eps,
-                use_py_reader=use_py_reader,
-                is_test=False,
-                is_sparse=is_sparse)
+            transformer = TransFormer(ModelHyperParams.src_vocab_size,
+                                      ModelHyperParams.trg_vocab_size,
+                                      ModelHyperParams.max_length + 1,
+                                      ModelHyperParams.n_layer,
+                                      ModelHyperParams.n_head,
+                                      ModelHyperParams.d_key,
+                                      ModelHyperParams.d_value,
+                                      ModelHyperParams.d_model,
+                                      ModelHyperParams.d_inner_hid,
+                                      ModelHyperParams.prepostprocess_dropout,
+                                      ModelHyperParams.attention_dropout,
+                                      ModelHyperParams.relu_dropout,
+                                      ModelHyperParams.preprocess_cmd,
+                                      ModelHyperParams.postprocess_cmd,
+                                      ModelHyperParams.weight_sharing,
+                                      TrainTaskConfig.label_smooth_eps,
+                                      use_py_reader=use_py_reader,
+                                      is_test=False,
+                                      is_sparse=is_sparse)
             exe = fluid.Executor(fluid.CPUPlace(
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             optimizer = fluid.optimizer.SGD(learning_rate=0.003)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
index 3a8a3a96e9a33..b814ca87dcd76 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
@@ -23,6 +23,7 @@
 
 
 def _dygraph_guard_(func):
+
     def __impl__(*args, **kwargs):
         if fluid._non_static_mode():
             return func(*args, **kwargs)
@@ -43,30 +44,36 @@ def random_var(size, low=-1, high=1, dtype='float32'):
 
 
 class TestDygraphTripleGradMatmul(TestCase):
+
     def test_matmul_triple_grad(self):
         input_numpy = np.ones([3, 3]) * 2
         with _test_eager_guard():
-            x = paddle.to_tensor(
-                input_numpy, stop_gradient=False, dtype='float32')
-            y = paddle.to_tensor(
-                input_numpy, stop_gradient=False, dtype='float32')
+            x = paddle.to_tensor(input_numpy,
+                                 stop_gradient=False,
+                                 dtype='float32')
+            y = paddle.to_tensor(input_numpy,
+                                 stop_gradient=False,
+                                 dtype='float32')
             out = paddle.matmul(x, y, False, False)
 
-            new_out_g = paddle.to_tensor(
-                np.ones([3, 3]), stop_gradient=False, dtype='float32')
-            new_x_g, new_y_g = paddle.grad(
-                [out], [x, y], [new_out_g],
-                retain_graph=True,
-                create_graph=True)
-
-            new_x_g_g = paddle.to_tensor(
-                np.ones([3, 3]), stop_gradient=False, dtype='float32')
-            new_y_g_g = paddle.to_tensor(
-                np.ones([3, 3]), stop_gradient=False, dtype='float32')
-            new_a, new_b, new_c = paddle.grad(
-                [new_x_g, new_y_g], [x, y, new_out_g], [new_x_g_g, new_y_g_g],
-                retain_graph=True,
-                create_graph=True)
+            new_out_g = paddle.to_tensor(np.ones([3, 3]),
+                                         stop_gradient=False,
+                                         dtype='float32')
+            new_x_g, new_y_g = paddle.grad([out], [x, y], [new_out_g],
+                                           retain_graph=True,
+                                           create_graph=True)
+
+            new_x_g_g = paddle.to_tensor(np.ones([3, 3]),
+                                         stop_gradient=False,
+                                         dtype='float32')
+            new_y_g_g = paddle.to_tensor(np.ones([3, 3]),
+                                         stop_gradient=False,
+                                         dtype='float32')
+            new_a, new_b, new_c = paddle.grad([new_x_g, new_y_g],
+                                              [x, y, new_out_g],
+                                              [new_x_g_g, new_y_g_g],
+                                              retain_graph=True,
+                                              create_graph=True)
 
             new_a.backward()
 
@@ -105,6 +112,7 @@ def test_matmul_triple_grad(self):
 
 
 class TestDygraphTripleGrad(TestCase):
+
     def setUp(self):
         self.sort_sum_gradient = False
         self.shape = [5, 5]
@@ -118,14 +126,13 @@ def grad(self,
              create_graph=False,
              allow_unused=False):
         fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient})
-        return fluid.dygraph.grad(
-            outputs=outputs,
-            inputs=inputs,
-            grad_outputs=grad_outputs,
-            no_grad_vars=no_grad_vars,
-            retain_graph=retain_graph,
-            create_graph=create_graph,
-            allow_unused=allow_unused)
+        return fluid.dygraph.grad(outputs=outputs,
+                                  inputs=inputs,
+                                  grad_outputs=grad_outputs,
+                                  no_grad_vars=no_grad_vars,
+                                  retain_graph=retain_graph,
+                                  create_graph=create_graph,
+                                  allow_unused=allow_unused)
 
     @dygraph_guard
     def func_exception(self):
@@ -151,8 +158,8 @@ def func_exception(self):
                       [random_var(shape)], [random_var(shape)])
 
         with self.assertRaises(AssertionError):
-            self.grad(
-                [random_var(shape)], [random_var(shape)], no_grad_vars=[1])
+            self.grad([random_var(shape)], [random_var(shape)],
+                      no_grad_vars=[1])
 
         with self.assertRaises(AssertionError):
             self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
@@ -209,14 +216,17 @@ def func_example_with_gradient_and_create_graph(self):
         self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected))
 
     def test_all_cases(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         self.func_exception()
         self.func_example_with_gradient_and_create_graph()
         with _test_eager_guard():
             self.func_exception()
             self.func_example_with_gradient_and_create_graph()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 class TestDygraphTripleGradBradcastCase(TestCase):
+
     def setUp(self):
         self.sort_sum_gradient = False
         self.x_shape = [3, 2, 2]
@@ -232,14 +242,13 @@ def grad(self,
              create_graph=False,
              allow_unused=False):
         fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient})
-        return fluid.dygraph.grad(
-            outputs=outputs,
-            inputs=inputs,
-            grad_outputs=grad_outputs,
-            no_grad_vars=no_grad_vars,
-            retain_graph=retain_graph,
-            create_graph=create_graph,
-            allow_unused=allow_unused)
+        return fluid.dygraph.grad(outputs=outputs,
+                                  inputs=inputs,
+                                  grad_outputs=grad_outputs,
+                                  no_grad_vars=no_grad_vars,
+                                  retain_graph=retain_graph,
+                                  create_graph=create_graph,
+                                  allow_unused=allow_unused)
 
     @dygraph_guard
     def func_example_with_gradient_and_create_graph(self):
@@ -262,24 +271,21 @@ def func_example_with_gradient_and_create_graph(self):
         dx_actual, = self.grad([out], [x], create_graph=True)
         # Theoritical result based on math calculation
         dout = np.ones(self.x_shape).astype('float32')
-        dx_expected = np.matmul(
-            dout * out_np * (1 - out_np), np.transpose(
-                y_np, axes=(0, 2, 1)))
+        dx_expected = np.matmul(dout * out_np * (1 - out_np),
+                                np.transpose(y_np, axes=(0, 2, 1)))
         self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
 
         ddx_actual, = self.grad([dx_actual], [x], create_graph=True)
         # Theoritical result based on math calculation
         DDY = np.zeros(self.y_shape).astype('float32')
         DDX = np.ones(self.x_shape).astype('float32')
-        double_grad_tmp1 = np.matmul(
-            dout * out_np * (1 - out_np), np.transpose(
-                DDY, axes=(0, 2, 1)))
+        double_grad_tmp1 = np.matmul(dout * out_np * (1 - out_np),
+                                     np.transpose(DDY, axes=(0, 2, 1)))
         double_grad_tmp2 = np.matmul(DDX, y_np) + np.matmul(x_np, DDY)
         double_grad_tmp3 = (
             1 - 2 * out_np) * dout * double_grad_tmp2 * out_np * (1 - out_np)
         ddx_expected = double_grad_tmp1 + np.matmul(
-            double_grad_tmp3, np.transpose(
-                y_np, axes=(0, 2, 1)))
+            double_grad_tmp3, np.transpose(y_np, axes=(0, 2, 1)))
         self.assertTrue(np.allclose(ddx_actual.numpy(), ddx_expected))
 
         # Theoritical result based on math calculation
@@ -288,19 +294,19 @@ def func_example_with_gradient_and_create_graph(self):
         tmp1 = (1 - 2 * out_np) * ((1 - 2 * out_np) * dout * tmp0 * tmp0)
         tmp2 = tmp0 * (1 - 2 * out_np) * d_ddout - 2 * dout * (
             1 - out_np) * out_np * tmp0 * tmp0
-        dddx_expected = np.matmul(
-            ((tmp1 + tmp2) * out_np * (1 - out_np)),
-            np.transpose(
-                y_np, axes=(0, 2, 1)))
+        dddx_expected = np.matmul(((tmp1 + tmp2) * out_np * (1 - out_np)),
+                                  np.transpose(y_np, axes=(0, 2, 1)))
 
         ddx_actual.backward()
         dddx_grad_actual = x.gradient()
         self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected))
 
     def test_all_cases(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         self.func_example_with_gradient_and_create_graph()
         with _test_eager_guard():
             self.func_example_with_gradient_and_create_graph()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
index 46a89efcec491..84180fa299bdb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
@@ -21,6 +21,7 @@
 
 
 class TestImperativeUsingNonZeroGpu(unittest.TestCase):
+
     def run_main(self, np_arr, place):
         with guard(place):
             var = to_variable(np_arr)
diff --git a/python/paddle/fluid/tests/unittests/test_increment.py b/python/paddle/fluid/tests/unittests/test_increment.py
index 38f6a546071b0..d7b0f0632950a 100755
--- a/python/paddle/fluid/tests/unittests/test_increment.py
+++ b/python/paddle/fluid/tests/unittests/test_increment.py
@@ -22,10 +22,12 @@
 
 
 class TestIncrement(unittest.TestCase):
+
     def test_api(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.layers.fill_constant(
-                shape=[1], dtype='int64', value=5)
+            input = fluid.layers.fill_constant(shape=[1],
+                                               dtype='int64',
+                                               value=5)
             expected_result = np.array([8], dtype='int64')
 
             output = paddle.tensor.math.increment(input, value=3)
@@ -41,6 +43,7 @@ def test_api(self):
 
 
 class TestInplaceApiWithDataTransform(unittest.TestCase):
+
     def test_increment(self):
         if fluid.core.is_compiled_with_cuda():
             paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py
index 4da03c9643fa9..e6a76a3a0251e 100644
--- a/python/paddle/fluid/tests/unittests/test_index_sample_op.py
+++ b/python/paddle/fluid/tests/unittests/test_index_sample_op.py
@@ -22,14 +22,16 @@
 
 
 class TestIndexSampleOp(OpTest):
+
     def setUp(self):
         self.op_type = "index_sample"
         self.python_api = paddle.index_sample
         self.config()
         xnp = np.random.random(self.x_shape).astype(self.x_type)
-        indexnp = np.random.randint(
-            low=0, high=self.x_shape[1],
-            size=self.index_shape).astype(self.index_type)
+        indexnp = np.random.randint(low=0,
+                                    high=self.x_shape[1],
+                                    size=self.index_shape).astype(
+                                        self.index_type)
         self.inputs = {'X': xnp, 'Index': indexnp}
         index_array = []
         for i in range(self.index_shape[0]):
@@ -56,6 +58,7 @@ def config(self):
 
 
 class TestCase1(TestIndexSampleOp):
+
     def config(self):
         """
         For one dimension input
@@ -67,6 +70,7 @@ def config(self):
 
 
 class TestCase2(TestIndexSampleOp):
+
     def config(self):
         """
         For int64_t index type
@@ -78,6 +82,7 @@ def config(self):
 
 
 class TestCase3(TestIndexSampleOp):
+
     def config(self):
         """
         For int index type
@@ -89,6 +94,7 @@ def config(self):
 
 
 class TestCase4(TestIndexSampleOp):
+
     def config(self):
         """
         For int64 index type
@@ -100,6 +106,7 @@ def config(self):
 
 
 class TestIndexSampleShape(unittest.TestCase):
+
     def test_shape(self):
         paddle.enable_static()
         # create x value
@@ -110,8 +117,8 @@ def test_shape(self):
         # create index value
         index_shape = (2, 3)
         index_type = "int32"
-        index_np = np.random.randint(
-            low=0, high=x_shape[1], size=index_shape).astype(index_type)
+        index_np = np.random.randint(low=0, high=x_shape[1],
+                                     size=index_shape).astype(index_type)
 
         x = fluid.data(name='x', shape=[-1, 5], dtype='float64')
         index = fluid.data(name='index', shape=[-1, 3], dtype='int32')
@@ -126,18 +133,18 @@ def test_shape(self):
 
 
 class TestIndexSampleDynamic(unittest.TestCase):
+
     def test_result(self):
         with fluid.dygraph.guard():
-            x = paddle.to_tensor(
-                [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
-                 [9.0, 10.0, 11.0, 12.0]],
-                dtype='float32')
-            index = paddle.to_tensor(
-                [[0, 1, 2], [1, 2, 3], [0, 0, 0]], dtype='int32')
+            x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
+                                  [9.0, 10.0, 11.0, 12.0]],
+                                 dtype='float32')
+            index = paddle.to_tensor([[0, 1, 2], [1, 2, 3], [0, 0, 0]],
+                                     dtype='int32')
             out_z1 = paddle.index_sample(x, index)
 
-            except_output = np.array(
-                [[1.0, 2.0, 3.0], [6.0, 7.0, 8.0], [9.0, 9.0, 9.0]])
+            except_output = np.array([[1.0, 2.0, 3.0], [6.0, 7.0, 8.0],
+                                      [9.0, 9.0, 9.0]])
             assert out_z1.numpy().all() == except_output.all()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_index_select_op.py b/python/paddle/fluid/tests/unittests/test_index_select_op.py
index 0c0e946fddede..c8bb7890964dc 100644
--- a/python/paddle/fluid/tests/unittests/test_index_select_op.py
+++ b/python/paddle/fluid/tests/unittests/test_index_select_op.py
@@ -24,12 +24,14 @@
 
 
 class TestIndexSelectOp(OpTest):
+
     def setUp(self):
         self.python_api = paddle.index_select
         self.op_type = "index_select"
         self.init_dtype_type()
-        index_np = np.random.randint(
-            low=0, high=self.x_shape[self.dim], size=self.index_size)
+        index_np = np.random.randint(low=0,
+                                     high=self.x_shape[self.dim],
+                                     size=self.index_size)
         x_np = np.random.random(self.x_shape).astype(self.x_type)
         self.inputs = {'X': x_np, 'Index': index_np}
         self.attrs = {'dim': self.dim}
@@ -62,6 +64,7 @@ def test_check_grad_normal(self):
 
 
 class TestIndexSelectOpCase2(TestIndexSelectOp):
+
     def init_dtype_type(self):
         self.x_type = np.float32
         self.index_type = np.int32
@@ -71,6 +74,7 @@ def init_dtype_type(self):
 
 
 class TestIndexSelectOpCaseSingleThread(TestIndexSelectOp):
+
     def init_dtype_type(self):
         if fluid.is_compiled_with_cuda():
             fluid.set_flags({'FLAGS_cudnn_deterministic': True})
@@ -82,6 +86,7 @@ def init_dtype_type(self):
 
 
 class TestIndexSelectAPI(unittest.TestCase):
+
     def input_data(self):
         self.data_x = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
                                 [9.0, 10.0, 11.0, 12.0]])
@@ -93,12 +98,16 @@ def test_index_select_api(self):
         # case 1:
         with program_guard(Program(), Program()):
             x = fluid.layers.data(name='x', shape=[-1, 4])
-            index = fluid.layers.data(
-                name='index', shape=[3], dtype='int32', append_batch_size=False)
+            index = fluid.layers.data(name='index',
+                                      shape=[3],
+                                      dtype='int32',
+                                      append_batch_size=False)
             z = paddle.index_select(x, index, axis=1)
             exe = fluid.Executor(fluid.CPUPlace())
-            res, = exe.run(feed={'x': self.data_x,
-                                 'index': self.data_index},
+            res, = exe.run(feed={
+                'x': self.data_x,
+                'index': self.data_index
+            },
                            fetch_list=[z.name],
                            return_numpy=False)
         expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0],
@@ -108,16 +117,20 @@ def test_index_select_api(self):
         # case 2:
         with program_guard(Program(), Program()):
             x = fluid.layers.data(name='x', shape=[-1, 4])
-            index = fluid.layers.data(
-                name='index', shape=[3], dtype='int32', append_batch_size=False)
+            index = fluid.layers.data(name='index',
+                                      shape=[3],
+                                      dtype='int32',
+                                      append_batch_size=False)
             z = paddle.index_select(x, index)
             exe = fluid.Executor(fluid.CPUPlace())
-            res, = exe.run(feed={'x': self.data_x,
-                                 'index': self.data_index},
+            res, = exe.run(feed={
+                'x': self.data_x,
+                'index': self.data_index
+            },
                            fetch_list=[z.name],
                            return_numpy=False)
-        expect_out = np.array(
-            [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [5.0, 6.0, 7.0, 8.0]])
+        expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
+                               [5.0, 6.0, 7.0, 8.0]])
         self.assertTrue(np.allclose(expect_out, np.array(res)))
 
     def test_dygraph_api(self):
@@ -128,8 +141,8 @@ def test_dygraph_api(self):
             index = fluid.dygraph.to_variable(self.data_index)
             z = paddle.index_select(x, index)
             np_z = z.numpy()
-        expect_out = np.array(
-            [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [5.0, 6.0, 7.0, 8.0]])
+        expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
+                               [5.0, 6.0, 7.0, 8.0]])
         self.assertTrue(np.allclose(expect_out, np_z))
 
         # case 2:
diff --git a/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py b/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
index 3656cdfd5a034..a1e0da500e422 100644
--- a/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
@@ -23,6 +23,7 @@
 
 
 class TestInferNoNeedBufferSlots(unittest.TestCase):
+
     def net(self):
         x1 = fluid.default_main_program().global_block().create_var(
             dtype="float32", shape=[1], lod_level=0, name="x1")
diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py
index 553ebaec7f1bc..c3e58ddaac11c 100644
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
@@ -21,6 +21,7 @@
 
 
 class TestInferShape(unittest.TestCase):
+
     def test_sum_op(self):
         prog = core.ProgramDesc()
         self.assertIsNotNone(prog)
diff --git a/python/paddle/fluid/tests/unittests/test_inference_api.py b/python/paddle/fluid/tests/unittests/test_inference_api.py
index 7ed908eb33b81..a590dcecbfe0e 100644
--- a/python/paddle/fluid/tests/unittests/test_inference_api.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_api.py
@@ -15,6 +15,7 @@
 import os, shutil
 import unittest
 import paddle
+
 paddle.enable_static()
 import numpy as np
 import paddle.fluid as fluid
@@ -25,13 +26,14 @@
 
 
 class TestInferenceApi(unittest.TestCase):
+
     def test_inference_api(self):
         tensor32 = np.random.randint(10, 20, size=[20, 2]).astype('int32')
         paddletensor32 = PaddleTensor(tensor32)
         dtype32 = paddletensor32.dtype
         self.assertEqual(dtype32, PaddleDType.INT32)
-        self.assertEqual(
-            paddletensor32.data.tolist('int32'), tensor32.ravel().tolist())
+        self.assertEqual(paddletensor32.data.tolist('int32'),
+                         tensor32.ravel().tolist())
         paddletensor32.data.reset(tensor32)
         self.assertEqual(paddletensor32.as_ndarray().ravel().tolist(),
                          tensor32.ravel().tolist())
@@ -40,8 +42,8 @@ def test_inference_api(self):
         paddletensor64 = PaddleTensor(tensor64)
         dtype64 = paddletensor64.dtype
         self.assertEqual(dtype64, PaddleDType.INT64)
-        self.assertEqual(
-            paddletensor64.data.tolist('int64'), tensor64.ravel().tolist())
+        self.assertEqual(paddletensor64.data.tolist('int64'),
+                         tensor64.ravel().tolist())
         paddletensor64.data.reset(tensor64)
         self.assertEqual(paddletensor64.as_ndarray().ravel().tolist(),
                          tensor64.ravel().tolist())
@@ -50,9 +52,8 @@ def test_inference_api(self):
         paddletensor_float = PaddleTensor(tensor_float)
         dtype_float = paddletensor_float.dtype
         self.assertEqual(dtype_float, PaddleDType.FLOAT32)
-        self.assertEqual(
-            paddletensor_float.data.tolist('float32'),
-            tensor_float.ravel().tolist())
+        self.assertEqual(paddletensor_float.data.tolist('float32'),
+                         tensor_float.ravel().tolist())
         paddletensor_float.data.reset(tensor_float)
         self.assertEqual(paddletensor_float.as_ndarray().ravel().tolist(),
                          tensor_float.ravel().tolist())
@@ -66,23 +67,24 @@ def get_sample_model():
     startup_program = fluid.Program()
     with fluid.program_guard(main_program, startup_program):
         data = fluid.data(name="data", shape=[-1, 6, 64, 64], dtype="float32")
-        conv_out = fluid.layers.conv2d(
-            input=data,
-            num_filters=3,
-            filter_size=3,
-            groups=1,
-            padding=0,
-            bias_attr=False,
-            act=None)
+        conv_out = fluid.layers.conv2d(input=data,
+                                       num_filters=3,
+                                       filter_size=3,
+                                       groups=1,
+                                       padding=0,
+                                       bias_attr=False,
+                                       act=None)
     exe.run(startup_program)
-    serialized_program = paddle.static.serialize_program(
-        data, conv_out, program=main_program)
+    serialized_program = paddle.static.serialize_program(data,
+                                                         conv_out,
+                                                         program=main_program)
     serialized_params = paddle.static.serialize_persistables(
         data, conv_out, executor=exe, program=main_program)
     return serialized_program, serialized_params
 
 
 class TestInferenceBaseAPI(unittest.TestCase):
+
     def get_config(self, model, params):
         config = Config()
         config.set_model_buffer(model, len(model), params, len(params))
diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
index 9abcf2a767662..c19c2c65e6e6a 100644
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
@@ -31,10 +31,12 @@
 from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.io import save_inference_model, load_inference_model, save_persistables
 from paddle.fluid.transpiler import memory_optimize
+
 paddle.enable_static()
 
 
 class InferModel(object):
+
     def __init__(self, list):
         self.program = list[0]
         self.feed_var_names = list[1]
@@ -42,6 +44,7 @@ def __init__(self, list):
 
 
 class TestBook(unittest.TestCase):
+
     def test_fit_line_inference_model(self):
         MODEL_DIR = "./tmp/inference_model"
         UNI_MODEL_DIR = "./tmp/inference_model1"
@@ -67,13 +70,15 @@ def test_fit_line_inference_model(self):
         exe.run(init_program, feed={}, fetch_list=[])
 
         for i in six.moves.xrange(100):
-            tensor_x = np.array(
-                [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
+            tensor_x = np.array([[1, 1], [1, 2], [3, 4], [5,
+                                                          2]]).astype("float32")
             tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
 
             exe.run(program,
-                    feed={'x': tensor_x,
-                          'y': tensor_y},
+                    feed={
+                        'x': tensor_x,
+                        'y': tensor_y
+                    },
                     fetch_list=[avg_cost])
 
         # Separated model and unified model
@@ -85,8 +90,10 @@ def test_fit_line_inference_model(self):
         params_str = save_persistables(exe, None, main_program, None)
 
         expected = exe.run(program,
-                           feed={'x': tensor_x,
-                                 'y': tensor_y},
+                           feed={
+                               'x': tensor_x,
+                               'y': tensor_y
+                           },
                            fetch_list=[avg_cost])[0]
 
         six.moves.reload_module(executor)  # reload to build a new scope
@@ -116,6 +123,7 @@ def test_fit_line_inference_model(self):
 
 
 class TestSaveInferenceModel(unittest.TestCase):
+
     def test_save_inference_model(self):
         MODEL_DIR = "./tmp/inference_model2"
         init_program = Program()
@@ -166,6 +174,7 @@ def test_save_inference_model_with_auc(self):
 
 
 class TestInstance(unittest.TestCase):
+
     def test_save_inference_model(self):
         MODEL_DIR = "./tmp/inference_model3"
         init_program = Program()
@@ -196,6 +205,7 @@ def test_save_inference_model(self):
 
 
 class TestSaveInferenceModelNew(unittest.TestCase):
+
     def test_save_and_load_inference_model(self):
         MODEL_DIR = "./tmp/inference_model5"
         init_program = fluid.default_startup_program()
@@ -222,8 +232,10 @@ def test_save_and_load_inference_model(self):
         tensor_y = np.array([[-2], [-3], [-7]]).astype("float32")
         for i in six.moves.xrange(3):
             exe.run(program,
-                    feed={'x': tensor_x,
-                          'y': tensor_y},
+                    feed={
+                        'x': tensor_x,
+                        'y': tensor_y
+                    },
                     fetch_list=[avg_cost])
 
         self.assertRaises(ValueError, paddle.static.save_inference_model, None,
@@ -258,8 +270,10 @@ def test_save_and_load_inference_model(self):
         self.assertTrue(os.path.exists(MODEL_DIR + ".pdiparams"))
 
         expected = exe.run(program,
-                           feed={'x': tensor_x,
-                                 'y': tensor_y},
+                           feed={
+                               'x': tensor_x,
+                               'y': tensor_y
+                           },
                            fetch_list=[avg_cost])[0]
 
         six.moves.reload_module(executor)  # reload to build a new scope
@@ -270,28 +284,25 @@ def test_save_and_load_inference_model(self):
                           MODEL_DIR + "/", exe)
         self.assertRaises(ValueError, paddle.static.load_inference_model,
                           [MODEL_DIR], exe)
-        self.assertRaises(
-            ValueError,
-            paddle.static.load_inference_model,
-            MODEL_DIR,
-            exe,
-            pserver_endpoints=None)
-        self.assertRaises(
-            ValueError,
-            paddle.static.load_inference_model,
-            MODEL_DIR,
-            exe,
-            unsupported_param=None)
-        self.assertRaises(
-            (TypeError, ValueError),
-            paddle.static.load_inference_model,
-            None,
-            exe,
-            model_filename="illegal",
-            params_filename="illegal")
-
-        model = InferModel(
-            paddle.static.io.load_inference_model(MODEL_DIR, exe))
+        self.assertRaises(ValueError,
+                          paddle.static.load_inference_model,
+                          MODEL_DIR,
+                          exe,
+                          pserver_endpoints=None)
+        self.assertRaises(ValueError,
+                          paddle.static.load_inference_model,
+                          MODEL_DIR,
+                          exe,
+                          unsupported_param=None)
+        self.assertRaises((TypeError, ValueError),
+                          paddle.static.load_inference_model,
+                          None,
+                          exe,
+                          model_filename="illegal",
+                          params_filename="illegal")
+
+        model = InferModel(paddle.static.io.load_inference_model(
+            MODEL_DIR, exe))
 
         outs = exe.run(model.program,
                        feed={
@@ -340,8 +351,10 @@ def test_serialize_program_and_persistables(self):
         tensor_y = np.array([[-2], [-3], [-7]]).astype("float32")
         for i in six.moves.xrange(3):
             exe.run(program,
-                    feed={'x': tensor_x,
-                          'y': tensor_y},
+                    feed={
+                        'x': tensor_x,
+                        'y': tensor_y
+                    },
                     fetch_list=[avg_cost])
 
         # test if return type of serialize_program is bytes
@@ -381,8 +394,10 @@ def test_normalize_program(self):
         tensor_y = np.array([[-2], [-3], [-7]]).astype("float32")
         for i in six.moves.xrange(3):
             exe.run(program,
-                    feed={'x': tensor_x,
-                          'y': tensor_y},
+                    feed={
+                        'x': tensor_x,
+                        'y': tensor_y
+                    },
                     fetch_list=[avg_cost])
 
         # test if return type of serialize_program is bytes
@@ -400,6 +415,7 @@ def test_normalize_program(self):
 
 
 class TestLoadInferenceModelError(unittest.TestCase):
+
     def test_load_model_not_exist(self):
         place = core.CPUPlace()
         exe = executor.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index 52137b22a790c..7138c2393ffca 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -42,15 +42,16 @@ def output_hist(out):
 
 
 class TestConstantInitializer(unittest.TestCase):
+
     def test_calculate_gain(self):
         self.assertEqual(paddle.nn.initializer.calculate_gain('sigmoid'), 1)
         self.assertEqual(paddle.nn.initializer.calculate_gain('linear'), 1)
         self.assertEqual(paddle.nn.initializer.calculate_gain('conv2d'), 1)
         self.assertEqual(paddle.nn.initializer.calculate_gain('tanh'), 5.0 / 3)
-        self.assertEqual(
-            paddle.nn.initializer.calculate_gain('relu'), math.sqrt(2.0))
-        self.assertEqual(
-            paddle.nn.initializer.calculate_gain('leaky_relu', 1), 1)
+        self.assertEqual(paddle.nn.initializer.calculate_gain('relu'),
+                         math.sqrt(2.0))
+        self.assertEqual(paddle.nn.initializer.calculate_gain('leaky_relu', 1),
+                         1)
         self.assertEqual(paddle.nn.initializer.calculate_gain('selu'), 3.0 / 4)
 
     def test_constant_initializer_default_value(self, dtype="float32"):
@@ -106,18 +107,18 @@ def test_constant_initializer_bf16(self):
 
 
 class TestUniformInitializer(unittest.TestCase):
+
     def test_uniform_initializer_default_value(self, dtype="float32"):
         """Test the uniform initializer with default value
         """
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.UniformInitializer())
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.UniformInitializer())
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -134,12 +135,11 @@ def test_uniform_initializer_random_seed(self):
         program.random_seed = 123
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param1",
-                initializer=initializer.UniformInitializer())
+            block.create_parameter(dtype="float32",
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param1",
+                                   initializer=initializer.UniformInitializer())
             block.create_parameter(
                 dtype="float32",
                 shape=[5, 10],
@@ -157,12 +157,12 @@ def test_uniform_initializer(self, dtype="float32"):
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.UniformInitializer(
+                                       -4.2, 3.1, 123))
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -178,12 +178,12 @@ def test_uniform_initializer_two_op(self, dtype="float32"):
         program = framework.Program()
         block = program.global_block()
         for i in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.UniformInitializer(-4.2, float(i), 123))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.UniformInitializer(
+                                       -4.2, float(i), 123))
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op0 = block.ops[0]
@@ -213,18 +213,18 @@ def test_uniform_initializer_bf16(self):
 
 
 class TestNormalInitializer(unittest.TestCase):
+
     def test_normal_initializer_default_value(self):
         """Test the normal initializer with default value
         """
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.NormalInitializer())
+            block.create_parameter(dtype="float32",
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.NormalInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -238,12 +238,12 @@ def test_normal_initializer(self, dtype="float32"):
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.NormalInitializer(2.3, 1.9, 123))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.NormalInitializer(
+                                       2.3, 1.9, 123))
         num_ops = 2 if (dtype == "float16" or dtype == "uint16") else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -265,6 +265,7 @@ def test_normal_initializer_bf16(self):
 
 
 class TestXavierInitializer(unittest.TestCase):
+
     def test_uniform_xavier_initializer(self):
         """Test Xavier initializer with uniform distribution on
            for matrix multiply.
@@ -303,8 +304,8 @@ def test_uniform_xavier_initializer_conv(self):
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
         receptive_field_size = float(15 * 20)
-        limit = np.sqrt(6.0 / (
-            (param.shape[0] + param.shape[1]) * receptive_field_size))
+        limit = np.sqrt(
+            6.0 / ((param.shape[0] + param.shape[1]) * receptive_field_size))
         self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
         self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 0)
@@ -347,8 +348,8 @@ def test_normal_xavier_initializer_conv(self):
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
         receptive_field_size = float(15 * 20)
-        std = np.sqrt(2.0 / (
-            (param.shape[0] + param.shape[1]) * receptive_field_size))
+        std = np.sqrt(
+            2.0 / ((param.shape[0] + param.shape[1]) * receptive_field_size))
         self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
         self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 0)
@@ -361,15 +362,17 @@ def test_xavier_initializer_supplied_arguments(self,
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.XavierInitializer(
-                    uniform=uniform, fan_in=12, fan_out=23, seed=134))
-        num_ops = 2 if (dtype == "float16" or (dtype == "uint16" and
-                                               not uniform)) else 1
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.XavierInitializer(
+                                       uniform=uniform,
+                                       fan_in=12,
+                                       fan_out=23,
+                                       seed=134))
+        num_ops = 2 if (dtype == "float16" or
+                        (dtype == "uint16" and not uniform)) else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
         if uniform:
@@ -400,6 +403,7 @@ def test_xavier_initializer_bf16(self):
 
 
 class TestMSRAInitializer(unittest.TestCase):
+
     def test_uniform_msra_initializer(self):
         """Test MSRA initializer with uniform distribution on
            for matrix multiply.
@@ -492,13 +496,12 @@ def test_msra_initializer_supplied_arguments(self, dtype="float32"):
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.MSRAInitializer(
-                    fan_in=12, seed=134))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.MSRAInitializer(
+                                       fan_in=12, seed=134))
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -522,6 +525,7 @@ def test_msra_initializer_bf16(self):
 
 
 class TestBilinearInitializer(unittest.TestCase):
+
     def test_bilinear_initializer(self, dtype="float32"):
         """Test the bilinear initializer with supplied arguments
         """
@@ -560,6 +564,7 @@ def test_type_error(self):
 
 
 class TestNumpyArrayInitializer(unittest.TestCase):
+
     def test_numpy_array_initializer(self, dtype="float32"):
         """Test the numpy array initializer with supplied arguments
         """
@@ -595,6 +600,7 @@ def test_numpy_array_initializer_bf16(self):
 
 
 class TestSetGlobalInitializer(unittest.TestCase):
+
     def test_set_global_weight_initilizer(self):
         """Test Set Global Param initilizer with UniformInitializer
         """
@@ -626,11 +632,9 @@ def test_set_global_bias_initilizer(self):
         """
         main_prog = framework.Program()
         startup_prog = framework.Program()
-        fluid.set_global_initializer(
-            initializer.Uniform(
-                low=-0.5, high=0.5),
-            bias_init=initializer.Normal(
-                loc=0.0, scale=2.0))
+        fluid.set_global_initializer(initializer.Uniform(low=-0.5, high=0.5),
+                                     bias_init=initializer.Normal(loc=0.0,
+                                                                  scale=2.0))
         with fluid.program_guard(main_prog, startup_prog):
             x = fluid.data(name="x", shape=[1, 3, 32, 32])
             # default initilizer of bias in layers.conv2d is ConstantInitializer
@@ -655,6 +659,7 @@ def test_set_global_bias_initilizer(self):
 
 
 class TestUniformInitializerDygraph(unittest.TestCase):
+
     def func_uniform_initializer(self, dtype="float32"):
         """
         In dygraph mode, we can use initializer directly to initialize a tensor.
@@ -673,9 +678,8 @@ def func_uniform_initializer(self, dtype="float32"):
 
         hist, prob = output_hist(tensor.numpy())
 
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=1e-3), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=1e-3),
+                        "hist: " + str(hist))
 
         paddle.enable_static()
 
@@ -686,6 +690,7 @@ def test_uniform_initializer(self, dtype="float32"):
 
 
 class TestXavierInitializerDygraph(unittest.TestCase):
+
     def func_xvarier_initializer(self, dtype="float32"):
         """
         In dygraph mode, we can use initializer directly to initialize a tensor.
@@ -695,8 +700,9 @@ def func_xvarier_initializer(self, dtype="float32"):
         tensor = paddle.zeros([1024, 1024, 16])
         tensor.stop_gradient = False
 
-        xavier_ = paddle.fluid.initializer.XavierInitializer(
-            uniform=False, fan_in=3, fan_out=5)
+        xavier_ = paddle.fluid.initializer.XavierInitializer(uniform=False,
+                                                             fan_in=3,
+                                                             fan_out=5)
         xavier_(tensor)
 
         hist, _ = output_hist(tensor.numpy())
@@ -704,10 +710,8 @@ def func_xvarier_initializer(self, dtype="float32"):
         hist2, _ = output_hist(
             np.random.normal(0, np.sqrt(2.0 / (3 + 5)), [1024, 1024, 16]))
 
-        self.assertTrue(
-            np.allclose(
-                hist, hist2, rtol=0, atol=0.01),
-            "hist: " + str(hist) + " hist2: " + str(hist2))
+        self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01),
+                        "hist: " + str(hist) + " hist2: " + str(hist2))
         paddle.enable_static()
 
     def test_xavier_initializer(self, dtype="float32"):
@@ -717,6 +721,7 @@ def test_xavier_initializer(self, dtype="float32"):
 
 
 class TestMSRAInitializerDygraph(unittest.TestCase):
+
     def func_msra_initializer(self, dtype="float32"):
         """
         In dygraph mode, we can use initializer directly to initialize a tensor.
@@ -726,8 +731,8 @@ def func_msra_initializer(self, dtype="float32"):
         tensor = paddle.zeros([1024, 1024, 16])
         tensor.stop_gradient = False
 
-        msra_ = paddle.fluid.initializer.MSRAInitializer(
-            uniform=False, fan_in=4)
+        msra_ = paddle.fluid.initializer.MSRAInitializer(uniform=False,
+                                                         fan_in=4)
         msra_(tensor)
 
         hist, _ = output_hist(tensor.numpy())
@@ -735,10 +740,8 @@ def func_msra_initializer(self, dtype="float32"):
         hist2, _ = output_hist(
             np.random.normal(0, np.sqrt(2.0 / (4)), [1024, 1024, 16]))
 
-        self.assertTrue(
-            np.allclose(
-                hist, hist2, rtol=0, atol=0.01),
-            "hist: " + str(hist) + " hist2: " + str(hist2))
+        self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01),
+                        "hist: " + str(hist) + " hist2: " + str(hist2))
         paddle.enable_static()
 
     def test_msra_initializer(self, dtype="float32"):
@@ -748,6 +751,7 @@ def test_msra_initializer(self, dtype="float32"):
 
 
 class TesetconsistencyOfDynamicAndStaticGraph(unittest.TestCase):
+
     def func_order(self):
         paddle.set_device('cpu')
         SEED = 123
@@ -756,21 +760,23 @@ def func_order(self):
             learning_rate=1.0,
             trainable=False,
             regularizer=None,
-            initializer=paddle.nn.initializer.TruncatedNormal(
-                mean=0.0, std=2.0))
+            initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0,
+                                                              std=2.0))
         bias_attr = paddle.framework.ParamAttr(
             name="linear_bias",
             learning_rate=1.0,
             trainable=False,
             regularizer=None,
-            initializer=paddle.nn.initializer.TruncatedNormal(
-                mean=0.0, std=2.0))
+            initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0,
+                                                              std=2.0))
 
         def run_dynamic_graph():
             paddle.disable_static()
             paddle.seed(SEED)
-            linear = paddle.nn.Linear(
-                1, 1, weight_attr=weight_attr, bias_attr=bias_attr)
+            linear = paddle.nn.Linear(1,
+                                      1,
+                                      weight_attr=weight_attr,
+                                      bias_attr=bias_attr)
             return linear.weight.numpy(), linear.bias.numpy()
             paddle.enable_static()
 
@@ -778,8 +784,10 @@ def run_static_graph():
             paddle.enable_static()
             exe = paddle.static.Executor(paddle.CPUPlace())
             paddle.seed(SEED)
-            linear = paddle.nn.Linear(
-                1, 1, weight_attr=weight_attr, bias_attr=bias_attr)
+            linear = paddle.nn.Linear(1,
+                                      1,
+                                      weight_attr=weight_attr,
+                                      bias_attr=bias_attr)
             res = exe.run(paddle.static.default_startup_program(),
                           fetch_list=['linear_weight', 'linear_bias'])
             return res[0], res[1]
@@ -820,8 +828,9 @@ def func_orthogonal(self):
 
         paddle.disable_static()
         paddle.seed(2021)
-        linear = paddle.nn.Linear(
-            self.in_features, self.out_features, weight_attr=self.weight_attr)
+        linear = paddle.nn.Linear(self.in_features,
+                                  self.out_features,
+                                  weight_attr=self.weight_attr)
         res_dygraph = linear.weight.numpy()
 
         paddle.enable_static()
@@ -829,10 +838,9 @@ def func_orthogonal(self):
         start_prog = paddle.static.Program()
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, start_prog):
-            linear = paddle.nn.Linear(
-                self.in_features,
-                self.out_features,
-                weight_attr=self.weight_attr)
+            linear = paddle.nn.Linear(self.in_features,
+                                      self.out_features,
+                                      weight_attr=self.weight_attr)
 
             block = start_prog.global_block()
             self.assertEqual(len(block.ops), self.num_ops)
@@ -924,11 +932,10 @@ def func_orthogonal(self):
 
         paddle.disable_static()
         paddle.seed(2021)
-        conv2d = paddle.nn.Conv2D(
-            self.in_features,
-            self.out_features,
-            self.kernel_size,
-            weight_attr=self.weight_attr)
+        conv2d = paddle.nn.Conv2D(self.in_features,
+                                  self.out_features,
+                                  self.kernel_size,
+                                  weight_attr=self.weight_attr)
         res_dygraph = conv2d.weight.numpy()
 
         paddle.enable_static()
@@ -936,11 +943,10 @@ def func_orthogonal(self):
         start_prog = paddle.static.Program()
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, start_prog):
-            conv2d = paddle.nn.Conv2D(
-                self.in_features,
-                self.out_features,
-                self.kernel_size,
-                weight_attr=self.weight_attr)
+            conv2d = paddle.nn.Conv2D(self.in_features,
+                                      self.out_features,
+                                      self.kernel_size,
+                                      weight_attr=self.weight_attr)
             exe = paddle.static.Executor()
             res_static = exe.run(paddle.static.default_startup_program(),
                                  fetch_list=[conv2d.weight])[0]
@@ -995,6 +1001,7 @@ def check_result(self, a, b):
 
 # initialize Conv1D weight
 class TestDiracInitializer1(unittest.TestCase):
+
     def config(self):
         self.weight_attr = paddle.ParamAttr(
             initializer=paddle.nn.initializer.Dirac())
@@ -1015,11 +1022,10 @@ def func_dirac(self):
         paddle.set_default_dtype(self.dtype)
 
         paddle.disable_static()
-        conv = self.conv_layer(
-            self.in_channels,
-            self.out_channels,
-            self.kernel_size,
-            weight_attr=self.weight_attr)
+        conv = self.conv_layer(self.in_channels,
+                               self.out_channels,
+                               self.kernel_size,
+                               weight_attr=self.weight_attr)
         weight_dygraph = conv.weight.numpy()
 
         paddle.enable_static()
@@ -1027,11 +1033,10 @@ def func_dirac(self):
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, start_prog):
             inp = paddle.rand(self.input_shape)
-            conv = self.conv_layer(
-                self.in_channels,
-                self.out_channels,
-                self.kernel_size,
-                weight_attr=self.weight_attr)
+            conv = self.conv_layer(self.in_channels,
+                                   self.out_channels,
+                                   self.kernel_size,
+                                   weight_attr=self.weight_attr)
 
             output = conv(inp)
             block = start_prog.global_block()
@@ -1061,6 +1066,7 @@ def test_dirac(self):
 
 # initialize Conv2D weight
 class TestDiracInitializer2(TestDiracInitializer1):
+
     def config(self):
         self.weight_attr = paddle.ParamAttr(
             initializer=paddle.nn.initializer.Dirac(groups=1))
@@ -1082,6 +1088,7 @@ def check_result(self, w_dygraph, w_static, conv_in, conv_out):
 
 # initialize Conv3D weight
 class TestDiracInitializer3(TestDiracInitializer1):
+
     def config(self):
         self.weight_attr = paddle.ParamAttr(
             initializer=paddle.nn.initializer.Dirac(groups=2))
@@ -1096,8 +1103,8 @@ def config(self):
     def check_result(self, w_dygraph, w_static, conv_in, conv_out):
         self.assertTrue(np.array_equal(w_dygraph, w_static))
         self.assertTrue(
-            np.array_equal(conv_out[:, 0:5, :, :, :], conv_in[:, :, 1:9, 1:9, 1:
-                                                              9]))
+            np.array_equal(conv_out[:, 0:5, :, :, :], conv_in[:, :, 1:9, 1:9,
+                                                              1:9]))
         self.assertTrue(
             np.array_equal(conv_out[:, 5:10, :, :, :], conv_in[:, :, 1:9, 1:9,
                                                                1:9]))
diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py
index 9953681e0f5bd..0f4a2e7a67c6b 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py
@@ -40,6 +40,7 @@ def check_cast_op(op):
 
 
 class TestConstantInitializer(unittest.TestCase):
+
     def static_test_constant_initializer_common(self,
                                                 init_inst,
                                                 dtype="float32",
@@ -48,12 +49,11 @@ def static_test_constant_initializer_common(self,
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=init_inst)
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=init_inst)
         num_ops = 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -91,8 +91,9 @@ def test_constant_initializer_dygraph(self, dtype="float32"):
         """Test constant initializer with supplied value in dygraph
         """
         with fluid.dygraph.guard():
-            linear = nn.Linear(
-                2, 4, weight_attr=nn.initializer.Constant(value=2.0))
+            linear = nn.Linear(2,
+                               4,
+                               weight_attr=nn.initializer.Constant(value=2.0))
             mat_target = np.ones((2, 4), dtype=dtype) * 2.0
             mat_linear = linear.weight.numpy()
             mismatch = np.sum(
@@ -116,6 +117,7 @@ def test_constant_initializer_bf16(self):
 
 
 class TestKaimingInitializer(unittest.TestCase):
+
     def static_test_kaiming_initializer_common(self,
                                                init_inst,
                                                dtype="float32",
@@ -126,12 +128,11 @@ def static_test_kaiming_initializer_common(self,
         block = program.global_block()
         shape_mat = [5, 10, 15, 20] if is_conv else [5, 10]
         for _ in range(2):
-            param = block.create_parameter(
-                dtype="float32",
-                shape=shape_mat,
-                lod_level=0,
-                name="param",
-                initializer=init_inst)
+            param = block.create_parameter(dtype="float32",
+                                           shape=shape_mat,
+                                           lod_level=0,
+                                           name="param",
+                                           initializer=init_inst)
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         if uniform:
@@ -208,6 +209,7 @@ def test_kaiming_normal_initializer_conv_static(self):
 
 
 class TestUniform(unittest.TestCase):
+
     def test_uniform_common(self, dtype="float32", seed=0):
         """Test the uniform initializer with default value
         """
@@ -217,12 +219,11 @@ def test_uniform_common(self, dtype="float32", seed=0):
         program.random_seed = seed
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.Uniform())
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.Uniform())
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -248,12 +249,11 @@ def test_uniform_initializer_default_value(self,
         program.random_seed = seed
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.Uniform())
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.Uniform())
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -279,12 +279,12 @@ def test_uniform_initializer(self,
         program.random_seed = seed
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.Uniform(min_value, max_vlaue))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.Uniform(
+                                       min_value, max_vlaue))
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -309,12 +309,12 @@ def test_uniform_initializer_two_op(self,
         program.random_seed = seed
         block = program.global_block()
         for i in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.Uniform(min_value, float(i)))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.Uniform(
+                                       min_value, float(i)))
         num_ops = 2 if dtype == "float16" else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op0 = block.ops[0]
@@ -351,8 +351,7 @@ def test_uniform_initializer_dygraph(self):
 
         weight_attr = paddle.framework.ParamAttr(
             name="linear_weight",
-            initializer=paddle.nn.initializer.Uniform(
-                low=-0.5, high=0.5))
+            initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5))
         linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr)
 
         min_value, max_value = get_uniform_min_and_max(linear.weight.numpy())
@@ -363,6 +362,7 @@ def test_uniform_initializer_dygraph(self):
 
 
 class TestNormal(unittest.TestCase):
+
     def test_normal_initializer_default_value(self):
         """Test the normal initializer with default value
         """
@@ -371,12 +371,11 @@ def test_normal_initializer_default_value(self):
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.Normal())
+            block.create_parameter(dtype="float32",
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.Normal())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -394,12 +393,11 @@ def test_normal_initializer(self, dtype="float32"):
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.Normal(2.3, 1.9))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.Normal(2.3, 1.9))
         num_ops = 2 if dtype in ["float16", "uint16"] else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -428,12 +426,12 @@ def test_normal_initializer_dygraph(self):
 
         weight_attr = paddle.framework.ParamAttr(
             name="linear_weight",
-            initializer=paddle.nn.initializer.Normal(
-                mean=0.0, std=2.0))
+            initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0))
         linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr)
 
 
 class TestTruncatedNormal(unittest.TestCase):
+
     def test_truncated_normal_initializer_default_value(self):
         """Test the truncated normal initializer with default value
         """
@@ -442,12 +440,11 @@ def test_truncated_normal_initializer_default_value(self):
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.TruncatedNormal())
+            block.create_parameter(dtype="float32",
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.TruncatedNormal())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'truncated_gaussian_random')
@@ -465,12 +462,12 @@ def test_truncated_normal_initializer(self, dtype="float32"):
         program = framework.Program()
         block = program.global_block()
         for _ in range(2):
-            block.create_parameter(
-                dtype=dtype,
-                shape=[5, 10],
-                lod_level=0,
-                name="param",
-                initializer=initializer.TruncatedNormal(2.3, 1.9))
+            block.create_parameter(dtype=dtype,
+                                   shape=[5, 10],
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.TruncatedNormal(
+                                       2.3, 1.9))
         num_ops = 2 if dtype in ["float16", "uint16"] else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
@@ -505,12 +502,13 @@ def test_truncated_normal_initializer_dygraph(self):
 
         weight_attr = paddle.framework.ParamAttr(
             name="linear_weight",
-            initializer=paddle.nn.initializer.TruncatedNormal(
-                mean=0.0, std=2.0))
+            initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0,
+                                                              std=2.0))
         linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr)
 
 
 class TestXavierUniform(unittest.TestCase):
+
     def test_xavier_uniform_initializer(self):
         """Test Xavier initializer with uniform distribution on
            for matrix multiply.
@@ -555,8 +553,8 @@ def test_xavier_uniform_initializer_conv(self):
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
         receptive_field_size = float(15 * 20)
-        limit = np.sqrt(6.0 / (
-            (param.shape[0] + param.shape[1]) * receptive_field_size))
+        limit = np.sqrt(
+            6.0 / ((param.shape[0] + param.shape[1]) * receptive_field_size))
         self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
         self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 0)
@@ -573,6 +571,7 @@ def test_xavier_uniform_initializer_dygraph(self):
 
 
 class TestXavierNormal(unittest.TestCase):
+
     def test_xavier_normal_initializer(self):
         """Test Xavier initializer with normal distribution on
            for matrix multiply.
@@ -617,8 +616,8 @@ def test_xavier_normal_initializer_conv(self):
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
         receptive_field_size = float(15 * 20)
-        std = np.sqrt(2.0 / (
-            (param.shape[0] + param.shape[1]) * receptive_field_size))
+        std = np.sqrt(
+            2.0 / ((param.shape[0] + param.shape[1]) * receptive_field_size))
         self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
         self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 0)
@@ -637,6 +636,7 @@ def test_xavier_normal_initializer_dygraph(self):
 
 
 class TestAssign(unittest.TestCase):
+
     def test_assign_initializer(self, dtype="float32"):
         """Test the numpy array initializer with supplied arguments
         """
@@ -647,12 +647,11 @@ def test_assign_initializer(self, dtype="float32"):
         block = program.global_block()
         np_array = numpy.random.random((10000)).astype(dtype)
         for _ in range(2):
-            block.create_parameter(
-                dtype=np_array.dtype,
-                shape=np_array.shape,
-                lod_level=0,
-                name="param",
-                initializer=initializer.Assign(np_array))
+            block.create_parameter(dtype=np_array.dtype,
+                                   shape=np_array.shape,
+                                   lod_level=0,
+                                   name="param",
+                                   initializer=initializer.Assign(np_array))
         num_ops = 2 if dtype in ["float16", "uint16"] else 1
         self.assertEqual(len(block.ops), num_ops)
         init_op = block.ops[0]
diff --git a/python/paddle/fluid/tests/unittests/test_inner.py b/python/paddle/fluid/tests/unittests/test_inner.py
index 2174c20c9a095..8a412d8138f79 100644
--- a/python/paddle/fluid/tests/unittests/test_inner.py
+++ b/python/paddle/fluid/tests/unittests/test_inner.py
@@ -23,21 +23,26 @@
 
 
 class TestMultiplyApi(unittest.TestCase):
+
     def _run_static_graph_case(self, x_data, y_data):
         with program_guard(Program(), Program()):
             paddle.enable_static()
-            x = paddle.static.data(
-                name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = paddle.static.data(
-                name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = paddle.static.data(name='x',
+                                   shape=x_data.shape,
+                                   dtype=x_data.dtype)
+            y = paddle.static.data(name='y',
+                                   shape=y_data.shape,
+                                   dtype=y_data.dtype)
             res = paddle.inner(x, y)
 
-            place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             outs = exe.run(paddle.static.default_main_program(),
-                           feed={'x': x_data,
-                                 'y': y_data},
+                           feed={
+                               'x': x_data,
+                               'y': y_data
+                           },
                            fetch_list=[res])
             res = outs[0]
             return res
@@ -89,20 +94,18 @@ def func_test_multiply(self):
         self.assertTrue(np.allclose(res, np.inner(x_data, y_data)))
 
         # test dynamic computation graph: 2-d array Complex
-        x_data = np.random.rand(20,
-                                50).astype(np.float64) + 1J * np.random.rand(
-                                    20, 50).astype(np.float64)
-        y_data = np.random.rand(50).astype(np.float64) + 1J * np.random.rand(
-            50).astype(np.float64)
+        x_data = np.random.rand(20, 50).astype(
+            np.float64) + 1J * np.random.rand(20, 50).astype(np.float64)
+        y_data = np.random.rand(50).astype(
+            np.float64) + 1J * np.random.rand(50).astype(np.float64)
         res = self._run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.inner(x_data, y_data)))
 
         # test dynamic computation graph: 3-d array Complex
-        x_data = np.random.rand(5, 10,
-                                10).astype(np.float64) + 1J * np.random.rand(
-                                    5, 10, 10).astype(np.float64)
-        y_data = np.random.rand(2, 10).astype(np.float64) + 1J * np.random.rand(
-            2, 10).astype(np.float64)
+        x_data = np.random.rand(5, 10, 10).astype(
+            np.float64) + 1J * np.random.rand(5, 10, 10).astype(np.float64)
+        y_data = np.random.rand(2, 10).astype(
+            np.float64) + 1J * np.random.rand(2, 10).astype(np.float64)
         res = self._run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.inner(x_data, y_data)))
 
@@ -113,6 +116,7 @@ def test_multiply(self):
 
 
 class TestMultiplyError(unittest.TestCase):
+
     def func_test_errors(self):
         # test static computation graph: dtype can not be int8
         paddle.enable_static()
@@ -121,7 +125,7 @@ def func_test_errors(self):
             y = paddle.static.data(name='y', shape=[100], dtype=np.int8)
             self.assertRaises(TypeError, paddle.inner, x, y)
 
-        # test static computation graph: inputs must be broadcastable 
+        # test static computation graph: inputs must be broadcastable
         with program_guard(Program(), Program()):
             x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64)
             y = paddle.static.data(name='y', shape=[20], dtype=np.float64)
@@ -143,7 +147,7 @@ def func_test_errors(self):
         y = paddle.to_tensor(y_data)
         self.assertRaises(ValueError, paddle.inner, x, y)
 
-        # test dynamic computation graph: dtype must be same	
+        # test dynamic computation graph: dtype must be same
         x_data = np.random.randn(200).astype(np.float32)
         y_data = np.random.randn(200).astype(np.float64)
         x = paddle.to_tensor(x_data)
diff --git a/python/paddle/fluid/tests/unittests/test_inplace.py b/python/paddle/fluid/tests/unittests/test_inplace.py
index 99873eaa98870..b81fcd90746d1 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace.py
@@ -23,6 +23,7 @@
 
 
 class TestInplace(unittest.TestCase):
+
     def func_test_forward_version(self):
         with paddle.fluid.dygraph.guard():
             var = paddle.to_tensor(np.ones((4, 2, 3)).astype(np.float32))
@@ -117,6 +118,7 @@ def test_backward_success_2(self):
 
 
 class TestDygraphInplace(unittest.TestCase):
+
     def setUp(self):
         self.init_data()
         self.set_np_compare_func()
@@ -283,6 +285,7 @@ def test_backward_success_2(self):
 
 
 class TestDygraphInplaceUnsqueeze(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return paddle.unsqueeze(var, -1)
 
@@ -291,6 +294,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceReshape(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return paddle.reshape(var, [-1])
 
@@ -299,6 +303,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceFlatten(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return var.flatten()
 
@@ -307,26 +312,28 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceScatter(TestDygraphInplace):
+
     def init_data(self):
         self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]])
         self.dtype = "float32"
 
     def non_inplace_api_processing(self, var):
         index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
-        updates = paddle.to_tensor(
-            [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
+        updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]],
+                                   dtype='float32')
 
         return paddle.scatter(var, index, updates, overwrite=False)
 
     def inplace_api_processing(self, var):
         index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
-        updates = paddle.to_tensor(
-            [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
+        updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]],
+                                   dtype='float32')
 
         return paddle.scatter_(var, index, updates, overwrite=False)
 
 
 class TestDygraphInplaceElu(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return paddle.nn.functional.elu(var)
 
@@ -335,6 +342,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceRelu(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return paddle.nn.functional.relu(var)
 
@@ -343,6 +351,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceSoftmax(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return paddle.nn.functional.softmax(var)
 
@@ -351,6 +360,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceTanh(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return paddle.tanh(var)
 
@@ -359,6 +369,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceCeil(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return var.ceil()
 
@@ -367,6 +378,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceFloor(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return var.floor()
 
@@ -375,6 +387,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceExp(TestDygraphInplace):
+
     def set_np_compare_func(self):
         self.np_compare = np.allclose
 
@@ -386,6 +399,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceReciprocal(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return var.reciprocal()
 
@@ -394,6 +408,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceRound(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return var.round()
 
@@ -402,6 +417,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceSqrt(TestDygraphInplace):
+
     def init_data(self):
         self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1])
         self.dtype = "float32"
@@ -414,6 +430,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt):
+
     def non_inplace_api_processing(self, var):
         return var.rsqrt()
 
@@ -422,6 +439,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceClip(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return var.clip(0.6, 1.5)
 
@@ -430,6 +448,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceScale(TestDygraphInplace):
+
     def non_inplace_api_processing(self, var):
         return var.scale(scale=2.0, bias=3.0)
 
@@ -438,6 +457,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceAdd(TestDygraphInplace):
+
     def init_data(self):
         self.input_var_numpy = np.random.rand(2, 3, 4)
         self.dtype = "float32"
@@ -453,6 +473,7 @@ def inplace_api_processing(self, var):
 
 
 class TestDygraphInplaceSubtract(TestDygraphInplaceAdd):
+
     def non_inplace_api_processing(self, var):
         input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
         return var.subtract(input_var_2)
@@ -463,6 +484,7 @@ def inplace_api_processing(self, var):
 
 
 class TestLossIsInplaceVar(unittest.TestCase):
+
     def func_test_loss_is_inplace_var(self):
         with paddle.fluid.dygraph.guard():
             var_a = paddle.ones((2, 2))
@@ -493,6 +515,7 @@ def test_loss_is_inplace_var(self):
 
 
 class TestContinuouslyInplace(unittest.TestCase):
+
     def func_test_continuously_inplace(self):
         a = paddle.rand([2, 3])
         a.stop_gradient = False
@@ -511,6 +534,7 @@ def test_continuously_inplace(self):
 
 
 class TestGetitemBeforeInplace(unittest.TestCase):
+
     def test_getitem_before_inplace(self):
         with _test_eager_guard():
             a = paddle.ones(shape=[4, 2, 3], dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py
index 67f6b91021472..dc0b45206d990 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py
@@ -27,6 +27,7 @@
 
 
 class TestInplaceANBOpTraining(unittest.TestCase):
+
     def setUp(self):
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.N = 4
@@ -50,12 +51,11 @@ def build_program(self,
         startup.random_seed = seed
         with fluid.unique_name.guard():
             with fluid.program_guard(main, startup):
-                data = fluid.layers.data(
-                    name='input',
-                    shape=self.dshape,
-                    dtype=self.dtype,
-                    append_batch_size=False,
-                    stop_gradient=False)
+                data = fluid.layers.data(name='input',
+                                         shape=self.dshape,
+                                         dtype=self.dtype,
+                                         append_batch_size=False,
+                                         stop_gradient=False)
                 if inplace:
                     bn = fluid.layers.inplace_abn(
                         data,
@@ -83,7 +83,7 @@ def build_program(self,
                         bn = fluid.layers.elu(bn, alpha)
 
                 # NOTE: in inplace mode input and output of bn
-                # may have same name, multiply 1. to generate 
+                # may have same name, multiply 1. to generate
                 # a new Variable for fetch
                 bn = bn * 1.
 
@@ -102,14 +102,13 @@ def compare(self, place, layout, only_forward, activation, alpha, use_cuda):
         fetch_outs = []
         fetch_names = []
         for inplace in [False, True]:
-            main, startup, outs = self.build_program(
-                place,
-                layout,
-                seed,
-                only_forward,
-                activation,
-                alpha,
-                inplace=inplace)
+            main, startup, outs = self.build_program(place,
+                                                     layout,
+                                                     seed,
+                                                     only_forward,
+                                                     activation,
+                                                     alpha,
+                                                     inplace=inplace)
             exe = fluid.Executor(place)
             exe.run(startup)
 
@@ -145,12 +144,11 @@ def compare(self, place, layout, only_forward, activation, alpha, use_cuda):
             fetch_outs.append(bn_fetches)
             fetch_names.append(fetch_name)
 
-        for bn_val, inplace_abn_val, name1, name2 in zip(*(
-                fetch_outs + fetch_names)):
+        for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs +
+                                                           fetch_names)):
             self.assertTrue(
-                np.allclose(
-                    bn_val, inplace_abn_val, atol=1e-2),
-                "Output (" + name1 + ":" + name2 +
+                np.allclose(bn_val, inplace_abn_val,
+                            atol=1e-2), "Output (" + name1 + ":" + name2 +
                 ") has diff on {} with {} layout and {} activation. \n".format(
                     place, layout, activation) + "\nBN     " + str(bn_val) +
                 "\n" + "Inplace ABN " + str(inplace_abn_val))
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
index b9089448d53f1..39e493b1b344d 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
@@ -24,6 +24,7 @@
 
 
 class ConvBNLayer(fluid.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -33,18 +34,17 @@ def __init__(self,
                  data_format="NCHW"):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = paddle.nn.Conv2D(
-            in_channels=num_channels,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            bias_attr=False,
-            data_format=data_format)
+        self._conv = paddle.nn.Conv2D(in_channels=num_channels,
+                                      out_channels=num_filters,
+                                      kernel_size=filter_size,
+                                      stride=stride,
+                                      padding=(filter_size - 1) // 2,
+                                      groups=groups,
+                                      bias_attr=False,
+                                      data_format=data_format)
 
-        self._batch_norm = paddle.nn.BatchNorm(
-            num_filters, data_layout=data_format)
+        self._batch_norm = paddle.nn.BatchNorm(num_filters,
+                                               data_layout=data_format)
 
     def forward(self, inputs):
         y = self._conv(inputs)
@@ -61,11 +61,10 @@ def create_program(data_format="NCHW"):
         if data_format == "NHWC":
             x = paddle.transpose(x, [0, 2, 3, 1])
         x = fluid.layers.prelu(x, mode="channel")
-        conv = ConvBNLayer(
-            num_channels=3,
-            num_filters=3,
-            filter_size=1,
-            data_format=data_format)
+        conv = ConvBNLayer(num_channels=3,
+                           num_filters=3,
+                           filter_size=1,
+                           data_format=data_format)
         y = conv(x) + x
 
         loss = fluid.layers.reduce_sum(y)
@@ -77,7 +76,9 @@ def create_program(data_format="NCHW"):
 
 
 class TestInplaceAddto(unittest.TestCase):
+
     def check_result(self, data_format="NCHW"):
+
         def run_program(enable_addto):
             np.random.seed(10)
             paddle.seed(10)
@@ -86,8 +87,8 @@ def run_program(enable_addto):
                 fluid.set_flags({"FLAGS_cudnn_deterministic": True})
             fluid.set_flags({"FLAGS_max_inplace_grad_add": 2})
             loss, main, startup, w = create_program(data_format=data_format)
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             strategy = fluid.BuildStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py b/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py
index b685900eadfa3..7ec04ed90b0ae 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_and_clear_gradient.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 def clear_grad(w, a):
+
     @paddle.no_grad()
     def warp(*_):
         assert w.grad is not None
@@ -32,6 +33,7 @@ def warp(*_):
 
 
 class TestInplaceAndClearGradient(unittest.TestCase):
+
     def test(self):
         paddle.set_device('cpu')
 
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py b/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py
index abc8849b614f7..581ce0d5d0259 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py
@@ -25,6 +25,7 @@
 
 # In static mode, inplace strategy will not be used in Inplace APIs.
 class TestStaticAutoGeneratedAPI(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.init_data()
@@ -61,11 +62,13 @@ def test_api(self):
 
 
 class TestStaticInplaceAutoGeneratedAPI(TestStaticAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.ceil_()
 
 
 class TestStaticFloorAPI(TestStaticAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.floor()
 
@@ -74,11 +77,13 @@ def executed_numpy_api(self, x):
 
 
 class TestStaticInplaceFloorAPI(TestStaticFloorAPI):
+
     def executed_paddle_api(self, x):
         return x.floor_()
 
 
 class TestStaticExpAPI(TestStaticAutoGeneratedAPI):
+
     def set_np_compare_func(self):
         self.np_compare = np.allclose
 
@@ -90,11 +95,13 @@ def executed_numpy_api(self, x):
 
 
 class TestStaticInplaceExpAPI(TestStaticExpAPI):
+
     def executed_paddle_api(self, x):
         return x.exp_()
 
 
 class TestStaticReciprocalAPI(TestStaticAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.reciprocal()
 
@@ -103,11 +110,13 @@ def executed_numpy_api(self, x):
 
 
 class TestStaticInplaceReciprocalAPI(TestStaticReciprocalAPI):
+
     def executed_paddle_api(self, x):
         return x.reciprocal_()
 
 
 class TestStaticRoundAPI(TestStaticAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.round()
 
@@ -116,11 +125,13 @@ def executed_numpy_api(self, x):
 
 
 class TestStaticInplaceRoundAPI(TestStaticRoundAPI):
+
     def executed_paddle_api(self, x):
         return x.round_()
 
 
 class TestStaticSqrtAPI(TestStaticAutoGeneratedAPI):
+
     def init_data(self):
         self.dtype = 'float32'
         self.shape = [10, 20]
@@ -137,11 +148,13 @@ def executed_numpy_api(self, x):
 
 
 class TestStaticInplaceSqrtAPI(TestStaticSqrtAPI):
+
     def executed_paddle_api(self, x):
         return x.sqrt_()
 
 
 class TestStaticRsqrtAPI(TestStaticSqrtAPI):
+
     def executed_paddle_api(self, x):
         return x.rsqrt()
 
@@ -150,12 +163,14 @@ def executed_numpy_api(self, x):
 
 
 class TestStaticInplaceRsqrtAPI(TestStaticRsqrtAPI):
+
     def executed_paddle_api(self, x):
         return x.rsqrt_()
 
 
 # In dygraph mode, inplace strategy will be used in Inplace APIs.
 class TestDygraphAutoGeneratedAPI(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         self.init_data()
@@ -184,11 +199,13 @@ def test_api(self):
 
 
 class TestDygraphInplaceAutoGeneratedAPI(TestDygraphAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.ceil_()
 
 
 class TestDygraphFloorAPI(TestDygraphAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.floor()
 
@@ -197,11 +214,13 @@ def executed_numpy_api(self, x):
 
 
 class TestDygraphInplaceFloorAPI(TestDygraphFloorAPI):
+
     def executed_paddle_api(self, x):
         return x.floor_()
 
 
 class TestDygraphExpAPI(TestDygraphAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.exp()
 
@@ -213,11 +232,13 @@ def set_np_compare_func(self):
 
 
 class TestDygraphInplaceExpAPI(TestDygraphExpAPI):
+
     def executed_paddle_api(self, x):
         return x.exp_()
 
 
 class TestDygraphReciprocalAPI(TestDygraphAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.reciprocal()
 
@@ -226,11 +247,13 @@ def executed_numpy_api(self, x):
 
 
 class TestDygraphInplaceReciprocalAPI(TestDygraphReciprocalAPI):
+
     def executed_paddle_api(self, x):
         return x.reciprocal_()
 
 
 class TestDygraphRoundAPI(TestDygraphAutoGeneratedAPI):
+
     def executed_paddle_api(self, x):
         return x.round()
 
@@ -239,11 +262,13 @@ def executed_numpy_api(self, x):
 
 
 class TestDygraphInplaceRoundAPI(TestDygraphRoundAPI):
+
     def executed_paddle_api(self, x):
         return x.round_()
 
 
 class TestDygraphSqrtAPI(TestDygraphAutoGeneratedAPI):
+
     def init_data(self):
         self.dtype = 'float32'
         self.shape = [10, 20]
@@ -260,11 +285,13 @@ def executed_numpy_api(self, x):
 
 
 class TestDygraphInplaceSqrtAPI(TestDygraphSqrtAPI):
+
     def executed_paddle_api(self, x):
         return x.sqrt_()
 
 
 class TestDygraphRsqrtAPI(TestDygraphSqrtAPI):
+
     def executed_paddle_api(self, x):
         return x.rsqrt()
 
@@ -273,6 +300,7 @@ def executed_numpy_api(self, x):
 
 
 class TestDygraphInplaceRsqrtAPI(TestDygraphRsqrtAPI):
+
     def executed_paddle_api(self, x):
         return x.rsqrt_()
 
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py
index 90666d4ebb6e6..643ff14b87882 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py
@@ -20,10 +20,11 @@
 
 
 class TestSoftmaxWithXe(unittest.TestCase):
+
     def setUp(self):
         self.initParameter()
-        self.m, self.n = np.random.random_integers(
-            low=100, high=2000, size=[2]).astype('int64')
+        self.m, self.n = np.random.random_integers(low=100, high=2000,
+                                                   size=[2]).astype('int64')
 
     def initParameter(self):
         self.dtype = 'float32'
@@ -38,11 +39,10 @@ def softmax_with_xe(self,
         m, n = x.shape
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             with fluid.scope_guard(fluid.Scope()):
-                x_d = fluid.layers.data(
-                    name='x',
-                    shape=[m, n],
-                    dtype=self.dtype,
-                    append_batch_size=False)
+                x_d = fluid.layers.data(name='x',
+                                        shape=[m, n],
+                                        dtype=self.dtype,
+                                        append_batch_size=False)
                 y_d = fluid.layers.data(
                     name='y',
                     shape=[m, 1] if not self.soft_label else [m, n],
@@ -61,17 +61,19 @@ def softmax_with_xe(self,
 
                 build_strategy = fluid.BuildStrategy()
                 build_strategy.enable_inplace = inplace
-                prog = fluid.CompiledProgram(fluid.default_main_program(
-                )).with_data_parallel(
-                    build_strategy=build_strategy, places=place)
+                prog = fluid.CompiledProgram(
+                    fluid.default_main_program()).with_data_parallel(
+                        build_strategy=build_strategy, places=place)
 
                 fetch_list = [z_d.name, s_d.name]
 
                 print('Inplace is {}'.format("ON" if inplace else "OFF"))
 
                 z, s = exe.run(prog,
-                               feed={x_d.name: x,
-                                     y_d.name: y},
+                               feed={
+                                   x_d.name: x,
+                                   y_d.name: y
+                               },
                                fetch_list=fetch_list)
                 return z, s
 
@@ -82,27 +84,39 @@ def main_with_place(self, place):
         for a, b in x_range:
             x = ((b - a) * x + a).astype(self.dtype)
             if not self.soft_label:
-                y = np.random.random_integers(
-                    size=[self.m, 1], low=0, high=self.n - 1).astype('int64')
+                y = np.random.random_integers(size=[self.m, 1],
+                                              low=0,
+                                              high=self.n - 1).astype('int64')
             else:
                 y = np.random.random(size=[self.m, self.n]).astype(self.dtype)
-                norm_y = np.broadcast_to(
-                    np.reshape(
-                        np.sum(y, axis=1), [-1, 1]), y.shape)
+                norm_y = np.broadcast_to(np.reshape(np.sum(y, axis=1), [-1, 1]),
+                                         y.shape)
                 y = y / norm_y
 
-            z1, s1 = self.softmax_with_xe(
-                x, y, place, inplace=False, numeric_stable_mode=False)
-            z2, s2 = self.softmax_with_xe(
-                x, y, place, inplace=True, numeric_stable_mode=False)
+            z1, s1 = self.softmax_with_xe(x,
+                                          y,
+                                          place,
+                                          inplace=False,
+                                          numeric_stable_mode=False)
+            z2, s2 = self.softmax_with_xe(x,
+                                          y,
+                                          place,
+                                          inplace=True,
+                                          numeric_stable_mode=False)
 
             self.assertTrue((z1 == z2).all())
             self.assertTrue((s1 == s2).all())
 
-            z1, s1 = self.softmax_with_xe(
-                x, y, place, inplace=False, numeric_stable_mode=True)
-            z2, s2 = self.softmax_with_xe(
-                x, y, place, inplace=True, numeric_stable_mode=True)
+            z1, s1 = self.softmax_with_xe(x,
+                                          y,
+                                          place,
+                                          inplace=False,
+                                          numeric_stable_mode=True)
+            z2, s2 = self.softmax_with_xe(x,
+                                          y,
+                                          place,
+                                          inplace=True,
+                                          numeric_stable_mode=True)
             self.assertTrue((z1 == z2).all())
             self.assertTrue((s1 == s2).all())
 
@@ -113,18 +127,21 @@ def test_main(self):
 
 
 class TestSoftmaxWithXe1(TestSoftmaxWithXe):
+
     def initParameter(self):
         self.dtype = 'float32'
         self.soft_label = True
 
 
 class TestSoftmaxWithXe2(TestSoftmaxWithXe):
+
     def initParameter(self):
         self.dtype = 'float64'
         self.soft_label = False
 
 
 class TestSoftmaxWithXe3(TestSoftmaxWithXe):
+
     def initParameter(self):
         self.dtype = 'float64'
         self.soft_label = True
diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py
index 4e0aa4a9bcad7..f8f04229a4de8 100644
--- a/python/paddle/fluid/tests/unittests/test_input_spec.py
+++ b/python/paddle/fluid/tests/unittests/test_input_spec.py
@@ -22,6 +22,7 @@
 
 
 class TestInputSpec(unittest.TestCase):
+
     def test_default(self):
         tensor_spec = InputSpec([3, 4])
         self.assertEqual(tensor_spec.dtype,
@@ -112,6 +113,7 @@ def test_eq_and_hash(self):
 
 
 class NetWithNonTensorSpec(paddle.nn.Layer):
+
     def __init__(self, in_num, out_num):
         super(NetWithNonTensorSpec, self).__init__()
         self.linear_1 = paddle.nn.Linear(in_num, out_num)
@@ -152,6 +154,7 @@ def another_func(self, x, config=None):
 
 
 class TestNetWithNonTensorSpec(unittest.TestCase):
+
     def setUp(self):
         self.in_num = 16
         self.out_num = 16
@@ -233,6 +236,7 @@ def test_spec_compatible(self):
 
 
 class NetWithNonTensorSpecPrune(paddle.nn.Layer):
+
     def __init__(self, in_num, out_num):
         super(NetWithNonTensorSpecPrune, self).__init__()
         self.linear_1 = paddle.nn.Linear(in_num, out_num)
@@ -252,6 +256,7 @@ def forward(self, x, y, use_bn=False):
 
 
 class TestNetWithNonTensorSpecWithPrune(unittest.TestCase):
+
     def setUp(self):
         self.in_num = 16
         self.out_num = 16
@@ -298,6 +303,7 @@ def test_non_tensor_with_prune(self):
 
 
 class UnHashableObject:
+
     def __init__(self, val):
         self.val = val
 
@@ -306,6 +312,7 @@ def __hash__(self):
 
 
 class TestCompatibleNonTensorSpec(unittest.TestCase):
+
     def test_case(self):
         self.assertTrue(_compatible_non_tensor_spec([1, 2, 3], [1, 2, 3]))
         self.assertFalse(_compatible_non_tensor_spec([1, 2, 3], [1, 2]))
@@ -313,8 +320,8 @@ def test_case(self):
 
         # not supported unhashable object.
         self.assertTrue(
-            _compatible_non_tensor_spec(
-                UnHashableObject(1), UnHashableObject(1)))
+            _compatible_non_tensor_spec(UnHashableObject(1),
+                                        UnHashableObject(1)))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_install_check.py b/python/paddle/fluid/tests/unittests/test_install_check.py
index 15f2b5f3b7eed..e51079278db18 100644
--- a/python/paddle/fluid/tests/unittests/test_install_check.py
+++ b/python/paddle/fluid/tests/unittests/test_install_check.py
@@ -18,6 +18,7 @@
 
 
 class TestInstallCheck(unittest.TestCase):
+
     def test_paddle_fluid(self):
         paddle.fluid.install_check.run_check()
 
diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
index aa184dd42e6fc..f932df9dd33cd 100644
--- a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
@@ -22,6 +22,7 @@
 from op_test import OpTest
 from paddle.fluid import Program, program_guard
 from paddle.fluid.dygraph import to_variable
+from paddle.fluid.framework import _test_eager_guard
 
 
 def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var):
@@ -64,9 +65,10 @@ def _reference_instance_norm_grad(x, d_y, scale, mean, var, epsilon):
     scale_tile = np.reshape(scale, (1, c, 1, 1))
     scale_tile = np.tile(scale_tile, (n, 1, h, w))
 
-    d_x = scale_tile * var_inv * (d_y - np.mean(
-        d_y, axis=(2, 3), keepdims=True) - (x - mean_tile) * var_inv * np.mean(
-            d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True))
+    d_x = scale_tile * var_inv * (
+        d_y - np.mean(d_y, axis=(2, 3), keepdims=True) -
+        (x - mean_tile) * var_inv *
+        np.mean(d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True))
     return d_x, d_scale, d_bias
 
 
@@ -77,6 +79,7 @@ def _cal_mean_variance(x, epsilon, mean_shape):
 
 
 class TestInstanceNormOpTraining(unittest.TestCase):
+
     def setUp(self):
         self.epsilon = 1e-5
         self.init_test_case()
@@ -97,6 +100,7 @@ def set_global_mean_var(self, mean_shape, x):
         return mean, variance
 
     def test_forward_backward(self):
+
         def test_with_place(place, shape):
             epsilon = self.epsilon
             n, c, h, w = shape[0], shape[1], shape[2], shape[3]
@@ -133,23 +137,26 @@ def test_with_place(place, shape):
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
-                in_op = block.append_op(
-                    type="instance_norm",
-                    inputs={
-                        "X": block.var("x"),
-                        "Scale": block.var("scale"),
-                        "Bias": block.var("bias"),
-                    },
-                    outputs={
-                        "Y": block.var("y"),
-                        "SavedMean": block.var("saved_mean"),
-                        "SavedVariance": block.var("saved_variance")
-                    },
-                    attrs={"epsilon": epsilon, })
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
+                in_op = block.append_op(type="instance_norm",
+                                        inputs={
+                                            "X": block.var("x"),
+                                            "Scale": block.var("scale"),
+                                            "Bias": block.var("bias"),
+                                        },
+                                        outputs={
+                                            "Y":
+                                            block.var("y"),
+                                            "SavedMean":
+                                            block.var("saved_mean"),
+                                            "SavedVariance":
+                                            block.var("saved_variance")
+                                        },
+                                        attrs={
+                                            "epsilon": epsilon,
+                                        })
 
                 block.create_var(name="y@GRAD", dtype='float32', shape=y.shape)
 
@@ -190,6 +197,7 @@ def test_with_place(place, shape):
 
 
 class TestInstanceNormOpTrainingCase1(TestInstanceNormOpTraining):
+
     def init_test_case(self):
         self.shape = [2, 3, 4, 5]
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -197,6 +205,7 @@ def init_test_case(self):
 
 
 class TestInstanceNormOpTrainingCase2(TestInstanceNormOpTraining):
+
     def init_test_case(self):
         self.shape = [20, 50, 4, 5]
         self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
@@ -204,11 +213,12 @@ def init_test_case(self):
 
 
 class TestInstanceNormOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of instance_norm must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.instance_norm, x1)
 
             # the input dtype of instance_norm must be float32 or float64
@@ -217,15 +227,19 @@ def test_errors(self):
 
 
 class TestInstanceNormOpErrorCase1(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            # the first dimension of input for instance_norm must between [2d, 5d] 
-            x = fluid.layers.data(
-                name='x', shape=[3], dtype="float32", append_batch_size=False)
+            # the first dimension of input for instance_norm must between [2d, 5d]
+            x = fluid.layers.data(name='x',
+                                  shape=[3],
+                                  dtype="float32",
+                                  append_batch_size=False)
             self.assertRaises(ValueError, paddle.static.nn.instance_norm, x)
 
 
 class TestElasticNormOp(unittest.TestCase):
+
     def init_test_case(self):
         self.epsilon = 1e-5
         self.places = [core.CPUPlace()]
@@ -243,18 +257,25 @@ def test_norm(self):
         scale = np.ones(scale_shape).astype(np.float32)
         bias = np.zeros(scale_shape).astype(np.float32)
         mean, variance = _cal_mean_variance(inputs, self.epsilon, mean_shape)
-        out_np, _, _ = _reference_instance_norm_naive(
-            inputs, scale, bias, self.epsilon, mean, variance)
+        out_np, _, _ = _reference_instance_norm_naive(inputs, scale, bias,
+                                                      self.epsilon, mean,
+                                                      variance)
 
         for place in self.places:
             with fluid.dygraph.guard(place):
-                instance_norm = fluid.dygraph.InstanceNorm(
-                    5, param_attr=False, bias_attr=False)
+                instance_norm = fluid.dygraph.InstanceNorm(5,
+                                                           param_attr=False,
+                                                           bias_attr=False)
                 outputs = instance_norm(to_variable(inputs))
                 self.assertTrue(np.allclose(outputs.numpy(), out_np, atol=1e-6))
 
+    def test_eager_api(self):
+        with _test_eager_guard():
+            self.test_norm()
+
 
 class TestElasticNormOpCase2(unittest.TestCase):
+
     def init_test_case(self):
         self.epsilon = 1e-5
         self.places = [core.CPUPlace()]
@@ -272,16 +293,22 @@ def test_norm(self):
         scale = np.ones(scale_shape).astype(np.float32)
         bias = np.zeros(scale_shape).astype(np.float32)
         mean, variance = _cal_mean_variance(inputs, self.epsilon, mean_shape)
-        out_np, _, _ = _reference_instance_norm_naive(
-            inputs, scale, bias, self.epsilon, mean, variance)
+        out_np, _, _ = _reference_instance_norm_naive(inputs, scale, bias,
+                                                      self.epsilon, mean,
+                                                      variance)
 
         for place in self.places:
             with fluid.dygraph.guard(place):
-                instance_norm = fluid.dygraph.InstanceNorm(
-                    3, param_attr=True, bias_attr=True)
+                instance_norm = fluid.dygraph.InstanceNorm(3,
+                                                           param_attr=True,
+                                                           bias_attr=True)
                 outputs = instance_norm(to_variable(inputs))
                 self.assertTrue(np.allclose(outputs.numpy(), out_np, atol=1e-6))
 
+    def test_eager_api(self):
+        with _test_eager_guard():
+            self.test_norm()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
index 102e08e36a9e5..62677ed26212d 100644
--- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py
@@ -22,10 +22,12 @@
 from paddle.fluid.framework import grad_var_name
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
+from paddle.fluid.framework import _test_eager_guard
 import paddle
 
 
 class TestInstanceNorm(unittest.TestCase):
+
     def test_error(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(
@@ -50,8 +52,9 @@ def error3d():
 
             def weight_bias_false():
                 x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
-                instance_norm3d = paddle.nn.InstanceNorm3D(
-                    1, weight_attr=False, bias_attr=False)
+                instance_norm3d = paddle.nn.InstanceNorm3D(1,
+                                                           weight_attr=False,
+                                                           bias_attr=False)
 
             with fluid.dygraph.guard(p):
                 weight_bias_false()
@@ -116,6 +119,11 @@ def compute_v2(x_np):
             y2 = compute_v2(x)
             self.assertTrue(np.allclose(y1, y2))
 
+    def test_eager_api(self):
+        with _test_eager_guard():
+            self.test_dygraph()
+            self.test_error()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inverse_op.py b/python/paddle/fluid/tests/unittests/test_inverse_op.py
index 85c4c6000a684..b868fef15ec32 100644
--- a/python/paddle/fluid/tests/unittests/test_inverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_inverse_op.py
@@ -21,6 +21,7 @@
 
 
 class TestInverseOp(OpTest):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -44,12 +45,14 @@ def test_grad(self):
 
 
 class TestInverseOpBatched(TestInverseOp):
+
     def config(self):
         self.matrix_shape = [8, 4, 4]
         self.dtype = "float64"
 
 
 class TestInverseOpLarge(TestInverseOp):
+
     def config(self):
         self.matrix_shape = [32, 32]
         self.dtype = "float64"
@@ -59,6 +62,7 @@ def test_grad(self):
 
 
 class TestInverseOpFP32(TestInverseOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float32"
@@ -68,18 +72,21 @@ def test_grad(self):
 
 
 class TestInverseOpBatchedFP32(TestInverseOpFP32):
+
     def config(self):
         self.matrix_shape = [8, 4, 4]
         self.dtype = "float32"
 
 
 class TestInverseOpLargeFP32(TestInverseOpFP32):
+
     def config(self):
         self.matrix_shape = [32, 32]
         self.dtype = "float32"
 
 
 class TestInverseAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -114,6 +121,7 @@ def test_dygraph(self):
 
 
 class TestInverseAPIError(unittest.TestCase):
+
     def test_errors(self):
         input_np = np.random.random([4, 4]).astype("float64")
 
@@ -136,6 +144,7 @@ def test_errors(self):
 
 
 class TestInverseSingularAPI(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_io_save_load.py b/python/paddle/fluid/tests/unittests/test_io_save_load.py
index a9a223f8f99a0..0d5573ae7021a 100644
--- a/python/paddle/fluid/tests/unittests/test_io_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_io_save_load.py
@@ -22,6 +22,7 @@
 
 
 class TestSaveLoadAPIError(unittest.TestCase):
+
     def func_test_get_valid_program_error(self):
         # case 1: CompiledProgram no program
         graph = core.Graph(core.ProgramDesc())
@@ -43,16 +44,16 @@ def func_test_load_vars_error(self):
         exe = fluid.Executor(place)
         # case 1: main_program type error when vars None
         with self.assertRaises(TypeError):
-            fluid.io.load_vars(
-                executor=exe, dirname="./fake_dir", main_program="program")
+            fluid.io.load_vars(executor=exe,
+                               dirname="./fake_dir",
+                               main_program="program")
 
         # case 2: main_program type error when vars not None
         with self.assertRaises(TypeError):
-            fluid.io.load_vars(
-                executor=exe,
-                dirname="./fake_dir",
-                main_program="program",
-                vars="vars")
+            fluid.io.load_vars(executor=exe,
+                               dirname="./fake_dir",
+                               main_program="program",
+                               vars="vars")
 
     def test_load_vars_error(self):
         with _test_eager_guard():
@@ -61,6 +62,7 @@ def test_load_vars_error(self):
 
 
 class TestSaveInferenceModelAPIError(unittest.TestCase):
+
     def func_test_useless_feeded_var_names(self):
         start_prog = fluid.Program()
         main_prog = fluid.Program()
@@ -73,12 +75,11 @@ def func_test_useless_feeded_var_names(self):
         exe.run(start_prog)
         with self.assertRaisesRegexp(
                 ValueError, "not involved in the target_vars calculation"):
-            fluid.io.save_inference_model(
-                dirname='./model',
-                feeded_var_names=['x', 'y'],
-                target_vars=[z],
-                executor=exe,
-                main_program=main_prog)
+            fluid.io.save_inference_model(dirname='./model',
+                                          feeded_var_names=['x', 'y'],
+                                          target_vars=[z],
+                                          executor=exe,
+                                          main_program=main_prog)
 
     def test_useless_feeded_var_names(self):
         with _test_eager_guard():
@@ -87,6 +88,7 @@ def test_useless_feeded_var_names(self):
 
 
 class TestWhenTrainWithNoGrad(unittest.TestCase):
+
     def func_test_when_train_with_no_grad(self):
         paddle.disable_static()
         net = paddle.nn.Linear(1024, 1)
diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
index 6d7ffecc38ff1..672498ab56e81 100644
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
@@ -23,6 +23,7 @@
 
 
 class TestIOUSimilarityOp(OpTest):
+
     def test_check_output(self):
         self.check_output(check_dygraph=False)
 
@@ -32,7 +33,7 @@ def setUp(self):
         self.boxes2 = random.rand(3, 4).astype('float32')
         self.output = random.rand(2, 3).astype('float32')
         self.box_normalized = False
-        # run python iou computation 
+        # run python iou computation
         self._compute_iou()
         self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
         self.attrs = {"box_normalized": self.box_normalized}
@@ -68,6 +69,7 @@ def _compute_iou(self, ):
 
 
 class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
+
     def test_check_output(self):
         self.check_output(check_dygraph=False)
 
@@ -76,7 +78,7 @@ def setUp(self):
         self.boxes1_lod = [[1, 1]]
         self.output_lod = [[1, 1]]
         self.box_normalized = False
-        # run python iou computation 
+        # run python iou computation
         self._compute_iou()
         self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
         self.attrs = {"box_normalized": self.box_normalized}
@@ -84,6 +86,7 @@ def setUp(self):
 
 
 class TestIOUSimilarityOpWithBoxNormalized(TestIOUSimilarityOp):
+
     def test_check_output(self):
         self.check_output(check_dygraph=False)
 
@@ -92,7 +95,7 @@ def setUp(self):
         self.boxes1_lod = [[1, 1]]
         self.output_lod = [[1, 1]]
         self.box_normalized = True
-        # run python iou computation 
+        # run python iou computation
         self._compute_iou()
         self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
         self.attrs = {"box_normalized": self.box_normalized}
diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
index e2094c76b7d1b..f45ada0a52980 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
@@ -32,8 +32,8 @@ def fc_with_batchnorm(use_feed):
             hidden,
             size=200,
             act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
 
         hidden = fluid.layers.batch_norm(input=hidden)
     prediction = fluid.layers.fc(hidden, size=10, act='softmax')
@@ -43,6 +43,7 @@ def fc_with_batchnorm(use_feed):
 
 
 class TestIrInplace(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -56,8 +57,10 @@ def _fc_with_batchnorm(self, ir_memory_optimize, enable_inplace):
         label = np.ones(shape=[32, 1], dtype='int64')
         self.check_network_convergence(
             fc_with_batchnorm,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=DeviceType.CUDA,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
index dba92a68cd671..cd34e9070213a 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
@@ -33,6 +33,7 @@
 
 
 class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
+
     def check_network_convergence(self,
                                   use_cuda=True,
                                   use_mem_opt=False,
@@ -68,8 +69,8 @@ def check_network_convergence(self,
 
             optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
             optimizer.minimize(avg_loss, startup_prog)
-            train_reader = paddle.batch(
-                paddle.dataset.mnist.train(), batch_size=200)
+            train_reader = paddle.batch(paddle.dataset.mnist.train(),
+                                        batch_size=200)
 
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             exe = Executor(place)
@@ -98,8 +99,10 @@ def check_network_convergence(self,
                     y_data = y_data.reshape((y_data.shape[0], 1))
 
                     outs = exe.run(train_cp,
-                                   feed={'x': x_data,
-                                         'y': y_data},
+                                   feed={
+                                       'x': x_data,
+                                       'y': y_data
+                                   },
                                    fetch_list=[avg_loss])
 
                     loop += 1
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
index 30b6d6106cdc4..360457000befd 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py
@@ -35,8 +35,9 @@ def lstm_net(data,
         param_attr=fluid.ParamAttr(learning_rate=emb_lr))
     fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
 
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
+    lstm_h, c = fluid.layers.dynamic_lstm(input=fc0,
+                                          size=hid_dim * 4,
+                                          is_reverse=False)
     lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
     lstm_max_tanh = fluid.layers.tanh(lstm_max)
     fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
@@ -47,6 +48,7 @@ def lstm_net(data,
 
 
 class TestIrMemOptRNN(TestIrMemOptBase):
+
     def setUp(self):
         self.network = lstm_net
 
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
index f4ec63a8b916e..24ac463011109 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
@@ -54,6 +54,7 @@ def fc_with_inplace_net(use_feed):
 
 
 class TestMNIST(TestParallelExecutorBase):
+
     def _dummy_data(self):
         np.random.seed(5)
         img = np.random.random(size=[32, 784]).astype(np.float32)
@@ -67,14 +68,18 @@ def _compare_ir_memory_optimize(self, model, use_device):
         img, label = self._dummy_data()
         first_loss0, last_loss0 = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_ir_memory_optimize=False)
         first_loss1, last_loss1 = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_ir_memory_optimize=True)
         for loss in zip(first_loss0, first_loss1):
diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
index aa495c7533ce0..38d419530d70d 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
@@ -30,6 +30,7 @@
 # NOTE(dzhwinter): test diferent strategy colisions.
 # open the eager delete tensor strategy by default.
 class TestTransformerWithIR(TestParallelExecutorBase):
+
     def test_main(self):
         if core.is_compiled_with_cuda():
             # check python transpiler
diff --git a/python/paddle/fluid/tests/unittests/test_ir_pass_pipeline.py b/python/paddle/fluid/tests/unittests/test_ir_pass_pipeline.py
index 7d11c03a1f177..1b445f8f9873f 100644
--- a/python/paddle/fluid/tests/unittests/test_ir_pass_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_pass_pipeline.py
@@ -17,6 +17,7 @@
 
 
 class TestPipelineWithIRPass(test_pipeline.TestPipeline):
+
     def need_envs(self):
         return {'FLAGS_apply_pass_to_program': '1'}
 
diff --git a/python/paddle/fluid/tests/unittests/test_is_complex.py b/python/paddle/fluid/tests/unittests/test_is_complex.py
index 988c55ea61afd..a441bd8629670 100644
--- a/python/paddle/fluid/tests/unittests/test_is_complex.py
+++ b/python/paddle/fluid/tests/unittests/test_is_complex.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,6 +18,7 @@
 
 
 class TestIsComplex(unittest.TestCase):
+
     def test_for_integer(self):
         x = paddle.arange(10)
         self.assertFalse(paddle.is_complex(x))
diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
index 520e55e9f98d5..b017341b6c1c3 100644
--- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
@@ -21,6 +21,7 @@
 
 
 class TestEmpty(OpTest):
+
     def setUp(self):
         self.op_type = "is_empty"
         self.inputs = {'X': np.array([1, 2, 3])}
@@ -31,6 +32,7 @@ def test_check_output(self):
 
 
 class TestNotEmpty(TestEmpty):
+
     def setUp(self):
         self.op_type = "is_empty"
         self.inputs = {'X': np.array([])}
@@ -38,6 +40,7 @@ def setUp(self):
 
 
 class TestIsEmptyOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
@@ -52,22 +55,25 @@ def test_Variable():
 
             def test_type():
                 # dtype must be float32, float64, int32, int64
-                x3 = paddle.static.data(
-                    name="x3", shape=[4, 32, 32], dtype="bool")
+                x3 = paddle.static.data(name="x3",
+                                        shape=[4, 32, 32],
+                                        dtype="bool")
                 res = paddle.is_empty(x=x3)
 
             self.assertRaises(TypeError, test_type)
 
             def test_name_type():
                 # name type must be string.
-                x4 = paddle.static.data(
-                    name="x4", shape=[3, 2], dtype="float32")
+                x4 = paddle.static.data(name="x4",
+                                        shape=[3, 2],
+                                        dtype="float32")
                 res = paddle.is_empty(x=x4, name=1)
 
             self.assertRaises(TypeError, test_name_type)
 
 
 class TestIsEmptyOpDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
         input = paddle.rand(shape=[4, 32, 32], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_is_integer.py b/python/paddle/fluid/tests/unittests/test_is_integer.py
index 1c33065e10b05..a933e9fac6678 100644
--- a/python/paddle/fluid/tests/unittests/test_is_integer.py
+++ b/python/paddle/fluid/tests/unittests/test_is_integer.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,6 +18,7 @@
 
 
 class TestIsInteger(unittest.TestCase):
+
     def test_for_integer(self):
         x = paddle.arange(10)
         self.assertTrue(paddle.is_integer(x))
diff --git a/python/paddle/fluid/tests/unittests/test_is_tensor.py b/python/paddle/fluid/tests/unittests/test_is_tensor.py
index 616aaa019ba33..59ac179bdaf5c 100644
--- a/python/paddle/fluid/tests/unittests/test_is_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_is_tensor.py
@@ -21,6 +21,7 @@
 
 
 class TestIsTensorApi(unittest.TestCase):
+
     def test_is_tensor_real(self, dtype="float32"):
         """Test is_tensor api with a real tensor
         """
diff --git a/python/paddle/fluid/tests/unittests/test_isclose_op.py b/python/paddle/fluid/tests/unittests/test_isclose_op.py
index 245520e5ab666..04b7fbe54e7ee 100644
--- a/python/paddle/fluid/tests/unittests/test_isclose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_isclose_op.py
@@ -19,6 +19,7 @@
 
 
 class TestIscloseOp(OpTest):
+
     def set_args(self):
         self.input = np.array([10000., 1e-07]).astype("float32")
         self.other = np.array([10000.1, 1e-08]).astype("float32")
@@ -39,13 +40,13 @@ def setUp(self):
         }
         self.attrs = {'equal_nan': self.equal_nan}
         self.outputs = {
-            'Out': np.array([
-                np.isclose(
-                    self.inputs['Input'],
-                    self.inputs['Other'],
-                    rtol=self.rtol,
-                    atol=self.atol,
-                    equal_nan=self.equal_nan)
+            'Out':
+            np.array([
+                np.isclose(self.inputs['Input'],
+                           self.inputs['Other'],
+                           rtol=self.rtol,
+                           atol=self.atol,
+                           equal_nan=self.equal_nan)
             ])
         }
 
@@ -54,7 +55,9 @@ def test_check_output(self):
 
 
 class TestIscloseOpException(TestIscloseOp):
+
     def test_check_output(self):
+
         def test_rtol_num():
             self.inputs['Rtol'] = np.array([1e-05, 1e-05]).astype("float64")
             self.inputs['Atol'] = np.array([1e-08]).astype("float64")
@@ -85,6 +88,7 @@ def test_atol_type():
 
 
 class TestIscloseOpSmallNum(TestIscloseOp):
+
     def set_args(self):
         self.input = np.array([10000., 1e-08]).astype("float32")
         self.other = np.array([10000.1, 1e-09]).astype("float32")
@@ -94,6 +98,7 @@ def set_args(self):
 
 
 class TestIscloseOpNanFalse(TestIscloseOp):
+
     def set_args(self):
         self.input = np.array([1.0, float('nan')]).astype("float32")
         self.other = np.array([1.0, float('nan')]).astype("float32")
@@ -103,6 +108,7 @@ def set_args(self):
 
 
 class TestIscloseOpNanTrue(TestIscloseOp):
+
     def set_args(self):
         self.input = np.array([1.0, float('nan')]).astype("float32")
         self.other = np.array([1.0, float('nan')]).astype("float32")
@@ -112,6 +118,7 @@ def set_args(self):
 
 
 class TestIscloseStatic(unittest.TestCase):
+
     def test_api_case(self):
         paddle.enable_static()
         x_data = np.random.rand(10, 10)
@@ -127,14 +134,17 @@ def test_api_case(self):
                 result = paddle.isclose(x, y)
                 exe = paddle.fluid.Executor(place)
                 fetches = exe.run(paddle.fluid.default_main_program(),
-                                  feed={"x": x_data,
-                                        "y": y_data},
+                                  feed={
+                                      "x": x_data,
+                                      "y": y_data
+                                  },
                                   fetch_list=[result])
                 expected_out = np.isclose(x_data, y_data)
                 self.assertTrue((fetches[0] == expected_out).all(), True)
 
 
 class TestIscloseDygraph(unittest.TestCase):
+
     def test_api_case(self):
         places = [paddle.CPUPlace()]
         if paddle.fluid.core.is_compiled_with_cuda():
@@ -152,6 +162,7 @@ def test_api_case(self):
 
 
 class TestIscloseError(unittest.TestCase):
+
     def test_input_dtype(self):
         paddle.enable_static()
 
@@ -195,6 +206,7 @@ def test_equal_nan():
 
 
 class TestIscloseOpFloat32(TestIscloseOp):
+
     def set_args(self):
         self.input = np.array([10.1]).astype("float32")
         self.other = np.array([10]).astype("float32")
@@ -204,6 +216,7 @@ def set_args(self):
 
 
 class TestIscloseOpFloat64(TestIscloseOp):
+
     def set_args(self):
         self.input = np.array([10.1]).astype("float64")
         self.other = np.array([10]).astype("float64")
@@ -216,6 +229,7 @@ def test_check_output(self):
 
 
 class TestIscloseOpLargeDimInput(TestIscloseOp):
+
     def set_args(self):
         self.input = np.array(np.zeros([2048, 1024])).astype("float64")
         self.other = np.array(np.zeros([2048, 1024])).astype("float64")
diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_op.py
index 83d86aff7ac9c..65b197067d9d5 100644
--- a/python/paddle/fluid/tests/unittests/test_isfinite_op.py
+++ b/python/paddle/fluid/tests/unittests/test_isfinite_op.py
@@ -22,6 +22,7 @@
 
 
 class TestInf(OpTest):
+
     def setUp(self):
         self.op_type = "isinf"
         self.dtype = np.float32
@@ -42,7 +43,9 @@ def test_output(self):
 
 
 class TestRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.isfinite([10])
 
@@ -58,11 +61,13 @@ def test_dtype():
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16Inf(TestInf):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestNAN(OpTest):
+
     def setUp(self):
         self.op_type = "isnan"
         self.dtype = np.float32
@@ -85,11 +90,13 @@ def test_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16NAN(TestNAN):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class TestIsfinite(OpTest):
+
     def setUp(self):
         self.op_type = "isfinite"
         self.dtype = np.float32
@@ -113,11 +120,13 @@ def test_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16Isfinite(TestIsfinite):
+
     def init_dtype(self):
         self.dtype = np.float16
 
 
 class BadInputTest(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
index c861f912803f9..252e43b3423dc 100644
--- a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
@@ -127,6 +127,7 @@ def test(test_case, op_str, use_gpu=False):
 
 
 class TestCPUNormal(unittest.TestCase):
+
     def test_inf(self):
         test(self, 'isinf')
 
@@ -138,6 +139,7 @@ def test_finite(self):
 
 
 class TestCUDANormal(unittest.TestCase):
+
     def test_inf(self):
         test(self, 'isinf', True)
 
@@ -149,6 +151,7 @@ def test_finite(self):
 
 
 class TestError(unittest.TestCase):
+
     def test_bad_input(self):
         paddle.enable_static()
         with fluid.program_guard(fluid.Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py b/python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py
index a938024e3c9b4..5722d36ca5e7b 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_pre_save_hooks.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,9 @@
 
 
 class TestPreSaveHooks(unittest.TestCase):
+
     def test_pre_save_hook_functions(self):
+
         def fake_func(*args, **kwgs):
             global _counter
             _counter += 1
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index 5dabf854734dd..6c1bbdac68995 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -34,6 +34,7 @@
 
 
 def random_batch_reader(input_size, label_size):
+
     def _get_random_inputs_and_labels(input_size, label_size):
         np.random.seed(SEED)
         input = np.random.random(size=input_size).astype('float32')
@@ -50,6 +51,7 @@ def __reader__():
 
 
 class LinearNet(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNet, self).__init__()
         self._linear = Linear(in_size, out_size)
@@ -60,6 +62,7 @@ def forward(self, x):
 
 
 class LinearNetWithInputSpec(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithInputSpec, self).__init__()
         self._linear = Linear(in_size, out_size)
@@ -70,6 +73,7 @@ def forward(self, x):
 
 
 class LinearNetNotDeclarative(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetNotDeclarative, self).__init__()
         self._linear = Linear(in_size, out_size)
@@ -79,14 +83,14 @@ def forward(self, x):
 
 
 class LinerNetWithLabel(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinerNetWithLabel, self).__init__()
         self._linear = Linear(in_size, out_size)
 
     @declarative(input_spec=[
-        InputSpec(
-            shape=[None, 784], dtype='float32', name="image"), InputSpec(
-                shape=[None, 1], dtype='int64', name="label")
+        InputSpec(shape=[None, 784], dtype='float32', name="image"),
+        InputSpec(shape=[None, 1], dtype='int64', name="label")
     ])
     def forward(self, x, label):
         out = self._linear(x)
@@ -96,14 +100,14 @@ def forward(self, x, label):
 
 
 class LinerNetWithPruneInput(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinerNetWithPruneInput, self).__init__()
         self._linear = Linear(in_size, out_size)
 
     @declarative(input_spec=[
-        InputSpec(
-            shape=[None, 784], dtype='float32', name="image"), InputSpec(
-                shape=[None, 1], dtype='int64', name="label")
+        InputSpec(shape=[None, 784], dtype='float32', name="image"),
+        InputSpec(shape=[None, 1], dtype='int64', name="label")
     ])
     def forward(self, x, label):
         out = self._linear(x)
@@ -113,14 +117,14 @@ def forward(self, x, label):
 
 
 class LinerNetWithUselessInput(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinerNetWithUselessInput, self).__init__()
         self._linear = Linear(in_size, out_size)
 
     @declarative(input_spec=[
-        InputSpec(
-            shape=[None, 784], dtype='float32', name="image"), InputSpec(
-                shape=[None, 1], dtype='int64', name="label")
+        InputSpec(shape=[None, 784], dtype='float32', name="image"),
+        InputSpec(shape=[None, 1], dtype='int64', name="label")
     ])
     def forward(self, x, label):
         out = self._linear(x)
@@ -128,6 +132,7 @@ def forward(self, x, label):
 
 
 class LinearNetReturnLoss(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetReturnLoss, self).__init__()
         self._linear = Linear(in_size, out_size)
@@ -141,15 +146,15 @@ def forward(self, x):
 
 
 class LinearNetMultiInput(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetMultiInput, self).__init__()
         self._linear1 = Linear(in_size, out_size)
         self._linear2 = Linear(in_size, out_size)
 
     @declarative(input_spec=[
-        InputSpec(
-            [None, 8], dtype='float32'), InputSpec(
-                [None, 8], dtype='float32')
+        InputSpec([None, 8], dtype='float32'),
+        InputSpec([None, 8], dtype='float32')
     ])
     def forward(self, x, y):
         x_out = self._linear1(x)
@@ -159,14 +164,14 @@ def forward(self, x, y):
 
 
 class LinearNetMultiInput1(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetMultiInput1, self).__init__()
         self._linear1 = Linear(in_size, out_size)
         self._linear2 = Linear(in_size, out_size)
 
-    @declarative(input_spec=(InputSpec(
-        [None, 8], dtype='float32'), InputSpec(
-            [None, 8], dtype='float32')))
+    @declarative(input_spec=(InputSpec([None, 8], dtype='float32'),
+                             InputSpec([None, 8], dtype='float32')))
     def forward(self, x, y):
         x_out = self._linear1(x)
         y_out = self._linear2(y)
@@ -175,6 +180,7 @@ def forward(self, x, y):
 
 
 class MultiLoadingLinearNet(fluid.dygraph.Layer):
+
     def __init__(self, size, model_path):
         super(MultiLoadingLinearNet, self).__init__()
         self._linear = Linear(size, size)
@@ -191,6 +197,7 @@ def forward(self, x):
 
 
 class LinearNetReturnHidden(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetReturnHidden, self).__init__()
         self._linear_1 = Linear(in_size, out_size)
@@ -205,6 +212,7 @@ def forward(self, x):
 
 
 class LinearNetWithNestOut(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithNestOut, self).__init__()
         self._linear_1 = Linear(in_size, out_size)
@@ -220,16 +228,17 @@ def forward(self, x):
 
 
 class LinearNetWithDictInput(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithDictInput, self).__init__()
         self._linear = Linear(in_size, out_size)
 
     @paddle.jit.to_static(input_spec=[{
-        'img': InputSpec(
-            shape=[None, 8], dtype='float32', name='img')
+        'img':
+        InputSpec(shape=[None, 8], dtype='float32', name='img')
     }, {
-        'label': InputSpec(
-            shape=[None, 1], dtype='int64', name='label')
+        'label':
+        InputSpec(shape=[None, 1], dtype='int64', name='label')
     }])
     def forward(self, img, label):
         out = self._linear(img['img'])
@@ -239,6 +248,7 @@ def forward(self, img, label):
 
 
 class LinearNetWithDictInputNoPrune(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithDictInputNoPrune, self).__init__()
         self._linear = Linear(in_size, out_size)
@@ -249,6 +259,7 @@ def forward(self, img):
 
 
 class EmptyLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(EmptyLayer, self).__init__()
 
@@ -258,6 +269,7 @@ def forward(self, x):
 
 
 class NoParamLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(NoParamLayer, self).__init__()
 
@@ -267,6 +279,7 @@ def forward(self, x, y):
 
 
 class LinearNetWithMultiStaticFunc(fluid.dygraph.Layer):
+
     def __init__(self, in_size, out_size):
         super(LinearNetWithMultiStaticFunc, self).__init__()
         self._linear_0 = Linear(in_size, out_size)
@@ -288,12 +301,12 @@ def forward_general(self, x):
 
 def train(layer, input_size=784, label_size=1):
     # create optimizer
-    sgd = fluid.optimizer.SGDOptimizer(
-        learning_rate=0.01, parameter_list=layer.parameters())
+    sgd = fluid.optimizer.SGDOptimizer(learning_rate=0.01,
+                                       parameter_list=layer.parameters())
     # create data loader
     train_loader = fluid.io.DataLoader.from_generator(capacity=5)
-    train_loader.set_batch_generator(
-        random_batch_reader(input_size, label_size))
+    train_loader.set_batch_generator(random_batch_reader(
+        input_size, label_size))
     # train
     for data in train_loader():
         img, label = data
@@ -312,12 +325,12 @@ def train(layer, input_size=784, label_size=1):
 
 def train_with_label(layer, input_size=784, label_size=1):
     # create optimizer
-    sgd = fluid.optimizer.SGDOptimizer(
-        learning_rate=0.01, parameter_list=layer.parameters())
+    sgd = fluid.optimizer.SGDOptimizer(learning_rate=0.01,
+                                       parameter_list=layer.parameters())
     # create data loader
     train_loader = fluid.io.DataLoader.from_generator(capacity=5)
-    train_loader.set_batch_generator(
-        random_batch_reader(input_size, label_size))
+    train_loader.set_batch_generator(random_batch_reader(
+        input_size, label_size))
     # train
     for data in train_loader():
         img, label = data
@@ -332,6 +345,7 @@ def train_with_label(layer, input_size=784, label_size=1):
 
 
 class TestJitSaveLoad(unittest.TestCase):
+
     def setUp(self):
         self.model_path = "test_jit_save_load/model"
         # enable dygraph mode
@@ -345,8 +359,9 @@ def train_and_save_model(self, model_path=None):
         example_inputs, layer, _ = train(layer)
         final_model_path = model_path if model_path else self.model_path
         orig_input_types = [type(x) for x in example_inputs]
-        paddle.jit.save(
-            layer=layer, path=final_model_path, input_spec=example_inputs)
+        paddle.jit.save(layer=layer,
+                        path=final_model_path,
+                        input_spec=example_inputs)
         new_input_types = [type(x) for x in example_inputs]
         self.assertEqual(orig_input_types, new_input_types)
         return layer
@@ -367,7 +382,8 @@ def load_and_inference(self, train_layer, infer_layer):
         x = fluid.dygraph.to_variable(
             np.random.random((1, 784)).astype('float32'))
         self.assertTrue(
-            np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
+            np.array_equal(train_layer(x).numpy(),
+                           infer_layer(x).numpy()))
 
     def load_and_finetune(self, train_layer, load_train_layer):
         train_layer.train()
@@ -392,7 +408,8 @@ def load_dygraph_state_dict(self, train_layer):
         x = fluid.dygraph.to_variable(
             np.random.random((1, 784)).astype('float32'))
         self.assertTrue(
-            np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
+            np.array_equal(train_layer(x).numpy(),
+                           new_layer(x).numpy()))
 
     def test_load_dygraph_no_path(self):
         model_path = "test_jit_save_load.no_path/model_path"
@@ -406,6 +423,7 @@ def test_jit_load_no_path(self):
 
 
 class TestSaveLoadWithNestOut(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         fluid.enable_dygraph()
@@ -430,26 +448,28 @@ def test_nest_output(self):
 
 
 class TestSaveLoadWithDictInput(unittest.TestCase):
+
     def test_dict_input(self):
-        # NOTE: This net cannot be executed, it is just 
+        # NOTE: This net cannot be executed, it is just
         # a special case for exporting models in model validation
         # We DO NOT recommend this writing way of Layer
         net = LinearNetWithDictInput(8, 8)
-        # net.forward.concrete_program.inputs: 
-        # (<__main__.LinearNetWithDictInput object at 0x7f2655298a98>, 
-        #  {'img': var img : fluid.VarType.LOD_TENSOR.shape(-1, 8).astype(VarType.FP32)}, 
+        # net.forward.concrete_program.inputs:
+        # (<__main__.LinearNetWithDictInput object at 0x7f2655298a98>,
+        #  {'img': var img : fluid.VarType.LOD_TENSOR.shape(-1, 8).astype(VarType.FP32)},
         #  {'label': var label : fluid.VarType.LOD_TENSOR.shape(-1, 1).astype(VarType.INT64)})
         self.assertEqual(len(net.forward.concrete_program.inputs), 3)
 
         path = "test_jit_save_load_with_dict_input/model"
         # prune inputs
-        paddle.jit.save(
-            layer=net,
-            path=path,
-            input_spec=[{
-                'img': InputSpec(
-                    shape=[None, 8], dtype='float32', name='img')
-            }])
+        paddle.jit.save(layer=net,
+                        path=path,
+                        input_spec=[{
+                            'img':
+                            InputSpec(shape=[None, 8],
+                                      dtype='float32',
+                                      name='img')
+                        }])
 
         img = paddle.randn(shape=[4, 8], dtype='float32')
         loaded_net = paddle.jit.load(path)
@@ -461,20 +481,24 @@ def test_dict_input(self):
 
 
 class TestSaveLoadWithDictInputNoPrune(unittest.TestCase):
+
     def test_dict_input(self):
         net = LinearNetWithDictInputNoPrune(8, 8)
 
         path = "test_jit_save_load_with_dict_input_no_prune/model"
         # prune inputs
-        paddle.jit.save(
-            layer=net,
-            path=path,
-            input_spec=[{
-                'img': InputSpec(
-                    shape=[None, 8], dtype='float32', name='img'),
-                'img2': InputSpec(
-                    shape=[None, 8], dtype='float32', name='img2')
-            }])
+        paddle.jit.save(layer=net,
+                        path=path,
+                        input_spec=[{
+                            'img':
+                            InputSpec(shape=[None, 8],
+                                      dtype='float32',
+                                      name='img'),
+                            'img2':
+                            InputSpec(shape=[None, 8],
+                                      dtype='float32',
+                                      name='img2')
+                        }])
 
         img = paddle.randn(shape=[4, 8], dtype='float32')
         img2 = paddle.randn(shape=[4, 8], dtype='float32')
@@ -485,6 +509,7 @@ def test_dict_input(self):
 
 
 class TestSaveLoadWithInputSpec(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         fluid.enable_dygraph()
@@ -492,9 +517,8 @@ def setUp(self):
     def test_with_input_spec(self):
         net = LinearNetReturnLoss(8, 8)
         # set x.shape = [None, 8]
-        net.forward = declarative(
-            net.forward, input_spec=[InputSpec(
-                [None, 8], name='x')])
+        net.forward = declarative(net.forward,
+                                  input_spec=[InputSpec([None, 8], name='x')])
 
         model_path = "input_spec.output_spec/model"
         # check inputs and outputs
@@ -587,6 +611,7 @@ def test_multi_in_out1(self):
 
 
 class TestJitSaveLoadConfig(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         fluid.enable_dygraph()
@@ -608,18 +633,18 @@ def test_output_spec(self):
 
         model_path = "save_load_config.output_spec"
         output_spec = [out]
-        paddle.jit.save(
-            layer=train_layer,
-            path=model_path,
-            input_spec=[x],
-            output_spec=output_spec)
+        paddle.jit.save(layer=train_layer,
+                        path=model_path,
+                        input_spec=[x],
+                        output_spec=output_spec)
 
         train_layer.eval()
         infer_layer = paddle.jit.load(model_path)
         x = fluid.dygraph.to_variable(
             np.random.random((4, 8)).astype('float32'))
         self.assertTrue(
-            np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
+            np.array_equal(train_layer(x)[0].numpy(),
+                           infer_layer(x).numpy()))
 
     def test_save_no_support_config_error(self):
         layer = LinearNet(784, 1)
@@ -644,6 +669,7 @@ def test_load_with_no_support_config(self):
 
 
 class TestJitMultipleLoading(unittest.TestCase):
+
     def setUp(self):
         self.linear_size = 4
         self.model_path = "jit_multi_load/model"
@@ -658,8 +684,9 @@ def setUp(self):
     def train_and_save_orig_model(self):
         layer = LinearNet(self.linear_size, self.linear_size)
         example_inputs, layer, _ = train(layer, self.linear_size, 1)
-        paddle.jit.save(
-            layer=layer, path=self.model_path, input_spec=example_inputs)
+        paddle.jit.save(layer=layer,
+                        path=self.model_path,
+                        input_spec=example_inputs)
 
     def test_load_model_retransform_inference(self):
         multi_loaded_layer = MultiLoadingLinearNet(self.linear_size,
@@ -672,6 +699,7 @@ def test_load_model_retransform_inference(self):
 
 
 class TestJitPruneModelAndLoad(unittest.TestCase):
+
     def setUp(self):
         self.linear_size = 4
         self.model_path = "jit_prune_model_and_load/model"
@@ -694,11 +722,10 @@ def train_and_save(self):
             train_layer.clear_gradients()
 
         output_spec = [hidden]
-        paddle.jit.save(
-            layer=train_layer,
-            path=self.model_path,
-            input_spec=[x],
-            output_spec=output_spec)
+        paddle.jit.save(layer=train_layer,
+                        path=self.model_path,
+                        input_spec=[x],
+                        output_spec=output_spec)
 
         return train_layer
 
@@ -711,7 +738,8 @@ def test_load_pruned_model(self):
         x = fluid.dygraph.to_variable(
             np.random.random((4, 8)).astype('float32'))
         self.assertTrue(
-            np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
+            np.array_equal(train_layer(x)[0].numpy(),
+                           infer_layer(x).numpy()))
 
     def test_load_var_not_in_extra_var_info(self):
         self.train_and_save()
@@ -729,6 +757,7 @@ def test_load_var_not_in_extra_var_info(self):
 
 
 class TestJitSaveMultiCases(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         fluid.enable_dygraph()
@@ -789,8 +818,7 @@ def test_no_prune_no_to_static_after_train(self):
         paddle.jit.save(
             layer,
             model_path,
-            input_spec=[InputSpec(
-                shape=[None, 784], dtype='float32')])
+            input_spec=[InputSpec(shape=[None, 784], dtype='float32')])
 
         self.verify_inference_correctness(layer, model_path)
 
@@ -811,8 +839,7 @@ def test_no_prune_no_to_static_no_train(self):
         paddle.jit.save(
             layer,
             model_path,
-            input_spec=[InputSpec(
-                shape=[None, 784], dtype='float32')])
+            input_spec=[InputSpec(shape=[None, 784], dtype='float32')])
 
         self.verify_inference_correctness(layer, model_path)
 
@@ -822,17 +849,18 @@ def test_prune_to_static_after_train(self):
         out = train_with_label(layer)
 
         model_path = "test_prune_to_static_after_train/model"
-        paddle.jit.save(
-            layer,
-            model_path,
-            input_spec=[
-                InputSpec(
-                    shape=[None, 784], dtype='float32', name="image")
-            ],
-            output_spec=[out])
-
-        self.verify_inference_correctness(
-            layer, model_path, with_label_and_loss=True)
+        paddle.jit.save(layer,
+                        model_path,
+                        input_spec=[
+                            InputSpec(shape=[None, 784],
+                                      dtype='float32',
+                                      name="image")
+                        ],
+                        output_spec=[out])
+
+        self.verify_inference_correctness(layer,
+                                          model_path,
+                                          with_label_and_loss=True)
 
     def test_prune_to_static_no_train(self):
         layer = LinerNetWithLabel(784, 1)
@@ -841,29 +869,30 @@ def test_prune_to_static_no_train(self):
         # TODO: no train, cannot get output_spec var here
         # now only can use index
         output_spec = layer.forward.outputs[:1]
-        paddle.jit.save(
-            layer,
-            model_path,
-            input_spec=[
-                InputSpec(
-                    shape=[None, 784], dtype='float32', name="image")
-            ],
-            output_spec=output_spec)
-
-        self.verify_inference_correctness(
-            layer, model_path, with_label_and_loss=True)
+        paddle.jit.save(layer,
+                        model_path,
+                        input_spec=[
+                            InputSpec(shape=[None, 784],
+                                      dtype='float32',
+                                      name="image")
+                        ],
+                        output_spec=output_spec)
+
+        self.verify_inference_correctness(layer,
+                                          model_path,
+                                          with_label_and_loss=True)
 
     def test_prune_input_to_static_no_train(self):
         layer = LinerNetWithPruneInput(784, 1)
 
         model_path = "test_prune_input_to_static_no_train/model"
-        paddle.jit.save(
-            layer,
-            model_path,
-            input_spec=[
-                InputSpec(
-                    shape=[None, 784], dtype='float32', name="image")
-            ])
+        paddle.jit.save(layer,
+                        model_path,
+                        input_spec=[
+                            InputSpec(shape=[None, 784],
+                                      dtype='float32',
+                                      name="image")
+                        ])
 
         self.verify_inference_correctness(layer, model_path, with_label=True)
 
@@ -871,13 +900,13 @@ def test_prune_useless_input_to_static_no_train(self):
         layer = LinerNetWithUselessInput(784, 1)
 
         model_path = "test_prune_useless_input_to_static_no_train/model"
-        paddle.jit.save(
-            layer,
-            model_path,
-            input_spec=[
-                InputSpec(
-                    shape=[None, 784], dtype='float32', name="image")
-            ])
+        paddle.jit.save(layer,
+                        model_path,
+                        input_spec=[
+                            InputSpec(shape=[None, 784],
+                                      dtype='float32',
+                                      name="image")
+                        ])
 
         self.verify_inference_correctness(layer, model_path, with_label=True)
 
@@ -890,15 +919,14 @@ def test_no_prune_input_spec_name_warning(self):
         paddle.jit.save(
             layer,
             model_path,
-            input_spec=[InputSpec(
-                shape=[None, 784], dtype='float32')])
-        paddle.jit.save(
-            layer,
-            model_path,
-            input_spec=[
-                InputSpec(
-                    shape=[None, 784], dtype='float32', name='feed_input')
-            ])
+            input_spec=[InputSpec(shape=[None, 784], dtype='float32')])
+        paddle.jit.save(layer,
+                        model_path,
+                        input_spec=[
+                            InputSpec(shape=[None, 784],
+                                      dtype='float32',
+                                      name='feed_input')
+                        ])
 
         self.verify_inference_correctness(layer, model_path)
 
@@ -921,16 +949,15 @@ def test_prune_input_spec_name_error(self):
             paddle.jit.save(
                 layer,
                 model_path,
-                input_spec=[InputSpec(
-                    shape=[None, 784], dtype='float32')])
+                input_spec=[InputSpec(shape=[None, 784], dtype='float32')])
         with self.assertRaises(ValueError):
-            paddle.jit.save(
-                layer,
-                model_path,
-                input_spec=[
-                    InputSpec(
-                        shape=[None, 784], dtype='float32', name='feed_input')
-                ])
+            paddle.jit.save(layer,
+                            model_path,
+                            input_spec=[
+                                InputSpec(shape=[None, 784],
+                                          dtype='float32',
+                                          name='feed_input')
+                            ])
 
     def test_prune_output_spec_name_error(self):
         layer = LinerNetWithLabel(784, 1)
@@ -940,17 +967,18 @@ def test_prune_output_spec_name_error(self):
         model_path = "test_prune_to_static_after_train/model"
         out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
         with self.assertRaises(ValueError):
-            paddle.jit.save(
-                layer,
-                model_path,
-                input_spec=[
-                    InputSpec(
-                        shape=[None, 784], dtype='float32', name="image")
-                ],
-                output_spec=[out])
+            paddle.jit.save(layer,
+                            model_path,
+                            input_spec=[
+                                InputSpec(shape=[None, 784],
+                                          dtype='float32',
+                                          name="image")
+                            ],
+                            output_spec=[out])
 
 
 class TestJitSaveLoadEmptyLayer(unittest.TestCase):
+
     def setUp(self):
         self.model_path = "jit_save_load_empty_layer/model"
         # enable dygraph mode
@@ -967,6 +995,7 @@ def test_save_load_empty_layer(self):
 
 
 class TestJitSaveLoadNoParamLayer(unittest.TestCase):
+
     def setUp(self):
         self.model_path = "jit_save_load_no_param_layer/model"
         # enable dygraph mode
@@ -984,6 +1013,7 @@ def test_save_load_no_param_layer(self):
 
 
 class TestJitSaveLoadMultiMethods(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         paddle.disable_static()
@@ -1001,15 +1031,16 @@ def test_jit_save_load_inference(self):
         load_net = paddle.jit.load(model_path_inference)
         for func, result in result_origin.items():
             self.assertTrue(
-                float((result - getattr(load_net, func, None)(inps)).abs().max(
-                )) < 1e-5)
+                float((result -
+                       getattr(load_net, func, None)(inps)).abs().max()) < 1e-5)
 
     def test_jit_save_load_multi_methods_inputspec(self):
         model_path = 'jit_save_load_multi_methods/model'
         layer = LinearNetWithMultiStaticFunc(784, 1)
         with self.assertRaises(ValueError):
-            paddle.jit.save(
-                layer, model_path, input_spec=[InputSpec(shape=[None, 784])])
+            paddle.jit.save(layer,
+                            model_path,
+                            input_spec=[InputSpec(shape=[None, 784])])
 
     def test_parse_name(self):
         model_path_inference = "jit_save_load_parse_name/model"
@@ -1025,6 +1056,7 @@ def test_parse_name(self):
 
 
 class LayerSaved(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(LayerSaved, self).__init__()
         self.hidden = 100
@@ -1046,6 +1078,7 @@ def forward(self, x):
 
 
 class LayerLoadFinetune(paddle.nn.Layer):
+
     def __init__(self, in_size, out_size, load_path):
         super(LayerLoadFinetune, self).__init__()
         # Test duplicate name
@@ -1079,6 +1112,7 @@ def forward(self, x):
 
 
 class TestJitSaveLoadSaveWithoutRunning(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         paddle.disable_static()
@@ -1092,25 +1126,23 @@ def test_save_load_finetune_load(self):
         with unique_name.guard():
             layer_save = LayerSaved(IMAGE_SIZE, IMAGE_SIZE)
         #save
-        paddle.jit.save(
-            layer_save,
-            model_path,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, IMAGE_SIZE], dtype='float32')
-            ])
+        paddle.jit.save(layer_save,
+                        model_path,
+                        input_spec=[
+                            paddle.static.InputSpec(shape=[None, IMAGE_SIZE],
+                                                    dtype='float32')
+                        ])
         result_00 = layer_save(inps0)
         result_01 = layer_save(inps1)
         #load and save without running
         with unique_name.guard():
             layer_load = paddle.jit.load(model_path)
-            paddle.jit.save(
-                layer_load,
-                model_path,
-                input_spec=[
-                    paddle.static.InputSpec(
-                        shape=[None, IMAGE_SIZE], dtype='float32')
-                ])
+            paddle.jit.save(layer_load,
+                            model_path,
+                            input_spec=[
+                                paddle.static.InputSpec(
+                                    shape=[None, IMAGE_SIZE], dtype='float32')
+                            ])
         #reload
         layer_reload = paddle.jit.load(model_path)
         result_10 = layer_reload(inps0)
@@ -1121,6 +1153,7 @@ def test_save_load_finetune_load(self):
 
 
 class TestJitSaveLoadFinetuneLoad(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         paddle.disable_static()
@@ -1154,16 +1187,18 @@ def test_save_load_finetune_load(self):
         self.assertTrue(float(((result_01 - result_11)).abs().max()) < 1e-5)
 
 
-# NOTE(weixin): When there are multiple test functions in an 
-# `unittest.TestCase`, functions will affect each other, 
-# and there is a risk of random failure. 
-# So divided into three TestCase: TestJitSaveLoadFunctionCase1, 
+# NOTE(weixin): When there are multiple test functions in an
+# `unittest.TestCase`, functions will affect each other,
+# and there is a risk of random failure.
+# So divided into three TestCase: TestJitSaveLoadFunctionCase1,
 # TestJitSaveLoadFunctionCase2, TestJitSaveLoadFunctionCase3.
 class TestJitSaveLoadFunctionCase1(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
     def test_jit_save_load_static_function(self):
+
         @paddle.jit.to_static
         def fun(inputs):
             return paddle.tanh(inputs)
@@ -1180,13 +1215,14 @@ def fun(inputs):
 
 
 class TestJitSaveLoadFunctionCase2(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
     def test_jit_save_load_function_input_spec(self):
+
         @paddle.jit.to_static(input_spec=[
-            InputSpec(
-                shape=[None, 6], dtype='float32', name='x'),
+            InputSpec(shape=[None, 6], dtype='float32', name='x'),
         ])
         def fun(inputs):
             return paddle.nn.functional.relu(inputs)
@@ -1202,10 +1238,12 @@ def fun(inputs):
 
 
 class TestJitSaveLoadFunctionCase3(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
     def test_jit_save_load_function_function(self):
+
         def fun(inputs):
             return paddle.tanh(inputs)
 
@@ -1213,13 +1251,13 @@ def fun(inputs):
         inps = paddle.rand([3, 6])
         origin = fun(inps)
 
-        paddle.jit.save(
-            fun,
-            path,
-            input_spec=[
-                InputSpec(
-                    shape=[None, 6], dtype='float32', name='x'),
-            ])
+        paddle.jit.save(fun,
+                        path,
+                        input_spec=[
+                            InputSpec(shape=[None, 6],
+                                      dtype='float32',
+                                      name='x'),
+                        ])
         load_func = paddle.jit.load(path)
 
         load_result = load_func(inps)
@@ -1227,11 +1265,14 @@ def fun(inputs):
 
 
 class TestJitSaveLoadFunctionWithParamCase1(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
     def test_jit_save_load_function(self):
+
         class LinearNet(paddle.nn.Layer):
+
             def __init__(self):
                 super(LinearNet, self).__init__()
                 self._linear = paddle.nn.Linear(5, 6)
@@ -1247,8 +1288,8 @@ def anothor_forward(self, x):
         inps = paddle.rand([3, 5])
         origin = layer.anothor_forward(inps)
 
-        func = paddle.jit.to_static(
-            layer.anothor_forward, [paddle.static.InputSpec(shape=[-1, 5])])
+        func = paddle.jit.to_static(layer.anothor_forward,
+                                    [paddle.static.InputSpec(shape=[-1, 5])])
         path = 'test_jit_save_load_function_with_params_case1/func'
         paddle.jit.save(func, path)
         load_func = paddle.jit.load(path)
@@ -1258,11 +1299,14 @@ def anothor_forward(self, x):
 
 
 class TestJitSaveLoadFunctionWithParamCase2(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
     def test_jit_save_load_function(self):
+
         class LinearNet(paddle.nn.Layer):
+
             def __init__(self):
                 super(LinearNet, self).__init__()
                 self._linear = paddle.nn.Linear(5, 6)
@@ -1290,11 +1334,14 @@ def anothor_forward(self, x):
 
 
 class TestJitSaveLoadFunctionWithParamCase3(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
     def test_jit_save_load_function(self):
+
         class LinearNet(paddle.nn.Layer):
+
             def __init__(self):
                 super(LinearNet, self).__init__()
                 self._linear = paddle.nn.Linear(5, 6)
@@ -1320,6 +1367,7 @@ def anothor_forward(self, x):
 
 
 class TestJitSaveLoadDataParallel(unittest.TestCase):
+
     def verify_inference_correctness(self, layer, path):
         layer.eval()
         loaded_layer = paddle.jit.load(path)
@@ -1338,8 +1386,9 @@ def test_jit_save_data_parallel_with_inputspec(self):
         layer = paddle.DataParallel(layer)
 
         path = "jit_save_data_parallel_with_inputspec/model"
-        paddle.jit.save(
-            layer=layer, path=path, input_spec=[InputSpec(shape=[None, 784])])
+        paddle.jit.save(layer=layer,
+                        path=path,
+                        input_spec=[InputSpec(shape=[None, 784])])
 
         self.verify_inference_correctness(layer, path)
 
@@ -1359,15 +1408,15 @@ class InputSepcLayer(paddle.nn.Layer):
     '''
 
     @paddle.jit.to_static(input_spec=[
-        InputSpec(
-            shape=[None, 8], dtype='float32', name='x'), InputSpec(
-                shape=[None, 1], dtype='float64', name='y')
+        InputSpec(shape=[None, 8], dtype='float32', name='x'),
+        InputSpec(shape=[None, 1], dtype='float64', name='y')
     ])
     def forward(self, x, y):
         return x, y
 
 
 class TestInputSpecCompatibility(unittest.TestCase):
+
     def _assert_input_spec_layer_return(self, expect_layer, test_layer):
         input_x = paddle.uniform([8, 8], dtype='float32')
         input_y = paddle.uniform([8, 1], dtype='float64')
@@ -1388,26 +1437,26 @@ def test_jit_save_compatible_input_sepc(self):
         self._assert_input_spec_layer_return(layer, no_input_spec_layer)
         shutil.rmtree(save_dir)
 
-        paddle.jit.save(
-            layer=layer,
-            path=path,
-            input_spec=[
-                InputSpec(
-                    shape=[None, 8], dtype='float32', name='x'), InputSpec(
-                        shape=[None, 1], dtype='float64', name='y')
-            ])
+        paddle.jit.save(layer=layer,
+                        path=path,
+                        input_spec=[
+                            InputSpec(shape=[None, 8],
+                                      dtype='float32',
+                                      name='x'),
+                            InputSpec(shape=[None, 1],
+                                      dtype='float64',
+                                      name='y')
+                        ])
         same_input_spec_layer = paddle.jit.load(path)
         self._assert_input_spec_layer_return(layer, same_input_spec_layer)
         shutil.rmtree(save_dir)
 
-        paddle.jit.save(
-            layer=layer,
-            path=path,
-            input_spec=[
-                InputSpec(
-                    shape=[8, 8], dtype='float32'), InputSpec(
-                        shape=[8, -1], dtype='float64')
-            ])
+        paddle.jit.save(layer=layer,
+                        path=path,
+                        input_spec=[
+                            InputSpec(shape=[8, 8], dtype='float32'),
+                            InputSpec(shape=[8, -1], dtype='float64')
+                        ])
         compatible_input_spec_layer = paddle.jit.load(path)
         self._assert_input_spec_layer_return(layer, compatible_input_spec_layer)
         shutil.rmtree(save_dir)
@@ -1419,36 +1468,30 @@ def test_jit_save_incompatible_input_sepc(self):
 
         with self.assertRaises(ValueError):
             # type mismatch
-            paddle.jit.save(
-                layer=layer,
-                path=path,
-                input_spec=[
-                    InputSpec(
-                        shape=[None, 8], dtype='float64'), InputSpec(
-                            shape=[None, 1], dtype='float64')
-                ])
+            paddle.jit.save(layer=layer,
+                            path=path,
+                            input_spec=[
+                                InputSpec(shape=[None, 8], dtype='float64'),
+                                InputSpec(shape=[None, 1], dtype='float64')
+                            ])
 
         with self.assertRaises(ValueError):
             # shape len mismatch
-            paddle.jit.save(
-                layer=layer,
-                path=path,
-                input_spec=[
-                    InputSpec(
-                        shape=[None, 8, 1], dtype='float32'), InputSpec(
-                            shape=[None, 1], dtype='float64')
-                ])
+            paddle.jit.save(layer=layer,
+                            path=path,
+                            input_spec=[
+                                InputSpec(shape=[None, 8, 1], dtype='float32'),
+                                InputSpec(shape=[None, 1], dtype='float64')
+                            ])
 
         with self.assertRaises(ValueError):
             # shape mismatch
-            paddle.jit.save(
-                layer=layer,
-                path=path,
-                input_spec=[
-                    InputSpec(
-                        shape=[None, 8], dtype='float32'), InputSpec(
-                            shape=[None, 2], dtype='float64')
-                ])
+            paddle.jit.save(layer=layer,
+                            path=path,
+                            input_spec=[
+                                InputSpec(shape=[None, 8], dtype='float32'),
+                                InputSpec(shape=[None, 2], dtype='float64')
+                            ])
         if os.path.exists(save_dir):
             shutil.rmtree(save_dir)
 
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
index aa94cf2d35cc7..930d8666ba142 100644
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
@@ -38,6 +38,7 @@ def kldiv_loss(x, target, reduction):
 
 
 class TestKLDivLossOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = 'kldiv_loss'
@@ -58,8 +59,10 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Loss', no_grad_set=set(["Target"]), check_eager=True)
+        self.check_grad(['X'],
+                        'Loss',
+                        no_grad_set=set(["Target"]),
+                        check_eager=True)
 
     def initTestCase(self):
         self.x_shape = (4, 5, 5)
@@ -67,24 +70,28 @@ def initTestCase(self):
 
 
 class TestKLDivLossOp2(TestKLDivLossOp):
+
     def initTestCase(self):
         self.x_shape = (3, 2, 7, 7)
         self.reduction = 'none'
 
 
 class TestKLDivLossOp3(TestKLDivLossOp):
+
     def initTestCase(self):
         self.x_shape = (2, 3, 5, 7, 9)
         self.reduction = 'mean'
 
 
 class TestKLDivLossOp4(TestKLDivLossOp):
+
     def initTestCase(self):
         self.x_shape = (5, 20)
         self.reduction = 'sum'
 
 
 class TestKLDivLossDygraph(unittest.TestCase):
+
     def run_kl_loss(self, reduction, shape=(5, 20)):
         x = np.random.uniform(-10, 10, shape).astype('float64')
         target = np.random.uniform(-10, 10, shape).astype('float64')
@@ -92,8 +99,8 @@ def run_kl_loss(self, reduction, shape=(5, 20)):
 
         with paddle.fluid.dygraph.guard():
             kldiv_criterion = paddle.nn.KLDivLoss(reduction)
-            pred_loss = kldiv_criterion(
-                paddle.to_tensor(x), paddle.to_tensor(target))
+            pred_loss = kldiv_criterion(paddle.to_tensor(x),
+                                        paddle.to_tensor(target))
             self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss))
 
     def test_kl_loss_batchmean(self):
@@ -121,6 +128,7 @@ def test_kl_loss_static_api(self):
 
 
 class TestKLDivLossTypePromotion(unittest.TestCase):
+
     def test_kl_div_promotion(self):
 
         with paddle.fluid.dygraph.guard():
diff --git a/python/paddle/fluid/tests/unittests/test_kron_op.py b/python/paddle/fluid/tests/unittests/test_kron_op.py
index f4d013b7c6a3e..61b5b92c007e9 100644
--- a/python/paddle/fluid/tests/unittests/test_kron_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kron_op.py
@@ -25,6 +25,7 @@
 
 
 class TestKronOp(OpTest):
+
     def setUp(self):
         self.op_type = "kron"
         self.python_api = paddle.kron
@@ -52,6 +53,7 @@ def test_check_grad_ignore_y(self):
 
 
 class TestKronOp2(TestKronOp):
+
     def setUp(self):
         self.op_type = "kron"
         self.python_api = paddle.kron
@@ -64,6 +66,7 @@ def setUp(self):
 
 
 class TestKronOp3(TestKronOp):
+
     def setUp(self):
         self.op_type = "kron"
         self.python_api = paddle.kron
@@ -76,6 +79,7 @@ def setUp(self):
 
 
 class TestKronLayer(unittest.TestCase):
+
     def test_case(self):
         a = np.random.randn(10, 10).astype(np.float64)
         b = np.random.randn(10, 10).astype(np.float64)
@@ -112,6 +116,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestComplexKronOp(OpTest):
+
     def setUp(self):
         self.op_type = "kron"
         self.python_api = paddle.kron
@@ -173,33 +178,31 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=True)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=True)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=True)
 
 
 class TestKronOpTypePromotion(TestComplexKronOp):
+
     def init_input_output(self):
         self.x = np.random.random(self.x_shape).astype(self.dtype)
         self.y = np.random.random(self.y_shape).astype(
diff --git a/python/paddle/fluid/tests/unittests/test_kthvalue_op.py b/python/paddle/fluid/tests/unittests/test_kthvalue_op.py
index e1b1422580983..66eb8ab4f31fb 100644
--- a/python/paddle/fluid/tests/unittests/test_kthvalue_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kthvalue_op.py
@@ -35,6 +35,7 @@ def cal_kthvalue(x, k, axis, keepdim=False):
 
 
 class TestKthvalueOp(OpTest):
+
     def init_args(self):
         self.k = 5
         self.axis = -1
@@ -47,8 +48,9 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis}
-        output, indices = cal_kthvalue(
-            self.input_data, k=self.k, axis=self.axis)
+        output, indices = cal_kthvalue(self.input_data,
+                                       k=self.k,
+                                       axis=self.axis)
         self.outputs = {'Out': output, 'Indices': indices}
 
     def test_check_output(self):
@@ -61,6 +63,7 @@ def test_check_grad(self):
 
 
 class TestKthvalueOpWithKeepdim(OpTest):
+
     def init_args(self):
         self.k = 2
         self.axis = 1
@@ -73,8 +76,10 @@ def setUp(self):
         self.input_data = np.random.random((1, 3, 2, 4, 10))
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'keepdim': True}
-        output, indices = cal_kthvalue(
-            self.input_data, k=self.k, axis=self.axis, keepdim=True)
+        output, indices = cal_kthvalue(self.input_data,
+                                       k=self.k,
+                                       axis=self.axis,
+                                       keepdim=True)
         self.outputs = {'Out': output, 'Indices': indices}
 
     def test_check_output(self):
@@ -87,6 +92,7 @@ def test_check_grad(self):
 
 
 class TestKthvalueOpKernels(unittest.TestCase):
+
     def setUp(self):
         self.axises = [2, -1]
 
@@ -123,11 +129,13 @@ def test_gpu_kernel():
 
 
 class TestKthvalueOpWithNaN(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         self.x = paddle.uniform([2, 200, 10], dtype='float32')
 
     def test_errors(self):
+
         def test_nan_in_cpu_kernel():
             paddle.set_device('cpu')
             nan_position = 100
@@ -150,6 +158,7 @@ def test_nan_in_gpu_kernel():
 
 
 class TestKthvalueOpErrors(unittest.TestCase):
+
     def setUp(self):
         self.x = paddle.uniform([2, 10, 20, 25], dtype='float32')
 
@@ -173,6 +182,7 @@ def test_dim_range_error():
 
 
 class TestModeOpInStatic(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(666)
         self.input_data = np.random.random((2, 20, 1, 2, 80)).astype(np.float64)
@@ -182,8 +192,9 @@ def test_run_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            input_tensor = paddle.static.data(
-                name="x", shape=[2, 20, 1, 2, 80], dtype="float64")
+            input_tensor = paddle.static.data(name="x",
+                                              shape=[2, 20, 1, 2, 80],
+                                              dtype="float64")
             result = paddle.kthvalue(input_tensor, self.k, axis=1)
             expect_value = cal_kthvalue(self.input_data, self.k, axis=1)[0]
             exe = paddle.static.Executor(paddle.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py
index c35188623b440..01d9dba7b420c 100644
--- a/python/paddle/fluid/tests/unittests/test_l1_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py
@@ -21,6 +21,7 @@
 
 
 class TestFunctionalL1Loss(unittest.TestCase):
+
     def setUp(self):
         self.input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
         self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
@@ -44,10 +45,12 @@ def run_imperative(self):
         self.assertTrue(dy_result.shape, [10, 10, 5])
 
     def run_static(self, use_gpu=False):
-        input = paddle.fluid.data(
-            name='input', shape=[10, 10, 5], dtype='float32')
-        label = paddle.fluid.data(
-            name='label', shape=[10, 10, 5], dtype='float32')
+        input = paddle.fluid.data(name='input',
+                                  shape=[10, 10, 5],
+                                  dtype='float32')
+        label = paddle.fluid.data(name='label',
+                                  shape=[10, 10, 5],
+                                  dtype='float32')
         result0 = paddle.nn.functional.l1_loss(input, label)
         result1 = paddle.nn.functional.l1_loss(input, label, reduction='sum')
         result2 = paddle.nn.functional.l1_loss(input, label, reduction='none')
@@ -56,10 +59,11 @@ def run_static(self, use_gpu=False):
         place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        static_result = exe.run(
-            feed={"input": self.input_np,
-                  "label": self.label_np},
-            fetch_list=[result0, result1, result2])
+        static_result = exe.run(feed={
+            "input": self.input_np,
+            "label": self.label_np
+        },
+                                fetch_list=[result0, result1, result2])
 
         expected = np.mean(np.abs(self.input_np - self.label_np))
         self.assertTrue(np.allclose(static_result[0], expected))
@@ -91,18 +95,23 @@ def test_gpu(self):
 
     # test case the raise message
     def test_errors(self):
+
         def test_value_error():
-            input = paddle.fluid.data(
-                name='input', shape=[10, 10, 5], dtype='float32')
-            label = paddle.fluid.data(
-                name='label', shape=[10, 10, 5], dtype='float32')
-            loss = paddle.nn.functional.l1_loss(
-                input, label, reduction='reduce_mean')
+            input = paddle.fluid.data(name='input',
+                                      shape=[10, 10, 5],
+                                      dtype='float32')
+            label = paddle.fluid.data(name='label',
+                                      shape=[10, 10, 5],
+                                      dtype='float32')
+            loss = paddle.nn.functional.l1_loss(input,
+                                                label,
+                                                reduction='reduce_mean')
 
         self.assertRaises(ValueError, test_value_error)
 
 
 class TestClassL1Loss(unittest.TestCase):
+
     def setUp(self):
         self.input_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
         self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
@@ -129,10 +138,12 @@ def run_imperative(self):
         self.assertTrue(dy_result.shape, [10, 10, 5])
 
     def run_static(self, use_gpu=False):
-        input = paddle.fluid.data(
-            name='input', shape=[10, 10, 5], dtype='float32')
-        label = paddle.fluid.data(
-            name='label', shape=[10, 10, 5], dtype='float32')
+        input = paddle.fluid.data(name='input',
+                                  shape=[10, 10, 5],
+                                  dtype='float32')
+        label = paddle.fluid.data(name='label',
+                                  shape=[10, 10, 5],
+                                  dtype='float32')
         l1_loss = paddle.nn.loss.L1Loss()
         result0 = l1_loss(input, label)
         l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
@@ -145,10 +156,11 @@ def run_static(self, use_gpu=False):
         place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        static_result = exe.run(
-            feed={"input": self.input_np,
-                  "label": self.label_np},
-            fetch_list=[result0, result1, result2])
+        static_result = exe.run(feed={
+            "input": self.input_np,
+            "label": self.label_np
+        },
+                                fetch_list=[result0, result1, result2])
 
         expected = np.mean(np.abs(self.input_np - self.label_np))
         self.assertTrue(np.allclose(static_result[0], expected))
@@ -179,6 +191,7 @@ def test_gpu(self):
 
     # test case the raise message
     def test_errors(self):
+
         def test_value_error():
             loss = paddle.nn.loss.L1Loss(reduction="reduce_mean")
 
diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py
index 54f5e64fda4b6..2fef18fd6b5a5 100644
--- a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py
+++ b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py
@@ -22,6 +22,7 @@
 
 
 class LabelSmoothTestCase(unittest.TestCase):
+
     def __init__(self,
                  methodName='runTest',
                  label_shape=(20, 1),
@@ -44,13 +45,13 @@ def fluid_layer(self, place):
         start = fluid.Program()
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
-                label_var = fluid.data(
-                    "input", self.label_shape, dtype=self.dtype)
-                y_var = fluid.layers.label_smooth(
-                    label_var,
-                    prior_dist=self.prior_dist,
-                    epsilon=self.epsilon,
-                    dtype=self.dtype)
+                label_var = fluid.data("input",
+                                       self.label_shape,
+                                       dtype=self.dtype)
+                y_var = fluid.layers.label_smooth(label_var,
+                                                  prior_dist=self.prior_dist,
+                                                  epsilon=self.epsilon,
+                                                  dtype=self.dtype)
         feed_dict = {"input": self.label}
         exe = fluid.Executor(place)
         exe.run(start)
@@ -63,10 +64,12 @@ def functional(self, place):
         start = fluid.Program()
         with fluid.unique_name.guard():
             with fluid.program_guard(main, start):
-                label_var = fluid.data(
-                    "input", self.label_shape, dtype=self.dtype)
-                y_var = F.label_smooth(
-                    label_var, prior_dist=self.prior_dist, epsilon=self.epsilon)
+                label_var = fluid.data("input",
+                                       self.label_shape,
+                                       dtype=self.dtype)
+                y_var = F.label_smooth(label_var,
+                                       prior_dist=self.prior_dist,
+                                       epsilon=self.epsilon)
         feed_dict = {"input": self.label}
         exe = fluid.Executor(place)
         exe.run(start)
@@ -76,8 +79,9 @@ def functional(self, place):
     def paddle_dygraph_layer(self):
         paddle.disable_static()
         label_var = dg.to_variable(self.label)
-        y_var = F.label_smooth(
-            label_var, prior_dist=self.prior_dist, epsilon=self.epsilon)
+        y_var = F.label_smooth(label_var,
+                               prior_dist=self.prior_dist,
+                               epsilon=self.epsilon)
         y_np = y_var.numpy()
         return y_np
 
@@ -98,6 +102,7 @@ def runTest(self):
 
 
 class LabelSmoothErrorTestCase(LabelSmoothTestCase):
+
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -108,8 +113,7 @@ def runTest(self):
 def add_cases(suite):
     suite.addTest(LabelSmoothTestCase(methodName='runTest'))
     suite.addTest(
-        LabelSmoothTestCase(
-            methodName='runTest', label_shape=[2, 3, 1]))
+        LabelSmoothTestCase(methodName='runTest', label_shape=[2, 3, 1]))
 
 
 def add_error_cases(suite):
diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
index b1d49f8604ec7..926f86abeea54 100644
--- a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
+++ b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
@@ -21,6 +21,7 @@
 
 
 class TestLabelSmoothOp(OpTest):
+
     def config(self):
         self.op_type = "label_smooth"
         self.python_api = paddle.nn.functional.label_smooth
@@ -32,8 +33,8 @@ def config(self):
 
     def setUp(self):
         self.config()
-        smoothed_label = (1 - self.epsilon
-                          ) * self.label + self.epsilon / self.label_dim
+        smoothed_label = (
+            1 - self.epsilon) * self.label + self.epsilon / self.label_dim
         self.inputs = {'X': self.label}
         self.attrs = {'epsilon': self.epsilon}
         self.outputs = {'Out': smoothed_label}
@@ -46,6 +47,7 @@ def test_check_grad(self):
 
 
 class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
+
     def setUp(self):
         self.config()
         dist = np.random.random((1, self.label_dim))
@@ -56,21 +58,23 @@ def setUp(self):
 
 
 class TestLabelSmoothOp3D(TestLabelSmoothOp):
+
     def setUp(self):
         super(TestLabelSmoothOp3D, self).setUp()
         self.inputs['X'] = self.inputs['X'].reshape(
             [2, -1, self.inputs['X'].shape[-1]])
-        self.outputs['Out'] = self.outputs['Out'].reshape(self.inputs['X']
-                                                          .shape)
+        self.outputs['Out'] = self.outputs['Out'].reshape(
+            self.inputs['X'].shape)
 
 
 class TestLabelSmoothOpWithPriorDist3D(TestLabelSmoothOpWithPriorDist):
+
     def setUp(self):
         super(TestLabelSmoothOpWithPriorDist3D, self).setUp()
         self.inputs['X'] = self.inputs['X'].reshape(
             [2, -1, self.inputs['X'].shape[-1]])
-        self.outputs['Out'] = self.outputs['Out'].reshape(self.inputs['X']
-                                                          .shape)
+        self.outputs['Out'] = self.outputs['Out'].reshape(
+            self.inputs['X'].shape)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lamb_op.py b/python/paddle/fluid/tests/unittests/test_lamb_op.py
index 26a8064dd9014..e244e54e312db 100644
--- a/python/paddle/fluid/tests/unittests/test_lamb_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lamb_op.py
@@ -26,6 +26,7 @@
 
 
 class TestLambOp1(OpTest):
+
     def set_attrs(self):
         self.attrs = {
             'epsilon': 1e-4,
@@ -75,6 +76,7 @@ def test_check_output(self):
 
 
 class TestLambOp2(TestLambOp1):
+
     def set_attrs(self):
         self.attrs = {
             'epsilon': 1e-8,
@@ -85,6 +87,7 @@ def set_attrs(self):
 
 
 class TestLambOpMultipleSteps(TestLambOp1):
+
     def set_attrs(self):
         self.attrs = {
             'epsilon': 1e-8,
@@ -152,12 +155,14 @@ def lamb_step(inputs, attributes):
     moment2_unbiased = moment2_out / (1 - beta2_pow)
 
     r_1 = np.linalg.norm(param)
-    r_2 = np.linalg.norm(moment1_unbiased / (np.sqrt(moment2_unbiased) + epsilon
-                                             ) + weight_decay * param)
+    r_2 = np.linalg.norm(moment1_unbiased /
+                         (np.sqrt(moment2_unbiased) + epsilon) +
+                         weight_decay * param)
     lr_t = lr * r_1 / r_2
 
-    param_out = param - lr_t * (moment1_unbiased / (
-        np.sqrt(moment2_unbiased) + epsilon) + weight_decay * param)
+    param_out = param - lr_t * (moment1_unbiased /
+                                (np.sqrt(moment2_unbiased) + epsilon) +
+                                weight_decay * param)
 
     beta1_pow_out = beta1_pow * beta1
     beta2_pow_out = beta2_pow * beta2
@@ -193,13 +198,13 @@ def lamb_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
     moment2_unbiased = np.zeros(shape=[height, row_numel])
 
     def update_mom(row_id, update_value):
-        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * update_value
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 -
+                                                         beta1) * update_value
         moment2_out[row_id] = beta2 * moment2[row_id] + (
             1 - beta2) * np.square(update_value)
 
-        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * update_value
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 -
+                                                         beta1) * update_value
         moment2_out[row_id] = beta2 * moment2[row_id] + (
             1 - beta2) * np.square(update_value)
 
@@ -209,8 +214,9 @@ def update_param():
                              weight_decay * param)
         lr_t = lr * r_1 / r_2
 
-        param_out = param - lr_t * (moment1_out / (
-            np.sqrt(moment2_out) + epsilon) + weight_decay * param)
+        param_out = param - lr_t * (moment1_out /
+                                    (np.sqrt(moment2_out) + epsilon) +
+                                    weight_decay * param)
 
     for row_id in range(param_out.shape[0]):
         update_value = np.zeros(np_grad[0].shape).astype("float32")
@@ -226,6 +232,7 @@ def update_param():
 
 
 class TestSparseLambOp(unittest.TestCase):
+
     def setup(self, scope, place):
         beta1 = 0.78
         beta2 = 0.836
diff --git a/python/paddle/fluid/tests/unittests/test_lambv2_op.py b/python/paddle/fluid/tests/unittests/test_lambv2_op.py
index 674cd9a3e9c5b..cde23216c1093 100644
--- a/python/paddle/fluid/tests/unittests/test_lambv2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lambv2_op.py
@@ -26,6 +26,7 @@
 
 
 class LAMBOptimizer(paddle.optimizer.Lamb):
+
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, fluid.framework.Block)
         block.program._use_lamb = True
@@ -38,18 +39,24 @@ def _append_optimize_op(self, block, param_and_grad):
         beta_2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                                param_and_grad[0])
 
-        beta_1 = layers.fill_constant(
-            dtype='float32', shape=[1], value=self._beta1, name='lamb_beta_1')
-        beta_2 = layers.fill_constant(
-            dtype='float32', shape=[1], value=self._beta2, name='lamb_beta_2')
-        epsilon = layers.fill_constant(
-            dtype='float32', shape=[1], value=self._epsilon, name='epsilon')
+        beta_1 = layers.fill_constant(dtype='float32',
+                                      shape=[1],
+                                      value=self._beta1,
+                                      name='lamb_beta_1')
+        beta_2 = layers.fill_constant(dtype='float32',
+                                      shape=[1],
+                                      value=self._beta2,
+                                      name='lamb_beta_2')
+        epsilon = layers.fill_constant(dtype='float32',
+                                       shape=[1],
+                                       value=self._epsilon,
+                                       name='epsilon')
 
         one = paddle.ones(shape=[1]).astype('float32')
         zero = paddle.zeros(shape=[1]).astype('float32')
 
-        next_m = paddle.multiply(m, beta_1) + paddle.multiply(param_and_grad[1],
-                                                              one - beta_1)
+        next_m = paddle.multiply(m, beta_1) + paddle.multiply(
+            param_and_grad[1], one - beta_1)
         next_v = paddle.multiply(v, beta_2) + paddle.multiply(
             paddle.pow(param_and_grad[1], 2), one - beta_2)
 
@@ -73,8 +80,8 @@ def _append_optimize_op(self, block, param_and_grad):
 
         ratio = paddle.where(
             paddle.greater_than(w_norm, zero),
-            paddle.where(
-                paddle.greater_than(g_norm, zero), (w_norm / g_norm), one), one)
+            paddle.where(paddle.greater_than(g_norm, zero), (w_norm / g_norm),
+                         one), one)
         update_with_lr = ratio * learning_rate * update
         next_param = param_and_grad[0] - update_with_lr
 
@@ -89,14 +96,16 @@ def _append_optimize_op(self, block, param_and_grad):
 
 
 class TestLambOpV2(unittest.TestCase):
+
     def test_lamb_op(self):
         shape = [2, 4, 8, 8]
         data = paddle.to_tensor(np.random.random(size=shape).astype("float32"))
         conv = paddle.nn.Conv2D(4, 6, (3, 3))
         data = conv(data)
         loss = paddle.mean(data)
-        opt = paddle.optimizer.Lamb(
-            learning_rate=1e-5, epsilon=1e-8, parameters=conv.parameters())
+        opt = paddle.optimizer.Lamb(learning_rate=1e-5,
+                                    epsilon=1e-8,
+                                    parameters=conv.parameters())
         loss.backward()
         opt.minimize(loss)
 
@@ -104,6 +113,7 @@ def test_lamb_op(self):
 
 
 class TestLambOpWithCombinedOp(unittest.TestCase):
+
     def test_lamb_op_with_multi_steps(self):
         paddle.enable_static()
 
@@ -135,8 +145,10 @@ def _build_static_model(main, startup, seed=100):
             executor = fluid.Executor(place)
             executor.run(startup_program)
             output = executor.run(program=main_program,
-                                  feed={'X': feed_x,
-                                        'Y': feed_y},
+                                  feed={
+                                      'X': feed_x,
+                                      'Y': feed_y
+                                  },
                                   fetch_list=[avg_loss.name])
 
             main = fluid.Program()
@@ -149,14 +161,17 @@ def _build_static_model(main, startup, seed=100):
             exe = fluid.Executor(place)
             exe.run(startup)
             out = exe.run(program=main,
-                          feed={'X': feed_x,
-                                'Y': feed_y},
+                          feed={
+                              'X': feed_x,
+                              'Y': feed_y
+                          },
                           fetch_list=[loss.name])
 
             self.assertTrue(np.allclose(out, output))
 
 
 class TestLambOpV2Group(TestLambOpV2):
+
     def test_lamb_op(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -164,17 +179,16 @@ def test_lamb_op(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Lamb(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'lamb_weight_decay': 0.001,
-                'beta1': 0.9,
-                'beta2': 0.99
-            }],
-            lamb_weight_decay=0.01)
+        adam = paddle.optimizer.Lamb(learning_rate=0.01,
+                                     parameters=[{
+                                         'params': linear_1.parameters()
+                                     }, {
+                                         'params': linear_2.parameters(),
+                                         'lamb_weight_decay': 0.001,
+                                         'beta1': 0.9,
+                                         'beta2': 0.99
+                                     }],
+                                     lamb_weight_decay=0.01)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
@@ -183,14 +197,16 @@ def test_lamb_op(self):
 
 
 class TestLambOpMultiPrecision(unittest.TestCase):
+
     def check_main(self, x_np, place, multi_precision=False, seed=10, n=10):
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, startup_prog):
             paddle.seed(seed)
             with paddle.static.amp.fp16_guard():
-                x = paddle.static.data(
-                    name='x', shape=[None, 10], dtype='float32')
+                x = paddle.static.data(name='x',
+                                       shape=[None, 10],
+                                       dtype='float32')
                 linear = paddle.nn.Linear(10, 2)
                 hidden = linear(x)
                 loss = paddle.mean(hidden)
@@ -198,8 +214,9 @@ def check_main(self, x_np, place, multi_precision=False, seed=10, n=10):
             original_optimizer = paddle.optimizer.Lamb(learning_rate=1e-3)
             original_optimizer._multi_precision = multi_precision
             if multi_precision:
-                optimizer = paddle.static.amp.decorate(
-                    original_optimizer, use_pure_fp16=True, use_fp16_guard=True)
+                optimizer = paddle.static.amp.decorate(original_optimizer,
+                                                       use_pure_fp16=True,
+                                                       use_fp16_guard=True)
             else:
                 optimizer = original_optimizer
             optimizer.minimize(loss)
@@ -242,8 +259,8 @@ def get_parameter(var):
                                              fetch_list=[weight, bias])
                 weight_np = weight_np.astype('float32')
                 bias_np = bias_np.astype('float32')
-                self.assertTrue(
-                    np.array_equal(weight_np, get_parameter(weight)))
+                self.assertTrue(np.array_equal(weight_np,
+                                               get_parameter(weight)))
                 self.assertTrue(np.array_equal(bias_np, get_parameter(bias)))
             return weight_np, bias_np
 
diff --git a/python/paddle/fluid/tests/unittests/test_launch_coverage.py b/python/paddle/fluid/tests/unittests/test_launch_coverage.py
index 9fbf27e3c1d06..e4c35a6347185 100644
--- a/python/paddle/fluid/tests/unittests/test_launch_coverage.py
+++ b/python/paddle/fluid/tests/unittests/test_launch_coverage.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -42,69 +42,70 @@ def _parse_args():
 POD_IP (current node ip address, not needed for local training)	
 ''')
 
-    #Optional arguments for the launch helper	
+    #Optional arguments for the launch helper
     parser.add_argument(
         "--cluster_node_ips",
         type=str,
         default="127.0.0.1",
         help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
-    parser.add_argument(
-        "--node_ip",
-        type=str,
-        default="127.0.0.1",
-        help="The current node ip. ")
+    parser.add_argument("--node_ip",
+                        type=str,
+                        default="127.0.0.1",
+                        help="The current node ip. ")
     parser.add_argument(
         "--use_paddlecloud",
         action='store_true',
-        help="wheter to use paddlecloud platform to run your multi-process job. If false, no need to set this argument."
+        help=
+        "wheter to use paddlecloud platform to run your multi-process job. If false, no need to set this argument."
     )
-    parser.add_argument(
-        "--started_port",
-        type=int,
-        default=None,
-        help="The trainer's started port on a single node")
+    parser.add_argument("--started_port",
+                        type=int,
+                        default=None,
+                        help="The trainer's started port on a single node")
 
-    parser.add_argument(
-        "--print_config",
-        type=bool,
-        default=True,
-        help="Print the config or not")
+    parser.add_argument("--print_config",
+                        type=bool,
+                        default=True,
+                        help="Print the config or not")
 
     parser.add_argument(
         "--selected_gpus",
         type=str,
         default=None,
-        help="It's for gpu training and the training process will run on the selected_gpus,"
+        help=
+        "It's for gpu training and the training process will run on the selected_gpus,"
         "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
     )
 
     parser.add_argument(
         "--log_level",
         type=int,
-        default=20,  # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels	
+        default=
+        20,  # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels	
         help="Logging level, default is logging.INFO")
 
     parser.add_argument(
         "--log_dir",
         type=str,
-        help="The path for each process's log.If it's not set, the log will printed to default pipe."
+        help=
+        "The path for each process's log.If it's not set, the log will printed to default pipe."
     )
 
-    #positional	
-    parser.add_argument(
-        "training_script",
-        type=str,
-        help="The full path to the single GPU training "
-        "program/script to be launched in parallel, "
-        "followed by all the arguments for the "
-        "training script")
+    #positional
+    parser.add_argument("training_script",
+                        type=str,
+                        help="The full path to the single GPU training "
+                        "program/script to be launched in parallel, "
+                        "followed by all the arguments for the "
+                        "training script")
 
-    #rest from the training program	
+    #rest from the training program
     parser.add_argument('training_script_args', nargs=REMAINDER)
     return parser.parse_args()
 
 
 class TestCoverage(unittest.TestCase):
+
     def test_gpus(self):
         args = _parse_args()
 
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index b75dc2c964ca0..1cc2906731bd8 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -83,27 +83,33 @@ def _reference_layer_norm_grad(x,
     # dx
     if scale is not None:
         dx_end = scale * np.sqrt(1.0 / var) * grad_y
-        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
-            [N, 1])  # the second part equals to zero.
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale,
+                          axis=1).reshape([N, 1
+                                           ])  # the second part equals to zero.
         d_mean = 1.0 / D * d_mean_0
-        d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * scale,
-                       axis=1).reshape([N, 1]) * (
+        d_std = np.sum(-(1.0 / var) *
+                       (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
                            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) *
                            (x - mean))
     else:
         dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y
-        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape(
-            [N, 1])  # the second part equals to zero.
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0,
+                          axis=1).reshape([N, 1
+                                           ])  # the second part equals to zero.
         d_mean = 1.0 / D * d_mean_0
-        d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * 1.0,
-                       axis=1).reshape([N, 1]) * (
+        d_std = np.sum(-(1.0 / var) *
+                       (x - mean) * grad_y * 1.0, axis=1).reshape([N, 1]) * (
                            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) *
                            (x - mean))
 
     grad_x = dx_end + d_mean + d_std
 
     grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape
-    var.shape, mean.shape = [N, ], [N, ]
+    var.shape, mean.shape = [
+        N,
+    ], [
+        N,
+    ]
 
     if scale is not None:
         scale.shape = scale_shape
@@ -111,6 +117,7 @@ def _reference_layer_norm_grad(x,
 
 
 class TestLayerNormOp(unittest.TestCase):
+
     def setUp(self):
         self.use_cudnn = True
 
@@ -124,6 +131,7 @@ def check_forward_backward(self,
                                has_bias=True,
                                y_grad_scale=1.0,
                                use_mkldnn=False):
+
         def test_with_place(place,
                             shape,
                             begin_norm_axis,
@@ -140,8 +148,8 @@ def test_with_place(place,
                 np.float32) if has_scale else None
             bias = np.random.random_sample(scale_shape).astype(
                 np.float32) if has_bias else None
-            y_grad = (np.random.random_sample(x_shape) *
-                      y_grad_scale).astype(np.float32)
+            y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype(
+                np.float32)
 
             # reference forward & backward
             y, mean, variance = _reference_layer_norm_naive(
@@ -162,10 +170,9 @@ def test_with_place(place,
             with fluid.program_guard(program):
                 block = program.global_block()
                 for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape)
+                    block.create_var(name=name,
+                                     dtype='float32',
+                                     shape=ground_truth[name].shape)
                 inputs = {"X": block.var('x')}
                 fetch_list = [
                     'y',
@@ -242,83 +249,79 @@ def test_check_forward_backward_with_scale_and_bias(self):
         self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1)
 
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=True)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=True,
-            has_bias=False)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=False)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=False,
+                                    has_bias=True)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=True,
+                                    has_bias=False)
+        self.check_forward_backward(shape=[2, 3, 4, 5],
+                                    begin_norm_axis=1,
+                                    has_scale=False,
+                                    has_bias=False)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
-        self.check_forward_backward(
-            shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 129],
+                                    begin_norm_axis=2,
+                                    y_grad_scale=0.1)
         self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2)
-        self.check_forward_backward(
-            shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=False,
-            has_bias=True,
-            y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=True,
-            has_bias=False,
-            y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=False,
-            has_bias=False,
-            y_grad_scale=0.1)
-        self.check_forward_backward(
-            shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    has_scale=False,
+                                    has_bias=True,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    has_scale=True,
+                                    has_bias=False,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[92, 513, 1134],
+                                    begin_norm_axis=2,
+                                    has_scale=False,
+                                    has_bias=False,
+                                    y_grad_scale=0.1)
+        self.check_forward_backward(shape=[512, 1024],
+                                    begin_norm_axis=1,
+                                    has_scale=True,
+                                    has_bias=True)
 
 
 class TestLayerNormAPI(unittest.TestCase):
+
     def test_case(self):
-        x = fluid.layers.data(
-            name='x',
-            shape=[64, 32, 256],
-            dtype='float32',
-            append_batch_size=False)
-        x = fluid.layers.layer_norm(
-            x,
-            scale=True,
-            shift=True,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr=None,
-            bias_attr=None)
-        x = fluid.layers.layer_norm(
-            x,
-            scale=False,
-            shift=False,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr=None,
-            bias_attr=None)
-        x = fluid.layers.layer_norm(
-            x,
-            scale=False,
-            shift=False,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr="scale",
-            bias_attr="shift")
+        x = fluid.layers.data(name='x',
+                              shape=[64, 32, 256],
+                              dtype='float32',
+                              append_batch_size=False)
+        x = fluid.layers.layer_norm(x,
+                                    scale=True,
+                                    shift=True,
+                                    begin_norm_axis=1,
+                                    epsilon=1e-05,
+                                    param_attr=None,
+                                    bias_attr=None)
+        x = fluid.layers.layer_norm(x,
+                                    scale=False,
+                                    shift=False,
+                                    begin_norm_axis=1,
+                                    epsilon=1e-05,
+                                    param_attr=None,
+                                    bias_attr=None)
+        x = fluid.layers.layer_norm(x,
+                                    scale=False,
+                                    shift=False,
+                                    begin_norm_axis=1,
+                                    epsilon=1e-05,
+                                    param_attr="scale",
+                                    bias_attr="shift")
 
 
 class TestDygraphLayerNormAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             paddle.enable_static()
@@ -335,6 +338,7 @@ def test_errors(self):
 
 
 class TestFP16ScaleBiasLayerNorm(unittest.TestCase):
+
     def check_main(self, x_np, weight_np, bias_np, dtype):
         paddle.disable_static()
 
@@ -379,6 +383,7 @@ def assert_equal(x, y):
 
 
 class TestBF16ScaleBiasLayerNorm(unittest.TestCase):
+
     def check_main(self, x_np, weight_np, bias_np, dtype):
         paddle.disable_static()
 
@@ -426,6 +431,7 @@ def assert_equal(x, y):
 
 
 class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase):
+
     def test_main(self):
         self.assertTrue(_keep_layer_norm_scale_bias_to_fp32())
         _keep_layer_norm_scale_bias_to_fp32(False)
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py
index 85c6694324d25..0242df213f264 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py
@@ -26,6 +26,7 @@
 
 
 class TestDygraphLayerNormv2(unittest.TestCase):
+
     def test_dygraph(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
@@ -116,6 +117,7 @@ def compute_v2(x_np):
 
 
 class TestLayerNormFunction(unittest.TestCase):
+
     def test_dygraph(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
@@ -163,11 +165,10 @@ def compute_v4(x):
             y4 = compute_v4(x)
             self.assertTrue(np.allclose(y3, y4))
 
-            self.assertRaises(
-                ValueError,
-                paddle.nn.functional.layer_norm,
-                x=x,
-                normalized_shape=1.0)
+            self.assertRaises(ValueError,
+                              paddle.nn.functional.layer_norm,
+                              x=x,
+                              normalized_shape=1.0)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 676f35838ad33..aead014e7abb1 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -38,6 +38,7 @@
 
 
 class LayerTest(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         cls.seed = 111
@@ -84,12 +85,16 @@ def dynamic_graph(self, force_to_use_cpu=False):
 
 
 class TestLayer(LayerTest):
+
     def test_custom_layer_with_kwargs(self):
+
         class CustomLayer(fluid.Layer):
+
             def __init__(self, input_size, linear1_size=4):
                 super(CustomLayer, self).__init__()
-                self.linear1 = nn.Linear(
-                    input_size, linear1_size, bias_attr=False)
+                self.linear1 = nn.Linear(input_size,
+                                         linear1_size,
+                                         bias_attr=False)
                 self.linear2 = nn.Linear(linear1_size, 1, bias_attr=False)
 
             def forward(self, x, do_linear2=False):
@@ -118,15 +123,16 @@ def forward(self, x, do_linear2=False):
     def test_dropout(self):
         inp = np.ones([3, 32, 32], dtype='float32')
         with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
+            t = layers.data(name='data',
+                            shape=[3, 32, 32],
+                            dtype='float32',
+                            append_batch_size=False)
             dropout = nn.Dropout(p=0.35, seed=1, is_test=False)
             ret = dropout(t)
-            ret2 = fluid.layers.dropout(
-                t, dropout_prob=0.35, seed=1, is_test=False)
+            ret2 = fluid.layers.dropout(t,
+                                        dropout_prob=0.35,
+                                        seed=1,
+                                        is_test=False)
             static_ret, static_ret2 = self.get_static_graph_result(
                 feed={'data': inp}, fetch_list=[ret, ret2])
         with self.dynamic_graph():
@@ -134,16 +140,20 @@ def test_dropout(self):
                 t = base.to_variable(inp)
                 dropout = nn.Dropout(p=0.35, seed=1, is_test=False)
                 dy_eager_ret = dropout(t)
-                dy_eager_ret2 = fluid.layers.dropout(
-                    t, dropout_prob=0.35, seed=1, is_test=False)
+                dy_eager_ret2 = fluid.layers.dropout(t,
+                                                     dropout_prob=0.35,
+                                                     seed=1,
+                                                     is_test=False)
                 dy_eager_ret_value = dy_eager_ret.numpy()
                 dy_eager_ret2_value = dy_eager_ret2.numpy()
 
             t = base.to_variable(inp)
             dropout = nn.Dropout(p=0.35, seed=1, is_test=False)
             dy_ret = dropout(t)
-            dy_ret2 = fluid.layers.dropout(
-                t, dropout_prob=0.35, seed=1, is_test=False)
+            dy_ret2 = fluid.layers.dropout(t,
+                                           dropout_prob=0.35,
+                                           seed=1,
+                                           is_test=False)
             dy_ret_value = dy_ret.numpy()
             dy_ret2_value = dy_ret2.numpy()
 
@@ -157,16 +167,15 @@ def test_dropout(self):
     def test_linear(self):
         inp = np.ones([3, 32, 32], dtype='float32')
         with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
+            t = layers.data(name='data',
+                            shape=[3, 32, 32],
+                            dtype='float32',
+                            append_batch_size=False)
             linear = nn.Linear(
                 32, 4, bias_attr=fluid.initializer.ConstantInitializer(value=1))
             ret = linear(t)
-            static_ret = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret])[0]
+            static_ret = self.get_static_graph_result(feed={'data': inp},
+                                                      fetch_list=[ret])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 t = base.to_variable(inp)
@@ -214,15 +223,14 @@ def test_type():
     def test_Flatten(self):
         inp = np.ones([3, 4, 4, 5], dtype='float32')
         with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 4, 4, 5],
-                dtype='float32',
-                append_batch_size=False)
+            t = layers.data(name='data',
+                            shape=[3, 4, 4, 5],
+                            dtype='float32',
+                            append_batch_size=False)
             flatten = nn.Flatten()
             ret = flatten(t)
-            static_ret = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret])[0]
+            static_ret = self.get_static_graph_result(feed={'data': inp},
+                                                      fetch_list=[ret])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 t = base.to_variable(inp)
@@ -266,30 +274,28 @@ def test_type():
     def test_layer_norm(self):
         inp = np.ones([3, 32, 32], dtype='float32')
         with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
+            t = layers.data(name='data',
+                            shape=[3, 32, 32],
+                            dtype='float32',
+                            append_batch_size=False)
             ret = layers.layer_norm(
                 t,
                 bias_attr=fluid.initializer.ConstantInitializer(value=1),
                 act='sigmoid')
-            static_ret = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret])[0]
+            static_ret = self.get_static_graph_result(feed={'data': inp},
+                                                      fetch_list=[ret])[0]
         with self.static_graph():
-            t = layers.data(
-                name='data',
-                shape=[3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
+            t = layers.data(name='data',
+                            shape=[3, 32, 32],
+                            dtype='float32',
+                            append_batch_size=False)
             lm = nn.LayerNorm(
                 normalized_shape=[32, 32],
                 bias_attr=fluid.initializer.ConstantInitializer(value=1),
                 act='sigmoid')
             ret = lm(t)
-            static_ret2 = self.get_static_graph_result(
-                feed={'data': inp}, fetch_list=[ret])[0]
+            static_ret2 = self.get_static_graph_result(feed={'data': inp},
+                                                       fetch_list=[ret])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 lm = nn.LayerNorm(
@@ -359,8 +365,7 @@ def test_SyncBatchNorm(self):
                 my_sync_bn = paddle.nn.SyncBatchNorm(3)
                 ret = my_sync_bn(t)
                 static_ret = self.get_static_graph_result(
-                    feed={'t': np.ones(
-                        [3, 3, 5, 5], dtype='float32')},
+                    feed={'t': np.ones([3, 3, 5, 5], dtype='float32')},
                     fetch_list=[ret])[0]
 
             with self.dynamic_graph():
@@ -382,8 +387,8 @@ def test_relu(self):
             t = layers.data(name='t', shape=[3, 3], dtype='float32')
             ret = layers.relu(t)
             static_ret = self.get_static_graph_result(
-                feed={'t': np.ones(
-                    [3, 3], dtype='float32')}, fetch_list=[ret])[0]
+                feed={'t': np.ones([3, 3],
+                                   dtype='float32')}, fetch_list=[ret])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
@@ -403,21 +408,20 @@ def test_matmul(self):
             t = layers.data(name='t', shape=[3, 3], dtype='float32')
             t2 = layers.data(name='t2', shape=[3, 3], dtype='float32')
             ret = layers.matmul(t, t2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    't': np.ones(
-                        [3, 3], dtype='float32'),
-                    't2': np.ones(
-                        [3, 3], dtype='float32')
-                },
-                fetch_list=[ret])[0]
+            static_ret = self.get_static_graph_result(feed={
+                't':
+                np.ones([3, 3], dtype='float32'),
+                't2':
+                np.ones([3, 3], dtype='float32')
+            },
+                                                      fetch_list=[ret])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
                 t = np.ones([3, 3], dtype='float32')
                 t2 = np.ones([3, 3], dtype='float32')
-                dy_eager_ret = layers.matmul(
-                    base.to_variable(t), base.to_variable(t2))
+                dy_eager_ret = layers.matmul(base.to_variable(t),
+                                             base.to_variable(t2))
                 dy_eager_ret_value = dy_eager_ret.numpy()
 
             t = np.ones([3, 3], dtype='float32')
@@ -433,51 +437,50 @@ def test_conv2d(self):
             images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
             ret = layers.conv2d(input=images, num_filters=3, filter_size=[2, 2])
             static_ret = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 5, 5], dtype='float32')},
+                feed={'pixel': np.ones([2, 3, 5, 5], dtype='float32')},
                 fetch_list=[ret])[0]
 
         with self.static_graph():
             images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                num_channels=3, num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D(num_channels=3,
+                               num_filters=3,
+                               filter_size=[2, 2])
             ret = conv2d(images)
             static_ret2 = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 5, 5], dtype='float32')},
+                feed={'pixel': np.ones([2, 3, 5, 5], dtype='float32')},
                 fetch_list=[ret])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
                 images = np.ones([2, 3, 5, 5], dtype='float32')
-                conv2d = nn.Conv2D(
-                    num_channels=3, num_filters=3, filter_size=[2, 2])
+                conv2d = nn.Conv2D(num_channels=3,
+                                   num_filters=3,
+                                   filter_size=[2, 2])
                 dy_eager_ret = conv2d(base.to_variable(images))
                 dy_eager_ret_value = dy_eager_ret.numpy()
 
             images = np.ones([2, 3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                num_channels=3, num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D(num_channels=3,
+                               num_filters=3,
+                               filter_size=[2, 2])
             dy_ret = conv2d(base.to_variable(images))
             dy_ret_value = dy_ret.numpy()
 
         with self.dynamic_graph():
             with _test_eager_guard():
                 images = np.ones([2, 3, 5, 5], dtype='float32')
-                conv2d = nn.Conv2D(
-                    num_channels=3,
-                    num_filters=3,
-                    filter_size=[2, 2],
-                    bias_attr=False)
+                conv2d = nn.Conv2D(num_channels=3,
+                                   num_filters=3,
+                                   filter_size=[2, 2],
+                                   bias_attr=False)
                 dy_ret = conv2d(base.to_variable(images))
                 self.assertTrue(conv2d.bias is None)
 
             images = np.ones([2, 3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                num_channels=3,
-                num_filters=3,
-                filter_size=[2, 2],
-                bias_attr=False)
+            conv2d = nn.Conv2D(num_channels=3,
+                               num_filters=3,
+                               filter_size=[2, 2],
+                               bias_attr=False)
             dy_ret = conv2d(base.to_variable(images))
             self.assertTrue(conv2d.bias is None)
 
@@ -485,8 +488,9 @@ def test_conv2d(self):
             # the input of Conv2D must be Variable.
             def test_Variable():
                 images = np.ones([2, 3, 5, 5], dtype='float32')
-                conv2d = nn.Conv2D(
-                    num_channels=3, num_filters=3, filter_size=[2, 2])
+                conv2d = nn.Conv2D(num_channels=3,
+                                   num_filters=3,
+                                   filter_size=[2, 2])
                 conv2d_ret1 = conv2d(images)
 
             self.assertRaises(TypeError, test_Variable)
@@ -494,10 +498,12 @@ def test_Variable():
             # the input dtype of Conv2D must be float16 or float32 or float64
             # float16 only can be set on GPU place
             def test_type():
-                images = layers.data(
-                    name='pixel', shape=[3, 5, 5], dtype='int32')
-                conv2d = nn.Conv2D(
-                    num_channels=3, num_filters=3, filter_size=[2, 2])
+                images = layers.data(name='pixel',
+                                     shape=[3, 5, 5],
+                                     dtype='int32')
+                conv2d = nn.Conv2D(num_channels=3,
+                                   num_filters=3,
+                                   filter_size=[2, 2])
                 conv2d_ret2 = conv2d(images)
 
             self.assertRaises(TypeError, test_type)
@@ -513,13 +519,13 @@ def test_type():
                 weight_attr = fluid.ParamAttr(
                     initializer=fluid.initializer.NumpyArrayInitializer(
                         custom_weight))
-                conv2d1 = nn.Conv2D(
-                    num_channels=3, num_filters=3, filter_size=[2, 2])
-                conv2d2 = nn.Conv2D(
-                    num_channels=3,
-                    num_filters=3,
-                    filter_size=[2, 2],
-                    param_attr=weight_attr)
+                conv2d1 = nn.Conv2D(num_channels=3,
+                                    num_filters=3,
+                                    filter_size=[2, 2])
+                conv2d2 = nn.Conv2D(num_channels=3,
+                                    num_filters=3,
+                                    filter_size=[2, 2],
+                                    param_attr=weight_attr)
                 dy_ret1 = conv2d1(base.to_variable(images))
                 dy_ret2 = conv2d2(base.to_variable(images))
                 self.assertFalse(
@@ -535,8 +541,8 @@ def test_type():
                 conv2d2.bias.set_value(conv2d1_bias)
                 dy_ret1 = conv2d1(base.to_variable(images))
                 dy_ret2 = conv2d2(base.to_variable(images))
-                self.assertTrue(
-                    np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
+                self.assertTrue(np.array_equal(dy_ret1.numpy(),
+                                               dy_ret2.numpy()))
 
                 conv2d2.weight = conv2d1.weight
                 conv2d2.bias = conv2d1.bias
@@ -548,16 +554,15 @@ def test_type():
 
             images = np.ones([2, 3, 5, 5], dtype='float32')
             custom_weight = np.random.randn(3, 3, 2, 2).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
-            conv2d1 = nn.Conv2D(
-                num_channels=3, num_filters=3, filter_size=[2, 2])
-            conv2d2 = nn.Conv2D(
-                num_channels=3,
-                num_filters=3,
-                filter_size=[2, 2],
-                param_attr=weight_attr)
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
+            conv2d1 = nn.Conv2D(num_channels=3,
+                                num_filters=3,
+                                filter_size=[2, 2])
+            conv2d2 = nn.Conv2D(num_channels=3,
+                                num_filters=3,
+                                filter_size=[2, 2],
+                                param_attr=weight_attr)
             dy_ret1 = conv2d1(base.to_variable(images))
             dy_ret2 = conv2d2(base.to_variable(images))
             self.assertFalse(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
@@ -596,8 +601,10 @@ def test_gru_unit(self):
             updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
                 input=x, hidden=hidden, size=D * 3)
             static_ret = self.get_static_graph_result(
-                feed={'x': input,
-                      'hidden': hidden_input},
+                feed={
+                    'x': input,
+                    'hidden': hidden_input
+                },
                 fetch_list=[updated_hidden, reset_hidden_pre, gate])
 
         with self.static_graph():
@@ -609,22 +616,24 @@ def test_gru_unit(self):
             updated_hidden, reset_hidden_pre, gate = gru(x, hidden)
 
             static_ret2 = self.get_static_graph_result(
-                feed={'x': input,
-                      'hidden': hidden_input},
+                feed={
+                    'x': input,
+                    'hidden': hidden_input
+                },
                 fetch_list=[updated_hidden, reset_hidden_pre, gate])
 
         with self.dynamic_graph():
             with _test_eager_guard():
                 gru = nn.GRUUnit(size=D * 3)
-                dy_eager_ret = gru(
-                    base.to_variable(input), base.to_variable(hidden_input))
+                dy_eager_ret = gru(base.to_variable(input),
+                                   base.to_variable(hidden_input))
                 dy_eager_ret_value = []
                 for i in range(len(static_ret)):
                     dy_eager_ret_value.append(dy_eager_ret[i].numpy())
 
             gru = nn.GRUUnit(size=D * 3)
-            dy_ret = gru(
-                base.to_variable(input), base.to_variable(hidden_input))
+            dy_ret = gru(base.to_variable(input),
+                         base.to_variable(hidden_input))
             dy_ret_value = []
             for i in range(len(static_ret)):
                 dy_ret_value.append(dy_ret[i].numpy())
@@ -642,20 +651,20 @@ def test_gru_unit(self):
                         custom_weight))
                 gru1 = nn.GRUUnit(size=D * 3)
                 gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr)
-                dy_ret1 = gru1(
-                    base.to_variable(input), base.to_variable(hidden_input))
-                dy_ret2 = gru2(
-                    base.to_variable(input), base.to_variable(hidden_input))
+                dy_ret1 = gru1(base.to_variable(input),
+                               base.to_variable(hidden_input))
+                dy_ret2 = gru2(base.to_variable(input),
+                               base.to_variable(hidden_input))
                 self.assertFalse(
                     np.array_equal(gru1.weight.numpy(), gru2.weight.numpy()))
                 for o1, o2 in zip(dy_ret1, dy_ret2):
                     self.assertFalse(np.array_equal(o1.numpy(), o2.numpy()))
                 gru2.weight.set_value(gru1.weight.numpy())
                 gru2.bias.set_value(gru1.bias)
-                dy_ret1 = gru1(
-                    base.to_variable(input), base.to_variable(hidden_input))
-                dy_ret2 = gru2(
-                    base.to_variable(input), base.to_variable(hidden_input))
+                dy_ret1 = gru1(base.to_variable(input),
+                               base.to_variable(hidden_input))
+                dy_ret2 = gru2(base.to_variable(input),
+                               base.to_variable(hidden_input))
                 for o1, o2 in zip(dy_ret1, dy_ret2):
                     self.assertTrue(np.array_equal(o1.numpy(), o2.numpy()))
 
@@ -667,25 +676,24 @@ def test_gru_unit(self):
                     np.array_equal(gru1.bias.numpy(), gru2.bias.numpy()))
 
             custom_weight = np.random.randn(D, D * 3).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
             gru1 = nn.GRUUnit(size=D * 3)
             gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr)
-            dy_ret1 = gru1(
-                base.to_variable(input), base.to_variable(hidden_input))
-            dy_ret2 = gru2(
-                base.to_variable(input), base.to_variable(hidden_input))
+            dy_ret1 = gru1(base.to_variable(input),
+                           base.to_variable(hidden_input))
+            dy_ret2 = gru2(base.to_variable(input),
+                           base.to_variable(hidden_input))
             self.assertFalse(
                 np.array_equal(gru1.weight.numpy(), gru2.weight.numpy()))
             for o1, o2 in zip(dy_ret1, dy_ret2):
                 self.assertFalse(np.array_equal(o1.numpy(), o2.numpy()))
             gru2.weight.set_value(gru1.weight.numpy())
             gru2.bias.set_value(gru1.bias)
-            dy_ret1 = gru1(
-                base.to_variable(input), base.to_variable(hidden_input))
-            dy_ret2 = gru2(
-                base.to_variable(input), base.to_variable(hidden_input))
+            dy_ret1 = gru1(base.to_variable(input),
+                           base.to_variable(hidden_input))
+            dy_ret2 = gru2(base.to_variable(input),
+                           base.to_variable(hidden_input))
             for o1, o2 in zip(dy_ret1, dy_ret2):
                 self.assertTrue(np.array_equal(o1.numpy(), o2.numpy()))
 
@@ -693,8 +701,8 @@ def test_gru_unit(self):
             gru2.bias = gru1.bias
             self.assertTrue(
                 np.array_equal(gru1.weight.numpy(), gru2.weight.numpy()))
-            self.assertTrue(
-                np.array_equal(gru1.bias.numpy(), gru2.bias.numpy()))
+            self.assertTrue(np.array_equal(gru1.bias.numpy(),
+                                           gru2.bias.numpy()))
 
     def test_elementwise_math(self):
         n = np.ones([3, 3], dtype='float32')
@@ -718,16 +726,15 @@ def test_elementwise_math(self):
             ret = layers.elementwise_sub(ret, t5)
             ret = layers.elementwise_mul(ret, t6)
 
-            static_ret = self.get_static_graph_result(
-                feed={
-                    't': n,
-                    't2': n2,
-                    't3': n3,
-                    't4': n4,
-                    't5': n5,
-                    't6': n6
-                },
-                fetch_list=[ret])[0]
+            static_ret = self.get_static_graph_result(feed={
+                't': n,
+                't2': n2,
+                't3': n3,
+                't4': n4,
+                't5': n5,
+                't6': n6
+            },
+                                                      fetch_list=[ret])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
@@ -754,10 +761,10 @@ def test_elementwise_minmax(self):
 
         with self.dynamic_graph():
             with _test_eager_guard():
-                min_eager_ret = layers.elementwise_min(
-                    to_variable(n), to_variable(n2))
-                max_eager_ret = layers.elementwise_max(
-                    to_variable(n), to_variable(n2))
+                min_eager_ret = layers.elementwise_min(to_variable(n),
+                                                       to_variable(n2))
+                max_eager_ret = layers.elementwise_max(to_variable(n),
+                                                       to_variable(n2))
                 min_eager_ret_value = min_eager_ret.numpy()
                 max_eager_ret_value = max_eager_ret.numpy()
 
@@ -778,41 +785,37 @@ def test_sequence_conv(self):
         else:
             place = core.CPUPlace()
         with self.static_graph():
-            seq = layers.data(
-                name='seq_in',
-                shape=[3, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            seq = layers.data(name='seq_in',
+                              shape=[3, 4],
+                              dtype='float32',
+                              lod_level=1,
+                              append_batch_size=False)
             out = layers.sequence_conv(seq, 2, act='sigmoid')
-            static_rlt = self.get_static_graph_result(
-                feed={
-                    "seq_in": fluid.create_lod_tensor(
-                        data=inp_np,
-                        recursive_seq_lens=[[1, 1, 1]],
-                        place=place)
-                },
-                fetch_list=[out],
-                with_lod=True)[0]
+            static_rlt = self.get_static_graph_result(feed={
+                "seq_in":
+                fluid.create_lod_tensor(data=inp_np,
+                                        recursive_seq_lens=[[1, 1, 1]],
+                                        place=place)
+            },
+                                                      fetch_list=[out],
+                                                      with_lod=True)[0]
 
         with self.static_graph():
-            seq = layers.data(
-                name='seq_in',
-                shape=[3, 4],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            seq = layers.data(name='seq_in',
+                              shape=[3, 4],
+                              dtype='float32',
+                              lod_level=1,
+                              append_batch_size=False)
             seq_conv = nn.SequenceConv('seq_conv', num_filters=2, act='sigmoid')
             out = seq_conv(seq)
-            static_rlt2 = self.get_static_graph_result(
-                feed={
-                    "seq_in": fluid.create_lod_tensor(
-                        data=inp_np,
-                        recursive_seq_lens=[[1, 1, 1]],
-                        place=place)
-                },
-                fetch_list=[out],
-                with_lod=True)[0]
+            static_rlt2 = self.get_static_graph_result(feed={
+                "seq_in":
+                fluid.create_lod_tensor(data=inp_np,
+                                        recursive_seq_lens=[[1, 1, 1]],
+                                        place=place)
+            },
+                                                       fetch_list=[out],
+                                                       with_lod=True)[0]
         self.assertTrue(
             np.array_equal(np.array(static_rlt), np.array(static_rlt2)))
 
@@ -826,8 +829,8 @@ def test_conv2d_transpose(self):
                 filter_size=27,
                 act='sigmoid',
                 bias_attr=fluid.initializer.ConstantInitializer(value=1))
-            static_rlt = self.get_static_graph_result(
-                feed={'pixel': inp_np}, fetch_list=[out])[0]
+            static_rlt = self.get_static_graph_result(feed={'pixel': inp_np},
+                                                      fetch_list=[out])[0]
         with self.static_graph():
             img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
             conv2d_transpose = nn.Conv2DTranspose(
@@ -837,8 +840,8 @@ def test_conv2d_transpose(self):
                 act='sigmoid',
                 bias_attr=fluid.initializer.ConstantInitializer(value=1))
             out = conv2d_transpose(img)
-            static_rlt2 = self.get_static_graph_result(
-                feed={'pixel': inp_np}, fetch_list=[out])[0]
+            static_rlt2 = self.get_static_graph_result(feed={'pixel': inp_np},
+                                                       fetch_list=[out])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 conv2d_transpose = nn.Conv2DTranspose(
@@ -869,13 +872,13 @@ def test_conv2d_transpose(self):
                 weight_attr = fluid.ParamAttr(
                     initializer=fluid.initializer.NumpyArrayInitializer(
                         custom_weight))
-                conv2d1 = nn.Conv2DTranspose(
-                    num_channels=3, num_filters=3, filter_size=[2, 2])
-                conv2d2 = nn.Conv2DTranspose(
-                    num_channels=3,
-                    num_filters=3,
-                    filter_size=[2, 2],
-                    param_attr=weight_attr)
+                conv2d1 = nn.Conv2DTranspose(num_channels=3,
+                                             num_filters=3,
+                                             filter_size=[2, 2])
+                conv2d2 = nn.Conv2DTranspose(num_channels=3,
+                                             num_filters=3,
+                                             filter_size=[2, 2],
+                                             param_attr=weight_attr)
                 dy_ret1 = conv2d1(base.to_variable(images))
                 dy_ret2 = conv2d2(base.to_variable(images))
                 self.assertFalse(
@@ -891,8 +894,8 @@ def test_conv2d_transpose(self):
                 conv2d2.bias.set_value(conv2d1_bias)
                 dy_ret1 = conv2d1(base.to_variable(images))
                 dy_ret2 = conv2d2(base.to_variable(images))
-                self.assertTrue(
-                    np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
+                self.assertTrue(np.array_equal(dy_ret1.numpy(),
+                                               dy_ret2.numpy()))
 
                 conv2d2.weight = conv2d1.weight
                 conv2d2.bias = conv2d1.bias
@@ -904,16 +907,15 @@ def test_conv2d_transpose(self):
 
             images = np.ones([2, 3, 5, 5], dtype='float32')
             custom_weight = np.random.randn(3, 3, 2, 2).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
-            conv2d1 = nn.Conv2DTranspose(
-                num_channels=3, num_filters=3, filter_size=[2, 2])
-            conv2d2 = nn.Conv2DTranspose(
-                num_channels=3,
-                num_filters=3,
-                filter_size=[2, 2],
-                param_attr=weight_attr)
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
+            conv2d1 = nn.Conv2DTranspose(num_channels=3,
+                                         num_filters=3,
+                                         filter_size=[2, 2])
+            conv2d2 = nn.Conv2DTranspose(num_channels=3,
+                                         num_filters=3,
+                                         filter_size=[2, 2],
+                                         param_attr=weight_attr)
             dy_ret1 = conv2d1(base.to_variable(images))
             dy_ret2 = conv2d2(base.to_variable(images))
             self.assertFalse(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
@@ -942,8 +944,9 @@ def test_conv2d_transpose(self):
             # the input of Conv2DTranspose must be Variable.
             def test_Variable():
                 images = np.ones([2, 3, 5, 5], dtype='float32')
-                conv2d = nn.Conv2DTranspose(
-                    num_channels=3, num_filters=3, filter_size=[2, 2])
+                conv2d = nn.Conv2DTranspose(num_channels=3,
+                                            num_filters=3,
+                                            filter_size=[2, 2])
                 conv2d_ret1 = conv2d(images)
 
             self.assertRaises(TypeError, test_Variable)
@@ -951,10 +954,12 @@ def test_Variable():
             # the input dtype of Conv2DTranspose must be float16 or float32 or float64
             # float16 only can be set on GPU place
             def test_type():
-                images = layers.data(
-                    name='pixel', shape=[3, 5, 5], dtype='int32')
-                conv2d = nn.Conv2DTranspose(
-                    num_channels=3, num_filters=3, filter_size=[2, 2])
+                images = layers.data(name='pixel',
+                                     shape=[3, 5, 5],
+                                     dtype='int32')
+                conv2d = nn.Conv2DTranspose(num_channels=3,
+                                            num_filters=3,
+                                            filter_size=[2, 2])
                 conv2d_ret2 = conv2d(images)
 
             self.assertRaises(TypeError, test_type)
@@ -964,16 +969,14 @@ def test_bilinear_tensor_product(self):
         inp_np_y = np.array([[4, 5, 6]]).astype('float32')
 
         with self.static_graph():
-            data_x = layers.data(
-                name='x',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            data_y = layers.data(
-                name='y',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
+            data_x = layers.data(name='x',
+                                 shape=[1, 3],
+                                 dtype="float32",
+                                 append_batch_size=False)
+            data_y = layers.data(name='y',
+                                 shape=[1, 3],
+                                 dtype="float32",
+                                 append_batch_size=False)
             out = layers.bilinear_tensor_product(
                 data_x,
                 data_y,
@@ -981,21 +984,21 @@ def test_bilinear_tensor_product(self):
                 bias_attr=fluid.initializer.ConstantInitializer(value=1),
                 act='sigmoid')
 
-            static_rlt = self.get_static_graph_result(
-                feed={'x': inp_np_x,
-                      'y': inp_np_y}, fetch_list=[out])[0]
+            static_rlt = self.get_static_graph_result(feed={
+                'x': inp_np_x,
+                'y': inp_np_y
+            },
+                                                      fetch_list=[out])[0]
 
         with self.static_graph():
-            data_x = layers.data(
-                name='x',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            data_y = layers.data(
-                name='y',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
+            data_x = layers.data(name='x',
+                                 shape=[1, 3],
+                                 dtype="float32",
+                                 append_batch_size=False)
+            data_y = layers.data(name='y',
+                                 shape=[1, 3],
+                                 dtype="float32",
+                                 append_batch_size=False)
             btp = nn.BilinearTensorProduct(
                 3,
                 3,
@@ -1003,9 +1006,11 @@ def test_bilinear_tensor_product(self):
                 bias_attr=fluid.initializer.ConstantInitializer(value=1),
                 act='sigmoid')
             out = btp(data_x, data_y)
-            static_rlt2 = self.get_static_graph_result(
-                feed={'x': inp_np_x,
-                      'y': inp_np_y}, fetch_list=[out])[0]
+            static_rlt2 = self.get_static_graph_result(feed={
+                'x': inp_np_x,
+                'y': inp_np_y
+            },
+                                                       fetch_list=[out])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 btp = nn.BilinearTensorProduct(
@@ -1014,8 +1019,8 @@ def test_bilinear_tensor_product(self):
                     6,
                     bias_attr=fluid.initializer.ConstantInitializer(value=1),
                     act='sigmoid')
-                dy_eager_rlt = btp(
-                    base.to_variable(inp_np_x), base.to_variable(inp_np_y))
+                dy_eager_rlt = btp(base.to_variable(inp_np_x),
+                                   base.to_variable(inp_np_y))
                 dy_eager_rlt_value = dy_eager_rlt.numpy()
 
             btp = nn.BilinearTensorProduct(
@@ -1030,32 +1035,34 @@ def test_bilinear_tensor_product(self):
         with self.dynamic_graph():
             with _test_eager_guard():
                 btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid')
-                dy_eager_rlt2 = btp2(
-                    base.to_variable(inp_np_x), base.to_variable(inp_np_y))
+                dy_eager_rlt2 = btp2(base.to_variable(inp_np_x),
+                                     base.to_variable(inp_np_y))
                 dy_eager_rlt2_value = dy_eager_rlt2.numpy()
 
             btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid')
-            dy_rlt2 = btp2(
-                base.to_variable(inp_np_x), base.to_variable(inp_np_y))
+            dy_rlt2 = btp2(base.to_variable(inp_np_x),
+                           base.to_variable(inp_np_y))
             dy_rlt2_value = dy_rlt2.numpy()
 
         with self.static_graph():
-            data_x2 = layers.data(
-                name='x',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            data_y2 = layers.data(
-                name='y',
-                shape=[1, 3],
-                dtype="float32",
-                append_batch_size=False)
-            out2 = layers.bilinear_tensor_product(
-                data_x2, data_y2, 6, act='sigmoid')
-
-            static_rlt3 = self.get_static_graph_result(
-                feed={'x': inp_np_x,
-                      'y': inp_np_y}, fetch_list=[out2])[0]
+            data_x2 = layers.data(name='x',
+                                  shape=[1, 3],
+                                  dtype="float32",
+                                  append_batch_size=False)
+            data_y2 = layers.data(name='y',
+                                  shape=[1, 3],
+                                  dtype="float32",
+                                  append_batch_size=False)
+            out2 = layers.bilinear_tensor_product(data_x2,
+                                                  data_y2,
+                                                  6,
+                                                  act='sigmoid')
+
+            static_rlt3 = self.get_static_graph_result(feed={
+                'x': inp_np_x,
+                'y': inp_np_y
+            },
+                                                       fetch_list=[out2])[0]
 
         self.assertTrue(np.array_equal(dy_rlt2_value, static_rlt3))
         self.assertTrue(np.array_equal(dy_eager_rlt2_value, static_rlt3))
@@ -1070,22 +1077,25 @@ def test_bilinear_tensor_product(self):
                     initializer=fluid.initializer.NumpyArrayInitializer(
                         custom_weight))
                 btp1 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid')
-                btp2 = nn.BilinearTensorProduct(
-                    3, 3, 6, act='sigmoid', param_attr=weight_attr)
-                dy_rlt1 = btp1(
-                    base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-                dy_rlt2 = btp2(
-                    base.to_variable(inp_np_x), base.to_variable(inp_np_y))
+                btp2 = nn.BilinearTensorProduct(3,
+                                                3,
+                                                6,
+                                                act='sigmoid',
+                                                param_attr=weight_attr)
+                dy_rlt1 = btp1(base.to_variable(inp_np_x),
+                               base.to_variable(inp_np_y))
+                dy_rlt2 = btp2(base.to_variable(inp_np_x),
+                               base.to_variable(inp_np_y))
                 self.assertFalse(
                     np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()))
                 btp2.weight.set_value(btp1.weight.numpy())
                 btp2.bias.set_value(btp1.bias)
-                dy_rlt1 = btp1(
-                    base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-                dy_rlt2 = btp2(
-                    base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-                self.assertTrue(
-                    np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()))
+                dy_rlt1 = btp1(base.to_variable(inp_np_x),
+                               base.to_variable(inp_np_y))
+                dy_rlt2 = btp2(base.to_variable(inp_np_x),
+                               base.to_variable(inp_np_y))
+                self.assertTrue(np.array_equal(dy_rlt1.numpy(),
+                                               dy_rlt2.numpy()))
 
                 btp2.weight = btp1.weight
                 btp2.bias = btp1.bias
@@ -1095,59 +1105,59 @@ def test_bilinear_tensor_product(self):
                     np.array_equal(btp1.bias.numpy(), btp2.bias.numpy()))
 
             custom_weight = np.random.randn(6, 3, 3).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
             btp1 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid')
-            btp2 = nn.BilinearTensorProduct(
-                3, 3, 6, act='sigmoid', param_attr=weight_attr)
-            dy_rlt1 = btp1(
-                base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-            dy_rlt2 = btp2(
-                base.to_variable(inp_np_x), base.to_variable(inp_np_y))
+            btp2 = nn.BilinearTensorProduct(3,
+                                            3,
+                                            6,
+                                            act='sigmoid',
+                                            param_attr=weight_attr)
+            dy_rlt1 = btp1(base.to_variable(inp_np_x),
+                           base.to_variable(inp_np_y))
+            dy_rlt2 = btp2(base.to_variable(inp_np_x),
+                           base.to_variable(inp_np_y))
             self.assertFalse(np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()))
             btp2.weight.set_value(btp1.weight.numpy())
             btp2.bias.set_value(btp1.bias)
-            dy_rlt1 = btp1(
-                base.to_variable(inp_np_x), base.to_variable(inp_np_y))
-            dy_rlt2 = btp2(
-                base.to_variable(inp_np_x), base.to_variable(inp_np_y))
+            dy_rlt1 = btp1(base.to_variable(inp_np_x),
+                           base.to_variable(inp_np_y))
+            dy_rlt2 = btp2(base.to_variable(inp_np_x),
+                           base.to_variable(inp_np_y))
             self.assertTrue(np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()))
 
             btp2.weight = btp1.weight
             btp2.bias = btp1.bias
             self.assertTrue(
                 np.array_equal(btp1.weight.numpy(), btp2.weight.numpy()))
-            self.assertTrue(
-                np.array_equal(btp1.bias.numpy(), btp2.bias.numpy()))
+            self.assertTrue(np.array_equal(btp1.bias.numpy(),
+                                           btp2.bias.numpy()))
 
     def prelu_test(self, mode):
         inp_np = np.ones([5, 200, 100, 100]).astype('float32')
         with self.static_graph():
-            data_t = layers.data(
-                name="input",
-                shape=[5, 200, 100, 100],
-                dtype="float32",
-                append_batch_size=False)
-            out = layers.prelu(
-                data_t, mode, param_attr=ParamAttr(initializer=Constant(1.0)))
-            static_rlt = self.get_static_graph_result(
-                feed={"input": inp_np}, fetch_list=[out])[0]
+            data_t = layers.data(name="input",
+                                 shape=[5, 200, 100, 100],
+                                 dtype="float32",
+                                 append_batch_size=False)
+            out = layers.prelu(data_t,
+                               mode,
+                               param_attr=ParamAttr(initializer=Constant(1.0)))
+            static_rlt = self.get_static_graph_result(feed={"input": inp_np},
+                                                      fetch_list=[out])[0]
 
         with self.static_graph():
-            data_t = layers.data(
-                name="input",
-                shape=[5, 200, 100, 100],
-                dtype="float32",
-                append_batch_size=False)
-            prelu = nn.PRelu(
-                mode=mode,
-                channel=inp_np.shape[1],
-                input_shape=data_t.shape,
-                param_attr=ParamAttr(initializer=Constant(1.0)))
+            data_t = layers.data(name="input",
+                                 shape=[5, 200, 100, 100],
+                                 dtype="float32",
+                                 append_batch_size=False)
+            prelu = nn.PRelu(mode=mode,
+                             channel=inp_np.shape[1],
+                             input_shape=data_t.shape,
+                             param_attr=ParamAttr(initializer=Constant(1.0)))
             out = prelu(data_t)
-            static_rlt2 = self.get_static_graph_result(
-                feed={"input": inp_np}, fetch_list=[out])[0]
+            static_rlt2 = self.get_static_graph_result(feed={"input": inp_np},
+                                                       fetch_list=[out])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
@@ -1159,11 +1169,10 @@ def prelu_test(self, mode):
                 dy_eager_rlt = prelu(base.to_variable(inp_np))
                 dy_eager_rlt_value = dy_eager_rlt.numpy()
 
-            prelu = nn.PRelu(
-                mode=mode,
-                channel=inp_np.shape[1],
-                input_shape=inp_np.shape,
-                param_attr=ParamAttr(initializer=Constant(1.0)))
+            prelu = nn.PRelu(mode=mode,
+                             channel=inp_np.shape[1],
+                             input_shape=inp_np.shape,
+                             param_attr=ParamAttr(initializer=Constant(1.0)))
             dy_rlt = prelu(base.to_variable(inp_np))
             dy_rlt_value = dy_rlt.numpy()
 
@@ -1188,33 +1197,31 @@ def prelu_test(self, mode):
                 dy_rlt1 = prelu1(inp)
                 dy_rlt2 = prelu2(inp)
                 self.assertFalse(
-                    np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy(
-                    )))
+                    np.array_equal(prelu1.weight.numpy(),
+                                   prelu2.weight.numpy()))
                 self.assertFalse(
                     np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()))
                 prelu2.weight.set_value(prelu1.weight.numpy())
                 dy_rlt1 = prelu1(inp)
                 dy_rlt2 = prelu2(inp)
-                self.assertTrue(
-                    np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()))
+                self.assertTrue(np.array_equal(dy_rlt1.numpy(),
+                                               dy_rlt2.numpy()))
 
                 prelu2.weight = prelu1.weight
                 self.assertTrue(
-                    np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy(
-                    )))
+                    np.array_equal(prelu1.weight.numpy(),
+                                   prelu2.weight.numpy()))
 
             inp_np = np.random.randn(5, 200, 100, 100).astype("float32")
             inp = base.to_variable(inp_np)
-            prelu1 = nn.PRelu(
-                mode=mode,
-                channel=inp_np.shape[1],
-                input_shape=inp_np.shape,
-                param_attr=ParamAttr(initializer=Constant(2.0)))
-            prelu2 = nn.PRelu(
-                mode=mode,
-                channel=inp_np.shape[1],
-                input_shape=inp_np.shape,
-                param_attr=ParamAttr(initializer=Constant(1.0)))
+            prelu1 = nn.PRelu(mode=mode,
+                              channel=inp_np.shape[1],
+                              input_shape=inp_np.shape,
+                              param_attr=ParamAttr(initializer=Constant(2.0)))
+            prelu2 = nn.PRelu(mode=mode,
+                              channel=inp_np.shape[1],
+                              input_shape=inp_np.shape,
+                              param_attr=ParamAttr(initializer=Constant(1.0)))
             dy_rlt1 = prelu1(inp)
             dy_rlt2 = prelu2(inp)
             self.assertFalse(
@@ -1239,31 +1246,31 @@ def test_embeding(self):
         dict_size = 20
         with self.static_graph():
             data_t = layers.data(name='word', shape=[1], dtype='int64')
-            emb = layers.embedding(
-                input=data_t,
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=False)
-            static_rlt = self.get_static_graph_result(
-                feed={'word': inp_word}, fetch_list=[emb])[0]
+            emb = layers.embedding(input=data_t,
+                                   size=[dict_size, 32],
+                                   param_attr='emb.w',
+                                   is_sparse=False)
+            static_rlt = self.get_static_graph_result(feed={'word': inp_word},
+                                                      fetch_list=[emb])[0]
         with self.static_graph():
             data_t = layers.data(name='word', shape=[1], dtype='int64')
-            emb2 = nn.Embedding(
-                size=[dict_size, 32], param_attr='emb.w', is_sparse=False)
+            emb2 = nn.Embedding(size=[dict_size, 32],
+                                param_attr='emb.w',
+                                is_sparse=False)
             emb_rlt = emb2(data_t)
-            static_rlt2 = self.get_static_graph_result(
-                feed={'word': inp_word}, fetch_list=[emb_rlt])[0]
+            static_rlt2 = self.get_static_graph_result(feed={'word': inp_word},
+                                                       fetch_list=[emb_rlt])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
-                emb2 = nn.Embedding(
-                    size=[dict_size, 32],
-                    param_attr='eager_emb.w',
-                    is_sparse=False)
+                emb2 = nn.Embedding(size=[dict_size, 32],
+                                    param_attr='eager_emb.w',
+                                    is_sparse=False)
                 dy_eager_rlt = emb2(base.to_variable(inp_word))
                 dy_eager_rlt_value = dy_eager_rlt.numpy()
 
-            emb2 = nn.Embedding(
-                size=[dict_size, 32], param_attr='emb.w', is_sparse=False)
+            emb2 = nn.Embedding(size=[dict_size, 32],
+                                param_attr='emb.w',
+                                is_sparse=False)
             dy_rlt = emb2(base.to_variable(inp_word))
             dy_rlt_value = dy_rlt.numpy()
 
@@ -1278,10 +1285,9 @@ def test_embeding(self):
                     initializer=fluid.initializer.NumpyArrayInitializer(
                         custom_weight))
                 emb1 = nn.Embedding(size=[dict_size, 32], is_sparse=False)
-                emb2 = nn.Embedding(
-                    size=[dict_size, 32],
-                    param_attr=weight_attr,
-                    is_sparse=False)
+                emb2 = nn.Embedding(size=[dict_size, 32],
+                                    param_attr=weight_attr,
+                                    is_sparse=False)
                 rep1 = emb1(base.to_variable(inp_word))
                 rep2 = emb2(base.to_variable(inp_word))
                 self.assertFalse(
@@ -1298,12 +1304,12 @@ def test_embeding(self):
                     np.array_equal(emb1.weight.numpy(), emb2.weight.numpy()))
 
             custom_weight = np.random.randn(dict_size, 32).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
             emb1 = nn.Embedding(size=[dict_size, 32], is_sparse=False)
-            emb2 = nn.Embedding(
-                size=[dict_size, 32], param_attr=weight_attr, is_sparse=False)
+            emb2 = nn.Embedding(size=[dict_size, 32],
+                                param_attr=weight_attr,
+                                is_sparse=False)
             rep1 = emb1(base.to_variable(inp_word))
             rep2 = emb2(base.to_variable(inp_word))
             self.assertFalse(np.array_equal(emb1.weight.numpy(), custom_weight))
@@ -1328,20 +1334,21 @@ def test_nce(self):
             words = []
             for i in range(window_size):
                 words.append(
-                    layers.data(
-                        name='word_{0}'.format(i), shape=[None], dtype='int64'))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
+                    layers.data(name='word_{0}'.format(i),
+                                shape=[None],
+                                dtype='int64'))
+            sample_weights = layers.fill_constant(shape=[5, 1],
+                                                  dtype='float32',
+                                                  value=1)
             embs = []
             for i in range(window_size):
                 if i == label_word:
                     continue
 
-                emb = fluid.embedding(
-                    input=words[i],
-                    size=[dict_size, 32],
-                    param_attr='emb.w',
-                    is_sparse=False)
+                emb = fluid.embedding(input=words[i],
+                                      size=[dict_size, 32],
+                                      param_attr='emb.w',
+                                      is_sparse=False)
                 embs.append(emb)
 
             embs = layers.concat(input=embs, axis=1)
@@ -1359,19 +1366,22 @@ def test_nce(self):
             feed_dict = dict()
             for i in range(window_size):
                 feed_dict['word_{0}'.format(i)] = inp_word[i]
-            static_rlt = self.get_static_graph_result(
-                feed=feed_dict, fetch_list=[nce_loss])[0]
+            static_rlt = self.get_static_graph_result(feed=feed_dict,
+                                                      fetch_list=[nce_loss])[0]
 
         with self.static_graph():
             words = []
             for i in range(window_size):
                 words.append(
-                    layers.data(
-                        name='word_{0}'.format(i), shape=[None], dtype='int64'))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
-            emb = nn.Embedding(
-                size=[dict_size, 32], param_attr='emb.w', is_sparse=False)
+                    layers.data(name='word_{0}'.format(i),
+                                shape=[None],
+                                dtype='int64'))
+            sample_weights = layers.fill_constant(shape=[5, 1],
+                                                  dtype='float32',
+                                                  value=1)
+            emb = nn.Embedding(size=[dict_size, 32],
+                               param_attr='emb.w',
+                               is_sparse=False)
 
             embs2 = []
             for i in range(window_size):
@@ -1398,20 +1408,21 @@ def test_nce(self):
             for i in range(len(words)):
                 feed_dict['word_{0}'.format(i)] = inp_word[i]
 
-            static_rlt2 = self.get_static_graph_result(
-                feed=feed_dict, fetch_list=[nce_loss2])[0]
+            static_rlt2 = self.get_static_graph_result(feed=feed_dict,
+                                                       fetch_list=[nce_loss2
+                                                                   ])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
                 words = []
                 for i in range(window_size):
                     words.append(base.to_variable(inp_word[i]))
-                sample_weights = layers.fill_constant(
-                    shape=[5, 1], dtype='float32', value=1)
-                emb = nn.Embedding(
-                    size=[dict_size, 32],
-                    param_attr='eager_emb.w',
-                    is_sparse=False)
+                sample_weights = layers.fill_constant(shape=[5, 1],
+                                                      dtype='float32',
+                                                      value=1)
+                emb = nn.Embedding(size=[dict_size, 32],
+                                   param_attr='eager_emb.w',
+                                   is_sparse=False)
 
                 embs3 = []
                 for i in range(window_size):
@@ -1421,8 +1432,9 @@ def test_nce(self):
                     emb_rlt = emb(words[i])
                     embs3.append(emb_rlt)
 
-                embs3 = layers.concat(
-                    input=embs3, axis=fluid.dygraph.to_variable(np.array([1])))
+                embs3 = layers.concat(input=embs3,
+                                      axis=fluid.dygraph.to_variable(
+                                          np.array([1])))
                 nce = nn.NCE(num_total_classes=dict_size,
                              dim=embs3.shape[1],
                              num_neg_samples=2,
@@ -1440,10 +1452,12 @@ def test_nce(self):
             words = []
             for i in range(window_size):
                 words.append(base.to_variable(inp_word[i]))
-            sample_weights = layers.fill_constant(
-                shape=[5, 1], dtype='float32', value=1)
-            emb = nn.Embedding(
-                size=[dict_size, 32], param_attr='emb.w', is_sparse=False)
+            sample_weights = layers.fill_constant(shape=[5, 1],
+                                                  dtype='float32',
+                                                  value=1)
+            emb = nn.Embedding(size=[dict_size, 32],
+                               param_attr='emb.w',
+                               is_sparse=False)
 
             embs3 = []
             for i in range(window_size):
@@ -1453,8 +1467,8 @@ def test_nce(self):
                 emb_rlt = emb(words[i])
                 embs3.append(emb_rlt)
 
-            embs3 = layers.concat(
-                input=embs3, axis=fluid.dygraph.to_variable(np.array([1])))
+            embs3 = layers.concat(input=embs3,
+                                  axis=fluid.dygraph.to_variable(np.array([1])))
             nce = nn.NCE(num_total_classes=dict_size,
                          dim=embs3.shape[1],
                          num_neg_samples=2,
@@ -1487,10 +1501,9 @@ def test_nce(self):
                     shape=fluid.dygraph.to_variable(np.array([5, 1])),
                     dtype='float32',
                     value=1)
-                emb = nn.Embedding(
-                    size=[dict_size, 32],
-                    param_attr='eager_emb.w',
-                    is_sparse=False)
+                emb = nn.Embedding(size=[dict_size, 32],
+                                   param_attr='eager_emb.w',
+                                   is_sparse=False)
 
                 embs3 = []
                 for i in range(window_size):
@@ -1541,9 +1554,8 @@ def test_nce(self):
                     np.array_equal(nce1.bias.numpy(), nce2.bias.numpy()))
 
             custom_weight = np.random.randn(dict_size, 128).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
             words = []
             for i in range(window_size):
                 words.append(base.to_variable(inp_word[i]))
@@ -1551,8 +1563,9 @@ def test_nce(self):
                 shape=fluid.dygraph.to_variable(np.array([5, 1])),
                 dtype='float32',
                 value=1)
-            emb = nn.Embedding(
-                size=[dict_size, 32], param_attr='emb.w', is_sparse=False)
+            emb = nn.Embedding(size=[dict_size, 32],
+                               param_attr='emb.w',
+                               is_sparse=False)
 
             embs3 = []
             for i in range(window_size):
@@ -1592,25 +1605,24 @@ def test_nce(self):
             nce2.bias.set_value(nce1.bias)
             nce1_loss = nce1(embs3, wl)
             nce2_loss = nce2(embs3, wl)
-            self.assertTrue(
-                np.array_equal(nce1_loss.numpy(), nce2_loss.numpy()))
+            self.assertTrue(np.array_equal(nce1_loss.numpy(),
+                                           nce2_loss.numpy()))
 
             nce2.weight = nce1.weight
             nce2.bias = nce1.bias
             self.assertTrue(
                 np.array_equal(nce1.weight.numpy(), nce2.weight.numpy()))
-            self.assertTrue(
-                np.array_equal(nce1.bias.numpy(), nce2.bias.numpy()))
+            self.assertTrue(np.array_equal(nce1.bias.numpy(),
+                                           nce2.bias.numpy()))
 
     def test_one_hot(self):
         with self.dynamic_graph():
             with _test_eager_guard():
-                label = fluid.dygraph.to_variable(
-                    np.array([[1], [1], [3], [0]]))
+                label = fluid.dygraph.to_variable(np.array([[1], [1], [3],
+                                                            [0]]))
                 one_hot_label1 = fluid.layers.one_hot(input=label, depth=4)
                 one_hot_label2 = fluid.layers.one_hot(
-                    input=label,
-                    depth=fluid.dygraph.to_variable(np.array([4])))
+                    input=label, depth=fluid.dygraph.to_variable(np.array([4])))
                 self.assertTrue(
                     np.array_equal(one_hot_label1.numpy(),
                                    one_hot_label2.numpy()))
@@ -1627,19 +1639,19 @@ def test_split(self):
             with _test_eager_guard():
                 input = fluid.dygraph.to_variable(np.random.random((3, 8, 5)))
                 x0, x1 = fluid.layers.split(input, num_or_sections=2, dim=1)
-                x00, x11 = fluid.layers.split(
-                    input,
-                    num_or_sections=2,
-                    dim=fluid.dygraph.to_variable(np.array([1])))
+                x00, x11 = fluid.layers.split(input,
+                                              num_or_sections=2,
+                                              dim=fluid.dygraph.to_variable(
+                                                  np.array([1])))
                 self.assertTrue(np.array_equal(x0.numpy(), x00.numpy()))
                 self.assertTrue(np.array_equal(x1.numpy(), x11.numpy()))
 
             input = fluid.dygraph.to_variable(np.random.random((3, 8, 5)))
             x0, x1 = fluid.layers.split(input, num_or_sections=2, dim=1)
-            x00, x11 = fluid.layers.split(
-                input,
-                num_or_sections=2,
-                dim=fluid.dygraph.to_variable(np.array([1])))
+            x00, x11 = fluid.layers.split(input,
+                                          num_or_sections=2,
+                                          dim=fluid.dygraph.to_variable(
+                                              np.array([1])))
             self.assertTrue(np.array_equal(x0.numpy(), x00.numpy()))
             self.assertTrue(np.array_equal(x1.numpy(), x11.numpy()))
 
@@ -1653,8 +1665,8 @@ def test_topk(self):
                 self.assertTrue(
                     np.array_equal(top5_values1.numpy(), top5_values2.numpy()))
                 self.assertTrue(
-                    np.array_equal(top5_indices1.numpy(), top5_indices2.numpy(
-                    )))
+                    np.array_equal(top5_indices1.numpy(),
+                                   top5_indices2.numpy()))
 
             input = fluid.dygraph.to_variable(np.random.random((13, 11)))
             top5_values1, top5_indices1 = layers.topk(input, k=5)
@@ -1667,22 +1679,22 @@ def test_topk(self):
 
     def test_conv3d(self):
         with self.static_graph():
-            images = layers.data(
-                name='pixel', shape=[3, 6, 6, 6], dtype='float32')
+            images = layers.data(name='pixel',
+                                 shape=[3, 6, 6, 6],
+                                 dtype='float32')
             ret = layers.conv3d(input=images, num_filters=3, filter_size=2)
             static_ret = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 6, 6, 6], dtype='float32')},
+                feed={'pixel': np.ones([2, 3, 6, 6, 6], dtype='float32')},
                 fetch_list=[ret])[0]
 
         with self.static_graph():
-            images = layers.data(
-                name='pixel', shape=[3, 6, 6, 6], dtype='float32')
+            images = layers.data(name='pixel',
+                                 shape=[3, 6, 6, 6],
+                                 dtype='float32')
             conv3d = nn.Conv3D(num_channels=3, num_filters=3, filter_size=2)
             ret = conv3d(images)
             static_ret2 = self.get_static_graph_result(
-                feed={'pixel': np.ones(
-                    [2, 3, 6, 6, 6], dtype='float32')},
+                feed={'pixel': np.ones([2, 3, 6, 6, 6], dtype='float32')},
                 fetch_list=[ret])[0]
 
         with self.dynamic_graph():
@@ -1708,13 +1720,13 @@ def test_conv3d(self):
                 weight_attr = fluid.ParamAttr(
                     initializer=fluid.initializer.NumpyArrayInitializer(
                         custom_weight))
-                conv3d1 = nn.Conv3D(
-                    num_channels=3, num_filters=3, filter_size=2)
-                conv3d2 = nn.Conv3D(
-                    num_channels=3,
-                    num_filters=3,
-                    filter_size=2,
-                    param_attr=weight_attr)
+                conv3d1 = nn.Conv3D(num_channels=3,
+                                    num_filters=3,
+                                    filter_size=2)
+                conv3d2 = nn.Conv3D(num_channels=3,
+                                    num_filters=3,
+                                    filter_size=2,
+                                    param_attr=weight_attr)
                 dy_ret1 = conv3d1(base.to_variable(images))
                 dy_ret2 = conv3d2(base.to_variable(images))
                 self.assertFalse(
@@ -1730,8 +1742,8 @@ def test_conv3d(self):
                 conv3d1.bias.set_value(conv3d1_bias)
                 dy_ret1 = conv3d1(base.to_variable(images))
                 dy_ret2 = conv3d2(base.to_variable(images))
-                self.assertTrue(
-                    np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
+                self.assertTrue(np.array_equal(dy_ret1.numpy(),
+                                               dy_ret2.numpy()))
 
                 conv3d2.weight = conv3d1.weight
                 conv3d2.bias = conv3d1.bias
@@ -1743,15 +1755,13 @@ def test_conv3d(self):
 
             images = np.ones([2, 3, 6, 6, 6], dtype='float32')
             custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
             conv3d1 = nn.Conv3D(num_channels=3, num_filters=3, filter_size=2)
-            conv3d2 = nn.Conv3D(
-                num_channels=3,
-                num_filters=3,
-                filter_size=2,
-                param_attr=weight_attr)
+            conv3d2 = nn.Conv3D(num_channels=3,
+                                num_filters=3,
+                                filter_size=2,
+                                param_attr=weight_attr)
             dy_ret1 = conv3d1(base.to_variable(images))
             dy_ret2 = conv3d2(base.to_variable(images))
             self.assertFalse(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
@@ -1783,37 +1793,37 @@ def test_row_conv(self):
             place = core.CPUPlace()
 
         with self.static_graph():
-            x = layers.data(
-                name='X',
-                shape=[3, 5],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            x = layers.data(name='X',
+                            shape=[3, 5],
+                            dtype='float32',
+                            lod_level=1,
+                            append_batch_size=False)
             ret = layers.row_conv(input=x, future_context_size=2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
+            static_ret = self.get_static_graph_result(feed={
+                'X':
+                fluid.create_lod_tensor(data=input,
+                                        recursive_seq_lens=[[1, 1, 1]],
+                                        place=place)
+            },
+                                                      fetch_list=[ret],
+                                                      with_lod=True)[0]
 
         with self.static_graph():
-            x = layers.data(
-                name='X',
-                shape=[3, 5],
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            x = layers.data(name='X',
+                            shape=[3, 5],
+                            dtype='float32',
+                            lod_level=1,
+                            append_batch_size=False)
             rowConv = nn.RowConv('RowConv', future_context_size=2)
             ret = rowConv(x)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
+            static_ret2 = self.get_static_graph_result(feed={
+                'X':
+                fluid.create_lod_tensor(data=input,
+                                        recursive_seq_lens=[[1, 1, 1]],
+                                        place=place)
+            },
+                                                       fetch_list=[ret],
+                                                       with_lod=True)[0]
 
         # TODO: dygraph can't support LODTensor
 
@@ -1830,54 +1840,51 @@ def func_group_norm(self):
         input = np.random.random(shape).astype('float32')
 
         with self.static_graph():
-            X = fluid.layers.data(
-                name='X',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            X = fluid.layers.data(name='X',
+                                  shape=shape,
+                                  dtype='float32',
+                                  lod_level=1,
+                                  append_batch_size=False)
             ret = layers.group_norm(
                 input=X,
                 groups=2,
-                param_attr=fluid.initializer.Uniform(
-                    low=-0.5, high=0.5),
+                param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5),
                 bias_attr=fluid.initializer.ConstantInitializer(value=1))
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
+            static_ret = self.get_static_graph_result(feed={
+                'X':
+                fluid.create_lod_tensor(data=input,
+                                        recursive_seq_lens=[[1, 1]],
+                                        place=place)
+            },
+                                                      fetch_list=[ret],
+                                                      with_lod=True)[0]
 
         with self.static_graph():
-            X = fluid.layers.data(
-                name='X',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            X = fluid.layers.data(name='X',
+                                  shape=shape,
+                                  dtype='float32',
+                                  lod_level=1,
+                                  append_batch_size=False)
             groupNorm = nn.GroupNorm(
                 channels=shape[1],
                 groups=2,
-                param_attr=fluid.initializer.Uniform(
-                    low=-0.5, high=0.5),
+                param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5),
                 bias_attr=fluid.initializer.ConstantInitializer(value=1))
             ret = groupNorm(X)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'X': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
+            static_ret2 = self.get_static_graph_result(feed={
+                'X':
+                fluid.create_lod_tensor(data=input,
+                                        recursive_seq_lens=[[1, 1]],
+                                        place=place)
+            },
+                                                       fetch_list=[ret],
+                                                       with_lod=True)[0]
 
         with self.dynamic_graph():
             groupNorm = nn.GroupNorm(
                 channels=shape[1],
                 groups=2,
-                param_attr=fluid.initializer.Uniform(
-                    low=-0.5, high=0.5),
+                param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5),
                 bias_attr=fluid.initializer.ConstantInitializer(value=1))
             dy_ret = groupNorm(base.to_variable(input))
             dy_rlt_value = dy_ret.numpy()
@@ -1901,19 +1908,23 @@ def test_instance_norm(self):
         input = np.random.random(shape).astype('float32')
 
         with self.static_graph():
-            X = fluid.layers.data(
-                name='X', shape=shape, dtype='float32', append_batch_size=False)
+            X = fluid.layers.data(name='X',
+                                  shape=shape,
+                                  dtype='float32',
+                                  append_batch_size=False)
             ret = layers.instance_norm(input=X)
-            static_ret = self.get_static_graph_result(
-                feed={'X': input}, fetch_list=[ret])[0]
+            static_ret = self.get_static_graph_result(feed={'X': input},
+                                                      fetch_list=[ret])[0]
 
         with self.static_graph():
-            X = fluid.layers.data(
-                name='X', shape=shape, dtype='float32', append_batch_size=False)
+            X = fluid.layers.data(name='X',
+                                  shape=shape,
+                                  dtype='float32',
+                                  append_batch_size=False)
             instanceNorm = nn.InstanceNorm(num_channels=shape[1])
             ret = instanceNorm(X)
-            static_ret2 = self.get_static_graph_result(
-                feed={'X': input}, fetch_list=[ret])[0]
+            static_ret2 = self.get_static_graph_result(feed={'X': input},
+                                                       fetch_list=[ret])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
@@ -1968,37 +1979,37 @@ def test_spectral_norm(self):
         input = np.random.random(shape).astype('float32')
 
         with self.static_graph():
-            Weight = fluid.layers.data(
-                name='Weight',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            Weight = fluid.layers.data(name='Weight',
+                                       shape=shape,
+                                       dtype='float32',
+                                       lod_level=1,
+                                       append_batch_size=False)
             ret = layers.spectral_norm(weight=Weight, dim=1, power_iters=2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'Weight': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place),
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
+            static_ret = self.get_static_graph_result(feed={
+                'Weight':
+                fluid.create_lod_tensor(data=input,
+                                        recursive_seq_lens=[[1, 1]],
+                                        place=place),
+            },
+                                                      fetch_list=[ret],
+                                                      with_lod=True)[0]
 
         with self.static_graph():
-            Weight = fluid.layers.data(
-                name='Weight',
-                shape=shape,
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
+            Weight = fluid.layers.data(name='Weight',
+                                       shape=shape,
+                                       dtype='float32',
+                                       lod_level=1,
+                                       append_batch_size=False)
             spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2)
             ret = spectralNorm(Weight)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'Weight': fluid.create_lod_tensor(
-                        data=input, recursive_seq_lens=[[1, 1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=True)[0]
+            static_ret2 = self.get_static_graph_result(feed={
+                'Weight':
+                fluid.create_lod_tensor(data=input,
+                                        recursive_seq_lens=[[1, 1]],
+                                        place=place)
+            },
+                                                       fetch_list=[ret],
+                                                       with_lod=True)[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
@@ -2024,70 +2035,77 @@ def test_tree_conv(self):
         adj = np.tile(adj, (1, 1, 1))
         vectors = np.random.random((1, 10, 5)).astype('float32')
         with self.static_graph():
-            NodesVector = fluid.layers.data(
-                name='NodesVector',
-                shape=(1, 10, 5),
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            EdgeSet = fluid.layers.data(
-                name='EdgeSet',
-                shape=(1, 9, 2),
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            ret = fluid.contrib.layers.tree_conv(
-                nodes_vector=NodesVector,
-                edge_set=EdgeSet,
-                output_size=6,
-                num_filters=1,
-                max_depth=2)
-            static_ret = self.get_static_graph_result(
-                feed={
-                    'NodesVector': fluid.create_lod_tensor(
-                        data=vectors, recursive_seq_lens=[[1]], place=place),
-                    'EdgeSet': fluid.create_lod_tensor(
-                        data=adj, recursive_seq_lens=[[1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=False)[0]
+            NodesVector = fluid.layers.data(name='NodesVector',
+                                            shape=(1, 10, 5),
+                                            dtype='float32',
+                                            lod_level=1,
+                                            append_batch_size=False)
+            EdgeSet = fluid.layers.data(name='EdgeSet',
+                                        shape=(1, 9, 2),
+                                        dtype='int32',
+                                        lod_level=1,
+                                        append_batch_size=False)
+            ret = fluid.contrib.layers.tree_conv(nodes_vector=NodesVector,
+                                                 edge_set=EdgeSet,
+                                                 output_size=6,
+                                                 num_filters=1,
+                                                 max_depth=2)
+            static_ret = self.get_static_graph_result(feed={
+                'NodesVector':
+                fluid.create_lod_tensor(data=vectors,
+                                        recursive_seq_lens=[[1]],
+                                        place=place),
+                'EdgeSet':
+                fluid.create_lod_tensor(data=adj,
+                                        recursive_seq_lens=[[1]],
+                                        place=place)
+            },
+                                                      fetch_list=[ret],
+                                                      with_lod=False)[0]
 
         with self.static_graph():
-            NodesVector = fluid.layers.data(
-                name='NodesVector',
-                shape=(1, 10, 5),
-                dtype='float32',
-                lod_level=1,
-                append_batch_size=False)
-            EdgeSet = fluid.layers.data(
-                name='EdgeSet',
-                shape=(1, 9, 2),
-                dtype='int32',
-                lod_level=1,
-                append_batch_size=False)
-            treeConv = nn.TreeConv(
-                feature_size=5, output_size=6, num_filters=1, max_depth=2)
+            NodesVector = fluid.layers.data(name='NodesVector',
+                                            shape=(1, 10, 5),
+                                            dtype='float32',
+                                            lod_level=1,
+                                            append_batch_size=False)
+            EdgeSet = fluid.layers.data(name='EdgeSet',
+                                        shape=(1, 9, 2),
+                                        dtype='int32',
+                                        lod_level=1,
+                                        append_batch_size=False)
+            treeConv = nn.TreeConv(feature_size=5,
+                                   output_size=6,
+                                   num_filters=1,
+                                   max_depth=2)
             ret = treeConv(NodesVector, EdgeSet)
-            static_ret2 = self.get_static_graph_result(
-                feed={
-                    'NodesVector': fluid.create_lod_tensor(
-                        data=vectors, recursive_seq_lens=[[1]], place=place),
-                    'EdgeSet': fluid.create_lod_tensor(
-                        data=adj, recursive_seq_lens=[[1]], place=place)
-                },
-                fetch_list=[ret],
-                with_lod=False)[0]
+            static_ret2 = self.get_static_graph_result(feed={
+                'NodesVector':
+                fluid.create_lod_tensor(data=vectors,
+                                        recursive_seq_lens=[[1]],
+                                        place=place),
+                'EdgeSet':
+                fluid.create_lod_tensor(data=adj,
+                                        recursive_seq_lens=[[1]],
+                                        place=place)
+            },
+                                                       fetch_list=[ret],
+                                                       with_lod=False)[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
-                treeConv = nn.TreeConv(
-                    feature_size=5, output_size=6, num_filters=1, max_depth=2)
-                dy_eager_ret = treeConv(
-                    base.to_variable(vectors), base.to_variable(adj))
+                treeConv = nn.TreeConv(feature_size=5,
+                                       output_size=6,
+                                       num_filters=1,
+                                       max_depth=2)
+                dy_eager_ret = treeConv(base.to_variable(vectors),
+                                        base.to_variable(adj))
                 dy_eager_rlt_value = dy_eager_ret.numpy()
 
-            treeConv = nn.TreeConv(
-                feature_size=5, output_size=6, num_filters=1, max_depth=2)
+            treeConv = nn.TreeConv(feature_size=5,
+                                   output_size=6,
+                                   num_filters=1,
+                                   max_depth=2)
             dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj))
             dy_rlt_value = dy_ret.numpy()
 
@@ -2101,33 +2119,31 @@ def test_tree_conv(self):
                 weight_attr = fluid.ParamAttr(
                     initializer=fluid.initializer.NumpyArrayInitializer(
                         custom_weight))
-                treeConv1 = nn.TreeConv(
-                    feature_size=5,
-                    output_size=6,
-                    num_filters=1,
-                    max_depth=2,
-                    bias_attr='eager_tc1_b')
-                treeConv2 = nn.TreeConv(
-                    feature_size=5,
-                    output_size=6,
-                    num_filters=1,
-                    max_depth=2,
-                    param_attr=weight_attr,
-                    bias_attr='eager_tc2_b')
-                dy_ret1 = treeConv1(
-                    base.to_variable(vectors), base.to_variable(adj))
-                dy_ret2 = treeConv2(
-                    base.to_variable(vectors), base.to_variable(adj))
+                treeConv1 = nn.TreeConv(feature_size=5,
+                                        output_size=6,
+                                        num_filters=1,
+                                        max_depth=2,
+                                        bias_attr='eager_tc1_b')
+                treeConv2 = nn.TreeConv(feature_size=5,
+                                        output_size=6,
+                                        num_filters=1,
+                                        max_depth=2,
+                                        param_attr=weight_attr,
+                                        bias_attr='eager_tc2_b')
+                dy_ret1 = treeConv1(base.to_variable(vectors),
+                                    base.to_variable(adj))
+                dy_ret2 = treeConv2(base.to_variable(vectors),
+                                    base.to_variable(adj))
                 self.assertFalse(
                     np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
                 treeConv2.weight.set_value(treeConv1.weight.numpy())
                 treeConv2.bias.set_value(treeConv1.bias)
-                dy_ret1 = treeConv1(
-                    base.to_variable(vectors), base.to_variable(adj))
-                dy_ret2 = treeConv2(
-                    base.to_variable(vectors), base.to_variable(adj))
-                self.assertTrue(
-                    np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
+                dy_ret1 = treeConv1(base.to_variable(vectors),
+                                    base.to_variable(adj))
+                dy_ret2 = treeConv2(base.to_variable(vectors),
+                                    base.to_variable(adj))
+                self.assertTrue(np.array_equal(dy_ret1.numpy(),
+                                               dy_ret2.numpy()))
 
                 treeConv2.weight = treeConv1.weight
                 treeConv2.bias = treeConv1.bias
@@ -2139,33 +2155,30 @@ def test_tree_conv(self):
                                    treeConv2.bias.numpy()))
 
             custom_weight = np.random.randn(5, 3, 6, 1).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
-            treeConv1 = nn.TreeConv(
-                feature_size=5,
-                output_size=6,
-                num_filters=1,
-                max_depth=2,
-                bias_attr='tc1_b')
-            treeConv2 = nn.TreeConv(
-                feature_size=5,
-                output_size=6,
-                num_filters=1,
-                max_depth=2,
-                param_attr=weight_attr,
-                bias_attr='tc2_b')
-            dy_ret1 = treeConv1(
-                base.to_variable(vectors), base.to_variable(adj))
-            dy_ret2 = treeConv2(
-                base.to_variable(vectors), base.to_variable(adj))
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
+            treeConv1 = nn.TreeConv(feature_size=5,
+                                    output_size=6,
+                                    num_filters=1,
+                                    max_depth=2,
+                                    bias_attr='tc1_b')
+            treeConv2 = nn.TreeConv(feature_size=5,
+                                    output_size=6,
+                                    num_filters=1,
+                                    max_depth=2,
+                                    param_attr=weight_attr,
+                                    bias_attr='tc2_b')
+            dy_ret1 = treeConv1(base.to_variable(vectors),
+                                base.to_variable(adj))
+            dy_ret2 = treeConv2(base.to_variable(vectors),
+                                base.to_variable(adj))
             self.assertFalse(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
             treeConv2.weight.set_value(treeConv1.weight.numpy())
             treeConv2.bias.set_value(treeConv1.bias)
-            dy_ret1 = treeConv1(
-                base.to_variable(vectors), base.to_variable(adj))
-            dy_ret2 = treeConv2(
-                base.to_variable(vectors), base.to_variable(adj))
+            dy_ret1 = treeConv1(base.to_variable(vectors),
+                                base.to_variable(adj))
+            dy_ret2 = treeConv2(base.to_variable(vectors),
+                                base.to_variable(adj))
             self.assertTrue(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
 
             treeConv2.weight = treeConv1.weight
@@ -2177,34 +2190,39 @@ def test_tree_conv(self):
                 np.array_equal(treeConv1.bias.numpy(), treeConv2.bias.numpy()))
 
     def test_conv3d_transpose(self):
-        input_array = np.arange(0, 48).reshape(
-            [2, 3, 2, 2, 2]).astype('float32')
+        input_array = np.arange(0, 48).reshape([2, 3, 2, 2,
+                                                2]).astype('float32')
 
         with self.static_graph():
             img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32')
-            out = layers.conv3d_transpose(
-                input=img, num_filters=12, filter_size=12, use_cudnn=False)
+            out = layers.conv3d_transpose(input=img,
+                                          num_filters=12,
+                                          filter_size=12,
+                                          use_cudnn=False)
             static_rlt = self.get_static_graph_result(
                 feed={'pixel': input_array}, fetch_list=[out])[0]
         with self.static_graph():
             img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32')
-            conv3d_transpose = nn.Conv3DTranspose(
-                num_channels=3, num_filters=12, filter_size=12, use_cudnn=False)
+            conv3d_transpose = nn.Conv3DTranspose(num_channels=3,
+                                                  num_filters=12,
+                                                  filter_size=12,
+                                                  use_cudnn=False)
             out = conv3d_transpose(img)
             static_rlt2 = self.get_static_graph_result(
                 feed={'pixel': input_array}, fetch_list=[out])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
-                conv3d_transpose = nn.Conv3DTranspose(
-                    num_channels=3,
-                    num_filters=12,
-                    filter_size=12,
-                    use_cudnn=False)
+                conv3d_transpose = nn.Conv3DTranspose(num_channels=3,
+                                                      num_filters=12,
+                                                      filter_size=12,
+                                                      use_cudnn=False)
                 dy_eager_rlt = conv3d_transpose(base.to_variable(input_array))
                 dy_eager_rlt_value = dy_eager_rlt.numpy()
 
-            conv3d_transpose = nn.Conv3DTranspose(
-                num_channels=3, num_filters=12, filter_size=12, use_cudnn=False)
+            conv3d_transpose = nn.Conv3DTranspose(num_channels=3,
+                                                  num_filters=12,
+                                                  filter_size=12,
+                                                  use_cudnn=False)
             dy_rlt = conv3d_transpose(base.to_variable(input_array))
             dy_rlt_value = dy_rlt.numpy()
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
@@ -2218,19 +2236,17 @@ def test_conv3d_transpose(self):
                 weight_attr = fluid.ParamAttr(
                     initializer=fluid.initializer.NumpyArrayInitializer(
                         custom_weight))
-                conv3d1 = nn.Conv3DTranspose(
-                    num_channels=3,
-                    num_filters=3,
-                    filter_size=2,
-                    bias_attr='eager_conv3d1_b',
-                    use_cudnn=False)
-                conv3d2 = nn.Conv3DTranspose(
-                    num_channels=3,
-                    num_filters=3,
-                    filter_size=2,
-                    param_attr=weight_attr,
-                    bias_attr='eager_conv3d2_b',
-                    use_cudnn=False)
+                conv3d1 = nn.Conv3DTranspose(num_channels=3,
+                                             num_filters=3,
+                                             filter_size=2,
+                                             bias_attr='eager_conv3d1_b',
+                                             use_cudnn=False)
+                conv3d2 = nn.Conv3DTranspose(num_channels=3,
+                                             num_filters=3,
+                                             filter_size=2,
+                                             param_attr=weight_attr,
+                                             bias_attr='eager_conv3d2_b',
+                                             use_cudnn=False)
                 dy_ret1 = conv3d1(base.to_variable(images))
                 dy_ret2 = conv3d2(base.to_variable(images))
                 self.assertFalse(
@@ -2246,8 +2262,8 @@ def test_conv3d_transpose(self):
                 conv3d1.bias.set_value(conv3d1_bias)
                 dy_ret1 = conv3d1(base.to_variable(images))
                 dy_ret2 = conv3d2(base.to_variable(images))
-                self.assertTrue(
-                    np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
+                self.assertTrue(np.array_equal(dy_ret1.numpy(),
+                                               dy_ret2.numpy()))
 
                 conv3d2.weight = conv3d1.weight
                 conv3d2.bias = conv3d1.bias
@@ -2259,22 +2275,19 @@ def test_conv3d_transpose(self):
 
             images = np.ones([2, 3, 6, 6, 6], dtype='float32')
             custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32")
-            weight_attr = fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    custom_weight))
-            conv3d1 = nn.Conv3DTranspose(
-                num_channels=3,
-                num_filters=3,
-                filter_size=2,
-                bias_attr='conv3d1_b',
-                use_cudnn=False)
-            conv3d2 = nn.Conv3DTranspose(
-                num_channels=3,
-                num_filters=3,
-                filter_size=2,
-                param_attr=weight_attr,
-                bias_attr='conv3d2_b',
-                use_cudnn=False)
+            weight_attr = fluid.ParamAttr(initializer=fluid.initializer.
+                                          NumpyArrayInitializer(custom_weight))
+            conv3d1 = nn.Conv3DTranspose(num_channels=3,
+                                         num_filters=3,
+                                         filter_size=2,
+                                         bias_attr='conv3d1_b',
+                                         use_cudnn=False)
+            conv3d2 = nn.Conv3DTranspose(num_channels=3,
+                                         num_filters=3,
+                                         filter_size=2,
+                                         param_attr=weight_attr,
+                                         bias_attr='conv3d2_b',
+                                         use_cudnn=False)
             dy_ret1 = conv3d1(base.to_variable(images))
             dy_ret2 = conv3d2(base.to_variable(images))
             self.assertFalse(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()))
@@ -2400,9 +2413,11 @@ def test_compare(self):
             a = layers.data(name='a', shape=[1], dtype='int64')
             b = layers.data(name='b', shape=[1], dtype='int64')
             cond = layers.less_than(x=a, y=b)
-            static_ret = self.get_static_graph_result(
-                feed={"a": value_a,
-                      "b": value_b}, fetch_list=[cond])[0]
+            static_ret = self.get_static_graph_result(feed={
+                "a": value_a,
+                "b": value_b
+            },
+                                                      fetch_list=[cond])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 da = base.to_variable(value_a)
@@ -2424,9 +2439,11 @@ def test_compare(self):
             a1 = layers.data(name='a1', shape=[1], dtype='int64')
             b1 = layers.data(name='b1', shape=[1], dtype='int64')
             cond1 = layers.less_equal(x=a1, y=b1)
-            static_ret1 = self.get_static_graph_result(
-                feed={"a1": value_a,
-                      "b1": value_b}, fetch_list=[cond1])[0]
+            static_ret1 = self.get_static_graph_result(feed={
+                "a1": value_a,
+                "b1": value_b
+            },
+                                                       fetch_list=[cond1])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 da1 = base.to_variable(value_a)
@@ -2448,9 +2465,11 @@ def test_compare(self):
             a2 = layers.data(name='a2', shape=[1], dtype='int64')
             b2 = layers.data(name='b2', shape=[1], dtype='int64')
             cond2 = layers.greater_than(x=a2, y=b2)
-            static_ret2 = self.get_static_graph_result(
-                feed={"a2": value_a,
-                      "b2": value_b}, fetch_list=[cond2])[0]
+            static_ret2 = self.get_static_graph_result(feed={
+                "a2": value_a,
+                "b2": value_b
+            },
+                                                       fetch_list=[cond2])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 da2 = base.to_variable(value_a)
@@ -2472,9 +2491,11 @@ def test_compare(self):
             a3 = layers.data(name='a3', shape=[1], dtype='int64')
             b3 = layers.data(name='b3', shape=[1], dtype='int64')
             cond3 = layers.greater_equal(x=a3, y=b3)
-            static_ret3 = self.get_static_graph_result(
-                feed={"a3": value_a,
-                      "b3": value_b}, fetch_list=[cond3])[0]
+            static_ret3 = self.get_static_graph_result(feed={
+                "a3": value_a,
+                "b3": value_b
+            },
+                                                       fetch_list=[cond3])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 da3 = base.to_variable(value_a)
@@ -2496,9 +2517,11 @@ def test_compare(self):
             a4 = layers.data(name='a4', shape=[1], dtype='int64')
             b4 = layers.data(name='b4', shape=[1], dtype='int64')
             cond4 = layers.equal(x=a4, y=b4)
-            static_ret4 = self.get_static_graph_result(
-                feed={"a4": value_a,
-                      "b4": value_b}, fetch_list=[cond4])[0]
+            static_ret4 = self.get_static_graph_result(feed={
+                "a4": value_a,
+                "b4": value_b
+            },
+                                                       fetch_list=[cond4])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 da4 = base.to_variable(value_a)
@@ -2520,9 +2543,11 @@ def test_compare(self):
             a5 = layers.data(name='a5', shape=[1], dtype='int64')
             b5 = layers.data(name='b5', shape=[1], dtype='int64')
             cond5 = layers.equal(x=a5, y=b5)
-            static_ret5 = self.get_static_graph_result(
-                feed={"a5": value_a,
-                      "b5": value_b}, fetch_list=[cond5])[0]
+            static_ret5 = self.get_static_graph_result(feed={
+                "a5": value_a,
+                "b5": value_b
+            },
+                                                       fetch_list=[cond5])[0]
         with self.dynamic_graph():
             with _test_eager_guard():
                 da5 = base.to_variable(value_a)
@@ -2540,6 +2565,7 @@ def test_compare(self):
                 self.assertTrue(dcond5.numpy()[i] == static_ret5[i])
 
     def test_cond(self):
+
         def less_than_branch(a, b):
             return fluid.layers.elementwise_add(a, b)
 
@@ -2547,14 +2573,16 @@ def greater_equal_branch(a, b):
             return fluid.layers.elementwise_sub(a, b)
 
         with self.static_graph():
-            a = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=0.1)
-            b = fluid.layers.fill_constant(
-                shape=[1], dtype='float32', value=0.23)
+            a = fluid.layers.fill_constant(shape=[1],
+                                           dtype='float32',
+                                           value=0.1)
+            b = fluid.layers.fill_constant(shape=[1],
+                                           dtype='float32',
+                                           value=0.23)
             out = fluid.layers.cond(a >= b, lambda: greater_equal_branch(a, b),
                                     lambda: less_than_branch(a, b))
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             ret = exe.run(fetch_list=[out])
             static_res = ret[0]
@@ -2595,6 +2623,7 @@ def greater_equal_branch(a, b):
         self.assertTrue(np.array_equal(static_res, eager_dynamic_res))
 
     def test_case(self):
+
         def fn_1():
             return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
 
@@ -2613,12 +2642,12 @@ def fn_3():
             pred_2 = layers.less_than(x, y)  # false: 0.3 < 0.1
             pred_3 = layers.equal(x, y)  # false: 0.3 == 0.1
 
-            out_1 = layers.case(
-                pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
+            out_1 = layers.case(pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)],
+                                default=fn_3)
             out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             static_res1, static_res2 = exe.run(fetch_list=[out_1, out_2])
 
@@ -2632,11 +2661,11 @@ def fn_3():
                 pred_2 = layers.less_than(x, y)  # false: 0.3 < 0.1
                 pred_3 = layers.equal(x, y)  # false: 0.3 == 0.1
 
-                out_1 = layers.case(
-                    pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)],
-                    default=fn_3)
-                out_2 = layers.case(
-                    pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
+                out_1 = layers.case(pred_fn_pairs=[(pred_1, fn_1),
+                                                   (pred_2, fn_2)],
+                                    default=fn_3)
+                out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3,
+                                                                    fn_3)])
                 eager_dynamic_res1 = out_1.numpy()
                 eager_dynamic_res2 = out_2.numpy()
 
@@ -2648,8 +2677,8 @@ def fn_3():
             pred_2 = layers.less_than(x, y)  # false: 0.3 < 0.1
             pred_3 = layers.equal(x, y)  # false: 0.3 == 0.1
 
-            out_1 = layers.case(
-                pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
+            out_1 = layers.case(pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)],
+                                default=fn_3)
             out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
             dynamic_res1 = out_1.numpy()
             dynamic_res2 = out_2.numpy()
@@ -2660,6 +2689,7 @@ def fn_3():
         self.assertTrue(np.array_equal(static_res2, eager_dynamic_res2))
 
     def test_switch_case(self):
+
         def fn_1():
             return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
 
@@ -2673,44 +2703,46 @@ def fn_3():
             index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1)
             index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2)
 
-            out_1 = layers.switch_case(
-                branch_index=index_1,
-                branch_fns={1: fn_1,
-                            2: fn_2},
-                default=fn_3)
-            out_2 = layers.switch_case(
-                branch_index=index_2,
-                branch_fns=[(1, fn_1), (2, fn_2)],
-                default=fn_3)
-            out_3 = layers.switch_case(
-                branch_index=index_2,
-                branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
-
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            out_1 = layers.switch_case(branch_index=index_1,
+                                       branch_fns={
+                                           1: fn_1,
+                                           2: fn_2
+                                       },
+                                       default=fn_3)
+            out_2 = layers.switch_case(branch_index=index_2,
+                                       branch_fns=[(1, fn_1), (2, fn_2)],
+                                       default=fn_3)
+            out_3 = layers.switch_case(branch_index=index_2,
+                                       branch_fns=[(0, fn_1), (4, fn_2),
+                                                   (7, fn_3)])
+
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             static_res1, static_res2, static_res3 = exe.run(
                 fetch_list=[out_1, out_2, out_3])
 
         with self.dynamic_graph():
             with _test_eager_guard():
-                index_1 = layers.fill_constant(
-                    shape=[1], dtype='int32', value=1)
-                index_2 = layers.fill_constant(
-                    shape=[1], dtype='int32', value=2)
-
-                out_1 = layers.switch_case(
-                    branch_index=index_1,
-                    branch_fns={1: fn_1,
-                                2: fn_2},
-                    default=fn_3)
-                out_2 = layers.switch_case(
-                    branch_index=index_2,
-                    branch_fns=[(1, fn_1), (2, fn_2)],
-                    default=fn_3)
-                out_3 = layers.switch_case(
-                    branch_index=index_2,
-                    branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
+                index_1 = layers.fill_constant(shape=[1],
+                                               dtype='int32',
+                                               value=1)
+                index_2 = layers.fill_constant(shape=[1],
+                                               dtype='int32',
+                                               value=2)
+
+                out_1 = layers.switch_case(branch_index=index_1,
+                                           branch_fns={
+                                               1: fn_1,
+                                               2: fn_2
+                                           },
+                                           default=fn_3)
+                out_2 = layers.switch_case(branch_index=index_2,
+                                           branch_fns=[(1, fn_1), (2, fn_2)],
+                                           default=fn_3)
+                out_3 = layers.switch_case(branch_index=index_2,
+                                           branch_fns=[(0, fn_1), (4, fn_2),
+                                                       (7, fn_3)])
 
                 eager_dynamic_res1 = out_1.numpy()
                 eager_dynamic_res2 = out_2.numpy()
@@ -2719,18 +2751,18 @@ def fn_3():
             index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1)
             index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2)
 
-            out_1 = layers.switch_case(
-                branch_index=index_1,
-                branch_fns={1: fn_1,
-                            2: fn_2},
-                default=fn_3)
-            out_2 = layers.switch_case(
-                branch_index=index_2,
-                branch_fns=[(1, fn_1), (2, fn_2)],
-                default=fn_3)
-            out_3 = layers.switch_case(
-                branch_index=index_2,
-                branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
+            out_1 = layers.switch_case(branch_index=index_1,
+                                       branch_fns={
+                                           1: fn_1,
+                                           2: fn_2
+                                       },
+                                       default=fn_3)
+            out_2 = layers.switch_case(branch_index=index_2,
+                                       branch_fns=[(1, fn_1), (2, fn_2)],
+                                       default=fn_3)
+            out_3 = layers.switch_case(branch_index=index_2,
+                                       branch_fns=[(0, fn_1), (4, fn_2),
+                                                   (7, fn_3)])
 
             dynamic_res1 = out_1.numpy()
             dynamic_res2 = out_2.numpy()
@@ -2747,25 +2779,32 @@ def test_crop_tensor(self):
         with self.static_graph():
             x = fluid.layers.data(name="x1", shape=[6, 5, 8])
 
-            dim1 = fluid.layers.data(
-                name="dim1", shape=[1], append_batch_size=False)
-            dim2 = fluid.layers.data(
-                name="dim2", shape=[1], append_batch_size=False)
+            dim1 = fluid.layers.data(name="dim1",
+                                     shape=[1],
+                                     append_batch_size=False)
+            dim2 = fluid.layers.data(name="dim2",
+                                     shape=[1],
+                                     append_batch_size=False)
             crop_shape1 = (1, 2, 4, 4)
-            crop_shape2 = fluid.layers.data(
-                name="crop_shape", shape=[4], append_batch_size=False)
+            crop_shape2 = fluid.layers.data(name="crop_shape",
+                                            shape=[4],
+                                            append_batch_size=False)
             crop_shape3 = [-1, dim1, dim2, 4]
             crop_offsets1 = [0, 0, 1, 0]
-            crop_offsets2 = fluid.layers.data(
-                name="crop_offset", shape=[4], append_batch_size=False)
+            crop_offsets2 = fluid.layers.data(name="crop_offset",
+                                              shape=[4],
+                                              append_batch_size=False)
             crop_offsets3 = [0, dim1, dim2, 0]
 
-            out1 = fluid.layers.crop_tensor(
-                x, shape=crop_shape1, offsets=crop_offsets1)
-            out2 = fluid.layers.crop_tensor(
-                x, shape=crop_shape2, offsets=crop_offsets2)
-            out3 = fluid.layers.crop_tensor(
-                x, shape=crop_shape3, offsets=crop_offsets3)
+            out1 = fluid.layers.crop_tensor(x,
+                                            shape=crop_shape1,
+                                            offsets=crop_offsets1)
+            out2 = fluid.layers.crop_tensor(x,
+                                            shape=crop_shape2,
+                                            offsets=crop_offsets2)
+            out3 = fluid.layers.crop_tensor(x,
+                                            shape=crop_shape3,
+                                            offsets=crop_offsets3)
 
             self.assertIsNotNone(out1)
             self.assertIsNotNone(out2)
@@ -2774,8 +2813,10 @@ def test_crop_tensor(self):
     def test_shard_index(self):
         with self.static_graph():
             x = fluid.layers.data(name="label", shape=[4, 1], dtype='int64')
-            shard_label = fluid.layers.shard_index(
-                input=x, index_num=20, nshards=2, shard_id=0)
+            shard_label = fluid.layers.shard_index(input=x,
+                                                   index_num=20,
+                                                   nshards=2,
+                                                   shard_id=0)
 
         self.assertIsNotNone(shard_label)
 
@@ -2794,8 +2835,10 @@ def test_accuracy(self):
             exe.run(fluid.default_startup_program())
             # x = np.random.rand(3, 32, 32).astype("float32")
             # y = np.array([[1], [0], [1]])
-            static_out = exe.run(feed={"input": x,
-                                       "label": y},
+            static_out = exe.run(feed={
+                "input": x,
+                "label": y
+            },
                                  fetch_list=result[0])
 
         with self.dynamic_graph(force_to_use_cpu=True):
@@ -2809,6 +2852,7 @@ def test_accuracy(self):
 
 
 class TestBook(LayerTest):
+
     def setUp(self):
         self.only_static_set = set({"make_word_embedding"})
         self.not_compare_static_dygraph_set = set({
@@ -2856,8 +2900,10 @@ def func_all_layers(self):
 
             if method.__name__ in self.all_close_compare:
                 self.assertTrue(
-                    np.allclose(
-                        static_result[0], dy_result_value, atol=0, rtol=1e-05),
+                    np.allclose(static_result[0],
+                                dy_result_value,
+                                atol=0,
+                                rtol=1e-05),
                     "Result of function [{}] compare failed".format(
                         method.__name__))
                 continue
@@ -2894,19 +2940,18 @@ def _get_data(self,
                   set_feed_dict=True,
                   append_batch_size=True):
         if base.enabled():
-            return base.to_variable(
-                value=self._get_np_data(shape, dtype, append_batch_size),
-                name=name,
-                zero_copy=False)
+            return base.to_variable(value=self._get_np_data(
+                shape, dtype, append_batch_size),
+                                    name=name,
+                                    zero_copy=False)
         else:
             if set_feed_dict:
-                self._feed_dict[name] = self._get_np_data(shape, dtype,
-                                                          append_batch_size)
-            return layers.data(
-                name=name,
-                shape=shape,
-                dtype=dtype,
-                append_batch_size=append_batch_size)
+                self._feed_dict[name] = self._get_np_data(
+                    shape, dtype, append_batch_size)
+            return layers.data(name=name,
+                               shape=shape,
+                               dtype=dtype,
+                               append_batch_size=append_batch_size)
 
     def make_sampled_softmax_with_cross_entropy(self):
         with program_guard(fluid.default_main_program(),
@@ -2914,14 +2959,13 @@ def make_sampled_softmax_with_cross_entropy(self):
             logits = self._get_data(name='Logits', shape=[256], dtype='float32')
             label = self._get_data(name='Label', shape=[1], dtype='int64')
             num_samples = 25
-            output = layers.sampled_softmax_with_cross_entropy(logits, label,
-                                                               num_samples)
+            output = layers.sampled_softmax_with_cross_entropy(
+                logits, label, num_samples)
             return (output)
 
     def make_fit_a_line(self):
-        with program_guard(
-                fluid.default_main_program(),
-                startup_program=fluid.default_startup_program()):
+        with program_guard(fluid.default_main_program(),
+                           startup_program=fluid.default_startup_program()):
             x = self._get_data(name='x', shape=[13], dtype='float32')
             y_predict = layers.fc(input=x, size=1, act=None)
             y = self._get_data(name='y', shape=[1], dtype='float32')
@@ -2949,29 +2993,29 @@ def make_conv2d_transpose(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
             img = self._get_data(name='pixel', shape=[3, 2, 2], dtype='float32')
-            return layers.conv2d_transpose(
-                input=img, num_filters=10, output_size=28)
+            return layers.conv2d_transpose(input=img,
+                                           num_filters=10,
+                                           output_size=28)
 
     def make_recognize_digits_conv(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            images = self._get_data(
-                name='pixel', shape=[1, 28, 28], dtype='float32')
+            images = self._get_data(name='pixel',
+                                    shape=[1, 28, 28],
+                                    dtype='float32')
             label = self._get_data(name='label', shape=[1], dtype='int64')
-            conv_pool_1 = nets.simple_img_conv_pool(
-                input=images,
-                filter_size=5,
-                num_filters=2,
-                pool_size=2,
-                pool_stride=2,
-                act="relu")
-            conv_pool_2 = nets.simple_img_conv_pool(
-                input=conv_pool_1,
-                filter_size=5,
-                num_filters=4,
-                pool_size=2,
-                pool_stride=2,
-                act="relu")
+            conv_pool_1 = nets.simple_img_conv_pool(input=images,
+                                                    filter_size=5,
+                                                    num_filters=2,
+                                                    pool_size=2,
+                                                    pool_stride=2,
+                                                    act="relu")
+            conv_pool_2 = nets.simple_img_conv_pool(input=conv_pool_1,
+                                                    filter_size=5,
+                                                    num_filters=4,
+                                                    pool_size=2,
+                                                    pool_stride=2,
+                                                    act="relu")
 
             predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
             cost = layers.cross_entropy(input=predict, label=label)
@@ -2984,33 +3028,30 @@ def make_word_embedding(self):
             dict_size = 10000
             embed_size = 32
             first_word = self._get_data(name='firstw', shape=[1], dtype='int64')
-            second_word = self._get_data(
-                name='secondw', shape=[1], dtype='int64')
+            second_word = self._get_data(name='secondw',
+                                         shape=[1],
+                                         dtype='int64')
             third_word = self._get_data(name='thirdw', shape=[1], dtype='int64')
             forth_word = self._get_data(name='forthw', shape=[1], dtype='int64')
             next_word = self._get_data(name='nextw', shape=[1], dtype='int64')
 
-            embed_first = layers.embedding(
-                input=first_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
-            embed_second = layers.embedding(
-                input=second_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
-
-            embed_third = layers.embedding(
-                input=third_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
-            embed_forth = layers.embedding(
-                input=forth_word,
-                size=[dict_size, embed_size],
-                dtype='float32',
-                param_attr='shared_w')
+            embed_first = layers.embedding(input=first_word,
+                                           size=[dict_size, embed_size],
+                                           dtype='float32',
+                                           param_attr='shared_w')
+            embed_second = layers.embedding(input=second_word,
+                                            size=[dict_size, embed_size],
+                                            dtype='float32',
+                                            param_attr='shared_w')
+
+            embed_third = layers.embedding(input=third_word,
+                                           size=[dict_size, embed_size],
+                                           dtype='float32',
+                                           param_attr='shared_w')
+            embed_forth = layers.embedding(input=forth_word,
+                                           size=[dict_size, embed_size],
+                                           dtype='float32',
+                                           param_attr='shared_w')
 
             concat_embed = layers.concat(
                 input=[embed_first, embed_second, embed_third, embed_forth],
@@ -3045,43 +3086,48 @@ def make_hsigmoid(self):
         with program_guard(program2):
             x2 = self._get_data(name='x2', shape=[4, 8], dtype='float32')
             y2 = self._get_data(name='y2', shape=[4], dtype='int64')
-            path_table = self._get_data(
-                name='path_table', shape=[4, 6], dtype='int64')
-            path_code = self._get_data(
-                name='path_code', shape=[4, 6], dtype='int64')
-            return (layers.hsigmoid(
-                input=x2,
-                label=y2,
-                num_classes=6,
-                path_table=path_table,
-                path_code=path_code,
-                is_custom=True))
+            path_table = self._get_data(name='path_table',
+                                        shape=[4, 6],
+                                        dtype='int64')
+            path_code = self._get_data(name='path_code',
+                                       shape=[4, 6],
+                                       dtype='int64')
+            return (layers.hsigmoid(input=x2,
+                                    label=y2,
+                                    num_classes=6,
+                                    path_table=path_table,
+                                    path_code=path_code,
+                                    is_custom=True))
 
     def make_pool2d(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
             x = self._get_data(name='x', shape=[3, 224, 224], dtype='float32')
-            return (layers.pool2d(
-                x, pool_size=[5, 3], pool_stride=[1, 2], pool_padding=(2, 1)))
+            return (layers.pool2d(x,
+                                  pool_size=[5, 3],
+                                  pool_stride=[1, 2],
+                                  pool_padding=(2, 1)))
 
     def make_pool2d_infershape(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
             theta = self._get_data("theta", shape=[2, 3], dtype='float32')
             x = fluid.layers.affine_grid(theta, out_shape=[2, 3, 244, 244])
-            return (layers.pool2d(
-                x, pool_size=[5, 3], pool_stride=[1, 2], pool_padding=(2, 1)))
+            return (layers.pool2d(x,
+                                  pool_size=[5, 3],
+                                  pool_stride=[1, 2],
+                                  pool_padding=(2, 1)))
 
     def make_pool3d(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            x = self._get_data(
-                name='x', shape=[3, 244, 244, 244], dtype='float32')
-            return (layers.pool3d(
-                x,
-                pool_size=[5, 3, 2],
-                pool_stride=[1, 2, 3],
-                pool_padding=(2, 1, 1)))
+            x = self._get_data(name='x',
+                               shape=[3, 244, 244, 244],
+                               dtype='float32')
+            return (layers.pool3d(x,
+                                  pool_size=[5, 3, 2],
+                                  pool_stride=[1, 2, 3],
+                                  pool_padding=(2, 1, 1)))
 
     def make_adaptive_pool2d(self):
         with program_guard(fluid.default_main_program(),
@@ -3099,11 +3145,12 @@ def make_adaptive_pool2d(self):
     def make_adaptive_pool3d(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            x = self._get_data(
-                name='x', shape=[3, 244, 224, 224], dtype='float32')
+            x = self._get_data(name='x',
+                               shape=[3, 244, 224, 224],
+                               dtype='float32')
             return (layers.adaptive_pool3d(x, [3, 3, 3], pool_type='avg'))
-            pool, mask = layers.adaptive_pool3d(
-                x, [3, 3, 3], require_index=True)
+            pool, mask = layers.adaptive_pool3d(x, [3, 3, 3],
+                                                require_index=True)
             return (pool)
             return (mask)
             return (layers.adaptive_pool3d(x, 3, pool_type='avg'))
@@ -3114,17 +3161,21 @@ def make_adaptive_pool3d(self):
     def make_lstm_unit(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            x_t_data = self._get_data(
-                name='x_t_data', shape=[10, 10], dtype='float32')
+            x_t_data = self._get_data(name='x_t_data',
+                                      shape=[10, 10],
+                                      dtype='float32')
             x_t = layers.fc(input=x_t_data, size=10)
-            prev_hidden_data = self._get_data(
-                name='prev_hidden_data', shape=[10, 30], dtype='float32')
+            prev_hidden_data = self._get_data(name='prev_hidden_data',
+                                              shape=[10, 30],
+                                              dtype='float32')
             prev_hidden = layers.fc(input=prev_hidden_data, size=30)
-            prev_cell_data = self._get_data(
-                name='prev_cell', shape=[10, 30], dtype='float32')
+            prev_cell_data = self._get_data(name='prev_cell',
+                                            shape=[10, 30],
+                                            dtype='float32')
             prev_cell = layers.fc(input=prev_cell_data, size=30)
-            return (layers.lstm_unit(
-                x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
+            return (layers.lstm_unit(x_t=x_t,
+                                     hidden_t_prev=prev_hidden,
+                                     cell_t_prev=prev_cell))
 
     def make_softmax(self):
         with program_guard(fluid.default_main_program(),
@@ -3136,11 +3187,10 @@ def make_softmax(self):
     def make_space_to_depth(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            data = self._get_data(
-                name='data',
-                shape=[32, 9, 6, 6],
-                append_batch_size=False,
-                dtype='float32')
+            data = self._get_data(name='data',
+                                  shape=[32, 9, 6, 6],
+                                  append_batch_size=False,
+                                  dtype='float32')
             return (layers.space_to_depth(data, 3))
 
     def make_lrn(self):
@@ -3160,8 +3210,9 @@ def make_nce(self):
         words = []
         for i in range(window_size):
             words.append(
-                self._get_data(
-                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+                self._get_data(name='word_{0}'.format(i),
+                               shape=[1],
+                               dtype='int64'))
 
         dict_size = 10000
         label_word = int(window_size // 2) + 1
@@ -3171,11 +3222,10 @@ def make_nce(self):
             if i == label_word:
                 continue
 
-            emb = layers.embedding(
-                input=words[i],
-                size=[dict_size, 32],
-                param_attr='emb.w',
-                is_sparse=True)
+            emb = layers.embedding(input=words[i],
+                                   size=[dict_size, 32],
+                                   param_attr='emb.w',
+                                   is_sparse=True)
 
             embs.append(emb)
 
@@ -3235,18 +3285,18 @@ def make_smooth_l1(self):
     def make_scatter(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            x = self._get_data(
-                name='x',
-                shape=[3, 3],
-                append_batch_size=False,
-                dtype='float32')
-            idx = self._get_data(
-                name='idx', shape=[2], append_batch_size=False, dtype='int32')
-            updates = self._get_data(
-                name='updates',
-                shape=[2, 3],
-                append_batch_size=False,
-                dtype='float32')
+            x = self._get_data(name='x',
+                               shape=[3, 3],
+                               append_batch_size=False,
+                               dtype='float32')
+            idx = self._get_data(name='idx',
+                                 shape=[2],
+                                 append_batch_size=False,
+                                 dtype='int32')
+            updates = self._get_data(name='updates',
+                                     shape=[2, 3],
+                                     append_batch_size=False,
+                                     dtype='float32')
             out = layers.scatter(input=x, index=idx, updates=updates)
             return (out)
 
@@ -3262,8 +3312,9 @@ def make_label_smooth(self):
         with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()):
             label = self._get_data(name="label", shape=[1], dtype="int32")
             one_hot_label = layers.one_hot(input=label, depth=10)
-            smooth_label = layers.label_smooth(
-                label=one_hot_label, epsilon=0.1, dtype="int32")
+            smooth_label = layers.label_smooth(label=one_hot_label,
+                                               epsilon=0.1,
+                                               dtype="int32")
             return (smooth_label)
 
     def make_topk(self):
@@ -3300,8 +3351,9 @@ def make_resize_nearest(self):
         try:
             with program_guard(fluid.default_main_program(),
                                fluid.default_startup_program()):
-                x = self._get_data(
-                    name='x2', shape=[3, 9, 6, 7], dtype="float32")
+                x = self._get_data(name='x2',
+                                   shape=[3, 9, 6, 7],
+                                   dtype="float32")
                 output = layers.resize_nearest(x, out_shape=[12, 12, 12])
         except ValueError:
             pass
@@ -3331,8 +3383,9 @@ def make_resize_trilinear(self):
         try:
             with program_guard(fluid.default_main_program(),
                                fluid.default_startup_program()):
-                x = self._get_data(
-                    name='x', shape=[3, 9, 6, 7], dtype="float32")
+                x = self._get_data(name='x',
+                                   shape=[3, 9, 6, 7],
+                                   dtype="float32")
                 output = layers.resize_trilinear(x, out_shape=[12, 12])
         except ValueError:
             pass
@@ -3390,64 +3443,61 @@ def make_argsort(self):
     def make_rank_loss(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            label = self._get_data(
-                name='label',
-                append_batch_size=False,
-                shape=[16, 1],
-                dtype="float32")
-            left = self._get_data(
-                name='left',
-                append_batch_size=False,
-                shape=[16, 1],
-                dtype="float32")
-            right = self._get_data(
-                name='right',
-                append_batch_size=False,
-                shape=[16, 1],
-                dtype="float32")
+            label = self._get_data(name='label',
+                                   append_batch_size=False,
+                                   shape=[16, 1],
+                                   dtype="float32")
+            left = self._get_data(name='left',
+                                  append_batch_size=False,
+                                  shape=[16, 1],
+                                  dtype="float32")
+            right = self._get_data(name='right',
+                                   append_batch_size=False,
+                                   shape=[16, 1],
+                                   dtype="float32")
             out = layers.rank_loss(label, left, right, name="rank_loss")
             return (out)
 
     def make_shape(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[3, 100, 100], dtype="float32")
+            input = self._get_data(name="input",
+                                   shape=[3, 100, 100],
+                                   dtype="float32")
             out = layers.shape(input)
             return (out)
 
     def make_pad2d(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[3, 100, 100], dtype="float32")
+            input = self._get_data(name="input",
+                                   shape=[3, 100, 100],
+                                   dtype="float32")
             paddings = layers.fill_constant(shape=[4], dtype='int32', value=1)
-            out = layers.pad2d(
-                input,
-                paddings=[1, 2, 3, 4],
-                mode='reflect',
-                data_format='NCHW',
-                name="shape")
-            out_1 = layers.pad2d(
-                input,
-                paddings=paddings,
-                mode='reflect',
-                data_format='NCHW',
-                name="shape")
+            out = layers.pad2d(input,
+                               paddings=[1, 2, 3, 4],
+                               mode='reflect',
+                               data_format='NCHW',
+                               name="shape")
+            out_1 = layers.pad2d(input,
+                                 paddings=paddings,
+                                 mode='reflect',
+                                 data_format='NCHW',
+                                 name="shape")
             return (out)
             return (out_1)
 
     def make_prelu(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[5, 200, 100, 100], dtype="float32")
+            input = self._get_data(name="input",
+                                   shape=[5, 200, 100, 100],
+                                   dtype="float32")
             mode = 'channel'
-            out = layers.prelu(
-                input,
-                mode,
-                param_attr=ParamAttr(initializer=Constant(1.0)),
-                name='prelu')
+            out = layers.prelu(input,
+                               mode,
+                               param_attr=ParamAttr(initializer=Constant(1.0)),
+                               name='prelu')
             return (out)
 
     def make_soft_relu(self):
@@ -3596,8 +3646,9 @@ def make_expand(self):
     def make_uniform_random_batch_size_like(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[13, 11], dtype='float32')
+            input = self._get_data(name="input",
+                                   shape=[13, 11],
+                                   dtype='float32')
             out = layers.uniform_random_batch_size_like(input, [-1, 11])
             return (out)
 
@@ -3610,11 +3661,10 @@ def make_gaussian_random(self):
     def make_sampling_id(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            x = self._get_data(
-                name="X",
-                shape=[13, 11],
-                dtype='float32',
-                append_batch_size=False)
+            x = self._get_data(name="X",
+                               shape=[13, 11],
+                               dtype='float32',
+                               append_batch_size=False)
 
             out = layers.sampling_id(x)
             return (out)
@@ -3622,18 +3672,22 @@ def make_sampling_id(self):
     def make_gaussian_random_batch_size_like(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[13, 11], dtype='float32')
-
-            out = layers.gaussian_random_batch_size_like(
-                input, shape=[-1, 11], mean=1.0, std=2.0)
+            input = self._get_data(name="input",
+                                   shape=[13, 11],
+                                   dtype='float32')
+
+            out = layers.gaussian_random_batch_size_like(input,
+                                                         shape=[-1, 11],
+                                                         mean=1.0,
+                                                         std=2.0)
             return (out)
 
     def make_sum(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[13, 11], dtype='float32')
+            input = self._get_data(name="input",
+                                   shape=[13, 11],
+                                   dtype='float32')
 
             out = layers.sum(input)
             return (out)
@@ -3645,8 +3699,9 @@ def make_slice(self):
 
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[3, 4, 5, 6], dtype='float32')
+            input = self._get_data(name="input",
+                                   shape=[3, 4, 5, 6],
+                                   dtype='float32')
 
             out = layers.slice(input, axes=axes, starts=starts, ends=ends)
             return out
@@ -3654,13 +3709,13 @@ def make_slice(self):
     def make_scale_variable(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = self._get_data(
-                name="input", shape=[3, 4, 5, 6], dtype='float32')
-            scale_var = self._get_data(
-                name="scale",
-                shape=[1],
-                dtype='float32',
-                append_batch_size=False)
+            input = self._get_data(name="input",
+                                   shape=[3, 4, 5, 6],
+                                   dtype='float32')
+            scale_var = self._get_data(name="scale",
+                                       shape=[1],
+                                       dtype='float32',
+                                       append_batch_size=False)
             out = layers.scale(input, scale=scale_var)
             return out
 
@@ -3699,44 +3754,48 @@ def make_bilinear_tensor_product_layer(self):
     def make_batch_norm(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            data = self._get_data(
-                name='data', shape=[32, 128, 128], dtype="float32")
+            data = self._get_data(name='data',
+                                  shape=[32, 128, 128],
+                                  dtype="float32")
             out = layers.batch_norm(data)
             return (out)
 
     def make_batch_norm_momentum_variable(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            data = self._get_data(
-                name='data', shape=[32, 128, 128], dtype="float32")
-            momentum = self._get_data(
-                name='momentum',
-                shape=[1],
-                dtype='float32',
-                append_batch_size=False)
+            data = self._get_data(name='data',
+                                  shape=[32, 128, 128],
+                                  dtype="float32")
+            momentum = self._get_data(name='momentum',
+                                      shape=[1],
+                                      dtype='float32',
+                                      append_batch_size=False)
             out = layers.batch_norm(data, momentum=momentum)
             return (out)
 
     def make_inplace_abn(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            data = self._get_data(
-                name='data', shape=[32, 128, 128], dtype="float32")
+            data = self._get_data(name='data',
+                                  shape=[32, 128, 128],
+                                  dtype="float32")
             out = layers.inplace_abn(data, act='leaky_relu', act_alpha=0.2)
             return (out)
 
     def make_inplace_abn_momentum_variable(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            data = self._get_data(
-                name='data', shape=[32, 128, 128], dtype="float32")
-            momentum = self._get_data(
-                name='momentum',
-                shape=[1],
-                dtype='float32',
-                append_batch_size=False)
-            out = layers.inplace_abn(
-                data, momentum=momentum, act='elu', act_alpha=2.0)
+            data = self._get_data(name='data',
+                                  shape=[32, 128, 128],
+                                  dtype="float32")
+            momentum = self._get_data(name='momentum',
+                                      shape=[1],
+                                      dtype='float32',
+                                      append_batch_size=False)
+            out = layers.inplace_abn(data,
+                                     momentum=momentum,
+                                     act='elu',
+                                     act_alpha=2.0)
             return (out)
 
     def make_range(self):
@@ -3754,27 +3813,24 @@ def make_range(self):
     def make_spectral_norm(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            weight = self._get_data(
-                name='weight',
-                shape=[2, 3, 32, 32],
-                dtype="float32",
-                append_batch_size=False)
+            weight = self._get_data(name='weight',
+                                    shape=[2, 3, 32, 32],
+                                    dtype="float32",
+                                    append_batch_size=False)
             out = layers.spectral_norm(weight, dim=1, power_iters=1)
             return (out)
 
     def make_kldiv_loss(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            x = self._get_data(
-                name='x',
-                shape=[32, 128, 128],
-                dtype="float32",
-                append_batch_size=False)
-            target = self._get_data(
-                name='target',
-                shape=[32, 128, 128],
-                dtype="float32",
-                append_batch_size=False)
+            x = self._get_data(name='x',
+                               shape=[32, 128, 128],
+                               dtype="float32",
+                               append_batch_size=False)
+            target = self._get_data(name='target',
+                                    shape=[32, 128, 128],
+                                    dtype="float32",
+                                    append_batch_size=False)
             loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean')
             return (loss)
 
@@ -3827,12 +3883,15 @@ def test_dynamic_lstmp(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
             hidden_dim, proj_dim = 16, 8
-            seq_data = layers.data(
-                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            seq_data = layers.data(name='seq_data',
+                                   shape=[10, 10],
+                                   dtype='float32',
+                                   lod_level=1)
             fc_out = layers.fc(input=seq_data, size=4 * hidden_dim)
             self.assertIsNotNone(
-                layers.dynamic_lstmp(
-                    input=fc_out, size=4 * hidden_dim, proj_size=proj_dim))
+                layers.dynamic_lstmp(input=fc_out,
+                                     size=4 * hidden_dim,
+                                     proj_size=proj_dim))
 
     def test_linear_chain_crf(self):
         with self.static_graph():
@@ -3840,55 +3899,52 @@ def test_linear_chain_crf(self):
             feature = layers.data(name='feature', shape=[784], dtype='float32')
             label = layers.data(name='label', shape=[1], dtype='int64')
             emission = layers.fc(input=feature, size=10)
-            crf = layers.linear_chain_crf(
-                input=emission, label=label, param_attr=ParamAttr(name="crfw"))
-            crf_decode = layers.crf_decoding(
-                input=emission, param_attr=ParamAttr(name="crfw"))
+            crf = layers.linear_chain_crf(input=emission,
+                                          label=label,
+                                          param_attr=ParamAttr(name="crfw"))
+            crf_decode = layers.crf_decoding(input=emission,
+                                             param_attr=ParamAttr(name="crfw"))
             self.assertFalse(crf is None)
             self.assertFalse(crf_decode is None)
-            return layers.chunk_eval(
-                input=crf_decode,
-                label=label,
-                chunk_scheme="IOB",
-                num_chunk_types=(label_dict_len - 1) // 2)
+            return layers.chunk_eval(input=crf_decode,
+                                     label=label,
+                                     chunk_scheme="IOB",
+                                     num_chunk_types=(label_dict_len - 1) // 2)
 
     def test_linear_chain_crf_padding(self):
         with self.static_graph():
             label_dict_len, max_len = 10, 20
-            feature = layers.data(
-                name='feature', shape=[max_len, 784], dtype='float32')
+            feature = layers.data(name='feature',
+                                  shape=[max_len, 784],
+                                  dtype='float32')
             label = layers.data(name='label', shape=[max_len], dtype='int64')
             length = layers.data(name='length', shape=[1], dtype='int64')
             emission = layers.fc(input=feature, size=10, num_flatten_dims=2)
-            crf = layers.linear_chain_crf(
-                input=emission,
-                label=label,
-                length=length,
-                param_attr=ParamAttr(name="crfw"))
-            crf_decode = layers.crf_decoding(
-                input=emission,
-                length=length,
-                param_attr=ParamAttr(name="crfw"))
+            crf = layers.linear_chain_crf(input=emission,
+                                          label=label,
+                                          length=length,
+                                          param_attr=ParamAttr(name="crfw"))
+            crf_decode = layers.crf_decoding(input=emission,
+                                             length=length,
+                                             param_attr=ParamAttr(name="crfw"))
             self.assertFalse(crf is None)
             self.assertFalse(crf_decode is None)
-            return layers.chunk_eval(
-                input=crf_decode,
-                label=label,
-                seq_length=length,
-                chunk_scheme="IOB",
-                num_chunk_types=(label_dict_len - 1) // 2)
+            return layers.chunk_eval(input=crf_decode,
+                                     label=label,
+                                     seq_length=length,
+                                     chunk_scheme="IOB",
+                                     num_chunk_types=(label_dict_len - 1) // 2)
 
     def test_im2sequence(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
             x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
             y = layers.data(name='y', shape=[], dtype='float32')
-            output = layers.im2sequence(
-                input=x,
-                input_image_size=y,
-                stride=[1, 1],
-                filter_size=[2, 2],
-                out_stride=[1, 1])
+            output = layers.im2sequence(input=x,
+                                        input_image_size=y,
+                                        stride=[1, 1],
+                                        filter_size=[2, 2],
+                                        out_stride=[1, 1])
             return (output)
 
     def test_lod_reset(self):
@@ -3896,8 +3952,10 @@ def test_lod_reset(self):
         with self.static_graph():
             # case 1
             x = layers.data(name='x', shape=[10], dtype='float32')
-            y = layers.data(
-                name='y', shape=[10, 20], dtype='float32', lod_level=2)
+            y = layers.data(name='y',
+                            shape=[10, 20],
+                            dtype='float32',
+                            lod_level=2)
             z = layers.lod_reset(x=x, y=y)
             self.assertTrue(z.lod_level == 2)
             # case 2
@@ -3929,24 +3987,32 @@ def test_stridedslice(self):
         strides = [1, 1, 1]
         with self.static_graph():
             x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
-            out = layers.strided_slice(
-                x, axes=axes, starts=starts, ends=ends, strides=strides)
+            out = layers.strided_slice(x,
+                                       axes=axes,
+                                       starts=starts,
+                                       ends=ends,
+                                       strides=strides)
             return out
 
     def test_fill_constant_batch_size_like(self):
         with self.static_graph():
-            like = fluid.layers.fill_constant(
-                shape=[1, 200], value=10, dtype='int64')
-            out = layers.fill_constant_batch_size_like(
-                input=like, shape=[2, 3300], value=1315454564656, dtype='int64')
+            like = fluid.layers.fill_constant(shape=[1, 200],
+                                              value=10,
+                                              dtype='int64')
+            out = layers.fill_constant_batch_size_like(input=like,
+                                                       shape=[2, 3300],
+                                                       value=1315454564656,
+                                                       dtype='int64')
             return out
 
     def test_psroi_pool(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
             x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
+            rois = layers.data(name="rois",
+                               shape=[4],
+                               dtype="float32",
+                               lod_level=1)
             output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7)
             return (output)
 
@@ -3954,8 +4020,10 @@ def test_sequence_expand(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
             x = layers.data(name='x', shape=[10], dtype='float32')
-            y = layers.data(
-                name='y', shape=[10, 20], dtype='float32', lod_level=2)
+            y = layers.data(name='y',
+                            shape=[10, 20],
+                            dtype='float32',
+                            lod_level=2)
             return (layers.sequence_expand(x=x, y=y, ref_level=1))
 
     def test_sequence_reshape(self):
@@ -3975,8 +4043,10 @@ def test_sequence_unpad(self):
     def test_sequence_softmax(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
-            seq_data = layers.data(
-                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            seq_data = layers.data(name='seq_data',
+                                   shape=[10, 10],
+                                   dtype='float32',
+                                   lod_level=1)
             seq = layers.fc(input=seq_data, size=20)
             return (layers.sequence_softmax(seq))
 
@@ -3990,23 +4060,20 @@ def test_sequence_unsqueeze(self):
     def test_sequence_scatter(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
-            x = layers.data(
-                name='x',
-                shape=[3, 6],
-                append_batch_size=False,
-                dtype='float32')
-            idx = layers.data(
-                name='idx',
-                shape=[12, 1],
-                append_batch_size=False,
-                dtype='int32',
-                lod_level=1)
-            updates = layers.data(
-                name='updates',
-                shape=[12, 1],
-                append_batch_size=False,
-                dtype='float32',
-                lod_level=1)
+            x = layers.data(name='x',
+                            shape=[3, 6],
+                            append_batch_size=False,
+                            dtype='float32')
+            idx = layers.data(name='idx',
+                              shape=[12, 1],
+                              append_batch_size=False,
+                              dtype='int32',
+                              lod_level=1)
+            updates = layers.data(name='updates',
+                                  shape=[12, 1],
+                                  append_batch_size=False,
+                                  dtype='float32',
+                                  lod_level=1)
             out = layers.sequence_scatter(input=x, index=idx, updates=updates)
             return (out)
 
@@ -4014,39 +4081,44 @@ def test_sequence_slice(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
             import numpy as np
-            seqs = layers.data(
-                name='x', shape=[10, 5], dtype='float32', lod_level=1)
+            seqs = layers.data(name='x',
+                               shape=[10, 5],
+                               dtype='float32',
+                               lod_level=1)
             offset = layers.assign(input=np.array([[0, 1]]).astype('int32'))
             length = layers.assign(input=np.array([[2, 1]]).astype('int32'))
-            out = layers.sequence_slice(
-                input=seqs, offset=offset, length=length)
+            out = layers.sequence_slice(input=seqs,
+                                        offset=offset,
+                                        length=length)
             return (out)
 
     def test_filter_by_instag(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
-            x1 = layers.data(
-                name='Ins', shape=[32, 1], dtype='float32', lod_level=0)
-            x2 = layers.data(
-                name='Ins_tag',
-                shape=[32, 1],
-                dtype='int64',
-                lod_level=0,
-                stop_gradient=True)
-            x3 = layers.create_global_var(
-                shape=[1, 1],
-                value=20,
-                dtype='int64',
-                persistable=True,
-                force_cpu=True,
-                name='Filter_tag')
+            x1 = layers.data(name='Ins',
+                             shape=[32, 1],
+                             dtype='float32',
+                             lod_level=0)
+            x2 = layers.data(name='Ins_tag',
+                             shape=[32, 1],
+                             dtype='int64',
+                             lod_level=0,
+                             stop_gradient=True)
+            x3 = layers.create_global_var(shape=[1, 1],
+                                          value=20,
+                                          dtype='int64',
+                                          persistable=True,
+                                          force_cpu=True,
+                                          name='Filter_tag')
             out1, out2 = layers.filter_by_instag(x1, x2, x3, is_lod=True)
 
     def test_shuffle_batch(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
-            x = layers.data(
-                name='X', shape=[4, 50], dtype='float32', lod_level=0)
+            x = layers.data(name='X',
+                            shape=[4, 50],
+                            dtype='float32',
+                            lod_level=0)
             out1 = fluid.contrib.layers.shuffle_batch(x)
             default_main_program().random_seed = 1000
             out2 = fluid.contrib.layers.shuffle_batch(x)
@@ -4058,8 +4130,9 @@ def test_partial_sum(self):
         with self.static_graph():
             x = fluid.data(name="x", shape=[None, 3], dtype="float32")
             y = fluid.data(name="y", shape=[None, 3], dtype="float32")
-            sum = fluid.contrib.layers.partial_sum(
-                [x, y], start_index=0, length=2)
+            sum = fluid.contrib.layers.partial_sum([x, y],
+                                                   start_index=0,
+                                                   length=2)
             return (sum)
 
     def test_batch_fc(self):
@@ -4083,8 +4156,9 @@ def test_batch_fc(self):
     def test_rank_attention(self):
         with self.static_graph():
             input = fluid.data(name="input", shape=[None, 2], dtype="float32")
-            rank_offset = fluid.data(
-                name="rank_offset", shape=[None, 7], dtype="int32")
+            rank_offset = fluid.data(name="rank_offset",
+                                     shape=[None, 7],
+                                     dtype="int32")
             out = fluid.contrib.layers.rank_attention(
                 input=input,
                 rank_offset=rank_offset,
@@ -4106,26 +4180,35 @@ def test_roi_pool(self):
             rois = layers.data(name="rois", shape=[4], dtype="float32")
             rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32")
             output = layers.roi_pool(x, rois, 4, 4, 0.5, rois_num=rois_num)
-            static_res = self.get_static_graph_result(
-                feed={'x': x_np,
-                      'rois': rois_np,
-                      'rois_num': rois_num_np},
-                fetch_list=[output])[0]
+            static_res = self.get_static_graph_result(feed={
+                'x': x_np,
+                'rois': rois_np,
+                'rois_num': rois_num_np
+            },
+                                                      fetch_list=[output])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
                 x_dy = base.to_variable(x_np)
                 rois_dy = base.to_variable(rois_np)
                 rois_num_dy = base.to_variable(rois_num_np)
-                dy_eager_res = layers.roi_pool(
-                    x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy)
+                dy_eager_res = layers.roi_pool(x_dy,
+                                               rois_dy,
+                                               4,
+                                               4,
+                                               0.5,
+                                               rois_num=rois_num_dy)
                 dy_eager_res_value = dy_eager_res[0].numpy()
 
             x_dy = base.to_variable(x_np)
             rois_dy = base.to_variable(rois_np)
             rois_num_dy = base.to_variable(rois_num_np)
-            dy_res = layers.roi_pool(
-                x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy)
+            dy_res = layers.roi_pool(x_dy,
+                                     rois_dy,
+                                     4,
+                                     4,
+                                     0.5,
+                                     rois_num=rois_num_dy)
             dy_res_value = dy_res[0].numpy()
         self.assertTrue(np.array_equal(static_res, dy_res_value))
         self.assertTrue(np.array_equal(static_res, dy_eager_res_value))
@@ -4146,26 +4229,37 @@ def test_roi_align(self):
             rois = layers.data(name="rois", shape=[4], dtype="float32")
             rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32")
             output = layers.roi_align(x, rois, 4, 4, 0.5, 2, rois_num=rois_num)
-            static_res = self.get_static_graph_result(
-                feed={'x': x_np,
-                      'rois': rois_np,
-                      'rois_num': rois_num_np},
-                fetch_list=[output])[0]
+            static_res = self.get_static_graph_result(feed={
+                'x': x_np,
+                'rois': rois_np,
+                'rois_num': rois_num_np
+            },
+                                                      fetch_list=[output])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
                 x_dy = base.to_variable(x_np)
                 rois_dy = base.to_variable(rois_np)
                 rois_num_dy = base.to_variable(rois_num_np)
-                dy_eager_res = layers.roi_align(
-                    x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy)
+                dy_eager_res = layers.roi_align(x_dy,
+                                                rois_dy,
+                                                4,
+                                                4,
+                                                0.5,
+                                                2,
+                                                rois_num=rois_num_dy)
                 dy_eager_res_value = dy_eager_res.numpy()
 
             x_dy = base.to_variable(x_np)
             rois_dy = base.to_variable(rois_np)
             rois_num_dy = base.to_variable(rois_num_np)
-            dy_res = layers.roi_align(
-                x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy)
+            dy_res = layers.roi_align(x_dy,
+                                      rois_dy,
+                                      4,
+                                      4,
+                                      0.5,
+                                      2,
+                                      rois_num=rois_num_dy)
             dy_res_value = dy_res.numpy()
         self.assertTrue(np.array_equal(static_res, dy_eager_res_value))
         self.assertTrue(np.array_equal(static_res, dy_res_value))
@@ -4177,15 +4271,18 @@ def test_dice_loss(self):
         label_np = np.random.randint(0, num_classes, [2, 3, 1], dtype=np.int64)
 
         with self.static_graph():
-            input_ = layers.data(
-                name="input", shape=[None, 3, num_classes], dtype="float32")
-            label_ = layers.data(
-                name="label", shape=[None, 3, 1], dtype="int64")
+            input_ = layers.data(name="input",
+                                 shape=[None, 3, num_classes],
+                                 dtype="float32")
+            label_ = layers.data(name="label",
+                                 shape=[None, 3, 1],
+                                 dtype="int64")
             output = layers.dice_loss(input_, label_, eps)
-            static_res = self.get_static_graph_result(
-                feed={'input': input_np,
-                      'label': label_np},
-                fetch_list=[output])[0]
+            static_res = self.get_static_graph_result(feed={
+                'input': input_np,
+                'label': label_np
+            },
+                                                      fetch_list=[output])[0]
 
         with self.dynamic_graph():
             with _test_eager_guard():
@@ -4205,8 +4302,10 @@ def test_roi_perspective_transform(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
             x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[8], dtype="float32", lod_level=1)
+            rois = layers.data(name="rois",
+                               shape=[8],
+                               dtype="float32",
+                               lod_level=1)
             output = layers.roi_perspective_transform(x, rois, 7, 7, 0.6)
             return (output)
 
@@ -4220,10 +4319,12 @@ def test_row_conv(self):
     def test_simple_conv2d(self):
         # TODO(minqiyang): dygraph do not support layers with param now
         with self.static_graph():
-            images = layers.data(
-                name='pixel', shape=[3, 48, 48], dtype='float32')
-            return layers.conv2d(
-                input=images, num_filters=3, filter_size=[4, 4])
+            images = layers.data(name='pixel',
+                                 shape=[3, 48, 48],
+                                 dtype='float32')
+            return layers.conv2d(input=images,
+                                 num_filters=3,
+                                 filter_size=[4, 4])
 
     def test_squeeze(self):
         # TODO(minqiyang): dygraph do not support layers with param now
@@ -4235,11 +4336,10 @@ def test_squeeze(self):
     def test_flatten(self):
         # TODO(minqiyang): dygraph do not support op without kernel now
         with self.static_graph():
-            x = layers.data(
-                name='x',
-                append_batch_size=False,
-                shape=[4, 4, 3],
-                dtype="float32")
+            x = layers.data(name='x',
+                            append_batch_size=False,
+                            shape=[4, 4, 3],
+                            dtype="float32")
             out = layers.flatten(x, axis=1, name="flatten")
             return (out)
 
@@ -4252,45 +4352,43 @@ def test_linspace(self):
 
     def test_deformable_conv(self):
         with self.static_graph():
-            input = layers.data(
-                name='input',
-                append_batch_size=False,
-                shape=[2, 3, 32, 32],
-                dtype="float32")
-            offset = layers.data(
-                name='offset',
-                append_batch_size=False,
-                shape=[2, 18, 32, 32],
-                dtype="float32")
-            mask = layers.data(
-                name='mask',
-                append_batch_size=False,
-                shape=[2, 9, 32, 32],
-                dtype="float32")
-            out = layers.deformable_conv(
-                input=input,
-                offset=offset,
-                mask=mask,
-                num_filters=2,
-                filter_size=3,
-                padding=1)
+            input = layers.data(name='input',
+                                append_batch_size=False,
+                                shape=[2, 3, 32, 32],
+                                dtype="float32")
+            offset = layers.data(name='offset',
+                                 append_batch_size=False,
+                                 shape=[2, 18, 32, 32],
+                                 dtype="float32")
+            mask = layers.data(name='mask',
+                               append_batch_size=False,
+                               shape=[2, 9, 32, 32],
+                               dtype="float32")
+            out = layers.deformable_conv(input=input,
+                                         offset=offset,
+                                         mask=mask,
+                                         num_filters=2,
+                                         filter_size=3,
+                                         padding=1)
             return (out)
 
     def test_deformable_conv2(self):
         with self.static_graph():
-            input = fluid.data(
-                name='input', shape=[None, 3, None, None], dtype="float32")
-            offset = fluid.data(
-                name='offset', shape=[None, 18, None, None], dtype="float32")
-            mask = fluid.data(
-                name='mask', shape=[None, 9, None, None], dtype="float32")
-            out = layers.deformable_conv(
-                input=input,
-                offset=offset,
-                mask=mask,
-                num_filters=2,
-                filter_size=3,
-                padding=1)
+            input = fluid.data(name='input',
+                               shape=[None, 3, None, None],
+                               dtype="float32")
+            offset = fluid.data(name='offset',
+                                shape=[None, 18, None, None],
+                                dtype="float32")
+            mask = fluid.data(name='mask',
+                              shape=[None, 9, None, None],
+                              dtype="float32")
+            out = layers.deformable_conv(input=input,
+                                         offset=offset,
+                                         mask=mask,
+                                         num_filters=2,
+                                         filter_size=3,
+                                         padding=1)
             return (out)
 
     def test_unfold(self):
@@ -4303,151 +4401,139 @@ def test_partial_concat(self):
         with self.static_graph():
             x = fluid.data(name="x", shape=[None, 3], dtype="float32")
             y = fluid.data(name="y", shape=[None, 3], dtype="float32")
-            concat1 = fluid.contrib.layers.partial_concat(
-                [x, y], start_index=0, length=2)
-            concat2 = fluid.contrib.layers.partial_concat(
-                x, start_index=0, length=-1)
+            concat1 = fluid.contrib.layers.partial_concat([x, y],
+                                                          start_index=0,
+                                                          length=2)
+            concat2 = fluid.contrib.layers.partial_concat(x,
+                                                          start_index=0,
+                                                          length=-1)
             return concat1, concat2
 
     def test_deform_roi_pooling(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = layers.data(
-                name='input',
-                shape=[2, 3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            rois = layers.data(
-                name="rois", shape=[4], dtype='float32', lod_level=1)
-            trans = layers.data(
-                name="trans",
-                shape=[2, 3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            out = layers.deformable_roi_pooling(
-                input=input,
-                rois=rois,
-                trans=trans,
-                no_trans=False,
-                spatial_scale=1.0,
-                group_size=(1, 1),
-                pooled_height=8,
-                pooled_width=8,
-                part_size=(8, 8),
-                sample_per_part=4,
-                trans_std=0.1)
+            input = layers.data(name='input',
+                                shape=[2, 3, 32, 32],
+                                dtype='float32',
+                                append_batch_size=False)
+            rois = layers.data(name="rois",
+                               shape=[4],
+                               dtype='float32',
+                               lod_level=1)
+            trans = layers.data(name="trans",
+                                shape=[2, 3, 32, 32],
+                                dtype='float32',
+                                append_batch_size=False)
+            out = layers.deformable_roi_pooling(input=input,
+                                                rois=rois,
+                                                trans=trans,
+                                                no_trans=False,
+                                                spatial_scale=1.0,
+                                                group_size=(1, 1),
+                                                pooled_height=8,
+                                                pooled_width=8,
+                                                part_size=(8, 8),
+                                                sample_per_part=4,
+                                                trans_std=0.1)
         return (out)
 
     def test_deformable_conv_v1(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = layers.data(
-                name='input',
-                append_batch_size=False,
-                shape=[2, 3, 32, 32],
-                dtype="float32")
-            offset = layers.data(
-                name='offset',
-                append_batch_size=False,
-                shape=[2, 18, 32, 32],
-                dtype="float32")
-            out = layers.deformable_conv(
-                input=input,
-                offset=offset,
-                mask=None,
-                num_filters=2,
-                filter_size=3,
-                padding=1,
-                modulated=False)
+            input = layers.data(name='input',
+                                append_batch_size=False,
+                                shape=[2, 3, 32, 32],
+                                dtype="float32")
+            offset = layers.data(name='offset',
+                                 append_batch_size=False,
+                                 shape=[2, 18, 32, 32],
+                                 dtype="float32")
+            out = layers.deformable_conv(input=input,
+                                         offset=offset,
+                                         mask=None,
+                                         num_filters=2,
+                                         filter_size=3,
+                                         padding=1,
+                                         modulated=False)
             return (out)
 
     def test_retinanet_target_assign(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            bbox_pred = layers.data(
-                name='bbox_pred',
-                shape=[1, 100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            cls_logits = layers.data(
-                name='cls_logits',
-                shape=[1, 100, 10],
-                append_batch_size=False,
-                dtype='float32')
-            anchor_box = layers.data(
-                name='anchor_box',
-                shape=[100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            anchor_var = layers.data(
-                name='anchor_var',
-                shape=[100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            gt_boxes = layers.data(
-                name='gt_boxes',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            gt_labels = layers.data(
-                name='gt_labels',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='int32')
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[1],
-                append_batch_size=False,
-                dtype='int32')
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                append_batch_size=False,
-                dtype='float32')
-            return (layers.retinanet_target_assign(
-                bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes,
-                gt_labels, is_crowd, im_info, 10))
+            bbox_pred = layers.data(name='bbox_pred',
+                                    shape=[1, 100, 4],
+                                    append_batch_size=False,
+                                    dtype='float32')
+            cls_logits = layers.data(name='cls_logits',
+                                     shape=[1, 100, 10],
+                                     append_batch_size=False,
+                                     dtype='float32')
+            anchor_box = layers.data(name='anchor_box',
+                                     shape=[100, 4],
+                                     append_batch_size=False,
+                                     dtype='float32')
+            anchor_var = layers.data(name='anchor_var',
+                                     shape=[100, 4],
+                                     append_batch_size=False,
+                                     dtype='float32')
+            gt_boxes = layers.data(name='gt_boxes',
+                                   shape=[10, 4],
+                                   append_batch_size=False,
+                                   dtype='float32')
+            gt_labels = layers.data(name='gt_labels',
+                                    shape=[10, 1],
+                                    append_batch_size=False,
+                                    dtype='int32')
+            is_crowd = layers.data(name='is_crowd',
+                                   shape=[1],
+                                   append_batch_size=False,
+                                   dtype='int32')
+            im_info = layers.data(name='im_info',
+                                  shape=[1, 3],
+                                  append_batch_size=False,
+                                  dtype='float32')
+            return (layers.retinanet_target_assign(bbox_pred, cls_logits,
+                                                   anchor_box, anchor_var,
+                                                   gt_boxes, gt_labels,
+                                                   is_crowd, im_info, 10))
 
     def test_sigmoid_focal_loss(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = layers.data(
-                name='data',
-                shape=[10, 80],
-                append_batch_size=False,
-                dtype='float32')
-            label = layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='int32')
-            fg_num = layers.data(
-                name='fg_num',
-                shape=[1],
-                append_batch_size=False,
-                dtype='int32')
-            out = fluid.layers.sigmoid_focal_loss(
-                x=input, label=label, fg_num=fg_num, gamma=2., alpha=0.25)
+            input = layers.data(name='data',
+                                shape=[10, 80],
+                                append_batch_size=False,
+                                dtype='float32')
+            label = layers.data(name='label',
+                                shape=[10, 1],
+                                append_batch_size=False,
+                                dtype='int32')
+            fg_num = layers.data(name='fg_num',
+                                 shape=[1],
+                                 append_batch_size=False,
+                                 dtype='int32')
+            out = fluid.layers.sigmoid_focal_loss(x=input,
+                                                  label=label,
+                                                  fg_num=fg_num,
+                                                  gamma=2.,
+                                                  alpha=0.25)
             return (out)
 
     def test_addmm(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            input = layers.data(
-                name='input_data',
-                shape=[3, 3],
-                append_batch_size=False,
-                dtype='float32')
-            x = layers.data(
-                name='x',
-                shape=[3, 2],
-                append_batch_size=False,
-                dtype='float32')
-            y = layers.data(
-                name='y',
-                shape=[2, 3],
-                append_batch_size=False,
-                dtype='float32')
+            input = layers.data(name='input_data',
+                                shape=[3, 3],
+                                append_batch_size=False,
+                                dtype='float32')
+            x = layers.data(name='x',
+                            shape=[3, 2],
+                            append_batch_size=False,
+                            dtype='float32')
+            y = layers.data(name='y',
+                            shape=[2, 3],
+                            append_batch_size=False,
+                            dtype='float32')
 
             out = paddle.addmm(input=input, x=x, y=y)
             return (out)
@@ -4455,26 +4541,22 @@ def test_addmm(self):
     def test_retinanet_detection_output(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
-            bboxes = layers.data(
-                name='bboxes',
-                shape=[1, 21, 4],
-                append_batch_size=False,
-                dtype='float32')
-            scores = layers.data(
-                name='scores',
-                shape=[1, 21, 10],
-                append_batch_size=False,
-                dtype='float32')
-            anchors = layers.data(
-                name='anchors',
-                shape=[21, 4],
-                append_batch_size=False,
-                dtype='float32')
-            im_info = layers.data(
-                name="im_info",
-                shape=[1, 3],
-                append_batch_size=False,
-                dtype='float32')
+            bboxes = layers.data(name='bboxes',
+                                 shape=[1, 21, 4],
+                                 append_batch_size=False,
+                                 dtype='float32')
+            scores = layers.data(name='scores',
+                                 shape=[1, 21, 10],
+                                 append_batch_size=False,
+                                 dtype='float32')
+            anchors = layers.data(name='anchors',
+                                  shape=[21, 4],
+                                  append_batch_size=False,
+                                  dtype='float32')
+            im_info = layers.data(name="im_info",
+                                  shape=[1, 3],
+                                  append_batch_size=False,
+                                  dtype='float32')
             nmsed_outs = layers.retinanet_detection_output(
                 bboxes=[bboxes, bboxes],
                 scores=[scores, scores],
@@ -4490,26 +4572,32 @@ def test_retinanet_detection_output(self):
     def test_warpctc_with_padding(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
-            input_length = layers.data(
-                name='logits_length', shape=[11], dtype='int64')
-            label_length = layers.data(
-                name='labels_length', shape=[12], dtype='int64')
+            input_length = layers.data(name='logits_length',
+                                       shape=[11],
+                                       dtype='int64')
+            label_length = layers.data(name='labels_length',
+                                       shape=[12],
+                                       dtype='int64')
             label = layers.data(name='label', shape=[12, 1], dtype='int32')
-            predict = layers.data(
-                name='predict', shape=[4, 4, 8], dtype='float32')
-            output = layers.warpctc(
-                input=predict,
-                label=label,
-                input_length=input_length,
-                label_length=label_length)
+            predict = layers.data(name='predict',
+                                  shape=[4, 4, 8],
+                                  dtype='float32')
+            output = layers.warpctc(input=predict,
+                                    label=label,
+                                    input_length=input_length,
+                                    label_length=label_length)
             return (output)
 
     def test_edit_distance(self):
         with self.static_graph():
-            predict = layers.data(
-                name='predict', shape=[-1, 1], dtype='int64', lod_level=1)
-            label = layers.data(
-                name='label', shape=[-1, 1], dtype='int64', lod_level=1)
+            predict = layers.data(name='predict',
+                                  shape=[-1, 1],
+                                  dtype='int64',
+                                  lod_level=1)
+            label = layers.data(name='label',
+                                shape=[-1, 1],
+                                dtype='int64',
+                                lod_level=1)
             evaluator = fluid.evaluator.EditDistance(predict, label)
             return evaluator.metrics
 
@@ -4517,12 +4605,15 @@ def test_basic_gru(self):
         input_size = 128
         hidden_size = 256
         with self.static_graph():
-            input = fluid.data(
-                name="input", shape=[None, None, input_size], dtype='float32')
-            pre_hidden = fluid.data(
-                name="pre_hidden", shape=[None, hidden_size], dtype='float32')
-            sequence_length = fluid.data(
-                name="sequence_length", shape=[None], dtype='int32')
+            input = fluid.data(name="input",
+                               shape=[None, None, input_size],
+                               dtype='float32')
+            pre_hidden = fluid.data(name="pre_hidden",
+                                    shape=[None, hidden_size],
+                                    dtype='float32')
+            sequence_length = fluid.data(name="sequence_length",
+                                         shape=[None],
+                                         dtype='int32')
 
             for bidirectional in [True, False]:
                 for batch_first in [True, False]:
@@ -4538,26 +4629,26 @@ def test_basic_gru(self):
 
 
 class TestMetricsDetectionMap(unittest.TestCase):
+
     def test_detection_map(self):
         program = fluid.Program()
         with program_guard(program):
-            detect_res = fluid.layers.data(
-                name='detect_res',
-                shape=[10, 6],
-                append_batch_size=False,
-                dtype='float32')
-            label = fluid.layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='float32')
-            box = fluid.layers.data(
-                name='bbox',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            map_eval = fluid.metrics.DetectionMAP(
-                detect_res, label, box, class_num=21)
+            detect_res = fluid.layers.data(name='detect_res',
+                                           shape=[10, 6],
+                                           append_batch_size=False,
+                                           dtype='float32')
+            label = fluid.layers.data(name='label',
+                                      shape=[10, 1],
+                                      append_batch_size=False,
+                                      dtype='float32')
+            box = fluid.layers.data(name='bbox',
+                                    shape=[10, 4],
+                                    append_batch_size=False,
+                                    dtype='float32')
+            map_eval = fluid.metrics.DetectionMAP(detect_res,
+                                                  label,
+                                                  box,
+                                                  class_num=21)
             cur_map, accm_map = map_eval.get_map_var()
             self.assertIsNotNone(cur_map)
             self.assertIsNotNone(accm_map)
@@ -4565,6 +4656,7 @@ def test_detection_map(self):
 
 
 class ExampleNet(paddle.nn.Layer):
+
     def __init__(self):
         super(ExampleNet, self).__init__()
         self.weight = self.create_parameter(
@@ -4576,6 +4668,7 @@ def forward(self):
 
 
 class TestLayerParameterTrainableSet(unittest.TestCase):
+
     def test_layer_parameter_set(self):
         with fluid.dygraph.guard():
             net = ExampleNet()
@@ -4583,6 +4676,7 @@ def test_layer_parameter_set(self):
 
 
 class TestLayerTrainingAttribute(unittest.TestCase):
+
     def test_set_train_eval_in_dynamic_mode(self):
         with fluid.dygraph.guard():
             net = paddle.nn.Dropout()
@@ -4600,6 +4694,7 @@ def test_set_train_eval_in_static_mode(self):
 
 
 class MyLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(MyLayer, self).__init__()
         self._linear = paddle.nn.Linear(1, 1)
@@ -4612,6 +4707,7 @@ def forward(self, input):
 
 
 class MySuperLayer(paddle.nn.Layer):
+
     def __init__(self):
         super(MySuperLayer, self).__init__()
         self._mylayer = MyLayer()
@@ -4622,6 +4718,7 @@ def forward(self, input):
 
 
 class TestSubLayerCount(unittest.TestCase):
+
     def test_sublayer(self):
         with fluid.dygraph.guard():
             mySuperlayer = MySuperLayer()
diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py
index a1440f8587ab6..bd73d9526c0ef 100644
--- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py
+++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class SimpleNet(paddle.nn.Layer):
+
     def __init__(self, data_format="NCHW", class_num=2):
         super(SimpleNet, self).__init__()
         self.conv = paddle.nn.Conv2D(3, 8, (3, 3))
@@ -43,6 +44,7 @@ def forward(self, image):
 
 
 class LayoutAutoTune(unittest.TestCase):
+
     def use_autoune(self):
         if paddle.is_compiled_with_cuda():
             paddle.incubate.autotune.set_config(
@@ -101,7 +103,7 @@ def test_transpose_op_transposer(self):
         with paddle.amp.auto_cast(level="O2"):
             conv_out = conv(data)
             # conv_out.shape = [1, 14, 12, 8] with NHWC
-            # layout tuner will transpose conv_out to 
+            # layout tuner will transpose conv_out to
             # [1, 8, 14, 12] with NCHW before the following transpose op.
             out = paddle.transpose(conv_out, perm=[0, 3, 1, 2])
             loss = out.mean()
@@ -131,6 +133,7 @@ def test_flatten_op_transposer(self):
 
 
 class TestAutoTuneAPI(unittest.TestCase):
+
     def test_set_config_warnings(self):
         with warnings.catch_warnings(record=True) as w:
             config = {"layout": {"enable": 1}}
diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs.py b/python/paddle/fluid/tests/unittests/test_lbfgs.py
index bb3818747601f..d4875bce50357 100644
--- a/python/paddle/fluid/tests/unittests/test_lbfgs.py
+++ b/python/paddle/fluid/tests/unittests/test_lbfgs.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -44,10 +44,13 @@ def test_static_graph_H0(func, x0, H0, dtype='float32'):
     startup = paddle.static.Program()
     with paddle.static.program_guard(main, startup):
         X = paddle.static.data(name='x', shape=[x0.shape[0]], dtype=dtype)
-        H = paddle.static.data(
-            name='h', shape=[H0.shape[0], H0.shape[1]], dtype=dtype)
-        Y = minimize_lbfgs(
-            func, X, initial_inverse_hessian_estimate=H, dtype=dtype)
+        H = paddle.static.data(name='h',
+                               shape=[H0.shape[0], H0.shape[1]],
+                               dtype=dtype)
+        Y = minimize_lbfgs(func,
+                           X,
+                           initial_inverse_hessian_estimate=H,
+                           dtype=dtype)
 
     exe = paddle.static.Executor()
     exe.run(startup)
@@ -63,15 +66,15 @@ def test_dynamic_graph(func,
     x0 = paddle.to_tensor(x0)
     if H0 is not None:
         H0 = paddle.to_tensor(H0)
-    return minimize_lbfgs(
-        func,
-        x0,
-        initial_inverse_hessian_estimate=H0,
-        line_search_fn=line_search_fn,
-        dtype=dtype)
+    return minimize_lbfgs(func,
+                          x0,
+                          initial_inverse_hessian_estimate=H0,
+                          line_search_fn=line_search_fn,
+                          dtype=dtype)
 
 
 class TestLbfgs(unittest.TestCase):
+
     def test_quadratic_nd(self):
         for dimension in [1, 10]:
             minimum = np.random.random(size=[dimension]).astype('float32')
@@ -105,10 +108,11 @@ def func(x):
         self.assertFalse(results[0][0])
 
     def test_multi_minima(self):
+
         def func(x):
             # df = 12(x + 1.1)(x - 0.2)(x - 0.8)
             # f = 3*x^4+0.4*x^3-5.46*x^2+2.112*x
-            # minimum = -1.1 or 0.8. 
+            # minimum = -1.1 or 0.8.
             # All these minima may be reached from appropriate starting points.
             return 3 * x**4 + 0.4 * x**3 - 5.64 * x**2 + 2.112 * x
 
@@ -137,6 +141,7 @@ def func(position):
         self.assertTrue(np.allclose(minimum, results[2]))
 
     def test_exception(self):
+
         def func(x):
             return paddle.dot(x, x)
 
@@ -145,8 +150,11 @@ def func(x):
 
         # test dtype is not float32 or float64
         x1 = np.random.random(size=[2]).astype('int32')
-        self.assertRaises(
-            ValueError, test_static_graph, func, x1, dtype='int32')
+        self.assertRaises(ValueError,
+                          test_static_graph,
+                          func,
+                          x1,
+                          dtype='int32')
 
         # test initial_inverse_hessian_estimate is good
         results = test_static_graph_H0(func, x0, H0, dtype='float32')
@@ -156,8 +164,12 @@ def func(x):
         # test initial_inverse_hessian_estimate is bad and float64
         x2 = np.random.random(size=[2]).astype('float64')
         H1 = np.array([[1.0, 2.0], [3.0, 1.0]]).astype('float64')
-        self.assertRaises(
-            ValueError, test_static_graph_H0, func, x2, H0=H1, dtype='float64')
+        self.assertRaises(ValueError,
+                          test_static_graph_H0,
+                          func,
+                          x2,
+                          H0=H1,
+                          dtype='float64')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lcm.py b/python/paddle/fluid/tests/unittests/test_lcm.py
index 123c3e3d444e1..ca78e239da4ea 100644
--- a/python/paddle/fluid/tests/unittests/test_lcm.py
+++ b/python/paddle/fluid/tests/unittests/test_lcm.py
@@ -26,6 +26,7 @@
 
 
 class TestLcmAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = 12
         self.y_np = 20
@@ -40,15 +41,17 @@ def test_static_graph(self):
             x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
             out = paddle.lcm(x1, x2)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             res = exe.run(fluid.default_main_program(),
-                          feed={'input1': self.x_np,
-                                'input2': self.y_np},
+                          feed={
+                              'input1': self.x_np,
+                              'input2': self.y_np
+                          },
                           fetch_list=[out])
-            self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np)
-                             ).all())
+            self.assertTrue((np.array(res[0]) == np.lcm(self.x_np,
+                                                        self.y_np)).all())
 
     def test_dygraph(self):
         paddle.disable_static()
@@ -62,6 +65,7 @@ def test_dygraph(self):
 
 
 class TestLcmAPI2(TestLcmAPI):
+
     def setUp(self):
         self.x_np = np.arange(6).astype(np.int32)
         self.y_np = np.array([20]).astype(np.int32)
@@ -70,6 +74,7 @@ def setUp(self):
 
 
 class TestLcmAPI3(TestLcmAPI):
+
     def setUp(self):
         self.x_np = 0
         self.y_np = 20
@@ -78,6 +83,7 @@ def setUp(self):
 
 
 class TestLcmAPI4(TestLcmAPI):
+
     def setUp(self):
         self.x_np = 0
         self.y_np = 0
@@ -86,6 +92,7 @@ def setUp(self):
 
 
 class TestLcmAPI5(TestLcmAPI):
+
     def setUp(self):
         self.x_np = 12
         self.y_np = -20
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 36368a83893c7..b70acce323583 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -86,8 +86,8 @@ def piecewise_decay(global_step, boundaries, values):
 
 def cosine_decay(global_step, learning_rate, step_each_epoch, epochs):
     cur_epoch = math.floor(global_step / step_each_epoch)
-    decayed_lr = learning_rate * 0.5 * (
-        math.cos(cur_epoch * math.pi / epochs) + 1)
+    decayed_lr = learning_rate * 0.5 * (math.cos(cur_epoch * math.pi / epochs) +
+                                        1)
     return decayed_lr
 
 
@@ -122,6 +122,7 @@ def lambda_decay(global_step, learning_rate, lr_lambda):
 
 
 class TestLearningRateDecayDygraph(unittest.TestCase):
+
     def test_LR_state_dict(self):
         with fluid.dygraph.guard():
             x = np.random.uniform(-1, 1, [3, 10]).astype("float32")
@@ -137,15 +138,12 @@ def test_LR_state_dict(self):
             Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau(
                 learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)
 
-            adam1 = fluid.optimizer.Adam(
-                learning_rate=Exponential_scheduler,
-                parameter_list=linear.parameters())
-            adam2 = fluid.optimizer.Adam(
-                learning_rate=Step_scheduler,
-                parameter_list=linear.parameters())
-            adam3 = fluid.optimizer.Adam(
-                learning_rate=Reducelr_scheduler,
-                parameter_list=linear.parameters())
+            adam1 = fluid.optimizer.Adam(learning_rate=Exponential_scheduler,
+                                         parameter_list=linear.parameters())
+            adam2 = fluid.optimizer.Adam(learning_rate=Step_scheduler,
+                                         parameter_list=linear.parameters())
+            adam3 = fluid.optimizer.Adam(learning_rate=Reducelr_scheduler,
+                                         parameter_list=linear.parameters())
             print(adam3.state_dict())
 
             for epoch in range(10):
@@ -177,22 +175,22 @@ def test_LR_state_dict(self):
                 learning_rate=Exponential_scheduler_test,
                 parameter_list=linear.parameters())
             adam_test.set_dict(opt_state)
-            self.assertEqual(adam_test._learning_rate.step_num,
-                             adam1._learning_rate.step_num,
-                             "epoch_num is different before and after set_dict")
+            self.assertEqual(
+                adam_test._learning_rate.step_num,
+                adam1._learning_rate.step_num,
+                "epoch_num is different before and after set_dict")
 
             fluid.dygraph.save_dygraph(adam2.state_dict(), "save_path")
             _, opt_state = fluid.dygraph.load_dygraph("save_path")
-            adam_test = fluid.optimizer.Adam(
-                learning_rate=Step_scheduler_test,
-                parameter_list=linear.parameters())
+            adam_test = fluid.optimizer.Adam(learning_rate=Step_scheduler_test,
+                                             parameter_list=linear.parameters())
             adam_test.set_dict(opt_state)
-            self.assertEqual(adam_test._learning_rate.epoch_num,
-                             adam2._learning_rate.epoch_num,
-                             "epoch_num is different before and after set_dict")
             self.assertEqual(
-                adam_test._learning_rate(),
-                adam2._learning_rate(),
+                adam_test._learning_rate.epoch_num,
+                adam2._learning_rate.epoch_num,
+                "epoch_num is different before and after set_dict")
+            self.assertEqual(
+                adam_test._learning_rate(), adam2._learning_rate(),
                 "current learning rate is different before and after set_dict")
 
             fluid.dygraph.save_dygraph(adam3.state_dict(), "save_path")
@@ -201,9 +199,10 @@ def test_LR_state_dict(self):
                 learning_rate=Reducelr_scheduler_test,
                 parameter_list=linear.parameters())
             adam_test.set_dict(opt_state)
-            self.assertEqual(adam_test._learning_rate.best_loss,
-                             adam3._learning_rate.best_loss.numpy()[0],
-                             "best_loss is different before and after set_dict")
+            self.assertEqual(
+                adam_test._learning_rate.best_loss,
+                adam3._learning_rate.best_loss.numpy()[0],
+                "best_loss is different before and after set_dict")
             self.assertEqual(
                 adam_test._learning_rate.cooldown_counter,
                 adam3._learning_rate.cooldown_counter,
@@ -216,8 +215,7 @@ def test_LR_state_dict(self):
                              adam3._learning_rate.epoch_num,
                              "epoch is different before and after set_dict")
             self.assertEqual(
-                adam_test._learning_rate(),
-                adam3._learning_rate(),
+                adam_test._learning_rate(), adam3._learning_rate(),
                 "current learning rate is different before and after set_dict")
 
     def test_NoamDecay(self):
@@ -235,18 +233,20 @@ def test_NoamDecay(self):
                 self.assertAlmostEqual(
                     right_result,
                     fluid_result[0],
-                    msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
-                    format(step, right_result, fluid_result[0]))
+                    msg=
+                    'Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'
+                    .format(step, right_result, fluid_result[0]))
 
     def test_LinearLrWarmup(self):
         with fluid.dygraph.guard():
-            lr = fluid.layers.polynomial_decay(
-                learning_rate=1.0,
-                decay_steps=10,
-                end_learning_rate=0.0,
-                power=1.0)
-            lr = fluid.layers.linear_lr_warmup(
-                learning_rate=lr, warmup_steps=2, start_lr=0.0, end_lr=1.0)
+            lr = fluid.layers.polynomial_decay(learning_rate=1.0,
+                                               decay_steps=10,
+                                               end_learning_rate=0.0,
+                                               power=1.0)
+            lr = fluid.layers.linear_lr_warmup(learning_rate=lr,
+                                               warmup_steps=2,
+                                               start_lr=0.0,
+                                               end_lr=1.0)
 
             right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
             for i in range(5):
@@ -257,11 +257,10 @@ def test_LinearLrWarmup(self):
                     np.allclose((t.numpy())[0].item(), right_result[i]))
 
             with self.assertRaises(TypeError):
-                lr = fluid.layers.linear_lr_warmup(
-                    learning_rate="fake_lr",
-                    warmup_steps=2,
-                    start_lr=0.0,
-                    end_lr=1.0)
+                lr = fluid.layers.linear_lr_warmup(learning_rate="fake_lr",
+                                                   warmup_steps=2,
+                                                   start_lr=0.0,
+                                                   end_lr=1.0)
 
     def test_MultiStepDecay(self):
         with fluid.dygraph.guard():
@@ -283,8 +282,9 @@ def test_MultiStepDecay(self):
                 self.assertAlmostEqual(
                     right_result,
                     fluid_result,
-                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
-                    format(epoch, right_result, fluid_result))
+                    msg=
+                    'Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'
+                    .format(epoch, right_result, fluid_result))
 
             with self.assertRaises(ValueError):
                 lr = fluid.dygraph.MultiStepDecay(learning_rate, [30, 50, 20],
@@ -315,8 +315,9 @@ def test_StepDecay(self):
                 self.assertAlmostEqual(
                     right_result,
                     fluid_result,
-                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
-                    format(epoch, right_result, fluid_result))
+                    msg=
+                    'Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'
+                    .format(epoch, right_result, fluid_result))
 
             with self.assertRaises(TypeError):
                 lr = fluid.dygraph.StepDecay(learning_rate, "test", 0.1)
@@ -331,8 +332,8 @@ def test_LambdaDecay(self):
             scheduler = fluid.dygraph.LambdaDecay(learning_rate, lr_lambda)
 
             linear = fluid.dygraph.nn.Linear(10, 10)
-            adam = fluid.optimizer.Adam(
-                scheduler, parameter_list=linear.parameters())
+            adam = fluid.optimizer.Adam(scheduler,
+                                        parameter_list=linear.parameters())
 
             for epoch in range(30):
                 right_result = lambda_decay(epoch, learning_rate, lr_lambda)
@@ -341,14 +342,16 @@ def test_LambdaDecay(self):
                 self.assertAlmostEqual(
                     right_result,
                     fluid_result,
-                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
-                    format(epoch, right_result, fluid_result))
+                    msg=
+                    'Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'
+                    .format(epoch, right_result, fluid_result))
 
             with self.assertRaises(TypeError):
                 lr = fluid.dygraph.LambdaDecay(learning_rate, "test")
 
 
 class TestLearningRateDecay(unittest.TestCase):
+
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -375,14 +378,15 @@ def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
             if python_decay_fn.__name__ == 'noam_decay':
                 step += 1
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
-            python_decayed_lr = python_decay_fn(
-                global_step=float(step), **kwargs)
+            python_decayed_lr = python_decay_fn(global_step=float(step),
+                                                **kwargs)
             self.assertAlmostEqual(
                 python_decayed_lr,
                 lr_val[0],
-                msg='Failed lr scheduler is {0}, step {1}, Python result is {2}, Fluid result is {3}'.
-                format(python_decay_fn.__name__,
-                       str(step), str(python_decayed_lr), str(lr_val[0])))
+                msg=
+                'Failed lr scheduler is {0}, step {1}, Python result is {2}, Fluid result is {3}'
+                .format(python_decay_fn.__name__, str(step),
+                        str(python_decayed_lr), str(lr_val[0])))
 
     def test_decay(self):
         common_kwargs_true = {
@@ -401,26 +405,31 @@ def test_decay(self):
             (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
             (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true),
             (inverse_time_decay, layers.inverse_time_decay,
-             common_kwargs_false), (polynomial_decay, layers.polynomial_decay, {
-                 "learning_rate": 1.0,
-                 "decay_steps": 5,
-                 "cycle": True
-             }), (polynomial_decay, layers.polynomial_decay, {
-                 "learning_rate": 1.0,
-                 "decay_steps": 5,
-                 "cycle": False
-             }), (piecewise_decay, layers.piecewise_decay, {
-                 "boundaries": [3, 6, 9],
-                 "values": [0.1, 0.2, 0.3, 0.4]
-             }), (cosine_decay, layers.cosine_decay, {
-                 "learning_rate": 0.1,
-                 "step_each_epoch": 100,
-                 "epochs": 120
-             }), (noam_decay, layers.noam_decay, {
-                 "d_model": 0.01,
-                 "warmup_steps": 200,
-                 "learning_rate": 2.0
-             })
+             common_kwargs_false),
+            (polynomial_decay, layers.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": True
+            }),
+            (polynomial_decay, layers.polynomial_decay, {
+                "learning_rate": 1.0,
+                "decay_steps": 5,
+                "cycle": False
+            }),
+            (piecewise_decay, layers.piecewise_decay, {
+                "boundaries": [3, 6, 9],
+                "values": [0.1, 0.2, 0.3, 0.4]
+            }),
+            (cosine_decay, layers.cosine_decay, {
+                "learning_rate": 0.1,
+                "step_each_epoch": 100,
+                "epochs": 120
+            }),
+            (noam_decay, layers.noam_decay, {
+                "d_model": 0.01,
+                "warmup_steps": 200,
+                "learning_rate": 2.0
+            })
         ]
 
         for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
@@ -433,6 +442,7 @@ def test_decay(self):
 
 
 class TestLinearWamrupLearningRateDecay(unittest.TestCase):
+
     def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
                                kwargs):
         main_prog = fluid.Program()
@@ -443,8 +453,8 @@ def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
         end_lr = 0.1
 
         with fluid.program_guard(main_prog, startup_prog):
-            decayed_lr = layers.linear_lr_warmup(
-                fluid_decay_fn(**kwargs), warmup_steps, start_lr, end_lr)
+            decayed_lr = layers.linear_lr_warmup(fluid_decay_fn(**kwargs),
+                                                 warmup_steps, start_lr, end_lr)
 
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -456,20 +466,22 @@ def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
                 step += 1
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             if step < warmup_steps:
-                python_decayed_lr = linear_lr_warmup(
-                    float(step), warmup_steps, start_lr, end_lr)
+                python_decayed_lr = linear_lr_warmup(float(step), warmup_steps,
+                                                     start_lr, end_lr)
             else:
-                python_decayed_lr = python_decay_fn(
-                    global_step=float(step), **kwargs)
+                python_decayed_lr = python_decay_fn(global_step=float(step),
+                                                    **kwargs)
             self.assertAlmostEqual(
                 python_decayed_lr,
                 lr_val[0],
-                msg='Test {0} Failed, step {1}, Python result is {2}, Fluid result is {3}'.
-                format(python_decay_fn.__name__,
-                       str(step), str(python_decayed_lr), str(lr_val[0])))
+                msg=
+                'Test {0} Failed, step {1}, Python result is {2}, Fluid result is {3}'
+                .format(python_decay_fn.__name__, str(step),
+                        str(python_decayed_lr), str(lr_val[0])))
 
 
 class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase):
+
     def run_scalar_lr(self, place, lr, start_lr, end_lr):
         main_prog = fluid.Program()
         startup_prog = fluid.Program()
@@ -486,8 +498,8 @@ def run_scalar_lr(self, place, lr, start_lr, end_lr):
         for step in range(20):
             lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
             if step < warmup_steps:
-                expected_lr = linear_lr_warmup(
-                    float(step), warmup_steps, start_lr, end_lr)
+                expected_lr = linear_lr_warmup(float(step), warmup_steps,
+                                               start_lr, end_lr)
             else:
                 expected_lr = lr
             self.assertAlmostEqual(
@@ -497,6 +509,7 @@ def run_scalar_lr(self, place, lr, start_lr, end_lr):
                     step, expected_lr, lr_val[0]))
 
     def test_scalar_lr(self):
+
         def run_places(lr, start_lr, end_lr):
             places = [fluid.CPUPlace()]
             if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_lerp_op.py b/python/paddle/fluid/tests/unittests/test_lerp_op.py
index 10ab2610a26e4..0af6e46c73d7c 100644
--- a/python/paddle/fluid/tests/unittests/test_lerp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lerp_op.py
@@ -25,6 +25,7 @@
 
 
 class TestLerp(OpTest):
+
     def setUp(self):
         self.op_type = "lerp"
         self.python_api = paddle.lerp
@@ -50,31 +51,37 @@ def test_check_grad(self):
 
 
 class TestLerpWithDim2(TestLerp):
+
     def init_shape(self):
         self.shape = [2, 50]
 
 
 class TestLerpWithDim3(TestLerp):
+
     def init_shape(self):
         self.shape = [2, 2, 25]
 
 
 class TestLerpWithDim4(TestLerp):
+
     def init_shape(self):
         self.shape = [2, 2, 5, 5]
 
 
 class TestLerpWithDim5(TestLerp):
+
     def init_shape(self):
         self.shape = [2, 1, 2, 5, 5]
 
 
 class TestLerpWithDim6(TestLerp):
+
     def init_shape(self):
         self.shape = [2, 1, 2, 5, 1, 5]
 
 
 class TestLerpAPI(unittest.TestCase):
+
     def init_dtype(self):
         self.dtype = 'float32'
 
@@ -108,6 +115,7 @@ def run(place):
             run(place)
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             x = paddle.to_tensor(self.x)
@@ -121,6 +129,7 @@ def run(place):
             run(place)
 
     def test_inplace_api(self):
+
         def run(place):
             paddle.disable_static(place)
             x = paddle.to_tensor(self.x)
@@ -133,6 +142,7 @@ def run(place):
             run(place)
 
     def test_inplace_api_exception(self):
+
         def run(place):
             paddle.disable_static(place)
             x = paddle.to_tensor(self.x)
@@ -159,8 +169,8 @@ def test_x_y_broadcast_w(self):
         x = np.arange(11., 21.).astype(self.dtype).reshape([2, 5])
         y = np.full(20, 7.5).astype(self.dtype).reshape([2, 2, 5])
         w = np.full(40, 0.225).astype(self.dtype).reshape([2, 2, 2, 5])
-        out = paddle.lerp(
-            paddle.to_tensor(x), paddle.to_tensor(y), paddle.to_tensor(w))
+        out = paddle.lerp(paddle.to_tensor(x), paddle.to_tensor(y),
+                          paddle.to_tensor(w))
         res_ref = x + w * (y - x)
         self.assertEqual(np.allclose(res_ref, out.numpy()), True)
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_lgamma_op.py b/python/paddle/fluid/tests/unittests/test_lgamma_op.py
index 8e9edab55baf8..cb6b031eb9824 100644
--- a/python/paddle/fluid/tests/unittests/test_lgamma_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lgamma_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestLgammaOp(OpTest):
+
     def setUp(self):
         self.op_type = 'lgamma'
         self.python_api = paddle.lgamma
@@ -46,12 +47,15 @@ def test_check_grad_normal(self):
 
 
 class TestLgammaOpFp32(TestLgammaOp):
+
     def init_dtype_type(self):
         self.dtype = np.float32
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X'], 'Out', numeric_grad_delta=0.005, check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        numeric_grad_delta=0.005,
+                        check_eager=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py b/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py
index d273185ad185f..3e604d25657ee 100644
--- a/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -46,9 +46,12 @@ def all_close(exp, out, n_worker):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestLimitByCapacityInt64API(unittest.TestCase):
+
     def init_test_case(self):
-        self.expert_count = np.random.randint(
-            0, 1000, size=(len(self.capacity) * self.n_worker))
+        self.expert_count = np.random.randint(0,
+                                              1000,
+                                              size=(len(self.capacity) *
+                                                    self.n_worker))
         self.out = limit_by_capacity(self.expert_count, self.capacity,
                                      self.n_worker)
         self.expert_count = self.expert_count.astype("int64")
@@ -63,8 +66,9 @@ def setUp(self):
     def test_static_api(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
-            capacity = paddle.static.data(
-                'capacity', shape=self.capacity.shape, dtype="int64")
+            capacity = paddle.static.data('capacity',
+                                          shape=self.capacity.shape,
+                                          dtype="int64")
             expert_count_tensor = paddle.static.data(
                 'ExpertCount', shape=self.expert_count.shape, dtype="int64")
             out = utils._limit_by_capacity(expert_count_tensor, capacity,
@@ -95,6 +99,7 @@ def test_dygraph_api(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestLimitByCapacityInt64API_SmallWorker(TestLimitByCapacityInt64API):
+
     def setUp(self):
         self.capacity = np.array([100, 12000, 1200, 0, 4700, 1000, 57, 200])
         self.n_worker = 1
diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py
index 42fb2fbc578bf..74b50c11ce482 100644
--- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py
+++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py
@@ -33,8 +33,9 @@ def test_static_assert_true(self, x_list, p_list):
                 exe = static.Executor()
                 result = exe.run(feed={"X": x}, fetch_list=[output])
                 expected_output = np.linalg.cond(x, p)
-                np.testing.assert_allclose(
-                    result[0], expected_output, rtol=5e-5)
+                np.testing.assert_allclose(result[0],
+                                           expected_output,
+                                           rtol=5e-5)
 
 
 def test_dygraph_assert_true(self, x_list, p_list):
@@ -43,8 +44,9 @@ def test_dygraph_assert_true(self, x_list, p_list):
             input_tensor = paddle.to_tensor(x)
             output = paddle.linalg.cond(input_tensor, p)
             expected_output = np.linalg.cond(x, p)
-            np.testing.assert_allclose(
-                output.numpy(), expected_output, rtol=5e-5)
+            np.testing.assert_allclose(output.numpy(),
+                                       expected_output,
+                                       rtol=5e-5)
 
 
 def gen_input():
@@ -81,6 +83,7 @@ def gen_empty_input():
 
 
 class API_TestStaticCond(unittest.TestCase):
+
     def test_out(self):
         paddle.enable_static()
         # test calling results of 'cond' in static mode
@@ -90,6 +93,7 @@ def test_out(self):
 
 
 class API_TestDygraphCond(unittest.TestCase):
+
     def func_out(self):
         paddle.disable_static()
         # test calling results of 'cond' in dynamic mode
@@ -104,6 +108,7 @@ def test_out(self):
 
 
 class TestCondAPIError(unittest.TestCase):
+
     def func_dygraph_api_error(self):
         paddle.disable_static()
         # test raising errors when 'cond' is called in dygraph mode
@@ -160,6 +165,7 @@ def test_static_empty_input_error(self):
 
 
 class TestCondEmptyTensorInput(unittest.TestCase):
+
     def func_dygraph_empty_tensor_input(self):
         paddle.disable_static()
         # test calling results of 'cond' when input is an empty tensor in dynamic mode
diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py
index 59ac2e28087c8..07729ae4e79cf 100644
--- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py
@@ -22,6 +22,7 @@
 
 
 class LinalgLstsqTestCase(unittest.TestCase):
+
     def setUp(self):
         self.devices = ["cpu"]
         self.init_config()
@@ -45,8 +46,9 @@ def generate_input(self):
 
     def generate_output(self):
         if len(self._input_shape_1) == 2:
-            out = np.linalg.lstsq(
-                self._input_data_1, self._input_data_2, rcond=self.rcond)
+            out = np.linalg.lstsq(self._input_data_1,
+                                  self._input_data_2,
+                                  rcond=self.rcond)
             self._output_solution = out[0]
             self._output_residuals = out[1]
             self._output_rank = out[2]
@@ -57,10 +59,9 @@ def generate_output(self):
             self._output_rank = []
             self._output_sg_values = []
             for i in range(self._input_shape_1[0]):
-                out = np.linalg.lstsq(
-                    self._input_data_1[i],
-                    self._input_data_2[i],
-                    rcond=self.rcond)
+                out = np.linalg.lstsq(self._input_data_1[i],
+                                      self._input_data_2[i],
+                                      rcond=self.rcond)
                 self._output_solution.append(out[0])
                 self._output_residuals.append(out[1])
                 self._output_rank.append(out[2])
@@ -71,12 +72,16 @@ def test_dygraph(self):
         for dev in self.devices:
             paddle.set_device(dev)
             place = paddle.CPUPlace() if dev == "cpu" else paddle.CUDAPlace(0)
-            x = paddle.to_tensor(
-                self._input_data_1, place=place, dtype=self.dtype)
-            y = paddle.to_tensor(
-                self._input_data_2, place=place, dtype=self.dtype)
-            results = paddle.linalg.lstsq(
-                x, y, rcond=self.rcond, driver=self.driver)
+            x = paddle.to_tensor(self._input_data_1,
+                                 place=place,
+                                 dtype=self.dtype)
+            y = paddle.to_tensor(self._input_data_2,
+                                 place=place,
+                                 dtype=self.dtype)
+            results = paddle.linalg.lstsq(x,
+                                          y,
+                                          rcond=self.rcond,
+                                          driver=self.driver)
             self._result_solution = results[0].numpy()
             self._result_residuals = results[1].numpy()
             self._result_rank = results[2].numpy()
@@ -89,22 +94,23 @@ def test_static(self):
             paddle.set_device(dev)
             place = fluid.CPUPlace() if dev == "cpu" else fluid.CUDAPlace(0)
             with fluid.program_guard(fluid.Program(), fluid.Program()):
-                x = paddle.fluid.data(
-                    name="x",
-                    shape=self._input_shape_1,
-                    dtype=self._input_data_1.dtype)
-                y = paddle.fluid.data(
-                    name="y",
-                    shape=self._input_shape_2,
-                    dtype=self._input_data_2.dtype)
-                results = paddle.linalg.lstsq(
-                    x, y, rcond=self.rcond, driver=self.driver)
+                x = paddle.fluid.data(name="x",
+                                      shape=self._input_shape_1,
+                                      dtype=self._input_data_1.dtype)
+                y = paddle.fluid.data(name="y",
+                                      shape=self._input_shape_2,
+                                      dtype=self._input_data_2.dtype)
+                results = paddle.linalg.lstsq(x,
+                                              y,
+                                              rcond=self.rcond,
+                                              driver=self.driver)
                 exe = fluid.Executor(place)
-                fetches = exe.run(
-                    fluid.default_main_program(),
-                    feed={"x": self._input_data_1,
-                          "y": self._input_data_2},
-                    fetch_list=[results])
+                fetches = exe.run(fluid.default_main_program(),
+                                  feed={
+                                      "x": self._input_data_1,
+                                      "y": self._input_data_2
+                                  },
+                                  fetch_list=[results])
                 self._result_solution = fetches[0]
                 self._result_residuals = fetches[1]
                 self._result_rank = fetches[2]
@@ -113,41 +119,44 @@ def test_static(self):
 
     def assert_np_close(self):
         if len(self._input_shape_1) == 2:
-            np.testing.assert_allclose(
-                self._result_solution, self._output_solution, rtol=1e-3)
+            np.testing.assert_allclose(self._result_solution,
+                                       self._output_solution,
+                                       rtol=1e-3)
             if self._input_shape_1[-2] > self._input_shape_1[
                     -1] and self._output_rank == self._input_shape_1[-1]:
-                np.testing.assert_allclose(
-                    self._result_residuals, self._output_residuals, rtol=1e-5)
+                np.testing.assert_allclose(self._result_residuals,
+                                           self._output_residuals,
+                                           rtol=1e-5)
             if self.driver in ("gelsy", "gelsd", "gelss"):
-                np.testing.assert_allclose(
-                    self._result_rank, self._output_rank, rtol=1e-5)
+                np.testing.assert_allclose(self._result_rank,
+                                           self._output_rank,
+                                           rtol=1e-5)
             if self.driver in ("gelsd", "gelss"):
-                np.testing.assert_allclose(
-                    self._result_sg_values, self._output_sg_values, rtol=1e-5)
+                np.testing.assert_allclose(self._result_sg_values,
+                                           self._output_sg_values,
+                                           rtol=1e-5)
         else:
             for i in range(len(self._output_solution)):
-                np.testing.assert_allclose(
-                    self._result_solution[i],
-                    self._output_solution[i],
-                    rtol=1e-3)
+                np.testing.assert_allclose(self._result_solution[i],
+                                           self._output_solution[i],
+                                           rtol=1e-3)
                 if self._input_shape_1[-2] > self._input_shape_1[
                         -1] and self._output_rank[i] == self._input_shape_1[-1]:
-                    np.testing.assert_allclose(
-                        self._result_residuals[i],
-                        self._output_residuals[i],
-                        rtol=1e-5)
+                    np.testing.assert_allclose(self._result_residuals[i],
+                                               self._output_residuals[i],
+                                               rtol=1e-5)
                 if self.driver in ("gelsy", "gelsd", "gelss"):
-                    np.testing.assert_allclose(
-                        self._result_rank[i], self._output_rank[i], rtol=1e-5)
+                    np.testing.assert_allclose(self._result_rank[i],
+                                               self._output_rank[i],
+                                               rtol=1e-5)
                 if self.driver in ("gelsd", "gelss"):
-                    np.testing.assert_allclose(
-                        self._result_sg_values[i],
-                        self._output_sg_values[i],
-                        rtol=1e-5)
+                    np.testing.assert_allclose(self._result_sg_values[i],
+                                               self._output_sg_values[i],
+                                               rtol=1e-5)
 
 
 class LinalgLstsqTestCase1(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float32'
         self.rcond = 1e-15
@@ -157,6 +166,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCase2(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float64'
         self.rcond = 1e-15
@@ -166,6 +176,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseRcond(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float64'
         self.rcond = 1e-7
@@ -175,6 +186,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseGelsFloat32(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float32'
         self.rcond = None
@@ -184,6 +196,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseGelssFloat64(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float64'
         self.rcond = None
@@ -193,6 +206,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseGelsyFloat32(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float32'
         self.rcond = 1e-15
@@ -202,6 +216,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseBatch1(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float32'
         self.rcond = 1e-15
@@ -211,6 +226,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseBatch2(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float64'
         self.rcond = 1e-15
@@ -220,6 +236,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseLarge1(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float64'
         self.rcond = 1e-15
@@ -229,6 +246,7 @@ def init_config(self):
 
 
 class LinalgLstsqTestCaseLarge2(LinalgLstsqTestCase):
+
     def init_config(self):
         self.dtype = 'float64'
         self.rcond = 1e-15
diff --git a/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py
index 8d0a34009d6e5..5d3c1ff96415d 100644
--- a/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linalg_pinv_op.py
@@ -26,6 +26,7 @@
 
 
 class LinalgPinvTestCase(unittest.TestCase):
+
     def setUp(self):
         self.init_config()
         self.generate_input()
@@ -53,8 +54,9 @@ def test_dygraph(self):
         for place in self.places:
             paddle.disable_static(place)
             x = paddle.to_tensor(self._input_data, place=place)
-            out = paddle.linalg.pinv(
-                x, rcond=self.rcond, hermitian=self.hermitian).numpy()
+            out = paddle.linalg.pinv(x,
+                                     rcond=self.rcond,
+                                     hermitian=self.hermitian).numpy()
             if (np.abs(out - self._output_data) < 1e-6).any():
                 pass
             else:
@@ -69,12 +71,12 @@ def test_static(self):
             places.append(fluid.CUDAPlace(0))
         for place in places:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
-                x = paddle.fluid.data(
-                    name="input",
-                    shape=self._input_shape,
-                    dtype=self._input_data.dtype)
-                out = paddle.linalg.pinv(
-                    x, rcond=self.rcond, hermitian=self.hermitian)
+                x = paddle.fluid.data(name="input",
+                                      shape=self._input_shape,
+                                      dtype=self._input_data.dtype)
+                out = paddle.linalg.pinv(x,
+                                         rcond=self.rcond,
+                                         hermitian=self.hermitian)
                 exe = fluid.Executor(place)
                 fetches = exe.run(fluid.default_main_program(),
                                   feed={"input": self._input_data},
@@ -88,10 +90,12 @@ def test_static(self):
 
     def test_grad(self):
         for place in self.places:
-            x = paddle.to_tensor(
-                self._input_data, place=place, stop_gradient=False)
-            out = paddle.linalg.pinv(
-                x, rcond=self.rcond, hermitian=self.hermitian)
+            x = paddle.to_tensor(self._input_data,
+                                 place=place,
+                                 stop_gradient=False)
+            out = paddle.linalg.pinv(x,
+                                     rcond=self.rcond,
+                                     hermitian=self.hermitian)
             try:
                 out.backward()
                 x_grad = x.grad
@@ -101,6 +105,7 @@ def test_grad(self):
 
 
 class LinalgPinvTestCase1(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (4, 5)
         np.random.seed(123)
@@ -109,6 +114,7 @@ def generate_input(self):
 
 
 class LinalgPinvTestCase2(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (5, 4)
         np.random.seed(123)
@@ -117,6 +123,7 @@ def generate_input(self):
 
 
 class LinalgPinvTestCaseBatch1(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 5)
         np.random.seed(123)
@@ -125,6 +132,7 @@ def generate_input(self):
 
 
 class LinalgPinvTestCaseBatch2(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 4, 5)
         np.random.seed(123)
@@ -133,6 +141,7 @@ def generate_input(self):
 
 
 class LinalgPinvTestCaseBatch3(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 4)
         np.random.seed(123)
@@ -141,6 +150,7 @@ def generate_input(self):
 
 
 class LinalgPinvTestCaseBatch4(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 6, 5, 4)
         np.random.seed(123)
@@ -149,6 +159,7 @@ def generate_input(self):
 
 
 class LinalgPinvTestCaseBatchBig(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (2, 200, 300)
         np.random.seed(123)
@@ -157,6 +168,7 @@ def generate_input(self):
 
 
 class LinalgPinvTestCaseFP32(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 5)
         np.random.seed(123)
@@ -170,6 +182,7 @@ def init_config(self):
 
 
 class LinalgPinvTestCaseRcond(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 5)
         np.random.seed(123)
@@ -183,6 +196,7 @@ def init_config(self):
 
 
 class LinalgPinvTestCaseHermitian1(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (5, 5)
         np.random.seed(123)
@@ -197,6 +211,7 @@ def init_config(self):
 
 
 class LinalgPinvTestCaseHermitian2(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 5)
         np.random.seed(123)
@@ -211,6 +226,7 @@ def init_config(self):
 
 
 class LinalgPinvTestCaseHermitian3(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 5)
         np.random.seed(123)
@@ -225,6 +241,7 @@ def init_config(self):
 
 
 class LinalgPinvTestCaseHermitian4(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (5, 5)
         np.random.seed(123)
@@ -238,6 +255,7 @@ def init_config(self):
 
 
 class LinalgPinvTestCaseHermitian5(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 5)
         np.random.seed(123)
@@ -251,6 +269,7 @@ def init_config(self):
 
 
 class LinalgPinvTestCaseHermitianFP32(LinalgPinvTestCase):
+
     def generate_input(self):
         self._input_shape = (3, 5, 5)
         np.random.seed(123)
diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py
index 6b00a86e3e900..b03b8866eaf97 100644
--- a/python/paddle/fluid/tests/unittests/test_linear.py
+++ b/python/paddle/fluid/tests/unittests/test_linear.py
@@ -26,13 +26,14 @@
 
 
 class LinearTestCase(unittest.TestCase):
+
     def setUp(self):
         self.dtype = 'float32'
         self.input = np.ones((3, 1, 2)).astype(self.dtype)
         self.weight = np.ones((2, 2)).astype(self.dtype)
         self.bias = np.ones((2)).astype(self.dtype)
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
 
     def functional(self, place):
         paddle.disable_static(place)
@@ -57,8 +58,10 @@ def paddle_nn_layer(self, place):
             trainable=False,
             regularizer=None,
             initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0))
-        linear = paddle.nn.Linear(
-            2, 2, weight_attr=weight_attr, bias_attr=bias_attr)
+        linear = paddle.nn.Linear(2,
+                                  2,
+                                  weight_attr=weight_attr,
+                                  bias_attr=bias_attr)
         y = linear(input)
         return y.numpy()
 
@@ -77,8 +80,10 @@ def test_weight_init(self):
         if not paddle.is_compiled_with_cuda():
             return
         paddle.seed(100)
-        linear = paddle.nn.Linear(
-            2, 3, weight_attr=paddle.nn.initializer.Normal(0, 1.))
+        linear = paddle.nn.Linear(2,
+                                  3,
+                                  weight_attr=paddle.nn.initializer.Normal(
+                                      0, 1.))
         paddle.nn.utils._stride_column(linear.weight)
         expect = [[1.4349908, -0.8099171, -2.64788],
                   [-1.4981681, -1.1784115, -0.023253186]]
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
index 8a9204c73fc03..45ae358f886ab 100755
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -22,6 +22,7 @@
 
 
 class LinearChainCrfForward(object):
+
     def __init__(self, seq_start_positions, emission_weights, emission_row_max,
                  emission_exps, transition_weights, transition_exps, labels):
         self.tag_num = emission_weights.shape[1]
@@ -47,8 +48,8 @@ def __init__(self, seq_start_positions, emission_weights, emission_row_max,
         # The output of linear chain crf operator.
         # alpha is a memo table in dynamic programming to calculate
         # nomalization factor.
-        self.alpha = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="float64")
+        self.alpha = np.zeros((seq_start_positions[-1], self.tag_num),
+                              dtype="float64")
         self.log_likelihood = np.zeros((self.seq_num, 1))
 
     def _l1_norm(self, x):
@@ -78,8 +79,8 @@ def _forward_a_sequence(self, x, x_row_max, x_exps, label, alpha):
         log_likelihood -= np.log(s)
 
         # calculate the nominator part.
-        log_likelihood += (
-            self.a[label[0]] + x[0, label[0]] + self.b[label[-1]])
+        log_likelihood += (self.a[label[0]] + x[0, label[0]] +
+                           self.b[label[-1]])
 
         for k in range(1, seq_len):
             log_likelihood += (x[k, label[k]] + self.w[label[k - 1], label[k]])
@@ -99,6 +100,7 @@ def crf_forward_compute(self):
 
 
 class TestLinearChainCrfOp(OpTest):
+
     def set_test_data(self):
         # TODO(caoying) Fix the unittest by: add the boundary cases when
         # sequence lengths are 1, 2, and 3.
@@ -122,8 +124,10 @@ def set_test_data(self):
                                        [TAG_NUM + 2, TAG_NUM]).astype("float64")
         transition_exps = np.exp(transition)
 
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
+        labels = np.random.randint(low=0,
+                                   high=TAG_NUM,
+                                   size=(seq_start_pos[-1], 1),
+                                   dtype="int64")
 
         self.inputs = {
             "Emission": (emission, lod),
@@ -153,11 +157,13 @@ def test_check_grad(self):
         self.check_grad(["Emission", "Transition"], "LogLikelihood")
 
     def test_check_grad_ignore_transition(self):
-        self.check_grad(
-            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
+        self.check_grad(["Emission"],
+                        "LogLikelihood",
+                        no_grad_set=set("Transition"))
 
 
 class TestLinearChainCrfPaddingTensor(OpTest):
+
     def seq_pad(self, data, length):
         max_len = np.max(length)
         shape = [len(length), max_len] + list(data.shape[1:])
@@ -180,7 +186,7 @@ def seq_pad_exps(self, data, length):
         return padded
 
     def set_test_data_1(self):
-        # Fix the unittest by: add padding tensor in inputs 
+        # Fix the unittest by: add padding tensor in inputs
         SEQ_NUM = 3
         TAG_NUM = 17
         MAX_SEQ_LEN = 5
@@ -199,8 +205,10 @@ def set_test_data_1(self):
                                        [TAG_NUM + 2, TAG_NUM]).astype("float64")
         transition_exps = np.exp(transition)
 
-        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
+        labels = np.random.randint(low=0,
+                                   high=TAG_NUM,
+                                   size=(seq_start_pos[-1], 1),
+                                   dtype="int64")
         self.inputs = {
             "Emission": self.seq_pad(emission, lod[0]),
             "Transition": transition,
@@ -229,8 +237,9 @@ def test_check_grad(self):
         self.check_grad(["Emission", "Transition"], "LogLikelihood")
 
     def test_check_grad_ignore_transition(self):
-        self.check_grad(
-            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
+        self.check_grad(["Emission"],
+                        "LogLikelihood",
+                        no_grad_set=set("Transition"))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
index c9948edad0061..dd44e70d92ef6 100755
--- a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py
@@ -63,8 +63,8 @@ def linear_interp_np(input,
             w1lambda = ratio_w * j - w
         w2lambda = 1.0 - w1lambda
 
-        out[:, :, j] = w2lambda * input[:, :, w] + w1lambda * input[:, :, w +
-                                                                    wid]
+        out[:, :,
+            j] = w2lambda * input[:, :, w] + w1lambda * input[:, :, w + wid]
 
     if data_layout == "NHWC":
         out = np.transpose(out, (0, 2, 1))  # NCHW => NHWC
@@ -73,6 +73,7 @@ def linear_interp_np(input,
 
 
 class TestLinearInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -124,46 +125,58 @@ def init_test_case(self):
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 1
 
 
 class TestLinearInterpOpDataLayout(TestLinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'linear'
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 1
         self.data_layout = 'NHWC'
 
 
 class TestLinearInterpOpAlignMode(TestLinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'linear'
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestLinearInterpOpScale(TestLinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'linear'
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.5
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestLinearInterpOpSizeTensor(TestLinearInterpOp):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -212,56 +225,72 @@ def setUp(self):
 
 
 class TestResizeLinearAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[1, 3, 64], dtype="float32")
 
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[1], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[1], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
-
-        out1 = fluid.layers.resize_linear(
-            x, out_shape=[128, ], align_mode=1, align_corners=False)
-        out2 = fluid.layers.resize_linear(
-            x, out_shape=[128], align_mode=1, align_corners=False)
-        out3 = fluid.layers.resize_linear(
-            x, out_shape=shape_tensor, align_mode=1, align_corners=False)
-        out4 = fluid.layers.resize_linear(
-            x,
-            out_shape=[128, ],
-            actual_shape=actual_size,
-            align_mode=1,
-            align_corners=False)
-        out5 = fluid.layers.resize_linear(
-            x, scale=scale_tensor, align_mode=1, align_corners=False)
-
-        out6 = interpolate(
-            x,
-            scale_factor=scale_tensor,
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
-        out7 = interpolate(
-            x,
-            size=[128, ],
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
-        out8 = interpolate(
-            x,
-            size=shape_tensor,
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
+
+        out1 = fluid.layers.resize_linear(x,
+                                          out_shape=[
+                                              128,
+                                          ],
+                                          align_mode=1,
+                                          align_corners=False)
+        out2 = fluid.layers.resize_linear(x,
+                                          out_shape=[128],
+                                          align_mode=1,
+                                          align_corners=False)
+        out3 = fluid.layers.resize_linear(x,
+                                          out_shape=shape_tensor,
+                                          align_mode=1,
+                                          align_corners=False)
+        out4 = fluid.layers.resize_linear(x,
+                                          out_shape=[
+                                              128,
+                                          ],
+                                          actual_shape=actual_size,
+                                          align_mode=1,
+                                          align_corners=False)
+        out5 = fluid.layers.resize_linear(x,
+                                          scale=scale_tensor,
+                                          align_mode=1,
+                                          align_corners=False)
+
+        out6 = interpolate(x,
+                           scale_factor=scale_tensor,
+                           mode='linear',
+                           align_mode=1,
+                           align_corners=False,
+                           data_format='NCW')
+        out7 = interpolate(x,
+                           size=[
+                               128,
+                           ],
+                           mode='linear',
+                           align_mode=1,
+                           align_corners=False,
+                           data_format='NCW')
+        out8 = interpolate(x,
+                           size=shape_tensor,
+                           mode='linear',
+                           align_mode=1,
+                           align_corners=False,
+                           data_format='NCW')
 
         x_data = np.random.random((1, 3, 64)).astype("float32")
         dim_data = np.array([128]).astype("int32")
-        shape_data = np.array([128, ]).astype("int32")
-        actual_size_data = np.array([128, ]).astype("int32")
+        shape_data = np.array([
+            128,
+        ]).astype("int32")
+        actual_size_data = np.array([
+            128,
+        ]).astype("int32")
         scale_data = np.array([2.0]).astype("float32")
 
         if core.is_compiled_with_cuda():
@@ -282,34 +311,41 @@ def test_case(self):
             fetch_list=[out1, out2, out3, out4, out5, out6, out7, out8],
             return_numpy=True)
 
-        expect_res = linear_interp_np(
-            x_data, out_w=128, align_mode=1, align_corners=False)
+        expect_res = linear_interp_np(x_data,
+                                      out_w=128,
+                                      align_mode=1,
+                                      align_corners=False)
         for res in results:
             self.assertTrue(np.allclose(res, expect_res))
 
 
 class TestLinearInterpOpAPI2_0(unittest.TestCase):
+
     def test_case(self):
 
-        # dygraph 
+        # dygraph
         x_data = np.random.random((1, 3, 128)).astype("float32")
-        us_1 = paddle.nn.Upsample(
-            size=[64, ],
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
+        us_1 = paddle.nn.Upsample(size=[
+            64,
+        ],
+                                  mode='linear',
+                                  align_mode=1,
+                                  align_corners=False,
+                                  data_format='NCW')
         with fluid.dygraph.guard():
             x = fluid.dygraph.to_variable(x_data)
             interp = us_1(x)
 
-            expect = linear_interp_np(
-                x_data, out_w=64, align_mode=1, align_corners=False)
+            expect = linear_interp_np(x_data,
+                                      out_w=64,
+                                      align_mode=1,
+                                      align_corners=False)
 
             self.assertTrue(np.allclose(interp.numpy(), expect))
 
 
 class TestResizeLinearOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -349,30 +385,41 @@ def init_test_case(self):
         self.input_shape = [2, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
 
 
 class TestLinearInterpOpException(unittest.TestCase):
+
     def test_exception(self):
+
         def input_shape_error():
             x1 = fluid.data(name="x1", shape=[1], dtype="float32")
-            out = fluid.layers.resize_linear(
-                x1, out_shape=[256, ], data_format='NCW')
+            out = fluid.layers.resize_linear(x1,
+                                             out_shape=[
+                                                 256,
+                                             ],
+                                             data_format='NCW')
 
         def data_format_error():
             x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32")
-            out = fluid.layers.resize_linear(
-                x2, out_shape=[256, ], data_format='NHWCD')
+            out = fluid.layers.resize_linear(x2,
+                                             out_shape=[
+                                                 256,
+                                             ],
+                                             data_format='NHWCD')
 
         def out_shape_error():
             x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32")
-            out = fluid.layers.resize_linear(
-                x3, out_shape=[
-                    256,
-                    256,
-                ], data_format='NHWC')
+            out = fluid.layers.resize_linear(x3,
+                                             out_shape=[
+                                                 256,
+                                                 256,
+                                             ],
+                                             data_format='NHWC')
 
         self.assertRaises(ValueError, input_shape_error)
         self.assertRaises(ValueError, data_format_error)
@@ -380,28 +427,36 @@ def out_shape_error():
 
 
 class TestLinearInterpOpError(unittest.TestCase):
+
     def test_error(self):
         with program_guard(Program(), Program()):
 
             def input_shape_error():
                 x1 = fluid.data(name="x1", shape=[1], dtype="float32")
-                out1 = paddle.nn.Upsample(
-                    size=[256, ], data_format='NCW', mode='linear')
+                out1 = paddle.nn.Upsample(size=[
+                    256,
+                ],
+                                          data_format='NCW',
+                                          mode='linear')
                 out1_res = out1(x1)
 
             def data_format_error():
                 x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32")
-                out2 = paddle.nn.Upsample(
-                    size=[256, ], data_format='NHWCD', mode='linear')
+                out2 = paddle.nn.Upsample(size=[
+                    256,
+                ],
+                                          data_format='NHWCD',
+                                          mode='linear')
                 out2_res = out2(x2)
 
             def out_shape_error():
                 x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32")
-                out3 = paddle.nn.Upsample(
-                    size=[
-                        256,
-                        256,
-                    ], data_format='NHWC', mode='linear')
+                out3 = paddle.nn.Upsample(size=[
+                    256,
+                    256,
+                ],
+                                          data_format='NHWC',
+                                          mode='linear')
                 out3_res = out3(x3)
 
             self.assertRaises(ValueError, input_shape_error)
diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py
index b34989f5f5c79..69d652299bef3 100755
--- a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py
@@ -67,8 +67,8 @@ def linear_interp_np(input,
             w1lambda = ratio_w * j - w
         w2lambda = 1.0 - w1lambda
 
-        out[:, :, j] = w2lambda * input[:, :, w] + w1lambda * input[:, :, w +
-                                                                    wid]
+        out[:, :,
+            j] = w2lambda * input[:, :, w] + w1lambda * input[:, :, w + wid]
 
     if data_layout == "NHWC":
         out = np.transpose(out, (0, 2, 1))  # NCHW => NHWC
@@ -77,6 +77,7 @@ def linear_interp_np(input,
 
 
 class TestLinearInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -136,46 +137,58 @@ def init_test_case(self):
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 1
 
 
 class TestLinearInterpOpDataLayout(TestLinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'linear'
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 1
         self.data_layout = 'NHWC'
 
 
 class TestLinearInterpOpAlignMode(TestLinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'linear'
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestLinearInterpOpScale(TestLinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'linear'
         self.input_shape = [1, 3, 100]
         self.out_w = 50
         self.scale = 0.5
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestLinearInterpOpSizeTensor(TestLinearInterpOp):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -233,56 +246,72 @@ def setUp(self):
 
 
 class TestResizeLinearAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[1, 3, 64], dtype="float32")
 
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[1], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[1], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
-
-        out1 = fluid.layers.resize_linear(
-            x, out_shape=[128, ], align_mode=1, align_corners=False)
-        out2 = fluid.layers.resize_linear(
-            x, out_shape=[128], align_mode=1, align_corners=False)
-        out3 = fluid.layers.resize_linear(
-            x, out_shape=shape_tensor, align_mode=1, align_corners=False)
-        out4 = fluid.layers.resize_linear(
-            x,
-            out_shape=[128, ],
-            actual_shape=actual_size,
-            align_mode=1,
-            align_corners=False)
-        out5 = fluid.layers.resize_linear(
-            x, scale=scale_tensor, align_mode=1, align_corners=False)
-
-        out6 = interpolate(
-            x,
-            scale_factor=scale_tensor,
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
-        out7 = interpolate(
-            x,
-            size=[128, ],
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
-        out8 = interpolate(
-            x,
-            size=shape_tensor,
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
+
+        out1 = fluid.layers.resize_linear(x,
+                                          out_shape=[
+                                              128,
+                                          ],
+                                          align_mode=1,
+                                          align_corners=False)
+        out2 = fluid.layers.resize_linear(x,
+                                          out_shape=[128],
+                                          align_mode=1,
+                                          align_corners=False)
+        out3 = fluid.layers.resize_linear(x,
+                                          out_shape=shape_tensor,
+                                          align_mode=1,
+                                          align_corners=False)
+        out4 = fluid.layers.resize_linear(x,
+                                          out_shape=[
+                                              128,
+                                          ],
+                                          actual_shape=actual_size,
+                                          align_mode=1,
+                                          align_corners=False)
+        out5 = fluid.layers.resize_linear(x,
+                                          scale=scale_tensor,
+                                          align_mode=1,
+                                          align_corners=False)
+
+        out6 = interpolate(x,
+                           scale_factor=scale_tensor,
+                           mode='linear',
+                           align_mode=1,
+                           align_corners=False,
+                           data_format='NCW')
+        out7 = interpolate(x,
+                           size=[
+                               128,
+                           ],
+                           mode='linear',
+                           align_mode=1,
+                           align_corners=False,
+                           data_format='NCW')
+        out8 = interpolate(x,
+                           size=shape_tensor,
+                           mode='linear',
+                           align_mode=1,
+                           align_corners=False,
+                           data_format='NCW')
 
         x_data = np.random.random((1, 3, 64)).astype("float32")
         dim_data = np.array([128]).astype("int32")
-        shape_data = np.array([128, ]).astype("int32")
-        actual_size_data = np.array([128, ]).astype("int32")
+        shape_data = np.array([
+            128,
+        ]).astype("int32")
+        actual_size_data = np.array([
+            128,
+        ]).astype("int32")
         scale_data = np.array([2.0]).astype("float32")
 
         if core.is_compiled_with_cuda():
@@ -303,34 +332,41 @@ def test_case(self):
             fetch_list=[out1, out2, out3, out4, out5, out6, out7, out8],
             return_numpy=True)
 
-        expect_res = linear_interp_np(
-            x_data, out_w=128, align_mode=1, align_corners=False)
+        expect_res = linear_interp_np(x_data,
+                                      out_w=128,
+                                      align_mode=1,
+                                      align_corners=False)
         for res in results:
             self.assertTrue(np.allclose(res, expect_res))
 
 
 class TestLinearInterpOpAPI2_0(unittest.TestCase):
+
     def test_case(self):
 
-        # dygraph 
+        # dygraph
         x_data = np.random.random((1, 3, 128)).astype("float32")
-        us_1 = paddle.nn.Upsample(
-            size=[64, ],
-            mode='linear',
-            align_mode=1,
-            align_corners=False,
-            data_format='NCW')
+        us_1 = paddle.nn.Upsample(size=[
+            64,
+        ],
+                                  mode='linear',
+                                  align_mode=1,
+                                  align_corners=False,
+                                  data_format='NCW')
         with fluid.dygraph.guard():
             x = fluid.dygraph.to_variable(x_data)
             interp = us_1(x)
 
-            expect = linear_interp_np(
-                x_data, out_w=64, align_mode=1, align_corners=False)
+            expect = linear_interp_np(x_data,
+                                      out_w=64,
+                                      align_mode=1,
+                                      align_corners=False)
 
             self.assertTrue(np.allclose(interp.numpy(), expect))
 
 
 class TestResizeLinearOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -379,30 +415,41 @@ def init_test_case(self):
         self.input_shape = [2, 3, 100]
         self.out_w = 50
         self.scale = 0.
-        self.out_size = np.array([50, ]).astype("int32")
+        self.out_size = np.array([
+            50,
+        ]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
 
 
 class TestLinearInterpOpException(unittest.TestCase):
+
     def test_exception(self):
+
         def input_shape_error():
             x1 = fluid.data(name="x1", shape=[1], dtype="float32")
-            out = fluid.layers.resize_linear(
-                x1, out_shape=[256, ], data_format='NCW')
+            out = fluid.layers.resize_linear(x1,
+                                             out_shape=[
+                                                 256,
+                                             ],
+                                             data_format='NCW')
 
         def data_format_error():
             x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32")
-            out = fluid.layers.resize_linear(
-                x2, out_shape=[256, ], data_format='NHWCD')
+            out = fluid.layers.resize_linear(x2,
+                                             out_shape=[
+                                                 256,
+                                             ],
+                                             data_format='NHWCD')
 
         def out_shape_error():
             x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32")
-            out = fluid.layers.resize_linear(
-                x3, out_shape=[
-                    256,
-                    256,
-                ], data_format='NHWC')
+            out = fluid.layers.resize_linear(x3,
+                                             out_shape=[
+                                                 256,
+                                                 256,
+                                             ],
+                                             data_format='NHWC')
 
         self.assertRaises(ValueError, input_shape_error)
         self.assertRaises(ValueError, data_format_error)
@@ -410,28 +457,36 @@ def out_shape_error():
 
 
 class TestLinearInterpOpError(unittest.TestCase):
+
     def test_error(self):
         with program_guard(Program(), Program()):
 
             def input_shape_error():
                 x1 = fluid.data(name="x1", shape=[1], dtype="float32")
-                out1 = paddle.nn.Upsample(
-                    size=[256, ], data_format='NCW', mode='linear')
+                out1 = paddle.nn.Upsample(size=[
+                    256,
+                ],
+                                          data_format='NCW',
+                                          mode='linear')
                 out1_res = out1(x1)
 
             def data_format_error():
                 x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32")
-                out2 = paddle.nn.Upsample(
-                    size=[256, ], data_format='NHWCD', mode='linear')
+                out2 = paddle.nn.Upsample(size=[
+                    256,
+                ],
+                                          data_format='NHWCD',
+                                          mode='linear')
                 out2_res = out2(x2)
 
             def out_shape_error():
                 x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32")
-                out3 = paddle.nn.Upsample(
-                    size=[
-                        256,
-                        256,
-                    ], data_format='NHWC', mode='linear')
+                out3 = paddle.nn.Upsample(size=[
+                    256,
+                    256,
+                ],
+                                          data_format='NHWC',
+                                          mode='linear')
                 out3_res = out3(x3)
 
             self.assertRaises(ValueError, input_shape_error)
diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py
index 65a6c21fb0720..e22fb537f4531 100644
--- a/python/paddle/fluid/tests/unittests/test_linspace.py
+++ b/python/paddle/fluid/tests/unittests/test_linspace.py
@@ -25,6 +25,7 @@
 
 
 class TestLinspaceOpCommonCase(OpTest):
+
     def setUp(self):
         self.op_type = "linspace"
         self.python_api = paddle.linspace
@@ -43,6 +44,7 @@ def test_check_output(self):
 
 
 class TestLinspaceOpReverseCase(OpTest):
+
     def setUp(self):
         self.op_type = "linspace"
         self.python_api = paddle.linspace
@@ -61,6 +63,7 @@ def test_check_output(self):
 
 
 class TestLinspaceOpNumOneCase(OpTest):
+
     def setUp(self):
         self.op_type = "linspace"
         self.python_api = paddle.linspace
@@ -79,6 +82,7 @@ def test_check_output(self):
 
 
 class TestLinspaceAPI(unittest.TestCase):
+
     def test_variable_input1(self):
         start = paddle.full(shape=[1], fill_value=0, dtype='float32')
         stop = paddle.full(shape=[1], fill_value=10, dtype='float32')
@@ -110,8 +114,11 @@ def test_dtype(self):
 
     def test_name(self):
         with paddle.static.program_guard(paddle.static.Program()):
-            out = paddle.linspace(
-                0, 10, 5, dtype='float32', name='linspace_res')
+            out = paddle.linspace(0,
+                                  10,
+                                  5,
+                                  dtype='float32',
+                                  name='linspace_res')
             assert 'linspace_res' in out.name
 
     def test_imperative(self):
@@ -134,6 +141,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestLinspaceOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index 23c4bc7b97818..0d328034ab7ea 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -54,11 +54,10 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
     config = fluid.DistributeTranspilerConfig()
     config.sync_mode = sync_mode
     t = fluid.DistributeTranspiler(config=config)
-    t.transpile(
-        trainer_id,
-        pservers=pserver_endpoints,
-        trainers=trainers,
-        sync_mode=sync_mode)
+    t.transpile(trainer_id,
+                pservers=pserver_endpoints,
+                trainers=trainers,
+                sync_mode=sync_mode)
     pserver_prog = t.get_pserver_program(current_endpoint)
     pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
     exe.run(pserver_startup)
@@ -92,11 +91,10 @@ def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
     config.slice_var_up = False
 
     t = fluid.DistributeTranspiler(config=config)
-    t.transpile(
-        trainer_id,
-        pservers=pserver_endpoints,
-        trainers=trainers,
-        sync_mode=sync_mode)
+    t.transpile(trainer_id,
+                pservers=pserver_endpoints,
+                trainers=trainers,
+                sync_mode=sync_mode)
     pserver_prog = t.get_pserver_program(ps2)
 
     # pserver2 have no parameter
@@ -114,6 +112,7 @@ def gen_complete_file_flag(flag_file):
 
 
 class TestListenAndServOp(unittest.TestCase):
+
     def setUp(self):
         self.ps_timeout = 200
         self.ip = "127.0.0.1"
@@ -122,10 +121,9 @@ def setUp(self):
         self.trainer_id = 0
 
     def _start_pserver(self, use_cuda, sync_mode, pserver_func):
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
-                  self.trainer_id))
+        p = Process(target=pserver_func,
+                    args=(use_cuda, sync_mode, self.ip, self.port,
+                          self.trainers, self.trainer_id))
         p.daemon = True
         p.start()
         return p
diff --git a/python/paddle/fluid/tests/unittests/test_load_op.py b/python/paddle/fluid/tests/unittests/test_load_op.py
index 885c26e2be0a6..2896ff218c7a4 100644
--- a/python/paddle/fluid/tests/unittests/test_load_op.py
+++ b/python/paddle/fluid/tests/unittests/test_load_op.py
@@ -40,8 +40,9 @@ def setUp(self):
                         self.ones)))
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(start_prog)
-        fluid.io.save_persistables(
-            exe, dirname="./model", main_program=main_prog)
+        fluid.io.save_persistables(exe,
+                                   dirname="./model",
+                                   main_program=main_prog)
 
     def test_load(self):
         main_prog = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py
index a5af6871be474..f3c8024a21ad3 100644
--- a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py
@@ -43,8 +43,9 @@ def setUp(self):
                         self.ones)))
         exe = fluid.Executor(fluid.XPUPlace(0))
         exe.run(start_prog)
-        fluid.io.save_persistables(
-            exe, dirname="./model", main_program=main_prog)
+        fluid.io.save_persistables(exe,
+                                   dirname="./model",
+                                   main_program=main_prog)
 
     def test_load_xpu(self):
         main_prog = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
index 35ad6fdb30e7b..ac88b7960541c 100644
--- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
+++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
@@ -26,21 +26,19 @@
 
 
 def convolutional_neural_network(img):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
     return prediction
 
@@ -58,6 +56,7 @@ def static_train_net(img, label):
 
 
 class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase):
+
     def setUp(self):
         self.seed = 90
         self.epoch_num = 1
@@ -71,24 +70,24 @@ def train_and_save_model(self, only_params=False):
             startup_program = fluid.default_startup_program()
             main_program = fluid.default_main_program()
 
-            img = fluid.data(
-                name='img', shape=[None, 1, 28, 28], dtype='float32')
+            img = fluid.data(name='img',
+                             shape=[None, 1, 28, 28],
+                             dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
 
             prediction, avg_loss = static_train_net(img, label)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
             exe = fluid.Executor(place)
 
             feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
             exe.run(startup_program)
 
-            train_reader = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.mnist.train(), buf_size=100),
-                batch_size=self.batch_size)
+            train_reader = paddle.batch(paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=100),
+                                        batch_size=self.batch_size)
 
             for _ in range(0, self.epoch_num):
                 for batch_id, data in enumerate(train_reader()):
@@ -105,8 +104,9 @@ def train_and_save_model(self, only_params=False):
                     param.name)
 
             if only_params:
-                fluid.io.save_params(
-                    exe, self.save_dirname, filename=self.params_filename)
+                fluid.io.save_params(exe,
+                                     self.save_dirname,
+                                     filename=self.params_filename)
             else:
                 fluid.io.save_inference_model(
                     self.save_dirname, ["img"], [prediction],
@@ -142,8 +142,8 @@ def test_load_with_model_filename(self):
             self.save_dirname, model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
-        new_load_param_dict = paddle.load(
-            self.save_dirname, model_filename=self.model_filename)
+        new_load_param_dict = paddle.load(self.save_dirname,
+                                          model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, new_load_param_dict)
 
     def test_load_with_param_filename(self):
@@ -156,8 +156,8 @@ def test_load_with_param_filename(self):
             self.save_dirname, params_filename=self.params_filename)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
-        new_load_param_dict = paddle.load(
-            self.save_dirname, params_filename=self.params_filename)
+        new_load_param_dict = paddle.load(self.save_dirname,
+                                          params_filename=self.params_filename)
         self.check_load_state_dict(orig_param_dict, new_load_param_dict)
 
     def test_load_with_model_and_param_filename(self):
@@ -172,10 +172,9 @@ def test_load_with_model_and_param_filename(self):
             model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
-        new_load_param_dict = paddle.load(
-            self.save_dirname,
-            params_filename=self.params_filename,
-            model_filename=self.model_filename)
+        new_load_param_dict = paddle.load(self.save_dirname,
+                                          params_filename=self.params_filename,
+                                          model_filename=self.model_filename)
         self.check_load_state_dict(orig_param_dict, new_load_param_dict)
 
     def test_load_state_dict_from_save_params(self):
diff --git a/python/paddle/fluid/tests/unittests/test_load_vars_shape_check.py b/python/paddle/fluid/tests/unittests/test_load_vars_shape_check.py
index 3e2e778d40e46..566c4929bf76a 100644
--- a/python/paddle/fluid/tests/unittests/test_load_vars_shape_check.py
+++ b/python/paddle/fluid/tests/unittests/test_load_vars_shape_check.py
@@ -22,6 +22,7 @@
 
 
 class TestLoadVarsShapeCheck(unittest.TestCase):
+
     def setUp(self):
         self.model_path = "./model_temp/"
 
diff --git a/python/paddle/fluid/tests/unittests/test_locality_aware_nms_op.py b/python/paddle/fluid/tests/unittests/test_locality_aware_nms_op.py
index 71e2e6fe5925a..54678b2f45c60 100644
--- a/python/paddle/fluid/tests/unittests/test_locality_aware_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_locality_aware_nms_op.py
@@ -122,8 +122,9 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
                 else:
                     score_index.append((scores[idx][c], c, idx))
 
-        sorted_score_index = sorted(
-            score_index, key=lambda tup: tup[0], reverse=True)
+        sorted_score_index = sorted(score_index,
+                                    key=lambda tup: tup[0],
+                                    reverse=True)
         sorted_score_index = sorted_score_index[:keep_top_k]
         selected_indices = {}
 
@@ -153,16 +154,15 @@ def batched_multiclass_nms(boxes,
 
     lod = []
     for n in range(batch_size):
-        nmsed_outs, nmsed_num = multiclass_nms(
-            boxes[n],
-            scores[n],
-            background,
-            score_threshold,
-            nms_threshold,
-            nms_top_k,
-            keep_top_k,
-            normalized,
-            shared=True)
+        nmsed_outs, nmsed_num = multiclass_nms(boxes[n],
+                                               scores[n],
+                                               background,
+                                               score_threshold,
+                                               nms_threshold,
+                                               nms_top_k,
+                                               keep_top_k,
+                                               normalized,
+                                               shared=True)
         lod.append(nmsed_num)
 
         if nmsed_num == 0:
@@ -175,13 +175,15 @@ def batched_multiclass_nms(boxes,
                     c, scores[n][c][idx], xmin, ymin, xmax, ymax,
                     idx + n * num_boxes
                 ])
-        sorted_det_out = sorted(
-            tmp_det_out, key=lambda tup: tup[0], reverse=False)
+        sorted_det_out = sorted(tmp_det_out,
+                                key=lambda tup: tup[0],
+                                reverse=False)
         det_outs.extend(sorted_det_out)
     return det_outs, lod
 
 
 class TestLocalAwareNMSOp(OpTest):
+
     def set_argument(self):
         self.score_threshold = 0.01
 
@@ -216,9 +218,10 @@ def softmax(x):
 
         boxes_copy = copy.deepcopy(boxes)
         scores_copy = copy.deepcopy(scores)
-        det_outs, lod = batched_multiclass_nms(
-            boxes_copy, scores_copy, background, score_threshold, nms_threshold,
-            nms_top_k, keep_top_k)
+        det_outs, lod = batched_multiclass_nms(boxes_copy, scores_copy,
+                                               background, score_threshold,
+                                               nms_threshold, nms_top_k,
+                                               keep_top_k)
 
         lod = [1] if not det_outs else lod
         det_outs = [[-1, 0]] if not det_outs else det_outs
@@ -243,11 +246,13 @@ def test_check_output(self):
 
 
 class TestLocalAwareNMSOpNoBoxes(TestLocalAwareNMSOp):
+
     def set_argument(self):
         self.score_threshold = 2.0
 
 
 class TestLocalAwareNMSOp4Points(OpTest):
+
     def set_argument(self):
         self.score_threshold = 0.01
 
@@ -267,24 +272,29 @@ def setUp(self):
         boxes = np.array([[[
             0.42078365, 0.58117018, 2.92776169, 3.28557757, 4.24344318,
             0.92196165, 2.72370856, -1.66141214
-        ], [
-            0.13856006, 1.86871034, 2.81287224, 3.61381734, 4.5505249,
-            0.51766346, 2.75630304, -1.91459389
-        ]], [[
-            1.57533883, 1.3217477, 3.07904942, 3.89512545, 4.78680923,
-            1.96914586, 3.539482, -1.59739244
-        ], [
-            0.55084125, 1.71596215, 2.52476074, 3.18940435, 5.09035159,
-            0.91959482, 3.71442385, -0.57299128
-        ]]])
+        ],
+                           [
+                               0.13856006, 1.86871034, 2.81287224, 3.61381734,
+                               4.5505249, 0.51766346, 2.75630304, -1.91459389
+                           ]],
+                          [[
+                              1.57533883, 1.3217477, 3.07904942, 3.89512545,
+                              4.78680923, 1.96914586, 3.539482, -1.59739244
+                          ],
+                           [
+                               0.55084125, 1.71596215, 2.52476074, 3.18940435,
+                               5.09035159, 0.91959482, 3.71442385, -0.57299128
+                           ]]])
 
         det_outs = np.array([[
             0., 1.5008917, 0.28206837, 1.2140071, 2.8712926, 3.4469104,
             4.3943763, 0.7232457, 2.7397292, -1.7858533
-        ], [
-            0., 1.1446586, 1.1640508, 1.4800063, 2.856528, 3.6118112, 4.908667,
-            1.5478, 3.609713, -1.1861432
-        ]])
+        ],
+                             [
+                                 0., 1.1446586, 1.1640508, 1.4800063, 2.856528,
+                                 3.6118112, 4.908667, 1.5478, 3.609713,
+                                 -1.1861432
+                             ]])
         lod = [1, 1]
         nmsed_outs = det_outs.astype('float32')
 
@@ -308,28 +318,31 @@ def test_check_output(self):
 
 
 class TestLocalityAwareNMSAPI(unittest.TestCase):
+
     def test_api(self):
         boxes = fluid.data(name='bboxes', shape=[None, 81, 8], dtype='float32')
         scores = fluid.data(name='scores', shape=[None, 1, 81], dtype='float32')
-        fluid.layers.locality_aware_nms(
-            bboxes=boxes,
-            scores=scores,
-            score_threshold=0.5,
-            nms_top_k=400,
-            nms_threshold=0.3,
-            keep_top_k=200,
-            normalized=False)
+        fluid.layers.locality_aware_nms(bboxes=boxes,
+                                        scores=scores,
+                                        score_threshold=0.5,
+                                        nms_top_k=400,
+                                        nms_threshold=0.3,
+                                        keep_top_k=200,
+                                        normalized=False)
 
 
 class TestLocalityAwareNMSError(unittest.TestCase):
+
     def test_error(self):
         boxes = fluid.data(name='bboxes', shape=[None, 81, 8], dtype='float32')
         scores = fluid.data(name='scores', shape=[None, 1, 81], dtype='float32')
 
-        boxes_int = fluid.data(
-            name='bboxes_int', shape=[None, 81, 8], dtype='int32')
-        scores_int = fluid.data(
-            name='scores_int', shape=[None, 1, 81], dtype='int32')
+        boxes_int = fluid.data(name='bboxes_int',
+                               shape=[None, 81, 8],
+                               dtype='int32')
+        scores_int = fluid.data(name='scores_int',
+                                shape=[None, 1, 81],
+                                dtype='int32')
         boxes_tmp = [1, 2]
         scores_tmp = [1, 2]
 
@@ -361,7 +374,7 @@ def test_error(self):
                           scores, 0.5, 400, keep_top_k)
 
         nms_threshold = int(0)
-        # type of nms_threshold must be int 
+        # type of nms_threshold must be int
         self.assertRaises(TypeError, fluid.layers.locality_aware_nms, boxes,
                           scores, 0.5, 400, 200, nms_threshold)
 
@@ -371,7 +384,7 @@ def test_error(self):
                           scores, 0.5, 400, 200, 0.5, nms_eta)
 
         bg_label = 1.5
-        # type of background_label must be int 
+        # type of background_label must be int
         self.assertRaises(TypeError, fluid.layers.locality_aware_nms, boxes,
                           scores, 0.5, 400, 200, 0.5, 1.0, bg_label)
 
diff --git a/python/paddle/fluid/tests/unittests/test_lod_append_op.py b/python/paddle/fluid/tests/unittests/test_lod_append_op.py
index 806880bdce09e..f5102b46586b2 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_append_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_append_op.py
@@ -25,12 +25,15 @@
 
 
 class TestLoDAppendAPI(unittest.TestCase):
+
     def test_api(self, use_cuda=False):
         main_program = Program()
         with fluid.program_guard(main_program):
             x = fluid.layers.data(name='x', shape=[6], dtype='float32')
-            level = fluid.layers.data(
-                name='level', shape=[3], dtype='int32', lod_level=0)
+            level = fluid.layers.data(name='level',
+                                      shape=[3],
+                                      dtype='int32',
+                                      lod_level=0)
             result = fluid.layers.lod_append(x, level)
 
             x_i = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0]).astype("float32")
@@ -42,14 +45,17 @@ def test_api(self, use_cuda=False):
                 place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
                 exe = fluid.Executor(place)
                 [out] = exe.run(fluid.default_main_program(),
-                                feed={'x': x_i,
-                                      'level': level_i},
+                                feed={
+                                    'x': x_i,
+                                    'level': level_i
+                                },
                                 fetch_list=[result],
                                 return_numpy=False)
                 self.assertEqual(out.recursive_sequence_lengths(), [[2, 4]])
 
 
 class TestLodAppendOpError(unittest.TestCase):
+
     def test_error(self):
         # The input(x) must be Variable.
         x1 = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float64")
@@ -63,8 +69,10 @@ def test_error(self):
         # Input(x) dtype must be float32 or float64 or int32 or int64
         for dtype in ["bool", "float16"]:
             x3 = fluid.layers.data(name='x3_' + dtype, shape=[4], dtype=dtype)
-            level3 = fluid.layers.data(
-                name='level3' + dtype, shape=[4], dtype='int32', lod_level=2)
+            level3 = fluid.layers.data(name='level3' + dtype,
+                                       shape=[4],
+                                       dtype='int32',
+                                       lod_level=2)
             self.assertRaises(TypeError, fluid.layers.lod_append, x3, level3)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
index 353cdc5ab8bde..438c6c82f38fb 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
@@ -26,6 +26,7 @@
 
 
 class TestLoDArrayLength(unittest.TestCase):
+
     def test_array_length(self):
         tmp = layers.zeros(shape=[10], dtype='int32')
         i = layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -38,6 +39,7 @@ def test_array_length(self):
 
 
 class TestLoDArrayLengthOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             #for ci coverage
@@ -47,6 +49,7 @@ def test_errors(self):
 
 
 class TestArrayLengthApi(unittest.TestCase):
+
     def test_api(self):
         paddle.disable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index d8b5c2eef37b9..ae1d61f3d44b0 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -23,6 +23,7 @@
 
 
 class TestLoDRankTable(unittest.TestCase):
+
     def test_lod_rank_table(self):
         x = data(name='x', shape=[100])
         cpu = core.CPUPlace()
@@ -33,8 +34,8 @@ def test_lod_rank_table(self):
 
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
-        tensor.set_recursive_sequence_lengths(
-            [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]])
+        tensor.set_recursive_sequence_lengths([[1, 2], [5, 1, 1],
+                                               [3, 1, 5, 1, 3, 3, 1]])
         exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
@@ -42,6 +43,7 @@ def test_lod_rank_table(self):
 
 
 class TestLoDRankTableError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x = numpy.random.random((2, 4)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
index 1fa172cf0312d..1565e0c563c33 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
@@ -22,6 +22,7 @@
 
 
 class TestLodResetOpByAttr(OpTest):
+
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float64")
@@ -45,6 +46,7 @@ def test_check_grad(self):
 
 
 class TestLodResetOpByInput(OpTest):
+
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float64")
@@ -69,6 +71,7 @@ def test_check_grad(self):
 
 
 class TestLodResetOpBoth(OpTest):
+
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float64")
@@ -93,6 +96,7 @@ def test_check_grad(self):
 
 
 class TestLodResetOpYIsLoDTensor(OpTest):
+
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float64")
@@ -112,6 +116,7 @@ def test_check_grad(self):
 
 
 class TestLodAppendOpByAttr(OpTest):
+
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float64")
@@ -135,6 +140,7 @@ def test_check_grad(self):
 
 
 class TestLodResetOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input must be Variable.
@@ -144,10 +150,13 @@ def test_errors(self):
 
             # Input(x) dtype must be float32 or float64 or int32 or int64
             for dtype in ["bool", "float16"]:
-                x2 = fluid.layers.data(
-                    name='x2' + dtype, shape=[4], dtype=dtype)
-                y2 = fluid.layers.data(
-                    name='y2' + dtype, shape=[4], dtype='int32', lod_level=2)
+                x2 = fluid.layers.data(name='x2' + dtype,
+                                       shape=[4],
+                                       dtype=dtype)
+                y2 = fluid.layers.data(name='y2' + dtype,
+                                       shape=[4],
+                                       dtype='int32',
+                                       lod_level=2)
                 self.assertRaises(TypeError, fluid.layers.lod_reset, x2, y2)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
index 678e9e2119725..793d0e9bf5ab8 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
@@ -21,6 +21,7 @@
 
 
 class TestLoDTensorArray(unittest.TestCase):
+
     def test_get_set(self):
         scope = core.Scope()
         arr = scope.var('tmp_lod_tensor_array')
@@ -45,13 +46,13 @@ def test_get_set(self):
             t.set_recursive_sequence_lengths([[1]])
             tensor_array[i] = t
             t = tensor_array[i]
-            self.assertEqual(
-                numpy.array(t), numpy.array(
-                    [i + 10], dtype='float32'))
+            self.assertEqual(numpy.array(t),
+                             numpy.array([i + 10], dtype='float32'))
             self.assertEqual([[1]], t.recursive_sequence_lengths())
 
 
 class TestCreateArray(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CPUPlace()
         self.shapes = [[10, 4], [8, 12], [1]]
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index 0148e15b07990..2911e7a6b71af 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -29,6 +29,7 @@
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
+
     def place(self):
         return core.CPUPlace()
 
@@ -41,11 +42,10 @@ def test_lod_tensor_to_array_level_0(self):
             numpy.array(x).astype('int32')
             for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
         ]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=[] * 6,
-            expect_max_len=6)
+        self.main(tensor=tensor,
+                  expect_array=expect,
+                  expect_lod=[] * 6,
+                  expect_max_len=6)
 
     def test_lod_tensor_to_array_level_0_empty_seq(self):
         tensor = core.LoDTensor()
@@ -56,11 +56,10 @@ def test_lod_tensor_to_array_level_0_empty_seq(self):
             numpy.array(x).astype('int32')
             for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
         ]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=[] * 6,
-            expect_max_len=6)
+        self.main(tensor=tensor,
+                  expect_array=expect,
+                  expect_lod=[] * 6,
+                  expect_max_len=6)
 
     def test_lod_tensor_to_array_level_1(self):
         tensor = core.LoDTensor()
@@ -69,19 +68,17 @@ def test_lod_tensor_to_array_level_1(self):
         tensor.set_recursive_sequence_lengths([[2, 3], [3, 6, 2, 6, 3]])
 
         expect = [
-            numpy.array(
-                [9, 10, 0, 1, 2], dtype='int32'), numpy.array(
-                    [11, 12, 13, 14, 15, 16, 3, 4, 5, 6, 7, 8], dtype='int32'),
-            numpy.array(
-                [17, 18, 19], dtype='int32')
+            numpy.array([9, 10, 0, 1, 2], dtype='int32'),
+            numpy.array([11, 12, 13, 14, 15, 16, 3, 4, 5, 6, 7, 8],
+                        dtype='int32'),
+            numpy.array([17, 18, 19], dtype='int32')
         ]
 
         lod = [[[2, 3]], [[6, 6]], [[3]]]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=lod,
-            expect_max_len=3)
+        self.main(tensor=tensor,
+                  expect_array=expect,
+                  expect_lod=lod,
+                  expect_max_len=3)
 
     def test_lod_tensor_to_array_level_1_empty_seq(self):
         tensor = core.LoDTensor()
@@ -92,19 +89,16 @@ def test_lod_tensor_to_array_level_1_empty_seq(self):
             [[3, 2, 4, 2], [3, 4, 4, 0, 1, 5, 2, 2, 2, 7, 1]])
 
         expect = [
-            numpy.array(
-                item, dtype='int32')
-            for item in [[
+            numpy.array(item, dtype='int32') for item in [[
                 12, 13, 14, 15, 16, 0, 1, 2, 23, 24, 25, 26, 27, 28, 29
             ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
         ]
 
         lod = [[[5, 3, 0, 7]], [[2, 4, 1, 1]], [[2, 4]], [[2]]]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=lod,
-            expect_max_len=4)
+        self.main(tensor=tensor,
+                  expect_array=expect,
+                  expect_lod=lod,
+                  expect_max_len=4)
 
     def test_lod_tensor_to_array_level_2(self):
         tensor = core.LoDTensor()
@@ -115,18 +109,17 @@ def test_lod_tensor_to_array_level_2(self):
              [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
 
         expect = [
-            numpy.array(
-                item, dtype='int32')
-            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], list(
-                range(22, 39)) + list(range(7, 21)), list(range(39, 46))]
+            numpy.array(item, dtype='int32')
+            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49],
+                         list(range(22, 39)) + list(range(7, 21)),
+                         list(range(39, 46))]
         ]
         lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]],
                [[2], [6, 1]]]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=lod,
-            expect_max_len=3)
+        self.main(tensor=tensor,
+                  expect_array=expect,
+                  expect_lod=lod,
+                  expect_max_len=3)
 
     def test_lod_tensor_to_array_level_2_skip_level(self):
         tensor = core.LoDTensor()
@@ -135,12 +128,11 @@ def test_lod_tensor_to_array_level_2_skip_level(self):
         tensor.set_recursive_sequence_lengths(
             [[2, 3, 1], [2, 3, 1, 4, 2, 1],
              [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
-        self.main(
-            tensor=tensor,
-            expect_array=None,
-            expect_lod=None,
-            expect_max_len=4,
-            level=1)
+        self.main(tensor=tensor,
+                  expect_array=None,
+                  expect_lod=None,
+                  expect_max_len=4,
+                  level=1)
 
     def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
         place = self.place()
@@ -178,20 +170,23 @@ def check_array_same(self, array, expect_tensor, expect_lod):
             self.assertEqual(exp_lod, array[i].recursive_sequence_lengths())
 
     def check_tensor_same(self, actual, expect):
-        self.assertTrue(
-            numpy.allclose(numpy.array(actual), numpy.array(expect)))
+        self.assertTrue(numpy.allclose(numpy.array(actual),
+                                       numpy.array(expect)))
         self.assertEqual(actual.recursive_sequence_lengths(),
                          expect.recursive_sequence_lengths())
 
 
 class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
+
     def test_grad(self):
         place = core.CPUPlace()
         program = Program()
 
         with program_guard(program):
-            x = layers.data(
-                name='x', shape=[1], dtype='float32', stop_gradient=False)
+            x = layers.data(name='x',
+                            shape=[1],
+                            dtype='float32',
+                            stop_gradient=False)
             table = lod_rank_table(x, level=0)
             array = lod_tensor_to_array(x, table)
             result = array_to_lod_tensor(array, table)
@@ -208,11 +203,10 @@ def test_grad(self):
 
         exe = Executor(place)
         g_out = [
-            numpy.array(item).sum()
-            for item in exe.run(program,
-                                feed={'x': tensor},
-                                fetch_list=[g_vars],
-                                return_numpy=False)
+            numpy.array(item).sum() for item in exe.run(program,
+                                                        feed={'x': tensor},
+                                                        fetch_list=[g_vars],
+                                                        return_numpy=False)
         ]
         g_out_sum = numpy.array(g_out).sum()
 
@@ -220,6 +214,7 @@ def test_grad(self):
 
 
 class TestLoDTensorArrayError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x = numpy.random.random((10)).astype("float32")
@@ -252,6 +247,7 @@ def test_table_list_Variable():
 
 
 class TestArrayLoDTensorError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x = numpy.random.random((10)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
index 0c57c0addf261..5f7f3019bd10e 100644
--- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
@@ -25,6 +25,7 @@ def sigmoid_array(x):
 
 
 class TestLogLossOp(OpTest):
+
     def setUp(self):
         self.op_type = 'log_loss'
         samples_num = 100
@@ -51,6 +52,7 @@ def test_check_grad(self):
 
 
 class TestLogLossOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program()):
 
diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py
index b3b164725fc34..b1b21e0666f5d 100644
--- a/python/paddle/fluid/tests/unittests/test_log_softmax.py
+++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py
@@ -40,6 +40,7 @@ def ref_log_softmax_grad(x, axis):
 
 
 class TestLogSoftmaxOp(OpTest):
+
     def setUp(self):
         self.op_type = 'log_softmax'
         self.python_api = F.log_softmax
@@ -63,16 +64,19 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], ['Out'], user_defined_grads=[self.x_grad], check_eager=True)
+        self.check_grad(['X'], ['Out'],
+                        user_defined_grads=[self.x_grad],
+                        check_eager=True)
 
 
 class TestLogSoftmaxShape(TestLogSoftmaxOp):
+
     def set_attrs(self):
         self.shape = [12, 10]
 
 
 class TestLogSoftmaxAxis(TestLogSoftmaxOp):
+
     def set_attrs(self):
         self.axis = 1
 
@@ -80,6 +84,7 @@ def set_attrs(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestLogSoftmaxBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = 'log_softmax'
         self.python_api = F.log_softmax
@@ -101,13 +106,13 @@ def test_check_output(self):
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], ['Out'],
-            user_defined_grads=[self.x_grad],
-            check_eager=True)
+        self.check_grad_with_place(place, ['X'], ['Out'],
+                                   user_defined_grads=[self.x_grad],
+                                   check_eager=True)
 
 
 class TestNNLogSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32)
@@ -140,6 +145,7 @@ def test_check_api(self):
 
 
 class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py
index e2c7cf3a5bb2b..80f65401176ca 100755
--- a/python/paddle/fluid/tests/unittests/test_logical_op.py
+++ b/python/paddle/fluid/tests/unittests/test_logical_op.py
@@ -178,10 +178,10 @@ def test(unit_test, use_gpu=False, test_error=False):
             META_DATA = dict(TEST_META_WRONG_SHAPE_DATA)
         for shape_data in META_DATA.values():
             for data_type in SUPPORTED_DTYPES:
-                meta_data['x_np'] = np_data_generator(
-                    shape_data['x_shape'], dtype=data_type)
-                meta_data['y_np'] = np_data_generator(
-                    shape_data['y_shape'], dtype=data_type)
+                meta_data['x_np'] = np_data_generator(shape_data['x_shape'],
+                                                      dtype=data_type)
+                meta_data['y_np'] = np_data_generator(shape_data['y_shape'],
+                                                      dtype=data_type)
                 if meta_data['binary_op'] and test_error:
                     # catch C++ Exception
                     unit_test.assertRaises(BaseException, run_static,
@@ -197,12 +197,13 @@ def test(unit_test, use_gpu=False, test_error=False):
                 else:
                     np_result = np_op(meta_data['x_np'])
                 unit_test.assertTrue((static_result == np_result).all())
-                unit_test.assertTrue((dygraph_result.numpy() == np_result).all(
-                ))
+                unit_test.assertTrue(
+                    (dygraph_result.numpy() == np_result).all())
                 unit_test.assertTrue((eager_result.numpy() == np_result).all())
 
 
 def test_type_error(unit_test, use_gpu, type_str_map):
+
     def check_type(op_str, x, y, binary_op):
         op = getattr(paddle, op_str)
         error_type = ValueError
@@ -237,10 +238,12 @@ def check_type(op_str, x, y, binary_op):
         startup_program = paddle.static.Program()
         main_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            x = paddle.static.data(
-                name='x', shape=[10], dtype=type_str_map['x'])
-            y = paddle.static.data(
-                name='y', shape=[10], dtype=type_str_map['y'])
+            x = paddle.static.data(name='x',
+                                   shape=[10],
+                                   dtype=type_str_map['x'])
+            y = paddle.static.data(name='y',
+                                   shape=[10],
+                                   dtype=type_str_map['y'])
             check_type(meta_data['op_str'], x, y, binary_op)
 
 
@@ -252,6 +255,7 @@ def type_map_factory():
 
 
 class TestCPU(unittest.TestCase):
+
     def test(self):
         test(self)
 
@@ -265,6 +269,7 @@ def test_type_error(self):
 
 
 class TestCUDA(unittest.TestCase):
+
     def test(self):
         test(self, True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_logit_op.py b/python/paddle/fluid/tests/unittests/test_logit_op.py
index 9b46039da13b1..44865936af9e5 100644
--- a/python/paddle/fluid/tests/unittests/test_logit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_logit_op.py
@@ -17,6 +17,7 @@
 from op_test import OpTest
 import paddle
 from paddle.fluid.framework import _test_eager_guard
+
 np.random.seed(10)
 
 
@@ -36,6 +37,7 @@ def logit_grad(x, eps=1e-8):
 
 
 class TestLogitOp(OpTest):
+
     def setUp(self):
         self.op_type = 'logit'
         self.python_api = paddle.logit
@@ -57,21 +59,25 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], ['Out'], user_defined_grads=[self.x_grad], check_eager=True)
+        self.check_grad(['X'], ['Out'],
+                        user_defined_grads=[self.x_grad],
+                        check_eager=True)
 
 
 class TestLogitShape(TestLogitOp):
+
     def set_attrs(self):
         self.shape = [2, 60]
 
 
 class TestLogitEps(TestLogitOp):
+
     def set_attrs(self):
         self.eps = 1e-8
 
 
 class TestLogitAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_shape = [120]
         self.x = np.random.uniform(0., 1., self.x_shape).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_logspace.py b/python/paddle/fluid/tests/unittests/test_logspace.py
index ffa9885e7671e..11da5d4accb9f 100644
--- a/python/paddle/fluid/tests/unittests/test_logspace.py
+++ b/python/paddle/fluid/tests/unittests/test_logspace.py
@@ -21,6 +21,7 @@
 
 
 class TestLogspaceOpCommonCase(OpTest):
+
     def setUp(self):
         self.op_type = "logspace"
         dtype = 'float32'
@@ -39,6 +40,7 @@ def test_check_output(self):
 
 
 class TestLogspaceOpReverseCase(OpTest):
+
     def setUp(self):
         self.op_type = "logspace"
         dtype = 'float32'
@@ -57,6 +59,7 @@ def test_check_output(self):
 
 
 class TestLogspaceOpNumOneCase(OpTest):
+
     def setUp(self):
         self.op_type = "logspace"
         dtype = 'float32'
@@ -75,6 +78,7 @@ def test_check_output(self):
 
 
 class TestLogspaceOpMinusBaseCase(OpTest):
+
     def setUp(self):
         self.op_type = "logspace"
         dtype = 'float32'
@@ -93,6 +97,7 @@ def test_check_output(self):
 
 
 class TestLogspaceOpZeroBaseCase(OpTest):
+
     def setUp(self):
         self.op_type = "logspace"
         dtype = 'float32'
@@ -111,6 +116,7 @@ def test_check_output(self):
 
 
 class TestLogspaceAPI(unittest.TestCase):
+
     def test_variable_input1(self):
         paddle.enable_static()
         prog = paddle.static.Program()
@@ -152,8 +158,12 @@ def test_dtype(self):
 
     def test_name(self):
         with paddle.static.program_guard(paddle.static.Program()):
-            out = paddle.logspace(
-                0, 10, 5, 2, dtype='float32', name='logspace_res')
+            out = paddle.logspace(0,
+                                  10,
+                                  5,
+                                  2,
+                                  dtype='float32',
+                                  name='logspace_res')
             assert 'logspace_res' in out.name
 
     def test_imperative(self):
@@ -171,6 +181,7 @@ def test_imperative(self):
 
 
 class TestLogspaceOpError(unittest.TestCase):
+
     def test_errors(self):
         with paddle.static.program_guard(paddle.static.Program()):
 
@@ -200,8 +211,9 @@ def test_num_type():
             self.assertRaises(TypeError, test_num_type)
 
             def test_start_dtype():
-                start = paddle.static.data(
-                    shape=[1], dtype="float64", name="start")
+                start = paddle.static.data(shape=[1],
+                                           dtype="float64",
+                                           name="start")
                 paddle.logspace(start, 10, 1, 2, dtype="float32")
 
             self.assertRaises(ValueError, test_start_dtype)
@@ -213,15 +225,17 @@ def test_end_dtype():
             self.assertRaises(ValueError, test_end_dtype)
 
             def test_num_dtype():
-                num = paddle.static.data(
-                    shape=[1], dtype="float32", name="step")
+                num = paddle.static.data(shape=[1],
+                                         dtype="float32",
+                                         name="step")
                 paddle.logspace(0, 10, num, 2, dtype="float32")
 
             self.assertRaises(TypeError, test_num_dtype)
 
             def test_base_dtype():
-                base = paddle.static.data(
-                    shape=[1], dtype="float64", name="end")
+                base = paddle.static.data(shape=[1],
+                                          dtype="float64",
+                                          name="end")
                 paddle.logspace(0, 10, 1, base, dtype="float32")
 
             self.assertRaises(ValueError, test_base_dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py
index 91eb65ef284a5..3e54147b18068 100644
--- a/python/paddle/fluid/tests/unittests/test_logsumexp.py
+++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py
@@ -36,6 +36,7 @@ def logsumexp_wrapper(x, axis=None, keepdim=False, allreduce=False):
 
 
 class TestLogsumexp(OpTest):
+
     def setUp(self):
         self.op_type = 'logsumexp'
         self.python_api = logsumexp_wrapper
@@ -85,16 +86,19 @@ def calc_grad(self):
 
 
 class TestLogsumexp_shape(TestLogsumexp):
+
     def set_attrs(self):
         self.shape = [4, 5, 6]
 
 
 class TestLogsumexp_axis(TestLogsumexp):
+
     def set_attrs(self):
         self.axis = [0, -1]
 
 
 class TestLogsumexp_axis_all(TestLogsumexp):
+
     def set_attrs(self):
         self.axis = [0, 1, 2, 3]
 
@@ -105,11 +109,13 @@ def set_attrs_addition(self):
 
 
 class TestLogsumexp_keepdim(TestLogsumexp):
+
     def set_attrs(self):
         self.keepdim = True
 
 
 class TestLogsumexp_reduce_all(TestLogsumexp):
+
     def set_attrs(self):
         self.reduce_all = True
 
@@ -120,6 +126,7 @@ def set_attrs_addition(self):
 
 
 class TestLogsumexpError(unittest.TestCase):
+
     def test_errors(self):
         with paddle.static.program_guard(paddle.static.Program()):
             self.assertRaises(TypeError, paddle.logsumexp, 1)
@@ -128,6 +135,7 @@ def test_errors(self):
 
 
 class TestLogsumexpAPI(unittest.TestCase):
+
     def setUp(self):
         self.shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1, 1, self.shape).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_lookahead.py b/python/paddle/fluid/tests/unittests/test_lookahead.py
index 263310043a5f7..efbc28cfa6cea 100644
--- a/python/paddle/fluid/tests/unittests/test_lookahead.py
+++ b/python/paddle/fluid/tests/unittests/test_lookahead.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -30,6 +30,7 @@
 
 
 class TestLookAhead(unittest.TestCase):
+
     def test_lookahead_static(self):
         paddle.enable_static()
         place = fluid.CPUPlace()
@@ -65,8 +66,9 @@ def test_lookahead_static(self):
             if i == 0:
                 slow_param = latest_b
             if (i + 1) % LOOKAHEAD_K == 0:
-                self.assertAlmostEqual(
-                    slow_param.all(), latest_b.all(), delta=5e-3)
+                self.assertAlmostEqual(slow_param.all(),
+                                       latest_b.all(),
+                                       delta=5e-3)
             fast_param = latest_b - SGD_LR * b_grad
 
     def func_test_look_ahead_dygraph(self):
@@ -79,6 +81,7 @@ def func_test_look_ahead_dygraph(self):
 
         # define a random dataset
         class RandomDataset(paddle.io.Dataset):
+
             def __init__(self, num_samples):
                 self.num_samples = num_samples
 
@@ -92,6 +95,7 @@ def __len__(self):
                 return self.num_samples
 
         class LinearNet(nn.Layer):
+
             def __init__(self):
                 super(LinearNet, self).__init__()
                 self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
@@ -111,35 +115,34 @@ def train(layer, loader, loss_fn, opt):
                     out = layer(image)
                     loss = loss_fn(out, label)
                     loss.backward()
-                    fast_param = (
-                        layer.bias.numpy() - SGD_LR * layer.bias.grad.numpy())
+                    fast_param = (layer.bias.numpy() -
+                                  SGD_LR * layer.bias.grad.numpy())
                     opt.step()
                     if idx == 1:
                         slow_param = fast_param
                     if idx % LOOKAHEAD_K == 0:
                         slow_param = slow_param + LOOKAHEAD_ALPHA * (
                             fast_param - slow_param)
-                        self.assertAlmostEqual(
-                            np.mean(slow_param),
-                            np.mean(layer.bias.numpy()),
-                            delta=5e-3)
+                        self.assertAlmostEqual(np.mean(slow_param),
+                                               np.mean(layer.bias.numpy()),
+                                               delta=5e-3)
                     opt.clear_grad()
 
         layer = LinearNet()
         loss_fn = nn.CrossEntropyLoss()
         optimizer = paddle.optimizer.SGD(learning_rate=SGD_LR,
                                          parameters=layer.parameters())
-        lookahead = paddle.incubate.optimizer.LookAhead(
-            optimizer, alpha=LOOKAHEAD_ALPHA, k=LOOKAHEAD_K)
+        lookahead = paddle.incubate.optimizer.LookAhead(optimizer,
+                                                        alpha=LOOKAHEAD_ALPHA,
+                                                        k=LOOKAHEAD_K)
 
         # create data loader
         dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
-        loader = paddle.io.DataLoader(
-            dataset,
-            batch_size=BATCH_SIZE,
-            shuffle=True,
-            drop_last=True,
-            num_workers=2)
+        loader = paddle.io.DataLoader(dataset,
+                                      batch_size=BATCH_SIZE,
+                                      shuffle=True,
+                                      drop_last=True,
+                                      num_workers=2)
 
         train(layer, loader, loss_fn, lookahead)
 
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py
index 0a247b4dbe0a9..9dc7c1aa63656 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py
@@ -16,9 +16,10 @@
 
 import unittest
 import numpy as np
-from paddle.fluid.tests.unittests.op_test import (
-    OpTest, convert_float_to_uint16, convert_uint16_to_float,
-    skip_check_grad_ci)
+from paddle.fluid.tests.unittests.op_test import (OpTest,
+                                                  convert_float_to_uint16,
+                                                  convert_uint16_to_float,
+                                                  skip_check_grad_ci)
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
@@ -49,6 +50,7 @@ def _get_grad(weights, ids, flat_ids, op_version="lookup_table"):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestLookupTableBF16Op(OpTest):
+
     def init_test(self):
         self.op_type = "lookup_table"
         self.ids_shape = (4, 1)
@@ -75,19 +77,19 @@ def test_check_output(self):
         self.check_output_with_place(core.CPUPlace(), check_dygraph=False)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            core.CPUPlace(), ['W'],
-            'Out',
-            no_grad_set=set('Ids'),
-            check_dygraph=False,
-            max_relative_error=1.5e-2,
-            user_defined_grads=[self.w_grad_fp32],
-            user_defined_grad_outputs=[self.out_bf16])
+        self.check_grad_with_place(core.CPUPlace(), ['W'],
+                                   'Out',
+                                   no_grad_set=set('Ids'),
+                                   check_dygraph=False,
+                                   max_relative_error=1.5e-2,
+                                   user_defined_grads=[self.w_grad_fp32],
+                                   user_defined_grad_outputs=[self.out_bf16])
 
 
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestLookupTableBF16OpIds4D(TestLookupTableBF16Op):
+
     def init_test(self):
         self.op_type = "lookup_table"
         self.ids_shape = (2, 4, 5, 1)
@@ -96,14 +98,15 @@ def init_test(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestLookupTableBF16OpWIsSelectedRows(unittest.TestCase):
+
     def init_test(self):
         self.op_type = "lookup_table"
         self.ids_shape = (10, 1)
 
     def setUp(self):
         self.init_test()
-        self.ids = np.random.randint(
-            low=0, high=15, size=self.ids_shape).astype("int64")
+        self.ids = np.random.randint(low=0, high=15,
+                                     size=self.ids_shape).astype("int64")
         self.flat_ids = self.ids.flatten()
         self.w_fp32 = np.random.random((15, 32)).astype("float32")
         self.w_bf16 = convert_float_to_uint16(self.w_fp32)
@@ -147,6 +150,7 @@ def test_check_output(self):
                  "place does not support BF16 evaluation")
 class TestLookupTableBF16OpWIsSelectedRows4DIds(
         TestLookupTableBF16OpWIsSelectedRows):
+
     def init_test(self):
         self.op_type = "lookup_table"
         self.ids_shape = (3, 4, 5, 1)
@@ -163,6 +167,7 @@ def setUp(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestLookupTableBF16OpWithPadding(TestLookupTableBF16Op):
+
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
@@ -178,6 +183,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestLookupTableBF16OpIds4DPadding(TestLookupTableBF16OpIds4D):
+
     def test_check_output(self):
         ids = self.inputs['Ids']
         flatten_idx = ids.flatten()
@@ -198,8 +204,8 @@ def set_initializer(self):
     def setUp(self):
         self.ids_shape = [4, 1]
         self.w_shape = [10, 64]
-        self.ids = np.random.randint(
-            low=0, high=9, size=self.ids_shape).astype("int64")
+        self.ids = np.random.randint(low=0, high=9,
+                                     size=self.ids_shape).astype("int64")
         self.flat_ids = self.ids.flatten()
         self.value = 3.0
         self.w_fp32 = np.full(self.w_shape, self.value)
@@ -210,13 +216,13 @@ def setUp(self):
 
         with fluid.program_guard(self.prog, self.startup_prog):
             x = fluid.layers.data(name='x', shape=self.ids_shape, dtype='int64')
-            self.emb = fluid.layers.embedding(
-                input=x,
-                size=self.w_shape,
-                param_attr=fluid.ParamAttr(
-                    name="emb_weight", initializer=self.initializer),
-                is_sparse=False,
-                dtype="uint16")  # bfloat16
+            self.emb = fluid.layers.embedding(input=x,
+                                              size=self.w_shape,
+                                              param_attr=fluid.ParamAttr(
+                                                  name="emb_weight",
+                                                  initializer=self.initializer),
+                                              is_sparse=False,
+                                              dtype="uint16")  # bfloat16
         exe = fluid.Executor(self.place)
         exe.run(self.startup_prog)
         self.result = exe.run(self.prog,
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_dequant_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_dequant_op.py
index 689b9992a6d9f..934504e02bf6e 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_dequant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_dequant_op.py
@@ -26,6 +26,7 @@
 
 
 class TestLookupTableDequantOp(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table_dequant"
         table = np.random.random((17, 32)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
index f3546a7c50d97..9d9fa9e385d8e 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
@@ -26,6 +26,7 @@
 
 
 class TestLookupTableOp(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table"
         table = np.random.random((17, 31)).astype("float64")
@@ -42,11 +43,12 @@ def test_check_grad(self):
 
 
 class TestLookupTableOpWithTensorIds(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table"
         table = np.random.random((17, 31)).astype("float64")
-        ids = np.random.randint(
-            low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
+        ids = np.random.randint(low=0, high=17,
+                                size=(2, 4, 5, 1)).astype("int64")
         self.inputs = {'W': table, 'Ids': ids}
         self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
 
@@ -62,6 +64,7 @@ def test_check_grad(self):
     "the gradient of paddings makes no sense and we don't "
     "test the gradient here.")
 class TestLookupTableOpWithPadding(TestLookupTableOp):
+
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
@@ -75,6 +78,7 @@ def test_check_output(self):
     "the gradient of paddings makes no sense and we don't "
     "test the gradient here.")
 class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
+
     def test_check_output(self):
         ids = self.inputs['Ids']
         flatten_idx = ids.flatten()
@@ -85,6 +89,7 @@ def test_check_output(self):
 
 
 class TestLookupTableWIsSelectedRows(unittest.TestCase):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
         ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
@@ -137,12 +142,13 @@ def test_w_is_selected_rows(self):
             self.check_with_place(place)
 
 
-class TestLookupTableWithTensorIdsWIsSelectedRows(
-        TestLookupTableWIsSelectedRows):
+class TestLookupTableWithTensorIdsWIsSelectedRows(TestLookupTableWIsSelectedRows
+                                                  ):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.random.randint(
-            low=0, high=6, size=(2, 4, 3, 1)).astype("int64")
+        ids_array = np.random.randint(low=0, high=6,
+                                      size=(2, 4, 3, 1)).astype("int64")
         ids_tensor.set(ids_array, place)
         return ids_array
 
@@ -152,6 +158,7 @@ def check_result(self, ids_array, result_array):
 
 
 class TestEmbedOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.randint(0, 10, (4, 1)).astype("int64")
@@ -172,8 +179,9 @@ def test_input_dtype():
             def test_param_dtype():
                 # dtype must be float32 or float64
                 input2 = fluid.data(name='x2', shape=[4, 1], dtype='int64')
-                fluid.layers.embedding(
-                    input=input2, size=(10, 64), dtype='int64')
+                fluid.layers.embedding(input=input2,
+                                       size=(10, 64),
+                                       dtype='int64')
 
             self.assertRaises(TypeError, test_param_dtype)
 
@@ -182,10 +190,11 @@ def test_param_dtype():
 
 
 class TestLookupTableOpInt8(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table"
-        table = np.random.randint(
-            low=-128, high=127, size=(17, 31)).astype("int8")
+        table = np.random.randint(low=-128, high=127,
+                                  size=(17, 31)).astype("int8")
         ids = np.random.randint(0, 17, 4).astype("int64")
         ids_expand = np.expand_dims(ids, axis=1)
         self.inputs = {'W': table, 'Ids': ids_expand}
@@ -195,18 +204,19 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        # since int8 type only be used in test and inference, there is 
+        # since int8 type only be used in test and inference, there is
         # no gradient implement, so we don't need to test it
         pass
 
 
 class TestLookupTableOpWithTensorIdsInt8(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table"
-        table = np.random.randint(
-            low=-128, high=127, size=(17, 31)).astype("int8")
-        ids = np.random.randint(
-            low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
+        table = np.random.randint(low=-128, high=127,
+                                  size=(17, 31)).astype("int8")
+        ids = np.random.randint(low=0, high=17,
+                                size=(2, 4, 5, 1)).astype("int64")
         self.inputs = {'W': table, 'Ids': ids}
         self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
 
@@ -214,12 +224,13 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        # since int8 type only be used in test and inference, there is 
+        # since int8 type only be used in test and inference, there is
         # no gradient implement, so we don't need to test it
         pass
 
 
 class TestLookupTableOpWithPaddingInt8(TestLookupTableOpInt8):
+
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
@@ -235,6 +246,7 @@ def test_check_grad(self):
 
 class TestLookupTableOpWithTensorIdsAndPaddingInt8(
         TestLookupTableOpWithTensorIdsInt8):
+
     def test_check_output(self):
         ids = self.inputs['Ids']
         flatten_idx = ids.flatten()
@@ -250,6 +262,7 @@ def test_check_grad(self):
 
 
 class TestLookupTableWIsSelectedRowsInt8(unittest.TestCase):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
         ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
@@ -304,10 +317,11 @@ def test_w_is_selected_rows(self):
 
 class TestLookupTableWithTensorIdsWIsSelectedRowsInt8(
         TestLookupTableWIsSelectedRowsInt8):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.random.randint(
-            low=0, high=6, size=(2, 4, 3, 1)).astype("int64")
+        ids_array = np.random.randint(low=0, high=6,
+                                      size=(2, 4, 3, 1)).astype("int64")
         ids_tensor.set(ids_array, place)
         return ids_array
 
@@ -318,10 +332,11 @@ def check_result(self, ids_array, result_array):
 
 @skip_check_grad_ci(reason="Int16 type only be used in test and inference.")
 class TestLookupTableOpInt16(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table"
-        table = np.random.randint(
-            low=-128, high=127, size=(17, 31)).astype("int16")
+        table = np.random.randint(low=-128, high=127,
+                                  size=(17, 31)).astype("int16")
         ids = np.random.randint(0, 17, 4).astype("int64")
         ids_expand = np.expand_dims(ids, axis=1)
         self.inputs = {'W': table, 'Ids': ids_expand}
@@ -333,12 +348,13 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="Int16 type only be used in test and inference.")
 class TestLookupTableOpWithTensorIdsInt16(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table"
-        table = np.random.randint(
-            low=-128, high=127, size=(17, 31)).astype("int16")
-        ids = np.random.randint(
-            low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
+        table = np.random.randint(low=-128, high=127,
+                                  size=(17, 31)).astype("int16")
+        ids = np.random.randint(low=0, high=17,
+                                size=(2, 4, 5, 1)).astype("int64")
         self.inputs = {'W': table, 'Ids': ids}
         self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
 
@@ -348,6 +364,7 @@ def test_check_output(self):
 
 @skip_check_grad_ci(reason="Int16 type only be used in test and inference.")
 class TestLookupTableOpWithPaddingInt16(TestLookupTableOpInt16):
+
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
@@ -359,6 +376,7 @@ def test_check_output(self):
 @skip_check_grad_ci(reason="Int16 type only be used in test and inference.")
 class TestLookupTableOpWithTensorIdsAndPaddingInt16(
         TestLookupTableOpWithTensorIdsInt16):
+
     def test_check_output(self):
         ids = self.inputs['Ids']
         flatten_idx = ids.flatten()
@@ -369,6 +387,7 @@ def test_check_output(self):
 
 
 class TestLookupTableWIsSelectedRowsInt16(unittest.TestCase):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
         ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
@@ -422,10 +441,11 @@ def test_w_is_selected_rows(self):
 
 class TestLookupTableWithTensorIdsWIsSelectedRowsInt16(
         TestLookupTableWIsSelectedRowsInt16):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.random.randint(
-            low=0, high=6, size=(2, 4, 3, 1)).astype("int64")
+        ids_array = np.random.randint(low=0, high=6,
+                                      size=(2, 4, 3, 1)).astype("int64")
         ids_tensor.set(ids_array, place)
         return ids_array
 
@@ -435,13 +455,13 @@ def check_result(self, ids_array, result_array):
 
 
 class TestOutDtype(unittest.TestCase):
+
     def test_dtype(self):
         api_fn = F.embedding
-        check_out_dtype(
-            api_fn,
-            in_specs=[([10, 16], 'int64'), ([100, 64], )],
-            expect_dtypes=['float32', 'float64'],
-            target_index=1)
+        check_out_dtype(api_fn,
+                        in_specs=[([10, 16], 'int64'), ([100, 64], )],
+                        expect_dtypes=['float32', 'float64'],
+                        target_index=1)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py
index 0776ae852d19e..06b232443a8ea 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py
@@ -28,6 +28,7 @@
 
 
 class TestLookupTableV2BF16Op(TestLookupTableBF16Op):
+
     def init_test(self):
         self.op_type = "lookup_table_v2"
         self.ids_shape = (4)
@@ -35,6 +36,7 @@ def init_test(self):
 
 
 class TestLookupTableV2BF16OpIds4D(TestLookupTableBF16OpIds4D):
+
     def init_test(self):
         self.op_type = "lookup_table_v2"
         self.ids_shape = (2, 4, 5)
@@ -43,6 +45,7 @@ def init_test(self):
 
 class TestLookupTableV2BF16OpWIsSelectedRows(
         TestLookupTableBF16OpWIsSelectedRows):
+
     def init_test(self):
         self.op_type = "lookup_table_v2"
         self.ids_shape = (10)
@@ -50,12 +53,14 @@ def init_test(self):
 
 class TestLookupTableV2BF16OpWIsSelectedRows4DIds(
         TestLookupTableBF16OpWIsSelectedRows4DIds):
+
     def init_test(self):
         self.op_type = "lookup_table_v2"
         self.ids_shape = (3, 4, 5)
 
 
 class TestLookupTableBF16OpWithPadding(TestLookupTableV2BF16Op):
+
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
@@ -65,6 +70,7 @@ def test_check_output(self):
 
 
 class TestLookupTableBF16OpIds4DPadding(TestLookupTableV2BF16OpIds4D):
+
     def test_check_output(self):
         ids = self.inputs['Ids']
         flatten_idx = ids.flatten()
@@ -86,8 +92,8 @@ def setUp(self):
         self.op_type = "lookup_table_v2"
         self.ids_shape = [4]
         self.w_shape = [10, 64]
-        self.ids = np.random.randint(
-            low=0, high=9, size=self.ids_shape).astype("int64")
+        self.ids = np.random.randint(low=0, high=9,
+                                     size=self.ids_shape).astype("int64")
         self.flat_ids = self.ids.flatten()
         self.value = 3.0
         self.w_fp32 = np.full(self.w_shape, self.value)
@@ -98,13 +104,13 @@ def setUp(self):
 
         with fluid.program_guard(self.prog, self.startup_prog):
             x = fluid.layers.data(name='x', shape=self.ids_shape, dtype='int64')
-            self.emb = fluid.input.embedding(
-                input=x,
-                size=self.w_shape,
-                param_attr=fluid.ParamAttr(
-                    name="emb_weight", initializer=self.initializer),
-                is_sparse=False,
-                dtype="uint16")  # bfloat16
+            self.emb = fluid.input.embedding(input=x,
+                                             size=self.w_shape,
+                                             param_attr=fluid.ParamAttr(
+                                                 name="emb_weight",
+                                                 initializer=self.initializer),
+                                             is_sparse=False,
+                                             dtype="uint16")  # bfloat16
         exe = fluid.Executor(self.place)
         exe.run(self.startup_prog)
         self.result = exe.run(self.prog,
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py
index 21844c9e402ad..eed0530e76113 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py
@@ -27,6 +27,7 @@
 
 
 class TestStaticGraphSupportMultipleInt(unittest.TestCase):
+
     def test_main(self):
         dtypes = ['uint8', 'int8', 'int16', 'int32', 'int64']
         if paddle.in_dynamic_mode():
@@ -46,6 +47,7 @@ def test_main(self):
 
 
 class TestLookupTableOp(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table_v2"
         self.python_api = paddle.nn.functional.embedding
@@ -65,21 +67,25 @@ def test_check_grad(self):
 
 
 class TestLookupTableOpInt16(OpTest):
+
     def id_dtype(self):
         return "int16"
 
 
 class TestLookupTableOpInt8(OpTest):
+
     def id_dtype(self):
         return "int8"
 
 
 class TestLookupTableOpUInt8(OpTest):
+
     def id_dtype(self):
         return "uint8"
 
 
 class TestLookupTableOpWithTensorIds(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table_v2"
         table = np.random.random((17, 31)).astype("float64")
@@ -99,6 +105,7 @@ def test_check_grad(self):
     "the gradient of paddings makes no sense and we don't "
     "test the gradient here.")
 class TestLookupTableOpWithPadding(TestLookupTableOp):
+
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
@@ -112,6 +119,7 @@ def test_check_output(self):
     "the gradient of paddings makes no sense and we don't "
     "test the gradient here.")
 class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
+
     def test_check_output(self):
         ids = self.inputs['Ids']
         flatten_idx = ids.flatten()
@@ -122,6 +130,7 @@ def test_check_output(self):
 
 
 class TestLookupTableWIsSelectedRows(unittest.TestCase):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
         ids_array = np.array([0, 4, 3, 5]).astype("int32")
@@ -174,12 +183,13 @@ def test_w_is_selected_rows(self):
             self.check_with_place(place)
 
 
-class TestLookupTableWithTensorIdsWIsSelectedRows(
-        TestLookupTableWIsSelectedRows):
+class TestLookupTableWithTensorIdsWIsSelectedRows(TestLookupTableWIsSelectedRows
+                                                  ):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.random.randint(
-            low=0, high=6, size=(2, 4, 3)).astype("int64")
+        ids_array = np.random.randint(low=0, high=6,
+                                      size=(2, 4, 3)).astype("int64")
         ids_tensor.set(ids_array, place)
         return ids_array
 
@@ -189,6 +199,7 @@ def check_result(self, ids_array, result_array):
 
 
 class TestLookupTableIsSparse(unittest.TestCase):
+
     def init_data(self):
         self.x_data = np.array([[1, 3, 0, 4, 7]]).astype("int64")
         self.y_data = np.array([[0.1, 0.3, 0, 0.4, 0.7]]).astype("float32")
@@ -219,8 +230,10 @@ def get_w_grad(self, is_sparse):
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
-            ret = exe.run(feed={'x': self.x_data,
-                                'y_': self.y_data},
+            ret = exe.run(feed={
+                'x': self.x_data,
+                'y_': self.y_data
+            },
                           fetch_list=['emb_weight'],
                           return_numpy=False)
             return np.array(ret[0])
@@ -232,11 +245,14 @@ def test_w_grad(self):
         self.check_grad(w_grad, w_grad_with_sparse)
 
     def check_grad(self, w_grad1, w_grad2, tolerance=1e-6):
-        np.testing.assert_allclose(
-            w_grad1, w_grad2, rtol=tolerance, atol=tolerance)
+        np.testing.assert_allclose(w_grad1,
+                                   w_grad2,
+                                   rtol=tolerance,
+                                   atol=tolerance)
 
 
 class TestLookupTableApi(unittest.TestCase):
+
     def test_api(self):
         x = fluid.layers.data(name='x', shape=[20], dtype='int64')
         emb = fluid.embedding(input=x, size=[128, 64])
@@ -246,12 +262,15 @@ def test_api(self):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x_data, },
+        ret = exe.run(feed={
+            'x': x_data,
+        },
                       fetch_list=[emb],
                       return_numpy=False)
 
 
 class TestEmbedOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.randint(0, 10, (4, 6)).astype("int64")
diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
index 65a5b3506b7df..def22575eea91 100644
--- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
@@ -28,6 +28,7 @@
 
 def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
                          var_list):
+
     def is_better(current, best, m, n):
         if m == 'min' and n == 'rel':
             return current < best - best * threshold
@@ -57,6 +58,7 @@ def is_better(current, best, m, n):
 
 
 class TestReduceOnPlateauDecay(object):
+
     def test_ReduceLR(self):
         # the decay rate must be less than 1.0
         with self.assertRaises(ValueError):
@@ -66,8 +68,8 @@ def test_ReduceLR(self):
             paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, mode="test")
         # the threshold_mode must be "rel" or "abs"
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.ReduceOnPlateau(
-                learning_rate=1.0, threshold_mode="test")
+            paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0,
+                                                threshold_mode="test")
         with self.assertRaises(TypeError):
             paddle.optimizer.lr.ReduceOnPlateau(learning_rate="test")
         with self.assertRaises(TypeError):
@@ -110,8 +112,10 @@ def _test_static(self, place, kwargs):
         main_prog = paddle.static.Program()
         start_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, start_prog):
-            x = fluid.layers.create_global_var(
-                [1], 1, 'float32', persistable=True)
+            x = fluid.layers.create_global_var([1],
+                                               1,
+                                               'float32',
+                                               persistable=True)
             paddle.increment(x)
             loss = paddle.sin(x)
             scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
@@ -159,8 +163,8 @@ def _test_dygraph(self, place, kwargs):
 
         linear = paddle.nn.Linear(10, 10)
         scheduler = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
-        adam = paddle.optimizer.Adam(
-            learning_rate=scheduler, parameters=linear.parameters())
+        adam = paddle.optimizer.Adam(learning_rate=scheduler,
+                                     parameters=linear.parameters())
 
         for epoch in range(20):
             for batch_id in range(1):
@@ -181,8 +185,8 @@ def _test_dygraph(self, place, kwargs):
             self.assertEqual(current_lr, expected_lr)
         state_dict = adam.state_dict()
         scheduler1 = paddle.optimizer.lr.ReduceOnPlateau(**kwargs)
-        adam1 = paddle.optimizer.Adam(
-            learning_rate=scheduler1, parameters=linear.parameters())
+        adam1 = paddle.optimizer.Adam(learning_rate=scheduler1,
+                                      parameters=linear.parameters())
         adam1.set_state_dict(state_dict)
         self.assertEqual(scheduler.cooldown_counter,
                          scheduler1.cooldown_counter)
@@ -254,8 +258,8 @@ def get_lr(self):
         if self.last_epoch == 0:
             return self.base_lr
         elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
-            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
-                math.pi / self.T_max)) / 2
+            return self.last_lr + (self.base_lr - self.eta_min) * (
+                1 - math.cos(math.pi / self.T_max)) / 2
 
         return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
             1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
@@ -275,13 +279,13 @@ def cosine_annealing_lr(epoch_num,
         cosine_annealing_lr_current = learning_rate
     elif (epoch_num - 1 - T_max) % (2 * T_max) == 0:
         cosine_annealing_lr_current = cosine_annealing_lr_current + (
-            learning_rate - eta_min) * (1 - math.cos(math.pi / float(T_max))
-                                        ) / 2
+            learning_rate - eta_min) * (1 -
+                                        math.cos(math.pi / float(T_max))) / 2
     else:
-        cosine_annealing_lr_current = (1 + math.cos(
-            math.pi * epoch_num / float(T_max))) / (1 + math.cos(math.pi * (
-                epoch_num - 1) / float(T_max))) * (cosine_annealing_lr_current -
-                                                   eta_min) + eta_min
+        cosine_annealing_lr_current = (
+            1 + math.cos(math.pi * epoch_num / float(T_max))) / (
+                1 + math.cos(math.pi * (epoch_num - 1) / float(T_max))) * (
+                    cosine_annealing_lr_current - eta_min) + eta_min
     return cosine_annealing_lr_current
 
 
@@ -433,6 +437,7 @@ def exp_range(x):
 
 
 class TestLRScheduler(unittest.TestCase):
+
     def _test_static(self, python_func, paddle_api, kwarg, place):
         scheduler = paddle_api(**kwarg)
         adam = paddle.optimizer.Adam(learning_rate=scheduler)
@@ -473,17 +478,15 @@ def _test_static(self, python_func, paddle_api, kwarg, place):
 
         if isinstance(place, paddle.CPUPlace):
             compiled_train_prog = paddle.static.CompiledProgram(
-                main_prog).with_data_parallel(
-                    loss_name=loss.name, places=fluid.cpu_places(4))
+                main_prog).with_data_parallel(loss_name=loss.name,
+                                              places=fluid.cpu_places(4))
             for epoch in range(5):
                 python_result = python_func(num, **kwarg)
                 for batch_id in range(2):
-                    _ = exe.run(compiled_train_prog,
-                                feed={
-                                    'x':
-                                    np.random.randn(12, 4, 5).astype('float32')
-                                },
-                                fetch_list=lr_var.name)
+                    _ = exe.run(
+                        compiled_train_prog,
+                        feed={'x': np.random.randn(12, 4, 5).astype('float32')},
+                        fetch_list=lr_var.name)
                 scopes = compiled_train_prog._executor.local_scopes()
                 out = np.array(scopes[0].var(lr_var.name).get_tensor())
                 self.assertEqual(out, np.array(python_result))
@@ -504,12 +507,10 @@ def _test_static(self, python_func, paddle_api, kwarg, place):
             for epoch in range(5):
                 python_result = python_func(num, **kwarg)
                 for batch_id in range(2):
-                    _ = exe.run(compiled_test_prog,
-                                feed={
-                                    'x':
-                                    np.random.randn(12, 4, 5).astype('float32')
-                                },
-                                fetch_list=lr_var.name)
+                    _ = exe.run(
+                        compiled_test_prog,
+                        feed={'x': np.random.randn(12, 4, 5).astype('float32')},
+                        fetch_list=lr_var.name)
                 scopes = compiled_test_prog._executor.local_scopes()
                 out = np.array(scopes[0].var(lr_var.name).get_tensor())
                 self.assertEqual(out, np.array(python_result))
@@ -530,8 +531,8 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place):
             kwarg['learning_rate'] = paddle.optimizer.lr.PiecewiseDecay(
                 [3, 6], [0.5, 0.2, 0.1])
         scheduler = paddle_api(**kwarg)
-        adam = paddle.optimizer.Adam(
-            learning_rate=scheduler, parameters=linear.parameters())
+        adam = paddle.optimizer.Adam(learning_rate=scheduler,
+                                     parameters=linear.parameters())
         for epoch in range(20):
             for batch_id in range(2):
                 x = paddle.to_tensor(x)
@@ -549,8 +550,8 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place):
                 self.assertAlmostEqual(current_lr, expected_lr)
                 state_dict = adam.state_dict()
                 scheduler1 = paddle.optimizer.lr.LinearWarmup(**kwarg)
-                adam1 = paddle.optimizer.Adam(
-                    learning_rate=scheduler1, parameters=linear.parameters())
+                adam1 = paddle.optimizer.Adam(learning_rate=scheduler1,
+                                              parameters=linear.parameters())
                 adam1.set_state_dict(state_dict)
                 self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
                 self.assertEqual(scheduler.last_lr, scheduler1.last_lr)
@@ -567,245 +568,267 @@ def test_scheduler(self):
         with self.assertRaises(NotImplementedError):
             paddle.optimizer.lr.LRScheduler().step()
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.MultiStepDecay(
-                learning_rate="test", milestones=[1, 2, 3])
+            paddle.optimizer.lr.MultiStepDecay(learning_rate="test",
+                                               milestones=[1, 2, 3])
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.MultiStepDecay(
-                learning_rate=0.5, milestones='test')
+            paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5,
+                                               milestones='test')
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.MultiStepDecay(
-                learning_rate=0.5, milestones=[3, 2, 1])
+            paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5,
+                                               milestones=[3, 2, 1])
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.MultiStepDecay(
-                learning_rate=0.5, milestones=[1, 2, 3], gamma=2)
+            paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5,
+                                               milestones=[1, 2, 3],
+                                               gamma=2)
         # check type of max_learning_rate
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate='test', total_steps=20)
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate='test',
+                                           total_steps=20)
         # check value of max_learning_rate
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate=-1.5, total_steps=20)
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate=-1.5,
+                                           total_steps=20)
         # check type of end_learning_rate
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate=0.1, total_steps=20, end_learning_rate='test')
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1,
+                                           total_steps=20,
+                                           end_learning_rate='test')
         # check value of end_learning_rate
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate=0.1, total_steps=20, end_learning_rate=-1)
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1,
+                                           total_steps=20,
+                                           end_learning_rate=-1)
         # check type of total_steps
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate=0.1, total_steps='test')
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1,
+                                           total_steps='test')
         # check value of total_steps
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate=0.1, total_steps=-10)
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1,
+                                           total_steps=-10)
         # check value of anneal_strategy
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate=0.1, total_steps=20, anneal_strategy='test')
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1,
+                                           total_steps=20,
+                                           anneal_strategy='test')
         # check value of phase_pct when three_phase is True
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.OneCycleLR(
-                max_learning_rate=0.1,
-                total_steps=20,
-                phase_pct=0.6,
-                three_phase=True)
+            paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1,
+                                           total_steps=20,
+                                           phase_pct=0.6,
+                                           three_phase=True)
         # check type of max_learning_rate
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5,
-                max_learning_rate='test',
-                step_size_up=10)
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate='test',
+                                         step_size_up=10)
         # check value of max_learning_rate
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5, max_learning_rate=-1, step_size_up=10)
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate=-1,
+                                         step_size_up=10)
         # check type of step_size_up
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5,
-                max_learning_rate=1.0,
-                step_size_up='test')
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate=1.0,
+                                         step_size_up='test')
         # check value of step_size_up
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5, max_learning_rate=1.0, step_size_up=-1)
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate=1.0,
+                                         step_size_up=-1)
         # check type of step_size_down
         with self.assertRaises(TypeError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5,
-                max_learning_rate=1.0,
-                step_size_up=500,
-                step_size_down='test')
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate=1.0,
+                                         step_size_up=500,
+                                         step_size_down='test')
         # check type of step_size_down
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5,
-                max_learning_rate=1.0,
-                step_size_up=500,
-                step_size_down=-1)
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate=1.0,
+                                         step_size_up=500,
+                                         step_size_down=-1)
         # check value of mode
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5,
-                max_learning_rate=1.0,
-                step_size_up=500,
-                step_size_down=500,
-                mode='test')
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate=1.0,
+                                         step_size_up=500,
+                                         step_size_down=500,
+                                         mode='test')
         # check type value of scale_mode
         with self.assertRaises(ValueError):
-            paddle.optimizer.lr.CyclicLR(
-                base_learning_rate=0.5,
-                max_learning_rate=1.0,
-                step_size_up=500,
-                step_size_down=-1,
-                scale_mode='test')
-
-        func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, {
-            "d_model": 0.01,
-            "warmup_steps": 100,
-            "verbose": False
-        }), (piecewise_lr, paddle.optimizer.lr.PiecewiseDecay, {
-            "boundaries": [3, 6, 9, 15, 20],
-            "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
-            "verbose": False
-        }), (natural_exp_lr, paddle.optimizer.lr.NaturalExpDecay, {
-            "learning_rate": 0.5,
-            "gamma": 0.1,
-            "verbose": True
-        }), (inverse_time_lr, paddle.optimizer.lr.InverseTimeDecay, {
-            "learning_rate": 0.5,
-            "gamma": 0.1,
-            "verbose": False
-        }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
-            "learning_rate": 0.5,
-            "decay_steps": 20,
-            "end_lr": 0,
-            "power": 1.0,
-            "cycle": False
-        }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
-            "learning_rate": 0.5,
-            "decay_steps": 20,
-            "end_lr": 0,
-            "power": 1.0,
-            "cycle": True,
-            "verbose": False
-        }), (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, {
-            'learning_rate': 0.5,
-            'warmup_steps': 10,
-            'start_lr': 0,
-            'end_lr': 0.5
-        }), (exponential_lr, paddle.optimizer.lr.ExponentialDecay, {
-            "learning_rate": 0.5,
-            "gamma": 0.9,
-            "verbose": False
-        }), (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, {
-            "learning_rate": 0.5,
-            "milestones": [3, 6, 9, 15, 20],
-            "gamma": 0.8
-        }), (step_lr, paddle.optimizer.lr.StepDecay, {
-            "learning_rate": 0.5,
-            "step_size": 2,
-            "gamma": 0.8,
-            "verbose": False
-        }), (lambda_lr, paddle.optimizer.lr.LambdaDecay, {
-            "learning_rate": 0.5,
-            "lr_lambda": lambda x: 0.95**x,
-            "verbose": True
-        }), (multiplicative_lr, paddle.optimizer.lr.MultiplicativeDecay, {
-            "learning_rate": 0.5,
-            "lr_lambda": lambda x: 0.95,
-            "verbose": True
-        }), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, {
-            "learning_rate": 0.5,
-            "T_max": 10,
-            "verbose": False
-        }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
-            "max_learning_rate": 0.1,
-            "total_steps": 20,
-            "divide_factor": 5,
-            "end_learning_rate": 0.0001,
-            "anneal_strategy": 'cos',
-            "phase_pct": 0.3,
-            "three_phase": False,
-        }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
-            "max_learning_rate": 0.5,
-            "total_steps": 20,
-            "divide_factor": 10,
-            "end_learning_rate": 0.001,
-            "anneal_strategy": 'linear',
-            "phase_pct": 0.4,
-            "three_phase": False,
-        }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
-            "max_learning_rate": 1.0,
-            "total_steps": 20,
-            "divide_factor": 9,
-            "end_learning_rate": 0.0001,
-            "anneal_strategy": 'cos',
-            "phase_pct": 0.3,
-            "three_phase": True,
-        }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
-            "max_learning_rate": 0.3,
-            "total_steps": 20,
-            "divide_factor": 25,
-            "end_learning_rate": 0.0005,
-            "anneal_strategy": 'linear',
-            "phase_pct": 0.2,
-            "three_phase": True,
-        }), (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
-            "base_learning_rate": 0.5,
-            "max_learning_rate": 1.0,
-            "step_size_up": 15,
-            "step_size_down": 5,
-            "mode": 'triangular',
-            "exp_gamma": 1.,
-            "scale_fn": None,
-            "scale_mode": 'cycle',
-            "verbose": False
-        }), (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
-            "base_learning_rate": 0.5,
-            "max_learning_rate": 1.0,
-            "step_size_up": 15,
-            "step_size_down": 5,
-            "mode": 'triangular2',
-            "exp_gamma": 1.,
-            "scale_fn": None,
-            "scale_mode": 'cycle',
-            "verbose": False
-        }), (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
-            "base_learning_rate": 0.5,
-            "max_learning_rate": 1.0,
-            "step_size_up": 15,
-            "step_size_down": 5,
-            "mode": 'exp_range',
-            "exp_gamma": 0.8,
-            "scale_fn": None,
-            "scale_mode": 'cycle',
-            "verbose": False
-        }), (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
-            "base_learning_rate": 0.5,
-            "max_learning_rate": 1.0,
-            "step_size_up": 15,
-            "step_size_down": 5,
-            "mode": 'exp_range',
-            "exp_gamma": 1.,
-            "scale_fn": lambda x: 0.95**x,
-            "scale_mode": 'cycle',
-            "verbose": False
-        }), (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
-            "base_learning_rate": 0.5,
-            "max_learning_rate": 1.0,
-            "step_size_up": 15,
-            "step_size_down": 5,
-            "mode": 'exp_range',
-            "exp_gamma": 1.,
-            "scale_fn": lambda x: 0.95,
-            "scale_mode": 'iterations',
-            "verbose": False
-        })]
+            paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
+                                         max_learning_rate=1.0,
+                                         step_size_up=500,
+                                         step_size_down=-1,
+                                         scale_mode='test')
+
+        func_api_kwargs = [
+            (noam_lr, paddle.optimizer.lr.NoamDecay, {
+                "d_model": 0.01,
+                "warmup_steps": 100,
+                "verbose": False
+            }),
+            (piecewise_lr, paddle.optimizer.lr.PiecewiseDecay, {
+                "boundaries": [3, 6, 9, 15, 20],
+                "values": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+                "verbose": False
+            }),
+            (natural_exp_lr, paddle.optimizer.lr.NaturalExpDecay, {
+                "learning_rate": 0.5,
+                "gamma": 0.1,
+                "verbose": True
+            }),
+            (inverse_time_lr, paddle.optimizer.lr.InverseTimeDecay, {
+                "learning_rate": 0.5,
+                "gamma": 0.1,
+                "verbose": False
+            }),
+            (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
+                "learning_rate": 0.5,
+                "decay_steps": 20,
+                "end_lr": 0,
+                "power": 1.0,
+                "cycle": False
+            }),
+            (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, {
+                "learning_rate": 0.5,
+                "decay_steps": 20,
+                "end_lr": 0,
+                "power": 1.0,
+                "cycle": True,
+                "verbose": False
+            }),
+            (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, {
+                'learning_rate': 0.5,
+                'warmup_steps': 10,
+                'start_lr': 0,
+                'end_lr': 0.5
+            }),
+            (exponential_lr, paddle.optimizer.lr.ExponentialDecay, {
+                "learning_rate": 0.5,
+                "gamma": 0.9,
+                "verbose": False
+            }),
+            (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, {
+                "learning_rate": 0.5,
+                "milestones": [3, 6, 9, 15, 20],
+                "gamma": 0.8
+            }),
+            (step_lr, paddle.optimizer.lr.StepDecay, {
+                "learning_rate": 0.5,
+                "step_size": 2,
+                "gamma": 0.8,
+                "verbose": False
+            }),
+            (lambda_lr, paddle.optimizer.lr.LambdaDecay, {
+                "learning_rate": 0.5,
+                "lr_lambda": lambda x: 0.95**x,
+                "verbose": True
+            }),
+            (multiplicative_lr, paddle.optimizer.lr.MultiplicativeDecay, {
+                "learning_rate": 0.5,
+                "lr_lambda": lambda x: 0.95,
+                "verbose": True
+            }),
+            (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, {
+                "learning_rate": 0.5,
+                "T_max": 10,
+                "verbose": False
+            }),
+            (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
+                "max_learning_rate": 0.1,
+                "total_steps": 20,
+                "divide_factor": 5,
+                "end_learning_rate": 0.0001,
+                "anneal_strategy": 'cos',
+                "phase_pct": 0.3,
+                "three_phase": False,
+            }),
+            (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
+                "max_learning_rate": 0.5,
+                "total_steps": 20,
+                "divide_factor": 10,
+                "end_learning_rate": 0.001,
+                "anneal_strategy": 'linear',
+                "phase_pct": 0.4,
+                "three_phase": False,
+            }),
+            (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
+                "max_learning_rate": 1.0,
+                "total_steps": 20,
+                "divide_factor": 9,
+                "end_learning_rate": 0.0001,
+                "anneal_strategy": 'cos',
+                "phase_pct": 0.3,
+                "three_phase": True,
+            }),
+            (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, {
+                "max_learning_rate": 0.3,
+                "total_steps": 20,
+                "divide_factor": 25,
+                "end_learning_rate": 0.0005,
+                "anneal_strategy": 'linear',
+                "phase_pct": 0.2,
+                "three_phase": True,
+            }),
+            (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
+                "base_learning_rate": 0.5,
+                "max_learning_rate": 1.0,
+                "step_size_up": 15,
+                "step_size_down": 5,
+                "mode": 'triangular',
+                "exp_gamma": 1.,
+                "scale_fn": None,
+                "scale_mode": 'cycle',
+                "verbose": False
+            }),
+            (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
+                "base_learning_rate": 0.5,
+                "max_learning_rate": 1.0,
+                "step_size_up": 15,
+                "step_size_down": 5,
+                "mode": 'triangular2',
+                "exp_gamma": 1.,
+                "scale_fn": None,
+                "scale_mode": 'cycle',
+                "verbose": False
+            }),
+            (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
+                "base_learning_rate": 0.5,
+                "max_learning_rate": 1.0,
+                "step_size_up": 15,
+                "step_size_down": 5,
+                "mode": 'exp_range',
+                "exp_gamma": 0.8,
+                "scale_fn": None,
+                "scale_mode": 'cycle',
+                "verbose": False
+            }),
+            (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
+                "base_learning_rate": 0.5,
+                "max_learning_rate": 1.0,
+                "step_size_up": 15,
+                "step_size_down": 5,
+                "mode": 'exp_range',
+                "exp_gamma": 1.,
+                "scale_fn": lambda x: 0.95**x,
+                "scale_mode": 'cycle',
+                "verbose": False
+            }),
+            (cyclic_lr, paddle.optimizer.lr.CyclicLR, {
+                "base_learning_rate": 0.5,
+                "max_learning_rate": 1.0,
+                "step_size_up": 15,
+                "step_size_down": 5,
+                "mode": 'exp_range',
+                "exp_gamma": 1.,
+                "scale_fn": lambda x: 0.95,
+                "scale_mode": 'iterations',
+                "verbose": False
+            })
+        ]
 
         for python_func, paddle_api, kwarg in func_api_kwargs:
             places = [paddle.CPUPlace()]
@@ -820,8 +843,8 @@ def test_scheduler(self):
                 paddle.enable_static()
 
     def test_linear_warmp(self):
-        natural_lr = paddle.optimizer.lr.NaturalExpDecay(
-            learning_rate=0.5, gamma=0.1)
+        natural_lr = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5,
+                                                         gamma=0.1)
         natural_lr_warmup = paddle.optimizer.lr.LinearWarmup(
             learning_rate=natural_lr, warmup_steps=10, start_lr=0.0, end_lr=0.1)
         for idx in range(30):
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py
index 4589f84deb3f4..1f8f9c62f0bce 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
@@ -24,6 +24,7 @@
 
 
 class TestLRNOp(OpTest):
+
     def get_input(self):
         r''' TODO(gongweibao): why it's grad diff is so large?
         x = np.ndarray(
@@ -104,11 +105,13 @@ def test_check_grad_normal(self):
 
 
 class TestLRNOpAttrDataFormat(TestLRNOp):
+
     def init_test_case(self):
         self.data_format = 'NHWC'
 
 
 class TestLRNAPI(unittest.TestCase):
+
     def test_case(self):
         data1 = fluid.data(name='data1', shape=[2, 4, 5, 5], dtype='float32')
         data2 = fluid.data(name='data2', shape=[2, 5, 5, 4], dtype='float32')
@@ -124,8 +127,10 @@ def test_case(self):
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
         results = exe.run(fluid.default_main_program(),
-                          feed={"data1": data1_np,
-                                "data2": data2_np},
+                          feed={
+                              "data1": data1_np,
+                              "data2": data2_np
+                          },
                           fetch_list=[out1, out2],
                           return_numpy=True)
 
@@ -134,8 +139,9 @@ def test_case(self):
 
     def test_exception(self):
         input1 = fluid.data(name="input1", shape=[2, 4, 5, 5], dtype="float32")
-        input2 = fluid.data(
-            name="input2", shape=[2, 4, 5, 5, 5], dtype="float32")
+        input2 = fluid.data(name="input2",
+                            shape=[2, 4, 5, 5, 5],
+                            dtype="float32")
 
         def _attr_data_fromat():
             out = fluid.layers.lrn(input1, data_format='NDHW')
@@ -148,6 +154,7 @@ def _input_dim_size():
 
 
 class TestLRNOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input must be float32
@@ -156,6 +163,7 @@ def test_errors(self):
 
 
 class TestLocalResponseNormFAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -167,18 +175,24 @@ def check_static_3d_input(self, place):
             in_np1 = np.random.random([3, 40, 40]).astype("float32")
             in_np2 = np.transpose(in_np1, (0, 2, 1))
 
-            input1 = fluid.data(
-                name="input1", shape=[3, 40, 40], dtype="float32")
-            input2 = fluid.data(
-                name="input2", shape=[3, 40, 40], dtype="float32")
-            res1 = paddle.nn.functional.local_response_norm(
-                x=input1, size=5, data_format='NCL')
-            res2 = paddle.nn.functional.local_response_norm(
-                x=input2, size=5, data_format='NLC')
+            input1 = fluid.data(name="input1",
+                                shape=[3, 40, 40],
+                                dtype="float32")
+            input2 = fluid.data(name="input2",
+                                shape=[3, 40, 40],
+                                dtype="float32")
+            res1 = paddle.nn.functional.local_response_norm(x=input1,
+                                                            size=5,
+                                                            data_format='NCL')
+            res2 = paddle.nn.functional.local_response_norm(x=input2,
+                                                            size=5,
+                                                            data_format='NLC')
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"input1": in_np1,
-                                    "input2": in_np2},
+                              feed={
+                                  "input1": in_np1,
+                                  "input2": in_np2
+                              },
                               fetch_list=[res1, res2])
 
             fetches1_tran = np.transpose(fetches[1], (0, 2, 1))
@@ -186,23 +200,29 @@ def check_static_3d_input(self, place):
 
     def check_static_4d_input(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input1 = fluid.data(
-                name="input1", shape=[3, 3, 40, 40], dtype="float32")
-            input2 = fluid.data(
-                name="input2", shape=[3, 40, 40, 3], dtype="float32")
-
-            res1 = paddle.nn.functional.local_response_norm(
-                x=input1, size=5, data_format='NCHW')
-            res2 = paddle.nn.functional.local_response_norm(
-                x=input2, size=5, data_format='NHWC')
+            input1 = fluid.data(name="input1",
+                                shape=[3, 3, 40, 40],
+                                dtype="float32")
+            input2 = fluid.data(name="input2",
+                                shape=[3, 40, 40, 3],
+                                dtype="float32")
+
+            res1 = paddle.nn.functional.local_response_norm(x=input1,
+                                                            size=5,
+                                                            data_format='NCHW')
+            res2 = paddle.nn.functional.local_response_norm(x=input2,
+                                                            size=5,
+                                                            data_format='NHWC')
 
             in_np1 = np.random.random([3, 3, 40, 40]).astype("float32")
             in_np2 = np.transpose(in_np1, (0, 2, 3, 1))
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"input1": in_np1,
-                                    "input2": in_np2},
+                              feed={
+                                  "input1": in_np1,
+                                  "input2": in_np2
+                              },
                               fetch_list=[res1, res2])
 
             fetches1_tran = np.transpose(fetches[1], (0, 3, 1, 2))
@@ -210,22 +230,28 @@ def check_static_4d_input(self, place):
 
     def check_static_5d_input(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input1 = fluid.data(
-                name="input1", shape=[3, 3, 3, 40, 40], dtype="float32")
-            input2 = fluid.data(
-                name="input2", shape=[3, 3, 40, 40, 3], dtype="float32")
-            res1 = paddle.nn.functional.local_response_norm(
-                x=input1, size=5, data_format='NCDHW')
-            res2 = paddle.nn.functional.local_response_norm(
-                x=input2, size=5, data_format='NDHWC')
+            input1 = fluid.data(name="input1",
+                                shape=[3, 3, 3, 40, 40],
+                                dtype="float32")
+            input2 = fluid.data(name="input2",
+                                shape=[3, 3, 40, 40, 3],
+                                dtype="float32")
+            res1 = paddle.nn.functional.local_response_norm(x=input1,
+                                                            size=5,
+                                                            data_format='NCDHW')
+            res2 = paddle.nn.functional.local_response_norm(x=input2,
+                                                            size=5,
+                                                            data_format='NDHWC')
 
             in_np1 = np.random.random([3, 3, 3, 40, 40]).astype("float32")
             in_np2 = np.transpose(in_np1, (0, 2, 3, 4, 1))
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"input1": in_np1,
-                                    "input2": in_np2},
+                              feed={
+                                  "input1": in_np1,
+                                  "input2": in_np2
+                              },
                               fetch_list=[res1, res2])
 
             fetches1_tran = np.transpose(fetches[1], (0, 4, 1, 2, 3))
@@ -245,10 +271,12 @@ def check_dygraph_3d_input(self, place):
             in1 = paddle.to_tensor(in_np1)
             in2 = paddle.to_tensor(in_np2)
 
-            res1 = paddle.nn.functional.local_response_norm(
-                x=in1, size=5, data_format='NCL')
-            res2 = paddle.nn.functional.local_response_norm(
-                x=in2, size=5, data_format='NLC')
+            res1 = paddle.nn.functional.local_response_norm(x=in1,
+                                                            size=5,
+                                                            data_format='NCL')
+            res2 = paddle.nn.functional.local_response_norm(x=in2,
+                                                            size=5,
+                                                            data_format='NLC')
 
             res2_tran = np.transpose(res2.numpy(), (0, 2, 1))
             self.assertTrue(np.allclose(res1.numpy(), res2_tran))
@@ -261,10 +289,12 @@ def check_dygraph_4d_input(self, place):
             in1 = paddle.to_tensor(in_np1)
             in2 = paddle.to_tensor(in_np2)
 
-            res1 = paddle.nn.functional.local_response_norm(
-                x=in1, size=5, data_format='NCHW')
-            res2 = paddle.nn.functional.local_response_norm(
-                x=in2, size=5, data_format='NHWC')
+            res1 = paddle.nn.functional.local_response_norm(x=in1,
+                                                            size=5,
+                                                            data_format='NCHW')
+            res2 = paddle.nn.functional.local_response_norm(x=in2,
+                                                            size=5,
+                                                            data_format='NHWC')
 
             res2_tran = np.transpose(res2.numpy(), (0, 3, 1, 2))
             self.assertTrue(np.allclose(res1.numpy(), res2_tran))
@@ -277,10 +307,12 @@ def check_dygraph_5d_input(self, place):
             in1 = paddle.to_tensor(in_np1)
             in2 = paddle.to_tensor(in_np2)
 
-            res1 = paddle.nn.functional.local_response_norm(
-                x=in1, size=5, data_format='NCDHW')
-            res2 = paddle.nn.functional.local_response_norm(
-                x=in2, size=5, data_format='NDHWC')
+            res1 = paddle.nn.functional.local_response_norm(x=in1,
+                                                            size=5,
+                                                            data_format='NCDHW')
+            res2 = paddle.nn.functional.local_response_norm(x=in2,
+                                                            size=5,
+                                                            data_format='NDHWC')
 
             res2_tran = np.transpose(res2.numpy(), (0, 4, 1, 2, 3))
             self.assertTrue(np.allclose(res1.numpy(), res2_tran))
@@ -293,13 +325,14 @@ def test_dygraph(self):
 
 
 class TestLocalResponseNormFAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 # the input of lrn must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.CPUPlace())
                 paddle.nn.functional.local_response_norm(x1, size=5)
 
             self.assertRaises(TypeError, test_Variable)
@@ -312,8 +345,9 @@ def test_datatype():
 
             def test_dataformat():
                 x = fluid.data(name='x', shape=[3, 4, 5, 6], dtype="float32")
-                paddle.nn.functional.local_response_norm(
-                    x, size=5, data_format="NCTHW")
+                paddle.nn.functional.local_response_norm(x,
+                                                         size=5,
+                                                         data_format="NCTHW")
 
             self.assertRaises(ValueError, test_dataformat)
 
@@ -331,6 +365,7 @@ def test_shape():
 
 
 class TestLocalResponseNormCAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index 372b8d0d4d276..cdde705475e89 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -24,6 +24,7 @@
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import random
+
 random.seed(2)
 np.set_printoptions(threshold=np.inf)
 paddle.enable_static()
@@ -34,6 +35,7 @@
 
 
 class RandomWeight:
+
     def __init__(self):
         pass
 
@@ -43,27 +45,34 @@ def updata_weight(self, hidden_size, input_size, dtype):
         self.input_size = input_size
         self.dtype = dtype
 
-        self.weight_ih = np.random.uniform(
-            low=-std, high=std, size=(4 * self.hidden_size,
-                                      self.input_size)).astype(dtype)
+        self.weight_ih = np.random.uniform(low=-std,
+                                           high=std,
+                                           size=(4 * self.hidden_size,
+                                                 self.input_size)).astype(dtype)
         self.weight_hh = np.random.uniform(
-            low=-std, high=std, size=(4 * self.hidden_size,
-                                      self.hidden_size)).astype(dtype)
-        self.bias_ih = np.random.uniform(
-            low=-std, high=std, size=(4 * self.hidden_size)).astype(dtype)
-        self.bias_hh = np.random.uniform(
-            low=-std, high=std, size=(4 * self.hidden_size)).astype(dtype)
+            low=-std, high=std,
+            size=(4 * self.hidden_size, self.hidden_size)).astype(dtype)
+        self.bias_ih = np.random.uniform(low=-std,
+                                         high=std,
+                                         size=(4 *
+                                               self.hidden_size)).astype(dtype)
+        self.bias_hh = np.random.uniform(low=-std,
+                                         high=std,
+                                         size=(4 *
+                                               self.hidden_size)).astype(dtype)
 
 
 weight = RandomWeight()
 
 
 class LayerMixin(object):
+
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
 
 
 class LayerListMixin(LayerMixin):
+
     def __init__(self, layers=None):
         self._layers = list(layers) if layers else []
 
@@ -75,6 +84,7 @@ def __iter__(self):
 
 
 class LSTMCell(LayerMixin):
+
     def __init__(self, input_size, hidden_size, bias=True):
         self.input_size = input_size
         self.hidden_size = hidden_size
@@ -263,6 +273,7 @@ def concat_states(states, bidirectional=False, state_components=1):
 
 
 class RNN(LayerMixin):
+
     def __init__(self, cell, is_reverse=False, time_major=False):
         super(RNN, self).__init__()
         self.cell = cell
@@ -283,6 +294,7 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
 
 
 class BiRNN(LayerMixin):
+
     def __init__(self, cell_fw, cell_bw, time_major=False):
         super(BiRNN, self).__init__()
         self.cell_fw = cell_fw
@@ -307,6 +319,7 @@ def forward(self,
 
 
 class RNNMixin(LayerListMixin):
+
     def forward(self, inputs, initial_states=None, sequence_length=None):
         batch_index = 1 if self.time_major else 0
         batch_size = inputs.shape[batch_index]
@@ -339,6 +352,7 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
 
 
 class LSTM(RNNMixin):
+
     def __init__(self,
                  input_size,
                  hidden_size,
@@ -380,6 +394,7 @@ def __init__(self,
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNLstmOp(OpTest):
+
     def get_weight_names(self):
         weight_names = []
         for i in range(2 * self.num_layers):
@@ -392,8 +407,7 @@ def setUp(self):
         self.op_type = "cudnn_lstm"
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.sequence_length = None if core.is_compiled_with_rocm(
-        ) else np.array(
-            [12, 11, 10, 9, 8], dtype=np.int32)
+        ) else np.array([12, 11, 10, 9, 8], dtype=np.int32)
         self.num_layers = 1
         self.set_attrs()
 
@@ -402,24 +416,24 @@ def setUp(self):
         input_size = 21
         hidden_size = 21
 
-        input = np.random.uniform(
-            low=-0.1, high=0.1,
-            size=(seq_length, batch_size, input_size)).astype(self.dtype)
+        input = np.random.uniform(low=-0.1,
+                                  high=0.1,
+                                  size=(seq_length, batch_size,
+                                        input_size)).astype(self.dtype)
         input[11][1:][:] = 0
         input[10][2:][:] = 0
         input[9][3:][:] = 0
         input[8][4:][:] = 0
 
         weight.updata_weight(hidden_size, input_size, self.dtype)
-        rnn1 = LSTM(
-            input_size,
-            hidden_size,
-            num_layers=self.num_layers,
-            time_major=True,
-            direction="forward")
+        rnn1 = LSTM(input_size,
+                    hidden_size,
+                    num_layers=self.num_layers,
+                    time_major=True,
+                    direction="forward")
 
-        output, (last_hidden, last_cell) = rnn1(
-            input, sequence_length=self.sequence_length)
+        output, (last_hidden,
+                 last_cell) = rnn1(input, sequence_length=self.sequence_length)
 
         flat_w = []
         num = 0
@@ -443,10 +457,10 @@ def setUp(self):
             bias_hh = weight.bias_hh
             flat_w.append(("bias" + str(num), bias_hh))
             num += 1
-        init_h = np.zeros((self.num_layers, batch_size,
-                           hidden_size)).astype(self.dtype)
-        init_c = np.zeros((self.num_layers, batch_size,
-                           hidden_size)).astype(self.dtype)
+        init_h = np.zeros(
+            (self.num_layers, batch_size, hidden_size)).astype(self.dtype)
+        init_c = np.zeros(
+            (self.num_layers, batch_size, hidden_size)).astype(self.dtype)
         state_out = np.ndarray((300)).astype("uint8")
 
         if core.is_compiled_with_rocm():
@@ -491,25 +505,26 @@ def set_attrs(self):
     def test_output_with_place(self):
         place = core.CUDAPlace(0)
         if core.is_compiled_with_rocm():
-            self.check_output_with_place(
-                place, atol=1e-5, no_check_set=['Reserve', 'StateOut'])
+            self.check_output_with_place(place,
+                                         atol=1e-5,
+                                         no_check_set=['Reserve', 'StateOut'])
         else:
-            self.check_output_with_place(
-                place, no_check_set=['Reserve', 'StateOut'])
+            self.check_output_with_place(place,
+                                         no_check_set=['Reserve', 'StateOut'])
 
     def test_grad_with_place(self):
         place = core.CUDAPlace(0)
         var_name_list = self.get_weight_names()
         for var_name in var_name_list:
             self.check_grad_with_place(
-                place,
-                set(['Input', var_name, 'InitH', 'InitC']),
+                place, set(['Input', var_name, 'InitH', 'InitC']),
                 ['Out', 'LastH', 'LastC'])
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNlstmAPI(unittest.TestCase):
+
     def test_lstm(self):
         seq_len = 20
         batch_size = 5
@@ -517,8 +532,9 @@ def test_lstm(self):
         dropout_prob = 0.0
         num_layers = 1
         dtype = 'float32' if core.is_compiled_with_rocm() else 'float64'
-        input = fluid.data(
-            name='input', shape=[seq_len, batch_size, hidden_size], dtype=dtype)
+        input = fluid.data(name='input',
+                           shape=[seq_len, batch_size, hidden_size],
+                           dtype=dtype)
         init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
                                       dtype, 0.0)
         init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
@@ -528,9 +544,10 @@ def test_lstm(self):
                                               dropout_prob, False)
         exe = fluid.Executor(fluid.CUDAPlace(0))
         exe.run(fluid.default_startup_program())
-        input_i = np.random.uniform(
-            low=-0.1, high=0.1, size=(seq_len, batch_size,
-                                      hidden_size)).astype("float64")
+        input_i = np.random.uniform(low=-0.1,
+                                    high=0.1,
+                                    size=(seq_len, batch_size,
+                                          hidden_size)).astype("float64")
         out = exe.run(fluid.default_main_program(),
                       feed={'input': input_i},
                       fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0'])
@@ -539,6 +556,7 @@ def test_lstm(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNlstmAPI(unittest.TestCase):
+
     def test_lstm(self):
         seq_len = 20
         batch_size = 5
@@ -546,8 +564,9 @@ def test_lstm(self):
         dropout_prob = 0.0
         num_layers = 2
         dtype = 'float32' if core.is_compiled_with_rocm() else 'float64'
-        input = fluid.data(
-            name='input', shape=[seq_len, batch_size, hidden_size], dtype=dtype)
+        input = fluid.data(name='input',
+                           shape=[seq_len, batch_size, hidden_size],
+                           dtype=dtype)
         init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
                                       dtype, 0.0)
         init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
@@ -557,9 +576,10 @@ def test_lstm(self):
                                               dropout_prob, False, True)
         exe = fluid.Executor(fluid.CUDAPlace(0))
         exe.run(fluid.default_startup_program())
-        input_i = np.random.uniform(
-            low=-0.1, high=0.1, size=(seq_len, batch_size,
-                                      hidden_size)).astype(dtype)
+        input_i = np.random.uniform(low=-0.1,
+                                    high=0.1,
+                                    size=(seq_len, batch_size,
+                                          hidden_size)).astype(dtype)
         out = exe.run(fluid.default_main_program(),
                       feed={'input': input_i},
                       fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0'])
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index fff5fef29221e..c4ddab74aae38 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -68,6 +68,7 @@ def lstm(
         act_gate=None,
         act_cell=None,
         act_cand=None):
+
     def _step(x, w_h, w_c, h_pre, c_pre, act_gate, act_cell, act_cand):
         g = np.dot(h_pre, w_h)  # 1 x 4D
         g = g + x
@@ -131,6 +132,7 @@ def _reverse(x, offset):
 
 
 class LstmUnitTestError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_size = 20
@@ -138,10 +140,9 @@ def test_errors(self):
             dropout_prob = 0.2
             hidden_size = 150
             num_layers = 1
-            input = fluid.data(
-                name='input',
-                shape=[batch_size, seq_len, hidden_size],
-                dtype='float32')
+            input = fluid.data(name='input',
+                               shape=[batch_size, seq_len, hidden_size],
+                               dtype='float32')
             pre_hidden = fill_constant([num_layers, batch_size, hidden_size],
                                        'float32', 0.0)
             pre_cell = fill_constant([num_layers, batch_size, hidden_size],
@@ -178,10 +179,9 @@ def test_pre_cell_Variable():
             self.assertRaises(TypeError, test_pre_cell_Variable)
 
             def test_input_type():
-                error_input = fluid.data(
-                    name='error_input',
-                    shape=[None, hidden_size * 3],
-                    dtype='int32')
+                error_input = fluid.data(name='error_input',
+                                         shape=[None, hidden_size * 3],
+                                         dtype='int32')
                 LSTM(error_input, pre_hidden, pre_cell, \
                     seq_len, hidden_size, num_layers, \
                     dropout_prob=dropout_prob)
@@ -189,10 +189,9 @@ def test_input_type():
             self.assertRaises(TypeError, test_input_type)
 
             def test_pre_hidden_type():
-                error_pre_hidden = fluid.data(
-                    name='error_pre_hidden',
-                    shape=[None, hidden_size],
-                    dtype='int32')
+                error_pre_hidden = fluid.data(name='error_pre_hidden',
+                                              shape=[None, hidden_size],
+                                              dtype='int32')
                 LSTM(input, error_pre_hidden, pre_cell, \
                     seq_len, hidden_size, num_layers, \
                     dropout_prob=dropout_prob)
@@ -200,10 +199,9 @@ def test_pre_hidden_type():
             self.assertRaises(TypeError, test_pre_hidden_type)
 
             def test_pre_cell_type():
-                error_pre_cell = fluid.data(
-                    name='error_pre_cell',
-                    shape=[None, hidden_size],
-                    dtype='int32')
+                error_pre_cell = fluid.data(name='error_pre_cell',
+                                            shape=[None, hidden_size],
+                                            dtype='int32')
                 LSTM(input, pre_hidden, error_pre_cell, \
                     seq_len, hidden_size, num_layers, \
                     dropout_prob=dropout_prob)
@@ -212,6 +210,7 @@ def test_pre_cell_type():
 
 
 class TestLstmOp(OpTest):
+
     def set_is_test(self):
         self.is_test = False
 
@@ -286,28 +285,31 @@ def test_check_grad(self):
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            check_dygraph=False)
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'],
+                        max_relative_error=5e-4,
+                        check_dygraph=False)
 
 
 class TestLstmOpCase1(TestLstmOp):
+
     def set_lod(self):
         self.lod = [[0, 3, 2]]
 
 
 class TestLstmOpCase2(TestLstmOp):
+
     def set_lod(self):
         self.lod = [[0, 3, 0]]
 
 
 class TestLstmOpCase3(TestLstmOp):
+
     def set_lod(self):
         self.lod = [[2, 0, 4]]
 
 
 class TestLstmOpInference(TestLstmOp):
+
     def set_is_test(self):
         self.is_test = True
 
@@ -317,37 +319,43 @@ def test_check_grad(self):
 
 
 class TestLstmOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 input_data = np.random.random((1, 2048)).astype("float32")
-                fluid.layers.dynamic_lstm(
-                    input=input_data, size=2048, use_peepholes=False)
+                fluid.layers.dynamic_lstm(input=input_data,
+                                          size=2048,
+                                          use_peepholes=False)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_h_0():
-                in_data = fluid.data(
-                    name="input", shape=[None, 2048], dtype="float32")
+                in_data = fluid.data(name="input",
+                                     shape=[None, 2048],
+                                     dtype="float32")
                 h = fluid.data(name="h", shape=[None, 512], dtype="int32")
                 c = fluid.data(name="c", shape=[None, 512], dtype="float32")
-                fluid.layers.dynamic_lstm(
-                    input=in_data, size=2048, use_peepholes=False, h_0=h, c_0=c)
+                fluid.layers.dynamic_lstm(input=in_data,
+                                          size=2048,
+                                          use_peepholes=False,
+                                          h_0=h,
+                                          c_0=c)
 
             self.assertRaises(TypeError, test_h_0)
 
             def test_c_0():
-                in_data_ = fluid.data(
-                    name="input_", shape=[None, 2048], dtype="float32")
+                in_data_ = fluid.data(name="input_",
+                                      shape=[None, 2048],
+                                      dtype="float32")
                 h_ = fluid.data(name="h_", shape=[None, 512], dtype="float32")
                 c_ = fluid.data(name="c_", shape=[None, 512], dtype="int32")
-                fluid.layers.dynamic_lstm(
-                    input=in_data_,
-                    size=2048,
-                    use_peepholes=False,
-                    h_0=h_,
-                    c_0=c_)
+                fluid.layers.dynamic_lstm(input=in_data_,
+                                          size=2048,
+                                          use_peepholes=False,
+                                          h_0=h_,
+                                          c_0=c_)
 
             self.assertRaises(TypeError, test_c_0)
 
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
index c0875462e33f3..89a60e265619d 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
@@ -31,20 +31,20 @@ def tanh_np(x):
 
 
 class LstmUnitTestError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_size, dict_dim, emb_dim, hidden_dim = 32, 128, 64, 512
-            data = fluid.data(
-                name='step_data', shape=[batch_size], dtype='int64')
+            data = fluid.data(name='step_data',
+                              shape=[batch_size],
+                              dtype='int64')
             inputs = fluid.embedding(input=data, size=[dict_dim, emb_dim])
-            pre_hidden = fluid.data(
-                name='pre_hidden',
-                shape=[batch_size, hidden_dim],
-                dtype='float32')
-            pre_cell = fluid.data(
-                name='pre_cell',
-                shape=[batch_size, hidden_dim],
-                dtype='float32')
+            pre_hidden = fluid.data(name='pre_hidden',
+                                    shape=[batch_size, hidden_dim],
+                                    dtype='float32')
+            pre_cell = fluid.data(name='pre_cell',
+                                  shape=[batch_size, hidden_dim],
+                                  dtype='float32')
 
             np_input = np.random.uniform(
                 -0.1, 0.1, (batch_size, emb_dim)).astype('float64')
@@ -69,34 +69,32 @@ def test_pre_cell_Variable():
             self.assertRaises(TypeError, test_pre_cell_Variable)
 
             def test_input_type():
-                error_input = fluid.data(
-                    name='error_input',
-                    shape=[batch_size, emb_dim],
-                    dtype='int32')
+                error_input = fluid.data(name='error_input',
+                                         shape=[batch_size, emb_dim],
+                                         dtype='int32')
                 lstm_unit(error_input, pre_hidden, pre_cell)
 
             self.assertRaises(TypeError, test_input_type)
 
             def test_pre_hidden_type():
-                error_pre_hidden = fluid.data(
-                    name='error_pre_hidden',
-                    shape=[batch_size, hidden_dim],
-                    dtype='int32')
+                error_pre_hidden = fluid.data(name='error_pre_hidden',
+                                              shape=[batch_size, hidden_dim],
+                                              dtype='int32')
                 lstm_unit(inputs, error_pre_hidden, pre_cell)
 
             self.assertRaises(TypeError, test_pre_hidden_type)
 
             def test_pre_cell_type():
-                error_pre_cell = fluid.data(
-                    name='error_pre_cell',
-                    shape=[batch_size, hidden_dim],
-                    dtype='int32')
+                error_pre_cell = fluid.data(name='error_pre_cell',
+                                            shape=[batch_size, hidden_dim],
+                                            dtype='int32')
                 lstm_unit(inputs, pre_hidden, error_pre_cell)
 
             self.assertRaises(TypeError, test_pre_cell_type)
 
 
 class LstmUnitTest(OpTest):
+
     def setUp(self):
         self.op_type = "lstm_unit"
         x_np = np.random.normal(size=(15, 160)).astype("float64")
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
index 186504af08701..abd670079415d 100644
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
@@ -44,6 +44,7 @@ def lstmp(
         act_cell=None,
         act_cand=None,
         act_proj=None):
+
     def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
               act_cell, act_cand, act_proj):
         g = np.dot(r_pre, w_r)  # 1 x 4D
@@ -126,6 +127,7 @@ def _reverse(x, offset):
 
 
 class TestLstmpOp(LstmTest.TestLstmOp):
+
     def reset_argument(self):
         pass
 
@@ -196,13 +198,14 @@ def test_check_grad(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
-            numeric_grad_delta=0.0000005,
-            check_dygraph=False)
+        self.check_grad(['Input', 'Weight', 'ProjWeight', 'Bias'],
+                        ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        check_dygraph=False)
 
 
 class TestLstmpOpHasInitial(TestLstmpOp):
+
     def reset_argument(self):
         self.has_initial_state = True
 
@@ -213,11 +216,10 @@ def test_check_grad(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
-            ['Projection'],
-            numeric_grad_delta=0.0000005,
-            check_dygraph=False)
+        self.check_grad(['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
+                        ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        check_dygraph=False)
 
     def test_check_grad_ingore_bias(self):
         N = len(self.lod[0])
@@ -225,11 +227,10 @@ def test_check_grad_ingore_bias(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'ProjWeight', 'Weight'], ['Projection'],
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('Bias'),
-            check_dygraph=False)
+        self.check_grad(['Input', 'ProjWeight', 'Weight'], ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        no_grad_set=set('Bias'),
+                        check_dygraph=False)
 
     def test_check_grad_ingore_weight(self):
         N = len(self.lod[0])
@@ -237,11 +238,10 @@ def test_check_grad_ingore_weight(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'ProjWeight', 'Bias'], ['Projection'],
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('Weight'),
-            check_dygraph=False)
+        self.check_grad(['Input', 'ProjWeight', 'Bias'], ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        no_grad_set=set('Weight'),
+                        check_dygraph=False)
 
     def test_check_grad_ingore_proj_weight(self):
         N = len(self.lod[0])
@@ -249,11 +249,10 @@ def test_check_grad_ingore_proj_weight(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Projection'],
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('ProjWeight'),
-            check_dygraph=False)
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        no_grad_set=set('ProjWeight'),
+                        check_dygraph=False)
 
     def test_check_grad_ingore_input(self):
         N = len(self.lod[0])
@@ -261,11 +260,10 @@ def test_check_grad_ingore_input(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('Input'),
-            check_dygraph=False)
+        self.check_grad(['Weight', 'ProjWeight', 'Bias'], ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        no_grad_set=set('Input'),
+                        check_dygraph=False)
 
     def test_check_grad_ingore_h0(self):
         N = len(self.lod[0])
@@ -273,11 +271,11 @@ def test_check_grad_ingore_h0(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('H0'),
-            check_dygraph=False)
+        self.check_grad(['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'],
+                        ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        no_grad_set=set('H0'),
+                        check_dygraph=False)
 
     def test_check_grad_ingore_c0(self):
         N = len(self.lod[0])
@@ -285,88 +283,93 @@ def test_check_grad_ingore_c0(self):
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
-            numeric_grad_delta=0.0000005,
-            no_grad_set=set('C0'),
-            check_dygraph=False)
+        self.check_grad(['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'],
+                        ['Projection'],
+                        numeric_grad_delta=0.0000005,
+                        no_grad_set=set('C0'),
+                        check_dygraph=False)
 
 
 class TestLstmpOpRerverse(TestLstmpOp):
+
     def reset_argument(self):
         self.is_reverse = True
 
 
 class TestLstmpOpNotUsePeepholes(TestLstmpOp):
+
     def reset_argument(self):
         self.use_peepholes = False
 
 
 class TestLstmpOpLinearProjection(TestLstmpOp):
+
     def reset_argument(self):
         self.act_proj = 'identity'
 
 
 class TestLstmpOpLen0Case1(TestLstmpOp):
+
     def reset_argument(self):
         self.lod = [[0, 4, 0]]
 
 
 class TestLstmpOpLen0Case2(TestLstmpOp):
+
     def reset_argument(self):
         self.lod = [[2, 0, 3]]
 
 
 class TestLstmpOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             def test_Variable():
                 input_data = np.random.random((1, 2048)).astype("float32")
-                fluid.layers.dynamic_lstmp(
-                    input=input_data,
-                    size=2048,
-                    proj_size=256,
-                    use_peepholes=False,
-                    is_reverse=True,
-                    cell_activation="tanh",
-                    proj_activation="tanh")
+                fluid.layers.dynamic_lstmp(input=input_data,
+                                           size=2048,
+                                           proj_size=256,
+                                           use_peepholes=False,
+                                           is_reverse=True,
+                                           cell_activation="tanh",
+                                           proj_activation="tanh")
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_h_0():
-                in_data = fluid.data(
-                    name="input", shape=[None, 2048], dtype="float32")
+                in_data = fluid.data(name="input",
+                                     shape=[None, 2048],
+                                     dtype="float32")
                 h = fluid.data(name="h", shape=[None, 512], dtype="int32")
                 c = fluid.data(name="c", shape=[None, 512], dtype="float32")
-                fluid.layers.dynamic_lstmp(
-                    input=in_data,
-                    size=2048,
-                    proj_size=256,
-                    use_peepholes=False,
-                    is_reverse=True,
-                    cell_activation="tanh",
-                    proj_activation="tanh",
-                    h_0=h,
-                    c_0=c)
+                fluid.layers.dynamic_lstmp(input=in_data,
+                                           size=2048,
+                                           proj_size=256,
+                                           use_peepholes=False,
+                                           is_reverse=True,
+                                           cell_activation="tanh",
+                                           proj_activation="tanh",
+                                           h_0=h,
+                                           c_0=c)
 
             self.assertRaises(TypeError, test_h_0)
 
             def test_c_0():
-                in_data_ = fluid.data(
-                    name="input_", shape=[None, 2048], dtype="float32")
+                in_data_ = fluid.data(name="input_",
+                                      shape=[None, 2048],
+                                      dtype="float32")
                 h_ = fluid.data(name="h_", shape=[None, 512], dtype="float32")
                 c_ = fluid.data(name="c_", shape=[None, 512], dtype="int32")
-                fluid.layers.dynamic_lstmp(
-                    input=in_data_,
-                    size=2048,
-                    proj_size=256,
-                    use_peepholes=False,
-                    is_reverse=True,
-                    cell_activation="tanh",
-                    proj_activation="tanh",
-                    h_0=h_,
-                    c_0=c_)
+                fluid.layers.dynamic_lstmp(input=in_data_,
+                                           size=2048,
+                                           proj_size=256,
+                                           use_peepholes=False,
+                                           is_reverse=True,
+                                           cell_activation="tanh",
+                                           proj_activation="tanh",
+                                           h_0=h_,
+                                           c_0=c_)
 
             self.assertRaises(TypeError, test_c_0)
 
diff --git a/python/paddle/fluid/tests/unittests/test_lu_op.py b/python/paddle/fluid/tests/unittests/test_lu_op.py
index 1f1e3d1a2fb02..2989a0307400a 100644
--- a/python/paddle/fluid/tests/unittests/test_lu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lu_op.py
@@ -68,7 +68,9 @@ def Pmat_to_perm(Pmat_org, cut):
             sP[idx, :] = tmp
 
         permmat.append(permlst)
-    Pivot = np.array(permmat).reshape(list(shape[:-2]) + [rows, ]) + 1
+    Pivot = np.array(permmat).reshape(list(shape[:-2]) + [
+        rows,
+    ]) + 1
     return Pivot[..., :cut]
 
 
@@ -111,18 +113,18 @@ def set_output(self):
         lshape = np.array(sL.shape)
         ushape = np.array(sU.shape)
 
-        lpad = (len(sL.shape) - 2) * [(0, 0)] + list((
-            (0, (ashape - lshape)[-2]), (0, (ashape - lshape)[-1])))
-        upad = (len(sU.shape) - 2) * [(0, 0)] + list((
-            (0, (ashape - ushape)[-2]), (0, (ashape - ushape)[-1])))
+        lpad = (len(sL.shape) - 2) * [(0, 0)] + list(
+            ((0, (ashape - lshape)[-2]), (0, (ashape - lshape)[-1])))
+        upad = (len(sU.shape) - 2) * [(0, 0)] + list(
+            ((0, (ashape - ushape)[-2]), (0, (ashape - ushape)[-1])))
 
         NsL = np.pad(sL, lpad)
         NsU = np.pad(sU, upad)
         NLU = NsL + NsU
         self.output = NLU
         self.Pivots = Pmat_to_perm(sP, min(ashape[-2], ashape[-1]))
-        self.Infos = np.zeros(self.x_shape[:-2]) if len(
-            X.shape) > 2 else np.array([0])
+        self.Infos = np.zeros(
+            self.x_shape[:-2]) if len(X.shape) > 2 else np.array([0])
 
     def setUp(self):
         self.op_type = "lu"
@@ -171,7 +173,9 @@ def config(self):
 
 
 class TestLUAPI(unittest.TestCase):
+
     def test_dygraph(self):
+
         def run_lu_dygraph(shape, dtype):
             if dtype == "float32":
                 np_dtype = np.float32
@@ -246,17 +250,20 @@ def run_lu_static(shape, dtype):
                     lshape = np.array(sL.shape)
                     ushape = np.array(sU.shape)
 
-                    lpad = (len(sL.shape) - 2) * [(0, 0)] + list((
-                        (0, (ashape - lshape)[-2]), (0, (ashape - lshape)[-1])))
-                    upad = (len(sU.shape) - 2) * [(0, 0)] + list((
-                        (0, (ashape - ushape)[-2]), (0, (ashape - ushape)[-1])))
+                    lpad = (len(sL.shape) - 2) * [(0, 0)] + list(
+                        ((0, (ashape - lshape)[-2]), (0,
+                                                      (ashape - lshape)[-1])))
+                    upad = (len(sU.shape) - 2) * [(0, 0)] + list(
+                        ((0, (ashape - ushape)[-2]), (0,
+                                                      (ashape - ushape)[-1])))
 
                     NsL = np.pad(sL, lpad)
                     NsU = np.pad(sU, upad)
                     NLU = NsL + NsU
 
-                    x = paddle.fluid.data(
-                        name="input", shape=shape, dtype=dtype)
+                    x = paddle.fluid.data(name="input",
+                                          shape=shape,
+                                          dtype=dtype)
                     lu, p = paddle.linalg.lu(x, pivot=pivot)
                     exe = fluid.Executor(place)
                     fetches = exe.run(fluid.default_main_program(),
diff --git a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py
index 0aff38cb78543..1757adef8e36f 100644
--- a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py
@@ -74,7 +74,9 @@ def Pmat_to_perm(Pmat_org, cut):
             sP[idx, :] = tmp
 
         permmat.append(permlst)
-    Pivot = np.array(permmat).reshape(list(shape[:-2]) + [rows, ]) + 1
+    Pivot = np.array(permmat).reshape(list(shape[:-2]) + [
+        rows,
+    ]) + 1
 
     return Pivot[..., :cut]
 
@@ -130,8 +132,9 @@ def setUp(self):
                 place = fluid.CPUPlace()
                 if core.is_compiled_with_cuda():
                     place = fluid.CUDAPlace(0)
-                xv = paddle.fluid.data(
-                    name="input", shape=self.x_shape, dtype=self.dtype)
+                xv = paddle.fluid.data(name="input",
+                                       shape=self.x_shape,
+                                       dtype=self.dtype)
                 lu, p = paddle.linalg.lu(xv)
                 exe = fluid.Executor(place)
                 fetches = exe.run(fluid.default_main_program(),
@@ -186,7 +189,9 @@ def config(self):
 
 
 class TestLU_UnpackAPI(unittest.TestCase):
+
     def test_dygraph(self):
+
         def run_lu_unpack_dygraph(shape, dtype):
             if dtype == "float32":
                 np_dtype = np.float32
@@ -247,8 +252,9 @@ def run_lu_static(shape, dtype):
                 with fluid.program_guard(fluid.Program(), fluid.Program()):
                     sP, sL, sU = scipy_lu_unpack(a)
 
-                    x = paddle.fluid.data(
-                        name="input", shape=shape, dtype=dtype)
+                    x = paddle.fluid.data(name="input",
+                                          shape=shape,
+                                          dtype=dtype)
                     lu, p = paddle.linalg.lu(x)
                     pP, pL, pU = paddle.linalg.lu_unpack(lu, p)
                     exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_manual_seed.py b/python/paddle/fluid/tests/unittests/test_manual_seed.py
index 75753dcd1e880..e42487df79a9f 100644
--- a/python/paddle/fluid/tests/unittests/test_manual_seed.py
+++ b/python/paddle/fluid/tests/unittests/test_manual_seed.py
@@ -23,6 +23,7 @@
 
 
 class TestManualSeed(unittest.TestCase):
+
     def test_seed(self):
         fluid.enable_dygraph()
 
diff --git a/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py
index 2b511b9eb442b..c337736c88144 100644
--- a/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py
@@ -69,6 +69,7 @@ def margin_cross_entropy(logits,
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOp(OpTest):
+
     def initParams(self):
         self.op_type = "margin_cross_entropy"
         self.axis = -1
@@ -99,8 +100,9 @@ def setUp(self):
             np.sum(np.square(weights), axis=0, keepdims=True))
         logits = np.matmul(datas, weights)
 
-        labels = np.random.randint(
-            0, self.num_class, (self.batch_dim, ), dtype="int64")
+        labels = np.random.randint(0,
+                                   self.num_class, (self.batch_dim, ),
+                                   dtype="int64")
 
         loss, softmax = margin_cross_entropy(logits, labels, self.axis,
                                              self.margin1, self.margin2,
@@ -128,20 +130,21 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpFP32(TestMarginCrossEntropyOp):
+
     def init_dtype(self):
         self.dtype = np.float32
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            core.CUDAPlace(0), ["Logits"],
-            "Loss",
-            numeric_grad_delta=5e-2,
-            max_relative_error=5e-2)
+        self.check_grad_with_place(core.CUDAPlace(0), ["Logits"],
+                                   "Loss",
+                                   numeric_grad_delta=5e-2,
+                                   max_relative_error=5e-2)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpFP16(TestMarginCrossEntropyOp):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -149,16 +152,16 @@ def test_check_output(self):
         self.check_output_with_place(core.CUDAPlace(0), atol=5e-2)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            core.CUDAPlace(0), ["Logits"],
-            "Loss",
-            numeric_grad_delta=6e-1,
-            max_relative_error=6e-1)
+        self.check_grad_with_place(core.CUDAPlace(0), ["Logits"],
+                                   "Loss",
+                                   numeric_grad_delta=6e-1,
+                                   max_relative_error=6e-1)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpCosFace(TestMarginCrossEntropyOp):
+
     def init_loss_params(self):
         self.margin1 = 1.0
         self.margin2 = 0.0
@@ -169,6 +172,7 @@ def init_loss_params(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpSphereFace(TestMarginCrossEntropyOp):
+
     def init_loss_params(self):
         self.margin1 = 1.35
         self.margin2 = 0.0
@@ -177,6 +181,7 @@ def init_loss_params(self):
 
 
 class TestMarginCrossEntropyOpCPU(TestMarginCrossEntropyOp):
+
     def test_check_output(self):
         try:
             self.check_output_with_place(core.CPUPlace(), atol=1e-5)
@@ -193,6 +198,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpV2(unittest.TestCase):
+
     def setUp(self):
         self.initParams()
         np.random.seed(self.seed)
@@ -239,19 +245,22 @@ def check_static_result(self, place):
                 np.sum(np.square(weights), axis=0, keepdims=True))
 
             logits_np = np.matmul(datas, weights)
-            labels_np = np.random.randint(
-                0, self.num_class, (self.batch_dim, ), dtype="int64")
-
-            loss_np, softmax_np = margin_cross_entropy(
-                logits_np, labels_np, self.axis, self.margin1, self.margin2,
-                self.margin3, self.scale, self.reduction)
-
-            logits = paddle.static.data(
-                name='logits',
-                shape=[self.batch_dim, self.num_class],
-                dtype=self.dtype)
-            label = paddle.static.data(
-                name='label', shape=[self.batch_dim], dtype="int64")
+            labels_np = np.random.randint(0,
+                                          self.num_class, (self.batch_dim, ),
+                                          dtype="int64")
+
+            loss_np, softmax_np = margin_cross_entropy(logits_np, labels_np,
+                                                       self.axis, self.margin1,
+                                                       self.margin2,
+                                                       self.margin3, self.scale,
+                                                       self.reduction)
+
+            logits = paddle.static.data(name='logits',
+                                        shape=[self.batch_dim, self.num_class],
+                                        dtype=self.dtype)
+            label = paddle.static.data(name='label',
+                                       shape=[self.batch_dim],
+                                       dtype="int64")
             loss, softmax = paddle.nn.functional.margin_cross_entropy(
                 logits,
                 label,
@@ -263,11 +272,13 @@ def check_static_result(self, place):
                 reduction=self.reduction)
 
             exe = paddle.fluid.Executor(place)
-            [loss_res, softmax_res] = exe.run(
-                paddle.fluid.default_main_program(),
-                feed={'logits': logits_np,
-                      'label': labels_np},
-                fetch_list=[loss, softmax])
+            [loss_res,
+             softmax_res] = exe.run(paddle.fluid.default_main_program(),
+                                    feed={
+                                        'logits': logits_np,
+                                        'label': labels_np
+                                    },
+                                    fetch_list=[loss, softmax])
             np.testing.assert_allclose(loss_res, loss_np)
             np.testing.assert_allclose(softmax_res, softmax_np)
 
@@ -287,12 +298,15 @@ def check_dynamic_result(self, place):
                 np.sum(np.square(weights), axis=0, keepdims=True))
 
             logits_np = np.matmul(datas, weights)
-            labels_np = np.random.randint(
-                0, self.num_class, (self.batch_dim, ), dtype="int64")
+            labels_np = np.random.randint(0,
+                                          self.num_class, (self.batch_dim, ),
+                                          dtype="int64")
 
-            loss_np, softmax_np = margin_cross_entropy(
-                logits_np, labels_np, self.axis, self.margin1, self.margin2,
-                self.margin3, self.scale, self.reduction)
+            loss_np, softmax_np = margin_cross_entropy(logits_np, labels_np,
+                                                       self.axis, self.margin1,
+                                                       self.margin2,
+                                                       self.margin3, self.scale,
+                                                       self.reduction)
 
             logits = paddle.to_tensor(logits_np, dtype=self.dtype)
             labels = paddle.to_tensor(labels_np, dtype="int64")
@@ -316,6 +330,7 @@ def check_dynamic_result(self, place):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpV3(TestMarginCrossEntropyOpV2):
+
     def init_reduction(self):
         self.reduction = 'mean'
 
@@ -323,6 +338,7 @@ def init_reduction(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpV4(TestMarginCrossEntropyOpV2):
+
     def init_reduction(self):
         self.reduction = 'sum'
 
@@ -330,6 +346,7 @@ def init_reduction(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestMarginCrossEntropyOpAPIError(unittest.TestCase):
+
     def setUp(self):
         self.initParams()
         np.random.seed(self.seed)
@@ -357,11 +374,14 @@ def init_dtype(self):
         self.dtype = np.float64
 
     def test_dynamic_errors(self):
+
         def test_dim():
             for place in self.places:
                 with paddle.fluid.dygraph.guard(place):
-                    labels_np = np.random.randint(
-                        0, self.num_class, (self.batch_dim, 2), dtype="int64")
+                    labels_np = np.random.randint(0,
+                                                  self.num_class,
+                                                  (self.batch_dim, 2),
+                                                  dtype="int64")
                     logits_np = np.random.uniform(
                         -0.99, 0.99,
                         [self.batch_dim, self.num_class]).astype(self.dtype)
@@ -381,9 +401,9 @@ def test_dim():
         def test_label_type():
             for place in self.places:
                 with paddle.fluid.dygraph.guard(place):
-                    labels_np = np.random.uniform(
-                        0, self.num_class,
-                        (self.batch_dim, 1)).astype(self.dtype)
+                    labels_np = np.random.uniform(0, self.num_class,
+                                                  (self.batch_dim, 1)).astype(
+                                                      self.dtype)
                     logits_np = np.random.uniform(
                         -0.99, 0.99,
                         [self.batch_dim, self.num_class]).astype(self.dtype)
@@ -403,8 +423,10 @@ def test_label_type():
         def test_group_value():
             for place in self.places:
                 with paddle.fluid.dygraph.guard(place):
-                    labels_np = np.random.randint(
-                        0, self.num_class, (self.batch_dim, ), dtype="int64")
+                    labels_np = np.random.randint(0,
+                                                  self.num_class,
+                                                  (self.batch_dim, ),
+                                                  dtype="int64")
                     logits_np = np.random.uniform(
                         -0.99, 0.99,
                         [self.batch_dim, self.num_class]).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
index 86fda635baa99..e1ae71a9d7ac7 100644
--- a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
@@ -21,6 +21,7 @@
 
 
 class TestMarginRankLossOp(OpTest):
+
     def setUp(self):
         self.op_type = "margin_rank_loss"
         batch_size = 5
@@ -53,6 +54,7 @@ def test_check_grad_ignore_x2(self):
 
 
 class TestMarginRankLossLayer(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 5
         self.margin = 0.5
@@ -86,12 +88,13 @@ def check_identity(self, place):
 
         exe = fluid.Executor(place)
         exe.run(start)
-        out_np, = exe.run(
-            main,
-            feed={"label": self.label,
-                  "x1": self.x1,
-                  "x2": self.x2},
-            fetch_list=[out])
+        out_np, = exe.run(main,
+                          feed={
+                              "label": self.label,
+                              "x1": self.x1,
+                              "x2": self.x2
+                          },
+                          fetch_list=[out])
         np.testing.assert_allclose(out_np, self.loss)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_marker_op.py b/python/paddle/fluid/tests/unittests/test_marker_op.py
index 3f9f8c7d6bc8c..cdf132b72b1ba 100644
--- a/python/paddle/fluid/tests/unittests/test_marker_op.py
+++ b/python/paddle/fluid/tests/unittests/test_marker_op.py
@@ -18,6 +18,7 @@
 
 
 class TestMarkerOp(OpTest):
+
     def setUp(self):
         self.op_type = "marker"
         self.inputs = {}
diff --git a/python/paddle/fluid/tests/unittests/test_masked_select_op.py b/python/paddle/fluid/tests/unittests/test_masked_select_op.py
index 764f4806ba4ba..2bd2a8f4549a0 100644
--- a/python/paddle/fluid/tests/unittests/test_masked_select_op.py
+++ b/python/paddle/fluid/tests/unittests/test_masked_select_op.py
@@ -30,6 +30,7 @@ def np_masked_select(x, mask):
 
 
 class TestMaskedSelectOp(OpTest):
+
     def setUp(self):
         self.init()
         self.op_type = "masked_select"
@@ -51,16 +52,19 @@ def init(self):
 
 
 class TestMaskedSelectOp1(TestMaskedSelectOp):
+
     def init(self):
         self.shape = (6, 8, 9, 18)
 
 
 class TestMaskedSelectOp2(TestMaskedSelectOp):
+
     def init(self):
         self.shape = (168, )
 
 
 class TestMaskedSelectAPI(unittest.TestCase):
+
     def test_imperative_mode(self):
         paddle.disable_static()
         shape = (88, 6, 8)
@@ -86,13 +90,16 @@ def test_static_mode(self):
         exe = paddle.static.Executor(place=paddle.CPUPlace())
 
         res = exe.run(paddle.static.default_main_program(),
-                      feed={"x": np_x,
-                            "mask": np_mask},
+                      feed={
+                          "x": np_x,
+                          "mask": np_mask
+                      },
                       fetch_list=[out])
         self.assertEqual(np.allclose(res, np_out), True)
 
 
 class TestMaskedSelectError(unittest.TestCase):
+
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -100,8 +107,9 @@ def test_error(self):
             shape = [8, 9, 6]
             x = paddle.fluid.data(shape=shape, dtype='float32', name='x')
             mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask')
-            mask_float = paddle.fluid.data(
-                shape=shape, dtype='float32', name='mask_float')
+            mask_float = paddle.fluid.data(shape=shape,
+                                           dtype='float32',
+                                           name='mask_float')
             np_x = np.random.random(shape).astype('float32')
             np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
 
diff --git a/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py b/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py
index 5784d3b5d7491..3c41ad0f93de6 100644
--- a/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py
@@ -21,6 +21,7 @@
 
 
 class TestMatchMatrixTensorOp(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.set_data()
@@ -78,6 +79,7 @@ def test_check_grad(self):
 
 
 class TestMatchMatrixTensorOpCase1(TestMatchMatrixTensorOp):
+
     def set_data(self):
         ix, iy, h, dim_t = [5, 8, 25, 4]
         x_lod = [[5]]
@@ -86,6 +88,7 @@ def set_data(self):
 
 
 class TestMatchMatrixTensorOpCase2(TestMatchMatrixTensorOp):
+
     def set_data(self):
         ix, iy, h, dim_t = [105, 120, 1, 4]
         x_lod = [[30, 45, 30]]
@@ -94,6 +97,7 @@ def set_data(self):
 
 
 class TestMatchMatrixTensorOpCase3(TestMatchMatrixTensorOp):
+
     def set_data(self):
         ix, iy, h, dim_t = [5, 9, 32, 1]
         x_lod = [[1, 2, 2]]
@@ -102,6 +106,7 @@ def set_data(self):
 
 
 class TestMatchMatrixTensorOpCase4(TestMatchMatrixTensorOp):
+
     def set_data(self):
         ix, iy, h, dim_t = [8, 12, 16, 5]
         x_lod = [[1, 2, 3, 1, 1]]
@@ -111,8 +116,9 @@ def set_data(self):
     def test_api(self):
         x_lod_tensor = fluid.layers.data(name='x', shape=[10], lod_level=1)
         y_lod_tensor = fluid.layers.data(name='y', shape=[10], lod_level=1)
-        out, out_tmp = fluid.contrib.match_matrix_tensor(
-            x=x_lod_tensor, y=y_lod_tensor, channel_num=3)
+        out, out_tmp = fluid.contrib.match_matrix_tensor(x=x_lod_tensor,
+                                                         y=y_lod_tensor,
+                                                         channel_num=3)
 
         place = fluid.CPUPlace()
         x_data = np.random.rand(7, 10).astype('float32')
@@ -122,8 +128,10 @@ def test_api(self):
 
         exe = fluid.Executor(place=place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x,
-                            'y': y},
+        ret = exe.run(feed={
+            'x': x,
+            'y': y
+        },
                       fetch_list=[out],
                       return_numpy=False)
 
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
index 258543631f970..9dd47647a1a24 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -23,6 +23,7 @@
 
 
 class TestMathOpPatches(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -141,8 +142,10 @@ def test_div_two_tensor(self):
         a_np = numpy.random.random(size=[10, 1]).astype('float32')
         b_np = numpy.random.random(size=[10, 1]).astype('float32') + 1e-2
         c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
+                       feed={
+                           "a": a_np,
+                           'b': b_np
+                       },
                        fetch_list=[c])
         self.assertTrue(numpy.allclose(a_np / b_np, c_np))
 
@@ -156,8 +159,10 @@ def test_mul_two_tensor(self):
         a_np = numpy.random.random(size=[10, 1]).astype('float32')
         b_np = numpy.random.random(size=[10, 1]).astype('float32')
         c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
+                       feed={
+                           "a": a_np,
+                           'b': b_np
+                       },
                        fetch_list=[c])
         self.assertTrue(numpy.allclose(a_np * b_np, c_np))
 
@@ -171,8 +176,10 @@ def test_add_two_tensor(self):
         a_np = numpy.random.random(size=[10, 1]).astype('float32')
         b_np = numpy.random.random(size=[10, 1]).astype('float32')
         c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
+                       feed={
+                           "a": a_np,
+                           'b': b_np
+                       },
                        fetch_list=[c])
         self.assertTrue(numpy.allclose(a_np + b_np, c_np))
 
@@ -186,8 +193,10 @@ def test_sub_two_tensor(self):
         a_np = numpy.random.random(size=[10, 1]).astype('float32')
         b_np = numpy.random.random(size=[10, 1]).astype('float32')
         c_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np,
-                             'b': b_np},
+                       feed={
+                           "a": a_np,
+                           'b': b_np
+                       },
                        fetch_list=[c])
         self.assertTrue(numpy.allclose(a_np - b_np, c_np))
 
@@ -217,8 +226,10 @@ def test_equal(self):
         b_np = numpy.array([3, 4, 11, 15, 8, 18]).astype('float32')
 
         c_np, = exe.run(fluid.default_main_program(),
-                        feed={"a": a_np,
-                              "b": b_np},
+                        feed={
+                            "a": a_np,
+                            "b": b_np
+                        },
                         fetch_list=[c])
 
         self.assertTrue(numpy.array_equal(c_np, a_np == b_np))
@@ -239,8 +250,10 @@ def test_equal_and_cond(self):
         a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('float')
         b_np = numpy.array([3, 4, 11, 15, 8, 18]).astype('float')
         c_np, = exe.run(fluid.default_main_program(),
-                        feed={"a": a_np,
-                              "b": b_np},
+                        feed={
+                            "a": a_np,
+                            "b": b_np
+                        },
                         fetch_list=[c])
 
         self.assertTrue(numpy.array_equal(c_np, a_np - b_np))
@@ -282,8 +295,10 @@ def test_bitwise_and(self):
 
         exe = fluid.Executor()
         out = exe.run(fluid.default_main_program(),
-                      feed={"x": x_np,
-                            "y": y_np},
+                      feed={
+                          "x": x_np,
+                          "y": y_np
+                      },
                       fetch_list=[z])
         self.assertTrue(np.array_equal(out[0], out_np))
 
@@ -299,8 +314,10 @@ def test_bitwise_or(self):
 
         exe = fluid.Executor()
         out = exe.run(fluid.default_main_program(),
-                      feed={"x": x_np,
-                            "y": y_np},
+                      feed={
+                          "x": x_np,
+                          "y": y_np
+                      },
                       fetch_list=[z])
         self.assertTrue(np.array_equal(out[0], out_np))
 
@@ -316,8 +333,10 @@ def test_bitwise_xor(self):
 
         exe = fluid.Executor()
         out = exe.run(fluid.default_main_program(),
-                      feed={"x": x_np,
-                            "y": y_np},
+                      feed={
+                          "x": x_np,
+                          "y": y_np
+                      },
                       fetch_list=[z])
         self.assertTrue(np.array_equal(out[0], out_np))
 
@@ -360,16 +379,18 @@ def test_ndim(self):
     def test_matmul(self):
         a = paddle.static.data(name='a', shape=[2, 3], dtype='float32')
         b = paddle.static.data(name='b', shape=[3, 5], dtype='float32')
-        c = a @b  # __matmul__
+        c = a @ b  # __matmul__
         a_np = numpy.random.uniform(-1, 1, size=[2, 3]).astype('float32')
         b_np = numpy.random.uniform(-1, 1, size=[3, 5]).astype('float32')
         place = paddle.CPUPlace()
         exe = paddle.static.Executor(place)
         c_np = exe.run(paddle.static.default_main_program(),
-                       feed={"a": a_np,
-                             "b": b_np},
+                       feed={
+                           "a": a_np,
+                           "b": b_np
+                       },
                        fetch_list=[c])
-        self.assertTrue(numpy.allclose(a_np @b_np, c_np))
+        self.assertTrue(numpy.allclose(a_np @ b_np, c_np))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index 48aa530ff87f9..92fa9049dab54 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -23,6 +23,7 @@
 
 
 class TestMathOpPatchesVarBase(unittest.TestCase):
+
     def setUp(self):
         self.shape = [10, 1024]
         self.dtype = np.float32
@@ -388,12 +389,10 @@ def func_test_np_left_mul(self):
             y = t * x
 
             self.assertTrue(
-                np.allclose(
-                    y.numpy(),
-                    t * np.ones(
-                        (2, 2), dtype="float32"),
-                    rtol=1e-05,
-                    atol=0.0))
+                np.allclose(y.numpy(),
+                            t * np.ones((2, 2), dtype="float32"),
+                            rtol=1e-05,
+                            atol=0.0))
 
     def test_np_left_mul(self):
         with _test_eager_guard():
@@ -482,50 +481,57 @@ def func_test_tensor_patch_method(self):
         self.assertEqual(x.size, 6)
         self.assertEqual(x.numel(), 6)
         self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.tanh().numpy(), paddle.tanh(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.atan().numpy(), paddle.atan(x).numpy()))
+        self.assertTrue(np.array_equal(x.tanh().numpy(),
+                                       paddle.tanh(x).numpy()))
+        self.assertTrue(np.array_equal(x.atan().numpy(),
+                                       paddle.atan(x).numpy()))
         self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy()))
         m = x.abs()
+        self.assertTrue(np.array_equal(m.sqrt().numpy(),
+                                       paddle.sqrt(m).numpy()))
         self.assertTrue(
-            np.array_equal(m.sqrt().numpy(), paddle.sqrt(m).numpy()))
-        self.assertTrue(
-            np.array_equal(m.rsqrt().numpy(), paddle.rsqrt(m).numpy()))
+            np.array_equal(m.rsqrt().numpy(),
+                           paddle.rsqrt(m).numpy()))
+        self.assertTrue(np.array_equal(x.ceil().numpy(),
+                                       paddle.ceil(x).numpy()))
         self.assertTrue(
-            np.array_equal(x.ceil().numpy(), paddle.ceil(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.floor().numpy(), paddle.floor(x).numpy()))
+            np.array_equal(x.floor().numpy(),
+                           paddle.floor(x).numpy()))
         self.assertTrue(np.array_equal(x.cos().numpy(), paddle.cos(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.acos().numpy(), paddle.acos(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.asin().numpy(), paddle.asin(x).numpy()))
+        self.assertTrue(np.array_equal(x.acos().numpy(),
+                                       paddle.acos(x).numpy()))
+        self.assertTrue(np.array_equal(x.asin().numpy(),
+                                       paddle.asin(x).numpy()))
         self.assertTrue(np.array_equal(x.sin().numpy(), paddle.sin(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.sinh().numpy(), paddle.sinh(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.cosh().numpy(), paddle.cosh(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.round().numpy(), paddle.round(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.reciprocal().numpy(), paddle.reciprocal(x).numpy(
-            )))
-        self.assertTrue(
-            np.array_equal(x.square().numpy(), paddle.square(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy()))
-        self.assertTrue(
-            np.array_equal(x.asinh().numpy(), paddle.asinh(x).numpy()))
+        self.assertTrue(np.array_equal(x.sinh().numpy(),
+                                       paddle.sinh(x).numpy()))
+        self.assertTrue(np.array_equal(x.cosh().numpy(),
+                                       paddle.cosh(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.round().numpy(),
+                           paddle.round(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.reciprocal().numpy(),
+                           paddle.reciprocal(x).numpy()))
+        self.assertTrue(
+            np.array_equal(x.square().numpy(),
+                           paddle.square(x).numpy()))
+        self.assertTrue(np.array_equal(x.rank().numpy(),
+                                       paddle.rank(x).numpy()))
+        self.assertTrue(np.array_equal(x[0].t().numpy(),
+                                       paddle.t(x[0]).numpy()))
+        self.assertTrue(
+            np.array_equal(x.asinh().numpy(),
+                           paddle.asinh(x).numpy()))
         ### acosh(x) = nan, need to change input
         t_np = np.random.uniform(1, 2, [2, 3]).astype(self.dtype)
         t = paddle.to_tensor(t_np)
         self.assertTrue(
-            np.array_equal(t.acosh().numpy(), paddle.acosh(t).numpy()))
+            np.array_equal(t.acosh().numpy(),
+                           paddle.acosh(t).numpy()))
         self.assertTrue(
-            np.array_equal(x.atanh().numpy(), paddle.atanh(x).numpy()))
+            np.array_equal(x.atanh().numpy(),
+                           paddle.atanh(x).numpy()))
         d = paddle.to_tensor([[1.2285208, 1.3491015, 1.4899898],
                               [1.30058, 1.0688717, 1.4928783],
                               [1.0958099, 1.3724753, 1.8926544]])
@@ -533,62 +539,74 @@ def func_test_tensor_patch_method(self):
         # ROCM not support cholesky
         if not fluid.core.is_compiled_with_rocm():
             self.assertTrue(
-                np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy(
-                )))
+                np.array_equal(d.cholesky().numpy(),
+                               paddle.cholesky(d).numpy()))
 
         self.assertTrue(
-            np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
+            np.array_equal(x.is_empty().numpy(),
+                           paddle.is_empty(x).numpy()))
         self.assertTrue(
-            np.array_equal(x.isfinite().numpy(), paddle.isfinite(x).numpy()))
+            np.array_equal(x.isfinite().numpy(),
+                           paddle.isfinite(x).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.cast('int32').numpy(), paddle.cast(x, 'int32').numpy()))
+                x.cast('int32').numpy(),
+                paddle.cast(x, 'int32').numpy()))
         self.assertTrue(
             np.array_equal(
                 x.expand([3, 2, 3]).numpy(),
                 paddle.expand(x, [3, 2, 3]).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.tile([2, 2]).numpy(), paddle.tile(x, [2, 2]).numpy()))
+                x.tile([2, 2]).numpy(),
+                paddle.tile(x, [2, 2]).numpy()))
         self.assertTrue(
-            np.array_equal(x.flatten().numpy(), paddle.flatten(x).numpy()))
+            np.array_equal(x.flatten().numpy(),
+                           paddle.flatten(x).numpy()))
         index = paddle.to_tensor([0, 1])
         self.assertTrue(
             np.array_equal(
-                x.gather(index).numpy(), paddle.gather(x, index).numpy()))
+                x.gather(index).numpy(),
+                paddle.gather(x, index).numpy()))
         index = paddle.to_tensor([[0, 1], [1, 2]])
         self.assertTrue(
             np.array_equal(
-                x.gather_nd(index).numpy(), paddle.gather_nd(x, index).numpy()))
+                x.gather_nd(index).numpy(),
+                paddle.gather_nd(x, index).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.reverse([0, 1]).numpy(), paddle.reverse(x, [0, 1]).numpy()))
+                x.reverse([0, 1]).numpy(),
+                paddle.reverse(x, [0, 1]).numpy()))
         self.assertTrue(
             np.array_equal(
-                a.reshape([3, 2]).numpy(), paddle.reshape(a, [3, 2]).numpy()))
+                a.reshape([3, 2]).numpy(),
+                paddle.reshape(a, [3, 2]).numpy()))
         self.assertTrue(
             np.array_equal(
                 x.slice([0, 1], [0, 0], [1, 2]).numpy(),
                 paddle.slice(x, [0, 1], [0, 0], [1, 2]).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.split(2)[0].numpy(), paddle.split(x, 2)[0].numpy()))
+                x.split(2)[0].numpy(),
+                paddle.split(x, 2)[0].numpy()))
         m = paddle.to_tensor(
             np.random.uniform(-1, 1, [1, 6, 1, 1]).astype(self.dtype))
         self.assertTrue(
             np.array_equal(
-                m.squeeze([]).numpy(), paddle.squeeze(m, []).numpy()))
+                m.squeeze([]).numpy(),
+                paddle.squeeze(m, []).numpy()))
         self.assertTrue(
             np.array_equal(
-                m.squeeze([1, 2]).numpy(), paddle.squeeze(m, [1, 2]).numpy()))
+                m.squeeze([1, 2]).numpy(),
+                paddle.squeeze(m, [1, 2]).numpy()))
         m = paddle.to_tensor([2, 3, 3, 1, 5, 3], 'float32')
         self.assertTrue(
-            np.array_equal(m.unique()[0].numpy(), paddle.unique(m)[0].numpy()))
+            np.array_equal(m.unique()[0].numpy(),
+                           paddle.unique(m)[0].numpy()))
         self.assertTrue(
             np.array_equal(
                 m.unique(return_counts=True)[1],
-                paddle.unique(
-                    m, return_counts=True)[1]))
+                paddle.unique(m, return_counts=True)[1]))
         self.assertTrue(np.array_equal(x.flip([0]), paddle.flip(x, [0])))
         self.assertTrue(np.array_equal(x.unbind(0), paddle.unbind(x, 0)))
         self.assertTrue(np.array_equal(x.roll(1), paddle.roll(x, 1)))
@@ -602,56 +620,67 @@ def func_test_tensor_patch_method(self):
 
         # 2. Binary operation
         self.assertTrue(
-            np.array_equal(x.divide(y).numpy(), paddle.divide(x, y).numpy()))
+            np.array_equal(x.divide(y).numpy(),
+                           paddle.divide(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
                 x.matmul(y, True, False).numpy(),
                 paddle.matmul(x, y, True, False).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.norm(
-                    p='fro', axis=[0, 1]).numpy(),
-                paddle.norm(
-                    x, p='fro', axis=[0, 1]).numpy()))
+                x.norm(p='fro', axis=[0, 1]).numpy(),
+                paddle.norm(x, p='fro', axis=[0, 1]).numpy()))
         self.assertTrue(
-            np.array_equal(x.dist(y).numpy(), paddle.dist(x, y).numpy()))
+            np.array_equal(x.dist(y).numpy(),
+                           paddle.dist(x, y).numpy()))
         self.assertTrue(
-            np.array_equal(x.cross(y).numpy(), paddle.cross(x, y).numpy()))
+            np.array_equal(x.cross(y).numpy(),
+                           paddle.cross(x, y).numpy()))
         m = x.expand([2, 2, 3])
         n = y.expand([2, 2, 3]).transpose([0, 2, 1])
         self.assertTrue(
-            np.array_equal(m.bmm(n).numpy(), paddle.bmm(m, n).numpy()))
+            np.array_equal(m.bmm(n).numpy(),
+                           paddle.bmm(m, n).numpy()))
         self.assertTrue(
             np.array_equal(
                 x.histogram(5, -1, 1).numpy(),
                 paddle.histogram(x, 5, -1, 1).numpy()))
         self.assertTrue(
-            np.array_equal(x.equal(y).numpy(), paddle.equal(x, y).numpy()))
+            np.array_equal(x.equal(y).numpy(),
+                           paddle.equal(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.greater_equal(y).numpy(), paddle.greater_equal(x, y).numpy()))
+                x.greater_equal(y).numpy(),
+                paddle.greater_equal(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.greater_than(y).numpy(), paddle.greater_than(x, y).numpy()))
+                x.greater_than(y).numpy(),
+                paddle.greater_than(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.less_equal(y).numpy(), paddle.less_equal(x, y).numpy()))
+                x.less_equal(y).numpy(),
+                paddle.less_equal(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.less_than(y).numpy(), paddle.less_than(x, y).numpy()))
+                x.less_than(y).numpy(),
+                paddle.less_than(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.not_equal(y).numpy(), paddle.not_equal(x, y).numpy()))
+                x.not_equal(y).numpy(),
+                paddle.not_equal(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.equal_all(y).numpy(), paddle.equal_all(x, y).numpy()))
+                x.equal_all(y).numpy(),
+                paddle.equal_all(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.allclose(y).numpy(), paddle.allclose(x, y).numpy()))
+                x.allclose(y).numpy(),
+                paddle.allclose(x, y).numpy()))
         m = x.expand([2, 2, 3])
         self.assertTrue(
             np.array_equal(
-                x.expand_as(m).numpy(), paddle.expand_as(x, m).numpy()))
+                x.expand_as(m).numpy(),
+                paddle.expand_as(x, m).numpy()))
         index = paddle.to_tensor([2, 1, 0])
         self.assertTrue(
             np.array_equal(
@@ -663,24 +692,30 @@ def func_test_tensor_patch_method(self):
         y = paddle.to_tensor([[False, False], [False, True]])
         self.assertTrue(
             np.array_equal(
-                x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+                x.logical_and(y).numpy(),
+                paddle.logical_and(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.logical_not(y).numpy(), paddle.logical_not(x, y).numpy()))
+                x.logical_not(y).numpy(),
+                paddle.logical_not(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.logical_or(y).numpy(), paddle.logical_or(x, y).numpy()))
+                x.logical_or(y).numpy(),
+                paddle.logical_or(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.logical_xor(y).numpy(), paddle.logical_xor(x, y).numpy()))
+                x.logical_xor(y).numpy(),
+                paddle.logical_xor(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
-                x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+                x.logical_and(y).numpy(),
+                paddle.logical_and(x, y).numpy()))
         a = paddle.to_tensor([[1, 2], [3, 4]])
         b = paddle.to_tensor([[4, 3], [2, 1]])
         self.assertTrue(
             np.array_equal(
-                x.where(a, b).numpy(), paddle.where(x, a, b).numpy()))
+                x.where(a, b).numpy(),
+                paddle.where(x, a, b).numpy()))
 
         x_np = np.random.randn(3, 6, 9, 7)
         x = paddle.to_tensor(x_np)
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py
index aa67d92337017..4b4a4c7e15fd5 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
@@ -89,6 +89,7 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
 
 
 class Generator(object):
+
     def setUp(self):
         self.op_type = "matmul"
         X = np.random.random(self.shape_X).astype("float32")
@@ -108,26 +109,33 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
 
     def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=1e-3,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=1e-3,
+                        no_grad_set=set('Y'))
 
 
 class TestMatmulOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The inputs type of matmul_op must be Variable.
             input1 = 12
             self.assertRaises(TypeError, fluid.layers.matmul, input1, input1)
             # The inputs dtype of matmul_op must be float32, float64.
-            input2 = fluid.layers.data(
-                name='input2', shape=[10, 10], dtype="int32")
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[10, 10],
+                                       dtype="int32")
             self.assertRaises(TypeError, fluid.layers.matmul, input2, input2)
-            input3 = fluid.layers.data(
-                name='input3', shape=[2, 2], dtype="float16")
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[2, 2],
+                                       dtype="float16")
             fluid.layers.matmul(input3, input3)
 
 
@@ -163,8 +171,10 @@ def test_negative_dims_program(obj):
                         obj.assertEqual(Ref.shape[idx], output.shape[idx])
                 exe = fluid.Executor(fluid.CPUPlace())
                 res, = exe.run(fluid.default_main_program(),
-                               feed={'x': X,
-                                     'y': Y},
+                               feed={
+                                   'x': X,
+                                   'y': Y
+                               },
                                fetch_list=[output])
                 np.allclose(res, Ref, atol=1e-5)
 
@@ -175,13 +185,14 @@ def api_test(dim_x, dim_y, trans_x, trans_y):
         dim_x, dim_y, trans_x, trans_y))
     shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
                                                   trans_y)
-    globals()[test_name] = type(test_name, (unittest.TestCase, ), {
-        'shape_X': shape_x,
-        'shape_Y': shape_y,
-        'transpose_X': trans_x,
-        'transpose_Y': trans_y,
-        'test_propram': test_negative_dims_program,
-    })
+    globals()[test_name] = type(
+        test_name, (unittest.TestCase, ), {
+            'shape_X': shape_x,
+            'shape_Y': shape_y,
+            'transpose_X': trans_x,
+            'transpose_Y': trans_y,
+            'test_propram': test_negative_dims_program,
+        })
 
 
 # Generate operators cases for all possibilities
@@ -190,12 +201,13 @@ def inject_test(dim_x, dim_y, trans_x, trans_y):
         dim_x, dim_y, trans_x, trans_y))
     shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
                                                   trans_y)
-    globals()[test_name] = type(test_name, (Generator, OpTest), {
-        'shape_X': shape_x,
-        'shape_Y': shape_y,
-        'transpose_X': trans_x,
-        'transpose_Y': trans_y,
-    })
+    globals()[test_name] = type(
+        test_name, (Generator, OpTest), {
+            'shape_X': shape_x,
+            'shape_Y': shape_y,
+            'transpose_X': trans_x,
+            'transpose_Y': trans_y,
+        })
 
 
 for dim_X in (1, 2, 3):
@@ -270,17 +282,19 @@ def generate_compatible_shapes_ndim(dim, transpose_X, transpose_Y):
             test_name = (
                 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
                     dim, dim, transpose_X, transpose_Y))
-            shape_X, shape_Y = generate_compatible_shapes_ndim(dim, transpose_X,
-                                                               transpose_Y)
-            globals()[test_name] = type(test_name, (Generator, OpTest), {
-                'shape_X': shape_X,
-                'shape_Y': shape_Y,
-                'transpose_X': transpose_X,
-                'transpose_Y': transpose_Y,
-            })
+            shape_X, shape_Y = generate_compatible_shapes_ndim(
+                dim, transpose_X, transpose_Y)
+            globals()[test_name] = type(
+                test_name, (Generator, OpTest), {
+                    'shape_X': shape_X,
+                    'shape_Y': shape_Y,
+                    'transpose_X': transpose_X,
+                    'transpose_Y': transpose_Y,
+                })
 
 
 class API_TestMm(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program()):
             x = fluid.data(name="x", shape=[2], dtype="float64")
@@ -291,13 +305,11 @@ def test_out(self):
             data1 = np.random.rand(2)
             data2 = np.random.rand(2)
             np_res = exe.run(feed={'x': data1, 'y': data2}, fetch_list=[result])
-            expected_result = np.matmul(
-                data1.reshape(1, 2), data2.reshape(2, 1))
+            expected_result = np.matmul(data1.reshape(1, 2),
+                                        data2.reshape(2, 1))
 
         self.assertTrue(
-            np.allclose(
-                np_res, expected_result, atol=1e-5),
-            "two value is\
+            np.allclose(np_res, expected_result, atol=1e-5), "two value is\
             {}\n{}, check diff!".format(np_res, expected_result))
 
     def test_dygraph_without_out(self):
@@ -313,6 +325,7 @@ def test_dygraph_without_out(self):
 
 
 class Test_API_Matmul(unittest.TestCase):
+
     def test_dygraph_without_out(self):
         device = fluid.CPUPlace()
         with fluid.dygraph.guard(device):
@@ -326,7 +339,9 @@ def test_dygraph_without_out(self):
 
 
 class API_TestMmError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_error1():
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 data1 = fluid.data(name="data1", shape=[10, 2], dtype="float32")
@@ -337,20 +352,24 @@ def test_error1():
 
         def test_error2():
             with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data1 = fluid.data(
-                    name="data1", shape=[-1, 10, 2], dtype="float32")
-                data2 = fluid.data(
-                    name="data2", shape=[-1, 2, 10], dtype="float32")
+                data1 = fluid.data(name="data1",
+                                   shape=[-1, 10, 2],
+                                   dtype="float32")
+                data2 = fluid.data(name="data2",
+                                   shape=[-1, 2, 10],
+                                   dtype="float32")
                 paddle.mm(data1, data2)
 
         test_error2()
 
         def test_error3():
             with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data1 = fluid.data(
-                    name="data1", shape=[10, 10, 2], dtype="float32")
-                data2 = fluid.data(
-                    name="data2", shape=[3, 2, 10], dtype="float32")
+                data1 = fluid.data(name="data1",
+                                   shape=[10, 10, 2],
+                                   dtype="float32")
+                data2 = fluid.data(name="data2",
+                                   shape=[3, 2, 10],
+                                   dtype="float32")
                 paddle.mm(data1, data2)
 
         self.assertRaises(ValueError, test_error3)
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py b/python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py
index e180faf3806e7..58cfc004092a8 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py
@@ -114,6 +114,7 @@ def reference_matmul_mul_head(X,
 
 # Generator for multiple head
 class GeneratorMulHead(object):
+
     def setUp(self):
         self.op_type = "matmul"
         X = np.random.random(self.shape_X).astype("float32")
@@ -137,15 +138,16 @@ def inject_test_multiple_head(dim_x, dim_y, trans_x, trans_y, head_number):
     test_name = (
         'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_head_{}'.format(
             dim_x, dim_y, trans_x, trans_y, head_number))
-    shape_x, shape_y = generate_compatible_shapes_mul_head(dim_x, dim_y,
-                                                           trans_x, trans_y)
-    globals()[test_name] = type(test_name, (GeneratorMulHead, OpTest), {
-        'shape_X': shape_x,
-        'shape_Y': shape_y,
-        'transpose_X': trans_x,
-        'transpose_Y': trans_y,
-        'head_number': head_number
-    })
+    shape_x, shape_y = generate_compatible_shapes_mul_head(
+        dim_x, dim_y, trans_x, trans_y)
+    globals()[test_name] = type(
+        test_name, (GeneratorMulHead, OpTest), {
+            'shape_X': shape_x,
+            'shape_Y': shape_y,
+            'transpose_X': trans_x,
+            'transpose_Y': trans_y,
+            'head_number': head_number
+        })
 
 
 def matmul_head2(X, Y, head_number=1):
@@ -227,26 +229,29 @@ def generate_compatible_shapes_mul_head2(dim_X, dim_Y, transpose_X,
 
 # Generator for multiple head, case 2 when width of X is not same as height of Y
 class GeneratorMulHead2(object):
+
     def setUp(self):
         self.op_type = "matmul"
 
         X = np.zeros(self.shape_X)
         Y = np.zeros(self.shape_Y)
         if len(self.shape_X) == 2:
-            X = np.arange(
-                0, self.shape_X[-1] * self.shape_X[-2],
-                dtype=np.float32).reshape(self.shape_X)
-            Y = np.arange(
-                0, self.shape_Y[-1] * self.shape_Y[-2],
-                dtype=np.float32).reshape(self.shape_Y)
+            X = np.arange(0,
+                          self.shape_X[-1] * self.shape_X[-2],
+                          dtype=np.float32).reshape(self.shape_X)
+            Y = np.arange(0,
+                          self.shape_Y[-1] * self.shape_Y[-2],
+                          dtype=np.float32).reshape(self.shape_Y)
         else:
             for i in range(0, len(self.shape_X) - 1):
-                X[i, :, :] = np.arange(
-                    0, self.shape_X[-1] * self.shape_X[-2],
-                    dtype=np.float32).reshape(list(self.shape_X)[-2:])
-                Y[i, :, :] = np.arange(
-                    0, self.shape_Y[-1] * self.shape_Y[-2],
-                    dtype=np.float32).reshape(list(self.shape_Y)[-2:])
+                X[i, :, :] = np.arange(0,
+                                       self.shape_X[-1] * self.shape_X[-2],
+                                       dtype=np.float32).reshape(
+                                           list(self.shape_X)[-2:])
+                Y[i, :, :] = np.arange(0,
+                                       self.shape_Y[-1] * self.shape_Y[-2],
+                                       dtype=np.float32).reshape(
+                                           list(self.shape_Y)[-2:])
 
         Out = reference_matmul_mul_head2(X, Y, 4, self.transpose_X,
                                          self.transpose_Y)
@@ -267,15 +272,16 @@ def inject_test_multiple_head2(dim_x, dim_y, trans_x, trans_y, head_number):
     test_name = (
         'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_head2_{}'.format(
             dim_x, dim_y, trans_x, trans_y, head_number))
-    shape_x, shape_y = generate_compatible_shapes_mul_head2(dim_x, dim_y,
-                                                            trans_x, trans_y)
-    globals()[test_name] = type(test_name, (GeneratorMulHead2, OpTest), {
-        'shape_X': shape_x,
-        'shape_Y': shape_y,
-        'transpose_X': trans_x,
-        'transpose_Y': trans_y,
-        'head_number': head_number
-    })
+    shape_x, shape_y = generate_compatible_shapes_mul_head2(
+        dim_x, dim_y, trans_x, trans_y)
+    globals()[test_name] = type(
+        test_name, (GeneratorMulHead2, OpTest), {
+            'shape_X': shape_x,
+            'shape_Y': shape_y,
+            'transpose_X': trans_x,
+            'transpose_Y': trans_y,
+            'head_number': head_number
+        })
 
 
 #test case for multiple head
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
index f6f62045b19f9..e6481e12f1e1e 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -109,8 +109,10 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if core.is_compiled_with_rocm():
-            self.check_grad(
-                ['X', 'Y'], 'Out', max_relative_error=1e-2, check_eager=False)
+            self.check_grad(['X', 'Y'],
+                            'Out',
+                            max_relative_error=1e-2,
+                            check_eager=False)
         else:
             self.check_grad(['X', 'Y'], 'Out', check_eager=False)
 
@@ -335,9 +337,11 @@ def config(self):
 
 
 def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestMatMulOpFp16Case(parent):
+
         def init_kernel_type(self):
             self.dtype = np.float16
 
@@ -345,8 +349,9 @@ def test_check_output(self):
             if core.is_compiled_with_cuda():
                 place = core.CUDAPlace(0)
                 if core.is_float16_supported(place):
-                    self.check_output_with_place(
-                        place, atol=atol, check_eager=False)
+                    self.check_output_with_place(place,
+                                                 atol=atol,
+                                                 check_eager=False)
 
         def test_check_grad(self):
             place = core.CUDAPlace(0)
@@ -384,11 +389,13 @@ def test_check_grad(self):
 
 
 def create_test_bf16_class(parent, atol=0.01):
+
     @unittest.skipIf(
-        not core.is_compiled_with_cuda() or
-        not core.is_bfloat16_supported(core.CUDAPlace(0)),
+        not core.is_compiled_with_cuda()
+        or not core.is_bfloat16_supported(core.CUDAPlace(0)),
         "core is not compiled with CUDA and not support the bfloat16")
     class TestMatMulOpBf16Case(parent):
+
         def get_numeric_grad(self, place, check_name):
             scope = core.Scope()
             self._check_grad_helper()
@@ -407,20 +414,18 @@ def test_check_output(self):
         def test_check_grad_x(self):
             place = core.CUDAPlace(0)
             numeric_grads = self.get_numeric_grad(place, 'X')
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                no_grad_set=set(['Y']),
-                user_defined_grads=[numeric_grads])
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       no_grad_set=set(['Y']),
+                                       user_defined_grads=[numeric_grads])
 
         def test_check_grad_y(self):
             place = core.CUDAPlace(0)
             numeric_grads = self.get_numeric_grad(place, 'Y')
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                no_grad_set=set(['X']),
-                user_defined_grads=[numeric_grads])
+            self.check_grad_with_place(place, ['Y'],
+                                       'Out',
+                                       no_grad_set=set(['X']),
+                                       user_defined_grads=[numeric_grads])
 
         def test_check_grad(self):
             pass
@@ -450,6 +455,7 @@ def test_check_grad(self):
 
 
 class TestMatMulV2API(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -467,8 +473,10 @@ def check_static_result(self, place):
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"input_x": x_np,
-                                    "input_y": y_np},
+                              feed={
+                                  "input_x": x_np,
+                                  "input_y": y_np
+                              },
                               fetch_list=[result])
 
     def test_static(self):
@@ -500,9 +508,8 @@ def test_compute_type_fp32(self):
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place):
                 with fluid.dygraph.guard(place):
-                    paddle.set_flags({
-                        'FLAGS_gemm_use_half_precision_compute_type': False
-                    })
+                    paddle.set_flags(
+                        {'FLAGS_gemm_use_half_precision_compute_type': False})
                     input_x = np.random.random([2, 8, 16]).astype("float16")
                     input_y = np.random.random([2, 16, 8]).astype("float16")
                     for i in range(0, 16, 2):
@@ -517,18 +524,16 @@ def test_compute_type_fp32(self):
                     self.assertTrue(paddle.isfinite(result)[0, 0, 0])
                     self.assertTrue(np.isfinite(result_np)[0, 0, 0])
                     self.assertTrue(np.array_equal(result_np, result.numpy()))
-                    paddle.set_flags({
-                        'FLAGS_gemm_use_half_precision_compute_type': True
-                    })
+                    paddle.set_flags(
+                        {'FLAGS_gemm_use_half_precision_compute_type': True})
 
     def test_compute_type_fp16_nan(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
             if core.is_float16_supported(place):
                 with fluid.dygraph.guard(place):
-                    paddle.set_flags({
-                        'FLAGS_gemm_use_half_precision_compute_type': True
-                    })
+                    paddle.set_flags(
+                        {'FLAGS_gemm_use_half_precision_compute_type': True})
                     input_x = np.random.random([2, 8, 16]).astype("float16")
                     input_y = np.random.random([2, 16, 8]).astype("float16")
                     for i in range(0, 16, 2):
@@ -543,9 +548,8 @@ def test_compute_type_fp16_nan(self):
                     self.assertFalse(
                         paddle.isfinite(result)[0, 0, 0])  # contains nan/inf
                     self.assertTrue(np.isfinite(result_np)[0, 0, 0])
-                    paddle.set_flags({
-                        'FLAGS_gemm_use_half_precision_compute_type': False
-                    })
+                    paddle.set_flags(
+                        {'FLAGS_gemm_use_half_precision_compute_type': False})
 
     def test_api_eager_dygraph(self):
         with _test_eager_guard():
@@ -554,6 +558,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestComplexMatMulOp(OpTest):
+
     def setUp(self):
         self.op_type = "matmul_v2"
         self.init_base_dtype()
@@ -589,33 +594,31 @@ def test_check_output(self):
         self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
 
 class TestComplexMatMulOpBroadcast(OpTest):
+
     def setUp(self):
         self.op_type = "matmul_v2"
         self.init_base_dtype()
@@ -653,33 +656,31 @@ def test_check_output(self):
         self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[self.grad_x, self.grad_y],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[self.grad_x, self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            user_defined_grads=[self.grad_y],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['Y'],
+                        'Out',
+                        no_grad_set=set("X"),
+                        user_defined_grads=[self.grad_y],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=False)
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Y'),
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=False)
 
 
 class TestMatMulTypePromotion(TestComplexMatMulOp):
+
     def init_input_output(self):
         self.x = np.random.random((10, 10)).astype(self.dtype)
         self.y = np.random.random(
diff --git a/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
index 2bbacc316f6e6..2e73e4d782d0b 100644
--- a/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
@@ -141,9 +141,10 @@ def batched_multiclass_nms(boxes,
     index_outs = []
     lod = []
     for n in range(batch_size):
-        nmsed_outs, indices = multiclass_nms(
-            boxes[n], scores[n], background, score_threshold, post_threshold,
-            nms_top_k, keep_top_k, normalized, use_gaussian, gaussian_sigma)
+        nmsed_outs, indices = multiclass_nms(boxes[n], scores[n], background,
+                                             score_threshold, post_threshold,
+                                             nms_top_k, keep_top_k, normalized,
+                                             use_gaussian, gaussian_sigma)
         nmsed_num = len(nmsed_outs)
         lod.append(nmsed_num)
         if nmsed_num == 0:
@@ -158,6 +159,7 @@ def batched_multiclass_nms(boxes,
 
 
 class TestMatrixNMSOp(OpTest):
+
     def set_argument(self):
         self.post_threshold = 0.
         self.use_gaussian = False
@@ -220,17 +222,20 @@ def test_check_output(self):
 
 
 class TestMatrixNMSOpNoOutput(TestMatrixNMSOp):
+
     def set_argument(self):
         self.post_threshold = 2.0
 
 
 class TestMatrixNMSOpGaussian(TestMatrixNMSOp):
+
     def set_argument(self):
         self.post_threshold = 0.
         self.use_gaussian = True
 
 
 class TestMatrixNMSError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             M = 1200
@@ -248,54 +253,52 @@ def test_errors(self):
             scores = np.reshape(scores, (N, M, C))
             scores_np = np.transpose(scores, (0, 2, 1))
 
-            boxes_data = fluid.data(
-                name='bboxes', shape=[M, C, BOX_SIZE], dtype='float32')
-            scores_data = fluid.data(
-                name='scores', shape=[N, C, M], dtype='float32')
+            boxes_data = fluid.data(name='bboxes',
+                                    shape=[M, C, BOX_SIZE],
+                                    dtype='float32')
+            scores_data = fluid.data(name='scores',
+                                     shape=[N, C, M],
+                                     dtype='float32')
 
             def test_bboxes_Variable():
                 # the bboxes type must be Variable
-                fluid.layers.matrix_nms(
-                    bboxes=boxes_np,
-                    scores=scores_data,
-                    nms_top_k=nms_top_k,
-                    keep_top_k=keep_top_k,
-                    score_threshold=score_threshold,
-                    post_threshold=post_threshold)
+                fluid.layers.matrix_nms(bboxes=boxes_np,
+                                        scores=scores_data,
+                                        nms_top_k=nms_top_k,
+                                        keep_top_k=keep_top_k,
+                                        score_threshold=score_threshold,
+                                        post_threshold=post_threshold)
 
             def test_scores_Variable():
                 # the scores type must be Variable
-                fluid.layers.matrix_nms(
-                    bboxes=boxes_data,
-                    scores=scores_np,
-                    nms_top_k=nms_top_k,
-                    keep_top_k=keep_top_k,
-                    score_threshold=score_threshold,
-                    post_threshold=post_threshold)
+                fluid.layers.matrix_nms(bboxes=boxes_data,
+                                        scores=scores_np,
+                                        nms_top_k=nms_top_k,
+                                        keep_top_k=keep_top_k,
+                                        score_threshold=score_threshold,
+                                        post_threshold=post_threshold)
 
             def test_empty():
                 # when all score are lower than threshold
                 try:
-                    fluid.layers.matrix_nms(
-                        bboxes=boxes_data,
-                        scores=scores_data,
-                        nms_top_k=nms_top_k,
-                        keep_top_k=keep_top_k,
-                        score_threshold=10.,
-                        post_threshold=post_threshold)
+                    fluid.layers.matrix_nms(bboxes=boxes_data,
+                                            scores=scores_data,
+                                            nms_top_k=nms_top_k,
+                                            keep_top_k=keep_top_k,
+                                            score_threshold=10.,
+                                            post_threshold=post_threshold)
                 except Exception as e:
                     self.fail(e)
 
             def test_coverage():
                 # cover correct workflow
                 try:
-                    fluid.layers.matrix_nms(
-                        bboxes=boxes_data,
-                        scores=scores_data,
-                        nms_top_k=nms_top_k,
-                        keep_top_k=keep_top_k,
-                        score_threshold=score_threshold,
-                        post_threshold=post_threshold)
+                    fluid.layers.matrix_nms(bboxes=boxes_data,
+                                            scores=scores_data,
+                                            nms_top_k=nms_top_k,
+                                            keep_top_k=keep_top_k,
+                                            score_threshold=score_threshold,
+                                            post_threshold=post_threshold)
                 except Exception as e:
                     self.fail(e)
 
diff --git a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py
index 96823f49d2f08..1eb1f42671b41 100644
--- a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py
@@ -23,6 +23,7 @@
 
 
 class TestMatrixPowerOp(OpTest):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -44,11 +45,14 @@ def test_check_output(self):
         self.check_output()
 
     def test_grad(self):
-        self.check_grad(
-            ["X"], "Out", numeric_grad_delta=1e-5, max_relative_error=1e-7)
+        self.check_grad(["X"],
+                        "Out",
+                        numeric_grad_delta=1e-5,
+                        max_relative_error=1e-7)
 
 
 class TestMatrixPowerOpN1(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -56,6 +60,7 @@ def config(self):
 
 
 class TestMatrixPowerOpN2(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -63,6 +68,7 @@ def config(self):
 
 
 class TestMatrixPowerOpN3(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -70,6 +76,7 @@ def config(self):
 
 
 class TestMatrixPowerOpN4(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -77,6 +84,7 @@ def config(self):
 
 
 class TestMatrixPowerOpN5(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -84,6 +92,7 @@ def config(self):
 
 
 class TestMatrixPowerOpN6(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -91,6 +100,7 @@ def config(self):
 
 
 class TestMatrixPowerOpN10(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -98,17 +108,21 @@ def config(self):
 
 
 class TestMatrixPowerOpNMinus(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
         self.n = -1
 
     def test_grad(self):
-        self.check_grad(
-            ["X"], "Out", numeric_grad_delta=1e-5, max_relative_error=1e-6)
+        self.check_grad(["X"],
+                        "Out",
+                        numeric_grad_delta=1e-5,
+                        max_relative_error=1e-6)
 
 
 class TestMatrixPowerOpNMinus2(TestMatrixPowerOpNMinus):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -116,6 +130,7 @@ def config(self):
 
 
 class TestMatrixPowerOpNMinus3(TestMatrixPowerOpNMinus):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -123,6 +138,7 @@ def config(self):
 
 
 class TestMatrixPowerOpNMinus4(TestMatrixPowerOpNMinus):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -130,6 +146,7 @@ def config(self):
 
 
 class TestMatrixPowerOpNMinus5(TestMatrixPowerOpNMinus):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -137,6 +154,7 @@ def config(self):
 
 
 class TestMatrixPowerOpNMinus6(TestMatrixPowerOpNMinus):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -144,17 +162,21 @@ def config(self):
 
 
 class TestMatrixPowerOpNMinus10(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
         self.n = -10
 
     def test_grad(self):
-        self.check_grad(
-            ["X"], "Out", numeric_grad_delta=1e-5, max_relative_error=1e-6)
+        self.check_grad(["X"],
+                        "Out",
+                        numeric_grad_delta=1e-5,
+                        max_relative_error=1e-6)
 
 
 class TestMatrixPowerOpBatched1(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [8, 4, 4]
         self.dtype = "float64"
@@ -162,6 +184,7 @@ def config(self):
 
 
 class TestMatrixPowerOpBatched2(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [2, 6, 4, 4]
         self.dtype = "float64"
@@ -169,6 +192,7 @@ def config(self):
 
 
 class TestMatrixPowerOpBatched3(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [2, 6, 4, 4]
         self.dtype = "float64"
@@ -176,6 +200,7 @@ def config(self):
 
 
 class TestMatrixPowerOpBatchedLong(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [1, 2, 3, 4, 4, 3, 3]
         self.dtype = "float64"
@@ -183,6 +208,7 @@ def config(self):
 
 
 class TestMatrixPowerOpLarge1(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [32, 32]
         self.dtype = "float64"
@@ -190,6 +216,7 @@ def config(self):
 
 
 class TestMatrixPowerOpLarge2(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float64"
@@ -197,6 +224,7 @@ def config(self):
 
 
 class TestMatrixPowerOpFP32(TestMatrixPowerOp):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float32"
@@ -207,6 +235,7 @@ def test_grad(self):
 
 
 class TestMatrixPowerOpBatchedFP32(TestMatrixPowerOpFP32):
+
     def config(self):
         self.matrix_shape = [2, 8, 4, 4]
         self.dtype = "float32"
@@ -214,6 +243,7 @@ def config(self):
 
 
 class TestMatrixPowerOpLarge1FP32(TestMatrixPowerOpFP32):
+
     def config(self):
         self.matrix_shape = [32, 32]
         self.dtype = "float32"
@@ -221,6 +251,7 @@ def config(self):
 
 
 class TestMatrixPowerOpLarge2FP32(TestMatrixPowerOpFP32):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float32"
@@ -228,6 +259,7 @@ def config(self):
 
 
 class TestMatrixPowerOpFP32Minus(TestMatrixPowerOpFP32):
+
     def config(self):
         self.matrix_shape = [10, 10]
         self.dtype = "float32"
@@ -235,6 +267,7 @@ def config(self):
 
 
 class TestMatrixPowerAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -271,6 +304,7 @@ def test_dygraph(self):
 
 
 class TestMatrixPowerAPIError(unittest.TestCase):
+
     def test_errors(self):
         input_np = np.random.random([4, 4]).astype("float64")
 
@@ -279,11 +313,12 @@ def test_errors(self):
 
         # n must be int
         for n in [2.0, '2', -2.0]:
-            input = fluid.data(
-                name="input_float32", shape=[4, 4], dtype='float32')
+            input = fluid.data(name="input_float32",
+                               shape=[4, 4],
+                               dtype='float32')
             self.assertRaises(TypeError, paddle.linalg.matrix_power, input, n)
 
-        # The data type of input must be float32 or float64.        
+        # The data type of input must be float32 or float64.
         for dtype in ["bool", "int32", "int64", "float16"]:
             input = fluid.data(name="input_" + dtype, shape=[4, 4], dtype=dtype)
             self.assertRaises(TypeError, paddle.linalg.matrix_power, input, 2)
@@ -303,6 +338,7 @@ def test_errors(self):
 
 
 class TestMatrixPowerSingularAPI(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py
index b13b346261762..b0b04a3cc1013 100644
--- a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +35,7 @@ def matrix_rank_wraper(x, tol=None, use_default_tol=True, hermitian=False):
 
 
 class TestMatrixRankOP(OpTest):
+
     def setUp(self):
         self.python_api = matrix_rank_wraper
         self.op_type = "matrix_rank"
@@ -61,6 +62,7 @@ def init_data(self):
 
 
 class TestMatrixRankOP1(TestMatrixRankOP):
+
     def init_data(self):
         self.x = np.eye(3, k=1, dtype=np.float64)
         self.tol_tensor = None
@@ -72,6 +74,7 @@ def init_data(self):
 
 
 class TestMatrixRankOP2(TestMatrixRankOP):
+
     def init_data(self):
         self.x = np.random.rand(3, 4, 5, 6).astype(np.float32)
         self.tol_tensor = np.random.random([3, 4]).astype(self.x.dtype)
@@ -83,6 +86,7 @@ def init_data(self):
 
 
 class TestMatrixRankOP3(TestMatrixRankOP):
+
     def init_data(self):
         self.x = np.eye(200, dtype=np.float64)
         self.tol_tensor = None
@@ -94,6 +98,7 @@ def init_data(self):
 
 
 class TestMatrixRankOP4(TestMatrixRankOP):
+
     def init_data(self):
         self.x = np.random.rand(1, 10).astype(np.float32)
         self.tol_tensor = None
@@ -105,6 +110,7 @@ def init_data(self):
 
 
 class TestMatrixRankOP5(TestMatrixRankOP):
+
     def init_data(self):
         self.x = np.random.rand(5, 1).astype(np.float64)
         self.tol_tensor = np.random.random([1, 4]).astype(self.x.dtype)
@@ -116,6 +122,7 @@ def init_data(self):
 
 
 class TestMatrixRankOP6(TestMatrixRankOP):
+
     def init_data(self):
         self.x = np.random.rand(3, 4, 5, 6).astype(np.float32)
         self.tol_tensor = None
@@ -127,6 +134,7 @@ def init_data(self):
 
 
 class TestMatrixRankOP7(TestMatrixRankOP):
+
     def init_data(self):
         self.x = np.eye(200, dtype=np.float64)
         self.tol_tensor = np.random.random([200, 200]).astype(self.x.dtype)
@@ -138,6 +146,7 @@ def init_data(self):
 
 
 class TestMatrixRankAPI(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
 
@@ -172,25 +181,31 @@ def test_static(self):
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 x_np = np.random.rand(3, 4, 7, 7).astype(np.float64)
                 tol_np = np.random.random([3, 4]).astype(np.float32)
-                x_pd = paddle.fluid.data(
-                    name="X", shape=[3, 4, 7, 7], dtype='float64')
-                tol_pd = paddle.fluid.data(
-                    name="TolTensor", shape=[3, 4], dtype='float32')
+                x_pd = paddle.fluid.data(name="X",
+                                         shape=[3, 4, 7, 7],
+                                         dtype='float64')
+                tol_pd = paddle.fluid.data(name="TolTensor",
+                                           shape=[3, 4],
+                                           dtype='float32')
                 rank_np = np.linalg.matrix_rank(x_np, tol_np, hermitian=False)
-                rank_pd = paddle.linalg.matrix_rank(
-                    x_pd, tol_pd, hermitian=False)
+                rank_pd = paddle.linalg.matrix_rank(x_pd,
+                                                    tol_pd,
+                                                    hermitian=False)
                 exe = fluid.Executor(place)
                 fetches = exe.run(fluid.default_main_program(),
-                                  feed={"X": x_np,
-                                        "TolTensor": tol_np},
+                                  feed={
+                                      "X": x_np,
+                                      "TolTensor": tol_np
+                                  },
                                   fetch_list=[rank_pd])
                 self.assertTrue(np.allclose(fetches[0], rank_np))
 
         for place in places:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 x_np = np.random.rand(3, 4, 7, 7).astype(np.float64)
-                x_pd = paddle.fluid.data(
-                    name="X", shape=[3, 4, 7, 7], dtype='float64')
+                x_pd = paddle.fluid.data(name="X",
+                                         shape=[3, 4, 7, 7],
+                                         dtype='float64')
                 rank_np = np.linalg.matrix_rank(x_np, hermitian=True)
                 rank_pd = paddle.linalg.matrix_rank(x_pd, hermitian=True)
                 exe = fluid.Executor(place)
@@ -202,8 +217,9 @@ def test_static(self):
         for place in places:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 x_np = np.random.rand(3, 4, 7, 7).astype(np.float64)
-                x_pd = paddle.fluid.data(
-                    name="X", shape=[3, 4, 7, 7], dtype='float64')
+                x_pd = paddle.fluid.data(name="X",
+                                         shape=[3, 4, 7, 7],
+                                         dtype='float64')
                 rank_np = np.linalg.matrix_rank(x_np, 0.1, hermitian=False)
                 rank_pd = paddle.linalg.matrix_rank(x_pd, 0.1, hermitian=False)
                 exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py b/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py
index fe00a825ba1cd..cadbca93ad3f2 100644
--- a/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py
+++ b/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py
@@ -26,11 +26,12 @@
 
 
 class TestMaxMinAmaxAminAPI(unittest.TestCase):
+
     def setUp(self):
         self.init_case()
         self.cal_np_out_and_gradient()
-        self.place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
 
     def init_case(self):
         self.x_np = np.array([[0.2, 0.3, 0.5, 0.9], [0.1, 0.2, 0.6, 0.7]])
@@ -40,9 +41,10 @@ def init_case(self):
         self.keepdim = False
 
     # If there are multiple minimum or maximum elements, max/min/amax/amin is non-derivable,
-    # its gradient check is not supported by unittest framework, 
+    # its gradient check is not supported by unittest framework,
     # thus we calculate the gradient by numpy function.
     def cal_np_out_and_gradient(self):
+
         def _cal_np_out_and_gradient(func):
             if func is 'amax':
                 out = np.amax(self.x_np, axis=self.axis, keepdims=self.keepdim)
@@ -88,6 +90,7 @@ def _choose_paddle_func(self, func, x):
 
     # We check the output between paddle API and numpy in static graph.
     def test_static_graph(self):
+
         def _test_static_graph(func):
             startup_program = fluid.Program()
             train_program = fluid.Program()
@@ -107,13 +110,15 @@ def _test_static_graph(func):
         _test_static_graph('max')
         _test_static_graph('min')
 
-    # As dygraph is easy to compute gradient, we check the gradient between 
+    # As dygraph is easy to compute gradient, we check the gradient between
     # paddle API and numpy in dygraph.
     def test_dygraph(self):
+
         def _test_dygraph(func):
             paddle.disable_static()
-            x = paddle.to_tensor(
-                self.x_np, dtype=self.dtype, stop_gradient=False)
+            x = paddle.to_tensor(self.x_np,
+                                 dtype=self.dtype,
+                                 stop_gradient=False)
             out = self._choose_paddle_func(func, x)
             grad_tensor = paddle.ones_like(x)
             paddle.autograd.backward([out], [grad_tensor], True)
@@ -130,6 +135,7 @@ def _test_dygraph(func):
 
     # test two minimum or maximum elements
 class TestMaxMinAmaxAminAPI2(TestMaxMinAmaxAminAPI):
+
     def init_case(self):
         self.x_np = np.array([[0.2, 0.3, 0.9, 0.9], [0.1, 0.1, 0.6, 0.7]])
         self.shape = [2, 4]
@@ -140,6 +146,7 @@ def init_case(self):
 
 # test different axis
 class TestMaxMinAmaxAminAPI3(TestMaxMinAmaxAminAPI):
+
     def init_case(self):
         self.x_np = np.array([[0.2, 0.3, 0.9, 0.9], [0.1, 0.1, 0.6, 0.7]])
         self.shape = [2, 4]
@@ -150,6 +157,7 @@ def init_case(self):
 
 # test keepdim = True
 class TestMaxMinAmaxAminAPI4(TestMaxMinAmaxAminAPI):
+
     def init_case(self):
         self.x_np = np.array([[0.2, 0.3, 0.9, 0.9], [0.1, 0.1, 0.6, 0.7]])
         self.shape = [2, 4]
@@ -160,9 +168,10 @@ def init_case(self):
 
 # test axis is tuple
 class TestMaxMinAmaxAminAPI5(TestMaxMinAmaxAminAPI):
+
     def init_case(self):
-        self.x_np = np.array(
-            [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype(np.int32)
+        self.x_np = np.array([[[1, 2], [3, 4]], [[5, 6], [7,
+                                                          8]]]).astype(np.int32)
         self.shape = [2, 2, 2]
         self.dtype = 'int32'
         self.axis = (0, 1)
@@ -171,6 +180,7 @@ def init_case(self):
 
 # test multiple minimum or maximum elements
 class TestMaxMinAmaxAminAPI6(TestMaxMinAmaxAminAPI):
+
     def init_case(self):
         self.x_np = np.array([[0.2, 0.9, 0.9, 0.9], [0.9, 0.9, 0.2, 0.2]])
         self.shape = [2, 4]
diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py
index d5b884dfcc93b..dc11d78699e73 100644
--- a/python/paddle/fluid/tests/unittests/test_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_max_op.py
@@ -23,6 +23,7 @@
 
 
 class ApiMaxTest(unittest.TestCase):
+
     def setUp(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -114,13 +115,13 @@ def test_all_negative_axis(self):
 
 
 class TestOutDtype(unittest.TestCase):
+
     def test_max(self):
         api_fn = paddle.max
         shape = [10, 16]
-        check_out_dtype(
-            api_fn,
-            in_specs=[(shape, )],
-            expect_dtypes=['float32', 'float64', 'int32', 'int64'])
+        check_out_dtype(api_fn,
+                        in_specs=[(shape, )],
+                        expect_dtypes=['float32', 'float64', 'int32', 'int64'])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_maximum_op.py b/python/paddle/fluid/tests/unittests/test_maximum_op.py
index 72db3df044e63..9568a145ed4a0 100644
--- a/python/paddle/fluid/tests/unittests/test_maximum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_maximum_op.py
@@ -21,6 +21,7 @@
 
 
 class ApiMaximumTest(unittest.TestCase):
+
     def setUp(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -47,8 +48,10 @@ def test_static_api(self):
             data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
             result_max = paddle.maximum(data_x, data_y)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "y": self.input_y},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "y": self.input_y
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected1))
 
@@ -58,8 +61,10 @@ def test_static_api(self):
             data_z = paddle.static.data("z", shape=[15], dtype="float32")
             result_max = paddle.maximum(data_x, data_z)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "z": self.input_z},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "z": self.input_z
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected2))
 
@@ -69,8 +74,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_max = paddle.maximum(data_a, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"a": self.input_a,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "a": self.input_a,
+                "c": self.input_c
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected3))
 
@@ -80,8 +87,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_max = paddle.maximum(data_b, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"b": self.input_b,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "b": self.input_b,
+                "c": self.input_c
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected4))
 
diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py
index 4bc7b09c71e6e..64803bf39fef7 100644
--- a/python/paddle/fluid/tests/unittests/test_maxout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py
@@ -37,6 +37,7 @@ def maxout_forward_naive(x, groups, channel_axis):
 
 
 class TestMaxOutOp(OpTest):
+
     def setUp(self):
         self.op_type = "maxout"
         self.python_api = paddle.nn.functional.maxout
@@ -64,21 +65,25 @@ def test_check_grad(self):
 
 
 class TestMaxOutOpAxis0(TestMaxOutOp):
+
     def set_attrs(self):
         self.axis = -1
 
 
 class TestMaxOutOpAxis1(TestMaxOutOp):
+
     def set_attrs(self):
         self.axis = 3
 
 
 class TestMaxOutOpFP32(TestMaxOutOp):
+
     def set_attrs(self):
         self.dtype = 'float32'
 
 
 class TestMaxOutOpGroups(TestMaxOutOp):
+
     def set_attrs(self):
         self.groups = 3
 
@@ -139,8 +144,9 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.maxout, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[2, 4, 6, 8], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[2, 4, 6, 8],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.maxout, x_int32)
 
             x_float32 = paddle.fluid.data(name='x_float32', shape=[2, 4, 6, 8])
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
index b392a328494b3..a6b1ca4dab476 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
@@ -55,6 +55,7 @@ def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
 
 
 class TestMeanIOUOp(OpTest):
+
     def setUp(self):
         self.config()
         self.op_type = "mean_iou"
@@ -65,18 +66,21 @@ def setUp(self):
 
         in_wrongs = []
         for i in range(self.in_wrong_num):
-            in_wrongs.append(("in_wrong_%d" % i, np.random.randint(
-                0, 10, [self.num_classes]).astype("int32")))
+            in_wrongs.append(
+                ("in_wrong_%d" % i,
+                 np.random.randint(0, 10, [self.num_classes]).astype("int32")))
 
         in_corrects = []
         for i in range(self.in_correct_num):
-            in_corrects.append(("in_correct_%d" % i, np.random.randint(
-                0, 10, [self.num_classes]).astype("int32")))
+            in_corrects.append(
+                ("in_correct_%d" % i,
+                 np.random.randint(0, 10, [self.num_classes]).astype("int32")))
 
         in_mean_ious = []
         for i in range(self.in_mean_iou_num):
-            in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform(
-                0, 1, [1]).astype("float32")))
+            in_mean_ious.append(("in_mean_iou_%d" % i,
+                                 np.random.uniform(0, 1,
+                                                   [1]).astype("float32")))
 
         self.inputs = {
             'Predictions': predictions,
@@ -107,6 +111,7 @@ def test_check_output(self):
 
 
 class TestCase1(TestMeanIOUOp):
+
     def config(self):
         self.num_classes = 5
         self.image_size = [100, 128]
@@ -121,13 +126,14 @@ def test_check_output(self):
 
 
 class TestMeanIOUOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             # The input type of accuracy_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.mean_iou, x1, y1)
             # The input dtype of accuracy_op must be float32 or float64.
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index c5ee5c91e1c75..af15f271b4a70 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -22,6 +22,7 @@
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 from paddle.fluid.framework import _test_eager_guard
+
 np.random.seed(10)
 
 
@@ -38,6 +39,7 @@ def reduce_mean_wrapper(x, axis=0, keepdim=False, reduce_all=False):
 
 
 class TestMeanOp(OpTest):
+
     def setUp(self):
         self.op_type = "mean"
         self.python_api = fluid.layers.mean
@@ -57,23 +59,27 @@ def test_checkout_grad(self):
 
 
 class TestMeanOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of mean_op must be Variable.
             input1 = 12
             self.assertRaises(TypeError, fluid.layers.mean, input1)
             # The input dtype of mean_op must be float16, float32, float64.
-            input2 = fluid.layers.data(
-                name='input2', shape=[12, 10], dtype="int32")
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[12, 10],
+                                       dtype="int32")
             self.assertRaises(TypeError, fluid.layers.mean, input2)
-            input3 = fluid.layers.data(
-                name='input3', shape=[4], dtype="float16")
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[4],
+                                       dtype="float16")
             fluid.layers.softmax(input3)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16MeanOp(TestMeanOp):
+
     def init_dtype_type(self):
         self.dtype = np.float16
         self.__class__.no_need_check_grad = True
@@ -99,6 +105,7 @@ def test_checkout_grad(self):
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestBF16MeanOp(TestMeanOp):
+
     def init_dtype_type(self):
         self.dtype = np.uint16
 
@@ -128,6 +135,7 @@ def ref_reduce_mean_grad(x, axis, dtype):
 
 
 class TestReduceMeanOp(OpTest):
+
     def setUp(self):
         self.op_type = 'reduce_mean'
         self.python_api = reduce_mean_wrapper
@@ -178,15 +186,18 @@ def test_check_grad(self):
                 return
             with fluid.dygraph.guard(place=place):
                 x = paddle.tensor(self.inputs['X'])
-                y = paddle.mean(
-                    x, axis=self.attrs['dim'], keepdim=self.attrs['keep_dim'])
+                y = paddle.mean(x,
+                                axis=self.attrs['dim'],
+                                keepdim=self.attrs['keep_dim'])
                 dx = paddle.grad(y, x)[0].numpy()
-                dx_expected = ref_reduce_mean_grad(
-                    self.inputs['X'], self.attrs['dim'], self.dtype)
+                dx_expected = ref_reduce_mean_grad(self.inputs['X'],
+                                                   self.attrs['dim'],
+                                                   self.dtype)
                 self.assertTrue(np.array_equal(dx, dx_expected))
 
 
 class TestReduceMeanOpDefaultAttrs(TestReduceMeanOp):
+
     def setUp(self):
         self.op_type = 'reduce_mean'
         self.python_api = reduce_mean_wrapper
@@ -200,88 +211,104 @@ def setUp(self):
 
 
 class TestReduceMeanOpFloat32(TestReduceMeanOp):
+
     def set_attrs(self):
         self.dtype = 'float32'
 
 
 class TestReduceMeanOpFloat16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.dtype = 'float16'
 
 
 class TestReduceMeanOpShape1D(TestReduceMeanOp):
+
     def set_attrs(self):
         self.shape = [100]
 
 
 class TestReduceMeanOpShape1DFP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.shape = [100]
         self.dtype = 'float16'
 
 
 class TestReduceMeanOpShape6D(TestReduceMeanOp):
+
     def set_attrs(self):
         self.shape = [2, 3, 4, 5, 6, 7]
 
 
 class TestReduceMeanOpShape6DFP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.shape = [2, 3, 4, 5, 6, 7]
         self.dtype = 'float16'
 
 
 class TestReduceMeanOpAxisAll(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = [0, 1, 2, 3]
 
 
 class TestReduceMeanOpAxisAllFP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = [0, 1, 2, 3]
         self.dtype = 'float16'
 
 
 class TestReduceMeanOpAxisTuple(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = (0, 1, 2)
 
 
 class TestReduceMeanOpAxisTupleFP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = (0, 1, 2)
         self.dtype = 'float16'
 
 
 class TestReduceMeanOpAxisNegative(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = [-2, -1]
 
 
 class TestReduceMeanOpAxisNegativeFP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = [-2, -1]
         self.dtype = 'float16'
 
 
 class TestReduceMeanOpKeepdimTrue1(TestReduceMeanOp):
+
     def set_attrs(self):
         self.keepdim = True
 
 
 class TestReduceMeanOpKeepdimTrue1FP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.keepdim = True
         self.dtype = 'float16'
 
 
 class TestReduceMeanOpKeepdimTrue2(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = [0, 1, 2, 3]
         self.keepdim = True
 
 
 class TestReduceMeanOpKeepdimTrue2FP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.axis = [0, 1, 2, 3]
         self.keepdim = True
@@ -289,11 +316,13 @@ def set_attrs(self):
 
 
 class TestReduceMeanOpReduceAllTrue(TestReduceMeanOp):
+
     def set_attrs(self):
         self.reduce_all = True
 
 
 class TestReduceMeanOpReduceAllTrueFP16(TestReduceMeanOp):
+
     def set_attrs(self):
         self.reduce_all = True
         self.dtype = 'float16'
@@ -337,9 +366,8 @@ def test_case(x, axis=None, keepdim=False):
                 if len(axis) == 0:
                     axis = None
             out_ref = np.mean(x, axis, keepdims=keepdim)
-            self.assertEqual(
-                np.allclose(
-                    out.numpy(), out_ref, rtol=1e-04), True)
+            self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-04),
+                             True)
 
         test_case(self.x)
         test_case(self.x, [])
diff --git a/python/paddle/fluid/tests/unittests/test_median.py b/python/paddle/fluid/tests/unittests/test_median.py
index be2206d0267ef..2f5680224491f 100644
--- a/python/paddle/fluid/tests/unittests/test_median.py
+++ b/python/paddle/fluid/tests/unittests/test_median.py
@@ -23,6 +23,7 @@
 
 
 class TestMedian(unittest.TestCase):
+
     def check_numpy_res(self, np1, np2):
         self.assertEqual(np1.shape, np2.shape)
         mismatch = np.sum((np1 - np2) * (np1 - np2))
@@ -57,8 +58,7 @@ def test_median_static(self):
         w = 4
         l = 2
         x = np.arange(h * w * l).reshape([h, w, l])
-        lis_tests = [[x, axis, keepdims]
-                     for axis in [-1, 0, 1, 2, None]
+        lis_tests = [[x, axis, keepdims] for axis in [-1, 0, 1, 2, None]
                      for keepdims in [False, True]]
         for lis_test in lis_tests:
             self.static_single_test_median(lis_test)
@@ -69,8 +69,7 @@ def test_median_dygraph(self):
         w = 4
         l = 2
         x = np.arange(h * w * l).reshape([h, w, l])
-        lis_tests = [[x, axis, keepdims]
-                     for axis in [-1, 0, 1, 2, None]
+        lis_tests = [[x, axis, keepdims] for axis in [-1, 0, 1, 2, None]
                      for keepdims in [False, True]]
         for lis_test in lis_tests:
             self.dygraph_single_test_median(lis_test)
diff --git a/python/paddle/fluid/tests/unittests/test_memcpy_op.py b/python/paddle/fluid/tests/unittests/test_memcpy_op.py
index 623c43f5b75f3..a1469ca558be0 100755
--- a/python/paddle/fluid/tests/unittests/test_memcpy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_memcpy_op.py
@@ -26,6 +26,7 @@
 
 
 class TestMemcpy_FillConstant(unittest.TestCase):
+
     def get_prog(self):
         paddle.enable_static()
         main_program = Program()
@@ -38,21 +39,19 @@ def get_prog(self):
                 dtype='float32',
                 persistable=False,
                 stop_gradient=True)
-            gpu_var = main_program.global_block().create_var(
-                name=gpu_var_name,
-                shape=[10, 10],
-                dtype='float32',
-                persistable=False,
-                stop_gradient=True)
-            main_program.global_block().append_op(
-                type="fill_constant",
-                outputs={"Out": gpu_var_name},
-                attrs={
-                    "shape": [10, 10],
-                    "dtype": gpu_var.dtype,
-                    "value": 1.0,
-                    "place_type": 1
-                })
+            gpu_var = main_program.global_block().create_var(name=gpu_var_name,
+                                                             shape=[10, 10],
+                                                             dtype='float32',
+                                                             persistable=False,
+                                                             stop_gradient=True)
+            main_program.global_block().append_op(type="fill_constant",
+                                                  outputs={"Out": gpu_var_name},
+                                                  attrs={
+                                                      "shape": [10, 10],
+                                                      "dtype": gpu_var.dtype,
+                                                      "value": 1.0,
+                                                      "place_type": 1
+                                                  })
             main_program.global_block().append_op(
                 type="fill_constant",
                 outputs={"Out": pinned_var_name},
@@ -66,11 +65,10 @@ def get_prog(self):
 
     def test_gpu_copy_to_pinned(self):
         main_program, gpu_var, pinned_var = self.get_prog()
-        main_program.global_block().append_op(
-            type='memcpy',
-            inputs={'X': gpu_var},
-            outputs={'Out': pinned_var},
-            attrs={'dst_place_type': 2})
+        main_program.global_block().append_op(type='memcpy',
+                                              inputs={'X': gpu_var},
+                                              outputs={'Out': pinned_var},
+                                              attrs={'dst_place_type': 2})
         place = fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
         gpu_, pinned_ = exe.run(main_program,
@@ -81,11 +79,10 @@ def test_gpu_copy_to_pinned(self):
 
     def test_pinned_copy_gpu(self):
         main_program, gpu_var, pinned_var = self.get_prog()
-        main_program.global_block().append_op(
-            type='memcpy',
-            inputs={'X': pinned_var},
-            outputs={'Out': gpu_var},
-            attrs={'dst_place_type': 1})
+        main_program.global_block().append_op(type='memcpy',
+                                              inputs={'X': pinned_var},
+                                              outputs={'Out': gpu_var},
+                                              attrs={'dst_place_type': 1})
         place = fluid.CUDAPlace(0)
         exe = fluid.Executor(place)
         gpu_, pinned_ = exe.run(main_program,
@@ -132,11 +129,10 @@ def test_hip_copy_bool_value(self):
                         "place_type": 2
                     })
 
-            main_program.global_block().append_op(
-                type='memcpy',
-                inputs={'X': pinned_var},
-                outputs={'Out': gpu_var},
-                attrs={'dst_place_type': 1})
+            main_program.global_block().append_op(type='memcpy',
+                                                  inputs={'X': pinned_var},
+                                                  outputs={'Out': gpu_var},
+                                                  attrs={'dst_place_type': 1})
             place = fluid.CUDAPlace(0)
             exe = fluid.Executor(place)
             gpu_, pinned_ = exe.run(main_program,
@@ -149,6 +145,7 @@ def test_hip_copy_bool_value(self):
 
 
 class TestMemcpyOPError(unittest.TestCase):
+
     def get_prog(self):
         paddle.enable_static()
         main_program = Program()
@@ -175,20 +172,20 @@ def test_SELECTED_ROWS(self):
         selected_row_var = main_program.global_block().create_var( \
             name="selected_row_0", dtype="float32", persistable=False, \
             type=fluid.core.VarDesc.VarType.SELECTED_ROWS, stop_gradient=True)
-        main_program.global_block().append_op(
-            type="fill_constant",
-            outputs={"Out": selected_row_var},
-            attrs={
-                "shape": selected_row_var.shape,
-                "dtype": selected_row_var.dtype,
-                "value": 1.0,
-                "place_type": 1
-            })
-        main_program.global_block().append_op(
-            type='memcpy',
-            inputs={'X': selected_row_var},
-            outputs={'Out': pinned_var},
-            attrs={'dst_place_type': 2})
+        main_program.global_block().append_op(type="fill_constant",
+                                              outputs={"Out": selected_row_var},
+                                              attrs={
+                                                  "shape":
+                                                  selected_row_var.shape,
+                                                  "dtype":
+                                                  selected_row_var.dtype,
+                                                  "value": 1.0,
+                                                  "place_type": 1
+                                              })
+        main_program.global_block().append_op(type='memcpy',
+                                              inputs={'X': selected_row_var},
+                                              outputs={'Out': pinned_var},
+                                              attrs={'dst_place_type': 2})
         with self.assertRaises(NotImplementedError):
             place = fluid.CUDAPlace(0)
             exe = fluid.Executor(place)
@@ -199,6 +196,7 @@ def test_SELECTED_ROWS(self):
 
 
 class TestMemcpyApi(unittest.TestCase):
+
     def test_api(self):
         a = paddle.ones([1024, 1024])
         b = paddle.tensor.creation._memcpy(a, paddle.CUDAPinnedPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_memory_analysis.py b/python/paddle/fluid/tests/unittests/test_memory_analysis.py
index 9388e07dbf891..1672e7371cbc6 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_analysis.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_analysis.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class TestMemoryAnalysis(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
@@ -27,17 +28,18 @@ def test_get_memory_info(self):
         optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
         optimizer.minimize(loss)
         main_prog = paddle.static.default_main_program()
-        max_tmp_mem_1, max_persitable_mem_1 = get_max_memory_info(
-            main_prog, batch_size=32)
+        max_tmp_mem_1, max_persitable_mem_1 = get_max_memory_info(main_prog,
+                                                                  batch_size=32)
         self.assertGreater(max_tmp_mem_1, 0)
         self.assertGreater(max_persitable_mem_1, 0)
-        max_tmp_mem_2, max_persitable_mem_2 = get_max_memory_info(
-            main_prog, batch_size=64)
+        max_tmp_mem_2, max_persitable_mem_2 = get_max_memory_info(main_prog,
+                                                                  batch_size=64)
         self.assertEqual(max_persitable_mem_1, max_persitable_mem_2)
         self.assertLess(max_tmp_mem_1, max_tmp_mem_2)
 
 
 class TestPreAllocateMemory(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py b/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py
index a1b7380fdd9a2..98550ac5018ab 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py
@@ -18,13 +18,15 @@
 
 
 class TestMemoryReuseExcludeFeedVar(unittest.TestCase):
+
     def setUp(self):
         self.image_shape = [28, 28]
         self.iteration = 10
 
     def main_impl(self, place):
-        image = fluid.layers.data(
-            name='image', shape=self.image_shape, dtype='float32')
+        image = fluid.layers.data(name='image',
+                                  shape=self.image_shape,
+                                  dtype='float32')
         relu_image = fluid.layers.relu(image)
         loss = fluid.layers.reduce_mean(relu_image)
 
@@ -35,13 +37,13 @@ def main_impl(self, place):
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
 
-        compiled_prog = fluid.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
+        compiled_prog = fluid.CompiledProgram(
+            fluid.default_main_program()).with_data_parallel(
+                loss_name=loss.name, build_strategy=build_strategy)
 
         image_tensor = fluid.LoDTensor()
-        np_image = np.random.uniform(
-            low=-10, high=10, size=self.image_shape).astype('float32')
+        np_image = np.random.uniform(low=-10, high=10,
+                                     size=self.image_shape).astype('float32')
         image_tensor.set(np_image, place)
 
         feed_dict = [{image.name: image_tensor}]
diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
index 4cdb5b5d9f7f0..adc3cd0a8442e 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_usage.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
@@ -43,6 +43,7 @@ def train_simulator(test_batch_size=10):
 
 
 class TestMemoryUsage(unittest.TestCase):
+
     def test_with_unit_B(self):
         with self.program_scope_guard():
             train_simulator()
diff --git a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
index d2fa344b67ab3..ef9e948d98294 100644
--- a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
@@ -21,6 +21,7 @@
 
 
 class TestMergeSelectedRows(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py
index f515a9f95b109..02cadf0230071 100644
--- a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -83,6 +83,7 @@ def run_adam_op(params,
 
 
 class TestMergedAdam(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
@@ -110,21 +111,20 @@ def check_with_place(self, place, multi_precision):
             self.shapes, multi_precision, self.seed, place)
 
         def run_op(use_merged):
-            return run_adam_op(
-                params=params,
-                grads=grads,
-                lrs=lrs,
-                moment1s=moment1s,
-                moment2s=moment2s,
-                beta1_pows=beta1_pows,
-                beta2_pows=beta2_pows,
-                master_params=master_params,
-                epsilon=0.9,
-                beta1=0.9,
-                beta2=0.99,
-                place=place,
-                multi_precision=multi_precision,
-                use_merged=use_merged)
+            return run_adam_op(params=params,
+                               grads=grads,
+                               lrs=lrs,
+                               moment1s=moment1s,
+                               moment2s=moment2s,
+                               beta1_pows=beta1_pows,
+                               beta2_pows=beta2_pows,
+                               master_params=master_params,
+                               epsilon=0.9,
+                               beta1=0.9,
+                               beta2=0.99,
+                               place=place,
+                               multi_precision=multi_precision,
+                               use_merged=use_merged)
 
         outs1 = run_op(True)
         outs2 = run_op(False)
@@ -137,9 +137,8 @@ def run_op(use_merged):
                 if place == 'gpu':
                     self.assertTrue(np.array_equal(value1[i], value2[i]))
                 else:
-                    self.assertTrue(
-                        np.allclose(
-                            value1[i], value2[i], atol=1e-7))
+                    self.assertTrue(np.allclose(value1[i], value2[i],
+                                                atol=1e-7))
 
     def get_places(self):
         places = ['cpu']
diff --git a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py
index c38dea8bc3942..4afdc267de5cb 100644
--- a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -45,22 +45,21 @@ def run_momentum_op(params,
         }
 
         param_vars = [
-            helper.create_variable(
-                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+            helper.create_variable(persistable=True,
+                                   shape=p.shape,
+                                   dtype=p.dtype) for p in params
         ]
         grad_vars = [
-            helper.create_variable(
-                shape=g.shape, dtype=g.dtype) for g in grads
+            helper.create_variable(shape=g.shape, dtype=g.dtype) for g in grads
         ]
         velocity_vars = [
-            helper.create_variable(
-                persistable=True, shape=v.shape, dtype=v.dtype)
-            for v in velocitys
+            helper.create_variable(persistable=True,
+                                   shape=v.shape,
+                                   dtype=v.dtype) for v in velocitys
         ]
-        lr_var = helper.create_variable(
-            persistable=True,
-            shape=learning_rate.shape,
-            dtype=learning_rate.dtype)
+        lr_var = helper.create_variable(persistable=True,
+                                        shape=learning_rate.shape,
+                                        dtype=learning_rate.dtype)
 
         feed_dict = OrderedDict()
 
@@ -79,14 +78,15 @@ def run_momentum_op(params,
 
         if multi_precision:
             master_param_vars = [
-                helper.create_variable(
-                    persistable=True, shape=p.shape, dtype=p.dtype)
-                for p in master_params
+                helper.create_variable(persistable=True,
+                                       shape=p.shape,
+                                       dtype=p.dtype) for p in master_params
             ]
             feed_dict.update(
-                OrderedDict([(mp_var.name, mp_val)
-                             for mp_var, mp_val in zip(master_param_vars,
-                                                       master_params)]))
+                OrderedDict([
+                    (mp_var.name, mp_val)
+                    for mp_var, mp_val in zip(master_param_vars, master_params)
+                ]))
             # CPUPlace does not use MasterParam
             if isinstance(place, paddle.CUDAPlace):
                 fetch_list = fetch_list + [
@@ -108,8 +108,10 @@ def run_momentum_op(params,
                 if multi_precision:
                     inputs['MasterParam'] = master_param_vars[i]
                     outputs['MasterParamOut'] = master_param_vars[i]
-                helper.append_op(
-                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+                helper.append_op(type=op_type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs)
         else:
             inputs = {
                 'Param': param_vars,
@@ -121,8 +123,10 @@ def run_momentum_op(params,
             if multi_precision:
                 inputs['MasterParam'] = master_param_vars
                 outputs['MasterParamOut'] = master_param_vars
-            helper.append_op(
-                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+            helper.append_op(type=op_type,
+                             inputs=inputs,
+                             outputs=outputs,
+                             attrs=attrs)
 
     exe = paddle.static.Executor(place)
     with paddle.static.scope_guard(paddle.static.Scope()):
@@ -152,22 +156,21 @@ def run_momentum_op2(params,
         helper = LayerHelper(op_type, **locals())
 
         param_vars = [
-            helper.create_variable(
-                persistable=True, shape=p.shape, dtype=p.dtype) for p in params
+            helper.create_variable(persistable=True,
+                                   shape=p.shape,
+                                   dtype=p.dtype) for p in params
         ]
         grad_vars = [
-            helper.create_variable(
-                shape=g.shape, dtype=g.dtype) for g in grads
+            helper.create_variable(shape=g.shape, dtype=g.dtype) for g in grads
         ]
         velocity_vars = [
-            helper.create_variable(
-                persistable=True, shape=v.shape, dtype=v.dtype)
-            for v in velocitys
+            helper.create_variable(persistable=True,
+                                   shape=v.shape,
+                                   dtype=v.dtype) for v in velocitys
         ]
-        lr_var = helper.create_variable(
-            persistable=True,
-            shape=learning_rate.shape,
-            dtype=learning_rate.dtype)
+        lr_var = helper.create_variable(persistable=True,
+                                        shape=learning_rate.shape,
+                                        dtype=learning_rate.dtype)
 
         feed_dict = OrderedDict()
 
@@ -186,14 +189,15 @@ def run_momentum_op2(params,
 
         if multi_precision:
             master_param_vars = [
-                helper.create_variable(
-                    persistable=True, shape=p.shape, dtype=p.dtype)
-                for p in master_params
+                helper.create_variable(persistable=True,
+                                       shape=p.shape,
+                                       dtype=p.dtype) for p in master_params
             ]
             feed_dict.update(
-                OrderedDict([(mp_var.name, mp_val)
-                             for mp_var, mp_val in zip(master_param_vars,
-                                                       master_params)]))
+                OrderedDict([
+                    (mp_var.name, mp_val)
+                    for mp_var, mp_val in zip(master_param_vars, master_params)
+                ]))
             # CPUPlace does not use MasterParam
             if isinstance(place, paddle.CUDAPlace):
                 fetch_list = fetch_list + [
@@ -223,8 +227,10 @@ def run_momentum_op2(params,
                     'regularization_method': 'l2_decay',
                     'regularization_coeff': 2.0,
                 }
-                helper.append_op(
-                    type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+                helper.append_op(type=op_type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs)
         else:
             inputs = {
                 'Param': param_vars,
@@ -237,16 +243,22 @@ def run_momentum_op2(params,
                 inputs['MasterParam'] = master_param_vars
                 outputs['MasterParamOut'] = master_param_vars
             attrs = {
-                'mu': mu,
-                'multi_precision': multi_precision,
-                'rescale_grad': rescale_grad,
-                'use_nesterov': use_nesterov,
+                'mu':
+                mu,
+                'multi_precision':
+                multi_precision,
+                'rescale_grad':
+                rescale_grad,
+                'use_nesterov':
+                use_nesterov,
                 'regularization_method':
                 ['l2_decay' for i in range(len(param_vars))],
                 'regularization_coeff': [2.0 for i in range(len(param_vars))],
             }
-            helper.append_op(
-                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+            helper.append_op(type=op_type,
+                             inputs=inputs,
+                             outputs=outputs,
+                             attrs=attrs)
 
     exe = paddle.static.Executor(place)
     with paddle.static.scope_guard(paddle.static.Scope()):
@@ -255,6 +267,7 @@ def run_momentum_op2(params,
 
 
 class TestMergedMomentum(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
@@ -284,18 +297,17 @@ def check_with_place(self, place, multi_precision):
             self.shapes, multi_precision, self.seed, place)
 
         def run_op(use_merged):
-            # FIXME(zengjinle): CPU Momentum Op does not support rescale_grad 
+            # FIXME(zengjinle): CPU Momentum Op does not support rescale_grad
             rescale_grad = 1.0 if isinstance(place, paddle.CPUPlace) else 0.01
-            return run_momentum_op(
-                params,
-                grads,
-                velocitys,
-                master_params,
-                learning_rate,
-                place,
-                multi_precision,
-                rescale_grad=rescale_grad,
-                use_merged=use_merged)
+            return run_momentum_op(params,
+                                   grads,
+                                   velocitys,
+                                   master_params,
+                                   learning_rate,
+                                   place,
+                                   multi_precision,
+                                   rescale_grad=rescale_grad,
+                                   use_merged=use_merged)
 
         outs1 = run_op(True)
         outs2 = run_op(False)
@@ -319,6 +331,7 @@ def test_main(self):
 
 
 class TestMergedMomentum2(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
@@ -347,19 +360,18 @@ def check_with_place(self, place, multi_precision):
             self.shapes, multi_precision, self.seed, place)
 
         def run_op(use_nesterov, use_merged):
-            # FIXME(zengjinle): CPU Momentum Op does not support rescale_grad 
+            # FIXME(zengjinle): CPU Momentum Op does not support rescale_grad
             rescale_grad = 1.0 if isinstance(place, paddle.CPUPlace) else 0.01
-            return run_momentum_op2(
-                params,
-                grads,
-                velocitys,
-                master_params,
-                learning_rate,
-                place,
-                multi_precision,
-                rescale_grad=rescale_grad,
-                use_merged=use_merged,
-                use_nesterov=use_nesterov)
+            return run_momentum_op2(params,
+                                    grads,
+                                    velocitys,
+                                    master_params,
+                                    learning_rate,
+                                    place,
+                                    multi_precision,
+                                    rescale_grad=rescale_grad,
+                                    use_merged=use_merged,
+                                    use_nesterov=use_nesterov)
 
         outs1 = run_op(use_nesterov=True, use_merged=True)
         outs2 = run_op(use_nesterov=True, use_merged=False)
diff --git a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py
index 95acdbe4a0687..8e76859c880d4 100644
--- a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py
@@ -24,6 +24,7 @@
 
 
 class TestMeshgridOp(OpTest):
+
     def setUp(self):
         self.op_type = "meshgrid"
         self.dtype = self.get_dtype()
@@ -62,17 +63,23 @@ def get_x_shape(self):
 
 
 class TestMeshgridOp2(TestMeshgridOp):
+
     def get_x_shape(self):
         return [100, 300]
 
 
 class TestMeshgridOp3(unittest.TestCase):
+
     def test_api(self):
         x = fluid.data(shape=[100], dtype='int32', name='x')
         y = fluid.data(shape=[200], dtype='int32', name='y')
 
-        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_1 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_1 = np.reshape(input_1, [100, 1])
         out_1 = np.broadcast_to(out_1, [100, 200])
@@ -82,20 +89,27 @@ def test_api(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
         grid_x, grid_y = paddle.tensor.meshgrid(x, y)
         res_1, res_2 = exe.run(fluid.default_main_program(),
-                               feed={'x': input_1,
-                                     'y': input_2},
+                               feed={
+                                   'x': input_1,
+                                   'y': input_2
+                               },
                                fetch_list=[grid_x, grid_y])
         assert np.array_equal(res_1, out_1)
         assert np.array_equal(res_2, out_2)
 
 
 class TestMeshgridOp4(unittest.TestCase):
+
     def test_list_input(self):
         x = fluid.data(shape=[100], dtype='int32', name='x')
         y = fluid.data(shape=[200], dtype='int32', name='y')
 
-        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_1 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_1 = np.reshape(input_1, [100, 1])
         out_1 = np.broadcast_to(out_1, [100, 200])
@@ -105,8 +119,10 @@ def test_list_input(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
         grid_x, grid_y = paddle.tensor.meshgrid([x, y])
         res_1, res_2 = exe.run(fluid.default_main_program(),
-                               feed={'x': input_1,
-                                     'y': input_2},
+                               feed={
+                                   'x': input_1,
+                                   'y': input_2
+                               },
                                fetch_list=[grid_x, grid_y])
 
         assert np.array_equal(res_1, out_1)
@@ -114,12 +130,17 @@ def test_list_input(self):
 
 
 class TestMeshgridOp5(unittest.TestCase):
+
     def test_tuple_input(self):
         x = fluid.data(shape=[100], dtype='int32', name='x')
         y = fluid.data(shape=[200], dtype='int32', name='y')
 
-        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_1 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         out_1 = np.reshape(input_1, [100, 1])
         out_1 = np.broadcast_to(out_1, [100, 200])
@@ -129,8 +150,10 @@ def test_tuple_input(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
         grid_x, grid_y = paddle.tensor.meshgrid((x, y))
         res_1, res_2 = exe.run(fluid.default_main_program(),
-                               feed={'x': input_1,
-                                     'y': input_2},
+                               feed={
+                                   'x': input_1,
+                                   'y': input_2
+                               },
                                fetch_list=[grid_x, grid_y])
 
         assert np.array_equal(res_1, out_1)
@@ -138,9 +161,14 @@ def test_tuple_input(self):
 
 
 class TestMeshgridOp6(unittest.TestCase):
+
     def test_api_with_dygraph(self):
-        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_3 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         with fluid.dygraph.guard():
             tensor_3 = fluid.dygraph.to_variable(input_3)
@@ -156,9 +184,14 @@ def test_api_eager_dygraph(self):
 
 
 class TestMeshgridOp7(unittest.TestCase):
+
     def test_api_with_dygraph_list_input(self):
-        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_3 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         with fluid.dygraph.guard():
             tensor_3 = fluid.dygraph.to_variable(input_3)
@@ -174,9 +207,14 @@ def test_api_eager_dygraph(self):
 
 
 class TestMeshgridOp8(unittest.TestCase):
+
     def test_api_with_dygraph_tuple_input(self):
-        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_3 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         with fluid.dygraph.guard():
             tensor_3 = fluid.dygraph.to_variable(input_3)
@@ -192,9 +230,14 @@ def test_api_eager_dygraph(self):
 
 
 class TestMeshgridEager(unittest.TestCase):
+
     def test_dygraph_final_state_api(self):
-        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
-        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+        input_1 = np.random.randint(0, 100, [
+            100,
+        ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [
+            200,
+        ]).astype('int32')
 
         with fluid.dygraph.guard():
             tensor_1 = fluid.dygraph.to_variable(input_1)
diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py
index 13f82fb9bd7cb..6e5f9d1321593 100644
--- a/python/paddle/fluid/tests/unittests/test_min_op.py
+++ b/python/paddle/fluid/tests/unittests/test_min_op.py
@@ -23,6 +23,7 @@
 
 
 class ApiMinTest(unittest.TestCase):
+
     def setUp(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -93,13 +94,13 @@ def test_eager_api(self):
 
 
 class TestOutDtype(unittest.TestCase):
+
     def test_min(self):
         api_fn = paddle.min
         shape = [10, 16]
-        check_out_dtype(
-            api_fn,
-            in_specs=[(shape, )],
-            expect_dtypes=['float32', 'float64', 'int32', 'int64'])
+        check_out_dtype(api_fn,
+                        in_specs=[(shape, )],
+                        expect_dtypes=['float32', 'float64', 'int32', 'int64'])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
index 6730d5de2c2db..6b0f6075e8165 100644
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
@@ -22,6 +22,7 @@
 
 
 class TestMineHardExamplesOp(OpTest):
+
     def set_data(self):
         self.init_test_data()
         self.inputs = {
@@ -58,17 +59,17 @@ def init_test_data(self):
         self.neg_overlap = 0.5
         self.sample_size = 0
         self.mining_type = "max_negative"
-        self.cls_loss = np.array([[0.1, 0.1, 0.3],
-                                  [0.3, 0.1, 0.1]]).astype('float64')
+        self.cls_loss = np.array([[0.1, 0.1, 0.3], [0.3, 0.1,
+                                                    0.1]]).astype('float64')
 
-        self.loc_loss = np.array([[0.1, 0.2, 0.3],
-                                  [0.3, 0.4, 0.1]]).astype('float64')
+        self.loc_loss = np.array([[0.1, 0.2, 0.3], [0.3, 0.4,
+                                                    0.1]]).astype('float64')
 
-        self.match_dis = np.array([[0.2, 0.4, 0.8],
-                                   [0.1, 0.9, 0.3]]).astype('float64')
+        self.match_dis = np.array([[0.2, 0.4, 0.8], [0.1, 0.9,
+                                                     0.3]]).astype('float64')
 
-        self.match_indices = np.array([[0, -1, -1],
-                                       [-1, 0, -1]]).astype('int32')
+        self.match_indices = np.array([[0, -1, -1], [-1, 0,
+                                                     -1]]).astype('int32')
 
         self.updated_match_indices = self.match_indices
 
@@ -77,19 +78,20 @@ def init_test_data(self):
 
 
 class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
+
     def init_test_data(self):
         super(TestMineHardExamplesOpHardExample, self).init_test_data()
         self.mining_type = "hard_example"
         self.sample_size = 2
 
-        self.cls_loss = np.array([[0.5, 0.1, 0.3],
-                                  [0.3, 0.1, 0.1]]).astype('float64')
+        self.cls_loss = np.array([[0.5, 0.1, 0.3], [0.3, 0.1,
+                                                    0.1]]).astype('float64')
 
-        self.loc_loss = np.array([[0.2, 0.2, 0.3],
-                                  [0.3, 0.1, 0.2]]).astype('float64')
+        self.loc_loss = np.array([[0.2, 0.2, 0.3], [0.3, 0.1,
+                                                    0.2]]).astype('float64')
 
-        self.match_indices = np.array([[0, -1, -1],
-                                       [-1, 0, -1]]).astype('int32')
+        self.match_indices = np.array([[0, -1, -1], [-1, 0,
+                                                     -1]]).astype('int32')
 
         self.updated_match_indices = np.array([[0, -1, -1],
                                                [-1, -1, -1]]).astype('int32')
diff --git a/python/paddle/fluid/tests/unittests/test_minimum_op.py b/python/paddle/fluid/tests/unittests/test_minimum_op.py
index ce7b9f72b6605..2da5df8582606 100644
--- a/python/paddle/fluid/tests/unittests/test_minimum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py
@@ -21,6 +21,7 @@
 
 
 class ApiMinimumTest(unittest.TestCase):
+
     def setUp(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -47,8 +48,10 @@ def test_static_api(self):
             data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
             result_max = paddle.minimum(data_x, data_y)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "y": self.input_y},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "y": self.input_y
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected1))
 
@@ -58,8 +61,10 @@ def test_static_api(self):
             data_z = paddle.static.data("z", shape=[15], dtype="float32")
             result_max = paddle.minimum(data_x, data_z)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "z": self.input_z},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "z": self.input_z
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected2))
 
@@ -69,8 +74,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_max = paddle.minimum(data_a, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"a": self.input_a,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "a": self.input_a,
+                "c": self.input_c
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected3))
 
@@ -80,8 +87,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_max = paddle.minimum(data_b, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"b": self.input_b,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "b": self.input_b,
+                "c": self.input_c
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected4))
 
diff --git a/python/paddle/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py
index 461ff6a9273cd..9a63947b877f3 100644
--- a/python/paddle/fluid/tests/unittests/test_minus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minus_op.py
@@ -21,6 +21,7 @@
 
 
 class TestMinusOp(OpTest):
+
     def setUp(self):
         self.op_type = "minus"
         self.inputs = {
diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
index 33393bc2fcd20..650b6a9a247d5 100644
--- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
+++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
@@ -41,23 +41,21 @@ def conv_net(use_feed):
     img = fluid.layers.data(name='image', shape=img_shape, dtype='float16')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(input=img,
+                                                  filter_size=5,
+                                                  num_filters=20,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
 
     conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(input=conv_pool_1,
+                                                  filter_size=5,
+                                                  num_filters=50,
+                                                  pool_size=2,
+                                                  pool_stride=2,
+                                                  act="relu")
     hidden = fluid.layers.cast(conv_pool_2, np.float32)
     return loss_net(hidden, label)
 
@@ -68,9 +66,11 @@ def _optimizer(learning_rate=1e-6):
 
 
 class TestResnet(TestParallelExecutorBase):
+
     def check_model(self, use_device):
-        img, label = init_data(
-            batch_size=batch_size, img_shape=img_shape, label_range=9)
+        img, label = init_data(batch_size=batch_size,
+                               img_shape=img_shape,
+                               label_range=9)
         img = np.float16(img)
         feed_dict = {"image": img, "label": label}
 
diff --git a/python/paddle/fluid/tests/unittests/test_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_mixed_precision.py
index 57ea7ad1aa250..68dfb88ccd05f 100644
--- a/python/paddle/fluid/tests/unittests/test_mixed_precision.py
+++ b/python/paddle/fluid/tests/unittests/test_mixed_precision.py
@@ -25,6 +25,7 @@
 
 
 class SimpleNet(nn.Layer):
+
     def __init__(self, input_size, output_size):
         super(SimpleNet, self).__init__()
         self.linear1 = nn.Linear(input_size, output_size)
@@ -38,7 +39,7 @@ def forward(self, x):
         x = self.linear1(x)
         # currently, paddle's relu may hide nan/inf, relu(nan) = 0, relu(inf)= inf
         # so, do not use it here.
-        #x = self.relu1(x) 
+        #x = self.relu1(x)
         x = self.linear2(x)
         #x = self.relu2(x)
         x = self.linear3(x)
@@ -47,6 +48,7 @@ def forward(self, x):
 
 
 class AMPTest(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
 
@@ -63,8 +65,9 @@ def net(self):
 
         opt = paddle.fluid.optimizer.Adam(
             learning_rate=0.0001, parameter_list=model.parameters())  # 定义优化器
-        opt = paddle.static.amp.decorate(
-            opt, init_loss_scaling=128.0, use_dynamic_loss_scaling=True)
+        opt = paddle.static.amp.decorate(opt,
+                                         init_loss_scaling=128.0,
+                                         use_dynamic_loss_scaling=True)
         opt.minimize(loss)
         return model, loss, opt
 
@@ -105,8 +108,10 @@ def test_skip_update(self):
                     train_data[i][10] = np.inf
                 loss_, weight_, moment1_, beta_pow1_, found_inf = exe.run(
                     main_prog,
-                    feed={"X": train_data[i],
-                          "Y": labels[i]},
+                    feed={
+                        "X": train_data[i],
+                        "Y": labels[i]
+                    },
                     fetch_list=fetch_list)
                 print(loss_, weight_[0][0], moment1_[0][0], beta_pow1_,
                       found_inf)
diff --git a/python/paddle/fluid/tests/unittests/test_mode_op.py b/python/paddle/fluid/tests/unittests/test_mode_op.py
index 471904b0c9426..ebb95dfea393b 100644
--- a/python/paddle/fluid/tests/unittests/test_mode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mode_op.py
@@ -57,6 +57,7 @@ def cal_mode(a, axis, keepdim=False):
 
 
 class TestModeOp(OpTest):
+
     def init_args(self):
         self.axis = 1
 
@@ -82,6 +83,7 @@ def test_check_grad(self):
 
 
 class TestModeOpLastdim(OpTest):
+
     def init_args(self):
         self.axis = -1
 
@@ -107,12 +109,14 @@ def test_check_grad(self):
 
 
 class TestModeOpKernels(unittest.TestCase):
+
     def setUp(self):
         self.axises = [-1, 1]
         np.random.seed(666)
         self.inputs = np.ceil(np.random.rand(2, 10, 10) * 1000)
 
     def test_mode_op(self):
+
         def test_cpu_kernel():
             paddle.set_device('cpu')
             tensor = paddle.to_tensor(self.inputs)
@@ -121,8 +125,9 @@ def test_cpu_kernel():
                 v, inds = paddle.mode(tensor, axis)
                 self.assertTrue(np.allclose(v.numpy(), value_expect))
 
-                value_expect, indice_expect = cal_mode(
-                    self.inputs, axis, keepdim=True)
+                value_expect, indice_expect = cal_mode(self.inputs,
+                                                       axis,
+                                                       keepdim=True)
                 v, inds = paddle.mode(tensor, axis, keepdim=True)
                 self.assertTrue(np.allclose(v.numpy(), value_expect))
 
@@ -134,8 +139,9 @@ def test_gpu_kernel():
                 v, inds = paddle.mode(tensor, axis)
                 self.assertTrue(np.allclose(v.numpy(), value_expect))
 
-                value_expect, indice_expect = cal_mode(
-                    self.inputs, axis, keepdim=True)
+                value_expect, indice_expect = cal_mode(self.inputs,
+                                                       axis,
+                                                       keepdim=True)
                 v, inds = paddle.mode(tensor, axis, keepdim=True)
                 self.assertTrue(np.allclose(v.numpy(), value_expect))
 
@@ -146,6 +152,7 @@ def test_gpu_kernel():
 
 
 class TestModeOpErrors(unittest.TestCase):
+
     def setUp(self):
         self.x = paddle.uniform([2, 10, 20, 25], dtype='float32')
 
@@ -156,17 +163,19 @@ def test_dim_range_error():
 
 
 class TestModeOpInStatic(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(666)
-        self.input_data = np.ceil(
-            np.random.random((2, 10, 10)) * 1000, dtype=np.float64)
+        self.input_data = np.ceil(np.random.random((2, 10, 10)) * 1000,
+                                  dtype=np.float64)
 
     def test_run_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            input_tensor = paddle.static.data(
-                name="x", shape=[2, 10, 10], dtype="float64")
+            input_tensor = paddle.static.data(name="x",
+                                              shape=[2, 10, 10],
+                                              dtype="float64")
 
             result = paddle.mode(input_tensor, axis=1)
             expect_value = cal_mode(self.input_data, axis=1)[0]
diff --git a/python/paddle/fluid/tests/unittests/test_modelaverage.py b/python/paddle/fluid/tests/unittests/test_modelaverage.py
index 8dab35f7f54e7..7bb1e7d2e7a27 100644
--- a/python/paddle/fluid/tests/unittests/test_modelaverage.py
+++ b/python/paddle/fluid/tests/unittests/test_modelaverage.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class TestModelAverage(unittest.TestCase):
+
     def test_model_average_static(self):
         paddle.enable_static()
         place = fluid.CPUPlace()
@@ -39,8 +40,8 @@ def test_model_average_static(self):
                 hidden = fluid.layers.fc(input=data, size=10)
                 loss = fluid.layers.mean(hidden)
                 test_program = train_program.clone()
-                optimizer = paddle.optimizer.Momentum(
-                    learning_rate=0.2, momentum=0.1)
+                optimizer = paddle.optimizer.Momentum(learning_rate=0.2,
+                                                      momentum=0.1)
 
                 optimizer.minimize(loss)
                 # build ModelAverage optimizer
@@ -59,28 +60,18 @@ def test_model_average_static(self):
                     'fc_0.b_0_old_num_accumulates_0', 'fc_0.b_0_num_updates_0'
                 ])
         self.assertTrue(
-            np.equal(
-                sum_1, np.zeros(
-                    shape=[10], dtype='float32')).all())
+            np.equal(sum_1, np.zeros(shape=[10], dtype='float32')).all())
         self.assertTrue(
-            np.equal(
-                sum_2, np.zeros(
-                    shape=[10], dtype='float32')).all())
+            np.equal(sum_2, np.zeros(shape=[10], dtype='float32')).all())
         self.assertTrue(
-            np.equal(
-                num_accumulates, np.array(
-                    [0], dtype='int64')).all())
+            np.equal(num_accumulates, np.array([0], dtype='int64')).all())
         self.assertTrue(
-            np.equal(
-                old_num_accumulates, np.array(
-                    [2], dtype='int64')).all())
+            np.equal(old_num_accumulates, np.array([2], dtype='int64')).all())
         self.assertTrue(
-            np.equal(
-                num_updates, np.array(
-                    [10], dtype='int64')).all())
+            np.equal(num_updates, np.array([10], dtype='int64')).all())
 
-        average_b = (sum_1 + sum_2 + sum_3) / (
-            num_accumulates + old_num_accumulates)
+        average_b = (sum_1 + sum_2 + sum_3) / (num_accumulates +
+                                               old_num_accumulates)
         # apply ModelAverage
         with model_average.apply(exe):
             x = np.random.random(size=(10, 1)).astype('float32')
@@ -105,6 +96,7 @@ def test_model_average_dygraph(self):
 
         # define a random dataset
         class RandomDataset(paddle.io.Dataset):
+
             def __init__(self, num_samples):
                 self.num_samples = num_samples
 
@@ -118,6 +110,7 @@ def __len__(self):
                 return self.num_samples
 
         class LinearNet(nn.Layer):
+
             def __init__(self):
                 super(LinearNet, self).__init__()
                 self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
@@ -142,12 +135,12 @@ def train(layer, loader, loss_fn, opt, model_average):
             sum_1 = model_average._get_accumulator('sum_1', layer.bias)
             sum_2 = model_average._get_accumulator('sum_2', layer.bias)
             sum_3 = model_average._get_accumulator('sum_3', layer.bias)
-            num_accumulates = model_average._get_accumulator('num_accumulates',
-                                                             layer.bias)
+            num_accumulates = model_average._get_accumulator(
+                'num_accumulates', layer.bias)
             old_num_accumulates = model_average._get_accumulator(
                 'old_num_accumulates', layer.bias)
-            num_updates = model_average._get_accumulator('num_updates',
-                                                         layer.bias)
+            num_updates = model_average._get_accumulator(
+                'num_updates', layer.bias)
 
             return ((sum_1 + sum_2 + sum_3) /
                     (num_accumulates + old_num_accumulates)).numpy()
@@ -157,10 +150,9 @@ def evaluate(layer, loader, loss_fn, check_param):
                 out = layer(image)
                 loss = loss_fn(out, label)
                 loss.backward()
-                self.assertAlmostEqual(
-                    np.mean(layer.bias.numpy()),
-                    np.mean(check_param),
-                    delta=5e-3)
+                self.assertAlmostEqual(np.mean(layer.bias.numpy()),
+                                       np.mean(check_param),
+                                       delta=5e-3)
                 # print("Evaluate batch {}: loss = {}, bias = {}".format(
                 #     batch_id, np.mean(loss.numpy()), layer.bias.numpy()))
 
@@ -168,8 +160,9 @@ def evaluate(layer, loader, loss_fn, check_param):
 
         layer = LinearNet()
         loss_fn = nn.CrossEntropyLoss()
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.2, momentum=0.1, parameters=layer.parameters())
+        optimizer = paddle.optimizer.Momentum(learning_rate=0.2,
+                                              momentum=0.1,
+                                              parameters=layer.parameters())
         # build ModelAverage optimizer
         model_average = paddle.incubate.optimizer.ModelAverage(
             0.15,
@@ -179,18 +172,16 @@ def evaluate(layer, loader, loss_fn, check_param):
 
         # create data loader
         dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
-        loader = paddle.io.DataLoader(
-            dataset,
-            batch_size=BATCH_SIZE,
-            shuffle=True,
-            drop_last=True,
-            num_workers=2)
-        eval_loader = paddle.io.DataLoader(
-            dataset,
-            batch_size=BATCH_SIZE,
-            shuffle=True,
-            drop_last=True,
-            num_workers=1)
+        loader = paddle.io.DataLoader(dataset,
+                                      batch_size=BATCH_SIZE,
+                                      shuffle=True,
+                                      drop_last=True,
+                                      num_workers=2)
+        eval_loader = paddle.io.DataLoader(dataset,
+                                           batch_size=BATCH_SIZE,
+                                           shuffle=True,
+                                           drop_last=True,
+                                           num_workers=1)
         # train
         check_param = train(layer, loader, loss_fn, optimizer, model_average)
         # print(check_param)
diff --git a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
index f7b6ad9bfee42..2258f32dafd47 100644
--- a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
@@ -29,6 +29,7 @@ def modified_huber_loss_forward(val):
 
 
 class TestModifiedHuberLossOp(OpTest):
+
     def setUp(self):
         self.op_type = 'modified_huber_loss'
         samples_num = 100
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index a4f38e37731e8..0b6bd99e6592f 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -53,6 +53,7 @@ def calculate_momentum_by_numpy(param,
 
 
 class TestMomentumOp1(OpTest):
+
     def setUp(self):
         self.op_type = "momentum"
         self.dtype = np.float32
@@ -92,6 +93,7 @@ def test_check_output(self):
 
 
 class TestMomentumOpFp16(TestMomentumOp1):
+
     def init_dtype(self):
         self.dtype = np.float16
 
@@ -139,6 +141,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestLarsMomentumOpWithMP(OpTest):
+
     def setUp(self):
         self.config()
         self.op_type = "lars_momentum"
@@ -218,6 +221,7 @@ def config(self):
 
 
 class TestLarsMomentumOp(OpTest):
+
     def setUp(self):
         self.config()
         self.op_type = "lars_momentum"
@@ -240,8 +244,8 @@ def setUp(self):
             gnorm = np.sqrt(np.square(grad).sum())
             local_lr = learning_rate * lars_coeff * pnorm / (
                 gnorm + lars_weight_decay * param)
-            velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay
-                                                       * param)
+            velocity_out = mu * velocity + local_lr * (
+                grad + lars_weight_decay * param)
             param_out = param - velocity_out
 
             params.append(("SubParam_" + str(i), param))
@@ -274,6 +278,7 @@ def config(self):
 
 
 class TestSparseMomentumOp(unittest.TestCase):
+
     def setUp(self):
         self.use_nesterov = False
         self.regularization_method = ""
@@ -322,18 +327,17 @@ def check_with_place(self, place):
         lr.set(lr_array, place)
 
         # create and run operator
-        op = Operator(
-            "momentum",
-            Param='Param',
-            Grad='Grad',
-            Velocity='Velocity',
-            ParamOut='ParamOut',
-            VelocityOut='VelocityOut',
-            LearningRate='LearningRate',
-            mu=mu,
-            use_nesterov=use_nesterov,
-            regularization_method=regularization_method,
-            regularization_coeff=regularization_coeff)
+        op = Operator("momentum",
+                      Param='Param',
+                      Grad='Grad',
+                      Velocity='Velocity',
+                      ParamOut='ParamOut',
+                      VelocityOut='VelocityOut',
+                      LearningRate='LearningRate',
+                      mu=mu,
+                      use_nesterov=use_nesterov,
+                      regularization_method=regularization_method,
+                      regularization_coeff=regularization_coeff)
         op.run(scope, place)
 
         # get and compare result
@@ -373,11 +377,13 @@ def test_sparse_momentum(self):
 
 
 class TestSparseMomentumOp2(TestSparseMomentumOp):
+
     def init_kernel(self):
         self.use_nesterov = True
 
 
 class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
+
     def setUp(self):
         self.init_args()
         self.regularization_method = ""
@@ -431,22 +437,21 @@ def check_with_place(self, place):
         lr.set(lr_array, place)
 
         # create and run operator
-        op = Operator(
-            "momentum",
-            Param='Param',
-            Grad='Grad',
-            Velocity='Velocity',
-            MasterParam='MasterParam',
-            ParamOut='ParamOut',
-            VelocityOut='VelocityOut',
-            MasterParamOut='MasterParamOut',
-            LearningRate='LearningRate',
-            mu=mu,
-            use_nesterov=use_nesterov,
-            regularization_method=regularization_method,
-            regularization_coeff=regularization_coeff,
-            multi_precision=True,
-            rescale_grad=1.0)
+        op = Operator("momentum",
+                      Param='Param',
+                      Grad='Grad',
+                      Velocity='Velocity',
+                      MasterParam='MasterParam',
+                      ParamOut='ParamOut',
+                      VelocityOut='VelocityOut',
+                      MasterParamOut='MasterParamOut',
+                      LearningRate='LearningRate',
+                      mu=mu,
+                      use_nesterov=use_nesterov,
+                      regularization_method=regularization_method,
+                      regularization_coeff=regularization_coeff,
+                      multi_precision=True,
+                      rescale_grad=1.0)
         op.run(scope, place)
 
         # get and compare result
@@ -482,19 +487,22 @@ def test_sparse_momentum(self):
 
 class TestSparseMomentumOpWithMultiPrecision2(
         TestSparseMomentumOpWithMultiPrecision):
+
     def init_args(self):
         self.use_nesterov = True
 
 
 class TestMomentumV2(unittest.TestCase):
+
     def test_momentum_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Momentum(
-            learning_rate=0.01, momentum=0.9, parameters=linear.parameters())
+        adam = paddle.optimizer.Momentum(learning_rate=0.01,
+                                         momentum=0.9,
+                                         parameters=linear.parameters())
         out = linear(a)
         out.backward()
         adam.step()
@@ -511,13 +519,13 @@ def test_momentum(self):
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
             avg_cost = fluid.layers.mean(cost)
 
-            rms_optimizer = paddle.optimizer.Momentum(
-                learning_rate=0.1, momentum=0.9)
+            rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1,
+                                                      momentum=0.9)
             rms_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -525,8 +533,9 @@ def test_momentum(self):
                 exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
 
     def test_raise_error(self):
-        self.assertRaises(
-            ValueError, paddle.optimizer.Momentum, learning_rate=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.Momentum,
+                          learning_rate=None)
         self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
 
     def test_api_eager_dygraph(self):
@@ -536,6 +545,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestMomentumOpWithDecay(OpTest):
+
     def setUp(self):
         self.op_type = "momentum"
         self.dtype = np.float32
@@ -588,6 +598,7 @@ def test_check_output(self):
 
 
 class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay):
+
     def init_config(self):
         self.dtype = np.float16
 
@@ -597,11 +608,13 @@ def test_check_output(self):
 
 
 class TestMomentumOpWithDecay2(TestMomentumOpWithDecay):
+
     def init_config(self):
         self.use_nesterov = False
 
 
 class TestSparseMomentumOpWithDecay(TestSparseMomentumOp):
+
     def setUp(self):
         self.use_nesterov = False
         self.regularization_method = 'l2_decay'
@@ -609,11 +622,13 @@ def setUp(self):
 
 
 class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay):
+
     def init_kernel(self):
         self.use_nesterov = True
 
 
 class TestMomentumOpWithDecayAPI(unittest.TestCase):
+
     def _test_momentum_dygraph_common(self, regularization):
         paddle.disable_static()
         inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
@@ -650,8 +665,8 @@ def test_momentum_static(self):
             momentum_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -660,14 +675,17 @@ def test_momentum_static(self):
 
 
 class TestFusedMomentumWithDecayAPI(unittest.TestCase):
+
     def get_program(self, weight_attr, bias_attr=False):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
-        with paddle.static.program_guard(
-                main_program=main_program, startup_program=startup_program):
+        with paddle.static.program_guard(main_program=main_program,
+                                         startup_program=startup_program):
             x = paddle.static.data(name='x', shape=[10, 10])
-            linear = paddle.nn.Linear(
-                10, 10, weight_attr=weight_attr, bias_attr=bias_attr)
+            linear = paddle.nn.Linear(10,
+                                      10,
+                                      weight_attr=weight_attr,
+                                      bias_attr=bias_attr)
             out = linear(x)
             loss = paddle.mean(out)
             optimizer = paddle.optimizer.Momentum(
@@ -731,10 +749,11 @@ def test_param_has_no_regularizer(self):
 
 
 class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
+
     def __update_params(self, momentum, linear):
         for i in range(10):
-            inp = paddle.full(
-                shape=[2, 2], fill_value=i, dtype='float32').astype("float32")
+            inp = paddle.full(shape=[2, 2], fill_value=i,
+                              dtype='float32').astype("float32")
             inp = paddle.to_tensor(inp)
             out = linear(inp)
             loss = paddle.mean(out)
@@ -786,6 +805,7 @@ def test_vs(self, place=fluid.CPUPlace()):
 
 
 class TestMomentumV2Group(TestMomentumV2):
+
     def test_momentum_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -793,18 +813,22 @@ def test_momentum_dygraph(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.Momentum(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001,
-                'learning_rate': 0.1,
-                'momentum': 0.99
-            }],
-            weight_decay=0.1,
-            momentum=0.9)
+        adam = paddle.optimizer.Momentum(learning_rate=0.01,
+                                         parameters=[{
+                                             'params':
+                                             linear_1.parameters()
+                                         }, {
+                                             'params':
+                                             linear_2.parameters(),
+                                             'weight_decay':
+                                             0.001,
+                                             'learning_rate':
+                                             0.1,
+                                             'momentum':
+                                             0.99
+                                         }],
+                                         weight_decay=0.1,
+                                         momentum=0.9)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
@@ -813,6 +837,7 @@ def test_momentum_dygraph(self):
 
 
 class TestMultiTensorMomentumDygraph(unittest.TestCase):
+
     def _momentum_optimize_dygraph(self,
                                    place,
                                    use_param_attr=False,
@@ -882,8 +907,7 @@ def _check_with_place_amp(self, place, use_amp):
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def _check_with_param_arrt(self, place, use_amp):
         output1, params1 = self._momentum_optimize_dygraph(
@@ -899,8 +923,7 @@ def _check_with_param_arrt(self, place, use_amp):
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def _check_with_param_group(self, place, use_amp):
         output1, params1 = self._momentum_optimize_dygraph(
@@ -916,8 +939,7 @@ def _check_with_param_group(self, place, use_amp):
         self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True)
         for idx in range(len(params1)):
             self.assertEqual(
-                np.allclose(
-                    params1[idx], params2[idx], rtol=1e-05), True)
+                np.allclose(params1[idx], params2[idx], rtol=1e-05), True)
 
     def test_main(self):
         for place in self._get_places():
@@ -933,6 +955,7 @@ def test_api_eager_dygraph(self):
 
 
 class TestMultiTensorMomentumStatic(unittest.TestCase):
+
     def _momentum_optimize_static(self,
                                   place,
                                   use_amp=False,
@@ -945,8 +968,8 @@ def _momentum_optimize_static(self,
         exe = paddle.static.Executor(place=place)
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
-        optimizer = paddle.optimizer.Momentum(
-            multi_precision=use_amp, use_multi_tensor=use_multi_tensor)
+        optimizer = paddle.optimizer.Momentum(multi_precision=use_amp,
+                                              use_multi_tensor=use_multi_tensor)
         if use_amp:
             optimizer = paddle.static.amp.decorate(
                 optimizer,
@@ -956,11 +979,13 @@ def _momentum_optimize_static(self,
                 use_fp16_guard=False)
         with paddle.static.program_guard(train_program, startup_program):
             if use_amp:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float16')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float16')
             else:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float32')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float32')
             hidden = paddle.static.nn.fc(x=data, size=10)
             loss = paddle.fluid.layers.mean(hidden)
             optimizer.minimize(loss)
@@ -985,14 +1010,15 @@ def _get_places(self):
         return places
 
     def _check_with_place_amp(self, place, use_amp):
-        output1 = self._momentum_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=True)
-        output2 = self._momentum_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=False)
+        output1 = self._momentum_optimize_static(place=place,
+                                                 use_amp=use_amp,
+                                                 use_multi_tensor=True)
+        output2 = self._momentum_optimize_static(place=place,
+                                                 use_amp=use_amp,
+                                                 use_multi_tensor=False)
         for idx in range(len(output1)):
             self.assertEqual(
-                np.allclose(
-                    output1[idx], output2[idx], rtol=1e-05), True)
+                np.allclose(output1[idx], output2[idx], rtol=1e-05), True)
 
     def test_main(self):
         for place in self._get_places():
diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py
index bea2f6c8b38b2..107d0ba6f4c11 100644
--- a/python/paddle/fluid/tests/unittests/test_monitor.py
+++ b/python/paddle/fluid/tests/unittests/test_monitor.py
@@ -17,6 +17,7 @@
 
 from __future__ import print_function
 import paddle
+
 paddle.enable_static()
 
 import paddle.fluid as fluid
@@ -50,8 +51,10 @@ def test_dataset_run_with_stat(self):
         slots = ["slot1", "slot2", "slot3", "slot4"]
         slots_vars = []
         for slot in slots:
-            var = fluid.layers.data(
-                name=slot, shape=[1], dtype="int64", lod_level=1)
+            var = fluid.layers.data(name=slot,
+                                    shape=[1],
+                                    dtype="int64",
+                                    lod_level=1)
             slots_vars.append(var)
 
         embs = []
@@ -75,9 +78,8 @@ def test_dataset_run_with_stat(self):
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())
         if self.use_data_loader:
-            data_loader = fluid.io.DataLoader.from_dataset(dataset,
-                                                           fluid.cpu_places(),
-                                                           self.drop_last)
+            data_loader = fluid.io.DataLoader.from_dataset(
+                dataset, fluid.cpu_places(), self.drop_last)
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(fluid.default_main_program(), feed=data)
@@ -85,12 +87,11 @@ def test_dataset_run_with_stat(self):
         else:
             for i in range(self.epoch_num):
                 try:
-                    exe.train_from_dataset(
-                        fluid.default_main_program(),
-                        dataset,
-                        fetch_list=[embs[0], embs[1]],
-                        fetch_info=["emb0", "emb1"],
-                        print_period=1)
+                    exe.train_from_dataset(fluid.default_main_program(),
+                                           dataset,
+                                           fetch_list=[embs[0], embs[1]],
+                                           fetch_info=["emb0", "emb1"],
+                                           print_period=1)
 
                 except Exception as e:
                     self.assertTrue(False)
diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py
index 89eef6ca24243..b32833916e2b1 100644
--- a/python/paddle/fluid/tests/unittests/test_mse_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py
@@ -24,6 +24,7 @@
 
 
 class TestMseLoss(unittest.TestCase):
+
     def test_mse_loss(self):
         input_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
         label_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
@@ -40,15 +41,19 @@ def test_mse_loss(self):
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             exe = Executor(place)
             result = exe.run(fluid.default_main_program(),
-                             feed={"input": input_val,
-                                   "label": label_val},
+                             feed={
+                                 "input": input_val,
+                                 "label": label_val
+                             },
                              fetch_list=[output])
 
             self.assertTrue(np.isclose(np_result, result).all())
 
 
 class TestMseInvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_input():
             input = [256, 3]
             label = fluid.data(name='label', shape=[None, 3], dtype='float32')
@@ -65,6 +70,7 @@ def test_invalid_label():
 
 
 class TestNNMseLoss(unittest.TestCase):
+
     def test_NNMseLoss_mean(self):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
@@ -72,28 +78,30 @@ def test_NNMseLoss_mean(self):
             paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
             with fluid.program_guard(prog, startup_prog):
-                input = fluid.layers.data(
-                    name='input', shape=dim, dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=dim, dtype='float32')
+                input = fluid.layers.data(name='input',
+                                          shape=dim,
+                                          dtype='float32')
+                label = fluid.layers.data(name='label',
+                                          shape=dim,
+                                          dtype='float32')
                 mse_loss = paddle.nn.loss.MSELoss()
                 ret = mse_loss(input, label)
 
                 exe = fluid.Executor(place)
-                static_result = exe.run(
-                    prog,
-                    feed={"input": input_np,
-                          "label": label_np},
-                    fetch_list=[ret])
+                static_result = exe.run(prog,
+                                        feed={
+                                            "input": input_np,
+                                            "label": label_np
+                                        },
+                                        fetch_list=[ret])
 
             with fluid.dygraph.guard():
                 mse_loss = paddle.nn.loss.MSELoss()
-                dy_ret = mse_loss(
-                    fluid.dygraph.to_variable(input_np),
-                    fluid.dygraph.to_variable(label_np))
+                dy_ret = mse_loss(fluid.dygraph.to_variable(input_np),
+                                  fluid.dygraph.to_variable(label_np))
                 dy_result = dy_ret.numpy()
 
             sub = input_np - label_np
@@ -110,28 +118,30 @@ def test_NNMseLoss_sum(self):
             paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
             with fluid.program_guard(prog, startup_prog):
-                input = fluid.layers.data(
-                    name='input', shape=dim, dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=dim, dtype='float32')
+                input = fluid.layers.data(name='input',
+                                          shape=dim,
+                                          dtype='float32')
+                label = fluid.layers.data(name='label',
+                                          shape=dim,
+                                          dtype='float32')
                 mse_loss = paddle.nn.loss.MSELoss(reduction='sum')
                 ret = mse_loss(input, label)
 
                 exe = fluid.Executor(place)
-                static_result = exe.run(
-                    prog,
-                    feed={"input": input_np,
-                          "label": label_np},
-                    fetch_list=[ret])
+                static_result = exe.run(prog,
+                                        feed={
+                                            "input": input_np,
+                                            "label": label_np
+                                        },
+                                        fetch_list=[ret])
 
             with fluid.dygraph.guard():
                 mse_loss = paddle.nn.loss.MSELoss(reduction='sum')
-                dy_ret = mse_loss(
-                    fluid.dygraph.to_variable(input_np),
-                    fluid.dygraph.to_variable(label_np))
+                dy_ret = mse_loss(fluid.dygraph.to_variable(input_np),
+                                  fluid.dygraph.to_variable(label_np))
                 dy_result = dy_ret.numpy()
 
             sub = input_np - label_np
@@ -148,28 +158,30 @@ def test_NNMseLoss_none(self):
             paddle.enable_static()
             prog = fluid.Program()
             startup_prog = fluid.Program()
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
             with fluid.program_guard(prog, startup_prog):
-                input = fluid.layers.data(
-                    name='input', shape=dim, dtype='float32')
-                label = fluid.layers.data(
-                    name='label', shape=dim, dtype='float32')
+                input = fluid.layers.data(name='input',
+                                          shape=dim,
+                                          dtype='float32')
+                label = fluid.layers.data(name='label',
+                                          shape=dim,
+                                          dtype='float32')
                 mse_loss = paddle.nn.loss.MSELoss(reduction='none')
                 ret = mse_loss(input, label)
 
                 exe = fluid.Executor(place)
-                static_result = exe.run(
-                    prog,
-                    feed={"input": input_np,
-                          "label": label_np},
-                    fetch_list=[ret])
+                static_result = exe.run(prog,
+                                        feed={
+                                            "input": input_np,
+                                            "label": label_np
+                                        },
+                                        fetch_list=[ret])
 
             with fluid.dygraph.guard():
                 mse_loss = paddle.nn.loss.MSELoss(reduction='none')
-                dy_ret = mse_loss(
-                    fluid.dygraph.to_variable(input_np),
-                    fluid.dygraph.to_variable(label_np))
+                dy_ret = mse_loss(fluid.dygraph.to_variable(input_np),
+                                  fluid.dygraph.to_variable(label_np))
                 dy_result = dy_ret.numpy()
 
             sub = input_np - label_np
@@ -181,6 +193,7 @@ def test_NNMseLoss_none(self):
 
 
 class TestNNFunctionalMseLoss(unittest.TestCase):
+
     def test_NNFunctionalMseLoss_mean(self):
         for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]:
             input_np = np.random.uniform(0.1, 0.5, dim).astype("float32")
@@ -188,26 +201,30 @@ def test_NNFunctionalMseLoss_mean(self):
             paddle.enable_static()
             prog = paddle.static.Program()
             startup_prog = paddle.static.Program()
-            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
             with paddle.static.program_guard(prog, startup_prog):
-                input = paddle.fluid.data(
-                    name='input', shape=dim, dtype='float32')
-                target = paddle.fluid.data(
-                    name='target', shape=dim, dtype='float32')
+                input = paddle.fluid.data(name='input',
+                                          shape=dim,
+                                          dtype='float32')
+                target = paddle.fluid.data(name='target',
+                                           shape=dim,
+                                           dtype='float32')
                 mse_loss = paddle.nn.functional.mse_loss(input, target, 'mean')
 
             exe = paddle.static.Executor(place)
             exe.run(startup_prog)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "target": target_np},
-                fetch_list=[mse_loss])
+            static_result = exe.run(prog,
+                                    feed={
+                                        "input": input_np,
+                                        "target": target_np
+                                    },
+                                    fetch_list=[mse_loss])
 
             paddle.disable_static()
-            dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'mean')
+            dy_ret = paddle.nn.functional.mse_loss(paddle.to_tensor(input_np),
+                                                   paddle.to_tensor(target_np),
+                                                   'mean')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
@@ -224,26 +241,30 @@ def test_NNFunctionalMseLoss_sum(self):
             paddle.enable_static()
             prog = paddle.static.Program()
             startup_prog = paddle.static.Program()
-            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
             with paddle.static.program_guard(prog, startup_prog):
-                input = paddle.fluid.data(
-                    name='input', shape=dim, dtype='float32')
-                target = paddle.fluid.data(
-                    name='target', shape=dim, dtype='float32')
+                input = paddle.fluid.data(name='input',
+                                          shape=dim,
+                                          dtype='float32')
+                target = paddle.fluid.data(name='target',
+                                           shape=dim,
+                                           dtype='float32')
                 mse_loss = paddle.nn.functional.mse_loss(input, target, 'sum')
 
                 exe = paddle.static.Executor(place)
                 exe.run(startup_prog)
-                static_result = exe.run(
-                    prog,
-                    feed={"input": input_np,
-                          "target": target_np},
-                    fetch_list=[mse_loss])
+                static_result = exe.run(prog,
+                                        feed={
+                                            "input": input_np,
+                                            "target": target_np
+                                        },
+                                        fetch_list=[mse_loss])
 
             paddle.disable_static()
-            dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'sum')
+            dy_ret = paddle.nn.functional.mse_loss(paddle.to_tensor(input_np),
+                                                   paddle.to_tensor(target_np),
+                                                   'sum')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
@@ -260,26 +281,30 @@ def test_NNFunctionalMseLoss_none(self):
             paddle.enable_static()
             prog = paddle.static.Program()
             startup_prog = paddle.static.Program()
-            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
             with paddle.static.program_guard(prog, startup_prog):
-                input = paddle.fluid.data(
-                    name='input', shape=dim, dtype='float32')
-                target = paddle.fluid.data(
-                    name='target', shape=dim, dtype='float32')
+                input = paddle.fluid.data(name='input',
+                                          shape=dim,
+                                          dtype='float32')
+                target = paddle.fluid.data(name='target',
+                                           shape=dim,
+                                           dtype='float32')
                 mse_loss = paddle.nn.functional.mse_loss(input, target, 'none')
 
                 exe = paddle.static.Executor(place)
                 exe.run(startup_prog)
-                static_result = exe.run(
-                    prog,
-                    feed={"input": input_np,
-                          "target": target_np},
-                    fetch_list=[mse_loss])
+                static_result = exe.run(prog,
+                                        feed={
+                                            "input": input_np,
+                                            "target": target_np
+                                        },
+                                        fetch_list=[mse_loss])
 
             paddle.disable_static()
-            dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'none')
+            dy_ret = paddle.nn.functional.mse_loss(paddle.to_tensor(input_np),
+                                                   paddle.to_tensor(target_np),
+                                                   'none')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
diff --git a/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py b/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py
index c862c555c897a..cc4ed645c7d1d 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py
@@ -23,10 +23,12 @@
 import paddle.fluid.core as core
 import gradient_checker
 from decorator_helper import prog_scope
+
 paddle.enable_static()
 
 
 class TestMulGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         prog = fluid.Program()
@@ -45,6 +47,7 @@ def test_grad(self):
 
 
 class TestMulDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         # the shape of input variable should be clearly specified, not inlcude -1.
@@ -61,8 +64,11 @@ def func(self, place):
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, y_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -73,6 +79,7 @@ def test_grad(self):
 
 
 class TestMatmulDoubleGradCheck(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -87,17 +94,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = layers.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = layers.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = layers.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = layers.create_parameter(dtype=typename,
+                                    shape=self.x_shape,
+                                    name='x')
+        y = layers.create_parameter(dtype=typename,
+                                    shape=self.y_shape,
+                                    name='y')
+        out = layers.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
 
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -108,6 +123,7 @@ def test_grad(self):
 
 
 def TestMatmulDoubleGradCheckCase1(TestMatmulDoubleGradCheck):
+
     def init_test(self):
         self.x_shape = [2, 3]
         self.y_shape = [3, 2]
@@ -116,6 +132,7 @@ def init_test(self):
 
 
 def TestMatmulDoubleGradCheckCase2(TestMatmulDoubleGradCheck):
+
     def init_test(self):
         self.x_shape = [2, 4, 3]
         self.y_shape = [2, 4, 5]
@@ -124,6 +141,7 @@ def init_test(self):
 
 
 def TestMatmulDoubleGradCheckCase3(TestMatmulDoubleGradCheck):
+
     def init_test(self):
         self.x_shape = [2, 3, 4, 5]
         self.y_shape = [2, 3, 3, 5]
@@ -132,6 +150,7 @@ def init_test(self):
 
 
 def TestMatmulDoubleGradCheckCase4(TestMatmulDoubleGradCheck):
+
     def init_test(self):
         self.x_shape = [2, 3, 4]
         self.y_shape = [4, 3]
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 927383c1223d5..23904f9fa4f10 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -26,6 +27,7 @@
 
 
 class TestMulOp(OpTest):
+
     def setUp(self):
         self.op_type = "mul"
         self.dtype = np.float64
@@ -46,22 +48,27 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.5,
+                        no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.5,
+                        no_grad_set=set('Y'))
 
 
 class TestMulOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of mul_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            x2 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            x2 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.mul, x1, x2)
             # The input dtype of mul_op must be float32 or float64.
             x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
@@ -70,6 +77,7 @@ def test_errors(self):
 
 
 class TestMulOp2(OpTest):
+
     def setUp(self):
         self.op_type = "mul"
         self.dtype = np.float64
@@ -97,17 +105,22 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X'))
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.5,
+                        no_grad_set=set('X'))
 
     def test_check_grad_ignore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.5,
+                        no_grad_set=set('Y'))
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16MulOp1(TestMulOp):
+
     def init_dtype_type(self):
         self.dtype = np.float16
 
@@ -119,31 +132,31 @@ def test_check_output(self):
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.5)
+            self.check_grad_with_place(place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=0.5)
 
     def test_check_grad_ingore_x(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=0.5,
-                no_grad_set=set("X"))
+            self.check_grad_with_place(place, ['Y'],
+                                       'Out',
+                                       max_relative_error=0.5,
+                                       no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=0.5,
-                no_grad_set=set('Y'))
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.5,
+                                       no_grad_set=set('Y'))
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16MulOp2(TestMulOp2):
+
     def init_dtype_type(self):
         self.dtype = np.float16
 
@@ -155,26 +168,25 @@ def test_check_output(self):
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.9)
+            self.check_grad_with_place(place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=0.9)
 
     def test_check_grad_ingore_x(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=0.5,
-                no_grad_set=set("X"))
+            self.check_grad_with_place(place, ['Y'],
+                                       'Out',
+                                       max_relative_error=0.5,
+                                       no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=0.9,
-                no_grad_set=set('Y'))
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.9,
+                                       no_grad_set=set('Y'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py
index 11c0436317076..d4d9fcd8b6a69 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py
@@ -26,6 +26,7 @@
 #the unittest of multi_dot
 #compare the result of paddle multi_dot and numpy multi_dot
 class TestMultiDotOp(OpTest):
+
     def setUp(self):
         self.op_type = "multi_dot"
         self.python_api = paddle.linalg.multi_dot
@@ -51,6 +52,7 @@ def test_check_grad(self):
 
 #(A*B)*C
 class TestMultiDotOp3Mat(TestMultiDotOp):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((2, 10)).astype(self.dtype)
         self.B = np.random.random((10, 4)).astype(self.dtype)
@@ -66,6 +68,7 @@ def test_check_grad(self):
 
 #A*(B*C)
 class TestMultiDotOp3Mat2(TestMultiDotOp):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((3, 4)).astype(self.dtype)
         self.B = np.random.random((4, 8)).astype(self.dtype)
@@ -80,14 +83,15 @@ def test_check_grad(self):
 
 
 class TestMultiDotOp4Mat(TestMultiDotOp):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((8, 6)).astype(self.dtype)
         self.B = np.random.random((6, 3)).astype(self.dtype)
         self.C = np.random.random((3, 4)).astype(self.dtype)
         self.D = np.random.random((4, 5)).astype(self.dtype)
         self.inputs = {
-            'X':
-            [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)]
+            'X': [('x0', self.A), ('x1', self.B), ('x2', self.C),
+                  ('x3', self.D)]
         }
         self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])}
 
@@ -99,6 +103,7 @@ def test_check_grad(self):
 
 
 class TestMultiDotOpFirst1D(TestMultiDotOp):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((4)).astype(self.dtype)
         self.B = np.random.random((4, 3)).astype(self.dtype)
@@ -107,6 +112,7 @@ def get_inputs_and_outputs(self):
 
 
 class TestMultiDotOp3MatFirst1D(TestMultiDotOp3Mat):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((4)).astype(self.dtype)
         self.B = np.random.random((4, 3)).astype(self.dtype)
@@ -116,19 +122,21 @@ def get_inputs_and_outputs(self):
 
 
 class TestMultiDotOp4MatFirst1D(TestMultiDotOp4Mat):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((4)).astype(self.dtype)
         self.B = np.random.random((4, 3)).astype(self.dtype)
         self.C = np.random.random((3, 4)).astype(self.dtype)
         self.D = np.random.random((4, 5)).astype(self.dtype)
         self.inputs = {
-            'X':
-            [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)]
+            'X': [('x0', self.A), ('x1', self.B), ('x2', self.C),
+                  ('x3', self.D)]
         }
         self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])}
 
 
 class TestMultiDotOpLast1D(TestMultiDotOp):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((3, 6)).astype(self.dtype)
         self.B = np.random.random((6)).astype(self.dtype)
@@ -137,6 +145,7 @@ def get_inputs_and_outputs(self):
 
 
 class TestMultiDotOp3MatLast1D(TestMultiDotOp3Mat):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((2, 4)).astype(self.dtype)
         self.B = np.random.random((4, 3)).astype(self.dtype)
@@ -151,19 +160,21 @@ def test_check_grad(self):
 
 
 class TestMultiDotOp4MatLast1D(TestMultiDotOp4Mat):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((2, 3)).astype(self.dtype)
         self.B = np.random.random((3, 2)).astype(self.dtype)
         self.C = np.random.random((2, 3)).astype(self.dtype)
         self.D = np.random.random((3)).astype(self.dtype)
         self.inputs = {
-            'X':
-            [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)]
+            'X': [('x0', self.A), ('x1', self.B), ('x2', self.C),
+                  ('x3', self.D)]
         }
         self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])}
 
 
 class TestMultiDotOpFirstAndLast1D(TestMultiDotOp):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((4, )).astype(self.dtype)
         self.B = np.random.random((4)).astype(self.dtype)
@@ -172,6 +183,7 @@ def get_inputs_and_outputs(self):
 
 
 class TestMultiDotOp3MatFirstAndLast1D(TestMultiDotOp3Mat):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((6, )).astype(self.dtype)
         self.B = np.random.random((6, 4)).astype(self.dtype)
@@ -181,20 +193,22 @@ def get_inputs_and_outputs(self):
 
 
 class TestMultiDotOp4MatFirstAndLast1D(TestMultiDotOp4Mat):
+
     def get_inputs_and_outputs(self):
         self.A = np.random.random((3, )).astype(self.dtype)
         self.B = np.random.random((3, 4)).astype(self.dtype)
         self.C = np.random.random((4, 2)).astype(self.dtype)
         self.D = np.random.random((2)).astype(self.dtype)
         self.inputs = {
-            'X':
-            [('x0', self.A), ('x1', self.B), ('x2', self.C), ('x3', self.D)]
+            'X': [('x0', self.A), ('x1', self.B), ('x2', self.C),
+                  ('x3', self.D)]
         }
         self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])}
 
 
 #####python API test#######
 class TestMultiDotOpError(unittest.TestCase):
+
     def test_errors(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -204,8 +218,9 @@ def test_errors(self):
                               [input1, input1])
 
             # The inputs dtype of multi_dot must be float64, float64 or float16.
-            input2 = paddle.static.data(
-                name='input2', shape=[10, 10], dtype="int32")
+            input2 = paddle.static.data(name='input2',
+                                        shape=[10, 10],
+                                        dtype="int32")
             self.assertRaises(TypeError, paddle.linalg.multi_dot,
                               [input2, input2])
 
@@ -231,6 +246,7 @@ def test_errors(self):
 
 
 class APITestMultiDot(unittest.TestCase):
+
     def test_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
@@ -240,15 +256,15 @@ def test_out(self):
             exe = paddle.static.Executor(paddle.CPUPlace())
             data1 = np.random.rand(3, 2).astype("float64")
             data2 = np.random.rand(2, 3).astype("float64")
-            np_res = exe.run(feed={'x0': data1,
-                                   'x1': data2},
+            np_res = exe.run(feed={
+                'x0': data1,
+                'x1': data2
+            },
                              fetch_list=[result])
             expected_result = np.linalg.multi_dot([data1, data2])
 
         self.assertTrue(
-            np.allclose(
-                np_res, expected_result, atol=1e-5),
-            "two value is\
+            np.allclose(np_res, expected_result, atol=1e-5), "two value is\
             {}\n{}, check diff!".format(np_res, expected_result))
 
     def test_dygraph_without_out(self):
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 3158d78db63dc..67650158bef16 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -55,8 +55,8 @@ def iou(box_a, box_b, norm):
     xb = min(xmax_a, xmax_b)
     yb = min(ymax_a, ymax_b)
 
-    inter_area = max(xb - xa + (norm == False),
-                     0.0) * max(yb - ya + (norm == False), 0.0)
+    inter_area = max(xb - xa +
+                     (norm == False), 0.0) * max(yb - ya + (norm == False), 0.0)
 
     iou_ratio = inter_area / (area_a + area_b - inter_area)
 
@@ -147,8 +147,9 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
                 else:
                     score_index.append((scores[idx][c], c, idx))
 
-        sorted_score_index = sorted(
-            score_index, key=lambda tup: tup[0], reverse=True)
+        sorted_score_index = sorted(score_index,
+                                    key=lambda tup: tup[0],
+                                    reverse=True)
         sorted_score_index = sorted_score_index[:keep_top_k]
         selected_indices = {}
 
@@ -179,16 +180,15 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold,
         score = scores[head:head + box_lod[0][n]]
         offset = head
         head = head + box_lod[0][n]
-        nmsed_outs, nmsed_num = multiclass_nms(
-            box,
-            score,
-            background,
-            score_threshold,
-            nms_threshold,
-            nms_top_k,
-            keep_top_k,
-            normalized,
-            shared=False)
+        nmsed_outs, nmsed_num = multiclass_nms(box,
+                                               score,
+                                               background,
+                                               score_threshold,
+                                               nms_threshold,
+                                               nms_top_k,
+                                               keep_top_k,
+                                               normalized,
+                                               shared=False)
         lod.append(nmsed_num)
 
         if nmsed_num == 0:
@@ -201,8 +201,9 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold,
                     c, score[idx][c], xmin, ymin, xmax, ymax,
                     offset * num_class + idx * num_class + c
                 ])
-        sorted_det_out = sorted(
-            tmp_det_out, key=lambda tup: tup[0], reverse=False)
+        sorted_det_out = sorted(tmp_det_out,
+                                key=lambda tup: tup[0],
+                                reverse=False)
         det_outs.extend(sorted_det_out)
 
     return det_outs, lod
@@ -222,16 +223,15 @@ def batched_multiclass_nms(boxes,
     index_outs = []
     lod = []
     for n in range(batch_size):
-        nmsed_outs, nmsed_num = multiclass_nms(
-            boxes[n],
-            scores[n],
-            background,
-            score_threshold,
-            nms_threshold,
-            nms_top_k,
-            keep_top_k,
-            normalized,
-            shared=True)
+        nmsed_outs, nmsed_num = multiclass_nms(boxes[n],
+                                               scores[n],
+                                               background,
+                                               score_threshold,
+                                               nms_threshold,
+                                               nms_top_k,
+                                               keep_top_k,
+                                               normalized,
+                                               shared=True)
         lod.append(nmsed_num)
 
         if nmsed_num == 0:
@@ -244,13 +244,15 @@ def batched_multiclass_nms(boxes,
                     c, scores[n][c][idx], xmin, ymin, xmax, ymax,
                     idx + n * num_boxes
                 ])
-        sorted_det_out = sorted(
-            tmp_det_out, key=lambda tup: tup[0], reverse=False)
+        sorted_det_out = sorted(tmp_det_out,
+                                key=lambda tup: tup[0],
+                                reverse=False)
         det_outs.extend(sorted_det_out)
     return det_outs, lod
 
 
 class TestMulticlassNMSOp(OpTest):
+
     def set_argument(self):
         self.score_threshold = 0.01
 
@@ -302,6 +304,7 @@ def test_check_output(self):
 
 
 class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
+
     def set_argument(self):
         # Here set 2.0 to test the case there is no outputs.
         # In practical use, 0.0 < score_threshold < 1.0
@@ -309,6 +312,7 @@ def set_argument(self):
 
 
 class TestMulticlassNMSLoDInput(OpTest):
+
     def set_argument(self):
         self.score_threshold = 0.01
 
@@ -335,9 +339,10 @@ def setUp(self):
         boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
         boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
 
-        det_outs, lod = lod_multiclass_nms(
-            boxes, scores, background, score_threshold, nms_threshold,
-            nms_top_k, keep_top_k, box_lod, normalized)
+        det_outs, lod = lod_multiclass_nms(boxes, scores, background,
+                                           score_threshold, nms_threshold,
+                                           nms_top_k, keep_top_k, box_lod,
+                                           normalized)
         det_outs = np.array(det_outs).astype('float32')
         nmsed_outs = det_outs[:, :-1].astype('float32') if len(
             det_outs) else det_outs
@@ -362,6 +367,7 @@ def test_check_output(self):
 
 
 class TestMulticlassNMSNoBox(TestMulticlassNMSLoDInput):
+
     def setUp(self):
         self.set_argument()
         M = 1200
@@ -385,9 +391,10 @@ def setUp(self):
         boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
         boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
 
-        det_outs, lod = lod_multiclass_nms(
-            boxes, scores, background, score_threshold, nms_threshold,
-            nms_top_k, keep_top_k, box_lod, normalized)
+        det_outs, lod = lod_multiclass_nms(boxes, scores, background,
+                                           score_threshold, nms_threshold,
+                                           nms_top_k, keep_top_k, box_lod,
+                                           normalized)
         det_outs = np.array(det_outs).astype('float32')
         nmsed_outs = det_outs[:, :-1].astype('float32') if len(
             det_outs) else det_outs
@@ -409,6 +416,7 @@ def setUp(self):
 
 
 class TestIOU(unittest.TestCase):
+
     def test_iou(self):
         box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
         box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
@@ -419,6 +427,7 @@ def test_iou(self):
 
 
 class TestMulticlassNMS2Op(TestMulticlassNMSOp):
+
     def setUp(self):
         self.set_argument()
         N = 7
@@ -448,8 +457,8 @@ def setUp(self):
 
         nmsed_outs = det_outs[:, :-1].astype('float32') if len(
             det_outs) else det_outs
-        index_outs = det_outs[:, -1:].astype('int') if len(
-            det_outs) else det_outs
+        index_outs = det_outs[:,
+                              -1:].astype('int') if len(det_outs) else det_outs
         self.op_type = 'multiclass_nms2'
         self.inputs = {'BBoxes': boxes, 'Scores': scores}
         self.outputs = {
@@ -471,6 +480,7 @@ def test_check_output(self):
 
 
 class TestMulticlassNMS2OpNoOutput(TestMulticlassNMS2Op):
+
     def set_argument(self):
         # Here set 2.0 to test the case there is no outputs.
         # In practical use, 0.0 < score_threshold < 1.0
@@ -478,6 +488,7 @@ def set_argument(self):
 
 
 class TestMulticlassNMS2LoDInput(TestMulticlassNMSLoDInput):
+
     def setUp(self):
         self.set_argument()
         M = 1200
@@ -501,15 +512,16 @@ def setUp(self):
         boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
         boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
 
-        det_outs, lod = lod_multiclass_nms(
-            boxes, scores, background, score_threshold, nms_threshold,
-            nms_top_k, keep_top_k, box_lod, normalized)
+        det_outs, lod = lod_multiclass_nms(boxes, scores, background,
+                                           score_threshold, nms_threshold,
+                                           nms_top_k, keep_top_k, box_lod,
+                                           normalized)
 
         det_outs = np.array(det_outs)
         nmsed_outs = det_outs[:, :-1].astype('float32') if len(
             det_outs) else det_outs
-        index_outs = det_outs[:, -1:].astype('int') if len(
-            det_outs) else det_outs
+        index_outs = det_outs[:,
+                              -1:].astype('int') if len(det_outs) else det_outs
         self.op_type = 'multiclass_nms2'
         self.inputs = {
             'BBoxes': (boxes, box_lod),
@@ -534,6 +546,7 @@ def test_check_output(self):
 
 
 class TestMulticlassNMS2LoDNoOutput(TestMulticlassNMS2LoDInput):
+
     def set_argument(self):
         # Here set 2.0 to test the case there is no outputs.
         # In practical use, 0.0 < score_threshold < 1.0
@@ -541,6 +554,7 @@ def set_argument(self):
 
 
 class TestMulticlassNMSError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             M = 1200
@@ -554,10 +568,12 @@ def test_errors(self):
             scores = np.reshape(scores, (N, M, C))
             scores_np = np.transpose(scores, (0, 2, 1))
 
-            boxes_data = fluid.data(
-                name='bboxes', shape=[M, C, BOX_SIZE], dtype='float32')
-            scores_data = fluid.data(
-                name='scores', shape=[N, C, M], dtype='float32')
+            boxes_data = fluid.data(name='bboxes',
+                                    shape=[M, C, BOX_SIZE],
+                                    dtype='float32')
+            scores_data = fluid.data(name='scores',
+                                     shape=[N, C, M],
+                                     dtype='float32')
 
             def test_bboxes_Variable():
                 # the bboxes type must be Variable
@@ -572,6 +588,7 @@ def test_scores_Variable():
 
 
 class TestMulticlassNMS3Op(TestMulticlassNMS2Op):
+
     def setUp(self):
         self.set_argument()
         N = 7
@@ -601,8 +618,8 @@ def setUp(self):
 
         nmsed_outs = det_outs[:, :-1].astype('float32') if len(
             det_outs) else det_outs
-        index_outs = det_outs[:, -1:].astype('int') if len(
-            det_outs) else det_outs
+        index_outs = det_outs[:,
+                              -1:].astype('int') if len(det_outs) else det_outs
         self.op_type = 'multiclass_nms3'
         self.inputs = {'BBoxes': boxes, 'Scores': scores}
         self.outputs = {
@@ -625,6 +642,7 @@ def test_check_output(self):
 
 
 class TestMulticlassNMS3OpNoOutput(TestMulticlassNMS3Op):
+
     def set_argument(self):
         # Here set 2.0 to test the case there is no outputs.
         # In practical use, 0.0 < score_threshold < 1.0
@@ -632,6 +650,7 @@ def set_argument(self):
 
 
 class TestMulticlassNMS3LoDInput(TestMulticlassNMS2LoDInput):
+
     def setUp(self):
         self.set_argument()
         M = 1200
@@ -655,9 +674,10 @@ def setUp(self):
         boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
         boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
 
-        det_outs, lod = lod_multiclass_nms(
-            boxes, scores, background, score_threshold, nms_threshold,
-            nms_top_k, keep_top_k, box_lod, normalized)
+        det_outs, lod = lod_multiclass_nms(boxes, scores, background,
+                                           score_threshold, nms_threshold,
+                                           nms_top_k, keep_top_k, box_lod,
+                                           normalized)
 
         det_outs = np.array(det_outs)
         nmsed_outs = det_outs[:, :-1].astype('float32') if len(
@@ -687,6 +707,7 @@ def test_check_output(self):
 
 
 class TestMulticlassNMS3LoDNoOutput(TestMulticlassNMS3LoDInput):
+
     def set_argument(self):
         # Here set 2.0 to test the case there is no outputs.
         # In practical use, 0.0 < score_threshold < 1.0
diff --git a/python/paddle/fluid/tests/unittests/test_multihead_attention.py b/python/paddle/fluid/tests/unittests/test_multihead_attention.py
index f60da862ac091..9a0e3f1b2a9cd 100644
--- a/python/paddle/fluid/tests/unittests/test_multihead_attention.py
+++ b/python/paddle/fluid/tests/unittests/test_multihead_attention.py
@@ -21,6 +21,7 @@
 
 
 class TestMultiheadAttention(unittest.TestCase):
+
     def gen_random_input(self):
         """Generate random input data.
         """
@@ -32,25 +33,22 @@ def gen_random_input(self):
     def set_program(self):
         """Build the test program.
         """
-        queries = fluid.layers.data(
-            name="queries",
-            shape=self.input_shape,
-            dtype="float32",
-            append_batch_size=False)
+        queries = fluid.layers.data(name="queries",
+                                    shape=self.input_shape,
+                                    dtype="float32",
+                                    append_batch_size=False)
         queries.stop_gradient = False
-        keys = fluid.layers.data(
-            name="keys",
-            shape=self.input_shape,
-            dtype="float32",
-            append_batch_size=False)
+        keys = fluid.layers.data(name="keys",
+                                 shape=self.input_shape,
+                                 dtype="float32",
+                                 append_batch_size=False)
         keys.stop_gradient = False
 
-        contexts = fluid.nets.scaled_dot_product_attention(
-            queries=queries,
-            keys=keys,
-            values=keys,
-            num_heads=8,
-            dropout_rate=0.)
+        contexts = fluid.nets.scaled_dot_product_attention(queries=queries,
+                                                           keys=keys,
+                                                           values=keys,
+                                                           num_heads=8,
+                                                           dropout_rate=0.)
         out = fluid.layers.reduce_sum(contexts, dim=None)
         fluid.backward.append_backward(loss=out)
 
diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
index 4dfc881d7723f..b60a46d66adb9 100644
--- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
@@ -44,6 +44,7 @@ def sample_output_two_dimension(out, shape):
 
 
 class TestMultinomialOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.op_type = "multinomial"
@@ -67,12 +68,12 @@ def verify_output(self, outs):
         prob = self.input_np / self.input_np.sum(axis=-1, keepdims=True)
         sample_prob = self.sample_output(np.array(outs[0]))
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
 
 
 class TestMultinomialOp2(TestMultinomialOp):
+
     def init_data(self):
         # input probability is a matrix
         self.input_np = np.random.rand(3, 4)
@@ -84,6 +85,7 @@ def sample_output(self, out):
 
 
 class TestMultinomialOp3(TestMultinomialOp):
+
     def init_data(self):
         # replacement is False. number of samples must be less than number of categories.
         self.input_np = np.random.rand(1000)
@@ -99,6 +101,7 @@ def verify_output(self, outs):
 
 
 class TestMultinomialApi(unittest.TestCase):
+
     def test_dygraph(self):
         # input probability is a vector, and replacement is True
         paddle.disable_static()
@@ -110,8 +113,7 @@ def test_dygraph(self):
         sample_prob = sample_output_one_dimension(out.numpy(), 4)
         prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True)
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
 
     def test_dygraph2(self):
@@ -124,8 +126,7 @@ def test_dygraph2(self):
         sample_prob = sample_output_two_dimension(out.numpy(), [3, 4])
         prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True)
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
         paddle.enable_static()
 
@@ -170,12 +171,12 @@ def test_static(self):
         sample_prob = sample_output_one_dimension(out, 4)
         prob = x_np / x_np.sum(axis=-1, keepdims=True)
         self.assertTrue(
-            np.allclose(
-                sample_prob, prob, rtol=0, atol=0.01),
+            np.allclose(sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
 
 
 class TestMultinomialAlias(unittest.TestCase):
+
     def test_alias(self):
         paddle.disable_static()
         x = paddle.rand([4])
@@ -185,10 +186,12 @@ def test_alias(self):
 
 
 class TestMultinomialError(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
     def test_num_sample(self):
+
         def test_num_sample_less_than_0():
             x = paddle.rand([4])
             paddle.multinomial(x, num_samples=-2)
@@ -196,6 +199,7 @@ def test_num_sample_less_than_0():
         self.assertRaises(ValueError, test_num_sample_less_than_0)
 
     def test_replacement_False(self):
+
         def test_samples_larger_than_categories():
             x = paddle.rand([4])
             paddle.multinomial(x, num_samples=5, replacement=False)
@@ -203,6 +207,7 @@ def test_samples_larger_than_categories():
         self.assertRaises(ValueError, test_samples_larger_than_categories)
 
     def test_input_probs_dim(self):
+
         def test_dim_larger_than_2():
             x = paddle.rand([2, 3, 3])
             paddle.multinomial(x)
@@ -226,6 +231,7 @@ def test_dim_less_than_1():
 
 
 class TestRandomValue(unittest.TestCase):
+
     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py
index 093ee86aeea6e..29a11ab68d0d0 100644
--- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py
@@ -23,6 +23,7 @@
 
 
 class TestMultiplexOp(OpTest):
+
     def setUp(self):
         self.op_type = "multiplex"
         rows = 4
@@ -61,6 +62,7 @@ def test_check_grad_ignore_x3(self):
 
 
 class TestMultiplexOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             x1 = fluid.data(name='x1', shape=[None, 2], dtype='int64')
@@ -86,14 +88,16 @@ def test_type():
             self.assertRaises(TypeError, test_type)
 
             def test_type2():
-                index2 = fluid.data(
-                    name='index2', shape=[None, 1], dtype='int16')
+                index2 = fluid.data(name='index2',
+                                    shape=[None, 1],
+                                    dtype='int16')
                 paddle.multiplex(inputs=[x1, x2], index=index2)
 
             self.assertRaises(TypeError, test_type2)
 
 
 class TestMultiplexODygrap(unittest.TestCase):
+
     def test_multiplex_dygraph(self):
         paddle.disable_static()
         img1 = np.array([[1, 2], [3, 4]]).astype(np.float32)
@@ -122,10 +126,12 @@ def test_dygraph_final_state_api(self):
                 res_eager = paddle.multiplex(inputs_eager, index_eager)
                 res_eager.backward()
                 self.assertEqual((res.numpy() == res_eager.numpy()).all(), True)
-                self.assertEqual((inputs[0].grad.numpy() ==
-                                  inputs_eager[0].grad.numpy()).all(), True)
-                self.assertEqual((inputs[1].grad.numpy() ==
-                                  inputs_eager[1].grad.numpy()).all(), True)
+                self.assertEqual(
+                    (inputs[0].grad.numpy() == inputs_eager[0].grad.numpy()
+                     ).all(), True)
+                self.assertEqual(
+                    (inputs[1].grad.numpy() == inputs_eager[1].grad.numpy()
+                     ).all(), True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py
index e8463ed8ad235..cfc56d5a9590c 100755
--- a/python/paddle/fluid/tests/unittests/test_multiply.py
+++ b/python/paddle/fluid/tests/unittests/test_multiply.py
@@ -24,21 +24,26 @@
 
 
 class TestMultiplyApi(unittest.TestCase):
+
     def _run_static_graph_case(self, x_data, y_data):
         with program_guard(Program(), Program()):
             paddle.enable_static()
-            x = paddle.static.data(
-                name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = paddle.static.data(
-                name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = paddle.static.data(name='x',
+                                   shape=x_data.shape,
+                                   dtype=x_data.dtype)
+            y = paddle.static.data(name='y',
+                                   shape=y_data.shape,
+                                   dtype=y_data.dtype)
             res = tensor.multiply(x, y)
 
-            place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             outs = exe.run(paddle.static.default_main_program(),
-                           feed={'x': x_data,
-                                 'y': y_data},
+                           feed={
+                               'x': x_data,
+                               'y': y_data
+                           },
                            fetch_list=[res])
             res = outs[0]
             return res
@@ -108,6 +113,7 @@ def test_multiply(self):
 
 
 class TestMultiplyError(unittest.TestCase):
+
     def func_test_errors(self):
         # test static computation graph: dtype can not be int8
         paddle.enable_static()
@@ -116,7 +122,7 @@ def func_test_errors(self):
             y = paddle.static.data(name='y', shape=[100], dtype=np.int8)
             self.assertRaises(TypeError, tensor.multiply, x, y)
 
-        # test static computation graph: inputs must be broadcastable 
+        # test static computation graph: inputs must be broadcastable
         with program_guard(Program(), Program()):
             x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64)
             y = paddle.static.data(name='y', shape=[20], dtype=np.float64)
@@ -145,7 +151,7 @@ def func_test_errors(self):
         y = paddle.to_tensor(y_data)
         self.assertRaises(ValueError, paddle.multiply, x, y)
 
-        # test dynamic computation graph: dtype must be same	
+        # test dynamic computation graph: dtype must be same
         x_data = np.random.randn(200).astype(np.int64)
         y_data = np.random.randn(200).astype(np.float64)
         x = paddle.to_tensor(x_data)
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
index e23905005df56..d409648c71603 100755
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
@@ -27,6 +27,7 @@
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -41,6 +42,7 @@ def __getitem__(self, idx):
 
 
 class RandomIterableDataset(IterableDataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -53,6 +55,7 @@ def __iter__(self):
 
 
 class TestTensorDataset(unittest.TestCase):
+
     def run_main(self, num_workers, places):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -65,12 +68,11 @@ def run_main(self, num_workers, places):
 
             dataset = TensorDataset([input, label])
             assert len(dataset) == 16
-            dataloader = DataLoader(
-                dataset,
-                places=place,
-                num_workers=num_workers,
-                batch_size=1,
-                drop_last=True)
+            dataloader = DataLoader(dataset,
+                                    places=place,
+                                    num_workers=num_workers,
+                                    batch_size=1,
+                                    drop_last=True)
 
             for i, (input, label) in enumerate(dataloader()):
                 assert len(input) == 1
@@ -98,6 +100,7 @@ def test_main(self):
 
 
 class TestComposeDataset(unittest.TestCase):
+
     def func_test_main(self):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -123,6 +126,7 @@ def test_main(self):
 
 
 class TestRandomSplitApi(unittest.TestCase):
+
     def func_test_main(self):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -149,6 +153,7 @@ def test_main(self):
 
 
 class TestRandomSplitError(unittest.TestCase):
+
     def func_test_errors(self):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -164,6 +169,7 @@ def test_errors(self):
 
 
 class TestSubsetDataset(unittest.TestCase):
+
     def run_main(self, num_workers, places):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -180,12 +186,11 @@ def run_main(self, num_workers, places):
         assert len(dataset) == 5
 
         def prepare_dataloader(dataset):
-            return DataLoader(
-                dataset,
-                places=places,
-                num_workers=num_workers,
-                batch_size=1,
-                drop_last=True)
+            return DataLoader(dataset,
+                              places=places,
+                              num_workers=num_workers,
+                              batch_size=1,
+                              drop_last=True)
 
         dataloader = prepare_dataloader(dataset)
         dataloader_even = prepare_dataloader(even_subset)
@@ -234,6 +239,7 @@ def test_main(self):
 
 
 class TestChainDataset(unittest.TestCase):
+
     def run_main(self, num_workers, places):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -271,6 +277,7 @@ def test_main(self):
 
 
 class NumpyMixTensorDataset(Dataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -285,6 +292,7 @@ def __getitem__(self, idx):
 
 
 class TestNumpyMixTensorDataset(TestTensorDataset):
+
     def run_main(self, num_workers, places):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -292,12 +300,11 @@ def run_main(self, num_workers, places):
         with fluid.dygraph.guard(place):
             dataset = NumpyMixTensorDataset(16)
             assert len(dataset) == 16
-            dataloader = DataLoader(
-                dataset,
-                places=place,
-                num_workers=num_workers,
-                batch_size=1,
-                drop_last=True)
+            dataloader = DataLoader(dataset,
+                                    places=place,
+                                    num_workers=num_workers,
+                                    batch_size=1,
+                                    drop_last=True)
 
             for i, (input, label) in enumerate(dataloader()):
                 assert len(input) == 1
@@ -311,6 +318,7 @@ def run_main(self, num_workers, places):
 
 
 class ComplextDataset(Dataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -318,9 +326,10 @@ def __len__(self):
         return self.sample_num
 
     def __getitem__(self, idx):
-        return (3.1, 'abc', paddle.to_tensor(
-            np.random.random([IMAGE_SIZE]).astype('float32'),
-            place=paddle.CPUPlace()),
+        return (3.1, 'abc',
+                paddle.to_tensor(np.random.random([IMAGE_SIZE
+                                                   ]).astype('float32'),
+                                 place=paddle.CPUPlace()),
                 [1, np.random.random([2]).astype('float32')], {
                     'a': 2.0,
                     'b': np.random.random([2]).astype('float32')
@@ -328,6 +337,7 @@ def __getitem__(self, idx):
 
 
 class TestComplextDataset(unittest.TestCase):
+
     def run_main(self, num_workers):
         paddle.static.default_startup_program().random_seed = 1
         paddle.static.default_main_program().random_seed = 1
@@ -335,12 +345,11 @@ def run_main(self, num_workers):
         with fluid.dygraph.guard(place):
             dataset = ComplextDataset(16)
             assert len(dataset) == 16
-            dataloader = DataLoader(
-                dataset,
-                places=place,
-                num_workers=num_workers,
-                batch_size=2,
-                drop_last=True)
+            dataloader = DataLoader(dataset,
+                                    places=place,
+                                    num_workers=num_workers,
+                                    batch_size=2,
+                                    drop_last=True)
 
             for i, data in enumerate(dataloader()):
                 assert len(data) == 5
@@ -373,6 +382,7 @@ def test_main(self):
 
 
 class SingleFieldDataset(Dataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -384,6 +394,7 @@ def __getitem__(self, idx):
 
 
 class TestSingleFieldDataset(unittest.TestCase):
+
     def init_dataset(self):
         self.sample_num = 16
         self.dataset = SingleFieldDataset(self.sample_num)
@@ -394,12 +405,11 @@ def run_main(self, num_workers):
         place = paddle.CPUPlace()
         with fluid.dygraph.guard(place):
             self.init_dataset()
-            dataloader = DataLoader(
-                self.dataset,
-                places=place,
-                num_workers=num_workers,
-                batch_size=2,
-                drop_last=True)
+            dataloader = DataLoader(self.dataset,
+                                    places=place,
+                                    num_workers=num_workers,
+                                    batch_size=2,
+                                    drop_last=True)
 
             for i, data in enumerate(dataloader()):
                 assert isinstance(data,
@@ -417,6 +427,7 @@ def test_main(self):
 
 
 class SingleFieldIterableDataset(IterableDataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -426,12 +437,14 @@ def __iter__(self):
 
 
 class TestSingleFieldIterableDataset(TestSingleFieldDataset):
+
     def init_dataset(self):
         self.sample_num = 16
         self.dataset = SingleFieldIterableDataset(self.sample_num)
 
 
 class TestDataLoaderGenerateStates(unittest.TestCase):
+
     def setUp(self):
         self.inputs = [(0, 1), (0, 2), (1, 3)]
         self.outputs = [[1835504127, 1731038949, 1320224556, 2330041505],
@@ -451,16 +464,16 @@ def test_main(self):
 
 
 class TestDatasetWithDropLast(unittest.TestCase):
+
     def run_main(self, dataset, num_samples, batch_size):
         for num_workers in [0, 1]:
             for drop_last in [True, False]:
                 steps = (num_samples + (1 - int(drop_last)) * \
                         (batch_size - 1)) // batch_size
-                dataloader = DataLoader(
-                    dataset,
-                    batch_size=batch_size,
-                    drop_last=drop_last,
-                    num_workers=num_workers)
+                dataloader = DataLoader(dataset,
+                                        batch_size=batch_size,
+                                        drop_last=drop_last,
+                                        num_workers=num_workers)
                 datas = []
                 for data in dataloader:
                     datas.append(data)
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
index fcc7c17ce06a7..c3eda1b3fdf0d 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
@@ -32,6 +32,7 @@
 
 
 class SimpleFCNet(fluid.dygraph.Layer):
+
     def __init__(self):
         super(SimpleFCNet, self).__init__()
 
@@ -43,20 +44,18 @@ def __init__(self):
         in_channel = IMAGE_SIZE
         for hidden_size in [10, 20, 30]:
             self._fcs.append(
-                Linear(
-                    in_channel,
-                    hidden_size,
-                    act='tanh',
-                    param_attr=param_attr,
-                    bias_attr=bias_attr))
+                Linear(in_channel,
+                       hidden_size,
+                       act='tanh',
+                       param_attr=param_attr,
+                       bias_attr=bias_attr))
             in_channel = hidden_size
         self._fcs.append(
-            Linear(
-                in_channel,
-                CLASS_NUM,
-                act='softmax',
-                param_attr=param_attr,
-                bias_attr=bias_attr))
+            Linear(in_channel,
+                   CLASS_NUM,
+                   act='softmax',
+                   param_attr=param_attr,
+                   bias_attr=bias_attr))
 
     def forward(self, image):
         out = image
@@ -66,6 +65,7 @@ def forward(self, image):
 
 
 class TestDygraphDataLoader(unittest.TestCase):
+
     def run_main(self, num_workers, places, persistent_workers):
         fluid.default_startup_program().random_seed = 1
         fluid.default_main_program().random_seed = 1
@@ -74,12 +74,11 @@ def run_main(self, num_workers, places, persistent_workers):
             optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters())
 
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                num_workers=num_workers,
-                batch_size=BATCH_SIZE,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    num_workers=num_workers,
+                                    batch_size=BATCH_SIZE,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
             assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
 
             step_list = []
@@ -117,10 +116,9 @@ def test_main(self):
                     print(self.__class__.__name__, p, num_workers,
                           persistent_workers)
                     sys.stdout.flush()
-                    ret = self.run_main(
-                        num_workers=num_workers,
-                        places=p,
-                        persistent_workers=persistent_workers)
+                    ret = self.run_main(num_workers=num_workers,
+                                        places=p,
+                                        persistent_workers=persistent_workers)
                     results.append(ret)
                 diff = np.max(
                     np.abs(results[0]['loss'] - results[1]['loss']) /
@@ -129,6 +127,7 @@ def test_main(self):
 
 
 class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader):
+
     def run_main(self, num_workers, places, persistent_workers):
         fluid.default_startup_program().random_seed = 1
         fluid.default_main_program().random_seed = 1
@@ -137,12 +136,11 @@ def run_main(self, num_workers, places, persistent_workers):
             optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters())
 
             dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                num_workers=num_workers,
-                batch_size=None,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    num_workers=num_workers,
+                                    batch_size=None,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
             assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
 
             step_list = []
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
index 52f4c2567730f..2d6cdac4854f7 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
@@ -31,6 +31,7 @@
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, sample_num):
         self.sample_num = sample_num
 
@@ -45,6 +46,7 @@ def __len__(self):
 
 
 class TestDataLoaderAssert(unittest.TestCase):
+
     def test_main(self):
         place = fluid.cpu_places()[0]
         with fluid.dygraph.guard(place):
@@ -67,8 +69,9 @@ def test_main(self):
 
             # num_workers < 0
             try:
-                loader = DataLoader(
-                    dataset=dataset, places=place, num_workers=-1)
+                loader = DataLoader(dataset=dataset,
+                                    places=place,
+                                    num_workers=-1)
                 self.assertTrue(False)
             except AssertionError:
                 pass
@@ -82,26 +85,27 @@ def test_main(self):
 
             # set batch_sampler and shuffle/batch_size/drop_last
             try:
-                loader = DataLoader(
-                    dataset=dataset,
-                    places=place,
-                    batch_sampler=batch_sampler,
-                    shuffle=True,
-                    drop_last=True)
+                loader = DataLoader(dataset=dataset,
+                                    places=place,
+                                    batch_sampler=batch_sampler,
+                                    shuffle=True,
+                                    drop_last=True)
                 self.assertTrue(False)
             except AssertionError:
                 pass
 
             # set batch_sampler correctly
             try:
-                loader = DataLoader(
-                    dataset=dataset, places=place, batch_sampler=batch_sampler)
+                loader = DataLoader(dataset=dataset,
+                                    places=place,
+                                    batch_sampler=batch_sampler)
                 self.assertTrue(True)
             except AssertionError:
                 self.assertTrue(False)
 
 
 class TestDatasetRuntimeError(unittest.TestCase):
+
     def test_main(self):
         dataset = Dataset()
 
@@ -148,6 +152,7 @@ def test_main(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestDataLoaderWorkerLoop(unittest.TestCase):
+
     def run_without_worker_done(self, use_shared_memory=True):
         try:
             place = fluid.cpu_places()[0]
@@ -161,15 +166,13 @@ def _init_fn(worker_id):
                 # test collate_fn
                 def _collate_fn(sample_list):
                     return [
-                        np.stack(
-                            s, axis=0) for s in list(zip(*sample_list))
+                        np.stack(s, axis=0) for s in list(zip(*sample_list))
                     ]
 
-                loader = DataLoader(
-                    dataset,
-                    num_workers=1,
-                    places=place,
-                    use_shared_memory=use_shared_memory)
+                loader = DataLoader(dataset,
+                                    num_workers=1,
+                                    places=place,
+                                    use_shared_memory=use_shared_memory)
                 assert loader.num_workers > 0, \
                     "go to AssertionError and pass in Mac and Windows"
                 loader = iter(loader)
@@ -204,15 +207,13 @@ def _init_fn(worker_id):
                 # test collate_fn
                 def _collate_fn(sample_list):
                     return [
-                        np.stack(
-                            s, axis=0) for s in list(zip(*sample_list))
+                        np.stack(s, axis=0) for s in list(zip(*sample_list))
                     ]
 
-                loader = DataLoader(
-                    dataset,
-                    num_workers=1,
-                    places=place,
-                    use_shared_memory=use_shared_memory)
+                loader = DataLoader(dataset,
+                                    num_workers=1,
+                                    places=place,
+                                    use_shared_memory=use_shared_memory)
                 assert loader.num_workers > 0, \
                     "go to AssertionError and pass in Mac and Windows"
                 loader = iter(loader)
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
index 490e95a0f0be2..7ebcf4b8efa34 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
@@ -32,6 +32,7 @@
 
 
 class SimpleFCNet(fluid.dygraph.Layer):
+
     def __init__(self):
         super(SimpleFCNet, self).__init__()
 
@@ -43,20 +44,18 @@ def __init__(self):
         in_channel = IMAGE_SIZE
         for hidden_size in [10, 20, 30]:
             self._fcs.append(
-                Linear(
-                    in_channel,
-                    hidden_size,
-                    act='tanh',
-                    param_attr=param_attr,
-                    bias_attr=bias_attr))
+                Linear(in_channel,
+                       hidden_size,
+                       act='tanh',
+                       param_attr=param_attr,
+                       bias_attr=bias_attr))
             in_channel = hidden_size
         self._fcs.append(
-            Linear(
-                in_channel,
-                CLASS_NUM,
-                act='softmax',
-                param_attr=param_attr,
-                bias_attr=bias_attr))
+            Linear(in_channel,
+                   CLASS_NUM,
+                   act='softmax',
+                   param_attr=param_attr,
+                   bias_attr=bias_attr))
 
     def forward(self, image):
         out = image
@@ -66,6 +65,7 @@ def forward(self, image):
 
 
 class TestDygraphDataLoader(unittest.TestCase):
+
     def run_main(self, num_workers, places, persistent_workers):
         fluid.default_startup_program().random_seed = 1
         fluid.default_main_program().random_seed = 1
@@ -74,12 +74,11 @@ def run_main(self, num_workers, places, persistent_workers):
             optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters())
 
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                num_workers=num_workers,
-                batch_size=BATCH_SIZE,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    num_workers=num_workers,
+                                    batch_size=BATCH_SIZE,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
 
             step_list = []
             loss_list = []
@@ -116,16 +115,16 @@ def test_main(self):
                     print(self.__class__.__name__, p, num_workers,
                           persistent_workers)
                     sys.stdout.flush()
-                    ret = self.run_main(
-                        num_workers=num_workers,
-                        places=p,
-                        persistent_workers=persistent_workers)
+                    ret = self.run_main(num_workers=num_workers,
+                                        places=p,
+                                        persistent_workers=persistent_workers)
                     results.append(ret)
                 assert results[0]['loss'].shape[0] * 2 == results[1][
                     'loss'].shape[0]
 
 
 class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader):
+
     def run_main(self, num_workers, places, persistent_workers):
         fluid.default_startup_program().random_seed = 1
         fluid.default_main_program().random_seed = 1
@@ -134,12 +133,11 @@ def run_main(self, num_workers, places, persistent_workers):
             optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters())
 
             dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                num_workers=num_workers,
-                batch_size=None,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    num_workers=num_workers,
+                                    batch_size=None,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
 
             step_list = []
             loss_list = []
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py
index d2b7971a85dd0..066585edff25d 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py
@@ -23,6 +23,7 @@
 
 
 class RangeIterableDatasetSplit(IterableDataset):
+
     def __init__(self, start, end):
         self.start = start
         self.end = end
@@ -34,8 +35,8 @@ def __iter__(self):
             iter_end = self.end
         else:
             per_worker = int(
-                math.ceil((self.end - self.start) / float(
-                    worker_info.num_workers)))
+                math.ceil(
+                    (self.end - self.start) / float(worker_info.num_workers)))
             worker_id = worker_info.id
             iter_start = self.start + worker_id * per_worker
             iter_end = min(iter_start + per_worker, self.end)
@@ -45,16 +46,16 @@ def __iter__(self):
 
 
 class TestDynamicDataLoaderIterSplit(unittest.TestCase):
+
     def test_main(self):
         place = fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             dataset = RangeIterableDatasetSplit(0, 10)
-            dataloader = DataLoader(
-                dataset,
-                places=place,
-                num_workers=2,
-                batch_size=1,
-                drop_last=True)
+            dataloader = DataLoader(dataset,
+                                    places=place,
+                                    num_workers=2,
+                                    batch_size=1,
+                                    drop_last=True)
 
             rets = []
             for d in dataloader:
@@ -64,6 +65,7 @@ def test_main(self):
 
 
 class RangeIterableDataset(IterableDataset):
+
     def __init__(self, start, end):
         self.start = start
         self.end = end
@@ -74,6 +76,7 @@ def __iter__(self):
 
 
 class TestDynamicDataLoaderIterInitFuncSplit(unittest.TestCase):
+
     def test_main(self):
         place = fluid.CPUPlace()
         with fluid.dygraph.guard(place):
@@ -92,13 +95,12 @@ def worker_spliter(worker_id):
                 dataset.start = start + worker_id * num_per_worker
                 dataset.end = min(dataset.start + num_per_worker, end)
 
-            dataloader = DataLoader(
-                dataset,
-                places=place,
-                num_workers=1,
-                batch_size=1,
-                drop_last=True,
-                worker_init_fn=worker_spliter)
+            dataloader = DataLoader(dataset,
+                                    places=place,
+                                    num_workers=1,
+                                    batch_size=1,
+                                    drop_last=True,
+                                    worker_init_fn=worker_spliter)
 
             rets = []
             for d in dataloader:
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py
index 9e09c5e3a1d44..2ef623c2189cc 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py
@@ -33,6 +33,7 @@
 
 
 class RandomDataset(IterableDataset):
+
     def __init__(self, sample_num, class_num):
         self.sample_num = sample_num
         self.class_num = class_num
@@ -54,8 +55,9 @@ def simple_fc_net_static():
 
     with fluid.unique_name.guard():
         with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.data(
-                name='image', shape=[None, IMAGE_SIZE], dtype='float32')
+            image = fluid.data(name='image',
+                               shape=[None, IMAGE_SIZE],
+                               dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
             hidden = image
             param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
@@ -75,8 +77,7 @@ def simple_fc_net_static():
                                             param_attr=param_attr,
                                             bias_attr=bias_attr)
             loss = fluid.layers.reduce_mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
+                fluid.layers.cross_entropy(input=predict_label, label=label))
 
             optimizer = fluid.optimizer.Adam()
             optimizer.minimize(loss)
@@ -100,21 +101,21 @@ def prepare_places(with_data_parallel, with_cpu=False, with_gpu=True):
 
 
 class TestStaticDataLoader(unittest.TestCase):
+
     def run_main(self, num_workers, places, persistent_workers):
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             startup_prog, main_prog, image, label, loss = simple_fc_net_static()
 
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                feed_list=[image, label],
-                places=places,
-                num_workers=num_workers,
-                batch_size=BATCH_SIZE,
-                return_list=False,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    feed_list=[image, label],
+                                    places=places,
+                                    num_workers=num_workers,
+                                    batch_size=BATCH_SIZE,
+                                    return_list=False,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
             # assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
 
             exe = fluid.Executor(place=places[0])
@@ -122,8 +123,8 @@ def run_main(self, num_workers, places, persistent_workers):
 
             prog = fluid.CompiledProgram(main_prog)
             if len(places) > 1:
-                prog = prog.with_data_parallel(
-                    loss_name=loss.name, places=places)
+                prog = prog.with_data_parallel(loss_name=loss.name,
+                                               places=places)
 
             step_list = []
             loss_list = []
@@ -165,16 +166,16 @@ def test_main(self):
                     print(self.__class__.__name__, p, num_workers,
                           persistent_workers)
                     sys.stdout.flush()
-                    ret = self.run_main(
-                        num_workers=num_workers,
-                        places=p,
-                        persistent_workers=persistent_workers)
+                    ret = self.run_main(num_workers=num_workers,
+                                        places=p,
+                                        persistent_workers=persistent_workers)
                     results.append(ret)
                 assert results[0]['loss'].shape[0] * 2 == results[1][
                     'loss'].shape[0]
 
 
 class RandomBatchedDataset(IterableDataset):
+
     def __init__(self, sample_num, class_num):
         self.sample_num = sample_num // BATCH_SIZE
         self.class_num = class_num
@@ -194,29 +195,29 @@ def __iter__(self):
 
 
 class TestStaticDataLoaderWithBatchedDataset(TestStaticDataLoader):
+
     def run_main(self, num_workers, places, persistent_workers):
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             startup_prog, main_prog, image, label, loss = simple_fc_net_static()
 
             dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                feed_list=[image, label],
-                places=places,
-                num_workers=num_workers,
-                batch_size=None,
-                return_list=False,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    feed_list=[image, label],
+                                    places=places,
+                                    num_workers=num_workers,
+                                    batch_size=None,
+                                    return_list=False,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
 
             exe = fluid.Executor(place=places[0])
             exe.run(startup_prog)
 
             prog = fluid.CompiledProgram(main_prog)
             if len(places) > 1:
-                prog = prog.with_data_parallel(
-                    loss_name=loss.name, places=places)
+                prog = prog.with_data_parallel(loss_name=loss.name,
+                                               places=places)
 
             step_list = []
             loss_list = []
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
index 9f73ee041e0e2..4da22817be296 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
@@ -33,6 +33,7 @@
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, sample_num, class_num):
         self.sample_num = sample_num
         self.class_num = class_num
@@ -55,8 +56,9 @@ def simple_fc_net_static():
 
     with fluid.unique_name.guard():
         with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.data(
-                name='image', shape=[None, IMAGE_SIZE], dtype='float32')
+            image = fluid.data(name='image',
+                               shape=[None, IMAGE_SIZE],
+                               dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
             hidden = image
             param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
@@ -76,8 +78,7 @@ def simple_fc_net_static():
                                             param_attr=param_attr,
                                             bias_attr=bias_attr)
             loss = fluid.layers.reduce_mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
+                fluid.layers.cross_entropy(input=predict_label, label=label))
 
             optimizer = fluid.optimizer.Adam()
             optimizer.minimize(loss)
@@ -101,21 +102,21 @@ def prepare_places(with_data_parallel, with_cpu=False, with_gpu=True):
 
 
 class TestStaticDataLoader(unittest.TestCase):
+
     def run_main(self, num_workers, places, persistent_workers, use_pe=True):
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             startup_prog, main_prog, image, label, loss = simple_fc_net_static()
 
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                feed_list=[image, label],
-                places=places,
-                num_workers=num_workers,
-                batch_size=BATCH_SIZE,
-                return_list=False,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    feed_list=[image, label],
+                                    places=places,
+                                    num_workers=num_workers,
+                                    batch_size=BATCH_SIZE,
+                                    return_list=False,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
             assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
 
             exe = fluid.Executor(place=places[0])
@@ -124,8 +125,8 @@ def run_main(self, num_workers, places, persistent_workers, use_pe=True):
             if use_pe:
                 prog = fluid.CompiledProgram(main_prog)
                 if len(places) > 1:
-                    prog = prog.with_data_parallel(
-                        loss_name=loss.name, places=places)
+                    prog = prog.with_data_parallel(loss_name=loss.name,
+                                                   places=places)
             else:
                 prog = main_prog
 
@@ -169,10 +170,9 @@ def test_main(self):
                     print(self.__class__.__name__, p, num_workers,
                           persistent_workers)
                     sys.stdout.flush()
-                    ret = self.run_main(
-                        num_workers=num_workers,
-                        places=p,
-                        persistent_workers=persistent_workers)
+                    ret = self.run_main(num_workers=num_workers,
+                                        places=p,
+                                        persistent_workers=persistent_workers)
                     results.append(ret)
                 diff = np.max(
                     np.abs(results[0]['loss'] - results[1]['loss']) /
@@ -181,20 +181,21 @@ def test_main(self):
 
 
 class TestStaticDataLoaderReturnList(unittest.TestCase):
+
     def test_single_place(self):
         scope = fluid.Scope()
-        image = fluid.data(
-            name='image', shape=[None, IMAGE_SIZE], dtype='float32')
+        image = fluid.data(name='image',
+                           shape=[None, IMAGE_SIZE],
+                           dtype='float32')
         label = fluid.data(name='label', shape=[None, 1], dtype='int64')
         with fluid.scope_guard(scope):
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                feed_list=[image, label],
-                num_workers=0,
-                batch_size=BATCH_SIZE,
-                drop_last=True,
-                return_list=True)
+            dataloader = DataLoader(dataset,
+                                    feed_list=[image, label],
+                                    num_workers=0,
+                                    batch_size=BATCH_SIZE,
+                                    drop_last=True,
+                                    return_list=True)
 
             for d in dataloader:
                 assert isinstance(d, list)
@@ -204,19 +205,19 @@ def test_single_place(self):
 
     def test_multi_place(self):
         scope = fluid.Scope()
-        image = fluid.data(
-            name='image', shape=[None, IMAGE_SIZE], dtype='float32')
+        image = fluid.data(name='image',
+                           shape=[None, IMAGE_SIZE],
+                           dtype='float32')
         label = fluid.data(name='label', shape=[None, 1], dtype='int64')
         with fluid.scope_guard(scope):
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                feed_list=[image, label],
-                num_workers=0,
-                batch_size=BATCH_SIZE,
-                places=[fluid.CPUPlace()] * 2,
-                drop_last=True,
-                return_list=True)
+            dataloader = DataLoader(dataset,
+                                    feed_list=[image, label],
+                                    num_workers=0,
+                                    batch_size=BATCH_SIZE,
+                                    places=[fluid.CPUPlace()] * 2,
+                                    drop_last=True,
+                                    return_list=True)
 
             for d in dataloader:
                 assert isinstance(d, list)
@@ -226,6 +227,7 @@ def test_multi_place(self):
 
 
 class RandomBatchedDataset(Dataset):
+
     def __init__(self, sample_num, class_num):
         self.sample_num = int(sample_num / BATCH_SIZE)
         self.class_num = class_num
@@ -247,21 +249,21 @@ def __len__(self):
 
 
 class TestStaticDataLoaderWithBatchedDataset(TestStaticDataLoader):
+
     def run_main(self, num_workers, places, persistent_workers):
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             startup_prog, main_prog, image, label, loss = simple_fc_net_static()
 
             dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM)
-            dataloader = DataLoader(
-                dataset,
-                feed_list=[image, label],
-                places=places,
-                num_workers=num_workers,
-                batch_size=None,
-                return_list=False,
-                drop_last=True,
-                persistent_workers=persistent_workers)
+            dataloader = DataLoader(dataset,
+                                    feed_list=[image, label],
+                                    places=places,
+                                    num_workers=num_workers,
+                                    batch_size=None,
+                                    return_list=False,
+                                    drop_last=True,
+                                    persistent_workers=persistent_workers)
             assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
 
             exe = fluid.Executor(place=places[0])
@@ -269,8 +271,8 @@ def run_main(self, num_workers, places, persistent_workers):
 
             prog = fluid.CompiledProgram(main_prog)
             if len(places) > 1:
-                prog = prog.with_data_parallel(
-                    loss_name=loss.name, places=places)
+                prog = prog.with_data_parallel(loss_name=loss.name,
+                                               places=places)
 
             step_list = []
             loss_list = []
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
index c3b53e81a6665..825a6b8fa49e1 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
@@ -26,6 +26,7 @@ class ReaderException(Exception):
 
 
 class TestMultiprocessReaderExceptionWithQueueSuccess(unittest.TestCase):
+
     def setUp(self):
         self.use_pipe = False
         self.raise_exception = False
@@ -41,12 +42,12 @@ def main_impl(self, place, iterable):
         batch_size = 4
 
         def fake_reader():
+
             def __impl__():
                 for _ in range(sample_num):
                     if not self.raise_exception:
-                        yield list(
-                            np.random.uniform(
-                                low=-1, high=1, size=[10])),
+                        yield list(np.random.uniform(low=-1, high=1,
+                                                     size=[10])),
                     else:
                         raise ValueError()
 
@@ -54,8 +55,9 @@ def __impl__():
 
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             image = fluid.data(name='image', dtype='float32', shape=[None, 10])
-            reader = fluid.io.DataLoader.from_generator(
-                feed_list=[image], capacity=2, iterable=iterable)
+            reader = fluid.io.DataLoader.from_generator(feed_list=[image],
+                                                        capacity=2,
+                                                        iterable=iterable)
 
             image_p_1 = image + 1
 
@@ -63,15 +65,13 @@ def __impl__():
                 [fake_reader(), fake_reader()], use_pipe=self.use_pipe)
 
             if isinstance(place, fluid.CUDAPlace):
-                reader.set_sample_generator(
-                    decorated_reader,
-                    batch_size=batch_size,
-                    places=fluid.cuda_places(0))
+                reader.set_sample_generator(decorated_reader,
+                                            batch_size=batch_size,
+                                            places=fluid.cuda_places(0))
             else:
-                reader.set_sample_generator(
-                    decorated_reader,
-                    batch_size=batch_size,
-                    places=fluid.cpu_places(1))
+                reader.set_sample_generator(decorated_reader,
+                                            batch_size=batch_size,
+                                            places=fluid.cpu_places(1))
 
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -120,6 +120,7 @@ def test_main(self):
 
 class TestMultiprocessReaderExceptionWithQueueFailed(
         TestMultiprocessReaderExceptionWithQueueSuccess):
+
     def setUp(self):
         self.use_pipe = False
         self.raise_exception = True
@@ -127,6 +128,7 @@ def setUp(self):
 
 class TestMultiprocessReaderExceptionWithPipeSuccess(
         TestMultiprocessReaderExceptionWithQueueSuccess):
+
     def setUp(self):
         self.use_pipe = True
         self.raise_exception = False
@@ -134,6 +136,7 @@ def setUp(self):
 
 class TestMultiprocessReaderExceptionWithPipeFailed(
         TestMultiprocessReaderExceptionWithQueueSuccess):
+
     def setUp(self):
         self.use_pipe = True
         self.raise_exception = True
diff --git a/python/paddle/fluid/tests/unittests/test_mv_op.py b/python/paddle/fluid/tests/unittests/test_mv_op.py
index 09ec702671bc9..086ed5e693bb4 100644
--- a/python/paddle/fluid/tests/unittests/test_mv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mv_op.py
@@ -25,6 +25,7 @@
 
 
 class TestMVOp(OpTest):
+
     def setUp(self):
         self.op_type = "mv"
         self.python_api = paddle.mv
@@ -44,6 +45,7 @@ def init_config(self):
 
 
 class TestMVAPI(unittest.TestCase):
+
     def test_dygraph_api_out(self):
         paddle.disable_static()
 
@@ -71,10 +73,12 @@ def test_static_graph(self):
                 self.input_vec = np.random.rand(100).astype("float64")
 
                 with program_guard(train_program, startup_program):
-                    data_x = paddle.static.data(
-                        "x", shape=[5, 100], dtype="float64")
-                    data_vec = paddle.static.data(
-                        "vec", shape=[100], dtype="float64")
+                    data_x = paddle.static.data("x",
+                                                shape=[5, 100],
+                                                dtype="float64")
+                    data_vec = paddle.static.data("vec",
+                                                  shape=[100],
+                                                  dtype="float64")
 
                     data_x.stop_gradient = x_stop_gradient
                     data_vec.stop_gradient = vec_stop_gradient
@@ -83,16 +87,19 @@ def test_static_graph(self):
 
                     self.place = paddle.CPUPlace()
                     exe = paddle.static.Executor(self.place)
-                    res, = exe.run(
-                        feed={"x": self.input_x,
-                              "vec": self.input_vec},
-                        fetch_list=[result_vec])
+                    res, = exe.run(feed={
+                        "x": self.input_x,
+                        "vec": self.input_vec
+                    },
+                                   fetch_list=[result_vec])
                     z_expected = np.array(np.dot(self.input_x, self.input_vec))
                     self.assertTrue(np.allclose(res, z_expected))
 
 
 class TestMVError(unittest.TestCase):
+
     def test_input(self):
+
         def test_shape():
             paddle.enable_static()
 
@@ -100,8 +107,9 @@ def test_shape():
             self.input_vec = np.random.rand(100).astype("float64")
 
             data_x = paddle.static.data("x", shape=[5, 100], dtype="float64")
-            data_vec = paddle.static.data(
-                "vec", shape=[100, 2], dtype="float64")
+            data_vec = paddle.static.data("vec",
+                                          shape=[100, 2],
+                                          dtype="float64")
             result_vec = paddle.mv(data_x, data_vec)
 
         self.assertRaises(ValueError, test_shape)
diff --git a/python/paddle/fluid/tests/unittests/test_naive_best_fit_gpu_memory_limit.py b/python/paddle/fluid/tests/unittests/test_naive_best_fit_gpu_memory_limit.py
index d8d10816bf97a..6994bf30523be 100644
--- a/python/paddle/fluid/tests/unittests/test_naive_best_fit_gpu_memory_limit.py
+++ b/python/paddle/fluid/tests/unittests/test_naive_best_fit_gpu_memory_limit.py
@@ -23,6 +23,7 @@
 
 
 class TestBase(unittest.TestCase):
+
     def setUp(self):
         if fluid.is_compiled_with_cuda():
             self._limit = fluid.core.globals()['FLAGS_gpu_memory_limit_mb']
@@ -35,8 +36,7 @@ def test_allocate(self):
 
         place = fluid.CUDAPlace(0)
         t = fluid.LoDTensor()
-        t.set(np.ndarray(
-            [int(self._limit / 2), other_dim], dtype='float32'),
+        t.set(np.ndarray([int(self._limit / 2), other_dim], dtype='float32'),
               place)
         del t
 
diff --git a/python/paddle/fluid/tests/unittests/test_name_scope.py b/python/paddle/fluid/tests/unittests/test_name_scope.py
index a1f0d56d0ff29..92d3f04fd2c2d 100644
--- a/python/paddle/fluid/tests/unittests/test_name_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_name_scope.py
@@ -19,6 +19,7 @@
 
 
 class TestNameScope(unittest.TestCase):
+
     def test_name_scope(self):
         with fluid.name_scope("s1"):
             a = fluid.layers.data(name='data', shape=[1], dtype='int32')
diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py
index 84559048a2b8a..6eb951b8ad1b5 100644
--- a/python/paddle/fluid/tests/unittests/test_nan_inf.py
+++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py
@@ -25,6 +25,7 @@
 
 
 class TestNanInf(unittest.TestCase):
+
     def setUp(self):
         self._python_interp = sys.executable
         if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
@@ -35,11 +36,10 @@ def setUp(self):
     def check_nan_inf(self):
         cmd = self._python_interp
 
-        proc = subprocess.Popen(
-            cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=self.env)
+        proc = subprocess.Popen(cmd.split(" "),
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                env=self.env)
 
         out, err = proc.communicate()
         returncode = proc.returncode
@@ -51,8 +51,8 @@ def check_nan_inf(self):
         if paddle.fluid.core.is_compiled_with_cuda():
             assert (out + err).find('find nan or inf==='.encode()) != -1
         else:
-            assert (out + err
-                    ).find('There are `nan` or `inf` in tensor'.encode()) != -1
+            assert (out + err).find(
+                'There are `nan` or `inf` in tensor'.encode()) != -1
 
     def test_nan_inf_in_static_mode(self):
         self._python_interp += " check_nan_inf_base.py"
@@ -64,6 +64,7 @@ def test_nan_inf_in_dynamic_mode(self):
 
 
 class TestNanInfEnv(TestNanInf):
+
     def setUp(self):
         super(TestNanInfEnv, self).setUp()
         # windows python have some bug with env, so need use str to pass ci
diff --git a/python/paddle/fluid/tests/unittests/test_nanmean_api.py b/python/paddle/fluid/tests/unittests/test_nanmean_api.py
index 90a9a130899d3..7f6306f0ae52f 100644
--- a/python/paddle/fluid/tests/unittests/test_nanmean_api.py
+++ b/python/paddle/fluid/tests/unittests/test_nanmean_api.py
@@ -31,8 +31,8 @@ def setUp(self):
         self.x_shape = [2, 3, 4, 5]
         self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
         self.x[0, :, :, :] = np.nan
-        self.x_grad = np.array([[np.nan, np.nan, 3.],
-                                [0., np.nan, 2.]]).astype(np.float32)
+        self.x_grad = np.array([[np.nan, np.nan, 3.], [0., np.nan,
+                                                       2.]]).astype(np.float32)
         self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
             else paddle.CPUPlace()
 
@@ -72,9 +72,8 @@ def test_case(x, axis=None, keepdim=False):
                 out_np[nan_mask] = 0
                 self.assertEqual(np.allclose(out_np, out_ref, rtol=1e-04), True)
             else:
-                self.assertEqual(
-                    np.allclose(
-                        out.numpy(), out_ref, rtol=1e-04), True)
+                self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-04),
+                                 True)
 
         test_case(self.x)
         test_case(self.x, [])
diff --git a/python/paddle/fluid/tests/unittests/test_nanmedian.py b/python/paddle/fluid/tests/unittests/test_nanmedian.py
index 2e1f13a8c7d9f..74c0c635dd306 100644
--- a/python/paddle/fluid/tests/unittests/test_nanmedian.py
+++ b/python/paddle/fluid/tests/unittests/test_nanmedian.py
@@ -23,6 +23,7 @@
 
 
 class TestNanmedian(unittest.TestCase):
+
     def setUp(self):
         single_axis_shape = (120)
         multi_axis_shape = (2, 3, 4, 5)
@@ -32,8 +33,10 @@ def setUp(self):
             np.random.uniform(-1, 1, single_axis_shape).astype(np.float32),
             "multi_axis_normal":
             np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32),
-            "single_axis_all_nan": np.full(single_axis_shape, np.nan),
-            "multi_axis_all_nan": np.full(multi_axis_shape, np.nan),
+            "single_axis_all_nan":
+            np.full(single_axis_shape, np.nan),
+            "multi_axis_all_nan":
+            np.full(multi_axis_shape, np.nan),
         }
 
         single_partial_nan = self.fake_data["single_axis_normal"].copy()
@@ -108,15 +111,15 @@ def test_data_case(data):
                         continue
 
                 np_res = np.nanmedian(data, keepdims=keep_dim)
-                pd_res = paddle.nanmedian(
-                    paddle.to_tensor(data), keepdim=keep_dim)
+                pd_res = paddle.nanmedian(paddle.to_tensor(data),
+                                          keepdim=keep_dim)
                 self.assertTrue(
-                    np.allclose(
-                        np_res, pd_res.numpy(), equal_nan=True))
+                    np.allclose(np_res, pd_res.numpy(), equal_nan=True))
 
         def test_axis_case(data, axis):
-            pd_res = paddle.nanmedian(
-                paddle.to_tensor(data), axis=axis, keepdim=False)
+            pd_res = paddle.nanmedian(paddle.to_tensor(data),
+                                      axis=axis,
+                                      keepdim=False)
             axis = clean_axis_numpy(axis, len(data.shape))
             np_res = np.nanmedian(data, axis=axis, keepdims=False)
             self.assertTrue(np.allclose(np_res, pd_res.numpy(), equal_nan=True))
diff --git a/python/paddle/fluid/tests/unittests/test_nansum_api.py b/python/paddle/fluid/tests/unittests/test_nansum_api.py
index a9fc285d2d9d0..34c34e9d8a6ee 100644
--- a/python/paddle/fluid/tests/unittests/test_nansum_api.py
+++ b/python/paddle/fluid/tests/unittests/test_nansum_api.py
@@ -23,6 +23,7 @@
 
 
 class API_Test_Nansum(unittest.TestCase):
+
     def test_static_graph(self):
         paddle.enable_static()
         startup_program = fluid.Program()
@@ -39,8 +40,9 @@ def test_static_graph(self):
             exe = fluid.Executor(place)
             exe.run(startup_program)
 
-            x = np.array([[float('nan'), 3, 5, 9],
-                          [1, 2, float('-nan'), 7]]).astype(np.float32)
+            x = np.array([[float('nan'), 3, 5, 9], [1, 2,
+                                                    float('-nan'),
+                                                    7]]).astype(np.float32)
             res = exe.run(train_program,
                           feed={'input': x},
                           fetch_list=[out1, out2, out3, out4])
@@ -54,18 +56,14 @@ def test_static_graph(self):
             out3_ref = np.array([17, 10]).astype(np.float32)
             out4_ref = np.array([[17], [10]]).astype(np.float32)
 
-            self.assertTrue(
-                (out1_np == out1_ref).all(),
-                msg='nansum output is wrong, out =' + str(out1_np))
-            self.assertTrue(
-                (out2_np == out2_ref).all(),
-                msg='nansum output is wrong, out =' + str(out2_np))
-            self.assertTrue(
-                (out3_np == out3_ref).all(),
-                msg='nansum output is wrong, out =' + str(out3_np))
-            self.assertTrue(
-                (out4_np == out4_ref).all(),
-                msg='nansum output is wrong, out =' + str(out4_np))
+            self.assertTrue((out1_np == out1_ref).all(),
+                            msg='nansum output is wrong, out =' + str(out1_np))
+            self.assertTrue((out2_np == out2_ref).all(),
+                            msg='nansum output is wrong, out =' + str(out2_np))
+            self.assertTrue((out3_np == out3_ref).all(),
+                            msg='nansum output is wrong, out =' + str(out3_np))
+            self.assertTrue((out4_np == out4_ref).all(),
+                            msg='nansum output is wrong, out =' + str(out4_np))
 
     def test_error_api(self):
         paddle.enable_static()
@@ -85,8 +83,8 @@ def run2():
         self.assertRaises(TypeError, run2)
 
     def test_dygraph(self):
-        x = np.array([[float('nan'), 3, 5, 9],
-                      [1, 2, float('-nan'), 7]]).astype(np.float32)
+        x = np.array([[float('nan'), 3, 5, 9], [1, 2, float('-nan'),
+                                                7]]).astype(np.float32)
         with fluid.dygraph.guard():
             inputs = fluid.dygraph.to_variable(x)
             out = paddle.nansum(inputs)
diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py
index 6c2fc4d842928..bbeec5ce62111 100644
--- a/python/paddle/fluid/tests/unittests/test_nce.py
+++ b/python/paddle/fluid/tests/unittests/test_nce.py
@@ -56,13 +56,15 @@ def nce(input, weight, bias, sample_weight, labels, num_classes,
         o = sample_out[i]
         cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
         out[samples[i][0]] += cost * samples[i][3]
-    return (out[:, np.newaxis], np.array(sample_out).reshape(
-        batch_size, num_sample_class + num_true_class),
+    return (out[:, np.newaxis],
+            np.array(sample_out).reshape(batch_size,
+                                         num_sample_class + num_true_class),
             np.array(sample_labels).reshape(batch_size,
                                             num_sample_class + num_true_class))
 
 
 class TestNCE(OpTest):
+
     def generate_data(self, dim, batch_size, num_classes, num_true_class,
                       num_neg_samples, is_sparse):
         input = np.random.randn(batch_size, dim).astype(np.float32)
@@ -118,11 +120,13 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02)
+        self.check_grad(["Input", "Weight", "Bias"],
+                        "Cost",
+                        max_relative_error=0.02)
 
 
 class TestNCECase1Tensor(TestNCE):
+
     def set_data(self):
         self.generate_data(10, 20, 100, 2, 5, False)
 
@@ -137,6 +141,7 @@ def test_check_grad(self):
 
 
 class TestNCECase1SelectedRows(unittest.TestCase):
+
     def setUp(self):
         self.base_lr = 0.0001
         self.batch_size = 8
@@ -238,73 +243,89 @@ def test_input_is_selected_rows(self):
 
 
 class TestNCE_OpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            input1 = fluid.create_lod_tensor(
-                np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace())
-            label1 = fluid.layers.data(
-                name='label1', shape=[-1, 4], dtype="int64")
+            input1 = fluid.create_lod_tensor(np.array([0.0, 3.0, 2.0, 4.0]),
+                                             [[1, 1, 2]], fluid.CPUPlace())
+            label1 = fluid.layers.data(name='label1',
+                                       shape=[-1, 4],
+                                       dtype="int64")
             # the input(input) of nce layer must be Variable.
             self.assertRaises(TypeError, fluid.layers.nce, input1, label1, 5)
 
-            input2 = fluid.layers.data(
-                name='input2', shape=[-1, 4], dtype="float32")
-            label2 = fluid.create_lod_tensor(
-                np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace())
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[-1, 4],
+                                       dtype="float32")
+            label2 = fluid.create_lod_tensor(np.array([0.0, 3.0, 2.0, 4.0]),
+                                             [[1, 1, 2]], fluid.CPUPlace())
             # the input(label) of nce layer must be Variable.
             self.assertRaises(TypeError, fluid.layers.nce, input2, label2, 5)
 
-            input3 = fluid.layers.data(
-                name='input3', shape=[-1, 4], dtype="float16")
-            label3 = fluid.layers.data(
-                name='label3', shape=[-1, 1], dtype="int64")
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[-1, 4],
+                                       dtype="float16")
+            label3 = fluid.layers.data(name='label3',
+                                       shape=[-1, 1],
+                                       dtype="int64")
             # the data type of input(input) must be float32 or float64.
             self.assertRaises(TypeError, fluid.layers.nce, input3, label3, 5)
 
-            input4 = fluid.layers.data(
-                name='input4', shape=[-1, 4], dtype="float32")
-            label4 = fluid.layers.data(
-                name='label4', shape=[-1, 1], dtype="int32")
+            input4 = fluid.layers.data(name='input4',
+                                       shape=[-1, 4],
+                                       dtype="float32")
+            label4 = fluid.layers.data(name='label4',
+                                       shape=[-1, 1],
+                                       dtype="int32")
             # the data type of input(label) must be int64.
             self.assertRaises(TypeError, fluid.layers.nce, input4, label4, 5)
 
 
 class TestDygraphNCE_OpError(unittest.TestCase):
+
     def test_NCE_errors(self):
         with program_guard(Program(), Program()):
             nce = fluid.NCE(20, 5)
-            input1 = fluid.create_lod_tensor(
-                np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace())
-            label1 = fluid.layers.data(
-                name='label1', shape=[-1, 4], dtype="int64")
+            input1 = fluid.create_lod_tensor(np.array([0.0, 3.0, 2.0, 4.0]),
+                                             [[1, 1, 2]], fluid.CPUPlace())
+            label1 = fluid.layers.data(name='label1',
+                                       shape=[-1, 4],
+                                       dtype="int64")
             # the input(input) of NCE layer must be Variable.
             self.assertRaises(TypeError, nce, input1, label1)
 
-            input2 = fluid.layers.data(
-                name='input2', shape=[-1, 4], dtype="float32")
-            label2 = fluid.create_lod_tensor(
-                np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace())
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[-1, 4],
+                                       dtype="float32")
+            label2 = fluid.create_lod_tensor(np.array([0.0, 3.0, 2.0, 4.0]),
+                                             [[1, 1, 2]], fluid.CPUPlace())
             # the input(label) of NCE layer must be Variable.
             self.assertRaises(TypeError, nce, input2, label2)
 
-            input3 = fluid.layers.data(
-                name='input3', shape=[-1, 4], dtype="float16")
-            label3 = fluid.layers.data(
-                name='label3', shape=[-1, 1], dtype="int64")
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[-1, 4],
+                                       dtype="float16")
+            label3 = fluid.layers.data(name='label3',
+                                       shape=[-1, 1],
+                                       dtype="int64")
             # the data type of input(input) must be float32 or float64.
             self.assertRaises(TypeError, nce, input3, label3)
 
-            input4 = fluid.layers.data(
-                name='input4', shape=[-1, 4], dtype="float32")
-            label4 = fluid.layers.data(
-                name='label4', shape=[-1, 1], dtype="int32")
+            input4 = fluid.layers.data(name='input4',
+                                       shape=[-1, 4],
+                                       dtype="float32")
+            label4 = fluid.layers.data(name='label4',
+                                       shape=[-1, 1],
+                                       dtype="int32")
             # the data type of input(label) must be int64.
             self.assertRaises(TypeError, nce, input4, label4)
 
-            input5 = fluid.layers.data(
-                name='input5', shape=[-1, 4], dtype="float32")
-            label5 = fluid.layers.data(
-                name='label5', shape=[-1, 1], dtype="int64")
+            input5 = fluid.layers.data(name='input5',
+                                       shape=[-1, 4],
+                                       dtype="float32")
+            label5 = fluid.layers.data(name='label5',
+                                       shape=[-1, 1],
+                                       dtype="int64")
             sample_weight = fluid.create_lod_tensor(
                 np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace())
             # the sample_weight of nce must be Variable or None.
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
index 5df085d4febac..3bcafe53cb89b 100755
--- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
@@ -73,6 +73,7 @@ def nearest_neighbor_interp_np(X,
 
 
 class TestNearestInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -96,9 +97,10 @@ def setUp(self):
             out_h = self.out_h
             out_w = self.out_w
 
-        output_np = nearest_neighbor_interp_np(
-            input_np, out_h, out_w, self.out_size, self.actual_shape,
-            self.align_corners, self.data_layout)
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners,
+                                               self.data_layout)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -120,8 +122,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'nearest'
@@ -134,6 +138,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -144,6 +149,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -154,6 +160,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -164,6 +171,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -175,6 +183,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -186,6 +195,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -197,6 +207,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpSame(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 3, 32, 64]
@@ -207,6 +218,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -218,6 +230,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 4, 4, 5]
@@ -230,14 +243,15 @@ def init_test_case(self):
 
 
 class TestNearestInterpOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
         self.init_test_case()
         self.op_type = "nearest_interp"
         self.check_eager = True
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
+        input_np = np.random.randint(low=0, high=256,
+                                     size=self.input_shape).astype("uint8")
 
         if self.scale > 0:
             out_h = int(self.input_shape[2] * self.scale)
@@ -263,8 +277,9 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=core.CPUPlace(), atol=1, check_eager=self.check_eager)
+        self.check_output_with_place(place=core.CPUPlace(),
+                                     atol=1,
+                                     check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'nearest'
@@ -276,6 +291,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 3, 32, 64]
@@ -286,6 +302,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -297,11 +314,13 @@ def init_test_case(self):
 
 
 class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+
     def set_align_corners(self):
         self.align_corners = False
 
 
 class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -313,6 +332,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 5, 7]
@@ -324,6 +344,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -335,6 +356,7 @@ def init_test_case(self):
 
 
 class TestNearestInterpOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -385,8 +407,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'nearest'
@@ -400,6 +424,7 @@ def init_test_case(self):
 
 # out_size is a tensor list
 class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -412,6 +437,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -425,6 +451,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -437,6 +464,7 @@ def init_test_case(self):
 
 
 class TestNearestAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
         y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32")
@@ -444,15 +472,18 @@ def test_case(self):
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
 
-        out1 = fluid.layers.resize_nearest(
-            y, out_shape=[12, 12], data_format='NHWC')
+        out1 = fluid.layers.resize_nearest(y,
+                                           out_shape=[12, 12],
+                                           data_format='NHWC')
         out2 = fluid.layers.resize_nearest(x, out_shape=[12, dim])
         out3 = fluid.layers.resize_nearest(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_nearest(
-            x, out_shape=[4, 4], actual_shape=actual_size)
+        out4 = fluid.layers.resize_nearest(x,
+                                           out_shape=[4, 4],
+                                           actual_shape=actual_size)
         out5 = fluid.layers.resize_nearest(x, scale=scale_tensor)
 
         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
@@ -479,8 +510,10 @@ def test_case(self):
                           fetch_list=[out1, out2, out3, out4, out5],
                           return_numpy=True)
 
-        expect_res = nearest_neighbor_interp_np(
-            x_data, out_h=12, out_w=12, align_corners=True)
+        expect_res = nearest_neighbor_interp_np(x_data,
+                                                out_h=12,
+                                                out_w=12,
+                                                align_corners=True)
         self.assertTrue(
             np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1))))
         for i in range(len(results) - 1):
@@ -488,13 +521,15 @@ def test_case(self):
 
 
 class TestNearestInterpException(unittest.TestCase):
+
     def test_exception(self):
         input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32")
 
         def attr_data_format():
             # for 4-D input, data_format can only be NCHW or NHWC
-            out = fluid.layers.resize_nearest(
-                input, out_shape=[4, 8], data_format='NDHWC')
+            out = fluid.layers.resize_nearest(input,
+                                              out_shape=[4, 8],
+                                              data_format='NDHWC')
 
         def attr_scale_type():
             out = fluid.layers.resize_nearest(input, scale='scale')
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py
index e2ac98f7c9f1f..322db889d23c7 100755
--- a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py
@@ -158,6 +158,7 @@ def nearest_neighbor_interp3d_np(X,
 
 
 class TestNearestInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -215,10 +216,12 @@ def setUp(self):
                 input_np, out_h, out_w, scale_h, scale_w, self.out_size,
                 self.actual_shape, self.align_corners, self.data_layout)
         elif len(self.input_shape) == 5:
-            output_np = nearest_neighbor_interp3d_np(
-                input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w,
-                self.out_size, self.actual_shape, self.align_corners,
-                self.data_layout)
+            output_np = nearest_neighbor_interp3d_np(input_np, out_d, out_h,
+                                                     out_w, scale_d, scale_h,
+                                                     scale_w, self.out_size,
+                                                     self.actual_shape,
+                                                     self.align_corners,
+                                                     self.data_layout)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -267,6 +270,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 1, 7, 8]
@@ -278,6 +282,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -288,6 +293,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -298,6 +304,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -309,6 +316,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -320,6 +328,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [1, 1, 32, 64]
@@ -331,6 +340,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpSame(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 3, 32, 64]
@@ -341,6 +351,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -352,6 +363,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 4, 4, 5]
@@ -364,13 +376,14 @@ def init_test_case(self):
 
 
 class TestNearestInterpOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
         self.init_test_case()
         self.op_type = "nearest_interp_v2"
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
+        input_np = np.random.randint(low=0, high=256,
+                                     size=self.input_shape).astype("uint8")
 
         if self.scale:
             if isinstance(self.scale, float) or isinstance(self.scale, int):
@@ -421,6 +434,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [2, 3, 32, 64]
@@ -431,6 +445,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [4, 1, 7, 8]
@@ -442,11 +457,13 @@ def init_test_case(self):
 
 
 class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+
     def set_align_corners(self):
         self.align_corners = False
 
 
 class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -458,6 +475,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 5, 7]
@@ -469,6 +487,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 7, 5]
@@ -480,6 +499,7 @@ def init_test_case(self):
 
 
 class TestNearestNeighbor3DInterp(TestNearestInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 4, 7, 5]
@@ -492,6 +512,7 @@ def init_test_case(self):
 
 
 class TestNearestInterpOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -565,6 +586,7 @@ def init_test_case(self):
 
 # out_size is a tensor list
 class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 3, 9, 6]
@@ -577,6 +599,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -590,6 +613,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'nearest'
         self.input_shape = [3, 2, 32, 16]
@@ -602,6 +626,7 @@ def init_test_case(self):
 
 
 class TestNearestAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
         y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32")
@@ -609,15 +634,18 @@ def test_case(self):
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
 
-        out1 = fluid.layers.resize_nearest(
-            y, out_shape=[12, 12], data_format='NHWC')
+        out1 = fluid.layers.resize_nearest(y,
+                                           out_shape=[12, 12],
+                                           data_format='NHWC')
         out2 = fluid.layers.resize_nearest(x, out_shape=[12, dim])
         out3 = fluid.layers.resize_nearest(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_nearest(
-            x, out_shape=[4, 4], actual_shape=actual_size)
+        out4 = fluid.layers.resize_nearest(x,
+                                           out_shape=[4, 4],
+                                           actual_shape=actual_size)
         out5 = fluid.layers.resize_nearest(x, scale=scale_tensor)
 
         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
@@ -644,8 +672,10 @@ def test_case(self):
                           fetch_list=[out1, out2, out3, out4, out5],
                           return_numpy=True)
 
-        expect_res = nearest_neighbor_interp_np(
-            x_data, out_h=12, out_w=12, align_corners=True)
+        expect_res = nearest_neighbor_interp_np(x_data,
+                                                out_h=12,
+                                                out_w=12,
+                                                align_corners=True)
         self.assertTrue(
             np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1))))
         for i in range(len(results) - 1):
@@ -653,6 +683,7 @@ def test_case(self):
 
 
 class TestNearestInterpOpAPI_dy(unittest.TestCase):
+
     def test_case(self):
         import paddle
         if core.is_compiled_with_cuda():
@@ -664,17 +695,19 @@ def test_case(self):
             scale_np = np.array([2, 2]).astype("int64")
             input_x = paddle.to_tensor(input_data)
             scale = paddle.to_tensor(scale_np)
-            expect_res = nearest_neighbor_interp_np(
-                input_data, out_h=12, out_w=12, align_corners=False)
-            out = interpolate(
-                x=input_x,
-                scale_factor=scale,
-                mode="nearest",
-                align_corners=False)
+            expect_res = nearest_neighbor_interp_np(input_data,
+                                                    out_h=12,
+                                                    out_w=12,
+                                                    align_corners=False)
+            out = interpolate(x=input_x,
+                              scale_factor=scale,
+                              mode="nearest",
+                              align_corners=False)
             self.assertTrue(np.allclose(out.numpy(), expect_res))
 
 
 class TestNearestInterp3DOpAPI_dy(unittest.TestCase):
+
     def test_case(self):
         import paddle
         if core.is_compiled_with_cuda():
@@ -686,26 +719,30 @@ def test_case(self):
             scale_np = np.array([2, 2, 2]).astype("int64")
             input_x = paddle.to_tensor(input_data)
             scale = paddle.to_tensor(scale_np)
-            expect_res = nearest_neighbor_interp3d_np(
-                input_data, out_d=12, out_h=12, out_w=12, align_corners=False)
-            out = interpolate(
-                x=input_x,
-                scale_factor=scale,
-                mode="nearest",
-                align_corners=False,
-                data_format="NCDHW")
+            expect_res = nearest_neighbor_interp3d_np(input_data,
+                                                      out_d=12,
+                                                      out_h=12,
+                                                      out_w=12,
+                                                      align_corners=False)
+            out = interpolate(x=input_x,
+                              scale_factor=scale,
+                              mode="nearest",
+                              align_corners=False,
+                              data_format="NCDHW")
             self.assertTrue(np.allclose(out.numpy(), expect_res))
 
 
 class TestNearestInterpException(unittest.TestCase):
+
     def test_exception(self):
         import paddle
         input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32")
 
         def attr_data_format():
             # for 4-D input, data_format can only be NCHW or NHWC
-            out = fluid.layers.resize_nearest(
-                input, out_shape=[4, 8], data_format='NDHWC')
+            out = fluid.layers.resize_nearest(input,
+                                              out_shape=[4, 8],
+                                              data_format='NDHWC')
 
         def attr_scale_type():
             out = fluid.layers.resize_nearest(input, scale='scale')
@@ -719,8 +756,9 @@ def input_shape_error():
 
         def mode_error():
             x = paddle.randn([1, 3])
-            out = paddle.nn.functional.interpolate(
-                x, scale_factor='scale', mode="BILINEAR")
+            out = paddle.nn.functional.interpolate(x,
+                                                   scale_factor='scale',
+                                                   mode="BILINEAR")
 
         self.assertRaises(ValueError, attr_data_format)
         self.assertRaises(TypeError, attr_scale_type)
diff --git a/python/paddle/fluid/tests/unittests/test_neg_op.py b/python/paddle/fluid/tests/unittests/test_neg_op.py
index e7b16bde02357..473d2b77bae70 100644
--- a/python/paddle/fluid/tests/unittests/test_neg_op.py
+++ b/python/paddle/fluid/tests/unittests/test_neg_op.py
@@ -18,6 +18,7 @@
 
 
 class TestNegOp(unittest.TestCase):
+
     def setUp(self):
         self.init_dtype_type()
         self.input = (np.random.random((32, 8)) * 100).astype(self.dtype)
@@ -63,26 +64,31 @@ def test_gpu(self):
 
 
 class TestNegOpFp32(TestNegOp):
+
     def init_dtype_type(self):
         self.dtype = np.float32
 
 
 class TestNegOpInt64(TestNegOp):
+
     def init_dtype_type(self):
         self.dtype = np.int64
 
 
 class TestNegOpInt32(TestNegOp):
+
     def init_dtype_type(self):
         self.dtype = np.int32
 
 
 class TestNegOpInt16(TestNegOp):
+
     def init_dtype_type(self):
         self.dtype = np.int16
 
 
 class TestNegOpInt8(TestNegOp):
+
     def init_dtype_type(self):
         self.dtype = np.int8
 
diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
index 60dcf195daf61..7f230164d6027 100644
--- a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
+++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
@@ -26,6 +26,7 @@
 
 
 class TestNetWithDtype(unittest.TestCase):
+
     def setUp(self):
         self.dtype = "float64"
         self.init_dtype()
@@ -43,8 +44,8 @@ def run_net_on_place(self, place):
             sgd_optimizer.minimize(avg_cost)
 
         fetch_list = [avg_cost]
-        train_reader = paddle.batch(
-            paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+        train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                    batch_size=BATCH_SIZE)
         feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
         exe = fluid.Executor(place)
         exe.run(startup)
diff --git a/python/paddle/fluid/tests/unittests/test_new_group_api.py b/python/paddle/fluid/tests/unittests/test_new_group_api.py
index b9b80d3b431ea..af8df48ff23fb 100644
--- a/python/paddle/fluid/tests/unittests/test_new_group_api.py
+++ b/python/paddle/fluid/tests/unittests/test_new_group_api.py
@@ -23,6 +23,7 @@
 
 
 class TestCollectiveAllreduceAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py
index 53ade0dfb79c1..0143bdb53242c 100755
--- a/python/paddle/fluid/tests/unittests/test_newprofiler.py
+++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py
@@ -27,15 +27,18 @@
 
 
 class TestProfiler(unittest.TestCase):
+
     def test_profiler(self):
+
         def my_trace_back(prof):
             profiler.export_chrome_tracing('./test_profiler_chrometracing/')(
                 prof)
             profiler.export_protobuf('./test_profiler_pb/')(prof)
 
         x_value = np.random.randn(2, 3, 3)
-        x = paddle.to_tensor(
-            x_value, stop_gradient=False, place=paddle.CPUPlace())
+        x = paddle.to_tensor(x_value,
+                             stop_gradient=False,
+                             place=paddle.CPUPlace())
         y = x / 2.0
         ones_like_y = paddle.ones_like(y)
         with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof:
@@ -45,26 +48,27 @@ def my_trace_back(prof):
         with profiler.RecordEvent(name='test'):
             y = x / 2.0
 
-        with profiler.Profiler(
-                targets=[profiler.ProfilerTarget.CPU],
-                scheduler=(1, 2)) as prof:
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
+                               scheduler=(1, 2)) as prof:
             self.assertEqual(utils._is_profiler_used, True)
             with profiler.RecordEvent(name='test'):
                 y = x / 2.0
 
         prof = None
-        with profiler.Profiler(
-                targets=[profiler.ProfilerTarget.CPU],
-                scheduler=profiler.make_scheduler(
-                    closed=0, ready=1, record=1, repeat=1),
-                on_trace_ready=my_trace_back) as prof:
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
+                               scheduler=profiler.make_scheduler(closed=0,
+                                                                 ready=1,
+                                                                 record=1,
+                                                                 repeat=1),
+                               on_trace_ready=my_trace_back) as prof:
             y = x / 2.0
         prof = None
-        with profiler.Profiler(
-                targets=[profiler.ProfilerTarget.CPU],
-                scheduler=profiler.make_scheduler(
-                    closed=0, ready=0, record=2, repeat=1),
-                on_trace_ready=my_trace_back) as prof:
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
+                               scheduler=profiler.make_scheduler(closed=0,
+                                                                 ready=0,
+                                                                 record=2,
+                                                                 repeat=1),
+                               on_trace_ready=my_trace_back) as prof:
             for i in range(3):
                 y = x / 2.0
                 prof.step()
@@ -106,26 +110,26 @@ def my_sheduler1(num_step):
                 y = x / 2.0
                 prof.step()
         prof = None
-        with profiler.Profiler(
-                targets=[profiler.ProfilerTarget.CPU],
-                scheduler=my_sheduler,
-                on_trace_ready=my_trace_back) as prof:
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
+                               scheduler=my_sheduler,
+                               on_trace_ready=my_trace_back) as prof:
             for i in range(5):
                 y = x / 2.0
                 prof.step()
         prof = None
-        with profiler.Profiler(
-                targets=[profiler.ProfilerTarget.CPU],
-                scheduler=my_sheduler1) as prof:
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
+                               scheduler=my_sheduler1) as prof:
             for i in range(5):
                 y = x / 2.0
                 prof.step()
         prof = None
-        with profiler.Profiler(
-                targets=[profiler.ProfilerTarget.CPU],
-                scheduler=profiler.make_scheduler(
-                    closed=1, ready=1, record=2, repeat=1, skip_first=1),
-                on_trace_ready=my_trace_back) as prof:
+        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
+                               scheduler=profiler.make_scheduler(closed=1,
+                                                                 ready=1,
+                                                                 record=2,
+                                                                 repeat=1,
+                                                                 skip_first=1),
+                               on_trace_ready=my_trace_back) as prof:
             for i in range(5):
                 y = x / 2.0
                 paddle.grad(outputs=y, inputs=[x], grad_outputs=ones_like_y)
@@ -139,8 +143,11 @@ def my_sheduler1(num_step):
         simple_net = SimpleNet()
         opt = paddle.optimizer.SGD(learning_rate=1e-3,
                                    parameters=simple_net.parameters())
-        loader = DataLoader(
-            dataset, batch_size=4, shuffle=True, drop_last=True, num_workers=2)
+        loader = DataLoader(dataset,
+                            batch_size=4,
+                            shuffle=True,
+                            drop_last=True,
+                            num_workers=2)
         prof = profiler.Profiler(on_trace_ready=lambda prof: None)
         prof.start()
         for i, (image, label) in enumerate(loader()):
@@ -157,8 +164,8 @@ def my_sheduler1(num_step):
         dataset = RandomDataset(10 * 4)
         simple_net = SimpleNet()
         loader = DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)
-        opt = paddle.optimizer.Adam(
-            learning_rate=1e-3, parameters=simple_net.parameters())
+        opt = paddle.optimizer.Adam(learning_rate=1e-3,
+                                    parameters=simple_net.parameters())
         prof = profiler.Profiler(on_trace_ready=lambda prof: None)
         prof.start()
         for i, (image, label) in enumerate(loader()):
@@ -173,16 +180,19 @@ def my_sheduler1(num_step):
 
 
 class TestNvprof(unittest.TestCase):
+
     def test_nvprof(self):
         for i in range(10):
             paddle.fluid.profiler._nvprof_range(i, 10, 20)
             x_value = np.random.randn(2, 3, 3)
-            x = paddle.to_tensor(
-                x_value, stop_gradient=False, place=paddle.CPUPlace())
+            x = paddle.to_tensor(x_value,
+                                 stop_gradient=False,
+                                 place=paddle.CPUPlace())
             y = x / 2.0
 
 
 class TestGetProfiler(unittest.TestCase):
+
     def test_getprofiler(self):
         config_content = '''
         {
@@ -207,8 +217,9 @@ def test_getprofiler(self):
         import paddle.profiler.profiler as profiler
         profiler = profiler.get_profiler(filehandle.name)
         x_value = np.random.randn(2, 3, 3)
-        x = paddle.to_tensor(
-            x_value, stop_gradient=False, place=paddle.CPUPlace())
+        x = paddle.to_tensor(x_value,
+                             stop_gradient=False,
+                             place=paddle.CPUPlace())
         with profiler:
             for i in range(5):
                 y = x / 2.0
@@ -249,7 +260,7 @@ def test_getprofiler(self):
         except:
             pass
 
-        # test scheduler 
+        # test scheduler
         config_content = '''
         {
         "targets": ["Cpu", "Gpu"],
@@ -323,6 +334,7 @@ def test_getprofiler(self):
 
 
 class RandomDataset(Dataset):
+
     def __init__(self, num_samples):
         self.num_samples = num_samples
 
@@ -336,6 +348,7 @@ def __len__(self):
 
 
 class SimpleNet(nn.Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.fc = nn.Linear(100, 10)
@@ -345,18 +358,19 @@ def forward(self, image, label=None):
 
 
 class TestTimerOnly(unittest.TestCase):
+
     def test_with_dataloader(self):
+
         def train(step_num_samples=None):
             dataset = RandomDataset(20 * 4)
             simple_net = SimpleNet()
             opt = paddle.optimizer.SGD(learning_rate=1e-3,
                                        parameters=simple_net.parameters())
-            loader = DataLoader(
-                dataset,
-                batch_size=4,
-                shuffle=True,
-                drop_last=True,
-                num_workers=2)
+            loader = DataLoader(dataset,
+                                batch_size=4,
+                                shuffle=True,
+                                drop_last=True,
+                                num_workers=2)
             step_info = ''
             p = profiler.Profiler(timer_only=True)
             p.start()
diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py
index 05e7920035456..d7a7a25d7aebf 100755
--- a/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py
+++ b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py
@@ -20,6 +20,7 @@
 
 
 class TestStatisticHelper(unittest.TestCase):
+
     def test_sum_ranges_case1(self):
         src = [(1, 3), (4, 10), (11, 15)]
         self.assertEqual(statistic_helper.sum_ranges(src), 12)
diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py
index c53fdffe1cf1b..eb027951c5235 100644
--- a/python/paddle/fluid/tests/unittests/test_nll_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py
@@ -20,7 +20,10 @@
 from paddle.fluid.framework import _test_eager_guard
 
 
-def nll_loss_1d(logs, targets, weight=None, reduction='mean',
+def nll_loss_1d(logs,
+                targets,
+                weight=None,
+                reduction='mean',
                 ignore_index=-100):
     input_shape = logs.shape
     N = input_shape[0]
@@ -38,13 +41,16 @@ def nll_loss_1d(logs, targets, weight=None, reduction='mean',
     if reduction == 'sum':
         return np.sum(out), np.array([total_weight]).astype('float64')
     elif reduction == 'mean':
-        return out.sum() / total_weight, np.array(
-            [total_weight]).astype('float64')
+        return out.sum() / total_weight, np.array([total_weight
+                                                   ]).astype('float64')
     elif reduction == 'none':
         return out
 
 
-def nll_loss_2d(logs, targets, weight=None, reduction='mean',
+def nll_loss_2d(logs,
+                targets,
+                weight=None,
+                reduction='mean',
                 ignore_index=-100):
     input_shape = logs.shape
     N = input_shape[0]
@@ -65,13 +71,14 @@ def nll_loss_2d(logs, targets, weight=None, reduction='mean',
     if reduction == 'sum':
         return np.sum(out), np.array([total_weight]).astype('float64')
     elif reduction == 'mean':
-        return out.sum() / total_weight, np.array(
-            [total_weight]).astype('float64')
+        return out.sum() / total_weight, np.array([total_weight
+                                                   ]).astype('float64')
     elif reduction == 'none':
         return out
 
 
 class TestNLLLoss(unittest.TestCase):
+
     def test_NLLLoss_1D_mean(self):
         np.random.seed(200)
         input_np = np.random.random(size=(10, 10)).astype(np.float64)
@@ -79,8 +86,8 @@ def test_NLLLoss_1D_mean(self):
         label_np = np.random.randint(0, 10, size=(10, )).astype(np.int64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[10, 10], dtype='float64')
@@ -89,23 +96,24 @@ def test_NLLLoss_1D_mean(self):
             res = nll_loss(input, label)
 
             exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[res])
+            static_result = exe.run(prog,
+                                    feed={
+                                        "input": input_np,
+                                        "label": label_np
+                                    },
+                                    fetch_list=[res])
 
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss()
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         with fluid.dygraph.guard():
             with _test_eager_guard():
                 nll_loss = paddle.nn.loss.NLLLoss()
-                eager_res = nll_loss(
-                    paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+                eager_res = nll_loss(paddle.to_tensor(input_np),
+                                     paddle.to_tensor(label_np))
                 eager_result = eager_res.numpy()
 
         expected = nll_loss_1d(input_np, label_np)[0]
@@ -121,8 +129,8 @@ def test_NLLLoss_1D_sum(self):
         label_np = np.random.randint(0, 10, size=(10, )).astype(np.int64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[10, 10], dtype='float64')
@@ -131,16 +139,17 @@ def test_NLLLoss_1D_sum(self):
             res = nll_loss(input, label)
 
             exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[res])
+            static_result = exe.run(prog,
+                                    feed={
+                                        "input": input_np,
+                                        "label": label_np
+                                    },
+                                    fetch_list=[res])
 
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(reduction='sum')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
             with _test_eager_guard():
@@ -167,8 +176,8 @@ def test_NLLLoss_1D_with_weight_mean(self):
         weight_np = np.random.random(size=(10, )).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         # place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[10, 10], dtype='float64')
@@ -189,15 +198,15 @@ def test_NLLLoss_1D_with_weight_mean(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np))
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
             with _test_eager_guard():
                 nll_loss = paddle.nn.loss.NLLLoss(
                     weight=paddle.to_tensor(weight_np))
-                eager_res = nll_loss(
-                    paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+                eager_res = nll_loss(paddle.to_tensor(input_np),
+                                     paddle.to_tensor(label_np))
                 loss = eager_res.sum()
                 loss.backward()
                 eager_result = eager_res.numpy()
@@ -217,8 +226,8 @@ def test_NLLLoss_1D_with_weight_sum(self):
         weight_np = np.random.random(size=(10, )).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         # place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[10, 10], dtype='float64')
@@ -239,11 +248,13 @@ def test_NLLLoss_1D_with_weight_sum(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np), reduction='sum')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
-        expected = nll_loss_1d(
-            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        expected = nll_loss_1d(input_np,
+                               label_np,
+                               weight=weight_np,
+                               reduction='sum')[0]
 
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
@@ -277,8 +288,8 @@ def test_NLLLoss_1D_with_weight_mean_cpu(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np))
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
         expected = nll_loss_1d(input_np, label_np, weight=weight_np)[0]
 
@@ -314,11 +325,13 @@ def test_NLLLoss_1D_with_weight_no_reduce_cpu(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np), reduction='none')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
-        expected = nll_loss_1d(
-            input_np, label_np, weight=weight_np, reduction='none')
+        expected = nll_loss_1d(input_np,
+                               label_np,
+                               weight=weight_np,
+                               reduction='none')
 
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
@@ -331,27 +344,29 @@ def test_NLLLoss_2D_mean(self):
         label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
             nll_loss = paddle.nn.loss.NLLLoss()
             res = nll_loss(input, label)
 
             exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[res])
+            static_result = exe.run(prog,
+                                    feed={
+                                        "input": input_np,
+                                        "label": label_np
+                                    },
+                                    fetch_list=[res])
 
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss()
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         expected = nll_loss_2d(input_np, label_np)[0]
@@ -367,27 +382,29 @@ def test_NLLLoss_2D_sum(self):
         label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
             nll_loss = paddle.nn.loss.NLLLoss(reduction='sum')
             res = nll_loss(input, label)
 
             exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[res])
+            static_result = exe.run(prog,
+                                    feed={
+                                        "input": input_np,
+                                        "label": label_np
+                                    },
+                                    fetch_list=[res])
 
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(reduction='sum')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         expected = nll_loss_2d(input_np, label_np, reduction='sum')[0]
@@ -404,12 +421,13 @@ def test_NLLLoss_2D_with_weight_mean(self):
         weight_np = np.random.random(size=(3, )).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype='float64')
 
@@ -428,8 +446,8 @@ def test_NLLLoss_2D_with_weight_mean(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np))
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         expected = nll_loss_2d(input_np, label_np, weight=weight_np)[0]
@@ -448,8 +466,9 @@ def test_NLLLoss_2D_with_weight_mean_cpu(self):
         startup_prog = fluid.Program()
         place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype='float64')
 
@@ -468,8 +487,8 @@ def test_NLLLoss_2D_with_weight_mean_cpu(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np))
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         expected = nll_loss_2d(input_np, label_np, weight=weight_np)[0]
@@ -486,11 +505,12 @@ def test_NLLLoss_2D_with_weight_sum(self):
         weight_np = np.random.random(size=(3, )).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype='float64')
 
@@ -509,12 +529,14 @@ def test_NLLLoss_2D_with_weight_sum(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np), reduction='sum')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
-        expected = nll_loss_2d(
-            input_np, label_np, weight=weight_np, reduction='sum')[0]
+        expected = nll_loss_2d(input_np,
+                               label_np,
+                               weight=weight_np,
+                               reduction='sum')[0]
 
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
@@ -527,27 +549,29 @@ def test_NLLLoss_in_dims_not_2or4_mean(self):
         label_np = np.random.randint(0, 3, size=(5, 5, 5, 5)).astype(np.int64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5, 5], dtype='int64')
             nll_loss = paddle.nn.loss.NLLLoss()
             res = nll_loss(input, label)
 
             exe = fluid.Executor(place)
-            static_result = exe.run(
-                prog,
-                feed={"input": input_np,
-                      "label": label_np},
-                fetch_list=[res])
+            static_result = exe.run(prog,
+                                    feed={
+                                        "input": input_np,
+                                        "label": label_np
+                                    },
+                                    fetch_list=[res])
 
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss()
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         input_shape = input_np.shape
@@ -569,12 +593,13 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_mean(self):
         weight_np = np.random.random(size=(3, )).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5, 5], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype='float64')
             nll_loss = paddle.nn.loss.NLLLoss(weight=weight)
@@ -592,8 +617,8 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_mean(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np))
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         input_shape = input_np.shape
@@ -601,8 +626,9 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_mean(self):
         input_np_reshape = np.reshape(input_np,
                                       (input_shape[0], input_shape[1], 1, -1))
         label_np_reshape = np.reshape(label_np, (label_shape[0], 1, -1))
-        expected = nll_loss_2d(
-            input_np_reshape, label_np_reshape, weight=weight_np)[0]
+        expected = nll_loss_2d(input_np_reshape,
+                               label_np_reshape,
+                               weight=weight_np)[0]
 
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
@@ -616,12 +642,13 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_sum(self):
         weight_np = np.random.random(size=(3, )).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5, 5], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype='float64')
             nll_loss = paddle.nn.loss.NLLLoss(weight=weight, reduction='sum')
@@ -639,8 +666,8 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_sum(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np), reduction='sum')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         input_shape = input_np.shape
@@ -648,11 +675,10 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_sum(self):
         input_np_reshape = np.reshape(input_np,
                                       (input_shape[0], input_shape[1], 1, -1))
         label_np_reshape = np.reshape(label_np, (label_shape[0], 1, -1))
-        expected = nll_loss_2d(
-            input_np_reshape,
-            label_np_reshape,
-            weight=weight_np,
-            reduction='sum')[0]
+        expected = nll_loss_2d(input_np_reshape,
+                               label_np_reshape,
+                               weight=weight_np,
+                               reduction='sum')[0]
 
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
@@ -666,12 +692,13 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce(self):
         weight_np = np.random.random(size=(3, )).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         #place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5, 5], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype='float64')
             nll_loss = paddle.nn.loss.NLLLoss(weight=weight, reduction='none')
@@ -689,8 +716,8 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np), reduction='none')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         input_shape = input_np.shape
@@ -699,11 +726,10 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce(self):
         input_np_reshape = np.reshape(input_np,
                                       (input_shape[0], input_shape[1], 1, -1))
         label_np_reshape = np.reshape(label_np, (label_shape[0], 1, -1))
-        expected = nll_loss_2d(
-            input_np_reshape,
-            label_np_reshape,
-            weight=weight_np,
-            reduction='none')
+        expected = nll_loss_2d(input_np_reshape,
+                               label_np_reshape,
+                               weight=weight_np,
+                               reduction='none')
         expected = np.reshape(expected, out_shape)
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
@@ -719,8 +745,9 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce_cpu(self):
         startup_prog = fluid.Program()
         place = fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(
-                name='input', shape=[5, 3, 5, 5, 5], dtype='float64')
+            input = fluid.data(name='input',
+                               shape=[5, 3, 5, 5, 5],
+                               dtype='float64')
             label = fluid.data(name='label', shape=[5, 5, 5, 5], dtype='int64')
             weight = fluid.data(name='weight', shape=[3], dtype='float64')
             nll_loss = paddle.nn.loss.NLLLoss(weight=weight, reduction='none')
@@ -738,8 +765,8 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce_cpu(self):
         with fluid.dygraph.guard():
             nll_loss = paddle.nn.loss.NLLLoss(
                 weight=paddle.to_tensor(weight_np), reduction='none')
-            dy_res = nll_loss(
-                paddle.to_tensor(input_np), paddle.to_tensor(label_np))
+            dy_res = nll_loss(paddle.to_tensor(input_np),
+                              paddle.to_tensor(label_np))
             dy_result = dy_res.numpy()
 
         input_shape = input_np.shape
@@ -748,11 +775,10 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce_cpu(self):
         input_np_reshape = np.reshape(input_np,
                                       (input_shape[0], input_shape[1], 1, -1))
         label_np_reshape = np.reshape(label_np, (label_shape[0], 1, -1))
-        expected = nll_loss_2d(
-            input_np_reshape,
-            label_np_reshape,
-            weight=weight_np,
-            reduction='none')
+        expected = nll_loss_2d(input_np_reshape,
+                               label_np_reshape,
+                               weight=weight_np,
+                               reduction='none')
         expected = np.reshape(expected, out_shape)
         self.assertTrue(np.allclose(static_result, expected))
         self.assertTrue(np.allclose(static_result, dy_result))
@@ -760,6 +786,7 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce_cpu(self):
 
 
 class TestNLLLossOp1DWithReduce(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "nll_loss"
@@ -780,8 +807,9 @@ def setUp(self):
             np.random.seed(200)
             weight_np = np.random.uniform(0.1, 0.8,
                                           self.input_shape[1]).astype("float64")
-            output_np, total_weight_np = nll_loss_1d(
-                input_np, label_np, weight=weight_np)
+            output_np, total_weight_np = nll_loss_1d(input_np,
+                                                     label_np,
+                                                     weight=weight_np)
             self.inputs['Weight'] = weight_np
 
         self.outputs = {'Out': output_np, 'Total_weight': total_weight_np}
@@ -808,6 +836,7 @@ def init_test_case(self):
 
 
 class TestNLLLossOp1DNoReduce(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "nll_loss"
@@ -827,8 +856,10 @@ def setUp(self):
             np.random.seed(200)
             weight_np = np.random.uniform(0.1, 0.8,
                                           self.input_shape[1]).astype("float64")
-            output_np, total_weight_np = nll_loss_1d(
-                input_np, label_np, weight=weight_np, reduction='none')
+            output_np, total_weight_np = nll_loss_1d(input_np,
+                                                     label_np,
+                                                     weight=weight_np,
+                                                     reduction='none')
             self.inputs['Weight'] = weight_np
 
         self.outputs = {'Out': output_np, 'Total_weight': total_weight_np}
@@ -855,6 +886,7 @@ def init_test_case(self):
 
 
 class TestNLLLossOp2DWithReduce(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "nll_loss"
@@ -873,8 +905,9 @@ def setUp(self):
             np.random.seed(200)
             weight_np = np.random.uniform(0.1, 0.8,
                                           self.input_shape[1]).astype("float64")
-            output_np, total_weight_np = nll_loss_2d(
-                input_np, label_np, weight=weight_np)
+            output_np, total_weight_np = nll_loss_2d(input_np,
+                                                     label_np,
+                                                     weight=weight_np)
             self.inputs['Weight'] = weight_np
 
         self.outputs = {'Out': output_np, 'Total_weight': total_weight_np}
@@ -901,6 +934,7 @@ def init_test_case(self):
 
 
 class TestNLLLossOp2DNoReduce(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "nll_loss"
@@ -920,8 +954,10 @@ def setUp(self):
             np.random.seed(200)
             weight_np = np.random.uniform(0.1, 0.8,
                                           self.input_shape[1]).astype("float64")
-            output_np, total_weight_np = nll_loss_2d(
-                input_np, label_np, weight=weight_np, reduction='none')
+            output_np, total_weight_np = nll_loss_2d(input_np,
+                                                     label_np,
+                                                     weight=weight_np,
+                                                     reduction='none')
             self.inputs['Weight'] = weight_np
 
         self.outputs = {'Out': output_np, 'Total_weight': total_weight_np}
@@ -948,6 +984,7 @@ def init_test_case(self):
 
 
 class TestNLLLossName(unittest.TestCase):
+
     def test_name(self):
         prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
@@ -961,15 +998,22 @@ def test_name(self):
 
 
 class TestNLLLossInvalidArgs(unittest.TestCase):
+
     def test_x_dim_value_error(self):
+
         def test_x_dim_lt_2():
             prog = paddle.static.Program()
             startup_prog = paddle.static.Program()
             place = paddle.CPUPlace()
             with paddle.static.program_guard(prog, startup_prog):
-                x = paddle.fluid.data(name='x', shape=[10, ], dtype='float64')
-                label = paddle.fluid.data(
-                    name='label', shape=[10, ], dtype='float64')
+                x = paddle.fluid.data(name='x', shape=[
+                    10,
+                ], dtype='float64')
+                label = paddle.fluid.data(name='label',
+                                          shape=[
+                                              10,
+                                          ],
+                                          dtype='float64')
                 nll_loss = paddle.nn.loss.NLLLoss()
                 res = nll_loss(x, label)
 
@@ -987,14 +1031,16 @@ def test_x_dim_imperative_lt_2():
         self.assertRaises(ValueError, test_x_dim_imperative_lt_2)
 
     def test_reduction_value_error(self):
+
         def test_NLLLoss_reduction_not_sum_mean_none():
             prog = paddle.static.Program()
             startup_prog = paddle.static.Program()
             place = paddle.CPUPlace()
             with paddle.static.program_guard(prog, startup_prog):
                 x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64')
-                label = paddle.fluid.data(
-                    name='label', shape=[10], dtype='int64')
+                label = paddle.fluid.data(name='label',
+                                          shape=[10],
+                                          dtype='int64')
                 nll_loss = paddle.nn.loss.NLLLoss(reduction='')
                 res = nll_loss(x, label)
 
@@ -1018,8 +1064,9 @@ def test_nll_loss_function_reduction_not_sum_mean_none():
             place = paddle.CPUPlace()
             with paddle.static.program_guard(prog, startup_prog):
                 x = paddle.fluid.data(name='x', shape=[10, 10], dtype='float64')
-                label = paddle.fluid.data(
-                    name='label', shape=[10], dtype='int64')
+                label = paddle.fluid.data(name='label',
+                                          shape=[10],
+                                          dtype='int64')
                 res = paddle.nn.functional.nll_loss(x, label, reduction='')
 
         self.assertRaises(ValueError,
diff --git a/python/paddle/fluid/tests/unittests/test_nms_op.py b/python/paddle/fluid/tests/unittests/test_nms_op.py
index 1b5ac1f1337d0..f3c253d45c0de 100644
--- a/python/paddle/fluid/tests/unittests/test_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nms_op.py
@@ -68,6 +68,7 @@ def nms(boxes, nms_threshold):
 
 
 class TestNMSOp(OpTest):
+
     def setUp(self):
         self.op_type = 'nms'
         self.dtype = np.float64
diff --git a/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py b/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py
index 316063767771f..8dfaca25e2798 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py
@@ -24,6 +24,7 @@
 
 
 class TestDiceLossValue(unittest.TestCase):
+
     def test_dice_loss(self):
         input_ = paddle.rand([2, 3, num_classes])
         label_ = paddle.randint(0, num_classes, [2, 3, 1], dtype=paddle.int64)
@@ -41,19 +42,23 @@ def test_dice_loss(self):
 
 
 class TestDiceLossInvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_dtype():
             input_ = paddle.rand([2, 3, num_classes], dtype=paddle.float32)
-            label_ = paddle.randint(
-                0, num_classes, [2, 3, 1], dtype=paddle.int64)
+            label_ = paddle.randint(0,
+                                    num_classes, [2, 3, 1],
+                                    dtype=paddle.int64)
             nn.dice_loss(input_, label_.astype(paddle.float32))
 
         self.assertRaises(AssertionError, test_invalid_dtype)
 
         def test_zero_shape_input():
             input_ = paddle.rand([0, 3, num_classes], dtype=paddle.float32)
-            label_ = paddle.randint(
-                0, num_classes, [0, 3, 1], dtype=paddle.int64)
+            label_ = paddle.randint(0,
+                                    num_classes, [0, 3, 1],
+                                    dtype=paddle.int64)
             nn.dice_loss(input_, label_)
 
         self.assertRaises(AssertionError, test_zero_shape_input)
diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
index 0b5493e21705f..98e323c0d9e68 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
@@ -25,6 +25,7 @@
 
 
 class EmbeddingDygraph(unittest.TestCase):
+
     def func_1(self):
         x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64)
         paddle.disable_static(paddle.CPUPlace())
@@ -35,8 +36,8 @@ def func_1(self):
         w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32)
         embedding.weight.set_value(w0)
 
-        adam = paddle.optimizer.Adam(
-            parameters=[embedding.weight], learning_rate=0.01)
+        adam = paddle.optimizer.Adam(parameters=[embedding.weight],
+                                     learning_rate=0.01)
         adam.clear_grad()
 
         out = embedding(x)
diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
index 4af0cce12b733..62267bdf6f4fe 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
@@ -21,6 +21,7 @@
 
 
 class EmbeddingStatic(unittest.TestCase):
+
     def test_1(self):
         prog = fluid.Program()
         with fluid.program_guard(prog):
@@ -29,23 +30,24 @@ def test_bad_x():
                 initializer = fluid.initializer.NumpyArrayInitializer(
                     np.random.random(size=(128, 100)))
 
-                param_attr = fluid.ParamAttr(
-                    name="emb_weight",
-                    learning_rate=0.5,
-                    initializer=initializer,
-                    trainable=True)
+                param_attr = fluid.ParamAttr(name="emb_weight",
+                                             learning_rate=0.5,
+                                             initializer=initializer,
+                                             trainable=True)
 
-                weight = prog.global_block().create_parameter(
-                    (128, 100), attr=param_attr, dtype="float32")
+                weight = prog.global_block().create_parameter((128, 100),
+                                                              attr=param_attr,
+                                                              dtype="float32")
 
-                label = fluid.layers.data(
-                    name="label",
-                    shape=[4],
-                    append_batch_size=False,
-                    dtype="int64")
+                label = fluid.layers.data(name="label",
+                                          shape=[4],
+                                          append_batch_size=False,
+                                          dtype="int64")
 
-                emb = functional.embedding(
-                    x=label, weight=weight, sparse=True, name="embedding")
+                emb = functional.embedding(x=label,
+                                           weight=weight,
+                                           sparse=True,
+                                           name="embedding")
 
             test_bad_x()
 
@@ -57,27 +59,25 @@ def test_bad_x():
                 initializer = fluid.initializer.NumpyArrayInitializer(
                     np.random.random(size=(128, 100)))
 
-                param_attr = fluid.ParamAttr(
-                    name="emb_weight",
-                    learning_rate=0.5,
-                    initializer=initializer,
-                    trainable=True)
-
-                weight = prog.global_block().create_parameter(
-                    (128, 100), attr=param_attr, dtype="float32")
-
-                label = fluid.layers.data(
-                    name="label",
-                    shape=[4],
-                    append_batch_size=False,
-                    dtype="int32")
-
-                emb = functional.embedding(
-                    x=label,
-                    weight=weight,
-                    padding_idx=129,
-                    sparse=True,
-                    name="embedding")
+                param_attr = fluid.ParamAttr(name="emb_weight",
+                                             learning_rate=0.5,
+                                             initializer=initializer,
+                                             trainable=True)
+
+                weight = prog.global_block().create_parameter((128, 100),
+                                                              attr=param_attr,
+                                                              dtype="float32")
+
+                label = fluid.layers.data(name="label",
+                                          shape=[4],
+                                          append_batch_size=False,
+                                          dtype="int32")
+
+                emb = functional.embedding(x=label,
+                                           weight=weight,
+                                           padding_idx=129,
+                                           sparse=True,
+                                           name="embedding")
 
         with self.assertRaises(ValueError):
             test_bad_x()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py
index 9b7ba5c4b052f..f0a1bdc76d814 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py
@@ -27,6 +27,7 @@
 
 
 class TestOneHotOp(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -50,6 +51,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_attr(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -73,6 +75,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -96,6 +99,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -119,6 +123,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_exception(unittest.TestCase):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         self.depth = 10
@@ -134,18 +139,18 @@ def setUp(self):
     def test_check_output(self):
         program = Program()
         with program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            x = fluid.layers.data(name='x',
+                                  shape=[self.dimension],
+                                  dtype='float32',
+                                  lod_level=1)
             block = program.current_block()
-            one_hot_out = block.create_var(
-                name="one_hot_out",
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                dtype='float32')
-            block.append_op(
-                type='one_hot',
-                inputs={'X': x},
-                attrs={'depth': self.depth},
-                outputs={'Out': one_hot_out})
+            one_hot_out = block.create_var(name="one_hot_out",
+                                           type=core.VarDesc.VarType.LOD_TENSOR,
+                                           dtype='float32')
+            block.append_op(type='one_hot',
+                            inputs={'X': x},
+                            attrs={'depth': self.depth},
+                            outputs={'Out': one_hot_out})
             exe = fluid.Executor(self.place)
 
             def run():
@@ -157,6 +162,7 @@ def run():
 
 
 class TestOneHotOpApi(unittest.TestCase):
+
     def test_api(self):
         num_classes = 10
         self._run(num_classes)
@@ -167,9 +173,9 @@ def test_api_with_depthTensor(self):
 
     def test_api_with_dygraph(self):
         num_classes = 10
-        label = np.array(
-            [np.random.randint(0, num_classes - 1)
-             for i in range(6)]).reshape([6, 1])
+        label = np.array([
+            np.random.randint(0, num_classes - 1) for i in range(6)
+        ]).reshape([6, 1])
         with fluid.dygraph.guard():
             one_hot_label = functional.one_hot(
                 x=fluid.dygraph.to_variable(label), num_classes=num_classes)
@@ -184,21 +190,23 @@ def _run(self, num_classes):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'label': label_data, },
+        ret = exe.run(feed={
+            'label': label_data,
+        },
                       fetch_list=[one_hot_label],
                       return_numpy=False)
 
 
 class BadInputTestOnehotV2(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
             def test_bad_x():
-                label = fluid.layers.data(
-                    name="label",
-                    shape=[4],
-                    append_batch_size=False,
-                    dtype="float32")
+                label = fluid.layers.data(name="label",
+                                          shape=[4],
+                                          append_batch_size=False,
+                                          dtype="float32")
                 one_hot_label = functional.one_hot(x=label, num_classes=4)
 
             self.assertRaises(TypeError, test_bad_x)
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index 4685b00b394b7..c168b827e84a1 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -23,26 +23,33 @@
 import paddle.fluid.core as core
 import gradient_checker
 from decorator_helper import prog_scope
+
 paddle.enable_static()
 
 
 class TestSliceOpDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         self.config()
 
-        out = fluid.layers.slice(
-            self.inputs, axes=self.axes, starts=self.starts, ends=self.ends)
-        gradient_checker.double_grad_check(
-            [self.inputs], out, x_init=self.x_arr, place=place)
+        out = fluid.layers.slice(self.inputs,
+                                 axes=self.axes,
+                                 starts=self.starts,
+                                 ends=self.ends)
+        gradient_checker.double_grad_check([self.inputs],
+                                           out,
+                                           x_init=self.x_arr,
+                                           place=place)
 
     def config(self):
         self.starts = [1, 0, -1]
         self.ends = [3, 3, 6]
         self.axes = [0, 1, 2]
         self.x_arr = np.random.random([3, 4, 5, 2]).astype("float64")
-        self.inputs = layers.create_parameter(
-            dtype="float64", shape=[3, 4, 5, 2], name='x')
+        self.inputs = layers.create_parameter(dtype="float64",
+                                              shape=[3, 4, 5, 2],
+                                              name='x')
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -53,16 +60,19 @@ def test_grad(self):
 
 
 class TestSliceOpDoubleGradCheckCase3(TestSliceOpDoubleGradCheck):
+
     def config(self):
         self.starts = [1, -1, 1]
         self.ends = [3, 3, 3]
         self.axes = [0, 1, 2]
         self.x_arr = np.random.random([3, 3, 3]).astype("float64")
-        self.inputs = layers.create_parameter(
-            dtype="float64", shape=[3, 3, 3], name='x3')
+        self.inputs = layers.create_parameter(dtype="float64",
+                                              shape=[3, 3, 3],
+                                              name='x3')
 
 
 class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [7, 11]
@@ -74,8 +84,11 @@ def func(self, place):
         y = layers.reduce_mean(x, dim=0)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -86,6 +99,7 @@ def test_grad(self):
 
 
 class TestReduceSumWithDimDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         shape = [7, 11]
@@ -97,8 +111,11 @@ def func(self, place):
         y = layers.reduce_sum(x, dim=0)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], y, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -109,6 +126,7 @@ def test_grad(self):
 
 
 class TestReshapeDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         x_shape = [3, 12]
@@ -121,8 +139,11 @@ def func(self, place):
         out = layers.expand(x, expand_times)
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], out, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x],
+                                           out,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -133,6 +154,7 @@ def test_grad(self):
 
 
 class TestExpandDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         x_shape = [3, 12]
@@ -145,8 +167,11 @@ def func(self, place):
         out = layers.reshape(x, new_shape)
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], out, x_init=x_arr, place=place, eps=eps)
+        gradient_checker.double_grad_check([x],
+                                           out,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -157,6 +182,7 @@ def test_grad(self):
 
 
 class TestTileDoubleGradCheck(unittest.TestCase):
+
     def tile_wrapper(self, x):
         return paddle.tile(x[0], [4, 9])
 
@@ -172,10 +198,15 @@ def func(self, place):
         out = paddle.tile(x, repeat_times)
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], out, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.tile_wrapper, [x], out, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           out,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.tile_wrapper, [x],
+                                                       out,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -186,6 +217,7 @@ def test_grad(self):
 
 
 class TestExpandV2DoubleGradCheck(unittest.TestCase):
+
     def expand_wrapper(self, x):
         return paddle.expand(x[0], [4, 12])
 
@@ -201,10 +233,15 @@ def func(self, place):
         out = paddle.expand(x, new_shape)
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], out, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.expand_wrapper, [x], out, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           out,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.expand_wrapper, [x],
+                                                       out,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -215,6 +252,7 @@ def test_grad(self):
 
 
 class TestSqueezeDoubleGradCheck(unittest.TestCase):
+
     def squeeze_warpper(self, x):
         axes = [0, 2]
         return paddle.squeeze(x[0], axes)
@@ -231,10 +269,16 @@ def func(self, place):
         out = paddle.squeeze(x, axes)
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], out, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.squeeze_warpper, [x], out, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           out,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.squeeze_warpper,
+                                                       [x],
+                                                       out,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -245,6 +289,7 @@ def test_grad(self):
 
 
 class TestUnsqueezeDoubleGradCheck(unittest.TestCase):
+
     def unsqueeze_wrapper(self, x):
         axes = [1, 2]
         return paddle.unsqueeze(x[0], axes)
@@ -261,10 +306,16 @@ def func(self, place):
         out = paddle.unsqueeze(x, axes)
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], out, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.unsqueeze_wrapper, [x], out, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           out,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.unsqueeze_wrapper,
+                                                       [x],
+                                                       out,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -275,6 +326,7 @@ def test_grad(self):
 
 
 class TestClipDoubleGradCheck(unittest.TestCase):
+
     def clip_wrapper(self, x):
         return paddle.clip(x[0], min=-1., max=1.)
 
@@ -289,8 +341,10 @@ def func(self, place):
         x_arr = np.random.uniform(-5., 5., x_shape).astype(dtype)
 
         gradient_checker.double_grad_check([x], out, x_init=x_arr, place=place)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.clip_wrapper, [x], out, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check_for_dygraph(self.clip_wrapper, [x],
+                                                       out,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -301,6 +355,7 @@ def test_grad(self):
 
 
 class TestTransposeDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         x_shape = [3, 40]
@@ -323,6 +378,7 @@ def test_grad(self):
 
 
 class TestTransposeDoubleGradCheckCase1(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         x_shape = [2, 3, 4, 5]
@@ -345,6 +401,7 @@ def test_grad(self):
 
 
 class TestConstantPadDoubleGradCheck(unittest.TestCase):
+
     def pad_wrapper(self, x):
         pad = [1, 1, 1, 1]
         return paddle.nn.functional.pad(x[0], pad)
@@ -361,10 +418,15 @@ def func(self, place):
         out = paddle.nn.functional.pad(x, pad)
         x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x], out, x_init=x_arr, place=place, eps=eps)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.pad_wrapper, [x], out, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([x],
+                                           out,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=eps)
+        gradient_checker.double_grad_check_for_dygraph(self.pad_wrapper, [x],
+                                                       out,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -375,6 +437,7 @@ def test_grad(self):
 
 
 class TestConstantPadDoubleGradCheckCase1(TestConstantPadDoubleGradCheck):
+
     @prog_scope()
     def func(self, place):
         x_shape = [2, 3, 4, 5]
@@ -390,6 +453,7 @@ def func(self, place):
 
 
 class TestConcatDoubleGradCheck(unittest.TestCase):
+
     def concat_wrapper(self, x):
         return paddle.concat(x, axis=0)
 
@@ -407,13 +471,15 @@ def func(self, place):
         x2_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
         x1_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
 
-        gradient_checker.double_grad_check(
-            [x1, x2], out, x_init=[x1_arr, x2_arr], place=place)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.concat_wrapper, [x1, x2],
-            out,
-            x_init=[x1_arr, x2_arr],
-            place=place)
+        gradient_checker.double_grad_check([x1, x2],
+                                           out,
+                                           x_init=[x1_arr, x2_arr],
+                                           place=place)
+        gradient_checker.double_grad_check_for_dygraph(self.concat_wrapper,
+                                                       [x1, x2],
+                                                       out,
+                                                       x_init=[x1_arr, x2_arr],
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -424,20 +490,23 @@ def test_grad(self):
 
 
 class TestAvgPool2DDoubleGradCheckCase1(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
-        input_NCHW = fluid.layers.data(
-            name="input_NCHW",
-            shape=[2, 3, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
+        input_NCHW = fluid.layers.data(name="input_NCHW",
+                                       shape=[2, 3, 5, 5],
+                                       append_batch_size=False,
+                                       dtype="float32")
 
         input_NCHW.persistable = True
         y = layers.pool2d(input_NCHW, pool_size=2, pool_type="avg")
         x_arr = np.random.uniform(-1, 1, [2, 3, 5, 5]).astype(np.float32)
 
-        gradient_checker.double_grad_check(
-            [input_NCHW], y, x_init=x_arr, place=place, eps=0.05)
+        gradient_checker.double_grad_check([input_NCHW],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=0.05)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -448,28 +517,36 @@ def test_grad(self):
 
 
 class TestAvgPool2DDoubleGradCheckCase2(unittest.TestCase):
+
     def pool2d_wrapper(self, x):
-        return paddle.nn.functional.avg_pool2d(
-            x[0], kernel_size=2, data_format="NHWC")
+        return paddle.nn.functional.avg_pool2d(x[0],
+                                               kernel_size=2,
+                                               data_format="NHWC")
 
     @prog_scope()
     def func(self, place):
-        input_NHWC = fluid.layers.data(
-            name="input_NHWC",
-            shape=[2, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
+        input_NHWC = fluid.layers.data(name="input_NHWC",
+                                       shape=[2, 5, 5, 3],
+                                       append_batch_size=False,
+                                       dtype="float32")
 
         input_NHWC.persistable = True
-        y = paddle.nn.functional.avg_pool2d(
-            input_NHWC, kernel_size=2, data_format="NHWC")
+        y = paddle.nn.functional.avg_pool2d(input_NHWC,
+                                            kernel_size=2,
+                                            data_format="NHWC")
         x_arr = np.random.uniform(-1, 1, [2, 5, 5, 3]).astype(np.float32)
 
-        gradient_checker.double_grad_check(
-            [input_NHWC], y, x_init=x_arr, place=place, eps=0.05)
+        gradient_checker.double_grad_check([input_NHWC],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=0.05)
 
-        gradient_checker.double_grad_check_for_dygraph(
-            self.pool2d_wrapper, [input_NHWC], y, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check_for_dygraph(self.pool2d_wrapper,
+                                                       [input_NHWC],
+                                                       y,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -480,27 +557,35 @@ def test_grad(self):
 
 
 class TestAvgPool2DDoubleGradCheckCase3(unittest.TestCase):
+
     def pool2d_wrapper(self, x):
-        return paddle.nn.functional.avg_pool2d(
-            x[0], kernel_size=2, padding=[1, 1])
+        return paddle.nn.functional.avg_pool2d(x[0],
+                                               kernel_size=2,
+                                               padding=[1, 1])
 
     @prog_scope()
     def func(self, place):
-        input_NCHW = fluid.layers.data(
-            name="input_NCHW",
-            shape=[2, 3, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
+        input_NCHW = fluid.layers.data(name="input_NCHW",
+                                       shape=[2, 3, 5, 5],
+                                       append_batch_size=False,
+                                       dtype="float32")
 
         input_NCHW.persistable = True
-        y = paddle.nn.functional.avg_pool2d(
-            input_NCHW, kernel_size=2, padding=[1, 1])
+        y = paddle.nn.functional.avg_pool2d(input_NCHW,
+                                            kernel_size=2,
+                                            padding=[1, 1])
         x_arr = np.random.uniform(-1, 1, [2, 3, 5, 5]).astype(np.float32)
 
-        gradient_checker.double_grad_check(
-            [input_NCHW], y, x_init=x_arr, place=place, eps=0.05)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.pool2d_wrapper, [input_NCHW], y, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([input_NCHW],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=0.05)
+        gradient_checker.double_grad_check_for_dygraph(self.pool2d_wrapper,
+                                                       [input_NCHW],
+                                                       y,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -511,26 +596,32 @@ def test_grad(self):
 
 
 class TestAvgPool2DDoubleGradCheckCase4(unittest.TestCase):
+
     def pool2d_wrapper(self, x):
         return paddle.nn.functional.avg_pool2d(x[0], kernel_size=[4, 4])
 
     @prog_scope()
     def func(self, place):
-        input_NCHW = fluid.layers.data(
-            name="input_NCHW",
-            shape=[2, 3, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
+        input_NCHW = fluid.layers.data(name="input_NCHW",
+                                       shape=[2, 3, 5, 5],
+                                       append_batch_size=False,
+                                       dtype="float32")
 
         input_NCHW.persistable = True
         y = layers.pool2d(input_NCHW, pool_size=[4, 4], pool_type="avg")
         y = paddle.nn.functional.avg_pool2d(input_NCHW, kernel_size=[4, 4])
         x_arr = np.random.uniform(-1, 1, [2, 3, 5, 5]).astype(np.float32)
 
-        gradient_checker.double_grad_check(
-            [input_NCHW], y, x_init=x_arr, place=place, eps=0.05)
-        gradient_checker.double_grad_check_for_dygraph(
-            self.pool2d_wrapper, [input_NCHW], y, x_init=x_arr, place=place)
+        gradient_checker.double_grad_check([input_NCHW],
+                                           y,
+                                           x_init=x_arr,
+                                           place=place,
+                                           eps=0.05)
+        gradient_checker.double_grad_check_for_dygraph(self.pool2d_wrapper,
+                                                       [input_NCHW],
+                                                       y,
+                                                       x_init=x_arr,
+                                                       place=place)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
index 8ee3b2ac20320..2fb1c92330be5 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
@@ -34,12 +34,14 @@ def calc_margin_rank_loss(x, y, label, margin=0.0, reduction='none'):
 
 
 def create_test_case(margin, reduction):
+
     class MarginRankingLossCls(unittest.TestCase):
+
         def setUp(self):
             self.x_data = np.random.rand(10, 10).astype("float64")
             self.y_data = np.random.rand(10, 10).astype("float64")
-            self.label_data = np.random.choice(
-                [-1, 1], size=[10, 10]).astype("float64")
+            self.label_data = np.random.choice([-1, 1],
+                                               size=[10, 10]).astype("float64")
             self.places = []
             self.places.append(fluid.CPUPlace())
             if core.is_compiled_with_cuda():
@@ -47,19 +49,21 @@ def setUp(self):
 
         def run_static_functional_api(self, place):
             paddle.enable_static()
-            expected = calc_margin_rank_loss(
-                self.x_data,
-                self.y_data,
-                self.label_data,
-                margin=margin,
-                reduction=reduction)
+            expected = calc_margin_rank_loss(self.x_data,
+                                             self.y_data,
+                                             self.label_data,
+                                             margin=margin,
+                                             reduction=reduction)
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name="x", shape=[10, 10], dtype="float64")
-                y = paddle.static.data(
-                    name="y", shape=[10, 10], dtype="float64")
-                label = paddle.static.data(
-                    name="label", shape=[10, 10], dtype="float64")
+                x = paddle.static.data(name="x",
+                                       shape=[10, 10],
+                                       dtype="float64")
+                y = paddle.static.data(name="y",
+                                       shape=[10, 10],
+                                       dtype="float64")
+                label = paddle.static.data(name="label",
+                                           shape=[10, 10],
+                                           dtype="float64")
                 result = paddle.nn.functional.margin_ranking_loss(
                     x, y, label, margin, reduction)
                 exe = paddle.static.Executor(place)
@@ -73,19 +77,21 @@ def run_static_functional_api(self, place):
 
         def run_static_api(self, place):
             paddle.enable_static()
-            expected = calc_margin_rank_loss(
-                self.x_data,
-                self.y_data,
-                self.label_data,
-                margin=margin,
-                reduction=reduction)
+            expected = calc_margin_rank_loss(self.x_data,
+                                             self.y_data,
+                                             self.label_data,
+                                             margin=margin,
+                                             reduction=reduction)
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name="x", shape=[10, 10], dtype="float64")
-                y = paddle.static.data(
-                    name="y", shape=[10, 10], dtype="float64")
-                label = paddle.static.data(
-                    name="label", shape=[10, 10], dtype="float64")
+                x = paddle.static.data(name="x",
+                                       shape=[10, 10],
+                                       dtype="float64")
+                y = paddle.static.data(name="y",
+                                       shape=[10, 10],
+                                       dtype="float64")
+                label = paddle.static.data(name="label",
+                                           shape=[10, 10],
+                                           dtype="float64")
                 margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                     margin=margin, reduction=reduction)
                 result = margin_rank_loss(x, y, label)
@@ -105,14 +111,13 @@ def run_dynamic_functional_api(self, place):
             y = paddle.to_tensor(self.y_data)
             label = paddle.to_tensor(self.label_data)
 
-            result = paddle.nn.functional.margin_ranking_loss(x, y, label,
-                                                              margin, reduction)
-            expected = calc_margin_rank_loss(
-                self.x_data,
-                self.y_data,
-                self.label_data,
-                margin=margin,
-                reduction=reduction)
+            result = paddle.nn.functional.margin_ranking_loss(
+                x, y, label, margin, reduction)
+            expected = calc_margin_rank_loss(self.x_data,
+                                             self.y_data,
+                                             self.label_data,
+                                             margin=margin,
+                                             reduction=reduction)
             self.assertTrue(np.allclose(result.numpy(), expected))
 
         def run_dynamic_api(self, place):
@@ -123,12 +128,11 @@ def run_dynamic_api(self, place):
             margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                 margin=margin, reduction=reduction)
             result = margin_rank_loss(x, y, label)
-            expected = calc_margin_rank_loss(
-                self.x_data,
-                self.y_data,
-                self.label_data,
-                margin=margin,
-                reduction=reduction)
+            expected = calc_margin_rank_loss(self.x_data,
+                                             self.y_data,
+                                             self.label_data,
+                                             margin=margin,
+                                             reduction=reduction)
             self.assertTrue(np.allclose(result.numpy(), expected))
 
         def run_dynamic_broadcast_api(self, place):
@@ -140,12 +144,11 @@ def run_dynamic_broadcast_api(self, place):
             margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                 margin=margin, reduction=reduction)
             result = margin_rank_loss(x, y, label)
-            expected = calc_margin_rank_loss(
-                self.x_data,
-                self.y_data,
-                label_data,
-                margin=margin,
-                reduction=reduction)
+            expected = calc_margin_rank_loss(self.x_data,
+                                             self.y_data,
+                                             label_data,
+                                             margin=margin,
+                                             reduction=reduction)
             self.assertTrue(np.allclose(result.numpy(), expected))
 
         def test_case(self):
@@ -171,6 +174,7 @@ class MarginRakingLossError(unittest.TestCase):
     paddle.enable_static()
 
     def test_errors(self):
+
         def test_margin_value_error():
             margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                 margin=0.1, reduction="reduce_mean")
@@ -180,8 +184,9 @@ def test_margin_value_error():
         def test_functional_margin_value_error():
             x = paddle.static.data(name="x", shape=[10, 10], dtype="float64")
             y = paddle.static.data(name="y", shape=[10, 10], dtype="float64")
-            label = paddle.static.data(
-                name="label", shape=[10, 10], dtype="float64")
+            label = paddle.static.data(name="label",
+                                       shape=[10, 10],
+                                       dtype="float64")
             result = paddle.nn.functional.margin_ranking_loss(
                 x, y, label, margin=0.1, reduction="reduction_mean")
 
diff --git a/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py b/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py
index 6dbabda1f4c34..a83cf1249458e 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py
@@ -23,10 +23,12 @@
 import paddle.fluid.core as core
 import gradient_checker
 from decorator_helper import prog_scope
+
 paddle.enable_static()
 
 
 class TestMatmulDoubleGradCheck(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -41,17 +43,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
 
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -62,6 +72,7 @@ def test_grad(self):
 
 
 class TestMatmulDoubleGradCheckCase1(TestMatmulDoubleGradCheck):
+
     def init_test(self):
         self.x_shape = [2, 3]
         self.y_shape = [3, 2]
@@ -77,6 +88,7 @@ def test_grad(self):
 
 
 class TestMatmulDoubleGradCheck2(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -91,17 +103,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
 
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -112,6 +132,7 @@ def test_grad(self):
 
 
 class TestMatmulDoubleGradCheckCase3(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -126,17 +147,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
 
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.double_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -147,6 +176,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckDotCase(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -163,16 +193,21 @@ def func(self, place):
     eps = 0.005
     dtype = np.float64
     typename = "float64"
-    x = paddle.static.create_parameter(
-        dtype=typename, shape=self.x_shape, name='x')
-    y = paddle.static.create_parameter(
-        dtype=typename, shape=self.y_shape, name='y')
+    x = paddle.static.create_parameter(dtype=typename,
+                                       shape=self.x_shape,
+                                       name='x')
+    y = paddle.static.create_parameter(dtype=typename,
+                                       shape=self.y_shape,
+                                       name='y')
     out = paddle.matmul(x, y, self.transpose_x, self.transpose_y, name='out')
     np.random.seed(2021)
     x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
     y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-    gradient_checker.triple_grad_check(
-        [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+    gradient_checker.triple_grad_check([x, y],
+                                       out,
+                                       x_init=[x_arr, y_arr],
+                                       place=place,
+                                       eps=eps)
 
 
 def test_grad(self):
@@ -184,6 +219,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckNormalCase1(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -198,17 +234,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -219,6 +263,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckNormalCase2(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -233,17 +278,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -254,6 +307,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckNormalCase3(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -268,17 +322,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -289,6 +351,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckNormalCase4(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -303,17 +366,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -324,6 +395,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckBroadcastCase1(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -338,17 +410,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -359,6 +439,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckBroadcastCase2(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -373,17 +454,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -394,6 +483,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckBroadcastCase3(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -408,17 +498,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -429,6 +527,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckBroadcastCase4(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -443,17 +542,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -464,6 +571,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckBroadcastCase5(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -478,17 +586,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -499,6 +615,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckSpecialCase1(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -513,17 +630,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
@@ -534,6 +659,7 @@ def test_grad(self):
 
 
 class TestMatmulTripleGradCheckSpecialCase2(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -548,17 +674,25 @@ def func(self, place):
         eps = 0.005
         dtype = np.float64
         typename = "float64"
-        x = paddle.static.create_parameter(
-            dtype=typename, shape=self.x_shape, name='x')
-        y = paddle.static.create_parameter(
-            dtype=typename, shape=self.y_shape, name='y')
-        out = paddle.matmul(
-            x, y, self.transpose_x, self.transpose_y, name='out')
+        x = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.x_shape,
+                                           name='x')
+        y = paddle.static.create_parameter(dtype=typename,
+                                           shape=self.y_shape,
+                                           name='y')
+        out = paddle.matmul(x,
+                            y,
+                            self.transpose_x,
+                            self.transpose_y,
+                            name='out')
         np.random.seed(2021)
         x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
         y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
-        gradient_checker.triple_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+        gradient_checker.triple_grad_check([x, y],
+                                           out,
+                                           x_init=[x_arr, y_arr],
+                                           place=place,
+                                           eps=eps)
 
     def test_grad(self):
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
index 90132a0923df7..170b916941d36 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
@@ -26,6 +26,7 @@
 
 
 class TestNNSigmoidAPI(unittest.TestCase):
+
     def setUp(self):
         self.init_data()
 
@@ -71,6 +72,7 @@ def test_check_api(self):
 
 
 class TestNNFunctionalSigmoidAPI(unittest.TestCase):
+
     def setUp(self):
         self.init_data()
 
diff --git a/python/paddle/fluid/tests/unittests/test_nonzero_api.py b/python/paddle/fluid/tests/unittests/test_nonzero_api.py
index 8569be82db09e..b107823277ecc 100644
--- a/python/paddle/fluid/tests/unittests/test_nonzero_api.py
+++ b/python/paddle/fluid/tests/unittests/test_nonzero_api.py
@@ -23,6 +23,7 @@
 
 
 class TestNonZeroAPI(unittest.TestCase):
+
     def test_nonzero_api_as_tuple(self):
         data = np.array([[True, False], [False, True]])
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index 5b0a9599bf84e..c65bff3a7bb39 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -35,9 +35,9 @@ def p_norm_python_api(x,
         return _C_ops.final_state_p_norm(x, p, axis, epsilon, keepdim,
                                          as_vector)
     if _in_legacy_dygraph():
-        return _C_ops.p_norm(x, 'axis', axis, 'porder',
-                             float(p), 'keepdim', keepdim, 'epsilon', epsilon,
-                             'as_vector', as_vector)
+        return _C_ops.p_norm(x, 'axis', axis, 'porder', float(p), 'keepdim',
+                             keepdim, 'epsilon', epsilon, 'as_vector',
+                             as_vector)
 
 
 def p_norm(x, axis, porder, keepdims=False, reduce_all=False):
@@ -81,8 +81,8 @@ def p_norm(x, axis, porder, keepdims=False, reduce_all=False):
 def frobenius_norm(x, axis=None, keepdims=False):
     if isinstance(axis, list): axis = tuple(axis)
     if axis is None: x = x.reshape(1, x.size)
-    r = np.linalg.norm(
-        x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype)
+    r = np.linalg.norm(x, ord='fro', axis=axis,
+                       keepdims=keepdims).astype(x.dtype)
     return r
 
 
@@ -91,6 +91,7 @@ def final_state_frobenius_norm(x, dim, keep_dim, reduce_all):
 
 
 class TestFrobeniusNormOp(OpTest):
+
     def setUp(self):
         self.python_api = final_state_frobenius_norm
         self.op_type = "frobenius_norm"
@@ -120,6 +121,7 @@ def init_test_case(self):
 
 
 class TestFrobeniusNormOp2(TestFrobeniusNormOp):
+
     def init_test_case(self):
         self.shape = [5, 5, 5]
         self.axis = (0, 1)
@@ -131,6 +133,7 @@ def test_check_grad(self):
 
 
 class TestPnormOp(OpTest):
+
     def setUp(self):
         self.op_type = "p_norm"
         self.python_api = p_norm_python_api
@@ -180,14 +183,20 @@ def calc_gradient(self):
         if porder == 0:
             grad = np.zeros(x.shape).astype(x.dtype)
         elif porder in [float("inf"), float("-inf")]:
-            norm = p_norm(
-                x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
+            norm = p_norm(x,
+                          axis=axis,
+                          porder=porder,
+                          keepdims=True,
+                          reduce_all=asvector)
             x_abs = np.abs(x)
             grad = np.sign(x)
             grad[x_abs != norm] = 0.0
         else:
-            norm = p_norm(
-                x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
+            norm = p_norm(x,
+                          axis=axis,
+                          porder=porder,
+                          keepdims=True,
+                          reduce_all=asvector)
             grad = np.power(norm, 1 - porder) * np.power(
                 np.abs(x), porder - 1) * np.sign(x)
 
@@ -200,6 +209,7 @@ def calc_gradient(self):
 
 
 class TestPnormOp2(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -214,6 +224,7 @@ def test_check_grad(self):
 
 
 class TestPnormOp3(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -228,6 +239,7 @@ def test_check_grad(self):
 
 
 class TestPnormOp4(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -242,6 +254,7 @@ def test_check_grad(self):
 
 
 class TestPnormOp5(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = 2
@@ -256,6 +269,7 @@ def test_check_grad(self):
 
 
 class TestPnormOp6(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [3, 20, 3]
         self.axis = -1
@@ -272,6 +286,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestPnormOpFP16(TestPnormOp):
+
     def init_test_case(self):
         self.shape = [2, 3, 4, 5]
         self.axis = 1
@@ -289,13 +304,15 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['X'], 'Out', user_defined_grads=self.gradient)
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       user_defined_grads=self.gradient)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestPnormOpFP161(TestPnormOpFP16):
+
     def init_test_case(self):
         self.shape = [2, 3, 4, 5]
         self.axis = -1
@@ -309,6 +326,7 @@ def init_test_case(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestPnormBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "p_norm"
         self.python_api = p_norm_python_api
@@ -333,11 +351,10 @@ def test_check_output(self):
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'],
-            'Out',
-            user_defined_grads=self.gradient,
-            check_eager=True)
+        self.check_grad_with_place(place, ['X'],
+                                   'Out',
+                                   user_defined_grads=self.gradient,
+                                   check_eager=True)
 
     def init_test_case(self):
         self.shape = [2, 3, 4, 5]
@@ -365,14 +382,20 @@ def calc_gradient(self):
         if porder == 0:
             grad = np.zeros(x.shape).astype(x.dtype)
         elif porder in [float("inf"), float("-inf")]:
-            norm = p_norm(
-                x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
+            norm = p_norm(x,
+                          axis=axis,
+                          porder=porder,
+                          keepdims=True,
+                          reduce_all=asvector)
             x_abs = np.abs(x)
             grad = np.sign(x)
             grad[x_abs != norm] = 0.0
         else:
-            norm = p_norm(
-                x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
+            norm = p_norm(x,
+                          axis=axis,
+                          porder=porder,
+                          keepdims=True,
+                          reduce_all=asvector)
             grad = np.power(norm, 1 - porder) * np.power(
                 np.abs(x), porder - 1) * np.sign(x)
 
@@ -407,8 +430,10 @@ def run_pnorm(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
-        expected_result = p_norm(
-            np_input, porder=p, axis=axis, keepdims=keep_dim).astype(dtype)
+        expected_result = p_norm(np_input,
+                                 porder=p,
+                                 axis=axis,
+                                 keepdims=keep_dim).astype(dtype)
         result, = exe.run(feed={"X": np_input}, fetch_list=[out])
     self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
     if keep_dim and check_dim:
@@ -450,134 +475,119 @@ def run_graph(self, p, axis, shape_x, dtype):
 
 
 class API_NormTest(unittest.TestCase):
+
     def test_basic(self):
         keep_dims = {False, True}
         for keep in keep_dims:
-            run_fro(
-                self,
-                p='fro',
-                axis=None,
-                shape_x=[2, 3, 4],
-                dtype="float32",
-                keep_dim=keep)
-            run_fro(
-                self,
-                p='fro',
-                axis=[0, 1],
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=2,
-                axis=None,
-                shape_x=[3, 4],
-                dtype="float32",
-                keep_dim=keep)
-            run_pnorm(
-                self,
-                p=2,
-                axis=1,
-                shape_x=[3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=np.inf,
-                axis=0,
-                shape_x=[2, 3, 4],
-                dtype="float32",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=np.inf,
-                axis=None,
-                shape_x=[2, 3, 4],
-                dtype="float32",
-                keep_dim=keep)
-            run_pnorm(
-                self,
-                p=-np.inf,
-                axis=0,
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=-np.inf,
-                axis=None,
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep)
-            run_pnorm(
-                self,
-                p=0,
-                axis=1,
-                shape_x=[3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-
-            run_pnorm(
-                self,
-                p=1,
-                axis=1,
-                shape_x=[3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=0,
-                axis=None,
-                shape_x=[3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=2,
-                axis=[0, 1],
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=2,
-                axis=-1,
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=1,
-                axis=[0, 1],
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=np.inf,
-                axis=[0, 1],
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
-            run_pnorm(
-                self,
-                p=-np.inf,
-                axis=[0, 1],
-                shape_x=[2, 3, 4],
-                dtype="float64",
-                keep_dim=keep,
-                check_dim=True)
+            run_fro(self,
+                    p='fro',
+                    axis=None,
+                    shape_x=[2, 3, 4],
+                    dtype="float32",
+                    keep_dim=keep)
+            run_fro(self,
+                    p='fro',
+                    axis=[0, 1],
+                    shape_x=[2, 3, 4],
+                    dtype="float64",
+                    keep_dim=keep,
+                    check_dim=True)
+            run_pnorm(self,
+                      p=2,
+                      axis=None,
+                      shape_x=[3, 4],
+                      dtype="float32",
+                      keep_dim=keep)
+            run_pnorm(self,
+                      p=2,
+                      axis=1,
+                      shape_x=[3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=np.inf,
+                      axis=0,
+                      shape_x=[2, 3, 4],
+                      dtype="float32",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=np.inf,
+                      axis=None,
+                      shape_x=[2, 3, 4],
+                      dtype="float32",
+                      keep_dim=keep)
+            run_pnorm(self,
+                      p=-np.inf,
+                      axis=0,
+                      shape_x=[2, 3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=-np.inf,
+                      axis=None,
+                      shape_x=[2, 3, 4],
+                      dtype="float64",
+                      keep_dim=keep)
+            run_pnorm(self,
+                      p=0,
+                      axis=1,
+                      shape_x=[3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+
+            run_pnorm(self,
+                      p=1,
+                      axis=1,
+                      shape_x=[3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=0,
+                      axis=None,
+                      shape_x=[3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=2,
+                      axis=[0, 1],
+                      shape_x=[2, 3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=2,
+                      axis=-1,
+                      shape_x=[2, 3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=1,
+                      axis=[0, 1],
+                      shape_x=[2, 3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=np.inf,
+                      axis=[0, 1],
+                      shape_x=[2, 3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
+            run_pnorm(self,
+                      p=-np.inf,
+                      axis=[0, 1],
+                      shape_x=[2, 3, 4],
+                      dtype="float64",
+                      keep_dim=keep,
+                      check_dim=True)
 
     def test_dygraph(self):
         run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
@@ -611,8 +621,11 @@ def err_dtype(p, shape_x, xdtype, out=None):
             self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1)
             self.assertRaises(ValueError, paddle.norm, 0, [1, 0], "float64")
             data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64")
-            self.assertRaises(
-                ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1])
+            self.assertRaises(ValueError,
+                              paddle.norm,
+                              data,
+                              p='unspport',
+                              axis=[-3, -2, -1])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
index 1452b869d4f8b..46fbc00eacf6f 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
@@ -27,6 +27,7 @@
 
 
 class TestInstanceNormDoubleGradCheck(unittest.TestCase):
+
     @prog_scope()
     def func(self, place):
         prog = fluid.Program()
@@ -39,8 +40,12 @@ def func(self, place):
             x = layers.create_parameter(dtype=dtype, shape=shape, name='x')
             z = fluid.layers.instance_norm(input=x)
             x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-            gradient_checker.double_grad_check(
-                [x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
+            gradient_checker.double_grad_check([x],
+                                               z,
+                                               x_init=x_arr,
+                                               atol=atol,
+                                               place=place,
+                                               eps=eps)
 
     def test_grad(self):
         paddle.enable_static()
@@ -53,6 +58,34 @@ def test_grad(self):
 
 class TestInstanceNormDoubleGradCheckWithoutParamBias(
         TestInstanceNormDoubleGradCheck):
+
+    @prog_scope()
+    def func(self, place):
+        prog = fluid.Program()
+        with fluid.program_guard(prog):
+            np.random.seed()
+            shape = [2, 3, 4, 5]
+            dtype = "float32"
+            eps = 0.005
+            atol = 1e-4
+            x = layers.create_parameter(dtype=dtype, shape=shape, name='x')
+            z = fluid.layers.instance_norm(input=x,
+                                           param_attr=False,
+                                           bias_attr=False)
+            x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+            gradient_checker.double_grad_check([x],
+                                               z,
+                                               x_init=x_arr,
+                                               atol=atol,
+                                               place=place,
+                                               eps=eps)
+
+
+class TestInstanceNormDoubleGradEagerCheck(unittest.TestCase):
+
+    def instance_norm_wrapper(self, x):
+        return paddle.nn.functional.instance_norm(x[0])
+
     @prog_scope()
     def func(self, place):
         prog = fluid.Program()
@@ -63,14 +96,69 @@ def func(self, place):
             eps = 0.005
             atol = 1e-4
             x = layers.create_parameter(dtype=dtype, shape=shape, name='x')
-            z = fluid.layers.instance_norm(
-                input=x, param_attr=False, bias_attr=False)
+            z = paddle.nn.functional.instance_norm(x)
             x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-            gradient_checker.double_grad_check(
-                [x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
+            # check for static mode
+            gradient_checker.double_grad_check([x],
+                                               z,
+                                               x_init=x_arr,
+                                               atol=atol,
+                                               place=place,
+                                               eps=eps)
+            # check for eager mode
+            gradient_checker.double_grad_check_for_dygraph(
+                self.instance_norm_wrapper, [x],
+                z,
+                x_init=x_arr,
+                atol=atol,
+                place=place)
+
+    def test_grad(self):
+        paddle.enable_static()
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestInstanceNormDoubleGradEagerCheckWithParams(
+        TestInstanceNormDoubleGradEagerCheck):
+
+    def instance_norm_wrapper(self, x):
+        instance_norm = paddle.nn.InstanceNorm2D(3)
+        return instance_norm(x[0])
+
+    @prog_scope()
+    def func(self, place):
+        prog = fluid.Program()
+        with fluid.program_guard(prog):
+            np.random.seed()
+            shape = [2, 3, 4, 5]
+            dtype = "float32"
+            eps = 0.005
+            atol = 1e-4
+            x = layers.create_parameter(dtype=dtype, shape=shape, name='x')
+            z = paddle.nn.InstanceNorm2D(3)(x)
+            x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+            # check for static mode
+            gradient_checker.double_grad_check([x],
+                                               z,
+                                               x_init=x_arr,
+                                               atol=atol,
+                                               place=place,
+                                               eps=eps)
+            # check for eager mode
+            gradient_checker.double_grad_check_for_dygraph(
+                self.instance_norm_wrapper, [x],
+                z,
+                x_init=x_arr,
+                atol=atol,
+                place=place)
 
 
 class TestBatchNormDoubleGradCheck(unittest.TestCase):
+
     def setUp(self):
         self.init_test()
 
@@ -96,13 +184,16 @@ def func(self, place):
             eps = 0.005
             atol = 1e-4
             x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x')
-            z = fluid.layers.batch_norm(
-                input=x,
-                data_layout=self.data_layout,
-                use_global_stats=self.use_global_stats)
+            z = fluid.layers.batch_norm(input=x,
+                                        data_layout=self.data_layout,
+                                        use_global_stats=self.use_global_stats)
             x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype)
-            gradient_checker.double_grad_check(
-                [x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
+            gradient_checker.double_grad_check([x],
+                                               z,
+                                               x_init=x_arr,
+                                               atol=atol,
+                                               place=place,
+                                               eps=eps)
             gradient_checker.double_grad_check_for_dygraph(
                 self.batch_norm_wrapper, [x],
                 z,
@@ -120,6 +211,7 @@ def test_grad(self):
 
 
 class TestBatchNormDoubleGradCheckCase1(TestBatchNormDoubleGradCheck):
+
     def init_test(self):
         self.data_layout = 'NHWC'
         self.use_global_stats = False
@@ -128,6 +220,7 @@ def init_test(self):
 
 
 class TestBatchNormDoubleGradCheckCase2(TestBatchNormDoubleGradCheck):
+
     def init_test(self):
         self.data_layout = 'NCHW'
         self.use_global_stats = True
@@ -136,6 +229,7 @@ def init_test(self):
 
 
 class TestBatchNormDoubleGradCheckCase3(TestBatchNormDoubleGradCheck):
+
     def init_test(self):
         self.data_layout = 'NHWC'
         self.use_global_stats = True
@@ -144,6 +238,7 @@ def init_test(self):
 
 
 class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
+
     def init_test(self):
         self.data_layout = 'NCHW'
         self.use_global_stats = False
@@ -159,6 +254,7 @@ def batch_norm_wrapper(self, x):
 
 
 class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck):
+
     @prog_scope()
     def func(self, place):
         prog = fluid.Program()
@@ -167,27 +263,25 @@ def func(self, place):
             dtype = "float32"
             eps = 0.005
             atol = 2e-4
-            chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[
-                -1]
+            chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[-1]
             x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x')
-            z = fluid.layers.batch_norm(
-                input=x,
-                data_layout=self.data_layout,
-                use_global_stats=self.use_global_stats)
+            z = fluid.layers.batch_norm(input=x,
+                                        data_layout=self.data_layout,
+                                        use_global_stats=self.use_global_stats)
             x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype)
             w, b = prog.global_block().all_parameters()[1:3]
             w_arr = np.ones(chn).astype(dtype)
             b_arr = np.zeros(chn).astype(dtype)
-            gradient_checker.double_grad_check(
-                [x, w, b],
-                z,
-                x_init=[x_arr, w_arr, b_arr],
-                atol=atol,
-                place=place,
-                eps=eps)
+            gradient_checker.double_grad_check([x, w, b],
+                                               z,
+                                               x_init=[x_arr, w_arr, b_arr],
+                                               atol=atol,
+                                               place=place,
+                                               eps=eps)
 
 
 class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
+
     def init_test(self):
         self.data_layout = 'NCHW'
         self.use_global_stats = True
diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py
index 49e1f2533491d..7cc598a73874f 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_op.py
@@ -30,6 +30,7 @@ def l2_norm(x, axis, epsilon):
 
 
 class TestNormOp(OpTest):
+
     def setUp(self):
         self.op_type = "norm"
         self.python_api = paddle.fluid.layers.l2_normalize
@@ -57,6 +58,7 @@ def init_dtype(self):
 
 
 class TestNormOp2(TestNormOp):
+
     def init_test_case(self):
         self.shape = [5, 3, 9, 7]
         self.axis = 0
@@ -64,6 +66,7 @@ def init_test_case(self):
 
 
 class TestNormOp3(TestNormOp):
+
     def init_test_case(self):
         self.shape = [5, 3, 2, 7]
         self.axis = -1
@@ -73,6 +76,7 @@ def init_test_case(self):
 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                     "however it is desirable to cover the forward pass")
 class TestNormOp4(TestNormOp):
+
     def init_test_case(self):
         self.shape = [128, 1024, 14, 14]
         self.axis = 2
@@ -85,6 +89,7 @@ def test_check_grad(self):
 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                     "however it is desirable to cover the forward pass")
 class TestNormOp5(TestNormOp):
+
     def init_test_case(self):
         self.shape = [2048, 2048]
         self.axis = 1
@@ -95,6 +100,7 @@ def test_check_grad(self):
 
 
 class TestNormOp6(TestNormOp):
+
     def init_dtype(self):
         self.dtype = "float32"
 
@@ -105,6 +111,7 @@ def test_check_grad(self):
 @unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestNormOp7(TestNormOp):
+
     def init_dtype(self):
         self.dtype = "float16"
 
@@ -112,12 +119,14 @@ def test_check_output(self):
         self.check_output_with_place(fluid.core.CUDAPlace(0), atol=5e-2)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            fluid.core.CUDAPlace(0), ['X'], 'Out', max_relative_error=0.05)
+        self.check_grad_with_place(fluid.core.CUDAPlace(0), ['X'],
+                                   'Out',
+                                   max_relative_error=0.05)
 
 
 @skip_check_grad_ci(reason="skip check grad for test mode.")
 class TestNormTestOp(OpTest):
+
     def setUp(self):
         self.op_type = "norm"
         self.init_test_case()
@@ -144,6 +153,7 @@ def init_test_case(self):
 
 
 class API_NormTest(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program()):
 
diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py
index 79632817662c5..b4cfed68149b0 100644
--- a/python/paddle/fluid/tests/unittests/test_normal.py
+++ b/python/paddle/fluid/tests/unittests/test_normal.py
@@ -22,6 +22,7 @@
 
 
 class TestNormalAPI(unittest.TestCase):
+
     def setUp(self):
         self.mean = 1.0
         self.std = 0.0
@@ -142,28 +143,33 @@ def test_api(self):
 
 
 class TestNormalAPI_mean_is_tensor(TestNormalAPI):
+
     def set_attrs(self):
         self.mean = np.random.uniform(-2, -1, [2, 3, 4, 5]).astype('float64')
 
 
 class TestNormalAPI_std_is_tensor(TestNormalAPI):
+
     def set_attrs(self):
         self.std = np.random.uniform(0.7, 1, [2, 3, 17]).astype('float64')
 
 
 class TestNormalAPI_mean_std_are_tensor(TestNormalAPI):
+
     def set_attrs(self):
         self.mean = np.random.uniform(1, 2, [1, 100]).astype('float64')
         self.std = np.random.uniform(0.5, 1, [1, 100]).astype('float64')
 
 
 class TestNormalAPI_mean_std_are_tensor_with_different_dtype(TestNormalAPI):
+
     def set_attrs(self):
         self.mean = np.random.uniform(1, 2, [100]).astype('float64')
         self.std = np.random.uniform(1, 2, [100]).astype('float32')
 
 
 class TestNormalAlias(unittest.TestCase):
+
     def test_alias(self):
         paddle.disable_static()
         shape = [1, 2, 3]
@@ -174,6 +180,7 @@ def test_alias(self):
 
 
 class TestNormalErrors(unittest.TestCase):
+
     def test_errors(self):
         with paddle.static.program_guard(paddle.static.Program()):
             mean = [1, 2, 3]
diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
index 24fdcf8c88417..c717831b247e9 100644
--- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
+++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
@@ -32,11 +32,10 @@ def gen_random_input(self):
     def set_program(self, axis, epsilon):
         """Build the test program.
         """
-        data = fluid.layers.data(
-            name=self.data_desc["name"],
-            shape=self.data_desc["shape"],
-            dtype="float32",
-            append_batch_size=False)
+        data = fluid.layers.data(name=self.data_desc["name"],
+                                 shape=self.data_desc["shape"],
+                                 dtype="float32",
+                                 append_batch_size=False)
         data.stop_gradient = False
         l2_norm = fluid.layers.l2_normalize(x=data, axis=axis, epsilon=epsilon)
         out = fluid.layers.reduce_sum(l2_norm, dim=None)
diff --git a/python/paddle/fluid/tests/unittests/test_normalize.py b/python/paddle/fluid/tests/unittests/test_normalize.py
index 2f52ae391c7de..ebcaf26955e40 100644
--- a/python/paddle/fluid/tests/unittests/test_normalize.py
+++ b/python/paddle/fluid/tests/unittests/test_normalize.py
@@ -31,6 +31,7 @@ def p_normalize(x, axis=1, p=2, epsilon=1e-12, keepdims=True):
 
 
 class TestNNFunctionalNormalize(unittest.TestCase):
+
     def setUp(self):
         self.input_np = np.random.random(size=(10, 10)).astype(np.float32)
         self.input_np2 = np.array([0.0, 0.0]).astype(np.float32)
@@ -68,10 +69,11 @@ def run_static(self, use_gpu=False):
         place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        static_result = exe.run(
-            feed={"input": self.input_np,
-                  "input2": self.input_np2},
-            fetch_list=[result0, result1, result2, result4])
+        static_result = exe.run(feed={
+            "input": self.input_np,
+            "input2": self.input_np2
+        },
+                                fetch_list=[result0, result1, result2, result4])
 
         self.assertTrue(np.allclose(static_result[0], self.expected0))
         self.assertTrue(np.allclose(static_result[1], self.expected1))
diff --git a/python/paddle/fluid/tests/unittests/test_npair_loss_op.py b/python/paddle/fluid/tests/unittests/test_npair_loss_op.py
index c6d7e0300d0bc..470dfff788bc9 100644
--- a/python/paddle/fluid/tests/unittests/test_npair_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_npair_loss_op.py
@@ -22,12 +22,13 @@
 
 
 def npairloss(anchor, positive, labels, l2_reg=0.002):
+
     def softmax_cross_entropy_with_logits(logits, labels):
         logits = np.exp(logits)
         logits = logits / np.sum(logits, axis=1).reshape(-1, 1)
 
-        return np.mean(
-            -np.sum(labels * np.log(logits), axis=1), dtype=np.float32)
+        return np.mean(-np.sum(labels * np.log(logits), axis=1),
+                       dtype=np.float32)
 
     batch_size = labels.shape[0]
 
@@ -47,6 +48,7 @@ def softmax_cross_entropy_with_logits(logits, labels):
 
 
 class TestNpairLossOp(unittest.TestCase):
+
     def setUp(self):
         self.dtype = np.float32
 
@@ -65,32 +67,30 @@ def test_npair_loss(self):
                                            feat_dim).astype(np.float32)
         embeddings_positive = np.random.rand(num_data,
                                              feat_dim).astype(np.float32)
-        row_labels = np.random.randint(
-            0, num_classes, size=(num_data)).astype(np.float32)
-        out_loss = npairloss(
-            embeddings_anchor,
-            embeddings_positive,
-            row_labels,
-            l2_reg=reg_lambda)
-
-        anc = fluid.layers.data(
-            dtype='float32',
-            name='anc',
-            shape=embeddings_anchor.shape,
-            append_batch_size=False)
-        pos = fluid.layers.data(
-            dtype='float32',
-            name='pos',
-            shape=embeddings_positive.shape,
-            append_batch_size=False)
-        lab = fluid.layers.data(
-            dtype='float32',
-            name='lab',
-            shape=row_labels.shape,
-            append_batch_size=False)
-
-        npair_loss_op = fluid.layers.npair_loss(
-            anchor=anc, positive=pos, labels=lab, l2_reg=reg_lambda)
+        row_labels = np.random.randint(0, num_classes,
+                                       size=(num_data)).astype(np.float32)
+        out_loss = npairloss(embeddings_anchor,
+                             embeddings_positive,
+                             row_labels,
+                             l2_reg=reg_lambda)
+
+        anc = fluid.layers.data(dtype='float32',
+                                name='anc',
+                                shape=embeddings_anchor.shape,
+                                append_batch_size=False)
+        pos = fluid.layers.data(dtype='float32',
+                                name='pos',
+                                shape=embeddings_positive.shape,
+                                append_batch_size=False)
+        lab = fluid.layers.data(dtype='float32',
+                                name='lab',
+                                shape=row_labels.shape,
+                                append_batch_size=False)
+
+        npair_loss_op = fluid.layers.npair_loss(anchor=anc,
+                                                positive=pos,
+                                                labels=lab,
+                                                l2_reg=reg_lambda)
         out_tensor = exe.run(feed={
             'anc': embeddings_anchor,
             'pos': embeddings_positive,
@@ -98,47 +98,46 @@ def test_npair_loss(self):
         },
                              fetch_list=[npair_loss_op.name])
 
-        self.__assert_close(
-            out_tensor,
-            out_loss,
-            "inference output are different at " + str(place) + ", " +
-            str(np.dtype('float32')) + str(np.array(out_tensor)) +
-            str(out_loss),
-            atol=1e-3)
+        self.__assert_close(out_tensor,
+                            out_loss,
+                            "inference output are different at " + str(place) +
+                            ", " + str(np.dtype('float32')) +
+                            str(np.array(out_tensor)) + str(out_loss),
+                            atol=1e-3)
 
 
 class TestNpairLossOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             anchor_np = np.random.random((2, 4)).astype("float32")
             positive_np = np.random.random((2, 4)).astype("float32")
             labels_np = np.random.random((2)).astype("float32")
-            anchor_data = fluid.data(
-                name='anchor', shape=[2, 4], dtype='float32')
-            positive_data = fluid.data(
-                name='positive', shape=[2, 4], dtype='float32')
+            anchor_data = fluid.data(name='anchor',
+                                     shape=[2, 4],
+                                     dtype='float32')
+            positive_data = fluid.data(name='positive',
+                                       shape=[2, 4],
+                                       dtype='float32')
             labels_data = fluid.data(name='labels', shape=[2], dtype='float32')
 
             def test_anchor_Variable():
                 # the anchor type must be Variable
-                fluid.layers.npair_loss(
-                    anchor=anchor_np,
-                    positive=positive_data,
-                    labels=labels_data)
+                fluid.layers.npair_loss(anchor=anchor_np,
+                                        positive=positive_data,
+                                        labels=labels_data)
 
             def test_positive_Variable():
                 # the positive type must be Variable
-                fluid.layers.npair_loss(
-                    anchor=anchor_data,
-                    positive=positive_np,
-                    labels=labels_data)
+                fluid.layers.npair_loss(anchor=anchor_data,
+                                        positive=positive_np,
+                                        labels=labels_data)
 
             def test_labels_Variable():
                 # the labels type must be Variable
-                fluid.layers.npair_loss(
-                    anchor=anchor_data,
-                    positive=positive_data,
-                    labels=labels_np)
+                fluid.layers.npair_loss(anchor=anchor_data,
+                                        positive=positive_data,
+                                        labels=labels_np)
 
             self.assertRaises(TypeError, test_anchor_Variable)
             self.assertRaises(TypeError, test_positive_Variable)
@@ -146,30 +145,30 @@ def test_labels_Variable():
 
             def test_anchor_type():
                 # dtype must be float32 or float64
-                anchor_data1 = fluid.data(
-                    name='anchor1', shape=[2, 4], dtype='int32')
-                fluid.layers.npair_loss(
-                    anchor=anchor_data,
-                    positive=positive_data,
-                    labels=labels_np)
+                anchor_data1 = fluid.data(name='anchor1',
+                                          shape=[2, 4],
+                                          dtype='int32')
+                fluid.layers.npair_loss(anchor=anchor_data,
+                                        positive=positive_data,
+                                        labels=labels_np)
 
             def test_positive_type():
                 # dtype must be float32 or float64
-                positive_data1 = fluid.data(
-                    name='positive1', shape=[2, 4], dtype='int32')
-                fluid.layers.npair_loss(
-                    anchor=anchor_data,
-                    positive=positive_data1,
-                    labels=labels_np)
+                positive_data1 = fluid.data(name='positive1',
+                                            shape=[2, 4],
+                                            dtype='int32')
+                fluid.layers.npair_loss(anchor=anchor_data,
+                                        positive=positive_data1,
+                                        labels=labels_np)
 
             def test_labels_type():
                 # dtype must be float32 or float64
-                labels_data1 = fluid.data(
-                    name='labels1', shape=[2], dtype='int32')
-                fluid.layers.npair_loss(
-                    anchor=anchor_data,
-                    positive=positive_data,
-                    labels=labels_data1)
+                labels_data1 = fluid.data(name='labels1',
+                                          shape=[2],
+                                          dtype='int32')
+                fluid.layers.npair_loss(anchor=anchor_data,
+                                        positive=positive_data,
+                                        labels=labels_data1)
 
             self.assertRaises(TypeError, test_anchor_type)
             self.assertRaises(TypeError, test_positive_type)
diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py b/python/paddle/fluid/tests/unittests/test_number_count_op.py
index bb09b8c6512f7..50ddde7a4dda9 100644
--- a/python/paddle/fluid/tests/unittests/test_number_count_op.py
+++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py
@@ -38,6 +38,7 @@ def count(x, upper_num):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestNumberCountOpInt64(op_test.OpTest):
+
     def setUp(self):
         upper_num = 16
         self.op_type = "number_count"
@@ -53,10 +54,11 @@ def test_forward(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestNumberCountAPI(unittest.TestCase):
+
     def setUp(self):
         self.upper_num = 320
-        self.x = np.random.randint(
-            -1, self.upper_num, size=(6000, 200)).astype('int64')
+        self.x = np.random.randint(-1, self.upper_num,
+                                   size=(6000, 200)).astype('int64')
         self.out = count(self.x, self.upper_num)
         self.place = paddle.CUDAPlace(0)
 
diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py
index d3b9509795783..9ab24dea565f1 100644
--- a/python/paddle/fluid/tests/unittests/test_numel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_numel_op.py
@@ -24,11 +24,14 @@
 
 
 class TestNumelOp(OpTest):
+
     def setUp(self):
         self.op_type = "size"
         self.init()
         x = np.random.random((self.shape)).astype("float64")
-        self.inputs = {'Input': x, }
+        self.inputs = {
+            'Input': x,
+        }
         self.outputs = {'Out': np.array([np.size(x)])}
 
     def test_check_output(self):
@@ -39,16 +42,19 @@ def init(self):
 
 
 class TestNumelOp1(TestNumelOp):
+
     def init(self):
         self.shape = (11, 66)
 
 
 class TestNumelOp2(TestNumelOp):
+
     def init(self):
         self.shape = (0, )
 
 
 class TestNumelAPI(unittest.TestCase):
+
     def test_numel_static(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
@@ -67,10 +73,12 @@ def test_numel_static(self):
                 "x_2": input_2,
             },
                                    fetch_list=[out_1, out_2])
-            assert (np.array_equal(
-                res_1, np.array([np.size(input_1)]).astype("int64")))
-            assert (np.array_equal(
-                res_2, np.array([np.size(input_2)]).astype("int64")))
+            assert (np.array_equal(res_1,
+                                   np.array([np.size(input_1)
+                                             ]).astype("int64")))
+            assert (np.array_equal(res_2,
+                                   np.array([np.size(input_2)
+                                             ]).astype("int64")))
 
     def test_numel_imperative(self):
         paddle.disable_static(paddle.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
index e1da94c1219ca..5caede6295b13 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
@@ -26,6 +26,7 @@
 
 
 class TestOneHotOp(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot'
         depth = 10
@@ -50,6 +51,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_attr(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot'
         depth = 10
@@ -73,6 +75,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot'
         depth = 10
@@ -97,6 +100,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot'
         depth = 10
@@ -120,6 +124,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_out_of_range(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot'
         depth = 10
@@ -139,6 +144,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_exception(unittest.TestCase):
+
     def setUp(self):
         self.op_type = 'one_hot'
         self.depth = 10
@@ -154,18 +160,18 @@ def setUp(self):
     def test_check_output(self):
         program = Program()
         with program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            x = fluid.layers.data(name='x',
+                                  shape=[self.dimension],
+                                  dtype='float32',
+                                  lod_level=1)
             block = program.current_block()
-            one_hot_out = block.create_var(
-                name="one_hot_out",
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                dtype='float32')
-            block.append_op(
-                type='one_hot',
-                inputs={'X': x},
-                attrs={'depth': self.depth},
-                outputs={'Out': one_hot_out})
+            one_hot_out = block.create_var(name="one_hot_out",
+                                           type=core.VarDesc.VarType.LOD_TENSOR,
+                                           dtype='float32')
+            block.append_op(type='one_hot',
+                            inputs={'X': x},
+                            attrs={'depth': self.depth},
+                            outputs={'Out': one_hot_out})
             exe = fluid.Executor(self.place)
 
             def run():
@@ -177,24 +183,23 @@ def run():
 
 
 class TestOneHotOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input must be Variable
             in_w = np.random.random((4, 1)).astype("int32")
             self.assertRaises(TypeError, fluid.layers.one_hot, in_w)
             # the input must be int32 or int 64
-            in_w2 = fluid.layers.data(
-                name="in_w2",
-                shape=[4, 1],
-                append_batch_size=False,
-                dtype="float32")
+            in_w2 = fluid.layers.data(name="in_w2",
+                                      shape=[4, 1],
+                                      append_batch_size=False,
+                                      dtype="float32")
             self.assertRaises(TypeError, fluid.layers.one_hot, in_w2)
             # the depth must be int, long or Variable
-            in_r = fluid.layers.data(
-                name="in_r",
-                shape=[4, 1],
-                append_batch_size=False,
-                dtype="int32")
+            in_r = fluid.layers.data(name="in_r",
+                                     shape=[4, 1],
+                                     append_batch_size=False,
+                                     dtype="int32")
             depth_w = np.array([4])
             self.assertRaises(TypeError, fluid.layers.one_hot, in_r, 4.1)
             self.assertRaises(TypeError, fluid.layers.one_hot, in_r, depth_w)
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
index b16c4b5ce69e1..14ea523fb296c 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
@@ -26,6 +26,7 @@
 
 
 class TestOneHotOp(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -49,6 +50,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_attr(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -72,6 +74,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -95,6 +98,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -118,6 +122,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_exception(unittest.TestCase):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         self.depth = 10
@@ -133,18 +138,18 @@ def setUp(self):
     def test_check_output(self):
         program = Program()
         with program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            x = fluid.layers.data(name='x',
+                                  shape=[self.dimension],
+                                  dtype='float32',
+                                  lod_level=1)
             block = program.current_block()
-            one_hot_out = block.create_var(
-                name="one_hot_out",
-                type=core.VarDesc.VarType.LOD_TENSOR,
-                dtype='float32')
-            block.append_op(
-                type='one_hot',
-                inputs={'X': x},
-                attrs={'depth': self.depth},
-                outputs={'Out': one_hot_out})
+            one_hot_out = block.create_var(name="one_hot_out",
+                                           type=core.VarDesc.VarType.LOD_TENSOR,
+                                           dtype='float32')
+            block.append_op(type='one_hot',
+                            inputs={'X': x},
+                            attrs={'depth': self.depth},
+                            outputs={'Out': one_hot_out})
             exe = fluid.Executor(self.place)
 
             def run():
@@ -156,6 +161,7 @@ def run():
 
 
 class TestOneHotOpApi(unittest.TestCase):
+
     def test_api(self):
         depth = 10
         self._run(depth)
@@ -188,21 +194,23 @@ def _run(self, depth):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'label': label_data, },
+        ret = exe.run(feed={
+            'label': label_data,
+        },
                       fetch_list=[one_hot_label],
                       return_numpy=False)
 
 
 class BadInputTestOnehotV2(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
             def test_bad_x():
-                label = fluid.layers.data(
-                    name="label",
-                    shape=[4],
-                    append_batch_size=False,
-                    dtype="float32")
+                label = fluid.layers.data(name="label",
+                                          shape=[4],
+                                          append_batch_size=False,
+                                          dtype="float32")
                 one_hot_label = fluid.one_hot(input=label, depth=4)
 
             self.assertRaises(TypeError, test_bad_x)
diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py
index db7fc9d2b2e99..0c6e2476be324 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_like.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_like.py
@@ -23,6 +23,7 @@
 
 
 class TestOnesLikeAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x = paddle.fluid.data('x', [3, 4])
@@ -30,6 +31,7 @@ def test_errors(self):
 
 
 class TestOnesLikeAPI(unittest.TestCase):
+
     def test_api(self):
         shape = [3, 4]
         startup_program = Program()
@@ -44,8 +46,8 @@ def test_api(self):
             out4 = ones_like(x, 'int32')
             out5 = ones_like(x, 'int64')
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         outs = exe.run(train_program,
                        feed={'X': np.ones(shape).astype('float32')},
@@ -58,10 +60,11 @@ def test_api(self):
 
 
 class TestOnesLikeImpeartive(unittest.TestCase):
+
     def test_out(self):
         shape = [3, 4]
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         paddle.disable_static(place)
         x = paddle.to_tensor(np.ones(shape))
         for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
diff --git a/python/paddle/fluid/tests/unittests/test_ones_op.py b/python/paddle/fluid/tests/unittests/test_ones_op.py
index 47ce379643242..93cda220496df 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_op.py
@@ -26,6 +26,7 @@
 
 
 class ApiOnesTest(unittest.TestCase):
+
     def test_paddle_ones(self):
         with paddle.static.program_guard(paddle.static.Program()):
             ones = paddle.ones(shape=[10])
@@ -62,7 +63,9 @@ def test_fluid_ones(self):
 
 
 class ApiOnesZerosError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_error1():
             with paddle.static.program_guard(paddle.static.Program()):
                 ones = paddle.ones(shape=10, dtype="int64")
diff --git a/python/paddle/fluid/tests/unittests/test_onnx_export.py b/python/paddle/fluid/tests/unittests/test_onnx_export.py
index 07016d4290102..06375125232b7 100644
--- a/python/paddle/fluid/tests/unittests/test_onnx_export.py
+++ b/python/paddle/fluid/tests/unittests/test_onnx_export.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class LinearNet(paddle.nn.Layer):
+
     def __init__(self):
         super(LinearNet, self).__init__()
         self._linear = paddle.nn.Linear(128, 10)
@@ -34,6 +35,7 @@ def forward(self, x):
 
 
 class Logic(paddle.nn.Layer):
+
     def __init__(self):
         super(Logic, self).__init__()
 
@@ -45,9 +47,10 @@ def forward(self, x, y, z):
 
 
 class TestExportWithTensor(unittest.TestCase):
+
     def func_with_tensor(self):
-        self.x_spec = paddle.static.InputSpec(
-            shape=[None, 128], dtype='float32')
+        self.x_spec = paddle.static.InputSpec(shape=[None, 128],
+                                              dtype='float32')
         model = LinearNet()
         paddle.onnx.export(model, 'linear_net', input_spec=[self.x_spec])
 
@@ -58,6 +61,7 @@ def test_with_tensor(self):
 
 
 class TestExportWithTensor1(unittest.TestCase):
+
     def func_with_tensor(self):
         self.x = paddle.to_tensor(np.random.random((1, 128)))
         model = LinearNet()
@@ -70,14 +74,17 @@ def test_with_tensor(self):
 
 
 class TestExportPrunedGraph(unittest.TestCase):
+
     def func_prune_graph(self):
         model = Logic()
         self.x = paddle.to_tensor(np.array([1]))
         self.y = paddle.to_tensor(np.array([-1]))
         paddle.jit.to_static(model)
         out = model(self.x, self.y, z=True)
-        paddle.onnx.export(
-            model, 'pruned', input_spec=[self.x], output_spec=[out])
+        paddle.onnx.export(model,
+                           'pruned',
+                           input_spec=[self.x],
+                           output_spec=[out])
 
     def test_prune_graph(self):
         # test eager
diff --git a/python/paddle/fluid/tests/unittests/test_op_function_generator.py b/python/paddle/fluid/tests/unittests/test_op_function_generator.py
index c712b5db0f31f..e97895cf8bbe1 100644
--- a/python/paddle/fluid/tests/unittests/test_op_function_generator.py
+++ b/python/paddle/fluid/tests/unittests/test_op_function_generator.py
@@ -25,6 +25,7 @@
 
 
 class TestTracedLayer(fluid.dygraph.Layer):
+
     def __init__(self, name_scope):
         super(TestTracedLayer, self).__init__(name_scope)
 
@@ -33,6 +34,7 @@ def forward(self, input):
 
 
 class TestVariable(unittest.TestCase):
+
     def setUp(self):
         self.shape = [512, 768]
         self.dtype = np.float32
@@ -74,6 +76,7 @@ def test_relu(self):
             self.assertTrue(np.array_equal(res1.numpy(), res2.numpy()))
 
     def test_trace_backward(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with fluid.dygraph.guard():
             a = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
             b = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
@@ -90,6 +93,7 @@ def test_trace_backward(self):
 
             self.assertTrue(np.array_equal(x_grad, loss.gradient() * b))
             self.assertTrue(np.array_equal(y_grad, loss.gradient() * a))
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_traced_layer(self):
         if in_dygraph_mode():
diff --git a/python/paddle/fluid/tests/unittests/test_op_name_conflict.py b/python/paddle/fluid/tests/unittests/test_op_name_conflict.py
index ee8f0c2cd2939..86f82f0a9e22e 100644
--- a/python/paddle/fluid/tests/unittests/test_op_name_conflict.py
+++ b/python/paddle/fluid/tests/unittests/test_op_name_conflict.py
@@ -18,6 +18,7 @@
 
 
 class TestOpNameConflict(unittest.TestCase):
+
     def test_conflict(self):
         main = fluid.Program()
         startup = fluid.Program()
@@ -53,12 +54,16 @@ def test_layers(self):
                 ) else fluid.CPUPlace()
                 exe = fluid.Executor(place)
 
-                data = fluid.data(
-                    name='data', shape=[None, 1, 2, 2], dtype='float32')
-                tensor = fluid.data(
-                    name='tensor', shape=[None, 32, 64], dtype='float32')
-                x = fluid.data(
-                    name='x', shape=[None, 1], dtype='float32', lod_level=1)
+                data = fluid.data(name='data',
+                                  shape=[None, 1, 2, 2],
+                                  dtype='float32')
+                tensor = fluid.data(name='tensor',
+                                    shape=[None, 32, 64],
+                                    dtype='float32')
+                x = fluid.data(name='x',
+                               shape=[None, 1],
+                               dtype='float32',
+                               lod_level=1)
 
                 input_scale = fluid.layers.create_parameter(
                     shape=[1],
@@ -68,10 +73,12 @@ def test_layers(self):
                     shape=[1],
                     dtype="float32",
                     default_initializer=fluid.initializer.Constant(0.5))
-                out_affine = fluid.layers.affine_channel(
-                    data, scale=input_scale, bias=input_bias)
-                out_similarity = fluid.layers.similarity_focus(
-                    input=data, axis=1, indexes=[0])
+                out_affine = fluid.layers.affine_channel(data,
+                                                         scale=input_scale,
+                                                         bias=input_bias)
+                out_similarity = fluid.layers.similarity_focus(input=data,
+                                                               axis=1,
+                                                               indexes=[0])
                 position_tensor = fluid.layers.add_position_encoding(
                     input=tensor, alpha=1.0, beta=1.0)
                 x_reversed = fluid.layers.sequence_reverse(x)
@@ -82,17 +89,20 @@ def test_layers(self):
                 x_d = fluid.create_lod_tensor(
                     np.array([[1.1], [2.2], [3.3], [4.4]]).astype('float32'),
                     [[1, 3]], place)
-                outs = exe.run(
-                    test_program,
-                    fetch_list=[
-                        out_affine, out_similarity, position_tensor, x_reversed
-                    ],
-                    feed={
-                        data.name: np.ones([1, 1, 2, 2]).astype('float32'),
-                        tensor.name: np.ones([1, 32, 64]).astype('float32'),
-                        x.name: x_d
-                    },
-                    return_numpy=False)
+                outs = exe.run(test_program,
+                               fetch_list=[
+                                   out_affine, out_similarity, position_tensor,
+                                   x_reversed
+                               ],
+                               feed={
+                                   data.name:
+                                   np.ones([1, 1, 2, 2]).astype('float32'),
+                                   tensor.name:
+                                   np.ones([1, 32, 64]).astype('float32'),
+                                   x.name:
+                                   x_d
+                               },
+                               return_numpy=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
index e203fccd03f86..dba0577a101c7 100644
--- a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
@@ -19,6 +19,7 @@
 
 
 class TestOpSupportGPU(unittest.TestCase):
+
     def test_case(self):
         self.assertEqual(core.is_compiled_with_cuda(),
                          core.op_support_gpu("sum"))
diff --git a/python/paddle/fluid/tests/unittests/test_op_version.py b/python/paddle/fluid/tests/unittests/test_op_version.py
index 1d7167955ac7c..87f12a0bc6e0e 100644
--- a/python/paddle/fluid/tests/unittests/test_op_version.py
+++ b/python/paddle/fluid/tests/unittests/test_op_version.py
@@ -21,6 +21,7 @@
 
 
 class OpLastCheckpointCheckerTest(unittest.TestCase):
+
     def __init__(self, methodName='runTest'):
         super(OpLastCheckpointCheckerTest, self).__init__(methodName)
         self.checker = utils.OpLastCheckpointChecker()
@@ -51,6 +52,7 @@ def test_op_bug_fix_info(self):
 
 
 class OpVersionTest(unittest.TestCase):
+
     def __init__(self, methodName='runTest'):
         super(OpVersionTest, self).__init__(methodName)
         self.vmap = fluid.core.get_op_version_map()
diff --git a/python/paddle/fluid/tests/unittests/test_operator.py b/python/paddle/fluid/tests/unittests/test_operator.py
index 544fca8cecd0a..a3ca52f78c97c 100644
--- a/python/paddle/fluid/tests/unittests/test_operator.py
+++ b/python/paddle/fluid/tests/unittests/test_operator.py
@@ -21,6 +21,7 @@
 
 
 class TestGetAllProtos(unittest.TestCase):
+
     def test_all(self):
         all_protos = op.get_all_op_protos()
         self.assertNotEqual(0, len(all_protos))
@@ -30,6 +31,7 @@ def test_all(self):
 
 
 class TestOpDescCreationMethod(unittest.TestCase):
+
     def test_plain_input_output(self):
         op_proto = framework_pb2.OpProto()
         op_proto.type = "test"
@@ -110,8 +112,10 @@ def test_multiple_input_plain_output(self):
         expected1.type = 'fc'
         self.assertEqual(expected1, generated1)
 
-        generated2 = method(
-            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        generated2 = method(X=['x1', 'x2', 'x3'],
+                            b='b',
+                            W=['w1', 'w2', 'w3'],
+                            Y='y')
         expected2 = framework_pb2.OpDesc()
 
         tmp = expected2.inputs.add()
@@ -158,14 +162,13 @@ def __add_attr__(name, type):
 
         method = op.OpDescCreationMethod(op_proto)
 
-        generated = method(
-            X="a",
-            int_attr=10,
-            float_attr=3.2,
-            string_attr="test_str",
-            ints_attr=[0, 1, 2, 3, 4],
-            floats_attr=[0.2, 3.2, 4.5],
-            strings_attr=["a", "b", "c"])
+        generated = method(X="a",
+                           int_attr=10,
+                           float_attr=3.2,
+                           string_attr="test_str",
+                           ints_attr=[0, 1, 2, 3, 4],
+                           floats_attr=[0.2, 3.2, 4.5],
+                           strings_attr=["a", "b", "c"])
 
         expected = framework_pb2.OpDesc()
         expected.type = "test"
@@ -208,6 +211,7 @@ def __add_attr__(name, type):
 
 
 class TestOpCreations(unittest.TestCase):
+
     def test_all(self):
         add_op = op.Operator("sum", X=["a", "b"], Out="z")
         self.assertIsNotNone(add_op)
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 17eeedc524467..3c0871cfc8265 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -25,6 +25,7 @@
 
 
 class TestOperator(unittest.TestCase):
+
     def test_error_type(self):
         block = main_program._create_block()
         try:
@@ -45,18 +46,25 @@ def test_error_type(self):
     def test_op_desc_creation(self):
         program = Program()
         block = program.current_block()
-        mul_x = block.create_var(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
-            type="mul",
-            inputs={"X": [mul_x],
-                    "Y": mul_y},
-            outputs={"Out": [mul_out]},
-            attrs={"x_num_col_dims": 1})
+        mul_x = block.create_var(dtype="float32",
+                                 shape=[5, 10],
+                                 lod_level=0,
+                                 name="mul.x")
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        mul_op = block.append_op(type="mul",
+                                 inputs={
+                                     "X": [mul_x],
+                                     "Y": mul_y
+                                 },
+                                 outputs={"Out": [mul_out]},
+                                 attrs={"x_num_col_dims": 1})
 
         self.assertNotEqual(str(mul_op), "")
         self.assertEqual(mul_op.type, "mul")
@@ -87,18 +95,25 @@ def test_op_desc_creation(self):
     def test_mult_input(self):
         program = Program()
         block = program.current_block()
-        sum_x1 = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.x1")
-        sum_x2 = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.x2")
-        sum_x3 = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.x3")
-        sum_out = block.create_var(
-            dtype="int", shape=[3, 4], lod_level=0, name="sum.out")
-        sum_op = block.append_op(
-            type="sum",
-            inputs={"X": [sum_x1, sum_x2, sum_x3]},
-            outputs={"Out": sum_out})
+        sum_x1 = block.create_var(dtype="int",
+                                  shape=[3, 4],
+                                  lod_level=0,
+                                  name="sum.x1")
+        sum_x2 = block.create_var(dtype="int",
+                                  shape=[3, 4],
+                                  lod_level=0,
+                                  name="sum.x2")
+        sum_x3 = block.create_var(dtype="int",
+                                  shape=[3, 4],
+                                  lod_level=0,
+                                  name="sum.x3")
+        sum_out = block.create_var(dtype="int",
+                                   shape=[3, 4],
+                                   lod_level=0,
+                                   name="sum.out")
+        sum_op = block.append_op(type="sum",
+                                 inputs={"X": [sum_x1, sum_x2, sum_x3]},
+                                 outputs={"Out": sum_out})
         self.assertEqual(sum_op.type, "sum")
         self.assertEqual(sum_op.input_names, ["X"])
         self.assertEqual(sum_op.input("X"), ["sum.x1", "sum.x2", "sum.x3"])
diff --git a/python/paddle/fluid/tests/unittests/test_ops_nms.py b/python/paddle/fluid/tests/unittests/test_ops_nms.py
index c0bbe82d3581a..54ea804cdbd9b 100644
--- a/python/paddle/fluid/tests/unittests/test_ops_nms.py
+++ b/python/paddle/fluid/tests/unittests/test_ops_nms.py
@@ -70,6 +70,7 @@ def gen_args(num_boxes, dtype):
 
 
 class TestOpsNMS(unittest.TestCase):
+
     def setUp(self):
         self.num_boxes = 64
         self.threshold = 0.5
@@ -85,11 +86,11 @@ def test_nms(self):
                 boxes, scores, category_idxs, categories = gen_args(
                     self.num_boxes, dtype)
                 paddle.set_device(device)
-                out = paddle.vision.ops.nms(
-                    paddle.to_tensor(boxes), self.threshold,
-                    paddle.to_tensor(scores))
-                out = paddle.vision.ops.nms(
-                    paddle.to_tensor(boxes), self.threshold)
+                out = paddle.vision.ops.nms(paddle.to_tensor(boxes),
+                                            self.threshold,
+                                            paddle.to_tensor(scores))
+                out = paddle.vision.ops.nms(paddle.to_tensor(boxes),
+                                            self.threshold)
                 out_py = nms(boxes, self.threshold)
 
                 self.assertTrue(
@@ -102,10 +103,11 @@ def test_multiclass_nms_dynamic(self):
                 boxes, scores, category_idxs, categories = gen_args(
                     self.num_boxes, dtype)
                 paddle.set_device(device)
-                out = paddle.vision.ops.nms(
-                    paddle.to_tensor(boxes), self.threshold,
-                    paddle.to_tensor(scores),
-                    paddle.to_tensor(category_idxs), categories, self.topk)
+                out = paddle.vision.ops.nms(paddle.to_tensor(boxes),
+                                            self.threshold,
+                                            paddle.to_tensor(scores),
+                                            paddle.to_tensor(category_idxs),
+                                            categories, self.topk)
                 out_py = multiclass_nms(boxes, scores, category_idxs,
                                         self.threshold, self.topk)
 
@@ -119,10 +121,12 @@ def test_multiclass_nms_static(self):
                 paddle.enable_static()
                 boxes, scores, category_idxs, categories = gen_args(
                     self.num_boxes, dtype)
-                boxes_static = paddle.static.data(
-                    shape=boxes.shape, dtype=boxes.dtype, name="boxes")
-                scores_static = paddle.static.data(
-                    shape=scores.shape, dtype=scores.dtype, name="scores")
+                boxes_static = paddle.static.data(shape=boxes.shape,
+                                                  dtype=boxes.dtype,
+                                                  name="boxes")
+                scores_static = paddle.static.data(shape=scores.shape,
+                                                   dtype=scores.dtype,
+                                                   name="scores")
                 category_idxs_static = paddle.static.data(
                     shape=category_idxs.shape,
                     dtype=category_idxs.dtype,
@@ -175,15 +179,17 @@ def fun(x):
                     fun,
                     path,
                     input_spec=[
-                        paddle.static.InputSpec(
-                            shape=[None, 4], dtype='float32', name='x')
-                    ], )
+                        paddle.static.InputSpec(shape=[None, 4],
+                                                dtype='float32',
+                                                name='x')
+                    ],
+                )
                 load_func = paddle.jit.load(path)
                 res = load_func(paddle.to_tensor(boxes))
                 self.assertTrue(
                     np.array_equal(origin, res),
-                    "origin out: {}\n inference model out: {}\n".format(origin,
-                                                                        res))
+                    "origin out: {}\n inference model out: {}\n".format(
+                        origin, res))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index a0c5ce77f1d25..b70b69ca97c3d 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -28,35 +28,45 @@
 import paddle
 from paddle.io import Dataset
 import numpy
+
 paddle.enable_static()
 
 
 class TestOptimizer(unittest.TestCase):
+
     def test_sgd_optimizer(self):
+
         def check_sgd_optimizer(optimizer_attr):
             init_program = framework.Program()
             program = framework.Program()
             block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr)
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out")
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x,
-                        "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1})
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+            mul_x = block.create_parameter(dtype="float32",
+                                           shape=[5, 10],
+                                           lod_level=0,
+                                           name="mul.x",
+                                           optimize_attr=optimizer_attr)
+            mul_y = block.create_var(dtype="float32",
+                                     shape=[10, 8],
+                                     lod_level=0,
+                                     name="mul.y")
+            mul_out = block.create_var(dtype="float32",
+                                       shape=[5, 8],
+                                       lod_level=0,
+                                       name="mul.out")
+            mean_out = block.create_var(dtype="float32",
+                                        shape=[1],
+                                        lod_level=0,
+                                        name="mean.out")
+            block.append_op(type="mul",
+                            inputs={
+                                "X": mul_x,
+                                "Y": mul_y
+                            },
+                            outputs={"Out": mul_out},
+                            attrs={"x_num_col_dims": 1})
+            block.append_op(type="mean",
+                            inputs={"X": mul_out},
+                            outputs={"Out": mean_out})
             sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
             opts, _ = sgd_optimizer.minimize(mean_out, init_program)
             return opts
@@ -71,31 +81,40 @@ def check_sgd_optimizer(optimizer_attr):
 
 
 class TestOptimizerBackwardApplygrad(unittest.TestCase):
+
     def test_sgd_optimizer(self):
+
         def check_sgd_optimizer(optimizer_attr):
             init_program = framework.Program()
             program = framework.Program()
             block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr)
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out")
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x,
-                        "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1})
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+            mul_x = block.create_parameter(dtype="float32",
+                                           shape=[5, 10],
+                                           lod_level=0,
+                                           name="mul.x",
+                                           optimize_attr=optimizer_attr)
+            mul_y = block.create_var(dtype="float32",
+                                     shape=[10, 8],
+                                     lod_level=0,
+                                     name="mul.y")
+            mul_out = block.create_var(dtype="float32",
+                                       shape=[5, 8],
+                                       lod_level=0,
+                                       name="mul.out")
+            mean_out = block.create_var(dtype="float32",
+                                        shape=[1],
+                                        lod_level=0,
+                                        name="mean.out")
+            block.append_op(type="mul",
+                            inputs={
+                                "X": mul_x,
+                                "Y": mul_y
+                            },
+                            outputs={"Out": mul_out},
+                            attrs={"x_num_col_dims": 1})
+            block.append_op(type="mean",
+                            inputs={"X": mul_out},
+                            outputs={"Out": mean_out})
             sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
             with framework.program_guard(program, init_program):
                 p_g = sgd_optimizer.backward(mean_out)
@@ -112,7 +131,9 @@ def check_sgd_optimizer(optimizer_attr):
 
 
 class TestMomentumOptimizer(unittest.TestCase):
+
     class MockMomentum(optimizer.MomentumOptimizer):
+
         def get_accumulators(self):
             return self._accumulators
 
@@ -123,29 +144,36 @@ def test_vanilla_momentum_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
         learning_rate = 0.01
-        momentum_optimizer = self.MockMomentum(
-            learning_rate=learning_rate, momentum=0.2)
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        momentum_optimizer = self.MockMomentum(learning_rate=learning_rate,
+                                               momentum=0.2)
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
@@ -176,29 +204,37 @@ def test_nesterov_momentum_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         learning_rate = 0.01
-        momentum_optimizer = self.MockMomentum(
-            learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
+        momentum_optimizer = self.MockMomentum(learning_rate=learning_rate,
+                                               momentum=0.2,
+                                               use_nesterov=True)
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
@@ -227,7 +263,9 @@ def test_nesterov_momentum_optimizer(self):
 
 
 class TestAdagradOptimizer(unittest.TestCase):
+
     class MockAdagrad(optimizer.AdagradOptimizer):
+
         def get_accumulators(self):
             return self._accumulators
 
@@ -238,29 +276,36 @@ def test_adagrad_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         learning_rate = 0.01
-        adagrad_optimizer = self.MockAdagrad(
-            learning_rate=learning_rate, epsilon=1.0e-6)
+        adagrad_optimizer = self.MockAdagrad(learning_rate=learning_rate,
+                                             epsilon=1.0e-6)
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
@@ -287,7 +332,9 @@ def test_adagrad_optimizer(self):
 
 
 class TestAdamOptimizer(unittest.TestCase):
+
     class MockAdam(optimizer.AdamOptimizer):
+
         def get_accumulators(self):
             return self._accumulators
 
@@ -301,29 +348,37 @@ def test_adam_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         learning_rate = 0.01
-        adam_optimizer = self.MockAdam(
-            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
+        adam_optimizer = self.MockAdam(learning_rate=learning_rate,
+                                       beta1=0.9,
+                                       beta2=0.999)
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
@@ -352,7 +407,9 @@ def test_adam_optimizer(self):
 
 
 class TestAdamaxOptimizer(unittest.TestCase):
+
     class MockAdamax(optimizer.AdamaxOptimizer):
+
         def get_accumulators(self):
             return self._accumulators
 
@@ -366,29 +423,37 @@ def test_adamax_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         learning_rate = 0.01
-        adamax_optimizer = self.MockAdamax(
-            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
+        adamax_optimizer = self.MockAdamax(learning_rate=learning_rate,
+                                           beta1=0.9,
+                                           beta2=0.999)
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
@@ -417,33 +482,44 @@ def test_adamax_optimizer(self):
 
 
 class TestDpsgdOptimizer(unittest.TestCase):
+
     def test_dpsgd_optimizer(self):
+
         def check_dpsgd_optimizer(optimizer_attr):
             init_program = framework.Program()
             program = framework.Program()
             block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr)
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x,
-                        "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1})
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out")
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-            dpsgd_optimizer = optimizer.DpsgdOptimizer(
-                learning_rate=0.01, clip=100.0, batch_size=16.0, sigma=0.0)
+            mul_x = block.create_parameter(dtype="float32",
+                                           shape=[5, 10],
+                                           lod_level=0,
+                                           name="mul.x",
+                                           optimize_attr=optimizer_attr)
+            mul_y = block.create_var(dtype="float32",
+                                     shape=[10, 8],
+                                     lod_level=0,
+                                     name="mul.y")
+            mul_out = block.create_var(dtype="float32",
+                                       shape=[5, 8],
+                                       lod_level=0,
+                                       name="mul.out")
+            block.append_op(type="mul",
+                            inputs={
+                                "X": mul_x,
+                                "Y": mul_y
+                            },
+                            outputs={"Out": mul_out},
+                            attrs={"x_num_col_dims": 1})
+            mean_out = block.create_var(dtype="float32",
+                                        shape=[1],
+                                        lod_level=0,
+                                        name="mean.out")
+            block.append_op(type="mean",
+                            inputs={"X": mul_out},
+                            outputs={"Out": mean_out})
+            dpsgd_optimizer = optimizer.DpsgdOptimizer(learning_rate=0.01,
+                                                       clip=100.0,
+                                                       batch_size=16.0,
+                                                       sigma=0.0)
             opts, _ = dpsgd_optimizer.minimize(mean_out, init_program)
             return opts
 
@@ -458,7 +534,9 @@ def check_dpsgd_optimizer(optimizer_attr):
 
 
 class TestDecayedAdagradOptimizer(unittest.TestCase):
+
     class MockDecayedAdagrad(optimizer.DecayedAdagradOptimizer):
+
         def get_accumulators(self):
             return self._accumulators
 
@@ -469,26 +547,33 @@ def test_decayed_adagrad_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         learning_rate = 0.01
         decayed_adagrad_optimizer = self.MockDecayedAdagrad(
             learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
@@ -519,7 +604,9 @@ def test_decayed_adagrad_optimizer(self):
 
 
 class TestFtrlOptimizer(unittest.TestCase):
+
     class MockFtrl(optimizer.FtrlOptimizer):
+
         def get_accumulators(self):
             return self._accumulators
 
@@ -533,29 +620,38 @@ def test_ftrl_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         learning_rate = 0.01
-        ftrl_optimizer = self.MockFtrl(
-            learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5)
+        ftrl_optimizer = self.MockFtrl(learning_rate=learning_rate,
+                                       l1=0.0,
+                                       l2=0.0,
+                                       lr_power=-0.5)
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
@@ -584,34 +680,44 @@ def test_ftrl_optimizer(self):
 
 
 class TestLookaheadOptimizer(unittest.TestCase):
+
     def test_lookahead_optimizer(self):
         init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         init_block = init_program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="mul.x",
-            optimize_attr={'learning_rate': 1.1})
-        init_mul_x = init_block.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x",
+                                       optimize_attr={'learning_rate': 1.1})
+        init_mul_x = init_block.create_parameter(dtype="float32",
+                                                 shape=[5, 10],
+                                                 lod_level=0,
+                                                 name="mul.x")
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
 
         sgd = optimizer.SGD(learning_rate=0.01)
         lookahead = optimizer.LookaheadOptimizer(sgd, alpha=0.5, k=5)
@@ -622,83 +728,109 @@ def test_lookahead_optimizer(self):
 
 
 class TestRecomputeOptimizer(unittest.TestCase):
+
     def net(self, return_input=False, with_dropout=False, with_seed=False):
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x")
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
 
         if with_dropout is True:
-            mul_out_drop = block.create_var(
-                dtype="float32",
-                shape=[5, 8],
-                lod_level=0,
-                name="mul.out.dropout")
-            mul_out_mask = block.create_var(
-                dtype="uint8", shape=[5, 8], lod_level=0, name="mul.out.mask")
+            mul_out_drop = block.create_var(dtype="float32",
+                                            shape=[5, 8],
+                                            lod_level=0,
+                                            name="mul.out.dropout")
+            mul_out_mask = block.create_var(dtype="uint8",
+                                            shape=[5, 8],
+                                            lod_level=0,
+                                            name="mul.out.mask")
             if with_seed is True:
-                seed_out = block.create_var(
-                    dtype="int32", shape=[1], name="seed.out")
-
-        b1 = block.create_parameter(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b1")
-        b1_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b1_out")
-        b2 = block.create_parameter(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b2")
-        b2_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b2_out")
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
+                seed_out = block.create_var(dtype="int32",
+                                            shape=[1],
+                                            name="seed.out")
+
+        b1 = block.create_parameter(dtype="float32",
+                                    shape=[5, 8],
+                                    lod_level=0,
+                                    name="b1")
+        b1_out = block.create_var(dtype="float32",
+                                  shape=[5, 8],
+                                  lod_level=0,
+                                  name="b1_out")
+        b2 = block.create_parameter(dtype="float32",
+                                    shape=[5, 8],
+                                    lod_level=0,
+                                    name="b2")
+        b2_out = block.create_var(dtype="float32",
+                                  shape=[5, 8],
+                                  lod_level=0,
+                                  name="b2_out")
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
 
         if with_dropout is True:
             dropout_inputs = {'X': [mul_out]}
             if with_seed is True:
-                block.append_op(
-                    type='seed',
-                    outputs={'Out': seed_out},
-                    attrs={
-                        'deterministic': True,
-                        'rng_name': 'rng0',
-                        'force_cpu': True
-                    })
+                block.append_op(type='seed',
+                                outputs={'Out': seed_out},
+                                attrs={
+                                    'deterministic': True,
+                                    'rng_name': 'rng0',
+                                    'force_cpu': True
+                                })
                 dropout_inputs = {'X': [mul_out], 'Seed': [seed_out]}
 
-            block.append_op(
-                type='dropout',
-                inputs=dropout_inputs,
-                outputs={'Out': [mul_out_drop],
-                         'Mask': [mul_out_mask]},
-                attrs={'dropout_prob': 0.5, })
-            block.append_op(
-                type="elementwise_add",
-                inputs={"X": mul_out_drop,
-                        "Y": b1},
-                outputs={"Out": b1_out})
+            block.append_op(type='dropout',
+                            inputs=dropout_inputs,
+                            outputs={
+                                'Out': [mul_out_drop],
+                                'Mask': [mul_out_mask]
+                            },
+                            attrs={
+                                'dropout_prob': 0.5,
+                            })
+            block.append_op(type="elementwise_add",
+                            inputs={
+                                "X": mul_out_drop,
+                                "Y": b1
+                            },
+                            outputs={"Out": b1_out})
         else:
-            block.append_op(
-                type="elementwise_add",
-                inputs={"X": mul_out,
-                        "Y": b1},
-                outputs={"Out": b1_out})
-
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": b1_out,
-                    "Y": b2},
-            outputs={"Out": b2_out})
-        block.append_op(
-            type="mean", inputs={"X": b2_out}, outputs={"Out": mean_out})
+            block.append_op(type="elementwise_add",
+                            inputs={
+                                "X": mul_out,
+                                "Y": b1
+                            },
+                            outputs={"Out": b1_out})
+
+        block.append_op(type="elementwise_add",
+                        inputs={
+                            "X": b1_out,
+                            "Y": b2
+                        },
+                        outputs={"Out": b2_out})
+        block.append_op(type="mean",
+                        inputs={"X": b2_out},
+                        outputs={"Out": mean_out})
 
         if return_input == True:
             return mul_x, mul_out, b1_out, b2_out, mean_out
@@ -832,11 +964,10 @@ def test_apply_gradients(self):
         recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer)
         recompute_optimizer._set_checkpoints([b1_out])
         # apply backward
-        params_grads = recompute_optimizer.backward(
-            mean_out,
-            startup_program=None,
-            parameter_list=None,
-            no_grad_set=None)
+        params_grads = recompute_optimizer.backward(mean_out,
+                                                    startup_program=None,
+                                                    parameter_list=None,
+                                                    no_grad_set=None)
 
         # apply gradient
         program = mean_out.block.program
@@ -916,13 +1047,13 @@ def test_dropout_with_seed(self):
         def gen_data():
             return {
                 "x": np.random.random(size=(100, 3)).astype('float32'),
-                "y": np.random.randint(
-                    2, size=(100, 1)).astype('int64')
+                "y": np.random.randint(2, size=(100, 1)).astype('int64')
             }
 
         def mlp(input_x, input_y):
-            drop_res = fluid.layers.dropout(
-                input_x, dropout_prob=0.5, name="dropout_with_seed_cpu")
+            drop_res = fluid.layers.dropout(input_x,
+                                            dropout_prob=0.5,
+                                            name="dropout_with_seed_cpu")
             prediction = fluid.layers.fc(input=[drop_res],
                                          size=2,
                                          act='softmax')
@@ -935,8 +1066,9 @@ def mlp(input_x, input_y):
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             with program_guard(main_program, startup_program):
-                input_x = fluid.layers.data(
-                    name="x", shape=[3], dtype='float32')
+                input_x = fluid.layers.data(name="x",
+                                            shape=[3],
+                                            dtype='float32')
                 input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                 drop_res, prediction, cost = mlp(input_x, input_y)
                 sgd = fluid.optimizer.Adam(learning_rate=0.01)
@@ -960,6 +1092,7 @@ def mlp(input_x, input_y):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestRecomputeOptimizerCUDA(unittest.TestCase):
+
     def test_dropout_with_seed(self):
         """
         when we recompute a dropout op, make sure that the recomputed one
@@ -969,13 +1102,13 @@ def test_dropout_with_seed(self):
         def gen_data():
             return {
                 "x": np.random.random(size=(100, 3)).astype('float32'),
-                "y": np.random.randint(
-                    2, size=(100, 1)).astype('int64')
+                "y": np.random.randint(2, size=(100, 1)).astype('int64')
             }
 
         def mlp(input_x, input_y):
-            drop_res = fluid.layers.dropout(
-                input_x, dropout_prob=0.5, name="dropout_with_seed_gpu")
+            drop_res = fluid.layers.dropout(input_x,
+                                            dropout_prob=0.5,
+                                            name="dropout_with_seed_gpu")
             prediction = fluid.layers.fc(input=[drop_res],
                                          size=2,
                                          act='softmax')
@@ -988,8 +1121,9 @@ def mlp(input_x, input_y):
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             with program_guard(main_program, startup_program):
-                input_x = fluid.layers.data(
-                    name="x", shape=[3], dtype='float32')
+                input_x = fluid.layers.data(name="x",
+                                            shape=[3],
+                                            dtype='float32')
                 input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                 drop_res, prediction, cost = mlp(input_x, input_y)
                 sgd = fluid.optimizer.Adam(learning_rate=0.01)
@@ -1011,34 +1145,50 @@ def mlp(input_x, input_y):
 
 
 class TestGradientMergeOptimizer(unittest.TestCase):
+
     def net(self):
         program = framework.Program()
         block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        b1 = block.create_parameter(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b1")
-        b1_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="b1_out")
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": mul_out,
-                    "Y": b1},
-            outputs={"Out": b1_out})
-        block.append_op(
-            type="mean", inputs={"X": b1_out}, outputs={"Out": mean_out})
+        mul_x = block.create_parameter(dtype="float32",
+                                       shape=[5, 10],
+                                       lod_level=0,
+                                       name="mul.x")
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        b1 = block.create_parameter(dtype="float32",
+                                    shape=[5, 8],
+                                    lod_level=0,
+                                    name="b1")
+        b1_out = block.create_var(dtype="float32",
+                                  shape=[5, 8],
+                                  lod_level=0,
+                                  name="b1_out")
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        block.append_op(type="elementwise_add",
+                        inputs={
+                            "X": mul_out,
+                            "Y": b1
+                        },
+                        outputs={"Out": b1_out})
+        block.append_op(type="mean",
+                        inputs={"X": b1_out},
+                        outputs={"Out": mean_out})
         return mean_out
 
     def test_program_desc(self, ):
@@ -1079,9 +1229,9 @@ def test_program_desc(self, ):
 
         # optimize block
         self.assertEqual(len(main_program.block(1).ops), 6)
-        self.assertEqual([op.type for op in main_program.block(1).ops], [
-            'scale', 'scale', 'sgd', 'sgd', 'fill_constant', 'fill_constant'
-        ])
+        self.assertEqual(
+            [op.type for op in main_program.block(1).ops],
+            ['scale', 'scale', 'sgd', 'sgd', 'fill_constant', 'fill_constant'])
 
 
 class TestOptimizerDtype(unittest.TestCase):
@@ -1091,7 +1241,9 @@ class TestOptimizerDtype(unittest.TestCase):
     '''
 
     def check_with_dtype(self, dtype):
+
         class MyLayer(paddle.nn.Layer):
+
             def __init__(self, dtype):
                 super(MyLayer, self).__init__()
                 self._w = self.create_parameter([2, 3], dtype=dtype)
@@ -1132,6 +1284,7 @@ def check_with_opt_state_dict(self, use_save_load=True):
         numpy.random.seed(100)
 
         class SimpleNet(paddle.nn.Layer):
+
             def __init__(self, input_size, output_size):
                 super(SimpleNet, self).__init__()
                 self.linears = paddle.nn.LayerList([
@@ -1149,6 +1302,7 @@ def forward(self, x):
         nums_batch = 10
 
         class RandomDataset(Dataset):
+
             def __init__(self, num_samples):
                 self.num_samples = num_samples
 
@@ -1161,19 +1315,17 @@ def __len__(self):
                 return self.num_samples
 
         dataset = RandomDataset(nums_batch * batch_size)
-        loader = paddle.io.DataLoader(
-            dataset,
-            batch_size=batch_size,
-            shuffle=False,
-            drop_last=True,
-            num_workers=0)
+        loader = paddle.io.DataLoader(dataset,
+                                      batch_size=batch_size,
+                                      shuffle=False,
+                                      drop_last=True,
+                                      num_workers=0)
 
         mse = paddle.nn.MSELoss()
         model = SimpleNet(input_size, output_size)  # 定义模型
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.0001,
-            parameters=model.parameters(),
-            multi_precision=True)  # 定义优化器
+        optimizer = paddle.optimizer.Momentum(learning_rate=0.0001,
+                                              parameters=model.parameters(),
+                                              multi_precision=True)  # 定义优化器
         scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
         model = paddle.amp.decorate(models=model, level='O2')
 
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_for_varbase.py b/python/paddle/fluid/tests/unittests/test_optimizer_for_varbase.py
index b2b133a6b4274..31bbaefd16519 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer_for_varbase.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer_for_varbase.py
@@ -23,6 +23,7 @@
 
 
 class TestOptimizerForVarBase(unittest.TestCase):
+
     def setUp(self):
         self.lr = 0.01
 
@@ -33,8 +34,9 @@ def run_optimizer_step_with_varbase_list_input(self, optimizer):
 
         z = x + y
 
-        opt = optimizer(
-            learning_rate=self.lr, parameters=[x], weight_decay=0.01)
+        opt = optimizer(learning_rate=self.lr,
+                        parameters=[x],
+                        weight_decay=0.01)
 
         z.backward()
         opt.step()
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py
index 7caae211b7bba..30cfa9f17ebcc 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py
@@ -23,6 +23,7 @@
 from paddle.fluid.backward import _append_grad_suffix_
 
 import paddle
+
 paddle.enable_static()
 
 np.random.seed(10)
@@ -81,21 +82,18 @@ def build_net(self, cond_i, use_bf16=False):
         param_x = fluid.layers.create_parameter(
             dtype="float32",
             shape=self.shape,
-            attr=fluid.ParamAttr(
-                learning_rate=self.param_lr, name="param_x"),
+            attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_x"),
             default_initializer=fluid.initializer.NumpyArrayInitializer(self.x))
 
         param_y = fluid.layers.create_parameter(
             dtype="float32",
             shape=self.shape,
-            attr=fluid.ParamAttr(
-                learning_rate=self.param_lr, name="param_y"),
+            attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_y"),
             default_initializer=fluid.initializer.NumpyArrayInitializer(self.y))
         param_z = fluid.layers.create_parameter(
             dtype="float32",
             shape=self.shape,
-            attr=fluid.ParamAttr(
-                learning_rate=self.param_lr, name="param_z"),
+            attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_z"),
             default_initializer=fluid.initializer.NumpyArrayInitializer(self.z))
 
         sum_xy = fluid.layers.elementwise_add(param_x, param_y, name='sum_xy')
@@ -103,18 +101,21 @@ def build_net(self, cond_i, use_bf16=False):
         useless = fluid.layers.fc(param_x, size=1, name='fc_useless')
 
         def cond_true():
-            cond_yz = fluid.layers.elementwise_add(
-                param_y, param_z, name='sum_cond_yz')
+            cond_yz = fluid.layers.elementwise_add(param_y,
+                                                   param_z,
+                                                   name='sum_cond_yz')
             # param_y will not be updated
             param_y.stop_gradient = self.y_no_grad
-            cond_res = fluid.layers.elementwise_add(
-                cond_yz, param_z, name='sum_cond_true')
+            cond_res = fluid.layers.elementwise_add(cond_yz,
+                                                    param_z,
+                                                    name='sum_cond_true')
             cond_useless = fluid.layers.elementwise_mul(param_x, param_y)
             return cond_res
 
         def cond_false():
-            cond_res = fluid.layers.elementwise_add(
-                param_y, param_z, name='sum_cond_false')
+            cond_res = fluid.layers.elementwise_add(param_y,
+                                                    param_z,
+                                                    name='sum_cond_false')
             cond_useless = fluid.layers.elementwise_mul(param_z, param_z)
             return cond_res
 
@@ -229,17 +230,18 @@ def _check_grads(self, use_bf16=False):
                                 res = exe.run(main_program,
                                               fetch_list=fetch_list)
                                 gt_grads = test_net._calc_gradient(cond_i)
-                                gt_params = self._apply_optimize(test_net,
-                                                                 gt_grads)
+                                gt_params = self._apply_optimize(
+                                    test_net, gt_grads)
                                 param_grads = gt_params + gt_grads
                                 for i in range(len(res)):
-                                    np.testing.assert_allclose(res[i],
-                                                               param_grads[i])
+                                    np.testing.assert_allclose(
+                                        res[i], param_grads[i])
 
 
 @unittest.skipIf(not fluid.core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestSGDOptimizer(TestOptimizer):
+
     def test_optimizer_multiblock_except(self):
         with self.assertRaisesRegexp(ValueError,
                                      "var param_y not in this block"):
@@ -256,8 +258,10 @@ class TestAdamOptimizer(TestOptimizer):
     def setUp(self):
         self._init_config()
         beta1, beta2, epsilon = 0.9, 0.999, 1e-8
-        self.optimizer = optimizer.AdamOptimizer(
-            learning_rate=0.01, beta1=beta1, beta2=beta2, epsilon=epsilon)
+        self.optimizer = optimizer.AdamOptimizer(learning_rate=0.01,
+                                                 beta1=beta1,
+                                                 beta2=beta2,
+                                                 epsilon=epsilon)
         self.attr = {
             "beta1": beta1,
             "beta2": beta2,
@@ -282,8 +286,9 @@ def _apply_gradient(self, param, grad, name):
         moment2_out = beta2 * moment2 + (1. - beta2) * np.square(grad)
 
         lr = attr['lr'] * np.sqrt(1. - beta2_pow) / (1. - beta1_pow)
-        param_out = param - lr * (moment1_out / (np.sqrt(moment2_out) + epsilon
-                                                 * np.sqrt(1 - beta2_pow)))
+        param_out = param - lr * (
+            moment1_out /
+            (np.sqrt(moment2_out) + epsilon * np.sqrt(1 - beta2_pow)))
 
         # update hyper-parameter of optimizer
         self.param_attr[name]['beta1_pow'] = beta1_pow * beta1
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
index 2cb6d0be430f1..40afe9248bf9b 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
@@ -78,8 +78,8 @@ def fn_1(opt, avg_loss=None, pred=None, label=None):
 
         def fn_2(opt, avg_loss=None, pred=None, label=None):
             if avg_loss is None:
-                loss = layers.softmax_with_cross_entropy(
-                    logits=pred, label=label)
+                loss = layers.softmax_with_cross_entropy(logits=pred,
+                                                         label=label)
                 avg_loss = layers.mean(loss, name='mean_softmax_loss')
             opt.minimize(avg_loss)
             return avg_loss
@@ -96,14 +96,14 @@ def fn_2(opt, avg_loss=None, pred=None, label=None):
         mod_two = layers.elementwise_mod(id, two) == 0
 
         if loss_in_switch:
-            avg_loss = layers.case([(
-                mod_two, lambda: fn_1(adam, None, prediction, label))],
-                                   lambda: fn_2(sgd, None, prediction, label))
+            avg_loss = layers.case(
+                [(mod_two, lambda: fn_1(adam, None, prediction, label))],
+                lambda: fn_2(sgd, None, prediction, label))
         else:
             loss_1 = layers.cross_entropy(input=prediction, label=label)
             avg_loss_1 = layers.mean(loss_1)
-            loss_2 = layers.softmax_with_cross_entropy(
-                logits=prediction, label=label)
+            loss_2 = layers.softmax_with_cross_entropy(logits=prediction,
+                                                       label=label)
             avg_loss_2 = layers.mean(loss_2)
             avg_loss = layers.case([(mod_two, lambda: fn_1(adam, avg_loss_1))],
                                    lambda: fn_2(sgd, avg_loss_2))
@@ -127,6 +127,7 @@ def fn_2(opt, avg_loss=None, pred=None, label=None):
 
 
 class DygraphLayer(fluid.dygraph.Layer):
+
     def __init__(self):
         super(DygraphLayer, self).__init__()
         self.fc_1 = fluid.dygraph.nn.Linear(
@@ -136,7 +137,8 @@ def __init__(self):
             param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
                 value=0.99)),
             bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
-                value=0.5)), )
+                value=0.5)),
+        )
 
         self.fc_2 = fluid.dygraph.nn.Linear(
             FC_SIZE,
@@ -159,8 +161,8 @@ def dynamic(train_data, use_cuda=False, use_parallel_exe=False):
         fluid.default_startup_program().random_seed = SEED
         fluid.default_main_program().random_seed = SEED
         dy_layer = DygraphLayer()
-        adam = fluid.optimizer.Adam(
-            learning_rate=LR, parameter_list=dy_layer.parameters())
+        adam = fluid.optimizer.Adam(learning_rate=LR,
+                                    parameter_list=dy_layer.parameters())
         sgd = fluid.optimizer.SGD(learning_rate=LR,
                                   parameter_list=dy_layer.parameters())
 
@@ -176,8 +178,8 @@ def dynamic(train_data, use_cuda=False, use_parallel_exe=False):
                 loss.backward()
                 adam.minimize(loss)
             else:
-                softmax_loss = layers.softmax_with_cross_entropy(prediction,
-                                                                 var_label)
+                softmax_loss = layers.softmax_with_cross_entropy(
+                    prediction, var_label)
                 loss = layers.mean(softmax_loss)
                 loss.backward()
                 sgd.minimize(loss)
@@ -199,8 +201,9 @@ def random_input(self,
         np.random.seed(seed)
         image_np = np.random.random(size=image_shape).astype('float32')
         np.random.seed(seed)
-        label_np = np.random.randint(
-            low=0, high=CLASS_NUM - 1, size=label_shape).astype('int64')
+        label_np = np.random.randint(low=0,
+                                     high=CLASS_NUM - 1,
+                                     size=label_shape).astype('int64')
         return image_np, label_np
 
     def init_train_data(self):
@@ -223,13 +226,13 @@ def test_optimzier_in_switch(self):
                 np.allclose(pre_1, pre_2),
                 msg='static prediction is {}\ndynamic prediction is {}'.format(
                     pre_1, pre_2))
-            self.assertTrue(
-                np.allclose(loss_1, loss_2),
-                msg='static loss is {}\ndynamic loss is {}'.format(loss_1,
-                                                                   loss_2))
+            self.assertTrue(np.allclose(loss_1, loss_2),
+                            msg='static loss is {}\ndynamic loss is {}'.format(
+                                loss_1, loss_2))
 
 
 class TestMultiOptimizersMultiCardsError(unittest.TestCase):
+
     def test_error(self):
         startup_program = Program()
         main_program = Program()
@@ -270,15 +273,15 @@ def fn_2(opt, avg_loss):
             # to use multi cards ** only on CPU ** not GPU to reduce CI time.
             os.environ['CPU_NUM'] = str(2)
 
-            pe_exe = fluid.ParallelExecutor(
-                use_cuda=use_cuda,
-                main_program=main_program,
-                loss_name=avg_loss.name)
+            pe_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            main_program=main_program,
+                                            loss_name=avg_loss.name)
             num_devices = pe_exe.device_count
 
             def not_implemented_error():
                 pe_exe.run(feed={
-                    'X': np.random.random(size=[64, 10]).astype('float32'),
+                    'X':
+                    np.random.random(size=[64, 10]).astype('float32'),
                 },
                            fetch_list=[avg_loss.name])
 
diff --git a/python/paddle/fluid/tests/unittests/test_outer.py b/python/paddle/fluid/tests/unittests/test_outer.py
index 2c4d64344cfc7..cffe8a895c740 100644
--- a/python/paddle/fluid/tests/unittests/test_outer.py
+++ b/python/paddle/fluid/tests/unittests/test_outer.py
@@ -23,21 +23,26 @@
 
 
 class TestMultiplyApi(unittest.TestCase):
+
     def _run_static_graph_case(self, x_data, y_data):
         with program_guard(Program(), Program()):
             paddle.enable_static()
-            x = paddle.static.data(
-                name='x', shape=x_data.shape, dtype=x_data.dtype)
-            y = paddle.static.data(
-                name='y', shape=y_data.shape, dtype=y_data.dtype)
+            x = paddle.static.data(name='x',
+                                   shape=x_data.shape,
+                                   dtype=x_data.dtype)
+            y = paddle.static.data(name='y',
+                                   shape=y_data.shape,
+                                   dtype=y_data.dtype)
             res = paddle.outer(x, y)
 
-            place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            place = paddle.CUDAPlace(
+                0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             outs = exe.run(paddle.static.default_main_program(),
-                           feed={'x': x_data,
-                                 'y': y_data},
+                           feed={
+                               'x': x_data,
+                               'y': y_data
+                           },
                            fetch_list=[res])
             res = outs[0]
             return res
@@ -89,20 +94,18 @@ def func_test_multiply(self):
         self.assertTrue(np.allclose(res, np.outer(x_data, y_data), rtol=1e4))
 
         # test dynamic computation graph: 2-d array Complex
-        x_data = np.random.rand(20,
-                                50).astype(np.float64) + 1J * np.random.rand(
-                                    20, 50).astype(np.float64)
-        y_data = np.random.rand(50).astype(np.float64) + 1J * np.random.rand(
-            50).astype(np.float64)
+        x_data = np.random.rand(20, 50).astype(
+            np.float64) + 1J * np.random.rand(20, 50).astype(np.float64)
+        y_data = np.random.rand(50).astype(
+            np.float64) + 1J * np.random.rand(50).astype(np.float64)
         res = self._run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.outer(x_data, y_data)))
 
         # test dynamic computation graph: 3-d array Complex
-        x_data = np.random.rand(5, 10,
-                                10).astype(np.float64) + 1J * np.random.rand(
-                                    5, 10, 10).astype(np.float64)
-        y_data = np.random.rand(2, 10).astype(np.float64) + 1J * np.random.rand(
-            2, 10).astype(np.float64)
+        x_data = np.random.rand(5, 10, 10).astype(
+            np.float64) + 1J * np.random.rand(5, 10, 10).astype(np.float64)
+        y_data = np.random.rand(2, 10).astype(
+            np.float64) + 1J * np.random.rand(2, 10).astype(np.float64)
         res = self._run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.outer(x_data, y_data)))
 
@@ -113,6 +116,7 @@ def test_multiply(self):
 
 
 class TestMultiplyError(unittest.TestCase):
+
     def func_test_errors(self):
         # test static computation graph: dtype can not be int8
         paddle.enable_static()
@@ -130,7 +134,7 @@ def func_test_errors(self):
         y = paddle.to_tensor(y_data)
         self.assertRaises(RuntimeError, paddle.outer, x, y)
 
-        # test dynamic computation graph: dtype must be same	
+        # test dynamic computation graph: dtype must be same
         x_data = np.random.randn(200).astype(np.float32)
         y_data = np.random.randn(200).astype(np.float64)
         x = paddle.to_tensor(x_data)
diff --git a/python/paddle/fluid/tests/unittests/test_overlap_add_op.py b/python/paddle/fluid/tests/unittests/test_overlap_add_op.py
index 7af67d01b573e..e04db251de6d2 100644
--- a/python/paddle/fluid/tests/unittests/test_overlap_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_overlap_add_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -70,6 +70,7 @@ def overlap_add(x, hop_length, axis=-1):
 
 
 class TestOverlapAddOp(OpTest):
+
     def setUp(self):
         self.op_type = "overlap_add"
         self.shape, self.type, self.attrs = self.initTestCase()
@@ -99,6 +100,7 @@ def test_check_grad_normal(self):
 
 
 class TestCase1(TestOverlapAddOp):
+
     def initTestCase(self):
         input_shape = (3, 50)
         input_type = 'float64'
@@ -110,6 +112,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestOverlapAddOp):
+
     def initTestCase(self):
         input_shape = (2, 40, 5)
         input_type = 'float64'
@@ -121,6 +124,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestOverlapAddOp):
+
     def initTestCase(self):
         input_shape = (5, 40, 2)
         input_type = 'float64'
@@ -132,6 +136,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestOverlapAddOp):
+
     def initTestCase(self):
         input_shape = (3, 5, 12, 8)
         input_type = 'float64'
@@ -143,6 +148,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestOverlapAddOp):
+
     def initTestCase(self):
         input_shape = (8, 12, 5, 3)
         input_type = 'float64'
diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
index 1da94ee4fca95..0f43ddbd8fc82 100644
--- a/python/paddle/fluid/tests/unittests/test_pad2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
@@ -20,6 +20,7 @@
 
 
 class TestPad2dOp(OpTest):
+
     def setUp(self):
         self.pad_value = 0.0
         self.variable_paddings = False
@@ -29,11 +30,11 @@ def setUp(self):
         self.attrs = {}
         if self.variable_paddings:
             self.attrs['paddings'] = []
-            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
-                "int32")
+            self.inputs['Paddings'] = np.array(
+                self.paddings).flatten().astype("int32")
         else:
-            self.attrs['paddings'] = np.array(self.paddings).flatten().astype(
-                "int32")
+            self.attrs['paddings'] = np.array(
+                self.paddings).flatten().astype("int32")
         self.attrs['pad_value'] = self.pad_value
         self.attrs['mode'] = self.mode
         self.attrs['data_format'] = self.data_format
@@ -67,6 +68,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestPad2dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.paddings = [0, 1, 2, 3]
@@ -75,6 +77,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestPad2dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.paddings = [0, 1, 2, 3]
@@ -83,6 +86,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestPad2dOp):
+
     def initTestCase(self):
         self.shape = (2, 4, 4, 4)
         self.paddings = [0, 1, 2, 3]
@@ -91,6 +95,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestPad2dOp):
+
     def initTestCase(self):
         self.shape = (2, 4, 4, 4)
         self.paddings = [0, 1, 2, 3]
@@ -99,6 +104,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestPad2dOp):
+
     def initTestCase(self):
         self.shape = (2, 4, 4, 4)
         self.paddings = [0, 1, 2, 3]
@@ -108,6 +114,7 @@ def initTestCase(self):
 
 
 class TestCase6(TestPad2dOp):
+
     def initTestCase(self):
         self.shape = (2, 4, 4, 4)
         self.paddings = [0, 1, 2, 3]
@@ -118,6 +125,7 @@ def initTestCase(self):
 
 
 class TestCase7(TestPad2dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.paddings = [0, 1, 2, 3]
@@ -127,6 +135,7 @@ def initTestCase(self):
 
 
 class TestPad2dOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.random((2, 2, 2, 2)).astype("float32")
@@ -136,8 +145,9 @@ def test_Variable():
 
             self.assertRaises(TypeError, test_Variable)
 
-            data = fluid.data(
-                name='data', shape=[None, 3, 20, 20], dtype='float16')
+            data = fluid.data(name='data',
+                              shape=[None, 3, 20, 20],
+                              dtype='float16')
             fluid.layers.pad2d(input=data, paddings=[1, 1, 1, 1])
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
index eabff5f0021c5..b277ebbb75d16 100644
--- a/python/paddle/fluid/tests/unittests/test_pad3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py
@@ -24,6 +24,7 @@
 
 
 class TestPad3dOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.value = 0.0
@@ -34,11 +35,11 @@ def setUp(self):
         self.attrs = {}
         if self.variable_paddings:
             self.attrs['paddings'] = []
-            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
-                "int32")
+            self.inputs['Paddings'] = np.array(
+                self.paddings).flatten().astype("int32")
         else:
-            self.attrs['paddings'] = np.array(self.paddings).flatten().astype(
-                "int32")
+            self.attrs['paddings'] = np.array(
+                self.paddings).flatten().astype("int32")
         self.attrs['value'] = self.value
         self.attrs['mode'] = self.mode
         self.attrs['data_format'] = self.data_format
@@ -87,6 +88,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.paddings = [0, 1, 2, 3, 4, 5]
@@ -97,6 +99,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.paddings = [1, 1, 1, 1, 1, 1]
@@ -107,6 +110,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.paddings = [0, 1, 1, 0, 2, 3]
@@ -116,6 +120,7 @@ def initTestCase(self):
 
 
 class TestCase4(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (4, 4, 4, 4, 4)
         self.paddings = [0, 1, 2, 1, 2, 3]
@@ -125,6 +130,7 @@ def initTestCase(self):
 
 
 class TestCase5(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.paddings = [0, 1, 2, 3, 2, 1]
@@ -134,6 +140,7 @@ def initTestCase(self):
 
 
 class TestCase6(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (4, 4, 4, 4, 4)
         self.paddings = [5, 4, 2, 1, 2, 3]
@@ -143,6 +150,7 @@ def initTestCase(self):
 
 
 class TestCase7(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.paddings = [0, 1, 2, 3, 2, 1]
@@ -152,6 +160,7 @@ def initTestCase(self):
 
 
 class TestCase8(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (4, 4, 4, 4, 4)
         self.paddings = [0, 1, 2, 1, 2, 3]
@@ -161,6 +170,7 @@ def initTestCase(self):
 
 
 class TestCase9(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.paddings = [0, 1, 2, 3, 4, 5]
@@ -171,6 +181,7 @@ def initTestCase(self):
 
 
 class TestCase10(TestPad3dOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.paddings = [0, 1, 2, 3, 4, 5]
@@ -181,6 +192,7 @@ def initTestCase(self):
 
 
 class TestPadAPI(unittest.TestCase):
+
     def setUp(self):
         self.places = [paddle.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -223,10 +235,14 @@ def check_static_result_2(self, place):
                               feed={"x": input_data},
                               fetch_list=[result1, result2])
 
-            np_out1 = self._get_numpy_out(
-                input_data, pad, mode, data_format="NCDHW")
-            np_out2 = self._get_numpy_out(
-                input_data, pad, mode, data_format="NDHWC")
+            np_out1 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          data_format="NCDHW")
+            np_out2 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          data_format="NDHWC")
             self.assertTrue(np.allclose(fetches[0], np_out1))
             self.assertTrue(np.allclose(fetches[1], np_out2))
 
@@ -245,10 +261,14 @@ def check_static_result_3(self, place):
                               feed={"x": input_data},
                               fetch_list=[result1, result2])
 
-            np_out1 = self._get_numpy_out(
-                input_data, pad, mode, data_format="NCDHW")
-            np_out2 = self._get_numpy_out(
-                input_data, pad, mode, data_format="NDHWC")
+            np_out1 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          data_format="NCDHW")
+            np_out2 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          data_format="NDHWC")
             self.assertTrue(np.allclose(fetches[0], np_out1))
             self.assertTrue(np.allclose(fetches[1], np_out2))
 
@@ -267,10 +287,14 @@ def check_static_result_4(self, place):
                               feed={"x": input_data},
                               fetch_list=[result1, result2])
 
-            np_out1 = self._get_numpy_out(
-                input_data, pad, mode, data_format="NCDHW")
-            np_out2 = self._get_numpy_out(
-                input_data, pad, mode, data_format="NDHWC")
+            np_out1 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          data_format="NCDHW")
+            np_out2 = self._get_numpy_out(input_data,
+                                          pad,
+                                          mode,
+                                          data_format="NDHWC")
             self.assertTrue(np.allclose(fetches[0], np_out1))
             self.assertTrue(np.allclose(fetches[1], np_out2))
 
@@ -351,12 +375,21 @@ def test_dygraph_1(self):
         mode = "constant"
         value = 100
         input_data = np.random.rand(*input_shape).astype(np.float32)
-        np_out1 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NCDHW")
-        np_out2 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NDHWC")
-        np_out3 = self._get_numpy_out(
-            input_data, pad_3, mode, value, data_format="NCDHW")
+        np_out1 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NCDHW")
+        np_out2 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NDHWC")
+        np_out3 = self._get_numpy_out(input_data,
+                                      pad_3,
+                                      mode,
+                                      value,
+                                      data_format="NCDHW")
         tensor_data = paddle.to_tensor(input_data)
 
         y1 = F.pad(tensor_data,
@@ -387,12 +420,21 @@ def test_dygraph_2(self):
         mode = "constant"
         value = 100
         input_data = np.random.rand(*input_shape).astype(np.float32)
-        np_out1 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NCHW")
-        np_out2 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NHWC")
-        np_out3 = self._get_numpy_out(
-            input_data, pad_3, mode, value, data_format="NCHW")
+        np_out1 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NCHW")
+        np_out2 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NHWC")
+        np_out3 = self._get_numpy_out(input_data,
+                                      pad_3,
+                                      mode,
+                                      value,
+                                      data_format="NCHW")
 
         tensor_data = paddle.to_tensor(input_data)
         tensor_pad = paddle.to_tensor(pad, dtype="int32")
@@ -425,12 +467,21 @@ def test_dygraph_3(self):
         mode = "constant"
         value = 100
         input_data = np.random.rand(*input_shape).astype(np.float32)
-        np_out1 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NCL")
-        np_out2 = self._get_numpy_out(
-            input_data, pad, mode, value, data_format="NLC")
-        np_out3 = self._get_numpy_out(
-            input_data, pad_3, mode, value, data_format="NCL")
+        np_out1 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NCL")
+        np_out2 = self._get_numpy_out(input_data,
+                                      pad,
+                                      mode,
+                                      value,
+                                      data_format="NLC")
+        np_out3 = self._get_numpy_out(input_data,
+                                      pad_3,
+                                      mode,
+                                      value,
+                                      data_format="NCL")
         tensor_data = paddle.to_tensor(input_data)
         tensor_pad = paddle.to_tensor(pad, dtype="int32")
 
@@ -456,6 +507,7 @@ def test_dygraph_3(self):
 
 
 class TestPad1dAPI(unittest.TestCase):
+
     def _get_numpy_out(self,
                        input_data,
                        pad,
@@ -503,42 +555,53 @@ def test_class(self):
             pad_reflection = nn.Pad1D(padding=pad, mode="reflect")
             pad_replication = nn.Pad1D(padding=pad, mode="replicate")
             pad_constant = nn.Pad1D(padding=pad, mode="constant", value=value)
-            pad_constant_int = nn.Pad1D(
-                padding=pad_int, mode="constant", value=value)
+            pad_constant_int = nn.Pad1D(padding=pad_int,
+                                        mode="constant",
+                                        value=value)
             pad_circular = nn.Pad1D(padding=pad, mode="circular")
 
             data = paddle.to_tensor(input_data)
 
             output = pad_reflection(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "reflect", data_format="NCL")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "reflect",
+                                         data_format="NCL")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_replication(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "replicate", data_format="NCL")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "replicate",
+                                         data_format="NCL")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_constant(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "constant", value=value, data_format="NCL")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "constant",
+                                         value=value,
+                                         data_format="NCL")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_constant_int(data)
-            np_out = self._get_numpy_out(
-                input_data, [pad_int] * 2,
-                "constant",
-                value=value,
-                data_format="NCL")
+            np_out = self._get_numpy_out(input_data, [pad_int] * 2,
+                                         "constant",
+                                         value=value,
+                                         data_format="NCL")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_circular(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "circular", value=value, data_format="NCL")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "circular",
+                                         value=value,
+                                         data_format="NCL")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
 
 class TestPad2dAPI(unittest.TestCase):
+
     def _get_numpy_out(self,
                        input_data,
                        pad,
@@ -588,42 +651,52 @@ def test_class(self):
             pad_reflection = nn.Pad2D(padding=pad, mode="reflect")
             pad_replication = nn.Pad2D(padding=pad, mode="replicate")
             pad_constant = nn.Pad2D(padding=pad, mode="constant", value=value)
-            pad_constant_int = nn.Pad2D(
-                padding=pad_int, mode="constant", value=value)
+            pad_constant_int = nn.Pad2D(padding=pad_int,
+                                        mode="constant",
+                                        value=value)
             pad_circular = nn.Pad2D(padding=pad, mode="circular")
 
             data = paddle.to_tensor(input_data)
 
             output = pad_reflection(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "reflect", data_format="NCHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "reflect",
+                                         data_format="NCHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_replication(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "replicate", data_format="NCHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "replicate",
+                                         data_format="NCHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_constant(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "constant", value=value, data_format="NCHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "constant",
+                                         value=value,
+                                         data_format="NCHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_constant_int(data)
-            np_out = self._get_numpy_out(
-                input_data, [pad_int] * 4,
-                "constant",
-                value=value,
-                data_format="NCHW")
+            np_out = self._get_numpy_out(input_data, [pad_int] * 4,
+                                         "constant",
+                                         value=value,
+                                         data_format="NCHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_circular(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "circular", data_format="NCHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "circular",
+                                         data_format="NCHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
 
 class TestPad3dAPI(unittest.TestCase):
+
     def _get_numpy_out(self,
                        input_data,
                        pad,
@@ -675,38 +748,47 @@ def test_class(self):
             pad_reflection = nn.Pad3D(padding=pad, mode="reflect")
             pad_replication = nn.Pad3D(padding=pad, mode="replicate")
             pad_constant = nn.Pad3D(padding=pad, mode="constant", value=value)
-            pad_constant_int = nn.Pad3D(
-                padding=pad_int, mode="constant", value=value)
+            pad_constant_int = nn.Pad3D(padding=pad_int,
+                                        mode="constant",
+                                        value=value)
             pad_circular = nn.Pad3D(padding=pad, mode="circular")
 
             data = paddle.to_tensor(input_data)
 
             output = pad_reflection(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "reflect", data_format="NCDHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "reflect",
+                                         data_format="NCDHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_replication(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "replicate", data_format="NCDHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "replicate",
+                                         data_format="NCDHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_constant(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "constant", value=value, data_format="NCDHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "constant",
+                                         value=value,
+                                         data_format="NCDHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_constant_int(data)
-            np_out = self._get_numpy_out(
-                input_data, [pad_int] * 6,
-                "constant",
-                value=value,
-                data_format="NCDHW")
+            np_out = self._get_numpy_out(input_data, [pad_int] * 6,
+                                         "constant",
+                                         value=value,
+                                         data_format="NCDHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_circular(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "circular", data_format="NCDHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "circular",
+                                         data_format="NCDHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
     def test_pad_tensor(self):
@@ -717,30 +799,38 @@ def test_pad_tensor(self):
             pad_tensor = paddle.to_tensor(pad)
             input_data = np.random.rand(*input_shape).astype(np.float32)
 
-            pad_reflection_ncdhw = nn.Pad3D(
-                padding=pad_tensor, mode="reflect", data_format="NCDHW")
-            pad_reflection_ndhwc = nn.Pad3D(
-                padding=pad_tensor, mode="reflect", data_format="NDHWC")
+            pad_reflection_ncdhw = nn.Pad3D(padding=pad_tensor,
+                                            mode="reflect",
+                                            data_format="NCDHW")
+            pad_reflection_ndhwc = nn.Pad3D(padding=pad_tensor,
+                                            mode="reflect",
+                                            data_format="NDHWC")
             data = paddle.to_tensor(input_data)
 
             output = pad_reflection_ncdhw(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "reflect", data_format="NCDHW")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "reflect",
+                                         data_format="NCDHW")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
             output = pad_reflection_ndhwc(data)
-            np_out = self._get_numpy_out(
-                input_data, pad, "reflect", data_format="NDHWC")
+            np_out = self._get_numpy_out(input_data,
+                                         pad,
+                                         "reflect",
+                                         data_format="NDHWC")
             self.assertTrue(np.allclose(output.numpy(), np_out))
 
 
 class TestPad3dOpError(unittest.TestCase):
+
     def setUp(self):
         self.places = [paddle.CPUPlace()]
         if core.is_compiled_with_cuda():
             self.places.append(paddle.CUDAPlace(0))
 
     def test_errors(self):
+
         def test_variable():
             input_shape = (1, 2, 3, 4, 5)
             data = np.random.rand(*input_shape).astype(np.float32)
@@ -806,12 +896,14 @@ def test_replicate_1():
 
 
 class TestPadDataformatError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_ncl():
             input_shape = (1, 2, 3, 4)
             pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
-            data = np.arange(
-                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            data = np.arange(np.prod(input_shape),
+                             dtype=np.float64).reshape(input_shape) + 1
             my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCL")
             data = paddle.to_tensor(data)
             result = my_pad(data)
@@ -819,8 +911,8 @@ def test_ncl():
         def test_nchw():
             input_shape = (1, 2, 4)
             pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
-            data = np.arange(
-                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
+            data = np.arange(np.prod(input_shape),
+                             dtype=np.float64).reshape(input_shape) + 1
             my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCHW")
             data = paddle.to_tensor(data)
             result = my_pad(data)
@@ -828,10 +920,11 @@ def test_nchw():
         def test_ncdhw():
             input_shape = (1, 2, 3, 4)
             pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32'))
-            data = np.arange(
-                np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1
-            my_pad = nn.Pad1D(
-                padding=pad, mode="replicate", data_format="NCDHW")
+            data = np.arange(np.prod(input_shape),
+                             dtype=np.float64).reshape(input_shape) + 1
+            my_pad = nn.Pad1D(padding=pad,
+                              mode="replicate",
+                              data_format="NCDHW")
             data = paddle.to_tensor(data)
             result = my_pad(data)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py
index 41257895a739f..882b3c3b42afb 100644
--- a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py
+++ b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py
@@ -22,6 +22,7 @@
 
 
 class TestPadConstantLikeOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = "pad_constant_like"
@@ -32,10 +33,11 @@ def setUp(self):
         self.attrs = {}
         self.attrs['pad_value'] = self.pad_value
         self.outputs = {
-            'Out': np.pad(self.inputs['Y'],
-                          self.paddings,
-                          mode='constant',
-                          constant_values=self.pad_value)
+            'Out':
+            np.pad(self.inputs['Y'],
+                   self.paddings,
+                   mode='constant',
+                   constant_values=self.pad_value)
         }
 
     def test_check_output(self):
@@ -52,6 +54,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestPadConstantLikeOp):
+
     def initTestCase(self):
         self.x_shape = (4, 3, 4, 5)
         self.y_shape = (2, 3, 4, 5)
@@ -60,6 +63,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestPadConstantLikeOp):
+
     def initTestCase(self):
         self.x_shape = (4, 3, 4, 10)
         self.y_shape = (2, 3, 2, 10)
@@ -68,35 +72,38 @@ def initTestCase(self):
 
 
 class TestPadConstantLikeOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x_data = np.random.random((2, 2, 2, 2)).astype("float32")
             y_data = np.random.random((2, 2, 2, 2)).astype("float32")
 
             def test_Variable_x():
-                var_y = fluid.data(
-                    name="data_y", shape=[2, 2, 2, 2], dtype="float32")
+                var_y = fluid.data(name="data_y",
+                                   shape=[2, 2, 2, 2],
+                                   dtype="float32")
                 fluid.layers.pad_constant_like(x=x_data, y=var_y)
 
             self.assertRaises(TypeError, test_Variable_x)
 
             def test_Variable_y():
-                var_x = fluid.data(
-                    name="data_x", shape=[2, 2, 2, 2], dtype="float32")
+                var_x = fluid.data(name="data_x",
+                                   shape=[2, 2, 2, 2],
+                                   dtype="float32")
                 fluid.layers.pad_constant_like(x=var_x, y=y_data)
 
             self.assertRaises(TypeError, test_Variable_y)
 
 
 class TestOutDtype(unittest.TestCase):
+
     def test_dtype(self):
         api_fn = fluid.layers.pad_constant_like
-        check_out_dtype(
-            api_fn,
-            in_specs=[([2, 3, 2, 3], 'float64'), ([1, 3, 1, 3], )],
-            expect_dtypes=['float32', 'float64', 'int32', 'int64'],
-            target_index=1,
-            pad_value=0.)
+        check_out_dtype(api_fn,
+                        in_specs=[([2, 3, 2, 3], 'float64'), ([1, 3, 1, 3], )],
+                        expect_dtypes=['float32', 'float64', 'int32', 'int64'],
+                        target_index=1,
+                        pad_value=0.)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py
index a62d19d1c0ccf..30044fec755a3 100644
--- a/python/paddle/fluid/tests/unittests/test_pad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad_op.py
@@ -23,19 +23,23 @@
 
 
 class TestPadOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.dtype = self.get_dtype()
         self.op_type = "pad"
-        self.inputs = {'X': np.random.random(self.shape).astype(self.dtype), }
+        self.inputs = {
+            'X': np.random.random(self.shape).astype(self.dtype),
+        }
         self.attrs = {}
         self.attrs['paddings'] = np.array(self.paddings).flatten()
         self.attrs['pad_value'] = self.pad_value
         self.outputs = {
-            'Out': np.pad(self.inputs['X'],
-                          self.paddings,
-                          mode='constant',
-                          constant_values=self.pad_value)
+            'Out':
+            np.pad(self.inputs['X'],
+                   self.paddings,
+                   mode='constant',
+                   constant_values=self.pad_value)
         }
 
     def get_dtype(self):
@@ -54,6 +58,7 @@ def initTestCase(self):
 
 
 class TestCase1(TestPadOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.paddings = [(0, 1), (2, 3), (2, 1), (1, 1)]
@@ -61,6 +66,7 @@ def initTestCase(self):
 
 
 class TestCase2(TestPadOp):
+
     def initTestCase(self):
         self.shape = (5, 5, 5)
         self.paddings = [(0, 0), (0, 0), (1, 2)]
@@ -68,6 +74,7 @@ def initTestCase(self):
 
 
 class TestCase3(TestPadOp):
+
     def initTestCase(self):
         self.shape = (100)
         self.paddings = [(0, 1)]
@@ -78,9 +85,11 @@ def initTestCase(self):
 
 
 def create_test_fp16(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestPadFp16(parent):
+
         def get_dtype(self):
             return np.float16
 
@@ -99,6 +108,7 @@ def test_check_grad_normal(self):
 
 
 class TestPadOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.random((2, 2)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
index 23f6b3d646b44..43d4276905390 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
@@ -23,6 +23,7 @@
 
 
 def _dygraph_guard_(func):
+
     def __impl__(*args, **kwargs):
         if paddle.in_dynamic_mode():
             return func(*args, **kwargs)
@@ -42,6 +43,7 @@ def random_var(size, low=-1, high=1, dtype='float32'):
 
 
 class TestDygraphDoubleGrad(TestCase):
+
     def setUp(self):
         self.sort_sum_gradient = False
         self.shape = [5, 10]
@@ -54,14 +56,13 @@ def grad(self,
              retain_graph=None,
              create_graph=False,
              allow_unused=False):
-        return paddle.grad(
-            outputs=outputs,
-            inputs=inputs,
-            grad_outputs=grad_outputs,
-            no_grad_vars=no_grad_vars,
-            retain_graph=retain_graph,
-            create_graph=create_graph,
-            allow_unused=allow_unused)
+        return paddle.grad(outputs=outputs,
+                           inputs=inputs,
+                           grad_outputs=grad_outputs,
+                           no_grad_vars=no_grad_vars,
+                           retain_graph=retain_graph,
+                           create_graph=create_graph,
+                           allow_unused=allow_unused)
 
     @dygraph_guard
     def func_exception(self):
@@ -87,8 +88,8 @@ def func_exception(self):
                       [random_var(shape)], [random_var(shape)])
 
         with self.assertRaises(AssertionError):
-            self.grad(
-                [random_var(shape)], [random_var(shape)], no_grad_vars=[1])
+            self.grad([random_var(shape)], [random_var(shape)],
+                      no_grad_vars=[1])
 
         with self.assertRaises(AssertionError):
             self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
@@ -105,24 +106,27 @@ def func_simple_example(self):
         y = x + 1
 
         for create_graph in [False, True]:
-            dx, = self.grad(
-                [x], [x], create_graph=create_graph, retain_graph=True)
+            dx, = self.grad([x], [x],
+                            create_graph=create_graph,
+                            retain_graph=True)
             self.assertEqual(dx.shape, x.shape)
             self.assertTrue(np.all(dx.numpy() == 1))
             self.assertNotEqual(dx.stop_gradient, create_graph)
 
-            dx_mul_2, = self.grad(
-                [y, x], [x], create_graph=create_graph, retain_graph=True)
+            dx_mul_2, = self.grad([y, x], [x],
+                                  create_graph=create_graph,
+                                  retain_graph=True)
             self.assertEqual(dx_mul_2.shape, x.shape)
             self.assertTrue(np.all(dx_mul_2.numpy() == 2))
             self.assertNotEqual(dx_mul_2.stop_gradient, create_graph)
 
-            none_grad, = self.grad(
-                [x], [y], create_graph=create_graph, allow_unused=True)
+            none_grad, = self.grad([x], [y],
+                                   create_graph=create_graph,
+                                   allow_unused=True)
             self.assertTrue(none_grad is None)
 
-            grad_with_none_and_not_none, = self.grad(
-                [x, y], [y], create_graph=create_graph)
+            grad_with_none_and_not_none, = self.grad([x, y], [y],
+                                                     create_graph=create_graph)
             self.assertTrue(grad_with_none_and_not_none.shape, x.shape)
             self.assertTrue(np.all(grad_with_none_and_not_none.numpy() == 1))
             self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
@@ -141,10 +145,11 @@ def func_none_one_initial_gradient(self):
 
         half_numel = int(numel / 2)
         half_x_positive = np.random.uniform(low=1, high=2, size=[half_numel])
-        half_x_negative = np.random.uniform(
-            low=-2, high=-1, size=[numel - half_numel])
-        x_np = np.array(list(half_x_positive) + list(half_x_negative)).astype(
-            'float32')
+        half_x_negative = np.random.uniform(low=-2,
+                                            high=-1,
+                                            size=[numel - half_numel])
+        x_np = np.array(list(half_x_positive) +
+                        list(half_x_negative)).astype('float32')
         np.random.shuffle(x_np)
 
         x = fluid.dygraph.to_variable(x_np)
@@ -173,12 +178,11 @@ def func_none_one_initial_gradient(self):
         for grad_y in [random_grad_y]:
             for grad_z in [random_grad_z]:
                 for create_graph in [False, True]:
-                    dx_actual, = self.grad(
-                        outputs=[y, z],
-                        inputs=[x],
-                        grad_outputs=[grad_y, grad_z],
-                        create_graph=create_graph,
-                        retain_graph=True)
+                    dx_actual, = self.grad(outputs=[y, z],
+                                           inputs=[x],
+                                           grad_outputs=[grad_y, grad_z],
+                                           create_graph=create_graph,
+                                           retain_graph=True)
 
                     grad_y_np = ones_grad_y if grad_y is None else grad_y.numpy(
                     )
@@ -236,9 +240,10 @@ def func_example_with_gradient_accumulation_and_create_graph(self):
             loss.backward()
 
             x_grad_actual = x.gradient()
-            x_grad_expected = (2.0 / float(numel) * (
-                x_np + dx_expected *
-                (x_np > 0) * 2 / float(numel))).astype('float32')
+            x_grad_expected = (
+                2.0 / float(numel) *
+                (x_np + dx_expected *
+                 (x_np > 0) * 2 / float(numel))).astype('float32')
             self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
 
     def test_example_with_gradient_accumulation_and_create_graph(self):
@@ -261,8 +266,9 @@ def func_example_with_gradient_accumulation_and_no_grad_vars(self):
         w_mean = fluid.layers.reduce_mean(w)
         del y1, z, w
 
-        dx_actual, = self.grad(
-            [w_mean], [x], create_graph=True, no_grad_vars=[y2])
+        dx_actual, = self.grad([w_mean], [x],
+                               create_graph=True,
+                               no_grad_vars=[y2])
 
         self.assertFalse(y2.stop_gradient)
         self.assertFalse(dx_actual.stop_gradient)
@@ -278,9 +284,10 @@ def func_example_with_gradient_accumulation_and_no_grad_vars(self):
             loss.backward()
 
             x_grad_actual = x.gradient()
-            x_grad_expected = (2.0 / float(numel) * (
-                x_np + dx_expected *
-                (x_np > 0) * 4 / float(numel))).astype('float32')
+            x_grad_expected = (
+                2.0 / float(numel) *
+                (x_np + dx_expected *
+                 (x_np > 0) * 4 / float(numel))).astype('float32')
             self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
 
     def test_example_with_gradient_accumulation_and_no_grad_vars(self):
@@ -329,6 +336,7 @@ def test_example_with_gradient_accumulation_and_not_create_graph(self):
 
 
 class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
+
     def setUp(self):
         self.sort_sum_gradient = True
         self.shape = [5, 10]
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py
index 3fc06e3c8dff7..f13e20f4e91f6 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py
@@ -55,6 +55,7 @@ def send_parambase(queue, event, device, dtype):
 
 
 class leak_checker(object):
+
     def __init__(self, test_case):
         self.checked_pids = [os.getpid()]
         self.test_case = test_case
@@ -98,6 +99,7 @@ def _has_shm_files(self):
 
 
 class TestMultiprocessingBase(unittest.TestCase):
+
     def get_tensor(self, device="cpu"):
         self.device = device.lower()
         place = None
@@ -123,6 +125,7 @@ def _test_sharing(self,
                       dtype="float32",
                       repeat=1,
                       param=False):
+
         def test_fill():
             if param:
                 x = self.get_parameter()
@@ -178,6 +181,7 @@ def test_receive():
 
 
 class TestMultiprocessingCpu(TestMultiprocessingBase):
+
     def func_test_pass_tensor(self):
         if in_dygraph_mode():
             return
@@ -213,6 +217,7 @@ def test_pass_empty(self):
 
 
 class TestMultiprocessingGpu(TestMultiprocessingBase):
+
     @unittest.skipIf(not paddle.fluid.core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     def func_test_pass_tensor(self):
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
index 8945d35c131fd..3cf35550c5819 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
@@ -41,12 +41,14 @@
 
 
 def random_batch_reader():
+
     def _get_random_inputs_and_labels():
         np.random.seed(SEED)
         image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype('float32')
         label = np.random.randint(0, CLASS_NUM - 1, (
             BATCH_SIZE,
-            1, )).astype('int64')
+            1,
+        )).astype('int64')
         return image, label
 
     def __reader__():
@@ -60,6 +62,7 @@ def __reader__():
 
 
 class LinearNet(nn.Layer):
+
     def __init__(self):
         super(LinearNet, self).__init__()
         self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
@@ -69,6 +72,7 @@ def forward(self, x):
 
 
 class LayerWithLargeParameters(paddle.nn.Layer):
+
     def __init__(self):
         super(LayerWithLargeParameters, self).__init__()
         self._l = paddle.nn.Linear(10, LARGE_PARAM)
@@ -89,6 +93,7 @@ def train(layer, loader, loss_fn, opt):
 
 
 class TestSaveLoadLargeParameters(unittest.TestCase):
+
     def setUp(self):
         pass
 
@@ -111,6 +116,7 @@ def test_large_parameters_paddle_save(self):
 
 
 class TestSaveLoadPickle(unittest.TestCase):
+
     def test_pickle_protocol(self):
         # enable dygraph mode
         paddle.disable_static()
@@ -130,7 +136,9 @@ def test_pickle_protocol(self):
         with self.assertRaises(ValueError):
             paddle.save(save_dict, path, 5)
 
-        protocols = [2, ]
+        protocols = [
+            2,
+        ]
         if sys.version_info.major >= 3 and sys.version_info.minor >= 4:
             protocols += [3, 4]
         for protocol in protocols:
@@ -143,6 +151,7 @@ def test_pickle_protocol(self):
 
 
 class TestSaveLoadAny(unittest.TestCase):
+
     def set_zero(self, prog, place, scope=None):
         if scope is None:
             scope = fluid.global_scope()
@@ -184,8 +193,9 @@ def replace_static_load(self, program, model_path):
     def test_replace_static_save_load(self):
         paddle.enable_static()
         with new_program_scope():
-            x = paddle.static.data(
-                name="static_x", shape=[None, IMAGE_SIZE], dtype='float32')
+            x = paddle.static.data(name="static_x",
+                                   shape=[None, IMAGE_SIZE],
+                                   dtype='float32')
             z = paddle.static.nn.fc(x, 10)
             z = paddle.static.nn.fc(z, 10, bias_attr=False)
             loss = fluid.layers.reduce_mean(z)
@@ -200,8 +210,8 @@ def test_replace_static_save_load(self):
             base_map = {}
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_map[var.name] = t
             path = os.path.join("test_replace_static_save_load", "model")
             # paddle.save, legacy paddle.fluid.load
@@ -210,18 +220,18 @@ def test_replace_static_save_load(self):
             paddle.fluid.io.load(prog, path)
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, np.array(base_t)))
-            # legacy paddle.fluid.save, paddle.load 
+            # legacy paddle.fluid.save, paddle.load
             paddle.fluid.io.save(prog, path)
             self.set_zero(prog, place)
             self.replace_static_load(prog, path)
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
             # test for return tensor
@@ -247,11 +257,11 @@ def test_replace_static_save_load(self):
             self.set_zero(prog, place)
             for var in prog.list_vars():
                 if var.persistable:
-                    tensor = paddle.load(
-                        os.path.join(path_vars, var.name), return_numpy=False)
+                    tensor = paddle.load(os.path.join(path_vars, var.name),
+                                         return_numpy=False)
                     var.set_value(tensor)
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -259,6 +269,7 @@ def test_paddle_save_load_v2(self):
         paddle.disable_static()
 
         class StepDecay(LRScheduler):
+
             def __init__(self,
                          learning_rate,
                          step_size,
@@ -276,8 +287,8 @@ def get_lr(self):
 
         layer = LinearNet()
         inps = paddle.randn([2, IMAGE_SIZE])
-        adam = opt.Adam(
-            learning_rate=StepDecay(0.1, 1), parameters=layer.parameters())
+        adam = opt.Adam(learning_rate=StepDecay(0.1, 1),
+                        parameters=layer.parameters())
         y = layer(inps)
         y.mean().backward()
         adam.step()
@@ -316,8 +327,9 @@ def test_single_pickle_var_dygraph(self):
         t_dygraph = paddle.load(path)
         np_dygraph = paddle.load(path, return_numpy=True)
         self.assertTrue(
-            isinstance(t_dygraph, (paddle.fluid.core.VarBase,
-                                   paddle.fluid.core.eager.Tensor)))
+            isinstance(
+                t_dygraph,
+                (paddle.fluid.core.VarBase, paddle.fluid.core.eager.Tensor)))
         self.assertTrue(np.array_equal(tensor.numpy(), np_dygraph))
         self.assertTrue(np.array_equal(tensor.numpy(), t_dygraph.numpy()))
         paddle.enable_static()
@@ -332,8 +344,9 @@ def test_single_pickle_var_static(self):
         paddle.enable_static()
         with new_program_scope():
             # create network
-            x = paddle.static.data(
-                name="x", shape=[None, IMAGE_SIZE], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[None, IMAGE_SIZE],
+                                   dtype='float32')
             z = paddle.static.nn.fc(x, 128)
             loss = fluid.layers.reduce_mean(z)
             place = fluid.CPUPlace(
@@ -379,8 +392,9 @@ def test_dygraph_save_static_load(self):
         paddle.enable_static()
         with new_program_scope():
             layer = LinearNet()
-            data = paddle.static.data(
-                name='x_static_save', shape=(None, IMAGE_SIZE), dtype='float32')
+            data = paddle.static.data(name='x_static_save',
+                                      shape=(None, IMAGE_SIZE),
+                                      dtype='float32')
             y_static = layer(data)
             program = paddle.static.default_main_program()
             place = fluid.CPUPlace(
@@ -401,17 +415,16 @@ def test_save_load_complex_object_dygraph_save(self):
         layer = paddle.nn.Linear(3, 4)
         state_dict = layer.state_dict()
         obj1 = [
-            paddle.randn(
-                [3, 4], dtype='float32'), np.random.randn(5, 6),
-            ('fake_weight', np.ones(
-                [7, 8], dtype='float32'))
+            paddle.randn([3, 4], dtype='float32'),
+            np.random.randn(5, 6),
+            ('fake_weight', np.ones([7, 8], dtype='float32'))
         ]
         obj2 = {'k1': obj1, 'k2': state_dict, 'epoch': 123}
-        obj3 = (paddle.randn(
-            [5, 4], dtype='float32'), np.random.randn(3, 4).astype("float32"), {
-                "state_dict": state_dict,
-                "opt": state_dict
-            })
+        obj3 = (paddle.randn([5, 4], dtype='float32'),
+                np.random.randn(3, 4).astype("float32"), {
+                    "state_dict": state_dict,
+                    "opt": state_dict
+                })
         obj4 = (np.random.randn(5, 6), (123, ))
 
         path1 = "test_save_load_any_complex_object_dygraph/obj1"
@@ -428,8 +441,8 @@ def test_save_load_complex_object_dygraph_save(self):
         load_tensor3 = paddle.load(path3, return_numpy=False)
         load_tensor4 = paddle.load(path4, return_numpy=False)
 
-        self.assertTrue(
-            np.array_equal(load_tensor1[0].numpy(), obj1[0].numpy()))
+        self.assertTrue(np.array_equal(load_tensor1[0].numpy(),
+                                       obj1[0].numpy()))
         self.assertTrue(np.array_equal(load_tensor1[1], obj1[1]))
         self.assertTrue(np.array_equal(load_tensor1[2].numpy(), obj1[2][1]))
         for i in range(len(load_tensor1)):
@@ -440,8 +453,8 @@ def test_save_load_complex_object_dygraph_save(self):
                 np.array_equal(v.numpy(), load_tensor2['k2'][k].numpy()))
         self.assertTrue(load_tensor2['epoch'] == 123)
 
-        self.assertTrue(
-            np.array_equal(load_tensor3[0].numpy(), obj3[0].numpy()))
+        self.assertTrue(np.array_equal(load_tensor3[0].numpy(),
+                                       obj3[0].numpy()))
         self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1]))
 
         for k, v in state_dict.items():
@@ -502,8 +515,8 @@ def test_save_load_complex_object_dygraph_save(self):
                 np.array_equal(v.numpy(), np.array(load_tensor2['k2'][k])))
         self.assertTrue(load_tensor2['epoch'] == 123)
 
-        self.assertTrue(
-            isinstance(load_tensor3[0], paddle.fluid.core.LoDTensor))
+        self.assertTrue(isinstance(load_tensor3[0],
+                                   paddle.fluid.core.LoDTensor))
         self.assertTrue(
             np.array_equal(np.array(load_tensor3[0]), obj3[0].numpy()))
         self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1]))
@@ -513,8 +526,8 @@ def test_save_load_complex_object_dygraph_save(self):
                 isinstance(load_tensor3[2]["state_dict"][k],
                            paddle.fluid.core.LoDTensor))
             self.assertTrue(
-                np.array_equal(
-                    np.array(load_tensor3[2]["state_dict"][k]), v.numpy()))
+                np.array_equal(np.array(load_tensor3[2]["state_dict"][k]),
+                               v.numpy()))
 
         for k, v in state_dict.items():
             self.assertTrue(
@@ -557,8 +570,9 @@ def test_save_load_complex_object_static_save(self):
         paddle.enable_static()
         with new_program_scope():
             # create network
-            x = paddle.static.data(
-                name="x", shape=[None, IMAGE_SIZE], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[None, IMAGE_SIZE],
+                                   dtype='float32')
             z = paddle.static.nn.fc(x, 10, bias_attr=False)
             z = paddle.static.nn.fc(z, 128, bias_attr=False)
             loss = fluid.layers.reduce_mean(z)
@@ -572,16 +586,15 @@ def test_save_load_complex_object_static_save(self):
             state_dict = prog.state_dict()
             keys = list(state_dict.keys())
             obj1 = [
-                state_dict[keys[0]], np.random.randn(5, 6),
-                ('fake_weight', np.ones(
-                    [7, 8], dtype='float32'))
+                state_dict[keys[0]],
+                np.random.randn(5, 6),
+                ('fake_weight', np.ones([7, 8], dtype='float32'))
             ]
             obj2 = {'k1': obj1, 'k2': state_dict, 'epoch': 123}
-            obj3 = (state_dict[keys[0]], np.ndarray(
-                [3, 4], dtype="float32"), {
-                    "state_dict": state_dict,
-                    "opt": state_dict
-                })
+            obj3 = (state_dict[keys[0]], np.ndarray([3, 4], dtype="float32"), {
+                "state_dict": state_dict,
+                "opt": state_dict
+            })
             obj4 = (np.ndarray([3, 4], dtype="float32"), )
 
             path1 = "test_save_load_any_complex_object_static/obj1"
@@ -608,8 +621,8 @@ def test_save_load_complex_object_static_save(self):
                     type(load_tensor1[i]) == type(load_tensor2['k1'][i]))
             for k, v in state_dict.items():
                 self.assertTrue(
-                    np.array_equal(
-                        np.array(v), np.array(load_tensor2['k2'][k])))
+                    np.array_equal(np.array(v),
+                                   np.array(load_tensor2['k2'][k])))
             self.assertTrue(load_tensor2['epoch'] == 123)
 
             self.assertTrue(isinstance(load_tensor3[0], fluid.core.LoDTensor))
@@ -622,16 +635,15 @@ def test_save_load_complex_object_static_save(self):
                     isinstance(load_tensor3[2]["state_dict"][k],
                                fluid.core.LoDTensor))
                 self.assertTrue(
-                    np.array_equal(
-                        np.array(load_tensor3[2]["state_dict"][k]), np.array(
-                            v)))
+                    np.array_equal(np.array(load_tensor3[2]["state_dict"][k]),
+                                   np.array(v)))
 
             for k, v in state_dict.items():
                 self.assertTrue(
                     isinstance(load_tensor3[2]["opt"][k], fluid.core.LoDTensor))
                 self.assertTrue(
-                    np.array_equal(
-                        np.array(load_tensor3[2]["opt"][k]), np.array(v)))
+                    np.array_equal(np.array(load_tensor3[2]["opt"][k]),
+                                   np.array(v)))
 
             self.assertTrue(isinstance(load_tensor4[0], fluid.core.LoDTensor))
             self.assertTrue(np.array_equal(np.array(load_tensor4[0]), obj4[0]))
@@ -657,8 +669,8 @@ def test_save_load_complex_object_static_save(self):
 
             for k, v in state_dict.items():
                 self.assertTrue(
-                    np.array_equal(load_array3[2]["state_dict"][k], np.array(
-                        v)))
+                    np.array_equal(load_array3[2]["state_dict"][k],
+                                   np.array(v)))
 
             for k, v in state_dict.items():
                 self.assertTrue(
@@ -683,38 +695,38 @@ def test_save_load_complex_object_static_save(self):
                     type(load_tensor1[i]) == type(load_tensor2['k1'][i]))
             for k, v in state_dict.items():
                 self.assertTrue(
-                    np.array_equal(
-                        np.array(v), np.array(load_tensor2['k2'][k])))
+                    np.array_equal(np.array(v),
+                                   np.array(load_tensor2['k2'][k])))
             self.assertTrue(load_tensor2['epoch'] == 123)
 
             self.assertTrue(
-                isinstance(load_tensor3[0], (fluid.core.VarBase,
-                                             fluid.core.eager.Tensor)))
+                isinstance(load_tensor3[0],
+                           (fluid.core.VarBase, fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor3[0].numpy(), obj3[0]))
             self.assertTrue(
-                isinstance(load_tensor3[1], (fluid.core.VarBase,
-                                             fluid.core.eager.Tensor)))
+                isinstance(load_tensor3[1],
+                           (fluid.core.VarBase, fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor3[1].numpy(), obj3[1]))
 
             for k, v in state_dict.items():
                 self.assertTrue(
-                    isinstance(load_tensor3[2]["state_dict"][k], (
-                        fluid.core.VarBase, fluid.core.eager.Tensor)))
+                    isinstance(load_tensor3[2]["state_dict"][k],
+                               (fluid.core.VarBase, fluid.core.eager.Tensor)))
                 self.assertTrue(
                     np.array_equal(load_tensor3[2]["state_dict"][k].numpy(),
                                    np.array(v)))
 
             for k, v in state_dict.items():
                 self.assertTrue(
-                    isinstance(load_tensor3[2]["opt"][k], (
-                        fluid.core.VarBase, fluid.core.eager.Tensor)))
+                    isinstance(load_tensor3[2]["opt"][k],
+                               (fluid.core.VarBase, fluid.core.eager.Tensor)))
                 self.assertTrue(
                     np.array_equal(load_tensor3[2]["opt"][k].numpy(),
                                    np.array(v)))
 
             self.assertTrue(
-                isinstance(load_tensor4[0], (fluid.core.VarBase,
-                                             fluid.core.eager.Tensor)))
+                isinstance(load_tensor4[0],
+                           (fluid.core.VarBase, fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0]))
 
             load_array1 = paddle.load(path1, return_numpy=True)
@@ -738,8 +750,8 @@ def test_save_load_complex_object_static_save(self):
 
             for k, v in state_dict.items():
                 self.assertTrue(
-                    np.array_equal(load_array3[2]["state_dict"][k], np.array(
-                        v)))
+                    np.array_equal(load_array3[2]["state_dict"][k],
+                                   np.array(v)))
 
             for k, v in state_dict.items():
                 self.assertTrue(
@@ -764,6 +776,7 @@ def test_varbase_binary_var(self):
 
 
 class TestSaveLoadToMemory(unittest.TestCase):
+
     def test_dygraph_save_to_memory(self):
         paddle.disable_static()
         linear = LinearNet()
@@ -792,8 +805,9 @@ def test_static_save_to_memory(self):
         paddle.enable_static()
         with new_program_scope():
             # create network
-            x = paddle.static.data(
-                name="x", shape=[None, IMAGE_SIZE], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[None, IMAGE_SIZE],
+                                   dtype='float32')
             z = paddle.static.nn.fc(x, 10, bias_attr=False)
             z = paddle.static.nn.fc(z, 128, bias_attr=False)
             loss = fluid.layers.reduce_mean(z)
@@ -829,6 +843,7 @@ def test_static_save_to_memory(self):
 
 
 class TestSaveLoad(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         paddle.disable_static()
@@ -903,12 +918,14 @@ def test_save_load(self):
 
 
 class TestSaveLoadProgram(unittest.TestCase):
+
     def test_save_load_program(self):
         paddle.enable_static()
         with new_program_scope():
             layer = LinearNet()
-            data = paddle.static.data(
-                name='x_static_save', shape=(None, IMAGE_SIZE), dtype='float32')
+            data = paddle.static.data(name='x_static_save',
+                                      shape=(None, IMAGE_SIZE),
+                                      dtype='float32')
             y_static = layer(data)
             main_program = paddle.static.default_main_program()
             startup_program = paddle.static.default_startup_program()
@@ -927,6 +944,7 @@ def test_save_load_program(self):
 
 
 class TestSaveLoadLayer(unittest.TestCase):
+
     def test_save_load_layer(self):
         paddle.disable_static()
         inps = paddle.randn([1, IMAGE_SIZE], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py
index 0b9e038f7cd95..bba65e469abe1 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py
@@ -34,6 +34,7 @@
 
 
 class TestSaveLoadBinaryFormat(unittest.TestCase):
+
     def setUp(self):
         # enable static graph mode
         paddle.enable_static()
@@ -50,17 +51,18 @@ def set_zero(self, prog, place, scope=None):
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
     def replace_save_vars(self, program, dirname):
+
         def predicate(var):
             return var.persistable
 
         vars = filter(predicate, program.list_vars())
         for var in vars:
-            paddle.save(
-                var.get_value(),
-                os.path.join(dirname, var.name),
-                use_binary_format=True)
+            paddle.save(var.get_value(),
+                        os.path.join(dirname, var.name),
+                        use_binary_format=True)
 
     def replace_load_vars(self, program, dirname):
+
         def predicate(var):
             return var.persistable
 
@@ -74,8 +76,9 @@ def test_replace_save_load_vars(self):
         paddle.enable_static()
         with new_program_scope():
             # create network
-            x = paddle.static.data(
-                name="x", shape=[None, IMAGE_SIZE], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[None, IMAGE_SIZE],
+                                   dtype='float32')
             z = paddle.static.nn.fc(x, 10, bias_attr=False)
             z = paddle.static.nn.fc(z, 128, bias_attr=False)
             loss = fluid.layers.reduce_mean(z)
@@ -88,8 +91,8 @@ def test_replace_save_load_vars(self):
             base_map = {}
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -100,26 +103,30 @@ def test_replace_save_load_vars(self):
             self.set_zero(prog, place)
             var_list = list(
                 filter(lambda var: var.persistable, prog.list_vars()))
-            fluid.io.load_vars(
-                exe, path_vars1, main_program=prog, vars=var_list)
+            fluid.io.load_vars(exe,
+                               path_vars1,
+                               main_program=prog,
+                               vars=var_list)
 
             for var in prog.list_vars():
                 if var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
 
                     self.assertTrue(np.array_equal(new_t, base_t))
             # test for io.save_vars/replace_load_vars
             path_vars2 = 'test_replace_save_load_vars_binary2/model/'
-            fluid.io.save_vars(
-                exe, path_vars2, main_program=prog, vars=var_list)
+            fluid.io.save_vars(exe,
+                               path_vars2,
+                               main_program=prog,
+                               vars=var_list)
             self.set_zero(prog, place)
             self.replace_load_vars(prog, path_vars2)
             for var in prog.list_vars():
                 if var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
 
                     self.assertTrue(np.array_equal(new_t, base_t))
@@ -132,7 +139,8 @@ def test_save_load_lod_tensor(self):
             y = fluid.layers.fc(
                 x,
                 OUTPUT_NUM,
-                name='fc_vars', )
+                name='fc_vars',
+            )
             prog = fluid.default_main_program()
             place = fluid.CPUPlace(
             ) if not paddle.fluid.core.is_compiled_with_cuda(
@@ -146,8 +154,9 @@ def test_save_load_lod_tensor(self):
                 if var.persistable and list(
                         var.shape) == [IMAGE_SIZE, OUTPUT_NUM]:
                     tensor = var.get_value()
-                    paddle.save(
-                        tensor, dirname + 'fc_vars.w_0', use_binary_format=True)
+                    paddle.save(tensor,
+                                dirname + 'fc_vars.w_0',
+                                use_binary_format=True)
                     break
 
             origin = np.array(var.get_value())
@@ -221,8 +230,8 @@ def test_save_load_selected_rows(self):
         self.assertTrue(isinstance(load_sr, fluid.core.SelectedRows))
         self.assertTrue(list(load_sr.rows()) == rows)
         self.assertTrue(load_sr.height() == height)
-        self.assertTrue(
-            np.array_equal(np.array(load_sr.get_tensor()), np_array))
+        self.assertTrue(np.array_equal(np.array(load_sr.get_tensor()),
+                                       np_array))
 
         with self.assertRaises(RuntimeError):
             fluid.core.save_selected_rows(
diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
index c91616b06ee47..651d9b5ea6860 100644
--- a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
+++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
@@ -28,19 +28,22 @@ def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
     prog = paddle.static.Program()
     startup_prog = paddle.static.Program()
 
-    place = fluid.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda(
-    ) else fluid.CPUPlace()
+    place = fluid.CUDAPlace(
+        0) if paddle.fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
 
     with paddle.static.program_guard(prog, startup_prog):
         x = paddle.fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
         y = paddle.fluid.data(name='y', shape=y_np.shape, dtype=x_np.dtype)
-        dist = paddle.nn.layer.distance.PairwiseDistance(
-            p=p, epsilon=epsilon, keepdim=keepdim)
+        dist = paddle.nn.layer.distance.PairwiseDistance(p=p,
+                                                         epsilon=epsilon,
+                                                         keepdim=keepdim)
         distance = dist(x, y)
         exe = paddle.static.Executor(place)
         static_ret = exe.run(prog,
-                             feed={'x': x_np,
-                                   'y': y_np},
+                             feed={
+                                 'x': x_np,
+                                 'y': y_np
+                             },
                              fetch_list=[distance])
         static_ret = static_ret[0]
     return static_ret
@@ -50,8 +53,9 @@ def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
     paddle.disable_static()
     x = paddle.to_tensor(x_np)
     y = paddle.to_tensor(y_np)
-    dist = paddle.nn.layer.distance.PairwiseDistance(
-        p=p, epsilon=epsilon, keepdim=keepdim)
+    dist = paddle.nn.layer.distance.PairwiseDistance(p=p,
+                                                     epsilon=epsilon,
+                                                     keepdim=keepdim)
     distance = dist(x, y)
     dygraph_ret = distance.numpy()
     paddle.enable_static()
@@ -59,6 +63,7 @@ def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
 
 
 class TestPairwiseDistance(unittest.TestCase):
+
     def test_pairwise_distance(self):
         all_shape = [[100, 100], [4, 5, 6, 7]]
         dtypes = ['float32', 'float64']
@@ -71,8 +76,9 @@ def test_pairwise_distance(self):
 
                     static_ret = test_static(x_np, y_np, keepdim=keepdim)
                     dygraph_ret = test_dygraph(x_np, y_np, keepdim=keepdim)
-                    excepted_value = pairwise_distance(
-                        x_np, y_np, keepdim=keepdim)
+                    excepted_value = pairwise_distance(x_np,
+                                                       y_np,
+                                                       keepdim=keepdim)
 
                     self.assertTrue(np.allclose(static_ret, dygraph_ret))
                     self.assertTrue(np.allclose(static_ret, excepted_value))
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py b/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py
index e2a526110f18a..5dfe41d7bfe3e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py
@@ -23,6 +23,7 @@
 
 
 class TestParallelClassCenterSample(TestMultipleGpus):
+
     def test_parallel_class_center_sample(self):
         self.run_mnist_2gpu('parallel_class_center_sample.py')
         self.run_mnist_2gpu('parallel_class_center_sample.py', eager_mode=False)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py
index 3c45b2c795037..41ab6ebf29a48 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py
@@ -26,6 +26,7 @@
 
 
 class TestDygraphControlFlowSame(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -34,14 +35,14 @@ def _setup_config(self):
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_control_flow_same.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_control_flow_same.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestFleetDygraphControlFlowSame(TestDygraphControlFlowSame):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -51,6 +52,7 @@ def _setup_config(self):
 
 
 class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -60,6 +62,7 @@ def _setup_config(self):
 
 
 class TestDygraphControlFlowDiff(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -68,14 +71,14 @@ def _setup_config(self):
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_control_flow_different.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_control_flow_different.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestFleetDygraphControlFlowDiff(TestDygraphControlFlowDiff):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -85,6 +88,7 @@ def _setup_config(self):
 
 
 class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
index 8145e880a650e..930bf5345fcae 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
@@ -150,6 +150,7 @@ def start_local_trainers(cluster,
 
 
 class TestMultipleGpus(unittest.TestCase):
+
     def run_mnist_2gpu(self, target_file_name, eager_mode=True):
         if not fluid.core.is_compiled_with_cuda(
         ) or fluid.core.get_cuda_device_count() == 0:
@@ -161,12 +162,11 @@ def run_mnist_2gpu(self, target_file_name, eager_mode=True):
 
         cluster, pod = get_cluster_from_args(selected_gpus)
 
-        procs = start_local_trainers(
-            cluster,
-            pod,
-            eager_mode=eager_mode,
-            training_script=target_file_name,
-            training_script_args=[])
+        procs = start_local_trainers(cluster,
+                                     pod,
+                                     eager_mode=eager_mode,
+                                     training_script=target_file_name,
+                                     training_script_args=[])
 
         while True:
             alive = watch_local_trainers(procs, cluster.trainers_endpoints())
@@ -178,15 +178,15 @@ def run_mnist_2gpu(self, target_file_name, eager_mode=True):
 
 
 class TestMultipleWithGloo(unittest.TestCase):
+
     def run_mnist_2cpu(self, target_file_name):
 
         cluster, pod = get_cluster_from_args(
             [0, 1])  #tmp use. for getting trainer_nranks()
 
-        procs = start_local_trainers_cpu(
-            cluster.trainers_endpoints(),
-            training_script=target_file_name,
-            training_script_args=[])
+        procs = start_local_trainers_cpu(cluster.trainers_endpoints(),
+                                         training_script=target_file_name,
+                                         training_script_args=[])
 
         while True:
             alive = watch_local_trainers(procs, cluster.trainers_nranks())
@@ -198,18 +198,21 @@ def run_mnist_2cpu(self, target_file_name):
 
 
 class TestDataParallelGradientCheck(TestMultipleGpus):
+
     def test_multiple_gpus_dynamic(self):
         self.run_mnist_2gpu('parallel_dygraph_gradient_check.py')
 
 
 class TestDataParallelWithPyLayer(TestMultipleGpus):
+
     def test_parallel_dygraph_dataparallel_with_pylayer(self):
         self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py')
-        self.run_mnist_2gpu(
-            'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False)
+        self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py',
+                            eager_mode=False)
 
 
 class TestGradientCheckInEagerMode(TestMultipleGpus):
+
     def test_multiple_gpus_dynamic(self):
         self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py
index ce67a2ce4d209..725d5249f594b 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py
@@ -104,6 +104,7 @@ def start_local_trainers(cluster,
 
 
 class TestMultipleGpus(unittest.TestCase):
+
     def run_mnist_2gpu(self, target_file_name):
         #if not fluid.core.is_compiled_with_cuda(
         #) or fluid.core.get_cuda_device_count() == 0:
@@ -114,11 +115,10 @@ def run_mnist_2gpu(self, target_file_name):
         pod = None
 
         cluster, pod = get_cluster_from_args(selected_gpus)
-        procs = start_local_trainers(
-            cluster,
-            pod,
-            training_script=target_file_name,
-            training_script_args=[])
+        procs = start_local_trainers(cluster,
+                                     pod,
+                                     training_script=target_file_name,
+                                     training_script_args=[])
 
         while True:
             alive = watch_local_trainers(procs, cluster.trainers_nranks())
@@ -130,11 +130,13 @@ def run_mnist_2gpu(self, target_file_name):
 
 
 class TestDataParallelGradientCheck(TestMultipleGpus):
+
     def test_multiple_gpus_dynamic(self):
         self.run_mnist_2gpu('parallel_dygraph_gradient_check.py')
 
 
 class TestDataParallelGradientCheckInEagerMode(TestMultipleGpus):
+
     def test_multiple_gpus_dynamic(self):
         self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
index 0c55e135721ce..e25a74863e483 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
@@ -27,6 +27,7 @@
 
 
 class TestParallelDygraphMnist(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -35,17 +36,17 @@ def _setup_config(self):
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_mnist.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_mnist.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 #TODO(liuyuhui): Multi-Card Baidu Kunlun XPU training exist accuracy problems
-#it is difficult to find out immediately where the problem is, 
-#and we will work with frameworkers' help to fix it. 
+#it is difficult to find out immediately where the problem is,
+#and we will work with frameworkers' help to fix it.
 class TestParallelDygraphMnistXPU(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._bkcl_mode = True
@@ -54,20 +55,21 @@ def _setup_config(self):
 
     def test_mnist_xpu(self):
         if fluid.core.is_compiled_with_xpu():
-            self.check_with_place(
-                "parallel_dygraph_mnist.py",
-                delta=1e-4,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_mnist.py",
+                                  delta=1e-4,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphMnistSpawn(TestDistSpawnRunner):
+
     def test_mnist_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
             self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5)
 
 
 class TestParallelDygraphMnistAccGrad(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -78,14 +80,14 @@ def _setup_config(self):
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_mnist.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_mnist.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestFleetDygraphMnistXPU(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._bkcl_mode = True
@@ -95,11 +97,10 @@ def _setup_config(self):
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_xpu():
-            self.check_with_place(
-                "parallel_dygraph_mnist.py",
-                delta=1e-4,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_mnist.py",
+                                  delta=1e-4,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py
index da8df19a1e649..8e26452389bef 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py
@@ -22,6 +22,7 @@
 
 
 class TestModelParallelLayer(TestMultipleGpus):
+
     def test_hybrid_parallel_mp_layer(self):
         self.run_mnist_2gpu('hybrid_parallel_mp_layers.py')
         self.run_mnist_2gpu('hybrid_parallel_mp_layers.py', eager_mode=False)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py
index 2e364e5d4d99f..a3c4a90746f3d 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py
@@ -29,6 +29,7 @@
 
 
 class TestParallelDygraphNoSync(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -37,14 +38,14 @@ def _setup_config(self):
 
     def test_no_sync(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_no_sync.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_no_sync.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphNoSyncUnusedParam(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -53,14 +54,14 @@ def _setup_config(self):
 
     def test_no_sync_ununsed_param(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_no_sync_unused_params.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_no_sync_unused_params.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphNoSyncControlFlow(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -69,37 +70,39 @@ def _setup_config(self):
 
     def test_no_sync_control_flow(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_no_sync_control_flow.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_no_sync_control_flow.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphNoSyncSpawn(TestDistSpawnRunner):
+
     def test_no_sync_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
             self.check_dist_result_with_spawn(test_class=TestNoSync, delta=1e-5)
 
 
 class TestParallelDygraphNoSyncUnusedParamSpawn(TestDistSpawnRunner):
+
     def _args_config(self, args):
         args.find_unused_parameters = True
 
     def test_no_sync_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
-            self.check_dist_result_with_spawn(
-                test_class=TestNoSyncUnusedParam, delta=1e-5)
+            self.check_dist_result_with_spawn(test_class=TestNoSyncUnusedParam,
+                                              delta=1e-5)
 
 
 class TestParallelDygraphNoSyncControlFlowSpawn(TestDistSpawnRunner):
+
     def _args_config(self, args):
         args.find_unused_parameters = True
 
     def test_no_sync_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
-            self.check_dist_result_with_spawn(
-                test_class=TestNoSyncControlFlow, delta=1e-5)
+            self.check_dist_result_with_spawn(test_class=TestNoSyncControlFlow,
+                                              delta=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py
index d5eebf01adb7c..fad9e902cc91e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py
@@ -21,6 +21,7 @@
 
 
 class TestDataParallelLayer(TestMultipleGpus):
+
     def test_parallel_dygraph_dataparallel_no_sync(self):
         self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py
index 7f7db930d4c2d..5357a6a132a34 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py
@@ -22,6 +22,7 @@
 
 
 class TestHybridPipeParallel(TestMultipleGpus):
+
     def test_hybrid_parallel_pp_layer(self):
         self.run_mnist_2gpu('hybrid_parallel_pp_layer.py')
         self.run_mnist_2gpu('hybrid_parallel_pp_layer.py', eager_mode=False)
@@ -32,8 +33,8 @@ def test_hybrid_parallel_pp_tuple_inputs(self):
 
     def test_hybrid_parallel_shared_weight(self):
         self.run_mnist_2gpu('hybrid_parallel_shared_weight.py')
-        self.run_mnist_2gpu(
-            'hybrid_parallel_shared_weight.py', eager_mode=False)
+        self.run_mnist_2gpu('hybrid_parallel_shared_weight.py',
+                            eager_mode=False)
 
     def test_pipeline_parallel_amp(self):
         self.run_mnist_2gpu('hybrid_parallel_pp_amp.py')
@@ -45,8 +46,8 @@ def test_pipeline_parallel_fp16(self):
 
     def test_hybrid_parallel_transformer(self):
         self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py')
-        self.run_mnist_2gpu(
-            'hybrid_parallel_pp_transformer.py', eager_mode=False)
+        self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py',
+                            eager_mode=False)
 
     def test_hybrid_parallel_save_load(self):
         self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py')
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
index cf89dc484c488..9127f3bfdb88a 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py
@@ -27,6 +27,7 @@
 
 
 class TestParallelDygraphSeResNeXt(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -34,18 +35,18 @@ def _setup_config(self):
 
     def test_se_resnext(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_se_resnext.py",
-                delta=0.01,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_se_resnext.py",
+                                  delta=0.01,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphSeResNeXtSpawn(TestDistSpawnRunner):
+
     def test_se_resnext_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
-            self.check_dist_result_with_spawn(
-                test_class=TestSeResNeXt, delta=0.01)
+            self.check_dist_result_with_spawn(test_class=TestSeResNeXt,
+                                              delta=0.01)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py
index 503bd9d0f9797..920ef969317b9 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py
@@ -26,8 +26,8 @@ class TestHybridParallel(TestMultipleGpus):
     # check sharding logic as well as the accuracy with single mode
     def test_hybrid_parallel_sharding_logic(self):
         self.run_mnist_2gpu('hybrid_parallel_sharding_model.py')
-        self.run_mnist_2gpu(
-            'hybrid_parallel_sharding_model.py', eager_mode=False)
+        self.run_mnist_2gpu('hybrid_parallel_sharding_model.py',
+                            eager_mode=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py
index 43907da609803..ae65b545a9534 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py
@@ -28,6 +28,7 @@
 
 
 class TestParallelDygraphSparseEmdedding(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -35,14 +36,14 @@ def _setup_config(self):
 
     def test_sparse_embedding(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_sparse_embedding.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_sparse_embedding.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphSparseEmdeddingFP64(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -50,18 +51,18 @@ def _setup_config(self):
 
     def test_sparse_embedding_fp64(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_sparse_embedding_fp64.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_sparse_embedding_fp64.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphSparseEmdeddingSpawn(TestDistSpawnRunner):
+
     def test_sparse_embedding_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
-            self.check_dist_result_with_spawn(
-                test_class=TestSparseEmbedding, delta=1e-5)
+            self.check_dist_result_with_spawn(test_class=TestSparseEmbedding,
+                                              delta=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py
index 1c425a40a9b39..2abd9a1f85452 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py
@@ -28,6 +28,7 @@
 
 
 class TestParallelDygraphSparseEmdedding_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
@@ -35,11 +36,10 @@ def _setup_config(self):
         self._diff_batch = True
 
     def test_sparse_embedding(self):
-        self.check_with_place(
-            "parallel_dygraph_sparse_embedding.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("parallel_dygraph_sparse_embedding.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py
index 56fcf806c4717..5d42d54a28c34 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py
@@ -28,31 +28,31 @@
 
 
 class TestParallelDygraphSparseEmdedding_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
         self._dygraph = True
 
     def test_sparse_embedding(self):
-        self.check_with_place(
-            "parallel_dygraph_sparse_embedding.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("parallel_dygraph_sparse_embedding.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestParallelDygraphSparseEmdeddingFP64_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
         self._dygraph = True
 
     def test_sparse_embedding_fp64(self):
-        self.check_with_place(
-            "parallel_dygraph_sparse_embedding_fp64.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("parallel_dygraph_sparse_embedding_fp64.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py
index 9aca448f16121..7b1cd0efcdf27 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py
@@ -27,6 +27,7 @@
 
 
 class TestParallelDygraphSparseEmdeddingOverHeight(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -42,6 +43,7 @@ def test_sparse_embedding(self):
 
 
 class TestParallelDygraphSparseEmdeddingOverHeightSpawn(TestDistSpawnRunner):
+
     def test_sparse_embedding_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
             self.check_dist_result_with_spawn(
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py
index ba43e26e23a4e..d48eb401e7a55 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py
@@ -27,6 +27,7 @@
 
 
 class TestParallelDygraphSparseEmdeddingOverHeight_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
index 7cf1e9711b74b..d4b73ab7b4241 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
@@ -18,10 +18,12 @@
 import paddle.fluid as fluid
 
 import os
+
 flag_name = os.path.splitext(__file__)[0]
 
 
 class TestParallelDygraphMnist(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -29,11 +31,10 @@ def _setup_config(self):
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_sync_batch_norm.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_sync_batch_norm.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py
index 14a291627843e..971c545f0f5a6 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py
@@ -22,6 +22,7 @@
 
 
 class TestHybridParallel(TestMultipleGpus):
+
     def test_hybrid_parallel_mp_random(self):
         self.run_mnist_2gpu('hybrid_parallel_mp_random.py')
         self.run_mnist_2gpu('hybrid_parallel_mp_random.py', eager_mode=False)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py
index 71a8c7347e162..03c3235b50306 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py
@@ -27,6 +27,7 @@
 
 
 class TestParallelDygraphTransformer(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -34,14 +35,14 @@ def _setup_config(self):
 
     def test_transformer(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_transformer.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_transformer.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphTransformerAccGrad(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -51,11 +52,10 @@ def _setup_config(self):
 
     def test_transformer(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_transformer.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_transformer.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py
index d3619cc1b9a00..bfd9158e9ec83 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py
@@ -27,20 +27,21 @@
 
 
 class TestParallelDygraphTransformer_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
         self._dygraph = True
 
     def test_transformer(self):
-        self.check_with_place(
-            "parallel_dygraph_transformer.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("parallel_dygraph_transformer.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestParallelDygraphTransformerAccGrad_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
@@ -50,11 +51,10 @@ def _setup_config(self):
 
     def test_transformer(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_transformer.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_transformer.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
index 75fa6f7c71d0a..1f71514cc7372 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
@@ -27,6 +27,7 @@
 
 
 class TestParallelDygraphUnusedVar(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -34,14 +35,14 @@ def _setup_config(self):
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_unused_variables.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_unused_variables.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -50,6 +51,7 @@ def _setup_config(self):
 
 
 class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner):
+
     def test_mnist_with_spawn(self):
         if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
             self.check_dist_result_with_spawn(
@@ -57,6 +59,7 @@ def test_mnist_with_spawn(self):
 
 
 class TestParallelDygraphNoVar(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -64,14 +67,14 @@ def _setup_config(self):
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_none_var.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_none_var.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 class TestParallelDygraphSharedUnusedVariables(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
@@ -79,11 +82,10 @@ def _setup_config(self):
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_shared_unused_var.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("parallel_dygraph_shared_unused_var.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py
index 89373fcb6eebc..f605ae8fe28b4 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py
@@ -27,45 +27,45 @@
 
 
 class TestParallelDygraphUnusedVar_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
         self._dygraph = True
 
     def test_net(self):
-        self.check_with_place(
-            "parallel_dygraph_unused_variables.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("parallel_dygraph_unused_variables.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestParallelDygraphNoVar_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
         self._dygraph = True
 
     def test_net(self):
-        self.check_with_place(
-            "parallel_dygraph_none_var.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("parallel_dygraph_none_var.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 class TestParallelDygraphSharedUnusedVariables_GLOO(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = False
         self._gloo_mode = True
         self._dygraph = True
 
     def test_mnist(self):
-        self.check_with_place(
-            "parallel_dygraph_shared_unused_var.py",
-            delta=1e-5,
-            check_error_log=True,
-            log_name=flag_name)
+        self.check_with_place("parallel_dygraph_shared_unused_var.py",
+                              delta=1e-5,
+                              check_error_log=True,
+                              log_name=flag_name)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index 47d286fb6ab32..c81a38019956f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -39,27 +39,25 @@
 def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
             is_sparse, **ignored):
     # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        is_sparse=is_sparse,
-        size=[pred_dict_len, word_dim],
-        dtype='float32',
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        is_sparse=is_sparse,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32')
+    predicate_embedding = fluid.layers.embedding(input=predicate,
+                                                 is_sparse=is_sparse,
+                                                 size=[pred_dict_len, word_dim],
+                                                 dtype='float32',
+                                                 param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(input=mark,
+                                            is_sparse=is_sparse,
+                                            size=[mark_dict_len, mark_dim],
+                                            dtype='float32')
 
     word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
     emb_layers = [
-        fluid.layers.embedding(
-            size=[word_dict_len, word_dim],
-            is_sparse=is_sparse,
-            input=x,
-            param_attr=fluid.ParamAttr(
-                name=embedding_name, trainable=False)) for x in word_input
+        fluid.layers.embedding(size=[word_dict_len, word_dim],
+                               is_sparse=is_sparse,
+                               input=x,
+                               param_attr=fluid.ParamAttr(name=embedding_name,
+                                                          trainable=False))
+        for x in word_input
     ]
     # TODO(zcd): if the parameter is not trainable, the
     #  parameter's gradient should not generated.
@@ -76,12 +74,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
     hidden_0 = fluid.layers.sums(input=hidden_0_layers)
 
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=hidden_dim,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
+    lstm_0 = fluid.layers.dynamic_lstm(input=hidden_0,
+                                       size=hidden_dim,
+                                       candidate_activation='relu',
+                                       gate_activation='sigmoid',
+                                       cell_activation='sigmoid')
 
     # stack L-LSTM and R-LSTM with direct edges
     input_tmp = [hidden_0, lstm_0]
@@ -92,13 +89,12 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
             fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
         ])
 
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=hidden_dim,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
+        lstm = fluid.layers.dynamic_lstm(input=mix_hidden,
+                                         size=hidden_dim,
+                                         candidate_activation='relu',
+                                         gate_activation='sigmoid',
+                                         cell_activation='sigmoid',
+                                         is_reverse=((i % 2) == 1))
 
         input_tmp = [mix_hidden, lstm]
 
@@ -111,6 +107,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
 
 class TestCRFModel(unittest.TestCase):
+
     def check_network_convergence(self,
                                   is_sparse,
                                   build_strategy=None,
@@ -121,31 +118,48 @@ def check_network_convergence(self,
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             with fluid.program_guard(main, startup):
-                word = fluid.layers.data(
-                    name='word_data', shape=[1], dtype='int64', lod_level=1)
-                predicate = fluid.layers.data(
-                    name='verb_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_n2 = fluid.layers.data(
-                    name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_n1 = fluid.layers.data(
-                    name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_0 = fluid.layers.data(
-                    name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_p1 = fluid.layers.data(
-                    name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-                ctx_p2 = fluid.layers.data(
-                    name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-                mark = fluid.layers.data(
-                    name='mark_data', shape=[1], dtype='int64', lod_level=1)
+                word = fluid.layers.data(name='word_data',
+                                         shape=[1],
+                                         dtype='int64',
+                                         lod_level=1)
+                predicate = fluid.layers.data(name='verb_data',
+                                              shape=[1],
+                                              dtype='int64',
+                                              lod_level=1)
+                ctx_n2 = fluid.layers.data(name='ctx_n2_data',
+                                           shape=[1],
+                                           dtype='int64',
+                                           lod_level=1)
+                ctx_n1 = fluid.layers.data(name='ctx_n1_data',
+                                           shape=[1],
+                                           dtype='int64',
+                                           lod_level=1)
+                ctx_0 = fluid.layers.data(name='ctx_0_data',
+                                          shape=[1],
+                                          dtype='int64',
+                                          lod_level=1)
+                ctx_p1 = fluid.layers.data(name='ctx_p1_data',
+                                           shape=[1],
+                                           dtype='int64',
+                                           lod_level=1)
+                ctx_p2 = fluid.layers.data(name='ctx_p2_data',
+                                           shape=[1],
+                                           dtype='int64',
+                                           lod_level=1)
+                mark = fluid.layers.data(name='mark_data',
+                                         shape=[1],
+                                         dtype='int64',
+                                         lod_level=1)
 
                 feature_out = db_lstm(**locals())
-                target = fluid.layers.data(
-                    name='target', shape=[1], dtype='int64', lod_level=1)
+                target = fluid.layers.data(name='target',
+                                           shape=[1],
+                                           dtype='int64',
+                                           lod_level=1)
                 crf_cost = fluid.layers.linear_chain_crf(
                     input=feature_out,
                     label=target,
-                    param_attr=fluid.ParamAttr(
-                        name='crfw', learning_rate=1e-1))
+                    param_attr=fluid.ParamAttr(name='crfw', learning_rate=1e-1))
                 avg_cost = fluid.layers.mean(crf_cost)
 
                 sgd_optimizer = fluid.optimizer.SGD(
@@ -156,10 +170,9 @@ def check_network_convergence(self,
                         staircase=True))
                 sgd_optimizer.minimize(avg_cost)
 
-                train_data = paddle.batch(
-                    paddle.reader.shuffle(
-                        paddle.dataset.conll05.test(), buf_size=8192),
-                    batch_size=8)
+                train_data = paddle.batch(paddle.reader.shuffle(
+                    paddle.dataset.conll05.test(), buf_size=8192),
+                                          batch_size=8)
 
                 place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
                 exe = fluid.Executor(place)
@@ -168,19 +181,19 @@ def check_network_convergence(self,
                 train_cp = compiler.CompiledProgram(main).with_data_parallel(
                     loss_name=avg_cost.name, build_strategy=build_strategy)
 
-                feeder = fluid.DataFeeder(
-                    feed_list=[
-                        word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
-                        mark, target
-                    ],
-                    place=fluid.CPUPlace())
+                feeder = fluid.DataFeeder(feed_list=[
+                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
+                    mark, target
+                ],
+                                          place=fluid.CPUPlace())
 
             data = train_data()
             for i in range(4):
                 cur_batch = next(data)
-                print(exe.run(train_cp,
-                              feed=feeder.feed(cur_batch),
-                              fetch_list=[avg_cost.name])[0])
+                print(
+                    exe.run(train_cp,
+                            feed=feeder.feed(cur_batch),
+                            fetch_list=[avg_cost.name])[0])
 
     def _new_build_strategy(self, use_reduce=False):
         build_strategy = fluid.BuildStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
index e0bae089829b3..7618371036b12 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
@@ -21,6 +21,7 @@
 
 
 class TestParallelExecutorDropExeScope(unittest.TestCase):
+
     def check_drop_scope(self, use_cuda=True):
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
@@ -42,16 +43,14 @@ def check_drop_scope(self, use_cuda=True):
         exec_strateg = fluid.ExecutionStrategy()
         exec_strateg.num_iteration_per_drop_scope = 10
 
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda,
-            main_program=train_program,
-            loss_name=loss.name,
-            exec_strategy=exec_strateg)
-        test_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda,
-            main_program=test_program,
-            share_vars_from=train_exe,
-            exec_strategy=exec_strateg)
+        train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                           main_program=train_program,
+                                           loss_name=loss.name,
+                                           exec_strategy=exec_strateg)
+        test_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                          main_program=test_program,
+                                          share_vars_from=train_exe,
+                                          exec_strategy=exec_strateg)
 
         x = numpy.random.random(size=(10, 1)).astype('float32')
         train_exe.run(feed={"X": x}, fetch_list=[loss.name])
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index 328b3a4813eec..aefa635508db0 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -18,10 +18,12 @@
 import logging
 import six
 import os
+
 os.environ['CPU_NUM'] = str(4)
 
 
 class TestBase(unittest.TestCase):
+
     def main(self,
              network_func,
              iter=10,
@@ -47,21 +49,21 @@ def main(self,
                 exe_strategy._dry_run = True
                 exe_strategy.use_experimental_executor = use_experimental_executor
                 train_cp = compiler.CompiledProgram(
-                    main_prog).with_data_parallel(
-                        loss_name=loss.name, exec_strategy=exe_strategy)
+                    main_prog).with_data_parallel(loss_name=loss.name,
+                                                  exec_strategy=exe_strategy)
                 for _ in six.moves.xrange(iter):
                     for _ in six.moves.xrange(iter_per_pe):
                         exe.run(train_cp)
 
 
 class TestMNISTDryRun(TestBase):
+
     def test_mnist_dry_run(self):
         for use_gpu in (False, True):
             for use_experimental_executor in (False, True):
-                self.main(
-                    network_func=TestMNISTDryRun.network_func,
-                    use_gpu=use_gpu,
-                    use_experimental_executor=use_experimental_executor)
+                self.main(network_func=TestMNISTDryRun.network_func,
+                          use_gpu=use_gpu,
+                          use_experimental_executor=use_experimental_executor)
 
     @staticmethod
     def network_func():
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py
index 2597df7faff54..2c903f7c9971a 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py
@@ -24,12 +24,14 @@
 
 
 class TestFeedPersistableVar(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
         batch_size = 4
-        cls.img, cls.label = init_data(
-            batch_size, img_shape=[784], label_range=9)
+        cls.img, cls.label = init_data(batch_size,
+                                       img_shape=[784],
+                                       label_range=9)
         cls.feed_dict = {
             'image': cls.img,
             'label': cls.label,
@@ -37,12 +39,11 @@ def setUpClass(cls):
         }
 
     def optimizer(self):
-        learning_rate = fluid.layers.create_global_var(
-            name="learning_rate",
-            shape=[1],
-            value=1.0,
-            dtype='float32',
-            persistable=True)
+        learning_rate = fluid.layers.create_global_var(name="learning_rate",
+                                                       shape=[1],
+                                                       value=1.0,
+                                                       dtype='float32',
+                                                       persistable=True)
         optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
         return optimizer
 
@@ -70,12 +71,12 @@ def test_feed_persistable_var(self):
         self.check_feed_persistable_var(self.feed_dict)
         self.check_feed_persistable_var(self.feed_dict, use_cuda=True)
 
-        self.feed_dict['learning_rate'] = numpy.array(
-            [1.0, 1.0]).astype("float32")
+        self.feed_dict['learning_rate'] = numpy.array([1.0,
+                                                       1.0]).astype("float32")
         self.check_feed_persistable_var(self.feed_dict, use_cuda=True)
 
-        self.feed_dict['learning_rate'] = numpy.array(
-            [1.0, 1.0]).astype("float32")
+        self.feed_dict['learning_rate'] = numpy.array([1.0,
+                                                       1.0]).astype("float32")
         run = partial(self.check_feed_persistable_var, self.feed_dict)
         self.assertRaises(RuntimeError, run)
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 052edac0ea7a3..0c3c293f7b9c3 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -38,6 +38,7 @@ def Lenet(data, class_dim):
 
 
 class TestFetchAndFeed(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -52,8 +53,9 @@ def parallel_exe(self,
         startup = fluid.Program()
         startup.random_seed = seed
         with fluid.program_guard(main_program, startup):
-            data = fluid.layers.data(
-                name='image', shape=[3, 224, 224], dtype='float32')
+            data = fluid.layers.data(name='image',
+                                     shape=[3, 224, 224],
+                                     dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             out = Lenet(data, class_dim=102)
             loss = fluid.layers.cross_entropy(input=out, label=label)
@@ -84,13 +86,14 @@ def parallel_exe(self,
 
     def run_parallel_exe_with_fetch(self, compiled_program, exe, use_cuda, data,
                                     label, loss):
+
         def get_data(batch_size=8):
             np.random.seed(5)
             while True:
-                img = np.random.random(
-                    size=[batch_size, 3, 224, 224]).astype(np.float32)
-                l = (np.random.random(size=[batch_size, 1]) *
-                     10).astype(np.int64)
+                img = np.random.random(size=[batch_size, 3, 224, 224]).astype(
+                    np.float32)
+                l = (np.random.random(size=[batch_size, 1]) * 10).astype(
+                    np.int64)
                 yield img, l
 
         fetch_list = []
@@ -117,15 +120,16 @@ def get_data(batch_size=8):
 
     def run_parallel_exe_with_feed(self, compiled_program, exe, use_cuda, data,
                                    label, loss):
+
         def get_data(batch_size=8):
             np.random.seed(5)
             while True:
                 train_data = []
                 for _ in range(batch_size):
-                    img = np.random.random(
-                        size=[1, 3, 224, 224]).astype(np.float32)
-                    label = (np.random.random(size=[1, 1]) *
-                             10).astype(np.int64)
+                    img = np.random.random(size=[1, 3, 224, 224]).astype(
+                        np.float32)
+                    label = (np.random.random(size=[1, 1]) * 10).astype(
+                        np.int64)
                     train_data.append([img, label])
                 yield train_data
 
@@ -143,30 +147,28 @@ def get_data(batch_size=8):
 
     def check_executor(self, use_faster_executor=False, num_threads=4):
         if core.is_compiled_with_cuda():
-            self.parallel_exe(
-                use_cuda=True,
-                run_parallel_exe=self.run_parallel_exe_with_fetch,
-                use_faster_executor=use_faster_executor,
-                num_threads=num_threads)
-        self.parallel_exe(
-            use_cuda=False,
-            run_parallel_exe=self.run_parallel_exe_with_fetch,
-            use_faster_executor=use_faster_executor,
-            num_threads=num_threads)
+            self.parallel_exe(use_cuda=True,
+                              run_parallel_exe=self.run_parallel_exe_with_fetch,
+                              use_faster_executor=use_faster_executor,
+                              num_threads=num_threads)
+        self.parallel_exe(use_cuda=False,
+                          run_parallel_exe=self.run_parallel_exe_with_fetch,
+                          use_faster_executor=use_faster_executor,
+                          num_threads=num_threads)
 
     def test_fetch(self):
         for use_faster_executor in {True, False}:
-            self.check_executor(
-                use_faster_executor=use_faster_executor, num_threads=4)
-            self.check_executor(
-                use_faster_executor=use_faster_executor, num_threads=1)
+            self.check_executor(use_faster_executor=use_faster_executor,
+                                num_threads=4)
+            self.check_executor(use_faster_executor=use_faster_executor,
+                                num_threads=1)
 
     def test_feed(self):
         if core.is_compiled_with_cuda():
-            self.parallel_exe(
-                use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed)
-        self.parallel_exe(
-            use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_feed)
+            self.parallel_exe(use_cuda=True,
+                              run_parallel_exe=self.run_parallel_exe_with_feed)
+        self.parallel_exe(use_cuda=False,
+                          run_parallel_exe=self.run_parallel_exe_with_feed)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
index a34982ef3dd67..1a015369ec679 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
@@ -26,6 +26,7 @@ def enable_parallel_ssa_executor(enabled=True):
 
 
 class TestParallelExecutorFetchIsolatedVarBase(unittest.TestCase):
+
     def build_network(self, is_training):
         x = fluid.data(name='x', shape=[-1, 10], dtype='float32')
         y = fluid.data(name='y', shape=[-1, 10], dtype='float32')
@@ -54,7 +55,10 @@ def test_main(self):
                 for is_training in [False, True]:
                     for use_experimental_executor in [False, True]:
                         for use_parallel_ssa_executor in [False, True]:
-                            func = lambda: self.run_impl(use_gpu, dev_cnt, is_training, use_experimental_executor, use_parallel_ssa_executor)
+                            func = lambda: self.run_impl(
+                                use_gpu, dev_cnt, is_training,
+                                use_experimental_executor,
+                                use_parallel_ssa_executor)
                             self.run_func_with_guard(func)
 
     def run_impl(self, use_gpu, dev_cnt, is_training, use_experimental_executor,
@@ -63,8 +67,8 @@ def run_impl(self, use_gpu, dev_cnt, is_training, use_experimental_executor,
         enable_parallel_ssa_executor(use_parallel_ssa_executor)
 
         if fluid.is_compiled_with_cuda():
-            if fluid.core.globals()[
-                    'FLAGS_enable_parallel_graph'] and not use_gpu:
+            if fluid.core.globals(
+            )['FLAGS_enable_parallel_graph'] and not use_gpu:
                 return
             # windows has only 1 GPU
             if use_gpu and dev_cnt > 1 and os.name == "nt":
@@ -81,11 +85,11 @@ def run_impl(self, use_gpu, dev_cnt, is_training, use_experimental_executor,
 
         exe.run(fluid.default_startup_program())
 
-        prog = fluid.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            loss_name=loss_name,
-            exec_strategy=self.exec_strategy(use_experimental_executor),
-            places=places)
+        prog = fluid.CompiledProgram(
+            fluid.default_main_program()).with_data_parallel(
+                loss_name=loss_name,
+                exec_strategy=self.exec_strategy(use_experimental_executor),
+                places=places)
 
         BATCH_SIZE = 8 * dev_cnt
         for _ in six.moves.range(10):
@@ -93,8 +97,10 @@ def run_impl(self, use_gpu, dev_cnt, is_training, use_experimental_executor,
             y_np = np.random.random(size=[BATCH_SIZE, 10]).astype('float32')
 
             _, y_np_fetch = exe.run(prog,
-                                    feed={'x': x_np,
-                                          'y': y_np},
+                                    feed={
+                                        'x': x_np,
+                                        'y': y_np
+                                    },
                                     fetch_list=[loss, isolated_var])
 
             self.assertTrue(np.array_equal(y_np, y_np_fetch))
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py
index 24aa080e68c28..80da6b5ac61c8 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class TestFixOpRunOrder(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         paddle.seed(1)
@@ -29,8 +30,8 @@ def setUp(self):
             fluid.set_flags({'FLAGS_cudnn_deterministic': 1})
 
     def get_place(self):
-        return paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        return paddle.CUDAPlace(
+            0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
 
     def get_feed(self):
         batch_size = 4
@@ -43,10 +44,12 @@ def create_model(self, fix_op_run_order):
         startup_prog = paddle.static.Program()
         scope = paddle.static.Scope()
         with paddle.static.program_guard(main_prog, startup_prog):
-            image = paddle.static.data(
-                name="image", shape=[None, 3, 224, 224], dtype="float32")
-            label = paddle.static.data(
-                name="label", shape=[None, 1], dtype="int64")
+            image = paddle.static.data(name="image",
+                                       shape=[None, 3, 224, 224],
+                                       dtype="float32")
+            label = paddle.static.data(name="label",
+                                       shape=[None, 1],
+                                       dtype="int64")
             model = resnet18()
             pred = model(image)
             loss_fn = CrossEntropyLoss()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py
index 124e1dc0c1617..a3a26f481f3f1 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py
@@ -19,6 +19,7 @@
 
 
 class TestInferencePartialFeed(unittest.TestCase):
+
     def setUp(self):
         self.iterations = 10
         self.size = 10
@@ -46,8 +47,10 @@ def run_network(self, places, use_split, has_persistable):
         prog = fluid.CompiledProgram(main_prog).with_data_parallel(
             places=places)
 
-        gen_random = lambda shape:np.random.uniform(low=-1.0, high=1.0, size=shape).astype('float32')
-        assert_result = lambda feed, result: self.assertTrue(np.array_equal(np.maximum(0, feed), result))
+        gen_random = lambda shape: np.random.uniform(
+            low=-1.0, high=1.0, size=shape).astype('float32')
+        assert_result = lambda feed, result: self.assertTrue(
+            np.array_equal(np.maximum(0, feed), result))
 
         def assert_merged_unmerged(merged, unmerged):
             unmerged = np.concatenate(unmerged, axis=0)
@@ -142,17 +145,20 @@ def test_main(self):
         for p in places:
             for has_persistable in [False, True]:
                 for use_split in [False, True]:
-                    self.run_network(
-                        p, use_split=use_split, has_persistable=has_persistable)
+                    self.run_network(p,
+                                     use_split=use_split,
+                                     has_persistable=has_persistable)
 
 
 class TestInferencePartialFeedUsingDataLoader(unittest.TestCase):
+
     def setUp(self):
         self.epoch_num = 3
         self.batch_num = 101  # a prime number
         self.batch_size = 32
 
     def create_reader(self):
+
         def __impl__():
             for _ in six.moves.range(self.batch_num):
                 yield np.random.random([self.batch_size, 1]).astype('float32'),
@@ -162,20 +168,22 @@ def __impl__():
     def run_network(self, iterable, use_cuda, drop_last):
         x = fluid.data(shape=[None, 1], name='x', dtype='float32')
         places = fluid.cuda_places() if use_cuda else fluid.cpu_places(4)
-        loader = fluid.io.DataLoader.from_generator(
-            feed_list=[x], capacity=16, iterable=iterable, drop_last=drop_last)
+        loader = fluid.io.DataLoader.from_generator(feed_list=[x],
+                                                    capacity=16,
+                                                    iterable=iterable,
+                                                    drop_last=drop_last)
         y = fluid.layers.fc(x, size=10)
         loss = fluid.layers.reduce_mean(y)
 
         exe = fluid.Executor(places[0])
         exe.run(fluid.default_startup_program())
 
-        prog = fluid.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            places=places, loss_name=loss.name)
+        prog = fluid.CompiledProgram(
+            fluid.default_main_program()).with_data_parallel(
+                places=places, loss_name=loss.name)
 
-        loader.set_batch_generator(
-            self.create_reader(), places=places if iterable else None)
+        loader.set_batch_generator(self.create_reader(),
+                                   places=places if iterable else None)
 
         for _ in six.moves.range(self.epoch_num):
             actual_batch_num = 0
@@ -203,8 +211,8 @@ def run_network(self, iterable, use_cuda, drop_last):
                 self.assertGreater(self.batch_num, actual_batch_num)
 
     def test_main(self):
-        use_cuda_list = [False, True] if fluid.is_compiled_with_cuda(
-        ) else [False]
+        use_cuda_list = [False, True
+                         ] if fluid.is_compiled_with_cuda() else [False]
         iterable_list = [False, True]
         drop_last_list = [False, True]
         for iterable in iterable_list:
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 61d643f24c17a..81625a29e2295 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -34,8 +34,8 @@ def simple_fc_net(use_feed):
             hidden,
             size=200,
             act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
     prediction = fluid.layers.fc(hidden, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     loss = fluid.layers.mean(loss)
@@ -73,6 +73,7 @@ def init_data():
 
 
 class TestMNIST(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -92,15 +93,19 @@ def _compare_reduce_and_allreduce(self,
 
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_reduce=False)
 
         reduce_first_loss, reduce_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_reduce=True)
 
@@ -119,12 +124,13 @@ def check_simple_fc_convergence(self, use_device, use_reduce=False):
 
         img, label = init_data()
 
-        self.check_network_convergence(
-            simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_device=use_device,
-            use_reduce=use_reduce)
+        self.check_network_convergence(simple_fc_net,
+                                       feed_dict={
+                                           "image": img,
+                                           "label": label
+                                       },
+                                       use_device=use_device,
+                                       use_reduce=use_reduce)
 
     def test_simple_fc(self):
         # use_device
@@ -149,23 +155,29 @@ def check_simple_fc_parallel_accuracy(self, use_device):
 
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_parallel_executor=True)
 
         self.assertAlmostEquals(
             np.mean(parallel_first_loss),
             single_first_loss,
-            delta=1e-6, )
-        self.assertAlmostEquals(
-            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
+            delta=1e-6,
+        )
+        self.assertAlmostEquals(np.mean(parallel_last_loss),
+                                single_last_loss,
+                                delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
         self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
@@ -178,12 +190,13 @@ def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
             return
         img, label = init_data()
 
-        self.check_network_convergence(
-            fc_with_batchnorm,
-            feed_dict={"image": img,
-                       "label": label},
-            use_device=use_device,
-            use_fast_executor=use_fast_executor)
+        self.check_network_convergence(fc_with_batchnorm,
+                                       feed_dict={
+                                           "image": img,
+                                           "label": label
+                                       },
+                                       use_device=use_device,
+                                       use_fast_executor=use_fast_executor)
 
     def test_batchnorm_fc(self):
         for use_device in (DeviceType.CPU, DeviceType.CUDA):
@@ -201,6 +214,7 @@ def test_batchnorm_fc_with_new_strategy(self):
 
 
 class TestMNISTNoReduce(unittest.TestCase):
+
     def run_program(self, device_type):
         if device_type == DeviceType.CUDA:
             if not paddle.is_compiled_with_cuda():
@@ -225,18 +239,16 @@ def run_program(self, device_type):
         build_strategy = paddle.static.BuildStrategy()
         build_strategy.reduce_strategy = no_reduce
         main_multi_place = paddle.static.CompiledProgram(
-            main).with_data_parallel(
-                loss_name=loss.name,
-                build_strategy=build_strategy,
-                places=places)
+            main).with_data_parallel(loss_name=loss.name,
+                                     build_strategy=build_strategy,
+                                     places=places)
 
         build_strategy = paddle.static.BuildStrategy()
         build_strategy.reduce_strategy = no_reduce
-        main_single_place = paddle.static.CompiledProgram(main.clone(
-        )).with_data_parallel(
-            loss_name=loss.name,
-            build_strategy=build_strategy,
-            places=places[0])
+        main_single_place = paddle.static.CompiledProgram(
+            main.clone()).with_data_parallel(loss_name=loss.name,
+                                             build_strategy=build_strategy,
+                                             places=places[0])
 
         image, label = init_data()
         feed = {'image': image, 'label': label}
@@ -256,13 +268,13 @@ def run_program(self, device_type):
                     grads_single_place[i].append(g)
 
             for i in range(len(grads)):
-                grads_single_place[i] = np.concatenate(
-                    grads_single_place[i], axis=0) / len(places)
+                grads_single_place[i] = np.concatenate(grads_single_place[i],
+                                                       axis=0) / len(places)
 
         self.assertEqual(len(grads_multi_place), len(grads_single_place))
         for g1, g2 in zip(grads_multi_place, grads_single_place):
-            self.assertTrue(
-                np.allclose(g1, g2), 'g1 = {}\ng2 = {}\n'.format(g1, g2))
+            self.assertTrue(np.allclose(g1, g2),
+                            'g1 = {}\ng2 = {}\n'.format(g1, g2))
 
     def split_feed(self, feed, n):
         image = feed['image']
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
index e07b89f7aae76..36299da25a6e8 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -18,6 +18,7 @@
 
 import numpy as np
 import os
+
 os.environ['FLAGS_enable_parallel_graph'] = str(1)
 import paddle.fluid.core as core
 import os
@@ -26,6 +27,7 @@
 
 
 class TestMNIST(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -36,12 +38,13 @@ def check_simple_fc_convergence(self, use_device, use_reduce=False):
             return
 
         img, label = init_data()
-        self.check_network_convergence(
-            simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
-            use_device=use_device,
-            use_reduce=use_reduce)
+        self.check_network_convergence(simple_fc_net,
+                                       feed_dict={
+                                           "image": img,
+                                           "label": label
+                                       },
+                                       use_device=use_device,
+                                       use_reduce=use_reduce)
 
     def test_simple_fc(self):
         # use_device
@@ -54,23 +57,29 @@ def check_simple_fc_parallel_accuracy(self, use_device):
         img, label = init_data()
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             use_parallel_executor=True)
 
         self.assertAlmostEquals(
             np.mean(parallel_first_loss),
             single_first_loss,
-            delta=1e-6, )
-        self.assertAlmostEquals(
-            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
+            delta=1e-6,
+        )
+        self.assertAlmostEquals(np.mean(parallel_last_loss),
+                                single_last_loss,
+                                delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
         self.check_simple_fc_parallel_accuracy(DeviceType.CUDA)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
index 0fac0610fd22d..68f5154076400 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
@@ -24,12 +24,13 @@
 # NCCL 2.7 decides to use shared memory while NCCL 2.6 didn't, hence causing the error.
 # include/shm.h:28 NCCL WARN Call to posix_fallocate failed: No space left on device
 #
-# Set environment variables NCCL_SHM_DISABLE=1 to disables the Shared Memory (SHM) transports 
+# Set environment variables NCCL_SHM_DISABLE=1 to disables the Shared Memory (SHM) transports
 # and force to use P2P which is the default transports way of NCCL2.6.
 os.environ['NCCL_SHM_DISABLE'] = str(1)
 
 
 class TestPEProfiler(TestProfiler):
+
     def test_cpu_profiler(self):
         exe = fluid.Executor(fluid.CPUPlace())
         self.net_profiler(exe, 'CPU', "Default", use_parallel_executor=True)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
index d9ae3cf5e757d..3b275a75b7d39 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
@@ -24,8 +24,8 @@
 
 paddle.enable_static()
 
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
+                    level=logging.INFO)
 logger = logging.getLogger("paddle_with_cinn")
 
 
@@ -85,8 +85,8 @@ def train(dot_save_dir, prefix, seed=1234):
     main_program = paddle.static.Program()
     img, label, loss = build_program(main_program, startup_program)
 
-    place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-    ) else paddle.CPUPlace()
+    place = paddle.CUDAPlace(
+        0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
     exe = paddle.static.Executor(place)
     exe.run(startup_program)
 
@@ -109,6 +109,7 @@ def train(dot_save_dir, prefix, seed=1234):
 
 @unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
 class TestParallelExecutorRunCinn(unittest.TestCase):
+
     def setUp(self):
         self.tmpdir = tempfile.mkdtemp(prefix="dots_")
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py
index fc76f5d152dfe..964fce25a6f0e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py
@@ -21,6 +21,7 @@
 
 
 class TestMNIST(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         cls.save_dirname = "./"
@@ -37,12 +38,13 @@ def test_simple_fc(self):
         exe_loss = self.run_with_executor()
 
         [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(
-             self.save_dirname, self.exe, self.model_filename,
-             self.params_filename)
+         fetch_targets] = fluid.io.load_inference_model(self.save_dirname,
+                                                        self.exe,
+                                                        self.model_filename,
+                                                        self.params_filename)
 
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=False, main_program=inference_program)
+        train_exe = fluid.ParallelExecutor(use_cuda=False,
+                                           main_program=inference_program)
         feed_vars = [
             inference_program.global_block().var(var_name)
             for var_name in ["image", "label"]
@@ -71,12 +73,12 @@ def run_with_executor(self):
                                  feed=feeder.feed(self.batch_data),
                                  fetch_list=[loss.name])
 
-        fluid.io.save_inference_model(
-            self.save_dirname, ["image", "label"], [loss],
-            self.exe,
-            model_filename=self.model_filename,
-            params_filename=self.params_filename,
-            main_program=main)
+        fluid.io.save_inference_model(self.save_dirname, ["image", "label"],
+                                      [loss],
+                                      self.exe,
+                                      model_filename=self.model_filename,
+                                      params_filename=self.params_filename,
+                                      main_program=main)
 
         return loss_data
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
index 9b48a87bff7b9..15a26fc0c068d 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
@@ -20,20 +20,19 @@
 
 
 class TestResnetCPU(TestResnetBase):
+
     def test_seresnext_with_learning_rate_decay(self):
         # NOTE(zcd): This test is compare the result of use parallel_executor
         # and executor, and the result of drop_out op and batch_norm op in
         # this two executor have diff, so the two ops should be removed
         # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False)
-        self._compare_result_with_origin_model(
-            check_func,
-            use_device=DeviceType.CPU,
-            compare_separately=False,
-            delta2=1e-3)
+        check_func = partial(self.check_network_convergence,
+                             optimizer=seresnext_net.optimizer,
+                             use_parallel_executor=False)
+        self._compare_result_with_origin_model(check_func,
+                                               use_device=DeviceType.CPU,
+                                               compare_separately=False,
+                                               delta2=1e-3)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
index ff529ce94bd25..ee7736a730315 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
@@ -20,17 +20,18 @@
 
 
 class TestResnetGPU(TestResnetBase):
+
     def test_seresnext_with_learning_rate_decay(self):
         # NOTE(zcd): This test is compare the result of use parallel_executor
         # and executor, and the result of drop_out op and batch_norm op in
         # this two executor have diff, so the two ops should be removed
         # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False)
-        self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.CUDA, compare_separately=False)
+        check_func = partial(self.check_network_convergence,
+                             optimizer=seresnext_net.optimizer,
+                             use_parallel_executor=False)
+        self._compare_result_with_origin_model(check_func,
+                                               use_device=DeviceType.CUDA,
+                                               compare_separately=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
index 0f1a86a83dbfe..1c355c32ed7f4 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import paddle.fluid as fluid
+
 fluid.core._set_fuse_parameter_group_size(3)
 fluid.core._set_fuse_parameter_memory_size(131072)
 
@@ -24,15 +25,15 @@
 
 
 class TestResnetWithFuseAllReduceCPU(TestResnetBase):
+
     def test_seresnext_with_fused_all_reduce(self):
         # NOTE(zcd): In order to make the program faster,
         # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True)
-        self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.CPU)
+        check_func = partial(self.check_network_convergence,
+                             optimizer=seresnext_net.optimizer,
+                             fuse_all_reduce_ops=True)
+        self._compare_result_with_origin_model(check_func,
+                                               use_device=DeviceType.CPU)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
index c747591c81622..566e3d4248d34 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import paddle.fluid as fluid
+
 fluid.core._set_fuse_parameter_group_size(3)
 fluid.core._set_fuse_parameter_memory_size(131072)
 
@@ -24,15 +25,16 @@
 
 
 class TestResnetWithFuseAllReduceGPU(TestResnetBase):
+
     def test_seresnext_with_fused_all_reduce(self):
         # NOTE(zcd): In order to make the program faster,
         # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True)
-        self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.CUDA, delta2=1e-2)
+        check_func = partial(self.check_network_convergence,
+                             optimizer=seresnext_net.optimizer,
+                             fuse_all_reduce_ops=True)
+        self._compare_result_with_origin_model(check_func,
+                                               use_device=DeviceType.CUDA,
+                                               delta2=1e-2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
index e67934d87f957..d4cc297d6890a 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
@@ -20,6 +20,7 @@
 
 
 class TestResnetWithReduceBase(TestParallelExecutorBase):
+
     def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
@@ -86,9 +87,10 @@ def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
 
 
 class TestResnetWithReduceCPU(TestResnetWithReduceBase):
+
     def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(
-            use_device=DeviceType.CPU, delta2=1e-3)
+        self._compare_reduce_and_allreduce(use_device=DeviceType.CPU,
+                                           delta2=1e-3)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
index 4de1a6092dcae..13591f8d87de6 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
@@ -18,9 +18,10 @@
 
 
 class TestResnetWithReduceGPU(TestResnetWithReduceBase):
+
     def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(
-            use_device=DeviceType.CUDA, delta2=1e-2)
+        self._compare_reduce_and_allreduce(use_device=DeviceType.CUDA,
+                                           delta2=1e-2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index fd47dc37e7694..e9f4e679d5a24 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -25,6 +25,7 @@
 
 
 class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+
     def check_network_convergence(self, use_cuda, build_strategy=None):
         os.environ['CPU_NUM'] = str(4)
         main = fluid.Program()
@@ -70,32 +71,31 @@ def check_network_convergence(self, use_cuda, build_strategy=None):
                     sys.exit("got NaN loss, training failed.")
 
                 self.assertTrue(
-                    np.allclose(
-                        train_loss, test_loss, atol=1e-2),
-                    "Train loss: " + str(train_loss) + "\n Test loss:" +
-                    str(test_loss))
+                    np.allclose(train_loss, test_loss,
+                                atol=1e-2), "Train loss: " + str(train_loss) +
+                    "\n Test loss:" + str(test_loss))
 
     def test_parallel_testing(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
         if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                use_cuda=True, build_strategy=build_strategy)
-        self.check_network_convergence(
-            use_cuda=False, build_strategy=build_strategy)
+            self.check_network_convergence(use_cuda=True,
+                                           build_strategy=build_strategy)
+        self.check_network_convergence(use_cuda=False,
+                                       build_strategy=build_strategy)
 
     def test_parallel_testing_with_new_strategy_gpu(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
         if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                use_cuda=True, build_strategy=build_strategy)
+            self.check_network_convergence(use_cuda=True,
+                                           build_strategy=build_strategy)
 
     def test_parallel_testing_with_new_strategy_cpu(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            use_cuda=False, build_strategy=build_strategy)
+        self.check_network_convergence(use_cuda=False,
+                                       build_strategy=build_strategy)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index b87e8d4e3c21a..cc90fdb07f43e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -106,8 +106,8 @@ def __pad_batch_data(insts,
             if is_target:
                 # This is used to avoid attention on paddings and subsequent
                 # words.
-                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
-                                              max_len))
+                slf_attn_bias_data = np.ones(
+                    (inst_data.shape[0], max_len, max_len))
                 slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
                     [-1, 1, max_len, max_len])
                 slf_attn_bias_data = np.tile(slf_attn_bias_data,
@@ -161,10 +161,9 @@ def get_feed_data_reader():
     if feed_data_reader is not None:
         return feed_data_reader
 
-    reader = paddle.batch(
-        wmt16.train(ModelHyperParams.src_vocab_size,
-                    ModelHyperParams.trg_vocab_size),
-        batch_size=transformer_model.batch_size)
+    reader = paddle.batch(wmt16.train(ModelHyperParams.src_vocab_size,
+                                      ModelHyperParams.trg_vocab_size),
+                          batch_size=transformer_model.batch_size)
     all_batch_tensors = []
     for batch in reader():
         tensors = []
@@ -178,15 +177,15 @@ def __reader__():
         for t in all_batch_tensors:
             yield t
 
-    feed_data_reader = FeedDataReader(
-        feed_list=transformer_model.build_inputs(
-            ModelHyperParams.max_length + 1, ModelHyperParams.n_head),
-        reader=__reader__)
+    feed_data_reader = FeedDataReader(feed_list=transformer_model.build_inputs(
+        ModelHyperParams.max_length + 1, ModelHyperParams.n_head),
+                                      reader=__reader__)
 
     return feed_data_reader
 
 
 class TestTransformer(TestParallelExecutorBase):
+
     def test_main(self):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(
@@ -198,11 +197,10 @@ def test_main(self):
                 use_device=DeviceType.CUDA,
                 enable_sequential_execution=True,
                 feed_data_reader=get_feed_data_reader())
-        self.check_network_convergence(
-            transformer,
-            use_device=DeviceType.CPU,
-            iter=2,
-            feed_data_reader=get_feed_data_reader())
+        self.check_network_convergence(transformer,
+                                       use_device=DeviceType.CPU,
+                                       iter=2,
+                                       feed_data_reader=get_feed_data_reader())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py
index 1b24889830ad8..bacf97e0c68b3 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import os
 import unittest
 import paddle.fluid as fluid
 
@@ -21,9 +22,13 @@
 
 
 class TestParallelMarginSoftmaxWithCrossEntropy(TestMultipleGpus):
+
     def test_parallel_margin_cross_entropy(self):
         self.run_mnist_2gpu('parallel_margin_cross_entropy.py')
+        self.run_mnist_2gpu('parallel_margin_cross_entropy.py',
+                            eager_mode=False)
 
 
 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py
index 61d75fca2745e..d75a6c0dd90f4 100644
--- a/python/paddle/fluid/tests/unittests/test_parameter.py
+++ b/python/paddle/fluid/tests/unittests/test_parameter.py
@@ -30,15 +30,15 @@
 
 
 class ParameterChecks(unittest.TestCase):
+
     def test_parameter(self):
         shape = [784, 100]
         val = 1.0625
         b = main_program.global_block()
-        param = b.create_parameter(
-            name='fc.w',
-            shape=shape,
-            dtype='float32',
-            initializer=ConstantInitializer(val))
+        param = b.create_parameter(name='fc.w',
+                                   shape=shape,
+                                   dtype='float32',
+                                   initializer=ConstantInitializer(val))
         self.assertIsNotNone(param)
         self.assertEqual('fc.w', param.name)
         self.assertEqual((784, 100), param.shape)
@@ -80,17 +80,25 @@ def test_parambase(self):
     def func_exception(self):
         b = main_program.global_block()
         with self.assertRaises(ValueError):
-            b.create_parameter(
-                name='test', shape=None, dtype='float32', initializer=None)
+            b.create_parameter(name='test',
+                               shape=None,
+                               dtype='float32',
+                               initializer=None)
         with self.assertRaises(ValueError):
-            b.create_parameter(
-                name='test', shape=[1], dtype=None, initializer=None)
+            b.create_parameter(name='test',
+                               shape=[1],
+                               dtype=None,
+                               initializer=None)
         with self.assertRaises(ValueError):
-            b.create_parameter(
-                name='test', shape=[], dtype='float32', initializer=None)
+            b.create_parameter(name='test',
+                               shape=[],
+                               dtype='float32',
+                               initializer=None)
         with self.assertRaises(ValueError):
-            b.create_parameter(
-                name='test', shape=[-1], dtype='float32', initializer=None)
+            b.create_parameter(name='test',
+                               shape=[-1],
+                               dtype='float32',
+                               initializer=None)
 
     def func_parambase_to_vector(self):
         with guard():
diff --git a/python/paddle/fluid/tests/unittests/test_partial_concat_op.py b/python/paddle/fluid/tests/unittests/test_partial_concat_op.py
index a83ca3f81a816..842b1e725d40b 100644
--- a/python/paddle/fluid/tests/unittests/test_partial_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_partial_concat_op.py
@@ -41,6 +41,7 @@ def np_partial_concat(inputs, start, length):
 
 
 class TestPartialConcatOp(OpTest):
+
     def setUp(self):
         self.op_type = "partial_concat"
         self.init_kernel_type()
@@ -74,6 +75,7 @@ def test_check_grad(self):
 
 
 class TestPartialConcatOp2(TestPartialConcatOp):
+
     def init_para(self):
         self.batch_size = random.randint(1, 10)
         self.column = random.randint(101, 200)
@@ -83,6 +85,7 @@ def init_para(self):
 
 
 class TestPartialConcatOp3(TestPartialConcatOp):
+
     def init_para(self):
         self.batch_size = random.randint(1, 10)
         self.column = random.randint(101, 200)
@@ -92,6 +95,7 @@ def init_para(self):
 
 
 class TestPartialConcatOp4(TestPartialConcatOp):
+
     def init_para(self):
         self.batch_size = random.randint(1, 10)
         self.column = random.randint(101, 200)
diff --git a/python/paddle/fluid/tests/unittests/test_partial_sum_op.py b/python/paddle/fluid/tests/unittests/test_partial_sum_op.py
index eb5166430103b..7f016d3d1b5cf 100644
--- a/python/paddle/fluid/tests/unittests/test_partial_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_partial_sum_op.py
@@ -26,6 +26,7 @@
 
 
 class TestPartialSumOp(OpTest):
+
     def setUp(self):
         self.op_type = "partial_sum"
         self.init_kernel_type()
@@ -66,6 +67,7 @@ def test_check_grad(self):
 
 
 class TestPartialSumOp2(TestPartialSumOp):
+
     def init_para(self):
         self.batch_size = random.randint(1, 10)
         self.column = random.randint(101, 200)
@@ -75,6 +77,7 @@ def init_para(self):
 
 
 class TestPartialSumOp3(TestPartialSumOp):
+
     def init_para(self):
         self.batch_size = random.randint(1, 10)
         self.column = random.randint(101, 200)
@@ -84,6 +87,7 @@ def init_para(self):
 
 
 class TestPartialSumOp4(TestPartialSumOp):
+
     def init_para(self):
         self.batch_size = random.randint(1, 10)
         self.column = random.randint(101, 200)
diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py
index 023ceeaa73acc..01d65941068a8 100644
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -27,6 +27,7 @@
 
 
 class TestPassBuilder(unittest.TestCase):
+
     def check_network_convergence(self, use_cuda, build_strategy=None):
         os.environ['CPU_NUM'] = str(4)
         main = fluid.Program()
@@ -72,10 +73,9 @@ def check_network_convergence(self, use_cuda, build_strategy=None):
                     sys.exit("got NaN loss, training failed.")
 
                 self.assertTrue(
-                    np.allclose(
-                        train_loss, test_loss, atol=1e-8),
-                    "Train loss: " + str(train_loss) + "\n Test loss:" +
-                    str(test_loss))
+                    np.allclose(train_loss, test_loss,
+                                atol=1e-8), "Train loss: " + str(train_loss) +
+                    "\n Test loss:" + str(test_loss))
 
     def test_parallel_testing_with_new_strategy(self):
         build_strategy = fluid.BuildStrategy()
@@ -93,8 +93,8 @@ def test_parallel_testing_with_new_strategy(self):
         viz_pass = pass_builder.append_pass("graph_viz_pass")
         self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
 
-        pass_builder.insert_pass(
-            len(pass_builder.all_passes()), "graph_viz_pass")
+        pass_builder.insert_pass(len(pass_builder.all_passes()),
+                                 "graph_viz_pass")
         self.assertEqual(origin_len + 2, len(pass_builder.all_passes()))
 
         pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py
index 04772a2da2871..7075b9a0d434a 100644
--- a/python/paddle/fluid/tests/unittests/test_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/test_pipeline.py
@@ -25,6 +25,7 @@
 
 
 class TestPipeline(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -41,21 +42,19 @@ def test_dist_train(self):
             # Now pipeline only gets the loss value of the last
             # microbatch, so it is not consistable with the
             # non-pipeline one.
-            self.check_with_place(
-                "pipeline_mnist.py",
-                delta=1e0,
-                check_error_log=True,
-                log_name=flag_name,
-                need_envs=self.need_envs())
+            self.check_with_place("pipeline_mnist.py",
+                                  delta=1e0,
+                                  check_error_log=True,
+                                  log_name=flag_name,
+                                  need_envs=self.need_envs())
 
     def test_dist_train_multi_device(self):
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "pipeline_mnist_multi_device.py",
-                check_error_log=True,
-                delta=1e0,
-                log_name=flag_name,
-                need_envs=self.need_envs())
+            self.check_with_place("pipeline_mnist_multi_device.py",
+                                  check_error_log=True,
+                                  delta=1e0,
+                                  log_name=flag_name,
+                                  need_envs=self.need_envs())
 
     def test_dist_train_one_device(self):
         if fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py
index f62e160673f8d..8773e8d47ed3c 100644
--- a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py
@@ -21,6 +21,7 @@
 
 
 class TestPipelineParallel(TestMultipleGpus):
+
     def test_pipeline_parallel(self):
         self.run_mnist_2gpu('hybrid_parallel_pp_alexnet.py')
 
diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
index 06d975fe2b88f..05b158624dd3d 100644
--- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
+++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
@@ -50,6 +50,7 @@ def pixel_shuffle_np(x, up_factor, data_format="NCHW"):
 
 
 class TestPixelShuffleOp(OpTest):
+
     def setUp(self):
         self.op_type = "pixel_shuffle"
         self.python_api = paddle.nn.functional.pixel_shuffle
@@ -81,11 +82,13 @@ def test_check_grad(self):
 
 
 class TestChannelLast(TestPixelShuffleOp):
+
     def init_data_format(self):
         self.format = "NHWC"
 
 
 class TestPixelShuffleAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
         self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
@@ -98,10 +101,12 @@ def test_static_graph_functional(self):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=[2, 9, 4, 4], dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=[2, 9, 4, 4],
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=[2, 4, 4, 9],
+                                    dtype="float64")
             out_1 = F.pixel_shuffle(x_1, 3)
             out_2 = F.pixel_shuffle(x_2, 3, "NHWC")
 
@@ -126,10 +131,12 @@ def test_static_graph_layer(self):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=[2, 9, 4, 4], dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=[2, 4, 4, 9], dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=[2, 9, 4, 4],
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=[2, 4, 4, 9],
+                                    dtype="float64")
             # init instance
             ps_1 = paddle.nn.PixelShuffle(3)
             ps_2 = paddle.nn.PixelShuffle(3, "NHWC")
@@ -171,14 +178,14 @@ def run_dygraph(self, up_factor, data_format):
 
             paddle.disable_static(place=place)
 
-            pixel_shuffle = paddle.nn.PixelShuffle(
-                up_factor, data_format=data_format)
+            pixel_shuffle = paddle.nn.PixelShuffle(up_factor,
+                                                   data_format=data_format)
             result = pixel_shuffle(paddle.to_tensor(x))
 
             self.assertTrue(np.allclose(result.numpy(), npresult))
 
-            result_functional = F.pixel_shuffle(
-                paddle.to_tensor(x), 3, data_format)
+            result_functional = F.pixel_shuffle(paddle.to_tensor(x), 3,
+                                                data_format)
             self.assertTrue(np.allclose(result_functional.numpy(), npresult))
 
     def test_dygraph1(self):
@@ -189,7 +196,9 @@ def test_dygraph2(self):
 
 
 class TestPixelShuffleError(unittest.TestCase):
+
     def test_error_functional(self):
+
         def error_upscale_factor():
             with paddle.fluid.dygraph.guard():
                 x = np.random.random([2, 9, 4, 4]).astype("float64")
@@ -205,6 +214,7 @@ def error_data_format():
         self.assertRaises(ValueError, error_data_format)
 
     def test_error_layer(self):
+
         def error_upscale_factor_layer():
             with paddle.fluid.dygraph.guard():
                 x = np.random.random([2, 9, 4, 4]).astype("float64")
diff --git a/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py
index 768a9e307c91e..1ae2c016e25d9 100644
--- a/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py
+++ b/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py
@@ -120,10 +120,12 @@ def test_static_graph_functional(self):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=[2, 1, 12, 12], dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=[2, 12, 12, 1], dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=[2, 1, 12, 12],
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=[2, 12, 12, 1],
+                                    dtype="float64")
             out_1 = F.pixel_unshuffle(x_1, 3)
             out_2 = F.pixel_unshuffle(x_2, 3, "NHWC")
 
@@ -150,10 +152,12 @@ def test_static_graph_layer(self):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
 
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=[2, 1, 12, 12], dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=[2, 12, 12, 1], dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=[2, 1, 12, 12],
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=[2, 12, 12, 1],
+                                    dtype="float64")
             # init instance
             ps_1 = paddle.nn.PixelUnshuffle(3)
             ps_2 = paddle.nn.PixelUnshuffle(3, "NHWC")
@@ -196,14 +200,14 @@ def run_dygraph(self, down_factor, data_format):
 
             paddle.disable_static(place=place)
 
-            pixel_unshuffle = paddle.nn.PixelUnshuffle(
-                down_factor, data_format=data_format)
+            pixel_unshuffle = paddle.nn.PixelUnshuffle(down_factor,
+                                                       data_format=data_format)
             result = pixel_unshuffle(paddle.to_tensor(x))
 
             self.assertTrue(np.allclose(result.numpy(), npresult))
 
-            result_functional = F.pixel_unshuffle(
-                paddle.to_tensor(x), 3, data_format)
+            result_functional = F.pixel_unshuffle(paddle.to_tensor(x), 3,
+                                                  data_format)
             self.assertTrue(np.allclose(result_functional.numpy(), npresult))
 
             pixel_unshuffle_str = 'downscale_factor={}'.format(down_factor)
@@ -252,8 +256,8 @@ def error_downscale_factor_2():
         def error_data_format():
             with paddle.fluid.dygraph.guard():
                 x = np.random.random([2, 1, 12, 12]).astype("float64")
-                pixel_unshuffle = F.pixel_unshuffle(
-                    paddle.to_tensor(x), 3, "WOW")
+                pixel_unshuffle = F.pixel_unshuffle(paddle.to_tensor(x), 3,
+                                                    "WOW")
 
         self.assertRaises(ValueError, error_data_format)
 
diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py
index 7dd3841fe4bcb..57adcd26959ae 100644
--- a/python/paddle/fluid/tests/unittests/test_poisson_op.py
+++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py
@@ -39,6 +39,7 @@ def output_hist(out, lam, a, b):
 
 
 class TestPoissonOp1(OpTest):
+
     def setUp(self):
         self.op_type = "poisson"
         self.config()
@@ -55,10 +56,8 @@ def config(self):
 
     def verify_output(self, outs):
         hist, prob = output_hist(np.array(outs[0]), self.lam, self.a, self.b)
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0.01),
-            "actual: {}, expected: {}".format(hist, prob))
+        self.assertTrue(np.allclose(hist, prob, rtol=0.01),
+                        "actual: {}, expected: {}".format(hist, prob))
 
     def test_check_output(self):
         self.check_output_customized(self.verify_output)
@@ -67,14 +66,14 @@ def test_check_grad_normal(self):
         self.check_grad(
             ['X'],
             'Out',
-            user_defined_grads=[np.zeros(
-                [1024, 1024], dtype=self.dtype)],
+            user_defined_grads=[np.zeros([1024, 1024], dtype=self.dtype)],
             user_defined_grad_outputs=[
                 np.random.rand(1024, 1024).astype(self.dtype)
             ])
 
 
 class TestPoissonOp2(TestPoissonOp1):
+
     def config(self):
         self.lam = 5
         self.a = 1
@@ -83,6 +82,7 @@ def config(self):
 
 
 class TestPoissonAPI(unittest.TestCase):
+
     def test_static(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
index 578b01b02d632..ed4bcf13b71b8 100644
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
@@ -35,13 +35,14 @@ def PolygonBoxRestore(input):
     indexes = indexes.repeat(
         [geo_channels / 2],
         axis=0)[np.newaxis, :]  # [1, geo_channels/2, 2, h, w]
-    indexes = indexes.repeat(
-        [batch_size], axis=0)  # [batch_size, geo_channels/2, 2, h, w]
+    indexes = indexes.repeat([batch_size],
+                             axis=0)  # [batch_size, geo_channels/2, 2, h, w]
     return indexes.reshape(
         input.shape) * 4 - input  # [batch_size, geo_channels, h, w]
 
 
 class TestPolygonBoxRestoreOp(OpTest):
+
     def config(self):
         self.input_shape = (1, 8, 2, 2)
 
@@ -58,20 +59,25 @@ def test_check_output(self):
 
 
 class TestCase1(TestPolygonBoxRestoreOp):
+
     def config(self):
         self.input_shape = (2, 10, 3, 2)
 
 
 class TestCase2(TestPolygonBoxRestoreOp):
+
     def config(self):
         self.input_shape = (3, 12, 4, 5)
 
 
 class TestPolygonBoxInvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_input():
-            input = fluid.data(
-                name='input', shape=[None, 3, 32, 32], dtype='int64')
+            input = fluid.data(name='input',
+                               shape=[None, 3, 32, 32],
+                               dtype='int64')
             out = fluid.layers.polygon_box_transform(input)
 
         self.assertRaises(TypeError, test_invalid_input)
diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
index e1cfcc3f06602..2dd26bef9d152 100644
--- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
@@ -46,8 +46,8 @@ def max_pool1D_forward_naive(x,
     if adaptive:
         L_out = ksize[0]
     else:
-        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
 
     out = np.zeros((N, C, L_out))
@@ -79,8 +79,8 @@ def avg_pool1D_forward_naive(x,
     if adaptive:
         L_out = ksize[0]
     else:
-        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        L_out = (L - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
 
     out = np.zeros((N, C, L_out))
@@ -96,15 +96,16 @@ def avg_pool1D_forward_naive(x,
         field_size = (r_end - r_start) \
             if (exclusive or adaptive) else (ksize[0])
         if data_type == np.int8 or data_type == np.uint8:
-            out[:, :, i] = (np.rint(
-                np.sum(x_masked, axis=(2, 3)) / field_size)).astype(data_type)
+            out[:, :, i] = (np.rint(np.sum(x_masked, axis=(2, 3)) /
+                                    field_size)).astype(data_type)
         else:
-            out[:, :, i] = (np.sum(x_masked, axis=(2)) /
-                            field_size).astype(data_type)
+            out[:, :,
+                i] = (np.sum(x_masked, axis=(2)) / field_size).astype(data_type)
     return out
 
 
 class TestPool1D_API(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -117,8 +118,11 @@ def check_avg_static_results(self, place):
             result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=0)
 
             input_np = np.random.random([2, 3, 32]).astype("float32")
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[0], ceil_mode=False)
+            result_np = avg_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[0],
+                                                 ceil_mode=False)
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -132,13 +136,16 @@ def check_avg_dygraph_results(self, place):
             input = fluid.dygraph.to_variable(input_np)
             result = F.avg_pool1d(input, kernel_size=2, stride=2, padding=[0])
 
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[0])
+            result_np = avg_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[0])
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool1d_dg = paddle.nn.layer.AvgPool1D(
-                kernel_size=2, stride=None, padding=0)
+            avg_pool1d_dg = paddle.nn.layer.AvgPool1D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=0)
             result = avg_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -146,16 +153,24 @@ def check_avg_dygraph_padding_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = F.avg_pool1d(
-                input, kernel_size=2, stride=2, padding=[1], exclusive=True)
-
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[1], exclusive=False)
+            result = F.avg_pool1d(input,
+                                  kernel_size=2,
+                                  stride=2,
+                                  padding=[1],
+                                  exclusive=True)
+
+            result_np = avg_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[1],
+                                                 exclusive=False)
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool1d_dg = paddle.nn.AvgPool1D(
-                kernel_size=2, stride=None, padding=1, exclusive=True)
+            avg_pool1d_dg = paddle.nn.AvgPool1D(kernel_size=2,
+                                                stride=None,
+                                                padding=1,
+                                                exclusive=True)
 
             result = avg_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
@@ -166,8 +181,10 @@ def check_max_static_results(self, place):
             result = F.max_pool1d(input, kernel_size=2, stride=2, padding=[0])
 
             input_np = np.random.random([2, 3, 32]).astype("float32")
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[0])
+            result_np = max_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[0])
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -181,13 +198,16 @@ def check_max_dygraph_results(self, place):
             input = fluid.dygraph.to_variable(input_np)
             result = F.max_pool1d(input, kernel_size=2, stride=2, padding=0)
 
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[0])
+            result_np = max_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[0])
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool1d_dg = paddle.nn.layer.MaxPool1D(
-                kernel_size=2, stride=None, padding=0)
+            max_pool1d_dg = paddle.nn.layer.MaxPool1D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=0)
             result = max_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -195,16 +215,22 @@ def check_max_dygraph_return_index_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result, index = F.max_pool1d(
-                input, kernel_size=2, stride=2, padding=0, return_mask=True)
+            result, index = F.max_pool1d(input,
+                                         kernel_size=2,
+                                         stride=2,
+                                         padding=0,
+                                         return_mask=True)
 
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[0])
+            result_np = max_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[0])
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool1d_dg = paddle.nn.layer.MaxPool1D(
-                kernel_size=2, stride=None, padding=0)
+            max_pool1d_dg = paddle.nn.layer.MaxPool1D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=0)
             result = max_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -212,11 +238,15 @@ def check_max_dygraph_padding_same(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = F.max_pool1d(
-                input, kernel_size=2, stride=2, padding="SAME")
+            result = F.max_pool1d(input,
+                                  kernel_size=2,
+                                  stride=2,
+                                  padding="SAME")
 
-            result_np = max_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[0])
+            result_np = max_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[0])
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -224,11 +254,15 @@ def check_avg_dygraph_padding_same(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = F.avg_pool1d(
-                input, kernel_size=2, stride=2, padding="SAME")
+            result = F.avg_pool1d(input,
+                                  kernel_size=2,
+                                  stride=2,
+                                  padding="SAME")
 
-            result_np = avg_pool1D_forward_naive(
-                input_np, ksize=[2], strides=[2], paddings=[0])
+            result_np = avg_pool1D_forward_naive(input_np,
+                                                 ksize=[2],
+                                                 strides=[2],
+                                                 paddings=[0])
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -249,15 +283,19 @@ def test_dygraph_final_state_api(self):
 
 
 class TestPool2DError_API(unittest.TestCase):
+
     def test_error_api(self):
+
         def run1():
             with fluid.dygraph.guard():
                 input_np = np.random.uniform(-1, 1,
                                              [2, 3, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = [[2]]
-                res_pd = F.max_pool1d(
-                    input_pd, kernel_size=2, stride=2, padding=padding)
+                res_pd = F.max_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=2,
+                                      padding=padding)
 
         self.assertRaises(ValueError, run1)
 
@@ -267,8 +305,10 @@ def run2():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = [[2]]
-                res_pd = F.max_pool1d(
-                    input_pd, kernel_size=2, stride=2, padding=padding)
+                res_pd = F.max_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=2,
+                                      padding=padding)
 
         self.assertRaises(ValueError, run2)
 
@@ -278,8 +318,10 @@ def run3():
                                              [2, 3, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "padding"
-                res_pd = F.max_pool1d(
-                    input_pd, kernel_size=2, stride=2, padding=padding)
+                res_pd = F.max_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=2,
+                                      padding=padding)
 
         self.assertRaises(ValueError, run3)
 
@@ -289,12 +331,11 @@ def run4():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = F.max_pool1d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True)
+                res_pd = F.max_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=2,
+                                      padding=padding,
+                                      ceil_mode=True)
 
         self.assertRaises(ValueError, run4)
 
@@ -304,12 +345,11 @@ def run5():
                                              [2, 3, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = F.max_pool1d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True)
+                res_pd = F.max_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=2,
+                                      padding=padding,
+                                      ceil_mode=True)
 
         self.assertRaises(ValueError, run5)
 
@@ -319,12 +359,11 @@ def run6():
                                              [2, 3, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = F.avg_pool1d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True)
+                res_pd = F.avg_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=2,
+                                      padding=padding,
+                                      ceil_mode=True)
 
         self.assertRaises(ValueError, run6)
 
@@ -334,12 +373,11 @@ def run7():
                                              [2, 3, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "paddle"
-                res_pd = F.avg_pool1d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True)
+                res_pd = F.avg_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=2,
+                                      padding=padding,
+                                      ceil_mode=True)
 
         self.assertRaises(ValueError, run7)
 
@@ -349,12 +387,11 @@ def run_kernel_out_of_range():
                                              [2, 3, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = 0
-                res_pd = F.avg_pool1d(
-                    input_pd,
-                    kernel_size=-1,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True)
+                res_pd = F.avg_pool1d(input_pd,
+                                      kernel_size=-1,
+                                      stride=2,
+                                      padding=padding,
+                                      ceil_mode=True)
 
         self.assertRaises(ValueError, run_kernel_out_of_range)
 
@@ -364,12 +401,11 @@ def run_stride_out_of_range():
                                              [2, 3, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = 0
-                res_pd = F.avg_pool1d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=0,
-                    padding=padding,
-                    ceil_mode=True)
+                res_pd = F.avg_pool1d(input_pd,
+                                      kernel_size=2,
+                                      stride=0,
+                                      padding=padding,
+                                      ceil_mode=True)
 
         self.assertRaises(ValueError, run_stride_out_of_range)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
index e86fa0ec48330..b17c0ea039125 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
@@ -24,6 +24,7 @@
 
 
 class TestPool2D_API(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -32,17 +33,17 @@ def setUp(self):
 
     def check_avg_static_results(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(
-                name="input", shape=[2, 3, 32, 32], dtype="float32")
+            input = fluid.data(name="input",
+                               shape=[2, 3, 32, 32],
+                               dtype="float32")
             result = avg_pool2d(input, kernel_size=2, stride=2, padding=0)
 
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='avg')
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='avg')
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -56,16 +57,16 @@ def check_avg_dygraph_results(self, place):
             input = fluid.dygraph.to_variable(input_np)
             result = avg_pool2d(input, kernel_size=2, stride=2, padding=0)
 
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='avg')
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='avg')
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(
-                kernel_size=2, stride=2, padding=0)
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = avg_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -73,20 +74,24 @@ def check_avg_dygraph_padding_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = avg_pool2d(
-                input, kernel_size=2, stride=2, padding=1, ceil_mode=False)
-
-            result_np = avg_pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[1, 1],
-                ceil_mode=False,
-                exclusive=False)
+            result = avg_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=1,
+                                ceil_mode=False)
+
+            result_np = avg_pool2D_forward_naive(input_np,
+                                                 ksize=[2, 2],
+                                                 strides=[2, 2],
+                                                 paddings=[1, 1],
+                                                 ceil_mode=False,
+                                                 exclusive=False)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(
-                kernel_size=2, stride=2, padding=1, ceil_mode=False)
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=1,
+                                                      ceil_mode=False)
             result = avg_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -94,35 +99,39 @@ def check_avg_dygraph_ceilmode_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = avg_pool2d(
-                input, kernel_size=2, stride=2, padding=0, ceil_mode=True)
-
-            result_np = avg_pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                ceil_mode=True)
+            result = avg_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=0,
+                                ceil_mode=True)
+
+            result_np = avg_pool2D_forward_naive(input_np,
+                                                 ksize=[2, 2],
+                                                 strides=[2, 2],
+                                                 paddings=[0, 0],
+                                                 ceil_mode=True)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(
-                kernel_size=2, stride=2, padding=0, ceil_mode=True)
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0,
+                                                      ceil_mode=True)
             result = avg_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
     def check_max_static_results(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(
-                name="input", shape=[2, 3, 32, 32], dtype="float32")
+            input = fluid.data(name="input",
+                               shape=[2, 3, 32, 32],
+                               dtype="float32")
             result = max_pool2d(input, kernel_size=2, stride=2, padding=0)
 
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='max')
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='max')
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -134,19 +143,22 @@ def check_max_dygraph_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = max_pool2d(
-                input, kernel_size=2, stride=2, padding=0, return_mask=False)
-
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='max')
+            result = max_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=0,
+                                return_mask=False)
+
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='max')
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool2d_dg = paddle.nn.layer.MaxPool2D(
-                kernel_size=2, stride=2, padding=0)
+            max_pool2d_dg = paddle.nn.layer.MaxPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = max_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -155,42 +167,44 @@ def check_max_dygraph_nhwc_results(self, place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(
                 np.transpose(input_np, [0, 2, 3, 1]))
-            result = max_pool2d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=0,
-                return_mask=False,
-                data_format="NHWC")
-
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='max')
+            result = max_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=0,
+                                return_mask=False,
+                                data_format="NHWC")
+
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='max')
             self.assertTrue(
-                np.allclose(
-                    np.transpose(result.numpy(), [0, 3, 1, 2]), result_np))
+                np.allclose(np.transpose(result.numpy(), [0, 3, 1, 2]),
+                            result_np))
 
     def check_max_dygraph_padding_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = max_pool2d(
-                input, kernel_size=2, stride=2, padding=1, ceil_mode=False)
-
-            result_np = max_pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[1, 1],
-                ceil_mode=False,
-                exclusive=False)
+            result = max_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=1,
+                                ceil_mode=False)
+
+            result_np = max_pool2D_forward_naive(input_np,
+                                                 ksize=[2, 2],
+                                                 strides=[2, 2],
+                                                 paddings=[1, 1],
+                                                 ceil_mode=False,
+                                                 exclusive=False)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool2d_dg = paddle.nn.layer.MaxPool2D(
-                kernel_size=2, stride=2, padding=1, ceil_mode=False)
+            max_pool2d_dg = paddle.nn.layer.MaxPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=1,
+                                                      ceil_mode=False)
             result = max_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -198,19 +212,23 @@ def check_max_dygraph_ceilmode_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = max_pool2d(
-                input, kernel_size=2, stride=2, padding=0, ceil_mode=True)
-
-            result_np = max_pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                ceil_mode=True)
+            result = max_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=0,
+                                ceil_mode=True)
+
+            result_np = max_pool2D_forward_naive(input_np,
+                                                 ksize=[2, 2],
+                                                 strides=[2, 2],
+                                                 paddings=[0, 0],
+                                                 ceil_mode=True)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool2d_dg = paddle.nn.layer.MaxPool2D(
-                kernel_size=2, stride=2, padding=0, ceil_mode=True)
+            max_pool2d_dg = paddle.nn.layer.MaxPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0,
+                                                      ceil_mode=True)
             result = max_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -218,24 +236,23 @@ def check_max_dygraph_stride_is_none(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result, indices = max_pool2d(
-                input,
-                kernel_size=2,
-                stride=None,
-                padding="SAME",
-                return_mask=True)
-
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='max',
-                padding_algorithm="SAME")
+            result, indices = max_pool2d(input,
+                                         kernel_size=2,
+                                         stride=None,
+                                         padding="SAME",
+                                         return_mask=True)
+
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='max',
+                                             padding_algorithm="SAME")
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool2d_dg = paddle.nn.layer.MaxPool2D(
-                kernel_size=2, stride=2, padding=0)
+            max_pool2d_dg = paddle.nn.layer.MaxPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = max_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -243,20 +260,22 @@ def check_avg_dygraph_stride_is_none(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = avg_pool2d(
-                input, kernel_size=2, stride=None, padding="SAME")
-
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='avg',
-                padding_algorithm="SAME")
+            result = avg_pool2d(input,
+                                kernel_size=2,
+                                stride=None,
+                                padding="SAME")
+
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='avg',
+                                             padding_algorithm="SAME")
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(
-                kernel_size=2, stride=2, padding=0)
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = avg_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -265,23 +284,22 @@ def check_max_dygraph_padding(self, place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
             padding = [[0, 0], [0, 0], [0, 0], [0, 0]]
-            result = max_pool2d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=padding,
-                return_mask=False)
-
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='max')
+            result = max_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=padding,
+                                return_mask=False)
+
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='max')
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool2d_dg = paddle.nn.layer.MaxPool2D(
-                kernel_size=2, stride=2, padding=0)
+            max_pool2d_dg = paddle.nn.layer.MaxPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = max_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -290,23 +308,22 @@ def check_avg_divisor(self, place):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
             padding = [[0, 0], [0, 0], [0, 0], [0, 0]]
-            result = avg_pool2d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=padding,
-                divisor_override=4)
-
-            result_np = pool2D_forward_naive(
-                input_np,
-                ksize=[2, 2],
-                strides=[2, 2],
-                paddings=[0, 0],
-                pool_type='avg')
+            result = avg_pool2d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=padding,
+                                divisor_override=4)
+
+            result_np = pool2D_forward_naive(input_np,
+                                             ksize=[2, 2],
+                                             strides=[2, 2],
+                                             paddings=[0, 0],
+                                             pool_type='avg')
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(
-                kernel_size=2, stride=2, padding=0)
+            avg_pool2d_dg = paddle.nn.layer.AvgPool2D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = avg_pool2d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -331,15 +348,19 @@ def test_dygraph_final_state_api(self):
 
 
 class TestPool2DError_API(unittest.TestCase):
+
     def test_error_api(self):
+
         def run1():
             with fluid.dygraph.guard():
                 input_np = np.random.uniform(-1, 1,
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = [[0, 1], [0, 0], [0, 0], [0, 0]]
-                res_pd = max_pool2d(
-                    input_pd, kernel_size=2, stride=2, padding=padding)
+                res_pd = max_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding)
 
         self.assertRaises(ValueError, run1)
 
@@ -349,12 +370,11 @@ def run2():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = [[0, 1], [0, 0], [0, 0], [0, 0]]
-                res_pd = max_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    data_format='NHWC')
+                res_pd = max_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run2)
 
@@ -364,12 +384,11 @@ def run3():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "padding"
-                res_pd = max_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    data_format='NHWC')
+                res_pd = max_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run3)
 
@@ -379,12 +398,11 @@ def run3_avg():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "padding"
-                res_pd = avg_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    data_format='NHWC')
+                res_pd = avg_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run3_avg)
 
@@ -394,13 +412,12 @@ def run4():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = max_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True,
-                    data_format='NHWC')
+                res_pd = max_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    ceil_mode=True,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run4)
 
@@ -410,13 +427,12 @@ def run4_avg():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = avg_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True,
-                    data_format='NHWC')
+                res_pd = avg_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    ceil_mode=True,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run4_avg)
 
@@ -426,12 +442,11 @@ def run5():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "padding"
-                res_pd = avg_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    data_format='NHWC')
+                res_pd = avg_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run5)
 
@@ -441,13 +456,12 @@ def run6():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = avg_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=True,
-                    data_format='NHWC')
+                res_pd = avg_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    ceil_mode=True,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run6)
 
@@ -457,13 +471,12 @@ def run7():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = avg_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=False,
-                    data_format='NNNN')
+                res_pd = avg_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    ceil_mode=False,
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run7)
 
@@ -473,13 +486,12 @@ def run8():
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = "VALID"
-                res_pd = max_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    ceil_mode=False,
-                    data_format='NNNN')
+                res_pd = max_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    ceil_mode=False,
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run8)
 
@@ -488,14 +500,13 @@ def run9():
                 input_np = np.random.uniform(-1, 1,
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = max_pool2d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=0,
-                    ceil_mode=False,
-                    data_format='NHWC',
-                    return_mask=True)
+                res_pd = max_pool2d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=0,
+                                    ceil_mode=False,
+                                    data_format='NHWC',
+                                    return_mask=True)
 
         self.assertRaises(ValueError, run9)
 
@@ -504,13 +515,12 @@ def run_kernel_out_of_range():
                 input_np = np.random.uniform(-1, 1,
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = avg_pool2d(
-                    input_pd,
-                    kernel_size=[-1, 2],
-                    stride=2,
-                    padding=0,
-                    ceil_mode=False,
-                    data_format='NHWC')
+                res_pd = avg_pool2d(input_pd,
+                                    kernel_size=[-1, 2],
+                                    stride=2,
+                                    padding=0,
+                                    ceil_mode=False,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run_kernel_out_of_range)
 
@@ -519,13 +529,12 @@ def run_stride_out_of_range():
                 input_np = np.random.uniform(-1, 1,
                                              [2, 3, 32, 32]).astype(np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = avg_pool2d(
-                    input_pd,
-                    kernel_size=3,
-                    stride=[0, 2],
-                    padding=0,
-                    ceil_mode=False,
-                    data_format='NHWC')
+                res_pd = avg_pool2d(input_pd,
+                                    kernel_size=3,
+                                    stride=[0, 2],
+                                    padding=0,
+                                    ceil_mode=False,
+                                    data_format='NHWC')
 
         self.assertRaises(ValueError, run_stride_out_of_range)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 582ec9501068c..81fa00986d7da 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -49,11 +49,11 @@ def max_pool2D_forward_naive(x,
     if adaptive:
         H_out, W_out = ksize
     else:
-        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
+        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] -
+                 1) // strides[1] + 1 if ceil_mode else (
                      W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
@@ -91,11 +91,11 @@ def avg_pool2D_forward_naive(x,
     if adaptive:
         H_out, W_out = ksize
     else:
-        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
+        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] -
+                 1) // strides[1] + 1 if ceil_mode else (
                      W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
@@ -122,9 +122,9 @@ def avg_pool2D_forward_naive(x,
                 field_size = (r_end - r_start) * (c_end - c_start)
 
             if data_type == np.int8 or data_type == np.uint8:
-                out[:, :, i, j] = (np.rint(
-                    np.sum(x_masked, axis=(2, 3)) /
-                    field_size)).astype(data_type)
+                out[:, :, i,
+                    j] = (np.rint(np.sum(x_masked, axis=(2, 3)) /
+                                  field_size)).astype(data_type)
             else:
                 out[:, :, i, j] = (np.sum(x_masked, axis=(2, 3)) /
                                    field_size).astype(data_type)
@@ -149,8 +149,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -233,8 +233,9 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                 x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
                 if pool_type == 'avg':
                     if (exclusive or adaptive):
-                        field_size = (in_h_end - in_h_start) * (
-                            in_w_end - in_w_start)
+                        field_size = (in_h_end - in_h_start) * (in_w_end -
+                                                                in_w_start)
+
 
 #                         if (exclusive or adaptive) else (ksize[0] * ksize[1])
                     out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
@@ -244,8 +245,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                 x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
                 if pool_type == 'avg':
                     if (exclusive or adaptive):
-                        field_size = (in_h_end - in_h_start) * (
-                            in_w_end - in_w_start)
+                        field_size = (in_h_end - in_h_start) * (in_w_end -
+                                                                in_w_start)
                     out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
                 elif pool_type == 'max':
                     out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
@@ -253,6 +254,7 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 
 class TestPool2D_Op_Mixin(object):
+
     def setUp(self):
         self.op_type = "pool2d"
         self.use_cudnn = False
@@ -272,10 +274,12 @@ def setUp(self):
         self.init_shape()
 
         input = np.random.random(self.shape).astype(self.dtype)
-        output = pool2D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive, self.data_format,
-            self.pool_type, self.padding_algorithm).astype(self.dtype)
+        output = pool2D_forward_naive(input, self.ksize, self.strides,
+                                      self.paddings, self.global_pool,
+                                      self.ceil_mode, self.exclusive,
+                                      self.adaptive, self.data_format,
+                                      self.pool_type,
+                                      self.padding_algorithm).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -313,18 +317,16 @@ def test_check_grad(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.has_cudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['X']),
-                'Out',
-                max_relative_error=0.07,
-                check_dygraph=(self.use_mkldnn == False))
+            self.check_grad_with_place(place,
+                                       set(['X']),
+                                       'Out',
+                                       max_relative_error=0.07,
+                                       check_dygraph=(self.use_mkldnn == False))
         elif self.pool_type != "max":
-            self.check_grad(
-                set(['X']),
-                'Out',
-                max_relative_error=0.07,
-                check_dygraph=(self.use_mkldnn == False))
+            self.check_grad(set(['X']),
+                            'Out',
+                            max_relative_error=0.07,
+                            check_dygraph=(self.use_mkldnn == False))
 
     def init_data_format(self):
         self.data_format = "NCHW"
@@ -368,6 +370,7 @@ class TestPool2D_Op(TestPool2D_Op_Mixin, OpTest):
 
 
 class TestCase1(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -387,6 +390,7 @@ def init_shape(self):
 
 
 class TestCase2(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -406,18 +410,21 @@ def init_shape(self):
 
 
 class TestCase3(TestPool2D_Op):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase4(TestCase1):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase5(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
@@ -427,9 +434,11 @@ def init_pool_type(self):
 
 
 def create_test_cudnn_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -449,9 +458,11 @@ def init_kernel_type(self):
 
 
 def create_test_cudnn_fp16_class(parent, check_grad=True):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNFp16Case(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float16
@@ -484,9 +495,11 @@ def test_check_grad(self):
 
 
 def create_test_fp16_class(parent, check_grad=True):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestFp16Case(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = False
             self.dtype = np.float16
@@ -536,9 +549,11 @@ def test_check_grad(self):
 
 
 def create_test_cudnn_use_ceil_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestPool2DUseCeilCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -555,7 +570,9 @@ def init_ceil_mode(self):
 
 
 def create_test_use_ceil_class(parent):
+
     class TestPool2DUseCeilCase(parent):
+
         def init_ceil_mode(self):
             self.ceil_mode = True
 
@@ -569,11 +586,13 @@ def init_ceil_mode(self):
 
 
 class TestAvgInclude(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
 
 class TestCUDNNAvgInclude(TestCase2):
+
     def init_kernel_type(self):
         self.use_cudnn = True
 
@@ -582,11 +601,13 @@ def init_exclusive(self):
 
 
 class TestAvgPoolAdaptive(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
 
 class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -603,6 +624,7 @@ def init_test_case(self):
 
 
 class TestPool2D_AsyPadding(TestPool2D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -613,6 +635,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding(TestCase1):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -623,6 +646,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding(TestCase2):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -633,6 +657,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding(TestCase3):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -643,6 +668,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding(TestCase4):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -653,6 +679,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding((TestCase5)):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -684,6 +711,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
@@ -697,6 +725,7 @@ def init_shape(self):
 
 
 class TestCUDNNAvgInclude_AsyPadding(TestCase2):
+
     def init_kernel_type(self):
         self.use_cudnn = True
 
@@ -713,6 +742,7 @@ def init_shape(self):
 
 
 class TestAvgPoolAdaptive_AsyPadding(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -727,6 +757,7 @@ def init_shape(self):
 
 #----------- test channel_last --------------
 class TestPool2D_channel_last(TestPool2D_Op):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -735,6 +766,7 @@ def init_shape(self):
 
 
 class TestCase1_channel_last(TestCase1):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -743,6 +775,7 @@ def init_shape(self):
 
 
 class TestCase2_channel_last(TestCase2):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -751,6 +784,7 @@ def init_shape(self):
 
 
 class TestCase3_channel_last(TestCase3):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -759,6 +793,7 @@ def init_shape(self):
 
 
 class TestCase4_channel_last(TestCase4):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -767,6 +802,7 @@ def init_shape(self):
 
 
 class TestCase5_channel_last(TestCase5):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -796,6 +832,7 @@ def init_shape(self):
 
 
 class TestCase5_Max(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
@@ -804,13 +841,16 @@ def test_check_grad(self):
             return
         if self.has_cudnn() and self.pool_type == "max":
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, set(['X']), 'Out', max_relative_error=1.00)
+            self.check_grad_with_place(place,
+                                       set(['X']),
+                                       'Out',
+                                       max_relative_error=1.00)
         elif self.pool_type == "max":
             self.check_grad(set(['X']), 'Out', max_relative_error=1.00)
 
 
 class TestCase5_channel_last_Max(TestCase5_Max):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -823,11 +863,13 @@ def init_shape(self):
 
 
 class TestAvgInclude_channel_last(TestCase2_channel_last):
+
     def init_exclusive(self):
         self.exclusive = False
 
 
 class TestCUDNNAvgInclude_channel_last(TestCase2_channel_last):
+
     def init_kernel_type(self):
         self.use_cudnn = True
 
@@ -836,11 +878,13 @@ def init_exclusive(self):
 
 
 class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
+
     def init_adaptive(self):
         self.adaptive = True
 
 
 class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -849,6 +893,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -857,6 +902,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -865,6 +911,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -873,6 +920,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -881,6 +929,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -896,8 +945,8 @@ def init_shape(self):
 create_test_cudnn_class(TestCase5_AsyPadding_channel_last)
 
 create_test_cudnn_fp16_class(TestPool2D_AsyPadding_channel_last)
-create_test_cudnn_fp16_class(
-    TestCase1_AsyPadding_channel_last, check_grad=False)
+create_test_cudnn_fp16_class(TestCase1_AsyPadding_channel_last,
+                             check_grad=False)
 create_test_cudnn_fp16_class(TestCase2_AsyPadding_channel_last)
 create_test_cudnn_fp16_class(TestCase3_AsyPadding_channel_last)
 create_test_cudnn_fp16_class(TestCase4_AsyPadding_channel_last)
@@ -911,6 +960,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -918,8 +968,9 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-class TestCUDNNAvgInclude_AsyPadding_channel_last(
-        TestCUDNNAvgInclude_AsyPadding):
+class TestCUDNNAvgInclude_AsyPadding_channel_last(TestCUDNNAvgInclude_AsyPadding
+                                                  ):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -927,8 +978,9 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-class TestAvgPoolAdaptive_AsyPadding_channel_last(
-        TestAvgPoolAdaptive_AsyPadding):
+class TestAvgPoolAdaptive_AsyPadding_channel_last(TestAvgPoolAdaptive_AsyPadding
+                                                  ):
+
     def init_data_format(self):
         self.data_format = "NHWC"
 
@@ -940,7 +992,9 @@ def init_shape(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.paddings = [0, 0]
             self.padding_algorithm = "SAME"
@@ -966,9 +1020,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_SAME_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingSMAECase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -997,7 +1053,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.paddings = [1, 1]
             self.padding_algorithm = "VALID"
@@ -1023,9 +1081,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_VALID_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingVALIDCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -1054,6 +1114,7 @@ def init_paddings(self):
 
 
 class TestCase1_strides(TestCase1):
+
     def init_test_case(self):
         self.ksize = [3, 3]
         self.strides = [1, 2]
@@ -1069,117 +1130,106 @@ def init_shape(self):
 
 # ----- test API
 class TestPool2DAPI(unittest.TestCase):
+
     def test_api(self):
         x_NHWC = np.random.random([2, 5, 5, 3]).astype("float32")
         x_NCHW = np.random.random([2, 3, 5, 5]).astype("float32")
 
-        input_NHWC = fluid.layers.data(
-            name="input_NHWC",
-            shape=[2, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NCHW = fluid.layers.data(
-            name="input_NCHW",
-            shape=[2, 3, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NHWC_negetive = fluid.layers.data(
-            name="input_NHWC_negetive",
-            shape=[2, -1, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
-
-        input_NCHW_negetive = fluid.layers.data(
-            name="input_NCHW_negetive",
-            shape=[2, 3, -1, -1],
-            append_batch_size=False,
-            dtype="float32")
+        input_NHWC = fluid.layers.data(name="input_NHWC",
+                                       shape=[2, 5, 5, 3],
+                                       append_batch_size=False,
+                                       dtype="float32")
+
+        input_NCHW = fluid.layers.data(name="input_NCHW",
+                                       shape=[2, 3, 5, 5],
+                                       append_batch_size=False,
+                                       dtype="float32")
+
+        input_NHWC_negetive = fluid.layers.data(name="input_NHWC_negetive",
+                                                shape=[2, -1, 5, 3],
+                                                append_batch_size=False,
+                                                dtype="float32")
+
+        input_NCHW_negetive = fluid.layers.data(name="input_NCHW_negetive",
+                                                shape=[2, 3, -1, -1],
+                                                append_batch_size=False,
+                                                dtype="float32")
 
         ksize = [3, 3]
-        out_1 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=ksize,
-            pool_type="max",
-            pool_padding=[1, 1],
-            use_cudnn=False,
-            data_format="NHWC")
-
-        out_2 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
-            use_cudnn=False,
-            data_format="NHWC")
-
-        out_3 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[[0, 0], [0, 0], [1, 1], [1, 1]],
-            use_cudnn=False,
-            data_format="NCHW")
-
-        out_4 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[1, 2, 1, 0],
-            use_cudnn=False,
-            data_format="NCHW")
+        out_1 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=ksize,
+                                    pool_type="max",
+                                    pool_padding=[1, 1],
+                                    use_cudnn=False,
+                                    data_format="NHWC")
+
+        out_2 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[[0, 0], [1, 1], [1, 1],
+                                                  [0, 0]],
+                                    use_cudnn=False,
+                                    data_format="NHWC")
+
+        out_3 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[[0, 0], [0, 0], [1, 1],
+                                                  [1, 1]],
+                                    use_cudnn=False,
+                                    data_format="NCHW")
+
+        out_4 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[1, 2, 1, 0],
+                                    use_cudnn=False,
+                                    data_format="NCHW")
         # test VALID
-        out_5 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding="VALID",
-            use_cudnn=False,
-            data_format="NCHW")
-
-        out_6 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=ksize,
-            pool_type="max",
-            pool_padding="VALID",
-            use_cudnn=False,
-            data_format="NHWC")
+        out_5 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding="VALID",
+                                    use_cudnn=False,
+                                    data_format="NCHW")
+
+        out_6 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=ksize,
+                                    pool_type="max",
+                                    pool_padding="VALID",
+                                    use_cudnn=False,
+                                    data_format="NHWC")
 
         # test SAME
-        out_7 = fluid.layers.pool2d(
-            input=input_NCHW,
-            pool_size=[4, 4],
-            pool_type="avg",
-            pool_padding="SAME",
-            use_cudnn=False,
-            data_format="NCHW")
-
-        out_8 = fluid.layers.pool2d(
-            input=input_NHWC,
-            pool_size=[4, 4],
-            pool_type="max",
-            pool_padding="SAME",
-            use_cudnn=False,
-            data_format="NHWC")
+        out_7 = fluid.layers.pool2d(input=input_NCHW,
+                                    pool_size=[4, 4],
+                                    pool_type="avg",
+                                    pool_padding="SAME",
+                                    use_cudnn=False,
+                                    data_format="NCHW")
+
+        out_8 = fluid.layers.pool2d(input=input_NHWC,
+                                    pool_size=[4, 4],
+                                    pool_type="max",
+                                    pool_padding="SAME",
+                                    use_cudnn=False,
+                                    data_format="NHWC")
 
         # test negetive
-        out_9 = fluid.layers.pool2d(
-            input=input_NHWC_negetive,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[0, 0],
-            use_cudnn=False,
-            data_format="NHWC")
+        out_9 = fluid.layers.pool2d(input=input_NHWC_negetive,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[0, 0],
+                                    use_cudnn=False,
+                                    data_format="NHWC")
         assert out_9.shape == (2, -1, 3, 3)
 
-        out_10 = fluid.layers.pool2d(
-            input=input_NCHW_negetive,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[0, 0],
-            use_cudnn=False,
-            data_format="NCHW")
+        out_10 = fluid.layers.pool2d(input=input_NCHW_negetive,
+                                     pool_size=ksize,
+                                     pool_type="avg",
+                                     pool_padding=[0, 0],
+                                     use_cudnn=False,
+                                     data_format="NCHW")
         assert out_10.shape == (2, 3, -1, -1)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
@@ -1191,52 +1241,44 @@ def test_api(self):
                 "input_NHWC_negetive": x_NHWC,
                 "input_NCHW_negetive": x_NCHW
             },
-            fetch_list=[
-                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
-            ])
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8])
 
         assert np.allclose(
             res_1,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=ksize,
-                pool_type="max",
-                strides=[1, 1],
-                paddings=[1, 1],
-                data_format="NHWC"))
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=ksize,
+                                 pool_type="max",
+                                 strides=[1, 1],
+                                 paddings=[1, 1],
+                                 data_format="NHWC"))
 
         assert np.allclose(
             res_2,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[1, 1, 1, 1],
-                data_format="NHWC"))
-        assert np.allclose(
-            res_3,
-            pool2D_forward_naive(
-                x=x_NCHW,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[1, 1, 1, 1],
-                data_format="NCHW"),
-            rtol=0.07,
-            atol=1e-05)
-
-        assert np.allclose(
-            res_4,
-            pool2D_forward_naive(
-                x=x_NCHW,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[1, 2, 1, 0],
-                data_format="NCHW"),
-            rtol=0.07,
-            atol=1e-05)
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=ksize,
+                                 pool_type="avg",
+                                 strides=[1, 1],
+                                 paddings=[1, 1, 1, 1],
+                                 data_format="NHWC"))
+        assert np.allclose(res_3,
+                           pool2D_forward_naive(x=x_NCHW,
+                                                ksize=ksize,
+                                                pool_type="avg",
+                                                strides=[1, 1],
+                                                paddings=[1, 1, 1, 1],
+                                                data_format="NCHW"),
+                           rtol=0.07,
+                           atol=1e-05)
+
+        assert np.allclose(res_4,
+                           pool2D_forward_naive(x=x_NCHW,
+                                                ksize=ksize,
+                                                pool_type="avg",
+                                                strides=[1, 1],
+                                                paddings=[1, 2, 1, 0],
+                                                data_format="NCHW"),
+                           rtol=0.07,
+                           atol=1e-05)
 
         # VALID
         assert np.allclose(
@@ -1253,200 +1295,187 @@ def test_api(self):
             atol=1e-05)
         assert np.allclose(
             res_6,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=ksize,
-                pool_type="max",
-                strides=[1, 1],
-                paddings=[10, 20],
-                padding_algorithm="VALID",
-                data_format="NHWC"))
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=ksize,
+                                 pool_type="max",
+                                 strides=[1, 1],
+                                 paddings=[10, 20],
+                                 padding_algorithm="VALID",
+                                 data_format="NHWC"))
         # SAME
-        assert np.allclose(
-            res_7,
-            pool2D_forward_naive(
-                x=x_NCHW,
-                ksize=[4, 4],
-                pool_type="avg",
-                strides=[1, 1],
-                paddings=[10, 20],
-                padding_algorithm="SAME",
-                data_format="NCHW"),
-            rtol=0.07,
-            atol=1e-05)
+        assert np.allclose(res_7,
+                           pool2D_forward_naive(x=x_NCHW,
+                                                ksize=[4, 4],
+                                                pool_type="avg",
+                                                strides=[1, 1],
+                                                paddings=[10, 20],
+                                                padding_algorithm="SAME",
+                                                data_format="NCHW"),
+                           rtol=0.07,
+                           atol=1e-05)
 
         assert np.allclose(
             res_8,
-            pool2D_forward_naive(
-                x=x_NHWC,
-                ksize=[4, 4],
-                pool_type="max",
-                strides=[1, 1],
-                paddings=[10, 20],
-                padding_algorithm="SAME",
-                data_format="NHWC"))
+            pool2D_forward_naive(x=x_NHWC,
+                                 ksize=[4, 4],
+                                 pool_type="max",
+                                 strides=[1, 1],
+                                 paddings=[10, 20],
+                                 padding_algorithm="SAME",
+                                 data_format="NHWC"))
 
 
 class TestPool2DAPI_Error(unittest.TestCase):
+
     def test_api(self):
-        input_NHWC = fluid.layers.data(
-            name="input_NHWC",
-            shape=[2, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
+        input_NHWC = fluid.layers.data(name="input_NHWC",
+                                       shape=[2, 5, 5, 3],
+                                       append_batch_size=False,
+                                       dtype="float32")
         ksize = [3, 3]
 
         # cudnn type error
         def run_1():
-            out_1 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[1, 1],
-                use_cudnn=[0],
-                data_format="NHWC")
+            out_1 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[1, 1],
+                                        use_cudnn=[0],
+                                        data_format="NHWC")
 
         self.assertRaises(TypeError, run_1)
 
         # data_format value error
         def run_2():
-            out_2 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[1, 1],
-                use_cudnn=False,
-                data_format="NHWCC")
+            out_2 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[1, 1],
+                                        use_cudnn=False,
+                                        data_format="NHWCC")
 
         self.assertRaises(ValueError, run_2)
 
         # padding str value error
         def run_3():
-            out_3 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding="VALIDSAME",
-                use_cudnn=False,
-                data_format="NHWC")
+            out_3 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding="VALIDSAME",
+                                        use_cudnn=False,
+                                        data_format="NHWC")
 
         self.assertRaises(ValueError, run_3)
 
         # padding str valid and ceil_mode value error
         def run_4():
-            out_4 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding="VALID",
-                use_cudnn=False,
-                ceil_mode=True,
-                data_format="NHWC")
+            out_4 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding="VALID",
+                                        use_cudnn=False,
+                                        ceil_mode=True,
+                                        data_format="NHWC")
 
         self.assertRaises(ValueError, run_4)
 
         # padding with 8 ele. value error
         def run_5():
-            out_5 = fluid.layers.pool2d(
-                input=input_NHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[[1, 1], [0, 0], [0, 0], [1, 1]],
-                use_cudnn=False,
-                data_format="NHWC")
+            out_5 = fluid.layers.pool2d(input=input_NHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[[1, 1], [0, 0], [0, 0],
+                                                      [1, 1]],
+                                        use_cudnn=False,
+                                        data_format="NHWC")
 
         self.assertRaises(ValueError, run_5)
 
 
 class TestDygraphPool2DAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of Pool2D must be Variable.
             data1 = np.random.random((3, 32, 32, 5)).astype('float32')
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                global_pooling=False)
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='max',
+                                          pool_stride=1,
+                                          global_pooling=False)
             self.assertRaises(TypeError, pool2d, data1)
 
             # the input dtype of Pool2D must be uint8 or int8 or float16 or float32 or float64
             # uint8 and int8 only can be set on mkldnn
             # float16 only can be set on GPU place
-            data2 = fluid.layers.data(
-                name='x1', shape=[3, 32, 32, 5], dtype="int32")
+            data2 = fluid.layers.data(name='x1',
+                                      shape=[3, 32, 32, 5],
+                                      dtype="int32")
             self.assertRaises(TypeError, pool2d, data2)
 
     def test_data_format_error(self):
         with program_guard(Program(), Program()):
             # the data_format must be 'NCHW' or 'NHWC'
             data1 = np.random.random((3, 32, 32, 5)).astype('float32')
-            self.assertRaises(
-                ValueError,
-                fluid.dygraph.Pool2D,
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                global_pooling=False,
-                data_format='NWHC')
+            self.assertRaises(ValueError,
+                              fluid.dygraph.Pool2D,
+                              pool_size=2,
+                              pool_type='max',
+                              pool_stride=1,
+                              global_pooling=False,
+                              data_format='NWHC')
 
 
 class TestDygraphPool2DAPI(unittest.TestCase):
+
     def test_nhwc(self):
         with fluid.dygraph.guard():
             data = np.random.random((3, 32, 32, 5)).astype('float32')
             x = fluid.dygraph.to_variable(data)
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                pool_padding=[0, 0],
-                global_pooling=False,
-                data_format='NHWC')
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='max',
+                                          pool_stride=1,
+                                          pool_padding=[0, 0],
+                                          global_pooling=False,
+                                          data_format='NHWC')
             out1 = pool2d(x)
-            out2 = pool2D_forward_naive(
-                data, [2, 2], [1, 1],
-                paddings=[0, 0],
-                pool_type='max',
-                data_format='NHWC')
+            out2 = pool2D_forward_naive(data, [2, 2], [1, 1],
+                                        paddings=[0, 0],
+                                        pool_type='max',
+                                        data_format='NHWC')
             self.assertTrue(np.allclose(out1.numpy(), out2))
 
     def test_lower_case(self):
         with fluid.dygraph.guard():
             data = np.random.random((3, 32, 32, 5)).astype('float32')
             x = fluid.dygraph.to_variable(data)
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='max',
-                pool_stride=1,
-                pool_padding=[0, 0],
-                global_pooling=False,
-                data_format='nhwc')
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='max',
+                                          pool_stride=1,
+                                          pool_padding=[0, 0],
+                                          global_pooling=False,
+                                          data_format='nhwc')
             out1 = pool2d(x)
-            out2 = pool2D_forward_naive(
-                data, [2, 2], [1, 1],
-                paddings=[0, 0],
-                pool_type='max',
-                data_format='NHWC')
+            out2 = pool2D_forward_naive(data, [2, 2], [1, 1],
+                                        paddings=[0, 0],
+                                        pool_type='max',
+                                        data_format='NHWC')
             self.assertTrue(np.allclose(out1.numpy(), out2))
 
     def test_upper_case(self):
         with fluid.dygraph.guard():
             data = np.random.random((3, 32, 32, 5)).astype('float32')
             x = fluid.dygraph.to_variable(data)
-            pool2d = fluid.dygraph.Pool2D(
-                pool_size=2,
-                pool_type='MAX',
-                pool_stride=1,
-                pool_padding=[0, 0],
-                global_pooling=False,
-                data_format='nhwc')
+            pool2d = fluid.dygraph.Pool2D(pool_size=2,
+                                          pool_type='MAX',
+                                          pool_stride=1,
+                                          pool_padding=[0, 0],
+                                          global_pooling=False,
+                                          data_format='nhwc')
             out1 = pool2d(x)
-            out2 = pool2D_forward_naive(
-                data, [2, 2], [1, 1],
-                paddings=[0, 0],
-                pool_type='max',
-                data_format='NHWC')
+            out2 = pool2D_forward_naive(data, [2, 2], [1, 1],
+                                        paddings=[0, 0],
+                                        pool_type='max',
+                                        data_format='NHWC')
             self.assertTrue(np.allclose(out1.numpy(), out2))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
index f20d2aad49f27..3ecfb06bb583e 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
@@ -28,6 +28,7 @@
 
 
 class TestPool3D_API(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -36,17 +37,17 @@ def setUp(self):
 
     def check_avg_static_results(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(
-                name="input", shape=[2, 3, 32, 32, 32], dtype="float32")
+            input = fluid.data(name="input",
+                               shape=[2, 3, 32, 32, 32],
+                               dtype="float32")
             result = avg_pool3d(input, kernel_size=2, stride=2, padding=0)
 
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='avg')
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='avg')
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -60,18 +61,18 @@ def check_avg_dygraph_results(self, place):
             input = fluid.dygraph.to_variable(input_np)
             result = avg_pool3d(input, kernel_size=2, stride=2, padding="SAME")
 
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='avg',
-                padding_algorithm="SAME")
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='avg',
+                                             padding_algorithm="SAME")
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(
-                kernel_size=2, stride=None, padding="SAME")
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(kernel_size=2,
+                                                      stride=None,
+                                                      padding="SAME")
             result = avg_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -79,30 +80,27 @@ def check_avg_dygraph_padding_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = avg_pool3d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=1,
-                ceil_mode=False,
-                exclusive=True)
-
-            result_np = avg_pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[1, 1, 1],
-                ceil_mode=False,
-                exclusive=False)
+            result = avg_pool3d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=1,
+                                ceil_mode=False,
+                                exclusive=True)
+
+            result_np = avg_pool3D_forward_naive(input_np,
+                                                 ksize=[2, 2, 2],
+                                                 strides=[2, 2, 2],
+                                                 paddings=[1, 1, 1],
+                                                 ceil_mode=False,
+                                                 exclusive=False)
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(
-                kernel_size=2,
-                stride=None,
-                padding=1,
-                ceil_mode=False,
-                exclusive=True)
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=1,
+                                                      ceil_mode=False,
+                                                      exclusive=True)
             result = avg_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -110,36 +108,40 @@ def check_avg_dygraph_ceilmode_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = avg_pool3d(
-                input, kernel_size=2, stride=2, padding=0, ceil_mode=True)
-
-            result_np = avg_pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                ceil_mode=True)
+            result = avg_pool3d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=0,
+                                ceil_mode=True)
+
+            result_np = avg_pool3D_forward_naive(input_np,
+                                                 ksize=[2, 2, 2],
+                                                 strides=[2, 2, 2],
+                                                 paddings=[0, 0, 0],
+                                                 ceil_mode=True)
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(
-                kernel_size=2, stride=None, padding=0, ceil_mode=True)
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=0,
+                                                      ceil_mode=True)
             result = avg_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
     def check_max_static_results(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(
-                name="input", shape=[2, 3, 32, 32, 32], dtype="float32")
+            input = fluid.data(name="input",
+                               shape=[2, 3, 32, 32, 32],
+                               dtype="float32")
             result = max_pool3d(input, kernel_size=2, stride=2, padding=0)
 
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='max')
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='max')
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -153,16 +155,16 @@ def check_max_dygraph_results(self, place):
             input = fluid.dygraph.to_variable(input_np)
             result = max_pool3d(input, kernel_size=2, stride=2, padding=0)
 
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='max')
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='max')
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
-            max_pool3d_dg = paddle.nn.layer.MaxPool3D(
-                kernel_size=2, stride=None, padding=0)
+            max_pool3d_dg = paddle.nn.layer.MaxPool3D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=0)
             result = max_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -171,43 +173,45 @@ def check_max_dygraph_ndhwc_results(self, place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(
                 np.transpose(input_np, [0, 2, 3, 4, 1]))
-            result = max_pool3d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=0,
-                data_format="NDHWC",
-                return_mask=False)
-
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='max')
+            result = max_pool3d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=0,
+                                data_format="NDHWC",
+                                return_mask=False)
+
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='max')
 
             self.assertTrue(
-                np.allclose(
-                    np.transpose(result.numpy(), [0, 4, 1, 2, 3]), result_np))
+                np.allclose(np.transpose(result.numpy(), [0, 4, 1, 2, 3]),
+                            result_np))
 
     def check_max_dygraph_ceilmode_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = max_pool3d(
-                input, kernel_size=2, stride=2, padding=0, ceil_mode=True)
-
-            result_np = max_pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                ceil_mode=True)
+            result = max_pool3d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=0,
+                                ceil_mode=True)
+
+            result_np = max_pool3D_forward_naive(input_np,
+                                                 ksize=[2, 2, 2],
+                                                 strides=[2, 2, 2],
+                                                 paddings=[0, 0, 0],
+                                                 ceil_mode=True)
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool3d_dg = paddle.nn.layer.MaxPool3D(
-                kernel_size=2, stride=None, padding=0, ceil_mode=True)
+            max_pool3d_dg = paddle.nn.layer.MaxPool3D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=0,
+                                                      ceil_mode=True)
             result = max_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -215,20 +219,24 @@ def check_max_dygraph_padding_results(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result = max_pool3d(
-                input, kernel_size=2, stride=2, padding=1, ceil_mode=False)
-
-            result_np = max_pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[1, 1, 1],
-                ceil_mode=False)
+            result = max_pool3d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=1,
+                                ceil_mode=False)
+
+            result_np = max_pool3D_forward_naive(input_np,
+                                                 ksize=[2, 2, 2],
+                                                 strides=[2, 2, 2],
+                                                 paddings=[1, 1, 1],
+                                                 ceil_mode=False)
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            max_pool3d_dg = paddle.nn.layer.MaxPool3D(
-                kernel_size=2, stride=None, padding=1, ceil_mode=False)
+            max_pool3d_dg = paddle.nn.layer.MaxPool3D(kernel_size=2,
+                                                      stride=None,
+                                                      padding=1,
+                                                      ceil_mode=False)
             result = max_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -236,24 +244,23 @@ def check_max_dygraph_stride_is_none(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
-            result, indices = max_pool3d(
-                input,
-                kernel_size=2,
-                stride=None,
-                padding="SAME",
-                return_mask=True)
-
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='max',
-                padding_algorithm="SAME")
+            result, indices = max_pool3d(input,
+                                         kernel_size=2,
+                                         stride=None,
+                                         padding="SAME",
+                                         return_mask=True)
+
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='max',
+                                             padding_algorithm="SAME")
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
-            max_pool3d_dg = paddle.nn.layer.MaxPool3D(
-                kernel_size=2, stride=2, padding=0)
+            max_pool3d_dg = paddle.nn.layer.MaxPool3D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = max_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -264,16 +271,16 @@ def check_max_dygraph_padding(self, place):
             padding = [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]
             result = max_pool3d(input, kernel_size=2, stride=2, padding=padding)
 
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='max')
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='max')
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
-            max_pool3d_dg = paddle.nn.layer.MaxPool3D(
-                kernel_size=2, stride=2, padding=0)
+            max_pool3d_dg = paddle.nn.layer.MaxPool3D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = max_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -286,33 +293,31 @@ def check_avg_divisor(self, place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
             padding = 0
-            result = avg_pool3d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=padding,
-                divisor_override=8)
-
-            result_np = pool3D_forward_naive(
-                input_np,
-                ksize=[2, 2, 2],
-                strides=[2, 2, 2],
-                paddings=[0, 0, 0],
-                pool_type='avg')
+            result = avg_pool3d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=padding,
+                                divisor_override=8)
+
+            result_np = pool3D_forward_naive(input_np,
+                                             ksize=[2, 2, 2],
+                                             strides=[2, 2, 2],
+                                             paddings=[0, 0, 0],
+                                             pool_type='avg')
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
-            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(
-                kernel_size=2, stride=2, padding=0)
+            avg_pool3d_dg = paddle.nn.layer.AvgPool3D(kernel_size=2,
+                                                      stride=2,
+                                                      padding=0)
             result = avg_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
             padding = [0, 0, 0, 0, 0, 0]
-            result = avg_pool3d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=padding,
-                divisor_override=8)
+            result = avg_pool3d(input,
+                                kernel_size=2,
+                                stride=2,
+                                padding=padding,
+                                divisor_override=8)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
     def test_pool3d(self):
@@ -334,174 +339,167 @@ def test_dygraph_final_state_api(self):
 
 
 class TestPool3DError_API(unittest.TestCase):
+
     def test_error_api(self):
+
         def run1():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
-                res_pd = avg_pool3d(
-                    input_pd, kernel_size=2, stride=2, padding=padding)
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding)
 
         self.assertRaises(ValueError, run1)
 
         def run2():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
-                res_pd = avg_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    data_format='NCDHW')
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    data_format='NCDHW')
 
         self.assertRaises(ValueError, run2)
 
         def run3():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
                 padding = [[0, 1], [0, 0], [0, 0], [0, 0], [0, 0]]
-                res_pd = avg_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=padding,
-                    data_format='NDHWC')
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=padding,
+                                    data_format='NDHWC')
 
         self.assertRaises(ValueError, run3)
 
         def run4():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = avg_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=0,
-                    data_format='NNNN')
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=0,
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run4)
 
         def run5():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = max_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=0,
-                    data_format='NNNN')
+                res_pd = max_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=0,
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run5)
 
         def run6():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = avg_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding="padding",
-                    data_format='NNNN')
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding="padding",
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run6)
 
         def run7():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = max_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding="padding",
-                    data_format='NNNN')
+                res_pd = max_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding="padding",
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run7)
 
         def run8():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = avg_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding="VALID",
-                    ceil_mode=True,
-                    data_format='NNNN')
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding="VALID",
+                                    ceil_mode=True,
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run8)
 
         def run9():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = max_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding="VALID",
-                    ceil_mode=True,
-                    data_format='NNNN')
+                res_pd = max_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding="VALID",
+                                    ceil_mode=True,
+                                    data_format='NNNN')
 
         self.assertRaises(ValueError, run9)
 
         def run10():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = max_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=2,
-                    padding=0,
-                    data_format='NDHWC',
-                    return_mask=True)
+                res_pd = max_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=2,
+                                    padding=0,
+                                    data_format='NDHWC',
+                                    return_mask=True)
 
         self.assertRaises(ValueError, run10)
 
         def run_kernel_out_of_range():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = avg_pool3d(
-                    input_pd,
-                    kernel_size=-1,
-                    stride=2,
-                    padding="VALID",
-                    ceil_mode=True)
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=-1,
+                                    stride=2,
+                                    padding="VALID",
+                                    ceil_mode=True)
 
         self.assertRaises(ValueError, run_kernel_out_of_range)
 
         def run_size_out_of_range():
             with fluid.dygraph.guard():
-                input_np = np.random.uniform(
-                    -1, 1, [2, 3, 32, 32, 32]).astype(np.float32)
+                input_np = np.random.uniform(-1, 1, [2, 3, 32, 32, 32]).astype(
+                    np.float32)
                 input_pd = fluid.dygraph.to_variable(input_np)
-                res_pd = avg_pool3d(
-                    input_pd,
-                    kernel_size=2,
-                    stride=0,
-                    padding="VALID",
-                    ceil_mode=True)
+                res_pd = avg_pool3d(input_pd,
+                                    kernel_size=2,
+                                    stride=0,
+                                    padding="VALID",
+                                    ceil_mode=True)
 
         self.assertRaises(ValueError, run_size_out_of_range)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 40b9be9ee4f9b..2045f6bdd7a32 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -49,8 +49,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -148,21 +148,21 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                     d_end = np.min((d_end, D))
                     h_end = np.min((h_end, H))
                 if data_format == 'NCDHW':
-                    x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:
-                                 w_end]
+                    x_masked = x[:, :, d_start:d_end, h_start:h_end,
+                                 w_start:w_end]
                     if pool_type == 'avg':
                         if (exclusive or adaptive):
                             field_size = (d_end - d_start) * (
                                 h_end - h_start) * (w_end - w_start)
 
-                        out[:, :, k, i, j] = np.sum(x_masked,
-                                                    axis=(2, 3, 4)) / field_size
+                        out[:, :, k, i,
+                            j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size
                     elif pool_type == 'max':
                         out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
 
                 elif data_format == 'NDHWC':
-                    x_masked = x[:, d_start:d_end, h_start:h_end, w_start:
-                                 w_end, :]
+                    x_masked = x[:, d_start:d_end, h_start:h_end,
+                                 w_start:w_end, :]
                     if pool_type == 'avg':
                         if (exclusive or adaptive):
                             field_size = (d_end - d_start) * (
@@ -184,17 +184,16 @@ def max_pool3D_forward_naive(x,
                              ceil_mode=False,
                              exclusive=True,
                              adaptive=False):
-    out = pool3D_forward_naive(
-        x=x,
-        ksize=ksize,
-        strides=strides,
-        paddings=paddings,
-        global_pool=global_pool,
-        ceil_mode=ceil_mode,
-        exclusive=exclusive,
-        adaptive=adaptive,
-        data_format='NCDHW',
-        pool_type="max")
+    out = pool3D_forward_naive(x=x,
+                               ksize=ksize,
+                               strides=strides,
+                               paddings=paddings,
+                               global_pool=global_pool,
+                               ceil_mode=ceil_mode,
+                               exclusive=exclusive,
+                               adaptive=adaptive,
+                               data_format='NCDHW',
+                               pool_type="max")
     return out
 
 
@@ -206,21 +205,21 @@ def avg_pool3D_forward_naive(x,
                              ceil_mode=False,
                              exclusive=True,
                              adaptive=False):
-    out = pool3D_forward_naive(
-        x=x,
-        ksize=ksize,
-        strides=strides,
-        paddings=paddings,
-        global_pool=global_pool,
-        ceil_mode=ceil_mode,
-        exclusive=exclusive,
-        adaptive=adaptive,
-        data_format='NCDHW',
-        pool_type="avg")
+    out = pool3D_forward_naive(x=x,
+                               ksize=ksize,
+                               strides=strides,
+                               paddings=paddings,
+                               global_pool=global_pool,
+                               ceil_mode=ceil_mode,
+                               exclusive=exclusive,
+                               adaptive=adaptive,
+                               data_format='NCDHW',
+                               pool_type="avg")
     return out
 
 
 class TestPool3D_Op(OpTest):
+
     def setUp(self):
         self.op_type = "pool3d"
         self.init_kernel_type()
@@ -239,10 +238,12 @@ def setUp(self):
         paddle.enable_static()
 
         input = np.random.random(self.shape).astype(self.dtype)
-        output = pool3D_forward_naive(
-            input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive, self.adaptive, self.data_format,
-            self.pool_type, self.padding_algorithm).astype(self.dtype)
+        output = pool3D_forward_naive(input, self.ksize, self.strides,
+                                      self.paddings, self.global_pool,
+                                      self.ceil_mode, self.exclusive,
+                                      self.adaptive, self.data_format,
+                                      self.pool_type,
+                                      self.padding_algorithm).astype(self.dtype)
 
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
@@ -278,8 +279,10 @@ def test_check_grad(self):
         if self.has_cudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             if core.is_compiled_with_rocm():
-                self.check_grad_with_place(
-                    place, set(['X']), 'Out', max_relative_error=1e-2)
+                self.check_grad_with_place(place,
+                                           set(['X']),
+                                           'Out',
+                                           max_relative_error=1e-2)
             else:
                 self.check_grad_with_place(place, set(['X']), 'Out')
         elif self.pool_type != "max":
@@ -322,6 +325,7 @@ def init_adaptive(self):
 
 
 class TestCase1(TestPool3D_Op):
+
     def init_shape(self):
         self.shape = [1, 3, 7, 7, 7]
 
@@ -340,6 +344,7 @@ def init_global_pool(self):
 
 
 class TestCase2(TestPool3D_Op):
+
     def init_shape(self):
         self.shape = [1, 3, 6, 7, 7]
 
@@ -358,16 +363,19 @@ def init_global_pool(self):
 
 
 class TestCase3(TestPool3D_Op):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestCase4(TestCase1):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
 
 class TestCase5(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
@@ -376,9 +384,11 @@ def init_pool_type(self):
 
 
 def create_test_cudnn_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -396,9 +406,11 @@ def init_kernel_type(self):
 
 
 def create_test_cudnn_fp16_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNFp16Case(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
             self.dtype = np.float16
@@ -418,9 +430,11 @@ def test_check_output(self):
 
 
 def create_test_fp16_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestFp16Case(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = False
             self.dtype = np.float16
@@ -453,9 +467,11 @@ def test_check_output(self):
 
 # ---- test ceil mode ------
 def create_test_cudnn_use_ceil_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestPool3DUseCeilCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -472,7 +488,9 @@ def init_ceil_mode(self):
 
 
 def create_test_use_ceil_class(parent):
+
     class TestPool3DUseCeilCase(parent):
+
         def init_ceil_mode(self):
             self.ceil_mode = True
 
@@ -486,6 +504,7 @@ def init_ceil_mode(self):
 
 
 class TestAvgInclude(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
@@ -493,6 +512,7 @@ def init_exclusive(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNAvgInclude(TestCase2):
+
     def init_kernel_type(self):
         self.use_cudnn = True
 
@@ -501,11 +521,13 @@ def init_exclusive(self):
 
 
 class TestAvgPoolAdaptive(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
 
 class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -519,6 +541,7 @@ def init_test_case(self):
 
 #-------test pool3d with asymmetric padding------
 class TestPool3D_Op_AsyPadding(TestPool3D_Op):
+
     def init_test_case(self):
         self.ksize = [3, 4, 3]
         self.strides = [1, 1, 2]
@@ -531,6 +554,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding(TestCase1):
+
     def init_test_case(self):
         self.ksize = [3, 3, 4]
         self.strides = [1, 1, 2]
@@ -543,6 +567,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding(TestCase2):
+
     def init_test_case(self):
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
@@ -555,6 +580,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding(TestCase3):
+
     def init_test_case(self):
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
@@ -567,6 +593,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding(TestCase4):
+
     def init_test_case(self):
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
@@ -579,6 +606,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding(TestCase5):
+
     def init_test_case(self):
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
@@ -612,6 +640,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding(TestCase2):
+
     def init_exclusive(self):
         self.exclusive = False
 
@@ -622,6 +651,7 @@ def init_paddings(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNAvgInclude_AsyPadding(TestCase2):
+
     def init_kernel_type(self):
         self.use_cudnn = True
 
@@ -636,6 +666,7 @@ def init_shape(self):
 
 
 class TestAvgPoolAdaptive_AsyPadding(TestCase1):
+
     def init_adaptive(self):
         self.adaptive = True
 
@@ -645,6 +676,7 @@ def init_paddings(self):
 
 # ------------ test channel_last --------------
 class TestPool3D_channel_last(TestPool3D_Op):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -653,6 +685,7 @@ def init_shape(self):
 
 
 class TestCase1_channel_last(TestCase1):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -661,6 +694,7 @@ def init_shape(self):
 
 
 class TestCase2_channel_last(TestCase2):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -669,6 +703,7 @@ def init_shape(self):
 
 
 class TestCase3_channel_last(TestCase3):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -677,6 +712,7 @@ def init_shape(self):
 
 
 class TestCase4_channel_last(TestCase4):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -685,6 +721,7 @@ def init_shape(self):
 
 
 class TestCase5_channel_last(TestCase5):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -707,6 +744,7 @@ def init_shape(self):
 
 
 class TestCase5_Max(TestCase2):
+
     def init_pool_type(self):
         self.pool_type = "max"
 
@@ -715,13 +753,16 @@ def test_check_grad(self):
             return
         if self.has_cudnn() and self.pool_type == "max":
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, set(['X']), 'Out', max_relative_error=1.00)
+            self.check_grad_with_place(place,
+                                       set(['X']),
+                                       'Out',
+                                       max_relative_error=1.00)
         elif self.pool_type == "max":
             self.check_grad(set(['X']), 'Out', max_relative_error=1.00)
 
 
 class TestCase5_channel_last_Max(TestCase5_Max):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -734,6 +775,7 @@ def init_shape(self):
 
 
 class TestAvgInclude_channel_last(TestCase2_channel_last):
+
     def init_exclusive(self):
         self.exclusive = False
 
@@ -741,6 +783,7 @@ def init_exclusive(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNAvgInclude_channel_last(TestCase2_channel_last):
+
     def init_kernel_type(self):
         self.use_cudnn = True
 
@@ -749,12 +792,14 @@ def init_exclusive(self):
 
 
 class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
+
     def init_adaptive(self):
         self.adaptive = True
 
 
 # --- asy padding
 class TestPool3D_Op_AsyPadding_channel_last(TestPool3D_Op_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -763,6 +808,7 @@ def init_shape(self):
 
 
 class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -771,6 +817,7 @@ def init_shape(self):
 
 
 class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -779,6 +826,7 @@ def init_shape(self):
 
 
 class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -787,6 +835,7 @@ def init_shape(self):
 
 
 class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -795,6 +844,7 @@ def init_shape(self):
 
 
 class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -817,20 +867,23 @@ def init_shape(self):
 
 
 class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestCUDNNAvgInclude_AsyPadding_channel_last(
-        TestCUDNNAvgInclude_AsyPadding):
+class TestCUDNNAvgInclude_AsyPadding_channel_last(TestCUDNNAvgInclude_AsyPadding
+                                                  ):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
 
-class TestAvgPoolAdaptive_AsyPadding_channel_last(
-        TestAvgPoolAdaptive_AsyPadding):
+class TestAvgPoolAdaptive_AsyPadding_channel_last(TestAvgPoolAdaptive_AsyPadding
+                                                  ):
+
     def init_data_format(self):
         self.data_format = "NDHWC"
 
@@ -840,7 +893,9 @@ def init_shape(self):
 
 #test padding = SAME VALID
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.paddings = [0, 0, 0]
             self.padding_algorithm = "SAME"
@@ -866,9 +921,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_SAME_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingSMAECase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -897,7 +954,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.paddings = [1, 1, 1]
             self.padding_algorithm = "VALID"
@@ -923,9 +982,11 @@ def init_paddings(self):
 
 
 def create_test_cudnn_padding_VALID_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestCUDNNPaddingVALIDCase(parent):
+
         def init_kernel_type(self):
             self.use_cudnn = True
 
@@ -955,257 +1016,234 @@ def init_paddings(self):
 
 #test API
 class TestPool3DAPI(unittest.TestCase):
+
     def test_api(self):
         x_NDHWC = np.random.random([2, 5, 5, 5, 3]).astype("float32")
         x_NCDHW = np.random.random([2, 3, 5, 5, 5]).astype("float32")
 
-        input_NDHWC = fluid.layers.data(
-            name="input_NDHWC",
-            shape=[2, 5, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
+        input_NDHWC = fluid.layers.data(name="input_NDHWC",
+                                        shape=[2, 5, 5, 5, 3],
+                                        append_batch_size=False,
+                                        dtype="float32")
 
-        input_NCDHW = fluid.layers.data(
-            name="input_NCDHW",
-            shape=[2, 3, 5, 5, 5],
-            append_batch_size=False,
-            dtype="float32")
+        input_NCDHW = fluid.layers.data(name="input_NCDHW",
+                                        shape=[2, 3, 5, 5, 5],
+                                        append_batch_size=False,
+                                        dtype="float32")
 
         ksize = [3, 3, 3]
-        out_1 = fluid.layers.pool3d(
-            input=input_NDHWC,
-            pool_size=ksize,
-            pool_type="max",
-            pool_padding=[1, 1, 1],
-            use_cudnn=False,
-            data_format="NDHWC")
-
-        out_2 = fluid.layers.pool3d(
-            input=input_NDHWC,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]],
-            use_cudnn=False,
-            data_format="NDHWC")
-
-        out_3 = fluid.layers.pool3d(
-            input=input_NCDHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]],
-            use_cudnn=False,
-            data_format="NCDHW")
-
-        out_4 = fluid.layers.pool3d(
-            input=input_NCDHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding=[1, 2, 1, 0, 0, 1],
-            use_cudnn=False,
-            data_format="NCDHW")
+        out_1 = fluid.layers.pool3d(input=input_NDHWC,
+                                    pool_size=ksize,
+                                    pool_type="max",
+                                    pool_padding=[1, 1, 1],
+                                    use_cudnn=False,
+                                    data_format="NDHWC")
+
+        out_2 = fluid.layers.pool3d(input=input_NDHWC,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[[0, 0], [1, 1], [1, 1],
+                                                  [1, 1], [0, 0]],
+                                    use_cudnn=False,
+                                    data_format="NDHWC")
+
+        out_3 = fluid.layers.pool3d(input=input_NCDHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[[0, 0], [0, 0], [1, 1],
+                                                  [1, 1], [1, 1]],
+                                    use_cudnn=False,
+                                    data_format="NCDHW")
+
+        out_4 = fluid.layers.pool3d(input=input_NCDHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding=[1, 2, 1, 0, 0, 1],
+                                    use_cudnn=False,
+                                    data_format="NCDHW")
         # test VALID
-        out_5 = fluid.layers.pool3d(
-            input=input_NDHWC,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding="VALID",
-            use_cudnn=False,
-            data_format="NDHWC")
-
-        out_6 = fluid.layers.pool3d(
-            input=input_NCDHW,
-            pool_size=ksize,
-            pool_type="avg",
-            pool_padding="VALID",
-            use_cudnn=False,
-            data_format="NCDHW")
+        out_5 = fluid.layers.pool3d(input=input_NDHWC,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding="VALID",
+                                    use_cudnn=False,
+                                    data_format="NDHWC")
+
+        out_6 = fluid.layers.pool3d(input=input_NCDHW,
+                                    pool_size=ksize,
+                                    pool_type="avg",
+                                    pool_padding="VALID",
+                                    use_cudnn=False,
+                                    data_format="NCDHW")
 
         # test SAME
-        out_7 = fluid.layers.pool3d(
-            input=input_NDHWC,
-            pool_size=ksize,
-            pool_stride=[1, 1, 2],
-            pool_type="avg",
-            pool_padding="SAME",
-            use_cudnn=False,
-            data_format="NDHWC")
-
-        out_8 = fluid.layers.pool3d(
-            input=input_NCDHW,
-            pool_size=[4, 4, 4],
-            pool_type="avg",
-            pool_padding="SAME",
-            use_cudnn=False,
-            data_format="NCDHW")
+        out_7 = fluid.layers.pool3d(input=input_NDHWC,
+                                    pool_size=ksize,
+                                    pool_stride=[1, 1, 2],
+                                    pool_type="avg",
+                                    pool_padding="SAME",
+                                    use_cudnn=False,
+                                    data_format="NDHWC")
+
+        out_8 = fluid.layers.pool3d(input=input_NCDHW,
+                                    pool_size=[4, 4, 4],
+                                    pool_type="avg",
+                                    pool_padding="SAME",
+                                    use_cudnn=False,
+                                    data_format="NCDHW")
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         [res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8] = exe.run(
             fluid.default_main_program(),
-            feed={"input_NDHWC": x_NDHWC,
-                  "input_NCDHW": x_NCDHW},
-            fetch_list=[
-                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
-            ])
+            feed={
+                "input_NDHWC": x_NDHWC,
+                "input_NCDHW": x_NCDHW
+            },
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8])
 
         assert np.allclose(
             res_1,
-            pool3D_forward_naive(
-                x=x_NDHWC,
-                ksize=ksize,
-                pool_type="max",
-                strides=[1, 1, 1],
-                paddings=[1, 1, 1],
-                data_format="NDHWC"))
+            pool3D_forward_naive(x=x_NDHWC,
+                                 ksize=ksize,
+                                 pool_type="max",
+                                 strides=[1, 1, 1],
+                                 paddings=[1, 1, 1],
+                                 data_format="NDHWC"))
 
         assert np.allclose(
             res_2,
-            pool3D_forward_naive(
-                x=x_NDHWC,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1, 1],
-                paddings=[1, 1, 1, 1, 1, 1],
-                data_format="NDHWC"))
-        assert np.allclose(
-            res_3,
-            pool3D_forward_naive(
-                x=x_NCDHW,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1, 1],
-                paddings=[1, 1, 1, 1, 1, 1],
-                data_format="NCDHW"),
-            rtol=0.07,
-            atol=1e-05)
-
-        assert np.allclose(
-            res_4,
-            pool3D_forward_naive(
-                x=x_NCDHW,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1, 1],
-                paddings=[1, 2, 1, 0, 0, 1],
-                data_format="NCDHW"),
-            rtol=0.07,
-            atol=1e-05)
+            pool3D_forward_naive(x=x_NDHWC,
+                                 ksize=ksize,
+                                 pool_type="avg",
+                                 strides=[1, 1, 1],
+                                 paddings=[1, 1, 1, 1, 1, 1],
+                                 data_format="NDHWC"))
+        assert np.allclose(res_3,
+                           pool3D_forward_naive(x=x_NCDHW,
+                                                ksize=ksize,
+                                                pool_type="avg",
+                                                strides=[1, 1, 1],
+                                                paddings=[1, 1, 1, 1, 1, 1],
+                                                data_format="NCDHW"),
+                           rtol=0.07,
+                           atol=1e-05)
+
+        assert np.allclose(res_4,
+                           pool3D_forward_naive(x=x_NCDHW,
+                                                ksize=ksize,
+                                                pool_type="avg",
+                                                strides=[1, 1, 1],
+                                                paddings=[1, 2, 1, 0, 0, 1],
+                                                data_format="NCDHW"),
+                           rtol=0.07,
+                           atol=1e-05)
         # VALID
         assert np.allclose(
             res_5,
-            pool3D_forward_naive(
-                x=x_NDHWC,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1, 1],
-                paddings=[10, 20],
-                padding_algorithm="VALID",
-                data_format="NDHWC"))
-
-        assert np.allclose(
-            res_6,
-            pool3D_forward_naive(
-                x=x_NCDHW,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1, 1],
-                paddings=[10, 20],
-                padding_algorithm="VALID",
-                data_format="NCDHW"),
-            rtol=0.07,
-            atol=1e-05)
+            pool3D_forward_naive(x=x_NDHWC,
+                                 ksize=ksize,
+                                 pool_type="avg",
+                                 strides=[1, 1, 1],
+                                 paddings=[10, 20],
+                                 padding_algorithm="VALID",
+                                 data_format="NDHWC"))
+
+        assert np.allclose(res_6,
+                           pool3D_forward_naive(x=x_NCDHW,
+                                                ksize=ksize,
+                                                pool_type="avg",
+                                                strides=[1, 1, 1],
+                                                paddings=[10, 20],
+                                                padding_algorithm="VALID",
+                                                data_format="NCDHW"),
+                           rtol=0.07,
+                           atol=1e-05)
         # SAME
         assert np.allclose(
             res_7,
-            pool3D_forward_naive(
-                x=x_NDHWC,
-                ksize=ksize,
-                pool_type="avg",
-                strides=[1, 1, 2],
-                paddings=[10, 20],
-                padding_algorithm="SAME",
-                data_format="NDHWC"))
-
-        assert np.allclose(
-            res_8,
-            pool3D_forward_naive(
-                x=x_NCDHW,
-                ksize=[4, 4, 4],
-                pool_type="avg",
-                strides=[1, 1, 1],
-                paddings=[10, 20],
-                padding_algorithm="SAME",
-                data_format="NCDHW"),
-            rtol=0.07,
-            atol=1e-05)
+            pool3D_forward_naive(x=x_NDHWC,
+                                 ksize=ksize,
+                                 pool_type="avg",
+                                 strides=[1, 1, 2],
+                                 paddings=[10, 20],
+                                 padding_algorithm="SAME",
+                                 data_format="NDHWC"))
+
+        assert np.allclose(res_8,
+                           pool3D_forward_naive(x=x_NCDHW,
+                                                ksize=[4, 4, 4],
+                                                pool_type="avg",
+                                                strides=[1, 1, 1],
+                                                paddings=[10, 20],
+                                                padding_algorithm="SAME",
+                                                data_format="NCDHW"),
+                           rtol=0.07,
+                           atol=1e-05)
 
 
 class TestPool3DAPI_Error(unittest.TestCase):
+
     def test_api(self):
-        input_NDHWC = fluid.layers.data(
-            name="input_NDHWC",
-            shape=[2, 5, 5, 5, 3],
-            append_batch_size=False,
-            dtype="float32")
+        input_NDHWC = fluid.layers.data(name="input_NDHWC",
+                                        shape=[2, 5, 5, 5, 3],
+                                        append_batch_size=False,
+                                        dtype="float32")
         ksize = [3, 3, 3]
 
         # cudnn type error
         def run_1():
-            out_1 = fluid.layers.pool3d(
-                input=input_NDHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[1, 1, 1],
-                use_cudnn=[0],
-                data_format="NDHWC")
+            out_1 = fluid.layers.pool3d(input=input_NDHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[1, 1, 1],
+                                        use_cudnn=[0],
+                                        data_format="NDHWC")
 
         self.assertRaises(TypeError, run_1)
 
         # data_format value error
         def run_2():
-            out_2 = fluid.layers.pool3d(
-                input=input_NDHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[1, 1, 1],
-                use_cudnn=False,
-                data_format="NDHWCC")
+            out_2 = fluid.layers.pool3d(input=input_NDHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[1, 1, 1],
+                                        use_cudnn=False,
+                                        data_format="NDHWCC")
 
         self.assertRaises(ValueError, run_2)
 
         # padding str value error
         def run_3():
-            out_3 = fluid.layers.pool3d(
-                input=input_NDHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding="VALIDSAME",
-                use_cudnn=False,
-                data_format="NDHWC")
+            out_3 = fluid.layers.pool3d(input=input_NDHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding="VALIDSAME",
+                                        use_cudnn=False,
+                                        data_format="NDHWC")
 
         self.assertRaises(ValueError, run_3)
 
         # padding str valid and ceil_mode value error
         def run_4():
-            out_4 = fluid.layers.pool3d(
-                input=input_NDHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding="VALID",
-                use_cudnn=False,
-                ceil_mode=True,
-                data_format="NDHWC")
+            out_4 = fluid.layers.pool3d(input=input_NDHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding="VALID",
+                                        use_cudnn=False,
+                                        ceil_mode=True,
+                                        data_format="NDHWC")
 
         self.assertRaises(ValueError, run_4)
 
         # padding with 8 ele. value error
         def run_5():
-            out_5 = fluid.layers.pool3d(
-                input=input_NDHWC,
-                pool_size=ksize,
-                pool_type="max",
-                pool_padding=[[1, 1], [0, 0], [0, 0], [1, 1], [1, 1]],
-                use_cudnn=False,
-                data_format="NDHWC")
+            out_5 = fluid.layers.pool3d(input=input_NDHWC,
+                                        pool_size=ksize,
+                                        pool_type="max",
+                                        pool_padding=[[1, 1], [0, 0], [0, 0],
+                                                      [1, 1], [1, 1]],
+                                        use_cudnn=False,
+                                        data_format="NDHWC")
 
         self.assertRaises(ValueError, run_5)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
index 4b3c777ccf3b1..ea3737575911b 100644
--- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
@@ -135,6 +135,7 @@ def max_pool2D_forward_naive(x,
 
 
 class TestMaxPoolWithIndex_Op(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.init_global()
@@ -181,11 +182,13 @@ def init_adaptive(self):
 
 
 class TestCase1(TestMaxPoolWithIndex_Op):
+
     def init_global(self):
         self.global_pool = True
 
 
 class TestCase2(TestMaxPoolWithIndex_Op):
+
     def init_test_case(self):
         self.op_type = "max_pool3d_with_index"
         self.pool_forward_naive = max_pool3D_forward_naive
@@ -199,12 +202,14 @@ def init_global(self):
 
 
 class TestCase3(TestCase2):
+
     def init_global(self):
         self.global_pool = False
 
 
 #----------------max_pool2d_with_index----------------
 class TestCase4(TestMaxPoolWithIndex_Op):
+
     def init_test_case(self):
         self.op_type = "max_pool2d_with_index"
         self.pool_forward_naive = max_pool2D_forward_naive
@@ -218,11 +223,13 @@ def init_global(self):
 
 
 class TestCase5(TestCase4):
+
     def init_global(self):
         self.global_pool = False
 
 
 class TestCase6(TestMaxPoolWithIndex_Op):
+
     def init_test_case(self):
         self.op_type = "max_pool2d_with_index"
         self.pool_forward_naive = max_pool2D_forward_naive
@@ -236,16 +243,19 @@ def init_global(self):
 
 
 class TestCase7(TestCase6):
+
     def init_global(self):
         self.global_pool = False
 
 
 class TestCastAdaptive2d(TestCase6):
+
     def init_adaptive(self):
         self.adaptive = True
 
 
 class TestCastAdaptive3d(TestMaxPoolWithIndex_Op):
+
     def init_adaptive(self):
         self.adaptive = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
index afe8d212d6ec2..157123d82c3ff 100644
--- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
+++ b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
@@ -53,6 +53,7 @@ def py_pnpair_op(score, label, query, column=-1, weight=None):
 
 
 class TestPositiveNegativePairOp(OpTest):
+
     def setUp(self):
         self.op_type = 'positive_negative_pair'
         batch_size = 20
@@ -77,6 +78,7 @@ def test_check_output(self):
 
 
 class TestPositiveNegativePairOpAccumulateWeight(OpTest):
+
     def setUp(self):
         self.op_type = 'positive_negative_pair'
         batch_size = 20
@@ -89,16 +91,19 @@ def setUp(self):
         query = np.array(
             [np.random.randint(max_query_id) for i in range(batch_size)])
         query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
-        acc_pos = np.reshape(
-            np.random.randint(max_random_num), newshape=(1)).astype('float32')
-        acc_neg = np.reshape(
-            np.random.randint(max_random_num), newshape=(1)).astype('float32')
-        acc_neu = np.reshape(
-            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        acc_pos = np.reshape(np.random.randint(max_random_num),
+                             newshape=(1)).astype('float32')
+        acc_neg = np.reshape(np.random.randint(max_random_num),
+                             newshape=(1)).astype('float32')
+        acc_neu = np.reshape(np.random.randint(max_random_num),
+                             newshape=(1)).astype('float32')
         column = np.random.randint(score_dim)
 
-        pos, neg, neu = py_pnpair_op(
-            score, label, query, column=column, weight=weight)
+        pos, neg, neu = py_pnpair_op(score,
+                                     label,
+                                     query,
+                                     column=column,
+                                     weight=weight)
         self.inputs = {
             'Score': score,
             'Label': label,
diff --git a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py
index 056db5b8590ab..43f98cada4235 100644
--- a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,19 +34,18 @@ def gen_pow2_warmup_op_lr(warmup_steps, total_steps, base_lr, end_lr, place):
 
 
 class Pow2Warmup(LinearWarmup):
+
     def __init__(self, warmup_steps, total_steps, base_lr, end_lr):
         assert total_steps > warmup_steps
-        lr_sch = PolynomialDecay(
-            learning_rate=base_lr,
-            decay_steps=total_steps - warmup_steps,
-            end_lr=end_lr,
-            power=2)
+        lr_sch = PolynomialDecay(learning_rate=base_lr,
+                                 decay_steps=total_steps - warmup_steps,
+                                 end_lr=end_lr,
+                                 power=2)
 
-        super(Pow2Warmup, self).__init__(
-            learning_rate=lr_sch,
-            warmup_steps=warmup_steps,
-            start_lr=0.0,
-            end_lr=base_lr)
+        super(Pow2Warmup, self).__init__(learning_rate=lr_sch,
+                                         warmup_steps=warmup_steps,
+                                         start_lr=0.0,
+                                         end_lr=base_lr)
 
 
 def gen_pow2_warmup_py_lr(warmup_steps, total_steps, base_lr, end_lr, place):
@@ -58,6 +57,7 @@ def gen_pow2_warmup_py_lr(warmup_steps, total_steps, base_lr, end_lr, place):
 
 
 class TestPow2WarmupLRScheduler(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.params = {
diff --git a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
index 645637625959f..ee88d76ac67d4 100644
--- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
+++ b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
@@ -87,6 +87,7 @@ def compute_metrics(states, cls_num):
 
 
 class TestPrecisionRecallOp_0(OpTest):
+
     def setUp(self):
         self.op_type = "precision_recall"
         ins_num = 64
@@ -114,6 +115,7 @@ def test_check_output(self):
 
 
 class TestPrecisionRecallOp_1(OpTest):
+
     def setUp(self):
         self.op_type = "precision_recall"
         ins_num = 64
@@ -148,6 +150,7 @@ def test_check_output(self):
 
 
 class TestPrecisionRecallOp_2(OpTest):
+
     def setUp(self):
         self.op_type = "precision_recall"
         ins_num = 64
diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
index 73c423a23e6ba..1de0c434952e9 100644
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -41,9 +41,10 @@ def ref_prelu_nn(x, num_parameters, init):
 
 
 class TestFunctionalPReluAPI(unittest.TestCase):
+
     def setUp(self):
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float32')
         self.weight_np_0 = np.random.randn(1).astype('float32')
         self.weight_np_1 = np.random.randn(self.x_np.shape[1]).astype('float32')
@@ -54,8 +55,10 @@ def static_check(self, weight_np):
             weight = paddle.fluid.data('Alpha', weight_np.shape, 'float32')
             out = F.prelu(x, weight)
             exe = paddle.static.Executor(self.place)
-            res = exe.run(feed={'X': self.x_np,
-                                'Alpha': weight_np},
+            res = exe.run(feed={
+                'X': self.x_np,
+                'Alpha': weight_np
+            },
                           fetch_list=[out])
         out_ref = ref_prelu(self.x_np, weight_np)
         self.assertEqual(np.allclose(out_ref, res[0]), True)
@@ -83,32 +86,37 @@ def test_dygraph_api_eager(self):
 
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program()):
-            weight_fp32 = paddle.fluid.data(
-                name='weight_fp32', shape=[1], dtype='float32')
+            weight_fp32 = paddle.fluid.data(name='weight_fp32',
+                                            shape=[1],
+                                            dtype='float32')
             # The input type must be Variable.
             self.assertRaises(TypeError, F.prelu, x=1, weight=weight_fp32)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[2, 3], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[2, 3],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.prelu, x=x_int32, weight=weight_fp32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[2, 3], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[2, 3],
+                                       dtype='float16')
             F.prelu(x=x_fp16, weight=weight_fp32)
 
 
 class TestNNPReluAPI(unittest.TestCase):
+
     def setUp(self):
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         self.x_np = np.ones([1, 2, 3, 4]).astype('float32')
 
     def test_static_api(self):
         startup_program = paddle.static.Program()
         train_program = paddle.static.Program()
         with paddle.static.program_guard(train_program, startup_program):
-            x = paddle.fluid.data(
-                name='X', shape=self.x_np.shape, dtype='float32')
+            x = paddle.fluid.data(name='X',
+                                  shape=self.x_np.shape,
+                                  dtype='float32')
             m = paddle.nn.PReLU()
             out = m(x)
             exe = paddle.static.Executor(self.place)
@@ -162,6 +170,7 @@ def prelu_api_wrapper(x, weight, data_format="NCHW"):
 
 
 class PReluTest(OpTest):
+
     def setUp(self):
         self.init_dtype()
         self.init_input_shape()
@@ -196,16 +205,16 @@ def setUp(self):
         self.inputs = {'X': x_np, 'Alpha': alpha_np}
 
         # NOTE(zhiqu): reshape inputs['Alpha'] from [1, 100, 1, 1] to [1, 100] + [1]*len(x.shape[2:])
-        # since np operands could not be broadcast together with shapes (1,100,2,2,2,3) (1,100,1,1) 	
+        # since np operands could not be broadcast together with shapes (1,100,2,2,2,3) (1,100,1,1)
         reshaped_alpha = self.inputs['Alpha']
         if self.attrs == {'mode': "channel", "data_format": "NCHW"}:
-            reshaped_alpha = np.reshape(
-                self.inputs['Alpha'],
-                [1, self.x_shape[1]] + [1] * len(self.x_shape[2:]))
+            reshaped_alpha = np.reshape(self.inputs['Alpha'],
+                                        [1, self.x_shape[1]] +
+                                        [1] * len(self.x_shape[2:]))
         elif self.attrs == {'mode': "channel", "data_format": "NHWC"}:
-            reshaped_alpha = np.reshape(
-                self.inputs['Alpha'],
-                [1] + [1] * len(self.x_shape[1:-1]) + [self.x_shape[-1]])
+            reshaped_alpha = np.reshape(self.inputs['Alpha'],
+                                        [1] + [1] * len(self.x_shape[1:-1]) +
+                                        [self.x_shape[-1]])
         out_np = np.maximum(self.inputs['X'], 0.)
         out_np = out_np + np.minimum(self.inputs['X'], 0.) * reshaped_alpha
         assert out_np is not self.inputs['X']
@@ -228,9 +237,11 @@ def test_check_grad(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+    reason=
+    "[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
 )
 class TestModeAll(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [2, 3, 4, 5]
 
@@ -239,9 +250,11 @@ def init_attr(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+    reason=
+    "[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
 )
 class TestModeAllNHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [2, 3, 4, 50]
 
@@ -250,6 +263,7 @@ def init_attr(self):
 
 
 class TestModeElt(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [3, 2, 5, 10]
 
@@ -258,6 +272,7 @@ def init_attr(self):
 
 
 class TestModeEltNHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [3, 2, 5, 10]
 
@@ -266,9 +281,11 @@ def init_attr(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+    reason=
+    "[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
 )
 class TestModeAllRank3(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 200, 3]
 
@@ -277,9 +294,11 @@ def init_attr(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+    reason=
+    "[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
 )
 class TestModeAllRank3NHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 200, 3]
 
@@ -288,9 +307,11 @@ def init_attr(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+    reason=
+    "[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
 )
 class TestModeAllRank6(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 2, 3, 4, 5, 6]
 
@@ -299,9 +320,11 @@ def init_attr(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+    reason=
+    "[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
 )
 class TestModeAllRank6NHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 2, 3, 4, 5, 6]
 
@@ -310,6 +333,7 @@ def init_attr(self):
 
 
 class TestModeChannelRank3(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 200, 3]
 
@@ -318,6 +342,7 @@ def init_attr(self):
 
 
 class TestModeChannelRank3NHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 3, 100]
 
@@ -326,6 +351,7 @@ def init_attr(self):
 
 
 class TestModeChannelRank6(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 100, 2, 2, 2, 2]
 
@@ -334,6 +360,7 @@ def init_attr(self):
 
 
 class TestModeChannelRank6NHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [1, 2, 2, 2, 2, 100]
 
@@ -342,6 +369,7 @@ def init_attr(self):
 
 
 class TestModeElementRank3(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [3, 10, 10]
 
@@ -350,6 +378,7 @@ def init_attr(self):
 
 
 class TestModeElementRank3NHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [3, 10, 10]
 
@@ -358,6 +387,7 @@ def init_attr(self):
 
 
 class TestModeElementRank6(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [3, 2, 2, 4, 5, 2]
 
@@ -366,6 +396,7 @@ def init_attr(self):
 
 
 class TestModeElementRank6NHWC(PReluTest):
+
     def init_input_shape(self):
         self.x_shape = [3, 2, 2, 4, 5, 2]
 
@@ -377,9 +408,11 @@ def create_test_fp16_class(parent,
                            check_grad=True,
                            atol=1e-3,
                            max_relative_error=0.05):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestPReluFp16Case(parent):
+
         def init_dtype(self):
             self.dtype = np.float16
 
@@ -387,8 +420,9 @@ def test_check_output(self):
             if core.is_compiled_with_cuda():
                 place = core.CUDAPlace(0)
                 if core.is_float16_supported(place):
-                    self.check_output_with_place(
-                        place, atol=atol, check_eager=self.eager_mode)
+                    self.check_output_with_place(place,
+                                                 atol=atol,
+                                                 check_eager=self.eager_mode)
 
         def test_check_grad(self):
             place = core.CUDAPlace(0)
@@ -431,21 +465,25 @@ def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'):
         is_bias=False,
         default_initializer=fluid.initializer.ConstantInitializer(0.25))
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="prelu",
-        inputs={"X": x,
-                'Alpha': alpha},
-        attrs={"mode": mode,
-               'data_format': data_format},
-        outputs={"Out": out})
+    helper.append_op(type="prelu",
+                     inputs={
+                         "X": x,
+                         'Alpha': alpha
+                     },
+                     attrs={
+                         "mode": mode,
+                         'data_format': data_format
+                     },
+                     outputs={"Out": out})
     return out
 
 
 # error message test if mode is not one of 'all', 'channel', 'element'
 class TestModeError(unittest.TestCase):
+
     def setUp(self):
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         self.x_np = np.ones([1, 2, 3, 4]).astype('float32')
 
     def test_mode_error(self):
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index cc06a3cf7fa6b..4607327306554 100755
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -30,6 +30,7 @@
 
 
 class TestPrintOpCPU(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CPUPlace()
         self.x_tensor = fluid.core.LoDTensor()
@@ -74,7 +75,8 @@ def test_all_parameters(self):
                             print_tensor_name=print_tensor_name,
                             print_tensor_type=print_tensor_type,
                             print_tensor_shape=print_tensor_shape,
-                            print_tensor_lod=print_tensor_lod, )
+                            print_tensor_lod=print_tensor_lod,
+                        )
         loss = paddle.mean(x)
         paddle.static.append_backward(loss=loss)
         exe = paddle.static.Executor(self.place)
@@ -92,11 +94,12 @@ def test_no_summarize(self):
 
 
 class TestPrintOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of Print_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], paddle.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         paddle.CPUPlace())
             self.assertRaises(TypeError, paddle.static.Print, x1)
             # The input dtype of Print_op must be float32, float64, int32_t, int64_t or bool.
             x2 = paddle.static.data(name='x2', shape=[4], dtype="float16")
@@ -106,6 +109,7 @@ def test_errors(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestPrintOpGPU(TestPrintOpCPU):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.x_tensor = fluid.core.LoDTensor()
@@ -115,6 +119,7 @@ def setUp(self):
 
 
 class TestPrintOpBackward(unittest.TestCase):
+
     def check_backward(self, use_cuda):
         main = paddle.static.Program()
         startup = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
index 7381b74af7105..2e18f8b748efd 100644
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
@@ -22,6 +22,7 @@
 
 
 class TestPriorBoxOp(OpTest):
+
     def set_data(self):
         self.init_test_params()
         self.init_test_input()
@@ -79,8 +80,8 @@ def init_test_params(self):
         self.flip = True
         self.set_min_max_aspect_ratios_order()
         self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
-        self.aspect_ratios = np.array(
-            self.aspect_ratios, dtype=np.float).flatten()
+        self.aspect_ratios = np.array(self.aspect_ratios,
+                                      dtype=np.float).flatten()
         self.variances = [0.1, 0.1, 0.2, 0.2]
         self.variances = np.array(self.variances, dtype=np.float).flatten()
 
@@ -118,22 +119,22 @@ def init_test_output(self):
                             ar = self.real_aspect_ratios[r]
                             c_w = min_size * math.sqrt(ar) / 2
                             c_h = (min_size / math.sqrt(ar)) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
 
                         if len(self.max_sizes) > 0:
                             max_size = self.max_sizes[s]
                             # second prior: aspect_ratio = 1,
                             c_w = c_h = math.sqrt(min_size * max_size) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
                     else:
                         c_w = c_h = min_size / 2.
@@ -146,11 +147,11 @@ def init_test_output(self):
                             max_size = self.max_sizes[s]
                             # second prior: aspect_ratio = 1,
                             c_w = c_h = math.sqrt(min_size * max_size) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
 
                         # rest of priors
@@ -160,29 +161,31 @@ def init_test_output(self):
                                 continue
                             c_w = min_size * math.sqrt(ar) / 2
                             c_h = (min_size / math.sqrt(ar)) / 2
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
 
         # clip the prior's coordidate such that it is within[0, 1]
         if self.clip:
             out_boxes = np.clip(out_boxes, 0.0, 1.0)
         # set the variance.
-        out_var = np.tile(self.variances, (self.layer_h, self.layer_w,
-                                           self.num_priors, 1))
+        out_var = np.tile(self.variances,
+                          (self.layer_h, self.layer_w, self.num_priors, 1))
         self.out_boxes = out_boxes.astype('float32')
         self.out_var = out_var.astype('float32')
 
 
 class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp):
+
     def set_max_sizes(self):
         self.max_sizes = []
 
 
 class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp):
+
     def set_min_max_aspect_ratios_order(self):
         self.min_max_aspect_ratios_order = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_prod_op.py b/python/paddle/fluid/tests/unittests/test_prod_op.py
index cdfcbb4e4e735..656601e05d159 100644
--- a/python/paddle/fluid/tests/unittests/test_prod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prod_op.py
@@ -20,6 +20,7 @@
 
 
 class TestProdOp(unittest.TestCase):
+
     def setUp(self):
         self.input = np.random.random(size=(10, 10, 5)).astype(np.float32)
 
@@ -50,13 +51,16 @@ def run_imperative(self):
         self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
 
         dy_result = paddle.prod(input, axis=1, keepdim=True, dtype='int64')
-        expected_result = np.prod(
-            self.input, axis=1, keepdims=True, dtype=np.int64)
+        expected_result = np.prod(self.input,
+                                  axis=1,
+                                  keepdims=True,
+                                  dtype=np.int64)
         self.assertTrue(np.allclose(dy_result.numpy(), expected_result))
 
     def run_static(self, use_gpu=False):
-        input = paddle.fluid.data(
-            name='input', shape=[10, 10, 5], dtype='float32')
+        input = paddle.fluid.data(name='input',
+                                  shape=[10, 10, 5],
+                                  dtype='float32')
         result0 = paddle.prod(input)
         result1 = paddle.prod(input, axis=1)
         result2 = paddle.prod(input, axis=-1)
@@ -86,8 +90,10 @@ def run_static(self, use_gpu=False):
         self.assertTrue(np.allclose(static_result[4], expected_result))
         expected_result = np.prod(self.input, axis=1, dtype=np.int64)
         self.assertTrue(np.allclose(static_result[5], expected_result))
-        expected_result = np.prod(
-            self.input, axis=1, keepdims=True, dtype=np.int64)
+        expected_result = np.prod(self.input,
+                                  axis=1,
+                                  keepdims=True,
+                                  dtype=np.int64)
         self.assertTrue(np.allclose(static_result[6], expected_result))
 
     def test_cpu(self):
@@ -111,12 +117,14 @@ def test_gpu(self):
 
 
 class TestProdOpError(unittest.TestCase):
+
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
             x = paddle.fluid.data(name='x', shape=[2, 2, 4], dtype='float32')
-            bool_x = paddle.fluid.data(
-                name='bool_x', shape=[2, 2, 4], dtype='bool')
+            bool_x = paddle.fluid.data(name='bool_x',
+                                       shape=[2, 2, 4],
+                                       dtype='bool')
             # The argument x shoule be a Tensor
             self.assertRaises(TypeError, paddle.prod, [1])
 
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 1b8852810f2fe..0eec7633a2ec1 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -28,6 +28,7 @@
 
 
 class TestProfiler(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -39,8 +40,9 @@ def build_program(self, compile_program=True):
             image = fluid.layers.data(name='x', shape=[784], dtype='float32')
             hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
             i = layers.zeros(shape=[1], dtype='int64')
-            counter = fluid.layers.zeros(
-                shape=[1], dtype='int64', force_cpu=True)
+            counter = fluid.layers.zeros(shape=[1],
+                                         dtype='int64',
+                                         force_cpu=True)
             until = layers.fill_constant([1], dtype='int64', value=10)
             data_arr = layers.array_write(hidden1, i)
             cond = fluid.layers.less_than(x=counter, y=until)
@@ -58,8 +60,9 @@ def build_program(self, compile_program=True):
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = fluid.layers.mean(cost)
             batch_size = fluid.layers.create_tensor(dtype='int64')
-            batch_acc = fluid.layers.accuracy(
-                input=predict, label=label, total=batch_size)
+            batch_acc = fluid.layers.accuracy(input=predict,
+                                              label=label,
+                                              total=batch_size)
 
         optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
         opts = optimizer.minimize(avg_cost, startup_program=startup_program)
@@ -70,8 +73,8 @@ def build_program(self, compile_program=True):
             exec_strategy = fluid.ExecutionStrategy()
             exec_strategy.num_threads = 1
             train_program = fluid.compiler.CompiledProgram(
-                main_program).with_data_parallel(
-                    loss_name=avg_cost.name, exec_strategy=exec_strategy)
+                main_program).with_data_parallel(loss_name=avg_cost.name,
+                                                 exec_strategy=exec_strategy)
         else:
             train_program = main_program
         return train_program, startup_program, avg_cost, batch_size, batch_acc
@@ -95,16 +98,18 @@ def check_profile_result(self, profile_path):
                             "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
                             % event.name)
                 elif event.type == profiler_pb2.Event.CPU and (
-                        event.name.startswith("Driver API") or
-                        event.name.startswith("Runtime API")):
+                        event.name.startswith("Driver API")
+                        or event.name.startswith("Runtime API")):
                     print("Warning: unregister", event.name)
 
     def run_iter(self, exe, main_program, fetch_list):
         x = np.random.random((32, 784)).astype("float32")
         y = np.random.randint(0, 10, (32, 1)).astype("int64")
         outs = exe.run(main_program,
-                       feed={'x': x,
-                             'y': y},
+                       feed={
+                           'x': x,
+                           'y': y
+                       },
                        fetch_list=fetch_list)
 
     def net_profiler(self,
@@ -127,13 +132,15 @@ def net_profiler(self,
                     self.run_iter(exe, main_program,
                                   [avg_cost, batch_acc, batch_size])
         else:
-            options = utils.ProfilerOptions(options={
-                'state': state,
-                'sorted_key': 'total',
-                'tracer_level': tracer_option,
-                'batch_range': [0, 10] if batch_range is None else batch_range,
-                'profile_path': profile_path
-            })
+            options = utils.ProfilerOptions(
+                options={
+                    'state': state,
+                    'sorted_key': 'total',
+                    'tracer_level': tracer_option,
+                    'batch_range':
+                    [0, 10] if batch_range is None else batch_range,
+                    'profile_path': profile_path
+                })
             with utils.Profiler(enabled=True, options=options) as prof:
                 for iter in range(10):
                     self.run_iter(exe, main_program,
@@ -148,39 +155,37 @@ def net_profiler(self,
     def test_cpu_profiler(self):
         exe = fluid.Executor(fluid.CPUPlace())
         for use_new_api in [False, True]:
-            self.net_profiler(
-                exe,
-                'CPU',
-                "Default",
-                batch_range=[5, 10],
-                use_new_api=use_new_api)
+            self.net_profiler(exe,
+                              'CPU',
+                              "Default",
+                              batch_range=[5, 10],
+                              use_new_api=use_new_api)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_cuda_profiler(self):
         exe = fluid.Executor(fluid.CUDAPlace(0))
         for use_new_api in [False, True]:
-            self.net_profiler(
-                exe,
-                'GPU',
-                "OpDetail",
-                batch_range=[0, 10],
-                use_new_api=use_new_api)
+            self.net_profiler(exe,
+                              'GPU',
+                              "OpDetail",
+                              batch_range=[0, 10],
+                              use_new_api=use_new_api)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
     def test_all_profiler(self):
         exe = fluid.Executor(fluid.CUDAPlace(0))
         for use_new_api in [False, True]:
-            self.net_profiler(
-                exe,
-                'All',
-                "AllOpDetail",
-                batch_range=None,
-                use_new_api=use_new_api)
+            self.net_profiler(exe,
+                              'All',
+                              "AllOpDetail",
+                              batch_range=None,
+                              use_new_api=use_new_api)
 
 
 class TestProfilerAPIError(unittest.TestCase):
+
     def test_errors(self):
         options = utils.ProfilerOptions()
         self.assertTrue(options['profile_path'] is None)
diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
index 7079d9678b2fd..e5463b1a90d59 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -19,6 +19,7 @@
 
 
 class HostPythonNode:
+
     def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
         self.name = name
         self.type = type
@@ -32,6 +33,7 @@ def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
 
 
 class DevicePythonNode:
+
     def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
                  stream_id):
         self.name = name
@@ -44,6 +46,7 @@ def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
 
 
 class TestProfilerStatistic(unittest.TestCase):
+
     def test_statistic_case1(self):
         root_node = HostPythonNode('Root Node',
                                    profiler.TracerEventType.UserDefined, 0,
@@ -54,10 +57,12 @@ def test_statistic_case1(self):
         dataloader_node = HostPythonNode('Dataloader',
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
-        yolonet_node = HostPythonNode(
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)
 
         userdefined_node = HostPythonNode('Communication Time',
                                           profiler.TracerEventType.UserDefined,
@@ -72,8 +77,9 @@ def test_statistic_case1(self):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
         sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                               profiler.TracerEventType.Operator,
                                               60, 100, 1000, 1001)
@@ -92,10 +98,12 @@ def test_statistic_case1(self):
         conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                            profiler.TracerEventType.CudaRuntime,
                                            35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
-        conv2d_memcpy = DevicePythonNode(
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
         sync_batch_norm_infer_shape = HostPythonNode(
             'sync_batch_norm::infer_shape',
             profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -146,8 +154,8 @@ def test_statistic_case1(self):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
 
@@ -200,8 +208,9 @@ def test_statistic_case1(self):
             0)
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
-        self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
-                         .general_gpu_time, 60)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
@@ -222,10 +231,12 @@ def test_statistic_case2(self):
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
 
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
-        yolonet_node = HostPythonNode(
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
+        yolonet_node = HostPythonNode('Yolov3Net',
+                                      profiler.TracerEventType.Forward, 50, 110,
+                                      1000, 1001)
 
         userdefined_node = HostPythonNode('Communication Time',
                                           profiler.TracerEventType.UserDefined,
@@ -263,8 +274,9 @@ def test_statistic_case2(self):
         optimization_node = HostPythonNode(
             'Optimization', profiler.TracerEventType.Optimization, 220, 300,
             1000, 1001)
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 40,
+                                     1000, 1001)
         sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                               profiler.TracerEventType.Operator,
                                               60, 100, 1000, 1001)
@@ -283,10 +295,12 @@ def test_statistic_case2(self):
         conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                            profiler.TracerEventType.CudaRuntime,
                                            35, 40, 1000, 1001)
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
-        conv2d_memcpy = DevicePythonNode(
-            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
+                                         profiler.TracerEventType.Memcpy, 50,
+                                         60, 0, 0, 0)
         sync_batch_norm_infer_shape = HostPythonNode(
             'sync_batch_norm::infer_shape',
             profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
@@ -363,8 +377,8 @@ def test_statistic_case2(self):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
         distributed_summary = statistic_data.distributed_summary
@@ -433,8 +447,9 @@ def test_statistic_case2(self):
             0)
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
-        self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy']
-                         .general_gpu_time, 60)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
@@ -454,8 +469,9 @@ def test_statistic_case3(self):
         dataloader_node = HostPythonNode('Dataloader',
                                          profiler.TracerEventType.Dataloader, 5,
                                          15, 1000, 1001)
-        mobilenet_node = HostPythonNode(
-            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet',
+                                        profiler.TracerEventType.Forward, 20,
+                                        50, 1000, 1001)
 
         backward_node = HostPythonNode('Gradient Backward',
                                        profiler.TracerEventType.Backward, 120,
@@ -467,8 +483,9 @@ def test_statistic_case3(self):
                                           profiler.TracerEventType.UserDefined,
                                           60, 70, 1000, 1001)
 
-        conv2d_node = HostPythonNode(
-            'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d',
+                                     profiler.TracerEventType.Operator, 25, 25,
+                                     1000, 1001)
 
         conv2d_infer_shape = HostPythonNode(
             'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
@@ -480,8 +497,9 @@ def test_statistic_case3(self):
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
             1000, 1001)
 
-        conv2d_kernel = DevicePythonNode(
-            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel',
+                                         profiler.TracerEventType.Kernel, 35,
+                                         35, 0, 0, 0)
         another_kernel = DevicePythonNode(
             'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
             profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
@@ -500,15 +518,16 @@ def test_statistic_case3(self):
             'Process Cpu Utilization': '1.02',
             'System Cpu Utilization': '0.68'
         }
-        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
-                                                                   extra_info)
+        statistic_data = profiler.profiler_statistic.StatisticData(
+            thread_tree, extra_info)
         time_range_summary = statistic_data.time_range_summary
         event_summary = statistic_data.event_summary
 
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
         self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
-        self.assertEqual(event_summary.userdefined_items['Communication Time']
-                         .general_gpu_time, 0)
+        self.assertEqual(
+            event_summary.userdefined_items['Communication Time'].
+            general_gpu_time, 0)
         for sort_key in [
                 profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
                 profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
@@ -516,12 +535,11 @@ def test_statistic_case3(self):
                 profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
         ]:
             print(
-                profiler.profiler_statistic._build_table(
-                    statistic_data,
-                    sorted_by=sort_key,
-                    op_detail=True,
-                    thread_sep=False,
-                    time_unit='ms'))
+                profiler.profiler_statistic._build_table(statistic_data,
+                                                         sorted_by=sort_key,
+                                                         op_detail=True,
+                                                         thread_sep=False,
+                                                         time_unit='ms'))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py
index d73ed872888c4..b768aa7305158 100644
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py
@@ -23,6 +23,7 @@
 
 
 class TestProgram(unittest.TestCase):
+
     def test_program(self):
         b = main_program.current_block()
         self.assertEqual(-1, b.parent_idx)
@@ -54,15 +55,20 @@ def test_program(self):
     def test_program_clone(self):
         prog = Program()
 
-        x = prog.global_block().create_var(
-            name='X', shape=[1000, 784], dtype='float32')
+        x = prog.global_block().create_var(name='X',
+                                           shape=[1000, 784],
+                                           dtype='float32')
 
-        y = prog.global_block().create_var(
-            name='Y', shape=[784, 100], dtype='float32')
+        y = prog.global_block().create_var(name='Y',
+                                           shape=[784, 100],
+                                           dtype='float32')
         out = prog.global_block().create_var(name='Out', dtype='float32')
-        prog.global_block().append_op(
-            type="mul", inputs={'X': [x],
-                                'Y': [y]}, outputs={'Out': [out]})
+        prog.global_block().append_op(type="mul",
+                                      inputs={
+                                          'X': [x],
+                                          'Y': [y]
+                                      },
+                                      outputs={'Out': [out]})
 
         # FIXME(yuyang18): We manual compare the output string, since the order
         # of variable could be changed.
@@ -72,15 +78,20 @@ def test_program_clone(self):
     def test_parse_program_from_string(self):
         prog = Program()
 
-        x = prog.global_block().create_var(
-            name='X', shape=[1000, 784], dtype='float32')
+        x = prog.global_block().create_var(name='X',
+                                           shape=[1000, 784],
+                                           dtype='float32')
 
-        y = prog.global_block().create_var(
-            name='Y', shape=[784, 100], dtype='float32')
+        y = prog.global_block().create_var(name='Y',
+                                           shape=[784, 100],
+                                           dtype='float32')
         out = prog.global_block().create_var(name='Out', dtype='float32')
-        prog.global_block().append_op(
-            type="mul", inputs={'X': [x],
-                                'Y': [y]}, outputs={'Out': [out]})
+        prog.global_block().append_op(type="mul",
+                                      inputs={
+                                          'X': [x],
+                                          'Y': [y]
+                                      },
+                                      outputs={'Out': [out]})
 
         binary_str = prog.desc.serialize_to_string()
         prog_restored = Program.parse_from_string(binary_str)
@@ -100,18 +111,17 @@ def test_program_clone_with_parameter(self):
         self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
 
     def test_program_inference_optimize(self):
+
         def net():
-            reader = fluid.layers.py_reader(
-                capacity=10,
-                shapes=[[-1, 10], [-1, 1]],
-                lod_levels=[0, 0],
-                dtypes=['float32', 'int64'],
-                use_double_buffer=True)
+            reader = fluid.layers.py_reader(capacity=10,
+                                            shapes=[[-1, 10], [-1, 1]],
+                                            lod_levels=[0, 0],
+                                            dtypes=['float32', 'int64'],
+                                            use_double_buffer=True)
             in_data, label = fluid.layers.read_file(reader)
             predict_label = fluid.layers.fc(in_data, size=2, act='softmax')
             loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
+                fluid.layers.cross_entropy(input=predict_label, label=label))
 
             optimizer = fluid.optimizer.Adam()
             optimizer.minimize(loss)
@@ -163,18 +173,17 @@ def test_copy_info_from_error(self):
                           "program")
 
     def test_remove_training_info(self):
+
         def net():
-            reader = fluid.layers.py_reader(
-                capacity=10,
-                shapes=[[-1, 10], [-1, 1]],
-                lod_levels=[0, 0],
-                dtypes=['float32', 'int64'],
-                use_double_buffer=True)
+            reader = fluid.layers.py_reader(capacity=10,
+                                            shapes=[[-1, 10], [-1, 1]],
+                                            lod_levels=[0, 0],
+                                            dtypes=['float32', 'int64'],
+                                            use_double_buffer=True)
             in_data, label = fluid.layers.read_file(reader)
             predict_label = fluid.layers.fc(in_data, size=2, act='softmax')
             loss = fluid.layers.mean(
-                fluid.layers.cross_entropy(
-                    input=predict_label, label=label))
+                fluid.layers.cross_entropy(input=predict_label, label=label))
 
             optimizer = fluid.optimizer.Adam()
             optimizer.minimize(loss)
diff --git a/python/paddle/fluid/tests/unittests/test_program_code.py b/python/paddle/fluid/tests/unittests/test_program_code.py
index e82447519bf20..390644cec5484 100644
--- a/python/paddle/fluid/tests/unittests/test_program_code.py
+++ b/python/paddle/fluid/tests/unittests/test_program_code.py
@@ -21,19 +21,25 @@
 
 
 class TestProgramToReadableCode(unittest.TestCase):
+
     def setUp(self):
         self.program = fluid.Program()
         self.block = self.program.current_block()
-        self.var = self.block.create_var(
-            name="X", shape=[-1, 23, 48], dtype='float32')
-        self.param = self.block.create_parameter(
-            name="W", shape=[23, 48], dtype='float32', trainable=True)
-        self.op = self.block.append_op(
-            type="abs", inputs={"X": [self.var]}, outputs={"Out": [self.var]})
+        self.var = self.block.create_var(name="X",
+                                         shape=[-1, 23, 48],
+                                         dtype='float32')
+        self.param = self.block.create_parameter(name="W",
+                                                 shape=[23, 48],
+                                                 dtype='float32',
+                                                 trainable=True)
+        self.op = self.block.append_op(type="abs",
+                                       inputs={"X": [self.var]},
+                                       outputs={"Out": [self.var]})
         # add control flow op and sub block
         self.append_cond_op(self.program)
 
     def append_cond_op(self, program):
+
         def true_func():
             return layers.fill_constant(shape=[2, 3], dtype='int32', value=2)
 
diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
index a1a3b31a9766e..c602cfb4ad0b3 100755
--- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py
@@ -34,16 +34,19 @@ def lstm_net(use_feed):
     hid_dim2 = 96
     class_dim = 2
     emb_lr = 30.0
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
+    data = fluid.layers.data(name="words",
+                             shape=[1],
+                             dtype="int64",
+                             lod_level=1)
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
     emb = fluid.layers.embedding(
         input=data,
         size=[dict_dim, emb_dim],
         param_attr=fluid.ParamAttr(learning_rate=emb_lr))
     fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0, size=hid_dim * 4, is_reverse=False)
+    lstm_h, c = fluid.layers.dynamic_lstm(input=fc0,
+                                          size=hid_dim * 4,
+                                          is_reverse=False)
     lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
     lstm_max_tanh = fluid.layers.tanh(lstm_max)
     fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
@@ -63,8 +66,8 @@ def simple_fc_net_with_accuracy(use_feed):
             hidden,
             size=200,
             act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
     prediction = fluid.layers.fc(hidden, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     loss = fluid.layers.mean(loss)
@@ -125,13 +128,14 @@ def loss2(opt, pred, label, with_optimize):
 
 
 class TestProgramPruneBackward(unittest.TestCase):
+
     def program_compare(self, program_a, program_b):
         assert isinstance(
-            program_a, fluid.framework.
-            Program), "The first argument should be fluid.framework.Program."
+            program_a, fluid.framework.Program
+        ), "The first argument should be fluid.framework.Program."
         assert isinstance(
-            program_b, fluid.framework.
-            Program), "The second argument should be fluid.framework Program."
+            program_b, fluid.framework.Program
+        ), "The second argument should be fluid.framework Program."
 
         self.assertEqual(len(program_a.blocks), len(program_b.blocks))
         for idx in range(len(program_a.blocks)):
@@ -172,6 +176,7 @@ def check_prune_correctness(self, method, feed_dict, optimizer):
             self.assertEqual(loss_data_orig, loss_data_prune)
 
     def test_simple_fc_net(self):
+
         def optimizer():
             optimizer = fluid.optimizer.SGD(
                 learning_rate=0.001,
@@ -180,13 +185,15 @@ def optimizer():
 
         with self.program_scope_guard():
             img, label = init_data()
-            self.check_prune_correctness(
-                method=simple_fc_net,
-                feed_dict={"image": img,
-                           "label": label},
-                optimizer=optimizer)
+            self.check_prune_correctness(method=simple_fc_net,
+                                         feed_dict={
+                                             "image": img,
+                                             "label": label
+                                         },
+                                         optimizer=optimizer)
 
     def test_simple_fc_net_with_accuracy(self):
+
         def optimizer():
             optimizer = fluid.optimizer.SGD(
                 learning_rate=0.001,
@@ -195,13 +202,15 @@ def optimizer():
 
         with self.program_scope_guard():
             img, label = init_data()
-            self.check_prune_correctness(
-                method=simple_fc_net_with_accuracy,
-                feed_dict={"image": img,
-                           "label": label},
-                optimizer=optimizer)
+            self.check_prune_correctness(method=simple_fc_net_with_accuracy,
+                                         feed_dict={
+                                             "image": img,
+                                             "label": label
+                                         },
+                                         optimizer=optimizer)
 
     def test_batchnorm_fc(self):
+
         def optimizer():
             optimizer = fluid.optimizer.SGD(
                 learning_rate=0.001,
@@ -210,11 +219,12 @@ def optimizer():
 
         with self.program_scope_guard():
             img, label = init_data()
-            self.check_prune_correctness(
-                method=fc_with_batchnorm,
-                feed_dict={"image": img,
-                           "label": label},
-                optimizer=optimizer)
+            self.check_prune_correctness(method=fc_with_batchnorm,
+                                         feed_dict={
+                                             "image": img,
+                                             "label": label
+                                         },
+                                         optimizer=optimizer)
 
     def test_seresnet(self):
         with self.program_scope_guard():
@@ -224,6 +234,7 @@ def test_seresnet(self):
                 optimizer=seresnext_net.optimizer)
 
     def test_transformer(self):
+
         def optimizer():
             optimizer = fluid.optimizer.Adam(
                 learning_rate=0.001,
@@ -234,10 +245,12 @@ def optimizer():
             # the program argument is used to distinguish Program and CompiledProgram
             feed_dict = get_feed_data_reader().get_next(
                 fluid.Executor(core.CPUPlace()), fluid.default_main_program())
-            self.check_prune_correctness(
-                method=transformer, feed_dict=feed_dict, optimizer=optimizer)
+            self.check_prune_correctness(method=transformer,
+                                         feed_dict=feed_dict,
+                                         optimizer=optimizer)
 
     def test_lstm(self):
+
         def optimizer():
             optimizer = fluid.optimizer.Adagrad(
                 learning_rate=0.001,
@@ -247,16 +260,20 @@ def optimizer():
         with self.program_scope_guard():
             word_dict_size = 5147
             reader = fake_imdb_reader(word_dict_size, 1)
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            feeder = fluid.DataFeeder(
-                feed_list=[data, label], place=core.CPUPlace())
+            feeder = fluid.DataFeeder(feed_list=[data, label],
+                                      place=core.CPUPlace())
             feed_data = feeder.feed(reader())
-            self.check_prune_correctness(
-                method=lstm_net, feed_dict=feed_data, optimizer=optimizer)
+            self.check_prune_correctness(method=lstm_net,
+                                         feed_dict=feed_data,
+                                         optimizer=optimizer)
 
     def test_cond(self):
+
         def optimizer():
             optimizer = fluid.optimizer.SGD(learning_rate=0.01)
             return optimizer
@@ -265,8 +282,9 @@ def optimizer():
             x_in = np.random.random(size=(10, 4)).astype('float32')
             label_in = np.random.randint(1, size=(10, 1)).astype('int64')
             feed_dict = {'x': x_in, 'label': label_in}
-            self.check_prune_correctness(
-                method=cond_net, feed_dict=feed_dict, optimizer=optimizer)
+            self.check_prune_correctness(method=cond_net,
+                                         feed_dict=feed_dict,
+                                         optimizer=optimizer)
 
     def test_optimization_in_cond(self):
         x_in = np.random.random(size=(10, 4)).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_program_to_string.py b/python/paddle/fluid/tests/unittests/test_program_to_string.py
index 22ba43bde2ad4..23965e36a35bc 100644
--- a/python/paddle/fluid/tests/unittests/test_program_to_string.py
+++ b/python/paddle/fluid/tests/unittests/test_program_to_string.py
@@ -18,14 +18,17 @@
 
 
 class TestProgram(unittest.TestCase):
+
     def test_program_to_string(self):
         prog = fluid.default_main_program()
-        a = fluid.layers.data(
-            name="X", shape=[2, 3], dtype="float32", append_batch_size=False)
+        a = fluid.layers.data(name="X",
+                              shape=[2, 3],
+                              dtype="float32",
+                              append_batch_size=False)
         c = fluid.layers.fc(a, size=3)
         prog_string = prog.to_string(throw_on_error=True, with_details=False)
-        prog_string_with_details = prog.to_string(
-            throw_on_error=False, with_details=True)
+        prog_string_with_details = prog.to_string(throw_on_error=False,
+                                                  with_details=True)
         assert prog_string is not None
         assert len(prog_string_with_details) > len(prog_string)
 
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf.py b/python/paddle/fluid/tests/unittests/test_protobuf.py
index 7b80927c48d02..4648d9f90ab9f 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf.py
@@ -19,6 +19,7 @@
 
 
 class TestFrameworkProto(unittest.TestCase):
+
     def test_all(self):
         op_proto = framework_pb2.OpProto()
         ipt0 = op_proto.inputs.add()
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index 7fb2171f611ad..1cbf2ccd7b70b 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -21,6 +21,7 @@
 
 
 class TestOpDesc(unittest.TestCase):
+
     def test_op_desc(self):
         program_desc = core.ProgramDesc()
         self.assertIsNotNone(program_desc)
@@ -81,6 +82,7 @@ def test_op_desc(self):
 
 
 class TestProgramDesc(unittest.TestCase):
+
     def test_instance(self):
         program_desc = core.ProgramDesc()
         self.assertIsNotNone(program_desc)
@@ -108,6 +110,7 @@ def test_append_block(self):
 
 
 class TestVarDesc(unittest.TestCase):
+
     def test_shape(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
@@ -164,6 +167,7 @@ def test_multiple_lod_level(self):
 
 
 class TestBlockDesc(unittest.TestCase):
+
     def test_add_var(self):
         program_desc = core.ProgramDesc()
         self.assertIsNotNone(program_desc)
diff --git a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
index 57e96f1fa34fa..ecccd7ba6c3a0 100644
--- a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
@@ -20,6 +20,7 @@
 
 
 class TestProximalAdagradOp(OpTest):
+
     def setUp(self):
         self.op_type = "proximal_adagrad"
         w = np.random.random((102, 105)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
index 067502baecc73..b130a19597191 100644
--- a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
@@ -20,6 +20,7 @@
 
 
 class TestProximalGDOp(OpTest):
+
     def setUp(self):
         self.op_type = "proximal_gd"
         w = np.random.random((102, 105)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
index 8e5ba7c3363a1..71b07155f4015 100644
--- a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
@@ -24,13 +24,16 @@
 
 
 class TestPRROIPoolOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
         self.prRoIPool = PyPrRoIPool()
-        self.outs = self.prRoIPool.compute(
-            self.x, self.rois, self.output_channels, self.spatial_scale,
-            self.pooled_height, self.pooled_width).astype('float32')
+        self.outs = self.prRoIPool.compute(self.x, self.rois,
+                                           self.output_channels,
+                                           self.spatial_scale,
+                                           self.pooled_height,
+                                           self.pooled_width).astype('float32')
         self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
         self.attrs = {
             'output_channels': self.output_channels,
@@ -95,8 +98,10 @@ def run_net(self, place):
                 name="X",
                 shape=[self.channels, self.height, self.width],
                 dtype="float32")
-            rois = fluid.layers.data(
-                name="ROIs", shape=[4], dtype="float32", lod_level=1)
+            rois = fluid.layers.data(name="ROIs",
+                                     shape=[4],
+                                     dtype="float32",
+                                     lod_level=1)
             output = fluid.layers.prroi_pool(x, rois, 0.25, 2, 2)
             loss = fluid.layers.mean(output)
             optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
@@ -106,9 +111,10 @@ def run_net(self, place):
                                                  self.rois_lod, place)
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
-            exe.run(fluid.default_main_program(),
-                    {'X': input_x,
-                     "ROIs": input_rois})
+            exe.run(fluid.default_main_program(), {
+                'X': input_x,
+                "ROIs": input_rois
+            })
 
     def test_net(self):
         places = [fluid.CPUPlace()]
@@ -119,10 +125,13 @@ def test_net(self):
 
     def test_errors(self):
         with program_guard(Program(), Program()):
-            x = fluid.layers.data(
-                name="x", shape=[245, 30, 30], dtype="float32")
-            rois = fluid.layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
+            x = fluid.layers.data(name="x",
+                                  shape=[245, 30, 30],
+                                  dtype="float32")
+            rois = fluid.layers.data(name="rois",
+                                     shape=[4],
+                                     dtype="float32",
+                                     lod_level=1)
             # spatial_scale must be float type
             self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 2, 7,
                               7)
@@ -135,13 +144,16 @@ def test_errors(self):
 
 
 class TestPRROIPoolOpTensorRoIs(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
         self.prRoIPool = PyPrRoIPool()
-        self.outs = self.prRoIPool.compute(
-            self.x, self.rois, self.output_channels, self.spatial_scale,
-            self.pooled_height, self.pooled_width).astype('float32')
+        self.outs = self.prRoIPool.compute(self.x, self.rois,
+                                           self.output_channels,
+                                           self.spatial_scale,
+                                           self.pooled_height,
+                                           self.pooled_width).astype('float32')
 
         self.rois_index = np.array(self.rois_lod).reshape([-1]).astype(np.int64)
         self.inputs = {
@@ -213,10 +225,15 @@ def run_net(self, place):
                 shape=[self.channels, self.height, self.width],
                 dtype="float32")
             rois = fluid.layers.data(name="ROIs", shape=[4], dtype="float32")
-            rois_index = fluid.layers.data(
-                name='rois_idx', shape=[], dtype="int64")
-            output = fluid.layers.prroi_pool(
-                x, rois, 0.25, 2, 2, batch_roi_nums=rois_index)
+            rois_index = fluid.layers.data(name='rois_idx',
+                                           shape=[],
+                                           dtype="int64")
+            output = fluid.layers.prroi_pool(x,
+                                             rois,
+                                             0.25,
+                                             2,
+                                             2,
+                                             batch_roi_nums=rois_index)
             loss = fluid.layers.mean(output)
             optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
             optimizer.minimize(loss)
@@ -237,10 +254,13 @@ def test_net(self):
 
     def test_errors(self):
         with program_guard(Program(), Program()):
-            x = fluid.layers.data(
-                name="x", shape=[245, 30, 30], dtype="float32")
-            rois = fluid.layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
+            x = fluid.layers.data(name="x",
+                                  shape=[245, 30, 30],
+                                  dtype="float32")
+            rois = fluid.layers.data(name="rois",
+                                     shape=[4],
+                                     dtype="float32",
+                                     lod_level=1)
             # spatial_scale must be float type
             self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 2, 7,
                               7)
@@ -252,27 +272,24 @@ def test_errors(self):
                               7, 0.7)
 
             def test_bad_x():
-                x = fluid.layers.data(
-                    name='data1',
-                    shape=[2, 3, 16, 16],
-                    dtype='int64',
-                    append_batch_size=False)
-                label = fluid.layers.data(
-                    name='label1',
-                    shape=[2, 4],
-                    dtype='float32',
-                    lod_level=1,
-                    append_batch_size=False)
+                x = fluid.layers.data(name='data1',
+                                      shape=[2, 3, 16, 16],
+                                      dtype='int64',
+                                      append_batch_size=False)
+                label = fluid.layers.data(name='label1',
+                                          shape=[2, 4],
+                                          dtype='float32',
+                                          lod_level=1,
+                                          append_batch_size=False)
                 output = fluid.layers.prroi_pool(x, label, 0.25, 2, 2)
 
             self.assertRaises(TypeError, test_bad_x)
 
             def test_bad_y():
-                x = fluid.layers.data(
-                    name='data2',
-                    shape=[2, 3, 16, 16],
-                    dtype='float32',
-                    append_batch_size=False)
+                x = fluid.layers.data(name='data2',
+                                      shape=[2, 3, 16, 16],
+                                      dtype='float32',
+                                      append_batch_size=False)
                 label = [[1, 2, 3, 4], [2, 3, 4, 5]]
                 output = fluid.layers.prroi_pool(x, label, 0.25, 2, 2)
 
diff --git a/python/paddle/fluid/tests/unittests/test_prune.py b/python/paddle/fluid/tests/unittests/test_prune.py
index 3755d92858a5c..c320e3fbf58b2 100644
--- a/python/paddle/fluid/tests/unittests/test_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_prune.py
@@ -25,6 +25,7 @@
 
 
 class TestPrune(unittest.TestCase):
+
     def net(self):
         x = fluid.layers.data(name='x', shape=[2], dtype='float32')
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
@@ -40,9 +41,9 @@ def test_prune_with_input(self):
         with fluid.program_guard(program, startup_program):
             (x, y, label, loss) = self.net()
         self.assertEqual(len(block.ops), 5)
-        self.assertEqual([op.type for op in block.ops], [
-            "mul", "elementwise_add", "softmax", "cross_entropy2", "mean"
-        ])
+        self.assertEqual(
+            [op.type for op in block.ops],
+            ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"])
         pruned_program = program._prune_with_input(
             feeded_var_names=[y.name, label.name], targets=[loss])
         self.assertEqual(len(pruned_program.global_block().ops), 2)
@@ -56,9 +57,9 @@ def test_prune(self):
         with fluid.program_guard(program, startup_program):
             (x, y, label, loss) = self.net()
         self.assertEqual(len(block.ops), 5)
-        self.assertEqual([op.type for op in block.ops], [
-            "mul", "elementwise_add", "softmax", "cross_entropy2", "mean"
-        ])
+        self.assertEqual(
+            [op.type for op in block.ops],
+            ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"])
         pruned_program = program._prune(targets=[loss])
         self.assertEqual(len(pruned_program.global_block().ops), 5)
         self.assertEqual(
@@ -72,9 +73,9 @@ def test_prune_target_not_list(self):
         with fluid.program_guard(program, startup_program):
             (x, y, label, loss) = self.net()
         self.assertEqual(len(block.ops), 5)
-        self.assertEqual([op.type for op in block.ops], [
-            "mul", "elementwise_add", "softmax", "cross_entropy2", "mean"
-        ])
+        self.assertEqual(
+            [op.type for op in block.ops],
+            ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"])
         pruned_program = program._prune(targets=loss)
         self.assertEqual(len(pruned_program.global_block().ops), 5)
         self.assertEqual(
@@ -88,9 +89,9 @@ def test_prune_target_none(self):
         with fluid.program_guard(program, startup_program):
             (x, y, label, loss) = self.net()
         self.assertEqual(len(block.ops), 5)
-        self.assertEqual([op.type for op in block.ops], [
-            "mul", "elementwise_add", "softmax", "cross_entropy2", "mean"
-        ])
+        self.assertEqual(
+            [op.type for op in block.ops],
+            ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"])
         try:
             pruned_program = program._prune(targets=None)
         except ValueError as e:
@@ -113,6 +114,7 @@ def _mock_guard(mock):
 
 
 class TestExecutorRunAutoPrune(unittest.TestCase):
+
     def net1(self):
         x = fluid.layers.data(name='x', shape=[2], dtype='float32')
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
@@ -176,8 +178,10 @@ def test_not_prune(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={'x': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=False)
                 self.assertIsNotNone(scope.find_var(loss1.name))
@@ -200,8 +204,10 @@ def test_prune_fetches_without_optimizer(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={'x': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=True)
                 self.assertIsNotNone(scope.find_var(loss1.name))
@@ -231,8 +237,10 @@ def test_prune_fetches_with_optimizer(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={'x': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=True)
                 self.assertIsNotNone(scope.find_var(loss1.name))
@@ -254,15 +262,17 @@ def test_prune_compiled_program(self):
                 exe = fluid.Executor(fluid.CPUPlace())
                 exe.run(startup_program)
                 compiled_prog = fluid.CompiledProgram(
-                    program).with_data_parallel(
-                        loss_name=loss1.name, places=fluid.CPUPlace())
+                    program).with_data_parallel(loss_name=loss1.name,
+                                                places=fluid.CPUPlace())
                 weight_init = np.array(
                     scope.find_var(w_param_attrs.name).get_tensor())
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(compiled_prog,
-                              feed={'x': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=True)
                 self.assertIsNotNone(scope.find_var(loss1.name))
@@ -286,8 +296,10 @@ def test_prune_feed_without_optimizer(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={y.name: x_np,
-                                    'label': label_np},
+                              feed={
+                                  y.name: x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=True)
                 self.assertIsNotNone(scope.find_var(loss1.name))
@@ -310,14 +322,15 @@ def test_prune_feed_with_optimizer(self):
                 exe.run(startup_program)
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
-                self.assertRaises(
-                    Exception,
-                    exe.run,
-                    program,
-                    feed={y.name: x_np,
-                          'label': label_np},
-                    fetch_list=[loss1.name],
-                    use_prune=True)
+                self.assertRaises(Exception,
+                                  exe.run,
+                                  program,
+                                  feed={
+                                      y.name: x_np,
+                                      'label': label_np
+                                  },
+                                  fetch_list=[loss1.name],
+                                  use_prune=True)
                 self.assertIsNotNone(scope.find_var(loss1.name))
                 self.assertIsNone(scope.find_var(loss2.name))
 
@@ -343,12 +356,14 @@ def test_prune_with_cache_program(self):
                     sgd_optimizer.minimize(loss1)
                     exe.run(startup_program)
                     x_np = np.random.random(size=(10, 2)).astype('float32')
-                    label_np = np.random.randint(
-                        1, size=(10, 1)).astype('int64')
+                    label_np = np.random.randint(1,
+                                                 size=(10, 1)).astype('int64')
                     for i in range(10):
                         res = exe.run(program,
-                                      feed={'x': x_np,
-                                            'label': label_np},
+                                      feed={
+                                          'x': x_np,
+                                          'label': label_np
+                                      },
                                       fetch_list=[loss1.name],
                                       use_prune=True)
                         if i == 0:
@@ -380,8 +395,8 @@ def test_prune_with_cache_program2(self):
                     train2 = adam_optimizer2.minimize(loss2)
                     exe.run(startup_program)
                     x_np = np.random.random(size=(10, 2)).astype('float32')
-                    label_np = np.random.randint(
-                        1, size=(10, 1)).astype('int64')
+                    label_np = np.random.randint(1,
+                                                 size=(10, 1)).astype('int64')
 
                     for i in range(10):
                         if i % 2:
@@ -431,15 +446,17 @@ def test_prune_with_cache_compiled_program(self):
                     sgd_optimizer.minimize(loss1)
                     exe.run(startup_program)
                     x_np = np.random.random(size=(10, 2)).astype('float32')
-                    label_np = np.random.randint(
-                        1, size=(10, 1)).astype('int64')
+                    label_np = np.random.randint(1,
+                                                 size=(10, 1)).astype('int64')
                     compiled_prog = fluid.CompiledProgram(
-                        program).with_data_parallel(
-                            loss_name=loss1.name, places=fluid.CPUPlace())
+                        program).with_data_parallel(loss_name=loss1.name,
+                                                    places=fluid.CPUPlace())
                     for i in range(10):
                         res = exe.run(compiled_prog,
-                                      feed={'x': x_np,
-                                            'label': label_np},
+                                      feed={
+                                          'x': x_np,
+                                          'label': label_np
+                                      },
                                       fetch_list=[loss1.name],
                                       use_prune=True)
                         if i == 0:
@@ -468,8 +485,10 @@ def test_prune_with_multi_optimizers(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={'x': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=False)
                 weight_without_prune = np.array(
@@ -480,8 +499,10 @@ def test_prune_with_multi_optimizers(self):
         with fluid.scope_guard(scope):
             exe.run(startup_program)
             res = exe.run(program,
-                          feed={'x': x_np,
-                                'label': label_np},
+                          feed={
+                              'x': x_np,
+                              'label': label_np
+                          },
                           fetch_list=[loss1.name, train1],
                           use_prune=True)
             weight_with_prune = np.array(
@@ -492,8 +513,10 @@ def test_prune_with_multi_optimizers(self):
         with fluid.scope_guard(scope):
             exe.run(startup_program)
             exe.run(cloned_program,
-                    feed={'x': x_np,
-                          'label': label_np},
+                    feed={
+                        'x': x_np,
+                        'label': label_np
+                    },
                     fetch_list=[loss1.name],
                     use_prune=False)
             weight_expected = np.array(
@@ -528,11 +551,11 @@ def test_prune_with_multi_devices(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 compiled_prog1 = fluid.CompiledProgram(
-                    program).with_data_parallel(
-                        loss_name=loss1.name, places=[fluid.CPUPlace()] * 2)
+                    program).with_data_parallel(loss_name=loss1.name,
+                                                places=[fluid.CPUPlace()] * 2)
                 compiled_prog2 = fluid.CompiledProgram(
-                    program).with_data_parallel(
-                        loss_name=loss2.name, places=[fluid.CPUPlace()] * 2)
+                    program).with_data_parallel(loss_name=loss2.name,
+                                                places=[fluid.CPUPlace()] * 2)
                 for i in range(10):
                     if i % 2 == 1:
                         res = exe.run(compiled_prog1,
@@ -547,8 +570,10 @@ def test_prune_with_multi_devices(self):
                                       use_prune=True)
                     else:
                         res = exe.run(compiled_prog2,
-                                      feed={'x2': x_np,
-                                            'label': label_np},
+                                      feed={
+                                          'x2': x_np,
+                                          'label': label_np
+                                      },
                                       fetch_list=[loss2.name, train2],
                                       use_prune=True)
                 weight1 = np.array(
@@ -560,9 +585,11 @@ def test_prune_with_multi_devices(self):
             for i in range(10):
                 if i % 2 == 1:
                     exe.run(cloned_program,
-                            feed={'x1': x_np,
-                                  'x2': x_np,
-                                  'label': label_np},
+                            feed={
+                                'x1': x_np,
+                                'x2': x_np,
+                                'label': label_np
+                            },
                             fetch_list=[loss1.name],
                             use_prune=False)
             weight2 = np.array(scope.find_var(w1_param_attrs.name).get_tensor())
@@ -591,8 +618,10 @@ def test_prune_program_with_tupe_in_fetch_list(self):
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
 
                 res = exe.run(program,
-                              feed={'x': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=False)
 
@@ -604,8 +633,10 @@ def test_prune_program_with_tupe_in_fetch_list(self):
         with fluid.scope_guard(scope):
             exe.run(startup_program)
             res = exe.run(program,
-                          feed={'x': x_np,
-                                'label': label_np},
+                          feed={
+                              'x': x_np,
+                              'label': label_np
+                          },
                           fetch_list=[loss1.name, train1],
                           use_prune=True)
             weight_with_prune = np.array(
@@ -616,8 +647,10 @@ def test_prune_program_with_tupe_in_fetch_list(self):
         with fluid.scope_guard(scope):
             exe.run(startup_program)
             exe.run(cloned_program,
-                    feed={'x': x_np,
-                          'label': label_np},
+                    feed={
+                        'x': x_np,
+                        'label': label_np
+                    },
                     fetch_list=[loss1.name],
                     use_prune=False)
             weight_expected = np.array(
@@ -654,8 +687,10 @@ def test_prune_program_partial_parameter_updated(self):
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
 
                 res = exe.run(program,
-                              feed={'x1': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x1': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name, train1],
                               use_prune=True)
                 self.assertIsNotNone(scope.find_var(w1_param_attrs.name))
@@ -691,8 +726,10 @@ def test_prune_override_use_prune(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={'x': x_np,
-                                    'label': label_np},
+                              feed={
+                                  'x': x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[loss1.name],
                               use_prune=False)
 
@@ -704,8 +741,10 @@ def test_prune_override_use_prune(self):
         with fluid.scope_guard(scope):
             exe.run(startup_program)
             res = exe.run(program,
-                          feed={'x': x_np,
-                                'label': label_np},
+                          feed={
+                              'x': x_np,
+                              'label': label_np
+                          },
                           fetch_list=[loss1.name, train1])
             weight_with_prune = np.array(
                 scope.find_var(w_param_attrs.name).get_tensor())
@@ -715,8 +754,10 @@ def test_prune_override_use_prune(self):
         with fluid.scope_guard(scope):
             exe.run(startup_program)
             exe.run(cloned_program,
-                    feed={'x': x_np,
-                          'label': label_np},
+                    feed={
+                        'x': x_np,
+                        'label': label_np
+                    },
                     fetch_list=[loss1.name],
                     use_prune=False)
             weight_expected = np.array(
@@ -740,8 +781,10 @@ def test_prune_feed_var_in_fetchlist_1(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={y.name: x_np,
-                                    'label': label_np},
+                              feed={
+                                  y.name: x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[y.name, loss1.name],
                               use_prune=True)
                 self.assertIsNotNone(scope.find_var(loss1.name))
@@ -767,8 +810,10 @@ def test_prune_feed_var_in_fetchlist_2(self):
                 x_np = np.random.random(size=(10, 2)).astype('float32')
                 label_np = np.random.randint(1, size=(10, 1)).astype('int64')
                 res = exe.run(program,
-                              feed={x.name: x_np,
-                                    'label': label_np},
+                              feed={
+                                  x.name: x_np,
+                                  'label': label_np
+                              },
                               fetch_list=[x.name, loss1.name],
                               use_prune=True)
                 self.assertIsNotNone(scope.find_var(loss1.name))
diff --git a/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py b/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py
index 8a641a6b4faf9..4b73e05f502c6 100644
--- a/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -67,9 +67,10 @@ def assert_allclose(output, expected, n_expert):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestPruneGateByCapacityAPI1(unittest.TestCase):
+
     def init_test_case(self):
-        self.gate_idx = np.random.randint(
-            0, self.n_expert, size=(200, )).astype(self.dtype)
+        self.gate_idx = np.random.randint(0, self.n_expert,
+                                          size=(200, )).astype(self.dtype)
         expert_count = count(self.gate_idx, self.n_expert * self.n_worker)
         capacity = np.random.randint(10, 200, size=(self.n_expert, ))
         self.expert_count = limit_by_capacity(expert_count, capacity,
@@ -88,8 +89,9 @@ def setUp(self):
     def test_static_api(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
-            gate_idx_tensor = paddle.static.data(
-                'GateIdx', shape=self.gate_idx.shape, dtype="int64")
+            gate_idx_tensor = paddle.static.data('GateIdx',
+                                                 shape=self.gate_idx.shape,
+                                                 dtype="int64")
             expert_count_tensor = paddle.static.data(
                 'ExpertCount', shape=self.expert_count.shape, dtype="int64")
             out = utils._prune_gate_by_capacity(gate_idx_tensor,
@@ -107,8 +109,9 @@ def func_dygraph_api(self):
         paddle.disable_static(self.place)
         gate_idx_tensor = paddle.to_tensor(self.gate_idx)
         expert_count_tensor = paddle.to_tensor(self.expert_count)
-        out = utils._prune_gate_by_capacity(
-            gate_idx_tensor, expert_count_tensor, self.n_expert, self.n_worker)
+        out = utils._prune_gate_by_capacity(gate_idx_tensor,
+                                            expert_count_tensor, self.n_expert,
+                                            self.n_worker)
         assert_allclose(out.numpy(), self.out, self.n_expert)
 
     def test_dygraph_api(self):
@@ -120,6 +123,7 @@ def test_dygraph_api(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestPruneGateByCapacityAPI2(TestPruneGateByCapacityAPI1):
+
     def setUp(self):
         self.n_expert = 12
         self.n_worker = 1
diff --git a/python/paddle/fluid/tests/unittests/test_ps_dispatcher.py b/python/paddle/fluid/tests/unittests/test_ps_dispatcher.py
index 16abb8a7da4e6..d9de42ae562ff 100644
--- a/python/paddle/fluid/tests/unittests/test_ps_dispatcher.py
+++ b/python/paddle/fluid/tests/unittests/test_ps_dispatcher.py
@@ -19,6 +19,7 @@
 
 
 class TestPsDispatcher(unittest.TestCase):
+
     def setUp(self):
         self.points = [
             "127.0.0.1:1001", "127.0.0.1:1002", "127.0.0.1:1003",
@@ -34,7 +35,9 @@ def test_base(self):
             base.dispatch([])
 
     def test_hash(self):
+
         class Var:
+
             def __init__(self, index):
                 self._name = "var_{}".format(index)
 
@@ -52,7 +55,9 @@ def name(self):
         self.assertEqual(len(eplist), 4)
 
     def test_round_rodin(self):
+
         class Var:
+
             def __init__(self, index):
                 self._name = "var_{}".format(index)
 
diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
index 39dec982b6607..3e3529b2240da 100644
--- a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
@@ -79,12 +79,14 @@ def calc_psroi_pool(x, rois, rois_num_per_img, output_channels, spatial_scale,
                         for iw in range(wstart, wend):
                             out_sum += x_i[c_in, ih, iw]
                     bin_area = (hend - hstart) * (wend - wstart)
-                    out_data[i, c, ph, pw] = 0. if is_empty else (
-                        out_sum / float(bin_area))
+                    out_data[i, c, ph,
+                             pw] = 0. if is_empty else (out_sum /
+                                                        float(bin_area))
     return out_data
 
 
 class TestPSROIPoolOp(OpTest):
+
     def set_data(self):
         paddle.enable_static()
         self.init_test_case()
@@ -141,12 +143,13 @@ def make_rois(self):
         self.rois_num = len(rois)
         self.rois_with_batch_id = np.array(rois).astype('float64')
         self.boxes = self.rois_with_batch_id[:, 1:]
-        self.boxes_num = np.array(
-            [bno + 1 for bno in range(self.batch_size)]).astype('int32')
+        self.boxes_num = np.array([bno + 1 for bno in range(self.batch_size)
+                                   ]).astype('int32')
 
     def setUp(self):
         self.op_type = 'psroi_pool'
-        self.python_api = lambda x, boxes, boxes_num, pooled_height, pooled_width, output_channels, spatial_scale: paddle.vision.ops.psroi_pool(x, boxes, boxes_num, (pooled_height, pooled_width), spatial_scale)
+        self.python_api = lambda x, boxes, boxes_num, pooled_height, pooled_width, output_channels, spatial_scale: paddle.vision.ops.psroi_pool(
+            x, boxes, boxes_num, (pooled_height, pooled_width), spatial_scale)
         self.set_data()
 
     def test_check_output(self):
@@ -157,29 +160,31 @@ def test_check_grad(self):
 
 
 class TestPSROIPoolDynamicFunctionAPI(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.random([2, 490, 28, 28]).astype(np.float32)
-        self.boxes = np.array(
-            [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32)
+        self.boxes = np.array([[1, 5, 8, 10], [4, 2, 6, 7],
+                               [12, 12, 19, 21]]).astype(np.float32)
         self.boxes_num = np.array([1, 2]).astype(np.int32)
 
     def test_output_size(self):
+
         def test_output_size_is_int():
             output_size = 7
-            out = paddle.vision.ops.psroi_pool(
-                paddle.to_tensor(self.x),
-                paddle.to_tensor(self.boxes),
-                paddle.to_tensor(self.boxes_num), output_size).numpy()
+            out = paddle.vision.ops.psroi_pool(paddle.to_tensor(self.x),
+                                               paddle.to_tensor(self.boxes),
+                                               paddle.to_tensor(self.boxes_num),
+                                               output_size).numpy()
             expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10,
                                          1.0, 7, 7)
             self.assertTrue(np.allclose(out, expect_out))
 
         def test_output_size_is_tuple():
             output_size = (7, 7)
-            out = paddle.vision.ops.psroi_pool(
-                paddle.to_tensor(self.x),
-                paddle.to_tensor(self.boxes),
-                paddle.to_tensor(self.boxes_num), output_size).numpy()
+            out = paddle.vision.ops.psroi_pool(paddle.to_tensor(self.x),
+                                               paddle.to_tensor(self.boxes),
+                                               paddle.to_tensor(self.boxes_num),
+                                               output_size).numpy()
             expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10,
                                          1.0, 7, 7)
             self.assertTrue(np.allclose(out, expect_out))
@@ -205,6 +210,7 @@ def test_dytype_is_float64():
 
 
 class TestPSROIPoolDynamicClassAPI(unittest.TestCase):
+
     def setUp(self):
         self.x = np.random.random([2, 128, 32, 32]).astype(np.float32)
         self.boxes = np.array([[3, 5, 6, 13], [7, 4, 22, 18], [4, 5, 7, 10],
@@ -212,32 +218,31 @@ def setUp(self):
         self.boxes_num = np.array([2, 2]).astype(np.int32)
 
     def test_output_size(self):
+
         def test_output_size_is_int():
             psroi_module = paddle.vision.ops.PSRoIPool(8, 1.1)
-            out = psroi_module(
-                paddle.to_tensor(self.x),
-                paddle.to_tensor(self.boxes),
-                paddle.to_tensor(self.boxes_num)).numpy()
+            out = psroi_module(paddle.to_tensor(self.x),
+                               paddle.to_tensor(self.boxes),
+                               paddle.to_tensor(self.boxes_num)).numpy()
             expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2,
                                          1.1, 8, 8)
             self.assertTrue(np.allclose(out, expect_out))
 
         def test_output_size_is_tuple():
             psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1)
-            out = psroi_pool_module(
-                paddle.to_tensor(self.x),
-                paddle.to_tensor(self.boxes),
-                paddle.to_tensor(self.boxes_num)).numpy()
+            out = psroi_pool_module(paddle.to_tensor(self.x),
+                                    paddle.to_tensor(self.boxes),
+                                    paddle.to_tensor(self.boxes_num)).numpy()
             expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2,
                                          1.1, 8, 8)
             self.assertTrue(np.allclose(out, expect_out))
 
         def test_dytype_is_float64():
             psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1)
-            out = psroi_pool_module(
-                paddle.to_tensor(self.x, 'float64'),
-                paddle.to_tensor(self.boxes, 'float64'),
-                paddle.to_tensor(self.boxes_num, 'int32')).numpy()
+            out = psroi_pool_module(paddle.to_tensor(self.x, 'float64'),
+                                    paddle.to_tensor(self.boxes, 'float64'),
+                                    paddle.to_tensor(self.boxes_num,
+                                                     'int32')).numpy()
             expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2,
                                          1.1, 8, 8)
             self.assertTrue(np.allclose(out, expect_out))
@@ -254,6 +259,7 @@ def test_dytype_is_float64():
 
 
 class TestPSROIPoolBoxesNumError(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         self.x = paddle.uniform([2, 490, 28, 28], dtype='float32')
@@ -261,22 +267,28 @@ def setUp(self):
             [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], 'float32')
 
     def test_errors(self):
+
         def test_boxes_num_nums_error():
             boxes_num = paddle.to_tensor([1, 5], 'int32')
-            out = paddle.vision.ops.psroi_pool(
-                self.x, self.boxes, boxes_num, output_size=7)
+            out = paddle.vision.ops.psroi_pool(self.x,
+                                               self.boxes,
+                                               boxes_num,
+                                               output_size=7)
 
         self.assertRaises(ValueError, test_boxes_num_nums_error)
 
         def test_boxes_num_length_error():
             boxes_num = paddle.to_tensor([1, 1, 1], 'int32')
-            out = paddle.vision.ops.psroi_pool(
-                self.x, self.boxes, boxes_num, output_size=7)
+            out = paddle.vision.ops.psroi_pool(self.x,
+                                               self.boxes,
+                                               boxes_num,
+                                               output_size=7)
 
         self.assertRaises(ValueError, test_boxes_num_length_error)
 
 
 class TestPSROIPoolChannelError(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
         self.x = paddle.uniform([2, 490, 28, 28], dtype='float32')
@@ -285,6 +297,7 @@ def setUp(self):
         self.output_size = 4
 
     def test_errors(self):
+
         def test_channel_error():
             boxes_num = paddle.to_tensor([2, 1], 'int32')
             out = paddle.vision.ops.psroi_pool(self.x, self.boxes, boxes_num,
@@ -294,15 +307,17 @@ def test_channel_error():
 
 
 class TestPSROIPoolStaticAPI(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
-        self.x_placeholder = paddle.static.data(
-            name='x', shape=[2, 490, 28, 28])
+        self.x_placeholder = paddle.static.data(name='x',
+                                                shape=[2, 490, 28, 28])
         self.x = np.random.random([2, 490, 28, 28]).astype(np.float32)
-        self.boxes_placeholder = paddle.static.data(
-            name='boxes', shape=[3, 4], lod_level=1)
-        self.boxes = np.array(
-            [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32)
+        self.boxes_placeholder = paddle.static.data(name='boxes',
+                                                    shape=[3, 4],
+                                                    lod_level=1)
+        self.boxes = np.array([[1, 5, 8, 10], [4, 2, 6, 7],
+                               [12, 12, 19, 21]]).astype(np.float32)
         self.boxes_num = np.array([1, 2]).astype(np.int32)
 
     def test_function_in_static(self):
@@ -317,11 +332,13 @@ def test_function_in_static(self):
             places.append(paddle.CUDAPlace(0))
         for place in places:
             exe = paddle.static.Executor(place)
-            boxes_lod_data = paddle.fluid.create_lod_tensor(self.boxes,
-                                                            [[1, 2]], place)
+            boxes_lod_data = paddle.fluid.create_lod_tensor(
+                self.boxes, [[1, 2]], place)
             out_res = exe.run(paddle.static.default_main_program(),
-                              feed={'x': self.x,
-                                    'boxes': boxes_lod_data},
+                              feed={
+                                  'x': self.x,
+                                  'boxes': boxes_lod_data
+                              },
                               fetch_list=[out.name])
             self.assertTrue(np.allclose(out_res, expect_out))
 
diff --git a/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py
index 07354f1b7b25b..b15edb44d57a8 100644
--- a/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py
@@ -34,11 +34,15 @@ def test_static_graph(self):
         slots = []
         with fluid.program_guard(train_program, startup_program):
 
-            l = fluid.layers.data(
-                name='input', shape=[1], dtype="int64", lod_level=1)
+            l = fluid.layers.data(name='input',
+                                  shape=[1],
+                                  dtype="int64",
+                                  lod_level=1)
             slots.append(l)
-            output = _pull_gpups_sparse(
-                slots, size=[11], is_distributed=True, is_sparse=True)
+            output = _pull_gpups_sparse(slots,
+                                        size=[11],
+                                        is_distributed=True,
+                                        is_sparse=True)
             cost = paddle.fluid.layers.mean(output)
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
             sgd_optimizer.minimize(cost, train_program)
diff --git a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py
index 2662cd5250ff6..bbc383eaf6183 100644
--- a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py
+++ b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py
@@ -27,6 +27,7 @@
 
 
 class TestPutAlongAxisOp(OpTest):
+
     def setUp(self):
         self.init_data()
         self.reduce_op = "assign"
@@ -68,6 +69,7 @@ def init_data(self):
 
 
 class TestPutAlongAxisAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [1, 3]
@@ -111,6 +113,7 @@ def run(place):
             run(place)
 
     def test_api_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             x_tensor = paddle.to_tensor(self.x_np)
@@ -122,9 +125,8 @@ def run(place):
                 np.put_along_axis(self.x_np, self.index_np, self.value_np,
                                   self.axis))
             out_ref = self.x_np
-            self.assertEqual(
-                np.allclose(
-                    out.numpy(), out_ref, rtol=1e-03), True)
+            self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-03),
+                             True)
 
             # for ci coverage, numpy put_along_axis did not support argument of 'reduce'
             paddle.put_along_axis(x_tensor, index_tensor, value_tensor,
@@ -138,6 +140,7 @@ def run(place):
             run(place)
 
     def test_inplace_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             x_tensor = paddle.to_tensor(self.x_np)
@@ -151,9 +154,8 @@ def run(place):
                                   self.axis))
             out_ref = self.x_np
 
-            self.assertEqual(
-                np.allclose(
-                    x_tensor.numpy(), out_ref, rtol=1e-03), True)
+            self.assertEqual(np.allclose(x_tensor.numpy(), out_ref, rtol=1e-03),
+                             True)
             paddle.enable_static()
 
         for place in self.place:
@@ -161,6 +163,7 @@ def run(place):
 
 
 class TestPutAlongAxisAPICase2(TestPutAlongAxisAPI):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [2, 2]
@@ -177,12 +180,13 @@ def setUp(self):
 
 
 class TestPutAlongAxisAPICase3(TestPutAlongAxisAPI):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [2, 2]
         self.index_shape = [4, 2]
-        self.index_np = np.array(
-            [[0, 0], [1, 0], [0, 0], [1, 0]]).astype('int64')
+        self.index_np = np.array([[0, 0], [1, 0], [0, 0], [1,
+                                                           0]]).astype('int64')
         self.x_np = np.random.random(self.shape).astype(np.float32)
         self.place = [paddle.CPUPlace()]
         self.axis = 0
diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py
index 14b0eec9cbcdd..f0f791d62a7a2 100644
--- a/python/paddle/fluid/tests/unittests/test_py_func_op.py
+++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py
@@ -65,8 +65,8 @@ def cross_entropy_grad(logits, labels, bwd_dout):
     N = logits.shape[1]
     dlogits = np.zeros([M, N]).astype(logits.dtype)
     for idx in six.moves.range(M):
-        dlogits[idx][labels[idx][0]] = -bwd_dout[idx] / logits[idx][labels[idx][
-            0]]
+        dlogits[idx][labels[idx]
+                     [0]] = -bwd_dout[idx] / logits[idx][labels[idx][0]]
     return dlogits, None
 
 
@@ -76,22 +76,20 @@ def simple_fc_net(img, label, use_py_func_op):
         hidden = fluid.layers.fc(
             hidden,
             size=200,
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
         if not use_py_func_op:
             hidden = fluid.layers.tanh(hidden)
         else:
             new_hidden = fluid.default_main_program().current_block(
-            ).create_var(
-                name='hidden_{}'.format(idx),
-                dtype='float32',
-                shape=hidden.shape)
-            hidden = fluid.layers.py_func(
-                func=tanh,
-                x=hidden,
-                out=new_hidden,
-                backward_func=tanh_grad,
-                skip_vars_in_backward_input=hidden)
+            ).create_var(name='hidden_{}'.format(idx),
+                         dtype='float32',
+                         shape=hidden.shape)
+            hidden = fluid.layers.py_func(func=tanh,
+                                          x=hidden,
+                                          out=new_hidden,
+                                          backward_func=tanh_grad,
+                                          skip_vars_in_backward_input=hidden)
 
     prediction = fluid.layers.fc(hidden, size=10, act='softmax')
     if not use_py_func_op:
@@ -99,17 +97,17 @@ def simple_fc_net(img, label, use_py_func_op):
     else:
         loss = fluid.default_main_program().current_block().create_var(
             name='loss', dtype='float32', shape=[-1, 1])
-        loss = fluid.layers.py_func(
-            func=cross_entropy,
-            x=[prediction, label],
-            out=loss,
-            backward_func=cross_entropy_grad,
-            skip_vars_in_backward_input=loss)
+        loss = fluid.layers.py_func(func=cross_entropy,
+                                    x=[prediction, label],
+                                    out=loss,
+                                    backward_func=cross_entropy_grad,
+                                    skip_vars_in_backward_input=loss)
 
         dummy_var = fluid.default_main_program().current_block().create_var(
             name='test_tmp_var', dtype='float32', shape=[1])
-        fluid.layers.py_func(
-            func=dummy_func_with_no_input, x=None, out=dummy_var)
+        fluid.layers.py_func(func=dummy_func_with_no_input,
+                             x=None,
+                             out=dummy_var)
         loss += dummy_var
         fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None)
 
@@ -117,17 +115,15 @@ def simple_fc_net(img, label, use_py_func_op):
             dtype='float32', shape=[-1, 1])
         dummy_var_out = fluid.default_main_program().current_block().create_var(
             dtype='float32', shape=[1])
-        fluid.layers.py_func(
-            func=dummy_func_with_multi_input_output,
-            x=(loss, dummy_var),
-            out=(loss_out, dummy_var_out))
+        fluid.layers.py_func(func=dummy_func_with_multi_input_output,
+                             x=(loss, dummy_var),
+                             out=(loss_out, dummy_var_out))
         assert loss == loss_out and dummy_var == dummy_var_out, \
             "py_func failed with multi input and output"
 
-        fluid.layers.py_func(
-            func=dummy_func_with_multi_input_output,
-            x=[loss, dummy_var],
-            out=[loss_out, dummy_var_out])
+        fluid.layers.py_func(func=dummy_func_with_multi_input_output,
+                             x=[loss, dummy_var],
+                             out=[loss_out, dummy_var_out])
         assert loss == loss_out and dummy_var == dummy_var_out, \
             "py_func failed with multi input and output"
 
@@ -137,8 +133,9 @@ def simple_fc_net(img, label, use_py_func_op):
 
 def reader():
     for _ in six.moves.range(dev_cnt * 100):
-        yield np.random.random([784]), np.random.random_integers(
-            size=[1], low=0, high=9)
+        yield np.random.random([784]), np.random.random_integers(size=[1],
+                                                                 low=0,
+                                                                 high=9)
 
 
 def test_main(use_cuda, use_py_func_op, use_parallel_executor):
@@ -165,8 +162,8 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
             train_cp = fluid.default_main_program()
 
             if use_parallel_executor:
-                train_cp = compiler.CompiledProgram(fluid.default_main_program(
-                ))
+                train_cp = compiler.CompiledProgram(
+                    fluid.default_main_program())
                 train_cp = train_cp.with_data_parallel(loss_name=loss.name)
                 fetch_list = [loss.name]
             else:
@@ -183,6 +180,7 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
 
 
 class TestPyFuncOpUseExecutor(unittest.TestCase):
+
     def setUp(self):
         self.use_parallel_executor = False
 
@@ -201,6 +199,7 @@ def test_loss_diff(self):
 
 
 class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor):
+
     def setUp(self):
         self.use_parallel_executor = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
index 624927d809fba..d3bc50bffe7cd 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
@@ -19,18 +19,23 @@
 
 
 class TestPyReaderCombination(unittest.TestCase):
+
     def setUp(self):
         self.n1 = 10
         self.n2 = 20
         self.batch_size = 2
 
     def create_reader(self, batch_num):
+
         def __impl__():
             for _ in range(batch_num):
-                image = np.random.uniform(
-                    low=-1, high=1, size=[batch_num, 784]).astype('float32')
-                label = np.random.random_integers(
-                    low=0, high=9, size=[batch_num, 1]).astype('int64')
+                image = np.random.uniform(low=-1, high=1,
+                                          size=[batch_num,
+                                                784]).astype('float32')
+                label = np.random.random_integers(low=0,
+                                                  high=9,
+                                                  size=[batch_num,
+                                                        1]).astype('int64')
                 yield image, label
 
         return __impl__
@@ -52,14 +57,17 @@ def _reset_iterable_reader(self, py_reader):
 
     def main_impl(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            image = fluid.layers.data(
-                name='image', dtype='float32', shape=[784])
+            image = fluid.layers.data(name='image',
+                                      dtype='float32',
+                                      shape=[784])
             label = fluid.layers.data(name='label', dtype='int64', shape=[1])
 
-            py_reader1 = fluid.io.PyReader(
-                feed_list=[image, label], capacity=16, iterable=True)
-            py_reader2 = fluid.io.PyReader(
-                feed_list=[image, label], capacity=16, iterable=True)
+            py_reader1 = fluid.io.PyReader(feed_list=[image, label],
+                                           capacity=16,
+                                           iterable=True)
+            py_reader2 = fluid.io.PyReader(feed_list=[image, label],
+                                           capacity=16,
+                                           iterable=True)
 
             reader1 = paddle.reader.cache(self.create_reader(self.n1))
             reader2 = paddle.reader.cache(self.create_reader(self.n2))
@@ -92,6 +100,7 @@ def test_main(self):
 
 
 class TestPyReaderCombination2(TestPyReaderCombination):
+
     def setUp(self):
         self.n1 = 20
         self.n2 = 10
@@ -99,6 +108,7 @@ def setUp(self):
 
 
 class TestPyReaderCombination3(TestPyReaderCombination):
+
     def setUp(self):
         self.n1 = 10
         self.n2 = 10
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_error_msg.py b/python/paddle/fluid/tests/unittests/test_py_reader_error_msg.py
index f4fa419b91dde..337cafbb1246f 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_error_msg.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_error_msg.py
@@ -19,34 +19,32 @@
 
 
 class TestPyReaderErrorMsg(unittest.TestCase):
+
     def test_check_input_array(self):
         fluid.reader.GeneratorLoader._check_input_array([
-            np.random.randint(
-                100, size=[2]), np.random.randint(
-                    100, size=[2]), np.random.randint(
-                        100, size=[2])
+            np.random.randint(100, size=[2]),
+            np.random.randint(100, size=[2]),
+            np.random.randint(100, size=[2])
         ])
-        self.assertRaises(
-            TypeError,
-            fluid.reader.GeneratorLoader._check_input_array, [
-                np.random.randint(
-                    100, size=[2]), np.random.randint(
-                        100, size=[1]), np.random.randint(
-                            100, size=[3])
-            ])
+        self.assertRaises(TypeError,
+                          fluid.reader.GeneratorLoader._check_input_array, [
+                              np.random.randint(100, size=[2]),
+                              np.random.randint(100, size=[1]),
+                              np.random.randint(100, size=[3])
+                          ])
 
 
 class TestDoubleBufferAPI(unittest.TestCase):
+
     def test_double_buffer(self):
         paddle.enable_static()
         if fluid.core.is_compiled_with_cuda():
-            reader = fluid.layers.py_reader(
-                capacity=64,
-                shapes=[(-1, 1, 28, 28), (-1, 1)],
-                dtypes=['float32', 'int64'],
-                use_double_buffer=False)
-            reader = fluid.layers.double_buffer(
-                reader, place=fluid.core.CUDAPlace(0))
+            reader = fluid.layers.py_reader(capacity=64,
+                                            shapes=[(-1, 1, 28, 28), (-1, 1)],
+                                            dtypes=['float32', 'int64'],
+                                            use_double_buffer=False)
+            reader = fluid.layers.double_buffer(reader,
+                                                place=fluid.core.CUDAPlace(0))
             image, label = fluid.layers.read_file(reader)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py b/python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py
index 55dc3a7aa341f..4b5e2b9711ee1 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py
@@ -17,6 +17,7 @@
 
 
 class TestLoDLevelShare(unittest.TestCase):
+
     def setUp(self):
         self.use_double_buffer = False
 
@@ -35,6 +36,7 @@ def test_lod_level_share(self):
 
 
 class TestLoDLevelShare2(TestLoDLevelShare):
+
     def setUp(self):
         self.use_double_buffer = True
 
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
index 3aa359c0e0dc1..f03782cf665bd 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
@@ -23,6 +23,7 @@
 
 
 def user_reader(inputs):
+
     def _reader():
         for d in inputs:
             yield d
@@ -31,6 +32,7 @@ def _reader():
 
 
 def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
+
     def _feeder():
         for batch_data in batch_reader():
             sample_batch = []
@@ -49,6 +51,7 @@ def _feeder():
 
 
 class TestPyReader(unittest.TestCase):
+
     def setUp(self):
         self.capacity = 10
         self.shapes = [(-1, 3, 2, 1), (-1, 1)]
@@ -57,29 +60,27 @@ def setUp(self):
 
     def test_pin_memory_pyreader(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
             executor = fluid.Executor(place)
 
-            data_file = fluid.layers.py_reader(
-                capacity=self.capacity,
-                dtypes=self.dtypes,
-                lod_levels=self.lod_levels,
-                shapes=self.shapes)
+            data_file = fluid.layers.py_reader(capacity=self.capacity,
+                                               dtypes=self.dtypes,
+                                               lod_levels=self.lod_levels,
+                                               shapes=self.shapes)
             # feed_queue = data_file.queue
             read_out_data = fluid.layers.read_file(data_file)
 
             self.inputs = []
             for _ in range(10):
-                sample = np.random.uniform(
-                    low=0, high=1, size=[3, 2, 1]).astype("float32")
+                sample = np.random.uniform(low=0, high=1,
+                                           size=[3, 2, 1]).astype("float32")
                 label = np.random.randint(low=0, high=10, dtype="int64")
                 self.inputs.append((sample, label))
 
             self.input_tensors = []
             for d, l in batch_feeder(
-                    paddle.batch(
-                        user_reader(self.inputs), batch_size=2),
+                    paddle.batch(user_reader(self.inputs), batch_size=2),
                     pin_memory=True
                     if fluid.core.is_compiled_with_cuda() else False)():
                 ta = fluid.LoDTensorArray()
@@ -97,11 +98,10 @@ def test_pin_memory_pyreader(self):
                 self.batched_inputs.append([feed_d, feed_l])
 
             data_file.decorate_tensor_provider(
-                batch_feeder(
-                    paddle.batch(
-                        user_reader(self.inputs), batch_size=2),
-                    pin_memory=True
-                    if fluid.core.is_compiled_with_cuda() else False))
+                batch_feeder(paddle.batch(user_reader(self.inputs),
+                                          batch_size=2),
+                             pin_memory=True
+                             if fluid.core.is_compiled_with_cuda() else False))
 
             executor.run(fluid.default_startup_program())
             self.outputs = []
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
index 3efe5aac8848b..f0757d2885dcc 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
@@ -26,6 +26,7 @@ def feed_data(feed_queue, inputs):
 
 
 class TestPyReader(unittest.TestCase):
+
     def setUp(self):
         self.capacity = 10
         self.batch_size_min = 10
@@ -43,15 +44,14 @@ def test_multiple_thread_main(self):
 
     def main(self, use_thread=False):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
             executor = fluid.Executor(place)
 
-            data_file = fluid.layers.py_reader(
-                capacity=self.capacity,
-                dtypes=self.dtypes,
-                lod_levels=self.lod_levels,
-                shapes=self.shapes)
+            data_file = fluid.layers.py_reader(capacity=self.capacity,
+                                               dtypes=self.dtypes,
+                                               lod_levels=self.lod_levels,
+                                               shapes=self.shapes)
             feed_queue = data_file.queue
             read_out_data = fluid.layers.read_file(data_file)
             self.inputs = []
@@ -61,9 +61,10 @@ def main(self, use_thread=False):
                 batch_size = np.random.random_integers(self.batch_size_min,
                                                        self.batch_size_max)
                 for shape, dtype in zip(self.shapes, self.dtypes):
-                    next_data = np.random.uniform(
-                        low=0, high=1000,
-                        size=(batch_size, ) + shape[1:]).astype(dtype)
+                    next_data = np.random.uniform(low=0,
+                                                  high=1000,
+                                                  size=(batch_size, ) +
+                                                  shape[1:]).astype(dtype)
                     in_data.append(
                         fluid.executor._as_lodtensor(next_data, place))
 
@@ -72,8 +73,8 @@ def main(self, use_thread=False):
             executor.run(fluid.default_startup_program())
             self.outputs = []
             if use_thread:
-                thread = Thread(
-                    target=feed_data, args=(feed_queue, self.inputs))
+                thread = Thread(target=feed_data,
+                                args=(feed_queue, self.inputs))
                 thread.start()
                 for in_data in self.inputs:
                     self.outputs.append(
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_return_list.py b/python/paddle/fluid/tests/unittests/test_py_reader_return_list.py
index c6e1856507835..c6e951997eabf 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_return_list.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_return_list.py
@@ -19,34 +19,38 @@
 
 
 class TestPyReader(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 32
         self.epoch_num = 2
         self.sample_num = 10
 
     def test_returnlist(self):
+
         def reader_creator_random_image(height, width):
+
             def reader():
                 for i in range(self.sample_num):
-                    yield np.random.uniform(
-                        low=0, high=255, size=[height, width]),
+                    yield np.random.uniform(low=0,
+                                            high=255,
+                                            size=[height, width]),
 
             return reader
 
         for return_list in [True, False]:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
-                image = fluid.layers.data(
-                    name='image', shape=[784, 784], dtype='float32')
-                reader = fluid.io.PyReader(
-                    feed_list=[image],
-                    capacity=4,
-                    iterable=True,
-                    return_list=return_list)
+                image = fluid.layers.data(name='image',
+                                          shape=[784, 784],
+                                          dtype='float32')
+                reader = fluid.io.PyReader(feed_list=[image],
+                                           capacity=4,
+                                           iterable=True,
+                                           return_list=return_list)
 
                 user_defined_reader = reader_creator_random_image(784, 784)
                 reader.decorate_sample_list_generator(
-                    paddle.batch(
-                        user_defined_reader, batch_size=self.batch_size),
+                    paddle.batch(user_defined_reader,
+                                 batch_size=self.batch_size),
                     fluid.core.CPUPlace())
                 # definition of network is omitted
                 executor = fluid.Executor(fluid.core.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
index 4efca5e2aafd9..7f0cf633ed25c 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
@@ -23,6 +23,7 @@
 
 
 def random_reader(sample_num):
+
     def __impl__():
         for _ in range(sample_num):
             yield np.random.random(
@@ -33,6 +34,7 @@ def __impl__():
 
 
 class TestCaseBase(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 32
         self.epoch_num = 2
@@ -52,21 +54,22 @@ def generate_all_data(self, reader):
     def run_main(self, reader, use_sample_generator, iterable, drop_last):
         image = fluid.layers.data(name='image', dtype='float32', shape=[784])
         label = fluid.layers.data(name='label', dtype='int64', shape=[1])
-        py_reader = fluid.io.PyReader(
-            feed_list=[image, label],
-            capacity=16,
-            iterable=iterable,
-            use_double_buffer=False)
+        py_reader = fluid.io.PyReader(feed_list=[image, label],
+                                      capacity=16,
+                                      iterable=iterable,
+                                      use_double_buffer=False)
 
         batch_reader = paddle.batch(reader, self.batch_size, drop_last)
         all_datas = self.generate_all_data(batch_reader)
 
         if not use_sample_generator:
-            py_reader.decorate_sample_list_generator(
-                batch_reader, places=fluid.cpu_places())
+            py_reader.decorate_sample_list_generator(batch_reader,
+                                                     places=fluid.cpu_places())
         else:
-            py_reader.decorate_sample_generator(
-                reader, self.batch_size, drop_last, places=fluid.cpu_places())
+            py_reader.decorate_sample_generator(reader,
+                                                self.batch_size,
+                                                drop_last,
+                                                places=fluid.cpu_places())
 
         if drop_last:
             batch_num = int(self.sample_num / self.batch_size)
@@ -113,6 +116,7 @@ def test_main(self):
 
 
 class TestCase1(TestCaseBase):
+
     def setUp(self):
         self.batch_size = 32
         self.epoch_num = 10
@@ -120,6 +124,7 @@ def setUp(self):
 
 
 class TestCase2(TestCaseBase):
+
     def setUp(self):
         self.batch_size = 32
         self.epoch_num = 2
@@ -127,6 +132,7 @@ def setUp(self):
 
 
 class TestCase3(TestCaseBase):
+
     def setUp(self):
         self.batch_size = 32
         self.epoch_num = 2
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index b5684de4b900e..4be5a4ae94860 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -24,6 +24,7 @@
 import threading
 import multiprocessing
 import os
+
 os.environ['CPU_NUM'] = str(4)
 
 
@@ -114,13 +115,12 @@ def simple_fc_net(in_size,
             hidden,
             size=hidden_size,
             act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+                value=1.0)))
 
     predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax')
     loss = fluid.layers.mean(
-        fluid.layers.cross_entropy(
-            input=predict_label, label=label))
+        fluid.layers.cross_entropy(input=predict_label, label=label))
 
     optimizer = fluid.optimizer.Adam()
     optimizer.minimize(loss)
@@ -128,6 +128,7 @@ def simple_fc_net(in_size,
 
 
 class TestPyReaderUsingExecutor(unittest.TestCase):
+
     def setUp(self):
         self.in_size = 1000
         self.hidden_sizes = [50, 30, 20]
@@ -145,10 +146,14 @@ def test(self):
                         for use_decorate_paddle_reader in [False, True]:
                             print('Test Parameters:'),
                             print({
-                                'use_cuda': use_cuda,
-                                'use_parallel_executor': use_parallel_executor,
-                                'use_double_buffer': use_double_buffer,
-                                'use_feed_list': use_feed_list,
+                                'use_cuda':
+                                use_cuda,
+                                'use_parallel_executor':
+                                use_parallel_executor,
+                                'use_double_buffer':
+                                use_double_buffer,
+                                'use_feed_list':
+                                use_feed_list,
                                 'use_decorate_paddle_reader':
                                 use_decorate_paddle_reader
                             })
@@ -157,13 +162,15 @@ def test(self):
                                       use_decorate_paddle_reader)
 
     def tensor_reader(self, use_decorate_paddle_reader):
+
         def reader():
             for sample_id in range(self.batch_size * self.iterations *
                                    self.batch_size_times):
                 in_data = np.random.uniform(
                     low=0, high=1, size=(self.in_size, )).astype('float32')
-                label = np.random.random_integers(
-                    low=0, high=self.class_num - 1, size=(1, )).astype('int64')
+                label = np.random.random_integers(low=0,
+                                                  high=self.class_num - 1,
+                                                  size=(1, )).astype('int64')
 
                 reshaped_in_data = np.reshape(in_data, [1, -1])
                 reshaped_label = np.reshape(label, [1, -1])
@@ -239,8 +246,8 @@ def main(self,
                     py_reader.decorate_sample_list_generator(batch_reader)
                 py_reader.start()
             else:
-                thread = threading.Thread(
-                    target=feed_data, args=(feed_queue, batch_reader))
+                thread = threading.Thread(target=feed_data,
+                                          args=(feed_queue, batch_reader))
                 thread.daemon = True
                 thread.start()
 
diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py
index aadfb4d39442c..d55e427f286c3 100644
--- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,13 +23,17 @@
 
 
 class FakeTensor(paddle.fluid.core.VarBase):
+
     def __init__(self):
         pass
 
 
 class TestPyLayer(unittest.TestCase):
+
     def func_test_simple_pylayer_multiple_output(self):
+
         class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, x2, func1, func2=paddle.square):
                 ctx.func = func2
@@ -65,7 +69,9 @@ def test_simple_pylayer_multiple_output(self):
         self.func_test_simple_pylayer_multiple_output()
 
     def func_test_simple_pylayer_return_none_with_no_grad(self):
+
         class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, x2, func1, func2=paddle.square):
                 ctx.func = func2
@@ -105,7 +111,9 @@ def test_simple_pylayer_return_none_with_no_grad(self):
         self.func_test_simple_pylayer_return_none_with_no_grad()
 
     def func_test_simple_pylayer_single_output(self):
+
         class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, func1, func2=paddle.square):
                 ctx.func = func2
@@ -137,12 +145,15 @@ def test_simple_pylayer_single_output(self):
         self.func_test_simple_pylayer_single_output()
 
     def func_test_pylayer_num_output_match(self):
+
         class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(
-                    ctx,
-                    x1,
-                    x2, ):
+                ctx,
+                x1,
+                x2,
+            ):
                 return x1 + x2
 
             @staticmethod
@@ -163,7 +174,9 @@ def test_pylayer_num_output_match(self):
         self.func_test_pylayer_num_output_match()
 
     def func_test_pylayer_dtype(self):
+
         class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x, dtype):
                 y = paddle.cast(x, dtype)
@@ -192,7 +205,9 @@ def test_pylayer_dtype(self):
         self.func_test_pylayer_dtype()
 
     def func_test_pylayer_Exception_forward(self):
+
         class Layer_None1(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, *args):
                 return None
@@ -206,6 +221,7 @@ def backward(ctx, *args):
             z = Layer_None1.apply(input1)
 
         class Layer_None2(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, *args):
                 return [None, args[0]]
@@ -219,6 +235,7 @@ def backward(ctx, *args):
         z = Layer_None2.apply(input1)
 
         class Layer_one1(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, *args):
                 return 1
@@ -233,6 +250,7 @@ def backward(ctx, *args):
             z = Layer_one1.apply(input1)
 
         class Layer_one2(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, *args):
                 return [1, 2, args[0]]
@@ -242,10 +260,11 @@ def backward(ctx, *args):
                 return args
 
         input1 = paddle.randn([2, 3]).astype("float64")
-        # return int 
+        # return int
         z = Layer_one2.apply(input1)
 
         class Layer_no_fw(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def backward(ctx, *args):
                 return args
@@ -260,7 +279,9 @@ def test_pylayer_Exception_forward(self):
         self.func_test_pylayer_Exception_forward()
 
     def func_test_pylayer_nograd(self):
+
         class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, func1, func2=paddle.square, xx=None):
                 ctx.func = func2
@@ -283,7 +304,9 @@ def test_pylayer_nograd(self):
         self.func_test_pylayer_nograd()
 
     def func_test_pylayer_Exception_bk(self):
+
         class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x):
                 return x * 2
@@ -300,6 +323,7 @@ def backward(ctx, dy1):
             z.sum().backward()
 
         class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, x2):
                 return x1 + x2
@@ -316,6 +340,7 @@ def backward(ctx, dy1):
             z.mean().backward()
 
         class Layer_bk_one1(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x):
                 return x + x
@@ -332,6 +357,7 @@ def backward(ctx, dy):
             z.mean().backward()
 
         class Layer_bk_one2(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, x2):
                 return x1 * 2, x2 * 5
@@ -349,6 +375,7 @@ def backward(ctx, *args):
             z.mean().backward()
 
         class Layer_no_bk(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x):
                 return x * 2, x * 5
@@ -362,6 +389,7 @@ def forward(ctx, x):
             z.mean().backward()
 
         class Layer_bk_match(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x):
                 return x * 2, x * 5
@@ -383,7 +411,9 @@ def test_pylayer_Exception_bk(self):
         self.func_test_pylayer_Exception_bk()
 
     def func_test_pylayer_bk_return_none(self):
+
         class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, x2):
                 return x1 + x2
@@ -402,6 +432,7 @@ def backward(ctx, dy):
             z.mean().backward()
 
         class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1, x2):
                 return x1 * 2, x2 * 5
@@ -425,7 +456,9 @@ def test_pylayer_bk_return_none(self):
         self.func_test_pylayer_bk_return_none()
 
     def func_test_pylayer_inplace(self):
+
         class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x):
                 return x
@@ -435,6 +468,7 @@ def backward(ctx, dy):
                 return dy
 
         class Layer(paddle.nn.Layer):
+
             def __init__(self):
                 super(Layer, self).__init__()
 
@@ -461,6 +495,7 @@ def test_pylayer_inplace_backward_error(self):
         with _test_eager_guard():
 
             class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
                 @staticmethod
                 def forward(ctx, x):
                     ctx.mark_dirty(x)
@@ -471,6 +506,7 @@ def backward(ctx, dy):
                     return dy
 
             class Layer(paddle.nn.Layer):
+
                 def __init__(self):
                     super(Layer, self).__init__()
 
@@ -495,6 +531,7 @@ def test_pylayer_inplace_backward_success_1(self):
         with _test_eager_guard():
 
             class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
                 @staticmethod
                 def forward(ctx, x):
                     ctx.mark_dirty(x)
@@ -505,6 +542,7 @@ def backward(ctx, dy):
                     return dy
 
             class Layer(paddle.nn.Layer):
+
                 def __init__(self):
                     super(Layer, self).__init__()
 
@@ -527,6 +565,7 @@ def test_pylayer_inplace_backward_success_2(self):
         with _test_eager_guard():
 
             class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
                 @staticmethod
                 def forward(ctx, x):
                     ctx.mark_dirty(x)
@@ -537,6 +576,7 @@ def backward(ctx, dy):
                     return dy
 
             class Layer(paddle.nn.Layer):
+
                 def __init__(self):
                     super(Layer, self).__init__()
 
@@ -556,7 +596,9 @@ def forward(self, data):
                 self.assertTrue(data.grad is not None)
 
     def func_test_pylayer_inplace_and_leaf_exception(self):
+
         class cus_pylayer_op(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x):
                 if in_dygraph_mode():
@@ -568,6 +610,7 @@ def backward(ctx, dy):
                 return dy
 
         class Layer(paddle.nn.Layer):
+
             def __init__(self):
                 super(Layer, self).__init__()
 
@@ -589,7 +632,9 @@ def test_pylayer_inplace_and_leaf_exception(self):
         self.func_test_pylayer_inplace_and_leaf_exception()
 
     def func_test_backward_in_backward(self):
+
         class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x):
                 temp = x.detach()
@@ -619,7 +664,9 @@ def test_backward_in_backward(self):
         self.func_test_backward_in_backward()
 
     def func_test_return_to_tensor(self):
+
         class Tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+
             @staticmethod
             def forward(ctx, x1):
                 y1 = paddle.tanh(x1)
@@ -649,6 +696,7 @@ def test_materialize_grads(self):
         with _test_eager_guard():
 
             class Tanh(EagerPyLayer):
+
                 @staticmethod
                 def forward(ctx, x):
                     return x, x + x
@@ -666,6 +714,7 @@ def test_dont_materialize_grads(self):
         with _test_eager_guard():
 
             class Tanh(EagerPyLayer):
+
                 @staticmethod
                 def forward(ctx, x):
                     ctx.set_materialize_grads(False)
@@ -684,6 +733,7 @@ def test_mark_non_differentiable(self):
         with _test_eager_guard():
 
             class Tanh(EagerPyLayer):
+
                 @staticmethod
                 def forward(ctx, x):
                     a = x + x
@@ -704,6 +754,7 @@ def test_mark_non_differentiable2(self):
         with _test_eager_guard():
 
             class Tanh(EagerPyLayer):
+
                 @staticmethod
                 def forward(ctx, x):
                     a = x + x
@@ -725,8 +776,11 @@ def backward(ctx, grad_a, grad_b):
 
 
 class TestPyLayerReturnType(unittest.TestCase):
+
     def test_forward_args_fake_tensor(self):
+
         class Tanh(PyLayer):
+
             @staticmethod
             def forward(ctx, x1):
                 y1 = FakeTensor()
@@ -742,7 +796,9 @@ def backward(ctx, dy1, dy2):
             y1, y2 = Tanh.apply(input1)
 
     def test_forward_kwargs_fake_tensor(self):
+
         class Tanh(PyLayer):
+
             @staticmethod
             def forward(ctx, x1):
 
@@ -758,7 +814,9 @@ def backward(ctx, dy1, dy2):
             y = Tanh.apply(x1=input1)
 
     def test_forward_return_fake_tensor(self):
+
         class Tanh(PyLayer):
+
             @staticmethod
             def forward(ctx, x1):
 
@@ -774,7 +832,9 @@ def backward(ctx, dy1, dy2):
             y = Tanh.apply(x1=input1)
 
     def test_forward_return_fake_tensor_tuple(self):
+
         class Tanh(PyLayer):
+
             @staticmethod
             def forward(ctx, x1):
 
@@ -790,7 +850,9 @@ def backward(ctx, dy1, dy2):
             y = Tanh.apply(x1=input1)
 
     def test_backward_return_fake_tensor_tuple(self):
+
         class Tanh(PyLayer):
+
             @staticmethod
             def forward(ctx, x1, x2):
                 return x1 + 1, x1 + 2
@@ -808,7 +870,9 @@ def backward(ctx, dy1, dy2):
             y.mean().backward()
 
     def test_backward_return_fake_tensor(self):
+
         class Tanh(PyLayer):
+
             @staticmethod
             def forward(ctx, x1):
                 return x1 + 1, x1 + 2
diff --git a/python/paddle/fluid/tests/unittests/test_pyramid_hash_op.py b/python/paddle/fluid/tests/unittests/test_pyramid_hash_op.py
index 9ffea2c565cb9..6f3f94253c83d 100644
--- a/python/paddle/fluid/tests/unittests/test_pyramid_hash_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pyramid_hash_op.py
@@ -18,6 +18,7 @@
 
 
 class TestPyramidHashOpApi(unittest.TestCase):
+
     def test_api(self):
         num_voc = 128
         embed_dim = 64
@@ -38,13 +39,16 @@ def test_api(self):
             lr=0.002,
             param_attr=fluid.ParamAttr(
                 name="PyramidHash_emb_0",
-                learning_rate=0, ),
+                learning_rate=0,
+            ),
             param_attr_wl=fluid.ParamAttr(
                 name="Filter",
-                learning_rate=0, ),
+                learning_rate=0,
+            ),
             param_attr_bl=None,
             distribute_update_vars=["PyramidHash_emb_0"],
-            name=None, )
+            name=None,
+        )
 
         place = fluid.CPUPlace()
         x_tensor = fluid.create_lod_tensor(
diff --git a/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py b/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py
index a58d7d35807c6..ac5ca3d9a1b55 100644
--- a/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py
+++ b/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,6 +18,7 @@
 
 
 class TestBF16DataType(unittest.TestCase):
+
     def test_matmul(self):
         a_bf16 = np.random.random((6, 7)).astype(bfloat16)
         b_bf16 = np.random.random((7, 8)).astype(bfloat16)
diff --git a/python/paddle/fluid/tests/unittests/test_qr_op.py b/python/paddle/fluid/tests/unittests/test_qr_op.py
index ecf65d16d3431..338b08d1aa548 100644
--- a/python/paddle/fluid/tests/unittests/test_qr_op.py
+++ b/python/paddle/fluid/tests/unittests/test_qr_op.py
@@ -25,6 +25,7 @@
 
 
 class TestQrOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         np.random.seed(7)
@@ -74,31 +75,37 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X'], ['Q', 'R'], numeric_grad_delta=1e-5, max_relative_error=1e-6)
+        self.check_grad(['X'], ['Q', 'R'],
+                        numeric_grad_delta=1e-5,
+                        max_relative_error=1e-6)
 
 
 class TestQrOpCase1(TestQrOp):
+
     def get_shape(self):
         return (10, 12)
 
 
 class TestQrOpCase2(TestQrOp):
+
     def get_shape(self):
         return (16, 15)
 
 
 class TestQrOpCase3(TestQrOp):
+
     def get_shape(self):
         return (2, 12, 16)
 
 
 class TestQrOpCase4(TestQrOp):
+
     def get_shape(self):
         return (3, 16, 15)
 
 
 class TestQrOpCase5(TestQrOp):
+
     def get_mode(self):
         return "complete"
 
@@ -107,6 +114,7 @@ def get_shape(self):
 
 
 class TestQrOpCase6(TestQrOp):
+
     def get_mode(self):
         return "complete"
 
@@ -115,6 +123,7 @@ def get_shape(self):
 
 
 class TestQrAPI(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
         np.random.seed(7)
@@ -176,8 +185,8 @@ def run_qr_dygraph(shape, mode, dtype):
         ]
         modes = ["reduced", "complete", "r"]
         dtypes = ["float32", "float64"]
-        for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes,
-                                                           dtypes):
+        for tensor_shape, mode, dtype in itertools.product(
+                tensor_shapes, modes, dtypes):
             run_qr_dygraph(tensor_shape, mode, dtype)
 
     def test_static(self):
@@ -219,29 +228,27 @@ def run_qr_static(shape, mode, dtype):
                             tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode)
                             np_q[coord] = tmp_q
                             np_r[coord] = tmp_r
-                    x = paddle.fluid.data(
-                        name="input", shape=shape, dtype=dtype)
+                    x = paddle.fluid.data(name="input",
+                                          shape=shape,
+                                          dtype=dtype)
                     if mode == "r":
                         r = paddle.linalg.qr(x, mode=mode)
                         exe = fluid.Executor(place)
                         fetches = exe.run(fluid.default_main_program(),
                                           feed={"input": a},
                                           fetch_list=[r])
-                        self.assertTrue(
-                            np.allclose(
-                                fetches[0], np_r, atol=1e-5))
+                        self.assertTrue(np.allclose(fetches[0], np_r,
+                                                    atol=1e-5))
                     else:
                         q, r = paddle.linalg.qr(x, mode=mode)
                         exe = fluid.Executor(place)
                         fetches = exe.run(fluid.default_main_program(),
                                           feed={"input": a},
                                           fetch_list=[q, r])
-                        self.assertTrue(
-                            np.allclose(
-                                fetches[0], np_q, atol=1e-5))
-                        self.assertTrue(
-                            np.allclose(
-                                fetches[1], np_r, atol=1e-5))
+                        self.assertTrue(np.allclose(fetches[0], np_q,
+                                                    atol=1e-5))
+                        self.assertTrue(np.allclose(fetches[1], np_r,
+                                                    atol=1e-5))
 
         tensor_shapes = [
             (3, 5),
@@ -256,8 +263,8 @@ def run_qr_static(shape, mode, dtype):
         ]
         modes = ["reduced", "complete", "r"]
         dtypes = ["float32", "float64"]
-        for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes,
-                                                           dtypes):
+        for tensor_shape, mode, dtype in itertools.product(
+                tensor_shapes, modes, dtypes):
             run_qr_static(tensor_shape, mode, dtype)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py b/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py
index f0368cd2bc34f..3831abd16735f 100644
--- a/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py
+++ b/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py
@@ -135,10 +135,14 @@ def test_quantile_multiple_axis(self):
 
     def test_quantile_multiple_axis_keepdim(self):
         x = paddle.to_tensor(self.input_data)
-        paddle_res = paddle.quantile(
-            x, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdim=True)
-        np_res = np.quantile(
-            self.input_data, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdims=True)
+        paddle_res = paddle.quantile(x,
+                                     q=[0.1, 0.2, 0.3],
+                                     axis=[1, 2],
+                                     keepdim=True)
+        np_res = np.quantile(self.input_data,
+                             q=[0.1, 0.2, 0.3],
+                             axis=[1, 2],
+                             keepdims=True)
         self.assertTrue(np.allclose(paddle_res.numpy(), np_res))
 
 
@@ -239,12 +243,12 @@ def test_static(self):
         paddle.enable_static()
         for (func, res_func) in API_list:
             for device in self.devices:
-                x = paddle.static.data(
-                    name="x", shape=self.input_data.shape, dtype=paddle.float32)
-                x_fp64 = paddle.static.data(
-                    name="x_fp64",
-                    shape=self.input_data.shape,
-                    dtype=paddle.float64)
+                x = paddle.static.data(name="x",
+                                       shape=self.input_data.shape,
+                                       dtype=paddle.float32)
+                x_fp64 = paddle.static.data(name="x_fp64",
+                                            shape=self.input_data.shape,
+                                            dtype=paddle.float64)
 
                 results = func(x, q=0.5, axis=1)
                 np_input_data = self.input_data.astype('float32')
@@ -254,14 +258,16 @@ def test_static(self):
                 exe = paddle.static.Executor(device)
                 paddle_res, paddle_res_fp64 = exe.run(
                     paddle.static.default_main_program(),
-                    feed={"x": np_input_data,
-                          "x_fp64": np_input_data_fp64},
+                    feed={
+                        "x": np_input_data,
+                        "x_fp64": np_input_data_fp64
+                    },
                     fetch_list=[results, results_fp64])
                 np_res = res_func(np_input_data, q=0.5, axis=1)
                 np_res_fp64 = res_func(np_input_data_fp64, q=0.5, axis=1)
                 self.assertTrue(
-                    np.allclose(paddle_res, np_res) and
-                    np.allclose(paddle_res_fp64, np_res_fp64))
+                    np.allclose(paddle_res, np_res)
+                    and np.allclose(paddle_res_fp64, np_res_fp64))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_query_op.py b/python/paddle/fluid/tests/unittests/test_query_op.py
index fc8ce5ad5f6b8..ced08a0fc5383 100644
--- a/python/paddle/fluid/tests/unittests/test_query_op.py
+++ b/python/paddle/fluid/tests/unittests/test_query_op.py
@@ -20,6 +20,7 @@
 
 
 class TestCudnnVersion(unittest.TestCase):
+
     def test_no_cudnn(self):
         cudnn_version = paddle.get_cudnn_version()
         if not core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_queue.py b/python/paddle/fluid/tests/unittests/test_queue.py
index cfb843d75eb78..a4b01870d3239 100644
--- a/python/paddle/fluid/tests/unittests/test_queue.py
+++ b/python/paddle/fluid/tests/unittests/test_queue.py
@@ -24,6 +24,7 @@
 
 
 class TestQueue(unittest.TestCase):
+
     def test_eq(self):
         """
         test queue_generator op, enqueue op and dequeue op.
@@ -33,36 +34,33 @@ def test_eq(self):
         startup_program = fluid.Program()
         value = np.random.rand(1)
         with fluid.program_guard(main_program, startup_program):
-            data_in = layers.create_global_var(
-                shape=[2, 3],
-                value=value,
-                dtype="float32",
-                persistable=True,
-                name='var_in')
-            data_out = layers.create_global_var(
-                shape=[2, 3],
-                value=value - 1.0,
-                dtype="float32",
-                persistable=True,
-                name='var_out')
+            data_in = layers.create_global_var(shape=[2, 3],
+                                               value=value,
+                                               dtype="float32",
+                                               persistable=True,
+                                               name='var_in')
+            data_out = layers.create_global_var(shape=[2, 3],
+                                                value=value - 1.0,
+                                                dtype="float32",
+                                                persistable=True,
+                                                name='var_out')
         startup_block = startup_program.block(0)
         queue_name = 'blocking_queue'
-        startup_block.create_var(
-            name=queue_name, persistable=True, type=core.VarDesc.VarType.RAW)
-        startup_block.append_op(
-            type="queue_generator", attrs={'names': [queue_name]})
+        startup_block.create_var(name=queue_name,
+                                 persistable=True,
+                                 type=core.VarDesc.VarType.RAW)
+        startup_block.append_op(type="queue_generator",
+                                attrs={'names': [queue_name]})
         block = main_program.block(0)
-        block.append_op(
-            type='enqueue',
-            inputs={'X': data_in},
-            attrs={'queue_name': queue_name})
-        block.append_op(
-            type='dequeue',
-            outputs={'Out': [data_out]},
-            attrs={'queue_name': queue_name})
+        block.append_op(type='enqueue',
+                        inputs={'X': data_in},
+                        attrs={'queue_name': queue_name})
+        block.append_op(type='dequeue',
+                        outputs={'Out': [data_out]},
+                        attrs={'queue_name': queue_name})
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup_program)
         ret = exe.run(main_program, fetch_list=[data_out.name])
diff --git a/python/paddle/fluid/tests/unittests/test_rad2deg.py b/python/paddle/fluid/tests/unittests/test_rad2deg.py
index 9f117cbab9a4d..0299884a8bb7d 100644
--- a/python/paddle/fluid/tests/unittests/test_rad2deg.py
+++ b/python/paddle/fluid/tests/unittests/test_rad2deg.py
@@ -26,10 +26,11 @@
 
 
 class TestRad2degAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_dtype = 'float64'
-        self.x_np = np.array(
-            [3.142, -3.142, 6.283, -6.283, 1.570, -1.570]).astype(np.float64)
+        self.x_np = np.array([3.142, -3.142, 6.283, -6.283, 1.570,
+                              -1.570]).astype(np.float64)
         self.x_shape = [6]
         self.out_np = np.rad2deg(self.x_np)
 
@@ -40,8 +41,8 @@ def test_static_graph(self):
             x = fluid.data(name='input', dtype=self.x_dtype, shape=self.x_shape)
             out = paddle.rad2deg(x)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             res = exe.run(fluid.default_main_program(),
                           feed={'input': self.x_np},
@@ -58,6 +59,7 @@ def test_dygraph(self):
 
 
 class TestRad2degAPI2(TestRad2degAPI):
+
     def setUp(self):
         self.x_np = np.pi / 2
         self.x_shape = [1]
diff --git a/python/paddle/fluid/tests/unittests/test_rand_op.py b/python/paddle/fluid/tests/unittests/test_rand_op.py
index 4b8fe8c7e4786..d8b4de6036e6d 100644
--- a/python/paddle/fluid/tests/unittests/test_rand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rand_op.py
@@ -35,8 +35,8 @@ def test_errors(self):
         with program_guard(main_prog, start_prog):
 
             def test_Variable():
-                x1 = fluid.create_lod_tensor(
-                    np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.zeros((4, 784)), [[1, 1, 1, 1]],
+                                             fluid.CPUPlace())
                 rand(x1)
 
             self.assertRaises(TypeError, test_Variable)
@@ -71,8 +71,9 @@ def run_net(self, use_cuda=False):
             var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64")
             result_3 = rand(var_shape)
 
-            var_shape_int32 = fluid.data(
-                name='var_shape_int32', shape=[2], dtype="int32")
+            var_shape_int32 = fluid.data(name='var_shape_int32',
+                                         shape=[2],
+                                         dtype="int32")
             result_4 = rand(var_shape_int32)
 
         exe.run(startup_program)
@@ -81,8 +82,10 @@ def run_net(self, use_cuda=False):
         x2 = np.array([4, 3]).astype('int32')
         ret = exe.run(
             train_program,
-            feed={"var_shape": x1,
-                  "var_shape_int32": x2},
+            feed={
+                "var_shape": x1,
+                "var_shape_int32": x2
+            },
             fetch_list=[result_1, result_1, result_2, result_3, result_4])
 
     def test_run(self):
@@ -117,6 +120,7 @@ def test_run(self):
 
 
 class TestRandDtype(unittest.TestCase):
+
     def test_default_dtype(self):
         paddle.disable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_randint_like.py b/python/paddle/fluid/tests/unittests/test_randint_like.py
index c716fd549244b..181a7f9763eb8 100644
--- a/python/paddle/fluid/tests/unittests/test_randint_like.py
+++ b/python/paddle/fluid/tests/unittests/test_randint_like.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 # Test python API
 class TestRandintLikeAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_bool = np.zeros((10, 12)).astype("bool")
         self.x_int32 = np.zeros((10, 12)).astype("int32")
@@ -38,25 +39,30 @@ def test_static_api(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
             # results are from [-100, 100).
-            x_bool = paddle.fluid.data(
-                name="x_bool", shape=[10, 12], dtype="bool")
-            x_int32 = paddle.fluid.data(
-                name="x_int32", shape=[10, 12], dtype="int32")
-            x_int64 = paddle.fluid.data(
-                name="x_int64", shape=[10, 12], dtype="int64")
-            x_float16 = paddle.fluid.data(
-                name="x_float16", shape=[10, 12], dtype="float16")
-            x_float32 = paddle.fluid.data(
-                name="x_float32", shape=[10, 12], dtype="float32")
-            x_float64 = paddle.fluid.data(
-                name="x_float64", shape=[10, 12], dtype="float64")
+            x_bool = paddle.fluid.data(name="x_bool",
+                                       shape=[10, 12],
+                                       dtype="bool")
+            x_int32 = paddle.fluid.data(name="x_int32",
+                                        shape=[10, 12],
+                                        dtype="int32")
+            x_int64 = paddle.fluid.data(name="x_int64",
+                                        shape=[10, 12],
+                                        dtype="int64")
+            x_float16 = paddle.fluid.data(name="x_float16",
+                                          shape=[10, 12],
+                                          dtype="float16")
+            x_float32 = paddle.fluid.data(name="x_float32",
+                                          shape=[10, 12],
+                                          dtype="float32")
+            x_float64 = paddle.fluid.data(name="x_float64",
+                                          shape=[10, 12],
+                                          dtype="float64")
 
             exe = paddle.static.Executor(self.place)
 
             # x dtype is bool output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
             outlist1 = [
-                paddle.randint_like(
-                    x_bool, low=-10, high=10, dtype=dtype)
+                paddle.randint_like(x_bool, low=-10, high=10, dtype=dtype)
                 for dtype in self.dtype
             ]
             outs1 = exe.run(feed={'x_bool': self.x_bool}, fetch_list=outlist1)
@@ -66,8 +72,7 @@ def test_static_api(self):
 
             # x dtype is int32 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
             outlist2 = [
-                paddle.randint_like(
-                    x_int32, low=-5, high=10, dtype=dtype)
+                paddle.randint_like(x_int32, low=-5, high=10, dtype=dtype)
                 for dtype in self.dtype
             ]
             outs2 = exe.run(feed={'x_int32': self.x_int32}, fetch_list=outlist2)
@@ -77,8 +82,7 @@ def test_static_api(self):
 
             # x dtype is int64 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
             outlist3 = [
-                paddle.randint_like(
-                    x_int64, low=-100, high=100, dtype=dtype)
+                paddle.randint_like(x_int64, low=-100, high=100, dtype=dtype)
                 for dtype in self.dtype
             ]
             outs3 = exe.run(feed={'x_int64': self.x_int64}, fetch_list=outlist3)
@@ -88,8 +92,7 @@ def test_static_api(self):
 
             # x dtype is float16 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
             outlist4 = [
-                paddle.randint_like(
-                    x_float16, low=-3, high=25, dtype=dtype)
+                paddle.randint_like(x_float16, low=-3, high=25, dtype=dtype)
                 for dtype in self.dtype
             ]
             outs4 = exe.run(feed={'x_float16': self.x_float16},
@@ -100,8 +103,7 @@ def test_static_api(self):
 
             # x dtype is float32 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
             outlist5 = [
-                paddle.randint_like(
-                    x_float32, low=-25, high=25, dtype=dtype)
+                paddle.randint_like(x_float32, low=-25, high=25, dtype=dtype)
                 for dtype in self.dtype
             ]
             outs5 = exe.run(feed={'x_float32': self.x_float32},
@@ -112,8 +114,7 @@ def test_static_api(self):
 
             # x dtype is float64 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
             outlist6 = [
-                paddle.randint_like(
-                    x_float64, low=-16, high=16, dtype=dtype)
+                paddle.randint_like(x_float64, low=-16, high=16, dtype=dtype)
                 for dtype in self.dtype
             ]
             outs6 = exe.run(feed={'x_float64': self.x_float64},
@@ -132,34 +133,45 @@ def test_dygraph_api(self):
             x_inputs = paddle.to_tensor(x)
             # self.dtype ["bool", "int32", "int64", "float16", "float32", "float64"]
             for dtype in self.dtype:
-                out = paddle.randint_like(
-                    x_inputs, low=-100, high=100, dtype=dtype)
+                out = paddle.randint_like(x_inputs,
+                                          low=-100,
+                                          high=100,
+                                          dtype=dtype)
                 self.assertTrue(out.numpy().dtype, np.dtype(dtype))
-                self.assertTrue(((out.numpy() >= -100) &
-                                 (out.numpy() <= 100)).all(), True)
+                self.assertTrue(
+                    ((out.numpy() >= -100) & (out.numpy() <= 100)).all(), True)
 
         paddle.enable_static()
 
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
-            x_bool = paddle.fluid.data(
-                name="x_bool", shape=[10, 12], dtype="bool")
-            x_int32 = paddle.fluid.data(
-                name="x_int32", shape=[10, 12], dtype="int32")
-            x_int64 = paddle.fluid.data(
-                name="x_int64", shape=[10, 12], dtype="int64")
-            x_float16 = paddle.fluid.data(
-                name="x_float16", shape=[10, 12], dtype="float16")
-            x_float32 = paddle.fluid.data(
-                name="x_float32", shape=[10, 12], dtype="float32")
-            x_float64 = paddle.fluid.data(
-                name="x_float64", shape=[10, 12], dtype="float64")
+            x_bool = paddle.fluid.data(name="x_bool",
+                                       shape=[10, 12],
+                                       dtype="bool")
+            x_int32 = paddle.fluid.data(name="x_int32",
+                                        shape=[10, 12],
+                                        dtype="int32")
+            x_int64 = paddle.fluid.data(name="x_int64",
+                                        shape=[10, 12],
+                                        dtype="int64")
+            x_float16 = paddle.fluid.data(name="x_float16",
+                                          shape=[10, 12],
+                                          dtype="float16")
+            x_float32 = paddle.fluid.data(name="x_float32",
+                                          shape=[10, 12],
+                                          dtype="float32")
+            x_float64 = paddle.fluid.data(name="x_float64",
+                                          shape=[10, 12],
+                                          dtype="float64")
 
             # x dtype is bool
             # low is 5 and high is 5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_bool, low=5, high=5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_bool,
+                              low=5,
+                              high=5)
             # low(default value) is 0 and high is -5, low must less then high
             self.assertRaises(ValueError, paddle.randint_like, x_bool, high=-5)
             # if high is None, low must be greater than 0
@@ -167,8 +179,11 @@ def test_errors(self):
 
             # x dtype is int32
             # low is 5 and high is 5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_int32, low=5, high=5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_int32,
+                              low=5,
+                              high=5)
             # low(default value) is 0 and high is -5, low must less then high
             self.assertRaises(ValueError, paddle.randint_like, x_int32, high=-5)
             # if high is None, low must be greater than 0
@@ -176,8 +191,11 @@ def test_errors(self):
 
             # x dtype is int64
             # low is 5 and high is 5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_int64, low=5, high=5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_int64,
+                              low=5,
+                              high=5)
             # low(default value) is 0 and high is -5, low must less then high
             self.assertRaises(ValueError, paddle.randint_like, x_int64, high=-5)
             # if high is None, low must be greater than 0
@@ -185,36 +203,57 @@ def test_errors(self):
 
             # x dtype is float16
             # low is 5 and high is 5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float16, low=5, high=5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float16,
+                              low=5,
+                              high=5)
             # low(default value) is 0 and high is -5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float16, high=-5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float16,
+                              high=-5)
             # if high is None, low must be greater than 0
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float16, low=-5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float16,
+                              low=-5)
 
             # x dtype is float32
             # low is 5 and high is 5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float32, low=5, high=5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float32,
+                              low=5,
+                              high=5)
             # low(default value) is 0 and high is -5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float32, high=-5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float32,
+                              high=-5)
             # if high is None, low must be greater than 0
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float32, low=-5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float32,
+                              low=-5)
 
             # x dtype is float64
             # low is 5 and high is 5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float64, low=5, high=5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float64,
+                              low=5,
+                              high=5)
             # low(default value) is 0 and high is -5, low must less then high
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float64, high=-5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float64,
+                              high=-5)
             # if high is None, low must be greater than 0
-            self.assertRaises(
-                ValueError, paddle.randint_like, x_float64, low=-5)
+            self.assertRaises(ValueError,
+                              paddle.randint_like,
+                              x_float64,
+                              low=-5)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py
index 361f4d280f70f..f5d18a9268f40 100644
--- a/python/paddle/fluid/tests/unittests/test_randint_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randint_op.py
@@ -35,6 +35,7 @@ def output_hist(out):
 
 
 class TestRandintOp(OpTest):
+
     def setUp(self):
         self.op_type = "randint"
         self.inputs = {}
@@ -50,9 +51,8 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.001), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.001),
+                        "hist: " + str(hist))
 
     def test_check_output_eager(self):
         with _test_eager_guard():
@@ -60,6 +60,7 @@ def test_check_output_eager(self):
 
 
 class TestRandintOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             self.assertRaises(TypeError, paddle.randint, 5, shape=np.array([2]))
@@ -69,8 +70,10 @@ def test_errors(self):
             self.assertRaises(TypeError, paddle.randint, 5, shape=['2'])
             shape_tensor = paddle.static.data('X', [1])
             self.assertRaises(TypeError, paddle.randint, 5, shape=shape_tensor)
-            self.assertRaises(
-                TypeError, paddle.randint, 5, shape=[shape_tensor])
+            self.assertRaises(TypeError,
+                              paddle.randint,
+                              5,
+                              shape=[shape_tensor])
 
     def test_errors_eager(self):
         with _test_eager_guard():
@@ -78,6 +81,7 @@ def test_errors_eager(self):
 
 
 class TestRandintOp_attr_tensorlist(OpTest):
+
     def setUp(self):
         self.op_type = "randint"
         self.new_shape = (10000, 784)
@@ -98,9 +102,8 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.001), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.001),
+                        "hist: " + str(hist))
 
     def test_check_output_eager(self):
         with _test_eager_guard():
@@ -108,6 +111,7 @@ def test_check_output_eager(self):
 
 
 class TestRandint_attr_tensor(OpTest):
+
     def setUp(self):
         self.op_type = "randint"
         self.inputs = {"ShapeTensor": np.array([10000, 784]).astype("int64")}
@@ -123,9 +127,8 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.001), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.001),
+                        "hist: " + str(hist))
 
     def test_check_output_eager(self):
         with _test_eager_guard():
@@ -134,29 +137,39 @@ def test_check_output_eager(self):
 
 # Test python API
 class TestRandintAPI(unittest.TestCase):
+
     def test_api(self):
         with program_guard(Program(), Program()):
             # results are from [0, 5).
             out1 = paddle.randint(5)
             # shape is a list and dtype is 'int32'
-            out2 = paddle.randint(
-                low=-100, high=100, shape=[64, 64], dtype='int32')
+            out2 = paddle.randint(low=-100,
+                                  high=100,
+                                  shape=[64, 64],
+                                  dtype='int32')
             # shape is a tuple and dtype is 'int64'
-            out3 = paddle.randint(
-                low=-100, high=100, shape=(32, 32, 3), dtype='int64')
+            out3 = paddle.randint(low=-100,
+                                  high=100,
+                                  shape=(32, 32, 3),
+                                  dtype='int64')
             # shape is a tensorlist and dtype is 'float32'
             dim_1 = paddle.fluid.layers.fill_constant([1], "int64", 32)
             dim_2 = paddle.fluid.layers.fill_constant([1], "int32", 50)
-            out4 = paddle.randint(
-                low=-100, high=100, shape=[dim_1, 5, dim_2], dtype='int32')
+            out4 = paddle.randint(low=-100,
+                                  high=100,
+                                  shape=[dim_1, 5, dim_2],
+                                  dtype='int32')
             # shape is a tensor and dtype is 'float64'
-            var_shape = paddle.static.data(
-                name='var_shape', shape=[2], dtype="int64")
-            out5 = paddle.randint(
-                low=1, high=1000, shape=var_shape, dtype='int64')
-
-            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else paddle.CPUPlace()
+            var_shape = paddle.static.data(name='var_shape',
+                                           shape=[2],
+                                           dtype="int64")
+            out5 = paddle.randint(low=1,
+                                  high=1000,
+                                  shape=var_shape,
+                                  dtype='int64')
+
+            place = paddle.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             outs = exe.run(
                 feed={'var_shape': np.array([100, 100]).astype('int64')},
@@ -168,6 +181,7 @@ def test_api_eager(self):
 
 
 class TestRandintImperative(unittest.TestCase):
+
     def test_api(self):
         paddle.disable_static()
 
@@ -189,6 +203,7 @@ def run_test_case(self):
 
 
 class TestRandomValue(unittest.TestCase):
+
     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
@@ -212,8 +227,8 @@ def run_test_case(self):
         paddle.set_device('gpu')
         paddle.seed(100)
 
-        x = paddle.randint(
-            -10000, 10000, [32, 3, 1024, 1024], dtype='int32').numpy()
+        x = paddle.randint(-10000, 10000, [32, 3, 1024, 1024],
+                           dtype='int32').numpy()
         self.assertTrue(x.mean(), -0.7517569760481516)
         self.assertTrue(x.std(), 5773.696619107639)
         expect = [2535, 2109, 5916, -5011, -261]
@@ -223,8 +238,8 @@ def run_test_case(self):
         expect = [881, 1560, 1100, 9664, 1669]
         self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect))
 
-        x = paddle.randint(
-            -10000, 10000, [32, 3, 1024, 1024], dtype='int64').numpy()
+        x = paddle.randint(-10000, 10000, [32, 3, 1024, 1024],
+                           dtype='int64').numpy()
         self.assertTrue(x.mean(), -1.461287518342336)
         self.assertTrue(x.std(), 5773.023477548159)
         expect = [7213, -9597, 754, 8129, -1158]
diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py
index 6d33b468ee1d0..8347411192e5a 100644
--- a/python/paddle/fluid/tests/unittests/test_randn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randn_op.py
@@ -22,6 +22,7 @@
 
 
 class TestRandnOp(unittest.TestCase):
+
     def test_api(self):
         shape = [1000, 784]
         train_program = Program()
@@ -37,12 +38,11 @@ def test_api(self):
             var_shape = paddle.static.data('X', [2], 'int32')
             x4 = paddle.randn(var_shape)
 
-        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         exe = paddle.static.Executor(place)
         res = exe.run(train_program,
-                      feed={'X': np.array(
-                          shape, dtype='int32')},
+                      feed={'X': np.array(shape, dtype='int32')},
                       fetch_list=[x1, x2, x3, x4])
 
         for out in res:
@@ -51,10 +51,11 @@ def test_api(self):
 
 
 class TestRandnOpForDygraph(unittest.TestCase):
+
     def test_api(self):
         shape = [1000, 784]
-        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         paddle.disable_static(place)
         x1 = paddle.randn(shape, 'float32')
         x2 = paddle.randn(shape, 'float64')
@@ -73,6 +74,7 @@ def test_api(self):
 
 
 class TestRandnOpError(unittest.TestCase):
+
     def test_error(self):
         with program_guard(Program(), Program()):
             # The argument shape's size of randn_op should not be 0.
diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
index 98e060f69d24d..7b15899bab260 100644
--- a/python/paddle/fluid/tests/unittests/test_random_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
@@ -22,6 +22,7 @@
 
 
 class TestRandomCropOp(OpTest):
+
     def setUp(self):
         to_crop = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] *
                            5).astype(np.int32)
@@ -47,6 +48,7 @@ def verify_output(self, outs):
 
 
 class TestRandomCropOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program()):
 
@@ -57,15 +59,17 @@ def test_x_type():
             self.assertRaises(TypeError, test_x_type)
 
             def test_x_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[None, 3, 256, 256], dtype='float16')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[None, 3, 256, 256],
+                                       dtype='float16')
                 fluid.layers.random_crop(x2)
 
             self.assertRaises(TypeError, test_x_dtype)
 
             def test_shape_type():
-                x3 = fluid.layers.data(
-                    name='x3', shape=[None, 3, 256, 256], dtype='float32')
+                x3 = fluid.layers.data(name='x3',
+                                       shape=[None, 3, 256, 256],
+                                       dtype='float32')
                 fluid.layers.random_crop(x3, shape=1)
 
             self.assertRaises(TypeError, test_shape_type)
diff --git a/python/paddle/fluid/tests/unittests/test_random_routing_op.py b/python/paddle/fluid/tests/unittests/test_random_routing_op.py
index e4bb7c5ca5fd8..d4eadd268cc28 100644
--- a/python/paddle/fluid/tests/unittests/test_random_routing_op.py
+++ b/python/paddle/fluid/tests/unittests/test_random_routing_op.py
@@ -42,14 +42,15 @@ def random_routing(topk_idx, topk_value, prob, topk=2):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestNumberCountAPIFp32(unittest.TestCase):
+
     def setUp(self):
         self.dtype = "float32"
         self.init()
 
     def init(self):
         self.upper_range = 8
-        self.x = np.random.randint(
-            -1, self.upper_range, size=(200, 2)).astype('int64')
+        self.x = np.random.randint(-1, self.upper_range,
+                                   size=(200, 2)).astype('int64')
         self.prob = np.random.random((self.x.shape[0], )).astype(self.dtype)
         self.topk_value = np.random.random(self.x.shape).astype(self.dtype)
         self.out = random_routing(self.x, self.topk_value,
@@ -73,6 +74,7 @@ def test_api_dygraph(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestNumberCountAPIFp16(TestNumberCountAPIFp32):
+
     def setUp(self):
         self.dtype = "float16"
         self.init()
diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py
index 617c0e61da8ac..f4d16a0a81ea9 100644
--- a/python/paddle/fluid/tests/unittests/test_random_seed.py
+++ b/python/paddle/fluid/tests/unittests/test_random_seed.py
@@ -39,17 +39,23 @@ def test_generator_uniform_random_dygraph(self):
         x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0)
 
         st1 = gen.get_state()
-        x1 = fluid.layers.uniform_random(
-            [10], dtype="float32", min=0.0, max=1.0)
+        x1 = fluid.layers.uniform_random([10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
 
         gen.set_state(st1)
         print(gen.get_state())
-        x2 = fluid.layers.uniform_random(
-            [10], dtype="float32", min=0.0, max=1.0)
+        x2 = fluid.layers.uniform_random([10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
 
         paddle.seed(12312321111)
-        x3 = fluid.layers.uniform_random(
-            [10], dtype="float32", min=0.0, max=1.0)
+        x3 = fluid.layers.uniform_random([10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
 
         x_np = x.numpy()
         x1_np = x1.numpy()
@@ -100,13 +106,17 @@ def test_gen_dropout_dygraph(self):
         gen = paddle.seed(111111111)
         st = gen.get_state()
         # x = np.arange(1,101).reshape(2,50).astype("float32")
-        x = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
+        x = fluid.layers.uniform_random([2, 10],
+                                        dtype="float32",
+                                        min=0.0,
+                                        max=1.0)
         y = fluid.layers.dropout(x, 0.5)
         gen.manual_seed(111111111)
         #gen.set_state(st)
-        x1 = fluid.layers.uniform_random(
-            [2, 10], dtype="float32", min=0.0, max=1.0)
+        x1 = fluid.layers.uniform_random([2, 10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
         y1 = fluid.layers.dropout(x1, 0.5)
         y_np = y.numpy()
         y1_np = y1.numpy()
@@ -376,23 +386,31 @@ def test_generator_sampling_id_dygraph(self):
         fluid.enable_dygraph()
 
         gen.manual_seed(12312321111)
-        x = fluid.layers.uniform_random(
-            [10, 10], dtype="float32", min=0.0, max=1.0)
+        x = fluid.layers.uniform_random([10, 10],
+                                        dtype="float32",
+                                        min=0.0,
+                                        max=1.0)
         y = fluid.layers.sampling_id(x)
 
         st1 = gen.get_state()
-        x1 = fluid.layers.uniform_random(
-            [10, 10], dtype="float32", min=0.0, max=1.0)
+        x1 = fluid.layers.uniform_random([10, 10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
         y1 = fluid.layers.sampling_id(x)
 
         gen.set_state(st1)
-        x2 = fluid.layers.uniform_random(
-            [10, 10], dtype="float32", min=0.0, max=1.0)
+        x2 = fluid.layers.uniform_random([10, 10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
         y2 = fluid.layers.sampling_id(x)
 
         gen.manual_seed(12312321111)
-        x3 = fluid.layers.uniform_random(
-            [10, 10], dtype="float32", min=0.0, max=1.0)
+        x3 = fluid.layers.uniform_random([10, 10],
+                                         dtype="float32",
+                                         min=0.0,
+                                         max=1.0)
         y3 = fluid.layers.sampling_id(x)
 
         x_np = y.numpy()
@@ -457,13 +475,13 @@ def test_gen_TruncatedNormal_initializer(self):
             result_1 = fluid.layers.fc(
                 input=x,
                 size=10,
-                param_attr=fluid.initializer.TruncatedNormal(
-                    loc=0.0, scale=2.0))
+                param_attr=fluid.initializer.TruncatedNormal(loc=0.0,
+                                                             scale=2.0))
             result_2 = fluid.layers.fc(
                 input=x,
                 size=10,
-                param_attr=fluid.initializer.TruncatedNormal(
-                    loc=0.0, scale=2.0))
+                param_attr=fluid.initializer.TruncatedNormal(loc=0.0,
+                                                             scale=2.0))
 
             exe = fluid.Executor(fluid.CPUPlace())
             exe.run(startup_program)
diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py
index deb0a9a082140..5a75e83939711 100644
--- a/python/paddle/fluid/tests/unittests/test_randperm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -71,8 +71,8 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         out_np = np.array(outs[0])
-        self.assertTrue(
-            check_randperm_out(self.n, out_np), msg=error_msg(out_np))
+        self.assertTrue(check_randperm_out(self.n, out_np),
+                        msg=error_msg(out_np))
 
     def test_eager(self):
         with _test_eager_guard():
@@ -80,26 +80,31 @@ def test_eager(self):
 
 
 class TestRandpermOpN(TestRandpermOp):
+
     def init_attrs(self):
         self.n = 10000
 
 
 class TestRandpermOpInt32(TestRandpermOp):
+
     def init_attrs(self):
         self.dtype = "int32"
 
 
 class TestRandpermOpFloat32(TestRandpermOp):
+
     def init_attrs(self):
         self.dtype = "float32"
 
 
 class TestRandpermOpFloat64(TestRandpermOp):
+
     def init_attrs(self):
         self.dtype = "float64"
 
 
 class TestRandpermOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             self.assertRaises(ValueError, paddle.randperm, -3)
@@ -107,10 +112,11 @@ def test_errors(self):
 
 
 class TestRandpermAPI(unittest.TestCase):
+
     def test_out(self):
         n = 10
-        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         with program_guard(Program(), Program()):
             x1 = paddle.randperm(n)
             x2 = paddle.randperm(n, 'float32')
@@ -125,18 +131,20 @@ def test_out(self):
 
 
 class TestRandpermImperative(unittest.TestCase):
+
     def test_out(self):
         paddle.disable_static()
         n = 10
         for dtype in ['int32', np.int64, 'float32', 'float64']:
             data_p = paddle.randperm(n, dtype)
             data_np = data_p.numpy()
-            self.assertTrue(
-                check_randperm_out(n, data_np), msg=error_msg(data_np))
+            self.assertTrue(check_randperm_out(n, data_np),
+                            msg=error_msg(data_np))
         paddle.enable_static()
 
 
 class TestRandpermEager(unittest.TestCase):
+
     def test_out(self):
         paddle.disable_static()
         n = 10
@@ -144,12 +152,13 @@ def test_out(self):
             for dtype in ['int32', np.int64, 'float32', 'float64']:
                 data_p = paddle.randperm(n, dtype)
                 data_np = data_p.numpy()
-                self.assertTrue(
-                    check_randperm_out(n, data_np), msg=error_msg(data_np))
+                self.assertTrue(check_randperm_out(n, data_np),
+                                msg=error_msg(data_np))
         paddle.enable_static()
 
 
 class TestRandomValue(unittest.TestCase):
+
     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_range.py b/python/paddle/fluid/tests/unittests/test_range.py
index e19c1b227f531..3df893b0b59e7 100644
--- a/python/paddle/fluid/tests/unittests/test_range.py
+++ b/python/paddle/fluid/tests/unittests/test_range.py
@@ -26,6 +26,7 @@ def arange_wrapper(start, end, step, dtype=None):
 
 
 class TestRangeOp(OpTest):
+
     def setUp(self):
         self.op_type = "range"
         self.init_config()
@@ -36,8 +37,9 @@ def setUp(self):
         }
 
         self.outputs = {
-            'Out': np.arange(self.case[0], self.case[1],
-                             self.case[2]).astype(self.dtype)
+            'Out':
+            np.arange(self.case[0], self.case[1],
+                      self.case[2]).astype(self.dtype)
         }
 
     def init_config(self):
@@ -50,6 +52,7 @@ def test_check_output(self):
 
 
 class TestFloatRangeOpCase0(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.float32
         self.python_api = partial(arange_wrapper, dtype=self.dtype)
@@ -57,6 +60,7 @@ def init_config(self):
 
 
 class TestInt32RangeOpCase0(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int32
         self.python_api = partial(arange_wrapper, dtype=self.dtype)
@@ -64,6 +68,7 @@ def init_config(self):
 
 
 class TestInt32RangeOpCase1(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int32
         self.python_api = partial(arange_wrapper, dtype=self.dtype)
@@ -71,6 +76,7 @@ def init_config(self):
 
 
 class TestInt32RangeOpCase2(TestRangeOp):
+
     def init_config(self):
         self.dtype = np.int32
         self.python_api = partial(arange_wrapper, dtype=self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_rank_attention_op.py b/python/paddle/fluid/tests/unittests/test_rank_attention_op.py
index 64d564c223f8d..1cca1378232f8 100644
--- a/python/paddle/fluid/tests/unittests/test_rank_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rank_attention_op.py
@@ -144,6 +144,7 @@ def gen_rank_offset(pv_nums, max_rank):
 
 
 class TestRankAttentionOpComplex(OpTest):
+
     def config(self):
         self.pv_num = 100
         self.x_feat = 10
@@ -161,8 +162,8 @@ def setUp(self):
         ]
         rank_para = np.random.random(rank_para_shape).astype(self.dtype)
         np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention(
-            input,
-            np.array(rank_offset), rank_para, self.max_rank, self.pv_num * 7)
+            input, np.array(rank_offset), rank_para, self.max_rank,
+            self.pv_num * 7)
         self.inputs = {
             "X": input,
             "RankOffset": np.array(rank_offset).astype("int32"),
@@ -185,6 +186,7 @@ def test_check_grad_gpu(self):
 
 
 class TestRankAttentionOpCpu(OpTest):
+
     def config(self):
         self.pv_num = 100
         self.x_feat = 10
@@ -202,8 +204,8 @@ def setUp(self):
         ]
         rank_para = np.random.random(rank_para_shape).astype(self.dtype)
         np_out, np_input_help, np_param_help, np_ins_rank = np_rank_attention(
-            input,
-            np.array(rank_offset), rank_para, self.max_rank, self.pv_num * 7)
+            input, np.array(rank_offset), rank_para, self.max_rank,
+            self.pv_num * 7)
         self.inputs = {
             "X": input,
             "RankOffset": np.array(rank_offset).astype("int32"),
diff --git a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
index c4851bc274b82..eb29c68daf7b9 100644
--- a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
@@ -22,6 +22,7 @@
 
 
 class TestRankLossOp(OpTest):
+
     def setUp(self):
         self.op_type = "rank_loss"
         shape = (100, 1)
@@ -57,36 +58,42 @@ def test_check_grad_ignore_right(self):
 
 
 class TestRankLossOp1(TestRankLossOp):
+
     def set_shape(self):
         batch_size = 100
         return (batch_size), (batch_size, 1), (batch_size, 1)
 
 
 class TestRankLossOp2(TestRankLossOp):
+
     def set_shape(self):
         batch_size = 100
         return (batch_size, 1), (batch_size), (batch_size, 1)
 
 
 class TestRankLossOp3(TestRankLossOp):
+
     def set_shape(self):
         batch_size = 100
         return (batch_size, 1), (batch_size, 1), (batch_size)
 
 
 class TestRankLossOp4(TestRankLossOp):
+
     def set_shape(self):
         batch_size = 100
         return (batch_size), (batch_size), (batch_size, 1)
 
 
 class TestRankLossOp5(TestRankLossOp):
+
     def set_shape(self):
         batch_size = 100
         return (batch_size), (batch_size), (batch_size)
 
 
 class TestRankLossOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             label = fluid.data(name="label", shape=[16, 1], dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py
index 34930e3577b9b..43108fb4ab4d6 100644
--- a/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py
@@ -25,6 +25,7 @@
 
 
 class TestRawProgramOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
@@ -35,16 +36,15 @@ def mlp(self, input_x, input_y, hid_dim=128, label_dim=2):
         prediction = paddle.static.nn.fc(x=[fc_2],
                                          size=label_dim,
                                          activation='softmax')
-        cost = paddle.nn.functional.cross_entropy(
-            input=prediction, label=input_y)
+        cost = paddle.nn.functional.cross_entropy(input=prediction,
+                                                  label=input_y)
         avg_cost = paddle.mean(x=cost)
         return avg_cost
 
     def gen_data(self):
         return {
             "x": np.random.random(size=(128, 32)).astype('float32'),
-            "y": np.random.randint(
-                2, size=(128, 1)).astype('int64')
+            "y": np.random.randint(2, size=(128, 1)).astype('int64')
         }
 
     def test_single_gpu(self):
@@ -56,10 +56,12 @@ def test_single_gpu(self):
         strategy.without_graph_optimization = True
         with fluid.program_guard(sharding_program, sharding_startup_program):
             with fluid.unique_name.guard():
-                input_x = paddle.static.data(
-                    name="x", shape=[None, 32], dtype='float32')
-                input_y = paddle.static.data(
-                    name="y", shape=[None, 1], dtype='int64')
+                input_x = paddle.static.data(name="x",
+                                             shape=[None, 32],
+                                             dtype='float32')
+                input_y = paddle.static.data(name="y",
+                                             shape=[None, 1],
+                                             dtype='int64')
                 cost = self.mlp(input_x=input_x, input_y=input_y)
                 output_name = cost.name
                 optimizer = fleet.distributed_optimizer(fluid.optimizer.Adam(),
diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
index 2cef896aa75f5..bb69083e785f2 100644
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import os
+
 os.environ['CPU_NUM'] = str(1)
 import paddle.fluid as fluid
 from paddle.fluid import compiler
@@ -23,7 +24,9 @@
 
 
 class TestReaderReset(unittest.TestCase):
+
     def prepare_data(self):
+
         def fake_data_generator():
             for n in range(self.total_ins_num):
                 yield np.ones(self.ins_shape) * n, n
@@ -44,8 +47,9 @@ def main(self, with_double_buffer):
         startup_prog = fluid.Program()
 
         with fluid.program_guard(main_prog, startup_prog):
-            image = fluid.layers.data(
-                name='image', shape=self.ins_shape, dtype='float32')
+            image = fluid.layers.data(name='image',
+                                      shape=self.ins_shape,
+                                      dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             data_reader_handle = fluid.io.PyReader(
                 feed_list=[image, label],
@@ -59,8 +63,7 @@ def main(self, with_double_buffer):
         exe.run(startup_prog)
 
         data_reader_handle.decorate_sample_list_generator(
-            paddle.batch(
-                self.prepare_data(), batch_size=self.batch_size))
+            paddle.batch(self.prepare_data(), batch_size=self.batch_size))
 
         train_cp = compiler.CompiledProgram(main_prog).with_data_parallel(
             places=[place])
@@ -75,8 +78,9 @@ def main(self, with_double_buffer):
                                                   fetch_list=fetch_list,
                                                   return_numpy=True)
                     ins_num = data_val.shape[0]
-                    broadcasted_label = np.ones((ins_num, ) + tuple(
-                        self.ins_shape)) * label_val.reshape((ins_num, 1))
+                    broadcasted_label = np.ones((
+                        ins_num, ) + tuple(self.ins_shape)) * label_val.reshape(
+                            (ins_num, 1))
                     self.assertEqual(data_val.all(), broadcasted_label.all())
                     batch_id += 1
             except fluid.core.EOFException:
diff --git a/python/paddle/fluid/tests/unittests/test_real_imag_op.py b/python/paddle/fluid/tests/unittests/test_real_imag_op.py
index 523f48374eab9..1402585c03745 100644
--- a/python/paddle/fluid/tests/unittests/test_real_imag_op.py
+++ b/python/paddle/fluid/tests/unittests/test_real_imag_op.py
@@ -34,6 +34,7 @@
 
 
 class TestRealOp(OpTest):
+
     def setUp(self):
         # switch to static
         paddle.enable_static()
@@ -47,7 +48,8 @@ def setUp(self):
 
     def init_input_output(self):
         self.inputs = {
-            'X': np.random.random(
+            'X':
+            np.random.random(
                 (20, 5)).astype(self.dtype) + 1j * np.random.random(
                     (20, 5)).astype(self.dtype)
         }
@@ -55,22 +57,22 @@ def init_input_output(self):
 
     def init_grad_input_output(self):
         self.grad_out = np.ones((20, 5), self.dtype)
-        self.grad_x = np.real(self.grad_out) + 1j * np.zeros(
-            self.grad_out.shape)
+        self.grad_x = np.real(
+            self.grad_out) + 1j * np.zeros(self.grad_out.shape)
 
     def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=[self.grad_x],
+                        user_defined_grad_outputs=[self.grad_out],
+                        check_eager=True)
 
 
 class TestImagOp(TestRealOp):
+
     def setUp(self):
         # switch to static
         paddle.enable_static()
@@ -84,11 +86,12 @@ def setUp(self):
 
     def init_grad_input_output(self):
         self.grad_out = np.ones((20, 5), self.dtype)
-        self.grad_x = np.zeros(self.grad_out.shape) + 1j * np.real(
-            self.grad_out)
+        self.grad_x = np.zeros(
+            self.grad_out.shape) + 1j * np.real(self.grad_out)
 
 
 class TestRealAPI(unittest.TestCase):
+
     def setUp(self):
         # switch to static
         paddle.enable_static()
@@ -101,6 +104,7 @@ def setUp(self):
         self._shape = [2, 20, 2, 3]
 
     def test_in_static_mode(self):
+
         def init_input_output(dtype):
             input = np.random.random(self._shape).astype(
                 dtype) + 1j * np.random.random(self._shape).astype(dtype)
@@ -154,6 +158,7 @@ def test_dtype_error(self):
 
 
 class TestImagAPI(TestRealAPI):
+
     def setUp(self):
         # switch to static
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
index a8adee742c612..568d57c09355f 100644
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -29,6 +29,7 @@
 
 
 class PyRNNBase(object):
+
     def __init__(self, input_shape, output_shape):
         self.x = np.ones(shape=input_shape).astype("float32")
         self.y = np.zeros(shape=output_shape).astype("float32")
@@ -46,6 +47,7 @@ def segment_inputs(self):
 
 
 class PySimpleRNN1(PyRNNBase):
+
     def __init__(self, input_shape, output_shape):
         super(PySimpleRNN1, self).__init__(input_shape, output_shape)
 
@@ -67,6 +69,7 @@ def step(self, step_id, x):
 
 
 class PySimpleRNN2(PyRNNBase):
+
     def __init__(self, input_shape, output_shape):
         super(PySimpleRNN2, self).__init__(input_shape, output_shape)
 
@@ -134,14 +137,14 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
+        h_boot = layers.data(shape=[self.input_dim],
+                             dtype='float32',
+                             name='h_boot')
         h_boot.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -149,10 +152,8 @@ def create_rnn_op(self):
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
 
-            h = layers.scale(
-                x=layers.elementwise_add(
-                    x=h_pre, y=x_t),
-                scale=self.py_rnn.scale)
+            h = layers.scale(x=layers.elementwise_add(x=h_pre, y=x_t),
+                             scale=self.py_rnn.scale)
 
             rnn.update_memory(h_pre, h)
             rnn.output(h)
@@ -199,8 +200,7 @@ def test_backward(self, rtol=0.01):
         for idx, name in enumerate(self.grad_data_field):
             self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape)
             self.assertTrue(
-                np.isclose(
-                    num_grad[idx], ana_grad[idx], rtol=rtol).all(),
+                np.isclose(num_grad[idx], ana_grad[idx], rtol=rtol).all(),
                 "num_grad (" + name + ") has diff at " + str(self.place) +
                 "\nExpect " + str(num_grad[idx]) + "\n" + "But Got" +
                 str(ana_grad[idx]) + " in class " + self.__class__.__name__)
@@ -265,14 +265,14 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype='float32', name='h_boot')
+        h_boot = layers.data(shape=[self.input_dim],
+                             dtype='float32',
+                             name='h_boot')
         h_boot.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -322,9 +322,10 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
     '''
 
     class PySimpleRNN3(PyRNNBase):
+
         def __init__(self, input_shape, output_shape):
-            super(RecurrentOpMultipleMemoryTest.PySimpleRNN3, self).__init__(
-                input_shape, output_shape)
+            super(RecurrentOpMultipleMemoryTest.PySimpleRNN3,
+                  self).__init__(input_shape, output_shape)
 
             seq_len, batch_size, input_dim = input_shape
             self.h_boot1 = np.random.normal(size=(batch_size,
@@ -366,23 +367,20 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
-        h_boot1 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot1',
-            append_batch_size=False)
+        h_boot1 = layers.data(shape=[self.batch_size, self.input_dim],
+                              dtype='float32',
+                              name='h_boot1',
+                              append_batch_size=False)
         h_boot1.stop_gradient = False
-        h_boot2 = layers.data(
-            shape=[self.batch_size, self.input_dim],
-            dtype='float32',
-            name='h_boot2',
-            append_batch_size=False)
+        h_boot2 = layers.data(shape=[self.batch_size, self.input_dim],
+                              dtype='float32',
+                              name='h_boot2',
+                              append_batch_size=False)
         h_boot2.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -417,9 +415,10 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
     '''
 
     class PySimpleRNN4(PyRNNBase):
+
         def __init__(self, input_shape, output_shape):
-            super(RecurrentOpNoMemBootTest.PySimpleRNN4, self).__init__(
-                input_shape, output_shape)
+            super(RecurrentOpNoMemBootTest.PySimpleRNN4,
+                  self).__init__(input_shape, output_shape)
             men_dim = input_shape
             self.mems = np.zeros(shape=men_dim).astype("float32")
 
@@ -443,18 +442,17 @@ def setUp(self):
 
         self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
-                                                            self.output_shape)
+        self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(
+            self.input_shape, self.output_shape)
 
         with fluid.program_guard(self.main_program, self.startup_program):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -490,25 +488,31 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1):
     '''
 
     class PySimpleRNN5(PyRNNBase):
+
         def __init__(self, input_shape, output_shape):
-            super(RecurrentOpSubBlockTest.PySimpleRNN5, self).__init__(
-                input_shape, output_shape)
+            super(RecurrentOpSubBlockTest.PySimpleRNN5,
+                  self).__init__(input_shape, output_shape)
 
             seq_len, batch_size, input_dim = input_shape
-            self.w1 = np.random.uniform(
-                -0.1, 0.1, size=(input_dim, input_dim)).astype("float32")
-            self.w2 = np.random.uniform(
-                -0.1, 0.1, size=(input_dim * 2, input_dim)).astype("float32")
-
-            self.emb = np.random.uniform(
-                -0.1, 0.1, size=(seq_len, batch_size,
-                                 input_dim)).astype("float32")
+            self.w1 = np.random.uniform(-0.1, 0.1,
+                                        size=(input_dim,
+                                              input_dim)).astype("float32")
+            self.w2 = np.random.uniform(-0.1,
+                                        0.1,
+                                        size=(input_dim * 2,
+                                              input_dim)).astype("float32")
+
+            self.emb = np.random.uniform(-0.1,
+                                         0.1,
+                                         size=(seq_len, batch_size,
+                                               input_dim)).astype("float32")
 
             men_dim = (seq_len, batch_size, input_dim)
             self.mems = np.zeros(shape=men_dim).astype("float32")
             self.oy = np.matmul(self.emb, self.w1)
 
         def step(self, step_id, x):
+
             def dot_attention(query, memory):
                 attn = np.matmul(query, memory.transpose((0, 2, 1)))
                 weight = softmax(attn)
@@ -544,19 +548,18 @@ def setUp(self):
 
         self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpSubBlockTest.PySimpleRNN5(self.input_shape,
-                                                           self.output_shape)
+        self.py_rnn = RecurrentOpSubBlockTest.PySimpleRNN5(
+            self.input_shape, self.output_shape)
 
         with fluid.program_guard(self.main_program, self.startup_program):
             rnn_out = self.create_rnn_op()
             self.output = layers.mean(rnn_out)
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype='float32',
-            name='x',
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype='float32',
+                        name='x',
+                        append_batch_size=False)
         x.stop_gradient = False
 
         emb = layers.data(
@@ -566,17 +569,15 @@ def create_rnn_op(self):
             append_batch_size=False)
         emb.stop_gradient = False
 
-        w1 = layers.data(
-            shape=[self.input_dim, self.input_dim],
-            dtype='float32',
-            name='w1',
-            append_batch_size=False)
+        w1 = layers.data(shape=[self.input_dim, self.input_dim],
+                         dtype='float32',
+                         name='w1',
+                         append_batch_size=False)
         w1.stop_gradient = False
-        w2 = layers.data(
-            shape=[self.input_dim * 2, self.input_dim],
-            dtype='float32',
-            name='w2',
-            append_batch_size=False)
+        w2 = layers.data(shape=[self.input_dim * 2, self.input_dim],
+                         dtype='float32',
+                         name='w2',
+                         append_batch_size=False)
         w2.stop_gradient = False
 
         rnn = layers.StaticRNN()
@@ -590,10 +591,9 @@ def dot_attention(query, memory):
 
         y = layers.matmul(emb, w1)
         with rnn.step():
-            pre_h = rnn.memory(
-                shape=(self.sent_len, self.input_dim),
-                batch_ref=x,
-                init_value=0.0)
+            pre_h = rnn.memory(shape=(self.sent_len, self.input_dim),
+                               batch_ref=x,
+                               init_value=0.0)
             step_in = rnn.step_input(x)
             concat_in = layers.concat([step_in, pre_h], 1)
             new_h = layers.matmul(concat_in, w2)
@@ -640,14 +640,14 @@ def setUp(self):
             self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
-        x = layers.data(
-            shape=[self.sent_len, self.batch_size, self.input_dim],
-            dtype="float32",
-            name="x",
-            append_batch_size=False)
+        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
+                        dtype="float32",
+                        name="x",
+                        append_batch_size=False)
         x.stop_gradient = False
-        h_boot = layers.data(
-            shape=[self.input_dim], dtype="float32", name="h_boot")
+        h_boot = layers.data(shape=[self.input_dim],
+                             dtype="float32",
+                             name="h_boot")
         h_boot.stop_gradient = True
 
         rnn = layers.StaticRNN()
diff --git a/python/paddle/fluid/tests/unittests/test_recv_save_op.py b/python/paddle/fluid/tests/unittests/test_recv_save_op.py
index 233cbf129f1f9..7e875ee84b814 100644
--- a/python/paddle/fluid/tests/unittests/test_recv_save_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recv_save_op.py
@@ -49,17 +49,20 @@ def run_pserver(pserver_id):
             param.set(param_array, place)
 
             optimize_block = program._create_block(program.global_block().idx)
-            program.global_block().append_op(
-                type="listen_and_serv",
-                inputs={'X': []},
-                outputs={},
-                attrs={
-                    "optimize_blocks": [optimize_block],
-                    "endpoint": '127.0.0.1:0',
-                    "Fanin": 1,
-                    "distributed_mode": DistributedMode.SYNC,
-                    "grad_to_block_id": []
-                })
+            program.global_block().append_op(type="listen_and_serv",
+                                             inputs={'X': []},
+                                             outputs={},
+                                             attrs={
+                                                 "optimize_blocks":
+                                                 [optimize_block],
+                                                 "endpoint":
+                                                 '127.0.0.1:0',
+                                                 "Fanin":
+                                                 1,
+                                                 "distributed_mode":
+                                                 DistributedMode.SYNC,
+                                                 "grad_to_block_id": []
+                                             })
 
             exe = fluid.Executor(place)
             exe.run(program)
@@ -67,6 +70,7 @@ def run_pserver(pserver_id):
 
 @unittest.skip("do not need currently")
 class TestListenAndServOp(unittest.TestCase):
+
     def setUp(self):
         self.ps_timeout = 5
 
@@ -103,16 +107,15 @@ def _run_nce_op_two_pserver(self, place, port0, port1, model_file):
                 emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
 
                 # create and run recv and save operator
-                remote_recv_op = Operator(
-                    "recv_save",
-                    trainer_id=0,
-                    shape=[10, 8],
-                    slice_shapes=["5,8", "5,8"],
-                    slice_varnames=["table", "table"],
-                    remote_varnames=['table', 'table'],
-                    is_sparse=False,
-                    endpoints=emaps,
-                    file_path=model_file)
+                remote_recv_op = Operator("recv_save",
+                                          trainer_id=0,
+                                          shape=[10, 8],
+                                          slice_shapes=["5,8", "5,8"],
+                                          slice_varnames=["table", "table"],
+                                          remote_varnames=['table', 'table'],
+                                          is_sparse=False,
+                                          endpoints=emaps,
+                                          file_path=model_file)
 
                 remote_recv_op.run(scope, place)
 
@@ -141,31 +144,28 @@ def _load_slice_var(self, model_file):
             dtype="float32",
             persistable=True)
 
-        load_block.append_op(
-            type='load',
-            inputs={},
-            outputs={'Out': [origin]},
-            attrs={'file_path': model_file})
-
-        load_block.append_op(
-            type='load',
-            inputs={},
-            outputs={'Out': [slice0]},
-            attrs={
-                'file_path': model_file,
-                'seek': 2 * 8,
-                'shape': slice0.shape
-            })
-
-        load_block.append_op(
-            type='load',
-            inputs={},
-            outputs={'Out': [slice1]},
-            attrs={
-                'file_path': model_file,
-                'seek': 5 * 8,
-                'shape': slice1.shape
-            })
+        load_block.append_op(type='load',
+                             inputs={},
+                             outputs={'Out': [origin]},
+                             attrs={'file_path': model_file})
+
+        load_block.append_op(type='load',
+                             inputs={},
+                             outputs={'Out': [slice0]},
+                             attrs={
+                                 'file_path': model_file,
+                                 'seek': 2 * 8,
+                                 'shape': slice0.shape
+                             })
+
+        load_block.append_op(type='load',
+                             inputs={},
+                             outputs={'Out': [slice1]},
+                             attrs={
+                                 'file_path': model_file,
+                                 'seek': 5 * 8,
+                                 'shape': slice1.shape
+                             })
 
         exe = fluid.Executor(place=fluid.CPUPlace())
         exe.run(load_prog)
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 01d386724d161..d6fabb44b4fe2 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -25,6 +25,7 @@
 
 
 class TestSumOp(OpTest):
+
     def setUp(self):
         self.python_api = paddle.sum
         self.op_type = "reduce_sum"
@@ -40,6 +41,7 @@ def test_check_grad(self):
 
 
 class TestSumOp_fp16(OpTest):
+
     def setUp(self):
         self.python_api = paddle.sum
         self.op_type = "reduce_sum"
@@ -61,13 +63,16 @@ def calc_gradient(self):
         return grad,
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', user_defined_grads=self.gradient, check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=self.gradient,
+                        check_eager=True)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSumOp_bf16(OpTest):
+
     def setUp(self):
         np.random.seed(100)
         self.python_api = paddle.sum
@@ -88,11 +93,10 @@ def test_check_output(self):
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'],
-            'Out',
-            user_defined_grads=self.gradient,
-            check_eager=True)
+        self.check_grad_with_place(place, ['X'],
+                                   'Out',
+                                   user_defined_grads=self.gradient,
+                                   check_eager=True)
 
     def calc_gradient(self):
         x = self.x
@@ -101,6 +105,7 @@ def calc_gradient(self):
 
 
 class TestSumOp_fp16_withInt(OpTest):
+
     def setUp(self):
         self.python_api = paddle.sum
         self.op_type = "reduce_sum"
@@ -124,11 +129,14 @@ def calc_gradient(self):
         return grad,
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', user_defined_grads=self.gradient, check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        user_defined_grads=self.gradient,
+                        check_eager=True)
 
 
 class TestSumOp5D(OpTest):
+
     def setUp(self):
         self.python_api = paddle.sum
         self.op_type = "reduce_sum"
@@ -146,6 +154,7 @@ def test_check_grad(self):
 
 
 class TestSumOp6D(OpTest):
+
     def setUp(self):
         self.python_api = paddle.sum
         self.op_type = "reduce_sum"
@@ -163,6 +172,7 @@ def test_check_grad(self):
 
 
 class TestSumOp8D(OpTest):
+
     def setUp(self):
         self.python_api = paddle.sum
         self.op_type = "reduce_sum"
@@ -258,6 +268,7 @@ def raw_reduce_prod(x, dim=[0], keep_dim=False):
 
 
 class TestProdOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.python_api = raw_reduce_prod
@@ -277,6 +288,7 @@ def test_check_grad(self):
 
 
 class TestProd6DOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.python_api = raw_reduce_prod
@@ -301,6 +313,7 @@ def test_check_grad(self):
 
 
 class TestProd8DOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_prod"
         self.python_api = raw_reduce_prod
@@ -326,6 +339,7 @@ def test_check_grad(self):
 
 
 class TestAllOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_all"
         self.python_api = paddle.all
@@ -338,6 +352,7 @@ def test_check_output(self):
 
 
 class TestAll8DOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_all"
         self.python_api = paddle.all
@@ -353,6 +368,7 @@ def test_check_output(self):
 
 
 class TestAllOpWithDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_all"
         self.python_api = paddle.all
@@ -365,6 +381,7 @@ def test_check_output(self):
 
 
 class TestAll8DOpWithDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_all"
         self.python_api = paddle.all
@@ -380,14 +397,14 @@ def test_check_output(self):
 
 
 class TestAllOpWithKeepDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_all"
         self.python_api = paddle.all
         self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
         self.attrs = {'dim': [1], 'keep_dim': True}
         self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].all(axis=1), axis=1)
+            'Out': np.expand_dims(self.inputs['X'].all(axis=1), axis=1)
         }
 
     def test_check_output(self):
@@ -395,6 +412,7 @@ def test_check_output(self):
 
 
 class TestAll8DOpWithKeepDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_all"
         self.python_api = paddle.all
@@ -404,8 +422,8 @@ def setUp(self):
         }
         self.attrs = {'dim': (5, ), 'keep_dim': True}
         self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].all(axis=self.attrs['dim']), axis=5)
+            'Out':
+            np.expand_dims(self.inputs['X'].all(axis=self.attrs['dim']), axis=5)
         }
 
     def test_check_output(self):
@@ -413,18 +431,21 @@ def test_check_output(self):
 
 
 class TestAllOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of reduce_all_op must be Variable.
             input1 = 12
             self.assertRaises(TypeError, fluid.layers.reduce_all, input1)
             # The input dtype of reduce_all_op must be bool.
-            input2 = fluid.layers.data(
-                name='input2', shape=[12, 10], dtype="int32")
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[12, 10],
+                                       dtype="int32")
             self.assertRaises(TypeError, fluid.layers.reduce_all, input2)
 
 
 class TestAnyOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_any"
         self.python_api = paddle.any
@@ -437,6 +458,7 @@ def test_check_output(self):
 
 
 class TestAny8DOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_any"
         self.python_api = paddle.any
@@ -452,6 +474,7 @@ def test_check_output(self):
 
 
 class TestAnyOpWithDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_any"
         self.python_api = paddle.any
@@ -464,6 +487,7 @@ def test_check_output(self):
 
 
 class TestAny8DOpWithDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_any"
         self.python_api = paddle.any
@@ -479,14 +503,15 @@ def test_check_output(self):
 
 
 class TestAnyOpWithKeepDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_any"
         self.python_api = paddle.any
         self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
         self.attrs = {'dim': (1, ), 'keep_dim': True}
         self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+            'Out':
+            np.expand_dims(self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
         }
 
     def test_check_output(self):
@@ -494,6 +519,7 @@ def test_check_output(self):
 
 
 class TestAny8DOpWithKeepDim(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_any"
         self.python_api = paddle.any
@@ -503,8 +529,8 @@ def setUp(self):
         }
         self.attrs = {'dim': (1, ), 'keep_dim': True}
         self.outputs = {
-            'Out': np.expand_dims(
-                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+            'Out':
+            np.expand_dims(self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
         }
 
     def test_check_output(self):
@@ -512,18 +538,21 @@ def test_check_output(self):
 
 
 class TestAnyOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of reduce_any_op must be Variable.
             input1 = 12
             self.assertRaises(TypeError, fluid.layers.reduce_any, input1)
             # The input dtype of reduce_any_op must be bool.
-            input2 = fluid.layers.data(
-                name='input2', shape=[12, 10], dtype="int32")
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[12, 10],
+                                       dtype="int32")
             self.assertRaises(TypeError, fluid.layers.reduce_any, input2)
 
 
 class Test1DReduce(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random(120).astype("float64")}
@@ -537,6 +566,7 @@ def test_check_grad(self):
 
 
 class Test2DReduce0(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': [0]}
@@ -545,6 +575,7 @@ def setUp(self):
 
 
 class Test2DReduce1(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': [1]}
@@ -555,6 +586,7 @@ def setUp(self):
 
 
 class Test3DReduce0(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': [1]}
@@ -565,6 +597,7 @@ def setUp(self):
 
 
 class Test3DReduce1(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': [2]}
@@ -575,6 +608,7 @@ def setUp(self):
 
 
 class Test3DReduce2(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': [-2]}
@@ -585,6 +619,7 @@ def setUp(self):
 
 
 class Test3DReduce3(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': [1, 2]}
@@ -595,6 +630,7 @@ def setUp(self):
 
 
 class Test8DReduce0(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': (4, 2, 3)}
@@ -607,17 +643,20 @@ def setUp(self):
 
 
 class TestKeepDimReduce(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
         self.attrs = {'dim': [1], 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                 keepdims=self.attrs['keep_dim'])
         }
 
 
 class TestKeepDim8DReduce(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {
@@ -625,8 +664,9 @@ def setUp(self):
         }
         self.attrs = {'dim': (3, 4, 5), 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                 keepdims=self.attrs['keep_dim'])
         }
 
 
@@ -669,6 +709,7 @@ def test_check_output(self):
 
 
 class TestKeepDimReduceSumMultiAxises(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
@@ -686,13 +727,14 @@ def test_check_grad(self):
 
 
 class TestReduceSumWithDimOne(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((100, 1, 1)).astype("float64")}
         self.attrs = {'dim': [1, 2], 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=True)
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
         }
 
     def test_check_output(self):
@@ -703,13 +745,14 @@ def test_check_grad(self):
 
 
 class TestReduceSumWithNumelOne(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((100, 1)).astype("float64")}
         self.attrs = {'dim': [1], 'keep_dim': False}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=False)
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=False)
         }
 
     def test_check_output(self):
@@ -720,6 +763,7 @@ def test_check_grad(self):
 
 
 class TestReduceAll(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((100, 1, 1)).astype("float64")}
@@ -734,6 +778,7 @@ def test_check_grad(self):
 
 
 class Test1DReduceWithAxes1(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random(100).astype("float64")}
@@ -748,14 +793,17 @@ def test_check_grad(self):
 
 
 class TestReduceWithDtype(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((6, 2, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum().astype('float64')}
         self.attrs = {'reduce_all': True}
         self.attrs.update({
-            'in_dtype': int(convert_np_dtype_to_dtype_(np.float32)),
-            'out_dtype': int(convert_np_dtype_to_dtype_(np.float64))
+            'in_dtype':
+            int(convert_np_dtype_to_dtype_(np.float32)),
+            'out_dtype':
+            int(convert_np_dtype_to_dtype_(np.float64))
         })
 
     def test_check_output(self):
@@ -766,35 +814,42 @@ def test_check_grad(self):
 
 
 class TestReduceWithDtype1(TestReduceWithDtype):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((6, 2, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum(axis=1)}
         self.attrs = {'dim': [1]}
         self.attrs.update({
-            'in_dtype': int(convert_np_dtype_to_dtype_(np.float32)),
-            'out_dtype': int(convert_np_dtype_to_dtype_(np.float64))
+            'in_dtype':
+            int(convert_np_dtype_to_dtype_(np.float32)),
+            'out_dtype':
+            int(convert_np_dtype_to_dtype_(np.float64))
         })
 
 
 class TestReduceWithDtype2(TestReduceWithDtype):
+
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((6, 2, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum(axis=1, keepdims=True)}
         self.attrs = {'dim': [1], 'keep_dim': True}
         self.attrs.update({
-            'in_dtype': int(convert_np_dtype_to_dtype_(np.float32)),
-            'out_dtype': int(convert_np_dtype_to_dtype_(np.float64))
+            'in_dtype':
+            int(convert_np_dtype_to_dtype_(np.float32)),
+            'out_dtype':
+            int(convert_np_dtype_to_dtype_(np.float64))
         })
 
 
 class TestReduceSumOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of reduce_sum_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.reduce_sum, x1)
             # The input dtype of reduce_sum_op  must be float32 or float64 or int32 or int64.
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
@@ -802,6 +857,7 @@ def test_errors(self):
 
 
 class API_TestSumOp(unittest.TestCase):
+
     def run_static(self,
                    shape,
                    x_dtype,
@@ -827,8 +883,9 @@ def run_static(self,
                                fetch_list=[result_sum])
 
             self.assertTrue(
-                np.allclose(
-                    res, np.sum(input_data.astype(attr_dtype), axis=np_axis)))
+                np.allclose(res,
+                            np.sum(input_data.astype(attr_dtype),
+                                   axis=np_axis)))
 
     def test_static(self):
         shape = [10, 10]
@@ -859,8 +916,10 @@ def test_static(self):
 
         shape = [5, 5, 5]
         self.run_static(shape, "int32", (0, 1), attr_dtype="int32")
-        self.run_static(
-            shape, "int32", (), attr_dtype="int32", np_axis=(0, 1, 2))
+        self.run_static(shape,
+                        "int32", (),
+                        attr_dtype="int32",
+                        np_axis=(0, 1, 2))
 
     def test_dygraph(self):
         np_x = np.random.random([2, 3, 4]).astype('int32')
@@ -878,6 +937,7 @@ def test_dygraph(self):
 
 
 class TestAllAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         paddle.enable_static()
@@ -933,6 +993,7 @@ def test_dygraph(self):
 
 
 class TestAnyAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_reducescatter.py b/python/paddle/fluid/tests/unittests/test_reducescatter.py
index 7c355d46285c5..c340157c13264 100644
--- a/python/paddle/fluid/tests/unittests/test_reducescatter.py
+++ b/python/paddle/fluid/tests/unittests/test_reducescatter.py
@@ -23,6 +23,7 @@
 
 
 class TestReduceScatterOp(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_reducescatter_api.py b/python/paddle/fluid/tests/unittests/test_reducescatter_api.py
index 5a494b5529efb..b84943a022352 100644
--- a/python/paddle/fluid/tests/unittests/test_reducescatter_api.py
+++ b/python/paddle/fluid/tests/unittests/test_reducescatter_api.py
@@ -24,6 +24,7 @@
 
 
 class TestReduceScatterAPI(TestDistBase):
+
     def _setup_config(self):
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py
index 39cf64465ab1e..e9f847185fc76 100644
--- a/python/paddle/fluid/tests/unittests/test_registry.py
+++ b/python/paddle/fluid/tests/unittests/test_registry.py
@@ -21,6 +21,7 @@
 
 
 class TestRegistry(unittest.TestCase):
+
     @prog_scope()
     def test_registry_layer(self):
         x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py
index 08a70fe1852d0..304e47da9a61a 100644
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
@@ -29,6 +29,7 @@
 
 
 class TestL2DecayRegularizer(unittest.TestCase):
+
     def test_l2decay_regularizer(self):
         paddle.enable_static()
         program = framework.Program()
@@ -42,20 +43,28 @@ def test_l2decay_regularizer(self):
         self.assertTrue(mul_x.regularizer is not None)
         self.assertTrue(
             isinstance(mul_x.regularizer, regularizer.L2DecayRegularizer))
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
@@ -68,6 +77,7 @@ def test_l2decay_regularizer(self):
 
 
 class TestL1DecayRegularizer(unittest.TestCase):
+
     def test_l2decay_regularizer(self):
         paddle.enable_static()
         program = framework.Program()
@@ -81,20 +91,28 @@ def test_l2decay_regularizer(self):
         self.assertTrue(mul_x.regularizer is not None)
         self.assertTrue(
             isinstance(mul_x.regularizer, regularizer.L1DecayRegularizer))
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        mul_y = block.create_var(dtype="float32",
+                                 shape=[10, 8],
+                                 lod_level=0,
+                                 name="mul.y")
+        mul_out = block.create_var(dtype="float32",
+                                   shape=[5, 8],
+                                   lod_level=0,
+                                   name="mul.out")
+        block.append_op(type="mul",
+                        inputs={
+                            "X": mul_x,
+                            "Y": mul_y
+                        },
+                        outputs={"Out": mul_out},
+                        attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(dtype="float32",
+                                    shape=[1],
+                                    lod_level=0,
+                                    name="mean.out")
+        block.append_op(type="mean",
+                        inputs={"X": mul_out},
+                        outputs={"Out": mean_out})
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
@@ -120,8 +138,9 @@ def bow_net(data,
     This model is from https://github.com/PaddlePaddle/models:
     fluid/PaddleNLP/text_classification/nets.py
     """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    emb = fluid.layers.embedding(input=data,
+                                 is_sparse=is_sparse,
+                                 size=[dict_dim, emb_dim])
     bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
     bow_tanh = fluid.layers.tanh(bow)
     fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
@@ -133,6 +152,7 @@ def bow_net(data,
 
 
 class TestRegularizer(unittest.TestCase):
+
     def setUp(self):
         self.word_len = 1500
         self.train_data = [[(random.sample(range(1000), 10), [0])]
@@ -176,10 +196,12 @@ def check_l2decay_regularizer(self, place, model):
         paddle.framework.random._manual_program_seed(1)
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
-        with self.scope_prog_guard(
-                main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+        with self.scope_prog_guard(main_prog=main_prog,
+                                   startup_prog=startup_prog):
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
             avg_cost = model(data, label, self.word_len)
@@ -197,10 +219,12 @@ def check_l2decay(self, place, model):
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
 
-        with self.scope_prog_guard(
-                main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+        with self.scope_prog_guard(main_prog=main_prog,
+                                   startup_prog=startup_prog):
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
             avg_cost_l2 = model(data, label, self.word_len)
@@ -231,10 +255,9 @@ def test_l2(self):
 
             assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
             for i in range(len(dense_sparse_p_sum[0])):
-                assert np.isclose(
-                    a=dense_sparse_p_sum[0][i],
-                    b=dense_sparse_p_sum[1][i],
-                    rtol=5e-5)
+                assert np.isclose(a=dense_sparse_p_sum[0][i],
+                                  b=dense_sparse_p_sum[1][i],
+                                  rtol=5e-5)
 
     def test_repeated_regularization(self):
         l1 = fluid.regularizer.L1Decay(regularization_coeff=0.1)
@@ -252,10 +275,14 @@ def test_repeated_regularization(self):
             paddle.seed(1)
             paddle.framework.random._manual_program_seed(1)
 
-            linear1 = fluid.dygraph.Linear(
-                2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr)
-            linear2 = fluid.dygraph.Linear(
-                2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr)
+            linear1 = fluid.dygraph.Linear(2,
+                                           2,
+                                           param_attr=fc_param_attr,
+                                           bias_attr=fc_param_attr)
+            linear2 = fluid.dygraph.Linear(2,
+                                           2,
+                                           param_attr=fc_param_attr,
+                                           bias_attr=fc_param_attr)
 
             loss1 = linear1(input)
             loss1.backward()
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py
index afa2441aac226..da2643cc64726 100644
--- a/python/paddle/fluid/tests/unittests/test_regularizer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py
@@ -41,8 +41,9 @@ def bow_net(data,
     This model is from https://github.com/PaddlePaddle/models:
     fluid/PaddleNLP/text_classification/nets.py
     """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    emb = fluid.layers.embedding(input=data,
+                                 is_sparse=is_sparse,
+                                 size=[dict_dim, emb_dim])
     bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
     bow_tanh = fluid.layers.tanh(bow)
     fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
@@ -55,6 +56,7 @@ def bow_net(data,
 
 
 class TestRegularizer(unittest.TestCase):
+
     def setUp(self):
         self.word_len = 1500
         self.train_data = [[(random.sample(range(1000), 10), [0])]
@@ -98,10 +100,12 @@ def check_l2decay_regularizer(self, place, model):
         paddle.framework.random._manual_program_seed(1)
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
-        with self.scope_prog_guard(
-                main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+        with self.scope_prog_guard(main_prog=main_prog,
+                                   startup_prog=startup_prog):
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
             avg_cost = model(data, label, self.word_len)
@@ -119,10 +123,12 @@ def check_l2decay(self, place, model):
         main_prog = fluid.framework.Program()
         startup_prog = fluid.framework.Program()
 
-        with self.scope_prog_guard(
-                main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+        with self.scope_prog_guard(main_prog=main_prog,
+                                   startup_prog=startup_prog):
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
             avg_cost_l2 = model(data, label, self.word_len)
@@ -154,10 +160,9 @@ def test_l2(self):
 
             assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
             for i in range(len(dense_sparse_p_sum[0])):
-                assert np.isclose(
-                    a=dense_sparse_p_sum[0][i],
-                    b=dense_sparse_p_sum[1][i],
-                    rtol=5e-5)
+                assert np.isclose(a=dense_sparse_p_sum[0][i],
+                                  b=dense_sparse_p_sum[1][i],
+                                  rtol=5e-5)
 
     def test_repeated_regularization(self):
         paddle.enable_static()
@@ -176,10 +181,14 @@ def test_repeated_regularization(self):
             paddle.seed(1)
             paddle.framework.random._manual_program_seed(1)
 
-            linear1 = fluid.dygraph.Linear(
-                2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr)
-            linear2 = fluid.dygraph.Linear(
-                2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr)
+            linear1 = fluid.dygraph.Linear(2,
+                                           2,
+                                           param_attr=fc_param_attr,
+                                           bias_attr=fc_param_attr)
+            linear2 = fluid.dygraph.Linear(2,
+                                           2,
+                                           param_attr=fc_param_attr,
+                                           bias_attr=fc_param_attr)
 
             loss1 = linear1(input)
             loss1.backward()
diff --git a/python/paddle/fluid/tests/unittests/test_renorm_op.py b/python/paddle/fluid/tests/unittests/test_renorm_op.py
index e00a892cf7197..e266800319db1 100644
--- a/python/paddle/fluid/tests/unittests/test_renorm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_renorm_op.py
@@ -25,9 +25,10 @@
 
 
 class TestRenormAPI(unittest.TestCase):
+
     def input_data(self):
-        self.data_x = np.array(
-            [[[2.0, 2, -2], [3, 0.3, 3]], [[2, -8, 2], [3.1, 3.7, 3]]])
+        self.data_x = np.array([[[2.0, 2, -2], [3, 0.3, 3]],
+                                [[2, -8, 2], [3.1, 3.7, 3]]])
         self.p = 1.0
         self.dim = 2
         self.max_norm = 2.05
@@ -65,9 +66,10 @@ def test_dygraph_api(self):
             self.assertTrue(np.allclose(expected, np.array(y)))
             z = paddle.mean(y)
             z.backward(retain_graph=True)
-            expected_grad = np.array(
-                [[[0, 0.01394558, 0.02733333], [0, 0.01394558, 0.00683333]],
-                 [[0, 0.01045918, 0.00683333], [0, 0.01394558, 0.00683333]]])
+            expected_grad = np.array([[[0, 0.01394558, 0.02733333],
+                                       [0, 0.01394558, 0.00683333]],
+                                      [[0, 0.01045918, 0.00683333],
+                                       [0, 0.01394558, 0.00683333]]])
             self.assertTrue(np.allclose(expected_grad, np.array(x.grad)))
         #test exception:
         with fluid.dygraph.guard():
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
index d67b80882655f..1f6fb37e1e0ca 100644
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
@@ -42,14 +42,14 @@ def setUpClass(cls):
 
     @classmethod
     def set_program(cls):
-        dat = fluid.layers.data(
-            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
+        dat = fluid.layers.data(name=cls.data_desc[0][0],
+                                shape=cls.data_desc[0][1])
         dat.stop_gradient = False
-        rank_dat = fluid.layers.data(
-            name=cls.data_desc[1][0], shape=cls.data_desc[1][1])
+        rank_dat = fluid.layers.data(name=cls.data_desc[1][0],
+                                     shape=cls.data_desc[1][1])
         table = lod_rank_table(rank_dat)
-        new_dat = fluid.layers.reorder_lod_tensor_by_rank(
-            x=dat, rank_table=table)
+        new_dat = fluid.layers.reorder_lod_tensor_by_rank(x=dat,
+                                                          rank_table=table)
         loss = fluid.layers.reduce_sum(new_dat)
         fluid.backward.append_backward(loss=loss)
         cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD']
@@ -86,8 +86,8 @@ def set_data(self):
                     size=self.num_seq if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[sum(data_lod[-1]) if data_lod else self.num_seq
-                      ] + data_shape).astype('float32')
+                size=[sum(data_lod[-1]) if data_lod else self.num_seq] +
+                data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
     def set_inputs(self, place):
@@ -106,8 +106,8 @@ def reorder(self):
         rank_table = []  # list of (index, length)
         for i in range(len(ref_lod)):
             rank_table.append((i, ref_lod[i]))
-        rank_table = sorted(
-            rank_table, key=functools.cmp_to_key(lambda x, y: y[1] - x[1]))
+        rank_table = sorted(rank_table,
+                            key=functools.cmp_to_key(lambda x, y: y[1] - x[1]))
 
         # compute the input sequence info according to input_lod
         input_value, input_lod = self.data[self.data_desc[0][0]]
@@ -122,8 +122,8 @@ def reorder(self):
                 for lod_level_i in offset_lod[level:]:
                     sub_lod_i = []
                     for idx in range(start_idx, end_idx):
-                        sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[
-                            idx])
+                        sub_lod_i.append(lod_level_i[idx + 1] -
+                                         lod_level_i[idx])
                     sub_lod.append(sub_lod_i)
                     start_idx = lod_level_i[start_idx]
                     end_idx = lod_level_i[end_idx]
@@ -158,8 +158,9 @@ def test_reorder_lod_tensor(self):
         expect_output, expect_output_lod = self.reorder()
         for actual_output in self.actual_outputs:
             self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_output), expect_output, atol=0.001))
+                numpy.allclose(numpy.array(actual_output),
+                               expect_output,
+                               atol=0.001))
             self.assertEqual(expect_output_lod,
                              actual_output.recursive_sequence_lengths())
         # check gradient
@@ -167,8 +168,9 @@ def test_reorder_lod_tensor(self):
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
         for actual_grad in self.actual_grads:
             self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_grad), expect_grad, atol=0.001))
+                numpy.allclose(numpy.array(actual_grad),
+                               expect_grad,
+                               atol=0.001))
             self.assertEqual(expect_grad_lod,
                              actual_grad.recursive_sequence_lengths())
 
@@ -180,8 +182,9 @@ def test_reorder_tensor(self):
         expect_output, expect_output_lod = self.reorder()
         for actual_output in self.actual_outputs:
             self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_output), expect_output, atol=0.001))
+                numpy.allclose(numpy.array(actual_output),
+                               expect_output,
+                               atol=0.001))
             self.assertEqual(expect_output_lod,
                              actual_output.recursive_sequence_lengths())
         # check gradient
@@ -189,8 +192,9 @@ def test_reorder_tensor(self):
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
         for actual_grad in self.actual_grads:
             self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_grad), expect_grad, atol=0.001))
+                numpy.allclose(numpy.array(actual_grad),
+                               expect_grad,
+                               atol=0.001))
             self.assertEqual(expect_grad_lod,
                              actual_grad.recursive_sequence_lengths())
 
@@ -206,19 +210,21 @@ def test_reorder_tensor(self):
         self.run_program()
         for actual_output in self.actual_outputs:
             self.assertTrue(
-                numpy.allclose(
-                    numpy.array(actual_output), expect_output, atol=0.001))
+                numpy.allclose(numpy.array(actual_output),
+                               expect_output,
+                               atol=0.001))
 
 
 class TestReorderLoDTensorError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program()):
 
             def test_Variable():
                 # The input must be Variable.
                 x1 = numpy.array([0.9383, 0.1983, 3.2, 1.2]).astype("float64")
-                table1 = numpy.array(
-                    [0.9383, 0.1983, 3.2, 1.2]).astype("float64")
+                table1 = numpy.array([0.9383, 0.1983, 3.2,
+                                      1.2]).astype("float64")
                 new_dat = fluid.layers.reorder_lod_tensor_by_rank(
                     x=x1, rank_table=table1)
 
@@ -226,8 +232,9 @@ def test_Variable():
 
             def test_type():
                 x2 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
-                table2 = fluid.layers.data(
-                    name='table2', shape=[4], dtype='int32')
+                table2 = fluid.layers.data(name='table2',
+                                           shape=[4],
+                                           dtype='int32')
                 new_dat2 = fluid.layers.reorder_lod_tensor_by_rank(
                     x=x2, rank_table=table2)
 
diff --git a/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py b/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py
index b047b0c53d8f8..7abc758617cc9 100644
--- a/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py
+++ b/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py
@@ -24,6 +24,7 @@
 
 
 class TestRepeatInterleaveOp(OpTest):
+
     def setUp(self):
         self.op_type = "repeat_interleave"
         self.init_dtype_type()
@@ -63,6 +64,7 @@ def test_check_grad_normal(self):
 
 
 class TestRepeatInterleaveOp2(OpTest):
+
     def setUp(self):
         self.op_type = "repeat_interleave"
         self.init_dtype_type()
@@ -100,6 +102,7 @@ def test_check_grad_normal(self):
 
 
 class TestIndexSelectAPI(unittest.TestCase):
+
     def input_data(self):
         self.data_x = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
                                 [9.0, 10.0, 11.0, 12.0]])
@@ -112,15 +115,16 @@ def test_repeat_interleave_api(self):
         # case 1:
         with program_guard(Program(), Program()):
             x = fluid.layers.data(name='x', shape=[-1, 4])
-            index = fluid.layers.data(
-                name='repeats',
-                shape=[4],
-                dtype='int32',
-                append_batch_size=False)
+            index = fluid.layers.data(name='repeats',
+                                      shape=[4],
+                                      dtype='int32',
+                                      append_batch_size=False)
             z = paddle.repeat_interleave(x, index, axis=1)
             exe = fluid.Executor(fluid.CPUPlace())
-            res, = exe.run(feed={'x': self.data_x,
-                                 'repeats': self.data_index},
+            res, = exe.run(feed={
+                'x': self.data_x,
+                'repeats': self.data_index
+            },
                            fetch_list=[z.name],
                            return_numpy=False)
         expect_out = np.repeat(self.data_x, self.data_index, axis=1)
@@ -130,11 +134,10 @@ def test_repeat_interleave_api(self):
         repeats = np.array([1, 2, 1]).astype('int32')
         with program_guard(Program(), Program()):
             x = fluid.layers.data(name='x', shape=[-1, 4])
-            index = fluid.layers.data(
-                name='repeats',
-                shape=[3],
-                dtype='int32',
-                append_batch_size=False)
+            index = fluid.layers.data(name='repeats',
+                                      shape=[3],
+                                      dtype='int32',
+                                      append_batch_size=False)
             z = paddle.repeat_interleave(x, index, axis=0)
             exe = fluid.Executor(fluid.CPUPlace())
             res, = exe.run(feed={
diff --git a/python/paddle/fluid/tests/unittests/test_require_version.py b/python/paddle/fluid/tests/unittests/test_require_version.py
index d1cb0aa4d8164..8e9ea51c73302 100644
--- a/python/paddle/fluid/tests/unittests/test_require_version.py
+++ b/python/paddle/fluid/tests/unittests/test_require_version.py
@@ -22,6 +22,7 @@
 
 
 class VersionTest(unittest.TestCase):
+
     def test_check_output(self):
         warnings.warn(
             "paddle.__version__: %s, fluid_version.full_version: %s, fluid_version.major: %s, fluid_version.minor: %s, fluid_version.patch: %s, fluid_version.rc: %s."
@@ -67,6 +68,7 @@ def test_check_output(self):
 
 # Test Errors
 class TestErrors(unittest.TestCase):
+
     def test_errors(self):
         # The type of params must be str.
         def test_input_type():
diff --git a/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py b/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py
index 84e22024f76cf..839b0e331a89b 100644
--- a/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py
+++ b/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,11 +18,13 @@
 from paddle.fluid import framework
 from paddle.fluid.framework import _test_eager_guard
 import unittest
+
 paddle.set_device('cpu')
 
 
 # Test 1
 def clear_grad_test_0(w, a):
+
     @paddle.no_grad()
     def warp(*_):
         assert w.grad is not None
@@ -33,6 +35,7 @@ def warp(*_):
 
 
 class TestInplaceAndClearGradient(unittest.TestCase):
+
     def func_test(self):
         input_data = np.ones([1, 1])
         w = paddle.to_tensor(input_data, 'float32', stop_gradient=False)
@@ -54,12 +57,14 @@ def test(self):
 
 # Test 2
 class Counter:
+
     def __init__(self):
         self.num_calls = 0
         self.step = 0
 
 
 def clear_grad_test_1(w, c):
+
     @paddle.no_grad()
     def warp(*_):
         assert w.grad is not None
@@ -73,6 +78,7 @@ def warp(*_):
 
 
 class TestInplaceClearGradAccumulation(unittest.TestCase):
+
     def func_test(self):
         input_data = np.ones([1, 1])
         w = paddle.to_tensor(input_data, 'float32', stop_gradient=False)
@@ -100,6 +106,7 @@ def test(self):
 
 
 class TestInplaceClearGradAccumulationAlt(unittest.TestCase):
+
     def func_test(self):
         input_data = np.ones([1, 1])
         w = paddle.to_tensor(input_data, 'float32', stop_gradient=False)
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 40481b097827c..d4d89177653ff 100755
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -27,6 +27,7 @@
 
 # situation 1: have shape( list, no tensor), no actual shape(Tensor)
 class TestReshapeOp(OpTest):
+
     def setUp(self):
         self.init_data()
         self.op_type = "reshape2"
@@ -50,6 +51,7 @@ def test_check_grad(self):
 
 
 class TestReshapeBF16Op(OpTest):
+
     def setUp(self):
         self.init_data()
         self.op_type = "reshape2"
@@ -59,8 +61,10 @@ def setUp(self):
         self.inputs = {"X": convert_float_to_uint16(x)}
         self.attrs = {"shape": self.new_shape}
         self.outputs = {
-            "Out": convert_float_to_uint16(out),
-            'XShape': convert_float_to_uint16(
+            "Out":
+            convert_float_to_uint16(out),
+            'XShape':
+            convert_float_to_uint16(
                 np.random.random(self.ori_shape).astype("float32"))
         }
 
@@ -77,6 +81,7 @@ def test_check_grad(self):
 
 
 class TestReshapeOpDimInfer1(TestReshapeOp):
+
     def init_data(self):
         self.ori_shape = (5, 25)
         self.new_shape = (5, -1, 5)
@@ -84,6 +89,7 @@ def init_data(self):
 
 
 class TestReshapeOpDimInfer2(TestReshapeOp):
+
     def init_data(self):
         self.ori_shape = (10, 2, 6)
         self.new_shape = (10, 0, 3, -1)
@@ -92,14 +98,14 @@ def init_data(self):
 
 # situation 2: have shape(list, no tensor), have actual shape(Tensor)
 class TestReshapeOpWithInputShape(OpTest):
+
     def setUp(self):
         self.init_data()
         self.op_type = "reshape2"
 
         self.inputs = {
             "X": np.random.random(self.ori_shape).astype("float32"),
-            "Shape": np.array(
-                self.actual_shape, dtype="int32")
+            "Shape": np.array(self.actual_shape, dtype="int32")
         }
         self.attrs = {"shape": self.new_shape}
         self.outputs = {
@@ -121,6 +127,7 @@ def test_check_grad(self):
 
 # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
 class TestReshapeOp_attr_ShapeTensor(OpTest):
+
     def setUp(self):
         self.init_data()
         self.op_type = "reshape2"
@@ -154,6 +161,7 @@ def test_check_grad(self):
 
 
 class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor):
+
     def init_data(self):
         self.ori_shape = (5, 20)
         self.new_shape = (5, -1, 20)
@@ -162,6 +170,7 @@ def init_data(self):
 
 
 class TestReshapeOpDimInfer2_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor):
+
     def init_data(self):
         self.ori_shape = (10, 2, 6)
         self.new_shape = (10, 0, 3, -1)
@@ -171,14 +180,14 @@ def init_data(self):
 
 # Situation 4: have shape(Tensor), no actual shape(Tensor)
 class TestReshapeOp_attr_OnlyShape(OpTest):
+
     def setUp(self):
         self.init_data()
         self.op_type = "reshape2"
 
         self.inputs = {
             "X": np.random.random(self.ori_shape).astype("float32"),
-            "Shape": np.array(
-                self.new_shape, dtype="int32")
+            "Shape": np.array(self.new_shape, dtype="int32")
         }
         self.attrs = {}
         self.outputs = {
@@ -199,6 +208,7 @@ def test_check_grad(self):
 
 
 class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
+
     def init_data(self):
         self.ori_shape = (5, 20)
         self.new_shape = (5, -1, 10)
@@ -207,6 +217,7 @@ def init_data(self):
 
 
 class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
+
     def init_data(self):
         self.ori_shape = (10, 2, 6)
         self.new_shape = (10, 0, 3, -1)
@@ -216,6 +227,7 @@ def init_data(self):
 
 # test int8 data type on CPU
 class TestReshapeInt8Op(OpTest):
+
     def setUp(self):
         self.init_dtype()
         self.init_data()
@@ -242,8 +254,9 @@ def init_data(self):
         self.infered_shape = (10, 2, 3, -1)
 
     def test_check_output(self):
-        self.check_output_with_place(
-            fluid.core.CPUPlace(), atol=1e-5, no_check_set=['XShape'])
+        self.check_output_with_place(fluid.core.CPUPlace(),
+                                     atol=1e-5,
+                                     no_check_set=['XShape'])
 
     def test_check_grad(self):
         pass
@@ -251,17 +264,18 @@ def test_check_grad(self):
 
 # test unt8 data type on CPU
 class TestReshapeUint8Op(TestReshapeInt8Op):
+
     def init_dtype(self):
         self.dtype = np.uint8
 
 
 class TestReshapeOpBool(TestReshapeOp):
+
     def setUp(self):
         self.init_data()
         self.op_type = "reshape2"
         self.inputs = {
-            "X": np.random.choice(
-                [True, False], size=self.ori_shape)
+            "X": np.random.choice([True, False], size=self.ori_shape)
         }
         self.attrs = {"shape": self.new_shape}
         self.outputs = {
@@ -275,6 +289,7 @@ def test_check_grad(self):
 
 # Test python API
 class TestReshapeAPI(unittest.TestCase):
+
     def _set_paddle_api(self):
         self.fill_constant = paddle.fluid.layers.fill_constant
         self.data = paddle.static.data
@@ -304,8 +319,9 @@ def _test_api(self):
             out_1 = self.reshape(x, shape)
 
             # situation 2: have shape(list, no tensor), have actual shape(Tensor)
-            out_2 = fluid.layers.reshape(
-                x, shape=shape, actual_shape=actual_shape)
+            out_2 = fluid.layers.reshape(x,
+                                         shape=shape,
+                                         actual_shape=actual_shape)
 
             # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
             out_3 = self.reshape(x, shape=[positive_five, 10])
@@ -316,8 +332,10 @@ def _test_api(self):
         exe = paddle.static.Executor(place=paddle.CPUPlace())
         res_1, res_2, res_3, res_4 = exe.run(
             main_prog,
-            feed={"x": input,
-                  "shape": np.array([2, 5, 5]).astype("int32")},
+            feed={
+                "x": input,
+                "shape": np.array([2, 5, 5]).astype("int32")
+            },
             fetch_list=[out_1, out_2, out_3, out_4])
 
         assert np.array_equal(res_1, input.reshape(shape))
@@ -354,6 +372,7 @@ def test_imperative(self):
 
 
 class TestStaticReshape_(TestReshapeAPI):
+
     def _executed_api(self):
         self.reshape = paddle.reshape_
 
@@ -379,6 +398,7 @@ def test_imperative(self):
 
 # Test Input Error
 class TestReshapeOpError(unittest.TestCase):
+
     def _set_paddle_api(self):
         self.data = paddle.static.data
         self.reshape = paddle.reshape
@@ -391,8 +411,8 @@ def _test_errors(self):
         with program_guard(Program(), Program()):
             # The x type of reshape_op must be Variable.
             def test_x_type():
-                x1 = fluid.create_lod_tensor(
-                    np.array([[-1]]), [[1]], paddle.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                             paddle.CPUPlace())
                 self.reshape(x1, shape=[1])
 
             self.assertRaises(TypeError, test_x_type)
@@ -405,8 +425,9 @@ def test_x_dtype():
             self.assertRaises(TypeError, test_x_dtype)
 
             def test_x_dtype_float16():
-                x_float16 = self.data(
-                    name="x_float16", shape=[2, 25], dtype="float16")
+                x_float16 = self.data(name="x_float16",
+                                      shape=[2, 25],
+                                      dtype="float16")
                 self.reshape(x_float16, shape=[2, 5, 5])
 
             test_x_dtype_float16()
@@ -453,6 +474,7 @@ def test_fluid_api_error(self):
 
 
 class TestDygraphReshapeAPI(unittest.TestCase):
+
     def setUp(self):
         self.executed_api()
 
@@ -488,14 +510,16 @@ def test_out_float32(self):
 
 
 class TestDygraphReshapeInplaceAPI(TestDygraphReshapeAPI):
+
     def executed_api(self):
         self.reshape = paddle.reshape_
 
 
 class TestReshapeZeroTensor(unittest.TestCase):
+
     def test_reshape_zero_tensor_success(self):
         zero_tensor = paddle.zeros([0, 2, 3])
-        # since we use "0" as the dimension copy semantically in reshape, 
+        # since we use "0" as the dimension copy semantically in reshape,
         # we need to copy the 0 dim in the src tensor in order to make a successful zero tensor reshape
         zero_tensor = zero_tensor.reshape([0, 6])
         self.assertTrue(list(zero_tensor.shape) == [0, 6])
diff --git a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
index 2f6ca1dfa0cb0..829960250d05d 100644
--- a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
+++ b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
@@ -21,8 +21,8 @@
 
 paddle.enable_static()
 
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
+                    level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -38,6 +38,7 @@ def set_cinn_flag(val):
 
 @unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
 class TestResnet50Accuracy(unittest.TestCase):
+
     def reader(self, limit):
         for _ in range(limit):
             yield {'image': np.random.randint(0, 256, size=[32, 3, 224, 224]).astype('float32'), \
@@ -52,8 +53,9 @@ def generate_random_data(self, loop_num=10):
 
     def build_program(self, main_program, startup_program):
         with paddle.static.program_guard(main_program, startup_program):
-            image = paddle.static.data(
-                name='image', shape=[32, 3, 224, 224], dtype='float32')
+            image = paddle.static.data(name='image',
+                                       shape=[32, 3, 224, 224],
+                                       dtype='float32')
             label = paddle.static.data(name='label', shape=[32], dtype='int64')
 
             # TODO: stop_gradient slower training speed, need fix
@@ -62,8 +64,8 @@ def build_program(self, main_program, startup_program):
             model = paddle.vision.models.resnet50()
             prediction = model(image)
 
-            loss = paddle.nn.functional.cross_entropy(
-                input=prediction, label=label)
+            loss = paddle.nn.functional.cross_entropy(input=prediction,
+                                                      label=label)
             loss = paddle.mean(loss)
             adam = paddle.optimizer.Adam(learning_rate=0.001)
             adam.minimize(loss)
@@ -98,8 +100,8 @@ def train(self, place, iters, feed, use_cinn=False, seed=1234):
         return loss_vals
 
     def test_check_resnet50_accuracy(self):
-        place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        place = paddle.CUDAPlace(
+            0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
 
         loop_num = 10
         feed = self.generate_random_data(loop_num)
diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py
index 0259b898a488e..71998f57e5f95 100644
--- a/python/paddle/fluid/tests/unittests/test_retain_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py
@@ -24,6 +24,7 @@
 
 
 class Generator(fluid.dygraph.Layer):
+
     def __init__(self):
         super(Generator, self).__init__()
         self.conv1 = paddle.nn.Conv2D(3, 3, 3, padding=1)
@@ -35,6 +36,7 @@ def forward(self, x):
 
 
 class Discriminator(fluid.dygraph.Layer):
+
     def __init__(self):
         super(Discriminator, self).__init__()
         self.convd = paddle.nn.Conv2D(6, 3, 1)
@@ -45,6 +47,7 @@ def forward(self, x):
 
 
 class TestRetainGraph(unittest.TestCase):
+
     def cal_gradient_penalty(self,
                              netD,
                              real_data,
@@ -73,21 +76,21 @@ def cal_gradient_penalty(self,
             fake_AB = paddle.concat((real_data.detach(), interpolatesv), 1)
             disc_interpolates = netD(fake_AB)
 
-            outs = paddle.fluid.layers.fill_constant(
-                disc_interpolates.shape, disc_interpolates.dtype, 1.0)
-            gradients = paddle.grad(
-                outputs=disc_interpolates,
-                inputs=fake_AB,
-                grad_outputs=outs,
-                create_graph=True,
-                retain_graph=True,
-                only_inputs=True)
+            outs = paddle.fluid.layers.fill_constant(disc_interpolates.shape,
+                                                     disc_interpolates.dtype,
+                                                     1.0)
+            gradients = paddle.grad(outputs=disc_interpolates,
+                                    inputs=fake_AB,
+                                    grad_outputs=outs,
+                                    create_graph=True,
+                                    retain_graph=True,
+                                    only_inputs=True)
 
             gradients = paddle.reshape(gradients[0], [real_data.shape[0], -1])
 
-            gradient_penalty = paddle.mean((paddle.norm(gradients + 1e-16, 2, 1)
-                                            - constant)**
-                                           2) * lambda_gp  # added eps
+            gradient_penalty = paddle.mean(
+                (paddle.norm(gradients + 1e-16, 2, 1) - constant)**
+                2) * lambda_gp  # added eps
             return gradient_penalty, gradients
         else:
             return 0.0, None
@@ -113,11 +116,13 @@ def run_retain(self, need_retain):
         fake_AB = paddle.concat((realA, fakeB), 1)
         G_pred_fake = d(fake_AB.detach())
 
-        false_target = paddle.fluid.layers.fill_constant(G_pred_fake.shape,
-                                                         'float32', 0.0)
+        false_target = paddle.fluid.layers.fill_constant(
+            G_pred_fake.shape, 'float32', 0.0)
 
-        G_gradient_penalty, _ = self.cal_gradient_penalty(
-            d, realA, fakeB, lambda_gp=10.0)
+        G_gradient_penalty, _ = self.cal_gradient_penalty(d,
+                                                          realA,
+                                                          fakeB,
+                                                          lambda_gp=10.0)
         loss_d = gan_criterion(G_pred_fake, false_target) + G_gradient_penalty
 
         loss_d.backward(retain_graph=need_retain)
@@ -128,8 +133,8 @@ def run_retain(self, need_retain):
         G_pred_fake = d(fake_AB)
         true_target = paddle.fluid.layers.fill_constant(G_pred_fake.shape,
                                                         'float32', 1.0)
-        loss_g = l1_criterion(fakeB, realB) + gan_criterion(G_pred_fake,
-                                                            true_target)
+        loss_g = l1_criterion(fakeB, realB) + gan_criterion(
+            G_pred_fake, true_target)
 
         loss_g.backward()
         optim_g.minimize(loss_g)
diff --git a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
index 1bfc1b00aa822..4353c27278f0f 100644
--- a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
+++ b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
@@ -45,8 +45,9 @@ def multiclass_nms(prediction, class_num, keep_top_k, nms_threshold):
         for idx in indices:
             score_index.append((prediction[c][idx][4], c, idx))
 
-    sorted_score_index = sorted(
-        score_index, key=lambda tup: tup[0], reverse=True)
+    sorted_score_index = sorted(score_index,
+                                key=lambda tup: tup[0],
+                                reverse=True)
     if keep_top_k > -1 and num_det > keep_top_k:
         sorted_score_index = sorted_score_index[:keep_top_k]
         num_det = keep_top_k
@@ -103,10 +104,10 @@ def retinanet_detection_out(boxes_list, scores_list, anchors_list, im_info,
                 box_offset] * anchor_box_width + anchor_box_center_x
             target_box_center_y = bboxes_per_level[
                 box_offset + 1] * anchor_box_height + anchor_box_center_y
-            target_box_width = math.exp(bboxes_per_level[box_offset +
-                                                         2]) * anchor_box_width
-            target_box_height = math.exp(bboxes_per_level[
-                box_offset + 3]) * anchor_box_height
+            target_box_width = math.exp(
+                bboxes_per_level[box_offset + 2]) * anchor_box_width
+            target_box_height = math.exp(
+                bboxes_per_level[box_offset + 3]) * anchor_box_height
 
             pred_box_xmin = target_box_center_x - target_box_width / 2
             pred_box_ymin = target_box_center_y - target_box_height / 2
@@ -119,13 +120,17 @@ def retinanet_detection_out(boxes_list, scores_list, anchors_list, im_info,
             pred_box_ymax = pred_box_ymax / im_scale
 
             pred_box_xmin = max(
-                min(pred_box_xmin, np.round(im_width / im_scale) - 1), 0.)
+                min(pred_box_xmin,
+                    np.round(im_width / im_scale) - 1), 0.)
             pred_box_ymin = max(
-                min(pred_box_ymin, np.round(im_height / im_scale) - 1), 0.)
+                min(pred_box_ymin,
+                    np.round(im_height / im_scale) - 1), 0.)
             pred_box_xmax = max(
-                min(pred_box_xmax, np.round(im_width / im_scale) - 1), 0.)
+                min(pred_box_xmax,
+                    np.round(im_width / im_scale) - 1), 0.)
             pred_box_ymax = max(
-                min(pred_box_ymax, np.round(im_height / im_scale) - 1), 0.)
+                min(pred_box_ymax,
+                    np.round(im_height / im_scale) - 1), 0.)
 
             if c not in prediction.keys():
                 prediction[c] = []
@@ -167,6 +172,7 @@ def batched_retinanet_detection_out(boxes, scores, anchors, im_info,
 
 
 class TestRetinanetDetectionOutOp1(OpTest):
+
     def set_argument(self):
         self.score_threshold = 0.05
         self.min_level = 3
@@ -251,11 +257,14 @@ def setUp(self):
             'Scores': [('s0', self.scores_list[0]), ('s1', self.scores_list[1]),
                        ('s2', self.scores_list[2]), ('s3', self.scores_list[3]),
                        ('s4', self.scores_list[4])],
-            'Anchors':
-            [('a0', self.anchors_list[0]), ('a1', self.anchors_list[1]),
-             ('a2', self.anchors_list[2]), ('a3', self.anchors_list[3]),
-             ('a4', self.anchors_list[4])],
-            'ImInfo': (self.im_info, [[1, ]])
+            'Anchors': [('a0', self.anchors_list[0]),
+                        ('a1', self.anchors_list[1]),
+                        ('a2', self.anchors_list[2]),
+                        ('a3', self.anchors_list[3]),
+                        ('a4', self.anchors_list[4])],
+            'ImInfo': (self.im_info, [[
+                1,
+            ]])
         }
         self.outputs = {'Out': (nmsed_outs, [lod])}
         self.attrs = {
@@ -271,6 +280,7 @@ def test_check_output(self):
 
 
 class TestRetinanetDetectionOutOp2(OpTest):
+
     def set_argument(self):
         self.score_threshold = 0.05
         self.min_level = 3
@@ -295,6 +305,7 @@ def set_argument(self):
 
 
 class TestRetinanetDetectionOutOpNo3(TestRetinanetDetectionOutOp1):
+
     def set_argument(self):
         # Here set 2.0 to test the case there is no outputs.
         # In practical use, 0.0 < score_threshold < 1.0
@@ -324,6 +335,7 @@ def set_argument(self):
 
 
 class TestRetinanetDetectionOutOpNo4(TestRetinanetDetectionOutOp1):
+
     def set_argument(self):
         self.score_threshold = 0.05
         self.min_level = 2
@@ -360,16 +372,19 @@ def setUp(self):
         nmsed_outs = np.array(nmsed_outs).astype('float32')
         self.op_type = 'retinanet_detection_output'
         self.inputs = {
-            'BBoxes':
-            [('b0', self.bboxes_list[0]), ('b1', self.bboxes_list[1]),
-             ('b2', self.bboxes_list[2]), ('b3', self.bboxes_list[3])],
+            'BBoxes': [('b0', self.bboxes_list[0]), ('b1', self.bboxes_list[1]),
+                       ('b2', self.bboxes_list[2]),
+                       ('b3', self.bboxes_list[3])],
             'Scores': [('s0', self.scores_list[0]), ('s1', self.scores_list[1]),
                        ('s2', self.scores_list[2]),
                        ('s3', self.scores_list[3])],
-            'Anchors':
-            [('a0', self.anchors_list[0]), ('a1', self.anchors_list[1]),
-             ('a2', self.anchors_list[2]), ('a3', self.anchors_list[3])],
-            'ImInfo': (self.im_info, [[1, ]])
+            'Anchors': [('a0', self.anchors_list[0]),
+                        ('a1', self.anchors_list[1]),
+                        ('a2', self.anchors_list[2]),
+                        ('a3', self.anchors_list[3])],
+            'ImInfo': (self.im_info, [[
+                1,
+            ]])
         }
         self.outputs = {'Out': (nmsed_outs, [lod])}
         self.attrs = {
@@ -385,6 +400,7 @@ def test_check_output(self):
 
 
 class TestRetinanetDetectionOutOpNo5(TestRetinanetDetectionOutOp1):
+
     def set_argument(self):
         self.score_threshold = 0.05
         self.min_level = 3
@@ -412,24 +428,32 @@ def set_argument(self):
 
 
 class TestRetinanetDetectionOutOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            bboxes_low1 = fluid.data(
-                name='bboxes_low1', shape=[1, 44, 4], dtype='float32')
-            bboxes_high1 = fluid.data(
-                name='bboxes_high1', shape=[1, 11, 4], dtype='float32')
-            scores_low1 = fluid.data(
-                name='scores_low1', shape=[1, 44, 10], dtype='float32')
-            scores_high1 = fluid.data(
-                name='scores_high1', shape=[1, 11, 10], dtype='float32')
-            anchors_low1 = fluid.data(
-                name='anchors_low1', shape=[44, 4], dtype='float32')
-            anchors_high1 = fluid.data(
-                name='anchors_high1', shape=[11, 4], dtype='float32')
-            im_info1 = fluid.data(
-                name="im_info1", shape=[1, 3], dtype='float32')
-
-            # The `bboxes` must be list, each element must be Variable and 
+            bboxes_low1 = fluid.data(name='bboxes_low1',
+                                     shape=[1, 44, 4],
+                                     dtype='float32')
+            bboxes_high1 = fluid.data(name='bboxes_high1',
+                                      shape=[1, 11, 4],
+                                      dtype='float32')
+            scores_low1 = fluid.data(name='scores_low1',
+                                     shape=[1, 44, 10],
+                                     dtype='float32')
+            scores_high1 = fluid.data(name='scores_high1',
+                                      shape=[1, 11, 10],
+                                      dtype='float32')
+            anchors_low1 = fluid.data(name='anchors_low1',
+                                      shape=[44, 4],
+                                      dtype='float32')
+            anchors_high1 = fluid.data(name='anchors_high1',
+                                       shape=[11, 4],
+                                       dtype='float32')
+            im_info1 = fluid.data(name="im_info1",
+                                  shape=[1, 3],
+                                  dtype='float32')
+
+            # The `bboxes` must be list, each element must be Variable and
             # its Tensor data type must be one of float32 and float64.
             def test_bboxes_type():
                 fluid.layers.retinanet_detection_output(
@@ -441,8 +465,9 @@ def test_bboxes_type():
             self.assertRaises(TypeError, test_bboxes_type)
 
             def test_bboxes_tensor_dtype():
-                bboxes_high2 = fluid.data(
-                    name='bboxes_high2', shape=[1, 11, 4], dtype='int32')
+                bboxes_high2 = fluid.data(name='bboxes_high2',
+                                          shape=[1, 11, 4],
+                                          dtype='int32')
                 fluid.layers.retinanet_detection_output(
                     bboxes=[bboxes_high2, 5],
                     scores=[scores_low1, scores_high1],
@@ -463,8 +488,9 @@ def test_scores_type():
             self.assertRaises(TypeError, test_scores_type)
 
             def test_scores_tensor_dtype():
-                scores_high2 = fluid.data(
-                    name='scores_high2', shape=[1, 11, 10], dtype='int32')
+                scores_high2 = fluid.data(name='scores_high2',
+                                          shape=[1, 11, 10],
+                                          dtype='int32')
                 fluid.layers.retinanet_detection_output(
                     bboxes=[bboxes_low1, bboxes_high1],
                     scores=[scores_high2, 5],
@@ -485,8 +511,9 @@ def test_anchors_type():
             self.assertRaises(TypeError, test_anchors_type)
 
             def test_anchors_tensor_dtype():
-                anchors_high2 = fluid.data(
-                    name='anchors_high2', shape=[11, 4], dtype='int32')
+                anchors_high2 = fluid.data(name='anchors_high2',
+                                           shape=[11, 4],
+                                           dtype='int32')
                 fluid.layers.retinanet_detection_output(
                     bboxes=[bboxes_low1, bboxes_high1],
                     scores=[scores_low1, scores_high1],
@@ -507,8 +534,9 @@ def test_iminfo_type():
             self.assertRaises(TypeError, test_iminfo_type)
 
             def test_iminfo_tensor_dtype():
-                im_info2 = fluid.data(
-                    name='im_info2', shape=[1, 3], dtype='int32')
+                im_info2 = fluid.data(name='im_info2',
+                                      shape=[1, 3],
+                                      dtype='int32')
                 fluid.layers.retinanet_detection_output(
                     bboxes=[bboxes_low1, bboxes_high1],
                     scores=[scores_low1, scores_high1],
diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py
index 9b739eff97cbe..263fecc619eb7 100644
--- a/python/paddle/fluid/tests/unittests/test_reverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
@@ -23,6 +23,7 @@
 
 
 class TestReverseOp(OpTest):
+
     def initTestCase(self):
         self.x = np.random.random((3, 40)).astype('float64')
         self.axis = [0]
@@ -45,54 +46,63 @@ def test_check_grad(self):
 
 
 class TestCase0(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 40)).astype('float64')
         self.axis = [1]
 
 
 class TestCase0_neg(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 40)).astype('float64')
         self.axis = [-1]
 
 
 class TestCase1(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 40)).astype('float64')
         self.axis = [0, 1]
 
 
 class TestCase1_neg(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 40)).astype('float64')
         self.axis = [0, -1]
 
 
 class TestCase2(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 4, 10)).astype('float64')
         self.axis = [0, 2]
 
 
 class TestCase2_neg(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 4, 10)).astype('float64')
         self.axis = [0, -2]
 
 
 class TestCase3(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 4, 10)).astype('float64')
         self.axis = [1, 2]
 
 
 class TestCase3_neg(TestReverseOp):
+
     def initTestCase(self):
         self.x = np.random.random((3, 4, 10)).astype('float64')
         self.axis = [-1, -2]
 
 
 class TestCase4(unittest.TestCase):
+
     def test_error(self):
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -100,8 +110,9 @@ def test_error(self):
         train_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
-            label = fluid.layers.data(
-                name="label", shape=[1, 1, 1, 1, 1, 1, 1, 1], dtype="int64")
+            label = fluid.layers.data(name="label",
+                                      shape=[1, 1, 1, 1, 1, 1, 1, 1],
+                                      dtype="int64")
             rev = fluid.layers.reverse(label, axis=[-1, -2])
 
         def _run_program():
@@ -112,10 +123,11 @@ def _run_program():
 
 
 class TestReverseLoDTensorArray(unittest.TestCase):
+
     def setUp(self):
         self.shapes = [[5, 25], [5, 20], [5, 5]]
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.exe = fluid.Executor(self.place)
 
     def run_program(self, arr_len, axis=0):
diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
index 62839d3a960f1..42f32f2e75bd8 100644
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -28,8 +28,9 @@ def create_selected_rows_and_tensor(scope, place, height, row_num,
     sr = scope.var("@selected_rows@").get_selected_rows()
     tensor = scope.var("grad").get_tensor()
 
-    rows = np.random.random_integers(
-        low=0, high=height - 1, size=[row_num, ]).astype('int64')
+    rows = np.random.random_integers(low=0, high=height - 1, size=[
+        row_num,
+    ]).astype('int64')
     sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')
 
     sr.set_height(height)
@@ -46,6 +47,7 @@ def create_selected_rows_and_tensor(scope, place, height, row_num,
 
 
 class TestBase(unittest.TestCase):
+
     def setup(self,
               place,
               is_sparse,
@@ -62,8 +64,8 @@ def setup(self,
         self.param = np.random.random(size).astype("float32")
 
         self.mean_square_name = "mean_square"
-        self.mean_square = np.random.uniform(
-            low=1, high=2, size=size).astype("float32")
+        self.mean_square = np.random.uniform(low=1, high=2,
+                                             size=size).astype("float32")
 
         self.mean_grad_name = "mean_grad"
         self.mean_grad = np.random.random(size).astype("float32")
@@ -84,19 +86,19 @@ def setup(self,
             grad_tensor.set(self.grad, place)
 
         self.moment_name = "moment"
-        self.moment = np.random.uniform(
-            low=0, high=1, size=size).astype("float32")
+        self.moment = np.random.uniform(low=0, high=1,
+                                        size=size).astype("float32")
 
         self.epsilon = epsilon
         self.decay = 0.9
         self.momentum = 0.1
         self.centered = centered
 
-        self.ms_out = self.decay * self.mean_square + (1 - self.decay
-                                                       ) * self.grad * self.grad
+        self.ms_out = self.decay * self.mean_square + (
+            1 - self.decay) * self.grad * self.grad
         if centered:
-            self.mg_out = self.decay * self.mean_grad + (1 - self.decay
-                                                         ) * self.grad
+            self.mg_out = self.decay * self.mean_grad + (1 -
+                                                         self.decay) * self.grad
             self.moment_out = self.momentum * self.moment + \
                               self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
         else:
@@ -126,13 +128,13 @@ def setup(self,
 
     def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
         self.assertTrue(
-            np.allclose(
-                actual_t, expect_t, atol=atol),
-            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
-            + str(expect_t) + "\n" + "But Got" + str(actual_t))
+            np.allclose(actual_t, expect_t, atol=atol),
+            "Output (" + out_name + ") has diff at " + str(place) +
+            "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t))
 
 
 class TestRmspropOp(TestBase):
+
     def check_with_place(self,
                          place,
                          is_sparse,
@@ -170,29 +172,25 @@ def run_and_check(self):
 
         rmsprop_op.run(self.scope, self.place)
 
-        self.check(
-            np.array(self.mean_square_tensor),
-            self.ms_out,
-            self.place,
-            self.mean_square_name,
-            atol=atol)
-        self.check(
-            np.array(self.moment_tensor),
-            self.moment_out,
-            self.place,
-            self.moment_name,
-            atol=atol)
-        self.check(
-            np.array(self.param_tensor),
-            self.param_out,
-            self.place,
-            self.param_name,
-            atol=atol)
+        self.check(np.array(self.mean_square_tensor),
+                   self.ms_out,
+                   self.place,
+                   self.mean_square_name,
+                   atol=atol)
+        self.check(np.array(self.moment_tensor),
+                   self.moment_out,
+                   self.place,
+                   self.moment_name,
+                   atol=atol)
+        self.check(np.array(self.param_tensor),
+                   self.param_out,
+                   self.place,
+                   self.param_name,
+                   atol=atol)
 
         if self.centered:
-            self.check(
-                np.array(self.mean_grad_tensor), self.mg_out, self.place,
-                self.mean_grad_name)
+            self.check(np.array(self.mean_grad_tensor), self.mg_out, self.place,
+                       self.mean_grad_name)
 
     def test_rmsprop(self):
         places = [core.CPUPlace()]
@@ -203,37 +201,37 @@ def test_rmsprop(self):
         for place in places:
             for centered in [False, True]:
                 with fluid.scope_guard(core.Scope()):
-                    self.check_with_place(
-                        place, is_sparse=False, centered=centered, size=size)
+                    self.check_with_place(place,
+                                          is_sparse=False,
+                                          centered=centered,
+                                          size=size)
 
                 with fluid.scope_guard(core.Scope()):
-                    self.check_with_place(
-                        place,
-                        is_sparse=True,
-                        centered=centered,
-                        row_num=512,
-                        size=size)
+                    self.check_with_place(place,
+                                          is_sparse=True,
+                                          centered=centered,
+                                          row_num=512,
+                                          size=size)
 
                 with fluid.scope_guard(core.Scope()):
-                    self.check_with_place(
-                        place,
-                        is_sparse=True,
-                        centered=centered,
-                        row_num=60,
-                        size=size)
+                    self.check_with_place(place,
+                                          is_sparse=True,
+                                          centered=centered,
+                                          row_num=60,
+                                          size=size)
 
 
 class TestRMSPropV2(unittest.TestCase):
+
     def test_rmsprop_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.RMSProp(
-            learning_rate=0.01,
-            parameters=linear.parameters(),
-            weight_decay=0.01)
+        adam = paddle.optimizer.RMSProp(learning_rate=0.01,
+                                        parameters=linear.parameters(),
+                                        weight_decay=0.01)
         out = linear(a)
         out.backward()
         adam.step()
@@ -254,8 +252,8 @@ def test_rmsprop(self):
             rms_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]
-            train_reader = paddle.batch(
-                paddle.dataset.uci_housing.train(), batch_size=1)
+            train_reader = paddle.batch(paddle.dataset.uci_housing.train(),
+                                        batch_size=1)
             feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
@@ -264,34 +262,38 @@ def test_rmsprop(self):
 
     def test_raise_error(self):
         self.assertRaises(ValueError, paddle.optimizer.RMSProp, None)
-        self.assertRaises(
-            ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None)
-        self.assertRaises(
-            ValueError,
-            paddle.optimizer.RMSProp,
-            learning_rate=0.1,
-            epsilon=None)
-        self.assertRaises(
-            ValueError,
-            paddle.optimizer.RMSProp,
-            learning_rate=0.1,
-            momentum=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.RMSProp,
+                          learning_rate=0.1,
+                          rho=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.RMSProp,
+                          learning_rate=0.1,
+                          epsilon=None)
+        self.assertRaises(ValueError,
+                          paddle.optimizer.RMSProp,
+                          learning_rate=0.1,
+                          momentum=None)
 
     def test_rmsprop_op_invalid_input(self):
         paddle.disable_static()
         linear = paddle.nn.Linear(10, 10)
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.RMSProp(
-                0.1, epsilon=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.RMSProp(0.1,
+                                            epsilon=-1,
+                                            parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.RMSProp(
-                0.1, momentum=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.RMSProp(0.1,
+                                            momentum=-1,
+                                            parameters=linear.parameters())
         with self.assertRaises(ValueError):
-            adam = paddle.optimizer.RMSProp(
-                0.1, rho=-1, parameters=linear.parameters())
+            adam = paddle.optimizer.RMSProp(0.1,
+                                            rho=-1,
+                                            parameters=linear.parameters())
 
 
 class TestRMSPropV2Group(TestRMSPropV2):
+
     def test_rmsprop_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -299,15 +301,17 @@ def test_rmsprop_dygraph(self):
         linear_1 = paddle.nn.Linear(13, 5)
         linear_2 = paddle.nn.Linear(5, 3)
         # This can be any optimizer supported by dygraph.
-        adam = paddle.optimizer.RMSProp(
-            learning_rate=0.01,
-            parameters=[{
-                'params': linear_1.parameters()
-            }, {
-                'params': linear_2.parameters(),
-                'weight_decay': 0.001
-            }],
-            weight_decay=0.01)
+        adam = paddle.optimizer.RMSProp(learning_rate=0.01,
+                                        parameters=[{
+                                            'params':
+                                            linear_1.parameters()
+                                        }, {
+                                            'params':
+                                            linear_2.parameters(),
+                                            'weight_decay':
+                                            0.001
+                                        }],
+                                        weight_decay=0.01)
         out = linear_1(a)
         out = linear_2(out)
         out.backward()
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py
index 6b9438eecea7d..527b6c5e2d8ec 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py
@@ -35,15 +35,19 @@
 
 
 class TestLSTMCellError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_size, input_size, hidden_size = 4, 16, 16
-            inputs = fluid.data(
-                name='inputs', shape=[None, input_size], dtype='float32')
-            pre_hidden = fluid.data(
-                name='pre_hidden', shape=[None, hidden_size], dtype='float32')
-            pre_cell = fluid.data(
-                name='pre_cell', shape=[None, hidden_size], dtype='float32')
+            inputs = fluid.data(name='inputs',
+                                shape=[None, input_size],
+                                dtype='float32')
+            pre_hidden = fluid.data(name='pre_hidden',
+                                    shape=[None, hidden_size],
+                                    dtype='float32')
+            pre_cell = fluid.data(name='pre_cell',
+                                  shape=[None, hidden_size],
+                                  dtype='float32')
             cell = LSTMCell(hidden_size)
 
             def test_input_Variable():
@@ -68,28 +72,25 @@ def test_pre_cell_Variable():
             self.assertRaises(TypeError, test_pre_cell_Variable)
 
             def test_input_type():
-                error_inputs = fluid.data(
-                    name='error_inputs',
-                    shape=[None, input_size],
-                    dtype='int32')
+                error_inputs = fluid.data(name='error_inputs',
+                                          shape=[None, input_size],
+                                          dtype='int32')
                 cell(error_inputs, [pre_hidden, pre_cell])
 
             self.assertRaises(TypeError, test_input_type)
 
             def test_pre_hidden_type():
-                error_pre_hidden = fluid.data(
-                    name='error_pre_hidden',
-                    shape=[None, hidden_size],
-                    dtype='int32')
+                error_pre_hidden = fluid.data(name='error_pre_hidden',
+                                              shape=[None, hidden_size],
+                                              dtype='int32')
                 cell(inputs, [error_pre_hidden, pre_cell])
 
             self.assertRaises(TypeError, test_pre_hidden_type)
 
             def test_pre_cell_type():
-                error_pre_cell = fluid.data(
-                    name='error_pre_cell',
-                    shape=[None, hidden_size],
-                    dtype='int32')
+                error_pre_cell = fluid.data(name='error_pre_cell',
+                                            shape=[None, hidden_size],
+                                            dtype='int32')
                 cell(inputs, [pre_hidden, error_pre_cell])
 
             self.assertRaises(TypeError, test_pre_cell_type)
@@ -102,18 +103,22 @@ def test_dtype():
 
 
 class TestLSTMCell(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 4
         self.input_size = 16
         self.hidden_size = 16
 
     def test_run(self):
-        inputs = fluid.data(
-            name='inputs', shape=[None, self.input_size], dtype='float32')
-        pre_hidden = fluid.data(
-            name='pre_hidden', shape=[None, self.hidden_size], dtype='float32')
-        pre_cell = fluid.data(
-            name='pre_cell', shape=[None, self.hidden_size], dtype='float32')
+        inputs = fluid.data(name='inputs',
+                            shape=[None, self.input_size],
+                            dtype='float32')
+        pre_hidden = fluid.data(name='pre_hidden',
+                                shape=[None, self.hidden_size],
+                                dtype='float32')
+        pre_cell = fluid.data(name='pre_cell',
+                              shape=[None, self.hidden_size],
+                              dtype='float32')
 
         cell = LSTMCell(self.hidden_size)
         lstm_hidden_new, lstm_states_new = cell(inputs, [pre_hidden, pre_cell])
@@ -142,14 +147,14 @@ def test_run(self):
         ], ["LSTMCell/BasicLSTMUnit_0.b_0", "basicLSTM/BasicLSTMUnit_0.b_0"]]
 
         for names in param_names:
-            param = np.array(fluid.global_scope().find_var(names[0]).get_tensor(
-            ))
-            param = np.random.uniform(
-                -0.1, 0.1, size=param.shape).astype('float32')
-            fluid.global_scope().find_var(names[0]).get_tensor().set(param,
-                                                                     place)
-            fluid.global_scope().find_var(names[1]).get_tensor().set(param,
-                                                                     place)
+            param = np.array(fluid.global_scope().find_var(
+                names[0]).get_tensor())
+            param = np.random.uniform(-0.1, 0.1,
+                                      size=param.shape).astype('float32')
+            fluid.global_scope().find_var(names[0]).get_tensor().set(
+                param, place)
+            fluid.global_scope().find_var(names[1]).get_tensor().set(
+                param, place)
 
         out = exe.run(feed={
             'inputs': inputs_np,
@@ -162,16 +167,17 @@ def test_run(self):
 
 
 class TestGRUCellError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_size, input_size, hidden_size = 4, 16, 16
-            inputs = fluid.data(
-                name='inputs', shape=[None, input_size], dtype='float32')
-            pre_hidden = layers.data(
-                name='pre_hidden',
-                shape=[None, hidden_size],
-                append_batch_size=False,
-                dtype='float32')
+            inputs = fluid.data(name='inputs',
+                                shape=[None, input_size],
+                                dtype='float32')
+            pre_hidden = layers.data(name='pre_hidden',
+                                     shape=[None, hidden_size],
+                                     append_batch_size=False,
+                                     dtype='float32')
             cell = GRUCell(hidden_size)
 
             def test_input_Variable():
@@ -189,19 +195,17 @@ def test_pre_hidden_Variable():
             self.assertRaises(TypeError, test_pre_hidden_Variable)
 
             def test_input_type():
-                error_inputs = fluid.data(
-                    name='error_inputs',
-                    shape=[None, input_size],
-                    dtype='int32')
+                error_inputs = fluid.data(name='error_inputs',
+                                          shape=[None, input_size],
+                                          dtype='int32')
                 cell(error_inputs, pre_hidden)
 
             self.assertRaises(TypeError, test_input_type)
 
             def test_pre_hidden_type():
-                error_pre_hidden = fluid.data(
-                    name='error_pre_hidden',
-                    shape=[None, hidden_size],
-                    dtype='int32')
+                error_pre_hidden = fluid.data(name='error_pre_hidden',
+                                              shape=[None, hidden_size],
+                                              dtype='int32')
                 cell(inputs, error_pre_hidden)
 
             self.assertRaises(TypeError, test_pre_hidden_type)
@@ -214,25 +218,28 @@ def test_dtype():
 
 
 class TestGRUCell(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 4
         self.input_size = 16
         self.hidden_size = 16
 
     def test_run(self):
-        inputs = fluid.data(
-            name='inputs', shape=[None, self.input_size], dtype='float32')
-        pre_hidden = layers.data(
-            name='pre_hidden',
-            shape=[None, self.hidden_size],
-            append_batch_size=False,
-            dtype='float32')
+        inputs = fluid.data(name='inputs',
+                            shape=[None, self.input_size],
+                            dtype='float32')
+        pre_hidden = layers.data(name='pre_hidden',
+                                 shape=[None, self.hidden_size],
+                                 append_batch_size=False,
+                                 dtype='float32')
 
         cell = GRUCell(self.hidden_size)
         gru_hidden_new, _ = cell(inputs, pre_hidden)
 
-        gru_unit = contrib.layers.rnn_impl.BasicGRUUnit(
-            "basicGRU", self.hidden_size, None, None, None, None, "float32")
+        gru_unit = contrib.layers.rnn_impl.BasicGRUUnit("basicGRU",
+                                                        self.hidden_size, None,
+                                                        None, None, None,
+                                                        "float32")
         gru_hidden = gru_unit(inputs, pre_hidden)
 
         if core.is_compiled_with_cuda():
@@ -255,64 +262,65 @@ def test_run(self):
         ]
 
         for names in param_names:
-            param = np.array(fluid.global_scope().find_var(names[0]).get_tensor(
-            ))
-            param = np.random.uniform(
-                -0.1, 0.1, size=param.shape).astype('float32')
-            fluid.global_scope().find_var(names[0]).get_tensor().set(param,
-                                                                     place)
-            fluid.global_scope().find_var(names[1]).get_tensor().set(param,
-                                                                     place)
-
-        out = exe.run(feed={'inputs': inputs_np,
-                            'pre_hidden': pre_hidden_np},
+            param = np.array(fluid.global_scope().find_var(
+                names[0]).get_tensor())
+            param = np.random.uniform(-0.1, 0.1,
+                                      size=param.shape).astype('float32')
+            fluid.global_scope().find_var(names[0]).get_tensor().set(
+                param, place)
+            fluid.global_scope().find_var(names[1]).get_tensor().set(
+                param, place)
+
+        out = exe.run(feed={
+            'inputs': inputs_np,
+            'pre_hidden': pre_hidden_np
+        },
                       fetch_list=[gru_hidden_new, gru_hidden])
 
         self.assertTrue(np.allclose(out[0], out[1], rtol=1e-4, atol=0))
 
 
 class TestRnnError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             batch_size = 4
             input_size = 16
             hidden_size = 16
             seq_len = 4
-            inputs = fluid.data(
-                name='inputs', shape=[None, input_size], dtype='float32')
-            pre_hidden = layers.data(
-                name='pre_hidden',
-                shape=[None, hidden_size],
-                append_batch_size=False,
-                dtype='float32')
-            inputs_basic_lstm = fluid.data(
-                name='inputs_basic_lstm',
-                shape=[None, None, input_size],
-                dtype='float32')
-            sequence_length = fluid.data(
-                name="sequence_length", shape=[None], dtype='int64')
-
-            inputs_dynamic_rnn = layers.transpose(
-                inputs_basic_lstm, perm=[1, 0, 2])
+            inputs = fluid.data(name='inputs',
+                                shape=[None, input_size],
+                                dtype='float32')
+            pre_hidden = layers.data(name='pre_hidden',
+                                     shape=[None, hidden_size],
+                                     append_batch_size=False,
+                                     dtype='float32')
+            inputs_basic_lstm = fluid.data(name='inputs_basic_lstm',
+                                           shape=[None, None, input_size],
+                                           dtype='float32')
+            sequence_length = fluid.data(name="sequence_length",
+                                         shape=[None],
+                                         dtype='int64')
+
+            inputs_dynamic_rnn = layers.transpose(inputs_basic_lstm,
+                                                  perm=[1, 0, 2])
             cell = LSTMCell(hidden_size, name="LSTMCell_for_rnn")
             np_inputs_dynamic_rnn = np.random.random(
                 (seq_len, batch_size, input_size)).astype("float32")
 
             def test_input_Variable():
-                dynamic_rnn(
-                    cell=cell,
-                    inputs=np_inputs_dynamic_rnn,
-                    sequence_length=sequence_length,
-                    is_reverse=False)
+                dynamic_rnn(cell=cell,
+                            inputs=np_inputs_dynamic_rnn,
+                            sequence_length=sequence_length,
+                            is_reverse=False)
 
             self.assertRaises(TypeError, test_input_Variable)
 
             def test_input_list():
-                dynamic_rnn(
-                    cell=cell,
-                    inputs=[np_inputs_dynamic_rnn],
-                    sequence_length=sequence_length,
-                    is_reverse=False)
+                dynamic_rnn(cell=cell,
+                            inputs=[np_inputs_dynamic_rnn],
+                            sequence_length=sequence_length,
+                            is_reverse=False)
 
             self.assertRaises(TypeError, test_input_list)
 
@@ -320,12 +328,11 @@ def test_initial_states_type():
                 cell = GRUCell(hidden_size, name="GRUCell_for_rnn")
                 error_initial_states = np.random.random(
                     (batch_size, hidden_size)).astype("float32")
-                dynamic_rnn(
-                    cell=cell,
-                    inputs=inputs_dynamic_rnn,
-                    initial_states=error_initial_states,
-                    sequence_length=sequence_length,
-                    is_reverse=False)
+                dynamic_rnn(cell=cell,
+                            inputs=inputs_dynamic_rnn,
+                            initial_states=error_initial_states,
+                            sequence_length=sequence_length,
+                            is_reverse=False)
 
             self.assertRaises(TypeError, test_initial_states_type)
 
@@ -336,28 +343,27 @@ def test_initial_states_list():
                     np.random.random(
                         (batch_size, hidden_size)).astype("float32")
                 ]
-                dynamic_rnn(
-                    cell=cell,
-                    inputs=inputs_dynamic_rnn,
-                    initial_states=error_initial_states,
-                    sequence_length=sequence_length,
-                    is_reverse=False)
+                dynamic_rnn(cell=cell,
+                            inputs=inputs_dynamic_rnn,
+                            initial_states=error_initial_states,
+                            sequence_length=sequence_length,
+                            is_reverse=False)
 
             self.assertRaises(TypeError, test_initial_states_type)
 
             def test_sequence_length_type():
                 np_sequence_length = np.random.random(
                     (batch_size)).astype("float32")
-                dynamic_rnn(
-                    cell=cell,
-                    inputs=inputs_dynamic_rnn,
-                    sequence_length=np_sequence_length,
-                    is_reverse=False)
+                dynamic_rnn(cell=cell,
+                            inputs=inputs_dynamic_rnn,
+                            sequence_length=np_sequence_length,
+                            is_reverse=False)
 
             self.assertRaises(TypeError, test_sequence_length_type)
 
 
 class TestRnn(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 4
         self.input_size = 16
@@ -365,20 +371,19 @@ def setUp(self):
         self.seq_len = 4
 
     def test_run(self):
-        inputs_basic_lstm = fluid.data(
-            name='inputs_basic_lstm',
-            shape=[None, None, self.input_size],
-            dtype='float32')
-        sequence_length = fluid.data(
-            name="sequence_length", shape=[None], dtype='int64')
+        inputs_basic_lstm = fluid.data(name='inputs_basic_lstm',
+                                       shape=[None, None, self.input_size],
+                                       dtype='float32')
+        sequence_length = fluid.data(name="sequence_length",
+                                     shape=[None],
+                                     dtype='int64')
 
         inputs_dynamic_rnn = layers.transpose(inputs_basic_lstm, perm=[1, 0, 2])
         cell = LSTMCell(self.hidden_size, name="LSTMCell_for_rnn")
-        output, final_state = dynamic_rnn(
-            cell=cell,
-            inputs=inputs_dynamic_rnn,
-            sequence_length=sequence_length,
-            is_reverse=False)
+        output, final_state = dynamic_rnn(cell=cell,
+                                          inputs=inputs_dynamic_rnn,
+                                          sequence_length=sequence_length,
+                                          is_reverse=False)
         output_new = layers.transpose(output, perm=[1, 0, 2])
 
         rnn_out, last_hidden, last_cell = basic_lstm(inputs_basic_lstm, None, None, self.hidden_size, num_layers=1, \
@@ -394,8 +399,8 @@ def test_run(self):
         inputs_basic_lstm_np = np.random.uniform(
             -0.1, 0.1,
             (self.seq_len, self.batch_size, self.input_size)).astype('float32')
-        sequence_length_np = np.ones(
-            self.batch_size, dtype='int64') * self.seq_len
+        sequence_length_np = np.ones(self.batch_size,
+                                     dtype='int64') * self.seq_len
 
         inputs_np = np.random.uniform(
             -0.1, 0.1, (self.batch_size, self.input_size)).astype('float32')
@@ -407,20 +412,21 @@ def test_run(self):
         param_names = [[
             "LSTMCell_for_rnn/BasicLSTMUnit_0.w_0",
             "basic_lstm_layers_0/BasicLSTMUnit_0.w_0"
-        ], [
-            "LSTMCell_for_rnn/BasicLSTMUnit_0.b_0",
-            "basic_lstm_layers_0/BasicLSTMUnit_0.b_0"
-        ]]
+        ],
+                       [
+                           "LSTMCell_for_rnn/BasicLSTMUnit_0.b_0",
+                           "basic_lstm_layers_0/BasicLSTMUnit_0.b_0"
+                       ]]
 
         for names in param_names:
-            param = np.array(fluid.global_scope().find_var(names[0]).get_tensor(
-            ))
-            param = np.random.uniform(
-                -0.1, 0.1, size=param.shape).astype('float32')
-            fluid.global_scope().find_var(names[0]).get_tensor().set(param,
-                                                                     place)
-            fluid.global_scope().find_var(names[1]).get_tensor().set(param,
-                                                                     place)
+            param = np.array(fluid.global_scope().find_var(
+                names[0]).get_tensor())
+            param = np.random.uniform(-0.1, 0.1,
+                                      size=param.shape).astype('float32')
+            fluid.global_scope().find_var(names[0]).get_tensor().set(
+                param, place)
+            fluid.global_scope().find_var(names[1]).get_tensor().set(
+                param, place)
 
         out = exe.run(feed={
             'inputs_basic_lstm': inputs_basic_lstm_np,
@@ -455,11 +461,12 @@ class EncoderCell(RNNCell):
     """Encoder Cell"""
 
     def __init__(
-            self,
-            num_layers,
-            hidden_size,
-            dropout_prob=0.,
-            init_scale=0.1, ):
+        self,
+        num_layers,
+        hidden_size,
+        dropout_prob=0.,
+        init_scale=0.1,
+    ):
         self.num_layers = num_layers
         self.hidden_size = hidden_size
         self.dropout_prob = dropout_prob
@@ -474,7 +481,8 @@ def call(self, step_input, states):
             out, new_state = self.lstm_cells[i](step_input, states[i])
             step_input = layers.dropout(
                 out,
-                self.dropout_prob, ) if self.dropout_prob else out
+                self.dropout_prob,
+            ) if self.dropout_prob else out
             new_states.append(new_state)
         return step_input, new_states
 
@@ -500,7 +508,8 @@ def call(self, step_input, states):
             out, new_lstm_state = self.lstm_cells[i](step_input, states[i])
             step_input = layers.dropout(
                 out,
-                self.dropout_prob, ) if self.dropout_prob else out
+                self.dropout_prob,
+            ) if self.dropout_prob else out
             new_lstm_states.append(new_lstm_state)
         return step_input, new_lstm_states
 
@@ -510,11 +519,13 @@ def def_seq2seq_model(num_layers, hidden_size, dropout_prob, src_vocab_size,
     "vanilla seq2seq model"
     # data
     source = fluid.data(name="src", shape=[None, None], dtype="int64")
-    source_length = fluid.data(
-        name="src_sequence_length", shape=[None], dtype="int64")
+    source_length = fluid.data(name="src_sequence_length",
+                               shape=[None],
+                               dtype="int64")
     target = fluid.data(name="trg", shape=[None, None], dtype="int64")
-    target_length = fluid.data(
-        name="trg_sequence_length", shape=[None], dtype="int64")
+    target_length = fluid.data(name="trg_sequence_length",
+                               shape=[None],
+                               dtype="int64")
     label = fluid.data(name="label", shape=[None, None, 1], dtype="int64")
 
     # embedding
@@ -523,25 +534,29 @@ def def_seq2seq_model(num_layers, hidden_size, dropout_prob, src_vocab_size,
 
     # encoder
     enc_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
-    enc_output, enc_final_state = dynamic_rnn(
-        cell=enc_cell, inputs=src_emb, sequence_length=source_length)
+    enc_output, enc_final_state = dynamic_rnn(cell=enc_cell,
+                                              inputs=src_emb,
+                                              sequence_length=source_length)
 
     # decoder
     dec_cell = DecoderCell(num_layers, hidden_size, dropout_prob)
-    dec_output, dec_final_state = dynamic_rnn(
-        cell=dec_cell, inputs=tar_emb, initial_states=enc_final_state)
+    dec_output, dec_final_state = dynamic_rnn(cell=dec_cell,
+                                              inputs=tar_emb,
+                                              initial_states=enc_final_state)
     logits = layers.fc(dec_output,
                        size=trg_vocab_size,
                        num_flatten_dims=len(dec_output.shape) - 1,
                        bias_attr=False)
 
     # loss
-    loss = layers.softmax_with_cross_entropy(
-        logits=logits, label=label, soft_label=False)
+    loss = layers.softmax_with_cross_entropy(logits=logits,
+                                             label=label,
+                                             soft_label=False)
     loss = layers.unsqueeze(loss, axes=[2])
     max_tar_seq_len = layers.shape(target)[1]
-    tar_mask = layers.sequence_mask(
-        target_length, maxlen=max_tar_seq_len, dtype="float32")
+    tar_mask = layers.sequence_mask(target_length,
+                                    maxlen=max_tar_seq_len,
+                                    dtype="float32")
     loss = loss * tar_mask
     loss = layers.reduce_mean(loss, dim=[0])
     loss = layers.reduce_sum(loss)
@@ -572,23 +587,28 @@ def setUp(self):
         src_seq_len = 10
         trg_seq_len = 12
         self.data = {
-            "src": np.random.randint(
+            "src":
+            np.random.randint(
                 2, self.model_hparams["src_vocab_size"],
                 (iter_num * batch_size, src_seq_len)).astype("int64"),
-            "src_sequence_length": np.random.randint(
-                1, src_seq_len, (iter_num * batch_size, )).astype("int64"),
-            "trg": np.random.randint(
+            "src_sequence_length":
+            np.random.randint(1, src_seq_len,
+                              (iter_num * batch_size, )).astype("int64"),
+            "trg":
+            np.random.randint(
                 2, self.model_hparams["src_vocab_size"],
                 (iter_num * batch_size, trg_seq_len)).astype("int64"),
-            "trg_sequence_length": np.random.randint(
-                1, trg_seq_len, (iter_num * batch_size, )).astype("int64"),
-            "label": np.random.randint(
+            "trg_sequence_length":
+            np.random.randint(1, trg_seq_len,
+                              (iter_num * batch_size, )).astype("int64"),
+            "label":
+            np.random.randint(
                 2, self.model_hparams["src_vocab_size"],
                 (iter_num * batch_size, trg_seq_len, 1)).astype("int64"),
         }
 
-        place = core.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else core.CPUPlace()
+        place = core.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else core.CPUPlace()
         self.exe = Executor(place)
 
     def test_seq2seq_model(self):
@@ -599,18 +619,26 @@ def test_seq2seq_model(self):
             self.exe.run(startup_program)
             for iter_idx in range(self.iter_num):
                 cost_val = self.exe.run(feed={
-                    "src": self.data["src"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size, :],
-                    "src_sequence_length": self.data["src_sequence_length"]
-                    [iter_idx * self.batch_size:(iter_idx + 1) *
-                     self.batch_size],
-                    "trg": self.data["trg"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size, :],
-                    "trg_sequence_length": self.data["trg_sequence_length"][
-                        iter_idx * self.batch_size:(iter_idx + 1
-                                                    ) * self.batch_size],
-                    "label": self.data["label"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size]
+                    "src":
+                    self.data["src"][iter_idx * self.batch_size:(iter_idx + 1) *
+                                     self.batch_size, :],
+                    "src_sequence_length":
+                    self.data["src_sequence_length"][iter_idx *
+                                                     self.batch_size:(iter_idx +
+                                                                      1) *
+                                                     self.batch_size],
+                    "trg":
+                    self.data["trg"][iter_idx * self.batch_size:(iter_idx + 1) *
+                                     self.batch_size, :],
+                    "trg_sequence_length":
+                    self.data["trg_sequence_length"][iter_idx *
+                                                     self.batch_size:(iter_idx +
+                                                                      1) *
+                                                     self.batch_size],
+                    "label":
+                    self.data["label"][iter_idx *
+                                       self.batch_size:(iter_idx + 1) *
+                                       self.batch_size]
                 },
                                         fetch_list=[cost])[0]
                 print("iter_idx: %d, cost: %f" % (iter_idx, cost_val))
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
index 3621fd1b9d445..f53df45523927 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
@@ -32,10 +32,12 @@
 from paddle.fluid.executor import Executor
 from paddle.fluid import framework
 from paddle.fluid.framework import _test_eager_guard
+
 paddle.enable_static()
 
 
 class EncoderCell(layers.RNNCell):
+
     def __init__(self, num_layers, hidden_size, dropout_prob=0.):
         self.num_layers = num_layers
         self.hidden_size = hidden_size
@@ -59,6 +61,7 @@ def state_shape(self):
 
 
 class DecoderCell(layers.RNNCell):
+
     def __init__(self, num_layers, hidden_size, dropout_prob=0.):
         self.num_layers = num_layers
         self.hidden_size = hidden_size
@@ -71,14 +74,15 @@ def attention(self, hidden, encoder_output, encoder_padding_mask):
         query = layers.fc(hidden,
                           size=encoder_output.shape[-1],
                           bias_attr=False)
-        attn_scores = layers.matmul(
-            layers.unsqueeze(query, [1]), encoder_output, transpose_y=True)
+        attn_scores = layers.matmul(layers.unsqueeze(query, [1]),
+                                    encoder_output,
+                                    transpose_y=True)
         if encoder_padding_mask is not None:
             attn_scores = layers.elementwise_add(attn_scores,
                                                  encoder_padding_mask)
         attn_scores = layers.softmax(attn_scores)
-        attn_out = layers.squeeze(
-            layers.matmul(attn_scores, encoder_output), [1])
+        attn_out = layers.squeeze(layers.matmul(attn_scores, encoder_output),
+                                  [1])
         attn_out = layers.concat([attn_out, hidden], 1)
         attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False)
         return attn_out
@@ -101,6 +105,7 @@ def call(self,
 
 
 class Encoder(object):
+
     def __init__(self, num_layers, hidden_size, dropout_prob=0.):
         self.encoder_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
 
@@ -114,6 +119,7 @@ def __call__(self, src_emb, src_sequence_length):
 
 
 class Decoder(object):
+
     def __init__(self,
                  num_layers,
                  hidden_size,
@@ -142,11 +148,13 @@ def __call__(self, decoder_initial_states, encoder_output,
                 encoder_output, beam_size)
             encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
                 encoder_padding_mask, beam_size)
-            decoder = layers.BeamSearchDecoder(
-                cell=self.decoder_cell, output_fn=output_layer, **kwargs)
+            decoder = layers.BeamSearchDecoder(cell=self.decoder_cell,
+                                               output_fn=output_layer,
+                                               **kwargs)
         else:
-            decoder = layers.BasicDecoder(
-                self.decoder_cell, helper, output_fn=output_layer)
+            decoder = layers.BasicDecoder(self.decoder_cell,
+                                          helper,
+                                          output_fn=output_layer)
 
         (decoder_output, decoder_final_state,
          dec_seq_lengths) = layers.dynamic_decode(
@@ -189,12 +197,12 @@ def __init__(self,
         self.encoder = Encoder(num_layers, hidden_size, dropout_prob)
         self.decoder = Decoder(num_layers, hidden_size, dropout_prob,
                                decoding_strategy, max_decoding_length)
-        self.output_layer = lambda x: layers.fc(
-            x,
-            size=trg_vocab_size,
-            num_flatten_dims=len(x.shape) - 1,
-            param_attr=fluid.ParamAttr(),
-            bias_attr=False)
+        self.output_layer = lambda x: layers.fc(x,
+                                                size=trg_vocab_size,
+                                                num_flatten_dims=len(x.shape) -
+                                                1,
+                                                param_attr=fluid.ParamAttr(),
+                                                bias_attr=False)
 
     def __call__(self, src, src_length, trg=None, trg_length=None):
         # encoder
@@ -202,11 +210,13 @@ def __call__(self, src, src_length, trg=None, trg_length=None):
             self.src_embeder(src), src_length)
 
         decoder_initial_states = [
-            encoder_final_state, self.decoder.decoder_cell.get_initial_states(
+            encoder_final_state,
+            self.decoder.decoder_cell.get_initial_states(
                 batch_ref=encoder_output, shape=[encoder_output.shape[-1]])
         ]
-        src_mask = layers.sequence_mask(
-            src_length, maxlen=layers.shape(src)[1], dtype="float32")
+        src_mask = layers.sequence_mask(src_length,
+                                        maxlen=layers.shape(src)[1],
+                                        dtype="float32")
         encoder_padding_mask = (src_mask - 1.0) * 1e9
         encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
 
@@ -214,20 +224,23 @@ def __call__(self, src, src_length, trg=None, trg_length=None):
         decoder_kwargs = {
             "inputs": self.trg_embeder(trg),
             "sequence_length": trg_length,
-        } if self.decoder.decoding_strategy == "train_greedy" else ({
-            "embedding_fn": self.trg_embeder,
-            "beam_size": self.beam_size,
-            "start_token": self.start_token,
-            "end_token": self.end_token
-        } if self.decoder.decoding_strategy == "beam_search" else {
-            "embedding_fn": self.trg_embeder,
-            "start_tokens": layers.fill_constant_batch_size_like(
-                input=encoder_output,
-                shape=[-1],
-                dtype=src.dtype,
-                value=self.start_token),
-            "end_token": self.end_token
-        })
+        } if self.decoder.decoding_strategy == "train_greedy" else (
+            {
+                "embedding_fn": self.trg_embeder,
+                "beam_size": self.beam_size,
+                "start_token": self.start_token,
+                "end_token": self.end_token
+            } if self.decoder.decoding_strategy == "beam_search" else {
+                "embedding_fn":
+                self.trg_embeder,
+                "start_tokens":
+                layers.fill_constant_batch_size_like(input=encoder_output,
+                                                     shape=[-1],
+                                                     dtype=src.dtype,
+                                                     value=self.start_token),
+                "end_token":
+                self.end_token
+            })
         decoder_kwargs["output_layer"] = self.output_layer
 
         (decoder_output, decoder_final_state,
@@ -252,8 +265,9 @@ def learn(self, act_prob, action, reward, length=None):
         """
         update policy model self.model with policy gradient algorithm
         """
-        self.reward = fluid.layers.py_func(
-            func=reward_func, x=[action, length], out=reward)
+        self.reward = fluid.layers.py_func(func=reward_func,
+                                           x=[action, length],
+                                           out=reward)
         neg_log_prob = layers.cross_entropy(act_prob, action)
         cost = neg_log_prob * reward
         cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
@@ -282,8 +296,8 @@ def discount_reward_1d(reward, sequence_length, discount=1., dtype=None):
             dmat = np.ones([batch_size, max_seq_length], dtype=dtype)
         else:
             steps = np.tile(np.arange(max_seq_length), [batch_size, 1])
-            mask = np.asarray(
-                steps < (sequence_length - 1)[:, None], dtype=dtype)
+            mask = np.asarray(steps < (sequence_length - 1)[:, None],
+                              dtype=dtype)
             # Make each row = [discount, ..., discount, 1, ..., 1]
             dmat = mask * discount + (1 - mask)
             dmat = np.cumprod(dmat[:, ::-1], axis=1)[:, ::-1]
@@ -342,6 +356,7 @@ def learn(self, probs, label, weight=None, length=None):
 
 
 class SeqPGAgent(object):
+
     def __init__(self,
                  model_cls,
                  alg_cls=PolicyGradient,
@@ -364,14 +379,17 @@ def __init__(self,
     def build_program(self, model_cls, alg_cls, model_hparams, alg_hparams):
         with fluid.program_guard(self.main_program, self.startup_program):
             source = fluid.data(name="src", shape=[None, None], dtype="int64")
-            source_length = fluid.data(
-                name="src_sequence_length", shape=[None], dtype="int64")
+            source_length = fluid.data(name="src_sequence_length",
+                                       shape=[None],
+                                       dtype="int64")
             # only for teacher-forcing MLE training
             target = fluid.data(name="trg", shape=[None, None], dtype="int64")
-            target_length = fluid.data(
-                name="trg_sequence_length", shape=[None], dtype="int64")
-            label = fluid.data(
-                name="label", shape=[None, None, 1], dtype="int64")
+            target_length = fluid.data(name="trg_sequence_length",
+                                       shape=[None],
+                                       dtype="int64")
+            label = fluid.data(name="label",
+                               shape=[None, None, 1],
+                               dtype="int64")
             self.model = model_cls(**model_hparams)
             self.alg = alg_cls(**alg_hparams)
             self.probs, self.samples, self.sample_length = self.model(
@@ -405,6 +423,7 @@ def learn(self, feed_dict, fetch_list):
 
 
 class TestDynamicDecode(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.model_hparams = {
@@ -424,53 +443,63 @@ def setUp(self):
         src_seq_len = 10
         trg_seq_len = 12
         self.data = {
-            "src": np.random.randint(
+            "src":
+            np.random.randint(
                 2, self.model_hparams["src_vocab_size"],
                 (iter_num * batch_size, src_seq_len)).astype("int64"),
-            "src_sequence_length": np.random.randint(
-                1, src_seq_len, (iter_num * batch_size, )).astype("int64"),
-            "trg": np.random.randint(
+            "src_sequence_length":
+            np.random.randint(1, src_seq_len,
+                              (iter_num * batch_size, )).astype("int64"),
+            "trg":
+            np.random.randint(
                 2, self.model_hparams["src_vocab_size"],
                 (iter_num * batch_size, trg_seq_len)).astype("int64"),
-            "trg_sequence_length": np.random.randint(
-                1, trg_seq_len, (iter_num * batch_size, )).astype("int64"),
-            "label": np.random.randint(
+            "trg_sequence_length":
+            np.random.randint(1, trg_seq_len,
+                              (iter_num * batch_size, )).astype("int64"),
+            "label":
+            np.random.randint(
                 2, self.model_hparams["src_vocab_size"],
                 (iter_num * batch_size, trg_seq_len, 1)).astype("int64"),
         }
 
-        place = core.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else core.CPUPlace()
+        place = core.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else core.CPUPlace()
         self.exe = Executor(place)
 
     def test_mle_train(self):
         paddle.enable_static()
         self.model_hparams["decoding_strategy"] = "train_greedy"
-        agent = SeqPGAgent(
-            model_cls=Seq2SeqModel,
-            alg_cls=MLE,
-            model_hparams=self.model_hparams,
-            alg_hparams={"lr": 0.001},
-            executor=self.exe,
-            main_program=fluid.Program(),
-            startup_program=fluid.Program(),
-            seed=123)
+        agent = SeqPGAgent(model_cls=Seq2SeqModel,
+                           alg_cls=MLE,
+                           model_hparams=self.model_hparams,
+                           alg_hparams={"lr": 0.001},
+                           executor=self.exe,
+                           main_program=fluid.Program(),
+                           startup_program=fluid.Program(),
+                           seed=123)
         self.exe.run(agent.startup_program)
         for iter_idx in range(self.iter_num):
             reward, cost = agent.learn(
                 {
-                    "src": self.data["src"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size, :],
-                    "src_sequence_length": self.data["src_sequence_length"][
-                        iter_idx * self.batch_size:(iter_idx + 1
-                                                    ) * self.batch_size],
-                    "trg": self.data["trg"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size, :],
-                    "trg_sequence_length": self.data["trg_sequence_length"]
-                    [iter_idx * self.batch_size:(iter_idx + 1) *
-                     self.batch_size],
-                    "label": self.data["label"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size]
+                    "src":
+                    self.data["src"][iter_idx * self.batch_size:(iter_idx + 1) *
+                                     self.batch_size, :],
+                    "src_sequence_length":
+                    self.data["src_sequence_length"][iter_idx * self.batch_size:
+                                                     (iter_idx + 1) *
+                                                     self.batch_size],
+                    "trg":
+                    self.data["trg"][iter_idx * self.batch_size:(iter_idx + 1) *
+                                     self.batch_size, :],
+                    "trg_sequence_length":
+                    self.data["trg_sequence_length"][iter_idx * self.batch_size:
+                                                     (iter_idx + 1) *
+                                                     self.batch_size],
+                    "label":
+                    self.data["label"][iter_idx *
+                                       self.batch_size:(iter_idx + 1) *
+                                       self.batch_size]
                 },
                 fetch_list=[agent.cost, agent.cost])
             print("iter_idx: %d, reward: %f, cost: %f" %
@@ -479,24 +508,25 @@ def test_mle_train(self):
     def test_greedy_train(self):
         paddle.enable_static()
         self.model_hparams["decoding_strategy"] = "infer_greedy"
-        agent = SeqPGAgent(
-            model_cls=Seq2SeqModel,
-            alg_cls=PolicyGradient,
-            model_hparams=self.model_hparams,
-            alg_hparams={"lr": 0.001},
-            executor=self.exe,
-            main_program=fluid.Program(),
-            startup_program=fluid.Program(),
-            seed=123)
+        agent = SeqPGAgent(model_cls=Seq2SeqModel,
+                           alg_cls=PolicyGradient,
+                           model_hparams=self.model_hparams,
+                           alg_hparams={"lr": 0.001},
+                           executor=self.exe,
+                           main_program=fluid.Program(),
+                           startup_program=fluid.Program(),
+                           seed=123)
         self.exe.run(agent.startup_program)
         for iter_idx in range(self.iter_num):
             reward, cost = agent.learn(
                 {
-                    "src": self.data["src"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size, :],
-                    "src_sequence_length": self.data["src_sequence_length"]
-                    [iter_idx * self.batch_size:(iter_idx + 1) *
-                     self.batch_size]
+                    "src":
+                    self.data["src"][iter_idx * self.batch_size:(iter_idx + 1) *
+                                     self.batch_size, :],
+                    "src_sequence_length":
+                    self.data["src_sequence_length"][iter_idx * self.batch_size:
+                                                     (iter_idx + 1) *
+                                                     self.batch_size]
                 },
                 fetch_list=[agent.reward, agent.cost])
             print("iter_idx: %d, reward: %f, cost: %f" %
@@ -505,24 +535,25 @@ def test_greedy_train(self):
     def test_sample_train(self):
         paddle.enable_static()
         self.model_hparams["decoding_strategy"] = "infer_sample"
-        agent = SeqPGAgent(
-            model_cls=Seq2SeqModel,
-            alg_cls=PolicyGradient,
-            model_hparams=self.model_hparams,
-            alg_hparams={"lr": 0.001},
-            executor=self.exe,
-            main_program=fluid.Program(),
-            startup_program=fluid.Program(),
-            seed=123)
+        agent = SeqPGAgent(model_cls=Seq2SeqModel,
+                           alg_cls=PolicyGradient,
+                           model_hparams=self.model_hparams,
+                           alg_hparams={"lr": 0.001},
+                           executor=self.exe,
+                           main_program=fluid.Program(),
+                           startup_program=fluid.Program(),
+                           seed=123)
         self.exe.run(agent.startup_program)
         for iter_idx in range(self.iter_num):
             reward, cost = agent.learn(
                 {
-                    "src": self.data["src"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size, :],
-                    "src_sequence_length": self.data["src_sequence_length"]
-                    [iter_idx * self.batch_size:(iter_idx + 1) *
-                     self.batch_size]
+                    "src":
+                    self.data["src"][iter_idx * self.batch_size:(iter_idx + 1) *
+                                     self.batch_size, :],
+                    "src_sequence_length":
+                    self.data["src_sequence_length"][iter_idx * self.batch_size:
+                                                     (iter_idx + 1) *
+                                                     self.batch_size]
                 },
                 fetch_list=[agent.reward, agent.cost])
             print("iter_idx: %d, reward: %f, cost: %f" %
@@ -536,8 +567,9 @@ def test_beam_search_infer(self):
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
             source = fluid.data(name="src", shape=[None, None], dtype="int64")
-            source_length = fluid.data(
-                name="src_sequence_length", shape=[None], dtype="int64")
+            source_length = fluid.data(name="src_sequence_length",
+                                       shape=[None],
+                                       dtype="int64")
             model = Seq2SeqModel(**self.model_hparams)
             output = model(source, source_length)
 
@@ -546,11 +578,14 @@ def test_beam_search_infer(self):
             trans_ids = self.exe.run(
                 program=main_program,
                 feed={
-                    "src": self.data["src"][iter_idx * self.batch_size:(
-                        iter_idx + 1) * self.batch_size, :],
-                    "src_sequence_length": self.data["src_sequence_length"]
-                    [iter_idx * self.batch_size:(iter_idx + 1) *
-                     self.batch_size]
+                    "src":
+                    self.data["src"][iter_idx * self.batch_size:(iter_idx + 1) *
+                                     self.batch_size, :],
+                    "src_sequence_length":
+                    self.data["src_sequence_length"][iter_idx *
+                                                     self.batch_size:(iter_idx +
+                                                                      1) *
+                                                     self.batch_size]
                 },
                 fetch_list=[output])[0]
 
@@ -569,6 +604,7 @@ def test_dynamic_basic_decoder(self):
 
 
 class ModuleApiTest(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         cls._np_rand_state = np.random.get_state()
@@ -577,10 +613,11 @@ def setUpClass(cls):
         np.random.seed(cls._random_seed)
         random.seed(cls._random_seed)
 
-        cls.model_cls = type(cls.__name__ + "Model", (Layer, ), {
-            "__init__": cls.model_init_wrapper(cls.model_init),
-            "forward": cls.model_forward
-        })
+        cls.model_cls = type(
+            cls.__name__ + "Model", (Layer, ), {
+                "__init__": cls.model_init_wrapper(cls.model_init),
+                "forward": cls.model_forward
+            })
 
     @classmethod
     def tearDownClass(cls):
@@ -589,6 +626,7 @@ def tearDownClass(cls):
 
     @staticmethod
     def model_init_wrapper(func):
+
         def __impl__(self, *args, **kwargs):
             Layer.__init__(self)
             func(self, *args, **kwargs)
@@ -649,8 +687,7 @@ def check_output_with_place(self, place, mode="test"):
         if expect_output:
             for actual_t, expect_t in zip(dygraph_output, expect_output):
                 self.assertTrue(
-                    np.allclose(
-                        actual_t, expect_t, rtol=1e-5, atol=0))
+                    np.allclose(actual_t, expect_t, rtol=1e-5, atol=0))
 
     def check_output(self):
         devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"]
@@ -660,6 +697,7 @@ def check_output(self):
 
 
 class TestBeamSearch(ModuleApiTest):
+
     def setUp(self):
         paddle.set_default_dtype("float64")
         shape = (8, 32)
@@ -684,26 +722,25 @@ def model_init(self,
                    eos_id=1,
                    beam_size=4,
                    max_step_num=20):
-        embedder = paddle.fluid.dygraph.Embedding(
-            size=[vocab_size, embed_dim], dtype="float64")
+        embedder = paddle.fluid.dygraph.Embedding(size=[vocab_size, embed_dim],
+                                                  dtype="float64")
         output_layer = nn.Linear(hidden_size, vocab_size)
         cell = nn.LSTMCell(embed_dim, hidden_size)
         self.max_step_num = max_step_num
-        self.beam_search_decoder = BeamSearchDecoder(
-            cell,
-            start_token=bos_id,
-            end_token=eos_id,
-            beam_size=beam_size,
-            embedding_fn=embedder,
-            output_fn=output_layer)
+        self.beam_search_decoder = BeamSearchDecoder(cell,
+                                                     start_token=bos_id,
+                                                     end_token=eos_id,
+                                                     beam_size=beam_size,
+                                                     embedding_fn=embedder,
+                                                     output_fn=output_layer)
 
     @staticmethod
     def model_forward(model, init_hidden, init_cell):
-        return dynamic_decode(
-            model.beam_search_decoder, [init_hidden, init_cell],
-            max_step_num=model.max_step_num,
-            impute_finished=True,
-            is_test=True)[0]
+        return dynamic_decode(model.beam_search_decoder,
+                              [init_hidden, init_cell],
+                              max_step_num=model.max_step_num,
+                              impute_finished=True,
+                              is_test=True)[0]
 
     def make_inputs(self):
         inputs = [
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_dp.py b/python/paddle/fluid/tests/unittests/test_rnn_dp.py
index 8d7e86fcdb9c7..46e1530a1916d 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_dp.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_dp.py
@@ -27,6 +27,7 @@
 
 
 class RNNEncoder(nn.Layer):
+
     def __init__(self,
                  input_size,
                  hidden_size,
@@ -41,13 +42,12 @@ def __init__(self,
         self._direction = direction
         self._pooling_type = pooling_type
 
-        self.rnn_layer = nn.SimpleRNN(
-            input_size=input_size,
-            hidden_size=hidden_size,
-            num_layers=num_layers,
-            direction=direction,
-            dropout=dropout,
-            **kwargs)
+        self.rnn_layer = nn.SimpleRNN(input_size=input_size,
+                                      hidden_size=hidden_size,
+                                      num_layers=num_layers,
+                                      direction=direction,
+                                      dropout=dropout,
+                                      **kwargs)
 
     def get_input_dim(self):
         return self._input_size
@@ -66,6 +66,7 @@ def forward(self, inputs, sequence_length):
 
 
 class RNNModel(nn.Layer):
+
     def __init__(self,
                  vocab_size,
                  num_classes,
@@ -78,17 +79,15 @@ def __init__(self,
                  pooling_type=None,
                  fc_hidden_size=96):
         super().__init__()
-        self.embedder = nn.Embedding(
-            num_embeddings=vocab_size,
-            embedding_dim=emb_dim,
-            padding_idx=padding_idx)
-        self.rnn_encoder = RNNEncoder(
-            emb_dim,
-            rnn_hidden_size,
-            num_layers=rnn_layers,
-            direction=direction,
-            dropout=dropout_rate,
-            pooling_type=pooling_type)
+        self.embedder = nn.Embedding(num_embeddings=vocab_size,
+                                     embedding_dim=emb_dim,
+                                     padding_idx=padding_idx)
+        self.rnn_encoder = RNNEncoder(emb_dim,
+                                      rnn_hidden_size,
+                                      num_layers=rnn_layers,
+                                      direction=direction,
+                                      dropout=dropout_rate,
+                                      pooling_type=pooling_type)
         self.fc = nn.Linear(self.rnn_encoder.get_output_dim(), fc_hidden_size)
         self.output_layer = nn.Linear(fc_hidden_size, num_classes)
 
@@ -104,23 +103,23 @@ def rnn_pretrain_forward(train_program, start_program, topo=None):
     with static.program_guard(train_program,
                               start_program), paddle.utils.unique_name.guard():
         batch_size = 1
-        tokens = static.data(
-            name="tokens", shape=[batch_size, -1], dtype="int64")
+        tokens = static.data(name="tokens",
+                             shape=[batch_size, -1],
+                             dtype="int64")
         seq_len = static.data(name="ids", shape=[batch_size], dtype="int64")
         labels = static.data(name="labels", shape=[batch_size], dtype="int64")
         data_holders = [tokens, seq_len, labels]
         vocab_size = 10
         num_classes = 2
         pad_token_id = 0
-        model = RNNModel(
-            vocab_size,
-            num_classes,
-            direction='forward',
-            padding_idx=pad_token_id,
-            pooling_type='max')
-
-        optimizer = paddle.optimizer.Adam(
-            parameters=model.parameters(), learning_rate=0.001)
+        model = RNNModel(vocab_size,
+                         num_classes,
+                         direction='forward',
+                         padding_idx=pad_token_id,
+                         pooling_type='max')
+
+        optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
+                                          learning_rate=0.001)
         criterion = paddle.nn.CrossEntropyLoss()
         preds = model(tokens, seq_len)
         loss = criterion(preds, labels)
@@ -129,6 +128,7 @@ def rnn_pretrain_forward(train_program, start_program, topo=None):
 
 
 class TestFleetMetaOptimizer(unittest.TestCase):
+
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "1"
         os.environ[
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
index 9bfec8e9bdd8c..f5ce030609159 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
@@ -24,19 +24,21 @@
 
 
 class RNNMemoryHelperOpTest(unittest.TestCase):
+
     def setUp(self):
         self.program = Program()
         self.place = core.CPUPlace()
 
-        self.X = self.program.global_block().create_var(
-            name='X', shape=[2, 3], dtype='float32')
-        self.Out = self.program.global_block().create_var(
-            name='Out', shape=[2, 3], dtype='float32')
-        self.program.global_block().append_op(
-            type='rnn_memory_helper',
-            inputs={"X": self.X},
-            outputs={"Out": self.Out},
-            attrs={})
+        self.X = self.program.global_block().create_var(name='X',
+                                                        shape=[2, 3],
+                                                        dtype='float32')
+        self.Out = self.program.global_block().create_var(name='Out',
+                                                          shape=[2, 3],
+                                                          dtype='float32')
+        self.program.global_block().append_op(type='rnn_memory_helper',
+                                              inputs={"X": self.X},
+                                              outputs={"Out": self.Out},
+                                              attrs={})
 
     def test_forward(self):
         x_np = np.random.normal(size=(2, 3)).astype("float32")
@@ -50,29 +52,31 @@ def test_forward(self):
 
 
 class RNNMemoryHelperGradOpTest(unittest.TestCase):
+
     def setUp(self):
         self.program = Program()
         self.place = core.CPUPlace()
 
         self.input_names = ['X', 'Out', 'Out@GRAD']
         self.input_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
+            name: self.program.global_block().create_var(name=name,
+                                                         shape=[2, 3],
+                                                         dtype='float32')
             for name in self.input_names
         }
 
         self.output_names = ['X@GRAD']
         self.output_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
+            name: self.program.global_block().create_var(name=name,
+                                                         shape=[2, 3],
+                                                         dtype='float32')
             for name in self.output_names
         }
 
-        self.program.global_block().append_op(
-            type='rnn_memory_helper_grad',
-            inputs=self.input_vars,
-            outputs=self.output_vars,
-            attrs={})
+        self.program.global_block().append_op(type='rnn_memory_helper_grad',
+                                              inputs=self.input_vars,
+                                              outputs=self.output_vars,
+                                              attrs={})
 
     def test_backward(self):
         self.feed_map = {
@@ -89,6 +93,7 @@ def test_backward(self):
 
 
 class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
+
     def setUp(self):
         self.program = Program()
         self.fake_program = Program()
@@ -96,8 +101,9 @@ def setUp(self):
 
         self.input_names = ['X', 'Out']
         self.input_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
+            name: self.program.global_block().create_var(name=name,
+                                                         shape=[2, 3],
+                                                         dtype='float32')
             for name in self.input_names
         }
         self.input_vars["Out@GRAD"] = \
@@ -106,16 +112,16 @@ def setUp(self):
 
         self.output_names = ['X@GRAD']
         self.output_vars = {
-            name: self.program.global_block().create_var(
-                name=name, shape=[2, 3], dtype='float32')
+            name: self.program.global_block().create_var(name=name,
+                                                         shape=[2, 3],
+                                                         dtype='float32')
             for name in self.output_names
         }
 
-        self.program.global_block().append_op(
-            type='rnn_memory_helper_grad',
-            inputs=self.input_vars,
-            outputs=self.output_vars,
-            attrs={})
+        self.program.global_block().append_op(type='rnn_memory_helper_grad',
+                                              inputs=self.input_vars,
+                                              outputs=self.output_vars,
+                                              attrs={})
 
     def test_backward(self):
         self.feed_map = {
@@ -129,8 +135,9 @@ def test_backward(self):
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
         self.assertTrue(
-            np.allclose(
-                out[0], np.zeros(shape=(2, 3)).astype("float32"), rtol=1e-5))
+            np.allclose(out[0],
+                        np.zeros(shape=(2, 3)).astype("float32"),
+                        rtol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_op.py b/python/paddle/fluid/tests/unittests/test_rnn_op.py
index 79e33166bb6f8..f03215a480a5d 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_op.py
@@ -25,6 +25,7 @@
 import sys
 
 from op_test import OpTest
+
 sys.path.append("./rnn")
 from rnn_numpy import SimpleRNN, LSTM, GRU
 from convert import get_params_for_net
@@ -35,6 +36,7 @@
 
 
 class TestRNNOp(OpTest):
+
     def get_weight_names(self):
         weight_names = []
         for i in range(self.num_layers):
@@ -49,8 +51,7 @@ def setUp(self):
         self.op_type = "rnn"
         self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.sequence_length = None if core.is_compiled_with_rocm(
-        ) else np.array(
-            [12, 11, 10, 9, 8], dtype=np.int32)
+        ) else np.array([12, 11, 10, 9, 8], dtype=np.int32)
         self.num_layers = 1
         self.is_bidirec = False
         self.mode = "LSTM"
@@ -65,27 +66,27 @@ def setUp(self):
         input_size = 3
         hidden_size = 2
 
-        input = np.random.uniform(
-            low=-0.1, high=0.1,
-            size=(seq_length, batch_size, input_size)).astype(self.dtype)
+        input = np.random.uniform(low=-0.1,
+                                  high=0.1,
+                                  size=(seq_length, batch_size,
+                                        input_size)).astype(self.dtype)
         if self.sequence_length is not None:
             input[11][1:][:] = 0
             input[10][2:][:] = 0
             input[9][3:][:] = 0
             input[8][4:][:] = 0
 
-        rnn1 = LSTM(
-            input_size,
-            hidden_size,
-            num_layers=self.num_layers,
-            time_major=True,
-            direction=direction,
-            dropout=self.dropout,
-            dtype=self.dtype)
+        rnn1 = LSTM(input_size,
+                    hidden_size,
+                    num_layers=self.num_layers,
+                    time_major=True,
+                    direction=direction,
+                    dropout=self.dropout,
+                    dtype=self.dtype)
 
         flat_w = get_params_for_net(rnn1)
-        output, (last_hidden, last_cell) = rnn1(
-            input, sequence_length=self.sequence_length)
+        output, (last_hidden,
+                 last_cell) = rnn1(input, sequence_length=self.sequence_length)
 
         if core.is_compiled_with_rocm():
 
@@ -140,28 +141,32 @@ def test_grad(self):
             var_name_list = self.get_weight_names()
             grad_check_list = ['Input', 'init_h', 'init_c']
             grad_check_list.extend(var_name_list)
-            self.check_grad(
-                set(grad_check_list), ['Out', 'last_hidden', 'last_cell'])
+            self.check_grad(set(grad_check_list),
+                            ['Out', 'last_hidden', 'last_cell'])
 
 
 class TestRNNOp1(TestRNNOp):
+
     def set_attrs(self):
         self.sequence_length = None
 
 
 class TestRNNOp2(TestRNNOp):
+
     def set_attrs(self):
         self.sequence_length = None
         self.is_bidirec = True
 
 
 class TestRNNOp3(TestRNNOp):
+
     def set_attrs(self):
         self.is_test = True
         self.sequence_length = None
 
 
 class TestRNNOp4(TestRNNOp):
+
     def set_attrs(self):
         self.is_test = True
         self.sequence_length = None
@@ -169,17 +174,20 @@ def set_attrs(self):
 
 
 class TestRNNOp5(TestRNNOp):
+
     def set_attrs(self):
         self.num_layers = 2
 
 
 class TestRNNOp6(TestRNNOp):
+
     def set_attrs(self):
         self.num_layers = 2
         self.is_bidirec = True
 
 
 class TestRNNOp7(TestRNNOp):
+
     def set_attrs(self):
         self.num_layers = 2
         self.is_bidirec = True
@@ -187,6 +195,7 @@ def set_attrs(self):
 
 
 class TestRNNOp8(TestRNNOp):
+
     def set_attrs(self):
         self.num_layers = 2
         self.is_bidirec = True
@@ -194,6 +203,7 @@ def set_attrs(self):
 
 
 class TestRNNOp9(TestRNNOp):
+
     def set_attrs(self):
         self.num_layers = 3
 
diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
index a22b331b03241..cf52810568625 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
@@ -23,6 +23,7 @@
 
 
 class TestROIAlignOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -66,8 +67,8 @@ def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
         bilinear_pos = np.zeros(
             [self.channels, self.pooled_height, self.pooled_width, count, 4],
             np.float64)
-        bilinear_w = np.zeros(
-            [self.pooled_height, self.pooled_width, count, 4], np.float64)
+        bilinear_w = np.zeros([self.pooled_height, self.pooled_width, count, 4],
+                              np.float64)
         for ph in range(self.pooled_width):
             for pw in range(self.pooled_height):
                 c = 0
@@ -172,12 +173,14 @@ def make_rois(self):
                 rois.append(roi)
         self.rois_num = len(rois)
         self.rois = np.array(rois).astype("float64")
-        self.boxes_num = np.array(
-            [bno + 1 for bno in range(self.batch_size)]).astype('int32')
+        self.boxes_num = np.array([bno + 1 for bno in range(self.batch_size)
+                                   ]).astype('int32')
 
     def setUp(self):
         self.op_type = "roi_align"
-        self.python_api = lambda x, boxes, boxes_num, pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned: paddle.vision.ops.roi_align(x, boxes, boxes_num, (pooled_height, pooled_width), spatial_scale, sampling_ratio, aligned)
+        self.python_api = lambda x, boxes, boxes_num, pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned: paddle.vision.ops.roi_align(
+            x, boxes, boxes_num, (pooled_height, pooled_width), spatial_scale,
+            sampling_ratio, aligned)
         self.set_data()
 
     def test_check_output(self):
@@ -188,6 +191,7 @@ def test_check_grad(self):
 
 
 class TestROIAlignInLodOp(TestROIAlignOp):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -213,6 +217,7 @@ def set_data(self):
 
 
 class TestROIAlignOpWithAligned(TestROIAlignOp):
+
     def init_test_case(self):
         self.batch_size = 3
         self.channels = 3
diff --git a/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py b/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py
index d4e48ac8a5704..202805b096156 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py
@@ -100,10 +100,10 @@ def get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y):
     dy2 = y3 - y2
     dy3 = y0 - y1 + y2 - y3
     matrix = np.zeros([9])
-    matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (
-        normalized_width - 1)
-    matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (
-        normalized_height - 1)
+    matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 +
+                                           1e-5) / (normalized_width - 1)
+    matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 +
+                                           1e-5) / (normalized_height - 1)
     matrix[8] = 1
 
     matrix[3] = (y1 - y0 + matrix[6] *
@@ -199,8 +199,8 @@ def roi_transform(in_data, rois, rois_lod, transformed_height,
             roi2image[j] = i
 
     out = np.zeros([rois_num, channels, transformed_height, transformed_width])
-    mask = np.zeros(
-        [rois_num, 1, transformed_height, transformed_width]).astype('int')
+    mask = np.zeros([rois_num, 1, transformed_height,
+                     transformed_width]).astype('int')
     matrix = np.zeros([rois_num, 9], dtype=in_data.dtype)
     for n in range(rois_num):
         roi_x = []
@@ -209,8 +209,9 @@ def roi_transform(in_data, rois, rois_lod, transformed_height,
             roi_x.append(rois[n][2 * k] * spatial_scale)
             roi_y.append(rois[n][2 * k + 1] * spatial_scale)
         image_id = roi2image[n]
-        transform_matrix = get_transform_matrix(
-            transformed_width, transformed_height, roi_x, roi_y)
+        transform_matrix = get_transform_matrix(transformed_width,
+                                                transformed_height, roi_x,
+                                                roi_y)
         matrix[n] = transform_matrix
         for c in range(channels):
             for out_h in range(transformed_height):
@@ -230,6 +231,7 @@ def roi_transform(in_data, rois, rois_lod, transformed_height,
 
 
 class TestROIPoolOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -241,9 +243,11 @@ def set_data(self):
             'transformed_height': self.transformed_height,
             'transformed_width': self.transformed_width
         }
-        out, mask, transform_matrix = roi_transform(
-            self.x, self.rois, self.rois_lod, self.transformed_height,
-            self.transformed_width, self.spatial_scale)
+        out, mask, transform_matrix = roi_transform(self.x, self.rois,
+                                                    self.rois_lod,
+                                                    self.transformed_height,
+                                                    self.transformed_width,
+                                                    self.spatial_scale)
         self.outputs = {
             'Out': out,
             'Mask': mask,
@@ -316,13 +320,18 @@ def test_check_grad(self):
 
     def test_errors(self):
         x = fluid.data(name='x', shape=[100, 256, 28, 28], dtype='float32')
-        rois = fluid.data(
-            name='rois', shape=[None, 8], lod_level=1, dtype='float32')
-
-        x_int = fluid.data(
-            name='x_int', shape=[100, 256, 28, 28], dtype='int32')
-        rois_int = fluid.data(
-            name='rois_int', shape=[None, 8], lod_level=1, dtype='int32')
+        rois = fluid.data(name='rois',
+                          shape=[None, 8],
+                          lod_level=1,
+                          dtype='float32')
+
+        x_int = fluid.data(name='x_int',
+                           shape=[100, 256, 28, 28],
+                           dtype='int32')
+        rois_int = fluid.data(name='rois_int',
+                              shape=[None, 8],
+                              lod_level=1,
+                              dtype='int32')
         x_tmp = [1, 2]
         rois_tmp = [1, 2]
 
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index f0afcff63c6c4..d01daf75036f4 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -25,6 +25,7 @@
 
 
 class TestROIPoolOp(OpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -132,12 +133,13 @@ def make_rois(self):
                 rois.append(roi)
         self.rois_num = len(rois)
         self.rois = np.array(rois).astype("float64")
-        self.boxes_num = np.array(
-            [bno + 1 for bno in range(self.batch_size)]).astype('int32')
+        self.boxes_num = np.array([bno + 1 for bno in range(self.batch_size)
+                                   ]).astype('int32')
 
     def setUp(self):
         self.op_type = "roi_pool"
-        self.python_api = lambda x, boxes, boxes_num, pooled_height, pooled_width, spatial_scale: paddle.vision.ops.roi_pool(x, boxes, boxes_num, (pooled_height, pooled_width), spatial_scale)
+        self.python_api = lambda x, boxes, boxes_num, pooled_height, pooled_width, spatial_scale: paddle.vision.ops.roi_pool(
+            x, boxes, boxes_num, (pooled_height, pooled_width), spatial_scale)
         self.python_out_sig = ["Out"]
         self.set_data()
 
@@ -149,24 +151,27 @@ def test_check_grad(self):
 
 
 class BadInputTestRoiPool(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
             def test_bad_x():
-                x = fluid.layers.data(
-                    name='data1', shape=[2, 1, 4, 4], dtype='int64')
-                label = fluid.layers.data(
-                    name='label', shape=[2, 4], dtype='float32', lod_level=1)
+                x = fluid.layers.data(name='data1',
+                                      shape=[2, 1, 4, 4],
+                                      dtype='int64')
+                label = fluid.layers.data(name='label',
+                                          shape=[2, 4],
+                                          dtype='float32',
+                                          lod_level=1)
                 output = fluid.layers.roi_pool(x, label, 1, 1, 1.0)
 
             self.assertRaises(TypeError, test_bad_x)
 
             def test_bad_y():
-                x = fluid.layers.data(
-                    name='data2',
-                    shape=[2, 1, 4, 4],
-                    dtype='float32',
-                    append_batch_size=False)
+                x = fluid.layers.data(name='data2',
+                                      shape=[2, 1, 4, 4],
+                                      dtype='float32',
+                                      append_batch_size=False)
                 label = [[1, 2, 3, 4], [2, 3, 4, 5]]
                 output = fluid.layers.roi_pool(x, label, 1, 1, 1.0)
 
@@ -174,6 +179,7 @@ def test_bad_y():
 
 
 class TestROIPoolInLodOp(TestROIPoolOp):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
diff --git a/python/paddle/fluid/tests/unittests/test_roll_op.py b/python/paddle/fluid/tests/unittests/test_roll_op.py
index c315aa9b74618..546c278b8fada 100644
--- a/python/paddle/fluid/tests/unittests/test_roll_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -24,6 +24,7 @@
 
 
 class TestRollOp(OpTest):
+
     def setUp(self):
         self.python_api = paddle.roll
         self.op_type = "roll"
@@ -31,8 +32,8 @@ def setUp(self):
         self.inputs = {'X': np.random.random(self.x_shape).astype(self.dtype)}
         self.attrs = {'shifts': self.shifts, 'axis': self.axis}
         self.outputs = {
-            'Out': np.roll(self.inputs['X'], self.attrs['shifts'],
-                           self.attrs['axis'])
+            'Out':
+            np.roll(self.inputs['X'], self.attrs['shifts'], self.attrs['axis'])
         }
 
     def init_dtype_type(self):
@@ -49,6 +50,7 @@ def test_check_grad_normal(self):
 
 
 class TestRollOpCase2(TestRollOp):
+
     def init_dtype_type(self):
         self.dtype = np.float32
         self.x_shape = (100, 10, 5)
@@ -57,9 +59,10 @@ def init_dtype_type(self):
 
 
 class TestRollAPI(unittest.TestCase):
+
     def input_data(self):
-        self.data_x = np.array(
-            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
+        self.data_x = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0],
+                                [7.0, 8.0, 9.0]])
 
     def test_roll_op_api(self):
         self.input_data()
diff --git a/python/paddle/fluid/tests/unittests/test_rot90_op.py b/python/paddle/fluid/tests/unittests/test_rot90_op.py
index 404bb3ae1eb67..3829eaed277dc 100644
--- a/python/paddle/fluid/tests/unittests/test_rot90_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rot90_op.py
@@ -48,9 +48,8 @@ def test_static_graph(self):
             out_np = np.array(res[0])
             out_ref = np.array([[4, 1], [5, 2], [6, 3]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_static_k_0(self):
         paddle.enable_static()
@@ -74,9 +73,8 @@ def test_static_k_0(self):
             out_np = np.array(res[0])
             out_ref = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_static_k_2(self):
         paddle.enable_static()
@@ -100,9 +98,8 @@ def test_static_k_2(self):
             out_np = np.array(res[0])
             out_ref = np.array([[6, 5, 4], [3, 2, 1]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_static_k_3(self):
         paddle.enable_static()
@@ -126,9 +123,8 @@ def test_static_k_3(self):
             out_np = np.array(res[0])
             out_ref = np.array([[4, 1], [5, 2], [6, 3]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_static_neg_k_1(self):
         paddle.enable_static()
@@ -152,9 +148,8 @@ def test_static_neg_k_1(self):
             out_np = np.array(res[0])
             out_ref = np.array([[4, 1], [5, 2], [6, 3]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_static_neg_k_2(self):
         paddle.enable_static()
@@ -178,9 +173,8 @@ def test_static_neg_k_2(self):
             out_np = np.array(res[0])
             out_ref = np.array([[6, 5, 4], [3, 2, 1]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_static_neg_k_3(self):
         paddle.enable_static()
@@ -204,9 +198,8 @@ def test_static_neg_k_3(self):
             out_np = np.array(res[0])
             out_ref = np.array([[3, 6], [2, 5], [1, 4]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_static_neg_k_4(self):
         paddle.enable_static()
@@ -230,9 +223,8 @@ def test_static_neg_k_4(self):
             out_np = np.array(res[0])
             out_ref = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
 
-            self.assertTrue(
-                (out_np == out_ref).all(),
-                msg='rot90 output is wrong, out =' + str(out_np))
+            self.assertTrue((out_np == out_ref).all(),
+                            msg='rot90 output is wrong, out =' + str(out_np))
 
     def test_error_api(self):
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
index b3b0742e7aff1..e12d9108ab908 100644
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
@@ -45,6 +45,7 @@ def row_conv_forward(x, lod, wt):
 
 
 class TestRowConvOp1(OpTest):
+
     def setUp(self):
 
         self.op_type = "row_conv"
@@ -67,15 +68,20 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Filter'], 'Out', check_dygraph=False)
 
     def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Filter'], 'Out', no_grad_set=set('X'), check_dygraph=False)
+        self.check_grad(['Filter'],
+                        'Out',
+                        no_grad_set=set('X'),
+                        check_dygraph=False)
 
     def test_check_grad_ignore_wt(self):
-        self.check_grad(
-            ['X'], 'Out', no_grad_set=set('Filter'), check_dygraph=False)
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Filter'),
+                        check_dygraph=False)
 
 
 class TestRowConvOp2(OpTest):
+
     def setUp(self):
 
         self.op_type = "row_conv"
@@ -98,27 +104,24 @@ def test_check_output(self):
     #dimensional input, the dX on CPU for some values has max_rel_error
     #slightly more than 0.05
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Filter'],
-            'Out',
-            max_relative_error=0.06,
-            check_dygraph=False)
+        self.check_grad(['X', 'Filter'],
+                        'Out',
+                        max_relative_error=0.06,
+                        check_dygraph=False)
 
     def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Filter'],
-            'Out',
-            max_relative_error=0.06,
-            no_grad_set=set('X'),
-            check_dygraph=False)
+        self.check_grad(['Filter'],
+                        'Out',
+                        max_relative_error=0.06,
+                        no_grad_set=set('X'),
+                        check_dygraph=False)
 
     def test_check_grad_ignore_wt(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.06,
-            no_grad_set=set('Filter'),
-            check_dygraph=False)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.06,
+                        no_grad_set=set('Filter'),
+                        check_dygraph=False)
 
 
 def row_conv_foward_Tensor(x, wt):
@@ -138,6 +141,7 @@ def row_conv_foward_Tensor(x, wt):
 
 
 class TestRowOpWithTensorInput(OpTest):
+
     def setUp(self):
         self.op_type = "row_conv"
         length = [1, 2, 3]
@@ -157,18 +161,23 @@ def test_check_output(self):
         self.check_output(check_dygraph=False)
 
     def test_check_grad_ignore_x(self):
-        self.check_grad(
-            ['Filter'], 'Out', no_grad_set=set('X'), check_dygraph=False)
+        self.check_grad(['Filter'],
+                        'Out',
+                        no_grad_set=set('X'),
+                        check_dygraph=False)
 
     def test_check_grad_normal(self):
         self.check_grad(['X', 'Filter'], 'Out', check_dygraph=False)
 
     def test_check_grad_ignore_wt(self):
-        self.check_grad(
-            ['X'], 'Out', no_grad_set=set('Filter'), check_dygraph=False)
+        self.check_grad(['X'],
+                        'Out',
+                        no_grad_set=set('Filter'),
+                        check_dygraph=False)
 
 
 class TestRowConvLayer(unittest.TestCase):
+
     def setUp(self):
         self.B = 2
         self.T = 6
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index 7a6ce5bc921f5..c7a8c04fa3497 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -32,12 +32,13 @@ def rpn_target_assign(anchor_by_gt_overlap,
                       rpn_fg_fraction,
                       use_random=True):
     anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
-    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
-        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
+    anchor_to_gt_max = anchor_by_gt_overlap[
+        np.arange(anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
 
     gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
-    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
-        anchor_by_gt_overlap.shape[1])]
+    gt_to_anchor_max = anchor_by_gt_overlap[
+        gt_to_anchor_argmax,
+        np.arange(anchor_by_gt_overlap.shape[1])]
     anchors_with_max_overlap = np.where(
         anchor_by_gt_overlap == gt_to_anchor_max)[0]
 
@@ -48,8 +49,9 @@ def rpn_target_assign(anchor_by_gt_overlap,
     num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
     fg_inds = np.where(labels == 1)[0]
     if len(fg_inds) > num_fg and use_random:
-        disable_inds = np.random.choice(
-            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
+        disable_inds = np.random.choice(fg_inds,
+                                        size=(len(fg_inds) - num_fg),
+                                        replace=False)
     else:
         disable_inds = fg_inds[num_fg:]
 
@@ -88,13 +90,12 @@ def rpn_target_assign(anchor_by_gt_overlap,
 
 def get_anchor(n, c, h, w):
     input_feat = np.random.random((n, c, h, w)).astype('float32')
-    anchors, _ = anchor_generator_in_python(
-        input_feat=input_feat,
-        anchor_sizes=[32., 64.],
-        aspect_ratios=[0.5, 1.0],
-        variances=[1.0, 1.0, 1.0, 1.0],
-        stride=[16.0, 16.0],
-        offset=0.5)
+    anchors, _ = anchor_generator_in_python(input_feat=input_feat,
+                                            anchor_sizes=[32., 64.],
+                                            aspect_ratios=[0.5, 1.0],
+                                            variances=[1.0, 1.0, 1.0, 1.0],
+                                            stride=[16.0, 16.0],
+                                            offset=0.5)
     return anchors
 
 
@@ -118,10 +119,10 @@ def rpn_target_assign_in_python(all_anchors,
         if rpn_straddle_thresh >= 0:
             # Only keep anchors inside the image by a margin of straddle_thresh
             inds_inside = np.where(
-                (all_anchors[:, 0] >= -rpn_straddle_thresh) &
-                (all_anchors[:, 1] >= -rpn_straddle_thresh) & (
-                    all_anchors[:, 2] < im_width + rpn_straddle_thresh) & (
-                        all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
+                (all_anchors[:, 0] >= -rpn_straddle_thresh)
+                & (all_anchors[:, 1] >= -rpn_straddle_thresh)
+                & (all_anchors[:, 2] < im_width + rpn_straddle_thresh)
+                & (all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
             # keep only inside anchors
             inside_anchors = all_anchors[inds_inside, :]
         else:
@@ -142,7 +143,7 @@ def rpn_target_assign_in_python(all_anchors,
                                            rpn_negative_overlap,
                                            rpn_fg_fraction,
                                            use_random)
-        # unmap to all anchor 
+        # unmap to all anchor
         loc_inds = inds_inside[loc_inds]
         score_inds = inds_inside[score_inds]
 
@@ -172,12 +173,13 @@ def rpn_target_assign_in_python(all_anchors,
 def retinanet_target_assign(anchor_by_gt_overlap, gt_labels, positive_overlap,
                             negative_overlap):
     anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
-    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
-        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
+    anchor_to_gt_max = anchor_by_gt_overlap[
+        np.arange(anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
 
     gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
-    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
-        anchor_by_gt_overlap.shape[1])]
+    gt_to_anchor_max = anchor_by_gt_overlap[
+        gt_to_anchor_argmax,
+        np.arange(anchor_by_gt_overlap.shape[1])]
     anchors_with_max_overlap = np.where(
         anchor_by_gt_overlap == gt_to_anchor_max)[0]
 
@@ -269,6 +271,7 @@ def retinanet_target_assign_in_python(all_anchors, gt_boxes, gt_labels,
 
 
 class TestRpnTargetAssignOp(OpTest):
+
     def setUp(self):
         n, c, h, w = 2, 4, 14, 14
         all_anchors = get_anchor(n, c, h, w)
@@ -336,6 +339,7 @@ def test_check_output(self):
 
 
 class TestRetinanetTargetAssignOp(OpTest):
+
     def setUp(self):
         n, c, h, w = 2, 4, 14, 14
         all_anchors = get_anchor(n, c, h, w)
@@ -396,23 +400,31 @@ def test_check_output(self):
 
 
 class TestRetinanetTargetAssignOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            bbox_pred1 = fluid.data(
-                name='bbox_pred1', shape=[1, 100, 4], dtype='float32')
-            cls_logits1 = fluid.data(
-                name='cls_logits1', shape=[1, 100, 10], dtype='float32')
-            anchor_box1 = fluid.data(
-                name='anchor_box1', shape=[100, 4], dtype='float32')
-            anchor_var1 = fluid.data(
-                name='anchor_var1', shape=[100, 4], dtype='float32')
-            gt_boxes1 = fluid.data(
-                name='gt_boxes1', shape=[10, 4], dtype='float32')
-            gt_labels1 = fluid.data(
-                name='gt_labels1', shape=[10, 1], dtype='int32')
+            bbox_pred1 = fluid.data(name='bbox_pred1',
+                                    shape=[1, 100, 4],
+                                    dtype='float32')
+            cls_logits1 = fluid.data(name='cls_logits1',
+                                     shape=[1, 100, 10],
+                                     dtype='float32')
+            anchor_box1 = fluid.data(name='anchor_box1',
+                                     shape=[100, 4],
+                                     dtype='float32')
+            anchor_var1 = fluid.data(name='anchor_var1',
+                                     shape=[100, 4],
+                                     dtype='float32')
+            gt_boxes1 = fluid.data(name='gt_boxes1',
+                                   shape=[10, 4],
+                                   dtype='float32')
+            gt_labels1 = fluid.data(name='gt_labels1',
+                                    shape=[10, 1],
+                                    dtype='int32')
             is_crowd1 = fluid.data(name='is_crowd1', shape=[1], dtype='float32')
-            im_info1 = fluid.data(
-                name='im_info1', shape=[1, 3], dtype='float32')
+            im_info1 = fluid.data(name='im_info1',
+                                  shape=[1, 3],
+                                  dtype='float32')
 
             # The `bbox_pred` must be Variable and the data type of `bbox_pred` Tensor
             # one of float32 and float64.
@@ -424,8 +436,9 @@ def test_bbox_pred_type():
             self.assertRaises(TypeError, test_bbox_pred_type)
 
             def test_bbox_pred_tensor_dtype():
-                bbox_pred2 = fluid.data(
-                    name='bbox_pred2', shape=[1, 100, 4], dtype='intt32')
+                bbox_pred2 = fluid.data(name='bbox_pred2',
+                                        shape=[1, 100, 4],
+                                        dtype='intt32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred2, cls_logits1, anchor_box1,
                     anchor_var1, gt_boxes1, gt_labels1, is_crowd1, im_info1, 10)
@@ -442,8 +455,9 @@ def test_cls_logits_type():
             self.assertRaises(TypeError, test_cls_logits_type)
 
             def test_cls_logits_tensor_dtype():
-                cls_logits2 = fluid.data(
-                    name='cls_logits2', shape=[1, 100, 10], dtype='int32')
+                cls_logits2 = fluid.data(name='cls_logits2',
+                                         shape=[1, 100, 10],
+                                         dtype='int32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred1, cls_logits2, anchor_box1,
                     anchor_var1, gt_boxes1, gt_labels1, is_crowd1, im_info1, 10)
@@ -460,8 +474,9 @@ def test_anchor_box_type():
             self.assertRaises(TypeError, test_anchor_box_type)
 
             def test_anchor_box_tensor_dtype():
-                anchor_box2 = fluid.data(
-                    name='anchor_box2', shape=[100, 4], dtype='int32')
+                anchor_box2 = fluid.data(name='anchor_box2',
+                                         shape=[100, 4],
+                                         dtype='int32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred1, cls_logits1, anchor_box2,
                     anchor_var1, gt_boxes1, gt_labels1, is_crowd1, im_info1, 10)
@@ -478,8 +493,9 @@ def test_anchor_var_type():
             self.assertRaises(TypeError, test_anchor_var_type)
 
             def test_anchor_var_tensor_dtype():
-                anchor_var2 = fluid.data(
-                    name='anchor_var2', shape=[100, 4], dtype='int32')
+                anchor_var2 = fluid.data(name='anchor_var2',
+                                         shape=[100, 4],
+                                         dtype='int32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred1, cls_logits1, anchor_box1,
                     anchor_var2, gt_boxes1, gt_labels1, is_crowd1, im_info1, 10)
@@ -496,8 +512,9 @@ def test_gt_boxes_type():
             self.assertRaises(TypeError, test_gt_boxes_type)
 
             def test_gt_boxes_tensor_dtype():
-                gt_boxes2 = fluid.data(
-                    name='gt_boxes2', shape=[10, 4], dtype='int32')
+                gt_boxes2 = fluid.data(name='gt_boxes2',
+                                       shape=[10, 4],
+                                       dtype='int32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred1, cls_logits1, anchor_box1,
                     anchor_var1, gt_boxes2, gt_labels1, is_crowd1, im_info1, 10)
@@ -514,8 +531,9 @@ def test_gt_label_type():
             self.assertRaises(TypeError, test_gt_label_type)
 
             def test_gt_label_tensor_dtype():
-                gt_labels2 = fluid.data(
-                    name='label2', shape=[10, 1], dtype='float32')
+                gt_labels2 = fluid.data(name='label2',
+                                        shape=[10, 1],
+                                        dtype='float32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred1, cls_logits1, anchor_box1,
                     anchor_var1, gt_boxes1, gt_labels2, is_crowd1, im_info1, 10)
@@ -532,8 +550,9 @@ def test_is_crowd_type():
             self.assertRaises(TypeError, test_is_crowd_type)
 
             def test_is_crowd_tensor_dtype():
-                is_crowd2 = fluid.data(
-                    name='is_crowd2', shape=[10, 1], dtype='float32')
+                is_crowd2 = fluid.data(name='is_crowd2',
+                                       shape=[10, 1],
+                                       dtype='float32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred1, cls_logits1, anchor_box1,
                     anchor_var1, gt_boxes1, gt_labels1, is_crowd2, im_info1, 10)
@@ -550,8 +569,9 @@ def test_im_info_type():
             self.assertRaises(TypeError, test_im_info_type)
 
             def test_im_info_tensor_dtype():
-                im_info2 = fluid.data(
-                    name='im_info2', shape=[1, 3], dtype='int32')
+                im_info2 = fluid.data(name='im_info2',
+                                      shape=[1, 3],
+                                      dtype='int32')
                 score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \
                     fluid.layers.retinanet_target_assign(bbox_pred1, cls_logits1, anchor_box1,
                     anchor_var1, gt_boxes1, gt_labels1, is_crowd1, im_info2, 10)
diff --git a/python/paddle/fluid/tests/unittests/test_rrelu_op.py b/python/paddle/fluid/tests/unittests/test_rrelu_op.py
index 9d33ce085b7f7..523b65bcd6ddc 100644
--- a/python/paddle/fluid/tests/unittests/test_rrelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rrelu_op.py
@@ -44,6 +44,7 @@ def check_output(input, output, lower, upper):
 
 
 class TestFunctionalRReluAPI(unittest.TestCase):
+
     def setUp(self):
         self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float64')
         self.lower_0 = 0.05
@@ -58,12 +59,17 @@ def setUp(self):
 
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input = fluid.data(
-                name="input", shape=[2, 3, 4, 5], dtype="float32")
-            res1 = F.rrelu(
-                x=input, lower=self.lower_0, upper=self.upper_0, training=False)
-            res2 = F.rrelu(
-                x=input, lower=self.lower_1, upper=self.upper_1, training=False)
+            input = fluid.data(name="input",
+                               shape=[2, 3, 4, 5],
+                               dtype="float32")
+            res1 = F.rrelu(x=input,
+                           lower=self.lower_0,
+                           upper=self.upper_0,
+                           training=False)
+            res2 = F.rrelu(x=input,
+                           lower=self.lower_1,
+                           upper=self.upper_1,
+                           training=False)
             in_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype("float32")
 
             res_np1 = ref_rrelu(in_np, self.lower_0, self.upper_0)
@@ -89,10 +95,12 @@ def test_static_graph_functional(self):
 
         for place in self.places:
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=self.x_np.shape, dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=self.x_np.shape, dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=self.x_np.shape,
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=self.x_np.shape,
+                                    dtype="float64")
             out_1 = F.rrelu(x_1, self.lower_0, self.upper_0, training=False)
             out_2 = F.rrelu(x_2, self.lower_1, self.upper_1, training=False)
             out_3 = F.rrelu(x_2, self.lower_1, self.upper_1, training=True)
@@ -123,10 +131,12 @@ def test_static_graph_layer(self):
 
         for place in self.places:
             paddle.enable_static()
-            x_1 = paddle.fluid.data(
-                name="x", shape=self.x_np.shape, dtype="float64")
-            x_2 = paddle.fluid.data(
-                name="x2", shape=self.x_np.shape, dtype="float64")
+            x_1 = paddle.fluid.data(name="x",
+                                    shape=self.x_np.shape,
+                                    dtype="float64")
+            x_2 = paddle.fluid.data(name="x2",
+                                    shape=self.x_np.shape,
+                                    dtype="float64")
             # init instance
             rrelu_1 = paddle.nn.RReLU(self.lower_0, self.upper_0)
             rrelu_2 = paddle.nn.RReLU(self.lower_1, self.upper_1)
@@ -171,8 +181,8 @@ def test_dygraph_layer(self):
             rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0)
             result = rrelu(paddle.to_tensor(self.x_np))
             self.assertTrue(
-                check_output(self.x_np,
-                             result.numpy(), self.lower_0, self.upper_0))
+                check_output(self.x_np, result.numpy(), self.lower_0,
+                             self.upper_0))
             paddle.enable_static()
 
     def test_dygraph(self):
@@ -182,52 +192,67 @@ def test_dygraph(self):
                 rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0)
                 out_np = rrelu(paddle.to_tensor(self.x_np))
             self.assertTrue(
-                check_output(self.x_np,
-                             out_np.numpy(), self.lower_0, self.upper_0))
+                check_output(self.x_np, out_np.numpy(), self.lower_0,
+                             self.upper_0))
             paddle.enable_static()
 
     def test_error_functional(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(
-                TypeError, F.rrelu, x=1, lower=self.lower_0, upper=self.upper_0)
+            self.assertRaises(TypeError,
+                              F.rrelu,
+                              x=1,
+                              lower=self.lower_0,
+                              upper=self.upper_0)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[2, 3], dtype='int32')
-            self.assertRaises(
-                TypeError,
-                F.rrelu,
-                x=x_int32,
-                lower=self.lower_0,
-                upper=self.upper_0)
-            x_bool = paddle.fluid.data(
-                name='x_bool', shape=[2, 3], dtype='int32')
-            self.assertRaises(
-                TypeError,
-                F.rrelu,
-                x=x_bool,
-                lower=self.lower_0,
-                upper=self.upper_0)
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[2, 3],
+                                        dtype='int32')
+            self.assertRaises(TypeError,
+                              F.rrelu,
+                              x=x_int32,
+                              lower=self.lower_0,
+                              upper=self.upper_0)
+            x_bool = paddle.fluid.data(name='x_bool',
+                                       shape=[2, 3],
+                                       dtype='int32')
+            self.assertRaises(TypeError,
+                              F.rrelu,
+                              x=x_bool,
+                              lower=self.lower_0,
+                              upper=self.upper_0)
             # lower and upper must be float
-            x_fp32 = paddle.fluid.data(
-                name='x_fp32', shape=[2, 3], dtype='float32')
+            x_fp32 = paddle.fluid.data(name='x_fp32',
+                                       shape=[2, 3],
+                                       dtype='float32')
             self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0, upper=0.5)
             self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0.5, upper=1)
             # lower and upper must be in (0, 1)
-            self.assertRaises(
-                ValueError, F.rrelu, x=x_fp32, lower=-1., upper=0.5)
-            self.assertRaises(
-                ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=2.)
+            self.assertRaises(ValueError,
+                              F.rrelu,
+                              x=x_fp32,
+                              lower=-1.,
+                              upper=0.5)
+            self.assertRaises(ValueError,
+                              F.rrelu,
+                              x=x_fp32,
+                              lower=0.5,
+                              upper=2.)
             # upper should not be less than lower
-            self.assertRaises(
-                ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=0.2)
+            self.assertRaises(ValueError,
+                              F.rrelu,
+                              x=x_fp32,
+                              lower=0.5,
+                              upper=0.2)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[2, 3], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[2, 3],
+                                       dtype='float16')
             F.rrelu(x=x_fp16, lower=self.lower_0, upper=self.upper_0)
 
     def test_error_layer(self):
+
         def error_int_dtype():
             with paddle.fluid.dygraph.guard():
                 x = np.random.random([2, 3]).astype("float64")
@@ -273,6 +298,7 @@ def error_lower_upper():
 
 
 class RReluTest(OpTest):
+
     def setUp(self):
         self.op_type = "rrelu"
         self.lower = 0.1
@@ -305,6 +331,7 @@ def test_check_grad(self):
 
 
 class RReluTrainingTest(OpTest):
+
     def setUp(self):
         self.op_type = "rrelu"
         self.lower = 0.3
@@ -314,6 +341,7 @@ def setUp(self):
 
 
 class RReluTrainingTest(OpTest):
+
     def setUp(self):
         self.op_type = "rrelu"
         self.lower = 0.3
diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py
index c0157c5b9068c..d2f3ec9ebcd3b 100644
--- a/python/paddle/fluid/tests/unittests/test_run.py
+++ b/python/paddle/fluid/tests/unittests/test_run.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -51,13 +51,13 @@ def write_file(name, ct):
 
 def get_files(pth, prefix):
     return [
-        f for f in listdir(pth)
-        if isfile(join(pth, f)) and f.startswith(prefix) and f !=
-        f"{prefix}.gpu.log"
+        f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix)
+        and f != f"{prefix}.gpu.log"
     ]
 
 
 class Collective_Test(unittest.TestCase):
+
     def setUp(self):
         write_file(pyname, colpyfile)
 
@@ -109,6 +109,7 @@ def test_collective_3(self):
 
 
 class PS_Test(unittest.TestCase):
+
     def setUp(self):
         write_file(pyname, pspyfile)
 
diff --git a/python/paddle/fluid/tests/unittests/test_run_fluid_by_module_or_command_line.py b/python/paddle/fluid/tests/unittests/test_run_fluid_by_module_or_command_line.py
index df626dc6dded7..d59c9db637f74 100644
--- a/python/paddle/fluid/tests/unittests/test_run_fluid_by_module_or_command_line.py
+++ b/python/paddle/fluid/tests/unittests/test_run_fluid_by_module_or_command_line.py
@@ -18,6 +18,7 @@
 
 
 class TestRunFluidByModule(unittest.TestCase):
+
     def test_module(self):
         print(sys.executable)
         res = os.system(sys.executable + ' -m "paddle.fluid.reader"')
@@ -25,6 +26,7 @@ def test_module(self):
 
 
 class TestRunFluidByCommand(unittest.TestCase):
+
     def test_command(self):
         res = os.system(sys.executable + ' -c "import paddle.fluid"')
         self.assertEqual(res, 0)  # 0 means status OK
diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py
index 68f24bf257008..00deabbf72e83 100644
--- a/python/paddle/fluid/tests/unittests/test_run_program_op.py
+++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py
@@ -41,7 +41,7 @@ def program_scope_guard():
                 yield
 
 
-# NOTE: Because RunProgramOp has a special output of type std::vector<Scope *>, 
+# NOTE: Because RunProgramOp has a special output of type std::vector<Scope *>,
 # the OpTest cannot be used in RunProgramOp. The variable type cannot be specified
 # when creating output variables in OpTest, default type is LoDTensor
 # NOTE: the gradient test method in OpTest also cannot be used for RunProgramOp,
@@ -49,6 +49,7 @@ def program_scope_guard():
 # when create Operator, so here compare gradients with static graph
 # NOTE: Here rewrite a simple unittest framework for RunProgramOp
 class RunProgramOpTest(unittest.TestCase):
+
     def build_model(self):
         raise NotImplementedError(
             "RunProgramOp test should implement build_model")
@@ -126,13 +127,18 @@ def check_grad_with_place(self, place):
             self.assertTrue(np.allclose(expect_v, actual_v, atol=1e-5))
 
     def prepare_dygraph_input(self, place, return_param_list=False):
+
         def create_var_base(is_input, name, np_value, stop_gradient):
             if _in_eager_mode_:
-                var = core.eager.Tensor(
-                    value=np_value, name=name, place=place, zero_copy=True)
+                var = core.eager.Tensor(value=np_value,
+                                        name=name,
+                                        place=place,
+                                        zero_copy=True)
             else:
-                var = core.VarBase(
-                    value=np_value, name=name, place=place, zero_copy=True)
+                var = core.VarBase(value=np_value,
+                                   name=name,
+                                   place=place,
+                                   zero_copy=True)
             var.stop_gradient = stop_gradient
             return var
 
@@ -155,6 +161,7 @@ def create_var_base(is_input, name, np_value, stop_gradient):
         return inputs
 
     def prepare_dygraph_output(self):
+
         def create_var_base(is_input, name):
             var = framework._varbase_creator(dtype=None, shape=None, name=name)
             var.stop_gradient = False
@@ -188,7 +195,7 @@ def calc_dygraph_output(self, place):
             outputs = self.prepare_dygraph_output()
 
             _C_ops.run_program(inputs['X'], inputs['Params'], outputs['Out'],
-                               outputs['OutScope'], outputs['DOut'],
+                               outputs['OutScope'], outputs['DOut'], None,
                                *self.attrs)
             return outputs['Out']
 
@@ -202,7 +209,7 @@ def calc_dygraph_grad(self, place):
             outputs = self.prepare_dygraph_output()
 
             _C_ops.run_program(inputs['X'], inputs['Params'], outputs['Out'],
-                               outputs['OutScope'], outputs['DOut'],
+                               outputs['OutScope'], outputs['DOut'], None,
                                *self.attrs)
 
             for param in input_param_list:
@@ -234,6 +241,7 @@ def _get_grad_vartype(self, name):
 
 
 class TestRunProgramOpWithFC(RunProgramOpTest):
+
     def setUp(self):
         self.op_type = "run_program"
         self.dtype = np.float32
@@ -245,14 +253,14 @@ def setUp(self):
 
         self.inputs = {
             'X': {
-                self.input_names['X'][0]: np.random.random((32, 1, 28, 28))
-                .astype(self.dtype)
+                self.input_names['X'][0]:
+                np.random.random((32, 1, 28, 28)).astype(self.dtype)
             },
             'Params': {
-                self.input_names['Params'][0]: np.random.random(
-                    (784, 10)).astype(self.dtype),
-                self.input_names['Params'][1]: np.random.random(
-                    (32, 10)).astype(self.dtype)
+                self.input_names['Params'][0]:
+                np.random.random((784, 10)).astype(self.dtype),
+                self.input_names['Params'][1]:
+                np.random.random((32, 10)).astype(self.dtype)
             }
         }
 
@@ -264,21 +272,20 @@ def test_check_grad(self):
 
     def build_model(self):
         # 1. simple model
-        img = fluid.data(
-            name=self.input_names['X'][0],
-            shape=[None, 1, 28, 28],
-            dtype='float32')
+        img = fluid.data(name=self.input_names['X'][0],
+                         shape=[None, 1, 28, 28],
+                         dtype='float32')
         weight_attr = fluid.ParamAttr(
             name=self.input_names['Params'][0],
             learning_rate=0.5,
-            initializer=fluid.initializer.NumpyArrayInitializer(self.inputs[
-                'Params'][self.input_names['Params'][0]]),
+            initializer=fluid.initializer.NumpyArrayInitializer(
+                self.inputs['Params'][self.input_names['Params'][0]]),
             trainable=True)
         bias_attr = fluid.ParamAttr(
             name=self.input_names['Params'][1],
             learning_rate=0.5,
-            initializer=fluid.initializer.NumpyArrayInitializer(self.inputs[
-                'Params'][self.input_names['Params'][1]]),
+            initializer=fluid.initializer.NumpyArrayInitializer(
+                self.inputs['Params'][self.input_names['Params'][1]]),
             trainable=True)
         pred = fluid.layers.fc(input=img,
                                size=10,
@@ -294,6 +301,7 @@ def build_model(self):
 
 
 class TestRunProgramOpWithEmbedding(RunProgramOpTest):
+
     def setUp(self):
         self.op_type = "run_program"
         self.dtype = np.float32
@@ -313,7 +321,7 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        # NOTE: fecth not support SelectedRows, catnot compare 
+        # NOTE: fetch does not support SelectedRows, cannot compare
         # sparse gradients with staic mode, only run dygraph
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -324,16 +332,17 @@ def test_check_grad(self):
 
     def build_model(self):
         # 1. simple model
-        x = fluid.layers.data(
-            name=self.input_names['X'][0], shape=[5], dtype='int64')
+        x = fluid.layers.data(name=self.input_names['X'][0],
+                              shape=[5],
+                              dtype='int64')
         emb = fluid.input.embedding(
             input=x,
             size=[10, 16],
             param_attr=fluid.ParamAttr(
                 name="emb_weight",
                 learning_rate=10,
-                initializer=fluid.initializer.NumpyArrayInitializer(self.inputs[
-                    'Params'][self.input_names['Params'][0]])),
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    self.inputs['Params'][self.input_names['Params'][0]])),
             is_sparse=True)
         y = fluid.layers.reduce_sum(emb, dim=-1)
         # 2. get forward op num
@@ -345,6 +354,7 @@ def build_model(self):
 
 
 class Net(paddle.nn.Layer):
+
     def __init__(self):
         super(Net, self).__init__()
         self.fc1 = paddle.nn.Linear(10, 10)
@@ -358,6 +368,7 @@ def forward(self, x):
 
 
 class TestParametersWithStopGradient(unittest.TestCase):
+
     def setUp(self):
         self.seed = 2021
         self.iter = 5
diff --git a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py b/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py
index 21fdeeeb3e645..18e3b67c25e81 100644
--- a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py
@@ -23,6 +23,7 @@
 
 
 class TestRunTimeException(unittest.TestCase):
+
     def test_run_time_exception(self):
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -41,6 +42,7 @@ def _run_program():
 
 
 class TestCompileTimeException(unittest.TestCase):
+
     def test_compile_time_exception(self):
         self.assertRaises(ValueError, self.build_model)
 
@@ -48,8 +50,10 @@ def build_model(self):
         train_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
-            label = fluid.layers.data(
-                name="label", shape=[1], dtype="int64", append_batch_size=False)
+            label = fluid.layers.data(name="label",
+                                      shape=[1],
+                                      dtype="int64",
+                                      append_batch_size=False)
             fluid.layers.one_hot(input=label, depth=100)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sample_logits_op.py b/python/paddle/fluid/tests/unittests/test_sample_logits_op.py
index a3eaf24bd6ba8..749a32978beed 100644
--- a/python/paddle/fluid/tests/unittests/test_sample_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sample_logits_op.py
@@ -19,6 +19,7 @@
 
 
 class TestSampleLogitsOp(OpTest):
+
     def setUp(self):
         self.op_type = "sample_logits"
         self.dtype = np.float64
@@ -92,12 +93,14 @@ def test_check_grad(self):
 
 
 class TestSampleLogitsOpNoUniq(TestSampleLogitsOp):
+
     def setUp(self):
         super(TestSampleLogitsOpNoUniq, self).setUp()
         self.attrs = {'num_samples': self.S, 'uniq': False}
 
 
 class TestSampleLogitsOpWithAccidentalHits(TestSampleLogitsOp):
+
     def setUp(self):
         super(TestSampleLogitsOpWithAccidentalHits, self).setUp()
         self.attrs = {'num_samples': self.S, 'remove_accidental_hits': False}
diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
index 521cd3ae238c6..f9271f475fbee 100644
--- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
@@ -23,6 +23,7 @@
 
 
 class TestSamplingIdShape(unittest.TestCase):
+
     def test_shape(self):
         paddle.enable_static()
         x = fluid.layers.data(name='x', shape=[3], dtype='float32')
@@ -33,8 +34,7 @@ def test_shape(self):
         exe.run(fluid.default_startup_program())
 
         feed = {
-            'x': np.array(
-                [[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32')
+            'x': np.array([[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32')
         }
         output_np = exe.run(feed=feed, fetch_list=[output])[0]
 
diff --git a/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py b/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py
index 86431086ac5f9..9f8f9c382ca01 100644
--- a/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py
+++ b/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py
@@ -31,13 +31,14 @@ def getModelOp(model_path):
 
     result = set()
     for i in range(0, size):
-        #print(main_block.op(i).type())    
+        #print(main_block.op(i).type())
         result.add(main_block.op(i).type())
 
     return result
 
 
 class WhileNet(paddle.nn.Layer):
+
     def __init__(self):
         super(WhileNet, self).__init__()
 
@@ -55,6 +56,7 @@ def forward(self, x):
 
 
 class ForNet(paddle.nn.Layer):
+
     def __init__(self):
         super(ForNet, self).__init__()
 
@@ -68,6 +70,7 @@ def forward(self, x):
 
 
 class IfElseNet(paddle.nn.Layer):
+
     def __init__(self):
         super(IfElseNet, self).__init__()
 
@@ -81,15 +84,15 @@ def forward(self, x):
 
 
 class TestConditionalOp(unittest.TestCase):
+
     def test_while_op(self):
         paddle.disable_static()
         net = WhileNet()
-        net = paddle.jit.to_static(
-            net,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[1, 3, 8, 8], dtype='float32')
-            ])
+        net = paddle.jit.to_static(net,
+                                   input_spec=[
+                                       paddle.static.InputSpec(
+                                           shape=[1, 3, 8, 8], dtype='float32')
+                                   ])
         paddle.jit.save(net, './while_net')
 
         right_pdmodel = set([
@@ -107,9 +110,7 @@ def test_for_op(self):
         paddle.disable_static()
         net = ForNet()
         net = paddle.jit.to_static(
-            net,
-            input_spec=[paddle.static.InputSpec(
-                shape=[1], dtype='int32')])
+            net, input_spec=[paddle.static.InputSpec(shape=[1], dtype='int32')])
         paddle.jit.save(net, './for_net')
 
         right_pdmodel = set([
@@ -127,9 +128,7 @@ def test_if_op(self):
         paddle.disable_static()
         net = IfElseNet()
         net = paddle.jit.to_static(
-            net,
-            input_spec=[paddle.static.InputSpec(
-                shape=[1], dtype='int32')])
+            net, input_spec=[paddle.static.InputSpec(shape=[1], dtype='int32')])
         paddle.jit.save(net, './if_net')
 
         right_pdmodel = set([
diff --git a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
index 4c63dced83b19..df520f8716d39 100644
--- a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
+++ b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
@@ -22,12 +22,12 @@
 
 
 class TestSaveModelWithoutVar(unittest.TestCase):
+
     def test_no_var_save(self):
-        data = fluid.layers.data(
-            name='data',
-            shape=[-1, 1],
-            dtype='float32',
-            append_batch_size=False)
+        data = fluid.layers.data(name='data',
+                                 shape=[-1, 1],
+                                 dtype='float32',
+                                 append_batch_size=False)
         data_plus = data + 1
 
         if fluid.core.is_compiled_with_cuda():
@@ -41,13 +41,12 @@ def test_no_var_save(self):
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
 
-            fluid.io.save_inference_model(
-                dirname='test',
-                feeded_var_names=['data'],
-                target_vars=[data_plus],
-                executor=exe,
-                model_filename='model',
-                params_filename='params')
+            fluid.io.save_inference_model(dirname='test',
+                                          feeded_var_names=['data'],
+                                          target_vars=[data_plus],
+                                          executor=exe,
+                                          model_filename='model',
+                                          params_filename='params')
             expected_warn = "no variable in your model, please ensure there are any variables in your model to save"
             self.assertTrue(len(w) > 0)
             self.assertTrue(expected_warn == str(w[-1].message))
diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py
index 04ddb5a788d6f..f00b5fdc436e8 100644
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -25,6 +25,7 @@
 
 
 class TestScaleOp(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.python_api = paddle.scale
@@ -47,6 +48,7 @@ def test_check_grad(self):
 
 
 class TestScaleOpScaleVariable(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.python_api = paddle.scale
@@ -71,6 +73,7 @@ def test_check_grad(self):
 
 
 class TestScaleOpSelectedRows(unittest.TestCase):
+
     def init_dtype_type(self):
         pass
 
@@ -129,7 +132,9 @@ def test_scale_selected_rows_inplace(self):
 
 
 class TestScaleRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.scale([10])
 
@@ -140,6 +145,7 @@ def test_type():
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestScaleFp16Op(TestScaleOp):
+
     def init_dtype_type(self):
         self.dtype = np.float16
 
@@ -151,11 +157,14 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ["X"], "Out", max_relative_error=0.05, check_eager=True)
+            self.check_grad_with_place(place, ["X"],
+                                       "Out",
+                                       max_relative_error=0.05,
+                                       check_eager=True)
 
 
 class TestScaleBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "scale"
         self.python_api = paddle.scale
@@ -176,6 +185,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows):
+
     def init_dtype_type(self):
         self.dtype = np.float16
 
@@ -191,6 +201,7 @@ def test_scale_selected_rows_inplace(self):
 
 
 class TestScaleApiStatic(unittest.TestCase):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return paddle.scale(x, scale, bias)
 
@@ -208,11 +219,13 @@ def test_api(self):
 
 
 class TestScaleInplaceApiStatic(TestScaleApiStatic):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return x.scale_(scale, bias)
 
 
 class TestScaleApiDygraph(unittest.TestCase):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return paddle.scale(x, scale, bias)
 
@@ -226,6 +239,7 @@ def test_api(self):
 
 
 class TestScaleInplaceApiDygraph(TestScaleApiDygraph):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return x.scale_(scale, bias)
 
diff --git a/python/paddle/fluid/tests/unittests/test_scaled_dot_product_attention.py b/python/paddle/fluid/tests/unittests/test_scaled_dot_product_attention.py
index a205189e4f9d2..05f824b42a1ff 100644
--- a/python/paddle/fluid/tests/unittests/test_scaled_dot_product_attention.py
+++ b/python/paddle/fluid/tests/unittests/test_scaled_dot_product_attention.py
@@ -21,13 +21,16 @@
 
 
 class TestScaledDotProductAttentionError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            queries = fluid.data(
-                name="queries", shape=[3, 5, 9], dtype="float32")
+            queries = fluid.data(name="queries",
+                                 shape=[3, 5, 9],
+                                 dtype="float32")
             keys = fluid.data(name="keys", shape=[3, 6, 9], dtype="float32")
-            values = fluid.data(
-                name="values", shape=[3, 6, 10], dtype="float32")
+            values = fluid.data(name="values",
+                                shape=[3, 6, 10],
+                                dtype="float32")
 
             def test_queries_Variable():
                 queries_data = np.random.rand(3, 5, 9).astype("float32")
@@ -51,40 +54,48 @@ def test_values_Variable():
             self.assertRaises(TypeError, test_values_Variable)
 
             def test_diff_dtype():
-                keys_error = fluid.data(
-                    name="keys_error", shape=[3, 6, 9], dtype="float64")
-                values_error = fluid.data(
-                    name="values_error", shape=[3, 6, 10], dtype="float64")
+                keys_error = fluid.data(name="keys_error",
+                                        shape=[3, 6, 9],
+                                        dtype="float64")
+                values_error = fluid.data(name="values_error",
+                                          shape=[3, 6, 10],
+                                          dtype="float64")
                 fluid.nets.scaled_dot_product_attention(queries, keys_error,
                                                         values_error)
 
             self.assertRaises(TypeError, test_diff_dtype)
 
             def test_diff_dim():
-                keys_error_dim = fluid.data(
-                    name="keys_error_dim", shape=[3, 6], dtype="float32")
-                values_error_dim = fluid.data(
-                    name="values_error_dim", shape=[3], dtype="float32")
+                keys_error_dim = fluid.data(name="keys_error_dim",
+                                            shape=[3, 6],
+                                            dtype="float32")
+                values_error_dim = fluid.data(name="values_error_dim",
+                                              shape=[3],
+                                              dtype="float32")
                 fluid.nets.scaled_dot_product_attention(queries, keys_error_dim,
                                                         values_error_dim)
 
             self.assertRaises(ValueError, test_diff_dim)
 
             def test_diff_hidden_size():
-                queries_error_hs = fluid.data(
-                    name="queries_error_hs", shape=[3, 5, 9], dtype="float32")
-                keys_error_hs = fluid.data(
-                    name="keys_error_hs", shape=[3, 6, 10], dtype="float32")
+                queries_error_hs = fluid.data(name="queries_error_hs",
+                                              shape=[3, 5, 9],
+                                              dtype="float32")
+                keys_error_hs = fluid.data(name="keys_error_hs",
+                                           shape=[3, 6, 10],
+                                           dtype="float32")
                 fluid.nets.scaled_dot_product_attention(queries_error_hs,
                                                         keys_error_hs, values)
 
             self.assertRaises(ValueError, test_diff_hidden_size)
 
             def test_diff_max_len():
-                keys_error_len = fluid.data(
-                    name="keys_error_len", shape=[3, 7, 9], dtype="float32")
-                values_error_len = fluid.data(
-                    name="values_error_len", shape=[3, 6, 10], dtype="float32")
+                keys_error_len = fluid.data(name="keys_error_len",
+                                            shape=[3, 7, 9],
+                                            dtype="float32")
+                values_error_len = fluid.data(name="values_error_len",
+                                              shape=[3, 6, 10],
+                                              dtype="float32")
                 fluid.nets.scaled_dot_product_attention(queries, keys_error_len,
                                                         values_error_len)
 
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
index ddbee33c35bb1..1833f36013d27 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
@@ -117,9 +117,8 @@ def setUp(self):
         self.python_api = paddle.scatter_nd_add
         shape = (3, 2, 2, 1, 10)
         ref_np = np.random.rand(*shape).astype("float64")
-        index_np = np.vstack(
-            [np.random.randint(
-                0, s, size=100) for s in shape]).T.astype("int32")
+        index_np = np.vstack([np.random.randint(0, s, size=100)
+                              for s in shape]).T.astype("int32")
         update_shape = judge_update_shape(ref_np, index_np)
         updates_np = np.random.rand(*update_shape).astype("float64")
         expect_np = numpy_scatter_nd_add(ref_np.copy(), index_np, updates_np)
@@ -167,70 +166,64 @@ class TestScatterNdOpAPI(unittest.TestCase):
     """
 
     def testcase1(self):
-        ref1 = fluid.layers.data(
-            name='ref1',
-            shape=[10, 9, 8, 1, 3],
-            dtype='float32',
-            append_batch_size=False)
-        index1 = fluid.layers.data(
-            name='index1',
-            shape=[5, 5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates1 = fluid.layers.data(
-            name='update1',
-            shape=[5, 5, 8],
-            dtype='float32',
-            append_batch_size=False)
+        ref1 = fluid.layers.data(name='ref1',
+                                 shape=[10, 9, 8, 1, 3],
+                                 dtype='float32',
+                                 append_batch_size=False)
+        index1 = fluid.layers.data(name='index1',
+                                   shape=[5, 5, 8, 5],
+                                   dtype='int32',
+                                   append_batch_size=False)
+        updates1 = fluid.layers.data(name='update1',
+                                     shape=[5, 5, 8],
+                                     dtype='float32',
+                                     append_batch_size=False)
         output1 = fluid.layers.scatter_nd_add(ref1, index1, updates1)
 
     def testcase2(self):
-        ref2 = fluid.layers.data(
-            name='ref2',
-            shape=[10, 9, 8, 1, 3],
-            dtype='double',
-            append_batch_size=False)
-        index2 = fluid.layers.data(
-            name='index2',
-            shape=[5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates2 = fluid.layers.data(
-            name='update2',
-            shape=[5, 8],
-            dtype='double',
-            append_batch_size=False)
-        output2 = fluid.layers.scatter_nd_add(
-            ref2, index2, updates2, name="scatter_nd_add")
+        ref2 = fluid.layers.data(name='ref2',
+                                 shape=[10, 9, 8, 1, 3],
+                                 dtype='double',
+                                 append_batch_size=False)
+        index2 = fluid.layers.data(name='index2',
+                                   shape=[5, 8, 5],
+                                   dtype='int32',
+                                   append_batch_size=False)
+        updates2 = fluid.layers.data(name='update2',
+                                     shape=[5, 8],
+                                     dtype='double',
+                                     append_batch_size=False)
+        output2 = fluid.layers.scatter_nd_add(ref2,
+                                              index2,
+                                              updates2,
+                                              name="scatter_nd_add")
 
     def testcase3(self):
         shape3 = [10, 9, 8, 1, 3]
-        index3 = fluid.layers.data(
-            name='index3',
-            shape=[5, 5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates3 = fluid.layers.data(
-            name='update3',
-            shape=[5, 5, 8],
-            dtype='float32',
-            append_batch_size=False)
+        index3 = fluid.layers.data(name='index3',
+                                   shape=[5, 5, 8, 5],
+                                   dtype='int32',
+                                   append_batch_size=False)
+        updates3 = fluid.layers.data(name='update3',
+                                     shape=[5, 5, 8],
+                                     dtype='float32',
+                                     append_batch_size=False)
         output3 = fluid.layers.scatter_nd(index3, updates3, shape3)
 
     def testcase4(self):
         shape4 = [10, 9, 8, 1, 3]
-        index4 = fluid.layers.data(
-            name='index4',
-            shape=[5, 5, 8, 5],
-            dtype='int32',
-            append_batch_size=False)
-        updates4 = fluid.layers.data(
-            name='update4',
-            shape=[5, 5, 8],
-            dtype='double',
-            append_batch_size=False)
-        output4 = fluid.layers.scatter_nd(
-            index4, updates4, shape4, name='scatter_nd')
+        index4 = fluid.layers.data(name='index4',
+                                   shape=[5, 5, 8, 5],
+                                   dtype='int32',
+                                   append_batch_size=False)
+        updates4 = fluid.layers.data(name='update4',
+                                     shape=[5, 5, 8],
+                                     dtype='double',
+                                     append_batch_size=False)
+        output4 = fluid.layers.scatter_nd(index4,
+                                          updates4,
+                                          shape4,
+                                          name='scatter_nd')
 
     def testcase5(self):
         if not fluid.core.is_compiled_with_cuda():
@@ -244,15 +237,15 @@ def testcase5(self):
         with fluid.dygraph.guard():
             device = paddle.get_device()
             paddle.set_device('gpu')
-            gpu_value = paddle.scatter_nd_add(
-                paddle.to_tensor(x),
-                paddle.to_tensor(index), paddle.to_tensor(val))
+            gpu_value = paddle.scatter_nd_add(paddle.to_tensor(x),
+                                              paddle.to_tensor(index),
+                                              paddle.to_tensor(val))
             paddle.set_device('cpu')
-            cpu_value = paddle.scatter_nd_add(
-                paddle.to_tensor(x),
-                paddle.to_tensor(index), paddle.to_tensor(val))
-            self.assertTrue(
-                np.array_equal(gpu_value.numpy(), cpu_value.numpy()))
+            cpu_value = paddle.scatter_nd_add(paddle.to_tensor(x),
+                                              paddle.to_tensor(index),
+                                              paddle.to_tensor(val))
+            self.assertTrue(np.array_equal(gpu_value.numpy(),
+                                           cpu_value.numpy()))
             paddle.set_device(device)
 
         @switch_to_static_graph
@@ -260,10 +253,12 @@ def test_static_graph():
             with paddle.static.program_guard(paddle.static.Program(),
                                              paddle.static.Program()):
                 x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape)
-                index_t = paddle.static.data(
-                    name="index", dtype=index.dtype, shape=index.shape)
-                val_t = paddle.static.data(
-                    name="val", dtype=val.dtype, shape=val.shape)
+                index_t = paddle.static.data(name="index",
+                                             dtype=index.dtype,
+                                             shape=index.shape)
+                val_t = paddle.static.data(name="val",
+                                           dtype=val.dtype,
+                                           shape=val.shape)
                 out_t = paddle.scatter_nd_add(x_t, index_t, val_t)
                 feed = {x_t.name: x, index_t.name: index, val_t.name: val}
                 fetch = [out_t]
@@ -279,15 +274,20 @@ def test_static_graph():
 
 #Test Raise Error
 class TestScatterNdOpRaise(unittest.TestCase):
+
     def test_check_raise(self):
+
         def check_raise_is_test():
             try:
-                ref5 = fluid.layers.data(
-                    name='ref5', shape=[3, 4, 5], dtype='float32')
-                index5 = fluid.layers.data(
-                    name='index5', shape=[2, 10], dtype='int32')
-                updates5 = fluid.layers.data(
-                    name='updates5', shape=[2, 10], dtype='float32')
+                ref5 = fluid.layers.data(name='ref5',
+                                         shape=[3, 4, 5],
+                                         dtype='float32')
+                index5 = fluid.layers.data(name='index5',
+                                           shape=[2, 10],
+                                           dtype='int32')
+                updates5 = fluid.layers.data(name='updates5',
+                                             shape=[2, 10],
+                                             dtype='float32')
                 output5 = fluid.layers.scatter_nd_add(ref5, index5, updates5)
             except Exception as e:
                 t = \
@@ -299,31 +299,31 @@ def check_raise_is_test():
 
     def test_check_raise2(self):
         with self.assertRaises(ValueError):
-            ref6 = fluid.layers.data(
-                name='ref6',
-                shape=[10, 9, 8, 1, 3],
-                dtype='double',
-                append_batch_size=False)
-            index6 = fluid.layers.data(
-                name='index6',
-                shape=[5, 8, 5],
-                dtype='int32',
-                append_batch_size=False)
-            updates6 = fluid.layers.data(
-                name='update6',
-                shape=[5, 8],
-                dtype='float32',
-                append_batch_size=False)
+            ref6 = fluid.layers.data(name='ref6',
+                                     shape=[10, 9, 8, 1, 3],
+                                     dtype='double',
+                                     append_batch_size=False)
+            index6 = fluid.layers.data(name='index6',
+                                       shape=[5, 8, 5],
+                                       dtype='int32',
+                                       append_batch_size=False)
+            updates6 = fluid.layers.data(name='update6',
+                                         shape=[5, 8],
+                                         dtype='float32',
+                                         append_batch_size=False)
             output6 = fluid.layers.scatter_nd_add(ref6, index6, updates6)
 
     def test_check_raise3(self):
+
         def check_raise_is_test():
             try:
                 shape = [3, 4, 5]
-                index7 = fluid.layers.data(
-                    name='index7', shape=[2, 1], dtype='int32')
-                updates7 = fluid.layers.data(
-                    name='updates7', shape=[2, 4, 5, 20], dtype='float32')
+                index7 = fluid.layers.data(name='index7',
+                                           shape=[2, 1],
+                                           dtype='int32')
+                updates7 = fluid.layers.data(name='updates7',
+                                             shape=[2, 4, 5, 20],
+                                             dtype='float32')
                 output7 = fluid.layers.scatter_nd(index7, updates7, shape)
             except Exception as e:
                 t = \
@@ -335,6 +335,7 @@ def check_raise_is_test():
 
 
 class TestDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         with fluid.dygraph.guard(fluid.CPUPlace()):
             index_data = np.array([[1, 1], [0, 1], [1, 3]]).astype(np.int64)
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index d7f8886dcd3c1..2fe162d809019 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -25,6 +25,7 @@
 
 
 class TestScatterOp(OpTest):
+
     def setUp(self):
         self.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -44,6 +45,7 @@ def test_check_grad(self):
 
 
 class TestScatterOp0(OpTest):
+
     def setUp(self):
         self.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -64,6 +66,7 @@ def test_check_grad(self):
 
 
 class TestScatterOp1(OpTest):
+
     def setUp(self):
         self.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -89,6 +92,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestScatterOp2(OpTest):
+
     def setUp(self):
         self.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -108,13 +112,15 @@ def test_check_output(self):
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['X', 'Updates'], 'Out', check_eager=False)
+            self.check_grad_with_place(place, ['X', 'Updates'],
+                                       'Out',
+                                       check_eager=False)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestScatterOp3(OpTest):
+
     def setUp(self):
         self.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -138,11 +144,13 @@ def test_check_output(self):
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['X', 'Updates'], 'Out', check_eager=False)
+            self.check_grad_with_place(place, ['X', 'Updates'],
+                                       'Out',
+                                       check_eager=False)
 
 
 class TestScatterOp4(OpTest):
+
     def setUp(self):
         self.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -164,6 +172,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestScatterOp5(OpTest):
+
     def setUp(self):
         self.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -183,11 +192,13 @@ def test_check_output(self):
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['X', 'Updates'], 'Out', check_eager=False)
+            self.check_grad_with_place(place, ['X', 'Updates'],
+                                       'Out',
+                                       check_eager=False)
 
 
 class TestScatterAPI(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -206,8 +217,8 @@ def check_static_result(self, place):
 
             input_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64)
             index_data = np.array([2, 1, 0, 1]).astype(np.int64)
-            updates_data = np.array(
-                [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64)
+            updates_data = np.array([[1, 1], [2, 2], [3, 3],
+                                     [4, 4]]).astype(np.float64)
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
@@ -229,8 +240,8 @@ def test_dygraph(self):
             with fluid.dygraph.guard(place):
                 x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float64)
                 index_data = np.array([2, 1, 0, 1]).astype(np.int64)
-                updates_data = np.array(
-                    [[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float64)
+                updates_data = np.array([[1, 1], [2, 2], [3, 3],
+                                         [4, 4]]).astype(np.float64)
 
                 x = fluid.dygraph.to_variable(x_data)
                 index = fluid.dygraph.to_variable(index_data)
@@ -250,9 +261,9 @@ def test_large_data(self):
 
         def test_dygraph():
             with fluid.dygraph.guard():
-                gpu_out = paddle.scatter(
-                    paddle.to_tensor(x),
-                    paddle.to_tensor(index), paddle.to_tensor(updates))
+                gpu_out = paddle.scatter(paddle.to_tensor(x),
+                                         paddle.to_tensor(index),
+                                         paddle.to_tensor(updates))
                 return gpu_out.numpy()
 
         @switch_to_static_graph
@@ -260,10 +271,12 @@ def test_static_graph():
             with paddle.static.program_guard(paddle.static.Program(),
                                              paddle.static.Program()):
                 x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape)
-                index_t = paddle.static.data(
-                    name="index", dtype=index.dtype, shape=index.shape)
-                updates_t = paddle.static.data(
-                    name="updates", dtype=updates.dtype, shape=updates.shape)
+                index_t = paddle.static.data(name="index",
+                                             dtype=index.dtype,
+                                             shape=index.shape)
+                updates_t = paddle.static.data(name="updates",
+                                               dtype=updates.dtype,
+                                               shape=updates.shape)
                 out_t = paddle.scatter(x_t, index_t, updates_t)
                 feed = {
                     x_t.name: x,
@@ -282,6 +295,7 @@ def test_static_graph():
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestScatterOpFp16(OpTest):
+
     def setUp(self):
         self.__class__.op_type = "scatter"
         self.python_api = paddle.scatter
@@ -301,8 +315,8 @@ def setUp(self):
         self.ref_dx[self.index_np] = zero_np
 
     def compute_ref_grad_updates(self):
-        ref_grad_updates = paddle.gather(
-            paddle.to_tensor(self.dout_np), paddle.to_tensor(self.index_np))
+        ref_grad_updates = paddle.gather(paddle.to_tensor(self.dout_np),
+                                         paddle.to_tensor(self.index_np))
         return ref_grad_updates
 
     def test_scatter_fp16(self):
@@ -311,19 +325,21 @@ def test_scatter_fp16(self):
         index_tensor = paddle.to_tensor(self.index_np)
         updates_tensor = paddle.to_tensor(self.updates_np, stop_gradient=False)
         out_tensor = paddle.scatter(x_tensor, index_tensor, updates_tensor)
-        paddle.autograd.backward(
-            [out_tensor], [paddle.to_tensor(self.dout_np)], retain_graph=True)
+        paddle.autograd.backward([out_tensor], [paddle.to_tensor(self.dout_np)],
+                                 retain_graph=True)
         ref_grad_updates = self.compute_ref_grad_updates()
-        np.testing.assert_allclose(
-            ref_grad_updates.numpy(),
-            updates_tensor.grad.numpy(),
-            rtol=1e-5,
-            atol=1e-5)
-        np.testing.assert_allclose(
-            self.ref_dx, x_tensor.grad.numpy(), rtol=1e-5, atol=1e-5)
+        np.testing.assert_allclose(ref_grad_updates.numpy(),
+                                   updates_tensor.grad.numpy(),
+                                   rtol=1e-5,
+                                   atol=1e-5)
+        np.testing.assert_allclose(self.ref_dx,
+                                   x_tensor.grad.numpy(),
+                                   rtol=1e-5,
+                                   atol=1e-5)
 
 
 class TestScatterInplaceAPI(TestScatterAPI):
+
     def executed_api(self):
         self.scatter = paddle.scatter_
 
diff --git a/python/paddle/fluid/tests/unittests/test_scope.py b/python/paddle/fluid/tests/unittests/test_scope.py
index 805aabd393e49..9e9f2472d4466 100644
--- a/python/paddle/fluid/tests/unittests/test_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_scope.py
@@ -20,6 +20,7 @@
 
 
 class TestScope(unittest.TestCase):
+
     def test_create_destroy(self):
         paddle_c = paddle.fluid.core
         scope = paddle_c.Scope()
diff --git a/python/paddle/fluid/tests/unittests/test_searchsorted_op.py b/python/paddle/fluid/tests/unittests/test_searchsorted_op.py
index f802b0adfcb2a..84aa4e858efd3 100644
--- a/python/paddle/fluid/tests/unittests/test_searchsorted_op.py
+++ b/python/paddle/fluid/tests/unittests/test_searchsorted_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class TestSearchSorted(OpTest):
+
     def setUp(self):
         self.python_api = paddle.searchsorted
         self.op_type = "searchsorted"
@@ -36,8 +37,8 @@ def setUp(self):
         self.attrs = {"out_int32": False, "right": False}
         self.attrs["right"] = True if self.side == 'right' else False
         self.outputs = {
-            'Out': np.searchsorted(
-                self.sorted_sequence, self.values, side=self.side)
+            'Out':
+            np.searchsorted(self.sorted_sequence, self.values, side=self.side)
         }
 
     def test_check_output(self):
@@ -50,6 +51,7 @@ def init_test_case(self):
 
 
 class TestSearchSortedOp1(TestSearchSorted):
+
     def init_test_case(self):
         self.sorted_sequence = np.array([1, 3, 5, 7, 9]).astype("int32")
         self.values = np.array([[3, 6, 9], [3, 6, 9]]).astype("int32")
@@ -57,6 +59,7 @@ def init_test_case(self):
 
 
 class TestSearchSortedOp2(TestSearchSorted):
+
     def init_test_case(self):
         self.sorted_sequence = np.array([1, 3, 5, 7, 9]).astype("int64")
         self.values = np.array([[3, 6, 9], [3, 6, 9]]).astype("int64")
@@ -64,22 +67,25 @@ def init_test_case(self):
 
 
 class TestSearchSortedOp3(TestSearchSorted):
+
     def init_test_case(self):
         self.sorted_sequence = np.array([1, 3, 5, 7, 9]).astype("float64")
-        self.values = np.array(
-            [[np.nan, np.nan, np.nan], [3, 6, 9]]).astype("float64")
+        self.values = np.array([[np.nan, np.nan, np.nan],
+                                [3, 6, 9]]).astype("float64")
         self.side = "left"
 
 
 class TestSearchSortedOp4(TestSearchSorted):
+
     def init_test_case(self):
         self.sorted_sequence = np.array([1, 3, 5, 7, 9]).astype("float64")
-        self.values = np.array(
-            [[np.inf, np.inf, np.inf], [3, 6, 9]]).astype("float64")
+        self.values = np.array([[np.inf, np.inf, np.inf],
+                                [3, 6, 9]]).astype("float64")
         self.side = "right"
 
 
 class TestSearchSortedOp5(TestSearchSorted):
+
     def init_test_case(self):
         self.sorted_sequence = np.array([1, 3, 5, 7, 9]).astype("float64")
         self.values = np.array([[np.inf, np.inf, np.inf],
@@ -88,6 +94,7 @@ def init_test_case(self):
 
 
 class TestSearchSortedAPI(unittest.TestCase):
+
     def init_test_case(self):
         self.sorted_sequence = np.array([2, 4, 6, 8, 10]).astype("float64")
         self.values = np.array([[3, 6, 9], [3, 6, 9]]).astype("float64")
@@ -107,8 +114,9 @@ def run(place):
                     'SortedSequence',
                     shape=self.sorted_sequence.shape,
                     dtype="float64")
-                values = paddle.static.data(
-                    'Values', shape=self.values.shape, dtype="float64")
+                values = paddle.static.data('Values',
+                                            shape=self.values.shape,
+                                            dtype="float64")
                 out = paddle.searchsorted(sorted_sequence, values)
                 exe = paddle.static.Executor(place)
                 res = exe.run(feed={
@@ -123,14 +131,16 @@ def run(place):
             run(place)
 
     def test_dygraph_api(self):
+
         def run(place):
 
             paddle.disable_static(place)
             sorted_sequence = paddle.to_tensor(self.sorted_sequence)
             values = paddle.to_tensor(self.values)
             out = paddle.searchsorted(sorted_sequence, values, right=True)
-            out_ref = np.searchsorted(
-                self.sorted_sequence, self.values, side='right')
+            out_ref = np.searchsorted(self.sorted_sequence,
+                                      self.values,
+                                      side='right')
             self.assertEqual(np.allclose(out_ref, out.numpy()), True)
             paddle.enable_static()
 
@@ -146,15 +156,18 @@ def test_out_int32(self):
 
 
 class TestSearchSortedError(unittest.TestCase):
+
     def test_error_api(self):
         paddle.enable_static()
 
         def test_searchsorted_dims_matched_before_lastdim_error1():
             with paddle.static.program_guard(paddle.static.Program()):
-                sorted_sequence = paddle.static.data(
-                    'SortedSequence', shape=[2, 2, 3], dtype="float64")
-                values = paddle.static.data(
-                    'Values', shape=[2, 5], dtype="float64")
+                sorted_sequence = paddle.static.data('SortedSequence',
+                                                     shape=[2, 2, 3],
+                                                     dtype="float64")
+                values = paddle.static.data('Values',
+                                            shape=[2, 5],
+                                            dtype="float64")
                 out = paddle.searchsorted(sorted_sequence, values)
 
         self.assertRaises(RuntimeError,
@@ -162,10 +175,12 @@ def test_searchsorted_dims_matched_before_lastdim_error1():
 
         def test_searchsorted_dims_matched_before_lastdim_error2():
             with paddle.static.program_guard(paddle.static.Program()):
-                sorted_sequence = paddle.static.data(
-                    'SortedSequence', shape=[2, 2, 3], dtype="float64")
-                values = paddle.static.data(
-                    'Values', shape=[2, 3, 5], dtype="float64")
+                sorted_sequence = paddle.static.data('SortedSequence',
+                                                     shape=[2, 2, 3],
+                                                     dtype="float64")
+                values = paddle.static.data('Values',
+                                            shape=[2, 3, 5],
+                                            dtype="float64")
                 out = paddle.searchsorted(sorted_sequence, values)
 
         self.assertRaises(RuntimeError,
@@ -173,22 +188,28 @@ def test_searchsorted_dims_matched_before_lastdim_error2():
 
         def test_searchsorted_sortedsequence_size_error():
             with paddle.static.program_guard(paddle.static.Program()):
-                sorted_sequence = paddle.static.data(
-                    'SortedSequence', shape=[2, 2, pow(2, 34)], dtype="float64")
-                values = paddle.static.data(
-                    'Values', shape=[2, 2, 5], dtype="float64")
-                out = paddle.searchsorted(
-                    sorted_sequence, values, out_int32=True)
+                sorted_sequence = paddle.static.data('SortedSequence',
+                                                     shape=[2, 2,
+                                                            pow(2, 34)],
+                                                     dtype="float64")
+                values = paddle.static.data('Values',
+                                            shape=[2, 2, 5],
+                                            dtype="float64")
+                out = paddle.searchsorted(sorted_sequence,
+                                          values,
+                                          out_int32=True)
 
         self.assertRaises(RuntimeError,
                           test_searchsorted_sortedsequence_size_error)
 
         def test_sortedsequence_values_type_error():
             with paddle.static.program_guard(paddle.static.Program()):
-                sorted_sequence = paddle.static.data(
-                    'SortedSequence', shape=[2, 3], dtype="int16")
-                values = paddle.static.data(
-                    'Values', shape=[2, 5], dtype="int16")
+                sorted_sequence = paddle.static.data('SortedSequence',
+                                                     shape=[2, 3],
+                                                     dtype="int16")
+                values = paddle.static.data('Values',
+                                            shape=[2, 5],
+                                            dtype="int16")
                 out = paddle.searchsorted(sorted_sequence, values)
 
         self.assertRaises(TypeError, test_sortedsequence_values_type_error)
diff --git a/python/paddle/fluid/tests/unittests/test_seed_op.py b/python/paddle/fluid/tests/unittests/test_seed_op.py
index 0dcc197ece7ed..3e4730a1fbca7 100644
--- a/python/paddle/fluid/tests/unittests/test_seed_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seed_op.py
@@ -24,6 +24,7 @@
 
 
 class TestSeedOpFixSeed(OpTest):
+
     def setUp(self):
         self.op_type = "seed"
         self.inputs = {}
@@ -35,6 +36,7 @@ def test_check_output(self):
 
 
 class TestSeedOpDiffSeed(OpTest):
+
     def setUp(self):
         self.op_type = "seed"
         self.inputs = {}
@@ -46,6 +48,7 @@ def test_check_output(self):
 
 
 class TestDropoutWithRandomSeedGenerator(unittest.TestCase):
+
     def setUp(self):
         paddle.framework.random.set_random_seed_generator('seed0', 123)
         paddle.framework.random.set_random_seed_generator('seed1', 123)
diff --git a/python/paddle/fluid/tests/unittests/test_segment_ops.py b/python/paddle/fluid/tests/unittests/test_segment_ops.py
index 90d597837a8e1..678a888eeda5d 100644
--- a/python/paddle/fluid/tests/unittests/test_segment_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_segment_ops.py
@@ -85,6 +85,7 @@ def segment_pool_split(X, SegmentIds, pooltype):
 
 
 class TestSegmentOps(OpTest):
+
     def set_data(self):
         x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
         segment_ids = self.set_segment(len(x), len(x) // 5 + 1)
@@ -125,6 +126,7 @@ def test_check_grad(self):
 
 
 class TestSegmentSum2(TestSegmentOps):
+
     def prepare(self):
         super(TestSegmentSum2, self).prepare()
         self.shape = [40, 20]
@@ -142,6 +144,7 @@ def setUp(self):
 
 
 class TestSegmentMax(TestSegmentOps):
+
     def compute(self, x, segment_ids):
         return compute_segment_min_max(x, segment_ids, pooltype="MAX")
 
@@ -165,12 +168,14 @@ def test_check_grad(self):
 
 
 class TestSegmentMax2(TestSegmentMax):
+
     def prepare(self):
         super(TestSegmentMax2, self).prepare()
         self.dtype = np.float32
 
 
 class TestSegmentMin(TestSegmentMax):
+
     def compute(self, x, segment_ids):
         return compute_segment_min_max(x, segment_ids, pooltype="MIN")
 
@@ -180,12 +185,14 @@ def prepare(self):
 
 
 class TestSegmentMin2(TestSegmentMin):
+
     def prepare(self):
         super(TestSegmentMin2, self).prepare()
         self.dtype = np.float32
 
 
 class TestSegmentMean(TestSegmentOps):
+
     def compute(self, x, segment_ids):
         return compute_segment_mean(x, segment_ids)
 
@@ -200,13 +207,16 @@ def setUp(self):
         result = self.compute(x, segment_ids)
         self.inputs = {'X': x, 'SegmentIds': segment_ids}
         self.outputs = {
-            'Out': result,
-            'SummedIds': compute_segment_sum(
+            'Out':
+            result,
+            'SummedIds':
+            compute_segment_sum(
                 np.ones([len(x), 1]).astype(self.dtype), segment_ids)
         }
 
 
 class TestSegmentMean2(TestSegmentMean):
+
     def prepare(self):
         super(TestSegmentMean2, self).prepare()
         self.dtype = np.float32
@@ -215,6 +225,7 @@ def prepare(self):
 
 
 class API_SegmentOpsTest(unittest.TestCase):
+
     def test_static(self):
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.static.data(name="x", shape=[3, 3], dtype="float32")
@@ -234,22 +245,22 @@ def test_static(self):
             np_max = np.array([[3, 2, 3], [4, 5, 6]], dtype="float32")
             np_min = np.array([[1, 2, 1], [4, 5, 6]], dtype="float32")
 
-            ret = exe.run(feed={'x': data1,
-                                'y': data2},
+            ret = exe.run(feed={
+                'x': data1,
+                'y': data2
+            },
                           fetch_list=[res_sum, res_mean, res_max, res_min])
 
         for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret):
             self.assertTrue(
-                np.allclose(
-                    np_res, ret_res, atol=1e-6),
-                "two value is\
+                np.allclose(np_res, ret_res, atol=1e-6), "two value is\
                 {}\n{}, check diff!".format(np_res, ret_res))
 
     def test_dygraph(self):
         device = paddle.CPUPlace()
         with paddle.fluid.dygraph.guard(device):
-            x = paddle.to_tensor(
-                [[1, 2, 3], [3, 2, 1], [4, 5, 6]], dtype='float32')
+            x = paddle.to_tensor([[1, 2, 3], [3, 2, 1], [4, 5, 6]],
+                                 dtype='float32')
             y = paddle.to_tensor([0, 0, 1], dtype="int32")
             res_sum = paddle.incubate.segment_sum(x, y)
             res_mean = paddle.incubate.segment_mean(x, y)
@@ -265,9 +276,7 @@ def test_dygraph(self):
 
         for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret):
             self.assertTrue(
-                np.allclose(
-                    np_res, ret_res.numpy(), atol=1e-6),
-                "two value is\
+                np.allclose(np_res, ret_res.numpy(), atol=1e-6), "two value is\
                 {}\n{}, check diff!".format(np_res, ret_res))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_select_input_output_op.py b/python/paddle/fluid/tests/unittests/test_select_input_output_op.py
index 23b394516fc13..8a41e05d1d52a 100644
--- a/python/paddle/fluid/tests/unittests/test_select_input_output_op.py
+++ b/python/paddle/fluid/tests/unittests/test_select_input_output_op.py
@@ -26,6 +26,7 @@
 
 
 class TestSplitMergeSelectedVarOps(unittest.TestCase):
+
     def test_forward_backward_list_output(self):
         for branch_num in range(2, 10):
             program = Program()
@@ -45,16 +46,18 @@ def test_forward_backward_list_output(self):
                 mean = layers.mean(y)
                 append_backward(mean)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = Executor(place)
 
             feed_x = np.asarray([1.3, -1.4]).astype(np.float32)
             for i in range(branch_num):
                 feed_mask = np.asarray([i]).astype(np.int32)
                 ret = exe.run(program,
-                              feed={'x': feed_x,
-                                    'mask': feed_mask},
+                              feed={
+                                  'x': feed_x,
+                                  'mask': feed_mask
+                              },
                               fetch_list=[y.name, x.grad_name])
                 x_grad = np.asarray([0.5, 0.5]).astype(np.float32)
                 self.assertTrue(np.allclose(np.asarray(ret[0]), feed_x))
@@ -62,6 +65,7 @@ def test_forward_backward_list_output(self):
 
 
 class TestSelectInputOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             mask = layers.data(name='mask', shape=[1], dtype='int32')
@@ -88,14 +92,17 @@ def test_mask_dtype():
 
 
 class TestSelectOutput_Error(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
             in1 = layers.data(name='in1', shape=[1], dtype='int32')
-            mask_int32 = layers.data(
-                name='mask_int32', shape=[1], dtype='int32')
-            mask_float32 = layers.data(
-                name='mask_float32', shape=[1], dtype='float32')
+            mask_int32 = layers.data(name='mask_int32',
+                                     shape=[1],
+                                     dtype='int32')
+            mask_float32 = layers.data(name='mask_float32',
+                                       shape=[1],
+                                       dtype='float32')
             out1 = layers.data(name='out1', shape=[1], dtype='int32')
 
             # 1. The type of input in select_output must Variable.
diff --git a/python/paddle/fluid/tests/unittests/test_selected_rows.py b/python/paddle/fluid/tests/unittests/test_selected_rows.py
index 2f34f79b8eafa..31023ef60904a 100644
--- a/python/paddle/fluid/tests/unittests/test_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_selected_rows.py
@@ -20,6 +20,7 @@
 
 
 class TestSelectedRows(unittest.TestCase):
+
     def test_selected_rows(self):
         place = core.CPUPlace()
         height = 10
diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py
index f16198817945a..6807f96109e7c 100644
--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_selu_op.py
@@ -40,6 +40,7 @@ def ref_selu(x,
 
 
 class SeluTest(OpTest):
+
     def setUp(self):
         self.op_type = "selu"
         self.python_api = paddle.nn.functional.selu
@@ -129,18 +130,21 @@ def test_errors(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.selu, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[12, 10], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[12, 10],
+                                        dtype='int32')
             self.assertRaises(TypeError, F.selu, x_int32)
             # The scale must be greater than 1.0
-            x_fp32 = paddle.fluid.data(
-                name='x_fp32', shape=[12, 10], dtype='float32')
+            x_fp32 = paddle.fluid.data(name='x_fp32',
+                                       shape=[12, 10],
+                                       dtype='float32')
             self.assertRaises(ValueError, F.selu, x_fp32, -1.0)
             # The alpha must be no less than 0
             self.assertRaises(ValueError, F.selu, x_fp32, 1.6, -1.0)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[12, 10], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[12, 10],
+                                       dtype='float16')
             F.selu(x_fp16)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py
index 827f63fe823ba..a089563b6ecd0 100644
--- a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py
+++ b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py
@@ -17,6 +17,7 @@
 
 
 class TestAttrSet(unittest.TestCase):
+
     def test_set_bool_attr(self):
         x = fluid.layers.data(name='x', shape=[3, 7, 3, 7], dtype='float32')
         param_attr = fluid.ParamAttr(
@@ -25,8 +26,9 @@ def test_set_bool_attr(self):
         bias_attr = fluid.ParamAttr(
             name='batch_norm_b',
             initializer=fluid.initializer.Constant(value=0.0))
-        bn = fluid.layers.batch_norm(
-            input=x, param_attr=param_attr, bias_attr=bias_attr)
+        bn = fluid.layers.batch_norm(input=x,
+                                     param_attr=param_attr,
+                                     bias_attr=bias_attr)
         block = fluid.default_main_program().desc.block(0)
         op = block.op(0)
         before_type = op.attr_type('is_test')
diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py
index 8f9801780cd9d..9aee71af41675 100644
--- a/python/paddle/fluid/tests/unittests/test_set_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py
@@ -20,12 +20,14 @@
 import numpy as np
 
 import paddle
+import paddle.fluid as fluid
 from paddle.fluid.layer_helper import LayerHelper
 from functools import reduce
 from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph
 
 
 class TestSetValueBase(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         self.set_dtype()
@@ -51,6 +53,7 @@ def _get_answer(self):
 
 
 class TestSetValueApi(TestSetValueBase):
+
     def _run_static(self):
         paddle.enable_static()
         with paddle.static.program_guard(self.program):
@@ -76,12 +79,10 @@ def func_test_api(self):
         self._get_answer()
 
         error_msg = "\nIn {} mode: \nExpected res = \n{}, \n\nbut received : \n{}"
-        self.assertTrue(
-            (self.data == static_out).all(),
-            msg=error_msg.format("static", self.data, static_out))
-        self.assertTrue(
-            (self.data == dynamic_out).all(),
-            msg=error_msg.format("dynamic", self.data, dynamic_out))
+        self.assertTrue((self.data == static_out).all(),
+                        msg=error_msg.format("static", self.data, static_out))
+        self.assertTrue((self.data == dynamic_out).all(),
+                        msg=error_msg.format("dynamic", self.data, dynamic_out))
 
     def test_api(self):
         with _test_eager_guard():
@@ -92,6 +93,7 @@ def test_api(self):
 # 1. Test different type of item: int, Python slice, Paddle Tensor
 # 1.1 item is int
 class TestSetValueItemInt(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0] = self.value
 
@@ -102,6 +104,7 @@ def _get_answer(self):
 # 1.2 item is slice
 # 1.2.1 step is 1
 class TestSetValueItemSlice(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:2] = self.value
 
@@ -110,6 +113,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSlice2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:-1] = self.value
 
@@ -118,6 +122,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSlice3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:-1, 0:2] = self.value
 
@@ -126,6 +131,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSlice4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, 1:2, :] = self.value
 
@@ -134,6 +140,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSlice5(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, 1:1, :] = self.value
 
@@ -142,7 +149,9 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceInWhile(TestSetValueApi):
+
     def _call_setitem(self, x):
+
         def cond(i, x):
             return i < 1
 
@@ -160,6 +169,7 @@ def _get_answer(self):
 
 # 1.2.2 step > 1
 class TestSetValueItemSliceStep(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [5, 5, 5]
 
@@ -171,6 +181,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceStep2(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [7, 5, 5]
 
@@ -182,6 +193,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceStep3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:-1, 0:2, ::2] = self.value
 
@@ -190,6 +202,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceStep4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, 1:2:2, :] = self.value
 
@@ -199,6 +212,7 @@ def _get_answer(self):
 
 # 1.2.3 step < 0
 class TestSetValueItemSliceNegetiveStep(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [5, 2]
 
@@ -213,6 +227,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceNegetiveStep2(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [5]
 
@@ -227,6 +242,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceNegetiveStep3(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [3]
 
@@ -241,6 +257,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemSliceNegetiveStep4(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [3, 4, 5]
 
@@ -255,6 +272,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis1(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, ..., 1:] = self.value
 
@@ -263,6 +281,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0:, ...] = self.value
 
@@ -271,6 +290,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[..., 1:] = self.value
 
@@ -279,6 +299,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemEllipsis4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[...] = self.value
 
@@ -288,6 +309,7 @@ def _get_answer(self):
 
 # 1.4 item is Paddle Tensor
 class TestSetValueItemTensor(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         x[zero] = self.value
@@ -297,6 +319,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor2(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -307,6 +330,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor3(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -317,6 +341,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor4(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -327,6 +352,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor5(TestSetValueApi):
+
     def _call_setitem(self, x):
         zero = paddle.full([1], 0, dtype="int32")
         two = paddle.full([1], 2, dtype="int64")
@@ -337,6 +363,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemTensor6(TestSetValueApi):
+
     def set_shape(self):
         self.shape = [3, 4, 5]
 
@@ -351,6 +378,7 @@ def _get_answer(self):
 
 # 1.5 item is None
 class TestSetValueItemNone1(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[None] = self.value
 
@@ -359,6 +387,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, None, 1] = self.value
 
@@ -367,6 +396,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[:, None, None, 1] = self.value
 
@@ -375,6 +405,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone4(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, 0, None, 1] = self.value
 
@@ -383,6 +414,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone5(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, None, 0, None, 1] = self.value
 
@@ -391,6 +423,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone6(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[None, 0, 0, None, 0] = self.value
 
@@ -399,6 +432,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone7(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[:, None, 1] = np.zeros(self.shape)[:, None, 0]
 
@@ -407,6 +441,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone8(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[:, 1, None] = np.zeros(self.shape)[:, 0, None]
 
@@ -415,6 +450,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone9(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[None, :, 1, ..., None] = np.zeros(self.shape)[0, 0, :, None]
 
@@ -423,6 +459,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemNone10(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[..., None, :, None] = np.zeros(self.shape)[..., None, :, None]
 
@@ -432,6 +469,7 @@ def _get_answer(self):
 
 # 1.5 item is list or Tensor of bol
 class TestSetValueItemBool1(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[[True, False]] = self.value
 
@@ -440,6 +478,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool2(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[[False, False]] = self.value
 
@@ -448,6 +487,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool3(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[[False, True]] = np.zeros(self.shape[2])
 
@@ -456,6 +496,7 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool4(TestSetValueApi):
+
     def _call_setitem(self, x):
         idx = paddle.assign(np.array([False, True]))
         x[idx] = np.zeros(self.shape[2])
@@ -465,17 +506,19 @@ def _get_answer(self):
 
 
 class TestSetValueItemBool5(TestSetValueApi):
+
     def _call_setitem(self, x):
         idx = paddle.assign(
             np.array([[False, True, False], [True, True, False]]))
         x[idx] = self.value
 
     def _get_answer(self):
-        self.data[np.array([[False, True, False], [True, True, False]
-                            ])] = self.value
+        self.data[np.array([[False, True, False], [True, True,
+                                                   False]])] = self.value
 
 
 class TestSetValueItemBool6(TestSetValueApi):
+
     def _call_setitem(self, x):
         x[0, ...] = 0
         x[x > 0] = self.value
@@ -490,7 +533,9 @@ def _get_answer(self):
 
 
 def create_test_value_int32(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = 7
 
@@ -510,7 +555,9 @@ def set_dtype(self):
 
 
 def create_test_value_int64(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = 7
 
@@ -530,7 +577,9 @@ def set_dtype(self):
 
 
 def create_test_value_fp32(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = 3.3
 
@@ -550,7 +599,9 @@ def set_dtype(self):
 
 
 def create_test_value_fp64(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = 2.0**127  # float32:[-2^128, 2^128)
 
@@ -570,7 +621,9 @@ def set_dtype(self):
 
 
 def create_test_value_bool(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = 0
 
@@ -591,7 +644,9 @@ def set_dtype(self):
 
 # 2.2 value is numpy.array (int32, int64, float32, float64, bool)
 def create_test_value_numpy_int32(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = np.array([5])
 
@@ -611,7 +666,9 @@ def set_dtype(self):
 
 
 def create_test_value_numpy_int64(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = np.array([1])
 
@@ -631,7 +688,9 @@ def set_dtype(self):
 
 
 def create_test_value_numpy_fp32(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = np.array([1])
 
@@ -651,7 +710,9 @@ def set_dtype(self):
 
 
 def create_test_value_numpy_fp64(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = np.array([2**127]).astype("float64")
 
@@ -671,7 +732,9 @@ def set_dtype(self):
 
 
 def create_test_value_numpy_bool(parent):
+
     class TestValueInt(parent):
+
         def set_value(self):
             self.value = np.array([0])
 
@@ -692,7 +755,9 @@ def set_dtype(self):
 
 # 2.3 value is a Paddle Tensor (int32, int64, float32, float64, bool)
 def create_test_value_tensor_int32(parent):
+
     class TestValueInt(parent):
+
         def set_dtype(self):
             self.dtype = "int32"
 
@@ -716,7 +781,9 @@ def _get_answer(self):
 
 
 def create_test_value_tensor_int64(parent):
+
     class TestValueInt(parent):
+
         def set_dtype(self):
             self.dtype = "int64"
 
@@ -740,7 +807,9 @@ def _get_answer(self):
 
 
 def create_test_value_tensor_fp32(parent):
+
     class TestValueInt(parent):
+
         def set_dtype(self):
             self.dtype = "float32"
 
@@ -764,7 +833,9 @@ def _get_answer(self):
 
 
 def create_test_value_tensor_fp64(parent):
+
     class TestValueInt(parent):
+
         def set_dtype(self):
             self.dtype = "float64"
 
@@ -788,7 +859,9 @@ def _get_answer(self):
 
 
 def create_test_value_tensor_bool(parent):
+
     class TestValueInt(parent):
+
         def set_dtype(self):
             self.dtype = "bool"
 
@@ -813,6 +886,7 @@ def _get_answer(self):
 
 # 3. Test different shape of value
 class TestSetValueValueShape1(TestSetValueApi):
+
     def set_value(self):
         self.value = np.array([3, 4, 5, 6])  # shape is (4,)
 
@@ -824,6 +898,7 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape2(TestSetValueApi):
+
     def set_value(self):
         self.value = np.array([[3, 4, 5, 6]])  # shape is (1,4)
 
@@ -835,9 +910,10 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape3(TestSetValueApi):
+
     def set_value(self):
-        self.value = np.array(
-            [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]])  # shape is (3,4)
+        self.value = np.array([[1, 1, 1, 1], [2, 2, 2, 2],
+                               [3, 3, 3, 3]])  # shape is (3,4)
 
     def _call_setitem(self, x):
         x[0] = self.value
@@ -847,10 +923,11 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape4(TestSetValueApi):
+
     def set_value(self):
-        self.value = np.array(
-            [[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]]).astype(
-                self.dtype)  # shape is (3,4)
+        self.value = np.array([[1, 1, 1, 1], [2, 2, 2, 2],
+                               [3, 3, 3,
+                                3]]).astype(self.dtype)  # shape is (3,4)
 
     def _call_setitem(self, x):
         x[0] = paddle.assign(self.value)  # x is Paddle.Tensor
@@ -860,6 +937,7 @@ def _get_answer(self):
 
 
 class TestSetValueValueShape5(TestSetValueApi):
+
     def set_value(self):
         self.value = np.array([3, 3, 3]).astype(self.dtype)
 
@@ -875,6 +953,7 @@ def _get_answer(self):
 
 # 4. Test error
 class TestError(TestSetValueBase):
+
     def _value_type_error(self):
         with self.assertRaisesRegexp(
                 TypeError,
@@ -947,6 +1026,7 @@ def test_error(self):
 
 
 class Model(paddle.nn.Layer):
+
     def __init__(self):
         super(Model, self).__init__()
         self.conv = paddle.nn.Conv2D(12, 12, 3)
@@ -962,6 +1042,7 @@ def forward(self, x, y):
 
 
 class TestBackward(unittest.TestCase):
+
     def test_static(self):
         paddle.enable_static()
         main_program = paddle.static.Program()
@@ -975,8 +1056,9 @@ def test_static(self):
             x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
             y = paddle.static.data(name="y", shape=[4, 4], dtype='float32')
 
-            label = paddle.static.data(
-                name="label", shape=[4, 1], dtype='int64')
+            label = paddle.static.data(name="label",
+                                       shape=[4, 1],
+                                       dtype='int64')
 
             z = paddle.add(x, y)
             var = y[0, :]
@@ -984,8 +1066,8 @@ def test_static(self):
 
             prediction = paddle.static.nn.fc(x=z, size=2, activation='softmax')
 
-            cost = paddle.nn.functional.cross_entropy(
-                input=prediction, label=label)
+            cost = paddle.nn.functional.cross_entropy(input=prediction,
+                                                      label=label)
             loss = paddle.mean(cost)
             sgd = paddle.optimizer.SGD(learning_rate=0.01)
             sgd.minimize(loss)
@@ -995,9 +1077,11 @@ def test_static(self):
 
         var_grad, z_grad = exe.run(
             main_program,
-            feed={"x": x_np,
-                  "y": y_np,
-                  "label": label_np},
+            feed={
+                "x": x_np,
+                "y": y_np,
+                "label": label_np
+            },
             fetch_list=[var.name + "@GRAD", z.name + "@GRAD"])
 
         self.assertTrue((var_grad == z_grad[0, :]).all())
@@ -1014,12 +1098,15 @@ def func_test_dynamic(self):
         self.assertTrue((0 == x.grad[0, :, 0, 0]).all())
 
     def test_dynamic(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_test_dynamic()
         self.func_test_dynamic()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 class TestGradientTruncated(unittest.TestCase):
+
     def func_test_consistent_with_competitor(self):
         paddle.disable_static()
 
@@ -1030,8 +1117,8 @@ def set_value(t, value):
             return y.sum()
 
         # case 1
-        array = np.arange(
-            1, 1 + 2 * 3 * 4, dtype="float32").reshape([1, 2, 1, 3, 1, 4])
+        array = np.arange(1, 1 + 2 * 3 * 4,
+                          dtype="float32").reshape([1, 2, 1, 3, 1, 4])
         value = np.arange(100, 104, dtype="float32").reshape(1, 4)
 
         inps = paddle.to_tensor(array, stop_gradient=False)
@@ -1041,10 +1128,11 @@ def set_value(t, value):
         loss.backward()
 
         value_grad = np.array([[600., 606., 612., 618.]])
-        input_grad = np.array(
-            [[[[[[4., 32., 108., 256.]], [[500., 864., 1372., 2048.]],
-                [[2916., 4000., 5324., 6912.]]]],
-              [[[[0., 0., 0., 0.]], [[0., 0., 0., 0.]], [[0., 0., 0., 0.]]]]]])
+        input_grad = np.array([[[[[[4., 32., 108., 256.]],
+                                  [[500., 864., 1372., 2048.]],
+                                  [[2916., 4000., 5324., 6912.]]]],
+                                [[[[0., 0., 0., 0.]], [[0., 0., 0., 0.]],
+                                  [[0., 0., 0., 0.]]]]]])
         self.assertTrue(
             np.array_equal(inps.grad.numpy(), input_grad),
             msg="The gradient of value should be \n{},\n but reveived {}".
@@ -1065,11 +1153,12 @@ def set_value(t, value):
         loss.backward()
 
         value_grad2 = np.array([600.])
-        input_grad2 = np.array(
-            [[[4., 32., 108.], [0., 0., 0.]], [[1372., 2048., 2916.],
-                                               [4000., 5324., 6912.]],
-             [[8788., 10976., 13500.], [16384., 19652., 23328.]],
-             [[27436., 32000., 37044.], [42592., 48668., 55296.]]])
+        input_grad2 = np.array([[[4., 32., 108.], [0., 0., 0.]],
+                                [[1372., 2048., 2916.], [4000., 5324., 6912.]],
+                                [[8788., 10976., 13500.],
+                                 [16384., 19652., 23328.]],
+                                [[27436., 32000., 37044.],
+                                 [42592., 48668., 55296.]]])
         self.assertTrue(
             np.array_equal(inps2.grad.numpy(), input_grad2),
             msg="The gradient of value should be \n{},\n but reveived {}".
@@ -1086,8 +1175,8 @@ def set_value3(t, value):
             y = a * a
             return y.sum()
 
-        array = np.arange(
-            1, 1 + 2 * 3 * 4, dtype="float32").reshape([4, 3, 1, 1, 2, 1])
+        array = np.arange(1, 1 + 2 * 3 * 4,
+                          dtype="float32").reshape([4, 3, 1, 1, 2, 1])
         value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1)
 
         inps = paddle.to_tensor(array, stop_gradient=False)
@@ -1097,14 +1186,16 @@ def set_value3(t, value):
         loss.backward()
 
         value_grad = np.array([[[600.], [606.]]])
-        input_grad = np.array(
-            [[[[[[0.], [0.]]]], [[[[0.], [0.]]]], [[[[0.], [0.]]]]],
-             [[[[[1372.], [2048.]]]], [[[[2916.], [4000.]]]],
-              [[[[5324.], [6912.]]]]], [[[[[8788.], [10976.]]]], [[[[13500.],
-                                                                    [16384.]]]],
-                                        [[[[19652.], [23328.]]]]],
-             [[[[[27436.], [32000.]]]], [[[[37044.], [42592.]]]],
-              [[[[48668.], [55296.]]]]]])
+        input_grad = np.array([[[[[[0.], [0.]]]], [[[[0.], [0.]]]],
+                                [[[[0.], [0.]]]]],
+                               [[[[[1372.], [2048.]]]], [[[[2916.], [4000.]]]],
+                                [[[[5324.], [6912.]]]]],
+                               [[[[[8788.], [10976.]]]], [[[[13500.],
+                                                            [16384.]]]],
+                                [[[[19652.], [23328.]]]]],
+                               [[[[[27436.], [32000.]]]],
+                                [[[[37044.], [42592.]]]],
+                                [[[[48668.], [55296.]]]]]])
         self.assertTrue(
             np.array_equal(inps.grad.numpy(), input_grad),
             msg="The gradient of value should be \n{},\n but reveived {}".
@@ -1121,8 +1212,8 @@ def set_value4(t, value):
             y = a * a
             return y.sum()
 
-        array = np.arange(
-            1, 1 + 2 * 3 * 4, dtype="float32").reshape([2, 3, 1, 4, 1])
+        array = np.arange(1, 1 + 2 * 3 * 4,
+                          dtype="float32").reshape([2, 3, 1, 4, 1])
         value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1)
 
         inps = paddle.to_tensor(array, stop_gradient=False)
@@ -1132,8 +1223,8 @@ def set_value4(t, value):
         loss.backward()
 
         value_grad = np.array([[[600.], [606.]]])
-        input_grad = np.array([[[[[0.], [32.], [108.],
-                                  [0.]]], [[[0.], [864.], [1372.], [0.]]],
+        input_grad = np.array([[[[[0.], [32.], [108.], [0.]]],
+                                [[[0.], [864.], [1372.], [0.]]],
                                 [[[0.], [4000.], [5324.], [0.]]]],
                                [[[[8788.], [10976.], [13500.], [16384.]]],
                                 [[[19652.], [23328.], [27436.], [32000.]]],
@@ -1163,8 +1254,8 @@ def set_value5(t, value):
         loss = set_value5(inps, value)
         loss.backward()
 
-        value_grad = np.array([[200., 202., 204., 206.],
-                               [208., 210., 212., 214.],
+        value_grad = np.array([[200., 202., 204.,
+                                206.], [208., 210., 212., 214.],
                                [216., 218., 220., 222.]])
         input_grad = np.array([[[0., 0., 0., 0.], [0., 0., 0., 0.],
                                 [0., 0., 0., 0.]],
@@ -1205,38 +1296,49 @@ def test_static_graph(self):
 
         def op1(x):
             value = paddle.fluid.layers.fill_constant([1], "float32", 1)
-            # test stop_gradient 
+            # test stop_gradient
             value.stop_gradient = True
             x.stop_gradient = False
-            start = paddle.fluid.layers.fill_constant(
-                [1], "int32", 5, force_cpu=True)
-            end = paddle.fluid.layers.fill_constant(
-                [1], "int32", 0, force_cpu=True)
-            step = paddle.fluid.layers.fill_constant(
-                [1], "int32", -2, force_cpu=True)
+            start = paddle.fluid.layers.fill_constant([1],
+                                                      "int32",
+                                                      5,
+                                                      force_cpu=True)
+            end = paddle.fluid.layers.fill_constant([1],
+                                                    "int32",
+                                                    0,
+                                                    force_cpu=True)
+            step = paddle.fluid.layers.fill_constant([1],
+                                                     "int32",
+                                                     -2,
+                                                     force_cpu=True)
 
             inputs = {
                 'Input': x,
                 'ValueTensor': value,
-                'StartsTensorList': [start, ],
-                'EndsTensorList': [end, ],
-                'StepsTensorList': [step, ]
+                'StartsTensorList': [
+                    start,
+                ],
+                'EndsTensorList': [
+                    end,
+                ],
+                'StepsTensorList': [
+                    step,
+                ]
             }
 
             helper = LayerHelper("set_value")
             y = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-            helper.append_op(
-                type="set_value",
-                inputs=inputs,
-                outputs={'Out': y},
-                attrs={'axes': [0]})
+            helper.append_op(type="set_value",
+                             inputs=inputs,
+                             outputs={'Out': y},
+                             attrs={'axes': [0]})
 
             return y, value
 
         def op2(x):
             value = paddle.fluid.layers.fill_constant([1, 3, 2], "float32", 1)
-            # test stop_gradient 
+            # test stop_gradient
             value.stop_gradient = False
             x.stop_gradient = False
             attrs = {
@@ -1253,11 +1355,10 @@ def op2(x):
             helper = LayerHelper("set_value")
             y = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-            helper.append_op(
-                type="set_value",
-                inputs=inputs,
-                outputs={'Out': y},
-                attrs=attrs)
+            helper.append_op(type="set_value",
+                             inputs=inputs,
+                             outputs={'Out': y},
+                             attrs=attrs)
 
             return y, value
 
@@ -1265,39 +1366,51 @@ def op3(x):
             value = paddle.fluid.layers.fill_constant([1], "float32", 1)
             x.stop_gradient = True
             value.stop_gradient = False
-            start = paddle.fluid.layers.fill_constant(
-                [1], "int32", 0, force_cpu=True)
-            end = paddle.fluid.layers.fill_constant(
-                [1], "int32", 5, force_cpu=True)
-            step = paddle.fluid.layers.fill_constant(
-                [1], "int32", 3, force_cpu=True)
+            start = paddle.fluid.layers.fill_constant([1],
+                                                      "int32",
+                                                      0,
+                                                      force_cpu=True)
+            end = paddle.fluid.layers.fill_constant([1],
+                                                    "int32",
+                                                    5,
+                                                    force_cpu=True)
+            step = paddle.fluid.layers.fill_constant([1],
+                                                     "int32",
+                                                     3,
+                                                     force_cpu=True)
 
             inputs = {
                 'Input': x,
                 'ValueTensor': value,
-                'StartsTensorList': [start, ],
-                'EndsTensorList': [end, ],
-                'StepsTensorList': [step, ]
+                'StartsTensorList': [
+                    start,
+                ],
+                'EndsTensorList': [
+                    end,
+                ],
+                'StepsTensorList': [
+                    step,
+                ]
             }
 
             helper = LayerHelper("set_value")
             y = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-            helper.append_op(
-                type="set_value",
-                inputs=inputs,
-                outputs={'Out': y},
-                attrs={'axes': [0]})
+            helper.append_op(type="set_value",
+                             inputs=inputs,
+                             outputs={'Out': y},
+                             attrs={'axes': [0]})
 
             return y, value
 
         def set_value(array, i, op):
             name_x = to_string('x', i)
-            x = paddle.static.data(
-                name=name_x, shape=array.shape, dtype='float32')
+            x = paddle.static.data(name=name_x,
+                                   shape=array.shape,
+                                   dtype='float32')
 
-            # set_value_op in __get/setitem__ is an inplace operation. 
-            # When `input.stop_gradient = True` and `value.stop_gradient = False`, 
+            # set_value_op in __get/setitem__ is an inplace operation.
+            # When `input.stop_gradient = True` and `value.stop_gradient = False`,
             # set_value_grad_op will not be run during backward.
             y, value = op(x)
 
@@ -1322,8 +1435,8 @@ def set_value(array, i, op):
 
         input_shape = [7, 6, 5, 4, 3, 2]
 
-        array = np.arange(
-            0, numel(input_shape), dtype="float32").reshape(input_shape)
+        array = np.arange(0, numel(input_shape),
+                          dtype="float32").reshape(input_shape)
 
         for i in range(len(input_shape)):
             program = paddle.static.Program()
@@ -1347,6 +1460,7 @@ def set_value(array, i, op):
 
 
 class TestSetValueInplace(unittest.TestCase):
+
     def test_inplace(self):
         paddle.disable_static()
         with paddle.fluid.dygraph.guard():
@@ -1365,6 +1479,7 @@ def test_inplace(self):
 
 
 class TestSetValueInplaceLeafVar(unittest.TestCase):
+
     def test_inplace_var_become_leaf_var(self):
         paddle.disable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index ad03fa30009e7..8e00d905a3520 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -27,6 +27,7 @@
 
 
 class TestSGDOp(OpTest):
+
     def setUp(self):
         self.op_type = "sgd"
         self.conf()
@@ -46,16 +47,18 @@ def test_check_output(self):
 
 
 class TestSGDOpCase8X(TestSGDOp):
+
     def conf(self):
         self.h = 10
         self.w = 64
 
 
 class TestSparseSGDOp(unittest.TestCase):
+
     def check_with_place(self, place):
         scope = core.Scope()
 
-        # create and initialize Grad Variable   
+        # create and initialize Grad Variable
         height = 10
         rows = [0, 4, 7]
         self.conf()
@@ -81,12 +84,11 @@ def check_with_place(self, place):
         lr.set(lr_array, place)
 
         # create and run sgd operator
-        sgd_op = Operator(
-            "sgd",
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            LearningRate='LearningRate')
+        sgd_op = Operator("sgd",
+                          Param='Param',
+                          Grad='Grad',
+                          ParamOut='Param',
+                          LearningRate='LearningRate')
         sgd_op.run(scope, place)
 
         # get and compare result
@@ -119,11 +121,13 @@ def conf(self):
 
 
 class TestSparseSGDOpCase8X(TestSparseSGDOp):
+
     def conf(self):
         self.row_numel = 16
 
 
 class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
+
     def check_with_place(self, place):
         scope = core.Scope()
 
@@ -168,16 +172,15 @@ def check_with_place(self, place):
         # optimize with Python
         w_after_optimize = np.copy(w_before_optimize)
         for index, id in enumerate(grad_rows):
-            w_after_optimize[id] = w_before_optimize[
-                id] - lr_value * grad_array[index]
+            w_after_optimize[
+                id] = w_before_optimize[id] - lr_value * grad_array[index]
 
         # create and run sgd operator
-        sgd_op = Operator(
-            "sgd",
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            LearningRate='LearningRate')
+        sgd_op = Operator("sgd",
+                          Param='Param',
+                          Grad='Grad',
+                          ParamOut='Param',
+                          LearningRate='LearningRate')
         sgd_op.run(scope, place)
 
         # get and compare result
@@ -192,11 +195,13 @@ def test_sparse_parameter_sgd(self):
 
 
 class TestSGDOpWithLargeInput(unittest.TestCase):
+
     def runTest(self):
         paddle.enable_static()
         data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64')
-        label = fluid.layers.fill_constant(
-            shape=[1, 150], value=0.5, dtype='float32')
+        label = fluid.layers.fill_constant(shape=[1, 150],
+                                           value=0.5,
+                                           dtype='float32')
         emb = fluid.embedding(input=data, size=(10000000, 150), dtype='float32')
         out = fluid.layers.l2_normalize(x=emb, axis=-1)
 
@@ -214,6 +219,7 @@ def runTest(self):
 
 
 class TestSGDV2(unittest.TestCase):
+
     def test_sgd_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
@@ -235,26 +241,33 @@ def check_sgd_optimizer(optimizer_attr):
             init_program = paddle.static.Program()
             program = paddle.static.Program()
             block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr)
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out")
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x,
-                        "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1})
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+            mul_x = block.create_parameter(dtype="float32",
+                                           shape=[5, 10],
+                                           lod_level=0,
+                                           name="mul.x",
+                                           optimize_attr=optimizer_attr)
+            mul_y = block.create_var(dtype="float32",
+                                     shape=[10, 8],
+                                     lod_level=0,
+                                     name="mul.y")
+            mul_out = block.create_var(dtype="float32",
+                                       shape=[5, 8],
+                                       lod_level=0,
+                                       name="mul.out")
+            mean_out = block.create_var(dtype="float32",
+                                        shape=[1],
+                                        lod_level=0,
+                                        name="mean.out")
+            block.append_op(type="mul",
+                            inputs={
+                                "X": mul_x,
+                                "Y": mul_y
+                            },
+                            outputs={"Out": mul_out},
+                            attrs={"x_num_col_dims": 1})
+            block.append_op(type="mean",
+                            inputs={"X": mul_out},
+                            outputs={"Out": mean_out})
             sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.01)
             opts, _ = sgd_optimizer.minimize(mean_out, init_program)
             return opts
@@ -299,6 +312,7 @@ def test_eager(self):
 
 
 class TestSGDMultiPrecision2_0(unittest.TestCase):
+
     def dygraph_sgd_mp(self, mp):
         paddle.disable_static()
         paddle.seed(10)
@@ -346,11 +360,13 @@ def static_sgd_mp(self, mp):
                 use_fp16_guard=False)
         with paddle.static.program_guard(train_program, startup_program):
             if mp:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float16')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float16')
             else:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float32')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float32')
             hidden = paddle.static.nn.fc(x=data, size=10)
             loss = paddle.fluid.layers.mean(hidden)
             optimizer.minimize(loss)
@@ -376,31 +392,26 @@ def test_main(self):
         output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True)
         output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False)
         self.assertEqual(
-            np.allclose(
-                output1_dy.astype('float32').numpy(),
-                output2_dy.astype('float32').numpy(),
-                atol=1e-01),
-            True)
+            np.allclose(output1_dy.astype('float32').numpy(),
+                        output2_dy.astype('float32').numpy(),
+                        atol=1e-01), True)
         for idx in range(len(params1_dy)):
             self.assertEqual(
-                np.allclose(
-                    params1_dy[idx].astype('float32').numpy(),
-                    params2_dy[idx].astype('float32').numpy(),
-                    atol=1e-01),
-                True)
+                np.allclose(params1_dy[idx].astype('float32').numpy(),
+                            params2_dy[idx].astype('float32').numpy(),
+                            atol=1e-01), True)
         "Test static mode"
         output1_st = self.static_sgd_mp(mp=True)
         output2_st = self.static_sgd_mp(mp=False)
         for idx in range(len(output1_st)):
             self.assertEqual(
-                np.allclose(
-                    output1_st[idx].astype('float32'),
-                    output2_st[idx].astype('float32'),
-                    atol=1e-01),
-                True)
+                np.allclose(output1_st[idx].astype('float32'),
+                            output2_st[idx].astype('float32'),
+                            atol=1e-01), True)
 
 
 class TestSGDMultiPrecision1_0(unittest.TestCase):
+
     def dygraph_sgd_mp(self, mp):
         paddle.disable_static()
         paddle.seed(10)
@@ -451,11 +462,13 @@ def static_sgd_mp(self, mp):
                 use_fp16_guard=False)
         with paddle.static.program_guard(train_program, startup_program):
             if mp:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float16')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float16')
             else:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float32')
+                data = paddle.static.data(shape=[2, 2],
+                                          name='X',
+                                          dtype='float32')
             hidden = paddle.static.nn.fc(x=data, size=10)
             loss = paddle.fluid.layers.mean(hidden)
             optimizer.minimize(loss)
@@ -481,28 +494,22 @@ def test_main(self):
         output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True)
         output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False)
         self.assertEqual(
-            np.allclose(
-                output1_dy.astype('float32').numpy(),
-                output2_dy.astype('float32').numpy(),
-                atol=1e-01),
-            True)
+            np.allclose(output1_dy.astype('float32').numpy(),
+                        output2_dy.astype('float32').numpy(),
+                        atol=1e-01), True)
         for idx in range(len(params1_dy)):
             self.assertEqual(
-                np.allclose(
-                    params1_dy[idx].astype('float32').numpy(),
-                    params2_dy[idx].astype('float32').numpy(),
-                    atol=1e-01),
-                True)
+                np.allclose(params1_dy[idx].astype('float32').numpy(),
+                            params2_dy[idx].astype('float32').numpy(),
+                            atol=1e-01), True)
         "Test static mode"
         output1_st = self.static_sgd_mp(mp=True)
         output2_st = self.static_sgd_mp(mp=False)
         for idx in range(len(output1_st)):
             self.assertEqual(
-                np.allclose(
-                    output1_st[idx].astype('float32'),
-                    output2_st[idx].astype('float32'),
-                    atol=1e-01),
-                True)
+                np.allclose(output1_st[idx].astype('float32'),
+                            output2_st[idx].astype('float32'),
+                            atol=1e-01), True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index a468d6e828ce1..4df56373a5326 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -19,8 +19,9 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
-from paddle.fluid.tests.unittests.op_test import (
-    convert_float_to_uint16, convert_uint16_to_float, OpTest, OpTestTool)
+from paddle.fluid.tests.unittests.op_test import (convert_float_to_uint16,
+                                                  convert_uint16_to_float,
+                                                  OpTest, OpTestTool)
 import paddle
 import paddle.static.amp as amp
 import struct
@@ -29,6 +30,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSGDOpBF16(OpTest):
+
     def setUp(self):
         self.op_type = 'sgd'
         self.dtype = np.uint16
@@ -56,12 +58,14 @@ def test_check_output(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSGDOpBF16Case2(TestSGDOpBF16):
+
     def conf(self):
         self.h = 10
         self.w = 64
 
 
 class TestSparseSGDOpBF16(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         np.random.seed(12345)
@@ -122,6 +126,7 @@ def create_dense_lr_var(self, scope, place):
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSparseGradSGDOpBF16(TestSparseSGDOpBF16):
+
     def setUp(self):
         self.setup_params()
 
@@ -133,19 +138,20 @@ def setup_params(self):
     def test_sparse_grad_sgd(self):
         scope = core.Scope()
         place = core.CPUPlace()
-        _, grad_array = self.create_sparse_grad_var(
-            scope, place, self.grad_height, self.grad_rows, self.grad_row_numel)
+        _, grad_array = self.create_sparse_grad_var(scope, place,
+                                                    self.grad_height,
+                                                    self.grad_rows,
+                                                    self.grad_row_numel)
         param_tensor, param_array = self.create_dense_param_var(
             scope, place, self.grad_height, self.grad_row_numel)
         _, lr_value = self.create_dense_lr_var(scope, place)
 
-        sgd_op = Operator(
-            'sgd',
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            LearningRate='LearningRate',
-            use_mkldnn=True)
+        sgd_op = Operator('sgd',
+                          Param='Param',
+                          Grad='Grad',
+                          ParamOut='Param',
+                          LearningRate='LearningRate',
+                          use_mkldnn=True)
         sgd_op.run(scope, place)
 
         reference = self.ref_optimize(param_array, self.grad_rows, grad_array,
@@ -157,6 +163,7 @@ def test_sparse_grad_sgd(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSparseGradSGDOpBF16Case2(TestSparseGradSGDOpBF16):
+
     def setup_params(self):
         self.grad_height = 14
         self.grad_rows = [1, 4, 12, 7, 8]
@@ -164,6 +171,7 @@ def setup_params(self):
 
 
 class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16):
+
     def setup_params(self):
         self.grad_height = 10
         self.grad_rows = [0, 4, 7]
@@ -173,6 +181,7 @@ def setup_params(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):
+
     def setUp(self):
         self.setup_params()
 
@@ -185,20 +194,21 @@ def setup_params(self):
     def test_sparse_param_grad_sgd(self):
         scope = core.Scope()
         place = core.CPUPlace()
-        _, grad_array = self.create_sparse_grad_var(
-            scope, place, self.grad_height, self.grad_rows, self.grad_row_numel)
+        _, grad_array = self.create_sparse_grad_var(scope, place,
+                                                    self.grad_height,
+                                                    self.grad_rows,
+                                                    self.grad_row_numel)
         param_tensor, param_array = self.create_sparse_param_var(
             scope, place, self.grad_height, self.param_rows,
             self.grad_row_numel)
         _, lr_value = self.create_dense_lr_var(scope, place)
 
-        sgd_op = Operator(
-            'sgd',
-            Param='Param',
-            Grad='Grad',
-            ParamOut='Param',
-            LearningRate='LearningRate',
-            use_mkldnn=True)
+        sgd_op = Operator('sgd',
+                          Param='Param',
+                          Grad='Grad',
+                          ParamOut='Param',
+                          LearningRate='LearningRate',
+                          use_mkldnn=True)
         sgd_op.run(scope, place)
 
         reference = self.ref_optimize(param_array, self.grad_rows, grad_array,
@@ -208,6 +218,7 @@ def test_sparse_param_grad_sgd(self):
 
 
 class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16):
+
     def setup_params(self):
         self.grad_height = 14
         self.grad_rows = [1, 4, 12, 7, 8]
@@ -217,6 +228,7 @@ def setup_params(self):
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestSGDOpBF16API(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         np.random.seed(12345)
@@ -249,18 +261,18 @@ def _mul_bf16(self, lhs: np.uint16, rhs: np.uint16):
         return self._fp322bf16(self._bf162fp32(lhs) * self._bf162fp32(rhs))
 
     def _reference(self, data, emb_weight, bf16=False):
-        emb_out_shape = np.array(
-            [self.ids_shape[0], self.w_shape[1]], dtype=np.int64)
-        mean_grad_value = np.float32(1.0) / np.prod(
-            emb_out_shape, dtype=np.float32)
+        emb_out_shape = np.array([self.ids_shape[0], self.w_shape[1]],
+                                 dtype=np.int64)
+        mean_grad_value = np.float32(1.0) / np.prod(emb_out_shape,
+                                                    dtype=np.float32)
         if bf16:
-            mean_grad = np.full(
-                emb_out_shape,
-                self._fp322bf16(mean_grad_value),
-                dtype=np.uint16)
+            mean_grad = np.full(emb_out_shape,
+                                self._fp322bf16(mean_grad_value),
+                                dtype=np.uint16)
         else:
-            mean_grad = np.full(
-                emb_out_shape, mean_grad_value, dtype=np.float32)
+            mean_grad = np.full(emb_out_shape,
+                                mean_grad_value,
+                                dtype=np.float32)
         # add_grad = 1 * mean_grad
         out_dtype = np.uint16 if bf16 else np.float32
         lookup_table_grad = np.zeros(self.w_shape, dtype=out_dtype)
@@ -286,7 +298,11 @@ def _reference(self, data, emb_weight, bf16=False):
             ref_grad = emb_weight - self.learning_rate * lookup_table_grad
         return ref_grad
 
-    def _check_output(self, actual, reference, bf16=False, atol=0,
+    def _check_output(self,
+                      actual,
+                      reference,
+                      bf16=False,
+                      atol=0,
                       rtol=0.15e-2):
         output = actual if bf16 else convert_uint16_to_float(actual)
         if bf16:
@@ -294,8 +310,10 @@ def _check_output(self, actual, reference, bf16=False, atol=0,
         else:
             try:
                 print('Compare with FP32 values:')
-                np.testing.assert_allclose(
-                    output, reference, atol=atol, rtol=rtol)
+                np.testing.assert_allclose(output,
+                                           reference,
+                                           atol=atol,
+                                           rtol=rtol)
             except AssertionError as e:
                 print(e)
 
@@ -313,15 +331,16 @@ def test_sgd(self):
         main = fluid.Program()
         with fluid.program_guard(main):
             x = fluid.layers.data(name='X', shape=self.ids_shape, dtype='int64')
-            label = fluid.layers.data(
-                name='Y', shape=self.y_shape, dtype='uint16')
-            emb = fluid.layers.embedding(
-                input=x,
-                size=self.w_shape,
-                param_attr=fluid.ParamAttr(
-                    name="emb_weight", initializer=self.initializer),
-                is_sparse=False,
-                dtype="uint16")  # bfloat16
+            label = fluid.layers.data(name='Y',
+                                      shape=self.y_shape,
+                                      dtype='uint16')
+            emb = fluid.layers.embedding(input=x,
+                                         size=self.w_shape,
+                                         param_attr=fluid.ParamAttr(
+                                             name="emb_weight",
+                                             initializer=self.initializer),
+                                         is_sparse=False,
+                                         dtype="uint16")  # bfloat16
             cost = fluid.layers.elementwise_add(emb, label)
             avg_cost = paddle.mean(cost)
 
@@ -330,7 +349,9 @@ def test_sgd(self):
             sgd_optimizer = amp.bf16.decorate_bf16(
                 sgd_optimizer,
                 amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(
-                    custom_bf16_list={'lookup_table', }),
+                    custom_bf16_list={
+                        'lookup_table',
+                    }),
                 use_bf16_guard=False,
                 use_pure_bf16=True)
             sgd_optimizer.minimize(
@@ -340,12 +361,14 @@ def test_sgd(self):
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
             test_prog = main.clone(for_test=True)
-            sgd_optimizer.amp_init(
-                place, test_program=test_prog, use_bf16_test=True)
+            sgd_optimizer.amp_init(place,
+                                   test_program=test_prog,
+                                   use_bf16_test=True)
 
             ref_emb = np.full(self.w_shape, self.value, dtype=np.float32)
-            ref_emb_bf16 = np.full(
-                self.w_shape, self._fp322bf16(self.value), dtype=np.uint16)
+            ref_emb_bf16 = np.full(self.w_shape,
+                                   self._fp322bf16(self.value),
+                                   dtype=np.uint16)
             emb_weight = []
 
             for sample in train_reader():
@@ -353,8 +376,10 @@ def test_sgd(self):
                 label = sample[0][1]
                 y_bf16 = convert_float_to_uint16(label)
                 emb_weight = exe.run(main,
-                                     feed={'X': data,
-                                           'Y': y_bf16},
+                                     feed={
+                                         'X': data,
+                                         'Y': y_bf16
+                                     },
                                      fetch_list=['emb_weight'])
 
                 ref_emb = self._reference(data, ref_emb)
diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py
index 3d961a7413ca0..cb64739f8f066 100644
--- a/python/paddle/fluid/tests/unittests/test_shape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_shape_op.py
@@ -23,6 +23,7 @@
 
 
 class TestShapeOp(OpTest):
+
     def setUp(self):
         self.op_type = "shape"
         self.python_api = paddle.shape
@@ -40,16 +41,19 @@ def test_check_output(self):
 
 
 class case1(TestShapeOp):
+
     def config(self):
         self.shape = [2]
 
 
 class case2(TestShapeOp):
+
     def config(self):
         self.shape = [1, 2, 3]
 
 
 class TestShapeWithSelectedRows(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_shard_index_op.py b/python/paddle/fluid/tests/unittests/test_shard_index_op.py
index 9ccf1f254a556..9d52f8f8459b2 100644
--- a/python/paddle/fluid/tests/unittests/test_shard_index_op.py
+++ b/python/paddle/fluid/tests/unittests/test_shard_index_op.py
@@ -50,6 +50,7 @@ def common_setup(self, index_num, nshards, shard_id, ignore_value):
 
 
 class TestShardIndexShardId0Op(OpTest):
+
     def setUp(self):
         common_setup(self, 20, 2, 0, -1)
 
@@ -58,6 +59,7 @@ def test_check_output(self):
 
 
 class TestShardIndexShardId1Op(OpTest):
+
     def setUp(self):
         common_setup(self, 20, 2, 1, -1)
 
@@ -66,6 +68,7 @@ def test_check_output(self):
 
 
 class TestShardIndexIgnoreValueOp(OpTest):
+
     def setUp(self):
         common_setup(self, 20, 2, 0, -2)
 
@@ -74,6 +77,7 @@ def test_check_output(self):
 
 
 class TestShardIndexNotEvenlyDividedOp(OpTest):
+
     def setUp(self):
         common_setup(self, 15, 2, 1, -1)
 
diff --git a/python/paddle/fluid/tests/unittests/test_share_data_op.py b/python/paddle/fluid/tests/unittests/test_share_data_op.py
index 1e6f0ef693c3d..a049661eaab68 100644
--- a/python/paddle/fluid/tests/unittests/test_share_data_op.py
+++ b/python/paddle/fluid/tests/unittests/test_share_data_op.py
@@ -20,6 +20,7 @@
 
 
 class TestShareDataOp(OpTest):
+
     def setUp(self):
         self.op_type = "share_data"
         input = np.random.rand(2, 3, 5).astype("float32")
@@ -31,6 +32,7 @@ def test_check_output(self):
 
 
 class TestShareDataOpOnDifferentPlaces(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index 6e1099e5a391c..daa3f191ccd72 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -28,13 +28,16 @@
 
 
 class TestShrinkRNNMemoryBase(unittest.TestCase):
+
     def setUp(self):
         self.main_program = Program()
         switch_main_program(self.main_program)
         x = layers.data('x', shape=[100], dtype='float32')
         x.stop_gradient = False
-        rank_table_tensor = layers.data(
-            'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
+        rank_table_tensor = layers.data('rank_table_tensor',
+                                        shape=[1],
+                                        dtype='float32',
+                                        lod_level=1)
         table = lod_rank_table(x=rank_table_tensor)
         i = layers.zeros(dtype='int64', shape=[1])
         self.mem1 = shrink_memory(x=x, i=i, table=table)
@@ -56,6 +59,7 @@ def sum_lodtensor(self, tensor):
 
 
 class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase):
+
     def test_refer_lod(self):
         cpu = core.CPUPlace()
         x_tensor = core.LoDTensor()
@@ -65,13 +69,15 @@ def test_refer_lod(self):
 
         rank_table_tensor = core.LoDTensor()
         rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
-        rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
-                              cpu)
+        rank_table_tensor.set(
+            np.random.random(size=(6, 1)).astype('float32'), cpu)
 
         exe = Executor(cpu)
         outs = exe.run(
-            feed={'x': x_tensor,
-                  'rank_table_tensor': rank_table_tensor},
+            feed={
+                'x': x_tensor,
+                'rank_table_tensor': rank_table_tensor
+            },
             fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
             return_numpy=False)
         self.assertTrue(np.allclose(tensor_np[0:6], outs[0]))
@@ -81,6 +87,7 @@ def test_refer_lod(self):
 
 
 class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase):
+
     def test_no_lod(self):
         cpu = core.CPUPlace()
         x_tensor = core.LoDTensor()
@@ -89,13 +96,15 @@ def test_no_lod(self):
 
         rank_table_tensor = core.LoDTensor()
         rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
-        rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
-                              cpu)
+        rank_table_tensor.set(
+            np.random.random(size=(6, 1)).astype('float32'), cpu)
 
         exe = Executor(cpu)
         outs = exe.run(
-            feed={'x': x_tensor,
-                  'rank_table_tensor': rank_table_tensor},
+            feed={
+                'x': x_tensor,
+                'rank_table_tensor': rank_table_tensor
+            },
             fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
             return_numpy=False)
         self.assertTrue(np.allclose(tensor_np[0:3], outs[0]))
@@ -105,6 +114,7 @@ def test_no_lod(self):
 
 
 class TestShrinkRNNMemoryOpError(unittest.TestCase):
+
     def test_erroes(self):
         with program_guard(Program(), Program()):
             x = layers.zeros(dtype='int64', shape=[3, 100])
diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py
index 62c26a73a8d43..6292a4d2b517d 100644
--- a/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py
+++ b/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py
@@ -25,6 +25,7 @@
 
 
 class TestShuffleBatchOpBase(OpTest):
+
     def gen_random_array(self, shape, low=0, high=1):
         rnd = (high - low) * np.random.random(shape) + low
         return rnd.astype(self.dtype)
@@ -43,8 +44,8 @@ def setUp(self):
         self.dtype = np.float64
         self.shape = self.get_shape()
         x = self.gen_random_array(self.shape)
-        seed = np.random.random_integers(
-            low=10, high=100, size=(1, )).astype('int64')
+        seed = np.random.random_integers(low=10, high=100,
+                                         size=(1, )).astype('int64')
         self.inputs = {'X': x, 'Seed': seed}
         self.outputs = {
             'Out': np.array([]).astype(x.dtype),
@@ -81,6 +82,7 @@ def test_check_grad(self):
 
 
 class TestShuffleBatchOp2(TestShuffleBatchOpBase):
+
     def get_shape(self):
         return (4, 30)
 
diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
index aeaae9058187b..1ff167b680fc3 100644
--- a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
@@ -23,6 +23,7 @@
 
 
 class TestShuffleChannelOp(OpTest):
+
     def setUp(self):
         self.op_type = "shuffle_channel"
         self.batch_size = 10
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index e5406f4d0c224..9c0d2bc92355d 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -40,11 +40,12 @@ def setUp(self):
         batch_size = 64
         num_classes = 20
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float64")),
-            'Label': np.random.randint(0, 2, (batch_size, num_classes))
-            .astype("float64")
+            'X':
+            logit(
+                np.random.uniform(0, 1,
+                                  (batch_size, num_classes)).astype("float64")),
+            'Label':
+            np.random.randint(0, 2, (batch_size, num_classes)).astype("float64")
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
@@ -73,13 +74,17 @@ def setUp(self):
         num_classes = 20
         ignore_index = -1
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float64")),
-            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
-            .astype("float64")
+            'X':
+            logit(
+                np.random.uniform(0, 1,
+                                  (batch_size, num_classes)).astype("float64")),
+            'Label':
+            np.random.randint(-1, 2,
+                              (batch_size, num_classes)).astype("float64")
+        }
+        self.attrs = {
+            'ignore_index': ignore_index,
         }
-        self.attrs = {'ignore_index': ignore_index, }
         # Fw Pass is implemented as elementwise sigmoid followed by
         # elementwise logistic loss
         # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
@@ -107,11 +112,12 @@ def setUp(self):
         batch_size = 64
         num_classes = 20
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float64")),
-            'Label': np.random.uniform(0, 1, (batch_size, num_classes))
-            .astype("float64")
+            'X':
+            logit(
+                np.random.uniform(0, 1,
+                                  (batch_size, num_classes)).astype("float64")),
+            'Label':
+            np.random.uniform(0, 1, (batch_size, num_classes)).astype("float64")
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
@@ -130,6 +136,7 @@ def test_check_grad(self):
 
 
 class TestSigmoidCrossEntropyWithNorm(OpTest):
+
     def setUp(self):
         self.op_type = "sigmoid_cross_entropy_with_logits"
         self.python_api = test_fluid_sigmoid
@@ -137,11 +144,13 @@ def setUp(self):
         num_classes = 20
         ignore_index = -1
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float64")),
-            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
-            .astype("float64")
+            'X':
+            logit(
+                np.random.uniform(0, 1,
+                                  (batch_size, num_classes)).astype("float64")),
+            'Label':
+            np.random.randint(-1, 2,
+                              (batch_size, num_classes)).astype("float64")
         }
         self.attrs = {'ignore_index': ignore_index, 'normalize': True}
         sigmoid_X = expit(self.inputs['X'])
@@ -171,11 +180,13 @@ def setUp(self):
         batch_size = [10, 10]
         num_classes = 20
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype("float64")),
-            'Label': np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-            .astype("float64")
+            'X':
+            logit(
+                np.random.uniform(
+                    0, 1, tuple(batch_size + [num_classes])).astype("float64")),
+            'Label':
+            np.random.uniform(0, 1, tuple(batch_size +
+                                          [num_classes])).astype("float64")
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
@@ -194,6 +205,7 @@ def test_check_grad(self):
 
 
 class TestSigmoidCrossEntropyWithNorm2(OpTest):
+
     def setUp(self):
         self.op_type = "sigmoid_cross_entropy_with_logits"
         self.python_api = test_fluid_sigmoid
@@ -201,11 +213,13 @@ def setUp(self):
         num_classes = 20
         ignore_index = -1
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype("float64")),
-            'Label': np.random.randint(-1, 2, tuple(batch_size + [num_classes]))
-            .astype("float64")
+            'X':
+            logit(
+                np.random.uniform(
+                    0, 1, tuple(batch_size + [num_classes])).astype("float64")),
+            'Label':
+            np.random.randint(-1, 2, tuple(batch_size +
+                                           [num_classes])).astype("float64")
         }
         self.attrs = {'ignore_index': ignore_index, 'normalize': True}
         sigmoid_X = expit(self.inputs['X'])
@@ -234,12 +248,14 @@ def setUp(self):
             batch_size = [10, 10]
             num_classes = 20
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                    .astype("float64")),
+                'X':
+                logit(
+                    np.random.uniform(0, 1,
+                                      tuple(batch_size +
+                                            [num_classes])).astype("float64")),
                 'Label':
-                np.random.randint(0, 2, tuple(batch_size + [num_classes]))
-                .astype("float64")
+                np.random.randint(0, 2, tuple(batch_size +
+                                              [num_classes])).astype("float64")
             }
 
             # Fw Pass is implemented as elementwise sigmoid followed by
@@ -257,17 +273,18 @@ def test_check_grad(self):
             self.check_grad(['X'], 'Out', check_eager=True)
 
     class TestSigmoidCrossEntropyWithLogitsOpError(unittest.TestCase):
+
         def test_errors(self):
             with program_guard(Program(), Program()):
 
                 def test_Variable():
                     # the input of sigmoid_cross_entropy_with_logits must be Variable.
-                    x1 = fluid.create_lod_tensor(
-                        np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]],
-                        fluid.CPUPlace())
-                    lab1 = fluid.create_lod_tensor(
-                        np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]],
-                        fluid.CPUPlace())
+                    x1 = fluid.create_lod_tensor(np.array([-1, 3, 5,
+                                                           5]), [[1, 1, 1, 1]],
+                                                 fluid.CPUPlace())
+                    lab1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                                   [[1, 1, 1, 1]],
+                                                   fluid.CPUPlace())
                     fluid.layers.sigmoid_cross_entropy_with_logits(x1, lab1)
 
                 self.assertRaises(TypeError, test_Variable)
@@ -275,10 +292,12 @@ def test_Variable():
                 def test_dtype():
                     # the input dtype of sigmoid_cross_entropy_with_logits must be float16 or float32 or float64
                     # float16 only can be set on GPU place
-                    x2 = fluid.layers.data(
-                        name='x2', shape=[3, 4, 5, 6], dtype="int32")
-                    lab2 = fluid.layers.data(
-                        name='lab2', shape=[3, 4, 5, 6], dtype="int32")
+                    x2 = fluid.layers.data(name='x2',
+                                           shape=[3, 4, 5, 6],
+                                           dtype="int32")
+                    lab2 = fluid.layers.data(name='lab2',
+                                             shape=[3, 4, 5, 6],
+                                             dtype="int32")
                     fluid.layers.sigmoid_cross_entropy_with_logits(x2, lab2)
 
                 self.assertRaises(TypeError, test_dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py
index 15a4827cecba3..bdfa1a19eca31 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py
@@ -27,8 +27,12 @@ def call_sfl_functional(logit,
                         alpha=0.25,
                         gamma=2.0,
                         reduction='sum'):
-    res = paddle.nn.functional.sigmoid_focal_loss(
-        logit, label, normalizer, alpha=alpha, gamma=gamma, reduction=reduction)
+    res = paddle.nn.functional.sigmoid_focal_loss(logit,
+                                                  label,
+                                                  normalizer,
+                                                  alpha=alpha,
+                                                  gamma=gamma,
+                                                  reduction=reduction)
     return res
 
 
@@ -43,16 +47,19 @@ def test_static(place,
     prog = paddle.static.Program()
     startup_prog = paddle.static.Program()
     with paddle.static.program_guard(prog, startup_prog):
-        logit = paddle.fluid.data(
-            name='logit', shape=logit_np.shape, dtype='float64')
-        label = paddle.fluid.data(
-            name='label', shape=label_np.shape, dtype='float64')
+        logit = paddle.fluid.data(name='logit',
+                                  shape=logit_np.shape,
+                                  dtype='float64')
+        label = paddle.fluid.data(name='label',
+                                  shape=label_np.shape,
+                                  dtype='float64')
         feed_dict = {"logit": logit_np, "label": label_np}
 
         normalizer = None
         if normalizer_np is not None:
-            normalizer = paddle.fluid.data(
-                name='normalizer', shape=normalizer_np.shape, dtype='float64')
+            normalizer = paddle.fluid.data(name='normalizer',
+                                           shape=normalizer_np.shape,
+                                           dtype='float64')
             feed_dict["normalizer"] = normalizer_np
 
         res = call_sfl_functional(logit, label, normalizer, alpha, gamma,
@@ -115,14 +122,14 @@ def calc_sigmoid_focal_loss(logit_np,
 
 
 class TestSigmoidFocalLoss(unittest.TestCase):
+
     def test_SigmoidFocalLoss(self):
-        logit_np = np.random.uniform(
-            0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64)
-        label_np = np.random.randint(
-            0, 2, size=(2, 3, 4, 10)).astype(np.float64)
+        logit_np = np.random.uniform(0.1, 0.8,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.randint(0, 2,
+                                     size=(2, 3, 4, 10)).astype(np.float64)
         normalizer_nps = [
-            np.asarray(
-                [np.sum(label_np > 0)], dtype=label_np.dtype), None
+            np.asarray([np.sum(label_np > 0)], dtype=label_np.dtype), None
         ]
         places = [fluid.CPUPlace()]
         if fluid.core.is_compiled_with_cuda():
@@ -148,8 +155,8 @@ def test_SigmoidFocalLoss(self):
                             expected = calc_sigmoid_focal_loss(
                                 logit_np, label_np, normalizer_np, alpha, gamma,
                                 reduction)
-                            self.assertTrue(
-                                np.allclose(static_result, expected))
+                            self.assertTrue(np.allclose(static_result,
+                                                        expected))
                             self.assertTrue(
                                 np.allclose(static_result, dy_result))
                             self.assertTrue(np.allclose(dy_result, expected))
@@ -159,13 +166,12 @@ def test_SigmoidFocalLoss_error(self):
         paddle.disable_static()
         logit = paddle.to_tensor([[0.97], [0.91], [0.03]], dtype='float32')
         label = paddle.to_tensor([[1.0], [1.0], [0.0]], dtype='float32')
-        self.assertRaises(
-            ValueError,
-            paddle.nn.functional.sigmoid_focal_loss,
-            logit=logit,
-            label=label,
-            normalizer=None,
-            reduction="unsupport reduction")
+        self.assertRaises(ValueError,
+                          paddle.nn.functional.sigmoid_focal_loss,
+                          logit=logit,
+                          label=label,
+                          normalizer=None,
+                          reduction="unsupport reduction")
         paddle.enable_static()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
index 6c1b15ab00317..7a625fb296a41 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
@@ -46,8 +46,9 @@ def sigmoid_focal_loss_forward(x_data, label_data, fg_num_data, gamma, alpha,
         p = 1. / (1. + math.exp(-x))
         FLT_MIN = 1.175494351e-38
         term_pos = math.pow((1. - p), gamma) * math.log(max(FLT_MIN, p))
-        term_neg = math.pow(p, gamma) * (
-            -1. * x * (x >= 0) - math.log(1. + math.exp(x - 2. * x * (x >= 0))))
+        term_neg = math.pow(p, gamma) * (-1. * x * (x >= 0) -
+                                         math.log(1. + math.exp(x - 2. * x *
+                                                                (x >= 0))))
         out_data[idx] = 0.0
         out_data[idx] += -c_pos * term_pos * z_pos
         out_data[idx] += -c_neg * term_neg * z_neg
@@ -57,6 +58,7 @@ def sigmoid_focal_loss_forward(x_data, label_data, fg_num_data, gamma, alpha,
 
 
 class TestSigmoidFocalLossOp1(OpTest):
+
     def set_argument(self):
         self.num_anchors = 10
         self.num_classes = 10
@@ -84,9 +86,10 @@ def setUp(self):
             'gamma': self.gamma,
             'alpha': self.alpha,
         }
-        loss = sigmoid_focal_loss_forward(
-            self.inputs['X'], self.inputs['Label'], self.inputs['FgNum'],
-            self.gamma, self.alpha, self.num_classes)
+        loss = sigmoid_focal_loss_forward(self.inputs['X'],
+                                          self.inputs['Label'],
+                                          self.inputs['FgNum'], self.gamma,
+                                          self.alpha, self.num_classes)
         self.outputs = {'Out': loss.astype('float64')}
 
     def test_check_output(self):
@@ -99,17 +102,20 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSigmoidFocalLossOp2(TestSigmoidFocalLossOp1):
+
     def test_check_output(self):
         place = core.CUDAPlace(0)
         self.check_output_with_place(place, atol=2e-3)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=0.002)
+        self.check_grad_with_place(place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.002)
 
 
 class TestSigmoidFocalLossOp3(TestSigmoidFocalLossOp1):
+
     def set_argument(self):
         self.num_anchors = 200
         self.num_classes = 10
@@ -120,36 +126,47 @@ def set_argument(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSigmoidFocalLossOp4(TestSigmoidFocalLossOp3):
+
     def test_check_output(self):
         place = core.CUDAPlace(0)
         self.check_output_with_place(place, atol=2e-3)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=0.002)
+        self.check_grad_with_place(place, ['X'],
+                                   'Out',
+                                   max_relative_error=0.002)
 
 
 class TestSigmoidFocalLossOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            label1 = fluid.layers.fill_constant(
-                shape=[10, 1], dtype="int32", value=1)
-            fg_num1 = fluid.layers.fill_constant(
-                shape=[1], dtype="int32", value=5)
+            label1 = fluid.layers.fill_constant(shape=[10, 1],
+                                                dtype="int32",
+                                                value=1)
+            fg_num1 = fluid.layers.fill_constant(shape=[1],
+                                                 dtype="int32",
+                                                 value=5)
 
             # The `x` must be Variable and the data type of `x` Tensor must be one of float32 and float64.
             def test_x_type():
                 x1 = [2]
-                fluid.layers.sigmoid_focal_loss(
-                    x=x1, label=label1, fg_num=fg_num1, gamma=2., alpha=0.25)
+                fluid.layers.sigmoid_focal_loss(x=x1,
+                                                label=label1,
+                                                fg_num=fg_num1,
+                                                gamma=2.,
+                                                alpha=0.25)
 
             self.assertRaises(TypeError, test_x_type)
 
             def test_x_tensor_dtype():
                 x2 = fluid.layers.data(name='x2', shape=[10, 10], dtype="int16")
-                fluid.layers.sigmoid_focal_loss(
-                    x=x2, label=label1, fg_num=fg_num1, gamma=2., alpha=0.25)
+                fluid.layers.sigmoid_focal_loss(x=x2,
+                                                label=label1,
+                                                fg_num=fg_num1,
+                                                gamma=2.,
+                                                alpha=0.25)
 
             self.assertRaises(TypeError, test_x_tensor_dtype)
 
@@ -158,32 +175,46 @@ def test_x_tensor_dtype():
             # The `label` must be Variable and the data type of `label` Tensor must be int32.
             def test_label_type():
                 label2 = [2]
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label2, fg_num=fg_num1, gamma=2., alpha=0.25)
+                fluid.layers.sigmoid_focal_loss(x=x3,
+                                                label=label2,
+                                                fg_num=fg_num1,
+                                                gamma=2.,
+                                                alpha=0.25)
 
             self.assertRaises(TypeError, test_label_type)
 
             def test_label_tensor_dtype():
-                label3 = fluid.layers.fill_constant(
-                    shape=[10, 1], dtype="float32", value=1.)
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label3, fg_num=fg_num1, gamma=2., alpha=0.25)
+                label3 = fluid.layers.fill_constant(shape=[10, 1],
+                                                    dtype="float32",
+                                                    value=1.)
+                fluid.layers.sigmoid_focal_loss(x=x3,
+                                                label=label3,
+                                                fg_num=fg_num1,
+                                                gamma=2.,
+                                                alpha=0.25)
 
             self.assertRaises(TypeError, test_label_tensor_dtype)
 
             # The `fg_num` must be Variable and the data type of `fg_num` Tensor must be int32.
             def test_fgnum_type():
                 fg_num2 = [2]
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label1, fg_num=fg_num2, gamma=2., alpha=0.25)
+                fluid.layers.sigmoid_focal_loss(x=x3,
+                                                label=label1,
+                                                fg_num=fg_num2,
+                                                gamma=2.,
+                                                alpha=0.25)
 
             self.assertRaises(TypeError, test_fgnum_type)
 
             def test_fgnum_tensor_dtype():
-                fg_num3 = fluid.layers.fill_constant(
-                    shape=[1], dtype="float32", value=5.)
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label1, fg_num=fg_num3, gamma=2., alpha=0.25)
+                fg_num3 = fluid.layers.fill_constant(shape=[1],
+                                                     dtype="float32",
+                                                     value=5.)
+                fluid.layers.sigmoid_focal_loss(x=x3,
+                                                label=label1,
+                                                fg_num=fg_num3,
+                                                gamma=2.,
+                                                alpha=0.25)
 
             self.assertRaises(TypeError, test_fgnum_tensor_dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py
index bd145a968ed85..444675a4bb5c2 100644
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sign_op.py
@@ -23,6 +23,7 @@
 
 
 class TestSignOp(OpTest):
+
     def setUp(self):
         self.op_type = "sign"
         self.inputs = {
@@ -38,24 +39,29 @@ def test_check_grad(self):
 
 
 class TestSignOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of sign_op must be Variable or numpy.ndarray.
             input1 = 12
             self.assertRaises(TypeError, fluid.layers.sign, input1)
             # The input dtype of sign_op must be float16, float32, float64.
-            input2 = fluid.layers.data(
-                name='input2', shape=[12, 10], dtype="int32")
-            input3 = fluid.layers.data(
-                name='input3', shape=[12, 10], dtype="int64")
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[12, 10],
+                                       dtype="int32")
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[12, 10],
+                                       dtype="int64")
             self.assertRaises(TypeError, fluid.layers.sign, input2)
             self.assertRaises(TypeError, fluid.layers.sign, input3)
-            input4 = fluid.layers.data(
-                name='input4', shape=[4], dtype="float16")
+            input4 = fluid.layers.data(name='input4',
+                                       shape=[4],
+                                       dtype="float16")
             fluid.layers.sign(input4)
 
 
 class TestSignAPI(unittest.TestCase):
+
     def test_dygraph(self):
         with fluid.dygraph.guard():
             np_x = np.array([-1., 0., -0., 1.2, 1.5], dtype='float64')
@@ -71,14 +77,17 @@ def test_static(self):
             input1 = 12
             self.assertRaises(TypeError, paddle.tensor.math.sign, input1)
             # The input dtype of sign_op must be float16, float32, float64.
-            input2 = fluid.layers.data(
-                name='input2', shape=[12, 10], dtype="int32")
-            input3 = fluid.layers.data(
-                name='input3', shape=[12, 10], dtype="int64")
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[12, 10],
+                                       dtype="int32")
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[12, 10],
+                                       dtype="int64")
             self.assertRaises(TypeError, paddle.tensor.math.sign, input2)
             self.assertRaises(TypeError, paddle.tensor.math.sign, input3)
-            input4 = fluid.layers.data(
-                name='input4', shape=[4], dtype="float16")
+            input4 = fluid.layers.data(name='input4',
+                                       shape=[4],
+                                       dtype="float16")
             paddle.sign(input4)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_signal.py b/python/paddle/fluid/tests/unittests/test_signal.py
index ecbbd8f52db9b..8257630cf2071 100644
--- a/python/paddle/fluid/tests/unittests/test_signal.py
+++ b/python/paddle/fluid/tests/unittests/test_signal.py
@@ -56,8 +56,8 @@ def tiny(x):
     x = np.asarray(x)
 
     # Only floating types generate a tiny
-    if np.issubdtype(x.dtype, np.floating) or np.issubdtype(x.dtype,
-                                                            np.complexfloating):
+    if np.issubdtype(x.dtype, np.floating) or np.issubdtype(
+            x.dtype, np.complexfloating):
         dtype = x.dtype
     else:
         dtype = np.float32
@@ -144,18 +144,19 @@ def __window_ss_fill(x, win_sq, n_frames, hop_length):  # pragma: no cover
     n_fft = len(win_sq)
     for i in range(n_frames):
         sample = i * hop_length
-        x[sample:min(n, sample + n_fft)] += win_sq[:max(0,
-                                                        min(n_fft, n - sample))]
+        x[sample:min(n, sample +
+                     n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
 
 
 def window_sumsquare(
-        window,
-        n_frames,
-        hop_length=512,
-        win_length=None,
-        n_fft=2048,
-        dtype=np.float32,
-        norm=None, ):
+    window,
+    n_frames,
+    hop_length=512,
+    win_length=None,
+    n_fft=2048,
+    dtype=np.float32,
+    norm=None,
+):
     if win_length is None:
         win_length = n_fft
 
@@ -335,8 +336,9 @@ def stft(x,
         y = np.pad(y, int(n_fft // 2), mode=pad_mode)
 
     elif n_fft > y.shape[-1]:
-        raise Exception("n_fft={} is too large for input signal of length={}".
-                        format(n_fft, y.shape[-1]))
+        raise Exception(
+            "n_fft={} is too large for input signal of length={}".format(
+                n_fft, y.shape[-1]))
 
     # Window the time series.
     y_frames = frame(y, frame_length=n_fft, hop_length=hop_length)
@@ -345,8 +347,9 @@ def stft(x,
         dtype = dtype_r2c(y.dtype)
 
     # Pre-allocate the STFT matrix
-    stft_matrix = np.empty(
-        (int(1 + n_fft // 2), y_frames.shape[1]), dtype=dtype, order="F")
+    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
+                           dtype=dtype,
+                           order="F")
 
     # how many columns can we fit within MAX_MEM_BLOCK?
     n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
@@ -355,8 +358,9 @@ def stft(x,
     for bl_s in range(0, stft_matrix.shape[1], n_columns):
         bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
 
-        stft_matrix[:, bl_s:bl_t] = fft.rfft(
-            fft_window * y_frames[:, bl_s:bl_t], axis=0)
+        stft_matrix[:,
+                    bl_s:bl_t] = fft.rfft(fft_window * y_frames[:, bl_s:bl_t],
+                                          axis=0)
 
     if input_rank == 2:
         stft_matrix = np.expand_dims(stft_matrix, 0)
@@ -365,12 +369,13 @@ def stft(x,
 
 
 def istft(
-        x,
-        hop_length=None,
-        win_length=None,
-        window="hann",
-        center=True,
-        length=None, ):
+    x,
+    hop_length=None,
+    win_length=None,
+    window="hann",
+    center=True,
+    length=None,
+):
 
     stft_matrix = x
     input_rank = len(stft_matrix.shape)
@@ -434,7 +439,8 @@ def istft(
         win_length=win_length,
         n_fft=n_fft,
         hop_length=hop_length,
-        dtype=dtype, )
+        dtype=dtype,
+    )
 
     approx_nonzero_indices = ifft_window_sum > tiny(ifft_window_sum)
     y[approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]
@@ -537,6 +543,7 @@ def overlap_add_for_api_test(x, hop_length, axis=-1):
 
 
 def place(devices, key='place'):
+
     def decorate(cls):
         module = sys.modules[cls.__module__].__dict__
         raw_classes = {
@@ -591,8 +598,8 @@ def rand_x(dims=1,
             np.random.randint(min_dim_len, max_dim_len) for i in range(dims)
         ]
     if complex:
-        return np.random.randn(*shape).astype(dtype) + 1.j * np.random.randn(
-            *shape).astype(dtype)
+        return np.random.randn(*shape).astype(
+            dtype) + 1.j * np.random.randn(*shape).astype(dtype)
     else:
         return np.random.randn(*shape).astype(dtype)
 
diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
index 888bec928ff8d..114003f07087a 100755
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -23,15 +23,17 @@
 
 
 class TestSimilarityFocusOp(OpTest):
+
     def setUp(self):
         self.op_type = "similarity_focus"
         batch_size = 2
         x_dim, y_dim, z_dim = 3, 2, 2
         self.inputs = {
-            'X': np.array([[[[0.8, 0.1], [0.4, 0.5]], [[0.9, 0.7], [0.9, 0.9]],
-                            [[0.8, 0.9], [0.1, 0.2]]],
-                           [[[0.2, 0.5], [0.3, 0.4]], [[0.9, 0.7], [0.8, 0.4]],
-                            [[0.0, 0.2], [0.4, 0.7]]]]),
+            'X':
+            np.array([[[[0.8, 0.1], [0.4, 0.5]], [[0.9, 0.7], [0.9, 0.9]],
+                       [[0.8, 0.9], [0.1, 0.2]]],
+                      [[[0.2, 0.5], [0.3, 0.4]], [[0.9, 0.7], [0.8, 0.4]],
+                       [[0.0, 0.2], [0.4, 0.7]]]]),
         }
         self.attrs = {
             'axis': 1,
@@ -42,8 +44,8 @@ def setUp(self):
         for batch in range(batch_size):
             res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
             for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
-                )
+                channel = self.inputs['X'][batch,
+                                           index, :, :].reshape(-1).copy()
                 tag1 = [0 for i in range(y_dim)]
                 tag2 = [0 for i in range(z_dim)]
                 cnt = 0
@@ -72,12 +74,14 @@ def test_check_output(self):
 
 
 class TestSimilarityFocusOp_axis1(OpTest):
+
     def setUp(self):
         self.op_type = "similarity_focus"
         batch_size = 3
         x_dim, y_dim, z_dim = 4, 5, 6
         self.inputs = {
-            'X': np.random.random(
+            'X':
+            np.random.random(
                 (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
         }
         self.attrs = {
@@ -89,8 +93,8 @@ def setUp(self):
         for batch in range(batch_size):
             res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
             for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
-                )
+                channel = self.inputs['X'][batch,
+                                           index, :, :].reshape(-1).copy()
                 tag1 = [0 for i in range(y_dim)]
                 tag2 = [0 for i in range(z_dim)]
                 cnt = 0
@@ -120,12 +124,14 @@ def test_check_output(self):
 
 
 class TestSimilarityFocusOp_axis2(OpTest):
+
     def setUp(self):
         self.op_type = "similarity_focus"
         batch_size = 6
         x_dim, y_dim, z_dim = 7, 8, 9
         self.inputs = {
-            'X': np.random.random(
+            'X':
+            np.random.random(
                 (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
         }
         self.attrs = {
@@ -137,8 +143,8 @@ def setUp(self):
         for batch in range(batch_size):
             res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1)
             for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, :, index, :].reshape(-1).copy(
-                )
+                channel = self.inputs['X'][batch, :,
+                                           index, :].reshape(-1).copy()
                 tag1 = [0 for i in range(x_dim)]
                 tag2 = [0 for i in range(z_dim)]
                 cnt = 0
@@ -168,12 +174,14 @@ def test_check_output(self):
 
 
 class TestSimilarityFocusOp_axis3(OpTest):
+
     def setUp(self):
         self.op_type = "similarity_focus"
         batch_size = 64
         x_dim, y_dim, z_dim = 48, 48, 13
         self.inputs = {
-            'X': np.random.random(
+            'X':
+            np.random.random(
                 (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
         }
         self.attrs = {
@@ -185,8 +193,8 @@ def setUp(self):
         for batch in range(batch_size):
             res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1)
             for index in self.attrs['indexes']:
-                channel = self.inputs['X'][batch, :, :, index].reshape(-1).copy(
-                )
+                channel = self.inputs['X'][batch, :, :,
+                                           index].reshape(-1).copy()
                 tag1 = [0 for i in range(x_dim)]
                 tag2 = [0 for i in range(y_dim)]
                 cnt = 0
@@ -216,28 +224,32 @@ def test_check_output(self):
 
 
 class TestSimilarityFocusOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             data = fluid.data(name='data', shape=[16, 3, 2, 2], dtype='float32')
 
             def test_input_Variable():
                 input = np.random.rand(16, 3, 2, 2).astype("float32")
-                out = fluid.layers.similarity_focus(
-                    input=input, axis=1, indexes=[0])
+                out = fluid.layers.similarity_focus(input=input,
+                                                    axis=1,
+                                                    indexes=[0])
 
             self.assertRaises(TypeError, test_input_Variable)
 
             def test_axis_Int():
                 axis = 1.0
-                out = fluid.layers.similarity_focus(
-                    input=data, axis=axis, indexes=[0])
+                out = fluid.layers.similarity_focus(input=data,
+                                                    axis=axis,
+                                                    indexes=[0])
 
             self.assertRaises(TypeError, test_axis_Int)
 
             def test_indexes_List():
                 indexes = 0
-                out = fluid.layers.similarity_focus(
-                    input=data, axis=1, indexes=indexes)
+                out = fluid.layers.similarity_focus(input=data,
+                                                    axis=1,
+                                                    indexes=indexes)
 
             self.assertRaises(TypeError, test_indexes_List)
 
diff --git a/python/paddle/fluid/tests/unittests/test_simple_rnn_op.py b/python/paddle/fluid/tests/unittests/test_simple_rnn_op.py
index d7e24b6308e5d..7676e15a74bca 100644
--- a/python/paddle/fluid/tests/unittests/test_simple_rnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_simple_rnn_op.py
@@ -23,6 +23,7 @@
 import paddle.fluid.layers as layers
 import random
 import sys
+
 sys.path.append("./rnn")
 from rnn_numpy import SimpleRNN
 from convert import get_params_for_net
@@ -33,6 +34,7 @@
 
 
 class TestSimpleRNNOp(OpTest):
+
     def get_weight_names(self):
         weight_names = []
         for i in range(self.num_layers):
@@ -47,8 +49,7 @@ def setUp(self):
         self.op_type = "rnn"
         self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
         self.sequence_length = None if core.is_compiled_with_rocm(
-        ) else np.array(
-            [12, 11, 10, 9, 8], dtype=np.int32)
+        ) else np.array([12, 11, 10, 9, 8], dtype=np.int32)
         self.num_layers = 1
         self.is_bidirec = False
         self.is_test = False
@@ -63,24 +64,24 @@ def setUp(self):
         input_size = 3
         hidden_size = 2
 
-        input = np.random.uniform(
-            low=-0.1, high=0.1,
-            size=(seq_length, batch_size, input_size)).astype(self.dtype)
+        input = np.random.uniform(low=-0.1,
+                                  high=0.1,
+                                  size=(seq_length, batch_size,
+                                        input_size)).astype(self.dtype)
         if self.sequence_length is not None:
             input[11][1:][:] = 0
             input[10][2:][:] = 0
             input[9][3:][:] = 0
             input[8][4:][:] = 0
 
-        rnn1 = SimpleRNN(
-            input_size,
-            hidden_size,
-            num_layers=self.num_layers,
-            time_major=True,
-            direction=direction,
-            dropout=self.dropout,
-            nonlinearity=self.mode,
-            dtype=self.dtype)
+        rnn1 = SimpleRNN(input_size,
+                         hidden_size,
+                         num_layers=self.num_layers,
+                         time_major=True,
+                         direction=direction,
+                         dropout=self.dropout,
+                         nonlinearity=self.mode,
+                         dtype=self.dtype)
 
         flat_w = get_params_for_net(rnn1)
 
@@ -134,23 +135,27 @@ def test_grad(self):
 
 
 class TestSimpleRNNOp1(TestSimpleRNNOp):
+
     def set_attrs(self):
         self.sequence_length = None
 
 
 class TestSimpleRNNOp2(TestSimpleRNNOp):
+
     def set_attrs(self):
         self.sequence_length = None
         self.is_bidirec = True
 
 
 class TestSimpleRNNOp3(TestSimpleRNNOp):
+
     def set_attrs(self):
         self.sequence_length = None
         self.is_test = True
 
 
 class TestSimpleRNNOp4(TestSimpleRNNOp):
+
     def set_attrs(self):
         self.sequence_length = None
         self.is_bidirec = True
@@ -158,6 +163,7 @@ def set_attrs(self):
 
 
 class TestSimpleRNNOp5(TestSimpleRNNOp):
+
     def set_attrs(self):
         self.mode = "RNN_RELU"
 
diff --git a/python/paddle/fluid/tests/unittests/test_size_op.py b/python/paddle/fluid/tests/unittests/test_size_op.py
index 09cd35391bae0..bb64e3e66b24b 100644
--- a/python/paddle/fluid/tests/unittests/test_size_op.py
+++ b/python/paddle/fluid/tests/unittests/test_size_op.py
@@ -20,6 +20,7 @@
 
 
 class TestSizeOp(OpTest):
+
     def setUp(self):
         self.op_type = "size"
         self.shape = []
@@ -36,26 +37,31 @@ def test_check_output(self):
 
 
 class TestRank1Tensor(TestSizeOp):
+
     def config(self):
         self.shape = [2]
 
 
 class TestRank2Tensor(TestSizeOp):
+
     def config(self):
         self.shape = [2, 3]
 
 
 class TestRank3Tensor(TestSizeOp):
+
     def config(self):
         self.shape = [2, 3, 100]
 
 
 class TestLargeTensor(TestSizeOp):
+
     def config(self):
         self.shape = [2**10]
 
 
 class TestSizeAPI(unittest.TestCase):
+
     def test_size_static(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
@@ -74,10 +80,12 @@ def test_size_static(self):
                 "x_2": input_2,
             },
                                    fetch_list=[out_1, out_2])
-            assert (np.array_equal(
-                res_1, np.array([np.size(input_1)]).astype("int64")))
-            assert (np.array_equal(
-                res_2, np.array([np.size(input_2)]).astype("int64")))
+            assert (np.array_equal(res_1,
+                                   np.array([np.size(input_1)
+                                             ]).astype("int64")))
+            assert (np.array_equal(res_2,
+                                   np.array([np.size(input_2)
+                                             ]).astype("int64")))
 
     def test_size_imperative(self):
         paddle.disable_static(paddle.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 34f296c4b6354..3b341d7936676 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -29,6 +29,7 @@
 # Situation 1: starts(list, no tensor), ends(list, no tensor)
 # 1.1 without attr(decrease)
 class TestSliceOp(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -57,6 +58,7 @@ def test_check_grad_normal(self):
 
 
 class TestCase1(TestSliceOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float64")
         self.starts = [-3, 0, 2]
@@ -67,6 +69,7 @@ def config(self):
 
 
 class TestCase2(TestSliceOp):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float64")
         self.starts = [-3, 0, 2]
@@ -78,6 +81,7 @@ def config(self):
 
 # 1.2 with attr(decrease)
 class TestSliceOp_decs_dim(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -108,6 +112,7 @@ def test_check_grad_normal(self):
 
 
 class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float64")
         self.starts = [1, 0, 2]
@@ -119,6 +124,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float64")
         self.starts = [-1, 0, 2]
@@ -130,6 +136,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 7]).astype("float64")
         self.starts = [0, 1, 2, 3]
@@ -141,6 +148,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float64")
         self.starts = [-1]
@@ -152,6 +160,7 @@ def config(self):
 
 
 class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float64")
         self.starts = [0, 1, 2, 3]
@@ -165,6 +174,7 @@ def config(self):
 # Situation 2: starts(list, have tensor), ends(list, no tensor)
 # without attr(decrease)
 class TestSliceOp_starts_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -203,6 +213,7 @@ def test_check_grad_normal(self):
 # Situation 2: starts(list, have tensor), ends(list, no tensor)
 #  with attr(decrease)
 class TestSliceOp_decs_dim_starts_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -243,6 +254,7 @@ def test_check_grad_normal(self):
 
 class TestSliceOp_decs_dim_5_starts_ListTensor(
         TestSliceOp_decs_dim_starts_ListTensor):
+
     def config(self):
         self.input = np.random.random([3, 4, 5, 6]).astype("float64")
         self.starts = [-1]
@@ -258,13 +270,13 @@ def config(self):
 # Situation 3: starts(tensor), ends(list, no tensor)
 # with attr(decrease)
 class TestSliceOp_decs_dim_starts_OneTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int32")
         }
         self.outputs = {'Out': self.out}
         self.attrs = {
@@ -294,16 +306,15 @@ def test_check_grad_normal(self):
 # Situation 4: starts(tensor), ends(tensor)
 #  without attr(decrease)
 class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
 
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int64"),
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int64"),
+            "EndsTensor": np.array(self.ends, dtype="int32")
         }
         self.outputs = {'Out': self.out}
         self.attrs = {
@@ -331,15 +342,14 @@ def test_check_grad_normal(self):
 # Situation 5: starts(tensor), ends(tensor)
 #  with attr(decrease)
 class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int32"),
+            "EndsTensor": np.array(self.ends, dtype="int32")
         }
         self.outputs = {'Out': self.out}
         self.attrs = {
@@ -369,6 +379,7 @@ def test_check_grad_normal(self):
 # Situation 6: starts(tensor), ends(list, have tensor)
 # without attr(decrease)
 class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -380,8 +391,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
+            "StartsTensor": np.array(self.starts, dtype="int32"),
             'EndsTensorList': ends_tensor
         }
         self.outputs = {'Out': self.out}
@@ -413,6 +423,7 @@ def test_check_grad_normal(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -442,13 +453,15 @@ def test_check_output(self):
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Input'], 'Out', max_relative_error=0.006)
+            self.check_grad_with_place(place, ['Input'],
+                                       'Out',
+                                       max_relative_error=0.006)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16_2(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -478,14 +491,14 @@ def test_check_output(self):
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Out',
-                max_relative_error=0.006,
-                numeric_grad_delta=0.5)
+            self.check_grad_with_place(place, ['Input'],
+                                       'Out',
+                                       max_relative_error=0.006,
+                                       numeric_grad_delta=0.5)
 
 
 class TestBF16(OpTest):
+
     def setUp(self):
         self.op_type = "slice"
         self.config()
@@ -516,30 +529,38 @@ def test_check_grad_normal(self):
 
 # Test python API
 class TestSliceAPI(unittest.TestCase):
+
     def test_1(self):
         input = np.random.random([3, 4, 5, 6]).astype("float64")
         minus_1 = fluid.layers.fill_constant([1], "int32", -1)
         minus_3 = fluid.layers.fill_constant([1], "int64", -3)
-        starts = fluid.layers.data(
-            name='starts', shape=[1, 3], append_batch_size=False)
-        ends = fluid.layers.data(
-            name='ends', shape=[3], append_batch_size=False)
-
-        x = fluid.layers.data(
-            name="x",
-            shape=[3, 4, 5, 6],
-            append_batch_size=False,
-            dtype="float64")
+        starts = fluid.layers.data(name='starts',
+                                   shape=[1, 3],
+                                   append_batch_size=False)
+        ends = fluid.layers.data(name='ends',
+                                 shape=[3],
+                                 append_batch_size=False)
+
+        x = fluid.layers.data(name="x",
+                              shape=[3, 4, 5, 6],
+                              append_batch_size=False,
+                              dtype="float64")
 
         # value_int64 is greater than 2147483647 which is the max of int32
         value_int64 = fluid.layers.fill_constant([1], "int64", 2147483648)
 
-        out_1 = paddle.slice(
-            x, axes=[0, 1, 2], starts=[-3, 0, 2], ends=[value_int64, 100, -1])
-        out_2 = paddle.slice(
-            x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, -1])
-        out_3 = paddle.slice(
-            x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, minus_1])
+        out_1 = paddle.slice(x,
+                             axes=[0, 1, 2],
+                             starts=[-3, 0, 2],
+                             ends=[value_int64, 100, -1])
+        out_2 = paddle.slice(x,
+                             axes=[0, 1, 3],
+                             starts=[minus_3, 0, 2],
+                             ends=[3, 100, -1])
+        out_3 = paddle.slice(x,
+                             axes=[0, 1, 3],
+                             starts=[minus_3, 0, 2],
+                             ends=[3, 100, minus_1])
         out_4 = paddle.slice(x, axes=[0, 1, 2], starts=starts, ends=ends)
 
         out_5 = x[-3:3, 0:100, 2:-1]
@@ -566,19 +587,17 @@ def test_1(self):
 
 
 class TestSliceApiWithTensor(unittest.TestCase):
+
     def test_starts_ends_is_tensor(self):
         with paddle.fluid.dygraph.guard():
             a = paddle.rand(shape=[4, 5, 6], dtype='float32')
             axes = [0, 1, 2]
             starts = [-3, 0, 2]
             ends = [3, 2, 4]
-            a_1 = paddle.slice(
-                a,
-                axes=axes,
-                starts=paddle.to_tensor(
-                    starts, dtype='int32'),
-                ends=paddle.to_tensor(
-                    ends, dtype='int32'))
+            a_1 = paddle.slice(a,
+                               axes=axes,
+                               starts=paddle.to_tensor(starts, dtype='int32'),
+                               ends=paddle.to_tensor(ends, dtype='int32'))
             a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends)
 
             self.assertTrue(np.array_equal(a_1.numpy(), a_2.numpy()))
@@ -601,6 +620,7 @@ def test_bool_tensor(self):
 
 
 class TestSliceApiEager(unittest.TestCase):
+
     def test_slice_api(self):
         with paddle.fluid.dygraph.guard():
             with _test_eager_guard():
@@ -611,11 +631,10 @@ def test_slice_api(self):
                 ends = [3, 2, 4]
                 a_1 = paddle.slice(a, axes=axes, starts=starts, ends=ends)
 
-                a_2 = paddle.slice(
-                    a,
-                    axes=axes,
-                    starts=paddle.to_tensor(starts),
-                    ends=paddle.to_tensor(ends))
+                a_2 = paddle.slice(a,
+                                   axes=axes,
+                                   starts=paddle.to_tensor(starts),
+                                   ends=paddle.to_tensor(ends))
 
                 a_1.backward()
                 grad_truth = paddle.zeros_like(a)
@@ -626,6 +645,7 @@ def test_slice_api(self):
 
 
 class TestSliceApiWithLoDTensorArray(unittest.TestCase):
+
     def setUp(self):
         self.shape = (3, 4)
         self.data = np.random.random(size=self.shape).astype('float32')
@@ -634,18 +654,16 @@ def setUp(self):
         self.end = 2
         self.axis = 1
 
-        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        self.place = fluid.CUDAPlace(
+            0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
         self.exe = fluid.Executor(self.place)
 
     def set_program_and_run(self, main_program, case_num):
         with fluid.program_guard(main_program):
             x = [
-                fluid.data(
-                    name='x0', shape=self.shape, dtype="float32"), fluid.data(
-                        name='x1', shape=self.shape, dtype="float32"),
-                fluid.data(
-                    name='x2', shape=self.shape, dtype="float32")
+                fluid.data(name='x0', shape=self.shape, dtype="float32"),
+                fluid.data(name='x1', shape=self.shape, dtype="float32"),
+                fluid.data(name='x2', shape=self.shape, dtype="float32")
             ]
 
             for each_x in x:
@@ -663,14 +681,16 @@ def set_program_and_run(self, main_program, case_num):
                 end = fluid.layers.array_length(
                     arr) - 1  # dtype of end is int64
                 self.sliced_arr = slice_arr = arr[self.start:end]
-                output, _ = fluid.layers.tensor_array_to_tensor(
-                    slice_arr, axis=self.axis, use_stack=True)
+                output, _ = fluid.layers.tensor_array_to_tensor(slice_arr,
+                                                                axis=self.axis,
+                                                                use_stack=True)
             elif case_num == 3:
                 value_int64 = fluid.layers.fill_constant([1], "int64",
                                                          2147483648)
                 self.sliced_arr = slice_arr = arr[self.start:value_int64]
-                output, _ = fluid.layers.tensor_array_to_tensor(
-                    slice_arr, axis=self.axis, use_stack=True)
+                output, _ = fluid.layers.tensor_array_to_tensor(slice_arr,
+                                                                axis=self.axis,
+                                                                use_stack=True)
 
             loss = fluid.layers.reduce_sum(output)
             fluid.backward.append_backward(loss)
@@ -703,9 +723,8 @@ def test_case_2(self):
             self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY)
         self.assertEqual(self.sliced_arr.shape, self.shape)
         self.assertTrue(
-            np.array_equal(
-                self.out, np.stack(
-                    [self.data, self.data], axis=self.axis)))
+            np.array_equal(self.out,
+                           np.stack([self.data, self.data], axis=self.axis)))
         self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data)))
         self.assertTrue(np.array_equal(self.g_x1, np.ones_like(self.data)))
         self.assertTrue(np.array_equal(self.g_x2, np.zeros_like(self.data)))
@@ -720,14 +739,14 @@ def test_case_3(self):
         self.assertTrue(
             np.array_equal(
                 self.out,
-                np.stack(
-                    [self.data, self.data, self.data], axis=self.axis)))
+                np.stack([self.data, self.data, self.data], axis=self.axis)))
         self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data)))
         self.assertTrue(np.array_equal(self.g_x1, np.ones_like(self.data)))
         self.assertTrue(np.array_equal(self.g_x2, np.ones_like(self.data)))
 
 
 class TestImperativeVarBaseGetItem(unittest.TestCase):
+
     def test_getitem_with_long(self):
         with fluid.dygraph.guard():
             data = np.random.random((2, 80, 16128)).astype('float32')
@@ -739,6 +758,7 @@ def test_getitem_with_long(self):
             self.assertEqual(sliced.shape, [2, 78, 78])
 
     def test_getitem_with_float(self):
+
         def test_float_in_slice_item():
             with fluid.dygraph.guard():
                 data = np.random.random((2, 80, 16128)).astype('float32')
@@ -757,6 +777,7 @@ def test_float_in_index():
 
 
 class TestInferShape(unittest.TestCase):
+
     def test(self):
         x = paddle.ones(shape=[3, 4, 5])
         x.desc.set_shape([3, -1, 5])
@@ -772,7 +793,9 @@ def test_axis_less_than_zero(self):
             x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4])
             x = paddle.to_tensor(x_arr)
 
-            pp_slice = paddle.slice(x, [100, ], [0], [1])
+            pp_slice = paddle.slice(x, [
+                100,
+            ], [0], [1])
             np_slice = x_arr[:, :, 0:1]
             self.assertTrue(np.array_equal(pp_slice, np_slice))
 
@@ -784,13 +807,9 @@ def test_axis_less_than_zero(self):
             x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0)))
 
             starts = paddle.to_tensor(
-                np.reshape(
-                    np.array(
-                        [], dtype=np.int32), (0, )))
+                np.reshape(np.array([], dtype=np.int32), (0, )))
             ends = paddle.to_tensor(
-                np.reshape(
-                    np.array(
-                        [], dtype=np.int32), (0, )))
+                np.reshape(np.array([], dtype=np.int32), (0, )))
 
             with self.assertRaises(ValueError):
                 paddle.slice(x, [-1000000], starts, ends)
@@ -808,15 +827,15 @@ def test_axis_less_than_zero(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestImperativeCUDAPinnedInput(unittest.TestCase):
+
     def test_input_cuda_pinned_var(self):
         with fluid.dygraph.guard():
             data = np.random.random((2, 80, 16128)).astype('float32')
-            var = core.VarBase(
-                value=data,
-                name='',
-                persistable=False,
-                place=fluid.CUDAPinnedPlace(),
-                zero_copy=False)
+            var = core.VarBase(value=data,
+                               name='',
+                               persistable=False,
+                               place=fluid.CUDAPinnedPlace(),
+                               zero_copy=False)
             sliced = var[:, 10:, :var.shape[1]]
             self.assertEqual(sliced.shape, [2, 70, 80])
 
diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
index b16c744603534..d9cb3e2073b93 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
@@ -23,14 +23,15 @@
 
 
 class TestSliceVar(unittest.TestCase):
+
     def check_slice_output(self, shapes, expected_sizes, min_size):
         var_list = []
         program = fluid.Program()
         for shape in shapes:
-            var = program.global_block().create_var(
-                name=str(random.randint(10000, 99999)),
-                persistable=True,
-                shape=shape)
+            var = program.global_block().create_var(name=str(
+                random.randint(10000, 99999)),
+                                                    persistable=True,
+                                                    shape=shape)
             var_list.append(var)
         blocks = slice_variable(var_list, 10, min_size)
         all_sizes = []
@@ -43,12 +44,12 @@ def check_slice_output(self, shapes, expected_sizes, min_size):
 
     def test_1k(self):
         shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]]
-        expected_sizes = [
-            [15], [1024],
-            [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784],
-            [2040, 2040, 2040, 2040],
-            [1150, 1150, 1150, 1150, 1150, 1150, 1100]
-        ]
+        expected_sizes = [[15], [1024],
+                          [
+                              2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352,
+                              2352, 784
+                          ], [2040, 2040, 2040, 2040],
+                          [1150, 1150, 1150, 1150, 1150, 1150, 1100]]
 
         self.check_slice_output(shapes, expected_sizes, 1024)
 
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
index 74409c8671059..38cf45bfcc5ca 100644
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
@@ -40,6 +40,7 @@ def smooth_l1_loss_np(input, label, reduction='mean', delta=1.0):
 
 
 class SmoothL1Loss(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
 
@@ -48,8 +49,8 @@ def test_smooth_l1_loss_mean(self):
         label_np = np.random.random([100, 200]).astype(np.float32)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype='float32')
             label = fluid.data(name='label', shape=[100, 200], dtype='float32')
@@ -66,9 +67,8 @@ def test_smooth_l1_loss_mean(self):
             self.assertIsNotNone(static_ret)
         with fluid.dygraph.guard():
             smooth_l1_loss = paddle.nn.loss.SmoothL1Loss()
-            dy_ret = smooth_l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = smooth_l1_loss(fluid.dygraph.to_variable(input_np),
+                                    fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = smooth_l1_loss_np(input_np, label_np, reduction='mean')
@@ -81,8 +81,8 @@ def test_smooth_l1_loss_sum(self):
         label_np = np.random.random([100, 200]).astype(np.float32)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype='float32')
             label = fluid.data(name='label', shape=[100, 200], dtype='float32')
@@ -99,9 +99,8 @@ def test_smooth_l1_loss_sum(self):
             self.assertIsNotNone(static_ret)
         with fluid.dygraph.guard():
             smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='sum')
-            dy_ret = smooth_l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = smooth_l1_loss(fluid.dygraph.to_variable(input_np),
+                                    fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = smooth_l1_loss_np(input_np, label_np, reduction='sum')
@@ -114,8 +113,8 @@ def test_smooth_l1_loss_none(self):
         label_np = np.random.random([100, 200]).astype(np.float32)
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype='float32')
             label = fluid.data(name='label', shape=[100, 200], dtype='float32')
@@ -132,9 +131,8 @@ def test_smooth_l1_loss_none(self):
             self.assertIsNotNone(static_ret)
         with fluid.dygraph.guard():
             smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(reduction='none')
-            dy_ret = smooth_l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = smooth_l1_loss(fluid.dygraph.to_variable(input_np),
+                                    fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = smooth_l1_loss_np(input_np, label_np, reduction='none')
@@ -148,8 +146,8 @@ def test_smooth_l1_loss_delta(self):
         delta = np.random.rand()
         prog = fluid.Program()
         startup_prog = fluid.Program()
-        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
             input = fluid.data(name='input', shape=[100, 200], dtype='float32')
             label = fluid.data(name='label', shape=[100, 200], dtype='float32')
@@ -166,9 +164,8 @@ def test_smooth_l1_loss_delta(self):
             self.assertIsNotNone(static_ret)
         with fluid.dygraph.guard():
             smooth_l1_loss = paddle.nn.loss.SmoothL1Loss(delta=delta)
-            dy_ret = smooth_l1_loss(
-                fluid.dygraph.to_variable(input_np),
-                fluid.dygraph.to_variable(label_np))
+            dy_ret = smooth_l1_loss(fluid.dygraph.to_variable(input_np),
+                                    fluid.dygraph.to_variable(label_np))
             dy_ret_value = dy_ret.numpy()
             self.assertIsNotNone(dy_ret_value)
         expected = smooth_l1_loss_np(input_np, label_np, delta=delta)
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
index 63e8568048d13..b102236380dd9 100644
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
@@ -29,6 +29,7 @@ def smooth_l1_loss_forward(val, sigma2):
 
 
 class TestSmoothL1LossOp1(OpTest):
+
     def setUp(self):
         self.op_type = "smooth_l1_loss"
         dims = (5, 20)
@@ -51,27 +52,28 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'], 'Out', max_relative_error=0.02, check_eager=True)
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        max_relative_error=0.02,
+                        check_eager=True)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set("X"),
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.03,
+                        no_grad_set=set("X"),
+                        check_eager=True)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set('Y'),
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.03,
+                        no_grad_set=set('Y'),
+                        check_eager=True)
 
 
 class TestSmoothL1LossOp2(OpTest):
+
     def setUp(self):
         self.op_type = "smooth_l1_loss"
         dims = (5, 20)
@@ -98,34 +100,35 @@ def test_check_output(self):
         self.check_output(check_eager=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'], 'Out', max_relative_error=0.03, check_eager=True)
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        max_relative_error=0.03,
+                        check_eager=True)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set(['X', 'InsideWeight', 'OutsideWeight']),
-            check_eager=True)
+        self.check_grad(['Y'],
+                        'Out',
+                        max_relative_error=0.03,
+                        no_grad_set=set(['X', 'InsideWeight', 'OutsideWeight']),
+                        check_eager=True)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=0.03,
-            no_grad_set=set(['Y', 'InsideWeight', 'OutsideWeight']),
-            check_eager=True)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.03,
+                        no_grad_set=set(['Y', 'InsideWeight', 'OutsideWeight']),
+                        check_eager=True)
 
 
 class TestSmoothL1LossOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             # The input type of accuracy_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, fluid.layers.smooth_l1, x1, y1)
             # The input dtype of accuracy_op must be float32 or float64.
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32")
diff --git a/python/paddle/fluid/tests/unittests/test_softmax2d.py b/python/paddle/fluid/tests/unittests/test_softmax2d.py
index 4879e9a0efbf0..cb851c771b8c6 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax2d.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax2d.py
@@ -21,6 +21,7 @@
 
 
 class TestSoftmax2DAPI(unittest.TestCase):
+
     def setUp(self):
         self.shape = [2, 6, 5, 4]
         self.x_np = np.random.uniform(-1, 1, self.shape).astype('float64')
@@ -50,6 +51,7 @@ def test_dygraph_api(self):
 
 
 class TestSoftmax2DShape(TestSoftmax2DAPI):
+
     def setUp(self):
         self.shape = [2, 6, 4]
         self.x_np = np.random.uniform(-1, 1, self.shape).astype('float64')
@@ -59,6 +61,7 @@ def setUp(self):
 
 
 class TestSoftmax2DFloat32(TestSoftmax2DAPI):
+
     def setUp(self):
         self.shape = [2, 3, 4]
         self.x_np = np.random.uniform(-1, 1, self.shape).astype('float32')
@@ -68,6 +71,7 @@ def setUp(self):
 
 
 class TestSoftmax2DCPU(TestSoftmax2DAPI):
+
     def setUp(self):
         self.shape = [2, 6, 4]
         self.x_np = np.random.uniform(-1, 1, self.shape).astype('float64')
@@ -76,6 +80,7 @@ def setUp(self):
 
 
 class TestSoftmax2DRepr(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
             else paddle.CPUPlace()
@@ -88,6 +93,7 @@ def test_extra_repr(self):
 
 
 class TestSoftmax2DError(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
             else paddle.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_op.py b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_op.py
index cff06f9025fb1..3aa1cafd92f32 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_op.py
@@ -40,6 +40,7 @@ def _get_softmax(x, mask, fp16=True):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxMaskFuseOp(OpTest):
+
     def setUp(self):
         self.op_type = "fused_softmax_mask"
         x = np.random.random((1, 1, 8, 32))
@@ -65,6 +66,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxMaskFuseOp0(OpTest):
+
     def setUp(self):
         self.op_type = "fused_softmax_mask"
         x = np.random.random((1, 1, 8, 32)).astype("float16")
@@ -84,11 +86,13 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestDropoutBiasFuseOp3(unittest.TestCase):
+
     def test_static_result(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input_x = fluid.data(name="x", shape=[1, 1, 8, 32], dtype="float32")
-            input_mask = fluid.data(
-                name="mask", shape=[1, 1, 8, 32], dtype="float32")
+            input_mask = fluid.data(name="mask",
+                                    shape=[1, 1, 8, 32],
+                                    dtype="float32")
             rst = incubate.softmax_mask_fuse(input_x, input_mask)
 
             x_in_np = np.random.random((1, 1, 8, 32)).astype("float32")
@@ -98,8 +102,10 @@ def test_static_result(self):
 
             exe = fluid.Executor(fluid.CUDAPlace(0))
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"x": x_in_np,
-                                    "mask": mask_in_np},
+                              feed={
+                                  "x": x_in_np,
+                                  "mask": mask_in_np
+                              },
                               fetch_list=[rst])
             self.assertTrue(np.allclose(fetches[0], rst_np))
 
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
index a73ebd73e4946..53128e51298ab 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
@@ -41,6 +41,7 @@ def _get_softmax_upper(x, fp16=True):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxMaskFuseOp(OpTest):
+
     def setUp(self):
         self.op_type = "fused_softmax_mask_upper_triangle"
         x = np.random.random((1, 4, 32, 32)).astype("float16")
@@ -58,6 +59,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxMaskFuseOp1(OpTest):
+
     def setUp(self):
         self.op_type = "fused_softmax_mask_upper_triangle"
         x = np.random.random((1, 4, 32, 32))
@@ -89,8 +91,9 @@ def setUp(self):
     def test_static(self):
         for dtype in self.dtypes:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
-                input_x = fluid.data(
-                    name="x", shape=[1, 4, 32, 32], dtype=dtype)
+                input_x = fluid.data(name="x",
+                                     shape=[1, 4, 32, 32],
+                                     dtype=dtype)
                 rst = incubate.softmax_mask_fuse_upper_triangle(input_x)
 
                 x_in_np = np.random.random((1, 4, 32, 32)).astype(dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 4f1c37a242474..8618e046893f3 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -45,6 +45,7 @@ def ref_softmax(x, axis=None, dtype=None):
 
 
 class TestSoftmaxOp(OpTest):
+
     def get_x_shape(self):
         return [10, 10]
 
@@ -96,19 +97,20 @@ def test_check_grad(self):
                     max_relative_error=0.01,
                     check_dygraph=(self.use_mkldnn == False))
         else:
-            self.check_grad(
-                ["X"],
-                "Out",
-                max_relative_error=0.01,
-                check_dygraph=(self.use_mkldnn == False))
+            self.check_grad(["X"],
+                            "Out",
+                            max_relative_error=0.01,
+                            check_dygraph=(self.use_mkldnn == False))
 
 
 class TestSoftmaxOp2(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
 
 class TestSoftmaxOp3(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -117,6 +119,7 @@ def get_axis(self):
 
 
 class TestSoftmaxOp4(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -125,6 +128,7 @@ def get_axis(self):
 
 
 class TestSoftmaxOp5(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -133,6 +137,7 @@ def get_axis(self):
 
 
 class TestSoftmaxOp6(TestSoftmaxOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -143,6 +148,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
+
     def init_kernel_type(self):
         self.use_cudnn = True
 
@@ -150,6 +156,7 @@ def init_kernel_type(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -157,6 +164,7 @@ def get_x_shape(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -167,6 +175,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp4(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -177,6 +186,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -187,6 +197,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp6(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -197,6 +208,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp7(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5, 6]
 
@@ -204,6 +216,7 @@ def get_x_shape(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp8(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5, 6]
 
@@ -214,6 +227,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp9(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5, 6]
 
@@ -224,6 +238,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp10(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5, 6]
 
@@ -234,6 +249,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp11(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5, 6]
 
@@ -244,6 +260,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp12(TestSoftmaxCUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5, 6]
 
@@ -254,6 +271,7 @@ def get_axis(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxFP16Op(TestSoftmaxOp):
+
     def init_kernel_type(self):
         self.dtype = np.float16
 
@@ -271,6 +289,7 @@ def test_check_grad(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxFP16Op2(TestSoftmaxFP16Op):
+
     def get_x_shape(self):
         return [2, 3, 4, 10]
 
@@ -278,6 +297,7 @@ def get_x_shape(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
+
     def init_kernel_type(self):
         self.use_cudnn = True
         self.dtype = np.float16
@@ -292,6 +312,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
+
     def get_x_shape(self):
         return [2, 3, 4, 5]
 
@@ -299,6 +320,7 @@ def get_x_shape(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "softmax"
         self.use_cudnn = self.init_cudnn()
@@ -326,30 +348,31 @@ def init_cudnn(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(
-            place, check_dygraph=(self.use_mkldnn == False))
+        self.check_output_with_place(place,
+                                     check_dygraph=(self.use_mkldnn == False))
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place, ["X"],
-            "Out",
-            numeric_grad_delta=0.05,
-            check_dygraph=(self.use_mkldnn == False))
+        self.check_grad_with_place(place, ["X"],
+                                   "Out",
+                                   numeric_grad_delta=0.05,
+                                   check_dygraph=(self.use_mkldnn == False))
 
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
     "core is not compiled with CUDA and cudnn version need larger than 8.1.0")
 class TestSoftmaxBF16CUDNNOp(TestSoftmaxBF16Op):
+
     def init_cudnn(self):
         return True
 
 
 class TestSoftmaxAPI(unittest.TestCase):
+
     def setUp(self):
-        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
+        self.place = paddle.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
         self.x_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype('float32')
         self.out_ref = np.apply_along_axis(stable_softmax, -1, self.x_np)
         self.executed_api()
@@ -405,16 +428,19 @@ def test_error(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, self.softmax, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(
-                name='x_int32', shape=[2, 3], dtype='int32')
+            x_int32 = paddle.fluid.data(name='x_int32',
+                                        shape=[2, 3],
+                                        dtype='int32')
             self.assertRaises(TypeError, self.softmax, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(
-                name='x_fp16', shape=[2, 3], dtype='float16')
+            x_fp16 = paddle.fluid.data(name='x_fp16',
+                                       shape=[2, 3],
+                                       dtype='float16')
             self.softmax(x_fp16)
 
 
 class TestSoftmaxInplaceAPI(TestSoftmaxAPI):
+
     def executed_api(self):
         self.softmax = F.softmax_
 
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index 75d09e3df0c30..d4cb658d96aa0 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -112,8 +112,10 @@ def setUp(self):
         else:
             axis_dim = self.shape[self.axis]
             self.shape[self.axis] = 1
-            labels = np.random.randint(
-                0, axis_dim, self.shape, dtype=self.hard_label_dtype())
+            labels = np.random.randint(0,
+                                       axis_dim,
+                                       self.shape,
+                                       dtype=self.hard_label_dtype())
 
         loss = cross_entropy(softmax, labels, self.soft_label, self.axis,
                              self.ignore_index)
@@ -145,45 +147,48 @@ def test_check_output(self):
     def test_check_grad(self):
         if core.is_compiled_with_rocm():
             if self.python_api is not None:
-                self.check_grad(
-                    ["Logits"],
-                    "Loss",
-                    max_relative_error=5e-1,
-                    check_eager=True)
+                self.check_grad(["Logits"],
+                                "Loss",
+                                max_relative_error=5e-1,
+                                check_eager=True)
             # HIP will have accuracy fail when using float32 in CPU place
             self.check_grad(["Logits"], "Loss", max_relative_error=5e-1)
         else:
             if self.python_api is not None:
-                self.check_grad(
-                    ["Logits"],
-                    "Loss",
-                    numeric_grad_delta=0.001,
-                    check_eager=True)
+                self.check_grad(["Logits"],
+                                "Loss",
+                                numeric_grad_delta=0.001,
+                                check_eager=True)
             self.check_grad(["Logits"], "Loss", numeric_grad_delta=0.001)
 
 
 class TestSoftmaxWithCrossEntropyOpInt32(TestSoftmaxWithCrossEntropyOp):
+
     def hard_label_dtype(self):
         return "int32"
 
 
 class TestSoftmaxWithCrossEntropyOpInt16(TestSoftmaxWithCrossEntropyOp):
+
     def hard_label_dtype(self):
         return "int16"
 
 
 class TestSoftmaxWithCrossEntropyOpInt8(TestSoftmaxWithCrossEntropyOp):
+
     def hard_label_dtype(self):
         return "int8"
 
 
 class TestSoftmaxWithCrossEntropyOpUInt8(TestSoftmaxWithCrossEntropyOp):
+
     def hard_label_dtype(self):
         return "uint8"
 
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_1D(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -199,6 +204,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_1D(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -217,6 +223,7 @@ def initParams(self):
 ##############################################################################
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -232,6 +239,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D_Axis2(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -247,6 +255,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D_Axis3(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -262,6 +271,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D_Axis4(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -286,6 +296,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -301,6 +312,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Axis2(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -316,6 +328,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Axis3(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -331,6 +344,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Axis4(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -355,6 +369,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_Ignore(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -370,6 +385,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_Ignore_Axis(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -385,6 +401,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Ignore(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -400,6 +417,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Ignore_Axis3(
         TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_core_api_without_softmax
@@ -419,6 +437,7 @@ def initParams(self):
 
 
 class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -435,6 +454,7 @@ def initParams(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -482,13 +502,16 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.python_api is not None:
-            self.check_grad(
-                ["Logits"], "Loss", max_relative_error=0.1, check_eager=True)
+            self.check_grad(["Logits"],
+                            "Loss",
+                            max_relative_error=0.1,
+                            check_eager=True)
         self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
 
 
-class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(
-        TestSoftmaxWithCrossEntropyOpFp16):
+class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(TestSoftmaxWithCrossEntropyOpFp16
+                                               ):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -502,8 +525,10 @@ def initParams(self):
 
     def test_check_grad(self):
         if self.python_api is not None:
-            self.check_grad(
-                ["Logits"], "Loss", max_relative_error=0.1, check_eager=True)
+            self.check_grad(["Logits"],
+                            "Loss",
+                            max_relative_error=0.1,
+                            check_eager=True)
         self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
 
 
@@ -533,11 +558,10 @@ def test_check_grad(self):
         if core.is_compiled_with_rocm():
             # HIP will have accuracy fail when using float32 in CPU place
             if self.python_api is not None:
-                self.check_grad(
-                    ["Logits"],
-                    "Loss",
-                    max_relative_error=0.1,
-                    check_eager=True)
+                self.check_grad(["Logits"],
+                                "Loss",
+                                max_relative_error=0.1,
+                                check_eager=True)
             self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
         else:
             if self.python_api is not None:
@@ -564,6 +588,7 @@ def initParams(self):
 
 
 class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -653,8 +678,8 @@ def initParams(self):
         self.use_softmax = True
 
 
-class TestSoftmaxWithCrossEntropyOpAxisDimEqualOne(
-        TestSoftmaxWithCrossEntropyOp):
+class TestSoftmaxWithCrossEntropyOpAxisDimEqualOne(TestSoftmaxWithCrossEntropyOp
+                                                   ):
     """
     Test softmax with cross entropy operator with discreate one-hot labels.
     Given axis != -1
@@ -675,6 +700,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis1(
         TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -690,6 +716,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis2(
         TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -705,6 +732,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis3(
         TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -718,8 +746,9 @@ def initParams(self):
         self.use_softmax = True
 
 
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis1(
-        TestSoftmaxWithCrossEntropyOp2):
+class TestSoftmaxWithCrossEntropyOpSoftLabelAxis1(TestSoftmaxWithCrossEntropyOp2
+                                                  ):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -733,8 +762,9 @@ def initParams(self):
         self.use_softmax = True
 
 
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis2(
-        TestSoftmaxWithCrossEntropyOp2):
+class TestSoftmaxWithCrossEntropyOpSoftLabelAxis2(TestSoftmaxWithCrossEntropyOp2
+                                                  ):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -748,8 +778,9 @@ def initParams(self):
         self.use_softmax = True
 
 
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis3(
-        TestSoftmaxWithCrossEntropyOp2):
+class TestSoftmaxWithCrossEntropyOpSoftLabelAxis3(TestSoftmaxWithCrossEntropyOp2
+                                                  ):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -763,8 +794,9 @@ def initParams(self):
         self.use_softmax = True
 
 
-class TestSoftmaxWithCrossEntropyOpSoftLabelAxis4(
-        TestSoftmaxWithCrossEntropyOp2):
+class TestSoftmaxWithCrossEntropyOpSoftLabelAxis4(TestSoftmaxWithCrossEntropyOp2
+                                                  ):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -780,6 +812,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1(
         TestSoftmaxWithCrossEntropyOp3):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -795,6 +828,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2(
         TestSoftmaxWithCrossEntropyOp3):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -810,6 +844,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3(
         TestSoftmaxWithCrossEntropyOp3):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
@@ -825,6 +860,7 @@ def initParams(self):
 
 class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4(
         TestSoftmaxWithCrossEntropyOp3):
+
     def initParams(self):
         self.op_type = "softmax_with_cross_entropy"
         self.python_api = python_api
diff --git a/python/paddle/fluid/tests/unittests/test_solve_op.py b/python/paddle/fluid/tests/unittests/test_solve_op.py
index fd527ec90f217..99c5eb21db449 100644
--- a/python/paddle/fluid/tests/unittests/test_solve_op.py
+++ b/python/paddle/fluid/tests/unittests/test_solve_op.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -27,6 +28,7 @@
 
 # 2D normal case
 class TestSolveOp(OpTest):
+
     def config(self):
         self.input_x_matrix_shape = [15, 15]
         self.input_y_matrix_shape = [15, 10]
@@ -53,8 +55,9 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out')
 
 
-# x broadcast + 3D batch case 
+# x broadcast + 3D batch case
 class TestSolveOpBatched_case0(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -75,6 +78,7 @@ def test_check_grad_normal(self):
 
 # 3D batch + y vector case
 class TestSolveOpBatched_case1(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -95,6 +99,7 @@ def test_check_grad_normal(self):
 
 # 3D batch + y broadcast case
 class TestSolveOpBatched_case2(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -113,8 +118,9 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02)
 
 
-# x broadcast + 3D batch case 
+# x broadcast + 3D batch case
 class TestSolveOpBatched_case3(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -133,8 +139,9 @@ def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02)
 
 
-# 3D normal batch case 
+# 3D normal batch case
 class TestSolveOpBatched_case4(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -155,6 +162,7 @@ def test_check_grad_normal(self):
 
 # 4D normal batch case
 class TestSolveOpBatched_case5(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -175,6 +183,7 @@ def test_check_grad_normal(self):
 
 # 4D batch + y broadcast case
 class TestSolveOpBatched_case6(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -195,6 +204,7 @@ def test_check_grad_normal(self):
 
 # 5D normal batch case
 class TestSolveOpBatched_case7(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -215,6 +225,7 @@ def test_check_grad_normal(self):
 
 # 5D batch + y broadcast case
 class TestSolveOpBatched_case8(OpTest):
+
     def setUp(self):
         self.op_type = "solve"
         self.dtype = "float64"
@@ -234,16 +245,17 @@ def test_check_grad_normal(self):
 
 
 class TestSolveOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of solve_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, paddle.linalg.solve, x1, y1)
 
-            # The data type of input must be float32 or float64.        
+            # The data type of input must be float32 or float64.
             x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool")
             y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool")
             self.assertRaises(TypeError, paddle.linalg.solve, x2, y2)
@@ -273,6 +285,7 @@ def test_errors(self):
 
 # 2D + vector case, FP64
 class TestSolveOpAPI_1(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(2021)
         self.place = [paddle.CPUPlace()]
@@ -282,10 +295,12 @@ def setUp(self):
 
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            paddle_input_x = fluid.data(
-                name="input_x", shape=[3, 3], dtype=self.dtype)
-            paddle_input_y = fluid.data(
-                name="input_y", shape=[3], dtype=self.dtype)
+            paddle_input_x = fluid.data(name="input_x",
+                                        shape=[3, 3],
+                                        dtype=self.dtype)
+            paddle_input_y = fluid.data(name="input_y",
+                                        shape=[3],
+                                        dtype=self.dtype)
             paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)
 
             np_input_x = np.random.random([3, 3]).astype(self.dtype)
@@ -294,11 +309,12 @@ def check_static_result(self, place):
             np_result = np.linalg.solve(np_input_x, np_input_y)
 
             exe = fluid.Executor(place)
-            fetches = exe.run(
-                fluid.default_main_program(),
-                feed={"input_x": np_input_x,
-                      "input_y": np_input_y},
-                fetch_list=[paddle_result])
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "input_x": np_input_x,
+                                  "input_y": np_input_y
+                              },
+                              fetch_list=[paddle_result])
             self.assertTrue(
                 np.allclose(fetches[0], np.linalg.solve(np_input_x,
                                                         np_input_y)))
@@ -308,6 +324,7 @@ def test_static(self):
             self.check_static_result(place=place)
 
     def test_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -319,8 +336,8 @@ def run(place):
 
             numpy_output = np.linalg.solve(input_x_np, input_y_np)
             paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
 
@@ -330,6 +347,7 @@ def run(place):
 
 # 2D normal case, FP64
 class TestSolveOpAPI_2(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(2021)
         self.place = [paddle.CPUPlace()]
@@ -340,10 +358,12 @@ def setUp(self):
     def check_static_result(self, place):
         paddle.enable_static()
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            paddle_input_x = fluid.data(
-                name="input_x", shape=[10, 10], dtype=self.dtype)
-            paddle_input_y = fluid.data(
-                name="input_y", shape=[10, 4], dtype=self.dtype)
+            paddle_input_x = fluid.data(name="input_x",
+                                        shape=[10, 10],
+                                        dtype=self.dtype)
+            paddle_input_y = fluid.data(name="input_y",
+                                        shape=[10, 4],
+                                        dtype=self.dtype)
             paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)
 
             np_input_x = np.random.random([10, 10]).astype(self.dtype)
@@ -352,11 +372,12 @@ def check_static_result(self, place):
             np_result = np.linalg.solve(np_input_x, np_input_y)
 
             exe = fluid.Executor(place)
-            fetches = exe.run(
-                fluid.default_main_program(),
-                feed={"input_x": np_input_x,
-                      "input_y": np_input_y},
-                fetch_list=[paddle_result])
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "input_x": np_input_x,
+                                  "input_y": np_input_y
+                              },
+                              fetch_list=[paddle_result])
             self.assertTrue(
                 np.allclose(fetches[0], np.linalg.solve(np_input_x,
                                                         np_input_y)))
@@ -366,6 +387,7 @@ def test_static(self):
             self.check_static_result(place=place)
 
     def test_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -377,8 +399,8 @@ def run(place):
 
             numpy_output = np.linalg.solve(input_x_np, input_y_np)
             paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
 
@@ -388,6 +410,7 @@ def run(place):
 
 # 2D normal case, FP32
 class TestSolveOpAPI_3(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(2021)
         self.place = [paddle.CPUPlace()]
@@ -398,10 +421,12 @@ def setUp(self):
     def check_static_result(self, place):
         paddle.enable_static()
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            paddle_input_x = fluid.data(
-                name="input_x", shape=[10, 10], dtype=self.dtype)
-            paddle_input_y = fluid.data(
-                name="input_y", shape=[10, 4], dtype=self.dtype)
+            paddle_input_x = fluid.data(name="input_x",
+                                        shape=[10, 10],
+                                        dtype=self.dtype)
+            paddle_input_y = fluid.data(name="input_y",
+                                        shape=[10, 4],
+                                        dtype=self.dtype)
             paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)
 
             np_input_x = np.random.random([10, 10]).astype(self.dtype)
@@ -410,22 +435,23 @@ def check_static_result(self, place):
             np_result = np.linalg.solve(np_input_x, np_input_y)
 
             exe = fluid.Executor(place)
-            fetches = exe.run(
-                fluid.default_main_program(),
-                feed={"input_x": np_input_x,
-                      "input_y": np_input_y},
-                fetch_list=[paddle_result])
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "input_x": np_input_x,
+                                  "input_y": np_input_y
+                              },
+                              fetch_list=[paddle_result])
             self.assertTrue(
-                np.allclose(
-                    fetches[0],
-                    np.linalg.solve(np_input_x, np_input_y),
-                    rtol=1.e-4))
+                np.allclose(fetches[0],
+                            np.linalg.solve(np_input_x, np_input_y),
+                            rtol=1.e-4))
 
     def test_static(self):
         for place in self.place:
             self.check_static_result(place=place)
 
     def test_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -438,8 +464,7 @@ def run(place):
             numpy_output = np.linalg.solve(input_x_np, input_y_np)
             paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
             self.assertEqual(
-                np.allclose(
-                    numpy_output, paddle_output.numpy(), rtol=1.e-4),
+                np.allclose(numpy_output, paddle_output.numpy(), rtol=1.e-4),
                 True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
@@ -450,6 +475,7 @@ def run(place):
 
 # 3D + y broadcast case, FP64
 class TestSolveOpAPI_4(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(2021)
         self.place = [paddle.CPUPlace()]
@@ -459,10 +485,12 @@ def setUp(self):
 
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            paddle_input_x = fluid.data(
-                name="input_x", shape=[2, 3, 3], dtype=self.dtype)
-            paddle_input_y = fluid.data(
-                name="input_y", shape=[1, 3, 3], dtype=self.dtype)
+            paddle_input_x = fluid.data(name="input_x",
+                                        shape=[2, 3, 3],
+                                        dtype=self.dtype)
+            paddle_input_y = fluid.data(name="input_y",
+                                        shape=[1, 3, 3],
+                                        dtype=self.dtype)
             paddle_result = paddle.linalg.solve(paddle_input_x, paddle_input_y)
 
             np_input_x = np.random.random([2, 3, 3]).astype(self.dtype)
@@ -471,11 +499,12 @@ def check_static_result(self, place):
             np_result = np.linalg.solve(np_input_x, np_input_y)
 
             exe = fluid.Executor(place)
-            fetches = exe.run(
-                fluid.default_main_program(),
-                feed={"input_x": np_input_x,
-                      "input_y": np_input_y},
-                fetch_list=[paddle_result])
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "input_x": np_input_x,
+                                  "input_y": np_input_y
+                              },
+                              fetch_list=[paddle_result])
             self.assertTrue(
                 np.allclose(fetches[0], np.linalg.solve(np_input_x,
                                                         np_input_y)))
@@ -485,6 +514,7 @@ def test_static(self):
             self.check_static_result(place=place)
 
     def test_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -496,8 +526,8 @@ def run(place):
 
             numpy_output = np.linalg.solve(input_x_np, input_y_np)
             paddle_output = paddle.linalg.solve(tensor_input_x, tensor_input_y)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
 
@@ -526,8 +556,10 @@ def check_static_result(self, place):
             exe = fluid.Executor(place)
             try:
                 fetches = exe.run(fluid.default_main_program(),
-                                  feed={"x": input_x_np,
-                                        "y": input_y_np},
+                                  feed={
+                                      "x": input_x_np,
+                                      "y": input_y_np
+                                  },
                                   fetch_list=[result])
             except RuntimeError as ex:
                 print("The mat is singular")
diff --git a/python/paddle/fluid/tests/unittests/test_sort_op.py b/python/paddle/fluid/tests/unittests/test_sort_op.py
index d678aa835d544..2faa2c138d89b 100644
--- a/python/paddle/fluid/tests/unittests/test_sort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sort_op.py
@@ -25,6 +25,7 @@
 
 
 class TestSortOnCPU(unittest.TestCase):
+
     def setUp(self):
         self.place = core.CPUPlace()
 
@@ -33,10 +34,9 @@ def test_api_0(self):
             input = fluid.data(name="input", shape=[2, 3, 4], dtype="float32")
             output = paddle.sort(x=input)
             exe = fluid.Executor(self.place)
-            data = np.array(
-                [[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]],
-                 [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]],
-                dtype='float32')
+            data = np.array([[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]],
+                             [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]],
+                            dtype='float32')
             result, = exe.run(feed={'input': data}, fetch_list=[output])
             np_result = np.sort(result)
             self.assertEqual((result == np_result).all(), True)
@@ -46,16 +46,16 @@ def test_api_1(self):
             input = fluid.data(name="input", shape=[2, 3, 4], dtype="float32")
             output = paddle.sort(x=input, axis=1)
             exe = fluid.Executor(self.place)
-            data = np.array(
-                [[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]],
-                 [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]],
-                dtype='float32')
+            data = np.array([[[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]],
+                             [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]]],
+                            dtype='float32')
             result, = exe.run(feed={'input': data}, fetch_list=[output])
             np_result = np.sort(result, axis=1)
             self.assertEqual((result == np_result).all(), True)
 
 
 class TestSortOnGPU(TestSortOnCPU):
+
     def init_place(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -64,6 +64,7 @@ def init_place(self):
 
 
 class TestSortDygraph(unittest.TestCase):
+
     def setUp(self):
         self.input_data = np.random.rand(10, 10)
         if core.is_compiled_with_cuda():
@@ -87,9 +88,8 @@ def func_api_1(self):
         paddle.disable_static(self.place)
         var_x = paddle.to_tensor(self.input_data)
         out = paddle.sort(var_x, axis=-1)
-        self.assertEqual(
-            (np.sort(
-                self.input_data, axis=-1) == out.numpy()).all(), True)
+        self.assertEqual((np.sort(self.input_data,
+                                  axis=-1) == out.numpy()).all(), True)
         paddle.enable_static()
 
     def test_api_1(self):
diff --git a/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
index 75e1c16231c88..c4304fa920b83 100644
--- a/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
+++ b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
@@ -20,6 +20,7 @@
 
 
 class TestSpaceToDepthOp(OpTest):
+
     @staticmethod
     def helper(in_, width, height, channel, batch, blocksize, forward, out_):
         channel_out = channel // (blocksize * blocksize)
@@ -65,17 +66,18 @@ def init_data(self):
         self.forward = 1
 
     def test_check_output(self):
-        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.core.CPUPlace()
+        place = fluid.core.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.core.CPUPlace()
         self.check_output_with_place(place, 1e-5, None, False)
 
     def test_check_grad(self):
-        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-        ) else fluid.core.CPUPlace()
+        place = fluid.core.CUDAPlace(
+            0) if fluid.core.is_compiled_with_cuda() else fluid.core.CPUPlace()
         self.check_grad_with_place(place, ['X'], 'Out')
 
 
 class TestSpaceToDepthOpBasic(TestSpaceToDepthOp):
+
     def init_data(self):
         self.ori_shape = (32, 8, 6, 6)
         self.infered_shape = (32, 32, 3, 3)
@@ -90,6 +92,7 @@ def init_data(self):
 
 
 class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp):
+
     def init_data(self):
         self.ori_shape = (32, 8, 6, 6)
         self.infered_shape = (32, 32, 3, 3)
@@ -104,6 +107,7 @@ def init_data(self):
 
 
 class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp):
+
     def init_data(self):
         self.ori_shape = (32, 9, 6, 6)
         self.infered_shape = (32, 81, 2, 2)
@@ -118,6 +122,7 @@ def init_data(self):
 
 
 class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp):
+
     def init_data(self):
         self.ori_shape = (32, 9, 9, 6)
         self.infered_shape = (32, 81, 3, 2)
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
index c016a482f36ec..f9e40584ee675 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
@@ -196,6 +196,7 @@ def init_csr_format(batch_size, num_heads, rows, blocksize):
     "core is not compiled with CUDA and cuda version need larger than or equal to 11.3"
 )
 class TestSparseAttentionOp(OpTest):
+
     def config(self):
         self.shape = (1, 1, 16, 16)
         self.blocksize = 4
@@ -272,6 +273,7 @@ def test_check_grad(self):
 
 
 class TestSparseAttentionOpFp32Test(TestSparseAttentionOp):
+
     def config(self):
         self.shape = (1, 1, 8, 16)
         self.blocksize = 2
@@ -280,6 +282,7 @@ def config(self):
 
 
 class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
+
     def config(self):
         self.shape = (2, 2, 32, 8)
         self.blocksize = 8
@@ -292,6 +295,7 @@ def config(self):
     "core is not compiled with CUDA and cuda version need larger than or equal to 11.3"
 )
 class TestSparseAttentionAPI(unittest.TestCase):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.shape = (1, 1, 8, 4)
@@ -314,10 +318,12 @@ def test_static_graph(self):
             offset_shape = (batch_size, num_heads, rows + 1)
             columns_shape = (batch_size, num_heads, int(sparse_nnz_num))
 
-            offset = paddle.static.data(
-                name="Offset", shape=offset_shape, dtype="int32")
-            columns = paddle.static.data(
-                name="Columns", shape=columns_shape, dtype="int32")
+            offset = paddle.static.data(name="Offset",
+                                        shape=offset_shape,
+                                        dtype="int32")
+            columns = paddle.static.data(name="Columns",
+                                         shape=columns_shape,
+                                         dtype="int32")
             key_padding_mask_shape = (self.shape[0], self.shape[2])
             attn_mask_shape = (self.shape[2], self.shape[2])
             if self.use_mask == True:
@@ -325,30 +331,33 @@ def test_static_graph(self):
                     name="KeyPaddingMask",
                     shape=key_padding_mask_shape,
                     dtype=self.dtype)
-                attn_mask = paddle.static.data(
-                    name="AttnMask", shape=attn_mask_shape, dtype=self.dtype)
-                Out = F.sparse_attention(
-                    Q,
-                    K,
-                    V,
-                    offset,
-                    columns,
-                    key_padding_mask=key_padding_mask,
-                    attn_mask=attn_mask)
+                attn_mask = paddle.static.data(name="AttnMask",
+                                               shape=attn_mask_shape,
+                                               dtype=self.dtype)
+                Out = F.sparse_attention(Q,
+                                         K,
+                                         V,
+                                         offset,
+                                         columns,
+                                         key_padding_mask=key_padding_mask,
+                                         attn_mask=attn_mask)
             else:
                 Out = F.sparse_attention(Q, K, V, offset, columns)
 
             Q_np = np.random.random(self.shape).astype(self.dtype)
             K_np = np.random.random(self.shape).astype(self.dtype)
             V_np = np.random.random(self.shape).astype(self.dtype)
-            offset_np, columns_np = init_csr_format(
-                self.shape[0], self.shape[1], self.shape[2], self.blocksize)
+            offset_np, columns_np = init_csr_format(self.shape[0],
+                                                    self.shape[1],
+                                                    self.shape[2],
+                                                    self.blocksize)
             offset_np = offset_np.astype('int32')
             columns_np = columns_np.astype('int32')
 
             # init mask tensor
-            key_padding_mask_np = np.random.randint(
-                0, 2, size=key_padding_mask_shape)
+            key_padding_mask_np = np.random.randint(0,
+                                                    2,
+                                                    size=key_padding_mask_shape)
             attn_mask_np = np.random.randint(0, 2, size=attn_mask_shape)
             key_padding_mask_np = init_mask(key_padding_mask_np)
             attn_mask_np = init_mask(attn_mask_np)
@@ -388,8 +397,7 @@ def test_static_graph(self):
                     Q_np, K_np, V_np, offset_np, columns_np)
 
             self.assertTrue(
-                np.allclose(
-                    fetches_result, expected_result, atol=1e-5))
+                np.allclose(fetches_result, expected_result, atol=1e-5))
 
     def test_dygraph(self):
         paddle.disable_static()
@@ -419,14 +427,13 @@ def test_dygraph(self):
         paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place)
 
         if self.use_mask == True:
-            paddle_result = F.sparse_attention(
-                paddle_query,
-                paddle_key,
-                paddle_value,
-                paddle_offset,
-                paddle_colunmns,
-                key_padding_mask=paddle_kp_mask,
-                attn_mask=paddle_attn_mask)
+            paddle_result = F.sparse_attention(paddle_query,
+                                               paddle_key,
+                                               paddle_value,
+                                               paddle_offset,
+                                               paddle_colunmns,
+                                               key_padding_mask=paddle_kp_mask,
+                                               attn_mask=paddle_attn_mask)
 
             numpy_result, __, __ = ref_batch_sparse_attention(
                 query,
@@ -442,16 +449,16 @@ def test_dygraph(self):
                                                paddle_value, paddle_offset,
                                                paddle_colunmns)
 
-            numpy_result, __, __ = ref_batch_sparse_attention(query, key, value,
-                                                              offset, columns)
+            numpy_result, __, __ = ref_batch_sparse_attention(
+                query, key, value, offset, columns)
             numpy_result = numpy_result.astype(self.dtype)
 
         self.assertTrue(
-            np.allclose(
-                paddle_result.numpy(), numpy_result, atol=1e-5))
+            np.allclose(paddle_result.numpy(), numpy_result, atol=1e-5))
 
 
 class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.shape = (2, 2, 8, 4)
@@ -461,6 +468,7 @@ def setUp(self):
 
 
 class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.shape = (2, 2, 64, 32)
@@ -470,6 +478,7 @@ def setUp(self):
 
 
 class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.shape = (2, 1, 64, 32)
@@ -479,6 +488,7 @@ def setUp(self):
 
 
 class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.shape = (4, 4, 128, 32)
@@ -488,6 +498,7 @@ def setUp(self):
 
 
 class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI):
+
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.shape = (3, 3, 35, 15)
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py
index 1677051ee9db4..623d1b57b3eaa 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,11 +22,13 @@
 
 
 class TestSparseConv(unittest.TestCase):
+
     def test_conv3d(self):
         with _test_eager_guard():
             kernel = [[[[[1], [1], [1]], [[1], [1], [1]], [[1], [1], [1]]]]]
-            dense_kernel = paddle.to_tensor(
-                kernel, dtype='float32', stop_gradient=False)
+            dense_kernel = paddle.to_tensor(kernel,
+                                            dtype='float32',
+                                            stop_gradient=False)
             dense_kernel = paddle.reshape(dense_kernel, [1, 3, 3, 1, 1])
             paddings = [0, 0, 0]
             strides = [1, 1, 1]
@@ -41,11 +43,10 @@ def test_conv3d(self):
             correct_out_values = [[5], [11]]
             sparse_input = core.eager.sparse_coo_tensor(indices, values,
                                                         dense_shape, False)
-            out = paddle.sparse.functional.conv3d(
+            out = paddle.incubate.sparse.nn.functional.conv3d(
                 sparse_input,
                 dense_kernel,
-                bias=paddle.to_tensor(
-                    bias, dtype='float32'),
+                bias=paddle.to_tensor(bias, dtype='float32'),
                 stride=strides,
                 padding=paddings,
                 dilation=dilations,
@@ -61,10 +62,11 @@ def test_subm_conv3d(self):
             indices = paddle.to_tensor(indices, dtype='int32')
             values = paddle.to_tensor(values, dtype='float32')
             dense_shape = [1, 1, 3, 4, 1]
-            sparse_x = paddle.sparse.sparse_coo_tensor(
+            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                 indices, values, dense_shape, stop_gradient=True)
             weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32')
-            y = paddle.sparse.functional.subm_conv3d(sparse_x, weight)
+            y = paddle.incubate.sparse.nn.functional.subm_conv3d(
+                sparse_x, weight)
             assert np.array_equal(sparse_x.indices().numpy(),
                                   y.indices().numpy())
 
@@ -78,16 +80,16 @@ def test_Conv3D(self):
             values = paddle.to_tensor(values, dtype='float32')
             dense_shape = [1, 1, 3, 4, 1]
             correct_out_values = [[4], [10]]
-            sparse_input = paddle.sparse.sparse_coo_tensor(indices, values,
-                                                           dense_shape, False)
+            sparse_input = paddle.incubate.sparse.sparse_coo_tensor(
+                indices, values, dense_shape, False)
 
-            sparse_conv3d = paddle.sparse.Conv3D(
+            sparse_conv3d = paddle.incubate.sparse.nn.Conv3D(
                 1, 1, (1, 3, 3), data_format='NDHWC')
             sparse_out = sparse_conv3d(sparse_input)
             #test errors
             with self.assertRaises(ValueError):
                 #Currently, only support data_format='NDHWC'
-                conv3d = paddle.sparse.SubmConv3D(
+                conv3d = paddle.incubate.sparse.nn.SubmConv3D(
                     1, 1, (1, 3, 3), data_format='NCDHW')
 
     def test_SubmConv3D(self):
@@ -98,10 +100,10 @@ def test_SubmConv3D(self):
             values = paddle.to_tensor(values, dtype='float32')
             dense_shape = [1, 1, 3, 4, 1]
             correct_out_values = [[4], [10]]
-            sparse_input = paddle.sparse.sparse_coo_tensor(indices, values,
-                                                           dense_shape, False)
+            sparse_input = paddle.incubate.sparse.sparse_coo_tensor(
+                indices, values, dense_shape, False)
 
-            subm_conv3d = paddle.sparse.SubmConv3D(
+            subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D(
                 1, 1, (1, 3, 3), data_format='NDHWC')
             # test extra_repr
             print(subm_conv3d.extra_repr())
@@ -113,5 +115,5 @@ def test_SubmConv3D(self):
             #test errors
             with self.assertRaises(ValueError):
                 #Currently, only support data_format='NDHWC'
-                conv3d = paddle.sparse.SubmConv3D(
+                conv3d = paddle.incubate.sparse.nn.SubmConv3D(
                     1, 1, (1, 3, 3), data_format='NCDHW')
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py b/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py
index 9cf5eace71bb1..f8bc93f27032b 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestSparseCopy(unittest.TestCase):
+
     def test_copy_sparse_coo(self):
         with _test_eager_guard():
             np_x = [[0, 1.0, 0], [2.0, 0, 0], [0, 3.0, 0]]
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_momentum_op.py b/python/paddle/fluid/tests/unittests/test_sparse_momentum_op.py
index 033dbd250ed61..b71a34f9dfdcc 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_momentum_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -63,6 +63,7 @@ def calculate_sparse_momentum_by_numpy(param,
 
 
 class TestSparseMomentumOp(OpTest):
+
     def setUp(self):
         self.op_type = "sparse_momentum"
         self.dtype = np.float32
@@ -85,18 +86,16 @@ def setUp(self):
         grad = np.random.random(
             (self.batch_size, self.num_classes)).astype(self.dtype)
         if self.axis == 0:
-            index = np.random.randint(
-                0,
-                self.batch_size,
-                size=(self.batch_size // 2, ),
-                dtype=self.index_dtype)
+            index = np.random.randint(0,
+                                      self.batch_size,
+                                      size=(self.batch_size // 2, ),
+                                      dtype=self.index_dtype)
             grad = grad[index]
         else:
-            index = np.random.randint(
-                0,
-                self.num_classes,
-                size=(self.num_classes // 2, ),
-                dtype=self.index_dtype)
+            index = np.random.randint(0,
+                                      self.num_classes,
+                                      size=(self.num_classes // 2, ),
+                                      dtype=self.index_dtype)
             grad = grad[:, index]
         velocity = np.random.random(
             (self.batch_size, self.num_classes)).astype(self.dtype)
@@ -128,19 +127,25 @@ def setUp(self):
         }
 
         self.inputs = {
-            'Param': param.astype("float16") if self.multi_precision else param,
-            'Velocity': velocity.astype("float32")
-            if self.multi_precision else velocity,
-            'LearningRate': learning_rate.astype("float32")
+            'Param':
+            param.astype("float16") if self.multi_precision else param,
+            'Velocity':
+            velocity.astype("float32") if self.multi_precision else velocity,
+            'LearningRate':
+            learning_rate.astype("float32")
             if self.multi_precision else learning_rate,
-            'Grad': grad.astype("float16") if self.multi_precision else grad,
-            'Index': index,
-            'Axis': np.array(self.axis).astype(np.int32),
+            'Grad':
+            grad.astype("float16") if self.multi_precision else grad,
+            'Index':
+            index,
+            'Axis':
+            np.array(self.axis).astype(np.int32),
         }
         self.outputs = {
-            'ParamOut': param_out.astype("float16")
-            if self.multi_precision else param_out,
-            'VelocityOut': velocity_out.astype("float32")
+            'ParamOut':
+            param_out.astype("float16") if self.multi_precision else param_out,
+            'VelocityOut':
+            velocity_out.astype("float32")
             if self.multi_precision else velocity_out,
         }
 
@@ -163,39 +168,45 @@ def init_use_nesterov(self):
         pass
 
     def test_check_output(self):
-        self.check_output(
-            atol=5e-3 if self.multi_precision else 1e-5, check_eager=True)
+        self.check_output(atol=5e-3 if self.multi_precision else 1e-5,
+                          check_eager=True)
 
 
 class TestSparseMomentumOpDtype1(TestSparseMomentumOp):
+
     def init_dtype(self):
         self.dtype = np.float32
         self.index_dtype = np.int64
 
 
 class TestSparseMomentumOpDtype2(TestSparseMomentumOp):
+
     def init_dtype(self):
         self.dtype = np.float64
         self.index_dtype = np.int32
 
 
 class TestSparseMomentumOpDtype3(TestSparseMomentumOp):
+
     def init_dtype(self):
         self.dtype = np.float64
         self.index_dtype = np.int64
 
 
 class TestSparseMomentumOpAxis(TestSparseMomentumOp):
+
     def init_axis(self):
         self.axis = 1
 
 
 class TestSparseMomentumOpNesterov(TestSparseMomentumOp):
+
     def init_use_nesterov(self):
         self.use_nesterov = True
 
 
 class TestSparseMomentumOpMultiPrecision(TestSparseMomentumOp):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.index_dtype = np.int32
@@ -208,6 +219,7 @@ def init_use_nesterov(self):
 
 
 class TestSparseMomentumOpMultiPrecision1(TestSparseMomentumOp):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.index_dtype = np.int64
@@ -220,6 +232,7 @@ def init_use_nesterov(self):
 
 
 class TestSparseMomentumOpMultiPrecision2(TestSparseMomentumOp):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.index_dtype = np.int32
@@ -232,6 +245,7 @@ def init_use_nesterov(self):
 
 
 class TestSparseMomentumOpMultiPrecision3(TestSparseMomentumOp):
+
     def init_dtype(self):
         self.dtype = np.float16
         self.index_dtype = np.int64
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py
index 3c3085ec8be69..8eccefed6ef64 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,12 +16,15 @@
 import unittest
 import numpy as np
 import paddle
+import paddle.fluid as fluid
 from paddle.fluid.framework import _test_eager_guard
 import copy
 
 
 class TestSparseBatchNorm(unittest.TestCase):
+
     def test(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             paddle.seed(0)
             channels = 4
@@ -38,7 +41,7 @@ def test(self):
             dense_x2 = copy.deepcopy(dense_x)
             dense_x2.stop_gradient = False
             sparse_x = dense_x2.to_sparse_coo(sparse_dim)
-            sparse_batch_norm = paddle.sparse.BatchNorm(channels)
+            sparse_batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels)
             # set same params
             sparse_batch_norm._mean.set_value(batch_norm._mean)
             sparse_batch_norm._variance.set_value(batch_norm._variance)
@@ -46,11 +49,10 @@ def test(self):
 
             sparse_y = sparse_batch_norm(sparse_x)
             # compare the result with dense batch_norm
-            assert np.allclose(
-                dense_y.flatten().numpy(),
-                sparse_y.values().flatten().numpy(),
-                atol=1e-5,
-                rtol=1e-5)
+            assert np.allclose(dense_y.flatten().numpy(),
+                               sparse_y.values().flatten().numpy(),
+                               atol=1e-5,
+                               rtol=1e-5)
 
             # test backward
             sparse_y.backward(sparse_y)
@@ -59,6 +61,7 @@ def test(self):
                 sparse_x.grad.values().flatten().numpy(),
                 atol=1e-5,
                 rtol=1e-5)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_error_layout(self):
         with _test_eager_guard():
@@ -66,7 +69,7 @@ def test_error_layout(self):
                 shape = [2, 3, 6, 6, 3]
                 x = paddle.randn(shape)
                 sparse_x = x.to_sparse_coo(4)
-                sparse_batch_norm = paddle.sparse.BatchNorm(
+                sparse_batch_norm = paddle.incubate.sparse.nn.BatchNorm(
                     3, data_format='NCDHW')
                 sparse_batch_norm(sparse_x)
 
@@ -77,7 +80,7 @@ def test2(self):
             x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32')
             dense_x = paddle.to_tensor(x_data)
             sparse_x = dense_x.to_sparse_coo(4)
-            batch_norm = paddle.sparse.BatchNorm(channels)
+            batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels)
             batch_norm_out = batch_norm(sparse_x)
             print(batch_norm_out.shape)
             # [1, 6, 6, 6, 3]
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py
index 8d65a4c4444d4..5f6d71008d785 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class TestMaxPool3DFunc(unittest.TestCase):
+
     def setInput(self):
         paddle.seed(0)
         self.dense_x = paddle.randn((1, 4, 4, 4, 4))
@@ -47,7 +48,7 @@ def test(self):
             self.setUp()
             self.dense_x.stop_gradient = False
             sparse_x = self.dense_x.to_sparse_coo(4)
-            sparse_out = paddle.sparse.functional.max_pool3d(
+            sparse_out = paddle.incubate.sparse.nn.functional.max_pool3d(
                 sparse_x,
                 self.kernel_sizes,
                 stride=self.strides,
@@ -56,12 +57,11 @@ def test(self):
             out.backward(out)
 
             dense_x = copy.deepcopy(self.dense_x)
-            dense_out = paddle.nn.functional.max_pool3d(
-                dense_x,
-                self.kernel_sizes,
-                stride=self.strides,
-                padding=self.paddings,
-                data_format='NDHWC')
+            dense_out = paddle.nn.functional.max_pool3d(dense_x,
+                                                        self.kernel_sizes,
+                                                        stride=self.strides,
+                                                        padding=self.paddings,
+                                                        data_format='NDHWC')
             dense_out.backward(dense_out)
 
             #compare with dense
@@ -70,11 +70,13 @@ def test(self):
 
 
 class TestStride(TestMaxPool3DFunc):
+
     def setStride(self):
         self.strides = 1
 
 
 class TestPadding(TestMaxPool3DFunc):
+
     def setPadding(self):
         self.paddings = 1
 
@@ -83,6 +85,7 @@ def setInput(self):
 
 
 class TestKernelSize(TestMaxPool3DFunc):
+
     def setKernelSize(self):
         self.kernel_sizes = [5, 5, 5]
 
@@ -92,6 +95,7 @@ def setInput(self):
 
 
 class TestInput(TestMaxPool3DFunc):
+
     def setInput(self):
         paddle.seed(0)
         self.dense_x = paddle.randn((2, 6, 7, 9, 3))
@@ -100,17 +104,19 @@ def setInput(self):
 
 
 class TestMaxPool3DAPI(unittest.TestCase):
+
     def test(self):
         with _test_eager_guard():
             dense_x = paddle.randn((2, 3, 6, 6, 3))
             sparse_x = dense_x.to_sparse_coo(4)
-            max_pool3d = paddle.sparse.MaxPool3D(
+            max_pool3d = paddle.incubate.sparse.nn.MaxPool3D(
                 kernel_size=3, data_format='NDHWC')
             out = max_pool3d(sparse_x)
             out = out.to_dense()
 
-            dense_out = paddle.nn.functional.max_pool3d(
-                dense_x, 3, data_format='NDHWC')
+            dense_out = paddle.nn.functional.max_pool3d(dense_x,
+                                                        3,
+                                                        data_format='NDHWC')
             assert np.allclose(dense_out.numpy(), out.numpy())
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py
index 573cc5ba8cf5d..3fd6665b26d8c 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py
@@ -17,11 +17,13 @@
 from typing import Union, Callable
 import numpy as np
 import paddle
+import paddle.fluid as fluid
 from paddle.fluid.framework import _test_eager_guard
 from paddle import _C_ops
 
 
 class TestSparseUnary(unittest.TestCase):
+
     def assert_raises_on_dense_tensor(self, sparse_func):
         with _test_eager_guard():
             dense_x = paddle.ones((2, 3))
@@ -29,12 +31,14 @@ def assert_raises_on_dense_tensor(self, sparse_func):
                 sparse_func(dense_x)
 
     def compare_with_dense(
-            self,
-            x,
-            to_sparse: Callable[[paddle.Tensor], paddle.Tensor],
-            dense_func: Callable[[paddle.Tensor], paddle.Tensor],
-            sparse_func: Callable[[paddle.Tensor], paddle.Tensor],
-            test_gradient: bool, ):
+        self,
+        x,
+        to_sparse: Callable[[paddle.Tensor], paddle.Tensor],
+        dense_func: Callable[[paddle.Tensor], paddle.Tensor],
+        sparse_func: Callable[[paddle.Tensor], paddle.Tensor],
+        test_gradient: bool,
+    ):
+
         def tensor_allclose(dense_tensor: paddle.Tensor,
                             sparse_tensor: paddle.Tensor):
             dense_numpy = dense_tensor.numpy()
@@ -42,15 +46,18 @@ def tensor_allclose(dense_tensor: paddle.Tensor,
             return np.allclose(dense_numpy[mask],
                                sparse_tensor.to_dense().numpy()[mask])
 
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
-            dense_x = paddle.to_tensor(
-                x, dtype="float32", stop_gradient=not test_gradient)
+            dense_x = paddle.to_tensor(x,
+                                       dtype="float32",
+                                       stop_gradient=not test_gradient)
 
             sparse_x = to_sparse(dense_x)
             sparse_out = sparse_func(sparse_x)
 
-            dense_x = paddle.to_tensor(
-                x, dtype="float32", stop_gradient=not test_gradient)
+            dense_x = paddle.to_tensor(x,
+                                       dtype="float32",
+                                       stop_gradient=not test_gradient)
             dense_out = dense_func(dense_x)
 
             assert tensor_allclose(dense_out, sparse_out)
@@ -59,6 +66,7 @@ def tensor_allclose(dense_tensor: paddle.Tensor,
                 dense_out.backward(dense_out)
                 sparse_out.backward(sparse_out)
                 assert tensor_allclose(dense_x.grad, sparse_x.grad)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_sparse_relu(self):
         x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]]
@@ -67,15 +75,17 @@ def test_sparse_relu(self):
             x,
             lambda x: x.to_sparse_coo(sparse_dim),
             paddle.nn.ReLU(),
-            paddle.sparse.ReLU(),
-            True, )
+            paddle.incubate.sparse.nn.ReLU(),
+            True,
+        )
         self.compare_with_dense(
             x,
             lambda x: x.to_sparse_csr(),
             paddle.nn.ReLU(),
-            paddle.sparse.ReLU(),
-            False, )
-        self.assert_raises_on_dense_tensor(paddle.sparse.ReLU())
+            paddle.incubate.sparse.nn.ReLU(),
+            False,
+        )
+        self.assert_raises_on_dense_tensor(paddle.incubate.sparse.nn.ReLU())
 
     def test_sparse_sqrt(self):
         x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]]
@@ -84,15 +94,17 @@ def test_sparse_sqrt(self):
             x,
             lambda x: x.to_sparse_coo(sparse_dim),
             paddle.sqrt,
-            paddle.sparse.sqrt,
-            True, )
+            paddle.incubate.sparse.sqrt,
+            True,
+        )
         self.compare_with_dense(
             x,
             lambda x: x.to_sparse_csr(),
             paddle.sqrt,
-            paddle.sparse.sqrt,
-            False, )
-        self.assert_raises_on_dense_tensor(paddle.sparse.sqrt)
+            paddle.incubate.sparse.sqrt,
+            False,
+        )
+        self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sqrt)
 
     def test_sparse_sin(self):
         x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]]
@@ -101,15 +113,17 @@ def test_sparse_sin(self):
             x,
             lambda x: x.to_sparse_coo(sparse_dim),
             paddle.sin,
-            paddle.sparse.sin,
-            True, )
+            paddle.incubate.sparse.sin,
+            True,
+        )
         self.compare_with_dense(
             x,
             lambda x: x.to_sparse_csr(),
             paddle.sin,
-            paddle.sparse.sin,
-            False, )
-        self.assert_raises_on_dense_tensor(paddle.sparse.sin)
+            paddle.incubate.sparse.sin,
+            False,
+        )
+        self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sin)
 
     def test_sparse_tanh(self):
         x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, -4, 2, 0]]
@@ -118,15 +132,17 @@ def test_sparse_tanh(self):
             x,
             lambda x: x.to_sparse_coo(sparse_dim),
             paddle.tanh,
-            paddle.sparse.tanh,
-            True, )
+            paddle.incubate.sparse.tanh,
+            True,
+        )
         self.compare_with_dense(
             x,
             lambda x: x.to_sparse_csr(),
             paddle.tanh,
-            paddle.sparse.tanh,
-            False, )
-        self.assert_raises_on_dense_tensor(paddle.sparse.tanh)
+            paddle.incubate.sparse.tanh,
+            False,
+        )
+        self.assert_raises_on_dense_tensor(paddle.incubate.sparse.tanh)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
index 80820c0f2d837..5705763e0af5f 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,6 +16,7 @@
 import unittest
 import numpy as np
 import paddle
+import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.framework import _test_eager_guard
 
@@ -23,6 +24,7 @@
 
 
 class TestSparseCreate(unittest.TestCase):
+
     def test_create_coo_by_tensor(self):
         with _test_eager_guard():
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
@@ -30,8 +32,10 @@ def test_create_coo_by_tensor(self):
             dense_shape = [3, 4]
             dense_indices = paddle.to_tensor(indices)
             dense_elements = paddle.to_tensor(values, dtype='float32')
-            coo = paddle.sparse.sparse_coo_tensor(
-                dense_indices, dense_elements, dense_shape, stop_gradient=False)
+            coo = paddle.incubate.sparse.sparse_coo_tensor(dense_indices,
+                                                           dense_elements,
+                                                           dense_shape,
+                                                           stop_gradient=False)
             # test the to_string.py
             print(coo)
             assert np.array_equal(indices, coo.indices().numpy())
@@ -42,7 +46,8 @@ def test_create_coo_by_np(self):
             indices = [[0, 1, 2], [1, 2, 0]]
             values = [1.0, 2.0, 3.0]
             dense_shape = [3, 3]
-            coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape)
+            coo = paddle.incubate.sparse.sparse_coo_tensor(
+                indices, values, dense_shape)
             assert np.array_equal(indices, coo.indices().numpy())
             assert np.array_equal(values, coo.values().numpy())
 
@@ -56,7 +61,7 @@ def test_create_csr_by_tensor(self):
             dense_cols = paddle.to_tensor(cols)
             dense_elements = paddle.to_tensor(values, dtype='float32')
             stop_gradient = False
-            csr = paddle.sparse.sparse_csr_tensor(
+            csr = paddle.incubate.sparse.sparse_csr_tensor(
                 dense_crows,
                 dense_cols,
                 dense_elements,
@@ -69,8 +74,8 @@ def test_create_csr_by_np(self):
             cols = [1, 3, 2, 0, 1]
             values = [1, 2, 3, 4, 5]
             dense_shape = [3, 4]
-            csr = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                  dense_shape)
+            csr = paddle.incubate.sparse.sparse_csr_tensor(
+                crows, cols, values, dense_shape)
             # test the to_string.py
             print(csr)
             assert np.array_equal(crows, csr.crows().numpy())
@@ -83,8 +88,10 @@ def test_place(self):
             indices = [[0, 1], [0, 1]]
             values = [1.0, 2.0]
             dense_shape = [2, 2]
-            coo = paddle.sparse.sparse_coo_tensor(
-                indices, values, dense_shape, place=place)
+            coo = paddle.incubate.sparse.sparse_coo_tensor(indices,
+                                                           values,
+                                                           dense_shape,
+                                                           place=place)
             assert coo.place.is_cpu_place()
             assert coo.values().place.is_cpu_place()
             assert coo.indices().place.is_cpu_place()
@@ -92,8 +99,10 @@ def test_place(self):
             crows = [0, 2, 3, 5]
             cols = [1, 3, 2, 0, 1]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            csr = paddle.sparse.sparse_csr_tensor(
-                crows, cols, values, [3, 5], place=place)
+            csr = paddle.incubate.sparse.sparse_csr_tensor(crows,
+                                                           cols,
+                                                           values, [3, 5],
+                                                           place=place)
             assert csr.place.is_cpu_place()
             assert csr.crows().place.is_cpu_place()
             assert csr.cols().place.is_cpu_place()
@@ -106,15 +115,19 @@ def test_dtype(self):
             dense_shape = [2, 2]
             indices = paddle.to_tensor(indices, dtype='int32')
             values = paddle.to_tensor(values, dtype='float32')
-            coo = paddle.sparse.sparse_coo_tensor(
-                indices, values, dense_shape, dtype='float64')
+            coo = paddle.incubate.sparse.sparse_coo_tensor(indices,
+                                                           values,
+                                                           dense_shape,
+                                                           dtype='float64')
             assert coo.dtype == paddle.float64
 
             crows = [0, 2, 3, 5]
             cols = [1, 3, 2, 0, 1]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            csr = paddle.sparse.sparse_csr_tensor(
-                crows, cols, values, [3, 5], dtype='float16')
+            csr = paddle.incubate.sparse.sparse_csr_tensor(crows,
+                                                           cols,
+                                                           values, [3, 5],
+                                                           dtype='float16')
             assert csr.dtype == paddle.float16
 
     def test_create_coo_no_shape(self):
@@ -123,11 +136,12 @@ def test_create_coo_no_shape(self):
             values = [1.0, 2.0]
             indices = paddle.to_tensor(indices, dtype='int32')
             values = paddle.to_tensor(values, dtype='float32')
-            coo = paddle.sparse.sparse_coo_tensor(indices, values)
+            coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values)
             assert [2, 2] == coo.shape
 
 
 class TestSparseConvert(unittest.TestCase):
+
     def test_to_sparse_coo(self):
         with _test_eager_guard():
             x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]]
@@ -140,7 +154,7 @@ def test_to_sparse_coo(self):
             #test to_sparse_coo_grad backward
             out_grad_indices = [[0, 1], [0, 1]]
             out_grad_values = [2.0, 3.0]
-            out_grad = paddle.sparse.sparse_coo_tensor(
+            out_grad = paddle.incubate.sparse.sparse_coo_tensor(
                 paddle.to_tensor(out_grad_indices),
                 paddle.to_tensor(out_grad_values),
                 shape=out.shape,
@@ -150,10 +164,11 @@ def test_to_sparse_coo(self):
                                   out_grad.to_dense().numpy())
 
     def test_coo_to_dense(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            sparse_x = paddle.sparse.sparse_coo_tensor(
+            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                 paddle.to_tensor(indices),
                 paddle.to_tensor(values),
                 shape=[3, 4],
@@ -163,13 +178,13 @@ def test_coo_to_dense(self):
             out_grad = [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
                         [9.0, 10.0, 11.0, 12.0]]
             dense_tensor.backward(paddle.to_tensor(out_grad))
-            #mask the out_grad by sparse_x.indices() 
+            #mask the out_grad by sparse_x.indices()
             correct_x_grad = [2.0, 4.0, 7.0, 9.0, 10.0]
             assert np.array_equal(correct_x_grad,
                                   sparse_x.grad.values().numpy())
 
             paddle.device.set_device("cpu")
-            sparse_x_cpu = paddle.sparse.sparse_coo_tensor(
+            sparse_x_cpu = paddle.incubate.sparse.sparse_coo_tensor(
                 paddle.to_tensor(indices),
                 paddle.to_tensor(values),
                 shape=[3, 4],
@@ -178,6 +193,7 @@ def test_coo_to_dense(self):
             dense_tensor_cpu.backward(paddle.to_tensor(out_grad))
             assert np.array_equal(correct_x_grad,
                                   sparse_x_cpu.grad.values().numpy())
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_to_sparse_csr(self):
         with _test_eager_guard():
@@ -195,10 +211,11 @@ def test_to_sparse_csr(self):
             assert np.array_equal(dense_tensor.numpy(), x)
 
     def test_coo_values_grad(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            sparse_x = paddle.sparse.sparse_coo_tensor(
+            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                 paddle.to_tensor(indices),
                 paddle.to_tensor(values),
                 shape=[3, 4],
@@ -211,7 +228,7 @@ def test_coo_values_grad(self):
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
             values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0],
                       [5.0, 5.0]]
-            sparse_x = paddle.sparse.sparse_coo_tensor(
+            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                 paddle.to_tensor(indices),
                 paddle.to_tensor(values),
                 shape=[3, 4, 2],
@@ -222,25 +239,27 @@ def test_coo_values_grad(self):
             # test coo_values_grad
             values_tensor.backward(paddle.to_tensor(out_grad))
             assert np.array_equal(out_grad, sparse_x.grad.values().numpy())
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_sparse_coo_tensor_grad(self):
         with _test_eager_guard():
             for device in devices:
-                if device == 'cpu' or (device == 'gpu' and
-                                       paddle.is_compiled_with_cuda()):
+                if device == 'cpu' or (device == 'gpu'
+                                       and paddle.is_compiled_with_cuda()):
                     paddle.device.set_device(device)
                     indices = [[0, 1], [0, 1]]
                     values = [1, 2]
                     indices = paddle.to_tensor(indices, dtype='int32')
-                    values = paddle.to_tensor(
-                        values, dtype='float32', stop_gradient=False)
-                    sparse_x = paddle.sparse.sparse_coo_tensor(
+                    values = paddle.to_tensor(values,
+                                              dtype='float32',
+                                              stop_gradient=False)
+                    sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                         indices, values, shape=[2, 2], stop_gradient=False)
                     grad_indices = [[0, 1], [1, 1]]
                     grad_values = [2, 3]
                     grad_indices = paddle.to_tensor(grad_indices, dtype='int32')
                     grad_values = paddle.to_tensor(grad_values, dtype='float32')
-                    sparse_out_grad = paddle.sparse.sparse_coo_tensor(
+                    sparse_out_grad = paddle.incubate.sparse.sparse_coo_tensor(
                         grad_indices, grad_values, shape=[2, 2])
                     sparse_x.backward(sparse_out_grad)
                     correct_values_grad = [0, 3]
@@ -249,13 +268,14 @@ def test_sparse_coo_tensor_grad(self):
 
                     # test the non-zero values is a vector
                     values = [[1, 1], [2, 2]]
-                    values = paddle.to_tensor(
-                        values, dtype='float32', stop_gradient=False)
-                    sparse_x = paddle.sparse.sparse_coo_tensor(
+                    values = paddle.to_tensor(values,
+                                              dtype='float32',
+                                              stop_gradient=False)
+                    sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                         indices, values, shape=[2, 2, 2], stop_gradient=False)
                     grad_values = [[2, 2], [3, 3]]
                     grad_values = paddle.to_tensor(grad_values, dtype='float32')
-                    sparse_out_grad = paddle.sparse.sparse_coo_tensor(
+                    sparse_out_grad = paddle.incubate.sparse.sparse_coo_tensor(
                         grad_indices, grad_values, shape=[2, 2, 2])
                     sparse_x.backward(sparse_out_grad)
                     correct_values_grad = [[0, 0], [3, 3]]
@@ -265,15 +285,16 @@ def test_sparse_coo_tensor_grad(self):
     def test_sparse_coo_tensor_sorted(self):
         with _test_eager_guard():
             for device in devices:
-                if device == 'cpu' or (device == 'gpu' and
-                                       paddle.is_compiled_with_cuda()):
+                if device == 'cpu' or (device == 'gpu'
+                                       and paddle.is_compiled_with_cuda()):
                     paddle.device.set_device(device)
-                    #test unsorted and duplicate indices 
+                    #test unsorted and duplicate indices
                     indices = [[1, 0, 0], [0, 1, 1]]
                     values = [1.0, 2.0, 3.0]
                     indices = paddle.to_tensor(indices, dtype='int32')
                     values = paddle.to_tensor(values, dtype='float32')
-                    sparse_x = paddle.sparse.sparse_coo_tensor(indices, values)
+                    sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
+                        indices, values)
                     indices_sorted = [[0, 1], [1, 0]]
                     values_sorted = [5.0, 1.0]
                     assert np.array_equal(indices_sorted,
@@ -284,7 +305,8 @@ def test_sparse_coo_tensor_sorted(self):
                     # test the non-zero values is a vector
                     values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]
                     values = paddle.to_tensor(values, dtype='float32')
-                    sparse_x = paddle.sparse.sparse_coo_tensor(indices, values)
+                    sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
+                        indices, values)
                     values_sorted = [[5.0, 5.0], [1.0, 1.0]]
                     assert np.array_equal(indices_sorted,
                                           sparse_x.indices().numpy())
@@ -293,6 +315,7 @@ def test_sparse_coo_tensor_sorted(self):
 
 
 class TestCooError(unittest.TestCase):
+
     def test_small_shape(self):
         with _test_eager_guard():
             with self.assertRaises(ValueError):
@@ -300,7 +323,7 @@ def test_small_shape(self):
                 values = [1, 2]
                 # 1. the shape too small
                 dense_shape = [2, 2]
-                sparse_x = paddle.sparse.sparse_coo_tensor(
+                sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                     indices, values, shape=dense_shape)
 
     def test_same_nnz(self):
@@ -309,7 +332,8 @@ def test_same_nnz(self):
                 # 2. test the nnz of indices must same as nnz of values
                 indices = [[1, 2], [1, 0]]
                 values = [1, 2, 3]
-                sparse_x = paddle.sparse.sparse_coo_tensor(indices, values)
+                sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
+                    indices, values)
 
     def test_same_dimensions(self):
         with _test_eager_guard():
@@ -317,18 +341,21 @@ def test_same_dimensions(self):
                 indices = [[1, 2], [1, 0]]
                 values = [1, 2, 3]
                 shape = [2, 3, 4]
-                sparse_x = paddle.sparse.sparse_coo_tensor(
-                    indices, values, shape=shape)
+                sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices,
+                                                                    values,
+                                                                    shape=shape)
 
     def test_indices_dtype(self):
         with _test_eager_guard():
             with self.assertRaises(TypeError):
                 indices = [[1.0, 2.0], [0, 1]]
                 values = [1, 2]
-                sparse_x = paddle.sparse.sparse_coo_tensor(indices, values)
+                sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
+                    indices, values)
 
 
 class TestCsrError(unittest.TestCase):
+
     def test_dimension1(self):
         with _test_eager_guard():
             with self.assertRaises(ValueError):
@@ -336,8 +363,8 @@ def test_dimension1(self):
                 cols = [0, 1, 2]
                 values = [1, 2, 3]
                 shape = [3]
-                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                           shape)
+                sparse_x = paddle.incubate.sparse.sparse_csr_tensor(
+                    crows, cols, values, shape)
 
     def test_dimension2(self):
         with _test_eager_guard():
@@ -346,8 +373,8 @@ def test_dimension2(self):
                 cols = [0, 1, 2]
                 values = [1, 2, 3]
                 shape = [3, 3, 3, 3]
-                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                           shape)
+                sparse_x = paddle.incubate.sparse.sparse_csr_tensor(
+                    crows, cols, values, shape)
 
     def test_same_shape1(self):
         with _test_eager_guard():
@@ -356,8 +383,8 @@ def test_same_shape1(self):
                 cols = [0, 1, 2, 3]
                 values = [1, 2, 3]
                 shape = [3, 4]
-                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                           shape)
+                sparse_x = paddle.incubate.sparse.sparse_csr_tensor(
+                    crows, cols, values, shape)
 
     def test_same_shape2(self):
         with _test_eager_guard():
@@ -366,8 +393,8 @@ def test_same_shape2(self):
                 cols = [0, 1, 2, 3]
                 values = [1, 2, 3, 4]
                 shape = [3, 4]
-                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                           shape)
+                sparse_x = paddle.incubate.sparse.sparse_csr_tensor(
+                    crows, cols, values, shape)
 
     def test_same_shape3(self):
         with _test_eager_guard():
@@ -376,8 +403,8 @@ def test_same_shape3(self):
                 cols = [0, 1, 2, 3, 0, 1, 2]
                 values = [1, 2, 3, 4, 0, 1, 2]
                 shape = [2, 3, 4]
-                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                           shape)
+                sparse_x = paddle.incubate.sparse.sparse_csr_tensor(
+                    crows, cols, values, shape)
 
     def test_crows_first_value(self):
         with _test_eager_guard():
@@ -386,8 +413,8 @@ def test_crows_first_value(self):
                 cols = [0, 1, 2]
                 values = [1, 2, 3]
                 shape = [3, 4]
-                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                           shape)
+                sparse_x = paddle.incubate.sparse.sparse_csr_tensor(
+                    crows, cols, values, shape)
 
     def test_dtype(self):
         with _test_eager_guard():
@@ -396,8 +423,8 @@ def test_dtype(self):
                 cols = [0, 1, 2]
                 values = [1, 2, 3]
                 shape = [3]
-                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
-                                                           shape)
+                sparse_x = paddle.incubate.sparse.sparse_csr_tensor(
+                    crows, cols, values, shape)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py
index dccc117f6bc15..10fcf961f4bf5 100644
--- a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py
+++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py
@@ -27,13 +27,14 @@
 import multiprocessing
 
 # NOTE(chenweihang): Coverage CI is currently not able to count python3
-# unittest, so the unittests here covers some cases that will only be 
-# executed in the python3 sub-process. 
+# unittest, so the unittests here covers some cases that will only be
+# executed in the python3 sub-process.
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestInitParallelEnv(unittest.TestCase):
+
     def test_check_env_failed(self):
         os.environ['FLAGS_selected_gpus'] = '0'
         os.environ['PADDLE_TRAINER_ID'] = '0'
@@ -56,6 +57,7 @@ def test_init_parallel_env_break(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSpawnAssistMethod(unittest.TestCase):
+
     def test_nprocs_greater_than_device_num_error(self):
         with self.assertRaises(RuntimeError):
             _get_subprocess_env_list(nprocs=100, options=dict())
diff --git a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py
index 7dd0c7625983e..a448884df00ce 100644
--- a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py
@@ -52,6 +52,7 @@ def spectral_norm(weight, u, v, dim, power_iters, eps):
     "because grad is not calculated in power iterations, "
     "which cannot be checked by python grad unittests")
 class TestSpectralNormOpNoGrad(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = 'spectral_norm'
@@ -92,6 +93,7 @@ def initTestCase(self):
     "because grad is not calculated in power iterations, "
     "which cannot be checked by python grad unittests")
 class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad):
+
     def initTestCase(self):
         self.weight_shape = (2, 3, 3, 3)
         self.u_shape = (3, )
@@ -102,11 +104,13 @@ def initTestCase(self):
 
 
 class TestSpectralNormOp(TestSpectralNormOpNoGrad):
+
     def test_check_grad_ignore_uv(self):
         self.check_grad(
             ['Weight'],
             'Out',
-            no_grad_set=set(["U", "V"]), )
+            no_grad_set=set(["U", "V"]),
+        )
 
     def initTestCase(self):
         self.weight_shape = (10, 12)
@@ -118,6 +122,7 @@ def initTestCase(self):
 
 
 class TestSpectralNormOp2(TestSpectralNormOp):
+
     def initTestCase(self):
         self.weight_shape = (2, 6, 3, 3)
         self.u_shape = (6, )
@@ -128,6 +133,7 @@ def initTestCase(self):
 
 
 class TestSpectralNormOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -147,11 +153,13 @@ def test_weight_dtype():
 
 
 class TestDygraphSpectralNormOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             shape = (2, 4, 3, 3)
-            spectralNorm = fluid.dygraph.nn.SpectralNorm(
-                shape, dim=1, power_iters=2)
+            spectralNorm = fluid.dygraph.nn.SpectralNorm(shape,
+                                                         dim=1,
+                                                         power_iters=2)
 
             def test_Variable():
                 weight_1 = np.random.random((2, 4)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index fb401347308f2..8f2380845875a 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -28,6 +28,7 @@
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
+
     def place(self):
         return core.CPUPlace()
 
@@ -52,12 +53,11 @@ def test_split_and_merge_lod_tensor_no_lod(self):
         expect_false = core.LoDTensor()
         expect_false.set(expect_false_tensor, self.place())
 
-        self.main(
-            tensor=tensor,
-            mask=mask,
-            expect_true=expect_true,
-            expect_false=expect_false,
-            expect_out=tensor)
+        self.main(tensor=tensor,
+                  mask=mask,
+                  expect_true=expect_true,
+                  expect_false=expect_false,
+                  expect_out=tensor)
 
     def split_and_merge_lod_tensor_level_0(self, use_merge_lod_infer=False):
         tensor = core.LoDTensor()
@@ -84,13 +84,12 @@ def split_and_merge_lod_tensor_level_0(self, use_merge_lod_infer=False):
         expect_false.set(expect_false_tensor, self.place())
         expect_false.set_recursive_sequence_lengths(expect_false_lod)
 
-        self.main(
-            tensor=tensor,
-            mask=mask,
-            expect_true=expect_true,
-            expect_false=expect_false,
-            expect_out=tensor,
-            use_merge_lod_infer=use_merge_lod_infer)
+        self.main(tensor=tensor,
+                  mask=mask,
+                  expect_true=expect_true,
+                  expect_false=expect_false,
+                  expect_out=tensor,
+                  use_merge_lod_infer=use_merge_lod_infer)
 
     def test_split_and_merge_lod_tensor_1(self):
         self.split_and_merge_lod_tensor_level_0()
@@ -129,31 +128,31 @@ def main(self,
                 helper = LayerHelper('merge_lod_tensor_infer')
                 out = helper.create_variable_for_type_inference(
                     dtype=out_true.dtype)
-                helper.append_op(
-                    type='merge_lod_tensor_infer',
-                    inputs={
-                        'X': x,
-                        'Mask': y,
-                        'InTrue': out_true,
-                        'InFalse': out_false
-                    },
-                    outputs={'Out': out},
-                    attrs={'level': level})
+                helper.append_op(type='merge_lod_tensor_infer',
+                                 inputs={
+                                     'X': x,
+                                     'Mask': y,
+                                     'InTrue': out_true,
+                                     'InFalse': out_false
+                                 },
+                                 outputs={'Out': out},
+                                 attrs={'level': level})
                 out.persistable = True
             else:
-                out = merge_lod_tensor(
-                    in_true=out_true,
-                    in_false=out_false,
-                    mask=y,
-                    x=x,
-                    level=level)
+                out = merge_lod_tensor(in_true=out_true,
+                                       in_false=out_false,
+                                       mask=y,
+                                       x=x,
+                                       level=level)
                 out.persistable = True
 
         exe = Executor(place)
         scope = core.Scope()
         exe.run(program,
-                feed={'x': tensor,
-                      'y': mask},
+                feed={
+                    'x': tensor,
+                    'y': mask
+                },
                 scope=scope,
                 return_numpy=False)
 
@@ -174,20 +173,28 @@ def check_tensor_same(self, actual, expect):
 
 
 class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
+
     def test_grad(self):
         place = core.CPUPlace()
         program = Program()
         with program_guard(program):
-            x = layers.data(
-                name='x', shape=[1], dtype='float32', stop_gradient=False)
-            y = layers.data(
-                name='y', shape=[1], dtype='bool', stop_gradient=False)
+            x = layers.data(name='x',
+                            shape=[1],
+                            dtype='float32',
+                            stop_gradient=False)
+            y = layers.data(name='y',
+                            shape=[1],
+                            dtype='bool',
+                            stop_gradient=False)
 
             level = 0
 
             out_true, out_false = split_lod_tensor(input=x, mask=y, level=level)
-            out = merge_lod_tensor(
-                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+            out = merge_lod_tensor(in_true=out_true,
+                                   in_false=out_false,
+                                   mask=y,
+                                   x=x,
+                                   level=level)
             mean = layers.mean(out)
 
             append_backward(mean)
@@ -207,14 +214,16 @@ def test_grad(self):
 
         g_vars = program.global_block().var(x.name + "@GRAD")
         g_out = [
-            item.sum()
-            for item in map(np.array,
-                            exe.run(program,
-                                    feed={'x': tensor,
-                                          'y': mask},
-                                    fetch_list=[g_vars],
-                                    scope=scope,
-                                    return_numpy=False))
+            item.sum() for item in map(
+                np.array,
+                exe.run(program,
+                        feed={
+                            'x': tensor,
+                            'y': mask
+                        },
+                        fetch_list=[g_vars],
+                        scope=scope,
+                        return_numpy=False))
         ]
 
         g_out_sum = np.array(g_out).sum()
@@ -223,68 +232,78 @@ def test_grad(self):
 
 
 class TestMergeLodTensorOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            input_data = layers.data(
-                name='x', shape=[1], dtype='float32', stop_gradient=False)
-            y = layers.data(
-                name='y', shape=[1], dtype='bool', stop_gradient=False)
-            x_true = layers.data(
-                name='x_true', shape=[1], dtype='float32', stop_gradient=False)
-            x_false = layers.data(
-                name='x_false', shape=[1], dtype='float32', stop_gradient=False)
+            input_data = layers.data(name='x',
+                                     shape=[1],
+                                     dtype='float32',
+                                     stop_gradient=False)
+            y = layers.data(name='y',
+                            shape=[1],
+                            dtype='bool',
+                            stop_gradient=False)
+            x_true = layers.data(name='x_true',
+                                 shape=[1],
+                                 dtype='float32',
+                                 stop_gradient=False)
+            x_false = layers.data(name='x_false',
+                                  shape=[1],
+                                  dtype='float32',
+                                  stop_gradient=False)
             level = 0
 
             def test_x():
-                out = merge_lod_tensor(
-                    int_true=x_true,
-                    in_false=x_false,
-                    x=set(),
-                    mask=y,
-                    level=level)
+                out = merge_lod_tensor(int_true=x_true,
+                                       in_false=x_false,
+                                       x=set(),
+                                       mask=y,
+                                       level=level)
 
             self.assertRaises(TypeError, test_x)
 
             def test_mask():
-                out = merge_lod_tensor(
-                    int_true=x_true,
-                    in_false=x_false,
-                    x=input_data,
-                    mask=set(),
-                    level=level)
+                out = merge_lod_tensor(int_true=x_true,
+                                       in_false=x_false,
+                                       x=input_data,
+                                       mask=set(),
+                                       level=level)
 
             self.assertRaises(TypeError, test_mask)
 
             def test_xtrue():
-                out = merge_lod_tensor(
-                    int_true=set(),
-                    in_false=x_false,
-                    x=input_data,
-                    mask=y,
-                    level=level)
+                out = merge_lod_tensor(int_true=set(),
+                                       in_false=x_false,
+                                       x=input_data,
+                                       mask=y,
+                                       level=level)
 
             self.assertRaises(TypeError, test_xtrue)
 
             def test_xfalse():
-                out = merge_lod_tensor(
-                    int_true=x_true,
-                    in_false=set(),
-                    x=input_data,
-                    mask=y,
-                    level=level)
+                out = merge_lod_tensor(int_true=x_true,
+                                       in_false=set(),
+                                       x=input_data,
+                                       mask=y,
+                                       level=level)
 
             self.assertRaises(TypeError, test_xfalse)
 
 
 class TestSplitLodTensorWithError(unittest.TestCase):
+
     def test_error(self):
         main_program = Program()
         startup_program = Program()
         with program_guard(main_program, startup_program):
-            x = layers.data(
-                name='x', shape=[1], dtype='float32', stop_gradient=False)
-            y = layers.data(
-                name='y', shape=[1], dtype='bool', stop_gradient=False)
+            x = layers.data(name='x',
+                            shape=[1],
+                            dtype='float32',
+                            stop_gradient=False)
+            y = layers.data(name='y',
+                            shape=[1],
+                            dtype='bool',
+                            stop_gradient=False)
             level = 0
 
             with self.assertRaises(TypeError):
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index bf3be4080a9fc..e3f72d7b41ca2 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -23,6 +23,7 @@
 
 
 class TestSplitOp(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.dtype = self.get_dtype()
@@ -56,6 +57,7 @@ def test_check_grad(self):
 
 # test with attr(num)
 class TestSplitOp_2(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.dtype = self.get_dtype()
@@ -93,6 +95,7 @@ def test_check_grad(self):
 
 # attr(axis) is Tensor
 class TestSplitOp_AxisTensor(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.dtype = self.get_dtype()
@@ -129,6 +132,7 @@ def test_check_grad(self):
 
 # attr(sections) is list containing Tensor
 class TestSplitOp_SectionsTensor(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.dtype = self.get_dtype()
@@ -174,6 +178,7 @@ def test_check_grad(self):
 
 
 class TestSplitOp_unk_section(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.dtype = self.get_dtype()
@@ -210,6 +215,7 @@ def test_check_grad(self):
 
 
 class TestSplitByrefOp(OpTest):
+
     def _set_op_type(self):
         self.op_type = "split_byref"
 
@@ -218,9 +224,11 @@ def _set_op_type(self):
 
 
 def create_test_fp16(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestSplitFp16(parent):
+
         def get_dtype(self):
             return np.float16
 
@@ -238,9 +246,11 @@ def test_check_grad(self):
 
 
 def create_test_bf16(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestSplitBf16(parent):
+
         def get_dtype(self):
             return np.uint16
 
@@ -260,6 +270,7 @@ def test_check_grad(self):
 
 
 class TestSplitAPI(unittest.TestCase):
+
     def test_api(self):
         input_1 = np.random.random([4, 5, 6]).astype("int32")
         positive_1_int32 = fluid.layers.fill_constant([1], "int32", 1)
@@ -273,16 +284,19 @@ def test_api(self):
             num_or_sections=[positive_2_int64, positive_1_int32, -1],
             dim=positive_1_int64)
 
-        out_3, out_4, out_5 = fluid.layers.split(
-            input=x_1, num_or_sections=[2, 1, 2], dim=positive_1_int32)
+        out_3, out_4, out_5 = fluid.layers.split(input=x_1,
+                                                 num_or_sections=[2, 1, 2],
+                                                 dim=positive_1_int32)
         fluid.layers.split(input=x_2, num_or_sections=2, dim=2)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
-        [res_0, res_1, res_2, res_3, res_4, res_5] = exe.run(
-            fluid.default_main_program(),
-            feed={"x_1": input_1,
-                  "x_2": input_1},
-            fetch_list=[out_0, out_1, out_2, out_3, out_4, out_5])
+        [res_0, res_1, res_2, res_3, res_4,
+         res_5] = exe.run(fluid.default_main_program(),
+                          feed={
+                              "x_1": input_1,
+                              "x_2": input_1
+                          },
+                          fetch_list=[out_0, out_1, out_2, out_3, out_4, out_5])
 
         out = np.split(input_1, [2, 3], 1)
         assert np.array_equal(res_0, out[0])
@@ -294,6 +308,7 @@ def test_api(self):
 
 
 class TestSplitOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The type of axis in split_op should be int or Variable.
@@ -332,6 +347,7 @@ def test_axis_type_tensor():
 
 
 class API_TestSplit(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[4, 6, 6], dtype='float64')
@@ -341,8 +357,10 @@ def test_out(self):
             exe = fluid.Executor(place)
             input1 = np.random.random([4, 6, 6]).astype('float64')
             input2 = np.array([2]).astype('int32')
-            r0, r1, r2, = exe.run(feed={"data1": input1,
-                                        "data2": input2},
+            r0, r1, r2, = exe.run(feed={
+                "data1": input1,
+                "data2": input2
+            },
                                   fetch_list=[x0, x1, x2])
             ex_x0, ex_x1, ex_x2 = np.split(input1, 3, axis=2)
             self.assertTrue(np.allclose(ex_x0, r0))
@@ -351,6 +369,7 @@ def test_out(self):
 
 
 class API_TestSplit2(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[4, 6, 6], dtype='float64')
@@ -367,6 +386,7 @@ def test_out(self):
 
 
 class API_TestSplit3(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.layers.data('data', shape=[-1, 10], dtype='float64')
@@ -381,6 +401,7 @@ def test_out(self):
 
 
 class API_TestSplit4(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data = fluid.layers.data('data', shape=[-1, 10], dtype='float64')
@@ -390,8 +411,10 @@ def test_out(self):
             exe = fluid.Executor(place)
             input1 = np.random.random([1, 10]).astype('float64')
             input2 = np.array([7]).astype('int32')
-            r0, r1 = exe.run(feed={"data": input1,
-                                   "index": input2},
+            r0, r1 = exe.run(feed={
+                "data": input1,
+                "index": input2
+            },
                              fetch_list=[x0, x1])
             ex_x0, ex_x1 = np.split(input1, (3, ), axis=1)
             self.assertTrue(np.allclose(ex_x0, r0))
@@ -399,6 +422,7 @@ def test_out(self):
 
 
 class API_TestDygraphSplit(unittest.TestCase):
+
     def test_out1(self):
         with fluid.dygraph.guard():
             input_1 = np.random.random([4, 6, 6]).astype("int32")
@@ -451,8 +475,9 @@ def test_out_tensor_input(self):
             # input is a variable which shape is [4, 6, 6]
             input = paddle.to_tensor(input_1)
             num1 = paddle.full(shape=[1], fill_value=2, dtype='int32')
-            x0, x1, x2 = paddle.split(
-                input, num_or_sections=[num1, 2, 2], axis=1)
+            x0, x1, x2 = paddle.split(input,
+                                      num_or_sections=[num1, 2, 2],
+                                      axis=1)
             x0_out = x0.numpy()
             x1_out = x1.numpy()
             x2_out = x2.numpy()
@@ -467,8 +492,9 @@ def test_axis_tensor_input(self):
             # input is a variable which shape is [4, 6, 6]
             input = paddle.to_tensor(input_1)
             num1 = paddle.full(shape=[1], fill_value=1, dtype='int32')
-            x0, x1, x2 = paddle.split(
-                input, num_or_sections=[2, 2, 2], axis=num1)
+            x0, x1, x2 = paddle.split(input,
+                                      num_or_sections=[2, 2, 2],
+                                      axis=num1)
             x0_out = x0.numpy()
             x1_out = x1.numpy()
             x2_out = x2.numpy()
@@ -479,6 +505,7 @@ def test_axis_tensor_input(self):
 
 
 class API_TestEmptySplit(unittest.TestCase):
+
     def test_axis_input_empty_section(self):
         with fluid.dygraph.guard():
             input_1 = np.random.random([8, 6, 6]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_split_program.py b/python/paddle/fluid/tests/unittests/test_split_program.py
index 3245e8d997ad0..ff8348eb71913 100644
--- a/python/paddle/fluid/tests/unittests/test_split_program.py
+++ b/python/paddle/fluid/tests/unittests/test_split_program.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestSplitProgram(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
         if paddle.is_compiled_with_cuda():
@@ -31,10 +32,12 @@ def get_model(self, batch_size):
         main = paddle.static.Program()
         startup = paddle.static.Program()
         with paddle.static.program_guard(main, startup):
-            image = paddle.static.data(
-                shape=[batch_size, 3, 224, 224], dtype='float32', name='image')
-            label = paddle.static.data(
-                shape=[batch_size, 1], dtype='int64', name='label')
+            image = paddle.static.data(shape=[batch_size, 3, 224, 224],
+                                       dtype='float32',
+                                       name='image')
+            label = paddle.static.data(shape=[batch_size, 1],
+                                       dtype='int64',
+                                       name='label')
 
             model = resnet(pretrained=False)
             loss_fn = nn.loss.CrossEntropyLoss()
@@ -63,9 +66,8 @@ def test_split_program(self):
             self.assertEqual(len(vars_actual), len(vars_expected))
             for actual, expected in zip(vars_actual, vars_expected):
                 self.assertEqual(actual.shape, expected.shape)
-                self.assertTrue(
-                    np.array_equal(actual, expected),
-                    '{}\n{}\n'.format(actual, expected))
+                self.assertTrue(np.array_equal(actual, expected),
+                                '{}\n{}\n'.format(actual, expected))
 
     def get_places(self):
         places = [paddle.CPUPlace()]
@@ -90,8 +92,10 @@ def check_split_program(self, place, use_split=True, seed=100, batch_num=5):
         exe = paddle.static.Executor(place)
 
         image_np = np.random.random(size=image.shape).astype('float32')
-        label_np = np.random.randint(
-            low=0, high=1000, dtype='int64', size=label.shape)
+        label_np = np.random.randint(low=0,
+                                     high=1000,
+                                     dtype='int64',
+                                     size=label.shape)
 
         scope = paddle.static.Scope()
         if not use_split:
@@ -99,14 +103,16 @@ def check_split_program(self, place, use_split=True, seed=100, batch_num=5):
                 exe.run(startup_prog)
                 for _ in range(batch_num):
                     exe.run(main_prog,
-                            feed={image.name: image_np,
-                                  label.name: label_np})
+                            feed={
+                                image.name: image_np,
+                                label.name: label_np
+                            })
             return self.get_var_values(scope, startup_vars)
 
         op_num = len(main_prog.global_block().ops)
         split_op_indices = [int(op_num / 3.0), int(op_num * 3 / 4.0)]
-        programs, input_vars, output_vars = split_program(main_prog,
-                                                          split_op_indices)
+        programs, input_vars, output_vars = split_program(
+            main_prog, split_op_indices)
         op_nums = [0] + split_op_indices + [op_num]
         op_nums = [op_nums[i + 1] - op_nums[i] for i in range(len(op_nums) - 1)]
         num_split = len(split_op_indices) + 1
@@ -137,8 +143,8 @@ def check_split_program(self, place, use_split=True, seed=100, batch_num=5):
                     for out_name, out_value in zip(output_vars[i],
                                                    output_var_values):
                         if not out_value._is_initialized():
-                            tmp_vars[out_name] = np.ndarray(out_value._get_dims(
-                            )).astype('float32')
+                            tmp_vars[out_name] = np.ndarray(
+                                out_value._get_dims()).astype('float32')
                         else:
                             tmp_vars[out_name] = np.array(out_value)
 
diff --git a/python/paddle/fluid/tests/unittests/test_spp_op.py b/python/paddle/fluid/tests/unittests/test_spp_op.py
index 4a7ea97cfbd22..a4f34c4fcfa50 100644
--- a/python/paddle/fluid/tests/unittests/test_spp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_spp_op.py
@@ -22,6 +22,7 @@
 
 
 class TestSppOp(OpTest):
+
     def setUp(self):
         self.op_type = "spp"
         self.init_test_case()
@@ -37,13 +38,13 @@ def setUp(self):
             padding = [0, 0]
             kernel_size[0] = np.ceil(hsize /
                                      bins.astype("double")).astype("int32")
-            padding[0] = (
-                (kernel_size[0] * bins - hsize + 1) / 2).astype("int32")
+            padding[0] = ((kernel_size[0] * bins - hsize + 1) /
+                          2).astype("int32")
 
             kernel_size[1] = np.ceil(wsize /
                                      bins.astype("double")).astype("int32")
-            padding[1] = (
-                (kernel_size[1] * bins - wsize + 1) / 2).astype("int32")
+            padding[1] = ((kernel_size[1] * bins - wsize + 1) /
+                          2).astype("int32")
             out_level = self.pool2D_forward_naive(input, kernel_size,
                                                   kernel_size, padding)
             out_level_flatten.append(
@@ -53,7 +54,9 @@ def setUp(self):
             else:
                 output = np.concatenate((output, out_level_flatten[i]), 1)
         # output = np.concatenate(out_level_flatten.tolist(), 0);
-        self.inputs = {'X': input.astype('float64'), }
+        self.inputs = {
+            'X': input.astype('float64'),
+        }
         self.attrs = {
             'pyramid_height': self.pyramid_height,
             'pooling_type': self.pool_type
@@ -74,6 +77,7 @@ def init_test_case(self):
 
 
 class TestCase2(TestSppOp):
+
     def init_test_case(self):
         self.shape = [3, 2, 16, 16]
         self.pyramid_height = 3
diff --git a/python/paddle/fluid/tests/unittests/test_square_error_cost.py b/python/paddle/fluid/tests/unittests/test_square_error_cost.py
index a10d0efe3c82e..18d6d58daa559 100644
--- a/python/paddle/fluid/tests/unittests/test_square_error_cost.py
+++ b/python/paddle/fluid/tests/unittests/test_square_error_cost.py
@@ -24,6 +24,7 @@
 
 
 class TestSquareErrorCost(unittest.TestCase):
+
     def test_square_error_cost(self):
         input_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
         label_val = np.random.uniform(0.1, 0.5, (2, 3)).astype("float32")
@@ -41,15 +42,19 @@ def test_square_error_cost(self):
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             exe = Executor(place)
             result = exe.run(fluid.default_main_program(),
-                             feed={"input": input_val,
-                                   "label": label_val},
+                             feed={
+                                 "input": input_val,
+                                 "label": label_val
+                             },
                              fetch_list=[output])
 
             self.assertTrue(np.isclose(np_result, result).all())
 
 
 class TestSquareErrorInvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_input():
             input = [256, 3]
             label = fluid.data(name='label1', shape=[None, 3], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
index b964793342ed4..3f4d376a9420d 100644
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
@@ -20,6 +20,7 @@
 
 
 class TestSquaredL2DistanceOp_f0(OpTest):
+
     def setUp(self):
         self.op_type = "squared_l2_distance"
         self.inputs = {
@@ -41,6 +42,7 @@ def test_check_grad(self):
 
 
 class TestSquaredL2DistanceOp_f1(OpTest):
+
     def setUp(self):
         self.op_type = "squared_l2_distance"
         self.inputs = {
@@ -62,6 +64,7 @@ def test_check_grad(self):
 
 
 class TestSquaredL2DistanceOp_f2(OpTest):
+
     def setUp(self):
         self.op_type = "squared_l2_distance"
         self.inputs = {
diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
index 430632ebb87db..ee8f7245634af 100644
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
@@ -39,11 +39,13 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=self.max_relative_error)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=self.max_relative_error)
 
 
 class TestL2LossDeterministic(unittest.TestCase):
+
     def check_place(self, place):
         with paddle.fluid.dygraph.guard(place):
             x_np = np.random.rand(5, 11, 13).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
index 7d7893cfda0b1..711373165fd92 100755
--- a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py
@@ -25,6 +25,7 @@
 
 # Correct: General.
 class TestSqueezeOp(OpTest):
+
     def setUp(self):
         self.op_type = "squeeze2"
         self.python_api = paddle.squeeze
@@ -56,6 +57,7 @@ def init_attrs(self):
 
 # Correct: There is mins axis.
 class TestSqueezeOp1(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = (0, -2)
@@ -64,14 +66,16 @@ def init_test_case(self):
 
 # Correct: No axes input.
 class TestSqueezeOp2(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = ()
         self.new_shape = (20, 5)
 
 
-# Correct: Just part of axes be squeezed. 
+# Correct: Just part of axes be squeezed.
 class TestSqueezeOp3(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, -1)
diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
index e0e31894cb57e..c7a0724d372d0 100755
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -28,12 +28,15 @@
 
 # Correct: General.
 class TestSqueezeOp(OpTest):
+
     def setUp(self):
         self.op_type = "squeeze"
         self.init_test_case()
         self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")}
         self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape), }
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -51,6 +54,7 @@ def init_attrs(self):
 
 
 class TestSqueezeBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "squeeze"
         self.dtype = np.uint16
@@ -78,6 +82,7 @@ def init_attrs(self):
 
 # Correct: There is mins axis.
 class TestSqueezeOp1(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 3, 1, 40)
         self.axes = (0, -2)
@@ -86,14 +91,16 @@ def init_test_case(self):
 
 # Correct: No axes input.
 class TestSqueezeOp2(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = ()
         self.new_shape = (20, 5)
 
 
-# Correct: Just part of axes be squeezed. 
+# Correct: Just part of axes be squeezed.
 class TestSqueezeOp3(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, -1)
@@ -102,6 +109,7 @@ def init_test_case(self):
 
 # Correct: The demension of axis is not of size 1 remains unchanged.
 class TestSqueezeOp4(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, 2)
@@ -109,12 +117,13 @@ def init_test_case(self):
 
 
 class TestSqueezeOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input type of softmax_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], paddle.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         paddle.CPUPlace())
             self.assertRaises(TypeError, paddle.squeeze, x1)
             # The input axes of squeeze must be list.
             x2 = paddle.static.data(name='x2', shape=[4], dtype="int32")
@@ -125,6 +134,7 @@ def test_errors(self):
 
 
 class API_TestSqueeze(unittest.TestCase):
+
     def setUp(self):
         self.executed_api()
 
@@ -135,8 +145,9 @@ def test_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            data1 = paddle.static.data(
-                'data1', shape=[-1, 1, 10], dtype='float64')
+            data1 = paddle.static.data('data1',
+                                       shape=[-1, 1, 10],
+                                       dtype='float64')
             result_squeeze = self.squeeze(data1, axis=[1])
             place = paddle.CPUPlace()
             exe = paddle.static.Executor(place)
@@ -148,11 +159,13 @@ def test_out(self):
 
 
 class API_TestStaticSqueeze_(API_TestSqueeze):
+
     def executed_api(self):
         self.squeeze = paddle.squeeze_
 
 
 class API_TestDygraphSqueeze(unittest.TestCase):
+
     def setUp(self):
         self.executed_api()
 
@@ -206,6 +219,7 @@ def test_dimension_not_1(self):
 
 
 class API_TestDygraphSqueezeInplace(API_TestDygraphSqueeze):
+
     def executed_api(self):
         self.squeeze = paddle.squeeze_
 
diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py
index faabcea13aec7..6f4e490be6bfe 100644
--- a/python/paddle/fluid/tests/unittests/test_stack_op.py
+++ b/python/paddle/fluid/tests/unittests/test_stack_op.py
@@ -21,6 +21,7 @@
 
 
 class TestStackOpBase(OpTest):
+
     def initDefaultParameters(self):
         self.num_inputs = 4
         self.input_dim = (5, 6, 7)
@@ -63,36 +64,43 @@ def test_check_grad(self):
 
 
 class TestStackOp1(TestStackOpBase):
+
     def initParameters(self):
         self.num_inputs = 8
 
 
 class TestStackOp2(TestStackOpBase):
+
     def initParameters(self):
         self.num_inputs = 10
 
 
 class TestStackOp3(TestStackOpBase):
+
     def initParameters(self):
         self.axis = -1
 
 
 class TestStackOp4(TestStackOpBase):
+
     def initParameters(self):
         self.axis = -4
 
 
 class TestStackOp5(TestStackOpBase):
+
     def initParameters(self):
         self.axis = 1
 
 
 class TestStackOp6(TestStackOpBase):
+
     def initParameters(self):
         self.axis = 3
 
 
 class TestStackBF16Op(OpTest):
+
     def initDefaultParameters(self):
         self.num_inputs = 4
         self.input_dim = (5, 6, 7)
@@ -167,9 +175,8 @@ def test_case(self):
         exe = fluid.Executor(self.place)
         res = exe.run(self.program, fetch_list=self.out_var)
         self.assertTrue(
-            np.array_equal(
-                res[0], np.stack(
-                    [self.x] * self.iter_num, axis=self.axis)))
+            np.array_equal(res[0],
+                           np.stack([self.x] * self.iter_num, axis=self.axis)))
 
 
 class TestTensorStackAPIWithLoDTensorArray(unittest.TestCase):
@@ -203,12 +210,12 @@ def test_case(self):
         exe = fluid.Executor(self.place)
         res = exe.run(self.program, fetch_list=self.out_var)
         self.assertTrue(
-            np.array_equal(
-                res[0], np.stack(
-                    [self.x] * self.iter_num, axis=self.axis)))
+            np.array_equal(res[0],
+                           np.stack([self.x] * self.iter_num, axis=self.axis)))
 
 
 class API_test(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[1, 2], dtype='float64')
@@ -220,11 +227,12 @@ def test_out(self):
             input1 = np.random.random([1, 2]).astype('float64')
             input2 = np.random.random([1, 2]).astype('float64')
             input3 = np.random.random([1, 2]).astype('float64')
-            result, = exe.run(
-                feed={"data1": input1,
-                      "data2": input2,
-                      "data3": input3},
-                fetch_list=[result_stack])
+            result, = exe.run(feed={
+                "data1": input1,
+                "data2": input2,
+                "data3": input3
+            },
+                              fetch_list=[result_stack])
             expected_result = np.stack([input1, input2, input3], axis=0)
             self.assertTrue(np.allclose(expected_result, result))
 
@@ -235,6 +243,7 @@ def test_single_tensor_error(self):
 
 
 class API_DygraphTest(unittest.TestCase):
+
     def test_out(self):
         data1 = np.array([[1.0, 2.0]])
         data2 = np.array([[3.0, 4.0]])
diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel.py
index 6f2f7408262d9..ee11b3acaf9a6 100644
--- a/python/paddle/fluid/tests/unittests/test_static_model_parallel.py
+++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel.py
@@ -24,6 +24,7 @@
 
 
 class TestStaticModelParallel(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -34,29 +35,26 @@ def _setup_config(self):
     def test_dist_static_model_parallel(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "static_model_parallel_by_row.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("static_model_parallel_by_row.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
     def test_dist_static_model_parallel2(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "static_model_parallel_by_col.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("static_model_parallel_by_col.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
     def test_dist_static_model_parallel3(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "static_model_parallel_embedding.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("static_model_parallel_embedding.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_attention.py
index e4ce8e8170fa1..7675ec7f477c5 100644
--- a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_attention.py
+++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_attention.py
@@ -24,6 +24,7 @@
 
 
 class TestStaticModelParallel(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -34,11 +35,10 @@ def _setup_config(self):
     def test_dist_static_model_parallel_fused_feedforward(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "static_model_parallel_fused_attention.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("static_model_parallel_fused_attention.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py
index 1a6b637e1b45e..cb535ee43dad7 100644
--- a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py
+++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py
@@ -24,6 +24,7 @@
 
 
 class TestStaticModelParallel(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
@@ -34,11 +35,10 @@ def _setup_config(self):
     def test_dist_static_model_parallel_fused_feedforward(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "static_model_parallel_fused_feedforward.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
+            self.check_with_place("static_model_parallel_fused_feedforward.py",
+                                  delta=1e-5,
+                                  check_error_log=True,
+                                  log_name=flag_name)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_multi_transformer.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_multi_transformer.py
index 5475fd4a10a13..f300b5611140f 100644
--- a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_multi_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_multi_transformer.py
@@ -24,6 +24,7 @@
 
 
 class TestStaticModelParallel(TestDistBase):
+
     def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False
diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py
index cfce0bb7d311b..9c44785d1c469 100644
--- a/python/paddle/fluid/tests/unittests/test_static_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py
@@ -35,6 +35,7 @@
 
 
 class SimpleLSTMRNN(fluid.Layer):
+
     def __init__(self,
                  name_scope,
                  hidden_size,
@@ -81,23 +82,29 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.hidden_array = []
 
         for i in range(self._num_layers):
-            pre_hidden = fluid.layers.slice(
-                init_hidden, axes=[0], starts=[i], ends=[i + 1])
-            pre_cell = fluid.layers.slice(
-                init_cell, axes=[0], starts=[i], ends=[i + 1])
-            pre_hidden = fluid.layers.reshape(
-                pre_hidden, shape=[-1, self._hidden_size])
-            pre_cell = fluid.layers.reshape(
-                pre_cell, shape=[-1, self._hidden_size])
+            pre_hidden = fluid.layers.slice(init_hidden,
+                                            axes=[0],
+                                            starts=[i],
+                                            ends=[i + 1])
+            pre_cell = fluid.layers.slice(init_cell,
+                                          axes=[0],
+                                          starts=[i],
+                                          ends=[i + 1])
+            pre_hidden = fluid.layers.reshape(pre_hidden,
+                                              shape=[-1, self._hidden_size])
+            pre_cell = fluid.layers.reshape(pre_cell,
+                                            shape=[-1, self._hidden_size])
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)
 
         res = []
         for index in range(self._num_steps):
-            self._input = fluid.layers.slice(
-                input_embedding, axes=[1], starts=[index], ends=[index + 1])
-            self._input = fluid.layers.reshape(
-                self._input, shape=[-1, self._hidden_size])
+            self._input = fluid.layers.slice(input_embedding,
+                                             axes=[1],
+                                             starts=[index],
+                                             ends=[index + 1])
+            self._input = fluid.layers.reshape(self._input,
+                                               shape=[-1, self._hidden_size])
             for k in range(self._num_layers):
                 pre_hidden = self.hidden_array[k]
                 pre_cell = self.cell_array[k]
@@ -108,8 +115,9 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                 gate_input = fluid.layers.matmul(x=nn, y=weight_1)
 
                 gate_input = fluid.layers.elementwise_add(gate_input, bias)
-                i, j, f, o = fluid.layers.split(
-                    gate_input, num_or_sections=4, dim=-1)
+                i, j, f, o = fluid.layers.split(gate_input,
+                                                num_or_sections=4,
+                                                dim=-1)
                 c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
                     i) * fluid.layers.tanh(j)
                 m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
@@ -123,8 +131,8 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
                         dropout_prob=self._dropout,
                         dropout_implementation='upscale_in_train')
             res.append(
-                fluid.layers.reshape(
-                    self._input, shape=[1, -1, self._hidden_size]))
+                fluid.layers.reshape(self._input,
+                                     shape=[1, -1, self._hidden_size]))
         real_res = fluid.layers.concat(res, 0)
         real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])
         last_hidden = fluid.layers.concat(self.hidden_array, 1)
@@ -139,6 +147,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None):
 
 
 class PtbModel(fluid.Layer):
+
     def __init__(self,
                  name_scope,
                  hidden_size,
@@ -154,13 +163,12 @@ def __init__(self,
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        self.simple_lstm_rnn = SimpleLSTMRNN(
-            self.full_name(),
-            hidden_size,
-            num_steps,
-            num_layers=num_layers,
-            init_scale=init_scale,
-            dropout=dropout)
+        self.simple_lstm_rnn = SimpleLSTMRNN(self.full_name(),
+                                             hidden_size,
+                                             num_steps,
+                                             num_layers=num_layers,
+                                             init_scale=init_scale,
+                                             dropout=dropout)
         self.embedding = paddle.nn.Embedding(
             num_embeddings=vocab_size,
             embedding_dim=hidden_size,
@@ -198,17 +206,18 @@ def forward(self, input, label, init_hidden, init_cell):
                 x_emb,
                 dropout_prob=self.drop_out,
                 dropout_implementation='upscale_in_train')
-        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
-                                                               init_c)
+        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
+            x_emb, init_h, init_c)
 
         rnn_out = fluid.layers.reshape(
             rnn_out, shape=[-1, self.num_steps, self.hidden_size])
         projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
         projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        projection = fluid.layers.reshape(
-            projection, shape=[-1, self.vocab_size])
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=projection, label=label, soft_label=False)
+        projection = fluid.layers.reshape(projection,
+                                          shape=[-1, self.vocab_size])
+        loss = fluid.layers.softmax_with_cross_entropy(logits=projection,
+                                                       label=label,
+                                                       soft_label=False)
         loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
         loss = fluid.layers.reduce_mean(loss, dim=[0])
         loss = fluid.layers.reduce_sum(loss)
@@ -217,9 +226,10 @@ def forward(self, input, label, init_hidden, init_cell):
 
 
 class TestSaveLoadBase(unittest.TestCase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_ptb_rnn_cpu_float32(self):
         seed = 90
@@ -234,24 +244,26 @@ def test_ptb_rnn_cpu_float32(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -271,8 +283,8 @@ def test_ptb_rnn_cpu_float32(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -291,8 +303,8 @@ def test_ptb_rnn_cpu_float32(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -305,8 +317,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -314,16 +326,17 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
 
 class TestSaveLoadPartial(unittest.TestCase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_ptb_rnn_cpu_float32(self):
         seed = 90
@@ -338,24 +351,26 @@ def test_ptb_rnn_cpu_float32(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -383,8 +398,8 @@ def test_ptb_rnn_cpu_float32(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -403,8 +418,8 @@ def test_ptb_rnn_cpu_float32(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -417,8 +432,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -426,17 +441,18 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in test_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
             fluid.load(test_program, "./test_1.pdmodel", None)
 
 
 class TestSaveLoadSetStateDict(unittest.TestCase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_ptb_rnn_cpu_float32(self):
         seed = 90
@@ -451,24 +467,26 @@ def test_ptb_rnn_cpu_float32(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -488,8 +506,8 @@ def test_ptb_rnn_cpu_float32(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -508,8 +526,8 @@ def test_ptb_rnn_cpu_float32(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -522,8 +540,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -531,16 +549,17 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
 
 class TestProgramStatePartial(unittest.TestCase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_ptb_rnn_cpu_float32(self):
         seed = 90
@@ -555,24 +574,26 @@ def test_ptb_rnn_cpu_float32(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -600,8 +621,8 @@ def test_ptb_rnn_cpu_float32(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -620,8 +641,8 @@ def test_ptb_rnn_cpu_float32(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -634,8 +655,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -656,8 +677,8 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in test_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -667,8 +688,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -676,8 +697,8 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in test_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -687,8 +708,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -696,8 +717,8 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in test_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -707,8 +728,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -716,16 +737,17 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in test_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
 
 class TestVariableInit(unittest.TestCase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_variable_init(self):
 
@@ -758,8 +780,8 @@ def set_var(var, ndarray):
 
         place = self.set_place()
         exe = fluid.Executor(place)
-        parameter_list = list(
-            filter(fluid.io.is_parameter, program.list_vars()))
+        parameter_list = list(filter(fluid.io.is_parameter,
+                                     program.list_vars()))
 
         fluid.core._create_loaded_parameter(parameter_list, new_scope,
                                             exe._default_executor)
@@ -794,8 +816,8 @@ def set_var(var, ndarray):
         base_map = {}
         for var in program.list_vars():
             if isinstance(var, framework.Parameter) or var.persistable:
-                t = np.array(fluid.global_scope().find_var(var.name)
-                             .get_tensor())
+                t = np.array(fluid.global_scope().find_var(
+                    var.name).get_tensor())
                 # make sure all the paramerter or optimizer var have been update
                 base_map[var.name] = t
 
@@ -808,6 +830,7 @@ def set_var(var, ndarray):
 
 
 class TestLoadFromOldInterface(unittest.TestCase):
+
     def setUp(self):
         if os.path.exists("test_path.pdparams"):
             os.remove("test_path.pdparams")
@@ -816,8 +839,8 @@ def setUp(self):
             os.remove("test_static_load_var_list.pdparams")
 
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_load_from_old_interface(self):
         seed = 90
@@ -832,24 +855,26 @@ def test_load_from_old_interface(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -871,8 +896,8 @@ def test_load_from_old_interface(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -891,8 +916,8 @@ def test_load_from_old_interface(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -906,8 +931,8 @@ def test_load_from_old_interface(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -915,8 +940,8 @@ def test_load_from_old_interface(self):
 
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -947,24 +972,26 @@ def test_load_from_old_interface_var_list(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -986,8 +1013,8 @@ def test_load_from_old_interface_var_list(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -1006,8 +1033,8 @@ def test_load_from_old_interface_var_list(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -1016,7 +1043,7 @@ def test_load_from_old_interface_var_list(self):
             fluid.io.save_persistables(exe, "test_static_load_var_list",
                                        main_program)
 
-            # set var to zero            
+            # set var to zero
             var_list = []
             for i, var in enumerate(main_program.list_vars()):
                 if isinstance(var, framework.Parameter) or var.persistable:
@@ -1025,8 +1052,8 @@ def test_load_from_old_interface_var_list(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -1034,8 +1061,8 @@ def test_load_from_old_interface_var_list(self):
             var_list_names = [var.name for var in var_list]
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     if var.name in var_list_names:
                         # loaded vars
                         base_t = base_map[var.name]
@@ -1046,9 +1073,10 @@ def test_load_from_old_interface_var_list(self):
 
 
 class TestLoadFromOldInterfaceSingleFile(unittest.TestCase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_load_from_old_interface(self):
         seed = 90
@@ -1063,24 +1091,26 @@ def test_load_from_old_interface(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -1100,8 +1130,8 @@ def test_load_from_old_interface(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -1120,15 +1150,17 @@ def test_load_from_old_interface(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
 
             #fluid.save(main_program, "./test_1")
-            fluid.io.save_persistables(
-                exe, "test_path", main_program, filename="model_single")
+            fluid.io.save_persistables(exe,
+                                       "test_path",
+                                       main_program,
+                                       filename="model_single")
 
             # set var to zero
             for var in main_program.list_vars():
@@ -1136,8 +1168,8 @@ def test_load_from_old_interface(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -1147,8 +1179,8 @@ def test_load_from_old_interface(self):
 
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -1166,8 +1198,10 @@ def test_load_from_old_interface(self):
                 fluid.load(main_program, file_model_path, exe,
                            fluid.io.get_program_persistable_vars(main_program))
 
-            fluid.io.save_params(
-                exe, "test_path", main_program, filename="model_single")
+            fluid.io.save_params(exe,
+                                 "test_path",
+                                 main_program,
+                                 filename="model_single")
             with self.assertRaises(RuntimeError):
                 fluid.load(main_program, file_model_path, exe,
                            fluid.io.get_program_persistable_vars(main_program))
@@ -1183,22 +1217,22 @@ def test_load_from_old_interface(self):
 
             # check save params, load var_list = get_program_persistable_vars
             with self.assertRaises(RuntimeError):
-                temp_var = framework.Variable(
-                    main_program.global_block(),
-                    shape=[1],
-                    name="test_temp_var")
+                temp_var = framework.Variable(main_program.global_block(),
+                                              shape=[1],
+                                              name="test_temp_var")
                 all_var_list = list(main_program.list_vars())
                 fluid.load(main_program, file_model_path, exe,
                            all_var_list + [temp_var])
 
 
 class TestProgramStateOldSave(unittest.TestCase):
+
     def setUp(self):
         self.test_dygraph = True
 
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_ptb_rnn_cpu_float32(self):
         seed = 90
@@ -1213,24 +1247,26 @@ def test_ptb_rnn_cpu_float32(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -1258,8 +1294,8 @@ def test_ptb_rnn_cpu_float32(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -1278,8 +1314,8 @@ def test_ptb_rnn_cpu_float32(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -1292,8 +1328,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -1345,16 +1381,17 @@ def create_symlink(self, target, link_name):
     def check_in_static(self, main_program, base_map):
         for var in main_program.list_vars():
             if isinstance(var, framework.Parameter) or var.persistable:
-                new_t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                new_t = np.array(fluid.global_scope().find_var(
+                    var.name).get_tensor())
                 base_t = base_map[var.name]
                 self.assertTrue(np.array_equal(new_t, base_t))
 
 
 class TestProgramStateOldSaveSingleModel(unittest.TestCase):
+
     def set_place(self):
-        return fluid.CPUPlace() if not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
+        return fluid.CPUPlace(
+        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
 
     def test_ptb_rnn_cpu_float32(self):
         seed = 90
@@ -1369,24 +1406,26 @@ def test_ptb_rnn_cpu_float32(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = Adam(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -1414,8 +1453,8 @@ def test_ptb_rnn_cpu_float32(self):
                 y_data = y_data.reshape((-1, 1))
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='float32')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='float32')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='float32')
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
                               feed={
@@ -1434,14 +1473,16 @@ def test_ptb_rnn_cpu_float32(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
 
-            fluid.io.save_persistables(
-                exe, "test_program_2", main_program, filename="model_1")
+            fluid.io.save_persistables(exe,
+                                       "test_program_2",
+                                       main_program,
+                                       filename="model_1")
 
             # set var to zero
             for var in main_program.list_vars():
@@ -1449,8 +1490,8 @@ def test_ptb_rnn_cpu_float32(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -1462,8 +1503,8 @@ def test_ptb_rnn_cpu_float32(self):
 
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -1472,9 +1513,9 @@ def test_ptb_rnn_cpu_float32(self):
                     os.path.join("test_program_2", "model_1"))
 
             with self.assertRaises(TypeError):
-                fluid.load_program_state(
-                    os.path.join("test_program_2", "model_1"),
-                    var_list=["str"])
+                fluid.load_program_state(os.path.join("test_program_2",
+                                                      "model_1"),
+                                         var_list=["str"])
 
             with self.assertRaises(RuntimeError):
                 fluid.load_program_state(
@@ -1486,16 +1527,16 @@ def test_ptb_rnn_cpu_float32(self):
 
 
 class TestStaticSaveLoadPickle(unittest.TestCase):
+
     def test_pickle_protocol(self):
         # enable static mode
         paddle.enable_static()
 
         with new_program_scope():
             # create network
-            x = paddle.static.data(
-                name="static_save_load_large_x",
-                shape=[None, 10],
-                dtype='float32')
+            x = paddle.static.data(name="static_save_load_large_x",
+                                   shape=[None, 10],
+                                   dtype='float32')
             z = paddle.static.nn.fc(x, 10, bias_attr=False)
             place = paddle.CPUPlace()
             exe = paddle.static.Executor(place)
@@ -1505,8 +1546,8 @@ def test_pickle_protocol(self):
             base_map = {}
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -1523,7 +1564,9 @@ def test_pickle_protocol(self):
             with self.assertRaises(ValueError):
                 paddle.fluid.save(prog, path, 5)
 
-            protocols = [2, ]
+            protocols = [
+                2,
+            ]
             if sys.version_info.major >= 3 and sys.version_info.minor >= 4:
                 protocols += [3, 4]
             for protocol in protocols:
@@ -1535,16 +1578,16 @@ def test_pickle_protocol(self):
                             var.name).get_tensor()
                         ten.set(np.zeros_like(np.array(ten)), place)
 
-                        new_t = np.array(fluid.global_scope().find_var(var.name)
-                                         .get_tensor())
+                        new_t = np.array(fluid.global_scope().find_var(
+                            var.name).get_tensor())
                         self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
                 paddle.fluid.load(prog, path)
 
                 for var in prog.list_vars():
                     if isinstance(var, framework.Parameter) or var.persistable:
-                        new_t = np.array(fluid.global_scope().find_var(var.name)
-                                         .get_tensor())
+                        new_t = np.array(fluid.global_scope().find_var(
+                            var.name).get_tensor())
                         base_t = base_map[var.name]
                         self.assertTrue(np.array_equal(new_t, base_t))
 
diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
index bc8c3cc5b23e5..25619aa4a5c04 100644
--- a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
@@ -28,6 +28,7 @@
 @unittest.skipIf(not core.supports_bfloat16(),
                  "place does not support BF16 evaluation")
 class TestSaveLoadBF16(unittest.TestCase):
+
     def set_place(self):
         return fluid.CPUPlace()
 
@@ -44,24 +45,26 @@ def test_ptb_rnn_cpu_bfloat16(self):
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
-            ptb_model = PtbModel(
-                "ptb_model",
-                hidden_size=hidden_size,
-                vocab_size=vocab_size,
-                num_layers=num_layers,
-                num_steps=num_steps,
-                init_scale=init_scale)
+            ptb_model = PtbModel("ptb_model",
+                                 hidden_size=hidden_size,
+                                 vocab_size=vocab_size,
+                                 num_layers=num_layers,
+                                 num_steps=num_steps,
+                                 init_scale=init_scale)
 
             place = self.set_place()
             exe = fluid.Executor(place)
             sgd = SGDOptimizer(learning_rate=1e-3)
-            x = fluid.layers.data(
-                name="x", shape=[-1, num_steps], dtype='int64')
+            x = fluid.layers.data(name="x",
+                                  shape=[-1, num_steps],
+                                  dtype='int64')
             y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
-            init_hidden = fluid.layers.data(
-                name="init_hidden", shape=[1], dtype='float32')
-            init_cell = fluid.layers.data(
-                name="init_cell", shape=[1], dtype='float32')
+            init_hidden = fluid.layers.data(name="init_hidden",
+                                            shape=[1],
+                                            dtype='float32')
+            init_cell = fluid.layers.data(name="init_cell",
+                                          shape=[1],
+                                          dtype='float32')
 
             static_loss, static_last_hidden, static_last_cell = ptb_model(
                 x, y, init_hidden, init_cell)
@@ -85,8 +88,8 @@ def test_ptb_rnn_cpu_bfloat16(self):
                 # slice_op PR(datatypes in model graph are different than datatypes during runtime because of that)
                 init_hidden_data = np.zeros(
                     (num_layers, batch_size, hidden_size), dtype='uint16')
-                init_cell_data = np.zeros(
-                    (num_layers, batch_size, hidden_size), dtype='uint16')
+                init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
+                                          dtype='uint16')
 
                 fetch_list = [static_loss, static_last_hidden, static_last_cell]
                 out = exe.run(fluid.default_main_program(),
@@ -103,8 +106,8 @@ def test_ptb_rnn_cpu_bfloat16(self):
             base_map = {}
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -117,8 +120,8 @@ def test_ptb_rnn_cpu_bfloat16(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been set to zero
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
@@ -126,8 +129,8 @@ def test_ptb_rnn_cpu_bfloat16(self):
 
             for var in main_program.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py
index 389fc259b5549..fdb6a1f2f0585 100644
--- a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py
+++ b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py
@@ -28,15 +28,15 @@
 
 
 class TestStaticSaveLoadLargeParameters(unittest.TestCase):
+
     def test_large_parameters_static_save(self):
         # enable static mode
         paddle.enable_static()
         with new_program_scope():
             # create network
-            x = paddle.static.data(
-                name="static_save_load_large_x",
-                shape=[None, 10],
-                dtype='float32')
+            x = paddle.static.data(name="static_save_load_large_x",
+                                   shape=[None, 10],
+                                   dtype='float32')
             z = paddle.static.nn.fc(x, LARGE_PARAM, bias_attr=False)
             place = paddle.CPUPlace()
             exe = paddle.static.Executor(place)
@@ -46,8 +46,8 @@ def test_large_parameters_static_save(self):
             base_map = {}
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    t = np.array(fluid.global_scope().find_var(var.name)
-                                 .get_tensor())
+                    t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     # make sure all the paramerter or optimizer var have been update
                     self.assertTrue(np.sum(np.abs(t)) != 0)
                     base_map[var.name] = t
@@ -62,16 +62,16 @@ def test_large_parameters_static_save(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
             paddle.fluid.load(prog, path)
 
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
@@ -81,16 +81,16 @@ def test_large_parameters_static_save(self):
                     ten = fluid.global_scope().find_var(var.name).get_tensor()
                     ten.set(np.zeros_like(np.array(ten)), place)
 
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     self.assertTrue(np.sum(np.abs(new_t)) == 0)
 
             program_state = fluid.load_program_state(path)
             fluid.set_program_state(prog, program_state)
             for var in prog.list_vars():
                 if isinstance(var, framework.Parameter) or var.persistable:
-                    new_t = np.array(fluid.global_scope().find_var(var.name)
-                                     .get_tensor())
+                    new_t = np.array(fluid.global_scope().find_var(
+                        var.name).get_tensor())
                     base_t = base_map[var.name]
                     self.assertTrue(np.array_equal(new_t, base_t))
 
diff --git a/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py
index 2c6d646baf593..0e22905e81d69 100644
--- a/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py
@@ -17,10 +17,12 @@
 
 
 class StaticShapeInferrenceTest(unittest.TestCase):
+
     def test_static_graph(self):
         paddle.enable_static()
-        data = paddle.fluid.layers.data(
-            name="x", shape=[-1, 2], dtype='float32')
+        data = paddle.fluid.layers.data(name="x",
+                                        shape=[-1, 2],
+                                        dtype='float32')
         shape = paddle.fluid.layers.shape(data)  # shape should be [-1, 2]
         x = paddle.fluid.layers.uniform_random(shape)
         self.assertEqual(x.shape, data.shape)
diff --git a/python/paddle/fluid/tests/unittests/test_std_layer.py b/python/paddle/fluid/tests/unittests/test_std_layer.py
index 2196996afffc9..4252899eba6f2 100644
--- a/python/paddle/fluid/tests/unittests/test_std_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_std_layer.py
@@ -27,6 +27,7 @@ def ref_std(x, axis=None, unbiased=True, keepdim=False):
 
 
 class TestStdAPI(unittest.TestCase):
+
     def setUp(self):
         self.dtype = 'float64'
         self.shape = [1, 3, 4, 10]
@@ -67,36 +68,43 @@ def test_api(self):
 
 
 class TestStdAPI_dtype(TestStdAPI):
+
     def set_attrs(self):
         self.dtype = 'float32'
 
 
 class TestStdAPI_axis_int(TestStdAPI):
+
     def set_attrs(self):
         self.axis = 2
 
 
 class TestStdAPI_axis_list(TestStdAPI):
+
     def set_attrs(self):
         self.axis = [1, 2]
 
 
 class TestStdAPI_axis_tuple(TestStdAPI):
+
     def set_attrs(self):
         self.axis = (1, 3)
 
 
 class TestStdAPI_keepdim(TestStdAPI):
+
     def set_attrs(self):
         self.keepdim = False
 
 
 class TestStdAPI_unbiased(TestStdAPI):
+
     def set_attrs(self):
         self.unbiased = False
 
 
 class TestStdAPI_alias(unittest.TestCase):
+
     def test_alias(self):
         paddle.disable_static()
         x = paddle.to_tensor(np.array([10, 12], 'float32'))
@@ -109,6 +117,7 @@ def test_alias(self):
 
 
 class TestStdError(unittest.TestCase):
+
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.fluid.data('X', [2, 3, 4], 'int32')
diff --git a/python/paddle/fluid/tests/unittests/test_stft_op.py b/python/paddle/fluid/tests/unittests/test_stft_op.py
index 41e950606b3db..8110f1d805fbb 100644
--- a/python/paddle/fluid/tests/unittests/test_stft_op.py
+++ b/python/paddle/fluid/tests/unittests/test_stft_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -45,13 +45,14 @@ def frame_from_librosa(x, frame_length, hop_length, axis=-1):
 
 def stft_np(x, window, n_fft, hop_length, **kwargs):
     frames = frame_from_librosa(x, n_fft, hop_length)
-    frames = np.multiply(frames.transpose([0, 2, 1]), window).transpose(
-        [0, 2, 1])
+    frames = np.multiply(frames.transpose([0, 2, 1]),
+                         window).transpose([0, 2, 1])
     res = np.fft.rfft(frames, axis=1)
     return res
 
 
 class TestStftOp(OpTest):
+
     def setUp(self):
         self.op_type = "stft"
         self.shape, self.type, self.attrs = self.initTestCase()
@@ -60,8 +61,10 @@ def setUp(self):
             'Window': np.hamming(self.attrs['n_fft']).astype(self.type),
         }
         self.outputs = {
-            'Out': stft_np(
-                x=self.inputs['X'], window=self.inputs['Window'], **self.attrs)
+            'Out':
+            stft_np(x=self.inputs['X'],
+                    window=self.inputs['Window'],
+                    **self.attrs)
         }
 
     def initTestCase(self):
diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
index 4954cfc97e4e2..e8d42a2fae8c8 100644
--- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
@@ -55,12 +55,14 @@ def strided_slice_native_forward(input, axes, starts, ends, strides):
 
 
 class TestStrideSliceOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = 'strided_slice'
         self.python_api = paddle.strided_slice
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
         self.inputs = {'Input': self.input}
         self.outputs = {'Out': self.output}
@@ -88,6 +90,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp1(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(100)
         self.axes = [0]
@@ -98,6 +101,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp2(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(100)
         self.axes = [0]
@@ -108,6 +112,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp3(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(100)
         self.axes = [0]
@@ -118,6 +123,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp4(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 4, 10)
         self.axes = [0, 1, 2]
@@ -128,6 +134,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp5(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(5, 5, 5)
         self.axes = [0, 1, 2]
@@ -138,6 +145,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp6(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(5, 5, 5)
         self.axes = [0, 1, 2]
@@ -148,6 +156,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp7(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(5, 5, 5)
         self.axes = [0, 1, 2]
@@ -158,6 +167,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp8(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(1, 100, 1)
         self.axes = [1]
@@ -168,6 +178,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp9(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(1, 100, 1)
         self.axes = [1]
@@ -178,6 +189,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp10(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(10, 10)
         self.axes = [0, 1]
@@ -188,6 +200,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp11(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4)
         self.axes = [0, 1, 2, 3]
@@ -198,6 +211,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp12(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4, 5)
         self.axes = [0, 1, 2, 3, 4]
@@ -208,6 +222,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp13(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 6, 7, 8)
         self.axes = [0, 1, 2, 3, 4, 5]
@@ -218,6 +233,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOp14(TestStrideSliceOp):
+
     def initTestCase(self):
         self.input = np.random.rand(4, 4, 4, 4)
         self.axes = [1, 2, 3]
@@ -228,11 +244,13 @@ def initTestCase(self):
 
 
 class TestStrideSliceOpBool(TestStrideSliceOp):
+
     def test_check_grad(self):
         pass
 
 
 class TestStrideSliceOpBool1D(TestStrideSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(100).astype("bool")
         self.axes = [0]
@@ -243,6 +261,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOpBool2D(TestStrideSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(10, 10).astype("bool")
         self.axes = [0, 1]
@@ -253,6 +272,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOpBool3D(TestStrideSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 4, 10).astype("bool")
         self.axes = [0, 1, 2]
@@ -263,6 +283,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOpBool4D(TestStrideSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4).astype("bool")
         self.axes = [0, 1, 2, 3]
@@ -273,6 +294,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOpBool5D(TestStrideSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 4, 5).astype("bool")
         self.axes = [0, 1, 2, 3, 4]
@@ -283,6 +305,7 @@ def initTestCase(self):
 
 
 class TestStrideSliceOpBool6D(TestStrideSliceOpBool):
+
     def initTestCase(self):
         self.input = np.random.rand(3, 3, 3, 6, 7, 8).astype("bool")
         self.axes = [0, 1, 2, 3, 4, 5]
@@ -293,6 +316,7 @@ def initTestCase(self):
 
 
 class TestStridedSliceOp_starts_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "strided_slice"
         self.config()
@@ -319,8 +343,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [1, -1, 1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
         self.starts_infer = [1, 10, 2]
 
@@ -332,6 +357,7 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_ends_ListTensor(OpTest):
+
     def setUp(self):
         self.op_type = "strided_slice"
         self.config()
@@ -358,8 +384,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 2]
         self.infer_flags = [1, -1, 1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
         self.ends_infer = [3, 1, 4]
 
@@ -371,13 +398,13 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_starts_Tensor(OpTest):
+
     def setUp(self):
         self.op_type = "strided_slice"
         self.config()
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32")
+            "StartsTensor": np.array(self.starts, dtype="int32")
         }
         self.outputs = {'Out': self.output}
         self.attrs = {
@@ -395,8 +422,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output()
@@ -406,13 +434,13 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_ends_Tensor(OpTest):
+
     def setUp(self):
         self.op_type = "strided_slice"
         self.config()
         self.inputs = {
             'Input': self.input,
-            "EndsTensor": np.array(
-                self.ends, dtype="int32")
+            "EndsTensor": np.array(self.ends, dtype="int32")
         }
         self.outputs = {'Out': self.output}
         self.attrs = {
@@ -430,8 +458,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output()
@@ -441,6 +470,7 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_listTensor_Tensor(OpTest):
+
     def setUp(self):
         self.config()
         ends_tensor = []
@@ -451,8 +481,7 @@ def setUp(self):
 
         self.inputs = {
             'Input': self.input,
-            "StartsTensor": np.array(
-                self.starts, dtype="int32"),
+            "StartsTensor": np.array(self.starts, dtype="int32"),
             "EndsTensorList": ends_tensor
         }
         self.outputs = {'Out': self.output}
@@ -471,8 +500,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, 1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output()
@@ -482,13 +512,13 @@ def test_check_grad_normal(self):
 
 
 class TestStridedSliceOp_strides_Tensor(OpTest):
+
     def setUp(self):
         self.op_type = "strided_slice"
         self.config()
         self.inputs = {
             'Input': self.input,
-            "StridesTensor": np.array(
-                self.strides, dtype="int32")
+            "StridesTensor": np.array(self.strides, dtype="int32")
         }
         self.outputs = {'Out': self.output}
         self.attrs = {
@@ -506,8 +536,9 @@ def config(self):
         self.axes = [0, 1, 2]
         self.strides = [1, -1, 1]
         self.infer_flags = [-1, -1, -1]
-        self.output = strided_slice_native_forward(
-            self.input, self.axes, self.starts, self.ends, self.strides)
+        self.output = strided_slice_native_forward(self.input, self.axes,
+                                                   self.starts, self.ends,
+                                                   self.strides)
 
     def test_check_output(self):
         self.check_output()
@@ -518,42 +549,48 @@ def test_check_grad_normal(self):
 
 # Test python API
 class TestStridedSliceAPI(unittest.TestCase):
+
     def test_1(self):
         input = np.random.random([3, 4, 5, 6]).astype("float64")
         minus_1 = fluid.layers.fill_constant([1], "int32", -1)
         minus_3 = fluid.layers.fill_constant([1], "int32", -3)
-        starts = fluid.layers.data(
-            name='starts', shape=[3], dtype='int32', append_batch_size=False)
-        ends = fluid.layers.data(
-            name='ends', shape=[3], dtype='int32', append_batch_size=False)
-        strides = fluid.layers.data(
-            name='strides', shape=[3], dtype='int32', append_batch_size=False)
-
-        x = fluid.layers.data(
-            name="x",
-            shape=[3, 4, 5, 6],
-            append_batch_size=False,
-            dtype="float64")
-        out_1 = paddle.strided_slice(
-            x,
-            axes=[0, 1, 2],
-            starts=[-3, 0, 2],
-            ends=[3, 100, -1],
-            strides=[1, 1, 1])
-        out_2 = paddle.strided_slice(
-            x,
-            axes=[0, 1, 3],
-            starts=[minus_3, 0, 2],
-            ends=[3, 100, -1],
-            strides=[1, 1, 1])
-        out_3 = paddle.strided_slice(
-            x,
-            axes=[0, 1, 3],
-            starts=[minus_3, 0, 2],
-            ends=[3, 100, minus_1],
-            strides=[1, 1, 1])
-        out_4 = paddle.strided_slice(
-            x, axes=[0, 1, 2], starts=starts, ends=ends, strides=strides)
+        starts = fluid.layers.data(name='starts',
+                                   shape=[3],
+                                   dtype='int32',
+                                   append_batch_size=False)
+        ends = fluid.layers.data(name='ends',
+                                 shape=[3],
+                                 dtype='int32',
+                                 append_batch_size=False)
+        strides = fluid.layers.data(name='strides',
+                                    shape=[3],
+                                    dtype='int32',
+                                    append_batch_size=False)
+
+        x = fluid.layers.data(name="x",
+                              shape=[3, 4, 5, 6],
+                              append_batch_size=False,
+                              dtype="float64")
+        out_1 = paddle.strided_slice(x,
+                                     axes=[0, 1, 2],
+                                     starts=[-3, 0, 2],
+                                     ends=[3, 100, -1],
+                                     strides=[1, 1, 1])
+        out_2 = paddle.strided_slice(x,
+                                     axes=[0, 1, 3],
+                                     starts=[minus_3, 0, 2],
+                                     ends=[3, 100, -1],
+                                     strides=[1, 1, 1])
+        out_3 = paddle.strided_slice(x,
+                                     axes=[0, 1, 3],
+                                     starts=[minus_3, 0, 2],
+                                     ends=[3, 100, minus_1],
+                                     strides=[1, 1, 1])
+        out_4 = paddle.strided_slice(x,
+                                     axes=[0, 1, 2],
+                                     starts=starts,
+                                     ends=ends,
+                                     strides=strides)
 
         out_5 = x[-3:3, 0:100:2, -1:2:-1]
         out_6 = x[minus_3:3:1, 0:100:2, :, minus_1:2:minus_1]
@@ -583,16 +620,19 @@ def test_dygraph_op(self):
         starts = [-3, 0, 2]
         ends = [3, 2, 4]
         strides_1 = [1, 1, 1]
-        sliced_1 = paddle.strided_slice(
-            x, axes=axes, starts=starts, ends=ends, strides=strides_1)
+        sliced_1 = paddle.strided_slice(x,
+                                        axes=axes,
+                                        starts=starts,
+                                        ends=ends,
+                                        strides=strides_1)
         assert sliced_1.shape == (3, 2, 2, 2)
 
     @unittest.skipIf(not paddle.is_compiled_with_cuda(),
                      "Cannot use CUDAPinnedPlace in CPU only version")
     def test_cuda_pinned_place(self):
         with paddle.fluid.dygraph.guard():
-            x = paddle.to_tensor(
-                np.random.randn(2, 10), place=paddle.CUDAPinnedPlace())
+            x = paddle.to_tensor(np.random.randn(2, 10),
+                                 place=paddle.CUDAPinnedPlace())
             self.assertTrue(x.place.is_cuda_pinned_place())
             y = x[:, ::2]
             self.assertFalse(x.place.is_cuda_pinned_place())
@@ -600,14 +640,14 @@ def test_cuda_pinned_place(self):
 
 
 class ArrayLayer(paddle.nn.Layer):
+
     def __init__(self, input_size=224, output_size=10, array_size=1):
         super(ArrayLayer, self).__init__()
         self.input_size = input_size
         self.output_size = output_size
         self.array_size = array_size
         for i in range(self.array_size):
-            setattr(self,
-                    self.create_name(i),
+            setattr(self, self.create_name(i),
                     paddle.nn.Linear(input_size, output_size))
 
     def create_name(self, index):
@@ -664,6 +704,7 @@ def create_tensor_array(self, tensors):
 
 
 class TestStridedSliceTensorArray(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
@@ -677,9 +718,9 @@ def grad_equal(self, g1, g2):
     def is_grads_equal(self, g1, g2):
         for i, g in enumerate(g1):
 
-            self.assertTrue(
-                self.grad_equal(g, g2[i]),
-                msg="gradient_1:\n{} \ngradient_2:\n{}".format(g, g2))
+            self.assertTrue(self.grad_equal(g, g2[i]),
+                            msg="gradient_1:\n{} \ngradient_2:\n{}".format(
+                                g, g2))
 
     def is_grads_equal_zeros(self, grads):
         for g in grads:
@@ -717,20 +758,23 @@ def test_strided_slice_tensor_array_cuda_pinned_place(self):
             with paddle.fluid.dygraph.guard():
 
                 class Simple(paddle.nn.Layer):
+
                     def __init__(self):
                         super(Simple, self).__init__()
 
                     def forward(self, inps):
                         tensor_array = None
                         for i, tensor in enumerate(inps):
-                            index = paddle.full(
-                                shape=[1], dtype='int64', fill_value=i)
+                            index = paddle.full(shape=[1],
+                                                dtype='int64',
+                                                fill_value=i)
                             if tensor_array is None:
                                 tensor_array = paddle.tensor.array_write(
                                     tensor, i=index)
                             else:
-                                paddle.tensor.array_write(
-                                    tensor, i=index, array=tensor_array)
+                                paddle.tensor.array_write(tensor,
+                                                          i=index,
+                                                          array=tensor_array)
 
                         array1 = paddle.concat(tensor_array)
                         array2 = paddle.concat(tensor_array[::-1])
@@ -739,14 +783,12 @@ def forward(self, inps):
                 net = Simple()
                 func = paddle.jit.to_static(net.forward)
 
-                inps1 = paddle.to_tensor(
-                    np.random.randn(2, 10),
-                    place=paddle.CUDAPinnedPlace(),
-                    stop_gradient=False)
-                inps2 = paddle.to_tensor(
-                    np.random.randn(2, 10),
-                    place=paddle.CUDAPinnedPlace(),
-                    stop_gradient=False)
+                inps1 = paddle.to_tensor(np.random.randn(2, 10),
+                                         place=paddle.CUDAPinnedPlace(),
+                                         stop_gradient=False)
+                inps2 = paddle.to_tensor(np.random.randn(2, 10),
+                                         place=paddle.CUDAPinnedPlace(),
+                                         stop_gradient=False)
 
                 self.assertTrue(inps1.place.is_cuda_pinned_place())
                 self.assertTrue(inps2.place.is_cuda_pinned_place())
@@ -756,163 +798,191 @@ def forward(self, inps):
                 self.assertFalse(result.place.is_cuda_pinned_place())
 
     def test_strided_slice_tensor_array(self):
+
         class Net01(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[::-1]
 
         self.create_case(Net01(array_size=10))
 
         class Net02(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[::-2]
 
         self.create_case(Net02(input_size=112, array_size=11))
 
         class Net03(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[::-3]
 
         self.create_case(Net03(input_size=112, array_size=9))
 
         class Net04(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[1::-4]
 
         self.create_case(Net04(input_size=112, array_size=9))
 
         class Net05(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[:7:-4]
 
         self.create_case(Net05(input_size=112, array_size=9))
 
         class Net06(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[8:0:-4]
 
         self.create_case(Net06(input_size=112, array_size=9))
 
         class Net07(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[8:1:-4]
 
         self.create_case(Net07(input_size=112, array_size=9))
 
         class Net08(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[::2]
 
         self.create_case(Net08(input_size=112, array_size=11))
 
         class Net09(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[::3]
 
         self.create_case(Net09(input_size=112, array_size=9))
 
         class Net10(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[1::4]
 
         self.create_case(Net10(input_size=112, array_size=9))
 
         class Net11(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[:8:4]
 
         self.create_case(Net11(input_size=112, array_size=9))
 
         class Net12(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[1:8:4]
 
         self.create_case(Net12(input_size=112, array_size=9))
 
         class Net13(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[8:10:4]
 
         self.create_case(Net13(input_size=112, array_size=13))
 
         class Net14(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[3:10:4]
 
         self.create_case(Net14(input_size=112, array_size=13))
 
         class Net15(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[2:10:4]
 
         self.create_case(Net15(input_size=112, array_size=13))
 
         class Net16(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[3:10:3]
 
         self.create_case(Net16(input_size=112, array_size=13))
 
         class Net17(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[3:15:3]
 
         self.create_case(Net17(input_size=112, array_size=13))
 
         class Net18(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[0:15:3]
 
         self.create_case(Net18(input_size=112, array_size=13))
 
         class Net19(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-1:-5:-3]
 
         self.create_case(Net19(input_size=112, array_size=13))
 
         class Net20(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-1:-6:-3]
 
         self.create_case(Net20(input_size=112, array_size=13))
 
         class Net21(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-3:-6:-3]
 
         self.create_case(Net21(input_size=112, array_size=13))
 
         class Net22(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-5:-1:3]
 
         self.create_case(Net22(input_size=112, array_size=13))
 
         class Net23(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-6:-1:3]
 
         self.create_case(Net23(input_size=112, array_size=13))
 
         class Net24(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-6:-3:3]
 
         self.create_case(Net24(input_size=112, array_size=13))
 
         class Net25(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[0::3]
 
         self.create_case(Net25(input_size=112, array_size=13))
 
         class Net26(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-60:20:3]
 
         self.create_case(Net26(input_size=112, array_size=13))
 
         class Net27(ArrayLayer):
+
             def array_slice(self, tensors):
                 return tensors[-3:-60:-3]
 
diff --git a/python/paddle/fluid/tests/unittests/test_subtract_op.py b/python/paddle/fluid/tests/unittests/test_subtract_op.py
index 7f3738960c550..d7d9d3c8e253b 100644
--- a/python/paddle/fluid/tests/unittests/test_subtract_op.py
+++ b/python/paddle/fluid/tests/unittests/test_subtract_op.py
@@ -21,6 +21,7 @@
 
 
 class ApiSubtractTest(unittest.TestCase):
+
     def setUp(self):
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
@@ -47,8 +48,10 @@ def test_static_api(self):
             data_y = paddle.static.data("y", shape=[10, 15], dtype="float32")
             result_max = paddle.subtract(data_x, data_y)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "y": self.input_y},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "y": self.input_y
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected1))
 
@@ -58,8 +61,10 @@ def test_static_api(self):
             data_z = paddle.static.data("z", shape=[15], dtype="float32")
             result_max = paddle.subtract(data_x, data_z)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"x": self.input_x,
-                                 "z": self.input_z},
+            res, = exe.run(feed={
+                "x": self.input_x,
+                "z": self.input_z
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected2))
 
@@ -69,8 +74,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_max = paddle.subtract(data_a, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"a": self.input_a,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "a": self.input_a,
+                "c": self.input_c
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected3))
 
@@ -80,8 +87,10 @@ def test_static_api(self):
             data_c = paddle.static.data("c", shape=[3], dtype="int64")
             result_max = paddle.subtract(data_b, data_c)
             exe = paddle.static.Executor(self.place)
-            res, = exe.run(feed={"b": self.input_b,
-                                 "c": self.input_c},
+            res, = exe.run(feed={
+                "b": self.input_b,
+                "c": self.input_c
+            },
                            fetch_list=[result_max])
         self.assertTrue(np.allclose(res, self.np_expected4))
 
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 6f625c097979b..9d1a4cf19eb07 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -22,13 +22,15 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
-from paddle.fluid.tests.unittests.op_test import (
-    OpTest, convert_float_to_uint16, convert_uint16_to_float)
+from paddle.fluid.tests.unittests.op_test import (OpTest,
+                                                  convert_float_to_uint16,
+                                                  convert_uint16_to_float)
 from paddle import _C_ops
 from paddle.fluid.framework import _test_eager_guard
 
 
 class TestSumOp(OpTest):
+
     def setUp(self):
         self.op_type = "sum"
         self.init_kernel_type()
@@ -53,6 +55,7 @@ def test_check_grad(self):
 
 
 class TestSelectedRowsSumOp(unittest.TestCase):
+
     def setUp(self):
         self.height = 10
         self.row_numel = 12
@@ -144,6 +147,7 @@ def test_w_is_selected_rows(self):
 
 
 class TestSelectedRowsSumOpInt(TestSelectedRowsSumOp):
+
     def init_kernel_type(self):
         self.dtype = np.int32
 
@@ -151,6 +155,7 @@ def init_kernel_type(self):
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
+
     def setUp(self):
         self.height = 10
         self.row_numel = 12
@@ -158,8 +163,8 @@ def setUp(self):
         self.dtype = np.uint16
         self.init_kernel_type()
         np.random.seed(12345)
-        self.data = np.random.random((len(self.rows),
-                                      self.row_numel)).astype(np.float32)
+        self.data = np.random.random(
+            (len(self.rows), self.row_numel)).astype(np.float32)
 
     def _get_array(self, rows, row_numel):
         if len(rows) > 0:
@@ -211,11 +216,13 @@ def test_w_is_selected_rows(self):
 
 
 class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op):
+
     def init_kernel_type(self):
         self.row_numel = 102
 
 
 class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
+
     def setUp(self):
         self.height = 10
         self.row_numel = 12
@@ -246,11 +253,12 @@ def check_with_place(self, place, inplace):
         out_t = np.array(out)
         self.assertEqual(out_t.shape[0], self.height)
         self.assertTrue(
-            np.array_equal(out_t,
-                           self._get_array([i for i in range(
-                               self.height)], self.row_numel) * np.tile(
-                                   np.array(result).reshape(self.height, 1),
-                                   self.row_numel)))
+            np.array_equal(
+                out_t,
+                self._get_array([i
+                                 for i in range(self.height)], self.row_numel) *
+                np.tile(
+                    np.array(result).reshape(self.height, 1), self.row_numel)))
 
     def create_lod_tensor(self, scope, place, var_name):
         var = scope.var(var_name)
@@ -265,6 +273,7 @@ def create_lod_tensor(self, scope, place, var_name):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestFP16SumOp(TestSumOp):
+
     def init_kernel_type(self):
         self.dtype = np.float16
 
@@ -282,9 +291,11 @@ def test_check_grad(self):
 
 
 def create_test_sum_fp16_class(parent):
+
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "core is not compiled with CUDA")
     class TestSumFp16Case(parent):
+
         def init_kernel_type(self):
             self.dtype = np.float16
 
@@ -301,6 +312,7 @@ def test_w_is_selected_rows(self):
 
 #----------- test bf16 -----------
 class TestSumBF16Op(OpTest):
+
     def setUp(self):
         self.op_type = "sum"
         self.init_kernel_type()
@@ -326,12 +338,15 @@ def test_check_grad(self):
 
 
 class API_Test_Add_n(unittest.TestCase):
+
     def test_api(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input0 = fluid.layers.fill_constant(
-                shape=[2, 3], dtype='int64', value=5)
-            input1 = fluid.layers.fill_constant(
-                shape=[2, 3], dtype='int64', value=3)
+            input0 = fluid.layers.fill_constant(shape=[2, 3],
+                                                dtype='int64',
+                                                value=5)
+            input1 = fluid.layers.fill_constant(shape=[2, 3],
+                                                dtype='int64',
+                                                value=3)
             expected_result = np.empty((2, 3))
             expected_result.fill(8)
             sum_value = paddle.add_n([input0, input1])
@@ -371,7 +386,9 @@ def test_dygraph_final_state_api(self):
 
 
 class TestRaiseSumError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.sum([11, 22])
 
@@ -392,7 +409,9 @@ def test_dtype1():
 
 
 class TestRaiseSumsError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.sums([11, 22])
 
@@ -428,7 +447,9 @@ def test_out_dtype():
 
 
 class TestSumOpError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_empty_list_input():
             with fluid.dygraph.guard():
                 fluid._C_ops.sum([])
diff --git a/python/paddle/fluid/tests/unittests/test_svd_op.py b/python/paddle/fluid/tests/unittests/test_svd_op.py
index c2d712b3d7e65..ef9bbae6b81dd 100644
--- a/python/paddle/fluid/tests/unittests/test_svd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_svd_op.py
@@ -26,6 +26,7 @@
 
 
 class TestSvdOp(OpTest):
+
     def setUp(self):
         paddle.enable_static()
         self.generate_input()
@@ -102,11 +103,12 @@ def generate_input(self):
             vander matrix must be a full rank matrix.
         """
         self._input_shape = (5, 5)
-        self._input_data = np.vander(
-            [2, 3, 4, 5, 6]).astype("float64").reshape(self._input_shape)
+        self._input_data = np.vander([2, 3, 4, 5, 6]).astype("float64").reshape(
+            self._input_shape)
 
 
 class TestSvdNormalMatrixSmall(TestSvdCheckGrad2):
+
     def generate_input(self):
         """ small matrix SVD. 
         """
@@ -115,37 +117,40 @@ def generate_input(self):
 
 
 class TestSvdNormalMatrix6x3(TestSvdCheckGrad2):
+
     def generate_input(self):
         """ return a deterministic  matrix, the range matrix; 
             vander matrix must be a full rank matrix.
         """
         self._input_shape = (6, 3)
-        self._input_data = np.array(
-            [[1.0, 2.0, 3.0], [0.0, 1.0, 5.0], [0.0, 0.0, 6.0],
-             [2.0, 4.0, 9.0], [3.0, 6.0, 8.0],
-             [3.0, 1.0, 0.0]]).astype("float64")
+        self._input_data = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.0],
+                                     [0.0, 0.0, 6.0], [2.0, 4.0, 9.0],
+                                     [3.0, 6.0, 8.0], [3.0, 1.0,
+                                                       0.0]]).astype("float64")
 
 
 class TestSvdNormalMatrix3x6(TestSvdCheckGrad2):
+
     def generate_input(self):
         """ return a deterministic  matrix, the range matrix; 
             vander matrix must be a full rank matrix.
         """
         self._input_shape = (3, 6)
-        self._input_data = np.array(
-            [[1.0, 2.0, 3.0], [0.0, 1.0, 5.0], [0.0, 0.0, 6.0],
-             [2.0, 4.0, 9.0], [3.0, 6.0, 8.0],
-             [3.0, 1.0, 0.0]]).astype("float64")
+        self._input_data = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.0],
+                                     [0.0, 0.0, 6.0], [2.0, 4.0, 9.0],
+                                     [3.0, 6.0, 8.0], [3.0, 1.0,
+                                                       0.0]]).astype("float64")
         self._input_data = self._input_data.transpose((-1, -2))
 
 
 class TestSvdNormalMatrix6x3Batched(TestSvdOp):
+
     def generate_input(self):
         self._input_shape = (10, 6, 3)
-        self._input_data = np.array(
-            [[1.0, 2.0, 3.0], [0.0, 1.0, 5.0], [0.0, 0.0, 6.0],
-             [2.0, 4.0, 9.0], [3.0, 6.0, 8.0],
-             [3.0, 1.0, 0.0]]).astype("float64")
+        self._input_data = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.0],
+                                     [0.0, 0.0, 6.0], [2.0, 4.0, 9.0],
+                                     [3.0, 6.0, 8.0], [3.0, 1.0,
+                                                       0.0]]).astype("float64")
         self._input_data = np.stack([self._input_data] * 10, axis=0)
 
     def test_svd_forward(self):
@@ -155,15 +160,16 @@ def test_svd_forward(self):
 
 
 class TestSvdNormalMatrix3x6Batched(TestSvdOp):
+
     def generate_input(self):
         """ return a deterministic  matrix, the range matrix; 
             vander matrix must be a full rank matrix.
         """
         self._input_shape = (10, 3, 6)
-        self._input_data = np.array(
-            [[1.0, 2.0, 3.0], [0.0, 1.0, 5.0], [0.0, 0.0, 6.0],
-             [2.0, 4.0, 9.0], [3.0, 6.0, 8.0],
-             [3.0, 1.0, 0.0]]).astype("float64")
+        self._input_data = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.0],
+                                     [0.0, 0.0, 6.0], [2.0, 4.0, 9.0],
+                                     [3.0, 6.0, 8.0], [3.0, 1.0,
+                                                       0.0]]).astype("float64")
         self._input_data = self._input_data.transpose((-1, -2))
         self._input_data = np.stack([self._input_data] * 10, axis=0)
 
@@ -174,15 +180,16 @@ def test_svd_forward(self):
 
 
 class TestSvdNormalMatrix3x3x3x6Batched(TestSvdOp):
+
     def generate_input(self):
         """ return a deterministic  matrix, the range matrix; 
             vander matrix must be a full rank matrix.
         """
         self._input_shape = (3, 3, 3, 6)
-        self._input_data = np.array(
-            [[1.0, 2.0, 3.0], [0.0, 1.0, 5.0], [0.0, 0.0, 6.0],
-             [2.0, 4.0, 9.0], [3.0, 6.0, 8.0],
-             [3.0, 1.0, 0.0]]).astype("float64")
+        self._input_data = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.0],
+                                     [0.0, 0.0, 6.0], [2.0, 4.0, 9.0],
+                                     [3.0, 6.0, 8.0], [3.0, 1.0,
+                                                       0.0]]).astype("float64")
         self._input_data = self._input_data.transpose((-1, -2))
         self._input_data = np.stack(
             [self._input_data, self._input_data, self._input_data], axis=0)
@@ -198,6 +205,7 @@ def test_svd_forward(self):
 @skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
                     "however it is desirable to cover the forward pass")
 class TestSvdNormalMatrixBig(TestSvdOp):
+
     def generate_input(self):
         """ big matrix SVD. 
             
@@ -215,6 +223,7 @@ def test_check_grad(self):
 
 
 class TestSvdNormalMatrixBig2(TestSvdOp):
+
     def generate_input(self):
         """ big matrix SVD. 
         """
@@ -223,6 +232,7 @@ def generate_input(self):
 
 
 class TestSvdNormalMatrixFullMatrices(unittest.TestCase):
+
     def setUp(self):
         paddle.disable_static()
 
@@ -242,6 +252,7 @@ def test_full_matrices(self):
 
 
 class TestSvdFullMatriceGrad(TestSvdNormalMatrix6x3):
+
     def get_full_matrices_option(self):
         return True
 
@@ -260,6 +271,7 @@ def test_check_grad(self):
 
 
 class TestSvdAPI(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
         a = np.random.rand(5, 5)
@@ -276,8 +288,9 @@ def test_static(self):
         for place in places:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 a = np.random.rand(5, 5)
-                x = paddle.fluid.data(
-                    name="input", shape=[5, 5], dtype='float64')
+                x = paddle.fluid.data(name="input",
+                                      shape=[5, 5],
+                                      dtype='float64')
                 u, s, vh = paddle.linalg.svd(x)
                 exe = fluid.Executor(place)
                 gt_u, gt_s, gt_vh = np.linalg.svd(a, full_matrices=False)
diff --git a/python/paddle/fluid/tests/unittests/test_switch.py b/python/paddle/fluid/tests/unittests/test_switch.py
index b9f3c804ef357..9d28615f71b76 100644
--- a/python/paddle/fluid/tests/unittests/test_switch.py
+++ b/python/paddle/fluid/tests/unittests/test_switch.py
@@ -24,6 +24,7 @@
 
 
 class TestSwitch(unittest.TestCase):
+
     def check_switch(self, value):
         x = layers.fill_constant(shape=[1], dtype='float32', value=value)
         zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
@@ -31,8 +32,10 @@ def check_switch(self, value):
         two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
         three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)
 
-        result = layers.create_global_var(
-            shape=[1], value=-1.0, dtype='float32', persistable=True)
+        result = layers.create_global_var(shape=[1],
+                                          value=-1.0,
+                                          dtype='float32',
+                                          persistable=True)
 
         with layers.Switch() as switch:
             with switch.case(layers.less_than(x, zero_var)):
@@ -62,16 +65,20 @@ def test_switch(self):
 
 
 class TestSwitchCaseError(unittest.TestCase):
+
     def test_error(self):
         main_program = framework.Program()
         startup_program = framework.Program()
         with framework.program_guard(main_program, startup_program):
             cond = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
-            zero_var = layers.fill_constant(
-                shape=[1], dtype='float32', value=0.0)
-
-            result = layers.create_global_var(
-                shape=[1], value=-1.0, dtype='float32', persistable=True)
+            zero_var = layers.fill_constant(shape=[1],
+                                            dtype='float32',
+                                            value=0.0)
+
+            result = layers.create_global_var(shape=[1],
+                                              value=-1.0,
+                                              dtype='float32',
+                                              persistable=True)
 
             # 1. The type of 'condition' in case must be Variable.
             def test_condition_type():
diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py
index 0049a922b9166..a22df61ace8c7 100644
--- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py
+++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class SimpleNet(paddle.nn.Layer):
+
     def __init__(self):
         super(SimpleNet, self).__init__()
         self.conv = paddle.nn.Conv2D(1, 2, (3, 3))
@@ -50,6 +51,7 @@ def static_program(net, data):
 
 
 class TestAutoTune(unittest.TestCase):
+
     def set_flags(self, enable_autotune):
         if paddle.is_compiled_with_cuda():
             if enable_autotune:
@@ -97,6 +99,7 @@ def check_status(self, expected_res):
 
 
 class TestDygraphAutoTuneStatus(TestAutoTune):
+
     def run_program(self, enable_autotune):
         self.set_flags(enable_autotune)
         if enable_autotune:
@@ -135,6 +138,7 @@ def test_disable_autotune(self):
 
 
 class TestStaticAutoTuneStatus(TestAutoTune):
+
     def run_program(self, enable_autotune):
         paddle.enable_static()
 
@@ -142,8 +146,9 @@ def run_program(self, enable_autotune):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            data = paddle.static.data(
-                name='X', shape=data_shape, dtype='float32')
+            data = paddle.static.data(name='X',
+                                      shape=data_shape,
+                                      dtype='float32')
             net = SimpleNet()
             loss = static_program(net, data)
         place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda(
@@ -188,6 +193,7 @@ def test_disable_autotune(self):
 
 
 class TestAutoTuneAPI(unittest.TestCase):
+
     def test_set_config_warnings(self):
         with warnings.catch_warnings(record=True) as w:
             config = {"kernel": {"enable": 1, "tuning_range": 1}}
diff --git a/python/paddle/fluid/tests/unittests/test_switch_case.py b/python/paddle/fluid/tests/unittests/test_switch_case.py
index 598e415e5fbef..814e46fb341a7 100644
--- a/python/paddle/fluid/tests/unittests/test_switch_case.py
+++ b/python/paddle/fluid/tests/unittests/test_switch_case.py
@@ -25,7 +25,9 @@
 
 
 class TestAPISwitchCase(unittest.TestCase):
+
     def test_return_single_var(self):
+
         def fn_1():
             return layers.fill_constant(shape=[4, 2], dtype='int32', value=1)
 
@@ -43,68 +45,71 @@ def fn_3():
             index_5 = layers.fill_constant(shape=[1], dtype='int32', value=5)
 
             # call fn_1
-            out_0 = layers.switch_case(
-                branch_index=index_1, branch_fns={1: fn_1,
-                                                  2: fn_2,
-                                                  3: fn_3})
+            out_0 = layers.switch_case(branch_index=index_1,
+                                       branch_fns={
+                                           1: fn_1,
+                                           2: fn_2,
+                                           3: fn_3
+                                       })
 
             # call fn_2 : branch_fns={0: fn_1, 1:fn_2, 2:fn_3}
-            out_1 = layers.switch_case(
-                branch_index=index_1, branch_fns=(fn_1, fn_2, fn_3))
+            out_1 = layers.switch_case(branch_index=index_1,
+                                       branch_fns=(fn_1, fn_2, fn_3))
 
             # call default fn_3
-            out_2 = layers.switch_case(
-                branch_index=index_5,
-                branch_fns=((1, fn_1), (2, fn_2)),
-                default=fn_3)
+            out_2 = layers.switch_case(branch_index=index_5,
+                                       branch_fns=((1, fn_1), (2, fn_2)),
+                                       default=fn_3)
 
             # no default, call fn_2
-            out_3 = layers.switch_case(
-                branch_index=index_2, branch_fns=[(1, fn_1), (2, fn_2)])
+            out_3 = layers.switch_case(branch_index=index_2,
+                                       branch_fns=[(1, fn_1), (2, fn_2)])
 
             # no default, call fn_2 but branch_index is 5
-            out_4 = layers.switch_case(
-                branch_index=index_5,
-                branch_fns=[(1, fn_1), (3, fn_2), (2, fn_3)])
+            out_4 = layers.switch_case(branch_index=index_5,
+                                       branch_fns=[(1, fn_1), (3, fn_2),
+                                                   (2, fn_3)])
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             res = exe.run(main_program,
                           fetch_list=[out_0, out_1, out_2, out_3, out_4])
 
-            self.assertTrue(
-                np.allclose(res[0], 1),
-                "result is {} but answer is {}".format(res[0], 1))
-            self.assertTrue(
-                np.allclose(res[1], 2),
-                "result is {} but answer is {}".format(res[0], 2))
-            self.assertTrue(
-                np.allclose(res[2], 3),
-                "result is {} but answer is {}".format(res[0], 3))
-            self.assertTrue(
-                np.allclose(res[3], 2),
-                "result is {} but answer is {}".format(res[0], 2))
-            self.assertTrue(
-                np.allclose(res[4], 2),
-                "result is {} but answer is {}".format(res[0], 2))
+            self.assertTrue(np.allclose(res[0], 1),
+                            "result is {} but answer is {}".format(res[0], 1))
+            self.assertTrue(np.allclose(res[1], 2),
+                            "result is {} but answer is {}".format(res[1], 2))
+            self.assertTrue(np.allclose(res[2], 3),
+                            "result is {} but answer is {}".format(res[2], 3))
+            self.assertTrue(np.allclose(res[3], 2),
+                            "result is {} but answer is {}".format(res[3], 2))
+            self.assertTrue(np.allclose(res[4], 2),
+                            "result is {} but answer is {}".format(res[4], 2))
 
     def test_return_var_tuple(self):
+
         def fn_1():
-            return layers.fill_constant(
-                shape=[1, 2], dtype='int32', value=1), layers.fill_constant(
-                    shape=[2, 3], dtype='float32', value=2)
+            return layers.fill_constant(shape=[1, 2], dtype='int32',
+                                        value=1), layers.fill_constant(
+                                            shape=[2, 3],
+                                            dtype='float32',
+                                            value=2)
 
         def fn_2():
-            return layers.fill_constant(
-                shape=[3, 4], dtype='int32', value=3), layers.fill_constant(
-                    shape=[4, 5], dtype='float32', value=4)
+            return layers.fill_constant(shape=[3, 4], dtype='int32',
+                                        value=3), layers.fill_constant(
+                                            shape=[4, 5],
+                                            dtype='float32',
+                                            value=4)
 
         def fn_3():
-            return layers.fill_constant(
-                shape=[5], dtype='int32', value=5), layers.fill_constant(
-                    shape=[5, 6], dtype='float32', value=6)
+            return layers.fill_constant(shape=[5], dtype='int32',
+                                        value=5), layers.fill_constant(
+                                            shape=[5, 6],
+                                            dtype='float32',
+                                            value=6)
 
         main_program = Program()
         startup_program = Program()
@@ -113,61 +118,64 @@ def fn_3():
 
             out = layers.switch_case(index_1, ((1, fn_1), (2, fn_2)), fn_3)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
             ret = exe.run(main_program, fetch_list=out)
 
             self.assertTrue(
                 np.allclose(np.asarray(ret[0]), np.full((1, 2), 1, np.int32)))
             self.assertTrue(
-                np.allclose(
-                    np.asarray(ret[1]), np.full((2, 3), 2, np.float32)))
+                np.allclose(np.asarray(ret[1]), np.full((2, 3), 2, np.float32)))
 
 
 class TestAPISwitchCase_Nested(unittest.TestCase):
+
     def test_nested_switch_case(self):
+
         def fn_1(x=1):
-            out = layers.switch_case(
-                branch_index=layers.fill_constant(
-                    shape=[1], dtype='int32', value=x),
-                branch_fns={
-                    1: partial(
-                        layers.fill_constant, shape=[1], dtype='int32',
-                        value=1),
-                    x: partial(
-                        layers.fill_constant, shape=[2], dtype='int32', value=x)
-                })
+            out = layers.switch_case(branch_index=layers.fill_constant(
+                shape=[1], dtype='int32', value=x),
+                                     branch_fns={
+                                         1:
+                                         partial(layers.fill_constant,
+                                                 shape=[1],
+                                                 dtype='int32',
+                                                 value=1),
+                                         x:
+                                         partial(layers.fill_constant,
+                                                 shape=[2],
+                                                 dtype='int32',
+                                                 value=x)
+                                     })
             return out
 
         def fn_2(x=2):
-            out = layers.switch_case(
-                branch_index=layers.fill_constant(
-                    shape=[1], dtype='int32', value=2),
-                branch_fns={
-                    1: partial(
-                        layers.fill_constant,
-                        shape=[4, 3],
-                        dtype='int32',
-                        value=1),
-                    2: partial(
-                        fn_1, x=x)
-                })
+            out = layers.switch_case(branch_index=layers.fill_constant(
+                shape=[1], dtype='int32', value=2),
+                                     branch_fns={
+                                         1:
+                                         partial(layers.fill_constant,
+                                                 shape=[4, 3],
+                                                 dtype='int32',
+                                                 value=1),
+                                         2:
+                                         partial(fn_1, x=x)
+                                     })
             return out
 
         def fn_3():
-            out = layers.switch_case(
-                branch_index=layers.fill_constant(
-                    shape=[1], dtype='int32', value=3),
-                branch_fns={
-                    1: partial(
-                        layers.fill_constant,
-                        shape=[4, 3],
-                        dtype='int32',
-                        value=1),
-                    3: partial(
-                        fn_2, x=3)
-                })
+            out = layers.switch_case(branch_index=layers.fill_constant(
+                shape=[1], dtype='int32', value=3),
+                                     branch_fns={
+                                         1:
+                                         partial(layers.fill_constant,
+                                                 shape=[4, 3],
+                                                 dtype='int32',
+                                                 value=1),
+                                         3:
+                                         partial(fn_2, x=3)
+                                     })
             return out
 
         main_program = Program()
@@ -177,43 +185,47 @@ def fn_3():
             index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2)
             index_3 = layers.fill_constant(shape=[1], dtype='int64', value=3)
 
-            out_1 = layers.switch_case(
-                branch_index=index_1, branch_fns={1: fn_1,
-                                                  2: fn_2,
-                                                  3: fn_3})
-            out_2 = layers.switch_case(
-                branch_index=index_2, branch_fns={1: fn_1,
-                                                  2: fn_2,
-                                                  3: fn_3})
-
-            out_3 = layers.switch_case(
-                branch_index=index_3, branch_fns={1: fn_1,
-                                                  2: fn_2,
-                                                  3: fn_3})
-
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            out_1 = layers.switch_case(branch_index=index_1,
+                                       branch_fns={
+                                           1: fn_1,
+                                           2: fn_2,
+                                           3: fn_3
+                                       })
+            out_2 = layers.switch_case(branch_index=index_2,
+                                       branch_fns={
+                                           1: fn_1,
+                                           2: fn_2,
+                                           3: fn_3
+                                       })
+
+            out_3 = layers.switch_case(branch_index=index_3,
+                                       branch_fns={
+                                           1: fn_1,
+                                           2: fn_2,
+                                           3: fn_3
+                                       })
+
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             res = exe.run(main_program,
-                          feed={"index_1": np.array(
-                              [1], dtype="uint8")},
+                          feed={"index_1": np.array([1], dtype="uint8")},
                           fetch_list=[out_1, out_2, out_3])
 
-            self.assertTrue(
-                np.allclose(res[0], 1),
-                "result is {} but answer is {}".format(res[0], 1))
-            self.assertTrue(
-                np.allclose(res[1], 2),
-                "result is {} but answer is {}".format(res[1], 2))
-            self.assertTrue(
-                np.allclose(res[2], 3),
-                "result is {} but answer is {}".format(res[2], 3))
+            self.assertTrue(np.allclose(res[0], 1),
+                            "result is {} but answer is {}".format(res[0], 1))
+            self.assertTrue(np.allclose(res[1], 2),
+                            "result is {} but answer is {}".format(res[1], 2))
+            self.assertTrue(np.allclose(res[2], 3),
+                            "result is {} but answer is {}".format(res[2], 3))
 
 
 # test TypeError and ValueError of api switch_case
 class TestAPISwitchCase_Error(unittest.TestCase):
+
     def test_error(self):
+
         def fn_1():
             return layers.fill_constant(shape=[4, 2], dtype='int32', value=1)
 
@@ -226,81 +238,82 @@ def fn_3():
         main_program = Program()
         startup_program = Program()
         with program_guard(main_program, startup_program):
-            key_float32 = layers.fill_constant(
-                shape=[1], dtype='float32', value=0.23)
-            key_int32 = layers.fill_constant(
-                shape=[1], dtype='int32', value=0.23)
+            key_float32 = layers.fill_constant(shape=[1],
+                                               dtype='float32',
+                                               value=0.23)
+            key_int32 = layers.fill_constant(shape=[1],
+                                             dtype='int32',
+                                             value=0.23)
 
             # The type of 'branch_index' in Op(switch_case) must be Variable
             def type_error_branch_index():
-                layers.switch_case(
-                    branch_index=1, branch_fns=[(1, fn_1)], default=fn_3)
+                layers.switch_case(branch_index=1,
+                                   branch_fns=[(1, fn_1)],
+                                   default=fn_3)
 
             self.assertRaises(TypeError, type_error_branch_index)
 
             # The data type of 'branch_index' in Op(switch_case) must be int32, int64 or uint8
             def dtype_error_branch_index():
-                layers.switch_case(
-                    branch_index=key_float32,
-                    branch_fns=[(1, fn_1)],
-                    default=fn_3)
+                layers.switch_case(branch_index=key_float32,
+                                   branch_fns=[(1, fn_1)],
+                                   default=fn_3)
 
             self.assertRaises(TypeError, dtype_error_branch_index)
 
             # The type of 'branch_fns' in Op(switch_case) must be list, tuple or dict
             def type_error_branch_fns():
-                layers.switch_case(
-                    branch_index=key_int32, branch_fns=1, default=fn_3)
+                layers.switch_case(branch_index=key_int32,
+                                   branch_fns=1,
+                                   default=fn_3)
 
             self.assertRaises(TypeError, type_error_branch_fns)
 
             # The elements' type of 'branch_fns' in Op(switch_case) must be tuple
             def type_error_index_fn_pair_1():
-                layers.switch_case(
-                    branch_index=key_int32, branch_fns=[1], default=fn_3)
+                layers.switch_case(branch_index=key_int32,
+                                   branch_fns=[1],
+                                   default=fn_3)
 
             self.assertRaises(TypeError, type_error_index_fn_pair_1)
 
             # The tuple's size of 'branch_fns' in Op(switch_case) must be 2
             def type_error_index_fn_pair_2():
-                layers.switch_case(
-                    branch_index=key_int32,
-                    branch_fns=[(1, 2, 3)],
-                    default=fn_3)
+                layers.switch_case(branch_index=key_int32,
+                                   branch_fns=[(1, 2, 3)],
+                                   default=fn_3)
 
             self.assertRaises(TypeError, type_error_index_fn_pair_2)
 
             # The key's type of 'branch_fns' in Op(switch_case) must be int
             def type_error_key():
-                layers.switch_case(
-                    branch_index=key_int32, branch_fns=[(2.3, 2)], default=fn_3)
+                layers.switch_case(branch_index=key_int32,
+                                   branch_fns=[(2.3, 2)],
+                                   default=fn_3)
 
             self.assertRaises(TypeError, type_error_key)
 
             # The key in 'branch_fns' must be unique
             def value_error_key():
-                layers.switch_case(
-                    branch_index=key_int32,
-                    branch_fns=[(2, fn_1), (2, fn_2)],
-                    default=fn_3)
+                layers.switch_case(branch_index=key_int32,
+                                   branch_fns=[(2, fn_1), (2, fn_2)],
+                                   default=fn_3)
 
             self.assertRaises(ValueError, value_error_key)
 
             # The type of function in 'branch_fns' must be callable
             def type_error_fn():
-                layers.switch_case(
-                    branch_index=key_int32,
-                    branch_fns=[(1, 1), (2, fn_2)],
-                    default=fn_3)
+                layers.switch_case(branch_index=key_int32,
+                                   branch_fns=[(1, 1), (2, fn_2)],
+                                   default=fn_3)
 
             self.assertRaises(TypeError, type_error_fn)
 
             # The default in Op(case) must be callable
             def type_error_default():
-                layers.switch_case(
-                    branch_index=key_int32,
-                    branch_fns=[(1, fn_1), (2, fn_2)],
-                    default=1)
+                layers.switch_case(branch_index=key_int32,
+                                   branch_fns=[(1, fn_1), (2, fn_2)],
+                                   default=1)
 
             self.assertRaises(TypeError, type_error_default)
 
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index 6bf811be2ad0d..06da617f26f52 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -73,11 +73,10 @@ def _build_program(self,
         use_cudnn = self.dtype == np.float16
         with fluid.unique_name.guard():
             with fluid.program_guard(main, startup):
-                data = fluid.layers.data(
-                    name='input',
-                    shape=self.dshape,
-                    dtype=self.dtype,
-                    append_batch_size=False)
+                data = fluid.layers.data(name='input',
+                                         shape=self.dshape,
+                                         dtype=self.dtype,
+                                         append_batch_size=False)
                 conv = fluid.layers.conv2d(
                     input=data,
                     num_filters=32,
@@ -170,8 +169,7 @@ def _compare(self, place, layout, only_forward):
             if sync_bn_val.shape != bn_val.shape:
                 sync_bn_val = sync_bn_val[:bn_val.shape[0]]
             self.assertTrue(
-                np.allclose(
-                    bn_val, sync_bn_val, atol=self.atol),
+                np.allclose(bn_val, sync_bn_val, atol=self.atol),
                 "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN     " +
                 str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
 
@@ -211,14 +209,15 @@ def setUp(self):
 
 
 class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
+
     def test_errors(self):
         if not core.is_compiled_with_cuda():
             return
 
         with program_guard(Program(), Program()):
             my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.CUDAPlace(0))
             self.assertRaises(TypeError, my_sync_batch_norm, x1)
 
             # the input dtype of SyncBatchNorm must be float16 or float32 or float64
@@ -228,17 +227,17 @@ def test_errors(self):
 
 
 class TestConvertSyncBatchNorm(unittest.TestCase):
+
     def test_convert(self):
         if not core.is_compiled_with_cuda():
             return
 
         with program_guard(Program(), Program()):
-            compare_model = paddle.nn.Sequential(
-                paddle.nn.Conv2D(3, 5, 3),
-                paddle.nn.BatchNorm2D(5), paddle.nn.BatchNorm2D(5))
+            compare_model = paddle.nn.Sequential(paddle.nn.Conv2D(3, 5, 3),
+                                                 paddle.nn.BatchNorm2D(5),
+                                                 paddle.nn.BatchNorm2D(5))
             model = paddle.nn.Sequential(
-                paddle.nn.Conv2D(3, 5, 3),
-                paddle.nn.BatchNorm2D(5),
+                paddle.nn.Conv2D(3, 5, 3), paddle.nn.BatchNorm2D(5),
                 paddle.nn.BatchNorm2D(
                     5,
                     weight_attr=fluid.ParamAttr(name='bn.scale'),
@@ -251,11 +250,13 @@ def test_convert(self):
 
 
 class TestConvertSyncBatchNormCast1(unittest.TestCase):
+
     def test_convert(self):
         if not core.is_compiled_with_cuda():
             return
 
         class Net(nn.Layer):
+
             def __init__(self):
                 super(Net, self).__init__()
                 self.conv1 = nn.Conv2D(3, 5, 3)
@@ -280,6 +281,7 @@ def forward(self, x):
 
 
 class TestConvertSyncBatchNormCase2(unittest.TestCase):
+
     def test_convert(self):
         if not core.is_compiled_with_cuda():
             return
@@ -287,6 +289,7 @@ def test_convert(self):
         with fluid.dygraph.guard(fluid.CUDAPlace(0)):
 
             class SyBNNet(paddle.nn.Layer):
+
                 def __init__(self, in_ch=3, out_ch=3, dirate=1):
                     super(SyBNNet, self).__init__()
                     self.bn_s1 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
@@ -295,8 +298,7 @@ def __init__(self, in_ch=3, out_ch=3, dirate=1):
                             weight_attr=paddle.ParamAttr(
                                 regularizer=paddle.regularizer.L2Decay(0.))))
                     self.bn_s2 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
-                        paddle.nn.BatchNorm3D(
-                            out_ch, data_format='NDHWC'))
+                        paddle.nn.BatchNorm3D(out_ch, data_format='NDHWC'))
 
                 def forward(self, x):
                     x = self.bn_s1(x)
@@ -304,6 +306,7 @@ def forward(self, x):
                     return out
 
             class BNNet(paddle.nn.Layer):
+
                 def __init__(self, in_ch=3, out_ch=3, dirate=1):
                     super(BNNet, self).__init__()
                     self.bn_s1 = paddle.nn.BatchNorm3D(
@@ -311,8 +314,7 @@ def __init__(self, in_ch=3, out_ch=3, dirate=1):
                         weight_attr=paddle.ParamAttr(
                             regularizer=paddle.regularizer.L2Decay(0.)))
                     self.bn_s2 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
-                        paddle.nn.BatchNorm3D(
-                            out_ch, data_format='NDHWC'))
+                        paddle.nn.BatchNorm3D(out_ch, data_format='NDHWC'))
 
                 def forward(self, x):
                     x = self.bn_s1(x)
@@ -328,11 +330,12 @@ def forward(self, x):
             sybn_out = sybn_model(x)
             self.assertTrue(
                 np.allclose(bn_out.numpy(), sybn_out.numpy()),
-                "Output has diff. \n" + "\nBN     " + str(bn_out.numpy()) + "\n"
-                + "Sync BN " + str(sybn_out.numpy()))
+                "Output has diff. \n" + "\nBN     " + str(bn_out.numpy()) +
+                "\n" + "Sync BN " + str(sybn_out.numpy()))
 
 
 class TestDygraphSyncBatchNormDataFormatError(unittest.TestCase):
+
     def test_errors(self):
         if not core.is_compiled_with_cuda():
             return
diff --git a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py
index b7650efc8c215..34ca5860a161e 100644
--- a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py
+++ b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py
@@ -26,6 +26,7 @@
 
 
 class TestTakeAlongAxisOp(OpTest):
+
     def setUp(self):
         self.init_data()
         self.op_type = "take_along_axis"
@@ -52,13 +53,14 @@ def init_data(self):
         self.x_type = "float64"
         self.x_shape = (5, 5, 5)
         self.index_type = "int32"
-        self.index = np.array(
-            [[[1]], [[1]], [[2]], [[4]], [[3]]]).astype(self.index_type)
+        self.index = np.array([[[1]], [[1]], [[2]], [[4]],
+                               [[3]]]).astype(self.index_type)
         self.axis = 2
         self.axis_type = "int64"
 
 
 class TestCase1(TestTakeAlongAxisOp):
+
     def init_data(self):
         self.x_type = "float64"
         self.x_shape = (5, 5, 5)
@@ -69,6 +71,7 @@ def init_data(self):
 
 
 class TestTakeAlongAxisAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [3, 3]
@@ -87,8 +90,10 @@ def test_api_static(self):
             index = paddle.fluid.data('Index', self.index_shape, "int64")
             out = paddle.take_along_axis(x, index, self.axis)
             exe = paddle.static.Executor(self.place[0])
-            res = exe.run(feed={'X': self.x_np,
-                                'Index': self.index_np},
+            res = exe.run(feed={
+                'X': self.x_np,
+                'Index': self.index_np
+            },
                           fetch_list=[out])
         out_ref = np.array(
             np.take_along_axis(self.x_np, self.index_np, self.axis))
@@ -107,12 +112,13 @@ def test_api_dygraph(self):
 
 
 class TestTakeAlongAxisAPICase1(TestTakeAlongAxisAPI):
+
     def setUp(self):
         np.random.seed(0)
         self.shape = [2, 2]
         self.index_shape = [4, 2]
-        self.index_np = np.array(
-            [[0, 0], [1, 0], [0, 0], [1, 0]]).astype('int64')
+        self.index_np = np.array([[0, 0], [1, 0], [0, 0], [1,
+                                                           0]]).astype('int64')
         self.x_np = np.random.random(self.shape).astype(np.float32)
         self.place = [paddle.CPUPlace()]
         self.axis = 0
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
index aec219f806394..a283328a237c5 100644
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
@@ -38,8 +38,9 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
         ret_ids = set([i for i in range(num_prior)]) - set(ids)
         l = neg_lod[n]
         neg_ids = random.sample(ret_ids, l)
-        neg_indices[offset:offset + neg_lod[n], :] = np.array(neg_ids).astype(
-            'int32').reshape(l, 1)
+        neg_indices[offset:offset +
+                    neg_lod[n], :] = np.array(neg_ids).astype('int32').reshape(
+                        l, 1)
         offset += neg_lod[n]
 
     return match_indices, neg_indices
@@ -86,6 +87,7 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
 
 
 class TestTargetAssginFloatType(OpTest):
+
     def setUp(self):
         self.op_type = "target_assign"
         num_prior = 120
@@ -97,11 +99,11 @@ def setUp(self):
         num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
-        gt_label = np.random.randint(
-            num_class, size=(num_gt, 1)).astype('int32')
+        gt_label = np.random.randint(num_class,
+                                     size=(num_gt, 1)).astype('int32')
 
-        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
-                                                               gt_lod, neg_lod)
+        match_indices, neg_indices = gen_match_and_neg_indices(
+            num_prior, gt_lod, neg_lod)
 
         out, out_wt, _, _ = target_assign(encoded_box, gt_label, match_indices,
                                           neg_indices, gt_lod, neg_lod,
@@ -124,6 +126,7 @@ def test_check_output(self):
 
 
 class TestTargetAssginIntType(OpTest):
+
     def setUp(self):
         self.op_type = "target_assign"
         num_prior = 120
@@ -135,11 +138,11 @@ def setUp(self):
         num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
-        gt_label = np.random.randint(
-            num_class, size=(num_gt, 1)).astype('int32')
+        gt_label = np.random.randint(num_class,
+                                     size=(num_gt, 1)).astype('int32')
 
-        match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
-                                                               gt_lod, neg_lod)
+        match_indices, neg_indices = gen_match_and_neg_indices(
+            num_prior, gt_lod, neg_lod)
 
         _, _, out, out_wt, = target_assign(encoded_box, gt_label, match_indices,
                                            neg_indices, gt_lod, neg_lod,
diff --git a/python/paddle/fluid/tests/unittests/test_tcp_store.py b/python/paddle/fluid/tests/unittests/test_tcp_store.py
index 11e1e8cd059c8..a051519d634a5 100644
--- a/python/paddle/fluid/tests/unittests/test_tcp_store.py
+++ b/python/paddle/fluid/tests/unittests/test_tcp_store.py
@@ -20,6 +20,7 @@
 
 
 class TestTCPStore(unittest.TestCase):
+
     def test_tcp_store(self):
         store = paddle.fluid.core.TCPStore("127.0.0.1", 6170, True, 1,
                                            datetime.timedelta(0))
diff --git a/python/paddle/fluid/tests/unittests/test_tdm_child_op.py b/python/paddle/fluid/tests/unittests/test_tdm_child_op.py
index af7bbeaab05bc..f2987a2a61402 100644
--- a/python/paddle/fluid/tests/unittests/test_tdm_child_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tdm_child_op.py
@@ -59,14 +59,15 @@ def create_tdm_tree():
 
 
 class TestTDMChildOp(OpTest):
+
     def setUp(self):
         self.__class__.op_type = "tdm_child"
         self.config()
         tree_info = create_tdm_tree()
         tree_info_np = np.array(tree_info).astype(self.info_type)
 
-        x_np = np.random.randint(
-            low=0, high=26, size=self.x_shape).astype(self.x_type)
+        x_np = np.random.randint(low=0, high=26,
+                                 size=self.x_shape).astype(self.x_type)
         children_res = []
         leaf_mask_res = []
         for batch in x_np:
@@ -106,6 +107,7 @@ def test_check_output(self):
 
 
 class TestCase1(TestTDMChildOp):
+
     def config(self):
         """check int int64_t """
         self.x_shape = (10, 20)
@@ -115,6 +117,7 @@ def config(self):
 
 
 class TestCase2(TestTDMChildOp):
+
     def config(self):
         """check int64_t int64_t """
         self.x_shape = (10, 20)
@@ -124,6 +127,7 @@ def config(self):
 
 
 class TestCase3(TestTDMChildOp):
+
     def config(self):
         """check int64 int32 """
         self.x_shape = (10, 20)
@@ -133,6 +137,7 @@ def config(self):
 
 
 class TestCase4(TestTDMChildOp):
+
     def config(self):
         """check large shape """
         self.x_shape = (100, 20)
@@ -142,6 +147,7 @@ def config(self):
 
 
 class TestTDMChildShape(unittest.TestCase):
+
     def test_shape(self):
         x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
         tdm_tree_info = create_tdm_tree()
@@ -151,17 +157,17 @@ def test_shape(self):
             x=x,
             node_nums=26,
             child_nums=2,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    tree_info_np)))
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.
+                                       NumpyArrayInitializer(tree_info_np)))
 
         place = fluid.CPUPlace()
         exe = fluid.Executor(place=place)
         exe.run(fluid.default_startup_program())
 
         feed = {
-            'x': np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10],
-                           [11], [12]]).astype('int32')
+            'x':
+            np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11],
+                      [12]]).astype('int32')
         }
         exe.run(feed=feed)
 
diff --git a/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py
index e245529edc6ab..ffcf76308946e 100644
--- a/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py
@@ -29,9 +29,9 @@
 
 def create_tdm_travel():
     tree_travel = [[1, 3, 7, 14], [1, 3, 7, 15], [1, 3, 8, 16], [1, 3, 8, 17],
-                   [1, 4, 9, 18], [1, 4, 9, 19], [1, 4, 10, 20],
-                   [1, 4, 10, 21], [2, 5, 11, 22], [2, 5, 11, 23],
-                   [2, 5, 12, 24], [2, 5, 12, 25], [2, 6, 13, 0]]
+                   [1, 4, 9, 18], [1, 4, 9, 19], [1, 4, 10, 20], [1, 4, 10, 21],
+                   [2, 5, 11, 22], [2, 5, 11, 23], [2, 5, 12, 24],
+                   [2, 5, 12, 25], [2, 6, 13, 0]]
     return tree_travel
 
 
@@ -48,6 +48,7 @@ def create_tdm_layer():
 
 
 class TestTDMSamplerOp(OpTest):
+
     def setUp(self):
         self.__class__.op_type = "tdm_sampler"
         self.config()
@@ -74,8 +75,8 @@ def setUp(self):
         layer_np = np.array(tree_layer_flat).astype(self.tree_dtype)
         layer_np = layer_np.reshape([-1, 1])
 
-        self.x_np = np.random.randint(
-            low=0, high=13, size=self.x_shape).astype(self.x_type)
+        self.x_np = np.random.randint(low=0, high=13,
+                                      size=self.x_shape).astype(self.x_type)
 
         out = np.random.random(self.output_shape).astype(self.out_dtype)
         label = np.random.random(self.output_shape).astype(self.out_dtype)
@@ -133,8 +134,8 @@ def test_check_output(self):
                 sampling_res_list = sampling_res.tolist()
                 positive_travel.append(sampling_res_list[0])
 
-                label_sampling_res = label_res[batch_ids][start_offset:
-                                                          end_offset]
+                label_sampling_res = label_res[batch_ids][
+                    start_offset:end_offset]
                 mask_sampling_res = mask_res[batch_ids][start_offset:end_offset]
 
                 # check unique
@@ -142,9 +143,8 @@ def test_check_output(self):
                     assert len(set(sampling_res_list)) == len(
                         sampling_res_list
                     ), "len(set(sampling_res_list)): {}, len(sampling_res_list): {} , sample_res: {}, label_res:{}, mask_res: {}".format(
-                        len(set(sampling_res_list)),
-                        len(sampling_res_list), sampling_res,
-                        label_sampling_res, mask_sampling_res)
+                        len(set(sampling_res_list)), len(sampling_res_list),
+                        sampling_res, label_sampling_res, mask_sampling_res)
                 # check legal
                 layer_node = self.tree_layer[layer_idx]
                 layer_node.append(0)
@@ -168,11 +168,12 @@ def test_check_output(self):
                     np.sum(mask_sampling_res[padding_index]))
                 start_offset = end_offset
             # check travel legal
-            assert self.tree_travel[int(self.x_np[
-                batch_ids])] == positive_travel
+            assert self.tree_travel[int(
+                self.x_np[batch_ids])] == positive_travel
 
 
 class TestCase1(TestTDMSamplerOp):
+
     def config(self):
         """test input int64"""
         self.neg_samples_num_list = [0, 0, 0, 0]
@@ -183,6 +184,7 @@ def config(self):
 
 
 class TestCase2(TestTDMSamplerOp):
+
     def config(self):
         """test dtype int64"""
         self.neg_samples_num_list = [0, 0, 0, 0]
@@ -193,6 +195,7 @@ def config(self):
 
 
 class TestCase3(TestTDMSamplerOp):
+
     def config(self):
         """test all dtype int64"""
         self.neg_samples_num_list = [0, 0, 0, 0]
@@ -203,6 +206,7 @@ def config(self):
 
 
 class TestCase4(TestTDMSamplerOp):
+
     def config(self):
         """test one neg"""
         self.neg_samples_num_list = [1, 1, 1, 1]
@@ -213,6 +217,7 @@ def config(self):
 
 
 class TestCase5(TestTDMSamplerOp):
+
     def config(self):
         """test normal neg"""
         self.neg_samples_num_list = [1, 2, 3, 4]
@@ -223,6 +228,7 @@ def config(self):
 
 
 class TestCase6(TestTDMSamplerOp):
+
     def config(self):
         """test huge batchsize"""
         self.neg_samples_num_list = [1, 2, 3, 4]
@@ -233,6 +239,7 @@ def config(self):
 
 
 class TestCase7(TestTDMSamplerOp):
+
     def config(self):
         """test full neg"""
         self.neg_samples_num_list = [1, 3, 6, 11]
@@ -243,6 +250,7 @@ def config(self):
 
 
 class TestTDMSamplerShape(unittest.TestCase):
+
     def test_shape(self):
         x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
         tdm_tree_travel = create_tdm_travel()
@@ -267,9 +275,8 @@ def test_shape(self):
             tree_travel_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.NumpyArrayInitializer(
                     travel_array)),
-            tree_layer_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    layer_array)),
+            tree_layer_attr=fluid.ParamAttr(initializer=fluid.initializer.
+                                            NumpyArrayInitializer(layer_array)),
             output_positive=True,
             output_list=True,
             seed=0,
@@ -281,8 +288,9 @@ def test_shape(self):
         exe.run(fluid.default_startup_program())
 
         feed = {
-            'x': np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9],
-                           [10], [11], [12]]).astype('int32')
+            'x':
+            np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10],
+                      [11], [12]]).astype('int32')
         }
         exe.run(feed=feed)
 
diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
index e0142776c8312..6890e7d3a0678 100644
--- a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
@@ -32,11 +32,12 @@ def setUp(self):
         batch_size = 100
         num_classes = 1
         self.inputs = {
-            'X': logit(
-                np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype("float64")),
-            'Label': np.random.uniform(0, 2, (batch_size, num_classes))
-            .astype("float64")
+            'X':
+            logit(
+                np.random.uniform(0, 1,
+                                  (batch_size, num_classes)).astype("float64")),
+            'Label':
+            np.random.uniform(0, 2, (batch_size, num_classes)).astype("float64")
         }
         outs = []
         for index, label in enumerate(self.inputs["Label"]):
@@ -61,7 +62,9 @@ def test_check_grad(self):
 
 
 class TestTeacherStudentSigmoidLossInvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_input():
             input = [512, 1]
             label = fluid.data(name='label', shape=[None, 1], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index 407a252e1a530..e9561b3e0a5d4 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -42,6 +42,7 @@ def temporal_shift(x, seg_num, shift_ratio, data_format):
 
 
 class TestTemporalShift(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = 'temporal_shift'
@@ -53,7 +54,9 @@ def setUp(self):
             "data_format": self.data_format
         }
 
-        self.inputs = {"X": x, }
+        self.inputs = {
+            "X": x,
+        }
 
         output = temporal_shift(x, self.seg_num, self.shift_ratio,
                                 self.data_format)
@@ -74,6 +77,7 @@ def initTestCase(self):
 
 
 class TestTemporalShift2(TestTemporalShift):
+
     def initTestCase(self):
         self.x_shape = (4, 9, 7, 7)
         self.seg_num = 2
@@ -82,6 +86,7 @@ def initTestCase(self):
 
 
 class TestTemporalShift3(TestTemporalShift):
+
     def initTestCase(self):
         self.x_shape = (3, 10, 5, 5)
         self.seg_num = 1
@@ -90,6 +95,7 @@ def initTestCase(self):
 
 
 class TestTemporalShift4(TestTemporalShift):
+
     def initTestCase(self):
         self.x_shape = (6, 5, 5, 4)
         self.seg_num = 3
@@ -100,6 +106,7 @@ def initTestCase(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestTemporalShiftFP16(TestTemporalShift):
+
     def initTestCase(self):
         self.x_shape = (3, 10, 5, 5)
         self.seg_num = 1
@@ -119,25 +126,32 @@ def test_check_grad_ignore_uv(self):
 
 
 class TestTemporalShiftAPI(unittest.TestCase):
+
     def test_api(self):
         input = paddle.randn([6, 4, 2, 2])
-        out = paddle.fluid.layers.temporal_shift(
-            x=input, seg_num=2, shift_ratio=0.2)
+        out = paddle.fluid.layers.temporal_shift(x=input,
+                                                 seg_num=2,
+                                                 shift_ratio=0.2)
 
-        out_from_function = paddle.nn.functional.temporal_shift(
-            x=input, seg_num=2, shift_ratio=0.2)
+        out_from_function = paddle.nn.functional.temporal_shift(x=input,
+                                                                seg_num=2,
+                                                                shift_ratio=0.2)
 
         # dygraph
         with paddle.fluid.dygraph.guard():
             input = paddle.randn([6, 4, 2, 2])
-            out = paddle.nn.functional.temporal_shift(
-                x=input, seg_num=2, shift_ratio=0.2)
+            out = paddle.nn.functional.temporal_shift(x=input,
+                                                      seg_num=2,
+                                                      shift_ratio=0.2)
 
     def test_error(self):
+
         def attr_data_format():
             input = paddle.randn([6, 4, 2, 2])
-            out = paddle.nn.functional.temporal_shift(
-                x=input, seg_num=2, shift_ratio=0.2, data_format="HWC")
+            out = paddle.nn.functional.temporal_shift(x=input,
+                                                      seg_num=2,
+                                                      shift_ratio=0.2,
+                                                      data_format="HWC")
 
         self.assertRaises(ValueError, attr_data_format)
 
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index da792903b7de8..2ea88c89a37ac 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -22,6 +22,7 @@
 
 
 class TestTensorPtr(unittest.TestCase):
+
     def test_tensor_ptr(self):
         t = core.Tensor()
         np_arr = numpy.zeros([2, 3])
@@ -30,6 +31,7 @@ def test_tensor_ptr(self):
 
 
 class TestTensor(unittest.TestCase):
+
     def setUp(self):
         self.support_dtypes = [
             'bool', 'uint8', 'int8', 'int16', 'int32', 'int64', 'float16',
@@ -79,8 +81,10 @@ def test_int8_tensor(self):
         scope = core.Scope()
         var = scope.var("int8_tensor")
         cpu_tensor = var.get_tensor()
-        tensor_array = numpy.random.randint(
-            -127, high=128, size=[100, 200], dtype=numpy.int8)
+        tensor_array = numpy.random.randint(-127,
+                                            high=128,
+                                            size=[100, 200],
+                                            dtype=numpy.int8)
         place = core.CPUPlace()
         cpu_tensor.set(tensor_array, place)
         cpu_tensor_array_2 = numpy.array(cpu_tensor)
@@ -88,8 +92,10 @@ def test_int8_tensor(self):
 
         if core.is_compiled_with_cuda():
             cuda_tensor = var.get_tensor()
-            tensor_array = numpy.random.randint(
-                -127, high=128, size=[100, 200], dtype=numpy.int8)
+            tensor_array = numpy.random.randint(-127,
+                                                high=128,
+                                                size=[100, 200],
+                                                dtype=numpy.int8)
             place = core.CUDAPlace(0)
             cuda_tensor.set(tensor_array, place)
             cuda_tensor_array_2 = numpy.array(cuda_tensor)
@@ -203,10 +209,10 @@ def run_slice_tensor(self, place, dtype):
         shape = [3, 3, 3]
         tensor._set_dims(shape)
 
-        tensor_array = numpy.array(
-            [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-             [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
-             [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype(dtype)
+        tensor_array = numpy.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                                    [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
+                                    [[19, 20, 21], [22, 23, 24],
+                                     [25, 26, 27]]]).astype(dtype)
 
         tensor.set(tensor_array, place)
         n1 = tensor[1]
@@ -284,16 +290,16 @@ def test_tensor_poiter(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
             self.assertTrue(
-                isinstance(
-                    tensor._mutable_data(place, dtype), numbers.Integral))
+                isinstance(tensor._mutable_data(place, dtype),
+                           numbers.Integral))
             place = core.CUDAPinnedPlace()
             self.assertTrue(
-                isinstance(
-                    tensor._mutable_data(place, dtype), numbers.Integral))
+                isinstance(tensor._mutable_data(place, dtype),
+                           numbers.Integral))
             places = fluid.cuda_pinned_places()
             self.assertTrue(
-                isinstance(
-                    tensor._mutable_data(places[0], dtype), numbers.Integral))
+                isinstance(tensor._mutable_data(places[0], dtype),
+                           numbers.Integral))
 
     def test_tensor_set_fp16(self):
         array = numpy.random.random((300, 500)).astype("float16")
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
index ff6cbdde066bb..d9c4d2c61b266 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
@@ -54,9 +54,8 @@ def test_get_set(self):
         program = fluid.Program()
         block = program.global_block()
 
-        input_arr = block.create_var(
-            name="tmp_lod_tensor_array",
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        input_arr = block.create_var(name="tmp_lod_tensor_array",
+                                     type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
         input_arr.persistable = True
         input_arr_var = scope.var('tmp_lod_tensor_array')
         input_tensor_array = input_arr_var.get_lod_tensor_array()
@@ -80,27 +79,28 @@ def test_get_set(self):
         y_out_index = block.create_var(name="OutIndex")
         y_out_index.persistable = True
 
-        y_grad_arr = block.create_var(
-            name='Out@GRAD', dtype='float32', shape=[11])
+        y_grad_arr = block.create_var(name='Out@GRAD',
+                                      dtype='float32',
+                                      shape=[11])
         y_grad_arr.persistable = True
         y_grad = scope.var('Out@GRAD')
         y_grad_tensor = y_grad.get_tensor()
         y_grad_tensor.set(random_grad, cpu)
 
-        op = block.append_op(
-            type=self.op_type,
-            inputs={"X": input_arr},
-            outputs={"Out": y_out,
-                     "OutIndex": y_out_index},
-            attrs=self.attrs)
+        op = block.append_op(type=self.op_type,
+                             inputs={"X": input_arr},
+                             outputs={
+                                 "Out": y_out,
+                                 "OutIndex": y_out_index
+                             },
+                             attrs=self.attrs)
 
-        out_grad = block.create_var(
-            name="tmp_lod_tensor_array@GRAD",
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        out_grad = block.create_var(name="tmp_lod_tensor_array@GRAD",
+                                    type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
         out_grad.persistable = True
 
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
-                                                                  set(), [])
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, set(), [])
         grad_op_desc = grad_op_desc_list[0]
         new_op_desc = block.desc.append_op()
         new_op_desc.copy_from(grad_op_desc)
@@ -124,8 +124,8 @@ def test_get_set(self):
         # test forward
         tensor_res = numpy.array(out[0])
         tensor_res_out_idx = numpy.array(out[1])
-        tensor_gt = numpy.array(
-            [0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32')
+        tensor_gt = numpy.array([0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                                dtype='float32')
 
         self.assertEqual(len(tensor_res), len(tensor_gt))
         self.assertEqual(len(tensor_res_out_idx), 10)
@@ -154,9 +154,8 @@ def test_get_set(self):
                     numpy.array(grad_tensor_array[i])[1],
                     numpy.array(random_grad[i + 1]))
             if i == 1:
-                self.assertEqual(
-                    numpy.array(grad_tensor_array[i]),
-                    numpy.array(random_grad[i + 1]))
+                self.assertEqual(numpy.array(grad_tensor_array[i]),
+                                 numpy.array(random_grad[i + 1]))
 
 
 class TestLoDTensorArrayStack(unittest.TestCase):
@@ -171,10 +170,9 @@ def setUp(self):
             numpy.random.rand(2, 3, 4).astype("float32")
         ]
         self.outputs = [
-            numpy.stack(
-                self.inputs, axis=self.attrs["axis"]), numpy.array(
-                    [x.shape[self.attrs["axis"]] for x in self.inputs],
-                    dtype="int32")
+            numpy.stack(self.inputs, axis=self.attrs["axis"]),
+            numpy.array([x.shape[self.attrs["axis"]] for x in self.inputs],
+                        dtype="int32")
         ]
         self.input_grads = [numpy.ones_like(x) for x in self.inputs]
         self.set_program()
@@ -199,14 +197,15 @@ def set_program(self):
     def run_check(self, executor, scope):
         executor.run(self.program, scope=scope)
         for i, output in enumerate(self.outputs):
-            numpy.allclose(
-                numpy.array(scope.var(self.output_vars[i].name).get_tensor()),
-                output,
-                atol=0)
+            numpy.allclose(numpy.array(
+                scope.var(self.output_vars[i].name).get_tensor()),
+                           output,
+                           atol=0)
         tensor_array_grad = scope.var(self.array.name).get_lod_tensor_array()
         for i, input_grad in enumerate(self.input_grads):
-            numpy.allclose(
-                numpy.array(tensor_array_grad[i]), input_grad, atol=0)
+            numpy.allclose(numpy.array(tensor_array_grad[i]),
+                           input_grad,
+                           atol=0)
 
     def test_cpu(self):
         scope = core.Scope()
@@ -223,6 +222,7 @@ def test_gpu(self):
 
 
 class TestTensorArrayToTensorAPI(unittest.TestCase):
+
     def _test_case(self, inp1, inp2):
         x0 = fluid.layers.assign(inp1)
         x0.stop_gradient = False
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py b/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py
index 6a91c2182d1c5..64c4be260ed69 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,6 +19,7 @@
 
 
 class TestTensorCopyFrom(unittest.TestCase):
+
     def test_main(self):
         place = paddle.CPUPlace()
         np_value = np.random.random(size=[10, 30]).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_.py
index 2f43f129978cd..a6055e7b40eb1 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_fill_.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_.py
@@ -21,6 +21,7 @@
 
 
 class TensorFill_Test(unittest.TestCase):
+
     def setUp(self):
         self.shape = [32, 32]
 
@@ -36,8 +37,8 @@ def func_test_tensor_fill_true(self):
                 paddle.set_device('cpu')
             else:
                 paddle.set_device('gpu')
-            np_arr = np.reshape(
-                np.array(six.moves.range(np.prod(self.shape))), self.shape)
+            np_arr = np.reshape(np.array(six.moves.range(np.prod(self.shape))),
+                                self.shape)
             for dtype in typelist:
                 var = 1.
                 tensor = paddle.to_tensor(np_arr, place=p, dtype=dtype)
@@ -64,8 +65,8 @@ def func_test_tensor_fill_backward(self):
                 paddle.set_device('cpu')
             else:
                 paddle.set_device('gpu')
-            np_arr = np.reshape(
-                np.array(six.moves.range(np.prod(self.shape))), self.shape)
+            np_arr = np.reshape(np.array(six.moves.range(np.prod(self.shape))),
+                                self.shape)
             for dtype in typelist:
                 var = int(1)
                 tensor = paddle.to_tensor(np_arr, place=p, dtype=dtype)
@@ -78,11 +79,14 @@ def func_test_tensor_fill_backward(self):
                 self.assertEqual((y.grad.numpy() == 0).all().item(), True)
 
     def test_tensor_fill_backward(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_test_tensor_fill_backward()
         self.func_test_tensor_fill_backward()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_test_errors(self):
+
         def test_list():
             x = paddle.to_tensor([2, 3, 4])
             x.fill_([1])
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py
index ca0c97adedb94..da8ca1f2d6611 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py
@@ -21,11 +21,12 @@
 
 
 class TensorFillDiagonal_Test(unittest.TestCase):
+
     def func_dim2_normal(self):
-        expected_np = np.array(
-            [[1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype('float32')
-        expected_grad = np.array(
-            [[0, 1, 1], [1, 0, 1], [1, 1, 0]]).astype('float32')
+        expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2,
+                                                       1]]).astype('float32')
+        expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1,
+                                                         0]]).astype('float32')
 
         typelist = ['float32', 'float64', 'int32', 'int64']
         places = [fluid.CPUPlace()]
@@ -52,15 +53,17 @@ def func_dim2_normal(self):
                     True)
 
     def test_dim2_normal(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim2_normal()
         self.func_dim2_normal()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_offset(self):
-        expected_np = np.array(
-            [[2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32')
-        expected_grad = np.array(
-            [[1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32')
+        expected_np = np.array([[2, 2, 1], [2, 2, 2], [2, 2,
+                                                       2]]).astype('float32')
+        expected_grad = np.array([[1, 1, 0], [1, 1, 1], [1, 1,
+                                                         1]]).astype('float32')
 
         typelist = ['float32', 'float64', 'int32', 'int64']
         places = [fluid.CPUPlace()]
@@ -87,13 +90,15 @@ def func_offset(self):
                     True)
 
     def test_offset(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_offset()
         self.func_offset()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_bool(self):
-        expected_np = np.array(
-            [[False, True, True], [True, False, True], [True, True, False]])
+        expected_np = np.array([[False, True, True], [True, False, True],
+                                [True, True, False]])
 
         typelist = ['bool']
         places = [fluid.CPUPlace()]
@@ -119,11 +124,11 @@ def test_bool(self):
 
     def func_dim2_unnormal_wrap(self):
         expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2],
-                                [1, 2, 2], [2, 1, 2],
-                                [2, 2, 1]]).astype('float32')
+                                [1, 2, 2], [2, 1, 2], [2, 2,
+                                                       1]]).astype('float32')
         expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [1, 1, 1],
-                                  [0, 1, 1], [1, 0, 1],
-                                  [1, 1, 0]]).astype('float32')
+                                  [0, 1, 1], [1, 0, 1], [1, 1,
+                                                         0]]).astype('float32')
 
         typelist = ['float32', 'float64', 'int32', 'int64']
         places = [fluid.CPUPlace()]
@@ -150,17 +155,19 @@ def func_dim2_unnormal_wrap(self):
                     True)
 
     def test_dim2_unnormal_wrap(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim2_unnormal_wrap()
         self.func_dim2_unnormal_wrap()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_dim2_unnormal_unwrap(self):
         expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2],
-                                [2, 2, 2], [2, 2, 2],
-                                [2, 2, 2]]).astype('float32')
+                                [2, 2, 2], [2, 2, 2], [2, 2,
+                                                       2]]).astype('float32')
         expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [1, 1, 1],
-                                  [1, 1, 1], [1, 1, 1],
-                                  [1, 1, 1]]).astype('float32')
+                                  [1, 1, 1], [1, 1, 1], [1, 1,
+                                                         1]]).astype('float32')
 
         typelist = ['float32', 'float64', 'int32', 'int64']
         places = [fluid.CPUPlace()]
@@ -187,18 +194,21 @@ def func_dim2_unnormal_unwrap(self):
                     True)
 
     def test_dim2_unnormal_unwrap(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim2_unnormal_unwrap()
         self.func_dim2_unnormal_unwrap()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_dim_larger2_normal(self):
-        expected_np = np.array([[[1, 2, 2], [2, 2, 2], [2, 2, 2]], [[2, 2, 2], [
-            2, 1, 2
-        ], [2, 2, 2]], [[2, 2, 2], [2, 2, 2], [2, 2, 1]]]).astype('float32')
-        expected_grad = np.array(
-            [[[0, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 0, 1],
-                                                 [1, 1, 1]],
-             [[1, 1, 1], [1, 1, 1], [1, 1, 0]]]).astype('float32')
+        expected_np = np.array([[[1, 2, 2], [2, 2, 2], [2, 2, 2]],
+                                [[2, 2, 2], [2, 1, 2], [2, 2, 2]],
+                                [[2, 2, 2], [2, 2, 2], [2, 2,
+                                                        1]]]).astype('float32')
+        expected_grad = np.array([[[0, 1, 1], [1, 1, 1], [1, 1, 1]],
+                                  [[1, 1, 1], [1, 0, 1], [1, 1, 1]],
+                                  [[1, 1, 1], [1, 1, 1],
+                                   [1, 1, 0]]]).astype('float32')
 
         typelist = ['float32', 'float64', 'int32', 'int64']
         places = [fluid.CPUPlace()]
@@ -225,9 +235,11 @@ def func_dim_larger2_normal(self):
                     True)
 
     def test_dim_larger2_normal(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim_larger2_normal()
         self.func_dim_larger2_normal()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py
index 47316809189b7..4765b540c7e60 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py
@@ -21,6 +21,7 @@
 
 
 class TensorFillDiagTensor_Test(unittest.TestCase):
+
     def setUp(self):
         self.typelist = ['float32', 'float64', 'int32', 'int64']
         self.places = [fluid.CPUPlace()]
@@ -28,6 +29,7 @@ def setUp(self):
             self.places.append(fluid.CUDAPlace(0))
 
     def test_dim2(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         expected_np = np.array(
             [[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2]]).astype('float32')
         expected_grad = np.array(
@@ -53,8 +55,10 @@ def test_dim2(self):
                 self.assertEqual(
                     (y.grad.numpy().astype('float32') == expected_grad).all(),
                     True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_dim2_offset_1(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         expected_np = np.array(
             [[2, 2, 2], [1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype('float32')
         expected_grad = np.array(
@@ -80,8 +84,10 @@ def test_dim2_offset_1(self):
                 self.assertEqual(
                     (y.grad.numpy().astype('float32') == expected_grad).all(),
                     True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_dim2_offset1(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         expected_np = np.array(
             [[2, 1, 2], [2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32')
         expected_grad = np.array(
@@ -107,8 +113,10 @@ def test_dim2_offset1(self):
                 self.assertEqual(
                     (y.grad.numpy().astype('float32') == expected_grad).all(),
                     True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_dim4(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         expected_np = np.array(
             [[[[0, 3], [2, 2], [2, 2]], [[2, 2], [1, 4], [2, 2]],
               [[2, 2], [2, 2], [2, 5]], [[2, 2], [2, 2], [2, 2]]],
@@ -128,8 +136,8 @@ def test_dim4(self):
             else:
                 paddle.set_device('gpu')
             for dtype in self.typelist:
-                v = paddle.to_tensor(
-                    np.arange(12).reshape(2, 2, 3), dtype=dtype)
+                v = paddle.to_tensor(np.arange(12).reshape(2, 2, 3),
+                                     dtype=dtype)
                 var = (np.random.random() + 1)
                 x = paddle.ones((2, 4, 3, 2), dtype=dtype)
                 x.stop_gradient = False
@@ -143,6 +151,7 @@ def test_dim4(self):
                 self.assertEqual(
                     (y.grad.numpy().astype('float32') == expected_grad).all(),
                     True)
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def test_largedim(self):
         if len(self.places) > 1:
@@ -150,8 +159,8 @@ def test_largedim(self):
             fsdim = 128
             paddle.set_device('gpu')
             for dtype in self.typelist:
-                v = paddle.arange(
-                    bsdim * fsdim, dtype=dtype).reshape((bsdim, fsdim))
+                v = paddle.arange(bsdim * fsdim, dtype=dtype).reshape(
+                    (bsdim, fsdim))
                 y = paddle.ones((bsdim, fsdim, fsdim), dtype=dtype)
                 y.stop_gradient = False
                 y = y * 2
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor_.py
index 81ec1daa6691d..03608046414ff 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor_.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor_.py
@@ -22,6 +22,7 @@
 
 
 class TensorFillDiagTensor_Test(unittest.TestCase):
+
     def setUp(self):
         self.typelist = ['float32', 'float64', 'int32', 'int64']
         self.places = [fluid.CPUPlace()]
@@ -29,10 +30,10 @@ def setUp(self):
             self.places.append(fluid.CUDAPlace(0))
 
     def func_dim2(self):
-        expected_np = np.array(
-            [[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2]]).astype('float32')
-        expected_grad = np.array(
-            [[0, 1, 1], [1, 0, 1], [1, 1, 0], [1, 1, 1]]).astype('float32')
+        expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1],
+                                [2, 2, 2]]).astype('float32')
+        expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0],
+                                  [1, 1, 1]]).astype('float32')
 
         for idx, p in enumerate(self.places):
             if idx == 0:
@@ -56,15 +57,17 @@ def func_dim2(self):
                     True)
 
     def test_dim2(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim2()
         self.func_dim2()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_dim2_offset_1(self):
-        expected_np = np.array(
-            [[2, 2, 2], [1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype('float32')
-        expected_grad = np.array(
-            [[1, 1, 1], [0, 1, 1], [1, 0, 1], [1, 1, 0]]).astype('float32')
+        expected_np = np.array([[2, 2, 2], [1, 2, 2], [2, 1, 2],
+                                [2, 2, 1]]).astype('float32')
+        expected_grad = np.array([[1, 1, 1], [0, 1, 1], [1, 0, 1],
+                                  [1, 1, 0]]).astype('float32')
 
         for idx, p in enumerate(self.places):
             if idx == 0:
@@ -88,15 +91,17 @@ def func_dim2_offset_1(self):
                     True)
 
     def test_dim2_offset_1(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim2_offset_1()
         self.func_dim2_offset_1()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_dim2_offset1(self):
-        expected_np = np.array(
-            [[2, 1, 2], [2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32')
-        expected_grad = np.array(
-            [[1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32')
+        expected_np = np.array([[2, 1, 2], [2, 2, 1], [2, 2, 2],
+                                [2, 2, 2]]).astype('float32')
+        expected_grad = np.array([[1, 0, 1], [1, 1, 0], [1, 1, 1],
+                                  [1, 1, 1]]).astype('float32')
 
         for idx, p in enumerate(self.places):
             if idx == 0:
@@ -120,23 +125,29 @@ def func_dim2_offset1(self):
                     True)
 
     def test_dim2_offset1(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim2_offset1()
         self.func_dim2_offset1()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_dim4(self):
-        expected_np = np.array(
-            [[[[0, 3], [2, 2], [2, 2]], [[2, 2], [1, 4], [2, 2]],
-              [[2, 2], [2, 2], [2, 5]], [[2, 2], [2, 2], [2, 2]]],
-             [[[6, 9], [2, 2], [2, 2]], [[2, 2], [7, 10], [2, 2]],
-              [[2, 2], [2, 2], [8, 11]],
-              [[2, 2], [2, 2], [2, 2]]]]).astype('float32')
-        expected_grad = np.array(
-            [[[[0, 0], [1, 1], [1, 1]], [[1, 1], [0, 0], [1, 1]],
-              [[1, 1], [1, 1], [0, 0]], [[1, 1], [1, 1], [1, 1]]],
-             [[[0, 0], [1, 1], [1, 1]], [[1, 1], [0, 0], [1, 1]],
-              [[1, 1], [1, 1], [0, 0]],
-              [[1, 1], [1, 1], [1, 1]]]]).astype('float32')
+        expected_np = np.array([[[[0, 3], [2, 2], [2, 2]],
+                                 [[2, 2], [1, 4], [2, 2]],
+                                 [[2, 2], [2, 2], [2, 5]],
+                                 [[2, 2], [2, 2], [2, 2]]],
+                                [[[6, 9], [2, 2], [2, 2]],
+                                 [[2, 2], [7, 10], [2, 2]],
+                                 [[2, 2], [2, 2], [8, 11]],
+                                 [[2, 2], [2, 2], [2, 2]]]]).astype('float32')
+        expected_grad = np.array([[[[0, 0], [1, 1], [1, 1]],
+                                   [[1, 1], [0, 0], [1, 1]],
+                                   [[1, 1], [1, 1], [0, 0]],
+                                   [[1, 1], [1, 1], [1, 1]]],
+                                  [[[0, 0], [1, 1], [1, 1]],
+                                   [[1, 1], [0, 0], [1, 1]],
+                                   [[1, 1], [1, 1], [0, 0]],
+                                   [[1, 1], [1, 1], [1, 1]]]]).astype('float32')
 
         for idx, p in enumerate(self.places):
             if idx == 0:
@@ -144,8 +155,8 @@ def func_dim4(self):
             else:
                 paddle.set_device('gpu')
             for dtype in self.typelist:
-                v = paddle.to_tensor(
-                    np.arange(12).reshape(2, 2, 3), dtype=dtype)
+                v = paddle.to_tensor(np.arange(12).reshape(2, 2, 3),
+                                     dtype=dtype)
                 var = (np.random.random() + 1)
                 x = paddle.ones((2, 4, 3, 2), dtype=dtype)
                 x.stop_gradient = False
@@ -161,9 +172,11 @@ def func_dim4(self):
                     True)
 
     def test_func_dim4(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_dim4()
         self.func_dim4()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_largedim(self):
         #large dim only test on gpu because the cpu version is too slow for ci test, and the memory is limited
@@ -172,8 +185,8 @@ def func_largedim(self):
             fsdim = 128
             paddle.set_device('gpu')
             for dtype in self.typelist:
-                v = paddle.arange(
-                    bsdim * fsdim, dtype=dtype).reshape((bsdim, fsdim))
+                v = paddle.arange(bsdim * fsdim, dtype=dtype).reshape(
+                    (bsdim, fsdim))
                 y = paddle.ones((bsdim, fsdim, fsdim), dtype=dtype)
                 y.stop_gradient = False
                 y = y * 2
@@ -190,9 +203,11 @@ def func_largedim(self):
                 self.assertEqual((y.grad == expected_grad).all(), True)
 
     def test_largedim(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_largedim()
         self.func_largedim()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
index e7f85f0451a17..d8d1990a4fafe 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
@@ -25,6 +25,7 @@
 
 
 class SimpleNet(nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(SimpleNet, self).__init__()
         self.linear1 = nn.Linear(in_size, in_size)
@@ -43,6 +44,7 @@ def forward(self, x, hook=None, register=False, remove=False):
 
 
 class SimpleNetForStatic(nn.Layer):
+
     def __init__(self, in_size, out_size):
         super(SimpleNetForStatic, self).__init__()
         self.linear1 = nn.Linear(in_size, in_size)
@@ -58,6 +60,7 @@ def forward(self, x):
 
 
 class TestTensorRegisterHook(unittest.TestCase):
+
     def setUp(self):
         self.seed = 2021
         self.in_size = 10
@@ -68,6 +71,7 @@ def setUp(self):
             self.devices.append("gpu")
 
     def func_hook_for_interior_var(self):
+
         def run_double_hook_for_interior_var(double_hook, removed=False):
             for device in self.devices:
                 paddle.set_device(device)
@@ -158,11 +162,14 @@ def print_hook(grad):
         run_print_hook_for_interior_var(print_hook, removed=True)
 
     def test_hook_for_interior_var(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_hook_for_interior_var()
         self.func_hook_for_interior_var()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_hook_for_leaf_var(self):
+
         def run_double_hook_for_leaf_var(double_hook, removed=False):
             for device in self.devices:
                 paddle.set_device(device)
@@ -202,13 +209,16 @@ def run_double_hook_for_leaf_var(double_hook, removed=False):
         run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True)
 
     def test_hook_for_leaf_var(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_hook_for_leaf_var()
         self.func_hook_for_leaf_var()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_hook_for_accumulated_grad_interior_var(self):
-        def run_double_hook_for_accumulated_grad_interior_var(double_hook,
-                                                              removed=False):
+
+        def run_double_hook_for_accumulated_grad_interior_var(
+                double_hook, removed=False):
             for device in self.devices:
                 paddle.set_device(device)
 
@@ -248,27 +258,30 @@ def run_double_hook_for_accumulated_grad_interior_var(double_hook,
                 self.assertTrue(np.array_equal(x.grad.numpy(), base_grad))
                 # b.grad is changed by x.hook
                 self.assertTrue(
-                    np.array_equal(b.grad.numpy(), base_grad * 2
-                                   if not removed else base_grad))
+                    np.array_equal(b.grad.numpy(),
+                                   base_grad * 2 if not removed else base_grad))
                 # a.grad is changed by x.hook and a.hook
                 self.assertTrue(
-                    np.array_equal(a.grad.numpy(), base_grad * 4
-                                   if not removed else base_grad))
+                    np.array_equal(a.grad.numpy(),
+                                   base_grad * 4 if not removed else base_grad))
 
         # register hook
         run_double_hook_for_accumulated_grad_interior_var(lambda grad: grad * 2)
         # register hook and removed
-        run_double_hook_for_accumulated_grad_interior_var(
-            lambda grad: grad * 2, removed=True)
+        run_double_hook_for_accumulated_grad_interior_var(lambda grad: grad * 2,
+                                                          removed=True)
 
     def test_hook_for_accumulated_grad_interior_var(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_hook_for_accumulated_grad_interior_var()
         self.func_hook_for_accumulated_grad_interior_var()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_hook_for_accumulated_grad_leaf_var(self):
-        def run_double_hook_for_accumulated_grad_leaf_var(double_hook,
-                                                          removed=False):
+
+        def run_double_hook_for_accumulated_grad_leaf_var(
+                double_hook, removed=False):
             for device in self.devices:
                 paddle.set_device(device)
 
@@ -298,14 +311,14 @@ def run_double_hook_for_accumulated_grad_leaf_var(double_hook,
                 base_grad = np.array([5., 9., 13., 19.])
                 # x.grad is changed by x.hook
                 self.assertTrue(
-                    np.array_equal(x.grad.numpy(), base_grad * 2
-                                   if not removed else base_grad))
+                    np.array_equal(x.grad.numpy(),
+                                   base_grad * 2 if not removed else base_grad))
 
         # register hook
         run_double_hook_for_accumulated_grad_leaf_var(lambda grad: grad * 2)
         # register hook and removed
-        run_double_hook_for_accumulated_grad_leaf_var(
-            lambda grad: grad * 2, removed=True)
+        run_double_hook_for_accumulated_grad_leaf_var(lambda grad: grad * 2,
+                                                      removed=True)
 
     def test_hook_for_accumulated_grad_leaf_var(self):
         with _test_eager_guard():
@@ -313,6 +326,7 @@ def test_hook_for_accumulated_grad_leaf_var(self):
         self.func_hook_for_accumulated_grad_leaf_var()
 
     def func_hook_in_model(self):
+
         def run_double_hook_in_model(data,
                                      label,
                                      hook=None,
@@ -360,11 +374,14 @@ def run_double_hook_in_model(data,
         self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm))
 
     def test_func_hook_in_model(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_hook_in_model()
         self.func_hook_in_model()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_multiple_hooks_for_interior_var(self):
+
         def run_multiple_hooks_for_interior_var(device,
                                                 hooks,
                                                 remove1=False,
@@ -443,11 +460,14 @@ def double_hook(grad):
             self.assertTrue(np.array_equal(y_grad, z))
 
     def test_multiple_hooks_for_interior_var(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
         with _test_eager_guard():
             self.func_multiple_hooks_for_interior_var()
         self.func_multiple_hooks_for_interior_var()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
     def func_hook_in_double_grad(self):
+
         def double_print_hook(grad):
             grad = grad * 2
             print(grad)
@@ -463,8 +483,10 @@ def double_print_hook(grad):
 
         y = x * x
         # Since y = x * x, dx = 2 * x
-        dx = paddle.grad(
-            outputs=[y], inputs=[x], create_graph=True, retain_graph=True)[0]
+        dx = paddle.grad(outputs=[y],
+                         inputs=[x],
+                         create_graph=True,
+                         retain_graph=True)[0]
 
         z = y + dx
         self.assertTrue(x.grad is None)
@@ -524,8 +546,9 @@ def test_register_hook_in_static_mode(self):
         main_program = paddle.static.Program()
         with paddle.static.scope_guard(paddle.static.Scope()):
             with paddle.static.program_guard(main_program, startup_program):
-                x = paddle.static.data(
-                    name='x', shape=[None, self.in_size], dtype='float32')
+                x = paddle.static.data(name='x',
+                                       shape=[None, self.in_size],
+                                       dtype='float32')
 
                 net = SimpleNetForStatic(self.in_size, self.out_size)
                 with self.assertRaises(AssertionError):
@@ -563,6 +586,7 @@ def global_void_hook():
 
 
 class TestTensorRegisterBackwardHook(unittest.TestCase):
+
     def setUp(self):
         self.devices = ["cpu"]
         if paddle.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py
index 774d40a17c66d..ded9d42b9b5fe 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py
@@ -24,12 +24,13 @@
 # - Related paddle dtypes:
 #  - int type: int64, (no test here: uint8, int8, int16, int32)
 #  - float type: float32, (no test here: float64)
-# - Python scalar dtypes: 
+# - Python scalar dtypes:
 #  - int(64)
 #  - float(64)
 
 
 class TestTensorScalarTypePromotionDynamic(unittest.TestCase):
+
     def check_operation(self, a, b, c, op):
         if op == '+':
             c_rlt = a + b
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py
index d697666e12ddd..701ff5c3d6e9f 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py
@@ -25,12 +25,13 @@
 # - Related paddle dtypes:
 #  - int type: int64, (no test here: uint8, int8, int16, int32)
 #  - float type: float32, (no test here: float64)
-# - Python scalar dtypes: 
+# - Python scalar dtypes:
 #  - int(64)
 #  - float(64)
 
 
 class TestTensorScalarTypePromotionStatic(unittest.TestCase):
+
     def setUp(self):
         paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_to_list.py b/python/paddle/fluid/tests/unittests/test_tensor_to_list.py
index a78113030ed53..5b8e2b18d9e18 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_to_list.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_to_list.py
@@ -21,6 +21,7 @@
 
 
 class TensorToListTest(unittest.TestCase):
+
     def setUp(self):
         self.shape = [11, 25, 32, 43]
 
@@ -31,8 +32,8 @@ def func_tensor_tolist(self):
             places.append(fluid.CUDAPinnedPlace())
 
         for p in places:
-            np_arr = np.reshape(
-                np.array(six.moves.range(np.prod(self.shape))), self.shape)
+            np_arr = np.reshape(np.array(six.moves.range(np.prod(self.shape))),
+                                self.shape)
             expectlist = np_arr.tolist()
 
             t = paddle.to_tensor(np_arr, place=p)
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py b/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py
index 003f27652ef1d..635e08e3811cd 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py
@@ -19,6 +19,7 @@
 
 
 class TensorToNumpyTest(unittest.TestCase):
+
     def setUp(self):
         self.shape = [11, 25, 32, 43]
 
@@ -35,8 +36,8 @@ def test_main(self):
         for p in places:
             for dtype in dtypes:
                 np_arr = np.reshape(
-                    np.array(six.moves.range(np.prod(self.shape))).astype(
-                        dtype), self.shape)
+                    np.array(six.moves.range(np.prod(
+                        self.shape))).astype(dtype), self.shape)
 
                 t = fluid.LoDTensor()
                 t.set(np_arr, p)
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_type_promotion.py b/python/paddle/fluid/tests/unittests/test_tensor_type_promotion.py
index c2543645853ea..4aa8c429b0d48 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_type_promotion.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_type_promotion.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class TestTensorTypePromotion(unittest.TestCase):
+
     def setUp(self):
         self.x = paddle.to_tensor([2, 3])
         self.y = paddle.to_tensor([1.0, 2.0])
@@ -30,29 +31,29 @@ def test_operator(self):
             warnings.simplefilter("always")
             self.x + self.y
             self.assertTrue(
-                "The dtype of left and right variables are not the same" in
-                str(context[-1].message))
+                "The dtype of left and right variables are not the same" in str(
+                    context[-1].message))
 
         with warnings.catch_warnings(record=True) as context:
             warnings.simplefilter("always")
             self.x - self.y
             self.assertTrue(
-                "The dtype of left and right variables are not the same" in
-                str(context[-1].message))
+                "The dtype of left and right variables are not the same" in str(
+                    context[-1].message))
 
         with warnings.catch_warnings(record=True) as context:
             warnings.simplefilter("always")
             self.x * self.y
             self.assertTrue(
-                "The dtype of left and right variables are not the same" in
-                str(context[-1].message))
+                "The dtype of left and right variables are not the same" in str(
+                    context[-1].message))
 
         with warnings.catch_warnings(record=True) as context:
             warnings.simplefilter("always")
             self.x / self.y
             self.assertTrue(
-                "The dtype of left and right variables are not the same" in
-                str(context[-1].message))
+                "The dtype of left and right variables are not the same" in str(
+                    context[-1].message))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_uva.py b/python/paddle/fluid/tests/unittests/test_tensor_uva.py
index 4af04b8f6d41e..8e62d04004170 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_uva.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_uva.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,6 +20,7 @@
 
 
 class TestTensorCopyFrom(unittest.TestCase):
+
     def func_main(self):
         if paddle.fluid.core.is_compiled_with_cuda():
             place = paddle.CPUPlace()
@@ -35,6 +36,7 @@ def test_main(self):
 
 
 class TestUVATensorFromNumpy(unittest.TestCase):
+
     def func_uva_tensor_creation(self):
         if paddle.fluid.core.is_compiled_with_cuda():
             dtype_list = [
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_zero_.py b/python/paddle/fluid/tests/unittests/test_tensor_zero_.py
index d47585f78bb7b..30825e637b92e 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_zero_.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_zero_.py
@@ -21,6 +21,7 @@
 
 
 class TensorFill_Test(unittest.TestCase):
+
     def setUp(self):
         self.shape = [32, 32]
 
@@ -32,8 +33,8 @@ def func_test_tensor_fill_true(self):
             places.append(fluid.CUDAPinnedPlace())
 
         for p in places:
-            np_arr = np.reshape(
-                np.array(six.moves.range(np.prod(self.shape))), self.shape)
+            np_arr = np.reshape(np.array(six.moves.range(np.prod(self.shape))),
+                                self.shape)
             for dtype in typelist:
                 tensor = paddle.to_tensor(np_arr, place=p, dtype=dtype)
                 target = tensor.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py
index 04b140cba4c0e..e5d563455e896 100644
--- a/python/paddle/fluid/tests/unittests/test_tensordot.py
+++ b/python/paddle/fluid/tests/unittests/test_tensordot.py
@@ -66,6 +66,7 @@ def tensordot_np(x, y, axes):
 
 
 class TestTensordotAPI(unittest.TestCase):
+
     def setUp(self):
         self.set_place()
         self.set_dtype()
@@ -90,50 +91,60 @@ def set_input_data(self):
         self.y = np.random.random(self.y_shape).astype(self.dtype)
 
     def set_test_axes(self):
-        self.all_axes = [
-            [[3, 2], [3]], [[2, 1, 0], [2, 1]], [[1, 2, 0], [1, 3, 2]], [3, 0],
-            [[], [0, 3, 1]], [[2, 1, 0, 3], [2, 0, 1, 3]],
-            [[3, 1, 2], [1, 3, 2, 0]], [[2, 1], [0, 2]], [[2, 0, 1, 3], [2]],
-            [[1, 2, 0, 3], [0, 2, 1]], [[2, 1, 3, 0], [1, 2, 3]],
-            [[2, 0, 1, 3], [3, 1, 0, 2]], [[0, 3], [0, 3, 2, 1]],
-            [[1, 3, 2, 0], [2, 1, 0, 3]], [[1, 3, 2, 0], [1, 3, 2, 0]],
-            [[1, 0, 2], [0, 1]], [[2, 3, 0], [3, 1]],
-            [[1, 3, 2, 0], [3, 0, 1, 2]], [[3, 2, 1], [2, 0, 1]], [[0], []],
-            [[2, 3, 0], [1, 2, 0]], [[3, 0, 2, 1], [2, 1, 0, 3]],
-            [[3, 1, 2], [2, 3, 1]], [[1, 0, 2, 3], []], [[1, 2], [1, 2, 3]],
-            [[2, 0, 1, 3], [2, 0, 1]], [[3, 1, 2], [1, 3, 2]],
-            [[3, 1, 2, 0], [1, 2, 3, 0]], [[0, 2, 3], [0, 1, 2]],
-            [[3, 2, 0], [2, 0, 3, 1]], [[2, 1, 0, 3], [3, 1, 2, 0]],
-            [[1, 2, 3, 0], [1, 3, 0, 2]], [[3, 0], [2, 1]],
-            [[0, 1, 3, 2], [0, 2, 1, 3]], [[1, 0], [2, 1, 3]],
-            [[1, 0, 3, 2], [2, 3, 0, 1]], [[1, 2], [3]],
-            [[1, 2, 3, 0], [3, 2, 1, 0]], [[0, 3, 2, 1], [2, 1, 3, 0]], [0],
-            [[0, 2, 3], [3, 2, 0, 1]], [[1, 2, 3, 0], [3, 2, 1, 0]],
-            [[3, 1], [3]], [[3, 2, 0, 1], [3, 2, 0]], [[2, 3, 0, 1], [0, 3, 2]],
-            [[1], [1, 3]], [[1, 2], [2, 1, 0]], [[3, 1, 2], [3, 1, 0]],
-            [[1, 3], [3, 1, 2]], [[2, 0, 1, 3], [3, 1, 0, 2]],
-            [[1, 3, 0], [1, 3]], [[2, 3, 1], [1, 0, 2]],
-            [[1, 2, 0, 3], [0, 2, 1, 3]], [[2], [0, 1, 3]], [[1], [1, 2]],
-            [[1, 0, 2, 3], [3, 0, 1, 2]], [[0, 1, 3, 2], [1, 3, 0, 2]],
-            [[3, 0, 2, 1], [0, 2, 3]], [[1, 2, 0], [1, 2, 3]],
-            [[1, 0, 3], [2, 3, 0]], [[2, 3, 0], [3, 1, 0]], [[1, 3], [1, 0]],
-            [[2, 1, 0, 3], [2, 0, 3, 1]], [[3, 2, 0], [2, 1, 0]],
-            [[0, 1, 3], [0, 3, 1]], [[3, 1, 0], [3, 2, 1]], [[3, 2], [3, 1]],
-            [[3], [2, 1, 0]], [[1, 2, 3, 0], []], [[1, 3, 2, 0], [3, 1, 2]],
-            [[1], [0, 2]], [[3, 2, 0], [3, 2, 0]], [[3], []],
-            [[1, 0, 3], [2, 1]], [[3, 1, 0, 2], [2, 3, 1, 0]],
-            [[0, 1], [0, 3, 2]], [[0, 2, 3], [0, 2, 1]], [[1, 3, 0], [3, 0, 2]],
-            [[3, 1, 2], [1, 2, 3]], [[3, 1, 2], [3, 1, 0]],
-            [[0, 3, 1, 2], [3, 2, 1, 0]], [[0, 3], [3, 2, 1]],
-            [[2, 3], [1, 3, 0]], [[0, 3, 2], [2, 0, 3, 1]], [[2, 3], [1, 3]],
-            [[3, 1, 2, 0], [2, 3, 1, 0]], [[1, 0, 3, 2], [3, 0, 1, 2]],
-            [[3, 2, 1, 0], [0, 1, 3, 2]], [[3, 1, 2], [3]],
-            [[0, 1, 3, 2], [2, 3, 0, 1]], [[1, 2, 3, 0], [1, 3, 0, 2]],
-            [3, 1, 2], [[3, 1, 2], [0, 3, 2]], [[2, 3, 0], [1, 2, 0]],
-            [[2, 0, 3], [2, 0]], [[3, 1, 0, 2], [3, 1, 0, 2]],
-            [[0, 1, 2], [2, 0, 1]], [[1, 0, 3], [2, 3, 0]],
-            [[2, 0, 1], [0, 1, 3]], [[2, 1], [0, 1, 3]]
-        ]
+        self.all_axes = [[[3, 2], [3]], [[2, 1, 0], [2, 1]],
+                         [[1, 2, 0], [1, 3, 2]], [3, 0], [[], [0, 3, 1]],
+                         [[2, 1, 0, 3], [2, 0, 1, 3]], [[3, 1, 2], [1, 3, 2,
+                                                                    0]],
+                         [[2, 1], [0, 2]], [[2, 0, 1, 3], [2]],
+                         [[1, 2, 0, 3], [0, 2, 1]], [[2, 1, 3, 0], [1, 2, 3]],
+                         [[2, 0, 1, 3], [3, 1, 0, 2]], [[0, 3], [0, 3, 2, 1]],
+                         [[1, 3, 2, 0], [2, 1, 0, 3]],
+                         [[1, 3, 2, 0], [1, 3, 2, 0]], [[1, 0, 2], [0, 1]],
+                         [[2, 3, 0], [3, 1]], [[1, 3, 2, 0], [3, 0, 1, 2]],
+                         [[3, 2, 1], [2, 0, 1]], [[0], []],
+                         [[2, 3, 0], [1, 2, 0]], [[3, 0, 2, 1], [2, 1, 0, 3]],
+                         [[3, 1, 2], [2, 3, 1]], [[1, 0, 2, 3], []],
+                         [[1, 2], [1, 2, 3]], [[2, 0, 1, 3], [2, 0, 1]],
+                         [[3, 1, 2], [1, 3, 2]], [[3, 1, 2, 0], [1, 2, 3, 0]],
+                         [[0, 2, 3], [0, 1, 2]], [[3, 2, 0], [2, 0, 3, 1]],
+                         [[2, 1, 0, 3], [3, 1, 2, 0]],
+                         [[1, 2, 3, 0], [1, 3, 0, 2]], [[3, 0], [2, 1]],
+                         [[0, 1, 3, 2], [0, 2, 1, 3]], [[1, 0], [2, 1, 3]],
+                         [[1, 0, 3, 2], [2, 3, 0, 1]], [[1, 2], [3]],
+                         [[1, 2, 3, 0], [3, 2, 1, 0]],
+                         [[0, 3, 2, 1], [2, 1, 3, 0]], [0],
+                         [[0, 2, 3], [3, 2, 0, 1]], [[1, 2, 3, 0], [3, 2, 1,
+                                                                    0]],
+                         [[3, 1], [3]], [[3, 2, 0, 1], [3, 2, 0]],
+                         [[2, 3, 0, 1], [0, 3, 2]], [[1], [1, 3]],
+                         [[1, 2], [2, 1, 0]], [[3, 1, 2], [3, 1, 0]],
+                         [[1, 3], [3, 1, 2]], [[2, 0, 1, 3], [3, 1, 0, 2]],
+                         [[1, 3, 0], [1, 3]], [[2, 3, 1], [1, 0, 2]],
+                         [[1, 2, 0, 3], [0, 2, 1, 3]], [[2], [0, 1, 3]],
+                         [[1], [1, 2]], [[1, 0, 2, 3], [3, 0, 1, 2]],
+                         [[0, 1, 3, 2], [1, 3, 0, 2]], [[3, 0, 2, 1], [0, 2,
+                                                                       3]],
+                         [[1, 2, 0], [1, 2, 3]], [[1, 0, 3], [2, 3, 0]],
+                         [[2, 3, 0], [3, 1, 0]], [[1, 3], [1, 0]],
+                         [[2, 1, 0, 3], [2, 0, 3, 1]], [[3, 2, 0], [2, 1, 0]],
+                         [[0, 1, 3], [0, 3, 1]], [[3, 1, 0], [3, 2, 1]],
+                         [[3, 2], [3, 1]], [[3], [2, 1, 0]], [[1, 2, 3, 0], []],
+                         [[1, 3, 2, 0], [3, 1, 2]], [[1], [0, 2]],
+                         [[3, 2, 0], [3, 2, 0]], [[3], []], [[1, 0, 3], [2, 1]],
+                         [[3, 1, 0, 2], [2, 3, 1, 0]], [[0, 1], [0, 3, 2]],
+                         [[0, 2, 3], [0, 2, 1]], [[1, 3, 0], [3, 0, 2]],
+                         [[3, 1, 2], [1, 2, 3]], [[3, 1, 2], [3, 1, 0]],
+                         [[0, 3, 1, 2], [3, 2, 1, 0]], [[0, 3], [3, 2, 1]],
+                         [[2, 3], [1, 3, 0]], [[0, 3, 2], [2, 0, 3, 1]],
+                         [[2, 3], [1, 3]], [[3, 1, 2, 0], [2, 3, 1, 0]],
+                         [[1, 0, 3, 2], [3, 0, 1, 2]],
+                         [[3, 2, 1, 0], [0, 1, 3, 2]], [[3, 1, 2], [3]],
+                         [[0, 1, 3, 2], [2, 3, 0, 1]],
+                         [[1, 2, 3, 0], [1, 3, 0, 2]], [3, 1, 2],
+                         [[3, 1, 2], [0, 3, 2]], [[2, 3, 0], [1, 2, 0]],
+                         [[2, 0, 3], [2, 0]], [[3, 1, 0, 2], [3, 1, 0, 2]],
+                         [[0, 1, 2], [2, 0, 1]], [[1, 0, 3], [2, 3, 0]],
+                         [[2, 0, 1], [0, 1, 3]], [[2, 1], [0, 1, 3]]]
 
     def test_dygraph(self):
         paddle.disable_static()
@@ -151,63 +162,74 @@ def test_static(self):
             for place in self.places:
                 with paddle.static.program_guard(paddle.static.Program(),
                                                  paddle.static.Program()):
-                    x = paddle.static.data(
-                        name='x', shape=self.x_shape, dtype=self.dtype)
-                    y = paddle.static.data(
-                        name='y', shape=self.y_shape, dtype=self.dtype)
+                    x = paddle.static.data(name='x',
+                                           shape=self.x_shape,
+                                           dtype=self.dtype)
+                    y = paddle.static.data(name='y',
+                                           shape=self.y_shape,
+                                           dtype=self.dtype)
                     z = paddle.tensordot(x, y, axes)
                     exe = paddle.static.Executor(place)
-                    paddle_res = exe.run(feed={'x': self.x,
-                                               'y': self.y},
+                    paddle_res = exe.run(feed={
+                        'x': self.x,
+                        'y': self.y
+                    },
                                          fetch_list=[z])
                     np_res = tensordot_np(self.x, self.y, axes)
                     np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6)
 
 
 class TestTensordotAPIFloat64(TestTensordotAPI):
+
     def set_dtype(self):
         self.dtype = np.float64
 
 
 class TestTensordotAPIBroadcastCase1(TestTensordotAPI):
+
     def set_input_shape(self):
         self.x_shape = [1, 1, 1, 5]
         self.y_shape = [1, 5, 1, 1]
 
 
 class TestTensordotAPIBroadcastCase2(TestTensordotAPI):
+
     def set_input_shape(self):
         self.x_shape = [1, 5, 5, 5]
         self.y_shape = [1, 1, 1, 5]
 
 
 class TestTensordotAPIBroadcastCase3(TestTensordotAPI):
+
     def set_input_shape(self):
         self.x_shape = [5, 5, 5, 1]
         self.y_shape = [5, 5, 1, 5]
 
 
 class TestTensordotAPIBroadcastCase4(TestTensordotAPI):
+
     def set_input_shape(self):
         self.x_shape = [5, 5, 5, 1]
         self.y_shape = [1, 1, 1, 1]
 
 
 class TestTensordotAPIBroadcastCase5(TestTensordotAPI):
+
     def set_input_shape(self):
         self.x_shape = [1, 1, 5, 5]
         self.y_shape = [5, 5, 1, 5]
 
 
 class TestTensordotAPIAxesType(TestTensordotAPI):
+
     def set_input_shape(self):
         self.x_shape = [3, 4, 4]
         self.y_shape = [4, 4, 5]
 
     def set_test_axes(self):
         self.all_axes = [
-            0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), (
-                (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]),
+            0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )),
+            ((1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]),
             [[1, 2], [0, 1]]
         ]
 
@@ -217,7 +239,8 @@ def test_tensor_axes(self):
         tensor_axes = [
             paddle.to_tensor([1]), (paddle.to_tensor([1])),
             (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])),
-            [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])],
+            [paddle.to_tensor([1, 2]),
+             paddle.to_tensor([0, 1])],
             paddle.to_tensor([[1, 2], [0, 1]])
         ]
 
@@ -241,6 +264,7 @@ def test_error(self):
 
 
 class TestTensordotAPIAxesTypeFloat64(TestTensordotAPIAxesType):
+
     def set_dtype(self):
         self.dtype = np.float64
 
diff --git a/python/paddle/fluid/tests/unittests/test_tf32_cublas.py b/python/paddle/fluid/tests/unittests/test_tf32_cublas.py
index 32d8c3dc322e4..ce08c8db89e28 100644
--- a/python/paddle/fluid/tests/unittests/test_tf32_cublas.py
+++ b/python/paddle/fluid/tests/unittests/test_tf32_cublas.py
@@ -21,6 +21,7 @@
 
 
 class TestTF32Switch(unittest.TestCase):
+
     def test_on_off(self):
         if core.is_compiled_with_cuda():
             place = fluid.CUDAPlace(0)
@@ -36,6 +37,7 @@ def test_on_off(self):
 
 
 class TestTF32OnMatmul(unittest.TestCase):
+
     def test_dygraph_without_out(self):
         if core.is_compiled_with_cuda():
             place = fluid.CUDAPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/test_tf32_cudnn.py b/python/paddle/fluid/tests/unittests/test_tf32_cudnn.py
index 48127c2a90b49..fb1687bc1b783 100644
--- a/python/paddle/fluid/tests/unittests/test_tf32_cudnn.py
+++ b/python/paddle/fluid/tests/unittests/test_tf32_cudnn.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 
 
 class TestTF32Switch(unittest.TestCase):
+
     def test_on_off(self):
         if core.is_compiled_with_cuda():
             self.assertTrue(core.get_cudnn_switch())  # default
diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py
index 8359141f309f5..c1c6820d9c17e 100644
--- a/python/paddle/fluid/tests/unittests/test_tile_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
@@ -24,6 +24,7 @@
 
 #Situation 1: repeat_times is a list (without tensor)
 class TestTileOpRank1(OpTest):
+
     def setUp(self):
         self.op_type = "tile"
         self.init_data()
@@ -46,36 +47,42 @@ def test_check_grad(self):
 
 # with dimension expanding
 class TestTileOpRank2Expanding(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = [120]
         self.repeat_times = [2, 2]
 
 
 class TestTileOpRank2(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [2, 3]
 
 
 class TestTileOpRank3_Corner(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.repeat_times = (1, 1, 1)
 
 
 class TestTileOpRank3_Corner2(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.repeat_times = (2, 2)
 
 
 class TestTileOpRank3(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 15)
         self.repeat_times = (2, 1, 4)
 
 
 class TestTileOpRank4(TestTileOpRank1):
+
     def init_data(self):
         self.ori_shape = (2, 4, 5, 7)
         self.repeat_times = (3, 2, 1, 2)
@@ -83,6 +90,7 @@ def init_data(self):
 
 # Situation 2: repeat_times is a list (with tensor)
 class TestTileOpRank1_tensor_attr(OpTest):
+
     def setUp(self):
         self.op_type = "tile"
         self.init_data()
@@ -112,6 +120,7 @@ def test_check_grad(self):
 
 
 class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [1, 1]
@@ -119,6 +128,7 @@ def init_data(self):
 
 
 class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [2, 3]
@@ -127,6 +137,7 @@ def init_data(self):
 
 # Situation 3: repeat_times is a tensor
 class TestTileOpRank1_tensor(OpTest):
+
     def setUp(self):
         self.op_type = "tile"
         self.init_data()
@@ -151,6 +162,7 @@ def test_check_grad(self):
 
 
 class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
+
     def init_data(self):
         self.ori_shape = [12, 14]
         self.repeat_times = [2, 3]
@@ -158,11 +170,11 @@ def init_data(self):
 
 # Situation 4: input x is Integer
 class TestTileOpInteger(OpTest):
+
     def setUp(self):
         self.op_type = "tile"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(4, 4, 5)).astype("int32")
+            'X': np.random.randint(10, size=(4, 4, 5)).astype("int32")
         }
         self.attrs = {'repeat_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
@@ -174,6 +186,7 @@ def test_check_output(self):
 
 # Situation 5: input x is Bool
 class TestTileOpBoolean(OpTest):
+
     def setUp(self):
         self.op_type = "tile"
         self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")}
@@ -187,11 +200,11 @@ def test_check_output(self):
 
 # Situation 56: input x is Integer
 class TestTileOpInt64_t(OpTest):
+
     def setUp(self):
         self.op_type = "tile"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int64")
+            'X': np.random.randint(10, size=(2, 4, 5)).astype("int64")
         }
         self.attrs = {'repeat_times': [2, 1, 4]}
         output = np.tile(self.inputs['X'], (2, 1, 4))
@@ -202,10 +215,11 @@ def test_check_output(self):
 
 
 class TestTileError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             repeat_times = [2, 2]
             self.assertRaises(TypeError, paddle.tile, x1, repeat_times)
             x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8")
@@ -216,6 +230,7 @@ def test_errors(self):
 
 
 class TestTileAPIStatic(unittest.TestCase):
+
     def test_api(self):
         with program_guard(Program(), Program()):
             repeat_times = [2, 2]
@@ -227,6 +242,7 @@ def test_api(self):
 
 # Test python API
 class TestTileAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             np_x = np.random.random([12, 14]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
index 83a940d064e76..4b67c9fd11a17 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
@@ -22,6 +22,7 @@
 
 
 class TestTopkOp(OpTest):
+
     def setUp(self):
         self.variable_k = False
         self.set_args()
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
index c4f50414f954e..4e2aecaca133e 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
@@ -39,6 +39,7 @@ def numpy_topk(x, k=1, axis=-1, largest=True):
 
 
 class TestTopkOp(OpTest):
+
     def init_args(self):
         self.k = 3
         self.axis = 1
@@ -52,8 +53,10 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
     def test_check_output(self):
@@ -64,6 +67,7 @@ def test_check_grad(self):
 
 
 class TestTopkOp1(TestTopkOp):
+
     def init_args(self):
         self.k = 3
         self.axis = 0
@@ -71,6 +75,7 @@ def init_args(self):
 
 
 class TestTopkOp2(TestTopkOp):
+
     def init_args(self):
         self.k = 4
         self.axis = 0
@@ -78,6 +83,7 @@ def init_args(self):
 
 
 class TestTopkOp3(OpTest):
+
     def init_args(self):
         self.k = 6
         self.axis = 1
@@ -91,12 +97,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopkOp4(TestTopkOp):
+
     def init_args(self):
         self.k = 3
         self.axis = 1
@@ -110,12 +119,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopkOp5(TestTopkOp):
+
     def init_args(self):
         self.k = 3
         self.axis = 1
@@ -129,12 +141,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopkOp6(OpTest):
+
     def init_args(self):
         self.k = 100
         self.axis = 1
@@ -148,12 +163,15 @@ def setUp(self):
         self.init_args()
         self.inputs = {'X': self.input_data}
         self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest}
-        output, indices = numpy_topk(
-            self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+        output, indices = numpy_topk(self.input_data,
+                                     axis=self.axis,
+                                     k=self.k,
+                                     largest=self.largest)
         self.outputs = {'Out': output, 'Indices': indices}
 
 
 class TestTopKAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(123)
         self.input_data = np.random.rand(6, 7, 8)
@@ -187,35 +205,44 @@ def run_dygraph(self, place):
                 np.allclose(paddle_result[1].numpy(), numpy_result[1]))
             # test case for basic test case 4 with tensor largest
             k_tensor = paddle.to_tensor(np.array([2]))
-            paddle_result = paddle.topk(
-                input_tensor, k=2, axis=1, largest=False)
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=1, largest=False)
+            paddle_result = paddle.topk(input_tensor,
+                                        k=2,
+                                        axis=1,
+                                        largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=1,
+                                      largest=False)
             self.assertTrue(
                 np.allclose(paddle_result[0].numpy(), numpy_result[0]))
             self.assertTrue(
                 np.allclose(paddle_result[1].numpy(), numpy_result[1]))
             # test case for basic test case 5 with axis -1
             k_tensor = paddle.to_tensor(np.array([2]))
-            paddle_result = paddle.topk(
-                input_tensor, k=2, axis=-1, largest=False)
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=-1, largest=False)
+            paddle_result = paddle.topk(input_tensor,
+                                        k=2,
+                                        axis=-1,
+                                        largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=-1,
+                                      largest=False)
             self.assertTrue(
                 np.allclose(paddle_result[0].numpy(), numpy_result[0]))
             self.assertTrue(
                 np.allclose(paddle_result[1].numpy(), numpy_result[1]))
-            # test case for basic test case 6 for the partial sort 
+            # test case for basic test case 6 for the partial sort
             paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1)
             numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
             self.assertTrue(
                 np.allclose(paddle_result[0].numpy(), numpy_result[0]))
             self.assertTrue(
                 np.allclose(paddle_result[1].numpy(), numpy_result[1]))
-            # test case for basic test case 7 for the unsorted 
+            # test case for basic test case 7 for the unsorted
             paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False)
-            sort_paddle = numpy_topk(
-                np.array(paddle_result[0].numpy()), axis=1, k=2)
+            sort_paddle = numpy_topk(np.array(paddle_result[0].numpy()),
+                                     axis=1,
+                                     k=2)
             numpy_result = numpy_topk(self.input_data, k=2, axis=1)
             self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0]))
 
@@ -223,10 +250,12 @@ def run_static(self, place):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
-            input_tensor = paddle.static.data(
-                name="x", shape=[6, 7, 8], dtype="float64")
-            large_input_tensor = paddle.static.data(
-                name="large_x", shape=[2, 1030], dtype="float64")
+            input_tensor = paddle.static.data(name="x",
+                                              shape=[6, 7, 8],
+                                              dtype="float64")
+            large_input_tensor = paddle.static.data(name="large_x",
+                                                    shape=[2, 1030],
+                                                    dtype="float64")
             k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32")
             result1 = paddle.topk(input_tensor, k=2)
             result2 = paddle.topk(input_tensor, k=2, axis=-1)
@@ -240,17 +269,18 @@ def run_static(self, place):
             exe = paddle.static.Executor(place)
             input_data = np.random.rand(10, 20).astype("float64")
             large_input_data = np.random.rand(2, 100).astype("float64")
-            paddle_result = exe.run(
-                feed={
-                    "x": self.input_data,
-                    "large_x": self.large_input_data,
-                    "k": np.array([2]).astype("int32")
-                },
-                fetch_list=[
-                    result1[0], result1[1], result2[0], result2[1], result3[0],
-                    result3[1], result4[0], result4[1], result5[0], result5[1],
-                    result6[0], result6[1], result7[0], result7[1]
-                ])
+            paddle_result = exe.run(feed={
+                "x": self.input_data,
+                "large_x": self.large_input_data,
+                "k": np.array([2]).astype("int32")
+            },
+                                    fetch_list=[
+                                        result1[0], result1[1], result2[0],
+                                        result2[1], result3[0], result3[1],
+                                        result4[0], result4[1], result5[0],
+                                        result5[1], result6[0], result6[1],
+                                        result7[0], result7[1]
+                                    ])
             numpy_result = numpy_topk(self.input_data, k=2)
             self.assertTrue(np.allclose(paddle_result[0], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[1], numpy_result[1]))
@@ -260,12 +290,16 @@ def run_static(self, place):
             numpy_result = numpy_topk(self.input_data, k=2, axis=1)
             self.assertTrue(np.allclose(paddle_result[4], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[5], numpy_result[1]))
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=1, largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=1,
+                                      largest=False)
             self.assertTrue(np.allclose(paddle_result[6], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[7], numpy_result[1]))
-            numpy_result = numpy_topk(
-                self.input_data, k=2, axis=-1, largest=False)
+            numpy_result = numpy_topk(self.input_data,
+                                      k=2,
+                                      axis=-1,
+                                      largest=False)
             self.assertTrue(np.allclose(paddle_result[8], numpy_result[0]))
             self.assertTrue(np.allclose(paddle_result[9], numpy_result[1]))
             numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1)
diff --git a/python/paddle/fluid/tests/unittests/test_trace_op.py b/python/paddle/fluid/tests/unittests/test_trace_op.py
index 3320b240e5615..bb6bbcf4e9cb9 100644
--- a/python/paddle/fluid/tests/unittests/test_trace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trace_op.py
@@ -25,6 +25,7 @@
 
 
 class TestTraceOp(OpTest):
+
     def setUp(self):
         self.op_type = "trace"
         self.init_config()
@@ -44,30 +45,31 @@ def init_config(self):
 
 
 class TestTraceOpCase1(TestTraceOp):
+
     def init_config(self):
         self.case = np.random.randn(2, 20, 2, 3).astype('float32')
         self.inputs = {'Input': self.case}
         self.attrs = {'offset': 1, 'axis1': 0, 'axis2': 2}
-        self.target = np.trace(
-            self.inputs['Input'],
-            offset=self.attrs['offset'],
-            axis1=self.attrs['axis1'],
-            axis2=self.attrs['axis2'])
+        self.target = np.trace(self.inputs['Input'],
+                               offset=self.attrs['offset'],
+                               axis1=self.attrs['axis1'],
+                               axis2=self.attrs['axis2'])
 
 
 class TestTraceOpCase2(TestTraceOp):
+
     def init_config(self):
         self.case = np.random.randn(2, 20, 2, 3).astype('float32')
         self.inputs = {'Input': self.case}
         self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1}
-        self.target = np.trace(
-            self.inputs['Input'],
-            offset=self.attrs['offset'],
-            axis1=self.attrs['axis1'],
-            axis2=self.attrs['axis2'])
+        self.target = np.trace(self.inputs['Input'],
+                               offset=self.attrs['offset'],
+                               axis1=self.attrs['axis1'],
+                               axis2=self.attrs['axis2'])
 
 
 class TestTraceAPICase(unittest.TestCase):
+
     def test_case1(self):
         case = np.random.randn(2, 20, 2, 3).astype('float32')
         data1 = fluid.data(name='data1', shape=[2, 20, 2, 3], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py
index 5703ce1313176..a2ccfa925ed81 100644
--- a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py
+++ b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py
@@ -22,6 +22,7 @@
 
 
 class SimpleFCLayer(nn.Layer):
+
     def __init__(self, feature_size, batch_size, fc_size):
         super(SimpleFCLayer, self).__init__()
         self._linear = nn.Linear(feature_size, fc_size)
@@ -34,6 +35,7 @@ def forward(self, x):
 
 
 class LinearNetWithNone(nn.Layer):
+
     def __init__(self, feature_size, fc_size):
         super(LinearNetWithNone, self).__init__()
         self._linear = nn.Linear(feature_size, fc_size)
@@ -45,6 +47,7 @@ def forward(self, x):
 
 
 class TestTracedLayerErrMsg(unittest.TestCase):
+
     def setUp(self):
         self.batch_size = 4
         self.feature_size = 3
@@ -57,27 +60,27 @@ def test_trace_err(self):
             return
         with fluid.dygraph.guard():
             in_x = fluid.dygraph.to_variable(
-                np.random.random((self.batch_size, self.feature_size)).astype(
-                    'float32'))
+                np.random.random(
+                    (self.batch_size, self.feature_size)).astype('float32'))
 
             with self.assertRaises(AssertionError) as e:
                 dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                     None, [in_x])
             self.assertEqual(
-                "The type of 'layer' in fluid.dygraph.jit.TracedLayer.trace must be fluid.dygraph.Layer, but received <{} 'NoneType'>.".
-                format(self.type_str), str(e.exception))
+                "The type of 'layer' in fluid.dygraph.jit.TracedLayer.trace must be fluid.dygraph.Layer, but received <{} 'NoneType'>."
+                .format(self.type_str), str(e.exception))
             with self.assertRaises(TypeError) as e:
                 dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                     self.layer, 3)
             self.assertEqual(
-                "The type of 'each element of inputs' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received <{} 'int'>.".
-                format(self.type_str), str(e.exception))
+                "The type of 'each element of inputs' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received <{} 'int'>."
+                .format(self.type_str), str(e.exception))
             with self.assertRaises(TypeError) as e:
                 dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                     self.layer, [True, 1])
             self.assertEqual(
-                "The type of 'each element of inputs' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received <{} 'bool'>.".
-                format(self.type_str), str(e.exception))
+                "The type of 'each element of inputs' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received <{} 'bool'>."
+                .format(self.type_str), str(e.exception))
 
             dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                 self.layer, [in_x])
@@ -87,22 +90,22 @@ def test_set_strategy_err(self):
             return
         with fluid.dygraph.guard():
             in_x = fluid.dygraph.to_variable(
-                np.random.random((self.batch_size, self.feature_size)).astype(
-                    'float32'))
+                np.random.random(
+                    (self.batch_size, self.feature_size)).astype('float32'))
             dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                 self.layer, [in_x])
 
             with self.assertRaises(AssertionError) as e:
                 traced_layer.set_strategy(1, fluid.ExecutionStrategy())
             self.assertEqual(
-                "The type of 'build_strategy' in fluid.dygraph.jit.TracedLayer.set_strategy must be fluid.BuildStrategy, but received <{} 'int'>.".
-                format(self.type_str), str(e.exception))
+                "The type of 'build_strategy' in fluid.dygraph.jit.TracedLayer.set_strategy must be fluid.BuildStrategy, but received <{} 'int'>."
+                .format(self.type_str), str(e.exception))
 
             with self.assertRaises(AssertionError) as e:
                 traced_layer.set_strategy(fluid.BuildStrategy(), False)
             self.assertEqual(
-                "The type of 'exec_strategy' in fluid.dygraph.jit.TracedLayer.set_strategy must be fluid.ExecutionStrategy, but received <{} 'bool'>.".
-                format(self.type_str), str(e.exception))
+                "The type of 'exec_strategy' in fluid.dygraph.jit.TracedLayer.set_strategy must be fluid.ExecutionStrategy, but received <{} 'bool'>."
+                .format(self.type_str), str(e.exception))
 
             traced_layer.set_strategy(build_strategy=fluid.BuildStrategy())
             traced_layer.set_strategy(exec_strategy=fluid.ExecutionStrategy())
@@ -114,8 +117,8 @@ def test_save_inference_model_err(self):
             return
         with fluid.dygraph.guard():
             in_x = fluid.dygraph.to_variable(
-                np.random.random((self.batch_size, self.feature_size)).astype(
-                    'float32'))
+                np.random.random(
+                    (self.batch_size, self.feature_size)).astype('float32'))
             dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                 self.layer, [in_x])
 
@@ -123,29 +126,29 @@ def test_save_inference_model_err(self):
             with self.assertRaises(TypeError) as e:
                 traced_layer.save_inference_model([0])
             self.assertEqual(
-                "The type of 'path' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'str'>, but received <{} 'list'>. ".
-                format(self.type_str, self.type_str), str(e.exception))
+                "The type of 'path' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'str'>, but received <{} 'list'>. "
+                .format(self.type_str, self.type_str), str(e.exception))
             with self.assertRaises(TypeError) as e:
                 traced_layer.save_inference_model(path, [0], [None])
             self.assertEqual(
-                "The type of 'each element of fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ".
-                format(self.type_str, self.type_str), str(e.exception))
+                "The type of 'each element of fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. "
+                .format(self.type_str, self.type_str), str(e.exception))
             with self.assertRaises(TypeError) as e:
                 traced_layer.save_inference_model(path, [0], False)
             self.assertEqual(
-                "The type of 'fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. ".
-                format(self.type_str, self.type_str, self.type_str),
+                "The type of 'fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. "
+                .format(self.type_str, self.type_str, self.type_str),
                 str(e.exception))
             with self.assertRaises(TypeError) as e:
                 traced_layer.save_inference_model(path, [None], [0])
             self.assertEqual(
-                "The type of 'each element of feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ".
-                format(self.type_str, self.type_str), str(e.exception))
+                "The type of 'each element of feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. "
+                .format(self.type_str, self.type_str), str(e.exception))
             with self.assertRaises(TypeError) as e:
                 traced_layer.save_inference_model(path, True, [0])
             self.assertEqual(
-                "The type of 'feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. ".
-                format(self.type_str, self.type_str, self.type_str),
+                "The type of 'feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. "
+                .format(self.type_str, self.type_str, self.type_str),
                 str(e.exception))
             with self.assertRaises(ValueError) as e:
                 traced_layer.save_inference_model("")
@@ -165,8 +168,8 @@ def _train_simple_net(self):
 
             for i in range(5):
                 in_x = fluid.dygraph.to_variable(
-                    np.random.random((self.batch_size, self.feature_size))
-                    .astype('float32'))
+                    np.random.random(
+                        (self.batch_size, self.feature_size)).astype('float32'))
                 dygraph_out = layer(in_x)
                 loss = fluid.layers.reduce_mean(dygraph_out)
                 loss.backward()
@@ -175,14 +178,15 @@ def _train_simple_net(self):
 
 
 class TestOutVarWithNoneErrMsg(unittest.TestCase):
+
     def test_linear_net_with_none(self):
         if fluid.framework.in_dygraph_mode():
             return
         model = LinearNetWithNone(100, 16)
         in_x = paddle.to_tensor(np.random.random((4, 100)).astype('float32'))
         with self.assertRaises(TypeError):
-            dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(model,
-                                                                        [in_x])
+            dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
+                model, [in_x])
 
 
 class TestTracedLayerSaveInferenceModel(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_trainable.py b/python/paddle/fluid/tests/unittests/test_trainable.py
index 35ae9d9b47c38..72edff9f29b34 100644
--- a/python/paddle/fluid/tests/unittests/test_trainable.py
+++ b/python/paddle/fluid/tests/unittests/test_trainable.py
@@ -32,6 +32,7 @@ def test_trainable():
 
 
 class TestTrainable(unittest.TestCase):
+
     def check_trainable(self,
                         model,
                         feed_dict,
@@ -64,18 +65,21 @@ def test_trainable(self):
         feed_dict = {'image': img, 'label': label}
         # Note that, because the Weight of FC is not trainable and the x is stop_gradient,
         # so the 'mul_grad' should not be appended.
+        self.check_trainable(test_trainable,
+                             feed_dict,
+                             op_count={
+                                 'adam': 1,
+                                 'scale': 0,
+                                 'mul_grad': 0
+                             })
         self.check_trainable(
             test_trainable,
             feed_dict,
-            op_count={'adam': 1,
-                      'scale': 0,
-                      'mul_grad': 0})
-        self.check_trainable(
-            test_trainable,
-            feed_dict,
-            op_count={'adamax': 1,
-                      'scale': 1,
-                      'mul_grad': 0},
+            op_count={
+                'adamax': 1,
+                'scale': 1,
+                'mul_grad': 0
+            },
             optimizer=fluid.optimizer.Adamax(learning_rate=0.2))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_transfer_dtype_op.py b/python/paddle/fluid/tests/unittests/test_transfer_dtype_op.py
index 637a6c144685b..0d213994e0bf0 100644
--- a/python/paddle/fluid/tests/unittests/test_transfer_dtype_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transfer_dtype_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,6 +24,7 @@
 
 
 class TestTransferDtypeOpFp32ToFp64(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -39,6 +40,7 @@ def test_check_output(self):
 
 
 class TestTransferDtypeOpFp16ToFp32(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float16')}
@@ -54,6 +56,7 @@ def test_check_output(self):
 
 
 class TestTransferDtypeOpFp32ToFp16(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -69,6 +72,7 @@ def test_check_output(self):
 
 
 class TestTransferDtypeOpBf16ToFp32(OpTest):
+
     def setUp(self):
         ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16')
         self.inputs = {'X': ipt}
@@ -84,6 +88,7 @@ def test_check_output(self):
 
 
 class TestTransferDtypeFp32ToBf16(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[10, 10]).astype('float32')
         self.inputs = {'X': ipt}
diff --git a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
index 94644cf2fec1d..e7a373e4c24c2 100644
--- a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 # default kNCHW
 class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
+
     def setUp(self):
         ipt = np.random.random(size=[2, 3, 10, 10])
         self.inputs = {'X': ipt.astype('float32')}
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index 587cedc6aad74..6b254ac3115d4 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -59,8 +59,8 @@ def generate_query_key_value_cache(self_attention,
                                    cache=None):
     query = np.random.rand(batch_size, query_length,
                            embed_dim).astype("float32")
-    attn_mask = np.ones(
-        (batch_size, num_heads, query_length, key_length), dtype=attn_mask_type)
+    attn_mask = np.ones((batch_size, num_heads, query_length, key_length),
+                        dtype=attn_mask_type)
     if attn_mask_type == 'int64':
         attn_mask = np.tril(attn_mask)
     elif attn_mask_type == 'float64':
@@ -77,15 +77,18 @@ def generate_query_key_value_cache(self_attention,
     cache_dict = {}
     if cache:
         if not self_attention:
-            cache_dict["static_k"] = np.random.rand(
-                batch_size, num_heads, key_length, head_dim).astype("float32")
-            cache_dict["static_v"] = np.random.rand(
-                batch_size, num_heads, value_length, head_dim).astype("float32")
+            cache_dict["static_k"] = np.random.rand(batch_size, num_heads,
+                                                    key_length,
+                                                    head_dim).astype("float32")
+            cache_dict["static_v"] = np.random.rand(batch_size, num_heads,
+                                                    value_length,
+                                                    head_dim).astype("float32")
         else:
             cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length,
                                              head_dim).astype("float32")
-            cache_dict["v"] = np.random.rand(
-                batch_size, num_heads, value_length, head_dim).astype("float32")
+            cache_dict["v"] = np.random.rand(batch_size, num_heads,
+                                             value_length,
+                                             head_dim).astype("float32")
     else:
         cache_dict = None
     return query, key, value, attn_mask, cache_dict
@@ -110,8 +113,8 @@ def softmax(x):
 def batch_matmul(x, y):
     assert x.shape[0] == y.shape[0]
     assert x.shape[1] == y.shape[1]
-    retval = np.zeros(
-        (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64)
+    retval = np.zeros((x.shape[0], x.shape[1], x.shape[2], y.shape[3]),
+                      dtype=np.float64)
     for i in range(x.shape[0]):
         for j in range(x.shape[1]):
             retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :])
@@ -220,7 +223,9 @@ def ffn(src, encoder_layer, ffn_fc1_act="relu"):
 
 
 class TestTransformer(unittest.TestCase):
+
     def test_multi_head_attention(self):
+
         def multihead_attention_test_helper(self_attention, cache):
             paddle.seed(2020)
             paddle.framework.random._manual_program_seed(2020)
@@ -236,8 +241,8 @@ def multihead_attention_test_helper(self_attention, cache):
                         embed_dim, attn_mask_type, key_length, value_length,
                         kdim, vdim, cache)
                     if cache and self_attention:
-                        attn_mask = np.concatenate(
-                            (attn_mask, attn_mask), axis=3)
+                        attn_mask = np.concatenate((attn_mask, attn_mask),
+                                                   axis=3)
                     need_weight, param_attr, bias_attr = False, None, None
                     # call paddle's function
                     multi_head_attn = MultiHeadAttention(
@@ -256,15 +261,14 @@ def multihead_attention_test_helper(self_attention, cache):
                                 paddle.to_tensor(cache_dict['static_v']))
                     if attn_mask is not None:
                         attn_output = multi_head_attn(
-                            paddle.to_tensor(query),
-                            paddle.to_tensor(key),
+                            paddle.to_tensor(query), paddle.to_tensor(key),
                             paddle.to_tensor(value),
                             paddle.to_tensor(attn_mask), cache_obj)
                     else:
-                        attn_output = multi_head_attn(
-                            paddle.to_tensor(query),
-                            paddle.to_tensor(key),
-                            paddle.to_tensor(value), attn_mask, cache_obj)
+                        attn_output = multi_head_attn(paddle.to_tensor(query),
+                                                      paddle.to_tensor(key),
+                                                      paddle.to_tensor(value),
+                                                      attn_mask, cache_obj)
                     attn_output = attn_output[0] if cache_dict else attn_output
 
                     # implementation by numpy
@@ -279,8 +283,9 @@ def multihead_attention_test_helper(self_attention, cache):
                     out_proj_weight = multi_head_attn.out_proj.weight.numpy()
                     reference = fc(attn_heads, out_proj_weight)
 
-                    np.testing.assert_allclose(
-                        attn_output.numpy(), reference, atol=1e-6)
+                    np.testing.assert_allclose(attn_output.numpy(),
+                                               reference,
+                                               atol=1e-6)
 
         multihead_attention_test_helper(True, True)
         multihead_attention_test_helper(True, False)
@@ -306,21 +311,23 @@ def test_transformer_encoder_layer(self):
             src_mask[0][0][0][0] = -np.inf
 
             # paddle
-            encoder_layer = TransformerEncoderLayer(
-                d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
-                attn_dropout, act_dropout)
+            encoder_layer = TransformerEncoderLayer(d_model, n_head,
+                                                    dim_feedforward, dropout,
+                                                    ffn_fc1_act, attn_dropout,
+                                                    act_dropout)
 
             encoder_output = encoder_layer(
                 paddle.to_tensor(src),
                 paddle.to_tensor(src_mask))  # paddle.to_tensor(src_mask))
             # 4.numpy:
             # paddle self attention
-            self_attn = MultiHeadAttention(
-                d_model, n_head, dropout=attn_dropout)
-            attn_output = self_attn(
-                paddle.to_tensor(src),
-                paddle.to_tensor(src),
-                paddle.to_tensor(src), paddle.to_tensor(src_mask)).numpy()
+            self_attn = MultiHeadAttention(d_model,
+                                           n_head,
+                                           dropout=attn_dropout)
+            attn_output = self_attn(paddle.to_tensor(src),
+                                    paddle.to_tensor(src),
+                                    paddle.to_tensor(src),
+                                    paddle.to_tensor(src_mask)).numpy()
 
             src = attn_output + residual
             src_norm = layer_norm(src, d_model, encoder_layer.norm1)
@@ -330,8 +337,10 @@ def test_transformer_encoder_layer(self):
             src = residual + ffn_output
             src = layer_norm(src, d_model, encoder_layer.norm2)
 
-            np.testing.assert_allclose(
-                encoder_output.numpy(), src, rtol=1e-5, atol=1e-6)
+            np.testing.assert_allclose(encoder_output.numpy(),
+                                       src,
+                                       rtol=1e-5,
+                                       atol=1e-6)
 
     def test_transformer_encoder_layer_attr_1(self):
         with fluid.dygraph.guard(fluid.CPUPlace()):
@@ -351,29 +360,31 @@ def test_transformer_encoder_layer_attr_1(self):
 
             for cache in [True, False]:
                 # paddle
-                encoder_layer = TransformerEncoderLayer(
-                    d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
-                    attn_dropout, act_dropout)
+                encoder_layer = TransformerEncoderLayer(d_model, n_head,
+                                                        dim_feedforward,
+                                                        dropout, ffn_fc1_act,
+                                                        attn_dropout,
+                                                        act_dropout)
                 cache_objs = None
                 if cache:
                     cache_objs = encoder_layer.gen_cache(paddle.to_tensor(src))
 
-                encoder_output = encoder_layer(
-                    paddle.to_tensor(src),
-                    paddle.to_tensor(src_mask), cache_objs)
+                encoder_output = encoder_layer(paddle.to_tensor(src),
+                                               paddle.to_tensor(src_mask),
+                                               cache_objs)
                 encoder_output = encoder_output[0].numpy(
                 ) if cache else encoder_output.numpy()
 
                 # 4.numpy:
                 residual = src
                 # paddle self attention
-                self_attn = MultiHeadAttention(
-                    d_model, n_head, dropout=attn_dropout)
-                attn_output = self_attn(
-                    paddle.to_tensor(src),
-                    paddle.to_tensor(src),
-                    paddle.to_tensor(src),
-                    paddle.to_tensor(src_mask), cache_objs)
+                self_attn = MultiHeadAttention(d_model,
+                                               n_head,
+                                               dropout=attn_dropout)
+                attn_output = self_attn(paddle.to_tensor(src),
+                                        paddle.to_tensor(src),
+                                        paddle.to_tensor(src),
+                                        paddle.to_tensor(src_mask), cache_objs)
                 attn_output = attn_output[0].numpy(
                 ) if cache else attn_output.numpy()
 
@@ -385,8 +396,10 @@ def test_transformer_encoder_layer_attr_1(self):
                 src = residual + ffn_output
                 src = layer_norm(src, d_model, encoder_layer.norm2)
 
-                np.testing.assert_allclose(
-                    encoder_output, src, rtol=1e-5, atol=1e-6)
+                np.testing.assert_allclose(encoder_output,
+                                           src,
+                                           rtol=1e-5,
+                                           atol=1e-6)
 
     def test_transformer_decoder_layer(self):
         with fluid.dygraph.guard(fluid.CPUPlace()):
@@ -406,10 +419,12 @@ def test_transformer_decoder_layer(self):
                                     source_length)).astype("float32")
             memory_mask[0][0][0][0] = -1e9
             for cache in [True, False]:
-                self_attn = MultiHeadAttention(
-                    d_model, n_head, dropout=attn_dropout)
-                cross_attn = MultiHeadAttention(
-                    d_model, n_head, dropout=attn_dropout)
+                self_attn = MultiHeadAttention(d_model,
+                                               n_head,
+                                               dropout=attn_dropout)
+                cross_attn = MultiHeadAttention(d_model,
+                                                n_head,
+                                                dropout=attn_dropout)
 
                 # paddle decoderlayer:
                 decoder_layer = TransformerDecoderLayer(
@@ -420,11 +435,11 @@ def test_transformer_decoder_layer(self):
                     cache_objs = decoder_layer.gen_cache(
                         paddle.to_tensor(memory))
 
-                decoder_output = decoder_layer(
-                    paddle.to_tensor(tgt),
-                    paddle.to_tensor(memory),
-                    paddle.to_tensor(tgt_mask),
-                    paddle.to_tensor(memory_mask), cache_objs)
+                decoder_output = decoder_layer(paddle.to_tensor(tgt),
+                                               paddle.to_tensor(memory),
+                                               paddle.to_tensor(tgt_mask),
+                                               paddle.to_tensor(memory_mask),
+                                               cache_objs)
 
                 decoder_output = decoder_output[0].numpy(
                 ) if cache else decoder_output.numpy()
@@ -434,11 +449,9 @@ def test_transformer_decoder_layer(self):
                 # self-attn
                 self_attn_cache = cache_objs[
                     0] if cache_objs is not None else None
-                tgt = self_attn(
-                    paddle.to_tensor(tgt),
-                    paddle.to_tensor(tgt),
-                    paddle.to_tensor(tgt),
-                    paddle.to_tensor(tgt_mask), self_attn_cache)
+                tgt = self_attn(paddle.to_tensor(tgt), paddle.to_tensor(tgt),
+                                paddle.to_tensor(tgt),
+                                paddle.to_tensor(tgt_mask), self_attn_cache)
 
                 tgt = tgt[0].numpy() if cache else tgt.numpy()
 
@@ -449,11 +462,11 @@ def test_transformer_decoder_layer(self):
                 # cross-attn
                 cross_attn_cache = cache_objs[
                     1] if cache_objs is not None else None
-                tgt = cross_attn(
-                    paddle.to_tensor(tgt_norm),
-                    paddle.to_tensor(memory),
-                    paddle.to_tensor(memory),
-                    paddle.to_tensor(memory_mask), cross_attn_cache)
+                tgt = cross_attn(paddle.to_tensor(tgt_norm),
+                                 paddle.to_tensor(memory),
+                                 paddle.to_tensor(memory),
+                                 paddle.to_tensor(memory_mask),
+                                 cross_attn_cache)
                 tgt = tgt[0].numpy() if cache else tgt.numpy()
 
                 # postprocess
@@ -466,8 +479,10 @@ def test_transformer_decoder_layer(self):
                 tgt = residual + ffn_output
                 tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3)
 
-                np.testing.assert_allclose(
-                    decoder_output, tgt_norm, rtol=1e-5, atol=1e-6)
+                np.testing.assert_allclose(decoder_output,
+                                           tgt_norm,
+                                           rtol=1e-5,
+                                           atol=1e-6)
 
     def test_encoder(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
@@ -485,8 +500,8 @@ def test_encoder(self):
             num_layers = 6
             encoder = TransformerEncoder(encoder_layer, num_layers)
             # src, src_mask
-            enc_output = encoder(
-                paddle.to_tensor(src), paddle.to_tensor(src_mask))
+            enc_output = encoder(paddle.to_tensor(src),
+                                 paddle.to_tensor(src_mask))
 
     def test_encoder_attr_1(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
@@ -501,8 +516,9 @@ def test_encoder_attr_1(self):
         with fluid.dygraph.guard(fluid.CPUPlace()):
             for cache in [True, False]:
                 # paddle
-                encoder_layer = TransformerEncoderLayer(
-                    d_model, n_head, dim_feedforward, dropout)
+                encoder_layer = TransformerEncoderLayer(d_model, n_head,
+                                                        dim_feedforward,
+                                                        dropout)
                 num_layers = 6
                 encoder = TransformerEncoder(encoder_layer, num_layers)
                 cache_objs = None
@@ -510,9 +526,8 @@ def test_encoder_attr_1(self):
                     cache_objs = encoder.gen_cache(paddle.to_tensor(src))
 
                 # src, src_mask
-                enc_output = encoder(
-                    paddle.to_tensor(src),
-                    paddle.to_tensor(src_mask), cache_objs)
+                enc_output = encoder(paddle.to_tensor(src),
+                                     paddle.to_tensor(src_mask), cache_objs)
 
     def test_decoder(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
@@ -533,10 +548,9 @@ def test_decoder(self):
             num_layers = 6
             decoder = TransformerDecoder(decoder_layer, num_layers)
 
-            output = decoder(
-                paddle.to_tensor(tgt),
-                paddle.to_tensor(memory),
-                paddle.to_tensor(tgt_mask), paddle.to_tensor(memory_mask))
+            output = decoder(paddle.to_tensor(tgt), paddle.to_tensor(memory),
+                             paddle.to_tensor(tgt_mask),
+                             paddle.to_tensor(memory_mask))
 
     def test_transformer(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
@@ -544,17 +558,16 @@ def test_transformer(self):
 
         # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
         with fluid.dygraph.guard(fluid.CPUPlace()):
-            transformer = Transformer(
-                d_model,
-                n_head,
-                dim_feedforward=dim_feedforward,
-                dropout=dropout)
+            transformer = Transformer(d_model,
+                                      n_head,
+                                      dim_feedforward=dim_feedforward,
+                                      dropout=dropout)
             src = paddle.to_tensor(
-                np.random.rand(batch_size, source_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, source_length,
+                               d_model).astype("float32"))
             tgt = paddle.to_tensor(
-                np.random.rand(batch_size, target_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, target_length,
+                               d_model).astype("float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
@@ -576,19 +589,18 @@ def test_transformer_attr_1(self):
 
         # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
         with fluid.dygraph.guard(fluid.CPUPlace()):
-            transformer = Transformer(
-                d_model,
-                n_head,
-                dim_feedforward=dim_feedforward,
-                dropout=dropout,
-                weight_attr=[None],
-                bias_attr=[False])
+            transformer = Transformer(d_model,
+                                      n_head,
+                                      dim_feedforward=dim_feedforward,
+                                      dropout=dropout,
+                                      weight_attr=[None],
+                                      bias_attr=[False])
             src = paddle.to_tensor(
-                np.random.rand(batch_size, source_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, source_length,
+                               d_model).astype("float32"))
             tgt = paddle.to_tensor(
-                np.random.rand(batch_size, target_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, target_length,
+                               d_model).astype("float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
@@ -610,19 +622,18 @@ def test_transformer_attr_2(self):
 
         # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
         with fluid.dygraph.guard(fluid.CPUPlace()):
-            transformer = Transformer(
-                d_model,
-                n_head,
-                dim_feedforward=dim_feedforward,
-                dropout=dropout,
-                weight_attr=[None, None],
-                bias_attr=[False, False])
+            transformer = Transformer(d_model,
+                                      n_head,
+                                      dim_feedforward=dim_feedforward,
+                                      dropout=dropout,
+                                      weight_attr=[None, None],
+                                      bias_attr=[False, False])
             src = paddle.to_tensor(
-                np.random.rand(batch_size, source_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, source_length,
+                               d_model).astype("float32"))
             tgt = paddle.to_tensor(
-                np.random.rand(batch_size, target_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, target_length,
+                               d_model).astype("float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
@@ -644,19 +655,18 @@ def test_transformer_attr_3(self):
 
         # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
         with fluid.dygraph.guard(fluid.CPUPlace()):
-            transformer = Transformer(
-                d_model,
-                n_head,
-                dim_feedforward=dim_feedforward,
-                dropout=dropout,
-                weight_attr=[None, None, None],
-                bias_attr=[False, False, True])
+            transformer = Transformer(d_model,
+                                      n_head,
+                                      dim_feedforward=dim_feedforward,
+                                      dropout=dropout,
+                                      weight_attr=[None, None, None],
+                                      bias_attr=[False, False, True])
             src = paddle.to_tensor(
-                np.random.rand(batch_size, source_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, source_length,
+                               d_model).astype("float32"))
             tgt = paddle.to_tensor(
-                np.random.rand(batch_size, target_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, target_length,
+                               d_model).astype("float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
@@ -678,18 +688,17 @@ def test_transformer_attr_boolean(self):
 
         # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
         with fluid.dygraph.guard(fluid.CPUPlace()):
-            transformer = Transformer(
-                d_model,
-                n_head,
-                dim_feedforward=dim_feedforward,
-                dropout=dropout,
-                bias_attr=False)
+            transformer = Transformer(d_model,
+                                      n_head,
+                                      dim_feedforward=dim_feedforward,
+                                      dropout=dropout,
+                                      bias_attr=False)
             src = paddle.to_tensor(
-                np.random.rand(batch_size, source_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, source_length,
+                               d_model).astype("float32"))
             tgt = paddle.to_tensor(
-                np.random.rand(batch_size, target_length, d_model).astype(
-                    "float32"))
+                np.random.rand(batch_size, target_length,
+                               d_model).astype("float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
@@ -708,8 +717,9 @@ def test_transformer_attr_boolean(self):
     def test_generate_square_subsequent_mask(self):
         length = 5
         d_model, n_head, dim_feedforward = 8, 4, 64
-        transformer = Transformer(
-            d_model, n_head, dim_feedforward=dim_feedforward)
+        transformer = Transformer(d_model,
+                                  n_head,
+                                  dim_feedforward=dim_feedforward)
         mask = transformer.generate_square_subsequent_mask(length)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py
index 79652b37b7708..4b0be989efe48 100644
--- a/python/paddle/fluid/tests/unittests/test_translated_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py
@@ -31,6 +31,7 @@
 
 # define a random dataset
 class RandomDataset(paddle.io.Dataset):
+
     def __init__(self, num_samples):
         self.num_samples = num_samples
 
@@ -45,14 +46,16 @@ def __len__(self):
 
 
 class LinearNet(nn.Layer):
+
     def __init__(self):
         super(LinearNet, self).__init__()
         self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
         self._dropout = paddle.nn.Dropout(p=0.5)
 
     @paddle.jit.to_static(input_spec=[
-        paddle.static.InputSpec(
-            shape=[None, IMAGE_SIZE], dtype='float32', name='x')
+        paddle.static.InputSpec(shape=[None, IMAGE_SIZE],
+                                dtype='float32',
+                                name='x')
     ])
     def forward(self, x):
         return self._linear(x)
@@ -72,6 +75,7 @@ def train(layer, loader, loss_fn, opt):
 
 
 class TestTranslatedLayer(unittest.TestCase):
+
     def setUp(self):
         # enable dygraph mode
         place = paddle.CPUPlace()
@@ -89,13 +93,12 @@ def setUp(self):
 
         # create data loader
         dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
-        self.loader = paddle.io.DataLoader(
-            dataset,
-            places=place,
-            batch_size=BATCH_SIZE,
-            shuffle=True,
-            drop_last=True,
-            num_workers=0)
+        self.loader = paddle.io.DataLoader(dataset,
+                                           places=place,
+                                           batch_size=BATCH_SIZE,
+                                           shuffle=True,
+                                           drop_last=True,
+                                           num_workers=0)
 
         # train
         train(self.layer, self.loader, self.loss_fn, self.sgd)
@@ -137,10 +140,9 @@ def load_and_fine_tuning(self):
                       parameters=translated_layer.parameters())
         loss = train(translated_layer, self.loader, self.loss_fn, sgd)
 
-        self.assertTrue(
-            np.array_equal(orig_loss.numpy(), loss.numpy()),
-            msg="original loss:\n{}\nnew loss:\n{}\n".format(orig_loss.numpy(),
-                                                             loss.numpy()))
+        self.assertTrue(np.array_equal(orig_loss.numpy(), loss.numpy()),
+                        msg="original loss:\n{}\nnew loss:\n{}\n".format(
+                            orig_loss.numpy(), loss.numpy()))
 
     def test_get_program(self):
         # load
@@ -161,8 +163,9 @@ def test_get_input_spec(self):
         translated_layer = paddle.jit.load(self.model_path)
 
         expect_spec = [
-            paddle.static.InputSpec(
-                shape=[None, IMAGE_SIZE], dtype='float32', name='x')
+            paddle.static.InputSpec(shape=[None, IMAGE_SIZE],
+                                    dtype='float32',
+                                    name='x')
         ]
         actual_spec = translated_layer._input_spec()
 
@@ -174,10 +177,9 @@ def test_get_output_spec(self):
         translated_layer = paddle.jit.load(self.model_path)
 
         expect_spec = [
-            paddle.static.InputSpec(
-                shape=[None, CLASS_NUM],
-                dtype='float32',
-                name='translated_layer/scale_0.tmp_1')
+            paddle.static.InputSpec(shape=[None, CLASS_NUM],
+                                    dtype='float32',
+                                    name='translated_layer/scale_0.tmp_1')
         ]
         actual_spec = translated_layer._output_spec()
 
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index c890c3c607cb0..d9e293ba67159 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -26,6 +26,7 @@
 
 
 class TestTransposeOp(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.initTestCase()
@@ -56,66 +57,77 @@ def initTestCase(self):
 
 
 class TestCase0(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (100, )
         self.axis = (0, )
 
 
 class TestCase1(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (3, 4, 10)
         self.axis = (0, 2, 1)
 
 
 class TestCase2(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.axis = (0, 2, 3, 1)
 
 
 class TestCase3(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.axis = (4, 2, 3, 1, 0)
 
 
 class TestCase4(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6, 1)
         self.axis = (4, 2, 3, 1, 0, 5)
 
 
 class TestCase5(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 16, 96)
         self.axis = (0, 2, 1)
 
 
 class TestCase6(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 10, 12, 16)
         self.axis = (3, 1, 2, 0)
 
 
 class TestCase7(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 10, 2, 16)
         self.axis = (0, 1, 3, 2)
 
 
 class TestCase8(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (0, 1, 3, 2, 4, 5, 6, 7)
 
 
 class TestCase9(TestTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
 
 
 class TestTransposeBF16Op(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.initTestCase()
@@ -128,9 +140,11 @@ def setUp(self):
             'use_mkldnn': self.use_mkldnn,
         }
         self.outputs = {
-            'XShape': convert_float_to_uint16(
+            'XShape':
+            convert_float_to_uint16(
                 np.random.random(self.shape).astype("float32")),
-            'Out': self.inputs['X'].transpose(self.axis)
+            'Out':
+            self.inputs['X'].transpose(self.axis)
         }
 
     def init_op_type(self):
@@ -149,11 +163,13 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool(TestTransposeOp):
+
     def test_check_grad(self):
         pass
 
 
 class TestTransposeOpBool1D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (100, )
         self.axis = (0, )
@@ -165,6 +181,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool2D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (3, 40)
         self.axis = (1, 0)
@@ -176,6 +193,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool3D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (3, 4, 10)
         self.axis = (0, 2, 1)
@@ -187,6 +205,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool4D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.axis = (0, 2, 3, 1)
@@ -198,6 +217,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool5D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.axis = (4, 2, 3, 1, 0)
@@ -209,6 +229,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool6D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6, 1)
         self.axis = (4, 2, 3, 1, 0, 5)
@@ -220,6 +241,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool7D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3)
         self.axis = (0, 1, 3, 2, 4, 5, 6)
@@ -231,6 +253,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpBool8D(TestTransposeOpBool):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
@@ -242,6 +265,7 @@ def initTestCase(self):
 
 
 class TestTransposeOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -255,8 +279,9 @@ def test_x_Variable_check():
 
             def test_x_dtype_check():
                 # the Input(x)'s dtype must be one of [bool, float16, float32, float64, int32, int64]
-                x1 = fluid.layers.data(
-                    name='x1', shape=[10, 5, 3], dtype='int8')
+                x1 = fluid.layers.data(name='x1',
+                                       shape=[10, 5, 3],
+                                       dtype='int8')
                 fluid.layers.transpose(x1, perm=[1, 0, 2])
 
             self.assertRaises(TypeError, test_x_dtype_check)
@@ -282,6 +307,7 @@ def test_each_elem_value_check():
 
 
 class TestTransposeApi(unittest.TestCase):
+
     def test_static_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program()):
@@ -318,6 +344,7 @@ def test_dygraph_out(self):
 
 
 class TestTAPI(unittest.TestCase):
+
     def test_out(self):
         with fluid.program_guard(fluid.Program()):
             data = fluid.data(shape=[10], dtype="float64", name="data")
@@ -384,6 +411,7 @@ def test_x_dimension_check():
 
 
 class TestMoveAxis(unittest.TestCase):
+
     def test_moveaxis1(self):
         x_np = np.random.randn(2, 3, 4, 5, 7)
         expected = np.moveaxis(x_np, [0, 4, 3, 2], [1, 3, 2, 0])
@@ -426,8 +454,8 @@ def test_moveaxis2(self):
 
     def test_moveaxis3(self):
         paddle.disable_static()
-        x = paddle.to_tensor(
-            [[1 + 1j, -1 - 1j], [1 + 1j, -1 - 1j], [1 + 1j, -1 - 1j]])
+        x = paddle.to_tensor([[1 + 1j, -1 - 1j], [1 + 1j, -1 - 1j],
+                              [1 + 1j, -1 - 1j]])
         out = x.moveaxis(0, 1)
         self.assertEqual(out.shape, [2, 3])
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_tree_conv_op.py b/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
index f35649dd3e80d..114d713b092ac 100644
--- a/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
@@ -51,6 +51,7 @@ def recurse_helper(node, depth):
 
 
 class TestTreeConvOp(OpTest):
+
     def setUp(self):
         self.n = 17
         self.fea_size = 3
@@ -68,24 +69,30 @@ def setUp(self):
         vectors = np.random.random(
             (self.batch_size, self.n, self.fea_size)).astype('float64')
         self.inputs = {
-            'EdgeSet': adj,
-            'NodesVector': vectors,
-            'Filter': np.random.random((self.fea_size, 3, self.output_size,
-                                        self.num_filters)).astype('float64')
+            'EdgeSet':
+            adj,
+            'NodesVector':
+            vectors,
+            'Filter':
+            np.random.random((self.fea_size, 3, self.output_size,
+                              self.num_filters)).astype('float64')
         }
         self.attrs = {'max_depth': self.max_depth}
         vectors = []
         for i in range(self.batch_size):
             vector = self.get_output_naive(i)
             vectors.append(vector)
-        self.outputs = {'Out': np.array(vectors).astype('float64'), }
+        self.outputs = {
+            'Out': np.array(vectors).astype('float64'),
+        }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5)
+        self.check_grad(['NodesVector', 'Filter'],
+                        'Out',
+                        max_relative_error=0.5)
 
     def get_output_naive(self, batch_id):
         og = [[] for i in range(1, self.n + 2)]
@@ -112,28 +119,30 @@ def get_output_naive(self, batch_id):
                 result = result + res
             vec.append(result)
         vec = np.concatenate(vec, axis=0)
-        vec = np.concatenate(
-            [
-                vec, np.zeros(
-                    (self.n - vec.shape[0], W.shape[2], W.shape[3]),
-                    dtype='float64')
-            ],
-            axis=0)
+        vec = np.concatenate([
+            vec,
+            np.zeros((self.n - vec.shape[0], W.shape[2], W.shape[3]),
+                     dtype='float64')
+        ],
+                             axis=0)
         return vec
 
 
 class TestTreeConv_OpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             nodes_vector_1 = np.random.random((10, 5)).astype("float32")
-            edge_set_1 = fluid.layers.data(
-                name='edge_set_1', shape=[10, 2], dtype='float32')
+            edge_set_1 = fluid.layers.data(name='edge_set_1',
+                                           shape=[10, 2],
+                                           dtype='float32')
             # the nodes_vector of tree_conv must be Variable.
             self.assertRaises(TypeError, fluid.contrib.layers.tree_conv,
                               nodes_vector_1, edge_set_1, 3)
 
-            nodes_vector_2 = fluid.layers.data(
-                name='vectors2', shape=[10, 5], dtype='float32')
+            nodes_vector_2 = fluid.layers.data(name='vectors2',
+                                               shape=[10, 5],
+                                               dtype='float32')
             edge_set_2 = np.random.random((10, 2)).astype("float32")
             # the edge_set of tree_conv must be Variable.
             self.assertRaises(TypeError, fluid.contrib.layers.tree_conv,
@@ -141,19 +150,24 @@ def test_errors(self):
 
 
 class TestDygraphTreeConv_OpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            TreeConv = fluid.dygraph.nn.TreeConv(
-                feature_size=5, output_size=6, num_filters=1, max_depth=2)
+            TreeConv = fluid.dygraph.nn.TreeConv(feature_size=5,
+                                                 output_size=6,
+                                                 num_filters=1,
+                                                 max_depth=2)
             nodes_vector_1 = np.random.random((10, 5)).astype("float32")
-            edge_set_1 = fluid.layers.data(
-                name='edge_set_1', shape=[10, 2], dtype='float32')
+            edge_set_1 = fluid.layers.data(name='edge_set_1',
+                                           shape=[10, 2],
+                                           dtype='float32')
             # the nodes_vector of TreeConv must be Variable.
             self.assertRaises(TypeError, TreeConv, nodes_vector_1, edge_set_1,
                               3)
 
-            nodes_vector_2 = fluid.layers.data(
-                name='vectors2', shape=[10, 5], dtype='float32')
+            nodes_vector_2 = fluid.layers.data(name='vectors2',
+                                               shape=[10, 5],
+                                               dtype='float32')
             edge_set_2 = np.random.random((10, 2)).astype("float32")
             # the edge_set of TreeConv must be Variable.
             self.assertRaises(TypeError, TreeConv, nodes_vector_2, edge_set_2,
diff --git a/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py
index 4e79e8dca138e..32363e29f1a5c 100644
--- a/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py
+++ b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 import sys
+
 sys.path.append("..")
 import paddle
 from op_test import OpTest
@@ -42,8 +43,8 @@ def config(self):
         self.dtype = "float64"
 
     def set_output(self):
-        self.output = np.linalg.solve(
-            np.triu(self.inputs['X']), self.inputs['Y'])
+        self.output = np.linalg.solve(np.triu(self.inputs['X']),
+                                      self.inputs['Y'])
 
     def setUp(self):
         self.op_type = "triangular_solve"
@@ -159,11 +160,10 @@ def test_check_grad_normal(self):
         grad_x = np.triu(grad_x)
         np.fill_diagonal(grad_x, 0.)
 
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[grad_x, grad_y],
-            user_defined_grad_outputs=[grad_out])
+        self.check_grad(['X', 'Y'],
+                        'Out',
+                        user_defined_grads=[grad_x, grad_y],
+                        user_defined_grad_outputs=[grad_out])
 
 
 # 4D(broadcast) + 4D(broadcast)
@@ -247,6 +247,7 @@ def set_output(self):
 
 
 class TestTriangularSolveAPI(unittest.TestCase):
+
     def setUp(self):
         np.random.seed(2021)
         self.place = [paddle.CPUPlace()]
@@ -266,8 +267,10 @@ def check_static_result(self, place):
 
             exe = fluid.Executor(place)
             fetches = exe.run(fluid.default_main_program(),
-                              feed={"x": x_np,
-                                    "y": y_np},
+                              feed={
+                                  "x": x_np,
+                                  "y": y_np
+                              },
                               fetch_list=[z])
             self.assertTrue(np.allclose(fetches[0], z_np))
 
@@ -276,6 +279,7 @@ def test_static(self):
             self.check_static_result(place=place)
 
     def test_dygraph(self):
+
         def run(place):
             paddle.disable_static(place)
             x_np = np.random.random([3, 3]).astype(self.dtype)
@@ -295,16 +299,17 @@ def run(place):
 
 
 class TestTriangularSolveOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of solve_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
-            y1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.CPUPlace())
             self.assertRaises(TypeError, paddle.linalg.triangular_solve, x1, y1)
 
-            # The data type of input must be float32 or float64.        
+            # The data type of input must be float32 or float64.
             x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool")
             y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool")
             self.assertRaises(TypeError, paddle.linalg.triangular_solve, x2, y2)
diff --git a/python/paddle/fluid/tests/unittests/test_tril_indices_op.py b/python/paddle/fluid/tests/unittests/test_tril_indices_op.py
index 29b07a5fb8463..c3a85daeee9e1 100644
--- a/python/paddle/fluid/tests/unittests/test_tril_indices_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tril_indices_op.py
@@ -24,6 +24,7 @@
 
 
 class TestTrilIndicesOp(OpTest):
+
     def setUp(self):
         self.op_type = "tril_indices"
         self.inputs = {}
@@ -42,6 +43,7 @@ def init_config(self):
 
 
 class TestTrilIndicesOpCase1(TestTrilIndicesOp):
+
     def init_config(self):
         self.attrs = {'rows': 0, 'cols': 0, 'offset': 0}
         self.target = np.tril_indices(0, 0, 0)
@@ -49,6 +51,7 @@ def init_config(self):
 
 
 class TestTrilIndicesOpCase2(TestTrilIndicesOp):
+
     def init_config(self):
         self.attrs = {'rows': 4, 'cols': 4, 'offset': 2}
         self.target = np.tril_indices(self.attrs['rows'], self.attrs['offset'],
@@ -57,6 +60,7 @@ def init_config(self):
 
 
 class TestTrilIndicesAPICaseStatic(unittest.TestCase):
+
     def test_static(self):
         places = [
             paddle.CPUPlace(), paddle.fluid.CUDAPlace(0)
@@ -73,6 +77,7 @@ def test_static(self):
 
 
 class TestTrilIndicesAPICaseDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         places = [
             paddle.CPUPlace(), paddle.fluid.CUDAPlace(0)
@@ -89,7 +94,9 @@ def test_dygraph_eager(self):
 
 
 class TestTrilIndicesAPICaseError(unittest.TestCase):
+
     def test_case_error(self):
+
         def test_num_rows_type_check():
             out1 = paddle.tril_indices(1.0, 1, 2)
 
@@ -107,6 +114,7 @@ def test_num_offset_type_check():
 
 
 class TestTrilIndicesAPICaseDefault(unittest.TestCase):
+
     def test_default_CPU(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
index 00f6169fa3103..3ed9e51709881 100644
--- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
@@ -38,7 +38,8 @@ def setUp(self):
             'lower': True if self.real_op_type == 'tril' else False,
         }
         self.outputs = {
-            'Out': self.real_np_op(self.X, self.diagonal)
+            'Out':
+            self.real_np_op(self.X, self.diagonal)
             if self.diagonal else self.real_np_op(self.X)
         }
 
@@ -70,15 +71,17 @@ def case_generator(op_type, Xshape, diagonal, expected):
     }
 
     class FailureCase(unittest.TestCase):
+
         def test_failure(self):
             paddle.enable_static()
 
             data = fluid.data(shape=Xshape, dtype='float64', name=cls_name)
-            with self.assertRaisesRegexp(
-                    eval(expected.split(':')[-1]), errmsg[expected]):
+            with self.assertRaisesRegexp(eval(expected.split(':')[-1]),
+                                         errmsg[expected]):
                 getattr(tensor, op_type)(x=data, diagonal=diagonal)
 
     class SuccessCase(TrilTriuOpDefaultTest):
+
         def initTestCase(self):
             paddle.enable_static()
 
@@ -92,7 +95,7 @@ def initTestCase(self):
 
 
 ### NOTE: meaningful diagonal is [1 - min(H, W), max(H, W) -1]
-### test the diagonal just at the border, upper/lower the border, 
+### test the diagonal just at the border, upper/lower the border,
 ###     negative/positive integer within range and a zero
 cases = {
     'success': {
@@ -118,8 +121,9 @@ def initTestCase(self):
     for _expected, _params in cases.items():
         for _Xshape, _diaglist in _params.items():
             list(
-                map(lambda _diagonal: case_generator(_op_type, _Xshape, _diagonal, _expected),
-                    _diaglist))
+                map(
+                    lambda _diagonal: case_generator(
+                        _op_type, _Xshape, _diagonal, _expected), _diaglist))
 
 
 class TestTrilTriuOpAPI(unittest.TestCase):
@@ -144,7 +148,8 @@ def test_api(self):
                 tril_out, triu_out = exe.run(
                     fluid.default_main_program(),
                     feed={"x": data},
-                    fetch_list=[tril_out, triu_out], )
+                    fetch_list=[tril_out, triu_out],
+                )
                 self.assertTrue(np.allclose(tril_out, np.tril(data)))
                 self.assertTrue(np.allclose(triu_out, np.triu(data)))
 
diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
index 49699b8fafd03..717f1b6004909 100755
--- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
@@ -125,6 +125,7 @@ def trilinear_interp_np(input,
 
 
 class TestTrilinearInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -154,9 +155,10 @@ def setUp(self):
             out_h = self.out_h
             out_w = self.out_w
 
-        output_np = trilinear_interp_np(
-            input_np, out_d, out_h, out_w, self.out_size, self.actual_shape,
-            self.align_corners, self.align_mode, self.data_layout)
+        output_np = trilinear_interp_np(input_np, out_d, out_h, out_w,
+                                        self.out_size, self.actual_shape,
+                                        self.align_corners, self.align_mode,
+                                        self.data_layout)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -185,8 +187,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'trilinear'
@@ -201,6 +205,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase1(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 1, 7, 8, 9]
@@ -213,6 +218,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase2(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 9, 6, 8]
@@ -225,6 +231,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase3(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 2, 16, 8, 4]
@@ -237,6 +244,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase4(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [4, 1, 7, 8, 9]
@@ -250,6 +258,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase5(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 3, 9, 6, 8]
@@ -263,6 +272,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase6(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [1, 1, 16, 8, 4]
@@ -276,6 +286,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpSame(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [1, 1, 16, 8, 4]
@@ -288,6 +299,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpSameHW(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [1, 1, 16, 8, 4]
@@ -300,6 +312,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpActualShape(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 2, 16, 8, 4]
@@ -313,6 +326,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpDatalayout(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 4, 4, 4, 3]
@@ -327,14 +341,15 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
         self.init_test_case()
         self.op_type = "trilinear_interp"
         self.check_eager = True
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
+        input_np = np.random.randint(low=0, high=256,
+                                     size=self.input_shape).astype("uint8")
 
         if self.scale > 0:
             out_d = int(self.input_shape[2] * self.scale)
@@ -365,8 +380,9 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=core.CPUPlace(), atol=1, check_eager=self.check_eager)
+        self.check_output_with_place(place=core.CPUPlace(),
+                                     atol=1,
+                                     check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'trilinear'
@@ -380,6 +396,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 16, 8, 4]
@@ -392,6 +409,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [4, 1, 7, 8, 9]
@@ -405,24 +423,28 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 1
 
 
 class TestTrilinearInterpWithMethod2(TestTrilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = True
         self.align_mode = 0
 
 
 class TestTrilinearInterpScale1(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 9]
@@ -435,6 +457,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpScale2(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 9]
@@ -447,6 +470,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpScale3(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 9]
@@ -459,6 +483,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpZero(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 11]
@@ -471,6 +496,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -523,8 +549,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'trilinear'
@@ -540,6 +568,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestTrilinearInterp_attr_tensor_Case1(TestTrilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 2, 9, 6, 8]
@@ -554,6 +583,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestTrilinearInterp_attr_tensor_Case2(TestTrilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 8, 8, 4]
@@ -569,6 +599,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 8, 8, 4]
@@ -583,6 +614,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[2, 3, 6, 9, 4], dtype="float32")
         y = fluid.data(name="y", shape=[2, 6, 9, 4, 3], dtype="float32")
@@ -590,22 +622,31 @@ def test_case(self):
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[3], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
 
-        out1 = fluid.layers.resize_trilinear(
-            y, out_shape=[12, 18, 8], data_format='NDHWC')
+        out1 = fluid.layers.resize_trilinear(y,
+                                             out_shape=[12, 18, 8],
+                                             data_format='NDHWC')
         out2 = fluid.layers.resize_trilinear(x, out_shape=[12, dim, 8])
         out3 = fluid.layers.resize_trilinear(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_trilinear(
-            x, out_shape=[4, 4, 8], actual_shape=actual_size)
+        out4 = fluid.layers.resize_trilinear(x,
+                                             out_shape=[4, 4, 8],
+                                             actual_shape=actual_size)
         out5 = fluid.layers.resize_trilinear(x, scale=scale_tensor)
-        out6 = interpolate(
-            x, scale_factor=scale_tensor, mode='trilinear', data_format="NCDHW")
-        out7 = interpolate(
-            x, size=[4, 4, 8], mode='trilinear', data_format="NCDHW")
-        out8 = interpolate(
-            x, size=shape_tensor, mode='trilinear', data_format="NCDHW")
+        out6 = interpolate(x,
+                           scale_factor=scale_tensor,
+                           mode='trilinear',
+                           data_format="NCDHW")
+        out7 = interpolate(x,
+                           size=[4, 4, 8],
+                           mode='trilinear',
+                           data_format="NCDHW")
+        out8 = interpolate(x,
+                           size=shape_tensor,
+                           mode='trilinear',
+                           data_format="NCDHW")
 
         x_data = np.random.random((2, 3, 6, 9, 4)).astype("float32")
         dim_data = np.array([18]).astype("int32")
@@ -631,8 +672,11 @@ def test_case(self):
                           fetch_list=[out1, out2, out3, out4, out5],
                           return_numpy=True)
 
-        expect_res = trilinear_interp_np(
-            x_data, out_d=12, out_h=18, out_w=8, align_mode=1)
+        expect_res = trilinear_interp_np(x_data,
+                                         out_d=12,
+                                         out_h=18,
+                                         out_w=8,
+                                         align_mode=1)
         self.assertTrue(
             np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 4, 1))))
         for i in range(len(results) - 1):
@@ -640,13 +684,15 @@ def test_case(self):
 
 
 class TestTrilinearInterpOpException(unittest.TestCase):
+
     def test_exception(self):
         input = fluid.data(name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
 
         def attr_data_format():
             # for 5-D input, data_format only can be NCDHW or NDHWC
-            out = fluid.layers.resize_trilinear(
-                input, out_shape=[4, 8, 4], data_format='NHWC')
+            out = fluid.layers.resize_trilinear(input,
+                                                out_shape=[4, 8, 4],
+                                                data_format='NHWC')
 
         self.assertRaises(ValueError, attr_data_format)
 
diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py
index 6d072e3c377fe..f494767d8d077 100755
--- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py
@@ -139,6 +139,7 @@ def trilinear_interp_np(input,
 
 
 class TestTrilinearInterpOp(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -217,8 +218,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'trilinear'
@@ -233,6 +236,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase1(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 1, 7, 8, 9]
@@ -245,6 +249,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase2(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 9, 6, 8]
@@ -257,6 +262,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase3(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 2, 16, 8, 4]
@@ -269,6 +275,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase4(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [4, 1, 7, 8, 9]
@@ -282,6 +289,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase5(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 3, 9, 6, 8]
@@ -295,6 +303,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase6(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [1, 1, 16, 8, 4]
@@ -308,6 +317,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpSame(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [1, 1, 16, 8, 4]
@@ -320,6 +330,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpSameHW(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [1, 1, 16, 8, 4]
@@ -332,6 +343,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpActualShape(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 2, 16, 8, 4]
@@ -345,6 +357,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpDatalayout(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 4, 4, 4, 3]
@@ -359,6 +372,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpOpUint8(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -366,8 +380,8 @@ def setUp(self):
         self.op_type = "trilinear_interp_v2"
         # TODO(dev): add self.python_api
         self.check_eager = False
-        input_np = np.random.randint(
-            low=0, high=256, size=self.input_shape).astype("uint8")
+        input_np = np.random.randint(low=0, high=256,
+                                     size=self.input_shape).astype("uint8")
 
         if self.scale > 0:
             if isinstance(self.scale, float) or isinstance(self.scale, int):
@@ -411,8 +425,9 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output_with_place(
-            place=core.CPUPlace(), atol=1, check_eager=self.check_eager)
+        self.check_output_with_place(place=core.CPUPlace(),
+                                     atol=1,
+                                     check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'trilinear'
@@ -426,6 +441,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 16, 8, 4]
@@ -438,6 +454,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [4, 1, 7, 8, 9]
@@ -451,24 +468,28 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 1
 
 
 class TestTrilinearInterpWithMethod2(TestTrilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = False
         self.align_mode = 0
 
 
 class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp):
+
     def set_align_mode(self):
         self.align_corners = True
         self.align_mode = 0
 
 
 class TestTrilinearInterpScale1(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 9]
@@ -481,6 +502,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpScale2(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 9]
@@ -493,6 +515,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpScale3(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 9]
@@ -505,6 +528,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpZero(TestTrilinearInterpOp):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 5, 7, 11]
@@ -517,6 +541,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.out_size = None
         self.actual_shape = None
@@ -583,8 +608,10 @@ def test_check_output(self):
         self.check_output(check_eager=self.check_eager)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', in_place=True, check_eager=self.check_eager)
+        self.check_grad(['X'],
+                        'Out',
+                        in_place=True,
+                        check_eager=self.check_eager)
 
     def init_test_case(self):
         self.interp_method = 'trilinear'
@@ -600,6 +627,7 @@ def init_test_case(self):
 
 # out_size is a 1-D tensor
 class TestTrilinearInterp_attr_tensor_Case1(TestTrilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [3, 2, 9, 6, 8]
@@ -614,6 +642,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestTrilinearInterp_attr_tensor_Case2(TestTrilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 8, 8, 4]
@@ -629,6 +658,7 @@ def init_test_case(self):
 
 # scale is a 1-D tensor
 class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor):
+
     def init_test_case(self):
         self.interp_method = 'trilinear'
         self.input_shape = [2, 3, 8, 8, 4]
@@ -643,6 +673,7 @@ def init_test_case(self):
 
 
 class TestTrilinearInterpAPI(unittest.TestCase):
+
     def test_case(self):
         x = fluid.data(name="x", shape=[2, 3, 6, 9, 4], dtype="float32")
         y = fluid.data(name="y", shape=[2, 6, 9, 4, 3], dtype="float32")
@@ -650,22 +681,31 @@ def test_case(self):
         dim = fluid.data(name="dim", shape=[1], dtype="int32")
         shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32")
         actual_size = fluid.data(name="actual_size", shape=[3], dtype="int32")
-        scale_tensor = fluid.data(
-            name="scale_tensor", shape=[1], dtype="float32")
+        scale_tensor = fluid.data(name="scale_tensor",
+                                  shape=[1],
+                                  dtype="float32")
 
-        out1 = fluid.layers.resize_trilinear(
-            y, out_shape=[12, 18, 8], data_format='NDHWC')
+        out1 = fluid.layers.resize_trilinear(y,
+                                             out_shape=[12, 18, 8],
+                                             data_format='NDHWC')
         out2 = fluid.layers.resize_trilinear(x, out_shape=[12, dim, 8])
         out3 = fluid.layers.resize_trilinear(x, out_shape=shape_tensor)
-        out4 = fluid.layers.resize_trilinear(
-            x, out_shape=[4, 4, 8], actual_shape=actual_size)
+        out4 = fluid.layers.resize_trilinear(x,
+                                             out_shape=[4, 4, 8],
+                                             actual_shape=actual_size)
         out5 = fluid.layers.resize_trilinear(x, scale=scale_tensor)
-        out6 = interpolate(
-            x, scale_factor=scale_tensor, mode='trilinear', data_format="NCDHW")
-        out7 = interpolate(
-            x, size=[4, 4, 8], mode='trilinear', data_format="NCDHW")
-        out8 = interpolate(
-            x, size=shape_tensor, mode='trilinear', data_format="NCDHW")
+        out6 = interpolate(x,
+                           scale_factor=scale_tensor,
+                           mode='trilinear',
+                           data_format="NCDHW")
+        out7 = interpolate(x,
+                           size=[4, 4, 8],
+                           mode='trilinear',
+                           data_format="NCDHW")
+        out8 = interpolate(x,
+                           size=shape_tensor,
+                           mode='trilinear',
+                           data_format="NCDHW")
 
         x_data = np.random.random((2, 3, 6, 9, 4)).astype("float32")
         dim_data = np.array([18]).astype("int32")
@@ -691,8 +731,11 @@ def test_case(self):
                           fetch_list=[out1, out2, out3, out4, out5],
                           return_numpy=True)
 
-        expect_res = trilinear_interp_np(
-            x_data, out_d=12, out_h=18, out_w=8, align_mode=1)
+        expect_res = trilinear_interp_np(x_data,
+                                         out_d=12,
+                                         out_h=18,
+                                         out_w=8,
+                                         align_mode=1)
         self.assertTrue(
             np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 4, 1))))
         for i in range(len(results) - 1):
@@ -700,13 +743,15 @@ def test_case(self):
 
 
 class TestTrilinearInterpOpException(unittest.TestCase):
+
     def test_exception(self):
         input = fluid.data(name="input", shape=[2, 3, 6, 9, 4], dtype="float32")
 
         def attr_data_format():
             # for 5-D input, data_format only can be NCDHW or NDHWC
-            out = fluid.layers.resize_trilinear(
-                input, out_shape=[4, 8, 4], data_format='NHWC')
+            out = fluid.layers.resize_trilinear(input,
+                                                out_shape=[4, 8, 4],
+                                                data_format='NHWC')
 
         self.assertRaises(ValueError, attr_data_format)
 
diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py
index 1a6790728b137..56a39e5f6925c 100644
--- a/python/paddle/fluid/tests/unittests/test_trunc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py
@@ -27,6 +27,7 @@
 
 
 class TestTruncOp(OpTest):
+
     def setUp(self):
         self.op_type = "trunc"
         self.python_api = paddle.trunc
@@ -46,6 +47,7 @@ def test_check_grad(self):
 
 
 class TestFloatTruncOp(TestTruncOp):
+
     def init_dtype_type(self):
         self.dtype = np.float32
         self.__class__.exist_fp64_check_grad = True
@@ -55,6 +57,7 @@ def test_check_grad(self):
 
 
 class TestIntTruncOp(TestTruncOp):
+
     def init_dtype_type(self):
         self.dtype = np.int32
         self.__class__.exist_fp64_check_grad = True
@@ -64,6 +67,7 @@ def test_check_grad(self):
 
 
 class TestTruncAPI(unittest.TestCase):
+
     def setUp(self):
         self.shape = [20, 20]
         self.x = np.random.random((20, 20)).astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py
index fe28e0c9638b4..8016499d9ac73 100644
--- a/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py
@@ -27,6 +27,7 @@
 
 
 class TestTrunctedGaussianRandomOp(unittest.TestCase):
+
     def setUp(self):
         self.op_type = "truncated_gaussian_random"
         self.inputs = {}
@@ -52,8 +53,9 @@ def gaussian_random_test(self, place):
         program = fluid.Program()
         block = program.global_block()
         vout = block.create_var(name="Out")
-        op = block.append_op(
-            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+        op = block.append_op(type=self.op_type,
+                             outputs={"Out": vout},
+                             attrs=self.attrs)
 
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)
diff --git a/python/paddle/fluid/tests/unittests/test_unbind_op.py b/python/paddle/fluid/tests/unittests/test_unbind_op.py
index 43f2f3526ac0f..5f8fb382eb935 100644
--- a/python/paddle/fluid/tests/unittests/test_unbind_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unbind_op.py
@@ -25,6 +25,7 @@
 
 
 class TestUnbind(unittest.TestCase):
+
     def test_unbind(self):
 
         x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1')
@@ -34,8 +35,10 @@ def test_unbind(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
 
         [res_1, res_2] = exe.run(fluid.default_main_program(),
-                                 feed={"x_1": input_1,
-                                       "axis": 0},
+                                 feed={
+                                     "x_1": input_1,
+                                     "axis": 0
+                                 },
                                  fetch_list=[out_0, out_1])
 
         assert np.array_equal(res_1, input_1[0, 0:100])
@@ -62,6 +65,7 @@ def test_unbind_dygraph_final_state(self):
 
 
 class TestLayersUnbind(unittest.TestCase):
+
     def test_layers_unbind(self):
 
         x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1')
@@ -71,8 +75,10 @@ def test_layers_unbind(self):
         exe = fluid.Executor(place=fluid.CPUPlace())
 
         [res_1, res_2] = exe.run(fluid.default_main_program(),
-                                 feed={"x_1": input_1,
-                                       "axis": 0},
+                                 feed={
+                                     "x_1": input_1,
+                                     "axis": 0
+                                 },
                                  fetch_list=[out_0, out_1])
 
         assert np.array_equal(res_1, input_1[0, 0:100])
@@ -80,6 +86,7 @@ def test_layers_unbind(self):
 
 
 class TestUnbindOp(OpTest):
+
     def initParameters(self):
         pass
 
@@ -118,6 +125,7 @@ def test_check_grad(self):
 
 
 class TestUnbindOp1(TestUnbindOp):
+
     def initParameters(self):
         self.axis = 1
         self.num = 2
@@ -131,6 +139,7 @@ def outReshape(self):
 
 
 class TestUnbindOp2(TestUnbindOp):
+
     def initParameters(self):
         self.axis = 2
         self.num = 2
@@ -144,6 +153,7 @@ def outReshape(self):
 
 
 class TestUnbindOp3(TestUnbindOp):
+
     def initParameters(self):
         self.axis = 2
         self.num = 2
@@ -160,6 +170,7 @@ def outReshape(self):
 
 
 class TestUnbindOp4(TestUnbindOp):
+
     def initParameters(self):
         self.axis = 1
         self.num = 2
@@ -176,6 +187,7 @@ def outReshape(self):
 
 
 class TestUnbindBF16Op(OpTest):
+
     def setUp(self):
         self._set_op_type()
         self.python_api = paddle.unbind
@@ -203,6 +215,7 @@ def test_check_grad(self):
 
 
 class TestUnbindAxisError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x = fluid.data(shape=[2, 3], dtype='float32', name='x')
diff --git a/python/paddle/fluid/tests/unittests/test_unfold_op.py b/python/paddle/fluid/tests/unittests/test_unfold_op.py
index 7295cb8381600..c990b67f9a425 100644
--- a/python/paddle/fluid/tests/unittests/test_unfold_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unfold_op.py
@@ -52,8 +52,9 @@ def calc_unfold(self):
         dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1
         out_height = int((self.input_height + self.paddings[0] +
                           self.paddings[2] - dkernel_h) / self.strides[0]) + 1
-        out_width = int((self.input_width + self.paddings[1] + self.paddings[3]
-                         - dkernel_w) / self.strides[1]) + 1
+        out_width = int(
+            (self.input_width + self.paddings[1] + self.paddings[3] - dkernel_w)
+            / self.strides[1]) + 1
         output_shape[2] = out_height * out_width
         output = np.zeros(output_shape).astype(np.float64)
         ############ calculate output ##############
@@ -63,8 +64,8 @@ def calc_unfold(self):
                     h_out = int(k / out_width)
                     w_out = k % out_width
                     w_offset = j % self.kernel_sizes[1]
-                    h_offset = int(j /
-                                   self.kernel_sizes[1]) % self.kernel_sizes[0]
+                    h_offset = int(
+                        j / self.kernel_sizes[1]) % self.kernel_sizes[0]
                     c_in = int(j /
                                (self.kernel_sizes[0] * self.kernel_sizes[1]))
                     h_in = h_offset * self.dilations[0] + h_out * self.strides[
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py
index 5f4989f6c5dbd..27dda75a736ce 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py
@@ -24,6 +24,7 @@
 
 
 class TestUniformRandomOpBF16(OpTest):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.dtype = "uint16"
@@ -48,9 +49,8 @@ def verify_output(self, outs):
             result = np.array(outs[0])
 
         hist, prob = self.output_hist(result)
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
     def test_check_output(self):
         outs = self.calc_output(core.CPUPlace())
@@ -60,6 +60,7 @@ def test_check_output(self):
 
 
 class TestUniformRandomOpBF16AttrTensorList(TestUniformRandomOpBF16):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.new_shape = (1000, 784)
@@ -84,6 +85,7 @@ def init_attrs(self):
 
 class TestUniformRandomOpBF16AttrTensorInt32(
         TestUniformRandomOpBF16AttrTensorList):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.dtype = "uint16"
@@ -93,6 +95,7 @@ def setUp(self):
 
 
 class TestUniformRandomOpBF16WithDiagInit(TestUniformRandomOpBF16):
+
     def init_attrs(self):
         self.attrs = {
             "shape": [1000, 784],
@@ -108,6 +111,7 @@ def init_attrs(self):
 
 
 class TestUniformRandomOpBF16SelectedRows(unittest.TestCase):
+
     def test_check_output(self):
         self.check_with_place(core.CPUPlace())
 
@@ -115,57 +119,55 @@ def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[1000, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            dtype=int(core.VarDesc.VarType.BF16))
+        op = Operator("uniform_random",
+                      Out="X",
+                      shape=[1000, 784],
+                      min=-5.0,
+                      max=10.0,
+                      seed=10,
+                      dtype=int(core.VarDesc.VarType.BF16))
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         result = convert_uint16_to_float(np.array(out.get_tensor()))
         hist, prob = output_hist(result)
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOpBF16SelectedRowsWithDiagInit(
         TestUniformRandomOpBF16SelectedRows):
+
     def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[500, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            diag_num=500,
-            diag_step=784,
-            diag_val=1.0,
-            dtype=int(core.VarDesc.VarType.BF16))
+        op = Operator("uniform_random",
+                      Out="X",
+                      shape=[500, 784],
+                      min=-5.0,
+                      max=10.0,
+                      seed=10,
+                      diag_num=500,
+                      diag_step=784,
+                      diag_val=1.0,
+                      dtype=int(core.VarDesc.VarType.BF16))
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [500, 784])
         result = convert_uint16_to_float(np.array(out.get_tensor()))
         hist, prob = output_hist(result)
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOpBF16AttrTensorAPI(unittest.TestCase):
+
     def test_attr_tensor_API(self):
         startup_program = fluid.Program()
         train_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
             dim_tensor = fluid.layers.fill_constant([1], "int64", 3)
-            ret = fluid.layers.nn.uniform_random(
-                [1, dim_tensor, 2], dtype=np.uint16)
+            ret = fluid.layers.nn.uniform_random([1, dim_tensor, 2],
+                                                 dtype=np.uint16)
 
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -175,6 +177,7 @@ def test_attr_tensor_API(self):
 
 
 class TestUniformRandomOpAPISeed(unittest.TestCase):
+
     def test_attr_tensor_API(self):
         _seed = 10
         gen = paddle.seed(_seed)
@@ -184,10 +187,14 @@ def test_attr_tensor_API(self):
             _min = 5
             _max = 10
 
-            ret = fluid.layers.nn.uniform_random(
-                [2, 3, 2], min=_min, max=_max, seed=_seed)
-            ret_2 = fluid.layers.nn.uniform_random(
-                [2, 3, 2], min=_min, max=_max, seed=_seed)
+            ret = fluid.layers.nn.uniform_random([2, 3, 2],
+                                                 min=_min,
+                                                 max=_max,
+                                                 seed=_seed)
+            ret_2 = fluid.layers.nn.uniform_random([2, 3, 2],
+                                                   min=_min,
+                                                   max=_max,
+                                                   seed=_seed)
             res = fluid.layers.equal(ret, ret_2)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -201,6 +208,7 @@ def test_attr_tensor_API(self):
 
 
 class TestUniformRandomOpBF16SelectedRowsShapeTensor(unittest.TestCase):
+
     def test_check_output(self):
         place = core.CPUPlace()
         scope = core.Scope()
@@ -208,25 +216,24 @@ def test_check_output(self):
         shape_tensor = scope.var("Shape").get_tensor()
         shape_tensor.set(np.array([1000, 784]).astype("int64"), place)
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            ShapeTensor="Shape",
-            Out="X",
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            dtype=int(core.VarDesc.VarType.BF16))
+        op = Operator("uniform_random",
+                      ShapeTensor="Shape",
+                      Out="X",
+                      min=-5.0,
+                      max=10.0,
+                      seed=10,
+                      dtype=int(core.VarDesc.VarType.BF16))
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         result = convert_uint16_to_float(np.array(out.get_tensor()))
         hist, prob = output_hist(result)
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOpBF16SelectedRowsShapeTensorList(
         TestUniformRandomOpBF16SelectedRowsShapeTensor):
+
     def test_check_output(self):
         place = core.CPUPlace()
         scope = core.Scope()
@@ -236,24 +243,23 @@ def test_check_output(self):
         shape_2 = scope.var("shape2").get_tensor()
         shape_2.set(np.array([784]).astype("int64"), place)
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            ShapeTensorList=["shape1", "shape2"],
-            Out="X",
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            dtype=int(core.VarDesc.VarType.BF16))
+        op = Operator("uniform_random",
+                      ShapeTensorList=["shape1", "shape2"],
+                      Out="X",
+                      min=-5.0,
+                      max=10.0,
+                      seed=10,
+                      dtype=int(core.VarDesc.VarType.BF16))
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         result = convert_uint16_to_float(np.array(out.get_tensor()))
         hist, prob = output_hist(result)
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomBatchSizeLikeOpBF16API(unittest.TestCase):
+
     def test_attr_tensorlist_int32_API(self):
         startup_program = fluid.Program()
         train_program = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
index ec3aeb2423913..2e0196d4b1626 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
@@ -19,10 +19,12 @@
 
 
 class TestUniformRandomInplaceOpDtype(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
 
     def test_uniform_random_inplace_op_dtype(self):
+
         def test_fp32():
             tensor_fp32 = paddle.ones(self.shape, dtype=paddle.float32)
             tensor_fp32.uniform_()
@@ -43,6 +45,7 @@ def test_fp64():
 
 
 class TestUniformRandomInplaceOpIsInplace(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
 
@@ -53,6 +56,7 @@ def test_uniform_random_inplace_op_is_inplace(self):
 
 
 class TestUniformRandomInplaceOpSeedIsZero(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
         self.seed = 0
@@ -67,6 +71,7 @@ def test_uniform_random_inplace_op_seed_is_zero(self):
 
 
 class TestUniformRandomInplaceOpSeedIsNotZero(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
         self.seed = 10
@@ -81,6 +86,7 @@ def test_uniform_random_inplace_op_seed_is_not_zero(self):
 
 
 class TestUniformRandomInplaceOpWithinRange(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
         self.min = -2
@@ -91,11 +97,12 @@ def test_uniform_random_inplace_op_within_range(self):
         tensor = paddle.ones(self.shape)
         tensor.uniform_(min=self.min, max=self.max, seed=self.seed)
         tensor_data = tensor.numpy()
-        self.assertTrue((tensor_data > self.min).all() and
-                        (tensor_data < self.max).all())
+        self.assertTrue((tensor_data > self.min).all()
+                        and (tensor_data < self.max).all())
 
 
 class TestUniformRandomInplaceOpShape(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
 
@@ -108,6 +115,7 @@ def test_uniform_random_inplace_op_shape(self):
 
 
 class TestUniformRandomInplaceOpDistribution(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
         self.min = -3
@@ -126,10 +134,12 @@ def test_uniform_random_inplace_op_distribution(self):
 
 
 class TestUniformRandomInplaceOpError(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
 
     def test_uniform_random_inplace_op_error(self):
+
         def test_attr_error():
             tensor = paddle.ones(self.shape)
             tensor.uniform_(shape=self.shape, min=-2, max=2)
@@ -138,6 +148,7 @@ def test_attr_error():
 
 
 class TestUniformRandomInplaceOpEmptyTensor(unittest.TestCase):
+
     def test_uniform_random_inplace_op_empty_tensor(self):
         places = ['cpu']
         if fluid.core.is_compiled_with_cuda():
@@ -154,10 +165,13 @@ def test_uniform_random_inplace_op_empty_tensor(self):
 
 
 class TestUniformRandomInplaceGrad(unittest.TestCase):
+
     def setUp(self):
         self.shape = (1000, 784)
 
     def test_uniform_random_inplace_grad(self):
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
         def test_grad():
             tensor_a = paddle.ones(self.shape)
             tensor_a.stop_gradient = False
@@ -174,6 +188,7 @@ def test_grad():
         for place in places:
             paddle.set_device(place)
             test_grad()
+        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index 0bca3c08f3d78..d80fe3b2d4712 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -51,6 +51,7 @@ def output_hist_diag(out):
 
 
 class TestUniformRandomOp_attr_tensorlist(OpTest):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.python_api = paddle.uniform
@@ -72,18 +73,19 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestMaxMinAreInt(TestUniformRandomOp_attr_tensorlist):
+
     def init_attrs(self):
         self.attrs = {"min": -5, "max": 10, "seed": 10}
         self.output_hist = output_hist
 
 
 class TestUniformRandomOp_attr_tensorlist_int32(OpTest):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.python_api = paddle.uniform
@@ -105,12 +107,12 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOp_attr_tensor(OpTest):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.python_api = paddle.uniform
@@ -127,12 +129,12 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOp_attr_tensor_int32(OpTest):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.python_api = paddle.uniform
@@ -149,12 +151,12 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOp(OpTest):
+
     def setUp(self):
         self.op_type = "uniform_random"
         self.python_api = paddle.uniform
@@ -176,9 +178,8 @@ def test_check_output(self):
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
     def test_check_api(self):
         places = self._get_places()
@@ -194,14 +195,15 @@ def test_check_api_eager(self):
 
 
 class TestUniformRandomOpError(unittest.TestCase):
+
     def test_errors(self):
         main_prog = Program()
         start_prog = Program()
         with program_guard(main_prog, start_prog):
 
             def test_Variable():
-                x1 = fluid.create_lod_tensor(
-                    np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.zeros((4, 784)), [[1, 1, 1, 1]],
+                                             fluid.CPUPlace())
                 fluid.layers.uniform_random(x1)
 
             self.assertRaises(TypeError, test_Variable)
@@ -213,8 +215,9 @@ def test_Variable2():
             self.assertRaises(TypeError, test_Variable2)
 
             def test_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[4, 784], dtype='float32')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[4, 784],
+                                       dtype='float32')
                 fluid.layers.uniform_random(x2, 'int32')
 
             self.assertRaises(TypeError, test_dtype)
@@ -227,6 +230,7 @@ def test_out_dtype():
 
 
 class TestUniformRandomOpWithDiagInit(TestUniformRandomOp):
+
     def init_attrs(self):
         self.attrs = {
             "shape": [1000, 784],
@@ -241,6 +245,7 @@ def init_attrs(self):
 
 
 class TestUniformRandomOpSelectedRows(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -255,58 +260,55 @@ def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[1000, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10)
+        op = Operator("uniform_random",
+                      Out="X",
+                      shape=[1000, 784],
+                      min=-5.0,
+                      max=10.0,
+                      seed=10)
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOpSelectedRowsWithDiagInit(
         TestUniformRandomOpSelectedRows):
+
     def check_with_place(self, place):
         scope = core.Scope()
         out = scope.var("X").get_selected_rows()
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            Out="X",
-            shape=[500, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10,
-            diag_num=500,
-            diag_step=784,
-            diag_val=1.0)
+        op = Operator("uniform_random",
+                      Out="X",
+                      shape=[500, 784],
+                      min=-5.0,
+                      max=10.0,
+                      seed=10,
+                      diag_num=500,
+                      diag_step=784,
+                      diag_val=1.0)
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [500, 784])
         hist, prob = output_hist_diag(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOpApi(unittest.TestCase):
+
     def test_api(self):
         paddle.seed(10)
         x = fluid.layers.data('x', shape=[16], dtype='float32', lod_level=1)
         y = fluid.layers.fc(x,
                             size=16,
-                            param_attr=fluid.initializer.Uniform(
-                                low=-0.5,
-                                high=0.5,
-                                seed=10,
-                                diag_num=16,
-                                diag_step=16,
-                                diag_val=1.0))
+                            param_attr=fluid.initializer.Uniform(low=-0.5,
+                                                                 high=0.5,
+                                                                 seed=10,
+                                                                 diag_num=16,
+                                                                 diag_step=16,
+                                                                 diag_val=1.0))
 
         place = fluid.CPUPlace()
         x_tensor = fluid.create_lod_tensor(
@@ -317,6 +319,7 @@ def test_api(self):
 
 
 class TestUniformRandomOp_attr_tensor_API(unittest.TestCase):
+
     def test_attr_tensor_API(self):
         startup_program = fluid.Program()
         train_program = fluid.Program()
@@ -367,6 +370,7 @@ def test_attr_tensor_int32_API(self):
 
 
 class TestUniformRandomOp_API_seed(unittest.TestCase):
+
     def test_attr_tensor_API(self):
         _seed = 10
         gen = paddle.seed(_seed)
@@ -376,10 +380,14 @@ def test_attr_tensor_API(self):
             _min = 5
             _max = 10
 
-            ret = fluid.layers.nn.uniform_random(
-                [2, 3, 2], min=_min, max=_max, seed=_seed)
-            ret_2 = fluid.layers.nn.uniform_random(
-                [2, 3, 2], min=_min, max=_max, seed=_seed)
+            ret = fluid.layers.nn.uniform_random([2, 3, 2],
+                                                 min=_min,
+                                                 max=_max,
+                                                 seed=_seed)
+            ret_2 = fluid.layers.nn.uniform_random([2, 3, 2],
+                                                   min=_min,
+                                                   max=_max,
+                                                   seed=_seed)
             res = fluid.layers.equal(ret, ret_2)
             place = fluid.CPUPlace()
             if fluid.core.is_compiled_with_cuda():
@@ -395,6 +403,7 @@ def test_attr_tensor_API(self):
 
 
 class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -411,22 +420,21 @@ def check_with_place(self, place):
         shape_tensor = scope.var("Shape").get_tensor()
         shape_tensor.set(np.array([1000, 784]).astype("int64"), place)
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            ShapeTensor="Shape",
-            Out="X",
-            min=-5.0,
-            max=10.0,
-            seed=10)
+        op = Operator("uniform_random",
+                      ShapeTensor="Shape",
+                      Out="X",
+                      min=-5.0,
+                      max=10.0,
+                      seed=10)
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase):
+
     def get_places(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -445,60 +453,65 @@ def check_with_place(self, place):
         shape_2 = scope.var("shape2").get_tensor()
         shape_2.set(np.array([784]).astype("int64"), place)
         paddle.seed(10)
-        op = Operator(
-            "uniform_random",
-            ShapeTensorList=["shape1", "shape2"],
-            Out="X",
-            min=-5.0,
-            max=10.0,
-            seed=10)
+        op = Operator("uniform_random",
+                      ShapeTensorList=["shape1", "shape2"],
+                      Out="X",
+                      min=-5.0,
+                      max=10.0,
+                      seed=10)
         op.run(scope, place)
         self.assertEqual(out.get_tensor().shape(), [1000, 784])
         hist, prob = output_hist(np.array(out.get_tensor()))
-        self.assertTrue(
-            np.allclose(
-                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+        self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01),
+                        "hist: " + str(hist))
 
 
 class TestUniformRandomDygraphMode(unittest.TestCase):
+
     def test_check_output(self):
         with fluid.dygraph.guard():
-            x = fluid.layers.uniform_random(
-                [10], dtype="float32", min=0.0, max=1.0)
+            x = fluid.layers.uniform_random([10],
+                                            dtype="float32",
+                                            min=0.0,
+                                            max=1.0)
             x_np = x.numpy()
             for i in range(10):
                 self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0))
 
 
 class TestUniformRandomBatchSizeLikeOpError(unittest.TestCase):
+
     def test_errors(self):
         main_prog = Program()
         start_prog = Program()
         with program_guard(main_prog, start_prog):
 
             def test_Variable():
-                x1 = fluid.create_lod_tensor(
-                    np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.zeros(
+                    (100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace())
                 fluid.layers.uniform_random_batch_size_like(x1)
 
             self.assertRaises(TypeError, test_Variable)
 
             def test_shape():
-                x1 = fluid.layers.data(
-                    name='x2', shape=[100, 784], dtype='float32')
+                x1 = fluid.layers.data(name='x2',
+                                       shape=[100, 784],
+                                       dtype='float32')
                 fluid.layers.uniform_random_batch_size_like(x1, shape="shape")
 
             self.assertRaises(TypeError, test_shape)
 
             def test_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[100, 784], dtype='float32')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[100, 784],
+                                       dtype='float32')
                 fluid.layers.uniform_random_batch_size_like(x2, 'int32')
 
             self.assertRaises(TypeError, test_dtype)
 
 
 class TestUniformAlias(unittest.TestCase):
+
     def test_alias(self):
         paddle.uniform([2, 3], min=-5.0, max=5.0)
         paddle.tensor.uniform([2, 3], min=-5.0, max=5.0)
@@ -511,14 +524,15 @@ def test_uniform_random():
 
 
 class TestUniformOpError(unittest.TestCase):
+
     def test_errors(self):
         main_prog = Program()
         start_prog = Program()
         with program_guard(main_prog, start_prog):
 
             def test_Variable():
-                x1 = fluid.create_lod_tensor(
-                    np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace())
+                x1 = fluid.create_lod_tensor(np.zeros(
+                    (100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace())
                 paddle.tensor.random.uniform(x1)
 
             self.assertRaises(TypeError, test_Variable)
@@ -530,31 +544,36 @@ def test_Variable2():
             self.assertRaises(TypeError, test_Variable2)
 
             def test_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[100, 784], dtype='float32')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[100, 784],
+                                       dtype='float32')
                 paddle.tensor.random.uniform(x2, 'int32')
 
             self.assertRaises(TypeError, test_dtype)
 
             def test_out_dtype():
-                out = paddle.tensor.random.uniform(
-                    shape=[3, 4], dtype='float64')
+                out = paddle.tensor.random.uniform(shape=[3, 4],
+                                                   dtype='float64')
                 self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64)
 
             test_out_dtype()
 
 
 class TestUniformDygraphMode(unittest.TestCase):
+
     def test_check_output(self):
         with fluid.dygraph.guard():
-            x = paddle.tensor.random.uniform(
-                [10], dtype="float32", min=0.0, max=1.0)
+            x = paddle.tensor.random.uniform([10],
+                                             dtype="float32",
+                                             min=0.0,
+                                             max=1.0)
             x_np = x.numpy()
             for i in range(10):
                 self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0))
 
 
 class TestUniformDtype(unittest.TestCase):
+
     def test_default_dtype(self):
         paddle.disable_static()
 
@@ -581,6 +600,7 @@ def test_default_fp64():
 
 
 class TestRandomValue(unittest.TestCase):
+
     def test_fixed_random_number(self):
         # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
         if not paddle.is_compiled_with_cuda():
@@ -624,8 +644,8 @@ def test_fixed_random_number(self):
             30.089634, 77.05225, 3.1201615, 68.34072, 59.266724, -25.33281,
             12.973292, 27.41127, -17.412298, 27.931019
         ]
-        out = paddle.empty(
-            [16, 16, 16, 16], dtype='float32').uniform_(-50, 100).numpy()
+        out = paddle.empty([16, 16, 16, 16],
+                           dtype='float32').uniform_(-50, 100).numpy()
         self.assertEqual(np.mean(out), expect_mean)
         self.assertEqual(np.std(out), expect_std)
         self.assertTrue(np.allclose(out[10, 10, 10, 0:10], expect))
diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py
index 71dce5cc463cf..b70a342ab8272 100644
--- a/python/paddle/fluid/tests/unittests/test_unique.py
+++ b/python/paddle/fluid/tests/unittests/test_unique.py
@@ -25,6 +25,7 @@
 
 
 class TestUniqueOp(OpTest):
+
     def setUp(self):
         self.op_type = "unique"
         self.init_config()
@@ -33,29 +34,31 @@ def test_check_output(self):
         self.check_output()
 
     def init_config(self):
-        self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64'), }
+        self.inputs = {
+            'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64'),
+        }
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         self.outputs = {
-            'Out': np.array(
-                [2, 3, 1, 5], dtype='int64'),
-            'Index': np.array(
-                [0, 1, 1, 2, 3, 1], dtype='int32')
+            'Out': np.array([2, 3, 1, 5], dtype='int64'),
+            'Index': np.array([0, 1, 1, 2, 3, 1], dtype='int32')
         }
 
 
 class TestOne(TestUniqueOp):
+
     def init_config(self):
-        self.inputs = {'X': np.array([2], dtype='int64'), }
+        self.inputs = {
+            'X': np.array([2], dtype='int64'),
+        }
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         self.outputs = {
-            'Out': np.array(
-                [2], dtype='int64'),
-            'Index': np.array(
-                [0], dtype='int32')
+            'Out': np.array([2], dtype='int64'),
+            'Index': np.array([0], dtype='int32')
         }
 
 
 class TestRandom(TestUniqueOp):
+
     def init_config(self):
         self.inputs = {'X': np.random.randint(0, 100, (150, ), dtype='int64')}
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT64)}
@@ -72,7 +75,9 @@ def init_config(self):
 
 
 class TestUniqueRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.unique([10])
 
@@ -88,14 +93,15 @@ def test_dtype():
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestOneGPU(TestUniqueOp):
+
     def init_config(self):
-        self.inputs = {'X': np.array([2], dtype='int64'), }
+        self.inputs = {
+            'X': np.array([2], dtype='int64'),
+        }
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         self.outputs = {
-            'Out': np.array(
-                [2], dtype='int64'),
-            'Index': np.array(
-                [0], dtype='int32')
+            'Out': np.array([2], dtype='int64'),
+            'Index': np.array([0], dtype='int32')
         }
 
     def test_check_output(self):
@@ -107,6 +113,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestRandomGPU(TestUniqueOp):
+
     def init_config(self):
         self.inputs = {'X': np.random.randint(0, 100, (150, ), dtype='int64')}
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT64)}
@@ -128,14 +135,14 @@ def test_check_output(self):
 
 
 class TestSortedUniqueOp(TestUniqueOp):
+
     def init_config(self):
         self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64')}
-        unique, indices, inverse, count = np.unique(
-            self.inputs['X'],
-            return_index=True,
-            return_inverse=True,
-            return_counts=True,
-            axis=None)
+        unique, indices, inverse, count = np.unique(self.inputs['X'],
+                                                    return_index=True,
+                                                    return_inverse=True,
+                                                    return_counts=True,
+                                                    axis=None)
         self.attrs = {
             'dtype': int(core.VarDesc.VarType.INT32),
             "return_index": True,
@@ -153,14 +160,14 @@ def init_config(self):
 
 
 class TestUniqueOpAxisNone(TestUniqueOp):
+
     def init_config(self):
         self.inputs = {'X': np.random.random((4, 7, 10)).astype('float64')}
-        unique, indices, inverse, counts = np.unique(
-            self.inputs['X'],
-            return_index=True,
-            return_inverse=True,
-            return_counts=True,
-            axis=None)
+        unique, indices, inverse, counts = np.unique(self.inputs['X'],
+                                                     return_index=True,
+                                                     return_inverse=True,
+                                                     return_counts=True,
+                                                     axis=None)
         self.attrs = {
             'dtype': int(core.VarDesc.VarType.INT32),
             "return_index": True,
@@ -178,14 +185,14 @@ def init_config(self):
 
 
 class TestUniqueOpAxis1(TestUniqueOp):
+
     def init_config(self):
         self.inputs = {'X': np.random.random((3, 8, 8)).astype('float64')}
-        unique, indices, inverse, counts = np.unique(
-            self.inputs['X'],
-            return_index=True,
-            return_inverse=True,
-            return_counts=True,
-            axis=1)
+        unique, indices, inverse, counts = np.unique(self.inputs['X'],
+                                                     return_index=True,
+                                                     return_inverse=True,
+                                                     return_counts=True,
+                                                     axis=1)
         self.attrs = {
             'dtype': int(core.VarDesc.VarType.INT32),
             "return_index": True,
@@ -203,6 +210,7 @@ def init_config(self):
 
 
 class TestUniqueAPI(unittest.TestCase):
+
     def test_dygraph_api_out(self):
         paddle.disable_static()
         x_data = x_data = np.random.randint(0, 10, (120))
@@ -216,18 +224,16 @@ def test_dygraph_api_attr(self):
         paddle.disable_static()
         x_data = np.random.random((3, 5, 5)).astype("float32")
         x = paddle.to_tensor(x_data)
-        out, index, inverse, counts = paddle.unique(
-            x,
-            return_index=True,
-            return_inverse=True,
-            return_counts=True,
-            axis=0)
-        np_out, np_index, np_inverse, np_counts = np.unique(
-            x_data,
-            return_index=True,
-            return_inverse=True,
-            return_counts=True,
-            axis=0)
+        out, index, inverse, counts = paddle.unique(x,
+                                                    return_index=True,
+                                                    return_inverse=True,
+                                                    return_counts=True,
+                                                    axis=0)
+        np_out, np_index, np_inverse, np_counts = np.unique(x_data,
+                                                            return_index=True,
+                                                            return_inverse=True,
+                                                            return_counts=True,
+                                                            axis=0)
         self.assertTrue((out.numpy() == np_out).all(), True)
         self.assertTrue((index.numpy() == np_index).all(), True)
         self.assertTrue((inverse.numpy() == np_inverse).all(), True)
@@ -238,12 +244,11 @@ def test_dygraph_attr_dtype(self):
         paddle.disable_static()
         x_data = x_data = np.random.randint(0, 10, (120))
         x = paddle.to_tensor(x_data)
-        out, indices, inverse, counts = paddle.unique(
-            x,
-            return_index=True,
-            return_inverse=True,
-            return_counts=True,
-            dtype="int32")
+        out, indices, inverse, counts = paddle.unique(x,
+                                                      return_index=True,
+                                                      return_inverse=True,
+                                                      return_counts=True,
+                                                      dtype="int32")
         expected_out, np_indices, np_inverse, np_counts = np.unique(
             x_data, return_index=True, return_inverse=True, return_counts=True)
         self.assertTrue((out.numpy() == expected_out).all(), True)
@@ -262,22 +267,28 @@ def test_static_graph(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
             x = paddle.fluid.data(name='x', shape=[3, 2], dtype='float64')
-            unique, inverse, counts = paddle.unique(
-                x, return_inverse=True, return_counts=True, axis=0)
+            unique, inverse, counts = paddle.unique(x,
+                                                    return_inverse=True,
+                                                    return_counts=True,
+                                                    axis=0)
             place = paddle.CPUPlace()
             exe = paddle.static.Executor(place)
             x_np = np.array([[1, 2], [3, 4], [1, 2]]).astype('float64')
             result = exe.run(feed={"x": x_np},
                              fetch_list=[unique, inverse, counts])
-        np_unique, np_inverse, np_counts = np.unique(
-            x_np, return_inverse=True, return_counts=True, axis=0)
+        np_unique, np_inverse, np_counts = np.unique(x_np,
+                                                     return_inverse=True,
+                                                     return_counts=True,
+                                                     axis=0)
         self.assertTrue(np.allclose(result[0], np_unique))
         self.assertTrue(np.allclose(result[1], np_inverse))
         self.assertTrue(np.allclose(result[2], np_counts))
 
 
 class TestUniqueError(unittest.TestCase):
+
     def test_input_dtype(self):
+
         def test_x_dtype():
             with paddle.static.program_guard(paddle.static.Program(),
                                              paddle.static.Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py
index a12f1aaff4596..b4a4eac0ba74f 100644
--- a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py
@@ -85,9 +85,13 @@ def setUp(self):
                                               self.return_counts)
         out = reference_unique_consecutive(x)
         out = np.array(out).astype(self.dtype)
-        self.inputs = {'X': x, }
+        self.inputs = {
+            'X': x,
+        }
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
-        self.outputs = {'Out': out, }
+        self.outputs = {
+            'Out': out,
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -111,7 +115,9 @@ def setUp(self):
                                                        self.return_counts)
         result = np.array(result).astype(self.dtype)
         inverse = inverse.astype(self.dtype)
-        self.inputs = {'X': x, }
+        self.inputs = {
+            'X': x,
+        }
         self.attrs = {
             'return_inverse': self.return_inverse,
             'dtype': int(core.VarDesc.VarType.INT32)
@@ -137,7 +143,9 @@ def setUp(self):
                                                       self.return_counts)
         result = np.array(result).astype(self.dtype)
         counts = counts.astype(self.dtype)
-        self.inputs = {'X': x, }
+        self.inputs = {
+            'X': x,
+        }
         self.attrs = {
             'return_counts': self.return_counts,
             'dtype': int(core.VarDesc.VarType.INT32)
@@ -164,7 +172,9 @@ def setUp(self):
         result = np.array(result).astype(self.dtype)
         inverse = inverse.astype(self.dtype)
         counts = counts.astype(self.dtype)
-        self.inputs = {'X': x, }
+        self.inputs = {
+            'X': x,
+        }
         self.attrs = {
             'return_inverse': self.return_inverse,
             'return_counts': self.return_counts,
@@ -174,6 +184,7 @@ def setUp(self):
 
 
 class TestUniqueConsecutiveAPI(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -182,7 +193,9 @@ def setUp(self):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             paddle.enable_static()
-            input_x = fluid.data(name="input_x", shape=[100, ], dtype="float32")
+            input_x = fluid.data(name="input_x", shape=[
+                100,
+            ], dtype="float32")
             result = paddle.unique_consecutive(input_x)
             x_np = np.random.randint(20, size=100).astype("float32")
             exe = fluid.Executor(place)
@@ -203,6 +216,7 @@ def test_dygraph(self):
 
 
 class TestUniqueConsecutiveCase2API(unittest.TestCase):
+
     def setUp(self):
         self.places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
@@ -211,7 +225,9 @@ def setUp(self):
     def check_static_result(self, place):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             paddle.enable_static()
-            input_x = fluid.data(name="input_x", shape=[100, ], dtype="float32")
+            input_x = fluid.data(name="input_x", shape=[
+                100,
+            ], dtype="float32")
             result, inverse, counts = paddle.unique_consecutive(
                 input_x, return_inverse=True, return_counts=True)
             x_np = np.random.randint(20, size=100).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_unique_name.py b/python/paddle/fluid/tests/unittests/test_unique_name.py
index 4ffff252ee97d..b3adc6c3f5eeb 100644
--- a/python/paddle/fluid/tests/unittests/test_unique_name.py
+++ b/python/paddle/fluid/tests/unittests/test_unique_name.py
@@ -19,6 +19,7 @@
 
 
 class TestUniqueName(unittest.TestCase):
+
     def test_guard(self):
         with fluid.unique_name.guard():
             name_1 = fluid.unique_name.generate('')
@@ -46,6 +47,7 @@ def test_generate(self):
 
 
 class TestImperativeUniqueName(unittest.TestCase):
+
     def test_name_generator(self):
         with fluid.dygraph.guard():
             tracer = fluid.framework._dygraph_tracer()
diff --git a/python/paddle/fluid/tests/unittests/test_unique_with_counts.py b/python/paddle/fluid/tests/unittests/test_unique_with_counts.py
index 6b02a63633c7b..61669a0a5aab0 100644
--- a/python/paddle/fluid/tests/unittests/test_unique_with_counts.py
+++ b/python/paddle/fluid/tests/unittests/test_unique_with_counts.py
@@ -23,6 +23,7 @@
 
 
 class TestUniqueWithCountsOp(OpTest):
+
     def setUp(self):
         self.op_type = "unique_with_counts"
         self.init_config()
@@ -31,33 +32,33 @@ def test_check_output(self):
         self.check_output()
 
     def init_config(self):
-        self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64'), }
+        self.inputs = {
+            'X': np.array([2, 3, 3, 1, 5, 3], dtype='int64'),
+        }
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         self.outputs = {
-            'Out': np.array(
-                [2, 3, 1, 5], dtype='int64'),
-            'Index': np.array(
-                [0, 1, 1, 2, 3, 1], dtype='int32'),
-            'Count': np.array(
-                [1, 3, 1, 1], dtype='int32')
+            'Out': np.array([2, 3, 1, 5], dtype='int64'),
+            'Index': np.array([0, 1, 1, 2, 3, 1], dtype='int32'),
+            'Count': np.array([1, 3, 1, 1], dtype='int32')
         }
 
 
 class TestOne(TestUniqueWithCountsOp):
+
     def init_config(self):
-        self.inputs = {'X': np.array([2], dtype='int64'), }
+        self.inputs = {
+            'X': np.array([2], dtype='int64'),
+        }
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         self.outputs = {
-            'Out': np.array(
-                [2], dtype='int64'),
-            'Index': np.array(
-                [0], dtype='int32'),
-            'Count': np.array(
-                [1], dtype='int32')
+            'Out': np.array([2], dtype='int64'),
+            'Index': np.array([0], dtype='int32'),
+            'Count': np.array([1], dtype='int32')
         }
 
 
 class TestRandom(TestUniqueWithCountsOp):
+
     def init_config(self):
         input_data = np.random.randint(0, 100, (2000, ), dtype='int64')
         self.inputs = {'X': input_data}
@@ -82,7 +83,9 @@ def init_config(self):
 
 
 class TestUniqueWithCountsRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.unique_with_counts([10])
 
@@ -98,16 +101,16 @@ def test_dtype():
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestOneGPU(TestUniqueWithCountsOp):
+
     def init_config(self):
-        self.inputs = {'X': np.array([2], dtype='int64'), }
+        self.inputs = {
+            'X': np.array([2], dtype='int64'),
+        }
         self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)}
         self.outputs = {
-            'Out': np.array(
-                [2], dtype='int64'),
-            'Index': np.array(
-                [0], dtype='int32'),
-            'Count': np.array(
-                [1], dtype='int32')
+            'Out': np.array([2], dtype='int64'),
+            'Index': np.array([0], dtype='int32'),
+            'Count': np.array([1], dtype='int32')
         }
 
     def test_check_output(self):
@@ -119,6 +122,7 @@ def test_check_output(self):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestRandomGPU(TestUniqueWithCountsOp):
+
     def init_config(self):
         input_data = np.random.randint(0, 100, (2000, ), dtype='int64')
         self.inputs = {'X': input_data}
diff --git a/python/paddle/fluid/tests/unittests/test_unpool1d_op.py b/python/paddle/fluid/tests/unittests/test_unpool1d_op.py
index 95d19210acb72..30c58d3477cfe 100644
--- a/python/paddle/fluid/tests/unittests/test_unpool1d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unpool1d_op.py
@@ -28,8 +28,8 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size):
     input_size = x.shape
     default_size = []
     for d in range(len(kernel_size)):
-        default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d]
-                            + kernel_size[d] - 2 * padding[d])
+        default_size.append((input_size[-len(kernel_size) + d] - 1) *
+                            stride[d] + kernel_size[d] - 2 * padding[d])
     if output_size is None:
         ret = default_size
     else:
@@ -55,6 +55,7 @@ def unpool1dmax_forward_naive(input, indices, ksize, strides, paddings,
 
 
 class TestUnpool1DOpAPI_dygraph(unittest.TestCase):
+
     def test_case(self):
         places = [paddle.CPUPlace()]
         if paddle.fluid.core.is_compiled_with_cuda():
@@ -63,10 +64,14 @@ def test_case(self):
             paddle.disable_static()
             input_data = np.random.rand(1, 3, 16)
             input_x = paddle.to_tensor(input_data)
-            output, indices = F.max_pool1d(
-                input_x, kernel_size=2, stride=2, return_mask=True)
-            output_unpool = F.max_unpool1d(
-                output, indices, kernel_size=2, stride=2)
+            output, indices = F.max_pool1d(input_x,
+                                           kernel_size=2,
+                                           stride=2,
+                                           return_mask=True)
+            output_unpool = F.max_unpool1d(output,
+                                           indices,
+                                           kernel_size=2,
+                                           stride=2)
             expected_output_unpool = unpool1dmax_forward_naive(
                 output.numpy(), indices.numpy(), [2], [2], [0], [16])
             self.assertTrue(
@@ -76,6 +81,7 @@ def test_case(self):
 
 
 class TestUnpool1DOpAPI_dygraph2(unittest.TestCase):
+
     def test_case(self):
         places = [paddle.CPUPlace()]
         if paddle.fluid.core.is_compiled_with_cuda():
@@ -84,10 +90,14 @@ def test_case(self):
             paddle.disable_static()
             input_data = np.random.rand(1, 3, 16)
             input_x = paddle.to_tensor(input_data)
-            output, indices = F.max_pool1d(
-                input_x, kernel_size=2, stride=2, return_mask=True)
-            output_unpool = F.max_unpool1d(
-                output, indices, kernel_size=2, stride=None)
+            output, indices = F.max_pool1d(input_x,
+                                           kernel_size=2,
+                                           stride=2,
+                                           return_mask=True)
+            output_unpool = F.max_unpool1d(output,
+                                           indices,
+                                           kernel_size=2,
+                                           stride=None)
             expected_output_unpool = unpool1dmax_forward_naive(
                 output.numpy(), indices.numpy(), [2], [2], [0], [16])
             self.assertTrue(
@@ -97,6 +107,7 @@ def test_case(self):
 
 
 class TestUnpool1DOpAPI_dygraph3(unittest.TestCase):
+
     def test_case(self):
         places = [paddle.CPUPlace()]
         if paddle.fluid.core.is_compiled_with_cuda():
@@ -105,8 +116,9 @@ def test_case(self):
             paddle.disable_static()
             input_data = np.random.rand(1, 3, 16)
             input_x = paddle.to_tensor(input_data)
-            Pool1d = paddle.nn.MaxPool1D(
-                kernel_size=2, stride=2, return_mask=True)
+            Pool1d = paddle.nn.MaxPool1D(kernel_size=2,
+                                         stride=2,
+                                         return_mask=True)
             UnPool1d = paddle.nn.MaxUnPool1D(kernel_size=2, stride=2)
 
             output, indices = Pool1d(input_x)
@@ -120,6 +132,7 @@ def test_case(self):
 
 
 class TestUnpool1DOpAPI_static(unittest.TestCase):
+
     def test_case(self):
         paddle.enable_static()
         places = [paddle.CPUPlace()]
@@ -131,22 +144,27 @@ def test_case(self):
 
                 input_data = np.array([[[1, 2, 3, 4], [5, 6, 7, 8],
                                         [9, 10, 11, 12]]]).astype("float32")
-                x = paddle.fluid.data(
-                    name='x', shape=[1, 3, 4], dtype='float32')
-                output, indices = F.max_pool1d(
-                    x, kernel_size=2, stride=2, return_mask=True)
-                output_unpool = F.max_unpool1d(
-                    output, indices, kernel_size=2, stride=None)
+                x = paddle.fluid.data(name='x',
+                                      shape=[1, 3, 4],
+                                      dtype='float32')
+                output, indices = F.max_pool1d(x,
+                                               kernel_size=2,
+                                               stride=2,
+                                               return_mask=True)
+                output_unpool = F.max_unpool1d(output,
+                                               indices,
+                                               kernel_size=2,
+                                               stride=None)
 
                 exe = paddle.fluid.Executor(place)
                 fetches = exe.run(paddle.fluid.default_main_program(),
                                   feed={"x": input_data},
                                   fetch_list=[output_unpool],
                                   return_numpy=True)
-                pool1d_out_np = np.array(
-                    [[[2., 4.], [6., 8.], [10., 12.]]]).astype("float32")
-                indices_np = np.array(
-                    [[[1, 3], [1, 3], [1, 3]]]).astype("int32")
+                pool1d_out_np = np.array([[[2., 4.], [6., 8.],
+                                           [10., 12.]]]).astype("float32")
+                indices_np = np.array([[[1, 3], [1, 3], [1,
+                                                         3]]]).astype("int32")
                 expected_output_unpool = unpool1dmax_forward_naive(
                     pool1d_out_np, indices_np, [2], [2], [0], [4])
                 self.assertTrue(np.allclose(fetches[0], expected_output_unpool))
diff --git a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py
index e6031d9cee8b1..1fbff100a3db5 100644
--- a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py
@@ -28,8 +28,8 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size):
     input_size = x.shape
     default_size = []
     for d in range(len(kernel_size)):
-        default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d]
-                            + kernel_size[d] - 2 * padding[d])
+        default_size.append((input_size[-len(kernel_size) + d] - 1) *
+                            stride[d] + kernel_size[d] - 2 * padding[d])
     if output_size is None:
         ret = default_size
     else:
@@ -53,10 +53,10 @@ def unpool3dmax_forward_naive(input, indices, ksize, strides, paddings,
                     for w in range(s4):
                         index = indices[nidx, cidx, d, h, w]
                         didx = index // (out_wsize * out_hsize)
-                        hidx = (
-                            index - didx * out_hsize * out_wsize) // out_wsize
-                        widx = (
-                            index - didx * out_hsize * out_wsize) % out_wsize
+                        hidx = (index -
+                                didx * out_hsize * out_wsize) // out_wsize
+                        widx = (index -
+                                didx * out_hsize * out_wsize) % out_wsize
                         out[nidx, cidx, didx, hidx, widx] = \
                                 input[nidx, cidx, d, h, w]
 
@@ -64,6 +64,7 @@ def unpool3dmax_forward_naive(input, indices, ksize, strides, paddings,
 
 
 class TestUnpool3DOp(OpTest):
+
     def setUp(self):
         self.op_type = "unpool3d"
         self.init_test_case()
@@ -72,8 +73,9 @@ def setUp(self):
         self.output_size = _unpool_output_size(inputs, self.ksize, self.strides,
                                                self.paddings, self.output_size)
         indices = np.random.permutation(
-            np.arange(0, self.output_size[0] * self.output_size[1] *
-                      self.output_size[2]))[:dsize * hsize * wsize]
+            np.arange(
+                0, self.output_size[0] * self.output_size[1] *
+                self.output_size[2]))[:dsize * hsize * wsize]
         indices = np.reshape(indices, [dsize, hsize, wsize])
         idx_list = []
         for n in range(nsize):
@@ -116,6 +118,7 @@ def init_test_case(self):
 
 
 class TestUnpool3DOpcase1(TestUnpool3DOp):
+
     def init_test_case(self):
         self.unpool3d_forward_naive = unpool3dmax_forward_naive
         self.unpooling_type = "max"
@@ -127,6 +130,7 @@ def init_test_case(self):
 
 
 class TestUnpool3DOpOutput(TestUnpool3DOp):
+
     def init_test_case(self):
         self.unpool3d_forward_naive = unpool3dmax_forward_naive
         self.unpooling_type = "max"
@@ -138,47 +142,50 @@ def init_test_case(self):
 
 
 class TestUnpool3DOpException(unittest.TestCase):
+
     def test_exception(self):
+
         def indices_size_error():
             data = paddle.randint(shape=[1, 1, 3, 3, 3])
-            indices = paddle.reshape(
-                paddle.arange(0, 36), shape=[1, 1, 3, 3, 4])
+            indices = paddle.reshape(paddle.arange(0, 36),
+                                     shape=[1, 1, 3, 3, 4])
             MaxUnPool3D = F.maxunpool3d(data, indices, kernel_size=2, stride=2)
 
         def indices_value_error():
             data = paddle.randint(shape=[1, 1, 3, 3, 3])
-            indices = paddle.reshape(
-                paddle.arange(4, 40), shape=[1, 1, 3, 3, 3])
+            indices = paddle.reshape(paddle.arange(4, 40),
+                                     shape=[1, 1, 3, 3, 3])
             MaxUnPool3D = F.maxunpool3d(data, indices, kernel_size=2, stride=2)
 
         def data_format_error():
             data = paddle.randint(shape=[1, 1, 3, 3, 3])
-            indices = paddle.reshape(
-                paddle.arange(0, 27), shape=[1, 1, 3, 3, 3])
-            MaxUnPool3D = F.maxunpool3d(
-                data, indices, kernel_size=2, stride=2, data_format="NDHWC")
+            indices = paddle.reshape(paddle.arange(0, 27),
+                                     shape=[1, 1, 3, 3, 3])
+            MaxUnPool3D = F.maxunpool3d(data,
+                                        indices,
+                                        kernel_size=2,
+                                        stride=2,
+                                        data_format="NDHWC")
 
         def data_outputsize_error():
             data = paddle.randint(shape=[1, 1, 3, 3, 3])
-            indices = paddle.reshape(
-                paddle.arange(0, 27), shape=[1, 1, 3, 3, 3])
-            MaxUnPool3D = F.maxunpool3d(
-                data,
-                indices,
-                kernel_size=2,
-                stride=2,
-                output_size=[2, 2, 3, 4, 5])
+            indices = paddle.reshape(paddle.arange(0, 27),
+                                     shape=[1, 1, 3, 3, 3])
+            MaxUnPool3D = F.maxunpool3d(data,
+                                        indices,
+                                        kernel_size=2,
+                                        stride=2,
+                                        output_size=[2, 2, 3, 4, 5])
 
         def data_outputsize_error2():
             data = paddle.randint(shape=[1, 1, 3, 3, 3])
-            indices = paddle.reshape(
-                paddle.arange(0, 27), shape=[1, 1, 3, 3, 3])
-            MaxUnPool3D = F.maxunpool3d(
-                data,
-                indices,
-                kernel_size=2,
-                stride=2,
-                output_size=[10, 10, 10])
+            indices = paddle.reshape(paddle.arange(0, 27),
+                                     shape=[1, 1, 3, 3, 3])
+            MaxUnPool3D = F.maxunpool3d(data,
+                                        indices,
+                                        kernel_size=2,
+                                        stride=2,
+                                        output_size=[10, 10, 10])
 
         self.assertRaises(ValueError, indices_size_error)
         self.assertRaises(ValueError, indices_value_error)
@@ -188,6 +195,7 @@ def data_outputsize_error2():
 
 
 class TestUnpool3DOpAPI_dygraph(unittest.TestCase):
+
     def test_case(self):
         places = [paddle.CPUPlace()]
         if paddle.fluid.core.is_compiled_with_cuda():
@@ -196,13 +204,17 @@ def test_case(self):
             paddle.disable_static()
             input_data = np.random.rand(1, 3, 4, 4, 6)
             input_x = paddle.to_tensor(input_data)
-            output, indices = F.max_pool3d(
-                input_x, kernel_size=2, stride=2, return_mask=True)
-            output_unpool = F.max_unpool3d(
-                output, indices, kernel_size=2, stride=2)
+            output, indices = F.max_pool3d(input_x,
+                                           kernel_size=2,
+                                           stride=2,
+                                           return_mask=True)
+            output_unpool = F.max_unpool3d(output,
+                                           indices,
+                                           kernel_size=2,
+                                           stride=2)
             expected_output_unpool = unpool3dmax_forward_naive(
-                output.numpy(),
-                indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6])
+                output.numpy(), indices.numpy(), [2, 2, 2], [2, 2, 2],
+                [0, 0, 0], [4, 4, 6])
             self.assertTrue(
                 np.allclose(output_unpool.numpy(), expected_output_unpool))
 
@@ -210,6 +222,7 @@ def test_case(self):
 
 
 class TestUnpool3DOpAPI_dygraph2(unittest.TestCase):
+
     def test_case(self):
         places = [paddle.CPUPlace()]
         if paddle.fluid.core.is_compiled_with_cuda():
@@ -218,13 +231,17 @@ def test_case(self):
             paddle.disable_static()
             input_data = np.random.rand(1, 3, 4, 4, 6)
             input_x = paddle.to_tensor(input_data)
-            output, indices = F.max_pool3d(
-                input_x, kernel_size=2, stride=2, return_mask=True)
-            output_unpool = F.max_unpool3d(
-                output, indices, kernel_size=2, stride=None)
+            output, indices = F.max_pool3d(input_x,
+                                           kernel_size=2,
+                                           stride=2,
+                                           return_mask=True)
+            output_unpool = F.max_unpool3d(output,
+                                           indices,
+                                           kernel_size=2,
+                                           stride=None)
             expected_output_unpool = unpool3dmax_forward_naive(
-                output.numpy(),
-                indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6])
+                output.numpy(), indices.numpy(), [2, 2, 2], [2, 2, 2],
+                [0, 0, 0], [4, 4, 6])
             self.assertTrue(
                 np.allclose(output_unpool.numpy(), expected_output_unpool))
 
@@ -232,6 +249,7 @@ def test_case(self):
 
 
 class TestUnpool3DOpAPI_dygraph3(unittest.TestCase):
+
     def test_case(self):
         places = [paddle.CPUPlace()]
         if paddle.fluid.core.is_compiled_with_cuda():
@@ -240,15 +258,16 @@ def test_case(self):
             paddle.disable_static()
             input_data = np.random.rand(1, 3, 4, 4, 6)
             input_x = paddle.to_tensor(input_data)
-            Pool3d = paddle.nn.MaxPool3D(
-                kernel_size=2, stride=2, return_mask=True)
+            Pool3d = paddle.nn.MaxPool3D(kernel_size=2,
+                                         stride=2,
+                                         return_mask=True)
             UnPool3d = paddle.nn.MaxUnPool3D(kernel_size=2, stride=2)
 
             output, indices = Pool3d(input_x)
             output_unpool = UnPool3d(output, indices)
             expected_output_unpool = unpool3dmax_forward_naive(
-                output.numpy(),
-                indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6])
+                output.numpy(), indices.numpy(), [2, 2, 2], [2, 2, 2],
+                [0, 0, 0], [4, 4, 6])
             self.assertTrue(
                 np.allclose(output_unpool.numpy(), expected_output_unpool))
 
@@ -256,6 +275,7 @@ def test_case(self):
 
 
 class TestUnpool3DOpAPI_static(unittest.TestCase):
+
     def test_case(self):
         paddle.enable_static()
         places = [paddle.CPUPlace()]
@@ -268,20 +288,25 @@ def test_case(self):
                 input_data = np.array([[[[[1, 2, 3, 4], [5, 6, 7, 8], \
                     [9, 10, 11, 12], [13, 14, 15, 16]], [[1, 2, 3, 4], [5, 6, 7, 8], \
                     [9, 10, 11, 12], [13, 14, 15, 16]]]]]).astype("float32")
-                x = paddle.fluid.data(
-                    name='x', shape=[1, 1, 2, 4, 4], dtype='float32')
-                output, indices = F.max_pool3d(
-                    x, kernel_size=2, stride=2, return_mask=True)
-                output_unpool = F.max_unpool3d(
-                    output, indices, kernel_size=2, stride=None)
+                x = paddle.fluid.data(name='x',
+                                      shape=[1, 1, 2, 4, 4],
+                                      dtype='float32')
+                output, indices = F.max_pool3d(x,
+                                               kernel_size=2,
+                                               stride=2,
+                                               return_mask=True)
+                output_unpool = F.max_unpool3d(output,
+                                               indices,
+                                               kernel_size=2,
+                                               stride=None)
 
                 exe = paddle.fluid.Executor(place)
                 fetches = exe.run(paddle.fluid.default_main_program(),
                                   feed={"x": input_data},
                                   fetch_list=[output_unpool],
                                   return_numpy=True)
-                pool3d_out_np = np.array(
-                    [[[[[6., 8.], [14., 16.]]]]]).astype("float32")
+                pool3d_out_np = np.array([[[[[6., 8.],
+                                             [14., 16.]]]]]).astype("float32")
                 indices_np = np.array([[[[[5, 7], [13, 15]]]]]).astype("int32")
                 expected_output_unpool = unpool3dmax_forward_naive(
                     pool3d_out_np, indices_np, [2, 2, 2], [2, 2, 2], [0, 0, 0],
diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py
index 95ad254a6dfb0..1b6d3d9dfb732 100644
--- a/python/paddle/fluid/tests/unittests/test_unpool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py
@@ -23,8 +23,8 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size):
     input_size = x.shape
     default_size = []
     for d in range(len(kernel_size)):
-        default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d]
-                            + kernel_size[d] - 2 * padding[d])
+        default_size.append((input_size[-len(kernel_size) + d] - 1) *
+                            stride[d] + kernel_size[d] - 2 * padding[d])
     if output_size is None:
         ret = default_size
     else:
@@ -54,6 +54,7 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings,
 
 
 class TestUnpoolOp(OpTest):
+
     def setUp(self):
         self.op_type = "unpool"
         self.init_test_case()
@@ -106,6 +107,7 @@ def init_test_case(self):
 
 
 class TestUnpoolOpcase1(TestUnpoolOp):
+
     def init_test_case(self):
         self.unpool2d_forward_naive = unpool2dmax_forward_naive
         self.unpooling_type = "max"
@@ -117,6 +119,7 @@ def init_test_case(self):
 
 
 class TestUnpoolOpOutputsize(TestUnpoolOp):
+
     def init_test_case(self):
         self.unpool2d_forward_naive = unpool2dmax_forward_naive
         self.unpooling_type = "max"
@@ -128,6 +131,7 @@ def init_test_case(self):
 
 
 class TestUnpoolOpOutput(TestUnpoolOp):
+
     def init_test_case(self):
         self.unpool2d_forward_naive = unpool2dmax_forward_naive
         self.unpooling_type = "max"
@@ -139,6 +143,7 @@ def init_test_case(self):
 
 
 class TestUnpoolOpException(unittest.TestCase):
+
     def test_exception(self):
         import paddle.nn.functional as F
         import paddle
@@ -156,24 +161,29 @@ def indices_value_error():
         def data_format_error():
             data = paddle.randint(shape=[1, 1, 3, 3])
             indices = paddle.reshape(paddle.arange(4, 40), shape[1, 1, 3, 4])
-            MaxPool2D = F.maxunpool2d(
-                data, indices, kernel_size=2, stride=2, data_format="NHWC")
+            MaxPool2D = F.maxunpool2d(data,
+                                      indices,
+                                      kernel_size=2,
+                                      stride=2,
+                                      data_format="NHWC")
 
         def data_outputsize_error():
             data = paddle.randint(shape=[1, 1, 3, 3])
             indices = paddle.reshape(paddle.arange(4, 40), shape[1, 1, 3, 4])
-            MaxPool2D = F.maxunpool2d(
-                data,
-                indices,
-                kernel_size=2,
-                stride=2,
-                output_size=[5, 6, 7, 8])
+            MaxPool2D = F.maxunpool2d(data,
+                                      indices,
+                                      kernel_size=2,
+                                      stride=2,
+                                      output_size=[5, 6, 7, 8])
 
         def data_outputsize_error2():
             data = paddle.randint(shape=[1, 1, 3, 3])
             indices = paddle.reshape(paddle.arange(4, 40), shape[1, 1, 3, 4])
-            MaxPool2D = F.maxunpool2d(
-                data, indices, kernel_size=2, stride=2, output_size=[100, 100])
+            MaxPool2D = F.maxunpool2d(data,
+                                      indices,
+                                      kernel_size=2,
+                                      stride=2,
+                                      output_size=[100, 100])
 
         self.assertRaises(ValueError, indices_size_error)
         self.assertRaises(ValueError, indices_value_error)
@@ -183,6 +193,7 @@ def data_outputsize_error2():
 
 
 class TestUnpoolOpAPI_dy(unittest.TestCase):
+
     def test_case(self):
         import paddle
         import paddle.nn.functional as F
@@ -195,14 +206,19 @@ def test_case(self):
         else:
             place = core.CPUPlace()
         with fluid.dygraph.guard(place):
-            input_data = np.array([[[[1, 2, 3, 4], [5, 6, 7, 8],
-                                     [9, 10, 11, 12],
+            input_data = np.array([[[[1, 2, 3, 4], [5, 6, 7,
+                                                    8], [9, 10, 11, 12],
                                      [13, 14, 15, 16]]]]).astype("float32")
             input_x = paddle.to_tensor(input_data)
-            output, indices = F.max_pool2d(
-                input_x, kernel_size=2, stride=2, return_mask=True)
-            out_pp = F.max_unpool2d(
-                output, indices, kernel_size=2, stride=2, output_size=(5, 5))
+            output, indices = F.max_pool2d(input_x,
+                                           kernel_size=2,
+                                           stride=2,
+                                           return_mask=True)
+            out_pp = F.max_unpool2d(output,
+                                    indices,
+                                    kernel_size=2,
+                                    stride=2,
+                                    output_size=(5, 5))
             output_np = output.numpy()
             indices_np = indices.numpy()
             expect_res =unpool2dmax_forward_naive(output_np, indices_np, [2,2], \
@@ -211,6 +227,7 @@ def test_case(self):
 
 
 class TestUnpoolOpAPI_dy2(unittest.TestCase):
+
     def test_case(self):
         import paddle
         import paddle.nn.functional as F
@@ -223,14 +240,19 @@ def test_case(self):
         else:
             place = core.CPUPlace()
         with fluid.dygraph.guard(place):
-            input_data = np.array([[[[1, 2, 3, 4], [5, 6, 7, 8],
-                                     [9, 10, 11, 12],
+            input_data = np.array([[[[1, 2, 3, 4], [5, 6, 7,
+                                                    8], [9, 10, 11, 12],
                                      [13, 14, 15, 16]]]]).astype("float32")
             input_x = paddle.to_tensor(input_data)
-            output, indices = F.max_pool2d(
-                input_x, kernel_size=2, stride=2, return_mask=True)
-            out_pp = F.max_unpool2d(
-                output, indices, kernel_size=2, stride=None, output_size=(5, 5))
+            output, indices = F.max_pool2d(input_x,
+                                           kernel_size=2,
+                                           stride=2,
+                                           return_mask=True)
+            out_pp = F.max_unpool2d(output,
+                                    indices,
+                                    kernel_size=2,
+                                    stride=None,
+                                    output_size=(5, 5))
             output_np = output.numpy()
             indices_np = indices.numpy()
             expect_res =unpool2dmax_forward_naive(output_np, indices_np, [2,2], \
@@ -239,6 +261,7 @@ def test_case(self):
 
 
 class TestUnpoolOpAPI_dy3(unittest.TestCase):
+
     def test_case(self):
         import paddle
         import paddle.nn.functional as F
@@ -251,12 +274,13 @@ def test_case(self):
         else:
             place = core.CPUPlace()
         with fluid.dygraph.guard(place):
-            input_data = np.array([[[[1, 2, 3, 4], [5, 6, 7, 8],
-                                     [9, 10, 11, 12],
+            input_data = np.array([[[[1, 2, 3, 4], [5, 6, 7,
+                                                    8], [9, 10, 11, 12],
                                      [13, 14, 15, 16]]]]).astype("float32")
             input_x = paddle.to_tensor(input_data)
-            Pool2d = paddle.nn.MaxPool2D(
-                kernel_size=2, stride=2, return_mask=True)
+            Pool2d = paddle.nn.MaxPool2D(kernel_size=2,
+                                         stride=2,
+                                         return_mask=True)
             UnPool = paddle.nn.MaxUnPool2D(kernel_size=2, stride=2)
 
             output, indices = Pool2d(input_x)
@@ -269,6 +293,7 @@ def test_case(self):
 
 
 class TestUnpoolOpAPI_st(unittest.TestCase):
+
     def test_case(self):
         import paddle
         import paddle.nn.functional as F
@@ -280,10 +305,15 @@ def test_case(self):
                                  [13, 14, 15, 16]]]]).astype("float32")
 
         x = fluid.data(name="x", shape=[1, 1, 4, 4], dtype="float32")
-        output, indices = F.max_pool2d(
-            x, kernel_size=2, stride=2, return_mask=True)
-        unpool_out = F.max_unpool2d(
-            output, indices, kernel_size=2, stride=None, output_size=(5, 5))
+        output, indices = F.max_pool2d(x,
+                                       kernel_size=2,
+                                       stride=2,
+                                       return_mask=True)
+        unpool_out = F.max_unpool2d(output,
+                                    indices,
+                                    kernel_size=2,
+                                    stride=None,
+                                    output_size=(5, 5))
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
         else:
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
index af9d3db629581..c80555a66d08b 100755
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
@@ -26,6 +26,7 @@
 
 # Correct: General.
 class TestUnsqueezeOp(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "unsqueeze2"
@@ -55,6 +56,7 @@ def init_attrs(self):
 
 # Correct: Single input index.
 class TestUnsqueezeOp1(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -63,6 +65,7 @@ def init_test_case(self):
 
 # Correct: Mixed input axis.
 class TestUnsqueezeOp2(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -71,6 +74,7 @@ def init_test_case(self):
 
 # Correct: There is duplicated axis.
 class TestUnsqueezeOp3(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -79,6 +83,7 @@ def init_test_case(self):
 
 # Correct: Reversed axes.
 class TestUnsqueezeOp4(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
@@ -87,6 +92,7 @@ def init_test_case(self):
 
 # axes is a list(with tensor)
 class TestUnsqueezeOp_AxesTensorList(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "unsqueeze2"
@@ -124,6 +130,7 @@ def init_attrs(self):
 
 
 class TestUnsqueezeOp1_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -131,6 +138,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp2_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -138,6 +146,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp3_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -145,6 +154,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp4_AxesTensorList(TestUnsqueezeOp_AxesTensorList):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
@@ -153,6 +163,7 @@ def init_test_case(self):
 
 # axes is a Tensor
 class TestUnsqueezeOp_AxesTensor(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "unsqueeze2"
@@ -185,6 +196,7 @@ def init_attrs(self):
 
 
 class TestUnsqueezeOp1_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -192,6 +204,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp2_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -199,6 +212,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp3_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -206,6 +220,7 @@ def init_test_case(self):
 
 
 class TestUnsqueezeOp4_AxesTensor(TestUnsqueezeOp_AxesTensor):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
@@ -214,6 +229,7 @@ def init_test_case(self):
 
 # test api
 class TestUnsqueezeAPI(unittest.TestCase):
+
     def setUp(self):
         self.executed_api()
 
@@ -225,10 +241,12 @@ def test_api(self):
         x = paddle.static.data(name='x', shape=[3, 2, 5], dtype="float64")
         positive_3_int32 = fluid.layers.fill_constant([1], "int32", 3)
         positive_1_int64 = fluid.layers.fill_constant([1], "int64", 1)
-        axes_tensor_int32 = paddle.static.data(
-            name='axes_tensor_int32', shape=[3], dtype="int32")
-        axes_tensor_int64 = paddle.static.data(
-            name='axes_tensor_int64', shape=[3], dtype="int64")
+        axes_tensor_int32 = paddle.static.data(name='axes_tensor_int32',
+                                               shape=[3],
+                                               dtype="int32")
+        axes_tensor_int64 = paddle.static.data(name='axes_tensor_int64',
+                                               shape=[3],
+                                               dtype="int64")
 
         out_1 = self.unsqueeze(x, axis=[3, 1, 1])
         out_2 = self.unsqueeze(x, axis=[positive_3_int32, positive_1_int64, 1])
@@ -253,6 +271,7 @@ def test_api(self):
         assert np.array_equal(res_5, input.reshape([3, 1, 1, 2, 5, 1]))
 
     def test_error(self):
+
         def test_axes_type():
             x2 = paddle.static.data(name="x2", shape=[2, 25], dtype="int32")
             self.unsqueeze(x2, axis=2.1)
@@ -261,6 +280,7 @@ def test_axes_type():
 
 
 class TestUnsqueezeInplaceAPI(TestUnsqueezeAPI):
+
     def executed_api(self):
         self.unsqueeze = paddle.unsqueeze_
 
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index c1ec95fc8bfb8..fb250bc64b24d 100755
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -27,6 +27,7 @@
 
 # Correct: General.
 class TestUnsqueezeOp(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "unsqueeze"
@@ -50,6 +51,7 @@ def init_attrs(self):
 
 
 class TestUnsqueezeBF16Op(OpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "unsqueeze"
@@ -77,6 +79,7 @@ def init_attrs(self):
 
 # Correct: Single input index.
 class TestUnsqueezeOp1(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -85,6 +88,7 @@ def init_test_case(self):
 
 # Correct: Mixed input axis.
 class TestUnsqueezeOp2(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -93,6 +97,7 @@ def init_test_case(self):
 
 # Correct: There is duplicated axis.
 class TestUnsqueezeOp3(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -101,6 +106,7 @@ def init_test_case(self):
 
 # Correct: Reversed axes.
 class TestUnsqueezeOp4(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
@@ -108,6 +114,7 @@ def init_test_case(self):
 
 
 class API_TestUnsqueeze(unittest.TestCase):
+
     def test_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
@@ -124,20 +131,23 @@ def test_out(self):
 
 
 class TestUnsqueezeOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
             # The type of axis in split_op should be int or Variable.
             def test_axes_type():
-                x6 = paddle.static.data(
-                    shape=[-1, 10], dtype='float16', name='x3')
+                x6 = paddle.static.data(shape=[-1, 10],
+                                        dtype='float16',
+                                        name='x3')
                 paddle.unsqueeze(x6, axis=3.2)
 
             self.assertRaises(TypeError, test_axes_type)
 
 
 class API_TestUnsqueeze2(unittest.TestCase):
+
     def test_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
@@ -150,13 +160,16 @@ def test_out(self):
             input1 = np.random.random([5, 1, 10]).astype('float64')
             input2 = np.array([1]).astype('int32')
             input = np.squeeze(input1, axis=1)
-            result1, = exe.run(feed={"data1": input,
-                                     "data2": input2},
+            result1, = exe.run(feed={
+                "data1": input,
+                "data2": input2
+            },
                                fetch_list=[result_squeeze])
             self.assertTrue(np.allclose(input1, result1))
 
 
 class API_TestUnsqueeze3(unittest.TestCase):
+
     def test_out(self):
         paddle.enable_static()
         with paddle.static.program_guard(paddle.static.Program(),
@@ -169,14 +182,17 @@ def test_out(self):
             input1 = np.random.random([5, 1, 10, 1]).astype('float64')
             input2 = np.array([1]).astype('int32')
             input = np.squeeze(input1)
-            result1, = exe.run(feed={"data1": input,
-                                     "data2": input2},
+            result1, = exe.run(feed={
+                "data1": input,
+                "data2": input2
+            },
                                fetch_list=[result_squeeze])
             self.assertTrue(np.array_equal(input1, result1))
             self.assertEqual(input1.shape, result1.shape)
 
 
 class API_TestDyUnsqueeze(unittest.TestCase):
+
     def test_out(self):
         paddle.disable_static()
         input_1 = np.random.random([5, 1, 10]).astype("int32")
@@ -189,6 +205,7 @@ def test_out(self):
 
 
 class API_TestDyUnsqueeze2(unittest.TestCase):
+
     def test_out(self):
         paddle.disable_static()
         input1 = np.random.random([5, 10]).astype("int32")
@@ -201,6 +218,7 @@ def test_out(self):
 
 
 class API_TestDyUnsqueezeAxisTensor(unittest.TestCase):
+
     def test_out(self):
         paddle.disable_static()
         input1 = np.random.random([5, 10]).astype("int32")
@@ -214,6 +232,7 @@ def test_out(self):
 
 
 class API_TestDyUnsqueezeAxisTensorList(unittest.TestCase):
+
     def test_out(self):
         paddle.disable_static()
         input1 = np.random.random([5, 10]).astype("int32")
@@ -223,13 +242,15 @@ def test_out(self):
         input = paddle.to_tensor(input1)
         output = paddle.unsqueeze(
             paddle.to_tensor(input1),
-            axis=[paddle.to_tensor([1]), paddle.to_tensor([2])])
+            axis=[paddle.to_tensor([1]),
+                  paddle.to_tensor([2])])
         out_np = output.numpy()
         self.assertTrue(np.array_equal(out1, out_np))
         self.assertEqual(out1.shape, out_np.shape)
 
 
 class API_TestDygraphUnSqueeze(unittest.TestCase):
+
     def setUp(self):
         self.executed_api()
 
@@ -283,6 +304,7 @@ def test_dimension_not_1(self):
 
 
 class API_TestDygraphUnSqueezeInplace(API_TestDygraphUnSqueeze):
+
     def executed_api(self):
         self.unsqueeze = paddle.unsqueeze_
 
diff --git a/python/paddle/fluid/tests/unittests/test_unstack_op.py b/python/paddle/fluid/tests/unittests/test_unstack_op.py
index 01232293527cf..730a74dc54c5a 100644
--- a/python/paddle/fluid/tests/unittests/test_unstack_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unstack_op.py
@@ -18,6 +18,7 @@
 
 
 class TestUnStackOpBase(OpTest):
+
     def initDefaultParameters(self):
         self.input_dim = (5, 6, 7)
         self.axis = 0
@@ -58,21 +59,25 @@ def test_check_grad(self):
 
 
 class TestStackOp3(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = -1
 
 
 class TestStackOp4(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = -3
 
 
 class TestStackOp5(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = 1
 
 
 class TestStackOp6(TestUnStackOpBase):
+
     def initParameters(self):
         self.axis = 2
 
diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
index 56f49f60bde84..c1294628a4e71 100644
--- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
+++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
@@ -20,6 +20,7 @@
 
 
 class TestUpdateLossScalingOp(OpTest):
+
     def setUp(self):
         self.op_type = "update_loss_scaling"
         self.init()
@@ -61,6 +62,7 @@ def test_check_output(self):
 
 
 class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
+
     def setUp(self):
         self.op_type = "update_loss_scaling"
         self.init()
@@ -90,17 +92,21 @@ def test_check_output(self):
 
 
 class TestUpdateLossScalingLayer(unittest.TestCase):
+
     def loss_scaling_check(self, use_cuda=True, scope=fluid.Scope()):
         a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
         b = fluid.data(name="b", shape=[512, 128], dtype='float32')
         x = [a, b]
         found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
-        prev_loss_scaling = fluid.data(
-            name="prev_loss_scaling", shape=[1], dtype='float32')
-        num_good_steps = fluid.data(
-            name="num_good_steps", shape=[1], dtype='int32')
-        num_bad_steps = fluid.data(
-            name="num_bad_steps", shape=[1], dtype='int32')
+        prev_loss_scaling = fluid.data(name="prev_loss_scaling",
+                                       shape=[1],
+                                       dtype='float32')
+        num_good_steps = fluid.data(name="num_good_steps",
+                                    shape=[1],
+                                    dtype='int32')
+        num_bad_steps = fluid.data(name="num_bad_steps",
+                                   shape=[1],
+                                   dtype='int32')
 
         a_v = np.random.random([1024, 1024]).astype('float32')
         b_v = np.random.random([512, 128]).astype('float32')
@@ -114,17 +120,16 @@ def loss_scaling_check(self, use_cuda=True, scope=fluid.Scope()):
         incr_ratio = 2
         decr_ratio = 0.8
 
-        result = amp_nn.update_loss_scaling(
-            x,
-            found_inf,
-            prev_loss_scaling,
-            num_good_steps,
-            num_bad_steps,
-            incr_every_n_steps,
-            decr_every_n_nan_or_inf,
-            incr_ratio,
-            decr_ratio,
-            name="update_loss_scaling")
+        result = amp_nn.update_loss_scaling(x,
+                                            found_inf,
+                                            prev_loss_scaling,
+                                            num_good_steps,
+                                            num_bad_steps,
+                                            incr_every_n_steps,
+                                            decr_every_n_nan_or_inf,
+                                            incr_ratio,
+                                            decr_ratio,
+                                            name="update_loss_scaling")
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
@@ -156,12 +161,15 @@ def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
         b = fluid.data(name="b", shape=[512, 128], dtype='float32')
         x = [a, b]
         found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
-        prev_loss_scaling = fluid.data(
-            name="prev_loss_scaling", shape=[1], dtype='float32')
-        num_good_steps = fluid.data(
-            name="num_good_steps", shape=[1], dtype='int32')
-        num_bad_steps = fluid.data(
-            name="num_bad_steps", shape=[1], dtype='int32')
+        prev_loss_scaling = fluid.data(name="prev_loss_scaling",
+                                       shape=[1],
+                                       dtype='float32')
+        num_good_steps = fluid.data(name="num_good_steps",
+                                    shape=[1],
+                                    dtype='int32')
+        num_bad_steps = fluid.data(name="num_bad_steps",
+                                   shape=[1],
+                                   dtype='int32')
 
         a_v = np.random.random([1024, 1024]).astype('float32')
         b_v = np.random.random([512, 128]).astype('float32')
@@ -178,17 +186,16 @@ def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
         incr_ratio = 2
         decr_ratio = 0.8
 
-        result = amp_nn.update_loss_scaling(
-            x,
-            found_inf,
-            prev_loss_scaling,
-            num_good_steps,
-            num_bad_steps,
-            incr_every_n_steps,
-            decr_every_n_nan_or_inf,
-            incr_ratio,
-            decr_ratio,
-            name="update_loss_scaling")
+        result = amp_nn.update_loss_scaling(x,
+                                            found_inf,
+                                            prev_loss_scaling,
+                                            num_good_steps,
+                                            num_bad_steps,
+                                            incr_every_n_steps,
+                                            decr_every_n_nan_or_inf,
+                                            incr_ratio,
+                                            decr_ratio,
+                                            name="update_loss_scaling")
 
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index e6e608bea23f4..54c800c875444 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -26,12 +26,14 @@
 
 
 class TestVarBase(unittest.TestCase):
+
     def setUp(self):
         self.shape = [512, 1234]
         self.dtype = np.float32
         self.array = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
 
     def func_test_to_tensor(self):
+
         def _test_place(place):
             with fluid.dygraph.guard():
                 paddle.set_default_dtype('float32')
@@ -44,10 +46,9 @@ def _test_place(place):
                 self.assertEqual(str(x.place), str(y.place))
 
                 # set_default_dtype should not take effect on numpy
-                x = paddle.to_tensor(
-                    np.array([1.2]).astype('float16'),
-                    place=place,
-                    stop_gradient=False)
+                x = paddle.to_tensor(np.array([1.2]).astype('float16'),
+                                     place=place,
+                                     stop_gradient=False)
                 self.assertTrue(
                     np.array_equal(x.numpy(), np.array([1.2], 'float16')))
                 self.assertEqual(x.dtype, core.VarDesc.VarType.FP16)
@@ -59,8 +60,8 @@ def _test_place(place):
                 # set_default_dtype take effect on float
                 x = paddle.to_tensor(1.2, place=place, stop_gradient=False)
                 self.assertTrue(
-                    np.array_equal(x.numpy(), np.array([1.2]).astype(
-                        'float32')))
+                    np.array_equal(x.numpy(),
+                                   np.array([1.2]).astype('float32')))
                 self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
                 clone_x = x.clone()
                 self.assertTrue(
@@ -108,18 +109,24 @@ def _test_place(place):
                 self.assertTrue(np.array_equal(x.numpy(), [1 + 2j]))
                 self.assertEqual(x.dtype, core.VarDesc.VarType.COMPLEX128)
 
-                x = paddle.to_tensor(
-                    1, dtype='float32', place=place, stop_gradient=False)
+                x = paddle.to_tensor(1,
+                                     dtype='float32',
+                                     place=place,
+                                     stop_gradient=False)
                 self.assertTrue(np.array_equal(x.numpy(), [1.]))
                 self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
                 self.assertEqual(x.shape, [1])
                 self.assertEqual(x.stop_gradient, False)
                 self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
 
-                x = paddle.to_tensor(
-                    (1, 2), dtype='float32', place=place, stop_gradient=False)
-                x = paddle.to_tensor(
-                    [1, 2], dtype='float32', place=place, stop_gradient=False)
+                x = paddle.to_tensor((1, 2),
+                                     dtype='float32',
+                                     place=place,
+                                     stop_gradient=False)
+                x = paddle.to_tensor([1, 2],
+                                     dtype='float32',
+                                     place=place,
+                                     stop_gradient=False)
                 self.assertTrue(np.array_equal(x.numpy(), [1., 2.]))
                 self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
                 self.assertEqual(x.grad, None)
@@ -127,11 +134,10 @@ def _test_place(place):
                 self.assertEqual(x.stop_gradient, False)
                 self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR)
 
-                x = paddle.to_tensor(
-                    self.array,
-                    dtype='float32',
-                    place=place,
-                    stop_gradient=False)
+                x = paddle.to_tensor(self.array,
+                                     dtype='float32',
+                                     place=place,
+                                     stop_gradient=False)
                 self.assertTrue(np.array_equal(x.numpy(), self.array))
                 self.assertEqual(x.dtype, core.VarDesc.VarType.FP32)
                 self.assertEqual(x.shape, self.shape)
@@ -148,8 +154,9 @@ def _test_place(place):
                 z = x + y
                 self.assertTrue(np.array_equal(z.numpy(), 2 * self.array))
 
-                x = paddle.to_tensor(
-                    [1 + 2j, 1 - 2j], dtype='complex64', place=place)
+                x = paddle.to_tensor([1 + 2j, 1 - 2j],
+                                     dtype='complex64',
+                                     place=place)
                 y = paddle.to_tensor(x)
                 self.assertTrue(np.array_equal(x.numpy(), [1 + 2j, 1 - 2j]))
                 self.assertEqual(y.dtype, core.VarDesc.VarType.COMPLEX64)
@@ -171,7 +178,8 @@ def _test_place(place):
                 self.assertTrue(isinstance(x.item(1, 0, 1), float))
                 self.assertEqual(x.item(5), x.item(1, 0, 1))
                 self.assertTrue(
-                    np.array_equal(x.item(1, 0, 1), x.numpy().item(1, 0, 1)))
+                    np.array_equal(x.item(1, 0, 1),
+                                   x.numpy().item(1, 0, 1)))
 
                 x = paddle.to_tensor([[1.111111, 2.222222, 3.333333]])
                 self.assertEqual(x.item(0, 2), x.item(2))
@@ -402,18 +410,16 @@ def func_test_leaf_tensor(self):
             y = x + 1
             self.assertTrue(y.is_leaf)
 
-            x = paddle.to_tensor(
-                np.random.uniform(
-                    -1, 1, size=[10, 10]), stop_gradient=False)
+            x = paddle.to_tensor(np.random.uniform(-1, 1, size=[10, 10]),
+                                 stop_gradient=False)
             self.assertTrue(x.is_leaf)
             y = x + 1
             self.assertFalse(y.is_leaf)
 
             linear = paddle.nn.Linear(10, 10)
-            input = paddle.to_tensor(
-                np.random.uniform(
-                    -1, 1, size=[10, 10]).astype('float32'),
-                stop_gradient=False)
+            input = paddle.to_tensor(np.random.uniform(
+                -1, 1, size=[10, 10]).astype('float32'),
+                                     stop_gradient=False)
             self.assertTrue(input.is_leaf)
 
             out = linear(input)
@@ -537,8 +543,8 @@ def func_test_deep_copy(self):
                                       core.VarDesc.VarType.SELECTED_ROWS, True)
 
             selected_rows = x.value().get_selected_rows()
-            selected_rows.get_tensor().set(
-                np.random.rand(3, 100), core.CPUPlace())
+            selected_rows.get_tensor().set(np.random.rand(3, 100),
+                                           core.CPUPlace())
             selected_rows.set_height(10)
             selected_rows.set_rows([3, 5, 7])
             x_copy = copy.deepcopy(x)
@@ -553,9 +559,8 @@ def func_test_deep_copy(self):
                              selected_rows.height())
             self.assertEqual(copy_selected_rows.rows(), selected_rows.rows())
             self.assertTrue(
-                np.array_equal(
-                    np.array(copy_selected_rows.get_tensor()),
-                    np.array(selected_rows.get_tensor())))
+                np.array_equal(np.array(copy_selected_rows.get_tensor()),
+                               np.array(selected_rows.get_tensor())))
 
     def test_deep_copy(self):
         with _test_eager_guard():
@@ -692,10 +697,10 @@ def _test_slice(self):
         nw = w[:, :, :-1]
         self.assertEqual((784, 100, 99), tuple(nw.shape))
 
-        tensor_array = np.array(
-            [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-             [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
-             [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype('float32')
+        tensor_array = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                                 [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
+                                 [[19, 20, 21], [22, 23, 24],
+                                  [25, 26, 27]]]).astype('float32')
         var = fluid.dygraph.to_variable(tensor_array)
         var1 = var[0, 1, 1]
         var2 = var[1:]
@@ -752,10 +757,10 @@ def _test_slice(self):
         self.assertTrue(np.array_equal(local_out[18], tensor_array[:, 1:1:2]))
 
     def _test_slice_for_tensor_attr(self):
-        tensor_array = np.array(
-            [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-             [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
-             [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype('float32')
+        tensor_array = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                                 [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
+                                 [[19, 20, 21], [22, 23, 24],
+                                  [25, 26, 27]]]).astype('float32')
 
         var = paddle.to_tensor(tensor_array)
 
@@ -885,14 +890,14 @@ def _test_none_index(self):
         self.assertTrue(np.array_equal(var[5], np_value[None, 2, 0, ...]))
         self.assertTrue(np.array_equal(var[6], np_value[None, 2, None, 1]))
         self.assertTrue(np.array_equal(var[7], np_value[None]))
-        self.assertTrue(
-            np.array_equal(var[8], np_value[0, 0, None, 0, 0, None]))
+        self.assertTrue(np.array_equal(var[8], np_value[0, 0, None, 0, 0,
+                                                        None]))
         self.assertTrue(
             np.array_equal(var[9], np_value[None, None, 0, ..., None]))
         self.assertTrue(np.array_equal(var[10], np_value[..., None, :, None]))
 
-        # TODO(zyfncg) there is a bug of dimensions when slice step > 1 and 
-        #              indexs has int type 
+        # TODO(zyfncg) there is a bug of dimensions when slice step > 1 and
+        #              indexs has int type
         # self.assertTrue(
         #     np.array_equal(var[11], np_value[0, 1:10:2, None, None, ...]))
 
@@ -921,11 +926,11 @@ def _test_bool_index(self):
         self.assertTrue(np.array_equal(var[5], np_value[index2d]))
         self.assertTrue(np.array_equal(var[6], np_value[index[4]]))
         self.assertTrue(
-            np.array_equal(var_tensor[var_tensor > 0.67], np_value[np_value >
-                                                                   0.67]))
+            np.array_equal(var_tensor[var_tensor > 0.67],
+                           np_value[np_value > 0.67]))
         self.assertTrue(
-            np.array_equal(var_tensor[var_tensor < 0.55], np_value[np_value <
-                                                                   0.55]))
+            np.array_equal(var_tensor[var_tensor < 0.55],
+                           np_value[np_value < 0.55]))
 
         with self.assertRaises(ValueError):
             var_tensor[[False, False, False, False]]
@@ -942,7 +947,9 @@ def _test_scalar_bool_index(self):
         var_tensor = paddle.to_tensor(np_value)
         index = [True]
         tensor_index = paddle.to_tensor(index)
-        var = [var_tensor[tensor_index].numpy(), ]
+        var = [
+            var_tensor[tensor_index].numpy(),
+        ]
         self.assertTrue(np.array_equal(var[0], np_value[index]))
 
     def _test_for_var(self):
@@ -957,8 +964,9 @@ def _test_numpy_index(self):
         t = paddle.to_tensor(array)
         self.assertTrue(np.array_equal(t[np.longlong(0)].numpy(), array[0]))
         self.assertTrue(
-            np.array_equal(t[np.longlong(0):np.longlong(4):np.longlong(2)]
-                           .numpy(), array[0:4:2]))
+            np.array_equal(
+                t[np.longlong(0):np.longlong(4):np.longlong(2)].numpy(),
+                array[0:4:2]))
         self.assertTrue(np.array_equal(t[np.int64(0)].numpy(), array[0]))
         self.assertTrue(
             np.array_equal(t[np.int32(1):np.int32(4):np.int32(2)].numpy(),
@@ -1039,9 +1047,7 @@ def func_test_var_base_as_np(self):
             var = fluid.dygraph.to_variable(self.array)
             self.assertTrue(np.array_equal(var.numpy(), np.array(var)))
             self.assertTrue(
-                np.array_equal(
-                    var.numpy(), np.array(
-                        var, dtype=np.float32)))
+                np.array_equal(var.numpy(), np.array(var, dtype=np.float32)))
 
     def test_var_base_as_np(self):
         with _test_eager_guard():
@@ -1106,8 +1112,8 @@ def _assert_to_static(self, var_base, static_var, is_param=False):
             self.assertTrue(static_var.persistable, True)
             if isinstance(var_base, fluid.framework.ParamBase):
                 for attr in ['trainable', 'is_distributed', 'do_model_average']:
-                    self.assertEqual(
-                        getattr(var_base, attr), getattr(static_var, attr))
+                    self.assertEqual(getattr(var_base, attr),
+                                     getattr(static_var, attr))
 
                 self.assertEqual(static_var.optimize_attr['learning_rate'],
                                  0.001)
@@ -1214,8 +1220,10 @@ def func_test_tensor_str_linewidth(self):
         paddle.disable_static(paddle.CPUPlace())
         paddle.seed(2021)
         x = paddle.rand([128])
-        paddle.set_printoptions(
-            precision=4, threshold=1000, edgeitems=3, linewidth=80)
+        paddle.set_printoptions(precision=4,
+                                threshold=1000,
+                                edgeitems=3,
+                                linewidth=80)
         a_str = str(x)
 
         expected = '''Tensor(shape=[128], dtype=float32, place=Place(cpu), stop_gradient=True,
@@ -1308,6 +1316,7 @@ def test_print_tensor_dtype(self):
 
 
 class TestVarBaseSetitem(unittest.TestCase):
+
     def func_setUp(self):
         self.set_dtype()
         self.tensor_x = paddle.to_tensor(np.ones((4, 2, 3)).astype(self.dtype))
@@ -1379,11 +1388,13 @@ def test_value_int(self):
 
 
 class TestVarBaseSetitemInt64(TestVarBaseSetitem):
+
     def set_dtype(self):
         self.dtype = "int64"
 
 
 class TestVarBaseSetitemFp32(TestVarBaseSetitem):
+
     def set_dtype(self):
         self.dtype = "float32"
 
@@ -1400,11 +1411,13 @@ def test_value_float(self):
 
 
 class TestVarBaseSetitemFp64(TestVarBaseSetitem):
+
     def set_dtype(self):
         self.dtype = "float64"
 
 
 class TestVarBaseSetitemBoolIndex(unittest.TestCase):
+
     def func_setUp(self):
         paddle.disable_static()
         self.set_dtype()
@@ -1483,6 +1496,7 @@ def test_value_int(self):
 
 
 class TestVarBaseSetitemBoolScalarIndex(unittest.TestCase):
+
     def set_input(self):
         self.tensor_x = paddle.to_tensor(np.ones((1, 2, 3)).astype(self.dtype))
         self.np_value = np.random.random((2, 3)).astype(self.dtype)
@@ -1508,6 +1522,7 @@ def _test(self, value):
 
 
 class TestVarBaseInplaceVersion(unittest.TestCase):
+
     def func_test_setitem(self):
         paddle.disable_static()
 
@@ -1543,6 +1558,7 @@ def test_bump_inplace_version(self):
 
 
 class TestVarBaseSlice(unittest.TestCase):
+
     def func_test_slice(self):
         paddle.disable_static()
         np_x = np.random.random((3, 8, 8))
@@ -1558,6 +1574,7 @@ def test_slice(self):
 
 
 class TestVarBaseClear(unittest.TestCase):
+
     def func_test_clear(self):
         paddle.disable_static()
         np_x = np.random.random((3, 8, 8))
@@ -1572,6 +1589,7 @@ def test_clear(self):
 
 
 class TestVarBaseOffset(unittest.TestCase):
+
     def func_offset(self):
         paddle.disable_static()
         np_x = np.random.random((3, 8, 8))
@@ -1588,6 +1606,7 @@ def test_offset(self):
 
 
 class TestVarBaseShareBufferTo(unittest.TestCase):
+
     def func_test_share_buffer_To(self):
         paddle.disable_static()
         np_src = np.random.random((3, 8, 8))
@@ -1607,6 +1626,7 @@ def test_share_buffer_To(self):
 
 
 class TestVarBaseTo(unittest.TestCase):
+
     def func_setUp(self):
         paddle.disable_static()
         self.np_x = np.random.random((3, 8, 8))
@@ -1668,6 +1688,7 @@ def test_to_api(self):
 
 
 class TestVarBaseInitVarBaseFromTensorWithDevice(unittest.TestCase):
+
     def func_test_varbase_init(self):
         paddle.disable_static()
         t = fluid.Tensor()
@@ -1697,6 +1718,7 @@ def test_varbase_init(self):
 
 
 class TestVarBaseNumel(unittest.TestCase):
+
     def func_test_numel_normal(self):
         paddle.disable_static()
         np_x = np.random.random((3, 8, 8))
@@ -1726,6 +1748,7 @@ def ttest_numel_without_holder(self):
 
 
 class TestVarBaseCopyGradientFrom(unittest.TestCase):
+
     def func_test_copy_gradient_from(self):
         paddle.disable_static()
         np_x = np.random.random((2, 2))
@@ -1744,6 +1767,7 @@ def test_copy_gradient_from(self):
 
 
 class TestEagerTensorGradNameValue(unittest.TestCase):
+
     def test_eager_tensor_grad_name_value(self):
         with _test_eager_guard():
             a_np = np.array([2, 3]).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py b/python/paddle/fluid/tests/unittests/test_var_conv_2d.py
index 4e23b20581122..9fd1e8573f80f 100644
--- a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py
+++ b/python/paddle/fluid/tests/unittests/test_var_conv_2d.py
@@ -20,6 +20,7 @@
 
 
 class TestVarConv2DOp(OpTest):
+
     def setUp(self):
         self.init_op_type()
         self.set_data()
@@ -175,11 +176,14 @@ def test_check_output(self):
         self.check_output(check_dygraph=False)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.005, check_dygraph=False)
+        self.check_grad(['X'],
+                        'Out',
+                        max_relative_error=0.005,
+                        check_dygraph=False)
 
 
 class TestVarConv2DOpCase1(TestVarConv2DOp):
+
     def set_data(self):
         # set in_ch 1
         input_channel = 1
@@ -193,6 +197,7 @@ def set_data(self):
 
 
 class TestVarConv2DOpCase2(TestVarConv2DOp):
+
     def set_data(self):
         # set out_ch 1
         input_channel = 2
@@ -206,6 +211,7 @@ def set_data(self):
 
 
 class TestVarConv2DOpCase3(TestVarConv2DOp):
+
     def set_data(self):
         # set batch 1
         input_channel = 2
@@ -219,6 +225,7 @@ def set_data(self):
 
 
 class TestVarConv2DOpCase4(TestVarConv2DOp):
+
     def set_data(self):
         # set filter size very large
         input_channel = 3
@@ -232,6 +239,7 @@ def set_data(self):
 
 
 class TestVarConv2DOpCase5(TestVarConv2DOp):
+
     def set_data(self):
         # set input very small
         input_channel = 50
@@ -245,9 +253,11 @@ def set_data(self):
 
 
 @skip_check_grad_ci(
-    reason="[skip shape check] Use shape of input_channel, row and col all is 1 to test special LoDTensor."
+    reason=
+    "[skip shape check] Use shape of input_channel, row and col all is 1 to test special LoDTensor."
 )
 class TestVarConv2DOpCase6(TestVarConv2DOp):
+
     def set_data(self):
         input_channel = 1
         output_channel = 3
@@ -260,6 +270,7 @@ def set_data(self):
 
 
 class TestVarConv2DOpCase7(TestVarConv2DOp):
+
     def set_data(self):
         input_channel = 2
         output_channel = 3
@@ -272,20 +283,20 @@ def set_data(self):
 
 
 class TestVarConv2DApi(unittest.TestCase):
+
     def test_api(self):
         import paddle.fluid as fluid
 
         x = fluid.layers.data(name='x', shape=[1], lod_level=1)
         row = fluid.layers.data(name='row', shape=[6], lod_level=1)
         col = fluid.layers.data(name='col', shape=[6], lod_level=1)
-        out = fluid.contrib.var_conv_2d(
-            input=x,
-            row=row,
-            col=col,
-            input_channel=3,
-            output_channel=5,
-            filter_size=[3, 3],
-            stride=1)
+        out = fluid.contrib.var_conv_2d(input=x,
+                                        row=row,
+                                        col=col,
+                                        input_channel=3,
+                                        output_channel=5,
+                                        filter_size=[3, 3],
+                                        stride=1)
 
         place = fluid.CPUPlace()
         x_tensor = fluid.create_lod_tensor(
@@ -297,12 +308,13 @@ def test_api(self):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(
-            feed={'x': x_tensor,
-                  'row': row_tensor,
-                  'col': col_tensor},
-            fetch_list=[out],
-            return_numpy=False)
+        ret = exe.run(feed={
+            'x': x_tensor,
+            'row': row_tensor,
+            'col': col_tensor
+        },
+                      fetch_list=[out],
+                      return_numpy=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index 3a924669b0020..87802b83415d6 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -29,6 +29,7 @@
 
 
 class TestVariable(unittest.TestCase):
+
     def test_np_dtype_convert(self):
         DT = core.VarDesc.VarType
         convert = convert_np_dtype_to_dtype_
@@ -44,8 +45,10 @@ def test_np_dtype_convert(self):
 
     def test_var(self):
         b = default_main_program().current_block()
-        w = b.create_var(
-            dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
+        w = b.create_var(dtype="float64",
+                         shape=[784, 100],
+                         lod_level=0,
+                         name="fc.w")
         self.assertNotEqual(str(w), "")
         self.assertEqual(core.VarDesc.VarType.FP64, w.dtype)
         self.assertEqual((784, 100), w.shape)
@@ -63,10 +66,9 @@ def test_var(self):
         self.assertRaises(ValueError,
                           lambda: b.create_var(name="fc.w", shape=(24, 100)))
 
-        w = b.create_var(
-            dtype=paddle.fluid.core.VarDesc.VarType.STRINGS,
-            shape=[1],
-            name="str_var")
+        w = b.create_var(dtype=paddle.fluid.core.VarDesc.VarType.STRINGS,
+                         shape=[1],
+                         name="str_var")
         self.assertEqual(None, w.lod_level)
 
     def test_element_size(self):
@@ -101,8 +103,8 @@ def test_element_size(self):
     def test_step_scopes(self):
         prog = Program()
         b = prog.current_block()
-        var = b.create_var(
-            name='step_scopes', type=core.VarDesc.VarType.STEP_SCOPES)
+        var = b.create_var(name='step_scopes',
+                           type=core.VarDesc.VarType.STEP_SCOPES)
         self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)
 
     def _test_slice(self, place):
@@ -135,10 +137,10 @@ def _test_slice(self, place):
         main = fluid.Program()
         with fluid.program_guard(main):
             exe = fluid.Executor(place)
-            tensor_array = np.array(
-                [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                 [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
-                 [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype('float32')
+            tensor_array = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                                     [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
+                                     [[19, 20, 21], [22, 23, 24],
+                                      [25, 26, 27]]]).astype('float32')
             var = fluid.layers.assign(tensor_array)
             var1 = var[0, 1, 1]
             var2 = var[1:]
@@ -276,7 +278,8 @@ def _test_slice_index_ellipsis(self, place):
 
         expected = [
             data[0:, ..., 1:], data[0:, ...], data[..., 1:], data[...],
-            data[[1, 0], [0, 0]], data[([1, 0], [0, 0])], np.array([1])
+            data[[1, 0], [0, 0]], data[([1, 0], [0, 0])],
+            np.array([1])
         ]
 
         self.assertTrue((result[0] == expected[0]).all())
@@ -413,12 +416,11 @@ def test_variable_in_dygraph_mode(self):
     def test_create_selected_rows(self):
         b = default_main_program().current_block()
 
-        var = b.create_var(
-            name="var",
-            shape=[1, 1],
-            dtype="float32",
-            type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
-            persistable=True)
+        var = b.create_var(name="var",
+                           shape=[1, 1],
+                           dtype="float32",
+                           type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
+                           persistable=True)
 
         def _test():
             var.lod_level()
@@ -453,8 +455,9 @@ def test_detach(self):
         scope = fluid.core.Scope()
         with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main, startup):
-                x = paddle.static.data(
-                    name='x', shape=[3, 2, 1], dtype='float32')
+                x = paddle.static.data(name='x',
+                                       shape=[3, 2, 1],
+                                       dtype='float32')
                 x.persistable = True
                 feed_data = np.ones(shape=[3, 2, 1], dtype=np.float32)
                 detach_x = x.detach()
@@ -472,8 +475,9 @@ def test_detach(self):
                 self.assertTrue((result[1] == modified_value).all())
                 self.assertTrue((result[0] == result[1]).all())
 
-                modified_value = np.random.uniform(
-                    -1, 1, size=[3, 2, 1]).astype('float32')
+                modified_value = np.random.uniform(-1, 1,
+                                                   size=[3, 2,
+                                                         1]).astype('float32')
                 x.set_value(modified_value, scope)
                 result = exe.run(main, fetch_list=[x, detach_x])
                 self.assertTrue((result[1] == modified_value).all())
@@ -481,6 +485,7 @@ def test_detach(self):
 
 
 class TestVariableSlice(unittest.TestCase):
+
     def _test_item_none(self, place):
         data = np.random.rand(2, 3, 4).astype("float32")
         prog = paddle.static.Program()
@@ -539,6 +544,7 @@ def test_slice(self):
 
 
 class TestListIndex(unittest.TestCase):
+
     def numel(self, shape):
         return reduce(lambda x, y: x * y, shape)
 
@@ -546,8 +552,8 @@ def test_static_graph_list_index(self):
         paddle.enable_static()
 
         inps_shape = [3, 4, 5, 2]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [3, 3, 2, 1]
         index = np.arange(self.numel(index_shape)).reshape(index_shape)
@@ -558,8 +564,9 @@ def test_static_graph_list_index(self):
             index_mod = (index % (array.shape[0])).tolist()
 
             with paddle.static.program_guard(program):
-                x = paddle.static.data(
-                    name='x', shape=array.shape, dtype='float32')
+                x = paddle.static.data(name='x',
+                                       shape=array.shape,
+                                       dtype='float32')
 
                 y = x[index_mod]
 
@@ -612,16 +619,16 @@ def test_dygraph_list_index(self):
     def test_static_graph_list_index_muti_dim(self):
         paddle.enable_static()
         inps_shape = [3, 4, 5]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [2, 2]
         index1 = np.arange(self.numel(index_shape)).reshape(index_shape)
         index2 = np.arange(self.numel(index_shape)).reshape(index_shape) + 2
 
         value_shape = [3, 2, 2, 3]
-        value_np = np.arange(
-            self.numel(value_shape), dtype='float32').reshape(value_shape) + 100
+        value_np = np.arange(self.numel(value_shape),
+                             dtype='float32').reshape(value_shape) + 100
 
         index_mod1 = (index1 % (min(array.shape))).tolist()
         index_mod2 = (index2 % (min(array.shape))).tolist()
@@ -631,12 +638,15 @@ def test_static_graph_list_index_muti_dim(self):
 
             x = paddle.static.data(name='x', shape=array.shape, dtype='float32')
 
-            value = paddle.static.data(
-                name='value', shape=value_np.shape, dtype='float32')
-            index1 = paddle.static.data(
-                name='index1', shape=index1.shape, dtype='int32')
-            index2 = paddle.static.data(
-                name='index2', shape=index2.shape, dtype='int32')
+            value = paddle.static.data(name='value',
+                                       shape=value_np.shape,
+                                       dtype='float32')
+            index1 = paddle.static.data(name='index1',
+                                        shape=index1.shape,
+                                        dtype='int32')
+            index2 = paddle.static.data(name='index2',
+                                        shape=index2.shape,
+                                        dtype='int32')
 
             y = x[index1, index2]
 
@@ -661,23 +671,23 @@ def test_static_graph_list_index_muti_dim(self):
                                  },
                                  fetch_list=fetch_list)
 
-            self.assertTrue(
-                np.array_equal(y2, getitem_pp[0]),
-                msg='\n numpy:{},\n paddle:{}'.format(y2, getitem_pp[0]))
+            self.assertTrue(np.array_equal(y2, getitem_pp[0]),
+                            msg='\n numpy:{},\n paddle:{}'.format(
+                                y2, getitem_pp[0]))
 
     def test_dygraph_list_index_muti_dim(self):
         paddle.disable_static()
         inps_shape = [3, 4, 5]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [2, 2]
         index1 = np.arange(self.numel(index_shape)).reshape(index_shape)
         index2 = np.arange(self.numel(index_shape)).reshape(index_shape) + 2
 
         value_shape = [3, 2, 2, 3]
-        value_np = np.arange(
-            self.numel(value_shape), dtype='float32').reshape(value_shape) + 100
+        value_np = np.arange(self.numel(value_shape),
+                             dtype='float32').reshape(value_shape) + 100
 
         index_mod1 = (index1 % (min(array.shape))).tolist()
         index_mod2 = (index2 % (min(array.shape))).tolist()
@@ -714,9 +724,9 @@ def run_getitem_list_index(self, array, index):
         getitem_pp = exe.run(prog, feed={x.name: array}, fetch_list=fetch_list)
 
         print(getitem_pp)
-        self.assertTrue(
-            np.array_equal(value_np, getitem_pp[0]),
-            msg='\n numpy:{},\n paddle:{}'.format(value_np, getitem_pp[0]))
+        self.assertTrue(np.array_equal(value_np, getitem_pp[0]),
+                        msg='\n numpy:{},\n paddle:{}'.format(
+                            value_np, getitem_pp[0]))
 
     def test_static_graph_getitem_bool_index(self):
         paddle.enable_static()
@@ -748,8 +758,9 @@ def test_static_graph_getitem_bool_index(self):
     def run_setitem_list_index(self, array, index, value_np):
         x = paddle.static.data(name='x', shape=array.shape, dtype='float32')
 
-        value = paddle.static.data(
-            name='value', shape=value_np.shape, dtype='float32')
+        value = paddle.static.data(name='value',
+                                   shape=value_np.shape,
+                                   dtype='float32')
 
         x[index] = value
         y = x
@@ -766,34 +777,37 @@ def run_setitem_list_index(self, array, index, value_np):
             array2[index] = value_np
         except:
             with self.assertRaises(ValueError):
-                setitem_pp = exe.run(
-                    prog,
-                    feed={x.name: array,
-                          value.name: value_np},
-                    fetch_list=fetch_list)
+                setitem_pp = exe.run(prog,
+                                     feed={
+                                         x.name: array,
+                                         value.name: value_np
+                                     },
+                                     fetch_list=fetch_list)
             return
         setitem_pp = exe.run(prog,
-                             feed={x.name: array,
-                                   value.name: value_np},
+                             feed={
+                                 x.name: array,
+                                 value.name: value_np
+                             },
                              fetch_list=fetch_list)
 
-        self.assertTrue(
-            np.allclose(array2, setitem_pp[0]),
-            msg='\n numpy:{},\n paddle:{}'.format(array2, setitem_pp[0]))
+        self.assertTrue(np.allclose(array2, setitem_pp[0]),
+                        msg='\n numpy:{},\n paddle:{}'.format(
+                            array2, setitem_pp[0]))
 
     def test_static_graph_setitem_list_index(self):
         paddle.enable_static()
         # case 1:
         inps_shape = [3, 4, 5, 2, 3]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [3, 3, 1, 2]
         index = np.arange(self.numel(index_shape)).reshape(index_shape)
 
         value_shape = inps_shape[3:]
-        value_np = np.arange(
-            self.numel(value_shape), dtype='float32').reshape(value_shape) + 100
+        value_np = np.arange(self.numel(value_shape),
+                             dtype='float32').reshape(value_shape) + 100
 
         for _ in range(3):
             program = paddle.static.Program()
@@ -808,15 +822,15 @@ def test_static_graph_setitem_list_index(self):
 
         # case 2:
         inps_shape = [3, 4, 5, 4, 3]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [4, 3, 2, 2]
         index = np.arange(self.numel(index_shape)).reshape(index_shape)
 
         value_shape = [3]
-        value_np = np.arange(
-            self.numel(value_shape), dtype='float32').reshape(value_shape) + 100
+        value_np = np.arange(self.numel(value_shape),
+                             dtype='float32').reshape(value_shape) + 100
 
         for _ in range(4):
             program = paddle.static.Program()
@@ -830,15 +844,15 @@ def test_static_graph_setitem_list_index(self):
 
         # case 3:
         inps_shape = [3, 4, 5, 3, 3]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [4, 3, 2, 2]
         index = np.arange(self.numel(index_shape)).reshape(index_shape)
 
         value_shape = [3, 2, 2, 3]
-        value_np = np.arange(
-            self.numel(value_shape), dtype='float32').reshape(value_shape) + 100
+        value_np = np.arange(self.numel(value_shape),
+                             dtype='float32').reshape(value_shape) + 100
         index_mod = (index % (min(array.shape))).tolist()
         self.run_setitem_list_index(array, index_mod, value_np)
 
@@ -881,18 +895,18 @@ def test_static_graph_setitem_bool_scalar_index(self):
     def test_static_graph_tensor_index_setitem_muti_dim(self):
         paddle.enable_static()
         inps_shape = [3, 4, 5, 4]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [2, 3, 4]
-        index1 = np.arange(
-            self.numel(index_shape), dtype='int32').reshape(index_shape)
-        index2 = np.arange(
-            self.numel(index_shape), dtype='int32').reshape(index_shape) + 2
+        index1 = np.arange(self.numel(index_shape),
+                           dtype='int32').reshape(index_shape)
+        index2 = np.arange(self.numel(index_shape),
+                           dtype='int32').reshape(index_shape) + 2
 
         value_shape = [4]
-        value_np = np.arange(
-            self.numel(value_shape), dtype='float32').reshape(value_shape) + 100
+        value_np = np.arange(self.numel(value_shape),
+                             dtype='float32').reshape(value_shape) + 100
         for _ in range(3):
 
             index_mod1 = index1 % (min(array.shape))
@@ -906,17 +920,22 @@ def test_static_graph_tensor_index_setitem_muti_dim(self):
             program = paddle.static.Program()
             with paddle.static.program_guard(program):
 
-                x1 = paddle.static.data(
-                    name='x1', shape=array.shape, dtype='float32')
-                x2 = paddle.static.data(
-                    name='x2', shape=array.shape, dtype='float32')
-
-                value = paddle.static.data(
-                    name='value', shape=value_np.shape, dtype='float32')
-                index_1 = paddle.static.data(
-                    name='index_1', shape=index1.shape, dtype='int32')
-                index_2 = paddle.static.data(
-                    name='index_2', shape=index2.shape, dtype='int32')
+                x1 = paddle.static.data(name='x1',
+                                        shape=array.shape,
+                                        dtype='float32')
+                x2 = paddle.static.data(name='x2',
+                                        shape=array.shape,
+                                        dtype='float32')
+
+                value = paddle.static.data(name='value',
+                                           shape=value_np.shape,
+                                           dtype='float32')
+                index_1 = paddle.static.data(name='index_1',
+                                             shape=index1.shape,
+                                             dtype='int32')
+                index_2 = paddle.static.data(name='index_2',
+                                             shape=index2.shape,
+                                             dtype='int32')
 
                 x1[index_1, index_2] = value
                 x2[index_1] = value
@@ -940,14 +959,12 @@ def test_static_graph_tensor_index_setitem_muti_dim(self):
                                          index_2.name: index_mod2
                                      },
                                      fetch_list=fetch_list)
-                self.assertTrue(
-                    np.array_equal(array2, setitem_pp[0]),
-                    msg='\n numpy:{},\n paddle:{}'.format(array2,
-                                                          setitem_pp[0]))
-                self.assertTrue(
-                    np.array_equal(array3, setitem_pp[1]),
-                    msg='\n numpy:{},\n paddle:{}'.format(array3,
-                                                          setitem_pp[1]))
+                self.assertTrue(np.array_equal(array2, setitem_pp[0]),
+                                msg='\n numpy:{},\n paddle:{}'.format(
+                                    array2, setitem_pp[0]))
+                self.assertTrue(np.array_equal(array3, setitem_pp[1]),
+                                msg='\n numpy:{},\n paddle:{}'.format(
+                                    array3, setitem_pp[1]))
             array = array[0]
             index1 = index1[0]
             index2 = index2[0]
@@ -955,14 +972,14 @@ def test_static_graph_tensor_index_setitem_muti_dim(self):
     def test_static_graph_array_index_muti_dim(self):
         paddle.enable_static()
         inps_shape = [3, 4, 5, 4]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
 
         index_shape = [2, 3, 4]
-        index1 = np.arange(
-            self.numel(index_shape), dtype='int32').reshape(index_shape)
-        index2 = np.arange(
-            self.numel(index_shape), dtype='int32').reshape(index_shape) + 2
+        index1 = np.arange(self.numel(index_shape),
+                           dtype='int32').reshape(index_shape)
+        index2 = np.arange(self.numel(index_shape),
+                           dtype='int32').reshape(index_shape) + 2
 
         for _ in range(3):
             index_mod1 = index1 % (min(array.shape))
@@ -978,10 +995,12 @@ def test_static_graph_array_index_muti_dim(self):
             program = paddle.static.Program()
             with paddle.static.program_guard(program):
 
-                x1 = paddle.static.data(
-                    name='x1', shape=array.shape, dtype='float32')
-                x2 = paddle.static.data(
-                    name='x2', shape=array.shape, dtype='float32')
+                x1 = paddle.static.data(name='x1',
+                                        shape=array.shape,
+                                        dtype='float32')
+                x2 = paddle.static.data(name='x2',
+                                        shape=array.shape,
+                                        dtype='float32')
 
                 x1[index_mod1, index_mod2] = 1
                 x2[index_mod1] = 2.5
@@ -997,24 +1016,24 @@ def test_static_graph_array_index_muti_dim(self):
                 fetch_list = [x1.name, x2.name, y1.name, y2.name]
 
                 setitem_pp = exe.run(prog,
-                                     feed={x1.name: array,
-                                           x2.name: array},
+                                     feed={
+                                         x1.name: array,
+                                         x2.name: array
+                                     },
                                      fetch_list=fetch_list)
-                self.assertTrue(
-                    np.array_equal(array2, setitem_pp[0]),
-                    msg='\n numpy:{},\n paddle:{}'.format(array2,
-                                                          setitem_pp[0]))
-                self.assertTrue(
-                    np.array_equal(array3, setitem_pp[1]),
-                    msg='\n numpy:{},\n paddle:{}'.format(array3,
-                                                          setitem_pp[1]))
-
-                self.assertTrue(
-                    np.array_equal(y_np1, setitem_pp[2]),
-                    msg='\n numpy:{},\n paddle:{}'.format(y_np1, setitem_pp[2]))
-                self.assertTrue(
-                    np.array_equal(y_np2, setitem_pp[3]),
-                    msg='\n numpy:{},\n paddle:{}'.format(y_np2, setitem_pp[3]))
+                self.assertTrue(np.array_equal(array2, setitem_pp[0]),
+                                msg='\n numpy:{},\n paddle:{}'.format(
+                                    array2, setitem_pp[0]))
+                self.assertTrue(np.array_equal(array3, setitem_pp[1]),
+                                msg='\n numpy:{},\n paddle:{}'.format(
+                                    array3, setitem_pp[1]))
+
+                self.assertTrue(np.array_equal(y_np1, setitem_pp[2]),
+                                msg='\n numpy:{},\n paddle:{}'.format(
+                                    y_np1, setitem_pp[2]))
+                self.assertTrue(np.array_equal(y_np2, setitem_pp[3]),
+                                msg='\n numpy:{},\n paddle:{}'.format(
+                                    y_np2, setitem_pp[3]))
             array = array[0]
             index1 = index1[0]
             index2 = index2[0]
@@ -1022,13 +1041,13 @@ def test_static_graph_array_index_muti_dim(self):
     def test_dygraph_array_index_muti_dim(self):
         paddle.disable_static()
         inps_shape = [3, 4, 5, 4]
-        array = np.arange(
-            self.numel(inps_shape), dtype='float32').reshape(inps_shape)
+        array = np.arange(self.numel(inps_shape),
+                          dtype='float32').reshape(inps_shape)
         index_shape = [2, 3, 4]
-        index1 = np.arange(
-            self.numel(index_shape), dtype='int32').reshape(index_shape)
-        index2 = np.arange(
-            self.numel(index_shape), dtype='int32').reshape(index_shape) + 2
+        index1 = np.arange(self.numel(index_shape),
+                           dtype='int32').reshape(index_shape)
+        index2 = np.arange(self.numel(index_shape),
+                           dtype='int32').reshape(index_shape) + 2
 
         for _ in range(3):
 
@@ -1043,26 +1062,26 @@ def test_dygraph_array_index_muti_dim(self):
 
             y_t1 = tensor1[index_mod_t2, index_mod_t1]
 
-            self.assertTrue(
-                np.array_equal(y_t1.numpy(), y_np1),
-                msg='\n numpy:{},\n paddle:{}'.format(y_np1, y_t1.numpy()))
+            self.assertTrue(np.array_equal(y_t1.numpy(), y_np1),
+                            msg='\n numpy:{},\n paddle:{}'.format(
+                                y_np1, y_t1.numpy()))
             # 1 dim getitem
             array2 = array.copy()
             y_np2 = array2[index_mod2]
             tensor2 = paddle.to_tensor(array)
 
             y_t2 = tensor2[index_mod_t2]
-            self.assertTrue(
-                np.array_equal(y_t2.numpy(), y_np2),
-                msg='\n numpy:{},\n paddle:{}'.format(y_np2, y_t2.numpy()))
+            self.assertTrue(np.array_equal(y_t2.numpy(), y_np2),
+                            msg='\n numpy:{},\n paddle:{}'.format(
+                                y_np2, y_t2.numpy()))
 
             # 2 dim setitem
             array1 = array.copy()
             array1[index_mod1, index_mod2] = 1
             tensor1[index_mod_t1, index_mod_t2] = 1
-            self.assertTrue(
-                np.array_equal(tensor1.numpy(), array1),
-                msg='\n numpy:{},\n paddle:{}'.format(array1, tensor1.numpy()))
+            self.assertTrue(np.array_equal(tensor1.numpy(), array1),
+                            msg='\n numpy:{},\n paddle:{}'.format(
+                                array1, tensor1.numpy()))
             # 1 dim setitem
             array2 = array.copy()
 
@@ -1070,9 +1089,9 @@ def test_dygraph_array_index_muti_dim(self):
 
             tensor2[index_mod_t1] = 2.5
 
-            self.assertTrue(
-                np.array_equal(tensor2.numpy(), array2),
-                msg='\n numpy:{},\n paddle:{}'.format(array2, tensor2.numpy()))
+            self.assertTrue(np.array_equal(tensor2.numpy(), array2),
+                            msg='\n numpy:{},\n paddle:{}'.format(
+                                array2, tensor2.numpy()))
 
             array = array[0]
             index1 = index1[0]
diff --git a/python/paddle/fluid/tests/unittests/test_variance_layer.py b/python/paddle/fluid/tests/unittests/test_variance_layer.py
index 13e3cf4df111e..cf46d82b11d21 100644
--- a/python/paddle/fluid/tests/unittests/test_variance_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_variance_layer.py
@@ -27,6 +27,7 @@ def ref_var(x, axis=None, unbiased=True, keepdim=False):
 
 
 class TestVarAPI(unittest.TestCase):
+
     def setUp(self):
         self.dtype = 'float64'
         self.shape = [1, 3, 4, 10]
@@ -67,36 +68,43 @@ def test_api(self):
 
 
 class TestVarAPI_dtype(TestVarAPI):
+
     def set_attrs(self):
         self.dtype = 'float32'
 
 
 class TestVarAPI_axis_int(TestVarAPI):
+
     def set_attrs(self):
         self.axis = 2
 
 
 class TestVarAPI_axis_list(TestVarAPI):
+
     def set_attrs(self):
         self.axis = [1, 2]
 
 
 class TestVarAPI_axis_tuple(TestVarAPI):
+
     def set_attrs(self):
         self.axis = (1, 3)
 
 
 class TestVarAPI_keepdim(TestVarAPI):
+
     def set_attrs(self):
         self.keepdim = False
 
 
 class TestVarAPI_unbiased(TestVarAPI):
+
     def set_attrs(self):
         self.unbiased = False
 
 
 class TestVarAPI_alias(unittest.TestCase):
+
     def test_alias(self):
         paddle.disable_static()
         x = paddle.to_tensor(np.array([10, 12], 'float32'))
@@ -109,6 +117,7 @@ def test_alias(self):
 
 
 class TestVarError(unittest.TestCase):
+
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.fluid.data('X', [2, 3, 4], 'int32')
diff --git a/python/paddle/fluid/tests/unittests/test_version.py b/python/paddle/fluid/tests/unittests/test_version.py
index 42a0e5c802c53..d31288866861a 100644
--- a/python/paddle/fluid/tests/unittests/test_version.py
+++ b/python/paddle/fluid/tests/unittests/test_version.py
@@ -21,6 +21,7 @@
 
 
 class VersionTest(unittest.TestCase):
+
     def setUp(self):
         self._major_regex = "[0-9]+"
         self._minor_regex = "[0-9]+"
diff --git a/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py b/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py
index 0d4e379660b75..a70d8e209b33d 100644
--- a/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py
+++ b/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py
@@ -28,6 +28,7 @@
 # reuse the input varbase's allocation.
 # View APIs include: `squeeze`, `unsqueeze`, `reshape`, `flatten`, `detach`
 class TestDygraphViewReuseAllocation(unittest.TestCase):
+
     def setUp(self):
         self.init_shape()
 
@@ -104,6 +105,7 @@ def test_backward_error(self):
 
 
 class TestUnsqueezeDygraphViewReuseAllocation(TestDygraphViewReuseAllocation):
+
     def init_shape(self):
         self.input_shape = [2, 3]
         self.output_shape = [2, 3, 1]
@@ -113,6 +115,7 @@ def view_api_processing(self, var):
 
 
 class TestReshapeDygraphViewReuseAllocation(TestDygraphViewReuseAllocation):
+
     def init_shape(self):
         self.input_shape = [3, 4]
         self.output_shape = [2, 2, 3]
@@ -122,6 +125,7 @@ def view_api_processing(self, var):
 
 
 class TestFlattenDygraphViewReuseAllocation(TestDygraphViewReuseAllocation):
+
     def init_shape(self):
         self.input_shape = [3, 4]
         self.output_shape = [12]
diff --git a/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py b/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py
index 163e246b71560..7b78957777154 100644
--- a/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py
@@ -14,10 +14,12 @@
 from paddle.fluid import core
 import unittest
 import paddle
+
 paddle.enable_static()
 
 
 class Decoder(object):
+
     def __init__(self, transitions, use_tag=True):
         self.transitions = transitions
         self.use_tag = use_tag
@@ -67,6 +69,7 @@ def __call__(self, inputs, length):
 
 
 class TestViterbiOp(OpTest):
+
     def set_attr(self):
         self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
         self.use_tag = True
@@ -87,7 +90,9 @@ def setUp(self):
             'Transition': self.trans,
             'Length': self.length
         }
-        self.attrs = {'include_bos_eos_tag': self.use_tag, }
+        self.attrs = {
+            'include_bos_eos_tag': self.use_tag,
+        }
         self.outputs = {'Scores': scores, 'Path': path}
 
     def test_output(self):
@@ -95,6 +100,7 @@ def test_output(self):
 
 
 class TestViterbiAPI(unittest.TestCase):
+
     def set_attr(self):
         self.use_tag = True
         self.bz, self.len, self.ntags = 4, 8, 10
@@ -113,10 +119,12 @@ def setUp(self):
     def check_static_result(self, place):
         bz, length, ntags = self.bz, self.len, self.ntags
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            Input = fluid.data(
-                name="Input", shape=[bz, length, ntags], dtype="float32")
-            Transition = fluid.data(
-                name="Transition", shape=[ntags, ntags], dtype="float32")
+            Input = fluid.data(name="Input",
+                               shape=[bz, length, ntags],
+                               dtype="float32")
+            Transition = fluid.data(name="Transition",
+                                    shape=[ntags, ntags],
+                                    dtype="float32")
             Length = fluid.data(name="Length", shape=[bz], dtype="int64")
             decoder = paddle.text.ViterbiDecoder(Transition, self.use_tag)
             score, path = decoder(Input, Length)
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 53f3b3cf53d76..1d9d9a180d014 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -31,6 +31,7 @@
 
 
 class CTCForward(object):
+
     def __init__(self, softmax, softmax_lod, labels, labels_lod, num_classes,
                  batch_size, blank, norm_by_times):
         self.softmax = softmax
@@ -115,15 +116,15 @@ def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence):
         # calculate the forward and backward variables,
         # reference Chapter 7.3 of "Alex Grave, Supervised Sequence
         # Labelling with Recurrent Neural Networks"
-        log_acts = np.zeros(
-            [total_times, self.num_classes], dtype=softmax_a_sequence.dtype)
+        log_acts = np.zeros([total_times, self.num_classes],
+                            dtype=softmax_a_sequence.dtype)
         for i in range(total_times):
             for j in range(self.num_classes):
                 log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
 
         # calculate the forward variables
-        forward_vars = np.zeros(
-            [total_times, total_segments], dtype=softmax_a_sequence.dtype)
+        forward_vars = np.zeros([total_times, total_segments],
+                                dtype=softmax_a_sequence.dtype)
         for i in range(total_times):
             for j in range(total_segments):
                 forward_vars[i, j] = self.LOG_ZERO
@@ -174,8 +175,8 @@ def forward(self):
                 labels_start_i = labels_offset
                 labels_end_i = labels_offset + self.labels_lod[self.level][i]
 
-                softmax_a_sequence = self.softmax[softmax_start_i:
-                                                  softmax_end_i, :]
+                softmax_a_sequence = self.softmax[
+                    softmax_start_i:softmax_end_i, :]
                 labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
                 self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
                                                        labels_a_sequence)
@@ -191,6 +192,7 @@ def forward(self):
 
 
 class TestWarpCTCOp(OpTest):
+
     def config(self):
         self.batch_size = 4
         self.num_classes = 12
@@ -208,10 +210,10 @@ def setUp(self):
             [sum(self.logits_lod[0]), self.num_classes]).astype("float32")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_lod[0]), 1],
-            dtype="int32")
+        labels = np.random.randint(0,
+                                   self.num_classes - 1,
+                                   [sum(self.labels_lod[0]), 1],
+                                   dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
                          self.num_classes, self.batch_size, self.blank,
@@ -242,20 +244,19 @@ def test_check_output(self):
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
         if core.is_compiled_with_rocm():
-            self.check_grad(
-                ["Logits"],
-                "Loss",
-                max_relative_error=0.009,
-                check_dygraph=False)
+            self.check_grad(["Logits"],
+                            "Loss",
+                            max_relative_error=0.009,
+                            check_dygraph=False)
         else:
-            self.check_grad(
-                ["Logits"],
-                "Loss",
-                max_relative_error=0.007,
-                check_dygraph=False)
+            self.check_grad(["Logits"],
+                            "Loss",
+                            max_relative_error=0.007,
+                            check_dygraph=False)
 
 
 class TestWarpCTCOpCase1(TestWarpCTCOp):
+
     def config(self):
         self.batch_size = 4
         self.num_classes = CUDA_BLOCK_SIZE + 2
@@ -266,6 +267,7 @@ def config(self):
 
 
 class TestWarpCTCOpWithPadding(OpTest):
+
     def config(self):
         self.batch_size = 4
         self.num_classes = 8
@@ -285,10 +287,10 @@ def setUp(self):
             [sum(self.logits_length), self.num_classes]).astype("float32")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_length), 1],
-            dtype="int32")
+        labels = np.random.randint(0,
+                                   self.num_classes - 1,
+                                   [sum(self.labels_length), 1],
+                                   dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
                          self.num_classes, self.batch_size, self.blank,
@@ -316,8 +318,8 @@ def setUp(self):
         for i in range(self.batch_size):
             max_target_seq_length = max(max_target_seq_length,
                                         self.labels_length[i])
-        new_labels = np.zeros(
-            [self.batch_size, max_target_seq_length], dtype="int32")
+        new_labels = np.zeros([self.batch_size, max_target_seq_length],
+                              dtype="int32")
 
         cur = 0
         for batch_id in range(self.batch_size):
@@ -347,20 +349,19 @@ def test_check_output(self):
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
         if core.is_compiled_with_rocm():
-            self.check_grad(
-                ["Logits"],
-                "Loss",
-                max_relative_error=0.009,
-                check_dygraph=False)
+            self.check_grad(["Logits"],
+                            "Loss",
+                            max_relative_error=0.009,
+                            check_dygraph=False)
         else:
-            self.check_grad(
-                ["Logits"],
-                "Loss",
-                max_relative_error=0.007,
-                check_dygraph=False)
+            self.check_grad(["Logits"],
+                            "Loss",
+                            max_relative_error=0.007,
+                            check_dygraph=False)
 
 
 class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
+
     def config(self):
         self.batch_size = 4
         self.num_classes = CUDA_BLOCK_SIZE + 2
@@ -373,6 +374,7 @@ def config(self):
 
 
 class TestWarpCTCOpFp64(OpTest):
+
     def config(self):
         self.batch_size = 4
         self.num_classes = 8
@@ -392,10 +394,10 @@ def setUp(self):
             [sum(self.logits_length), self.num_classes]).astype("float64")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_length), 1],
-            dtype="int32")
+        labels = np.random.randint(0,
+                                   self.num_classes - 1,
+                                   [sum(self.labels_length), 1],
+                                   dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
                          self.num_classes, self.batch_size, self.blank,
@@ -423,8 +425,8 @@ def setUp(self):
         for i in range(self.batch_size):
             max_target_seq_length = max(max_target_seq_length,
                                         self.labels_length[i])
-        new_labels = np.zeros(
-            [self.batch_size, max_target_seq_length], dtype="int32")
+        new_labels = np.zeros([self.batch_size, max_target_seq_length],
+                              dtype="int32")
 
         cur = 0
         for batch_id in range(self.batch_size):
@@ -457,57 +459,58 @@ def test_check_grad(self):
 
 
 class TestWarpCTCOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
-            logits = fluid.data(
-                name='logits', shape=[5, 16, 6], dtype='float32')
-            logits_length = fluid.data(
-                name='logits_length', shape=[None], dtype='int64')
+            logits = fluid.data(name='logits',
+                                shape=[5, 16, 6],
+                                dtype='float32')
+            logits_length = fluid.data(name='logits_length',
+                                       shape=[None],
+                                       dtype='int64')
             label = fluid.data(name='label', shape=[16, 3], dtype='int32')
-            label_length = fluid.data(
-                name='labels_length', shape=[None], dtype='int64')
+            label_length = fluid.data(name='labels_length',
+                                      shape=[None],
+                                      dtype='int64')
 
             def test_logits_Variable():
                 logits_data = np.random.rand(5, 16, 6).astype(logits.dtype)
-                fluid.layers.warpctc(
-                    input=logits_data,
-                    label=label,
-                    input_length=logits_length,
-                    label_length=label_length)
+                fluid.layers.warpctc(input=logits_data,
+                                     label=label,
+                                     input_length=logits_length,
+                                     label_length=label_length)
 
             self.assertRaises(TypeError, test_logits_Variable)
 
             def test_label_Variable():
                 label_data = np.random.randint(0, 5, [5, 1]).astype("int32")
-                fluid.layers.warpctc(
-                    input=logits,
-                    label=label_data,
-                    input_length=logits_length,
-                    label_length=label_length)
+                fluid.layers.warpctc(input=logits,
+                                     label=label_data,
+                                     input_length=logits_length,
+                                     label_length=label_length)
 
             self.assertRaises(TypeError, test_label_Variable)
 
             def test_logits_len_Variable():
                 logits_length_data = np.array([5] * 16).astype("int64")
-                fluid.layers.warpctc(
-                    input=logits,
-                    label=label,
-                    input_length=logits_length_data,
-                    label_length=label_length)
+                fluid.layers.warpctc(input=logits,
+                                     label=label,
+                                     input_length=logits_length_data,
+                                     label_length=label_length)
 
             self.assertRaises(TypeError, test_logits_len_Variable)
 
             def test_label_len_Variable():
                 label_length_data = np.array([3] * 16).astype("int64")
-                fluid.layers.warpctc(
-                    input=logits,
-                    label=label,
-                    input_length=logits_length,
-                    label_length=label_length_data)
+                fluid.layers.warpctc(input=logits,
+                                     label=label,
+                                     input_length=logits_length,
+                                     label_length=label_length_data)
 
             self.assertRaises(TypeError, test_label_len_Variable)
 
     def test_dygraph_errors(self):
+
         def test_dygraph_with_lod():
 
             logits = np.random.uniform(0.1, 1.0, [20, 15]).astype("float32")
@@ -524,6 +527,7 @@ def test_dygraph_with_lod():
 
 
 class TestCTCLossAPICase(unittest.TestCase):
+
     def test_functinal_api(self):
         self.batch_size = 4
         self.num_classes = CUDA_BLOCK_SIZE + 2
@@ -532,14 +536,16 @@ def test_functinal_api(self):
         self.blank = self.num_classes - 1
         self.norm_by_times = False
 
-        logits = np.random.uniform(0.1, 1.0, [
-            max(self.logits_length), self.batch_size, self.num_classes
-        ]).astype("float32")
+        logits = np.random.uniform(
+            0.1, 1.0,
+            [max(self.logits_length), self.batch_size, self.num_classes
+             ]).astype("float32")
         softmax = np.apply_along_axis(stable_softmax, -1, logits)
         # labels should not be blank
         labels = np.random.randint(
             0,
-            self.num_classes - 1, [self.batch_size, max(self.labels_length)],
+            self.num_classes - 1,
+            [self.batch_size, max(self.labels_length)],
             dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_length, labels,
@@ -552,22 +558,20 @@ def test_functinal_api(self):
         labels = paddle.to_tensor(labels)
         logits_length = paddle.to_tensor(self.logits_length)
         labels_length = paddle.to_tensor(self.labels_length)
-        loss_pd_mean = F.ctc_loss(
-            softmax,
-            labels,
-            logits_length,
-            labels_length,
-            blank=self.blank,
-            reduction='mean')
+        loss_pd_mean = F.ctc_loss(softmax,
+                                  labels,
+                                  logits_length,
+                                  labels_length,
+                                  blank=self.blank,
+                                  reduction='mean')
         loss_pd_mean = loss_pd_mean.numpy()
 
-        loss_pd_sum = F.ctc_loss(
-            softmax,
-            labels,
-            logits_length,
-            labels_length,
-            blank=self.blank,
-            reduction='sum')
+        loss_pd_sum = F.ctc_loss(softmax,
+                                 labels,
+                                 logits_length,
+                                 labels_length,
+                                 blank=self.blank,
+                                 reduction='sum')
         loss_pd_sum = loss_pd_sum.numpy()
         paddle.enable_static()
         loss_np = np.squeeze(loss_np, axis=-1)
@@ -585,14 +589,16 @@ def test_class_api(self):
         self.blank = 0
         self.norm_by_times = False
 
-        logits = np.random.uniform(0.1, 1.0, [
-            max(self.logits_length), self.batch_size, self.num_classes
-        ]).astype("float32")
+        logits = np.random.uniform(
+            0.1, 1.0,
+            [max(self.logits_length), self.batch_size, self.num_classes
+             ]).astype("float32")
         softmax = np.apply_along_axis(stable_softmax, -1, logits)
         # labels should not be blank
         labels = np.random.randint(
             1,
-            self.num_classes, [self.batch_size, max(self.labels_length)],
+            self.num_classes,
+            [self.batch_size, max(self.labels_length)],
             dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_length, labels,
@@ -606,8 +612,9 @@ def test_class_api(self):
         logits_length = paddle.to_tensor(self.logits_length)
         labels_length = paddle.to_tensor(self.labels_length)
 
-        loss_pd = paddle.nn.CTCLoss(self.blank, 'none')(
-            softmax, labels, logits_length, labels_length)
+        loss_pd = paddle.nn.CTCLoss(self.blank,
+                                    'none')(softmax, labels, logits_length,
+                                            labels_length)
         loss_pd = loss_pd.numpy()
         paddle.enable_static()
         loss_np = np.squeeze(loss_np, axis=-1)
diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py
index 2a2ad0f6d03bb..b42bfb1a684ac 100644
--- a/python/paddle/fluid/tests/unittests/test_weight_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py
@@ -54,8 +54,9 @@ def bow_net(data,
     This model is from https://github.com/PaddlePaddle/models:
     fluid/PaddleNLP/text_classification/nets.py
     """
-    emb = fluid.layers.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    emb = fluid.layers.embedding(input=data,
+                                 is_sparse=is_sparse,
+                                 size=[dict_dim, emb_dim])
     bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
     bow_tanh = fluid.layers.tanh(bow)
     fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
@@ -68,10 +69,11 @@ def bow_net(data,
 
 
 class TestWeightDecay(unittest.TestCase):
+
     def setUp(self):
         self.word_dict = paddle.dataset.imdb.word_dict()
-        reader = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict), batch_size=4)()
+        reader = paddle.batch(paddle.dataset.imdb.train(self.word_dict),
+                              batch_size=4)()
         self.train_data = [next(reader) for _ in range(5)]
         self.learning_rate = .5
 
@@ -111,11 +113,11 @@ def run_parallel_exe(self,
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
         build_strategy.memory_optimize = use_ir_memory_optimize
 
-        train_cp = compiler.CompiledProgram(fluid.default_main_program(
-        )).with_data_parallel(
-            loss_name=loss.name,
-            exec_strategy=exec_strategy,
-            build_strategy=build_strategy)
+        train_cp = compiler.CompiledProgram(
+            fluid.default_main_program()).with_data_parallel(
+                loss_name=loss.name,
+                exec_strategy=exec_strategy,
+                build_strategy=build_strategy)
 
         loss_set = []
         for data in self.train_data:
@@ -135,8 +137,10 @@ def check_weight_decay(self,
         startup_prog = fluid.framework.Program()
         startup_prog.random_seed = 1
         with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
-            data = fluid.layers.data(
-                name="words", shape=[1], dtype="int64", lod_level=1)
+            data = fluid.layers.data(name="words",
+                                     shape=[1],
+                                     dtype="int64",
+                                     lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
             avg_cost = model(data, label, len(self.word_dict))
 
@@ -148,13 +152,14 @@ def check_weight_decay(self,
             optimizer.minimize(avg_cost)
 
             for params in param_list:
-                updated_p = fluid.layers.elementwise_sub(
-                    x=params[0], y=params[1])
+                updated_p = fluid.layers.elementwise_sub(x=params[0],
+                                                         y=params[1])
                 fluid.layers.assign(input=updated_p, output=params[0])
 
             if use_parallel_exe:
-                loss = self.run_parallel_exe(
-                    place, [data, label], loss=avg_cost, use_reduce=use_reduce)
+                loss = self.run_parallel_exe(place, [data, label],
+                                             loss=avg_cost,
+                                             use_reduce=use_reduce)
             else:
                 loss = self.run_executor(place, [data, label], loss=avg_cost)
 
@@ -166,15 +171,16 @@ def test_weight_decay(self):
             loss = self.check_weight_decay(place, model, use_parallel_exe=False)
 
             # TODO(zcd): should test use_reduce=True
-            loss2 = self.check_weight_decay(
-                place, model, use_parallel_exe=True, use_reduce=False)
+            loss2 = self.check_weight_decay(place,
+                                            model,
+                                            use_parallel_exe=True,
+                                            use_reduce=False)
 
             for i in range(len(loss)):
                 self.assertTrue(
-                    np.isclose(
-                        a=loss[i], b=loss2[i], rtol=5e-5),
-                    "Expect " + str(loss[i]) + "\n" + "But Got" + str(loss2[i])
-                    + " in class " + self.__class__.__name__)
+                    np.isclose(a=loss[i], b=loss2[i], rtol=5e-5),
+                    "Expect " + str(loss[i]) + "\n" + "But Got" +
+                    str(loss2[i]) + " in class " + self.__class__.__name__)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
index e990d8b2498f6..95cfe40084fc2 100644
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
@@ -34,8 +34,8 @@ def setUpClass(cls):
 
     @classmethod
     def set_program(cls):
-        data = fluid.layers.data(
-            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
+        data = fluid.layers.data(name=cls.data_desc[0][0],
+                                 shape=cls.data_desc[0][1])
         out = fluid.layers.fc(input=data,
                               size=cls.hidden_size,
                               param_attr=WeightNormParamAttr(
@@ -82,8 +82,8 @@ def set_data(self):
                     if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[sum(data_lod[-1]) if data_lod else self.batch_size
-                      ] + data_shape).astype('float32')
+                size=[sum(data_lod[-1]) if data_lod else self.batch_size] +
+                data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
     def set_inputs(self, place):
@@ -96,14 +96,15 @@ def set_inputs(self, place):
             self.inputs[desc[0]] = tensor
 
     def weight_normalize(self):
-        v = numpy.ones((self.data[self.data_desc[0][0]][0].shape[-1],
-                        self.hidden_size))
+        v = numpy.ones(
+            (self.data[self.data_desc[0][0]][0].shape[-1], self.hidden_size))
         g = numpy.linalg.norm(v, axis=None, keepdims=True)
         w = g * v / numpy.linalg.norm(v, axis=None, keepdims=True)
         x = self.data[self.data_desc[0][0]][0]
         out = numpy.dot(x, w)
-        g_grad = (numpy.dot(x.T, numpy.ones_like(out)) * (v / numpy.linalg.norm(
-            v, axis=None, keepdims=True))).sum(axis=None, keepdims=True)
+        g_grad = (numpy.dot(x.T, numpy.ones_like(out)) *
+                  (v / numpy.linalg.norm(v, axis=None, keepdims=True))).sum(
+                      axis=None, keepdims=True)
         return g, v, g_grad
 
     def test_weight_normalization(self):
@@ -113,8 +114,7 @@ def test_weight_normalization(self):
         for actual_output in self.actual_outputs:
             [
                 self.assertTrue(
-                    numpy.allclose(
-                        numpy.array(actual), expect, atol=0.001))
+                    numpy.allclose(numpy.array(actual), expect, atol=0.001))
                 for expect, actual in zip(expect_output, actual_output)
             ]
 
diff --git a/python/paddle/fluid/tests/unittests/test_where_index.py b/python/paddle/fluid/tests/unittests/test_where_index.py
index 1c5705023b87a..250bd3fa61f4f 100644
--- a/python/paddle/fluid/tests/unittests/test_where_index.py
+++ b/python/paddle/fluid/tests/unittests/test_where_index.py
@@ -24,6 +24,7 @@
 
 
 class TestWhereIndexOp(OpTest):
+
     def setUp(self):
         self.op_type = "where_index"
         self.init_config()
@@ -32,12 +33,15 @@ def test_check_output(self):
         self.check_output()
 
     def init_config(self):
-        self.inputs = {'Condition': np.array([True, False, True]), }
+        self.inputs = {
+            'Condition': np.array([True, False, True]),
+        }
 
         self.outputs = {'Out': np.array([[0], [2]], dtype='int64')}
 
 
 class TestAllFalse(unittest.TestCase):
+
     def setUp(self):
         self.op_type = "where_index"
         self.init_config()
@@ -69,28 +73,34 @@ def test_all_false(self):
 
 
 class TestRank2(TestWhereIndexOp):
+
     def init_config(self):
-        self.inputs = {'Condition': np.array([[True, False], [False, True]]), }
+        self.inputs = {
+            'Condition': np.array([[True, False], [False, True]]),
+        }
 
         self.outputs = {'Out': np.array([[0, 0], [1, 1]], dtype='int64')}
 
 
 class TestRank3(TestWhereIndexOp):
+
     def init_config(self):
         self.inputs = {
-            'Condition': np.array([[[True, False], [False, True]],
-                                   [[False, True], [True, False]],
-                                   [[False, False], [False, True]]]),
+            'Condition':
+            np.array([[[True, False], [False, True]],
+                      [[False, True], [True, False]],
+                      [[False, False], [False, True]]]),
         }
 
         self.outputs = {
-            'Out': np.array(
-                [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]],
-                dtype='int64')
+            'Out':
+            np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]],
+                     dtype='int64')
         }
 
 
 class TestWhereOpError(unittest.TestCase):
+
     def test_api(self):
         with program_guard(Program(), Program()):
             cond = fluid.layers.data(name='cond', shape=[4], dtype='bool')
@@ -103,7 +113,9 @@ def test_api(self):
 
 
 class TestWhereRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.where([10])
 
diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py
index 36819e089edbf..51cb380be8438 100644
--- a/python/paddle/fluid/tests/unittests/test_where_op.py
+++ b/python/paddle/fluid/tests/unittests/test_where_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,6 +27,7 @@
 
 
 class TestWhereOp(OpTest):
+
     def setUp(self):
         self.op_type = 'where'
         self.python_api = paddle.where
@@ -47,6 +48,7 @@ def init_config(self):
 
 
 class TestWhereOp2(TestWhereOp):
+
     def init_config(self):
         self.x = np.random.uniform((-5), 5, (60, 2)).astype('float64')
         self.y = np.random.uniform((-5), 5, (60, 2)).astype('float64')
@@ -54,6 +56,7 @@ def init_config(self):
 
 
 class TestWhereOp3(TestWhereOp):
+
     def init_config(self):
         self.x = np.random.uniform((-3), 5, (20, 2, 4)).astype('float64')
         self.y = np.random.uniform((-3), 5, (20, 2, 4)).astype('float64')
@@ -61,6 +64,7 @@ def init_config(self):
 
 
 class TestWhereAPI(unittest.TestCase):
+
     def setUp(self):
         self.init_data()
 
@@ -81,19 +85,22 @@ def test_api(self, use_cuda=False):
         for x_stop_gradient in [False, True]:
             for y_stop_gradient in [False, True]:
                 with fluid.program_guard(Program(), Program()):
-                    cond = fluid.layers.data(
-                        name='cond', shape=self.shape, dtype='bool')
-                    x = fluid.layers.data(
-                        name='x', shape=self.shape, dtype='float32')
-                    y = fluid.layers.data(
-                        name='y', shape=self.shape, dtype='float32')
+                    cond = fluid.layers.data(name='cond',
+                                             shape=self.shape,
+                                             dtype='bool')
+                    x = fluid.layers.data(name='x',
+                                          shape=self.shape,
+                                          dtype='float32')
+                    y = fluid.layers.data(name='y',
+                                          shape=self.shape,
+                                          dtype='float32')
                     x.stop_gradient = x_stop_gradient
                     y.stop_gradient = y_stop_gradient
                     result = paddle.where(cond, x, y)
                     append_backward(layers.mean(result))
                     for use_cuda in [False, True]:
-                        if (use_cuda and
-                            (not fluid.core.is_compiled_with_cuda())):
+                        if (use_cuda
+                                and (not fluid.core.is_compiled_with_cuda())):
                             break
                         place = (fluid.CUDAPlace(0)
                                  if use_cuda else fluid.CPUPlace())
@@ -103,12 +110,13 @@ def test_api(self, use_cuda=False):
                             fetch_list.append(x.grad_name)
                         if (y_stop_gradient is False):
                             fetch_list.append(y.grad_name)
-                        out = exe.run(
-                            fluid.default_main_program(),
-                            feed={'cond': self.cond,
-                                  'x': self.x,
-                                  'y': self.y},
-                            fetch_list=fetch_list)
+                        out = exe.run(fluid.default_main_program(),
+                                      feed={
+                                          'cond': self.cond,
+                                          'x': self.x,
+                                          'y': self.y
+                                      },
+                                      fetch_list=fetch_list)
                         assert np.array_equal(out[0], self.out)
                         if (x_stop_gradient is False):
                             assert np.array_equal(out[2],
@@ -126,8 +134,8 @@ def test_api_broadcast(self, use_cuda=False):
             x = fluid.layers.data(name='x', shape=[4, 1], dtype='float32')
             y = fluid.layers.data(name='y', shape=[4, 2], dtype='float32')
             x_i = np.array([[0.9383, 0.1983, 3.2, 1.2]]).astype('float32')
-            y_i = np.array(
-                [[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0]]).astype('float32')
+            y_i = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0,
+                                                   1.0]]).astype('float32')
             result = paddle.where((x > 1), x=x, y=y)
             for use_cuda in [False, True]:
                 if (use_cuda and (not fluid.core.is_compiled_with_cuda())):
@@ -135,8 +143,10 @@ def test_api_broadcast(self, use_cuda=False):
                 place = (fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace())
                 exe = fluid.Executor(place)
                 out = exe.run(fluid.default_main_program(),
-                              feed={'x': x_i,
-                                    'y': y_i},
+                              feed={
+                                  'x': x_i,
+                                  'y': y_i
+                              },
                               fetch_list=[result])
                 assert np.array_equal(out[0], np.where((x_i > 1), x_i, y_i))
 
@@ -145,8 +155,9 @@ def test_scalar(self):
         main_program = Program()
         with fluid.program_guard(main_program):
             cond_shape = [2, 4]
-            cond = fluid.layers.data(
-                name='cond', shape=cond_shape, dtype='bool')
+            cond = fluid.layers.data(name='cond',
+                                     shape=cond_shape,
+                                     dtype='bool')
             x_data = 1.0
             y_data = 2.0
             cond_data = np.array([False, False, True, True]).astype('bool')
@@ -166,8 +177,9 @@ def __test_where_with_broadcast_static(self, cond_shape, x_shape, y_shape):
         paddle.enable_static()
         main_program = Program()
         with fluid.program_guard(main_program):
-            cond = fluid.layers.data(
-                name='cond', shape=cond_shape, dtype='bool')
+            cond = fluid.layers.data(name='cond',
+                                     shape=cond_shape,
+                                     dtype='bool')
             x = fluid.layers.data(name='x', shape=x_shape, dtype='float32')
             y = fluid.layers.data(name='y', shape=y_shape, dtype='float32')
             cond_data_tmp = np.random.random(size=cond_shape).astype('float32')
@@ -180,12 +192,13 @@ def __test_where_with_broadcast_static(self, cond_shape, x_shape, y_shape):
                     return
                 place = (fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace())
                 exe = fluid.Executor(place)
-                out = exe.run(
-                    fluid.default_main_program(),
-                    feed={'cond': cond_data,
-                          'x': x_data,
-                          'y': y_data},
-                    fetch_list=[result])
+                out = exe.run(fluid.default_main_program(),
+                              feed={
+                                  'cond': cond_data,
+                                  'x': x_data,
+                                  'y': y_data
+                              },
+                              fetch_list=[result])
                 expect = np.where(cond_data, x_data, y_data)
                 assert np.array_equal(out[0], expect)
 
@@ -239,6 +252,7 @@ def test_static_api_broadcast_8(self):
 
 
 class TestWhereDygraphAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard():
             x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype('float64')
@@ -360,6 +374,7 @@ def test_eager(self):
 
 
 class TestWhereOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype('float64')
diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
index 83ca577faa5c6..baf111df6335a 100644
--- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
@@ -29,7 +29,9 @@
 
 
 class TestApiWhileLoop(unittest.TestCase):
+
     def test_var_tuple(self):
+
         def cond(i):
             return layers.less_than(i, ten)
 
@@ -44,14 +46,15 @@ def body(i):
             ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
             out = layers.while_loop(cond, body, (i, ))
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         res = exe.run(main_program, fetch_list=out)
         self.assertTrue(
             np.allclose(np.asarray(res[0]), np.full((1), 10, np.int64)))
 
     def test_var_list(self):
+
         def cond(i, mem):
             return layers.less_than(i, ten)
 
@@ -72,8 +75,8 @@ def body(i, mem):
             data = np.random.rand(10).astype('float32')
             data_one = np.ones(10).astype('float32')
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         res = exe.run(main_program, feed={'mem': data}, fetch_list=out)
         for i in range(10):
@@ -81,6 +84,7 @@ def body(i, mem):
         self.assertTrue(np.allclose(np.asarray(res[1]), data))
 
     def test_var_dict(self):
+
         def cond(i, ten, test_dict, test_list, test_list_dict):
             return layers.less_than(i, ten)
 
@@ -91,8 +95,8 @@ def body(i, ten, test_dict, test_list, test_list_dict):
             test_list[0] = fluid.layers.reshape(test_list[0], [2, -1]) + 1
 
             test_list_dict[0]["test_key"] += 1
-            test_list_dict[0]["test_key"] = fluid.layers.relu(test_list_dict[0][
-                "test_key"])
+            test_list_dict[0]["test_key"] = fluid.layers.relu(
+                test_list_dict[0]["test_key"])
 
             i = layers.increment(i)
             return [i, ten, test_dict, test_list, test_list_dict]
@@ -106,18 +110,17 @@ def body(i, ten, test_dict, test_list, test_list_dict):
 
             test_dict = {"test_key": test_data}
             test_list = [
-                layers.fill_constant(
-                    shape=[1, 2], dtype='int64', value=0)
+                layers.fill_constant(shape=[1, 2], dtype='int64', value=0)
             ]
             test_list_dict = [{
-                "test_key": layers.fill_constant(
-                    shape=[1], dtype='float32', value=0)
+                "test_key":
+                layers.fill_constant(shape=[1], dtype='float32', value=0)
             }]
 
             i, ten, test_dict, test_list, test_list_dict = layers.while_loop(
                 cond, body, [i, ten, test_dict, test_list, test_list_dict])
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         res = exe.run(main_program,
                       fetch_list=[
@@ -125,28 +128,25 @@ def body(i, ten, test_dict, test_list, test_list_dict):
                           test_list_dict[0]["test_key"]
                       ])
         self.assertTrue(
-            np.allclose(
-                np.asarray(res[0]),
-                np.full(
-                    shape=(1), fill_value=10, dtype=np.int64)))
+            np.allclose(np.asarray(res[0]),
+                        np.full(shape=(1), fill_value=10, dtype=np.int64)))
         self.assertTrue(
-            np.allclose(
-                np.asarray(res[1]),
-                np.full(
-                    shape=(2, 1), fill_value=10, dtype=np.int64)))
+            np.allclose(np.asarray(res[1]),
+                        np.full(shape=(2, 1), fill_value=10, dtype=np.int64)))
         self.assertTrue(
-            np.allclose(
-                np.asarray(res[2]),
-                np.full(
-                    shape=(1), fill_value=10, dtype=np.float32)))
+            np.allclose(np.asarray(res[2]),
+                        np.full(shape=(1), fill_value=10, dtype=np.float32)))
 
 
 class TestApiWhileLoop_Nested(unittest.TestCase):
+
     def test_nested_net(self):
+
         def external_cond(i, j, init, sums):
             return layers.less_than(i, loop_len1)
 
         def external_body(i, j, init, sums):
+
             def internal_cond(j, init, sums):
                 return layers.less_than(j, loop_len2)
 
@@ -182,12 +182,14 @@ def internal_body(j, init, sums):
             data = np.random.rand(3, 3).astype('float32')
             data_sums = np.zeros([3, 3]).astype('float32')
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         res = exe.run(main_program,
-                      feed={'init': data,
-                            'sums': data_sums},
+                      feed={
+                          'init': data,
+                          'sums': data_sums
+                      },
                       fetch_list=out)
         for i in range(3):
             data = np.add(data, 1)
@@ -198,7 +200,9 @@ def internal_body(j, init, sums):
 
 
 class TestApiWhileLoop_Backward(unittest.TestCase):
+
     def test_while_loop_backward(self):
+
         def cond(i, x):
             return layers.less_than(i, eleven)
 
@@ -221,8 +225,8 @@ def body(i, x):
             mean = layers.mean(out[1])
             append_backward(mean)
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         feed_i = np.ones(1).astype('float32')
@@ -231,15 +235,18 @@ def body(i, x):
         i_grad = np.asarray([110]).astype('float32')
 
         res = exe.run(main_program,
-                      feed={'i': feed_i,
-                            'x': feed_x},
+                      feed={
+                          'i': feed_i,
+                          'x': feed_x
+                      },
                       fetch_list=[mean.name, i.grad_name])
         self.assertTrue(np.allclose(np.asarray(res[0]), data))
-        self.assertTrue(
-            np.allclose(np.asarray(res[1]), i_grad),
-            msg=" \nres = \n{} \n\n ans = \n{}".format(res[1], i_grad))
+        self.assertTrue(np.allclose(np.asarray(res[1]), i_grad),
+                        msg=" \nres = \n{} \n\n ans = \n{}".format(
+                            res[1], i_grad))
 
     def test_while_loop_backward2(self):
+
         def cond(i, x):
             return i < 3
 
@@ -260,8 +267,8 @@ def body(i, x):
             mean = layers.mean(out[1])
             append_backward(mean)
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         feed_i = np.ones(1).astype('float32')
@@ -271,24 +278,29 @@ def body(i, x):
         x_grad = np.asarray([2]).astype('float32')
 
         res = exe.run(main_program,
-                      feed={'i': feed_i,
-                            'x': feed_x},
+                      feed={
+                          'i': feed_i,
+                          'x': feed_x
+                      },
                       fetch_list=[mean.name, i.grad_name, x.grad_name])
         self.assertTrue(np.allclose(np.asarray(res[0]), data))
-        self.assertTrue(
-            np.allclose(np.asarray(res[1]), i_grad),
-            msg=" \nres = \n{} \n\n ans = \n{}".format(res[1], i_grad))
-        self.assertTrue(
-            np.allclose(np.asarray(res[2]), x_grad),
-            msg=" \nres = \n{} \n\n ans = \n{}".format(res[2], x_grad))
+        self.assertTrue(np.allclose(np.asarray(res[1]), i_grad),
+                        msg=" \nres = \n{} \n\n ans = \n{}".format(
+                            res[1], i_grad))
+        self.assertTrue(np.allclose(np.asarray(res[2]), x_grad),
+                        msg=" \nres = \n{} \n\n ans = \n{}".format(
+                            res[2], x_grad))
 
 
 class TestApiWhileLoop_NestedWithBackwardAndLoDTensorArray(unittest.TestCase):
+
     def test_nested_net_with_backward_and_lodtensor(self):
+
         def external_cond(i, j, x, mem_array):
             return layers.less_than(i, array_len)
 
         def external_body(i, j, x, mem_array):
+
             def internal_cond(j, x, mem_array):
                 return layers.less_than(j, array_len2)
 
@@ -342,8 +354,8 @@ def internal_body(j, x, mem_array):
             mean = layers.mean(sum_result)
             append_backward(mean)
 
-            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
             exe = fluid.Executor(place)
 
             d = []
@@ -352,23 +364,27 @@ def internal_body(j, x, mem_array):
             feed_x = np.ones(10).astype('float32')
             data_sum = d[0] + d[1] + d[2] + 3 * feed_x
             x_grad = [0.3] * 10
-            res = exe.run(
-                main_program,
-                feed={'d0': d[0],
-                      'd1': d[1],
-                      'd2': d[2],
-                      'x': feed_x},
-                fetch_list=[sum_result.name, x.grad_name])
+            res = exe.run(main_program,
+                          feed={
+                              'd0': d[0],
+                              'd1': d[1],
+                              'd2': d[2],
+                              'x': feed_x
+                          },
+                          fetch_list=[sum_result.name, x.grad_name])
             self.assertTrue(np.allclose(res[0], data_sum))
             self.assertTrue(np.allclose(res[1], x_grad))
 
 
 class TestApiWhileLoopWithSwitchCase(unittest.TestCase):
+
     def test_with_switch_case(self):
+
         def cond(i):
             return layers.less_than(i, ten)
 
         def body(i):
+
             def fn_add_three():
                 data_add_three = layers.elementwise_add(x=i, y=three)
                 return data_add_three
@@ -381,11 +397,12 @@ def fn_add_one():
                 data_add_one = layers.elementwise_add(x=i, y=one)
                 return data_add_one
 
-            return layers.switch_case(
-                branch_index=i,
-                branch_fns={2: fn_add_three,
-                            5: fn_square},
-                default=fn_add_one)
+            return layers.switch_case(branch_index=i,
+                                      branch_fns={
+                                          2: fn_add_three,
+                                          5: fn_square
+                                      },
+                                      default=fn_add_one)
 
         main_program = Program()
         startup_program = Program()
@@ -396,8 +413,8 @@ def fn_add_one():
             one = layers.fill_constant(shape=[1], dtype='int64', value=1)
             out = layers.while_loop(cond, body, [i])
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
         res = exe.run(main_program, fetch_list=out)
 
@@ -406,7 +423,9 @@ def fn_add_one():
 
 
 class TestApiWhileLoop_Error(unittest.TestCase):
+
     def test_error(self):
+
         def cond_returns_constant(i):
             return 1
 
@@ -436,8 +455,9 @@ def cond_returns_with_mutable_dict(i, test_dict):
             return i > 0
 
         def body_returns_with_mutable_dict(i, test_dict):
-            test_dict['new_key'] = layers.fill_constant(
-                shape=[1], dtype='int64', value=1)
+            test_dict['new_key'] = layers.fill_constant(shape=[1],
+                                                        dtype='int64',
+                                                        value=1)
             return layers.increment(i), test_dict
 
         def cond_returns_with_mutable_list(i, test_list):
@@ -445,8 +465,7 @@ def cond_returns_with_mutable_list(i, test_list):
 
         def body_returns_with_mutable_list(i, test_list):
             test_list.append(
-                layers.fill_constant(
-                    shape=[1], dtype='int64', value=1))
+                layers.fill_constant(shape=[1], dtype='int64', value=1))
             return layers.increment(i), test_list
 
         main_program = Program()
@@ -519,8 +538,8 @@ def value_error_body_returns_error_type():
             # The length of `output_vars` with mutable value should keep same with `loop_vars`
             def value_error_body_returns_with_mutable_dict():
                 test_dict = {
-                    "int_constant": layers.fill_constant(
-                        shape=[2, 2], dtype='int64', value=1)
+                    "int_constant":
+                    layers.fill_constant(shape=[2, 2], dtype='int64', value=1)
                 }
                 out = layers.while_loop(cond_returns_with_mutable_dict,
                                         body_returns_with_mutable_dict,
@@ -531,8 +550,7 @@ def value_error_body_returns_with_mutable_dict():
 
             def value_error_body_returns_with_mutable_list():
                 test_list = [
-                    layers.fill_constant(
-                        shape=[2, 2], dtype='int64', value=1)
+                    layers.fill_constant(shape=[2, 2], dtype='int64', value=1)
                 ]
                 out = layers.while_loop(cond_returns_with_mutable_list,
                                         body_returns_with_mutable_list,
@@ -543,7 +561,9 @@ def value_error_body_returns_with_mutable_list():
 
 
 class TestApiWhileLoopSliceInBody(unittest.TestCase):
+
     def test_var_slice(self):
+
         def cond(z, i):
             return i + 1 <= x_shape[0]
 
@@ -561,8 +581,8 @@ def body(z, i):
             i = fluid.layers.fill_constant([1], 'int32', 0)
             z, _ = fluid.layers.while_loop(cond, body, [z, i])
 
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         np_x = np.array([1, 2, 3, 4, 5], dtype='int32')
diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py
index 8af9a39634fdb..dee83692bd324 100644
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
@@ -28,13 +28,20 @@
 
 
 class TestWhileOp(unittest.TestCase):
+
     def simple_net(self):
-        d0 = layers.data(
-            "d0", shape=[10], append_batch_size=False, dtype='float32')
-        d1 = layers.data(
-            "d1", shape=[10], append_batch_size=False, dtype='float32')
-        d2 = layers.data(
-            "d2", shape=[10], append_batch_size=False, dtype='float32')
+        d0 = layers.data("d0",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
+        d1 = layers.data("d1",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
+        d2 = layers.data("d2",
+                         shape=[10],
+                         append_batch_size=False,
+                         dtype='float32')
         i = layers.zeros(shape=[1], dtype='int64')
         i.stop_gradient = True
         init = layers.zeros(shape=[10], dtype='float32')
@@ -92,9 +99,11 @@ def test_simple_net(self):
             for i in range(3):
                 d.append(numpy.random.random(size=[10]).astype('float32'))
 
-            outs = exe.run(feed={'d0': d[0],
-                                 'd1': d[1],
-                                 'd2': d[2]},
+            outs = exe.run(feed={
+                'd0': d[0],
+                'd1': d[1],
+                'd2': d[2]
+            },
                            fetch_list=[sum_result])
             self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
 
@@ -127,6 +136,7 @@ def test_exceptions(self):
 
 
 class BadInputTest(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
@@ -138,7 +148,9 @@ def test_bad_x():
 
 
 class TestIgnoreVarNameInWhile(unittest.TestCase):
+
     def test_ignore_var(self):
+
         def cond(i, ten, temp, y):
             return i < ten
 
@@ -169,8 +181,10 @@ def body_func(i, ten, batch_info, origin_seq):
         input_y = input_y.reshape(3, 1, 1)
 
         res, = exe.run(fluid.default_main_program(),
-                       feed={'x': input_x,
-                             'y': input_y},
+                       feed={
+                           'x': input_x,
+                           'y': input_y
+                       },
                        fetch_list=[output])
 
         self.assertListEqual(list(res.shape), [3, 1, 5])
diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
index 19dcb49cd957c..139c671947b24 100644
--- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -55,8 +55,8 @@ def YoloBox(x, img_size, attrs):
                                h)
     anchors = [(anchors[i], anchors[(i + 1)])
                for i in range(0, len(anchors), 2)]
-    anchors_s = np.array(
-        [((an_w / input_w), (an_h / input_h)) for (an_w, an_h) in anchors])
+    anchors_s = np.array([((an_w / input_w), (an_h / input_h))
+                          for (an_w, an_h) in anchors])
     anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1))
     anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1))
     pred_box[:, :, :, :, 2] = (np.exp(pred_box[:, :, :, :, 2]) * anchor_w)
@@ -70,9 +70,9 @@ def YoloBox(x, img_size, attrs):
     pred_score = (sigmoid(x[:, :, :, :, 5:]) * pred_conf)
     pred_box = (pred_box * (pred_conf > 0.0).astype('float32'))
     pred_box = pred_box.reshape((n, (-1), 4))
-    (pred_box[:, :, :2], pred_box[:, :, 2:4]) = (
-        (pred_box[:, :, :2] - (pred_box[:, :, 2:4] / 2.0)),
-        (pred_box[:, :, :2] + (pred_box[:, :, 2:4] / 2.0)))
+    (pred_box[:, :, :2],
+     pred_box[:, :, 2:4]) = ((pred_box[:, :, :2] - (pred_box[:, :, 2:4] / 2.0)),
+                             (pred_box[:, :, :2] + (pred_box[:, :, 2:4] / 2.0)))
     pred_box[:, :, 0] = (pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis])
     pred_box[:, :, 1] = (pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis])
     pred_box[:, :, 2] = (pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis])
@@ -89,6 +89,7 @@ def YoloBox(x, img_size, attrs):
 
 
 class TestYoloBoxOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = 'yolo_box'
@@ -129,6 +130,7 @@ def initTestCase(self):
 
 
 class TestYoloBoxOpNoClipBbox(TestYoloBoxOp):
+
     def initTestCase(self):
         self.anchors = [10, 13, 16, 30, 33, 23]
         an_num = int((len(self.anchors) // 2))
@@ -146,6 +148,7 @@ def initTestCase(self):
 
 
 class TestYoloBoxOpScaleXY(TestYoloBoxOp):
+
     def initTestCase(self):
         self.anchors = [10, 13, 16, 30, 33, 23]
         an_num = int((len(self.anchors) // 2))
@@ -163,6 +166,7 @@ def initTestCase(self):
 
 
 class TestYoloBoxOpIoUAware(TestYoloBoxOp):
+
     def initTestCase(self):
         self.anchors = [10, 13, 16, 30, 33, 23]
         an_num = int((len(self.anchors) // 2))
@@ -180,35 +184,34 @@ def initTestCase(self):
 
 
 class TestYoloBoxDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
         img_size = np.ones((2, 2)).astype('int32')
         img_size = paddle.to_tensor(img_size)
         x1 = np.random.random([2, 14, 8, 8]).astype('float32')
         x1 = paddle.to_tensor(x1)
-        (boxes, scores) = paddle.vision.ops.yolo_box(
-            x1,
-            img_size=img_size,
-            anchors=[10, 13, 16, 30],
-            class_num=2,
-            conf_thresh=0.01,
-            downsample_ratio=8,
-            clip_bbox=True,
-            scale_x_y=1.0)
+        (boxes, scores) = paddle.vision.ops.yolo_box(x1,
+                                                     img_size=img_size,
+                                                     anchors=[10, 13, 16, 30],
+                                                     class_num=2,
+                                                     conf_thresh=0.01,
+                                                     downsample_ratio=8,
+                                                     clip_bbox=True,
+                                                     scale_x_y=1.0)
         assert ((boxes is not None) and (scores is not None))
         x2 = np.random.random([2, 16, 8, 8]).astype('float32')
         x2 = paddle.to_tensor(x2)
-        (boxes, scores) = paddle.vision.ops.yolo_box(
-            x2,
-            img_size=img_size,
-            anchors=[10, 13, 16, 30],
-            class_num=2,
-            conf_thresh=0.01,
-            downsample_ratio=8,
-            clip_bbox=True,
-            scale_x_y=1.0,
-            iou_aware=True,
-            iou_aware_factor=0.5)
+        (boxes, scores) = paddle.vision.ops.yolo_box(x2,
+                                                     img_size=img_size,
+                                                     anchors=[10, 13, 16, 30],
+                                                     class_num=2,
+                                                     conf_thresh=0.01,
+                                                     downsample_ratio=8,
+                                                     clip_bbox=True,
+                                                     scale_x_y=1.0,
+                                                     iou_aware=True,
+                                                     iou_aware_factor=0.5)
         paddle.enable_static()
 
     def test_eager(self):
@@ -217,35 +220,35 @@ def test_eager(self):
 
 
 class TestYoloBoxStatic(unittest.TestCase):
+
     def test_static(self):
         x1 = paddle.static.data('x1', [2, 14, 8, 8], 'float32')
         img_size = paddle.static.data('img_size', [2, 2], 'int32')
-        (boxes, scores) = paddle.vision.ops.yolo_box(
-            x1,
-            img_size=img_size,
-            anchors=[10, 13, 16, 30],
-            class_num=2,
-            conf_thresh=0.01,
-            downsample_ratio=8,
-            clip_bbox=True,
-            scale_x_y=1.0)
+        (boxes, scores) = paddle.vision.ops.yolo_box(x1,
+                                                     img_size=img_size,
+                                                     anchors=[10, 13, 16, 30],
+                                                     class_num=2,
+                                                     conf_thresh=0.01,
+                                                     downsample_ratio=8,
+                                                     clip_bbox=True,
+                                                     scale_x_y=1.0)
         assert ((boxes is not None) and (scores is not None))
         x2 = paddle.static.data('x2', [2, 16, 8, 8], 'float32')
-        (boxes, scores) = paddle.vision.ops.yolo_box(
-            x2,
-            img_size=img_size,
-            anchors=[10, 13, 16, 30],
-            class_num=2,
-            conf_thresh=0.01,
-            downsample_ratio=8,
-            clip_bbox=True,
-            scale_x_y=1.0,
-            iou_aware=True,
-            iou_aware_factor=0.5)
+        (boxes, scores) = paddle.vision.ops.yolo_box(x2,
+                                                     img_size=img_size,
+                                                     anchors=[10, 13, 16, 30],
+                                                     class_num=2,
+                                                     conf_thresh=0.01,
+                                                     downsample_ratio=8,
+                                                     clip_bbox=True,
+                                                     scale_x_y=1.0,
+                                                     iou_aware=True,
+                                                     iou_aware_factor=0.5)
         assert ((boxes is not None) and (scores is not None))
 
 
 class TestYoloBoxOpHW(TestYoloBoxOp):
+
     def initTestCase(self):
         self.anchors = [10, 13, 16, 30, 33, 23]
         an_num = int((len(self.anchors) // 2))
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
index 3f0e4f7a4002a..61f955e917db7 100644
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -53,8 +53,8 @@ def batch_xywh_box_iou(box1, box2):
     left = np.maximum(b1_left[:, :, np.newaxis], b2_left[:, np.newaxis, :])
     right = np.minimum(b1_right[:, :, np.newaxis], b2_right[:, np.newaxis, :])
     top = np.maximum(b1_top[:, :, np.newaxis], b2_top[:, np.newaxis, :])
-    bottom = np.minimum(b1_bottom[:, :, np.newaxis],
-                        b2_bottom[:, np.newaxis, :])
+    bottom = np.minimum(b1_bottom[:, :, np.newaxis], b2_bottom[:,
+                                                               np.newaxis, :])
 
     inter_w = np.clip(right - left, 0., 1.)
     inter_h = np.clip(bottom - top, 0., 1.)
@@ -91,16 +91,18 @@ def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
     pred_box = x[:, :, :, :, :4].copy()
     grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
     grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
-    pred_box[:, :, :, :, 0] = (
-        grid_x + sigmoid(pred_box[:, :, :, :, 0]) * scale_x_y + bias_x_y) / w
-    pred_box[:, :, :, :, 1] = (
-        grid_y + sigmoid(pred_box[:, :, :, :, 1]) * scale_x_y + bias_x_y) / h
+    pred_box[:, :, :, :,
+             0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0]) * scale_x_y +
+                   bias_x_y) / w
+    pred_box[:, :, :, :,
+             1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1]) * scale_x_y +
+                   bias_x_y) / h
 
     mask_anchors = []
     for m in anchor_mask:
         mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
-    anchors_s = np.array(
-        [(an_w / input_size, an_h / input_size) for an_w, an_h in mask_anchors])
+    anchors_s = np.array([(an_w / input_size, an_h / input_size)
+                          for an_w, an_h in mask_anchors])
     anchor_w = anchors_s[:, 0:1].reshape((1, mask_num, 1, 1))
     anchor_h = anchors_s[:, 1:2].reshape((1, mask_num, 1, 1))
     pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
@@ -119,10 +121,10 @@ def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
     gtbox_shift[:, :, 1] = 0
 
     anchors = [(anchors[2 * i], anchors[2 * i + 1]) for i in range(0, an_num)]
-    anchors_s = np.array(
-        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
-    anchor_boxes = np.concatenate(
-        [np.zeros_like(anchors_s), anchors_s], axis=-1)
+    anchors_s = np.array([(an_w / input_size, an_h / input_size)
+                          for an_w, an_h in anchors])
+    anchor_boxes = np.concatenate([np.zeros_like(anchors_s), anchors_s],
+                                  axis=-1)
     anchor_boxes = np.tile(anchor_boxes[np.newaxis, :, :], (n, 1, 1))
     ious = batch_xywh_box_iou(gtbox_shift, anchor_boxes)
     iou_matches = np.argmax(ious, axis=-1)
@@ -153,9 +155,9 @@ def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
             objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j]
 
             for label_idx in range(class_num):
-                loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], label_pos
-                               if label_idx == gtlabel[i, j] else
-                               label_neg) * gtscore[i, j]
+                loss[i] += sce(
+                    x[i, an_idx, gj, gi, 5 + label_idx], label_pos if label_idx
+                    == gtlabel[i, j] else label_neg) * gtscore[i, j]
 
         for j in range(mask_num * h * w):
             if objness[i, j] > 0:
@@ -168,6 +170,7 @@ def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
 
 
 class TestYolov3LossOp(OpTest):
+
     def setUp(self):
         self.initTestCase()
         self.op_type = 'yolov3_loss'
@@ -232,6 +235,7 @@ def initTestCase(self):
 
 
 class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp):
+
     def initTestCase(self):
         self.anchors = [
             10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
@@ -249,6 +253,7 @@ def initTestCase(self):
 
 
 class TestYolov3LossNoGTScore(TestYolov3LossOp):
+
     def initTestCase(self):
         self.anchors = [
             10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
@@ -266,6 +271,7 @@ def initTestCase(self):
 
 
 class TestYolov3LossWithScaleXY(TestYolov3LossOp):
+
     def initTestCase(self):
         self.anchors = [
             10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
@@ -283,6 +289,7 @@ def initTestCase(self):
 
 
 class TestYolov3LossDygraph(unittest.TestCase):
+
     def test_dygraph(self):
         paddle.disable_static()
         x = np.random.random([2, 14, 8, 8]).astype('float32')
@@ -293,54 +300,52 @@ def test_dygraph(self):
         gt_box = paddle.to_tensor(gt_box)
         gt_label = paddle.to_tensor(gt_label)
 
-        loss = paddle.vision.ops.yolo_loss(
-            x,
-            gt_box=gt_box,
-            gt_label=gt_label,
-            anchors=[10, 13, 16, 30],
-            anchor_mask=[0, 1],
-            class_num=2,
-            ignore_thresh=0.7,
-            downsample_ratio=8,
-            use_label_smooth=True,
-            scale_x_y=1.)
+        loss = paddle.vision.ops.yolo_loss(x,
+                                           gt_box=gt_box,
+                                           gt_label=gt_label,
+                                           anchors=[10, 13, 16, 30],
+                                           anchor_mask=[0, 1],
+                                           class_num=2,
+                                           ignore_thresh=0.7,
+                                           downsample_ratio=8,
+                                           use_label_smooth=True,
+                                           scale_x_y=1.)
         assert loss is not None
         assert loss.shape == [2]
         paddle.enable_static()
 
 
 class TestYolov3LossStatic(unittest.TestCase):
+
     def test_static(self):
         x = paddle.static.data('x', [2, 14, 8, 8], 'float32')
         gt_box = paddle.static.data('gt_box', [2, 10, 4], 'float32')
         gt_label = paddle.static.data('gt_label', [2, 10], 'int32')
         gt_score = paddle.static.data('gt_score', [2, 10], 'float32')
 
-        loss = paddle.vision.ops.yolo_loss(
-            x,
-            gt_box=gt_box,
-            gt_label=gt_label,
-            anchors=[10, 13, 16, 30],
-            anchor_mask=[0, 1],
-            class_num=2,
-            ignore_thresh=0.7,
-            downsample_ratio=8,
-            gt_score=gt_score,
-            use_label_smooth=True,
-            scale_x_y=1.)
+        loss = paddle.vision.ops.yolo_loss(x,
+                                           gt_box=gt_box,
+                                           gt_label=gt_label,
+                                           anchors=[10, 13, 16, 30],
+                                           anchor_mask=[0, 1],
+                                           class_num=2,
+                                           ignore_thresh=0.7,
+                                           downsample_ratio=8,
+                                           gt_score=gt_score,
+                                           use_label_smooth=True,
+                                           scale_x_y=1.)
         assert loss is not None
 
-        loss = paddle.vision.ops.yolo_loss(
-            x,
-            gt_box=gt_box,
-            gt_label=gt_label,
-            anchors=[10, 13, 16, 30],
-            anchor_mask=[0, 1],
-            class_num=2,
-            ignore_thresh=0.7,
-            downsample_ratio=8,
-            use_label_smooth=True,
-            scale_x_y=1.)
+        loss = paddle.vision.ops.yolo_loss(x,
+                                           gt_box=gt_box,
+                                           gt_label=gt_label,
+                                           anchors=[10, 13, 16, 30],
+                                           anchor_mask=[0, 1],
+                                           class_num=2,
+                                           ignore_thresh=0.7,
+                                           downsample_ratio=8,
+                                           use_label_smooth=True,
+                                           scale_x_y=1.)
         assert loss is not None
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
index 80b4db793ff43..3be1fb85565f7 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,6 +23,7 @@
 
 
 class TestZerosLikeAPIError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             x = paddle.fluid.data('x', [3, 4])
@@ -34,6 +35,7 @@ def test_eager(self):
 
 
 class TestZerosLikeAPI(unittest.TestCase):
+
     def test_api(self):
         shape = [3, 4]
         startup_program = Program()
@@ -62,6 +64,7 @@ def test_eager(self):
 
 
 class TestZerosLikeImpeartive(unittest.TestCase):
+
     def test_out(self):
         shape = [3, 4]
         place = (fluid.CUDAPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_op.py b/python/paddle/fluid/tests/unittests/test_zeros_op.py
index 01d7107cfaeec..ce30cab501659 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,6 +26,7 @@
 
 
 class TestZerosOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             shape = [4]
@@ -38,6 +39,7 @@ def test_eager(self):
 
 
 class ApiZerosTest(unittest.TestCase):
+
     def test_out(self):
         with program_guard(Program()):
             zeros = paddle.zeros(shape=[10], dtype='float64')
@@ -84,7 +86,9 @@ def test_eager(self):
 
 
 class ApiZerosError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_error1():
             with paddle.static.program_guard(fluid.Program()):
                 ones = fluid.layers.zeros(shape=10, dtype='int64')
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index c92d9a429b6c7..e106f33c8a068 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -68,6 +68,7 @@ def __create_var__(name, var_name):
 
 
 def set_input(scope, op, inputs, place):
+
     def __set_input__(var_name, var):
         if isinstance(var, tuple) or isinstance(var, np.ndarray):
             tensor = scope.find_var(var_name).get_tensor()
@@ -116,8 +117,10 @@ def create_var(block, name, np_list, var_proto):
                 if is_input:
                     shape = list(np_value.shape)
                     lod_level = 0
-        return block.create_var(
-            dtype=dtype, shape=shape, lod_level=lod_level, name=name)
+        return block.create_var(dtype=dtype,
+                                shape=shape,
+                                lod_level=lod_level,
+                                name=name)
 
     var_dict = {}
     for var_proto in proto_list:
@@ -146,34 +149,34 @@ def append_loss_ops(block, output_names):
 
     if len(mean_inputs) == 1:
         loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
-        op = block.append_op(
-            inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
+        op = block.append_op(inputs={"X": mean_inputs},
+                             outputs={"Out": loss},
+                             type='mean')
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)
     else:
         avg_sum = []
         for cur_loss in mean_inputs:
             cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
-            op = block.append_op(
-                inputs={"X": [cur_loss]},
-                outputs={"Out": [cur_avg_loss]},
-                type="mean")
+            op = block.append_op(inputs={"X": [cur_loss]},
+                                 outputs={"Out": [cur_avg_loss]},
+                                 type="mean")
             op.desc.infer_var_type(block.desc)
             op.desc.infer_shape(block.desc)
             avg_sum.append(cur_avg_loss)
 
         loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
-        op_sum = block.append_op(
-            inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
+        op_sum = block.append_op(inputs={"X": avg_sum},
+                                 outputs={"Out": loss_sum},
+                                 type='sum')
         op_sum.desc.infer_var_type(block.desc)
         op_sum.desc.infer_shape(block.desc)
 
         loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
-        op_loss = block.append_op(
-            inputs={"X": loss_sum},
-            outputs={"Out": loss},
-            type='scale',
-            attrs={'scale': 1.0 / float(len(avg_sum))})
+        op_loss = block.append_op(inputs={"X": loss_sum},
+                                  outputs={"Out": loss},
+                                  type='scale',
+                                  attrs={'scale': 1.0 / float(len(avg_sum))})
         op_loss.desc.infer_var_type(block.desc)
         op_loss.desc.infer_shape(block.desc)
     return loss
diff --git a/python/paddle/fluid/tests/unittests/tokenizer/__init__.py b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py
index b9a7651e44909..185a92b8d94d3 100644
--- a/python/paddle/fluid/tests/unittests/tokenizer/__init__.py
+++ b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py
index 00d5f4e772528..f396e892ecfe4 100755
--- a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py
+++ b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py
@@ -364,8 +364,8 @@ def __init__(self,
         self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token)
         self.do_lower_case = do_lower_case
         self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-        self.wordpiece_tokenizer = WordpieceTokenizer(
-            vocab=self.vocab, unk_token=unk_token)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
+                                                      unk_token=unk_token)
         self.special_tokens_map = {
             'unk_token': unk_token,
             'sep_token': sep_token,
@@ -433,8 +433,8 @@ def num_special_tokens_to_add(self, pair=False):
         token_ids_0 = []
         token_ids_1 = []
         return len(
-            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1
-                                                  if pair else None))
+            self.build_inputs_with_special_tokens(
+                token_ids_0, token_ids_1 if pair else None))
 
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
@@ -508,7 +508,9 @@ def get_special_tokens_mask(self,
                     "ids is already formatted with special tokens for the model."
                 )
             return list(
-                map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
+                map(
+                    lambda x: 1
+                    if x in [self.sep_token_id, self.cls_token_id] else 0,
                     token_ids_0))
 
         if token_ids_1 is not None:
diff --git a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py
index 7da3cd56e25b5..d2cf118b6320b 100644
--- a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py
+++ b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py
@@ -90,8 +90,8 @@ def _is_punctuation(char):
     # Characters such as "^", "$", and "`" are not in the Unicode
     # Punctuation class but we treat them as punctuation anyways, for
     # consistency.
-    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
-        (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
+            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
         return True
     cat = unicodedata.category(char)
     if cat.startswith("P"):
@@ -183,7 +183,7 @@ class PretrainedTokenizer(object):
     def __call__(self,
                  text,
                  text_pair=None,
-                 max_seq_len: Optional[int]=None,
+                 max_seq_len: Optional[int] = None,
                  stride=0,
                  is_split_into_words=False,
                  pad_to_max_seq_len=False,
@@ -288,26 +288,28 @@ def __call__(self,
         """
         # Input type checking for clearer error
         assert isinstance(text, str) or (
-            isinstance(text, (list, tuple)) and (len(text) == 0 or (
-                isinstance(text[0], str) or
-                (isinstance(text[0], (list, tuple)) and
-                 (len(text[0]) == 0 or isinstance(text[0][0], str)))))
+            isinstance(text, (list, tuple)) and
+            (len(text) == 0 or
+             (isinstance(text[0], str) or
+              (isinstance(text[0], (list, tuple)) and
+               (len(text[0]) == 0 or isinstance(text[0][0], str)))))
         ), ("text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
             "or `List[List[str]]` (batch of pretokenized examples).")
 
-        assert (text_pair is None or isinstance(text_pair, str) or (
-            isinstance(text_pair, (list, tuple)) and (len(text_pair) == 0 or (
-                isinstance(text_pair[0], str) or
-                (isinstance(text_pair[0], (list, tuple)) and
-                 (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)))))
-        )), (
-            "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+        assert (
+            text_pair is None or isinstance(text_pair, str) or
+            (isinstance(text_pair, (list, tuple)) and
+             (len(text_pair) == 0 or
+              (isinstance(text_pair[0], str) or
+               (isinstance(text_pair[0], (list, tuple)) and
+                (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))))))
+        ), ("text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
             "or `List[List[str]]` (batch of pretokenized examples).")
 
         is_batched = bool(
-            (not is_split_into_words and isinstance(text, (list, tuple))) or
-            (is_split_into_words and isinstance(text, (list, tuple)) and
-             text and isinstance(text[0], (list, tuple))))
+            (not is_split_into_words and isinstance(text, (list, tuple)))
+            or (is_split_into_words and isinstance(text, (list, tuple)) and text
+                and isinstance(text[0], (list, tuple))))
 
         if is_batched:
             batch_text_or_text_pairs = list(zip(
@@ -348,8 +350,8 @@ def all_special_tokens(self):
         all_toks = []
         set_attr = self.special_tokens_map
         for attr_value in set_attr.values():
-            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (
-                list, tuple)) else [attr_value])
+            all_toks = all_toks + (list(attr_value) if isinstance(
+                attr_value, (list, tuple)) else [attr_value])
         all_toks = list(set(all_toks))
         return all_toks
 
@@ -420,8 +422,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
             for file_id, map_list in cls.pretrained_resource_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
             init_configuration = copy.deepcopy(
-                cls.pretrained_init_configuration[
-                    pretrained_model_name_or_path])
+                cls.pretrained_init_configuration[pretrained_model_name_or_path]
+            )
         # From local dir path
         elif os.path.isdir(pretrained_model_name_or_path):
             for file_id, file_name in cls.resource_files_names.items():
@@ -488,8 +490,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
             # does include a vocab file path in it. However, if the vocab file
             # path included in json does not exist, such as was deleted, to make
             # it still work, use the vocab file under this dir.
-            elif not os.path.isfile(init_kwargs[args_name]) and os.path.isfile(
-                    file_path):
+            elif not os.path.isfile(
+                    init_kwargs[args_name]) and os.path.isfile(file_path):
                 init_kwargs[args_name] = file_path
         # TODO(guosheng): avoid reduplication of position args and key word args
         tokenizer = cls(*init_args, **init_kwargs)
@@ -696,8 +698,8 @@ def get_special_tokens_mask(self,
             results (List[int]): The list of integers in the range [0, 1]:
                 1 for a special token, 0 for a sequence token.
         """
-        return [0] * ((len(token_ids_1)
-                       if token_ids_1 else 0) + len(token_ids_0))
+        return [0] * (
+            (len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
 
     def create_token_type_ids_from_sequences(self,
                                              token_ids_0,
@@ -731,8 +733,8 @@ def num_special_tokens_to_add(self, pair):
         token_ids_0 = []
         token_ids_1 = []
         return len(
-            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1
-                                                  if pair else None))
+            self.build_inputs_with_special_tokens(
+                token_ids_0, token_ids_1 if pair else None))
 
     def encode(self,
                text,
@@ -864,7 +866,8 @@ def get_input_ids(text):
                 ids,
                 pair_ids=pair_ids,
                 num_tokens_to_remove=total_len - max_seq_len,
-                truncation_strategy=truncation_strategy, )
+                truncation_strategy=truncation_strategy,
+            )
             if return_overflowing_tokens:
                 encoded_inputs["overflowing_tokens"] = overflowing_tokens
                 encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len
@@ -872,8 +875,8 @@ def get_input_ids(text):
         # Add special tokens
 
         sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-        token_type_ids = self.create_token_type_ids_from_sequences(ids,
-                                                                   pair_ids)
+        token_type_ids = self.create_token_type_ids_from_sequences(
+            ids, pair_ids)
 
         # Build output dictionnary
         encoded_inputs["input_ids"] = sequence
@@ -881,14 +884,14 @@ def get_input_ids(text):
             encoded_inputs["token_type_ids"] = token_type_ids
         if return_special_tokens_mask:
             encoded_inputs[
-                "special_tokens_mask"] = self.get_special_tokens_mask(ids,
-                                                                      pair_ids)
+                "special_tokens_mask"] = self.get_special_tokens_mask(
+                    ids, pair_ids)
         if return_length:
             encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"])
 
         # Check lengths
-        assert max_seq_len is None or len(encoded_inputs[
-            "input_ids"]) <= max_seq_len
+        assert max_seq_len is None or len(
+            encoded_inputs["input_ids"]) <= max_seq_len
 
         # Padding
         needs_to_be_padded = pad_to_max_seq_len and \
@@ -898,8 +901,8 @@ def get_input_ids(text):
             difference = max_seq_len - len(encoded_inputs["input_ids"])
             if self.padding_side == 'right':
                 if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
-                        "input_ids"]) + [0] * difference
+                    encoded_inputs["attention_mask"] = [1] * len(
+                        encoded_inputs["input_ids"]) + [0] * difference
                 if return_token_type_ids:
                     encoded_inputs["token_type_ids"] = (
                         encoded_inputs["token_type_ids"] +
@@ -907,8 +910,9 @@ def get_input_ids(text):
                 if return_special_tokens_mask:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs[
                         "special_tokens_mask"] + [1] * difference
-                encoded_inputs["input_ids"] = encoded_inputs[
-                    "input_ids"] + [self.pad_token_id] * difference
+                encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [
+                    self.pad_token_id
+                ] * difference
             elif self.padding_side == 'left':
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + [
@@ -927,8 +931,8 @@ def get_input_ids(text):
                 ] * difference + encoded_inputs["input_ids"]
         else:
             if return_attention_mask:
-                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
-                    "input_ids"])
+                encoded_inputs["attention_mask"] = [1] * len(
+                    encoded_inputs["input_ids"])
 
         if return_position_ids:
             encoded_inputs["position_ids"] = list(
@@ -1092,8 +1096,8 @@ def get_input_ids(text):
 
                     offset_mapping = self.build_offset_mapping_with_special_tokens(
                         mapping, pair_mapping)
-                    sequence = self.build_inputs_with_special_tokens(ids,
-                                                                     pair_ids)
+                    sequence = self.build_inputs_with_special_tokens(
+                        ids, pair_ids)
                     token_type_ids = self.create_token_type_ids_from_sequences(
                         ids, pair_ids)
 
@@ -1106,12 +1110,12 @@ def get_input_ids(text):
                             "special_tokens_mask"] = self.get_special_tokens_mask(
                                 ids, pair_ids)
                     if return_length:
-                        encoded_inputs["seq_len"] = len(encoded_inputs[
-                            "input_ids"])
+                        encoded_inputs["seq_len"] = len(
+                            encoded_inputs["input_ids"])
 
                     # Check lengths
-                    assert max_seq_len is None or len(encoded_inputs[
-                        "input_ids"]) <= max_seq_len
+                    assert max_seq_len is None or len(
+                        encoded_inputs["input_ids"]) <= max_seq_len
 
                     # Padding
                     needs_to_be_padded = pad_to_max_seq_len and \
@@ -1120,13 +1124,13 @@ def get_input_ids(text):
                     encoded_inputs['offset_mapping'] = offset_mapping
 
                     if needs_to_be_padded:
-                        difference = max_seq_len - len(encoded_inputs[
-                            "input_ids"])
+                        difference = max_seq_len - len(
+                            encoded_inputs["input_ids"])
                         if self.padding_side == 'right':
                             if return_attention_mask:
                                 encoded_inputs["attention_mask"] = [1] * len(
-                                    encoded_inputs[
-                                        "input_ids"]) + [0] * difference
+                                    encoded_inputs["input_ids"]
+                                ) + [0] * difference
                             if return_token_type_ids:
                                 # 0 for padding token mask
                                 encoded_inputs["token_type_ids"] = (
@@ -1145,8 +1149,8 @@ def get_input_ids(text):
                             if return_attention_mask:
                                 encoded_inputs["attention_mask"] = [
                                     0
-                                ] * difference + [1] * len(encoded_inputs[
-                                    "input_ids"])
+                                ] * difference + [1] * len(
+                                    encoded_inputs["input_ids"])
                             if return_token_type_ids:
                                 # 0 for padding token mask
                                 encoded_inputs["token_type_ids"] = (
@@ -1209,8 +1213,8 @@ def get_offset_mapping(self, text):
         split_tokens = []
         for token in self.basic_tokenizer.tokenize(text):
             for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token
-                                    if sub_token != self.unk_token else token)
+                split_tokens.append(
+                    sub_token if sub_token != self.unk_token else token)
 
         normalized_text, char_mapping = '', []
 
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index 970eb2daea568..fd9f2ec95dec6 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -23,7 +23,8 @@
 
 pos_enc_param_names = (
     "src_pos_enc_table",
-    "trg_pos_enc_table", )
+    "trg_pos_enc_table",
+)
 
 batch_size = 2
 
@@ -122,8 +123,8 @@ def __combine_heads(x):
         return layers.reshape(
             x=trans_x,
             shape=list(
-                map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]
-                          ])))
+                map(int,
+                    [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])))
 
     def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
         """
@@ -148,8 +149,9 @@ def __softmax(x, eps=1e-9):
         product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
         weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
         if dropout_rate:
-            weights = layers.dropout(
-                weights, dropout_prob=dropout_rate, is_test=False)
+            weights = layers.dropout(weights,
+                                     dropout_prob=dropout_rate,
+                                     is_test=False)
         out = layers.matmul(weights, v)
         return out
 
@@ -182,8 +184,8 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid):
     hidden = layers.fc(input=x,
                        size=d_inner_hid,
                        num_flatten_dims=2,
-                       param_attr=fluid.initializer.Uniform(
-                           low=-(d_hid**-0.5), high=(d_hid**-0.5)),
+                       param_attr=fluid.initializer.Uniform(low=-(d_hid**-0.5),
+                                                            high=(d_hid**-0.5)),
                        act="relu")
     out = layers.fc(input=hidden,
                     size=d_hid,
@@ -205,11 +207,10 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
         if cmd == "a":  # add residual connection
             out = out + prev_out if prev_out else out
         elif cmd == "n":  # add layer normalization
-            out = layers.layer_norm(
-                out,
-                begin_norm_axis=len(out.shape) - 1,
-                param_attr=fluid.initializer.Constant(1.),
-                bias_attr=fluid.initializer.Constant(0.))
+            out = layers.layer_norm(out,
+                                    begin_norm_axis=len(out.shape) - 1,
+                                    param_attr=fluid.initializer.Constant(1.),
+                                    bias_attr=fluid.initializer.Constant(0.))
         elif cmd == "d":  # add dropout
             if dropout:
                 out = layers.dropout(out, dropout_prob=dropout, is_test=False)
@@ -235,31 +236,28 @@ def prepare_encoder(src_word,
 
     This module is used at the bottom of the encoder stacks.
     """
-    src_word_emb = layers.embedding(
-        src_word,
-        size=[src_vocab_size, src_emb_dim],
-        padding_idx=src_pad_idx,
-        param_attr=fluid.initializer.Normal(0., 1.))
+    src_word_emb = layers.embedding(src_word,
+                                    size=[src_vocab_size, src_emb_dim],
+                                    padding_idx=src_pad_idx,
+                                    param_attr=fluid.initializer.Normal(0., 1.))
     src_pos_enc = layers.embedding(
         src_pos,
         size=[src_max_len, src_emb_dim],
         padding_idx=pos_pad_idx,
-        param_attr=fluid.ParamAttr(
-            name=pos_enc_param_name, trainable=False))
+        param_attr=fluid.ParamAttr(name=pos_enc_param_name, trainable=False))
     src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
 
     # FIXME(guosheng): Decouple the program desc with batch_size.
     enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim])
-    return layers.dropout(
-        enc_input, dropout_prob=dropout,
-        is_test=False) if dropout else enc_input
+    return layers.dropout(enc_input, dropout_prob=dropout,
+                          is_test=False) if dropout else enc_input
 
 
-prepare_encoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
-prepare_decoder = partial(
-    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
+prepare_encoder = partial(prepare_encoder,
+                          pos_enc_param_name=pos_enc_param_names[0])
+prepare_decoder = partial(prepare_encoder,
+                          pos_enc_param_name=pos_enc_param_names[1])
 
 
 def encoder_layer(enc_input,
@@ -330,12 +328,14 @@ def decoder_layer(dec_input,
         d_value,
         d_model,
         n_head,
-        dropout_rate, )
+        dropout_rate,
+    )
     slf_attn_output = post_process_layer(
         dec_input,
         slf_attn_output,
         "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
+        dropout_rate,
+    )
     enc_attn_output = multi_head_attention(
         slf_attn_output,
         enc_output,
@@ -345,21 +345,25 @@ def decoder_layer(dec_input,
         d_value,
         d_model,
         n_head,
-        dropout_rate, )
+        dropout_rate,
+    )
     enc_attn_output = post_process_layer(
         slf_attn_output,
         enc_attn_output,
         "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
+        dropout_rate,
+    )
     ffd_output = positionwise_feed_forward(
         enc_attn_output,
         d_inner_hid,
-        d_model, )
+        d_model,
+    )
     dec_output = post_process_layer(
         enc_attn_output,
         ffd_output,
         "dan",  # residual connection + dropout + layer normalization
-        dropout_rate, )
+        dropout_rate,
+    )
     return dec_output
 
 
@@ -388,7 +392,8 @@ def decoder(dec_input,
             d_value,
             d_model,
             d_inner_hid,
-            dropout_rate, )
+            dropout_rate,
+        )
         dec_input = dec_output
     return dec_output
 
@@ -433,25 +438,28 @@ def build_inputs(max_length, n_head):
     all_inputs = []
     for name, shape, dtype in zip(names, shapes, dtypes):
         all_inputs.append(
-            fluid.layers.data(
-                name=name, shape=shape, dtype=dtype, append_batch_size=False))
+            fluid.layers.data(name=name,
+                              shape=shape,
+                              dtype=dtype,
+                              append_batch_size=False))
     return all_inputs
 
 
 def transformer(
-        src_vocab_size,
-        trg_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        dropout_rate,
-        src_pad_idx,
-        trg_pad_idx,
-        pos_pad_idx, ):
+    src_vocab_size,
+    trg_vocab_size,
+    max_length,
+    n_layer,
+    n_head,
+    d_key,
+    d_value,
+    d_model,
+    d_inner_hid,
+    dropout_rate,
+    src_pad_idx,
+    trg_pad_idx,
+    pos_pad_idx,
+):
 
     src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = build_inputs(
         max_length, n_head)
@@ -463,7 +471,8 @@ def transformer(
         d_model,
         src_pad_idx,
         max_length,
-        dropout_rate, )
+        dropout_rate,
+    )
     enc_output = encoder(
         enc_input,
         src_slf_attn_bias,
@@ -473,7 +482,8 @@ def transformer(
         d_value,
         d_model,
         d_inner_hid,
-        dropout_rate, )
+        dropout_rate,
+    )
 
     dec_input = prepare_decoder(
         trg_word,
@@ -482,7 +492,8 @@ def transformer(
         d_model,
         trg_pad_idx,
         max_length,
-        dropout_rate, )
+        dropout_rate,
+    )
     dec_output = decoder(
         dec_input,
         enc_output,
@@ -494,18 +505,19 @@ def transformer(
         d_value,
         d_model,
         d_inner_hid,
-        dropout_rate, )
+        dropout_rate,
+    )
 
     # TODO(guosheng): Share the weight matrix between the embedding layers and
     # the pre-softmax linear transformation.
-    predict = layers.reshape(
-        x=layers.fc(input=dec_output,
-                    size=trg_vocab_size,
-                    param_attr=fluid.initializer.Xavier(uniform=False),
-                    bias_attr=False,
-                    num_flatten_dims=2),
-        shape=[-1, trg_vocab_size],
-        act="softmax")
+    predict = layers.reshape(x=layers.fc(
+        input=dec_output,
+        size=trg_vocab_size,
+        param_attr=fluid.initializer.Xavier(uniform=False),
+        bias_attr=False,
+        num_flatten_dims=2),
+                             shape=[-1, trg_vocab_size],
+                             act="softmax")
 
     cost = layers.cross_entropy(input=predict, label=gold)
     weighted_cost = cost * weights
diff --git a/python/paddle/fluid/tests/unittests/utils.py b/python/paddle/fluid/tests/unittests/utils.py
index 07edd8171fe14..66376382a9724 100644
--- a/python/paddle/fluid/tests/unittests/utils.py
+++ b/python/paddle/fluid/tests/unittests/utils.py
@@ -88,6 +88,7 @@ def _is_equal_program(prog1, prog2):
 
 
 def load_dygraph_vars_to_scope(model_path, scope, place):
+
     def load_dict_to_scope(scope, dictionary):
         if scope is None:
             scope = fluid.global_scope()
@@ -107,6 +108,7 @@ def load_dict_to_scope(scope, dictionary):
 
 
 class DyGraphProgramDescTracerTestHelper(object):
+
     def __init__(self, unittest_obj):
         self.unittest_obj = unittest_obj
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt
index 512a76b3f6081..233c4e6143615 100644
--- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt
@@ -1,24 +1,30 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-if (WITH_XPU_BKCL)
-    list(REMOVE_ITEM TEST_OPS "test_gen_bkcl_id_op")
+if(WITH_XPU_BKCL)
+  list(REMOVE_ITEM TEST_OPS "test_gen_bkcl_id_op")
 endif()
 
-file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
-if (WITH_XPU_BKCL)
-    list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op)
+file(
+  GLOB DIST_TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_dist_*.py")
+if(WITH_XPU_BKCL)
+  list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op)
 endif()
 
 list(REMOVE_ITEM TEST_OPS test_concat_op_xpu)
 list(REMOVE_ITEM TEST_OPS test_mean_op_xpu)
 
 foreach(TEST_OP ${TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 
 foreach(TEST_OP ${DIST_TEST_OPS})
-    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 
 set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
index aa3dcb6519c57..33a84823460a0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
+++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
@@ -90,6 +90,7 @@
 
 
 class XPUOpTestWrapper(object):
+
     def create_classes(self):
         base_class = None
         classes = []
@@ -177,13 +178,13 @@ def make_xpu_op_list(xpu_version):
 
 def get_xpu_op_support_types(op_name, dev_id=0):
     xpu_version = core.get_xpu_device_version(dev_id)
-    support_type_list = core.get_xpu_device_op_support_types(op_name,
-                                                             xpu_version)
+    support_type_list = core.get_xpu_device_op_support_types(
+        op_name, xpu_version)
     support_type_str_list = []
     for stype in support_type_list:
         if stype == paddle.bfloat16:
-            support_type_str_list.append(type_dict_paddle_to_str[
-                paddle.bfloat16])
+            support_type_str_list.append(
+                type_dict_paddle_to_str[paddle.bfloat16])
         else:
             support_type_str_list.append(type_dict_paddle_to_str[stype])
     type_white_list = get_type_white_list()
@@ -239,11 +240,12 @@ def create_test_class(func_globals,
             continue
         class_obj = test_class[1]
         cls_name = "{0}_{1}".format(test_class[0], str(test_type))
-        func_globals[cls_name] = type(cls_name, (class_obj, ), {
-            'in_type': type_dict_str_to_numpy[test_type],
-            'in_type_str': test_type,
-            'op_type_need_check_grad': True
-        })
+        func_globals[cls_name] = type(
+            cls_name, (class_obj, ), {
+                'in_type': type_dict_str_to_numpy[test_type],
+                'in_type_str': test_type,
+                'op_type_need_check_grad': True
+            })
 
     if hasattr(test_class_obj, 'use_dynamic_create_class'
                ) and test_class_obj.use_dynamic_create_class:
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
index b0bb9a37c16bd..fff2531a9c2e8 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid as fluid
@@ -30,11 +31,13 @@
 
 
 class XPUTestAccuracyOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'accuracy'
         self.use_dynamic_create_class = False
 
     class TestXPUAccuracyOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "accuracy"
             self.init_dtype()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index 9e2825ab631f0..63a0aa2e59b93 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -29,6 +30,7 @@
 
 
 class TestActivationOPBase(XPUOpTest):
+
     def setUp(self):
         self.place = paddle.XPUPlace(0)
         self.init_dtype()
@@ -54,11 +56,13 @@ def test_check_grad(self):
 
 
 class XPUTestExpOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'exp'
         self.use_dynamic_create_class = False
 
     class XPUTestExp(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = 'exp'
             self.dtype = self.in_type
@@ -76,11 +80,13 @@ def set_case(self):
 
 
 class XPUTestSigmoidOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'sigmoid'
         self.use_dynamic_create_class = False
 
     class XPUTestSigmoid(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "sigmoid"
             self.dtype = self.in_type
@@ -95,18 +101,22 @@ def init_config(self):
             self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
 
     class XPUTestSigmoid2(XPUTestSigmoid):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [100]).astype(self.dtype)
 
     class XPUTestSigmoid3(XPUTestSigmoid):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [10, 12, 15]).astype(self.dtype)
 
     class XPUTestSigmoid4(XPUTestSigmoid):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [19, 19]).astype(self.dtype)
 
     class XPUTestSigmoid5(XPUTestSigmoid):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2,
                                        [10, 20, 30, 40]).astype(self.dtype)
@@ -118,11 +128,13 @@ def init_config(self):
 
 
 class XPUTestTanhOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'tanh'
         self.use_dynamic_create_class = False
 
     class XPUTestTanh(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "tanh"
             self.dtype = self.in_type
@@ -140,11 +152,13 @@ def set_case(self):
 
 
 class XPUTestSqrtOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'sqrt'
         self.use_dynamic_create_class = False
 
     class XPUTestSqrt(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "sqrt"
             self.dtype = self.in_type
@@ -163,11 +177,13 @@ def set_case(self):
 
 
 class XPUTestAbsOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'abs'
         self.use_dynamic_create_class = False
 
     class XPUTestAbs(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "abs"
             self.dtype = self.in_type
@@ -191,11 +207,13 @@ def set_case(self):
 
 
 class XPUTestReluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'relu'
         self.use_dynamic_create_class = False
 
     class XPUTestRelu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "relu"
             self.dtype = self.in_type
@@ -216,11 +234,13 @@ def set_case(self):
 
 
 class XPUTestGeluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'gelu'
         self.use_dynamic_create_class = False
 
     class XPUTestGelu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "gelu"
             self.dtype = self.in_type
@@ -242,19 +262,21 @@ def set_case(self):
 def gelu(x, approximate):
     from scipy.special import erf
     if approximate:
-        y_ref = 0.5 * x * (1.0 + np.tanh(
-            np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+        y_ref = 0.5 * x * (
+            1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
     else:
         y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2)))
     return y_ref.astype(x.dtype)
 
 
 class XPUTestHardSwishOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'hard_swish'
         self.use_dynamic_create_class = False
 
     class XPUTestHardSwish(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "hard_swish"
             self.dtype = self.in_type
@@ -281,11 +303,13 @@ def hard_swish(x, offset, threshold, scale):
 
 
 class XPUTestLogOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'log'
         self.use_dynamic_create_class = False
 
     class XPUTestLog(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "log"
             self.dtype = self.in_type
@@ -304,11 +328,13 @@ def set_case(self):
 
 
 class XPUTestSquareOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'square'
         self.use_dynamic_create_class = False
 
     class XPUTestSquare(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "square"
             self.dtype = self.in_type
@@ -323,18 +349,22 @@ def init_config(self):
             self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
 
     class XPUTestSquare2(XPUTestSquare):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [100]).astype(self.dtype)
 
     class XPUTestSquare3(XPUTestSquare):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [1, 15, 19]).astype(self.dtype)
 
     class XPUTestSquare4(XPUTestSquare):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [100, 10]).astype(self.dtype)
 
     class XPUTestSquare5(XPUTestSquare):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [1, 2, 5, 17]).astype(self.dtype)
 
@@ -345,11 +375,13 @@ def init_config(self):
 
 
 class XPUTestPowOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'pow'
         self.use_dynamic_create_class = False
 
     class XPUTestPowBase(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "pow"
             self.dtype = self.in_type
@@ -366,34 +398,40 @@ def init_config(self):
             self.factor = 3.0
 
     class XPUTestPow1(XPUTestPowBase):
+
         def init_config(self):
             self.x = np.random.uniform(-1, 1, [1024, 8]).astype(self.dtype)
             self.factor = 1
 
     class XPUTestPow2(XPUTestPowBase):
+
         def init_config(self):
             self.x = np.random.uniform(-1, 1, [1024, 8]).astype(self.dtype)
             self.factor = 2
 
     class XPUTestPow3(XPUTestPowBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2,
                                        [4, 512, 15, 15]).astype(self.dtype)
             self.factor = 3
 
     class XPUTestPow4(XPUTestPowBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2,
                                        [4, 256, 22, 22]).astype(self.dtype)
             self.factor = 4
 
     class XPUTestPow5(XPUTestPowBase):
+
         def init_config(self):
             self.x = np.random.uniform(0, 1,
                                        [4, 256, 22, 22]).astype(self.dtype)
             self.factor = 1.2
 
     class XPUTestPow6(XPUTestPowBase):
+
         def init_config(self):
             self.x = np.random.uniform(0, 1, [1024, 8]).astype(self.dtype)
             self.factor = 3.2
@@ -405,11 +443,13 @@ def init_config(self):
 
 
 class XPUTestLeakyReluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'leaky_relu'
         self.use_dynamic_create_class = False
 
     class XPUTestLeakyRelu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "leaky_relu"
             self.dtype = self.in_type
@@ -417,7 +457,8 @@ def set_case(self):
             x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
             alpha = np.random.uniform(
                 0,
-                1, )
+                1,
+            )
             out = leaky_relu(x, alpha)
 
             self.inputs = {'X': x}
@@ -439,11 +480,13 @@ def leaky_relu(x, alpha):
 
 
 class XPUTestReciprocalOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'reciprocal'
         self.use_dynamic_create_class = False
 
     class XPUTestRecipocal(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "reciprocal"
             self.dtype = self.in_type
@@ -463,11 +506,13 @@ def set_case(self):
 
 
 class XPUTestSoftPlusOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'softplus'
         self.use_dynamic_create_class = False
 
     class XPUTestSoftPlusBase(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "softplus"
             self.dtype = self.in_type
@@ -485,15 +530,18 @@ def init_config(self):
             self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
 
     class XPUTestSoftPlus2(XPUTestSoftPlusBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype)
 
     class XPUTestSoftPlus3(XPUTestSoftPlusBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2,
                                        [4, 512, 15, 15]).astype(self.dtype)
 
     class XPUTestSoftPlus4(XPUTestSoftPlusBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2,
                                        [4, 256, 22, 22]).astype(self.dtype)
@@ -513,11 +561,13 @@ def ref_softplus(x, beta=1, threshold=20):
 
 # XPU_KP unittests, these ops can be found from xpu_op_kpfirst_list.h
 class XPUTestBReluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'brelu'
         self.use_dynamic_create_class = False
 
     class XPUTestBRelu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "brelu"
             self.dtype = self.in_type
@@ -544,11 +594,13 @@ def set_case(self):
 
 
 class XPUTestCeilOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'ceil'
         self.use_dynamic_create_class = False
 
     class XPUTestCeil(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "ceil"
             self.dtype = self.in_type
@@ -568,11 +620,13 @@ def set_case(self):
 
 
 class XPUTestCeluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'celu'
         self.use_dynamic_create_class = False
 
     class XPUTestCelu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "celu"
             self.dtype = self.in_type
@@ -597,11 +651,13 @@ def ref_celu(x, alpha):
 
 
 class XPUTestEluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elu'
         self.use_dynamic_create_class = False
 
     class XPUTestElu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "elu"
             self.dtype = self.in_type
@@ -626,11 +682,13 @@ def ref_elu(x, alpha):
 
 
 class XPUTestFloorOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'floor'
         self.use_dynamic_create_class = False
 
     class XPUTestFloor(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "floor"
             self.dtype = self.in_type
@@ -650,11 +708,13 @@ def set_case(self):
 
 
 class XPUTestHardShrinkOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'hard_shrink'
         self.use_dynamic_create_class = False
 
     class XPUTestHardShrink(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "hard_shrink"
             self.dtype = self.in_type
@@ -682,11 +742,13 @@ def ref_hardshrink(x, threshold):
 
 
 class XPUTestHardSigmoidOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'hard_sigmoid'
         self.use_dynamic_create_class = False
 
     class XPUTestHardSigmoid(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "hard_sigmoid"
             self.dtype = self.in_type
@@ -723,11 +785,13 @@ def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5):
 
 
 class XPUTestLog1pOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'log1p'
         self.use_dynamic_create_class = False
 
     class XPUTestLog1p(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "log1p"
             self.dtype = self.in_type
@@ -747,11 +811,13 @@ def set_case(self):
 
 
 class XPUTestLogsigmoidOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'logsigmoid'
         self.use_dynamic_create_class = False
 
     class XPUTestLogsigmoid(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "logsigmoid"
             self.dtype = self.in_type
@@ -771,11 +837,13 @@ def set_case(self):
 
 
 class XPUTestRelu6OP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'relu6'
         self.use_dynamic_create_class = False
 
     class XPUTestRelu6(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "relu6"
             self.dtype = self.in_type
@@ -803,11 +871,13 @@ def ref_relu6(x, threshold=6.0):
 
 
 class XPUTestSiluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'silu'
         self.use_dynamic_create_class = False
 
     class XPUTestSilu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "silu"
             self.dtype = self.in_type
@@ -827,11 +897,13 @@ def set_case(self):
 
 
 class XPUTestSoftReluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'soft_relu'
         self.use_dynamic_create_class = False
 
     class XPUTestSoftRelu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "soft_relu"
             self.dtype = self.in_type
@@ -858,11 +930,13 @@ def set_case(self):
 
 
 class XPUTestSoftSignOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'softsign'
         self.use_dynamic_create_class = False
 
     class XPUTestSoftSign(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "softsign"
             self.dtype = self.in_type
@@ -887,11 +961,13 @@ def ref_softsign(x):
 
 
 class XPUTestSoftshrinkOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'softshrink'
         self.use_dynamic_create_class = False
 
     class XPUTestSoftshrink(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "softshrink"
             self.dtype = self.in_type
@@ -919,11 +995,13 @@ def ref_softshrink(x, threshold=0.5):
 
 
 class XPUTestSwishOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'swish'
         self.use_dynamic_create_class = False
 
     class XPUTestSwishBase(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "swish"
             self.dtype = self.in_type
@@ -939,15 +1017,18 @@ def init_config(self):
             self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
 
     class XPUTestSwish2(XPUTestSwishBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype)
 
     class XPUTestSwish3(XPUTestSwishBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2,
                                        [4, 512, 15, 15]).astype(self.dtype)
 
     class XPUTestSwish4(XPUTestSwishBase):
+
         def init_config(self):
             self.x = np.random.uniform(-2, 2,
                                        [4, 256, 22, 22]).astype(self.dtype)
@@ -965,11 +1046,13 @@ def ref_swish(x):
 
 
 class XPUTestThresholdedReluOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'thresholded_relu'
         self.use_dynamic_create_class = False
 
     class XPUTestThresholdedRelu(TestActivationOPBase):
+
         def set_case(self):
             self.op_type = "thresholded_relu"
             self.dtype = self.in_type
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
index 6495c0af1a14d..3be4cac81ca15 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -28,6 +29,7 @@
 
 
 class XPUTestAdamOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'adam'
         self.use_dynamic_create_class = False
@@ -50,9 +52,12 @@ def setUp(self):
                 moment2_out = adam_step(self.inputs, self.attrs)
 
             self.outputs = {
-                'Moment1Out': moment1_out,
-                'Moment2Out': moment2_out,
-                'ParamOut': param_out,
+                'Moment1Out':
+                moment1_out,
+                'Moment2Out':
+                moment2_out,
+                'ParamOut':
+                param_out,
                 'Beta1PowOut':
                 np.array([self.beta1_pow]).astype("float32") * self.beta1,
                 'Beta2PowOut':
@@ -177,8 +182,8 @@ def test_check_output(self):
                 }
 
                 # Verify output for this step
-                self.check_output_with_place(
-                    place=paddle.XPUPlace(0), atol=1e-2)
+                self.check_output_with_place(place=paddle.XPUPlace(0),
+                                             atol=1e-2)
 
                 # Output of this step becomes input for next step
                 self.inputs['Param'] = param_out
@@ -254,13 +259,13 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
     param_out = np.zeros(shape=[height, row_numel])
 
     def update_row(row_id, update_value):
-        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * update_value
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 -
+                                                         beta1) * update_value
         moment2_out[row_id] = beta2 * moment2[row_id] + (
             1 - beta2) * np.square(update_value)
         lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
-        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
-            np.sqrt(moment2_out[row_id]) + epsilon))
+        param_out[row_id] = param[row_id] - lr_t * (
+            moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon))
 
     if lazy_mode:
         for idx, row_id in enumerate(rows):
@@ -276,6 +281,7 @@ def update_row(row_id, update_value):
 
 
 class TestSparseAdamOp(unittest.TestCase):
+
     def setup(self, scope, place, lazy_mode):
         beta1 = 0.78
         beta2 = 0.836
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py
index 99e9fdd123eb1..1ccf8a1fdaaf4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import sys
+
 sys.path.append("..")
 
 import unittest
@@ -76,11 +77,13 @@ def simple_lr_setting(param, decay_rate, n_layers):
 
 
 class XPUTestAdamwOp1(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'adamw'
         self.use_dynamic_create_class = False
 
     class TestAdamW(XPUOpTest):
+
         def setUp(self):
             #Test AdamW Op with supplied attributes
             self.op_type = "adamw"
@@ -136,20 +139,26 @@ def test_check_output(self):
             self.check_output_with_place(place=paddle.XPUPlace(0))
 
     class TestAdamW2(TestAdamW):
+
         def init_shape(self):
-            self.shape = [1000, ]
+            self.shape = [
+                1000,
+            ]
 
     class TestAdamW3(TestAdamW):
+
         def init_shape(self):
             self.shape = [200, 3000]
 
 
 class XPUTestAdamwOp2(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'adamw'
         self.use_dynamic_create_class = False
 
     class TestAdamWOp(unittest.TestCase):
+
         def test_adamw_op_dygraph(self):
             paddle.disable_static()
             value = np.arange(26).reshape(2, 13).astype(self.in_type_str)
@@ -203,12 +212,11 @@ def test_adamw_op(self):
                         dtype=self.in_type_str,
                         persistable=True)
                     betas = [beta1, beta2]
-                    opt = paddle.optimizer.AdamW(
-                        learning_rate=1e-5,
-                        beta1=beta1,
-                        beta2=beta2,
-                        weight_decay=0.01,
-                        epsilon=1e-8)
+                    opt = paddle.optimizer.AdamW(learning_rate=1e-5,
+                                                 beta1=beta1,
+                                                 beta2=beta2,
+                                                 weight_decay=0.01,
+                                                 epsilon=1e-8)
                     opt.minimize(loss)
 
             exe.run(startup)
@@ -223,16 +231,20 @@ def test_adamw_op_invalid_input(self):
             paddle.disable_static()
             linear = paddle.nn.Linear(10, 10)
             with self.assertRaises(ValueError):
-                adam = paddle.optimizer.AdamW(
-                    0.1, beta1=-1, parameters=linear.parameters())
+                adam = paddle.optimizer.AdamW(0.1,
+                                              beta1=-1,
+                                              parameters=linear.parameters())
             with self.assertRaises(ValueError):
-                adam = paddle.optimizer.AdamW(
-                    0.1, beta2=-1, parameters=linear.parameters())
+                adam = paddle.optimizer.AdamW(0.1,
+                                              beta2=-1,
+                                              parameters=linear.parameters())
             with self.assertRaises(ValueError):
-                adam = paddle.optimizer.AdamW(
-                    0.1, epsilon=-1, parameters=linear.parameters())
+                adam = paddle.optimizer.AdamW(0.1,
+                                              epsilon=-1,
+                                              parameters=linear.parameters())
 
     class TestAdamWOpGroup(TestAdamWOp):
+
         def test_adamw_op_dygraph(self):
             paddle.disable_static()
             value = np.arange(26).reshape(2, 13).astype(self.in_type_str)
@@ -258,6 +270,7 @@ def test_adamw_op_dygraph(self):
                 adam.clear_gradients()
 
     class TestAdamWOpGroupWithLR(TestAdamWOp):
+
         def test_adamw_op_dygraph(self):
             paddle.disable_static()
             value = np.arange(26).reshape(2, 13).astype(self.in_type_str)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py
index 3385d671d7332..b78648f1d7f0f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py
@@ -18,6 +18,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 
 import unittest
@@ -40,6 +41,7 @@ def affine_channel(x, scale, bias, layout):
 
 
 class TestAffineChannelOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "affine_channel"
         self.init_test_case()
@@ -70,15 +72,17 @@ def test_check_grad_stopgrad_dx(self):
         if core.is_compiled_with_xpu():
             paddle.enable_static()
             place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['Scale', 'Bias'], 'Out', no_grad_set=set('X'))
+            self.check_grad_with_place(place, ['Scale', 'Bias'],
+                                       'Out',
+                                       no_grad_set=set('X'))
 
     def test_check_grad_stopgrad_dscale_dbias(self):
         if core.is_compiled_with_xpu():
             paddle.enable_static()
             place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['X'], 'Out', no_grad_set=set(['Scale', 'Bias']))
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       no_grad_set=set(['Scale', 'Bias']))
 
     def init_test_case(self):
         self.shape = [2, 100, 3, 3]
@@ -87,6 +91,7 @@ def init_test_case(self):
 
 
 class TestAffineChannelOpError(unittest.TestCase):
+
     def test_errors(self):
         with fluid.program_guard(fluid.Program()):
 
@@ -97,28 +102,32 @@ def test_x_type():
             self.assertRaises(TypeError, test_x_type)
 
             def test_x_dtype():
-                x2 = fluid.layers.data(
-                    name='x2', shape=[None, 1, 2, 2], dtype='int32')
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[None, 1, 2, 2],
+                                       dtype='int32')
                 fluid.layers.affine_channel(x2)
 
             self.assertRaises(TypeError, test_x_dtype)
 
             def test_scale_type():
-                x3 = fluid.layers.data(
-                    name='x3', shape=[None, 1, 2, 2], dtype='float32')
+                x3 = fluid.layers.data(name='x3',
+                                       shape=[None, 1, 2, 2],
+                                       dtype='float32')
                 fluid.layers.affine_channel(x3, scale=1)
 
             self.assertRaises(TypeError, test_scale_type)
 
             def test_bias_type():
-                x4 = fluid.layers.data(
-                    name='x4', shape=[None, 1, 2, 2], dtype='float32')
+                x4 = fluid.layers.data(name='x4',
+                                       shape=[None, 1, 2, 2],
+                                       dtype='float32')
                 fluid.layers.affine_channel(x4, bias=1)
 
             self.assertRaises(TypeError, test_bias_type)
 
 
 class TestAffineChannelNHWC(TestAffineChannelOp):
+
     def init_test_case(self):
         self.shape = [2, 3, 3, 100]
         self.C = 100
@@ -132,6 +141,7 @@ def test_check_grad_stopgrad_dscale_dbias(self):
 
 
 class TestAffineChannel2D(TestAffineChannelOp):
+
     def init_test_case(self):
         self.shape = [2, 100]
         self.C = 100
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py
index 9a2976f82a460..3ef4701cdf3d0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import sys
+
 sys.path.append("..")
 import paddle
 import unittest
@@ -20,10 +21,12 @@
 from op_test_xpu import XPUOpTest
 from op_test import OpTest, skip_check_grad_ci
 import paddle.fluid as fluid
+
 paddle.enable_static()
 
 
 class TestCheckFiniteAndUnscaleOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "check_finite_and_unscale"
         self.init_dtype()
@@ -65,7 +68,7 @@ def test_check_output(self):
 #         self.dtype = np.float32
 
 #     def test_check_output(self):
-#         # When input contains nan, do not check the output, 
+#         # When input contains nan, do not check the output,
 #         # since the output may be nondeterministic and will be discarded.
 #         if paddle.is_compiled_with_xpu():
 #             place = paddle.XPUPlace(0)
@@ -89,7 +92,7 @@ def test_check_output(self):
 #         self.dtype = np.float32
 
 #     def test_check_output(self):
-#         # When input contains inf, do not check the output, 
+#         # When input contains inf, do not check the output,
 #         # since the output may be nondeterministic and will be discarded.
 #         if paddle.is_compiled_with_xpu():
 #             place = paddle.XPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py
index 519a185250ab0..792a729d1fadf 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -28,10 +29,12 @@
 
 
 class XPUTestArgMax(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'arg_max'
 
     class XPUBaseTestCase(XPUOpTest):
+
         def initTestCase(self):
             self.dims = (3, 4)
             self.axis = 1
@@ -52,51 +55,61 @@ def test_check_output(self):
                 self.check_output_with_place(place)
 
     class TestArgMaxCase1(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, 4, 5)
             self.axis = -1
 
     class TestArgMaxCase2(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, 4, 5)
             self.axis = 0
 
     class TestArgMaxCase3(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, 4, 5)
             self.axis = 1
 
     class TestArgMaxCase4(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, 4, 5)
             self.axis = 2
 
     class TestArgMaxCase5(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, 4)
             self.axis = -1
 
     class TestArgMaxCase6(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, 4)
             self.axis = 0
 
     class TestArgMaxCase7(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, 4)
             self.axis = 1
 
     class TestArgMaxCase8(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (1, )
             self.axis = 0
 
     class TestArgMaxCase9(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (2, )
             self.axis = 0
 
     class TestArgMaxCase10(XPUBaseTestCase):
+
         def initTestCase(self):
             self.dims = (3, )
             self.axis = 0
@@ -108,6 +121,7 @@ def initTestCase(self):
 
 
 class TestArgMaxAPI(unittest.TestCase):
+
     def initTestCase(self):
         self.dims = (3, 4, 5)
         self.dtype = 'float32'
@@ -119,6 +133,7 @@ def setUp(self):
         self.place = [paddle.XPUPlace(0)]
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
@@ -126,8 +141,8 @@ def run(place):
             tensor_input = paddle.to_tensor(numpy_input)
             numpy_output = np.argmax(numpy_input, axis=self.axis)
             paddle_output = paddle.argmax(tensor_input, axis=self.axis)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             paddle.enable_static()
 
         for place in self.place:
@@ -135,6 +150,7 @@ def run(place):
 
 
 class TestArgMaxAPI_2(unittest.TestCase):
+
     def initTestCase(self):
         self.dims = (3, 4, 5)
         self.dtype = 'float32'
@@ -147,17 +163,19 @@ def setUp(self):
         self.place = [paddle.XPUPlace(0)]
 
     def test_dygraph_api(self):
+
         def run(place):
             paddle.disable_static(place)
             np.random.seed(2021)
             numpy_input = (np.random.random(self.dims)).astype(self.dtype)
             tensor_input = paddle.to_tensor(numpy_input)
-            numpy_output = np.argmax(
-                numpy_input, axis=self.axis).reshape(1, 4, 5)
-            paddle_output = paddle.argmax(
-                tensor_input, axis=self.axis, keepdim=self.keep_dims)
-            self.assertEqual(
-                np.allclose(numpy_output, paddle_output.numpy()), True)
+            numpy_output = np.argmax(numpy_input,
+                                     axis=self.axis).reshape(1, 4, 5)
+            paddle_output = paddle.argmax(tensor_input,
+                                          axis=self.axis,
+                                          keepdim=self.keep_dims)
+            self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()),
+                             True)
             self.assertEqual(numpy_output.shape, paddle_output.numpy().shape)
             paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py
index 4290c0abf122a..7f7ee2e7a124d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -28,6 +29,7 @@
 
 
 class XPUTestArgsortOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'argsort'
         self.use_dynamic_create_class = True
@@ -44,6 +46,7 @@ def dynamic_create_class(self):
         return base_class, classes
 
     class TestArgsortOp(XPUOpTest):
+
         def setUp(self):
             self.set_xpu()
             self.op_type = "argsort"
@@ -57,9 +60,10 @@ def setUp(self):
             if self.dtype == np.float32:
                 self.x = np.random.random(self.input_shape).astype(self.dtype)
             else:
-                self.x = np.random.randint(
-                    low=-1000, high=1000,
-                    size=self.input_shape).astype(self.dtype)
+                self.x = np.random.randint(low=-1000,
+                                           high=1000,
+                                           size=self.input_shape).astype(
+                                               self.dtype)
 
             self.inputs = {"X": self.x}
             self.attrs = {"axis": self.axis, "descending": self.descending}
@@ -69,15 +73,14 @@ def setUp(self):
         def get_output(self):
             if self.descending:
                 self.indices = np.flip(
-                    np.argsort(
-                        self.x, kind='heapsort', axis=self.axis),
+                    np.argsort(self.x, kind='heapsort', axis=self.axis),
                     self.axis)
                 self.sorted_x = np.flip(
-                    np.sort(
-                        self.x, kind='heapsort', axis=self.axis), self.axis)
+                    np.sort(self.x, kind='heapsort', axis=self.axis), self.axis)
             else:
-                self.indices = np.argsort(
-                    self.x, kind='heapsort', axis=self.axis)
+                self.indices = np.argsort(self.x,
+                                          kind='heapsort',
+                                          axis=self.axis)
                 self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis)
 
         def set_xpu(self):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
index 6c2fe6ba93033..2175243ef1ddb 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -71,8 +72,8 @@ def ref_batch_norm_train(x, y_grad, scale, bias, mean, variance, momentum,
         saved_mean_tile = np.tile(saved_mean_tile, (n, 1, h, w))
         saved_variance_tile = np.reshape(saved_variance, (1, c, 1, 1))
         saved_variance_tile = np.tile(saved_variance_tile, (n, 1, h, w))
-        normalized_x = (
-            x - saved_mean_tile) / np.sqrt(saved_variance_tile + epsilon)
+        normalized_x = (x - saved_mean_tile) / np.sqrt(saved_variance_tile +
+                                                       epsilon)
         scale_tile = np.reshape(scale, (1, c, 1, 1))
         scale_tile = np.tile(scale_tile, (n, 1, h, w))
         bias_tile = np.reshape(bias, (1, c, 1, 1))
@@ -109,9 +110,8 @@ def ref_batch_norm_train(x, y_grad, scale, bias, mean, variance, momentum,
         x = np.transpose(x, (0, 2, 3, 1))
         y_grad = np.transpose(y_grad, (0, 2, 3, 1))
     x_grad = scale * (
-        y_grad - np.mean(
-            y_grad, axis=(0, 1, 2)) - (x - saved_mean) * np.mean(
-                y_grad * (x - saved_mean), axis=(0, 1, 2)) /
+        y_grad - np.mean(y_grad, axis=(0, 1, 2)) -
+        (x - saved_mean) * np.mean(y_grad * (x - saved_mean), axis=(0, 1, 2)) /
         (saved_variance + epsilon)) / np.sqrt(saved_variance + epsilon)
     scale_grad = np.sum(y_grad * (x - saved_mean) /
                         np.sqrt(saved_variance + epsilon),
@@ -126,6 +126,7 @@ def ref_batch_norm_train(x, y_grad, scale, bias, mean, variance, momentum,
 
 
 class XPUTestBatchNormOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'batch_norm'
         self.use_dynamic_create_class = False
@@ -133,6 +134,7 @@ def __init__(self):
     @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                      "core is not compiled with XPU")
     class TestBatchNormOp(unittest.TestCase):
+
         def setUp(self):
             self.op_type = "batch_norm"
             self.dtype = np.float32
@@ -154,10 +156,10 @@ def setUp(self):
                     + self.data_layout)
             np.random.seed(1024)
             self.x_np = np.random.random_sample(self.shape).astype(self.dtype)
-            self.scale_np = np.random.random_sample(
-                [channel_size]).astype(self.dtype)
-            self.bias_np = np.random.random_sample(
-                [channel_size]).astype(self.dtype)
+            self.scale_np = np.random.random_sample([channel_size
+                                                     ]).astype(self.dtype)
+            self.bias_np = np.random.random_sample([channel_size
+                                                    ]).astype(self.dtype)
             self.mean_np = np.zeros([channel_size]).astype(self.dtype)
             self.variance_np = np.ones([channel_size]).astype(self.dtype)
             self.saved_mean_np = np.zeros([channel_size]).astype(self.dtype)
@@ -197,9 +199,10 @@ def test_infer(self):
                     'Variance': self.variance_np
                 },
                                  fetch_list=[y])
-            y_np_ref = ref_batch_norm_infer(
-                self.x_np, self.scale_np, self.bias_np, self.mean_np,
-                self.variance_np, self.momentum, self.epsilon, self.data_layout)
+            y_np_ref = ref_batch_norm_infer(self.x_np, self.scale_np,
+                                            self.bias_np, self.mean_np,
+                                            self.variance_np, self.momentum,
+                                            self.epsilon, self.data_layout)
             self.assertEqual(np.allclose(y_np_ref, y_np), True)
 
         def test_train(self):
@@ -244,10 +247,9 @@ def test_train(self):
                     arg_name = var_name
                     np_value = inputs[var_name]
                     if not block.has_var(var_name):
-                        block.create_var(
-                            name=var_name,
-                            shape=np_value.shape,
-                            dtype=np_value.dtype)
+                        block.create_var(name=var_name,
+                                         shape=np_value.shape,
+                                         dtype=np_value.dtype)
                     input_vars[arg_name] = block.var(var_name)
                 fetch_list = []
                 output_vars = {}
@@ -255,21 +257,19 @@ def test_train(self):
                     arg_name = var_name
                     np_value = outputs[var_name]
                     if not block.has_var(var_name):
-                        block.create_var(
-                            name=var_name,
-                            shape=np_value.shape,
-                            dtype=np_value.dtype)
+                        block.create_var(name=var_name,
+                                         shape=np_value.shape,
+                                         dtype=np_value.dtype)
                     if var_name == 'Mean':
                         arg_name = 'MeanOut'  # Share memory
                     if var_name == 'Variance':
                         arg_name = 'VarianceOut'  # Share memory
                     output_vars[arg_name] = block.var(var_name)
                     fetch_list.append(var_name)
-                batch_norm_op = block.append_op(
-                    type="batch_norm",
-                    inputs=input_vars,
-                    outputs=output_vars,
-                    attrs=attrs)
+                batch_norm_op = block.append_op(type="batch_norm",
+                                                inputs=input_vars,
+                                                outputs=output_vars,
+                                                attrs=attrs)
                 # Generate the backward op_desc of batch_norm
                 grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
                     batch_norm_op.desc, set(), [])
@@ -281,10 +281,10 @@ def test_train(self):
                 outs = exe.run(program, feed=inputs, fetch_list=fetch_list)
                 for id, name in enumerate(fetch_list):
                     self.assertEqual(
-                        np.allclose(
-                            outputs[name], outs[id], atol=1e-4), True)
+                        np.allclose(outputs[name], outs[id], atol=1e-4), True)
 
     class TestBatchNormOpUseGlobalStats(unittest.TestCase):
+
         def setUp(self):
             self.places = [paddle.XPUPlace(0)]
             self.init_test()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py
index a8173f054a133..b10a6210d34d2 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import paddle
 import paddle.fluid as fluid
@@ -31,11 +32,13 @@ def bce_loss(input, label):
 
 
 class XPUTestBceLossOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'bce_loss'
         self.use_dynamic_create_class = False
 
     class TestBceLossOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "bce_loss"
             self.dtype = self.in_type
@@ -59,10 +62,12 @@ def init_test_case(self):
             self.shape = [10, 10]
 
     class TestBceLossOpCase1(TestBceLossOp):
+
-        def init_test_cast(self):
+        def init_test_case(self):  # override base shape hook: 4-D input
             self.shape = [2, 3, 4, 5]
 
     class TestBceLossOpCase2(TestBceLossOp):
+
-        def init_test_cast(self):
+        def init_test_case(self):  # override base shape hook: 3-D input
             self.shape = [2, 3, 20]
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py
index ddc2b49ebe08e..9f15b72fe7d8b 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py
@@ -20,6 +20,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py
index 19dae7068cbef..60abc31922d0d 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from paddle.nn.functional import interpolate
 import paddle
@@ -24,6 +25,7 @@
 import unittest
 import paddle.fluid as fluid
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
 paddle.enable_static()
 
 
@@ -108,11 +110,13 @@ def bilinear_interp_np(input,
 
 
 class XPUTestBilinearInterpV2Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'bilinear_interp_v2'
         self.use_dynamic_create_class = False
 
     class TestBilinearInterpOp(XPUOpTest):
+
         def setUp(self):
             self.out_size = None
             self.actual_shape = None
@@ -146,9 +150,10 @@ def setUp(self):
                 out_h = self.out_h
                 out_w = self.out_w
 
-            output_np = bilinear_interp_np(
-                input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape,
-                self.align_corners, self.align_mode, self.data_layout)
+            output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0,
+                                           self.out_size, self.actual_shape,
+                                           self.align_corners, self.align_mode,
+                                           self.data_layout)
             self.inputs = {'X': input_np}
             if self.out_size is not None:
                 self.inputs['OutSize'] = self.out_size
@@ -192,6 +197,7 @@ def init_place(self):
             self.place = paddle.XPUPlace(0)
 
     class TestBilinearInterpCase1(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [4, 1, 7, 8]
@@ -202,6 +208,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpCase2(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [3, 3, 9, 6]
@@ -212,6 +219,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpCase3(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [1, 1, 32, 64]
@@ -222,6 +230,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpCase4(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [4, 1, 7, 8]
@@ -233,6 +242,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpCase5(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [3, 3, 9, 6]
@@ -244,6 +254,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpCase6(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [1, 1, 32, 64]
@@ -255,6 +266,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpCase7(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [1, 1, 32, 64]
@@ -265,6 +277,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpSame(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [2, 3, 32, 64]
@@ -275,6 +288,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpActualShape(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [3, 2, 32, 16]
@@ -286,21 +300,25 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+
         def set_align_mode(self):
             self.align_corners = False
             self.align_mode = 1
 
     class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+
         def set_align_mode(self):
             self.align_corners = False
             self.align_mode = 0
 
     class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+
         def set_align_mode(self):
             self.align_corners = True
             self.align_mode = 0
 
     class TestBilinearInterpScale1(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [2, 3, 5, 7]
@@ -311,6 +329,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpScale2(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [2, 3, 5, 7]
@@ -321,6 +340,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpScale3(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [2, 3, 5, 7]
@@ -331,6 +351,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpScale4(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [2, 3, 5, 7]
@@ -341,6 +362,7 @@ def init_test_case(self):
             self.align_mode = 1
 
     class TestBilinearInterpZero(TestBilinearInterpOp):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [2, 3, 5, 7]
@@ -351,6 +373,7 @@ def init_test_case(self):
             self.align_mode = 0
 
     class TestBilinearInterpOp_attr_tensor(XPUOpTest):
+
         def setUp(self):
             self.out_size = None
             self.actual_shape = None
@@ -427,8 +450,9 @@ def init_place(self):
             self.place = paddle.XPUPlace(0)
 
     # out_size is a 1-D tensor
-    class TestBilinearInterp_attr_tensor_Case1(
-            TestBilinearInterpOp_attr_tensor):
+    class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor
+                                               ):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [3, 3, 9, 6]
@@ -439,8 +463,9 @@ def init_test_case(self):
             self.align_corners = True
 
     # scale is a 1-D tensor
-    class TestBilinearInterp_attr_tensor_Case2(
-            TestBilinearInterpOp_attr_tensor):
+    class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor
+                                               ):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [3, 2, 32, 16]
@@ -452,8 +477,9 @@ def init_test_case(self):
             self.shape_by_1Dtensor = True
 
     # scale is a 1-D tensor
-    class TestBilinearInterp_attr_tensor_Case3(
-            TestBilinearInterpOp_attr_tensor):
+    class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor
+                                               ):
+
         def init_test_case(self):
             self.interp_method = 'bilinear'
             self.input_shape = [3, 2, 32, 16]
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py
index 9a1c9a61fff78..ea86f3f86614b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -29,10 +30,12 @@
 
 ################## TEST OP: BitwiseAnd ##################
 class XPUTestBitwiseAnd(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'bitwise_and'
 
     class XPUTestBitwiseAndBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -41,10 +44,14 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'bitwise_and'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
-            y = np.random.randint(
-                self.low, self.high, self.y_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
+            y = np.random.randint(self.low,
+                                  self.high,
+                                  self.y_shape,
+                                  dtype=self.dtype)
             out = np.bitwise_and(x, y)
 
             self.attrs = {'use_xpu': True}
@@ -68,6 +75,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestBitwiseAndCase1(XPUTestBitwiseAndBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [4, 5]
@@ -76,6 +84,7 @@ def init_case(self):
             self.high = 100
 
     class XPUTestBitwiseAndCase2(XPUTestBitwiseAndBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [2, 3, 4, 5]
@@ -84,6 +93,7 @@ def init_case(self):
             self.high = 100
 
     class XPUTestBitwiseAndCase3(XPUTestBitwiseAndBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [2, 3, 4, 5]
@@ -99,10 +109,12 @@ def init_case(self):
 
 ################## TEST OP: BitwiseOr ##################
 class XPUTestBitwiseOr(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'bitwise_or'
 
     class XPUTestBitwiseOrBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -111,10 +123,14 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'bitwise_or'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
-            y = np.random.randint(
-                self.low, self.high, self.y_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
+            y = np.random.randint(self.low,
+                                  self.high,
+                                  self.y_shape,
+                                  dtype=self.dtype)
             out = np.bitwise_or(x, y)
 
             self.attrs = {'use_xpu': True}
@@ -138,6 +154,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestBitwiseOrCase1(XPUTestBitwiseOrBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [4, 5]
@@ -146,6 +163,7 @@ def init_case(self):
             self.high = 100
 
     class XPUTestBitwiseOrCase2(XPUTestBitwiseOrBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [2, 3, 4, 5]
@@ -154,6 +172,7 @@ def init_case(self):
             self.high = 100
 
     class XPUTestBitwiseOrCase3(XPUTestBitwiseOrBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [2, 3, 4, 5]
@@ -169,10 +188,12 @@ def init_case(self):
 
 ################## TEST OP: BitwiseXor ##################
 class XPUTestBitwiseXor(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'bitwise_xor'
 
     class XPUTestBitwiseXorBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -181,10 +202,14 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'bitwise_xor'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
-            y = np.random.randint(
-                self.low, self.high, self.y_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
+            y = np.random.randint(self.low,
+                                  self.high,
+                                  self.y_shape,
+                                  dtype=self.dtype)
             out = np.bitwise_xor(x, y)
 
             self.attrs = {'use_xpu': True}
@@ -208,6 +233,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestBitwiseXorCase1(XPUTestBitwiseXorBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [4, 5]
@@ -216,6 +242,7 @@ def init_case(self):
             self.high = 100
 
     class XPUTestBitwiseXorCase2(XPUTestBitwiseXorBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [2, 3, 4, 5]
@@ -224,6 +251,7 @@ def init_case(self):
             self.high = 100
 
     class XPUTestBitwiseXorCase3(XPUTestBitwiseXorBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [2, 3, 4, 5]
@@ -239,10 +267,12 @@ def init_case(self):
 
 ##################  TEST OP: BitwiseNot ##################
 class XPUTestBitwiseNot(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'bitwise_not'
 
     class XPUTestBitwiseNotBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -251,8 +281,10 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'bitwise_not'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
             out = np.bitwise_not(x)
 
             self.attrs = {'use_xpu': True}
@@ -272,6 +304,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestBitwiseNotBool(XPUTestBitwiseNotBase):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py
index f6893150c9e61..164908495b104 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py
@@ -13,6 +13,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -39,6 +40,7 @@ def __init__(self):
         self.use_dynamic_create_class = False
 
     class TestBmmOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.set_xpu()
@@ -71,26 +73,31 @@ def test_check_grad_normal(self):
             self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     class TestBmmOp1(TestBmmOp):
+
         def set_shape(self):
             self.Xshape = (3, 3, 3)
             self.Yshape = (3, 3, 3)
 
     class TestBmmOp2(TestBmmOp):
+
         def set_shape(self):
             self.Xshape = (128, 3, 16)
             self.Yshape = (128, 16, 3)
 
     class TestBmmOp3(TestBmmOp):
+
         def set_shape(self):
             self.Xshape = (2048, 16, 27)
             self.Yshape = (2048, 27, 16)
 
     class TestBmmOp4(TestBmmOp):
+
         def set_shape(self):
             self.Xshape = (2, 27, 27)
             self.Yshape = (2, 27, 27)
 
     class TestBmmOp5(TestBmmOp):
+
         def set_shape(self):
             self.Xshape = (2, 1, 1)
             self.Yshape = (2, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
index 201e758c0acea..cd7062f66d995 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py
@@ -37,6 +37,7 @@
 
 
 class XPUTestCastOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'cast'
         self.use_dynamic_create_class = True
@@ -51,6 +52,7 @@ def dynamic_create_class(self):
         return base_class, classes
 
     class TestCastOp(XPUOpTest):
+
         def setUp(self):
             ipt = np.random.random(size=[10, 10])
             in_typename = self.in_type_str
@@ -76,11 +78,12 @@ def test_check_output(self):
 
 
 class TestCastOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of cast_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.XPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.XPUPlace(0))
             self.assertRaises(TypeError, fluid.layers.cast, x1, 'int32')
 
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py
index 8698df9e7ee75..074acf2112fc9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -26,6 +27,7 @@
 
 
 class TestXPUClipByNormOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "clip_by_norm"
         self.dtype = np.float32
@@ -34,7 +36,9 @@ def setUp(self):
         self.initTestCase()
         input = np.random.random(self.shape).astype("float32")
         input[np.abs(input) < self.max_relative_error] = 0.5
-        self.inputs = {'X': input, }
+        self.inputs = {
+            'X': input,
+        }
         self.attrs = {}
         self.attrs['max_norm'] = self.max_norm
         norm = np.sqrt(np.sum(np.square(input)))
@@ -56,18 +60,21 @@ def initTestCase(self):
 
 
 class TestCase1(TestXPUClipByNormOp):
+
     def initTestCase(self):
         self.shape = (100, )
         self.max_norm = 1e20
 
 
 class TestCase2(TestXPUClipByNormOp):
+
     def initTestCase(self):
         self.shape = (16, 16)
         self.max_norm = 0.1
 
 
 class TestCase3(TestXPUClipByNormOp):
+
     def initTestCase(self):
         self.shape = (4, 8, 16)
         self.max_norm = 1.0
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
index 7f8f5d6bc747b..33198a28933a5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -30,11 +31,13 @@
 
 
 class XPUTestClipOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'clip'
         self.use_dynamic_create_class = False
 
     class TestClipOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.set_xpu()
@@ -91,24 +94,28 @@ def test_check_output(self):
             paddle.disable_static()
 
     class TestClipOp1(TestClipOp):
+
         def init_data(self):
             self.shape = (8, 16, 8)
             self.max = 0.7
             self.min = 0.0
 
     class TestClipOp2(TestClipOp):
+
         def init_data(self):
             self.shape = (8, 16)
             self.max = 1.0
             self.min = 0.0
 
     class TestClipOp3(TestClipOp):
+
         def init_data(self):
             self.shape = (4, 8, 16)
             self.max = 0.7
             self.min = 0.2
 
     class TestClipOp4(TestClipOp):
+
         def init_data(self):
             self.shape = (4, 8, 8)
             self.max = 0.7
@@ -117,6 +124,7 @@ def init_data(self):
             self.inputs['Min'] = np.array([0.3]).astype('float32')
 
     class TestClipOp5(TestClipOp):
+
         def init_data(self):
             self.shape = (4, 8, 16)
             self.max = 0.5
@@ -124,6 +132,7 @@ def init_data(self):
 
 
 class TestClipOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -143,6 +152,7 @@ def test_dtype():
 
 
 class TestClipAPI(unittest.TestCase):
+
     def _executed_api(self, x, min=None, max=None):
         return paddle.clip(x, min, max)
 
@@ -154,8 +164,8 @@ def test_clip(self):
         min = fluid.data(name='min', shape=[1], dtype='float32')
         max = fluid.data(name='max', shape=[1], dtype='float32')
 
-        place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu(
-        ) else fluid.CPUPlace()
+        place = fluid.XPUPlace(
+            0) if fluid.core.is_compiled_with_xpu() else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         out_1 = self._executed_api(images, min=min, max=max)
@@ -174,9 +184,7 @@ def test_clip(self):
                 "min": np.array([0.2]).astype('float32'),
                 "max": np.array([0.8]).astype('float32')
             },
-            fetch_list=[
-                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
-            ])
+            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8])
 
         self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8)))
         self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9)))
@@ -190,8 +198,8 @@ def test_clip(self):
 
     def test_clip_dygraph(self):
         paddle.disable_static()
-        place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu(
-        ) else fluid.CPUPlace()
+        place = fluid.XPUPlace(
+            0) if fluid.core.is_compiled_with_xpu() else fluid.CPUPlace()
         paddle.disable_static(place)
         data_shape = [1, 9, 9, 4]
         data = np.random.random(data_shape).astype('float32')
@@ -219,6 +227,7 @@ def test_errors(self):
 
 
 class TestInplaceClipAPI(TestClipAPI):
+
     def _executed_api(self, x, min=None, max=None):
         return x.clip_(min, max)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py
index 32b27652f7692..a4175ec25cf1b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -26,7 +27,9 @@
 
 
 def create_test_class(op_type, typename, callback):
+
     class Cls(OpTest):
+
         def setUp(self):
             a = np.random.random(size=(10, 7)).astype(typename)
             b = np.random.random(size=(10, 7)).astype(typename)
@@ -49,12 +52,11 @@ def test_errors(self):
                 y = fluid.layers.data(name='y', shape=[2], dtype='int32')
                 a = fluid.layers.data(name='a', shape=[2], dtype='int16')
                 if self.op_type == "less_than":
-                    self.assertRaises(
-                        TypeError,
-                        fluid.layers.less_than,
-                        x=x,
-                        y=y,
-                        force_cpu=1)
+                    self.assertRaises(TypeError,
+                                      fluid.layers.less_than,
+                                      x=x,
+                                      y=y,
+                                      force_cpu=1)
                 op = eval("fluid.layers.%s" % self.op_type)
                 self.assertRaises(TypeError, op, x=x, y=y, cond=1)
                 self.assertRaises(TypeError, op, x=x, y=a)
@@ -78,14 +80,16 @@ def test_errors(self):
 
 
 def create_paddle_case(op_type, callback):
+
     class PaddleCls(unittest.TestCase):
+
         def setUp(self):
             self.op_type = op_type
             self.input_x = np.array([1, 2, 3, 4]).astype(np.int64)
             self.input_y = np.array([1, 3, 2, 4]).astype(np.int64)
             self.real_result = callback(self.input_x, self.input_y)
-            self.place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu(
-            ) else fluid.CPUPlace()
+            self.place = fluid.XPUPlace(
+                0) if fluid.core.is_compiled_with_xpu() else fluid.CPUPlace()
 
         def test_api(self):
             paddle.enable_static()
@@ -95,8 +99,10 @@ def test_api(self):
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = fluid.Executor(self.place)
-                res, = exe.run(feed={"x": self.input_x,
-                                     "y": self.input_y},
+                res, = exe.run(feed={
+                    "x": self.input_x,
+                    "y": self.input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == self.real_result).all(), True)
 
@@ -109,8 +115,10 @@ def test_api_float(self):
                     op = eval("paddle.%s" % (self.op_type))
                     out = op(x, y)
                     exe = fluid.Executor(self.place)
-                    res, = exe.run(feed={"x": self.input_x,
-                                         "y": 1.0},
+                    res, = exe.run(feed={
+                        "x": self.input_x,
+                        "y": 1.0
+                    },
                                    fetch_list=[out])
                 self.real_result = np.array([1, 0, 0, 0]).astype(np.int64)
                 self.assertEqual((res == self.real_result).all(), True)
@@ -145,6 +153,7 @@ def test_dynamic_api_float(self):
                 paddle.enable_static()
 
         def test_assert(self):
+
             def test_dynamic_api_string(self):
                 if self.op_type == "equal":
                     paddle.disable_static()
@@ -168,8 +177,9 @@ def test_dynamic_api_bool(self):
         def test_broadcast_api_1(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
-                x = paddle.static.data(
-                    name='x', shape=[1, 2, 1, 3], dtype='int32')
+                x = paddle.static.data(name='x',
+                                       shape=[1, 2, 1, 3],
+                                       dtype='int32')
                 y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32')
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
@@ -177,8 +187,10 @@ def test_broadcast_api_1(self):
                 input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
                 input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -186,16 +198,19 @@ def test_broadcast_api_2(self):
             paddle.enable_static()
             with program_guard(Program(), Program()):
                 x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32')
-                y = paddle.static.data(
-                    name='y', shape=[1, 2, 1, 3], dtype='int32')
+                y = paddle.static.data(name='y',
+                                       shape=[1, 2, 1, 3],
+                                       dtype='int32')
                 op = eval("paddle.%s" % (self.op_type))
                 out = op(x, y)
                 exe = paddle.static.Executor(self.place)
                 input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32)
                 input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -210,8 +225,10 @@ def test_broadcast_api_3(self):
                 input_x = np.arange(0, 5).reshape((5)).astype(np.int32)
                 input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -226,8 +243,10 @@ def test_bool_api_4(self):
                 input_x = np.array([True, False, True]).astype(np.bool)
                 input_y = np.array([True, True, False]).astype(np.bool)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
@@ -242,8 +261,10 @@ def test_bool_broadcast_api_4(self):
                 input_x = np.array([True, False, True]).astype(np.bool)
                 input_y = np.array([True]).astype(np.bool)
                 real_result = callback(input_x, input_y)
-                res, = exe.run(feed={"x": input_x,
-                                     "y": input_y},
+                res, = exe.run(feed={
+                    "x": input_x,
+                    "y": input_y
+                },
                                fetch_list=[out])
             self.assertEqual((res == real_result).all(), True)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py
index 3f188e78f86c2..2355f5de9fd42 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py
@@ -31,11 +31,13 @@
 
 
 class XPUTestConcatOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'concat'
         self.use_dynamic_create_class = False
 
     class TestConcatOp(XPUOpTest):
+
         def setUp(self):
             self.set_xpu()
             self.op_type = "concat"
@@ -54,8 +56,9 @@ def setUp(self):
                 self.actual_axis = self.axis
 
             self.outputs = {
-                'Out': np.concatenate(
-                    (self.x0, self.x1, self.x2), axis=self.actual_axis)
+                'Out':
+                np.concatenate((self.x0, self.x1, self.x2),
+                               axis=self.actual_axis)
             }
 
         def set_inputs(self):
@@ -84,10 +87,12 @@ def test_check_grad(self):
                 self.check_grad_with_place(place, ['x2'], 'Out')
 
     class TestConcatOpAxis0XPU(TestConcatOp):
+
         def init_axis(self):
             self.axis = 0
 
     class TestConcatOpAxis1XPU(TestConcatOp):
+
         def set_inputs(self):
             self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
             self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
@@ -97,28 +102,34 @@ def init_axis(self):
             self.axis = 1
 
     class TestConcatOpAxis2XPU(TestConcatOp):
+
         def init_axis(self):
             self.axis = 2
 
     class TestConcatOpAxis3XPU(TestConcatOp):
+
         def init_axis(self):
             self.axis = 3
 
     class TestConcatOpAxisNeg1XPU(TestConcatOp):
+
         def init_axis(self):
             self.axis = -1
 
     class TestConcatOpAxisNeg2XPU(TestConcatOp):
+
         def init_axis(self):
             self.axis = -2
 
     class TestConcatOpAxisNeg3XPU(TestConcatOp):
+
         def init_axis(self):
             self.axis = -3
 
     @skip_check_grad_ci(
         reason="The function 'check_grad' for large inputs is too slow.")
     class TestConcatOp3(TestConcatOp):
+
         def set_inputs(self):
             self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype)
             self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype)
@@ -129,9 +140,11 @@ def test_check_grad(self):
             pass
 
     @skip_check_grad_ci(
-        reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
+        reason=
+        "This test will meet fetch error when there is a null grad. The detailed information is in PR#17015."
     )
     class TestConcatOp4(TestConcatOp):
+
         def set_inputs(self):
             self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
             self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
index 5f954659c2d9a..751c4cdf302fd 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -63,8 +64,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -93,14 +94,14 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
     d_bolck_h = (dilation[0] * (f_h - 1) + 1)
     d_bolck_w = (dilation[1] * (f_w - 1) + 1)
 
-    input_pad = np.pad(input, ((0, 0), (0, 0), (pad_h_0, pad_h_1),
-                               (pad_w_0, pad_w_1)),
+    input_pad = np.pad(input,
+                       ((0, 0), (0, 0), (pad_h_0, pad_h_1), (pad_w_0, pad_w_1)),
                        mode='constant',
                        constant_values=0)
 
     filter_dilation = np.zeros((f_n, f_c, d_bolck_h, d_bolck_w))
-    filter_dilation[:, :, 0:d_bolck_h:dilation[0], 0:d_bolck_w:dilation[
-        1]] = filter
+    filter_dilation[:, :, 0:d_bolck_h:dilation[0],
+                    0:d_bolck_w:dilation[1]] = filter
 
     for i in range(out_h):
         for j in range(out_w):
@@ -125,7 +126,9 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 
 def create_test_channel_last_class(parent):
+
     class TestChannelLastCase(parent):
+
         def init_data_format(self):
             self.data_format = "NHWC"
 
@@ -139,7 +142,9 @@ def init_test_case_2(self):
 
 
 def create_test_padding_SAME_class(parent):
+
     class TestPaddingSMAECase(parent):
+
         def init_paddings(self):
             self.pad = [0, 0]
             self.padding_algorithm = "SAME"
@@ -150,7 +155,9 @@ def init_paddings(self):
 
 
 def create_test_padding_VALID_class(parent):
+
     class TestPaddingVALIDCase(parent):
+
         def init_paddings(self):
             self.pad = [1, 1]
             self.padding_algorithm = "VALID"
@@ -161,11 +168,13 @@ def init_paddings(self):
 
 
 class XPUTestConv2DOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'conv2d'
         self.use_dynamic_create_class = False
 
     class TestConv2DOp(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.place = paddle.XPUPlace(0)
@@ -225,8 +234,8 @@ def setUp(self):
             self.outputs = {'Output': output}
 
         def has_cuda(self):
-            return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                     self.use_cuda)
+            return core.is_compiled_with_cuda() and (self.use_cudnn
+                                                     or self.use_cuda)
 
         def test_check_output(self):
             if core.is_compiled_with_xpu():
@@ -234,8 +243,8 @@ def test_check_output(self):
                 self.check_output_with_place(self.place)
 
         def test_check_grad(self):
-            if (hasattr(self, "no_need_check_grad") and
-                    self.no_need_check_grad == True):
+            if (hasattr(self, "no_need_check_grad")
+                    and self.no_need_check_grad == True):
                 return
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
@@ -243,26 +252,24 @@ def test_check_grad(self):
                                            'Output')
 
         def test_check_grad_no_filter(self):
-            if (hasattr(self, "no_need_check_grad") and
-                    self.no_need_check_grad == True):
+            if (hasattr(self, "no_need_check_grad")
+                    and self.no_need_check_grad == True):
                 return
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
-                self.check_grad_with_place(
-                    self.place, ['Input'],
-                    'Output',
-                    no_grad_set=set(['Filter']))
+                self.check_grad_with_place(self.place, ['Input'],
+                                           'Output',
+                                           no_grad_set=set(['Filter']))
 
         def test_check_grad_no_input(self):
-            if (hasattr(self, "no_need_check_grad") and
-                    self.no_need_check_grad == True):
+            if (hasattr(self, "no_need_check_grad")
+                    and self.no_need_check_grad == True):
                 return
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
-                self.check_grad_with_place(
-                    self.place, ['Filter'],
-                    'Output',
-                    no_grad_set=set(['Input']))
+                self.check_grad_with_place(self.place, ['Filter'],
+                                           'Output',
+                                           no_grad_set=set(['Input']))
 
         def init_test_case(self):
             self.pad = [0, 0]
@@ -285,6 +292,7 @@ def init_kernel_type(self):
             pass
 
     class TestWithPad(TestConv2DOp):
+
         def init_test_case(self):
             self.pad = [1, 1]
             self.stride = [1, 1]
@@ -294,6 +302,7 @@ def init_test_case(self):
             self.filter_size = [6, f_c, 3, 3]
 
     class TestWithStride(TestConv2DOp):
+
         def init_test_case(self):
             self.pad = [1, 1]
             self.stride = [2, 2]
@@ -303,6 +312,7 @@ def init_test_case(self):
             self.filter_size = [6, f_c, 3, 3]
 
     class TestWith1x1(TestConv2DOp):
+
         def init_test_case(self):
             self.pad = [0, 0]
             self.stride = [1, 1]
@@ -317,11 +327,13 @@ def init_group(self):
 
 # ---- test asymmetric padding ----
 class XPUTestConv2DOp_v2(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'conv2d'
         self.use_dynamic_create_class = False
 
     class TestConv2DOp_v2(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.place = paddle.XPUPlace(0)
@@ -359,9 +371,10 @@ def setUp(self):
             np.random.seed(8)
             filter = np.random.uniform(-1, 1,
                                        self.filter_size).astype(self.dtype)
-            output, _, _, _, _ = conv2d_forward_naive(
-                input2, filter, self.groups, conv2d_param,
-                self.padding_algorithm, self.data_format)
+            output, _, _, _, _ = conv2d_forward_naive(input2, filter,
+                                                      self.groups, conv2d_param,
+                                                      self.padding_algorithm,
+                                                      self.data_format)
             output = output.astype(self.dtype)
 
             self.inputs = {
@@ -384,8 +397,8 @@ def setUp(self):
             self.outputs = {'Output': output}
 
         def has_cuda(self):
-            return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                     self.use_cuda)
+            return core.is_compiled_with_cuda() and (self.use_cudnn
+                                                     or self.use_cuda)
 
         def test_check_output(self):
             # TODO(wangzhongpu): support mkldnn op in dygraph mode
@@ -395,8 +408,8 @@ def test_check_output(self):
 
         def test_check_grad(self):
             # TODO(wangzhongpu): support mkldnn op in dygraph mode
-            if (hasattr(self, "no_need_check_grad") and
-                    self.no_need_check_grad == True):
+            if (hasattr(self, "no_need_check_grad")
+                    and self.no_need_check_grad == True):
                 return
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
@@ -405,27 +418,25 @@ def test_check_grad(self):
 
         def test_check_grad_no_filter(self):
             # TODO(wangzhongpu): support mkldnn op in dygraph mode
-            if (hasattr(self, "no_need_check_grad") and
-                    self.no_need_check_grad == True):
+            if (hasattr(self, "no_need_check_grad")
+                    and self.no_need_check_grad == True):
                 return
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
-                self.check_grad_with_place(
-                    self.place, ['Input'],
-                    'Output',
-                    no_grad_set=set(['Filter']))
+                self.check_grad_with_place(self.place, ['Input'],
+                                           'Output',
+                                           no_grad_set=set(['Filter']))
 
         def test_check_grad_no_input(self):
             # TODO(wangzhongpu): support mkldnn op in dygraph mode
-            if (hasattr(self, "no_need_check_grad") and
-                    self.no_need_check_grad == True):
+            if (hasattr(self, "no_need_check_grad")
+                    and self.no_need_check_grad == True):
                 return
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
-                self.check_grad_with_place(
-                    self.place, ['Filter'],
-                    'Output',
-                    no_grad_set=set(['Input']))
+                self.check_grad_with_place(self.place, ['Filter'],
+                                           'Output',
+                                           no_grad_set=set(['Input']))
 
         def init_test_case(self):
             self.pad = [0, 0]
@@ -455,11 +466,13 @@ def init_test_case_2(self):
             pass
 
     class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
+
         def init_paddings(self):
             self.pad = [0, 0, 0, 0]
             self.padding_algorithm = "EXPLICIT"
 
     class TestWithPad_AsyPadding(TestConv2DOp_v2):
+
         def init_test_case(self):
             self.stride = [1, 1]
             self.input_size = [2, 3, 5, 5]  # NCHW
@@ -472,6 +485,7 @@ def init_paddings(self):
             self.padding_algorithm = "EXPLICIT"
 
     class TestWithStride_AsyPadding(TestConv2DOp_v2):
+
         def init_test_case(self):
             self.stride = [2, 2]
             self.input_size = [2, 3, 6, 6]  # NCHW
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py
index b4f9f639ac7eb..4204a73524d27 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -48,11 +49,12 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     # update pad and dilation
     def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
         padding = []
-        for input_size, filter_size, stride_size in zip(
-                input_shape, kernel_size, kernel_stride):
+        for input_size, filter_size, stride_size in zip(input_shape,
+                                                        kernel_size,
+                                                        kernel_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -86,8 +88,8 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
     if 'output_padding' in attrs:
         out_pad_h = attrs['output_padding'][0]
         out_pad_w = attrs['output_padding'][1]
-    out = np.zeros(
-        (in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), dtype=input_.dtype)
+    out = np.zeros((in_n, out_c, out_h + out_pad_h, out_w + out_pad_w),
+                   dtype=input_.dtype)
 
     for n in range(in_n):
         for i in range(in_h):
@@ -105,17 +107,18 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
                             axis=0)
                         i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
                         j1, j2 = j * stride[1], j * stride[1] + d_bolck_w
-                        out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
-                            dilations[1]] += tmp_out
+                        out[n, g * f_out_c + k, i1:i2:dilations[0],
+                            j1:j2:dilations[1]] += tmp_out
 
-    out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h, pad_w_0:out_w - pad_w_1
-              + out_pad_w]
+    out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h,
+              pad_w_0:out_w - pad_w_1 + out_pad_w]
     if attrs['data_format'] == 'NHWC':
         out = np.transpose(out, [0, 2, 3, 1])
     return out
 
 
 class TestConv2DTransposeOp(XPUOpTest):
+
     def setUp(self):
         # init as conv transpose
         self.dtype = np.float32
@@ -169,24 +172,26 @@ def test_check_grad_no_input(self):
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['Filter'], 'Output', no_grad_set=set(['Input']))
+                self.check_grad_with_place(place, ['Filter'],
+                                           'Output',
+                                           no_grad_set=set(['Input']))
 
     def test_check_grad_no_filter(self):
         if self.need_check_grad:
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['Input'], 'Output', no_grad_set=set(['Filter']))
+                self.check_grad_with_place(place, ['Input'],
+                                           'Output',
+                                           no_grad_set=set(['Filter']))
 
     def test_check_grad(self):
         if self.need_check_grad:
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(place,
-                                           set(['Input', 'Filter']), 'Output')
+                self.check_grad_with_place(place, set(['Input', 'Filter']),
+                                           'Output')
 
     def init_test_case(self):
         self.pad = [0, 0]
@@ -202,6 +207,7 @@ def init_op_type(self):
 
 
 class TestWithSymmetricPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -213,6 +219,7 @@ def init_test_case(self):
 
 
 class TestWithAsymmetricPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -224,6 +231,7 @@ def init_test_case(self):
 
 
 class TestWithSAMEPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [2, 1]
         self.dilations = [1, 2]
@@ -235,6 +243,7 @@ def init_test_case(self):
 
 
 class TestWithVALIDPad(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.stride = [1, 1]
         self.dilations = [1, 1]
@@ -246,6 +255,7 @@ def init_test_case(self):
 
 
 class TestWithGroups(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -257,6 +267,7 @@ def init_test_case(self):
 
 
 class TestWithStride(TestConv2DTransposeOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py
index 5c611b6299888..9f77488983545 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -96,8 +97,8 @@ def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param):
                                 val = dmc_bilinear(input[n, c], in_h, in_w,
                                                    im_h, im_w)
                             val_out = val * mask_table[kh, kw]
-                            col_buffer[n, c * f_h * f_w + kh * f_w + kw, h *
-                                       in_w + w] = val_out
+                            col_buffer[n, c * f_h * f_w + kh * f_w + kw,
+                                       h * in_w + w] = val_out
 
     out = np.zeros((in_n, group, int(out_c // group), out_h * out_w))
     weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w)
@@ -111,6 +112,7 @@ def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param):
 
 
 class TestModulatedDeformableConvOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "deformable_conv"
         self.dtype = np.float32
@@ -149,8 +151,8 @@ def setUp(self):
         self.outputs = {'Output': output}
 
     def has_cuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
+        return core.is_compiled_with_cuda() and (self.use_cudnn
+                                                 or self.use_cuda)
 
     def test_check_output(self):
         if core.is_compiled_with_xpu():
@@ -162,10 +164,10 @@ def test_check_grad(self):
         if core.is_compiled_with_xpu():
             paddle.enable_static()
             place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, {'Input', 'Offset', 'Mask', 'Filter'},
-                'Output',
-                max_relative_error=0.06)
+            self.check_grad_with_place(place,
+                                       {'Input', 'Offset', 'Mask', 'Filter'},
+                                       'Output',
+                                       max_relative_error=0.06)
 
     def init_test_case(self):
         self.pad = [1, 1]
@@ -196,6 +198,7 @@ def init_group(self):
 
 
 class TestWithDilation(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [1, 1]
@@ -221,6 +224,7 @@ def init_dilation(self):
 
 
 class TestWith3x3(TestModulatedDeformableConvOp):
+
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -243,29 +247,42 @@ def init_test_case(self):
 
 
 class TestModulatedDeformableConvInvalidInput(unittest.TestCase):
+
     def test_error(self):
+
         def test_invalid_input():
             paddle.enable_static()
             input = [1, 3, 32, 32]
-            offset = fluid.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32')
-            mask = fluid.data(
-                name='mask', shape=[None, 3, 32, 32], dtype='float32')
-            loss = fluid.layers.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=1)
+            offset = fluid.data(name='offset',
+                                shape=[None, 3, 32, 32],
+                                dtype='float32')
+            mask = fluid.data(name='mask',
+                              shape=[None, 3, 32, 32],
+                              dtype='float32')
+            loss = fluid.layers.deformable_conv(input,
+                                                offset,
+                                                mask,
+                                                num_filters=4,
+                                                filter_size=1)
 
         self.assertRaises(TypeError, test_invalid_input)
 
         def test_invalid_offset():
             paddle.enable_static()
-            input = fluid.data(
-                name='input', shape=[None, 3, 32, 32], dtype='int32')
-            offset = fluid.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32')
-            mask = fluid.data(
-                name='mask', shape=[None, 3, 32, 32], dtype='float32')
-            loss = fluid.layers.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=1)
+            input = fluid.data(name='input',
+                               shape=[None, 3, 32, 32],
+                               dtype='int32')
+            offset = fluid.data(name='offset',
+                                shape=[None, 3, 32, 32],
+                                dtype='float32')
+            mask = fluid.data(name='mask',
+                              shape=[None, 3, 32, 32],
+                              dtype='float32')
+            loss = fluid.layers.deformable_conv(input,
+                                                offset,
+                                                mask,
+                                                num_filters=4,
+                                                filter_size=1)
 
         self.assertRaises(TypeError, test_invalid_offset)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
index 2baa837b23a07..b4e8cf6b10e37 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -23,17 +24,20 @@
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 from op_test_xpu import XPUOpTest
+
 paddle.enable_static()
 
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 
 class XPUTestDropoutOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'dropout'
         self.use_dynamic_create_class = False
 
     class TestDropoutOp(XPUOpTest):
+
         def setUp(self):
             self.init_inputs_shape()
             self.init_attrs()
@@ -79,10 +83,12 @@ def test_check_grad_normal(self):
             self.check_grad(['X'], 'Out')
 
     class TestDropoutOpInput1d(TestDropoutOp):
+
         def init_inputs_shape(self):
             self.shape = [2000]
 
     class TestDropoutOp2(TestDropoutOp):
+
         def init_inputs_shape(self):
             self.shape = [32, 64]
 
@@ -93,10 +99,12 @@ def init_attrs(self):
             self.dropout_implementation = "upscale_in_train"
 
     class TestDropoutOp3(TestDropoutOp):
+
         def init_inputs_shape(self):
             self.shape = [32, 64, 2]
 
     class TestDropoutOp4(TestDropoutOp):
+
         def init_attrs(self):
             self.__class__.no_need_check_grad = True
             self.dropout_prob = 0.35
@@ -105,6 +113,7 @@ def init_attrs(self):
             self.dropout_implementation = "downgrade_in_infer"
 
     class TestDropoutOp5(TestDropoutOp):
+
         def init_inputs_shape(self):
             self.shape = [32, 64, 3]
 
@@ -116,14 +125,15 @@ def init_attrs(self):
             self.dropout_implementation = "downgrade_in_infer"
 
     class TestDropoutOpError(unittest.TestCase):
+
         def test_errors(self):
             with program_guard(Program(), Program()):
 
                 def test_Variable():
                     # the input of dropout must be Variable.
-                    x1 = fluid.create_lod_tensor(
-                        np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]],
-                        fluid.CPUPlace())
+                    x1 = fluid.create_lod_tensor(np.array([-1, 3, 5,
+                                                           5]), [[1, 1, 1, 1]],
+                                                 fluid.CPUPlace())
                     fluid.layers.dropout(x1, dropout_prob=0.5)
 
                 self.assertRaises(TypeError, test_Variable)
@@ -131,13 +141,15 @@ def test_Variable():
                 def test_dtype():
                     # the input dtype of dropout must be float16 or float32 or float64
                     # float16 only can be set on GPU place
-                    x2 = fluid.layers.data(
-                        name='x2', shape=[3, 4, 5, 6], dtype="int32")
+                    x2 = fluid.layers.data(name='x2',
+                                           shape=[3, 4, 5, 6],
+                                           dtype="int32")
                     fluid.layers.dropout(x2, dropout_prob=0.5)
 
                 self.assertRaises(TypeError, test_dtype)
 
     class TestDropoutCAPI(unittest.TestCase):
+
         def setUp(self):
             np.random.seed(123)
             self.places = [fluid.CPUPlace()]
@@ -155,6 +167,7 @@ def test_dygraph(self):
                     self.assertTrue(np.allclose(result.numpy(), result_np))
 
     class TestDropoutBackward(unittest.TestCase):
+
         def setUp(self):
             np.random.seed(123)
             self.places = [fluid.CPUPlace()]
@@ -176,8 +189,9 @@ def test_backward_downscale_in_infer(self):
                     out.backward()
 
                     self.assertTrue(
-                        np.array_equal(input.gradient(
-                        ), self.cal_grad_downscale_in_infer(mask.numpy())))
+                        np.array_equal(
+                            input.gradient(),
+                            self.cal_grad_downscale_in_infer(mask.numpy())))
 
         def test_backward_upscale_train(self):
             for place in self.places:
@@ -192,8 +206,9 @@ def test_backward_upscale_train(self):
                     out.backward()
 
                     self.assertTrue(
-                        np.allclose(input.gradient(
-                        ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+                        np.allclose(
+                            input.gradient(),
+                            self.cal_grad_upscale_train(mask.numpy(), prob)))
 
         def test_backward_upscale_train_2(self):
             for place in self.places:
@@ -208,8 +223,9 @@ def test_backward_upscale_train_2(self):
                     out.backward()
 
                     self.assertTrue(
-                        np.allclose(input.gradient(
-                        ), self.cal_grad_upscale_train(mask.numpy(), prob)))
+                        np.allclose(
+                            input.gradient(),
+                            self.cal_grad_upscale_train(mask.numpy(), prob)))
 
 
 support_types = get_xpu_op_support_types('dropout')
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
index 2fc3a42df1264..84cf048d068d4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import numpy as np
 import sys
+
 sys.path.append("..")
 import paddle
 from op_test import OpTest, skip_check_grad_ci
@@ -28,11 +29,13 @@
 
 
 class XPUTestElementwiseAddOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_add'
         self.use_dynamic_create_class = False
 
     class TestElementwiseAddOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "elementwise_add"
             self.init_dtype()
@@ -94,6 +97,7 @@ def init_max_relative_error(self):
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(1).astype(self.dtype)
@@ -102,18 +106,21 @@ def init_input_output(self):
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
     class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(1, 1).astype(self.dtype)
             self.out = self.x + self.y
 
     class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.random((100, )).astype(self.dtype)
             self.y = np.random.random((100, )).astype(self.dtype)
             self.out = np.add(self.x, self.y)
 
     class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(100, 2, 3).astype(self.dtype)
             self.y = np.random.rand(100).astype(self.dtype)
@@ -123,6 +130,7 @@ def init_axis(self):
             self.axis = 0
 
     class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 100, 3).astype(self.dtype)
             self.y = np.random.rand(100).astype(self.dtype)
@@ -132,12 +140,14 @@ def init_axis(self):
             self.axis = 1
 
     class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 3, 100).astype(self.dtype)
             self.y = np.random.rand(100).astype(self.dtype)
             self.out = self.x + self.y.reshape(1, 1, 100)
 
     class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
             self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -147,6 +157,7 @@ def init_axis(self):
             self.axis = 1
 
     class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
             self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -156,24 +167,28 @@ def init_axis(self):
             self.axis = 0
 
     class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(10, 3, 12).astype(self.dtype)
             self.y = np.random.rand(10, 1, 12).astype(self.dtype)
             self.out = self.x + self.y
 
     class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
             self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
             self.out = self.x + self.y
 
     class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
             self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
             self.out = self.x + self.y
 
     class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 10, 12).astype(self.dtype)
             self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -185,6 +200,7 @@ def init_axis(self):
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(100, 1).astype(self.dtype)
             self.y = np.random.rand(1).astype(self.dtype)
@@ -194,6 +210,7 @@ def init_axis(self):
             self.axis = 1
 
     class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(100, 2, 3).astype(self.dtype)
             self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -203,6 +220,7 @@ def init_axis(self):
             self.axis = -1
 
     class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(2, 3, 100).astype(self.dtype)
             self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -212,6 +230,7 @@ def init_axis(self):
             self.axis = -1
 
     class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
             self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
@@ -221,6 +240,7 @@ def init_axis(self):
             self.axis = -1
 
     class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
+
         def init_input_output(self):
             self.x = np.random.rand(10, 12).astype(self.dtype)
             self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype)
@@ -230,26 +250,30 @@ def init_axis(self):
             self.axis = 2
 
     class TestElementwiseAddOpError(unittest.TestCase):
+
         def test_errors(self):
             with program_guard(Program(), Program()):
                 # the input of elementwise_add must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
-                y1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.XPUPlace(0))
+                y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.XPUPlace(0))
                 self.assertRaises(TypeError, fluid.layers.elementwise_add, x1,
                                   y1)
 
                 # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64
                 # float16 only can be set on GPU place
-                x2 = fluid.layers.data(
-                    name='x2', shape=[3, 4, 5, 6], dtype="uint8")
-                y2 = fluid.layers.data(
-                    name='y2', shape=[3, 4, 5, 6], dtype="uint8")
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[3, 4, 5, 6],
+                                       dtype="uint8")
+                y2 = fluid.layers.data(name='y2',
+                                       shape=[3, 4, 5, 6],
+                                       dtype="uint8")
                 self.assertRaises(TypeError, fluid.layers.elementwise_add, x2,
                                   y2)
 
     class TestAddOp(unittest.TestCase):
+
         def test_name(self):
             with fluid.program_guard(fluid.Program()):
                 x = fluid.data(name="x", shape=[2, 3], dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py
index 7cc97ccc82f7e..9ac16ab745aaf 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import numpy as np
 import sys
+
 sys.path.append("..")
 import paddle
 from op_test import OpTest, skip_check_grad_ci
@@ -22,12 +23,14 @@
 import unittest
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+
 paddle.enable_static()
 
 
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "elementwise_add"
         self.init_dtype()
@@ -92,6 +95,7 @@ def init_max_relative_error(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -103,6 +107,7 @@ def init_input_output(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
 class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(1, 1).astype(self.dtype)
@@ -112,6 +117,7 @@ def init_input_output(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.random((100, )).astype(self.dtype)
         self.y = np.random.random((100, )).astype(self.dtype)
@@ -121,6 +127,7 @@ def init_input_output(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -133,6 +140,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 100, 3).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -145,6 +153,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(100).astype(self.dtype)
@@ -154,6 +163,7 @@ def init_input_output(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -166,6 +176,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
         self.y = np.random.rand(100, 1).astype(self.dtype)
@@ -178,6 +189,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 12).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12).astype(self.dtype)
@@ -187,6 +199,7 @@ def init_input_output(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
         self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
@@ -196,6 +209,7 @@ def init_input_output(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
         self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
@@ -205,6 +219,7 @@ def init_input_output(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 10, 12).astype(self.dtype)
         self.y = np.random.rand(10, 12).astype(self.dtype)
@@ -219,6 +234,7 @@ def init_axis(self):
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast.")
 class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 1).astype(self.dtype)
         self.y = np.random.rand(1).astype(self.dtype)
@@ -231,6 +247,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(100, 2, 3).astype(self.dtype)
         self.y = np.random.rand(100, 1, 1).astype(self.dtype)
@@ -243,6 +260,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(2, 3, 100).astype(self.dtype)
         self.y = np.random.rand(1, 1, 100).astype(self.dtype)
@@ -255,6 +273,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
         self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
@@ -267,6 +286,7 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
+
     def init_input_output(self):
         self.x = np.random.rand(10, 12).astype(self.dtype)
         self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype)
@@ -279,13 +299,14 @@ def init_axis(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # the input of elementwise_add must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
-            y1 = fluid.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.XPUPlace(0))
+            y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                         [[1, 1, 1, 1]], fluid.XPUPlace(0))
             self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
 
             # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64
@@ -298,6 +319,7 @@ def test_errors(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestAddOp(unittest.TestCase):
+
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
             x = fluid.data(name="x", shape=[2, 3], dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py
index 3b593818b4e9c..c784ac83a543d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -26,11 +27,13 @@
 
 
 class XPUTestElementwiseDivOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_div'
         self.use_dynamic_create_class = False
 
     class ElementwiseDivOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "elementwise_div"
             self.dtype = self.in_type
@@ -60,26 +63,25 @@ def test_check_output(self):
         def test_check_grad_normal(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['X', 'Y'], 'Out', max_relative_error=0.05)
+                self.check_grad_with_place(place, ['X', 'Y'],
+                                           'Out',
+                                           max_relative_error=0.05)
 
         def test_check_grad_ingore_x(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['Y'],
-                    'Out',
-                    max_relative_error=0.05,
-                    no_grad_set=set("X"))
+                self.check_grad_with_place(place, ['Y'],
+                                           'Out',
+                                           max_relative_error=0.05,
+                                           no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['X'],
-                    'Out',
-                    max_relative_error=0.05,
-                    no_grad_set=set('Y'))
+                self.check_grad_with_place(place, ['X'],
+                                           'Out',
+                                           max_relative_error=0.05,
+                                           no_grad_set=set('Y'))
 
         def init_dtype(self):
             pass
@@ -87,6 +89,7 @@ def init_dtype(self):
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwiseDivOp_scalar(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [20, 3, 4]).astype(self.dtype),
@@ -95,6 +98,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']}
 
     class TestElementwiseDivOp_Vector(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [100]).astype(self.dtype),
@@ -105,6 +109,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [100, 3, 4]).astype(self.dtype),
@@ -118,6 +123,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [2, 100, 4]).astype(self.dtype),
@@ -131,6 +137,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype(self.dtype),
@@ -143,20 +150,23 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
-                'X':
-                np.random.uniform(0.1, 1, [2, 10, 12, 5]).astype(self.dtype),
+                'X': np.random.uniform(0.1, 1,
+                                       [2, 10, 12, 5]).astype(self.dtype),
                 'Y': np.random.uniform(0.1, 1, [10, 12]).astype(self.dtype)
             }
 
             self.attrs = {'axis': 1}
             self.outputs = {
-                'Out': np.divide(self.inputs['X'],
-                                 self.inputs['Y'].reshape(1, 10, 12, 1))
+                'Out':
+                np.divide(self.inputs['X'],
+                          self.inputs['Y'].reshape(1, 10, 12, 1))
             }
 
     class TestElementwiseDivOp_broadcast_4(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [2, 3, 50]).astype(self.dtype),
@@ -167,10 +177,11 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
-                'X':
-                np.random.uniform(0.1, 1, [2, 3, 4, 20]).astype(self.dtype),
+                'X': np.random.uniform(0.1, 1,
+                                       [2, 3, 4, 20]).astype(self.dtype),
                 'Y': np.random.uniform(0.1, 1, [2, 3, 1, 20]).astype(self.dtype)
             }
             self.outputs = {
@@ -178,6 +189,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivOp_commonuse_1(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype(self.dtype),
@@ -188,23 +200,25 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivOp_commonuse_2(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
-                'X':
-                np.random.uniform(0.1, 1, [30, 3, 1, 5]).astype(self.dtype),
-                'Y':
-                np.random.uniform(0.1, 1, [30, 1, 4, 1]).astype(self.dtype),
+                'X': np.random.uniform(0.1, 1,
+                                       [30, 3, 1, 5]).astype(self.dtype),
+                'Y': np.random.uniform(0.1, 1,
+                                       [30, 1, 4, 1]).astype(self.dtype),
             }
             self.outputs = {
                 'Out': np.divide(self.inputs['X'], self.inputs['Y'])
             }
 
     class TestElementwiseDivOp_xsize_lessthan_ysize(ElementwiseDivOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [10, 12]).astype(self.dtype),
-                'Y':
-                np.random.uniform(0.1, 1, [2, 3, 10, 12]).astype(self.dtype),
+                'Y': np.random.uniform(0.1, 1,
+                                       [2, 3, 10, 12]).astype(self.dtype),
             }
 
             self.attrs = {'axis': 2}
@@ -214,10 +228,12 @@ def init_input_output(self):
             }
 
     class TestElementwiseDivBroadcast(unittest.TestCase):
+
         def test_shape_with_batch_sizes(self):
             with fluid.program_guard(fluid.Program()):
-                x_var = fluid.data(
-                    name='x', dtype='float32', shape=[None, 3, None, None])
+                x_var = fluid.data(name='x',
+                                   dtype='float32',
+                                   shape=[None, 3, None, None])
                 one = 2.
                 out = one / x_var
                 exe = fluid.Executor(fluid.XPUPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py
index ea01a38f4b38d..93d30fef11bba 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -21,16 +22,19 @@
 from op_test import OpTest, skip_check_grad_ci
 from op_test_xpu import XPUOpTest
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
 paddle.enable_static()
 import random
 
 
 class XPUTestElementwiseModOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_floordiv'
         self.use_dynamic_create_class = False
 
     class TestElementwiseModOp(XPUOpTest):
+
         def init_kernel_type(self):
             self.use_mkldnn = False
 
@@ -63,6 +67,7 @@ def init_axis(self):
             pass
 
     class TestElementwiseModOp_scalar(TestElementwiseModOp):
+
         def init_input_output(self):
             scale_x = random.randint(0, 100000)
             scale_y = random.randint(1, 100000)
@@ -71,6 +76,7 @@ def init_input_output(self):
             self.out = np.floor_divide(self.x, self.y)
 
     class TestElementwiseModOpInverse(TestElementwiseModOp):
+
         def init_input_output(self):
             self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
             self.y = np.random.uniform(1, 1000, [10, 10]).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
index 3d9566dc71d42..3d60dcdd16dad 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -19,15 +20,18 @@
 from op_test_xpu import XPUOpTest
 import paddle
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
 paddle.enable_static()
 
 
 class XPUTestElementwiseMaxOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_max'
         self.use_dynamic_create_class = False
 
     class TestElementwiseOp(XPUOpTest):
+
         def setUp(self):
             self.use_xpu = True
             self.op_type = "elementwise_max"
@@ -59,24 +63,23 @@ def test_check_grad_normal(self):
         def test_check_grad_ingore_x(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['Y'],
-                    'Out',
-                    max_relative_error=0.006,
-                    no_grad_set=set("X"))
+                self.check_grad_with_place(place, ['Y'],
+                                           'Out',
+                                           max_relative_error=0.006,
+                                           no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['X'],
-                    'Out',
-                    max_relative_error=0.006,
-                    no_grad_set=set('Y'))
+                self.check_grad_with_place(place, ['X'],
+                                           'Out',
+                                           max_relative_error=0.006,
+                                           no_grad_set=set('Y'))
 
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwiseMaxOp_scalar(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.random_integers(-5, 5, [2, 3, 20]).astype(self.dtype)
             y = np.array([0.5]).astype(self.dtype)
@@ -86,6 +89,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseMaxOp_Vector(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.random((100, )).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -96,6 +100,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (100, 5, 2)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -105,11 +110,13 @@ def init_input_output(self):
 
             self.attrs = {'axis': 0}
             self.outputs = {
-                'Out': np.maximum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(100, 1, 1))
+                'Out':
+                np.maximum(self.inputs['X'],
+                           self.inputs['Y'].reshape(100, 1, 1))
             }
 
     class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -119,11 +126,13 @@ def init_input_output(self):
 
             self.attrs = {'axis': 1}
             self.outputs = {
-                'Out': np.maximum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(1, 100, 1))
+                'Out':
+                np.maximum(self.inputs['X'],
+                           self.inputs['Y'].reshape(1, 100, 1))
             }
 
     class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (1, 3, 100)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -132,11 +141,13 @@ def init_input_output(self):
             self.inputs = {'X': x, 'Y': y}
 
             self.outputs = {
-                'Out': np.maximum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(1, 1, 100))
+                'Out':
+                np.maximum(self.inputs['X'],
+                           self.inputs['Y'].reshape(1, 1, 100))
             }
 
     class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (2, 50, 2, 1)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (50, 2)).astype(self.dtype)
@@ -146,11 +157,13 @@ def init_input_output(self):
 
             self.attrs = {'axis': 1}
             self.outputs = {
-                'Out': np.maximum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(1, 50, 2, 1))
+                'Out':
+                np.maximum(self.inputs['X'],
+                           self.inputs['Y'].reshape(1, 50, 2, 1))
             }
 
     class TestElementwiseMaxOp_broadcast_4(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (2, 3, 1, 5)).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py
index 9233097b3add1..422fe087cbc3d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -26,11 +27,13 @@
 
 
 class XPUTestElementwiseMinOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_min'
         self.use_dynamic_create_class = False
 
     class TestElementwiseOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "elementwise_min"
             # If x and y have the same value, the min() is not differentiable.
@@ -61,24 +64,23 @@ def test_check_grad_normal(self):
         def test_check_grad_ingore_x(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['Y'],
-                    'Out',
-                    max_relative_error=0.005,
-                    no_grad_set=set("X"))
+                self.check_grad_with_place(place, ['Y'],
+                                           'Out',
+                                           max_relative_error=0.005,
+                                           no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['X'],
-                    'Out',
-                    max_relative_error=0.005,
-                    no_grad_set=set('Y'))
+                self.check_grad_with_place(place, ['X'],
+                                           'Out',
+                                           max_relative_error=0.005,
+                                           no_grad_set=set('Y'))
 
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwiseMinOp_scalar(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.random_integers(-5, 5, [10, 3, 4]).astype(self.dtype)
             y = np.array([0.5]).astype(self.dtype)
@@ -88,6 +90,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseMinOp_Vector(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.random((100, )).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -98,6 +101,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseMinOp_broadcast_0(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (100, 3, 2)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -106,11 +110,13 @@ def init_input_output(self):
             self.attrs = {'axis': 0}
             self.inputs = {'X': x, 'Y': y}
             self.outputs = {
-                'Out': np.minimum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(100, 1, 1))
+                'Out':
+                np.minimum(self.inputs['X'],
+                           self.inputs['Y'].reshape(100, 1, 1))
             }
 
     class TestElementwiseMinOp_broadcast_1(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -119,11 +125,13 @@ def init_input_output(self):
             self.attrs = {'axis': 1}
             self.inputs = {'X': x, 'Y': y}
             self.outputs = {
-                'Out': np.minimum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(1, 100, 1))
+                'Out':
+                np.minimum(self.inputs['X'],
+                           self.inputs['Y'].reshape(1, 100, 1))
             }
 
     class TestElementwiseMinOp_broadcast_2(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
@@ -131,11 +139,13 @@ def init_input_output(self):
                 np.random.uniform(1, 2, (100, )).astype(self.dtype)
             self.inputs = {'X': x, 'Y': y}
             self.outputs = {
-                'Out': np.minimum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(1, 1, 100))
+                'Out':
+                np.minimum(self.inputs['X'],
+                           self.inputs['Y'].reshape(1, 1, 100))
             }
 
     class TestElementwiseMinOp_broadcast_3(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (2, 25, 4, 1)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (25, 4)).astype(self.dtype)
@@ -144,11 +154,13 @@ def init_input_output(self):
             self.attrs = {'axis': 1}
             self.inputs = {'X': x, 'Y': y}
             self.outputs = {
-                'Out': np.minimum(self.inputs['X'],
-                                  self.inputs['Y'].reshape(1, 25, 4, 1))
+                'Out':
+                np.minimum(self.inputs['X'],
+                           self.inputs['Y'].reshape(1, 25, 4, 1))
             }
 
     class TestElementwiseMinOp_broadcast_4(TestElementwiseOp):
+
         def init_input_output(self):
             x = np.random.uniform(0.5, 1, (2, 10, 2, 5)).astype(self.dtype)
             sgn = np.random.choice([-1, 1], (2, 10, 1, 5)).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py
index 9ef2c093604b0..de0c7000e1d44 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -27,11 +28,13 @@
 
 
 class XPUTestElementwiseModOp(XPUOpTestWrapper):
+
     def __init__(self) -> None:
         self.op_name = 'elementwise_mod'
         self.use_dynamic_create_class = False
 
     class ElementwiseModOp(XPUOpTest):
+
         def init_kernel_type(self):
             self.use_mkldnn = False
 
@@ -68,6 +71,7 @@ def test_check_output(self):
                 self.check_output_with_place(place)
 
     class TestElementwiseModOp_broadcast_1(ElementwiseModOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 100, 3).astype(self.dtype),
@@ -78,6 +82,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] % self.inputs['Y']}
 
     class TestElementwiseModOp_broadcast_2(ElementwiseModOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(22, 128, 3).astype(self.dtype),
@@ -88,6 +93,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] % self.inputs['Y']}
 
     class TestRemainderOp(unittest.TestCase):
+
         def test_dygraph(self):
             with fluid.dygraph.guard():
                 np_x = np.random.rand(22, 128, 3).astype('int64')
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
index b4dbb7cf04552..7d5feab778fac 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -26,11 +27,13 @@
 
 
 class XPUTestElementwiseMulOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_mul'
         self.use_dynamic_create_class = False
 
     class ElementwiseMulOp(XPUOpTest):
+
         def init_kernel_type(self):
             self.use_mkldnn = False
 
@@ -95,6 +98,7 @@ def init_axis(self):
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwiseMulOp_scalar(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(10, 3, 4).astype(self.dtype),
@@ -103,6 +107,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
 
     class TestElementwiseMulOp_Vector(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.random((100, )).astype(self.dtype),
@@ -113,6 +118,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(100, 2, 3).astype(self.dtype),
@@ -124,6 +130,7 @@ def init_input_output(self):
             self.attrs = {'axis': 0}
 
     class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 100, 3).astype(self.dtype),
@@ -136,6 +143,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 3, 100).astype(self.dtype),
@@ -147,6 +155,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 10, 12, 3).astype(self.dtype),
@@ -155,10 +164,12 @@ def init_input_output(self):
 
             self.attrs = {'axis': 1}
             self.outputs = {
-                'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1)
+                'Out':
+                self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1)
             }
 
     class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(10, 2, 11).astype(self.dtype),
@@ -167,6 +178,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
 
     class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(10, 4, 2, 3).astype(self.dtype),
@@ -175,6 +187,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
 
     class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 3, 100).astype(self.dtype),
@@ -183,6 +196,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
 
     class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(30, 3, 1, 5).astype(self.dtype),
@@ -191,6 +205,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
 
     class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(10, 10).astype(self.dtype),
@@ -204,21 +219,24 @@ def init_input_output(self):
             }
 
     class TestElementwiseMulOpError(unittest.TestCase):
+
         def test_errors(self):
             with program_guard(Program(), Program()):
                 # the input of elementwise_mul must be Variable.
-                x1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
-                y1 = fluid.create_lod_tensor(
-                    np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.XPUPlace(0))
+                x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.XPUPlace(0))
+                y1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]),
+                                             [[1, 1, 1, 1]], fluid.XPUPlace(0))
                 self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1,
                                   y1)
 
                 # the input dtype of elementwise_mul must be float32
-                x2 = fluid.layers.data(
-                    name='x2', shape=[3, 4, 5, 6], dtype="uint8")
-                y2 = fluid.layers.data(
-                    name='y2', shape=[3, 4, 5, 6], dtype="uint8")
+                x2 = fluid.layers.data(name='x2',
+                                       shape=[3, 4, 5, 6],
+                                       dtype="uint8")
+                y2 = fluid.layers.data(name='y2',
+                                       shape=[3, 4, 5, 6],
+                                       dtype="uint8")
                 self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2,
                                   y2)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py
index 59c5dd685e176..a116307009107 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -27,11 +28,13 @@
 
 @skip_check_grad_ci(reason="XPU does not support grad op currently")
 class XPUTestElementwisePowOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_pow'
         self.use_dynamic_create_class = False
 
     class TestElementwisePowOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "elementwise_pow"
             self.dtype = self.in_type
@@ -51,6 +54,7 @@ def test_check_output(self):
                 self.check_output_with_place(place)
 
     class TestElementwisePowOp_big_shape_1(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(1, 2, [10, 10]).astype(self.dtype),
@@ -59,6 +63,7 @@ def compute_input_output(self):
             self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
 
     class TestElementwisePowOp_big_shape_2(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(1, 2, [10, 10]).astype(self.dtype),
@@ -69,6 +74,7 @@ def compute_input_output(self):
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwisePowOp_scalar(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [3, 3, 4]).astype(self.dtype),
@@ -77,6 +83,7 @@ def compute_input_output(self):
             self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
 
     class TestElementwisePowOp_tensor(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [100]).astype(self.dtype),
@@ -85,6 +92,7 @@ def compute_input_output(self):
             self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
 
     class TestElementwisePowOp_broadcast_0(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [2, 1, 100]).astype(self.dtype),
@@ -93,6 +101,7 @@ def compute_input_output(self):
             self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
 
     class TestElementwisePowOp_broadcast_1(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [2, 100, 1]).astype(self.dtype),
@@ -100,11 +109,12 @@ def compute_input_output(self):
             }
             self.attrs = {'axis': 1}
             self.outputs = {
-                'Out':
-                np.power(self.inputs['X'], self.inputs['Y'].reshape(100, 1))
+                'Out': np.power(self.inputs['X'],
+                                self.inputs['Y'].reshape(100, 1))
             }
 
     class TestElementwisePowOp_broadcast_2(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
                 'X': np.random.uniform(0.1, 1, [100, 3, 1]).astype(self.dtype),
@@ -117,28 +127,32 @@ def compute_input_output(self):
             }
 
     class TestElementwisePowOp_broadcast_3(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
-                'X':
-                np.random.uniform(0.1, 1, [2, 20, 5, 1]).astype(self.dtype),
+                'X': np.random.uniform(0.1, 1,
+                                       [2, 20, 5, 1]).astype(self.dtype),
                 'Y': np.random.uniform(0.1, 1, [20, 5]).astype(self.dtype)
             }
             self.attrs = {'axis': 1}
             self.outputs = {
-                'Out': np.power(self.inputs['X'],
-                                self.inputs['Y'].reshape(1, 20, 5, 1))
+                'Out':
+                np.power(self.inputs['X'],
+                         self.inputs['Y'].reshape(1, 20, 5, 1))
             }
 
     class TestElementwisePowOp_broadcast_4(TestElementwisePowOp):
+
         def compute_input_output(self):
             self.inputs = {
-                'X':
-                np.random.uniform(0.1, 1, [2, 10, 3, 5]).astype(self.dtype),
+                'X': np.random.uniform(0.1, 1,
+                                       [2, 10, 3, 5]).astype(self.dtype),
                 'Y': np.random.uniform(0.1, 1, [2, 10, 1, 5]).astype(self.dtype)
             }
             self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
 
     class TestElementwisePowOpInt(OpTest):
+
         def setUp(self):
             self.op_type = "elementwise_pow"
             self.inputs = {
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py
index 204485f3432dd..fe4283f55987a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 import sys
+
 sys.path.append("..")
 import paddle
 from op_test import OpTest, skip_check_grad_ci
@@ -25,11 +26,13 @@
 
 
 class XPUTestElementwiseSubOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'elementwise_sub'
         self.use_dynamic_create_class = False
 
     class TestElementwiseOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "elementwise_sub"
             self.use_xpu = True
@@ -56,24 +59,23 @@ def test_check_grad_normal(self):
         def test_check_grad_ingore_x(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['Y'],
-                    'Out',
-                    max_relative_error=0.005,
-                    no_grad_set=set("X"))
+                self.check_grad_with_place(place, ['Y'],
+                                           'Out',
+                                           max_relative_error=0.005,
+                                           no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
             if paddle.is_compiled_with_xpu():
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ['X'],
-                    'Out',
-                    max_relative_error=0.005,
-                    no_grad_set=set('Y'))
+                self.check_grad_with_place(place, ['X'],
+                                           'Out',
+                                           max_relative_error=0.005,
+                                           no_grad_set=set('Y'))
 
     @skip_check_grad_ci(
         reason="[skip shape check] Use y_shape(1) to test broadcast.")
     class TestElementwiseSubOp_scalar(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(10, 3, 4).astype(self.dtype),
@@ -82,6 +84,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
     class TestElementwiseSubOp_Vector(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.random((100, )).astype(self.dtype),
@@ -90,6 +93,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
     class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(100, 3, 2).astype(self.dtype),
@@ -102,6 +106,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseSubOp_broadcast_1(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 100, 3).astype(self.dtype),
@@ -114,6 +119,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 3, 100).astype(self.dtype),
@@ -125,6 +131,7 @@ def init_input_output(self):
             }
 
     class TestElementwiseSubOp_broadcast_3(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 10, 12, 3).astype(self.dtype),
@@ -133,10 +140,12 @@ def init_input_output(self):
 
             self.attrs = {'axis': 1}
             self.outputs = {
-                'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1)
+                'Out':
+                self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1)
             }
 
     class TestElementwiseSubOp_broadcast_4(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 5, 3, 12).astype(self.dtype),
@@ -145,6 +154,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
     class TestElementwiseSubOp_commonuse_1(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(2, 3, 100).astype(self.dtype),
@@ -153,6 +163,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
     class TestElementwiseSubOp_commonuse_2(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(10, 3, 1, 4).astype(self.dtype),
@@ -161,6 +172,7 @@ def init_input_output(self):
             self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
     class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp):
+
         def init_input_output(self):
             self.inputs = {
                 'X': np.random.rand(10, 12).astype(self.dtype),
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py
index acba0012a0ab7..0ca73b931b931 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
@@ -29,11 +30,13 @@
 
 
 class XPUTestExpandAsV2Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'expand_as_v2'
         self.use_dynamic_create_class = False
 
     class TestExpandAsV2XPUOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.set_xpu()
@@ -65,6 +68,7 @@ def test_check_output(self):
             self.check_output_with_place(self.place)
 
     class TestExpandAsOpRank2(TestExpandAsV2XPUOp):
+
         def set_inputs(self):
             x = np.random.rand(10, 12).astype(self.dtype)
             self.inputs = {'X': x}
@@ -77,6 +81,7 @@ def set_output(self):
             self.outputs = {'Out': output}
 
     class TestExpandAsOpRank3(TestExpandAsV2XPUOp):
+
         def set_inputs(self):
             x = np.random.rand(2, 3, 20).astype(self.dtype)
             self.inputs = {'X': x}
@@ -89,6 +94,7 @@ def set_output(self):
             self.outputs = {'Out': output}
 
     class TestExpandAsOpRank4(TestExpandAsV2XPUOp):
+
         def set_inputs(self):
             x = np.random.rand(1, 1, 7, 16).astype(self.dtype)
             self.inputs = {'X': x}
@@ -101,6 +107,7 @@ def set_output(self):
             self.outputs = {'Out': output}
 
     class TestExpandAsOpRank5(TestExpandAsV2XPUOp):
+
         def set_inputs(self):
             x = np.random.rand(1, 1, 7, 16, 1).astype(self.dtype)
             self.inputs = {'X': x}
@@ -113,6 +120,7 @@ def set_output(self):
             self.outputs = {'Out': output}
 
     class TestExpandAsOpRank6(TestExpandAsV2XPUOp):
+
         def set_inputs(self):
             x = np.random.rand(1, 1, 7, 16, 1, 1).astype(self.dtype)
             self.inputs = {'X': x}
@@ -127,24 +135,28 @@ def set_output(self):
 
 # Test python API
 class TestExpandAsV2API(unittest.TestCase):
+
     def test_api(self):
         input1 = np.random.random([12, 14]).astype("float32")
         input2 = np.random.random([2, 12, 14]).astype("float32")
-        x = fluid.layers.data(
-            name='x', shape=[12, 14], append_batch_size=False, dtype="float32")
+        x = fluid.layers.data(name='x',
+                              shape=[12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
-        y = fluid.layers.data(
-            name='target_tensor',
-            shape=[2, 12, 14],
-            append_batch_size=False,
-            dtype="float32")
+        y = fluid.layers.data(name='target_tensor',
+                              shape=[2, 12, 14],
+                              append_batch_size=False,
+                              dtype="float32")
 
         out_1 = paddle.expand_as(x, y=y)
 
         exe = fluid.Executor(place=fluid.XPUPlace(0))
         res_1 = exe.run(fluid.default_main_program(),
-                        feed={"x": input1,
-                              "target_tensor": input2},
+                        feed={
+                            "x": input1,
+                            "target_tensor": input2
+                        },
                         fetch_list=[out_1])
         assert np.array_equal(res_1[0], np.tile(input1, (2, 1, 1)))
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py
index b5fa473ee2642..f7319df270d89 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py
@@ -16,6 +16,7 @@
 import unittest
 import sys
 import numpy as np
+
 sys.path.append("..")
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
@@ -31,11 +32,13 @@
 # CANN Op Support X: float32, int32, int64
 # Situation 1: shape is a list(without tensor)
 class XPUTestExpandV2Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'expand_v2'
         self.use_dynamic_create_class = False
 
     class TestExpandV2XPUOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.set_xpu()
@@ -65,36 +68,42 @@ def test_check_output(self):
             self.check_output_with_place(self.place)
 
     class TestExpandV2OpRank2_DimExpanding(TestExpandV2XPUOp):
+
         def init_data(self):
             self.ori_shape = [120]
             self.shape = [2, 120]
             self.expand_times = [2, 1]
 
     class TestExpandV2OpRank2(TestExpandV2XPUOp):
+
         def init_data(self):
             self.ori_shape = [1, 140]
             self.shape = [12, 140]
             self.expand_times = [12, 1]
 
     class TestExpandV2OpRank3_Corner(TestExpandV2XPUOp):
+
         def init_data(self):
             self.ori_shape = (2, 10, 5)
             self.shape = (2, 10, 5)
             self.expand_times = (1, 1, 1)
 
     class TestExpandV2OpRank4(TestExpandV2XPUOp):
+
         def init_data(self):
             self.ori_shape = (2, 4, 5, 7)
             self.shape = (-1, -1, -1, -1)
             self.expand_times = (1, 1, 1, 1)
 
     class TestExpandV2OpRank5(TestExpandV2XPUOp):
+
         def init_data(self):
             self.ori_shape = (2, 4, 1, 15)
             self.shape = (2, -1, 4, -1)
             self.expand_times = (1, 1, 4, 1)
 
     class TestExpandV2OpRank6(TestExpandV2XPUOp):
+
         def init_data(self):
             self.ori_shape = (4, 1, 30)
             self.shape = (2, -1, 4, 30)
@@ -102,6 +111,7 @@ def init_data(self):
 
     # Situation 2: shape is a list(with tensor)
     class TestExpandV2OpXPURank1_tensor_attr(TestExpandV2XPUOp):
+
         def setUp(self):
             self.set_xpu()
             self.place = paddle.XPUPlace(0)
@@ -129,6 +139,7 @@ def init_data(self):
 
     class TestExpandV2OpRank2_Corner_tensor_attr(
             TestExpandV2OpXPURank1_tensor_attr):
+
         def init_data(self):
             self.ori_shape = [12, 14]
             self.expand_times = [1, 1]
@@ -137,6 +148,7 @@ def init_data(self):
 
     # Situation 3: shape is a tensor
     class TestExpandV2XPUOp_tensor(TestExpandV2XPUOp):
+
         def setUp(self):
             self.set_xpu()
             self.place = paddle.XPUPlace(0)
@@ -161,6 +173,7 @@ def init_data(self):
 # Situation 5: input x is int32
 # skip grad check for int32
 class TestExpandV2OpInteger(XPUOpTest):
+
     def init_type(self):
         self.dtype = 'int32'
 
@@ -170,8 +183,7 @@ def setUp(self):
         self.place = paddle.XPUPlace(0)
         self.op_type = "expand_v2"
         self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 20)).astype(self.dtype)
+            'X': np.random.randint(10, size=(2, 4, 20)).astype(self.dtype)
         }
         self.attrs = {'shape': [2, 4, 20]}
         output = np.tile(self.inputs['X'], (1, 1, 1))
@@ -189,21 +201,20 @@ def test_check_grad(self):
 
 # Test python API
 class TestExpandV2API(unittest.TestCase):
+
     def test_static(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             input = np.random.random([12, 14]).astype("float32")
-            x = fluid.layers.data(
-                name='x',
-                shape=[12, 14],
-                append_batch_size=False,
-                dtype="float32")
+            x = fluid.layers.data(name='x',
+                                  shape=[12, 14],
+                                  append_batch_size=False,
+                                  dtype="float32")
 
             positive_2 = fluid.layers.fill_constant([1], "int32", 12)
-            expand_shape = fluid.layers.data(
-                name="expand_shape",
-                shape=[2],
-                append_batch_size=False,
-                dtype="int32")
+            expand_shape = fluid.layers.data(name="expand_shape",
+                                             shape=[2],
+                                             append_batch_size=False,
+                                             dtype="int32")
 
             out_1 = paddle.expand(x, shape=[12, 14])
             out_2 = paddle.expand(x, shape=[positive_2, 14])
@@ -214,7 +225,8 @@ def test_static(self):
             exe = fluid.Executor(place=paddle.XPUPlace(0))
             res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
                                           feed={
-                                              "x": input,
+                                              "x":
+                                              input,
                                               "expand_shape":
                                               np.array([12, 14]).astype("int32")
                                           },
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py
index 333d57f040610..5b42da9582988 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -32,11 +33,13 @@
 
 
 class XPUTestFillAnyLikeOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'fill_any_like'
         self.use_dynamic_create_class = False
 
     class TestFillAnyLikeOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.set_xpu()
@@ -64,18 +67,22 @@ def test_check_output(self):
             self.check_output_with_place(self.place)
 
     class TestFillAnyLikeOp2(TestFillAnyLikeOp):
+
         def set_value(self):
             self.value = -0.0
 
     class TestFillAnyLikeOp3(TestFillAnyLikeOp):
+
         def set_value(self):
             self.value = 1.0
 
     class TestFillAnyLikeOp4(TestFillAnyLikeOp):
+
         def init(self):
             self.value = 1e-9
 
     class TestFillAnyLikeOp5(TestFillAnyLikeOp):
+
         def set_value(self):
             if self.dtype == "float16":
                 self.value = 0.05
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py
index d989fd0afad85..81c3685fe8b6f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import paddle
@@ -26,12 +27,14 @@
 
 
 class XPUTestFillConstantOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'fill_constant'
         self.use_dynamic_create_class = False
 
     # Situation 1: Attr(shape) is a list(without tensor)
     class TestFillConstantOp(XPUOpTest):
+
         def setUp(self):
             '''Test fill_constant op with specified value
             '''
@@ -161,9 +164,10 @@ def set_data(self):
             if self.index == 22:
                 self.outputs = {
                     'Out':
-                    np.full(self.shape,
-                            convert_float_to_uint16(
-                                np.array([self.value]).astype("float32")))
+                    np.full(
+                        self.shape,
+                        convert_float_to_uint16(
+                            np.array([self.value]).astype("float32")))
                 }
 
         def set_shape(self):
@@ -190,12 +194,14 @@ def set_shape(self):
 
     class TestFillConstantOp3_ShapeTensorList(
             TestFillConstantOp1_ShapeTensorList):
+
         def set_shape(self):
             self.shape = [123, 3, 2, 1]
             self.infer_shape = [123, 111, 11, 1]
 
     class TestFillConstantOp4_ShapeTensorList(
             TestFillConstantOp1_ShapeTensorList):
+
         def set_shape(self):
             self.shape = [123]
             self.infer_shape = [1]
@@ -212,9 +218,10 @@ def set_data(self):
             if self.index == 22:
                 self.outputs = {
                     'Out':
-                    np.full(self.shape,
-                            convert_float_to_uint16(
-                                np.array([self.value]).astype("float32")))
+                    np.full(
+                        self.shape,
+                        convert_float_to_uint16(
+                            np.array([self.value]).astype("float32")))
                 }
 
         def set_shape(self):
@@ -232,7 +239,8 @@ def set_data(self):
             }
             if self.index == 22:
                 self.inputs = {
-                    'ValueTensor': convert_float_to_uint16(
+                    'ValueTensor':
+                    convert_float_to_uint16(
                         np.array([self.value]).astype("float32"))
                 }
             self.attrs = {'value': self.value, 'dtype': self.index}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
index 9cbc83950d1e8..819fd1248fecf 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
@@ -16,16 +16,19 @@
 
 import unittest
 import sys
+
 sys.path.append("..")
 import numpy as np
 import paddle
 import paddle.fluid as fluid
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
+
 paddle.enable_static()
 
 
 class TestFlatten2Op(XPUOpTest):
+
     def setUp(self):
         self.set_xpu()
         self.op_type = "flatten2"
@@ -57,6 +60,7 @@ def init_attrs(self):
 
 
 class TestFlatten2OpWithCornerAxis(TestFlatten2Op):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.axis = 0
@@ -64,6 +68,7 @@ def init_test_case(self):
 
 
 class TestFlatten2OpWithDefaultAxis(TestFlatten2Op):
+
     def init_test_case(self):
         self.in_shape = (10, 2, 2, 3)
         self.new_shape = (10, 12)
@@ -73,6 +78,7 @@ def init_attrs(self):
 
 
 class TestFlatten2OpSixDims(TestFlatten2Op):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
index dcad3c479f446..06fc12f510844 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
@@ -15,11 +15,13 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
@@ -30,6 +32,7 @@
 
 
 class TestFlattenOp(XPUOpTest):
+
     def setUp(self):
         self.set_xpu()
         self.op_type = "flatten_contiguous_range"
@@ -72,6 +75,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 1
@@ -86,6 +90,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_2(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -100,6 +105,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_3(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -114,6 +120,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_4(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = -2
@@ -128,6 +135,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_5(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 2
@@ -142,6 +150,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.start_axis = 3
@@ -156,6 +165,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_Float32(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -171,6 +181,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp_int32(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -190,6 +201,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_int8(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -208,6 +220,7 @@ def test_check_grad(self):
 
 
 class TestFlattenOp_int64(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 5, 4)
         self.start_axis = 0
@@ -226,6 +239,7 @@ def test_check_grad(self):
 
 
 class TestFlatten2OpError(unittest.TestCase):
+
     def test_errors(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
@@ -233,22 +247,25 @@ def test_errors(self):
         x = x.astype('float32')
 
         def test_ValueError1():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             out = paddle.flatten(x_var, start_axis=2, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError1)
 
         def test_ValueError2():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=10, stop_axis=1)
 
         self.assertRaises(ValueError, test_ValueError2)
 
         def test_ValueError3():
-            x_var = paddle.static.data(
-                name="x", shape=image_shape, dtype='float32')
+            x_var = paddle.static.data(name="x",
+                                       shape=image_shape,
+                                       dtype='float32')
             paddle.flatten(x_var, start_axis=2, stop_axis=10)
 
         self.assertRaises(ValueError, test_ValueError3)
@@ -258,8 +275,9 @@ def test_type():
             x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
                            image_shape[3]).reshape(image_shape) / 100.
             x2 = x2.astype('float16')
-            x2_var = paddle.fluid.data(
-                name='x2', shape=[3, 2, 4, 5], dtype='float16')
+            x2_var = paddle.fluid.data(name='x2',
+                                       shape=[3, 2, 4, 5],
+                                       dtype='float16')
             paddle.flatten(x2_var)
 
         self.assertRaises(TypeError, test_type)
@@ -271,6 +289,7 @@ def test_InputError():
 
 
 class TestStaticFlattenPythonAPI(unittest.TestCase):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return paddle.flatten(x, start_axis, stop_axis)
 
@@ -280,8 +299,9 @@ def test_static_api(self):
 
         main_prog = paddle.static.Program()
         with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.static.data(
-                name="x", shape=[2, 3, 4, 4], dtype='float32')
+            x = paddle.static.data(name="x",
+                                   shape=[2, 3, 4, 4],
+                                   dtype='float32')
             out = self.execute_api(x, start_axis=-2, stop_axis=-1)
 
         exe = paddle.static.Executor(place=paddle.XPUPlace(0))
@@ -290,11 +310,13 @@ def test_static_api(self):
 
 
 class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI):
+
     def execute_api(self, x, start_axis=0, stop_axis=-1):
         return x.flatten_(start_axis, stop_axis)
 
 
 class TestFlattenPython(unittest.TestCase):
+
     def test_python_api(self):
         image_shape = (2, 3, 4, 4)
         x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
index ed435198353ca..9622fc5bb1a82 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
@@ -16,16 +16,19 @@
 
 import unittest
 import sys
+
 sys.path.append("..")
 import numpy as np
 import paddle
 import paddle.fluid as fluid
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
+
 paddle.enable_static()
 
 
 class TestFlattenOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "flatten"
         self.use_xpu = True
@@ -51,6 +54,7 @@ def init_attrs(self):
 
 
 class TestFlattenOp1(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 2, 10)
         self.axis = 0
@@ -58,6 +62,7 @@ def init_test_case(self):
 
 
 class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (10, 2, 2, 3)
         self.new_shape = (10, 12)
@@ -67,6 +72,7 @@ def init_attrs(self):
 
 
 class TestFlattenOpSixDims(TestFlattenOp):
+
     def init_test_case(self):
         self.in_shape = (3, 2, 3, 2, 4, 4)
         self.axis = 4
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
index 68854edb0ebb6..0198bfde590d6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -27,10 +28,12 @@
 
 
 class XPUTestGatherNd(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'gather_nd'
 
     class XPUTestGatherNdBase(XPUOpTest):
+
         def setUp(self):
             self.op_type = "gather_nd"
             self.dtype = self.in_type
@@ -39,7 +42,9 @@ def setUp(self):
             self.init_data()
 
             self.inputs = {'X': self.xnp, 'Index': self.inp}
-            self.outputs = {'Out': self.output, }
+            self.outputs = {
+                'Out': self.output,
+            }
 
         def test_check_output(self):
             self.check_output_with_place(self.place)
@@ -51,6 +56,7 @@ def init_data(self):
                 (self.xnp[np.newaxis, :], self.xnp[np.newaxis, :]))
 
     class XPUTestGatherNdOpWithEmptyIndex1(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.random((5, 20)).astype(self.in_type)
             self.inp = np.array([[], []]).astype("int32")
@@ -58,6 +64,7 @@ def init_data(self):
                 (self.xnp[np.newaxis, :], self.xnp[np.newaxis, :]))
 
     class XPUTestGatherNdOpWithEmptyIndex2(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.random((5, 20)).astype(self.in_type)
             self.inp = np.array([[], []]).astype("int64")
@@ -65,84 +72,96 @@ def init_data(self):
                 (self.xnp[np.newaxis, :], self.xnp[np.newaxis, :]))
 
     class XPUTestGatherNdOpWithIndex1(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.random((5, 20)).astype(self.in_type)
             self.inp = np.array([1]).astype("int32")
             self.output = self.xnp[self.inp]
 
     class XPUTestGatherNdOpWithIndex2(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.random((5, 20)).astype(self.in_type)
             self.inp = np.array([1]).astype("int64")
             self.output = self.xnp[self.inp]
 
     class XPUTestGatherNdOpWithLowIndex1(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.uniform(0, 100, (10, 10)).astype(self.in_type)
             self.inp = np.array([[1], [2]]).astype("int32")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpWithLowIndex2(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.uniform(0, 100, (10, 10)).astype(self.in_type)
             self.inp = np.array([1, 2]).astype("int64")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpWithHighRankSame1(XPUTestGatherNdBase):
+
         def init_data(self):
             shape = (5, 2, 3, 1, 10)
             self.xnp = np.random.rand(*shape).astype(self.in_type)
-            self.inp = np.vstack(
-                [np.random.randint(
-                    0, s, size=2) for s in shape]).T.astype("int32")
+            self.inp = np.vstack([
+                np.random.randint(0, s, size=2) for s in shape
+            ]).T.astype("int32")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpWithHighRankSame2(XPUTestGatherNdBase):
+
         def init_data(self):
             shape = (5, 2, 3, 1, 10)
             self.xnp = np.random.rand(*shape).astype(self.in_type)
-            self.inp = np.vstack(
-                [np.random.randint(
-                    0, s, size=2) for s in shape]).T.astype("int64")
+            self.inp = np.vstack([
+                np.random.randint(0, s, size=2) for s in shape
+            ]).T.astype("int64")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpWithHighRankDiff1(XPUTestGatherNdBase):
+
         def init_data(self):
             shape = (2, 3, 4, 1, 10)
             self.xnp = np.random.rand(*shape).astype(self.in_type)
-            self.inp = np.vstack(
-                [np.random.randint(
-                    0, s, size=200) for s in shape]).T.astype("int32")
+            self.inp = np.vstack([
+                np.random.randint(0, s, size=200) for s in shape
+            ]).T.astype("int32")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpWithHighRankDiff2(XPUTestGatherNdBase):
+
         def init_data(self):
             shape = (2, 3, 4, 1, 10)
             self.xnp = np.random.rand(*shape).astype(self.in_type)
-            self.inp = np.vstack(
-                [np.random.randint(
-                    0, s, size=200) for s in shape]).T.astype("int64")
+            self.inp = np.vstack([
+                np.random.randint(0, s, size=200) for s in shape
+            ]).T.astype("int64")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpWithSameIndexAsX1(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.uniform(0, 100, (10, 10)).astype(self.in_type)
             self.inp = np.array([[1, 1], [2, 1]]).astype("int32")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpWithSameIndexAsX2(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.uniform(0, 100, (10, 10)).astype(self.in_type)
             self.inp = np.array([[1, 1], [2, 1]]).astype("int64")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpIndex1(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.uniform(0, 100, (10, 10)).astype(self.in_type)
             self.inp = np.array([1, 2]).astype("int32")
             self.output = self.xnp[tuple(self.inp.T)]
 
     class XPUTestGatherNdOpIndex2(XPUTestGatherNdBase):
+
         def init_data(self):
             self.xnp = np.random.uniform(0, 100, (10, 10)).astype(self.in_type)
             self.inp = np.array([1, 2]).astype("int64")
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
index f0e6315514fb5..4b9cf40a38fee 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import numpy as np
@@ -34,10 +35,12 @@ def gather_numpy(x, index, axis):
 
 
 class XPUTestGather(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'gather'
 
     class TestXPUGatherOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "gather"
             self.place = paddle.XPUPlace(0)
@@ -65,24 +68,28 @@ def test_check_grad(self):
                 self.check_grad_with_place(self.place, ['X'], 'Out')
 
     class TestCase1(TestXPUGatherOp):
+
         def init_config(self):
             self.x_shape = (100)
             self.index = [1, 3, 5]
             self.index_type = np.int32
 
     class TestCase2(TestXPUGatherOp):
+
         def init_config(self):
             self.x_shape = (100)
             self.index = [1, 3, 5]
             self.index_type = np.int64
 
     class TestCase3(TestXPUGatherOp):
+
         def init_config(self):
             self.x_shape = (10, 20)
             self.index = [1, 3, 5]
             self.index_type = np.int32
 
     class TestCase4(TestXPUGatherOp):
+
         def init_config(self):
             self.x_shape = (10, 20)
             self.attrs = {'overwrite': False}
@@ -90,6 +97,7 @@ def init_config(self):
             self.index_type = np.int32
 
     class TestCase5(TestXPUGatherOp):
+
         def init_config(self):
             self.x_shape = (10, 20)
             self.attrs = {'overwrite': False}
@@ -97,6 +105,7 @@ def init_config(self):
             self.index_type = np.int32
 
     class TestCase6(TestXPUGatherOp):
+
         def init_config(self):
             self.x_shape = (10, 20)
             self.attrs = {'overwrite': True}
@@ -104,6 +113,7 @@ def init_config(self):
             self.index_type = np.int32
 
     class TestCase7(TestXPUGatherOp):
+
         def init_config(self):
             self.x_shape = (10, 20)
             self.attrs = {'overwrite': True}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py
index 454c3144908cd..0a0a9bb3d365d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -30,6 +31,7 @@
 
 
 class TestXPUGaussianRandomOp(TestGaussianRandomOp):
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py b/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py
index dbac796eee829..3d8035b4e3249 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py
@@ -16,6 +16,7 @@
 import os
 import copy
 import sys
+
 sys.path.append("..")
 from launch_function_helper import wait, _find_free_port
 from multiprocessing import Pool, Process
@@ -69,6 +70,7 @@ def run_gen_bkc_id(attr):
 
 
 class TestGenBKCLIdOp(unittest.TestCase):
+
     def setUp(self):
         try:
             self._dist_ut_port_0 = int(os.environ["PADDLE_DIST_UT_PORT"])
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py
new file mode 100644
index 0000000000000..764b4e81ccee9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py
@@ -0,0 +1,544 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+from op_test import OpTest
+import copy
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
+paddle.enable_static()
+
+
+def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True):
+    """Decode RPN bbox_deltas against all_anchors into corner-form boxes.
+
+    The result is a float32 array shaped like bbox_deltas whose columns are
+    xmin, ymin, xmax, ymax. When variances is None the deltas are applied
+    unscaled; otherwise each delta is first multiplied by its variance.
+    """
+    offset = 1 if pixel_offset else 0
+    # Upper bound applied to the log-space size deltas before math.exp.
+    log_cap = math.log(1000 / 16.0)
+
+    # Per-row anchor geometry: width, height, center_x, center_y.
+    geom = np.zeros_like(bbox_deltas, dtype=np.float32)
+    geom[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + offset
+    geom[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + offset
+    geom[:, 2] = all_anchors[:, 0] + 0.5 * geom[:, 0]
+    geom[:, 3] = all_anchors[:, 1] + 0.5 * geom[:, 1]
+
+    # Per-row decoded box: center_x, center_y, width, height.
+    decoded = np.zeros_like(bbox_deltas, dtype=np.float32)
+    for row in range(bbox_deltas.shape[0]):
+        if variances is not None:
+            # Scale every delta by its variance before applying it.
+            shift_x = variances[row, 0] * bbox_deltas[row, 0] * geom[row, 0]
+            shift_y = variances[row, 1] * bbox_deltas[row, 1] * geom[row, 1]
+            log_w = min(variances[row, 2] * bbox_deltas[row, 2], log_cap)
+            log_h = min(variances[row, 3] * bbox_deltas[row, 3], log_cap)
+        else:
+            # No variances: the raw deltas are applied directly.
+            shift_x = bbox_deltas[row, 0] * geom[row, 0]
+            shift_y = bbox_deltas[row, 1] * geom[row, 1]
+            log_w = min(bbox_deltas[row, 2], log_cap)
+            log_h = min(bbox_deltas[row, 3], log_cap)
+        decoded[row, 0] = shift_x + geom[row, 2]
+        decoded[row, 1] = shift_y + geom[row, 3]
+        decoded[row, 2] = math.exp(log_w) * geom[row, 0]
+        decoded[row, 3] = math.exp(log_h) * geom[row, 1]
+
+    # Convert center/size form to corners: xmin, ymin, xmax, ymax.
+    proposals = np.zeros_like(bbox_deltas, dtype=np.float32)
+    # Half extents are shared by the min and max corner of each box.
+    half_w = decoded[:, 2] / 2
+    half_h = decoded[:, 3] / 2
+    proposals[:, 0] = decoded[:, 0] - half_w
+    proposals[:, 1] = decoded[:, 1] - half_h
+    proposals[:, 2] = decoded[:, 0] + half_w - offset
+    proposals[:, 3] = decoded[:, 1] + half_h - offset
+    return proposals
+
+
+def clip_tiled_boxes(boxes, im_shape, pixel_offset=True):
+    """Clamp every box coordinate into the image and return boxes.
+
+    boxes has shape (N, 4 * num_tiled_boxes) and is modified in place;
+    im_shape is indexed as [height, width].
+    """
+    num_cols = boxes.shape[1]
+    assert num_cols % 4 == 0, \
+        'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
+        num_cols
+    )
+    offset = 1 if pixel_offset else 0
+    # Largest allowed x and y coordinate values.
+    x_max = im_shape[1] - offset
+    y_max = im_shape[0] - offset
+    # Columns cycle x1, y1, x2, y2 for each tiled box: cap at the image
+    # extent first, then floor at zero.
+    for start, upper in enumerate((x_max, y_max, x_max, y_max)):
+        clipped = np.minimum(boxes[:, start::4], upper)
+        boxes[:, start::4] = np.maximum(clipped, 0)
+    return boxes
+
+
+def filter_boxes(boxes, min_size, im_shape, pixel_offset=True):
+    """Return row indices of boxes whose width and height reach min_size.
+
+    With pixel_offset set, the box center must also fall inside im_shape.
+    """
+    # The size threshold is never allowed to drop below one pixel.
+    threshold = max(min_size, 1.0)
+    offset = 1 if pixel_offset else 0
+    widths = boxes[:, 2] - boxes[:, 0] + offset
+    heights = boxes[:, 3] - boxes[:, 1] + offset
+    mask = (widths >= threshold) & (heights >= threshold)
+    if pixel_offset:
+        center_x = boxes[:, 0] + widths / 2.
+        center_y = boxes[:, 1] + heights / 2.
+        mask = mask & (center_x < im_shape[1]) & (center_y < im_shape[0])
+    return np.where(mask)[0]
+
+
+def iou(box_a, box_b, pixel_offset=True):
+    """Compute the intersection-over-union ratio of box_a and box_b.
+
+    Each box supplies two corner points as (x, y, x, y) in either order.
+    0.0 is returned when neither box has a positive area.
+    """
+    offset = 1 if pixel_offset else 0
+
+    def _ordered(box):
+        # Sort the corner coordinates into xmin, ymin, xmax, ymax.
+        return (min(box[0], box[2]), min(box[1], box[3]),
+                max(box[0], box[2]), max(box[1], box[3]))
+
+    a_x0, a_y0, a_x1, a_y1 = _ordered(box_a)
+    b_x0, b_y0, b_x1, b_y1 = _ordered(box_b)
+    area_a = (a_y1 - a_y0 + offset) * (a_x1 - a_x0 + offset)
+    area_b = (b_y1 - b_y0 + offset) * (b_x1 - b_x0 + offset)
+    if area_a <= 0 and area_b <= 0:
+        return 0.0
+
+    # Intersection extents are floored at zero for disjoint boxes.
+    left = max(a_x0, b_x0)
+    top = max(a_y0, b_y0)
+    right = min(a_x1, b_x1)
+    bottom = min(a_y1, b_y1)
+    inter_w = max(right - left + offset, 0.0)
+    inter_h = max(bottom - top + offset, 0.0)
+    inter_area = inter_w * inter_h
+    return inter_area / (area_a + area_b - inter_area)
+
+
+def nms(boxes, scores, nms_threshold, eta=1.0, pixel_offset=True):
+    """Greedy non-maximum suppression over score-ranked boxes.
+
+    Boxes are visited from the highest score down; a box is kept only when
+    its iou with every box kept so far is at most the current threshold.
+
+    Args:
+        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
+        scores: (tensor) The class predscores for the img, Shape:[num_priors].
+        nms_threshold: (float) The overlap thresh for suppressing unnecessary
+            boxes.
+        eta: (float) The parameter for adaptive NMS; when below 1 the
+            threshold is multiplied by eta after each kept box while it
+            is still above 0.5.
+        pixel_offset: (bool) Forwarded to iou for each overlap test.
+
+    Return:
+        The indices of the kept boxes with respect to num_priors.
+    """
+    flat_scores = copy.deepcopy(scores).flatten()
+    # A stable sort keeps tied scores in their original relative order.
+    ranking = np.argsort(-flat_scores, axis=0, kind='mergesort')
+
+    threshold = nms_threshold
+    selected_indices = []
+    for candidate in ranking:
+        # all() stops at the first kept box that overlaps too much.
+        is_kept = all(
+            iou(boxes[candidate], boxes[kept], pixel_offset=pixel_offset)
+            <= threshold for kept in selected_indices)
+        if not is_kept:
+            continue
+        selected_indices.append(candidate)
+        if eta < 1 and threshold > 0.5:
+            threshold *= eta
+    return selected_indices
+
+
+def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
+                           scores, pre_nms_topN, post_nms_topN, nms_thresh,
+                           min_size, eta, pixel_offset):
+    """Build the reference proposals and scores for a single image.
+
+    Anchors are ranked by score, the top pre_nms_topN are decoded with
+    box_coder, clipped to the image and passed through filter_boxes. When
+    nms_thresh > 0 the survivors go through nms and at most post_nms_topN
+    of them are kept.
+
+    Returns a (proposals, scores) pair; when filter_boxes rejects every
+    box, a single all-zero proposal and score is returned instead.
+    """
+    # Work on private copies of the array arguments.
+    anchors = copy.deepcopy(all_anchors).reshape(-1, 4)
+    variances = copy.deepcopy(variances).reshape(-1, 4)
+    # Bring the predicted bbox transformations into anchor order:
+    #   - bbox deltas arrive in (4 * A, H, W) format from conv output
+    #   - transpose to (H, W, 4 * A)
+    #   - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
+    #     in slowest to fastest order to match the enumerated anchors
+    deltas = copy.deepcopy(bbox_deltas).transpose((1, 2, 0)).reshape(-1, 4)
+    # Same story for the scores:
+    #   - scores arrive in (A, H, W) format from conv output
+    #   - transpose to (H, W, A)
+    #   - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
+    #     to match the order of anchors and bbox_deltas
+    scores = copy.deepcopy(scores).transpose((1, 2, 0)).reshape(-1, 1)
+
+    # Rank all (proposal, score) pairs from highest to lowest score and
+    # keep the top pre_nms_topN (e.g. 6000).
+    if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
+        order = np.argsort(-scores.squeeze())
+    else:
+        # Avoid sorting a possibly large array: partition to isolate the
+        # top K entries unsorted, then sort just those.
+        top_k = np.argpartition(-scores.squeeze(), pre_nms_topN)
+        top_k = top_k[:pre_nms_topN]
+        order = top_k[np.argsort(-scores[top_k].squeeze())]
+    scores = scores[order, :]
+    proposals = box_coder(anchors[order, :], deltas[order, :],
+                          variances[order, :], pixel_offset)
+
+    # Clip the decoded boxes to the image, then keep only the rows that
+    # filter_boxes accepts.
+    proposals = clip_tiled_boxes(proposals, im_shape, pixel_offset)
+    keep = filter_boxes(proposals, min_size, im_shape, pixel_offset)
+    if len(keep) == 0:
+        placeholder_boxes = np.zeros((1, 4)).astype('float32')
+        placeholder_scores = np.zeros((1, 1)).astype('float32')
+        return placeholder_boxes, placeholder_scores
+    proposals = proposals[keep, :]
+    scores = scores[keep, :]
+
+    # Apply loose nms (e.g. threshold = 0.7), then keep the first
+    # post_nms_topN (e.g. 1000) of the surviving proposals.
+    if nms_thresh > 0:
+        keep = nms(boxes=proposals, scores=scores,
+                   nms_threshold=nms_thresh, eta=eta,
+                   pixel_offset=pixel_offset)
+        if 0 < post_nms_topN < len(keep):
+            keep = keep[:post_nms_topN]
+        proposals = proposals[keep, :]
+        scores = scores[keep, :]
+
+    return proposals, scores
+
+
+def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
+                                    variances, pre_nms_topN, post_nms_topN,
+                                    nms_thresh, min_size, eta, pixel_offset):
+    """NumPy reference: run proposal_for_one_image on each image of the batch.
+
+    Returns per-image lists of proposal boxes, scores and proposal counts.
+    """
+    all_anchors = anchors.reshape(-1, 4)
+    rpn_rois = []
+    rpn_roi_probs = []
+    rois_num = []
+    for img_idx in range(scores.shape[0]):
+        img_i_boxes, img_i_probs = proposal_for_one_image(
+            im_shape[img_idx, :], all_anchors, variances,
+            bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
+            pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta,
+            pixel_offset)
+        rois_num.append(img_i_probs.shape[0])
+        rpn_rois.append(img_i_boxes)
+        rpn_roi_probs.append(img_i_probs)
+
+    return rpn_rois, rpn_roi_probs, rois_num
+
+
+def anchor_generator_in_python(input_feat, anchor_sizes, aspect_ratios,
+                               variances, stride, offset):
+    """NumPy reference anchor generator for an NCHW feature map.
+
+    Only input_feat.shape[2:4] (H, W) is read. Returns float32 anchors shaped
+    (H, W, num_anchors, 4), aspect ratio outermost and anchor size innermost
+    within a cell, plus `variances` tiled over (H, W, num_anchors).
+    """
+    num_anchors = len(aspect_ratios) * len(anchor_sizes)
+    layer_h = input_feat.shape[2]
+    layer_w = input_feat.shape[3]
+    out_anchors = np.zeros((layer_h, layer_w, num_anchors, 4)).astype('float32')
+
+    # Anchor half-extents depend only on (ratio, size), so compute them once
+    # instead of once per feature-map cell.
+    half_extents = []
+    for ar in aspect_ratios:
+        base_w = np.round(np.sqrt(stride[0] * stride[1] / ar))
+        base_h = np.round(base_w * ar)
+        for anchor_size in anchor_sizes:
+            w = anchor_size / stride[0] * base_w
+            h = anchor_size / stride[1] * base_h
+            half_extents.append((0.5 * (w - 1), 0.5 * (h - 1)))
+
+    for h_idx in range(layer_h):
+        y_ctr = (h_idx * stride[1]) + offset * (stride[1] - 1)
+        for w_idx in range(layer_w):
+            x_ctr = (w_idx * stride[0]) + offset * (stride[0] - 1)
+            for idx, (half_w, half_h) in enumerate(half_extents):
+                out_anchors[h_idx, w_idx, idx, :] = [
+                    x_ctr - half_w, y_ctr - half_h,
+                    x_ctr + half_w, y_ctr + half_h
+                ]
+
+    out_var = np.tile(variances, (layer_h, layer_w, num_anchors, 1))
+    return out_anchors, out_var.astype('float32')
+
+
+class XPUGenerateProposalsV2Op(XPUOpTestWrapper):  # XPU cases for generate_proposals_v2
+    def __init__(self):
+        self.op_name = 'generate_proposals_v2'
+        self.use_dynamic_create_class = False
+
+    class TestGenerateProposalsV2Op(XPUOpTest):  # base case; subclasses override the init_*/set_data hooks
+        def set_data(self):  # builds self.inputs / self.attrs / self.outputs
+            self.init_input_shape()
+            self.init_test_params()
+            self.init_test_input()
+            self.init_test_output()  # NumPy reference results
+            self.inputs = {
+                'Scores': self.scores,
+                'BboxDeltas': self.bbox_deltas,
+                'ImShape': self.im_shape.astype(self.dtype),  # already cast in init_input_shape
+                'Anchors': self.anchors,
+                'Variances': self.variances
+            }
+
+            self.attrs = {
+                'pre_nms_topN': self.pre_nms_topN,
+                'post_nms_topN': self.post_nms_topN,
+                'nms_thresh': self.nms_thresh,
+                'min_size': self.min_size,
+                'eta': self.eta,
+                'pixel_offset': self.pixel_offset,
+            }
+
+            self.outputs = {
+                'RpnRois': (self.rpn_rois[0], [self.rois_num]),  # image 0 only; batch size is 1
+                'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
+            }
+
+        def test_check_output(self):
+            if paddle.is_compiled_with_xpu():  # silently passes on non-XPU builds
+                self.check_output_with_place(self.place)
+
+        def setUp(self):
+            self.set_xpu()
+            self.op_type = "generate_proposals_v2"
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()  # must run before set_data, which reads self.dtype
+            self.set_data()
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+            self.__class__.no_need_check_grad = True
+
+        def init_input_shape(self):
+            self.input_feat_shape = (1, 20, 16, 16)
+            self.im_shape = np.array([[64, 64]]).astype(self.dtype)
+
+        def init_dtype(self):
+            self.dtype = self.in_type  # in_type is not assigned here; presumably set by create_test_class - confirm
+
+        def init_test_params(self):
+            self.pre_nms_topN = 12000  # train 12000, test 2000
+            self.post_nms_topN = 5000  # train 6000, test 1000
+            self.nms_thresh = 0.7
+            self.min_size = 3.0
+            self.eta = 1.
+            self.pixel_offset = True
+
+        def init_test_input(self):
+            batch_size = self.input_feat_shape[0]
+            input_channels = self.input_feat_shape[1]
+            layer_h = self.input_feat_shape[2]
+            layer_w = self.input_feat_shape[3]
+            input_feat = np.random.random((batch_size, input_channels, layer_h,
+                                           layer_w)).astype(self.dtype)
+            self.anchors, self.variances = anchor_generator_in_python(
+                input_feat=input_feat,
+                anchor_sizes=[16., 32.],
+                aspect_ratios=[0.5, 1.0],
+                variances=[1.0, 1.0, 1.0, 1.0],
+                stride=[16.0, 16.0],
+                offset=0.5)
+            num_anchors = self.anchors.shape[2]  # anchors per cell: sizes x ratios
+            self.scores = np.random.random(
+                (batch_size, num_anchors, layer_h, layer_w)).astype(self.dtype)
+            self.bbox_deltas = np.random.random(
+                (batch_size, num_anchors * 4, layer_h,
+                 layer_w)).astype(self.dtype)
+
+        def init_test_output(self):
+            self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_v2_in_python(
+                self.scores, self.bbox_deltas, self.im_shape, self.anchors,
+                self.variances, self.pre_nms_topN, self.post_nms_topN,
+                self.nms_thresh, self.min_size, self.eta, self.pixel_offset)
+
+    class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op):  # also checks RpnRoisNum
+        def set_data(self):
+            self.init_input_shape()
+            self.init_test_params()
+            self.init_test_input()
+            self.init_test_output()
+            self.inputs = {
+                'Scores': self.scores,
+                'BboxDeltas': self.bbox_deltas,
+                'ImShape': self.im_shape.astype(np.float32),  # NOTE(review): float32 here, self.dtype in the base class
+                'Anchors': self.anchors,
+                'Variances': self.variances
+            }
+
+            self.attrs = {
+                'pre_nms_topN': self.pre_nms_topN,
+                'post_nms_topN': self.post_nms_topN,
+                'nms_thresh': self.nms_thresh,
+                'min_size': self.min_size,
+                'eta': self.eta,
+                'pixel_offset': self.pixel_offset,
+                'return_rois_num': True
+            }
+
+            self.outputs = {
+                'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+                'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
+                'RpnRoisNum': (np.asarray(
+                    self.rois_num, dtype=np.int32))
+            }
+
+    class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op):  # min_size far above the 64x64 image
+        def init_test_params(self):
+            self.pre_nms_topN = 12000  # train 12000, test 2000
+            self.post_nms_topN = 5000  # train 6000, test 1000
+            self.nms_thresh = 0.7
+            self.min_size = 1000.0  # presumably meant to hit the empty-keep path - confirm
+            self.eta = 1.
+            self.pixel_offset = True
+
+    class TestGenerateProposalsV2OpNoOffset(TestGenerateProposalsV2Op):  # base params with pixel_offset=False
+        def init_test_params(self):
+            self.pre_nms_topN = 12000  # train 12000, test 2000
+            self.post_nms_topN = 5000  # train 6000, test 1000
+            self.nms_thresh = 0.7
+            self.min_size = 3.0
+            self.eta = 1.
+            self.pixel_offset = False
+
+    # Larger case: 48x64 feature map, 15 anchors per cell, 768x1024 image.
+    class TestGenerateProposalsV2OpMaskRcnn1XPU(TestGenerateProposalsV2Op):
+        def init_input_shape(self):
+            self.input_feat_shape = (1, 20, 48, 64)
+            self.im_shape = np.array([[768, 1024]]).astype(self.dtype)
+
+        def init_test_params(self):
+            self.pre_nms_topN = 12000  # train 12000, test 2000
+            self.post_nms_topN = 2000  # train 6000, test 1000
+            self.nms_thresh = 0.7
+            self.min_size = 0.0
+            self.eta = 1.
+            self.pixel_offset = False
+
+        def init_test_input(self):
+            batch_size = self.input_feat_shape[0]
+            input_channels = self.input_feat_shape[1]
+            layer_h = self.input_feat_shape[2]
+            layer_w = self.input_feat_shape[3]
+            input_feat = np.random.random((batch_size, input_channels, layer_h,
+                                           layer_w)).astype(self.dtype)
+            self.anchors, self.variances = anchor_generator_in_python(
+                input_feat=input_feat,
+                anchor_sizes=[32, 64, 128, 256, 512],
+                aspect_ratios=[0.5, 1.0, 2.0],
+                variances=[1.0, 1.0, 1.0, 1.0],
+                stride=[16.0, 16.0],
+                offset=0.5)
+            num_anchors = self.anchors.shape[2]  # read before the reshape below drops this axis
+            self.scores = np.random.random(
+                (batch_size, num_anchors, layer_h, layer_w)).astype(self.dtype)
+            self.bbox_deltas = np.random.random(
+                (batch_size, num_anchors * 4, layer_h,
+                 layer_w)).astype(self.dtype)
+            self.anchors = self.anchors.reshape(-1, 4)  # one row of 4 values per anchor
+            self.variances = self.variances.reshape(-1, 4)
+
+        def set_data(self):
+            np.random.seed(1)  # fixed seed: the random inputs below are reproducible
+            self.init_input_shape()
+            self.init_test_params()
+            self.init_test_input()
+            self.init_test_output()
+
+            self.inputs = {
+                'Scores': self.scores,
+                'BboxDeltas': self.bbox_deltas,
+                'ImShape': self.im_shape.astype(np.float32),  # NOTE(review): float32 here, self.dtype in the base class
+                'Anchors': self.anchors,
+                'Variances': self.variances
+            }
+
+            self.attrs = {
+                'pre_nms_topN': self.pre_nms_topN,
+                'post_nms_topN': self.post_nms_topN,
+                'nms_thresh': self.nms_thresh,
+                'min_size': self.min_size,
+                'eta': self.eta,
+                'pixel_offset': self.pixel_offset,
+                'return_rois_num': True
+            }
+
+            self.outputs = {
+                'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+                'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
+                'RpnRoisNum': (np.asarray(
+                    self.rois_num, dtype=np.int32))
+            }
+
+
+support_types = get_xpu_op_support_types('generate_proposals_v2')
+for stype in support_types:  # one create_test_class call per entry in support_types
+    create_test_class(
+        globals(),  # presumably where the generated classes are registered - confirm
+        XPUGenerateProposalsV2Op,
+        stype,
+        test_grad=False,
+        ignore_deivce_version=[core.XPUVersion.XPU1])  # keyword spelling kept as written; must match create_test_class
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py
index 30c91f87a2452..c3caa0bad10be 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -38,11 +39,13 @@ def huber_loss_forward(val, delta):
 
 
 class XPUTestHuberLossOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'huber_loss'
         self.use_dynamic_create_class = False
 
     class TestHuberLossOp(XPUOpTest):
+
         def setUp(self):
             self.set_xpu()
             self.op_type = 'huber_loss'
@@ -89,22 +92,27 @@ def test_check_grad_normal(self):
             self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
         def test_check_grad_ingore_x(self):
-            self.check_grad_with_place(
-                self.place, ['Y'], 'Out', no_grad_set=set("residual"))
+            self.check_grad_with_place(self.place, ['Y'],
+                                       'Out',
+                                       no_grad_set=set("residual"))
 
         def test_check_grad_ingore_y(self):
-            self.check_grad_with_place(
-                self.place, ['X'], 'Out', no_grad_set=set('residual'))
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       no_grad_set=set('residual'))
 
     class TestHuberLossOp1(TestHuberLossOp):
+
         def set_shape(self):
             return (640)
 
     class TestHuberLossOp2(TestHuberLossOp):
+
         def set_shape(self):
             return (10, 10)
 
     class TestHuberLossOp3(TestHuberLossOp):
+
         def set_shape(self):
             return (10, 10, 1)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py
index b745dce9efef4..ceb154f1e3520 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import unittest
@@ -30,6 +31,7 @@
 
 
 class TestXPUIOUSimilarityOp(XPUOpTest):
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
@@ -41,7 +43,7 @@ def setUp(self):
         self.boxes2 = random.rand(3, 4).astype('float32')
         self.output = random.rand(2, 3).astype('float32')
         self.box_normalized = False
-        # run python iou computation 
+        # run python iou computation
         self._compute_iou()
         self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
         self.attrs = {"box_normalized": self.box_normalized, 'use_xpu': True}
@@ -77,6 +79,7 @@ def _compute_iou(self, ):
 
 
 class TestXPUIOUSimilarityOpWithLoD(TestXPUIOUSimilarityOp):
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
@@ -87,7 +90,7 @@ def setUp(self):
         self.boxes1_lod = [[1, 1]]
         self.output_lod = [[1, 1]]
         self.box_normalized = False
-        # run python iou computation 
+        # run python iou computation
         self._compute_iou()
         self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
         self.attrs = {"box_normalized": self.box_normalized}
@@ -95,6 +98,7 @@ def setUp(self):
 
 
 class TestXPUIOUSimilarityOpWithBoxNormalized(TestXPUIOUSimilarityOp):
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
@@ -105,7 +109,7 @@ def setUp(self):
         self.boxes1_lod = [[1, 1]]
         self.output_lod = [[1, 1]]
         self.box_normalized = True
-        # run python iou computation 
+        # run python iou computation
         self._compute_iou()
         self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
         self.attrs = {"box_normalized": self.box_normalized}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py
index afe1662ce5cfc..415796988d186 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py
@@ -18,6 +18,7 @@
 import paddle
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
@@ -26,6 +27,7 @@
 
 
 class XPUTestLabelSmoothOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'label_smooth'
         self.use_dynamic_create_class = True
@@ -45,6 +47,7 @@ def dynamic_create_class(self):
         return base_class, classes
 
     class TestLabelSmoothOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "label_smooth"
             self.epsilon = 0.1
@@ -54,19 +57,19 @@ def setUp(self):
                 self.label_dim = 12
             self.label = np.zeros(
                 (self.batch_size, self.label_dim)).astype("float32")
-            nonzero_index = np.random.randint(
-                self.label_dim, size=(self.batch_size))
+            nonzero_index = np.random.randint(self.label_dim,
+                                              size=(self.batch_size))
             self.label[np.arange(self.batch_size), nonzero_index] = 1
-            smoothed_label = (1 - self.epsilon
-                              ) * self.label + self.epsilon / self.label_dim
+            smoothed_label = (
+                1 - self.epsilon) * self.label + self.epsilon / self.label_dim
             self.inputs = {'X': self.label}
             self.attrs = {'epsilon': self.epsilon}
             self.outputs = {'Out': smoothed_label}
             if hasattr(self, 'is_3d') and self.is_3d:
                 self.inputs['X'] = self.inputs['X'].reshape(
                     [2, -1, self.inputs['X'].shape[-1]])
-                self.outputs['Out'] = self.outputs['Out'].reshape(self.inputs[
-                    'X'].shape)
+                self.outputs['Out'] = self.outputs['Out'].reshape(
+                    self.inputs['X'].shape)
 
         def test_check_output(self):
             if not paddle.is_compiled_with_xpu():
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py
index f6aa82d596be7..6bdc45e6a33dd 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py
index b166661c3d6bc..1f2caa9fbe9d8 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py
@@ -17,6 +17,7 @@
 import sys
 import unittest
 from functools import reduce
+
 sys.path.append("..")
 from op_test import OpTest
 from operator import mul
@@ -44,6 +45,7 @@ def ref_layer_norm(x, scale, bias, epsilon, begin_norm_axis=1):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestXPULayerNormOp(OpTest):
+
     def setUp(self):
         self.op_type = "layer_norm"
         self.dtype = np.float32
@@ -75,13 +77,15 @@ def test_check_output(self):
         self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            paddle.XPUPlace(0), ['X'], 'Y', max_relative_error=0.02)
+        self.check_grad_with_place(paddle.XPUPlace(0), ['X'],
+                                   'Y',
+                                   max_relative_error=0.02)
 
 
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestXPULayerNormOpAxis2(TestXPULayerNormOp):
+
     def set_attrs(self):
         self.begin_norm_axis = 2
 
@@ -89,6 +93,7 @@ def set_attrs(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestXPULayerNormOpAxis3(TestXPULayerNormOp):
+
     def set_attrs(self):
         self.begin_norm_axis = 3
 
@@ -96,6 +101,7 @@ def set_attrs(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestXPULayerNormOp2D(TestXPULayerNormOp):
+
     def set_attrs(self):
         self.shape = [10, 12]
 
@@ -103,6 +109,7 @@ def set_attrs(self):
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestXPULayerNormOp3D(TestXPULayerNormOp):
+
     def set_attrs(self):
         self.shape = [4, 5, 6]
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py
index 3ba3a8b5eef30..3924c1bd0f3fa 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import paddle.fluid.core as core
 import unittest
@@ -30,6 +31,7 @@ def sigmoid_array(x):
 
 
 class TestXPULogLossOp(OpTest):
+
     def setUp(self):
         self.op_type = 'log_loss'
         samples_num = 100
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py
index 6b720b9717be4..34eafd208114f 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -29,10 +30,12 @@
 
 ################## TEST OP: logical_and ##################
 class XPUTestLogicalAnd(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'logical_and'
 
     class XPUTestLogicalAndBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -41,10 +44,14 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'logical_and'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
-            y = np.random.randint(
-                self.low, self.high, self.y_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
+            y = np.random.randint(self.low,
+                                  self.high,
+                                  self.y_shape,
+                                  dtype=self.dtype)
             out = np.logical_and(x, y)
 
             self.attrs = {'use_xpu': True}
@@ -68,6 +75,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestLogicalAndCase1(XPUTestLogicalAndBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [4, 5]
@@ -83,10 +91,12 @@ def init_case(self):
 
 ################## TEST OP: logical_or ##################
 class XPUTestLogicalOr(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'logical_or'
 
     class XPUTestLogicalOrBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -95,10 +105,14 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'logical_or'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
-            y = np.random.randint(
-                self.low, self.high, self.y_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
+            y = np.random.randint(self.low,
+                                  self.high,
+                                  self.y_shape,
+                                  dtype=self.dtype)
             out = np.logical_or(x, y)
 
             self.attrs = {'use_xpu': True}
@@ -122,6 +136,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestLogicalOrCase1(XPUTestLogicalOrBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [4, 5]
@@ -137,10 +152,12 @@ def init_case(self):
 
 ################## TEST OP: logical_xor ##################
 class XPUTestLogicalXor(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'logical_xor'
 
     class XPUTestLogicalXorBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -149,10 +166,14 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'logical_xor'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
-            y = np.random.randint(
-                self.low, self.high, self.y_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
+            y = np.random.randint(self.low,
+                                  self.high,
+                                  self.y_shape,
+                                  dtype=self.dtype)
             out = np.logical_xor(x, y)
 
             self.attrs = {'use_xpu': True}
@@ -176,6 +197,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestLogicalXorCase1(XPUTestLogicalXorBase):
+
         def init_case(self):
             self.dtype = np.int32
             self.x_shape = [4, 5]
@@ -191,10 +213,12 @@ def init_case(self):
 
 ##################  TEST OP: LogicalNot ##################
 class XPUTestLogicalNot(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'logical_not'
 
     class XPUTestLogicalNotBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -203,8 +227,10 @@ def setUp(self):
         def set_case(self):
             self.op_type = 'logical_not'
 
-            x = np.random.randint(
-                self.low, self.high, self.x_shape, dtype=self.dtype)
+            x = np.random.randint(self.low,
+                                  self.high,
+                                  self.x_shape,
+                                  dtype=self.dtype)
             out = np.logical_not(x)
 
             self.attrs = {'use_xpu': True}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py
index c4e1363bd9c94..6c621a6853b0d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py
@@ -15,6 +15,7 @@
 import paddle
 import unittest
 import sys
+
 sys.path.append("..")
 import numpy as np
 from op_test import OpTest
@@ -35,6 +36,7 @@ def ref_logsumexp(x, axis=None, keepdim=False, reduce_all=False):
 
 
 class XPUTestLogsumexp(XPUOpTest):
+
     def setUp(self):
         self.op_type = 'logsumexp'
         self.shape = [2, 3, 4, 5]
@@ -69,26 +71,31 @@ def test_check_grad(self):
 
 
 class TestLogsumexp_shape(XPUTestLogsumexp):
+
     def set_attrs(self):
         self.shape = [4, 5, 6]
 
 
 class TestLogsumexp_axis(XPUTestLogsumexp):
+
     def set_attrs(self):
         self.axis = [0, -1]
 
 
 class TestLogsumexp_axis_all(XPUTestLogsumexp):
+
     def set_attrs(self):
         self.axis = [0, 1, 2, 3]
 
 
 class TestLogsumexp_keepdim(XPUTestLogsumexp):
+
     def set_attrs(self):
         self.keepdim = True
 
 
 class TestLogsumexp_reduce_all(XPUTestLogsumexp):
+
     def set_attrs(self):
         self.reduce_all = True
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py
index d29684b11b070..2dbabdb7c585c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -31,6 +32,7 @@
 
 
 class TestLookupTableOp(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table_v2"
         table = np.random.random((17, 31)).astype("float64")
@@ -43,15 +45,15 @@ def test_check_output_with_place(self):
 
     def test_check_grad(self):
 
-        self.check_grad_with_place(
-            inputs_to_check=['W'],
-            output_names='Out',
-            no_grad_set=set('Ids'),
-            place=paddle.XPUPlace(0),
-            in_place=True)
+        self.check_grad_with_place(inputs_to_check=['W'],
+                                   output_names='Out',
+                                   no_grad_set=set('Ids'),
+                                   place=paddle.XPUPlace(0),
+                                   in_place=True)
 
 
 class TestLookupTableOpWithTensorIds(OpTest):
+
     def setUp(self):
         self.op_type = "lookup_table_v2"
         table = np.random.random((17, 31)).astype("float64")
@@ -63,12 +65,11 @@ def test_check_output(self):
         self.check_output_with_place(place=paddle.XPUPlace(0))
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            inputs_to_check=['W'],
-            output_names='Out',
-            no_grad_set=set('Ids'),
-            place=paddle.XPUPlace(0),
-            in_place=True)
+        self.check_grad_with_place(inputs_to_check=['W'],
+                                   output_names='Out',
+                                   no_grad_set=set('Ids'),
+                                   place=paddle.XPUPlace(0),
+                                   in_place=True)
 
 
 @skip_check_grad_ci(
@@ -76,6 +77,7 @@ def test_check_grad(self):
     "the gradient of paddings makes no sense and we don't "
     "test the gradient here.")
 class TestLookupTableOpWithPadding(TestLookupTableOp):
+
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
@@ -89,6 +91,7 @@ def test_check_output(self):
     "the gradient of paddings makes no sense and we don't "
     "test the gradient here.")
 class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
+
     def test_check_output(self):
         ids = self.inputs['Ids']
         flatten_idx = ids.flatten()
@@ -99,6 +102,7 @@ def test_check_output(self):
 
 
 class TestLookupTableWIsSelectedRows(unittest.TestCase):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
         ids_array = np.array([0, 4, 3, 5]).astype("int64")
@@ -146,12 +150,13 @@ def test_w_is_selected_rows(self):
             self.check_with_place(place)
 
 
-class TestLookupTableWithTensorIdsWIsSelectedRows(
-        TestLookupTableWIsSelectedRows):
+class TestLookupTableWithTensorIdsWIsSelectedRows(TestLookupTableWIsSelectedRows
+                                                  ):
+
     def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
-        ids_array = np.random.randint(
-            low=0, high=6, size=(2, 4, 3)).astype("int64")
+        ids_array = np.random.randint(low=0, high=6,
+                                      size=(2, 4, 3)).astype("int64")
         ids_tensor.set(ids_array, place)
         return ids_array
 
@@ -161,6 +166,7 @@ def check_result(self, ids_array, result_array):
 
 
 class TestLookupTableApi(unittest.TestCase):
+
     def test_api(self):
         x = fluid.layers.data(name='x', shape=[20], dtype='int64')
         emb = fluid.embedding(input=x, size=[128, 64])
@@ -170,12 +176,15 @@ def test_api(self):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'x': x_data, },
+        ret = exe.run(feed={
+            'x': x_data,
+        },
                       fetch_list=[emb],
                       return_numpy=False)
 
 
 class TestEmbedOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             input_data = np.random.randint(0, 10, (4, 6)).astype("int64")
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py
index 990594e1f9edf..2c1ef7755ab70 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -36,10 +37,12 @@ def np_masked_select(x, mask):
 
 
 class XPUTestMaskedSelectOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'masked_select'
 
     class TestMaskedSelectOp(XPUOpTest):
+
         def setUp(self):
             self.init()
             self.dtype = self.in_type
@@ -60,10 +63,12 @@ def init(self):
             self.shape = (50, 3)
 
     class TestMaskedSelectOp1(TestMaskedSelectOp):
+
         def init(self):
             self.shape = (6, 8, 9, 18)
 
     class TestMaskedSelectOp2(TestMaskedSelectOp):
+
         def init(self):
             self.shape = (168, )
 
@@ -74,6 +79,7 @@ def init(self):
 
 
 class TestMaskedSelectAPI(unittest.TestCase):
+
     def test_imperative_mode(self):
         paddle.disable_static(paddle.XPUPlace(0))
         shape = (88, 6, 8)
@@ -99,13 +105,16 @@ def test_static_mode(self):
         exe = paddle.static.Executor(place=paddle.XPUPlace(0))
 
         res = exe.run(paddle.static.default_main_program(),
-                      feed={"x": np_x,
-                            "mask": np_mask},
+                      feed={
+                          "x": np_x,
+                          "mask": np_mask
+                      },
                       fetch_list=[out])
         self.assertEqual(np.allclose(res, np_out), True)
 
 
 class TestMaskedSelectError(unittest.TestCase):
+
     def test_error(self):
         with paddle.static.program_guard(paddle.static.Program(),
                                          paddle.static.Program()):
@@ -113,8 +122,9 @@ def test_error(self):
             shape = [8, 9, 6]
             x = paddle.fluid.data(shape=shape, dtype='float32', name='x')
             mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask')
-            mask_float = paddle.fluid.data(
-                shape=shape, dtype='float32', name='mask_float')
+            mask_float = paddle.fluid.data(shape=shape,
+                                           dtype='float32',
+                                           name='mask_float')
             np_x = np.random.random(shape).astype('float32')
             np_mask = np.array(np.random.randint(2, size=shape, dtype=bool))
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
index 3120f1973f4f8..bc6fa19a35444 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import paddle.fluid.core as core
 import unittest
@@ -161,18 +162,22 @@ def test_negative_dims_program(obj):
                         obj.assertEqual(Ref.shape[idx], output.shape[idx])
                 exe = fluid.Executor(fluid.XPUPlace(0))
                 res, = exe.run(fluid.default_main_program(),
-                               feed={'x': X,
-                                     'y': Y},
+                               feed={
+                                   'x': X,
+                                   'y': Y
+                               },
                                fetch_list=[output])
                 np.allclose(res, Ref, atol=1e-3)
 
 
 class XPUTestMatmulOpErr(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "matmul"
         self.use_dynamic_create_class = False
 
     class TestMatmulOpError(unittest.TestCase):
+
         def test_errors(self):
             with program_guard(Program(), Program()):
                 # The inputs type of matmul_op must be Variable.
@@ -180,15 +185,18 @@ def test_errors(self):
                 self.assertRaises(TypeError, fluid.layers.matmul, input1,
                                   input1)
                 # The inputs dtype of matmul_op must be float32, float16
-                input2 = fluid.layers.data(
-                    name='input2', shape=[10, 10], dtype="int32")
+                input2 = fluid.layers.data(name='input2',
+                                           shape=[10, 10],
+                                           dtype="int32")
                 self.assertRaises(TypeError, fluid.layers.matmul, input2,
                                   input2)
-                input3 = fluid.layers.data(
-                    name='input3', shape=[2, 2], dtype="float16")
+                input3 = fluid.layers.data(name='input3',
+                                           shape=[2, 2],
+                                           dtype="float16")
                 fluid.layers.matmul(input3, input3)
 
     class API_TestMm(unittest.TestCase):
+
         def test_out(self):
             with fluid.program_guard(fluid.Program()):
                 x = fluid.data(name="x", shape=[2], dtype=self.in_type)
@@ -198,15 +206,16 @@ def test_out(self):
                 exe = fluid.Executor(fluid.XPUPlace(0))
                 data1 = np.random.rand(2).astype(self.in_type)
                 data2 = np.random.rand(2).astype(self.in_type)
-                np_res = exe.run(feed={'x': data1,
-                                       'y': data2},
+                np_res = exe.run(feed={
+                    'x': data1,
+                    'y': data2
+                },
                                  fetch_list=[result])
-                expected_result = np.matmul(
-                    data1.reshape(1, 2), data2.reshape(2, 1))
+                expected_result = np.matmul(data1.reshape(1, 2),
+                                            data2.reshape(2, 1))
 
                 self.assertTrue(
-                    np.allclose(
-                        np_res, expected_result, atol=1e-3),
+                    np.allclose(np_res, expected_result, atol=1e-3),
                     "two value is\
                     {}\n{}, check diff!".format(np_res, expected_result))
 
@@ -220,10 +229,10 @@ def test_dygraph_without_out(self):
                 out = paddle.mm(data1, data2)
                 expected_result = np.matmul(input_array1, input_array2)
                 self.assertTrue(
-                    np.allclose(
-                        expected_result, out.numpy(), atol=1e-3))
+                    np.allclose(expected_result, out.numpy(), atol=1e-3))
 
     class Test_API_Matmul(unittest.TestCase):
+
         def test_dygraph_without_out(self):
             device = fluid.XPUPlace(0)
             with fluid.dygraph.guard(device):
@@ -236,43 +245,51 @@ def test_dygraph_without_out(self):
                 out = paddle.matmul(data1, data2)
                 expected_result = np.matmul(input_array1, input_array2)
                 self.assertTrue(
-                    np.allclose(
-                        expected_result, out.numpy(), atol=1e-3))
+                    np.allclose(expected_result, out.numpy(), atol=1e-3))
 
     class API_TestMmError(unittest.TestCase):
+
         def test_errors(self):
+
             def test_error1():
                 with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    data1 = fluid.data(
-                        name="data1", shape=[10, 2], dtype="float32")
-                    data2 = fluid.data(
-                        name="data2", shape=[3, 10], dtype="float32")
+                    data1 = fluid.data(name="data1",
+                                       shape=[10, 2],
+                                       dtype="float32")
+                    data2 = fluid.data(name="data2",
+                                       shape=[3, 10],
+                                       dtype="float32")
                     paddle.mm(data1, data2)
 
             self.assertRaises(ValueError, test_error1)
 
             def test_error2():
                 with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    data1 = fluid.data(
-                        name="data1", shape=[-1, 10, 2], dtype="float32")
-                    data2 = fluid.data(
-                        name="data2", shape=[-1, 2, 10], dtype="float32")
+                    data1 = fluid.data(name="data1",
+                                       shape=[-1, 10, 2],
+                                       dtype="float32")
+                    data2 = fluid.data(name="data2",
+                                       shape=[-1, 2, 10],
+                                       dtype="float32")
                     paddle.mm(data1, data2)
 
             test_error2()
 
             def test_error3():
                 with fluid.program_guard(fluid.Program(), fluid.Program()):
-                    data1 = fluid.data(
-                        name="data1", shape=[10, 10, 2], dtype="float32")
-                    data2 = fluid.data(
-                        name="data2", shape=[3, 2, 10], dtype="float32")
+                    data1 = fluid.data(name="data1",
+                                       shape=[10, 10, 2],
+                                       dtype="float32")
+                    data2 = fluid.data(name="data2",
+                                       shape=[3, 2, 10],
+                                       dtype="float32")
                     paddle.mm(data1, data2)
 
             self.assertRaises(ValueError, test_error3)
 
 
 class TestMatmulBaseGenerator(XPUOpTest):
+
     def setUp(self):
         self.op_type = "matmul"
         self.dtype = np.float32 if not hasattr(self,
@@ -297,21 +314,27 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         place = paddle.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
+        self.check_grad_with_place(place, ['X', 'Y'],
+                                   'Out',
+                                   max_relative_error=5e-2)
 
     def test_check_grad_ignore_x(self):
         place = paddle.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['Y'], 'Out', max_relative_error=5e-2, no_grad_set=set("X"))
+        self.check_grad_with_place(place, ['Y'],
+                                   'Out',
+                                   max_relative_error=5e-2,
+                                   no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
         place = paddle.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=5e-2, no_grad_set=set('Y'))
+        self.check_grad_with_place(place, ['X'],
+                                   'Out',
+                                   max_relative_error=5e-2,
+                                   no_grad_set=set('Y'))
 
 
 class XPUTestMatmulOp1(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "matmul"
         self.use_dynamic_create_class = True
@@ -328,8 +351,9 @@ def dynamic_create_class(self):
                 for transose_y in [True, False]:
                     for batch in batch_size:
                         class_name = (
-                            'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'.
-                            format(dim_X, dim_Y, transose_x, transose_y, batch))
+                            'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'
+                            .format(dim_X, dim_Y, transose_x, transose_y,
+                                    batch))
                         shape_x, shape_y = generate_compatible_shapes(
                             dim_X, dim_Y, transose_x, transose_y, batch)
                         attr_dict = {
@@ -345,6 +369,7 @@ def dynamic_create_class(self):
 
 
 class XPUTestMatmulOp2(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "matmul"
         self.use_dynamic_create_class = True
@@ -361,8 +386,9 @@ def dynamic_create_class(self):
                 for transose_y in [True, False]:
                     for batch in batch_size:
                         class_name = (
-                            'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'.
-                            format(dim_X, dim_Y, transose_x, transose_y, batch))
+                            'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'
+                            .format(dim_X, dim_Y, transose_x, transose_y,
+                                    batch))
                         shape_x, shape_y = generate_compatible_shapes(
                             dim_X, dim_Y, transose_x, transose_y, batch)
                         attr_dict = {
@@ -377,6 +403,7 @@ def dynamic_create_class(self):
 
 
 class XPUTestMatmulOp3(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "matmul"
         self.use_dynamic_create_class = True
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
index 3db3031f44c80..8f31981355403 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -59,6 +60,7 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
 
 
 class XPUTestMatmulV2Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "matmul_v2"
         self.use_dynamic_create_class = False
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
index 896821552c9f7..0ddc38dbceba6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 from op_test import OpTest
@@ -29,6 +30,7 @@
 
 
 class TestMeanOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "mean"
         self.init_dtype_type()
@@ -52,21 +54,25 @@ def test_checkout_grad(self):
 
 
 class TestMeanOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of mean_op must be Variable.
             input1 = 12
             self.assertRaises(TypeError, fluid.layers.mean, input1)
             # The input dtype of mean_op must be float16, float32, float64.
-            input2 = fluid.layers.data(
-                name='input2', shape=[12, 10], dtype="int32")
+            input2 = fluid.layers.data(name='input2',
+                                       shape=[12, 10],
+                                       dtype="int32")
             self.assertRaises(TypeError, fluid.layers.mean, input2)
-            input3 = fluid.layers.data(
-                name='input3', shape=[4], dtype="float16")
+            input3 = fluid.layers.data(name='input3',
+                                       shape=[4],
+                                       dtype="float16")
             fluid.layers.softmax(input3)
 
 
 class TestXPUMeanOp(TestMeanOp):
+
     def init_dtype_type(self):
         self.dtype = np.float32
 
@@ -84,6 +90,7 @@ def test_checkout_grad(self):
 
 
 class TestXPUMeanOpFp16(TestMeanOp):
+
     def init_dtype_type(self):
         self.dtype = np.float16
 
@@ -97,8 +104,9 @@ def test_checkout_grad(self):
         if paddle.is_compiled_with_xpu():
             paddle.enable_static()
             place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['X'], 'Out', max_relative_error=1.e1)
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       max_relative_error=1.e1)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
index f7c1f0041e805..a33b3e4755196 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -50,11 +51,13 @@ def calculate_momentum_by_numpy(param, grad, mu, velocity, use_nesterov,
 
 
 class XPUTestMomentumOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'momentum'
         self.use_dynamic_create_class = False
 
     class TestMomentumOPBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.xpu_version = core.get_xpu_device_version(0)
@@ -70,8 +73,8 @@ def set_case(self):
                                            self.input_shape).astype(self.dtype)
             self.grad = np.random.uniform(-1, 1,
                                           self.input_shape).astype(self.dtype)
-            self.velocity = np.random.uniform(
-                -1, 1, self.input_shape).astype(self.dtype)
+            self.velocity = np.random.uniform(-1, 1, self.input_shape).astype(
+                self.dtype)
 
             param_out, velocity_out = calculate_momentum_by_numpy(
                 param=self.param,
@@ -112,6 +115,7 @@ def init_config(self):
             self.regularization_coeff = 0
 
     class XPUTestMomentum1(TestMomentumOPBase):
+
         def init_config(self):
             self.input_shape = [2, 768]
             self.learning_rate = np.array([0.002]).astype(self.dtype)
@@ -121,6 +125,7 @@ def init_config(self):
             self.regularization_coeff = 0
 
     class XPUTestMomentum2(TestMomentumOPBase):
+
         def init_config(self):
             self.input_shape = [3, 8, 4096]
             self.learning_rate = np.array([0.005]).astype(self.dtype)
@@ -130,6 +135,7 @@ def init_config(self):
             self.regularization_coeff = 0
 
     class XPUTestMomentum3(TestMomentumOPBase):
+
         def init_config(self):
             self.input_shape = [1024]
             self.learning_rate = np.array([0.01]).astype(self.dtype)
@@ -144,6 +150,7 @@ def init_config(self):
                 self.regularization_coeff = 0
 
     class XPUTestMomentum4(TestMomentumOPBase):
+
         def init_config(self):
             self.input_shape = [2, 2, 255]
             self.learning_rate = np.array([0.0005]).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py
index 9d98ab70041e9..87667e4f13973 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 import paddle.fluid as fluid
@@ -31,13 +32,14 @@
 
 
 class TestMulOpError(unittest.TestCase):
+
     def test_errors(self):
         with program_guard(Program(), Program()):
             # The input type of mul_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.XPUPlace(0))
-            x2 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], fluid.XPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.XPUPlace(0))
+            x2 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         fluid.XPUPlace(0))
             self.assertRaises(TypeError, fluid.layers.mul, x1, x2)
             # The input dtype of mul_op must be float32.
             x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
@@ -46,11 +48,13 @@ def test_errors(self):
 
 
 class XPUTestMulOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'mul'
         self.use_dynamic_create_class = False
 
     class TestXPUMulOp1(XPUOpTest):
+
         def setUp(self):
             self.op_type = "mul"
             self.dtype = self.in_type
@@ -75,28 +79,28 @@ def test_check_output(self):
         def test_check_grad_normal(self):
             place = paddle.XPUPlace(0)
             paddle.enable_static()
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.1)
+            self.check_grad_with_place(place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=0.1)
 
         def test_check_grad_ingore_x(self):
             place = paddle.XPUPlace(0)
             paddle.enable_static()
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=0.1,
-                no_grad_set=set("X"))
+            self.check_grad_with_place(place, ['Y'],
+                                       'Out',
+                                       max_relative_error=0.1,
+                                       no_grad_set=set("X"))
 
         def test_check_grad_ignore_y(self):
             place = paddle.XPUPlace(0)
             paddle.enable_static()
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=0.1,
-                no_grad_set=set('Y'))
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.1,
+                                       no_grad_set=set('Y'))
 
     class TestXPUMulOp2(XPUOpTest):
+
         def setUp(self):
             self.op_type = "mul"
             self.use_xpu = True
@@ -115,26 +119,25 @@ def test_check_output(self):
         def test_check_grad_normal(self):
             place = paddle.XPUPlace(0)
             paddle.enable_static()
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.1)
+            self.check_grad_with_place(place, ['X', 'Y'],
+                                       'Out',
+                                       max_relative_error=0.1)
 
         def test_check_grad_ingore_x(self):
             place = paddle.XPUPlace(0)
             paddle.enable_static()
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=0.1,
-                no_grad_set=set("X"))
+            self.check_grad_with_place(place, ['Y'],
+                                       'Out',
+                                       max_relative_error=0.1,
+                                       no_grad_set=set("X"))
 
         def test_check_grad_ingore_y(self):
             place = paddle.XPUPlace(0)
             paddle.enable_static()
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=0.1,
-                no_grad_set=set('Y'))
+            self.check_grad_with_place(place, ['X'],
+                                       'Out',
+                                       max_relative_error=0.1,
+                                       no_grad_set=set('Y'))
 
 
 support_types = get_xpu_op_support_types('mul')
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py
index 731358d5304b4..0bfa73d6863ef 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
index 7a3b4a5a2179a..6e80f501243da 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -160,11 +161,13 @@ def nearest_neighbor_interp3d_np(X,
 
 
 class XPUNearestInterpOpWrapper(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'nearest_interp_v2'
         self.use_dynamic_create_class = False
 
     class TestNearestInterpOp(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_dtype()
@@ -298,18 +301,21 @@ def init_test_case(self):
     """
 
     class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [3, 3, 9, 6]
             self.out_h = 12
             self.out_w = 12
 
     class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [1, 1, 32, 64]
             self.out_h = 64
             self.out_w = 32
 
     class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [4, 1, 7, 8]
             self.out_h = 1
@@ -317,6 +323,7 @@ def init_test_case(self):
             self.out_size = np.array([2, 2]).astype("int32")
 
     class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [3, 3, 9, 6]
             self.out_h = 12
@@ -324,6 +331,7 @@ def init_test_case(self):
             self.out_size = np.array([11, 11]).astype("int32")
 
     class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [1, 1, 32, 64]
             self.out_h = 64
@@ -331,12 +339,14 @@ def init_test_case(self):
             self.out_size = np.array([65, 129]).astype("int32")
 
     class TestNearestNeighborInterpSame(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [2, 3, 32, 64]
             self.out_h = 32
             self.out_w = 64
 
     class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [3, 2, 32, 16]
             self.out_h = 64
@@ -358,10 +368,12 @@ def init_test_case(self):
     """
 
     class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+
         def set_align_corners(self):
             self.align_corners = False
 
     class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [3, 2, 7, 5]
             self.out_h = 64
@@ -370,6 +382,7 @@ def init_test_case(self):
             self.out_size = np.array([66, 40]).astype("int32")
 
     class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [3, 2, 5, 7]
             self.out_h = 64
@@ -378,6 +391,7 @@ def init_test_case(self):
             self.out_size = np.array([66, 40]).astype("int32")
 
     class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+
         def init_test_case(self):
             self.input_shape = [3, 2, 7, 5]
             self.out_h = 64
@@ -400,6 +414,7 @@ def init_test_case(self):
     """
 
     class TestNearestInterpOp_attr_tensor(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_dtype()
@@ -458,9 +473,10 @@ def setUp(self):
                 if isinstance(self.scale, list) and len(self.scale) == 1:
                     self.scale = [self.scale[0], self.scale[0]]
                 self.attrs['scale'] = self.scale
-            output_np = nearest_neighbor_interp_np(
-                input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape,
-                self.align_corners)
+            output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0,
+                                                   self.out_size,
+                                                   self.actual_shape,
+                                                   self.align_corners)
             self.outputs = {'Out': output_np}
 
         def init_dtype(self):
@@ -480,6 +496,7 @@ def init_test_case(self):
 
     # out_size is a tensor list
     class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor):
+
         def init_test_case(self):
             self.input_shape = [3, 3, 9, 6]
             self.out_h = 12
@@ -488,6 +505,7 @@ def init_test_case(self):
 
     # out_size is a 1-D tensor
     class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor):
+
         def init_test_case(self):
             self.input_shape = [3, 2, 32, 16]
             self.out_h = 64
@@ -497,6 +515,7 @@ def init_test_case(self):
 
     # scale is a 1-D tensor
     class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor):
+
         def init_test_case(self):
             self.input_shape = [3, 2, 32, 16]
             self.out_h = 64
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py
index 8c8406ba433de..33b59a8de659e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py
index 9f937caa37ebf..afeccd637a265 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.fluid.core as core
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 import paddle.fluid as fluid
@@ -29,6 +30,7 @@
 
 
 class TestOneHotOp(XPUOpTest):
+
     def setUp(self):
         self.use_xpu = True
         self.op_type = 'one_hot_v2'
@@ -54,6 +56,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_attr(XPUOpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -78,6 +81,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype(XPUOpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -102,6 +106,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_default_dtype_attr(XPUOpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -126,6 +131,7 @@ def test_check_output(self):
 
 
 class TestOneHotOp_out_of_range(XPUOpTest):
+
     def setUp(self):
         self.op_type = 'one_hot_v2'
         depth = 10
@@ -145,6 +151,7 @@ def test_check_output(self):
 
 
 class TestOneHotOpApi(unittest.TestCase):
+
     def test_api(self):
         depth = 10
         self._run(depth)
@@ -171,21 +178,23 @@ def _run(self, depth):
 
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        ret = exe.run(feed={'label': label_data, },
+        ret = exe.run(feed={
+            'label': label_data,
+        },
                       fetch_list=[one_hot_label],
                       return_numpy=False)
 
 
 class BadInputTestOnehotV2(unittest.TestCase):
+
     def test_error(self):
         with fluid.program_guard(fluid.Program()):
 
             def test_bad_x():
-                label = fluid.layers.data(
-                    name="label",
-                    shape=[4],
-                    append_batch_size=False,
-                    dtype="float32")
+                label = fluid.layers.data(name="label",
+                                          shape=[4],
+                                          append_batch_size=False,
+                                          dtype="float32")
                 one_hot_label = fluid.one_hot(input=label, depth=4)
 
             self.assertRaises(TypeError, test_bad_x)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
index fcd0de2a1fddd..5ab62af7104e9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
@@ -16,6 +16,7 @@
 from __future__ import division
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -46,11 +47,11 @@ def max_pool2D_forward_naive(x,
     if adaptive:
         H_out, W_out = ksize
     else:
-        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
+        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] -
+                 1) // strides[1] + 1 if ceil_mode else (
                      W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
@@ -86,11 +87,11 @@ def avg_pool2D_forward_naive(x,
     if adaptive:
         H_out, W_out = ksize
     else:
-        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
-                 ) // strides[0] + 1 if ceil_mode else (
+        H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] -
+                 1) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
-        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
-                 ) // strides[1] + 1 if ceil_mode else (
+        W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] -
+                 1) // strides[1] + 1 if ceil_mode else (
                      W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
@@ -117,9 +118,9 @@ def avg_pool2D_forward_naive(x,
                 field_size = (r_end - r_start) * (c_end - c_start)
 
             if data_type == np.int8 or data_type == np.uint8:
-                out[:, :, i, j] = (np.rint(
-                    np.sum(x_masked, axis=(2, 3)) /
-                    field_size)).astype(data_type)
+                out[:, :, i,
+                    j] = (np.rint(np.sum(x_masked, axis=(2, 3)) /
+                                  field_size)).astype(data_type)
             else:
                 out[:, :, i, j] = (np.sum(x_masked, axis=(2, 3)) /
                                    field_size).astype(data_type)
@@ -144,8 +145,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         for input_size, filter_size, stride_size in zip(input_shape, pool_size,
                                                         pool_stride):
             out_size = int((input_size + stride_size - 1) / stride_size)
-            pad_sum = np.max((
-                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_sum = np.max(
+                ((out_size - 1) * stride_size + filter_size - input_size, 0))
             pad_0 = int(pad_sum / 2)
             pad_1 = int(pad_sum - pad_0)
             padding.append(pad_0)
@@ -228,8 +229,9 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                 x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end]
                 if pool_type == 'avg':
                     if (exclusive or adaptive):
-                        field_size = (in_h_end - in_h_start) * (
-                            in_w_end - in_w_start)
+                        field_size = (in_h_end - in_h_start) * (in_w_end -
+                                                                in_w_start)
+
 
 #                         if (exclusive or adaptive) else (ksize[0] * ksize[1])
                     out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
@@ -239,8 +241,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                 x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
                 if pool_type == 'avg':
                     if (exclusive or adaptive):
-                        field_size = (in_h_end - in_h_start) * (
-                            in_w_end - in_w_start)
+                        field_size = (in_h_end - in_h_start) * (in_w_end -
+                                                                in_w_start)
                     out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
                 elif pool_type == 'max':
                     out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
@@ -248,11 +250,13 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
 
 class XPUTestPool2D_Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'pool2d'
         self.use_dynamic_create_class = False
 
     class TestPool2D_Op(XPUOpTest):
+
         def setUp(self):
             self.op_type = "pool2d"
             self.dtype = self.in_type
@@ -337,6 +341,7 @@ def init_adaptive(self):
             self.adaptive = False
 
     class TestCase1(TestPool2D_Op):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -355,6 +360,7 @@ def init_shape(self):
             self.shape = [2, 3, 7, 7]
 
     class TestCase2(TestPool2D_Op):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -373,21 +379,25 @@ def init_shape(self):
             self.shape = [2, 3, 7, 7]
 
     class TestCase3(TestPool2D_Op):
+
         def init_pool_type(self):
             self.pool_type = "max"
             self.pool2D_forward_naive = max_pool2D_forward_naive
 
     class TestCase4(TestCase1):
+
         def init_pool_type(self):
             self.pool_type = "max"
             self.pool2D_forward_naive = max_pool2D_forward_naive
 
     class TestCase5(TestCase2):
+
         def init_pool_type(self):
             self.pool_type = "max"
             self.pool2D_forward_naive = max_pool2D_forward_naive
 
     class TestPool2D_AsyPadding(TestPool2D_Op):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -397,6 +407,7 @@ def init_shape(self):
             self.shape = [2, 3, 5, 5]
 
     class TestCase1_AsyPadding(TestCase1):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -406,6 +417,7 @@ def init_shape(self):
             self.shape = [2, 3, 7, 7]
 
     class TestCase2_AsyPadding(TestCase2):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -415,6 +427,7 @@ def init_shape(self):
             self.shape = [2, 3, 7, 7]
 
     class TestCase3_AsyPadding(TestCase3):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -424,6 +437,7 @@ def init_shape(self):
             self.shape = [2, 3, 5, 5]
 
     class TestCase4_AsyPadding(TestCase4):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -433,6 +447,7 @@ def init_shape(self):
             self.shape = [2, 3, 7, 7]
 
     class TestCase5_AsyPadding(TestCase5):
+
         def init_test_case(self):
             self.ksize = [3, 3]
             self.strides = [1, 1]
@@ -442,6 +457,7 @@ def init_shape(self):
             self.shape = [2, 3, 7, 7]
 
     class TestAvgInclude_AsyPadding(TestCase2):
+
         def init_exclusive(self):
             self.exclusive = False
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py
index 0830237d5a89d..c8fcffbd3d33d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py
@@ -18,6 +18,7 @@
 import numpy as np
 import sys
 import unittest
+
 sys.path.append("..")
 
 import paddle
@@ -29,11 +30,13 @@
 
 
 class XPUTestPriorBoxOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'prior_box'
         self.use_dynamic_create_class = False
 
     class TestPriorBoxOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "prior_box"
             self.use_xpu = True
@@ -94,8 +97,8 @@ def init_test_params(self):
             self.flip = True
             self.set_min_max_aspect_ratios_order()
             self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
-            self.aspect_ratios = np.array(
-                self.aspect_ratios, dtype=np.float).flatten()
+            self.aspect_ratios = np.array(self.aspect_ratios,
+                                          dtype=np.float).flatten()
             self.variances = [0.1, 0.1, 0.2, 0.2]
             self.variances = np.array(self.variances, dtype=np.float).flatten()
 
@@ -133,40 +136,40 @@ def init_test_output(self):
                                 ar = self.real_aspect_ratios[r]
                                 c_w = min_size * math.sqrt(ar) / 2
                                 c_h = (min_size / math.sqrt(ar)) / 2
-                                out_boxes[h, w, idx, :] = [
-                                    (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                    self.image_h, (c_x + c_w) / self.image_w,
-                                    (c_y + c_h) / self.image_h
-                                ]
+                                out_boxes[h, w,
+                                          idx, :] = [(c_x - c_w) / self.image_w,
+                                                     (c_y - c_h) / self.image_h,
+                                                     (c_x + c_w) / self.image_w,
+                                                     (c_y + c_h) / self.image_h]
                                 idx += 1
 
                             if len(self.max_sizes) > 0:
                                 max_size = self.max_sizes[s]
                                 # second prior: aspect_ratio = 1,
                                 c_w = c_h = math.sqrt(min_size * max_size) / 2
-                                out_boxes[h, w, idx, :] = [
-                                    (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                    self.image_h, (c_x + c_w) / self.image_w,
-                                    (c_y + c_h) / self.image_h
-                                ]
+                                out_boxes[h, w,
+                                          idx, :] = [(c_x - c_w) / self.image_w,
+                                                     (c_y - c_h) / self.image_h,
+                                                     (c_x + c_w) / self.image_w,
+                                                     (c_y + c_h) / self.image_h]
                                 idx += 1
                         else:
                             c_w = c_h = min_size / 2.
-                            out_boxes[h, w, idx, :] = [
-                                (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                self.image_h, (c_x + c_w) / self.image_w,
-                                (c_y + c_h) / self.image_h
-                            ]
+                            out_boxes[h, w,
+                                      idx, :] = [(c_x - c_w) / self.image_w,
+                                                 (c_y - c_h) / self.image_h,
+                                                 (c_x + c_w) / self.image_w,
+                                                 (c_y + c_h) / self.image_h]
                             idx += 1
                             if len(self.max_sizes) > 0:
                                 max_size = self.max_sizes[s]
                                 # second prior: aspect_ratio = 1,
                                 c_w = c_h = math.sqrt(min_size * max_size) / 2
-                                out_boxes[h, w, idx, :] = [
-                                    (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                    self.image_h, (c_x + c_w) / self.image_w,
-                                    (c_y + c_h) / self.image_h
-                                ]
+                                out_boxes[h, w,
+                                          idx, :] = [(c_x - c_w) / self.image_w,
+                                                     (c_y - c_h) / self.image_h,
+                                                     (c_x + c_w) / self.image_w,
+                                                     (c_y + c_h) / self.image_h]
                                 idx += 1
 
                             # rest of priors
@@ -176,27 +179,29 @@ def init_test_output(self):
                                     continue
                                 c_w = min_size * math.sqrt(ar) / 2
                                 c_h = (min_size / math.sqrt(ar)) / 2
-                                out_boxes[h, w, idx, :] = [
-                                    (c_x - c_w) / self.image_w, (c_y - c_h) /
-                                    self.image_h, (c_x + c_w) / self.image_w,
-                                    (c_y + c_h) / self.image_h
-                                ]
+                                out_boxes[h, w,
+                                          idx, :] = [(c_x - c_w) / self.image_w,
+                                                     (c_y - c_h) / self.image_h,
+                                                     (c_x + c_w) / self.image_w,
+                                                     (c_y + c_h) / self.image_h]
                                 idx += 1
 
             # clip the prior's coordidate such that it is within[0, 1]
             if self.clip:
                 out_boxes = np.clip(out_boxes, 0.0, 1.0)
             # set the variance.
-            out_var = np.tile(self.variances, (self.layer_h, self.layer_w,
-                                               self.num_priors, 1))
+            out_var = np.tile(self.variances,
+                              (self.layer_h, self.layer_w, self.num_priors, 1))
             self.out_boxes = out_boxes.astype(self.dtype)
             self.out_var = out_var.astype(self.dtype)
 
     class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp):
+
         def set_max_sizes(self):
             self.max_sizes = []
 
     class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp):
+
         def set_min_max_aspect_ratios_order(self):
             self.min_max_aspect_ratios_order = True
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py
index f9c49a81ef30c..2ea100a2def88 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py
@@ -18,6 +18,7 @@
 import paddle
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
@@ -26,11 +27,13 @@
 
 
 class XPUTestRangeOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "range"
         self.use_dynamic_create_class = False
 
     class TestRangeOp(XPUOpTest):
+
         def setUp(self):
             self.set_xpu()
             self.op_type = "range"
@@ -43,8 +46,9 @@ def setUp(self):
             }
 
             self.outputs = {
-                'Out': np.arange(self.case[0], self.case[1],
-                                 self.case[2]).astype(self.dtype)
+                'Out':
+                np.arange(self.case[0], self.case[1],
+                          self.case[2]).astype(self.dtype)
             }
 
         def set_xpu(self):
@@ -61,22 +65,27 @@ def test_check_output(self):
             self.check_output_with_place(place, check_dygraph=False)
 
     class TestRangeOpCase0(TestRangeOp):
+
         def init_config(self):
             self.case = (0, 5, 1)
 
     class TestRangeOpCase1(TestRangeOp):
+
         def init_config(self):
             self.case = (0, 5, 2)
 
     class TestRangeOpCase2(TestRangeOp):
+
         def init_config(self):
             self.case = (10, 1, -2)
 
     class TestRangeOpCase3(TestRangeOp):
+
         def init_config(self):
             self.case = (-1, -10, -2)
 
     class TestRangeOpCase4(TestRangeOp):
+
         def init_config(self):
             self.case = (10, -10, -11)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py
index b4dc8e7b7cfd1..ceb38c22630a6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -28,10 +29,12 @@
 
 
 class XPUTestReduceAllOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'reduce_all'
 
     class XPUTestReduceAllBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.set_case()
@@ -45,8 +48,8 @@ def set_case(self):
                 'dim': (3, 5, 4)
             }
             self.inputs = {
-                'X': np.random.randint(0, 2,
-                                       (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+                'X':
+                np.random.randint(0, 2, (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
             }
             self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
 
@@ -57,6 +60,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestReduceAllCase1(XPUTestReduceAllBase):
+
         def set_case(self):
             self.op_type = 'reduce_all'
             self.attrs = {
@@ -71,6 +75,7 @@ def set_case(self):
             self.outputs = {'Out': self.inputs['X'].all()}
 
     class XPUTestReduceAllCase2(XPUTestReduceAllBase):
+
         def set_case(self):
             self.op_type = 'reduce_all'
             self.attrs = {
@@ -80,12 +85,13 @@ def set_case(self):
                 'dim': (3, 6)
             }
             self.inputs = {
-                'X': np.random.randint(0, 2,
-                                       (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+                'X':
+                np.random.randint(0, 2, (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
             }
             self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
 
     class XPUTestReduceAllCase3(XPUTestReduceAllBase):
+
         def set_case(self):
             self.op_type = 'reduce_all'
             self.attrs = {
@@ -98,8 +104,7 @@ def set_case(self):
                 'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")
             }
             self.outputs = {
-                'Out': np.expand_dims(
-                    self.inputs['X'].all(axis=1), axis=1)
+                'Out': np.expand_dims(self.inputs['X'].all(axis=1), axis=1)
             }
 
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
index 1dd7b42e5eb05..ac827b6738f8f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -28,10 +29,12 @@
 
 
 class XPUTestReduceMaxOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'reduce_max'
 
     class XPUTestReduceMaxBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -49,8 +52,9 @@ def set_case(self):
                 self.outputs = {'Out': self.inputs['X'].max()}
             else:
                 self.outputs = {
-                    'Out': self.inputs['X'].max(axis=self.axis,
-                                                keepdims=self.attrs['keep_dim'])
+                    'Out':
+                    self.inputs['X'].max(axis=self.axis,
+                                         keepdims=self.attrs['keep_dim'])
                 }
 
         def init_case(self):
@@ -66,6 +70,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestReduceMaxCase1(XPUTestReduceMaxBase):
+
         def init_case(self):
             self.shape = (5, 6, 10)
             self.axis = (0, )
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py
index 18a588b1b88da..ef483870c68ee 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest, skip_check_grad_ci
 import paddle
@@ -27,6 +28,7 @@
 
 
 class TestMeanOp(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
@@ -43,6 +45,7 @@ def check_grad_(self):
 
 
 class TestMeanOp5D(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {
@@ -61,6 +64,7 @@ def test_check_grad(self):
 
 
 class TestMeanOp6D(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {
@@ -79,6 +83,7 @@ def test_check_grad(self):
 
 
 class TestMeanOp8D(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {
@@ -97,6 +102,7 @@ def test_check_grad(self):
 
 
 class Test1DReduce(OpTest):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random(120).astype("float32")}
@@ -113,6 +119,7 @@ def test_check_grad(self):
 
 
 class Test2DReduce0(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.attrs = {'dim': [0], 'use_xpu': True}
@@ -121,6 +128,7 @@ def setUp(self):
 
 
 class Test2DReduce1(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.attrs = {'dim': [1], 'use_xpu': True}
@@ -131,6 +139,7 @@ def setUp(self):
 
 
 class Test3DReduce0(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.attrs = {'dim': [1], 'use_xpu': True}
@@ -141,6 +150,7 @@ def setUp(self):
 
 
 class Test3DReduce1(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.attrs = {'dim': [2], 'use_xpu': True}
@@ -151,6 +161,7 @@ def setUp(self):
 
 
 class Test3DReduce2(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.attrs = {'dim': [-2], 'use_xpu': True}
@@ -161,6 +172,7 @@ def setUp(self):
 
 
 class Test3DReduce3(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.attrs = {'dim': [1, 2], 'use_xpu': True}
@@ -171,17 +183,20 @@ def setUp(self):
 
 
 class TestKeepDimReduce(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
         self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True}
         self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].mean(axis=tuple(self.attrs['dim']),
+                                  keepdims=self.attrs['keep_dim'])
         }
 
 
 class TestKeepDim8DReduce(Test1DReduce):
+
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {
@@ -189,8 +204,9 @@ def setUp(self):
         }
         self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True}
         self.outputs = {
-            'Out': self.inputs['X'].mean(
-                axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
+            'Out':
+            self.inputs['X'].mean(axis=tuple(self.attrs['dim']),
+                                  keepdims=self.attrs['keep_dim'])
         }
 
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
index cf77ea09a581c..85a12bea3bed0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -28,10 +29,12 @@
 
 
 class XPUTestReduceMinOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'reduce_min'
 
     class XPUTestReduceMinBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -49,8 +52,9 @@ def set_case(self):
                 self.outputs = {'Out': self.inputs['X'].min()}
             else:
                 self.outputs = {
-                    'Out': self.inputs['X'].min(axis=self.axis,
-                                                keepdims=self.attrs['keep_dim'])
+                    'Out':
+                    self.inputs['X'].min(axis=self.axis,
+                                         keepdims=self.attrs['keep_dim'])
                 }
 
         def init_case(self):
@@ -66,6 +70,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestReduceMinCase1(XPUTestReduceMinBase):
+
         def init_case(self):
             self.shape = (5, 6, 10)
             self.axis = (0, )
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py
index b621cb59c0ed0..155adaa37c0ca 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -29,11 +30,13 @@
 
 
 class XPUTestReduceProdOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'reduce_prod'
         self.use_dynamic_create_class = False
 
     class TestXPUReduceProdOp(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_dtype()
@@ -52,8 +55,9 @@ def setUp(self):
                 self.outputs = {'Out': self.inputs['X'].prod()}
             else:
                 self.outputs = {
-                    'Out': self.inputs['X'].prod(
-                        axis=self.axis, keepdims=self.attrs['keep_dim'])
+                    'Out':
+                    self.inputs['X'].prod(axis=self.axis,
+                                          keepdims=self.attrs['keep_dim'])
                 }
 
         def initTestCase(self):
@@ -70,68 +74,81 @@ def test_check_grad(self):
             self.check_grad_with_place(self.place, ['X'], 'Out')
 
     class TestProdOp5D(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (1, 2, 5, 6, 10)
             self.axis = (0, )
 
     class TestProdOp6D(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (1, 1, 2, 5, 6, 10)
             self.axis = (0, )
 
     class TestProdOp8D(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (1, 3, 1, 2, 1, 4, 3, 10)
             self.axis = (0, 3)
 
     class Test1DReduce(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = 120
             self.axis = (0, )
 
     class Test2DReduce0(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (20, 10)
             self.axis = (0, )
 
     class Test2DReduce1(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (20, 10)
             self.axis = (1, )
 
     class Test3DReduce0(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (5, 6, 7)
             self.axis = (1, )
 
     class Test3DReduce1(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (5, 6, 7)
             self.axis = (2, )
 
     class Test3DReduce2(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (5, 6, 7)
             self.axis = (-2, )
 
     class Test3DReduce3(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (5, 6, 7)
             self.axis = (1, 2)
 
     class TestKeepDimReduce(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (5, 6, 10)
             self.axis = (1, )
             self.keep_dim = True
 
     class TestKeepDim8DReduce(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (2, 5, 3, 2, 2, 3, 4, 2)
             self.axis = (3, 4, 5)
             self.keep_dim = True
 
     class TestReduceAll(TestXPUReduceProdOp):
+
         def initTestCase(self):
             self.shape = (5, 6, 2, 10)
             self.axis = (0, )
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
index 9f42a509624b9..d80fd187dfdf7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -28,10 +29,12 @@
 
 
 class XPUTestReduceSumOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'reduce_sum'
 
     class XPUTestReduceSumBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_case()
@@ -49,8 +52,9 @@ def set_case(self):
                 self.outputs = {'Out': self.inputs['X'].sum()}
             else:
                 self.outputs = {
-                    'Out': self.inputs['X'].sum(axis=self.axis,
-                                                keepdims=self.attrs['keep_dim'])
+                    'Out':
+                    self.inputs['X'].sum(axis=self.axis,
+                                         keepdims=self.attrs['keep_dim'])
                 }
 
         def init_case(self):
@@ -66,6 +70,7 @@ def test_check_grad(self):
             pass
 
     class XPUTestReduceSumCase1(XPUTestReduceSumBase):
+
         def init_case(self):
             self.shape = (5, 6, 10)
             self.axis = (0, )
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
index 9d1a5ca1fbdd6..9b71482fcc662 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -43,6 +44,7 @@ def huber_loss_forward(val, delta):
 # 1.动态生成不同参数的测试case，wrapper类中必须实现dynamic_create_class方法
 # self.use_dynamic_create_class置为True
 class XPUTestArgsortOp1(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'argsort'
         self.use_dynamic_create_class = True
@@ -59,6 +61,7 @@ def dynamic_create_class(self):
         return base_class, classes
 
     class TestArgsortOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "argsort"
             self.place = paddle.XPUPlace(0)
@@ -73,9 +76,10 @@ def setUp(self):
             if self.in_type == np.float32:
                 self.x = np.random.random(self.input_shape).astype(self.dtype)
             else:
-                self.x = np.random.randint(
-                    low=-1000, high=1000,
-                    size=self.input_shape).astype(self.dtype)
+                self.x = np.random.randint(low=-1000,
+                                           high=1000,
+                                           size=self.input_shape).astype(
+                                               self.dtype)
             self.inputs = {"X": self.x}
             self.attrs = {"axis": self.axis, "descending": self.descending}
             self.get_output()
@@ -84,15 +88,14 @@ def setUp(self):
         def get_output(self):
             if self.descending:
                 self.indices = np.flip(
-                    np.argsort(
-                        self.x, kind='heapsort', axis=self.axis),
+                    np.argsort(self.x, kind='heapsort', axis=self.axis),
                     self.axis)
                 self.sorted_x = np.flip(
-                    np.sort(
-                        self.x, kind='heapsort', axis=self.axis), self.axis)
+                    np.sort(self.x, kind='heapsort', axis=self.axis), self.axis)
             else:
-                self.indices = np.argsort(
-                    self.x, kind='heapsort', axis=self.axis)
+                self.indices = np.argsort(self.x,
+                                          kind='heapsort',
+                                          axis=self.axis)
                 self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis)
 
         def test_check_output(self):
@@ -101,11 +104,13 @@ def test_check_output(self):
 
 # 2. 为不同参数的测试case定义一个测试类，self.use_dynamic_create_class需要置为False
 class XPUTestArgsortOp2(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'argsort'
         self.use_dynamic_create_class = False
 
     class TestArgsortOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "argsort"
             self.place = paddle.XPUPlace(0)
@@ -119,9 +124,10 @@ def setUp(self):
             if self.in_type == np.float32:
                 self.x = np.random.random(self.input_shape).astype(self.dtype)
             else:
-                self.x = np.random.randint(
-                    low=-1000, high=1000,
-                    size=self.input_shape).astype(self.dtype)
+                self.x = np.random.randint(low=-1000,
+                                           high=1000,
+                                           size=self.input_shape).astype(
+                                               self.dtype)
             self.inputs = {"X": self.x}
             self.attrs = {"axis": self.axis, "descending": self.descending}
             self.get_output()
@@ -130,15 +136,14 @@ def setUp(self):
         def get_output(self):
             if self.descending:
                 self.indices = np.flip(
-                    np.argsort(
-                        self.x, kind='heapsort', axis=self.axis),
+                    np.argsort(self.x, kind='heapsort', axis=self.axis),
                     self.axis)
                 self.sorted_x = np.flip(
-                    np.sort(
-                        self.x, kind='heapsort', axis=self.axis), self.axis)
+                    np.sort(self.x, kind='heapsort', axis=self.axis), self.axis)
             else:
-                self.indices = np.argsort(
-                    self.x, kind='heapsort', axis=self.axis)
+                self.indices = np.argsort(self.x,
+                                          kind='heapsort',
+                                          axis=self.axis)
                 self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis)
 
         def init_inputshape(self):
@@ -157,46 +162,57 @@ def init_direction(self):
             self.descending = False
 
     class TestArgsortOpAxis0XPU(TestArgsortOp):
+
         def init_axis(self):
             self.axis = 0
 
     class TestArgsortOpAxis1XPU(TestArgsortOp):
+
         def init_axis(self):
             self.axis = 1
 
     class TestArgsortOpAxis2XPU(TestArgsortOp):
+
         def init_axis(self):
             self.axis = 2
 
     class TestArgsortOpAxisNeg1XPU(TestArgsortOp):
+
         def init_axis(self):
             self.axis = -1
 
     class TestArgsortOpAxisNeg2XPU(TestArgsortOp):
+
         def init_axis(self):
             self.axis = -2
 
     class TestArgsortOpDescendingAxisXPU(TestArgsortOp):
+
         def init_direction(self):
             self.descending = True
 
     class TestArgsortOpDescendingAxis0XPU(TestArgsortOpAxis0XPU):
+
         def init_direction(self):
             self.descending = True
 
     class TestArgsortOpDescendingAxis1XPU(TestArgsortOpAxis1XPU):
+
         def init_direction(self):
             self.descending = True
 
     class TestArgsortOpDescendingAxis2XPU(TestArgsortOpAxis2XPU):
+
         def init_direction(self):
             self.descending = True
 
     class TestArgsortOpDescendingAxisNeg1XPU(TestArgsortOpAxisNeg1XPU):
+
         def init_direction(self):
             self.descending = True
 
     class TestArgsortOpDescendingAxisNeg2XPU(TestArgsortOpAxisNeg2XPU):
+
         def init_direction(self):
             self.descending = True
 
@@ -208,11 +224,13 @@ def init_direction(self):
 
 
 class XPUTestHuberLossOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'huber_loss'
         self.use_dynamic_create_class = False
 
     class TestHuberLossOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = 'huber_loss'
             self.place = paddle.XPUPlace(0)
@@ -252,22 +270,27 @@ def test_check_grad_normal(self):
             self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
         def test_check_grad_ingore_x(self):
-            self.check_grad_with_place(
-                self.place, ['Y'], 'Out', no_grad_set=set("residual"))
+            self.check_grad_with_place(self.place, ['Y'],
+                                       'Out',
+                                       no_grad_set=set("residual"))
 
         def test_check_grad_ingore_y(self):
-            self.check_grad_with_place(
-                self.place, ['X'], 'Out', no_grad_set=set('residual'))
+            self.check_grad_with_place(self.place, ['X'],
+                                       'Out',
+                                       no_grad_set=set('residual'))
 
     class TestHuberLossOp1(TestHuberLossOp):
+
         def set_shape(self):
             return (640)
 
     class TestHuberLossOp2(TestHuberLossOp):
+
         def set_shape(self):
             return (10, 10)
 
     class TestHuberLossOp3(TestHuberLossOp):
+
         def set_shape(self):
             return (10, 10, 1)
 
@@ -275,11 +298,10 @@ def set_shape(self):
 support_types = get_xpu_op_support_types('huber_loss')
 for stype in support_types:
     create_test_class(globals(), XPUTestHuberLossOp, stype)
-    create_test_class(
-        globals(),
-        XPUTestHuberLossOp,
-        stype,
-        ignore_deivce_version=[core.XPUVersion.XPU1])
+    create_test_class(globals(),
+                      XPUTestHuberLossOp,
+                      stype,
+                      ignore_deivce_version=[core.XPUVersion.XPU1])
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
index 0b000fc924ac1..2f7300d22c896 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import sys
 import unittest
+
 sys.path.append("..")
 
 import paddle
@@ -28,12 +29,14 @@
 
 
 class XPUTestReshapeOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "reshape2"
         self.use_dynamic_create_class = False
 
     # situation 1: have shape( list, no tensor), no actual shape(Tensor)
     class TestReshapeOp(XPUOpTest):
+
         def setUp(self):
             self.init_data()
             self.op_type = "reshape2"
@@ -71,12 +74,14 @@ def test_check_grad(self):
                 self.check_grad_with_place(place, ["X"], "Out")
 
     class TestReshapeOpDimInfer1(TestReshapeOp):
+
         def init_data(self):
             self.ori_shape = (5, 25)
             self.new_shape = (5, -1, 5)
             self.infered_shape = (5, -1, 5)
 
     class TestReshapeOpDimInfer2(TestReshapeOp):
+
         def init_data(self):
             self.ori_shape = (10, 2, 6)
             self.new_shape = (10, 0, 3, -1)
@@ -84,6 +89,7 @@ def init_data(self):
 
     # situation 2: have shape(list, no tensor), have actual shape(Tensor)
     class TestReshapeOpWithInputShape(TestReshapeOp):
+
         def init_data(self):
             self.ori_shape = (6, 20)
             self.new_shape = (0, -1, 20)
@@ -92,8 +98,7 @@ def init_data(self):
         def init_test_input(self):
             self.inputs = {
                 "X": np.random.random(self.ori_shape).astype(self.dtype),
-                "Shape": np.array(
-                    self.actual_shape, dtype="int32")
+                "Shape": np.array(self.actual_shape, dtype="int32")
             }
 
         def init_test_output(self):
@@ -104,6 +109,7 @@ def init_test_output(self):
 
     # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
     class TestReshapeOp_attr_ShapeTensor(TestReshapeOp):
+
         def init_data(self):
             self.ori_shape = (4, 25)
             self.new_shape = (10, 10)
@@ -124,16 +130,18 @@ def init_test_input(self):
         def init_attrs(self):
             self.attrs = {'shape': self.shape, "use_xpu": True}
 
-    class TestReshapeOpDimInfer1_attr_ShapeTensor(
-            TestReshapeOp_attr_ShapeTensor):
+    class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor
+                                                  ):
+
         def init_data(self):
             self.ori_shape = (5, 20)
             self.new_shape = (5, -1, 20)
             self.infered_shape = (5, -1, 20)
             self.shape = (5, -1, -1)
 
-    class TestReshapeOpDimInfer2_attr_ShapeTensor(
-            TestReshapeOp_attr_ShapeTensor):
+    class TestReshapeOpDimInfer2_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor
+                                                  ):
+
         def init_data(self):
             self.ori_shape = (10, 2, 6)
             self.new_shape = (10, 0, 3, -1)
@@ -142,6 +150,7 @@ def init_data(self):
 
     # Situation 4: have shape(Tensor), no actual shape(Tensor)
     class TestReshapeOp_attr_OnlyShape(TestReshapeOp):
+
         def init_data(self):
             self.ori_shape = (4, 25)
             self.new_shape = (10, 10)
@@ -150,14 +159,14 @@ def init_data(self):
         def init_test_input(self):
             self.inputs = {
                 "X": np.random.random(self.ori_shape).astype(self.dtype),
-                "Shape": np.array(
-                    self.new_shape, dtype="int32")
+                "Shape": np.array(self.new_shape, dtype="int32")
             }
 
         def init_attrs(self):
             self.attrs = {"use_xpu": True}
 
     class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
+
         def init_data(self):
             self.ori_shape = (5, 20)
             self.new_shape = (5, -1, 10)
@@ -165,6 +174,7 @@ def init_data(self):
             self.shape = (5, -1, -1)
 
     class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
+
         def init_data(self):
             self.ori_shape = (10, 2, 6)
             self.new_shape = (10, 0, 3, -1)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
index a94a9d5541f61..2e8853de44a9a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 
 import unittest
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
index 84edbab1eac91..af8532fd96ab2 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
@@ -13,6 +13,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -25,6 +26,7 @@
 
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
+
 sys.path.append("../rnn")
 from rnn_numpy import SimpleRNN, LSTM, GRU
 from convert import get_params_for_net
@@ -36,11 +38,13 @@
 
 
 class XPUTestRNNOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'rnn'
         self.use_dynamic_create_class = False
 
     class TestRNNOp(XPUOpTest):
+
         def setUp(self):
             self.init_size()
             self.init_dtype()
@@ -57,28 +61,27 @@ def setUp(self):
             self.direction_num = 2 if self.is_bidirec else 1
             direction = "bidirectional" if self.is_bidirec else "forward"
 
-            input = np.random.uniform(
-                low=-0.1,
-                high=0.1,
-                size=(self.seq_length, self.batch_size,
-                      self.input_size)).astype(self.dtype)
+            input = np.random.uniform(low=-0.1,
+                                      high=0.1,
+                                      size=(self.seq_length, self.batch_size,
+                                            self.input_size)).astype(self.dtype)
             input[11][1:][:] = 0
             input[10][2:][:] = 0
             input[9][3:][:] = 0
             input[8][4:][:] = 0
 
-            rnn1 = LSTM(
-                self.input_size,
-                self.hidden_size,
-                num_layers=self.num_layers,
-                time_major=True,
-                direction=direction,
-                dropout=self.dropout,
-                dtype=self.dtype)
+            rnn1 = LSTM(self.input_size,
+                        self.hidden_size,
+                        num_layers=self.num_layers,
+                        time_major=True,
+                        direction=direction,
+                        dropout=self.dropout,
+                        dtype=self.dtype)
 
             flat_w = get_params_for_net(rnn1)
-            output, (last_hidden, last_cell) = rnn1(
-                input, sequence_length=self.sequence_length)
+            output, (last_hidden,
+                     last_cell) = rnn1(input,
+                                       sequence_length=self.sequence_length)
 
             init_h = np.zeros(
                 (self.num_layers * self.direction_num, self.batch_size,
@@ -111,8 +114,8 @@ def setUp(self):
             }
             self.outputs = {
                 'Out': output,
-                "State":
-                [('last_hidden', last_hidden), ('last_cell', last_cell)],
+                "State": [('last_hidden', last_hidden),
+                          ('last_cell', last_cell)],
                 'Reserve': np.ndarray((400)).astype("uint8"),
                 'DropoutState': state_out
             }
@@ -127,16 +130,14 @@ def set_xpu(self):
 
         def test_check_output(self):
             self.check_output_with_place(
-                self.place, atol=0.01,
-                no_check_set=['Reserve', 'DropoutState'])
+                self.place, atol=0.01, no_check_set=['Reserve', 'DropoutState'])
 
         def test_grad(self):
             if not self.is_test:
                 var_name_list = self.get_weight_names()
                 grad_check_list = ['Input', 'init_h', 'init_c']
                 grad_check_list.extend(var_name_list)
-                self.check_grad_with_place(self.place,
-                                           set(grad_check_list),
+                self.check_grad_with_place(self.place, set(grad_check_list),
                                            ['Out', 'last_hidden', 'last_cell'])
 
         def init_size(self):
@@ -159,36 +160,43 @@ def set_attrs(self):
             pass
 
     class TestRNNOp1(TestRNNOp):
+
         def set_attrs(self):
             self.sequence_length = None
 
     class TestRNNOp2(TestRNNOp):
+
         def set_attrs(self):
             self.num_layers = 1
             self.is_bidirec = True
 
     class TestRNNOp3(TestRNNOp):
+
         def set_attrs(self):
             self.num_layers = 2
             self.is_bidirec = False
 
     class TestRNNOp4(TestRNNOp):
+
         def set_attrs(self):
             self.num_layers = 3
             self.is_bidirec = False
 
     class TestRNNOp5(TestRNNOp):
+
         def set_attrs(self):
             self.num_layers = 2
             self.is_bidirec = True
 
     class TestRNNOp6(TestRNNOp):
+
         def set_attrs(self):
             self.num_layers = 2
             self.is_bidirec = True
             self.sequence_length = None
 
     class TestRNNOp7(TestRNNOp):
+
         def set_attrs(self):
             self.num_layers = 3
             self.is_bidirec = True
@@ -196,11 +204,10 @@ def set_attrs(self):
 
 support_types = get_xpu_op_support_types('rnn')
 for stype in support_types:
-    create_test_class(
-        globals(),
-        XPUTestRNNOp,
-        stype,
-        ignore_deivce_version=[core.XPUVersion.XPU1])
+    create_test_class(globals(),
+                      XPUTestRNNOp,
+                      stype,
+                      ignore_deivce_version=[core.XPUVersion.XPU1])
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py
index e80b1e4c50ef2..4c830b1e8729a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import math
@@ -27,6 +28,7 @@
 
 
 class TestROIAlignOp(XPUOpTest):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -73,8 +75,8 @@ def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
         bilinear_pos = np.zeros(
             [self.channels, self.pooled_height, self.pooled_width, count, 4],
             np.float32)
-        bilinear_w = np.zeros(
-            [self.pooled_height, self.pooled_width, count, 4], np.float32)
+        bilinear_w = np.zeros([self.pooled_height, self.pooled_width, count, 4],
+                              np.float32)
         for ph in range(self.pooled_width):
             for pw in range(self.pooled_height):
                 c = 0
@@ -196,6 +198,7 @@ def test_check_grad(self):
 
 
 class TestROIAlignInLodOp(TestROIAlignOp):
+
     def set_data(self):
         self.init_test_case()
         self.make_rois()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
index b27eefb6a166f..83642fa542056 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -31,11 +32,13 @@
 
 
 class XPUTestScaleOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'scale'
         self.use_dynamic_create_class = False
 
     class TestScaleOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.set_xpu()
@@ -72,27 +75,33 @@ def test_check_output(self):
                 self.check_output_with_place(place)
 
     class TestScaleOp1(TestScaleOp):
+
         def set_attrs(self):
             self.attrs = {'scale': 3.5}
 
     class TestScaleOp2(TestScaleOp):
+
         def set_attrs(self):
             self.attrs = {'scale': 6.77}
 
     class TestScaleOp3(TestScaleOp):
+
         def set_attrs(self):
             self.attrs = {'scale': -9.19}
 
     class TestScaleOp4(TestScaleOp):
+
         def set_attrs(self):
             self.attrs = {'scale': 0.0}
 
     class TestScaleOp5(TestScaleOp):
+
         def set_attrs(self):
             self.attrs = {'scale': -0.003}
 
 
 class TestScaleApiStatic(unittest.TestCase):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return paddle.scale(x, scale, bias)
 
@@ -110,11 +119,13 @@ def test_api(self):
 
 
 class TestScaleInplaceApiStatic(TestScaleApiStatic):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return x.scale_(scale, bias)
 
 
 class TestScaleApiDygraph(unittest.TestCase):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return paddle.scale(x, scale, bias)
 
@@ -128,6 +139,7 @@ def test_api(self):
 
 
 class TestScaleInplaceApiDygraph(TestScaleApiDygraph):
+
     def _executed_api(self, x, scale=1.0, bias=0.0):
         return x.scale_(scale, bias)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py
index 68a39f3c00100..9331ad73a67c0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -27,6 +28,7 @@
 
 
 class XPUTestScatterOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'scatter'
         self.use_dynamic_create_class = True
@@ -97,6 +99,7 @@ def dynamic_create_class(self):
         return base_class, classes
 
     class TestScatterOp(XPUOpTest):
+
         def setUp(self):
             self.init_config()
             self.index_type = np.int32 if not hasattr(
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
index 9999217041859..17abd1842f4bd 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
@@ -19,6 +19,7 @@
 import paddle
 import random
 import sys
+
 sys.path.append("../")
 from op_test_xpu import XPUOpTest
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types
@@ -52,8 +53,8 @@ def seqconv(x,
                     [offset[i] - in_begin, offset[i + 1] - offset[i]])
                 if padding_trainable:
                     sub_w = padding_data[j:j + pad_size, :]
-                    col[offset[i]:offset[i] + pad_size, j * M:(j + 1) *
-                        M] = sub_w
+                    col[offset[i]:offset[i] + pad_size,
+                        j * M:(j + 1) * M] = sub_w
                 out_begin = offset[i] + pad_size
                 in_begin = offset[i]
 
@@ -64,8 +65,8 @@ def seqconv(x,
                     sub_w = padding_data[begin_pad + context_start + j -
                                          pad_size:begin_pad + context_start +
                                          j, :]
-                    col[offset[i + 1] - pad_size:offset[i + 1], j * M:(j + 1) *
-                        M] = sub_w
+                    col[offset[i + 1] - pad_size:offset[i + 1],
+                        j * M:(j + 1) * M] = sub_w
                 in_end = offset[i + 1]
                 out_end = offset[i + 1] - pad_size
             if in_end <= in_begin:
@@ -76,10 +77,12 @@ def seqconv(x,
 
 
 class XPUTestSequenceConv(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'sequence_conv'
 
     class TestSeqProject(XPUOpTest):
+
         def setUp(self):
             self.init_test_case()
             self.op_type = 'sequence_conv'
@@ -95,9 +98,9 @@ def setUp(self):
                 return
 
             # one level, batch size
-            x = np.random.uniform(-6.10907e-05, 0.000104218,
-                                  [self.input_size[0],
-                                   self.input_size[1]]).astype(self.dtype)
+            x = np.random.uniform(
+                -6.10907e-05, 0.000104218,
+                [self.input_size[0], self.input_size[1]]).astype(self.dtype)
             w = np.random.uniform(-3.17068e-05, 0.000159822, [
                 self.context_length * self.input_size[1],
                 self.output_represention
@@ -143,27 +146,32 @@ def test_check_grad_input(self):
 
         def test_check_grad_padding_data(self):
             if self.padding_trainable:
-                self.check_grad(
-                    ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter']))
+                self.check_grad(['PaddingData'],
+                                'Out',
+                                no_grad_set=set(['X', 'Filter']))
 
         def test_check_grad_Filter(self):
-            self.check_grad(
-                ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f))
+            self.check_grad(['Filter'],
+                            'Out',
+                            no_grad_set=set(self.inputs_val_no_f))
 
         def test_check_grad_input_filter(self):
             if self.padding_trainable:
-                self.check_grad(
-                    ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData']))
+                self.check_grad(['X', 'Filter'],
+                                'Out',
+                                no_grad_set=set(['PaddingData']))
 
         def test_check_grad_padding_input(self):
             if self.padding_trainable:
-                self.check_grad(
-                    self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter']))
+                self.check_grad(self.inputs_val_no_f,
+                                'Out',
+                                no_grad_set=set(['Filter']))
 
         def test_check_grad_padding_filter(self):
             if self.padding_trainable:
-                self.check_grad(
-                    self.inputs_val_no_x, 'Out', no_grad_set=set(['X']))
+                self.check_grad(self.inputs_val_no_x,
+                                'Out',
+                                no_grad_set=set(['X']))
 
         def init_test_case(self):
             self.input_row = 7
@@ -182,6 +190,7 @@ def init_test_case(self):
             self.output_represention = 8  # output feature size
 
     class TestSeqProjectCase1(TestSeqProject):
+
         def init_test_case(self):
             self.input_row = 11
             self.context_start = -2
@@ -198,6 +207,7 @@ def init_test_case(self):
             self.output_represention = 8  # output feature size
 
     class TestSeqProjectCase2Len0(TestSeqProject):
+
         def init_test_case(self):
             self.input_row = 11
             self.context_start = -2
@@ -214,6 +224,7 @@ def init_test_case(self):
             self.output_represention = 8  # output feature size
 
     class TestSeqProjectCase3(TestSeqProject):
+
         def init_test_case(self):
             self.input_row = 25
             self.context_start = -2
@@ -233,6 +244,7 @@ def init_test_case(self):
             self.output_represention = 8  # output feature size
 
     class TestSeqProjectCase4(TestSeqProject):
+
         def init_test_case(self):
             self.input_row = 7835
             self.input_col = 128
@@ -270,12 +282,15 @@ def init_test_case(self):
 
 
 class TestSeqConvApi(unittest.TestCase):
+
     def test_api(self):
         import paddle.fluid as fluid
 
         x = fluid.layers.data('x', shape=[32], lod_level=1)
-        y = fluid.layers.sequence_conv(
-            input=x, num_filters=2, filter_size=3, padding_start=None)
+        y = fluid.layers.sequence_conv(input=x,
+                                       num_filters=2,
+                                       filter_size=3,
+                                       padding_start=None)
 
         place = fluid.CPUPlace()
         x_tensor = fluid.create_lod_tensor(
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
index 67fd9f871207b..e174d24533215 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
@@ -18,6 +18,7 @@
 import numpy as np
 import sys
 import os
+
 sys.path.append("..")
 from op_test import OpTest
 import paddle
@@ -30,11 +31,13 @@
 
 
 class XPUTestSgdOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'sgd'
         self.use_dynamic_create_class = False
 
     class TestSGDOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "sgd"
             self.dtype = self.in_type
@@ -54,6 +57,7 @@ def test_check_output_with_place(self):
             self.check_output_with_place(paddle.XPUPlace(0))
 
     class TestSGDOpCase8X(TestSGDOp):
+
         def conf(self):
             self.h = 10
             self.w = 64
@@ -65,10 +69,12 @@ def conf(self):
 
 
 class TestSGDOpWithLargeInput(unittest.TestCase):
+
     def runTest(self):
         data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64')
-        label = fluid.layers.fill_constant(
-            shape=[1, 150], value=0.5, dtype='float32')
+        label = fluid.layers.fill_constant(shape=[1, 150],
+                                           value=0.5,
+                                           dtype='float32')
         emb = fluid.embedding(input=data, size=(10000, 150), dtype='float32')
         out = fluid.layers.l2_normalize(x=emb, axis=-1)
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
index c7fa72ca7700e..23eb66f0ed096 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
@@ -28,11 +29,13 @@
 
 
 class XPUTestShapeOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "shape"
         self.use_dynamic_create_class = False
 
     class TestShapeOp(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.op_type = "shape"
@@ -50,26 +53,32 @@ def test_check_output(self):
                 self.check_output_with_place(place)
 
     class TestShapeOp1(TestShapeOp):
+
         def config(self):
             self.shape = [2]
 
     class TestShapeOp2(TestShapeOp):
+
         def config(self):
             self.shape = [1, 2, 3]
 
     class TestShapeOp3(TestShapeOp):
+
         def config(self):
             self.shape = [1, 2, 3, 4]
 
     class TestShapeOp4(TestShapeOp):
+
         def config(self):
             self.shape = [1, 2, 3, 4, 1024]
 
     class TestShapeOp5(TestShapeOp):
+
         def config(self):
             self.shape = [1, 2, 3, 4, 1, 201]
 
     class TestShapeWithSelectedRows(unittest.TestCase):
+
         def setUp(self):
             self.dtype = self.in_type
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
index 1aac42f2d63a1..accd489d59679 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test_xpu import OpTest, XPUOpTest
 import paddle
@@ -43,6 +44,7 @@ def __init__(self):
         self.use_dynamic_create_class = False
 
     class TestSigmoidCrossEntropyWithLogitsOp(XPUOpTest):
+
         def setUp(self):
             self.set_xpu()
             self.op_type = "sigmoid_cross_entropy_with_logits"
@@ -65,11 +67,13 @@ def set_inputs(self):
             batch_size = 64
             num_classes = 20
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, (batch_size, num_classes))
-                    .astype(self.dtype)),
-                'Label': np.random.randint(0, 2, (batch_size, num_classes))
-                .astype(self.dtype)
+                'X':
+                logit(
+                    np.random.uniform(0, 1, (batch_size, num_classes)).astype(
+                        self.dtype)),
+                'Label':
+                np.random.randint(0, 2,
+                                  (batch_size, num_classes)).astype(self.dtype)
             }
             self.attrs = {'num_classes': num_classes, 'batch_size': batch_size}
 
@@ -98,11 +102,13 @@ def set_inputs(self):
             ignore_index = -1
             self.ignore_index = ignore_index
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, (batch_size, num_classes))
-                    .astype(self.dtype)),
-                'Label': np.random.randint(-1, 2, (batch_size, num_classes))
-                .astype(self.dtype)
+                'X':
+                logit(
+                    np.random.uniform(0, 1, (batch_size, num_classes)).astype(
+                        self.dtype)),
+                'Label':
+                np.random.randint(-1, 2,
+                                  (batch_size, num_classes)).astype(self.dtype)
             }
             self.attrs = {'ignore_index': ignore_index}
 
@@ -126,11 +132,13 @@ def set_inputs(self):
             batch_size = 64
             num_classes = 20
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, (batch_size, num_classes))
-                    .astype(self.dtype)),
-                'Label': np.random.uniform(0, 1, (batch_size, num_classes))
-                .astype(self.dtype)
+                'X':
+                logit(
+                    np.random.uniform(0, 1, (batch_size, num_classes)).astype(
+                        self.dtype)),
+                'Label':
+                np.random.uniform(0, 1,
+                                  (batch_size, num_classes)).astype(self.dtype)
             }
             self.attrs = {'num_classes': num_classes, 'batch_size': batch_size}
 
@@ -154,11 +162,13 @@ def set_inputs(self):
             ignore_index = -1
             self.ignore_index = ignore_index
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, (batch_size, num_classes))
-                    .astype(self.dtype)),
-                'Label': np.random.randint(-1, 2, (batch_size, num_classes))
-                .astype(self.dtype)
+                'X':
+                logit(
+                    np.random.uniform(0, 1, (batch_size, num_classes)).astype(
+                        self.dtype)),
+                'Label':
+                np.random.randint(-1, 2,
+                                  (batch_size, num_classes)).astype(self.dtype)
             }
             self.attrs = {'ignore_index': ignore_index, 'normalize': True}
 
@@ -185,12 +195,14 @@ def set_inputs(self):
             batch_size = [10, 10]
             num_classes = 20
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                    .astype(self.dtype)),
+                'X':
+                logit(
+                    np.random.uniform(0, 1,
+                                      tuple(batch_size + [num_classes])).astype(
+                                          self.dtype)),
                 'Label':
-                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                .astype(self.dtype)
+                np.random.uniform(0, 1, tuple(batch_size +
+                                              [num_classes])).astype(self.dtype)
             }
             self.attrs = {'num_classes': num_classes, 'batch_size': batch_size}
 
@@ -212,12 +224,14 @@ def set_inputs(self):
             batch_size = [10, 10]
             num_classes = 20
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                    .astype(self.dtype)),
+                'X':
+                logit(
+                    np.random.uniform(0, 1,
+                                      tuple(batch_size + [num_classes])).astype(
+                                          self.dtype)),
                 'Label':
-                np.random.randint(0, 2, tuple(batch_size + [num_classes]))
-                .astype(self.dtype)
+                np.random.randint(0, 2, tuple(batch_size +
+                                              [num_classes])).astype(self.dtype)
             }
             self.attrs = {'num_classes': num_classes, 'batch_size': batch_size}
 
@@ -241,12 +255,14 @@ def set_inputs(self):
             ignore_index = -1
             self.ignore_index = ignore_index
             self.inputs = {
-                'X': logit(
-                    np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
-                    .astype(self.dtype)),
+                'X':
+                logit(
+                    np.random.uniform(0, 1,
+                                      tuple(batch_size + [num_classes])).astype(
+                                          self.dtype)),
                 'Label':
-                np.random.randint(-1, 2, tuple(batch_size + [num_classes]))
-                .astype(self.dtype)
+                np.random.randint(
+                    -1, 2, tuple(batch_size + [num_classes])).astype(self.dtype)
             }
             self.attrs = {'ignore_index': ignore_index, 'normalize': True}
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
index 9254a84ec4217..c00e0b5217a6c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -29,11 +30,13 @@
 
 
 class XPUTestSignOP(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'sign'
         self.use_dynamic_create_class = False
 
     class TestSignOPBase(XPUOpTest):
+
         def setUp(self):
             self.place = paddle.XPUPlace(0)
             self.init_dtype()
@@ -62,18 +65,22 @@ def init_config(self):
             self.input_shape = [864]
 
     class XPUTestSign1(TestSignOPBase):
+
         def init_config(self):
             self.input_shape = [2, 768]
 
     class XPUTestSign2(TestSignOPBase):
+
         def init_config(self):
             self.input_shape = [3, 8, 4096]
 
     class XPUTestSign3(TestSignOPBase):
+
         def init_config(self):
             self.input_shape = [1024]
 
     class XPUTestSign4(TestSignOPBase):
+
         def init_config(self):
             self.input_shape = [2, 2, 255]
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
index 3d7c9959db9ea..3482330173792 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
@@ -16,6 +16,7 @@
 import numpy as np
 import sys
 import unittest
+
 sys.path.append("..")
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
@@ -27,11 +28,13 @@
 # Situation 1: starts(list, no tensor), ends(list, no tensor)
 # 1.1 without attr(decrease)
 class XPUTestSliceOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'slice'
         self.use_dynamic_create_class = False
 
     class TestSliceOp(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.place = paddle.XPUPlace(0)
@@ -67,6 +70,7 @@ def test_check_grad_normal(self):
                     user_defined_grad_outputs=user_defined_grad_outputs)
 
     class TestCase1(TestSliceOp):
+
         def config(self):
             self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
             self.starts = [-3, 0, 2]
@@ -76,6 +80,7 @@ def config(self):
             self.out = self.input[-3:3, 0:100, 2:-1, :]
 
     class TestCase2(TestSliceOp):
+
         def config(self):
             self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
             self.starts = [-3, 0, 2]
@@ -87,11 +92,13 @@ def config(self):
 
 # 1.2 with attr(decrease)
 class XPUTestSliceOp_decs_dim(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'slice'
         self.use_dynamic_create_class = False
 
     class TestSliceOp_decs_dim(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.place = paddle.XPUPlace(0)
@@ -132,6 +139,7 @@ def test_check_grad_normal(self):
                     user_defined_grad_outputs=user_defined_grad_outputs)
 
     class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
+
         def config(self):
             self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
             self.starts = [1, 0, 2]
@@ -142,6 +150,7 @@ def config(self):
             self.out = self.input[1, 0, 2:4, :]
 
     class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
+
         def config(self):
             self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
             self.starts = [-1, 0, 2]
@@ -152,6 +161,7 @@ def config(self):
             self.out = self.input[-1, 0, 2:4, :]
 
     class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
+
         def config(self):
             self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype)
             self.starts = [0, 1, 2, 3]
@@ -162,6 +172,7 @@ def config(self):
             self.out = self.input[0, 1, 2, 3:4]
 
     class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
+
         def config(self):
             self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
             self.starts = [-1]
@@ -172,6 +183,7 @@ def config(self):
             self.out = self.input[:, :, :, -1]
 
     class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
+
         def config(self):
             self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
             self.starts = [0, 1, 2, 3]
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
index aa56a463b901e..a4997c91ffbf5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
@@ -16,6 +16,7 @@
 import numpy as np
 import sys
 import unittest
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
@@ -43,6 +44,7 @@ def ref_softmax(x, axis=None, dtype=None):
 
 
 class XPUTestSoftmaxOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'softmax'
         self.use_dynamic_create_class = True
@@ -61,6 +63,7 @@ def dynamic_create_class(self):
         return base_class, classes
 
     class TestSoftmaxOp(XPUOpTest):
+
         def setUp(self):
             self.op_type = "softmax"
             if not hasattr(self, 'shape'):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py
index 59907fe9f6858..661f11704187d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 
 from test_softmax_op import stable_softmax
@@ -47,6 +48,7 @@ def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1):
 
 
 class XPUTestSoftmaxWithCrossEntropyOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'softmax_with_cross_entropy'
         self.use_dynamic_create_class = True
@@ -106,8 +108,10 @@ def setUp(self):
             else:
                 axis_dim = self.shape[self.axis]
                 self.shape[self.axis] = 1
-                labels = np.random.randint(
-                    0, axis_dim, self.shape, dtype="int64")
+                labels = np.random.randint(0,
+                                           axis_dim,
+                                           self.shape,
+                                           dtype="int64")
 
             loss = cross_entropy(softmax, labels, self.soft_label, self.axis,
                                  self.ignore_index)
@@ -136,8 +140,9 @@ def test_check_grad(self):
             if paddle.is_compiled_with_xpu():
                 paddle.enable_static()
                 place = paddle.XPUPlace(0)
-                self.check_grad_with_place(
-                    place, ["Logits"], "Loss", max_relative_error=0.2)
+                self.check_grad_with_place(place, ["Logits"],
+                                           "Loss",
+                                           max_relative_error=0.2)
 
 
 support_types = get_xpu_op_support_types('softmax_with_cross_entropy')
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py
index 1b8bf64a0de3d..a27d94e739932 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -30,12 +31,14 @@
 
 
 class XPUTestSplitOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'split'
         self.use_dynamic_create_class = False
 
     # test with attr(num)
     class TestSplitOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.__class__.use_xpu = True
@@ -68,6 +71,7 @@ def test_check_output(self):
 
     # unknown sections
     class TestSplitOp1(TestSplitOp):
+
         def initParameters(self):
             self.x = np.random.random((4, 5, 6)).astype(self.dtype)
             self.axis = 2
@@ -77,6 +81,7 @@ def initParameters(self):
 
     # test with int32
     class TestSplitOp2(TestSplitOp):
+
         def initParameters(self):
             self.x = np.random.random((4, 5, 6)).astype(np.int32)
             self.axis = 2
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py
index 705e7c4cb0fef..cdc67cf464752 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import numpy as np
@@ -28,11 +29,13 @@
 
 
 class XPUTestSqueeze2Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "squeeze2"
         self.use_dynamic_create_class = False
 
     class TestSqueeze2Op(XPUOpTest):
+
         def setUp(self):
             self.op_type = "squeeze2"
             self.use_mkldnn = False
@@ -78,6 +81,7 @@ def test_check_grad(self):
 
     # Correct: There is mins axis.
     class TestSqueeze2Op1(TestSqueeze2Op):
+
         def init_test_case(self):
             self.ori_shape = (1, 20, 1, 5)
             self.axes = (0, -2)
@@ -85,13 +89,15 @@ def init_test_case(self):
 
     # Correct: No axes input.
     class TestSqueeze2Op2(TestSqueeze2Op):
+
         def init_test_case(self):
             self.ori_shape = (1, 20, 1, 5)
             self.axes = ()
             self.new_shape = (20, 5)
 
-    # Correct: Just part of axes be squeezed. 
+    # Correct: Just part of axes be squeezed.
     class TestSqueeze2Op3(TestSqueeze2Op):
+
         def init_test_case(self):
             self.ori_shape = (6, 1, 5, 1, 4, 1)
             self.axes = (1, -1)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py
index de701bfc513e3..b766b6e3c002f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import numpy as np
@@ -30,6 +31,7 @@
 
 # Correct: General.
 class TestSqueezeOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "squeeze"
         self.use_xpu = True
@@ -37,7 +39,9 @@ def setUp(self):
         self.init_test_case()
         self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
         self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape), }
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+        }
 
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
@@ -60,6 +64,7 @@ def init_attrs(self):
 
 # Correct: There is mins axis.
 class TestSqueezeOp1(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 3, 1, 40)
         self.axes = (0, -2)
@@ -68,14 +73,16 @@ def init_test_case(self):
 
 # Correct: No axes input.
 class TestSqueezeOp2(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (1, 20, 1, 5)
         self.axes = ()
         self.new_shape = (20, 5)
 
 
-# Correct: Just part of axes be squeezed. 
+# Correct: Just part of axes be squeezed.
 class TestSqueezeOp3(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, -1)
@@ -84,6 +91,7 @@ def init_test_case(self):
 
 # Correct: The demension of axis is not of size 1 remains unchanged.
 class TestSqueezeOp4(TestSqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (6, 1, 5, 1, 4, 1)
         self.axes = (1, 2)
@@ -91,12 +99,13 @@ def init_test_case(self):
 
 
 class TestSqueezeOpError(unittest.TestCase):
+
     def test_errors(self):
         paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input type of softmax_op must be Variable.
-            x1 = fluid.create_lod_tensor(
-                np.array([[-1]]), [[1]], paddle.XPUPlace(0))
+            x1 = fluid.create_lod_tensor(np.array([[-1]]), [[1]],
+                                         paddle.XPUPlace(0))
             self.assertRaises(TypeError, paddle.squeeze, x1)
             # The input axes of squeeze must be list.
             x2 = paddle.static.data(name='x2', shape=[4], dtype="int32")
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
index 86126f976ab1a..b6d547e7059fc 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -30,12 +31,14 @@
 
 
 class XPUTestStackOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'stack'
         self.use_dynamic_create_class = False
 
     @skip_check_grad_ci(reason="There is no grad kernel for stack_xpu op.")
     class TestStackOp(XPUOpTest):
+
         def initDefaultParameters(self):
             self.num_inputs = 4
             self.input_dim = (5, 6, 7)
@@ -80,18 +83,21 @@ def test_check_grad(self):
             if self.dtype == np.int32 or self.dtype == np.int64:
                 pass
             else:
-                self.check_grad_with_place(
-                    paddle.XPUPlace(0), self.get_x_names(), 'Y')
+                self.check_grad_with_place(paddle.XPUPlace(0),
+                                           self.get_x_names(), 'Y')
 
     class TestStackOp1(TestStackOp):
+
         def initParameters(self):
             self.num_inputs = 16
 
     class TestStackOp2(TestStackOp):
+
         def initParameters(self):
             self.num_inputs = 30
 
     class TestStackOp3(TestStackOp):
+
         def initParameters(self):
             self.axis = -1
 
@@ -99,6 +105,7 @@ def test_check_grad(self):
             pass
 
     class TestStackOp4(TestStackOp):
+
         def initParameters(self):
             self.axis = -4
 
@@ -106,14 +113,17 @@ def test_check_grad(self):
             pass
 
     class TestStackOp5(TestStackOp):
+
         def initParameters(self):
             self.axis = 1
 
     class TestStackOp6(TestStackOp):
+
         def initParameters(self):
             self.axis = 3
 
     class TestStackOp7(TestStackOp):
+
         def initParameters(self):
             self.num_inputs = 4
             self.input_dim = (5, 6, 7)
@@ -124,6 +134,7 @@ def test_check_grad(self):
             pass
 
     class TestStackOp8(TestStackOp):
+
         def initParameters(self):
             self.num_inputs = 4
             self.input_dim = (5, 6, 7)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
index 8ab556efd4241..61f7bcda08c94 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy as np
@@ -23,14 +24,16 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
-from paddle.fluid.tests.unittests.op_test import (
-    OpTest, convert_float_to_uint16, convert_uint16_to_float)
+from paddle.fluid.tests.unittests.op_test import (OpTest,
+                                                  convert_float_to_uint16,
+                                                  convert_uint16_to_float)
 from paddle import _C_ops
 
 paddle.enable_static()
 
 
 class TestSumOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "sum"
         self.init_kernel_type()
@@ -54,6 +57,7 @@ def test_check_grad(self):
 
 #----------- test fp16 -----------
 class TestFP16SumOp(TestSumOp):
+
     def init_kernel_type(self):
         self.dtype = np.float16
 
@@ -67,12 +71,15 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.XPUPlace(0)
         # if core.is_float16_supported(place):
-        self.check_grad_with_place(
-            place, ['x0'], 'Out', max_relative_error=0.15)
+        self.check_grad_with_place(place, ['x0'],
+                                   'Out',
+                                   max_relative_error=0.15)
 
 
 def create_test_sum_fp16_class(parent):
+
     class TestSumFp16Case(parent):
+
         def init_kernel_type(self):
             self.dtype = np.float16
 
@@ -88,12 +95,15 @@ def test_w_is_selected_rows(self):
 
 
 class API_Test_Add_n(unittest.TestCase):
+
     def test_api(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
-            input0 = fluid.layers.fill_constant(
-                shape=[2, 3], dtype='int64', value=5)
-            input1 = fluid.layers.fill_constant(
-                shape=[2, 3], dtype='int64', value=3)
+            input0 = fluid.layers.fill_constant(shape=[2, 3],
+                                                dtype='int64',
+                                                value=5)
+            input1 = fluid.layers.fill_constant(shape=[2, 3],
+                                                dtype='int64',
+                                                value=3)
             expected_result = np.empty((2, 3))
             expected_result.fill(8)
             sum_value = paddle.add_n([input0, input1])
@@ -112,7 +122,9 @@ def test_api(self):
 
 
 class TestRaiseSumError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.sum([11, 22])
 
@@ -133,7 +145,9 @@ def test_dtype1():
 
 
 class TestRaiseSumsError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.sums([11, 22])
 
@@ -169,7 +183,9 @@ def test_out_dtype():
 
 
 class TestSumOpError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_empty_list_input():
             with fluid.dygraph.guard():
                 fluid._C_ops.sum([])
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
index cd18bd63a88f7..163c5628e74a2 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test import OpTest
 from op_test_xpu import XPUOpTest
@@ -32,11 +33,13 @@
 
 #Situation 1: repeat_times is a list (without tensor)
 class XPUTestTileOpRank1(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'tile'
         self.use_dynamic_create_class = False
 
     class TestTileOpRank1(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.__class__.no_need_check_grad = True
@@ -59,31 +62,37 @@ def test_check_output(self):
 
     #with dimension expanding
     class TestTileOpRank2Expanding(TestTileOpRank1):
+
         def init_data(self):
             self.ori_shape = [120]
             self.repeat_times = [2, 2]
 
     class TestTileOpRank2(TestTileOpRank1):
+
         def init_data(self):
             self.ori_shape = [12, 14]
             self.repeat_times = [2, 3]
 
     class TestTileOpRank3_Corner(TestTileOpRank1):
+
         def init_data(self):
             self.ori_shape = (2, 10, 5)
             self.repeat_times = (1, 1, 1)
 
     class TestTileOpRank3_Corner2(TestTileOpRank1):
+
         def init_data(self):
             self.ori_shape = (2, 10, 5)
             self.repeat_times = (2, 2)
 
     class TestTileOpRank3(TestTileOpRank1):
+
         def init_data(self):
             self.ori_shape = (2, 4, 15)
             self.repeat_times = (2, 1, 4)
 
     class TestTileOpRank4(TestTileOpRank1):
+
         def init_data(self):
             self.ori_shape = (2, 4, 5, 7)
             self.repeat_times = (3, 2, 1, 2)
@@ -91,11 +100,13 @@ def init_data(self):
 
 # Situation 2: repeat_times is a list (with tensor)
 class XPUTestTileOpRank1_tensor_attr(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'tile'
         self.use_dynamic_create_class = False
 
     class TestTileOpRank1_tensor_attr(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.__class__.no_need_check_grad = True
@@ -124,12 +135,14 @@ def test_check_output(self):
             self.check_output_with_place(self.place)
 
     class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
+
         def init_data(self):
             self.ori_shape = [12, 14]
             self.repeat_times = [1, 1]
             self.infer_repeat_times = [1, -1]
 
     class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
+
         def init_data(self):
             self.ori_shape = [12, 14]
             self.repeat_times = [2, 3]
@@ -138,11 +151,13 @@ def init_data(self):
 
 # Situation 3: repeat_times is a tensor
 class XPUTestTileOpRank1_tensor(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'tile'
         self.use_dynamic_create_class = False
 
     class TestTileOpRank1_tensor(XPUOpTest):
+
         def setUp(self):
             self.dtype = self.in_type
             self.__class__.no_need_check_grad = True
@@ -166,6 +181,7 @@ def test_check_output(self):
             self.check_output_with_place(self.place)
 
     class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
+
         def init_data(self):
             self.ori_shape = [12, 14]
             self.repeat_times = [2, 3]
@@ -180,6 +196,7 @@ def init_data(self):
 
 # Test python API
 class TestTileAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard(paddle.XPUPlace(0)):
             np_x = np.random.random([12, 14]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
index c4418bd55c10a..1fa4a5e8b7d00 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
@@ -16,6 +16,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from paddle.fluid.op import Operator
 import paddle.fluid.core as core
@@ -29,6 +30,7 @@
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestTopkOp(OpTest):
+
     def setUp(self):
         self.variable_k = False
         self.use_xpu = True
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
index 71895db4ae9bf..0a3bd54a593d1 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 import sys
+
 sys.path.append("..")
 from op_test_xpu import XPUOpTest
 import paddle
@@ -43,11 +44,13 @@ def numpy_topk(x, k=1, axis=-1, largest=True):
 
 
 class XPUTestTopKV2Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'top_k_v2'
         self.use_dynamic_create_class = False
 
     class TestTopkOp(XPUOpTest):
+
         def init_args(self):
             self.k = 3
             self.axis = 1
@@ -64,8 +67,10 @@ def setUp(self):
                 'axis': self.axis,
                 'largest': self.largest
             }
-            output, indices = numpy_topk(
-                self.input_data, axis=self.axis, k=self.k, largest=self.largest)
+            output, indices = numpy_topk(self.input_data,
+                                         axis=self.axis,
+                                         k=self.k,
+                                         largest=self.largest)
             self.outputs = {'Out': output, 'Indices': indices}
 
         def test_check_output(self):
@@ -79,6 +84,7 @@ def test_check_grad(self):
                 self.check_grad(set(['X']), 'Out')
 
     class TestTopkOp1(TestTopkOp):
+
         def init_args(self):
             self.k = 3
             self.axis = 1
@@ -86,6 +92,7 @@ def init_args(self):
             self.input_data = np.random.rand(100, 155).astype(self.dtype)
 
     class TestTopkOp2(TestTopkOp):
+
         def init_args(self):
             self.k = 3
             self.axis = 1
@@ -93,6 +100,7 @@ def init_args(self):
             self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
 
     class TestTopkOp3(TestTopkOp):
+
         def init_args(self):
             self.k = 5
             self.axis = 1
@@ -100,6 +108,7 @@ def init_args(self):
             self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
 
     class TestTopkOp4(TestTopkOp):
+
         def init_args(self):
             self.k = 1
             self.axis = 1
@@ -107,6 +116,7 @@ def init_args(self):
             self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
 
     class TestTopkOp5(TestTopkOp):
+
         def init_args(self):
             self.k = 3
             self.axis = 2
@@ -114,6 +124,7 @@ def init_args(self):
             self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
 
     class TestTopkOp6(TestTopkOp):
+
         def init_args(self):
             self.k = 5
             self.axis = 1
@@ -121,6 +132,7 @@ def init_args(self):
             self.input_data = np.random.rand(8, 32, 64).astype(self.dtype)
 
     class TestTopkOp7(TestTopkOp):
+
         def init_args(self):
             self.k = 10
             self.axis = 2
@@ -128,6 +140,7 @@ def init_args(self):
             self.input_data = np.random.rand(8, 5, 10, 16).astype(self.dtype)
 
     class TestTopkOp8(TestTopkOp):
+
         def init_args(self):
             self.k = 1
             self.axis = 1
@@ -135,6 +148,7 @@ def init_args(self):
             self.input_data = np.random.rand(8, 32, 64).astype(self.dtype)
 
     class TestTopkOp9(TestTopkOp):
+
         def init_args(self):
             self.k = 3
             self.axis = 1
@@ -142,6 +156,7 @@ def init_args(self):
             self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
 
     class TestTopkOp10(TestTopkOp):
+
         def init_args(self):
             self.k = 3
             self.axis = 1
@@ -149,6 +164,7 @@ def init_args(self):
             self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
 
     class TestTopkOp11(TestTopkOp):
+
         def init_args(self):
             self.k = 5
             self.axis = 1
@@ -156,6 +172,7 @@ def init_args(self):
             self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
 
     class TestTopkOp12(TestTopkOp):
+
         def init_args(self):
             self.k = 1
             self.axis = 1
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py
index 41df4481e2d40..b3a1a636e8adf 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py
@@ -27,6 +27,7 @@
 
 
 class TestXPUTransposeOp(XPUOpTest):
+
     def setUp(self):
         self.init_op_type()
         self.initTestCase()
@@ -65,60 +66,70 @@ def initTestCase(self):
 
 
 class TestCase0(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (100, )
         self.axis = (0, )
 
 
 class TestCase1(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (3, 4, 10)
         self.axis = (0, 2, 1)
 
 
 class TestCase2(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5)
         self.axis = (0, 2, 3, 1)
 
 
 class TestCase3(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6)
         self.axis = (4, 2, 3, 1, 0)
 
 
 class TestCase4(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 4, 5, 6, 1)
         self.axis = (4, 2, 3, 1, 0, 5)
 
 
 class TestCase5(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 16, 96)
         self.axis = (0, 2, 1)
 
 
 class TestCase6(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 10, 12, 16)
         self.axis = (3, 1, 2, 0)
 
 
 class TestCase7(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 10, 2, 16)
         self.axis = (0, 1, 3, 2)
 
 
 class TestCase8(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (0, 1, 3, 2, 4, 5, 6, 7)
 
 
 class TestCase9(TestXPUTransposeOp):
+
     def initTestCase(self):
         self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
         self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
index ee689efbb38a0..28fff5981b7c7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
@@ -13,6 +13,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -30,11 +31,13 @@
 
 
 class XPUTestTrilTriuOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'tril_triu'
         self.use_dynamic_create_class = False
 
     class TestTrilTriuOp(XPUOpTest):
+
         def setUp(self):
             self.init_dtype()
             self.initTestCase()
@@ -44,9 +47,9 @@ def setUp(self):
             self.op_type = "tril_triu"
             self.place = paddle.XPUPlace(0)
             if self.dtype == np.int32:
-                self.X = np.arange(
-                    1, self.get_Xshape_prod() + 1,
-                    dtype=self.dtype).reshape(self.Xshape)
+                self.X = np.arange(1,
+                                   self.get_Xshape_prod() + 1,
+                                   dtype=self.dtype).reshape(self.Xshape)
             else:
                 self.X = np.random.random(self.Xshape).astype(dtype=self.dtype)
             self.inputs = {'X': self.X}
@@ -55,7 +58,8 @@ def setUp(self):
                 'lower': True if self.real_op_type == 'tril' else False,
             }
             self.outputs = {
-                'Out': self.real_np_op(self.X, self.diagonal)
+                'Out':
+                self.real_np_op(self.X, self.diagonal)
                 if self.diagonal else self.real_np_op(self.X)
             }
 
@@ -92,42 +96,50 @@ def initTestCase(self):
             self.Xshape = (10, 10)
 
     class TestTrilTriuOp1(TestTrilTriuOp):
+
         def initTestCase(self):
             self.diagonal = -3
             self.Xshape = (5, 5)
 
     class TestTrilTriuOp2(TestTrilTriuOp):
+
         def initTestCase(self):
             self.diagonal = 4
             self.Xshape = (11, 17)
 
     class TestTrilTriuOp3(TestTrilTriuOp):
+
         def initTestCase(self):
             self.diagonal = 10
             self.Xshape = (2, 25, 25)
 
     class TestTrilTriuOp4(TestTrilTriuOp):
+
         def initTestCase(self):
             self.diagonal = -10
             self.Xshape = (1, 2, 33, 11)
 
     class TestTrilTriuOp5(TestTrilTriuOp):
+
         def initTestCase(self):
             self.diagonal = 11
             self.Xshape = (1, 1, 99)
 
     class TestTrilTriuOp6(TestTrilTriuOp):
+
         def initTestCase(self):
             self.diagonal = 5
             self.Xshape = (1, 2, 3, 5, 99)
 
     class TestTrilTriuOp7(TestTrilTriuOp):
+
         def initTestCase(self):
             self.diagonal = -100
             self.Xshape = (2, 2, 3, 4, 5)
 
 
 class TestTrilTriuOpError(unittest.TestCase):
+
     def test_errors1(self):
         paddle.enable_static()
         data = fluid.data(shape=(20, 22), dtype='float32', name="data1")
@@ -137,8 +149,8 @@ def test_errors1(self):
             "diagonal in {} must be a python Int".format(op_type),
         }
         expected = list(errmsg.keys())[0]
-        with self.assertRaisesRegex(
-                eval(expected.split(':')[-1]), errmsg[expected]):
+        with self.assertRaisesRegex(eval(expected.split(':')[-1]),
+                                    errmsg[expected]):
             getattr(tensor, op_type)(x=data, diagonal='2022')
 
     def test_errors2(self):
@@ -150,8 +162,8 @@ def test_errors2(self):
             "x shape in {} must be at least 2-D".format(op_type),
         }
         expected = list(errmsg.keys())[0]
-        with self.assertRaisesRegex(
-                eval(expected.split(':')[-1]), errmsg[expected]):
+        with self.assertRaisesRegex(eval(expected.split(':')[-1]),
+                                    errmsg[expected]):
             getattr(tensor, op_type)(x=data, diagonal=[None])
 
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py
index d096cb8ec13f4..f9ccf0576a26c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import unittest
 import numpy
@@ -30,6 +31,7 @@
 
 
 class TestXPUTrunctedGaussianRandomOp(TestTrunctedGaussianRandomOp):
+
     def test_xpu(self):
         if paddle.is_compiled_with_xpu():
             self.gaussian_random_test(place=fluid.XPUPlace(0))
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py
index ab59fd2665679..d28029d188323 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import sys
+
 sys.path.append("..")
 import subprocess
 import unittest
@@ -31,6 +32,7 @@
 
 
 class TestXPUUniformRandomOp(TestUniformRandomOp):
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
@@ -41,6 +43,7 @@ def test_check_output(self):
 
 
 class TestXPUUniformRandomOpSelectedRows(TestUniformRandomOpSelectedRows):
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py
index f6c540d6c2c0a..6daa47394122a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import numpy as np
@@ -29,11 +30,13 @@
 
 
 class XPUTestUnsqueeze2Op(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = "unsqueeze2"
         self.use_dynamic_create_class = False
 
     class TestUnsqueeze2Op(XPUOpTest):
+
         def setUp(self):
             self.op_type = "unsqueeze2"
             self.use_mkldnn = False
@@ -79,6 +82,7 @@ def test_check_grad(self):
 
     # Correct: Single input index.
     class TestUnsqueeze2Op1(TestUnsqueeze2Op):
+
         def init_test_case(self):
             self.ori_shape = (20, 5)
             self.axes = (-1, )
@@ -86,6 +90,7 @@ def init_test_case(self):
 
     # Correct: Mixed input axis.
     class TestUnsqueeze2Op2(TestUnsqueeze2Op):
+
         def init_test_case(self):
             self.ori_shape = (20, 5)
             self.axes = (0, -1)
@@ -93,6 +98,7 @@ def init_test_case(self):
 
     # Correct: There is duplicated axis.
     class TestUnsqueeze2Op3(TestUnsqueeze2Op):
+
         def init_test_case(self):
             self.ori_shape = (10, 2, 5)
             self.axes = (0, 3, 3)
@@ -100,6 +106,7 @@ def init_test_case(self):
 
     # Correct: Reversed axes.
     class TestUnsqueeze2Op4(TestUnsqueeze2Op):
+
         def init_test_case(self):
             self.ori_shape = (10, 2, 5)
             self.axes = (3, 1, 1)
@@ -107,6 +114,7 @@ def init_test_case(self):
 
     # axes is a list(with tensor)
     class TestUnsqueeze2Op_AxesTensorList(XPUOpTest):
+
         def setUp(self):
             self.op_type = "unsqueeze2"
             self.use_mkldnn = False
@@ -151,24 +159,28 @@ def init_attrs(self):
             self.attrs = {}
 
     class TestUnsqueeze2Op1_AxesTensorList(TestUnsqueeze2Op_AxesTensorList):
+
         def init_test_case(self):
             self.ori_shape = (20, 5)
             self.axes = (-1, )
             self.new_shape = (20, 5, 1)
 
     class TestUnsqueeze2Op2_AxesTensorList(TestUnsqueeze2Op_AxesTensorList):
+
         def init_test_case(self):
             self.ori_shape = (20, 5)
             self.axes = (0, -1)
             self.new_shape = (1, 20, 5, 1)
 
     class TestUnsqueeze2Op3_AxesTensorList(TestUnsqueeze2Op_AxesTensorList):
+
         def init_test_case(self):
             self.ori_shape = (10, 2, 5)
             self.axes = (0, 3, 3)
             self.new_shape = (1, 10, 2, 1, 1, 5)
 
     class TestUnsqueeze2Op4_AxesTensorList(TestUnsqueeze2Op_AxesTensorList):
+
         def init_test_case(self):
             self.ori_shape = (10, 2, 5)
             self.axes = (3, 1, 1)
@@ -176,6 +188,7 @@ def init_test_case(self):
 
     # axes is a Tensor
     class TestUnsqueeze2Op_AxesTensor(XPUOpTest):
+
         def setUp(self):
             self.op_type = "unsqueeze2"
             self.use_mkldnn = False
@@ -215,24 +228,28 @@ def init_attrs(self):
             self.attrs = {}
 
     class TestUnsqueeze2Op1_AxesTensor(TestUnsqueeze2Op_AxesTensor):
+
         def init_test_case(self):
             self.ori_shape = (20, 5)
             self.axes = (-1, )
             self.new_shape = (20, 5, 1)
 
     class TestUnsqueeze2Op2_AxesTensor(TestUnsqueeze2Op_AxesTensor):
+
         def init_test_case(self):
             self.ori_shape = (20, 5)
             self.axes = (0, -1)
             self.new_shape = (1, 20, 5, 1)
 
     class TestUnsqueeze2Op3_AxesTensor(TestUnsqueeze2Op_AxesTensor):
+
         def init_test_case(self):
             self.ori_shape = (10, 2, 5)
             self.axes = (0, 3, 3)
             self.new_shape = (1, 10, 2, 1, 1, 5)
 
     class TestUnsqueeze2Op4_AxesTensor(TestUnsqueeze2Op_AxesTensor):
+
         def init_test_case(self):
             self.ori_shape = (10, 2, 5)
             self.axes = (3, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py
index 5e40073e73112..9e505fe08a647 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import unittest
 import sys
+
 sys.path.append("..")
 
 import numpy as np
@@ -29,6 +30,7 @@
 
 # Correct: General.
 class TestUnsqueezeOp(XPUOpTest):
+
     def setUp(self):
         self.init_test_case()
         self.op_type = "unsqueeze"
@@ -59,6 +61,7 @@ def init_attrs(self):
 
 # Correct: Single input index.
 class TestUnsqueezeOp1(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (-1, )
@@ -67,6 +70,7 @@ def init_test_case(self):
 
 # Correct: Mixed input axis.
 class TestUnsqueezeOp2(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (20, 5)
         self.axes = (0, -1)
@@ -75,6 +79,7 @@ def init_test_case(self):
 
 # Correct: There is duplicated axis.
 class TestUnsqueezeOp3(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (0, 3, 3)
@@ -83,6 +88,7 @@ def init_test_case(self):
 
 # Correct: Reversed axes.
 class TestUnsqueezeOp4(TestUnsqueezeOp):
+
     def init_test_case(self):
         self.ori_shape = (10, 2, 5)
         self.axes = (3, 1, 1)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py
index 33b13081b5442..0aecc48fe3506 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py
@@ -14,6 +14,7 @@
 
 import unittest
 import sys
+
 sys.path.append("..")
 import numpy as np
 from op_test import OpTest
@@ -26,6 +27,7 @@
 
 
 class TestUpdateLossScalingOp(XPUOpTest):
+
     def setUp(self):
         self.op_type = "update_loss_scaling"
         self.init()
@@ -69,6 +71,7 @@ def test_check_output(self):
 
 
 class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
+
     def setUp(self):
         self.op_type = "update_loss_scaling"
         self.init()
@@ -101,17 +104,21 @@ def test_check_output(self):
 
 
 class TestUpdateLossScalingLayer(unittest.TestCase):
+
     def loss_scaling_check(self, scope=fluid.Scope()):
         a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
         b = fluid.data(name="b", shape=[512, 128], dtype='float32')
         x = [a, b]
         found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
-        prev_loss_scaling = fluid.data(
-            name="prev_loss_scaling", shape=[1], dtype='float32')
-        num_good_steps = fluid.data(
-            name="num_good_steps", shape=[1], dtype='int32')
-        num_bad_steps = fluid.data(
-            name="num_bad_steps", shape=[1], dtype='int32')
+        prev_loss_scaling = fluid.data(name="prev_loss_scaling",
+                                       shape=[1],
+                                       dtype='float32')
+        num_good_steps = fluid.data(name="num_good_steps",
+                                    shape=[1],
+                                    dtype='int32')
+        num_bad_steps = fluid.data(name="num_bad_steps",
+                                   shape=[1],
+                                   dtype='int32')
 
         a_v = np.random.random([1024, 1024]).astype('float32')
         b_v = np.random.random([512, 128]).astype('float32')
@@ -125,17 +132,16 @@ def loss_scaling_check(self, scope=fluid.Scope()):
         incr_ratio = 2
         decr_ratio = 0.8
 
-        result = amp_nn.update_loss_scaling(
-            x,
-            found_inf,
-            prev_loss_scaling,
-            num_good_steps,
-            num_bad_steps,
-            incr_every_n_steps,
-            decr_every_n_nan_or_inf,
-            incr_ratio,
-            decr_ratio,
-            name="update_loss_scaling")
+        result = amp_nn.update_loss_scaling(x,
+                                            found_inf,
+                                            prev_loss_scaling,
+                                            num_good_steps,
+                                            num_bad_steps,
+                                            incr_every_n_steps,
+                                            decr_every_n_nan_or_inf,
+                                            incr_ratio,
+                                            decr_ratio,
+                                            name="update_loss_scaling")
 
         place = fluid.XPUPlace(0)
         exe = fluid.Executor(place)
@@ -167,12 +173,15 @@ def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
         b = fluid.data(name="b", shape=[512, 128], dtype='float32')
         x = [a, b]
         found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
-        prev_loss_scaling = fluid.data(
-            name="prev_loss_scaling", shape=[1], dtype='float32')
-        num_good_steps = fluid.data(
-            name="num_good_steps", shape=[1], dtype='int32')
-        num_bad_steps = fluid.data(
-            name="num_bad_steps", shape=[1], dtype='int32')
+        prev_loss_scaling = fluid.data(name="prev_loss_scaling",
+                                       shape=[1],
+                                       dtype='float32')
+        num_good_steps = fluid.data(name="num_good_steps",
+                                    shape=[1],
+                                    dtype='int32')
+        num_bad_steps = fluid.data(name="num_bad_steps",
+                                   shape=[1],
+                                   dtype='int32')
 
         a_v = np.random.random([1024, 1024]).astype('float32')
         b_v = np.random.random([512, 128]).astype('float32')
@@ -189,17 +198,16 @@ def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
         incr_ratio = 2
         decr_ratio = 0.8
 
-        result = amp_nn.update_loss_scaling(
-            x,
-            found_inf,
-            prev_loss_scaling,
-            num_good_steps,
-            num_bad_steps,
-            incr_every_n_steps,
-            decr_every_n_nan_or_inf,
-            incr_ratio,
-            decr_ratio,
-            name="update_loss_scaling")
+        result = amp_nn.update_loss_scaling(x,
+                                            found_inf,
+                                            prev_loss_scaling,
+                                            num_good_steps,
+                                            num_bad_steps,
+                                            incr_every_n_steps,
+                                            decr_every_n_nan_or_inf,
+                                            incr_ratio,
+                                            decr_ratio,
+                                            name="update_loss_scaling")
 
         place = fluid.XPUPlace(0)
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py
index 9c86286d3d8c2..1b90fa93588f8 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -30,10 +31,12 @@
 
 
 class XPUTestWhereIndexOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'where_index'
 
     class TestWhereIndexOp(XPUOpTest):
+
         def setUp(self):
             self.init_config()
             self.init_data()
@@ -54,6 +57,7 @@ def init_config(self):
             self.__class__.no_need_check_grad = True
 
     class TestAllFalse(TestWhereIndexOp):
+
         def init_data(self):
             self.inputs = {
                 'Condition': np.array([False, False, False]).astype(self.dtype),
@@ -61,6 +65,7 @@ def init_data(self):
             self.outputs = {'Out': np.array([], dtype='int64')}
 
     class TestRank2(TestWhereIndexOp):
+
         def init_data(self):
             self.inputs = {
                 'Condition':
@@ -69,6 +74,7 @@ def init_data(self):
             self.outputs = {'Out': np.array([[0, 0], [1, 1]], dtype='int64')}
 
     class TestRank3(TestWhereIndexOp):
+
         def init_data(self):
             self.inputs = {
                 'Condition':
@@ -78,7 +84,8 @@ def init_data(self):
             }
 
             self.outputs = {
-                'Out': np.array(
+                'Out':
+                np.array(
                     [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]],
                     dtype='int64')
             }
@@ -90,6 +97,7 @@ def init_data(self):
 
 
 class TestWhereOpError(unittest.TestCase):
+
     def test_api(self):
         with program_guard(Program(), Program()):
             cond = fluid.layers.data(name='cond', shape=[4], dtype='bool')
@@ -102,7 +110,9 @@ def test_api(self):
 
 
 class TestWhereRaiseError(unittest.TestCase):
+
     def test_errors(self):
+
         def test_type():
             fluid.layers.where([10])
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
index 461b56ff0d8a8..ad22ab86b932f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
 import sys
+
 sys.path.append("..")
 
 import paddle
@@ -31,10 +32,12 @@
 
 
 class XPUTestWhereOp(XPUOpTestWrapper):
+
     def __init__(self):
         self.op_name = 'where'
 
     class TestXPUWhereOp(XPUOpTest):
+
         def setUp(self):
             self.init_config()
             self.init_data()
@@ -56,18 +59,19 @@ def test_check_output(self):
             self.check_output_with_place(self.place)
 
     class TestXPUWhereOp2(TestXPUWhereOp):
+
         def init_data(self):
             self.x = np.random.uniform(-5, 5, (60, 2)).astype(self.dtype)
             self.y = np.random.uniform(-5, 5, (60, 2)).astype(self.dtype)
             self.cond = np.ones((60, 2)).astype("bool")
 
     class TestXPUWhereOp3(TestXPUWhereOp):
+
         def init_data(self):
             self.x = np.random.uniform(-3, 5, (20, 2, 4)).astype(self.dtype)
             self.y = np.random.uniform(-3, 5, (20, 2, 4)).astype(self.dtype)
-            self.cond = np.array(
-                np.random.randint(
-                    2, size=(20, 2, 4)), dtype=bool)
+            self.cond = np.array(np.random.randint(2, size=(20, 2, 4)),
+                                 dtype=bool)
 
 
 support_types = get_xpu_op_support_types('where')
@@ -76,6 +80,7 @@ def init_data(self):
 
 
 class TestXPUWhereAPI(unittest.TestCase):
+
     def setUp(self):
         self.__class__.use_xpu = True
         self.place = paddle.XPUPlace(0)
@@ -100,8 +105,9 @@ def test_api(self):
                 train_prog = fluid.Program()
                 startup = fluid.Program()
                 with fluid.program_guard(train_prog, startup):
-                    cond = fluid.data(
-                        name='cond', shape=self.shape, dtype='bool')
+                    cond = fluid.data(name='cond',
+                                      shape=self.shape,
+                                      dtype='bool')
                     x = fluid.data(name='x', shape=self.shape, dtype='float32')
                     y = fluid.data(name='y', shape=self.shape, dtype='float32')
 
@@ -119,12 +125,13 @@ def test_api(self):
                         fetch_list.append(x.grad_name)
                     if y_stop_gradient is False:
                         fetch_list.append(y.grad_name)
-                    out = exe.run(
-                        train_prog,
-                        feed={'cond': self.cond,
-                              'x': self.x,
-                              'y': self.y},
-                        fetch_list=fetch_list)
+                    out = exe.run(train_prog,
+                                  feed={
+                                      'cond': self.cond,
+                                      'x': self.x,
+                                      'y': self.y
+                                  },
+                                  fetch_list=fetch_list)
                     assert np.array_equal(out[0], self.out)
 
                     if x_stop_gradient is False:
@@ -144,21 +151,24 @@ def test_api_broadcast(self, use_cuda=False):
             x = fluid.layers.data(name='x', shape=[4, 1], dtype='float32')
             y = fluid.layers.data(name='y', shape=[4, 2], dtype='float32')
             x_i = np.array([[0.9383, 0.1983, 3.2, 1.2]]).astype("float32")
-            y_i = np.array([[1.0, 1.0, 1.0, 1.0],
-                            [1.0, 1.0, 1.0, 1.0]]).astype("float32")
+            y_i = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0,
+                                                   1.0]]).astype("float32")
             result = paddle.where(x > 1, x=x, y=y)
 
             exe = fluid.Executor(self.place)
             exe.run(startup)
 
             out = exe.run(train_prog,
-                          feed={'x': x_i,
-                                'y': y_i},
+                          feed={
+                              'x': x_i,
+                              'y': y_i
+                          },
                           fetch_list=[result])
             assert np.array_equal(out[0], np.where(x_i > 1, x_i, y_i))
 
 
 class TestWhereDygraphAPI(unittest.TestCase):
+
     def test_api(self):
         with fluid.dygraph.guard(paddle.XPUPlace(0)):
             x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py b/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py
index 57d456d0193de..cc898e3537a41 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py
@@ -24,6 +24,7 @@
 
 
 class Test_XPU_Places(unittest.TestCase):
+
     def assert_places_equal(self, places0, places1):
         self.assertEqual(len(places0), len(places1))
         for place0, place1 in zip(places0, places1):
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index cdc9b14b6e328..613d04a7f69e9 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -15,6 +15,7 @@
 
 import sys
 import os
+
 __all__ = [
     'TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer',
     'HeterXpuTrainer', 'HeterPipelineTrainer'
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index d64f4f17ae323..a34fb2dea7dc5 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -19,8 +19,9 @@
 import numpy as np
 from paddle.fluid.log_helper import get_logger
 
-local_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+local_logger = get_logger(__name__,
+                          logging.INFO,
+                          fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer, PSGPUTrainer, HeterPipelineTrainer
 from .device_worker import Hogwild, DownpourSGD, DownpourLite, Section, DownpourSGDOPT, HeterSection
@@ -84,13 +85,13 @@ def _create_trainer(self, opt_info=None):
                 if opt_info.get("use_ps_gpu") is not None:
                     trainer._set_use_ps_gpu(opt_info["use_ps_gpu"])
                 if opt_info.get("enable_random_dump") is not None:
-                    trainer._set_enable_random_dump(opt_info[
-                        "enable_random_dump"])
+                    trainer._set_enable_random_dump(
+                        opt_info["enable_random_dump"])
                 if opt_info.get("dump_interval") is not None:
                     trainer._set_dump_interval(opt_info["dump_interval"])
                 if opt_info.get("random_with_lineid") is not None:
-                    trainer._set_random_with_lineid(opt_info[
-                        "random_with_lineid"])
+                    trainer._set_random_with_lineid(
+                        opt_info["random_with_lineid"])
 
             if "fleet_desc" in opt_info:
                 device_worker._set_fleet_desc(opt_info["fleet_desc"])
@@ -101,18 +102,18 @@ def _create_trainer(self, opt_info=None):
                     trainer._set_no_cvm(opt_info["no_cvm"])
                 if opt_info.get(
                         "scale_sparse_gradient_with_batch_size") is not None:
-                    trainer._set_scale_sparse_grad_with_batch_size(opt_info[
-                        "scale_sparse_gradient_with_batch_size"])
+                    trainer._set_scale_sparse_grad_with_batch_size(
+                        opt_info["scale_sparse_gradient_with_batch_size"])
                 if opt_info.get("scale_datanorm") is not None:
                     trainer._set_scale_datanorm(opt_info["scale_datanorm"])
                 if opt_info.get("adjust_ins_weight") is not None:
-                    trainer._set_adjust_ins_weight(opt_info[
-                        "adjust_ins_weight"])
+                    trainer._set_adjust_ins_weight(
+                        opt_info["adjust_ins_weight"])
                 if opt_info.get("copy_table") is not None:
                     trainer._set_copy_table_config(opt_info["copy_table"])
                 if opt_info.get("check_nan_var_names") is not None:
-                    trainer._set_check_nan_var_names(opt_info[
-                        "check_nan_var_names"])
+                    trainer._set_check_nan_var_names(
+                        opt_info["check_nan_var_names"])
                 if opt_info.get("loss_names") is not None:
                     trainer._set_loss_names(opt_info["loss_names"])
             trainer._set_device_worker(device_worker)
@@ -127,8 +128,8 @@ class FetchHandlerMonitor(object):
 
     def __init__(self, scope, handler):
         self.fetch_instance = handler
-        self.fetch_thread = threading.Thread(
-            target=self.handler_launch_func, args=(scope, self.fetch_instance))
+        self.fetch_thread = threading.Thread(target=self.handler_launch_func,
+                                             args=(scope, self.fetch_instance))
         self.running_lock = threading.Lock()
         self.running = False
 
@@ -140,8 +141,8 @@ def handler_launch_func(self, scope, handler):
             if isinstance(fetch_instance.var_dict[key], Variable):
                 var_name_to_key[fetch_instance.var_dict[key].name] = key
             else:
-                local_logger.warning("the value of {} is not a Variable".format(
-                    key))
+                local_logger.warning(
+                    "the value of {} is not a Variable".format(key))
                 var_name_to_key["None.var"] = key
         elapsed_secs = 0
         while True:
@@ -159,8 +160,9 @@ def handler_launch_func(self, scope, handler):
                     var = scope.find_var(key)
                     fetch_dict[key] = var
                     if var == None:
-                        local_logger.warning("{} value currently not available".
-                                             format(var_name_to_key[key]))
+                        local_logger.warning(
+                            "{} value currently not available".format(
+                                var_name_to_key[key]))
                 res_dict = {}
                 for key in fetch_dict:
                     user_name = var_name_to_key[key]
diff --git a/python/paddle/fluid/transpiler/ascend_transpiler.py b/python/paddle/fluid/transpiler/ascend_transpiler.py
index 5593c91b5bc64..69fb2b1833655 100644
--- a/python/paddle/fluid/transpiler/ascend_transpiler.py
+++ b/python/paddle/fluid/transpiler/ascend_transpiler.py
@@ -14,11 +14,13 @@
 
 from . import collective
 from .. import core
+
 OpRole = core.op_proto_and_checker_maker.OpRole
 from paddle.distributed import fleet
 
 
 class AscendTranspiler(collective.Collective):
+
     def __init__(self, startup_program, main_program):
         self.nrings = 1
         super(AscendTranspiler, self).__init__(self.nrings)
@@ -48,24 +50,22 @@ def _insert_allreduce_ops(self):
                     # As we search ops reversedly, we should insert c_allreduce_sum
                     # op in the same way to keep the ring_id alternate
                     ring_id = (ring_id + 1) % self.nrings
-                    block._insert_op(
-                        offset + 1,
-                        type='c_allreduce_sum',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            self.op_role_key: OpRole.Backward
-                        })
-                    block._insert_op(
-                        offset + 2,
-                        type='scale',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'scale': 1.0 / fleet.worker_num(),
-                            self.op_role_key: OpRole.Backward
-                        })
+                    block._insert_op(offset + 1,
+                                     type='c_allreduce_sum',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         self.op_role_key: OpRole.Backward
+                                     })
+                    block._insert_op(offset + 2,
+                                     type='scale',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'scale': 1.0 / fleet.worker_num(),
+                                         self.op_role_key: OpRole.Backward
+                                     })
 
         if grad is None:
             return
diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py
index 95ab446e1de6d..cb57ea2a421cb 100644
--- a/python/paddle/fluid/transpiler/collective.py
+++ b/python/paddle/fluid/transpiler/collective.py
@@ -122,69 +122,63 @@ def _init_communicator(self,
 
         block = program.global_block()
         if core.is_compiled_with_npu():
-            hccl_id_var = block.create_var(
-                name=unique_name.generate('hccl_id'),
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
+            hccl_id_var = block.create_var(name=unique_name.generate('hccl_id'),
+                                           persistable=True,
+                                           type=core.VarDesc.VarType.RAW)
             endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
-            block.append_op(
-                type='c_gen_hccl_id',
-                inputs={},
-                outputs={'Out': hccl_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    self.op_role_key: OpRole.Forward
-                })
-            block.append_op(
-                type='c_comm_init_hccl',
-                inputs={'X': hccl_id_var},
-                outputs={},
-                attrs={
-                    'rank': rank,
-                    'ring_id': ring_id,
-                    'device_id': int(os.getenv("FLAGS_selected_npus")),
-                    'rank_ids': nranks,
-                    self.op_role_key: OpRole.Forward
-                })
+            block.append_op(type='c_gen_hccl_id',
+                            inputs={},
+                            outputs={'Out': hccl_id_var},
+                            attrs={
+                                'rank': rank,
+                                'endpoint': current_endpoint,
+                                'other_endpoints': other_endpoints,
+                                self.op_role_key: OpRole.Forward
+                            })
+            block.append_op(type='c_comm_init_hccl',
+                            inputs={'X': hccl_id_var},
+                            outputs={},
+                            attrs={
+                                'rank': rank,
+                                'ring_id': ring_id,
+                                'device_id':
+                                int(os.getenv("FLAGS_selected_npus")),
+                                'rank_ids': nranks,
+                                self.op_role_key: OpRole.Forward
+                            })
         else:
-            nccl_id_var = block.create_var(
-                name=unique_name.generate('nccl_id'),
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-            block.append_op(
-                type='c_gen_nccl_id',
-                inputs={},
-                outputs={'Out': nccl_id_var},
-                attrs={
-                    'rank': rank,
-                    'endpoint': current_endpoint,
-                    'other_endpoints': other_endpoints,
-                    self.op_role_key: OpRole.Forward
-                })
+            nccl_id_var = block.create_var(name=unique_name.generate('nccl_id'),
+                                           persistable=True,
+                                           type=core.VarDesc.VarType.RAW)
+            block.append_op(type='c_gen_nccl_id',
+                            inputs={},
+                            outputs={'Out': nccl_id_var},
+                            attrs={
+                                'rank': rank,
+                                'endpoint': current_endpoint,
+                                'other_endpoints': other_endpoints,
+                                self.op_role_key: OpRole.Forward
+                            })
             if not has_multitrainer:
-                block.append_op(
-                    type='c_comm_init',
-                    inputs={'X': nccl_id_var},
-                    outputs={},
-                    attrs={
-                        'nranks': nranks,
-                        'rank': rank,
-                        'ring_id': ring_id,
-                        self.op_role_key: OpRole.Forward
-                    })
+                block.append_op(type='c_comm_init',
+                                inputs={'X': nccl_id_var},
+                                outputs={},
+                                attrs={
+                                    'nranks': nranks,
+                                    'rank': rank,
+                                    'ring_id': ring_id,
+                                    self.op_role_key: OpRole.Forward
+                                })
             else:
-                block.append_op(
-                    type='c_comm_init_multitrainer',
-                    inputs={'X': nccl_id_var},
-                    outputs={},
-                    attrs={
-                        'ntrainers': nranks,
-                        'trainer_id': rank,
-                        'ring_id': ring_id,
-                        self.op_role_key: OpRole.Forward
-                    })
+                block.append_op(type='c_comm_init_multitrainer',
+                                inputs={'X': nccl_id_var},
+                                outputs={},
+                                attrs={
+                                    'ntrainers': nranks,
+                                    'trainer_id': rank,
+                                    'ring_id': ring_id,
+                                    self.op_role_key: OpRole.Forward
+                                })
 
     def _broadcast_params(self):
         block = self.startup_program.global_block()
@@ -194,23 +188,23 @@ def _broadcast_params(self):
                 continue
 
             ring_id = (ring_id + 1) % self.nrings
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    self.op_role_key: OpRole.Forward
-                })
+            block.append_op(type='c_broadcast',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                'root': 0,
+                                self.op_role_key: OpRole.Forward
+                            })
 
         for ring_id in range(self.nrings):
-            block.append_op(
-                type='c_sync_comm_stream',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={'ring_id': ring_id,
-                       self.op_role_key: OpRole.Forward})
+            block.append_op(type='c_sync_comm_stream',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                self.op_role_key: OpRole.Forward
+                            })
 
     def _is_loss_grad_op(self, op):
         if self.op_role_key not in op.attr_names:
@@ -252,15 +246,14 @@ def _insert_scale_loss_grad_ops(self):
         for idx, op in reversed(list(enumerate(block.ops))):
             if self._is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
-                block._insert_op(
-                    idx + 1,
-                    type='scale',
-                    inputs={'X': loss_grad_var},
-                    outputs={'Out': loss_grad_var},
-                    attrs={
-                        'scale': 1.0 / self.nranks,
-                        self.op_role_key: OpRole.Backward
-                    })
+                block._insert_op(idx + 1,
+                                 type='scale',
+                                 inputs={'X': loss_grad_var},
+                                 outputs={'Out': loss_grad_var},
+                                 attrs={
+                                     'scale': 1.0 / self.nranks,
+                                     self.op_role_key: OpRole.Backward
+                                 })
 
     def _insert_allreduce_ops(self):
         block = self.main_program.global_block()
@@ -295,15 +288,14 @@ def _insert_allreduce_ops(self):
                     # As we search ops reversedly, we should insert c_allreduce_sum
                     # op in the same way to keep the ring_id alternate
                     ring_id = (ring_id + 1) % self.nrings
-                    block._insert_op(
-                        offset,
-                        type='c_allreduce_sum',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            self.op_role_key: OpRole.Backward
-                        })
+                    block._insert_op(offset,
+                                     type='c_allreduce_sum',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         self.op_role_key: OpRole.Backward
+                                     })
 
         if grad is None:
             return
@@ -311,15 +303,14 @@ def _insert_allreduce_ops(self):
         for idx, op in enumerate(block.ops):
             if self._is_optimizer_op(op):
                 for ring_id in range(self.nrings):
-                    block._insert_op(
-                        idx + ring_id,
-                        type='c_sync_comm_stream',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            self.op_role_key: OpRole.Backward
-                        })
+                    block._insert_op(idx + ring_id,
+                                     type='c_sync_comm_stream',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         self.op_role_key: OpRole.Backward
+                                     })
                 break
 
 
@@ -342,16 +333,14 @@ def _transpile_startup_program(self):
                 non_dist_params.append(param)
 
         for param in non_dist_params:
-            snapshot = block.create_var(
-                name=self.snapshot_name(param.name),
-                shape=param.shape,
-                persistable=True,
-                stop_gradient=True)
-            block.append_op(
-                type='assign',
-                inputs={'X': [param]},
-                outputs={'Out': [snapshot]},
-                attrs={self.op_role_key: OpRole.Forward})
+            snapshot = block.create_var(name=self.snapshot_name(param.name),
+                                        shape=param.shape,
+                                        persistable=True,
+                                        stop_gradient=True)
+            block.append_op(type='assign',
+                            inputs={'X': [param]},
+                            outputs={'Out': [snapshot]},
+                            attrs={self.op_role_key: OpRole.Forward})
 
     def snapshot_name(self, param_name):
         return param_name + self.snapshot_key
@@ -366,69 +355,67 @@ def _transpile_main_program(self):
                 if param.is_distributed:
                     continue
 
-                snapshot = block.create_var(
-                    name=self.snapshot_name(param.name),
-                    shape=param.shape,
-                    persistable=True,
-                    stop_gradient=True,
-                    dtype=param.dtype)
-
-                block._insert_op(
-                    idx + 1,
-                    type='elementwise_sub',
-                    inputs={'X': [snapshot],
-                            'Y': [param]},
-                    outputs={'Out': [param]},
-                    attrs={self.op_role_key: OpRole.Optimize})
-                block._insert_op(
-                    idx + 2,
-                    type='c_sync_calc_stream',
-                    inputs={'X': param},
-                    outputs={'Out': param},
-                    attrs={self.op_role_key: OpRole.Optimize})
+                snapshot = block.create_var(name=self.snapshot_name(param.name),
+                                            shape=param.shape,
+                                            persistable=True,
+                                            stop_gradient=True,
+                                            dtype=param.dtype)
+
+                block._insert_op(idx + 1,
+                                 type='elementwise_sub',
+                                 inputs={
+                                     'X': [snapshot],
+                                     'Y': [param]
+                                 },
+                                 outputs={'Out': [param]},
+                                 attrs={self.op_role_key: OpRole.Optimize})
+                block._insert_op(idx + 2,
+                                 type='c_sync_calc_stream',
+                                 inputs={'X': param},
+                                 outputs={'Out': param},
+                                 attrs={self.op_role_key: OpRole.Optimize})
                 ring_id = (ring_id + 1) % self.nrings
-                block._insert_op(
-                    idx + 3,
-                    type='c_allreduce_sum',
-                    inputs={'X': [param]},
-                    outputs={'Out': [param]},
-                    attrs={
-                        'ring_id': ring_id,
-                        self.op_role_key: OpRole.Optimize
-                    })
+                block._insert_op(idx + 3,
+                                 type='c_allreduce_sum',
+                                 inputs={'X': [param]},
+                                 outputs={'Out': [param]},
+                                 attrs={
+                                     'ring_id': ring_id,
+                                     self.op_role_key: OpRole.Optimize
+                                 })
 
                 ordered_param_snapshot.append((param, snapshot))
 
         for ring_id in range(self.nrings):
-            block.append_op(
-                type='c_sync_comm_stream',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={'ring_id': ring_id,
-                       self.op_role_key: OpRole.Optimize})
+            block.append_op(type='c_sync_comm_stream',
+                            inputs={'X': param},
+                            outputs={'Out': param},
+                            attrs={
+                                'ring_id': ring_id,
+                                self.op_role_key: OpRole.Optimize
+                            })
 
         for param_snapshot in reversed(ordered_param_snapshot):
             param = param_snapshot[0]
             snapshot = param_snapshot[1]
-            block.append_op(
-                type='scale',
-                inputs={'X': [param]},
-                outputs={'Out': [param]},
-                attrs={
-                    'scale': 1.0 / self.nranks,
-                    self.op_role_key: OpRole.Optimize
-                })
-            block.append_op(
-                type='elementwise_sub',
-                inputs={'X': [snapshot],
-                        'Y': [param]},
-                outputs={'Out': [param]},
-                attrs={self.op_role_key: OpRole.Optimize})
-            block.append_op(
-                type='assign',
-                inputs={'X': [param]},
-                outputs={'Out': [snapshot]},
-                attrs={self.op_role_key: OpRole.Optimize})
+            block.append_op(type='scale',
+                            inputs={'X': [param]},
+                            outputs={'Out': [param]},
+                            attrs={
+                                'scale': 1.0 / self.nranks,
+                                self.op_role_key: OpRole.Optimize
+                            })
+            block.append_op(type='elementwise_sub',
+                            inputs={
+                                'X': [snapshot],
+                                'Y': [param]
+                            },
+                            outputs={'Out': [param]},
+                            attrs={self.op_role_key: OpRole.Optimize})
+            block.append_op(type='assign',
+                            inputs={'X': [param]},
+                            outputs={'Out': [snapshot]},
+                            attrs={self.op_role_key: OpRole.Optimize})
 
 
 class SingleProcessMultiThread(GradAllReduce):
@@ -464,42 +451,27 @@ def _transpile_startup_program(self):
             print("total endpoints: ", self.endpoints)
             print("rank: %d, ring_id: %d" % (self.rank, self.nrings))
             for ring_id in range(self.nrings):
-                self._init_communicator(
-                    self.startup_program, self.current_endpoint, self.endpoints,
-                    self.rank, ring_id, self.wait_port, True)
+                self._init_communicator(self.startup_program,
+                                        self.current_endpoint, self.endpoints,
+                                        self.rank, ring_id, self.wait_port,
+                                        True)
 
         else:
             if "xpu" in self.trans_mode:
                 print(
-                    "begin to _transpile_startup_program for single-node in XPU")
+                    "begin to _transpile_startup_program for single-node in XPU"
+                )
                 block = self.startup_program.global_block()
-                comm_id_var = block.create_var(
-                    name=unique_name.generate('comm_id'),
-                    persistable=True,
-                    type=core.VarDesc.VarType.RAW)
                 block.append_op(
-                    type='c_gen_bkcl_id',
-                    inputs={},
-                    outputs={'Out': comm_id_var},
+                    type='c_comm_init_all',
                     attrs={
-                        'rank': self.rank,
-                        'endpoint': self.current_endpoint,
-                        'other_endpoints': self.other_endpoints,
-                        'ring_id': 0,
-                        self.op_role_key: OpRole.Forward
+                        'devices':
+                        list(
+                            map(int,
+                                os.getenv("FLAGS_selected_gpus").split(","))),
+                        'ring_id':
+                        0
                     })
-                block.append_op(
-                    type='c_comm_init',
-                    inputs={'X': comm_id_var},
-                    outputs={},
-                    attrs={
-                        'nranks':
-                        len(os.getenv("FLAGS_selected_gpus").split(",")),
-                        'rank': self.rank,
-                        'ring_id': 0,
-                        self.op_role_key: OpRole.Forward
-                    })
-
             else:
                 print("begin to _transpile_startup_program for single-node")
                 block = self.startup_program.global_block()
@@ -515,6 +487,11 @@ def _transpile_main_program(self):
         elif self.trans_mode == "fuse_all_reduce":
             print("begin to transpile in fuse all-reduce mode")
             self._insert_fuse_allreduce_ops()
+        elif self.trans_mode == "all_reduce_xpu" and len(
+                os.getenv("FLAGS_selected_gpus").split(",")) == 1:
+            print(
+                "skip transpile in all-reduce-xpu mode when number of devices is only one"
+            )
         else:
             print("begin to transpile in all-reduce mode")
             self._insert_allreduce_ops()
@@ -560,16 +537,15 @@ def _insert_allgather_ops(self):
                     # As we search ops reversedly, we should insert c_allgather
                     # op in the same way to keep the ring_id alternate
                     ring_id = (ring_id + 1) % self.nrings
-                    block._insert_op(
-                        offset,
-                        type='c_allgather',
-                        inputs={'X': grad},
-                        outputs={'Out': new_grad_var},
-                        attrs={
-                            'nranks': self.allgather_ranks,
-                            'ring_id': ring_id,
-                            self.op_role_key: OpRole.Backward
-                        })
+                    block._insert_op(offset,
+                                     type='c_allgather',
+                                     inputs={'X': grad},
+                                     outputs={'Out': new_grad_var},
+                                     attrs={
+                                         'nranks': self.allgather_ranks,
+                                         'ring_id': ring_id,
+                                         self.op_role_key: OpRole.Backward
+                                     })
 
         if grad is None:
             return
@@ -577,15 +553,14 @@ def _insert_allgather_ops(self):
         for idx, op in enumerate(block.ops):
             if self._is_optimizer_op(op):
                 for ring_id in range(self.nrings):
-                    block._insert_op(
-                        idx + ring_id,
-                        type='c_sync_comm_stream',
-                        inputs={'X': grad},
-                        outputs={'Out': grad},
-                        attrs={
-                            'ring_id': ring_id,
-                            self.op_role_key: OpRole.Backward
-                        })
+                    block._insert_op(idx + ring_id,
+                                     type='c_sync_comm_stream',
+                                     inputs={'X': grad},
+                                     outputs={'Out': grad},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         self.op_role_key: OpRole.Backward
+                                     })
                 break
 
     def _update_adam_ops(self):
@@ -616,10 +591,14 @@ def _update_adam_ops(self):
                     "Beta2PowOut": block.vars[op.output("Beta2PowOut")[0]]
                 }
                 attrs = {
-                    "epsilon": op.attr('epsilon'),
-                    "beta1": op.attr('beta1'),
-                    "beta2": op.attr('beta2'),
-                    "lazy_mode": op.attr('lazy_mode'),
+                    "epsilon":
+                    op.attr('epsilon'),
+                    "beta1":
+                    op.attr('beta1'),
+                    "beta2":
+                    op.attr('beta2'),
+                    "lazy_mode":
+                    op.attr('lazy_mode'),
                     "min_row_size_to_use_multithread":
                     op.attr('min_row_size_to_use_multithread')
                 }
@@ -631,25 +610,27 @@ def _update_adam_ops(self):
                         dtype=core.VarDesc.VarType.FP32,
                         stop_gradient=True) for i in range(self.allgather_ranks)
                 ]
-                block._insert_op(
-                    offset,
-                    type="split",
-                    inputs={
-                        'X': block.vars[op.input("Param")[0] + "_allgather"]
-                    },
-                    outputs={'Out': split_vars},
-                    attrs={'num': self.allgather_ranks,
-                           'axis': 0})
+                block._insert_op(offset,
+                                 type="split",
+                                 inputs={
+                                     'X':
+                                     block.vars[op.input("Param")[0] +
+                                                "_allgather"]
+                                 },
+                                 outputs={'Out': split_vars},
+                                 attrs={
+                                     'num': self.allgather_ranks,
+                                     'axis': 0
+                                 })
                 offset += 1
 
                 for i in range(self.allgather_ranks):
                     inputs["Grad"] = split_vars[i]
-                    block._insert_op(
-                        offset,
-                        type=op.type,
-                        inputs=inputs,
-                        outputs=outputs,
-                        attrs=attrs)
+                    block._insert_op(offset,
+                                     type=op.type,
+                                     inputs=inputs,
+                                     outputs=outputs,
+                                     attrs=attrs)
                     offset += 1
                 # remove the original adam op
                 block._remove_op(offset)
@@ -699,47 +680,45 @@ def _insert_fuse_allreduce_ops(self):
             if self._is_optimizer_op(op):
                 for segment in segments:
                     # insert coalesce tensor
-                    tmp_var = block.create_var(
-                        name=unique_name.generate('FusedOutput_{}'.format(
-                            segment[0].name)),
-                        dtype=segment[0].dtype,
-                        persistable=False,
-                        stop_gradient=True)
+                    tmp_var = block.create_var(name=unique_name.generate(
+                        'FusedOutput_{}'.format(segment[0].name)),
+                                               dtype=segment[0].dtype,
+                                               persistable=False,
+                                               stop_gradient=True)
                     fused_vars.append(tmp_var)
-                    block._insert_op(
-                        idx,
-                        type="coalesce_tensor",
-                        inputs={"Input": segment},
-                        outputs={"Output": segment,
-                                 "FusedOutput": tmp_var},
-                        attrs={
-                            "copy_data": True,
-                            "use_align": True,
-                            "dtype": segment[0].dtype,
-                            self.op_role_key: OpRole.Backward
-                        })
+                    block._insert_op(idx,
+                                     type="coalesce_tensor",
+                                     inputs={"Input": segment},
+                                     outputs={
+                                         "Output": segment,
+                                         "FusedOutput": tmp_var
+                                     },
+                                     attrs={
+                                         "copy_data": True,
+                                         "use_align": True,
+                                         "dtype": segment[0].dtype,
+                                         self.op_role_key: OpRole.Backward
+                                     })
                 break
 
         # insert the allreduce_sum op
         for idx, op in enumerate(block.ops):
             if self._is_optimizer_op(op):
                 for fused_var in fused_vars:
-                    block._insert_op(
-                        idx,
-                        type='c_allreduce_sum',
-                        inputs={'X': fused_var},
-                        outputs={'Out': fused_var},
-                        attrs={
-                            'ring_id': ring_id,
-                            'use_calc_stream': False,
-                            self.op_role_key: OpRole.Backward
-                        })
-                    block._insert_op(
-                        idx,
-                        type='c_sync_calc_stream',
-                        inputs={'X': fused_var},
-                        outputs={'Out': fused_var},
-                        attrs={self.op_role_key: OpRole.Backward})
+                    block._insert_op(idx,
+                                     type='c_allreduce_sum',
+                                     inputs={'X': fused_var},
+                                     outputs={'Out': fused_var},
+                                     attrs={
+                                         'ring_id': ring_id,
+                                         'use_calc_stream': False,
+                                         self.op_role_key: OpRole.Backward
+                                     })
+                    block._insert_op(idx,
+                                     type='c_sync_calc_stream',
+                                     inputs={'X': fused_var},
+                                     outputs={'Out': fused_var},
+                                     attrs={self.op_role_key: OpRole.Backward})
                 break
 
         if len(fused_vars) == 0:
@@ -749,14 +728,13 @@ def _insert_fuse_allreduce_ops(self):
         # insert the sync comm op
         for idx, op in enumerate(block.ops):
             if self._is_optimizer_op(op):
-                block._insert_op(
-                    idx,
-                    type='c_sync_comm_stream',
-                    inputs={'X': fused_vars[0]},
-                    outputs={'Out': fused_vars[0]},
-                    attrs={
-                        'ring_id': ring_id,
-                        self.op_role_key: OpRole.Backward
-                    })
+                block._insert_op(idx,
+                                 type='c_sync_comm_stream',
+                                 inputs={'X': fused_vars[0]},
+                                 outputs={'Out': fused_vars[0]},
+                                 attrs={
+                                     'ring_id': ring_id,
+                                     self.op_role_key: OpRole.Backward
+                                 })
                 break
         block._sync_with_cpp()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 0e879264f7460..31d3c817d1ed5 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -78,6 +78,7 @@ def log(*args):
 
 
 class VarBlock:
+
     def __init__(self, varname, offset, size):
         self.varname = varname
         # NOTE: real offset is offset * size
@@ -131,8 +132,8 @@ def slice_variable(var_list, slice_count, min_block_size):
         # update split_count after aligning
         split_count = int(math.ceil(var_numel / float(block_size)))
         for block_id in range(split_count):
-            curr_block_size = min(block_size, var_numel - (
-                (block_id) * block_size))
+            curr_block_size = min(block_size,
+                                  var_numel - ((block_id) * block_size))
             block = VarBlock(var.name, block_id, curr_block_size)
             blocks.append(str(block))
     return blocks
@@ -244,6 +245,7 @@ def sync_mode(self, value):
 
 
 class ServerRuntimeConfig(object):
+
     def __init__(self):
         self._rpc_send_thread_num = int(
             os.getenv("FLAGS_rpc_send_thread_num", "12"))
@@ -392,9 +394,12 @@ def _transpile_nccl2(self,
                 inputs={},
                 outputs={"NCCLID": nccl_id_var},
                 attrs={
-                    "trainers": trainers.split(","),
-                    "trainer_id": trainer_id,
-                    "nccl_comm_num": self.config.nccl_comm_num,
+                    "trainers":
+                    trainers.split(","),
+                    "trainer_id":
+                    trainer_id,
+                    "nccl_comm_num":
+                    self.config.nccl_comm_num,
                     "use_hierarchical_allreduce":
                     self.config.use_hierarchical_allreduce,
                     "hierarchical_allreduce_inter_nranks":
@@ -439,13 +444,12 @@ def _transpile_collective(self,
         else:
             raise ValueError('invalid collective_mode: %s' % collective_mode)
 
-        transpiler.transpile(
-            startup_program=startup_program,
-            main_program=main_program,
-            rank=trainer_id,
-            endpoints=endpoints,
-            current_endpoint=current_endpoint,
-            wait_port=wait_port)
+        transpiler.transpile(startup_program=startup_program,
+                             main_program=main_program,
+                             rank=trainer_id,
+                             endpoints=endpoints,
+                             current_endpoint=current_endpoint,
+                             wait_port=wait_port)
 
     def _get_all_remote_sparse_update_op(self, main_program):
         sparse_update_ops = []
@@ -517,8 +521,10 @@ def _update_remote_sparse_update_op(self, program,
                     program.global_block()._insert_op(
                         index=distributed_idx,
                         type="distributed_lookup_table",
-                        inputs={"Ids": inputs,
-                                'W': w},
+                        inputs={
+                            "Ids": inputs,
+                            'W': w
+                        },
                         outputs={"Outputs": outputs},
                         attrs={
                             "table_names": table_names,
@@ -624,12 +630,11 @@ def transpile(self,
                 self.origin_program._hierarchical_allreduce_inter_nranks = \
                     int(self.config.hierarchical_allreduce_inter_nranks)
 
-            self._transpile_nccl2(
-                trainer_id,
-                trainers,
-                current_endpoint,
-                startup_program=startup_program,
-                wait_port=self.config.wait_port)
+            self._transpile_nccl2(trainer_id,
+                                  trainers,
+                                  current_endpoint,
+                                  startup_program=startup_program,
+                                  wait_port=self.config.wait_port)
             return
 
         if self.config.mode == "collective":
@@ -704,21 +709,24 @@ def transpile(self,
             splited_grad_varname = grad_varname
             if len(splited_vars) == 1:
                 splited_grad_varname = splited_vars[0].name
-                index = find_op_by_output_arg(
-                    program.global_block(), splited_grad_varname, reverse=True)
+                index = find_op_by_output_arg(program.global_block(),
+                                              splited_grad_varname,
+                                              reverse=True)
 
             elif len(splited_vars) > 1:
                 orig_var = program.global_block().vars[splited_grad_varname]
-                index = find_op_by_output_arg(
-                    program.global_block(), splited_grad_varname, reverse=True)
+                index = find_op_by_output_arg(program.global_block(),
+                                              splited_grad_varname,
+                                              reverse=True)
 
                 if not self.config.runtime_split_send_recv:
                     self._insert_split_op(program, orig_var, index,
                                           splited_vars)
                     index += 1
             else:
-                AssertionError("Can not insert the send op by original "
-                               "variable name :", splited_grad_varname)
+                raise AssertionError(
+                    "Can not insert the send op by original "
+                    "variable name :", splited_grad_varname)
 
             if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
                 sparse_param_name = self.grad_name_to_param_name[grad_varname]
@@ -759,10 +767,14 @@ def transpile(self,
                 inputs={"X": send_input_vars},
                 outputs={"Out": dummy_output},
                 attrs={
-                    "epmap": eplist,
-                    "sections": sections,
-                    "send_varnames": send_varnames,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                    "epmap":
+                    eplist,
+                    "sections":
+                    sections,
+                    "send_varnames":
+                    send_varnames,
+                    RPC_OP_ROLE_ATTR_NAME:
+                    RPC_OP_ROLE_ATTR_VALUE,
                     OP_ROLE_VAR_ATTR_NAME: [
                         self.grad_name_to_param_name[grad_varname],
                         splited_grad_varname
@@ -795,12 +807,18 @@ def transpile(self,
                     inputs={"X": self.counter_var},
                     outputs={"Out": decay_dummy_output},
                     attrs={
-                        "epmap": pserver_endpoints,
-                        "sections": sections,
-                        "send_varnames": send_varnames,
-                        "merge_add": True,
-                        "use_send_handler": False,
-                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                        "epmap":
+                        pserver_endpoints,
+                        "sections":
+                        sections,
+                        "send_varnames":
+                        send_varnames,
+                        "merge_add":
+                        True,
+                        "use_send_handler":
+                        False,
+                        RPC_OP_ROLE_ATTR_NAME:
+                        RPC_OP_ROLE_ATTR_VALUE,
                         OP_ROLE_VAR_ATTR_NAME:
                         [self.counter_var.name, self.counter_var.name]
                     })
@@ -809,16 +827,19 @@ def transpile(self,
         if self.sync_mode:
             fetch_barrier_input = []
 
-            program.global_block().append_op(
-                type="send_barrier",
-                inputs={"X": list(input_deps)},
-                outputs={"Out": send_barrier_out},
-                attrs={
-                    "endpoints": pserver_endpoints,
-                    "trainer_id": self.trainer_id,
-                    "half_async": False,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                })
+            program.global_block().append_op(type="send_barrier",
+                                             inputs={"X": list(input_deps)},
+                                             outputs={"Out": send_barrier_out},
+                                             attrs={
+                                                 "endpoints":
+                                                 pserver_endpoints,
+                                                 "trainer_id":
+                                                 self.trainer_id,
+                                                 "half_async":
+                                                 False,
+                                                 RPC_OP_ROLE_ATTR_NAME:
+                                                 RPC_OP_ROLE_ATTR_VALUE
+                                             })
 
             fetch_barrier_input.append(send_barrier_out)
         else:
@@ -896,10 +917,14 @@ def transpile(self,
                     inputs={"X": [recv_dep_in]},
                     outputs={"Out": splited_var},
                     attrs={
-                        "epmap": eps,
-                        "recv_varnames": recv_varnames,
-                        "trainer_id": self.trainer_id,
-                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                        "epmap":
+                        eps,
+                        "recv_varnames":
+                        recv_varnames,
+                        "trainer_id":
+                        self.trainer_id,
+                        RPC_OP_ROLE_ATTR_NAME:
+                        RPC_OP_ROLE_ATTR_VALUE,
                         OP_ROLE_VAR_ATTR_NAME:
                         [param_varname, recv_op_role_var_name]
                     })
@@ -908,15 +933,17 @@ def transpile(self,
 
         if self.sync_mode:
             # form a WAW dependency
-            program.global_block().append_op(
-                type="fetch_barrier",
-                inputs={"X": fetch_barrier_input},
-                outputs={"Out": all_recv_outputs},
-                attrs={
-                    "endpoints": pserver_endpoints,
-                    "trainer_id": self.trainer_id,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                })
+            program.global_block().append_op(type="fetch_barrier",
+                                             inputs={"X": fetch_barrier_input},
+                                             outputs={"Out": all_recv_outputs},
+                                             attrs={
+                                                 "endpoints":
+                                                 pserver_endpoints,
+                                                 "trainer_id":
+                                                 self.trainer_id,
+                                                 RPC_OP_ROLE_ATTR_NAME:
+                                                 RPC_OP_ROLE_ATTR_VALUE
+                                             })
 
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             if len(splited_var) <= 1:
@@ -969,8 +996,8 @@ def _fake_init_sparsetable(self, sparse_table_names):
                     table_param_init_op.append(op)
             init_op_num = len(table_param_init_op)
             if init_op_num != 1:
-                raise ValueError("table init op num should be 1, now is " + str(
-                    init_op_num))
+                raise ValueError("table init op num should be 1, now is " +
+                                 str(init_op_num))
             table_init_op = table_param_init_op[0]
             self.startup_program.global_block().append_op(
                 type="fake_init",
@@ -1134,8 +1161,8 @@ def _get_trainer_startup_program(self, recv_vars, eplist):
             if varname in startup_program.global_block().vars:
                 orig_param = startup_program.global_block().vars[varname]
             else:
-                origin_param_var = self.origin_program.global_block().vars[
-                    varname]
+                origin_param_var = self.origin_program.global_block(
+                ).vars[varname]
                 orig_param = startup_program.global_block().create_var(
                     name=varname,
                     persistable=origin_param_var.persistable,
@@ -1331,8 +1358,8 @@ def __clone_lr_op_sub_block__(op, program, lr_block):
                 # find the origin grad var before clipping/L2Decay,
                 # merged_var should be the input var name of L2Decay
                 grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
-                if op.attr(OP_ROLE_VAR_ATTR_NAME)[
-                        0] == optimize_target_param_name:
+                if op.attr(
+                        OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name:
                     merged_var = self._append_pserver_grad_merge_ops(
                         per_opt_block, grad_varname_for_block, endpoint,
                         grad_to_block_id, self.origin_program)
@@ -1410,11 +1437,10 @@ def __clone_lr_op_sub_block__(op, program, lr_block):
                 'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id
 
         # step5 append the listen_and_serv op
-        pserver_program.global_block().append_op(
-            type="listen_and_serv",
-            inputs={'X': recv_inputs},
-            outputs={},
-            attrs=attrs)
+        pserver_program.global_block().append_op(type="listen_and_serv",
+                                                 inputs={'X': recv_inputs},
+                                                 outputs={},
+                                                 attrs=attrs)
 
         pserver_program._sync_with_cpp()
         # save pserver program to generate pserver side startup relatively.
@@ -1448,8 +1474,8 @@ def get_pserver_programs(self, endpoint):
               pserver_program, pserver_startup_program = t.get_pserver_programs(current_endpoint)
         """
         pserver_prog = self.get_pserver_program(endpoint)
-        pserver_startup = self.get_startup_program(
-            endpoint, pserver_program=pserver_prog)
+        pserver_startup = self.get_startup_program(endpoint,
+                                                   pserver_program=pserver_prog)
         return pserver_prog, pserver_startup
 
     def get_startup_program(self,
@@ -1531,20 +1557,18 @@ def _get_splited_name_and_shape(varname):
                         "truncated_gaussian_random"
                 ]:
                     op._set_attr("shape", list(new_outputs["Out"].shape))
-                s_prog.global_block().append_op(
-                    type=op.type,
-                    inputs=new_inputs,
-                    outputs=new_outputs,
-                    attrs=op.all_attrs())
+                s_prog.global_block().append_op(type=op.type,
+                                                inputs=new_inputs,
+                                                outputs=new_outputs,
+                                                attrs=op.all_attrs())
         if self.config.enable_dc_asgd:
             for p, p_bak in self.param_bak_list:
                 startup_param_var = s_prog.global_block().vars[p.name]
                 startup_tmpvar = s_prog.global_block().vars[p_bak.name]
                 # copy init random value to param_bak
-                s_prog.global_block().append_op(
-                    type="assign",
-                    inputs={"X": startup_param_var},
-                    outputs={"Out": startup_tmpvar})
+                s_prog.global_block().append_op(type="assign",
+                                                inputs={"X": startup_param_var},
+                                                outputs={"Out": startup_tmpvar})
 
         return s_prog
 
@@ -1578,6 +1602,7 @@ def _get_slice_var_info(self, slice_var):
         return is_slice, block_idx, offset
 
     def _get_distributed_optimizer_vars(self):
+
         def _get_distributed_optimizer_var(endpoint):
             opt_op_on_pserver = []
             for _, op in enumerate(self.optimize_ops):
@@ -1702,8 +1727,7 @@ def _init_splited_vars(self):
         if self.config.slice_var_up:
             # when we slice var up into blocks, we will slice the var according to
             # pserver services' count. A pserver may have two or more listening ports.
-            grad_blocks = slice_variable(grad_list,
-                                         len(self.pserver_endpoints),
+            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints),
                                          self.config.min_block_size)
             param_blocks = slice_variable(param_list,
                                           len(self.pserver_endpoints),
@@ -1728,13 +1752,12 @@ def _init_splited_vars(self):
                 is_slice, block_id, offset = self._get_slice_var_info(
                     splited_var)
 
-                self.vars_overview.add_distributed_var(
-                    origin_var=orig_var,
-                    slice_var=splited_var,
-                    block_id=block_id,
-                    offset=offset,
-                    is_slice=is_slice,
-                    vtype="Param")
+                self.vars_overview.add_distributed_var(origin_var=orig_var,
+                                                       slice_var=splited_var,
+                                                       block_id=block_id,
+                                                       offset=offset,
+                                                       is_slice=is_slice,
+                                                       vtype="Param")
 
         # origin_grad_name -> [splited_grad_vars]
         self.grad_var_mapping = self._create_vars_from_blocklist(
@@ -1752,12 +1775,10 @@ def _init_splited_vars(self):
         # create mapping of endpoint -> split var to create pserver side program
         self.param_grad_ep_mapping = collections.OrderedDict()
         [
-            self.param_grad_ep_mapping.update({
-                ep: {
-                    "params": [],
-                    "grads": []
-                }
-            }) for ep in self.pserver_endpoints
+            self.param_grad_ep_mapping.update({ep: {
+                "params": [],
+                "grads": []
+            }}) for ep in self.pserver_endpoints
         ]
 
     # transpiler function for dis lookup_table
@@ -1873,9 +1894,12 @@ def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
                         if self.sync_mode else []
                     },
                     attrs={
-                        "epmap": pserver_endpoints,
-                        "trainer_id": self.trainer_id,
-                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                        "epmap":
+                        pserver_endpoints,
+                        "trainer_id":
+                        self.trainer_id,
+                        RPC_OP_ROLE_ATTR_NAME:
+                        RPC_OP_ROLE_ATTR_VALUE,
                         OP_ROLE_VAR_ATTR_NAME: [
                             self.grad_name_to_param_name[table_grad_name],
                             table_grad_name
@@ -1903,16 +1927,18 @@ def _create_prefetch_block(self, pserver_index, pserver_program,
             dtype=trainer_out.dtype)
         prefetch_block.append_op(
             type="lookup_sparse_table",
-            inputs={'Ids': pserver_ids,
-                    "W": table_var},
+            inputs={
+                'Ids': pserver_ids,
+                "W": table_var
+            },
             outputs={"Out": pserver_out},
             attrs={
                 "is_sparse": True,  # has no effect on lookup_table op
                 "is_distributed": True,
                 "padding_idx": -1
             })
-        prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
-            prefetch_block.idx))
+        prefetch_var_name_to_block_id.append(trainer_ids.name + ":" +
+                                             str(prefetch_block.idx))
         return prefetch_var_name_to_block_id
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
@@ -1922,17 +1948,16 @@ def _create_table_optimize_block(self, pserver_index, pserver_program,
         # create table param and grad var in pserver program
         # create table optimize block in pserver program
         table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
+            op for op in self.optimize_ops if 'Param' in op.input_names
+            and op.input("Param")[0] == self.table_name
         ][0]
 
         origin_param_var = self.origin_program.global_block().vars[
             self.table_name]
 
         zero_dim = int(
-            math.ceil(origin_param_var.shape[0] / float(
-                len(self.pserver_endpoints))))
+            math.ceil(origin_param_var.shape[0] /
+                      float(len(self.pserver_endpoints))))
         table_shape = list(origin_param_var.shape)
         table_shape[0] = zero_dim
 
@@ -2005,18 +2030,16 @@ def _create_checkpoint_save_block(self, pserver_program, pre_block_idx):
         create a new block to handle save checkpoint.
         """
 
-        pserver_program.global_block().create_var(
-            name="kLookupTablePath",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
+        pserver_program.global_block().create_var(name="kLookupTablePath",
+                                                  persistable=True,
+                                                  type=core.VarDesc.VarType.RAW)
 
         checkpoint_save_block = pserver_program._create_block(pre_block_idx)
         # this 'file_path' do not be used in save lookup table variable
-        checkpoint_save_block.append_op(
-            type='save',
-            inputs={'X': [self.table_name]},
-            outputs={},
-            attrs={'file_path': "none"})
+        checkpoint_save_block.append_op(type='save',
+                                        inputs={'X': [self.table_name]},
+                                        outputs={},
+                                        attrs={'file_path': "none"})
 
         return checkpoint_save_block.idx
 
@@ -2090,13 +2113,12 @@ def _create_vars_from_blocklist(self,
         return var_mapping
 
     def _clone_var(self, block, var, persistable=True):
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=persistable)
+        return block.create_var(name=var.name,
+                                shape=var.shape,
+                                dtype=var.dtype,
+                                type=var.type,
+                                lod_level=var.lod_level,
+                                persistable=persistable)
 
     @staticmethod
     def _get_splited_var_sections(splited_vars):
@@ -2113,25 +2135,27 @@ def _insert_split_op(self, program, orig_var, index, splited_vars):
             if self._is_input_of_remote_sparse_update_op(sparse_param_name):
                 self.sparse_param_to_height_sections[
                     sparse_param_name] = height_sections
-            program.global_block()._insert_op(
-                index=index + 1,
-                type="split_selected_rows",
-                inputs={"X": orig_var},
-                outputs={"Out": splited_vars},
-                attrs={
-                    "height_sections": height_sections,
-                    RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                })
+            program.global_block()._insert_op(index=index + 1,
+                                              type="split_selected_rows",
+                                              inputs={"X": orig_var},
+                                              outputs={"Out": splited_vars},
+                                              attrs={
+                                                  "height_sections":
+                                                  height_sections,
+                                                  RPC_OP_ROLE_ATTR_NAME:
+                                                  DIST_OP_ROLE_ATTR_VALUE
+                                              })
         elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
-            program.global_block()._insert_op(
-                index=index + 1,
-                type="split_byref",
-                inputs={"X": orig_var},
-                outputs={"Out": splited_vars},
-                attrs={
-                    "sections": height_sections,
-                    RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                })
+            program.global_block()._insert_op(index=index + 1,
+                                              type="split_byref",
+                                              inputs={"X": orig_var},
+                                              outputs={"Out": splited_vars},
+                                              attrs={
+                                                  "sections":
+                                                  height_sections,
+                                                  RPC_OP_ROLE_ATTR_NAME:
+                                                  DIST_OP_ROLE_ATTR_VALUE
+                                              })
         else:
             AssertionError("Variable type should be in set "
                            "[LOD_TENSOR, SELECTED_ROWS]")
@@ -2225,11 +2249,10 @@ def _append_pserver_grad_merge_ops(self, optimize_block,
                 per_trainer_name = "%s.trainer_%d" % \
                                    (merged_var_name, i)
                 vars2merge.append(pserver_block.vars[per_trainer_name])
-            optimize_block.append_op(
-                type="sum",
-                inputs={"X": vars2merge},
-                outputs={"Out": merged_var},
-                attrs={"use_mkldnn": False})
+            optimize_block.append_op(type="sum",
+                                     inputs={"X": vars2merge},
+                                     outputs={"Out": merged_var},
+                                     attrs={"use_mkldnn": False})
             optimize_block.append_op(
                 type="scale",
                 inputs={"X": merged_var},
@@ -2239,64 +2262,66 @@ def _append_pserver_grad_merge_ops(self, optimize_block,
 
     def _append_dc_asgd_ops(self, block, param_var, grad_var):
         # NOTE: can not use grammar candy here, should put ops in specific block
-        local_param_bak = block.create_var(
-            name="%s.local_bak" % param_var.name,
-            shape=param_var.shape,
-            type=param_var.type,
-            dtype=param_var.dtype,
-            persistable=False)
+        local_param_bak = block.create_var(name="%s.local_bak" % param_var.name,
+                                           shape=param_var.shape,
+                                           type=param_var.type,
+                                           dtype=param_var.dtype,
+                                           persistable=False)
         # trainer_id_var is block local
-        trainer_id_var = block.create_var(
-            name="@TRAINER_ID@",
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            dtype=core.VarDesc.VarType.INT64,
-            shape=[1],
-            persistable=False)
+        trainer_id_var = block.create_var(name="@TRAINER_ID@",
+                                          type=core.VarDesc.VarType.LOD_TENSOR,
+                                          dtype=core.VarDesc.VarType.INT64,
+                                          shape=[1],
+                                          persistable=False)
 
         # ref_inputs = [x[1] for x in self.param_bak_list]
         ref_inputs = []
         for p, p_bak in self.param_bak_list:
             if p.name == param_var.name:
                 ref_inputs.append(p_bak)
-        block.append_op(
-            type="ref_by_trainer_id",
-            inputs={"X": ref_inputs,
-                    "TrainerId": trainer_id_var},
-            outputs={"Out": local_param_bak})
+        block.append_op(type="ref_by_trainer_id",
+                        inputs={
+                            "X": ref_inputs,
+                            "TrainerId": trainer_id_var
+                        },
+                        outputs={"Out": local_param_bak})
 
         def __create_temp_var__():
-            return block.create_var(
-                name=unique_name.generate("tmp_dc_output"),
-                shape=param_var.shape,
-                type=param_var.type,
-                dtype=param_var.dtype,
-                persistable=False)
+            return block.create_var(name=unique_name.generate("tmp_dc_output"),
+                                    shape=param_var.shape,
+                                    type=param_var.type,
+                                    dtype=param_var.dtype,
+                                    persistable=False)
 
         o1 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_sub",
-            inputs={"X": param_var,
-                    "Y": local_param_bak},
-            outputs={"Out": o1})
+        block.append_op(type="elementwise_sub",
+                        inputs={
+                            "X": param_var,
+                            "Y": local_param_bak
+                        },
+                        outputs={"Out": o1})
         o2 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_mul",
-            inputs={"X": o1,
-                    "Y": grad_var},
-            outputs={"Out": o2})
+        block.append_op(type="elementwise_mul",
+                        inputs={
+                            "X": o1,
+                            "Y": grad_var
+                        },
+                        outputs={"Out": o2})
         o3 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_mul",
-            inputs={"X": o2,
-                    "Y": grad_var},
-            outputs={"Out": o3})
+        block.append_op(type="elementwise_mul",
+                        inputs={
+                            "X": o2,
+                            "Y": grad_var
+                        },
+                        outputs={"Out": o3})
         # TODO(typhoonzero): append scale
         o4 = __create_temp_var__()
-        block.append_op(
-            type="elementwise_add",
-            inputs={"X": grad_var,
-                    "Y": o3},
-            outputs={"Out": o4})
+        block.append_op(type="elementwise_add",
+                        inputs={
+                            "X": grad_var,
+                            "Y": o3
+                        },
+                        outputs={"Out": o4})
         return o4
 
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
@@ -2338,11 +2363,10 @@ def _get_param_block(opt_op):
                 param_block = _get_param_block(opt_op)
                 if not param_block:
                     return
-                tmpvar = pserver_block.create_var(
-                    name=param_block.name,
-                    persistable=True,
-                    dtype=param_block.dtype,
-                    shape=param_block.shape)
+                tmpvar = pserver_block.create_var(name=param_block.name,
+                                                  persistable=True,
+                                                  dtype=param_block.dtype,
+                                                  shape=param_block.shape)
                 new_inputs[key] = tmpvar
             elif key == "LearningRate":
                 # learning rate variable has already be created by non-optimize op,
@@ -2369,30 +2393,29 @@ def _get_param_block(opt_op):
             var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
             param_var = new_inputs["Param"]
             # update accumulator variable shape
-            new_shape = self._get_optimizer_input_shape(
-                opt_op.type, key, var.shape, param_var.shape)
-            tmpvar = pserver_block.create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=new_shape)
+            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
+                                                        var.shape,
+                                                        param_var.shape)
+            tmpvar = pserver_block.create_var(name=var.name,
+                                              persistable=var.persistable,
+                                              dtype=var.dtype,
+                                              shape=new_shape)
             new_inputs[key] = tmpvar
 
         # change output's ParamOut variable
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
         outputs["ParamOut"] = new_inputs["Param"]
-        optimize_block.append_op(
-            type=opt_op.type,
-            inputs=new_inputs,
-            outputs=outputs,
-            attrs=opt_op.all_attrs())
+        optimize_block.append_op(type=opt_op.type,
+                                 inputs=new_inputs,
+                                 outputs=outputs,
+                                 attrs=opt_op.all_attrs())
 
         # record sparse grad to param name
         if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS:
             sparse_grad_to_param.append(
-                str(new_inputs["Grad"].name) + ":" + str(new_inputs["Param"]
-                                                         .name))
+                str(new_inputs["Grad"].name) + ":" +
+                str(new_inputs["Param"].name))
 
     def _get_pserver_grad_param_var(self, var, var_dict):
         """
@@ -2436,8 +2459,10 @@ def _clone_lr_op(self, program, block, op):
                 if var not in program.global_block().vars:
                     block._clone_variable(var)
 
-        return block.append_op(
-            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
+        return block.append_op(type=op.type,
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=op.all_attrs())
 
     def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
         program = optimize_block.program
@@ -2452,7 +2477,8 @@ def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
                 # for ops like clipping and weight decay, get the split var (xxx.block0)
                 # for inputs/outputs
                 grad_block = self._get_pserver_grad_param_var(
-                    var, program.global_block().vars)
+                    var,
+                    program.global_block().vars)
                 if grad_block:
                     varlist[i] = grad_block
                 elif var.name not in program.global_block().vars:
@@ -2470,7 +2496,8 @@ def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
             for i in range(len(varlist)):
                 var = varlist[i]
                 grad_block = self._get_pserver_grad_param_var(
-                    var, program.global_block().vars)
+                    var,
+                    program.global_block().vars)
                 if grad_block:
                     varlist[i] = grad_block
                 elif var.name not in program.global_block().vars:
@@ -2480,11 +2507,10 @@ def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
                     varlist[i] = program.global_block().vars[var.name]
             outputs[key] = varlist
 
-        return optimize_block.append_op(
-            type=opt_op.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=opt_op.all_attrs())
+        return optimize_block.append_op(type=opt_op.type,
+                                        inputs=inputs,
+                                        outputs=outputs,
+                                        attrs=opt_op.all_attrs())
 
     def _is_op_connected(self, op1, op2):
         # If one op's input is another op's output or
@@ -2575,16 +2601,15 @@ def _get_lr_ops(self):
                             persistable=counter_var.persistable)
                         for id_ in range(self.trainer_num)
                     ]
-                    for i, op in enumerate(self.startup_program.global_block()
-                                           .ops):
+                    for i, op in enumerate(
+                            self.startup_program.global_block().ops):
                         if op.type == 'fill_constant':
                             for key in op.output_names:
-                                if len(op.output(key)) == 1 and op.output(key)[
-                                        0] == counter_var.name:
-                                    self.startup_program.global_block().ops[
-                                        i]._set_attr(
-                                            'value',
-                                            float(0.0 - self.trainer_num))
+                                if len(op.output(key)) == 1 and op.output(
+                                        key)[0] == counter_var.name:
+                                    self.startup_program.global_block(
+                                    ).ops[i]._set_attr(
+                                        'value', float(0.0 - self.trainer_num))
                     for var in all_trainer_counter_inputs:
                         if var.name == "%s.trainer_%d" % (counter_var.name,
                                                           self.trainer_id):
diff --git a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py
index 5fbbedc12d0b4..eb86ffd36a2d4 100644
--- a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py
+++ b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py
@@ -47,6 +47,7 @@
 
 
 class GeoSgdTranspiler(DistributeTranspiler):
+
     def __init__(self, config=None):
         if config is not None:
             self.config = config
@@ -195,8 +196,8 @@ def get_trainer_program(self, wait_port=True):
     def get_pserver_programs(self, endpoint):
         pserver_prog = self.get_pserver_program(endpoint)
         self.param_grad_ep_mapping = self.param_opt_ep_mapping
-        pserver_startup = self.get_startup_program(
-            endpoint, pserver_program=pserver_prog)
+        pserver_startup = self.get_startup_program(endpoint,
+                                                   pserver_program=pserver_prog)
         return pserver_prog, pserver_startup
 
     def get_pserver_program(self, endpoint):
@@ -230,19 +231,17 @@ def get_pserver_program(self, endpoint):
                     [delta_var_name, param.name]))
             else:
                 delta_type = param.type
-            delta_var = pserver_block.create_var(
-                name=delta_var_name,
-                persistable=False,
-                type=delta_type,
-                dtype=param.dtype,
-                shape=param.shape)
-
-            per_opt_block.append_op(
-                type="sum",
-                inputs={"X": [param, delta_var]},
-                outputs={"Out": param})
-            param_to_block_id.append(delta_var_name + ":" + str(
-                per_opt_block.idx))
+            delta_var = pserver_block.create_var(name=delta_var_name,
+                                                 persistable=False,
+                                                 type=delta_type,
+                                                 dtype=param.dtype,
+                                                 shape=param.shape)
+
+            per_opt_block.append_op(type="sum",
+                                    inputs={"X": [param, delta_var]},
+                                    outputs={"Out": param})
+            param_to_block_id.append(delta_var_name + ":" +
+                                     str(per_opt_block.idx))
 
         attrs = {
             "optimize_blocks": optimize_block,
@@ -258,11 +257,10 @@ def get_pserver_program(self, endpoint):
         }
 
         # step5 append the listen_and_serv op
-        pserver_program.global_block().append_op(
-            type="listen_and_serv",
-            inputs={'X': recv_inputs},
-            outputs={},
-            attrs=attrs)
+        pserver_program.global_block().append_op(type="listen_and_serv",
+                                                 inputs={'X': recv_inputs},
+                                                 outputs={},
+                                                 attrs=attrs)
 
         pserver_program._sync_with_cpp()
         # save pserver program to generate pserver side startup relatively.
@@ -289,8 +287,7 @@ def _init_splited_vars(self):
         # step 2. Slice vars into numbers of piece with block_size
         # when we slice var up into blocks, we will slice the var according to
         # pserver services' count. A pserver may have two or more listening ports.
-        param_blocks = slice_variable(param_list,
-                                      len(self.pserver_endpoints),
+        param_blocks = slice_variable(param_list, len(self.pserver_endpoints),
                                       self.config.min_block_size)
 
         # step 3. Create split param from split blocks
@@ -302,11 +299,9 @@ def _init_splited_vars(self):
         # step 4. Create mapping of endpoint -> split var to create pserver side program
         self.param_opt_ep_mapping = collections.OrderedDict()
         [
-            self.param_opt_ep_mapping.update({
-                ep: {
-                    "params": [],
-                }
-            }) for ep in self.pserver_endpoints
+            self.param_opt_ep_mapping.update({ep: {
+                "params": [],
+            }}) for ep in self.pserver_endpoints
         ]
 
         # step 5. Create delta var of Geo-Sgd & record vars information
@@ -340,13 +335,12 @@ def _init_splited_vars(self):
             for splited_var in splited_vars:
                 is_slice, block_id, offset = self._get_slice_var_info(
                     splited_var)
-                self.vars_overview.add_distributed_var(
-                    origin_var=origin_var,
-                    slice_var=splited_var,
-                    block_id=block_id,
-                    offset=offset,
-                    is_slice=is_slice,
-                    vtype="Param")
+                self.vars_overview.add_distributed_var(origin_var=origin_var,
+                                                       slice_var=splited_var,
+                                                       block_id=block_id,
+                                                       offset=offset,
+                                                       is_slice=is_slice,
+                                                       vtype="Param")
                 self.split_to_origin_mapping[splited_var.name] = origin_name
                 if origin_name in self.sparse_var_list:
                     self.sparse_var_splited_list.append(splited_var.name)
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index fc3bd43a5e56f..090d0e8dcbb87 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -109,7 +109,7 @@ def generate(key):
 # cause memory leak in dygraph mode. It is because the previous
 # naming rule would use `conv_0.tmp` as the key, and in dygraph
 # mode, `conv_i` increases as batch increases. Thus, keys would
-# increase in a way like `conv_0.tmp`, `conv_1.tmp`, .... 
+# increase in a way like `conv_0.tmp`, `conv_1.tmp`, ....
 # Not find a better way to fix this bug in dygraph mode. In TF,
 # variable name is meaningless in eager execution mode, and in
 # PyTorch, there is no variable name at all. Maybe we should
@@ -118,10 +118,10 @@ def generate(key):
 # Another concern is that save/load interfaces. Usually, user
 # would save model in static graph mode, and load it in dygraph
 # mode. Therefore, we keep the variable name of Parameter currently.
-# 
-# Please fix me if a better method is found.    
-# 
-# NOTE(zhiqiu): use c++ unique_name_generator in dygraph mode, 
+#
+# Please fix me if a better method is found.
+#
+# NOTE(zhiqiu): use c++ unique_name_generator in dygraph mode,
 # in order to keep name consistency.
 def generate_with_ignorable_key(key):
     from .framework import _non_static_mode, _dygraph_tracer
diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py
index 7d1bbf8162c2e..e24a6a3aff7ff 100644
--- a/python/paddle/fluid/variable_index.py
+++ b/python/paddle/fluid/variable_index.py
@@ -22,6 +22,7 @@
 
 
 def is_list_tuple(index, contain_type):
+
     def _is_list_tuple(item):
         if not (isinstance(item, (list, tuple)) or type(item) == contain_type):
             return False
@@ -64,14 +65,15 @@ def get_list_index_shape(var_dims, index_dims):
 
 
 class SliceInfo:
+
     def __init__(self):
         self.pre_shape = None
         self.indexes = []
         self.dtype = None
 
     def update(self, index):
-        if is_list_tuple(index, int) or isinstance(index, (
-                paddle.fluid.Variable, np.ndarray)):
+        if is_list_tuple(index, int) or isinstance(
+                index, (paddle.fluid.Variable, np.ndarray)):
             # convert index to Tensor
             if not isinstance(index, paddle.fluid.Variable):
                 index = paddle.assign(index)
@@ -81,8 +83,8 @@ def update(self, index):
             else:
                 if index.dtype != self.dtype:
                     raise IndexError(
-                        "Data type of Tensor/List index should be same. The current data type is {}, but the previous data type is {}.".
-                        format(index.dtype, self.dtype))
+                        "Data type of Tensor/List index should be same. The current data type is {}, but the previous data type is {}."
+                        .format(index.dtype, self.dtype))
 
             self.indexes.append(index)
 
@@ -90,12 +92,12 @@ def update(self, index):
                 self.pre_shape = index.shape
             else:
                 if self.pre_shape != index.shape:
-                    # broadcast 
+                    # broadcast
                     cur_shape = paddle.broadcast_shape(self.pre_shape,
                                                        index.shape)
                     for i in range(len(self.indexes)):
-                        self.indexes[i] = paddle.broadcast_to(self.indexes[i],
-                                                              cur_shape)
+                        self.indexes[i] = paddle.broadcast_to(
+                            self.indexes[i], cur_shape)
                 self.pre_shape = self.indexes[-1].shape
         else:
             raise ValueError(
@@ -121,12 +123,15 @@ def get_offset_stride(self, tensor_shape):
 
         if len(self.indexes) <= len(tensor_shape) or len(self.indexes) == 1:
             shape = paddle.stack(self.indexes)
-            axes = list(range(1, len(self.pre_shape) + 1)) + [0, ]
+            axes = list(range(1,
+                              len(self.pre_shape) + 1)) + [
+                                  0,
+                              ]
 
         else:
             raise ValueError(
-                "too many indices for tensor: tensor is {}-dimensional, but {} were indexed".
-                format(len(tensor_shape), self.pre_shape[0]))
+                "too many indices for tensor: tensor is {}-dimensional, but {} were indexed"
+                .format(len(tensor_shape), self.pre_shape[0]))
 
         shape_transpose = paddle.transpose(shape, axes)
         return shape_transpose
@@ -156,22 +161,25 @@ def set_item(self, tensor_origin, value):
         shape_transpose = self.get_offset_stride(tensor_origin.shape)
         index = paddle.assign(shape_transpose)
 
-        gather_tensor_shape = get_list_index_shape(
-            tensor.shape, [len(self.indexes), ] + list(self.indexes[-1].shape))
+        gather_tensor_shape = get_list_index_shape(tensor.shape, [
+            len(self.indexes),
+        ] + list(self.indexes[-1].shape))
 
-        value_dims_bd = [1, ] * len(gather_tensor_shape)
+        value_dims_bd = [
+            1,
+        ] * len(gather_tensor_shape)
         value_dims_bd[-len(value.shape):] = list(value.shape)
 
         for i in range(len(gather_tensor_shape)):
-            if not (value_dims_bd[i] == gather_tensor_shape[i] or
-                    value_dims_bd[i] == 1):
+            if not (value_dims_bd[i] == gather_tensor_shape[i]
+                    or value_dims_bd[i] == 1):
                 raise ValueError("{} can not broadcast into {}".format(
                     value.shape, gather_tensor_shape))
 
         value_broadcast = paddle.broadcast_to(value, gather_tensor_shape)
 
-        value_1d = value_broadcast.reshape([-1] + gather_tensor_shape[len(
-            index.shape) - 1:])
+        value_1d = value_broadcast.reshape(
+            [-1] + gather_tensor_shape[len(index.shape) - 1:])
 
         index_1d = index.reshape([-1, index.shape[-1]])
 
@@ -218,8 +226,9 @@ def replace_ellipsis(var, item):
     if ell_idx == len(item) - 1:
         return item[:-1]
     else:
-        item[ell_idx:ell_idx + 1] = [slice(None)] * (
-            len(var.shape) - len(item) + item.count(None) + 1)
+        item[ell_idx:ell_idx +
+             1] = [slice(None)
+                   ] * (len(var.shape) - len(item) + item.count(None) + 1)
 
     return item
 
@@ -267,8 +276,8 @@ def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags):
     from .layers import utils
 
     if utils._contain_var(attr):
-        inputs[tensor_attr_name] = utils._convert_to_tensor_list(
-            attr, dtype="int64")
+        inputs[tensor_attr_name] = utils._convert_to_tensor_list(attr,
+                                                                 dtype="int64")
         for i, dim in enumerate(attr):
             if isinstance(dim, Variable):
                 attrs[attr_name].append(-1)
@@ -279,7 +288,7 @@ def deal_attrs(attrs, attr, attr_name, tensor_attr_name, inputs, infer_flags):
         attrs[attr_name] = attr
 
 
-# the item is a tensor of bool 
+# the item is a tensor of bool
 def get_value_for_bool_tensor(var, item):
     if len(item.shape) > len(var.shape):
         raise IndexError("The dims of bool index doesn't match indexed array, "
@@ -306,9 +315,8 @@ def idx_empty(var):
         return paddle.empty(var_shape, dtype=var.dtype)
 
     from .layers.control_flow import cond
-    return cond(
-        paddle.logical_not(item.any()), lambda: idx_empty(var),
-        lambda: idx_not_empty(var, item))
+    return cond(paddle.logical_not(item.any()), lambda: idx_empty(var),
+                lambda: idx_not_empty(var, item))
 
 
 def _getitem_impl_(var, item):
@@ -343,8 +351,8 @@ def _getitem_impl_(var, item):
     slice_info = SliceInfo()
 
     for dim, slice_item in enumerate(item):
-        if is_integer_or_scalar_tensor(slice_item) and not is_bool_tensor(
-                slice_item):
+        if is_integer_or_scalar_tensor(
+                slice_item) and not is_bool_tensor(slice_item):
             if isinstance(slice_item,
                           int) and var.shape[dim] is not None and var.shape[
                               dim] >= 0 and slice_item >= var.shape[dim]:
@@ -377,9 +385,9 @@ def _getitem_impl_(var, item):
                 start = 0 if step > 0 else MAX_INTEGER
             if end is None:
                 if var.shape[dim] != -1 and (
-                        paddle.fluid.framework._non_static_mode() or
-                        var.desc.type() != core.VarDesc.VarType.LOD_TENSOR_ARRAY
-                ):
+                        paddle.fluid.framework._non_static_mode()
+                        or var.desc.type() !=
+                        core.VarDesc.VarType.LOD_TENSOR_ARRAY):
                     end = var.shape[dim] if step > 0 else -1
                 else:
                     end = MAX_INTEGER if step > 0 else -1
@@ -399,8 +407,8 @@ def _getitem_impl_(var, item):
 
             if len(item) != 1:
                 raise IndexError(
-                    "When index contains a list, its length must be 1, but received {}.".
-                    format(len(item)))
+                    "When index contains a list, its length must be 1, but received {}."
+                    .format(len(item)))
             new_slice_item = []
             if all_bool:
                 if len(slice_item) != var.shape[0]:
@@ -447,8 +455,8 @@ def _getitem_impl_(var, item):
 
         else:
             raise IndexError(
-                "Valid index accept int or slice or ellipsis or list, but received {}.".
-                format(slice_item))
+                "Valid index accept int or slice or ellipsis or list, but received {}."
+                .format(slice_item))
 
         axes.append(dim)
         starts.append(start)
@@ -459,8 +467,8 @@ def _getitem_impl_(var, item):
     if slice_info.indexes:
         if len(slice_info.indexes) != len(item):
             raise IndexError(
-                "Valid index accept int or slice or ellipsis or list, but received {}.".
-                format(item))
+                "Valid index accept int or slice or ellipsis or list, but received {}."
+                .format(item))
         return slice_info.get_item(var)
 
     inputs = {'Input': [var]}
@@ -489,11 +497,10 @@ def _getitem_impl_(var, item):
             name=unique_name.generate_with_ignorable_key(var.name + "_" +
                                                          op_type),
             dtype=var.dtype)
-        target_block.append_op(
-            type=op_type,
-            inputs=inputs,
-            outputs={'Out': [slice_out_var]},
-            attrs=attrs)
+        target_block.append_op(type=op_type,
+                               inputs=inputs,
+                               outputs={'Out': [slice_out_var]},
+                               attrs=attrs)
         out = slice_out_var
 
     if len(reverse_axes) > 0:
@@ -555,8 +562,8 @@ def _setitem_impl_(var, item, value):
     slice_info = SliceInfo()
     dim = 0
     for _, slice_item in enumerate(item):
-        if is_integer_or_scalar_tensor(slice_item) and not is_bool_tensor(
-                slice_item):
+        if is_integer_or_scalar_tensor(
+                slice_item) and not is_bool_tensor(slice_item):
             decrease_axes.append(dim)
             start = slice_item
             end = slice_item + 1 if slice_item != -1 else MAX_INTEGER
@@ -601,8 +608,8 @@ def _setitem_impl_(var, item, value):
 
             if len(item) != 1:
                 raise IndexError(
-                    "When index contains a bool list, its length must be 1, but received {}.".
-                    format(len(item)))
+                    "When index contains a bool list, its length must be 1, but received {}."
+                    .format(len(item)))
 
             from .layers import assign
             idx_tensor = assign(slice_item)
@@ -612,8 +619,8 @@ def _setitem_impl_(var, item, value):
             if slice_item.dtype == core.VarDesc.VarType.BOOL:
                 if len(item) != 1:
                     raise IndexError(
-                        "When index contains a bool tensor, its length must be 1, but received {}.".
-                        format(len(item)))
+                        "When index contains a bool tensor, its length must be 1, but received {}."
+                        .format(len(item)))
                 return set_value_for_bool_tensor(var, slice_item, value)
             else:
                 slice_info.update(slice_item)
@@ -632,8 +639,8 @@ def _setitem_impl_(var, item, value):
     if slice_info.indexes:
         if len(slice_info.indexes) != len(item):
             raise IndexError(
-                "Valid index accept int or slice or ellipsis or list, but received {}.".
-                format(item))
+                "Valid index accept int or slice or ellipsis or list, but received {}."
+                .format(item))
         return slice_info.set_item(var, value)
     attrs = {
         'axes': axes,
@@ -702,17 +709,16 @@ def _setitem_impl_(var, item, value):
         var._bump_inplace_version()
 
     cur_block = default_main_program().current_block()
-    cur_block.append_op(
-        type="set_value",
-        inputs=inputs,
-        outputs={'Out': var},
-        attrs=attrs,
-        inplace_map={"Input": "Out"})
+    cur_block.append_op(type="set_value",
+                        inputs=inputs,
+                        outputs={'Out': var},
+                        attrs=attrs,
+                        inplace_map={"Input": "Out"})
 
     return var
 
 
-# the item is a tensor of bool 
+# the item is a tensor of bool
 def set_value_for_bool_tensor(var, item, value):
     if len(item.shape) > len(var.shape):
         raise IndexError("The dims of bool index doesn't match indexed array, "
diff --git a/python/paddle/fluid/wrapped_decorator.py b/python/paddle/fluid/wrapped_decorator.py
index 7e7dbff65611e..5f837b575637c 100644
--- a/python/paddle/fluid/wrapped_decorator.py
+++ b/python/paddle/fluid/wrapped_decorator.py
@@ -19,6 +19,7 @@
 
 
 def wrap_decorator(decorator_func):
+
     @decorator.decorator
     def __impl__(func, *args, **kwargs):
         wrapped_func = decorator_func(func)
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index a3584a73dfae1..34423e3f3ed6b 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: import framework api under this directory 
+# TODO: import framework api under this directory
 
 from . import random  # noqa: F401
 from .random import seed  # noqa: F401
diff --git a/python/paddle/framework/dtype.py b/python/paddle/framework/dtype.py
index f49f748975882..56a95f48b5f9b 100644
--- a/python/paddle/framework/dtype.py
+++ b/python/paddle/framework/dtype.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py
index 350b1f1567bd8..41fd0c0703bbc 100644
--- a/python/paddle/framework/framework.py
+++ b/python/paddle/framework/framework.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define framework api 
+# TODO: define framework api
 from paddle.fluid.layer_helper_base import LayerHelperBase
 from paddle.fluid.data_feeder import convert_dtype
 from paddle.fluid.framework import _dygraph_tracer
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
index 8e8dd7855113b..09f3c51240191 100644
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
@@ -96,7 +96,7 @@ def _load_state_dict_from_save_inference_model(model_path, config):
 
 
 def _load_state_dict_from_save_params(model_path):
-    # Try to load all the files in the directory in VarBase format, 
+    # Try to load all the files in the directory in VarBase format,
     # the file name is used as the name of VarBase
     load_var_list = []
 
@@ -157,7 +157,7 @@ def _build_load_path_and_config(path, config):
     elif not prefix_format_exist and not directory_format_exist:
         error_msg = "The ``path`` (%s) to load model not exists."
         # if current path is a prefix, and the path.pdparams or path.pdopt
-        # is exist, users may want use `paddle.load` load the result of 
+        # is exist, users may want use `paddle.load` load the result of
         # `fluid.save_dygraph`, we raise error here for users
         params_file_path = path + ".pdparams"
         opti_file_path = path + ".pdopt"
@@ -237,8 +237,9 @@ def _pickle_save(obj, f, protocol):
             type(protocol)))
 
     if protocol < 2 or protocol > 4:
-        raise ValueError("Expected 1<'protocol'<5, but received protocol={}".
-                         format(protocol))
+        raise ValueError(
+            "Expected 1<'protocol'<5, but received protocol={}".format(
+                protocol))
 
     def reduce_varbase(self):
         data = self.numpy()
@@ -330,20 +331,20 @@ def _is_state_dict(obj):
     if isinstance(obj, dict):
 
         def condition(obj):
-            return isinstance(obj, (fluid.Layer, Program, core.VarBase,
-                                    core.eager.Tensor, core.LoDTensor,
-                                    core.SelectedRows))
+            return isinstance(
+                obj, (fluid.Layer, Program, core.VarBase, core.eager.Tensor,
+                      core.LoDTensor, core.SelectedRows))
 
-        # If the value of a dict is a core.VarBase/LoDTensor or a dict 
-        # that does not contain a paddle type(Layer, Program, VarBase, LoDTensor, SelectedRows), 
+        # If the value of a dict is a core.VarBase/LoDTensor or a dict
+        # that does not contain a paddle type(Layer, Program, VarBase, LoDTensor, SelectedRows),
         # the dict is considered to be a state_ dict.
         for key, value in obj.items():
             if isinstance(value, dict):
                 for k, v in value.items():
                     if _contain_x(v, condition):
                         return False
-            elif not isinstance(value, (core.VarBase, core.eager.Tensor,
-                                        core.LoDTensor)):
+            elif not isinstance(
+                    value, (core.VarBase, core.eager.Tensor, core.LoDTensor)):
                 return False
         return True
 
@@ -432,12 +433,13 @@ def _parse_every_object(obj, condition_func, convert_func):
                 obj,
             (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor)):
             raise NotImplementedError(
-                "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}.".
-                format(type(obj)))
+                "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}."
+                .format(type(obj)))
         return obj
 
 
 def _parse_load_result(obj, return_numpy):
+
     def is_layer(obj):
         return isinstance(obj, fluid.Layer)
 
@@ -460,12 +462,12 @@ def tuple_to_tensor(obj):
     def ndarray_to_tensor(obj):
         return _ndarray_to_tensor(obj, return_numpy=return_numpy)
 
-    # tuple(name, ndarry) was converted from varbase of paddle2.1, 
+    # tuple(name, ndarry) was converted from varbase of paddle2.1,
     # and all tuple(name, ndarry) are converted to tensor.
     if _contain_x(obj, _transformed_from_varbase):
         return _parse_every_object(obj, _transformed_from_varbase,
                                    tuple_to_tensor)
-    # If there is no tuple(name, ndary), it is considered to be saved by paddle2.0 
+    # If there is no tuple(name, ndary), it is considered to be saved by paddle2.0
     # or converted from LoDTensor, and all ndarrays are converted to tensor.
     else:
         return _parse_every_object(obj, _transformed_from_lodtensor,
@@ -565,8 +567,8 @@ def _save_binary_var(obj, path):
     else:
         # Since the concept of 'Tensor' is only exposed to users, the error message can only contain tensor instead of 'LoDTensor' or 'SelectedRows'
         raise NotImplementedError(
-            "When use_binary_format = True, `paddle.save`  expected Tensor, but received {}.".
-            format(type(obj)))
+            "When use_binary_format = True, `paddle.save`  expected Tensor, but received {}."
+            .format(type(obj)))
 
 
 def save(obj, path, protocol=4, **configs):
@@ -752,8 +754,9 @@ def _legacy_save(obj, path, protocol=2):
             type(protocol)))
 
     if protocol < 2 or protocol > 4:
-        raise ValueError("Expected 1<'protocol'<5, but received protocol={}".
-                         format(protocol))
+        raise ValueError(
+            "Expected 1<'protocol'<5, but received protocol={}".format(
+                protocol))
 
     if _is_file_path(path):
         filename = os.path.basename(path)
@@ -968,8 +971,8 @@ def load(path, **configs):
                             del load_result["StructuredToParameterName@@"]
                     else:
                         # paddle2.1 static.save/load
-                        load_result = _parse_load_result(load_result,
-                                                         config.return_numpy)
+                        load_result = _parse_load_result(
+                            load_result, config.return_numpy)
 
                 else:
                     load_result = _parse_load_result(load_result,
@@ -1030,18 +1033,18 @@ def _legacy_load(path, **configs):
         if os.path.exists(model_file_path):
             # Load state dict by `jit.save/io.save_inference_model` save format
             # NOTE(chenweihang): [ Compatibility of save_inference_model save format ]
-            # The model saved by `save_inference_model` does not completely correspond to 
-            # the information required by the `state_dict` under the dygraph. 
-            # `save_inference_model` not save structured name, we need to remind 
+            # The model saved by `save_inference_model` does not completely correspond to
+            # the information required by the `state_dict` under the dygraph.
+            # `save_inference_model` not save structured name, we need to remind
             # the user to configure the `use_structured_name` argument when `set_state_dict`
-            # NOTE(chenweihang): `jit.save` doesn't save optimizer state 
-            load_result = _load_state_dict_from_save_inference_model(model_path,
-                                                                     config)
+            # NOTE(chenweihang): `jit.save` doesn't save optimizer state
+            load_result = _load_state_dict_from_save_inference_model(
+                model_path, config)
         else:
             # load state dict by `io.save_params/persistables` save format
             # TODO(chenweihang): [ Now only supports loading parameters separately ]
             # If users save all parameters as one file, the [ variable.name -> variable ]
-            # mapping info will lost, so users need to give variable list, but users build 
+            # mapping info will lost, so users need to give variable list, but users build
             # variable list in dygraph mode is difficult, we recommend users to use
             # paddle.static.load_program_state in this case
             load_result = _load_state_dict_from_save_params(model_path)
diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py
index b58d36b8e7d50..6c5ff2c8efba9 100644
--- a/python/paddle/framework/random.py
+++ b/python/paddle/framework/random.py
@@ -38,7 +38,7 @@ def seed(seed):
 
     """
     #TODO(zhiqiu): 1. remove program.random_seed when all random-related op upgrade
-    # 2. support gpu generator by global device 
+    # 2. support gpu generator by global device
 
     seed = int(seed)
 
diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py
index a8e034c87b8e7..1ba33a6b52bd7 100644
--- a/python/paddle/hapi/callbacks.py
+++ b/python/paddle/hapi/callbacks.py
@@ -68,6 +68,7 @@ def config_callbacks(callbacks=None,
 
 
 class CallbackList(object):
+
     def __init__(self, callbacks=None):
         # copy
         self.callbacks = [c for c in callbacks]
@@ -441,8 +442,8 @@ def on_eval_begin(self, logs=None):
             'samples': 0,
         }
 
-        self.eval_progbar = ProgressBar(
-            num=self.eval_steps, verbose=self.verbose)
+        self.eval_progbar = ProgressBar(num=self.eval_steps,
+                                        verbose=self.verbose)
         if self._is_print():
             print('Eval begin...')
 
@@ -485,8 +486,8 @@ def on_predict_begin(self, logs=None):
             'samples': 0,
         }
 
-        self.test_progbar = ProgressBar(
-            num=self.test_steps, verbose=self.verbose)
+        self.test_progbar = ProgressBar(num=self.test_steps,
+                                        verbose=self.verbose)
         if self._is_print():
             print('Predict begin...')
 
@@ -925,8 +926,9 @@ def _updates(self, logs, mode):
                 else:
                     continue
 
-                self.writer.add_scalar(
-                    tag=temp_tag, step=total_step, value=temp_value)
+                self.writer.add_scalar(tag=temp_tag,
+                                       step=total_step,
+                                       value=temp_value)
 
     def on_train_batch_end(self, step, logs=None):
         logs = logs or {}
@@ -1057,8 +1059,8 @@ def _reset(self):
             warnings.warn('Learning rate reduction mode %s is unknown, '
                           'fallback to auto mode.' % self.mode)
             self.mode = 'auto'
-        if (self.mode == 'min' or
-            (self.mode == 'auto' and 'acc' not in self.monitor)):
+        if (self.mode == 'min'
+                or (self.mode == 'auto' and 'acc' not in self.monitor)):
             self.monitor_op = lambda a, b: np.less(a, b - self.min_delta)
             self.best = np.Inf
         else:
@@ -1085,8 +1087,8 @@ def on_eval_end(self, logs=None):
                     return
             except Exception as e:
                 warnings.warn(
-                    'There are something wrong when get learning_rate from optimizer: {}.'.
-                    format(e))
+                    'There are something wrong when get learning_rate from optimizer: {}.'
+                    .format(e))
                 return
 
         current = logs[self.monitor]
diff --git a/python/paddle/hapi/dynamic_flops.py b/python/paddle/hapi/dynamic_flops.py
index 4dd1aa03aa2ca..214af9f2f5986 100644
--- a/python/paddle/hapi/dynamic_flops.py
+++ b/python/paddle/hapi/dynamic_flops.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -106,11 +106,10 @@ def count_leaky_relu(m, x, y):
         _, net.forward = unwrap_decorators(net.forward)
 
         inputs = paddle.randn(input_size)
-        return dynamic_flops(
-            net,
-            inputs=inputs,
-            custom_ops=custom_ops,
-            print_detail=print_detail)
+        return dynamic_flops(net,
+                             inputs=inputs,
+                             custom_ops=custom_ops,
+                             print_detail=print_detail)
     elif isinstance(net, paddle.static.Program):
         return static_flops(net, print_detail=print_detail)
     else:
@@ -124,8 +123,8 @@ def count_convNd(m, x, y):
     x = x[0]
     kernel_ops = np.product(m.weight.shape[2:])
     bias_ops = 1 if m.bias is not None else 0
-    total_ops = int(y.numel()) * (
-        x.shape[1] / m._groups * kernel_ops + bias_ops)
+    total_ops = int(
+        y.numel()) * (x.shape[1] / m._groups * kernel_ops + bias_ops)
     m.total_ops += abs(int(total_ops))
 
 
@@ -227,8 +226,8 @@ def add_hooks(m):
         if m_type in custom_ops:
             flops_fn = custom_ops[m_type]
             if m_type not in types_collection:
-                print("Customize Function has been applied to {}".format(
-                    m_type))
+                print(
+                    "Customize Function has been applied to {}".format(m_type))
         elif m_type in register_hooks:
             flops_fn = register_hooks[m_type]
             if m_type not in types_collection:
@@ -236,8 +235,8 @@ def add_hooks(m):
         else:
             if m_type not in types_collection:
                 print(
-                    "Cannot find suitable count function for {}. Treat it as zero FLOPs.".
-                    format(m_type))
+                    "Cannot find suitable count function for {}. Treat it as zero FLOPs."
+                    .format(m_type))
 
         if flops_fn is not None:
             flops_handler = m.register_forward_post_hook(flops_fn)
@@ -280,8 +279,10 @@ def add_hooks(m):
         if {'total_ops', 'total_params', 'input_shape',
                 'output_shape'}.issubset(set(list(m._buffers.keys()))):
             table.add_row([
-                m.full_name(), list(m.input_shape.numpy()),
-                list(m.output_shape.numpy()), int(m.total_params),
+                m.full_name(),
+                list(m.input_shape.numpy()),
+                list(m.output_shape.numpy()),
+                int(m.total_params),
                 int(m.total_ops)
             ])
             m._buffers.pop("total_ops")
@@ -290,6 +291,6 @@ def add_hooks(m):
             m._buffers.pop('output_shape')
     if print_detail:
         table.print_table()
-    print('Total Flops: {}     Total Params: {}'.format(
-        int(total_ops), int(total_params)))
+    print('Total Flops: {}     Total Params: {}'.format(int(total_ops),
+                                                        int(total_params)))
     return int(total_ops)
diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py
index 6cb2aae5ae24a..3217059c647d0 100644
--- a/python/paddle/hapi/hub.py
+++ b/python/paddle/hapi/hub.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -163,8 +163,8 @@ def _check_dependencies(m):
             pkg for pkg in dependencies if not _check_module_exists(pkg)
         ]
         if len(missing_deps):
-            raise RuntimeError('Missing dependencies: {}'.format(', '.join(
-                missing_deps)))
+            raise RuntimeError('Missing dependencies: {}'.format(
+                ', '.join(missing_deps)))
 
 
 def list(repo_dir, source='github', force_reload=False):
@@ -194,12 +194,14 @@ def list(repo_dir, source='github', force_reload=False):
     """
     if source not in ('github', 'gitee', 'local'):
         raise ValueError(
-            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'.
-            format(source))
+            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'
+            .format(source))
 
     if source in ('github', 'gitee'):
-        repo_dir = _get_cache_or_reload(
-            repo_dir, force_reload, True, source=source)
+        repo_dir = _get_cache_or_reload(repo_dir,
+                                        force_reload,
+                                        True,
+                                        source=source)
 
     hub_module = _import_module(MODULE_HUBCONF.split('.')[0], repo_dir)
 
@@ -239,12 +241,14 @@ def help(repo_dir, model, source='github', force_reload=False):
     """
     if source not in ('github', 'gitee', 'local'):
         raise ValueError(
-            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'.
-            format(source))
+            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'
+            .format(source))
 
     if source in ('github', 'gitee'):
-        repo_dir = _get_cache_or_reload(
-            repo_dir, force_reload, True, source=source)
+        repo_dir = _get_cache_or_reload(repo_dir,
+                                        force_reload,
+                                        True,
+                                        source=source)
 
     hub_module = _import_module(MODULE_HUBCONF.split('.')[0], repo_dir)
 
@@ -280,12 +284,14 @@ def load(repo_dir, model, source='github', force_reload=False, **kwargs):
     """
     if source not in ('github', 'gitee', 'local'):
         raise ValueError(
-            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'.
-            format(source))
+            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'
+            .format(source))
 
     if source in ('github', 'gitee'):
-        repo_dir = _get_cache_or_reload(
-            repo_dir, force_reload, True, source=source)
+        repo_dir = _get_cache_or_reload(repo_dir,
+                                        force_reload,
+                                        True,
+                                        source=source)
 
     hub_module = _import_module(MODULE_HUBCONF.split('.')[0], repo_dir)
 
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index a7a5e59f39409..c78c89964c92e 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -104,8 +104,10 @@ def extract_args(func):
 
 
 def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
-    return collective._c_allgather(
-        x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
+    return collective._c_allgather(x,
+                                   nranks,
+                                   ring_id=ring_id,
+                                   use_calc_stream=use_calc_stream)
 
 
 def wait_server_ready(endpoints):
@@ -143,49 +145,45 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
             persistable=True,
             type=fluid.core.VarDesc.VarType.RAW)
 
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints
-            })
-
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': 0,
-            })
+        block.append_op(type='c_gen_nccl_id',
+                        inputs={},
+                        outputs={'Out': nccl_id_var},
+                        attrs={
+                            'rank': rank,
+                            'endpoint': current_endpoint,
+                            'other_endpoints': other_endpoints
+                        })
+
+        block.append_op(type='c_comm_init',
+                        inputs={'X': nccl_id_var},
+                        outputs={},
+                        attrs={
+                            'nranks': nranks,
+                            'rank': rank,
+                            'ring_id': 0,
+                        })
     elif core.is_compiled_with_npu():
         hccl_id_var = block.create_var(
             name=fluid.unique_name.generate('hccl_id'),
             persistable=True,
             type=core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_hccl_id',
-            inputs={},
-            outputs={'Out': hccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints
-            })
-        block.append_op(
-            type='c_comm_init_hccl',
-            inputs={'X': hccl_id_var},
-            outputs={},
-            attrs={
-                'rank': rank,
-                'ring_id': 0,
-                'device_id': int(os.getenv("FLAGS_selected_npus")),
-                'rank_ids': nranks
-            })
+        block.append_op(type='c_gen_hccl_id',
+                        inputs={},
+                        outputs={'Out': hccl_id_var},
+                        attrs={
+                            'rank': rank,
+                            'endpoint': current_endpoint,
+                            'other_endpoints': other_endpoints
+                        })
+        block.append_op(type='c_comm_init_hccl',
+                        inputs={'X': hccl_id_var},
+                        outputs={},
+                        attrs={
+                            'rank': rank,
+                            'ring_id': 0,
+                            'device_id': int(os.getenv("FLAGS_selected_npus")),
+                            'rank_ids': nranks
+                        })
 
 
 def prepare_distributed_context(place=None):
@@ -308,6 +306,7 @@ def parameters(self, *args, **kwargs):
         return self.model.network.parameters(*args, **kwargs)
 
     def save(self, path):
+
         def _save(state, path):
             if not state:
                 return
@@ -348,8 +347,8 @@ def load(self, param_state_pairs, optim_state):
 
         # restore parameter states
         fluid.core._create_loaded_parameter(
-            [param for param, state in param_state_pairs],
-            global_scope(), executor)
+            [param for param, state in param_state_pairs], global_scope(),
+            executor)
         for param, state in param_state_pairs:
             self._set_var(param, state)
 
@@ -396,25 +395,24 @@ def _load_optimizer(self, state, executor):
                     opt_cls_name = self.model._optimizer.__class__.__name__
                     opt_unq_name = None
                     for name in self.model._optimizer._accumulators.keys():
-                        accum_name = name if opt_name is None else name[len(
-                            opt_name) + 1:]
+                        accum_name = name if opt_name is None else name[
+                            len(opt_name) + 1:]
                         for param_name, state_var in self.model._optimizer._accumulators[
                                 name].items():
                             if opt_unq_name is None:
                                 # can not infer out the exact unique(opt_name),
                                 # thus try to extract rather than generate
-                                for state_key in sorted(
-                                        state.keys(),
-                                        key=lambda x: len(x),
-                                        reverse=True):
+                                for state_key in sorted(state.keys(),
+                                                        key=lambda x: len(x),
+                                                        reverse=True):
                                     prefix = param_name + "_" + (
                                         opt_cls_name
                                         if opt_name is None else opt_name) + "_"
                                     if state_key.startswith(prefix):
                                         prefix_offset = state_key[len(
                                             prefix):].find("_") + len(prefix)
-                                        opt_unq_name = state_key[len(
-                                            param_name + "_"):prefix_offset]
+                                        opt_unq_name = state_key[
+                                            len(param_name + "_"):prefix_offset]
                                         # TODO: assert
                                         # assert opt_unq_name is None
                                     # gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
@@ -601,8 +599,8 @@ def _make_program(self, mode):
                         self.model._optimizer, strategy=dist_strategy)
                 elif self._amp_level != "O0" and core.is_compiled_with_cuda:
                     amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
-                        **self.
-                        _amp_custom_lists) if self._amp_custom_lists else None
+                        **self._amp_custom_lists
+                    ) if self._amp_custom_lists else None
                     self.model._optimizer = paddle.static.amp.decorate(
                         self.model._optimizer,
                         amp_lists=amp_lists,
@@ -665,6 +663,7 @@ def _compile_and_initialize(self, prog, mode):
 
 
 class DynamicGraphAdapter(object):
+
     def __init__(self, model):
         super(DynamicGraphAdapter, self).__init__()
         self.model = model
@@ -716,10 +715,9 @@ def train_batch(self, inputs, labels=None, update=True):
         if self._amp_level != "O0" and self.model._scaler is None:
             self.model._scaler = paddle.amp.GradScaler(**self._amp_configs)
 
-        with paddle.amp.auto_cast(
-                enable=self._amp_level != 'O0',
-                **self._amp_custom_lists,
-                level=self._amp_level):
+        with paddle.amp.auto_cast(enable=self._amp_level != 'O0',
+                                  **self._amp_custom_lists,
+                                  level=self._amp_level):
             if self._nranks > 1:
                 outputs = self.ddp_model.forward(
                     *[to_variable(x) for x in inputs])
@@ -863,8 +861,9 @@ def load(self, param_state_pairs, optim_state, scaler_state=None):
         opt_cls_name = self.model._optimizer.__class__.__name__
         opt_name = opt_unq_name[:opt_unq_name.rfind("_")]  # remove suffix idx
         param_names = [param.name for param in self.model.network.parameters()]
-        for var_name, state_var in sorted(
-                optim_state.items(), key=lambda x: len(x[0]), reverse=True):
+        for var_name, state_var in sorted(optim_state.items(),
+                                          key=lambda x: len(x[0]),
+                                          reverse=True):
             if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
                 # NOTE: dygraph saved global_step is 1 larger than that in
                 # static-graph, since the time of global_step to increase is
@@ -1413,6 +1412,7 @@ def parameters(self, *args, **kwargs):
         return self._adapter.parameters()
 
     def _prepare_amp(self, amp_configs):
+
         def _check_pure_fp16_configs():
             # pure float16 training has some restricts now
             if self._adapter._amp_level == "O2" and self._optimizer._grad_clip:
@@ -1476,8 +1476,8 @@ def _check_amp_configs(amp_config_key_set):
             }
             if amp_config_key_set - accepted_param_set:
                 raise ValueError(
-                    "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".
-                    format(tuple(amp_config_key_set - accepted_param_set)))
+                    "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized."
+                    .format(tuple(amp_config_key_set - accepted_param_set)))
 
             if 'use_fp16_guard' in amp_config_key_set:
                 if _non_static_mode():
@@ -1492,7 +1492,10 @@ def _check_amp_configs(amp_config_key_set):
         for key in amp_configs_set:
             self._adapter._amp_configs[key] = amp_configs[key]
 
-    def prepare(self, optimizer=None, loss=None, metrics=None,
+    def prepare(self,
+                optimizer=None,
+                loss=None,
+                metrics=None,
                 amp_configs=None):
         """
         Configures the model before runing.
@@ -1716,29 +1719,26 @@ def fit(self,
                 "train_data must be given!"
 
         if isinstance(train_data, Dataset):
-            train_sampler = DistributedBatchSampler(
-                train_data,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                drop_last=drop_last)
-            train_loader = DataLoader(
-                train_data,
-                batch_sampler=train_sampler,
-                places=self._place,
-                num_workers=num_workers,
-                return_list=True)
+            train_sampler = DistributedBatchSampler(train_data,
+                                                    batch_size=batch_size,
+                                                    shuffle=shuffle,
+                                                    drop_last=drop_last)
+            train_loader = DataLoader(train_data,
+                                      batch_sampler=train_sampler,
+                                      places=self._place,
+                                      num_workers=num_workers,
+                                      return_list=True)
         else:
             train_loader = train_data
 
         if eval_data is not None and isinstance(eval_data, Dataset):
-            eval_sampler = DistributedBatchSampler(
-                eval_data, batch_size=batch_size)
-            eval_loader = DataLoader(
-                eval_data,
-                batch_sampler=eval_sampler,
-                places=self._place,
-                num_workers=num_workers,
-                return_list=True)
+            eval_sampler = DistributedBatchSampler(eval_data,
+                                                   batch_size=batch_size)
+            eval_loader = DataLoader(eval_data,
+                                     batch_sampler=eval_sampler,
+                                     places=self._place,
+                                     num_workers=num_workers,
+                                     return_list=True)
         elif eval_data is not None:
             eval_loader = eval_data
         else:
@@ -1765,7 +1765,8 @@ def fit(self,
             save_freq=save_freq,
             save_dir=save_dir,
             verbose=verbose,
-            metrics=self._metrics_name(), )
+            metrics=self._metrics_name(),
+        )
 
         if any(isinstance(k, EarlyStopping) for k in cbks) and not do_eval:
             warnings.warn("EarlyStopping needs validation data.")
@@ -1853,14 +1854,13 @@ def evaluate(self,
         """
 
         if eval_data is not None and isinstance(eval_data, Dataset):
-            eval_sampler = DistributedBatchSampler(
-                eval_data, batch_size=batch_size)
-            eval_loader = DataLoader(
-                eval_data,
-                batch_sampler=eval_sampler,
-                places=self._place,
-                num_workers=num_workers,
-                return_list=True)
+            eval_sampler = DistributedBatchSampler(eval_data,
+                                                   batch_size=batch_size)
+            eval_loader = DataLoader(eval_data,
+                                     batch_sampler=eval_sampler,
+                                     places=self._place,
+                                     num_workers=num_workers,
+                                     return_list=True)
         else:
             eval_loader = eval_data
 
@@ -1871,7 +1871,8 @@ def evaluate(self,
             model=self,
             log_freq=log_freq,
             verbose=verbose,
-            metrics=self._metrics_name(), )
+            metrics=self._metrics_name(),
+        )
 
         eval_steps = self._len_data_loader(eval_loader)
         self.num_iters = num_iters
@@ -1880,9 +1881,10 @@ def evaluate(self,
             assert num_iters > 0, "num_iters must be greater than 0!"
             eval_steps = min(num_iters, eval_steps)
             self.num_iters = eval_steps
-        cbks.on_begin('eval',
-                      {'steps': eval_steps,
-                       'metrics': self._metrics_name()})
+        cbks.on_begin('eval', {
+            'steps': eval_steps,
+            'metrics': self._metrics_name()
+        })
 
         logs = self._run_one_epoch(eval_loader, cbks, 'eval')
 
@@ -1972,14 +1974,13 @@ def __len__(self):
         """
 
         if test_data is not None and isinstance(test_data, Dataset):
-            test_sampler = DistributedBatchSampler(
-                test_data, batch_size=batch_size)
-            test_loader = DataLoader(
-                test_data,
-                batch_sampler=test_sampler,
-                places=self._place,
-                num_workers=num_workers,
-                return_list=True)
+            test_sampler = DistributedBatchSampler(test_data,
+                                                   batch_size=batch_size)
+            test_loader = DataLoader(test_data,
+                                     batch_sampler=test_sampler,
+                                     places=self._place,
+                                     num_workers=num_workers,
+                                     return_list=True)
         else:
             test_loader = test_data
 
@@ -2059,21 +2060,21 @@ def _save_inference_model(self, path):
             input_names = [v.name for v in self._adapter._input_vars['test']]
             endpoints = self._adapter._endpoints['test']['output']
 
-            fluid.io.save_inference_model(
-                model_path,
-                input_names,
-                endpoints,
-                self._adapter._executor,
-                main_program=infer_prog,
-                model_filename=model_filename,
-                params_filename=params_filename)
+            fluid.io.save_inference_model(model_path,
+                                          input_names,
+                                          endpoints,
+                                          self._adapter._executor,
+                                          main_program=infer_prog,
+                                          model_filename=model_filename,
+                                          params_filename=params_filename)
 
     def _run_one_epoch(
-            self,
-            data_loader,
-            callbacks,
-            mode,
-            logs={}, ):
+        self,
+        data_loader,
+        callbacks,
+        mode,
+        logs={},
+    ):
         outputs = []
         for step, data in enumerate(data_loader):
             # data might come from different types of data_loader and have
@@ -2091,16 +2092,16 @@ def _run_one_epoch(
             # LoDTensor.shape is callable, where LoDTensor comes from
             # DataLoader in static graph
 
-            batch_size = data[0].shape()[0] if callable(data[
-                0].shape) else data[0].shape[0]
+            batch_size = data[0].shape()[0] if callable(
+                data[0].shape) else data[0].shape[0]
 
             callbacks.on_batch_begin(mode, step, logs)
 
             if mode != 'predict':
                 _inputs = [data[:len(self._inputs)], data[len(self._inputs):]]
                 if mode == 'train':
-                    _inputs.append((step + 1) % self._accumulate == 0 or
-                                   step + 1 == len(data_loader))
+                    _inputs.append((step + 1) % self._accumulate == 0
+                                   or step + 1 == len(data_loader))
 
                 outs = getattr(self, mode + '_batch')(*_inputs)
 
@@ -2182,8 +2183,8 @@ def summary(self, input_size=None, dtype=None):
               print(params_info)
 
         """
-        assert (input_size is not None or self._inputs is not None
-                ), "'input_size' or 'self._input' must be set"
+        assert (input_size is not None or self._inputs
+                is not None), "'input_size' or 'self._input' must be set"
         if input_size is not None:
             _input_size = input_size
         else:
@@ -2203,8 +2204,7 @@ def _verify_spec(self, specs, shapes=None, dtypes=None, is_input=False):
                 if shapes is not None and dtypes is not None and fluid._non_static_mode(
                 ):
                     out_specs = [
-                        Input(
-                            name=n, dtype=dtypes[i], shape=shapes[i])
+                        Input(name=n, dtype=dtypes[i], shape=shapes[i])
                         for i, n in enumerate(arg_names)
                     ]
                 else:
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index c3c043bd3fc2b..6928bc75f5f71 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -207,8 +207,8 @@ def _check_shape(shape):
             elif isinstance(item, numbers.Number):
                 if item <= 0:
                     raise ValueError(
-                        "Expected element in input size greater than zero, but got {}".
-                        format(item))
+                        "Expected element in input size greater than zero, but got {}"
+                        .format(item))
             new_shape.append(item)
         return tuple(new_shape)
 
@@ -231,6 +231,7 @@ def _check_input(input_size):
 
 @paddle.no_grad()
 def summary_string(model, input_size=None, dtypes=None, input=None):
+
     def _all_is_numper(items):
         for item in items:
             if not isinstance(item, numbers.Number):
@@ -271,6 +272,7 @@ def _get_output_shape(output):
         return output_shape
 
     def register_hook(layer):
+
         def hook(layer, input, output):
             class_name = str(layer.__class__).split(".")[-1].split("'")[0]
 
@@ -319,9 +321,9 @@ def hook(layer, input, output):
 
             summary[m_key]["nb_params"] = params
 
-        if (not isinstance(layer, nn.Sequential) and
-                not isinstance(layer, nn.LayerList) and
-            (not (layer == model) or depth < 1)):
+        if (not isinstance(layer, nn.Sequential)
+                and not isinstance(layer, nn.LayerList)
+                and (not (layer == model) or depth < 1)):
 
             hooks.append(layer.register_forward_post_hook(hook))
         # For rnn, gru and lstm layer
@@ -416,15 +418,13 @@ def _get_str_length(summary):
             str(summary[layer]["input_shape"]),
             table_width['input_shape_width'],
             str(summary[layer]["output_shape"]),
-            table_width['output_shape_width'],
-            "{0:,}".format(summary[layer]["nb_params"]),
-            table_width['params_width'])
+            table_width['output_shape_width'], "{0:,}".format(
+                summary[layer]["nb_params"]), table_width['params_width'])
         total_params += summary[layer]["nb_params"]
 
         try:
             total_output += np.sum(
-                np.prod(
-                    summary[layer]["output_shape"], axis=-1))
+                np.prod(summary[layer]["output_shape"], axis=-1))
         except:
             for output_shape in summary[layer]["output_shape"]:
                 total_output += np.sum(np.prod(output_shape, axis=-1))
diff --git a/python/paddle/hapi/progressbar.py b/python/paddle/hapi/progressbar.py
index 8020029be2a4e..58dfdef604e7c 100644
--- a/python/paddle/hapi/progressbar.py
+++ b/python/paddle/hapi/progressbar.py
@@ -51,10 +51,11 @@ def __init__(self,
         self._last_update = 0
         self.name = name
 
-        self._dynamic_display = (
-            (hasattr(self.file, 'isatty') and
-             self.file.isatty()) or 'ipykernel' in sys.modules or
-            'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ)
+        self._dynamic_display = ((hasattr(self.file, 'isatty')
+                                  and self.file.isatty())
+                                 or 'ipykernel' in sys.modules
+                                 or 'posix' in sys.modules
+                                 or 'PYCHARM_HOSTED' in os.environ)
 
     def _get_max_width(self):
         if sys.version_info > (3, 3):
@@ -119,8 +120,8 @@ def convert_uint16_to_float(in_list):
             if self._num is not None:
                 numdigits = int(np.log10(self._num)) + 1
 
-                bar_chars = (self.name + ' %' + str(numdigits) + 'd/%d [') % (
-                    current_num, self._num)
+                bar_chars = (self.name + ' %' + str(numdigits) +
+                             'd/%d [') % (current_num, self._num)
                 prog = float(current_num) / self._num
                 prog_width = int(self._width * prog)
 
@@ -179,8 +180,8 @@ def convert_uint16_to_float(in_list):
         elif self._verbose == 2 or self._verbose == 3:
             if self._num:
                 numdigits = int(np.log10(self._num)) + 1
-                count = (self.name + ' %' + str(numdigits) + 'd/%d') % (
-                    current_num, self._num)
+                count = (self.name + ' %' + str(numdigits) +
+                         'd/%d') % (current_num, self._num)
             else:
                 count = self.name + ' %3d' % current_num
             info = count + info
diff --git a/python/paddle/hapi/static_flops.py b/python/paddle/hapi/static_flops.py
index f386bbd0dd6db..297199b7326a9 100644
--- a/python/paddle/hapi/static_flops.py
+++ b/python/paddle/hapi/static_flops.py
@@ -22,6 +22,7 @@
 
 
 class VarWrapper(object):
+
     def __init__(self, var, graph):
         assert isinstance(var, Variable)
         assert isinstance(graph, GraphWrapper)
@@ -42,6 +43,7 @@ def shape(self):
 
 
 class OpWrapper(object):
+
     def __init__(self, op, graph):
         assert isinstance(graph, GraphWrapper)
         self._op = op
@@ -212,6 +214,7 @@ def static_flops(program, print_detail=False):
 
 
 class Table(object):
+
     def __init__(self, table_heads):
         self.table_heads = table_heads
         self.table_len = []
@@ -225,8 +228,8 @@ def add_row(self, row_str):
             print('The row_str should be a list')
         if len(row_str) != self.col_num:
             print(
-                'The length of row data should be equal the length of table heads, but the data: {} is not equal table heads {}'.
-                format(len(row_str), self.col_num))
+                'The length of row data should be equal the length of table heads, but the data: {} is not equal table heads {}'
+                .format(len(row_str), self.col_num))
         for i in range(self.col_num):
             if len(str(row_str[i])) > self.table_len[i]:
                 self.table_len[i] = len(str(row_str[i]))
diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py
index c354baf3b43b7..c030cf5bbb9ee 100644
--- a/python/paddle/incubate/__init__.py
+++ b/python/paddle/incubate/__init__.py
@@ -30,6 +30,7 @@
 from .passes import fuse_resnet_unit_pass
 import paddle.incubate.autograd
 import paddle.incubate.autotune
+import paddle.incubate.sparse
 
 from . import nn  #noqa: F401
 from . import asp  #noqa: F401
diff --git a/python/paddle/incubate/asp/__init__.py b/python/paddle/incubate/asp/__init__.py
index 59f794ef28aa4..d2a56fd117c41 100644
--- a/python/paddle/incubate/asp/__init__.py
+++ b/python/paddle/incubate/asp/__init__.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,10 +19,7 @@
 from ...fluid.contrib.sparsity import set_excluded_layers  #noqa: F401
 from ...fluid.contrib.sparsity import reset_excluded_layers  #noqa: F401
 
-__all__ = [     #noqa
-    'calculate_density',
-    'decorate',
-    'prune_model',
-    'set_excluded_layers',
+__all__ = [  #noqa
+    'calculate_density', 'decorate', 'prune_model', 'set_excluded_layers',
     'reset_excluded_layers'
 ]
diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py
index a57dac02be4f5..718bc018d9fe5 100644
--- a/python/paddle/incubate/autograd/__init__.py
+++ b/python/paddle/incubate/autograd/__init__.py
@@ -16,12 +16,6 @@
 from .utils import enable_prim, disable_prim, prim_enabled
 
 __all__ = [  # noqa
-    'vjp',
-    'jvp',
-    'Jacobian',
-    'Hessian',
-    'prim2orig',
-    'enable_prim',
-    'disable_prim',
-    'prim_enabled'
+    'vjp', 'jvp', 'Jacobian', 'Hessian', 'prim2orig', 'enable_prim',
+    'disable_prim', 'prim_enabled'
 ]
diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py
index 11e0e51cb764c..6017ac3598920 100644
--- a/python/paddle/incubate/autograd/primops.py
+++ b/python/paddle/incubate/autograd/primops.py
@@ -33,9 +33,13 @@ def _simple_binop(helper):
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type=optype, inputs={'X': x,
-                             'Y': y}, outputs={'Z': out}, attrs={})
+    helper.append_op(type=optype,
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Z': out},
+                     attrs={})
     return out
 
 
@@ -51,8 +55,10 @@ def _manipulation_unop(helper):
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type=optype, inputs={'X': x}, outputs={'Y': out}, attrs=attrs)
+    helper.append_op(type=optype,
+                     inputs={'X': x},
+                     outputs={'Y': out},
+                     attrs=attrs)
     return out
 
 
@@ -75,12 +81,13 @@ def set_value(x, y, axis, starts, ends, strides, out):
     assert x is out, "x and out should be the same Tensor in set_value"
     attrs = {'axes': axis, 'starts': starts, 'ends': ends, 'steps': strides}
     helper = LayerHelper('set_value', **locals())
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'Input': x,
-                'ValueTensor': y},
-        outputs={'Out': out},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={
+                         'Input': x,
+                         'ValueTensor': y
+                     },
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -136,7 +143,8 @@ def split(x, num_or_sections, axis=0, outs=None):
     else:
         if not isinstance(num_or_sections, int):
             raise TypeError(
-                f'num_or_sections must be int, but got {type(num_or_sections)}.')
+                f'num_or_sections must be int, but got {type(num_or_sections)}.'
+            )
         n = num_or_sections
 
     attrs = {'num_or_sections': num_or_sections, 'axis': axis}
@@ -147,11 +155,10 @@ def split(x, num_or_sections, axis=0, outs=None):
             helper.create_variable_for_type_inference(dtype=x.dtype)
             for i in range(n)
         ]
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'X': x},
-        outputs={'YS': outs},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={'X': x},
+                     outputs={'YS': outs},
+                     attrs=attrs)
     return outs
 
 
@@ -163,11 +170,10 @@ def concat(xs, axis=0, out=None):
     helper = LayerHelper('concat_p', **locals())
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=xs[0].dtype)
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'XS': xs},
-        outputs={'Y': out},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={'XS': xs},
+                     outputs={'Y': out},
+                     attrs=attrs)
     return out
 
 
@@ -183,11 +189,10 @@ def reduce(x, axis, keepdim=False, out=None):
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'X': x},
-        outputs={'Y': out},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={'X': x},
+                     outputs={'Y': out},
+                     attrs=attrs)
     return out
 
 
@@ -217,11 +222,10 @@ def slice_select(x, axis, starts, ends, strides, out=None):
     helper = LayerHelper('slice_select_p', **locals())
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'X': x},
-        outputs={'Y': out},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={'X': x},
+                     outputs={'Y': out},
+                     attrs=attrs)
     return out
 
 
@@ -239,12 +243,13 @@ def slice_assign(x, y, axis, starts, ends, strides, out=None):
     helper = LayerHelper('slice_assign_p', **locals())
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Z': out},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Z': out},
+                     attrs=attrs)
     return out
 
 
@@ -254,12 +259,13 @@ def gather(x, indextensor, axis, out=None):
     helper = LayerHelper('gather_p', **locals())
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'X': x,
-                'IndexTensor': indextensor},
-        outputs={'Y': out},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={
+                         'X': x,
+                         'IndexTensor': indextensor
+                     },
+                     outputs={'Y': out},
+                     attrs=attrs)
     return out
 
 
@@ -279,11 +285,12 @@ def scatter_add(x, y, indextensor, axis, out=None):
     helper = LayerHelper('scatter_add_p', **locals())
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type=helper.layer_type,
-        inputs={'X': x,
-                'Y': y,
-                'IndexTensor': indextensor},
-        outputs={'Z': out},
-        attrs=attrs)
+    helper.append_op(type=helper.layer_type,
+                     inputs={
+                         'X': x,
+                         'Y': y,
+                         'IndexTensor': indextensor
+                     },
+                     outputs={'Z': out},
+                     attrs=attrs)
     return out
diff --git a/python/paddle/incubate/autograd/primreg.py b/python/paddle/incubate/autograd/primreg.py
index 35a0dbcfc293f..6c3ece09a6be1 100644
--- a/python/paddle/incubate/autograd/primreg.py
+++ b/python/paddle/incubate/autograd/primreg.py
@@ -186,6 +186,7 @@ def tanh_orig2prim(op):
         raise TypeError(f'op_type must be str, but got {type(op_type)}.')
 
     def wrapper(f):
+
         def _lower(op, *args, **kwargs):
             assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}'
             return f(op, *args, **kwargs)
@@ -217,6 +218,7 @@ def tanh_prim2orig(op):
         raise TypeError(f'op_type must be str, but got {type(op_type)}.')
 
     def wrapper(f):
+
         def _lower(op, *args, **kwargs):
             assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}'
             return f(op, *args, **kwargs)
@@ -247,6 +249,7 @@ def add_jvp(op, x_dot, y_dot):
         raise TypeError(f'op_type must be str, but got {type(op_type)}.')
 
     def wrapper(f):
+
         def _jvp(op, *args, **kwargs):
             assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}'
             return f(op, *args, **kwargs)
@@ -279,6 +282,7 @@ def add_transpose(op, z_bar):
         raise TypeError(f'op_type must be str, but got {type(op_type)}.')
 
     def wrapper(f):
+
         def _transpose(op, dot_checker, *args, **kwargs):
             assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}'
             return f(op, dot_checker, *args, **kwargs)
diff --git a/python/paddle/incubate/autograd/primrules.py b/python/paddle/incubate/autograd/primrules.py
index 075fe83e25289..7983032f1a124 100644
--- a/python/paddle/incubate/autograd/primrules.py
+++ b/python/paddle/incubate/autograd/primrules.py
@@ -79,17 +79,20 @@ def elementwise_add_orig2prim(op, x, y):
     if x.shape != y.shape:
         y = broadcast(y, shape=x.shape)
     if op.attr('Scale_x') - 1.0 > 1e-5:
-        scale_x = fill_const(
-            shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x'))
+        scale_x = fill_const(shape=x.shape,
+                             dtype=x.dtype,
+                             value=op.attr('Scale_x'))
         x = mul(x, scale_x)
     if op.attr('Scale_y') - 1.0 > 1e-5:
-        scale_y = fill_const(
-            shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y'))
+        scale_y = fill_const(shape=y.shape,
+                             dtype=y.dtype,
+                             value=op.attr('Scale_y'))
         y = mul(y, scale_y)
     z = add(x, y)
     if op.attr('Scale_out') - 1.0 > 1e-5:
-        scale_out = fill_const(
-            shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out'))
+        scale_out = fill_const(shape=z.shape,
+                               dtype=z.dtype,
+                               value=op.attr('Scale_out'))
         z = mul(z, scale_out)
     return z
 
@@ -99,17 +102,20 @@ def elementwise_sub_orig2prim(op, x, y):
     if x.shape != y.shape:
         y = broadcast(y, shape=x.shape)
     if op.attr('Scale_x') - 1.0 > 1e-5:
-        scale_x = fill_const(
-            shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x'))
+        scale_x = fill_const(shape=x.shape,
+                             dtype=x.dtype,
+                             value=op.attr('Scale_x'))
         x = mul(x, scale_x)
     if op.attr('Scale_y') - 1.0 > 1e-5:
-        scale_y = fill_const(
-            shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y'))
+        scale_y = fill_const(shape=y.shape,
+                             dtype=y.dtype,
+                             value=op.attr('Scale_y'))
         y = mul(y, scale_y)
     z = sub(x, y)
     if op.attr('Scale_out') - 1.0 > 1e-5:
-        scale_out = fill_const(
-            shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out'))
+        scale_out = fill_const(shape=z.shape,
+                               dtype=z.dtype,
+                               value=op.attr('Scale_out'))
         z = mul(z, scale_out)
     return z
 
@@ -119,17 +125,20 @@ def elementwise_mul_orig2prim(op, x, y):
     if x.shape != y.shape:
         y = broadcast(y, shape=x.shape)
     if op.attr('Scale_x') - 1.0 > 1e-5:
-        scale_x = fill_const(
-            shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x'))
+        scale_x = fill_const(shape=x.shape,
+                             dtype=x.dtype,
+                             value=op.attr('Scale_x'))
         x = mul(x, scale_x)
     if op.attr('Scale_y') - 1.0 > 1e-5:
-        scale_y = fill_const(
-            shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y'))
+        scale_y = fill_const(shape=y.shape,
+                             dtype=y.dtype,
+                             value=op.attr('Scale_y'))
         y = mul(y, scale_y)
     z = mul(x, y)
     if op.attr('Scale_out') - 1.0 > 1e-5:
-        scale_out = fill_const(
-            shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out'))
+        scale_out = fill_const(shape=z.shape,
+                               dtype=z.dtype,
+                               value=op.attr('Scale_out'))
         z = mul(z, scale_out)
     return z
 
@@ -160,8 +169,9 @@ def index_select_orig2prim(op, index_t, x):
 @REGISTER_ORIG2PRIM('scale')
 def scale_orig2prim(op, scale_t, x):
     if scale_t is None:
-        scale_t = fill_const(
-            shape=x.shape, dtype=x.dtype, value=op.attr('scale'))
+        scale_t = fill_const(shape=x.shape,
+                             dtype=x.dtype,
+                             value=op.attr('scale'))
     bias_t = fill_const(shape=x.shape, dtype=x.dtype, value=op.attr('bias'))
     if op.attr('bias_after_scale'):
         return add(mul(x, scale_t), bias_t)
@@ -182,6 +192,7 @@ def sqrt_orig2prim(op, x):
 
 @REGISTER_ORIG2PRIM('matmul_v2')
 def matmul_v2_orig2prim(op, x, y):
+
     def trans(shape):
         ret = [i for i in range(len(shape))]
         ret[-1], ret[-2] = ret[-2], ret[-1]
@@ -207,9 +218,9 @@ def reshape2_orig2prim(op, shape_t, shape_tl, x):
     assert shape_t is None, 'Can not lower reshape2 into prim ops with shapetensor.'
     assert shape_tl is None, 'Can not lower reshape2 into prim ops with shapetensorlist.'
     y, xshape = get_output_var_list(op)
-    return reshape(
-        x, shape=y.shape), fill_const(
-            shape=xshape.shape, dtype=xshape.dtype, value=0.0)
+    return reshape(x, shape=y.shape), fill_const(shape=xshape.shape,
+                                                 dtype=xshape.dtype,
+                                                 value=0.0)
 
 
 @REGISTER_ORIG2PRIM('concat')
@@ -236,6 +247,7 @@ def slice_orig2prim(op, ends_t, ends_tl, x, starts_t, starts_tl):
 
 @REGISTER_ORIG2PRIM('p_norm')
 def p_norm_orig2prim(op, x):
+
     def num_el(shape):
         n = 1
         for s in shape:
@@ -308,8 +320,9 @@ def split_prim2orig(op, x):
     num_or_sections = op.attr('num_or_sections')
     if len(num_or_sections) == 1:
         num_or_sections = num_or_sections[0]
-    return paddle.split(
-        x, num_or_sections=num_or_sections, axis=op.attr('axis'))
+    return paddle.split(x,
+                        num_or_sections=num_or_sections,
+                        axis=op.attr('axis'))
 
 
 @REGISTER_PRIM2ORIG('concat_p')
@@ -329,25 +342,23 @@ def matmul_prim2orig(op, x, y):
 
 @REGISTER_PRIM2ORIG('slice_select_p')
 def slice_select_prim2orig(op, x):
-    return paddle.strided_slice(
-        x,
-        axes=op.attr('axis'),
-        starts=op.attr('starts'),
-        ends=op.attr('ends'),
-        strides=op.attr('strides'))
+    return paddle.strided_slice(x,
+                                axes=op.attr('axis'),
+                                starts=op.attr('starts'),
+                                ends=op.attr('ends'),
+                                strides=op.attr('strides'))
 
 
 @REGISTER_PRIM2ORIG('slice_assign_p')
 def slice_assign_prim2orig(op, x, y):
     x_copy = paddle.assign(x)
-    return set_value(
-        x_copy,
-        y,
-        axis=op.attr('axis'),
-        starts=op.attr('starts'),
-        ends=op.attr('ends'),
-        strides=op.attr('strides'),
-        out=x_copy)
+    return set_value(x_copy,
+                     y,
+                     axis=op.attr('axis'),
+                     starts=op.attr('starts'),
+                     ends=op.attr('ends'),
+                     strides=op.attr('strides'),
+                     out=x_copy)
 
 
 @REGISTER_PRIM2ORIG('gather_p')
@@ -365,10 +376,9 @@ def scatter_add_prim2orig(op, index_t, x, y):
 
 @REGISTER_PRIM2ORIG('fill_constant_p')
 def fill_constant_prim2orig(op):
-    return paddle.full(
-        shape=op.attr('shape'),
-        fill_value=op.attr('value'),
-        dtype=INT_DTYPE_2_STRING[op.attr('dtype')])
+    return paddle.full(shape=op.attr('shape'),
+                       fill_value=op.attr('value'),
+                       dtype=INT_DTYPE_2_STRING[op.attr('dtype')])
 
 
 ## Register linearize rules
@@ -515,8 +525,12 @@ def slice_select_jvp(op, x_dot):
     starts = op.attr('starts')
     ends = op.attr('ends')
     strides = op.attr('strides')
-    return linear_jvp(
-        op, x_dot, axis=axis, starts=starts, ends=ends, strides=strides)
+    return linear_jvp(op,
+                      x_dot,
+                      axis=axis,
+                      starts=starts,
+                      ends=ends,
+                      strides=strides)
 
 
 @REGISTER_JVP('slice_assign_p')
@@ -530,8 +544,13 @@ def slice_assign_jvp(op, x_dot, y_dot):
     starts = op.attr('starts')
     ends = op.attr('ends')
     strides = op.attr('strides')
-    return linear_jvp(
-        op, x_dot, y_dot, axis=axis, starts=starts, ends=ends, strides=strides)
+    return linear_jvp(op,
+                      x_dot,
+                      y_dot,
+                      axis=axis,
+                      starts=starts,
+                      ends=ends,
+                      strides=strides)
 
 
 @REGISTER_JVP('gather_p')
@@ -677,8 +696,12 @@ def slice_select_transpose(op, check_dot, y_bar):
     starts = op.attr('starts')
     ends = op.attr('ends')
     strides = op.attr('strides')
-    return slice_assign(
-        zeros, y_bar, axis=axis, starts=starts, ends=ends, strides=strides)
+    return slice_assign(zeros,
+                        y_bar,
+                        axis=axis,
+                        starts=starts,
+                        ends=ends,
+                        strides=strides)
 
 
 @REGISTER_TRANSPOSE('slice_assign_p')
@@ -692,10 +715,17 @@ def slice_assign_transpose(op, check_dot, z_bar):
     starts = op.attr('starts')
     ends = op.attr('ends')
     strides = op.attr('strides')
-    x_bar = slice_assign(
-        z_bar, zeros, axis=axis, starts=starts, ends=ends, strides=strides)
-    y_bar = slice_select(
-        z_bar, axis=axis, starts=starts, ends=ends, strides=strides)
+    x_bar = slice_assign(z_bar,
+                         zeros,
+                         axis=axis,
+                         starts=starts,
+                         ends=ends,
+                         strides=strides)
+    y_bar = slice_select(z_bar,
+                         axis=axis,
+                         starts=starts,
+                         ends=ends,
+                         strides=strides)
     return x_bar, y_bar
 
 
diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py
index 1f5c4f9a5cebb..5ee45116e66d8 100644
--- a/python/paddle/incubate/autograd/primx.py
+++ b/python/paddle/incubate/autograd/primx.py
@@ -51,7 +51,9 @@ def topo_path(xs, ys, block=None):
         reached_vars[id(x)] = x
 
     # Reaching test, returning whether an op is reached from the given input
-    reaching = lambda op: any(id(v) in reached_vars for v in flatten_and_remove_none(get_input_var_list(op)))
+    reaching = lambda op: any(
+        id(v) in reached_vars
+        for v in flatten_and_remove_none(get_input_var_list(op)))
 
     # block.ops are supposedly in the order that preserves correct data
     # dependence.
@@ -63,7 +65,9 @@ def topo_path(xs, ys, block=None):
                 reached_vars[id(var)] = var
 
     used_vars = OrderedDict((id(y), y) for y in ys if id(y) in reached_vars)
-    back_reaching = lambda op: any(id(out) in used_vars for out in flatten_and_remove_none(get_output_var_list(op)))
+    back_reaching = lambda op: any(
+        id(out) in used_vars
+        for out in flatten_and_remove_none(get_output_var_list(op)))
 
     # Backward pass to find all used variables
     for op in reversed(path):
@@ -276,7 +280,7 @@ def linearize(self, xs, ys, xs_dot=None):
             self.var2dot.delete(x)
 
         for op in path:
-            # An input var may not be on the input-output path, which implies 
+            # An input var may not be on the input-output path, which implies
             # there may be None's in `ins_dot`. In this case we place
             # the original input in the position of the otherwise forward
             # gradient.
@@ -476,13 +480,12 @@ def expand_nested_list(xs):
             from paddle.fluid.dygraph.base import param_guard
             new_op_desc = block.desc.append_op()
             with param_guard(inputs), param_guard(outputs):
-                op = Operator(
-                    block=block,
-                    desc=new_op_desc,
-                    type=op.type,
-                    inputs=inputs,
-                    outputs=outputs,
-                    attrs=attrs)
+                op = Operator(block=block,
+                              desc=new_op_desc,
+                              type=op.type,
+                              inputs=inputs,
+                              outputs=outputs,
+                              attrs=attrs)
             block.ops.append(op)
 
     # Step3: Do some post-processing work
@@ -594,7 +597,7 @@ def _gradients(ys, xs, ys_bar=None):
         assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program'
     # TODO(Tongxin) without any prior knowledge about whether the program
     # is completely lowered to primitive ops, it's mandatory to run the lowering
-    # pass once and again. This is obviously inefficient and needs to be 
+    # pass once and again. This is obviously inefficient and needs to be
     # optimized.
     orig2prim(block)
 
diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py
index ec4f0915ba34f..44bbd32bc9c32 100644
--- a/python/paddle/incubate/autograd/utils.py
+++ b/python/paddle/incubate/autograd/utils.py
@@ -17,6 +17,7 @@
 
 
 class PrimOption(object):
+
     def __init__(self):
         self.enable_prim = False
 
diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py
index 7ac555e2520ea..db7f881e4cf68 100644
--- a/python/paddle/incubate/autotune.py
+++ b/python/paddle/incubate/autotune.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/incubate/distributed/models/moe/__init__.py b/python/paddle/incubate/distributed/models/moe/__init__.py
index fd06b4b8e5287..795c939e81fbb 100644
--- a/python/paddle/incubate/distributed/models/moe/__init__.py
+++ b/python/paddle/incubate/distributed/models/moe/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,4 +15,5 @@
 from .gate import GShardGate, BaseGate, SwitchGate, NaiveGate
 from .moe_layer import MoELayer
 from .grad_clip import ClipGradForMOEByGlobalNorm
+
 ClipGradByGlobalNorm = ClipGradForMOEByGlobalNorm
diff --git a/python/paddle/incubate/distributed/models/moe/gate/__init__.py b/python/paddle/incubate/distributed/models/moe/gate/__init__.py
index d4bf666eb698e..2bfa5cd62cd49 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/__init__.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
index f527e82f043c7..9715f4b2a25a6 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
@@ -1,17 +1,17 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# 
+#
 # The file has been adapted from the file:
 #     https://github.com/laekov/fastmoe/blob/master/fmoe/gates/base_gate.py
 #     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4
@@ -23,6 +23,7 @@
 
 
 class BaseGate(nn.Layer):
+
     def __init__(self, num_expert, world_size):
         super().__init__()
         self.world_size = world_size
diff --git a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
index 3618ec56e96c9..643e23feff164 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -28,6 +28,7 @@
 
 
 class GShardGate(NaiveGate):
+
     def __init__(self,
                  d_model,
                  num_expert,
@@ -43,32 +44,29 @@ def __init__(self,
         self.group = group
 
     def forward(self, x):
-        topk_val, topk_idx, gate_score = super().forward(
-            x, return_all_scores=True)
+        topk_val, topk_idx, gate_score = super().forward(x,
+                                                         return_all_scores=True)
         s = gate_score.shape[0]
         top1_idx = topk_idx.flatten()
-        c_e = paddle.scatter(
-            paddle.zeros(shape=[self.tot_expert]),
-            top1_idx,
-            paddle.ones_like(
-                top1_idx, dtype="float32"),
-            overwrite=False) / s
+        c_e = paddle.scatter(paddle.zeros(shape=[self.tot_expert]),
+                             top1_idx,
+                             paddle.ones_like(top1_idx, dtype="float32"),
+                             overwrite=False) / s
         m_e = paddle.mean(F.softmax(gate_score, axis=1), axis=0)
         loss = paddle.mean(c_e * m_e) * (self.num_expert**2)
         self.set_loss(loss)
 
         cap_rate = self.capacity[0 if self.training else 1]
         capacity = math.ceil(cap_rate * x.shape[0])
-        _new_lec, _new_gec, topk_idx = limit_by_capacity(
-            topk_idx,
-            self.num_expert,
-            self.world_size,
-            capacity,
-            group=self.group)
+        _new_lec, _new_gec, topk_idx = limit_by_capacity(topk_idx,
+                                                         self.num_expert,
+                                                         self.world_size,
+                                                         capacity,
+                                                         group=self.group)
 
         if self.random_routing:
-            rand_routing_prob = paddle.rand(
-                shape=[gate_score.shape[0]], dtype="float32")
+            rand_routing_prob = paddle.rand(shape=[gate_score.shape[0]],
+                                            dtype="float32")
             topk_idx = paddle.distributed.models.moe.utils._random_routing(
                 topk_idx, topk_val, rand_routing_prob)
         return topk_val, topk_idx
diff --git a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
index c3c68685445c8..476f99b9f4431 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
@@ -3,9 +3,9 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,6 +27,7 @@
 
 
 class NaiveGate(BaseGate):
+
     def __init__(self, d_model, num_expert, world_size, topk=2):
         super().__init__(num_expert, world_size)
         self.gate = nn.Linear(d_model, self.tot_expert)
@@ -36,8 +37,11 @@ def __init__(self, d_model, num_expert, world_size, topk=2):
 
     def forward(self, inp, return_all_scores=False):
         gate = self.gate(inp)
-        gate_top_k_val, gate_top_k_idx = paddle.topk(
-            gate, k=self.top_k, axis=-1, largest=True, sorted=False)
+        gate_top_k_val, gate_top_k_idx = paddle.topk(gate,
+                                                     k=self.top_k,
+                                                     axis=-1,
+                                                     largest=True,
+                                                     sorted=False)
 
         if return_all_scores:
             return gate_top_k_val, gate_top_k_idx, gate
diff --git a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
index 776516989e5a1..604751985406a 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
@@ -3,9 +3,9 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -28,6 +28,7 @@
 
 
 class SwitchGate(NaiveGate):
+
     def __init__(self,
                  d_model,
                  num_expert,
@@ -55,20 +56,19 @@ def forward(self, inp):
 
         cap_rate = self.capacity[0 if self.training else 1]
         capacity = math.ceil(cap_rate * inp.shape[0])
-        _new_lec, _new_gec, top1_idx = limit_by_capacity(
-            top1_idx,
-            self.num_expert,
-            self.world_size,
-            capacity,
-            group=self.group)
+        _new_lec, _new_gec, top1_idx = limit_by_capacity(top1_idx,
+                                                         self.num_expert,
+                                                         self.world_size,
+                                                         capacity,
+                                                         group=self.group)
         valid_idx = top1_idx[top1_idx > -1]
         valid_idx_tmp = paddle.reshape(valid_idx, shape=[len(valid_idx), 1])
         fraction_expert = paddle.scatter_nd_add(
             x=paddle.zeros(shape=[self.tot_expert]),
             index=valid_idx_tmp,
-            updates=paddle.ones_like(
-                valid_idx, dtype=paddle.float32).reshape(
-                    shape=[len(valid_idx)]), ) / valid_idx.numel()
+            updates=paddle.ones_like(valid_idx, dtype=paddle.float32).reshape(
+                shape=[len(valid_idx)]),
+        ) / valid_idx.numel()
         prob_expert = score.sum(axis=0) / valid_idx.numel()
         loss = (fraction_expert * prob_expert).sum() * self.tot_expert
         self.set_loss(loss)
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index cf56f74d1f12d..83e491a08745b 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -178,10 +178,9 @@ def _dygraph_clip(self, params_grads):
             global_norm_var_moe, _ \
                 = self.get_l2_norm_pow(moe_params_grads, sum_dtype)
             if global_norm_var_moe is not None:
-                collective.all_reduce(
-                    global_norm_var_moe,
-                    op=collective.ReduceOp.SUM,
-                    group=self.moe_group)
+                collective.all_reduce(global_norm_var_moe,
+                                      op=collective.ReduceOp.SUM,
+                                      group=self.moe_group)
 
         if global_norm_var_normal is None and global_norm_var_moe is None:
             return params_grads
@@ -199,12 +198,13 @@ def _dygraph_clip(self, params_grads):
 
         params_and_grads = []
         global_norm_var = layers.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
-        clip_var = layers.elementwise_div(
-            x=max_global_norm,
-            y=layers.elementwise_max(
-                x=global_norm_var, y=max_global_norm))
+        max_global_norm = layers.fill_constant(shape=[1],
+                                               dtype=global_norm_var.dtype,
+                                               value=self.clip_norm)
+        clip_var = layers.elementwise_div(x=max_global_norm,
+                                          y=layers.elementwise_max(
+                                              x=global_norm_var,
+                                              y=max_global_norm))
         for p, g in params_grads:
             if g is None:
                 continue
diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py
index ba22ffee3e4d6..367b2c189e3ad 100644
--- a/python/paddle/incubate/distributed/models/moe/moe_layer.py
+++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -31,11 +31,12 @@
 
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.distributed import fleet
-from paddle.autograd import PyLayer
+from paddle.autograd import PyLayer, EagerPyLayer
 from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate
 from .utils import count_by_gate
 from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute
 from paddle import fluid
+from paddle.fluid.framework import in_dygraph_mode
 
 
 def _local_scatter(inp, pos):
@@ -50,12 +51,11 @@ def _local_gather(inp, pos, out_batch_size, maybe_overlap=True):
     if pos.shape != [0]:
         origin_dtype = inp.dtype
         inp = paddle.cast(inp, dtype="float32")
-        inp_buf = paddle.scatter(
-            paddle.zeros(
-                shape=[out_batch_size, inp.shape[-1]], dtype="float32"),
-            pos,
-            inp,
-            overwrite=True)
+        inp_buf = paddle.scatter(paddle.zeros(
+            shape=[out_batch_size, inp.shape[-1]], dtype="float32"),
+                                 pos,
+                                 inp,
+                                 overwrite=True)
         inp_buf = paddle.cast(inp_buf, dtype=origin_dtype)
     else:
         inp_buf = paddle.zeros([out_batch_size, inp.shape[-1]], dtype=inp.dtype)
@@ -63,17 +63,26 @@ def _local_gather(inp, pos, out_batch_size, maybe_overlap=True):
 
 
 def _all_gather(tensor, group=None, use_calc_stream=True):
-    """
-    The main difference with paddle.distributed.all_gather: 
-    no need to pass in tensor_list, the returned tensor is spliced
-    """
     if group is not None and not group.is_member():
         return
-    ring_id = 0 if group is None else group.id
-    nranks = paddle.distributed.collective._get_global_group(
-    ).nranks if group is None else group.nranks
-    return paddle._C_ops.c_allgather(tensor, 'use_calc_stream', use_calc_stream,
-                                     'ring_id', ring_id, 'nranks', nranks)
+
+    if in_dygraph_mode():
+        group = paddle.distributed.collective._get_default_group(
+        ) if group is None else group
+        tensor_shape = list(tensor.shape)
+        tensor_shape[0] *= group.nranks
+        out = paddle.empty(tensor_shape, tensor.dtype)
+
+        task = group.process_group.all_gather(tensor, out)
+        task.wait()
+        return out
+    else:
+        ring_id = 0 if group is None else group.id
+        nranks = paddle.distributed.collective._get_global_group(
+        ).nranks if group is None else group.nranks
+        return paddle._C_ops.c_allgather(tensor, 'use_calc_stream',
+                                         use_calc_stream, 'ring_id', ring_id,
+                                         'nranks', nranks)
 
 
 class MoEScatter(PyLayer):
@@ -94,11 +103,57 @@ def forward(ctx,
                 group=None):
         local_input_buf = _local_scatter(inp, pos)
         if world_size > 1:
-            global_input_buf = global_scatter(
-                local_input_buf,
+            global_input_buf = global_scatter(local_input_buf,
+                                              local_expert_count,
+                                              global_expert_count,
+                                              group=group)
+        else:
+            global_input_buf = local_input_buf
+
+        ctx.moe_args = inp.shape[0], world_size, group
+
+        variables = (pos, local_expert_count, global_expert_count)
+        ctx.save_for_backward(*variables)
+        return global_input_buf
+
+    @staticmethod
+    def backward(ctx, grad):
+        (pos, local_expert_count, global_expert_count) = ctx.saved_tensor()
+        (inp_batch_size, world_size, group) = ctx.moe_args
+
+        if world_size > 1:
+            local_grad_in = global_gather(grad,
+                                          local_expert_count,
+                                          global_expert_count,
+                                          group=group)
+        else:
+            local_grad_in = grad
+        grad_in = _local_gather(local_grad_in, pos, inp_batch_size)
+        return grad_in, None, None, None
+
+
+class EagerMoEScatter(EagerPyLayer):
+    r"""
+    Scatter input samples from [batch x sequences] to contiguous along experts.
+    If `world_size` is greater than 1, the samples will first be locally
+    scattered, and then exchanged across workers.
+    """
+
+    @staticmethod
+    def forward(ctx,
+                inp,
+                pos,
                 local_expert_count,
                 global_expert_count,
-                group=group)
+                fwd_batch_size,
+                world_size,
+                group=None):
+        local_input_buf = _local_scatter(inp, pos)
+        if world_size > 1:
+            global_input_buf = global_scatter(local_input_buf,
+                                              local_expert_count,
+                                              global_expert_count,
+                                              group=group)
         else:
             global_input_buf = local_input_buf
 
@@ -114,8 +169,10 @@ def backward(ctx, grad):
         (inp_batch_size, world_size, group) = ctx.moe_args
 
         if world_size > 1:
-            local_grad_in = global_gather(
-                grad, local_expert_count, global_expert_count, group=group)
+            local_grad_in = global_gather(grad,
+                                          local_expert_count,
+                                          global_expert_count,
+                                          group=group)
         else:
             local_grad_in = grad
         grad_in = _local_gather(local_grad_in, pos, inp_batch_size)
@@ -138,15 +195,63 @@ def forward(ctx,
                 world_size,
                 group=None):
         if world_size > 1:
-            local_output_buf = global_gather(
+            local_output_buf = global_gather(global_output_buf,
+                                             local_expert_count,
+                                             global_expert_count,
+                                             group=group)
+        else:
+            local_output_buf = global_output_buf
+        output = _local_gather(local_output_buf,
+                               pos,
+                               local_batch_size,
+                               maybe_overlap=False)
+
+        ctx.moe_args = (global_output_buf.shape[0], world_size, group)
+        variables = (pos, local_expert_count, global_expert_count)
+        ctx.save_for_backward(*variables)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        pos, local_expert_count, global_expert_count = ctx.saved_tensor()
+        fwd_batch_size, world_size, group = ctx.moe_args
+        grad_out_buf = _local_scatter(grad_out, pos)
+        if world_size > 1:
+            global_grad_out_buf = global_scatter(grad_out_buf,
+                                                 local_expert_count,
+                                                 global_expert_count,
+                                                 group=group)
+        else:
+            global_grad_out_buf = grad_out_buf
+        return global_grad_out_buf, None, None, None
+
+
+class EagerMoEGather(EagerPyLayer):
+    r"""
+    Gather output samples from contiguous along experts back to [batch x
+    sequences]. Works symmetrically with MoEScatter.
+    """
+
+    @staticmethod
+    def forward(ctx,
                 global_output_buf,
+                pos,
                 local_expert_count,
                 global_expert_count,
-                group=group)
+                local_batch_size,
+                world_size,
+                group=None):
+        if world_size > 1:
+            local_output_buf = global_gather(global_output_buf,
+                                             local_expert_count,
+                                             global_expert_count,
+                                             group=group)
         else:
             local_output_buf = global_output_buf
-        output = _local_gather(
-            local_output_buf, pos, local_batch_size, maybe_overlap=False)
+        output = _local_gather(local_output_buf,
+                               pos,
+                               local_batch_size,
+                               maybe_overlap=False)
 
         ctx.moe_args = (global_output_buf.shape[0], world_size, group)
         variables = (pos, local_expert_count, global_expert_count)
@@ -159,11 +264,10 @@ def backward(ctx, grad_out):
         fwd_batch_size, world_size, group = ctx.moe_args
         grad_out_buf = _local_scatter(grad_out, pos)
         if world_size > 1:
-            global_grad_out_buf = global_scatter(
-                grad_out_buf,
-                local_expert_count,
-                global_expert_count,
-                group=group)
+            global_grad_out_buf = global_scatter(grad_out_buf,
+                                                 local_expert_count,
+                                                 global_expert_count,
+                                                 group=group)
         else:
             global_grad_out_buf = grad_out_buf
         return global_grad_out_buf, None, None, None
@@ -185,8 +289,32 @@ def forward(ctx, inp, rank, world_size, group):
     @staticmethod
     def backward(ctx, grad_out):
         rank, dim0 = ctx.args
-        return paddle.slice(
-            grad_out, axes=[0], starts=[rank * dim0], ends=[(rank + 1) * dim0])
+        return paddle.slice(grad_out,
+                            axes=[0],
+                            starts=[rank * dim0],
+                            ends=[(rank + 1) * dim0])
+
+
+class EagerAllGather(EagerPyLayer):
+    r"""
+    A wrapper for the All-Gather function to support auto-differentiation.
+    """
+
+    @staticmethod
+    def forward(ctx, inp, rank, world_size, group):
+        tensor_list = []
+        paddle.distributed.all_gather(tensor_list, inp, group=group)
+        output = paddle.concat(tensor_list, axis=0)
+        ctx.args = rank, inp.shape[0]
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        rank, dim0 = ctx.args
+        return paddle.slice(grad_out,
+                            axes=[0],
+                            starts=[rank * dim0],
+                            ends=[(rank + 1) * dim0])
 
 
 class Slice(PyLayer):
@@ -200,19 +328,41 @@ def forward(ctx, inp, rank, world_size, group):
         local_batch_size = B // world_size
         batch_start = local_batch_size * rank
         batch_end = min(batch_start + local_batch_size, B)
-        inp = paddle.slice(
-            inp, axes=[0], starts=[batch_start], ends=[batch_end])
+        inp = paddle.slice(inp,
+                           axes=[0],
+                           starts=[batch_start],
+                           ends=[batch_end])
+        ctx.args = world_size, group
+        return inp
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        world_size, group = ctx.args
+        return _all_gather(grad_out, group=group)
+
+
+class EagerSlice(EagerPyLayer):
+    r"""
+    A wrapper for the Slice function to support auto-differentiation.
+    """
+
+    @staticmethod
+    def forward(ctx, inp, rank, world_size, group):
+        B = inp.shape[0]
+        local_batch_size = B // world_size
+        batch_start = local_batch_size * rank
+        batch_end = min(batch_start + local_batch_size, B)
+        inp = paddle.slice(inp,
+                           axes=[0],
+                           starts=[batch_start],
+                           ends=[batch_end])
         ctx.args = world_size, group
         return inp
 
     @staticmethod
     def backward(ctx, grad_out):
         world_size, group = ctx.args
-        # tensor_list = []
-        # paddle.distributed.all_gather(tensor_list, grad_out, group=group)
-        # grad_out = paddle.concat(tensor_list, axis=0)
         return _all_gather(grad_out, group=group)
-        # return grad_out
 
 
 def prepare_forward(gate, num_expert, world_size, moe_group):
@@ -227,7 +377,8 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
         local_expert_count,
         global_expert_count,
         fwd_expert_count,
-        fwd_batch_size, )
+        fwd_batch_size,
+    )
 
 
 class MoELayer(nn.Layer):
@@ -326,25 +477,22 @@ def __init__(self,
             self.top_k = gate.get("top_k", 2)
             gate = gate.get("type", "gshard")
             if gate == "naive" or gate is None:
-                gate = NaiveGate(
-                    self.d_model,
-                    num_expert=len(experts),
-                    world_size=self.world_size,
-                    topk=self.top_k)
+                gate = NaiveGate(self.d_model,
+                                 num_expert=len(experts),
+                                 world_size=self.world_size,
+                                 topk=self.top_k)
             elif gate == "gshard":
-                gate = GShardGate(
-                    self.d_model,
-                    num_expert=len(experts),
-                    world_size=self.world_size,
-                    topk=self.top_k,
-                    group=self.group)
+                gate = GShardGate(self.d_model,
+                                  num_expert=len(experts),
+                                  world_size=self.world_size,
+                                  topk=self.top_k,
+                                  group=self.group)
             elif gate == "switch":
-                gate = SwitchGate(
-                    self.d_model,
-                    num_expert=len(experts),
-                    world_size=self.world_size,
-                    topk=self.top_k,
-                    group=self.group)
+                gate = SwitchGate(self.d_model,
+                                  num_expert=len(experts),
+                                  world_size=self.world_size,
+                                  topk=self.top_k,
+                                  group=self.group)
             else:
                 assert False, "We only support naive gate, \
                                 gshard gate and switch gate, \
@@ -369,7 +517,10 @@ def forward(self, inp):
             mp_rank = self.mp_group.rank
             mp_size = self.mp_group.nranks
         if mp_size > 1:
-            inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group)
+            if in_dygraph_mode():
+                inp = EagerSlice.apply(inp, mp_rank, mp_size, self.mp_group)
+            else:
+                inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group)
         value, gate = self.gate(inp)
 
         (
@@ -377,8 +528,8 @@ def forward(self, inp):
             local_expert_count,
             global_expert_count,
             fwd_expert_count,
-            fwd_batch_size, ) = prepare_forward(gate, self.num_expert,
-                                                self.world_size, self.group)
+            fwd_batch_size,
+        ) = prepare_forward(gate, self.num_expert, self.world_size, self.group)
 
         topk = 1
         if len(gate.shape) == 2:
@@ -390,9 +541,14 @@ def forward(self, inp):
             temp_pos = pos
         assert topk == self.top_k
 
-        x = MoEScatter.apply(inp, temp_pos, local_expert_count,
-                             global_expert_count, fwd_batch_size,
-                             self.world_size, self.group)
+        if in_dygraph_mode():
+            x = EagerMoEScatter.apply(inp, temp_pos, local_expert_count,
+                                      global_expert_count, fwd_batch_size,
+                                      self.world_size, self.group)
+        else:
+            x = MoEScatter.apply(inp, temp_pos, local_expert_count,
+                                 global_expert_count, fwd_batch_size,
+                                 self.world_size, self.group)
 
         d_model = self.d_model
 
@@ -414,22 +570,30 @@ def experts_fwd(x, fwd_expert_count, experts):
         if self.recompute_interval <= 0 or x.shape[0] == 0:
             x = experts_fwd(x, fwd_expert_count.numpy(), self.experts)
         else:
-            x = _hp_recompute(experts_fwd, x,
-                              fwd_expert_count.numpy(), self.experts)
+            x = _hp_recompute(experts_fwd, x, fwd_expert_count.numpy(),
+                              self.experts)
 
         out_batch_size = inp.shape[0]
         if len(gate.shape) == 2:
             out_batch_size *= gate.shape[1]
 
-        x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
-                            out_batch_size, self.world_size, self.group)
+        if in_dygraph_mode():
+            x = EagerMoEGather.apply(x, pos, local_expert_count,
+                                     global_expert_count, out_batch_size,
+                                     self.world_size, self.group)
+        else:
+            x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
+                                out_batch_size, self.world_size, self.group)
 
         x = x.reshape([-1, self.top_k, d_model])
         value = value.reshape([x.shape[0], 1, self.top_k])
         x = paddle.bmm(value, x).reshape([-1, d_model])
 
         if mp_size > 1:
-            x = AllGather.apply(x, mp_rank, mp_size, self.mp_group)
+            if in_dygraph_mode():
+                x = EagerAllGather.apply(x, mp_rank, mp_size, self.mp_group)
+            else:
+                x = AllGather.apply(x, mp_rank, mp_size, self.mp_group)
 
         x = paddle.reshape_(x, origin_shape)
 
diff --git a/python/paddle/incubate/distributed/models/moe/utils.py b/python/paddle/incubate/distributed/models/moe/utils.py
index 25c76c9753035..b195ffdb815e2 100644
--- a/python/paddle/incubate/distributed/models/moe/utils.py
+++ b/python/paddle/incubate/distributed/models/moe/utils.py
@@ -3,9 +3,9 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,15 +21,24 @@
 
 from paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos
 import paddle
+from paddle.fluid.framework import in_dygraph_mode
 
 
 def _alltoall(in_tensor_list, group=None, use_calc_stream=True):
     if group is not None and not group.is_member():
         return
-    ring_id = 0 if group is None else group.id
-    nranks = len(in_tensor_list)
-    return paddle._C_ops.alltoall(in_tensor_list, 'use_calc_stream',
-                                  use_calc_stream, 'ring_id', ring_id)
+
+    if in_dygraph_mode():
+        group = paddle.distributed.collective._get_default_group(
+        ) if group is None else group
+        out = paddle.empty(in_tensor_list.shape, in_tensor_list.dtype)
+        task = group.process_group.alltoall(in_tensor_list, out)
+        task.wait()
+        return out
+    else:
+        ring_id = 0 if group is None else group.id
+        return paddle._C_ops.alltoall(in_tensor_list, 'use_calc_stream',
+                                      use_calc_stream, 'ring_id', ring_id)
 
 
 def count_by_gate(gate, num_expert, world_size, require_pos=True, group=None):
@@ -51,10 +60,13 @@ def count_by_gate(gate, num_expert, world_size, require_pos=True, group=None):
 
 def limit_by_capacity(topk_idx, num_expert, world_size, capacity, group=None):
     with paddle.no_grad():
-        capacity = paddle.ones(
-            shape=[num_expert], dtype=paddle.int64) * capacity
-        pos, lec, gec = count_by_gate(
-            topk_idx, num_expert, world_size, require_pos=False, group=group)
+        capacity = paddle.ones(shape=[num_expert],
+                               dtype=paddle.int64) * capacity
+        pos, lec, gec = count_by_gate(topk_idx,
+                                      num_expert,
+                                      world_size,
+                                      require_pos=False,
+                                      group=group)
         new_gec = _limit_by_capacity(gec, capacity, world_size)
         if world_size > 1:
             assert group.nranks == world_size
diff --git a/python/paddle/incubate/multiprocessing/reductions.py b/python/paddle/incubate/multiprocessing/reductions.py
index cfbc55afd3bca..54d40312268aa 100644
--- a/python/paddle/incubate/multiprocessing/reductions.py
+++ b/python/paddle/incubate/multiprocessing/reductions.py
@@ -47,6 +47,7 @@ def _supported_check():
 
 
 class LRUSharedCache(OrderedDict):
+
     def __init__(self):
         self.limit = 128
         self._after_fork()
diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py
index 3c806aa646ebe..cf15ee7d8ffaa 100644
--- a/python/paddle/incubate/nn/__init__.py
+++ b/python/paddle/incubate/nn/__init__.py
@@ -16,6 +16,7 @@
 from .layer.fused_transformer import FusedFeedForward  # noqa: F401
 from .layer.fused_transformer import FusedTransformerEncoderLayer  # noqa: F401
 from .layer.fused_transformer import FusedMultiTransformer  # noqa: F401
+from .layer.fused_linear import FusedLinear  # noqa: F401
 from .layer.fused_transformer import FusedBiasDropoutResidualLayerNorm  # noqa: F401
 
 __all__ = [  #noqa
@@ -23,5 +24,6 @@
     'FusedFeedForward',
     'FusedTransformerEncoderLayer',
     'FusedMultiTransformer',
+    'FusedLinear',
     'FusedBiasDropoutResidualLayerNorm',
 ]
diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py
index 02e44548ce5d8..e9894990455ab 100644
--- a/python/paddle/incubate/nn/functional/__init__.py
+++ b/python/paddle/incubate/nn/functional/__init__.py
@@ -15,11 +15,14 @@
 from .fused_transformer import fused_multi_head_attention
 from .fused_transformer import fused_feedforward
 from .fused_transformer import fused_multi_transformer
+from .fused_matmul_bias import fused_matmul_bias, fused_linear
 from .fused_transformer import fused_bias_dropout_residual_layer_norm
 
 __all__ = [
     'fused_multi_head_attention',
     'fused_feedforward',
     'fused_multi_transformer',
+    'fused_matmul_bias',
+    'fused_linear',
     'fused_bias_dropout_residual_layer_norm',
 ]
diff --git a/python/paddle/incubate/nn/functional/fused_matmul_bias.py b/python/paddle/incubate/nn/functional/fused_matmul_bias.py
new file mode 100644
index 0000000000000..d963c5e1ade5f
--- /dev/null
+++ b/python/paddle/incubate/nn/functional/fused_matmul_bias.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.framework import _non_static_mode
+from paddle.tensor.linalg import matmul
+from paddle import _C_ops
+
+
+def fused_matmul_bias(x,
+                      y,
+                      bias=None,
+                      transpose_x=False,
+                      transpose_y=False,
+                      name=None):
+    """
+    Applies matrix multiplication of two tensors and then bias addition if provided.
+    This method requires CUDA version >= 11.6. 
+
+    Args:
+        x (Tensor): the first input Tensor to be multiplied.
+        y (Tensor): the second input Tensor to be multiplied. Its rank must be 2.  
+        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
+            be performed. Otherwise, the bias is added to the matrix multiplication result.  
+        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
+        transpose_y (bool): Whether to transpose :math:`y` before multiplication.    
+        name (str|None): For detailed information, please refer to
+            :ref:`api_guide_Name` . Usually there is no need to set it; it is None by default.
+
+    Returns:
+        Tensor: the output Tensor. 
+
+    Examples:
+        .. code-block:: python
+
+            # required: gpu
+            import paddle
+            from paddle.incubate.nn.functional import fused_matmul_bias
+            
+            x = paddle.randn([3, 4]) 
+            y = paddle.randn([4, 5])
+            bias = paddle.randn([5])
+            out = fused_matmul_bias(x, y, bias) 
+            print(out.shape) # [3, 5]
+    """
+    if bias is None:
+        return matmul(x, y, transpose_x, transpose_y, name)
+    if _non_static_mode():
+        return _C_ops.fused_gemm_epilogue(x, y, bias, 'trans_x', transpose_x,
+                                          'trans_y', transpose_y)
+
+    helper = LayerHelper('fused_matmul_bias', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(type='fused_gemm_epilogue',
+                     inputs={
+                         'X': x,
+                         'Y': y,
+                         'Bias': bias
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         'trans_x': transpose_x,
+                         'trans_y': transpose_y
+                     })
+    return out
+
+
+def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
+    """
+    Fully-connected linear transformation operator. This method requires CUDA version >= 11.6. 
+
+    Args:
+        x (Tensor): the input Tensor to be multiplied.
+        weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.  
+        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
+            be performed. Otherwise, the bias is added to the matrix multiplication result.  
+        transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.    
+        name (str|None): For detailed information, please refer to
+            :ref:`api_guide_Name` . Usually there is no need to set it; it is None by default.
+
+    Returns:
+        Tensor: the output Tensor. 
+
+    Examples:
+        .. code-block:: python
+
+            # required: gpu
+            import paddle
+            from paddle.incubate.nn.functional import fused_linear
+            
+            x = paddle.randn([3, 4]) 
+            weight = paddle.randn([4, 5])
+            bias = paddle.randn([5])
+            out = fused_linear(x, weight, bias) 
+            print(out.shape) # [3, 5]
+    """
+    return fused_matmul_bias(x, weight, bias, False, transpose_weight, name)
diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index 232e16415a5f7..ab7e135adc6c4 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -115,7 +115,8 @@ def fused_feedforward(x,
     seed = None
     if mode not in ('downscale_in_infer', 'upscale_in_train'):
         raise ValueError(
-            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
+        )
     mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
 
     if _non_static_mode():
@@ -128,11 +129,10 @@ def fused_feedforward(x,
             'ln2_epsilon', ln2_epsilon, 'act_method', activation,
             'dropout1_rate', dropout1_rate, 'dropout2_rate', dropout2_rate,
             "dropout1_is_test", not training, "dropout2_is_test", not training,
-            "dropout1_fix_seed", seed is not None, "dropout2_fix_seed",
-            seed is not None, "dropout1_seed", seed
-            if seed is not None else 0, "dropout2_seed", seed
-            if seed is not None else 0, 'dropout1_implementation', mode,
-            'dropout2_implementation', mode)
+            "dropout1_fix_seed", seed is not None, "dropout2_fix_seed", seed
+            is not None, "dropout1_seed", seed if seed is not None else 0,
+            "dropout2_seed", seed if seed is not None else 0,
+            'dropout1_implementation', mode, 'dropout2_implementation', mode)
         return out
 
     helper = LayerHelper("fused_feedforward")
@@ -147,68 +147,67 @@ def fused_feedforward(x,
         'uint8', stop_gradient=True)
     dropout2_mask = helper.create_variable_for_type_inference(
         'uint8', stop_gradient=True)
-    ln1_mean = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln1_variance = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln2_mean = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln2_variance = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    linear1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    ln1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    dropout1_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
-    dropout2_out = helper.create_variable_for_type_inference(
-        x.dtype, stop_gradient=True)
+    ln1_mean = helper.create_variable_for_type_inference(x.dtype,
+                                                         stop_gradient=True)
+    ln1_variance = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
+    ln2_mean = helper.create_variable_for_type_inference(x.dtype,
+                                                         stop_gradient=True)
+    ln2_variance = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
+    linear1_out = helper.create_variable_for_type_inference(x.dtype,
+                                                            stop_gradient=True)
+    ln1_out = helper.create_variable_for_type_inference(x.dtype,
+                                                        stop_gradient=True)
+    dropout1_out = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
+    dropout2_out = helper.create_variable_for_type_inference(x.dtype,
+                                                             stop_gradient=True)
 
     if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
         seed = helper.main_program.random_seed
 
-    helper.append_op(
-        type='fused_feedforward',
-        inputs={
-            'X': x,
-            'Linear1Weight': linear1_weight,
-            'Linear1Bias': linear1_bias,
-            'Linear2Weight': linear2_weight,
-            'Linear2Bias': linear2_bias,
-            'Ln1Scale': ln1_scale,
-            'Ln1Bias': ln1_bias,
-            'Ln2Scale': ln2_scale,
-            'Ln2Bias': ln2_bias,
-        },
-        outputs={
-            'Out': out,
-            'Dropout1Mask': dropout1_mask,
-            'Dropout2Mask': dropout2_mask,
-            'Ln1Mean': ln1_mean,
-            'Ln1Variance': ln1_variance,
-            'Ln2Mean': ln2_mean,
-            'Ln2Variance': ln2_variance,
-            'Linear1Out': linear1_out,
-            'Ln1Out': ln1_out,
-            'Dropout1Out': dropout1_out,
-            'Dropout2Out': dropout2_out,
-        },
-        attrs={
-            'dropout1_rate': dropout1_rate,
-            'dropout2_rate': dropout2_rate,
-            'act_method': activation,
-            'pre_layer_norm': pre_layer_norm,
-            'ln1_epsilon': ln1_epsilon,
-            'ln2_epsilon': ln2_epsilon,
-            'dropout1_is_test': not training,
-            'dropout2_is_test': not training,
-            'dropout1_fix_seed': seed is not None,
-            'dropout2_fix_seed': seed is not None,
-            'dropout1_seed': seed if seed is not None else 0,
-            'dropout2_seed': seed if seed is not None else 0,
-            'dropout1_implementation': mode,
-            'dropout2_implementation': mode
-        })
+    helper.append_op(type='fused_feedforward',
+                     inputs={
+                         'X': x,
+                         'Linear1Weight': linear1_weight,
+                         'Linear1Bias': linear1_bias,
+                         'Linear2Weight': linear2_weight,
+                         'Linear2Bias': linear2_bias,
+                         'Ln1Scale': ln1_scale,
+                         'Ln1Bias': ln1_bias,
+                         'Ln2Scale': ln2_scale,
+                         'Ln2Bias': ln2_bias,
+                     },
+                     outputs={
+                         'Out': out,
+                         'Dropout1Mask': dropout1_mask,
+                         'Dropout2Mask': dropout2_mask,
+                         'Ln1Mean': ln1_mean,
+                         'Ln1Variance': ln1_variance,
+                         'Ln2Mean': ln2_mean,
+                         'Ln2Variance': ln2_variance,
+                         'Linear1Out': linear1_out,
+                         'Ln1Out': ln1_out,
+                         'Dropout1Out': dropout1_out,
+                         'Dropout2Out': dropout2_out,
+                     },
+                     attrs={
+                         'dropout1_rate': dropout1_rate,
+                         'dropout2_rate': dropout2_rate,
+                         'act_method': activation,
+                         'pre_layer_norm': pre_layer_norm,
+                         'ln1_epsilon': ln1_epsilon,
+                         'ln2_epsilon': ln2_epsilon,
+                         'dropout1_is_test': not training,
+                         'dropout2_is_test': not training,
+                         'dropout1_fix_seed': seed is not None,
+                         'dropout2_fix_seed': seed is not None,
+                         'dropout1_seed': seed if seed is not None else 0,
+                         'dropout2_seed': seed if seed is not None else 0,
+                         'dropout1_implementation': mode,
+                         'dropout2_implementation': mode
+                     })
     return out
 
 
@@ -279,12 +278,13 @@ def fused_bias_dropout_residual_layer_norm(x,
     seed = None
     if mode not in ('downscale_in_infer', 'upscale_in_train'):
         raise ValueError(
-            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
+        )
     mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
 
     if ln_scale is not None:
-        assert len(ln_scale.
-                   shape) == 1, "The dims of the shape of ln_scale should be 1."
+        assert len(ln_scale.shape
+                   ) == 1, "The dims of the shape of ln_scale should be 1."
         assert x.shape[len(x.shape) - 1] == ln_scale.shape[
             0], "The dim of ln_scale must equal to the last dim of x."
     if ln_bias is not None:
@@ -299,8 +299,8 @@ def fused_bias_dropout_residual_layer_norm(x,
         _, _, _, _, final_out = _C_ops.fused_bias_dropout_residual_layer_norm(
             x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate,
             'ln_epsilon', ln_epsilon, 'is_test', not training,
-            'dropout_fix_seed', seed is not None, 'dropout_seed', seed
-            if seed is not None else 0, 'dropout_implementation', mode)
+            'dropout_fix_seed', seed is not None, 'dropout_seed',
+            seed if seed is not None else 0, 'dropout_implementation', mode)
         return final_out
     else:
         helper = LayerHelper('fused_bias_dropout_residual_layer_norm',
@@ -343,17 +343,17 @@ def fused_bias_dropout_residual_layer_norm(x,
             dtype=dtype)
         final_out = helper.create_variable_for_type_inference(dtype=dtype)
 
-        helper.append_op(
-            type='fused_bias_dropout_residual_layer_norm',
-            inputs=inputs,
-            outputs={
-                "BiasDropoutResidualOut": bias_dropout_residual_out,
-                "DropoutMaskOut": dropout_mask_out,
-                "LnMean": ln_mean_out,
-                "LnVariance": ln_variance_out,
-                'Y': final_out,
-            },
-            attrs=attrs)
+        helper.append_op(type='fused_bias_dropout_residual_layer_norm',
+                         inputs=inputs,
+                         outputs={
+                             "BiasDropoutResidualOut":
+                             bias_dropout_residual_out,
+                             "DropoutMaskOut": dropout_mask_out,
+                             "LnMean": ln_mean_out,
+                             "LnVariance": ln_variance_out,
+                             'Y': final_out,
+                         },
+                         attrs=attrs)
         return final_out
 
 
@@ -490,7 +490,8 @@ def fused_multi_head_attention(x,
     seed = None
     if mode not in ('downscale_in_infer', 'upscale_in_train'):
         raise ValueError(
-            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
+        )
     mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
 
     if _non_static_mode():
@@ -515,10 +516,10 @@ def fused_multi_head_attention(x,
             'dropout_rate', dropout_rate, 'attn_dropout_rate',
             attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test',
             not training, 'attn_dropout_fix_seed', seed is not None,
-            'dropout_fix_seed', seed is not None, 'attn_dropout_seed', seed
-            if seed is not None else 0, 'dropout_seed', seed
-            if seed is not None else 0, 'attn_dropout_implementation', mode,
-            'dropout_implementation', mode, 'ring_id', ring_id)
+            'dropout_fix_seed', seed is not None, 'attn_dropout_seed',
+            seed if seed is not None else 0, 'dropout_seed',
+            seed if seed is not None else 0, 'attn_dropout_implementation',
+            mode, 'dropout_implementation', mode, 'ring_id', ring_id)
         if cache_kv is not None:
             return final_out, cache_kv_out
         return final_out
@@ -603,32 +604,32 @@ def fused_multi_head_attention(x,
         final_out = helper.create_variable_for_type_inference(dtype=dtype)
         cache_kv_out = helper.create_variable_for_type_inference(dtype=dtype)
 
-        helper.append_op(
-            type='fused_attention',
-            inputs=inputs,
-            outputs={
-                "LnMean": pre_ln_mean_out,
-                "LnVariance": pre_ln_variance_out,
-                "LnOut": pre_ln_out,
-                "QKVOut": qkv_out,
-                "QKVBiasOut": qkv_bias_out,
-                "TransposeOut2": transpose_out,
-                "QKOut": qk_out,
-                "QKTVOut": qktv_out,
-                "SoftmaxOut": softmax_out,
-                "AttnDropoutMaskOut": attn_dropout_mask_out,
-                "AttnDropoutOut": attn_dropout_out,
-                "SrcMaskOut": attn_mask_out,
-                "FMHAOut": fmha_out,
-                "OutLinearOut": out_linear_out,
-                "DropoutMaskOut": dropout_mask_out,
-                "Ln2Mean": ln_mean_out,
-                "Ln2Variance": ln_variance_out,
-                "BiasDropoutResidualOut": bias_dropout_residual_out,
-                'Y': final_out,
-                'CacheKVOut': cache_kv_out
-            },
-            attrs=attrs)
+        helper.append_op(type='fused_attention',
+                         inputs=inputs,
+                         outputs={
+                             "LnMean": pre_ln_mean_out,
+                             "LnVariance": pre_ln_variance_out,
+                             "LnOut": pre_ln_out,
+                             "QKVOut": qkv_out,
+                             "QKVBiasOut": qkv_bias_out,
+                             "TransposeOut2": transpose_out,
+                             "QKOut": qk_out,
+                             "QKTVOut": qktv_out,
+                             "SoftmaxOut": softmax_out,
+                             "AttnDropoutMaskOut": attn_dropout_mask_out,
+                             "AttnDropoutOut": attn_dropout_out,
+                             "SrcMaskOut": attn_mask_out,
+                             "FMHAOut": fmha_out,
+                             "OutLinearOut": out_linear_out,
+                             "DropoutMaskOut": dropout_mask_out,
+                             "Ln2Mean": ln_mean_out,
+                             "Ln2Variance": ln_variance_out,
+                             "BiasDropoutResidualOut":
+                             bias_dropout_residual_out,
+                             'Y': final_out,
+                             'CacheKVOut': cache_kv_out
+                         },
+                         attrs=attrs)
 
         return (final_out, cache_kv_out) if cache_kv else final_out
 
@@ -790,7 +791,8 @@ def fused_multi_transformer(x,
     """
     if mode not in ('downscale_in_infer', 'upscale_in_train'):
         raise ValueError(
-            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
+        )
     mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
 
     if _non_static_mode():
@@ -859,10 +861,9 @@ def fused_multi_transformer(x,
             # NOTE: inplace
             outputs['CacheKVOut'] = cache_kvs
 
-        helper.append_op(
-            type='fused_multi_transformer',
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs)
+        helper.append_op(type='fused_multi_transformer',
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
 
         return (final_out, cache_kvs) if cache_kvs else final_out
diff --git a/python/paddle/incubate/nn/layer/fused_linear.py b/python/paddle/incubate/nn/layer/fused_linear.py
new file mode 100644
index 0000000000000..8a8800afce61a
--- /dev/null
+++ b/python/paddle/incubate/nn/layer/fused_linear.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.nn import Layer
+from paddle.incubate.nn import functional as F
+
+
+class FusedLinear(Layer):
+    r"""
+    Linear layer takes only one multi-dimensional tensor as input with the
+    shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any
+    number of additional dimensions. It multiplies input tensor with the weight
+    (a 2-D tensor of shape :math:`[in\_features, out\_features]` ) and produces
+    an output tensor of shape :math:`[batch\_size, *, out\_features]` .
+    If :math:`bias\_attr` is not False, the bias (a 1-D tensor of
+    shape :math:`[out\_features]` ) will be created and added to the output.
+
+    Parameters:
+        in_features (int): The number of input units.
+        out_features (int): The number of output units.
+        weight_attr (ParamAttr, optional): The attribute for the learnable
+            weight of this layer. The default value is None and the weight will be
+            initialized to zero. For detailed information, please refer to
+            paddle.ParamAttr.
+        transpose_weight (bool, optional): Whether to transpose the `weight` Tensor
+            before multiplication. Default is False.
+        bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias
+            of this layer. If it is set to False, no bias will be added to the output.
+            If it is set to None or one kind of ParamAttr, a bias parameter will
+            be created according to ParamAttr. For detailed information, please refer
+            to paddle.ParamAttr. The default value is None and the bias will be
+            initialized to zero.
+        name (str, optional): Normally there is no need for user to set this parameter.
+            For detailed information, please refer to :ref:`api_guide_Name` .
+
+    Attribute:
+        **weight** (Parameter): the learnable weight of this layer.
+
+        **bias** (Parameter): the learnable bias of this layer.
+
+    Shape:
+        - input: Multi-dimensional tensor with shape :math:`[batch\_size, *, in\_features]` .
+        - output: Multi-dimensional tensor with shape :math:`[batch\_size, *, out\_features]` .
+
+    Examples:
+        .. code-block:: python
+       
+            # required: gpu
+            import paddle
+            from paddle.incubate.nn import FusedLinear
+
+            x = paddle.randn([3, 4]) 
+            linear = FusedLinear(4, 5)
+            y = linear(x)            
+            print(y.shape) # [3, 5]
+    """
+
+    def __init__(self,
+                 in_features,
+                 out_features,
+                 weight_attr=None,
+                 bias_attr=None,
+                 transpose_weight=False,
+                 name=None):
+        super(FusedLinear, self).__init__()
+        if transpose_weight:
+            weight_shape = [out_features, in_features]
+        else:
+            weight_shape = [in_features, out_features]
+        dtype = self._helper.get_default_dtype()
+        self.weight = self.create_parameter(shape=weight_shape,
+                                            attr=weight_attr,
+                                            dtype=dtype,
+                                            is_bias=False)
+        self.bias = self.create_parameter(shape=[out_features],
+                                          attr=bias_attr,
+                                          dtype=dtype,
+                                          is_bias=True)
+        self.transpose_weight = transpose_weight
+        self.name = name
+
+    def forward(self, input):
+        return F.fused_linear(input, self.weight, self.bias,
+                              self.transpose_weight, self.name)
diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index a64b7e506021c..595b1d27fea8b 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -80,17 +80,17 @@ def __init__(self,
         self._bias_attr = bias_attr
         self._weight_attr = weight_attr
         self.embed_dim = embed_dim
-        self.linear_bias = self.create_parameter(
-            shape=[embed_dim],
-            attr=self._bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self.linear_bias = self.create_parameter(shape=[embed_dim],
+                                                 attr=self._bias_attr,
+                                                 dtype=self._dtype,
+                                                 is_bias=True)
         self.ln_scale = self.create_parameter(
             attr=self._weight_attr,
             shape=[embed_dim],
             default_initializer=Constant(value=1.0))
-        self.ln_bias = self.create_parameter(
-            attr=self._bias_attr, shape=[embed_dim], is_bias=True)
+        self.ln_bias = self.create_parameter(attr=self._bias_attr,
+                                             shape=[embed_dim],
+                                             is_bias=True)
         self.dropout_rate = dropout_rate
         self._epsilon = epsilon
 
@@ -227,29 +227,29 @@ def __init__(self,
             attr=self._bias_attr,
             dtype=self._dtype,
             is_bias=True)
-        self.linear_weight = self.create_parameter(
-            shape=[embed_dim, embed_dim],
-            attr=self._weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
-        self.linear_bias = self.create_parameter(
-            shape=[embed_dim],
-            attr=self._bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self.linear_weight = self.create_parameter(shape=[embed_dim, embed_dim],
+                                                   attr=self._weight_attr,
+                                                   dtype=self._dtype,
+                                                   is_bias=False)
+        self.linear_bias = self.create_parameter(shape=[embed_dim],
+                                                 attr=self._bias_attr,
+                                                 dtype=self._dtype,
+                                                 is_bias=True)
 
         self.pre_ln_scale = self.create_parameter(
             attr=self._weight_attr,
             shape=[embed_dim],
             default_initializer=Constant(value=1.0))
-        self.pre_ln_bias = self.create_parameter(
-            attr=self._bias_attr, shape=[embed_dim], is_bias=True)
+        self.pre_ln_bias = self.create_parameter(attr=self._bias_attr,
+                                                 shape=[embed_dim],
+                                                 is_bias=True)
         self.ln_scale = self.create_parameter(
             attr=self._weight_attr,
             shape=[embed_dim],
             default_initializer=Constant(value=1.0))
-        self.ln_bias = self.create_parameter(
-            attr=self._bias_attr, shape=[embed_dim], is_bias=True)
+        self.ln_bias = self.create_parameter(attr=self._bias_attr,
+                                             shape=[embed_dim],
+                                             is_bias=True)
 
         self.dropout_rate = dropout_rate
         self.attn_dropout_rate = attn_dropout_rate
@@ -395,11 +395,10 @@ def __init__(self,
             attr=weight_attr,
             dtype=self._dtype,
             is_bias=False)
-        self._linear1_bias = self.create_parameter(
-            shape=[dim_feedforward],
-            attr=bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self._linear1_bias = self.create_parameter(shape=[dim_feedforward],
+                                                   attr=bias_attr,
+                                                   dtype=self._dtype,
+                                                   is_bias=True)
 
         self._linear2_weight = self.create_parameter(
             shape=[dim_feedforward, d_model],
@@ -407,24 +406,28 @@ def __init__(self,
             dtype=self._dtype,
             is_bias=False)
 
-        self._linear2_bias = self.create_parameter(
-            shape=[d_model], attr=bias_attr, dtype=self._dtype, is_bias=True)
+        self._linear2_bias = self.create_parameter(shape=[d_model],
+                                                   attr=bias_attr,
+                                                   dtype=self._dtype,
+                                                   is_bias=True)
 
         self._ln1_scale = self.create_parameter(
             shape=[d_model],
             attr=None,
             is_bias=False,
             default_initializer=Constant(1.0))
-        self._ln1_bias = self.create_parameter(
-            shape=[d_model], attr=None, is_bias=True)
+        self._ln1_bias = self.create_parameter(shape=[d_model],
+                                               attr=None,
+                                               is_bias=True)
 
         self._ln2_scale = self.create_parameter(
             shape=[d_model],
             attr=None,
             is_bias=False,
             default_initializer=Constant(1.0))
-        self._ln2_bias = self.create_parameter(
-            shape=[d_model], attr=None, is_bias=True)
+        self._ln2_bias = self.create_parameter(shape=[d_model],
+                                               attr=None,
+                                               is_bias=True)
         self.name = name
 
     def forward(self, src, cache=None):
@@ -553,15 +556,14 @@ def __init__(self,
             weight_attr=weight_attrs[0],
             bias_attr=bias_attrs[0])
 
-        self.ffn = FusedFeedForward(
-            d_model,
-            dim_feedforward,
-            dropout_rate=dropout_rate,
-            activation=activation,
-            act_dropout_rate=act_dropout_rate,
-            normalize_before=self.normalize_before,
-            weight_attr=weight_attrs[1],
-            bias_attr=bias_attrs[1])
+        self.ffn = FusedFeedForward(d_model,
+                                    dim_feedforward,
+                                    dropout_rate=dropout_rate,
+                                    activation=activation,
+                                    act_dropout_rate=act_dropout_rate,
+                                    normalize_before=self.normalize_before,
+                                    weight_attr=weight_attrs[1],
+                                    bias_attr=bias_attrs[1])
 
     def forward(self, src, src_mask=None, cache=None):
         """
@@ -597,8 +599,9 @@ def forward(self, src, src_mask=None, cache=None):
         if cache is None:
             attn_out = self.fused_attn(src, attn_mask=src_mask)
         else:
-            attn_out, incremental_cache = self.fused_attn(
-                src, attn_mask=src_mask, cache=cache)
+            attn_out, incremental_cache = self.fused_attn(src,
+                                                          attn_mask=src_mask,
+                                                          cache=cache)
 
         ffn_out = self.ffn(attn_out)
 
@@ -967,8 +970,9 @@ def get_attr(attrs, idx):
                 attr=ln_scale_attr,
                 shape=[embed_dim],
                 default_initializer=Constant(value=1.0))
-            ln_bias = self.create_parameter(
-                attr=ln_bias_attr, shape=[embed_dim], is_bias=True)
+            ln_bias = self.create_parameter(attr=ln_bias_attr,
+                                            shape=[embed_dim],
+                                            is_bias=True)
             qkv_weight = self.create_parameter(
                 shape=[3, num_heads, self.head_dim, embed_dim],
                 attr=qkv_weight_attr,
@@ -984,39 +988,37 @@ def get_attr(attrs, idx):
                 attr=linear_weight_attr,
                 dtype=self._dtype,
                 is_bias=False)
-            linear_bias = self.create_parameter(
-                shape=[embed_dim],
-                attr=linear_bias_attr,
-                dtype=self._dtype,
-                is_bias=True)
+            linear_bias = self.create_parameter(shape=[embed_dim],
+                                                attr=linear_bias_attr,
+                                                dtype=self._dtype,
+                                                is_bias=True)
 
             ffn_ln_scale = self.create_parameter(
                 shape=[embed_dim],
                 attr=ffn_ln_scale_attr,
                 is_bias=False,
                 default_initializer=Constant(1.0))
-            ffn_ln_bias = self.create_parameter(
-                shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True)
+            ffn_ln_bias = self.create_parameter(shape=[embed_dim],
+                                                attr=ffn_ln_bias_attr,
+                                                is_bias=True)
             ffn1_weight = self.create_parameter(
                 shape=[embed_dim, dim_feedforward],
                 attr=ffn1_weight_attr,
                 dtype=self._dtype,
                 is_bias=False)
-            ffn1_bias = self.create_parameter(
-                shape=[dim_feedforward],
-                attr=ffn1_bias_attr,
-                dtype=self._dtype,
-                is_bias=True)
+            ffn1_bias = self.create_parameter(shape=[dim_feedforward],
+                                              attr=ffn1_bias_attr,
+                                              dtype=self._dtype,
+                                              is_bias=True)
             ffn2_weight = self.create_parameter(
                 shape=[dim_feedforward, embed_dim],
                 attr=ffn2_weight_attr,
                 dtype=self._dtype,
                 is_bias=False)
-            ffn2_bias = self.create_parameter(
-                shape=[embed_dim],
-                attr=ffn2_bias_attr,
-                dtype=self._dtype,
-                is_bias=True)
+            ffn2_bias = self.create_parameter(shape=[embed_dim],
+                                              attr=ffn2_bias_attr,
+                                              dtype=self._dtype,
+                                              is_bias=True)
 
             # tensor model parallel
             if nranks > 1:
diff --git a/python/paddle/incubate/operators/graph_khop_sampler.py b/python/paddle/incubate/operators/graph_khop_sampler.py
index 5442b213ceb47..89014a7ad59a8 100644
--- a/python/paddle/incubate/operators/graph_khop_sampler.py
+++ b/python/paddle/incubate/operators/graph_khop_sampler.py
@@ -38,10 +38,6 @@ def graph_khop_sampler(row,
     and `sample_sizes` means the number of neighbors and number of layers we want
     to sample. 
 
-    **Note**: 
-        Currently the API will reindex the output edges after finishing sampling. We
-    will add a choice or a new API for whether to reindex the edges in the near future.
-
     Args:
         row (Tensor): One of the components of the CSC format of the input graph, and 
                       the shape should be [num_edges, 1] or [num_edges]. The available
@@ -129,23 +125,24 @@ def graph_khop_sampler(row,
     sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
     reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
     edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
-    helper.append_op(
-        type="graph_khop_sampler",
-        inputs={
-            "Row": row,
-            "Eids": sorted_eids,
-            "Col_Ptr": colptr,
-            "X": input_nodes
-        },
-        outputs={
-            "Out_Src": edge_src,
-            "Out_Dst": edge_dst,
-            "Sample_Index": sample_index,
-            "Reindex_X": reindex_nodes,
-            "Out_Eids": edge_eids
-        },
-        attrs={"sample_sizes": sample_sizes,
-               "return_eids": return_eids})
+    helper.append_op(type="graph_khop_sampler",
+                     inputs={
+                         "Row": row,
+                         "Eids": sorted_eids,
+                         "Col_Ptr": colptr,
+                         "X": input_nodes
+                     },
+                     outputs={
+                         "Out_Src": edge_src,
+                         "Out_Dst": edge_dst,
+                         "Sample_Index": sample_index,
+                         "Reindex_X": reindex_nodes,
+                         "Out_Eids": edge_eids
+                     },
+                     attrs={
+                         "sample_sizes": sample_sizes,
+                         "return_eids": return_eids
+                     })
     if return_eids:
         return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
     else:
diff --git a/python/paddle/incubate/operators/graph_reindex.py b/python/paddle/incubate/operators/graph_reindex.py
index 328b87a699750..1c49d6af950d5 100644
--- a/python/paddle/incubate/operators/graph_reindex.py
+++ b/python/paddle/incubate/operators/graph_reindex.py
@@ -35,6 +35,12 @@ def graph_reindex(x,
     is to reindex the ids information of the input nodes, and return the 
     corresponding graph edges after reindex.
 
+    **Notes**: 
+        The numbers in x should be unique, otherwise it would cause potential errors.
+    Besides, we also support reindexing neighbors of multiple edge types. If x has
+    neighbors of different edge types, we should concatenate the neighbors and the
+    counts of all edge types for x. We will reindex all the nodes from 0.
+
     Take input nodes x = [0, 1, 2] as an example. 
     If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], 
     then we know that the neighbors of 0 is [8, 9], the neighbors of 1
@@ -70,18 +76,31 @@ def graph_reindex(x,
         import paddle
 
         x = [0, 1, 2]
-        neighbors = [8, 9, 0, 4, 7, 6, 7]
-        count = [2, 3, 2]
+        neighbors_e1 = [8, 9, 0, 4, 7, 6, 7]
+        count_e1 = [2, 3, 2]
         x = paddle.to_tensor(x, dtype="int64")
-        neighbors = paddle.to_tensor(neighbors, dtype="int64")
-        count = paddle.to_tensor(count, dtype="int32")
+        neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64")
+        count_e1 = paddle.to_tensor(count_e1, dtype="int32")
 
         reindex_src, reindex_dst, out_nodes = \
-             paddle.incubate.graph_reindex(x, neighbors, count)
+             paddle.incubate.graph_reindex(x, neighbors_e1, count_e1)
         # reindex_src: [3, 4, 0, 5, 6, 7, 6]
         # reindex_dst: [0, 0, 1, 1, 1, 2, 2]
         # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6]
 
+        neighbors_e2 = [0, 2, 3, 5, 1]
+        count_e2 = [1, 3, 1]
+        neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64")
+        count_e2 = paddle.to_tensor(count_e2, dtype="int32")
+        
+        neighbors = paddle.concat([neighbors_e1, neighbors_e2])
+        count = paddle.concat([count_e1, count_e2])
+        reindex_src, reindex_dst, out_nodes = \
+             paddle.incubate.graph_reindex(x, neighbors, count)
+        # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1]
+        # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2]
+        # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5]
+
     """
     if flag_buffer_hashtable:
         if value_buffer is None or index_buffer is None:
@@ -109,19 +128,23 @@ def graph_reindex(x,
     reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
     reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
     out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="graph_reindex",
-        inputs={
-            "X": x,
-            "Neighbors": neighbors,
-            "Count": count,
-            "HashTable_Value": value_buffer if flag_buffer_hashtable else None,
-            "HashTable_Index": index_buffer if flag_buffer_hashtable else None,
-        },
-        outputs={
-            "Reindex_Src": reindex_src,
-            "Reindex_Dst": reindex_dst,
-            "Out_Nodes": out_nodes
-        },
-        attrs={"flag_buffer_hashtable": flag_buffer_hashtable})
+    helper.append_op(type="graph_reindex",
+                     inputs={
+                         "X":
+                         x,
+                         "Neighbors":
+                         neighbors,
+                         "Count":
+                         count,
+                         "HashTable_Value":
+                         value_buffer if flag_buffer_hashtable else None,
+                         "HashTable_Index":
+                         index_buffer if flag_buffer_hashtable else None,
+                     },
+                     outputs={
+                         "Reindex_Src": reindex_src,
+                         "Reindex_Dst": reindex_dst,
+                         "Out_Nodes": out_nodes
+                     },
+                     attrs={"flag_buffer_hashtable": flag_buffer_hashtable})
     return reindex_src, reindex_dst, out_nodes
diff --git a/python/paddle/incubate/operators/graph_sample_neighbors.py b/python/paddle/incubate/operators/graph_sample_neighbors.py
index d5a85af7272e7..63424b395c703 100644
--- a/python/paddle/incubate/operators/graph_sample_neighbors.py
+++ b/python/paddle/incubate/operators/graph_sample_neighbors.py
@@ -126,25 +126,25 @@ def graph_sample_neighbors(row,
     out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
     out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
     out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
-    helper.append_op(
-        type="graph_sample_neighbors",
-        inputs={
-            "Row": row,
-            "Col_Ptr": colptr,
-            "X": input_nodes,
-            "Eids": eids if return_eids else None,
-            "Perm_Buffer": perm_buffer if flag_perm_buffer else None
-        },
-        outputs={
-            "Out": out_neighbors,
-            "Out_Count": out_count,
-            "Out_Eids": out_eids
-        },
-        attrs={
-            "sample_size": sample_size,
-            "return_eids": return_eids,
-            "flag_perm_buffer": flag_perm_buffer
-        })
+    helper.append_op(type="graph_sample_neighbors",
+                     inputs={
+                         "Row": row,
+                         "Col_Ptr": colptr,
+                         "X": input_nodes,
+                         "Eids": eids if return_eids else None,
+                         "Perm_Buffer":
+                         perm_buffer if flag_perm_buffer else None
+                     },
+                     outputs={
+                         "Out": out_neighbors,
+                         "Out_Count": out_count,
+                         "Out_Eids": out_eids
+                     },
+                     attrs={
+                         "sample_size": sample_size,
+                         "return_eids": return_eids,
+                         "flag_perm_buffer": flag_perm_buffer
+                     })
     if return_eids:
         return out_neighbors, out_count, out_eids
     return out_neighbors, out_count
diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py
index 80a21aec6cf5a..e9937558e9b3a 100644
--- a/python/paddle/incubate/operators/graph_send_recv.py
+++ b/python/paddle/incubate/operators/graph_send_recv.py
@@ -119,9 +119,10 @@ def graph_send_recv(x,
                                                       pool_type.upper(), 0)
     else:
         if _in_legacy_dygraph():
-            out, tmp = _C_ops.graph_send_recv(
-                x, src_index, dst_index, 'pool_type',
-                pool_type.upper(), 'out_size', out_size)
+            out, tmp = _C_ops.graph_send_recv(x, src_index,
+                                              dst_index, 'pool_type',
+                                              pool_type.upper(), 'out_size',
+                                              out_size)
             return out
         if in_dygraph_mode():
             if isinstance(out_size, core.eager.Tensor):
@@ -143,17 +144,22 @@ def graph_send_recv(x,
 
     helper = LayerHelper("graph_send_recv", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    dst_count = helper.create_variable_for_type_inference(
-        dtype="int32", stop_gradient=True)
-    helper.append_op(
-        type="graph_send_recv",
-        inputs={"X": x,
-                "Src_index": src_index,
-                "Dst_index": dst_index},
-        outputs={"Out": out,
-                 "Dst_count": dst_count},
-        attrs={
-            "pool_type": pool_type.upper(),
-            "out_size": 0 if out_size is None or out_size <= 0 else out_size
-        })
+    dst_count = helper.create_variable_for_type_inference(dtype="int32",
+                                                          stop_gradient=True)
+    helper.append_op(type="graph_send_recv",
+                     inputs={
+                         "X": x,
+                         "Src_index": src_index,
+                         "Dst_index": dst_index
+                     },
+                     outputs={
+                         "Out": out,
+                         "Dst_count": dst_count
+                     },
+                     attrs={
+                         "pool_type":
+                         pool_type.upper(),
+                         "out_size":
+                         0 if out_size is None or out_size <= 0 else out_size
+                     })
     return out
diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py
index 4ddcfbac8791f..6333ddafe1096 100644
--- a/python/paddle/incubate/operators/resnet_unit.py
+++ b/python/paddle/incubate/operators/resnet_unit.py
@@ -45,11 +45,11 @@ def resnet_unit(x, filter_x, scale_x, bias_x, mean_x, var_x, z, filter_z,
     bn_param_dtype = fluid.core.VarDesc.VarType.FP32
     bit_mask_dtype = fluid.core.VarDesc.VarType.INT32
     out = helper.create_variable_for_type_inference(x.dtype)
-    bit_mask = helper.create_variable_for_type_inference(
-        dtype=bit_mask_dtype, stop_gradient=True)
+    bit_mask = helper.create_variable_for_type_inference(dtype=bit_mask_dtype,
+                                                         stop_gradient=True)
     # intermediate_out for x
-    conv_x = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
+    conv_x = helper.create_variable_for_type_inference(dtype=x.dtype,
+                                                       stop_gradient=True)
     saved_mean_x = helper.create_variable_for_type_inference(
         dtype=bn_param_dtype, stop_gradient=True)
     saved_invstd_x = helper.create_variable_for_type_inference(
@@ -57,8 +57,8 @@ def resnet_unit(x, filter_x, scale_x, bias_x, mean_x, var_x, z, filter_z,
     running_mean_x = mean_x
     running_var_x = var_x
     # intermediate_out for z
-    conv_z = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
+    conv_z = helper.create_variable_for_type_inference(dtype=x.dtype,
+                                                       stop_gradient=True)
     saved_mean_z = helper.create_variable_for_type_inference(
         dtype=bn_param_dtype, stop_gradient=True)
     saved_invstd_z = helper.create_variable_for_type_inference(
@@ -114,8 +114,10 @@ def resnet_unit(x, filter_x, scale_x, bias_x, mean_x, var_x, z, filter_z,
         'RunningVarZ': running_var_z,
     }
 
-    helper.append_op(
-        type='resnet_unit', inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type='resnet_unit',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
 
     return out
 
@@ -194,26 +196,23 @@ def _get_default_param_initializer(channels):
             attr=scale_x_attr,
             dtype=bn_param_dtype,
             default_initializer=I.Constant(1.0))
-        self.bias_x = self.create_parameter(
-            shape=bn_param_shape,
-            attr=bias_x_attr,
-            dtype=bn_param_dtype,
-            is_bias=True)
-        self.mean_x = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_mean_x_name,
-                initializer=I.Constant(0.0),
-                trainable=False),
-            shape=bn_param_shape,
-            dtype=bn_param_dtype)
+        self.bias_x = self.create_parameter(shape=bn_param_shape,
+                                            attr=bias_x_attr,
+                                            dtype=bn_param_dtype,
+                                            is_bias=True)
+        self.mean_x = self.create_parameter(attr=ParamAttr(
+            name=moving_mean_x_name,
+            initializer=I.Constant(0.0),
+            trainable=False),
+                                            shape=bn_param_shape,
+                                            dtype=bn_param_dtype)
         self.mean_x.stop_gradient = True
-        self.var_x = self.create_parameter(
-            attr=ParamAttr(
-                name=moving_var_x_name,
-                initializer=I.Constant(1.0),
-                trainable=False),
-            shape=bn_param_shape,
-            dtype=bn_param_dtype)
+        self.var_x = self.create_parameter(attr=ParamAttr(
+            name=moving_var_x_name,
+            initializer=I.Constant(1.0),
+            trainable=False),
+                                           shape=bn_param_shape,
+                                           dtype=bn_param_dtype)
         self.var_x.stop_gradient = True
         if has_shortcut:
             self.filter_z = self.create_parameter(
@@ -226,26 +225,23 @@ def _get_default_param_initializer(channels):
                 attr=scale_z_attr,
                 dtype=bn_param_dtype,
                 default_initializer=I.Constant(1.0))
-            self.bias_z = self.create_parameter(
-                shape=bn_param_shape,
-                attr=bias_z_attr,
-                dtype=bn_param_dtype,
-                is_bias=True)
-            self.mean_z = self.create_parameter(
-                attr=ParamAttr(
-                    name=moving_mean_z_name,
-                    initializer=I.Constant(0.0),
-                    trainable=False),
-                shape=bn_param_shape,
-                dtype=bn_param_dtype)
+            self.bias_z = self.create_parameter(shape=bn_param_shape,
+                                                attr=bias_z_attr,
+                                                dtype=bn_param_dtype,
+                                                is_bias=True)
+            self.mean_z = self.create_parameter(attr=ParamAttr(
+                name=moving_mean_z_name,
+                initializer=I.Constant(0.0),
+                trainable=False),
+                                                shape=bn_param_shape,
+                                                dtype=bn_param_dtype)
             self.mean_z.stop_gradient = True
-            self.var_z = self.create_parameter(
-                attr=ParamAttr(
-                    name=moving_var_z_name,
-                    initializer=I.Constant(1.0),
-                    trainable=False),
-                shape=bn_param_shape,
-                dtype=bn_param_dtype)
+            self.var_z = self.create_parameter(attr=ParamAttr(
+                name=moving_var_z_name,
+                initializer=I.Constant(1.0),
+                trainable=False),
+                                               shape=bn_param_shape,
+                                               dtype=bn_param_dtype)
             self.var_z.stop_gradient = True
         else:
             self.filter_z = None
@@ -258,11 +254,12 @@ def forward(self, x, z=None):
         if self._fuse_add and z is None:
             raise ValueError("z can not be None")
 
-        out = resnet_unit(
-            x, self.filter_x, self.scale_x, self.bias_x, self.mean_x,
-            self.var_x, z, self.filter_z, self.scale_z, self.bias_z,
-            self.mean_z, self.var_z, self._stride, self._stride_z,
-            self._padding, self._dilation, self._groups, self._momentum,
-            self._eps, self._data_format, self._fuse_add, self._has_shortcut,
-            self._use_global_stats, self._is_test, self._act)
+        out = resnet_unit(x, self.filter_x, self.scale_x, self.bias_x,
+                          self.mean_x, self.var_x, z, self.filter_z,
+                          self.scale_z, self.bias_z, self.mean_z, self.var_z,
+                          self._stride, self._stride_z, self._padding,
+                          self._dilation, self._groups, self._momentum,
+                          self._eps, self._data_format, self._fuse_add,
+                          self._has_shortcut, self._use_global_stats,
+                          self._is_test, self._act)
         return out
diff --git a/python/paddle/incubate/operators/softmax_mask_fuse.py b/python/paddle/incubate/operators/softmax_mask_fuse.py
index e9cd0e9ab61f8..1b70dfce6d0f1 100644
--- a/python/paddle/incubate/operators/softmax_mask_fuse.py
+++ b/python/paddle/incubate/operators/softmax_mask_fuse.py
@@ -63,9 +63,10 @@ def softmax_mask_fuse(x, mask, name=None):
         return out
     helper = LayerHelper('fused_softmax_mask', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='fused_softmax_mask',
-        inputs={'X': [x],
-                'Mask': [mask]},
-        outputs={'Out': [out]})
+    helper.append_op(type='fused_softmax_mask',
+                     inputs={
+                         'X': [x],
+                         'Mask': [mask]
+                     },
+                     outputs={'Out': [out]})
     return out
diff --git a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
index 5bd4b111b69e5..dda5981f5adba 100644
--- a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
+++ b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py
@@ -66,8 +66,7 @@ def softmax_mask_fuse_upper_triangle(x):
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='fused_softmax_mask_upper_triangle',
-        inputs={'X': [x]},
-        outputs={'Out': [out]})
+    helper.append_op(type='fused_softmax_mask_upper_triangle',
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]})
     return out
diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
index 4d40a477ffc07..4fddaff7ec959 100644
--- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py
+++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,6 +25,7 @@
 
 
 class DistributedFusedLamb(Optimizer):
+
     def __init__(self,
                  learning_rate=0.001,
                  lamb_weight_decay=0.01,
@@ -42,8 +43,9 @@ def __init__(self,
                  name=None):
         assert not framework._non_static_mode(
         ), "DistributedFusedLamb does not support dygraph mode"
-        super(DistributedFusedLamb, self).__init__(
-            learning_rate=learning_rate, grad_clip=None, name=name)
+        super(DistributedFusedLamb, self).__init__(learning_rate=learning_rate,
+                                                   grad_clip=None,
+                                                   name=name)
 
         self._beta1 = beta1
         self._beta2 = beta2
@@ -106,12 +108,11 @@ def _set_scale(self, scale):
 
     def _create_scale_from_constant(self, value):
         name = unique_name.generate('global_scale')
-        return layers.create_global_var(
-            name=name,
-            shape=[1],
-            dtype='float32',
-            value=float(value),
-            persistable=True)
+        return layers.create_global_var(name=name,
+                                        shape=[1],
+                                        dtype='float32',
+                                        value=float(value),
+                                        persistable=True)
 
     def _get_or_create_scale(self):
         if self._scale is None:
@@ -122,19 +123,17 @@ def _create_persistable_var(self, name=None, shape=[-1], dtype='float32'):
         startup_block = self.helper.startup_program.global_block()
         if name is not None:
             name = unique_name.generate(name)
-        startup_var = startup_block.create_var(
-            name=name,
-            shape=shape,
-            dtype=dtype,
-            persistable=True,
-            stop_gradient=True)
+        startup_var = startup_block.create_var(name=name,
+                                               shape=shape,
+                                               dtype=dtype,
+                                               persistable=True,
+                                               stop_gradient=True)
         main_block = self.helper.main_program.global_block()
-        main_var = main_block.create_var(
-            name=startup_var.name,
-            shape=startup_var.shape,
-            dtype=startup_var.dtype,
-            persistable=True,
-            stop_gradient=True)
+        main_var = main_block.create_var(name=startup_var.name,
+                                         shape=startup_var.shape,
+                                         dtype=startup_var.dtype,
+                                         persistable=True,
+                                         stop_gradient=True)
         return main_var
 
     def _get_parameter(self, name, scope=None):
@@ -174,10 +173,10 @@ def _apply_gradients_impl(self, params_grads):
 
         fp32_fused_param = self._create_persistable_var('fp32_fused_param')
         fp32_fused_grad = self._create_persistable_var('fp32_fused_grad')
-        fp16_fused_param = self._create_persistable_var(
-            'fp16_fused_param', dtype='float16')
-        fp16_fused_grad = self._create_persistable_var(
-            'fp16_fused_grad', dtype='float16')
+        fp16_fused_param = self._create_persistable_var('fp16_fused_param',
+                                                        dtype='float16')
+        fp16_fused_grad = self._create_persistable_var('fp16_fused_grad',
+                                                       dtype='float16')
 
         master_params = []
         for p, g in params_grads:
@@ -195,8 +194,8 @@ def _apply_gradients_impl(self, params_grads):
         param_info = self._create_persistable_var('param_info', dtype='int32')
         param_info.is_distributed = True
 
-        fused_offsets = self._create_persistable_var(
-            'fused_offsets', dtype='int32')
+        fused_offsets = self._create_persistable_var('fused_offsets',
+                                                     dtype='int32')
 
         fp32_partial_fused_offsets = self._create_persistable_var(
             'fp32_partial_fused_offsets', dtype='int32')
@@ -214,8 +213,8 @@ def _apply_gradients_impl(self, params_grads):
                 self._create_persistable_var('fp32_acc_fused_grad')
             ]
             fp16_acc_fused_grad = [
-                self._create_persistable_var(
-                    'fp16_acc_fused_grad', dtype='float16')
+                self._create_persistable_var('fp16_acc_fused_grad',
+                                             dtype='float16')
             ]
             acc_step = [self._create_persistable_var('acc_step', dtype='int64')]
         else:
@@ -239,49 +238,52 @@ def _apply_gradients_impl(self, params_grads):
 
         startup_block = self.helper.startup_program.global_block()
         for g in grads:
-            startup_block.create_var(
-                name=g.name,
-                type=g.type,
-                dtype=g.dtype,
-                persistable=g.persistable,
-                shape=g.shape)
-
-        startup_block.append_op(
-            type='distributed_fused_lamb_init',
-            inputs={
-                'Param': params,
-                'Grad': grads,
-            },
-            outputs={
-                'FP32FusedParam': [fp32_fused_param],
-                'FP32FusedGrad': [fp32_fused_grad],
-                'FP16FusedParam': [fp16_fused_param],
-                'FP16FusedGrad': [fp16_fused_grad],
-                'Moment1': [moment1],
-                'Moment2': [moment2],
-                'Beta1Pow': [beta1pow],
-                'Beta2Pow': [beta2pow],
-                'GlobalScale': [scale],
-                'ParamInfo': [param_info],
-                'ParamOut': params,
-                'MasterParamOut': master_params,
-                'GradOut': grads,
-                'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets],
-                'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets],
-                'FusedParamOffsets': [fused_offsets],
-                'ParamOrder': [param_order],
-                'Step': [step],
-            },
-            attrs={
-                'alignment': self._alignment,
-                'rank': rank,
-                'nranks': nranks,
-                'apply_weight_decay': apply_weight_decay,
-                'moment1': 0.0,
-                'moment2': 0.0,
-                'beta1': self._beta1,
-                'beta2': self._beta2,
-            })
+            startup_block.create_var(name=g.name,
+                                     type=g.type,
+                                     dtype=g.dtype,
+                                     persistable=g.persistable,
+                                     shape=g.shape)
+
+        startup_block.append_op(type='distributed_fused_lamb_init',
+                                inputs={
+                                    'Param': params,
+                                    'Grad': grads,
+                                },
+                                outputs={
+                                    'FP32FusedParam': [fp32_fused_param],
+                                    'FP32FusedGrad': [fp32_fused_grad],
+                                    'FP16FusedParam': [fp16_fused_param],
+                                    'FP16FusedGrad': [fp16_fused_grad],
+                                    'Moment1': [moment1],
+                                    'Moment2': [moment2],
+                                    'Beta1Pow': [beta1pow],
+                                    'Beta2Pow': [beta2pow],
+                                    'GlobalScale': [scale],
+                                    'ParamInfo': [param_info],
+                                    'ParamOut':
+                                    params,
+                                    'MasterParamOut':
+                                    master_params,
+                                    'GradOut':
+                                    grads,
+                                    'FP32ShardFusedParamOffsets':
+                                    [fp32_partial_fused_offsets],
+                                    'FP16ShardFusedParamOffsets':
+                                    [fp16_partial_fused_offsets],
+                                    'FusedParamOffsets': [fused_offsets],
+                                    'ParamOrder': [param_order],
+                                    'Step': [step],
+                                },
+                                attrs={
+                                    'alignment': self._alignment,
+                                    'rank': rank,
+                                    'nranks': nranks,
+                                    'apply_weight_decay': apply_weight_decay,
+                                    'moment1': 0.0,
+                                    'moment2': 0.0,
+                                    'beta1': self._beta1,
+                                    'beta2': self._beta2,
+                                })
 
         main_block = self.helper.main_program.global_block()
         self._create_global_learning_rate()
@@ -324,14 +326,19 @@ def _apply_gradients_impl(self, params_grads):
                 'Moment2Out': [moment2],
                 'Beta1PowOut': [beta1pow],
                 'Beta2PowOut': [beta2pow],
-                'ParamOut': params,
-                'GradOut': grads,
+                'ParamOut':
+                params,
+                'GradOut':
+                grads,
                 'FoundInf': [self._found_inf],
-                'FP32AccFusedGrad': fp32_acc_fused_grad,
-                'FP16AccFusedGrad': fp16_acc_fused_grad,
-                'AccStep': acc_step,
-                'StopUpdate': self._stop_update
-                if self._stop_update is not None else [],
+                'FP32AccFusedGrad':
+                fp32_acc_fused_grad,
+                'FP16AccFusedGrad':
+                fp16_acc_fused_grad,
+                'AccStep':
+                acc_step,
+                'StopUpdate':
+                self._stop_update if self._stop_update is not None else [],
                 'Step': [step],
             },
             attrs={
diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py
index 2065b3c1c94c0..8bf7b71c65aed 100644
--- a/python/paddle/incubate/optimizer/functional/bfgs.py
+++ b/python/paddle/incubate/optimizer/functional/bfgs.py
@@ -91,8 +91,8 @@ def func(x):
 
     if dtype not in ['float32', 'float64']:
         raise ValueError(
-            "The dtype must be 'float32' or 'float64', but the specified is {}.".
-            format(dtype))
+            "The dtype must be 'float32' or 'float64', but the specified is {}."
+            .format(dtype))
 
     op_name = 'minimize_bfgs'
     check_input_type(initial_position, 'initial_position', op_name)
@@ -134,8 +134,8 @@ def body(k, done, is_converge, num_func_calls, xk, value, g1, Hk):
                 dtype=dtype)
         else:
             raise NotImplementedError(
-                "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'".
-                format(line_search_fn))
+                "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'"
+                .format(line_search_fn))
         num_func_calls += ls_func_calls
 
         #############    update Hk    #############
@@ -150,7 +150,9 @@ def body(k, done, is_converge, num_func_calls, xk, value, g1, Hk):
 
         rhok_inv = paddle.dot(yk, sk)
         rhok = paddle.static.nn.cond(
-            rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. / rhok_inv)
+            rhok_inv == 0.,
+            lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype),
+            lambda: 1. / rhok_inv)
 
         Vk_transpose = I - rhok * sk * yk.t()
         Vk = I - rhok * yk * sk.t()
@@ -162,8 +164,9 @@ def body(k, done, is_converge, num_func_calls, xk, value, g1, Hk):
         #############    check convergence    #############
         gnorm = paddle.linalg.norm(g1, p=np.inf)
         pk_norm = paddle.linalg.norm(pk, p=np.inf)
-        paddle.assign(done | (gnorm < tolerance_grad) |
-                      (pk_norm < tolerance_change), done)
+        paddle.assign(
+            done | (gnorm < tolerance_grad) | (pk_norm < tolerance_change),
+            done)
         paddle.assign(done, is_converge)
         # when alpha=0, there is no chance to get xk change.
         paddle.assign(done | (alpha == 0.), done)
diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py
index e15ad56dc2d11..d09ba5c6952e0 100644
--- a/python/paddle/incubate/optimizer/functional/lbfgs.py
+++ b/python/paddle/incubate/optimizer/functional/lbfgs.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -91,8 +91,8 @@ def func(x):
     """
     if dtype not in ['float32', 'float64']:
         raise ValueError(
-            "The dtype must be 'float32' or 'float64', but the specified is {}.".
-            format(dtype))
+            "The dtype must be 'float32' or 'float64', but the specified is {}."
+            .format(dtype))
 
     op_name = 'minimize_lbfgs'
     check_input_type(initial_position, 'initial_position', op_name)
@@ -114,8 +114,9 @@ def func(x):
     is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool')
     num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64')
 
-    history_size = paddle.full(
-        shape=[1], fill_value=history_size, dtype='int64')
+    history_size = paddle.full(shape=[1],
+                               fill_value=history_size,
+                               dtype='int64')
     head = paddle.full(shape=[1], fill_value=1, dtype='int64')
     tail = paddle.full(shape=[1], fill_value=0, dtype='int64')
 
@@ -140,8 +141,9 @@ def body(k, done, is_converge, num_func_calls, value, xk, g1, sk_vec,
         #############    compute p_k by two-loop recursion    #############
         q = paddle.assign(g1)
         # In a array circle, the index may out of range, so must use mod.
-        i = paddle.full(
-            shape=[1], fill_value=(head - 1).mod(history_size), dtype='int64')
+        i = paddle.full(shape=[1],
+                        fill_value=(head - 1).mod(history_size),
+                        dtype='int64')
 
         def cond(i, q):
             return i != tail
@@ -181,8 +183,8 @@ def body(i, r):
                 dtype=dtype)
         else:
             raise NotImplementedError(
-                "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'".
-                format(line_search_fn))
+                "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'"
+                .format(line_search_fn))
         paddle.assign(num_func_calls + ls_func_calls, num_func_calls)
 
         #############    update sk_vec, yk_vec, rhok_vec    #############
@@ -191,7 +193,9 @@ def body(i, r):
 
         rhok_inv = paddle.dot(yk, sk)
         rhok = paddle.static.nn.cond(
-            rhok_inv == 0., lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype), lambda: 1. / rhok_inv)
+            rhok_inv == 0.,
+            lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype),
+            lambda: 1. / rhok_inv)
 
         sk_vec[head] = sk
         yk_vec[head] = yk
@@ -211,8 +215,9 @@ def true_fn(tail):
         #############    check convergence    #############
         gnorm = paddle.linalg.norm(g1, p=np.inf)
         pk_norm = paddle.linalg.norm(pk, p=np.inf)
-        paddle.assign(done | (gnorm < tolerance_grad) |
-                      (pk_norm < tolerance_change), done)
+        paddle.assign(
+            done | (gnorm < tolerance_grad) | (pk_norm < tolerance_change),
+            done)
         paddle.assign(done, is_converge)
         # when alpha=0, there is no chance to get xk change.
         paddle.assign(done | (alpha == 0.), done)
@@ -222,11 +227,10 @@ def true_fn(tail):
             rhok_vec, head, tail
         ]
 
-    paddle.static.nn.while_loop(
-        cond=cond,
-        body=body,
-        loop_vars=[
-            k, done, is_converge, num_func_calls, value, xk, g1, sk_vec, yk_vec,
-            rhok_vec, head, tail
-        ])
+    paddle.static.nn.while_loop(cond=cond,
+                                body=body,
+                                loop_vars=[
+                                    k, done, is_converge, num_func_calls, value,
+                                    xk, g1, sk_vec, yk_vec, rhok_vec, head, tail
+                                ])
     return is_converge, num_func_calls, xk, value, g1
diff --git a/python/paddle/incubate/optimizer/functional/line_search.py b/python/paddle/incubate/optimizer/functional/line_search.py
index d42732e605ea0..3aacb137e6e47 100644
--- a/python/paddle/incubate/optimizer/functional/line_search.py
+++ b/python/paddle/incubate/optimizer/functional/line_search.py
@@ -31,8 +31,8 @@ def cubic_interpolation_(x1, f1, g1, x2, f2, g2):
     Returns:
         min_pos: the minimun point between the specified points in the cubic curve.
     """
-    xmin, xmax = paddle.static.nn.cond(x1 <= x2, lambda: (x1, x2),
-                                       lambda: (x2, x1))
+    xmin, xmax = paddle.static.nn.cond(x1 <= x2, lambda: (x1, x2), lambda:
+                                       (x2, x1))
     d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2)
     d2_square = d1**2 - g1 * g2
 
@@ -169,8 +169,8 @@ def body_zoom(j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi,
             aj = cubic_interpolation_(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
                                       derphi_hi)  # 21
             min_change = 0.1 * paddle.abs(a_hi - a_lo)
-            pred = paddle.minimum(
-                paddle.abs(aj - a_lo), paddle.abs(aj - a_hi)) < min_change
+            pred = paddle.minimum(paddle.abs(aj - a_lo),
+                                  paddle.abs(aj - a_hi)) < min_change
             aj = paddle.static.nn.cond(pred, lambda: 0.5 * (a_lo + a_hi),
                                        lambda: aj)
 
@@ -208,13 +208,12 @@ def true_fn():
                 derphi_hi
             ]
 
-        paddle.static.nn.while_loop(
-            cond=cond_zoom,
-            body=body_zoom,
-            loop_vars=[
-                j, done_zoom, a_lo, phi_lo, derphi_lo, derf_lo, a_hi, phi_hi,
-                derphi_hi
-            ])
+        paddle.static.nn.while_loop(cond=cond_zoom,
+                                    body=body_zoom,
+                                    loop_vars=[
+                                        j, done_zoom, a_lo, phi_lo, derphi_lo,
+                                        derf_lo, a_hi, phi_hi, derphi_hi
+                                    ])
         # j is the number of object function called in zoom.
         return j
 
@@ -253,8 +252,8 @@ def true_fn1():
             paddle.assign(derf_1, derf_star)
             paddle.assign(ls_func_calls + j, ls_func_calls)
 
-        pred1 = ~done & ((phi_2 > phi_0 + c1 * a2 * derphi_0) | (
-            (phi_2 >= phi_0) & (i > 1)))
+        pred1 = ~done & ((phi_2 > phi_0 + c1 * a2 * derphi_0) |
+                         ((phi_2 >= phi_0) & (i > 1)))
         paddle.assign(done | pred1, done)
         paddle.static.nn.cond(pred1, true_fn1, None)
 
diff --git a/python/paddle/incubate/optimizer/functional/utils.py b/python/paddle/incubate/optimizer/functional/utils.py
index 3000c82a71e87..d4f69a354918d 100644
--- a/python/paddle/incubate/optimizer/functional/utils.py
+++ b/python/paddle/incubate/optimizer/functional/utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -53,18 +53,19 @@ def raise_func():
     else:
 
         def create_tmp_var(program, name, dtype, shape):
-            return program.current_block().create_var(
-                name=name, dtype=dtype, shape=shape)
+            return program.current_block().create_var(name=name,
+                                                      dtype=dtype,
+                                                      shape=shape)
 
-        out_var = create_tmp_var(
-            paddle.static.default_main_program(),
-            name='output',
-            dtype='float32',
-            shape=[-1])
+        out_var = create_tmp_var(paddle.static.default_main_program(),
+                                 name='output',
+                                 dtype='float32',
+                                 shape=[-1])
 
         def false_fn():
-            paddle.static.nn.py_func(
-                func=raise_func, x=is_symmetric, out=out_var)
+            paddle.static.nn.py_func(func=raise_func,
+                                     x=is_symmetric,
+                                     out=out_var)
 
         paddle.static.nn.cond(is_symmetric, None, false_fn)
         # eigvals only support cpu
diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py
index 720a84a24f0aa..8f70f321c0db6 100644
--- a/python/paddle/incubate/optimizer/lookahead.py
+++ b/python/paddle/incubate/optimizer/lookahead.py
@@ -129,12 +129,11 @@ def __init__(self, inner_optimizer, alpha=0.5, k=5, name=None):
         else:
             parameters = self.inner_optimizer._parameter_list
 
-        super(LookAhead, self).__init__(
-            learning_rate=alpha,
-            parameters=parameters,
-            weight_decay=None,
-            grad_clip=None,
-            name=name)
+        super(LookAhead, self).__init__(learning_rate=alpha,
+                                        parameters=parameters,
+                                        weight_decay=None,
+                                        grad_clip=None,
+                                        name=name)
 
         self.alpha = alpha
         self.k = k
@@ -180,8 +179,9 @@ def step(self):
                 grad_var = param._grad_ivar()
                 params_grads.append((param, grad_var))
 
-        self._apply_optimize(
-            loss=None, startup_program=None, params_grads=params_grads)
+        self._apply_optimize(loss=None,
+                             startup_program=None,
+                             params_grads=params_grads)
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -198,16 +198,16 @@ def _increment_global_var(self):
                 dtype='int32',
                 persistable=True)
 
-        self.helper.append_op(
-            type='increment',
-            inputs={'X': [self._global_step_var]},
-            outputs={'Out': [self._global_step_var]},
-            attrs={'step': 1.0})
+        self.helper.append_op(type='increment',
+                              inputs={'X': [self._global_step_var]},
+                              outputs={'Out': [self._global_step_var]},
+                              attrs={'step': 1.0})
 
     def _append_optimize_op(self, block, param_and_grad):
         one_var = paddle.ones(shape=[1], dtype='int32', name='lookahead_ones')
-        zero_var = paddle.zeros(
-            shape=[1], dtype='int32', name='lookahead_zeros')
+        zero_var = paddle.zeros(shape=[1],
+                                dtype='int32',
+                                name='lookahead_zeros')
         k_var = layers.create_global_var(
             name=unique_name.generate("lookahead_k"),
             shape=[1],
@@ -291,7 +291,8 @@ def minimize(self,
 
         self._increment_global_var()
 
-        _ = self._apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        _ = self._apply_optimize(loss,
+                                 startup_program=startup_program,
+                                 params_grads=params_grads)
 
         return optimize_ops, params_grads
diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py
index c3ca6dc873156..361827ba48de2 100644
--- a/python/paddle/incubate/optimizer/modelaverage.py
+++ b/python/paddle/incubate/optimizer/modelaverage.py
@@ -168,12 +168,11 @@ def __init__(self,
                  min_average_window=10000,
                  max_average_window=10000,
                  name=None):
-        super(ModelAverage, self).__init__(
-            learning_rate=0.0,
-            parameters=parameters,
-            weight_decay=None,
-            grad_clip=None,
-            name=name)
+        super(ModelAverage, self).__init__(learning_rate=0.0,
+                                           parameters=parameters,
+                                           weight_decay=None,
+                                           grad_clip=None,
+                                           name=name)
 
         self.helper = LayerHelper(self.__class__.__name__)
         self.average_window = average_window_rate
@@ -208,12 +207,18 @@ def _create_accumulators(self, block, parameters):
             self._add_accumulator('sum_2', param)
             self._add_accumulator('sum_3', param)
             self._add_accumulator('restore', param)
-            self._add_accumulator(
-                'num_accumulates', param, dtype='int64', shape=[1])
-            self._add_accumulator(
-                'old_num_accumulates', param, dtype='int64', shape=[1])
-            self._add_accumulator(
-                'num_updates', param, dtype='int64', shape=[1])
+            self._add_accumulator('num_accumulates',
+                                  param,
+                                  dtype='int64',
+                                  shape=[1])
+            self._add_accumulator('old_num_accumulates',
+                                  param,
+                                  dtype='int64',
+                                  shape=[1])
+            self._add_accumulator('num_updates',
+                                  param,
+                                  dtype='int64',
+                                  shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -262,12 +267,11 @@ def _append_optimize_op(self, block, param_and_grad):
             "out_num_updates": num_updates,
         }
 
-        average_accumulates_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        average_accumulates_op = block.append_op(type=self.type,
+                                                 inputs=inputs,
+                                                 outputs=outputs,
+                                                 attrs=attrs,
+                                                 stop_gradient=True)
 
         return average_accumulates_op
 
@@ -425,8 +429,8 @@ def apply(self, executor=None, need_restore=True):
                 total_param = sum_1 + sum_2 + sum_3
                 total_accumulates = num_accumulates + old_num_accumulates
                 total_param = paddle.cast(total_param, dtype='float32')
-                total_accumulates = paddle.cast(
-                    total_accumulates, dtype='float32')
+                total_accumulates = paddle.cast(total_accumulates,
+                                                dtype='float32')
                 average_param = total_param / total_accumulates
                 paddle.assign(average_param, param)
             try:
diff --git a/python/paddle/incubate/passes/fuse_resnet_unit_pass.py b/python/paddle/incubate/passes/fuse_resnet_unit_pass.py
index 4b5dca6141879..451ea1908f910 100644
--- a/python/paddle/incubate/passes/fuse_resnet_unit_pass.py
+++ b/python/paddle/incubate/passes/fuse_resnet_unit_pass.py
@@ -22,30 +22,32 @@ def set_resnet_unit_attrs(resnet_unit, has_shortcut):
     resnet_unit.SetAttr("has_shortcut", has_shortcut)
     resnet_unit.SetAttr("data_format", 'NHWC')
     resnet_unit.SetAttr("dilation", 1)
-    resnet_unit.Attr("stride").MappedPattern(
-        op="conv2d", name="strides", element_index=0)
-    resnet_unit.Attr("padding").MappedPattern(
-        op="conv2d", name="paddings", element_index=0)
+    resnet_unit.Attr("stride").MappedPattern(op="conv2d",
+                                             name="strides",
+                                             element_index=0)
+    resnet_unit.Attr("padding").MappedPattern(op="conv2d",
+                                              name="paddings",
+                                              element_index=0)
     resnet_unit.Attr("group").MappedPattern(op="conv2d", name="groups")
     resnet_unit.Attr("op_device").MappedPattern(op="conv2d", name="op_device")
-    resnet_unit.Attr("op_namescope").MappedPattern(
-        op="conv2d", name="op_namescope")
+    resnet_unit.Attr("op_namescope").MappedPattern(op="conv2d",
+                                                   name="op_namescope")
     resnet_unit.Attr("momentum").MappedPattern(op="batch_norm", name="momentum")
     resnet_unit.Attr("epsilon").MappedPattern(op="batch_norm", name="epsilon")
-    resnet_unit.Attr("use_global_stats").MappedPattern(
-        op="batch_norm", name="use_global_stats")
+    resnet_unit.Attr("use_global_stats").MappedPattern(op="batch_norm",
+                                                       name="use_global_stats")
 
 
 def set_resnet_unit_outputs(resnet_unit, meanX, varX, meanZ=None, varZ=None):
-    resnet_unit.SetOutputs(
-        RunningMeanX=meanX,
-        RunningVarX=varX,
-        RunningMeanZ=meanZ,
-        RunningVarZ=varZ)
+    resnet_unit.SetOutputs(RunningMeanX=meanX,
+                           RunningVarX=varX,
+                           RunningMeanZ=meanZ,
+                           RunningVarZ=varZ)
 
 
 @ir.RegisterPass
 def fuse_resnet_unit():
+
     def pattern_conv_bn(x, filter, scale, bias, mean, var):
         filter.Attr("shape")[0].Mod(32).EQ(0)
         filter.Attr("shape")[1].Mod(8).EQ(0)
@@ -53,8 +55,11 @@ def pattern_conv_bn(x, filter, scale, bias, mean, var):
         filter.Attr("shape")[3].EQ(1)
         conv2d = ir.PassDesc.OP.conv2d(Input=x, Filter=filter)
         conv2d.SetAttr("data_format", 'NHWC')
-        bn = ir.PassDesc.OP.batch_norm(
-            X=conv2d, Bias=bias, Mean=mean, Scale=scale, Variance=var)
+        bn = ir.PassDesc.OP.batch_norm(X=conv2d,
+                                       Bias=bias,
+                                       Mean=mean,
+                                       Scale=scale,
+                                       Variance=var)
         return bn
 
     def pattern_one_input(x, filter, scale, bias, mean, var):
@@ -63,8 +68,12 @@ def pattern_one_input(x, filter, scale, bias, mean, var):
         return relu
 
     def replace_one_input(x, filter, scale, bias, mean, var):
-        resnet_unit = ir.PassDesc.OP.resnet_unit(
-            X=x, FilterX=filter, ScaleX=scale, BiasX=bias, MeanX=mean, VarX=var)
+        resnet_unit = ir.PassDesc.OP.resnet_unit(X=x,
+                                                 FilterX=filter,
+                                                 ScaleX=scale,
+                                                 BiasX=bias,
+                                                 MeanX=mean,
+                                                 VarX=var)
         set_resnet_unit_attrs(resnet_unit, False)
         set_resnet_unit_outputs(resnet_unit, mean, var)
         return resnet_unit.Output("Y")
@@ -73,26 +82,25 @@ def pattern_two_input(x, filterX, scaleX, biasX, meanX, varX, z, filterZ,
                           scaleZ, biasZ, meanZ, varZ):
         bnX = pattern_conv_bn(x, filterX, scaleX, biasX, meanX, varX)
         bnZ = pattern_conv_bn(x, filterZ, scaleZ, biasZ, meanZ, varZ)
-        ewadd = ir.PassDesc.OP.elementwise_add(
-            X=bnX.Output("Y"), Y=bnZ.Output("Y"))
+        ewadd = ir.PassDesc.OP.elementwise_add(X=bnX.Output("Y"),
+                                               Y=bnZ.Output("Y"))
         relu = ir.PassDesc.OP.relu(X=ewadd)
         return relu
 
     def replace_two_input(x, filterX, scaleX, biasX, meanX, varX, z, filterZ,
                           scaleZ, biasZ, meanZ, varZ):
-        resnet_unit = ir.PassDesc.OP.resnet_unit(
-            X=x,
-            FilterX=filterX,
-            ScaleX=scaleX,
-            BiasX=biasX,
-            MeanX=meanX,
-            VarX=varX,
-            Z=z,
-            FilterZ=filterZ,
-            ScaleZ=scaleZ,
-            BiasZ=biasZ,
-            MeanZ=meanZ,
-            VarZ=varZ)
+        resnet_unit = ir.PassDesc.OP.resnet_unit(X=x,
+                                                 FilterX=filterX,
+                                                 ScaleX=scaleX,
+                                                 BiasX=biasX,
+                                                 MeanX=meanX,
+                                                 VarX=varX,
+                                                 Z=z,
+                                                 FilterZ=filterZ,
+                                                 ScaleZ=scaleZ,
+                                                 BiasZ=biasZ,
+                                                 MeanZ=meanZ,
+                                                 VarZ=varZ)
         set_resnet_unit_attrs(resnet_unit, True)
         set_resnet_unit_outputs(resnet_unit, meanX, varX, meanZ, varZ)
         return resnet_unit.Output("Y")
diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/incubate/sparse/__init__.py
similarity index 69%
rename from python/paddle/sparse/layer/__init__.py
rename to python/paddle/incubate/sparse/__init__.py
index 8a814b514276f..c499c017a48e8 100644
--- a/python/paddle/sparse/layer/__init__.py
+++ b/python/paddle/incubate/sparse/__init__.py
@@ -12,10 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .unary import ReLU
-from .norm import BatchNorm
-from .conv import Conv3D
-from .conv import SubmConv3D
-from .pooling import MaxPool3D
+from .creation import sparse_coo_tensor
+from .creation import sparse_csr_tensor
 
-__all__ = []
+from .unary import sqrt
+from .unary import sin
+from .unary import tanh
+
+from . import nn
+
+__all__ = [
+    'sparse_coo_tensor',
+    'sparse_csr_tensor',
+    'sqrt',
+    'sin',
+    'tanh',
+]
diff --git a/python/paddle/sparse/creation.py b/python/paddle/incubate/sparse/creation.py
similarity index 89%
rename from python/paddle/sparse/creation.py
rename to python/paddle/incubate/sparse/creation.py
index 2cfbb3144acc2..74167a9527a49 100644
--- a/python/paddle/sparse/creation.py
+++ b/python/paddle/incubate/sparse/creation.py
@@ -14,11 +14,10 @@
 
 import paddle
 from paddle import _C_ops
-from ..framework import core, dygraph_only
-from ..framework import _current_expected_place, _get_paddle_place
-from ..tensor import to_tensor
-from ..tensor import max
-from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
+from paddle.fluid.framework import core, dygraph_only
+from paddle.fluid.framework import _current_expected_place, _get_paddle_place
+from paddle.tensor import to_tensor, max
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
 
 import numpy as np
 
@@ -49,8 +48,9 @@ def _get_place(place):
     place = _get_paddle_place(place)
     if place is None:
         place = _current_expected_place()
-    elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace,
-                                core.CUDAPlace)):
+    elif not isinstance(
+            place,
+        (core.Place, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace)):
         raise ValueError(
             "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace"
         )
@@ -112,7 +112,7 @@ def sparse_coo_tensor(indices,
             indices = [[0, 1, 2], [1, 2, 0]]
             values = [1.0, 2.0, 3.0]
             dense_shape = [3, 3]
-            coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape)
+            coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape)
             # print(coo)
             # Tensor(shape=[2, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
             #       indices=[[0, 1, 2],
@@ -123,8 +123,10 @@ def sparse_coo_tensor(indices,
     place = _get_place(place)
 
     if not isinstance(indices, core.eager.Tensor):
-        indices = to_tensor(
-            indices, dtype=None, place=place, stop_gradient=True)
+        indices = to_tensor(indices,
+                            dtype=None,
+                            place=place,
+                            stop_gradient=True)
     if not isinstance(values, core.eager.Tensor):
         values = to_tensor(values, dtype, place, stop_gradient)
     if len(indices.shape) != 2:
@@ -137,8 +139,8 @@ def sparse_coo_tensor(indices,
 
     if nnz != values.shape[0]:
         raise ValueError(
-            "the indices and values must have same number of non-zero, but get {} and {}".
-            format(nnz, values.shape[0]))
+            "the indices and values must have same number of non-zero, but get {} and {}"
+            .format(nnz, values.shape[0]))
 
     dense_dim = len(values.shape) - 1
 
@@ -156,15 +158,16 @@ def sparse_coo_tensor(indices,
         shape = min_shape
     else:
         if shape < min_shape:
-            raise ValueError("the minimun shape required is {}, but get {}".
-                             format(min_shape, shape))
+            raise ValueError(
+                "the minimun shape required is {}, but get {}".format(
+                    min_shape, shape))
         if len(shape) != sparse_dim + dense_dim:
             raise ValueError(
-                "the number of dimensions(len(shape) must be sparse_dim({}) + dense_dim({}), but get {}".
-                format(sparse_dim, dense_dim, len(shape)))
+                "the number of dimensions(len(shape) must be sparse_dim({}) + dense_dim({}), but get {}"
+                .format(sparse_dim, dense_dim, len(shape)))
 
-    return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices,
-                                                              shape)
+    return _C_ops.final_state_sparse_create_sparse_coo_tensor(
+        values, indices, shape)
 
 
 #TODO: need to support shape is None
@@ -222,7 +225,7 @@ def sparse_csr_tensor(crows,
             cols = [1, 3, 2, 0, 1]
             values = [1, 2, 3, 4, 5]
             dense_shape = [3, 4]
-            csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape)
+            csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape)
             # print(csr)
             # Tensor(shape=[3, 4], dtype=paddle.int64, place=Place(gpu:0), stop_gradient=True,
             #       crows=[0, 2, 3, 5],
@@ -267,8 +270,8 @@ def sparse_csr_tensor(crows,
     if len(shape) == 2:
         if crows.shape[0] != shape[0] + 1:
             raise ValueError(
-                "The length({}) of crows must be equal to the rows({})+1 of matrix.".
-                format(crows.shape[0], shape[0]))
+                "The length({}) of crows must be equal to the rows({})+1 of matrix."
+                .format(crows.shape[0], shape[0]))
         if crows[0] != 0:
             raise ValueError("the 0th value of crows must be 0")
 
@@ -278,9 +281,9 @@ def sparse_csr_tensor(crows,
     else:
         if crows.shape[0] % (shape[0] + 1) != 0:
             raise ValueError(
-                "The length({}) of crows must be divisible the rows({})+1 of matrix.".
-                format(crows.shape[0], shape[0]))
-    # TODO(zkh2016): check whether the value in crows and cols is legal 
+                "The length({}) of crows must be divisible the rows({})+1 of matrix."
+                .format(crows.shape[0], shape[0]))
+    # TODO(zkh2016): check whether the value in crows and cols is legal
 
     return core.eager.sparse_csr_tensor(crows, cols, values, shape,
                                         stop_gradient)
diff --git a/python/paddle/sparse/__init__.py b/python/paddle/incubate/sparse/nn/__init__.py
similarity index 59%
rename from python/paddle/sparse/__init__.py
rename to python/paddle/incubate/sparse/nn/__init__.py
index 26a2f0cfadbe7..be4985e694b4b 100644
--- a/python/paddle/sparse/__init__.py
+++ b/python/paddle/incubate/sparse/nn/__init__.py
@@ -12,21 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .creation import sparse_coo_tensor
-from .creation import sparse_csr_tensor
-from .layer import ReLU
-from .layer import BatchNorm
+from . import functional
 
-from .layer import Conv3D
-from .layer import SubmConv3D
-
-from .layer import MaxPool3D
-
-from .functional import sqrt
-from .functional import sin
-from .functional import tanh
+from .layer.activation import ReLU
+from .layer.norm import BatchNorm
+from .layer.conv import Conv3D
+from .layer.conv import SubmConv3D
+from .layer.pooling import MaxPool3D
 
 __all__ = [
-    'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D',
-    'BatchNorm', 'MaxPool3D', 'sqrt', 'sin', 'tanh'
+    'ReLU',
+    'BatchNorm',
+    'Conv3D',
+    'SubmConv3D',
+    'MaxPool3D',
 ]
diff --git a/python/paddle/sparse/functional/__init__.py b/python/paddle/incubate/sparse/nn/functional/__init__.py
similarity index 76%
rename from python/paddle/sparse/functional/__init__.py
rename to python/paddle/incubate/sparse/nn/functional/__init__.py
index cfefa3ff4ff76..a16a8a8240a23 100644
--- a/python/paddle/sparse/functional/__init__.py
+++ b/python/paddle/incubate/sparse/nn/functional/__init__.py
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .unary import relu  # noqa: F401
-from .unary import tanh  # noqa: F401
-from .unary import sqrt  # noqa: F401
-from .unary import sin  # noqa: F401
 from .conv import conv3d  # noqa: F401
 from .conv import subm_conv3d  # noqa: F401
 from .pooling import max_pool3d  # noqa: F401
+from .activation import relu  # noqa: F401
 
-__all__ = ['relu', 'tanh', 'conv3d', 'subm_conv3d', 'max_pool3d', 'sqrt', 'sin']
+__all__ = [
+    'conv3d',
+    'subm_conv3d',
+    'max_pool3d',
+    'relu',
+]
diff --git a/python/paddle/incubate/sparse/nn/functional/activation.py b/python/paddle/incubate/sparse/nn/functional/activation.py
new file mode 100644
index 0000000000000..3396cc53cc479
--- /dev/null
+++ b/python/paddle/incubate/sparse/nn/functional/activation.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = []
+
+from paddle import _C_ops, in_dynamic_mode
+
+
+def relu(x, name=None):
+    """
+    Sparse ReLU; raises ``ValueError`` unless ``x`` is a sparse COO or CSR tensor.
+
+    .. math::
+
+        out = max(x, 0)
+
+    Parameters:
+        x (Tensor): The input Sparse Tensor with data type float32, float64.
+        name (str, optional): Name for the operation (optional, default is None).
+            Not used by this function body. See also :ref:`api_guide_Name`.
+
+    Returns:
+        A Sparse Tensor with the same data type and shape as ``x`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.fluid.framework import _test_eager_guard
+
+            with _test_eager_guard():
+                dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32')
+                sparse_x = dense_x.to_sparse_coo(1)
+                out = paddle.incubate.sparse.nn.functional.relu(sparse_x)
+    """
+
+    assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode"
+    # Only sparse COO / CSR tensors are dispatched; other inputs raise ValueError.
+    if x.is_sparse_coo() or x.is_sparse_csr():
+        return _C_ops.final_state_sparse_relu(x)
+    else:
+        raise ValueError(
+            "Currently, sparse.relu only support the input of SparseCooTensor or SparseCsrTensor"
+        )
diff --git a/python/paddle/sparse/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py
similarity index 94%
rename from python/paddle/sparse/functional/conv.py
rename to python/paddle/incubate/sparse/nn/functional/conv.py
index 42b7b49835cf0..75c0514da8e0e 100644
--- a/python/paddle/sparse/functional/conv.py
+++ b/python/paddle/incubate/sparse/nn/functional/conv.py
@@ -15,9 +15,9 @@
 __all__ = []
 
 from paddle import _C_ops, in_dynamic_mode
-from ...fluid.layers.utils import convert_to_list
-from ...fluid.layers.nn import elementwise_add
-from .. import sparse_coo_tensor
+from paddle.fluid.layers.utils import convert_to_list
+from paddle.fluid.layers.nn import elementwise_add
+from ...creation import sparse_coo_tensor
 from paddle.nn.functional.conv import _update_padding_nd
 
 
@@ -67,11 +67,10 @@ def _conv3d(x,
     if bias is not None:
         values = pre_bias.values()
         add_bias = elementwise_add(values, bias, axis=1)
-        return sparse_coo_tensor(
-            pre_bias.indices(),
-            add_bias,
-            shape=pre_bias.shape,
-            stop_gradient=pre_bias.stop_gradient)
+        return sparse_coo_tensor(pre_bias.indices(),
+                                 add_bias,
+                                 shape=pre_bias.shape,
+                                 stop_gradient=pre_bias.stop_gradient)
     else:
         return pre_bias
 
@@ -180,9 +179,9 @@ def conv3d(x,
               indices = paddle.to_tensor(indices, dtype='int32')
               values = paddle.to_tensor(values, dtype='float32')
               dense_shape = [1, 1, 3, 4, 1]
-              sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
+              sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
               weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32')
-              y = paddle.sparse.functional.conv3d(sparse_x, weight)
+              y = paddle.incubate.sparse.nn.functional.conv3d(sparse_x, weight)
               print(y.shape)
               # (1, 1, 1, 2, 1)
     """
@@ -295,9 +294,9 @@ def subm_conv3d(x,
               indices = paddle.to_tensor(indices, dtype='int32')
               values = paddle.to_tensor(values, dtype='float32')
               dense_shape = [1, 1, 3, 4, 1]
-              sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
+              sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
               weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32')
-              y = paddle.sparse.functional.subm_conv3d(sparse_x, weight)
+              y = paddle.incubate.sparse.nn.functional.subm_conv3d(sparse_x, weight)
               print(y.shape)
               #(1, 1, 3, 4, 1)
     """
diff --git a/python/paddle/sparse/functional/pooling.py b/python/paddle/incubate/sparse/nn/functional/pooling.py
similarity index 91%
rename from python/paddle/sparse/functional/pooling.py
rename to python/paddle/incubate/sparse/nn/functional/pooling.py
index ab5106b31689d..8ed4444e89c26 100644
--- a/python/paddle/sparse/functional/pooling.py
+++ b/python/paddle/incubate/sparse/nn/functional/pooling.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...fluid.layers import utils
+from paddle.fluid.layers import utils
 from paddle import _C_ops, in_dynamic_mode
 from paddle.nn.functional.pooling import _update_padding_nd
 
@@ -70,7 +70,7 @@ def max_pool3d(x,
                 kernel_sizes = [3, 3, 3]
                 paddings = [0, 0, 0]
                 strides = [1, 1, 1]
-                out = paddle.sparse.functional.max_pool3d(sparse_x, kernel_sizes, stride=strides, padding=paddings)
+                out = paddle.incubate.sparse.nn.functional.max_pool3d(sparse_x, kernel_sizes, stride=strides, padding=paddings)
                 #[1, 2, 2, 2, 3]
     """
 
@@ -87,8 +87,10 @@ def max_pool3d(x,
 
     channel_last = True
 
-    padding, padding_algorithm = _update_padding_nd(
-        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(padding,
+                                                    3,
+                                                    channel_last=channel_last,
+                                                    ceil_mode=ceil_mode)
 
     #TODO(zkh2016): remove the dependency on dilation from the backend
     dilation = [1, 1, 1]
diff --git a/python/paddle/sparse/layer/unary.py b/python/paddle/incubate/sparse/nn/layer/activation.py
similarity index 97%
rename from python/paddle/sparse/layer/unary.py
rename to python/paddle/incubate/sparse/nn/layer/activation.py
index ad0dbc1880782..75285eb11adc2 100644
--- a/python/paddle/sparse/layer/unary.py
+++ b/python/paddle/incubate/sparse/nn/layer/activation.py
@@ -44,7 +44,7 @@ class ReLU(Layer):
                 dense_x = paddle.to_tensor(x, dtype='float32')
                 sparse_dim = 2
                 sparse_x = dense_x.to_sparse_coo(sparse_dim)
-                relu = paddle.sparse.ReLU()
+                relu = paddle.incubate.sparse.nn.ReLU()
                 out = relu(sparse_x)
                 #out.values: [0., 2., 0., 4., 5.]
     """
diff --git a/python/paddle/sparse/layer/conv.py b/python/paddle/incubate/sparse/nn/layer/conv.py
similarity index 87%
rename from python/paddle/sparse/layer/conv.py
rename to python/paddle/incubate/sparse/nn/layer/conv.py
index ff421a06a1344..05309e5bbfe44 100644
--- a/python/paddle/sparse/layer/conv.py
+++ b/python/paddle/incubate/sparse/nn/layer/conv.py
@@ -16,13 +16,14 @@
 from .. import functional as F
 from paddle.nn import Layer
 from paddle.nn.initializer import Normal
-from ..functional.conv import _update_padding_nd
-from ...fluid.layers import utils
+from paddle.nn.functional.conv import _update_padding_nd
+from paddle.fluid.layers import utils
 
 __all__ = []
 
 
 class _Conv3D(Layer):
+
     def __init__(self,
                  in_channels,
                  out_channels,
@@ -86,16 +87,15 @@ def _get_default_param_initializer():
         self.bias = None
 
     def forward(self, x):
-        out = F.conv._conv3d(
-            x,
-            self.weight,
-            bias=self.bias,
-            stride=self._stride,
-            padding=self._updated_padding,
-            dilation=self._dilation,
-            groups=self._groups,
-            subm=self._subm,
-            data_format=self._data_format)
+        out = F.conv._conv3d(x,
+                             self.weight,
+                             bias=self.bias,
+                             stride=self._stride,
+                             padding=self._updated_padding,
+                             dilation=self._dilation,
+                             groups=self._groups,
+                             subm=self._subm,
+                             data_format=self._data_format)
         return out
 
     def extra_repr(self):
@@ -213,8 +213,8 @@ class Conv3D(_Conv3D):
             indices = paddle.to_tensor(indices, dtype='int32')
             values = paddle.to_tensor(values, dtype='float32')
             dense_shape = [1, 1, 3, 4, 1]
-            sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
-            conv = paddle.sparse.Conv3D(1, 1, (1, 3, 3))
+            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
+            conv = paddle.incubate.sparse.nn.Conv3D(1, 1, (1, 3, 3))
             y = conv(sparse_x)
             print(y.shape)
             # (1, 1, 1, 2, 1)
@@ -232,19 +232,18 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NDHWC"):
-        super(Conv3D, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            subm=False,
-            padding_mode=padding_mode,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(Conv3D, self).__init__(in_channels,
+                                     out_channels,
+                                     kernel_size,
+                                     stride=stride,
+                                     padding=padding,
+                                     dilation=dilation,
+                                     groups=groups,
+                                     subm=False,
+                                     padding_mode=padding_mode,
+                                     weight_attr=weight_attr,
+                                     bias_attr=bias_attr,
+                                     data_format=data_format)
 
 
 class SubmConv3D(_Conv3D):
@@ -346,8 +345,8 @@ class SubmConv3D(_Conv3D):
             dense_shape = [1, 1, 3, 4, 1]
             indices = paddle.to_tensor(indices, dtype='int32')
             values = paddle.to_tensor(values, dtype='float32')
-            sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
-            subm_conv = paddle.sparse.SubmConv3D(1, 1, (1, 3, 3))
+            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) 
+            subm_conv = paddle.incubate.sparse.nn.SubmConv3D(1, 1, (1, 3, 3))
             y = subm_conv(sparse_x)
             print(y.shape)
             # (1, 1, 3, 4, 1)
@@ -365,16 +364,15 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NDHWC"):
-        super(SubmConv3D, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            subm=True,
-            padding_mode=padding_mode,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(SubmConv3D, self).__init__(in_channels,
+                                         out_channels,
+                                         kernel_size,
+                                         stride=stride,
+                                         padding=padding,
+                                         dilation=dilation,
+                                         groups=groups,
+                                         subm=True,
+                                         padding_mode=padding_mode,
+                                         weight_attr=weight_attr,
+                                         bias_attr=bias_attr,
+                                         data_format=data_format)
diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py
similarity index 91%
rename from python/paddle/sparse/layer/norm.py
rename to python/paddle/incubate/sparse/nn/layer/norm.py
index 83b738a5dc354..4d4cf7df2f2e4 100644
--- a/python/paddle/sparse/layer/norm.py
+++ b/python/paddle/incubate/sparse/nn/layer/norm.py
@@ -100,7 +100,7 @@ class BatchNorm(paddle.nn.BatchNorm1D):
               x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32')
               dense_x = paddle.to_tensor(x_data) 
               sparse_x = dense_x.to_sparse_coo(4)
-              batch_norm = paddle.sparse.BatchNorm(channels)
+              batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels)
               batch_norm_out = batch_norm(sparse_x)
               print(batch_norm_out.shape)
               # [1, 6, 6, 6, 3]
@@ -115,15 +115,14 @@ def __init__(self,
                  data_format='NDHWC',
                  use_global_stats=None,
                  name=None):
-        super(BatchNorm, self).__init__(
-            num_features,
-            momentum=momentum,
-            epsilon=epsilon,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format,
-            use_global_stats=use_global_stats,
-            name=name)
+        super(BatchNorm, self).__init__(num_features,
+                                        momentum=momentum,
+                                        epsilon=epsilon,
+                                        weight_attr=weight_attr,
+                                        bias_attr=bias_attr,
+                                        data_format=data_format,
+                                        use_global_stats=use_global_stats,
+                                        name=name)
 
     def _check_data_format(self, input):
         if input != "NDHWC":
@@ -153,7 +152,7 @@ def forward(self, input):
             data_format='NC',
             use_global_stats=self._use_global_stats)
 
-        return paddle.sparse.sparse_coo_tensor(
+        return paddle.incubate.sparse.sparse_coo_tensor(
             input.indices(),
             batch_norm_out,
             shape=input.shape,
diff --git a/python/paddle/sparse/layer/pooling.py b/python/paddle/incubate/sparse/nn/layer/pooling.py
similarity index 91%
rename from python/paddle/sparse/layer/pooling.py
rename to python/paddle/incubate/sparse/nn/layer/pooling.py
index 9cfe463eed577..9fb67ecc0a6dd 100644
--- a/python/paddle/sparse/layer/pooling.py
+++ b/python/paddle/incubate/sparse/nn/layer/pooling.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -66,7 +66,7 @@ class MaxPool3D(Layer):
             with _test_eager_guard():
                 dense_x = paddle.randn((2, 3, 6, 6, 3))
                 sparse_x = dense_x.to_sparse_coo(4)
-                max_pool3d = paddle.sparse.MaxPool3D(
+                max_pool3d = paddle.incubate.sparse.nn.MaxPool3D(
                     kernel_size=3, data_format='NDHWC')
                 out = max_pool3d(sparse_x)
                 #shape=[2, 1, 2, 2, 3]
@@ -91,14 +91,13 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        return F.max_pool3d(
-            x,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            ceil_mode=self.ceil_mode,
-            data_format=self.data_format,
-            name=self.name)
+        return F.max_pool3d(x,
+                            kernel_size=self.ksize,
+                            stride=self.stride,
+                            padding=self.padding,
+                            ceil_mode=self.ceil_mode,
+                            data_format=self.data_format,
+                            name=self.name)
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
diff --git a/python/paddle/sparse/functional/unary.py b/python/paddle/incubate/sparse/unary.py
similarity index 75%
rename from python/paddle/sparse/functional/unary.py
rename to python/paddle/incubate/sparse/unary.py
index 550e6a2a39261..85e4088de7d78 100644
--- a/python/paddle/sparse/functional/unary.py
+++ b/python/paddle/incubate/sparse/unary.py
@@ -17,44 +17,6 @@
 from paddle import _C_ops, in_dynamic_mode
 
 
-def relu(x, name=None):
-    """
-    sparse relu activation, requiring x to be a sparse coo or sparse csr tensor.
-
-    .. math::
-
-        out = max(x, 0)
-
-    Parameters:
-        x (Tensor): The input Sparse Tensor with data type float32, float64.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
-
-    Returns:
-        A Sparse Tensor with the same data type and shape as ``x`` .
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            from paddle.fluid.framework import _test_eager_guard
-
-            with _test_eager_guard():
-                dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32')
-                sparse_x = dense_x.to_sparse_coo(1)
-                out = paddle.sparse.functional.relu(sparse_x) 
-    """
-
-    assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode"
-
-    if x.is_sparse_coo() or x.is_sparse_csr():
-        return _C_ops.final_state_sparse_relu(x)
-    else:
-        raise ValueError(
-            "Currently, sparse.relu only support the input of SparseCooTensor or SparseCsrTensor"
-        )
-
-
 def tanh(x, name=None):
     """
     sparse tanh activation, requiring x to be a sparse coo or sparse csr tensor.
@@ -80,7 +42,7 @@ def tanh(x, name=None):
             with _test_eager_guard():
                 dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32')
                 sparse_x = dense_x.to_sparse_coo(1)
-                out = paddle.sparse.tanh(sparse_x)
+                out = paddle.incubate.sparse.tanh(sparse_x)
     """
 
     assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode"
@@ -118,7 +80,7 @@ def sqrt(x, name=None):
             with _test_eager_guard():
                 dense_x = paddle.to_tensor([4, 0, 1], dtype='float32')
                 sparse_x = dense_x.to_sparse_coo(1)
-                out = paddle.sparse.sqrt(sparse_x)
+                out = paddle.incubate.sparse.sqrt(sparse_x)
     """
 
     assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode"
@@ -156,7 +118,7 @@ def sin(x, name=None):
             with _test_eager_guard():
                 dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32')
                 sparse_x = dense_x.to_sparse_coo(1)
-                out = paddle.sparse.sin(sparse_x)
+                out = paddle.incubate.sparse.sin(sparse_x)
     """
 
     assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode"
diff --git a/python/paddle/incubate/tensor/__init__.py b/python/paddle/incubate/tensor/__init__.py
index b585a0dd4d8bc..01dfab4482d66 100644
--- a/python/paddle/incubate/tensor/__init__.py
+++ b/python/paddle/incubate/tensor/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py
index 07dc7c1581fc4..7ce2e735b6f11 100644
--- a/python/paddle/incubate/tensor/math.py
+++ b/python/paddle/incubate/tensor/math.py
@@ -57,21 +57,25 @@ def segment_sum(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
-                                         "int64"), "segment_pool")
+    check_variable_and_dtype(data, "X",
+                             ("float32", "float64", "int32", "int64"),
+                             "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
     helper = LayerHelper("segment_sum", **locals())
     out = helper.create_variable_for_type_inference(dtype=data.dtype)
     summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype)
-    helper.append_op(
-        type="segment_pool",
-        inputs={"X": data,
-                "SegmentIds": segment_ids},
-        outputs={"Out": out,
-                 "SummedIds": summed_ids},
-        attrs={"pooltype": "SUM"})
+    helper.append_op(type="segment_pool",
+                     inputs={
+                         "X": data,
+                         "SegmentIds": segment_ids
+                     },
+                     outputs={
+                         "Out": out,
+                         "SummedIds": summed_ids
+                     },
+                     attrs={"pooltype": "SUM"})
     return out
 
 
@@ -114,21 +118,25 @@ def segment_mean(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
-                                         "int64"), "segment_pool")
+    check_variable_and_dtype(data, "X",
+                             ("float32", "float64", "int32", "int64"),
+                             "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
     helper = LayerHelper("segment_mean", **locals())
     out = helper.create_variable_for_type_inference(dtype=data.dtype)
     summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype)
-    helper.append_op(
-        type="segment_pool",
-        inputs={"X": data,
-                "SegmentIds": segment_ids},
-        outputs={"Out": out,
-                 "SummedIds": summed_ids},
-        attrs={"pooltype": "MEAN"})
+    helper.append_op(type="segment_pool",
+                     inputs={
+                         "X": data,
+                         "SegmentIds": segment_ids
+                     },
+                     outputs={
+                         "Out": out,
+                         "SummedIds": summed_ids
+                     },
+                     attrs={"pooltype": "MEAN"})
     return out
 
 
@@ -171,21 +179,25 @@ def segment_min(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
-                                         "int64"), "segment_pool")
+    check_variable_and_dtype(data, "X",
+                             ("float32", "float64", "int32", "int64"),
+                             "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
     helper = LayerHelper("segment_min", **locals())
     out = helper.create_variable_for_type_inference(dtype=data.dtype)
     summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype)
-    helper.append_op(
-        type="segment_pool",
-        inputs={"X": data,
-                "SegmentIds": segment_ids},
-        outputs={"Out": out,
-                 "SummedIds": summed_ids},
-        attrs={"pooltype": "MIN"})
+    helper.append_op(type="segment_pool",
+                     inputs={
+                         "X": data,
+                         "SegmentIds": segment_ids
+                     },
+                     outputs={
+                         "Out": out,
+                         "SummedIds": summed_ids
+                     },
+                     attrs={"pooltype": "MIN"})
     return out
 
 
@@ -229,19 +241,23 @@ def segment_max(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
-                                         "int64"), "segment_pool")
+    check_variable_and_dtype(data, "X",
+                             ("float32", "float64", "int32", "int64"),
+                             "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
     helper = LayerHelper("segment_max", **locals())
     out = helper.create_variable_for_type_inference(dtype=data.dtype)
     summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype)
-    helper.append_op(
-        type="segment_pool",
-        inputs={"X": data,
-                "SegmentIds": segment_ids},
-        outputs={"Out": out,
-                 "SummedIds": summed_ids},
-        attrs={"pooltype": "MAX"})
+    helper.append_op(type="segment_pool",
+                     inputs={
+                         "X": data,
+                         "SegmentIds": segment_ids
+                     },
+                     outputs={
+                         "Out": out,
+                         "SummedIds": summed_ids
+                     },
+                     attrs={"pooltype": "MAX"})
     return out
diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py
index ec5295b6dfe56..670c2cc8e4a4e 100644
--- a/python/paddle/inference/__init__.py
+++ b/python/paddle/inference/__init__.py
@@ -26,16 +26,7 @@
 from ..fluid.inference import PredictorPool  # noqa: F401
 
 __all__ = [  # noqa
-    'Config',
-    'DataType',
-    'PlaceType',
-    'PrecisionType',
-    'Tensor',
-    'Predictor',
-    'create_predictor',
-    'get_version',
-    'get_trt_compile_version',
-    'get_trt_runtime_version',
-    'get_num_bytes_of_data_type',
-    'PredictorPool'
+    'Config', 'DataType', 'PlaceType', 'PrecisionType', 'Tensor', 'Predictor',
+    'create_predictor', 'get_version', 'get_trt_compile_version',
+    'get_trt_runtime_version', 'get_num_bytes_of_data_type', 'PredictorPool'
 ]
diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py
index 5781f78c6e4e4..87acda904b5da 100755
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define all functions about input & output in this directory 
+# TODO: define all functions about input & output in this directory
 
 from ..fluid.io import DataLoader  # noqa: F401
 from ..fluid.dataloader import Dataset  # noqa: F401
@@ -30,20 +30,9 @@
 from ..fluid.dataloader import Subset  # noqa: F401
 from ..fluid.dataloader import random_split  # noqa: F401
 
-__all__ = [ #noqa
-           'Dataset',
-           'IterableDataset',
-           'TensorDataset',
-           'ComposeDataset',
-           'ChainDataset',
-           'BatchSampler',
-           'DistributedBatchSampler',
-           'DataLoader',
-           'get_worker_info',
-           'Sampler',
-           'SequenceSampler',
-           'RandomSampler',
-           'WeightedRandomSampler',
-           'random_split',
-           'Subset'
+__all__ = [  #noqa
+    'Dataset', 'IterableDataset', 'TensorDataset', 'ComposeDataset',
+    'ChainDataset', 'BatchSampler', 'DistributedBatchSampler', 'DataLoader',
+    'get_worker_info', 'Sampler', 'SequenceSampler', 'RandomSampler',
+    'WeightedRandomSampler', 'random_split', 'Subset'
 ]
diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py
index a2af493faca11..7a31dad82e053 100644
--- a/python/paddle/jit/__init__.py
+++ b/python/paddle/jit/__init__.py
@@ -28,13 +28,6 @@
 from . import dy2static  # noqa: F401
 
 __all__ = [  # noqa
-    'save',
-    'load',
-    'TracedLayer',
-    'to_static',
-    'ProgramTranslator',
-    'TranslatedLayer',
-    'set_code_level',
-    'set_verbosity',
-    'not_to_static'
+    'save', 'load', 'TracedLayer', 'to_static', 'ProgramTranslator',
+    'TranslatedLayer', 'set_code_level', 'set_verbosity', 'not_to_static'
 ]
diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py
index 2f2ef4c6f5426..70fe075e57744 100644
--- a/python/paddle/metric/__init__.py
+++ b/python/paddle/metric/__init__.py
@@ -19,11 +19,6 @@
 from .metrics import Auc  # noqa: F401
 from .metrics import accuracy  # noqa: F401
 
-__all__ = [ #noqa
-    'Metric',
-    'Accuracy',
-    'Precision',
-    'Recall',
-    'Auc',
-    'accuracy'
+__all__ = [  #noqa
+    'Metric', 'Accuracy', 'Precision', 'Recall', 'Auc', 'accuracy'
 ]
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py
index d399cb2052498..4d28b68f99456 100644
--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
@@ -120,8 +120,9 @@ def reset(self):
         """
         Reset states and result
         """
-        raise NotImplementedError("function 'reset' not implemented in {}.".
-                                  format(self.__class__.__name__))
+        raise NotImplementedError(
+            "function 'reset' not implemented in {}.".format(
+                self.__class__.__name__))
 
     @abc.abstractmethod
     def update(self, *args):
@@ -135,8 +136,9 @@ def update(self, *args):
 
         see :code:`Metric.compute`
         """
-        raise NotImplementedError("function 'update' not implemented in {}.".
-                                  format(self.__class__.__name__))
+        raise NotImplementedError(
+            "function 'update' not implemented in {}.".format(
+                self.__class__.__name__))
 
     @abc.abstractmethod
     def accumulate(self):
@@ -152,8 +154,9 @@ def name(self):
         """
         Returns metric name
         """
-        raise NotImplementedError("function 'name' not implemented in {}.".
-                                  format(self.__class__.__name__))
+        raise NotImplementedError(
+            "function 'name' not implemented in {}.".format(
+                self.__class__.__name__))
 
     def compute(self, *args):
         """
@@ -256,8 +259,10 @@ def compute(self, pred, label, *args):
             Tensor: Correct mask, a tensor with shape [batch_size, d0, ..., topk].
         """
         pred = paddle.argsort(pred, descending=True)
-        pred = paddle.slice(
-            pred, axes=[len(pred.shape) - 1], starts=[0], ends=[self.maxk])
+        pred = paddle.slice(pred,
+                            axes=[len(pred.shape) - 1],
+                            starts=[0],
+                            ends=[self.maxk])
         if (len(label.shape) == 1) or \
            (len(label.shape) == 2 and label.shape[-1] == 1):
             # In static mode, the real label data shape may be different
@@ -812,16 +817,15 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None):
         correct = helper.create_variable_for_type_inference(dtype="int32")
     if total is None:
         total = helper.create_variable_for_type_inference(dtype="int32")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
+    helper.append_op(type="accuracy",
+                     inputs={
+                         "Out": [topk_out],
+                         "Indices": [topk_indices],
+                         "Label": [label]
+                     },
+                     outputs={
+                         "Accuracy": [acc_out],
+                         "Correct": [correct],
+                         "Total": [total],
+                     })
     return acc_out
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index b4be291b0697f..de416ca8093d7 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -157,11 +157,10 @@
 import paddle.utils.deprecated as deprecated
 
 
-@deprecated(
-    since="2.0.0",
-    update_to="paddle.nn.funcitional.diag_embed",
-    level=1,
-    reason="diag_embed in paddle.nn will be removed in future")
+@deprecated(since="2.0.0",
+            update_to="paddle.nn.funcitional.diag_embed",
+            level=1,
+            reason="diag_embed in paddle.nn will be removed in future")
 def diag_embed(*args):
     '''
         alias name of paddle.nn.functional.diag_embed
@@ -169,11 +168,10 @@ def diag_embed(*args):
     return functional.diag_embed(*args)
 
 
-@deprecated(
-    since="2.0.0",
-    update_to="paddle.nn.utils.remove_weight_norm",
-    level=1,
-    reason="remove_weight_norm in paddle.nn will be removed in future")
+@deprecated(since="2.0.0",
+            update_to="paddle.nn.utils.remove_weight_norm",
+            level=1,
+            reason="remove_weight_norm in paddle.nn will be removed in future")
 def remove_weight_norm(*args):
     '''
         alias name of paddle.nn.utils.remove_weight_norm
@@ -181,11 +179,10 @@ def remove_weight_norm(*args):
     return utils.remove_weight_norm(*args)
 
 
-@deprecated(
-    since="2.0.0",
-    update_to="paddle.nn.utils.weight_norm",
-    level=1,
-    reason="weight_norm in paddle.nn will be removed in future")
+@deprecated(since="2.0.0",
+            update_to="paddle.nn.utils.weight_norm",
+            level=1,
+            reason="weight_norm in paddle.nn will be removed in future")
 def weight_norm(*args):
     '''
         alias name of paddle.nn.utils.weight_norm
@@ -193,126 +190,126 @@ def weight_norm(*args):
     return utils.weight_norm(*args)
 
 
-__all__ = [     #noqa
-           'BatchNorm',
-           'CELU',
-           'GroupNorm',
-           'LayerNorm',
-           'SpectralNorm',
-           'BatchNorm1D',
-           'BatchNorm2D',
-           'BatchNorm3D',
-           'InstanceNorm1D',
-           'InstanceNorm2D',
-           'InstanceNorm3D',
-           'SyncBatchNorm',
-           'LocalResponseNorm',
-           'Embedding',
-           'Linear',
-           'Upsample',
-           'UpsamplingNearest2D',
-           'UpsamplingBilinear2D',
-           'Pad1D',
-           'Pad2D',
-           'Pad3D',
-           'CosineSimilarity',
-           'Dropout',
-           'Dropout2D',
-           'Dropout3D',
-           'Bilinear',
-           'AlphaDropout',
-           'Unfold',
-           'Fold',
-           'RNNCellBase',
-           'SimpleRNNCell',
-           'LSTMCell',
-           'GRUCell',
-           'RNN',
-           'BiRNN',
-           'SimpleRNN',
-           'LSTM',
-           'GRU',
-           'dynamic_decode',
-           'MultiHeadAttention',
-           'Maxout',
-           'Softsign',
-           'Transformer',
-           'MSELoss',
-           'LogSigmoid',
-           'BeamSearchDecoder',
-           'ClipGradByNorm',
-           'ReLU',
-           'PairwiseDistance',
-           'BCEWithLogitsLoss',
-           'SmoothL1Loss',
-           'MaxPool3D',
-           'AdaptiveMaxPool2D',
-           'Hardshrink',
-           'Softplus',
-           'KLDivLoss',
-           'AvgPool2D',
-           'L1Loss',
-           'LeakyReLU',
-           'AvgPool1D',
-           'AdaptiveAvgPool3D',
-           'AdaptiveMaxPool3D',
-           'NLLLoss',
-           'Conv1D',
-           'Sequential',
-           'Hardswish',
-           'Conv1DTranspose',
-           'AdaptiveMaxPool1D',
-           'TransformerEncoder',
-           'Softmax',
-           'Softmax2D',
-           'ParameterList',
-           'Conv2D',
-           'Softshrink',
-           'Hardtanh',
-           'TransformerDecoderLayer',
-           'CrossEntropyLoss',
-           'GELU',
-           'SELU',
-           'Silu',
-           'Conv2DTranspose',
-           'CTCLoss',
-           'ThresholdedReLU',
-           'AdaptiveAvgPool2D',
-           'MaxPool1D',
-           'Layer',
-           'TransformerDecoder',
-           'Conv3D',
-           'Tanh',
-           'Conv3DTranspose',
-           'Flatten',
-           'AdaptiveAvgPool1D',
-           'Tanhshrink',
-           'HSigmoidLoss',
-           'PReLU',
-           'TransformerEncoderLayer',
-           'AvgPool3D',
-           'MaxPool2D',
-           'MarginRankingLoss',
-           'LayerList',
-           'ClipGradByValue',
-           'BCELoss',
-           'Hardsigmoid',
-           'ClipGradByGlobalNorm',
-           'LogSoftmax',
-           'Sigmoid',
-           'Swish',
-           'Mish',
-           'PixelShuffle',
-           'PixelUnshuffle',
-           'ChannelShuffle',
-           'ELU',
-           'ReLU6',
-           'LayerDict',
-           'ZeroPad2D',
-           'MaxUnPool1D',
-           'MaxUnPool2D',
-           'MaxUnPool3D',
-           'HingeEmbeddingLoss',
-           'Identity',
-           'RReLU',
+__all__ = [  #noqa
+    'BatchNorm',
+    'CELU',
+    'GroupNorm',
+    'LayerNorm',
+    'SpectralNorm',
+    'BatchNorm1D',
+    'BatchNorm2D',
+    'BatchNorm3D',
+    'InstanceNorm1D',
+    'InstanceNorm2D',
+    'InstanceNorm3D',
+    'SyncBatchNorm',
+    'LocalResponseNorm',
+    'Embedding',
+    'Linear',
+    'Upsample',
+    'UpsamplingNearest2D',
+    'UpsamplingBilinear2D',
+    'Pad1D',
+    'Pad2D',
+    'Pad3D',
+    'CosineSimilarity',
+    'Dropout',
+    'Dropout2D',
+    'Dropout3D',
+    'Bilinear',
+    'AlphaDropout',
+    'Unfold',
+    'Fold',
+    'RNNCellBase',
+    'SimpleRNNCell',
+    'LSTMCell',
+    'GRUCell',
+    'RNN',
+    'BiRNN',
+    'SimpleRNN',
+    'LSTM',
+    'GRU',
+    'dynamic_decode',
+    'MultiHeadAttention',
+    'Maxout',
+    'Softsign',
+    'Transformer',
+    'MSELoss',
+    'LogSigmoid',
+    'BeamSearchDecoder',
+    'ClipGradByNorm',
+    'ReLU',
+    'PairwiseDistance',
+    'BCEWithLogitsLoss',
+    'SmoothL1Loss',
+    'MaxPool3D',
+    'AdaptiveMaxPool2D',
+    'Hardshrink',
+    'Softplus',
+    'KLDivLoss',
+    'AvgPool2D',
+    'L1Loss',
+    'LeakyReLU',
+    'AvgPool1D',
+    'AdaptiveAvgPool3D',
+    'AdaptiveMaxPool3D',
+    'NLLLoss',
+    'Conv1D',
+    'Sequential',
+    'Hardswish',
+    'Conv1DTranspose',
+    'AdaptiveMaxPool1D',
+    'TransformerEncoder',
+    'Softmax',
+    'Softmax2D',
+    'ParameterList',
+    'Conv2D',
+    'Softshrink',
+    'Hardtanh',
+    'TransformerDecoderLayer',
+    'CrossEntropyLoss',
+    'GELU',
+    'SELU',
+    'Silu',
+    'Conv2DTranspose',
+    'CTCLoss',
+    'ThresholdedReLU',
+    'AdaptiveAvgPool2D',
+    'MaxPool1D',
+    'Layer',
+    'TransformerDecoder',
+    'Conv3D',
+    'Tanh',
+    'Conv3DTranspose',
+    'Flatten',
+    'AdaptiveAvgPool1D',
+    'Tanhshrink',
+    'HSigmoidLoss',
+    'PReLU',
+    'TransformerEncoderLayer',
+    'AvgPool3D',
+    'MaxPool2D',
+    'MarginRankingLoss',
+    'LayerList',
+    'ClipGradByValue',
+    'BCELoss',
+    'Hardsigmoid',
+    'ClipGradByGlobalNorm',
+    'LogSoftmax',
+    'Sigmoid',
+    'Swish',
+    'Mish',
+    'PixelShuffle',
+    'PixelUnshuffle',
+    'ChannelShuffle',
+    'ELU',
+    'ReLU6',
+    'LayerDict',
+    'ZeroPad2D',
+    'MaxUnPool1D',
+    'MaxUnPool2D',
+    'MaxUnPool3D',
+    'HingeEmbeddingLoss',
+    'Identity',
+    'RReLU',
 ]
diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py
index e868cbdbacc17..61143175fd4af 100644
--- a/python/paddle/nn/clip.py
+++ b/python/paddle/nn/clip.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the functions to clip gradient of parameter  
+# TODO: define the functions to clip gradient of parameter
 from ..fluid.clip import ClipGradByGlobalNorm  # noqa: F401
 from ..fluid.clip import ClipGradByNorm  # noqa: F401
 from ..fluid.clip import ClipGradByValue  # noqa: F401
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index fa5a56c468620..5e4d0dd3558f5 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -124,110 +124,110 @@
 
 from .sparse_attention import sparse_attention
 
-__all__ = [     #noqa
-           'celu',
-           'conv1d',
-           'conv1d_transpose',
-           'conv2d',
-           'conv2d_transpose',
-           'conv3d',
-           'conv3d_transpose',
-           'elu',
-           'elu_',
-           'gelu',
-           'hardshrink',
-           'hardtanh',
-           'hardsigmoid',
-           'hardswish',
-           'leaky_relu',
-           'log_sigmoid',
-           'maxout',
-           'prelu',
-           'relu',
-           'relu_',
-           'relu6',
-           'selu',
-           'softmax',
-           'softmax_',
-           'softplus',
-           'softshrink',
-           'softsign',
-           'sigmoid',
-           'silu',
-           'swish',
-           'mish',
-           'tanh',
-           'tanh_',
-           'tanhshrink',
-           'thresholded_relu',
-           'log_softmax',
-           'glu',
-           'gumbel_softmax',
-           'diag_embed',
-           'sequence_mask',
-           'dropout',
-           'dropout2d',
-           'dropout3d',
-           'alpha_dropout',
-           'label_smooth',
-           'linear',
-           'pad',
-           'zeropad2d',
-           'unfold',
-           'interpolate',
-           'upsample',
-           'bilinear',
-           'cosine_similarity',
-           'avg_pool1d',
-           'avg_pool2d',
-           'avg_pool3d',
-           'max_pool1d',
-           'max_pool2d',
-           'max_pool3d',
-           'max_unpool1d',
-           'max_unpool2d',
-           'max_unpool3d',
-           'adaptive_avg_pool1d',
-           'adaptive_avg_pool2d',
-           'adaptive_avg_pool3d',
-           'adaptive_max_pool1d',
-           'adaptive_max_pool2d',
-           'adaptive_max_pool3d',
-           'binary_cross_entropy',
-           'binary_cross_entropy_with_logits',
-           'cross_entropy',
-           'dice_loss',
-           'hsigmoid_loss',
-           'kl_div',
-           'l1_loss',
-           'log_loss',
-           'mse_loss',
-           'margin_ranking_loss',
-           'nll_loss',
-           'npair_loss',
-           'sigmoid_focal_loss',
-           'smooth_l1_loss',
-           'softmax_with_cross_entropy',
-           'margin_cross_entropy',
-           'square_error_cost',
-           'ctc_loss',
-           'hinge_embedding_loss',
-           'affine_grid',
-           'grid_sample',
-           'local_response_norm',
-           'pixel_shuffle',
-           'pixel_unshuffle',
-           'channel_shuffle',
-           'embedding',
-           'gather_tree',
-           'one_hot',
-           'normalize',
-           'temporal_shift',
-           'batch_norm',
-           'layer_norm',
-           'instance_norm',
-           'class_center_sample',
-           'sparse_attention',
-           'fold',
-           'rrelu',
+__all__ = [  #noqa
+    'celu',
+    'conv1d',
+    'conv1d_transpose',
+    'conv2d',
+    'conv2d_transpose',
+    'conv3d',
+    'conv3d_transpose',
+    'elu',
+    'elu_',
+    'gelu',
+    'hardshrink',
+    'hardtanh',
+    'hardsigmoid',
+    'hardswish',
+    'leaky_relu',
+    'log_sigmoid',
+    'maxout',
+    'prelu',
+    'relu',
+    'relu_',
+    'relu6',
+    'selu',
+    'softmax',
+    'softmax_',
+    'softplus',
+    'softshrink',
+    'softsign',
+    'sigmoid',
+    'silu',
+    'swish',
+    'mish',
+    'tanh',
+    'tanh_',
+    'tanhshrink',
+    'thresholded_relu',
+    'log_softmax',
+    'glu',
+    'gumbel_softmax',
+    'diag_embed',
+    'sequence_mask',
+    'dropout',
+    'dropout2d',
+    'dropout3d',
+    'alpha_dropout',
+    'label_smooth',
+    'linear',
+    'pad',
+    'zeropad2d',
+    'unfold',
+    'interpolate',
+    'upsample',
+    'bilinear',
+    'cosine_similarity',
+    'avg_pool1d',
+    'avg_pool2d',
+    'avg_pool3d',
+    'max_pool1d',
+    'max_pool2d',
+    'max_pool3d',
+    'max_unpool1d',
+    'max_unpool2d',
+    'max_unpool3d',
+    'adaptive_avg_pool1d',
+    'adaptive_avg_pool2d',
+    'adaptive_avg_pool3d',
+    'adaptive_max_pool1d',
+    'adaptive_max_pool2d',
+    'adaptive_max_pool3d',
+    'binary_cross_entropy',
+    'binary_cross_entropy_with_logits',
+    'cross_entropy',
+    'dice_loss',
+    'hsigmoid_loss',
+    'kl_div',
+    'l1_loss',
+    'log_loss',
+    'mse_loss',
+    'margin_ranking_loss',
+    'nll_loss',
+    'npair_loss',
+    'sigmoid_focal_loss',
+    'smooth_l1_loss',
+    'softmax_with_cross_entropy',
+    'margin_cross_entropy',
+    'square_error_cost',
+    'ctc_loss',
+    'hinge_embedding_loss',
+    'affine_grid',
+    'grid_sample',
+    'local_response_norm',
+    'pixel_shuffle',
+    'pixel_unshuffle',
+    'channel_shuffle',
+    'embedding',
+    'gather_tree',
+    'one_hot',
+    'normalize',
+    'temporal_shift',
+    'batch_norm',
+    'layer_norm',
+    'instance_norm',
+    'class_center_sample',
+    'sparse_attention',
+    'fold',
+    'rrelu',
 ]
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index dd314868b69e2..aed8fbb0f58ce 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -71,11 +71,10 @@ def celu(x, alpha=1.0, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu')
     helper = LayerHelper("celu", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='celu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'alpha': alpha})
+    helper.append_op(type='celu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'alpha': alpha})
     return out
 
 
@@ -123,11 +122,10 @@ def elu(x, alpha=1.0, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu')
     helper = LayerHelper("elu", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='elu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'alpha': alpha})
+    helper.append_op(type='elu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'alpha': alpha})
     return out
 
 
@@ -190,11 +188,10 @@ def gelu(x, approximate=False, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu')
     helper = LayerHelper("gelu", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='gelu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'approximate': approximate})
+    helper.append_op(type='gelu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'approximate': approximate})
     return out
 
 
@@ -239,11 +236,10 @@ def hardshrink(x, threshold=0.5, name=None):
                              'hardshrink')
     helper = LayerHelper('hardshrink', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='hard_shrink',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold})
+    helper.append_op(type='hard_shrink',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'threshold': threshold})
     return out
 
 
@@ -291,12 +287,13 @@ def hardtanh(x, min=-1.0, max=1.0, name=None):
 
     helper = LayerHelper('hardtanh', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='brelu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'t_min': min,
-               't_max': max})
+    helper.append_op(type='brelu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         't_min': min,
+                         't_max': max
+                     })
     return out
 
 
@@ -346,12 +343,13 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None):
 
     helper = LayerHelper('hardsigmoid', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='hard_sigmoid',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'slope': slope,
-               'offset': offset})
+    helper.append_op(type='hard_sigmoid',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'slope': slope,
+                         'offset': offset
+                     })
     return out
 
 
@@ -449,11 +447,10 @@ def leaky_relu(x, negative_slope=0.01, name=None):
                              'leaky_relu')
     helper = LayerHelper('leaky_relu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='leaky_relu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'alpha': negative_slope})
+    helper.append_op(type='leaky_relu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'alpha': negative_slope})
     return out
 
 
@@ -540,13 +537,16 @@ def prelu(x, weight, data_format="NCHW", name=None):
 
     helper = LayerHelper('prelu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type="prelu",
-        inputs={"X": x,
-                "Alpha": weight},
-        outputs={"Out": out},
-        attrs={"mode": mode,
-               "data_format": data_format})
+    helper.append_op(type="prelu",
+                     inputs={
+                         "X": x,
+                         "Alpha": weight
+                     },
+                     outputs={"Out": out},
+                     attrs={
+                         "mode": mode,
+                         "data_format": data_format
+                     })
     return out
 
 
@@ -628,18 +628,18 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None):
 
     if not isinstance(lower, float) or not isinstance(upper, float):
         raise TypeError(
-            "The lower and upper values must be float type. Received: lower {}, upper {}.".
-            format(lower, upper))
+            "The lower and upper values must be float type. Received: lower {}, upper {}."
+            .format(lower, upper))
 
     if lower < 0 or lower > 1:
         raise ValueError(
-            "The lower value must be no less than zero or greater than one. Received: {}.".
-            format(lower))
+            "The lower value must be no less than zero or greater than one. Received: {}."
+            .format(lower))
 
     if upper < lower:
         raise ValueError(
-            "The upper value must be greater than lower value. Received: lower {}, upper {}.".
-            format(lower, upper))
+            "The upper value must be greater than lower value. Received: lower {}, upper {}."
+            .format(lower, upper))
 
     if upper > 1:
         raise ValueError(
@@ -657,12 +657,13 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None):
     out = helper.create_variable_for_type_inference(x.dtype)
     noise = helper.create_variable_for_type_inference(dtype=x.dtype)
     attrs = {'lower': lower, 'upper': upper, 'is_test': is_test}
-    helper.append_op(
-        type='rrelu',
-        inputs={"X": x},
-        outputs={"Out": out,
-                 "Noise": noise},
-        attrs=attrs)
+    helper.append_op(type='rrelu',
+                     inputs={"X": x},
+                     outputs={
+                         "Out": out,
+                         "Noise": noise
+                     },
+                     attrs=attrs)
     return out
 
 
@@ -822,12 +823,13 @@ def maxout(x, groups, axis=1, name=None):
 
     helper = LayerHelper('maxout', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='maxout',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'groups': groups,
-               'axis': axis})
+    helper.append_op(type='maxout',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'groups': groups,
+                         'axis': axis
+                     })
     return out
 
 
@@ -864,11 +866,10 @@ def relu6(x, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6')
     helper = LayerHelper('relu6', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='relu6',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold})
+    helper.append_op(type='relu6',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'threshold': threshold})
     return out
 
 
@@ -925,12 +926,13 @@ def selu(x,
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu')
     helper = LayerHelper('selu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='selu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'scale': scale,
-               'alpha': alpha})
+    helper.append_op(type='selu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'scale': scale,
+                         'alpha': alpha
+                     })
     return out
 
 
@@ -1104,27 +1106,30 @@ def softmax(x, axis=-1, dtype=None, name=None):
         check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                                  'softmax')
     else:
-        check_dtype(dtype, 'dtype', ['float32', 'float64'], 'softmax',
-                    'If dtype is not None, it only support float32 or float64.')
+        check_dtype(
+            dtype, 'dtype', ['float32', 'float64'], 'softmax',
+            'If dtype is not None, it only support float32 or float64.')
 
     helper = LayerHelper("softmax", **locals())
     outs_cast = x
     if dtype is not None:
         outs_cast = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type='cast',
-            inputs={'X': x},
-            outputs={'Out': outs_cast},
-            attrs={'in_dtype': x.dtype,
-                   'out_dtype': dtype})
+        helper.append_op(type='cast',
+                         inputs={'X': x},
+                         outputs={'Out': outs_cast},
+                         attrs={
+                             'in_dtype': x.dtype,
+                             'out_dtype': dtype
+                         })
 
     outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype)
-    helper.append_op(
-        type='softmax',
-        inputs={'X': outs_cast},
-        outputs={'Out': outs_softmax},
-        attrs={'axis': axis,
-               'use_cudnn': use_cudnn})
+    helper.append_op(type='softmax',
+                     inputs={'X': outs_cast},
+                     outputs={'Out': outs_softmax},
+                     attrs={
+                         'axis': axis,
+                         'use_cudnn': use_cudnn
+                     })
 
     return outs_softmax
 
@@ -1177,12 +1182,13 @@ def softplus(x, beta=1, threshold=20, name=None):
                              'softplus')
     helper = LayerHelper('softplus', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='softplus',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'beta': beta,
-               'threshold': threshold})
+    helper.append_op(type='softplus',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'beta': beta,
+                         'threshold': threshold
+                     })
     return out
 
 
@@ -1234,11 +1240,10 @@ def softshrink(x, threshold=0.5, name=None):
                              'softshrink')
     helper = LayerHelper('softshrink', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='softshrink',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'lambda': threshold})
+    helper.append_op(type='softshrink',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'lambda': threshold})
     return out
 
 
@@ -1313,11 +1318,10 @@ def swish(x, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish')
     helper = LayerHelper('swish', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='swish',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'beta': 1.0})
+    helper.append_op(type='swish',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'beta': 1.0})
     return out
 
 
@@ -1442,11 +1446,10 @@ def thresholded_relu(x, threshold=1.0, name=None):
                              'thresholded_relu')
     helper = LayerHelper('thresholded_relu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='thresholded_relu',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'threshold': threshold})
+    helper.append_op(type='thresholded_relu',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'threshold': threshold})
     return out
 
 
@@ -1520,26 +1523,27 @@ def log_softmax(x, axis=-1, dtype=None, name=None):
         check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                                  'log_softmax')
     else:
-        check_dtype(dtype, 'dtype', ['float32', 'float64'], 'log_softmax',
-                    'If dtype is not None, it only support float32 or float64.')
+        check_dtype(
+            dtype, 'dtype', ['float32', 'float64'], 'log_softmax',
+            'If dtype is not None, it only support float32 or float64.')
 
     helper = LayerHelper("log_softmax", **locals())
     out_cast = x
     if dtype is not None:
         out_cast = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type='cast',
-            inputs={'X': x},
-            outputs={'Out': out_cast},
-            attrs={'in_dtype': x.dtype,
-                   'out_dtype': dtype})
+        helper.append_op(type='cast',
+                         inputs={'X': x},
+                         outputs={'Out': out_cast},
+                         attrs={
+                             'in_dtype': x.dtype,
+                             'out_dtype': dtype
+                         })
 
     out = helper.create_variable_for_type_inference(out_cast.dtype)
-    helper.append_op(
-        type='log_softmax',
-        inputs={'X': out_cast},
-        outputs={'Out': out},
-        attrs={'axis': axis})
+    helper.append_op(type='log_softmax',
+                     inputs={'X': out_cast},
+                     outputs={'Out': out},
+                     attrs={'axis': axis})
 
     return out
 
@@ -1659,11 +1663,12 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None):
     helper = LayerHelper("gumbel_softmax", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'gumbel_softmax')
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='gumbel_softmax',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'temperature': temperature,
-               'hard': hard,
-               'axis': axis})
+    helper.append_op(type='gumbel_softmax',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={
+                         'temperature': temperature,
+                         'hard': hard,
+                         'axis': axis
+                     })
     return out
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 7fed1dbb487fa..e10a1c1069141 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -20,7 +20,7 @@
 from ...tensor.creation import zeros
 from paddle.static import Variable
 from ...fluid import dygraph_utils
-# TODO: define the common functions to build a neural network  
+# TODO: define the common functions to build a neural network
 from ...tensor.manipulation import squeeze
 from ...tensor.manipulation import unsqueeze
 from ...tensor import clip
@@ -157,16 +157,15 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
                                          dilations)
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="unfold",
-        inputs={"X": x},
-        outputs={"Y": out},
-        attrs={
-            "kernel_sizes": kernel_sizes,
-            "strides": strides,
-            "paddings": paddings,
-            "dilations": dilations
-        })
+    helper.append_op(type="unfold",
+                     inputs={"X": x},
+                     outputs={"Y": out},
+                     attrs={
+                         "kernel_sizes": kernel_sizes,
+                         "strides": strides,
+                         "paddings": paddings,
+                         "dilations": dilations
+                     })
     return out
 
 
@@ -517,8 +516,11 @@ def _is_list_or_turple_(data):
                         assert (isinstance(dim, int))
                         temp_out = helper.create_variable_for_type_inference(
                             'int32')
-                        fill_constant(
-                            [1], 'int32', dim, force_cpu=True, out=temp_out)
+                        fill_constant([1],
+                                      'int32',
+                                      dim,
+                                      force_cpu=True,
+                                      out=temp_out)
                         new_size_tensor.append(temp_out)
                         size_list.append(dim)
                 inputs['SizeTensor'] = new_size_tensor
@@ -603,11 +605,10 @@ def _is_list_or_turple_(data):
             out = _C_ops.bicubic_interp_v2(x, *dy_attr)
         return out
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='{}_interp_v2'.format(resample_type),
-        inputs=inputs,
-        outputs={"Out": out},
-        attrs=attrs)
+    helper.append_op(type='{}_interp_v2'.format(resample_type),
+                     inputs=inputs,
+                     outputs={"Out": out},
+                     attrs=attrs)
     return out
 
 
@@ -862,8 +863,9 @@ def bilinear(x1, x2, weight, bias=None, name=None):
     helper = LayerHelper("bilinear", **locals())
     out = helper.create_variable_for_type_inference(dtype=x1.dtype)
 
-    helper.append_op(
-        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out})
+    helper.append_op(type="bilinear_tensor_product",
+                     inputs=inputs,
+                     outputs={"Out": out})
 
     return out
 
@@ -1013,7 +1015,8 @@ def dropout(x,
         raise ValueError("p argument should between 0 and 1")
     if mode not in ('downscale_in_infer', 'upscale_in_train'):
         raise ValueError(
-            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
+        )
     if axis and not isinstance(axis, (int, list, tuple)):
         raise TypeError("datatype of axis argument should be int or list")
 
@@ -1030,10 +1033,11 @@ def dropout(x,
                     seed if seed is not None else 0, seed is not None)
 
                 return out
-            out, mask = _C_ops.dropout(
-                x, 'dropout_prob', p, 'is_test', not training, 'fix_seed',
-                seed is not None, 'seed', seed
-                if seed is not None else 0, 'dropout_implementation', mode)
+            out, mask = _C_ops.dropout(x, 'dropout_prob', p, 'is_test',
+                                       not training, 'fix_seed', seed
+                                       is not None, 'seed',
+                                       seed if seed is not None else 0,
+                                       'dropout_implementation', mode)
             return out
 
         helper = LayerHelper('dropout', **locals())
@@ -1058,12 +1062,13 @@ def get_attrs(prog, dropout_prob, is_test, seed):
 
         attrs = get_attrs(helper.main_program, p, not training, seed)
 
-        helper.append_op(
-            type='dropout',
-            inputs={'X': [x]},
-            outputs={'Out': [out],
-                     'Mask': [mask]},
-            attrs=attrs)
+        helper.append_op(type='dropout',
+                         inputs={'X': [x]},
+                         outputs={
+                             'Out': [out],
+                             'Mask': [mask]
+                         },
+                         attrs=attrs)
         return out
     else:  #sometimes called dropout_nd #TODO: optimize with c++
         if not in_dynamic_mode():
@@ -1087,8 +1092,8 @@ def get_attrs(prog, dropout_prob, is_test, seed):
                                  .format(len(input_shape), max(drop_axes)))
             if len(drop_axes) > len(input_shape):
                 raise ValueError(
-                    "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}".
-                    format(len(input_shape), len(drop_axes)))
+                    "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}"
+                    .format(len(input_shape), len(drop_axes)))
             mask_shape = [1] * len(input_shape)
             if not in_dynamic_mode():
                 for i in drop_axes:
@@ -1098,8 +1103,10 @@ def get_attrs(prog, dropout_prob, is_test, seed):
                     mask_shape[i] = input_shape[i]
 
             #get mask
-            random_tensor = paddle.uniform(
-                mask_shape, dtype='float32', min=0., max=1.0)
+            random_tensor = paddle.uniform(mask_shape,
+                                           dtype='float32',
+                                           min=0.,
+                                           max=1.0)
             p = full(shape=[1], fill_value=p, dtype='float32')
             keep_mask = paddle.greater_equal(random_tensor, p)
 
@@ -1159,13 +1166,12 @@ def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None):
             "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
             "Attr(data_format): %s." % str(data_format))
 
-    return dropout(
-        x,
-        p=p,
-        axis=[0, 1] if data_format == 'NCHW' else [0, 3],
-        training=training,
-        mode="upscale_in_train",
-        name=name)
+    return dropout(x,
+                   p=p,
+                   axis=[0, 1] if data_format == 'NCHW' else [0, 3],
+                   training=training,
+                   mode="upscale_in_train",
+                   name=name)
 
 
 def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None):
@@ -1213,13 +1219,12 @@ def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None):
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
             "Attr(data_format): %s." % str(data_format))
 
-    return dropout(
-        x,
-        p=p,
-        axis=[0, 1] if data_format == 'NCDHW' else [0, 4],
-        training=training,
-        mode="upscale_in_train",
-        name=name)
+    return dropout(x,
+                   p=p,
+                   axis=[0, 1] if data_format == 'NCDHW' else [0, 4],
+                   training=training,
+                   mode="upscale_in_train",
+                   name=name)
 
 
 def alpha_dropout(x, p=0.5, training=True, name=None):
@@ -1276,20 +1281,20 @@ def alpha_dropout(x, p=0.5, training=True, name=None):
         input_shape = x.shape
 
         #get mask
-        random_tensor = paddle.uniform(
-            input_shape, dtype='float32', min=0., max=1.0)
+        random_tensor = paddle.uniform(input_shape,
+                                       dtype='float32',
+                                       min=0.,
+                                       max=1.0)
         p = full(shape=[1], fill_value=p, dtype='float32')
         keep_mask = paddle.greater_equal(random_tensor, p)
         keep_mask = paddle.cast(keep_mask, dtype)
         drop_mask = paddle.subtract(
-            full(
-                shape=input_shape, fill_value=1., dtype=dtype), keep_mask)
+            full(shape=input_shape, fill_value=1., dtype=dtype), keep_mask)
 
         #apply mask
         b = full(shape=[1], fill_value=b, dtype=dtype)
         y = paddle.add(paddle.multiply(x, keep_mask),
-                       paddle.scale(
-                           drop_mask, scale=alpha_p))
+                       paddle.scale(drop_mask, scale=alpha_p))
         res = paddle.add(paddle.scale(y, scale=a), b, name=name)
         return res
     else:  # test
@@ -1419,8 +1424,8 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
 
     x_dim = len(x.shape)
 
-    if mode == "constant" and isinstance(pad, (
-            list, tuple)) and len(pad) == x_dim * 2:
+    if mode == "constant" and isinstance(
+            pad, (list, tuple)) and len(pad) == x_dim * 2:
         paddings = pad
         pad_value = value
         check_variable_and_dtype(x, 'x', [
@@ -1431,12 +1436,13 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
         helper = LayerHelper('pad', **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(
-            type='pad',
-            inputs={'X': x},
-            outputs={'Out': out},
-            attrs={'paddings': paddings,
-                   'pad_value': float(pad_value)})
+        helper.append_op(type='pad',
+                         inputs={'X': x},
+                         outputs={'Out': out},
+                         attrs={
+                             'paddings': paddings,
+                             'pad_value': float(pad_value)
+                         })
         return out
 
     assert x_dim in [
@@ -1521,8 +1527,10 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
 
             dtype = helper.input_dtype(input_param_name='input')
             out = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
+            helper.append_op(type='pad3d',
+                             inputs=inputs,
+                             outputs={"Out": out},
+                             attrs=attrs)
 
     if len(unsqueezed_dim) != 0:
         out = squeeze(out, axis=unsqueezed_dim)
@@ -1676,7 +1684,7 @@ def linear(x, weight, bias=None, name=None):
           #     [2.1077576  2.1077576  2.1077576  2.1077576 ]]
     """
     if in_dygraph_mode():
-        #TODO(jiabin): using addmm for fast forward route 
+        #TODO(jiabin): using addmm for fast forward route
         return _C_ops.final_state_linear(x, weight, bias)
     else:
         if _in_legacy_dygraph():
@@ -1699,19 +1707,19 @@ def linear(x, weight, bias=None, name=None):
             inputs = {'X': [x], 'Y': [weight]}
             attrs = {'trans_x': False, 'trans_y': False}
             tmp = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='matmul_v2',
-                inputs=inputs,
-                outputs={'Out': tmp},
-                attrs=attrs)
+            helper.append_op(type='matmul_v2',
+                             inputs=inputs,
+                             outputs={'Out': tmp},
+                             attrs=attrs)
             if bias is not None:
                 res = helper.create_variable_for_type_inference(dtype)
-                helper.append_op(
-                    type='elementwise_add',
-                    inputs={'X': [tmp],
-                            'Y': [bias]},
-                    outputs={'Out': [res]},
-                    attrs={'axis': len(x.shape) - 1})
+                helper.append_op(type='elementwise_add',
+                                 inputs={
+                                     'X': [tmp],
+                                     'Y': [bias]
+                                 },
+                                 outputs={'Out': [res]},
+                                 attrs={'axis': len(x.shape) - 1})
             else:
                 res = tmp
             return res
@@ -1791,12 +1799,13 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
     helper = LayerHelper("label_smooth", **locals())
     label.stop_gradient = True
     smooth_label = helper.create_variable_for_type_inference(label.dtype)
-    helper.append_op(
-        type="label_smooth",
-        inputs={"X": label,
-                "PriorDist": prior_dist} if prior_dist else {"X": label},
-        outputs={"Out": smooth_label},
-        attrs={"epsilon": float(epsilon)})
+    helper.append_op(type="label_smooth",
+                     inputs={
+                         "X": label,
+                         "PriorDist": prior_dist
+                     } if prior_dist else {"X": label},
+                     outputs={"Out": smooth_label},
+                     attrs={"epsilon": float(epsilon)})
     return smooth_label
 
 
@@ -1948,8 +1957,8 @@ class centers and the shape of sampled_class_center will be [num_positive_class_
     if in_dynamic_mode():
         remapped_label, sampled_class_center = _C_ops.class_center_sample(
             label, 'num_classes', num_classes, 'num_samples', num_samples,
-            'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed',
-            seed is not None, 'seed', seed if seed is not None else 0)
+            'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', seed
+            is not None, 'seed', seed if seed is not None else 0)
         return remapped_label, sampled_class_center
 
     check_variable_and_dtype(label, 'label', ['int64', 'int32'],
@@ -1960,22 +1969,21 @@ class centers and the shape of sampled_class_center will be [num_positive_class_
         dtype=label.dtype)
     sampled_class_center = helper.create_variable_for_type_inference(
         dtype=label.dtype)
-    helper.append_op(
-        type=op_type,
-        inputs={'Label': label},
-        outputs={
-            'RemappedLabel': remapped_label,
-            'SampledLocalClassCenter': sampled_class_center
-        },
-        attrs={
-            'num_classes': num_classes,
-            'num_samples': num_samples,
-            'ring_id': ring_id,
-            'nranks': nranks,
-            'rank': rank,
-            'fix_seed': seed is not None,
-            'seed': seed if seed is not None else 0
-        })
+    helper.append_op(type=op_type,
+                     inputs={'Label': label},
+                     outputs={
+                         'RemappedLabel': remapped_label,
+                         'SampledLocalClassCenter': sampled_class_center
+                     },
+                     attrs={
+                         'num_classes': num_classes,
+                         'num_samples': num_samples,
+                         'ring_id': ring_id,
+                         'nranks': nranks,
+                         'rank': rank,
+                         'fix_seed': seed is not None,
+                         'seed': seed if seed is not None else 0
+                     })
     return remapped_label, sampled_class_center
 
 
@@ -2099,15 +2107,14 @@ def _is_list_or_turple_(data):
                           paddings, "dilations", dilations)
     else:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(
-            type="fold",
-            inputs={"X": x},
-            outputs={"Y": out},
-            attrs={
-                "output_sizes": output_sizes,
-                "kernel_sizes": kernel_sizes,
-                "strides": strides,
-                "paddings": paddings,
-                "dilations": dilations
-            })
+        helper.append_op(type="fold",
+                         inputs={"X": x},
+                         outputs={"Y": out},
+                         attrs={
+                             "output_sizes": output_sizes,
+                             "kernel_sizes": kernel_sizes,
+                             "strides": strides,
+                             "paddings": paddings,
+                             "dilations": dilations
+                         })
     return out
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 419014daf64e4..26f07c2f9a11c 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -79,8 +79,8 @@ def _update_padding_nd(padding, channel_last, num_dims):
                     "Non-zero padding({}) in the batch or channel dimensions "
                     "is not supported.".format(padding))
             padding_algorithm = "EXPLICIT"
-            padding = _exclude_padding_in_batch_and_channel(padding,
-                                                            channel_last)
+            padding = _exclude_padding_in_batch_and_channel(
+                padding, channel_last)
             if _is_symmetric_padding(padding, num_dims):
                 padding = padding[0::2]
         # for padding like [pad_before, pad_after, pad_before, pad_after, ...]
@@ -101,8 +101,8 @@ def _update_padding_nd(padding, channel_last, num_dims):
         padding = convert_to_list(padding, num_dims, 'padding')
     if not all([p >= 0 for p in padding]):
         raise ValueError(
-            "Invalid padding, all value should be larger than or equal to 0, but received: {}".
-            format(padding))
+            "Invalid padding, all value should be larger than or equal to 0, but received: {}"
+            .format(padding))
     return padding, padding_algorithm
 
 
@@ -123,9 +123,10 @@ def _conv_nd(x,
 
     # Due to the poor performance of NHWC, we transpose the input to NCHW.
     if in_dygraph_mode() and op_type == "conv2d":
-        pre_bias = _C_ops.final_state_conv2d(
-            x, weight, stride, padding, padding_algorithm, groups, dilation,
-            data_format, False, -1, False)
+        pre_bias = _C_ops.final_state_conv2d(x, weight, stride, padding,
+                                             padding_algorithm, groups,
+                                             dilation, data_format, False, -1,
+                                             False)
         if bias is not None:
             channel_dim = channel_dim + len(
                 x.shape) if channel_dim < 0 else channel_dim
@@ -147,22 +148,23 @@ def _conv_nd(x,
             channel_dim = channel_dim + len(
                 x.shape) if channel_dim < 0 else channel_dim
             tmp_bias = _C_ops.final_state_reshape(
-                bias, bias.shape +
-                [1 for i in range(len(x.shape) - channel_dim - 1)])
+                bias,
+                bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)])
             return _C_ops.final_state_add(pre_bias, tmp_bias)
         else:
             return pre_bias
 
     if in_dygraph_mode() and op_type == "conv3d":
-        pre_bias = _C_ops.final_state_conv3d(
-            x, weight, stride, padding, padding_algorithm, groups, dilation,
-            data_format, False, -1, False)
+        pre_bias = _C_ops.final_state_conv3d(x, weight, stride, padding,
+                                             padding_algorithm, groups,
+                                             dilation, data_format, False, -1,
+                                             False)
         if bias is not None:
             channel_dim = channel_dim + len(
                 x.shape) if channel_dim < 0 else channel_dim
             tmp_bias = _C_ops.final_state_reshape(
-                bias, bias.shape +
-                [1 for i in range(len(x.shape) - channel_dim - 1)])
+                bias,
+                bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)])
             return _C_ops.final_state_add(pre_bias, tmp_bias)
         else:
             return pre_bias
@@ -197,17 +199,22 @@ def _conv_nd(x,
         dtype = helper.input_dtype(input_param_name='x')
         pre_bias = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [pre_bias]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
         if bias is not None:
             out = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [bias]},
-                outputs={'Out': [out]},
-                attrs={'axis': channel_dim,
-                       'use_mkldnn': use_mkldnn})
+            helper.append_op(type='elementwise_add',
+                             inputs={
+                                 'X': [pre_bias],
+                                 'Y': [bias]
+                             },
+                             outputs={'Out': [out]},
+                             attrs={
+                                 'axis': channel_dim,
+                                 'use_mkldnn': use_mkldnn
+                             })
         else:
             out = pre_bias
     return out
@@ -364,8 +371,8 @@ def conv1d(x,
                              x.shape, num_channels))
     if groups <= 0:
         raise ValueError(
-            "The groups of conv1d should be greater than 0. Received groups: {}".
-            format(groups))
+            "The groups of conv1d should be greater than 0. Received groups: {}"
+            .format(groups))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
@@ -386,8 +393,8 @@ def conv1d(x,
         padding = [0] + padding
     else:
         raise ValueError(
-            "The size of padding's dimension should be 1 or 2. But got padding={}".
-            format(padding))
+            "The size of padding's dimension should be 1 or 2. But got padding={}"
+            .format(padding))
     stride = [1] + convert_to_list(stride, 1, 'stride')
     dilation = [1] + convert_to_list(dilation, 1, 'dilation')
     weight = unsqueeze(weight, axis=[-2])
@@ -395,8 +402,8 @@ def conv1d(x,
     l_type = "conv2d"
 
     # When "groups==num_channels and num_filters% num_channels == 0" using depthwise_conv2d has better performance
-    if (is_compiled_with_cuda() and num_channels == groups and
-            num_channels != 1 and num_filters % num_channels == 0):
+    if (is_compiled_with_cuda() and num_channels == groups and num_channels != 1
+            and num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
         use_cudnn = False
 
@@ -437,8 +444,10 @@ def conv1d(x,
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [out]}
-        helper.append_op(
-            type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=l_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
     out = squeeze(out, axis=[squeeze_aixs])
@@ -591,8 +600,8 @@ def conv2d(x,
                              x.shape, num_channels))
     if groups <= 0:
         raise ValueError(
-            "The groups of conv2d should be greater than 0. Received groups: {}".
-            format(groups))
+            "The groups of conv2d should be greater than 0. Received groups: {}"
+            .format(groups))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
@@ -606,8 +615,8 @@ def conv2d(x,
 
     cudnn_version = get_cudnn_version()
 
-    use_cudnn = True if (is_compiled_with_cuda() and
-                         cudnn_version is not None) else False
+    use_cudnn = True if (is_compiled_with_cuda()
+                         and cudnn_version is not None) else False
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
@@ -615,8 +624,8 @@ def conv2d(x,
     dilation = convert_to_list(dilation, 2, 'dilation')
 
     l_type = "conv2d"
-    if (num_channels == groups and num_channels != 1 and
-            num_filters % num_channels == 0):
+    if (num_channels == groups and num_channels != 1
+            and num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
         if is_compiled_with_rocm():
             use_cudnn = True
@@ -624,9 +633,10 @@ def conv2d(x,
             use_cudnn = False
     else:
         if in_dygraph_mode():
-            pre_bias = _C_ops.final_state_conv2d(
-                x, weight, stride, padding, padding_algorithm, groups, dilation,
-                data_format, False, -1, False)
+            pre_bias = _C_ops.final_state_conv2d(x, weight, stride, padding,
+                                                 padding_algorithm, groups,
+                                                 dilation, data_format, False,
+                                                 -1, False)
             if bias is not None:
                 out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
                 return out
@@ -642,8 +652,8 @@ def conv2d(x,
         else:
             l_type = 'conv2d'
 
-    if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")[
-            "FLAGS_conv2d_disable_cudnn"]):
+    if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")
+        ["FLAGS_conv2d_disable_cudnn"]):
         use_cudnn = False
 
     return _conv_nd(x, weight, bias, stride, padding, padding_algorithm,
@@ -818,8 +828,8 @@ def conv1d_transpose(x,
                              x.shape, num_channels))
     if groups <= 0:
         raise ValueError(
-            "The groups of conv1d_transpose should be greater than 0. Received groups: {}".
-            format(groups))
+            "The groups of conv1d_transpose should be greater than 0. Received groups: {}"
+            .format(groups))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
@@ -862,13 +872,13 @@ def conv1d_transpose(x,
     if len(output_padding) > 0 and output_padding[0] > stride[0]:
         raise ValueError(
             "The size of output_padding should not be greater than stride."
-            "But got output_padding={} and stride={}".format(output_padding[0],
-                                                             stride[0]))
+            "But got output_padding={} and stride={}".format(
+                output_padding[0], stride[0]))
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
-    if (num_channels == groups and num_channels != 1 and num_filters == 1 and
-            not use_cudnn):
+    if (num_channels == groups and num_channels != 1 and num_filters == 1
+            and not use_cudnn):
         op_type = 'depthwise_conv2d_transpose'
         use_cudnn = False
 
@@ -905,8 +915,10 @@ def conv1d_transpose(x,
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [out]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
 
@@ -1079,8 +1091,8 @@ def conv2d_transpose(x,
                              x.shape, num_channels))
     if groups <= 0:
         raise ValueError(
-            "The groups of conv2d_transpose should be greater than 0. Received groups: {}".
-            format(groups))
+            "The groups of conv2d_transpose should be greater than 0. Received groups: {}"
+            .format(groups))
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
@@ -1089,8 +1101,8 @@ def conv2d_transpose(x,
 
     cudnn_version = get_cudnn_version()
 
-    use_cudnn = True if (is_compiled_with_cuda() and
-                         cudnn_version is not None) else False
+    use_cudnn = True if (is_compiled_with_cuda()
+                         and cudnn_version is not None) else False
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
@@ -1158,8 +1170,10 @@ def conv2d_transpose(x,
         helper = LayerHelper(op_type, **locals())
         pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
 
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
@@ -1301,22 +1315,22 @@ def conv3d(x,
             "Received: {}.".format(x.shape, num_channels))
     if groups <= 0:
         raise ValueError(
-            "The groups of conv3d should be greater than 0. Received groups: {}".
-            format(groups))
+            "The groups of conv3d should be greater than 0. Received groups: {}"
+            .format(groups))
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
-            "Received: number of channels({}), groups({}).".format(num_channels,
-                                                                   groups))
+            "Received: number of channels({}), groups({}).".format(
+                num_channels, groups))
     if num_filters % groups != 0:
         raise ValueError(
             "The number of filters must be divisible by Attr(groups). "
-            "Received: number of filters({}), groups({}).".format(num_filters,
-                                                                  groups))
+            "Received: number of filters({}), groups({}).".format(
+                num_filters, groups))
 
     cudnn_version = get_cudnn_version()
-    use_cudnn = True if (is_compiled_with_cuda() and
-                         cudnn_version is not None) else False
+    use_cudnn = True if (is_compiled_with_cuda()
+                         and cudnn_version is not None) else False
 
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
     stride = convert_to_list(stride, 3, 'stride')
@@ -1500,13 +1514,13 @@ def conv3d_transpose(x,
             "Received: {}.".format(x.shape, num_channels))
     if groups <= 0:
         raise ValueError(
-            "The groups of conv3d_transpose should be greater than 0. Received groups: {}".
-            format(groups))
+            "The groups of conv3d_transpose should be greater than 0. Received groups: {}"
+            .format(groups))
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
-            "Received: number of channels({}), groups({}).".format(num_channels,
-                                                                   groups))
+            "Received: number of channels({}), groups({}).".format(
+                num_channels, groups))
 
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
     stride = convert_to_list(stride, 3, 'stride')
@@ -1531,8 +1545,8 @@ def conv3d_transpose(x,
     cudnn_version = get_cudnn_version()
 
     #TODO(LielinJiang): whether to use cudnn according to the version of cudnn
-    use_cudnn = True if (is_compiled_with_cuda() and
-                         cudnn_version is not None) else False
+    use_cudnn = True if (is_compiled_with_cuda()
+                         and cudnn_version is not None) else False
 
     op_type = 'conv3d_transpose'
     data_format_ = "NHWC" if channel_last else "NCHW"
@@ -1576,8 +1590,10 @@ def conv3d_transpose(x,
         pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
 
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py
index 5a6bf4c0fa650..27bc2ef70bcee 100644
--- a/python/paddle/nn/functional/extension.py
+++ b/python/paddle/nn/functional/extension.py
@@ -135,13 +135,14 @@ def __check_input(input, offset, dim1, dim2):
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
 
-    helper.append_op(
-        type='diag_embed',
-        inputs={'Input': [input]},
-        attrs={'offset': offset,
-               'dim1': dim1,
-               'dim2': dim2},
-        outputs={'Out': [out]})
+    helper.append_op(type='diag_embed',
+                     inputs={'Input': [input]},
+                     attrs={
+                         'offset': offset,
+                         'dim1': dim1,
+                         'dim2': dim2
+                     },
+                     outputs={'Out': [out]})
     out.stop_gradient = True
     return out
 
@@ -230,8 +231,10 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
         else:
             attrs['maxlen'] = maxlen
 
-    helper.append_op(
-        type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs)
+    helper.append_op(type='sequence_mask',
+                     inputs=inputs,
+                     outputs={'Y': out},
+                     attrs=attrs)
 
     out.stop_gradient = True
     return out
@@ -311,11 +314,12 @@ def gather_tree(ids, parents):
                                      'gather_tree')
             out = helper.create_variable_for_type_inference(dtype=ids.dtype)
 
-            helper.append_op(
-                type="gather_tree",
-                inputs={"Ids": ids,
-                        "Parents": parents},
-                outputs={"Out": out})
+            helper.append_op(type="gather_tree",
+                             inputs={
+                                 "Ids": ids,
+                                 "Parents": parents
+                             },
+                             outputs={"Out": out})
 
             return out
 
@@ -371,13 +375,12 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
     if not isinstance(seg_num, int):
         raise TypeError("seg_num must be int type.")
 
-    helper.append_op(
-        type="temporal_shift",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={
-            "seg_num": seg_num,
-            "shift_ratio": shift_ratio,
-            "data_format": data_format
-        })
+    helper.append_op(type="temporal_shift",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs={
+                         "seg_num": seg_num,
+                         "shift_ratio": shift_ratio,
+                         "data_format": data_format
+                     })
     return out
diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py
index 92b3a7054d467..01a5f991f420e 100644
--- a/python/paddle/nn/functional/input.py
+++ b/python/paddle/nn/functional/input.py
@@ -20,6 +20,7 @@
 from paddle import _C_ops
 from paddle import in_dynamic_mode
 from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode
+
 __all__ = []
 
 
@@ -107,12 +108,11 @@ def one_hot(x, num_classes, name=None):
                 num_classes.stop_gradient = True
                 inputs = {'X': x, 'depth_tensor': num_classes}
                 attrs = {'allow_out_of_range': False}
-            helper.append_op(
-                type="one_hot_v2",
-                inputs=inputs,
-                attrs=attrs,
-                outputs={'Out': one_hot_out},
-                stop_gradient=True)
+            helper.append_op(type="one_hot_v2",
+                             inputs=inputs,
+                             attrs=attrs,
+                             outputs={'Out': one_hot_out},
+                             stop_gradient=True)
             return one_hot_out
 
 
@@ -203,9 +203,10 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
     if in_dygraph_mode():
         return _C_ops.final_state_embedding(x, weight, padding_idx, sparse)
     elif _in_legacy_dygraph():
-        return _C_ops.lookup_table_v2(
-            weight, x, 'is_sparse', sparse, 'is_distributed', False,
-            'remote_prefetch', False, 'padding_idx', padding_idx)
+        return _C_ops.lookup_table_v2(weight, x, 'is_sparse', sparse,
+                                      'is_distributed', False,
+                                      'remote_prefetch', False, 'padding_idx',
+                                      padding_idx)
     else:
         helper = LayerHelper('embedding', **locals())
         dtype = helper.input_dtype(input_param_name='weight')
@@ -219,15 +220,16 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
 
         tmp = helper.create_variable_for_type_inference(dtype)
 
-        helper.append_op(
-            type='lookup_table_v2',
-            inputs={'Ids': x,
-                    'W': weight},
-            outputs={'Out': tmp},
-            attrs={
-                'is_sparse': sparse,
-                'is_distributed': is_distributed,
-                'remote_prefetch': remote_prefetch,
-                'padding_idx': padding_idx
-            })
+        helper.append_op(type='lookup_table_v2',
+                         inputs={
+                             'Ids': x,
+                             'W': weight
+                         },
+                         outputs={'Out': tmp},
+                         attrs={
+                             'is_sparse': sparse,
+                             'is_distributed': is_distributed,
+                             'remote_prefetch': remote_prefetch,
+                             'padding_idx': padding_idx
+                         })
         return tmp
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index c0527a7a65201..e6a3fdb464caf 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -30,6 +30,7 @@
 from paddle import in_dynamic_mode
 from paddle.framework import core
 from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode, _current_expected_place
+
 __all__ = []
 
 
@@ -79,10 +80,10 @@ def dice_loss(input, label, epsilon=0.00001, name=None):
     assert label.dtype in (paddle.int32, paddle.int64)
     assert len(input.shape) >= 2, \
         "The rank of input should be greater than or equal to 2."
-    assert len(input.shape) == len(label.shape), (
-        "The rank of input and label should be equal, "
-        "but received input: %d, label: %d." %
-        (len(input.shape), len(label.shape)))
+    assert len(input.shape) == len(
+        label.shape), ("The rank of input and label should be equal, "
+                       "but received input: %d, label: %d." %
+                       (len(input.shape), len(label.shape)))
     assert label.shape[-1] == 1, ("The last dimension of label should be 1, "
                                   "but received %d." % label.shape[-1])
     assert input.shape[:-1] == label.shape[:-1], (
@@ -146,12 +147,13 @@ def log_loss(input, label, epsilon=1e-4, name=None):
 
     loss = helper.create_variable_for_type_inference(dtype=input.dtype)
 
-    helper.append_op(
-        type='log_loss',
-        inputs={'Predicted': [input],
-                'Labels': [label]},
-        outputs={'Loss': [loss]},
-        attrs={'epsilon': epsilon})
+    helper.append_op(type='log_loss',
+                     inputs={
+                         'Predicted': [input],
+                         'Labels': [label]
+                     },
+                     outputs={'Loss': [loss]},
+                     attrs={'epsilon': epsilon})
     return loss
 
 
@@ -291,12 +293,13 @@ def fluid_softmax_with_cross_entropy(logits,
     if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
         backprop = helper.create_variable_for_type_inference(dtype=logits.dtype)
         outputs['Backprop'] = backprop
-    helper.append_op(
-        type='softmax_with_cross_entropy',
-        inputs={'Logits': logits,
-                'Label': label},
-        outputs=outputs,
-        attrs=attrs)
+    helper.append_op(type='softmax_with_cross_entropy',
+                     inputs={
+                         'Logits': logits,
+                         'Label': label
+                     },
+                     outputs=outputs,
+                     attrs=attrs)
 
     if return_softmax:
         return loss, softmax
@@ -354,19 +357,22 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
     labels = paddle.reshape(labels, shape=[batch_size, 1])
     labels = paddle.tile(labels, repeat_times=[1, batch_size])
 
-    labels = paddle.equal(
-        labels, paddle.transpose(
-            labels, perm=[1, 0])).astype('float32')
+    labels = paddle.equal(labels, paddle.transpose(labels,
+                                                   perm=[1,
+                                                         0])).astype('float32')
     labels = labels / paddle.sum(labels, axis=1, keepdim=True)
 
     l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) \
              + paddle.mean(paddle.sum(paddle.square(positive), 1))
     l2loss = l2loss * Beta * l2_reg
 
-    similarity_matrix = paddle.matmul(
-        anchor, positive, transpose_x=False, transpose_y=True)
-    softmax_ce = fluid_softmax_with_cross_entropy(
-        logits=similarity_matrix, label=labels, soft_label=True)
+    similarity_matrix = paddle.matmul(anchor,
+                                      positive,
+                                      transpose_x=False,
+                                      transpose_y=True)
+    softmax_ce = fluid_softmax_with_cross_entropy(logits=similarity_matrix,
+                                                  label=labels,
+                                                  soft_label=True)
     cross_entropy = paddle.sum(labels * softmax_ce, 0)
     celoss = paddle.mean(cross_entropy)
 
@@ -418,16 +424,17 @@ def square_error_cost(input, label):
                              'square_error_cost')
     helper = LayerHelper('square_error_cost', **locals())
     minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='elementwise_sub',
-        inputs={'X': [input],
-                'Y': [label]},
-        outputs={'Out': [minus_out]})
+    helper.append_op(type='elementwise_sub',
+                     inputs={
+                         'X': [input],
+                         'Y': [label]
+                     },
+                     outputs={'Out': [minus_out]})
 
     square_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='square', inputs={'X': [minus_out]},
-        outputs={'Out': [square_out]})
+    helper.append_op(type='square',
+                     inputs={'X': [minus_out]},
+                     outputs={'Out': [square_out]})
     return square_out
 
 
@@ -513,18 +520,16 @@ def edit_distance(input,
         erased_input = helper.create_variable_for_type_inference(dtype="int64")
         erased_label = helper.create_variable_for_type_inference(dtype="int64")
 
-        helper.append_op(
-            type="sequence_erase",
-            inputs={"X": [input]},
-            outputs={"Out": [erased_input]},
-            attrs={"tokens": ignored_tokens})
+        helper.append_op(type="sequence_erase",
+                         inputs={"X": [input]},
+                         outputs={"Out": [erased_input]},
+                         attrs={"tokens": ignored_tokens})
         input = erased_input
 
-        helper.append_op(
-            type="sequence_erase",
-            inputs={"X": [label]},
-            outputs={"Out": [erased_label]},
-            attrs={"tokens": ignored_tokens})
+        helper.append_op(type="sequence_erase",
+                         inputs={"X": [label]},
+                         outputs={"Out": [erased_label]},
+                         attrs={"tokens": ignored_tokens})
         label = erased_label
 
     this_inputs = {"Hyps": [input], "Refs": [label]}
@@ -535,17 +540,21 @@ def edit_distance(input,
     # edit distance op
     edit_distance_out = helper.create_variable_for_type_inference(dtype="int64")
     sequence_num = helper.create_variable_for_type_inference(dtype="int64")
-    helper.append_op(
-        type="edit_distance",
-        inputs=this_inputs,
-        outputs={"Out": [edit_distance_out],
-                 "SequenceNum": [sequence_num]},
-        attrs={"normalized": normalized})
+    helper.append_op(type="edit_distance",
+                     inputs=this_inputs,
+                     outputs={
+                         "Out": [edit_distance_out],
+                         "SequenceNum": [sequence_num]
+                     },
+                     attrs={"normalized": normalized})
 
     return edit_distance_out, sequence_num
 
 
-def binary_cross_entropy(input, label, weight=None, reduction='mean',
+def binary_cross_entropy(input,
+                         label,
+                         weight=None,
+                         reduction='mean',
                          name=None):
     """
     This op measures the binary_cross_entropy loss between input predictions ``input``
@@ -650,13 +659,12 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean',
             sub_name = name if weight is None and reduction == 'none' else None
             helper = LayerHelper("binary_cross_entropy", name=sub_name)
             out = helper.create_variable_for_type_inference(dtype=input.dtype)
-            helper.append_op(
-                type='bce_loss',
-                inputs={
-                    'X': [input],
-                    'Label': [label],
-                },
-                outputs={'Out': [out]})
+            helper.append_op(type='bce_loss',
+                             inputs={
+                                 'X': [input],
+                                 'Label': [label],
+                             },
+                             outputs={'Out': [out]})
 
             if weight is not None:
                 if isinstance(weight, paddle.static.Variable):
@@ -765,16 +773,16 @@ def binary_cross_entropy_with_logits(logit,
 
     if _non_static_mode():
         if in_dygraph_mode():
-            one = _C_ops.final_state_full([1],
-                                          float(1.0), core.VarDesc.VarType.FP32,
+            one = _C_ops.final_state_full([1], float(1.0),
+                                          core.VarDesc.VarType.FP32,
                                           _current_expected_place())
             out = _C_ops.final_state_sigmoid_cross_entropy_with_logits(
                 logit, label, False, -100)
         else:
             one = _varbase_creator(dtype=logit.dtype)
-            _C_ops.fill_constant(one, 'value',
-                                 float(1.0), 'force_cpu', False, 'dtype',
-                                 one.dtype, 'str_value', '1.0', 'shape', [1])
+            _C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', False,
+                                 'dtype', one.dtype, 'str_value', '1.0',
+                                 'shape', [1])
             out = _C_ops.sigmoid_cross_entropy_with_logits(logit, label)
         if pos_weight is not None:
             log_weight = _C_ops.elementwise_add(
@@ -914,9 +922,11 @@ def hsigmoid_loss(input,
     """
 
     if _non_static_mode():
-        out, _, _ = _C_ops.hierarchical_sigmoid(
-            input, weight, label, path_table, path_code, bias, 'num_classes',
-            num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse)
+        out, _, _ = _C_ops.hierarchical_sigmoid(input, weight, label,
+                                                path_table, path_code, bias,
+                                                'num_classes', num_classes,
+                                                'is_sparse', is_sparse,
+                                                'remote_prefetch', is_sparse)
         return out
 
     check_variable_and_dtype(input, 'input', ['float32', 'float64'],
@@ -954,11 +964,10 @@ def hsigmoid_loss(input,
     pre_out = helper.create_variable_for_type_inference(input.dtype)
     outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
 
-    helper.append_op(
-        type="hierarchical_sigmoid",
-        inputs=inputs,
-        outputs=outputs,
-        attrs=attrs)
+    helper.append_op(type="hierarchical_sigmoid",
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
     return out
 
 
@@ -1033,13 +1042,16 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
             dtype=helper.input_dtype())
         out = helper.create_variable_for_type_inference(
             dtype=helper.input_dtype())
-        helper.append_op(
-            type='huber_loss',
-            inputs={'X': input,
-                    'Y': label},
-            outputs={'Out': out,
-                     'Residual': residual},
-            attrs={'delta': delta})
+        helper.append_op(type='huber_loss',
+                         inputs={
+                             'X': input,
+                             'Y': label
+                         },
+                         outputs={
+                             'Out': out,
+                             'Residual': residual
+                         },
+                         attrs={'delta': delta})
 
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
@@ -1148,25 +1160,24 @@ def margin_ranking_loss(input,
     result_out = helper.create_variable_for_type_inference(input.dtype)
 
     if reduction == 'none':
-        helper.append_op(
-            type="relu", inputs={"X": out}, outputs={"Out": result_out})
+        helper.append_op(type="relu",
+                         inputs={"X": out},
+                         outputs={"Out": result_out})
         return result_out
     elif reduction == 'sum':
         out = paddle.nn.functional.relu(out)
         attrs = {"dim": [0], "keep_dim": False, "reduce_all": True}
-        helper.append_op(
-            type="reduce_sum",
-            inputs={"X": out},
-            outputs={"Out": result_out},
-            attrs=attrs)
+        helper.append_op(type="reduce_sum",
+                         inputs={"X": out},
+                         outputs={"Out": result_out},
+                         attrs=attrs)
         return result_out
     elif reduction == 'mean':
         out = paddle.nn.functional.relu(out)
-        helper.append_op(
-            type="mean",
-            inputs={"X": out},
-            outputs={"Out": result_out},
-            attrs={})
+        helper.append_op(type="mean",
+                         inputs={"X": out},
+                         outputs={"Out": result_out},
+                         attrs={})
         return result_out
 
 
@@ -1233,8 +1244,11 @@ def l1_loss(input, label, reduction='mean', name=None):
             "received %s, which is not allowed." % reduction)
 
     if in_dygraph_mode():
-        unreduced = _elementwise_op_in_dygraph(
-            input, label, axis=-1, act='abs', op_name='elementwise_sub')
+        unreduced = _elementwise_op_in_dygraph(input,
+                                               label,
+                                               axis=-1,
+                                               act='abs',
+                                               op_name='elementwise_sub')
         if reduction == 'mean':
             return _C_ops.final_state_mean_all(unreduced)
         elif reduction == 'sum':
@@ -1243,8 +1257,11 @@ def l1_loss(input, label, reduction='mean', name=None):
         else:
             return unreduced
     elif in_dynamic_mode():
-        unreduced = _elementwise_op_in_dygraph(
-            input, label, axis=-1, act='abs', op_name='elementwise_sub')
+        unreduced = _elementwise_op_in_dygraph(input,
+                                               label,
+                                               axis=-1,
+                                               act='abs',
+                                               op_name='elementwise_sub')
         if reduction == 'mean':
             return _C_ops.mean(unreduced)
         elif reduction == 'sum':
@@ -1253,10 +1270,12 @@ def l1_loss(input, label, reduction='mean', name=None):
         else:
             return unreduced
 
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
-    check_variable_and_dtype(
-        label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
+    check_variable_and_dtype(input, 'input',
+                             ['float32', 'float64', 'int32', 'int64'],
+                             'l1_loss')
+    check_variable_and_dtype(label, 'label',
+                             ['float32', 'float64', 'int32', 'int64'],
+                             'l1_loss')
 
     if reduction == 'sum':
         unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs')
@@ -1265,8 +1284,10 @@ def l1_loss(input, label, reduction='mean', name=None):
         unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs')
         return paddle.mean(unreduced, name=name)
     else:
-        return paddle.fluid.layers.elementwise_sub(
-            input, label, act='abs', name=name)
+        return paddle.fluid.layers.elementwise_sub(input,
+                                                   label,
+                                                   act='abs',
+                                                   name=name)
 
 
 def nll_loss(input,
@@ -1328,8 +1349,8 @@ def nll_loss(input,
     input_shape = list(input.shape)
     input_dims = len(input_shape)
     if input_dims < 2:
-        raise ValueError('Expected 2 or more dimensions (got {})'.format(
-            input_dims))
+        raise ValueError(
+            'Expected 2 or more dimensions (got {})'.format(input_dims))
     n = input_shape[0]
     c = input_shape[1]
     if in_dygraph_mode():
@@ -1374,8 +1395,10 @@ def nll_loss(input,
     total_weight = helper.create_variable_for_type_inference(dtype=input.dtype)
     outputs = {'Out': out, 'Total_weight': total_weight}
 
-    helper.append_op(
-        type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type='nll_loss',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
     if input_dims != 2 and input_dims != 4 and reduction == 'none':
         out = reshape(out, shape=out_shape)
 
@@ -1489,12 +1512,13 @@ def kl_div(input, label, reduction='mean', name=None):
     fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div')
 
     loss = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type='kldiv_loss',
-        inputs={'X': input,
-                'Target': label},
-        outputs={'Loss': loss},
-        attrs={'reduction': 'none'})
+    helper.append_op(type='kldiv_loss',
+                     inputs={
+                         'X': input,
+                         'Target': label
+                     },
+                     outputs={'Loss': loss},
+                     attrs={'reduction': 'none'})
 
     if reduction == 'mean':
         loss = paddle.mean(loss)
@@ -1570,8 +1594,8 @@ def mse_loss(input, label, reduction='mean', name=None):
     if reduction == 'none':
         return paddle.square(paddle.subtract(input, label), name=name)
     elif reduction == 'mean':
-        return paddle.mean(
-            paddle.square(paddle.subtract(input, label)), name=name)
+        return paddle.mean(paddle.square(paddle.subtract(input, label)),
+                           name=name)
     else:
         return paddle.sum(paddle.square(paddle.subtract(input, label)),
                           name=name)
@@ -1924,22 +1948,25 @@ def margin_cross_entropy(logits,
     check_variable_and_dtype(label, 'label', ['int32', 'int64'],
                              'margin_cross_entropy')
 
-    helper.append_op(
-        type=op_type,
-        inputs={'Logits': logits,
-                'Label': label},
-        outputs={'Softmax': softmax,
-                 'Loss': loss},
-        attrs={
-            'return_softmax': return_softmax,
-            'ring_id': ring_id,
-            'rank': rank,
-            'nranks': nranks,
-            'margin1': margin1,
-            'margin2': margin2,
-            'margin3': margin3,
-            'scale': scale,
-        })
+    helper.append_op(type=op_type,
+                     inputs={
+                         'Logits': logits,
+                         'Label': label
+                     },
+                     outputs={
+                         'Softmax': softmax,
+                         'Loss': loss
+                     },
+                     attrs={
+                         'return_softmax': return_softmax,
+                         'ring_id': ring_id,
+                         'rank': rank,
+                         'nranks': nranks,
+                         'margin1': margin1,
+                         'margin2': margin2,
+                         'margin3': margin3,
+                         'scale': scale,
+                     })
 
     if reduction == 'mean':
         loss = paddle.mean(loss)
@@ -1956,9 +1983,9 @@ def margin_cross_entropy(logits,
     since="2.0.0",
     update_to="paddle.nn.functional.cross_entropy",
     level=1,
-    reason=(
-        'Please notice that behavior of "paddle.nn.functional.softmax_with_cross_entropy" '
-        'and "paddle.nn.functional.cross_entropy" is different.'))
+    reason=
+    ('Please notice that behavior of "paddle.nn.functional.softmax_with_cross_entropy" '
+     'and "paddle.nn.functional.cross_entropy" is different.'))
 def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
@@ -2247,8 +2274,8 @@ def cross_entropy(input,
 
     if _non_static_mode():
         if soft_label == False:
-            valid_label = paddle.cast(
-                label != ignore_index, dtype=label.dtype) * label
+            valid_label = paddle.cast(label != ignore_index,
+                                      dtype=label.dtype) * label
             label_min = paddle.min(valid_label)
             label_max = paddle.max(valid_label)
             if label_min < 0:
@@ -2281,11 +2308,11 @@ def cross_entropy(input,
                 # weight's shape is C, where C is class num.
                 # for 1d case: label's shape is [N,C], weight_gather's shape is N.
                 # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
-                weight_gather = paddle.matmul(
-                    x=paddle.cast(label, weight.dtype),
-                    y=weight,
-                    transpose_x=False,
-                    transpose_y=True)
+                weight_gather = paddle.matmul(x=paddle.cast(
+                    label, weight.dtype),
+                                              y=weight,
+                                              transpose_x=False,
+                                              transpose_y=True)
                 out_shape = list(out.shape)
                 weight_gather_reshape = reshape(weight_gather, shape=out_shape)
                 out = paddle.cast(out, weight_gather_reshape.dtype)
@@ -2318,8 +2345,8 @@ def cross_entropy(input,
                 weight_gather = _C_ops.elementwise_mul(weight_gather,
                                                        ignore_weight_mask)
                 input_shape = list(label.shape)
-                weight_gather_reshape = reshape(
-                    weight_gather, shape=input_shape)
+                weight_gather_reshape = reshape(weight_gather,
+                                                shape=input_shape)
                 out = paddle.cast(out, weight_gather_reshape.dtype)
                 out = _C_ops.elementwise_mul(out, weight_gather_reshape)
 
@@ -2390,12 +2417,13 @@ def cross_entropy(input,
     if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
         backprop = helper.create_variable_for_type_inference(dtype=input.dtype)
         outputs['Backprop'] = backprop
-    helper.append_op(
-        type='softmax_with_cross_entropy',
-        inputs={'Logits': input,
-                'Label': label},
-        outputs=outputs,
-        attrs=attrs)
+    helper.append_op(type='softmax_with_cross_entropy',
+                     inputs={
+                         'Logits': input,
+                         'Label': label
+                     },
+                     outputs=outputs,
+                     attrs=attrs)
 
     if weight is not None:
         check_variable_and_dtype(weight, 'weight', ['float32', 'float64'],
@@ -2407,11 +2435,10 @@ def cross_entropy(input,
             # weight's shape is C, where C is class num.
             # for 1d case: label's shape is [N,C], weight_gather's shape is N.
             # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
-            weight_gather = paddle.matmul(
-                x=paddle.cast(label, weight.dtype),
-                y=weight,
-                transpose_x=False,
-                transpose_y=True)
+            weight_gather = paddle.matmul(x=paddle.cast(label, weight.dtype),
+                                          y=weight,
+                                          transpose_x=False,
+                                          transpose_y=True)
 
             out_shape = list(out.shape)
             weight_gather_reshape = reshape(weight_gather, shape=out_shape)
@@ -2424,8 +2451,7 @@ def cross_entropy(input,
                                  .format(input.shape[axis], weight.shape[-1]))
 
             valid_label = paddle.multiply(
-                paddle.cast(
-                    label != ignore_index, dtype=label.dtype), label)
+                paddle.cast(label != ignore_index, dtype=label.dtype), label)
             ignore_weight_mask = paddle.cast((label != ignore_index),
                                              input.dtype)
             if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[
@@ -2567,14 +2593,14 @@ def sigmoid_focal_loss(logit,
         normalizer_dims = len(normalizer_shape)
         if normalizer_dims > 1:
             raise ValueError(
-                "Expected one dimension of normalizer in sigmoid_focal_loss but got {}.".
-                format(normalizer_dims))
+                "Expected one dimension of normalizer in sigmoid_focal_loss but got {}."
+                .format(normalizer_dims))
 
     if _non_static_mode():
         one = _varbase_creator(dtype=logit.dtype)
-        _C_ops.fill_constant(one, 'value',
-                             float(1.0), 'force_cpu', False, 'dtype', one.dtype,
-                             'str_value', '1.0', 'shape', logit.shape)
+        _C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', False,
+                             'dtype', one.dtype, 'str_value', '1.0', 'shape',
+                             logit.shape)
         if in_dygraph_mode():
             loss = _C_ops.final_state_sigmoid_cross_entropy_with_logits(
                 logit, label, False, -100)
@@ -2583,21 +2609,19 @@ def sigmoid_focal_loss(logit,
         pred = _C_ops.sigmoid(logit)
         p_t = _C_ops.elementwise_add(
             _C_ops.elementwise_mul(pred, label),
-            _C_ops.elementwise_mul(
-                _C_ops.elementwise_sub(one, pred),
-                _C_ops.elementwise_sub(one, label)))
+            _C_ops.elementwise_mul(_C_ops.elementwise_sub(one, pred),
+                                   _C_ops.elementwise_sub(one, label)))
 
         alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype)
         alpha_t = _C_ops.elementwise_add(
             _C_ops.elementwise_mul(alpha, label),
-            _C_ops.elementwise_mul(
-                _C_ops.elementwise_sub(one, alpha),
-                _C_ops.elementwise_sub(one, label)))
+            _C_ops.elementwise_mul(_C_ops.elementwise_sub(one, alpha),
+                                   _C_ops.elementwise_sub(one, label)))
         loss = _C_ops.elementwise_mul(alpha_t, loss)
 
         gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype)
-        gamma_t = _C_ops.elementwise_pow(
-            _C_ops.elementwise_sub(one, p_t), gamma)
+        gamma_t = _C_ops.elementwise_pow(_C_ops.elementwise_sub(one, p_t),
+                                         gamma)
         loss = _C_ops.elementwise_mul(gamma_t, loss)
 
         if normalizer is not None:
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index e719099b4b39d..7bc9f105cac1e 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -86,8 +86,8 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
 
     if _in_legacy_dygraph():
         eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype)
-        out = _C_ops.p_norm(x, 'axis', axis, 'porder',
-                            float(p), 'keepdim', True, 'epsilon', epsilon)
+        out = _C_ops.p_norm(x, 'axis', axis, 'porder', float(p), 'keepdim',
+                            True, 'epsilon', epsilon)
         return x / _C_ops.elementwise_max(out, eps)
 
     check_type(p, 'p', (float, int), 'normalize')
@@ -96,8 +96,8 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
                              'normalize')
     if len(x.shape) == 1 and axis != 0 and axis != -1:
         raise ValueError(
-            "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}".
-            format(axis))
+            "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}"
+            .format(axis))
 
     attrs = {
         'axis': axis,
@@ -107,8 +107,10 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
     }
     helper = LayerHelper('p_norm', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='p_norm',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs=attrs)
     eps = out.block.create_var(dtype=out.dtype)
     eps = paddle.full(shape=[1], fill_value=epsilon, dtype=out.dtype)
     return paddle.divide(x, paddle.maximum(out, eps), name=name)
@@ -192,8 +194,8 @@ def batch_norm(x,
             data_format, not training, use_global_stats, trainable_statistics,
             False)
 
-        return dygraph_utils._append_activation_in_dygraph(
-            batch_norm_out, act=None)
+        return dygraph_utils._append_activation_in_dygraph(batch_norm_out,
+                                                           act=None)
 
     elif _in_legacy_dygraph():
         # for dygraph need tuple
@@ -206,8 +208,8 @@ def batch_norm(x,
             x, weight, bias, running_mean, running_var, None, mean_out,
             variance_out, *attrs)
 
-        return dygraph_utils._append_activation_in_dygraph(
-            batch_norm_out, act=None)
+        return dygraph_utils._append_activation_in_dygraph(batch_norm_out,
+                                                           act=None)
 
     check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
                              'BatchNorm')
@@ -235,8 +237,8 @@ def batch_norm(x,
     helper = LayerHelper('batch_norm', **locals())
 
     param_dtype = x.dtype if x.dtype != 'float16' else 'float32'
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=param_dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(dtype=param_dtype,
+                                                           stop_gradient=True)
     saved_variance = helper.create_variable_for_type_inference(
         dtype=param_dtype, stop_gradient=True)
     batch_norm_out = helper.create_variable_for_type_inference(x.dtype)
@@ -255,8 +257,10 @@ def batch_norm(x,
             dtype=x.dtype, stop_gradient=True)
         outputs["ReserveSpace"] = [reserve_space]
 
-    helper.append_op(
-        type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type="batch_norm",
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
 
     return helper.append_activation(batch_norm_out)
 
@@ -315,8 +319,8 @@ def layer_norm(x,
         str_normalized_shape = str(normalized_shape)
         raise ValueError('Given normalized_shape is ' + str_normalized_shape +
                          ', expected input with shape [*, ' +
-                         str_normalized_shape[
-                             1:] + ', but got input shape ' + str(input_shape))
+                         str_normalized_shape[1:] + ', but got input shape ' +
+                         str(input_shape))
 
     if in_dygraph_mode():
         pre_act, _, _, = _C_ops.final_state_layer_norm(x, weight, bias, epsilon,
@@ -344,22 +348,23 @@ def layer_norm(x,
     helper = LayerHelper('layer_norm', **locals())
 
     dtype = x.dtype
-    mean_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
-    variance_out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=True)
+    mean_out = helper.create_variable_for_type_inference(dtype=dtype,
+                                                         stop_gradient=True)
+    variance_out = helper.create_variable_for_type_inference(dtype=dtype,
+                                                             stop_gradient=True)
     layer_norm_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type="layer_norm",
-        inputs=inputs,
-        outputs={
-            "Y": layer_norm_out,
-            "Mean": mean_out,
-            "Variance": variance_out,
-        },
-        attrs={"epsilon": epsilon,
-               "begin_norm_axis": begin_norm_axis})
+    helper.append_op(type="layer_norm",
+                     inputs=inputs,
+                     outputs={
+                         "Y": layer_norm_out,
+                         "Mean": mean_out,
+                         "Variance": variance_out,
+                     },
+                     attrs={
+                         "epsilon": epsilon,
+                         "begin_norm_axis": begin_norm_axis
+                     })
 
     return helper.append_activation(layer_norm_out)
 
@@ -407,8 +412,10 @@ def instance_norm(x,
           print(instance_norm_out)
 
     """
-
-    if in_dynamic_mode():
+    if in_dygraph_mode():
+        out, _, _, = _C_ops.final_state_instance_norm(x, weight, bias, eps)
+        return out
+    if _in_legacy_dygraph():
         out, _, _ = _C_ops.instance_norm(x, weight, bias, "epsilon", eps,
                                          "momentum", momentum, "data_format",
                                          data_format)
@@ -424,8 +431,8 @@ def instance_norm(x,
         inputs = {"X": [x]}
 
     helper = LayerHelper('instance_norm', **locals())
-    saved_mean = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(dtype=x.dtype,
+                                                           stop_gradient=True)
     saved_variance = helper.create_variable_for_type_inference(
         dtype=x.dtype, stop_gradient=True)
     instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
@@ -436,8 +443,10 @@ def instance_norm(x,
         "SavedVariance": [saved_variance]
     }
 
-    helper.append_op(
-        type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type="instance_norm",
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
     return instance_norm_out
 
 
@@ -514,8 +523,8 @@ def local_response_norm(x,
     for i, sz in enumerate(sizes):
         if not sz > 0 and i > 0:
             raise ValueError("Expected every dim's size to be larger than 0, "
-                             "but the size of the {}-th dim is {}".format(i,
-                                                                          sz))
+                             "but the size of the {}-th dim is {}".format(
+                                 i, sz))
 
     channel_last = True if data_format[-1] == "C" else False
 
@@ -536,24 +545,26 @@ def local_response_norm(x,
         pad4d_shape = [size // 2, (size - 1) // 2, 0, 0]
         pool2d_shape = (1, size)
         reshape_shape = [
-            sizes[0], 1, sizes[1], int(sum_sizes / (sizes[1] * sizes[-1])),
-            sizes[-1]
+            sizes[0], 1, sizes[1],
+            int(sum_sizes / (sizes[1] * sizes[-1])), sizes[-1]
         ]
         pad5d_shape = [size // 2, (size - 1) // 2, 0, 0, 0, 0]
         pool3d_shape = (1, 1, size)
 
     if dim == 3:
         div = paddle.nn.functional.pad(div, pad=pad4d_shape)
-        div = paddle.nn.functional.avg_pool2d(
-            div, kernel_size=pool2d_shape, stride=1)
+        div = paddle.nn.functional.avg_pool2d(div,
+                                              kernel_size=pool2d_shape,
+                                              stride=1)
         div = paddle.squeeze(div, axis=1)
     else:
         div = paddle.reshape(div, shape=reshape_shape)
         div = paddle.nn.functional.pad(div,
                                        pad=pad5d_shape,
                                        data_format='NCDHW')
-        div = paddle.nn.functional.avg_pool3d(
-            div, kernel_size=pool3d_shape, stride=1)
+        div = paddle.nn.functional.avg_pool3d(div,
+                                              kernel_size=pool3d_shape,
+                                              stride=1)
         div = paddle.reshape(paddle.squeeze(div, axis=1), sizes)
 
     div = paddle.scale(div, scale=alpha, bias=k)
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 6a573005f4514..f79a43fbc03a6 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -38,16 +38,18 @@ def _check_input(x, dimension):
 def _check_instance(x, x_name, types=(int, float)):
 
     if not isinstance(x, types):
-        raise ValueError("Excepted {} type for {} but received type: {}. ".
-                         format(types, x_name, type(x)))
+        raise ValueError(
+            "Excepted {} type for {} but received type: {}. ".format(
+                types, x_name, type(x)))
 
 
 def _check_value_limitation(x, x_name, min_limit=1e-3):
+
     def _check_value(x, x_name, min_limit=1e-3):
         if isinstance(x, int) and min_limit is not None and x < min_limit:
             raise ValueError(
-                "Excepted the input {} to be greater than {} but received x: {}. ".
-                format(x_name, min_limit, x))
+                "Excepted the input {} to be greater than {} but received x: {}. "
+                .format(x_name, min_limit, x))
 
     for ele in x:
         _check_value(ele, x_name)
@@ -118,8 +120,8 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
                     "Non-zero padding({}) in the batch or channel dimensions "
                     "is not supported.".format(padding))
             padding_algorithm = "EXPLICIT"
-            padding = _exclude_padding_in_batch_and_channel(padding,
-                                                            channel_last)
+            padding = _exclude_padding_in_batch_and_channel(
+                padding, channel_last)
             if utils._is_symmetric_padding(padding, num_dims):
                 padding = padding[0::2]
         # for padding like [pad_before, pad_after, pad_before, pad_after, ...]
@@ -149,8 +151,8 @@ def _expand_low_nd_padding(padding):
         padding = [0] + padding
     else:
         raise ValueError(
-            "The size of padding's dimmention should be 1 or 2. But got padding={}".
-            format(padding))
+            "The size of padding's dimmention should be 1 or 2. But got padding={}"
+            .format(padding))
     return padding
 
 
@@ -226,19 +228,22 @@ def avg_pool1d(x,
     _check_value_limitation(stride, "stride", min_limit=1e-3)
 
     channel_last = _channel_last("NCL", 1)
-    padding, padding_algorithm = _update_padding_nd(
-        padding, 1, channel_last=channel_last, ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(padding,
+                                                    1,
+                                                    channel_last=channel_last,
+                                                    ceil_mode=ceil_mode)
 
     # use 2d to implenment 1d should expand padding in advance.
     padding = _expand_low_nd_padding(padding)
 
     if in_dynamic_mode():
-        output = _C_ops.pool2d(
-            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
-            False, 'strides', stride, 'paddings', padding, 'padding_algorithm',
-            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-            'use_mkldnn', False, 'exclusive', exclusive, 'data_format',
-            data_format)
+        output = _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', kernel_size,
+                               'global_pooling', False, 'strides', stride,
+                               'paddings', padding, 'padding_algorithm',
+                               padding_algorithm, 'use_cudnn', True,
+                               'ceil_mode', ceil_mode, 'use_mkldnn', False,
+                               'exclusive', exclusive, 'data_format',
+                               data_format)
         return squeeze(output, [2])
 
     op_type = 'pool2d'
@@ -246,23 +251,22 @@ def avg_pool1d(x,
     dtype = helper.input_dtype(input_param_name='x')
     pool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": 'avg',
-            "ksize": kernel_size,
-            "global_pooling": False,
-            "strides": stride,
-            "paddings": padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": True,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": exclusive,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": x},
+                     outputs={"Out": pool_out},
+                     attrs={
+                         "pooling_type": 'avg',
+                         "ksize": kernel_size,
+                         "global_pooling": False,
+                         "strides": stride,
+                         "paddings": padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": True,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": exclusive,
+                         "data_format": data_format,
+                     })
 
     return squeeze(pool_out, [2])
 
@@ -343,21 +347,25 @@ def avg_pool2d(x,
     _check_value_limitation(stride, "stride", min_limit=1e-3)
 
     channel_last = _channel_last(data_format, 2)
-    padding, padding_algorithm = _update_padding_nd(
-        padding, 2, channel_last, ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(padding,
+                                                    2,
+                                                    channel_last,
+                                                    ceil_mode=ceil_mode)
 
     if in_dygraph_mode() or _in_legacy_dygraph():
         if in_dygraph_mode():
-            output = _C_ops.final_state_pool2d(
-                x, kernel_size, stride, padding, ceil_mode, exclusive,
-                data_format, 'avg', False, False, padding_algorithm)
+            output = _C_ops.final_state_pool2d(x, kernel_size, stride, padding,
+                                               ceil_mode, exclusive,
+                                               data_format, 'avg', False, False,
+                                               padding_algorithm)
         else:
-            output = _C_ops.pool2d(
-                x, 'pooling_type', 'avg', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive',
-                exclusive, 'data_format', data_format)
+            output = _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize',
+                                   kernel_size, 'global_pooling', False,
+                                   'padding_algorithm', padding_algorithm,
+                                   'strides', stride, 'paddings', padding,
+                                   'use_cudnn', True, 'ceil_mode', ceil_mode,
+                                   'use_mkldnn', False, 'exclusive', exclusive,
+                                   'data_format', data_format)
         if divisor_override is None:
             return output
         else:
@@ -370,23 +378,22 @@ def avg_pool2d(x,
     dtype = helper.input_dtype(input_param_name='x')
     pool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x},
-        outputs={"Out": pool_out},
-        attrs={
-            "pooling_type": "avg",
-            "ksize": kernel_size,
-            "global_pooling": False,
-            "strides": stride,
-            "paddings": padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": True,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": exclusive,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": x},
+                     outputs={"Out": pool_out},
+                     attrs={
+                         "pooling_type": "avg",
+                         "ksize": kernel_size,
+                         "global_pooling": False,
+                         "strides": stride,
+                         "paddings": padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": True,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": exclusive,
+                         "data_format": data_format,
+                     })
 
     if divisor_override is None:
         return pool_out
@@ -467,24 +474,28 @@ def avg_pool3d(x,
         stride = utils.convert_to_list(stride, 3, 'pool_stride')
 
     channel_last = _channel_last(data_format, 3)
-    padding, padding_algorithm = _update_padding_nd(
-        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(padding,
+                                                    3,
+                                                    channel_last=channel_last,
+                                                    ceil_mode=ceil_mode)
 
     _check_value_limitation(kernel_size, "kernel_size", min_limit=1e-3)
     _check_value_limitation(stride, "stride", min_limit=1e-3)
 
     if in_dygraph_mode() or _in_legacy_dygraph():
         if in_dygraph_mode():
-            output = _C_ops.final_state_pool3d(
-                x, kernel_size, stride, padding, ceil_mode, exclusive,
-                data_format, 'avg', False, False, padding_algorithm)
+            output = _C_ops.final_state_pool3d(x, kernel_size, stride, padding,
+                                               ceil_mode, exclusive,
+                                               data_format, 'avg', False, False,
+                                               padding_algorithm)
         if _in_legacy_dygraph():
-            output = _C_ops.pool3d(
-                x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides',
-                stride, 'paddings', padding, 'global_pooling', False,
-                'padding_algorithm', padding_algorithm, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive',
-                exclusive, 'data_format', data_format)
+            output = _C_ops.pool3d(x, 'pooling_type', 'avg', 'ksize',
+                                   kernel_size, 'strides', stride, 'paddings',
+                                   padding, 'global_pooling', False,
+                                   'padding_algorithm', padding_algorithm,
+                                   'use_cudnn', True, 'ceil_mode', ceil_mode,
+                                   'use_mkldnn', False, 'exclusive', exclusive,
+                                   'data_format', data_format)
         if divisor_override is None:
             return output
         else:
@@ -499,23 +510,22 @@ def avg_pool3d(x,
     pool_out = helper.create_variable_for_type_inference(dtype)
     outputs = {"Out": pool_out}
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": 'avg',
-            "ksize": kernel_size,
-            "global_pooling": False,
-            "strides": stride,
-            "paddings": padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": True,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": exclusive,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": 'avg',
+                         "ksize": kernel_size,
+                         "global_pooling": False,
+                         "strides": stride,
+                         "paddings": padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": True,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": exclusive,
+                         "data_format": data_format,
+                     })
 
     if divisor_override is None:
         return pool_out
@@ -591,8 +601,9 @@ def max_pool1d(x,
     else:
         stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
 
-    padding, padding_algorithm = _update_padding_nd(
-        padding, 1, ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(padding,
+                                                    1,
+                                                    ceil_mode=ceil_mode)
 
     # use 2d to implenment 1d should expand padding in advance.
     padding = _expand_low_nd_padding(padding)
@@ -602,12 +613,13 @@ def max_pool1d(x,
             pool_out = _C_ops.final_state_max_pool2d_with_index(
                 x, kernel_size, stride, padding, False, False)
             return (squeeze(pool_out[0], [2]),
-                    squeeze(pool_out[1],
-                            [2])) if return_mask else squeeze(pool_out[0], [2])
+                    squeeze(pool_out[1], [2])) if return_mask else squeeze(
+                        pool_out[0], [2])
         else:
-            pool_out = _C_ops.final_state_pool2d(
-                x, kernel_size, stride, padding, ceil_mode, True, data_format,
-                'max', False, False, padding_algorithm)
+            pool_out = _C_ops.final_state_pool2d(x, kernel_size, stride,
+                                                 padding, ceil_mode, True,
+                                                 data_format, 'max', False,
+                                                 False, padding_algorithm)
             return squeeze(pool_out, [2])
 
     if _in_legacy_dygraph():
@@ -619,15 +631,16 @@ def max_pool1d(x,
                 'use_mkldnn', False, 'exclusive', True, 'data_format',
                 data_format)
             return (squeeze(pool_out[0], [2]),
-                    squeeze(pool_out[1],
-                            [2])) if return_mask else squeeze(pool_out[0], [2])
+                    squeeze(pool_out[1], [2])) if return_mask else squeeze(
+                        pool_out[0], [2])
         else:
-            pool_out = _C_ops.pool2d(
-                x, 'pooling_type', 'max', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
-                'data_format', data_format)
+            pool_out = _C_ops.pool2d(x, 'pooling_type', 'max', 'ksize',
+                                     kernel_size, 'global_pooling', False,
+                                     'padding_algorithm', padding_algorithm,
+                                     'strides', stride, 'paddings', padding,
+                                     'use_cudnn', True, 'ceil_mode', ceil_mode,
+                                     'use_mkldnn', False, 'exclusive', True,
+                                     'data_format', data_format)
             return squeeze(pool_out, [2])
 
     op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
@@ -637,23 +650,22 @@ def max_pool1d(x,
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": 'max',
-            "ksize": kernel_size,
-            "global_pooling": False,
-            "strides": stride,
-            "paddings": padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": True,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": True,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": 'max',
+                         "ksize": kernel_size,
+                         "global_pooling": False,
+                         "strides": stride,
+                         "paddings": padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": True,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": True,
+                         "data_format": data_format,
+                     })
 
     return (squeeze(pool_out, [2]),
             squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2])
@@ -663,8 +675,8 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size):
     input_size = x.shape
     default_size = []
     for d in range(len(kernel_size)):
-        default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d]
-                            + kernel_size[d] - 2 * padding[d])
+        default_size.append((input_size[-len(kernel_size) + d] - 1) *
+                            stride[d] + kernel_size[d] - 2 * padding[d])
     if output_size is None:
         ret = default_size
     else:
@@ -674,14 +686,15 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size):
             raise ValueError(
                 "output_size should be a sequence containing "
                 "{} or {} elements, but it has a length of '{}'".format(
-                    len(kernel_size), len(kernel_size) + 2, len(output_size)))
+                    len(kernel_size),
+                    len(kernel_size) + 2, len(output_size)))
         for d in range(len(kernel_size)):
             min_size = default_size[d] - stride[d]
             max_size = default_size[d] + stride[d]
             if not (min_size < output_size[d] < max_size):
                 raise ValueError(
-                    'invalid output_size "{}" (dim {} must be between {} and {})'.
-                    format(output_size, d, min_size, max_size))
+                    'invalid output_size "{}" (dim {} must be between {} and {})'
+                    .format(output_size, d, min_size, max_size))
 
         ret = output_size
     return ret
@@ -781,18 +794,19 @@ def max_unpool1d(x,
     dtype = helper.input_dtype(input_param_name="x")
     unpool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x,
-                "Indices": indices},
-        outputs={"Out": unpool_out},
-        attrs={
-            "unpooling_type": "max",
-            "ksize": kernel_size,
-            "strides": stride,
-            "paddings": padding,
-            "output_size": output_size
-        })
+    helper.append_op(type=op_type,
+                     inputs={
+                         "X": x,
+                         "Indices": indices
+                     },
+                     outputs={"Out": unpool_out},
+                     attrs={
+                         "unpooling_type": "max",
+                         "ksize": kernel_size,
+                         "strides": stride,
+                         "paddings": padding,
+                         "output_size": output_size
+                     })
     return squeeze(unpool_out, [2])
 
 
@@ -896,18 +910,19 @@ def max_unpool2d(x,
     dtype = helper.input_dtype(input_param_name="x")
     unpool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x,
-                "Indices": indices},
-        outputs={"Out": unpool_out},
-        attrs={
-            "unpooling_type": "max",
-            "ksize": kernel_size,
-            "strides": stride,
-            "paddings": padding,
-            "output_size": output_size
-        })
+    helper.append_op(type=op_type,
+                     inputs={
+                         "X": x,
+                         "Indices": indices
+                     },
+                     outputs={"Out": unpool_out},
+                     attrs={
+                         "unpooling_type": "max",
+                         "ksize": kernel_size,
+                         "strides": stride,
+                         "paddings": padding,
+                         "output_size": output_size
+                     })
     return unpool_out
 
 
@@ -1008,18 +1023,19 @@ def max_unpool3d(x,
     dtype = helper.input_dtype(input_param_name="x")
     unpool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x,
-                "Indices": indices},
-        outputs={"Out": unpool_out},
-        attrs={
-            "unpooling_type": "max",
-            "ksize": kernel_size,
-            "strides": stride,
-            "paddings": padding,
-            "output_size": output_size
-        })
+    helper.append_op(type=op_type,
+                     inputs={
+                         "X": x,
+                         "Indices": indices
+                     },
+                     outputs={"Out": unpool_out},
+                     attrs={
+                         "unpooling_type": "max",
+                         "ksize": kernel_size,
+                         "strides": stride,
+                         "paddings": padding,
+                         "output_size": output_size
+                     })
     return unpool_out
 
 
@@ -1044,8 +1060,10 @@ def max_pool2d(x,
 
     channel_last = True if data_format == "NHWC" else False
 
-    padding, padding_algorithm = _update_padding_nd(
-        padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(padding,
+                                                    num_dims=2,
+                                                    channel_last=channel_last,
+                                                    ceil_mode=ceil_mode)
 
     if data_format == "NHWC" and return_mask:
         raise ValueError(
@@ -1058,9 +1076,10 @@ def max_pool2d(x,
                 x, kernel_size, stride, padding, False, False)
             return output if return_mask else output[0]
         else:
-            return _C_ops.final_state_pool2d(
-                x, kernel_size, stride, padding, ceil_mode, True, data_format,
-                'max', False, False, padding_algorithm)
+            return _C_ops.final_state_pool2d(x, kernel_size, stride, padding,
+                                             ceil_mode, True, data_format,
+                                             'max', False, False,
+                                             padding_algorithm)
 
     if _in_legacy_dygraph():
         if return_mask:
@@ -1072,12 +1091,13 @@ def max_pool2d(x,
                 data_format)
             return output if return_mask else output[0]
         else:
-            output = _C_ops.pool2d(
-                x, 'pooling_type', 'max', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
-                'data_format', data_format)
+            output = _C_ops.pool2d(x, 'pooling_type', 'max', 'ksize',
+                                   kernel_size, 'global_pooling', False,
+                                   'padding_algorithm', padding_algorithm,
+                                   'strides', stride, 'paddings', padding,
+                                   'use_cudnn', True, 'ceil_mode', ceil_mode,
+                                   'use_mkldnn', False, 'exclusive', True,
+                                   'data_format', data_format)
             return output
 
     op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
@@ -1089,23 +1109,22 @@ def max_pool2d(x,
     mask = helper.create_variable_for_type_inference("int32")
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": 'max',
-            "ksize": kernel_size,
-            "global_pooling": False,
-            "strides": stride,
-            "paddings": padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": True,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": True,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": 'max',
+                         "ksize": kernel_size,
+                         "global_pooling": False,
+                         "strides": stride,
+                         "paddings": padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": True,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": True,
+                         "data_format": data_format,
+                     })
 
     return (pool_out, mask) if return_mask else pool_out
 
@@ -1184,8 +1203,10 @@ def max_pool3d(x,
 
     channel_last = _channel_last(data_format, 3)
 
-    padding, padding_algorithm = _update_padding_nd(
-        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(padding,
+                                                    3,
+                                                    channel_last=channel_last,
+                                                    ceil_mode=ceil_mode)
 
     if data_format == "NDHWC" and return_mask:
         raise ValueError(
@@ -1198,9 +1219,10 @@ def max_pool3d(x,
                 x, kernel_size, stride, padding, False, False)
             return output if return_mask else output[0]
         else:
-            return _C_ops.final_state_pool3d(
-                x, kernel_size, stride, padding, ceil_mode, True, data_format,
-                'max', False, False, padding_algorithm)
+            return _C_ops.final_state_pool3d(x, kernel_size, stride, padding,
+                                             ceil_mode, True, data_format,
+                                             'max', False, False,
+                                             padding_algorithm)
 
     if _in_legacy_dygraph():
         if return_mask:
@@ -1212,12 +1234,13 @@ def max_pool3d(x,
                 'data_format', data_format)
             return output if return_mask else output[0]
         else:
-            output = _C_ops.pool3d(
-                x, 'pooling_type', 'max', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
-                'data_format', data_format)
+            output = _C_ops.pool3d(x, 'pooling_type', 'max', 'ksize',
+                                   kernel_size, 'global_pooling', False,
+                                   'padding_algorithm', padding_algorithm,
+                                   'strides', stride, 'paddings', padding,
+                                   'use_cudnn', True, 'ceil_mode', ceil_mode,
+                                   'use_mkldnn', False, 'exclusive', True,
+                                   'data_format', data_format)
             return output
 
     op_type = "max_pool3d_with_index" if return_mask else "pool3d"
@@ -1228,23 +1251,22 @@ def max_pool3d(x,
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(
-        type=op_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": 'max',
-            "ksize": kernel_size,
-            "global_pooling": False,
-            "strides": stride,
-            "paddings": padding,
-            "padding_algorithm": padding_algorithm,
-            "use_cudnn": True,
-            "ceil_mode": ceil_mode,
-            "use_mkldnn": False,
-            "exclusive": False,
-            "data_format": data_format,
-        })
+    helper.append_op(type=op_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": 'max',
+                         "ksize": kernel_size,
+                         "global_pooling": False,
+                         "strides": stride,
+                         "paddings": padding,
+                         "padding_algorithm": padding_algorithm,
+                         "use_cudnn": True,
+                         "ceil_mode": ceil_mode,
+                         "use_mkldnn": False,
+                         "exclusive": False,
+                         "data_format": data_format,
+                     })
 
     return (pool_out, mask) if return_mask else pool_out
 
@@ -1310,15 +1332,14 @@ def adaptive_avg_pool1d(x, output_size, name=None):
     pool_out = helper.create_variable_for_type_inference(dtype)
 
     outputs = {"Out": pool_out}
-    helper.append_op(
-        type=l_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": pool_type,
+                         "ksize": pool_size,
+                         "adaptive": True,
+                     })
 
     return squeeze(pool_out, [2])
 
@@ -1398,9 +1419,10 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
             output_size[1] = in_w
 
     if in_dygraph_mode():
-        return _C_ops.final_state_pool2d_gpudnn_unused(
-            x, output_size, [1, 1], [0, 0], False, True, data_format, 'avg',
-            False, True, "EXPLICIT")
+        return _C_ops.final_state_pool2d_gpudnn_unused(x, output_size, [1, 1],
+                                                       [0, 0], False, True,
+                                                       data_format, 'avg',
+                                                       False, True, "EXPLICIT")
 
     if _in_legacy_dygraph():
         return _C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', output_size,
@@ -1415,16 +1437,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
 
     outputs = {"Out": pool_out}
 
-    helper.append_op(
-        type=l_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": "avg",
-            "ksize": output_size,
-            "adaptive": True,
-            "data_format": data_format,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": "avg",
+                         "ksize": output_size,
+                         "adaptive": True,
+                         "data_format": data_format,
+                     })
 
     return pool_out
 
@@ -1519,16 +1540,15 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
     pool_out = helper.create_variable_for_type_inference(dtype)
     outputs = {"Out": pool_out}
 
-    helper.append_op(
-        type=l_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": "avg",
-            "ksize": output_size,
-            "adaptive": True,
-            "data_format": data_format,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": "avg",
+                         "ksize": output_size,
+                         "adaptive": True,
+                         "data_format": data_format,
+                     })
 
     return pool_out
 
@@ -1591,8 +1611,9 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
 
     x = unsqueeze(x, [2])
     if in_dynamic_mode():
-        pool_out = _C_ops.max_pool2d_with_index(
-            x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
+        pool_out = _C_ops.max_pool2d_with_index(x, 'pooling_type', pool_type,
+                                                'ksize', pool_size, 'adaptive',
+                                                True)
         return (squeeze(pool_out[0], [2]), squeeze(
             pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2])
 
@@ -1605,15 +1626,14 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(
-        type=l_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": pool_type,
-            "ksize": pool_size,
-            "adaptive": True,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": pool_type,
+                         "ksize": pool_size,
+                         "adaptive": True,
+                     })
 
     return (squeeze(pool_out, [2]),
             squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2])
@@ -1680,8 +1700,9 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
             output_size[1] = in_w
 
     if in_dynamic_mode():
-        pool_out = _C_ops.max_pool2d_with_index(
-            x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
+        pool_out = _C_ops.max_pool2d_with_index(x, 'pooling_type', 'max',
+                                                'ksize', output_size,
+                                                'adaptive', True)
         return pool_out if return_mask else pool_out[0]
 
     l_type = 'max_pool2d_with_index'
@@ -1693,15 +1714,14 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(
-        type=l_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": 'max',
-            "ksize": output_size,
-            "adaptive": True,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": 'max',
+                         "ksize": output_size,
+                         "adaptive": True,
+                     })
     #return (pool_out, mask) if return_mask else pool_out
     return pool_out
 
@@ -1773,8 +1793,9 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
             output_size[2] = in_w
 
     if in_dynamic_mode():
-        pool_out = _C_ops.max_pool3d_with_index(
-            x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
+        pool_out = _C_ops.max_pool3d_with_index(x, 'pooling_type', 'max',
+                                                'ksize', output_size,
+                                                'adaptive', True)
         return pool_out if return_mask else pool_out[0]
 
     l_type = 'max_pool3d_with_index'
@@ -1786,14 +1807,13 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(
-        type=l_type,
-        inputs={"X": x},
-        outputs=outputs,
-        attrs={
-            "pooling_type": 'max',
-            "ksize": output_size,
-            "adaptive": True,
-        })
+    helper.append_op(type=l_type,
+                     inputs={"X": x},
+                     outputs=outputs,
+                     attrs={
+                         "pooling_type": 'max',
+                         "ksize": output_size,
+                         "adaptive": True,
+                     })
 
     return (pool_out, mask) if return_mask else pool_out
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index 9a9c2ee4cf7d1..521a44f758b86 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -112,11 +112,10 @@ def affine_grid(theta, out_shape, align_corners=True, name=None):
     else:
         attrs['output_shape'] = out_shape
 
-    helper.append_op(
-        type='affine_grid',
-        inputs=ipts,
-        outputs={'Output': out},
-        attrs=None if len(attrs) == 0 else attrs)
+    helper.append_op(type='affine_grid',
+                     inputs=ipts,
+                     outputs={'Output': out},
+                     attrs=None if len(attrs) == 0 else attrs)
     return out
 
 
@@ -256,8 +255,8 @@ def grid_sample(x,
             format(_modes, mode))
     if padding_mode not in _padding_modes:
         raise ValueError(
-            "The padding mode of grid sample function should be in {}, but got: {}".
-            format(_padding_modes, padding_mode))
+            "The padding mode of grid sample function should be in {}, but got: {}"
+            .format(_padding_modes, padding_mode))
 
     if not isinstance(align_corners, bool):
         raise ValueError("The align corners should be bool, but got: {}".format(
@@ -290,11 +289,10 @@ def grid_sample(x,
             'use_cudnn': use_cudnn
         }
         out = helper.create_variable_for_type_inference(x.dtype)
-        helper.append_op(
-            type='grid_sampler',
-            inputs=ipts,
-            attrs=attrs,
-            outputs={'Output': out})
+        helper.append_op(type='grid_sampler',
+                         inputs=ipts,
+                         attrs=attrs,
+                         outputs={'Output': out})
     return out
 
 
@@ -327,9 +325,9 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
         raise TypeError("upscale factor must be int type")
 
     if data_format not in ["NCHW", "NHWC"]:
-        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'."
-                         "But recevie Attr(data_format): {} ".format(
-                             data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'."
+            "But recevie Attr(data_format): {} ".format(data_format))
 
     if in_dynamic_mode():
         return _C_ops.pixel_shuffle(x, "upscale_factor", upscale_factor,
@@ -338,12 +336,13 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
     helper = LayerHelper("pixel_shuffle", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_shuffle')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="pixel_shuffle",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"upscale_factor": upscale_factor,
-               "data_format": data_format})
+    helper.append_op(type="pixel_shuffle",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs={
+                         "upscale_factor": upscale_factor,
+                         "data_format": data_format
+                     })
     return out
 
 
@@ -383,9 +382,9 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None):
         raise ValueError("Downscale factor must be positive")
 
     if data_format not in ["NCHW", "NHWC"]:
-        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'."
-                         "But recevie Attr(data_format): {} ".format(
-                             data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'."
+            "But recevie Attr(data_format): {} ".format(data_format))
 
     if _non_static_mode():
         return _C_ops.pixel_unshuffle(x, "downscale_factor", downscale_factor,
@@ -394,14 +393,13 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None):
     helper = LayerHelper("pixel_unshuffle", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_unshuffle')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="pixel_unshuffle",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={
-            "downscale_factor": downscale_factor,
-            "data_format": data_format
-        })
+    helper.append_op(type="pixel_unshuffle",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs={
+                         "downscale_factor": downscale_factor,
+                         "data_format": data_format
+                     })
     return out
 
 
@@ -453,9 +451,9 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None):
         raise ValueError("groups must be positive")
 
     if data_format not in ["NCHW", "NHWC"]:
-        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'."
-                         "But recevie Attr(data_format): {} ".format(
-                             data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'."
+            "But recevie Attr(data_format): {} ".format(data_format))
 
     if _non_static_mode():
         return _C_ops.channel_shuffle(x, "groups", groups, "data_format",
@@ -464,10 +462,11 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None):
     helper = LayerHelper("channel_shuffle", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'channel_shuffle')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="channel_shuffle",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"groups": groups,
-               "data_format": data_format})
+    helper.append_op(type="channel_shuffle",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs={
+                         "groups": groups,
+                         "data_format": data_format
+                     })
     return out
diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py
index e048ee2b1e912..530c52bf5f26d 100644
--- a/python/paddle/nn/initializer/__init__.py
+++ b/python/paddle/nn/initializer/__init__.py
@@ -36,19 +36,8 @@
 
 from .dirac import Dirac  # noqa: F401
 
-__all__ = [     #noqa
-           'Bilinear',
-           'Constant',
-           'KaimingUniform',
-           'KaimingNormal',
-           'XavierNormal',
-           'XavierUniform',
-           'Assign',
-           'Normal',
-           'TruncatedNormal',
-           'Uniform',
-           'Orthogonal',
-           'Dirac',
-           'set_global_initializer',
-           'calculate_gain'
+__all__ = [  #noqa
+    'Bilinear', 'Constant', 'KaimingUniform', 'KaimingNormal', 'XavierNormal',
+    'XavierUniform', 'Assign', 'Normal', 'TruncatedNormal', 'Uniform',
+    'Orthogonal', 'Dirac', 'set_global_initializer', 'calculate_gain'
 ]
diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py
index 9c84b01ecb9af..1b5697ede4065 100644
--- a/python/paddle/nn/initializer/dirac.py
+++ b/python/paddle/nn/initializer/dirac.py
@@ -20,6 +20,7 @@
 from paddle.utils import unique_name
 from paddle import _C_ops
 from ... import fluid
+
 __all__ = []
 
 
@@ -106,42 +107,42 @@ def __call__(self, var, block=None):
         block = self._check_block(block)
         assert isinstance(var, framework.Parameter)
         assert isinstance(block, framework.Block)
-        check_variable_and_dtype(
-            var, "Out", ['float16', 'bfloat16', 'float32', 'float64'], 'Dirac')
+        check_variable_and_dtype(var, "Out",
+                                 ['float16', 'bfloat16', 'float32', 'float64'],
+                                 'Dirac')
 
         assert len(var.shape) in [
             3, 4, 5
         ], "Only Tensor with 3/4/5 dimensions can be initialized by Dirac"
-        assert (var.shape[0] % self._groups
-                ) == 0, "Tensor 0-dimension must be divisible by groups"
+        assert (
+            var.shape[0] %
+            self._groups) == 0, "Tensor 0-dimension must be divisible by groups"
 
         if var.dtype != VarDesc.VarType.FP32:
-            out_var = block.create_var(
-                name=unique_name.generate(".".join(['dirac', var.name, 'tmp'])),
-                shape=var.shape,
-                dtype=VarDesc.VarType.FP32,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False)
+            out_var = block.create_var(name=unique_name.generate(".".join(
+                ['dirac', var.name, 'tmp'])),
+                                       shape=var.shape,
+                                       dtype=VarDesc.VarType.FP32,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False)
         else:
             out_var = var
         op = None
         if framework.in_dygraph_mode():
             with fluid.dygraph.no_grad():
-                _C_ops.fill_constant(out_var, 'value',
-                                     float(0), 'force_cpu', False, 'dtype',
-                                     out_var.dtype, 'str_value',
+                _C_ops.fill_constant(out_var, 'value', float(0), 'force_cpu',
+                                     False, 'dtype', out_var.dtype, 'str_value',
                                      str(float(0)), 'shape', out_var.shape)
         else:
-            block.append_op(
-                type='fill_constant',
-                inputs={},
-                outputs={'Out': out_var},
-                attrs={
-                    'value': float(0),
-                    'dtype': out_var.dtype,
-                    'shape': out_var.shape,
-                },
-                stop_gradient=True)
+            block.append_op(type='fill_constant',
+                            inputs={},
+                            outputs={'Out': out_var},
+                            attrs={
+                                'value': float(0),
+                                'dtype': out_var.dtype,
+                                'shape': out_var.shape,
+                            },
+                            stop_gradient=True)
 
         origin_shape = var.shape
         num_per_group = origin_shape[0] // self._groups
@@ -171,20 +172,21 @@ def __call__(self, var, block=None):
                 tmp_out, _ = _C_ops.reshape2(out_var, None, 'shape', [-1])
                 tmp_out._share_underline_tensor_to(out_var)
         else:
-            x_shape = block.create_var(
-                name=unique_name.generate(".".join([out_var.name, "XShape"])),
-                dtype=out_var.dtype,
-                shape=out_var.shape,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False,
-                stop_gradient=True)
-            block.append_op(
-                type="reshape2",
-                inputs={"X": out_var},
-                attrs={'shape': [-1]},
-                outputs={"Out": out_var,
-                         "XShape": x_shape},
-                stop_gradient=True)
+            x_shape = block.create_var(name=unique_name.generate(".".join(
+                [out_var.name, "XShape"])),
+                                       dtype=out_var.dtype,
+                                       shape=out_var.shape,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False,
+                                       stop_gradient=True)
+            block.append_op(type="reshape2",
+                            inputs={"X": out_var},
+                            attrs={'shape': [-1]},
+                            outputs={
+                                "Out": out_var,
+                                "XShape": x_shape
+                            },
+                            stop_gradient=True)
 
         index_tensor = block.create_var(
             name=unique_name.generate('scatter_index'),
@@ -199,15 +201,14 @@ def __call__(self, var, block=None):
                                     'int64_values', idx_list)
                 tmp_tensor._share_underline_tensor_to(index_tensor)
         else:
-            block.append_op(
-                type='assign_value',
-                outputs={'Out': index_tensor},
-                attrs={
-                    'dtype': VarDesc.VarType.INT64,
-                    'shape': [len(idx_list)],
-                    'int64_values': idx_list
-                },
-                stop_gradient=True)
+            block.append_op(type='assign_value',
+                            outputs={'Out': index_tensor},
+                            attrs={
+                                'dtype': VarDesc.VarType.INT64,
+                                'shape': [len(idx_list)],
+                                'int64_values': idx_list
+                            },
+                            stop_gradient=True)
 
         value_tensor = block.create_var(
             name=unique_name.generate('scatter_value'),
@@ -222,15 +223,14 @@ def __call__(self, var, block=None):
                                     'fp32_values', value_list)
                 tmp_tensor._share_underline_tensor_to(value_tensor)
         else:
-            block.append_op(
-                type='assign_value',
-                outputs={'Out': value_tensor},
-                attrs={
-                    'dtype': VarDesc.VarType.FP32,
-                    'shape': [len(value_list)],
-                    'fp32_values': value_list
-                },
-                stop_gradient=True)
+            block.append_op(type='assign_value',
+                            outputs={'Out': value_tensor},
+                            attrs={
+                                'dtype': VarDesc.VarType.FP32,
+                                'shape': [len(value_list)],
+                                'fp32_values': value_list
+                            },
+                            stop_gradient=True)
 
         if framework.in_dygraph_mode():
             with fluid.dygraph.no_grad():
@@ -247,38 +247,39 @@ def __call__(self, var, block=None):
                     tmp_cast_out._share_underline_tensor_to(var)
 
         else:
-            op = block.append_op(
-                type="scatter",
-                inputs={
-                    "X": out_var,
-                    "Ids": index_tensor,
-                    "Updates": value_tensor
-                },
-                attrs={'overwrite': True},
-                outputs={"Out": out_var},
-                stop_gradient=True)
-            x_shape = block.create_var(
-                name=unique_name.generate(".".join([out_var.name, "XShape"])),
-                dtype=out_var.dtype,
-                shape=out_var.shape,
-                type=VarDesc.VarType.LOD_TENSOR,
-                persistable=False,
-                stop_gradient=True)
-            block.append_op(
-                type="reshape2",
-                inputs={"X": out_var},
-                attrs={'shape': origin_shape},
-                outputs={"Out": out_var,
-                         "XShape": x_shape},
-                stop_gradient=True)
+            op = block.append_op(type="scatter",
+                                 inputs={
+                                     "X": out_var,
+                                     "Ids": index_tensor,
+                                     "Updates": value_tensor
+                                 },
+                                 attrs={'overwrite': True},
+                                 outputs={"Out": out_var},
+                                 stop_gradient=True)
+            x_shape = block.create_var(name=unique_name.generate(".".join(
+                [out_var.name, "XShape"])),
+                                       dtype=out_var.dtype,
+                                       shape=out_var.shape,
+                                       type=VarDesc.VarType.LOD_TENSOR,
+                                       persistable=False,
+                                       stop_gradient=True)
+            block.append_op(type="reshape2",
+                            inputs={"X": out_var},
+                            attrs={'shape': origin_shape},
+                            outputs={
+                                "Out": out_var,
+                                "XShape": x_shape
+                            },
+                            stop_gradient=True)
             if var.dtype != VarDesc.VarType.FP32:
-                block.append_op(
-                    type="cast",
-                    inputs={"X": out_var},
-                    outputs={"Out": var},
-                    attrs={"in_dtype": out_var.dtype,
-                           "out_dtype": var.dtype},
-                    stop_gradient=True)
+                block.append_op(type="cast",
+                                inputs={"X": out_var},
+                                outputs={"Out": var},
+                                attrs={
+                                    "in_dtype": out_var.dtype,
+                                    "out_dtype": var.dtype
+                                },
+                                stop_gradient=True)
         if not in_dynamic_mode():
             var.op = op
         return op
diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py
index 88a52268776fc..b8ed7febb6bc7 100644
--- a/python/paddle/nn/initializer/kaiming.py
+++ b/python/paddle/nn/initializer/kaiming.py
@@ -57,8 +57,9 @@ class KaimingNormal(MSRAInitializer):
     """
 
     def __init__(self, fan_in=None):
-        super(KaimingNormal, self).__init__(
-            uniform=False, fan_in=fan_in, seed=0)
+        super(KaimingNormal, self).__init__(uniform=False,
+                                            fan_in=fan_in,
+                                            seed=0)
 
 
 class KaimingUniform(MSRAInitializer):
@@ -99,5 +100,6 @@ class KaimingUniform(MSRAInitializer):
     """
 
     def __init__(self, fan_in=None):
-        super(KaimingUniform, self).__init__(
-            uniform=True, fan_in=fan_in, seed=0)
+        super(KaimingUniform, self).__init__(uniform=True,
+                                             fan_in=fan_in,
+                                             seed=0)
diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py
index 84cdb971d77d4..2a9ba126e2fad 100644
--- a/python/paddle/nn/initializer/orthogonal.py
+++ b/python/paddle/nn/initializer/orthogonal.py
@@ -101,105 +101,107 @@ def __call__(self, var, block=None):
 
         flatten_shape = [max(row, col), min(row, col)]
 
-        normal_var = block.create_var(
-            name=unique_name.generate('.'.join(['gaussian_random', 'tmp'])),
-            dtype=var.dtype,
-            persistable=False,
-            stop_gradient=True)
-        block.append_op(
-            type='gaussian_random',
-            inputs={},
-            outputs={'Out': normal_var},
-            attrs={
-                'mean': 0.0,
-                'std': 1.0,
-                'shape': flatten_shape,
-                'seed': self._seed,
-                'dtype': var.dtype
-            },
-            stop_gradient=True)
-
-        q = block.create_var(
-            name=unique_name.generate('.'.join(['qr', 'q', 'tmp'])),
-            dtype=normal_var.dtype,
-            persistable=False,
-            stop_gradient=True)
-        r = block.create_var(
-            name=unique_name.generate('.'.join(['qr', 'r', 'tmp'])),
-            dtype=normal_var.dtype,
-            persistable=False,
-            stop_gradient=True)
-        block.append_op(
-            type='qr',
-            inputs={'X': [normal_var]},
-            outputs={
-                'Q': q,
-                'R': r,
-            },
-            attrs={'mode': 'reduced'},
-            stop_gradient=True)
-
-        r_diag = block.create_var(
-            name=unique_name.generate('.'.join(['diag', 'tmp'])),
-            dtype=r.dtype,
-            persistable=False,
-            stop_gradient=True)
-        block.append_op(
-            type='diag_v2',
-            inputs={'X': r},
-            outputs={'Out': r_diag},
-            attrs={'offset': 0,
-                   'padding_value': 0},
-            stop_gradient=True)
+        normal_var = block.create_var(name=unique_name.generate('.'.join(
+            ['gaussian_random', 'tmp'])),
+                                      dtype=var.dtype,
+                                      persistable=False,
+                                      stop_gradient=True)
+        block.append_op(type='gaussian_random',
+                        inputs={},
+                        outputs={'Out': normal_var},
+                        attrs={
+                            'mean': 0.0,
+                            'std': 1.0,
+                            'shape': flatten_shape,
+                            'seed': self._seed,
+                            'dtype': var.dtype
+                        },
+                        stop_gradient=True)
+
+        q = block.create_var(name=unique_name.generate('.'.join(
+            ['qr', 'q', 'tmp'])),
+                             dtype=normal_var.dtype,
+                             persistable=False,
+                             stop_gradient=True)
+        r = block.create_var(name=unique_name.generate('.'.join(
+            ['qr', 'r', 'tmp'])),
+                             dtype=normal_var.dtype,
+                             persistable=False,
+                             stop_gradient=True)
+        block.append_op(type='qr',
+                        inputs={'X': [normal_var]},
+                        outputs={
+                            'Q': q,
+                            'R': r,
+                        },
+                        attrs={'mode': 'reduced'},
+                        stop_gradient=True)
+
+        r_diag = block.create_var(name=unique_name.generate('.'.join(
+            ['diag', 'tmp'])),
+                                  dtype=r.dtype,
+                                  persistable=False,
+                                  stop_gradient=True)
+        block.append_op(type='diag_v2',
+                        inputs={'X': r},
+                        outputs={'Out': r_diag},
+                        attrs={
+                            'offset': 0,
+                            'padding_value': 0
+                        },
+                        stop_gradient=True)
 
         r_sign = r_diag
-        block.append_op(
-            type='sign',
-            inputs={'X': [r_diag]},
-            outputs={'Out': r_sign},
-            stop_gradient=True)
-
-        block.append_op(
-            type='elementwise_mul',
-            inputs={'X': q,
-                    'Y': r_sign},
-            outputs={'Out': q},
-            attrs={},
-            stop_gradient=True)
-
-        x_shape = block.create_var(
-            name=unique_name.generate('.'.join(['transpose', 'shape', 'tmp'])),
-            dtype=q.dtype,
-            persistable=False,
-            stop_gradient=True)
+        block.append_op(type='sign',
+                        inputs={'X': [r_diag]},
+                        outputs={'Out': r_sign},
+                        stop_gradient=True)
+
+        block.append_op(type='elementwise_mul',
+                        inputs={
+                            'X': q,
+                            'Y': r_sign
+                        },
+                        outputs={'Out': q},
+                        attrs={},
+                        stop_gradient=True)
+
+        x_shape = block.create_var(name=unique_name.generate('.'.join(
+            ['transpose', 'shape', 'tmp'])),
+                                   dtype=q.dtype,
+                                   persistable=False,
+                                   stop_gradient=True)
         if row < col:
-            q_transpose = block.create_var(
-                name=unique_name.generate('.'.join(['transpose', 'tmp'])),
-                dtype=q.dtype,
-                persistable=False,
-                stop_gradient=True)
-            block.append_op(
-                type='transpose2',
-                inputs={'X': q},
-                outputs={'Out': q_transpose,
-                         'XShape': x_shape},
-                attrs={'axis': [1, 0]},
-                stop_gradient=True)
+            q_transpose = block.create_var(name=unique_name.generate('.'.join(
+                ['transpose', 'tmp'])),
+                                           dtype=q.dtype,
+                                           persistable=False,
+                                           stop_gradient=True)
+            block.append_op(type='transpose2',
+                            inputs={'X': q},
+                            outputs={
+                                'Out': q_transpose,
+                                'XShape': x_shape
+                            },
+                            attrs={'axis': [1, 0]},
+                            stop_gradient=True)
             q = q_transpose
 
-        block.append_op(
-            type='reshape2',
-            inputs={'X': q},
-            outputs={'Out': q,
-                     "XShape": x_shape},
-            attrs={'shape': var.shape},
-            stop_gradient=True)
-
-        op = block.append_op(
-            type='scale',
-            inputs={'X': q},
-            outputs={'Out': var},
-            attrs={'scale': self._gain,
-                   'bias': 0.0})
+        block.append_op(type='reshape2',
+                        inputs={'X': q},
+                        outputs={
+                            'Out': q,
+                            "XShape": x_shape
+                        },
+                        attrs={'shape': var.shape},
+                        stop_gradient=True)
+
+        op = block.append_op(type='scale',
+                             inputs={'X': q},
+                             outputs={'Out': var},
+                             attrs={
+                                 'scale': self._gain,
+                                 'bias': 0.0
+                             })
 
         return op
diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py
index f07883adbb0ae..ee9b36ecf7c7b 100644
--- a/python/paddle/nn/initializer/uniform.py
+++ b/python/paddle/nn/initializer/uniform.py
@@ -56,5 +56,9 @@ def __init__(self, low=-1.0, high=1.0, name=None):
         assert low is not None, 'low should not be None'
         assert high is not None, 'high should not be None'
         assert high >= low, 'high should greater or equal than low'
-        super(Uniform, self).__init__(
-            low=low, high=high, seed=0, diag_num=0, diag_step=0, diag_val=1.0)
+        super(Uniform, self).__init__(low=low,
+                                      high=high,
+                                      seed=0,
+                                      diag_num=0,
+                                      diag_step=0,
+                                      diag_val=1.0)
diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py
index aff3a2c15aeec..e11790df7dfbc 100644
--- a/python/paddle/nn/initializer/xavier.py
+++ b/python/paddle/nn/initializer/xavier.py
@@ -66,8 +66,10 @@ class XavierNormal(XavierInitializer):
     """
 
     def __init__(self, fan_in=None, fan_out=None, name=None):
-        super(XavierNormal, self).__init__(
-            uniform=False, fan_in=fan_in, fan_out=fan_out, seed=0)
+        super(XavierNormal, self).__init__(uniform=False,
+                                           fan_in=fan_in,
+                                           fan_out=fan_out,
+                                           seed=0)
 
 
 class XavierUniform(XavierInitializer):
@@ -120,5 +122,7 @@ class XavierUniform(XavierInitializer):
     """
 
     def __init__(self, fan_in=None, fan_out=None, name=None):
-        super(XavierUniform, self).__init__(
-            uniform=True, fan_in=fan_in, fan_out=fan_out, seed=0)
+        super(XavierUniform, self).__init__(uniform=True,
+                                            fan_in=fan_in,
+                                            fan_out=fan_out,
+                                            seed=0)
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index 1a3768e919042..6e2a11c89cc64 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -419,12 +419,12 @@ def __init__(self,
         self._name = name
         self._data_format = data_format
 
-        self._weight = self.create_parameter(
-            attr=self._weight_attr,
-            shape=[self._num_parameters],
-            dtype=get_default_dtype(),
-            is_bias=False,
-            default_initializer=Constant(self._init))
+        self._weight = self.create_parameter(attr=self._weight_attr,
+                                             shape=[self._num_parameters],
+                                             dtype=get_default_dtype(),
+                                             is_bias=False,
+                                             default_initializer=Constant(
+                                                 self._init))
 
     def forward(self, x):
         return F.prelu(x, self._weight, data_format=self._data_format)
@@ -514,8 +514,10 @@ def __init__(self, lower=1. / 8., upper=1. / 3., name=None):
         self._name = name
 
     def forward(self, x):
-        return F.rrelu(
-            x, lower=self._lower, upper=self._upper, training=self.training)
+        return F.rrelu(x,
+                       lower=self._lower,
+                       upper=self._upper,
+                       training=self.training)
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index dac4cf5f27253..7c034d37ba6c8 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -155,21 +155,21 @@ def __init__(self,
         self._dtype = self._helper.get_default_dtype()
         self._weight_attr = weight_attr
         self._bias_attr = bias_attr
-        self.weight = self.create_parameter(
-            shape=[in_features, out_features],
-            attr=self._weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
-        self.bias = self.create_parameter(
-            shape=[out_features],
-            attr=self._bias_attr,
-            dtype=self._dtype,
-            is_bias=True)
+        self.weight = self.create_parameter(shape=[in_features, out_features],
+                                            attr=self._weight_attr,
+                                            dtype=self._dtype,
+                                            is_bias=False)
+        self.bias = self.create_parameter(shape=[out_features],
+                                          attr=self._bias_attr,
+                                          dtype=self._dtype,
+                                          is_bias=True)
         self.name = name
 
     def forward(self, input):
-        out = F.linear(
-            x=input, weight=self.weight, bias=self.bias, name=self.name)
+        out = F.linear(x=input,
+                       weight=self.weight,
+                       bias=self.bias,
+                       name=self.name)
         return out
 
     def extra_repr(self):
@@ -406,15 +406,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        out = F.interpolate(
-            x,
-            size=self.size,
-            scale_factor=self.scale_factor,
-            mode=self.mode,
-            align_corners=self.align_corners,
-            align_mode=self.align_mode,
-            data_format=self.data_format,
-            name=self.name)
+        out = F.interpolate(x,
+                            size=self.size,
+                            scale_factor=self.scale_factor,
+                            mode=self.mode,
+                            align_corners=self.align_corners,
+                            align_mode=self.align_mode,
+                            data_format=self.data_format,
+                            name=self.name)
 
         return out
 
@@ -492,15 +491,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        out = F.interpolate(
-            x,
-            size=self.size,
-            scale_factor=self.scale_factor,
-            mode='nearest',
-            align_corners=False,
-            align_mode=0,
-            data_format=self.data_format,
-            name=self.name)
+        out = F.interpolate(x,
+                            size=self.size,
+                            scale_factor=self.scale_factor,
+                            mode='nearest',
+                            align_corners=False,
+                            align_mode=0,
+                            data_format=self.data_format,
+                            name=self.name)
 
         return out
 
@@ -578,15 +576,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        out = F.interpolate(
-            x,
-            size=self.size,
-            scale_factor=self.scale_factor,
-            mode='bilinear',
-            align_corners=True,
-            align_mode=0,
-            data_format=self.data_format,
-            name=self.name)
+        out = F.interpolate(x,
+                            size=self.size,
+                            scale_factor=self.scale_factor,
+                            mode='bilinear',
+                            align_corners=True,
+                            align_mode=0,
+                            data_format=self.data_format,
+                            name=self.name)
 
         return out
 
@@ -673,17 +670,15 @@ def __init__(self,
         weight_shape = [
             self._out_features, self._in1_features, self._in2_features
         ]
-        self.weight = self.create_parameter(
-            attr=self._weight_attr,
-            shape=weight_shape,
-            dtype=self._dtype,
-            is_bias=False)
+        self.weight = self.create_parameter(attr=self._weight_attr,
+                                            shape=weight_shape,
+                                            dtype=self._dtype,
+                                            is_bias=False)
         bias_shape = [1, self._out_features]
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=bias_shape,
-            dtype=self._dtype,
-            is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=bias_shape,
+                                          dtype=self._dtype,
+                                          is_bias=True)
 
     def forward(self, x1, x2):
         return F.bilinear(x1, x2, self.weight, self.bias, self._name)
@@ -754,13 +749,12 @@ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None):
         self.name = name
 
     def forward(self, input):
-        out = F.dropout(
-            input,
-            p=self.p,
-            axis=self.axis,
-            training=self.training,
-            mode=self.mode,
-            name=self.name)
+        out = F.dropout(input,
+                        p=self.p,
+                        axis=self.axis,
+                        training=self.training,
+                        mode=self.mode,
+                        name=self.name)
         return out
 
     def extra_repr(self):
@@ -816,12 +810,11 @@ def __init__(self, p=0.5, data_format='NCHW', name=None):
         self.name = name
 
     def forward(self, input):
-        out = F.dropout2d(
-            input,
-            p=self.p,
-            training=self.training,
-            data_format=self.data_format,
-            name=self.name)
+        out = F.dropout2d(input,
+                          p=self.p,
+                          training=self.training,
+                          data_format=self.data_format,
+                          name=self.name)
         return out
 
     def extra_repr(self):
@@ -877,12 +870,11 @@ def __init__(self, p=0.5, data_format='NCDHW', name=None):
         self.name = name
 
     def forward(self, input):
-        out = F.dropout3d(
-            input,
-            p=self.p,
-            training=self.training,
-            data_format=self.data_format,
-            name=self.name)
+        out = F.dropout3d(input,
+                          p=self.p,
+                          training=self.training,
+                          data_format=self.data_format,
+                          name=self.name)
         return out
 
     def extra_repr(self):
@@ -935,8 +927,10 @@ def __init__(self, p=0.5, name=None):
         self.name = name
 
     def forward(self, input):
-        out = F.alpha_dropout(
-            input, p=self.p, training=self.training, name=self.name)
+        out = F.alpha_dropout(input,
+                              p=self.p,
+                              training=self.training,
+                              name=self.name)
         return out
 
     def extra_repr(self):
@@ -1171,8 +1165,9 @@ def forward(self, x):
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
-        return 'padding={}, data_format={}{}'.format(
-            self._pad, self._data_format, name_str)
+        return 'padding={}, data_format={}{}'.format(self._pad,
+                                                     self._data_format,
+                                                     name_str)
 
 
 class Pad3D(Layer):
@@ -1450,23 +1445,21 @@ def __init__(self,
         self._weight_attr = weight_attr
         self._remote_prefetch = False
         self._name = name
-        self.weight = self.create_parameter(
-            attr=self._weight_attr,
-            shape=self._size,
-            dtype=self._dtype,
-            is_bias=False)
+        self.weight = self.create_parameter(attr=self._weight_attr,
+                                            shape=self._size,
+                                            dtype=self._dtype,
+                                            is_bias=False)
 
         if in_dynamic_mode() and padding_idx != -1:
             with paddle.no_grad():
                 self.weight[padding_idx] = 0.0
 
     def forward(self, x):
-        return F.embedding(
-            x,
-            weight=self.weight,
-            padding_idx=self._padding_idx,
-            sparse=self._sparse,
-            name=self._name)
+        return F.embedding(x,
+                           weight=self.weight,
+                           padding_idx=self._padding_idx,
+                           sparse=self._sparse,
+                           name=self._name)
 
     def extra_repr(self):
         main_str = '{_num_embeddings}, {_embedding_dim}'
@@ -1539,13 +1532,12 @@ def __init__(self,
         self.name = name
 
     def forward(self, input):
-        return F.unfold(
-            input,
-            kernel_sizes=self.kernel_sizes,
-            strides=self.strides,
-            paddings=self.paddings,
-            dilations=self.dilations,
-            name=self.name)
+        return F.unfold(input,
+                        kernel_sizes=self.kernel_sizes,
+                        strides=self.strides,
+                        paddings=self.paddings,
+                        dilations=self.dilations,
+                        name=self.name)
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
@@ -1626,14 +1618,13 @@ def __init__(self,
         self.name = name
 
     def forward(self, input):
-        return F.fold(
-            input,
-            output_sizes=self.output_sizes,
-            kernel_sizes=self.kernel_sizes,
-            strides=self.strides,
-            paddings=self.paddings,
-            dilations=self.dilations,
-            name=self.name)
+        return F.fold(input,
+                      output_sizes=self.output_sizes,
+                      kernel_sizes=self.kernel_sizes,
+                      strides=self.strides,
+                      paddings=self.paddings,
+                      dilations=self.dilations,
+                      name=self.name)
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py
index aadaf1efce50f..0b1bf6bc5657e 100644
--- a/python/paddle/nn/layer/container.py
+++ b/python/paddle/nn/layer/container.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -288,6 +288,6 @@ def update(self, sublayers):
             for i, kv in enumerate(sublayers):
                 if len(kv) != 2:
                     raise ValueError("The length of the " + str(i) +
-                                     "'s element in sublayers is " + str(
-                                         len(kv)) + ", which must be 2.")
+                                     "'s element in sublayers is " +
+                                     str(len(kv)) + ", which must be 2.")
                 self.add_sublayer(kv[0], kv[1])
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index bb1cbbfc03e55..f724f7cfee52c 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -44,6 +44,7 @@ def _reverse_repeat_list(t, n):
 
 
 class _ConvNd(Layer):
+
     def __init__(self,
                  in_channels,
                  out_channels,
@@ -86,8 +87,9 @@ def __init__(self,
                 "data_format must be one of {}, but got data_format='{}'".
                 format(valid_format, data_format))
 
-        channel_last = (data_format == "NHWC") or (data_format == "NDHWC") or (
-            data_format == "NLC")
+        channel_last = (data_format == "NHWC") or (data_format
+                                                   == "NDHWC") or (data_format
+                                                                   == "NLC")
         if channel_last:
             self._channel_dim = len(data_format) - 1
         else:
@@ -134,26 +136,27 @@ def _get_default_param_initializer():
             shape=filter_shape,
             attr=self._param_attr,
             default_initializer=_get_default_param_initializer())
-        self.bias = self.create_parameter(
-            attr=self._bias_attr, shape=[self._out_channels], is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=[self._out_channels],
+                                          is_bias=True)
 
         cudnn_version = get_cudnn_version()
 
-        self._use_cudnn = True if (is_compiled_with_cuda() and
-                                   cudnn_version is not None) else False
+        self._use_cudnn = True if (is_compiled_with_cuda()
+                                   and cudnn_version is not None) else False
 
         self._op_type = "conv" + str(dims) + 'd'
-        if self._op_type == 'conv2d' and (in_channels == groups and
-                                          in_channels != 1 and
-                                          out_channels % in_channels == 0):
+        if self._op_type == 'conv2d' and (in_channels == groups
+                                          and in_channels != 1
+                                          and out_channels % in_channels == 0):
             self._op_type = 'depthwise_conv2d'
             if is_compiled_with_rocm():
                 self._use_cudnn = True
             else:
                 self._use_cudnn = False
 
-        if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")[
-                "FLAGS_conv2d_disable_cudnn"]):
+        if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")
+            ["FLAGS_conv2d_disable_cudnn"]):
             self._use_cudnn = False
 
     def extra_repr(self):
@@ -311,20 +314,19 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NCL"):
-        super(Conv1D, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            False,
-            1,
-            stride=stride,
-            padding=padding,
-            padding_mode=padding_mode,
-            dilation=dilation,
-            groups=groups,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(Conv1D, self).__init__(in_channels,
+                                     out_channels,
+                                     kernel_size,
+                                     False,
+                                     1,
+                                     stride=stride,
+                                     padding=padding,
+                                     padding_mode=padding_mode,
+                                     dilation=dilation,
+                                     groups=groups,
+                                     weight_attr=weight_attr,
+                                     bias_attr=bias_attr,
+                                     data_format=data_format)
 
     def forward(self, x):
         padding = 0
@@ -336,15 +338,14 @@ def forward(self, x):
         else:
             padding = self._padding
 
-        out = F.conv1d(
-            x,
-            self.weight,
-            bias=self.bias,
-            padding=padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            data_format=self._data_format)
+        out = F.conv1d(x,
+                       self.weight,
+                       bias=self.bias,
+                       padding=padding,
+                       stride=self._stride,
+                       dilation=self._dilation,
+                       groups=self._groups,
+                       data_format=self._data_format)
         return out
 
 
@@ -488,33 +489,31 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NCL"):
-        super(Conv1DTranspose, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            True,
-            1,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            output_padding=output_padding,
-            groups=groups,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(Conv1DTranspose, self).__init__(in_channels,
+                                              out_channels,
+                                              kernel_size,
+                                              True,
+                                              1,
+                                              stride=stride,
+                                              padding=padding,
+                                              dilation=dilation,
+                                              output_padding=output_padding,
+                                              groups=groups,
+                                              weight_attr=weight_attr,
+                                              bias_attr=bias_attr,
+                                              data_format=data_format)
 
     def forward(self, x, output_size=None):
-        out = F.conv1d_transpose(
-            x,
-            self.weight,
-            bias=self.bias,
-            output_size=output_size,
-            output_padding=self.output_padding,
-            padding=self._padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            data_format=self._data_format)
+        out = F.conv1d_transpose(x,
+                                 self.weight,
+                                 bias=self.bias,
+                                 output_size=output_size,
+                                 output_padding=self.output_padding,
+                                 padding=self._padding,
+                                 stride=self._stride,
+                                 dilation=self._dilation,
+                                 groups=self._groups,
+                                 data_format=self._data_format)
         return out
 
 
@@ -641,20 +640,19 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NCHW"):
-        super(Conv2D, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            False,
-            2,
-            stride=stride,
-            padding=padding,
-            padding_mode=padding_mode,
-            dilation=dilation,
-            groups=groups,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(Conv2D, self).__init__(in_channels,
+                                     out_channels,
+                                     kernel_size,
+                                     False,
+                                     2,
+                                     stride=stride,
+                                     padding=padding,
+                                     padding_mode=padding_mode,
+                                     dilation=dilation,
+                                     groups=groups,
+                                     weight_attr=weight_attr,
+                                     bias_attr=bias_attr,
+                                     data_format=data_format)
 
     def forward(self, x):
         if self._padding_mode != 'zeros':
@@ -663,19 +661,18 @@ def forward(self, x):
                       mode=self._padding_mode,
                       data_format=self._data_format)
 
-        out = F.conv._conv_nd(
-            x,
-            self.weight,
-            bias=self.bias,
-            stride=self._stride,
-            padding=self._updated_padding,
-            padding_algorithm=self._padding_algorithm,
-            dilation=self._dilation,
-            groups=self._groups,
-            data_format=self._data_format,
-            channel_dim=self._channel_dim,
-            op_type=self._op_type,
-            use_cudnn=self._use_cudnn)
+        out = F.conv._conv_nd(x,
+                              self.weight,
+                              bias=self.bias,
+                              stride=self._stride,
+                              padding=self._updated_padding,
+                              padding_algorithm=self._padding_algorithm,
+                              dilation=self._dilation,
+                              groups=self._groups,
+                              data_format=self._data_format,
+                              channel_dim=self._channel_dim,
+                              op_type=self._op_type,
+                              use_cudnn=self._use_cudnn)
         return out
 
 
@@ -808,20 +805,19 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NCHW"):
-        super(Conv2DTranspose, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            True,
-            2,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            output_padding=output_padding,
-            groups=groups,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(Conv2DTranspose, self).__init__(in_channels,
+                                              out_channels,
+                                              kernel_size,
+                                              True,
+                                              2,
+                                              stride=stride,
+                                              padding=padding,
+                                              dilation=dilation,
+                                              output_padding=output_padding,
+                                              groups=groups,
+                                              weight_attr=weight_attr,
+                                              bias_attr=bias_attr,
+                                              data_format=data_format)
 
     def forward(self, x, output_size=None):
         if output_size is None:
@@ -829,17 +825,16 @@ def forward(self, x, output_size=None):
         else:
             output_padding = 0
 
-        out = F.conv2d_transpose(
-            x,
-            self.weight,
-            bias=self.bias,
-            padding=self._padding,
-            output_padding=output_padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            output_size=output_size,
-            data_format=self._data_format)
+        out = F.conv2d_transpose(x,
+                                 self.weight,
+                                 bias=self.bias,
+                                 padding=self._padding,
+                                 output_padding=output_padding,
+                                 stride=self._stride,
+                                 dilation=self._dilation,
+                                 groups=self._groups,
+                                 output_size=output_size,
+                                 data_format=self._data_format)
         return out
 
 
@@ -966,20 +961,19 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NCDHW"):
-        super(Conv3D, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            False,
-            3,
-            stride=stride,
-            padding=padding,
-            padding_mode=padding_mode,
-            dilation=dilation,
-            groups=groups,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(Conv3D, self).__init__(in_channels,
+                                     out_channels,
+                                     kernel_size,
+                                     False,
+                                     3,
+                                     stride=stride,
+                                     padding=padding,
+                                     padding_mode=padding_mode,
+                                     dilation=dilation,
+                                     groups=groups,
+                                     weight_attr=weight_attr,
+                                     bias_attr=bias_attr,
+                                     data_format=data_format)
 
     def forward(self, x):
         if self._padding_mode != 'zeros':
@@ -988,19 +982,18 @@ def forward(self, x):
                       mode=self._padding_mode,
                       data_format=self._data_format)
 
-        out = F.conv._conv_nd(
-            x,
-            self.weight,
-            bias=self.bias,
-            stride=self._stride,
-            padding=self._updated_padding,
-            padding_algorithm=self._padding_algorithm,
-            dilation=self._dilation,
-            groups=self._groups,
-            data_format=self._data_format,
-            channel_dim=self._channel_dim,
-            op_type=self._op_type,
-            use_cudnn=self._use_cudnn)
+        out = F.conv._conv_nd(x,
+                              self.weight,
+                              bias=self.bias,
+                              stride=self._stride,
+                              padding=self._updated_padding,
+                              padding_algorithm=self._padding_algorithm,
+                              dilation=self._dilation,
+                              groups=self._groups,
+                              data_format=self._data_format,
+                              channel_dim=self._channel_dim,
+                              op_type=self._op_type,
+                              use_cudnn=self._use_cudnn)
         return out
 
 
@@ -1145,20 +1138,19 @@ def __init__(self,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NCDHW"):
-        super(Conv3DTranspose, self).__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            True,
-            3,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            output_padding=output_padding,
-            groups=groups,
-            weight_attr=weight_attr,
-            bias_attr=bias_attr,
-            data_format=data_format)
+        super(Conv3DTranspose, self).__init__(in_channels,
+                                              out_channels,
+                                              kernel_size,
+                                              True,
+                                              3,
+                                              stride=stride,
+                                              padding=padding,
+                                              dilation=dilation,
+                                              output_padding=output_padding,
+                                              groups=groups,
+                                              weight_attr=weight_attr,
+                                              bias_attr=bias_attr,
+                                              data_format=data_format)
 
     def forward(self, x, output_size=None):
         if output_size is None:
@@ -1166,15 +1158,14 @@ def forward(self, x, output_size=None):
         else:
             output_padding = 0
 
-        out = F.conv3d_transpose(
-            x,
-            self.weight,
-            bias=self.bias,
-            padding=self._padding,
-            output_padding=output_padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            output_size=output_size,
-            data_format=self._data_format)
+        out = F.conv3d_transpose(x,
+                                 self.weight,
+                                 bias=self.bias,
+                                 padding=self._padding,
+                                 output_padding=output_padding,
+                                 stride=self._stride,
+                                 dilation=self._dilation,
+                                 groups=self._groups,
+                                 output_size=output_size,
+                                 data_format=self._data_format)
         return out
diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py
index eb85de5711078..7c08e358fcc76 100644
--- a/python/paddle/nn/layer/distance.py
+++ b/python/paddle/nn/layer/distance.py
@@ -103,8 +103,10 @@ def forward(self, x, y):
             'epsilon': self.epsilon,
         }
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(
-            type='p_norm', inputs={'X': sub}, outputs={'Out': out}, attrs=attrs)
+        helper.append_op(type='p_norm',
+                         inputs={'X': sub},
+                         outputs={'Out': out},
+                         attrs=attrs)
 
         return out
 
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index a20e7de751d16..c720ec7d1be07 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -394,16 +394,15 @@ def __init__(self,
         self.name = name
 
     def forward(self, input, label):
-        ret = paddle.nn.functional.cross_entropy(
-            input,
-            label,
-            weight=self.weight,
-            ignore_index=self.ignore_index,
-            reduction=self.reduction,
-            soft_label=self.soft_label,
-            axis=self.axis,
-            use_softmax=self.use_softmax,
-            name=self.name)
+        ret = paddle.nn.functional.cross_entropy(input,
+                                                 label,
+                                                 weight=self.weight,
+                                                 ignore_index=self.ignore_index,
+                                                 reduction=self.reduction,
+                                                 soft_label=self.soft_label,
+                                                 axis=self.axis,
+                                                 use_softmax=self.use_softmax,
+                                                 name=self.name)
 
         return ret
 
@@ -512,25 +511,25 @@ def __init__(self,
               " small parameter prefetch may cause speed down")
 
         C = self._num_classes if is_custom else self._num_classes - 1
-        self.weight = self.create_parameter(
-            [C, self._feature_size],
-            attr=self._weight_attr,
-            is_bias=False,
-            dtype=self._dtype)
-        self.bias = self.create_parameter(
-            [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype)
+        self.weight = self.create_parameter([C, self._feature_size],
+                                            attr=self._weight_attr,
+                                            is_bias=False,
+                                            dtype=self._dtype)
+        self.bias = self.create_parameter([C, 1],
+                                          attr=self._bias_attr,
+                                          is_bias=True,
+                                          dtype=self._dtype)
 
     def forward(self, input, label, path_table=None, path_code=None):
-        out = F.hsigmoid_loss(
-            input,
-            label,
-            self._num_classes,
-            self.weight,
-            self.bias,
-            path_table=path_table,
-            path_code=path_code,
-            is_sparse=self._is_sparse,
-            name=self._name)
+        out = F.hsigmoid_loss(input,
+                              label,
+                              self._num_classes,
+                              self.weight,
+                              self.bias,
+                              path_table=path_table,
+                              path_code=path_code,
+                              is_sparse=self._is_sparse,
+                              name=self._name)
         return out
 
 
@@ -596,10 +595,12 @@ def __init__(self, reduction='mean'):
 
     def forward(self, input, label):
         if not in_dynamic_mode():
-            fluid.data_feeder.check_variable_and_dtype(
-                input, 'input', ['float32', 'float64'], 'MSELoss')
-            fluid.data_feeder.check_variable_and_dtype(
-                label, 'label', ['float32', 'float64'], 'MSELoss')
+            fluid.data_feeder.check_variable_and_dtype(input, 'input',
+                                                       ['float32', 'float64'],
+                                                       'MSELoss')
+            fluid.data_feeder.check_variable_and_dtype(label, 'label',
+                                                       ['float32', 'float64'],
+                                                       'MSELoss')
 
         if in_dygraph_mode():
             square_out = paddle._C_ops.final_state_square(
@@ -691,8 +692,10 @@ def __init__(self, reduction='mean', name=None):
         self.name = name
 
     def forward(self, input, label):
-        return paddle.nn.functional.l1_loss(
-            input, label, self.reduction, name=self.name)
+        return paddle.nn.functional.l1_loss(input,
+                                            label,
+                                            self.reduction,
+                                            name=self.name)
 
 
 class BCELoss(Layer):
@@ -780,8 +783,10 @@ def __init__(self, weight=None, reduction='mean', name=None):
         self.name = name
 
     def forward(self, input, label):
-        out = paddle.nn.functional.binary_cross_entropy(
-            input, label, self.weight, self.reduction, self.name)
+        out = paddle.nn.functional.binary_cross_entropy(input, label,
+                                                        self.weight,
+                                                        self.reduction,
+                                                        self.name)
         return out
 
 
@@ -888,13 +893,12 @@ def __init__(self,
         self._name = name
 
     def forward(self, input, label):
-        return F.nll_loss(
-            input,
-            label,
-            weight=self._weight,
-            ignore_index=self._ignore_index,
-            reduction=self._reduction,
-            name=self._name)
+        return F.nll_loss(input,
+                          label,
+                          weight=self._weight,
+                          ignore_index=self._ignore_index,
+                          reduction=self._reduction,
+                          name=self._name)
 
 
 class KLDivLoss(Layer):
@@ -1037,8 +1041,10 @@ def __init__(self, margin=0.0, reduction='mean', name=None):
         self.name = name
 
     def forward(self, input, other, label):
-        out = paddle.nn.functional.margin_ranking_loss(
-            input, other, label, self.margin, self.reduction, self.name)
+        out = paddle.nn.functional.margin_ranking_loss(input, other, label,
+                                                       self.margin,
+                                                       self.reduction,
+                                                       self.name)
         return out
 
 
@@ -1128,14 +1134,13 @@ def forward(self,
                 input_lengths,
                 label_lengths,
                 norm_by_times=False):
-        return paddle.nn.functional.ctc_loss(
-            log_probs,
-            labels,
-            input_lengths,
-            label_lengths,
-            self.blank,
-            self.reduction,
-            norm_by_times=norm_by_times)
+        return paddle.nn.functional.ctc_loss(log_probs,
+                                             labels,
+                                             input_lengths,
+                                             label_lengths,
+                                             self.blank,
+                                             self.reduction,
+                                             norm_by_times=norm_by_times)
 
 
 class SmoothL1Loss(Layer):
@@ -1205,12 +1210,11 @@ def __init__(self, reduction='mean', delta=1.0, name=None):
         self.name = name
 
     def forward(self, input, label):
-        return F.smooth_l1_loss(
-            input,
-            label,
-            reduction=self.reduction,
-            delta=self.delta,
-            name=self.name)
+        return F.smooth_l1_loss(input,
+                                label,
+                                reduction=self.reduction,
+                                delta=self.delta,
+                                name=self.name)
 
 
 class HingeEmbeddingLoss(Layer):
@@ -1300,9 +1304,8 @@ def __init__(self, margin=1.0, reduction="mean", name=None):
         self.name = name
 
     def forward(self, input, label):
-        return F.hinge_embedding_loss(
-            input,
-            label,
-            reduction=self.reduction,
-            margin=self.margin,
-            name=self.name)
+        return F.hinge_embedding_loss(input,
+                                      label,
+                                      reduction=self.reduction,
+                                      margin=self.margin,
+                                      name=self.name)
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 6cdfc36d5d61f..e549859fe626d 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -25,7 +25,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define normalization api  
+# TODO: define normalization api
 
 import six
 
@@ -83,11 +83,10 @@ def __init__(self,
                 shape=[num_features],
                 default_initializer=Constant(1.0),
                 is_bias=False)
-            self.bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=[num_features],
-                default_initializer=Constant(0.0),
-                is_bias=True)
+            self.bias = self.create_parameter(attr=self._bias_attr,
+                                              shape=[num_features],
+                                              default_initializer=Constant(0.0),
+                                              is_bias=True)
         else:
             self.scale = None
             self.bias = None
@@ -98,8 +97,10 @@ def _check_input_dim(self, input):
     def forward(self, input):
         self._check_input_dim(input)
 
-        return instance_norm(
-            input, weight=self.scale, bias=self.bias, eps=self._epsilon)
+        return instance_norm(input,
+                             weight=self.scale,
+                             bias=self.bias,
+                             eps=self._epsilon)
 
     def extra_repr(self):
         return 'num_features={}, epsilon={}'.format(self._num_features,
@@ -392,15 +393,15 @@ def __init__(self,
             self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
 
         if bias_attr == False:
-            self.bias = self.create_parameter(
-                attr=None,
-                shape=param_shape,
-                default_initializer=Constant(0.0),
-                is_bias=True)
+            self.bias = self.create_parameter(attr=None,
+                                              shape=param_shape,
+                                              default_initializer=Constant(0.0),
+                                              is_bias=True)
             self.bias.stop_gradient = True
         else:
-            self.bias = self.create_parameter(
-                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            self.bias = self.create_parameter(attr=self._bias_attr,
+                                              shape=param_shape,
+                                              is_bias=True)
             self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0.
 
     def forward(self, input):
@@ -419,9 +420,10 @@ def forward(self, input):
                 'epsilon',
                 self._epsilon,
                 'groups',
-                self._num_groups, )
-            return dygraph_utils._append_activation_in_dygraph(
-                pre_act, act=None)
+                self._num_groups,
+            )
+            return dygraph_utils._append_activation_in_dygraph(pre_act,
+                                                               act=None)
 
         inputs = {'X': input}
         if self.bias is not None:
@@ -433,16 +435,17 @@ def forward(self, input):
         group_norm_out = self._helper.create_variable_for_type_inference(
             dtype=input.dtype)
 
-        self._helper.append_op(
-            type="group_norm",
-            inputs=inputs,
-            outputs={
-                "Y": group_norm_out,
-                "Mean": mean_out,
-                "Variance": variance_out,
-            },
-            attrs={"epsilon": self._epsilon,
-                   "groups": self._num_groups})
+        self._helper.append_op(type="group_norm",
+                               inputs=inputs,
+                               outputs={
+                                   "Y": group_norm_out,
+                                   "Mean": mean_out,
+                                   "Variance": variance_out,
+                               },
+                               attrs={
+                                   "epsilon": self._epsilon,
+                                   "groups": self._num_groups
+                               })
 
         return self._helper.append_activation(group_norm_out, None)
 
@@ -543,16 +546,16 @@ def __init__(self,
         if bias_attr is False:
             self.bias = None
         else:
-            self.bias = self.create_parameter(
-                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            self.bias = self.create_parameter(attr=self._bias_attr,
+                                              shape=param_shape,
+                                              is_bias=True)
 
     def forward(self, input):
-        return layer_norm(
-            input,
-            normalized_shape=self._normalized_shape,
-            weight=self.weight,
-            bias=self.bias,
-            epsilon=self._epsilon)
+        return layer_norm(input,
+                          normalized_shape=self._normalized_shape,
+                          weight=self.weight,
+                          bias=self.bias,
+                          epsilon=self._epsilon)
 
     def extra_repr(self):
         return 'normalized_shape={}, epsilon={}'.format(self._normalized_shape,
@@ -603,19 +606,17 @@ def __init__(self,
             self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
 
         if bias_attr == False:
-            self.bias = self.create_parameter(
-                attr=None,
-                shape=param_shape,
-                dtype=self._dtype,
-                default_initializer=Constant(0.0),
-                is_bias=True)
+            self.bias = self.create_parameter(attr=None,
+                                              shape=param_shape,
+                                              dtype=self._dtype,
+                                              default_initializer=Constant(0.0),
+                                              is_bias=True)
             self.bias.stop_gradient = True
         else:
-            self.bias = self.create_parameter(
-                attr=self._bias_attr,
-                shape=param_shape,
-                dtype=self._dtype,
-                is_bias=True)
+            self.bias = self.create_parameter(attr=self._bias_attr,
+                                              shape=param_shape,
+                                              dtype=self._dtype,
+                                              is_bias=True)
             self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0.
 
         moving_mean_name = None
@@ -625,24 +626,22 @@ def __init__(self,
             moving_mean_name = name + "_mean"
             moving_variance_name = name + "_variance"
 
-        self._mean = self.create_parameter(
-            dtype=self._dtype,
-            attr=ParamAttr(
-                name=moving_mean_name,
-                initializer=Constant(0.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape)
+        self._mean = self.create_parameter(dtype=self._dtype,
+                                           attr=ParamAttr(
+                                               name=moving_mean_name,
+                                               initializer=Constant(0.0),
+                                               trainable=False,
+                                               do_model_average=True),
+                                           shape=param_shape)
         self._mean.stop_gradient = True
 
-        self._variance = self.create_parameter(
-            dtype=self._dtype,
-            attr=ParamAttr(
-                name=moving_variance_name,
-                initializer=Constant(1.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape)
+        self._variance = self.create_parameter(dtype=self._dtype,
+                                               attr=ParamAttr(
+                                                   name=moving_variance_name,
+                                                   initializer=Constant(1.0),
+                                                   trainable=False,
+                                                   do_model_average=True),
+                                               shape=param_shape)
         self._variance.stop_gradient = True
 
         self._data_format = data_format
@@ -668,17 +667,16 @@ def forward(self, input):
             warnings.warn(
                 "When training, we now always track global mean and variance.")
 
-        return batch_norm(
-            input,
-            self._mean,
-            self._variance,
-            weight=self.weight,
-            bias=self.bias,
-            training=self.training,
-            momentum=self._momentum,
-            epsilon=self._epsilon,
-            data_format=self._data_format,
-            use_global_stats=self._use_global_stats)
+        return batch_norm(input,
+                          self._mean,
+                          self._variance,
+                          weight=self.weight,
+                          bias=self.bias,
+                          training=self.training,
+                          momentum=self._momentum,
+                          epsilon=self._epsilon,
+                          data_format=self._data_format,
+                          use_global_stats=self._use_global_stats)
 
     def extra_repr(self):
         main_str = 'num_features={}, momentum={}, epsilon={}'.format(
@@ -1151,8 +1149,10 @@ def forward(self, x):
             "SavedVariance": [saved_variance]
         }
 
-        self._helper.append_op(
-            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+        self._helper.append_op(type="sync_batch_norm",
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=attrs)
         return sync_batch_norm_out
 
     @classmethod
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index c664c6e318c46..990d0b6107864 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -204,16 +204,15 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        return F.avg_pool2d(
-            x,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            ceil_mode=self.ceil_mode,
-            exclusive=self.exclusive,
-            divisor_override=self.divisor,
-            data_format=self.data_format,
-            name=self.name)
+        return F.avg_pool2d(x,
+                            kernel_size=self.ksize,
+                            stride=self.stride,
+                            padding=self.padding,
+                            ceil_mode=self.ceil_mode,
+                            exclusive=self.exclusive,
+                            divisor_override=self.divisor,
+                            data_format=self.data_format,
+                            name=self.name)
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
@@ -302,16 +301,15 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        return F.avg_pool3d(
-            x,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            ceil_mode=self.ceil_mode,
-            exclusive=self.exclusive,
-            divisor_override=self.divisor,
-            data_format=self.data_format,
-            name=self.name)
+        return F.avg_pool3d(x,
+                            kernel_size=self.ksize,
+                            stride=self.stride,
+                            padding=self.padding,
+                            ceil_mode=self.ceil_mode,
+                            exclusive=self.exclusive,
+                            divisor_override=self.divisor,
+                            data_format=self.data_format,
+                            name=self.name)
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
@@ -507,15 +505,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        return F.max_pool2d(
-            x,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            return_mask=self.return_mask,
-            ceil_mode=self.ceil_mode,
-            data_format=self.data_format,
-            name=self.name)
+        return F.max_pool2d(x,
+                            kernel_size=self.ksize,
+                            stride=self.stride,
+                            padding=self.padding,
+                            return_mask=self.return_mask,
+                            ceil_mode=self.ceil_mode,
+                            data_format=self.data_format,
+                            name=self.name)
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
@@ -605,15 +602,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x):
-        return F.max_pool3d(
-            x,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            return_mask=self.return_mask,
-            ceil_mode=self.ceil_mode,
-            data_format=self.data_format,
-            name=self.name)
+        return F.max_pool3d(x,
+                            kernel_size=self.ksize,
+                            stride=self.stride,
+                            padding=self.padding,
+                            return_mask=self.return_mask,
+                            ceil_mode=self.ceil_mode,
+                            data_format=self.data_format,
+                            name=self.name)
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
@@ -769,11 +765,10 @@ def __init__(self, output_size, data_format="NCHW", name=None):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_avg_pool2d(
-            x,
-            output_size=self._output_size,
-            data_format=self._data_format,
-            name=self._name)
+        return F.adaptive_avg_pool2d(x,
+                                     output_size=self._output_size,
+                                     data_format=self._data_format,
+                                     name=self._name)
 
     def extra_repr(self):
         return 'output_size={}'.format(self._output_size)
@@ -862,11 +857,10 @@ def __init__(self, output_size, data_format="NCDHW", name=None):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_avg_pool3d(
-            x,
-            output_size=self._output_size,
-            data_format=self._data_format,
-            name=self._name)
+        return F.adaptive_avg_pool3d(x,
+                                     output_size=self._output_size,
+                                     data_format=self._data_format,
+                                     name=self._name)
 
     def extra_repr(self):
         return 'output_size={}'.format(self._output_size)
@@ -1026,11 +1020,10 @@ def __init__(self, output_size, return_mask=False, name=None):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_max_pool2d(
-            x,
-            output_size=self._output_size,
-            return_mask=self._return_mask,
-            name=self._name)
+        return F.adaptive_max_pool2d(x,
+                                     output_size=self._output_size,
+                                     return_mask=self._return_mask,
+                                     name=self._name)
 
     def extra_repr(self):
         return 'output_size={}, return_mask={}'.format(self._output_size,
@@ -1119,11 +1112,10 @@ def __init__(self, output_size, return_mask=False, name=None):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_max_pool3d(
-            x,
-            output_size=self._output_size,
-            return_mask=self._return_mask,
-            name=self._name)
+        return F.adaptive_max_pool3d(x,
+                                     output_size=self._output_size,
+                                     return_mask=self._return_mask,
+                                     name=self._name)
 
     def extra_repr(self):
         return 'output_size={}, return_mask={}'.format(self._output_size,
@@ -1198,15 +1190,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x, indices):
-        return F.max_unpool1d(
-            x,
-            indices,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            data_format=self.data_format,
-            output_size=self.output_size,
-            name=self.name)
+        return F.max_unpool1d(x,
+                              indices,
+                              kernel_size=self.ksize,
+                              stride=self.stride,
+                              padding=self.padding,
+                              data_format=self.data_format,
+                              output_size=self.output_size,
+                              name=self.name)
 
     def extra_repr(self):
         return 'output_size={}'.format(self.output_size)
@@ -1283,15 +1274,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x, indices):
-        return F.max_unpool2d(
-            x,
-            indices,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            data_format=self.data_format,
-            output_size=self.output_size,
-            name=self.name)
+        return F.max_unpool2d(x,
+                              indices,
+                              kernel_size=self.ksize,
+                              stride=self.stride,
+                              padding=self.padding,
+                              data_format=self.data_format,
+                              output_size=self.output_size,
+                              name=self.name)
 
     def extra_repr(self):
         return 'output_size={}'.format(self.output_size)
@@ -1372,15 +1362,14 @@ def __init__(self,
         self.name = name
 
     def forward(self, x, indices):
-        return F.max_unpool3d(
-            x,
-            indices,
-            kernel_size=self.ksize,
-            stride=self.stride,
-            padding=self.padding,
-            data_format=self.data_format,
-            output_size=self.output_size,
-            name=self.name)
+        return F.max_unpool3d(x,
+                              indices,
+                              kernel_size=self.ksize,
+                              stride=self.stride,
+                              padding=self.padding,
+                              data_format=self.data_format,
+                              output_size=self.output_size,
+                              name=self.name)
 
     def extra_repr(self):
         return 'output_size={}'.format(self.output_size)
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 461ac03899e07..53dfad4106de0 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -190,7 +190,8 @@ def _is_shape_sequence(seq):
             if sys.version_info < (3, ):
                 integer_types = (
                     int,
-                    long, )
+                    long,
+                )
             else:
                 integer_types = (int, )
             """For shape, list/tuple of integer is the finest-grained objection"""
@@ -201,10 +202,11 @@ def _is_shape_sequence(seq):
             # TODO: Add check for the illegal
             if isinstance(seq, dict):
                 return True
-            return (isinstance(seq, Sequence) and
-                    not isinstance(seq, six.string_types))
+            return (isinstance(seq, Sequence)
+                    and not isinstance(seq, six.string_types))
 
         class Shape(object):
+
             def __init__(self, shape):
                 self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
 
@@ -225,12 +227,13 @@ def __init__(self, shape):
             states_dtypes = map_structure(lambda shape: dtype, states_shapes)
 
         init_states = map_structure(
-            lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like(
-                input=batch_ref,
-                shape=shape.shape,
-                dtype=dtype,
-                value=init_value,
-                input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
+            lambda shape, dtype: paddle.fluid.layers.
+            fill_constant_batch_size_like(input=batch_ref,
+                                          shape=shape.shape,
+                                          dtype=dtype,
+                                          value=init_value,
+                                          input_dim_idx=batch_dim_idx),
+            states_shapes, states_dtypes)
         return init_states
 
     @property
@@ -343,8 +346,8 @@ def __init__(self,
         super(SimpleRNNCell, self).__init__()
         if hidden_size <= 0:
             raise ValueError(
-                "hidden_size of {} must be greater than 0, but now equals to {}".
-                format(self.__class__.__name__, hidden_size))
+                "hidden_size of {} must be greater than 0, but now equals to {}"
+                .format(self.__class__.__name__, hidden_size))
         std = 1.0 / math.sqrt(hidden_size)
         self.weight_ih = self.create_parameter(
             (hidden_size, input_size),
@@ -495,8 +498,8 @@ def __init__(self,
         super(LSTMCell, self).__init__()
         if hidden_size <= 0:
             raise ValueError(
-                "hidden_size of {} must be greater than 0, but now equals to {}".
-                format(self.__class__.__name__, hidden_size))
+                "hidden_size of {} must be greater than 0, but now equals to {}"
+                .format(self.__class__.__name__, hidden_size))
         std = 1.0 / math.sqrt(hidden_size)
         self.weight_ih = self.create_parameter(
             (4 * hidden_size, input_size),
@@ -646,8 +649,8 @@ def __init__(self,
         super(GRUCell, self).__init__()
         if hidden_size <= 0:
             raise ValueError(
-                "hidden_size of {} must be greater than 0, but now equals to {}".
-                format(self.__class__.__name__, hidden_size))
+                "hidden_size of {} must be greater than 0, but now equals to {}"
+                .format(self.__class__.__name__, hidden_size))
         std = 1.0 / math.sqrt(hidden_size)
         self.weight_ih = self.create_parameter(
             (3 * hidden_size, input_size),
@@ -971,10 +974,9 @@ def flatten_parameters(self):
             # add both to main_program and startup_program for static-graph.
             # Use Constant initializer to avoid make effect on random generator.
             self._flat_weight = [
-                self.create_parameter(
-                    shape=[np.sum(shape)],
-                    dtype=params[0].dtype,
-                    default_initializer=I.Constant(0.0))
+                self.create_parameter(shape=[np.sum(shape)],
+                                      dtype=params[0].dtype,
+                                      default_initializer=I.Constant(0.0))
             ]
             # dropout state may also can be hided and avoid saving
             # should dropout state be persistable for static-graph
@@ -991,18 +993,17 @@ def flatten_parameters(self):
             with program_guard(default_startup_program(),
                                default_startup_program()):
                 with paddle.no_grad():
-                    self._helper.append_op(
-                        type="coalesce_tensor",
-                        inputs={"Input": self._all_weights},
-                        outputs={
-                            "Output": self._all_weights,
-                            "FusedOutput": self._flat_weight
-                        },
-                        attrs={
-                            "copy_data": True,
-                            "use_align": False,
-                            "dtype": params[0].dtype
-                        })
+                    self._helper.append_op(type="coalesce_tensor",
+                                           inputs={"Input": self._all_weights},
+                                           outputs={
+                                               "Output": self._all_weights,
+                                               "FusedOutput": self._flat_weight
+                                           },
+                                           attrs={
+                                               "copy_data": True,
+                                               "use_align": False,
+                                               "dtype": params[0].dtype
+                                           })
 
     def _cudnn_impl(self, inputs, initial_states, sequence_length):
         if not self.time_major:
@@ -1048,8 +1049,10 @@ def _cudnn_impl(self, inputs, initial_states, sequence_length):
                 'DropoutState': self._dropout_state,
             }
 
-            self._helper.append_op(
-                type="rnn", inputs=inputs, outputs=outputs, attrs=attrs)
+            self._helper.append_op(type="rnn",
+                                   inputs=inputs,
+                                   outputs=outputs,
+                                   attrs=attrs)
 
         out = paddle.tensor.transpose(out,
                                       [1, 0, 2]) if not self.time_major else out
@@ -1070,9 +1073,8 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
             initial_states = [initial_states] if isinstance(
                 initial_states, paddle.static.Variable) else initial_states
 
-        if self.could_use_cudnn and (
-                not paddle.device.is_compiled_with_rocm() or
-                sequence_length is None):
+        if self.could_use_cudnn and (not paddle.device.is_compiled_with_rocm()
+                                     or sequence_length is None):
             # Add CPU kernel and dispatch in backend later
             return self._cudnn_impl(inputs, initial_states, sequence_length)
 
@@ -1082,11 +1084,10 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
 
         for i, rnn_layer in enumerate(self):
             if i > 0:
-                inputs = F.dropout(
-                    inputs,
-                    self.dropout,
-                    training=self.training,
-                    mode="upscale_in_train")
+                inputs = F.dropout(inputs,
+                                   self.dropout,
+                                   training=self.training,
+                                   mode="upscale_in_train")
             outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
             final_states.append(final_state)
             inputs = outputs
@@ -1211,9 +1212,10 @@ def __init__(self,
         else:
             raise ValueError("Unknown activation '{}'".format(activation))
         self.activation = activation
-        super(SimpleRNN, self).__init__(
-            mode, input_size, hidden_size, num_layers, direction, time_major,
-            dropout, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr)
+        super(SimpleRNN,
+              self).__init__(mode, input_size, hidden_size, num_layers,
+                             direction, time_major, dropout, weight_ih_attr,
+                             weight_hh_attr, bias_ih_attr, bias_hh_attr)
 
 
 class LSTM(RNNBase):
@@ -1325,9 +1327,10 @@ def __init__(self,
                  bias_ih_attr=None,
                  bias_hh_attr=None,
                  name=None):
-        super(LSTM, self).__init__(
-            "LSTM", input_size, hidden_size, num_layers, direction, time_major,
-            dropout, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr)
+        super(LSTM,
+              self).__init__("LSTM", input_size, hidden_size, num_layers,
+                             direction, time_major, dropout, weight_ih_attr,
+                             weight_hh_attr, bias_ih_attr, bias_hh_attr)
 
 
 class GRU(RNNBase):
@@ -1432,6 +1435,7 @@ def __init__(self,
                  bias_ih_attr=None,
                  bias_hh_attr=None,
                  name=None):
-        super(GRU, self).__init__(
-            "GRU", input_size, hidden_size, num_layers, direction, time_major,
-            dropout, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr)
+        super(GRU,
+              self).__init__("GRU", input_size, hidden_size, num_layers,
+                             direction, time_major, dropout, weight_ih_attr,
+                             weight_hh_attr, bias_ih_attr, bias_hh_attr)
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 340372f9b6a4e..35acaded2eaed 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -177,14 +177,22 @@ def __init__(self,
         self.head_dim = embed_dim // num_heads
         assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
 
-        self.q_proj = Linear(
-            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
-        self.k_proj = Linear(
-            self.kdim, embed_dim, weight_attr, bias_attr=bias_attr)
-        self.v_proj = Linear(
-            self.vdim, embed_dim, weight_attr, bias_attr=bias_attr)
-        self.out_proj = Linear(
-            embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
+        self.q_proj = Linear(embed_dim,
+                             embed_dim,
+                             weight_attr,
+                             bias_attr=bias_attr)
+        self.k_proj = Linear(self.kdim,
+                             embed_dim,
+                             weight_attr,
+                             bias_attr=bias_attr)
+        self.v_proj = Linear(self.vdim,
+                             embed_dim,
+                             weight_attr,
+                             bias_attr=bias_attr)
+        self.out_proj = Linear(embed_dim,
+                               embed_dim,
+                               weight_attr,
+                               bias_attr=bias_attr)
 
     def _prepare_qkv(self, query, key, value, cache=None):
         r"""
@@ -402,19 +410,19 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
             q, k, v, cache = self._prepare_qkv(query, key, value, cache)
 
         # scale dot product attention
-        product = paddle.matmul(
-            x=q * (self.head_dim**-0.5), y=k, transpose_y=True)
+        product = paddle.matmul(x=q * (self.head_dim**-0.5),
+                                y=k,
+                                transpose_y=True)
         if attn_mask is not None:
             # Support bool or int mask
             attn_mask = _convert_attention_mask(attn_mask, product.dtype)
             product = product + attn_mask
         weights = F.softmax(product)
         if self.dropout:
-            weights = F.dropout(
-                weights,
-                self.dropout,
-                training=self.training,
-                mode="upscale_in_train")
+            weights = F.dropout(weights,
+                                self.dropout,
+                                training=self.training,
+                                mode="upscale_in_train")
 
         out = tensor.matmul(weights, v)
 
@@ -522,17 +530,20 @@ def __init__(self,
         weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 2)
 
-        self.self_attn = MultiHeadAttention(
-            d_model,
-            nhead,
-            dropout=attn_dropout,
-            weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0])
-        self.linear1 = Linear(
-            d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])
+        self.self_attn = MultiHeadAttention(d_model,
+                                            nhead,
+                                            dropout=attn_dropout,
+                                            weight_attr=weight_attrs[0],
+                                            bias_attr=bias_attrs[0])
+        self.linear1 = Linear(d_model,
+                              dim_feedforward,
+                              weight_attrs[1],
+                              bias_attr=bias_attrs[1])
         self.dropout = Dropout(act_dropout, mode="upscale_in_train")
-        self.linear2 = Linear(
-            dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])
+        self.linear2 = Linear(dim_feedforward,
+                              d_model,
+                              weight_attrs[1],
+                              bias_attr=bias_attrs[1])
         self.norm1 = LayerNorm(d_model)
         self.norm2 = LayerNorm(d_model)
         self.dropout1 = Dropout(dropout, mode="upscale_in_train")
@@ -613,8 +624,8 @@ def gen_cache(self, src):
                 `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
                 for more details.
         """
-        incremental_cache = self.self_attn.gen_cache(
-            src, type=self.self_attn.Cache)
+        incremental_cache = self.self_attn.gen_cache(src,
+                                                     type=self.self_attn.Cache)
         return incremental_cache
 
 
@@ -648,9 +659,10 @@ class TransformerEncoder(Layer):
 
     def __init__(self, encoder_layer, num_layers, norm=None):
         super(TransformerEncoder, self).__init__()
-        self.layers = LayerList([(encoder_layer if i == 0 else
-                                  type(encoder_layer)(**encoder_layer._config))
-                                 for i in range(num_layers)])
+        self.layers = LayerList([
+            (encoder_layer if i == 0 else type(encoder_layer)(
+                **encoder_layer._config)) for i in range(num_layers)
+        ])
         self.num_layers = num_layers
         self.norm = norm
 
@@ -827,23 +839,25 @@ def __init__(self,
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
 
-        self.self_attn = MultiHeadAttention(
-            d_model,
-            nhead,
-            dropout=attn_dropout,
-            weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0])
-        self.cross_attn = MultiHeadAttention(
-            d_model,
-            nhead,
-            dropout=attn_dropout,
-            weight_attr=weight_attrs[1],
-            bias_attr=bias_attrs[1])
-        self.linear1 = Linear(
-            d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2])
+        self.self_attn = MultiHeadAttention(d_model,
+                                            nhead,
+                                            dropout=attn_dropout,
+                                            weight_attr=weight_attrs[0],
+                                            bias_attr=bias_attrs[0])
+        self.cross_attn = MultiHeadAttention(d_model,
+                                             nhead,
+                                             dropout=attn_dropout,
+                                             weight_attr=weight_attrs[1],
+                                             bias_attr=bias_attrs[1])
+        self.linear1 = Linear(d_model,
+                              dim_feedforward,
+                              weight_attrs[2],
+                              bias_attr=bias_attrs[2])
         self.dropout = Dropout(act_dropout, mode="upscale_in_train")
-        self.linear2 = Linear(
-            dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2])
+        self.linear2 = Linear(dim_feedforward,
+                              d_model,
+                              weight_attrs[2],
+                              bias_attr=bias_attrs[2])
         self.norm1 = LayerNorm(d_model)
         self.norm2 = LayerNorm(d_model)
         self.norm3 = LayerNorm(d_model)
@@ -958,8 +972,8 @@ def gen_cache(self, memory):
                 See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \
                 for more details.
         """
-        incremental_cache = self.self_attn.gen_cache(
-            memory, type=self.self_attn.Cache)
+        incremental_cache = self.self_attn.gen_cache(memory,
+                                                     type=self.self_attn.Cache)
         static_cache = self.cross_attn.gen_cache(
             memory, memory, type=self.cross_attn.StaticCache)
         return incremental_cache, static_cache
@@ -1002,9 +1016,10 @@ class TransformerDecoder(Layer):
 
     def __init__(self, decoder_layer, num_layers, norm=None):
         super(TransformerDecoder, self).__init__()
-        self.layers = LayerList([(decoder_layer if i == 0 else
-                                  type(decoder_layer)(**decoder_layer._config))
-                                 for i in range(num_layers)])
+        self.layers = LayerList([
+            (decoder_layer if i == 0 else type(decoder_layer)(
+                **decoder_layer._config)) for i in range(num_layers)
+        ])
         self.num_layers = num_layers
         self.norm = norm
 
@@ -1344,8 +1359,10 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
 
         tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
         memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
-        output = self.decoder(
-            tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)
+        output = self.decoder(tgt,
+                              memory,
+                              tgt_mask=tgt_mask,
+                              memory_mask=memory_mask)
         return output
 
     def generate_square_subsequent_mask(self, length):
@@ -1379,7 +1396,5 @@ def generate_square_subsequent_mask(self, length):
                 # [  0.   0.   0.   0.   0.]]
 
         """
-        return paddle.tensor.triu(
-            (paddle.ones(
-                (length, length), dtype=paddle.get_default_dtype()) * -np.inf),
-            1)
+        return paddle.tensor.triu((paddle.ones(
+            (length, length), dtype=paddle.get_default_dtype()) * -np.inf), 1)
diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py
index 6d5c112d75703..2fa150dcbdfbc 100644
--- a/python/paddle/nn/layer/vision.py
+++ b/python/paddle/nn/layer/vision.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define specitial functions used in computer vision task 
+# TODO: define special functions used in computer vision task
 
 from .. import Layer
 from .. import functional
diff --git a/python/paddle/nn/quant/functional_layers.py b/python/paddle/nn/quant/functional_layers.py
index 2c0eb88e0875c..ca1eb5f4fb3c1 100644
--- a/python/paddle/nn/quant/functional_layers.py
+++ b/python/paddle/nn/quant/functional_layers.py
@@ -19,11 +19,13 @@
 
 
 class FloatFunctionalLayer(Layer):
+
     def __init__(self):
         super(FloatFunctionalLayer, self).__init__()
 
 
 class add(FloatFunctionalLayer):
+
     def __init__(self):
         super(add, self).__init__()
 
@@ -32,6 +34,7 @@ def forward(self, x, y, name=None):
 
 
 class subtract(FloatFunctionalLayer):
+
     def __init__(self):
         super(subtract, self).__init__()
 
@@ -40,6 +43,7 @@ def forward(self, x, y, name=None):
 
 
 class multiply(FloatFunctionalLayer):
+
     def __init__(self):
         super(multiply, self).__init__()
 
@@ -48,6 +52,7 @@ def forward(self, x, y, name=None):
 
 
 class divide(FloatFunctionalLayer):
+
     def __init__(self):
         super(divide, self).__init__()
 
@@ -56,6 +61,7 @@ def forward(self, x, y, name=None):
 
 
 class reshape(FloatFunctionalLayer):
+
     def __init__(self):
         super(reshape, self).__init__()
 
@@ -64,6 +70,7 @@ def forward(self, x, shape, name=None):
 
 
 class transpose(FloatFunctionalLayer):
+
     def __init__(self):
         super(transpose, self).__init__()
 
@@ -72,6 +79,7 @@ def forward(self, x, perm, name=None):
 
 
 class concat(FloatFunctionalLayer):
+
     def __init__(self):
         super(concat, self).__init__()
 
@@ -80,6 +88,7 @@ def forward(self, x, axis=0, name=None):
 
 
 class flatten(FloatFunctionalLayer):
+
     def __init__(self):
         super(flatten, self).__init__()
 
diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py
index 8e9316a19623b..62fe8087c4fdb 100644
--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
@@ -39,8 +39,9 @@
     'QuantStub',
 ]
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 class FakeQuantAbsMax(Layer):
@@ -65,12 +66,12 @@ def __init__(self,
             name) if name else 'quant_dequant.scale'
         self._scale_name = unique_name.generate(scale_prefix)
         if quant_on_weight:
-            scale_attr = ParamAttr(
-                name=self._scale_name,
-                initializer=Constant(0.001),
-                trainable=False)
-            self._scale = self.create_parameter(
-                shape=[1], attr=scale_attr, dtype=self._dtype)
+            scale_attr = ParamAttr(name=self._scale_name,
+                                   initializer=Constant(0.001),
+                                   trainable=False)
+            self._scale = self.create_parameter(shape=[1],
+                                                attr=scale_attr,
+                                                dtype=self._dtype)
             self._scale.stop_gradient = True
         else:
             self._scale = None
@@ -78,12 +79,12 @@ def __init__(self,
     def forward(self, input):
         if in_dynamic_mode():
             attrs = ('bit_length', self._quant_bits)
-            quant_out = _varbase_creator(
-                type=input.type,
-                name="{}.quantized.dequantized".format(input.name),
-                shape=input.shape,
-                dtype=input.dtype,
-                persistable=False)
+            quant_out = _varbase_creator(type=input.type,
+                                         name="{}.quantized.dequantized".format(
+                                             input.name),
+                                         shape=input.shape,
+                                         dtype=input.dtype,
+                                         persistable=False)
             out_scale = self._scale
             if not out_scale:
                 out_scale = _varbase_creator(
@@ -93,8 +94,8 @@ def forward(self, input):
                     dtype=self._dtype,
                     persistable=False)
                 out_scale.stop_gradient = True
-            out, _, = _C_ops.fake_quantize_dequantize_abs_max(input, quant_out,
-                                                              out_scale, *attrs)
+            out, _, = _C_ops.fake_quantize_dequantize_abs_max(
+                input, quant_out, out_scale, *attrs)
             return out
 
         check_variable_and_dtype(input, 'input', ['float32'], "FakeQuantAbsMax")
@@ -116,11 +117,10 @@ def forward(self, input):
                 stop_gradient=True)
         outputs = {"Out": [quant_out], "OutScale": [out_scale]}
 
-        self._helper.append_op(
-            type="fake_quantize_dequantize_abs_max",
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs)
+        self._helper.append_op(type="fake_quantize_dequantize_abs_max",
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=attrs)
 
         return quant_out
 
@@ -146,44 +146,44 @@ def __init__(self,
 
         scale_prefix = "{}.scale".format(
             name) if name else 'quant_dequant.scale'
-        scale_attr = ParamAttr(
-            name=unique_name.generate(scale_prefix),
-            initializer=Constant(0.001),
-            trainable=False)
-        self._scale = self.create_parameter(
-            shape=[1], attr=scale_attr, dtype=dtype)
+        scale_attr = ParamAttr(name=unique_name.generate(scale_prefix),
+                               initializer=Constant(0.001),
+                               trainable=False)
+        self._scale = self.create_parameter(shape=[1],
+                                            attr=scale_attr,
+                                            dtype=dtype)
         self._scale.stop_gradient = True
 
         state_prefix = "{}.state".format(
             name) if name else 'quant_dequant.state'
-        state_attr = ParamAttr(
-            name=unique_name.generate(state_prefix),
-            initializer=Constant(1),
-            trainable=False)
-        self._state = self.create_parameter(
-            shape=[1], attr=state_attr, dtype=dtype)
+        state_attr = ParamAttr(name=unique_name.generate(state_prefix),
+                               initializer=Constant(1),
+                               trainable=False)
+        self._state = self.create_parameter(shape=[1],
+                                            attr=state_attr,
+                                            dtype=dtype)
         self._state.stop_gradient = True
 
         accum_prefix = "{}.accum".format(
             name) if name else 'quant_dequant.accum'
-        accum_attr = ParamAttr(
-            name=unique_name.generate(accum_prefix),
-            initializer=Constant(1),
-            trainable=False)
-        self._accum = self.create_parameter(
-            shape=[1], attr=accum_attr, dtype=dtype)
+        accum_attr = ParamAttr(name=unique_name.generate(accum_prefix),
+                               initializer=Constant(1),
+                               trainable=False)
+        self._accum = self.create_parameter(shape=[1],
+                                            attr=accum_attr,
+                                            dtype=dtype)
         self._accum.stop_gradient = True
 
     def forward(self, input):
         if in_dynamic_mode():
             attrs = ('moving_rate', self._moving_rate, 'bit_length',
                      self._quant_bits, 'is_test', not self.training)
-            quant_out = _varbase_creator(
-                type=input.type,
-                name="{}.quantized.dequantized".format(input.name),
-                shape=input.shape,
-                dtype=input.dtype,
-                persistable=False)
+            quant_out = _varbase_creator(type=input.type,
+                                         name="{}.quantized.dequantized".format(
+                                             input.name),
+                                         shape=input.shape,
+                                         dtype=input.dtype,
+                                         persistable=False)
             state = self._state if self.training else None
             accum = self._accum if self.training else None
 
@@ -224,6 +224,7 @@ def forward(self, input):
 
 
 class FakeQuantChannelWiseAbsMax(Layer):
+
     def __init__(self,
                  name=None,
                  channel_num=None,
@@ -242,12 +243,12 @@ def __init__(self,
             name) if name else 'quant_dequant.scale'
         self._scale_name = unique_name.generate(scale_prefix)
         if quant_on_weight:
-            scale_attr = ParamAttr(
-                name=self._scale_name,
-                initializer=Constant(0.0),
-                trainable=False)
-            self._scale = self.create_parameter(
-                shape=[self._channel_num], attr=scale_attr, dtype=self._dtype)
+            scale_attr = ParamAttr(name=self._scale_name,
+                                   initializer=Constant(0.0),
+                                   trainable=False)
+            self._scale = self.create_parameter(shape=[self._channel_num],
+                                                attr=scale_attr,
+                                                dtype=self._dtype)
             self._scale.stop_gradient = True
         else:
             self._scale = None
@@ -256,12 +257,12 @@ def forward(self, input):
         if in_dynamic_mode():
             attrs = ('bit_length', self._quant_bits, 'quant_axis',
                      self._quant_axis)
-            quant_out = _varbase_creator(
-                type=input.type,
-                name="{}.quantized.dequantized".format(input.name),
-                shape=input.shape,
-                dtype=input.dtype,
-                persistable=False)
+            quant_out = _varbase_creator(type=input.type,
+                                         name="{}.quantized.dequantized".format(
+                                             input.name),
+                                         shape=input.shape,
+                                         dtype=input.dtype,
+                                         persistable=False)
 
             out_scale = self._scale
             if out_scale is None:
@@ -307,6 +308,7 @@ def forward(self, input):
 
 
 class MovingAverageAbsMaxScale(Layer):
+
     def __init__(self, name=None, moving_rate=0.9, dtype='float32'):
         r"""
         MovingAverageMaxScale layer is used to calculating the output quantization
@@ -320,28 +322,30 @@ def __init__(self, name=None, moving_rate=0.9, dtype='float32'):
 
         scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
         scale_name = unique_name.generate(scale_prefix)
-        scale_attr = ParamAttr(
-            name=scale_name, initializer=Constant(0), trainable=False)
-        self._scale = self.create_parameter(
-            shape=[1], attr=scale_attr, dtype=dtype)
+        scale_attr = ParamAttr(name=scale_name,
+                               initializer=Constant(0),
+                               trainable=False)
+        self._scale = self.create_parameter(shape=[1],
+                                            attr=scale_attr,
+                                            dtype=dtype)
         self._scale.stop_gradient = True
 
         state_prefix = "{}.state".format(name) if name else 'outscale.state'
-        state_attr = ParamAttr(
-            name=unique_name.generate(state_prefix),
-            initializer=Constant(0),
-            trainable=False)
-        self._state = self.create_parameter(
-            shape=[1], attr=state_attr, dtype=dtype)
+        state_attr = ParamAttr(name=unique_name.generate(state_prefix),
+                               initializer=Constant(0),
+                               trainable=False)
+        self._state = self.create_parameter(shape=[1],
+                                            attr=state_attr,
+                                            dtype=dtype)
         self._state.stop_gradient = True
 
         accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
-        accum_attr = ParamAttr(
-            name=unique_name.generate(accum_prefix),
-            initializer=Constant(0),
-            trainable=False)
-        self._accum = self.create_parameter(
-            shape=[1], attr=accum_attr, dtype=dtype)
+        accum_attr = ParamAttr(name=unique_name.generate(accum_prefix),
+                               initializer=Constant(0),
+                               trainable=False)
+        self._accum = self.create_parameter(shape=[1],
+                                            attr=accum_attr,
+                                            dtype=dtype)
         self._accum.stop_gradient = True
 
     def forward(self, input):
@@ -350,12 +354,11 @@ def forward(self, input):
                      not self.training)
             state = self._state if self.training else None
             accum = self._accum if self.training else None
-            quant_out = _varbase_creator(
-                type=input.type,
-                name="{}.tmp".format(input.name),
-                shape=input.shape,
-                dtype=input.dtype,
-                persistable=False)
+            quant_out = _varbase_creator(type=input.type,
+                                         name="{}.tmp".format(input.name),
+                                         shape=input.shape,
+                                         dtype=input.dtype,
+                                         persistable=False)
 
             out, _, _, _ = _C_ops.moving_average_abs_max_scale(
                 input, accum, state, quant_out, self._scale, state, accum,
@@ -381,11 +384,10 @@ def forward(self, input):
             outputs['OutState'] = [self._state]
             outputs['OutAccum'] = [self._accum]
 
-        self._helper.append_op(
-            type="moving_average_abs_max_scale",
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs)
+        self._helper.append_op(type="moving_average_abs_max_scale",
+                               inputs=inputs,
+                               outputs=outputs,
+                               attrs=attrs)
 
         return quant_out
 
@@ -471,15 +473,14 @@ def forward(self, input):
                                 data_format=self._data_format)
             self._padding = 0
 
-        return F.conv2d(
-            quant_input,
-            quant_weight,
-            bias=self.bias,
-            padding=self._padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            data_format=self._data_format)
+        return F.conv2d(quant_input,
+                        quant_weight,
+                        bias=self.bias,
+                        padding=self._padding,
+                        stride=self._stride,
+                        dilation=self._dilation,
+                        groups=self._groups,
+                        data_format=self._data_format)
 
 
 class QuantizedConv2DTranspose(Layer):
@@ -575,17 +576,16 @@ def forward(self, input, output_size=None):
         else:
             output_padding = 0
 
-        return F.conv2d_transpose(
-            quant_input,
-            quant_weight,
-            bias=self.bias,
-            padding=self._padding,
-            output_padding=output_padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            output_size=output_size,
-            data_format=self._data_format)
+        return F.conv2d_transpose(quant_input,
+                                  quant_weight,
+                                  bias=self.bias,
+                                  padding=self._padding,
+                                  output_padding=output_padding,
+                                  stride=self._stride,
+                                  dilation=self._dilation,
+                                  groups=self._groups,
+                                  output_size=output_size,
+                                  data_format=self._data_format)
 
 
 class QuantizedLinear(Layer):
@@ -652,8 +652,10 @@ def forward(self, input):
             weight = self._weight_preprocess(self.weight)
         quant_weight = self._fake_quant_weight(weight)
 
-        out = F.linear(
-            x=quant_input, weight=quant_weight, bias=self.bias, name=self.name)
+        out = F.linear(x=quant_input,
+                       weight=quant_weight,
+                       bias=self.bias,
+                       name=self.name)
         return out
 
 
@@ -677,8 +679,8 @@ def __init__(self, layer=None, moving_rate=0.9, name=None, dtype='float32'):
     def forward(self, *inputs, **kwargs):
         out = self._layer(*inputs, **kwargs)
         # TODO (jc): support the ops of several outputs
-        if (isinstance(out, list) or isinstance(out, tuple) or
-                isinstance(out, dict)):
+        if (isinstance(out, list) or isinstance(out, tuple)
+                or isinstance(out, dict)):
             return out
         else:
             return self._ma_output_scale(out)
diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py
index 8ec4e8cfd60b5..5afdaa8d84896 100644
--- a/python/paddle/nn/utils/__init__.py
+++ b/python/paddle/nn/utils/__init__.py
@@ -17,5 +17,6 @@
 from .transform_parameters import parameters_to_vector, vector_to_parameters, _stride_column  # noqa: F401
 
 __all__ = [  #noqa
-    'weight_norm', 'remove_weight_norm', 'spectral_norm', 'parameters_to_vector', 'vector_to_parameters'
+    'weight_norm', 'remove_weight_norm', 'spectral_norm',
+    'parameters_to_vector', 'vector_to_parameters'
 ]
diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py
index 56c9e83c38b06..375fe9013b830 100644
--- a/python/paddle/nn/utils/spectral_norm_hook.py
+++ b/python/paddle/nn/utils/spectral_norm_hook.py
@@ -30,13 +30,14 @@ def normal_(x, mean=0., std=1.):
 
 
 class SpectralNorm(object):
+
     def __init__(self, name='weight', n_power_iterations=1, dim=0, eps=1e-12):
         self.name = name
         self.dim = dim
         if n_power_iterations <= 0:
-            raise ValueError('Expected n_power_iterations to be positive, but '
-                             'got n_power_iterations={}'.format(
-                                 n_power_iterations))
+            raise ValueError(
+                'Expected n_power_iterations to be positive, but '
+                'got n_power_iterations={}'.format(n_power_iterations))
         self.n_power_iterations = n_power_iterations
         self.eps = eps
 
@@ -44,9 +45,9 @@ def reshape_weight_to_matrix(self, weight):
         weight_mat = weight
         if self.dim != 0:
             # transpose dim to front
-            weight_mat = weight_mat.transpose([self.dim] + [
-                d for d in range(weight_mat.dim()) if d != self.dim
-            ])
+            weight_mat = weight_mat.transpose(
+                [self.dim] +
+                [d for d in range(weight_mat.dim()) if d != self.dim])
 
         height = weight_mat.shape[0]
 
@@ -63,19 +64,20 @@ def compute_weight(self, layer, do_power_iteration):
                 for _ in range(self.n_power_iterations):
                     v.set_value(
                         F.normalize(
-                            paddle.matmul(
-                                weight_mat,
-                                u,
-                                transpose_x=True,
-                                transpose_y=False),
+                            paddle.matmul(weight_mat,
+                                          u,
+                                          transpose_x=True,
+                                          transpose_y=False),
                             axis=0,
-                            epsilon=self.eps, ))
+                            epsilon=self.eps,
+                        ))
 
                     u.set_value(
                         F.normalize(
                             paddle.matmul(weight_mat, v),
                             axis=0,
-                            epsilon=self.eps, ))
+                            epsilon=self.eps,
+                        ))
                 if self.n_power_iterations > 0:
                     u = u.clone()
                     v = v.clone()
@@ -85,11 +87,8 @@ def compute_weight(self, layer, do_power_iteration):
         return weight
 
     def __call__(self, layer, inputs):
-        setattr(
-            layer,
-            self.name,
-            self.compute_weight(
-                layer, do_power_iteration=layer.training))
+        setattr(layer, self.name,
+                self.compute_weight(layer, do_power_iteration=layer.training))
 
     @staticmethod
     def apply(layer, name, n_power_iterations, dim, eps):
@@ -201,8 +200,9 @@ def spectral_norm(layer,
     """
 
     if dim is None:
-        if isinstance(layer, (Conv1DTranspose, Conv2DTranspose, Conv3DTranspose,
-                              Linear)):
+        if isinstance(
+                layer,
+            (Conv1DTranspose, Conv2DTranspose, Conv3DTranspose, Linear)):
             dim = 1
         else:
             dim = 0
diff --git a/python/paddle/nn/utils/transform_parameters.py b/python/paddle/nn/utils/transform_parameters.py
index feb70e02d5988..36b0dcdf507e4 100644
--- a/python/paddle/nn/utils/transform_parameters.py
+++ b/python/paddle/nn/utils/transform_parameters.py
@@ -27,13 +27,14 @@ def _inplace_reshape_dygraph(x, shape):
             tmp_out, _ = _C_ops.reshape2(x, None, 'shape', shape)
             tmp_out._share_underline_tensor_to(x)
     else:
-        _dygraph_tracer().trace_op(
-            type="reshape2",
-            inputs={'X': x},
-            outputs={'Out': x,
-                     'XShape': x_shape},
-            attrs={'shape': shape},
-            stop_gradient=True)
+        _dygraph_tracer().trace_op(type="reshape2",
+                                   inputs={'X': x},
+                                   outputs={
+                                       'Out': x,
+                                       'XShape': x_shape
+                                   },
+                                   attrs={'shape': shape},
+                                   stop_gradient=True)
 
 
 @dygraph_only
@@ -106,12 +107,11 @@ def parameters_to_vector(parameters, name=None):
             _C_ops.concat(parameters, tmp, 'axis', 0)
             tmp._share_underline_tensor_to(out)
     else:
-        _dygraph_tracer().trace_op(
-            type='concat',
-            inputs={'X': parameters},
-            outputs={'Out': [out]},
-            attrs={'axis': 0},
-            stop_gradient=True)
+        _dygraph_tracer().trace_op(type='concat',
+                                   inputs={'X': parameters},
+                                   outputs={'Out': [out]},
+                                   attrs={'axis': 0},
+                                   stop_gradient=True)
     for i, param in enumerate(parameters):
         _inplace_reshape_dygraph(param, origin_shapes[i])
     return out
@@ -160,13 +160,14 @@ def vector_to_parameters(vec, parameters, name=None):
             for i in range(0, len(res)):
                 res[i]._share_underline_tensor_to(parameters[i])
     else:
-        _dygraph_tracer().trace_op(
-            type='split',
-            inputs={'X': [vec]},
-            outputs={'Out': parameters},
-            attrs={'axis': 0,
-                   'sections': sections},
-            stop_gradient=True)
+        _dygraph_tracer().trace_op(type='split',
+                                   inputs={'X': [vec]},
+                                   outputs={'Out': parameters},
+                                   attrs={
+                                       'axis': 0,
+                                       'sections': sections
+                                   },
+                                   stop_gradient=True)
 
     for i, param in enumerate(parameters):
         _inplace_reshape_dygraph(param, origin_shapes[i])
diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py
index 84644ccc48445..c805d3949e861 100755
--- a/python/paddle/nn/utils/weight_norm_hook.py
+++ b/python/paddle/nn/utils/weight_norm_hook.py
@@ -30,15 +30,16 @@ def l2_norm(x, axis, epsilon=1e-12, name=None):
     helper = LayerHelper("l2_normalize", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     norm = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="norm",
-        inputs={"X": x},
-        outputs={"Out": out,
-                 "Norm": norm},
-        attrs={
-            "axis": 1 if axis is None else axis,
-            "epsilon": epsilon,
-        })
+    helper.append_op(type="norm",
+                     inputs={"X": x},
+                     outputs={
+                         "Out": out,
+                         "Norm": norm
+                     },
+                     attrs={
+                         "axis": 1 if axis is None else axis,
+                         "epsilon": epsilon,
+                     })
     return paddle.squeeze(norm, axis=[axis])
 
 
@@ -85,12 +86,14 @@ def _weight_norm(v, g, dim):
         v_normalized = F.l2_normalize(p_matrix, axis=1)
         v_normalized = paddle.reshape(v_normalized, transposed_shape)
         v_normalized = paddle.transpose(v_normalized, perm)
-    weight = F.elementwise_mul(
-        v_normalized, g, axis=dim if dim is not None else -1)
+    weight = F.elementwise_mul(v_normalized,
+                               g,
+                               axis=dim if dim is not None else -1)
     return weight
 
 
 class WeightNorm(object):
+
     def __init__(self, name, dim):
         if dim is None:
             dim = -1
diff --git a/python/paddle/onnx/export.py b/python/paddle/onnx/export.py
index b8a217a5134fb..666cd7c08623a 100644
--- a/python/paddle/onnx/export.py
+++ b/python/paddle/onnx/export.py
@@ -91,15 +91,14 @@ def export_logic():
 
     file_prefix = os.path.basename(path)
     if file_prefix == "":
-        raise ValueError("The input path MUST be format of dirname/file_prefix "
-                         "[dirname\\file_prefix in Windows system], but "
-                         "the file_prefix is empty in received path: {}".format(
-                             path))
+        raise ValueError(
+            "The input path MUST be format of dirname/file_prefix "
+            "[dirname\\file_prefix in Windows system], but "
+            "the file_prefix is empty in received path: {}".format(path))
     save_file = path + '.onnx'
 
-    p2o.dygraph2onnx(
-        layer,
-        save_file,
-        input_spec=input_spec,
-        opset_version=opset_version,
-        **configs)
+    p2o.dygraph2onnx(layer,
+                     save_file,
+                     input_spec=input_spec,
+                     opset_version=opset_version,
+                     **configs)
diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 07d2935bc7646..cd75fd4906ea5 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -24,15 +24,7 @@
 from .lamb import Lamb  # noqa: F401
 from . import lr  # noqa: F401
 
-__all__ = [     #noqa
-           'Optimizer',
-           'Adagrad',
-           'Adam',
-           'AdamW',
-           'Adamax',
-           'RMSProp',
-           'Adadelta',
-           'SGD',
-           'Momentum',
-           'Lamb'
+__all__ = [  #noqa
+    'Optimizer', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'RMSProp', 'Adadelta',
+    'SGD', 'Momentum', 'Lamb'
 ]
diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
index 32050c12ec147..ca1667139641b 100644
--- a/python/paddle/optimizer/adadelta.py
+++ b/python/paddle/optimizer/adadelta.py
@@ -120,12 +120,11 @@ def __init__(self,
             raise ValueError("epsilon is not set.")
         if rho is None:
             raise ValueError("rho is not set.")
-        super(Adadelta, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=weight_decay,
-            grad_clip=grad_clip,
-            name=name)
+        super(Adadelta, self).__init__(learning_rate=learning_rate,
+                                       parameters=parameters,
+                                       weight_decay=weight_decay,
+                                       grad_clip=grad_clip,
+                                       name=name)
         self.type = "adadelta"
         self._epsilon = epsilon
         self._rho = rho
@@ -157,22 +156,28 @@ def _append_optimize_op(self, block, param_and_grad):
             self._avg_squared_update_acc_str, param_and_grad[0])
 
         # Create the adadelta optimizer op
-        adadelta_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "AvgSquaredGrad": avg_squared_grad_acc,
-                "AvgSquaredUpdate": avg_squared_update_acc
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "AvgSquaredGradOut": avg_squared_grad_acc,
-                "AvgSquaredUpdateOut": avg_squared_update_acc
-            },
-            attrs={"epsilon": self._epsilon,
-                   "rho": self._rho},
-            stop_gradient=True)
+        adadelta_op = block.append_op(type=self.type,
+                                      inputs={
+                                          "Param": param_and_grad[0],
+                                          "Grad": param_and_grad[1],
+                                          "AvgSquaredGrad":
+                                          avg_squared_grad_acc,
+                                          "AvgSquaredUpdate":
+                                          avg_squared_update_acc
+                                      },
+                                      outputs={
+                                          "ParamOut":
+                                          param_and_grad[0],
+                                          "AvgSquaredGradOut":
+                                          avg_squared_grad_acc,
+                                          "AvgSquaredUpdateOut":
+                                          avg_squared_update_acc
+                                      },
+                                      attrs={
+                                          "epsilon": self._epsilon,
+                                          "rho": self._rho
+                                      },
+                                      stop_gradient=True)
 
         return adadelta_op
 
diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py
index 7ca4ab648a1f5..f5cd7bdaa83e0 100644
--- a/python/paddle/optimizer/adagrad.py
+++ b/python/paddle/optimizer/adagrad.py
@@ -118,12 +118,11 @@ def __init__(self,
                  initial_accumulator_value=0.0):
         assert learning_rate is not None
         assert epsilon is not None
-        super(Adagrad, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=weight_decay,
-            grad_clip=grad_clip,
-            name=name)
+        super(Adagrad, self).__init__(learning_rate=learning_rate,
+                                      parameters=parameters,
+                                      weight_decay=weight_decay,
+                                      grad_clip=grad_clip,
+                                      name=name)
         self.type = "adagrad"
         self._epsilon = epsilon
         self.initial_accumulator_value = initial_accumulator_value
@@ -139,10 +138,9 @@ def _create_accumulators(self, block, parameters):
             parameters = self._update_param_group(parameters)
 
         for p in parameters:
-            self._add_accumulator(
-                self._moment_acc_str,
-                p,
-                fill_value=self.initial_accumulator_value)
+            self._add_accumulator(self._moment_acc_str,
+                                  p,
+                                  fill_value=self.initial_accumulator_value)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -153,18 +151,23 @@ def _append_optimize_op(self, block, param_and_grad):
         moment_acc = self._get_accumulator(self._moment_acc_str,
                                            param_and_grad[0])
         # Create the adagrad optimizer op
-        adagrad_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Moment": moment_acc,
-                "LearningRate": self._create_param_lr(param_and_grad)
-            },
-            outputs={"ParamOut": param_and_grad[0],
-                     "MomentOut": moment_acc},
-            attrs={"epsilon": self._epsilon},
-            stop_gradient=True)
+        adagrad_op = block.append_op(type=self.type,
+                                     inputs={
+                                         "Param":
+                                         param_and_grad[0],
+                                         "Grad":
+                                         param_and_grad[1],
+                                         "Moment":
+                                         moment_acc,
+                                         "LearningRate":
+                                         self._create_param_lr(param_and_grad)
+                                     },
+                                     outputs={
+                                         "ParamOut": param_and_grad[0],
+                                         "MomentOut": moment_acc
+                                     },
+                                     attrs={"epsilon": self._epsilon},
+                                     stop_gradient=True)
 
         return adagrad_op
 
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index de09193ac798e..ac2685972d14f 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -191,12 +191,11 @@ def __init__(self,
             if not 0 <= epsilon:
                 raise ValueError(
                     "Invaild value of epsilon, expect epsilon >= 0.")
-        super(Adam, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=weight_decay,
-            grad_clip=grad_clip,
-            name=name)
+        super(Adam, self).__init__(learning_rate=learning_rate,
+                                   parameters=parameters,
+                                   weight_decay=weight_decay,
+                                   grad_clip=grad_clip,
+                                   name=name)
         self.type = "adam"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -237,21 +236,19 @@ def _create_master_weight(self, param):
 
             var_name = param.name + "_fp32_master"
             var_name = unique_name.generate(var_name)
-            var = layers.create_global_var(
-                name=var_name,
-                shape=param.shape,
-                value=0,
-                dtype='float32',
-                persistable=True)
+            var = layers.create_global_var(name=var_name,
+                                           shape=param.shape,
+                                           value=0,
+                                           dtype='float32',
+                                           persistable=True)
             block = self.helper.startup_program.global_block()
-            block.append_op(
-                type="cast",
-                inputs={"X": [param]},
-                outputs={"Out": [var]},
-                attrs={
-                    "in_dtype": param.dtype,
-                    "out_dtype": core.VarDesc.VarType.FP32
-                })
+            block.append_op(type="cast",
+                            inputs={"X": [param]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                "in_dtype": param.dtype,
+                                "out_dtype": core.VarDesc.VarType.FP32
+                            })
             self._master_weights[param.name] = var
         return var
 
@@ -269,10 +266,11 @@ def _get_accumulator(self, name, param):
         target_param = self._master_weights[
             param.name] if find_master else param
         target_name = target_param.name
-        if (name not in self._accumulators or
-                target_name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, target_name))
+        if (name not in self._accumulators
+                or target_name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, target_name))
         return self._accumulators[name][target_name]
 
     def _add_moments_pows(self, p):
@@ -407,12 +405,11 @@ def _append_optimize_op(self, block, param_and_grad):
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight
 
-        adam_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        adam_op = block.append_op(type=self.type,
+                                  inputs=inputs,
+                                  outputs=outputs,
+                                  attrs=attrs,
+                                  stop_gradient=True)
 
         return adam_op
 
@@ -455,16 +452,17 @@ def step(self):
                                 "Adam don't support weight_decay with sparse parameters, please set it to None."
                             )
                     else:
-                        if hasattr(grad_var,
-                                   "_is_sparse") and grad_var._is_sparse(
-                                   ) and self.regularization is not None:
+                        if hasattr(
+                                grad_var, "_is_sparse") and grad_var._is_sparse(
+                                ) and self.regularization is not None:
                             raise RuntimeError(
                                 "Adam don't support weight_decay with sparse parameters, please set it to None."
                             )
                     params_grads.append((param, grad_var))
 
-            optimize_ops = self._apply_optimize(
-                loss=None, startup_program=None, params_grads=params_grads)
+            optimize_ops = self._apply_optimize(loss=None,
+                                                startup_program=None,
+                                                params_grads=params_grads)
         else:
             # optimize parameters in groups
             for param_group in self._param_groups:
@@ -478,8 +476,9 @@ def step(self):
                 params_grads.update(
                     {k: v
                      for k, v in param_group.items() if k != 'params'})
-                self._apply_optimize(
-                    loss=None, startup_program=None, params_grads=params_grads)
+                self._apply_optimize(loss=None,
+                                     startup_program=None,
+                                     params_grads=params_grads)
 
     def _multi_tensor_init(self, target_block, parameters):
         """
@@ -623,12 +622,11 @@ def _append_optimize_multi_tensor_op(self, target_block,
                         outputs["MasterParamOut"] = self._master_weight_dict[
                             key]
                         attrs["multi_precision"] = find_master
-                    target_block.append_op(
-                        type="merged_adam",
-                        inputs=inputs,
-                        outputs=outputs,
-                        attrs=attrs,
-                        stop_gradient=True)
+                    target_block.append_op(type="merged_adam",
+                                           inputs=inputs,
+                                           outputs=outputs,
+                                           attrs=attrs,
+                                           stop_gradient=True)
         return None
 
     def _update_param_group(self, parameters):
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
index 4c4a85559c0d9..9a54435f8b8b8 100644
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -151,12 +151,11 @@ def __init__(self,
             raise ValueError("Invaild value of beta2, expect beta2 in [0,1).")
         if not 0 <= epsilon:
             raise ValueError("Invaild value of epsilon, expect epsilon >= 0.")
-        super(Adamax, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=weight_decay,
-            grad_clip=grad_clip,
-            name=name)
+        super(Adamax, self).__init__(learning_rate=learning_rate,
+                                     parameters=parameters,
+                                     weight_decay=weight_decay,
+                                     grad_clip=grad_clip,
+                                     name=name)
         self.type = "adamax"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -175,11 +174,10 @@ def _create_accumulators(self, block, parameters):
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
-            self._add_accumulator(
-                name=self._beta1_pow_acc_str,
-                param=p,
-                fill_value=self._beta1,
-                shape=[1])
+            self._add_accumulator(name=self._beta1_pow_acc_str,
+                                  param=p,
+                                  fill_value=self._beta1,
+                                  shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -236,12 +234,11 @@ def _finish_update(self, block, parameters_and_grads):
                     [param, grad]), name_scope('adamax'):
                     beta1_pow_acc = self._get_accumulator(
                         self._beta1_pow_acc_str, param)
-                    block.append_op(
-                        type="scale",
-                        inputs={"X": beta1_pow_acc},
-                        outputs={"Out": beta1_pow_acc},
-                        attrs={"scale": self._beta1},
-                        stop_gradient=True)
+                    block.append_op(type="scale",
+                                    inputs={"X": beta1_pow_acc},
+                                    outputs={"Out": beta1_pow_acc},
+                                    attrs={"scale": self._beta1},
+                                    stop_gradient=True)
         else:
             for param, grad in parameters_and_grads['params']:
                 if grad is None or param.stop_gradient is True:
@@ -252,12 +249,11 @@ def _finish_update(self, block, parameters_and_grads):
                         self._beta1_pow_acc_str, param)
                     self._beta1 = parameters_and_grads.get(
                         'beta1', self._default_dict['beta1'])
-                    block.append_op(
-                        type="scale",
-                        inputs={"X": beta1_pow_acc},
-                        outputs={"Out": beta1_pow_acc},
-                        attrs={"scale": self._beta1},
-                        stop_gradient=True)
+                    block.append_op(type="scale",
+                                    inputs={"X": beta1_pow_acc},
+                                    outputs={"Out": beta1_pow_acc},
+                                    attrs={"scale": self._beta1},
+                                    stop_gradient=True)
 
     def _update_param_group(self, parameters):
         self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 0b61f3cb9a787..25f4006327d75 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -187,8 +187,8 @@ def __init__(self,
             if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)):
                 raise TypeError(
                     "`parameters` argument given to the optimizer should be "
-                    "an iterable of paddle Tensors, but got argument type is `{}`.".
-                    format(type(parameters)))
+                    "an iterable of paddle Tensors, but got argument type is `{}`."
+                    .format(type(parameters)))
             if isinstance(parameters, dict):
                 raise TypeError(
                     "`parameters` argument should not get dict type, "
@@ -327,21 +327,19 @@ def _create_master_weight(self, param):
 
             var_name = param.name + "_fp32_master"
             var_name = unique_name.generate(var_name)
-            var = layers.create_global_var(
-                name=var_name,
-                shape=param.shape,
-                value=0,
-                dtype='float32',
-                persistable=True)
+            var = layers.create_global_var(name=var_name,
+                                           shape=param.shape,
+                                           value=0,
+                                           dtype='float32',
+                                           persistable=True)
             block = self.helper.startup_program.global_block()
-            block.append_op(
-                type="cast",
-                inputs={"X": [param]},
-                outputs={"Out": [var]},
-                attrs={
-                    "in_dtype": param.dtype,
-                    "out_dtype": core.VarDesc.VarType.FP32
-                })
+            block.append_op(type="cast",
+                            inputs={"X": [param]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                "in_dtype": param.dtype,
+                                "out_dtype": core.VarDesc.VarType.FP32
+                            })
             self._master_weights[param.name] = var
         return var
 
@@ -359,10 +357,11 @@ def _get_accumulator(self, name, param):
         target_param = self._master_weights[
             param.name] if find_master else param
         target_name = target_param.name
-        if (name not in self._accumulators or
-                target_name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, target_name))
+        if (name not in self._accumulators
+                or target_name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, target_name))
         return self._accumulators[name][target_name]
 
     def _add_moments_pows(self, p):
@@ -487,13 +486,18 @@ def _append_optimize_op(self, block, param_and_grad):
             "Beta2PowOut": [beta2_pow_acc],
         }
         attrs = {
-            "lazy_mode": self._lazy_mode,
-            "min_row_size_to_use_multithread": 1000,
-            "multi_precision": find_master,
-            "with_decay": with_decay,
-            "coeff": self._weight_decay,
-            "lr_ratio": 1.
-            if self._lr_ratio is None else self._lr_ratio(param_and_grad[0])
+            "lazy_mode":
+            self._lazy_mode,
+            "min_row_size_to_use_multithread":
+            1000,
+            "multi_precision":
+            find_master,
+            "with_decay":
+            with_decay,
+            "coeff":
+            self._weight_decay,
+            "lr_ratio":
+            1. if self._lr_ratio is None else self._lr_ratio(param_and_grad[0])
         }
 
         if isinstance(self._beta1, Variable):
@@ -513,12 +517,11 @@ def _append_optimize_op(self, block, param_and_grad):
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight
 
-        adamw_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        adamw_op = block.append_op(type=self.type,
+                                   inputs=inputs,
+                                   outputs=outputs,
+                                   attrs=attrs,
+                                   stop_gradient=True)
 
         return adamw_op
 
@@ -564,16 +567,17 @@ def step(self):
                                 "AdamW don't support weight_decay with sparse parameters, please set it to None."
                             )
                     else:
-                        if hasattr(grad_var,
-                                   "_is_sparse") and grad_var._is_sparse(
-                                   ) and self.regularization is not None:
+                        if hasattr(
+                                grad_var, "_is_sparse") and grad_var._is_sparse(
+                                ) and self.regularization is not None:
                             raise RuntimeError(
                                 "AdamW don't support weight_decay with sparse parameters, please set it to None."
                             )
                     params_grads.append((param, grad_var))
 
-            optimize_ops = self._apply_optimize(
-                loss=None, startup_program=None, params_grads=params_grads)
+            optimize_ops = self._apply_optimize(loss=None,
+                                                startup_program=None,
+                                                params_grads=params_grads)
         else:
             # optimize parameters in groups
             for param_group in self._param_groups:
@@ -601,8 +605,9 @@ def step(self):
                 params_grads.update(
                     {k: v
                      for k, v in param_group.items() if k != 'params'})
-                self._apply_optimize(
-                    loss=None, startup_program=None, params_grads=params_grads)
+                self._apply_optimize(loss=None,
+                                     startup_program=None,
+                                     params_grads=params_grads)
 
     def _update_param_group(self, parameters):
         self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py
index e61bc8101b769..29233e6ced0a2 100644
--- a/python/paddle/optimizer/lamb.py
+++ b/python/paddle/optimizer/lamb.py
@@ -112,12 +112,11 @@ def __init__(self,
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(Lamb, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=None,
-            grad_clip=grad_clip,
-            name=name)
+        super(Lamb, self).__init__(learning_rate=learning_rate,
+                                   parameters=parameters,
+                                   weight_decay=None,
+                                   grad_clip=grad_clip,
+                                   name=name)
         self.type = "lamb"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -160,21 +159,19 @@ def _create_master_weight(self, param):
 
             var_name = param.name + "_fp32_master"
             var_name = unique_name.generate(var_name)
-            var = layers.create_global_var(
-                name=var_name,
-                shape=param.shape,
-                value=0,
-                dtype='float32',
-                persistable=True)
+            var = layers.create_global_var(name=var_name,
+                                           shape=param.shape,
+                                           value=0,
+                                           dtype='float32',
+                                           persistable=True)
             block = self.helper.startup_program.global_block()
-            block.append_op(
-                type="cast",
-                inputs={"X": [param]},
-                outputs={"Out": [var]},
-                attrs={
-                    "in_dtype": param.dtype,
-                    "out_dtype": core.VarDesc.VarType.FP32
-                })
+            block.append_op(type="cast",
+                            inputs={"X": [param]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                "in_dtype": param.dtype,
+                                "out_dtype": core.VarDesc.VarType.FP32
+                            })
             self._master_weights[param.name] = var
         return var
 
@@ -205,10 +202,11 @@ def _get_accumulator(self, name, param):
         target_param = self._master_weights[
             param.name] if find_master else param
         target_name = target_param.name
-        if (name not in self._accumulators or
-                target_name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, target_name))
+        if (name not in self._accumulators
+                or target_name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, target_name))
         return self._accumulators[name][target_name]
 
     def _add_moments_pows(self, p):
@@ -310,12 +308,11 @@ def _append_optimize_op(self, block, param_and_grad):
         if found_inf:
             inputs["SkipUpdate"] = found_inf
 
-        lamb_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        lamb_op = block.append_op(type=self.type,
+                                  inputs=inputs,
+                                  outputs=outputs,
+                                  attrs=attrs,
+                                  stop_gradient=True)
 
         return lamb_op
 
diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py
index 3663fc92946c8..feeef58d0b094 100644
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -20,22 +20,10 @@
 from ..fluid.framework import _in_legacy_dygraph
 
 __all__ = [  # noqa
-    'LRScheduler',
-    'NoamDecay',
-    'PiecewiseDecay',
-    'NaturalExpDecay',
-    'InverseTimeDecay',
-    'PolynomialDecay',
-    'LinearWarmup',
-    'ExponentialDecay',
-    'MultiStepDecay',
-    'StepDecay',
-    'LambdaDecay',
-    'ReduceOnPlateau',
-    'CosineAnnealingDecay',
-    'MultiplicativeDecay',
-    'OneCycleLR',
-    'CyclicLR'
+    'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay',
+    'InverseTimeDecay', 'PolynomialDecay', 'LinearWarmup', 'ExponentialDecay',
+    'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau',
+    'CosineAnnealingDecay', 'MultiplicativeDecay', 'OneCycleLR', 'CyclicLR'
 ]
 
 
@@ -184,8 +172,8 @@ def set_state_dict(self, state_dict):
                 self.__dict__[key] = state_dict[key]
             else:
                 raise RuntimeError(
-                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
-                    format(key))
+                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict"
+                    .format(key))
         if len(state_dict) > len(self.keys):
             warnings.warn(
                 "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
@@ -379,8 +367,8 @@ class PiecewiseDecay(LRScheduler):
     def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
         self.boundaries = boundaries
         self.values = values
-        super(PiecewiseDecay, self).__init__(
-            last_epoch=last_epoch, verbose=verbose)
+        super(PiecewiseDecay, self).__init__(last_epoch=last_epoch,
+                                             verbose=verbose)
 
     def get_lr(self):
         for i in range(len(self.boundaries)):
@@ -669,8 +657,8 @@ def get_lr(self):
             tmp_epoch_num = min(self.last_epoch, self.decay_steps)
 
         return (self.base_lr - self.end_lr) * (
-            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)
-             )**self.power) + self.end_lr
+            (1 - float(tmp_epoch_num) / float(tmp_decay_steps))**
+            self.power) + self.end_lr
 
 
 class LinearWarmup(LRScheduler):
@@ -769,8 +757,8 @@ def __init__(self,
             learning_rate, int) or isinstance(learning_rate, LRScheduler)
         if not type_check:
             raise TypeError(
-                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".
-                format(learning_rate))
+                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}"
+                .format(learning_rate))
         self.learning_rate = learning_rate
         assert warmup_steps > 0 and isinstance(
             warmup_steps, int), " 'warmup_steps' must be a positive integer."
@@ -1374,8 +1362,8 @@ def step(self, metrics, epoch=None):
         elif not isinstance(metrics,
                             (int, float, numpy.float32, numpy.float64)):
             raise TypeError(
-                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".
-                format(type(metrics)))
+                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}"
+                .format(type(metrics)))
 
         if self.cooldown_counter > 0:
             self.cooldown_counter -= 1
@@ -1518,16 +1506,16 @@ def get_lr(self):
         if self.last_epoch == 0:
             return self.base_lr
         elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
-            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
-                math.pi / self.T_max)) / 2
+            return self.last_lr + (self.base_lr - self.eta_min) * (
+                1 - math.cos(math.pi / self.T_max)) / 2
 
         return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
             1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
                 self.last_lr - self.eta_min) + self.eta_min
 
     def _get_closed_form_lr(self):
-        return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos(
-            math.pi * self.last_epoch / self.T_max)) / 2
+        return self.eta_min + (self.base_lr - self.eta_min) * (
+            1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2
 
 
 class MultiplicativeDecay(LRScheduler):
@@ -1707,16 +1695,18 @@ def __init__(self,
 
         # Check type and value of total_steps
         if not isinstance(total_steps, int):
-            raise TypeError("'total_step' must be 'int', but received {}".
-                            format(type(total_steps)))
+            raise TypeError(
+                "'total_step' must be 'int', but received {}".format(
+                    type(total_steps)))
         if total_steps <= 0:
             raise ValueError("'total_step' must be a positive integer.")
         self.total_steps = total_steps
 
         # Check type and value of pac_start
         if not isinstance(phase_pct, float):
-            raise TypeError("'phase_pct' must be 'float', but received {}".
-                            format(type(phase_pct)))
+            raise TypeError(
+                "'phase_pct' must be 'float', but received {}".format(
+                    type(phase_pct)))
         if phase_pct < 0 or phase_pct > 1:
             raise ValueError(
                 "'phase_pct' must be between 0 and 1, but received {}".format(
@@ -1775,8 +1765,8 @@ def __init__(self,
             self.anneal_func = self._linear_annealing
         else:
             raise ValueError(
-                "'anneal_strategy' must by one of 'cos' or 'linear', but received {}".
-                format(anneal_strategy))
+                "'anneal_strategy' must by one of 'cos' or 'linear', but received {}"
+                .format(anneal_strategy))
         super(OneCycleLR, self).__init__(initial_lr, last_epoch, verbose)
 
     def _cos_annealing(self, start_lr, end_lr, pct):
@@ -1794,8 +1784,8 @@ def get_lr(self):
                 "Tried to step {} times. However the number of total steps is {}"
                 .format(current_step, self.total_steps))
 
-        for (i, (end_step, step_size)
-             ) in enumerate(zip(self._step_config[1:], self._steps_size)):
+        for (i, (end_step, step_size)) in enumerate(
+                zip(self._step_config[1:], self._steps_size)):
             # i == len(self._lr_config) - 2 catch the last step, otherwise it will return None.
             if current_step <= end_step or i == len(self._lr_config) - 2:
                 # self._step_config[i] means start step of a phase.
diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py
index ce112c19250ca..bb7765ac715dd 100644
--- a/python/paddle/optimizer/momentum.py
+++ b/python/paddle/optimizer/momentum.py
@@ -139,7 +139,8 @@ def __init__(self,
         if momentum is None:
             raise ValueError("momentum is not set")
 
-        predicate = lambda regular: isinstance(regular, (L2DecayRegularizer, float))
+        predicate = lambda regular: isinstance(regular,
+                                               (L2DecayRegularizer, float))
         if isinstance(parameters, list):
             if isinstance(parameters[0], dict):
                 for param_group in parameters:
@@ -152,12 +153,11 @@ def __init__(self,
                     param_group['weight_decay'] = py_regular
 
         py_regular = None if predicate(weight_decay) else weight_decay
-        super(Momentum, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=py_regular,
-            grad_clip=grad_clip,
-            name=name)
+        super(Momentum, self).__init__(learning_rate=learning_rate,
+                                       parameters=parameters,
+                                       weight_decay=py_regular,
+                                       grad_clip=grad_clip,
+                                       name=name)
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
@@ -211,21 +211,19 @@ def _create_master_weight(self, param):
 
             var_name = param.name + "_fp32_master"
             var_name = unique_name.generate(var_name)
-            var = layers.create_global_var(
-                name=var_name,
-                shape=param.shape,
-                value=0,
-                dtype='float32',
-                persistable=True)
+            var = layers.create_global_var(name=var_name,
+                                           shape=param.shape,
+                                           value=0,
+                                           dtype='float32',
+                                           persistable=True)
             block = self.helper.startup_program.global_block()
-            block.append_op(
-                type="cast",
-                inputs={"X": [param]},
-                outputs={"Out": [var]},
-                attrs={
-                    "in_dtype": param.dtype,
-                    "out_dtype": core.VarDesc.VarType.FP32
-                })
+            block.append_op(type="cast",
+                            inputs={"X": [param]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                "in_dtype": param.dtype,
+                                "out_dtype": core.VarDesc.VarType.FP32
+                            })
             self._master_weights[param.name] = var
         return var
 
@@ -245,10 +243,11 @@ def _get_accumulator(self, name, param):
         target_param = self._master_weights[
             param.name] if find_master else param
         target_name = target_param.name
-        if (name not in self._accumulators or
-                target_name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, target_name))
+        if (name not in self._accumulators
+                or target_name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, target_name))
         return self._accumulators[name][target_name]
 
     def _create_accumulators(self, block, parameters):
@@ -295,7 +294,7 @@ def _append_optimize_op(self, block, param_and_grad):
                                              param_and_grad[0])
         lr = self._create_param_lr(param_and_grad)
 
-        # For fusion of momentum and l2decay 
+        # For fusion of momentum and l2decay
         param = param_and_grad[0]
         regularization_method = self._regularization_method
         regularization_coeff = self._regularization_coeff
@@ -360,12 +359,11 @@ def _append_optimize_op(self, block, param_and_grad):
             outputs["MasterParamOut"] = master_weight
 
         # create the momentum optimize op
-        momentum_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        momentum_op = block.append_op(type=self.type,
+                                      inputs=inputs,
+                                      outputs=outputs,
+                                      attrs=attrs,
+                                      stop_gradient=True)
 
         return momentum_op
 
@@ -498,8 +496,10 @@ def _append_optimize_multi_tensor_op(self, target_block,
                         "VelocityOut": self._velocity_dict[key],
                     }
                     attrs = {
-                        "mu": self._momentum,
-                        "use_nesterov": self._use_nesterov,
+                        "mu":
+                        self._momentum,
+                        "use_nesterov":
+                        self._use_nesterov,
                         "regularization_method":
                         self._regularization_method_dict[key],
                         "regularization_coeff":
@@ -510,12 +510,11 @@ def _append_optimize_multi_tensor_op(self, target_block,
                         outputs["MasterParamOut"] = self._master_weight_dict[
                             key]
                         attrs["multi_precision"] = find_master
-                    target_block.append_op(
-                        type="merged_momentum",
-                        inputs=inputs,
-                        outputs=outputs,
-                        attrs=attrs,
-                        stop_gradient=True)
+                    target_block.append_op(type="merged_momentum",
+                                           inputs=inputs,
+                                           outputs=outputs,
+                                           attrs=attrs,
+                                           stop_gradient=True)
         return None
 
     def _update_param_group(self, parameters):
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index cf180fccc4857..e3e7257f75705 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -177,8 +177,8 @@ def __init__(self,
             if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)):
                 raise TypeError(
                     "`parameters` argument given to the optimizer should be "
-                    "an iterable of paddle Tensors, but got argument type is `{}`.".
-                    format(type(parameters)))
+                    "an iterable of paddle Tensors, but got argument type is `{}`."
+                    .format(type(parameters)))
             if isinstance(parameters, dict):
                 raise TypeError(
                     "`parameters` argument should not get dict type, "
@@ -197,9 +197,8 @@ def __init__(self,
             if weight_decay is not None:
                 if not isinstance(self._parameter_list[0], dict):
                     for param in self._parameter_list:
-                        if hasattr(
-                                param,
-                                'regularizer') and param.regularizer is not None:
+                        if hasattr(param, 'regularizer'
+                                   ) and param.regularizer is not None:
                             logging.info(
                                 "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
                                 "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
@@ -345,7 +344,7 @@ def set_state_dict(self, state_dict):
         if isinstance(self._learning_rate, LRScheduler):
             self._learning_rate.set_state_dict(state_dict["LR_Scheduler"])
 
-        # NOTE: exclude learning rate scheduler's state from 
+        # NOTE: exclude learning rate scheduler's state from
         # _accumulators_holder.
         state_dict = state_dict.copy()
         if "LR_Scheduler" in state_dict:
@@ -406,8 +405,8 @@ def _create_global_learning_rate(self):
                 main_prog.lr_sheduler = self._learning_rate
                 main_prog.lr_var = lr_var
 
-                self._learning_rate_map[framework.default_main_program(
-                )] = lr_var
+                self._learning_rate_map[
+                    framework.default_main_program()] = lr_var
 
             lr_value = float(self._learning_rate())
             self.helper.set_variable_initializer(
@@ -475,20 +474,19 @@ def set_lr(self, value):
         current_lr = self._global_learning_rate()
         if current_lr is not None:
             if framework._non_static_mode():
-                _C_ops.fill_constant(current_lr, 'value',
-                                     float(value), 'dtype', current_lr.dtype,
-                                     'shape', list(current_lr.shape))
+                _C_ops.fill_constant(current_lr, 'value', float(value), 'dtype',
+                                     current_lr.dtype, 'shape',
+                                     list(current_lr.shape))
             else:
                 global_block = framework.default_main_program().global_block()
-                global_block.append_op(
-                    type='fill_constant',
-                    outputs={'Out': [current_lr]},
-                    attrs={
-                        'dtype': current_lr.dtype,
-                        'shape': list(current_lr.shape),
-                        'value': float(value)
-                    },
-                    stop_gradient=True)
+                global_block.append_op(type='fill_constant',
+                                       outputs={'Out': [current_lr]},
+                                       attrs={
+                                           'dtype': current_lr.dtype,
+                                           'shape': list(current_lr.shape),
+                                           'value': float(value)
+                                       },
+                                       stop_gradient=True)
 
     def get_lr(self):
         """
@@ -627,12 +625,13 @@ def _add_accumulator(self,
         """
         if self._name is not None:
             name = self._name + "_" + name
-        if (name in self._accumulators and
-                param.name in self._accumulators[name]):
+        if (name in self._accumulators
+                and param.name in self._accumulators[name]):
             if framework._non_static_mode():
                 return self._accumulators[name][param.name]
-            raise Exception("Accumulator {} already exists for parameter {}".
-                            format(name, param.name))
+            raise Exception(
+                "Accumulator {} already exists for parameter {}".format(
+                    name, param.name))
         if shape == None:
             shape = param.shape
         assert isinstance(self.helper, LayerHelper)
@@ -677,10 +676,11 @@ def _get_accumulator(self, name, param):
         """
         if self._name is not None:
             name = self._name + "_" + name
-        if (name not in self._accumulators or
-                param.name not in self._accumulators[name]):
-            raise Exception("Accumulator {} does not exist for parameter {}".
-                            format(name, param.name))
+        if (name not in self._accumulators
+                or param.name not in self._accumulators[name]):
+            raise Exception(
+                "Accumulator {} does not exist for parameter {}".format(
+                    name, param.name))
         return self._accumulators[name][param.name]
 
     def _update_param_device_map(self, parameters_and_grads, target_block):
@@ -749,8 +749,8 @@ def _create_optimization_pass(self, parameters_and_grads):
                     self._param_dict['FP16_LODTensor']) == 0:
                 if isinstance(parameters_and_grads, list):
                     self._multi_tensor_init(target_block, [
-                        p[0] for p in parameters_and_grads
-                        if not p[0].stop_gradient
+                        p[0]
+                        for p in parameters_and_grads if not p[0].stop_gradient
                     ])
                 else:
                     self._update_param_group(parameters_and_grads)
@@ -827,8 +827,8 @@ def _create_optimization_pass(self, parameters_and_grads):
                     with param_and_grad[0].block.program._optimized_guard(
                             param_and_grad), name_scope("optimizer"):
                         if param_and_grad[0].stop_gradient is False:
-                            device = self._get_device_for_param(param_and_grad[
-                                0].name)
+                            device = self._get_device_for_param(
+                                param_and_grad[0].name)
                             with device_guard(device):
                                 optimize_op = self._append_optimize_op(
                                     target_block, param_and_grad)
@@ -923,8 +923,9 @@ def backward(self,
             with program_guard(program, startup_program):
                 from paddle.incubate.autograd.utils import prim_enabled
                 if prim_enabled():
-                    params_grads = append_backward_new(
-                        [loss], parameter_list, act_no_grad_set, callbacks)
+                    params_grads = append_backward_new([loss], parameter_list,
+                                                       act_no_grad_set,
+                                                       callbacks)
                 else:
                     params_grads = append_backward(loss, parameter_list,
                                                    act_no_grad_set, callbacks)
@@ -1001,8 +1002,8 @@ def _apply_optimize(self, loss, startup_program, params_grads):
                 else:
                     grad_clip = params_grads['grad_clip']
                     if grad_clip is not None:
-                        params_grads['params'] = grad_clip(params_grads[
-                            'params'])
+                        params_grads['params'] = grad_clip(
+                            params_grads['params'])
 
                     params_grads['params'] = self.append_regularization_ops(
                         params_grads['params'], self.regularization)
@@ -1019,10 +1020,10 @@ def _create_regularization_of_grad(self, param, grad, regularization=None):
         Function helper of append_regularization_ops.
         """
         # If no gradient or no regularization is specified,  then we don't need to do anything
-        if grad is None or ((not hasattr(param, 'regularizer') or
-                             (hasattr(param, 'regularizer') and
-                              param.regularizer is None)) and
-                            regularization is None):
+        if grad is None or (
+            (not hasattr(param, 'regularizer') or
+             (hasattr(param, 'regularizer') and param.regularizer is None))
+                and regularization is None):
             return grad
         regularization_term = None
         if hasattr(param, 'regularizer') and param.regularizer is not None:
@@ -1083,8 +1084,8 @@ def append_regularization_ops(self,
         params_and_grads = []
         if framework._non_static_mode():
             for param, grad in parameters_and_grads:
-                new_grad = self._create_regularization_of_grad(param, grad,
-                                                               regularization)
+                new_grad = self._create_regularization_of_grad(
+                    param, grad, regularization)
                 params_and_grads.append((param, new_grad))
         else:
             repeate_regularizer = False
@@ -1105,9 +1106,8 @@ def append_regularization_ops(self,
     def _get_no_grad_set(self, loss, no_grad_set=None):
         no_grad_set = _get_no_grad_set_name(no_grad_set)
         parameters = loss.block.program.global_block().all_parameters()
-        param_no_trainable = set([
-            param.name for param in parameters if param.stop_gradient is True
-        ])
+        param_no_trainable = set(
+            [param.name for param in parameters if param.stop_gradient is True])
         # If the parameter is no trainable, it should not have a gradient.
         no_grad_set.update(param_no_trainable)
 
@@ -1217,14 +1217,14 @@ def minimize(self,
         parameter_list = parameters if parameters \
             else self._parameter_list
 
-        params_grads = self.backward(
-            loss,
-            startup_program=startup_program,
-            parameters=parameter_list,
-            no_grad_set=no_grad_set)
+        params_grads = self.backward(loss,
+                                     startup_program=startup_program,
+                                     parameters=parameter_list,
+                                     no_grad_set=no_grad_set)
 
-        optimize_ops = self._apply_optimize(
-            loss, startup_program=startup_program, params_grads=params_grads)
+        optimize_ops = self._apply_optimize(loss,
+                                            startup_program=startup_program,
+                                            params_grads=params_grads)
 
         return optimize_ops, params_grads
 
@@ -1264,8 +1264,9 @@ def step(self):
                     grad_var = param._grad_ivar()
                     params_grads.append((param, grad_var))
 
-            self._apply_optimize(
-                loss=None, startup_program=None, params_grads=params_grads)
+            self._apply_optimize(loss=None,
+                                 startup_program=None,
+                                 params_grads=params_grads)
 
         else:
             # optimize parameters in groups
@@ -1280,8 +1281,9 @@ def step(self):
                 params_grads.update(
                     {k: v
                      for k, v in param_group.items() if k != 'params'})
-                self._apply_optimize(
-                    loss=None, startup_program=None, params_grads=params_grads)
+                self._apply_optimize(loss=None,
+                                     startup_program=None,
+                                     params_grads=params_grads)
 
     def _add_param_group(self, param_group):
         """
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
index 88c39ba5a29de..7205a434d388f 100644
--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
@@ -171,12 +171,11 @@ def __init__(self,
         if not 0.0 <= rho:
             raise ValueError("Invalid value of rho, expect rho >= 0.")
 
-        super(RMSProp, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=weight_decay,
-            grad_clip=grad_clip,
-            name=name)
+        super(RMSProp, self).__init__(learning_rate=learning_rate,
+                                      parameters=parameters,
+                                      weight_decay=weight_decay,
+                                      grad_clip=grad_clip,
+                                      name=name)
 
         self.type = "rmsprop"
         self._rho = rho
@@ -215,29 +214,34 @@ def _append_optimize_op(self, block, param_and_grad):
                                                 param_and_grad[0])
         mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
                                               param_and_grad[0])
-        rmsprop_op = block.append_op(
-            type=self.type,
-            inputs={
-                "Param": param_and_grad[0],
-                "Grad": param_and_grad[1],
-                "Moment": momentum_acc,
-                "MeanSquare": mean_square_acc,
-                "MeanGrad": mean_grad_acc,
-                "LearningRate": self._create_param_lr(param_and_grad),
-            },
-            outputs={
-                "ParamOut": param_and_grad[0],
-                "MomentOut": momentum_acc,
-                "MeanSquareOut": mean_square_acc,
-                "MeanGradOut": mean_grad_acc
-            },
-            attrs={
-                "epsilon": self._epsilon,
-                "decay": self._rho,
-                "momentum": self._momentum,
-                "centered": self._centered
-            },
-            stop_gradient=True)
+        rmsprop_op = block.append_op(type=self.type,
+                                     inputs={
+                                         "Param":
+                                         param_and_grad[0],
+                                         "Grad":
+                                         param_and_grad[1],
+                                         "Moment":
+                                         momentum_acc,
+                                         "MeanSquare":
+                                         mean_square_acc,
+                                         "MeanGrad":
+                                         mean_grad_acc,
+                                         "LearningRate":
+                                         self._create_param_lr(param_and_grad),
+                                     },
+                                     outputs={
+                                         "ParamOut": param_and_grad[0],
+                                         "MomentOut": momentum_acc,
+                                         "MeanSquareOut": mean_square_acc,
+                                         "MeanGradOut": mean_grad_acc
+                                     },
+                                     attrs={
+                                         "epsilon": self._epsilon,
+                                         "decay": self._rho,
+                                         "momentum": self._momentum,
+                                         "centered": self._centered
+                                     },
+                                     stop_gradient=True)
 
         return rmsprop_op
 
diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py
index 46dd0b73a5eb8..60b5e38576124 100644
--- a/python/paddle/optimizer/sgd.py
+++ b/python/paddle/optimizer/sgd.py
@@ -82,12 +82,11 @@ def __init__(self,
                  name=None):
         if learning_rate is None:
             raise ValueError("learning_rate is not set")
-        super(SGD, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            weight_decay=weight_decay,
-            grad_clip=grad_clip,
-            name=name)
+        super(SGD, self).__init__(learning_rate=learning_rate,
+                                  parameters=parameters,
+                                  weight_decay=weight_decay,
+                                  grad_clip=grad_clip,
+                                  name=name)
         self.type = "sgd"
         self._multi_precision = multi_precision
         self._master_weights = {}
@@ -100,21 +99,19 @@ def _create_master_weight(self, param):
 
             var_name = param.name + "_fp32_master"
             var_name = unique_name.generate(var_name)
-            var = layers.create_global_var(
-                name=var_name,
-                shape=param.shape,
-                value=0,
-                dtype='float32',
-                persistable=True)
+            var = layers.create_global_var(name=var_name,
+                                           shape=param.shape,
+                                           value=0,
+                                           dtype='float32',
+                                           persistable=True)
             block = self.helper.startup_program.global_block()
-            block.append_op(
-                type="cast",
-                inputs={"X": [param]},
-                outputs={"Out": [var]},
-                attrs={
-                    "in_dtype": param.dtype,
-                    "out_dtype": core.VarDesc.VarType.FP32
-                })
+            block.append_op(type="cast",
+                            inputs={"X": [param]},
+                            outputs={"Out": [var]},
+                            attrs={
+                                "in_dtype": param.dtype,
+                                "out_dtype": core.VarDesc.VarType.FP32
+                            })
             self._master_weights[param.name] = var
         return var
 
@@ -170,12 +167,11 @@ def _append_optimize_op(self, block, param_and_grad):
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight
 
-        sgd_op = block.append_op(
-            type=self.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=attrs,
-            stop_gradient=True)
+        sgd_op = block.append_op(type=self.type,
+                                 inputs=inputs,
+                                 outputs=outputs,
+                                 attrs=attrs,
+                                 stop_gradient=True)
 
         return sgd_op
 
diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py
index 9df595bc3ae73..c277dcedb4d93 100644
--- a/python/paddle/profiler/profiler.py
+++ b/python/paddle/profiler/profiler.py
@@ -72,8 +72,8 @@ def make_scheduler(*,
                    closed: int,
                    ready: int,
                    record: int,
-                   repeat: int=0,
-                   skip_first: int=0) -> Callable:
+                   repeat: int = 0,
+                   skip_first: int = 0) -> Callable:
     r"""
     Return a scheduler function, which scheduler the :ref:`state <api_paddle_profiler_ProfilerState>` according to the setting.
     The state transform confirms to:
@@ -156,7 +156,7 @@ def _default_state_scheduler(step: int):
 
 
 def export_chrome_tracing(dir_name: str,
-                          worker_name: Optional[str]=None) -> Callable:
+                          worker_name: Optional[str] = None) -> Callable:
     r"""
     Return a callable, used for outputing tracing data to chrome tracing format file.
     The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
@@ -206,7 +206,8 @@ def handle_fn(prof):
     return handle_fn
 
 
-def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
+def export_protobuf(dir_name: str,
+                    worker_name: Optional[str] = None) -> Callable:
     r"""
     Return a callable, used for outputing tracing data to protobuf file.
     The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
@@ -391,13 +392,13 @@ def forward(self, image, label=None):
                 # |       ips       |    1086.42904   |    1227.30604   |    959.92796    |
     """
 
-    def __init__(
-            self,
-            *,
-            targets: Optional[Iterable[ProfilerTarget]]=None,
-            scheduler: Union[Callable[[int], ProfilerState], tuple, None]=None,
-            on_trace_ready: Optional[Callable[..., Any]]=None,
-            timer_only: Optional[bool]=False):
+    def __init__(self,
+                 *,
+                 targets: Optional[Iterable[ProfilerTarget]] = None,
+                 scheduler: Union[Callable[[int], ProfilerState], tuple,
+                                  None] = None,
+                 on_trace_ready: Optional[Callable[..., Any]] = None,
+                 timer_only: Optional[bool] = False):
         supported_targets = _get_supported_targets()
         if targets:
             self.targets = set(targets)
@@ -424,17 +425,17 @@ def __init__(
             start_batch, end_batch = scheduler
             start_batch = max(start_batch, 0)
             if start_batch >= 1:
-                self.scheduler = make_scheduler(
-                    closed=max(start_batch - 1, 0),
-                    ready=1,
-                    record=(end_batch - start_batch),
-                    repeat=1)
+                self.scheduler = make_scheduler(closed=max(start_batch - 1, 0),
+                                                ready=1,
+                                                record=(end_batch -
+                                                        start_batch),
+                                                repeat=1)
             else:
-                self.scheduler = make_scheduler(
-                    closed=0,
-                    ready=0,
-                    record=(end_batch - start_batch),
-                    repeat=1)
+                self.scheduler = make_scheduler(closed=0,
+                                                ready=0,
+                                                record=(end_batch -
+                                                        start_batch),
+                                                repeat=1)
         else:
             self.scheduler = _default_state_scheduler
 
@@ -492,9 +493,9 @@ def start(self):
         elif self.current_state == ProfilerState.RECORD_AND_RETURN:
             self.profiler.prepare()
             self.profiler.start()
-        self.record_event = RecordEvent(
-            name="ProfileStep#{}".format(self.step_num),
-            event_type=TracerEventType.ProfileStep)
+        self.record_event = RecordEvent(name="ProfileStep#{}".format(
+            self.step_num),
+                                        event_type=TracerEventType.ProfileStep)
         self.record_event.begin()
 
     def stop(self):
@@ -538,7 +539,7 @@ def stop(self):
                 self.on_trace_ready(self)
         utils._is_profiler_used = False
 
-    def step(self, num_samples: Optional[int]=None):
+    def step(self, num_samples: Optional[int] = None):
         r"""
         Signals the profiler that the next profiling step has started.
         Get the new ProfilerState and trigger corresponding action.
@@ -574,9 +575,9 @@ def step(self, num_samples: Optional[int]=None):
         self.step_num += 1
         self.current_state = self.scheduler(self.step_num)
         self._trigger_action()
-        self.record_event = RecordEvent(
-            name="ProfileStep#{}".format(self.step_num),
-            event_type=TracerEventType.ProfileStep)
+        self.record_event = RecordEvent(name="ProfileStep#{}".format(
+            self.step_num),
+                                        event_type=TracerEventType.ProfileStep)
         self.record_event.begin()
 
     def step_info(self, unit=None):
@@ -747,12 +748,11 @@ def summary(self,
                 self.profiler_result.get_data(),
                 self.profiler_result.get_extra_info())
             print(
-                _build_table(
-                    statistic_data,
-                    sorted_by=sorted_by,
-                    op_detail=op_detail,
-                    thread_sep=thread_sep,
-                    time_unit=time_unit))
+                _build_table(statistic_data,
+                             sorted_by=sorted_by,
+                             op_detail=op_detail,
+                             thread_sep=thread_sep,
+                             time_unit=time_unit))
 
 
 def get_profiler(config_path):
@@ -820,6 +820,7 @@ def get_profiler(config_path):
             translated_config_dict['timer_only'] = config_dict['timer_only']
         else:
             print(
-                'Set timer_only parameter error, use default parameter instead.')
+                'Set timer_only parameter error, use default parameter instead.'
+            )
 
     return Profiler(**translated_config_dict)
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index 50aa3a1f11f85..daa6925c4b907 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -197,8 +197,8 @@ class TimeRangeSummary:
     def __init__(self):
         self.CPUTimeRange = collections.defaultdict(list)
         self.GPUTimeRange = collections.defaultdict(
-            lambda: collections.defaultdict(list)
-        )  # GPU events should be divided into different devices
+            lambda: collections.defaultdict(
+                list))  # GPU events should be divided into different devices
         self.CPUTimeRangeSum = collections.defaultdict(int)
         self.GPUTimeRangeSum = collections.defaultdict(
             lambda: collections.defaultdict(int))
@@ -212,8 +212,8 @@ def parse(self, nodetrees):
         for threadid, hostnodes in thread2hostnodes.items():
             CPUTimeRange = collections.defaultdict(list)
             GPUTimeRange = collections.defaultdict(
-                lambda: collections.defaultdict(lambda: collections.defaultdict(list))
-            )  # device_id/type/stream_id
+                lambda: collections.defaultdict(lambda: collections.defaultdict(
+                    list)))  # device_id/type/stream_id
             for hostnode in hostnodes[1:]:  #skip root node
                 CPUTimeRange[hostnode.type].append(
                     (hostnode.start_ns, hostnode.end_ns))
@@ -235,8 +235,8 @@ def parse(self, nodetrees):
             for device_id, device_time_ranges in GPUTimeRange.items():
                 for event_type, event_time_ranges in device_time_ranges.items():
                     for stream_id, time_ranges in event_time_ranges.items():
-                        time_ranges = merge_self_ranges(
-                            time_ranges, is_sorted=False)
+                        time_ranges = merge_self_ranges(time_ranges,
+                                                        is_sorted=False)
                         self.GPUTimeRange[device_id][event_type] = merge_ranges(
                             self.GPUTimeRange[device_id][event_type],
                             time_ranges,
@@ -310,25 +310,27 @@ def parse(self, nodetrees):
                         for devicenode in runtimenode.device_node:
                             if devicenode.type == TracerEventType.Kernel:
                                 if 'nccl' in devicenode.name.lower():
-                                    self.gpu_communication_range.append((
-                                        devicenode.start_ns, devicenode.end_ns))
+                                    self.gpu_communication_range.append(
+                                        (devicenode.start_ns,
+                                         devicenode.end_ns))
                                 else:
-                                    self.computation_range.append((
-                                        devicenode.start_ns, devicenode.end_ns))
+                                    self.computation_range.append(
+                                        (devicenode.start_ns,
+                                         devicenode.end_ns))
         self.cpu_calls = len(set(self.cpu_communication_range))
         self.gpu_calls = len(set(self.gpu_communication_range))
         self.cpu_communication_range = merge_self_ranges(
             self.cpu_communication_range, is_sorted=False)
         self.gpu_communication_range = merge_self_ranges(
             self.gpu_communication_range, is_sorted=False)
-        self.communication_range = merge_ranges(
-            self.cpu_communication_range,
-            self.gpu_communication_range,
-            is_sorted=True)
-        self.computation_range = merge_self_ranges(
-            self.computation_range, is_sorted=False)
-        self.overlap_range = intersection_ranges(
-            self.communication_range, self.computation_range, is_sorted=True)
+        self.communication_range = merge_ranges(self.cpu_communication_range,
+                                                self.gpu_communication_range,
+                                                is_sorted=True)
+        self.computation_range = merge_self_ranges(self.computation_range,
+                                                   is_sorted=False)
+        self.overlap_range = intersection_ranges(self.communication_range,
+                                                 self.computation_range,
+                                                 is_sorted=True)
 
 
 class EventSummary:
@@ -337,6 +339,7 @@ class EventSummary:
     """
 
     class DeviceItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -360,6 +363,7 @@ def add_item(self, node):
             self.add_gpu_time(node.end_ns - node.start_ns)
 
     class OperatorItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -430,6 +434,7 @@ def add_item(self, node):
                     self.devices[name].add_item(devicenode)
 
     class GeneralItem:
+
         def __init__(self, name):
             self.name = name
             self.call = 0
@@ -688,13 +693,14 @@ def format_ratio(ratio, indent=0):
     append(row_format.format(*headers))
     append(header_sep)
     row_values = [
-        'CPU(Process)', format_ratio(
-            float(statistic_data.extra_info['Process Cpu Utilization']))
+        'CPU(Process)',
+        format_ratio(float(
+            statistic_data.extra_info['Process Cpu Utilization']))
     ]
     append(row_format.format(*row_values))
     row_values = [
-        'CPU(System)', format_ratio(
-            float(statistic_data.extra_info['System Cpu Utilization']))
+        'CPU(System)',
+        format_ratio(float(statistic_data.extra_info['System Cpu Utilization']))
     ]
     append(row_format.format(*row_values))
     for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
@@ -783,20 +789,22 @@ def format_ratio(ratio, indent=0):
             TracerEventType.
             Communication] = statistic_data.distributed_summary.gpu_calls
 
-    sorted_items = sorted(
-        cpu_type_time.items(), key=lambda x: x[1], reverse=True)
+    sorted_items = sorted(cpu_type_time.items(),
+                          key=lambda x: x[1],
+                          reverse=True)
     event_type, time = sorted_items[0]
     row_values = [
         '{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type],
-        format_time(
-            time, unit=time_unit), format_ratio(float(time) / total_time)
+        format_time(time, unit=time_unit),
+        format_ratio(float(time) / total_time)
     ]
     append(row_format.format(*row_values))
     for event_type, time in sorted_items[1:]:
         row_values = [
             '  {}'.format(str(event_type).split('.')[1]),
-            cpu_call_times[event_type], format_time(
-                time, unit=time_unit), format_ratio(float(time) / total_time)
+            cpu_call_times[event_type],
+            format_time(time, unit=time_unit),
+            format_ratio(float(time) / total_time)
         ]
         append(row_format.format(*row_values))
     append(header_sep)
@@ -806,8 +814,9 @@ def format_ratio(ratio, indent=0):
     for event_type, time in gpu_type_time.items():
         row_values = [
             '  {}'.format(str(event_type).split('.')[1]),
-            gpu_call_times[event_type], format_time(
-                time, unit=time_unit), format_ratio(float(time) / total_time)
+            gpu_call_times[event_type],
+            format_time(time, unit=time_unit),
+            format_ratio(float(time) / total_time)
         ]
         append(row_format.format(*row_values))
 
@@ -851,24 +860,16 @@ def format_ratio(ratio, indent=0):
                 row_values = [
                     '{}'.format(name), item.call,
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.cpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_cpu_time, unit=time_unit),
-                        format_time(
-                            item.max_cpu_time, unit=time_unit),
-                        format_time(
-                            item.min_cpu_time, unit=time_unit),
+                        format_time(item.cpu_time, unit=time_unit),
+                        format_time(item.avg_cpu_time, unit=time_unit),
+                        format_time(item.max_cpu_time, unit=time_unit),
+                        format_time(item.min_cpu_time, unit=time_unit),
                         format_ratio(float(item.cpu_time) / total_time)),
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.gpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_gpu_time, unit=time_unit),
-                        format_time(
-                            item.max_gpu_time, unit=time_unit),
-                        format_time(
-                            item.min_gpu_time, unit=time_unit),
+                        format_time(item.gpu_time, unit=time_unit),
+                        format_time(item.avg_gpu_time, unit=time_unit),
+                        format_time(item.max_gpu_time, unit=time_unit),
+                        format_time(item.min_gpu_time, unit=time_unit),
                         format_ratio(gpu_ratio))
                 ]
                 all_row_values.append(row_values)
@@ -884,12 +885,10 @@ def format_ratio(ratio, indent=0):
             gpu_ratio = float(other_gpu_time) / gpu_total_time
         row_values = [
             '  Others', '-', '{} / - / - / - / {}'.format(
-                format_time(
-                    other_time, unit=time_unit),
+                format_time(other_time, unit=time_unit),
                 format_ratio(float(other_time) / total_time)),
             '{} / - / - / - / {}'.format(
-                format_time(
-                    other_gpu_time, unit=time_unit),
+                format_time(other_gpu_time, unit=time_unit),
                 format_ratio(gpu_ratio))
         ]
         all_row_values.append(row_values)
@@ -971,28 +970,28 @@ def format_ratio(ratio, indent=0):
         overlap_time = sum_ranges(
             statistic_data.distributed_summary.overlap_range)
         row_values = [
-            'ProfileStep', format_time(
-                total_time, unit=time_unit),
+            'ProfileStep',
+            format_time(total_time, unit=time_unit),
             format_ratio(float(total_time) / total_time)
         ]
         append(row_format.format(*row_values))
         row_values = [
-            '  Communication', format_time(
-                communication_time, unit=time_unit),
+            '  Communication',
+            format_time(communication_time, unit=time_unit),
             format_ratio(float(communication_time) / total_time)
         ]
         append(row_format.format(*row_values))
 
         row_values = [
-            '  Computation', format_time(
-                computation_time, unit=time_unit),
+            '  Computation',
+            format_time(computation_time, unit=time_unit),
             format_ratio(float(computation_time) / total_time)
         ]
         append(row_format.format(*row_values))
 
         row_values = [
-            '  Overlap', format_time(
-                overlap_time, unit=time_unit),
+            '  Overlap',
+            format_time(overlap_time, unit=time_unit),
             format_ratio(float(overlap_time) / total_time)
         ]
         append(row_format.format(*row_values))
@@ -1026,39 +1025,35 @@ def format_ratio(ratio, indent=0):
         for thread_id, items in thread_items.items():
             all_row_values.append("Thread: {}".format(thread_id))
             if sorted_by == SortedKeys.CPUTotal:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_cpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_cpu_time)
             elif sorted_by == SortedKeys.GPUTotal:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_general_gpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_general_gpu_time)
             total_op_cpu_time = 0
             total_op_gpu_time = 0
 
@@ -1077,24 +1072,16 @@ def format_ratio(ratio, indent=0):
                     gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
                 row_values = [
                     name, item.call, '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.cpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_cpu_time, unit=time_unit),
-                        format_time(
-                            item.max_cpu_time, unit=time_unit),
-                        format_time(
-                            item.min_cpu_time, unit=time_unit),
+                        format_time(item.cpu_time, unit=time_unit),
+                        format_time(item.avg_cpu_time, unit=time_unit),
+                        format_time(item.max_cpu_time, unit=time_unit),
+                        format_time(item.min_cpu_time, unit=time_unit),
                         format_ratio(cpu_ratio)),
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.max_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.min_general_gpu_time, unit=time_unit),
+                        format_time(item.general_gpu_time, unit=time_unit),
+                        format_time(item.avg_general_gpu_time, unit=time_unit),
+                        format_time(item.max_general_gpu_time, unit=time_unit),
+                        format_time(item.min_general_gpu_time, unit=time_unit),
                         format_ratio(gpu_ratio))
                 ]
                 all_row_values.append(row_values)
@@ -1117,28 +1104,24 @@ def format_ratio(ratio, indent=0):
                         row_values = [
                             '  {}'.format(innerop_name), innerop_node.call,
                             '{} / {} / {} / {} / {}'.format(
-                                format_time(
-                                    innerop_node.cpu_time, unit=time_unit),
-                                format_time(
-                                    innerop_node.avg_cpu_time, unit=time_unit),
-                                format_time(
-                                    innerop_node.max_cpu_time, unit=time_unit),
-                                format_time(
-                                    innerop_node.min_cpu_time, unit=time_unit),
+                                format_time(innerop_node.cpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.avg_cpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.max_cpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.min_cpu_time,
+                                            unit=time_unit),
                                 format_ratio(cpu_ratio)),
                             '{} / {} / {} / {} / {}'.format(
-                                format_time(
-                                    innerop_node.general_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    innerop_node.avg_general_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    innerop_node.max_general_gpu_time,
-                                    unit=time_unit),
-                                format_time(
-                                    innerop_node.min_general_gpu_time,
-                                    unit=time_unit),
+                                format_time(innerop_node.general_gpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.avg_general_gpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.max_general_gpu_time,
+                                            unit=time_unit),
+                                format_time(innerop_node.min_general_gpu_time,
+                                            unit=time_unit),
                                 format_ratio(gpu_ratio))
                         ]
                         all_row_values.append(row_values)
@@ -1148,8 +1131,8 @@ def format_ratio(ratio, indent=0):
                                 gpu_ratio = 0
                             else:
                                 gpu_ratio = float(
-                                    device_node.
-                                    gpu_time) / innerop_node.general_gpu_time
+                                    device_node.gpu_time
+                                ) / innerop_node.general_gpu_time
                             if len(device_node_name) + 4 > name_column_width:
                                 device_node_name = device_node_name[:
                                                                     name_column_width
@@ -1159,17 +1142,14 @@ def format_ratio(ratio, indent=0):
                                 '    {}'.format(device_node_name),
                                 device_node.call, '- / - / - / - / -',
                                 '{} / {} / {} / {} / {}'.format(
-                                    format_time(
-                                        device_node.gpu_time, unit=time_unit),
-                                    format_time(
-                                        device_node.avg_gpu_time,
-                                        unit=time_unit),
-                                    format_time(
-                                        device_node.max_gpu_time,
-                                        unit=time_unit),
-                                    format_time(
-                                        device_node.min_gpu_time,
-                                        unit=time_unit),
+                                    format_time(device_node.gpu_time,
+                                                unit=time_unit),
+                                    format_time(device_node.avg_gpu_time,
+                                                unit=time_unit),
+                                    format_time(device_node.max_gpu_time,
+                                                unit=time_unit),
+                                    format_time(device_node.min_gpu_time,
+                                                unit=time_unit),
                                     format_ratio(gpu_ratio))
                             ]
                             all_row_values.append(row_values)
@@ -1188,14 +1168,14 @@ def format_ratio(ratio, indent=0):
                             '  {}'.format(device_node_name), device_node.call,
                             '- / - / - / - / -',
                             '{} / {} / {} / {} / {}'.format(
-                                format_time(
-                                    device_node.gpu_time, unit=time_unit),
-                                format_time(
-                                    device_node.avg_gpu_time, unit=time_unit),
-                                format_time(
-                                    device_node.max_gpu_time, unit=time_unit),
-                                format_time(
-                                    device_node.min_gpu_time, unit=time_unit),
+                                format_time(device_node.gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.avg_gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.max_gpu_time,
+                                            unit=time_unit),
+                                format_time(device_node.min_gpu_time,
+                                            unit=time_unit),
                                 format_ratio(gpu_ratio))
                         ]
                         all_row_values.append(row_values)
@@ -1249,21 +1229,20 @@ def format_ratio(ratio, indent=0):
         all_row_values = []
         kernel_items = statistic_data.event_summary.kernel_items
         if sorted_by == SortedKeys.GPUAvg:
-            sorted_items = sorted(
-                kernel_items.items(),
-                key=lambda x: x[1].avg_gpu_time,
-                reverse=True)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].avg_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUMax:
-            sorted_items = sorted(
-                kernel_items.items(),
-                key=lambda x: x[1].max_gpu_time,
-                reverse=True)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].max_gpu_time,
+                                  reverse=True)
         elif sorted_by == SortedKeys.GPUMin:
-            sorted_items = sorted(
-                kernel_items.items(), key=lambda x: x[1].min_gpu_time)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].min_gpu_time)
         else:
-            sorted_items = sorted(
-                kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+            sorted_items = sorted(kernel_items.items(),
+                                  key=lambda x: x[1].gpu_time,
+                                  reverse=True)
 
         total_kernel_gpu_time = 0
         for name, item in sorted_items:
@@ -1277,14 +1256,10 @@ def format_ratio(ratio, indent=0):
                 name,
                 item.call,
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.gpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_gpu_time, unit=time_unit),
-                    format_time(
-                        item.max_gpu_time, unit=time_unit),
-                    format_time(
-                        item.min_gpu_time, unit=time_unit),
+                    format_time(item.gpu_time, unit=time_unit),
+                    format_time(item.avg_gpu_time, unit=time_unit),
+                    format_time(item.max_gpu_time, unit=time_unit),
+                    format_time(item.min_gpu_time, unit=time_unit),
                     format_ratio(gpu_ratio)),
             ]
             all_row_values.append(row_values)
@@ -1349,24 +1324,16 @@ def format_ratio(ratio, indent=0):
                 name,
                 item.call,
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.cpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_cpu_time, unit=time_unit),
-                    format_time(
-                        item.max_cpu_time, unit=time_unit),
-                    format_time(
-                        item.min_cpu_time, unit=time_unit),
+                    format_time(item.cpu_time, unit=time_unit),
+                    format_time(item.avg_cpu_time, unit=time_unit),
+                    format_time(item.max_cpu_time, unit=time_unit),
+                    format_time(item.min_cpu_time, unit=time_unit),
                     format_ratio(float(item.cpu_time) / total_time)),
                 '{} / {} / {} / {} / {}'.format(
-                    format_time(
-                        item.general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.avg_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.max_general_gpu_time, unit=time_unit),
-                    format_time(
-                        item.min_general_gpu_time, unit=time_unit),
+                    format_time(item.general_gpu_time, unit=time_unit),
+                    format_time(item.avg_general_gpu_time, unit=time_unit),
+                    format_time(item.max_general_gpu_time, unit=time_unit),
+                    format_time(item.min_general_gpu_time, unit=time_unit),
                     format_ratio(gpu_ratio)),
             ]
             all_row_values.append(row_values)
@@ -1429,39 +1396,35 @@ def format_ratio(ratio, indent=0):
         for thread_id, items in userdefined_thread_items.items():
             all_row_values.append("Thread: {}".format(thread_id))
             if sorted_by == SortedKeys.CPUTotal:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_cpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_cpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.CPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_cpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_cpu_time)
             elif sorted_by == SortedKeys.GPUTotal:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUAvg:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].avg_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].avg_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMax:
-                sorted_items = sorted(
-                    items.items(),
-                    key=lambda x: x[1].max_general_gpu_time,
-                    reverse=True)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].max_general_gpu_time,
+                                      reverse=True)
             elif sorted_by == SortedKeys.GPUMin:
-                sorted_items = sorted(
-                    items.items(), key=lambda x: x[1].min_general_gpu_time)
+                sorted_items = sorted(items.items(),
+                                      key=lambda x: x[1].min_general_gpu_time)
 
             for name, item in sorted_items:
                 if gpu_total_time == 0:
@@ -1472,24 +1435,16 @@ def format_ratio(ratio, indent=0):
                     name,
                     item.call,
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.cpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_cpu_time, unit=time_unit),
-                        format_time(
-                            item.max_cpu_time, unit=time_unit),
-                        format_time(
-                            item.min_cpu_time, unit=time_unit),
+                        format_time(item.cpu_time, unit=time_unit),
+                        format_time(item.avg_cpu_time, unit=time_unit),
+                        format_time(item.max_cpu_time, unit=time_unit),
+                        format_time(item.min_cpu_time, unit=time_unit),
                         format_ratio(float(item.cpu_time) / total_time)),
                     '{} / {} / {} / {} / {}'.format(
-                        format_time(
-                            item.general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.avg_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.max_general_gpu_time, unit=time_unit),
-                        format_time(
-                            item.min_general_gpu_time, unit=time_unit),
+                        format_time(item.general_gpu_time, unit=time_unit),
+                        format_time(item.avg_general_gpu_time, unit=time_unit),
+                        format_time(item.max_general_gpu_time, unit=time_unit),
+                        format_time(item.min_general_gpu_time, unit=time_unit),
                         format_ratio(gpu_ratio)),
                 ]
                 all_row_values.append(row_values)
diff --git a/python/paddle/profiler/statistic_helper.py b/python/paddle/profiler/statistic_helper.py
index 76dd1f0a6439d..358f2a09b9264 100644
--- a/python/paddle/profiler/statistic_helper.py
+++ b/python/paddle/profiler/statistic_helper.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/profiler/timer.py b/python/paddle/profiler/timer.py
index 815775ebc6aad..35689feb56c82 100644
--- a/python/paddle/profiler/timer.py
+++ b/python/paddle/profiler/timer.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -127,24 +127,20 @@ def get_summary(self):
         else:
             speed_avg = float(self.total_iters) / self.batch_records['total']
 
-        reader_summary = dict(
-            max=self.reader_records['max'],
-            min=self.reader_records['min'],
-            avg=reader_avg)
-        batch_summary = dict(
-            max=self.batch_records['max'],
-            min=self.batch_records['min'],
-            avg=batch_avg)
-        ips_summary = dict(
-            max=self.speed_records['max'],
-            min=self.speed_records['min'],
-            avg=speed_avg)
+        reader_summary = dict(max=self.reader_records['max'],
+                              min=self.reader_records['min'],
+                              avg=reader_avg)
+        batch_summary = dict(max=self.batch_records['max'],
+                             min=self.batch_records['min'],
+                             avg=batch_avg)
+        ips_summary = dict(max=self.speed_records['max'],
+                           min=self.speed_records['min'],
+                           avg=speed_avg)
         reader_ratio = (reader_avg / batch_avg) * 100
-        summary = dict(
-            reader_summary=reader_summary,
-            batch_summary=batch_summary,
-            ips_summary=ips_summary,
-            reader_ratio=reader_ratio)
+        summary = dict(reader_summary=reader_summary,
+                       batch_summary=batch_summary,
+                       ips_summary=ips_summary,
+                       reader_ratio=reader_ratio)
 
         return summary
 
@@ -225,8 +221,8 @@ def after_step(self, benchmark):
 
         """
 
-        if (benchmark.current_event is None) or (
-                not benchmark.current_event.need_record):
+        if (benchmark.current_event is
+                None) or (not benchmark.current_event.need_record):
             return
         batch_cost = timeit.default_timer() - self.start_time
         benchmark.current_event.record_batch(batch_cost, benchmark.num_samples)
@@ -269,9 +265,7 @@ def _print_stats(self, item, message_dict):
         avg_str = '%.5f' % (message_dict['avg'])
         max_str = '%.5f' % (message_dict['max'])
         min_str = '%.5f' % (message_dict['min'])
-        print('|',
-              item.center(15), '|',
-              avg_str.center(15), '|',
+        print('|', item.center(15), '|', avg_str.center(15), '|',
               max_str.center(15), '|', min_str.center(15), '|')
 
 
@@ -399,7 +393,7 @@ def check_if_need_record(self, reader):
             elif self.current_event.reader.__dict__[
                     '_dataset'] != reader.__dict__['_dataset']:
                 # enter a new task but not calling beign() to record it.
-                # we pause the timer until the end of new task, so that 
+                # we pause the timer until the end of new task, so that
                 # the cost of new task is not added to the current event.
                 # eg. start evaluation in the training task
                 self.current_event.need_record = False
diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py
index 5e95c83129f53..a02311cc92985 100644
--- a/python/paddle/profiler/utils.py
+++ b/python/paddle/profiler/utils.py
@@ -62,9 +62,10 @@ class RecordEvent(ContextDecorator):
         RecordEvent will take effect only when :ref:`Profiler <api_paddle_profiler_Profiler>` is on and at the state of RECORD.
     """
 
-    def __init__(self,
-                 name: str,
-                 event_type: TracerEventType=TracerEventType.PythonUserDefined):
+    def __init__(
+            self,
+            name: str,
+            event_type: TracerEventType = TracerEventType.PythonUserDefined):
         self.name = name
         self.event_type = event_type
         self.event = None
@@ -158,13 +159,14 @@ def in_profiler_mode():
 
 
 def wrap_optimizers():
+
     def optimizer_warpper(func):
+
         @functools.wraps(func)
         def warpper(*args, **kwargs):
             if in_profiler_mode():
-                with RecordEvent(
-                        'Optimization Step',
-                        event_type=TracerEventType.Optimization):
+                with RecordEvent('Optimization Step',
+                                 event_type=TracerEventType.Optimization):
                     return func(*args, **kwargs)
             else:
                 return func(*args, **kwargs)
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 66f971c59d7d5..981f6e9253c06 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -34,10 +34,10 @@
 __all__ = []
 
 # On macOS, the 'spawn' start method is now the default in Python3.8 multiprocessing,
-# Paddle is currently unable to solve this, so forces the process to start using 
+# Paddle is currently unable to solve this, so forces the process to start using
 # the 'fork' start method.
 #
-# TODO: This solution is not good, because the fork start method could lead to 
+# TODO: This solution is not good, because the fork start method could lead to
 # crashes of the subprocess. Figure out how to make 'spawn' work.
 #
 # For more details, please refer to
@@ -350,10 +350,10 @@ def read_worker(r, q):
     def data_reader():
         r = reader()
         q = Queue(maxsize=size)
-        t = Thread(
-            target=read_worker, args=(
-                r,
-                q, ))
+        t = Thread(target=read_worker, args=(
+            r,
+            q,
+        ))
         t.daemon = True
         t.start()
         e = q.get()
@@ -477,8 +477,8 @@ def xreader():
         t.start()
         # start several handle_workers
         target = order_handle_worker if order else handle_worker
-        args = (in_queue, out_queue, mapper, out_order) if order else (
-            in_queue, out_queue, mapper)
+        args = (in_queue, out_queue, mapper,
+                out_order) if order else (in_queue, out_queue, mapper)
         workers = []
         for i in range(process_num):
             worker = Thread(target=target, args=args)
@@ -614,8 +614,8 @@ def _read_into_queue(reader, queue):
     def queue_reader():
         queue = fork_context.Queue(queue_size)
         for reader in readers:
-            p = fork_context.Process(
-                target=_read_into_queue, args=(reader, queue))
+            p = fork_context.Process(target=_read_into_queue,
+                                     args=(reader, queue))
             p.start()
 
         reader_num = len(readers)
@@ -656,8 +656,8 @@ def pipe_reader():
         for reader in readers:
             parent_conn, child_conn = fork_context.Pipe()
             conns.append(parent_conn)
-            p = fork_context.Process(
-                target=_read_into_pipe, args=(reader, child_conn))
+            p = fork_context.Process(target=_read_into_pipe,
+                                     args=(reader, child_conn))
             p.start()
 
         reader_num = len(readers)
diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
index e11600a06fb9e..902a8cbe06c2f 100644
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -23,6 +23,7 @@
 
 
 def reader_creator_10(dur):
+
     def reader():
         for i in range(10):
             # this invocation helps testing paddle.reader.buffer
@@ -33,6 +34,7 @@ def reader():
 
 
 class TestMap(unittest.TestCase):
+
     def test_map(self):
         d = {"h": 0, "i": 1}
 
@@ -49,6 +51,7 @@ def read():
 
 
 class TestBuffered(unittest.TestCase):
+
     def test_read(self):
         for size in range(20):
             b = paddle.reader.buffered(reader_creator_10(0), size)
@@ -73,9 +76,10 @@ def test_buffering(self):
 
 
 class TestCompose(unittest.TestCase):
+
     def test_compse(self):
-        reader = paddle.reader.compose(
-            reader_creator_10(0), reader_creator_10(0))
+        reader = paddle.reader.compose(reader_creator_10(0),
+                                       reader_creator_10(0))
         for idx, e in enumerate(reader()):
             self.assertEqual(e, (idx, idx))
 
@@ -92,10 +96,10 @@ def test_compose_not_aligned(self):
 
     def test_compose_not_aligned_no_check(self):
         total = 0
-        reader = paddle.reader.compose(
-            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
-            reader_creator_10(0),
-            check_alignment=False)
+        reader = paddle.reader.compose(paddle.reader.chain(
+            reader_creator_10(0), reader_creator_10(0)),
+                                       reader_creator_10(0),
+                                       check_alignment=False)
         for e in reader():
             total += 1
         # expecting 10, not 20
@@ -103,6 +107,7 @@ def test_compose_not_aligned_no_check(self):
 
 
 class TestChain(unittest.TestCase):
+
     def test_chain(self):
         c = paddle.reader.chain(reader_creator_10(0), reader_creator_10(0))
         idx = 0
@@ -113,6 +118,7 @@ def test_chain(self):
 
 
 class TestShuffle(unittest.TestCase):
+
     def test_shuffle(self):
         case = [(0, True), (1, True), (10, False), (100, False)]
         a = reader_creator_10(0)
@@ -127,7 +133,9 @@ def test_shuffle(self):
 
 
 class TestXmap(unittest.TestCase):
+
     def test_xmap(self):
+
         def mapper(x):
             return (x + 1)
 
@@ -151,6 +159,7 @@ def mapper(x):
 
 
 class TestMultiProcessReader(unittest.TestCase):
+
     def setup(self):
         self.samples = []
         for i in range(1000):
diff --git a/python/paddle/signal.py b/python/paddle/signal.py
index ba2f842c395ac..6725373d0570c 100644
--- a/python/paddle/signal.py
+++ b/python/paddle/signal.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -134,20 +134,19 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
         out = op(x, *attrs)
     else:
         check_variable_and_dtype(
-            x, 'x', ['int32', 'int64', 'float16', 'float32',
-                     'float64'], op_type)
+            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'],
+            op_type)
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype=dtype)
-        helper.append_op(
-            type=op_type,
-            inputs={'X': x},
-            attrs={
-                'frame_length': frame_length,
-                'hop_length': hop_length,
-                'axis': axis
-            },
-            outputs={'Out': out})
+        helper.append_op(type=op_type,
+                         inputs={'X': x},
+                         attrs={
+                             'frame_length': frame_length,
+                             'hop_length': hop_length,
+                             'axis': axis
+                         },
+                         outputs={'Out': out})
     return out
 
 
@@ -220,17 +219,18 @@ def overlap_add(x, hop_length, axis=-1, name=None):
         out = op(x, *attrs)
     else:
         check_variable_and_dtype(
-            x, 'x', ['int32', 'int64', 'float16', 'float32',
-                     'float64'], op_type)
+            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'],
+            op_type)
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype=dtype)
-        helper.append_op(
-            type=op_type,
-            inputs={'X': x},
-            attrs={'hop_length': hop_length,
-                   'axis': axis},
-            outputs={'Out': out})
+        helper.append_op(type=op_type,
+                         inputs={'X': x},
+                         attrs={
+                             'hop_length': hop_length,
+                             'axis': axis
+                         },
+                         outputs={'Out': out})
     return out
 
 
@@ -306,8 +306,9 @@ def stft(x,
                     paddle.randn([8, 48000], dtype=paddle.float64)*1j  # [8, 48000] complex128
             y1 = stft(x, n_fft=512, center=False, onesided=False)  # [8, 512, 372]
     """
-    check_variable_and_dtype(
-        x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft')
+    check_variable_and_dtype(x, 'x',
+                             ['float32', 'float64', 'complex64', 'complex128'],
+                             'stft')
 
     x_rank = len(x.shape)
     assert x_rank in [1, 2], \
@@ -368,17 +369,20 @@ def stft(x,
             'onesided should be False when input or window is a complex Tensor.'
 
     if not is_complex(x):
-        out = fft_r2c(
-            x=x_frames,
-            n=None,
-            axis=-1,
-            norm=norm,
-            forward=True,
-            onesided=onesided,
-            name=name)
+        out = fft_r2c(x=x_frames,
+                      n=None,
+                      axis=-1,
+                      norm=norm,
+                      forward=True,
+                      onesided=onesided,
+                      name=name)
     else:
-        out = fft_c2c(
-            x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name)
+        out = fft_c2c(x=x_frames,
+                      n=None,
+                      axis=-1,
+                      norm=norm,
+                      forward=True,
+                      name=name)
 
     out = out.transpose(perm=[0, 2, 1])  # (batch, n_fft, num_frames)
 
@@ -541,14 +545,14 @@ def istft(x,
 
     out = paddle.multiply(out, window).transpose(
         perm=[0, 2, 1])  # (batch, n_fft, num_frames)
-    out = overlap_add(
-        x=out, hop_length=hop_length, axis=-1)  # (batch, seq_length)
+    out = overlap_add(x=out, hop_length=hop_length,
+                      axis=-1)  # (batch, seq_length)
 
     window_envelop = overlap_add(
         x=paddle.tile(
             x=paddle.multiply(window, window).unsqueeze(0),
-            repeat_times=[n_frames, 1]).transpose(
-                perm=[1, 0]),  # (n_fft, num_frames)
+            repeat_times=[n_frames,
+                          1]).transpose(perm=[1, 0]),  # (n_fft, num_frames)
         hop_length=hop_length,
         axis=-1)  # (seq_length, )
 
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index bce77380d1fcc..8707c259ead7d 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -69,52 +69,17 @@
 from ..fluid.layers.metric_op import auc  # noqa: F401
 from ..fluid.layers.metric_op import accuracy  # noqa: F401
 
-__all__ = [     #noqa
-           'append_backward',
-           'gradients',
-           'Executor',
-           'global_scope',
-           'scope_guard',
-           'BuildStrategy',
-           'CompiledProgram',
-           'ipu_shard_guard',
-           'IpuCompiledProgram',
-           'IpuStrategy',
-           'Print',
-           'py_func',
-           'ExecutionStrategy',
-           'name_scope',
-           'ParallelExecutor',
-           'program_guard',
-           'WeightNormParamAttr',
-           'ExponentialMovingAverage',
-           'default_main_program',
-           'default_startup_program',
-           'Program',
-           'data',
-           'InputSpec',
-           'save',
-           'load',
-           'save_inference_model',
-           'load_inference_model',
-           'serialize_program',
-           'serialize_persistables',
-           'save_to_file',
-           'deserialize_program',
-           'deserialize_persistables',
-           'load_from_file',
-           'normalize_program',
-           'load_program_state',
-           'set_program_state',
-           'cpu_places',
-           'cuda_places',
-           'xpu_places',
-           'npu_places',
-           'mlu_places',
-           'Variable',
-           'create_global_var',
-           'accuracy',
-           'auc',
-           'device_guard',
-           'create_parameter'
+__all__ = [  #noqa
+    'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard',
+    'BuildStrategy', 'CompiledProgram', 'ipu_shard_guard', 'IpuCompiledProgram',
+    'IpuStrategy', 'Print', 'py_func', 'ExecutionStrategy', 'name_scope',
+    'ParallelExecutor', 'program_guard', 'WeightNormParamAttr',
+    'ExponentialMovingAverage', 'default_main_program',
+    'default_startup_program', 'Program', 'data', 'InputSpec', 'save', 'load',
+    'save_inference_model', 'load_inference_model', 'serialize_program',
+    'serialize_persistables', 'save_to_file', 'deserialize_program',
+    'deserialize_persistables', 'load_from_file', 'normalize_program',
+    'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places',
+    'xpu_places', 'npu_places', 'mlu_places', 'Variable', 'create_global_var',
+    'accuracy', 'auc', 'device_guard', 'create_parameter'
 ]
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
index f58c06c9b51b6..4098ae5dbf358 100644
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
@@ -251,8 +251,9 @@ def batch(self, batch_size):
                     format(batch_size, len(batch_size)))
             batch_size = batch_size[1]
         elif not isinstance(batch_size, six.integer_types):
-            raise TypeError("type(batch_size) shall be `int`, but received {}.".
-                            format(type(batch_size).__name__))
+            raise TypeError(
+                "type(batch_size) shall be `int`, but received {}.".format(
+                    type(batch_size).__name__))
 
         new_shape = [batch_size] + list(self.shape)
         self.shape = tuple(new_shape)
@@ -289,19 +290,20 @@ def _verify(self, shape):
         """
         if not isinstance(shape, (list, tuple)):
             raise TypeError(
-                "Type of `shape` in InputSpec should be one of (tuple, list), but received {}.".
-                format(type(shape).__name__))
+                "Type of `shape` in InputSpec should be one of (tuple, list), but received {}."
+                .format(type(shape).__name__))
         if len(shape) == 0:
             raise ValueError(
-                "`shape` in InputSpec should contain at least 1 element, but received {}.".
-                format(shape))
+                "`shape` in InputSpec should contain at least 1 element, but received {}."
+                .format(shape))
 
         for i, ele in enumerate(shape):
             if ele is not None:
                 if not isinstance(ele, six.integer_types):
                     raise ValueError(
                         "shape[{}] should be an `int`, but received `{}`:{}.".
-                        format(i, type(ele).__name__, ele))
+                        format(i,
+                               type(ele).__name__, ele))
             if ele is None or ele < -1:
                 shape[i] = -1
 
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index 05a3389fd1588..1950835151074 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -31,7 +31,8 @@
     Program,
     layers,
     unique_name,
-    program_guard, )
+    program_guard,
+)
 from paddle.fluid.io import prepend_feed_ops, append_fetch_ops
 from paddle.fluid.framework import static_only, Parameter
 from paddle.fluid.executor import Executor, global_scope
@@ -39,8 +40,9 @@
 
 __all__ = []
 
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(__name__,
+                     logging.INFO,
+                     fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
 def _check_args(caller, args, supported_args=None, deprecated_args=None):
@@ -49,12 +51,12 @@ def _check_args(caller, args, supported_args=None, deprecated_args=None):
     for arg in args:
         if arg in deprecated_args:
             raise ValueError(
-                "argument '{}' in function '{}' is deprecated, only {} are supported.".
-                format(arg, caller, supported_args))
+                "argument '{}' in function '{}' is deprecated, only {} are supported."
+                .format(arg, caller, supported_args))
         elif arg not in supported_args:
             raise ValueError(
-                "function '{}' doesn't support argument '{}',\n only {} are supported.".
-                format(caller, arg, supported_args))
+                "function '{}' doesn't support argument '{}',\n only {} are supported."
+                .format(caller, arg, supported_args))
 
 
 def _check_vars(name, var_list):
@@ -102,20 +104,18 @@ def _get_valid_program(program=None):
 def _clone_var_in_block(block, var):
     assert isinstance(var, Variable)
     if var.desc.type() == core.VarDesc.VarType.LOD_TENSOR:
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=True)
+        return block.create_var(name=var.name,
+                                shape=var.shape,
+                                dtype=var.dtype,
+                                type=var.type,
+                                lod_level=var.lod_level,
+                                persistable=True)
     else:
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            persistable=True)
+        return block.create_var(name=var.name,
+                                shape=var.shape,
+                                dtype=var.dtype,
+                                type=var.type,
+                                persistable=True)
 
 
 def normalize_program(program, feed_vars, fetch_vars):
@@ -193,8 +193,9 @@ def normalize_program(program, feed_vars, fetch_vars):
         uniq_fetch_vars = []
         for i, var in enumerate(fetch_vars):
             if var.dtype != paddle.bool:
-                var = layers.scale(
-                    var, 1., name="save_infer_model/scale_{}".format(i))
+                var = layers.scale(var,
+                                   1.,
+                                   name="save_infer_model/scale_{}".format(i))
             uniq_fetch_vars.append(var)
         fetch_vars = uniq_fetch_vars
 
@@ -394,15 +395,16 @@ def _serialize_persistables(program, executor):
         in_vars.append(save_var_map[name])
 
     out_var_name = unique_name.generate("out_var")
-    out_var = save_block.create_var(
-        type=core.VarDesc.VarType.RAW, name=out_var_name)
+    out_var = save_block.create_var(type=core.VarDesc.VarType.RAW,
+                                    name=out_var_name)
     out_var.desc.set_persistable(True)
-    save_block.append_op(
-        type='save_combine',
-        inputs={'X': in_vars},
-        outputs={'Y': out_var},
-        attrs={'file_path': '',
-               'save_to_memory': True})
+    save_block.append_op(type='save_combine',
+                         inputs={'X': in_vars},
+                         outputs={'Y': out_var},
+                         attrs={
+                             'file_path': '',
+                             'save_to_memory': True
+                         })
     # run save_program to save vars
     # NOTE(zhiqiu): save op will add variable kLookupTablePath to save_program.desc,
     # which leads to diff between save_program and its desc. Call _sync_with_cpp
@@ -645,8 +647,10 @@ def deserialize_persistables(program, data, executor):
         inputs={},
         outputs={"Out": load_var_list},
         # if load from memory, file_path is data
-        attrs={'file_path': data,
-               'model_from_memory': True})
+        attrs={
+            'file_path': data,
+            'model_from_memory': True
+        })
     executor.run(load_program)
     # check var shape
     for var in check_vars:
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
index b589d9f87895b..65ed35df36454 100644
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -57,7 +57,7 @@
 from ...fluid.layers.sequence_lod import sequence_enumerate  # noqa: F401
 from ...fluid.layers.sequence_lod import sequence_reverse  # noqa: F401
 
-__all__ = [     #noqa
+__all__ = [  #noqa
     'fc',
     'batch_norm',
     'embedding',
diff --git a/python/paddle/static/sparsity/__init__.py b/python/paddle/static/sparsity/__init__.py
index b4543b8d000fc..11ff30c78e2a7 100644
--- a/python/paddle/static/sparsity/__init__.py
+++ b/python/paddle/static/sparsity/__init__.py
@@ -1,12 +1,12 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 # Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,14 +21,11 @@
 
 
 def set_excluded_layers(main_program, param_names):
-    sparsity.set_excluded_layers(
-        param_names=param_names, main_program=main_program)
+    sparsity.set_excluded_layers(param_names=param_names,
+                                 main_program=main_program)
 
 
-__all__ = [     #noqa
-    'calculate_density',
-    'decorate',
-    'prune_model',
-    'set_excluded_layers',
+__all__ = [  #noqa
+    'calculate_density', 'decorate', 'prune_model', 'set_excluded_layers',
     'reset_excluded_layers'
 ]
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 478f4b6351fbf..3ea3ba4982599 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -277,228 +277,228 @@
 from .einsum import einsum  # noqa: F401
 
 #this list used in math_op_patch.py for _binary_creator_
-tensor_method_func  = [ #noqa
-           'matmul',
-           'dot',
-           'cov',
-           'corrcoef',
-           'norm',
-           'cond',
-           'transpose',
-           'lstsq',
-           'dist',
-           't',
-           'cross',
-           'cholesky',
-           'bmm',
-           'histogram',
-           'bincount',
-           'mv',
-           'matrix_power',
-           'qr',
-           'eigvals',
-           'eigvalsh',
-           'abs',
-           'acos',
-           'all',
-           'any',
-           'asin',
-           'atan',
-           'ceil',
-           'ceil_',
-           'cos',
-           'cosh',
-           'cumsum',
-           'cumprod',
-           'logit',
-           'exp',
-           'exp_',
-           'floor',
-           'floor_',
-           'increment',
-           'log',
-           'log2',
-           'log10',
-           'logsumexp',
-           'multiplex',
-           'pow',
-           'prod',
-           'reciprocal',
-           'reciprocal_',
-           'round',
-           'round_',
-           'rsqrt',
-           'rsqrt_',
-           'scale',
-           'scale_',
-           'sign',
-           'sin',
-           'sinh',
-           'sqrt',
-           'sqrt_',
-           'square',
-           'stanh',
-           'sum',
-           'nansum',
-           'nanmean',
-           'tanh',
-           'tanh_',
-           'add_n',
-           'max',
-           'amax',
-           'maximum',
-           'min',
-           'amin',
-           'minimum',
-           'fmax',
-           'fmin',
-           'mm',
-           'inner',
-           'outer',
-           'divide',
-           'floor_divide',
-           'remainder',
-           'mod',
-           'floor_mod',
-           'multiply',
-           'add',
-           'add_',
-           'subtract',
-           'subtract_',
-           'atan',
-           'logsumexp',
-           'inverse',
-           'log1p',
-           'erf',
-           'addmm',
-           'clip',
-           'clip_',
-           'trace',
-           'kron',
-           'kthvalue',
-           'isfinite',
-           'isinf',
-           'isnan',
-           'broadcast_shape',
-           'conj',
-           'neg',
-           'lgamma',
-           'equal',
-           'equal_all',
-           'greater_equal',
-           'greater_than',
-           'is_empty',
-           'less_equal',
-           'less_than',
-           'logical_and',
-           'logical_not',
-           'logical_or',
-           'logical_xor',
-           'not_equal',
-           'allclose',
-           'isclose',
-           'is_tensor',
-           'cast',
-           'concat',
-           'expand',
-           'broadcast_to',
-           'expand_as',
-           'flatten',
-           'flatten_',
-           'gather',
-           'gather_nd',
-           'reshape',
-           'reshape_',
-           'reverse',
-           'scatter',
-           'scatter_',
-           'scatter_nd_add',
-           'scatter_nd',
-           'shard_index',
-           'slice',
-           'split',
-           'chunk',
-           'tensordot',
-           'squeeze',
-           'squeeze_',
-           'stack',
-           'strided_slice',
-           'transpose',
-           'unique',
-           'unique_consecutive',
-           'unsqueeze',
-           'unsqueeze_',
-           'unstack',
-           'flip',
-           'rot90',
-           'unbind',
-           'roll',
-           'tile',
-           'argmax',
-           'argmin',
-           'argsort',
-           'masked_select',
-           'topk',
-           'where',
-           'index_select',
-           'nonzero',
-           'sort',
-           'index_sample',
-           'mean',
-           'std',
-           'var',
-           'numel',
-           'median',
-           'nanmedian',
-           'quantile',
-           'nanquantile',
-           'is_complex',
-           'is_integer',
-           'rank',
-           'shape',
-           'real',
-           'imag',
-           'is_floating_point',
-           'digamma',
-           'diagonal',
-           'trunc',
-           'frac',
-           'bitwise_and',
-           'bitwise_or',
-           'bitwise_xor',
-           'bitwise_not',
-           'broadcast_tensors',
-           'eig',
-           'uniform_',
-           'multi_dot',
-           'solve',
-           'cholesky_solve',
-           'triangular_solve',
-           'asinh',
-           'atanh',
-           'acosh',
-           'lu',
-           'lu_unpack',
-           'as_complex',
-           'as_real',
-           'rad2deg',
-           'deg2rad',
-           'gcd',
-           'lcm',
-           'diff',
-           "mode",
-           'lerp',
-           'lerp_',
-           'erfinv',
-           'erfinv_',
-           'angle',
-           'moveaxis',
-           'repeat_interleave',
-           'take_along_axis',
-           'put_along_axis',
-           'put_along_axis_',
-           'exponential_',
-           'heaviside',
+tensor_method_func = [  #noqa
+    'matmul',
+    'dot',
+    'cov',
+    'corrcoef',
+    'norm',
+    'cond',
+    'transpose',
+    'lstsq',
+    'dist',
+    't',
+    'cross',
+    'cholesky',
+    'bmm',
+    'histogram',
+    'bincount',
+    'mv',
+    'matrix_power',
+    'qr',
+    'eigvals',
+    'eigvalsh',
+    'abs',
+    'acos',
+    'all',
+    'any',
+    'asin',
+    'atan',
+    'ceil',
+    'ceil_',
+    'cos',
+    'cosh',
+    'cumsum',
+    'cumprod',
+    'logit',
+    'exp',
+    'exp_',
+    'floor',
+    'floor_',
+    'increment',
+    'log',
+    'log2',
+    'log10',
+    'logsumexp',
+    'multiplex',
+    'pow',
+    'prod',
+    'reciprocal',
+    'reciprocal_',
+    'round',
+    'round_',
+    'rsqrt',
+    'rsqrt_',
+    'scale',
+    'scale_',
+    'sign',
+    'sin',
+    'sinh',
+    'sqrt',
+    'sqrt_',
+    'square',
+    'stanh',
+    'sum',
+    'nansum',
+    'nanmean',
+    'tanh',
+    'tanh_',
+    'add_n',
+    'max',
+    'amax',
+    'maximum',
+    'min',
+    'amin',
+    'minimum',
+    'fmax',
+    'fmin',
+    'mm',
+    'inner',
+    'outer',
+    'divide',
+    'floor_divide',
+    'remainder',
+    'mod',
+    'floor_mod',
+    'multiply',
+    'add',
+    'add_',
+    'subtract',
+    'subtract_',
+    'atan',
+    'logsumexp',
+    'inverse',
+    'log1p',
+    'erf',
+    'addmm',
+    'clip',
+    'clip_',
+    'trace',
+    'kron',
+    'kthvalue',
+    'isfinite',
+    'isinf',
+    'isnan',
+    'broadcast_shape',
+    'conj',
+    'neg',
+    'lgamma',
+    'equal',
+    'equal_all',
+    'greater_equal',
+    'greater_than',
+    'is_empty',
+    'less_equal',
+    'less_than',
+    'logical_and',
+    'logical_not',
+    'logical_or',
+    'logical_xor',
+    'not_equal',
+    'allclose',
+    'isclose',
+    'is_tensor',
+    'cast',
+    'concat',
+    'expand',
+    'broadcast_to',
+    'expand_as',
+    'flatten',
+    'flatten_',
+    'gather',
+    'gather_nd',
+    'reshape',
+    'reshape_',
+    'reverse',
+    'scatter',
+    'scatter_',
+    'scatter_nd_add',
+    'scatter_nd',
+    'shard_index',
+    'slice',
+    'split',
+    'chunk',
+    'tensordot',
+    'squeeze',
+    'squeeze_',
+    'stack',
+    'strided_slice',
+    'transpose',
+    'unique',
+    'unique_consecutive',
+    'unsqueeze',
+    'unsqueeze_',
+    'unstack',
+    'flip',
+    'rot90',
+    'unbind',
+    'roll',
+    'tile',
+    'argmax',
+    'argmin',
+    'argsort',
+    'masked_select',
+    'topk',
+    'where',
+    'index_select',
+    'nonzero',
+    'sort',
+    'index_sample',
+    'mean',
+    'std',
+    'var',
+    'numel',
+    'median',
+    'nanmedian',
+    'quantile',
+    'nanquantile',
+    'is_complex',
+    'is_integer',
+    'rank',
+    'shape',
+    'real',
+    'imag',
+    'is_floating_point',
+    'digamma',
+    'diagonal',
+    'trunc',
+    'frac',
+    'bitwise_and',
+    'bitwise_or',
+    'bitwise_xor',
+    'bitwise_not',
+    'broadcast_tensors',
+    'eig',
+    'uniform_',
+    'multi_dot',
+    'solve',
+    'cholesky_solve',
+    'triangular_solve',
+    'asinh',
+    'atanh',
+    'acosh',
+    'lu',
+    'lu_unpack',
+    'as_complex',
+    'as_real',
+    'rad2deg',
+    'deg2rad',
+    'gcd',
+    'lcm',
+    'diff',
+    "mode",
+    'lerp',
+    'lerp_',
+    'erfinv',
+    'erfinv_',
+    'angle',
+    'moveaxis',
+    'repeat_interleave',
+    'take_along_axis',
+    'put_along_axis',
+    'put_along_axis_',
+    'exponential_',
+    'heaviside',
 ]
 
 #this list used in math_op_patch.py for magic_method bind
diff --git a/python/paddle/tensor/array.py b/python/paddle/tensor/array.py
index 856b79c2a6894..02da6926a3f5f 100644
--- a/python/paddle/tensor/array.py
+++ b/python/paddle/tensor/array.py
@@ -62,8 +62,9 @@ def array_length(array):
     helper = LayerHelper('array_length', **locals())
     tmp = helper.create_variable_for_type_inference(dtype='int64')
     tmp.stop_gradient = True
-    helper.append_op(
-        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
+    helper.append_op(type='lod_array_length',
+                     inputs={'X': [array]},
+                     outputs={'Out': [tmp]})
     return tmp
 
 
@@ -126,11 +127,12 @@ def array_read(array, i):
             Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         raise TypeError("array should be tensor array vairable")
     out = helper.create_variable_for_type_inference(dtype=array.dtype)
-    helper.append_op(
-        type='read_from_array',
-        inputs={'X': [array],
-                'I': [i]},
-        outputs={'Out': [out]})
+    helper.append_op(type='read_from_array',
+                     inputs={
+                         'X': [array],
+                         'I': [i]
+                     },
+                     outputs={'Out': [out]})
     return out
 
 
@@ -195,8 +197,8 @@ def array_write(x, i, array=None):
     helper = LayerHelper('array_write', **locals())
     if array is not None:
         if not isinstance(
-                array,
-                Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+                array, Variable
+        ) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             raise TypeError(
                 "array should be tensor array vairable in array_write Op")
     if array is None:
@@ -204,11 +206,12 @@ def array_write(x, i, array=None):
             name="{0}.out".format(helper.name),
             type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
             dtype=x.dtype)
-    helper.append_op(
-        type='write_to_array',
-        inputs={'X': [x],
-                'I': [i]},
-        outputs={'Out': [array]})
+    helper.append_op(type='write_to_array',
+                     inputs={
+                         'X': [x],
+                         'I': [i]
+                     },
+                     outputs={'Out': [array]})
     return array
 
 
@@ -245,16 +248,16 @@ def create_array(dtype, initialized_list=None):
     if initialized_list is not None:
         if not isinstance(initialized_list, (list, tuple)):
             raise TypeError(
-                "Require type(initialized_list) should be list/tuple, but received {}".
-                format(type(initialized_list)))
+                "Require type(initialized_list) should be list/tuple, but received {}"
+                .format(type(initialized_list)))
         array = list(initialized_list)
 
     # NOTE: Only support plain list like [x, y,...], not support nested list in static mode.
     for val in array:
         if not isinstance(val, Variable):
             raise TypeError(
-                "All values in `initialized_list` should be Variable, but recevied {}.".
-                format(type(val)))
+                "All values in `initialized_list` should be Variable, but recevied {}."
+                .format(type(val)))
 
     if _non_static_mode():
         return array
diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py
index ca8abdaf4b3f3..e3bd7bae7d468 100644
--- a/python/paddle/tensor/attribute.py
+++ b/python/paddle/tensor/attribute.py
@@ -129,11 +129,10 @@ def shape(input):
     ], 'shape')
     helper = LayerHelper('shape', **locals())
     out = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type='shape',
-        inputs={'Input': input},
-        outputs={'Out': out},
-        stop_gradient=True)
+    helper.append_op(type='shape',
+                     inputs={'Input': input},
+                     outputs={'Out': out},
+                     stop_gradient=True)
 
     return out
 
@@ -168,8 +167,8 @@ def is_complex(x):
         raise TypeError("Expected Tensor, but received type of x: {}".format(
             type(x)))
     dtype = x.dtype
-    is_complex_dtype = (dtype == core.VarDesc.VarType.COMPLEX64 or
-                        dtype == core.VarDesc.VarType.COMPLEX128)
+    is_complex_dtype = (dtype == core.VarDesc.VarType.COMPLEX64
+                        or dtype == core.VarDesc.VarType.COMPLEX128)
     return is_complex_dtype
 
 
@@ -199,10 +198,10 @@ def is_floating_point(x):
         raise TypeError("Expected Tensor, but received type of x: {}".format(
             type(x)))
     dtype = x.dtype
-    is_fp_dtype = (dtype == core.VarDesc.VarType.FP32 or
-                   dtype == core.VarDesc.VarType.FP64 or
-                   dtype == core.VarDesc.VarType.FP16 or
-                   dtype == core.VarDesc.VarType.BF16)
+    is_fp_dtype = (dtype == core.VarDesc.VarType.FP32
+                   or dtype == core.VarDesc.VarType.FP64
+                   or dtype == core.VarDesc.VarType.FP16
+                   or dtype == core.VarDesc.VarType.BF16)
     return is_fp_dtype
 
 
@@ -236,11 +235,11 @@ def is_integer(x):
         raise TypeError("Expected Tensor, but received type of x: {}".format(
             type(x)))
     dtype = x.dtype
-    is_int_dtype = (dtype == core.VarDesc.VarType.UINT8 or
-                    dtype == core.VarDesc.VarType.INT8 or
-                    dtype == core.VarDesc.VarType.INT16 or
-                    dtype == core.VarDesc.VarType.INT32 or
-                    dtype == core.VarDesc.VarType.INT64)
+    is_int_dtype = (dtype == core.VarDesc.VarType.UINT8
+                    or dtype == core.VarDesc.VarType.INT8
+                    or dtype == core.VarDesc.VarType.INT16
+                    or dtype == core.VarDesc.VarType.INT32
+                    or dtype == core.VarDesc.VarType.INT64)
     return is_int_dtype
 
 
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index e37ca981f851c..67547212bb196 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -25,7 +25,7 @@
 from ..framework import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
 from ..framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder
-# TODO: define functions to get create a tensor  
+# TODO: define functions to get create a tensor
 import paddle
 from paddle import _C_ops
 from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check
@@ -123,10 +123,10 @@ def linspace(start, stop, num, dtype=None, name=None):
         check_dtype(num.dtype, 'num', ['int32'], 'linspace')
     check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'],
                 'linspace')
-    if ((stop_dtype == "float64" or start_dtype == "float64") and
-            out_dtype in ["float32", "int32"]) or ((stop_dtype == "int64" or
-                                                    start_dtype == "int64") and
-                                                   out_dtype == "int32"):
+    if ((stop_dtype == "float64" or start_dtype == "float64")
+            and out_dtype in ["float32", "int32"]) or (
+                (stop_dtype == "int64" or start_dtype == "int64")
+                and out_dtype == "int32"):
         raise ValueError(
             "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, "
             "which may cause data type overflows. Please reset attr(dtype) of linspace."
@@ -134,13 +134,14 @@ def linspace(start, stop, num, dtype=None, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(
-        type='linspace',
-        inputs={'Start': tensor_start,
-                'Stop': tensor_stop,
-                'Num': tensor_num},
-        attrs={'dtype': dtype},
-        outputs={'Out': [out]})
+    helper.append_op(type='linspace',
+                     inputs={
+                         'Start': tensor_start,
+                         'Stop': tensor_stop,
+                         'Num': tensor_num
+                     },
+                     attrs={'dtype': dtype},
+                     outputs={'Out': [out]})
     if isinstance(num, int):
         out.desc.set_shape((num, ))
     return out
@@ -255,16 +256,15 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(
-        type='logspace',
-        inputs={
-            'Start': tensor_start,
-            'Stop': tensor_stop,
-            'Num': tensor_num,
-            'Base': tensor_base
-        },
-        attrs={'dtype': dtype},
-        outputs={'Out': [out]})
+    helper.append_op(type='logspace',
+                     inputs={
+                         'Start': tensor_start,
+                         'Stop': tensor_stop,
+                         'Num': tensor_num,
+                         'Base': tensor_base
+                     },
+                     attrs={'dtype': dtype},
+                     outputs={'Out': [out]})
     if isinstance(num, int):
         out.desc.set_shape((num, ))
     return out
@@ -332,9 +332,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     place = _get_paddle_place(place)
     if place is None:
         place = _current_expected_place()
-    elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace,
-                                core.CUDAPlace, core.NPUPlace, core.XPUPlace,
-                                core.MLUPlace, core.CustomPlace)):
+    elif not isinstance(
+            place,
+        (core.Place, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace,
+         core.NPUPlace, core.XPUPlace, core.MLUPlace, core.CustomPlace)):
         raise ValueError(
             "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.MLUPlace, paddle.CustomPlace"
         )
@@ -381,8 +382,8 @@ def _handle_dtype(data, dtype):
             return data
         else:
             raise TypeError(
-                "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor".
-                format(type(data)))
+                "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor"
+                .format(type(data)))
         if not dtype:
             if data.dtype in [
                     'float16', 'float32', 'float64', 'complex64', 'complex128'
@@ -402,20 +403,18 @@ def _handle_dtype(data, dtype):
         data = data.astype(convert_dtype(dtype))
 
     if _in_eager_without_dygraph_check() and isinstance(data, np.ndarray):
-        return core.eager.Tensor(
-            value=data,
-            place=place,
-            persistable=False,
-            zero_copy=False,
-            name=None,
-            stop_gradient=stop_gradient)
+        return core.eager.Tensor(value=data,
+                                 place=place,
+                                 persistable=False,
+                                 zero_copy=False,
+                                 name=None,
+                                 stop_gradient=stop_gradient)
     else:
-        return paddle.Tensor(
-            value=data,
-            place=place,
-            persistable=False,
-            zero_copy=False,
-            stop_gradient=stop_gradient)
+        return paddle.Tensor(value=data,
+                             place=place,
+                             persistable=False,
+                             zero_copy=False,
+                             stop_gradient=stop_gradient)
 
 
 def full_like(x, fill_value, dtype=None, name=None):
@@ -469,12 +468,13 @@ def full_like(x, fill_value, dtype=None, name=None):
         'full_like/zeros_like/ones_like')
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(
-        type='fill_any_like',
-        inputs={'X': [x]},
-        attrs={'value': fill_value,
-               "dtype": dtype},
-        outputs={'Out': [out]})
+    helper.append_op(type='fill_any_like',
+                     inputs={'X': [x]},
+                     attrs={
+                         'value': fill_value,
+                         "dtype": dtype
+                     },
+                     outputs={'Out': [out]})
     out.stop_gradient = True
     return out
 
@@ -482,19 +482,20 @@ def full_like(x, fill_value, dtype=None, name=None):
 def ones(shape, dtype=None, name=None):
     """
 
-    The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 1.
+    Create a Tensor of specified :attr:`shape` and :attr:`dtype` and fill it with 1.
 
     Args:
-        shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape is int32 or int64.
-        dtype(np.dtype|str, optional): Data type of output Tensor, it supports
-            bool, float16, float32, float64, int32 and int64. Default: if None, the data type is 'float32'.
-        name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
+        shape (tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape should be int32 or int64.
+        dtype (np.dtype|str, optional): Data type of output Tensor, it should be one of
+            bool, float16, float32, float64, int32 and int64. If it is set to None, the data type will be float32.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
     
     Returns:
-        Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1.
+        Tensor: A Tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements are 1.
 
     Examples:
         .. code-block:: python
+          :name: ones-example
 
           import paddle 
           
@@ -680,16 +681,15 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
         if not isinstance(num_rows, int) or num_rows < 0:
             raise TypeError("num_rows should be a non-negative int")
         out = helper.create_variable_for_type_inference(dtype=dtype)
-        helper.append_op(
-            type='eye',
-            inputs={},
-            outputs={'Out': [out]},
-            attrs={
-                'num_rows': num_rows,
-                'num_columns': num_columns,
-                'dtype': dtype
-            },
-            stop_gradient=True)
+        helper.append_op(type='eye',
+                         inputs={},
+                         outputs={'Out': [out]},
+                         attrs={
+                             'num_rows': num_rows,
+                             'num_columns': num_columns,
+                             'dtype': dtype
+                         },
+                         stop_gradient=True)
 
     out.stop_gradient = True
     return out
@@ -847,12 +847,13 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
                 'range/arange')
     helper = LayerHelper('range', **locals())
     out = helper.create_variable_for_type_inference(dtype, shape=out_shape)
-    helper.append_op(
-        type='range',
-        inputs={'Start': start,
-                'End': end,
-                'Step': step},
-        outputs={'Out': out})
+    helper.append_op(type='range',
+                     inputs={
+                         'Start': start,
+                         'End': end,
+                         'Step': step
+                     },
+                     outputs={'Out': out})
     out.stop_gradient = True
     if out_shape is not None:
         out.desc.set_shape(out_shape)
@@ -878,8 +879,9 @@ def _tril_triu_op(helper):
     if name is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
+        out = helper.create_variable(name=name,
+                                     dtype=x.dtype,
+                                     persistable=False)
 
     helper.append_op(
         type="tril_triu",
@@ -888,7 +890,8 @@ def _tril_triu_op(helper):
             "diagonal": diagonal,
             "lower": True if op_type == 'tril' else False,
         },
-        outputs={"Out": out}, )
+        outputs={"Out": out},
+    )
 
     return out
 
@@ -1080,8 +1083,9 @@ def meshgrid(*args, **kwargs):
         helper.create_variable_for_type_inference(dtype=args[i].dtype)
         for i in range(num)
     ]
-    helper.append_op(
-        type='meshgrid', inputs={'X': list(args)}, outputs={'Out': out})
+    helper.append_op(type='meshgrid',
+                     inputs={'X': list(args)},
+                     outputs={'Out': out})
 
     return out
 
@@ -1185,28 +1189,33 @@ def diagflat(x, offset=0, name=None):
     out2 = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     if len(x.shape) == 1:
-        helper.append_op(
-            type='diag_v2',
-            inputs={'X': x},
-            outputs={'Out': out2},
-            attrs={'offset': offset,
-                   'padding_value': padding_value})
+        helper.append_op(type='diag_v2',
+                         inputs={'X': x},
+                         outputs={'Out': out2},
+                         attrs={
+                             'offset': offset,
+                             'padding_value': padding_value
+                         })
     else:
-        helper.append_op(
-            type='flatten_contiguous_range',
-            inputs={'X': x},
-            outputs={'Out': out1,
-                     'XShape': out1_shape},
-            attrs={'start_axis': 0,
-                   'stop_axis': -1})
+        helper.append_op(type='flatten_contiguous_range',
+                         inputs={'X': x},
+                         outputs={
+                             'Out': out1,
+                             'XShape': out1_shape
+                         },
+                         attrs={
+                             'start_axis': 0,
+                             'stop_axis': -1
+                         })
         out1.stop_gradient = True
 
-        helper.append_op(
-            type='diag_v2',
-            inputs={'X': out1},
-            outputs={'Out': out2},
-            attrs={'offset': offset,
-                   'padding_value': padding_value})
+        helper.append_op(type='diag_v2',
+                         inputs={'X': out1},
+                         outputs={'Out': out2},
+                         attrs={
+                             'offset': offset,
+                             'padding_value': padding_value
+                         })
     out2.stop_gradient = True
     return out2
 
@@ -1292,19 +1301,20 @@ def diag(x, offset=0, padding_value=0, name=None):
             check_type(padding_value, 'padding_value', (int, float), 'diag_v2')
             if len(x.shape) != 1 and len(x.shape) != 2:
                 raise ValueError(
-                    "The dimension of input x must be either 1 or 2, but received {}".
-                    format(len(x.shape)))
+                    "The dimension of input x must be either 1 or 2, but received {}"
+                    .format(len(x.shape)))
 
             helper = LayerHelper("diag_v2", **locals())
 
             out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-            helper.append_op(
-                type='diag_v2',
-                inputs={'X': x},
-                outputs={'Out': out},
-                attrs={'offset': offset,
-                       'padding_value': padding_value})
+            helper.append_op(type='diag_v2',
+                             inputs={'X': x},
+                             outputs={'Out': out},
+                             attrs={
+                                 'offset': offset,
+                                 'padding_value': padding_value
+                             })
 
             out.stop_gradient = True
             return out
@@ -1384,17 +1394,18 @@ def empty(shape, dtype=None, name=None):
         check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty')
 
     attrs = {}
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type='empty')
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='empty')
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
     attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(
-        type='empty',
-        inputs=inputs,
-        outputs={'Out': [out]},
-        attrs=attrs,
-        stop_gradient=True)
+    helper.append_op(type='empty',
+                     inputs=inputs,
+                     outputs={'Out': [out]},
+                     attrs=attrs,
+                     stop_gradient=True)
     out.stop_gradient = True
     return out
 
@@ -1451,15 +1462,16 @@ def empty_like(x, dtype=None, name=None):
     attrs = {}
     attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
     shape = paddle.shape(x)
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type='empty_like')
-
-    helper.append_op(
-        type='empty',
-        inputs=inputs,
-        outputs={'Out': [out]},
-        attrs=attrs,
-        stop_gradient=True)
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='empty_like')
+
+    helper.append_op(type='empty',
+                     inputs=inputs,
+                     outputs={'Out': [out]},
+                     attrs=attrs,
+                     stop_gradient=True)
     out.stop_gradient = True
     return out
 
@@ -1496,8 +1508,8 @@ def assign(x, output=None):
     """
     input = x
     helper = LayerHelper('assign', **locals())
-    check_type(input, 'input', (Variable, np.ndarray, list, tuple, float, int,
-                                bool), 'assign')
+    check_type(input, 'input',
+               (Variable, np.ndarray, list, tuple, float, int, bool), 'assign')
     is_inplace = True if output is not None else False
 
     if np.isscalar(input) and not isinstance(input, str):
@@ -1527,9 +1539,9 @@ def assign(x, output=None):
             if output is None:
                 output = helper.create_variable_for_type_inference(
                     dtype=input.dtype)
-            helper.append_op(
-                type='assign', inputs={'X': [input]},
-                outputs={'Out': [output]})
+            helper.append_op(type='assign',
+                             inputs={'X': [input]},
+                             outputs={'Out': [output]})
     elif isinstance(input, np.ndarray):
         # Not support [var, var, ...] currently.
         if len(input.shape) > 0 and any(isinstance(x, Variable) for x in input):
@@ -1569,18 +1581,16 @@ def assign(x, output=None):
             output = helper.create_variable_for_type_inference(
                 dtype=input.dtype)
         if _non_static_mode():
-            _C_ops.assign_value(output, 'shape',
-                                list(input.shape), 'dtype', dtype, value_name,
-                                values)
+            _C_ops.assign_value(output, 'shape', list(input.shape), 'dtype',
+                                dtype, value_name, values)
         else:
-            helper.append_op(
-                type='assign_value',
-                outputs={'Out': [output]},
-                attrs={
-                    'dtype': dtype,
-                    'shape': list(input.shape),
-                    value_name: values
-                })
+            helper.append_op(type='assign_value',
+                             outputs={'Out': [output]},
+                             attrs={
+                                 'dtype': dtype,
+                                 'shape': list(input.shape),
+                                 value_name: values
+                             })
 
     if is_inplace and _in_legacy_dygraph():
         output._bump_inplace_version()
@@ -1618,7 +1628,7 @@ def clone(x, name=None):
     return x.clone()
 
 
-#NOTE(zhiqiu): not public 
+#NOTE(zhiqiu): not public
 def _memcpy(input, place=None, output=None):
     """
 
@@ -1671,11 +1681,10 @@ def _memcpy(input, place=None, output=None):
             dst_place_type = 4
 
     attrs = {'dst_place_type': dst_place_type}
-    helper.append_op(
-        type='memcpy',
-        inputs={'X': [input]},
-        outputs={'Out': [output]},
-        attrs=attrs)
+    helper.append_op(type='memcpy',
+                     inputs={'X': [input]},
+                     outputs={'Out': [output]},
+                     attrs=attrs)
     return output
 
 
@@ -1798,12 +1807,13 @@ def tril_indices(row, col, offset=0, dtype='int64'):
 
         out = helper.create_variable_for_type_inference(dtype=dtype)
 
-        helper.append_op(
-            type='tril_indices',
-            inputs={},
-            outputs={'out': [out]},
-            attrs={'rows': row,
-                   'cols': col,
-                   'offset': offset,
-                   'dtype': dtype})
+        helper.append_op(type='tril_indices',
+                         inputs={},
+                         outputs={'out': [out]},
+                         attrs={
+                             'rows': row,
+                             'cols': col,
+                             'offset': offset,
+                             'dtype': dtype
+                         })
     return out
diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py
index 49cc426a00fd9..0cdced2cf9b84 100644
--- a/python/paddle/tensor/einsum.py
+++ b/python/paddle/tensor/einsum.py
@@ -159,10 +159,11 @@ def build_view(in_labels, out_labels):
         # fill the broadcast dimension indices from right to left.
         if s:
             for ax, dim in zip(
-                    range(start, end)[::-1], range(s.start(), s.end())[::-1]):
+                    range(start, end)[::-1],
+                    range(s.start(), s.end())[::-1]):
                 inv_map[ax] = dim
 
-    # Now work on non-broadcast dimensions 
+    # Now work on non-broadcast dimensions
     if r:
         it = itertools.chain(range(start), range(end, len(out_labels)))
     else:
@@ -384,7 +385,7 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
         step = matmul, [var1, var2], var2, False, True
         plan.add_step(step)
 
-    # In the rest cases we opt for ops other than matmul 
+    # In the rest cases we opt for ops other than matmul
     else:
         # unsqueeze operands include J1...J2... dimensions
         if j2:
@@ -410,14 +411,16 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
             plan.add_step(step)
             step = squeeze, [var2], var2, [-1, -2]
             plan.add_step(step)
-        elif j1 + j2 == 0 and not-1 in np.concatenate(
+        elif j1 + j2 == 0 and not -1 in np.concatenate(
             (op1_vshape[K], op2_vshape[K])):
             assert all(op1_vshape[K] == op2_vshape[K])
-            step = reshape, [var1], var1, list(op1_vshape[
-                I]) + [1] + [np.prod(op1_vshape[K])]
+            step = reshape, [
+                var1
+            ], var1, list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])]
             plan.add_step(step)
-            step = reshape, [var2], var2, list(op2_vshape[
-                I]) + [1] + [np.prod(op2_vshape[K])]
+            step = reshape, [
+                var2
+            ], var2, list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])]
             plan.add_step(step)
             step = matmul, [var1, var2], var2, False, True
             plan.add_step(step)
@@ -461,8 +464,8 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count,
 
     I, K, J1, J2 = list(range(n_bcast)), [], [], []
 
-    for ax, dim1, dim2 in zip(
-            range(n_bcast, ndim), op1_view[n_bcast:], op2_view[n_bcast:]):
+    for ax, dim1, dim2 in zip(range(n_bcast, ndim), op1_view[n_bcast:],
+                              op2_view[n_bcast:]):
 
         if (dim1 != -1) != (dim2 != -1):
             if dim1 != -1:
@@ -528,6 +531,7 @@ def f(*args):
 
 
 class Plan:
+
     def __init__(self):
         self.env = {}
         self.steps = []
@@ -613,8 +617,8 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast):
         # We'd like to arrange the dimensions in the following way:
         # [I...  J... K...]
         # [I...  J... K...]
-        # where  
-        #       I... are aligned and not to be combined immediately 
+        # where
+        #       I... are aligned and not to be combined immediately
         #       J... are not aligned and not to be combined immediately
         #       K... are aligned and should be immediately combined
         # At this point the non-trivial broadcast dimensinos in K are already reduced
@@ -693,8 +697,8 @@ def preprocess(equation, *operands):
     assert not ('...' in lhs and '...' not in rhs
                 ), f'Invalid equation: missing ellipsis in output labels.'
 
-    assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0
-                ), f'Duplicate labels are not supported.'
+    assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) >
+                0), f'Duplicate labels are not supported.'
 
     assert not has_duplicated_labels(
         rhs), f'Invalid equation: duplicate output labels are found.'
@@ -730,6 +734,7 @@ def fake_shape(label, op):
 
 
 def rhs_inference(lhs):
+
     def is_free(key):
         return cnt.get(key) == 1 and key not in ['.', ',']
 
@@ -804,7 +809,7 @@ def gen_einsum_op(equation, *operands):
         # dygraph
         return _C_ops.einsum(operands, len(operands), 'equation', equation)[0]
 
-    # static graph 
+    # static graph
     for inp in operands:
         check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum')
     check_type(equation, 'equation', str, 'einsum')
@@ -816,12 +821,13 @@ def gen_einsum_op(equation, *operands):
         helper.create_variable_for_type_inference(dtype=operands[0].dtype)
         for i in range(len(operands))
     ]
-    helper.append_op(
-        type='einsum',
-        inputs={'Operands': operands},
-        outputs={'Out': out,
-                 "InnerCache": caches},
-        attrs=attrs)
+    helper.append_op(type='einsum',
+                     inputs={'Operands': operands},
+                     outputs={
+                         'Out': out,
+                         "InnerCache": caches
+                     },
+                     attrs=attrs)
     return out
 
 
@@ -1008,12 +1014,12 @@ def einsum(equation, *operands):
     n_bcast_dims = max(map(lambda s: s.count('.'), nop_labels))
 
     # Build the data structures for planning. It's helpful to think of all the operands
-    # broadcasting together from a global view. In this view, dimensions from multiple 
+    # broadcasting together from a global view. In this view, dimensions from multiple
     # operands are mapped to the same position if they are labeled uniquely. Broadcasting
     # dimensions are mapped to adjacent positions with the right bound fixed. Subject to
-    # each operand, the map is injective but for all operands the map is on-to.  
+    # each operand, the map is injective but for all operands the map is on-to.
     # g_labels:
-    #   The labels of the global view 
+    #   The labels of the global view
     # g_view:
     #   Includes a list of maps from each operand's dimensions to the global view's dimensions
     #   which we refer to as ax or axes in the code to distinguish from operand's dims
@@ -1027,8 +1033,8 @@ def einsum(equation, *operands):
     # g_count
     #   Counting how many non-trivial dimensions remain for each ax
 
-    g_labels, g_view, g_nout, g_count = build_global_view(nop_labels, rhs,
-                                                          n_bcast_dims)
+    g_labels, g_view, g_nout, g_count = build_global_view(
+        nop_labels, rhs, n_bcast_dims)
     g_shape, g_supports = build_global_shape(g_view, g_labels,
                                              [op.shape for op in operands])
 
diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py
index 72e5eb640125d..c6e8df67dec35 100644
--- a/python/paddle/tensor/layer_function_generator.py
+++ b/python/paddle/tensor/layer_function_generator.py
@@ -185,8 +185,8 @@ def infer_and_check_dtype(op_proto, *args, **kwargs):
 
             for each in val:
                 if not isinstance(each, Variable):
-                    raise ValueError("input of {0} must be variable".format(
-                        op_type))
+                    raise ValueError(
+                        "input of {0} must be variable".format(op_type))
 
                 if dtype is None:
                     dtype = each.dtype
@@ -225,8 +225,8 @@ def func(*args, **kwargs):
         outputs = dict()
         out = kwargs.pop(_convert_(o_name), [])
         if out:
-            out_var = out[0] if (isinstance(out, list) or
-                                 isinstance(out, tuple)) else out
+            out_var = out[0] if (isinstance(out, list)
+                                 or isinstance(out, tuple)) else out
         else:
             out_var = helper.create_variable_for_type_inference(dtype=dtype)
         outputs[o_name] = [out_var]
@@ -234,8 +234,10 @@ def func(*args, **kwargs):
             outputs[name] = [
                 helper.create_variable_for_type_inference(dtype=dtype)
             ]
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=kwargs)
         return helper.append_activation(out_var)
 
     func.__name__ = op_type
@@ -307,8 +309,8 @@ def func(x, name=None):
             op = getattr(_C_ops, inplace_op_type)
             return op(x)
         warnings.warn(
-            "In static mode, {}() is the same as {}() and does not perform inplace operation.".
-            format(inplace_op_type, origin_op_type))
+            "In static mode, {}() is the same as {}() and does not perform inplace operation."
+            .format(inplace_op_type, origin_op_type))
         return generate_activation_fn(origin_op_type)(x, name)
 
     func.__name__ = inplace_op_type
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 9ba7ef532f273..0089ef21dc98a 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -115,12 +115,13 @@ def transpose(x, perm, name=None):
     helper = LayerHelper('transpose', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='transpose2',
-        inputs={'X': [x]},
-        outputs={'Out': [out],
-                 'XShape': [x_shape]},
-        attrs={'axis': perm})
+    helper.append_op(type='transpose2',
+                     inputs={'X': [x]},
+                     outputs={
+                         'Out': [out],
+                         'XShape': [x_shape]
+                     },
+                     attrs={'axis': perm})
     return out
 
 
@@ -243,12 +244,13 @@ def __check_input(x, y):
 
     helper = LayerHelper('matmul_v2', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='matmul_v2',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs=attrs)
+    helper.append_op(type='matmul_v2',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -339,8 +341,8 @@ def frobenius_norm(input, dim=None, keepdim=False, name=None):
 
         if in_dygraph_mode():
             if dim is None:
-                return _C_ops.final_state_frobenius_norm(input, [], keepdim,
-                                                         True)
+                return _C_ops.final_state_frobenius_norm(
+                    input, [], keepdim, True)
             return _C_ops.final_state_frobenius_norm(input, dim, keepdim, False)
         if _in_legacy_dygraph():
             if dim is None:
@@ -358,11 +360,10 @@ def frobenius_norm(input, dim=None, keepdim=False, name=None):
         out = helper.create_variable_for_type_inference(
             dtype=helper.input_dtype())
 
-        helper.append_op(
-            type='frobenius_norm',
-            inputs={'X': input},
-            outputs={'Out': out},
-            attrs=attrs)
+        helper.append_op(type='frobenius_norm',
+                         inputs={'X': input},
+                         outputs={'Out': out},
+                         attrs=attrs)
         return out
 
     def vector_norm(input,
@@ -407,11 +408,10 @@ def vector_norm(input,
         out = helper.create_variable_for_type_inference(
             dtype=helper.input_dtype())
 
-        helper.append_op(
-            type='p_norm',
-            inputs={'X': input},
-            outputs={'Out': out},
-            attrs=attrs)
+        helper.append_op(type='p_norm',
+                         inputs={'X': input},
+                         outputs={'Out': out},
+                         attrs=attrs)
         return out
 
     def inf_norm(input,
@@ -432,13 +432,14 @@ def inf_norm(input,
 
         reduce_type = 'reduce_max' if porder == np.float(
             'inf') else 'reduce_min'
-        helper.append_op(
-            type=reduce_type,
-            inputs={'X': out},
-            outputs={'Out': reduce_out},
-            attrs={'dim': axis,
-                   'keep_dim': keepdim,
-                   'reduce_all': reduce_all})
+        helper.append_op(type=reduce_type,
+                         inputs={'X': out},
+                         outputs={'Out': reduce_out},
+                         attrs={
+                             'dim': axis,
+                             'keep_dim': keepdim,
+                             'reduce_all': reduce_all
+                         })
 
         return reduce_out
 
@@ -452,33 +453,31 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
             dtype=block.input_dtype())
         abs_out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
-        block.append_op(
-            type='abs', inputs={'X': input}, outputs={'Out': abs_out})
+        block.append_op(type='abs',
+                        inputs={'X': input},
+                        outputs={'Out': abs_out})
         pow_out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
 
-        block.append_op(
-            type='pow',
-            inputs={'X': abs_out},
-            outputs={'Out': pow_out},
-            attrs={'factor': porder})
+        block.append_op(type='pow',
+                        inputs={'X': abs_out},
+                        outputs={'Out': pow_out},
+                        attrs={'factor': porder})
         sum_out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
-        block.append_op(
-            type='reduce_sum',
-            inputs={'X': pow_out},
-            outputs={'Out': sum_out},
-            attrs={
-                'dim': axis,
-                'keep_dim': keepdim,
-                'reduce_all': True if axis is None else False
-            })
+        block.append_op(type='reduce_sum',
+                        inputs={'X': pow_out},
+                        outputs={'Out': sum_out},
+                        attrs={
+                            'dim': axis,
+                            'keep_dim': keepdim,
+                            'reduce_all': True if axis is None else False
+                        })
         porder
-        block.append_op(
-            type='pow',
-            inputs={'X': sum_out},
-            outputs={'Out': out},
-            attrs={'factor': float(1. / porder)})
+        block.append_op(type='pow',
+                        inputs={'X': sum_out},
+                        outputs={'Out': out},
+                        attrs={'factor': float(1. / porder)})
         return out
 
     if axis is None and p is not None:
@@ -489,16 +488,16 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
                 raise ValueError(
                     "only valid string values are 'fro', found {}".format(p))
         elif isinstance(p, (int, float)):
-            return vector_norm(
-                x,
-                porder=p,
-                axis=axis,
-                keepdim=keepdim,
-                asvector=True,
-                name=name)
+            return vector_norm(x,
+                               porder=p,
+                               axis=axis,
+                               keepdim=keepdim,
+                               asvector=True,
+                               name=name)
         else:
-            raise ValueError("only valid p type is string or float, found {}".
-                             format(type(p)))
+            raise ValueError(
+                "only valid p type is string or float, found {}".format(
+                    type(p)))
 
     if isinstance(axis, tuple):
         axis = list(axis)
@@ -509,25 +508,23 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
     if isinstance(axis, int):
         if isinstance(p, str):
             if p == "fro":
-                return vector_norm(
-                    x,
-                    porder=2,
-                    axis=axis,
-                    keepdim=keepdim,
-                    asvector=False,
-                    name=name)
+                return vector_norm(x,
+                                   porder=2,
+                                   axis=axis,
+                                   keepdim=keepdim,
+                                   asvector=False,
+                                   name=name)
 
             else:
                 raise ValueError(
                     "only valid string values are 'fro', found {}".format(p))
         elif isinstance(p, (int, float)):
-            return vector_norm(
-                x,
-                axis=axis,
-                porder=p,
-                keepdim=keepdim,
-                asvector=False,
-                name=name)
+            return vector_norm(x,
+                               axis=axis,
+                               porder=p,
+                               keepdim=keepdim,
+                               asvector=False,
+                               name=name)
         else:
             raise ValueError(
                 "unspport p for p-order vector norm. except float, found {}".
@@ -540,11 +537,14 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
             return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name)
         elif p == 0:
             raise ValueError(
-                "just suport axis type int or list (length of list <=1) if p = 0, found {}".
-                format(axis))
+                "just suport axis type int or list (length of list <=1) if p = 0, found {}"
+                .format(axis))
         else:
-            return p_matrix_norm(
-                x, porder=p, axis=axis, keepdim=keepdim, name=name)
+            return p_matrix_norm(x,
+                                 porder=p,
+                                 axis=axis,
+                                 keepdim=keepdim,
+                                 name=name)
     else:
         raise ValueError(
             "except axis type int or list (length of list <=2), found {}".
@@ -646,8 +646,10 @@ def dist(x, y, p=2, name=None):
     inputs = {"X": [x], "Y": [y]}
     outputs = {'Out': [out]}
     attrs = {"p": float(p)}
-    helper.append_op(
-        type='dist', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='dist',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -769,35 +771,35 @@ def mat_norm(input, porder=1., axis=None):
             dtype=block.input_dtype())
         out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
-        block.append_op(
-            type='abs', inputs={'X': input}, outputs={'Out': abs_out})
-        block.append_op(
-            type='reduce_sum',
-            inputs={'X': abs_out},
-            outputs={'Out': sum_out},
-            attrs={'dim': axis,
-                   'keep_dim': keepdim,
-                   'reduce_all': reduce_all})
+        block.append_op(type='abs',
+                        inputs={'X': input},
+                        outputs={'Out': abs_out})
+        block.append_op(type='reduce_sum',
+                        inputs={'X': abs_out},
+                        outputs={'Out': sum_out},
+                        attrs={
+                            'dim': axis,
+                            'keep_dim': keepdim,
+                            'reduce_all': reduce_all
+                        })
         if porder == 1 or porder == np.inf:
-            block.append_op(
-                type='reduce_max',
-                inputs={'X': sum_out},
-                outputs={'Out': out},
-                attrs={
-                    'dim': [-1],
-                    'keep_dim': keepdim,
-                    'reduce_all': reduce_all
-                })
+            block.append_op(type='reduce_max',
+                            inputs={'X': sum_out},
+                            outputs={'Out': out},
+                            attrs={
+                                'dim': [-1],
+                                'keep_dim': keepdim,
+                                'reduce_all': reduce_all
+                            })
         if porder == -1 or porder == -np.inf:
-            block.append_op(
-                type='reduce_min',
-                inputs={'X': sum_out},
-                outputs={'Out': out},
-                attrs={
-                    'dim': [-1],
-                    'keep_dim': keepdim,
-                    'reduce_all': reduce_all
-                })
+            block.append_op(type='reduce_min',
+                            inputs={'X': sum_out},
+                            outputs={'Out': out},
+                            attrs={
+                                'dim': [-1],
+                                'keep_dim': keepdim,
+                                'reduce_all': reduce_all
+                            })
         return out
 
     def fro_norm(input, porder=2, axis=[-1]):
@@ -830,30 +832,30 @@ def fro_norm(input, porder=2, axis=[-1]):
             dtype=block.input_dtype())
         out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
-        block.append_op(
-            type='pow',
-            inputs={'X': input},
-            outputs={'Out': pow_out},
-            attrs={'factor': porder})
-        block.append_op(
-            type='reduce_sum',
-            inputs={'X': pow_out},
-            outputs={'Out': sum_out_1},
-            attrs={'dim': axis,
-                   'keep_dim': keepdim,
-                   'reduce_all': reduce_all})
-        block.append_op(
-            type='reduce_sum',
-            inputs={'X': sum_out_1},
-            outputs={'Out': sum_out_2},
-            attrs={'dim': axis,
-                   'keep_dim': keepdim,
-                   'reduce_all': reduce_all})
-        block.append_op(
-            type='pow',
-            inputs={'X': sum_out_2},
-            outputs={'Out': out},
-            attrs={'factor': float(1. / porder)})
+        block.append_op(type='pow',
+                        inputs={'X': input},
+                        outputs={'Out': pow_out},
+                        attrs={'factor': porder})
+        block.append_op(type='reduce_sum',
+                        inputs={'X': pow_out},
+                        outputs={'Out': sum_out_1},
+                        attrs={
+                            'dim': axis,
+                            'keep_dim': keepdim,
+                            'reduce_all': reduce_all
+                        })
+        block.append_op(type='reduce_sum',
+                        inputs={'X': sum_out_1},
+                        outputs={'Out': sum_out_2},
+                        attrs={
+                            'dim': axis,
+                            'keep_dim': keepdim,
+                            'reduce_all': reduce_all
+                        })
+        block.append_op(type='pow',
+                        inputs={'X': sum_out_2},
+                        outputs={'Out': out},
+                        attrs={'factor': float(1. / porder)})
         return out
 
     def svd_norm(input, porder, axis=[-1]):
@@ -889,51 +891,58 @@ def svd_norm(input, porder, axis=[-1]):
         out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
         if porder == "nuc":
-            block.append_op(
-                type='reduce_sum',
-                inputs={'X': s},
-                outputs={'Out': out},
-                attrs={
-                    'dim': axis,
-                    'keep_dim': keepdim,
-                    'reduce_all': reduce_all
-                })
+            block.append_op(type='reduce_sum',
+                            inputs={'X': s},
+                            outputs={'Out': out},
+                            attrs={
+                                'dim': axis,
+                                'keep_dim': keepdim,
+                                'reduce_all': reduce_all
+                            })
             return out
         max_out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
         min_out = block.create_variable_for_type_inference(
             dtype=block.input_dtype())
-        block.append_op(
-            type='reduce_max',
-            inputs={'X': s},
-            outputs={'Out': max_out},
-            attrs={'dim': axis,
-                   'keep_dim': keepdim,
-                   'reduce_all': reduce_all})
-        block.append_op(
-            type='reduce_min',
-            inputs={'X': s},
-            outputs={'Out': min_out},
-            attrs={'dim': axis,
-                   'keep_dim': keepdim,
-                   'reduce_all': reduce_all})
+        block.append_op(type='reduce_max',
+                        inputs={'X': s},
+                        outputs={'Out': max_out},
+                        attrs={
+                            'dim': axis,
+                            'keep_dim': keepdim,
+                            'reduce_all': reduce_all
+                        })
+        block.append_op(type='reduce_min',
+                        inputs={'X': s},
+                        outputs={'Out': min_out},
+                        attrs={
+                            'dim': axis,
+                            'keep_dim': keepdim,
+                            'reduce_all': reduce_all
+                        })
         if porder == 2:
-            block.append_op(
-                type='elementwise_div',
-                inputs={'X': max_out,
-                        'Y': min_out},
-                outputs={'Out': out},
-                attrs={'aixs': axis,
-                       'use_mkldnn': False})
+            block.append_op(type='elementwise_div',
+                            inputs={
+                                'X': max_out,
+                                'Y': min_out
+                            },
+                            outputs={'Out': out},
+                            attrs={
+                                'aixs': axis,
+                                'use_mkldnn': False
+                            })
             return out
         if porder == -2:
-            block.append_op(
-                type='elementwise_div',
-                inputs={'X': min_out,
-                        'Y': max_out},
-                outputs={'Out': out},
-                attrs={'aixs': axis,
-                       'use_mkldnn': False})
+            block.append_op(type='elementwise_div',
+                            inputs={
+                                'X': min_out,
+                                'Y': max_out
+                            },
+                            outputs={'Out': out},
+                            attrs={
+                                'aixs': axis,
+                                'use_mkldnn': False
+                            })
             return out
 
     def empty_tensor(input, shape):
@@ -943,9 +952,9 @@ def empty_tensor(input, shape):
 
     x_shape = list(x.shape)
     if not len(x_shape) >= 2:
-        raise ValueError("input should be a matrix or batches of matrices, " +
-                         "but the dimention of received input is {}".format(
-                             len(x_shape)))
+        raise ValueError(
+            "input should be a matrix or batches of matrices, " +
+            "but the dimention of received input is {}".format(len(x_shape)))
     if p == None:
         p = 2
     x_size = 0 if (0 in x_shape) else 1
@@ -959,13 +968,11 @@ def empty_tensor(input, shape):
             if p == "nuc":
                 return svd_norm(x, p) * svd_norm(x_inv, p)
             if p in (1, -1):
-                return mat_norm(
-                    x, porder=p, axis=[-2]) * mat_norm(
-                        x_inv, porder=p, axis=[-2])
+                return mat_norm(x, porder=p, axis=[-2]) * mat_norm(
+                    x_inv, porder=p, axis=[-2])
             if p in (np.inf, -np.inf):
-                return mat_norm(
-                    x, porder=p, axis=[-1]) * mat_norm(
-                        x_inv, porder=p, axis=[-1])
+                return mat_norm(x, porder=p, axis=[-1]) * mat_norm(
+                    x_inv, porder=p, axis=[-1])
         else:
             raise ValueError("only support p is {} when input is a ".format(p) +
                              "square matrix or batches of square matrices")
@@ -975,8 +982,8 @@ def empty_tensor(input, shape):
         return svd_norm(x, porder=p)
     else:
         raise ValueError(
-            "unsupported {} for p, only supporting ('fro', 'nuc', ".format(
-                p) + "1, -1, 2, -2, inf, -inf) or none")
+            "unsupported {} for p, only supporting ('fro', 'nuc', ".format(p) +
+            "1, -1, 2, -2, inf, -inf) or none")
 
 
 def dot(x, y, name=None):
@@ -1028,11 +1035,16 @@ def dot(x, y, name=None):
     if name is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
-        out = helper.create_variable(
-            name=name, dtype=x.dtype, persistable=False)
-    helper.append_op(
-        type="dot", inputs={'X': x,
-                            'Y': y}, attrs={}, outputs={"Out": out})
+        out = helper.create_variable(name=name,
+                                     dtype=x.dtype,
+                                     persistable=False)
+    helper.append_op(type="dot",
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     attrs={},
+                     outputs={"Out": out})
     return out
 
 
@@ -1210,8 +1222,8 @@ def t(input, name=None):
         return out
 
     check_variable_and_dtype(
-        input, 'input', ['float16', 'float32', 'float64', 'int32',
-                         'int64'], 'transpose')
+        input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'transpose')
 
     helper = LayerHelper('t', **locals())
     out = helper.create_variable_for_type_inference(input.dtype)
@@ -1219,12 +1231,13 @@ def t(input, name=None):
     if len(input.shape) == 1:
         out = input
     else:
-        helper.append_op(
-            type='transpose2',
-            inputs={'X': [input]},
-            outputs={'Out': [out],
-                     'XShape': [input_shape]},
-            attrs={'axis': [1, 0]})
+        helper.append_op(type='transpose2',
+                         inputs={'X': [input]},
+                         outputs={
+                             'Out': [out],
+                             'XShape': [input_shape]
+                         },
+                         attrs={'axis': [1, 0]})
     return out
 
 
@@ -1281,12 +1294,13 @@ def cross(x, y, axis=9, name=None):
             attrs = dict()
             attrs['dim'] = axis
 
-            helper.append_op(
-                type='cross',
-                inputs={'X': x,
-                        'Y': y},
-                outputs={'Out': out},
-                attrs=attrs)
+            helper.append_op(type='cross',
+                             inputs={
+                                 'X': x,
+                                 'Y': y
+                             },
+                             outputs={'Out': out},
+                             attrs=attrs)
             return out
 
 
@@ -1339,11 +1353,10 @@ def cholesky(x, upper=False, name=None):
     check_type(upper, 'upper', bool, 'cholesky')
     helper = LayerHelper('cholesky', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='cholesky',
-        inputs={'X': [x]},
-        outputs={'Out': out},
-        attrs={'upper': upper})
+    helper.append_op(type='cholesky',
+                     inputs={'X': [x]},
+                     outputs={'Out': out},
+                     attrs={'upper': upper})
     return out
 
 
@@ -1393,8 +1406,9 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
             else:
                 tol_tensor = tol
             use_default_tol = False
-            return _C_ops.final_state_matrix_rank_tol(
-                x, tol_tensor, use_default_tol, hermitian)
+            return _C_ops.final_state_matrix_rank_tol(x, tol_tensor,
+                                                      use_default_tol,
+                                                      hermitian)
 
         if tol is None:
             tol_attr = 0.0
@@ -1445,8 +1459,10 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
 
     helper = LayerHelper('matrix_rank', **locals())
     out = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
-        type='matrix_rank', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='matrix_rank',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -1493,16 +1509,16 @@ def bmm(x, y, name=None):
     y_shape = y.shape
     if not len(x_shape) == len(y_shape) == 3:
         raise ValueError(
-            "x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}".
-            format(x_shape, y_shape))
+            "x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}"
+            .format(x_shape, y_shape))
     if x_shape[2] != y_shape[1]:
         raise ValueError(
-            "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}".
-            format(x_shape, y_shape))
+            "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}"
+            .format(x_shape, y_shape))
     if x_shape[0] != y_shape[0]:
         raise ValueError(
-            "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}".
-            format(x_shape, y_shape))
+            "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}"
+            .format(x_shape, y_shape))
 
     if paddle.in_dynamic_mode():
         return _C_ops.bmm(x, y)
@@ -1545,16 +1561,18 @@ def histogram(input, bins=100, min=0, max=0, name=None):
         return _C_ops.histogram(input, "bins", bins, "min", min, "max", max)
 
     helper = LayerHelper('histogram', **locals())
-    check_variable_and_dtype(
-        input, 'X', ['int32', 'int64', 'float32', 'float64'], 'histogram')
+    check_variable_and_dtype(input, 'X',
+                             ['int32', 'int64', 'float32', 'float64'],
+                             'histogram')
     out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
-    helper.append_op(
-        type='histogram',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={'bins': bins,
-               'min': min,
-               'max': max})
+    helper.append_op(type='histogram',
+                     inputs={'X': input},
+                     outputs={'Out': out},
+                     attrs={
+                         'bins': bins,
+                         'min': min,
+                         'max': max
+                     })
     return out
 
 
@@ -1602,12 +1620,13 @@ def bincount(x, weights=None, minlength=0, name=None):
         out = helper.create_variable_for_type_inference(dtype=weights.dtype)
     else:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='bincount',
-        inputs={'X': x,
-                'Weights': weights},
-        outputs={'Out': out},
-        attrs={'minlength': minlength})
+    helper.append_op(type='bincount',
+                     inputs={
+                         'X': x,
+                         'Weights': weights
+                     },
+                     outputs={'Out': out},
+                     attrs={'minlength': minlength})
     return out
 
 
@@ -1658,20 +1677,23 @@ def __check_input(x, vec):
                 vec_shape = list(vec.shape)
                 if len(x_shape) != 2:
                     raise ValueError(
-                        "x should be 2-dimensional. But received x's dimention: {}".
-                        format(x_shape))
+                        "x should be 2-dimensional. But received x's dimention: {}"
+                        .format(x_shape))
                 if len(vec_shape) != 1:
                     raise ValueError(
-                        "vec should be 1-dimensional. But received vec's dimention: {}".
-                        format(vec_shape))
+                        "vec should be 1-dimensional. But received vec's dimention: {}"
+                        .format(vec_shape))
 
             __check_input(x, vec)
 
             helper = LayerHelper('mv', **locals())
             out = helper.create_variable_for_type_inference(dtype=x.dtype)
-            helper.append_op(
-                type='mv', inputs={'X': x,
-                                   'Vec': vec}, outputs={'Out': out})
+            helper.append_op(type='mv',
+                             inputs={
+                                 'X': x,
+                                 'Vec': vec
+                             },
+                             outputs={'Out': out})
             return out
 
 
@@ -1721,8 +1743,9 @@ def det(x, name=None):
     helper = LayerHelper('determinant', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='determinant', inputs={'Input': [x]}, outputs={'Out': [out]})
+    helper.append_op(type='determinant',
+                     inputs={'Input': [x]},
+                     outputs={'Out': [out]})
     return out
 
 
@@ -1776,8 +1799,9 @@ def slogdet(x, name=None):
     helper = LayerHelper('slogdeterminant', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='slogdeterminant', inputs={'Input': [x]}, outputs={'Out': [out]})
+    helper.append_op(type='slogdeterminant',
+                     inputs={'Input': [x]},
+                     outputs={'Out': [out]})
     return out
 
 
@@ -1843,10 +1867,13 @@ def svd(x, full_matrices=False, name=None):
     helper.append_op(
         type='svd',
         inputs={'X': [x]},
-        outputs={'U': u,
-                 'VH': vh,
-                 'S': s},
-        attrs=attrs, )
+        outputs={
+            'U': u,
+            'VH': vh,
+            'S': s
+        },
+        attrs=attrs,
+    )
     return u, s, vh
 
 
@@ -1915,11 +1942,10 @@ def matrix_power(x, n, name=None):
     check_type(n, 'n', int, 'matrix_power')
     helper = LayerHelper('matrix_power', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='matrix_power',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'n': n})
+    helper.append_op(type='matrix_power',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={'n': n})
     return out
 
 
@@ -1978,9 +2004,13 @@ def qr(x, mode="reduced", name=None):
     r = helper.create_variable_for_type_inference(dtype=x.dtype)
     attrs = dict()
     attrs['mode'] = mode
-    helper.append_op(
-        type='qr', inputs={'X': [x]}, outputs={'Q': q,
-                                               'R': r}, attrs=attrs)
+    helper.append_op(type='qr',
+                     inputs={'X': [x]},
+                     outputs={
+                         'Q': q,
+                         'R': r
+                     },
+                     attrs=attrs)
     if mode == "r":
         return r
     else:
@@ -2077,13 +2107,14 @@ def lu(x, pivot=True, get_infos=False, name=None):
     info = helper.create_variable_for_type_inference(dtype='int')
     attrs = dict()
     attrs['pivots'] = pivot
-    helper.append_op(
-        type='lu',
-        inputs={'X': x},
-        outputs={'Out': lu,
-                 'Pivots': p,
-                 'Infos': info},
-        attrs=attrs)
+    helper.append_op(type='lu',
+                     inputs={'X': x},
+                     outputs={
+                         'Out': lu,
+                         'Pivots': p,
+                         'Infos': info
+                     },
+                     attrs=attrs)
     if get_infos:
         return lu, p, info
     else:
@@ -2175,14 +2206,17 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
     attrs = dict()
     attrs['unpack_ludata'] = unpack_ludata
     attrs['unpack_pivots'] = unpack_pivots
-    helper.append_op(
-        type='lu_unpack',
-        inputs={'X': x,
-                'Pivots': y},
-        outputs={'Pmat': p,
-                 'L': l,
-                 'U': u},
-        attrs=attrs)
+    helper.append_op(type='lu_unpack',
+                     inputs={
+                         'X': x,
+                         'Pivots': y
+                     },
+                     outputs={
+                         'Pmat': p,
+                         'L': l,
+                         'U': u
+                     },
+                     attrs=attrs)
     return p, l, u
 
 
@@ -2238,8 +2272,9 @@ def eig(x, name=None):
         w, v = _C_ops.eig(x)
         return w, v
 
-    check_variable_and_dtype(
-        x, 'X', ['float32', 'float64', 'complex64', 'complex128'], 'eig')
+    check_variable_and_dtype(x, 'X',
+                             ['float32', 'float64', 'complex64', 'complex128'],
+                             'eig')
     helper = LayerHelper('eig', **locals())
 
     w = helper.create_variable_for_type_inference(x.dtype)
@@ -2289,19 +2324,19 @@ def eigvals(x, name=None):
     """
 
     check_variable_and_dtype(x, 'dtype',
-                             ['float32', 'float64', 'complex64',
-                              'complex128'], 'eigvals')
+                             ['float32', 'float64', 'complex64', 'complex128'],
+                             'eigvals')
 
     x_shape = list(x.shape)
     if len(x_shape) < 2:
         raise ValueError(
-            "The dimension of Input(x) should be at least 2, but received x's dimention = {}, x's shape = {}".
-            format(len(x_shape), x_shape))
+            "The dimension of Input(x) should be at least 2, but received x's dimention = {}, x's shape = {}"
+            .format(len(x_shape), x_shape))
 
     if x_shape[-1] != x_shape[-2]:
         raise ValueError(
-            "The last two dimensions of Input(x) should be equal, but received x's shape = {}".
-            format(x_shape))
+            "The last two dimensions of Input(x) should be equal, but received x's shape = {}"
+            .format(x_shape))
 
     if paddle.in_dynamic_mode():
         return _C_ops.eigvals(x)
@@ -2442,8 +2477,8 @@ def __check_input(x, UPLO):
                 "length of Input(input) is %s." % len(x.shape))
         if x_shape[-1] != x_shape[-2]:
             raise ValueError(
-                "The input matrix must be batches of square matrices. But received x's dimention: {}".
-                format(x_shape))
+                "The input matrix must be batches of square matrices. But received x's dimention: {}"
+                .format(x_shape))
         if UPLO != 'L' and UPLO != 'U':
             raise ValueError(
                 "UPLO must be L or U. But received UPLO is: {}".format(UPLO))
@@ -2451,18 +2486,20 @@ def __check_input(x, UPLO):
     __check_input(x, UPLO)
 
     helper = LayerHelper('eigh', **locals())
-    check_variable_and_dtype(
-        x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'eigh')
+    check_variable_and_dtype(x, 'dtype',
+                             ['float32', 'float64', 'complex64', 'complex128'],
+                             'eigh')
 
     out_value = helper.create_variable_for_type_inference(dtype=x.dtype)
     out_vector = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='eigh',
-        inputs={'X': x},
-        outputs={'Eigenvalues': out_value,
-                 'Eigenvectors': out_vector},
-        attrs={'UPLO': UPLO})
+    helper.append_op(type='eigh',
+                     inputs={'X': x},
+                     outputs={
+                         'Eigenvalues': out_value,
+                         'Eigenvectors': out_vector
+                     },
+                     attrs={'UPLO': UPLO})
     return out_value, out_vector
 
 
@@ -2599,19 +2636,23 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
             helper.append_op(
                 type='svd',
                 inputs={'X': [x]},
-                outputs={'U': u,
-                         'VH': vt,
-                         'S': s},
-                attrs={'full_matrices': False}, )
+                outputs={
+                    'U': u,
+                    'VH': vt,
+                    'S': s
+                },
+                attrs={'full_matrices': False},
+            )
 
             max_singular_val = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='reduce_max',
-                inputs={'X': s},
-                outputs={'Out': max_singular_val},
-                attrs={'dim': [-1],
-                       'keep_dim': True,
-                       'reduce_all': False})
+            helper.append_op(type='reduce_max',
+                             inputs={'X': s},
+                             outputs={'Out': max_singular_val},
+                             attrs={
+                                 'dim': [-1],
+                                 'keep_dim': True,
+                                 'reduce_all': False
+                             })
 
             rcond = full(shape=[1], fill_value=rcond, dtype=dtype)
             cutoff = rcond * max_singular_val
@@ -2627,49 +2668,59 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
 
             st = helper.create_variable_for_type_inference(dtype=dtype)
             st_shape = helper.create_variable_for_type_inference(dtype=dtype)
-            helper.append_op(
-                type='unsqueeze2',
-                inputs={'X': singular},
-                attrs={'axes': [-2]},
-                outputs={'Out': st,
-                         'XShape': st_shape})
+            helper.append_op(type='unsqueeze2',
+                             inputs={'X': singular},
+                             attrs={'axes': [-2]},
+                             outputs={
+                                 'Out': st,
+                                 'XShape': st_shape
+                             })
 
             dims = list(range(len(vt.shape)))
             perm = dims[:-2] + [dims[-1]] + [dims[-2]]
             v = helper.create_variable_for_type_inference(dtype)
             v_shape = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='transpose2',
-                inputs={'X': [vt]},
-                outputs={'Out': [v],
-                         'XShape': [v_shape]},
-                attrs={'axis': perm})
+            helper.append_op(type='transpose2',
+                             inputs={'X': [vt]},
+                             outputs={
+                                 'Out': [v],
+                                 'XShape': [v_shape]
+                             },
+                             attrs={'axis': perm})
 
             out_1 = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='elementwise_mul',
-                inputs={'X': v,
-                        'Y': st},
-                outputs={'Out': out_1},
-                attrs={'axis': -1,
-                       'use_mkldnn': False})
+            helper.append_op(type='elementwise_mul',
+                             inputs={
+                                 'X': v,
+                                 'Y': st
+                             },
+                             outputs={'Out': out_1},
+                             attrs={
+                                 'axis': -1,
+                                 'use_mkldnn': False
+                             })
             out_1 = helper.append_activation(out_1)
 
             out_2 = helper.create_variable_for_type_inference(dtype)
             helper.append_op(
                 type='matmul_v2',
-                inputs={'X': out_1,
-                        'Y': u},
+                inputs={
+                    'X': out_1,
+                    'Y': u
+                },
                 outputs={'Out': out_2},
-                attrs={'trans_x': False,
-                       'trans_y': True}, )
+                attrs={
+                    'trans_x': False,
+                    'trans_y': True
+                },
+            )
             return out_2
         else:
             helper = LayerHelper('pinv', **locals())
             dtype = x.dtype
             check_variable_and_dtype(
-                x, 'dtype', ['float32', 'float64', 'complex64',
-                             'complex128'], 'pinv')
+                x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'],
+                'pinv')
 
             if dtype == paddle.complex128:
                 s_type = 'float64'
@@ -2680,23 +2731,26 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
 
             u = helper.create_variable_for_type_inference(dtype)
             s = helper.create_variable_for_type_inference(s_type)
-            helper.append_op(
-                type='eigh',
-                inputs={'X': x},
-                outputs={'Eigenvalues': s,
-                         'Eigenvectors': u},
-                attrs={'UPLO': 'L'})
+            helper.append_op(type='eigh',
+                             inputs={'X': x},
+                             outputs={
+                                 'Eigenvalues': s,
+                                 'Eigenvectors': u
+                             },
+                             attrs={'UPLO': 'L'})
             s_abs = helper.create_variable_for_type_inference(s_type)
-            helper.append_op(
-                type='abs', inputs={'X': s}, outputs={'Out': s_abs})
+            helper.append_op(type='abs',
+                             inputs={'X': s},
+                             outputs={'Out': s_abs})
             max_singular_val = helper.create_variable_for_type_inference(s_type)
-            helper.append_op(
-                type='reduce_max',
-                inputs={'X': s_abs},
-                outputs={'Out': max_singular_val},
-                attrs={'dim': [-1],
-                       'keep_dim': True,
-                       'reduce_all': False})
+            helper.append_op(type='reduce_max',
+                             inputs={'X': s_abs},
+                             outputs={'Out': max_singular_val},
+                             attrs={
+                                 'dim': [-1],
+                                 'keep_dim': True,
+                                 'reduce_all': False
+                             })
 
             rcond = full(shape=[1], fill_value=rcond, dtype=s_type)
             cutoff = rcond * max_singular_val
@@ -2712,35 +2766,45 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
 
             st = helper.create_variable_for_type_inference(dtype=s_type)
             st_shape = helper.create_variable_for_type_inference(dtype=s_type)
-            helper.append_op(
-                type='unsqueeze2',
-                inputs={'X': singular},
-                attrs={'axes': [-2]},
-                outputs={'Out': st,
-                         'XShape': st_shape})
+            helper.append_op(type='unsqueeze2',
+                             inputs={'X': singular},
+                             attrs={'axes': [-2]},
+                             outputs={
+                                 'Out': st,
+                                 'XShape': st_shape
+                             })
 
             out_1 = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='elementwise_mul',
-                inputs={'X': u,
-                        'Y': st},
-                outputs={'Out': out_1},
-                attrs={'axis': -1,
-                       'use_mkldnn': False})
+            helper.append_op(type='elementwise_mul',
+                             inputs={
+                                 'X': u,
+                                 'Y': st
+                             },
+                             outputs={'Out': out_1},
+                             attrs={
+                                 'axis': -1,
+                                 'use_mkldnn': False
+                             })
             out_1 = helper.append_activation(out_1)
 
             u_conj = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='conj', inputs={'X': u}, outputs={'Out': [u_conj]})
+            helper.append_op(type='conj',
+                             inputs={'X': u},
+                             outputs={'Out': [u_conj]})
 
             out_2 = helper.create_variable_for_type_inference(dtype)
             helper.append_op(
                 type='matmul_v2',
-                inputs={'X': out_1,
-                        'Y': u_conj},
+                inputs={
+                    'X': out_1,
+                    'Y': u_conj
+                },
                 outputs={'Out': out_2},
-                attrs={'trans_x': False,
-                       'trans_y': True}, )
+                attrs={
+                    'trans_x': False,
+                    'trans_y': True
+                },
+            )
             return out_2
 
 
@@ -2795,9 +2859,12 @@ def solve(x, y, name=None):
     check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'solve')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type="solve", inputs={"X": x,
-                              "Y": y}, outputs={"Out": out})
+    helper.append_op(type="solve",
+                     inputs={
+                         "X": x,
+                         "Y": y
+                     },
+                     outputs={"Out": out})
     return out
 
 
@@ -2865,16 +2932,17 @@ def triangular_solve(x,
     check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'triangular_solve')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='triangular_solve',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs={
-            'upper': upper,
-            'transpose': transpose,
-            'unitriangular': unitriangular
-        })
+    helper.append_op(type='triangular_solve',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         'upper': upper,
+                         'transpose': transpose,
+                         'unitriangular': unitriangular
+                     })
     return out
 
 
@@ -2922,12 +2990,13 @@ def cholesky_solve(x, y, upper=False, name=None):
     check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'cholesky_solve')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='cholesky_solve',
-        inputs={'X': x,
-                'Y': y},
-        outputs={'Out': out},
-        attrs={'upper': upper})
+    helper.append_op(type='cholesky_solve',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={'Out': out},
+                     attrs={'upper': upper})
     return out
 
 
@@ -2971,8 +3040,8 @@ def __check_input(x, UPLO):
                 "length of Input(input) is %s." % len(x.shape))
         if x_shape[-1] != x_shape[-2]:
             raise ValueError(
-                "The input matrix must be batches of square matrices. But received x's dimention: {}".
-                format(x_shape))
+                "The input matrix must be batches of square matrices. But received x's dimention: {}"
+                .format(x_shape))
         if UPLO != 'L' and UPLO != 'U':
             raise ValueError(
                 "UPLO must be L or U. But received UPLO is: {}".format(UPLO))
@@ -2988,13 +3057,16 @@ def __check_input(x, UPLO):
     out_vector = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     is_test = x.stop_gradient
-    helper.append_op(
-        type='eigvalsh',
-        inputs={'X': x},
-        outputs={'Eigenvalues': out_value,
-                 'Eigenvectors': out_vector},
-        attrs={'UPLO': UPLO,
-               'is_test': is_test})
+    helper.append_op(type='eigvalsh',
+                     inputs={'X': x},
+                     outputs={
+                         'Eigenvalues': out_value,
+                         'Eigenvectors': out_vector
+                     },
+                     attrs={
+                         'UPLO': UPLO,
+                         'is_test': is_test
+                     })
     return out_value
 
 
@@ -3061,14 +3133,14 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
     if device == "cpu":
         if driver not in (None, "gels", "gelss", "gelsd", "gelsy"):
             raise ValueError(
-                "Only support valid driver is 'gels', 'gelss', 'gelsd', 'gelsy' or None for CPU inputs. But got {}".
-                format(driver))
+                "Only support valid driver is 'gels', 'gelss', 'gelsd', 'gelsy' or None for CPU inputs. But got {}"
+                .format(driver))
         driver = "gelsy" if driver is None else driver
     elif "gpu" in device:
         if driver not in (None, "gels"):
             raise ValueError(
-                "Only support valid driver is 'gels' or None for CUDA inputs. But got {}".
-                format(driver))
+                "Only support valid driver is 'gels' or None for CUDA inputs. But got {}"
+                .format(driver))
         driver = "gels" if driver is None else driver
     else:
         raise RuntimeError("Only support lstsq api for CPU or CUDA device.")
@@ -3112,60 +3184,67 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
         return solution, residuals, rank, singular_values
 
     helper = LayerHelper('lstsq', **locals())
-    check_variable_and_dtype(
-        x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq')
-    check_variable_and_dtype(
-        y, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq')
+    check_variable_and_dtype(x, 'dtype',
+                             ['float32', 'float64', 'complex64', 'complex128'],
+                             'lstsq')
+    check_variable_and_dtype(y, 'dtype',
+                             ['float32', 'float64', 'complex64', 'complex128'],
+                             'lstsq')
 
     solution = helper.create_variable_for_type_inference(dtype=x.dtype)
     residuals = helper.create_variable_for_type_inference(dtype=x.dtype)
     rank = helper.create_variable_for_type_inference(dtype=paddle.int32)
     singular_values = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(
-        type='lstsq',
-        inputs={'X': x,
-                'Y': y},
-        outputs={
-            'Solution': solution,
-            'Rank': rank,
-            'SingularValues': singular_values
-        },
-        attrs={'rcond': rcond,
-               'driver': driver})
+    helper.append_op(type='lstsq',
+                     inputs={
+                         'X': x,
+                         'Y': y
+                     },
+                     outputs={
+                         'Solution': solution,
+                         'Rank': rank,
+                         'SingularValues': singular_values
+                     },
+                     attrs={
+                         'rcond': rcond,
+                         'driver': driver
+                     })
 
     matmul_out = helper.create_variable_for_type_inference(dtype=x.dtype)
     minus_out = helper.create_variable_for_type_inference(dtype=x.dtype)
     pow_out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='matmul_v2',
-        inputs={'X': x,
-                'Y': solution},
-        outputs={'Out': matmul_out},
-        attrs={
-            'trans_x': False,
-            'trans_y': False,
-        })
-
-    helper.append_op(
-        type='elementwise_sub',
-        inputs={'X': matmul_out,
-                'Y': y},
-        outputs={'Out': minus_out})
-
-    helper.append_op(
-        type='pow',
-        inputs={'X': minus_out},
-        outputs={'Out': pow_out},
-        attrs={'factor': 2})
-
-    helper.append_op(
-        type='reduce_sum',
-        inputs={'X': pow_out},
-        outputs={'Out': residuals},
-        attrs={'dim': [-2],
-               'keep_dim': False,
-               'reduce_all': False})
+    helper.append_op(type='matmul_v2',
+                     inputs={
+                         'X': x,
+                         'Y': solution
+                     },
+                     outputs={'Out': matmul_out},
+                     attrs={
+                         'trans_x': False,
+                         'trans_y': False,
+                     })
+
+    helper.append_op(type='elementwise_sub',
+                     inputs={
+                         'X': matmul_out,
+                         'Y': y
+                     },
+                     outputs={'Out': minus_out})
+
+    helper.append_op(type='pow',
+                     inputs={'X': minus_out},
+                     outputs={'Out': pow_out},
+                     attrs={'factor': 2})
+
+    helper.append_op(type='reduce_sum',
+                     inputs={'X': pow_out},
+                     outputs={'Out': residuals},
+                     attrs={
+                         'dim': [-2],
+                         'keep_dim': False,
+                         'reduce_all': False
+                     })
 
     if driver == "gels":
         rank = paddle.static.data(name='rank', shape=[0])
@@ -3237,8 +3316,8 @@ def corrcoef(x, rowvar=True, name=None):
 
     # Clip to [-1, 1].  This does not guarantee
     if paddle.is_complex(c):
-        return paddle.complex(
-            paddle.clip(c.real(), -1, 1), paddle.clip(c.imag(), -1, 1))
+        return paddle.complex(paddle.clip(c.real(), -1, 1),
+                              paddle.clip(c.imag(), -1, 1))
     else:
         c = paddle.clip(c, -1, 1)
 
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 31d2ec0557dfa..c4b4c552c670d 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -26,7 +26,7 @@
 from ..framework import in_dygraph_mode, _non_static_mode
 from ..framework import LayerHelper
 from ..fluid.framework import _in_legacy_dygraph
-# TODO: define logic functions of a tensor  
+# TODO: define logic functions of a tensor
 from paddle import _C_ops
 from paddle.tensor.creation import full
 
@@ -40,13 +40,15 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
             return op(x, y)
         else:
             return op(x)
-    check_variable_and_dtype(x, "x", [
-        "bool", "int8", "int16", "int32", "int64", "float32", "float64"
-    ], op_name)
+    check_variable_and_dtype(
+        x, "x",
+        ["bool", "int8", "int16", "int32", "int64", "float32", "float64"],
+        op_name)
     if y is not None:
-        check_variable_and_dtype(y, "y", [
-            "bool", "int8", "int16", "int32", "int64", "float32", "float64"
-        ], op_name)
+        check_variable_and_dtype(
+            y, "y",
+            ["bool", "int8", "int16", "int32", "int64", "float32", "float64"],
+            op_name)
     if out is not None:
         check_type(out, "out", Variable, op_name)
 
@@ -61,9 +63,12 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     if binary_op:
-        helper.append_op(
-            type=op_name, inputs={"X": x,
-                                  "Y": y}, outputs={"Out": out})
+        helper.append_op(type=op_name,
+                         inputs={
+                             "X": x,
+                             "Y": y
+                         },
+                         outputs={"Out": out})
     else:
         helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
 
@@ -105,8 +110,12 @@ def logical_and(x, y, out=None, name=None):
     if in_dygraph_mode():
         return _C_ops.final_state_logical_and(x, y)
 
-    return _logical_op(
-        op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True)
+    return _logical_op(op_name="logical_and",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 def logical_or(x, y, out=None, name=None):
@@ -146,8 +155,12 @@ def logical_or(x, y, out=None, name=None):
     """
     if in_dygraph_mode():
         return _C_ops.final_state_logical_or(x, y)
-    return _logical_op(
-        op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True)
+    return _logical_op(op_name="logical_or",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 def logical_xor(x, y, out=None, name=None):
@@ -188,8 +201,12 @@ def logical_xor(x, y, out=None, name=None):
     if in_dygraph_mode():
         return _C_ops.final_state_logical_xor(x, y)
 
-    return _logical_op(
-        op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True)
+    return _logical_op(op_name="logical_xor",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 @templatedoc()
@@ -222,8 +239,12 @@ def logical_not(x, out=None, name=None):
     """
     if in_dygraph_mode():
         return _C_ops.final_state_logical_not(x)
-    return _logical_op(
-        op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False)
+    return _logical_op(op_name="logical_not",
+                       x=x,
+                       y=None,
+                       name=name,
+                       out=out,
+                       binary_op=False)
 
 
 def is_empty(x, name=None):
@@ -268,8 +289,9 @@ def is_empty(x, name=None):
     helper = LayerHelper("is_empty", **locals())
     cond = helper.create_variable_for_type_inference(dtype='bool')
     cond.stop_gradient = True
-    helper.append_op(
-        type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]})
+    helper.append_op(type='is_empty',
+                     inputs={'X': [x]},
+                     outputs={'Out': [cond]})
     return cond
 
 
@@ -310,9 +332,12 @@ def equal_all(x, y, name=None):
 
     helper = LayerHelper("equal_all", **locals())
     out = helper.create_variable_for_type_inference(dtype='bool')
-    helper.append_op(
-        type='equal_all', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [out]})
+    helper.append_op(type='equal_all',
+                     inputs={
+                         'X': [x],
+                         'Y': [y]
+                     },
+                     outputs={'Out': [out]})
     return out
 
 
@@ -364,14 +389,13 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     if in_dygraph_mode():
         # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because
         # C++ backend will cast it into float32 if passing float from python.
-        as_tensor = lambda x: paddle.to_tensor([x], dtype='float64', place='cpu')
-        return _C_ops.final_state_allclose(x, y,
-                                           as_tensor(rtol),
+        as_tensor = lambda x: paddle.to_tensor(
+            [x], dtype='float64', place='cpu')
+        return _C_ops.final_state_allclose(x, y, as_tensor(rtol),
                                            as_tensor(atol), equal_nan)
     if _in_legacy_dygraph():
-        return _C_ops.allclose(x, y, 'rtol',
-                               str(rtol), 'atol',
-                               str(atol), 'equal_nan', equal_nan)
+        return _C_ops.allclose(x, y, 'rtol', str(rtol), 'atol', str(atol),
+                               'equal_nan', equal_nan)
     check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose')
     check_variable_and_dtype(y, "input", ['float32', 'float64'], 'allclose')
     check_type(rtol, 'rtol', float, 'allclose')
@@ -384,8 +408,10 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     inputs = {'Input': x, 'Other': y}
     outputs = {'Out': out}
     attrs = {'rtol': str(rtol), 'atol': str(atol), 'equal_nan': equal_nan}
-    helper.append_op(
-        type='allclose', inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type='allclose',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
 
     return out
 
@@ -421,8 +447,8 @@ def equal(x, y, name=None):
     """
     if not isinstance(y, (int, bool, float, Variable)):
         raise TypeError(
-            "Type of input args must be float, bool, int or Tensor, but received type {}".
-            format(type(y)))
+            "Type of input args must be float, bool, int or Tensor, but received type {}"
+            .format(type(y)))
     if not isinstance(y, Variable):
         y = full(shape=[1], dtype=x.dtype, fill_value=y)
 
@@ -443,11 +469,12 @@ def equal(x, y, name=None):
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(
-                type='equal',
-                inputs={'X': [x],
-                        'Y': [y]},
-                outputs={'Out': [out]})
+            helper.append_op(type='equal',
+                             inputs={
+                                 'X': [x],
+                                 'Y': [y]
+                             },
+                             outputs={'Out': [out]})
             return out
 
 
@@ -494,11 +521,12 @@ def greater_equal(x, y, name=None):
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(
-                type='greater_equal',
-                inputs={'X': [x],
-                        'Y': [y]},
-                outputs={'Out': [out]})
+            helper.append_op(type='greater_equal',
+                             inputs={
+                                 'X': [x],
+                                 'Y': [y]
+                             },
+                             outputs={'Out': [out]})
             return out
 
 
@@ -544,11 +572,12 @@ def greater_than(x, y, name=None):
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(
-                type='greater_than',
-                inputs={'X': [x],
-                        'Y': [y]},
-                outputs={'Out': [out]})
+            helper.append_op(type='greater_than',
+                             inputs={
+                                 'X': [x],
+                                 'Y': [y]
+                             },
+                             outputs={'Out': [out]})
             return out
 
 
@@ -596,11 +625,12 @@ def less_equal(x, y, name=None):
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(
-                type='less_equal',
-                inputs={'X': [x],
-                        'Y': [y]},
-                outputs={'Out': [out]})
+            helper.append_op(type='less_equal',
+                             inputs={
+                                 'X': [x],
+                                 'Y': [y]
+                             },
+                             outputs={'Out': [out]})
             return out
 
 
@@ -648,11 +678,12 @@ def less_than(x, y, name=None):
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(
-                type='less_than',
-                inputs={'X': [x],
-                        'Y': [y]},
-                outputs={'Out': [out]})
+            helper.append_op(type='less_than',
+                             inputs={
+                                 'X': [x],
+                                 'Y': [y]
+                             },
+                             outputs={'Out': [out]})
             return out
 
 
@@ -700,11 +731,12 @@ def not_equal(x, y, name=None):
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(
-                type='not_equal',
-                inputs={'X': [x],
-                        'Y': [y]},
-                outputs={'Out': [out]})
+            helper.append_op(type='not_equal',
+                             inputs={
+                                 'X': [x],
+                                 'Y': [y]
+                             },
+                             outputs={'Out': [out]})
             return out
 
 
@@ -761,9 +793,12 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True):
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     if binary_op:
-        helper.append_op(
-            type=op_name, inputs={"X": x,
-                                  "Y": y}, outputs={"Out": out})
+        helper.append_op(type=op_name,
+                         inputs={
+                             "X": x,
+                             "Y": y
+                         },
+                         outputs={"Out": out})
     else:
         helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
 
@@ -794,8 +829,12 @@ def bitwise_and(x, y, out=None, name=None):
     """
     if in_dygraph_mode() and out is None:
         return _C_ops.final_state_bitwise_and(x, y)
-    return _bitwise_op(
-        op_name="bitwise_and", x=x, y=y, name=name, out=out, binary_op=True)
+    return _bitwise_op(op_name="bitwise_and",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 @templatedoc()
@@ -823,8 +862,12 @@ def bitwise_or(x, y, out=None, name=None):
     if in_dygraph_mode() and out is None:
         return _C_ops.final_state_bitwise_or(x, y)
 
-    return _bitwise_op(
-        op_name="bitwise_or", x=x, y=y, name=name, out=out, binary_op=True)
+    return _bitwise_op(op_name="bitwise_or",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 @templatedoc()
@@ -851,8 +894,12 @@ def bitwise_xor(x, y, out=None, name=None):
     """
     if in_dygraph_mode() and out is None:
         return _C_ops.final_state_bitwise_xor(x, y)
-    return _bitwise_op(
-        op_name="bitwise_xor", x=x, y=y, name=name, out=out, binary_op=True)
+    return _bitwise_op(op_name="bitwise_xor",
+                       x=x,
+                       y=y,
+                       name=name,
+                       out=out,
+                       binary_op=True)
 
 
 @templatedoc()
@@ -878,8 +925,12 @@ def bitwise_not(x, out=None, name=None):
     if in_dygraph_mode() and out is None:
         return _C_ops.final_state_bitwise_not(x)
 
-    return _bitwise_op(
-        op_name="bitwise_not", x=x, y=None, name=name, out=out, binary_op=False)
+    return _bitwise_op(op_name="bitwise_not",
+                       x=x,
+                       y=None,
+                       name=name,
+                       out=out,
+                       binary_op=False)
 
 
 @templatedoc()
@@ -937,14 +988,13 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     if in_dygraph_mode():
         # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because
         # C++ backend will cast it into float32 if passing float from python.
-        as_tensor = lambda x: paddle.to_tensor([x], dtype='float64', place='cpu')
-        return _C_ops.final_state_isclose(x, y,
-                                          as_tensor(rtol),
+        as_tensor = lambda x: paddle.to_tensor(
+            [x], dtype='float64', place='cpu')
+        return _C_ops.final_state_isclose(x, y, as_tensor(rtol),
                                           as_tensor(atol), equal_nan)
     if _in_legacy_dygraph():
-        return _C_ops.isclose(x, y, 'rtol',
-                              str(rtol), 'atol',
-                              str(atol), 'equal_nan', equal_nan)
+        return _C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', str(atol),
+                              'equal_nan', equal_nan)
 
     check_variable_and_dtype(x, "input", ['float32', 'float64'], 'isclose')
     check_variable_and_dtype(y, "input", ['float32', 'float64'], 'isclose')
@@ -958,6 +1008,8 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     inputs = {'Input': x, 'Other': y}
     outputs = {'Out': out}
     attrs = {'rtol': str(rtol), 'atol': str(atol), 'equal_nan': equal_nan}
-    helper.append_op(
-        type='isclose', inputs=inputs, outputs=outputs, attrs=attrs)
+    helper.append_op(type='isclose',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
     return out
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 57785c16e60bb..96d24a7f915ee 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -23,7 +23,7 @@
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
 from ..fluid.layers import utils
 import numpy as np
-# TODO: define functions to manipulate a tensor  
+# TODO: define functions to manipulate a tensor
 from ..fluid.layers.nn import _elementwise_op_in_dygraph
 from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only
 import paddle
@@ -84,12 +84,13 @@ def cast(x, dtype):
     helper = LayerHelper('cast', **locals())
     out = helper.create_variable_for_type_inference(
         dtype=dtype, stop_gradient=x.stop_gradient)
-    helper.append_op(
-        type='cast',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'in_dtype': x.dtype,
-               'out_dtype': out.dtype})
+    helper.append_op(type='cast',
+                     inputs={'X': [x]},
+                     outputs={'Out': [out]},
+                     attrs={
+                         'in_dtype': x.dtype,
+                         'out_dtype': out.dtype
+                     })
     return out
 
 
@@ -233,8 +234,8 @@ def slice(input, axes, starts, ends):
 
             else:
                 raise ValueError(
-                    "Input axes must be a python list or tuple, but reveived {}".
-                    format(type(axes)))
+                    "Input axes must be a python list or tuple, but reveived {}"
+                    .format(type(axes)))
 
             infer_flags = list(1 for i in range(len(axes)))
 
@@ -321,8 +322,10 @@ def slice(input, axes, starts, ends):
     attrs['infer_flags'] = infer_flags
     out = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype('input'))
-    helper.append_op(
-        type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out})
+    helper.append_op(type='slice',
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={'Out': out})
 
     return out
 
@@ -407,12 +410,13 @@ def transpose(x, perm, name=None):
     helper = LayerHelper('transpose', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='transpose2',
-        inputs={'X': [x]},
-        outputs={'Out': [out],
-                 'XShape': [x_shape]},
-        attrs={'axis': perm})
+    helper.append_op(type='transpose2',
+                     inputs={'X': [x]},
+                     outputs={
+                         'Out': [out],
+                         'XShape': [x_shape]
+                     },
+                     attrs={'axis': perm})
     return out
 
 
@@ -468,12 +472,13 @@ def unstack(x, axis=0, num=None):
     for _ in range(num):
         outs.append(helper.create_variable_for_type_inference(x.dtype))
 
-    helper.append_op(
-        type='unstack',
-        inputs={'X': [x]},
-        outputs={'Y': outs},
-        attrs={'axis': axis,
-               'num': num})
+    helper.append_op(type='unstack',
+                     inputs={'X': [x]},
+                     outputs={'Y': outs},
+                     attrs={
+                         'axis': axis,
+                         'num': num
+                     })
     return outs
 
 
@@ -535,17 +540,16 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
                          (shard_id, nshards))
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [input]},
-        outputs={'Out': out},
-        attrs={
-            'index_num': index_num,
-            'nshards': nshards,
-            'shard_id': shard_id,
-            'ignore_value': ignore_value
-        },
-        stop_gradient=True)
+    helper.append_op(type=op_type,
+                     inputs={'X': [input]},
+                     outputs={'Out': out},
+                     attrs={
+                         'index_num': index_num,
+                         'nshards': nshards,
+                         'shard_id': shard_id,
+                         'ignore_value': ignore_value
+                     },
+                     stop_gradient=True)
     return out
 
 
@@ -713,8 +717,11 @@ def _attr_offsets_check(offset_val):
             else:
                 _attr_shape_check(dim_size)
                 temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant(
-                    [1], 'int32', dim_size, force_cpu=True, out=temp_out)
+                fill_constant([1],
+                              'int32',
+                              dim_size,
+                              force_cpu=True,
+                              out=temp_out)
                 new_shape_tensor.append(temp_out)
                 shape_attr.append(dim_size)
         ipts['ShapeTensor'] = new_shape_tensor
@@ -724,11 +731,10 @@ def _attr_offsets_check(offset_val):
             _attr_shape_check(dim_size)
         attrs['shape'] = shape
 
-    helper.append_op(
-        type='crop_tensor',
-        inputs=ipts,
-        outputs={'Out': out},
-        attrs=None if len(attrs) == 0 else attrs)
+    helper.append_op(type='crop_tensor',
+                     inputs=ipts,
+                     outputs={'Out': out},
+                     attrs=None if len(attrs) == 0 else attrs)
     return out
 
 
@@ -762,8 +768,8 @@ def fill_(x, value):
         raise TypeError(
             "The type of 'value'  must be int or float, but received %s." %
             (type(value)))
-    return _C_ops.fill_any_(x, "value_float",
-                            float(value), "value_int", int(value))
+    return _C_ops.fill_any_(x, "value_float", float(value), "value_int",
+                            int(value))
 
 
 @dygraph_only
@@ -857,12 +863,11 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False):
     for i in range(len(inshape)):
         if i != dim1 and i != dim2:
             predshape.append(inshape[i])
-    diaglen = min(
-        min(inshape[dim1], inshape[dim1] + offset),
-        min(inshape[dim2], inshape[dim2] - offset))
+    diaglen = min(min(inshape[dim1], inshape[dim1] + offset),
+                  min(inshape[dim2], inshape[dim2] - offset))
     predshape.append(diaglen)
-    assert tuple(predshape) == tuple(y.shape), (
-        "the y shape should be {}".format(predshape))
+    assert tuple(predshape) == tuple(
+        y.shape), ("the y shape should be {}".format(predshape))
     if len(y.shape) == 1:
         y = y.reshape([1, -1])
 
@@ -902,8 +907,12 @@ def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None):
             print(x.tolist())   #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]]
 
     """
-    return _fill_diagonal_tensor_impl(
-        x, y, offset=offset, dim1=dim1, dim2=dim2, inplace=True)
+    return _fill_diagonal_tensor_impl(x,
+                                      y,
+                                      offset=offset,
+                                      dim1=dim1,
+                                      dim2=dim2,
+                                      inplace=True)
 
 
 def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None):
@@ -932,8 +941,12 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None):
             print(nx.tolist())   #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]]
 
     """
-    return _fill_diagonal_tensor_impl(
-        x, y, offset=offset, dim1=dim1, dim2=dim2, inplace=False)
+    return _fill_diagonal_tensor_impl(x,
+                                      y,
+                                      offset=offset,
+                                      dim1=dim1,
+                                      dim2=dim2,
+                                      inplace=False)
 
 
 @dygraph_only
@@ -1038,7 +1051,8 @@ def concat(x, axis=0, name=None):
                 'concat')
             if x.dtype != input[0].dtype:
                 raise TypeError(
-                    "All the Tensors in the input must have the same data type.")
+                    "All the Tensors in the input must have the same data type."
+                )
     else:
         input = [input]
     check_type(axis, 'axis', (int, Variable), 'concat')
@@ -1046,7 +1060,8 @@ def concat(x, axis=0, name=None):
     if isinstance(axis, Variable):
         check_dtype(
             axis.dtype, 'axis', ['int32', 'int64'], 'concat',
-            "The data type of axis must be int32 or int64 when axis is a Tensor")
+            "The data type of axis must be int32 or int64 when axis is a Tensor"
+        )
 
     helper = LayerHelper('concat', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
@@ -1059,13 +1074,16 @@ def concat(x, axis=0, name=None):
         assert len(input) == 1, "If the elements of 'input' in concat are Variable(LoDTensorArray), " \
                 "number of the elements must be 1, but received %s." % len(input)
         out_index = helper.create_variable_for_type_inference(dtype="int32")
-        helper.append_op(
-            type='tensor_array_to_tensor',
-            inputs={'X': input[0]},
-            outputs={'Out': [out],
-                     'OutIndex': [out_index]},
-            attrs={'axis': axis,
-                   'use_stack': False})
+        helper.append_op(type='tensor_array_to_tensor',
+                         inputs={'X': input[0]},
+                         outputs={
+                             'Out': [out],
+                             'OutIndex': [out_index]
+                         },
+                         attrs={
+                             'axis': axis,
+                             'use_stack': False
+                         })
     else:
         inputs = {'X': input}
         attrs = {}
@@ -1075,8 +1093,10 @@ def concat(x, axis=0, name=None):
         else:
             attrs['axis'] = axis
 
-        helper.append_op(
-            type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs)
+        helper.append_op(type='concat',
+                         inputs=inputs,
+                         outputs={'Out': [out]},
+                         attrs=attrs)
     return out
 
 
@@ -1142,8 +1162,8 @@ def broadcast_tensors(input, name=None):
                 output_shape_r.append(shape[i])
                 output_shape_r_last_tensor_index.append(j)
             else:
-                invalid = (output_shape_r[i] != shape[i] and
-                           output_shape_r[i] != 1 and shape[i] != 1)
+                invalid = (output_shape_r[i] != shape[i]
+                           and output_shape_r[i] != 1 and shape[i] != 1)
                 if invalid:
                     last_index = output_shape_r_last_tensor_index[i]
                     raise TypeError(
@@ -1161,14 +1181,15 @@ def broadcast_tensors(input, name=None):
     out = []
     while i < num_inputs:
         out.append(
-            helper.create_variable_for_type_inference(dtype=helper.input_dtype(
-            )))
+            helper.create_variable_for_type_inference(
+                dtype=helper.input_dtype()))
         i += 1
 
     inputs = {'X': input}
-    helper.append_op(
-        type='broadcast_tensors', inputs=inputs, outputs={'Out': out},
-        attrs={})
+    helper.append_op(type='broadcast_tensors',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs={})
 
     return out
 
@@ -1223,11 +1244,10 @@ def flip(x, axis, name=None):
     else:
         out = helper.create_variable(name=name, dtype=dtype, persistable=False)
 
-    helper.append_op(
-        type="flip",
-        inputs={"X": x},
-        outputs={"Out": out},
-        attrs={"axis": axis})
+    helper.append_op(type="flip",
+                     inputs={"X": x},
+                     outputs={"Out": out},
+                     attrs={"axis": axis})
     return out
 
 
@@ -1294,23 +1314,25 @@ def rot90(x, k=1, axes=[0, 1], name=None):
     input_total_dims = len(x.shape)
     total_rot_dims = len(axes)
     if total_rot_dims != 2:
-        raise ValueError("expected total rotation axes == 2, but got axes = {}".
-                         format(total_rot_dims))
+        raise ValueError(
+            "expected total rotation axes == 2, but got axes = {}".format(
+                total_rot_dims))
     if input_total_dims < 2:
-        raise ValueError("expected total dims >= 2, but got total dims = {}".
-                         format(input_total_dims))
+        raise ValueError(
+            "expected total dims >= 2, but got total dims = {}".format(
+                input_total_dims))
 
     if not (axes[0] != axes[1] and abs(axes[0] - axes[1]) != input_total_dims):
         raise ValueError(
-            "expected rotation axes to be different, but got axis0 = {}, and axis1 = {}".
-            format(axes[0], axes[1]))
+            "expected rotation axes to be different, but got axis0 = {}, and axis1 = {}"
+            .format(axes[0], axes[1]))
 
     if not (axes[0] < input_total_dims and axes[0] >= -input_total_dims):
-        raise ValueError("Rotation axis0 out of range, axis0 = {}".format(axes[
-            0]))
+        raise ValueError("Rotation axis0 out of range, axis0 = {}".format(
+            axes[0]))
     if not (axes[1] < input_total_dims and axes[1] >= -input_total_dims):
-        raise ValueError("Rotation axis1 out of range, axis1 = {}".format(axes[
-            1]))
+        raise ValueError("Rotation axis1 out of range, axis1 = {}".format(
+            axes[1]))
 
     k %= 4
     if k == 0:
@@ -1408,12 +1430,12 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None):
             'flatten')
 
     x_dim = len(x.shape)
-    if not (isinstance(start_axis, int)) or (
-            start_axis > x_dim - 1) or start_axis < -x_dim:
+    if not (isinstance(start_axis,
+                       int)) or (start_axis > x_dim - 1) or start_axis < -x_dim:
         raise ValueError(
             "The start_axis should be a int, and in range [-rank(x), rank(x))")
-    if not (isinstance(stop_axis, int)) or (
-            stop_axis > x_dim - 1) or stop_axis < -x_dim:
+    if not (isinstance(stop_axis,
+                       int)) or (stop_axis > x_dim - 1) or stop_axis < -x_dim:
         raise ValueError(
             "The stop_axis should be a int, and in range [-rank(x), rank(x))")
     if start_axis < 0:
@@ -1434,13 +1456,16 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None):
     helper = LayerHelper('flatten', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='flatten_contiguous_range',
-        inputs={"X": x},
-        outputs={'Out': out,
-                 'XShape': x_shape},
-        attrs={"start_axis": start_axis,
-               "stop_axis": stop_axis})
+    helper.append_op(type='flatten_contiguous_range',
+                     inputs={"X": x},
+                     outputs={
+                         'Out': out,
+                         'XShape': x_shape
+                     },
+                     attrs={
+                         "start_axis": start_axis,
+                         "stop_axis": stop_axis
+                     })
     return out
 
 
@@ -1454,12 +1479,12 @@ def flatten_(x, start_axis=0, stop_axis=-1, name=None):
         raise ValueError("The input x should be a Tensor")
 
     x_dim = len(x.shape)
-    if not (isinstance(start_axis, int)) or (
-            start_axis > x_dim - 1) or start_axis < -x_dim:
+    if not (isinstance(start_axis,
+                       int)) or (start_axis > x_dim - 1) or start_axis < -x_dim:
         raise ValueError(
             "The start_axis should be a int, and in range [-rank(x), rank(x))")
-    if not (isinstance(stop_axis, int)) or (
-            stop_axis > x_dim - 1) or stop_axis < -x_dim:
+    if not (isinstance(stop_axis,
+                       int)) or (stop_axis > x_dim - 1) or stop_axis < -x_dim:
         raise ValueError(
             "The stop_axis should be a int, and in range [-rank(x), rank(x))")
     if start_axis < 0:
@@ -1528,8 +1553,8 @@ def roll(x, shifts, axis=None, name=None):
         for i in range(len(axis)):
             if axis[i] >= len_origin_shape or axis[i] < -len_origin_shape:
                 raise ValueError(
-                    "axis is out of range, it should be in range [{}, {}), but received {}".
-                    format(-len_origin_shape, len_origin_shape, axis))
+                    "axis is out of range, it should be in range [{}, {}), but received {}"
+                    .format(-len_origin_shape, len_origin_shape, axis))
     else:
         axis = []
 
@@ -1545,20 +1570,22 @@ def roll(x, shifts, axis=None, name=None):
     out = helper.create_variable_for_type_inference(x.dtype)
 
     if isinstance(shifts, Variable):
-        helper.append_op(
-            type='roll',
-            inputs={'X': x,
-                    "ShiftsTensor": shifts},
-            outputs={'Out': out},
-            attrs={'axis': axis})
+        helper.append_op(type='roll',
+                         inputs={
+                             'X': x,
+                             "ShiftsTensor": shifts
+                         },
+                         outputs={'Out': out},
+                         attrs={'axis': axis})
     else:
         check_type(shifts, 'shifts', (list, tuple), 'roll')
-        helper.append_op(
-            type='roll',
-            inputs={'X': x},
-            outputs={'Out': out},
-            attrs={'axis': axis,
-                   'shifts': shifts})
+        helper.append_op(type='roll',
+                         inputs={'X': x},
+                         outputs={'Out': out},
+                         attrs={
+                             'axis': axis,
+                             'shifts': shifts
+                         })
     return out
 
 
@@ -1663,10 +1690,10 @@ def stack(x, axis=0, name=None):
         ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             x = [x]
         else:
-            raise TypeError("The type of '%s' in %s must be %s, but received %s"
-                            % ('x', 'stack',
-                               'list[Tensor], tuple[Tensor] or TensorArray',
-                               type(x)))
+            raise TypeError(
+                "The type of '%s' in %s must be %s, but received %s" %
+                ('x', 'stack', 'list[Tensor], tuple[Tensor] or TensorArray',
+                 type(x)))
 
     helper = LayerHelper('stack', **locals())
 
@@ -1680,19 +1707,21 @@ def stack(x, axis=0, name=None):
             check_variable_and_dtype(i, 'x', \
                 ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack')
 
-        helper.append_op(
-            type='tensor_array_to_tensor',
-            inputs={'X': x[0]},
-            outputs={'Out': [out],
-                     'OutIndex': [out_index]},
-            attrs={'axis': axis,
-                   'use_stack': True})
+        helper.append_op(type='tensor_array_to_tensor',
+                         inputs={'X': x[0]},
+                         outputs={
+                             'Out': [out],
+                             'OutIndex': [out_index]
+                         },
+                         attrs={
+                             'axis': axis,
+                             'use_stack': True
+                         })
     else:
-        helper.append_op(
-            type='stack',
-            inputs={'X': x},
-            outputs={'Y': out},
-            attrs={'axis': axis})
+        helper.append_op(type='stack',
+                         inputs={'X': x},
+                         outputs={'Y': out},
+                         attrs={'axis': axis})
 
     return out
 
@@ -1766,8 +1795,8 @@ def split(x, num_or_sections, axis=0, name=None):
             if utils._contain_var(num_or_sections):
                 for index, item in enumerate(num_or_sections):
                     if isinstance(item, Variable):
-                        num_or_sections[index] = num_or_sections[index].numpy()[
-                            0]
+                        num_or_sections[index] = num_or_sections[index].numpy(
+                        )[0]
                 attrs += ('sections', list(num_or_sections))
             else:
                 attrs += ('sections', list(num_or_sections))
@@ -1809,8 +1838,11 @@ def _get_SectionsTensorList(one_list):
                         idx)
                     unk_dim_idx = idx
                 temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant(
-                    [1], 'int32', dim_size, force_cpu=True, out=temp_out)
+                fill_constant([1],
+                              'int32',
+                              dim_size,
+                              force_cpu=True,
+                              out=temp_out)
                 tensor_list.append(temp_out)
         return tensor_list
 
@@ -1836,8 +1868,8 @@ def _get_SectionsTensorList(one_list):
                 dim], 'len(num_or_sections) must not be more than input.shape[dim].'
         num = len(num_or_sections)
         attrs['sections'] = list(
-            map(lambda ele: -1 if isinstance(ele, Variable) else ele,
-                num_or_sections))
+            map(lambda ele: -1
+                if isinstance(ele, Variable) else ele, num_or_sections))
         if utils._contain_var(num_or_sections):
             inputs['SectionsTensorList'] = _get_SectionsTensorList(
                 num_or_sections)
@@ -1846,8 +1878,10 @@ def _get_SectionsTensorList(one_list):
         helper.create_variable_for_type_inference(dtype=helper.input_dtype())
         for i in range(num)
     ]
-    helper.append_op(
-        type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs)
+    helper.append_op(type='split',
+                     inputs=inputs,
+                     outputs={'Out': outs},
+                     attrs=attrs)
     return outs
 
 
@@ -1947,12 +1981,13 @@ def squeeze(x, axis=None, name=None):
     check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="squeeze2",
-        inputs={"X": input},
-        attrs={"axes": axes},
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    helper.append_op(type="squeeze2",
+                     inputs={"X": input},
+                     attrs={"axes": axes},
+                     outputs={
+                         "Out": out,
+                         "XShape": x_shape
+                     })
 
     return out
 
@@ -2058,23 +2093,22 @@ def unique_consecutive(x,
         "return_counts": return_counts,
         "axis": axis,
     }
-    out = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
-    inverse = helper.create_variable_for_type_inference(
-        dtype=attr_dtype, stop_gradient=True)
-    counts = helper.create_variable_for_type_inference(
-        dtype=attr_dtype, stop_gradient=True)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype,
+                                                    stop_gradient=True)
+    inverse = helper.create_variable_for_type_inference(dtype=attr_dtype,
+                                                        stop_gradient=True)
+    counts = helper.create_variable_for_type_inference(dtype=attr_dtype,
+                                                       stop_gradient=True)
     outputs = {"Out": out, "Index": inverse, "Counts": counts}
     outs = [out]
     if return_inverse:
         outs.append(inverse)
     if return_counts:
         outs.append(counts)
-    helper.append_op(
-        type="unique_consecutive",
-        inputs={"X": x},
-        attrs=attrs,
-        outputs=outputs)
+    helper.append_op(type="unique_consecutive",
+                     inputs={"X": x},
+                     attrs=attrs,
+                     outputs=outputs)
     if len(outs) == 1:
         return outs[0]
     return tuple(outs)
@@ -2177,14 +2211,14 @@ def unique(x,
         "axis": axis,
         "is_sorted": True
     }
-    out = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
-    indices = helper.create_variable_for_type_inference(
-        dtype=attr_dtype, stop_gradient=True)
-    inverse = helper.create_variable_for_type_inference(
-        dtype=attr_dtype, stop_gradient=True)
-    counts = helper.create_variable_for_type_inference(
-        dtype=attr_dtype, stop_gradient=True)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype,
+                                                    stop_gradient=True)
+    indices = helper.create_variable_for_type_inference(dtype=attr_dtype,
+                                                        stop_gradient=True)
+    inverse = helper.create_variable_for_type_inference(dtype=attr_dtype,
+                                                        stop_gradient=True)
+    counts = helper.create_variable_for_type_inference(dtype=attr_dtype,
+                                                       stop_gradient=True)
     outputs = {
         "Out": out,
         "Indices": indices,
@@ -2199,8 +2233,10 @@ def unique(x,
     if return_counts:
         outs.append(counts)
 
-    helper.append_op(
-        type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs)
+    helper.append_op(type="unique",
+                     inputs={"X": x},
+                     attrs=attrs,
+                     outputs=outputs)
 
     if len(outs) == 1:
         return outs[0]
@@ -2301,12 +2337,13 @@ def unsqueeze(x, axis, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="unsqueeze2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    helper.append_op(type="unsqueeze2",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={
+                         "Out": out,
+                         "XShape": x_shape
+                     })
 
     return out
 
@@ -2397,21 +2434,25 @@ def gather(x, index, axis=None, name=None):
     dtype = helper.input_dtype('x')
     out = helper.create_variable_for_type_inference(dtype)
     if not isinstance(axis, Variable):
-        helper.append_op(
-            type="gather",
-            inputs={"X": x,
-                    "Index": index},
-            attrs={'axis': axis,
-                   'overwrite': False},
-            outputs={"Out": out})
+        helper.append_op(type="gather",
+                         inputs={
+                             "X": x,
+                             "Index": index
+                         },
+                         attrs={
+                             'axis': axis,
+                             'overwrite': False
+                         },
+                         outputs={"Out": out})
     else:
-        helper.append_op(
-            type="gather",
-            inputs={"X": x,
-                    "Index": index,
-                    "Axis": axis},
-            attrs={"overwrite": False},
-            outputs={"Out": out})
+        helper.append_op(type="gather",
+                         inputs={
+                             "X": x,
+                             "Index": index,
+                             "Axis": axis
+                         },
+                         attrs={"overwrite": False},
+                         outputs={"Out": out})
 
     return out
 
@@ -2470,11 +2511,10 @@ def unbind(input, axis=0):
         helper.create_variable_for_type_inference(dtype=helper.input_dtype())
         for i in range(num)
     ]
-    helper.append_op(
-        type="unbind",
-        inputs={"X": input},
-        outputs={"Out": outs},
-        attrs={"axis": axis})
+    helper.append_op(type="unbind",
+                     inputs={"X": input},
+                     outputs={"Out": outs},
+                     attrs={"axis": axis})
     return outs
 
 
@@ -2559,18 +2599,19 @@ def scatter(x, index, updates, overwrite=True, name=None):
             return _C_ops.scatter(x, index, updates, 'overwrite', overwrite)
         else:
             check_variable_and_dtype(
-                x, 'dtype',
-                ['float32', 'float64', 'float16', 'int32', 'int64'], 'scatter')
+                x, 'dtype', ['float32', 'float64', 'float16', 'int32', 'int64'],
+                'scatter')
             check_type(overwrite, 'overwrite', bool, 'scatter')
             helper = LayerHelper('scatter', **locals())
             out = helper.create_variable_for_type_inference(x.dtype)
-            helper.append_op(
-                type="scatter",
-                inputs={"X": x,
-                        "Ids": index,
-                        "Updates": updates},
-                attrs={'overwrite': overwrite},
-                outputs={"Out": out})
+            helper.append_op(type="scatter",
+                             inputs={
+                                 "X": x,
+                                 "Ids": index,
+                                 "Updates": updates
+                             },
+                             attrs={'overwrite': overwrite},
+                             outputs={"Out": out})
             return out
 
 
@@ -2666,12 +2707,13 @@ def scatter_nd_add(x, index, updates, name=None):
             helper = LayerHelper('scatter_nd_add', **locals())
             dtype = helper.input_dtype(input_param_name='x')
             output = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type="scatter_nd_add",
-                inputs={"X": x,
-                        "Index": index,
-                        "Updates": updates},
-                outputs={"Out": output})
+            helper.append_op(type="scatter_nd_add",
+                             inputs={
+                                 "X": x,
+                                 "Index": index,
+                                 "Updates": updates
+                             },
+                             outputs={"Out": output})
             return output
 
 
@@ -2811,8 +2853,8 @@ def tile(x, repeat_times, name=None):
 
     check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile')
     if isinstance(repeat_times, Variable):
-        assert len(repeat_times.shape) == 1, (
-            'repeat_times must be an 1-D Tensor.')
+        assert len(
+            repeat_times.shape) == 1, ('repeat_times must be an 1-D Tensor.')
     else:
         for elem in repeat_times:
             if isinstance(elem, Variable):
@@ -2823,8 +2865,9 @@ def tile(x, repeat_times, name=None):
                 assert isinstance(elem, type_tuple), (
                     'Elements in repeat_times must be 1-D Tensors or integers.')
 
-    check_variable_and_dtype(
-        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile')
+    check_variable_and_dtype(x, 'x',
+                             ['bool', 'float32', 'float64', 'int32', 'int64'],
+                             'tile')
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
         raise ValueError(
             "When the date type is bool for the input 'x' of tile op, you "
@@ -2859,8 +2902,10 @@ def get_attr_repeat_times(list_repeat_times):
 
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='tile', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='tile',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -2896,8 +2941,9 @@ def expand_as(x, y, name=None):
     if _non_static_mode():
         return _C_ops.expand_as_v2(x, 'target_shape', y.shape)
 
-    check_variable_and_dtype(
-        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand_as')
+    check_variable_and_dtype(x, 'x',
+                             ['bool', 'float32', 'float64', 'int32', 'int64'],
+                             'expand_as')
     check_type(y, 'y', Variable, 'expand_as')
 
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
@@ -2911,11 +2957,10 @@ def expand_as(x, y, name=None):
     helper = LayerHelper('expand_as', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='expand_as_v2',
-        inputs=inputs,
-        attrs={'target_shape': y.shape},
-        outputs={'Out': out})
+    helper.append_op(type='expand_as_v2',
+                     inputs=inputs,
+                     attrs={'target_shape': y.shape},
+                     outputs={'Out': out})
     return out
 
 
@@ -3000,8 +3045,10 @@ def get_attr_expand_shape(list_expand_shape):
 
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='expand_v2',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -3088,8 +3135,10 @@ def get_attr_expand_shape(list_expand_shape):
 
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='expand_v2',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -3263,12 +3312,13 @@ def get_attr_shape(list_shape):
     out = x if inplace else helper.create_variable_for_type_inference(
         dtype=x.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="reshape2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    helper.append_op(type="reshape2",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={
+                         "Out": out,
+                         "XShape": x_shape
+                     })
 
     return helper.append_activation(out)
 
@@ -3376,11 +3426,12 @@ def gather_nd(x, index, name=None):
     helper = LayerHelper('gather_nd', **locals())
     dtype = helper.input_dtype()
     output = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="gather_nd",
-        inputs={"X": x,
-                "Index": index},
-        outputs={"Out": output})
+    helper.append_op(type="gather_nd",
+                     inputs={
+                         "X": x,
+                         "Index": index
+                     },
+                     outputs={"Out": output})
     return output
 
 
@@ -3572,8 +3623,10 @@ def get_new_list_tensor(old_list):
         attrs['infer_flags'] = infer_flags
     out = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype('x'))
-    helper.append_op(
-        type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out})
+    helper.append_op(type='strided_slice',
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={'Out': out})
 
     return out
 
@@ -3930,17 +3983,18 @@ def repeat_interleave(x, repeats, axis=None, name=None):
 
     out = helper.create_variable_for_type_inference(x.dtype)
 
-    helper.append_op(
-        type='repeat_interleave',
-        inputs={
-            'X': x,
-            'RepeatsTensor': repeats if isinstance(repeats, Variable) else None
-        },
-        outputs={'Out': out},
-        attrs={
-            'dim': axis,
-            'Repeats': repeats if isinstance(repeats, int) else 0
-        })
+    helper.append_op(type='repeat_interleave',
+                     inputs={
+                         'X':
+                         x,
+                         'RepeatsTensor':
+                         repeats if isinstance(repeats, Variable) else None
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         'dim': axis,
+                         'Repeats': repeats if isinstance(repeats, int) else 0
+                     })
     return out
 
 
@@ -4036,12 +4090,13 @@ def moveaxis(x, source, destination, name=None):
     helper = LayerHelper('moveaxis', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='transpose2',
-        inputs={'X': [x]},
-        outputs={'Out': [out],
-                 'XShape': [x_shape]},
-        attrs={'axis': perm})
+    helper.append_op(type='transpose2',
+                     inputs={'X': [x]},
+                     outputs={
+                         'Out': [out],
+                         'XShape': [x_shape]
+                     },
+                     attrs={'axis': perm})
     return out
 
 
@@ -4059,7 +4114,7 @@ def non_negative_axis(arr, axis):
 
 
 def infer_broadcast_shape(arr, indices, axis):
-    # This function is used in take/put_along_axis 
+    # This function is used in take/put_along_axis
     broadcast_shape_list = list(arr.shape)
     broadcast_shape_list[axis] = list(indices.shape)[axis]
     broadcast_shape = tuple(broadcast_shape_list)
@@ -4126,12 +4181,13 @@ def take_along_axis(arr, indices, axis):
     helper = LayerHelper('take_along_axis', **locals())
     dtype = helper.input_dtype()
     result = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="take_along_axis",
-        inputs={"Input": arr,
-                "Index": indices},
-        attrs={"Axis": axis},
-        outputs={"Result": result})
+    helper.append_op(type="take_along_axis",
+                     inputs={
+                         "Input": arr,
+                         "Index": indices
+                     },
+                     attrs={"Axis": axis},
+                     outputs={"Result": result})
     return result
 
 
@@ -4192,14 +4248,17 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'):
     helper = LayerHelper('put_along_axis', **locals())
     dtype = helper.input_dtype()
     result = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="put_along_axis",
-        inputs={"Input": arr,
-                "Index": indices,
-                "Value": values},
-        attrs={"Axis": axis,
-               "Reduce": reduce},
-        outputs={"Result": result})
+    helper.append_op(type="put_along_axis",
+                     inputs={
+                         "Input": arr,
+                         "Index": indices,
+                         "Value": values
+                     },
+                     attrs={
+                         "Axis": axis,
+                         "Reduce": reduce
+                     },
+                     outputs={"Result": result})
     return result
 
 
diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py
index 7626552a85dbd..9a5fcb852a2ff 100644
--- a/python/paddle/tensor/ops.py
+++ b/python/paddle/tensor/ops.py
@@ -107,7 +107,8 @@
     _func = generate_inplace_fn(_OP)
     globals()[_OP] = _func
 
-add_sample_code(globals()["sigmoid"], r"""
+add_sample_code(
+    globals()["sigmoid"], r"""
 Examples:
     .. code-block:: python
 
@@ -121,7 +122,8 @@
 
 """)
 
-add_sample_code(globals()["silu"], r"""
+add_sample_code(
+    globals()["silu"], r"""
 Examples:
     .. code-block:: python
 
@@ -135,7 +137,8 @@
 
 """)
 
-add_sample_code(globals()["logsigmoid"], r"""
+add_sample_code(
+    globals()["logsigmoid"], r"""
 Examples:
     .. code-block:: python
 
@@ -149,7 +152,8 @@
 
 """)
 
-add_sample_code(globals()["exp"], r"""
+add_sample_code(
+    globals()["exp"], r"""
 Examples:
     .. code-block:: python
 
@@ -162,7 +166,8 @@
 
 """)
 
-add_sample_code(globals()["expm1"], r"""
+add_sample_code(
+    globals()["expm1"], r"""
 Examples:
     .. code-block:: python
 
@@ -175,7 +180,8 @@
 
 """)
 
-add_sample_code(globals()["tanh"], r"""
+add_sample_code(
+    globals()["tanh"], r"""
 Examples:
     .. code-block:: python
 
@@ -188,7 +194,8 @@
 
 """)
 
-add_sample_code(globals()["atan"], r"""
+add_sample_code(
+    globals()["atan"], r"""
 Examples:
     .. code-block:: python
 
@@ -201,7 +208,8 @@
 
 """)
 
-add_sample_code(globals()["tanh_shrink"], r"""
+add_sample_code(
+    globals()["tanh_shrink"], r"""
 Examples:
     .. code-block:: python
 
@@ -215,7 +223,8 @@
 
 """)
 
-add_sample_code(globals()["sqrt"], r"""
+add_sample_code(
+    globals()["sqrt"], r"""
 Examples:
     .. code-block:: python
 
@@ -228,7 +237,8 @@
 
 """)
 
-add_sample_code(globals()["rsqrt"], r"""
+add_sample_code(
+    globals()["rsqrt"], r"""
 Examples:
     .. code-block:: python
 
@@ -241,7 +251,8 @@
 
 """)
 
-add_sample_code(globals()["abs"], r"""
+add_sample_code(
+    globals()["abs"], r"""
 Examples:
     .. code-block:: python
 
@@ -254,7 +265,8 @@
 
 """)
 
-add_sample_code(globals()["ceil"], r"""
+add_sample_code(
+    globals()["ceil"], r"""
 Examples:
     .. code-block:: python
 
@@ -267,7 +279,8 @@
 
 """)
 
-add_sample_code(globals()["floor"], r"""
+add_sample_code(
+    globals()["floor"], r"""
 Examples:
     .. code-block:: python
 
@@ -280,7 +293,8 @@
 
 """)
 
-add_sample_code(globals()["cos"], r"""
+add_sample_code(
+    globals()["cos"], r"""
 Examples:
     .. code-block:: python
 
@@ -293,7 +307,8 @@
 
 """)
 
-add_sample_code(globals()["tan"], r"""
+add_sample_code(
+    globals()["tan"], r"""
 Examples:
     .. code-block:: python
 
@@ -306,7 +321,8 @@
 
 """)
 
-add_sample_code(globals()["acos"], r"""
+add_sample_code(
+    globals()["acos"], r"""
 Examples:
     .. code-block:: python
 
@@ -319,7 +335,8 @@
 
 """)
 
-add_sample_code(globals()["sin"], r"""
+add_sample_code(
+    globals()["sin"], r"""
 Examples:
     .. code-block:: python
 
@@ -332,7 +349,8 @@
 
 """)
 
-add_sample_code(globals()["asin"], r"""
+add_sample_code(
+    globals()["asin"], r"""
 Examples:
     .. code-block:: python
 
@@ -345,7 +363,8 @@
 
 """)
 
-add_sample_code(globals()["cosh"], r"""
+add_sample_code(
+    globals()["cosh"], r"""
 Examples:
     .. code-block:: python
 
@@ -358,7 +377,8 @@
 
 """)
 
-add_sample_code(globals()["sinh"], r"""
+add_sample_code(
+    globals()["sinh"], r"""
 Examples:
     .. code-block:: python
 
@@ -371,7 +391,8 @@
 
 """)
 
-add_sample_code(globals()["asinh"], r"""
+add_sample_code(
+    globals()["asinh"], r"""
 Examples:
     .. code-block:: python
 
@@ -384,7 +405,8 @@
 
 """)
 
-add_sample_code(globals()["acosh"], r"""
+add_sample_code(
+    globals()["acosh"], r"""
 Examples:
     .. code-block:: python
 
@@ -397,7 +419,8 @@
 
 """)
 
-add_sample_code(globals()["atanh"], r"""
+add_sample_code(
+    globals()["atanh"], r"""
 Examples:
     .. code-block:: python
 
@@ -410,7 +433,8 @@
 
 """)
 
-add_sample_code(globals()["round"], r"""
+add_sample_code(
+    globals()["round"], r"""
 Examples:
     .. code-block:: python
 
@@ -423,7 +447,8 @@
 
 """)
 
-add_sample_code(globals()["reciprocal"], r"""
+add_sample_code(
+    globals()["reciprocal"], r"""
 Examples:
     .. code-block:: python
 
@@ -436,7 +461,8 @@
 
 """)
 
-add_sample_code(globals()["square"], r"""
+add_sample_code(
+    globals()["square"], r"""
 Examples:
     .. code-block:: python
 
@@ -449,7 +475,8 @@
 
 """)
 
-add_sample_code(globals()["lgamma"], r"""
+add_sample_code(
+    globals()["lgamma"], r"""
 Examples:
     .. code-block:: python
 
@@ -462,7 +489,8 @@
 
 """)
 
-add_sample_code(globals()["softplus"], r"""
+add_sample_code(
+    globals()["softplus"], r"""
 Examples:
     .. code-block:: python
 
@@ -476,7 +504,8 @@
 
 """)
 
-add_sample_code(globals()["softsign"], r"""
+add_sample_code(
+    globals()["softsign"], r"""
 Examples:
     .. code-block:: python
 
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 49671d65b6d44..f43bda1129589 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define random functions  
+# TODO: define random functions
 
 from ..framework import core
 from ..framework import convert_np_dtype_to_dtype_, dygraph_only
@@ -30,25 +30,24 @@
 def bernoulli(x, name=None):
     """
 
-    Returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution.
-    The input ``x`` is a tensor with probabilities for generating the random binary number.
-    Each element in ``x`` should be in [0, 1], and the out is generated by:
-    
-    .. math::
+    For each element :math:`x_i` in input ``x``, take a sample from the Bernoulli distribution, also called two-point distribution, with success probability :math:`x_i`. The Bernoulli distribution with success probability :math:`x_i` is a discrete probability distribution with probability mass function
 
-        out_i ~ Bernoulli (x_i)
+    .. math::
+        p(y)=\\begin{cases}
+            x_i,&y=1\\\\
+            1-x_i,&y=0
+        \\end{cases}.
 
     Args:
-        x(Tensor):  A tensor with probabilities for generating the random binary number. The data type 
-            should be float32, float64.
-        name(str, optional): The default value is None. Normally there is no
-            need for user to set this property. For more information, please
-            refer to :ref:`api_guide_Name`.
+        x (Tensor): The input Tensor. Its data type should be float32 or float64.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+
     Returns: 
-        Tensor: A Tensor filled with random binary number with the same shape and dtype as ``x``.
+        Tensor: A Tensor filled with samples from the Bernoulli distribution, whose shape and dtype are the same as ``x``.
 
     Examples:
         .. code-block:: python
+            :name: bernoulli-example
 
             import paddle
 
@@ -77,9 +76,11 @@ def bernoulli(x, name=None):
 
     helper = LayerHelper("randint", **locals())
     out = helper.create_variable_for_type_inference(
-        dtype=x.dtype)  # maybe set out to int32 ? 
-    helper.append_op(
-        type='bernoulli', inputs={"X": x}, outputs={'Out': out}, attrs={})
+        dtype=x.dtype)  # maybe set out to int32 ?
+    helper.append_op(type='bernoulli',
+                     inputs={"X": x},
+                     outputs={'Out': out},
+                     attrs={})
     out.stop_gradient = True
     return out
 
@@ -122,8 +123,10 @@ def poisson(x, name=None):
 
     helper = LayerHelper("poisson", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='poisson', inputs={'X': x}, outputs={'Out': out}, attrs={})
+    helper.append_op(type='poisson',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs={})
     return out
 
 
@@ -190,12 +193,13 @@ def multinomial(x, num_samples=1, replacement=False, name=None):
     helper = LayerHelper("multinomial", **locals())
     out = helper.create_variable_for_type_inference(
         dtype=convert_np_dtype_to_dtype_('int64'))
-    helper.append_op(
-        type='multinomial',
-        inputs={"X": x},
-        outputs={'Out': out},
-        attrs={'num_samples': num_samples,
-               'replacement': replacement})
+    helper.append_op(type='multinomial',
+                     inputs={"X": x},
+                     outputs={'Out': out},
+                     attrs={
+                         'num_samples': num_samples,
+                         'replacement': replacement
+                     })
     out.stop_gradient = True
     return out
 
@@ -240,15 +244,14 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None):
     if in_dygraph_mode():
         shape = utils.convert_shape_to_list(shape)
         place = _current_expected_place()
-        return _C_ops.final_state_gaussian_random(shape,
-                                                  float(mean),
+        return _C_ops.final_state_gaussian_random(shape, float(mean),
                                                   float(std), seed, dtype,
                                                   place)
 
     if _in_legacy_dygraph():
         shape = utils.convert_shape_to_list(shape)
-        return _C_ops.gaussian_random('shape', shape, 'mean',
-                                      float(mean), 'std',
+        return _C_ops.gaussian_random('shape',
+                                      shape, 'mean', float(mean), 'std',
                                       float(std), 'seed', seed, 'dtype', dtype)
 
     check_shape(shape, op_type_for_check)
@@ -262,16 +265,17 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None):
         'dtype': dtype,
         'use_mkldnn': False
     }
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check)
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type=op_type_for_check)
 
     helper = LayerHelper('gaussian', **locals())
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='gaussian_random',
-        inputs=inputs,
-        outputs={'Out': out},
-        attrs=attrs)
+    helper.append_op(type='gaussian_random',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -544,23 +548,21 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None):
         dtype = paddle.framework.get_default_dtype()
         if dtype not in ['float32', 'float64']:
             raise TypeError(
-                "uniform/rand only supports [float32, float64], but the default dtype is {}".
-                format(dtype))
+                "uniform/rand only supports [float32, float64], but the default dtype is {}"
+                .format(dtype))
 
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
     if in_dygraph_mode():
         shape = utils.convert_shape_to_list(shape)
-        return _C_ops.final_state_uniform_random(shape, dtype,
-                                                 float(min),
+        return _C_ops.final_state_uniform_random(shape, dtype, float(min),
                                                  float(max), seed,
                                                  _current_expected_place())
 
     if _in_legacy_dygraph():
         shape = utils.convert_shape_to_list(shape)
-        return _C_ops.uniform_random('shape', shape, 'min',
-                                     float(min), 'max',
+        return _C_ops.uniform_random('shape', shape, 'min', float(min), 'max',
                                      float(max), 'seed', seed, 'dtype', dtype)
 
     check_type(shape, 'shape', (list, tuple, Variable), 'uniform/rand')
@@ -568,14 +570,17 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None):
 
     inputs = dict()
     attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype}
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type='uniform/rand')
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='uniform/rand')
 
     helper = LayerHelper("uniform", **locals())
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="uniform_random", inputs=inputs, attrs=attrs,
-        outputs={"Out": out})
+    helper.append_op(type="uniform_random",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={"Out": out})
     out.stop_gradient = True
     return out
 
@@ -687,8 +692,8 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     if high is None:
         if low <= 0:
             raise ValueError(
-                "If high is None, low must be greater than 0, but received low = {0}.".
-                format(low))
+                "If high is None, low must be greater than 0, but received low = {0}."
+                .format(low))
         high = low
         low = 0
     if dtype is None:
@@ -714,13 +719,17 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
 
     inputs = dict()
     attrs = {'low': low, 'high': high, 'seed': 0, 'dtype': dtype}
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type='randint')
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='randint')
 
     helper = LayerHelper("randint", **locals())
     out = helper.create_variable_for_type_inference(dtype=dtype)
-    helper.append_op(
-        type='randint', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='randint',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -852,8 +861,8 @@ def randint_like(x, low=0, high=None, dtype=None, name=None):
     if high is None:
         if low <= 0:
             raise ValueError(
-                "If high is None, low must be greater than 0, but received low = {0}.".
-                format(low))
+                "If high is None, low must be greater than 0, but received low = {0}."
+                .format(low))
         high = low
         low = 0
     if dtype is None:
@@ -876,8 +885,8 @@ def randint_like(x, low=0, high=None, dtype=None, name=None):
 
     check_shape(shape, 'randint_like')
     check_dtype(dtype, 'dtype',
-                ['bool', 'float16', 'float32', 'float64', 'int32',
-                 'int64'], 'randint_like')
+                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+                'randint_like')
 
     inputs = dict()
     attrs = {
@@ -886,14 +895,18 @@ def randint_like(x, low=0, high=None, dtype=None, name=None):
         'seed': 0,
         'dtype': core.VarDesc.VarType.INT64
     }
-    utils.get_shape_tensor_inputs(
-        inputs=inputs, attrs=attrs, shape=shape, op_type='randint_like')
+    utils.get_shape_tensor_inputs(inputs=inputs,
+                                  attrs=attrs,
+                                  shape=shape,
+                                  op_type='randint_like')
 
     helper = LayerHelper("randint", **locals())
     out = helper.create_variable_for_type_inference(
         dtype=core.VarDesc.VarType.INT64)
-    helper.append_op(
-        type='randint', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='randint',
+                     inputs=inputs,
+                     outputs={'Out': out},
+                     attrs=attrs)
     out.stop_gradient = True
     out = paddle.cast(out, dtype)
     return out
@@ -945,8 +958,10 @@ def randperm(n, dtype="int64", name=None):
     helper = LayerHelper("randperm", **locals())
     out = helper.create_variable_for_type_inference(dtype)
     attrs = {'n': n, 'dtype': dtype, 'seed': 0}
-    helper.append_op(
-        type='randperm', inputs={}, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='randperm',
+                     inputs={},
+                     outputs={'Out': out},
+                     attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -1043,9 +1058,8 @@ def exponential_(x, lam=1.0, name=None):
     check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential")
 
     helper = LayerHelper("exponential", **locals())
-    helper.append_op(
-        type='exponential',
-        inputs={"X": x},
-        outputs={'Out': x},
-        attrs={"lambda": lam})
+    helper.append_op(type='exponential',
+                     inputs={"X": x},
+                     outputs={'Out': x},
+                     attrs={"lambda": lam})
     return x
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 02a71a80b9e86..42087ac7dafa3 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -25,7 +25,7 @@
 from paddle import _C_ops
 from .logic import logical_not
 
-# TODO: define searching & indexing functions of a tensor  
+# TODO: define searching & indexing functions of a tensor
 # from ..fluid.layers import has_inf  #DEFINE_ALIAS
 # from ..fluid.layers import has_nan  #DEFINE_ALIAS
 
@@ -106,17 +106,20 @@ def argsort(x, axis=-1, descending=False, name=None):
         'argsort')
 
     helper = LayerHelper("argsort", **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
-    ids = helper.create_variable_for_type_inference(
-        VarDesc.VarType.INT64, stop_gradient=True)
-    helper.append_op(
-        type='argsort',
-        inputs={'X': x},
-        outputs={'Out': out,
-                 'Indices': ids},
-        attrs={'axis': axis,
-               'descending': descending})
+    out = helper.create_variable_for_type_inference(dtype=x.dtype,
+                                                    stop_gradient=True)
+    ids = helper.create_variable_for_type_inference(VarDesc.VarType.INT64,
+                                                    stop_gradient=True)
+    helper.append_op(type='argsort',
+                     inputs={'X': x},
+                     outputs={
+                         'Out': out,
+                         'Indices': ids
+                     },
+                     attrs={
+                         'axis': axis,
+                         'descending': descending
+                     })
     return ids
 
 
@@ -194,8 +197,10 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None):
     attrs['axis'] = axis
     attrs['flatten'] = flatten
     attrs['dtype'] = var_dtype
-    helper.append_op(
-        type='arg_max', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs)
+    helper.append_op(type='arg_max',
+                     inputs={'X': x},
+                     outputs={'Out': [out]},
+                     attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -276,8 +281,10 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None):
     attrs['axis'] = axis
     attrs['flatten'] = flatten
     attrs['dtype'] = var_dtype
-    helper.append_op(
-        type='arg_min', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs)
+    helper.append_op(type='arg_min',
+                     inputs={'X': x},
+                     outputs={'Out': [out]},
+                     attrs=attrs)
     out.stop_gradient = True
     return out
 
@@ -334,12 +341,13 @@ def index_select(x, index, axis=0, name=None):
 
     out = helper.create_variable_for_type_inference(x.dtype)
 
-    helper.append_op(
-        type='index_select',
-        inputs={'X': x,
-                'Index': index},
-        outputs={'Out': out},
-        attrs={'dim': axis})
+    helper.append_op(type='index_select',
+                     inputs={
+                         'X': x,
+                         'Index': index
+                     },
+                     outputs={'Out': out},
+                     attrs={'dim': axis})
     return out
 
 
@@ -409,10 +417,9 @@ def nonzero(x, as_tuple=False):
         outs = helper.create_variable_for_type_inference(
             dtype=core.VarDesc.VarType.INT64)
 
-        helper.append_op(
-            type='where_index',
-            inputs={'Condition': x},
-            outputs={'Out': [outs]})
+        helper.append_op(type='where_index',
+                         inputs={'Condition': x},
+                         outputs={'Out': [outs]})
 
     if not as_tuple:
         return outs
@@ -421,8 +428,7 @@ def nonzero(x, as_tuple=False):
     else:
         for i in range(rank):
             list_out.append(
-                paddle.slice(
-                    outs, axes=[1], starts=[i], ends=[i + 1]))
+                paddle.slice(outs, axes=[1], starts=[i], ends=[i + 1]))
         return tuple(list_out)
 
 
@@ -491,17 +497,20 @@ def sort(x, axis=-1, descending=False, name=None):
         outs, _ = _C_ops.argsort(x, 'axis', axis, 'descending', descending)
         return outs
     helper = LayerHelper("sort", **locals())
-    out = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=False)
-    ids = helper.create_variable_for_type_inference(
-        VarDesc.VarType.INT64, stop_gradient=True)
-    helper.append_op(
-        type='argsort',
-        inputs={'X': x},
-        outputs={'Out': out,
-                 'Indices': ids},
-        attrs={'axis': axis,
-               'descending': descending})
+    out = helper.create_variable_for_type_inference(dtype=x.dtype,
+                                                    stop_gradient=False)
+    ids = helper.create_variable_for_type_inference(VarDesc.VarType.INT64,
+                                                    stop_gradient=True)
+    helper.append_op(type='argsort',
+                     inputs={'X': x},
+                     outputs={
+                         'Out': out,
+                         'Indices': ids
+                     },
+                     attrs={
+                         'axis': axis,
+                         'descending': descending
+                     })
     return out
 
 
@@ -550,12 +559,13 @@ def mode(x, axis=-1, keepdim=False, name=None):
     values = helper.create_variable_for_type_inference(dtype=x.dtype)
     indices = helper.create_variable_for_type_inference(dtype="int64")
 
-    helper.append_op(
-        type="mode",
-        inputs=inputs,
-        outputs={"Out": [values],
-                 "Indices": [indices]},
-        attrs=attrs)
+    helper.append_op(type="mode",
+                     inputs=inputs,
+                     outputs={
+                         "Out": [values],
+                         "Indices": [indices]
+                     },
+                     attrs=attrs)
     indices.stop_gradient = True
     return values, indices
 
@@ -620,10 +630,12 @@ def where(condition, x=None, y=None, name=None):
 
     if not paddle.in_dynamic_mode():
         check_variable_and_dtype(condition, 'condition', ['bool'], 'where')
-        check_variable_and_dtype(
-            x, 'x', ['float32', 'float64', 'int32', 'int64'], 'where')
-        check_variable_and_dtype(
-            y, 'y', ['float32', 'float64', 'int32', 'int64'], 'where')
+        check_variable_and_dtype(x, 'x',
+                                 ['float32', 'float64', 'int32', 'int64'],
+                                 'where')
+        check_variable_and_dtype(y, 'y',
+                                 ['float32', 'float64', 'int32', 'int64'],
+                                 'where')
 
     condition_shape = list(condition.shape)
     x_shape = list(x.shape)
@@ -665,14 +677,13 @@ def where(condition, x=None, y=None, name=None):
             helper = LayerHelper("where", **locals())
             out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-            helper.append_op(
-                type='where',
-                inputs={
-                    'Condition': broadcast_condition,
-                    'X': broadcast_x,
-                    'Y': broadcast_y
-                },
-                outputs={'Out': [out]})
+            helper.append_op(type='where',
+                             inputs={
+                                 'Condition': broadcast_condition,
+                                 'X': broadcast_x,
+                                 'Y': broadcast_y
+                             },
+                             outputs={'Out': [out]})
 
             return out
 
@@ -764,11 +775,12 @@ def index_sample(x, index):
                                      'paddle.tensor.search.index_sample')
             out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-            helper.append_op(
-                type='index_sample',
-                inputs={'X': x,
-                        'Index': index},
-                outputs={'Out': out})
+            helper.append_op(type='index_sample',
+                             inputs={
+                                 'X': x,
+                                 'Index': index
+                             },
+                             outputs={'Out': out})
             return out
 
 
@@ -814,9 +826,12 @@ def masked_select(x, mask, name=None):
     check_variable_and_dtype(mask, 'mask', ['bool'],
                              'paddle.tensor.search.masked_select')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='masked_select', inputs={'X': x,
-                                      'Mask': mask}, outputs={'Y': out})
+    helper.append_op(type='masked_select',
+                     inputs={
+                         'X': x,
+                         'Mask': mask
+                     },
+                     outputs={'Y': out})
     return out
 
 
@@ -884,13 +899,11 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None):
 
     if _non_static_mode():
         if axis is None:
-            out, indices = _C_ops.top_k_v2(x, 'k',
-                                           int(k), 'largest', largest, 'sorted',
-                                           sorted)
+            out, indices = _C_ops.top_k_v2(x, 'k', int(k), 'largest', largest,
+                                           'sorted', sorted)
         else:
-            out, indices = _C_ops.top_k_v2(x, 'k',
-                                           int(k), 'axis', axis, 'largest',
-                                           largest, 'sorted', sorted)
+            out, indices = _C_ops.top_k_v2(x, 'k', int(k), 'axis', axis,
+                                           'largest', largest, 'sorted', sorted)
         return out, indices
 
     helper = LayerHelper("top_k_v2", **locals())
@@ -908,12 +921,13 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None):
     values = helper.create_variable_for_type_inference(dtype=x.dtype)
     indices = helper.create_variable_for_type_inference(dtype="int64")
 
-    helper.append_op(
-        type="top_k_v2",
-        inputs=inputs,
-        outputs={"Out": [values],
-                 "Indices": [indices]},
-        attrs=attrs)
+    helper.append_op(type="top_k_v2",
+                     inputs=inputs,
+                     outputs={
+                         "Out": [values],
+                         "Indices": [indices]
+                     },
+                     attrs=attrs)
     indices.stop_gradient = True
     return values, indices
 
@@ -982,13 +996,16 @@ def searchsorted(sorted_sequence,
     helper = LayerHelper('searchsorted', **locals())
     out_type = 'int32' if out_int32 else 'int64'
     out = helper.create_variable_for_type_inference(dtype=out_type)
-    helper.append_op(
-        type='searchsorted',
-        inputs={'SortedSequence': sorted_sequence,
-                "Values": values},
-        outputs={'Out': out},
-        attrs={"out_int32": out_int32,
-               "right": right})
+    helper.append_op(type='searchsorted',
+                     inputs={
+                         'SortedSequence': sorted_sequence,
+                         "Values": values
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         "out_int32": out_int32,
+                         "right": right
+                     })
 
     return out
 
@@ -1050,11 +1067,12 @@ def kthvalue(x, k, axis=None, keepdim=False, name=None):
     values = helper.create_variable_for_type_inference(dtype=x.dtype)
     indices = helper.create_variable_for_type_inference(dtype="int64")
 
-    helper.append_op(
-        type="kthvalue",
-        inputs=inputs,
-        outputs={"Out": [values],
-                 "Indices": [indices]},
-        attrs=attrs)
+    helper.append_op(type="kthvalue",
+                     inputs=inputs,
+                     outputs={
+                         "Out": [values],
+                         "Indices": [indices]
+                     },
+                     attrs=attrs)
     indices.stop_gradient = True
     return values, indices
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py
index 372454b97a6be..2073e241a3b18 100644
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define statistical functions of a tensor  
+# TODO: define statistical functions of a tensor
 
 import numpy as np
 from ..static import Variable
@@ -107,8 +107,10 @@ def mean(x, axis=None, keepdim=False, name=None):
     helper = LayerHelper('mean', **locals())
     attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='reduce_mean', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
+    helper.append_op(type='reduce_mean',
+                     inputs={'X': x},
+                     outputs={'Out': out},
+                     attrs=attrs)
     return out
 
 
@@ -305,8 +307,8 @@ def nanmedian(x, axis=None, keepdim=True, name=None):
         )
 
     for i in range(len(axis)):
-        if not isinstance(axis[i], int) or not (axis[i] < dims and
-                                                axis[i] >= -dims):
+        if not isinstance(axis[i], int) or not (axis[i] < dims
+                                                and axis[i] >= -dims):
             raise ValueError(
                 "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))."
             )
@@ -329,12 +331,13 @@ def nanmedian(x, axis=None, keepdim=True, name=None):
     attrs = {'axis': axis, 'keepdim': keepdim}
     out = helper.create_variable_for_type_inference(x.dtype)
     medians = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(
-        type='nanmedian',
-        inputs={'X': x},
-        outputs={'Out': out,
-                 'MedianIndex': medians},
-        attrs=attrs)
+    helper.append_op(type='nanmedian',
+                     inputs={'X': x},
+                     outputs={
+                         'Out': out,
+                         'MedianIndex': medians
+                     },
+                     attrs=attrs)
     return out
 
 
@@ -412,13 +415,13 @@ def median(x, axis=None, keepdim=False, name=None):
                 tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1])
         out_tensor = paddle.cast(out_tensor, dtype=dtype) / 2
     else:
-        out_tensor = paddle.cast(
-            paddle.slice(
-                tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]),
-            dtype=dtype)
+        out_tensor = paddle.cast(paddle.slice(tensor_topk,
+                                              axes=[axis],
+                                              starts=[kth],
+                                              ends=[kth + 1]),
+                                 dtype=dtype)
     out_tensor = out_tensor + paddle.sum(
-        paddle.cast(
-            paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True)
+        paddle.cast(paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True)
     if not keepdim or is_flatten:
         if not is_flatten:
             newshape = x.shape[:axis] + x.shape[axis + 1:]
@@ -537,14 +540,15 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
     for index in indices:
         indices_below = paddle.floor(index).astype(paddle.int32)
         indices_upper = paddle.ceil(index).astype(paddle.int32)
-        tensor_upper = paddle.take_along_axis(
-            sorted_tensor, indices_upper, axis=axis)
-        tensor_below = paddle.take_along_axis(
-            sorted_tensor, indices_below, axis=axis)
+        tensor_upper = paddle.take_along_axis(sorted_tensor,
+                                              indices_upper,
+                                              axis=axis)
+        tensor_below = paddle.take_along_axis(sorted_tensor,
+                                              indices_below,
+                                              axis=axis)
         weights = (index - indices_below.astype('float64'))
-        out = paddle.lerp(
-            tensor_below.astype('float64'),
-            tensor_upper.astype('float64'), weights)
+        out = paddle.lerp(tensor_below.astype('float64'),
+                          tensor_upper.astype('float64'), weights)
         if not keepdim:
             out = paddle.squeeze(out, axis=axis)
         else:
diff --git a/python/paddle/tensor/tensor.py b/python/paddle/tensor/tensor.py
index ec7b50c63c086..1696351609083 100644
--- a/python/paddle/tensor/tensor.py
+++ b/python/paddle/tensor/tensor.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the basic tensor classes 
+# TODO: define the basic tensor classes
diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py
index 7935b4f275580..fb21c793f428d 100644
--- a/python/paddle/tensor/to_string.py
+++ b/python/paddle/tensor/to_string.py
@@ -196,8 +196,8 @@ def _format_tensor(var, summary, indent=0, max_width=0, signed=False):
             items[i:i + items_per_line]
             for i in range(0, len(items), items_per_line)
         ]
-        s = (',\n' + ' ' *
-             (indent + 1)).join([', '.join(line) for line in lines])
+        s = (',\n' + ' ' * (indent + 1)).join(
+            [', '.join(line) for line in lines])
         return '[' + s + ']'
     else:
         # recursively handle all dimensions
@@ -249,17 +249,19 @@ def to_string(var, prefix='Tensor'):
 
     max_width, signed = _get_max_width(_to_summary(np_var))
 
-    data = _format_tensor(
-        np_var, summary, indent=indent, max_width=max_width, signed=signed)
+    data = _format_tensor(np_var,
+                          summary,
+                          indent=indent,
+                          max_width=max_width,
+                          signed=signed)
 
-    return _template.format(
-        prefix=prefix,
-        shape=var.shape,
-        dtype=dtype,
-        place=var._place_str,
-        stop_gradient=var.stop_gradient,
-        indent=' ' * indent,
-        data=data)
+    return _template.format(prefix=prefix,
+                            shape=var.shape,
+                            dtype=dtype,
+                            place=var._place_str,
+                            stop_gradient=var.stop_gradient,
+                            indent=' ' * indent,
+                            data=data)
 
 
 def _format_dense_tensor(tensor, indent):
@@ -281,8 +283,11 @@ def _format_dense_tensor(tensor, indent):
 
     max_width, signed = _get_max_width(_to_summary(np_tensor))
 
-    data = _format_tensor(
-        np_tensor, sumary, indent=indent, max_width=max_width, signed=signed)
+    data = _format_tensor(np_tensor,
+                          sumary,
+                          indent=indent,
+                          max_width=max_width,
+                          signed=signed)
     return data
 
 
@@ -292,41 +297,39 @@ def sparse_tensor_to_string(tensor, prefix='Tensor'):
         _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient}, \n{indent}{indices}, \n{indent}{values})"
         indices_tensor = tensor.indices()
         values_tensor = tensor.values()
-        indices_data = 'indices=' + _format_dense_tensor(indices_tensor, indent
-                                                         + len('indices='))
-        values_data = 'values=' + _format_dense_tensor(values_tensor, indent +
-                                                       len('values='))
-        return _template.format(
-            prefix=prefix,
-            shape=tensor.shape,
-            dtype=tensor.dtype,
-            place=tensor._place_str,
-            stop_gradient=tensor.stop_gradient,
-            indent=' ' * indent,
-            indices=indices_data,
-            values=values_data)
+        indices_data = 'indices=' + _format_dense_tensor(
+            indices_tensor, indent + len('indices='))
+        values_data = 'values=' + _format_dense_tensor(values_tensor,
+                                                       indent + len('values='))
+        return _template.format(prefix=prefix,
+                                shape=tensor.shape,
+                                dtype=tensor.dtype,
+                                place=tensor._place_str,
+                                stop_gradient=tensor.stop_gradient,
+                                indent=' ' * indent,
+                                indices=indices_data,
+                                values=values_data)
     else:
         _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient}, \n{indent}{crows}, \n{indent}{cols}, \n{indent}{values})"
         crows_tensor = tensor.crows()
         cols_tensor = tensor.cols()
         elements_tensor = tensor.values()
-        crows_data = 'crows=' + _format_dense_tensor(crows_tensor, indent +
-                                                     len('crows='))
-        cols_data = 'cols=' + _format_dense_tensor(cols_tensor, indent +
-                                                   len('cols='))
-        values_data = 'values=' + _format_dense_tensor(elements_tensor, indent +
-                                                       len('values='))
-
-        return _template.format(
-            prefix=prefix,
-            shape=tensor.shape,
-            dtype=tensor.dtype,
-            place=tensor._place_str,
-            stop_gradient=tensor.stop_gradient,
-            indent=' ' * indent,
-            crows=crows_data,
-            cols=cols_data,
-            values=values_data)
+        crows_data = 'crows=' + _format_dense_tensor(crows_tensor,
+                                                     indent + len('crows='))
+        cols_data = 'cols=' + _format_dense_tensor(cols_tensor,
+                                                   indent + len('cols='))
+        values_data = 'values=' + _format_dense_tensor(elements_tensor,
+                                                       indent + len('values='))
+
+        return _template.format(prefix=prefix,
+                                shape=tensor.shape,
+                                dtype=tensor.dtype,
+                                place=tensor._place_str,
+                                stop_gradient=tensor.stop_gradient,
+                                indent=' ' * indent,
+                                crows=crows_data,
+                                cols=cols_data,
+                                values=values_data)
 
 
 def tensor_to_string(tensor, prefix='Tensor'):
@@ -345,11 +348,10 @@ def tensor_to_string(tensor, prefix='Tensor'):
         return "Tensor(Not initialized)"
     else:
         data = _format_dense_tensor(tensor, indent)
-        return _template.format(
-            prefix=prefix,
-            shape=tensor.shape,
-            dtype=dtype,
-            place=tensor._place_str,
-            stop_gradient=tensor.stop_gradient,
-            indent=' ' * indent,
-            data=data)
+        return _template.format(prefix=prefix,
+                                shape=tensor.shape,
+                                dtype=dtype,
+                                place=tensor._place_str,
+                                stop_gradient=tensor.stop_gradient,
+                                indent=' ' * indent,
+                                data=data)
diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
index bc9f402ed9686..62cf9afddc2fb 100644
--- a/python/paddle/tests/CMakeLists.txt
+++ b/python/paddle/tests/CMakeLists.txt
@@ -1,19 +1,25 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+file(
+  GLOB DIST_TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_dist_*.py")
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 
 foreach(TEST_OP ${DIST_TEST_OPS})
-    list(REMOVE_ITEM TEST_OPS ${TEST_OP})
+  list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
 
 if(NOT WITH_COVERAGE)
-    LIST(REMOVE_ITEM TEST_OPS test_hapi_hub)
+  list(REMOVE_ITEM TEST_OPS test_hapi_hub)
 endif()
 
 foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
+  py_test(${src} SRCS ${src}.py)
 endforeach()
 
 function(py_dist_test TARGET_NAME)
@@ -21,27 +27,34 @@ function(py_dist_test TARGET_NAME)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS ENVS)
-    cmake_parse_arguments(py_dist_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cmake_parse_arguments(py_dist_test "${options}" "${oneValueArgs}"
+                          "${multiValueArgs}" ${ARGN})
 
-    if(WITH_COVERAGE AND (WITH_GPU OR WITH_ROCM) AND (WITH_NCCL OR WITH_RCCL) AND NOT WIN32)
-      add_test(NAME ${TARGET_NAME}
-               COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-               FLAGS_cpu_deterministic=true NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1
-               PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_dist_test_ENVS}
-               COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-               ${PYTHON_EXECUTABLE} -u ${py_dist_test_SRCS} ${py_dist_test_ARGS}
-               WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if(WITH_COVERAGE
+       AND (WITH_GPU OR WITH_ROCM)
+       AND (WITH_NCCL OR WITH_RCCL)
+       AND NOT WIN32)
+      add_test(
+        NAME ${TARGET_NAME}
+        COMMAND
+          ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true
+          FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true
+          NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1
+          PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_dist_test_ENVS}
+          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+          ${PYTHON_EXECUTABLE} -u ${py_dist_test_SRCS} ${py_dist_test_ARGS}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
       # No unit test should exceed 10 minutes.
-      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=DIST")
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600 LABELS
+                                                     "RUN_TYPE=DIST")
     endif()
 
-    
   endif()
 endfunction()
 
 foreach(src ${DIST_TEST_OPS})
-    message(STATUS ${src})
-    py_dist_test(${src} SRCS ${src}.py)
+  message(STATUS ${src})
+  py_dist_test(${src} SRCS ${src}.py)
 endforeach()
 set_tests_properties(test_dataset_cifar PROPERTIES TIMEOUT 120)
 set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 120)
@@ -52,13 +65,14 @@ set_tests_properties(test_dataset_wmt PROPERTIES TIMEOUT 120)
 set_tests_properties(test_vision_models PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dataset_uci_housing PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 300)
-set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600) 
+set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600)
 if(WITH_COVERAGE)
-    set_tests_properties(test_hapi_hub PROPERTIES TIMEOUT 300) 
+  set_tests_properties(test_hapi_hub PROPERTIES TIMEOUT 300)
 endif()
 
 if(APPLE)
   set_tests_properties(test_callback_early_stop PROPERTIES TIMEOUT 300)
-  set_tests_properties(test_callback_reduce_lr_on_plateau PROPERTIES TIMEOUT 300)
+  set_tests_properties(test_callback_reduce_lr_on_plateau PROPERTIES TIMEOUT
+                                                                     300)
   set_tests_properties(test_vision_models PROPERTIES TIMEOUT 300)
 endif()
diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py
index de0518e229b0a..08d6629c78adb 100644
--- a/python/paddle/tests/dist_hapi_mnist_dynamic.py
+++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py
@@ -32,6 +32,7 @@
 
 
 class MnistDataset(MNIST):
+
     def __init__(self, mode, return_label=True):
         super(MnistDataset, self).__init__(mode=mode)
         self.return_label = return_label
@@ -58,6 +59,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(not fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestDistTraning(unittest.TestCase):
+
     def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')
 
@@ -68,8 +70,9 @@ def test_dynamic_multiple_gpus(self):
         labels = [Input([None, 1], 'int64', 'label')]
 
         model = Model(LeNet(), inputs, labels)
-        optim = fluid.optimizer.Momentum(
-            learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
+        optim = fluid.optimizer.Momentum(learning_rate=0.001,
+                                         momentum=.9,
+                                         parameter_list=model.parameters())
         model.prepare(optim, CrossEntropyLoss(), Accuracy())
 
         train_dataset = MnistDataset(mode='train')
@@ -85,8 +88,9 @@ def test_dynamic_multiple_gpus(self):
 
         eval_result = model.evaluate(val_dataset, batch_size=batch_size)
 
-        output = model.predict(
-            test_dataset, batch_size=batch_size, stack_outputs=True)
+        output = model.predict(test_dataset,
+                               batch_size=batch_size,
+                               stack_outputs=True)
 
         np.testing.assert_equal(output[0].shape[0], len(test_dataset))
 
diff --git a/python/paddle/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py
index 6120ae90e994d..b143326780f58 100644
--- a/python/paddle/tests/dist_hapi_mnist_static.py
+++ b/python/paddle/tests/dist_hapi_mnist_static.py
@@ -32,6 +32,7 @@
 
 
 class MnistDataset(MNIST):
+
     def __init__(self, mode, return_label=True):
         super(MnistDataset, self).__init__(mode=mode)
         self.return_label = return_label
@@ -58,6 +59,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(not fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestDistTraning(unittest.TestCase):
+
     def test_static_multiple_gpus(self):
         paddle.enable_static()
         device = set_device('gpu')
@@ -69,8 +71,9 @@ def test_static_multiple_gpus(self):
         labels = [Input([None, 1], 'int64', 'label')]
 
         model = Model(LeNet(), inputs, labels)
-        optim = fluid.optimizer.Momentum(
-            learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
+        optim = fluid.optimizer.Momentum(learning_rate=0.001,
+                                         momentum=.9,
+                                         parameter_list=model.parameters())
         model.prepare(optim, CrossEntropyLoss(), Accuracy())
 
         train_dataset = MnistDataset(mode='train')
@@ -86,8 +89,9 @@ def test_static_multiple_gpus(self):
 
         eval_result = model.evaluate(val_dataset, batch_size=batch_size)
 
-        output = model.predict(
-            test_dataset, batch_size=batch_size, stack_outputs=True)
+        output = model.predict(test_dataset,
+                               batch_size=batch_size,
+                               stack_outputs=True)
 
         np.testing.assert_equal(output[0].shape[0], len(test_dataset))
 
diff --git a/python/paddle/tests/dist_hapi_pure_fp16_static.py b/python/paddle/tests/dist_hapi_pure_fp16_static.py
index 0174e4f54e341..d6a18f145b428 100644
--- a/python/paddle/tests/dist_hapi_pure_fp16_static.py
+++ b/python/paddle/tests/dist_hapi_pure_fp16_static.py
@@ -31,6 +31,7 @@
 @unittest.skipIf(not fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestDistTraningWithPureFP16(unittest.TestCase):
+
     def test_amp_training_purefp16(self):
         if not fluid.is_compiled_with_cuda():
             self.skipTest('module not tested when ONLY_CPU compling')
@@ -44,15 +45,13 @@ def test_amp_training_purefp16(self):
         inputs = InputSpec([None, 1, 28, 28], "float32", 'x')
         labels = InputSpec([None, 1], "int64", "y")
         model = Model(net, inputs, labels)
-        optim = paddle.optimizer.Adam(
-            learning_rate=0.001,
-            parameters=model.parameters(),
-            multi_precision=True)
+        optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                      parameters=model.parameters(),
+                                      multi_precision=True)
         amp_configs = {"level": amp_level, "use_fp16_guard": False}
-        model.prepare(
-            optimizer=optim,
-            loss=CrossEntropyLoss(reduction="sum"),
-            amp_configs=amp_configs)
+        model.prepare(optimizer=optim,
+                      loss=CrossEntropyLoss(reduction="sum"),
+                      amp_configs=amp_configs)
         model.train_batch([data], [label])
 
 
diff --git a/python/paddle/tests/hapi_mnist_bf16_static.py b/python/paddle/tests/hapi_mnist_bf16_static.py
index 7eb4d61a21ee1..c1a2f23581c70 100644
--- a/python/paddle/tests/hapi_mnist_bf16_static.py
+++ b/python/paddle/tests/hapi_mnist_bf16_static.py
@@ -41,17 +41,17 @@
 
 def parse_args():
     parser = argparse.ArgumentParser("Lenet BF16 train static script")
-    parser.add_argument(
-        '-bf16',
-        '--bf16',
-        type=ast.literal_eval,
-        default=False,
-        help="whether use bf16")
+    parser.add_argument('-bf16',
+                        '--bf16',
+                        type=ast.literal_eval,
+                        default=False,
+                        help="whether use bf16")
     args = parser.parse_args()
     return args
 
 
 class MnistDataset(MNIST):
+
     def __init__(self, mode, return_label=True):
         super(MnistDataset, self).__init__(mode=mode)
         self.return_label = return_label
@@ -92,11 +92,10 @@ def main(args):
     if args.bf16:
         optim = amp.bf16.decorate_bf16(
             optim,
-            amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(
-                custom_bf16_list={
-                    'matmul_v2', 'pool2d', 'relu', 'scale', 'elementwise_add',
-                    'reshape2', 'slice', 'reduce_mean', 'conv2d'
-                }, ))
+            amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(custom_bf16_list={
+                'matmul_v2', 'pool2d', 'relu', 'scale', 'elementwise_add',
+                'reshape2', 'slice', 'reduce_mean', 'conv2d'
+            }, ))
 
     # Configuration model
     model.prepare(optim, paddle.nn.CrossEntropyLoss(), Accuracy())
@@ -108,8 +107,9 @@ def main(args):
     model.fit(train_dataset, epochs=2, batch_size=batch_size, verbose=1)
     eval_result = model.evaluate(val_dataset, batch_size=batch_size, verbose=1)
 
-    output = model.predict(
-        test_dataset, batch_size=batch_size, stack_outputs=True)
+    output = model.predict(test_dataset,
+                           batch_size=batch_size,
+                           stack_outputs=True)
 
     np.testing.assert_equal(output[0].shape[0], len(test_dataset))
 
diff --git a/python/paddle/tests/hubconf.py b/python/paddle/tests/hubconf.py
index 4b4a853ef2cd9..8e0a5f297a381 100644
--- a/python/paddle/tests/hubconf.py
+++ b/python/paddle/tests/hubconf.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/tests/test_async_read_write.py b/python/paddle/tests/test_async_read_write.py
index 1432063421586..5336ca0da17b3 100644
--- a/python/paddle/tests/test_async_read_write.py
+++ b/python/paddle/tests/test_async_read_write.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,18 +22,17 @@
 
 
 class TestAsyncRead(unittest.TestCase):
+
     def func_setUp(self):
-        self.empty = paddle.to_tensor(
-            np.array(
-                [], dtype="int64"), place=paddle.CPUPlace())
+        self.empty = paddle.to_tensor(np.array([], dtype="int64"),
+                                      place=paddle.CPUPlace())
         data = np.random.randn(100, 50, 50).astype("float32")
         self.src = paddle.to_tensor(data, place=paddle.CUDAPinnedPlace())
         self.dst = paddle.empty(shape=[100, 50, 50], dtype="float32")
-        self.index = paddle.to_tensor(
-            np.array(
-                [1, 3, 5, 7, 9], dtype="int64")).cpu()
-        self.buffer = paddle.empty(
-            shape=[50, 50, 50], dtype="float32").pin_memory()
+        self.index = paddle.to_tensor(np.array([1, 3, 5, 7, 9],
+                                               dtype="int64")).cpu()
+        self.buffer = paddle.empty(shape=[50, 50, 50],
+                                   dtype="float32").pin_memory()
         self.stream = cuda.Stream()
 
     def func_test_async_read_empty_offset_and_count(self):
@@ -50,12 +49,10 @@ def func_test_async_read_empty_offset_and_count(self):
         self.assertTrue(np.allclose(array1.numpy(), array2.numpy()))
 
     def func_test_async_read_success(self):
-        offset = paddle.to_tensor(
-            np.array(
-                [10, 20], dtype="int64"), place=paddle.CPUPlace())
-        count = paddle.to_tensor(
-            np.array(
-                [5, 10], dtype="int64"), place=paddle.CPUPlace())
+        offset = paddle.to_tensor(np.array([10, 20], dtype="int64"),
+                                  place=paddle.CPUPlace())
+        count = paddle.to_tensor(np.array([5, 10], dtype="int64"),
+                                 place=paddle.CPUPlace())
         with cuda.stream_guard(self.stream):
             if _in_legacy_dygraph():
                 core.async_read(self.src, self.dst, self.index, self.buffer,
@@ -109,19 +106,18 @@ def test_main(self):
 
 
 class TestAsyncWrite(unittest.TestCase):
+
     def func_setUp(self):
         self.src = paddle.rand(shape=[100, 50, 50, 5], dtype="float32")
-        self.dst = paddle.empty(
-            shape=[200, 50, 50, 5], dtype="float32").pin_memory()
+        self.dst = paddle.empty(shape=[200, 50, 50, 5],
+                                dtype="float32").pin_memory()
         self.stream = cuda.Stream()
 
     def func_test_async_write_success(self):
-        offset = paddle.to_tensor(
-            np.array(
-                [0, 60], dtype="int64"), place=paddle.CPUPlace())
-        count = paddle.to_tensor(
-            np.array(
-                [40, 60], dtype="int64"), place=paddle.CPUPlace())
+        offset = paddle.to_tensor(np.array([0, 60], dtype="int64"),
+                                  place=paddle.CPUPlace())
+        count = paddle.to_tensor(np.array([40, 60], dtype="int64"),
+                                 place=paddle.CPUPlace())
         with cuda.stream_guard(self.stream):
             if _in_legacy_dygraph():
                 core.async_write(self.src, self.dst, offset, count)
diff --git a/python/paddle/tests/test_callback_early_stop.py b/python/paddle/tests/test_callback_early_stop.py
index 132f0e385c8fe..03741d98820ec 100644
--- a/python/paddle/tests/test_callback_early_stop.py
+++ b/python/paddle/tests/test_callback_early_stop.py
@@ -30,6 +30,7 @@
 
 
 class MnistDataset(MNIST):
+
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
         self.return_label = return_label
@@ -49,6 +50,7 @@ def __len__(self):
 
 
 class TestCallbacks(unittest.TestCase):
+
     def setUp(self):
         self.save_dir = tempfile.mkdtemp()
 
@@ -65,49 +67,44 @@ def test_earlystopping(self):
             val_dataset = MnistDataset(mode='test', sample_num=sample_num)
 
             net = LeNet()
-            optim = paddle.optimizer.Adam(
-                learning_rate=0.001, parameters=net.parameters())
+            optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                          parameters=net.parameters())
 
             inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
             labels = [InputSpec([None, 1], 'int64', 'label')]
 
             model = Model(net, inputs=inputs, labels=labels)
-            model.prepare(
-                optim,
-                loss=CrossEntropyLoss(reduction="sum"),
-                metrics=[Accuracy()])
-            callbacks_0 = paddle.callbacks.EarlyStopping(
-                'loss',
-                mode='min',
-                patience=1,
-                verbose=1,
-                min_delta=0,
-                baseline=None,
-                save_best_model=True)
-            callbacks_1 = paddle.callbacks.EarlyStopping(
-                'acc',
-                mode='auto',
-                patience=1,
-                verbose=1,
-                min_delta=0,
-                baseline=0,
-                save_best_model=True)
-            callbacks_2 = paddle.callbacks.EarlyStopping(
-                'loss',
-                mode='auto_',
-                patience=1,
-                verbose=1,
-                min_delta=0,
-                baseline=None,
-                save_best_model=True)
-            callbacks_3 = paddle.callbacks.EarlyStopping(
-                'acc_',
-                mode='max',
-                patience=1,
-                verbose=1,
-                min_delta=0,
-                baseline=0,
-                save_best_model=True)
+            model.prepare(optim,
+                          loss=CrossEntropyLoss(reduction="sum"),
+                          metrics=[Accuracy()])
+            callbacks_0 = paddle.callbacks.EarlyStopping('loss',
+                                                         mode='min',
+                                                         patience=1,
+                                                         verbose=1,
+                                                         min_delta=0,
+                                                         baseline=None,
+                                                         save_best_model=True)
+            callbacks_1 = paddle.callbacks.EarlyStopping('acc',
+                                                         mode='auto',
+                                                         patience=1,
+                                                         verbose=1,
+                                                         min_delta=0,
+                                                         baseline=0,
+                                                         save_best_model=True)
+            callbacks_2 = paddle.callbacks.EarlyStopping('loss',
+                                                         mode='auto_',
+                                                         patience=1,
+                                                         verbose=1,
+                                                         min_delta=0,
+                                                         baseline=None,
+                                                         save_best_model=True)
+            callbacks_3 = paddle.callbacks.EarlyStopping('acc_',
+                                                         mode='max',
+                                                         patience=1,
+                                                         verbose=1,
+                                                         min_delta=0,
+                                                         baseline=0,
+                                                         save_best_model=True)
             model.fit(
                 train_dataset,
                 val_dataset,
diff --git a/python/paddle/tests/test_callback_reduce_lr_on_plateau.py b/python/paddle/tests/test_callback_reduce_lr_on_plateau.py
index d7680537f378b..e841a3f2fa57d 100644
--- a/python/paddle/tests/test_callback_reduce_lr_on_plateau.py
+++ b/python/paddle/tests/test_callback_reduce_lr_on_plateau.py
@@ -34,24 +34,27 @@
 
 # Accelerate unittest
 class CustomMnist(MNIST):
+
     def __len__(self):
         return 8
 
 
 class TestReduceLROnPlateau(unittest.TestCase):
+
     def func_reduce_lr_on_plateau(self):
         transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
         train_dataset = CustomMnist(mode='train', transform=transform)
         val_dataset = CustomMnist(mode='test', transform=transform)
         net = LeNet()
-        optim = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=net.parameters())
+        optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                      parameters=net.parameters())
         inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
         model = Model(net, inputs=inputs, labels=labels)
         model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
-        callbacks = paddle.callbacks.ReduceLROnPlateau(
-            patience=1, verbose=1, cooldown=1)
+        callbacks = paddle.callbacks.ReduceLROnPlateau(patience=1,
+                                                       verbose=1,
+                                                       cooldown=1)
         model.fit(train_dataset,
                   val_dataset,
                   batch_size=8,
@@ -75,14 +78,15 @@ def func_warn_or_error(self):
         train_dataset = CustomMnist(mode='train', transform=transform)
         val_dataset = CustomMnist(mode='test', transform=transform)
         net = LeNet()
-        optim = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=net.parameters())
+        optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                      parameters=net.parameters())
         inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
         model = Model(net, inputs=inputs, labels=labels)
         model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
-        callbacks = paddle.callbacks.ReduceLROnPlateau(
-            monitor='miou', patience=3, verbose=1)
+        callbacks = paddle.callbacks.ReduceLROnPlateau(monitor='miou',
+                                                       patience=3,
+                                                       verbose=1)
         model.fit(train_dataset,
                   val_dataset,
                   batch_size=8,
@@ -97,8 +101,11 @@ def func_warn_or_error(self):
             parameters=net.parameters())
 
         model.prepare(optim, loss=CrossEntropyLoss(), metrics=[Accuracy()])
-        callbacks = paddle.callbacks.ReduceLROnPlateau(
-            monitor='acc', mode='max', patience=3, verbose=1, cooldown=1)
+        callbacks = paddle.callbacks.ReduceLROnPlateau(monitor='acc',
+                                                       mode='max',
+                                                       patience=3,
+                                                       verbose=1,
+                                                       cooldown=1)
         model.fit(train_dataset,
                   val_dataset,
                   batch_size=8,
diff --git a/python/paddle/tests/test_callback_visualdl.py b/python/paddle/tests/test_callback_visualdl.py
index 355e88edd2bec..e62c045d60157 100644
--- a/python/paddle/tests/test_callback_visualdl.py
+++ b/python/paddle/tests/test_callback_visualdl.py
@@ -33,11 +33,13 @@
 
 
 class MnistDataset(MNIST):
+
     def __len__(self):
         return 512
 
 
 class TestCallbacks(unittest.TestCase):
+
     def setUp(self):
         self.save_dir = tempfile.mkdtemp()
 
@@ -60,10 +62,9 @@ def func_visualdl_callback(self):
         model = paddle.Model(net, inputs, labels)
 
         optim = paddle.optimizer.Adam(0.001, parameters=net.parameters())
-        model.prepare(
-            optimizer=optim,
-            loss=paddle.nn.CrossEntropyLoss(),
-            metrics=paddle.metric.Accuracy())
+        model.prepare(optimizer=optim,
+                      loss=paddle.nn.CrossEntropyLoss(),
+                      metrics=paddle.metric.Accuracy())
 
         callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir')
         model.fit(train_dataset,
diff --git a/python/paddle/tests/test_callbacks.py b/python/paddle/tests/test_callbacks.py
index 2c81549bab94c..b0b23b0efa382 100644
--- a/python/paddle/tests/test_callbacks.py
+++ b/python/paddle/tests/test_callbacks.py
@@ -32,6 +32,7 @@
 
 
 class MnistDataset(MNIST):
+
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
         self.return_label = return_label
@@ -51,6 +52,7 @@ def __len__(self):
 
 
 class TestCallbacks(unittest.TestCase):
+
     def setUp(self):
         self.save_dir = tempfile.mkdtemp()
 
@@ -67,15 +69,14 @@ def run_callback(self):
         lenet = Model(LeNet(), inputs)
         lenet.prepare()
 
-        cbks = config_callbacks(
-            model=lenet,
-            batch_size=128,
-            epochs=epochs,
-            steps=steps,
-            log_freq=freq,
-            verbose=self.verbose,
-            metrics=['loss', 'acc'],
-            save_dir=self.save_dir)
+        cbks = config_callbacks(model=lenet,
+                                batch_size=128,
+                                epochs=epochs,
+                                steps=steps,
+                                log_freq=freq,
+                                verbose=self.verbose,
+                                metrics=['loss', 'acc'],
+                                save_dir=self.save_dir)
         cbks.on_begin('train')
 
         logs = {'loss': 50.341673, 'acc': 0.00256}
diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py
index abf79fb1e3974..95bf21c4da181 100644
--- a/python/paddle/tests/test_dataset_cifar.py
+++ b/python/paddle/tests/test_dataset_cifar.py
@@ -19,6 +19,7 @@
 
 
 class TestCifar10Train(unittest.TestCase):
+
     def test_main(self):
         cifar = Cifar10(mode='train')
         self.assertTrue(len(cifar) == 50000)
@@ -36,6 +37,7 @@ def test_main(self):
 
 
 class TestCifar10Test(unittest.TestCase):
+
     def test_main(self):
         cifar = Cifar10(mode='test')
         self.assertTrue(len(cifar) == 10000)
@@ -70,6 +72,7 @@ def test_main(self):
 
 
 class TestCifar100Train(unittest.TestCase):
+
     def test_main(self):
         cifar = Cifar100(mode='train')
         self.assertTrue(len(cifar) == 50000)
@@ -87,6 +90,7 @@ def test_main(self):
 
 
 class TestCifar100Test(unittest.TestCase):
+
     def test_main(self):
         cifar = Cifar100(mode='test')
         self.assertTrue(len(cifar) == 10000)
diff --git a/python/paddle/tests/test_dataset_conll05.py b/python/paddle/tests/test_dataset_conll05.py
index 9eb0036718b35..8dc1f56779f71 100644
--- a/python/paddle/tests/test_dataset_conll05.py
+++ b/python/paddle/tests/test_dataset_conll05.py
@@ -20,6 +20,7 @@
 
 
 class TestConll05st(unittest.TestCase):
+
     def test_main(self):
         conll05st = Conll05st()
         self.assertTrue(len(conll05st) == 5267)
diff --git a/python/paddle/tests/test_dataset_imdb.py b/python/paddle/tests/test_dataset_imdb.py
index aed8c387409dc..c70b7fedf3632 100644
--- a/python/paddle/tests/test_dataset_imdb.py
+++ b/python/paddle/tests/test_dataset_imdb.py
@@ -19,6 +19,7 @@
 
 
 class TestImdbTrain(unittest.TestCase):
+
     def test_main(self):
         imdb = Imdb(mode='train')
         self.assertTrue(len(imdb) == 25000)
@@ -33,6 +34,7 @@ def test_main(self):
 
 
 class TestImdbTest(unittest.TestCase):
+
     def test_main(self):
         imdb = Imdb(mode='test')
         self.assertTrue(len(imdb) == 25000)
diff --git a/python/paddle/tests/test_dataset_imikolov.py b/python/paddle/tests/test_dataset_imikolov.py
index 6ffeeda73c362..6379ed11e5daa 100644
--- a/python/paddle/tests/test_dataset_imikolov.py
+++ b/python/paddle/tests/test_dataset_imikolov.py
@@ -19,6 +19,7 @@
 
 
 class TestImikolovTrain(unittest.TestCase):
+
     def test_main(self):
         imikolov = Imikolov(mode='train', data_type='NGRAM', window_size=2)
         self.assertTrue(len(imikolov) == 929589)
@@ -31,6 +32,7 @@ def test_main(self):
 
 
 class TestImikolovTest(unittest.TestCase):
+
     def test_main(self):
         imikolov = Imikolov(mode='test', data_type='NGRAM', window_size=2)
         self.assertTrue(len(imikolov) == 82430)
diff --git a/python/paddle/tests/test_dataset_movielens.py b/python/paddle/tests/test_dataset_movielens.py
index e5c6d8376eed9..78a6211647656 100644
--- a/python/paddle/tests/test_dataset_movielens.py
+++ b/python/paddle/tests/test_dataset_movielens.py
@@ -19,6 +19,7 @@
 
 
 class TestMovielensTrain(unittest.TestCase):
+
     def test_main(self):
         movielens = Movielens(mode='train')
         # movielens dataset random split train/test
@@ -36,6 +37,7 @@ def test_main(self):
 
 
 class TestMovielensTest(unittest.TestCase):
+
     def test_main(self):
         movielens = Movielens(mode='test')
         # movielens dataset random split train/test
diff --git a/python/paddle/tests/test_dataset_uci_housing.py b/python/paddle/tests/test_dataset_uci_housing.py
index bdf960b433687..beff1f71fe6c4 100644
--- a/python/paddle/tests/test_dataset_uci_housing.py
+++ b/python/paddle/tests/test_dataset_uci_housing.py
@@ -23,6 +23,7 @@
 
 
 class TestUCIHousingTrain(unittest.TestCase):
+
     def test_main(self):
         uci_housing = UCIHousing(mode='train')
         self.assertTrue(len(uci_housing) == 404)
@@ -39,6 +40,7 @@ def test_main(self):
 
 
 class TestUCIHousingTest(unittest.TestCase):
+
     def test_main(self):
         uci_housing = UCIHousing(mode='test')
         self.assertTrue(len(uci_housing) == 102)
@@ -55,6 +57,7 @@ def test_main(self):
 
 
 class TestWMT14Train(unittest.TestCase):
+
     def test_main(self):
         wmt14 = WMT14(mode='train', dict_size=50)
         self.assertTrue(len(wmt14) == 191155)
@@ -70,6 +73,7 @@ def test_main(self):
 
 
 class TestWMT14Test(unittest.TestCase):
+
     def test_main(self):
         wmt14 = WMT14(mode='test', dict_size=50)
         self.assertTrue(len(wmt14) == 5957)
@@ -85,6 +89,7 @@ def test_main(self):
 
 
 class TestWMT14Gen(unittest.TestCase):
+
     def test_main(self):
         wmt14 = WMT14(mode='gen', dict_size=50)
         self.assertTrue(len(wmt14) == 3001)
diff --git a/python/paddle/tests/test_dataset_voc.py b/python/paddle/tests/test_dataset_voc.py
index 6ca2a8e184ca3..f52abab2ca643 100644
--- a/python/paddle/tests/test_dataset_voc.py
+++ b/python/paddle/tests/test_dataset_voc.py
@@ -24,6 +24,7 @@
 
 
 class TestVOC2012Train(unittest.TestCase):
+
     def test_main(self):
         voc2012 = VOC2012(mode='train')
         self.assertTrue(len(voc2012) == 3)
@@ -40,6 +41,7 @@ def test_main(self):
 
 
 class TestVOC2012Valid(unittest.TestCase):
+
     def test_main(self):
         voc2012 = VOC2012(mode='valid')
         self.assertTrue(len(voc2012) == 1)
@@ -56,6 +58,7 @@ def test_main(self):
 
 
 class TestVOC2012Test(unittest.TestCase):
+
     def test_main(self):
         voc2012 = VOC2012(mode='test')
         self.assertTrue(len(voc2012) == 2)
diff --git a/python/paddle/tests/test_dataset_wmt.py b/python/paddle/tests/test_dataset_wmt.py
index 3e63090c9f0ff..48186ab1864aa 100644
--- a/python/paddle/tests/test_dataset_wmt.py
+++ b/python/paddle/tests/test_dataset_wmt.py
@@ -19,6 +19,7 @@
 
 
 class TestWMT14Train(unittest.TestCase):
+
     def test_main(self):
         wmt14 = WMT14(mode='train', dict_size=50)
         self.assertTrue(len(wmt14) == 191155)
@@ -34,6 +35,7 @@ def test_main(self):
 
 
 class TestWMT14Test(unittest.TestCase):
+
     def test_main(self):
         wmt14 = WMT14(mode='test', dict_size=50)
         self.assertTrue(len(wmt14) == 5957)
@@ -49,6 +51,7 @@ def test_main(self):
 
 
 class TestWMT14Gen(unittest.TestCase):
+
     def test_main(self):
         wmt14 = WMT14(mode='gen', dict_size=50)
         self.assertTrue(len(wmt14) == 3001)
@@ -64,9 +67,12 @@ def test_main(self):
 
 
 class TestWMT16Train(unittest.TestCase):
+
     def test_main(self):
-        wmt16 = WMT16(
-            mode='train', src_dict_size=50, trg_dict_size=50, lang='en')
+        wmt16 = WMT16(mode='train',
+                      src_dict_size=50,
+                      trg_dict_size=50,
+                      lang='en')
         self.assertTrue(len(wmt16) == 29000)
 
         # traversal whole dataset may cost a
@@ -80,9 +86,12 @@ def test_main(self):
 
 
 class TestWMT16Test(unittest.TestCase):
+
     def test_main(self):
-        wmt16 = WMT16(
-            mode='test', src_dict_size=50, trg_dict_size=50, lang='en')
+        wmt16 = WMT16(mode='test',
+                      src_dict_size=50,
+                      trg_dict_size=50,
+                      lang='en')
         self.assertTrue(len(wmt16) == 1000)
 
         # traversal whole dataset may cost a
@@ -96,6 +105,7 @@ def test_main(self):
 
 
 class TestWMT16Val(unittest.TestCase):
+
     def test_main(self):
         wmt16 = WMT16(mode='val', src_dict_size=50, trg_dict_size=50, lang='en')
         self.assertTrue(len(wmt16) == 1014)
diff --git a/python/paddle/tests/test_datasets.py b/python/paddle/tests/test_datasets.py
index be26dff6c0426..ae55377dfdf26 100644
--- a/python/paddle/tests/test_datasets.py
+++ b/python/paddle/tests/test_datasets.py
@@ -26,6 +26,7 @@
 
 
 class TestFolderDatasets(unittest.TestCase):
+
     def setUp(self):
         self.data_dir = tempfile.mkdtemp()
         self.empty_dir = tempfile.mkdtemp()
@@ -76,6 +77,7 @@ def test_folder(self):
         self.func_test_folder()
 
     def func_test_transform(self):
+
         def fake_transform(img):
             return img
 
@@ -110,6 +112,7 @@ def test_errors(self):
 
 
 class TestMNISTTest(unittest.TestCase):
+
     def func_test_main(self):
         transform = T.Transpose()
         mnist = MNIST(mode='test', transform=transform)
@@ -130,6 +133,7 @@ def test_main(self):
 
 
 class TestMNISTTrain(unittest.TestCase):
+
     def func_test_main(self):
         transform = T.Transpose()
         mnist = MNIST(mode='train', transform=transform)
@@ -166,6 +170,7 @@ def test_main(self):
 
 
 class TestFASHIONMNISTTest(unittest.TestCase):
+
     def func_test_main(self):
         transform = T.Transpose()
         mnist = FashionMNIST(mode='test', transform=transform)
@@ -186,6 +191,7 @@ def test_main(self):
 
 
 class TestFASHIONMNISTTrain(unittest.TestCase):
+
     def func_test_main(self):
         transform = T.Transpose()
         mnist = FashionMNIST(mode='train', transform=transform)
@@ -234,6 +240,7 @@ def test_dataset_value(self):
 
 
 class TestFlowersTrain(unittest.TestCase):
+
     def func_test_main(self):
         flowers = Flowers(mode='train')
         self.assertTrue(len(flowers) == 6149)
@@ -254,6 +261,7 @@ def test_main(self):
 
 
 class TestFlowersValid(unittest.TestCase):
+
     def func_test_main(self):
         flowers = Flowers(mode='valid')
         self.assertTrue(len(flowers) == 1020)
@@ -274,6 +282,7 @@ def test_main(self):
 
 
 class TestFlowersTest(unittest.TestCase):
+
     def func_test_main(self):
         flowers = Flowers(mode='test')
         self.assertTrue(len(flowers) == 1020)
diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py
index 006800d3caeee..895d2bc0c478a 100644
--- a/python/paddle/tests/test_dist_hapi_model.py
+++ b/python/paddle/tests/test_dist_hapi_model.py
@@ -103,6 +103,7 @@ def start_local_trainers(cluster,
 
 
 class TestMultipleGpus(unittest.TestCase):
+
     def run_mnist_2gpu(self, target_file_name, eager_mode=True):
         if fluid.core.get_cuda_device_count() == 0:
             return
@@ -113,12 +114,11 @@ def run_mnist_2gpu(self, target_file_name, eager_mode=True):
 
         cluster, pod = get_cluster_from_args(selected_gpus)
 
-        procs = start_local_trainers(
-            cluster,
-            pod,
-            eager_mode=eager_mode,
-            training_script=target_file_name,
-            training_script_args=[])
+        procs = start_local_trainers(cluster,
+                                     pod,
+                                     eager_mode=eager_mode,
+                                     training_script=target_file_name,
+                                     training_script_args=[])
 
         while True:
             alive = watch_local_trainers(procs, cluster.trainers_nranks())
diff --git a/python/paddle/tests/test_dlpack.py b/python/paddle/tests/test_dlpack.py
index 458efd047de68..5ca49a09fe87b 100644
--- a/python/paddle/tests/test_dlpack.py
+++ b/python/paddle/tests/test_dlpack.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class TestDLPack(unittest.TestCase):
+
     def func_test_dlpack_dygraph(self):
         paddle.disable_static()
         tensor = paddle.to_tensor(np.array([1, 2, 3, 4]).astype('int'))
@@ -33,9 +34,8 @@ def func_test_dlpack_dygraph(self):
         else:
             self.assertTrue(isinstance(out_from_dlpack, paddle.Tensor))
         self.assertTrue(
-            np.array_equal(
-                np.array(out_from_dlpack), np.array([1, 2, 3, 4]).astype(
-                    'int')))
+            np.array_equal(np.array(out_from_dlpack),
+                           np.array([1, 2, 3, 4]).astype('int')))
 
     def test_dlpack_dygraph(self):
         with _test_eager_guard():
@@ -65,9 +65,8 @@ def test_dlpack_static(self):
         out_from_dlpack = paddle.utils.dlpack.from_dlpack(dlpack)
         self.assertTrue(isinstance(out_from_dlpack, fluid.core.Tensor))
         self.assertTrue(
-            np.array_equal(
-                np.array(out_from_dlpack),
-                np.array([[1], [2], [3], [4]]).astype('int')))
+            np.array_equal(np.array(out_from_dlpack),
+                           np.array([[1], [2], [3], [4]]).astype('int')))
 
         # when build with cuda
         if core.is_compiled_with_cuda():
@@ -78,9 +77,8 @@ def test_dlpack_static(self):
             gout_from_dlpack = paddle.utils.dlpack.from_dlpack(gdlpack)
             self.assertTrue(isinstance(gout_from_dlpack, fluid.core.Tensor))
             self.assertTrue(
-                np.array_equal(
-                    np.array(gout_from_dlpack),
-                    np.array([[1], [2], [3], [4]]).astype('int')))
+                np.array_equal(np.array(gout_from_dlpack),
+                               np.array([[1], [2], [3], [4]]).astype('int')))
 
     def func_test_dlpack_dtype_conversion(self):
         paddle.disable_static()
@@ -120,6 +118,7 @@ def test_dlpack_dtype_conversion(self):
 
 
 class TestRaiseError(unittest.TestCase):
+
     def func_test_from_dlpack_raise_type_error(self):
         self.assertRaises(TypeError, paddle.utils.dlpack.from_dlpack,
                           np.zeros(5))
diff --git a/python/paddle/tests/test_download.py b/python/paddle/tests/test_download.py
index 49e76d9416e69..3e6fcc5429780 100644
--- a/python/paddle/tests/test_download.py
+++ b/python/paddle/tests/test_download.py
@@ -20,6 +20,7 @@
 
 
 class TestDownload(unittest.TestCase):
+
     def download(self, url, md5sum):
         get_weights_path_from_url(url, md5sum)
 
@@ -106,7 +107,8 @@ def test_retry_exception(self, ):
             from paddle.utils.download import _download
             _download(
                 'www.baidu.com',
-                './test', )
+                './test',
+            )
 
     def test_wget_download_error(self, ):
         with self.assertRaises(RuntimeError):
@@ -131,7 +133,8 @@ def test_download_methods(self, ):
                 _download(
                     url,
                     path='./test',
-                    method=method, )
+                    method=method,
+                )
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/tests/test_hapi_amp.py b/python/paddle/tests/test_hapi_amp.py
index d17b6f3594713..eaf10dbfc4c75 100644
--- a/python/paddle/tests/test_hapi_amp.py
+++ b/python/paddle/tests/test_hapi_amp.py
@@ -16,6 +16,7 @@
 from __future__ import print_function
 
 import os
+
 os.environ['FLAGS_cudnn_deterministic'] = '1'
 
 import unittest
@@ -36,17 +37,17 @@
 @unittest.skipIf(not fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestHapiWithAmp(unittest.TestCase):
+
     def get_model(self, amp_config):
         net = LeNet()
         inputs = InputSpec([None, 1, 28, 28], "float32", 'x')
         labels = InputSpec([None, 1], "int64", "y")
         model = Model(net, inputs, labels)
-        optim = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=model.parameters())
-        model.prepare(
-            optimizer=optim,
-            loss=CrossEntropyLoss(reduction="sum"),
-            amp_configs=amp_config)
+        optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                      parameters=model.parameters())
+        model.prepare(optimizer=optim,
+                      loss=CrossEntropyLoss(reduction="sum"),
+                      amp_configs=amp_config)
         return model
 
     def run_model(self, model):
@@ -82,7 +83,9 @@ def test_amp(self):
         self.run_amp(amp_config)
 
     def test_fp32(self):
-        amp_config = {"level": "O0", }
+        amp_config = {
+            "level": "O0",
+        }
         self.run_amp(amp_config)
 
     def test_save_load(self):
@@ -122,9 +125,11 @@ def test_save_load(self):
         self.assertEqual(new_model._scaler.state_dict()['decr_count'],
                          model._scaler.state_dict()['decr_count'])
         self.assertTrue(
-            np.array_equal(new_model._optimizer.state_dict(
-            )['conv2d_1.w_0_moment1_0'].numpy(
-            ), model._optimizer.state_dict()['conv2d_1.w_0_moment1_0'].numpy()))
+            np.array_equal(
+                new_model._optimizer.state_dict()
+                ['conv2d_1.w_0_moment1_0'].numpy(),
+                model._optimizer.state_dict()
+                ['conv2d_1.w_0_moment1_0'].numpy()))
 
     def test_dynamic_check_input(self):
         paddle.disable_static()
@@ -147,21 +152,21 @@ def test_dynamic_check_input(self):
         paddle.set_device('gpu')
         net = LeNet()
         model = Model(net)
-        optim = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=model.parameters())
+        optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                      parameters=model.parameters())
         loss = CrossEntropyLoss(reduction="sum")
         with self.assertRaises(ValueError):
             for amp_configs in amp_configs_list:
-                model.prepare(
-                    optimizer=optim, loss=loss, amp_configs=amp_configs)
+                model.prepare(optimizer=optim,
+                              loss=loss,
+                              amp_configs=amp_configs)
         model.prepare(optimizer=optim, loss=loss, amp_configs="O2")
-        model.prepare(
-            optimizer=optim,
-            loss=loss,
-            amp_configs={
-                "custom_white_list": {"matmul"},
-                "init_loss_scaling": 1.0
-            })
+        model.prepare(optimizer=optim,
+                      loss=loss,
+                      amp_configs={
+                          "custom_white_list": {"matmul"},
+                          "init_loss_scaling": 1.0
+                      })
 
     def test_static_check_input(self):
         paddle.enable_static()
@@ -175,8 +180,8 @@ def test_static_check_input(self):
         labels = InputSpec([None, 1], "int64", "y")
         model = Model(net, inputs, labels)
 
-        optim = paddle.optimizer.Adam(
-            learning_rate=0.001, parameters=model.parameters())
+        optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                      parameters=model.parameters())
         loss = CrossEntropyLoss(reduction="sum")
         with self.assertRaises(ValueError):
             model.prepare(optimizer=optim, loss=loss, amp_configs=amp_configs)
diff --git a/python/paddle/tests/test_hapi_hub.py b/python/paddle/tests/test_hapi_hub.py
index 06000d6c83367..3ebe69d01c739 100644
--- a/python/paddle/tests/test_hapi_hub.py
+++ b/python/paddle/tests/test_hapi_hub.py
@@ -25,42 +25,44 @@
 
 
 class TestHub(unittest.TestCase):
+
     def setUp(self, ):
         self.local_repo = os.path.dirname(os.path.abspath(__file__))
         self.github_repo = 'lyuwenyu/paddlehub_demo:main'
 
     def testLoad(self, ):
-        model = hub.load(
-            self.local_repo, model='MM', source='local', out_channels=8)
+        model = hub.load(self.local_repo,
+                         model='MM',
+                         source='local',
+                         out_channels=8)
 
         data = paddle.rand((1, 3, 100, 100))
         out = model(data)
         np.testing.assert_equal(out.shape, [1, 8, 50, 50])
 
-        model = hub.load(
-            self.github_repo, model='MM', source='github', force_reload=True)
-
-        model = hub.load(
-            self.github_repo,
-            model='MM',
-            source='github',
-            force_reload=False,
-            pretrained=False)
-
-        model = hub.load(
-            self.github_repo.split(':')[0],
-            model='MM',
-            source='github',
-            force_reload=False,
-            pretrained=False)
-
-        model = hub.load(
-            self.github_repo,
-            model='MM',
-            source='github',
-            force_reload=False,
-            pretrained=True,
-            out_channels=8)
+        model = hub.load(self.github_repo,
+                         model='MM',
+                         source='github',
+                         force_reload=True)
+
+        model = hub.load(self.github_repo,
+                         model='MM',
+                         source='github',
+                         force_reload=False,
+                         pretrained=False)
+
+        model = hub.load(self.github_repo.split(':')[0],
+                         model='MM',
+                         source='github',
+                         force_reload=False,
+                         pretrained=False)
+
+        model = hub.load(self.github_repo,
+                         model='MM',
+                         source='github',
+                         force_reload=False,
+                         pretrained=True,
+                         out_channels=8)
 
         data = paddle.ones((1, 3, 2, 2))
         out = model(data)
@@ -70,17 +72,21 @@ def testLoad(self, ):
             0.37345418
         ])
         np.testing.assert_equal(out.shape, [1, 8, 1, 1])
-        np.testing.assert_almost_equal(
-            out.numpy(), gt.reshape(1, 8, 1, 1), decimal=5)
+        np.testing.assert_almost_equal(out.numpy(),
+                                       gt.reshape(1, 8, 1, 1),
+                                       decimal=5)
 
     def testHelp(self, ):
         docs1 = hub.help(
             self.local_repo,
             model='MM',
-            source='local', )
+            source='local',
+        )
 
-        docs2 = hub.help(
-            self.github_repo, model='MM', source='github', force_reload=False)
+        docs2 = hub.help(self.github_repo,
+                         model='MM',
+                         source='github',
+                         force_reload=False)
 
         assert docs1 == docs2 == 'This is a test demo for paddle hub\n    ', ''
 
@@ -88,44 +94,46 @@ def testList(self, ):
         models1 = hub.list(
             self.local_repo,
             source='local',
-            force_reload=False, )
+            force_reload=False,
+        )
 
         models2 = hub.list(
             self.github_repo,
             source='github',
-            force_reload=False, )
+            force_reload=False,
+        )
 
         assert models1 == models2 == ['MM'], ''
 
     def testExcept(self, ):
         with self.assertRaises(ValueError):
-            _ = hub.help(
-                self.github_repo,
-                model='MM',
-                source='github-test',
-                force_reload=False)
+            _ = hub.help(self.github_repo,
+                         model='MM',
+                         source='github-test',
+                         force_reload=False)
 
         with self.assertRaises(ValueError):
-            _ = hub.load(
-                self.github_repo,
-                model='MM',
-                source='github-test',
-                force_reload=False)
+            _ = hub.load(self.github_repo,
+                         model='MM',
+                         source='github-test',
+                         force_reload=False)
 
         with self.assertRaises(ValueError):
-            _ = hub.list(
-                self.github_repo, source='github-test', force_reload=False)
+            _ = hub.list(self.github_repo,
+                         source='github-test',
+                         force_reload=False)
 
         with self.assertRaises(ValueError):
-            _ = hub.load(
-                self.local_repo, model=123, source='local', force_reload=False)
+            _ = hub.load(self.local_repo,
+                         model=123,
+                         source='local',
+                         force_reload=False)
 
         with self.assertRaises(RuntimeError):
-            _ = hub.load(
-                self.local_repo,
-                model='123',
-                source='local',
-                force_reload=False)
+            _ = hub.load(self.local_repo,
+                         model='123',
+                         source='local',
+                         force_reload=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/tests/test_hapi_hub_model.py b/python/paddle/tests/test_hapi_hub_model.py
index 774c7f6f33a65..e058a6e39aaf3 100644
--- a/python/paddle/tests/test_hapi_hub_model.py
+++ b/python/paddle/tests/test_hapi_hub_model.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,6 +18,7 @@
 
 
 class MM(nn.Layer):
+
     def __init__(self, out_channels):
         super(MM, self).__init__()
         self.conv = nn.Conv2D(3, out_channels, 3, 2, 1)
diff --git a/python/paddle/tests/test_logger.py b/python/paddle/tests/test_logger.py
index b6edec8674a64..c42775938dcc5 100644
--- a/python/paddle/tests/test_logger.py
+++ b/python/paddle/tests/test_logger.py
@@ -25,6 +25,7 @@
 
 
 class TestSetupLogger(unittest.TestCase):
+
     def setUp(self):
         self.save_dir = tempfile.mkdtemp()
         self.save_file = os.path.join(self.save_dir, 'logger.txt')
diff --git a/python/paddle/tests/test_metrics.py b/python/paddle/tests/test_metrics.py
index 0cf52b35e444b..5b1d5a7f19586 100644
--- a/python/paddle/tests/test_metrics.py
+++ b/python/paddle/tests/test_metrics.py
@@ -59,6 +59,7 @@ def convert_to_one_hot(y, C):
 
 
 class TestAccuracy(unittest.TestCase):
+
     def test_acc(self, squeeze_y=False):
         x = paddle.to_tensor(
             np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2],
@@ -126,6 +127,7 @@ def test_one_hot(self):
 
 
 class TestAccuracyDynamic(unittest.TestCase):
+
     def setUp(self):
         self.topk = (1, )
         self.class_num = 5
@@ -153,7 +155,7 @@ def test_main(self):
                 label_var = paddle.to_tensor(label)
                 pred_var = paddle.to_tensor(pred)
                 state = to_list(acc.compute(pred_var, label_var))
-                acc.update(* [s.numpy() for s in state])
+                acc.update(*[s.numpy() for s in state])
                 res_m = acc.accumulate()
                 res_f = accuracy(pred, label, self.topk)
                 assert np.all(np.isclose(np.array(res_m, dtype='float64'),
@@ -165,6 +167,7 @@ def test_main(self):
 
 
 class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic):
+
     def setUp(self):
         self.topk = (1, 5)
         self.class_num = 10
@@ -174,6 +177,7 @@ def setUp(self):
 
 
 class TestAccuracyStatic(TestAccuracyDynamic):
+
     def setUp(self):
         self.topk = (1, )
         self.class_num = 5
@@ -189,8 +193,9 @@ def test_main(self):
         main_prog.random_seed = 1024
         startup_prog.random_seed = 1024
         with fluid.program_guard(main_prog, startup_prog):
-            pred = fluid.data(
-                name='pred', shape=[None, self.class_num], dtype='float32')
+            pred = fluid.data(name='pred',
+                              shape=[None, self.class_num],
+                              dtype='float32')
             label = fluid.data(name='label', shape=[None, 1], dtype='int64')
             acc = paddle.metric.Accuracy(topk=self.topk, name=self.name)
             state = acc.compute(pred, label)
@@ -201,8 +206,10 @@ def test_main(self):
         for _ in range(10):
             label, pred = self.random_pred_label()
             state_ret = exe.run(compiled_main_prog,
-                                feed={'pred': pred,
-                                      'label': label},
+                                feed={
+                                    'pred': pred,
+                                    'label': label
+                                },
                                 fetch_list=[s.name for s in to_list(state)],
                                 return_numpy=True)
             acc.update(*state_ret)
@@ -218,6 +225,7 @@ def test_main(self):
 
 
 class TestAccuracyStaticMultiTopk(TestAccuracyStatic):
+
     def setUp(self):
         self.topk = (1, 5)
         self.class_num = 10
@@ -227,6 +235,7 @@ def setUp(self):
 
 
 class TestPrecision(unittest.TestCase):
+
     def test_1d(self):
 
         x = np.array([0.1, 0.5, 0.6, 0.7])
@@ -266,6 +275,7 @@ def test_2d(self):
 
 
 class TestRecall(unittest.TestCase):
+
     def test_1d(self):
         x = np.array([0.1, 0.5, 0.6, 0.7])
         y = np.array([1, 0, 1, 1])
@@ -289,6 +299,7 @@ def test_1d(self):
 
 
 class TestAuc(unittest.TestCase):
+
     def test_auc_numpy(self):
         x = np.array([[0.78, 0.22], [0.62, 0.38], [0.55, 0.45], [0.30, 0.70],
                       [0.14, 0.86], [0.59, 0.41], [0.91, 0.08], [0.16, 0.84]])
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index 41de8ae189f85..e93ec67d4812d 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -42,22 +42,19 @@
 
 
 class LeNetDygraph(paddle.nn.Layer):
+
     def __init__(self, num_classes=10):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
-        self.features = Sequential(
-            Conv2D(
-                1, 6, 3, stride=1, padding=1),
-            ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2),
-            Conv2D(
-                6, 16, 5, stride=1, padding=0),
-            ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2))
+        self.features = Sequential(Conv2D(1, 6, 3, stride=1, padding=1), ReLU(),
+                                   paddle.fluid.dygraph.Pool2D(2, 'max', 2),
+                                   Conv2D(6, 16, 5, stride=1, padding=0),
+                                   ReLU(),
+                                   paddle.fluid.dygraph.Pool2D(2, 'max', 2))
 
         if num_classes > 0:
-            self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10))
+            self.fc = Sequential(Linear(400, 120), Linear(120, 84),
+                                 Linear(84, 10))
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -69,6 +66,7 @@ def forward(self, inputs):
 
 
 class ModelInner(paddle.nn.Layer):
+
     def __init__(self):
         super(ModelInner, self).__init__()
         self.fc = paddle.nn.Linear(3, 4)
@@ -79,6 +77,7 @@ def forward(self, x):
 
 
 class ModelOutter(paddle.nn.Layer):
+
     def __init__(self):
         super(ModelOutter, self).__init__()
         self.module1 = ModelInner()
@@ -91,24 +90,22 @@ def forward(self, x):
 
 
 class LeNetListInput(paddle.nn.Layer):
+
     def __init__(self, num_classes=10):
         super(LeNetListInput, self).__init__()
         self.num_classes = num_classes
         self.cov = Conv2D(1, 6, 3, stride=1, padding=1)
         for param in self.cov.parameters():
             param.trainable = False
-        self.features = Sequential(
-            self.cov,
-            ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2),
-            Conv2D(
-                6, 16, 5, stride=1, padding=0),
-            ReLU(),
-            paddle.fluid.dygraph.Pool2D(2, 'max', 2))
+        self.features = Sequential(self.cov, ReLU(),
+                                   paddle.fluid.dygraph.Pool2D(2, 'max', 2),
+                                   Conv2D(6, 16, 5, stride=1, padding=0),
+                                   ReLU(),
+                                   paddle.fluid.dygraph.Pool2D(2, 'max', 2))
 
         if num_classes > 0:
-            self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10))
+            self.fc = Sequential(Linear(400, 120), Linear(120, 84),
+                                 Linear(84, 10))
 
     def forward(self, inputs):
         x = inputs[0]
@@ -121,6 +118,7 @@ def forward(self, inputs):
 
 
 class LeNetDictInput(LeNetDygraph):
+
     def forward(self, inputs):
         x = self.features(inputs['x1'])
 
@@ -131,6 +129,7 @@ def forward(self, inputs):
 
 
 class MnistDataset(MNIST):
+
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
         self.return_label = return_label
@@ -157,8 +156,8 @@ def compute_acc(pred, label):
 
 
 def dynamic_train(model, dataloader):
-    optim = fluid.optimizer.Adam(
-        learning_rate=0.001, parameter_list=model.parameters())
+    optim = fluid.optimizer.Adam(learning_rate=0.001,
+                                 parameter_list=model.parameters())
     model.train()
     for inputs, labels in dataloader:
         outputs = model(inputs)
@@ -176,8 +175,9 @@ def dynamic_evaluate(model, dataloader):
         for inputs, labels in dataloader:
             outputs = model(inputs)
 
-            cnt += (np.argmax(outputs.numpy(), -1)[:, np.newaxis] ==
-                    labels.numpy()).astype('int').sum()
+            cnt += (np.argmax(
+                outputs.numpy(),
+                -1)[:, np.newaxis] == labels.numpy()).astype('int').sum()
 
     return cnt / len(dataloader.dataset)
 
@@ -185,6 +185,7 @@ def dynamic_evaluate(model, dataloader):
 @unittest.skipIf(not fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestModel(unittest.TestCase):
+
     @classmethod
     def setUpClass(cls):
         if not fluid.is_compiled_with_cuda():
@@ -195,15 +196,19 @@ def setUpClass(cls):
         sp_num = 1280
         cls.train_dataset = MnistDataset(mode='train', sample_num=sp_num)
         cls.val_dataset = MnistDataset(mode='test', sample_num=sp_num)
-        cls.test_dataset = MnistDataset(
-            mode='test', return_label=False, sample_num=sp_num)
-
-        cls.train_loader = fluid.io.DataLoader(
-            cls.train_dataset, places=cls.device, batch_size=64)
-        cls.val_loader = fluid.io.DataLoader(
-            cls.val_dataset, places=cls.device, batch_size=64)
-        cls.test_loader = fluid.io.DataLoader(
-            cls.test_dataset, places=cls.device, batch_size=64)
+        cls.test_dataset = MnistDataset(mode='test',
+                                        return_label=False,
+                                        sample_num=sp_num)
+
+        cls.train_loader = fluid.io.DataLoader(cls.train_dataset,
+                                               places=cls.device,
+                                               batch_size=64)
+        cls.val_loader = fluid.io.DataLoader(cls.val_dataset,
+                                             places=cls.device,
+                                             batch_size=64)
+        cls.test_loader = fluid.io.DataLoader(cls.test_dataset,
+                                              places=cls.device,
+                                              batch_size=64)
 
         seed = 333
         paddle.seed(seed)
@@ -276,13 +281,12 @@ def fit(self, dynamic, num_replicas=None, rank=None, num_iters=None):
         paddle.framework.random._manual_program_seed(seed)
 
         net = LeNet()
-        optim_new = fluid.optimizer.Adam(
-            learning_rate=0.001, parameter_list=net.parameters())
+        optim_new = fluid.optimizer.Adam(learning_rate=0.001,
+                                         parameter_list=net.parameters())
         model = Model(net, inputs=self.inputs, labels=self.labels)
-        model.prepare(
-            optim_new,
-            loss=CrossEntropyLoss(reduction="sum"),
-            metrics=Accuracy())
+        model.prepare(optim_new,
+                      loss=CrossEntropyLoss(reduction="sum"),
+                      metrics=Accuracy())
         model.fit(self.train_dataset, batch_size=64, shuffle=False)
 
         result = model.evaluate(self.val_dataset, batch_size=64)
@@ -293,33 +297,30 @@ def fit(self, dynamic, num_replicas=None, rank=None, num_iters=None):
                   shuffle=False,
                   num_iters=num_iters)
 
-        result = model.evaluate(
-            self.val_dataset, batch_size=64, num_iters=num_iters)
-
-        train_sampler = DistributedBatchSampler(
-            self.train_dataset,
-            batch_size=64,
-            shuffle=False,
-            num_replicas=num_replicas,
-            rank=rank)
-        val_sampler = DistributedBatchSampler(
-            self.val_dataset,
-            batch_size=64,
-            shuffle=False,
-            num_replicas=num_replicas,
-            rank=rank)
-
-        train_loader = fluid.io.DataLoader(
-            self.train_dataset,
-            batch_sampler=train_sampler,
-            places=self.device,
-            return_list=True)
-
-        val_loader = fluid.io.DataLoader(
-            self.val_dataset,
-            batch_sampler=val_sampler,
-            places=self.device,
-            return_list=True)
+        result = model.evaluate(self.val_dataset,
+                                batch_size=64,
+                                num_iters=num_iters)
+
+        train_sampler = DistributedBatchSampler(self.train_dataset,
+                                                batch_size=64,
+                                                shuffle=False,
+                                                num_replicas=num_replicas,
+                                                rank=rank)
+        val_sampler = DistributedBatchSampler(self.val_dataset,
+                                              batch_size=64,
+                                              shuffle=False,
+                                              num_replicas=num_replicas,
+                                              rank=rank)
+
+        train_loader = fluid.io.DataLoader(self.train_dataset,
+                                           batch_sampler=train_sampler,
+                                           places=self.device,
+                                           return_list=True)
+
+        val_loader = fluid.io.DataLoader(self.val_dataset,
+                                         batch_sampler=val_sampler,
+                                         places=self.device,
+                                         return_list=True)
 
         model.fit(train_loader, val_loader)
         fluid.disable_dygraph() if dynamic else None
@@ -331,42 +332,37 @@ def fit_with_tuple_input(self, dynamic, num_replicas=None, rank=None):
         paddle.framework.random._manual_program_seed(seed)
 
         net = LeNet()
-        optim_new = fluid.optimizer.Adam(
-            learning_rate=0.001, parameter_list=net.parameters())
+        optim_new = fluid.optimizer.Adam(learning_rate=0.001,
+                                         parameter_list=net.parameters())
         model = Model(net, inputs=tuple(self.inputs), labels=tuple(self.labels))
-        model.prepare(
-            optim_new,
-            loss=CrossEntropyLoss(reduction="sum"),
-            metrics=Accuracy())
+        model.prepare(optim_new,
+                      loss=CrossEntropyLoss(reduction="sum"),
+                      metrics=Accuracy())
         model.fit(self.train_dataset, batch_size=64, shuffle=False)
 
         result = model.evaluate(self.val_dataset, batch_size=64)
         np.testing.assert_allclose(result['acc'], self.acc1)
 
-        train_sampler = DistributedBatchSampler(
-            self.train_dataset,
-            batch_size=64,
-            shuffle=False,
-            num_replicas=num_replicas,
-            rank=rank)
-        val_sampler = DistributedBatchSampler(
-            self.val_dataset,
-            batch_size=64,
-            shuffle=False,
-            num_replicas=num_replicas,
-            rank=rank)
-
-        train_loader = fluid.io.DataLoader(
-            self.train_dataset,
-            batch_sampler=train_sampler,
-            places=self.device,
-            return_list=True)
-
-        val_loader = fluid.io.DataLoader(
-            self.val_dataset,
-            batch_sampler=val_sampler,
-            places=self.device,
-            return_list=True)
+        train_sampler = DistributedBatchSampler(self.train_dataset,
+                                                batch_size=64,
+                                                shuffle=False,
+                                                num_replicas=num_replicas,
+                                                rank=rank)
+        val_sampler = DistributedBatchSampler(self.val_dataset,
+                                              batch_size=64,
+                                              shuffle=False,
+                                              num_replicas=num_replicas,
+                                              rank=rank)
+
+        train_loader = fluid.io.DataLoader(self.train_dataset,
+                                           batch_sampler=train_sampler,
+                                           places=self.device,
+                                           return_list=True)
+
+        val_loader = fluid.io.DataLoader(self.val_dataset,
+                                         batch_sampler=val_sampler,
+                                         places=self.device,
+                                         return_list=True)
 
         model.fit(train_loader, val_loader)
         fluid.disable_dygraph() if dynamic else None
@@ -379,14 +375,14 @@ def evaluate(self, dynamic):
         result = model.evaluate(self.val_dataset, batch_size=64)
         np.testing.assert_allclose(result['acc'], self.acc1)
 
-        sampler = DistributedBatchSampler(
-            self.val_dataset, batch_size=64, shuffle=False)
+        sampler = DistributedBatchSampler(self.val_dataset,
+                                          batch_size=64,
+                                          shuffle=False)
 
-        val_loader = fluid.io.DataLoader(
-            self.val_dataset,
-            batch_sampler=sampler,
-            places=self.device,
-            return_list=True)
+        val_loader = fluid.io.DataLoader(self.val_dataset,
+                                         batch_sampler=sampler,
+                                         places=self.device,
+                                         return_list=True)
 
         model.evaluate(val_loader)
 
@@ -397,21 +393,22 @@ def predict(self, dynamic):
         model = Model(LeNet(), self.inputs)
         model.prepare()
         model.load(self.weight_path)
-        output = model.predict(
-            self.test_dataset, batch_size=64, stack_outputs=True)
+        output = model.predict(self.test_dataset,
+                               batch_size=64,
+                               stack_outputs=True)
         np.testing.assert_equal(output[0].shape[0], len(self.test_dataset))
 
         acc = compute_acc(output[0], self.val_dataset.labels)
         np.testing.assert_allclose(acc, self.acc1)
 
-        sampler = DistributedBatchSampler(
-            self.test_dataset, batch_size=64, shuffle=False)
+        sampler = DistributedBatchSampler(self.test_dataset,
+                                          batch_size=64,
+                                          shuffle=False)
 
-        test_loader = fluid.io.DataLoader(
-            self.test_dataset,
-            batch_sampler=sampler,
-            places=self.device,
-            return_list=True)
+        test_loader = fluid.io.DataLoader(self.test_dataset,
+                                          batch_sampler=sampler,
+                                          places=self.device,
+                                          return_list=True)
 
         model.evaluate(test_loader)
 
@@ -423,19 +420,21 @@ def test_predict_without_inputs(self):
         model.prepare()
         model.load(self.weight_path)
         model._inputs = None
-        output = model.predict(
-            self.test_dataset, batch_size=64, stack_outputs=True)
+        output = model.predict(self.test_dataset,
+                               batch_size=64,
+                               stack_outputs=True)
         np.testing.assert_equal(output[0].shape[0], len(self.test_dataset))
         fluid.disable_dygraph()
 
     def test_summary_gpu(self):
         paddle.disable_static(self.device)
         rnn = paddle.nn.LSTM(16, 32, 2)
-        params_info = paddle.summary(
-            rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))])
+        params_info = paddle.summary(rnn, [(-1, 23, 16),
+                                           ((2, None, 32), (2, -1, 32))])
 
 
 class MyModel(paddle.nn.Layer):
+
     def __init__(self):
         super(MyModel, self).__init__()
         self._fc = Linear(20, 10)
@@ -446,6 +445,7 @@ def forward(self, x):
 
 
 class MyDataset(Dataset):
+
     def __getitem__(self, idx):
         return np.random.random(size=(20,)).astype(np.float32), \
                np.random.randint(0, 10, size=(1,)).astype(np.int64)
@@ -455,6 +455,7 @@ def __len__(self):
 
 
 class TestModelFunction(unittest.TestCase):
+
     def set_seed(self, seed=1024):
         paddle.seed(seed)
         paddle.framework.random._manual_program_seed(seed)
@@ -538,8 +539,8 @@ def test_save_load(self):
             optim = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=net.parameters())
             model = Model(net, inputs, labels)
-            model.prepare(
-                optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
+            model.prepare(optimizer=optim,
+                          loss=CrossEntropyLoss(reduction="sum"))
             model.save(path)
             model.load(path)
             fluid.disable_dygraph() if dynamic else None
@@ -558,14 +559,14 @@ def test_dynamic_load(self):
             inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
             labels = [InputSpec([None, 1], 'int64', 'label')]
             if new_optimizer:
-                optim = paddle.optimizer.Adam(
-                    learning_rate=0.001, parameters=net.parameters())
+                optim = paddle.optimizer.Adam(learning_rate=0.001,
+                                              parameters=net.parameters())
             else:
-                optim = fluid.optimizer.Adam(
-                    learning_rate=0.001, parameter_list=net.parameters())
+                optim = fluid.optimizer.Adam(learning_rate=0.001,
+                                             parameter_list=net.parameters())
             model = Model(net, inputs, labels)
-            model.prepare(
-                optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
+            model.prepare(optimizer=optim,
+                          loss=CrossEntropyLoss(reduction="sum"))
             model.fit(mnist_data, batch_size=64, verbose=0)
             model.save(path)
             model.load(path)
@@ -638,6 +639,7 @@ def test_parameters(self):
             fluid.disable_dygraph() if dynamic else None
 
     def test_summary(self):
+
         def _get_param_from_state_dict(state_dict):
             params = 0
             for k, v in state_dict.items():
@@ -665,6 +667,7 @@ def test_summary_non_tensor(self):
         paddle.summary(ModelOutter(), input_size=(-1, 3))
 
     def test_summary_nlp(self):
+
         def _get_param_from_state_dict(state_dict):
             params = 0
             for k, v in state_dict.items():
@@ -678,8 +681,8 @@ def _get_param_from_state_dict(state_dict):
         paddle.summary(nlp_net, (1, 1, 2))
 
         rnn = paddle.nn.LSTM(16, 32, 2)
-        params_info = paddle.summary(
-            rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))])
+        params_info = paddle.summary(rnn, [(-1, 23, 16),
+                                           ((2, None, 32), (2, -1, 32))])
         gt_params = _get_param_from_state_dict(rnn.state_dict())
         np.testing.assert_allclose(params_info['total_params'], gt_params / 2.0)
 
@@ -748,22 +751,22 @@ def test_dynamic_flops(self):
         def customize_dropout(m, x, y):
             m.total_ops += 0
 
-        paddle.flops(
-            net, [1, 3, 224, 224],
-            custom_ops={paddle.nn.Dropout: customize_dropout},
-            print_detail=True)
+        paddle.flops(net, [1, 3, 224, 224],
+                     custom_ops={paddle.nn.Dropout: customize_dropout},
+                     print_detail=True)
 
     def test_dynamic_flops_with_multiple_outputs(self):
-        net = paddle.nn.MaxPool2D(
-            kernel_size=2, stride=2, padding=0, return_mask=True)
+        net = paddle.nn.MaxPool2D(kernel_size=2,
+                                  stride=2,
+                                  padding=0,
+                                  return_mask=True)
 
         def customize_dropout(m, x, y):
             m.total_ops += 0
 
-        paddle.flops(
-            net, [1, 2, 32, 32],
-            custom_ops={paddle.nn.Dropout: customize_dropout},
-            print_detail=True)
+        paddle.flops(net, [1, 2, 32, 32],
+                     custom_ops={paddle.nn.Dropout: customize_dropout},
+                     print_detail=True)
 
     def test_export_deploy_model(self):
         self.set_seed()
@@ -783,26 +786,28 @@ def test_export_deploy_model(self):
             model = Model(net, inputs)
             model.prepare()
 
-            tensor_img = np.array(
-                np.random.random((1, 1, 28, 28)), dtype=np.float32)
+            tensor_img = np.array(np.random.random((1, 1, 28, 28)),
+                                  dtype=np.float32)
 
             model.save(save_dir, training=False)
             ori_results = model.predict_batch(tensor_img)
             fluid.disable_dygraph() if dynamic else None
 
-            place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
+            place = fluid.CPUPlace(
+            ) if not fluid.is_compiled_with_cuda() else fluid.CUDAPlace(0)
             new_scope = fluid.Scope()
             with fluid.scope_guard(new_scope):
                 exe = fluid.Executor(place)
-                [inference_program, feed_target_names, fetch_targets] = (
-                    paddle.static.io.load_inference_model(
-                        path_prefix=save_dir, executor=exe))
+                [inference_program, feed_target_names,
+                 fetch_targets] = (paddle.static.io.load_inference_model(
+                     path_prefix=save_dir, executor=exe))
                 results = exe.run(inference_program,
                                   feed={feed_target_names[0]: tensor_img},
                                   fetch_list=fetch_targets)
-                np.testing.assert_allclose(
-                    results, ori_results, rtol=1e-5, atol=1e-6)
+                np.testing.assert_allclose(results,
+                                           ori_results,
+                                           rtol=1e-5,
+                                           atol=1e-6)
 
             paddle.enable_static()
 
@@ -821,15 +826,15 @@ def test_dygraph_export_deploy_model_about_inputs(self):
         for initial in ["fit", "train_batch", "eval_batch", "predict_batch"]:
             net = LeNet()
             model = Model(net)
-            optim = fluid.optimizer.Adam(
-                learning_rate=0.001, parameter_list=model.parameters())
-            model.prepare(
-                optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
+            optim = fluid.optimizer.Adam(learning_rate=0.001,
+                                         parameter_list=model.parameters())
+            model.prepare(optimizer=optim,
+                          loss=CrossEntropyLoss(reduction="sum"))
             if initial == "fit":
                 model.fit(mnist_data, batch_size=64, verbose=0)
             else:
-                img = np.array(
-                    np.random.random((1, 1, 28, 28)), dtype=np.float32)
+                img = np.array(np.random.random((1, 1, 28, 28)),
+                               dtype=np.float32)
                 label = np.array(np.random.rand(1, 1), dtype=np.int64)
                 if initial == "train_batch":
                     model.train_batch([img], [label])
@@ -848,8 +853,8 @@ def test_dygraph_export_deploy_model_about_inputs(self):
         net = LeNet()
         inputs = InputSpec([None, 1, 28, 28], 'float32', 'x')
         model = Model(net, inputs)
-        optim = fluid.optimizer.Adam(
-            learning_rate=0.001, parameter_list=model.parameters())
+        optim = fluid.optimizer.Adam(learning_rate=0.001,
+                                     parameter_list=model.parameters())
         model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
         model.save(save_dir, training=False)
         shutil.rmtree(save_dir)
@@ -866,10 +871,9 @@ def test_accumulate(self, ):
 
         for amp_cfg in [None, 'O1']:
             model = Model(net, inputs, labels)
-            model.prepare(
-                optim,
-                loss=CrossEntropyLoss(reduction="sum"),
-                amp_configs=amp_cfg)
+            model.prepare(optim,
+                          loss=CrossEntropyLoss(reduction="sum"),
+                          amp_configs=amp_cfg)
             losses, grads = [], []
             for stat in [False, False, True]:
                 loss, = model.train_batch([data], [label], update=stat)
@@ -878,14 +882,16 @@ def test_accumulate(self, ):
 
             for grad1, grad2, grad3 in zip(*grads):
                 np.testing.assert_almost_equal(grad1 * 2, grad2, decimal=4)
-                np.testing.assert_almost_equal(
-                    grad3, np.zeros_like(grad3), decimal=4)
+                np.testing.assert_almost_equal(grad3,
+                                               np.zeros_like(grad3),
+                                               decimal=4)
 
             np.testing.assert_almost_equal(losses[0], losses[1], decimal=4)
             np.testing.assert_almost_equal(losses[0], losses[2], decimal=4)
 
 
 class TestModelWithLRScheduler(unittest.TestCase):
+
     def test_fit_by_step(self):
         base_lr = 1e-3
         boundaries = [5, 8]
@@ -902,11 +908,10 @@ def make_optimizer(parameters=None):
                 start_lr=base_lr / 5.,
                 end_lr=base_lr,
                 verbose=True)
-            optimizer = paddle.optimizer.Momentum(
-                learning_rate=learning_rate,
-                weight_decay=weight_decay,
-                momentum=momentum,
-                parameters=parameters)
+            optimizer = paddle.optimizer.Momentum(learning_rate=learning_rate,
+                                                  weight_decay=weight_decay,
+                                                  momentum=momentum,
+                                                  parameters=parameters)
             return optimizer
 
         # dynamic test
@@ -958,11 +963,10 @@ def make_optimizer(parameters=None):
                 start_lr=base_lr / 5.,
                 end_lr=base_lr,
                 verbose=True)
-            optimizer = paddle.optimizer.Momentum(
-                learning_rate=learning_rate,
-                weight_decay=weight_decay,
-                momentum=momentum,
-                parameters=parameters)
+            optimizer = paddle.optimizer.Momentum(learning_rate=learning_rate,
+                                                  weight_decay=weight_decay,
+                                                  momentum=momentum,
+                                                  parameters=parameters)
             return optimizer
 
         # dynamic test
@@ -977,8 +981,8 @@ def make_optimizer(parameters=None):
 
         dataset = MyDataset()
 
-        lr_scheduler_callback = paddle.callbacks.LRScheduler(
-            by_step=False, by_epoch=True)
+        lr_scheduler_callback = paddle.callbacks.LRScheduler(by_step=False,
+                                                             by_epoch=True)
 
         model.fit(dataset,
                   dataset,
@@ -1006,8 +1010,8 @@ def make_optimizer(parameters=None):
 
         dataset = MyDataset()
 
-        lr_scheduler_callback = paddle.callbacks.LRScheduler(
-            by_step=False, by_epoch=True)
+        lr_scheduler_callback = paddle.callbacks.LRScheduler(by_step=False,
+                                                             by_epoch=True)
 
         model.fit(dataset,
                   dataset,
@@ -1026,6 +1030,7 @@ def make_optimizer(parameters=None):
 
 
 class TestRaiseError(unittest.TestCase):
+
     def test_input_without_name(self):
         net = MyModel()
         inputs = [InputSpec([None, 10], 'float32')]
@@ -1058,8 +1063,8 @@ def test_save_infer_model_without_file_prefix(self):
         model = Model(net, inputs)
         model.prepare()
         path = ""
-        tensor_img = np.array(
-            np.random.random((1, 1, 28, 28)), dtype=np.float32)
+        tensor_img = np.array(np.random.random((1, 1, 28, 28)),
+                              dtype=np.float32)
         with self.assertRaises(ValueError):
             model.save(path, training=False)
 
diff --git a/python/paddle/tests/test_ops_roi_align.py b/python/paddle/tests/test_ops_roi_align.py
index 4a37831a0ccf2..145f77e846b57 100644
--- a/python/paddle/tests/test_ops_roi_align.py
+++ b/python/paddle/tests/test_ops_roi_align.py
@@ -20,6 +20,7 @@
 
 
 class TestRoIAlign(unittest.TestCase):
+
     def setUp(self):
         self.data = np.random.rand(1, 256, 32, 32).astype('float32')
         boxes = np.random.rand(3, 4)
@@ -39,22 +40,27 @@ def roi_align_functional(self, output_size):
             boxes = paddle.to_tensor(self.boxes)
             boxes_num = paddle.to_tensor(self.boxes_num)
 
-            align_out = roi_align(
-                data, boxes, boxes_num=boxes_num, output_size=output_size)
+            align_out = roi_align(data,
+                                  boxes,
+                                  boxes_num=boxes_num,
+                                  output_size=output_size)
             np.testing.assert_equal(align_out.shape, output_shape)
 
         else:
-            data = paddle.static.data(
-                shape=self.data.shape, dtype=self.data.dtype, name='data')
-            boxes = paddle.static.data(
-                shape=self.boxes.shape, dtype=self.boxes.dtype, name='boxes')
-            boxes_num = paddle.static.data(
-                shape=self.boxes_num.shape,
-                dtype=self.boxes_num.dtype,
-                name='boxes_num')
-
-            align_out = roi_align(
-                data, boxes, boxes_num=boxes_num, output_size=output_size)
+            data = paddle.static.data(shape=self.data.shape,
+                                      dtype=self.data.dtype,
+                                      name='data')
+            boxes = paddle.static.data(shape=self.boxes.shape,
+                                       dtype=self.boxes.dtype,
+                                       name='boxes')
+            boxes_num = paddle.static.data(shape=self.boxes_num.shape,
+                                           dtype=self.boxes_num.dtype,
+                                           name='boxes_num')
+
+            align_out = roi_align(data,
+                                  boxes,
+                                  boxes_num=boxes_num,
+                                  output_size=output_size)
 
             place = paddle.CPUPlace()
             exe = paddle.static.Executor(place)
@@ -90,8 +96,8 @@ def test_RoIAlign(self):
     def test_value(self, ):
         data = np.array([i for i in range(1, 17)]).reshape(1, 1, 4,
                                                            4).astype(np.float32)
-        boxes = np.array(
-            [[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(np.float32)
+        boxes = np.array([[1., 1., 2., 2.], [1.5, 1.5, 3.,
+                                             3.]]).astype(np.float32)
         boxes_num = np.array([2]).astype(np.int32)
         output = np.array([[[[6.]]], [[[9.75]]]], dtype=np.float32)
 
diff --git a/python/paddle/tests/test_ops_roi_pool.py b/python/paddle/tests/test_ops_roi_pool.py
index 3c84a55da1ea6..eaeb785df7d48 100644
--- a/python/paddle/tests/test_ops_roi_pool.py
+++ b/python/paddle/tests/test_ops_roi_pool.py
@@ -20,6 +20,7 @@
 
 
 class TestRoIPool(unittest.TestCase):
+
     def setUp(self):
         self.data = np.random.rand(1, 256, 32, 32).astype('float32')
         boxes = np.random.rand(3, 4)
@@ -40,22 +41,27 @@ def roi_pool_functional(self, output_size):
             boxes = paddle.to_tensor(self.boxes)
             boxes_num = paddle.to_tensor(self.boxes_num)
 
-            pool_out = roi_pool(
-                data, boxes, boxes_num=boxes_num, output_size=output_size)
+            pool_out = roi_pool(data,
+                                boxes,
+                                boxes_num=boxes_num,
+                                output_size=output_size)
             np.testing.assert_equal(pool_out.shape, output_shape)
 
         else:
-            data = paddle.static.data(
-                shape=self.data.shape, dtype=self.data.dtype, name='data')
-            boxes = paddle.static.data(
-                shape=self.boxes.shape, dtype=self.boxes.dtype, name='boxes')
-            boxes_num = paddle.static.data(
-                shape=self.boxes_num.shape,
-                dtype=self.boxes_num.dtype,
-                name='boxes_num')
-
-            pool_out = roi_pool(
-                data, boxes, boxes_num=boxes_num, output_size=output_size)
+            data = paddle.static.data(shape=self.data.shape,
+                                      dtype=self.data.dtype,
+                                      name='data')
+            boxes = paddle.static.data(shape=self.boxes.shape,
+                                       dtype=self.boxes.dtype,
+                                       name='boxes')
+            boxes_num = paddle.static.data(shape=self.boxes_num.shape,
+                                           dtype=self.boxes_num.dtype,
+                                           name='boxes_num')
+
+            pool_out = roi_pool(data,
+                                boxes,
+                                boxes_num=boxes_num,
+                                output_size=output_size)
 
             place = paddle.CPUPlace()
             exe = paddle.static.Executor(place)
@@ -91,8 +97,8 @@ def test_RoIPool(self):
     def test_value(self, ):
         data = np.array([i for i in range(1, 17)]).reshape(1, 1, 4,
                                                            4).astype(np.float32)
-        boxes = np.array(
-            [[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(np.float32)
+        boxes = np.array([[1., 1., 2., 2.], [1.5, 1.5, 3.,
+                                             3.]]).astype(np.float32)
         boxes_num = np.array([2]).astype(np.int32)
         output = np.array([[[[11.]]], [[[16.]]]], dtype=np.float32)
 
diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py
index 4441faee14e02..4c655ce3c1f8f 100644
--- a/python/paddle/tests/test_pretrained_model.py
+++ b/python/paddle/tests/test_pretrained_model.py
@@ -26,6 +26,7 @@
 # test the predicted resutls of static graph and dynamic graph are equal
 # when used pretrained model
 class TestPretrainedModel(unittest.TestCase):
+
     def infer(self, arch):
         path = os.path.join(tempfile.mkdtemp(), '.cache_test_pretrained_model')
         if not os.path.exists(path):
@@ -56,8 +57,8 @@ def infer(self, arch):
 
     def test_models(self):
         # TODO (LielinJiang): when model file cache is ok. add following test back
-        # 'resnet18', 'vgg16', 'alexnet', 'resnext50_32x4d', 'inception_v3', 
-        # 'densenet121', 'googlenet', 'wide_resnet50_2', 'wide_resnet101_2'  
+        # 'resnet18', 'vgg16', 'alexnet', 'resnext50_32x4d', 'inception_v3',
+        # 'densenet121', 'googlenet', 'wide_resnet50_2', 'wide_resnet101_2'
         arches = [
             'mobilenet_v1',
             'mobilenet_v2',
diff --git a/python/paddle/tests/test_progressbar.py b/python/paddle/tests/test_progressbar.py
index a68aee7aa8f89..c42f1e4db0fe5 100644
--- a/python/paddle/tests/test_progressbar.py
+++ b/python/paddle/tests/test_progressbar.py
@@ -21,6 +21,7 @@
 
 
 class TestProgressBar(unittest.TestCase):
+
     def prog_bar(self, num, epoch, width, verbose=1):
         for epoch in range(epoch):
             progbar = ProgressBar(num, verbose=verbose)
diff --git a/python/paddle/tests/test_read_file.py b/python/paddle/tests/test_read_file.py
index fbcba9a6bbf7b..0dad971a7308e 100644
--- a/python/paddle/tests/test_read_file.py
+++ b/python/paddle/tests/test_read_file.py
@@ -23,6 +23,7 @@
 
 
 class TestReadFile(unittest.TestCase):
+
     def setUp(self):
         fake_img = (np.random.random((400, 300, 3)) * 255).astype('uint8')
         cv2.imwrite('fake.jpg', fake_img)
diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py
index e07ac47a0f818..35a0f8edc4843 100644
--- a/python/paddle/tests/test_transforms.py
+++ b/python/paddle/tests/test_transforms.py
@@ -28,6 +28,7 @@
 
 
 class TestTransformsCV2(unittest.TestCase):
+
     def setUp(self):
         self.backend = self.get_backend()
         set_image_backend(self.backend)
@@ -52,8 +53,8 @@ def create_image(self, shape):
         if self.backend == 'cv2':
             return (np.random.rand(*shape) * 255).astype('uint8')
         elif self.backend == 'pil':
-            return Image.fromarray((np.random.rand(*shape) * 255).astype(
-                'uint8'))
+            return Image.fromarray(
+                (np.random.rand(*shape) * 255).astype('uint8'))
 
     def get_shape(self, img):
         if isinstance(img, paddle.Tensor):
@@ -76,11 +77,14 @@ def do_transform(self, trans):
     def test_trans_all(self):
         normalize = transforms.Normalize(
             mean=[123.675, 116.28, 103.53],
-            std=[58.395, 57.120, 57.375], )
+            std=[58.395, 57.120, 57.375],
+        )
         trans = transforms.Compose([
             transforms.RandomResizedCrop(224),
-            transforms.ColorJitter(
-                brightness=0.4, contrast=0.4, saturation=0.4, hue=0.4),
+            transforms.ColorJitter(brightness=0.4,
+                                   contrast=0.4,
+                                   saturation=0.4,
+                                   hue=0.4),
             transforms.RandomHorizontalFlip(),
             transforms.Transpose(),
             normalize,
@@ -126,38 +130,34 @@ def test_color_jitter(self):
     def test_affine(self):
         trans = transforms.Compose([
             transforms.RandomAffine(90),
-            transforms.RandomAffine(
-                [-10, 10], translate=[0.1, 0.3]),
-            transforms.RandomAffine(
-                45, translate=[0.2, 0.2], scale=[0.2, 0.5]),
-            transforms.RandomAffine(
-                10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10]),
-            transforms.RandomAffine(
-                10,
-                translate=[0.5, 0.3],
-                scale=[0.7, 1.3],
-                shear=[-10, 10, 20, 40]),
-            transforms.RandomAffine(
-                10,
-                translate=[0.5, 0.3],
-                scale=[0.7, 1.3],
-                shear=[-10, 10, 20, 40],
-                interpolation='bilinear'),
-            transforms.RandomAffine(
-                10,
-                translate=[0.5, 0.3],
-                scale=[0.7, 1.3],
-                shear=[-10, 10, 20, 40],
-                interpolation='bilinear',
-                fill=114),
-            transforms.RandomAffine(
-                10,
-                translate=[0.5, 0.3],
-                scale=[0.7, 1.3],
-                shear=[-10, 10, 20, 40],
-                interpolation='bilinear',
-                fill=114,
-                center=(60, 80)),
+            transforms.RandomAffine([-10, 10], translate=[0.1, 0.3]),
+            transforms.RandomAffine(45, translate=[0.2, 0.2], scale=[0.2, 0.5]),
+            transforms.RandomAffine(10,
+                                    translate=[0.2, 0.2],
+                                    scale=[0.5, 0.5],
+                                    shear=[-10, 10]),
+            transforms.RandomAffine(10,
+                                    translate=[0.5, 0.3],
+                                    scale=[0.7, 1.3],
+                                    shear=[-10, 10, 20, 40]),
+            transforms.RandomAffine(10,
+                                    translate=[0.5, 0.3],
+                                    scale=[0.7, 1.3],
+                                    shear=[-10, 10, 20, 40],
+                                    interpolation='bilinear'),
+            transforms.RandomAffine(10,
+                                    translate=[0.5, 0.3],
+                                    scale=[0.7, 1.3],
+                                    shear=[-10, 10, 20, 40],
+                                    interpolation='bilinear',
+                                    fill=114),
+            transforms.RandomAffine(10,
+                                    translate=[0.5, 0.3],
+                                    scale=[0.7, 1.3],
+                                    shear=[-10, 10, 20, 40],
+                                    interpolation='bilinear',
+                                    fill=114,
+                                    center=(60, 80)),
         ])
         self.do_transform(trans)
 
@@ -165,18 +165,15 @@ def test_rotate(self):
         trans = transforms.Compose([
             transforms.RandomRotation(90),
             transforms.RandomRotation([-10, 10]),
-            transforms.RandomRotation(
-                45, expand=True),
-            transforms.RandomRotation(
-                10, expand=True, center=(60, 80)),
+            transforms.RandomRotation(45, expand=True),
+            transforms.RandomRotation(10, expand=True, center=(60, 80)),
         ])
         self.do_transform(trans)
 
     def test_perspective(self):
         trans = transforms.Compose([
             transforms.RandomPerspective(prob=1.0),
-            transforms.RandomPerspective(
-                prob=1.0, distortion_scale=0.9),
+            transforms.RandomPerspective(prob=1.0, distortion_scale=0.9),
         ])
         self.do_transform(trans)
 
@@ -214,8 +211,8 @@ def test_random_crop(self):
         trans_random_crop_same = transforms.RandomCrop((140, 160))
         img = trans_random_crop_same(fake_img_crop2)
 
-        trans_random_crop_bigger = transforms.RandomCrop(
-            (180, 200), pad_if_needed=True)
+        trans_random_crop_bigger = transforms.RandomCrop((180, 200),
+                                                         pad_if_needed=True)
         img = trans_random_crop_bigger(img)
 
         trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True)
@@ -223,7 +220,8 @@ def test_random_crop(self):
 
     def test_erase(self):
         trans = transforms.Compose([
-            transforms.RandomErasing(), transforms.RandomErasing(value="random")
+            transforms.RandomErasing(),
+            transforms.RandomErasing(value="random")
         ])
         self.do_transform(trans)
 
@@ -334,24 +332,24 @@ def test_exception(self):
             transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]),
 
         with self.assertRaises(ValueError):
-            transforms.RandomAffine(
-                10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]),
+            transforms.RandomAffine(10,
+                                    translate=[0.2, 0.2],
+                                    scale=[0.5, 0.5],
+                                    shear=[1, 2, 3]),
 
         with self.assertRaises(ValueError):
-            transforms.RandomAffine(
-                10,
-                translate=[0.5, 0.3],
-                scale=[0.7, 1.3],
-                shear=[-10, 10, 0, 20, 40])
+            transforms.RandomAffine(10,
+                                    translate=[0.5, 0.3],
+                                    scale=[0.7, 1.3],
+                                    shear=[-10, 10, 0, 20, 40])
 
         with self.assertRaises(ValueError):
-            transforms.RandomAffine(
-                10,
-                translate=[0.5, 0.3],
-                scale=[0.7, 1.3],
-                shear=[-10, 10, 20, 40],
-                fill=114,
-                center=(1, 2, 3))
+            transforms.RandomAffine(10,
+                                    translate=[0.5, 0.3],
+                                    scale=[0.7, 1.3],
+                                    shear=[-10, 10, 20, 40],
+                                    fill=114,
+                                    center=(1, 2, 3))
 
         with self.assertRaises(ValueError):
             transforms.RandomRotation(-2)
@@ -404,11 +402,13 @@ def test_info(self):
 
 
 class TestTransformsPIL(TestTransformsCV2):
+
     def get_backend(self):
         return 'pil'
 
 
 class TestTransformsTensor(TestTransformsCV2):
+
     def get_backend(self):
         return 'tensor'
 
@@ -426,7 +426,8 @@ def do_transform(self, trans):
     def test_trans_all(self):
         normalize = transforms.Normalize(
             mean=[123.675, 116.28, 103.53],
-            std=[58.395, 57.120, 57.375], )
+            std=[58.395, 57.120, 57.375],
+        )
         trans = transforms.Compose([
             transforms.RandomResizedCrop(224),
             transforms.RandomHorizontalFlip(),
@@ -508,8 +509,8 @@ def test_random_crop(self):
         trans_random_crop_same = transforms.RandomCrop((140, 160))
         img = trans_random_crop_same(fake_img_crop2)
 
-        trans_random_crop_bigger = transforms.RandomCrop(
-            (180, 200), pad_if_needed=True)
+        trans_random_crop_bigger = transforms.RandomCrop((180, 200),
+                                                         pad_if_needed=True)
         img = trans_random_crop_bigger(img)
 
         trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True)
@@ -585,15 +586,16 @@ def test_exception(self):
             transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]),
 
         with self.assertRaises(ValueError):
-            transforms.RandomAffine(
-                10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]),
+            transforms.RandomAffine(10,
+                                    translate=[0.2, 0.2],
+                                    scale=[0.5, 0.5],
+                                    shear=[1, 2, 3]),
 
         with self.assertRaises(ValueError):
-            transforms.RandomAffine(
-                10,
-                translate=[0.5, 0.3],
-                scale=[0.7, 1.3],
-                shear=[-10, 10, 0, 20, 40])
+            transforms.RandomAffine(10,
+                                    translate=[0.5, 0.3],
+                                    scale=[0.7, 1.3],
+                                    shear=[-10, 10, 0, 20, 40])
 
         with self.assertRaises(ValueError):
             transforms.RandomRotation(-2)
@@ -614,13 +616,14 @@ def test_exception(self):
 
 
 class TestFunctional(unittest.TestCase):
+
     def test_errors(self):
         with self.assertRaises(TypeError):
             F.to_tensor(1)
 
         with self.assertRaises(ValueError):
-            fake_img = Image.fromarray((np.random.rand(28, 28, 3) * 255).astype(
-                'uint8'))
+            fake_img = Image.fromarray(
+                (np.random.rand(28, 28, 3) * 255).astype('uint8'))
             F.to_tensor(fake_img, data_format=1)
 
         with self.assertRaises(ValueError):
@@ -632,8 +635,8 @@ def test_errors(self):
             F.resize(fake_img, {1: 1})
 
         with self.assertRaises(TypeError):
-            fake_img = Image.fromarray((np.random.rand(28, 28, 3) * 255).astype(
-                'uint8'))
+            fake_img = Image.fromarray(
+                (np.random.rand(28, 28, 3) * 255).astype('uint8'))
             F.resize(fake_img, '1')
 
         with self.assertRaises(TypeError):
@@ -682,16 +685,18 @@ def test_errors(self):
             F.affine(45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 0, 10])
 
         with self.assertRaises(TypeError):
-            F.affine(
-                45,
-                translate=[0.2, 0.2],
-                scale=0.5,
-                shear=[-10, 10],
-                interpolation=2)
+            F.affine(45,
+                     translate=[0.2, 0.2],
+                     scale=0.5,
+                     shear=[-10, 10],
+                     interpolation=2)
 
         with self.assertRaises(TypeError):
-            F.affine(
-                45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10], center=0)
+            F.affine(45,
+                     translate=[0.2, 0.2],
+                     scale=0.5,
+                     shear=[-10, 10],
+                     center=0)
 
         with self.assertRaises(TypeError):
             F.rotate(1, 0.1)
@@ -715,17 +720,23 @@ def test_normalize(self):
         std = [0.5, 0.5, 0.5]
 
         normalized_img = F.normalize(tensor_img, mean, std)
-        normalized_img_tensor = F.normalize(
-            tensor_img_hwc, mean, std, data_format='HWC')
+        normalized_img_tensor = F.normalize(tensor_img_hwc,
+                                            mean,
+                                            std,
+                                            data_format='HWC')
 
         normalized_img_pil = F.normalize(pil_img, mean, std, data_format='HWC')
-        normalized_img_np = F.normalize(
-            np_img, mean, std, data_format='HWC', to_rgb=False)
-
-        np.testing.assert_almost_equal(
-            np.array(normalized_img_pil), normalized_img_np)
-        np.testing.assert_almost_equal(
-            normalized_img_tensor.numpy(), normalized_img_np, decimal=4)
+        normalized_img_np = F.normalize(np_img,
+                                        mean,
+                                        std,
+                                        data_format='HWC',
+                                        to_rgb=False)
+
+        np.testing.assert_almost_equal(np.array(normalized_img_pil),
+                                       normalized_img_np)
+        np.testing.assert_almost_equal(normalized_img_tensor.numpy(),
+                                       normalized_img_np,
+                                       decimal=4)
 
     def test_center_crop(self):
         np_img = (np.random.rand(28, 24, 3) * 255).astype('uint8')
@@ -738,10 +749,10 @@ def test_center_crop(self):
 
         np.testing.assert_almost_equal(np_cropped_img,
                                        np.array(pil_cropped_img))
-        np.testing.assert_almost_equal(
-            np_cropped_img,
-            tensor_cropped_img.numpy().transpose((1, 2, 0)),
-            decimal=4)
+        np.testing.assert_almost_equal(np_cropped_img,
+                                       tensor_cropped_img.numpy().transpose(
+                                           (1, 2, 0)),
+                                       decimal=4)
 
     def test_color_jitter_sub_function(self):
         np.random.seed(555)
@@ -806,10 +817,10 @@ def test_pad(self):
         tensor_padded_img = F.pad(tensor_img, [1, 2], padding_mode='reflect')
 
         np.testing.assert_almost_equal(np_padded_img, np.array(pil_padded_img))
-        np.testing.assert_almost_equal(
-            np_padded_img,
-            tensor_padded_img.numpy().transpose((1, 2, 0)),
-            decimal=3)
+        np.testing.assert_almost_equal(np_padded_img,
+                                       tensor_padded_img.numpy().transpose(
+                                           (1, 2, 0)),
+                                       decimal=3)
 
         tensor_padded_img = F.pad(tensor_img, 1, padding_mode='reflect')
         tensor_padded_img = F.pad(tensor_img, [1, 2, 1, 2],
@@ -831,14 +842,14 @@ def test_resize(self):
 
         np.testing.assert_almost_equal(np_reseized_img,
                                        np.array(pil_reseized_img))
-        np.testing.assert_almost_equal(
-            np_reseized_img,
-            tensor_reseized_img.numpy().transpose((1, 2, 0)),
-            decimal=3)
-        np.testing.assert_almost_equal(
-            np_reseized_img,
-            tensor_reseized_img2.numpy().transpose((1, 2, 0)),
-            decimal=3)
+        np.testing.assert_almost_equal(np_reseized_img,
+                                       tensor_reseized_img.numpy().transpose(
+                                           (1, 2, 0)),
+                                       decimal=3)
+        np.testing.assert_almost_equal(np_reseized_img,
+                                       tensor_reseized_img2.numpy().transpose(
+                                           (1, 2, 0)),
+                                       decimal=3)
 
         gray_img = (np.zeros([28, 32])).astype('uint8')
         gray_resize_img = F.resize(gray_img, 40)
@@ -852,7 +863,7 @@ def test_to_tensor(self):
 
         np.testing.assert_allclose(np_tensor.numpy(), pil_tensor.numpy())
 
-        # test float dtype 
+        # test float dtype
         float_img = np.random.rand(28, 28)
         float_tensor = F.to_tensor(float_img)
 
@@ -902,9 +913,8 @@ def test_erase(self):
     def test_erase_backward(self):
         img = paddle.randn((3, 14, 14), dtype=np.float32)
         img.stop_gradient = False
-        erased = F.erase(
-            img, 3, 3, 5, 5, paddle.ones(
-                (1, 1, 1), dtype='float32'))
+        erased = F.erase(img, 3, 3, 5, 5, paddle.ones((1, 1, 1),
+                                                      dtype='float32'))
         loss = erased.sum()
         loss.backward()
 
@@ -913,8 +923,8 @@ def test_erase_backward(self):
         np.testing.assert_equal(img.grad.numpy(), expected_grad)
 
     def test_image_load(self):
-        fake_img = Image.fromarray((np.random.random((32, 32, 3)) * 255).astype(
-            'uint8'))
+        fake_img = Image.fromarray((np.random.random(
+            (32, 32, 3)) * 255).astype('uint8'))
 
         path = 'temp.jpg'
         fake_img.save(path)
@@ -936,25 +946,35 @@ def test_affine(self):
         pil_img = Image.fromarray(np_img).convert('RGB')
         tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255
 
-        np.testing.assert_almost_equal(
-            np_img, tensor_img.transpose((1, 2, 0)), decimal=4)
-
-        np_affined_img = F.affine(
-            np_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10])
-        pil_affined_img = F.affine(
-            pil_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10])
-        tensor_affined_img = F.affine(
-            tensor_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10])
+        np.testing.assert_almost_equal(np_img,
+                                       tensor_img.transpose((1, 2, 0)),
+                                       decimal=4)
+
+        np_affined_img = F.affine(np_img,
+                                  45,
+                                  translate=[0.2, 0.2],
+                                  scale=0.5,
+                                  shear=[-10, 10])
+        pil_affined_img = F.affine(pil_img,
+                                   45,
+                                   translate=[0.2, 0.2],
+                                   scale=0.5,
+                                   shear=[-10, 10])
+        tensor_affined_img = F.affine(tensor_img,
+                                      45,
+                                      translate=[0.2, 0.2],
+                                      scale=0.5,
+                                      shear=[-10, 10])
 
         np.testing.assert_equal(np_affined_img.shape,
                                 np.array(pil_affined_img).shape)
         np.testing.assert_equal(np_affined_img.shape,
                                 tensor_affined_img.transpose((1, 2, 0)).shape)
 
-        np.testing.assert_almost_equal(
-            np.array(pil_affined_img),
-            tensor_affined_img.numpy().transpose((1, 2, 0)),
-            decimal=4)
+        np.testing.assert_almost_equal(np.array(pil_affined_img),
+                                       tensor_affined_img.numpy().transpose(
+                                           (1, 2, 0)),
+                                       decimal=4)
 
     def test_rotate(self):
         np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8')
@@ -966,12 +986,11 @@ def test_rotate(self):
 
         rotated_tensor_img1 = F.rotate(tensor_img, 80, expand=True)
 
-        rotated_tensor_img2 = F.rotate(
-            tensor_img,
-            80,
-            interpolation='bilinear',
-            center=(10, 10),
-            expand=False)
+        rotated_tensor_img2 = F.rotate(tensor_img,
+                                       80,
+                                       interpolation='bilinear',
+                                       center=(10, 10),
+                                       expand=False)
 
         np.testing.assert_equal(rotated_np_img.shape,
                                 np.array(rotated_pil_img).shape)
@@ -982,10 +1001,16 @@ def test_rotate1(self):
         np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8')
         pil_img = Image.fromarray(np_img).convert('RGB')
 
-        rotated_np_img = F.rotate(
-            np_img, 80, expand=True, center=[0, 0], fill=[0, 0, 0])
-        rotated_pil_img = F.rotate(
-            pil_img, 80, expand=True, center=[0, 0], fill=[0, 0, 0])
+        rotated_np_img = F.rotate(np_img,
+                                  80,
+                                  expand=True,
+                                  center=[0, 0],
+                                  fill=[0, 0, 0])
+        rotated_pil_img = F.rotate(pil_img,
+                                   80,
+                                   expand=True,
+                                   center=[0, 0],
+                                   fill=[0, 0, 0])
 
         np.testing.assert_equal(rotated_np_img.shape,
                                 np.array(rotated_pil_img).shape)
@@ -995,8 +1020,9 @@ def test_perspective(self):
         pil_img = Image.fromarray(np_img).convert('RGB')
         tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255
 
-        np.testing.assert_almost_equal(
-            np_img, tensor_img.transpose((1, 2, 0)), decimal=4)
+        np.testing.assert_almost_equal(np_img,
+                                       tensor_img.transpose((1, 2, 0)),
+                                       decimal=4)
 
         startpoints = [[0, 0], [13, 0], [13, 15], [0, 15]]
         endpoints = [[3, 2], [12, 3], [10, 14], [2, 15]]
@@ -1008,9 +1034,9 @@ def test_perspective(self):
 
         np.testing.assert_equal(np_perspectived_img.shape,
                                 np.array(pil_perspectived_img).shape)
-        np.testing.assert_equal(np_perspectived_img.shape,
-                                tensor_perspectived_img.transpose(
-                                    (1, 2, 0)).shape)
+        np.testing.assert_equal(
+            np_perspectived_img.shape,
+            tensor_perspectived_img.transpose((1, 2, 0)).shape)
 
         result_pil = np.array(pil_perspectived_img)
         result_tensor = tensor_perspectived_img.numpy().transpose(
@@ -1041,24 +1067,22 @@ def test_erase(batch_tensor):
         def test_affine(batch_tensor):
             input1, input2 = paddle.unbind(batch_tensor, axis=0)
             target_result = paddle.stack([
-                F.affine(
-                    input1,
-                    45,
-                    translate=[0.2, 0.2],
-                    scale=0.5,
-                    shear=[-10, 10]), F.affine(
-                        input2,
-                        45,
-                        translate=[0.2, 0.2],
-                        scale=0.5,
-                        shear=[-10, 10])
+                F.affine(input1,
+                         45,
+                         translate=[0.2, 0.2],
+                         scale=0.5,
+                         shear=[-10, 10]),
+                F.affine(input2,
+                         45,
+                         translate=[0.2, 0.2],
+                         scale=0.5,
+                         shear=[-10, 10])
             ])
-            batch_result = F.affine(
-                batch_tensor,
-                45,
-                translate=[0.2, 0.2],
-                scale=0.5,
-                shear=[-10, 10])
+            batch_result = F.affine(batch_tensor,
+                                    45,
+                                    translate=[0.2, 0.2],
+                                    scale=0.5,
+                                    shear=[-10, 10])
 
             return paddle.allclose(batch_result, target_result)
 
@@ -1095,7 +1119,8 @@ def test_adjust_brightness(batch_tensor):
         def test_adjust_contrast(batch_tensor):
             input1, input2 = paddle.unbind(batch_tensor, axis=0)
             target_result = paddle.stack([
-                F.adjust_contrast(input1, 0.3), F.adjust_contrast(input2, 0.3)
+                F.adjust_contrast(input1, 0.3),
+                F.adjust_contrast(input2, 0.3)
             ])
 
             batch_result = F.adjust_contrast(batch_tensor, 0.3)
@@ -1120,7 +1145,8 @@ def test_adjust_saturation(batch_tensor):
         def test_adjust_hue(batch_tensor):
             input1, input2 = paddle.unbind(batch_tensor, axis=0)
             target_result = paddle.stack(
-                [F.adjust_hue(input1, -0.2), F.adjust_hue(input2, -0.2)])
+                [F.adjust_hue(input1, -0.2),
+                 F.adjust_hue(input2, -0.2)])
 
             batch_result = F.adjust_hue(batch_tensor, -0.2)
 
diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py
index dc98fc3219bff..1f53060beb049 100644
--- a/python/paddle/tests/test_vision_models.py
+++ b/python/paddle/tests/test_vision_models.py
@@ -20,6 +20,7 @@
 
 
 class TestVisonModels(unittest.TestCase):
+
     def models_infer(self, arch, pretrained=False, batch_norm=False):
 
         x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32)
diff --git a/python/paddle/text/__init__.py b/python/paddle/text/__init__.py
index f6bfa1c735855..5775a24785804 100644
--- a/python/paddle/text/__init__.py
+++ b/python/paddle/text/__init__.py
@@ -21,14 +21,7 @@
 from .datasets import WMT14  # noqa: F401
 from .datasets import WMT16  # noqa: F401
 
-__all__ = [ #noqa
-           'Conll05st',
-           'Imdb',
-           'Imikolov',
-           'Movielens',
-           'UCIHousing',
-           'WMT14',
-           'WMT16',
-           'ViterbiDecoder',
-           'viterbi_decode'
+__all__ = [  #noqa
+    'Conll05st', 'Imdb', 'Imikolov', 'Movielens', 'UCIHousing', 'WMT14',
+    'WMT16', 'ViterbiDecoder', 'viterbi_decode'
 ]
diff --git a/python/paddle/text/datasets/conll05.py b/python/paddle/text/datasets/conll05.py
index 88ae5e3d8c6e9..09f54d674fd93 100644
--- a/python/paddle/text/datasets/conll05.py
+++ b/python/paddle/text/datasets/conll05.py
@@ -106,8 +106,9 @@ def __init__(self,
         self.data_file = data_file
         if self.data_file is None:
             assert download, "data_file is not set and downloading automatically is disabled"
-            self.data_file = _check_exists_and_download(
-                data_file, DATA_URL, DATA_MD5, 'conll05st', download)
+            self.data_file = _check_exists_and_download(data_file, DATA_URL,
+                                                        DATA_MD5, 'conll05st',
+                                                        download)
 
         self.word_dict_file = word_dict_file
         if self.word_dict_file is None:
@@ -133,8 +134,9 @@ def __init__(self,
         self.emb_file = emb_file
         if self.emb_file is None:
             assert download, "emb_file is not set and downloading automatically is disabled"
-            self.emb_file = _check_exists_and_download(
-                emb_file, EMB_URL, EMB_MD5, 'conll05st', download)
+            self.emb_file = _check_exists_and_download(emb_file, EMB_URL,
+                                                       EMB_MD5, 'conll05st',
+                                                       download)
 
         self.word_dict = self._load_dict(self.word_dict_file)
         self.predicate_dict = self._load_dict(self.verb_dict_file)
diff --git a/python/paddle/text/datasets/imdb.py b/python/paddle/text/datasets/imdb.py
index f4fe7eb174bb7..dc100795accff 100644
--- a/python/paddle/text/datasets/imdb.py
+++ b/python/paddle/text/datasets/imdb.py
@@ -114,9 +114,10 @@ def _tokenize(self, pattern):
                 if bool(pattern.match(tf.name)):
                     # newline and punctuations removal and ad-hoc tokenization.
                     data.append(
-                        tarf.extractfile(tf).read().rstrip(six.b("\n\r"))
-                        .translate(None, six.b(string.punctuation)).lower(
-                        ).split())
+                        tarf.extractfile(tf).read().rstrip(
+                            six.b("\n\r")).translate(
+                                None,
+                                six.b(string.punctuation)).lower().split())
                 tf = tarf.next()
 
         return data
diff --git a/python/paddle/text/datasets/movielens.py b/python/paddle/text/datasets/movielens.py
index 798a7c590e17b..94ebf6b594d66 100644
--- a/python/paddle/text/datasets/movielens.py
+++ b/python/paddle/text/datasets/movielens.py
@@ -79,8 +79,8 @@ def value(self):
 
     def __str__(self):
         return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
-            self.index, "M"
-            if self.is_male else "F", age_table[self.age], self.job_id)
+            self.index, "M" if self.is_male else "F", age_table[self.age],
+            self.job_id)
 
     def __repr__(self):
         return str(self)
@@ -188,8 +188,10 @@ def _load_meta_info(self):
                     for line in user_file:
                         line = cpt.to_text(line, encoding='latin')
                         uid, gender, age, job, _ = line.strip().split("::")
-                        self.user_info[int(uid)] = UserInfo(
-                            index=uid, gender=gender, age=age, job_id=job)
+                        self.user_info[int(uid)] = UserInfo(index=uid,
+                                                            gender=gender,
+                                                            age=age,
+                                                            job_id=job)
 
     def _load_data(self):
         self.data = []
diff --git a/python/paddle/text/datasets/uci_housing.py b/python/paddle/text/datasets/uci_housing.py
index 597b1e1e8185e..c283aeaf733aa 100644
--- a/python/paddle/text/datasets/uci_housing.py
+++ b/python/paddle/text/datasets/uci_housing.py
@@ -94,8 +94,8 @@ def __init__(self, data_file=None, mode='train', download=True):
     def _load_data(self, feature_num=14, ratio=0.8):
         data = np.fromfile(self.data_file, sep=' ')
         data = data.reshape(data.shape[0] // feature_num, feature_num)
-        maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
-            axis=0) / data.shape[0]
+        maximums, minimums, avgs = data.max(axis=0), data.min(
+            axis=0), data.sum(axis=0) / data.shape[0]
         for i in six.moves.range(feature_num - 1):
             data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
         offset = int(data.shape[0] * ratio)
diff --git a/python/paddle/text/datasets/wmt14.py b/python/paddle/text/datasets/wmt14.py
index a6d49d70ab3e3..133c304a02a51 100644
--- a/python/paddle/text/datasets/wmt14.py
+++ b/python/paddle/text/datasets/wmt14.py
@@ -99,8 +99,9 @@ def __init__(self,
         self.data_file = data_file
         if self.data_file is None:
             assert download, "data_file is not set and downloading automatically is disabled"
-            self.data_file = _check_exists_and_download(
-                data_file, URL_TRAIN, MD5_TRAIN, 'wmt14', download)
+            self.data_file = _check_exists_and_download(data_file, URL_TRAIN,
+                                                        MD5_TRAIN, 'wmt14',
+                                                        download)
 
         # read dataset into memory
         assert dict_size > 0, "dict_size should be set as positive number"
@@ -108,6 +109,7 @@ def __init__(self,
         self._load_data()
 
     def _load_data(self):
+
         def __to_dict(fd, size):
             out_dict = dict()
             for line_count, line in enumerate(fd):
diff --git a/python/paddle/text/datasets/wmt16.py b/python/paddle/text/datasets/wmt16.py
index 5e88023a49d80..ee2245ae4fed5 100644
--- a/python/paddle/text/datasets/wmt16.py
+++ b/python/paddle/text/datasets/wmt16.py
@@ -120,16 +120,17 @@ def __init__(self,
         self.data_file = data_file
         if self.data_file is None:
             assert download, "data_file is not set and downloading automatically is disabled"
-            self.data_file = _check_exists_and_download(
-                data_file, DATA_URL, DATA_MD5, 'wmt16', download)
+            self.data_file = _check_exists_and_download(data_file, DATA_URL,
+                                                        DATA_MD5, 'wmt16',
+                                                        download)
 
         self.lang = lang
         assert src_dict_size > 0, "dict_size should be set as positive number"
         assert trg_dict_size > 0, "dict_size should be set as positive number"
-        self.src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if lang == "en"
-                                                 else TOTAL_DE_WORDS))
-        self.trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if lang == "en"
-                                                 else TOTAL_EN_WORDS))
+        self.src_dict_size = min(
+            src_dict_size, (TOTAL_EN_WORDS if lang == "en" else TOTAL_DE_WORDS))
+        self.trg_dict_size = min(
+            trg_dict_size, (TOTAL_DE_WORDS if lang == "en" else TOTAL_EN_WORDS))
 
         # load source and target word dict
         self.src_dict = self._load_dict(lang, src_dict_size)
@@ -173,10 +174,9 @@ def _build_dict(self, dict_path, dict_size, lang):
             fout.write(
                 cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)))
             for idx, word in enumerate(
-                    sorted(
-                        six.iteritems(word_dict),
-                        key=lambda x: x[1],
-                        reverse=True)):
+                    sorted(six.iteritems(word_dict),
+                           key=lambda x: x[1],
+                           reverse=True)):
                 if idx + 3 == dict_size: break
                 fout.write(cpt.to_bytes(word[0]))
                 fout.write(cpt.to_bytes('\n'))
diff --git a/python/paddle/text/viterbi_decode.py b/python/paddle/text/viterbi_decode.py
index ce5667b134a03..cf6bdd04c2692 100644
--- a/python/paddle/text/viterbi_decode.py
+++ b/python/paddle/text/viterbi_decode.py
@@ -75,16 +75,17 @@ def viterbi_decode(potentials,
     attrs = {'include_bos_eos_tag': include_bos_eos_tag}
     scores = helper.create_variable_for_type_inference(potentials.dtype)
     path = helper.create_variable_for_type_inference('int64')
-    helper.append_op(
-        type='viterbi_decode',
-        inputs={
-            'Input': potentials,
-            'Transition': transition_params,
-            'Length': lengths
-        },
-        outputs={'Scores': scores,
-                 'Path': path},
-        attrs=attrs)
+    helper.append_op(type='viterbi_decode',
+                     inputs={
+                         'Input': potentials,
+                         'Transition': transition_params,
+                         'Length': lengths
+                     },
+                     outputs={
+                         'Scores': scores,
+                         'Path': path
+                     },
+                     attrs=attrs)
     return scores, path
 
 
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 44865940adb44..8ed4832a8f751 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -1030,6 +1030,17 @@
     data_type : x
   backward : index_select_grad
 
+- api : instance_norm
+  args : (Tensor x, Tensor scale, Tensor bias, float epsilon)
+  output : Tensor(y), Tensor(saved_mean), Tensor(saved_variance)
+  infer_meta :
+    func : InstanceNormInferMeta
+  kernel :
+    func : instance_norm
+    data_type : x
+  optional : scale, bias
+  backward : instance_norm_grad
+
 # is_empty
 - api : is_empty
   args : (Tensor x)
diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py
index 1f19dec992d2f..41e0d2de5c08f 100644
--- a/python/paddle/utils/code_gen/api_base.py
+++ b/python/paddle/utils/code_gen/api_base.py
@@ -19,6 +19,7 @@
 
 
 class BaseAPI(object):
+
     def __init__(self, api_item_yaml):
         self.api = self.get_api_name(api_item_yaml)
 
@@ -41,12 +42,12 @@ def __init__(self, api_item_yaml):
             self.invoke = api_item_yaml['invoke']
         else:
             if 'infer_meta' in api_item_yaml:
-                self.infer_meta = self.parse_infer_meta(api_item_yaml[
-                    'infer_meta'])
+                self.infer_meta = self.parse_infer_meta(
+                    api_item_yaml['infer_meta'])
             self.kernel = self.parse_kernel(api_item_yaml['kernel'])
-            self.support_selected_rows_kernel = False if len(self.kernel[
-                'func']) == 1 or not self.kernel['func'][1].endswith(
-                    '_sr') else True
+            self.support_selected_rows_kernel = False if len(
+                self.kernel['func']
+            ) == 1 or not self.kernel['func'][1].endswith('_sr') else True
             self.data_transform = self.parse_data_transform(api_item_yaml)
             self.inplace_map, self.view_map = {}, {}
 
@@ -65,8 +66,9 @@ def get_input_tensor_args(self, inplace_flag=False):
         for name in self.inputs['names']:
             name = name.split('@')[0]
             if inplace_flag and name in self.inplace_map.values():
-                input_args.append(inplace_type_map[self.inputs['input_info'][
-                    name]] + ' ' + name)
+                input_args.append(
+                    inplace_type_map[self.inputs['input_info'][name]] + ' ' +
+                    name)
             else:
                 input_args.append(self.inputs['input_info'][name] + ' ' + name)
         return input_args
@@ -95,8 +97,9 @@ def parse_args(self, api_name, api_item_yaml):
             optional_vars = [
                 item.strip() for item in api_item_yaml['optional'].split(',')
             ]
-        inputs, attrs = self.parse_input_and_attr(
-            api_name, api_item_yaml['args'], optional_vars)
+        inputs, attrs = self.parse_input_and_attr(api_name,
+                                                  api_item_yaml['args'],
+                                                  optional_vars)
         output_type_list, output_names, out_size_expr = self.parse_output(
             api_name, api_item_yaml['output'])
         return inputs, attrs, {
@@ -199,6 +202,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]):
         return inputs, attrs
 
     def parse_output(self, api_name, output_config):
+
         def parse_output_item(output_item):
             output_type_map = {
                 'Tensor': 'Tensor',
@@ -526,7 +530,8 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str:
 
     def get_kernel_args(self, code_indent):
         input_trans_map = {
-            'const Tensor&': 'const phi::DenseTensor&',
+            'const Tensor&':
+            'const phi::DenseTensor&',
             'const std::vector<Tensor>&':
             'const std::vector<const phi::DenseTensor*>&',
             'const paddle::optional<Tensor&>':
@@ -617,8 +622,8 @@ def get_kernel_args(self, code_indent):
                     kernel_args_type_list.append('const phi::Scalar&')
                     param = 'phi::Scalar(' + param + ')'
                 else:
-                    kernel_args_type_list.append(self.attrs['attr_info'][param][
-                        0])
+                    kernel_args_type_list.append(
+                        self.attrs['attr_info'][param][0])
                 kernel_args = kernel_args + param + ", "
             elif isinstance(param, bool):
                 kernel_args = kernel_args + str(param).lower() + ", "
@@ -634,7 +639,8 @@ def get_kernel_args(self, code_indent):
 
     def get_selected_rows_kernel_args(self, code_indent):
         input_trans_map = {
-            'const Tensor&': 'const phi::SelectedRows&',
+            'const Tensor&':
+            'const phi::SelectedRows&',
             'const paddle::optional<Tensor>&':
             'const paddle::optional<phi::SelectedRows>&'
         }
@@ -682,8 +688,8 @@ def get_selected_rows_kernel_args(self, code_indent):
                     kernel_args_type_list.append('const phi::Scalar&')
                     param = 'phi::Scalar(' + param + ')'
                 else:
-                    kernel_args_type_list.append(self.attrs['attr_info'][param][
-                        0])
+                    kernel_args_type_list.append(
+                        self.attrs['attr_info'][param][0])
                 kernel_args = kernel_args + param + ", "
             elif isinstance(param, bool):
                 kernel_args = kernel_args + str(param).lower() + ", "
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 1721da19295d5..7548c047ff552 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -26,6 +26,7 @@
 
 
 class ForwardAPI(BaseAPI):
+
     def __init__(self, api_item_yaml):
         super(ForwardAPI, self).__init__(api_item_yaml)
         self.is_dygraph_api, self.intermediate_outs = self.parse_intermediate(
@@ -131,9 +132,9 @@ def gene_output(self,
         if len(output_type_list) == 1:
             kernel_output = 'kernel_out'
             output_names.append('kernel_out')
-            inplace_assign = " = " + self.inplace_map[self.outputs['names'][
-                0]] if inplace_flag and self.outputs['names'][
-                    0] in self.inplace_map else ""
+            inplace_assign = " = " + self.inplace_map[
+                self.outputs['names'][0]] if inplace_flag and self.outputs[
+                    'names'][0] in self.inplace_map else ""
             output_create = f"""
 {code_indent}  {return_type} api_output{inplace_assign};"""
 
@@ -287,21 +288,18 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
 def main():
     parser = argparse.ArgumentParser(
         description='Generate PaddlePaddle C++ API files')
-    parser.add_argument(
-        '--api_yaml_path',
-        help='path to api yaml file',
-        nargs='+',
-        default='python/paddle/utils/code_gen/api.yaml')
-
-    parser.add_argument(
-        '--api_header_path',
-        help='output of generated api header code file',
-        default='paddle/phi/api/include/api.h')
-
-    parser.add_argument(
-        '--api_source_path',
-        help='output of generated api source code file',
-        default='paddle/phi/api/lib/api.cc')
+    parser.add_argument('--api_yaml_path',
+                        help='path to api yaml file',
+                        nargs='+',
+                        default='python/paddle/utils/code_gen/api.yaml')
+
+    parser.add_argument('--api_header_path',
+                        help='output of generated api header code file',
+                        default='paddle/phi/api/include/api.h')
+
+    parser.add_argument('--api_source_path',
+                        help='output of generated api source code file',
+                        default='paddle/phi/api/lib/api.cc')
 
     options = parser.parse_args()
 
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index d6c148e6ca925..6a555fd24a066 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -927,6 +927,29 @@
     data_type : x
   no_need_buffer : x
 
+- backward_api : instance_norm_double_grad
+  forward : instance_norm_grad(Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, float epsilon) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias)
+  args : (Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float epsilon)
+  output : Tensor(x_grad), Tensor(fwd_scale_grad), Tensor(grad_y_grad)
+  infer_meta :
+    func : InstanceNormDoubleGradInferMeta
+  kernel :
+    func : instance_norm_double_grad
+    data_type : x
+  optional : fwd_scale, grad_x_grad, grad_scale_grad, grad_bias_grad
+
+- backward_api : instance_norm_grad
+  forward : instance_norm(Tensor x, Tensor scale, Tensor bias, float epsilon) -> Tensor(y), Tensor(saved_mean), Tensor(saved_variance)
+  args : (Tensor x, Tensor scale, Tensor saved_mean, Tensor saved_variance, Tensor y_grad, float epsilon)
+  output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad)
+  infer_meta :
+    func : InstanceNormGradInferMeta
+  kernel :
+    func : instance_norm_grad
+    data_type : x
+  optional : scale
+  backward : instance_norm_double_grad
+
 - backward_api : kldiv_loss_grad
   forward : kldiv_loss(Tensor x, Tensor label, str reduction) -> Tensor(out)
   args : (Tensor x, Tensor label, Tensor out_grad, str reduction)
diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py
index 886748eeb290e..48bff2d1d3f95 100644
--- a/python/paddle/utils/code_gen/backward_api_gen.py
+++ b/python/paddle/utils/code_gen/backward_api_gen.py
@@ -21,6 +21,7 @@
 
 
 class BackwardAPI(BaseAPI):
+
     def __init__(self, backward_item_yaml):
         super(BackwardAPI, self).__init__(backward_item_yaml)
         self.check_args(backward_item_yaml['forward'])
@@ -271,20 +272,17 @@ def generate_backward_api(backward_yaml_path, header_file_path,
 def main():
     parser = argparse.ArgumentParser(
         description='Generate PaddlePaddle C++ backward API files')
-    parser.add_argument(
-        '--backward_yaml_path',
-        help='path to backward yaml file',
-        nargs='+',
-        default='python/paddle/utils/code_gen/backward.yaml')
-    parser.add_argument(
-        '--backward_header_path',
-        help='output of generated backward header code file',
-        default='paddle/phi/api/backward/backward_api.h')
-
-    parser.add_argument(
-        '--backward_source_path',
-        help='output of generated backward source code file',
-        default='paddle/phi/api/lib/backward_api.cc')
+    parser.add_argument('--backward_yaml_path',
+                        help='path to backward yaml file',
+                        nargs='+',
+                        default='python/paddle/utils/code_gen/backward.yaml')
+    parser.add_argument('--backward_header_path',
+                        help='output of generated backward header code file',
+                        default='paddle/phi/api/backward/backward_api.h')
+
+    parser.add_argument('--backward_source_path',
+                        help='output of generated backward source code file',
+                        default='paddle/phi/api/lib/backward_api.cc')
 
     options = parser.parse_args()
 
diff --git a/python/paddle/utils/code_gen/cross_validate.py b/python/paddle/utils/code_gen/cross_validate.py
index 30fbf2e0a7d42..3eca85e8ff405 100644
--- a/python/paddle/utils/code_gen/cross_validate.py
+++ b/python/paddle/utils/code_gen/cross_validate.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,18 +35,16 @@ def main(forward_api_yaml_paths, backward_api_yaml_paths):
     current_dir = Path(__file__).parent / "temp"
     parser = argparse.ArgumentParser(
         description="Parse api yaml into canonical format.")
-    parser.add_argument(
-        '--forward_yaml_paths',
-        type=str,
-        nargs='+',
-        default=str(current_dir / "api.parsed.yaml"),
-        help="forward api yaml file.")
-    parser.add_argument(
-        '--backward_yaml_paths',
-        type=str,
-        nargs='+',
-        default=str(current_dir / "backward.yaml.yaml"),
-        help="backward api yaml file.")
+    parser.add_argument('--forward_yaml_paths',
+                        type=str,
+                        nargs='+',
+                        default=str(current_dir / "api.parsed.yaml"),
+                        help="forward api yaml file.")
+    parser.add_argument('--backward_yaml_paths',
+                        type=str,
+                        nargs='+',
+                        default=str(current_dir / "backward.yaml.yaml"),
+                        help="backward api yaml file.")
 
     args = parser.parse_args()
     main(args.forward_yaml_paths, args.backward_yaml_paths)
diff --git a/python/paddle/utils/code_gen/filters.py b/python/paddle/utils/code_gen/filters.py
index d37403adcba36..832685f83e64c 100644
--- a/python/paddle/utils/code_gen/filters.py
+++ b/python/paddle/utils/code_gen/filters.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/utils/code_gen/generate_op.py b/python/paddle/utils/code_gen/generate_op.py
index 0b314e4a11cb3..adaae66b979bd 100644
--- a/python/paddle/utils/code_gen/generate_op.py
+++ b/python/paddle/utils/code_gen/generate_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,13 +26,12 @@
 from parse_utils import to_named_dict
 
 file_loader = FileSystemLoader(Path(__file__).parent / "templates")
-env = Environment(
-    loader=file_loader,
-    keep_trailing_newline=True,
-    trim_blocks=True,
-    lstrip_blocks=True,
-    undefined=StrictUndefined,
-    extensions=['jinja2.ext.do'])
+env = Environment(loader=file_loader,
+                  keep_trailing_newline=True,
+                  trim_blocks=True,
+                  lstrip_blocks=True,
+                  undefined=StrictUndefined,
+                  extensions=['jinja2.ext.do'])
 env.filters["to_op_attr_type"] = to_op_attr_type
 env.filters["to_opmaker_name"] = to_opmaker_name
 env.filters["to_pascal_case"] = to_pascal_case
@@ -82,8 +81,9 @@ def main(api_yaml_path, backward_yaml_path, output_op_path,
 
     op_template = env.get_template('op.c.j2')
     with open(output_op_path, "wt") as f:
-        msg = op_template.render(
-            apis=apis, backward_apis=backward_apis, api_dict=api_dict)
+        msg = op_template.render(apis=apis,
+                                 backward_apis=backward_apis,
+                                 api_dict=api_dict)
         f.write(msg)
 
     ks_template = env.get_template('ks.c.j2')
@@ -95,14 +95,15 @@ def main(api_yaml_path, backward_yaml_path, output_op_path,
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Generate operator file from api yaml.")
-    parser.add_argument(
-        '--api_yaml_path', type=str, help="parsed api yaml file.")
-    parser.add_argument(
-        '--backward_api_yaml_path',
-        type=str,
-        help="parsed backward api yaml file.")
-    parser.add_argument(
-        "--output_op_path", type=str, help="path to save generated operators.")
+    parser.add_argument('--api_yaml_path',
+                        type=str,
+                        help="parsed api yaml file.")
+    parser.add_argument('--backward_api_yaml_path',
+                        type=str,
+                        help="parsed backward api yaml file.")
+    parser.add_argument("--output_op_path",
+                        type=str,
+                        help="path to save generated operators.")
     parser.add_argument(
         "--output_arg_map_path",
         type=str,
diff --git a/python/paddle/utils/code_gen/intermediate_api_gen.py b/python/paddle/utils/code_gen/intermediate_api_gen.py
index 4e4875b596192..25e28d6af6e9d 100644
--- a/python/paddle/utils/code_gen/intermediate_api_gen.py
+++ b/python/paddle/utils/code_gen/intermediate_api_gen.py
@@ -132,26 +132,22 @@ def generate_intermediate_api(api_yaml_path, sparse_api_yaml_path,
 def main():
     parser = argparse.ArgumentParser(
         description='Generate PaddlePaddle C++ Sparse API files')
-    parser.add_argument(
-        '--api_yaml_path',
-        nargs='+',
-        help='path to api yaml file',
-        default='python/paddle/utils/code_gen/api.yaml')
-
-    parser.add_argument(
-        '--sparse_api_yaml_path',
-        help='path to sparse api yaml file',
-        default='python/paddle/utils/code_gen/sparse_api.yaml')
-
-    parser.add_argument(
-        '--dygraph_api_header_path',
-        help='output of generated dygraph api header code file',
-        default='paddle/phi/api/lib/dygraph_api.h')
-
-    parser.add_argument(
-        '--dygraph_api_source_path',
-        help='output of generated dygraph api source code file',
-        default='paddle/phi/api/lib/dygraph_api.cc')
+    parser.add_argument('--api_yaml_path',
+                        nargs='+',
+                        help='path to api yaml file',
+                        default='python/paddle/utils/code_gen/api.yaml')
+
+    parser.add_argument('--sparse_api_yaml_path',
+                        help='path to sparse api yaml file',
+                        default='python/paddle/utils/code_gen/sparse_api.yaml')
+
+    parser.add_argument('--dygraph_api_header_path',
+                        help='output of generated dygraph api header code file',
+                        default='paddle/phi/api/lib/dygraph_api.h')
+
+    parser.add_argument('--dygraph_api_source_path',
+                        help='output of generated dygraph api source code file',
+                        default='paddle/phi/api/lib/dygraph_api.cc')
 
     options = parser.parse_args()
 
diff --git a/python/paddle/utils/code_gen/parse_api.py b/python/paddle/utils/code_gen/parse_api.py
index 63dc314d2e31e..fcaf365951ee6 100644
--- a/python/paddle/utils/code_gen/parse_api.py
+++ b/python/paddle/utils/code_gen/parse_api.py
@@ -39,8 +39,9 @@ def main(api_yaml_path, output_path, backward):
     parser = argparse.ArgumentParser(
         description="Parse api yaml into canonical format.")
     parser.add_argument('--api_yaml_path', type=str, help="api yaml file.")
-    parser.add_argument(
-        "--output_path", type=str, help="path to save parsed yaml file.")
+    parser.add_argument("--output_path",
+                        type=str,
+                        help="path to save parsed yaml file.")
     parser.add_argument("--backward", action="store_true", default=False)
 
     args = parser.parse_args()
diff --git a/python/paddle/utils/code_gen/parse_utils.py b/python/paddle/utils/code_gen/parse_utils.py
index 8168328012ec5..11a0b49eeefc3 100644
--- a/python/paddle/utils/code_gen/parse_utils.py
+++ b/python/paddle/utils/code_gen/parse_utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -149,8 +149,8 @@ def parse_plain_list(s: str, sep=",") -> List[str]:
     return items
 
 
-def parse_kernel(api_name: str,
-                 kernel_config: Dict[str, Any]) -> Dict[str, Any]:
+def parse_kernel(api_name: str, kernel_config: Dict[str,
+                                                    Any]) -> Dict[str, Any]:
     # kernel :
     #    func : [], Kernel functions (example: scale, scale_sr)
     #    param : [], Input params of kernel
diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py
index bd73032e179db..4c2f453e533e5 100644
--- a/python/paddle/utils/code_gen/sparse_api_gen.py
+++ b/python/paddle/utils/code_gen/sparse_api_gen.py
@@ -21,6 +21,7 @@
 
 
 class SparseAPI(ForwardAPI):
+
     def __init__(self, api_item_yaml):
         super(SparseAPI, self).__init__(api_item_yaml)
 
@@ -95,8 +96,10 @@ def gene_output(self,
 
     def gen_sparse_kernel_context(self, kernel_output_names):
         input_trans_map = {
-            'const Tensor&': 'const phi::TenseBase&',
-            'const std::vector<Tensor>&': 'const std::vector<phi::TenseBase>&',
+            'const Tensor&':
+            'const phi::TenseBase&',
+            'const std::vector<Tensor>&':
+            'const std::vector<phi::TenseBase>&',
             'const paddle::optional<Tensor>&':
             'paddle::optional<const phi::TenseBase&>'
         }
@@ -150,8 +153,8 @@ def gen_sparse_kernel_code(self, kernel_name, inplace_flag=False):
 
         kernel_context_code = self.gen_sparse_kernel_context(
             kernel_output_names)
-        return_code = "" if len(self.gene_return_code(
-        )) == 0 else "  " + self.gene_return_code()
+        return_code = "" if len(
+            self.gene_return_code()) == 0 else "  " + self.gene_return_code()
         return f"""
     VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
     auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
@@ -199,8 +202,8 @@ def gene_base_api_code(self, inplace_flag=False):
             api_func_name += '_'
         kernel_dispatch_code = f"{self.gene_kernel_select()}\n"
         for kernel_name in self.kernel['func']:
-            kernel_dispatch_code += self.gene_dispatch_code(kernel_name,
-                                                            inplace_flag)
+            kernel_dispatch_code += self.gene_dispatch_code(
+                kernel_name, inplace_flag)
 
         return f"""
 PADDLE_API {self.get_return_type()} {api_func_name}({self.get_define_args()}) {{
@@ -285,20 +288,17 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
 def main():
     parser = argparse.ArgumentParser(
         description='Generate PaddlePaddle C++ Sparse API files')
-    parser.add_argument(
-        '--api_yaml_path',
-        help='path to sparse api yaml file',
-        default='python/paddle/utils/code_gen/sparse_api.yaml')
-
-    parser.add_argument(
-        '--api_header_path',
-        help='output of generated api header code file',
-        default='paddle/phi/api/include/sparse_api.h')
-
-    parser.add_argument(
-        '--api_source_path',
-        help='output of generated api source code file',
-        default='paddle/phi/api/lib/sparse_api.cc')
+    parser.add_argument('--api_yaml_path',
+                        help='path to sparse api yaml file',
+                        default='python/paddle/utils/code_gen/sparse_api.yaml')
+
+    parser.add_argument('--api_header_path',
+                        help='output of generated api header code file',
+                        default='paddle/phi/api/include/sparse_api.h')
+
+    parser.add_argument('--api_source_path',
+                        help='output of generated api source code file',
+                        default='paddle/phi/api/lib/sparse_api.cc')
 
     options = parser.parse_args()
 
diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py
index cf59726bbb195..3e0abead03642 100644
--- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py
+++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py
@@ -22,6 +22,7 @@
 
 
 class SparseBackwardAPI(SparseAPI, BackwardAPI):
+
     def __init__(self, bw_api_item_yaml):
         BackwardAPI.__init__(self, bw_api_item_yaml)
 
@@ -166,15 +167,13 @@ def main():
         help='path to sparse api yaml file',
         default='python/paddle/utils/code_gen/sparse_bw_api.yaml')
 
-    parser.add_argument(
-        '--api_header_path',
-        help='output of generated api header code file',
-        default='paddle/phi/api/backward/sparse_bw_api.h')
+    parser.add_argument('--api_header_path',
+                        help='output of generated api header code file',
+                        default='paddle/phi/api/backward/sparse_bw_api.h')
 
-    parser.add_argument(
-        '--api_source_path',
-        help='output of generated api source code file',
-        default='paddle/phi/api/lib/sparse_bw_api.cc')
+    parser.add_argument('--api_source_path',
+                        help='output of generated api source code file',
+                        default='paddle/phi/api/lib/sparse_bw_api.cc')
 
     options = parser.parse_args()
 
diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/python/paddle/utils/code_gen/strings_api_gen.py
index d697ce3935708..5b29c6076b443 100644
--- a/python/paddle/utils/code_gen/strings_api_gen.py
+++ b/python/paddle/utils/code_gen/strings_api_gen.py
@@ -18,11 +18,13 @@
 import re
 
 from api_gen import ForwardAPI
+
 PREFIX_TENSOR_NAME = 'input_'
 PREFIX_META_TENSOR_NAME = 'meta_'
 
 
 class StringsAPI(ForwardAPI):
+
     def __init__(self, api_item_yaml):
         super(StringsAPI, self).__init__(api_item_yaml)
 
@@ -99,7 +101,8 @@ def gene_output(self,
 
     def get_kernel_args(self, code_indent):
         input_trans_map = {
-            'const Tensor&': 'const phi::StringTensor&',
+            'const Tensor&':
+            'const phi::StringTensor&',
             'const std::vector<Tensor>&':
             'const std::vector<const phi::StringTensor*>&',
             'const paddle::optional<Tensor>&':
@@ -151,8 +154,8 @@ def get_kernel_args(self, code_indent):
                     kernel_args_type_list.append('const phi::Scalar&')
                     param = 'phi::Scalar(' + param + ')'
                 else:
-                    kernel_args_type_list.append(self.attrs['attr_info'][param][
-                        0])
+                    kernel_args_type_list.append(
+                        self.attrs['attr_info'][param][0])
                 kernel_args = kernel_args + param + ", "
             elif isinstance(param, bool):
                 kernel_args = kernel_args + str(param).lower() + ", "
@@ -351,20 +354,17 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
 def main():
     parser = argparse.ArgumentParser(
         description='Generate PaddlePaddle C++ Strings API files')
-    parser.add_argument(
-        '--api_yaml_path',
-        help='path to sparse api yaml file',
-        default='python/paddle/utils/code_gen/strings_api.yaml')
-
-    parser.add_argument(
-        '--api_header_path',
-        help='output of generated api header code file',
-        default='paddle/phi/api/include/strings_api.h')
-
-    parser.add_argument(
-        '--api_source_path',
-        help='output of generated api source code file',
-        default='paddle/phi/api/lib/strings_api.cc')
+    parser.add_argument('--api_yaml_path',
+                        help='path to sparse api yaml file',
+                        default='python/paddle/utils/code_gen/strings_api.yaml')
+
+    parser.add_argument('--api_header_path',
+                        help='output of generated api header code file',
+                        default='paddle/phi/api/include/strings_api.h')
+
+    parser.add_argument('--api_source_path',
+                        help='output of generated api source code file',
+                        default='paddle/phi/api/lib/strings_api.cc')
 
     options = parser.parse_args()
 
diff --git a/python/paddle/utils/code_gen/tests.py b/python/paddle/utils/code_gen/tests.py
index 453578b5cbd8e..d322fe1885baa 100644
--- a/python/paddle/utils/code_gen/tests.py
+++ b/python/paddle/utils/code_gen/tests.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/python/paddle/utils/code_gen/type_mapping.py b/python/paddle/utils/code_gen/type_mapping.py
index c6e110907a9f7..448a2ab22f3e2 100644
--- a/python/paddle/utils/code_gen/type_mapping.py
+++ b/python/paddle/utils/code_gen/type_mapping.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -105,10 +105,14 @@
 
 #---------------------- phi selected rows------------------------------
 # type mapping to phi, used in implementation
-sr_input_types_map = {'Tensor': 'const phi::SelectedRows&', }
+sr_input_types_map = {
+    'Tensor': 'const phi::SelectedRows&',
+}
 
 sr_optional_input_types_map = {
     'Tensor': 'const paddle::optional<phi::SelectedRows>&',
 }
 
-sr_output_types_map = {'Tensor': 'phi::SelectedRows*', }
+sr_output_types_map = {
+    'Tensor': 'phi::SelectedRows*',
+}
diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
index bf798f9734d53..b41ebfb848730 100644
--- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
+++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
@@ -51,15 +51,16 @@ def gene_wrapped_infermeta_and_register(api):
             for input_name in api.inputs['names']:
                 if input_name in kernel_params:
                     print("type", api.inputs['input_info'])
-                    args.append(tensor_type_map[api.inputs['input_info'][
-                        input_name]] + ' ' + input_name)
+                    args.append(
+                        tensor_type_map[api.inputs['input_info'][input_name]] +
+                        ' ' + input_name)
             for attr_name in api.attrs['names']:
                 if attr_name in kernel_params:
                     args.append(api.attrs['attr_info'][attr_name][0] + ' ' +
                                 attr_name)
             for i, out_type in enumerate(api.outputs['types']):
-                args.append(tensor_type_map[out_type] + ' ' + api.outputs[
-                    'names'][i])
+                args.append(tensor_type_map[out_type] + ' ' +
+                            api.outputs['names'][i])
 
             invoke_param = api.infer_meta['param']
             invoke_param.extend(api.outputs['names'])
@@ -157,11 +158,10 @@ def generate_wrapped_infermeta_and_register(api_yaml_path, header_file_path,
 def main():
     parser = argparse.ArgumentParser(
         description='Generate PaddlePaddle C++ API files')
-    parser.add_argument(
-        '--api_yaml_path',
-        help='path to api yaml file',
-        nargs='+',
-        default='python/paddle/utils/code_gen/api.yaml')
+    parser.add_argument('--api_yaml_path',
+                        help='path to api yaml file',
+                        nargs='+',
+                        default='python/paddle/utils/code_gen/api.yaml')
     parser.add_argument(
         '--wrapped_infermeta_header_path',
         help='output of generated wrapped_infermeta header code file',
diff --git a/python/paddle/utils/cpp_extension/__init__.py b/python/paddle/utils/cpp_extension/__init__.py
index cef2716b7f396..843f78d5c803a 100644
--- a/python/paddle/utils/cpp_extension/__init__.py
+++ b/python/paddle/utils/cpp_extension/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,10 +22,6 @@
 from .extension_utils import get_build_directory  # noqa: F401
 from .extension_utils import load_op_meta_info_and_register_op  # noqa: F401
 
-__all__ = [ #noqa
-        'CppExtension',
-        'CUDAExtension',
-        'load',
-        'setup',
-        'get_build_directory'
+__all__ = [  #noqa
+    'CppExtension', 'CUDAExtension', 'load', 'setup', 'get_build_directory'
 ]
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index 3a7804d9012dd..73642e1a004ca 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -335,6 +335,7 @@ def with_options(cls, **options):
         """
 
         class cls_with_options(cls):
+
             def __init__(self, *args, **kwargs):
                 kwargs.update(options)
                 cls.__init__(self, *args, **kwargs)
@@ -379,8 +380,8 @@ def build_extensions(self):
         # cflags have changed and delete the built shared library to re-compile the source
         # even though source file content keep unchanged.
         so_name = self.get_ext_fullpath(self.extensions[0].name)
-        clean_object_if_change_cflags(
-            os.path.abspath(so_name), self.extensions[0])
+        clean_object_if_change_cflags(os.path.abspath(so_name),
+                                      self.extensions[0])
 
         # Consider .cu, .cu.cc as valid source extensions.
         self.compiler.src_extensions += ['.cu', '.cu.cc']
@@ -448,8 +449,9 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
                     else:
                         cflags.append('-DPADDLE_WITH_CUDA')
 
-                add_std_without_repeat(
-                    cflags, self.compiler.compiler_type, use_std14=True)
+                add_std_without_repeat(cflags,
+                                       self.compiler.compiler_type,
+                                       use_std14=True)
                 original_compile(obj, src, ext, cc_args, cflags, pp_opts)
             finally:
                 # restore original_compiler
@@ -611,12 +613,11 @@ def _valid_clang_compiler(self):
         """
         compiler_infos = ['clang'] + CLANG_COMPILE_FLAGS
         linker_infos = ['clang'] + CLANG_LINK_FLAGS
-        self.compiler.set_executables(
-            compiler=compiler_infos,
-            compiler_so=compiler_infos,
-            compiler_cxx=['clang'],
-            linker_exe=['clang'],
-            linker_so=linker_infos)
+        self.compiler.set_executables(compiler=compiler_infos,
+                                      compiler_so=compiler_infos,
+                                      compiler_cxx=['clang'],
+                                      linker_exe=['clang'],
+                                      linker_so=linker_infos)
 
     def _check_abi(self):
         """
@@ -711,6 +712,7 @@ def with_options(cls, **options):
         """
 
         class cls_with_options(cls):
+
             def __init__(self, *args, **kwargs):
                 kwargs.update(options)
                 cls.__init__(self, *args, **kwargs)
@@ -845,8 +847,9 @@ def load(name,
     ), "Required type(extra_cuda_cflags) == list[str], but received {}".format(
         extra_cuda_cflags)
 
-    log_v("additional extra_cxx_cflags: [{}], extra_cuda_cflags: [{}]".format(
-        ' '.join(extra_cxx_cflags), ' '.join(extra_cuda_cflags)), verbose)
+    log_v(
+        "additional extra_cxx_cflags: [{}], extra_cuda_cflags: [{}]".format(
+            ' '.join(extra_cxx_cflags), ' '.join(extra_cuda_cflags)), verbose)
 
     # write setup.py file and compile it
     build_base_dir = os.path.join(build_directory, name)
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index 41add6e764a8c..62fce3360042b 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -193,8 +193,8 @@ def __bootstrap__():
 
     with open(pyfile, 'w') as f:
         f.write(
-            _stub_template.format(
-                resource=resource, custom_api='\n\n'.join(api_content)))
+            _stub_template.format(resource=resource,
+                                  custom_api='\n\n'.join(api_content)))
 
 
 OpInfo = collections.namedtuple('OpInfo', ['so_name', 'so_path'])
@@ -242,6 +242,7 @@ def last(self):
 
 
 class VersionManager:
+
     def __init__(self, version_field):
         self.version_field = version_field
         self.version = self.hasher(version_field)
@@ -258,8 +259,8 @@ def hasher(self, version_field):
                 md5 = combine_hash(md5, tuple(flat_elem))
             else:
                 raise RuntimeError(
-                    "Support types with list, tuple and dict, but received {} with {}.".
-                    format(type(elem), elem))
+                    "Support types with list, tuple and dict, but received {} with {}."
+                    .format(type(elem), elem))
 
         return md5.hexdigest()
 
@@ -313,8 +314,8 @@ def deserialize(path):
         # delete shared library file if version is changed to re-compile it.
         if so_version is not None and so_version != versioner.version:
             log_v(
-                "Re-Compiling {}, because specified cflags have been changed. New signature {} has been saved into {}.".
-                format(so_name, versioner.version, version_file))
+                "Re-Compiling {}, because specified cflags have been changed. New signature {} has been saved into {}."
+                .format(so_name, versioner.version, version_file))
             os.remove(so_path)
             # update new version information
             new_version_info = versioner.details
@@ -436,8 +437,8 @@ def _reset_so_rpath(so_path):
     if OS_NAME.startswith("darwin"):
         origin_runtime_path = "@loader_path/../libs/"
         rpath = "@rpath/{}".format(_get_core_name())
-        cmd = 'install_name_tool -change {} {} {}'.format(origin_runtime_path,
-                                                          rpath, so_path)
+        cmd = 'install_name_tool -change {} {} {}'.format(
+            origin_runtime_path, rpath, so_path)
 
         run_cmd(cmd)
 
@@ -569,9 +570,9 @@ def create_sym_link_if_not_exist():
             except Exception:
                 warnings.warn(
                     "Failed to create soft symbol link for {}.\n You can run prompt as administrator and execute the "
-                    "following command manually: `mklink {} {}`. Now it will create hard link for {} trickly.".
-                    format(raw_core_name, new_dll_core_path, core_path,
-                           raw_core_name))
+                    "following command manually: `mklink {} {}`. Now it will create hard link for {} trickly."
+                    .format(raw_core_name, new_dll_core_path, core_path,
+                            raw_core_name))
                 run_cmd('mklink /H {} {}'.format(new_dll_core_path, core_path))
         # core_avx or core_noavx with lib suffix
         assert os.path.exists(new_dll_core_path)
@@ -586,8 +587,8 @@ def create_sym_link_if_not_exist():
                 assert os.path.exists(new_lib_core_path)
             except Exception:
                 raise RuntimeError(
-                    "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`".
-                    format(raw_core_name, core_path, new_lib_core_path))
+                    "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`"
+                    .format(raw_core_name, core_path, new_lib_core_path))
 
         # core_avx or core_noavx without suffix
         return raw_core_name[:-3]
@@ -605,8 +606,8 @@ def find_cuda_home():
         which_cmd = 'where' if IS_WINDOWS else 'which'
         try:
             with open(os.devnull, 'w') as devnull:
-                nvcc_path = subprocess.check_output(
-                    [which_cmd, 'nvcc'], stderr=devnull)
+                nvcc_path = subprocess.check_output([which_cmd, 'nvcc'],
+                                                    stderr=devnull)
                 nvcc_path = nvcc_path.decode()
                 # Multi CUDA, select the first
                 nvcc_path = nvcc_path.split('\r\n')[0]
@@ -643,8 +644,8 @@ def find_rocm_home():
         which_cmd = 'where' if IS_WINDOWS else 'which'
         try:
             with open(os.devnull, 'w') as devnull:
-                hipcc_path = subprocess.check_output(
-                    [which_cmd, 'hipcc'], stderr=devnull)
+                hipcc_path = subprocess.check_output([which_cmd, 'hipcc'],
+                                                     stderr=devnull)
                 hipcc_path = hipcc_path.decode()
                 hipcc_path = hipcc_path.rstrip('\r\n')
 
@@ -722,8 +723,8 @@ def find_clang_cpp_include(compiler='clang'):
             if "InstalledDir" in info:
                 v1_path = info.split(':')[-1].strip()
                 if v1_path and os.path.exists(v1_path):
-                    std_v1_includes = os.path.join(
-                        os.path.dirname(v1_path), 'include/c++/v1')
+                    std_v1_includes = os.path.join(os.path.dirname(v1_path),
+                                                   'include/c++/v1')
     except Exception:
         # Just raise warnings because the include dir is not required.
         warnings.warn(
@@ -823,14 +824,15 @@ def get_build_directory(verbose=False):
     root_extensions_directory = os.environ.get('PADDLE_EXTENSION_DIR')
     if root_extensions_directory is None:
         dir_name = "paddle_extensions"
-        root_extensions_directory = os.path.join(
-            os.path.expanduser('~/.cache'), dir_name)
+        root_extensions_directory = os.path.join(os.path.expanduser('~/.cache'),
+                                                 dir_name)
         if IS_WINDOWS:
             root_extensions_directory = os.path.normpath(
                 root_extensions_directory)
 
-        log_v("$PADDLE_EXTENSION_DIR is not set, using path: {} by default.".
-              format(root_extensions_directory), verbose)
+        log_v(
+            "$PADDLE_EXTENSION_DIR is not set, using path: {} by default.".
+            format(root_extensions_directory), verbose)
 
     if not os.path.exists(root_extensions_directory):
         os.makedirs(root_extensions_directory)
@@ -845,8 +847,8 @@ def parse_op_info(op_name):
     """
     if op_name not in OpProtoHolder.instance().op_proto_map:
         raise ValueError(
-            "Please load {} shared library file firstly by `paddle.utils.cpp_extension.load_op_meta_info_and_register_op(...)`".
-            format(op_name))
+            "Please load {} shared library file firstly by `paddle.utils.cpp_extension.load_op_meta_info_and_register_op(...)`"
+            .format(op_name))
     op_proto = OpProtoHolder.instance().get_op_proto(op_name)
 
     in_names = [x.name for x in op_proto.inputs]
@@ -870,8 +872,8 @@ def _import_module_from_library(module_name, build_directory, verbose=False):
         dynamic_suffix = '.so'
     ext_path = os.path.join(build_directory, module_name + dynamic_suffix)
     if not os.path.exists(ext_path):
-        raise FileNotFoundError("Extension path: {} does not exist.".format(
-            ext_path))
+        raise FileNotFoundError(
+            "Extension path: {} does not exist.".format(ext_path))
 
     # load custom op_info and kernels from .so shared library
     log_v('loading shared library from: {}'.format(ext_path), verbose)
@@ -901,7 +903,7 @@ def remove_if_exit(filepath):
                             module_name + '_' + thread_id + '.py')
     log_v("generate api file: {}".format(api_file), verbose)
 
-    # delete the temp file before exit python process    
+    # delete the temp file before exit python process
     atexit.register(lambda: remove_if_exit(api_file))
 
     # write into .py file with RWLockc
@@ -979,8 +981,8 @@ def _load_module_from_file(api_file_path, module_name, verbose=False):
     Load module from python file.
     """
     if not os.path.exists(api_file_path):
-        raise FileNotFoundError("File : {} does not exist.".format(
-            api_file_path))
+        raise FileNotFoundError(
+            "File : {} does not exist.".format(api_file_path))
 
     # Unique readable module name to place custom api.
     log_v('import module from file: {}'.format(api_file_path), verbose)
@@ -1006,12 +1008,14 @@ def _get_api_inputs_str(op_name):
     params_str = ','.join([p.split("@")[0].lower() for p in param_names])
     # e.g: {'X': x, 'Y': y, 'Z': z}
     ins_str = "{%s}" % ','.join([
-        "'{}' : {}".format(in_name, in_name.split("@")[0].lower())
+        "'{}' : {}".format(in_name,
+                           in_name.split("@")[0].lower())
         for in_name in in_names
     ])
     # e.g: {'num': n}
     attrs_str = "{%s}" % ",".join([
-        "'{}' : {}".format(attr_name, attr_name.split("@")[0].lower())
+        "'{}' : {}".format(attr_name,
+                           attr_name.split("@")[0].lower())
         for attr_name in attr_names
     ])
     # e.g: ['Out', 'Index']
@@ -1055,15 +1059,14 @@ def _write_setup_file(name,
         with_cuda = True
     log_v("with_cuda: {}".format(with_cuda), verbose)
 
-    content = template.format(
-        name=name,
-        prefix='CUDA' if with_cuda else 'Cpp',
-        sources=list2str(sources),
-        include_dirs=list2str(include_dirs),
-        extra_cxx_cflags=list2str(extra_cxx_cflags),
-        extra_cuda_cflags=list2str(extra_cuda_cflags),
-        extra_link_args=list2str(link_args),
-        build_dir=build_dir)
+    content = template.format(name=name,
+                              prefix='CUDA' if with_cuda else 'Cpp',
+                              sources=list2str(sources),
+                              include_dirs=list2str(include_dirs),
+                              extra_cxx_cflags=list2str(extra_cxx_cflags),
+                              extra_cuda_cflags=list2str(extra_cuda_cflags),
+                              extra_link_args=list2str(link_args),
+                              build_dir=build_dir)
 
     log_v('write setup.py into {}'.format(file_path), verbose)
     with open(file_path, 'w') as f:
@@ -1093,8 +1096,9 @@ def _jit_compile(file_path, verbose=False):
     try:
         py_version = subprocess.check_output([interpreter, '-V'])
         py_version = py_version.decode()
-        log_v("Using Python interpreter: {}, version: {}".format(
-            interpreter, py_version.strip()), verbose)
+        log_v(
+            "Using Python interpreter: {}, version: {}".format(
+                interpreter, py_version.strip()), verbose)
     except Exception:
         _, error, _ = sys.exc_info()
         raise RuntimeError(
@@ -1144,8 +1148,9 @@ def run_cmd(command, verbose=False):
     # execute command
     try:
         if verbose:
-            return subprocess.check_call(
-                command, shell=True, stderr=subprocess.STDOUT)
+            return subprocess.check_call(command,
+                                         shell=True,
+                                         stderr=subprocess.STDOUT)
         else:
             return subprocess.check_call(command, shell=True, stdout=DEVNULL)
     except Exception:
@@ -1163,8 +1168,8 @@ def check_abi_compatibility(compiler, verbose=False):
         return True
 
     if not IS_WINDOWS:
-        cmd_out = subprocess.check_output(
-            ['which', compiler], stderr=subprocess.STDOUT)
+        cmd_out = subprocess.check_output(['which', compiler],
+                                          stderr=subprocess.STDOUT)
         compiler_path = os.path.realpath(cmd_out.decode()).strip()
         # if not found any suitable compiler, raise warning
         if not any(name in compiler_path
@@ -1189,8 +1194,8 @@ def check_abi_compatibility(compiler, verbose=False):
             version = version_info.strip().split('.')
         elif IS_WINDOWS:
             mini_required_version = MSVC_MINI_VERSION
-            compiler_info = subprocess.check_output(
-                compiler, stderr=subprocess.STDOUT)
+            compiler_info = subprocess.check_output(compiler,
+                                                    stderr=subprocess.STDOUT)
             try:
                 compiler_info = compiler_info.decode('UTF-8')
             except UnicodeDecodeError:
@@ -1210,8 +1215,8 @@ def check_abi_compatibility(compiler, verbose=False):
     if tuple(map(int, version)) >= mini_required_version:
         return True
     warnings.warn(
-        ABI_INCOMPATIBILITY_WARNING.format(
-            user_compiler=compiler, version='.'.join(version)))
+        ABI_INCOMPATIBILITY_WARNING.format(user_compiler=compiler,
+                                           version='.'.join(version)))
     return False
 
 
diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
index b7f5ff28d6c74..5d4a899693691 100755
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
@@ -109,8 +109,9 @@ def wrapper(*args, **kwargs):
             v_since = [int(i) for i in _since.split(".")]
             v_since += [0] * (4 - len(v_since))
             if paddle.__version__ == "0.0.0" or _since == "" or v_current >= v_since:
-                warnings.warn(
-                    warningmsg, category=DeprecationWarning, stacklevel=2)
+                warnings.warn(warningmsg,
+                              category=DeprecationWarning,
+                              stacklevel=2)
 
             return func(*args, **kwargs)
 
diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py
index bf40ff9ab221c..234aac860b62a 100644
--- a/python/paddle/utils/download.py
+++ b/python/paddle/utils/download.py
@@ -33,6 +33,7 @@
 except:
 
     class tqdm(object):
+
         def __init__(self, total=None):
             self.total = total
             self.n = 0
@@ -42,8 +43,8 @@ def update(self, n):
             if self.total is None:
                 sys.stderr.write("\r{0:.1f} bytes".format(self.n))
             else:
-                sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(
-                    self.total)))
+                sys.stderr.write("\r{0:.1f}%".format(100 * self.n /
+                                                     float(self.total)))
             sys.stderr.flush()
 
         def __enter__(self):
@@ -54,6 +55,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 
 import logging
+
 logger = logging.getLogger(__name__)
 
 __all__ = ['get_weights_path_from_url']
@@ -160,8 +162,8 @@ def get_path_from_url(url,
                 time.sleep(1)
 
     if ParallelEnv().current_endpoint in unique_endpoints:
-        if decompress and (tarfile.is_tarfile(fullpath) or
-                           zipfile.is_zipfile(fullpath)):
+        if decompress and (tarfile.is_tarfile(fullpath)
+                           or zipfile.is_zipfile(fullpath)):
             fullpath = _decompress(fullpath)
 
     return fullpath
@@ -207,8 +209,10 @@ def _wget_download(url, fullname):
     # –user-agent
     command = 'wget -O {} -t {} {}'.format(tmp_fullname, DOWNLOAD_RETRY_LIMIT,
                                            url)
-    subprc = subprocess.Popen(
-        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    subprc = subprocess.Popen(command,
+                              shell=True,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
     _ = subprc.communicate()
 
     if subprc.returncode != 0:
diff --git a/python/paddle/utils/gast/ast3.py b/python/paddle/utils/gast/ast3.py
index 58840d5c29074..4696c1ba49749 100644
--- a/python/paddle/utils/gast/ast3.py
+++ b/python/paddle/utils/gast/ast3.py
@@ -58,14 +58,16 @@ def visit_Module(self, node):
         def visit_Num(self, node):
             new_node = gast.Constant(
                 node.n,
-                None, )
+                None,
+            )
             gast.copy_location(new_node, node)
             return new_node
 
         def visit_Ellipsis(self, node):
             new_node = gast.Constant(
                 Ellipsis,
-                None, )
+                None,
+            )
             gast.copy_location(new_node, node)
             new_node.end_lineno = new_node.end_col_offset = None
             return new_node
@@ -73,14 +75,16 @@ def visit_Ellipsis(self, node):
         def visit_Str(self, node):
             new_node = gast.Constant(
                 node.s,
-                None, )
+                None,
+            )
             gast.copy_location(new_node, node)
             return new_node
 
         def visit_Bytes(self, node):
             new_node = gast.Constant(
                 node.s,
-                None, )
+                None,
+            )
             gast.copy_location(new_node, node)
             return new_node
 
@@ -169,7 +173,8 @@ def visit_Call(self, node):
             new_node = gast.Call(
                 self._visit(node.func),
                 self._visit(node.args) + starred,
-                self._visit(node.keywords) + kwargs, )
+                self._visit(node.keywords) + kwargs,
+            )
             gast.copy_location(new_node, node)
             return new_node
 
@@ -191,7 +196,8 @@ def visit_arguments(self, node):
                 self._visit(node.kwonlyargs),
                 self._visit(node.kw_defaults),
                 self._visit(node.kwarg),
-                self._visit(node.defaults), )
+                self._visit(node.defaults),
+            )
             gast.copy_location(new_node, node)
             return new_node
 
@@ -200,7 +206,8 @@ def visit_Name(self, node):
             self._visit(node.id),
             self._visit(node.ctx),
             None,
-            None, )
+            None,
+        )
         ast.copy_location(new_node, node)
         return new_node
 
@@ -237,7 +244,8 @@ def visit_comprehension(self, node):
                 target=self._visit(node.target),
                 iter=self._visit(node.iter),
                 ifs=self._visit(node.ifs),
-                is_async=0, )
+                is_async=0,
+            )
             return ast.copy_location(new_node, node)
 
 
@@ -245,6 +253,7 @@ class GAstToAst3(GAstToAst):
     if sys.version_info.minor < 9:
 
         def visit_Subscript(self, node):
+
             def adjust_slice(s):
                 if isinstance(s, ast.Slice):
                     return s
@@ -253,9 +262,8 @@ def adjust_slice(s):
 
             if isinstance(node.slice, gast.Tuple):
                 if any(isinstance(elt, gast.slice) for elt in node.slice.elts):
-                    new_slice = ast.ExtSlice([
-                        adjust_slice(x) for x in self._visit(node.slice.elts)
-                    ])
+                    new_slice = ast.ExtSlice(
+                        [adjust_slice(x) for x in self._visit(node.slice.elts)])
                 else:
                     value = ast.Tuple(self._visit(node.slice.elts), ast.Load())
                     ast.copy_location(value, node.slice)
@@ -267,7 +275,8 @@ def adjust_slice(s):
             new_node = ast.Subscript(
                 self._visit(node.value),
                 new_slice,
-                self._visit(node.ctx), )
+                self._visit(node.ctx),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
@@ -302,21 +311,22 @@ def _make_arg(self, node):
         else:
             extra_args = self._visit(node.type_comment),
 
-        new_node = ast.arg(
-            self._visit(node.id), self._visit(node.annotation), *extra_args)
+        new_node = ast.arg(self._visit(node.id), self._visit(node.annotation),
+                           *extra_args)
         return ast.copy_location(new_node, node)
 
     def visit_Name(self, node):
         new_node = ast.Name(
             self._visit(node.id),
-            self._visit(node.ctx), )
+            self._visit(node.ctx),
+        )
         ast.copy_location(new_node, node)
         return new_node
 
     def visit_ExceptHandler(self, node):
         if node.name:
-            new_node = ast.ExceptHandler(
-                self._visit(node.type), node.name.id, self._visit(node.body))
+            new_node = ast.ExceptHandler(self._visit(node.type), node.name.id,
+                                         self._visit(node.body))
             return ast.copy_location(new_node, node)
         else:
             return self.generic_visit(node)
@@ -343,7 +353,8 @@ def visit_Call(self, node):
                 self._visit(args),
                 self._visit(keywords),
                 self._visit(starargs),
-                self._visit(kwargs), )
+                self._visit(kwargs),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
@@ -356,7 +367,8 @@ def visit_ClassDef(self, node):
                 body=self._visit(node.body),
                 decorator_list=self._visit(node.decorator_list),
                 starargs=None,
-                kwargs=None, )
+                kwargs=None,
+            )
             return ast.copy_location(new_node, node)
 
     elif sys.version_info.minor < 8:
@@ -367,7 +379,8 @@ def visit_FunctionDef(self, node):
                 self._visit(node.args),
                 self._visit(node.body),
                 self._visit(node.decorator_list),
-                self._visit(node.returns), )
+                self._visit(node.returns),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
@@ -377,7 +390,8 @@ def visit_AsyncFunctionDef(self, node):
                 self._visit(node.args),
                 self._visit(node.body),
                 self._visit(node.decorator_list),
-                self._visit(node.returns), )
+                self._visit(node.returns),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
@@ -386,7 +400,8 @@ def visit_For(self, node):
                 self._visit(node.target),
                 self._visit(node.iter),
                 self._visit(node.body),
-                self._visit(node.orelse), )
+                self._visit(node.orelse),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
@@ -404,14 +419,16 @@ def visit_AsyncFor(self, node):
         def visit_With(self, node):
             new_node = ast.With(
                 self._visit(node.items),
-                self._visit(node.body), )
+                self._visit(node.body),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
         def visit_AsyncWith(self, node):
             new_node = ast.AsyncWith(
                 self._visit(node.items),
-                self._visit(node.body), )
+                self._visit(node.body),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
@@ -419,7 +436,8 @@ def visit_Call(self, node):
             new_node = ast.Call(
                 self._visit(node.func),
                 self._visit(node.args),
-                self._visit(node.keywords), )
+                self._visit(node.keywords),
+            )
             ast.copy_location(new_node, node)
             return new_node
 
diff --git a/python/paddle/utils/gast/astn.py b/python/paddle/utils/gast/astn.py
index bd88ba5efc512..eb45bd4e4500a 100644
--- a/python/paddle/utils/gast/astn.py
+++ b/python/paddle/utils/gast/astn.py
@@ -34,7 +34,9 @@
 
 
 def _generate_translators(to):
+
     class Translator(ast.NodeTransformer):
+
         def _visit(self, node):
             if isinstance(node, list):
                 return [self._visit(n) for n in node]
diff --git a/python/paddle/utils/gast/gast.py b/python/paddle/utils/gast/gast.py
index f561c83995ac1..1248434fe3533 100644
--- a/python/paddle/utils/gast/gast.py
+++ b/python/paddle/utils/gast/gast.py
@@ -44,6 +44,7 @@ class TypeIgnore(AST):
 
 
 def _make_node(Name, Fields, Attributes, Bases):
+
     def create_node(self, *args, **kwargs):
         nbparam = len(args) + len(kwargs)
         assert nbparam in (0, len(Fields)), \
@@ -76,351 +77,434 @@ def create_node(self, *args, **kwargs):
                           'lineno',
                           'col_offset',
                           'end_lineno',
-                          'end_col_offset', ), (stmt, ))),
+                          'end_col_offset',
+                      ), (stmt, ))),
     ('AsyncFunctionDef', (('name', 'args', 'body', 'decorator_list', 'returns',
                            'type_comment'), (
                                'lineno',
                                'col_offset',
                                'end_lineno',
-                               'end_col_offset', ), (stmt, ))),
+                               'end_col_offset',
+                           ), (stmt, ))),
     ('ClassDef', ((
         'name',
         'bases',
         'keywords',
         'body',
-        'decorator_list', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'decorator_list',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('Return', (('value', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Delete', (('targets', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Assign', ((
         'targets',
-        'value', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'value',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('AugAssign', ((
         'target',
         'op',
-        'value', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'value',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('AnnAssign', ((
         'target',
         'annotation',
         'value',
-        'simple', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'simple',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('Print', ((
         'dest',
         'values',
-        'nl', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'nl',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('For', (('target', 'iter', 'body', 'orelse', 'type_comment'), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('AsyncFor', (('target', 'iter', 'body', 'orelse', 'type_comment'), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('While', ((
         'test',
         'body',
-        'orelse', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'orelse',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('If', ((
         'test',
         'body',
-        'orelse', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'orelse',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('With', (('items', 'body', 'type_comment'), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('AsyncWith', (('items', 'body', 'type_comment'), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Raise', ((
         'exc',
-        'cause', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'cause',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('Try', ((
         'body',
         'handlers',
         'orelse',
-        'finalbody', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'finalbody',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('Assert', ((
         'test',
-        'msg', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'msg',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('Import', (('names', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('ImportFrom', ((
         'module',
         'names',
-        'level', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'level',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('Exec', ((
         'body',
         'globals',
-        'locals', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (stmt, ))),
+        'locals',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (stmt, ))),
     ('Global', (('names', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Nonlocal', (('names', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Expr', (('value', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Pass', ((), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Break', ((), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
     ('Continue', ((), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (stmt, ))),
+        'end_col_offset',
+    ), (stmt, ))),
 
     # expr
     ('BoolOp', ((
         'op',
-        'values', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'values',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('BinOp', ((
         'left',
         'op',
-        'right', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'right',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('UnaryOp', ((
         'op',
-        'operand', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'operand',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Lambda', ((
         'args',
-        'body', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'body',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('IfExp', ((
         'test',
         'body',
-        'orelse', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'orelse',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Dict', ((
         'keys',
-        'values', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'values',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Set', (('elts', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('ListComp', ((
         'elt',
-        'generators', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'generators',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('SetComp', ((
         'elt',
-        'generators', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'generators',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('DictComp', ((
         'key',
         'value',
-        'generators', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'generators',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('GeneratorExp', ((
         'elt',
-        'generators', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'generators',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Await', (('value', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('Yield', (('value', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('YieldFrom', (('value', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('Compare', ((
         'left',
         'ops',
-        'comparators', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'comparators',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Call', ((
         'func',
         'args',
-        'keywords', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'keywords',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Repr', (('value', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('FormattedValue', ((
         'value',
         'conversion',
-        'format_spec', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'format_spec',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('JoinedStr', (('values', ), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('Constant', (('value', 'kind'), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('Attribute', ((
         'value',
         'attr',
-        'ctx', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'ctx',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Subscript', ((
         'value',
         'slice',
-        'ctx', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'ctx',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Starred', ((
         'value',
-        'ctx', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'ctx',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Name', (('id', 'ctx', 'annotation', 'type_comment'), (
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (expr, ))),
+        'end_col_offset',
+    ), (expr, ))),
     ('List', ((
         'elts',
-        'ctx', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'ctx',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
     ('Tuple', ((
         'elts',
-        'ctx', ), (
-            'lineno',
-            'col_offset',
-            'end_lineno',
-            'end_col_offset', ), (expr, ))),
+        'ctx',
+    ), (
+        'lineno',
+        'col_offset',
+        'end_lineno',
+        'end_col_offset',
+    ), (expr, ))),
 
     # expr_context
     ('Load', ((), (), (expr_context, ))),
@@ -435,7 +519,8 @@ def create_node(self, *args, **kwargs):
         'lineno',
         'col_offset',
         'end_lineno',
-        'end_col_offset', ), (slice, ))),
+        'end_col_offset',
+    ), (slice, ))),
 
     # boolop
     ('And', ((), (), (boolop, ))),
@@ -459,16 +544,20 @@ def create_node(self, *args, **kwargs):
     # unaryop
     ('Invert', ((), (), (
         unaryop,
-        AST, ))),
+        AST,
+    ))),
     ('Not', ((), (), (
         unaryop,
-        AST, ))),
+        AST,
+    ))),
     ('UAdd', ((), (), (
         unaryop,
-        AST, ))),
+        AST,
+    ))),
     ('USub', ((), (), (
         unaryop,
-        AST, ))),
+        AST,
+    ))),
 
     # cmpop
     ('Eq', ((), (), (cmpop, ))),
@@ -495,9 +584,8 @@ def create_node(self, *args, **kwargs):
                     'kw_defaults', 'kwarg', 'defaults'), (), (AST, ))),
 
     # keyword
-    ('keyword',
-     (('arg', 'value'),
-      ('lineno', 'col_offset', 'end_lineno', 'end_col_offset'), (AST, ))),
+    ('keyword', (('arg', 'value'), ('lineno', 'col_offset', 'end_lineno',
+                                    'end_col_offset'), (AST, ))),
 
     # alias
     ('alias', (('name', 'asname'), (), (AST, ))),
@@ -506,7 +594,8 @@ def create_node(self, *args, **kwargs):
     ('withitem', (('context_expr', 'optional_vars'), (), (AST, ))),
 
     # type_ignore
-    ('type_ignore', ((), ('lineno', 'tag'), (TypeIgnore, ))), )
+    ('type_ignore', ((), ('lineno', 'tag'), (TypeIgnore, ))),
+)
 
 for name, descr in _nodes:
     _make_node(name, *descr)
diff --git a/python/paddle/utils/image_util.py b/python/paddle/utils/image_util.py
index 18be9366c40a7..9c93d44eeecb0 100644
--- a/python/paddle/utils/image_util.py
+++ b/python/paddle/utils/image_util.py
@@ -56,8 +56,8 @@ def crop_img(im, inner_size, color=True, test=True):
       If True, crop the center of images.
     """
     if color:
-        height, width = max(inner_size, im.shape[1]), max(inner_size,
-                                                          im.shape[2])
+        height, width = max(inner_size,
+                            im.shape[1]), max(inner_size, im.shape[2])
         padded_im = np.zeros((3, height, width))
         startY = (height - im.shape[1]) / 2
         startX = (width - im.shape[2]) / 2
@@ -65,8 +65,8 @@ def crop_img(im, inner_size, color=True, test=True):
         padded_im[:, startY:endY, startX:endX] = im
     else:
         im = im.astype('float32')
-        height, width = max(inner_size, im.shape[0]), max(inner_size,
-                                                          im.shape[1])
+        height, width = max(inner_size,
+                            im.shape[0]), max(inner_size, im.shape[1])
         padded_im = np.zeros((height, width))
         startY = (height - im.shape[0]) / 2
         startX = (width - im.shape[1]) / 2
@@ -122,13 +122,13 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True):
     if color:
         assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
         mean = mean.reshape(3, mean_img_size, mean_img_size)
-        mean = mean[:, border:border + crop_size, border:border +
-                    crop_size].astype('float32')
+        mean = mean[:, border:border + crop_size,
+                    border:border + crop_size].astype('float32')
     else:
         assert (mean_img_size * mean_img_size == mean.shape[0])
         mean = mean.reshape(mean_img_size, mean_img_size)
-        mean = mean[border:border + crop_size, border:border +
-                    crop_size].astype('float32')
+        mean = mean[border:border + crop_size,
+                    border:border + crop_size].astype('float32')
     return mean
 
 
@@ -170,9 +170,8 @@ def oversample(img, crop_dims):
     crops_ix = np.tile(crops_ix, (2, 1))
 
     # Extract crops
-    crops = np.empty(
-        (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
-        dtype=np.float32)
+    crops = np.empty((10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
+                     dtype=np.float32)
     ix = 0
     for im in img:
         for crop in crops_ix:
@@ -183,6 +182,7 @@ def oversample(img, crop_dims):
 
 
 class ImageTransformer:
+
     def __init__(self,
                  transpose=None,
                  channel_swap=None,
diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py
index 9feda3d2dae6a..f0636e9a10166 100644
--- a/python/paddle/utils/install_check.py
+++ b/python/paddle/utils/install_check.py
@@ -27,8 +27,9 @@ def _simple_network():
     """
     Define a simple network composed by a single linear layer.
     """
-    input = paddle.static.data(
-        name="input", shape=[None, 2, 2], dtype="float32")
+    input = paddle.static.data(name="input",
+                               shape=[None, 2, 2],
+                               dtype="float32")
     weight = paddle.create_parameter(
         shape=[2, 3],
         dtype="float32",
@@ -126,15 +127,17 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu):
         name="weight", initializer=paddle.nn.initializer.Constant(value=0.5))
     bias_attr = paddle.ParamAttr(
         name="bias", initializer=paddle.nn.initializer.Constant(value=1.0))
-    linear = paddle.nn.Linear(
-        2, 4, weight_attr=weight_attr, bias_attr=bias_attr)
+    linear = paddle.nn.Linear(2,
+                              4,
+                              weight_attr=weight_attr,
+                              bias_attr=bias_attr)
     input_np = _prepare_data(1)
     input_tensor = paddle.to_tensor(input_np)
     linear_out = linear(input_tensor)
     out = paddle.tensor.sum(linear_out)
     out.backward()
-    opt = paddle.optimizer.Adam(
-        learning_rate=0.001, parameters=linear.parameters())
+    opt = paddle.optimizer.Adam(learning_rate=0.001,
+                                parameters=linear.parameters())
     opt.step()
 
 
@@ -195,8 +198,8 @@ def _run_static_parallel(use_cuda, use_xpu, use_npu, device_list):
             paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
         compiled_prog = paddle.static.CompiledProgram(
-            train_prog).with_data_parallel(
-                loss_name=loss.name, places=device_list)
+            train_prog).with_data_parallel(loss_name=loss.name,
+                                           places=device_list)
 
         if use_cuda:
             place = paddle.CUDAPlace(0)
@@ -269,8 +272,8 @@ def run_check():
 
     try:
         _run_static_parallel(use_cuda, use_xpu, use_npu, device_list)
-        print("PaddlePaddle works well on {} {}s.".format(device_count,
-                                                          device_str))
+        print("PaddlePaddle works well on {} {}s.".format(
+            device_count, device_str))
         print(
             "PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now."
         )
@@ -280,8 +283,8 @@ def run_check():
             "\n 1. There is not enough GPUs visible on your system"
             "\n 2. Some GPUs are occupied by other process now"
             "\n 3. NVIDIA-NCCL2 is not installed correctly on your system. Please follow instruction on https://github.com/NVIDIA/nccl-tests "
-            "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html".
-            format(device_count, device_str))
+            "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
+            .format(device_count, device_str))
 
         logging.warning("\n Original Error is: {}".format(e))
         print("PaddlePaddle is installed successfully ONLY for single {}! "
diff --git a/python/paddle/utils/op_version.py b/python/paddle/utils/op_version.py
index 6e81b5a2c17bb..575e5f40772eb 100644
--- a/python/paddle/utils/op_version.py
+++ b/python/paddle/utils/op_version.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,6 +29,7 @@ def _singleton(*args, **kargs):
 
 
 class OpUpdateInfoHelper(object):
+
     def __init__(self, info):
         self._info = info
 
@@ -48,6 +49,7 @@ def verify_key_value(self, name=''):
 
 @Singleton
 class OpLastCheckpointChecker(object):
+
     def __init__(self):
         self.raw_version_map = core.get_op_version_map()
         self.checkpoints_map = {}
@@ -63,8 +65,8 @@ def filter_updates(self, op_name, type=core.OpUpdateType.kInvalid, key=''):
         updates = []
         if op_name in self.checkpoints_map:
             for update in self.checkpoints_map[op_name]:
-                if (update.type() == type) or (
-                        type == core.OpUpdateType.kInvalid):
+                if (update.type() == type) or (type
+                                               == core.OpUpdateType.kInvalid):
                     if OpUpdateInfoHelper(update.info()).verify_key_value(key):
                         updates.append(update.info())
         return updates
diff --git a/python/paddle/utils/profiler.py b/python/paddle/utils/profiler.py
index cc33342ec5a51..288c17c9511c2 100644
--- a/python/paddle/utils/profiler.py
+++ b/python/paddle/utils/profiler.py
@@ -24,19 +24,14 @@
 from ..fluid.profiler import stop_profiler
 from ..fluid.profiler import reset_profiler
 
-__all__ = [     #noqa
-           'Profiler',
-           'get_profiler',
-           'ProfilerOptions',
-           'cuda_profiler',
-           'start_profiler',
-           'profiler',
-           'stop_profiler',
-           'reset_profiler'
+__all__ = [  #noqa
+    'Profiler', 'get_profiler', 'ProfilerOptions', 'cuda_profiler',
+    'start_profiler', 'profiler', 'stop_profiler', 'reset_profiler'
 ]
 
 
 class ProfilerOptions(object):
+
     def __init__(self, options=None):
         self.options = {
             'state': 'All',
@@ -74,6 +69,7 @@ def __getitem__(self, name):
 
 
 class Profiler(object):
+
     def __init__(self, enabled=True, options=None):
         if options is not None:
             self.profiler_options = options
diff --git a/python/paddle/vision/datasets/__init__.py b/python/paddle/vision/datasets/__init__.py
index a9673aae21e96..10666b7c7194a 100644
--- a/python/paddle/vision/datasets/__init__.py
+++ b/python/paddle/vision/datasets/__init__.py
@@ -21,13 +21,7 @@
 from .cifar import Cifar100  # noqa: F401
 from .voc2012 import VOC2012  # noqa: F401
 
-__all__ = [ #noqa
-    'DatasetFolder',
-    'ImageFolder',
-    'MNIST',
-    'FashionMNIST',
-    'Flowers',
-    'Cifar10',
-    'Cifar100',
-    'VOC2012'
+__all__ = [  #noqa
+    'DatasetFolder', 'ImageFolder', 'MNIST', 'FashionMNIST', 'Flowers',
+    'Cifar10', 'Cifar100', 'VOC2012'
 ]
diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index 3028d8697aefc..f31aab9eccf26 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -110,8 +110,8 @@ def __init__(self,
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}"
-                .format(backend))
+                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
+                    backend))
         self.backend = backend
 
         self._init_url_md5_flag()
@@ -119,8 +119,10 @@ def __init__(self,
         self.data_file = data_file
         if self.data_file is None:
             assert download, "data_file is not set and downloading automatically is disabled"
-            self.data_file = _check_exists_and_download(
-                data_file, self.data_url, self.data_md5, 'cifar', download)
+            self.data_file = _check_exists_and_download(data_file,
+                                                        self.data_url,
+                                                        self.data_md5, 'cifar',
+                                                        download)
 
         self.transform = transform
 
@@ -146,8 +148,8 @@ def _load_data(self):
                 batch = pickle.load(f.extractfile(name), encoding='bytes')
 
                 data = batch[six.b('data')]
-                labels = batch.get(
-                    six.b('labels'), batch.get(six.b('fine_labels'), None))
+                labels = batch.get(six.b('labels'),
+                                   batch.get(six.b('fine_labels'), None))
                 assert labels is not None
                 for sample, label in six.moves.zip(data, labels):
                     self.data.append((sample, label))
diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py
index 0b006ada4a045..ef59d24ed6451 100644
--- a/python/paddle/vision/datasets/flowers.py
+++ b/python/paddle/vision/datasets/flowers.py
@@ -89,26 +89,29 @@ def __init__(self,
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}"
-                .format(backend))
+                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
+                    backend))
         self.backend = backend
 
         flag = MODE_FLAG_MAP[mode.lower()]
 
         if not data_file:
             assert download, "data_file is not set and downloading automatically is disabled"
-            data_file = _check_exists_and_download(
-                data_file, DATA_URL, DATA_MD5, 'flowers', download)
+            data_file = _check_exists_and_download(data_file, DATA_URL,
+                                                   DATA_MD5, 'flowers',
+                                                   download)
 
         if not label_file:
             assert download, "label_file is not set and downloading automatically is disabled"
-            label_file = _check_exists_and_download(
-                label_file, LABEL_URL, LABEL_MD5, 'flowers', download)
+            label_file = _check_exists_and_download(label_file, LABEL_URL,
+                                                    LABEL_MD5, 'flowers',
+                                                    download)
 
         if not setid_file:
             assert download, "setid_file is not set and downloading automatically is disabled"
-            setid_file = _check_exists_and_download(
-                setid_file, SETID_URL, SETID_MD5, 'flowers', download)
+            setid_file = _check_exists_and_download(setid_file, SETID_URL,
+                                                    SETID_MD5, 'flowers',
+                                                    download)
 
         self.transform = transform
 
diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py
index 220b3d8ecb4b4..c3f1b61f30ed9 100644
--- a/python/paddle/vision/datasets/folder.py
+++ b/python/paddle/vision/datasets/folder.py
@@ -139,9 +139,10 @@ def __init__(self,
         samples = make_dataset(self.root, class_to_idx, extensions,
                                is_valid_file)
         if len(samples) == 0:
-            raise (RuntimeError(
-                "Found 0 directories in subfolders of: " + self.root + "\n"
-                "Supported extensions are: " + ",".join(extensions)))
+            raise (RuntimeError("Found 0 directories in subfolders of: " +
+                                self.root + "\n"
+                                "Supported extensions are: " +
+                                ",".join(extensions)))
 
         self.loader = default_loader if loader is None else loader
         self.extensions = extensions
@@ -297,9 +298,10 @@ def is_valid_file(x):
                     samples.append(f)
 
         if len(samples) == 0:
-            raise (RuntimeError(
-                "Found 0 files in subfolders of: " + self.root + "\n"
-                "Supported extensions are: " + ",".join(extensions)))
+            raise (RuntimeError("Found 0 files in subfolders of: " + self.root +
+                                "\n"
+                                "Supported extensions are: " +
+                                ",".join(extensions)))
 
         self.loader = default_loader if loader is None else loader
         self.extensions = extensions
diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py
index 84760f9598b6a..703a4f64cf44e 100644
--- a/python/paddle/vision/datasets/mnist.py
+++ b/python/paddle/vision/datasets/mnist.py
@@ -85,8 +85,8 @@ def __init__(self,
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}"
-                .format(backend))
+                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
+                    backend))
         self.backend = backend
 
         self.mode = mode.lower()
@@ -134,8 +134,8 @@ def _parse_dataset(self, buffer_size=100):
                 offset_lab = 0
                 # label file : 8B
                 magic_byte_lab = '>II'
-                magic_lab, label_num = struct.unpack_from(magic_byte_lab,
-                                                          lab_buf, offset_lab)
+                magic_lab, label_num = struct.unpack_from(
+                    magic_byte_lab, lab_buf, offset_lab)
                 offset_lab += struct.calcsize(magic_byte_lab)
 
                 while True:
@@ -149,8 +149,9 @@ def _parse_dataset(self, buffer_size=100):
                     fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
                     images_temp = struct.unpack_from(fmt_images, img_buf,
                                                      offset_img)
-                    images = np.reshape(images_temp, (buffer_size, rows *
-                                                      cols)).astype('float32')
+                    images = np.reshape(
+                        images_temp,
+                        (buffer_size, rows * cols)).astype('float32')
                     offset_img += struct.calcsize(fmt_images)
 
                     for i in range(buffer_size):
diff --git a/python/paddle/vision/datasets/voc2012.py b/python/paddle/vision/datasets/voc2012.py
index 5a82d7864cb00..cd9ff70ca1e50 100644
--- a/python/paddle/vision/datasets/voc2012.py
+++ b/python/paddle/vision/datasets/voc2012.py
@@ -99,8 +99,8 @@ def __init__(self,
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}"
-                .format(backend))
+                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
+                    backend))
         self.backend = backend
 
         self.flag = MODE_FLAG_MAP[mode.lower()]
@@ -108,8 +108,9 @@ def __init__(self,
         self.data_file = data_file
         if self.data_file is None:
             assert download, "data_file is not set and downloading automatically is disabled"
-            self.data_file = _check_exists_and_download(
-                data_file, VOC_URL, VOC_MD5, CACHE_DIR, download)
+            self.data_file = _check_exists_and_download(data_file, VOC_URL,
+                                                        VOC_MD5, CACHE_DIR,
+                                                        download)
         self.transform = transform
 
         # read dataset into memory
diff --git a/python/paddle/vision/image.py b/python/paddle/vision/image.py
index 5c260b1d90a89..755c8bcc9cc32 100644
--- a/python/paddle/vision/image.py
+++ b/python/paddle/vision/image.py
@@ -82,8 +82,8 @@ def make_fake_dir():
     global _image_backend
     if backend not in ['pil', 'cv2', 'tensor']:
         raise ValueError(
-            "Expected backend are one of ['pil', 'cv2', 'tensor'], but got {}"
-            .format(backend))
+            "Expected backend are one of ['pil', 'cv2', 'tensor'], but got {}".
+            format(backend))
     _image_backend = backend
 
 
@@ -152,8 +152,8 @@ def image_load(path, backend=None):
         backend = _image_backend
     if backend not in ['pil', 'cv2', 'tensor']:
         raise ValueError(
-            "Expected backend are one of ['pil', 'cv2', 'tensor'], but got {}"
-            .format(backend))
+            "Expected backend are one of ['pil', 'cv2', 'tensor'], but got {}".
+            format(backend))
 
     if backend == 'pil':
         return Image.open(path)
diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py
index 85ff5f85dffd0..72bb6ee8e8d5b 100644
--- a/python/paddle/vision/models/__init__.py
+++ b/python/paddle/vision/models/__init__.py
@@ -64,56 +64,18 @@
 from .shufflenetv2 import shufflenet_v2_x2_0  # noqa: F401
 from .shufflenetv2 import shufflenet_v2_swish  # noqa: F401
 
-__all__ = [ #noqa
-    'ResNet',
-    'resnet18',
-    'resnet34',
-    'resnet50',
-    'resnet101',
-    'resnet152',
-    'resnext50_32x4d',
-    'resnext50_64x4d',
-    'resnext101_32x4d',
-    'resnext101_64x4d',
-    'resnext152_32x4d',
-    'resnext152_64x4d',
-    'wide_resnet50_2',
-    'wide_resnet101_2',
-    'VGG',
-    'vgg11',
-    'vgg13',
-    'vgg16',
-    'vgg19',
-    'MobileNetV1',
-    'mobilenet_v1',
-    'MobileNetV2',
-    'mobilenet_v2',
-    'MobileNetV3Small',
-    'MobileNetV3Large',
-    'mobilenet_v3_small',
-    'mobilenet_v3_large',
-    'LeNet',
-    'DenseNet',
-    'densenet121',
-    'densenet161',
-    'densenet169',
-    'densenet201',
-    'densenet264',
-    'AlexNet',
-    'alexnet',
-    'InceptionV3',
-    'inception_v3',
-    'SqueezeNet',
-    'squeezenet1_0',
-    'squeezenet1_1',
-    'GoogLeNet',
-    'googlenet',
-    'ShuffleNetV2',
-    'shufflenet_v2_x0_25',
-    'shufflenet_v2_x0_33',
-    'shufflenet_v2_x0_5',
-    'shufflenet_v2_x1_0',
-    'shufflenet_v2_x1_5',
-    'shufflenet_v2_x2_0',
+__all__ = [  #noqa
+    'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
+    'resnext50_32x4d', 'resnext50_64x4d', 'resnext101_32x4d',
+    'resnext101_64x4d', 'resnext152_32x4d', 'resnext152_64x4d',
+    'wide_resnet50_2', 'wide_resnet101_2', 'VGG', 'vgg11', 'vgg13', 'vgg16',
+    'vgg19', 'MobileNetV1', 'mobilenet_v1', 'MobileNetV2', 'mobilenet_v2',
+    'MobileNetV3Small', 'MobileNetV3Large', 'mobilenet_v3_small',
+    'mobilenet_v3_large', 'LeNet', 'DenseNet', 'densenet121', 'densenet161',
+    'densenet169', 'densenet201', 'densenet264', 'AlexNet', 'alexnet',
+    'InceptionV3', 'inception_v3', 'SqueezeNet', 'squeezenet1_0',
+    'squeezenet1_1', 'GoogLeNet', 'googlenet', 'ShuffleNetV2',
+    'shufflenet_v2_x0_25', 'shufflenet_v2_x0_33', 'shufflenet_v2_x0_5',
+    'shufflenet_v2_x1_0', 'shufflenet_v2_x1_5', 'shufflenet_v2_x2_0',
     'shufflenet_v2_swish'
 ]
diff --git a/python/paddle/vision/models/alexnet.py b/python/paddle/vision/models/alexnet.py
index 1d36ef37b6ced..411a8f01be295 100644
--- a/python/paddle/vision/models/alexnet.py
+++ b/python/paddle/vision/models/alexnet.py
@@ -30,13 +30,15 @@
 model_urls = {
     "alexnet": (
         "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/AlexNet_pretrained.pdparams",
-        "7f0f9f737132e02732d75a1459d98a43", )
+        "7f0f9f737132e02732d75a1459d98a43",
+    )
 }
 
 __all__ = []
 
 
 class ConvPoolLayer(nn.Layer):
+
     def __init__(self,
                  input_channels,
                  output_channels,
diff --git a/python/paddle/vision/models/densenet.py b/python/paddle/vision/models/densenet.py
index 46c7b6dc52b58..a764be9544575 100644
--- a/python/paddle/vision/models/densenet.py
+++ b/python/paddle/vision/models/densenet.py
@@ -48,6 +48,7 @@
 
 
 class BNACConvLayer(nn.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -59,15 +60,14 @@ def __init__(self,
         super(BNACConvLayer, self).__init__()
         self._batch_norm = BatchNorm(num_channels, act=act)
 
-        self._conv = Conv2D(
-            in_channels=num_channels,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=pad,
-            groups=groups,
-            weight_attr=ParamAttr(),
-            bias_attr=False)
+        self._conv = Conv2D(in_channels=num_channels,
+                            out_channels=num_filters,
+                            kernel_size=filter_size,
+                            stride=stride,
+                            padding=pad,
+                            groups=groups,
+                            weight_attr=ParamAttr(),
+                            bias_attr=False)
 
     def forward(self, input):
         y = self._batch_norm(input)
@@ -76,23 +76,22 @@ def forward(self, input):
 
 
 class DenseLayer(nn.Layer):
+
     def __init__(self, num_channels, growth_rate, bn_size, dropout):
         super(DenseLayer, self).__init__()
         self.dropout = dropout
 
-        self.bn_ac_func1 = BNACConvLayer(
-            num_channels=num_channels,
-            num_filters=bn_size * growth_rate,
-            filter_size=1,
-            pad=0,
-            stride=1)
+        self.bn_ac_func1 = BNACConvLayer(num_channels=num_channels,
+                                         num_filters=bn_size * growth_rate,
+                                         filter_size=1,
+                                         pad=0,
+                                         stride=1)
 
-        self.bn_ac_func2 = BNACConvLayer(
-            num_channels=bn_size * growth_rate,
-            num_filters=growth_rate,
-            filter_size=3,
-            pad=1,
-            stride=1)
+        self.bn_ac_func2 = BNACConvLayer(num_channels=bn_size * growth_rate,
+                                         num_filters=growth_rate,
+                                         filter_size=3,
+                                         pad=1,
+                                         stride=1)
 
         if dropout:
             self.dropout_func = Dropout(p=dropout, mode="downscale_in_infer")
@@ -107,6 +106,7 @@ def forward(self, input):
 
 
 class DenseBlock(nn.Layer):
+
     def __init__(self,
                  num_channels,
                  num_layers,
@@ -123,11 +123,10 @@ def __init__(self,
             self.dense_layer_func.append(
                 self.add_sublayer(
                     "{}_{}".format(name, layer + 1),
-                    DenseLayer(
-                        num_channels=pre_channel,
-                        growth_rate=growth_rate,
-                        bn_size=bn_size,
-                        dropout=dropout)))
+                    DenseLayer(num_channels=pre_channel,
+                               growth_rate=growth_rate,
+                               bn_size=bn_size,
+                               dropout=dropout)))
             pre_channel = pre_channel + growth_rate
 
     def forward(self, input):
@@ -138,15 +137,15 @@ def forward(self, input):
 
 
 class TransitionLayer(nn.Layer):
+
     def __init__(self, num_channels, num_output_features):
         super(TransitionLayer, self).__init__()
 
-        self.conv_ac_func = BNACConvLayer(
-            num_channels=num_channels,
-            num_filters=num_output_features,
-            filter_size=1,
-            pad=0,
-            stride=1)
+        self.conv_ac_func = BNACConvLayer(num_channels=num_channels,
+                                          num_filters=num_output_features,
+                                          filter_size=1,
+                                          pad=0,
+                                          stride=1)
 
         self.pool2d_avg = AvgPool2D(kernel_size=2, stride=2, padding=0)
 
@@ -157,6 +156,7 @@ def forward(self, input):
 
 
 class ConvBNLayer(nn.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -167,15 +167,14 @@ def __init__(self,
                  act="relu"):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            in_channels=num_channels,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=pad,
-            groups=groups,
-            weight_attr=ParamAttr(),
-            bias_attr=False)
+        self._conv = Conv2D(in_channels=num_channels,
+                            out_channels=num_filters,
+                            kernel_size=filter_size,
+                            stride=stride,
+                            padding=pad,
+                            groups=groups,
+                            weight_attr=ParamAttr(),
+                            bias_attr=False)
         self._batch_norm = BatchNorm(num_filters, act=act)
 
     def forward(self, input):
@@ -232,13 +231,12 @@ def __init__(self,
         }
         num_init_features, growth_rate, block_config = densenet_spec[layers]
 
-        self.conv1_func = ConvBNLayer(
-            num_channels=3,
-            num_filters=num_init_features,
-            filter_size=7,
-            stride=2,
-            pad=3,
-            act='relu')
+        self.conv1_func = ConvBNLayer(num_channels=3,
+                                      num_filters=num_init_features,
+                                      filter_size=7,
+                                      stride=2,
+                                      pad=3,
+                                      act='relu')
         self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
         self.block_config = block_config
         self.dense_block_func_list = []
@@ -249,13 +247,12 @@ def __init__(self,
             self.dense_block_func_list.append(
                 self.add_sublayer(
                     "db_conv_{}".format(i + 2),
-                    DenseBlock(
-                        num_channels=pre_num_channels,
-                        num_layers=num_layers,
-                        bn_size=bn_size,
-                        growth_rate=growth_rate,
-                        dropout=dropout,
-                        name='conv' + str(i + 2))))
+                    DenseBlock(num_channels=pre_num_channels,
+                               num_layers=num_layers,
+                               bn_size=bn_size,
+                               growth_rate=growth_rate,
+                               dropout=dropout,
+                               name='conv' + str(i + 2))))
 
             num_features = num_features + num_layers * growth_rate
             pre_num_channels = num_features
@@ -264,9 +261,8 @@ def __init__(self,
                 self.transition_func_list.append(
                     self.add_sublayer(
                         "tr_conv{}_blk".format(i + 2),
-                        TransitionLayer(
-                            num_channels=pre_num_channels,
-                            num_output_features=num_features // 2)))
+                        TransitionLayer(num_channels=pre_num_channels,
+                                        num_output_features=num_features // 2)))
                 pre_num_channels = num_features // 2
                 num_features = num_features // 2
 
diff --git a/python/paddle/vision/models/googlenet.py b/python/paddle/vision/models/googlenet.py
index 6afbc42603867..b1d1d38e2ee30 100644
--- a/python/paddle/vision/models/googlenet.py
+++ b/python/paddle/vision/models/googlenet.py
@@ -41,6 +41,7 @@ def xavier(channels, filter_size):
 
 
 class ConvLayer(nn.Layer):
+
     def __init__(self,
                  num_channels,
                  num_filters,
@@ -49,14 +50,13 @@ def __init__(self,
                  groups=1):
         super(ConvLayer, self).__init__()
 
-        self._conv = Conv2D(
-            in_channels=num_channels,
-            out_channels=num_filters,
-            kernel_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            bias_attr=False)
+        self._conv = Conv2D(in_channels=num_channels,
+                            out_channels=num_filters,
+                            kernel_size=filter_size,
+                            stride=stride,
+                            padding=(filter_size - 1) // 2,
+                            groups=groups,
+                            bias_attr=False)
 
     def forward(self, inputs):
         y = self._conv(inputs)
@@ -64,6 +64,7 @@ def forward(self, inputs):
 
 
 class Inception(nn.Layer):
+
     def __init__(self, input_channels, output_channels, filter1, filter3R,
                  filter3, filter5R, filter5, proj):
         super(Inception, self).__init__()
@@ -151,8 +152,9 @@ def __init__(self, num_classes=1000, with_pool=True):
         if num_classes > 0:
             # out
             self._drop = Dropout(p=0.4, mode="downscale_in_infer")
-            self._fc_out = Linear(
-                1024, num_classes, weight_attr=xavier(1024, 1))
+            self._fc_out = Linear(1024,
+                                  num_classes,
+                                  weight_attr=xavier(1024, 1))
 
             # out1
             self._conv_o1 = ConvLayer(512, 128, 1)
diff --git a/python/paddle/vision/models/inceptionv3.py b/python/paddle/vision/models/inceptionv3.py
index 27650dbe09f04..8ffb23e62ce6a 100644
--- a/python/paddle/vision/models/inceptionv3.py
+++ b/python/paddle/vision/models/inceptionv3.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -37,42 +37,38 @@
 
 
 class InceptionStem(nn.Layer):
+
     def __init__(self):
         super().__init__()
-        self.conv_1a_3x3 = ConvNormActivation(
-            in_channels=3,
-            out_channels=32,
-            kernel_size=3,
-            stride=2,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.conv_2a_3x3 = ConvNormActivation(
-            in_channels=32,
-            out_channels=32,
-            kernel_size=3,
-            stride=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.conv_2b_3x3 = ConvNormActivation(
-            in_channels=32,
-            out_channels=64,
-            kernel_size=3,
-            padding=1,
-            activation_layer=nn.ReLU)
+        self.conv_1a_3x3 = ConvNormActivation(in_channels=3,
+                                              out_channels=32,
+                                              kernel_size=3,
+                                              stride=2,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+        self.conv_2a_3x3 = ConvNormActivation(in_channels=32,
+                                              out_channels=32,
+                                              kernel_size=3,
+                                              stride=1,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+        self.conv_2b_3x3 = ConvNormActivation(in_channels=32,
+                                              out_channels=64,
+                                              kernel_size=3,
+                                              padding=1,
+                                              activation_layer=nn.ReLU)
 
         self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
-        self.conv_3b_1x1 = ConvNormActivation(
-            in_channels=64,
-            out_channels=80,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.conv_4a_3x3 = ConvNormActivation(
-            in_channels=80,
-            out_channels=192,
-            kernel_size=3,
-            padding=0,
-            activation_layer=nn.ReLU)
+        self.conv_3b_1x1 = ConvNormActivation(in_channels=64,
+                                              out_channels=80,
+                                              kernel_size=1,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+        self.conv_4a_3x3 = ConvNormActivation(in_channels=80,
+                                              out_channels=192,
+                                              kernel_size=3,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
 
     def forward(self, x):
         x = self.conv_1a_3x3(x)
@@ -86,55 +82,51 @@ def forward(self, x):
 
 
 class InceptionA(nn.Layer):
+
     def __init__(self, num_channels, pool_features):
         super().__init__()
-        self.branch1x1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=64,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-
-        self.branch5x5_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=48,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch5x5_2 = ConvNormActivation(
-            in_channels=48,
-            out_channels=64,
-            kernel_size=5,
-            padding=2,
-            activation_layer=nn.ReLU)
-
-        self.branch3x3dbl_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=64,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch3x3dbl_2 = ConvNormActivation(
-            in_channels=64,
-            out_channels=96,
-            kernel_size=3,
-            padding=1,
-            activation_layer=nn.ReLU)
-        self.branch3x3dbl_3 = ConvNormActivation(
-            in_channels=96,
-            out_channels=96,
-            kernel_size=3,
-            padding=1,
-            activation_layer=nn.ReLU)
-
-        self.branch_pool = AvgPool2D(
-            kernel_size=3, stride=1, padding=1, exclusive=False)
-        self.branch_pool_conv = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=pool_features,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
+        self.branch1x1 = ConvNormActivation(in_channels=num_channels,
+                                            out_channels=64,
+                                            kernel_size=1,
+                                            padding=0,
+                                            activation_layer=nn.ReLU)
+
+        self.branch5x5_1 = ConvNormActivation(in_channels=num_channels,
+                                              out_channels=48,
+                                              kernel_size=1,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+        self.branch5x5_2 = ConvNormActivation(in_channels=48,
+                                              out_channels=64,
+                                              kernel_size=5,
+                                              padding=2,
+                                              activation_layer=nn.ReLU)
+
+        self.branch3x3dbl_1 = ConvNormActivation(in_channels=num_channels,
+                                                 out_channels=64,
+                                                 kernel_size=1,
+                                                 padding=0,
+                                                 activation_layer=nn.ReLU)
+        self.branch3x3dbl_2 = ConvNormActivation(in_channels=64,
+                                                 out_channels=96,
+                                                 kernel_size=3,
+                                                 padding=1,
+                                                 activation_layer=nn.ReLU)
+        self.branch3x3dbl_3 = ConvNormActivation(in_channels=96,
+                                                 out_channels=96,
+                                                 kernel_size=3,
+                                                 padding=1,
+                                                 activation_layer=nn.ReLU)
+
+        self.branch_pool = AvgPool2D(kernel_size=3,
+                                     stride=1,
+                                     padding=1,
+                                     exclusive=False)
+        self.branch_pool_conv = ConvNormActivation(in_channels=num_channels,
+                                                   out_channels=pool_features,
+                                                   kernel_size=1,
+                                                   padding=0,
+                                                   activation_layer=nn.ReLU)
 
     def forward(self, x):
         branch1x1 = self.branch1x1(x)
@@ -147,41 +139,38 @@ def forward(self, x):
 
         branch_pool = self.branch_pool(x)
         branch_pool = self.branch_pool_conv(branch_pool)
-        x = paddle.concat(
-            [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
+        x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool],
+                          axis=1)
         return x
 
 
 class InceptionB(nn.Layer):
+
     def __init__(self, num_channels):
         super().__init__()
-        self.branch3x3 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=384,
-            kernel_size=3,
-            stride=2,
-            padding=0,
-            activation_layer=nn.ReLU)
-
-        self.branch3x3dbl_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=64,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch3x3dbl_2 = ConvNormActivation(
-            in_channels=64,
-            out_channels=96,
-            kernel_size=3,
-            padding=1,
-            activation_layer=nn.ReLU)
-        self.branch3x3dbl_3 = ConvNormActivation(
-            in_channels=96,
-            out_channels=96,
-            kernel_size=3,
-            stride=2,
-            padding=0,
-            activation_layer=nn.ReLU)
+        self.branch3x3 = ConvNormActivation(in_channels=num_channels,
+                                            out_channels=384,
+                                            kernel_size=3,
+                                            stride=2,
+                                            padding=0,
+                                            activation_layer=nn.ReLU)
+
+        self.branch3x3dbl_1 = ConvNormActivation(in_channels=num_channels,
+                                                 out_channels=64,
+                                                 kernel_size=1,
+                                                 padding=0,
+                                                 activation_layer=nn.ReLU)
+        self.branch3x3dbl_2 = ConvNormActivation(in_channels=64,
+                                                 out_channels=96,
+                                                 kernel_size=3,
+                                                 padding=1,
+                                                 activation_layer=nn.ReLU)
+        self.branch3x3dbl_3 = ConvNormActivation(in_channels=96,
+                                                 out_channels=96,
+                                                 kernel_size=3,
+                                                 stride=2,
+                                                 padding=0,
+                                                 activation_layer=nn.ReLU)
 
         self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
 
@@ -200,76 +189,69 @@ def forward(self, x):
 
 
 class InceptionC(nn.Layer):
+
     def __init__(self, num_channels, channels_7x7):
         super().__init__()
-        self.branch1x1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=192,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-
-        self.branch7x7_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=channels_7x7,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch7x7_2 = ConvNormActivation(
-            in_channels=channels_7x7,
-            out_channels=channels_7x7,
-            kernel_size=(1, 7),
-            stride=1,
-            padding=(0, 3),
-            activation_layer=nn.ReLU)
-        self.branch7x7_3 = ConvNormActivation(
-            in_channels=channels_7x7,
-            out_channels=192,
-            kernel_size=(7, 1),
-            stride=1,
-            padding=(3, 0),
-            activation_layer=nn.ReLU)
-
-        self.branch7x7dbl_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=channels_7x7,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch7x7dbl_2 = ConvNormActivation(
-            in_channels=channels_7x7,
-            out_channels=channels_7x7,
-            kernel_size=(7, 1),
-            padding=(3, 0),
-            activation_layer=nn.ReLU)
-        self.branch7x7dbl_3 = ConvNormActivation(
-            in_channels=channels_7x7,
-            out_channels=channels_7x7,
-            kernel_size=(1, 7),
-            padding=(0, 3),
-            activation_layer=nn.ReLU)
-        self.branch7x7dbl_4 = ConvNormActivation(
-            in_channels=channels_7x7,
-            out_channels=channels_7x7,
-            kernel_size=(7, 1),
-            padding=(3, 0),
-            activation_layer=nn.ReLU)
-        self.branch7x7dbl_5 = ConvNormActivation(
-            in_channels=channels_7x7,
-            out_channels=192,
-            kernel_size=(1, 7),
-            padding=(0, 3),
-            activation_layer=nn.ReLU)
-
-        self.branch_pool = AvgPool2D(
-            kernel_size=3, stride=1, padding=1, exclusive=False)
-        self.branch_pool_conv = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=192,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
+        self.branch1x1 = ConvNormActivation(in_channels=num_channels,
+                                            out_channels=192,
+                                            kernel_size=1,
+                                            padding=0,
+                                            activation_layer=nn.ReLU)
+
+        self.branch7x7_1 = ConvNormActivation(in_channels=num_channels,
+                                              out_channels=channels_7x7,
+                                              kernel_size=1,
+                                              stride=1,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+        self.branch7x7_2 = ConvNormActivation(in_channels=channels_7x7,
+                                              out_channels=channels_7x7,
+                                              kernel_size=(1, 7),
+                                              stride=1,
+                                              padding=(0, 3),
+                                              activation_layer=nn.ReLU)
+        self.branch7x7_3 = ConvNormActivation(in_channels=channels_7x7,
+                                              out_channels=192,
+                                              kernel_size=(7, 1),
+                                              stride=1,
+                                              padding=(3, 0),
+                                              activation_layer=nn.ReLU)
+
+        self.branch7x7dbl_1 = ConvNormActivation(in_channels=num_channels,
+                                                 out_channels=channels_7x7,
+                                                 kernel_size=1,
+                                                 padding=0,
+                                                 activation_layer=nn.ReLU)
+        self.branch7x7dbl_2 = ConvNormActivation(in_channels=channels_7x7,
+                                                 out_channels=channels_7x7,
+                                                 kernel_size=(7, 1),
+                                                 padding=(3, 0),
+                                                 activation_layer=nn.ReLU)
+        self.branch7x7dbl_3 = ConvNormActivation(in_channels=channels_7x7,
+                                                 out_channels=channels_7x7,
+                                                 kernel_size=(1, 7),
+                                                 padding=(0, 3),
+                                                 activation_layer=nn.ReLU)
+        self.branch7x7dbl_4 = ConvNormActivation(in_channels=channels_7x7,
+                                                 out_channels=channels_7x7,
+                                                 kernel_size=(7, 1),
+                                                 padding=(3, 0),
+                                                 activation_layer=nn.ReLU)
+        self.branch7x7dbl_5 = ConvNormActivation(in_channels=channels_7x7,
+                                                 out_channels=192,
+                                                 kernel_size=(1, 7),
+                                                 padding=(0, 3),
+                                                 activation_layer=nn.ReLU)
+
+        self.branch_pool = AvgPool2D(kernel_size=3,
+                                     stride=1,
+                                     padding=1,
+                                     exclusive=False)
+        self.branch_pool_conv = ConvNormActivation(in_channels=num_channels,
+                                                   out_channels=192,
+                                                   kernel_size=1,
+                                                   padding=0,
+                                                   activation_layer=nn.ReLU)
 
     def forward(self, x):
         branch1x1 = self.branch1x1(x)
@@ -287,54 +269,49 @@ def forward(self, x):
         branch_pool = self.branch_pool(x)
         branch_pool = self.branch_pool_conv(branch_pool)
 
-        x = paddle.concat(
-            [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
+        x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool],
+                          axis=1)
 
         return x
 
 
 class InceptionD(nn.Layer):
+
     def __init__(self, num_channels):
         super().__init__()
-        self.branch3x3_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=192,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch3x3_2 = ConvNormActivation(
-            in_channels=192,
-            out_channels=320,
-            kernel_size=3,
-            stride=2,
-            padding=0,
-            activation_layer=nn.ReLU)
-
-        self.branch7x7x3_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=192,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch7x7x3_2 = ConvNormActivation(
-            in_channels=192,
-            out_channels=192,
-            kernel_size=(1, 7),
-            padding=(0, 3),
-            activation_layer=nn.ReLU)
-        self.branch7x7x3_3 = ConvNormActivation(
-            in_channels=192,
-            out_channels=192,
-            kernel_size=(7, 1),
-            padding=(3, 0),
-            activation_layer=nn.ReLU)
-        self.branch7x7x3_4 = ConvNormActivation(
-            in_channels=192,
-            out_channels=192,
-            kernel_size=3,
-            stride=2,
-            padding=0,
-            activation_layer=nn.ReLU)
+        self.branch3x3_1 = ConvNormActivation(in_channels=num_channels,
+                                              out_channels=192,
+                                              kernel_size=1,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+        self.branch3x3_2 = ConvNormActivation(in_channels=192,
+                                              out_channels=320,
+                                              kernel_size=3,
+                                              stride=2,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+
+        self.branch7x7x3_1 = ConvNormActivation(in_channels=num_channels,
+                                                out_channels=192,
+                                                kernel_size=1,
+                                                padding=0,
+                                                activation_layer=nn.ReLU)
+        self.branch7x7x3_2 = ConvNormActivation(in_channels=192,
+                                                out_channels=192,
+                                                kernel_size=(1, 7),
+                                                padding=(0, 3),
+                                                activation_layer=nn.ReLU)
+        self.branch7x7x3_3 = ConvNormActivation(in_channels=192,
+                                                out_channels=192,
+                                                kernel_size=(7, 1),
+                                                padding=(3, 0),
+                                                activation_layer=nn.ReLU)
+        self.branch7x7x3_4 = ConvNormActivation(in_channels=192,
+                                                out_channels=192,
+                                                kernel_size=3,
+                                                stride=2,
+                                                padding=0,
+                                                activation_layer=nn.ReLU)
 
         self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
 
@@ -354,66 +331,60 @@ def forward(self, x):
 
 
 class InceptionE(nn.Layer):
+
     def __init__(self, num_channels):
         super().__init__()
-        self.branch1x1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=320,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch3x3_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=384,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch3x3_2a = ConvNormActivation(
-            in_channels=384,
-            out_channels=384,
-            kernel_size=(1, 3),
-            padding=(0, 1),
-            activation_layer=nn.ReLU)
-        self.branch3x3_2b = ConvNormActivation(
-            in_channels=384,
-            out_channels=384,
-            kernel_size=(3, 1),
-            padding=(1, 0),
-            activation_layer=nn.ReLU)
-
-        self.branch3x3dbl_1 = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=448,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
-        self.branch3x3dbl_2 = ConvNormActivation(
-            in_channels=448,
-            out_channels=384,
-            kernel_size=3,
-            padding=1,
-            activation_layer=nn.ReLU)
-        self.branch3x3dbl_3a = ConvNormActivation(
-            in_channels=384,
-            out_channels=384,
-            kernel_size=(1, 3),
-            padding=(0, 1),
-            activation_layer=nn.ReLU)
-        self.branch3x3dbl_3b = ConvNormActivation(
-            in_channels=384,
-            out_channels=384,
-            kernel_size=(3, 1),
-            padding=(1, 0),
-            activation_layer=nn.ReLU)
-
-        self.branch_pool = AvgPool2D(
-            kernel_size=3, stride=1, padding=1, exclusive=False)
-        self.branch_pool_conv = ConvNormActivation(
-            in_channels=num_channels,
-            out_channels=192,
-            kernel_size=1,
-            padding=0,
-            activation_layer=nn.ReLU)
+        self.branch1x1 = ConvNormActivation(in_channels=num_channels,
+                                            out_channels=320,
+                                            kernel_size=1,
+                                            padding=0,
+                                            activation_layer=nn.ReLU)
+        self.branch3x3_1 = ConvNormActivation(in_channels=num_channels,
+                                              out_channels=384,
+                                              kernel_size=1,
+                                              padding=0,
+                                              activation_layer=nn.ReLU)
+        self.branch3x3_2a = ConvNormActivation(in_channels=384,
+                                               out_channels=384,
+                                               kernel_size=(1, 3),
+                                               padding=(0, 1),
+                                               activation_layer=nn.ReLU)
+        self.branch3x3_2b = ConvNormActivation(in_channels=384,
+                                               out_channels=384,
+                                               kernel_size=(3, 1),
+                                               padding=(1, 0),
+                                               activation_layer=nn.ReLU)
+
+        self.branch3x3dbl_1 = ConvNormActivation(in_channels=num_channels,
+                                                 out_channels=448,
+                                                 kernel_size=1,
+                                                 padding=0,
+                                                 activation_layer=nn.ReLU)
+        self.branch3x3dbl_2 = ConvNormActivation(in_channels=448,
+                                                 out_channels=384,
+                                                 kernel_size=3,
+                                                 padding=1,
+                                                 activation_layer=nn.ReLU)
+        self.branch3x3dbl_3a = ConvNormActivation(in_channels=384,
+                                                  out_channels=384,
+                                                  kernel_size=(1, 3),
+                                                  padding=(0, 1),
+                                                  activation_layer=nn.ReLU)
+        self.branch3x3dbl_3b = ConvNormActivation(in_channels=384,
+                                                  out_channels=384,
+                                                  kernel_size=(3, 1),
+                                                  padding=(1, 0),
+                                                  activation_layer=nn.ReLU)
+
+        self.branch_pool = AvgPool2D(kernel_size=3,
+                                     stride=1,
+                                     padding=1,
+                                     exclusive=False)
+        self.branch_pool_conv = ConvNormActivation(in_channels=num_channels,
+                                                   out_channels=192,
+                                                   kernel_size=1,
+                                                   padding=0,
+                                                   activation_layer=nn.ReLU)
 
     def forward(self, x):
         branch1x1 = self.branch1x1(x)
@@ -436,8 +407,8 @@ def forward(self, x):
         branch_pool = self.branch_pool(x)
         branch_pool = self.branch_pool_conv(branch_pool)
 
-        x = paddle.concat(
-            [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
+        x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool],
+                          axis=1)
         return x
 
 
diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py
index 46212f46f3a48..a526bb719ef96 100644
--- a/python/paddle/vision/models/lenet.py
+++ b/python/paddle/vision/models/lenet.py
@@ -37,20 +37,14 @@ class LeNet(nn.Layer):
     def __init__(self, num_classes=10):
         super(LeNet, self).__init__()
         self.num_classes = num_classes
-        self.features = nn.Sequential(
-            nn.Conv2D(
-                1, 6, 3, stride=1, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2D(2, 2),
-            nn.Conv2D(
-                6, 16, 5, stride=1, padding=0),
-            nn.ReLU(),
-            nn.MaxPool2D(2, 2))
+        self.features = nn.Sequential(nn.Conv2D(1, 6, 3, stride=1, padding=1),
+                                      nn.ReLU(), nn.MaxPool2D(2, 2),
+                                      nn.Conv2D(6, 16, 5, stride=1, padding=0),
+                                      nn.ReLU(), nn.MaxPool2D(2, 2))
 
         if num_classes > 0:
-            self.fc = nn.Sequential(
-                nn.Linear(400, 120),
-                nn.Linear(120, 84), nn.Linear(84, num_classes))
+            self.fc = nn.Sequential(nn.Linear(400, 120), nn.Linear(120, 84),
+                                    nn.Linear(84, num_classes))
 
     def forward(self, inputs):
         x = self.features(inputs)
diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py
index 6d8d96952fab4..e8e4994a75be4 100644
--- a/python/paddle/vision/models/mobilenetv1.py
+++ b/python/paddle/vision/models/mobilenetv1.py
@@ -28,24 +28,24 @@
 
 
 class DepthwiseSeparable(nn.Layer):
+
     def __init__(self, in_channels, out_channels1, out_channels2, num_groups,
                  stride, scale):
         super(DepthwiseSeparable, self).__init__()
 
-        self._depthwise_conv = ConvNormActivation(
-            in_channels,
-            int(out_channels1 * scale),
-            kernel_size=3,
-            stride=stride,
-            padding=1,
-            groups=int(num_groups * scale))
-
-        self._pointwise_conv = ConvNormActivation(
-            int(out_channels1 * scale),
-            int(out_channels2 * scale),
-            kernel_size=1,
-            stride=1,
-            padding=0)
+        self._depthwise_conv = ConvNormActivation(in_channels,
+                                                  int(out_channels1 * scale),
+                                                  kernel_size=3,
+                                                  stride=stride,
+                                                  padding=1,
+                                                  groups=int(num_groups *
+                                                             scale))
+
+        self._pointwise_conv = ConvNormActivation(int(out_channels1 * scale),
+                                                  int(out_channels2 * scale),
+                                                  kernel_size=1,
+                                                  stride=1,
+                                                  padding=0)
 
     def forward(self, x):
         x = self._depthwise_conv(x)
@@ -84,111 +84,101 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
         self.num_classes = num_classes
         self.with_pool = with_pool
 
-        self.conv1 = ConvNormActivation(
-            in_channels=3,
-            out_channels=int(32 * scale),
-            kernel_size=3,
-            stride=2,
-            padding=1)
-
-        dws21 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                in_channels=int(32 * scale),
-                out_channels1=32,
-                out_channels2=64,
-                num_groups=32,
-                stride=1,
-                scale=scale),
-            name="conv2_1")
+        self.conv1 = ConvNormActivation(in_channels=3,
+                                        out_channels=int(32 * scale),
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1)
+
+        dws21 = self.add_sublayer(sublayer=DepthwiseSeparable(in_channels=int(
+            32 * scale),
+                                                              out_channels1=32,
+                                                              out_channels2=64,
+                                                              num_groups=32,
+                                                              stride=1,
+                                                              scale=scale),
+                                  name="conv2_1")
         self.dwsl.append(dws21)
 
-        dws22 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                in_channels=int(64 * scale),
-                out_channels1=64,
-                out_channels2=128,
-                num_groups=64,
-                stride=2,
-                scale=scale),
-            name="conv2_2")
+        dws22 = self.add_sublayer(sublayer=DepthwiseSeparable(in_channels=int(
+            64 * scale),
+                                                              out_channels1=64,
+                                                              out_channels2=128,
+                                                              num_groups=64,
+                                                              stride=2,
+                                                              scale=scale),
+                                  name="conv2_2")
         self.dwsl.append(dws22)
 
-        dws31 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                in_channels=int(128 * scale),
-                out_channels1=128,
-                out_channels2=128,
-                num_groups=128,
-                stride=1,
-                scale=scale),
-            name="conv3_1")
+        dws31 = self.add_sublayer(sublayer=DepthwiseSeparable(in_channels=int(
+            128 * scale),
+                                                              out_channels1=128,
+                                                              out_channels2=128,
+                                                              num_groups=128,
+                                                              stride=1,
+                                                              scale=scale),
+                                  name="conv3_1")
         self.dwsl.append(dws31)
 
-        dws32 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                in_channels=int(128 * scale),
-                out_channels1=128,
-                out_channels2=256,
-                num_groups=128,
-                stride=2,
-                scale=scale),
-            name="conv3_2")
+        dws32 = self.add_sublayer(sublayer=DepthwiseSeparable(in_channels=int(
+            128 * scale),
+                                                              out_channels1=128,
+                                                              out_channels2=256,
+                                                              num_groups=128,
+                                                              stride=2,
+                                                              scale=scale),
+                                  name="conv3_2")
         self.dwsl.append(dws32)
 
-        dws41 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                in_channels=int(256 * scale),
-                out_channels1=256,
-                out_channels2=256,
-                num_groups=256,
-                stride=1,
-                scale=scale),
-            name="conv4_1")
+        dws41 = self.add_sublayer(sublayer=DepthwiseSeparable(in_channels=int(
+            256 * scale),
+                                                              out_channels1=256,
+                                                              out_channels2=256,
+                                                              num_groups=256,
+                                                              stride=1,
+                                                              scale=scale),
+                                  name="conv4_1")
         self.dwsl.append(dws41)
 
-        dws42 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                in_channels=int(256 * scale),
-                out_channels1=256,
-                out_channels2=512,
-                num_groups=256,
-                stride=2,
-                scale=scale),
-            name="conv4_2")
+        dws42 = self.add_sublayer(sublayer=DepthwiseSeparable(in_channels=int(
+            256 * scale),
+                                                              out_channels1=256,
+                                                              out_channels2=512,
+                                                              num_groups=256,
+                                                              stride=2,
+                                                              scale=scale),
+                                  name="conv4_2")
         self.dwsl.append(dws42)
 
         for i in range(5):
-            tmp = self.add_sublayer(
-                sublayer=DepthwiseSeparable(
-                    in_channels=int(512 * scale),
-                    out_channels1=512,
-                    out_channels2=512,
-                    num_groups=512,
-                    stride=1,
-                    scale=scale),
-                name="conv5_" + str(i + 1))
-            self.dwsl.append(tmp)
-
-        dws56 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
+            tmp = self.add_sublayer(sublayer=DepthwiseSeparable(
                 in_channels=int(512 * scale),
                 out_channels1=512,
-                out_channels2=1024,
+                out_channels2=512,
                 num_groups=512,
-                stride=2,
+                stride=1,
                 scale=scale),
-            name="conv5_6")
+                                    name="conv5_" + str(i + 1))
+            self.dwsl.append(tmp)
+
+        dws56 = self.add_sublayer(sublayer=DepthwiseSeparable(
+            in_channels=int(512 * scale),
+            out_channels1=512,
+            out_channels2=1024,
+            num_groups=512,
+            stride=2,
+            scale=scale),
+                                  name="conv5_6")
         self.dwsl.append(dws56)
 
-        dws6 = self.add_sublayer(
-            sublayer=DepthwiseSeparable(
-                in_channels=int(1024 * scale),
-                out_channels1=1024,
-                out_channels2=1024,
-                num_groups=1024,
-                stride=1,
-                scale=scale),
-            name="conv6")
+        dws6 = self.add_sublayer(sublayer=DepthwiseSeparable(in_channels=int(
+            1024 * scale),
+                                                             out_channels1=1024,
+                                                             out_channels2=1024,
+                                                             num_groups=1024,
+                                                             stride=1,
+                                                             scale=scale),
+                                 name="conv6")
         self.dwsl.append(dws6)
 
         if with_pool:
@@ -252,6 +242,8 @@ def mobilenet_v1(pretrained=False, scale=1.0, **kwargs):
 
             print(out.shape)
     """
-    model = _mobilenet(
-        'mobilenetv1_' + str(scale), pretrained, scale=scale, **kwargs)
+    model = _mobilenet('mobilenetv1_' + str(scale),
+                       pretrained,
+                       scale=scale,
+                       **kwargs)
     return model
diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py
index 9791462610deb..f9111185de63d 100644
--- a/python/paddle/vision/models/mobilenetv2.py
+++ b/python/paddle/vision/models/mobilenetv2.py
@@ -29,6 +29,7 @@
 
 
 class InvertedResidual(nn.Layer):
+
     def __init__(self,
                  inp,
                  oup,
@@ -45,22 +46,19 @@ def __init__(self,
         layers = []
         if expand_ratio != 1:
             layers.append(
-                ConvNormActivation(
-                    inp,
-                    hidden_dim,
-                    kernel_size=1,
-                    norm_layer=norm_layer,
-                    activation_layer=nn.ReLU6))
+                ConvNormActivation(inp,
+                                   hidden_dim,
+                                   kernel_size=1,
+                                   norm_layer=norm_layer,
+                                   activation_layer=nn.ReLU6))
         layers.extend([
-            ConvNormActivation(
-                hidden_dim,
-                hidden_dim,
-                stride=stride,
-                groups=hidden_dim,
-                norm_layer=norm_layer,
-                activation_layer=nn.ReLU6),
-            nn.Conv2D(
-                hidden_dim, oup, 1, 1, 0, bias_attr=False),
+            ConvNormActivation(hidden_dim,
+                               hidden_dim,
+                               stride=stride,
+                               groups=hidden_dim,
+                               norm_layer=norm_layer,
+                               activation_layer=nn.ReLU6),
+            nn.Conv2D(hidden_dim, oup, 1, 1, 0, bias_attr=False),
             norm_layer(oup),
         ])
         self.conv = nn.Sequential(*layers)
@@ -120,12 +118,11 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
         self.last_channel = _make_divisible(last_channel * max(1.0, scale),
                                             round_nearest)
         features = [
-            ConvNormActivation(
-                3,
-                input_channel,
-                stride=2,
-                norm_layer=norm_layer,
-                activation_layer=nn.ReLU6)
+            ConvNormActivation(3,
+                               input_channel,
+                               stride=2,
+                               norm_layer=norm_layer,
+                               activation_layer=nn.ReLU6)
         ]
 
         for t, c, n, s in inverted_residual_setting:
@@ -133,21 +130,19 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
             for i in range(n):
                 stride = s if i == 0 else 1
                 features.append(
-                    block(
-                        input_channel,
-                        output_channel,
-                        stride,
-                        expand_ratio=t,
-                        norm_layer=norm_layer))
+                    block(input_channel,
+                          output_channel,
+                          stride,
+                          expand_ratio=t,
+                          norm_layer=norm_layer))
                 input_channel = output_channel
 
         features.append(
-            ConvNormActivation(
-                input_channel,
-                self.last_channel,
-                kernel_size=1,
-                norm_layer=norm_layer,
-                activation_layer=nn.ReLU6))
+            ConvNormActivation(input_channel,
+                               self.last_channel,
+                               kernel_size=1,
+                               norm_layer=norm_layer,
+                               activation_layer=nn.ReLU6))
 
         self.features = nn.Sequential(*features)
 
@@ -211,6 +206,8 @@ def mobilenet_v2(pretrained=False, scale=1.0, **kwargs):
 
             print(out.shape)
     """
-    model = _mobilenet(
-        'mobilenetv2_' + str(scale), pretrained, scale=scale, **kwargs)
+    model = _mobilenet('mobilenetv2_' + str(scale),
+                       pretrained,
+                       scale=scale,
+                       **kwargs)
     return model
diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py
index 70aa1b833d648..0dd97755b62ad 100644
--- a/python/paddle/vision/models/mobilenetv3.py
+++ b/python/paddle/vision/models/mobilenetv3.py
@@ -74,6 +74,7 @@ def forward(self, input):
 
 
 class InvertedResidualConfig:
+
     def __init__(self,
                  in_channels,
                  kernel,
@@ -85,8 +86,8 @@ def __init__(self,
                  scale=1.0):
         self.in_channels = self.adjust_channels(in_channels, scale=scale)
         self.kernel = kernel
-        self.expanded_channels = self.adjust_channels(
-            expanded_channels, scale=scale)
+        self.expanded_channels = self.adjust_channels(expanded_channels,
+                                                      scale=scale)
         self.out_channels = self.adjust_channels(out_channels, scale=scale)
         self.use_se = use_se
         if activation is None:
@@ -96,8 +97,9 @@ def __init__(self,
         elif activation == "hardswish":
             self.activation_layer = nn.Hardswish
         else:
-            raise RuntimeError("The activation function is not supported: {}".
-                               format(activation))
+            raise RuntimeError(
+                "The activation function is not supported: {}".format(
+                    activation))
         self.stride = stride
 
     @staticmethod
@@ -106,6 +108,7 @@ def adjust_channels(channels, scale=1.0):
 
 
 class InvertedResidual(nn.Layer):
+
     def __init__(self, in_channels, expanded_channels, out_channels,
                  filter_size, stride, use_se, activation_layer, norm_layer):
         super().__init__()
@@ -134,19 +137,18 @@ def __init__(self, in_channels, expanded_channels, out_channels,
             activation_layer=activation_layer)
 
         if self.use_se:
-            self.mid_se = SqueezeExcitation(
-                expanded_channels,
-                _make_divisible(expanded_channels // 4),
-                scale_activation=nn.Hardsigmoid)
-
-        self.linear_conv = ConvNormActivation(
-            in_channels=expanded_channels,
-            out_channels=out_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            norm_layer=norm_layer,
-            activation_layer=None)
+            self.mid_se = SqueezeExcitation(expanded_channels,
+                                            _make_divisible(expanded_channels //
+                                                            4),
+                                            scale_activation=nn.Hardsigmoid)
+
+        self.linear_conv = ConvNormActivation(in_channels=expanded_channels,
+                                              out_channels=out_channels,
+                                              kernel_size=1,
+                                              stride=1,
+                                              padding=0,
+                                              norm_layer=norm_layer,
+                                              activation_layer=None)
 
     def forward(self, x):
         identity = x
@@ -192,26 +194,24 @@ def __init__(self,
         self.lastconv_out_channels = self.lastconv_in_channels * 6
         norm_layer = partial(nn.BatchNorm2D, epsilon=0.001, momentum=0.99)
 
-        self.conv = ConvNormActivation(
-            in_channels=3,
-            out_channels=self.firstconv_in_channels,
-            kernel_size=3,
-            stride=2,
-            padding=1,
-            groups=1,
-            activation_layer=nn.Hardswish,
-            norm_layer=norm_layer)
+        self.conv = ConvNormActivation(in_channels=3,
+                                       out_channels=self.firstconv_in_channels,
+                                       kernel_size=3,
+                                       stride=2,
+                                       padding=1,
+                                       groups=1,
+                                       activation_layer=nn.Hardswish,
+                                       norm_layer=norm_layer)
 
         self.blocks = nn.Sequential(*[
-            InvertedResidual(
-                in_channels=cfg.in_channels,
-                expanded_channels=cfg.expanded_channels,
-                out_channels=cfg.out_channels,
-                filter_size=cfg.kernel,
-                stride=cfg.stride,
-                use_se=cfg.use_se,
-                activation_layer=cfg.activation_layer,
-                norm_layer=norm_layer) for cfg in self.config
+            InvertedResidual(in_channels=cfg.in_channels,
+                             expanded_channels=cfg.expanded_channels,
+                             out_channels=cfg.out_channels,
+                             filter_size=cfg.kernel,
+                             stride=cfg.stride,
+                             use_se=cfg.use_se,
+                             activation_layer=cfg.activation_layer,
+                             norm_layer=norm_layer) for cfg in self.config
         ])
 
         self.lastconv = ConvNormActivation(
@@ -230,8 +230,7 @@ def __init__(self,
         if num_classes > 0:
             self.classifier = nn.Sequential(
                 nn.Linear(self.lastconv_out_channels, self.last_channel),
-                nn.Hardswish(),
-                nn.Dropout(p=0.2),
+                nn.Hardswish(), nn.Dropout(p=0.2),
                 nn.Linear(self.last_channel, num_classes))
 
     def forward(self, x):
@@ -289,12 +288,11 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
             InvertedResidualConfig(96, 5, 576, 96, True, "hardswish", 1, scale),
         ]
         last_channel = _make_divisible(1024 * scale, 8)
-        super().__init__(
-            config,
-            last_channel=last_channel,
-            scale=scale,
-            with_pool=with_pool,
-            num_classes=num_classes)
+        super().__init__(config,
+                         last_channel=last_channel,
+                         scale=scale,
+                         with_pool=with_pool,
+                         num_classes=num_classes)
 
 
 class MobileNetV3Large(MobileNetV3):
@@ -350,12 +348,11 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
                                    scale),
         ]
         last_channel = _make_divisible(1280 * scale, 8)
-        super().__init__(
-            config,
-            last_channel=last_channel,
-            scale=scale,
-            with_pool=with_pool,
-            num_classes=num_classes)
+        super().__init__(config,
+                         last_channel=last_channel,
+                         scale=scale,
+                         with_pool=with_pool,
+                         num_classes=num_classes)
 
 
 def _mobilenet_v3(arch, pretrained=False, scale=1.0, **kwargs):
@@ -406,8 +403,10 @@ def mobilenet_v3_small(pretrained=False, scale=1.0, **kwargs):
             print(out.shape)
 
     """
-    model = _mobilenet_v3(
-        "mobilenet_v3_small", scale=scale, pretrained=pretrained, **kwargs)
+    model = _mobilenet_v3("mobilenet_v3_small",
+                          scale=scale,
+                          pretrained=pretrained,
+                          **kwargs)
     return model
 
 
@@ -440,6 +439,8 @@ def mobilenet_v3_large(pretrained=False, scale=1.0, **kwargs):
             print(out.shape)
 
     """
-    model = _mobilenet_v3(
-        "mobilenet_v3_large", scale=scale, pretrained=pretrained, **kwargs)
+    model = _mobilenet_v3("mobilenet_v3_large",
+                          scale=scale,
+                          pretrained=pretrained,
+                          **kwargs)
     return model
diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py
index 27536b6a9c64f..ba58fe7f57d50 100644
--- a/python/paddle/vision/models/resnet.py
+++ b/python/paddle/vision/models/resnet.py
@@ -39,24 +39,24 @@
     "resnext50_64x4d":
     ('https://paddle-hapi.bj.bcebos.com/models/resnext50_64x4d.pdparams',
      '063d4b483e12b06388529450ad7576db'),
-    'resnext101_32x4d': (
-        'https://paddle-hapi.bj.bcebos.com/models/resnext101_32x4d.pdparams',
-        '967b090039f9de2c8d06fe994fb9095f'),
-    'resnext101_64x4d': (
-        'https://paddle-hapi.bj.bcebos.com/models/resnext101_64x4d.pdparams',
-        '98e04e7ca616a066699230d769d03008'),
-    'resnext152_32x4d': (
-        'https://paddle-hapi.bj.bcebos.com/models/resnext152_32x4d.pdparams',
-        '18ff0beee21f2efc99c4b31786107121'),
-    'resnext152_64x4d': (
-        'https://paddle-hapi.bj.bcebos.com/models/resnext152_64x4d.pdparams',
-        '77c4af00ca42c405fa7f841841959379'),
-    'wide_resnet50_2': (
-        'https://paddle-hapi.bj.bcebos.com/models/wide_resnet50_2.pdparams',
-        '0282f804d73debdab289bd9fea3fa6dc'),
-    'wide_resnet101_2': (
-        'https://paddle-hapi.bj.bcebos.com/models/wide_resnet101_2.pdparams',
-        'd4360a2d23657f059216f5d5a1a9ac93'),
+    'resnext101_32x4d':
+    ('https://paddle-hapi.bj.bcebos.com/models/resnext101_32x4d.pdparams',
+     '967b090039f9de2c8d06fe994fb9095f'),
+    'resnext101_64x4d':
+    ('https://paddle-hapi.bj.bcebos.com/models/resnext101_64x4d.pdparams',
+     '98e04e7ca616a066699230d769d03008'),
+    'resnext152_32x4d':
+    ('https://paddle-hapi.bj.bcebos.com/models/resnext152_32x4d.pdparams',
+     '18ff0beee21f2efc99c4b31786107121'),
+    'resnext152_64x4d':
+    ('https://paddle-hapi.bj.bcebos.com/models/resnext152_64x4d.pdparams',
+     '77c4af00ca42c405fa7f841841959379'),
+    'wide_resnet50_2':
+    ('https://paddle-hapi.bj.bcebos.com/models/wide_resnet50_2.pdparams',
+     '0282f804d73debdab289bd9fea3fa6dc'),
+    'wide_resnet101_2':
+    ('https://paddle-hapi.bj.bcebos.com/models/wide_resnet101_2.pdparams',
+     'd4360a2d23657f059216f5d5a1a9ac93'),
 }
 
 
@@ -80,8 +80,12 @@ def __init__(self,
             raise NotImplementedError(
                 "Dilation > 1 not supported in BasicBlock")
 
-        self.conv1 = nn.Conv2D(
-            inplanes, planes, 3, padding=1, stride=stride, bias_attr=False)
+        self.conv1 = nn.Conv2D(inplanes,
+                               planes,
+                               3,
+                               padding=1,
+                               stride=stride,
+                               bias_attr=False)
         self.bn1 = norm_layer(planes)
         self.relu = nn.ReLU()
         self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False)
@@ -129,19 +133,20 @@ def __init__(self,
         self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False)
         self.bn1 = norm_layer(width)
 
-        self.conv2 = nn.Conv2D(
-            width,
-            width,
-            3,
-            padding=dilation,
-            stride=stride,
-            groups=groups,
-            dilation=dilation,
-            bias_attr=False)
+        self.conv2 = nn.Conv2D(width,
+                               width,
+                               3,
+                               padding=dilation,
+                               stride=stride,
+                               groups=groups,
+                               dilation=dilation,
+                               bias_attr=False)
         self.bn2 = norm_layer(width)
 
-        self.conv3 = nn.Conv2D(
-            width, planes * self.expansion, 1, bias_attr=False)
+        self.conv3 = nn.Conv2D(width,
+                               planes * self.expansion,
+                               1,
+                               bias_attr=False)
         self.bn3 = norm_layer(planes * self.expansion)
         self.relu = nn.ReLU()
         self.downsample = downsample
@@ -235,13 +240,12 @@ def __init__(self,
         self.inplanes = 64
         self.dilation = 1
 
-        self.conv1 = nn.Conv2D(
-            3,
-            self.inplanes,
-            kernel_size=7,
-            stride=2,
-            padding=3,
-            bias_attr=False)
+        self.conv1 = nn.Conv2D(3,
+                               self.inplanes,
+                               kernel_size=7,
+                               stride=2,
+                               padding=3,
+                               bias_attr=False)
         self.bn1 = self._norm_layer(self.inplanes)
         self.relu = nn.ReLU()
         self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
@@ -264,13 +268,13 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
             stride = 1
         if stride != 1 or self.inplanes != planes * block.expansion:
             downsample = nn.Sequential(
-                nn.Conv2D(
-                    self.inplanes,
-                    planes * block.expansion,
-                    1,
-                    stride=stride,
-                    bias_attr=False),
-                norm_layer(planes * block.expansion), )
+                nn.Conv2D(self.inplanes,
+                          planes * block.expansion,
+                          1,
+                          stride=stride,
+                          bias_attr=False),
+                norm_layer(planes * block.expansion),
+            )
 
         layers = []
         layers.append(
@@ -279,12 +283,11 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
         self.inplanes = planes * block.expansion
         for _ in range(1, blocks):
             layers.append(
-                block(
-                    self.inplanes,
-                    planes,
-                    groups=self.groups,
-                    base_width=self.base_width,
-                    norm_layer=norm_layer))
+                block(self.inplanes,
+                      planes,
+                      groups=self.groups,
+                      base_width=self.base_width,
+                      norm_layer=norm_layer))
 
         return nn.Sequential(*layers)
 
diff --git a/python/paddle/vision/models/shufflenetv2.py b/python/paddle/vision/models/shufflenetv2.py
index 90e967ee22b35..60304b954987c 100644
--- a/python/paddle/vision/models/shufflenetv2.py
+++ b/python/paddle/vision/models/shufflenetv2.py
@@ -28,25 +28,32 @@
 model_urls = {
     "shufflenet_v2_x0_25": (
         "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x0_25.pdparams",
-        "1e509b4c140eeb096bb16e214796d03b", ),
+        "1e509b4c140eeb096bb16e214796d03b",
+    ),
     "shufflenet_v2_x0_33": (
         "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x0_33.pdparams",
-        "3d7b3ab0eaa5c0927ff1026d31b729bd", ),
+        "3d7b3ab0eaa5c0927ff1026d31b729bd",
+    ),
     "shufflenet_v2_x0_5": (
         "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x0_5.pdparams",
-        "5e5cee182a7793c4e4c73949b1a71bd4", ),
+        "5e5cee182a7793c4e4c73949b1a71bd4",
+    ),
     "shufflenet_v2_x1_0": (
         "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x1_0.pdparams",
-        "122d42478b9e81eb49f8a9ede327b1a4", ),
+        "122d42478b9e81eb49f8a9ede327b1a4",
+    ),
     "shufflenet_v2_x1_5": (
         "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x1_5.pdparams",
-        "faced5827380d73531d0ee027c67826d", ),
+        "faced5827380d73531d0ee027c67826d",
+    ),
     "shufflenet_v2_x2_0": (
         "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_x2_0.pdparams",
-        "cd3dddcd8305e7bcd8ad14d1c69a5784", ),
+        "cd3dddcd8305e7bcd8ad14d1c69a5784",
+    ),
     "shufflenet_v2_swish": (
         "https://paddle-hapi.bj.bcebos.com/models/shufflenet_v2_swish.pdparams",
-        "adde0aa3b023e5b0c94a68be1c394b84", ),
+        "adde0aa3b023e5b0c94a68be1c394b84",
+    ),
 }
 
 
@@ -79,28 +86,27 @@ def channel_shuffle(x, groups):
 
 
 class InvertedResidual(nn.Layer):
+
     def __init__(self,
                  in_channels,
                  out_channels,
                  stride,
                  activation_layer=nn.ReLU):
         super(InvertedResidual, self).__init__()
-        self._conv_pw = ConvNormActivation(
-            in_channels=in_channels // 2,
-            out_channels=out_channels // 2,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            groups=1,
-            activation_layer=activation_layer)
-        self._conv_dw = ConvNormActivation(
-            in_channels=out_channels // 2,
-            out_channels=out_channels // 2,
-            kernel_size=3,
-            stride=stride,
-            padding=1,
-            groups=out_channels // 2,
-            activation_layer=None)
+        self._conv_pw = ConvNormActivation(in_channels=in_channels // 2,
+                                           out_channels=out_channels // 2,
+                                           kernel_size=1,
+                                           stride=1,
+                                           padding=0,
+                                           groups=1,
+                                           activation_layer=activation_layer)
+        self._conv_dw = ConvNormActivation(in_channels=out_channels // 2,
+                                           out_channels=out_channels // 2,
+                                           kernel_size=3,
+                                           stride=stride,
+                                           padding=1,
+                                           groups=out_channels // 2,
+                                           activation_layer=None)
         self._conv_linear = ConvNormActivation(
             in_channels=out_channels // 2,
             out_channels=out_channels // 2,
@@ -123,6 +129,7 @@ def forward(self, inputs):
 
 
 class InvertedResidualDS(nn.Layer):
+
     def __init__(self,
                  in_channels,
                  out_channels,
@@ -131,14 +138,13 @@ def __init__(self,
         super(InvertedResidualDS, self).__init__()
 
         # branch1
-        self._conv_dw_1 = ConvNormActivation(
-            in_channels=in_channels,
-            out_channels=in_channels,
-            kernel_size=3,
-            stride=stride,
-            padding=1,
-            groups=in_channels,
-            activation_layer=None)
+        self._conv_dw_1 = ConvNormActivation(in_channels=in_channels,
+                                             out_channels=in_channels,
+                                             kernel_size=3,
+                                             stride=stride,
+                                             padding=1,
+                                             groups=in_channels,
+                                             activation_layer=None)
         self._conv_linear_1 = ConvNormActivation(
             in_channels=in_channels,
             out_channels=out_channels // 2,
@@ -148,22 +154,20 @@ def __init__(self,
             groups=1,
             activation_layer=activation_layer)
         # branch2
-        self._conv_pw_2 = ConvNormActivation(
-            in_channels=in_channels,
-            out_channels=out_channels // 2,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            groups=1,
-            activation_layer=activation_layer)
-        self._conv_dw_2 = ConvNormActivation(
-            in_channels=out_channels // 2,
-            out_channels=out_channels // 2,
-            kernel_size=3,
-            stride=stride,
-            padding=1,
-            groups=out_channels // 2,
-            activation_layer=None)
+        self._conv_pw_2 = ConvNormActivation(in_channels=in_channels,
+                                             out_channels=out_channels // 2,
+                                             kernel_size=1,
+                                             stride=1,
+                                             padding=0,
+                                             groups=1,
+                                             activation_layer=activation_layer)
+        self._conv_dw_2 = ConvNormActivation(in_channels=out_channels // 2,
+                                             out_channels=out_channels // 2,
+                                             kernel_size=3,
+                                             stride=stride,
+                                             padding=1,
+                                             groups=out_channels // 2,
+                                             activation_layer=None)
         self._conv_linear_2 = ConvNormActivation(
             in_channels=out_channels // 2,
             out_channels=out_channels // 2,
@@ -232,13 +236,12 @@ def __init__(self, scale=1.0, act="relu", num_classes=1000, with_pool=True):
             raise NotImplementedError("This scale size:[" + str(scale) +
                                       "] is not implemented!")
         # 1. conv1
-        self._conv1 = ConvNormActivation(
-            in_channels=3,
-            out_channels=stage_out_channels[1],
-            kernel_size=3,
-            stride=2,
-            padding=1,
-            activation_layer=activation_layer)
+        self._conv1 = ConvNormActivation(in_channels=3,
+                                         out_channels=stage_out_channels[1],
+                                         kernel_size=3,
+                                         stride=2,
+                                         padding=1,
+                                         activation_layer=activation_layer)
         self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
 
         # 2. bottleneck sequences
@@ -246,21 +249,21 @@ def __init__(self, scale=1.0, act="relu", num_classes=1000, with_pool=True):
         for stage_id, num_repeat in enumerate(stage_repeats):
             for i in range(num_repeat):
                 if i == 0:
-                    block = self.add_sublayer(
-                        sublayer=InvertedResidualDS(
-                            in_channels=stage_out_channels[stage_id + 1],
-                            out_channels=stage_out_channels[stage_id + 2],
-                            stride=2,
-                            activation_layer=activation_layer),
-                        name=str(stage_id + 2) + "_" + str(i + 1))
+                    block = self.add_sublayer(sublayer=InvertedResidualDS(
+                        in_channels=stage_out_channels[stage_id + 1],
+                        out_channels=stage_out_channels[stage_id + 2],
+                        stride=2,
+                        activation_layer=activation_layer),
+                                              name=str(stage_id + 2) + "_" +
+                                              str(i + 1))
                 else:
-                    block = self.add_sublayer(
-                        sublayer=InvertedResidual(
-                            in_channels=stage_out_channels[stage_id + 2],
-                            out_channels=stage_out_channels[stage_id + 2],
-                            stride=1,
-                            activation_layer=activation_layer),
-                        name=str(stage_id + 2) + "_" + str(i + 1))
+                    block = self.add_sublayer(sublayer=InvertedResidual(
+                        in_channels=stage_out_channels[stage_id + 2],
+                        out_channels=stage_out_channels[stage_id + 2],
+                        stride=1,
+                        activation_layer=activation_layer),
+                                              name=str(stage_id + 2) + "_" +
+                                              str(i + 1))
                 self._block_list.append(block)
         # 3. last_conv
         self._last_conv = ConvNormActivation(
@@ -335,8 +338,10 @@ def shufflenet_v2_x0_25(pretrained=False, **kwargs):
             print(out.shape)
 
     """
-    return _shufflenet_v2(
-        "shufflenet_v2_x0_25", scale=0.25, pretrained=pretrained, **kwargs)
+    return _shufflenet_v2("shufflenet_v2_x0_25",
+                          scale=0.25,
+                          pretrained=pretrained,
+                          **kwargs)
 
 
 def shufflenet_v2_x0_33(pretrained=False, **kwargs):
@@ -364,8 +369,10 @@ def shufflenet_v2_x0_33(pretrained=False, **kwargs):
             print(out.shape)
 
     """
-    return _shufflenet_v2(
-        "shufflenet_v2_x0_33", scale=0.33, pretrained=pretrained, **kwargs)
+    return _shufflenet_v2("shufflenet_v2_x0_33",
+                          scale=0.33,
+                          pretrained=pretrained,
+                          **kwargs)
 
 
 def shufflenet_v2_x0_5(pretrained=False, **kwargs):
@@ -393,8 +400,10 @@ def shufflenet_v2_x0_5(pretrained=False, **kwargs):
             print(out.shape)
 
     """
-    return _shufflenet_v2(
-        "shufflenet_v2_x0_5", scale=0.5, pretrained=pretrained, **kwargs)
+    return _shufflenet_v2("shufflenet_v2_x0_5",
+                          scale=0.5,
+                          pretrained=pretrained,
+                          **kwargs)
 
 
 def shufflenet_v2_x1_0(pretrained=False, **kwargs):
@@ -422,8 +431,10 @@ def shufflenet_v2_x1_0(pretrained=False, **kwargs):
             print(out.shape)
 
     """
-    return _shufflenet_v2(
-        "shufflenet_v2_x1_0", scale=1.0, pretrained=pretrained, **kwargs)
+    return _shufflenet_v2("shufflenet_v2_x1_0",
+                          scale=1.0,
+                          pretrained=pretrained,
+                          **kwargs)
 
 
 def shufflenet_v2_x1_5(pretrained=False, **kwargs):
@@ -451,8 +462,10 @@ def shufflenet_v2_x1_5(pretrained=False, **kwargs):
             print(out.shape)
 
     """
-    return _shufflenet_v2(
-        "shufflenet_v2_x1_5", scale=1.5, pretrained=pretrained, **kwargs)
+    return _shufflenet_v2("shufflenet_v2_x1_5",
+                          scale=1.5,
+                          pretrained=pretrained,
+                          **kwargs)
 
 
 def shufflenet_v2_x2_0(pretrained=False, **kwargs):
@@ -480,8 +493,10 @@ def shufflenet_v2_x2_0(pretrained=False, **kwargs):
             print(out.shape)
 
     """
-    return _shufflenet_v2(
-        "shufflenet_v2_x2_0", scale=2.0, pretrained=pretrained, **kwargs)
+    return _shufflenet_v2("shufflenet_v2_x2_0",
+                          scale=2.0,
+                          pretrained=pretrained,
+                          **kwargs)
 
 
 def shufflenet_v2_swish(pretrained=False, **kwargs):
@@ -509,9 +524,8 @@ def shufflenet_v2_swish(pretrained=False, **kwargs):
             print(out.shape)
 
     """
-    return _shufflenet_v2(
-        "shufflenet_v2_swish",
-        scale=1.0,
-        act="swish",
-        pretrained=pretrained,
-        **kwargs)
+    return _shufflenet_v2("shufflenet_v2_swish",
+                          scale=1.0,
+                          act="swish",
+                          pretrained=pretrained,
+                          **kwargs)
diff --git a/python/paddle/vision/models/squeezenet.py b/python/paddle/vision/models/squeezenet.py
index 804be2622cfec..b122a7952862c 100644
--- a/python/paddle/vision/models/squeezenet.py
+++ b/python/paddle/vision/models/squeezenet.py
@@ -38,15 +38,15 @@
 
 
 class MakeFireConv(nn.Layer):
+
     def __init__(self, input_channels, output_channels, filter_size, padding=0):
         super(MakeFireConv, self).__init__()
-        self._conv = Conv2D(
-            input_channels,
-            output_channels,
-            filter_size,
-            padding=padding,
-            weight_attr=ParamAttr(),
-            bias_attr=ParamAttr())
+        self._conv = Conv2D(input_channels,
+                            output_channels,
+                            filter_size,
+                            padding=padding,
+                            weight_attr=ParamAttr(),
+                            bias_attr=ParamAttr())
 
     def forward(self, x):
         x = self._conv(x)
@@ -55,13 +55,16 @@ def forward(self, x):
 
 
 class MakeFire(nn.Layer):
+
     def __init__(self, input_channels, squeeze_channels, expand1x1_channels,
                  expand3x3_channels):
         super(MakeFire, self).__init__()
         self._conv = MakeFireConv(input_channels, squeeze_channels, 1)
         self._conv_path1 = MakeFireConv(squeeze_channels, expand1x1_channels, 1)
-        self._conv_path2 = MakeFireConv(
-            squeeze_channels, expand3x3_channels, 3, padding=1)
+        self._conv_path2 = MakeFireConv(squeeze_channels,
+                                        expand3x3_channels,
+                                        3,
+                                        padding=1)
 
     def forward(self, inputs):
         x = self._conv(inputs)
@@ -110,13 +113,12 @@ def __init__(self, version, num_classes=1000, with_pool=True):
                 supported_versions, version)
 
         if self.version == "1.0":
-            self._conv = Conv2D(
-                3,
-                96,
-                7,
-                stride=2,
-                weight_attr=ParamAttr(),
-                bias_attr=ParamAttr())
+            self._conv = Conv2D(3,
+                                96,
+                                7,
+                                stride=2,
+                                weight_attr=ParamAttr(),
+                                bias_attr=ParamAttr())
             self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
             self._conv1 = MakeFire(96, 16, 64, 64)
             self._conv2 = MakeFire(128, 16, 64, 64)
@@ -127,14 +129,13 @@ def __init__(self, version, num_classes=1000, with_pool=True):
             self._conv7 = MakeFire(384, 64, 256, 256)
             self._conv8 = MakeFire(512, 64, 256, 256)
         else:
-            self._conv = Conv2D(
-                3,
-                64,
-                3,
-                stride=2,
-                padding=1,
-                weight_attr=ParamAttr(),
-                bias_attr=ParamAttr())
+            self._conv = Conv2D(3,
+                                64,
+                                3,
+                                stride=2,
+                                padding=1,
+                                weight_attr=ParamAttr(),
+                                bias_attr=ParamAttr())
             self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
             self._conv1 = MakeFire(64, 16, 64, 64)
             self._conv2 = MakeFire(128, 16, 64, 64)
@@ -146,8 +147,11 @@ def __init__(self, version, num_classes=1000, with_pool=True):
             self._conv8 = MakeFire(512, 64, 256, 256)
 
         self._drop = Dropout(p=0.5, mode="downscale_in_infer")
-        self._conv9 = Conv2D(
-            512, num_classes, 1, weight_attr=ParamAttr(), bias_attr=ParamAttr())
+        self._conv9 = Conv2D(512,
+                             num_classes,
+                             1,
+                             weight_attr=ParamAttr(),
+                             bias_attr=ParamAttr())
         self._avg_pool = AdaptiveAvgPool2D(1)
 
     def forward(self, inputs):
diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py
index 755f77aa2971a..dd88d06449374 100644
--- a/python/paddle/vision/models/vgg.py
+++ b/python/paddle/vision/models/vgg.py
@@ -68,7 +68,8 @@ def __init__(self, features, num_classes=1000, with_pool=True):
                 nn.Linear(4096, 4096),
                 nn.ReLU(),
                 nn.Dropout(),
-                nn.Linear(4096, num_classes), )
+                nn.Linear(4096, num_classes),
+            )
 
     def forward(self, x):
         x = self.features(x)
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index d45c652885b69..69fba204dd314 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -23,7 +23,7 @@
 from paddle.common_ops_import import *
 from paddle import _C_ops
 
-__all__ = [ #noqa
+__all__ = [  #noqa
     'yolo_loss',
     'yolo_box',
     'deform_conv2d',
@@ -238,15 +238,14 @@ def yolo_loss(x,
         "scale_x_y": scale_x_y,
     }
 
-    helper.append_op(
-        type='yolov3_loss',
-        inputs=inputs,
-        outputs={
-            'Loss': loss,
-            'ObjectnessMask': objectness_mask,
-            'GTMatchMask': gt_match_mask
-        },
-        attrs=attrs)
+    helper.append_op(type='yolov3_loss',
+                     inputs=inputs,
+                     outputs={
+                         'Loss': loss,
+                         'ObjectnessMask': objectness_mask,
+                         'GTMatchMask': gt_match_mask
+                     },
+                     attrs=attrs)
     return loss
 
 
@@ -379,9 +378,11 @@ def yolo_box(x,
                                                    scale_x_y=1.)
     """
     if in_dygraph_mode():
-        boxes, scores = _C_ops.final_state_yolo_box(
-            x, img_size, anchors, class_num, conf_thresh, downsample_ratio,
-            clip_bbox, scale_x_y, iou_aware, iou_aware_factor)
+        boxes, scores = _C_ops.final_state_yolo_box(x, img_size, anchors,
+                                                    class_num, conf_thresh,
+                                                    downsample_ratio, clip_bbox,
+                                                    scale_x_y, iou_aware,
+                                                    iou_aware_factor)
         return boxes, scores
 
     if _non_static_mode():
@@ -413,17 +414,16 @@ def yolo_box(x,
         "iou_aware_factor": iou_aware_factor
     }
 
-    helper.append_op(
-        type='yolo_box',
-        inputs={
-            "X": x,
-            "ImgSize": img_size,
-        },
-        outputs={
-            'Boxes': boxes,
-            'Scores': scores,
-        },
-        attrs=attrs)
+    helper.append_op(type='yolo_box',
+                     inputs={
+                         "X": x,
+                         "ImgSize": img_size,
+                     },
+                     outputs={
+                         'Boxes': boxes,
+                         'Scores': scores,
+                     },
+                     attrs=attrs)
     return boxes, scores
 
 
@@ -559,9 +559,10 @@ def deform_conv2d(x,
     use_deform_conv2d_v1 = True if mask is None else False
 
     if in_dygraph_mode():
-        pre_bias = _C_ops.final_state_deformable_conv(
-            x, offset, weight, mask, stride, padding, dilation,
-            deformable_groups, groups, 1)
+        pre_bias = _C_ops.final_state_deformable_conv(x, offset, weight, mask,
+                                                      stride, padding, dilation,
+                                                      deformable_groups, groups,
+                                                      1)
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=1)
         else:
@@ -622,17 +623,20 @@ def deform_conv2d(x,
             'deformable_groups': deformable_groups,
             'im2col_step': 1,
         }
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+        helper.append_op(type=op_type,
+                         inputs=inputs,
+                         outputs=outputs,
+                         attrs=attrs)
 
         if bias is not None:
             out = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(
-                type='elementwise_add',
-                inputs={'X': [pre_bias],
-                        'Y': [bias]},
-                outputs={'Out': [out]},
-                attrs={'axis': 1})
+            helper.append_op(type='elementwise_add',
+                             inputs={
+                                 'X': [pre_bias],
+                                 'Y': [bias]
+                             },
+                             outputs={'Out': [out]},
+                             attrs={'axis': 1})
         else:
             out = pre_bias
     return out
@@ -813,21 +817,21 @@ def _get_default_param_initializer():
             shape=filter_shape,
             attr=self._weight_attr,
             default_initializer=_get_default_param_initializer())
-        self.bias = self.create_parameter(
-            attr=self._bias_attr, shape=[self._out_channels], is_bias=True)
+        self.bias = self.create_parameter(attr=self._bias_attr,
+                                          shape=[self._out_channels],
+                                          is_bias=True)
 
     def forward(self, x, offset, mask=None):
-        out = deform_conv2d(
-            x=x,
-            offset=offset,
-            weight=self.weight,
-            bias=self.bias,
-            stride=self._stride,
-            padding=self._padding,
-            dilation=self._dilation,
-            deformable_groups=self._deformable_groups,
-            groups=self._groups,
-            mask=mask)
+        out = deform_conv2d(x=x,
+                            offset=offset,
+                            weight=self.weight,
+                            bias=self.bias,
+                            stride=self._stride,
+                            padding=self._padding,
+                            dilation=self._dilation,
+                            deformable_groups=self._deformable_groups,
+                            groups=self._groups,
+                            mask=mask)
         return out
 
 
@@ -870,8 +874,10 @@ def read_file(filename, name=None):
 
     helper = LayerHelper("read_file", **locals())
     out = helper.create_variable_for_type_inference('uint8')
-    helper.append_op(
-        type="read_file", inputs=inputs, attrs=attrs, outputs={"Out": out})
+    helper.append_op(type="read_file",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={"Out": out})
 
     return out
 
@@ -920,8 +926,10 @@ def decode_jpeg(x, mode='unchanged', name=None):
 
     helper = LayerHelper("decode_jpeg", **locals())
     out = helper.create_variable_for_type_inference('uint8')
-    helper.append_op(
-        type="decode_jpeg", inputs=inputs, attrs=attrs, outputs={"Out": out})
+    helper.append_op(type="decode_jpeg",
+                     inputs=inputs,
+                     attrs=attrs,
+                     outputs={"Out": out})
 
     return out
 
@@ -983,17 +991,18 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
     helper = LayerHelper('psroi_pool', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type='psroi_pool',
-        inputs={'X': x,
-                'ROIs': boxes},
-        outputs={'Out': out},
-        attrs={
-            'output_channels': output_channels,
-            'spatial_scale': spatial_scale,
-            'pooled_height': pooled_height,
-            'pooled_width': pooled_width
-        })
+    helper.append_op(type='psroi_pool',
+                     inputs={
+                         'X': x,
+                         'ROIs': boxes
+                     },
+                     outputs={'Out': out},
+                     attrs={
+                         'output_channels': output_channels,
+                         'spatial_scale': spatial_scale,
+                         'pooled_height': pooled_height,
+                         'pooled_width': pooled_width
+                     })
     return out
 
 
@@ -1090,9 +1099,10 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
                                            pooled_width, spatial_scale)
     if _in_legacy_dygraph():
         assert boxes_num is not None, "boxes_num should not be None in dygraph mode."
-        pool_out, argmaxes = _C_ops.roi_pool(
-            x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width",
-            pooled_width, "spatial_scale", spatial_scale)
+        pool_out, argmaxes = _C_ops.roi_pool(x, boxes, boxes_num,
+                                             "pooled_height", pooled_height,
+                                             "pooled_width", pooled_width,
+                                             "spatial_scale", spatial_scale)
         return pool_out
 
     else:
@@ -1109,16 +1119,17 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
         }
         if boxes_num is not None:
             inputs['RoisNum'] = boxes_num
-        helper.append_op(
-            type="roi_pool",
-            inputs=inputs,
-            outputs={"Out": pool_out,
-                     "Argmax": argmaxes},
-            attrs={
-                "pooled_height": pooled_height,
-                "pooled_width": pooled_width,
-                "spatial_scale": spatial_scale
-            })
+        helper.append_op(type="roi_pool",
+                         inputs=inputs,
+                         outputs={
+                             "Out": pool_out,
+                             "Argmax": argmaxes
+                         },
+                         attrs={
+                             "pooled_height": pooled_height,
+                             "pooled_width": pooled_width,
+                             "spatial_scale": spatial_scale
+                         })
         return pool_out
 
 
@@ -1156,12 +1167,11 @@ def __init__(self, output_size, spatial_scale=1.0):
         self._spatial_scale = spatial_scale
 
     def forward(self, x, boxes, boxes_num):
-        return roi_pool(
-            x=x,
-            boxes=boxes,
-            boxes_num=boxes_num,
-            output_size=self._output_size,
-            spatial_scale=self._spatial_scale)
+        return roi_pool(x=x,
+                        boxes=boxes,
+                        boxes_num=boxes_num,
+                        output_size=self._output_size,
+                        spatial_scale=self._spatial_scale)
 
     def extra_repr(self):
         main_str = 'output_size={_output_size}, spatial_scale={_spatial_scale}'
@@ -1250,10 +1260,11 @@ def roi_align(x,
                                             sampling_ratio, aligned)
     if _in_legacy_dygraph():
         assert boxes_num is not None, "boxes_num should not be None in dygraph mode."
-        align_out = _C_ops.roi_align(
-            x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width",
-            pooled_width, "spatial_scale", spatial_scale, "sampling_ratio",
-            sampling_ratio, "aligned", aligned)
+        align_out = _C_ops.roi_align(x, boxes, boxes_num, "pooled_height",
+                                     pooled_height, "pooled_width",
+                                     pooled_width, "spatial_scale",
+                                     spatial_scale, "sampling_ratio",
+                                     sampling_ratio, "aligned", aligned)
         return align_out
 
     else:
@@ -1269,17 +1280,16 @@ def roi_align(x,
         }
         if boxes_num is not None:
             inputs['RoisNum'] = boxes_num
-        helper.append_op(
-            type="roi_align",
-            inputs=inputs,
-            outputs={"Out": align_out},
-            attrs={
-                "pooled_height": pooled_height,
-                "pooled_width": pooled_width,
-                "spatial_scale": spatial_scale,
-                "sampling_ratio": sampling_ratio,
-                "aligned": aligned,
-            })
+        helper.append_op(type="roi_align",
+                         inputs=inputs,
+                         outputs={"Out": align_out},
+                         attrs={
+                             "pooled_height": pooled_height,
+                             "pooled_width": pooled_width,
+                             "spatial_scale": spatial_scale,
+                             "sampling_ratio": sampling_ratio,
+                             "aligned": aligned,
+                         })
         return align_out
 
 
@@ -1321,13 +1331,12 @@ def __init__(self, output_size, spatial_scale=1.0):
         self._spatial_scale = spatial_scale
 
     def forward(self, x, boxes, boxes_num, aligned=True):
-        return roi_align(
-            x=x,
-            boxes=boxes,
-            boxes_num=boxes_num,
-            output_size=self._output_size,
-            spatial_scale=self._spatial_scale,
-            aligned=aligned)
+        return roi_align(x=x,
+                         boxes=boxes,
+                         boxes_num=boxes_num,
+                         output_size=self._output_size,
+                         spatial_scale=self._spatial_scale,
+                         aligned=aligned)
 
 
 class ConvNormActivation(Sequential):
@@ -1367,15 +1376,14 @@ def __init__(self,
         if bias is None:
             bias = norm_layer is None
         layers = [
-            Conv2D(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding,
-                dilation=dilation,
-                groups=groups,
-                bias_attr=bias)
+            Conv2D(in_channels,
+                   out_channels,
+                   kernel_size,
+                   stride,
+                   padding,
+                   dilation=dilation,
+                   groups=groups,
+                   bias_attr=bias)
         ]
         if norm_layer is not None:
             layers.append(norm_layer(out_channels))
@@ -1466,11 +1474,10 @@ def _nms(boxes, iou_threshold):
 
         helper = LayerHelper('nms', **locals())
         out = helper.create_variable_for_type_inference('int64')
-        helper.append_op(
-            type='nms',
-            inputs={'Boxes': boxes},
-            outputs={'KeepBoxesIdxs': out},
-            attrs={'iou_threshold': iou_threshold})
+        helper.append_op(type='nms',
+                         inputs={'Boxes': boxes},
+                         outputs={'KeepBoxesIdxs': out},
+                         attrs={'iou_threshold': iou_threshold})
         return out
 
     if scores is None:
@@ -1500,8 +1507,8 @@ def _nms(boxes, iou_threshold):
             continue
         cur_category_boxes = boxes[cur_category_boxes_idxs]
         cur_category_scores = scores[cur_category_boxes_idxs]
-        cur_category_sorted_indices = paddle.argsort(
-            cur_category_scores, descending=True)
+        cur_category_sorted_indices = paddle.argsort(cur_category_scores,
+                                                     descending=True)
         cur_category_sorted_boxes = cur_category_boxes[
             cur_category_sorted_indices]
 
@@ -1519,8 +1526,8 @@ def _nms(boxes, iou_threshold):
     keep_boxes_idxs = paddle.where(mask)[0]
     shape = keep_boxes_idxs.shape[0]
     keep_boxes_idxs = paddle.reshape(keep_boxes_idxs, [shape])
-    sorted_sub_indices = paddle.argsort(
-        scores[keep_boxes_idxs], descending=True)
+    sorted_sub_indices = paddle.argsort(scores[keep_boxes_idxs],
+                                        descending=True)
 
     if top_k is None:
         return keep_boxes_idxs[sorted_sub_indices]
diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py
index 5992a4f977411..d615598bf2bcc 100644
--- a/python/paddle/vision/transforms/__init__.py
+++ b/python/paddle/vision/transforms/__init__.py
@@ -51,7 +51,7 @@
 from .functional import normalize  # noqa: F401
 from .functional import erase  # noqa: F401
 
-__all__ = [ #noqa
+__all__ = [  #noqa
     'BaseTransform',
     'Compose',
     'Resize',
diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py
index 7927e9faee370..ecc160b0c0e07 100644
--- a/python/paddle/vision/transforms/functional.py
+++ b/python/paddle/vision/transforms/functional.py
@@ -72,11 +72,11 @@ def to_tensor(pic, data_format='CHW'):
             print(tensor.shape)
 
     """
-    if not (_is_pil_image(pic) or _is_numpy_image(pic) or
-            _is_tensor_image(pic)):
+    if not (_is_pil_image(pic) or _is_numpy_image(pic)
+            or _is_tensor_image(pic)):
         raise TypeError(
-            'pic should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(pic)))
+            'pic should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(pic)))
 
     if _is_pil_image(pic):
         return F_pil.to_tensor(pic, data_format)
@@ -130,11 +130,11 @@ def resize(img, size, interpolation='bilinear'):
             print(converted_img.size)
             # (150, 200)
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.resize(img, size, interpolation)
@@ -194,11 +194,11 @@ def pad(img, padding, fill=0, padding_mode='constant'):
             padded_img = F.pad(fake_img, padding=(2, 1))
             print(padded_img.size)
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.pad(img, padding, fill, padding_mode)
@@ -237,11 +237,11 @@ def crop(img, top, left, height, width):
             print(cropped_img.size)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.crop(img, top, left, height, width)
@@ -276,11 +276,11 @@ def center_crop(img, output_size):
             cropped_img = F.center_crop(fake_img, (150, 100))
             print(cropped_img.size)
         """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.center_crop(img, output_size)
@@ -314,11 +314,11 @@ def hflip(img):
             print(flpped_img.size)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.hflip(img)
@@ -352,11 +352,11 @@ def vflip(img):
             print(flpped_img.size)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.vflip(img)
@@ -397,11 +397,11 @@ def adjust_brightness(img, brightness_factor):
 
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.adjust_brightness(img, brightness_factor)
@@ -437,11 +437,11 @@ def adjust_contrast(img, contrast_factor):
             converted_img = F.adjust_contrast(fake_img, 0.4)
             print(converted_img.size)
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.adjust_contrast(img, contrast_factor)
@@ -478,11 +478,11 @@ def adjust_saturation(img, saturation_factor):
             print(converted_img.size)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.adjust_saturation(img, saturation_factor)
@@ -528,11 +528,11 @@ def adjust_hue(img, hue_factor):
             print(converted_img.size)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.adjust_hue(img, hue_factor)
@@ -549,7 +549,7 @@ def _get_affine_matrix(center, angle, translate, scale, shear):
     sx = math.radians(shear[0])
     sy = math.radians(shear[1])
 
-    # Rotate and Shear without scaling 
+    # Rotate and Shear without scaling
     a = math.cos(rot - sy) / math.cos(sy)
     b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot)
     c = math.sin(rot - sy) / math.cos(sy)
@@ -621,11 +621,11 @@ def affine(img,
             print(affined_img.shape)
     """
 
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if not isinstance(angle, (int, float)):
         raise TypeError("Argument angle should be int or float")
@@ -753,11 +753,11 @@ def rotate(img,
             print(rotated_img.size)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if isinstance(center, list):
         center = tuple(center)
@@ -844,11 +844,11 @@ def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0):
             print(perspectived_img.shape)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         coeffs = _get_perspective_coeffs(startpoints, endpoints)
@@ -888,11 +888,11 @@ def to_grayscale(img, num_output_channels=1):
             print(gray_img.size)
 
     """
-    if not (_is_pil_image(img) or _is_numpy_image(img) or
-            _is_tensor_image(img)):
+    if not (_is_pil_image(img) or _is_numpy_image(img)
+            or _is_tensor_image(img)):
         raise TypeError(
-            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'.
-            format(type(img)))
+            'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'
+            .format(type(img)))
 
     if _is_pil_image(img):
         return F_pil.to_grayscale(img, num_output_channels)
diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py
index 1b2485541c499..df31add6f77db 100644
--- a/python/paddle/vision/transforms/functional_cv2.py
+++ b/python/paddle/vision/transforms/functional_cv2.py
@@ -52,8 +52,8 @@ def to_tensor(pic, data_format='CHW'):
     """
 
     if data_format not in ['CHW', 'HWC']:
-        raise ValueError('data_format should be CHW or HWC. Got {}'.format(
-            data_format))
+        raise ValueError(
+            'data_format should be CHW or HWC. Got {}'.format(data_format))
 
     if pic.ndim == 2:
         pic = pic[:, :, None]
@@ -121,10 +121,9 @@ def resize(img, size, interpolation='bilinear'):
                 dsize=(ow, oh),
                 interpolation=_cv2_interp_from_str[interpolation])
     else:
-        output = cv2.resize(
-            img,
-            dsize=(size[1], size[0]),
-            interpolation=_cv2_interp_from_str[interpolation])
+        output = cv2.resize(img,
+                            dsize=(size[1], size[0]),
+                            interpolation=_cv2_interp_from_str[interpolation])
     if len(img.shape) == 3 and img.shape[2] == 1:
         return output[:, :, np.newaxis]
     else:
@@ -202,23 +201,21 @@ def pad(img, padding, fill=0, padding_mode='constant'):
         pad_bottom = padding[3]
 
     if len(img.shape) == 3 and img.shape[2] == 1:
-        return cv2.copyMakeBorder(
-            img,
-            top=pad_top,
-            bottom=pad_bottom,
-            left=pad_left,
-            right=pad_right,
-            borderType=_cv2_pad_from_str[padding_mode],
-            value=fill)[:, :, np.newaxis]
+        return cv2.copyMakeBorder(img,
+                                  top=pad_top,
+                                  bottom=pad_bottom,
+                                  left=pad_left,
+                                  right=pad_right,
+                                  borderType=_cv2_pad_from_str[padding_mode],
+                                  value=fill)[:, :, np.newaxis]
     else:
-        return cv2.copyMakeBorder(
-            img,
-            top=pad_top,
-            bottom=pad_bottom,
-            left=pad_left,
-            right=pad_right,
-            borderType=_cv2_pad_from_str[padding_mode],
-            value=fill)
+        return cv2.copyMakeBorder(img,
+                                  top=pad_top,
+                                  bottom=pad_bottom,
+                                  left=pad_left,
+                                  right=pad_right,
+                                  borderType=_cv2_pad_from_str[padding_mode],
+                                  value=fill)
 
 
 def crop(img, top, left, height, width):
@@ -361,8 +358,8 @@ def adjust_saturation(img, saturation_factor):
 
     dtype = img.dtype
     img = img.astype(np.float32)
-    alpha = np.random.uniform(
-        max(0, 1 - saturation_factor), 1 + saturation_factor)
+    alpha = np.random.uniform(max(0, 1 - saturation_factor),
+                              1 + saturation_factor)
     gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
     gray_img = gray_img[..., np.newaxis]
     img = img * alpha + gray_img * (1 - alpha)
@@ -394,8 +391,8 @@ def adjust_hue(img, hue_factor):
     cv2 = try_import('cv2')
 
     if not (-0.5 <= hue_factor <= 0.5):
-        raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format(
-            hue_factor))
+        raise ValueError(
+            'hue_factor:{} is not in [-0.5, 0.5].'.format(hue_factor))
 
     dtype = img.dtype
     img = img.astype(np.uint8)
@@ -476,19 +473,17 @@ def affine(img,
     M[1, 2] = ty
 
     if len(img.shape) == 3 and img.shape[2] == 1:
-        return cv2.warpAffine(
-            img,
-            M,
-            dsize=(w, h),
-            flags=_cv2_interp_from_str[interpolation],
-            borderValue=fill)[:, :, np.newaxis]
+        return cv2.warpAffine(img,
+                              M,
+                              dsize=(w, h),
+                              flags=_cv2_interp_from_str[interpolation],
+                              borderValue=fill)[:, :, np.newaxis]
     else:
-        return cv2.warpAffine(
-            img,
-            M,
-            dsize=(w, h),
-            flags=_cv2_interp_from_str[interpolation],
-            borderValue=fill)
+        return cv2.warpAffine(img,
+                              M,
+                              dsize=(w, h),
+                              flags=_cv2_interp_from_str[interpolation],
+                              borderValue=fill)
 
 
 def rotate(img,
@@ -576,17 +571,15 @@ def transform(x, y, matrix):
         w, h = int(nw), int(nh)
 
     if len(img.shape) == 3 and img.shape[2] == 1:
-        return cv2.warpAffine(
-            img,
-            M, (w, h),
-            flags=_cv2_interp_from_str[interpolation],
-            borderValue=fill)[:, :, np.newaxis]
+        return cv2.warpAffine(img,
+                              M, (w, h),
+                              flags=_cv2_interp_from_str[interpolation],
+                              borderValue=fill)[:, :, np.newaxis]
     else:
-        return cv2.warpAffine(
-            img,
-            M, (w, h),
-            flags=_cv2_interp_from_str[interpolation],
-            borderValue=fill)
+        return cv2.warpAffine(img,
+                              M, (w, h),
+                              flags=_cv2_interp_from_str[interpolation],
+                              borderValue=fill)
 
 
 def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0):
@@ -624,19 +617,17 @@ def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0):
     matrix = cv2.getPerspectiveTransform(startpoints, endpoints)
 
     if len(img.shape) == 3 and img.shape[2] == 1:
-        return cv2.warpPerspective(
-            img,
-            matrix,
-            dsize=(w, h),
-            flags=_cv2_interp_from_str[interpolation],
-            borderValue=fill)[:, :, np.newaxis]
+        return cv2.warpPerspective(img,
+                                   matrix,
+                                   dsize=(w, h),
+                                   flags=_cv2_interp_from_str[interpolation],
+                                   borderValue=fill)[:, :, np.newaxis]
     else:
-        return cv2.warpPerspective(
-            img,
-            matrix,
-            dsize=(w, h),
-            flags=_cv2_interp_from_str[interpolation],
-            borderValue=fill)
+        return cv2.warpPerspective(img,
+                                   matrix,
+                                   dsize=(w, h),
+                                   flags=_cv2_interp_from_str[interpolation],
+                                   borderValue=fill)
 
 
 def to_grayscale(img, num_output_channels=1):
diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py
index 4b86e14039ebe..50ed01f53e2d4 100644
--- a/python/paddle/vision/transforms/functional_pil.py
+++ b/python/paddle/vision/transforms/functional_pil.py
@@ -71,8 +71,8 @@ def to_tensor(pic, data_format='CHW'):
     """
 
     if data_format not in ['CHW', 'HWC']:
-        raise ValueError('data_format should be CHW or HWC. Got {}'.format(
-            data_format))
+        raise ValueError(
+            'data_format should be CHW or HWC. Got {}'.format(data_format))
 
     # PIL Image
     if pic.mode == 'I':
@@ -231,8 +231,9 @@ def pad(img, padding, fill=0, padding_mode='constant'):
         img = np.asarray(img)
         # RGB image
         if len(img.shape) == 3:
-            img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right),
-                               (0, 0)), padding_mode)
+            img = np.pad(img,
+                         ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)),
+                         padding_mode)
         # Grayscale image
         if len(img.shape) == 2:
             img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right)),
@@ -391,8 +392,8 @@ def adjust_hue(img, hue_factor):
 
     """
     if not (-0.5 <= hue_factor <= 0.5):
-        raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format(
-            hue_factor))
+        raise ValueError(
+            'hue_factor:{} is not in [-0.5, 0.5].'.format(hue_factor))
 
     input_mode = img.mode
     if input_mode in {'L', '1', 'I', 'F'}:
@@ -471,12 +472,11 @@ def rotate(img,
     if isinstance(fill, int):
         fill = tuple([fill] * 3)
 
-    return img.rotate(
-        angle,
-        _pil_interp_from_str[interpolation],
-        expand,
-        center,
-        fillcolor=fill)
+    return img.rotate(angle,
+                      _pil_interp_from_str[interpolation],
+                      expand,
+                      center,
+                      fillcolor=fill)
 
 
 def perspective(img, coeffs, interpolation="nearest", fill=0):
diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py
index 27f83029babaa..4cf8253ec8b32 100644
--- a/python/paddle/vision/transforms/functional_tensor.py
+++ b/python/paddle/vision/transforms/functional_tensor.py
@@ -128,25 +128,22 @@ def _hsv_to_rgb(img):
     q = paddle.clip(v * (1.0 - s * f), 0.0, 1.0)
     t = paddle.clip(v * (1.0 - s * (1.0 - f)), 0.0, 1.0)
 
-    mask = paddle.equal(
-        i.unsqueeze(axis=-3),
-        paddle.arange(
-            6, dtype=i.dtype).reshape((-1, 1, 1))).astype(img.dtype)
-    matrix = paddle.stack(
-        [
-            paddle.stack(
-                [v, q, p, p, t, v], axis=-3), paddle.stack(
-                    [t, v, v, q, p, p], axis=-3), paddle.stack(
-                        [p, p, t, v, v, q], axis=-3)
-        ],
-        axis=-4)
+    mask = paddle.equal(i.unsqueeze(axis=-3),
+                        paddle.arange(6, dtype=i.dtype).reshape(
+                            (-1, 1, 1))).astype(img.dtype)
+    matrix = paddle.stack([
+        paddle.stack([v, q, p, p, t, v], axis=-3),
+        paddle.stack([t, v, v, q, p, p], axis=-3),
+        paddle.stack([p, p, t, v, v, q], axis=-3)
+    ],
+                          axis=-4)
     return paddle.einsum("...ijk, ...xijk -> ...xjk", mask, matrix)
 
 
 def _blend_images(img1, img2, ratio):
     max_value = 1.0 if paddle.is_floating_point(img1) else 255.0
-    return paddle.lerp(img2, img1, float(ratio)).clip(
-        0, max_value).astype(img1.dtype)
+    return paddle.lerp(img2, img1,
+                       float(ratio)).clip(0, max_value).astype(img1.dtype)
 
 
 def normalize(img, mean, std, data_format='CHW'):
@@ -194,8 +191,8 @@ def to_grayscale(img, num_output_channels=1, data_format='CHW'):
     if num_output_channels not in (1, 3):
         raise ValueError('num_output_channels should be either 1 or 3')
 
-    rgb_weights = paddle.to_tensor(
-        [0.2989, 0.5870, 0.1140], place=img.place).astype(img.dtype)
+    rgb_weights = paddle.to_tensor([0.2989, 0.5870, 0.1140],
+                                   place=img.place).astype(img.dtype)
 
     if _is_channel_first(data_format):
         rgb_weights = rgb_weights.reshape((-1, 1, 1))
@@ -231,12 +228,15 @@ def _grid_transform(img, grid, mode, fill):
             shape=[img.shape[0], grid.shape[1], grid.shape[2], grid.shape[3]])
 
     if fill is not None:
-        dummy = paddle.ones(
-            (img.shape[0], 1, img.shape[2], img.shape[3]), dtype=img.dtype)
+        dummy = paddle.ones((img.shape[0], 1, img.shape[2], img.shape[3]),
+                            dtype=img.dtype)
         img = paddle.concat((img, dummy), axis=1)
 
-    img = F.grid_sample(
-        img, grid, mode=mode, padding_mode="zeros", align_corners=False)
+    img = F.grid_sample(img,
+                        grid,
+                        mode=mode,
+                        padding_mode="zeros",
+                        align_corners=False)
 
     # Fill with required color
     if fill is not None:
@@ -287,8 +287,11 @@ def affine(img, matrix, interpolation="nearest", fill=None, data_format='CHW'):
     matrix = matrix.reshape((1, 2, 3))
     shape = img.shape
 
-    grid = _affine_grid(
-        matrix, w=shape[-1], h=shape[-2], ow=shape[-1], oh=shape[-2])
+    grid = _affine_grid(matrix,
+                        w=shape[-1],
+                        h=shape[-2],
+                        ow=shape[-1],
+                        oh=shape[-2])
 
     if isinstance(fill, int):
         fill = tuple([fill] * 3)
@@ -377,8 +380,8 @@ def rotate(img,
              [0.5 * w, 0.5 * h, 1.0], [0.5 * w, -0.5 * h, 1.0]],
             place=matrix.place).astype(matrix.dtype)
 
-        _pos = corners.reshape(
-            (1, -1, 3)).bmm(matrix.transpose((0, 2, 1))).reshape((1, -1, 2))
+        _pos = corners.reshape((1, -1, 3)).bmm(matrix.transpose(
+            (0, 2, 1))).reshape((1, -1, 2))
         _min = _pos.min(axis=-2).floor()
         _max = _pos.max(axis=-2).ceil()
 
@@ -574,13 +577,12 @@ def center_crop(img, output_size, data_format='CHW'):
     crop_height, crop_width = output_size
     crop_top = int(round((image_height - crop_height) / 2.))
     crop_left = int(round((image_width - crop_width) / 2.))
-    return crop(
-        img,
-        crop_top,
-        crop_left,
-        crop_height,
-        crop_width,
-        data_format=data_format)
+    return crop(img,
+                crop_top,
+                crop_left,
+                crop_height,
+                crop_width,
+                data_format=data_format)
 
 
 def pad(img, padding, fill=0, padding_mode='constant', data_format='CHW'):
@@ -705,11 +707,10 @@ def resize(img, size, interpolation='bilinear', data_format='CHW'):
         oh, ow = size
 
     img = img.unsqueeze(0)
-    img = F.interpolate(
-        img,
-        size=(oh, ow),
-        mode=interpolation.lower(),
-        data_format='N' + data_format.upper())
+    img = F.interpolate(img,
+                        size=(oh, ow),
+                        mode=interpolation.lower(),
+                        data_format='N' + data_format.upper())
 
     return img.squeeze(0)
 
@@ -755,11 +756,13 @@ def adjust_contrast(img, contrast_factor):
     channels = _get_image_num_channels(img, 'CHW')
     dtype = img.dtype if paddle.is_floating_point(img) else paddle.float32
     if channels == 1:
-        extreme_target = paddle.mean(
-            img.astype(dtype), axis=(-3, -2, -1), keepdim=True)
+        extreme_target = paddle.mean(img.astype(dtype),
+                                     axis=(-3, -2, -1),
+                                     keepdim=True)
     elif channels == 3:
-        extreme_target = paddle.mean(
-            to_grayscale(img).astype(dtype), axis=(-3, -2, -1), keepdim=True)
+        extreme_target = paddle.mean(to_grayscale(img).astype(dtype),
+                                     axis=(-3, -2, -1),
+                                     keepdim=True)
     else:
         raise ValueError("channels of input should be either 1 or 3.")
 
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index 31f56e890558c..79c0720f60777 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -51,8 +51,8 @@ def _get_image_size(img):
             return img.shape[2:][::-1]  # nchw -> wh
         else:
             raise ValueError(
-                "The dim for input Tensor should be 3-D or 4-D, but received {}".
-                format(len(img.shape)))
+                "The dim for input Tensor should be 3-D or 4-D, but received {}"
+                .format(len(img.shape)))
     else:
         raise TypeError("Unexpected type {}".format(type(img)))
 
@@ -72,8 +72,8 @@ def _check_input(value,
             value[0] = max(value[0], 0)
     elif isinstance(value, (tuple, list)) and len(value) == 2:
         if not bound[0] <= value[0] <= value[1] <= bound[1]:
-            raise ValueError("{} values should be between {}".format(name,
-                                                                     bound))
+            raise ValueError("{} values should be between {}".format(
+                name, bound))
     else:
         raise TypeError(
             "{} should be a single number or a list/tuple with lenght 2.".
@@ -418,8 +418,8 @@ class Resize(BaseTransform):
 
     def __init__(self, size, interpolation='bilinear', keys=None):
         super(Resize, self).__init__(keys)
-        assert isinstance(size, int) or (isinstance(size, Iterable) and
-                                         len(size) == 2)
+        assert isinstance(size, int) or (isinstance(size, Iterable)
+                                         and len(size) == 2)
         self.size = size
         self.interpolation = interpolation
 
@@ -938,8 +938,11 @@ class HueTransform(BaseTransform):
 
     def __init__(self, value, keys=None):
         super(HueTransform, self).__init__(keys)
-        self.value = _check_input(
-            value, 'hue', center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+        self.value = _check_input(value,
+                                  'hue',
+                                  center=0,
+                                  bound=(-0.5, 0.5),
+                                  clip_first_on_zero=False)
 
     def _apply_image(self, img):
         if self.value is None:
@@ -986,7 +989,11 @@ class ColorJitter(BaseTransform):
 
     """
 
-    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0,
+    def __init__(self,
+                 brightness=0,
+                 contrast=0,
+                 saturation=0,
+                 hue=0,
                  keys=None):
         super(ColorJitter, self).__init__(keys)
         self.brightness = brightness
@@ -1405,12 +1412,11 @@ def _apply_image(self, img):
         ret = self._get_param(img_size, self.degrees, self.translate,
                               self.scale, self.shear)
 
-        return F.affine(
-            img,
-            *ret,
-            interpolation=self.interpolation,
-            fill=self.fill,
-            center=self.center)
+        return F.affine(img,
+                        *ret,
+                        interpolation=self.interpolation,
+                        fill=self.fill,
+                        center=self.center)
 
 
 class RandomRotation(BaseTransform):
@@ -1577,14 +1583,17 @@ def get_params(self, width, height, distortion_scale):
         half_height = height // 2
         half_width = width // 2
         topleft = [
-            int(random.uniform(0, int(distortion_scale * half_width) + 1)),
-            int(random.uniform(0, int(distortion_scale * half_height) + 1)),
+            int(random.uniform(0,
+                               int(distortion_scale * half_width) + 1)),
+            int(random.uniform(0,
+                               int(distortion_scale * half_height) + 1)),
         ]
         topright = [
             int(
                 random.uniform(width - int(distortion_scale * half_width) - 1,
                                width)),
-            int(random.uniform(0, int(distortion_scale * half_height) + 1)),
+            int(random.uniform(0,
+                               int(distortion_scale * half_height) + 1)),
         ]
         botright = [
             int(
@@ -1595,7 +1604,8 @@ def get_params(self, width, height, distortion_scale):
                                height)),
         ]
         botleft = [
-            int(random.uniform(0, int(distortion_scale * half_width) + 1)),
+            int(random.uniform(0,
+                               int(distortion_scale * half_width) + 1)),
             int(
                 random.uniform(height - int(distortion_scale * half_height) - 1,
                                height)),
@@ -1723,10 +1733,10 @@ def __init__(self,
                 ), "scale should be of kind (min, max) and in range [0, 1]"
         assert isinstance(ratio,
                           (tuple, list)), "ratio should be a tuple or list"
-        assert (ratio[0] >= 0 and
-                ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
-        assert (prob >= 0 and
-                prob <= 1), "The probability should be in range [0, 1]"
+        assert (ratio[0] >= 0
+                and ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
+        assert (prob >= 0
+                and prob <= 1), "The probability should be in range [0, 1]"
         assert isinstance(
             value, (numbers.Number, str, tuple,
                     list)), "value should be a number, tuple, list or str"
@@ -1772,8 +1782,8 @@ def _get_param(self, img, scale, ratio, value):
                 continue
             if F._is_tensor_image(img):
                 if value is None:
-                    v = paddle.normal(
-                        shape=[c, erase_h, erase_w]).astype(img.dtype)
+                    v = paddle.normal(shape=[c, erase_h, erase_w]).astype(
+                        img.dtype)
                 else:
                     v = paddle.to_tensor(value, dtype=img.dtype)[:, None, None]
             else:
@@ -1808,7 +1818,7 @@ def _apply_image(self, img):
                 raise ValueError(
                     "Value should be a single number or a sequence with length equals to image's channel."
                 )
-            top, left, erase_h, erase_w, v = self._get_param(img, self.scale,
-                                                             self.ratio, value)
+            top, left, erase_h, erase_w, v = self._get_param(
+                img, self.scale, self.ratio, value)
             return F.erase(img, top, left, erase_h, erase_w, v, self.inplace)
         return img
diff --git a/python/setup.py.in b/python/setup.py.in
index 2a0d745729aab..ca1768c9462f0 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -374,6 +374,10 @@ packages=['paddle',
           'paddle.incubate.distributed.models',
           'paddle.incubate.distributed.models.moe',
           'paddle.incubate.distributed.models.moe.gate',
+          'paddle.incubate.sparse',
+          'paddle.incubate.sparse.nn',
+          'paddle.incubate.sparse.nn.layer',
+          'paddle.incubate.sparse.nn.functional',
           'paddle.io',
           'paddle.optimizer',
           'paddle.nn',
@@ -394,9 +398,6 @@ packages=['paddle',
           'paddle.device.cuda',
           'paddle.version',
           'paddle.profiler',
-          'paddle.sparse',
-          'paddle.sparse.layer',
-          'paddle.sparse.functional',
           ]
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
diff --git a/tools/CrossStackProfiler/CspChromeTraceFormatter.py b/tools/CrossStackProfiler/CspChromeTraceFormatter.py
index a8030988aacf1..811e6020267e9 100755
--- a/tools/CrossStackProfiler/CspChromeTraceFormatter.py
+++ b/tools/CrossStackProfiler/CspChromeTraceFormatter.py
@@ -27,6 +27,7 @@
 
 
 class ChromeTraceFormatter(object):
+
     def __init__(self):
         self._events = []
         self._metadata = []
diff --git a/tools/CrossStackProfiler/CspFileReader.py b/tools/CrossStackProfiler/CspFileReader.py
index 12de488aa693e..485f6d0f6a675 100755
--- a/tools/CrossStackProfiler/CspFileReader.py
+++ b/tools/CrossStackProfiler/CspFileReader.py
@@ -45,8 +45,8 @@
     "04_memUtility": [("FB_USED_RATIO", "FB_USED_RATIO"), ("DRAMA", "DRAMA")],
     "05_txUtility": [("NVLTX", "NVLTX"), ("NVLRX", "NVLRX"), ("PCITX", "PCITX"),
                      ("PCIRX", "PCIRX")],
-    "06_calUtility":
-    [("FP32A", "FP32A"), ("FP16A", "FP16A"), ("TENSO", "TENSO")]
+    "06_calUtility": [("FP32A", "FP32A"), ("FP16A", "FP16A"),
+                      ("TENSO", "TENSO")]
 }
 DCGMINFO_TRACE_NUM = len(dcgmMetricParameterMap.keys())
 NETINFO_TRACE_NUM = 2
@@ -66,6 +66,7 @@
 
 
 class FileReader(object):
+
     def __init__(self, logger, args):
         self._logger = logger
         self._args = args
@@ -174,8 +175,8 @@ def _getFileList(self):
                     file)
 
         if not self._fileList:
-            if (self._getId(self._fileList[-1]) - self._getId(self._fileList[0])
-                ) != len(self._fileList) - 1:
+            if (self._getId(self._fileList[-1]) -
+                    self._getId(self._fileList[0])) != len(self._fileList) - 1:
                 raise Exception("The file id should be countious!")
         # sort
         def _sortBySuffix(elem):
@@ -194,8 +195,9 @@ def _sortBySuffix(elem):
 
     def _getId(self, fileName, organizeForm, sed="."):
         if self._organizeForm != organizeForm:
-            raise TypeError("Can not get rank id when organizer form is not %s!"
-                            % organizeForm)
+            raise TypeError(
+                "Can not get rank id when organizer form is not %s!" %
+                organizeForm)
 
         if not os.path.isfile(fileName):
             raise IOError("[%s] is not a valid file!" % (fileName))
@@ -294,12 +296,20 @@ def dumpOpInfoDict(self,
                        gpuId,
                        pretty=False,
                        tmpPath="./tmp"):
-        return self.dumpDict(
-            data, "opinfo", groupId, gpuId, pretty=False, tmpPath="./tmp")
+        return self.dumpDict(data,
+                             "opinfo",
+                             groupId,
+                             gpuId,
+                             pretty=False,
+                             tmpPath="./tmp")
 
     def dumpDCGMDict(self, data, groupId, gpuId, pretty=False, tmpPath="./tmp"):
-        return self.dumpDict(
-            data, "dcgm", groupId, gpuId, pretty=False, tmpPath="./tmp")
+        return self.dumpDict(data,
+                             "dcgm",
+                             groupId,
+                             gpuId,
+                             pretty=False,
+                             tmpPath="./tmp")
 
     def dumpDict(self,
                  data,
diff --git a/tools/CrossStackProfiler/CspReporter.py b/tools/CrossStackProfiler/CspReporter.py
index 1b8ae0e385534..dc12f725bb447 100755
--- a/tools/CrossStackProfiler/CspReporter.py
+++ b/tools/CrossStackProfiler/CspReporter.py
@@ -33,37 +33,41 @@
 
 def get_argparse():
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        '--profile_path',
-        type=str,
-        default='.',
-        help='Working path that store the monitor data.')
-
-    parser.add_argument(
-        '--timeline_path',
-        type=str,
-        default='.',
-        help='Output timeline file name.')
-
-    parser.add_argument(
-        '--gpuPerTrainer', type=int, default=8, help='Gpus per trainer.')
-
-    parser.add_argument(
-        '--trainerNum', type=int, default=4, help='Num of trainer.')
-
-    parser.add_argument(
-        '--groupSize', type=int, default=8, help='Num of trainer in a group.')
-
-    parser.add_argument(
-        '--displaySize',
-        type=int,
-        default=2,
-        help='Num of line need to display in a group.')
+    parser.add_argument('--profile_path',
+                        type=str,
+                        default='.',
+                        help='Working path that store the monitor data.')
+
+    parser.add_argument('--timeline_path',
+                        type=str,
+                        default='.',
+                        help='Output timeline file name.')
+
+    parser.add_argument('--gpuPerTrainer',
+                        type=int,
+                        default=8,
+                        help='Gpus per trainer.')
+
+    parser.add_argument('--trainerNum',
+                        type=int,
+                        default=4,
+                        help='Num of trainer.')
+
+    parser.add_argument('--groupSize',
+                        type=int,
+                        default=8,
+                        help='Num of trainer in a group.')
+
+    parser.add_argument('--displaySize',
+                        type=int,
+                        default=2,
+                        help='Num of line need to display in a group.')
 
     return parser.parse_args()
 
 
 class CspReporter(object):
+
     def __init__(self, args):
         self._args = args
         print(self._args)
@@ -160,17 +164,17 @@ def _generateTraceFileByGroupAndGpuId(self, pipileInfo, netInfo, groupId,
         opInfoDict = self._profileFileReader.getOpInfoDict(groupId, gpuId)
 
         traceObj = {}
-        traceObj["traceEvents"] = pipileInfo[str(gpuId)] + opInfoDict[
-            "traceEvents"] + dcgmInfoDict["traceEvents"] + netInfo[
-                "traceEvents"]
+        traceObj["traceEvents"] = pipileInfo[str(
+            gpuId)] + opInfoDict["traceEvents"] + dcgmInfoDict[
+                "traceEvents"] + netInfo["traceEvents"]
 
         self._profileFileReader.dumpDict(traceObj, "traceFile", groupId, gpuId,
                                          False, self._saveFilePath)
 
     def _generateTraceFileByGroup(self, groupId, processNum):
         # first we need to generate pipeline info
-        pipileInfo = self._profileFileReader.getPipeLineInfo(groupId,
-                                                             processNum)
+        pipileInfo = self._profileFileReader.getPipeLineInfo(
+            groupId, processNum)
         # second we need to generate dcgm info
         dcgmInfo = self._dcgmFileReader.getDCGMTraceInfo(groupId, processNum)
 
@@ -187,13 +191,13 @@ def _generateTraceFileByGroup(self, groupId, processNum):
         pidList = []
 
         for gpuId in range(self._gpuPerTrainer):
-            subproc = Process(
-                target=self._generateTraceFileByGroupAndGpuId,
-                args=(
-                    pipileInfo,
-                    netInfo,
-                    groupId,
-                    gpuId, ))
+            subproc = Process(target=self._generateTraceFileByGroupAndGpuId,
+                              args=(
+                                  pipileInfo,
+                                  netInfo,
+                                  groupId,
+                                  gpuId,
+                              ))
             processPool.append(subproc)
             subproc.start()
             pidList.append(subproc.pid)
@@ -212,11 +216,11 @@ def generateTraceFile(self, processNum=8):
         processPool = []
         pidList = []
         for groupId in range(self._trainerNum / self._groupSize):
-            subproc = Process(
-                target=self._generateTraceFileByGroup,
-                args=(
-                    groupId,
-                    processNum, ))
+            subproc = Process(target=self._generateTraceFileByGroup,
+                              args=(
+                                  groupId,
+                                  processNum,
+                              ))
             processPool.append(subproc)
             subproc.start()
             pidList.append(subproc.pid)
diff --git a/tools/CrossStackProfiler/DCGMFileReader.py b/tools/CrossStackProfiler/DCGMFileReader.py
index 599acb44c6556..4ae15df5ad03b 100755
--- a/tools/CrossStackProfiler/DCGMFileReader.py
+++ b/tools/CrossStackProfiler/DCGMFileReader.py
@@ -34,6 +34,7 @@
 
 
 class dcgmFileReader(FileReader):
+
     def parseFileByGroup(self, groupId, processNum=8):
         fileFist = self.getFileListByGroup(groupId)
         displaySize = min(self._displaySize, len(fileFist))
@@ -53,10 +54,10 @@ def parseFileByGroup(self, groupId, processNum=8):
 
             taskList = self._splitTaskListForMultiProcess(fileFist, processNum)
             for task in taskList:
-                subproc = Process(
-                    target=self._parseTask, args=(
-                        task,
-                        q, ))
+                subproc = Process(target=self._parseTask, args=(
+                    task,
+                    q,
+                ))
                 processPool.append(subproc)
                 subproc.start()
                 pidList.append(subproc.pid)
@@ -77,8 +78,9 @@ def parseFileByGroup(self, groupId, processNum=8):
                     isFistProcess = False
                     dcgm_data = q.get()
                 else:
-                    dcgm_data = pd.concat(
-                        [dcgm_data, q.get()], axis=0, join='outer')
+                    dcgm_data = pd.concat([dcgm_data, q.get()],
+                                          axis=0,
+                                          join='outer')
 
             return dcgm_data
 
@@ -94,8 +96,9 @@ def _parseTask(self, taskList, q=None):
                 is_first = False
                 dcgm_data = tmp_data
             else:
-                dcgm_data = pd.concat(
-                    [dcgm_data, tmp_data], axis=0, join='outer')
+                dcgm_data = pd.concat([dcgm_data, tmp_data],
+                                      axis=0,
+                                      join='outer')
         dcgm_data = dcgm_data.dropna()
         if not q is None:
             q.put(dcgm_data)
@@ -123,8 +126,8 @@ def _parseSingleFile(self, fileName):
                 if 'nv-hostengine' in line or 'dmon' in line or 'Host Engine Listener Started' in line:
                     continue
 
-                if not line.strip().startswith("GPU") and not line.strip(
-                ).startswith("# Entity"):
+                if not line.strip().startswith(
+                        "GPU") and not line.strip().startswith("# Entity"):
                     continue
 
                 # skip non-needed headers (only the header in 1th line was needed)
@@ -223,14 +226,14 @@ def getDCGMTraceInfo(self, groupId, processNum=8):
         pidList = []
 
         for gpuId in range(self._gpuPerTrainer):
-            subproc = Process(
-                target=self._getDCGMTraceInfoByGpuId,
-                args=(
-                    groupId,
-                    gpuId,
-                    dcgm_data,
-                    pid_map,
-                    q, ))
+            subproc = Process(target=self._getDCGMTraceInfoByGpuId,
+                              args=(
+                                  groupId,
+                                  gpuId,
+                                  dcgm_data,
+                                  pid_map,
+                                  q,
+                              ))
             processPool.append(subproc)
             subproc.start()
             pidList.append(subproc.pid)
diff --git a/tools/CrossStackProfiler/NetFileReader.py b/tools/CrossStackProfiler/NetFileReader.py
index fe900fab2ad24..1ae8a6803d540 100755
--- a/tools/CrossStackProfiler/NetFileReader.py
+++ b/tools/CrossStackProfiler/NetFileReader.py
@@ -31,6 +31,7 @@
 
 
 class netFileReader(FileReader):
+
     def _parseSingleFile(self, fileNameList, tx_pid, rx_pid, q=None):
 
         traceInfo = {}
@@ -91,12 +92,13 @@ def parseFileByGroup(self, groupId, processNum=8):
 
         taskList = self._splitTaskListForMultiProcess(fileFist, processNum)
         for task in taskList:
-            subproc = Process(
-                target=self._parseSingleFile, args=(
-                    task,
-                    tx_pid,
-                    rx_pid,
-                    q, ))
+            subproc = Process(target=self._parseSingleFile,
+                              args=(
+                                  task,
+                                  tx_pid,
+                                  rx_pid,
+                                  q,
+                              ))
             processPool.append(subproc)
             subproc.start()
             pidList.append(subproc.pid)
diff --git a/tools/CrossStackProfiler/ProfileFileReader.py b/tools/CrossStackProfiler/ProfileFileReader.py
index 0f3299ef5473f..628592a159f82 100755
--- a/tools/CrossStackProfiler/ProfileFileReader.py
+++ b/tools/CrossStackProfiler/ProfileFileReader.py
@@ -35,6 +35,7 @@
 
 
 class profileFileReader(FileReader):
+
     def _parseSingleFile(self, profile):
         with open(profile, 'rb') as f:
             profile_s = f.read()
@@ -71,6 +72,7 @@ def _is_forwardBackwardInfo(self, items):
         return False
 
     def _allocate_forwardBackwardInfo(self, restList, pid, tid):
+
         def _cmp_ele(items):
             return items["ts"]
 
@@ -135,8 +137,8 @@ def _getPipeLineInfo(self, profileList, q=None):
                 if self._is_forwardBackwardInfo(traceEvent):
                     traceEventList.append(traceEvent)
 
-            pipeLineList = self._allocate_forwardBackwardInfo(traceEventList,
-                                                              pid, tid)
+            pipeLineList = self._allocate_forwardBackwardInfo(
+                traceEventList, pid, tid)
 
             res[str(rankId)] = pipeLineList
 
@@ -159,10 +161,10 @@ def getPipeLineInfo(self, groupId, processNum=8):
 
         taskList = self._splitTaskListForMultiProcess(fileFist, processNum)
         for task in taskList:
-            subproc = Process(
-                target=self._getPipeLineInfo, args=(
-                    task,
-                    q, ))
+            subproc = Process(target=self._getPipeLineInfo, args=(
+                task,
+                q,
+            ))
             processPool.append(subproc)
             subproc.start()
             pidList.append(subproc.pid)
@@ -215,13 +217,13 @@ def _allocate_pids(self, profile_dict, gpuId, initPid):
                         devices[(k, event.device_id, "CPU")] = pid
                         # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
                         if event.device_id == -1:
-                            chrome_trace.emit_pid("%02d_%s:cuda_api" %
-                                                  (lineNum, k), pid)
+                            chrome_trace.emit_pid(
+                                "%02d_%s:cuda_api" % (lineNum, k), pid)
                             lineNum = lineNum + 1
                         else:
-                            chrome_trace.emit_pid("%02d_%s:cpu:block:%d" %
-                                                  (lineNum, k, event.device_id),
-                                                  pid)
+                            chrome_trace.emit_pid(
+                                "%02d_%s:cpu:block:%d" %
+                                (lineNum, k, event.device_id), pid)
                             lineNum = lineNum + 1
                 elif event.type == profiler_pb2.Event.GPUKernel:
                     if (k, event.device_id, "GPUKernel") not in devices:
@@ -230,9 +232,9 @@ def _allocate_pids(self, profile_dict, gpuId, initPid):
                             initPid = initPid + 1
 
                             devices[(k, event.device_id, "GPUKernel")] = pid
-                            chrome_trace.emit_pid("%02d_%s:gpu:%d" %
-                                                  (lineNum, k, event.device_id),
-                                                  pid)
+                            chrome_trace.emit_pid(
+                                "%02d_%s:gpu:%d" %
+                                (lineNum, k, event.device_id), pid)
                             lineNum = lineNum + 1
 
             if not hasattr(profile_pb, "mem_events"):
@@ -255,13 +257,13 @@ def _allocate_pids(self, profile_dict, gpuId, initPid):
                         initPid = initPid + 1
 
                         mem_devices[(k, mevent.device_id, "CPU")] = pid
-                        chrome_trace.emit_pid("%02d_memory usage on %s:cpu:%d" %
-                                              (lineNum, k, mevent.device_id),
-                                              pid)
+                        chrome_trace.emit_pid(
+                            "%02d_memory usage on %s:cpu:%d" %
+                            (lineNum, k, mevent.device_id), pid)
                         lineNum = lineNum + 1
                 elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
-                    if (k, mevent.device_id, "CUDAPinnedPlace"
-                        ) not in mem_devices:
+                    if (k, mevent.device_id,
+                            "CUDAPinnedPlace") not in mem_devices:
                         if gpuId == mevent.device_id:
                             pid = initPid
                             initPid = initPid + 1
@@ -277,8 +279,8 @@ def _allocate_pids(self, profile_dict, gpuId, initPid):
                     initPid = initPid + 1
 
                     mem_devices[(k, 0, "CPU")] = pid
-                    chrome_trace.emit_pid("%02d_memory usage on %s:cpu:%d" %
-                                          (lineNum, k, 0), pid)
+                    chrome_trace.emit_pid(
+                        "%02d_memory usage on %s:cpu:%d" % (lineNum, k, 0), pid)
                     lineNum = lineNum + 1
                 if (k, 0, "GPU") not in mem_devices:
                     # if gpuId == mevent.device_id:
@@ -286,8 +288,8 @@ def _allocate_pids(self, profile_dict, gpuId, initPid):
                     initPid = initPid + 1
 
                     mem_devices[(k, 0, "GPU")] = pid
-                    chrome_trace.emit_pid("%02d_memory usage on %s:gpu:%d" %
-                                          (lineNum, k, 0), pid)
+                    chrome_trace.emit_pid(
+                        "%02d_memory usage on %s:gpu:%d" % (lineNum, k, 0), pid)
                     lineNum = lineNum + 1
                 if (k, 0, "CUDAPinnedPlace") not in mem_devices:
                     pid = initPid
@@ -324,10 +326,10 @@ def _allocate_events(self, profile_dict, devices, gpuId):
                     args['detail_info'] = event.detail_info
                 # TODO(panyx0718): Chrome tracing only handles ms. However, some
                 # ops takes micro-seconds. Hence, we keep the ns here.
-                chrome_trace.emit_region(
-                    self._align_ts(event.start_ns),
-                    (event.end_ns - event.start_ns) / 1.0, pid,
-                    event.sub_device_id, 'Op', event.name, args)
+                chrome_trace.emit_region(self._align_ts(event.start_ns),
+                                         (event.end_ns - event.start_ns) / 1.0,
+                                         pid, event.sub_device_id, 'Op',
+                                         event.name, args)
         return chrome_trace
 
     def _allocate_memory_event(self, profile_dict, mem_devices, gpuId):
@@ -358,8 +360,8 @@ def _allocate_memory_event(self, profile_dict, mem_devices, gpuId):
                 else:
                     place = "UnDefine"
 
-                if (mevent.place == profiler_pb2.MemEvent.CUDAPlace or
-                        mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace
+                if (mevent.place == profiler_pb2.MemEvent.CUDAPlace
+                        or mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace
                     ) and mevent.device_id != gpuId:
                     continue
 
@@ -388,9 +390,10 @@ def _allocate_memory_event(self, profile_dict, mem_devices, gpuId):
                     total_size += mem_list[i + 1]['size']
                     i += 1
 
-                chrome_trace.emit_counter(
-                    "Memory", "Memory", mem_list[i]['pid'],
-                    self._align_ts(mem_list[i]['time']), 0, total_size)
+                chrome_trace.emit_counter("Memory", "Memory",
+                                          mem_list[i]['pid'],
+                                          self._align_ts(mem_list[i]['time']),
+                                          0, total_size)
                 i += 1
         return chrome_trace
 
@@ -426,10 +429,11 @@ def getOPTraceInfo(self, groupId):
         pidList = []
 
         for gpuId in range(self._gpuPerTrainer):
-            subproc = Process(
-                target=self._getOPTraceInfoByGpuId, args=(
-                    groupId,
-                    gpuId, ))
+            subproc = Process(target=self._getOPTraceInfoByGpuId,
+                              args=(
+                                  groupId,
+                                  gpuId,
+                              ))
             processPool.append(subproc)
             subproc.start()
             pidList.append(subproc.pid)
diff --git a/tools/analysisPyXml.py b/tools/analysisPyXml.py
index 5d6a5ac459408..9d70a15911838 100644
--- a/tools/analysisPyXml.py
+++ b/tools/analysisPyXml.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -47,8 +47,8 @@ def analysisPyXml(rootPath, ut):
                     if output.strip().startswith(
                         ('from', 'import', '__all__', 'def', 'class', '"""',
                          '@', '\'\'\'', 'logger', '_logger', 'logging', 'r"""',
-                         'pass', 'try', 'except', 'if __name__ == "__main__"'
-                         )) == False:
+                         'pass', 'try', 'except',
+                         'if __name__ == "__main__"')) == False:
                         pattern = "(.*) = ('*')|(.*) = (\"*\")|(.*) = (\d)|(.*) = (-\d)|(.*) = (None)|(.*) = (True)|(.*) = (False)|(.*) = (URL_PREFIX*)|(.*) = (\[)|(.*) = (\{)|(.*) = (\()"  #a='b'/a="b"/a=0
                         if re.match(pattern, output.strip()) == None:
                             pyCov_file.append(clazz_filename)
diff --git a/tools/analysis_build_time.py b/tools/analysis_build_time.py
index 8ae94348f21eb..e2cc454f5733c 100644
--- a/tools/analysis_build_time.py
+++ b/tools/analysis_build_time.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index dd864d9ed0ddc..18b467ccf4781 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -44,21 +44,21 @@ if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n"
     echo_line="${echo_line} and one TPM approval for API change: \n"
     echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general APIs.\n"
-    echo_line="${echo_line} PangHua/XiangHui for distributed related APIs.\n"
+    echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n"
     echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n"
 
     check_approval 1 46782768 47554610 328693
-    check_approval 1 29231 23093488 11935832 39876205 2682285 54695910
+    check_approval 1 29231 23093488 11935832 39876205 65896652 54695910
 fi
 
 api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` 
 if [ "$api_doc_spec_diff" != "" ]; then
     echo_line="You must have  one TPM approval for API documents change: \n"
     echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general API docs.\n"
-    echo_line="${echo_line} PangHua/XiangHui for distributed related API docs.\n"
+    echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related API docs.\n"
     echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n"
 
-    check_approval 1 29231 23093488 11935832 39876205 2682285 54695910
+    check_approval 1 29231 23093488 11935832 39876205 65896652 54695910
 fi
 
 api_src_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5  ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` 
diff --git a/tools/check_api_compatible.py b/tools/check_api_compatible.py
index f91112abd6460..18fb4d7ecdf14 100644
--- a/tools/check_api_compatible.py
+++ b/tools/check_api_compatible.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -61,16 +61,16 @@ def check_compatible(old_api_spec, new_api_spec):
     """
     check compatible, FullArgSpec
     """
-    if not (isinstance(old_api_spec, inspect.FullArgSpec) and isinstance(
-            new_api_spec, inspect.FullArgSpec)):
+    if not (isinstance(old_api_spec, inspect.FullArgSpec)
+            and isinstance(new_api_spec, inspect.FullArgSpec)):
         logger.warning(
             "new_api_spec or old_api_spec is not instance of inspect.FullArgSpec"
         )
         return False
     return _check_compatible(
-        old_api_spec.args, new_api_spec.args, []
-        if old_api_spec.defaults is None else old_api_spec.defaults, []
-        if new_api_spec.defaults is None else new_api_spec.defaults)
+        old_api_spec.args, new_api_spec.args,
+        [] if old_api_spec.defaults is None else old_api_spec.defaults,
+        [] if new_api_spec.defaults is None else new_api_spec.defaults)
 
 
 def check_compatible_str(old_api_spec_str, new_api_spec_str):
@@ -129,13 +129,15 @@ def parse_args():
         'prev',
         type=argparse.FileType('r'),
         help='the previous version (the version from develop branch)')
-    parser.add_argument(
-        'post',
-        type=argparse.FileType('r'),
-        help='the post version (the version from PullRequest)')
+    parser.add_argument('post',
+                        type=argparse.FileType('r'),
+                        help='the post version (the version from PullRequest)')
     for item in arguments:
-        parser.add_argument(
-            item[0], dest=item[1], help=item[4], type=item[2], default=item[3])
+        parser.add_argument(item[0],
+                            dest=item[1],
+                            help=item[4],
+                            type=item[2],
+                            default=item[3])
 
     if len(sys.argv) < 2:
         parser.print_help()
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 6e086d9d7ca58..ee282fb294aea 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -428,13 +428,13 @@ RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|gre
 if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED};
     do
-        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|RUN_TYPE=NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY|RUN_TYPE=DIST:NIGHTLY|PROPERTIES[[:space:]]+TIMEOUT" || true`
+        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "SERIAL|RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|RUN_TYPE=NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY|RUN_TYPE=DIST:NIGHTLY|PROPERTIES[[:space:]]+TIMEOUT" || true`
 	if [[ ${RUNTYPE_ADD} != "" ]];then
 	    RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n"
 	fi
     done
     if [[ ${RUNTYPE_ADD_LINES} != "" ]];then
-        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE as EXCLUSIVE, DIST, NIGHTLY, EXCLUSIVE:NIGHTLY or DISTNIGHTLY, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
+        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE as EXCLUSIVE, DIST, NIGHTLY, EXCLUSIVE:NIGHTLY or DISTNIGHTLY, or setting parameter SERIAL, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
 	check_approval 1 32428676 45041955
     fi
 fi
diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py
index e45d12c7b1b33..73075125ac46b 100644
--- a/tools/check_op_benchmark_result.py
+++ b/tools/check_op_benchmark_result.py
@@ -61,7 +61,8 @@ def load_benchmark_result_from_logs_dir(logs_dir):
     check_path_exists(logs_dir)
 
     log_file_path = lambda log_file: os.path.join(logs_dir, log_file)
-    result_lambda = lambda log_file: (log_file, parse_log_file(log_file_path(log_file)))
+    result_lambda = lambda log_file: (log_file,
+                                      parse_log_file(log_file_path(log_file)))
 
     return dict(map(result_lambda, os.listdir(logs_dir)))
 
@@ -183,11 +184,10 @@ def summary_results(check_results, api_info_file):
         type=str,
         required=True,
         help="Specify the benchmark result directory of PR branch.")
-    parser.add_argument(
-        "--api_info_file",
-        type=str,
-        required=False,
-        help="Specify the api info to run benchmark test.")
+    parser.add_argument("--api_info_file",
+                        type=str,
+                        required=False,
+                        help="Specify the api info to run benchmark test.")
     args = parser.parse_args()
 
     check_results = dict(accuracy=list(), speed=list())
diff --git a/tools/check_op_desc.py b/tools/check_op_desc.py
index 19984a55a41af..7367b88d5b8f5 100644
--- a/tools/check_op_desc.py
+++ b/tools/check_op_desc.py
@@ -321,8 +321,8 @@ def print_desc_error_message(error_message):
             for arg in changed_args:
                 ori_value, new_value = changed_args.get(arg)
                 print(
-                    " * The arg '{}' of Input '{}' is changed: from '{}' to '{}'.".
-                    format(arg, name, ori_value, new_value))
+                    " * The arg '{}' of Input '{}' is changed: from '{}' to '{}'."
+                    .format(arg, name, ori_value, new_value))
 
         for name in Inputs_error.get(QUANT, {}):
             print(" * The added Input '{}' is `quant`, need slim to review.".
@@ -345,8 +345,8 @@ def print_desc_error_message(error_message):
             for arg in changed_args:
                 ori_value, new_value = changed_args.get(arg)
                 print(
-                    " * The arg '{}' of Output '{}' is changed: from '{}' to '{}'.".
-                    format(arg, name, ori_value, new_value))
+                    " * The arg '{}' of Output '{}' is changed: from '{}' to '{}'."
+                    .format(arg, name, ori_value, new_value))
 
         for name in Outputs_error.get(QUANT, {}):
             print(" * The added Output '{}' is `quant`, need slim to review.".
@@ -371,8 +371,8 @@ def print_desc_error_message(error_message):
             for arg in changed_args:
                 ori_value, new_value = changed_args.get(arg)
                 print(
-                    " * The arg '{}' of attr '{}' is changed: from '{}' to '{}'.".
-                    format(arg, name, ori_value, new_value))
+                    " * The arg '{}' of attr '{}' is changed: from '{}' to '{}'."
+                    .format(arg, name, ori_value, new_value))
 
         for name in attrs_error.get(QUANT, {}):
             # TODO(Wilber):
@@ -414,13 +414,15 @@ def print_version_error_message(error_message):
         error_list = attrs_error.get(ADD, [])
         if error_list:
             for tup in error_list:
-                print(" * The added attribute '{}' is not yet registered.".
-                      format(tup[1]))
-        error_dic = error_message.get(op_name, {}).get(ATTRS, {}).get(CHANGE,
-                                                                      {})
+                print(
+                    " * The added attribute '{}' is not yet registered.".format(
+                        tup[1]))
+        error_dic = error_message.get(op_name, {}).get(ATTRS,
+                                                       {}).get(CHANGE, {})
         for key, val in error_dic.items():
-            print(" * The change of attribute '{}' is not yet registered.".
-                  format(key))
+            print(
+                " * The change of attribute '{}' is not yet registered.".format(
+                    key))
 
 
 def print_repeat_process():
@@ -446,8 +448,8 @@ def print_repeat_process():
     with open(sys.argv[2], 'r') as f:
         new_op_desc = f.read()
 
-    desc_error_message, version_error_message = compare_op_desc(origin_op_desc,
-                                                                new_op_desc)
+    desc_error_message, version_error_message = compare_op_desc(
+        origin_op_desc, new_op_desc)
     if error:
         print("-" * 30)
         print_desc_error_message(desc_error_message)
diff --git a/tools/check_op_register_type.py b/tools/check_op_register_type.py
index b32eff057318b..0b67e6e7f58cd 100644
--- a/tools/check_op_register_type.py
+++ b/tools/check_op_register_type.py
@@ -45,8 +45,8 @@ def get_all_kernels():
             register_type = infos[0].split(":")[-1]
             op_kernel_types[op_type].append(register_type.lower())
 
-    for (op_type, op_kernels) in sorted(
-            op_kernel_types.items(), key=lambda x: x[0]):
+    for (op_type, op_kernels) in sorted(op_kernel_types.items(),
+                                        key=lambda x: x[0]):
         print(op_type, " ".join(sorted(op_kernels)))
 
 
@@ -64,8 +64,8 @@ def print_diff(op_type, register_types):
     if len(FLOATS - register_types) == 1:
         lack_types |= FLOATS - register_types
 
-    print("{} only supports [{}] now, but lacks [{}].".format(op_type, " ".join(
-        register_types), " ".join(lack_types)))
+    print("{} only supports [{}] now, but lacks [{}].".format(
+        op_type, " ".join(register_types), " ".join(lack_types)))
 
 
 def check_add_op_valid():
diff --git a/tools/codestyle/clang_format.hook b/tools/codestyle/clang_format.hook
index 1d928216867c0..72608fd8b83fd 100755
--- a/tools/codestyle/clang_format.hook
+++ b/tools/codestyle/clang_format.hook
@@ -1,15 +1,12 @@
 #!/bin/bash
 set -e
 
-readonly VERSION="3.8"
+readonly VERSION="13.0.0"
 
 version=$(clang-format -version)
 
 if ! [[ $version == *"$VERSION"* ]]; then
-    echo "clang-format version check failed."
-    echo "a version contains '$VERSION' is needed, but get '$version'"
-    echo "you can install the right version, and make an soft-link to '\$PATH' env"
-    exit -1
+    pip install clang-format==13.0.0
 fi
 
 clang-format $@
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index c90bf29ecb794..cef11ab1351b7 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -1,10 +1,15 @@
 #!/bin/bash
 
 TOTAL_ERRORS=0
+
+readonly VERSION="1.6.0"
+
+version=$(cpplint --version)
+
 if [[ ! $TRAVIS_BRANCH ]]; then
   # install cpplint on local machine.
-  if [[ ! $(which cpplint) ]]; then
-    pip install cpplint
+  if ! [[ $version == *"$VERSION"* ]]; then
+    pip install cpplint==1.6.0
   fi
   # diff files on local machine. 
   files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}')
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
index 823d947023041..c5a9d85269108 100644
--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
@@ -134,12 +134,12 @@ class DocstringChecker(BaseChecker):
                   symbol + "-missing", 'Add docstring longer >=10'),
         'W9006': ('Docstring indent error, use 4 space for indent',
                   symbol + "-indent-error", 'Use 4 space for indent'),
-        'W9007': ('You should add `Returns` in comments',
-                  symbol + "-with-returns",
-                  'There should be a `Returns` section in comments'),
-        'W9008': ('You should add `Raises` section in comments',
-                  symbol + "-with-raises",
-                  'There should be a `Raises` section in comments'),
+        'W9007':
+        ('You should add `Returns` in comments', symbol + "-with-returns",
+         'There should be a `Returns` section in comments'),
+        'W9008':
+        ('You should add `Raises` section in comments', symbol + "-with-raises",
+         'There should be a `Raises` section in comments'),
     }
     options = ()
 
@@ -333,17 +333,20 @@ def all_args_in_doc(self, node, doc):
         parsed_args = doc.args
         args_not_documented = set(args) - set(parsed_args)
         if len(args) > 0 and len(parsed_args) <= 0:
-            self.add_message(
-                'W9003',
-                node=node,
-                line=node.fromlineno,
-                args=list(args_not_documented))
+            self.add_message('W9003',
+                             node=node,
+                             line=node.fromlineno,
+                             args=list(args_not_documented))
             return False
 
         for t in args:
             if t not in parsed_args:
-                self.add_message(
-                    'W9003', node=node, line=node.fromlineno, args=[t, ])
+                self.add_message('W9003',
+                                 node=node,
+                                 line=node.fromlineno,
+                                 args=[
+                                     t,
+                                 ])
                 return False
 
         return True
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
index 150a3f5666bd3..1c81f4b456339 100755
--- a/tools/codestyle/pylint_pre_commit.hook
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -6,6 +6,13 @@ TOTAL_ERRORS=0
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 export PYTHONPATH=$DIR:$PYTHONPATH
 
+readonly VERSION="2.12.0"
+version=$(pylint --version | grep 'pylint')
+
+if ! [[ $version == *"$VERSION"* ]]; then
+    pip install pylint==2.12.0
+fi
+
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
     pylint --disable=all --load-plugins=docstring_checker \
@@ -16,4 +23,3 @@ done
 exit $TOTAL_ERRORS
 #For now, just warning:
 #exit 0
-
diff --git a/tools/continuous_integration/bisect.py b/tools/continuous_integration/bisect.py
index 21a46e5cef096..afffc60a449ac 100644
--- a/tools/continuous_integration/bisect.py
+++ b/tools/continuous_integration/bisect.py
@@ -27,36 +27,43 @@
 import sys
 
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--git_dir', type=str, default='', help='git repo root directory.')
-parser.add_argument(
-    '--build_dir', type=str, default='', help='build directory.')
-parser.add_argument(
-    '--good_commit',
-    type=str,
-    default='',
-    help='The old commit known to be good.')
-parser.add_argument(
-    '--bad_commit',
-    type=str,
-    default='',
-    help='The new commit known to be bad.')
-parser.add_argument(
-    '--test_target', type=str, default='', help='The test target to evaluate.')
+parser.add_argument('--git_dir',
+                    type=str,
+                    default='',
+                    help='git repo root directory.')
+parser.add_argument('--build_dir',
+                    type=str,
+                    default='',
+                    help='build directory.')
+parser.add_argument('--good_commit',
+                    type=str,
+                    default='',
+                    help='The old commit known to be good.')
+parser.add_argument('--bad_commit',
+                    type=str,
+                    default='',
+                    help='The new commit known to be bad.')
+parser.add_argument('--test_target',
+                    type=str,
+                    default='',
+                    help='The test target to evaluate.')
 parser.add_argument(
     '--bisect_branch',
     type=str,
     default='develop',
     help='The mainline branch to bisect (feature branch ignored.')
-parser.add_argument(
-    '--log_file', type=str, default='', help='The file use to log outputs.')
-parser.add_argument(
-    '--test_times',
-    type=int,
-    default=10,
-    help="Number of times to run the test target.")
-parser.add_argument(
-    '--build_parallel', type=int, default=32, help="make parallelism.")
+parser.add_argument('--log_file',
+                    type=str,
+                    default='',
+                    help='The file use to log outputs.')
+parser.add_argument('--test_times',
+                    type=int,
+                    default=10,
+                    help="Number of times to run the test target.")
+parser.add_argument('--build_parallel',
+                    type=int,
+                    default=32,
+                    help="make parallelism.")
 args = parser.parse_args()
 
 if not args.log_file:
@@ -74,12 +81,10 @@ def print_arguments():
 
 # List the commits in mainline branch.
 os.chdir(args.git_dir)
-ret = subprocess.check_output(
-    [
-        'git rev-list --first-parent %s...%s' % (args.good_commit,
-                                                 args.bad_commit)
-    ],
-    shell=True)
+ret = subprocess.check_output([
+    'git rev-list --first-parent %s...%s' % (args.good_commit, args.bad_commit)
+],
+                              shell=True)
 sys.stdout.write('commits found:\n%s\n' % ret)
 commits = ret.strip().split('\n')
 os.chdir(args.build_dir)
@@ -90,12 +95,11 @@ def print_arguments():
 while True:
     # Get to the mainline branch and clean up
     os.chdir(args.git_dir)
-    subprocess.check_output(
-        [
-            'git checkout %s && git clean -fd && git checkout .' %
-            args.bisect_branch
-        ],
-        shell=True)
+    subprocess.check_output([
+        'git checkout %s && git clean -fd && git checkout .' %
+        args.bisect_branch
+    ],
+                            shell=True)
 
     if not commits:
         sys.stdout.write('no commits to bisect\n')
diff --git a/tools/count_api_without_core_ops.py b/tools/count_api_without_core_ops.py
index 5519859471ac9..6b5bffd332743 100644
--- a/tools/count_api_without_core_ops.py
+++ b/tools/count_api_without_core_ops.py
@@ -24,9 +24,11 @@
 import platform
 from paddle import _C_ops
 
-__all__ = ['get_apis_with_and_without_core_ops', ]
+__all__ = [
+    'get_apis_with_and_without_core_ops',
+]
 
-# APIs that should not be printed into API.spec 
+# APIs that should not be printed into API.spec
 omitted_list = [
     "paddle.fluid.LoDTensor.set",  # Do not know why it should be omitted
     "paddle.fluid.io.ComposeNotAligned",
@@ -41,10 +43,9 @@ def md5(doc):
         md5sum = hashinst.hexdigest()
     except UnicodeDecodeError as e:
         md5sum = None
-        print(
-            "Error({}) occurred when `md5({})`, discard it.".format(
-                str(e), doc),
-            file=sys.stderr)
+        print("Error({}) occurred when `md5({})`, discard it.".format(
+            str(e), doc),
+              file=sys.stderr)
     return md5sum
 
 
@@ -99,8 +100,8 @@ def visit_member(parent_name, member, func):
     if inspect.isclass(member):
         func(member, cur_name)
         for name, value in inspect.getmembers(member):
-            if hasattr(value, '__name__') and (not name.startswith("_") or
-                                               name == "__init__"):
+            if hasattr(value, '__name__') and (not name.startswith("_")
+                                               or name == "__init__"):
                 visit_member(cur_name, value, func)
     elif inspect.ismethoddescriptor(member):
         return
@@ -109,8 +110,9 @@ def visit_member(parent_name, member, func):
     elif inspect.isgetsetdescriptor(member):
         return
     else:
-        raise RuntimeError("Unsupported generate signature of member, type {0}".
-                           format(str(type(member))))
+        raise RuntimeError(
+            "Unsupported generate signature of member, type {0}".format(
+                str(type(member))))
 
 
 def is_primitive(instance):
@@ -175,8 +177,8 @@ def get_apis_with_and_without_core_ops(modules):
     api_with_ops = []
     api_without_ops = []
     for m in modules:
-        visit_all_module(
-            importlib.import_module(m), split_with_and_without_core_ops)
+        visit_all_module(importlib.import_module(m),
+                         split_with_and_without_core_ops)
     return api_with_ops, api_without_ops
 
 
diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py
index 6a400d293b27d..fc5a34364c59f 100644
--- a/tools/coverage/coverage_diff.py
+++ b/tools/coverage/coverage_diff.py
@@ -2,13 +2,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py
index 6283430120995..13ba471c13a7b 100644
--- a/tools/coverage/coverage_diff_list.py
+++ b/tools/coverage/coverage_diff_list.py
@@ -2,13 +2,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/coverage/coverage_lines.py b/tools/coverage/coverage_lines.py
index 553cd691e4520..3c5df9d88e8c4 100644
--- a/tools/coverage/coverage_lines.py
+++ b/tools/coverage/coverage_lines.py
@@ -2,13 +2,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/coverage/cuda_clean.py b/tools/coverage/cuda_clean.py
index 8c03edd078549..82bb6a553c955 100644
--- a/tools/coverage/cuda_clean.py
+++ b/tools/coverage/cuda_clean.py
@@ -2,13 +2,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py
index 12bd04a6907ea..062b8f356d667 100644
--- a/tools/coverage/gcda_clean.py
+++ b/tools/coverage/gcda_clean.py
@@ -2,13 +2,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py
index f3e88286ca965..53325d36820a3 100644
--- a/tools/coverage/pull_request.py
+++ b/tools/coverage/pull_request.py
@@ -2,13 +2,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py
index f2e52b5e23b3a..d45fb4d58c591 100644
--- a/tools/coverage/python_coverage.py
+++ b/tools/coverage/python_coverage.py
@@ -2,13 +2,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -33,8 +33,8 @@
     clazz_filename = path.join(source, clazz_filename)
 
     if clazz_filename.startswith('/paddle/build/python/'):
-        clazz_filename = '/paddle/python/' + clazz_filename[len(
-            '/paddle/build/python/'):]
+        clazz_filename = '/paddle/python/' + clazz_filename[
+            len('/paddle/build/python/'):]
 
     if not path.exists(clazz_filename):
         continue
diff --git a/tools/diff_api.py b/tools/diff_api.py
index f086598945afe..8dabf316c2dd7 100644
--- a/tools/diff_api.py
+++ b/tools/diff_api.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/diff_unittest.py b/tools/diff_unittest.py
index fa70be0990ec0..178fd1647d9f0 100644
--- a/tools/diff_unittest.py
+++ b/tools/diff_unittest.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py
index e07f05f561cb5..520561d299b9d 100644
--- a/tools/externalError/spider.py
+++ b/tools/externalError/spider.py
@@ -306,8 +306,8 @@ def parsing(externalErrorDesc):
         res_strong = r'<strong class="ph b">.*?</strong>'
         res_strong_detail = r'<strong class="ph b">(.*?)</strong>'
         list_strong = re.findall(res_strong, m_message, re.S | re.M)
-        list_strong_detail = re.findall(res_strong_detail, m_message, re.S |
-                                        re.M)
+        list_strong_detail = re.findall(res_strong_detail, m_message,
+                                        re.S | re.M)
         assert len(list_strong) == len(list_strong_detail)
         for idx in range(len(list_strong)):
             m_message = m_message.replace(list_strong[idx],
diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py
new file mode 100644
index 0000000000000..7a25eee71b227
--- /dev/null
+++ b/tools/final_ut_parallel_rule.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import json
+import datetime
+import codecs
+import sys
+
+
+def classify_cases_by_mem(rootPath):
+    """Group unit tests into parallel-run batches by recorded GPU memory.
+
+    Reads ``<rootPath>/build/classify_case_by_cardNum.txt`` and
+    ``/pre_test/ut_mem_map.json``, then writes under ``/pre_test/``:
+    ``classify_case_by_cardNum.json`` (case names per card type), one
+    ``<cardType>_mem0`` file (cases whose ``mem_nvidia`` is 0) and one
+    ``<cardType>`` file holding one ``^job$|^case$...`` batch per line.
+    Cases in ``case_always_timeout`` or absent from the memory map are not
+    written to any batch. Finally ``build/nightly_case`` is copied over.
+    The ``job`` placeholder and empty names are dropped while parsing.
+
+    Args:
+        rootPath: Paddle repository root; its ``build`` directory is read.
+    """
+    case_filename = '%s/build/classify_case_by_cardNum.txt' % rootPath
+    case_exec_100 = [
+        'test_conv_eltwiseadd_bn_fuse_pass', 'test_trt_convert_pool2d',
+        'test_fc_fuse_pass', 'test_trt_convert_depthwise_conv2d',
+        'test_quant2_int8_resnet50_mkldnn',
+        'test_conv_elementwise_add_act_fuse_pass', 'test_trt_convert_conv2d',
+        'test_paddle_save_load', 'test_logical_op', 'test_nearest_interp_op',
+        'test_pool2d_op', 'test_conv3d_transpose_op', 'test_lstmp_op',
+        'test_cross_entropy2_op', 'test_sgd_op', 'test_imperative_ptq',
+        'test_model', 'test_custom_relu_op_setup', 'test_dropout_op',
+        'test_concat_op'
+    ]  # "wooden bucket" principle: cases that take 70s-100s
+
+    case_exec_200 = [
+        'test_post_training_quantization_mnist',
+        'test_imperative_auto_mixed_precision',
+        'test_trt_dynamic_shape_ernie_fp16_ser_deser',
+        'test_trt_dynamic_shape_ernie', 'test_layer_norm_op',
+        'trt_quant_int8_yolov3_r50_test', 'test_gru_op',
+        'test_post_training_quantization_while', 'test_mkldnn_log_softmax_op',
+        'test_mkldnn_matmulv2_op', 'test_mkldnn_shape_op',
+        'interceptor_pipeline_short_path_test',
+        'interceptor_pipeline_long_path_test', 'test_cpuonly_spawn'
+    ]  # "wooden bucket" principle: 110s-200s cases and timeout-prone ones
+
+    case_always_timeout = [
+        'test_quant2_int8_resnet50_channelwise_mkldnn',
+        'test_parallel_dygraph_unused_variables_gloo',
+        'test_seq2seq',
+        'test_pool3d_op',
+        'test_trilinear_interp_op',
+        'test_trilinear_interp_v2_op',
+        'test_dropout_op',
+        'test_parallel_dygraph_sync_batch_norm',
+        'test_conv3d_op',
+        'test_quant2_int8_resnet50_range_mkldnn',
+    ]  # always timeout
+
+    # Each input line looks like "<cardType>: ^job$|^case_a$|^case_b$".
+    card_types = ('single_card_tests', 'multiple_card_tests',
+                  'exclusive_card_tests')
+    all_tests_by_card = {}
+    with open(case_filename) as case_file:  # closed even if parsing fails
+        for line in case_file:
+            for cardType in card_types:
+                if not line.startswith(cardType + ':'):
+                    continue
+                names = [
+                    item.replace('^', '').replace('$', '').strip()
+                    for item in line.split(cardType + ':', 1)[1].split('|')
+                ]
+                # Drop the "job" placeholder and empty names, so a list
+                # without real cases yields [] instead of [''] or an error.
+                all_tests_by_card[cardType] = [
+                    name for name in names if name and name != 'job'
+                ]
+
+    with open("/pre_test/classify_case_by_cardNum.json", "w") as f:
+        json.dump(all_tests_by_card, f)
+
+    with open("/pre_test/ut_mem_map.json", 'r') as load_f:
+        new_lastest_mem = json.load(load_f)
+
+    # Slow cases get dedicated batches further down, so they are kept out of
+    # the memory-based grouping.
+    slow_cases = set(case_exec_100) | set(case_exec_200)
+    # Budget for the summed "mem_nvidia" values of one batch.
+    # NOTE(review): presumably MB, i.e. twice a 16 GB card -- confirm.
+    mem_limit = 16 * 1024 * 2
+
+    for cardType in all_tests_by_card:
+        case_mem_0 = '^job$'
+        case_mem_1 = {}
+        for case in all_tests_by_card[cardType]:
+            # Always-timeout cases and cases without a memory record are
+            # left out of every batch written by this function.
+            if (case in slow_cases or case in case_always_timeout
+                    or case not in new_lastest_mem):
+                continue
+            mem = new_lastest_mem[case]["mem_nvidia"]
+            if mem == 0:
+                case_mem_0 = case_mem_0 + '|^' + case + '$'
+            else:
+                case_mem_1[case] = mem
+
+        with open('/pre_test/%s_mem0' % cardType, 'w') as f:
+            f.write(case_mem_0)
+
+        # Fill batches in ascending memory order; a batch is flushed once
+        # its running total has reached the budget.
+        case_mem_1_line = '^job$'
+        mem_1_sum = 0
+        with open('/pre_test/%s' % cardType, 'w') as f_not_0:
+            for case, mem in sorted(case_mem_1.items(), key=lambda x: x[1]):
+                if mem_1_sum < mem_limit:
+                    mem_1_sum += mem
+                    case_mem_1_line = case_mem_1_line + '|^' + case + '$'
+                else:
+                    f_not_0.write(case_mem_1_line + '\n')
+                    case_mem_1_line = '^job$|^' + case + '$'
+                    mem_1_sum = mem
+            f_not_0.write(case_mem_1_line + '\n')
+
+            if cardType == 'single_card_tests':
+                # Each list of slow cases becomes one batch of its own.
+                for cases in [case_exec_100, case_exec_200]:
+                    f_not_0.write('^job$' + ''.join(
+                        '|^' + case + '$' for case in cases) + '\n')
+
+    # Shell copy kept as is: a failing cp is ignored rather than raised.
+    os.system('cp %s/build/nightly_case /pre_test/' % rootPath)
+
+
+if __name__ == '__main__':
+    rootPath = sys.argv[1]  # first CLI argument, forwarded as rootPath
+    classify_cases_by_mem(rootPath)
diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index 6b90a656f0107..4c21d59cbe296 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -88,8 +88,8 @@ def __wget_with_retry(self, url):
             if code == 0:
                 return True
             print(
-                'PREC download {} error, retry {} time(s) after {} secs.[proxy_option={}]'.
-                format(url, ix, ix * 10, proxy))
+                'PREC download {} error, retry {} time(s) after {} secs.[proxy_option={}]'
+                .format(url, ix, ix * 10, proxy))
             time.sleep(ix * 10)
             ix += 1
         return False
@@ -111,8 +111,8 @@ def __urlretrieve(self, url, filename):
             except Exception as e:
                 print(e)
                 print(
-                    'PREC download {} error, retry {} time(s) after {} secs.[proxy_option={}]'.
-                    format(url, ix, ix * 10, cur_proxy))
+                    'PREC download {} error, retry {} time(s) after {} secs.[proxy_option={}]'
+                    .format(url, ix, ix * 10, cur_proxy))
                 continue
             else:
                 return True
@@ -226,7 +226,9 @@ def get_pr_diff_lines(self):
                             if line_list:
                                 line_list.append(line)
                             else:
-                                file_to_diff_lines[filename] = [line, ]
+                                file_to_diff_lines[filename] = [
+                                    line,
+                                ]
                         if data[ix][0] != '-':
                             lineno += 1
                         ix += 1
@@ -246,10 +248,9 @@ def is_only_comment(self, f):
         return True
 
     def get_all_count(self):
-        p = subprocess.Popen(
-            "cd {}build && ctest -N".format(PADDLE_ROOT),
-            shell=True,
-            stdout=subprocess.PIPE)
+        p = subprocess.Popen("cd {}build && ctest -N".format(PADDLE_ROOT),
+                             shell=True,
+                             stdout=subprocess.PIPE)
         out, err = p.communicate()
         for line in out.splitlines():
             if 'Total Tests:' in str(line):
@@ -354,8 +355,8 @@ def get_pr_ut(self):
                         else:
                             print("remove file not hit mapFiles: %s" % f_judge)
                     else:
-                        notHitMapFiles.append(f_judge) if file_dict[
-                            f] != 'removed' else print(
+                        notHitMapFiles.append(
+                            f_judge) if file_dict[f] != 'removed' else print(
                                 "remove file not hit mapFiles: %s" % f_judge)
                 else:
                     if file_dict[f] not in ['removed']:
diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py
index 9232924ddb07d..cf670f87750fa 100644
--- a/tools/get_single_test_cov.py
+++ b/tools/get_single_test_cov.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -75,7 +75,7 @@ def analysisFNDAFile(rootPath, test):
                           (clazz_filename, notrelated_ut_map_file))
         else:
             if clazz_filename != '':
-                if clazz_filename not in related_file_list:  # xx.pb.cc in RELATED xx.pb.h not in RELATED 
+                if clazz_filename not in related_file_list:  # xx.pb.cc in RELATED xx.pb.h not in RELATED
                     os.system('echo %s >> %s' %
                               (clazz_filename, notrelated_ut_map_file))
     f.close()
diff --git a/tools/get_ut_file_map.py b/tools/get_ut_file_map.py
index eaa1f3c5405ce..7011cc193aabc 100644
--- a/tools/get_ut_file_map.py
+++ b/tools/get_ut_file_map.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -78,8 +78,8 @@ def handle_ut_file_map(rootPath):
                     source_file = line.replace('/build', '')
                     #source_file = re.sub('.pb.*', '.proto', source_file)
                 elif 'precise test map fileeee:' in line:
-                    source_file = line.split('precise test map fileeee:')[
-                        1].strip()
+                    source_file = line.split(
+                        'precise test map fileeee:')[1].strip()
                 else:
                     source_file = line
                 if source_file not in ut_file_map:
diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py
index 745d7f9a90c24..a8fd94c6f762b 100644
--- a/tools/get_ut_mem_map.py
+++ b/tools/get_ut_mem_map.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,6 +14,7 @@
 
 import os
 import json
+import sys
 
 
 def get_ut_mem(rootPath):
@@ -24,7 +25,7 @@ def get_ut_mem(rootPath):
                 continue
             ut = f.replace('^', '').replace('$.log', '')
             case_dic[ut] = {}
-            filename = '%s%s' % (parent, f)
+            filename = '%s/%s' % (parent, f)
             fi = open(filename)
             lines = fi.readlines()
             mem_reserved1 = -1
@@ -41,14 +42,14 @@ def get_ut_mem(rootPath):
                 if 'MAX_GPU_MEMORY_USE=' in line:
                     mem_nvidia = round(
                         float(
-                            line.split('MAX_GPU_MEMORY_USE=')[1].split('\\n')[0]
-                            .strip()), 2)
+                            line.split('MAX_GPU_MEMORY_USE=')[1].split('\\n')
+                            [0].strip()), 2)
                     if mem_nvidia > mem_nvidia1:
                         mem_nvidia1 = mem_nvidia
                 if 'Total Test time (real)' in line:
                     caseTime = float(
-                        line.split('Total Test time (real) =')[1].split('sec')[
-                            0].strip())
+                        line.split('Total Test time (real) =')[1].split('sec')
+                        [0].strip())
             if mem_reserved1 != -1:
                 case_dic[ut]['mem_reserved'] = mem_reserved1
             if mem_nvidia1 != -1:
@@ -56,7 +57,7 @@ def get_ut_mem(rootPath):
             if caseTime != -1:
                 case_dic[ut]['time'] = caseTime
 
-    ut_mem_map_file = "/pre_test/ut_mem_map.json" % rootPath
+    ut_mem_map_file = "/pre_test/ut_mem_map.json"
     with open(ut_mem_map_file, "w") as f:
         json.dump(case_dic, f)
 
diff --git a/tools/group_case_for_parallel.py b/tools/group_case_for_parallel.py
new file mode 100644
index 0000000000000..e4aea8f39f565
--- /dev/null
+++ b/tools/group_case_for_parallel.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+
+def group_case_for_parallel(rootPath):
+    """Restrict the downloaded parallel batches to the cases to run.
+
+    Downloads the batch files into ``<rootPath>/tools``, keeps in every
+    batch only the cases listed in ``<rootPath>/build/ut_list`` (or in
+    ``all_ut_list`` when that file is missing or empty) and writes the
+    surviving batches to ``<batch file>_new``. Cases that belong to no
+    batch and are not nightly tests go to ``tools/no_parallel_case_file``.
+
+    Case names inside each written pattern are sorted so that repeated
+    runs on the same input produce identical files.
+
+    Args:
+        rootPath: Paddle repository root.
+    """
+
+    def read_lines(path):
+        """Return the stripped content of ``path`` split on newlines."""
+        with open(path, 'r') as handle:  # never leaks the handle
+            return handle.read().strip().split('\n')
+
+    #wget file
+    # Each download runs in a shell; its exit status is not checked here.
+    for filename in [
+            'nightly_tests', 'single_card_tests', 'single_card_tests_mem0',
+            'multiple_card_tests', 'multiple_card_tests_mem0',
+            'exclusive_card_tests', 'exclusive_card_tests_mem0'
+    ]:
+        os.system(
+            'cd %s/tools && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/%s --no-check-certificate'
+            % (rootPath, filename))
+
+    #get nightly tests
+    nightly_tests = set(read_lines('%s/tools/nightly_tests' % rootPath))
+
+    parallel_case_file_list = [
+        '%s/tools/single_card_tests_mem0' % rootPath,
+        '%s/tools/single_card_tests' % rootPath,
+        '%s/tools/multiple_card_tests_mem0' % rootPath,
+        '%s/tools/multiple_card_tests' % rootPath,
+        '%s/tools/exclusive_card_tests_mem0' % rootPath,
+        '%s/tools/exclusive_card_tests' % rootPath
+    ]
+
+    # Prefer ut_list; fall back to all_ut_list when it is absent or empty.
+    case_file = '%s/build/ut_list' % rootPath
+    pending = read_lines(case_file) if os.path.exists(case_file) else ['']
+    if pending == ['']:
+        case_file = '%s/build/all_ut_list' % rootPath
+        pending = read_lines(case_file)
+    print("case_file: %s" % case_file)
+
+    # Cases still waiting for a batch; a case is put in at most one batch.
+    pending = set(pending)
+    for filename in parallel_case_file_list:
+        with open(filename, 'r') as fi:
+            lines = fi.readlines()
+        with open('%s_new' % filename, 'w') as new_f:
+            for line in lines:
+                # "^job$|^a$|^b$" -> ['job', 'a', 'b', ...]
+                batch = line.replace('^', '').replace('|', '').split('$')
+                # sorted() keeps the output independent of set ordering.
+                selected = sorted(pending.intersection(batch))
+                if selected:
+                    pending.difference_update(selected)
+                    new_f.write('^job$|^%s$\n' % '$|^'.join(selected))
+
+    #no parallel cases
+    # The placeholder is always written fully anchored as '^job$', also
+    # when no case is left over (a bare '^job' would lack the '$' anchor).
+    cases = '^job$'
+    for case in sorted(pending):
+        if case not in nightly_tests:
+            cases = cases + '|^%s$' % case
+
+    with open('%s/tools/no_parallel_case_file' % rootPath, 'w') as new_f:
+        new_f.write(cases + '\n')
+
+
+if __name__ == "__main__":
+    rootPath = sys.argv[1]  # first CLI argument, forwarded as rootPath
+    group_case_for_parallel(rootPath)
diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py
index ea01a1d8d4151..389b460a791b9 100644
--- a/tools/handle_h_cu_file.py
+++ b/tools/handle_h_cu_file.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -32,7 +32,9 @@ def worker(fun):
 def threadPool(threadPoolNum):
     threadPool = []
     for i in range(threadPoolNum):
-        thread = threading.Thread(target=worker, args={doFun, })
+        thread = threading.Thread(target=worker, args={
+            doFun,
+        })
         thread.daemon = True
         threadPool.append(thread)
     return threadPool
diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
index a4f93a5d6c320..f2c04f3cba8f8 100644
--- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
+++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
@@ -304,40 +304,42 @@ def convert_op_proto_into_mlir(op_descs):
 
             # 2.3.2 attributes
             for attr in op_proto[ATTRS]:
-                if (op_proto[ATTRS][attr][EXTRA] == True) or (
-                        attr in skipped_attr_list):
+                if (op_proto[ATTRS][attr][EXTRA]
+                        == True) or (attr in skipped_attr_list):
                     continue
                 if op_proto[ATTRS][attr][DEFAULT_VALUE] != None:
                     if op_proto[ATTRS][attr][TYPE] in attr_mlir_converter:
-                        default_value = str(op_proto[ATTRS][attr][
-                            DEFAULT_VALUE])
-                        if (attr_mlir_converter[op_proto[ATTRS][attr][TYPE]] in
-                            [
-                                'I32ArrayAttr', 'F32ArrayAttr', 'StrArrayAttr',
-                                'BoolArrayAttr', 'I64ArrayAttr'
-                            ]):
-                            default_value = default_value.replace(
-                                '[', '{').replace(']', '}')
-                        if (attr_mlir_converter[op_proto[ATTRS][attr][TYPE]] in
-                            ['BoolAttr', 'BoolArrayAttr']):
+                        default_value = str(
+                            op_proto[ATTRS][attr][DEFAULT_VALUE])
+                        if (attr_mlir_converter[op_proto[ATTRS][attr][TYPE]]
+                                in [
+                                    'I32ArrayAttr', 'F32ArrayAttr',
+                                    'StrArrayAttr', 'BoolArrayAttr',
+                                    'I64ArrayAttr'
+                                ]):
+                            default_value = default_value.replace('[',
+                                                                  '{').replace(
+                                                                      ']', '}')
+                        if (attr_mlir_converter[op_proto[ATTRS][attr][TYPE]]
+                                in ['BoolAttr', 'BoolArrayAttr']):
                             default_value = default_value.lower()
                         elif (attr_mlir_converter[op_proto[ATTRS][attr][TYPE]]
                               in ['StrAttr', 'StrArrayAttr']):
                             default_value = default_value.replace('\'', '\\\"')
-                            if attr_mlir_converter[op_proto[ATTRS][attr][
-                                    TYPE]] == "StrAttr":
+                            if attr_mlir_converter[op_proto[ATTRS][attr]
+                                                   [TYPE]] == "StrAttr":
                                 default_value = '\\\"' + default_value + '\\\"'
                         attr_list = " DefaultValuedAttr<" + attr_mlir_converter[
                             op_proto[ATTRS][attr]
                             [TYPE]] + ", \"" + default_value + "\">:$" + attr + ","
                         ARGUMENTS += attr_list
                     else:
-                        print("Error:" + op_type + ":" + attr + ":" + str(
-                            op_proto[ATTRS][attr][TYPE]))
+                        print("Error:" + op_type + ":" + attr + ":" +
+                              str(op_proto[ATTRS][attr][TYPE]))
                 else:
                     if op_proto[ATTRS][attr][TYPE] in attr_mlir_converter:
-                        attr_type_ = attr_mlir_converter[op_proto[ATTRS][attr][
-                            TYPE]]
+                        attr_type_ = attr_mlir_converter[op_proto[ATTRS][attr]
+                                                         [TYPE]]
                         if (attr_type_ in [
                                 'StrAttr', 'I32ArrayAttr', 'F32ArrayAttr',
                                 'StrArrayAttr', 'BoolArrayAttr', 'I64ArrayAttr'
@@ -345,8 +347,8 @@ def convert_op_proto_into_mlir(op_descs):
                             attr_list = attr_type_ + ":$" + attr + ","
                             ARGUMENTS += attr_list
                     else:
-                        print(" ouch Error:" + op_type + ":" + attr + ":" + str(
-                            op_proto[ATTRS][attr][TYPE]))
+                        print(" ouch Error:" + op_type + ":" + attr + ":" +
+                              str(op_proto[ATTRS][attr][TYPE]))
             ARGUMENTS = ARGUMENTS[:-1] + ");\n"
 
         # 2.4 results info
@@ -375,8 +377,8 @@ def convert_op_proto_into_mlir(op_descs):
         ops_mlir_file.write("\n#endif  // PD_OPS")
 
     print("Skipped ops num: " + str(len(skipped_op_list)))
-    print("Automatically generated op dialects num: " + str(
-        len(automatically_generated_op_dialect)))
+    print("Automatically generated op dialects num: " +
+          str(len(automatically_generated_op_dialect)))
 
 
 if __name__ == "__main__":
diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py
index b83bfe911aa48..826c9b03b42d0 100644
--- a/tools/infrt/generate_phi_kernel_dialect.py
+++ b/tools/infrt/generate_phi_kernel_dialect.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -83,11 +83,15 @@ def generate_kernel_name(op_name, place_str):
     precision_ = precision_type_converter[precision_.strip()]
     class_name_ = "{}{}".format(
         op_name.replace("_", "").title(), "".join([
-            target_.strip().title(), precision_.strip(), layout_.strip().title()
-            .title()
+            target_.strip().title(),
+            precision_.strip(),
+            layout_.strip().title().title()
         ]))
-    alias_ = "{}.{}".format(op_name, ".".join(
-        [target_.strip(), precision_.strip(), layout_.strip()]))
+    alias_ = "{}.{}".format(
+        op_name,
+        ".".join([target_.strip(),
+                  precision_.strip(),
+                  layout_.strip()]))
     return alias_, class_name_
 
 
@@ -101,8 +105,8 @@ def generate_attrs_info(op_name, attrs_info):
         for index in range(len(attrs_info)):
             attr_name = kernel_attrs_names[op_name]["attrs"][index]
             attr_type = attr_type_converter[attrs_info[index]]
-            attrs_args_ += '{type_}:${name_},'.format(
-                type_=attr_type, name_=attr_name)
+            attrs_args_ += '{type_}:${name_},'.format(type_=attr_type,
+                                                      name_=attr_name)
     return attrs_args_[:-1]
 
 
@@ -124,8 +128,8 @@ def generate_arguments_info(op_name, input_info, attr_info):
     input_args = generate_inputs_info(input_info)
     attr_args = generate_attrs_info(op_name, attr_info)
     context_args = "Context:$dev_ctx"
-    argument_list = [context_args] + input_args.split(",") + attr_args.split(
-        ",")
+    argument_list = [context_args
+                     ] + input_args.split(",") + attr_args.split(",")
     while ("" in argument_list):
         argument_list.remove("")
     argument_ = ",".join(argument_list)
@@ -295,8 +299,8 @@ def main():
                             op_name, kernel_alias_, kernel_info[kernel_alias_])
                         gpu_registry_ += kernel_registry
                     else:
-                        print("Unsupported backend:" + get_kernel_target(
-                            kernel_alias_))
+                        print("Unsupported backend:" +
+                              get_kernel_target(kernel_alias_))
         end = "#endif  // PTEN_KERNELS"
         with open("../../paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td",
                   "w") as dst:
diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py
index a66a236b0f975..9e112cafc8514 100644
--- a/tools/infrt/get_compat_kernel_signature.py
+++ b/tools/infrt/get_compat_kernel_signature.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -76,14 +76,14 @@ def get_compat_kernels_info():
                     if name in kernels_info:
                         cur_reg = kernels_info[name]
                         kernels_info[name]["inputs"] = list(
-                            set(registry_info["inputs"] + kernels_info[name][
-                                "inputs"]))
+                            set(registry_info["inputs"] +
+                                kernels_info[name]["inputs"]))
                         kernels_info[name]["attrs"] = list(
-                            set(registry_info["attrs"] + kernels_info[name][
-                                "attrs"]))
+                            set(registry_info["attrs"] +
+                                kernels_info[name]["attrs"]))
                         kernels_info[name]["outputs"] = list(
-                            set(registry_info["outputs"] + kernels_info[name][
-                                "outputs"]))
+                            set(registry_info["outputs"] +
+                                kernels_info[name]["outputs"]))
                     else:
                         kernels_info[name] = registry_info
 
diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py
index c4c02d67cf70b..b582932809457 100644
--- a/tools/infrt/get_phi_kernel_info.py
+++ b/tools/infrt/get_phi_kernel_info.py
@@ -1,13 +1,13 @@
 #!/bin/python
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -39,28 +39,29 @@ def get_skipped_kernel_list():
 
 def parse_args():
     parser = argparse.ArgumentParser("gather phi kernel and infermate info")
-    parser.add_argument(
-        "--paddle_root_path",
-        type=str,
-        required=True,
-        help="root path of paddle src[WORK_PATH/Paddle].")
+    parser.add_argument("--paddle_root_path",
+                        type=str,
+                        required=True,
+                        help="root path of paddle src[WORK_PATH/Paddle].")
     parser.add_argument(
         "--kernel_info_file",
         type=str,
         required=True,
         help="kernel info file generated by get_phi_kernel_function.sh.")
-    parser.add_argument(
-        "--infermeta_wrap_file",
-        type=str,
-        required=True,
-        help="inferMeta wrap info file.")
-    parser.add_argument(
-        "--attr_info_file", type=str, required=True, help="attr info file.")
+    parser.add_argument("--infermeta_wrap_file",
+                        type=str,
+                        required=True,
+                        help="inferMeta wrap info file.")
+    parser.add_argument("--attr_info_file",
+                        type=str,
+                        required=True,
+                        help="attr info file.")
     parser.add_argument(
         "--generate_file",
         type=str,
         required=True,
-        default="../paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc",
+        default=
+        "../paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc",
         help="generated file.")
     args = parser.parse_args()
     return args
@@ -311,8 +312,8 @@ def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]):
     return res
 
 
-def gen_register_info(resources: List[List[str]],
-                      attr_data: Dict[str, List[str]]):
+def gen_register_info(resources: List[List[str]], attr_data: Dict[str,
+                                                                  List[str]]):
     """
     resources: [['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'], ...]
     attr_data: {'phi_cpu.arg_min.float32.any': ['axisBool', 'keepdimsBool', 'flatten', 'dtype']}
diff --git a/tools/infrt/print_kernel_pass_info.py b/tools/infrt/print_kernel_pass_info.py
index c2f3e36a675b1..ef9b0b59f37ce 100644
--- a/tools/infrt/print_kernel_pass_info.py
+++ b/tools/infrt/print_kernel_pass_info.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -45,8 +45,8 @@ def get_compat_kernels_info(register):
                 registry = False
                 is_macro_defination = False
                 for line in txt:
-                    if line.strip().startswith("#define") and line.strip(
-                    ).endswith("\\"):
+                    if line.strip().startswith(
+                            "#define") and line.strip().endswith("\\"):
                         is_macro_defination = True
                         continue
                     if is_macro_defination:
diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json
index 2502e248c5c48..75533311513e5 100644
--- a/tools/infrt/skipped_phi_api.json
+++ b/tools/infrt/skipped_phi_api.json
@@ -1,4 +1,4 @@
 {
-"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm"],
+"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm", "instance_norm"],
 "phi_kernels":["equal_all"]
 }
diff --git a/tools/jetson_infer_op.py b/tools/jetson_infer_op.py
index d4aa3cb1404af..d046483efdaca 100644
--- a/tools/jetson_infer_op.py
+++ b/tools/jetson_infer_op.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -51,16 +51,14 @@ def parse_arguments():
     :return:
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--shell_name',
-        type=str,
-        default='get_op_list.sh',
-        help='please input right name')
-    parser.add_argument(
-        '--op_list_file',
-        type=str,
-        default='list_op.txt',
-        help='please input right name')
+    parser.add_argument('--shell_name',
+                        type=str,
+                        default='get_op_list.sh',
+                        help='please input right name')
+    parser.add_argument('--op_list_file',
+                        type=str,
+                        default='list_op.txt',
+                        help='please input right name')
     return parser.parse_args()
 
 
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 7c43ef1a6d2e3..559f2d95b915f 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -844,10 +844,11 @@
     'test_sigmoid_cross_entropy_with_logits_op', 'copy_cross_scope_test',
     'test_normalization_wrapper', 'test_flip', 'test_cosine_similarity_api',
     'test_cumsum_op', 'test_range', 'test_log_loss_op', 'test_where_index',
-    'test_tril_triu_op', 'test_lod_reset_op', 'test_lod_tensor', 'test_addmm_op',
-    'test_index_select_op', 'test_nvprof', 'test_index_sample_op',
-    'test_unstack_op', 'test_increment', 'strided_memcpy_test',
-    'test_target_assign_op', 'test_trt_dynamic_shape_transformer_prune',
+    'test_tril_triu_op', 'test_lod_reset_op', 'test_lod_tensor',
+    'test_addmm_op', 'test_index_select_op', 'test_nvprof',
+    'test_index_sample_op', 'test_unstack_op', 'test_increment',
+    'strided_memcpy_test', 'test_target_assign_op',
+    'test_trt_dynamic_shape_transformer_prune',
     'test_box_decoder_and_assign_op', 'test_trt_dynamic_shape', 'test_mnist',
     'test_convert_operators', 'test_fill_any_like_op', 'test_fill_constant_op',
     'test_callback_reduce_lr_on_plateau', 'test_tile_op', 'test_logical',
@@ -985,15 +986,16 @@
     'test_pool_max_op', 'test_log_softmax',
     'test_imperative_container_parameterlist', 'test_multiplex_op',
     'test_trt_transpose_flatten_concat_fuse_pass',
-    'test_seqconv_eltadd_relu_fuse_pass', 'test_assert_op', 'test_scatter_nd_op',
-    'test_sequence_expand', 'test_arange', 'test_translated_layer',
-    'test_decoupled_py_reader_data_check', 'test_analyzer_ernie_large',
-    'test_tensor_array_to_tensor', 'test_functional_conv2d_transpose',
-    'test_error', 'test_callbacks', 'test_imperative_recurrent_usage',
-    'test_deform_conv2d', 'test_coalesce_tensor_op', 'test_tsm',
-    'test_fused_multihead_matmul_op', 'test_softmax_mask_fuse_op',
-    'test_optimizer_grad', 'test_complex_abs', 'test_gradient_accmulator',
-    'test_instance_norm_op_v2', 'test_random_crop_op', 'test_mobile_net',
+    'test_seqconv_eltadd_relu_fuse_pass', 'test_assert_op',
+    'test_scatter_nd_op', 'test_sequence_expand', 'test_arange',
+    'test_translated_layer', 'test_decoupled_py_reader_data_check',
+    'test_analyzer_ernie_large', 'test_tensor_array_to_tensor',
+    'test_functional_conv2d_transpose', 'test_error', 'test_callbacks',
+    'test_imperative_recurrent_usage', 'test_deform_conv2d',
+    'test_coalesce_tensor_op', 'test_tsm', 'test_fused_multihead_matmul_op',
+    'test_softmax_mask_fuse_op', 'test_optimizer_grad', 'test_complex_abs',
+    'test_gradient_accmulator', 'test_instance_norm_op_v2',
+    'test_random_crop_op', 'test_mobile_net',
     'test_parallel_executor_transformer',
     'test_tensor_scalar_type_promotion_dynamic',
     'test_eager_deletion_delete_vars', 'test_asp_pruning_1d',
@@ -1021,11 +1023,12 @@
     'test_embedding_id_stop_gradient', 'test_mkldnn_fc_act_fuse_pass',
     'sequence_pooling_test', 'test_get_tensor_from_selected_rows_op',
     'test_imperative_ptb_rnn_sorted_gradient', 'test_hapi_hub',
-    'test_reverse_op', 'test_compiled_program', 'test_lambda', 'test_adadelta_op',
-    'test_nn_sigmoid_op', 'test_nearest_interp_v2_op', 'test_sequence_slice_op',
-    'test_program_translator', 'test_eager_deletion_lstm_net', 'malloc_test',
-    'test_size_op', 'test_analysis_predictor', 'test_recognize_digits',
-    'test_parameter', 'test_transpose_flatten_concat_fuse_pass',
+    'test_reverse_op', 'test_compiled_program', 'test_lambda',
+    'test_adadelta_op', 'test_nn_sigmoid_op', 'test_nearest_interp_v2_op',
+    'test_sequence_slice_op', 'test_program_translator',
+    'test_eager_deletion_lstm_net', 'malloc_test', 'test_size_op',
+    'test_analysis_predictor', 'test_recognize_digits', 'test_parameter',
+    'test_transpose_flatten_concat_fuse_pass',
     'test_imperative_trace_non_persistable_inputs', 'test_pass_builder',
     'thread_local_allocator_test', 'test_variable', 'test_fsp_op',
     'test_elementwise_gradient_op', 'test_multinomial_op',
@@ -1183,7 +1186,7 @@
 ]
 
 # *=======These unittest doesn't occupy GPU memory, just run as CPU unittest=======* #
-# It run 16 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, 
+# It run 16 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
 CPU_PARALLEL_JOB = [
     'test_static_save_load_large',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index b9be7f836a44b..44083d660c6e1 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -49,10 +49,9 @@ def md5(doc):
         md5sum = hashinst.hexdigest()
     except UnicodeDecodeError as e:
         md5sum = None
-        print(
-            "Error({}) occurred when `md5({})`, discard it.".format(
-                str(e), doc),
-            file=sys.stderr)
+        print("Error({}) occurred when `md5({})`, discard it.".format(
+            str(e), doc),
+              file=sys.stderr)
 
     return md5sum
 
@@ -110,8 +109,8 @@ def visit_all_module(mod):
                 if hasattr(instance,
                            '__name__') and member_name != instance.__name__:
                     print(
-                        "Found alias API, alias name is: {}, original name is: {}".
-                        format(member_name, instance.__name__),
+                        "Found alias API, alias name is: {}, original name is: {}"
+                        .format(member_name, instance.__name__),
                         file=sys.stderr)
         except:
             if not cur_name in ErrorSet and not cur_name in skiplist:
@@ -168,8 +167,8 @@ def insert_api_into_dict(full_name, gen_doc_anno=None):
         logger.warning("AttributeError occurred when `id(eval(%s))`", full_name)
         return None
     except Exception as e:
-        logger.warning("Exception(%s) occurred when `id(eval(%s))`",
-                       str(e), full_name)
+        logger.warning("Exception(%s) occurred when `id(eval(%s))`", str(e),
+                       full_name)
         return None
     else:
         logger.debug("adding %s to api_info_dict.", full_name)
@@ -190,8 +189,8 @@ def insert_api_into_dict(full_name, gen_doc_anno=None):
                 api_info_dict[fc_id]["gen_doc_anno"] = gen_doc_anno
             if inspect.isfunction(obj):
                 api_info_dict[fc_id]["signature"] = repr(
-                    inspect.getfullargspec(obj)).replace('FullArgSpec',
-                                                         'ArgSpec', 1)
+                    inspect.getfullargspec(obj)).replace(
+                        'FullArgSpec', 'ArgSpec', 1)
         return api_info_dict[fc_id]
 
 
@@ -212,8 +211,8 @@ def process_module(m, attr="__all__"):
                 api_counter += 1
                 if inspect.isclass(api_info['object']):
                     for name, value in inspect.getmembers(api_info['object']):
-                        if (not name.startswith("_")) and hasattr(value,
-                                                                  '__name__'):
+                        if (not name.startswith("_")) and hasattr(
+                                value, '__name__'):
                             method_full_name = full_name + '.' + name  # value.__name__
                             method_api_info = insert_api_into_dict(
                                 method_full_name, 'class_method')
@@ -225,44 +224,17 @@ def process_module(m, attr="__all__"):
 def check_public_api():
     import paddle
     modulelist = [  #npqa
-        paddle,
-        paddle.amp,
-        paddle.nn,
-        paddle.nn.functional,
-        paddle.nn.initializer,
-        paddle.nn.utils,
-        paddle.static,
-        paddle.static.nn,
-        paddle.io,
-        paddle.jit,
-        paddle.metric,
-        paddle.distribution,
-        paddle.optimizer,
-        paddle.optimizer.lr,
-        paddle.regularizer,
-        paddle.text,
-        paddle.utils,
-        paddle.utils.download,
-        paddle.utils.profiler,
-        paddle.utils.cpp_extension,
-        paddle.sysconfig,
-        paddle.vision,
-        paddle.vision.datasets,
-        paddle.vision.models,
-        paddle.vision.transforms,
-        paddle.vision.ops,
-        paddle.distributed,
-        paddle.distributed.fleet,
-        paddle.distributed.fleet.utils,
-        paddle.distributed.parallel,
-        paddle.distributed.utils,
-        paddle.callbacks,
-        paddle.hub,
-        paddle.autograd,
-        paddle.incubate,
-        paddle.inference,
-        paddle.onnx,
-        paddle.device
+        paddle, paddle.amp, paddle.nn, paddle.nn.functional,
+        paddle.nn.initializer, paddle.nn.utils, paddle.static, paddle.static.nn,
+        paddle.io, paddle.jit, paddle.metric, paddle.distribution,
+        paddle.optimizer, paddle.optimizer.lr, paddle.regularizer, paddle.text,
+        paddle.utils, paddle.utils.download, paddle.utils.profiler,
+        paddle.utils.cpp_extension, paddle.sysconfig, paddle.vision,
+        paddle.vision.datasets, paddle.vision.models, paddle.vision.transforms,
+        paddle.vision.ops, paddle.distributed, paddle.distributed.fleet,
+        paddle.distributed.fleet.utils, paddle.distributed.parallel,
+        paddle.distributed.utils, paddle.callbacks, paddle.hub, paddle.autograd,
+        paddle.incubate, paddle.inference, paddle.onnx, paddle.device
     ]
 
     apinum = 0
@@ -294,8 +266,8 @@ def check_public_api():
             cur_name = module + '.' + member_name
             instance = eval(cur_name)
             doc_md5 = md5(instance.__doc__)
-            member_dict[cur_name] = "({}, ('document', '{}'))".format(cur_name,
-                                                                      doc_md5)
+            member_dict[cur_name] = "({}, ('document', '{}'))".format(
+                cur_name, doc_md5)
 
 
 def check_allmodule_callable():
@@ -313,14 +285,13 @@ def parse_args():
     """
     parser = argparse.ArgumentParser(description='Print Apis Signatures')
     parser.add_argument('--debug', dest='debug', action="store_true")
-    parser.add_argument(
-        '--method',
-        dest='method',
-        type=str,
-        default='get_all_api',
-        help="using get_all_api or from_modulelist")
-    parser.add_argument(
-        'module', type=str, help='module', default='paddle')  # not used
+    parser.add_argument('--method',
+                        dest='method',
+                        type=str,
+                        default='get_all_api',
+                        help="using get_all_api or from_modulelist")
+    parser.add_argument('module', type=str, help='module',
+                        default='paddle')  # not used
 
     if len(sys.argv) == 1:
         args = parser.parse_args(['paddle'])
@@ -351,15 +322,13 @@ def parse_args():
         for api_name in all_api_names_sorted:
             api_info = api_info_dict[all_api_names_to_k[api_name]]
             print("{0} ({2}, ('document', '{1}'))".format(
-                api_name,
-                md5(api_info['docstring']), api_info['signature']
+                api_name, md5(api_info['docstring']), api_info['signature']
                 if 'signature' in api_info else 'ArgSpec()'))
 
     if len(ErrorSet) == 0:
         sys.exit(0)
     else:
         for erroritem in ErrorSet:
-            print(
-                "Error, new function {} is unreachable".format(erroritem),
-                file=sys.stderr)
+            print("Error, new function {} is unreachable".format(erroritem),
+                  file=sys.stderr)
         sys.exit(1)
diff --git a/tools/pyCov_multithreading.py b/tools/pyCov_multithreading.py
index 20181fb6f93cb..cb2366075afb1 100644
--- a/tools/pyCov_multithreading.py
+++ b/tools/pyCov_multithreading.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,7 +35,9 @@ def worker(fun):
 def threadPool(threadPoolNum):
     threadPool = []
     for i in range(threadPoolNum):
-        thread = threading.Thread(target=worker, args={doFun, })
+        thread = threading.Thread(target=worker, args={
+            doFun,
+        })
         thread.daemon = True
         threadPool.append(thread)
     return threadPool
diff --git a/tools/remove_grad_op_and_kernel.py b/tools/remove_grad_op_and_kernel.py
index e8ab321e96105..bbf5616fc439d 100644
--- a/tools/remove_grad_op_and_kernel.py
+++ b/tools/remove_grad_op_and_kernel.py
@@ -55,8 +55,9 @@ def update_operator_cmake(cmake_file):
         content = content.replace(pat1, code1)
 
         match = re.findall(pat2, content, flags=re.DOTALL)
-        content = content.replace(match[0], code2 + '\n' + match[0].replace(
-            'py_func_op', 'py_func_op ${LOSS_OPS}'))
+        content = content.replace(
+            match[0], code2 + '\n' +
+            match[0].replace('py_func_op', 'py_func_op ${LOSS_OPS}'))
 
     with open(cmake_file, 'w') as f:
         f.write(content)
@@ -67,12 +68,12 @@ def update_operator_cmake(cmake_file):
     tool_dir = os.path.dirname(os.path.abspath(__file__))
 
     if sys.version_info[0] == 3:
-        all_op = glob.glob(
-            os.path.join(tool_dir, '../paddle/fluid/operators/**/*.cc'),
-            recursive=True)
-        all_op += glob.glob(
-            os.path.join(tool_dir, '../paddle/fluid/operators/**/*.cu'),
-            recursive=True)
+        all_op = glob.glob(os.path.join(tool_dir,
+                                        '../paddle/fluid/operators/**/*.cc'),
+                           recursive=True)
+        all_op += glob.glob(os.path.join(tool_dir,
+                                         '../paddle/fluid/operators/**/*.cu'),
+                            recursive=True)
     elif sys.version_info[0] == 2:
         all_op = find_type_files(
             os.path.join(tool_dir, '../paddle/fluid/operators/'), '.cc')
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 1bd9f029d552c..6a9b4729e40b2 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -149,10 +149,14 @@ def _cb_started():
     def _append_code_block():
         # nonlocal code_blocks, cb_cur, cb_cur_name, cb_cur_seq_id, cb_required
         code_blocks.append({
-            'codes': inspect.cleandoc("\n".join(cb_info['cb_cur'])),
-            'name': cb_info['cb_cur_name'],
-            'id': cb_info['cb_cur_seq_id'],
-            'required': cb_info['cb_required'],
+            'codes':
+            inspect.cleandoc("\n".join(cb_info['cb_cur'])),
+            'name':
+            cb_info['cb_cur_name'],
+            'id':
+            cb_info['cb_cur_seq_id'],
+            'required':
+            cb_info['cb_required'],
         })
 
     for lineno, linecont in enumerate(ds_list):
@@ -353,9 +357,10 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""):
         # False - it need other special equipment or environment.
         # so, the following conditional statements are intentionally arranged.
         if matched == True:
-            tfname = os.path.join(SAMPLECODE_TEMPDIR, '{}_example{}'.format(
-                name, '.py'
-                if len(codeblocks) == 1 else '_{}.py'.format(y + 1)))
+            tfname = os.path.join(
+                SAMPLECODE_TEMPDIR, '{}_example{}'.format(
+                    name,
+                    '.py' if len(codeblocks) == 1 else '_{}.py'.format(y + 1)))
             with open(tfname, 'w') as tempf:
                 sampcd = insert_codes_into_codeblock(cb, name)
                 tempf.write(sampcd)
@@ -366,9 +371,9 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""):
             SUMMARY_INFO['skiptest'].append("{}-{}".format(name, cb['id']))
         elif matched == False:
             logger.info(
-                '{}\' code block (name:{}, id:{}) required({}) not match capacity({}).'.
-                format(name, cb['name'], cb['id'], cb['required'],
-                       SAMPLE_CODE_TEST_CAPACITY))
+                '{}\' code block (name:{}, id:{}) required({}) not match capacity({}).'
+                .format(name, cb['name'], cb['id'], cb['required'],
+                        SAMPLE_CODE_TEST_CAPACITY))
             if cb['required'] not in SUMMARY_INFO:
                 SUMMARY_INFO[cb['required']] = []
             SUMMARY_INFO[cb['required']].append("{}-{}".format(name, cb['id']))
@@ -401,8 +406,9 @@ def execute_samplecode(tfname):
     logger.info("----example code check----")
     logger.info("executing sample code: %s", tfname)
     start_time = time.time()
-    subprc = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    subprc = subprocess.Popen(cmd,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
     output, error = subprc.communicate()
     msg = "".join(output.decode(encoding='utf-8'))
     err = "".join(error.decode(encoding='utf-8'))
@@ -410,7 +416,8 @@ def execute_samplecode(tfname):
 
     if subprc.returncode != 0:
         with open(tfname, 'r') as f:
-            logger.warning("""Sample code error found in %s:
+            logger.warning(
+                """Sample code error found in %s:
 -----------------------
 %s
 -----------------------
@@ -462,8 +469,8 @@ def get_filenames(full_test=False):
                 # paddle.Tensor.<lambda>
                 continue
             if hasattr(api_obj, '__doc__') and api_obj.__doc__:
-                sample_code_filenames = sampcd_extract_to_file(api_obj.__doc__,
-                                                               api)
+                sample_code_filenames = sampcd_extract_to_file(
+                    api_obj.__doc__, api)
                 for tfname in sample_code_filenames:
                     all_sample_code_filenames[tfname] = api
     return all_sample_code_filenames
@@ -557,8 +564,9 @@ def exec_gen_doc():
     cmd = ["bash", "document_preview.sh"]
     logger.info("----exec gen_doc----")
     start_time = time.time()
-    subprc = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    subprc = subprocess.Popen(cmd,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
     output, error = subprc.communicate()
     msg = "".join(output.decode(encoding='utf-8'))
     err = "".join(error.decode(encoding='utf-8'))
@@ -608,14 +616,16 @@ def parse_args():
     parser.add_argument('--debug', dest='debug', action="store_true")
     parser.add_argument('--full-test', dest='full_test', action="store_true")
     parser.add_argument('mode', type=str, help='run on device', default='cpu')
-    parser.add_argument(
-        '--build-doc',
-        dest='build_doc',
-        action='store_true',
-        help='build doc if need.')
+    parser.add_argument('--build-doc',
+                        dest='build_doc',
+                        action='store_true',
+                        help='build doc if need.')
     for item in arguments:
-        parser.add_argument(
-            item[0], dest=item[1], help=item[4], type=item[2], default=item[3])
+        parser.add_argument(item[0],
+                            dest=item[1],
+                            help=item[4],
+                            type=item[2],
+                            default=item[3])
 
     if len(sys.argv) == 1:
         args = parser.parse_args(['cpu'])
@@ -723,8 +733,8 @@ def parse_args():
                         len(SUMMARY_INFO['success']))
         for k, v in SUMMARY_INFO.items():
             if k not in ['success', 'failed', 'skiptest', 'nocodes']:
-                logger.info("%d sample codes required not match for %s",
-                            len(v), k)
+                logger.info("%d sample codes required not match for %s", len(v),
+                            k)
         if len(SUMMARY_INFO['skiptest']):
             logger.info("%d sample codes skipped",
                         len(SUMMARY_INFO['skiptest']))
diff --git a/tools/summary_env.py b/tools/summary_env.py
index d12e644cc28da..4e4100af4226c 100644
--- a/tools/summary_env.py
+++ b/tools/summary_env.py
@@ -64,9 +64,10 @@ def get_python_info():
 
 
 def run_shell_command(cmd):
-    out, err = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-        shell=True).communicate()
+    out, err = subprocess.Popen(cmd,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                shell=True).communicate()
     if err:
         return None
     else:
@@ -82,6 +83,7 @@ def get_cuda_info():
 
 
 def get_cudnn_info():
+
     def _get_cudnn_ver(cmd):
         out = run_shell_command(cmd)
         if out:
@@ -92,8 +94,8 @@ def _get_cudnn_ver(cmd):
     if platform.system() == "Windows":
         cudnn_dll_path = run_shell_command('where cudnn*')
         if cudnn_dll_path:
-            cudnn_header_path = cudnn_dll_path.split('bin')[
-                0] + r'include\cudnn.h'
+            cudnn_header_path = cudnn_dll_path.split(
+                'bin')[0] + r'include\cudnn.h'
             cmd = 'type "{0}" | findstr "{1}" | findstr /v "CUDNN_VERSION"'
         else:
             envs['cudnn_version'] = None
@@ -119,8 +121,8 @@ def _get_cudnn_ver(cmd):
 def get_driver_info():
     driver_ver = run_shell_command('nvidia-smi')
     if driver_ver:
-        driver_ver = driver_ver.split('Driver Version:')[1].strip().split(' ')[
-            0]
+        driver_ver = driver_ver.split('Driver Version:')[1].strip().split(
+            ' ')[0]
     else:
         driver_ver = None
     envs['nvidia_driver_version'] = driver_ver
diff --git a/tools/test_check_api_compatible.py b/tools/test_check_api_compatible.py
index 24e7b3a8f8acb..846fdefb7b6d6 100644
--- a/tools/test_check_api_compatible.py
+++ b/tools/test_check_api_compatible.py
@@ -1,13 +1,13 @@
 #! /usr/bin/env python
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -28,6 +28,7 @@
 
 
 class Test_check_compatible(unittest.TestCase):
+
     def setUp(self) -> None:
         self.fullargspec_prefix = 'inspect.Full'
         self.argspec_str_o = self.fullargspec_prefix + '''ArgSpec(args=['shape', 'dtype', 'name'], varargs=None, varkw=None, defaults=(None, None), kwonlyargs=[], kwonlydefaults=None, annotations={})'''
@@ -70,6 +71,7 @@ def test_args_reduced(self):
 
 
 class Test_check_compatible_str(unittest.TestCase):
+
     def setUp(self) -> None:
         self.fullargspec_prefix = 'inspect.Full'
         # paddle.fluid.layer_helper_base.LayerHelperBase.create_parameter
@@ -112,6 +114,7 @@ def test_args_defaults_None(self):
 
 
 class Test_read_argspec_from_file(unittest.TestCase):
+
     def setUp(self) -> None:
         self.fullargspec_prefix = 'inspect.Full'
         self.argspec_str_o = self.fullargspec_prefix + '''ArgSpec(args=['shape', 'dtype', 'name'], varargs=None, varkw=None, defaults=(None, None), kwonlyargs=[], kwonlydefaults=None, annotations={})'''
diff --git a/tools/test_check_pr_approval.py b/tools/test_check_pr_approval.py
index f4c089ee0f872..5f3c7ca11ccc0 100644
--- a/tools/test_check_pr_approval.py
+++ b/tools/test_check_pr_approval.py
@@ -1,13 +1,13 @@
 #! /usr/bin/env python
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,6 +22,7 @@
 
 
 class Test_check_approval(unittest.TestCase):
+
     def setUp(self):
         self.codeset = 'UTF-8'
         # only key info in it
@@ -71,21 +72,19 @@ def setUp(self):
 
     def test_ids(self):
         cmd = [sys.executable, 'check_pr_approval.py', '1', '26408901']
-        subprc = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        subprc = subprocess.Popen(cmd,
+                                  stdin=subprocess.PIPE,
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
         output, error = subprc.communicate(input=self.jsonstr)
         self.assertEqual('TRUE', output.decode(self.codeset).rstrip())
 
     def test_logins(self):
         cmd = [sys.executable, 'check_pr_approval.py', '1', 'pangyoki']
-        subprc = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        subprc = subprocess.Popen(cmd,
+                                  stdin=subprocess.PIPE,
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
         output, error = subprc.communicate(input=self.jsonstr)
         self.assertEqual('TRUE', output.decode(self.codeset).rstrip())
 
@@ -93,11 +92,10 @@ def test_ids_and_logins(self):
         cmd = [
             sys.executable, 'check_pr_approval.py', '2', 'pangyoki', '13469016'
         ]
-        subprc = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        subprc = subprocess.Popen(cmd,
+                                  stdin=subprocess.PIPE,
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
         output, error = subprc.communicate(input=self.jsonstr)
         #self.assertEqual('', error.rstrip())
         self.assertEqual('TRUE', output.decode(self.codeset).rstrip())
@@ -107,11 +105,10 @@ def test_check_with_required_reviewer_not_approved(self):
             sys.executable, 'check_pr_approval.py', '2', 'wadefelix',
             ' 13469016'
         ]
-        subprc = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
+        subprc = subprocess.Popen(cmd,
+                                  stdin=subprocess.PIPE,
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
         output, error = subprc.communicate(input=self.jsonstr)
         self.assertEqual('FALSE', output.decode(self.codeset).rstrip())
 
diff --git a/tools/test_print_signatures.py b/tools/test_print_signatures.py
index 1ca1e4149fb7e..14275b6b7ae4d 100644
--- a/tools/test_print_signatures.py
+++ b/tools/test_print_signatures.py
@@ -1,13 +1,13 @@
 #! /usr/bin/env python
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -55,6 +55,7 @@ class method
 
 
 class Test_all_in_print_signatures(unittest.TestCase):
+
     def test_md5(self):
         algo = hashlib.md5()
         algo.update(func_example.__doc__.encode('utf-8'))
@@ -63,12 +64,13 @@ def test_md5(self):
 
 
 class Test_is_primitive(unittest.TestCase):
+
     def test_single(self):
         self.assertTrue(is_primitive(2))
         self.assertTrue(is_primitive(2.1))
         self.assertTrue(is_primitive("2.1.1"))
-        self.assertFalse(
-            is_primitive("hello paddle".encode('UTF-8')))  # True for python2
+        self.assertFalse(is_primitive(
+            "hello paddle".encode('UTF-8')))  # True for python2
         self.assertFalse(is_primitive(1j))
         self.assertTrue(is_primitive(True))
 
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 02d926914f904..2a66c4a26ffd3 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -54,11 +54,10 @@ def main():
 
                     if not res.wasSuccessful():
                         some_test_failed = True
-                        print(
-                            module_name,
-                            'failed\n',
-                            buffer.getvalue(),
-                            file=sys.stderr)
+                        print(module_name,
+                              'failed\n',
+                              buffer.getvalue(),
+                              file=sys.stderr)
         if flag_need_static_mode:
             paddle.disable_static()
 
diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py
index 2bcee0d2ae09e..471deb9bedd6e 100644
--- a/tools/test_sampcd_processor.py
+++ b/tools/test_sampcd_processor.py
@@ -1,13 +1,13 @@
 #! python
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -32,6 +32,7 @@
 
 
 class Test_find_all(unittest.TestCase):
+
     def test_find_none(self):
         self.assertEqual(0, len(find_all('hello', 'world')))
 
@@ -44,6 +45,7 @@ def test_find_two(self):
 
 
 class Test_find_last_future_line_end(unittest.TestCase):
+
     def test_no_instant(self):
         samplecodes = """
                 print(10//3)
@@ -58,8 +60,8 @@ def test_1_instant(self):
         """
         mo = re.search("print_function\n", samplecodes)
         self.assertIsNotNone(mo)
-        self.assertGreaterEqual(
-            find_last_future_line_end(samplecodes), mo.end())
+        self.assertGreaterEqual(find_last_future_line_end(samplecodes),
+                                mo.end())
 
     def test_2_instant(self):
         samplecodes = """
@@ -70,11 +72,12 @@ def test_2_instant(self):
         """
         mo = re.search("division\n", samplecodes)
         self.assertIsNotNone(mo)
-        self.assertGreaterEqual(
-            find_last_future_line_end(samplecodes), mo.end())
+        self.assertGreaterEqual(find_last_future_line_end(samplecodes),
+                                mo.end())
 
 
 class Test_extract_code_blocks_from_docstr(unittest.TestCase):
+
     def test_no_samplecode(self):
         docstr = """
         placeholder
@@ -138,6 +141,7 @@ def test_2_samplecodes(self):
 
 
 class Test_insert_codes_into_codeblock(unittest.TestCase):
+
     def test_required_None(self):
         codeblock = {
             'codes': """print(1/0)""",
@@ -145,12 +149,13 @@ def test_required_None(self):
             'id': 1,
             'required': None,
         }
-        self.assertEqual("""
+        self.assertEqual(
+            """
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 print(1/0)
 print("not-specified's sample code (name:None, id:1) is executed successfully!")""",
-                         insert_codes_into_codeblock(codeblock))
+            insert_codes_into_codeblock(codeblock))
 
     def test_required_gpu(self):
         codeblock = {
@@ -160,13 +165,14 @@ def test_required_gpu(self):
             'id': 1,
             'required': 'gpu',
         }
-        self.assertEqual("""
+        self.assertEqual(
+            """
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 # required: gpu
 print(1+1)
 print("not-specified's sample code (name:None, id:1) is executed successfully!")""",
-                         insert_codes_into_codeblock(codeblock))
+            insert_codes_into_codeblock(codeblock))
 
     def test_from_future(self):
         codeblock = {
@@ -178,7 +184,8 @@ def test_from_future(self):
             'id': 1,
             'required': None,
         }
-        self.assertEqual("""
+        self.assertEqual(
+            """
 from __future__ import print_function
 from __future__ import division
 
@@ -186,7 +193,7 @@ def test_from_future(self):
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 print(10//3)
 print("not-specified's sample code (name:future, id:1) is executed successfully!")""",
-                         insert_codes_into_codeblock(codeblock))
+            insert_codes_into_codeblock(codeblock))
 
 
 def clear_capacity():
@@ -197,6 +204,7 @@ def clear_capacity():
 
 
 class Test_get_test_capacity(unittest.TestCase):
+
     def setUp(self):
         clear_capacity()
         get_test_capacity()
@@ -208,8 +216,9 @@ def tearDown(self):
     def test_NoEnvVar(self):
         clear_capacity()
         get_test_capacity()
-        self.assertCountEqual(['cpu', ],
-                              sampcd_processor.SAMPLE_CODE_TEST_CAPACITY)
+        self.assertCountEqual([
+            'cpu',
+        ], sampcd_processor.SAMPLE_CODE_TEST_CAPACITY)
 
     def test_NoEnvVar_RUN_ON_DEVICE_gpu(self):
         clear_capacity()
@@ -234,6 +243,7 @@ def test_EnvVar_gpu_and_distributed(self):
 
 
 class Test_is_required_match(unittest.TestCase):
+
     def setUp(self):
         clear_capacity()
 
@@ -274,6 +284,7 @@ def test_gpu_distributed_equipped(self):
 
 
 class Test_execute_samplecode(unittest.TestCase):
+
     def setUp(self):
         if not os.path.exists(sampcd_processor.SAMPLECODE_TEMPDIR):
             os.mkdir(sampcd_processor.SAMPLECODE_TEMPDIR)
@@ -315,6 +326,7 @@ def clear_summary_info():
 
 
 class Test_sampcd_extract_to_file(unittest.TestCase):
+
     def setUp(self):
         if not os.path.exists(sampcd_processor.SAMPLECODE_TEMPDIR):
             os.mkdir(sampcd_processor.SAMPLECODE_TEMPDIR)
@@ -424,6 +436,7 @@ def test_2_samplecodes_has_skipped(self):
 
 
 class Test_get_api_md5(unittest.TestCase):
+
     def setUp(self):
         self.api_pr_spec_filename = os.path.abspath(
             os.path.join(os.getcwd(), "..", 'paddle/fluid/API_PR.spec'))
@@ -455,6 +468,7 @@ def test_get_api_md5(self):
 
 
 class Test_get_incrementapi(unittest.TestCase):
+
     def setUp(self):
         self.api_pr_spec_filename = os.path.abspath(
             os.path.join(os.getcwd(), "..", 'paddle/fluid/API_PR.spec'))
diff --git a/tools/timeline.py b/tools/timeline.py
index 2a399b71b7786..c1c3d88c9954a 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -28,12 +28,15 @@
     default='',
     help='Input profile file name. If there are multiple file, the format '
     'should be trainer1=file1,trainer2=file2,ps=file3')
-parser.add_argument(
-    '--timeline_path', type=str, default='', help='Output timeline file name.')
+parser.add_argument('--timeline_path',
+                    type=str,
+                    default='',
+                    help='Output timeline file name.')
 args = parser.parse_args()
 
 
 class _ChromeTraceFormatter(object):
+
     def __init__(self):
         self._events = []
         self._metadata = []
@@ -129,6 +132,7 @@ def format_to_string(self, pretty=False):
 
 
 class Timeline(object):
+
     def __init__(self, profile_dict):
         self._profile_dict = profile_dict
         self._pid = 0
@@ -158,8 +162,8 @@ def _allocate_pids(self):
                     if (k, event.device_id, "GPUKernel") not in self._devices:
                         pid = self._allocate_pid()
                         self._devices[(k, event.device_id, "GPUKernel")] = pid
-                        self._chrome_trace.emit_pid("%s:gpu:%d" %
-                                                    (k, event.device_id), pid)
+                        self._chrome_trace.emit_pid(
+                            "%s:gpu:%d" % (k, event.device_id), pid)
             if not hasattr(profile_pb, "mem_events"):
                 continue
             for mevent in profile_pb.mem_events:
@@ -178,8 +182,8 @@ def _allocate_pids(self):
                             "memory usage on %s:cpu:%d" % (k, mevent.device_id),
                             pid)
                 elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
-                    if (k, mevent.device_id, "CUDAPinnedPlace"
-                        ) not in self._mem_devices:
+                    if (k, mevent.device_id,
+                            "CUDAPinnedPlace") not in self._mem_devices:
                         pid = self._allocate_pid()
                         self._mem_devices[(k, mevent.device_id,
                                            "CUDAPinnedPlace")] = pid
@@ -196,13 +200,13 @@ def _allocate_pids(self):
                 if (k, 0, "CPU") not in self._mem_devices:
                     pid = self._allocate_pid()
                     self._mem_devices[(k, 0, "CPU")] = pid
-                    self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
-                                                (k, 0), pid)
+                    self._chrome_trace.emit_pid(
+                        "memory usage on %s:cpu:%d" % (k, 0), pid)
                 if (k, 0, "GPU") not in self._mem_devices:
                     pid = self._allocate_pid()
                     self._mem_devices[(k, 0, "GPU")] = pid
-                    self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
-                                                (k, 0), pid)
+                    self._chrome_trace.emit_pid(
+                        "memory usage on %s:gpu:%d" % (k, 0), pid)
                 if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
                     pid = self._allocate_pid()
                     self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
@@ -211,8 +215,8 @@ def _allocate_pids(self):
                 if (k, 0, "NPU") not in self._mem_devices:
                     pid = self._allocate_pid()
                     self._mem_devices[(k, 0, "NPU")] = pid
-                    self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
-                                                (k, 0), pid)
+                    self._chrome_trace.emit_pid(
+                        "memory usage on %s:npu:%d" % (k, 0), pid)
 
     def _allocate_events(self):
         for k, profile_pb in six.iteritems(self._profile_dict):
@@ -278,9 +282,10 @@ def _allocate_memory_event(self):
                     total_size += mem_list[i + 1]['size']
                     i += 1
 
-                self._chrome_trace.emit_counter(
-                    "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
-                    0, total_size)
+                self._chrome_trace.emit_counter("Memory", "Memory",
+                                                mem_list[i]['pid'],
+                                                mem_list[i]['time'], 0,
+                                                total_size)
                 i += 1
 
     def generate_chrome_trace(self):